Diffstat (limited to 'lib')
-rw-r--r--  lib/Analysis/AliasAnalysis.cpp | 18
-rw-r--r--  lib/Analysis/AliasSetTracker.cpp | 36
-rw-r--r--  lib/Analysis/Analysis.cpp | 3
-rw-r--r--  lib/Analysis/AssumptionCache.cpp | 28
-rw-r--r--  lib/Analysis/BasicAliasAnalysis.cpp | 54
-rw-r--r--  lib/Analysis/BlockFrequencyInfo.cpp | 65
-rw-r--r--  lib/Analysis/BlockFrequencyInfoImpl.cpp | 2
-rw-r--r--  lib/Analysis/BranchProbabilityInfo.cpp | 119
-rw-r--r--  lib/Analysis/CFLAndersAliasAnalysis.cpp | 4
-rw-r--r--  lib/Analysis/CGSCCPassManager.cpp | 55
-rw-r--r--  lib/Analysis/CMakeLists.txt | 2
-rw-r--r--  lib/Analysis/CallGraph.cpp | 12
-rw-r--r--  lib/Analysis/CallGraphSCCPass.cpp | 18
-rw-r--r--  lib/Analysis/ConstantFolding.cpp | 204
-rw-r--r--  lib/Analysis/CostModel.cpp | 18
-rw-r--r--  lib/Analysis/DemandedBits.cpp | 3
-rw-r--r--  lib/Analysis/DependenceAnalysis.cpp | 5
-rw-r--r--  lib/Analysis/DominanceFrontier.cpp | 10
-rw-r--r--  lib/Analysis/IVUsers.cpp | 88
-rw-r--r--  lib/Analysis/InlineCost.cpp | 271
-rw-r--r--  lib/Analysis/InstructionSimplify.cpp | 680
-rw-r--r--  lib/Analysis/IteratedDominanceFrontier.cpp | 5
-rw-r--r--  lib/Analysis/LazyBlockFrequencyInfo.cpp | 2
-rw-r--r--  lib/Analysis/LazyCallGraph.cpp | 631
-rw-r--r--  lib/Analysis/LazyValueInfo.cpp | 232
-rw-r--r--  lib/Analysis/Loads.cpp | 36
-rw-r--r--  lib/Analysis/LoopAccessAnalysis.cpp | 129
-rw-r--r--  lib/Analysis/LoopAnalysisManager.cpp | 23
-rw-r--r--  lib/Analysis/LoopInfo.cpp | 24
-rw-r--r--  lib/Analysis/LoopPass.cpp | 2
-rw-r--r--  lib/Analysis/MemoryBuiltins.cpp | 136
-rw-r--r--  lib/Analysis/MemoryLocation.cpp | 4
-rw-r--r--  lib/Analysis/MemorySSA.cpp (renamed from lib/Transforms/Utils/MemorySSA.cpp) | 808
-rw-r--r--  lib/Analysis/MemorySSAUpdater.cpp | 494
-rw-r--r--  lib/Analysis/ModuleSummaryAnalysis.cpp | 145
-rw-r--r--  lib/Analysis/OptimizationDiagnosticInfo.cpp | 186
-rw-r--r--  lib/Analysis/PostDominators.cpp | 9
-rw-r--r--  lib/Analysis/ProfileSummaryInfo.cpp | 114
-rw-r--r--  lib/Analysis/RegionInfo.cpp | 9
-rw-r--r--  lib/Analysis/RegionPass.cpp | 2
-rw-r--r--  lib/Analysis/ScalarEvolution.cpp | 649
-rw-r--r--  lib/Analysis/ScalarEvolutionExpander.cpp | 3
-rw-r--r--  lib/Analysis/ScalarEvolutionNormalization.cpp | 299
-rw-r--r--  lib/Analysis/SparsePropagation.cpp | 2
-rw-r--r--  lib/Analysis/TargetLibraryInfo.cpp | 1152
-rw-r--r--  lib/Analysis/TargetTransformInfo.cpp | 56
-rw-r--r--  lib/Analysis/TypeMetadataUtils.cpp | 11
-rw-r--r--  lib/Analysis/ValueTracking.cpp | 692
-rw-r--r--  lib/Analysis/VectorUtils.cpp | 85
-rw-r--r--  lib/AsmParser/LLLexer.cpp | 1
-rw-r--r--  lib/AsmParser/LLParser.cpp | 206
-rw-r--r--  lib/AsmParser/LLParser.h | 10
-rw-r--r--  lib/AsmParser/LLToken.h | 1
-rw-r--r--  lib/Bitcode/Reader/BitcodeReader.cpp | 658
-rw-r--r--  lib/Bitcode/Reader/MetadataLoader.cpp | 36
-rw-r--r--  lib/Bitcode/Writer/BitcodeWriter.cpp | 155
-rw-r--r--  lib/Bitcode/Writer/ValueEnumerator.cpp | 11
-rw-r--r--  lib/Bitcode/Writer/ValueEnumerator.h | 22
-rw-r--r--  lib/CodeGen/AggressiveAntiDepBreaker.cpp | 6
-rw-r--r--  lib/CodeGen/Analysis.cpp | 24
-rw-r--r--  lib/CodeGen/AsmPrinter/AsmPrinter.cpp | 195
-rw-r--r--  lib/CodeGen/AsmPrinter/AsmPrinterInlineAsm.cpp | 68
-rw-r--r--  lib/CodeGen/AsmPrinter/CodeViewDebug.cpp | 59
-rw-r--r--  lib/CodeGen/AsmPrinter/CodeViewDebug.h | 13
-rw-r--r--  lib/CodeGen/AsmPrinter/DIE.cpp | 107
-rw-r--r--  lib/CodeGen/AsmPrinter/DIEHash.cpp | 12
-rw-r--r--  lib/CodeGen/AsmPrinter/DebugHandlerBase.cpp | 28
-rw-r--r--  lib/CodeGen/AsmPrinter/DebugHandlerBase.h | 4
-rw-r--r--  lib/CodeGen/AsmPrinter/DebugLocEntry.h | 4
-rw-r--r--  lib/CodeGen/AsmPrinter/DwarfCompileUnit.cpp | 212
-rw-r--r--  lib/CodeGen/AsmPrinter/DwarfCompileUnit.h | 9
-rw-r--r--  lib/CodeGen/AsmPrinter/DwarfDebug.cpp | 123
-rw-r--r--  lib/CodeGen/AsmPrinter/DwarfDebug.h | 17
-rw-r--r--  lib/CodeGen/AsmPrinter/DwarfExpression.cpp | 248
-rw-r--r--  lib/CodeGen/AsmPrinter/DwarfExpression.h | 126
-rw-r--r--  lib/CodeGen/AsmPrinter/DwarfUnit.cpp | 177
-rw-r--r--  lib/CodeGen/AsmPrinter/DwarfUnit.h | 40
-rw-r--r--  lib/CodeGen/AsmPrinter/ErlangGCPrinter.cpp | 21
-rw-r--r--  lib/CodeGen/AsmPrinter/WinException.cpp | 4
-rw-r--r--  lib/CodeGen/AtomicExpandPass.cpp | 4
-rw-r--r--  lib/CodeGen/BranchCoalescing.cpp | 758
-rw-r--r--  lib/CodeGen/BranchFolding.cpp | 163
-rw-r--r--  lib/CodeGen/BranchFolding.h | 43
-rw-r--r--  lib/CodeGen/BranchRelaxation.cpp | 4
-rw-r--r--  lib/CodeGen/BuiltinGCs.cpp | 9
-rw-r--r--  lib/CodeGen/CMakeLists.txt | 8
-rw-r--r--  lib/CodeGen/CallingConvLower.cpp | 3
-rw-r--r--  lib/CodeGen/CodeGen.cpp | 15
-rw-r--r--  lib/CodeGen/CodeGenPrepare.cpp | 928
-rw-r--r--  lib/CodeGen/CountingFunctionInserter.cpp | 2
-rw-r--r--  lib/CodeGen/CriticalAntiDepBreaker.cpp | 7
-rw-r--r--  lib/CodeGen/DeadMachineInstructionElim.cpp | 2
-rw-r--r--  lib/CodeGen/DetectDeadLanes.cpp | 2
-rw-r--r--  lib/CodeGen/ExecutionDepsFix.cpp | 472
-rw-r--r--  lib/CodeGen/FEntryInserter.cpp | 55
-rw-r--r--  lib/CodeGen/FaultMaps.cpp | 15
-rw-r--r--  lib/CodeGen/GCStrategy.cpp | 7
-rw-r--r--  lib/CodeGen/GlobalISel/CMakeLists.txt | 1
-rw-r--r--  lib/CodeGen/GlobalISel/CallLowering.cpp | 37
-rw-r--r--  lib/CodeGen/GlobalISel/IRTranslator.cpp | 755
-rw-r--r--  lib/CodeGen/GlobalISel/InstructionSelect.cpp | 98
-rw-r--r--  lib/CodeGen/GlobalISel/InstructionSelector.cpp | 41
-rw-r--r--  lib/CodeGen/GlobalISel/Legalizer.cpp | 84
-rw-r--r--  lib/CodeGen/GlobalISel/LegalizerHelper.cpp | 399
-rw-r--r--  lib/CodeGen/GlobalISel/LegalizerInfo.cpp | 31
-rw-r--r--  lib/CodeGen/GlobalISel/MachineIRBuilder.cpp | 266
-rw-r--r--  lib/CodeGen/GlobalISel/RegBankSelect.cpp | 27
-rw-r--r--  lib/CodeGen/GlobalISel/RegisterBank.cpp | 9
-rw-r--r--  lib/CodeGen/GlobalISel/RegisterBankInfo.cpp | 95
-rw-r--r--  lib/CodeGen/GlobalISel/Utils.cpp | 50
-rw-r--r--  lib/CodeGen/IfConversion.cpp | 111
-rw-r--r--  lib/CodeGen/ImplicitNullChecks.cpp | 206
-rw-r--r--  lib/CodeGen/InlineSpiller.cpp | 5
-rw-r--r--  lib/CodeGen/IntrinsicLowering.cpp | 6
-rw-r--r--  lib/CodeGen/LLVMBuild.txt | 2
-rw-r--r--  lib/CodeGen/LLVMTargetMachine.cpp | 15
-rw-r--r--  lib/CodeGen/LazyMachineBlockFrequencyInfo.cpp | 97
-rw-r--r--  lib/CodeGen/LexicalScopes.cpp | 41
-rw-r--r--  lib/CodeGen/LiveDebugValues.cpp | 168
-rw-r--r--  lib/CodeGen/LiveDebugVariables.cpp | 4
-rw-r--r--  lib/CodeGen/LiveInterval.cpp | 33
-rw-r--r--  lib/CodeGen/LiveIntervalAnalysis.cpp | 162
-rw-r--r--  lib/CodeGen/LiveIntervalUnion.cpp | 41
-rw-r--r--  lib/CodeGen/LivePhysRegs.cpp | 12
-rw-r--r--  lib/CodeGen/LiveRangeCalc.cpp | 41
-rw-r--r--  lib/CodeGen/LiveRangeEdit.cpp | 13
-rw-r--r--  lib/CodeGen/LiveRegMatrix.cpp | 27
-rw-r--r--  lib/CodeGen/LiveRegUnits.cpp | 126
-rw-r--r--  lib/CodeGen/LiveVariables.cpp | 4
-rw-r--r--  lib/CodeGen/LowLevelType.cpp | 55
-rw-r--r--  lib/CodeGen/MIRParser/MIParser.cpp | 151
-rw-r--r--  lib/CodeGen/MIRParser/MIParser.h | 9
-rw-r--r--  lib/CodeGen/MIRParser/MIRParser.cpp | 52
-rw-r--r--  lib/CodeGen/MIRPrinter.cpp | 53
-rw-r--r--  lib/CodeGen/MachineBasicBlock.cpp | 57
-rw-r--r--  lib/CodeGen/MachineBlockFrequencyInfo.cpp | 85
-rw-r--r--  lib/CodeGen/MachineBlockPlacement.cpp | 1047
-rw-r--r--  lib/CodeGen/MachineCombiner.cpp | 48
-rw-r--r--  lib/CodeGen/MachineCopyPropagation.cpp | 12
-rw-r--r--  lib/CodeGen/MachineDominators.cpp | 28
-rw-r--r--  lib/CodeGen/MachineFunction.cpp | 7
-rw-r--r--  lib/CodeGen/MachineInstr.cpp | 104
-rw-r--r--  lib/CodeGen/MachineLoopInfo.cpp | 16
-rw-r--r--  lib/CodeGen/MachineModuleInfo.cpp | 4
-rw-r--r--  lib/CodeGen/MachineModuleInfoImpls.cpp | 1
-rw-r--r--  lib/CodeGen/MachineOptimizationRemarkEmitter.cpp | 100
-rw-r--r--  lib/CodeGen/MachineOutliner.cpp | 1251
-rw-r--r--  lib/CodeGen/MachinePipeliner.cpp | 25
-rw-r--r--  lib/CodeGen/MachineRegionInfo.cpp | 22
-rw-r--r--  lib/CodeGen/MachineRegisterInfo.cpp | 76
-rw-r--r--  lib/CodeGen/MachineScheduler.cpp | 256
-rw-r--r--  lib/CodeGen/MachineTraceMetrics.cpp | 66
-rw-r--r--  lib/CodeGen/MachineVerifier.cpp | 33
-rw-r--r--  lib/CodeGen/PatchableFunction.cpp | 2
-rw-r--r--  lib/CodeGen/PostRASchedulerList.cpp | 2
-rw-r--r--  lib/CodeGen/PrologEpilogInserter.cpp | 19
-rw-r--r--  lib/CodeGen/PseudoSourceValue.cpp | 5
-rw-r--r--  lib/CodeGen/RegAllocBasic.cpp | 2
-rw-r--r--  lib/CodeGen/RegAllocGreedy.cpp | 92
-rw-r--r--  lib/CodeGen/RegAllocPBQP.cpp | 60
-rw-r--r--  lib/CodeGen/RegUsageInfoCollector.cpp | 24
-rw-r--r--  lib/CodeGen/RegisterClassInfo.cpp | 50
-rw-r--r--  lib/CodeGen/RegisterCoalescer.cpp | 289
-rw-r--r--  lib/CodeGen/RegisterPressure.cpp | 57
-rw-r--r--  lib/CodeGen/RegisterScavenging.cpp | 81
-rw-r--r--  lib/CodeGen/ResetMachineFunctionPass.cpp | 15
-rw-r--r--  lib/CodeGen/SafeStack.cpp | 2
-rw-r--r--  lib/CodeGen/SafeStackColoring.cpp | 2
-rw-r--r--  lib/CodeGen/ScheduleDAG.cpp | 442
-rw-r--r--  lib/CodeGen/ScheduleDAGInstrs.cpp | 209
-rw-r--r--  lib/CodeGen/ScoreboardHazardRecognizer.cpp | 9
-rw-r--r--  lib/CodeGen/SelectionDAG/DAGCombiner.cpp | 2429
-rw-r--r--  lib/CodeGen/SelectionDAG/FastISel.cpp | 101
-rw-r--r--  lib/CodeGen/SelectionDAG/InstrEmitter.cpp | 1
-rw-r--r--  lib/CodeGen/SelectionDAG/LegalizeDAG.cpp | 120
-rw-r--r--  lib/CodeGen/SelectionDAG/LegalizeFloatTypes.cpp | 10
-rw-r--r--  lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp | 41
-rw-r--r--  lib/CodeGen/SelectionDAG/LegalizeTypes.cpp | 24
-rw-r--r--  lib/CodeGen/SelectionDAG/LegalizeTypes.h | 15
-rw-r--r--  lib/CodeGen/SelectionDAG/LegalizeTypesGeneric.cpp | 29
-rw-r--r--  lib/CodeGen/SelectionDAG/LegalizeVectorOps.cpp | 25
-rw-r--r--  lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp | 308
-rw-r--r--  lib/CodeGen/SelectionDAG/ResourcePriorityQueue.cpp | 15
-rw-r--r--  lib/CodeGen/SelectionDAG/ScheduleDAGRRList.cpp | 42
-rw-r--r--  lib/CodeGen/SelectionDAG/ScheduleDAGSDNodes.cpp | 10
-rw-r--r--  lib/CodeGen/SelectionDAG/SelectionDAG.cpp | 561
-rw-r--r--  lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp | 996
-rw-r--r--  lib/CodeGen/SelectionDAG/SelectionDAGBuilder.h | 87
-rw-r--r--  lib/CodeGen/SelectionDAG/SelectionDAGDumper.cpp | 20
-rw-r--r--  lib/CodeGen/SelectionDAG/SelectionDAGISel.cpp | 613
-rw-r--r--  lib/CodeGen/SelectionDAG/TargetLowering.cpp | 172
-rw-r--r--  lib/CodeGen/SjLjEHPrepare.cpp | 30
-rw-r--r--  lib/CodeGen/SlotIndexes.cpp | 42
-rw-r--r--  lib/CodeGen/SplitKit.cpp | 136
-rw-r--r--  lib/CodeGen/SplitKit.h | 11
-rw-r--r--  lib/CodeGen/StackColoring.cpp | 28
-rw-r--r--  lib/CodeGen/StackMaps.cpp | 24
-rw-r--r--  lib/CodeGen/StackProtector.cpp | 58
-rw-r--r--  lib/CodeGen/TailDuplicator.cpp | 3
-rw-r--r--  lib/CodeGen/TargetFrameLoweringImpl.cpp | 2
-rw-r--r--  lib/CodeGen/TargetInstrInfo.cpp | 10
-rw-r--r--  lib/CodeGen/TargetLoweringBase.cpp | 32
-rw-r--r--  lib/CodeGen/TargetLoweringObjectFileImpl.cpp | 229
-rw-r--r--  lib/CodeGen/TargetOptionsImpl.cpp | 8
-rw-r--r--  lib/CodeGen/TargetPassConfig.cpp | 28
-rw-r--r--  lib/CodeGen/TargetRegisterInfo.cpp | 16
-rw-r--r--  lib/CodeGen/TargetSchedule.cpp | 110
-rw-r--r--  lib/CodeGen/TargetSubtargetInfo.cpp | 46
-rw-r--r--  lib/CodeGen/TwoAddressInstructionPass.cpp | 4
-rw-r--r--  lib/CodeGen/VirtRegMap.cpp | 49
-rw-r--r--  lib/CodeGen/WinEHPrepare.cpp | 6
-rw-r--r--  lib/CodeGen/XRayInstrumentation.cpp | 7
-rw-r--r--  lib/DebugInfo/CodeView/CMakeLists.txt | 3
-rw-r--r--  lib/DebugInfo/CodeView/CVSymbolVisitor.cpp | 11
-rw-r--r--  lib/DebugInfo/CodeView/CVTypeDumper.cpp | 10
-rw-r--r--  lib/DebugInfo/CodeView/CVTypeVisitor.cpp | 77
-rw-r--r--  lib/DebugInfo/CodeView/CodeViewError.cpp | 2
-rw-r--r--  lib/DebugInfo/CodeView/CodeViewRecordIO.cpp | 36
-rw-r--r--  lib/DebugInfo/CodeView/Formatters.cpp | 37
-rw-r--r--  lib/DebugInfo/CodeView/ModuleSubstream.cpp | 12
-rw-r--r--  lib/DebugInfo/CodeView/ModuleSubstreamVisitor.cpp | 36
-rw-r--r--  lib/DebugInfo/CodeView/RecordSerialization.cpp | 22
-rw-r--r--  lib/DebugInfo/CodeView/SymbolDumper.cpp | 4
-rw-r--r--  lib/DebugInfo/CodeView/SymbolSerializer.cpp | 52
-rw-r--r--  lib/DebugInfo/CodeView/TypeDatabase.cpp | 6
-rw-r--r--  lib/DebugInfo/CodeView/TypeDatabaseVisitor.cpp | 20
-rw-r--r--  lib/DebugInfo/CodeView/TypeDumpVisitor.cpp | 46
-rw-r--r--  lib/DebugInfo/CodeView/TypeRecord.cpp | 213
-rw-r--r--  lib/DebugInfo/CodeView/TypeRecordMapping.cpp | 26
-rw-r--r--  lib/DebugInfo/CodeView/TypeSerializer.cpp | 13
-rw-r--r--  lib/DebugInfo/CodeView/TypeStreamMerger.cpp | 416
-rw-r--r--  lib/DebugInfo/DWARF/DWARFAbbreviationDeclaration.cpp | 9
-rw-r--r--  lib/DebugInfo/DWARF/DWARFAcceleratorTable.cpp | 14
-rw-r--r--  lib/DebugInfo/DWARF/DWARFCompileUnit.cpp | 6
-rw-r--r--  lib/DebugInfo/DWARF/DWARFContext.cpp | 296
-rw-r--r--  lib/DebugInfo/DWARF/DWARFDebugAbbrev.cpp | 6
-rw-r--r--  lib/DebugInfo/DWARF/DWARFDebugArangeSet.cpp | 7
-rw-r--r--  lib/DebugInfo/DWARF/DWARFDebugAranges.cpp | 10
-rw-r--r--  lib/DebugInfo/DWARF/DWARFDebugFrame.cpp | 23
-rw-r--r--  lib/DebugInfo/DWARF/DWARFDebugInfoEntry.cpp | 18
-rw-r--r--  lib/DebugInfo/DWARF/DWARFDebugLine.cpp | 11
-rw-r--r--  lib/DebugInfo/DWARF/DWARFDebugLoc.cpp | 14
-rw-r--r--  lib/DebugInfo/DWARF/DWARFDebugMacro.cpp | 6
-rw-r--r--  lib/DebugInfo/DWARF/DWARFDebugPubTable.cpp | 10
-rw-r--r--  lib/DebugInfo/DWARF/DWARFDebugRangeList.cpp | 5
-rw-r--r--  lib/DebugInfo/DWARF/DWARFDie.cpp | 196
-rw-r--r--  lib/DebugInfo/DWARF/DWARFFormValue.cpp | 63
-rw-r--r--  lib/DebugInfo/DWARF/DWARFGdbIndex.cpp | 12
-rw-r--r--  lib/DebugInfo/DWARF/DWARFTypeUnit.cpp | 21
-rw-r--r--  lib/DebugInfo/DWARF/DWARFUnit.cpp | 59
-rw-r--r--  lib/DebugInfo/DWARF/DWARFUnitIndex.cpp | 14
-rw-r--r--  lib/DebugInfo/DWARF/SyntaxHighlighting.cpp | 18
-rw-r--r--  lib/DebugInfo/DWARF/SyntaxHighlighting.h | 25
-rw-r--r--  lib/DebugInfo/MSF/CMakeLists.txt | 2
-rw-r--r--  lib/DebugInfo/MSF/MappedBlockStream.cpp | 74
-rw-r--r--  lib/DebugInfo/MSF/StreamReader.cpp | 156
-rw-r--r--  lib/DebugInfo/MSF/StreamWriter.cpp | 98
-rw-r--r--  lib/DebugInfo/PDB/CMakeLists.txt | 56
-rw-r--r--  lib/DebugInfo/PDB/DIA/DIARawSymbol.cpp | 23
-rw-r--r--  lib/DebugInfo/PDB/Native/DbiStream.cpp (renamed from lib/DebugInfo/PDB/Raw/DbiStream.cpp) | 56
-rw-r--r--  lib/DebugInfo/PDB/Native/DbiStreamBuilder.cpp (renamed from lib/DebugInfo/PDB/Raw/DbiStreamBuilder.cpp) | 135
-rw-r--r--  lib/DebugInfo/PDB/Native/EnumTables.cpp (renamed from lib/DebugInfo/PDB/Raw/EnumTables.cpp) | 4
-rw-r--r--  lib/DebugInfo/PDB/Native/GSI.cpp (renamed from lib/DebugInfo/PDB/Raw/GSI.cpp) | 20
-rw-r--r--  lib/DebugInfo/PDB/Native/GSI.h (renamed from lib/DebugInfo/PDB/Raw/GSI.h) | 20
-rw-r--r--  lib/DebugInfo/PDB/Native/GlobalsStream.cpp (renamed from lib/DebugInfo/PDB/Raw/GlobalsStream.cpp) | 6
-rw-r--r--  lib/DebugInfo/PDB/Native/Hash.cpp (renamed from lib/DebugInfo/PDB/Raw/Hash.cpp) | 2
-rw-r--r--  lib/DebugInfo/PDB/Native/HashTable.cpp | 302
-rw-r--r--  lib/DebugInfo/PDB/Native/InfoStream.cpp | 126
-rw-r--r--  lib/DebugInfo/PDB/Native/InfoStreamBuilder.cpp (renamed from lib/DebugInfo/PDB/Raw/InfoStreamBuilder.cpp) | 43
-rw-r--r--  lib/DebugInfo/PDB/Native/ModInfo.cpp (renamed from lib/DebugInfo/PDB/Raw/ModInfo.cpp) | 15
-rw-r--r--  lib/DebugInfo/PDB/Native/ModInfoBuilder.cpp | 136
-rw-r--r--  lib/DebugInfo/PDB/Native/ModStream.cpp (renamed from lib/DebugInfo/PDB/Raw/ModStream.cpp) | 20
-rw-r--r--  lib/DebugInfo/PDB/Native/NamedStreamMap.cpp | 135
-rw-r--r--  lib/DebugInfo/PDB/Native/NativeCompilandSymbol.cpp | 43
-rw-r--r--  lib/DebugInfo/PDB/Native/NativeEnumModules.cpp | 52
-rw-r--r--  lib/DebugInfo/PDB/Native/NativeExeSymbol.cpp | 79
-rw-r--r--  lib/DebugInfo/PDB/Native/NativeRawSymbol.cpp | 706
-rw-r--r--  lib/DebugInfo/PDB/Native/NativeSession.cpp | 146
-rw-r--r--  lib/DebugInfo/PDB/Native/PDBFile.cpp (renamed from lib/DebugInfo/PDB/Raw/PDBFile.cpp) | 107
-rw-r--r--  lib/DebugInfo/PDB/Native/PDBFileBuilder.cpp (renamed from lib/DebugInfo/PDB/Raw/PDBFileBuilder.cpp) | 64
-rw-r--r--  lib/DebugInfo/PDB/Native/PDBTypeServerHandler.cpp | 119
-rw-r--r--  lib/DebugInfo/PDB/Native/PublicsStream.cpp (renamed from lib/DebugInfo/PDB/Raw/PublicsStream.cpp) | 12
-rw-r--r--  lib/DebugInfo/PDB/Native/RawError.cpp (renamed from lib/DebugInfo/PDB/Raw/RawError.cpp) | 4
-rw-r--r--  lib/DebugInfo/PDB/Native/StringTable.cpp (renamed from lib/DebugInfo/PDB/Raw/NameHashTable.cpp) | 45
-rw-r--r--  lib/DebugInfo/PDB/Native/StringTableBuilder.cpp | 102
-rw-r--r--  lib/DebugInfo/PDB/Native/SymbolStream.cpp (renamed from lib/DebugInfo/PDB/Raw/SymbolStream.cpp) | 13
-rw-r--r--  lib/DebugInfo/PDB/Native/TpiHashing.cpp (renamed from lib/DebugInfo/PDB/Raw/TpiHashing.cpp) | 6
-rw-r--r--  lib/DebugInfo/PDB/Native/TpiStream.cpp (renamed from lib/DebugInfo/PDB/Raw/TpiStream.cpp) | 51
-rw-r--r--  lib/DebugInfo/PDB/Native/TpiStreamBuilder.cpp (renamed from lib/DebugInfo/PDB/Raw/TpiStreamBuilder.cpp) | 108
-rw-r--r--  lib/DebugInfo/PDB/PDB.cpp | 10
-rw-r--r--  lib/DebugInfo/PDB/PDBExtras.cpp | 26
-rw-r--r--  lib/DebugInfo/PDB/PDBSymbol.cpp | 36
-rw-r--r--  lib/DebugInfo/PDB/PDBSymbolAnnotation.cpp | 4
-rw-r--r--  lib/DebugInfo/PDB/PDBSymbolBlock.cpp | 4
-rw-r--r--  lib/DebugInfo/PDB/PDBSymbolCompiland.cpp | 4
-rw-r--r--  lib/DebugInfo/PDB/PDBSymbolCompilandDetails.cpp | 4
-rw-r--r--  lib/DebugInfo/PDB/PDBSymbolCompilandEnv.cpp | 4
-rw-r--r--  lib/DebugInfo/PDB/PDBSymbolCustom.cpp | 4
-rw-r--r--  lib/DebugInfo/PDB/PDBSymbolData.cpp | 6
-rw-r--r--  lib/DebugInfo/PDB/PDBSymbolExe.cpp | 15
-rw-r--r--  lib/DebugInfo/PDB/PDBSymbolFunc.cpp | 21
-rw-r--r--  lib/DebugInfo/PDB/PDBSymbolFuncDebugEnd.cpp | 4
-rw-r--r--  lib/DebugInfo/PDB/PDBSymbolFuncDebugStart.cpp | 4
-rw-r--r--  lib/DebugInfo/PDB/PDBSymbolLabel.cpp | 4
-rw-r--r--  lib/DebugInfo/PDB/PDBSymbolPublicSymbol.cpp | 4
-rw-r--r--  lib/DebugInfo/PDB/PDBSymbolThunk.cpp | 4
-rw-r--r--  lib/DebugInfo/PDB/PDBSymbolTypeArray.cpp | 10
-rw-r--r--  lib/DebugInfo/PDB/PDBSymbolTypeBaseClass.cpp | 4
-rw-r--r--  lib/DebugInfo/PDB/PDBSymbolTypeBuiltin.cpp | 4
-rw-r--r--  lib/DebugInfo/PDB/PDBSymbolTypeCustom.cpp | 4
-rw-r--r--  lib/DebugInfo/PDB/PDBSymbolTypeDimension.cpp | 4
-rw-r--r--  lib/DebugInfo/PDB/PDBSymbolTypeEnum.cpp | 11
-rw-r--r--  lib/DebugInfo/PDB/PDBSymbolTypeFriend.cpp | 4
-rw-r--r--  lib/DebugInfo/PDB/PDBSymbolTypeFunctionArg.cpp | 4
-rw-r--r--  lib/DebugInfo/PDB/PDBSymbolTypeFunctionSig.cpp | 17
-rw-r--r--  lib/DebugInfo/PDB/PDBSymbolTypeManaged.cpp | 4
-rw-r--r--  lib/DebugInfo/PDB/PDBSymbolTypePointer.cpp | 10
-rw-r--r--  lib/DebugInfo/PDB/PDBSymbolTypeTypedef.cpp | 4
-rw-r--r--  lib/DebugInfo/PDB/PDBSymbolTypeUDT.cpp | 12
-rw-r--r--  lib/DebugInfo/PDB/PDBSymbolTypeVTable.cpp | 4
-rw-r--r--  lib/DebugInfo/PDB/PDBSymbolTypeVTableShape.cpp | 4
-rw-r--r--  lib/DebugInfo/PDB/PDBSymbolUsingNamespace.cpp | 4
-rw-r--r--  lib/DebugInfo/PDB/Raw/InfoStream.cpp | 77
-rw-r--r--  lib/DebugInfo/PDB/Raw/NameMap.cpp | 163
-rw-r--r--  lib/DebugInfo/PDB/Raw/NameMapBuilder.cpp | 108
-rw-r--r--  lib/DebugInfo/PDB/Raw/RawSession.cpp | 136
-rw-r--r--  lib/DebugInfo/PDB/UDTLayout.cpp | 335
-rw-r--r--  lib/DebugInfo/Symbolize/DIPrinter.cpp | 14
-rw-r--r--  lib/DebugInfo/Symbolize/SymbolizableObjectFile.cpp | 35
-rw-r--r--  lib/DebugInfo/Symbolize/SymbolizableObjectFile.h | 20
-rw-r--r--  lib/Demangle/ItaniumDemangle.cpp | 84
-rw-r--r--  lib/ExecutionEngine/ExecutionEngine.cpp | 18
-rw-r--r--  lib/ExecutionEngine/ExecutionEngineBindings.cpp | 2
-rw-r--r--  lib/ExecutionEngine/IntelJITEvents/CMakeLists.txt | 2
-rw-r--r--  lib/ExecutionEngine/Interpreter/Execution.cpp | 6
-rw-r--r--  lib/ExecutionEngine/Orc/CMakeLists.txt | 1
-rw-r--r--  lib/ExecutionEngine/Orc/OrcCBindingsStack.h | 4
-rw-r--r--  lib/ExecutionEngine/Orc/OrcError.cpp | 15
-rw-r--r--  lib/ExecutionEngine/Orc/OrcMCJITReplacement.h | 8
-rw-r--r--  lib/ExecutionEngine/Orc/RPCUtils.cpp | 55
-rw-r--r--  lib/ExecutionEngine/RuntimeDyld/RuntimeDyld.cpp | 30
-rw-r--r--  lib/ExecutionEngine/RuntimeDyld/RuntimeDyldELF.cpp | 211
-rw-r--r--  lib/ExecutionEngine/RuntimeDyld/RuntimeDyldELF.h | 21
-rw-r--r--  lib/ExecutionEngine/RuntimeDyld/RuntimeDyldImpl.h | 13
-rw-r--r--  lib/Fuzzer/CMakeLists.txt | 9
-rw-r--r--  lib/Fuzzer/FuzzerCorpus.h | 26
-rw-r--r--  lib/Fuzzer/FuzzerDefs.h | 33
-rw-r--r--  lib/Fuzzer/FuzzerDictionary.h | 7
-rw-r--r--  lib/Fuzzer/FuzzerDriver.cpp | 195
-rw-r--r--  lib/Fuzzer/FuzzerExtFunctions.def | 10
-rw-r--r--  lib/Fuzzer/FuzzerExtFunctionsDlsymWin.cpp | 60
-rw-r--r--  lib/Fuzzer/FuzzerExtraCounters.cpp | 41
-rw-r--r--  lib/Fuzzer/FuzzerFlags.def | 15
-rw-r--r--  lib/Fuzzer/FuzzerIO.cpp | 7
-rw-r--r--  lib/Fuzzer/FuzzerIO.h | 9
-rw-r--r--  lib/Fuzzer/FuzzerIOPosix.cpp | 29
-rw-r--r--  lib/Fuzzer/FuzzerIOWindows.cpp | 45
-rw-r--r--  lib/Fuzzer/FuzzerInterface.h | 2
-rw-r--r--  lib/Fuzzer/FuzzerInternal.h | 41
-rw-r--r--  lib/Fuzzer/FuzzerLoop.cpp | 243
-rw-r--r--  lib/Fuzzer/FuzzerMerge.cpp | 87
-rw-r--r--  lib/Fuzzer/FuzzerMerge.h | 12
-rw-r--r--  lib/Fuzzer/FuzzerMutate.cpp | 88
-rw-r--r--  lib/Fuzzer/FuzzerMutate.h | 13
-rw-r--r--  lib/Fuzzer/FuzzerOptions.h | 2
-rw-r--r--  lib/Fuzzer/FuzzerRandom.h | 10
-rw-r--r--  lib/Fuzzer/FuzzerShmem.h | 69
-rw-r--r--  lib/Fuzzer/FuzzerShmemPosix.cpp | 103
-rw-r--r--  lib/Fuzzer/FuzzerShmemWindows.cpp | 64
-rw-r--r--  lib/Fuzzer/FuzzerTracePC.cpp | 246
-rw-r--r--  lib/Fuzzer/FuzzerTracePC.h | 115
-rw-r--r--  lib/Fuzzer/FuzzerTraceState.cpp | 175
-rw-r--r--  lib/Fuzzer/FuzzerUtil.h | 4
-rw-r--r--  lib/Fuzzer/FuzzerUtilPosix.cpp | 8
-rw-r--r--  lib/Fuzzer/FuzzerUtilWindows.cpp | 18
-rw-r--r--  lib/Fuzzer/FuzzerValueBitMap.h | 21
-rw-r--r--  lib/Fuzzer/afl/afl_driver.cpp | 7
-rwxr-xr-x  lib/Fuzzer/build.sh | 3
-rw-r--r--  lib/Fuzzer/test/AbsNegAndConstant64Test.cpp | 2
-rw-r--r--  lib/Fuzzer/test/BadStrcmpTest.cpp | 19
-rw-r--r--  lib/Fuzzer/test/BogusInitializeTest.cpp | 15
-rw-r--r--  lib/Fuzzer/test/CMakeLists.txt | 107
-rw-r--r--  lib/Fuzzer/test/CustomCrossOverAndMutateTest.cpp | 34
-rw-r--r--  lib/Fuzzer/test/CxxStringEqTest.cpp | 25
-rw-r--r--  lib/Fuzzer/test/DSO1.cpp | 4
-rw-r--r--  lib/Fuzzer/test/DSO2.cpp | 4
-rw-r--r--  lib/Fuzzer/test/EquivalenceATest.cpp | 17
-rw-r--r--  lib/Fuzzer/test/EquivalenceBTest.cpp | 27
-rw-r--r--  lib/Fuzzer/test/FuzzerUnittest.cpp | 43
-rw-r--r--  lib/Fuzzer/test/LargeTest.cpp | 37
-rw-r--r--  lib/Fuzzer/test/LoadTest.cpp | 2
-rw-r--r--  lib/Fuzzer/test/Memcmp64BytesTest.cpp | 20
-rw-r--r--  lib/Fuzzer/test/NotinstrumentedTest.cpp (renamed from lib/Fuzzer/test/UninstrumentedTest.cpp) | 0
-rw-r--r--  lib/Fuzzer/test/OutOfMemorySingleLargeMallocTest.cpp | 2
-rw-r--r--  lib/Fuzzer/test/RepeatedMemcmp.cpp | 17
-rw-r--r--  lib/Fuzzer/test/SimpleCmpTest.cpp | 9
-rw-r--r--  lib/Fuzzer/test/SingleByteInputTest.cpp | 17
-rw-r--r--  lib/Fuzzer/test/SingleStrcmpTest.cpp | 12
-rw-r--r--  lib/Fuzzer/test/SingleStrncmpTest.cpp | 3
-rw-r--r--  lib/Fuzzer/test/SwapCmpTest.cpp | 3
-rw-r--r--  lib/Fuzzer/test/TableLookupTest.cpp | 45
-rw-r--r--  lib/Fuzzer/test/TwoDifferentBugsTest.cpp | 22
-rw-r--r--  lib/Fuzzer/test/afl-driver-extra-stats.test | 2
-rw-r--r--  lib/Fuzzer/test/afl-driver-stderr.test | 2
-rw-r--r--  lib/Fuzzer/test/bad-strcmp.test | 1
-rw-r--r--  lib/Fuzzer/test/coverage.test | 4
-rw-r--r--  lib/Fuzzer/test/cxxstring.test | 2
-rw-r--r--  lib/Fuzzer/test/disable-leaks.test | 4
-rw-r--r--  lib/Fuzzer/test/dump_coverage.test | 26
-rw-r--r--  lib/Fuzzer/test/equivalence-signals.test | 9
-rw-r--r--  lib/Fuzzer/test/equivalence.test | 8
-rw-r--r--  lib/Fuzzer/test/extra-counters.test | 6
-rw-r--r--  lib/Fuzzer/test/fuzzer-customcrossover.test | 2
-rw-r--r--  lib/Fuzzer/test/fuzzer-customcrossoverandmutate.test | 1
-rw-r--r--  lib/Fuzzer/test/fuzzer-dirs.test | 4
-rw-r--r--  lib/Fuzzer/test/fuzzer-jobs.test | 29
-rw-r--r--  lib/Fuzzer/test/fuzzer-leak.test | 2
-rw-r--r--  lib/Fuzzer/test/fuzzer-oom.test | 6
-rw-r--r--  lib/Fuzzer/test/fuzzer-segv.test | 2
-rw-r--r--  lib/Fuzzer/test/fuzzer-singleinputs.test | 2
-rw-r--r--  lib/Fuzzer/test/fuzzer-traces-hooks.test | 26
-rw-r--r--  lib/Fuzzer/test/fuzzer.test | 9
-rw-r--r--  lib/Fuzzer/test/lit.cfg | 26
-rw-r--r--  lib/Fuzzer/test/lit.site.cfg.in | 1
-rw-r--r--  lib/Fuzzer/test/merge-posix.test | 23
-rw-r--r--  lib/Fuzzer/test/merge-summary.test | 15
-rw-r--r--  lib/Fuzzer/test/merge.test | 27
-rw-r--r--  lib/Fuzzer/test/minimize_crash.test | 9
-rw-r--r--  lib/Fuzzer/test/minimize_two_crashes.test | 16
-rw-r--r--  lib/Fuzzer/test/no-coverage/CMakeLists.txt | 20
-rw-r--r--  lib/Fuzzer/test/trace-malloc-2.test | 8
-rw-r--r--  lib/Fuzzer/test/trace-malloc.test | 5
-rw-r--r--  lib/Fuzzer/test/trace-pc.test | 2
-rw-r--r--  lib/Fuzzer/test/trace-pc/CMakeLists.txt | 13
-rw-r--r--  lib/Fuzzer/test/ubsan/CMakeLists.txt | 3
-rw-r--r--  lib/Fuzzer/test/ulimit.test | 2
-rw-r--r--  lib/Fuzzer/test/uninstrumented/CMakeLists.txt | 3
-rw-r--r--  lib/Fuzzer/test/value-profile-div.test | 2
-rw-r--r--  lib/Fuzzer/test/value-profile-mem.test | 2
-rw-r--r--  lib/Fuzzer/test/value-profile-strcmp.test | 2
-rw-r--r--  lib/Fuzzer/test/value-profile-strncmp.test | 2
-rw-r--r--  lib/IR/AsmWriter.cpp | 89
-rw-r--r--  lib/IR/AttributeImpl.h | 147
-rw-r--r--  lib/IR/AttributeSetNode.h | 106
-rw-r--r--  lib/IR/Attributes.cpp | 750
-rw-r--r--  lib/IR/AutoUpgrade.cpp | 867
-rw-r--r--  lib/IR/BasicBlock.cpp | 62
-rw-r--r--  lib/IR/Comdat.cpp | 8
-rw-r--r--  lib/IR/ConstantFold.cpp | 25
-rw-r--r--  lib/IR/ConstantRange.cpp | 40
-rw-r--r--  lib/IR/Constants.cpp | 2
-rw-r--r--  lib/IR/Core.cpp | 68
-rw-r--r--  lib/IR/DIBuilder.cpp | 55
-rw-r--r--  lib/IR/DataLayout.cpp | 119
-rw-r--r--  lib/IR/DebugInfo.cpp | 88
-rw-r--r--  lib/IR/DebugInfoMetadata.cpp | 32
-rw-r--r--  lib/IR/DebugLoc.cpp | 4
-rw-r--r--  lib/IR/DiagnosticInfo.cpp | 186
-rw-r--r--  lib/IR/Dominators.cpp | 13
-rw-r--r--  lib/IR/Function.cpp | 225
-rw-r--r--  lib/IR/GCOV.cpp | 54
-rw-r--r--  lib/IR/Globals.cpp | 20
-rw-r--r--  lib/IR/IRBuilder.cpp | 14
-rw-r--r--  lib/IR/IRPrintingPasses.cpp | 6
-rw-r--r--  lib/IR/InlineAsm.cpp | 49
-rw-r--r--  lib/IR/Instruction.cpp | 122
-rw-r--r--  lib/IR/Instructions.cpp | 127
-rw-r--r--  lib/IR/IntrinsicInst.cpp | 32
-rw-r--r--  lib/IR/LLVMContext.cpp | 1
-rw-r--r--  lib/IR/LLVMContextImpl.cpp | 7
-rw-r--r--  lib/IR/LLVMContextImpl.h | 30
-rw-r--r--  lib/IR/MDBuilder.cpp | 13
-rw-r--r--  lib/IR/Mangler.cpp | 32
-rw-r--r--  lib/IR/Metadata.cpp | 73
-rw-r--r--  lib/IR/Module.cpp | 60
-rw-r--r--  lib/IR/Operator.cpp | 14
-rw-r--r--  lib/IR/OptBisect.cpp | 18
-rw-r--r--  lib/IR/Pass.cpp | 2
-rw-r--r--  lib/IR/PassManager.cpp | 2
-rw-r--r--  lib/IR/Statepoint.cpp | 7
-rw-r--r--  lib/IR/Type.cpp | 6
-rw-r--r--  lib/IR/TypeFinder.cpp | 13
-rw-r--r--  lib/IR/Value.cpp | 47
-rw-r--r--  lib/IR/ValueSymbolTable.cpp | 13
-rw-r--r--  lib/IR/Verifier.cpp | 532
-rw-r--r--  lib/LTO/CMakeLists.txt | 51
-rw-r--r--  lib/LTO/Caching.cpp | 82
-rw-r--r--  lib/LTO/LTO.cpp | 424
-rw-r--r--  lib/LTO/LTOBackend.cpp | 117
-rw-r--r--  lib/LTO/LTOCodeGenerator.cpp | 32
-rw-r--r--  lib/LTO/LTOModule.cpp | 11
-rw-r--r--  lib/LTO/ThinLTOCodeGenerator.cpp | 59
-rw-r--r--  lib/LTO/UpdateCompilerUsed.cpp | 2
-rw-r--r--  lib/LibDriver/LibDriver.cpp | 2
-rw-r--r--  lib/Linker/IRMover.cpp | 24
-rw-r--r--  lib/Linker/LinkModules.cpp | 105
-rw-r--r--  lib/MC/CMakeLists.txt | 5
-rw-r--r--  lib/MC/ConstantPools.cpp | 7
-rw-r--r--  lib/MC/ELFObjectWriter.cpp | 113
-rw-r--r--  lib/MC/MCAsmBackend.cpp | 13
-rw-r--r--  lib/MC/MCAsmInfo.cpp | 60
-rw-r--r--  lib/MC/MCAsmInfoCOFF.cpp | 17
-rw-r--r--  lib/MC/MCAsmInfoDarwin.cpp | 7
-rw-r--r--  lib/MC/MCAsmInfoELF.cpp | 6
-rw-r--r--  lib/MC/MCAsmInfoWasm.cpp | 27
-rw-r--r--  lib/MC/MCAsmStreamer.cpp | 45
-rw-r--r--  lib/MC/MCAssembler.cpp | 54
-rw-r--r--  lib/MC/MCCodeEmitter.cpp | 8
-rw-r--r--  lib/MC/MCContext.cpp | 204
-rw-r--r--  lib/MC/MCDisassembler/MCDisassembler.cpp | 7
-rw-r--r--  lib/MC/MCDisassembler/MCRelocationInfo.cpp | 11
-rw-r--r--  lib/MC/MCDisassembler/MCSymbolizer.cpp | 5
-rw-r--r--  lib/MC/MCDwarf.cpp | 60
-rw-r--r--  lib/MC/MCELFObjectTargetWriter.cpp | 3
-rw-r--r--  lib/MC/MCELFStreamer.cpp | 88
-rw-r--r--  lib/MC/MCExpr.cpp | 37
-rw-r--r--  lib/MC/MCFragment.cpp | 40
-rw-r--r--  lib/MC/MCInst.cpp | 5
-rw-r--r--  lib/MC/MCInstPrinter.cpp | 13
-rw-r--r--  lib/MC/MCInstrAnalysis.cpp | 7
-rw-r--r--  lib/MC/MCLabel.cpp | 6
-rw-r--r--  lib/MC/MCLinkerOptimizationHint.cpp | 10
-rw-r--r--  lib/MC/MCMachOStreamer.cpp | 48
-rw-r--r--  lib/MC/MCMachObjectTargetWriter.cpp | 4
-rw-r--r--  lib/MC/MCNullStreamer.cpp | 4
-rw-r--r--  lib/MC/MCObjectFileInfo.cpp | 101
-rw-r--r--  lib/MC/MCObjectStreamer.cpp | 29
-rw-r--r--  lib/MC/MCObjectWriter.cpp | 6
-rw-r--r--  lib/MC/MCParser/AsmLexer.cpp | 10
-rw-r--r--  lib/MC/MCParser/AsmParser.cpp | 95
-rw-r--r--  lib/MC/MCParser/COFFAsmParser.cpp | 32
-rw-r--r--  lib/MC/MCParser/DarwinAsmParser.cpp | 66
-rw-r--r--  lib/MC/MCParser/ELFAsmParser.cpp | 233
-rw-r--r--  lib/MC/MCParser/MCAsmLexer.cpp | 12
-rw-r--r--  lib/MC/MCParser/MCAsmParser.cpp | 23
-rw-r--r--  lib/MC/MCParser/MCAsmParserExtension.cpp | 10
-rw-r--r--  lib/MC/MCParser/MCTargetAsmParser.cpp | 13
-rw-r--r--  lib/MC/MCRegisterInfo.cpp | 9
-rw-r--r--  lib/MC/MCSection.cpp | 22
-rw-r--r--  lib/MC/MCSectionCOFF.cpp | 11
-rw-r--r--  lib/MC/MCSectionELF.cpp | 45
-rw-r--r--  lib/MC/MCSectionMachO.cpp | 84
-rw-r--r--  lib/MC/MCSectionWasm.cpp | 97
-rw-r--r--  lib/MC/MCStreamer.cpp | 51
-rw-r--r--  lib/MC/MCSubtargetInfo.cpp | 9
-rw-r--r--  lib/MC/MCSymbol.cpp | 14
-rw-r--r--  lib/MC/MCSymbolELF.cpp | 4
-rw-r--r--  lib/MC/MCTargetOptions.cpp | 15
-rw-r--r--  lib/MC/MCValue.cpp | 2
-rw-r--r--  lib/MC/MCWasmObjectTargetWriter.cpp | 27
-rw-r--r--  lib/MC/MCWasmStreamer.cpp | 216
-rw-r--r--  lib/MC/MachObjectWriter.cpp | 17
-rw-r--r--  lib/MC/StringTableBuilder.cpp | 18
-rw-r--r--  lib/MC/SubtargetFeature.cpp | 117
-rw-r--r--  lib/MC/WasmObjectWriter.cpp | 1149
-rw-r--r--  lib/MC/WinCOFFObjectWriter.cpp | 759
-rw-r--r--  lib/MC/WinCOFFStreamer.cpp | 31
-rw-r--r--  lib/Object/ArchiveWriter.cpp | 73
-rw-r--r--  lib/Object/CMakeLists.txt | 1
-rw-r--r--  lib/Object/Decompressor.cpp | 5
-rw-r--r--  lib/Object/ELFObjectFile.cpp | 262
-rw-r--r--  lib/Object/IRSymtab.cpp | 231
-rw-r--r--  lib/Object/MachOObjectFile.cpp | 856
-rw-r--r--  lib/Object/ModuleSummaryIndexObjectFile.cpp | 9
-rw-r--r--  lib/Object/ModuleSymbolTable.cpp | 97
-rw-r--r--  lib/Object/RecordStreamer.cpp | 11
-rw-r--r--  lib/Object/RecordStreamer.h | 23
-rw-r--r--  lib/Object/WasmObjectFile.cpp | 664
-rw-r--r--  lib/ObjectYAML/CMakeLists.txt | 7
-rw-r--r--  lib/ObjectYAML/DWARFEmitter.cpp | 321
-rw-r--r--  lib/ObjectYAML/DWARFVisitor.cpp | 178
-rw-r--r--  lib/ObjectYAML/DWARFVisitor.h | 97
-rw-r--r--  lib/ObjectYAML/DWARFYAML.cpp | 15
-rw-r--r--  lib/ObjectYAML/ELFYAML.cpp | 834
-rw-r--r--  lib/ObjectYAML/MachOYAML.cpp | 29
-rw-r--r--  lib/ObjectYAML/ObjectYAML.cpp | 3
-rw-r--r--  lib/ObjectYAML/WasmYAML.cpp | 357
-rw-r--r--  lib/Option/Arg.cpp | 2
-rw-r--r--  lib/Option/ArgList.cpp | 228
-rw-r--r--  lib/Option/Option.cpp | 2
-rw-r--r--  lib/Passes/PassBuilder.cpp | 344
-rw-r--r--  lib/Passes/PassRegistry.def | 8
-rw-r--r--  lib/ProfileData/Coverage/CoverageMapping.cpp | 55
-rw-r--r--  lib/ProfileData/Coverage/CoverageMappingReader.cpp | 74
-rw-r--r--  lib/ProfileData/Coverage/CoverageMappingWriter.cpp | 35
-rw-r--r--  lib/ProfileData/InstrProf.cpp | 174
-rw-r--r--  lib/ProfileData/InstrProfReader.cpp | 49
-rw-r--r--  lib/ProfileData/InstrProfWriter.cpp | 42
-rw-r--r--  lib/ProfileData/SampleProf.cpp | 34
-rw-r--r--  lib/ProfileData/SampleProfReader.cpp | 31
-rw-r--r--  lib/ProfileData/SampleProfWriter.cpp | 70
-rw-r--r--  lib/Support/APFloat.cpp | 622
-rw-r--r--  lib/Support/APInt.cpp | 938
-rw-r--r--  lib/Support/ARMAttributeParser.cpp | 708
-rw-r--r--  lib/Support/BinaryStreamError.cpp | 56
-rw-r--r--  lib/Support/BinaryStreamReader.cpp | 95
-rw-r--r--  lib/Support/BinaryStreamWriter.cpp | 68
-rw-r--r--  lib/Support/BranchProbability.cpp | 2
-rw-r--r--  lib/Support/CMakeLists.txt | 13
-rw-r--r--  lib/Support/CachePruning.cpp | 97
-rw-r--r--  lib/Support/Chrono.cpp | 7
-rw-r--r--  lib/Support/CommandLine.cpp | 66
-rw-r--r--  lib/Support/Compression.cpp | 83
-rw-r--r--  lib/Support/DebugCounter.cpp | 108
-rw-r--r--  lib/Support/Dwarf.cpp | 11
-rw-r--r--  lib/Support/DynamicLibrary.cpp | 21
-rw-r--r--  lib/Support/FileOutputBuffer.cpp | 2
-rw-r--r--  lib/Support/Host.cpp | 447
-rw-r--r--  lib/Support/LockFileManager.cpp | 6
-rw-r--r--  lib/Support/LowLevelType.cpp | 47
-rw-r--r--  lib/Support/MD5.cpp | 25
-rw-r--r--  lib/Support/ManagedStatic.cpp | 2
-rw-r--r--  lib/Support/MemoryBuffer.cpp | 34
-rw-r--r--  lib/Support/Path.cpp | 485
-rw-r--r--  lib/Support/RWMutex.cpp | 19
-rw-r--r--  lib/Support/Signals.cpp | 1
-rw-r--r--  lib/Support/SourceMgr.cpp | 27
-rw-r--r--  lib/Support/StringRef.cpp | 13
-rw-r--r--  lib/Support/TargetParser.cpp | 4
-rw-r--r--  lib/Support/Threading.cpp | 123
-rw-r--r--  lib/Support/Timer.cpp | 19
-rw-r--r--  lib/Support/Triple.cpp | 5
-rw-r--r--  lib/Support/Twine.cpp | 4
-rw-r--r--  lib/Support/Unix/Path.inc | 273
-rw-r--r--  lib/Support/Unix/Signals.inc | 18
-rw-r--r--  lib/Support/Unix/Threading.inc | 215
-rw-r--r--  lib/Support/Windows/DynamicLibrary.inc | 34
-rw-r--r--  lib/Support/Windows/Mutex.inc | 11
-rw-r--r--  lib/Support/Windows/Path.inc | 306
-rw-r--r--  lib/Support/Windows/Process.inc | 1
-rw-r--r--  lib/Support/Windows/Program.inc | 1
-rw-r--r--  lib/Support/Windows/RWMutex.inc | 13
-rw-r--r--  lib/Support/Windows/Signals.inc | 2
-rw-r--r--  lib/Support/Windows/ThreadLocal.inc | 11
-rw-r--r--  lib/Support/Windows/Threading.inc | 109
-rw-r--r--  lib/Support/YAMLTraits.cpp | 17
-rw-r--r--  lib/Support/raw_ostream.cpp | 10
-rw-r--r--  lib/TableGen/Record.cpp | 10
-rw-r--r--  lib/TableGen/TGParser.cpp | 4
-rw-r--r--  lib/Target/AArch64/AArch64.h | 6
-rw-r--r--  lib/Target/AArch64/AArch64.td | 116
-rw-r--r--  lib/Target/AArch64/AArch64A57FPLoadBalancing.cpp | 49
-rw-r--r--  lib/Target/AArch64/AArch64AddressTypePromotion.cpp | 23
-rw-r--r--  lib/Target/AArch64/AArch64CallLowering.cpp | 182
-rw-r--r--  lib/Target/AArch64/AArch64CallLowering.h | 27
-rw-r--r--  lib/Target/AArch64/AArch64ConditionOptimizer.cpp | 8
-rw-r--r--  lib/Target/AArch64/AArch64ConditionalCompares.cpp | 11
-rw-r--r--  lib/Target/AArch64/AArch64ExpandPseudoInsts.cpp | 65
-rw-r--r--  lib/Target/AArch64/AArch64FastISel.cpp | 102
-rw-r--r--  lib/Target/AArch64/AArch64FrameLowering.cpp | 45
-rw-r--r--  lib/Target/AArch64/AArch64GenRegisterBankInfo.def | 419
-rw-r--r--  lib/Target/AArch64/AArch64ISelDAGToDAG.cpp | 47
-rw-r--r--  lib/Target/AArch64/AArch64ISelLowering.cpp | 338
-rw-r--r--  lib/Target/AArch64/AArch64ISelLowering.h | 21
-rw-r--r--  lib/Target/AArch64/AArch64InstrFormats.td | 69
-rw-r--r--  lib/Target/AArch64/AArch64InstrInfo.cpp | 391
-rw-r--r--  lib/Target/AArch64/AArch64InstrInfo.h | 39
-rw-r--r--  lib/Target/AArch64/AArch64InstrInfo.td | 49
-rw-r--r--  lib/Target/AArch64/AArch64InstructionSelector.cpp | 337
-rw-r--r--  lib/Target/AArch64/AArch64InstructionSelector.h | 49
-rw-r--r--  lib/Target/AArch64/AArch64LegalizerInfo.cpp | 133
-rw-r--r--  lib/Target/AArch64/AArch64LegalizerInfo.h | 7
-rw-r--r--  lib/Target/AArch64/AArch64LoadStoreOptimizer.cpp | 73
-rw-r--r--  lib/Target/AArch64/AArch64MacroFusion.cpp | 272
-rw-r--r--  lib/Target/AArch64/AArch64MacroFusion.h | 29
-rw-r--r--  lib/Target/AArch64/AArch64RedundantCopyElimination.cpp | 358
-rw-r--r--  lib/Target/AArch64/AArch64RegisterBankInfo.cpp | 213
-rw-r--r--  lib/Target/AArch64/AArch64RegisterBankInfo.h | 77
-rw-r--r--  lib/Target/AArch64/AArch64RegisterBanks.td | 20
-rw-r--r--  lib/Target/AArch64/AArch64RegisterInfo.cpp | 16
-rw-r--r--  lib/Target/AArch64/AArch64SchedA53.td | 2
-rw-r--r--  lib/Target/AArch64/AArch64SchedA57.td | 4
-rw-r--r--  lib/Target/AArch64/AArch64SchedFalkor.td | 106
-rw-r--r--  lib/Target/AArch64/AArch64SchedFalkorDetails.td | 523
-rw-r--r--  lib/Target/AArch64/AArch64SchedFalkorWriteRes.td | 361
-rw-r--r--  lib/Target/AArch64/AArch64SchedKryoDetails.td | 62
-rw-r--r--  lib/Target/AArch64/AArch64SchedM1.td | 3
-rw-r--r--  lib/Target/AArch64/AArch64SchedThunderX.td | 352
-rw-r--r--  lib/Target/AArch64/AArch64SchedThunderX2T99.td (renamed from lib/Target/AArch64/AArch64SchedVulcan.td) | 354
-rw-r--r--  lib/Target/AArch64/AArch64SelectionDAGInfo.cpp | 14
-rw-r--r--  lib/Target/AArch64/AArch64Subtarget.cpp | 22
-rw-r--r--  lib/Target/AArch64/AArch64Subtarget.h | 33
-rw-r--r--  lib/Target/AArch64/AArch64SystemOperands.td | 134
-rw-r--r--  lib/Target/AArch64/AArch64TargetMachine.cpp | 36
-rw-r--r--  lib/Target/AArch64/AArch64TargetMachine.h | 2
-rw-r--r--  lib/Target/AArch64/AArch64TargetTransformInfo.cpp | 56
-rw-r--r--  lib/Target/AArch64/AArch64TargetTransformInfo.h | 16
-rw-r--r--  lib/Target/AArch64/AArch64VectorByElementOpt.cpp | 25
-rw-r--r--  lib/Target/AArch64/AsmParser/AArch64AsmParser.cpp | 475
-rw-r--r--  lib/Target/AArch64/CMakeLists.txt | 2
-rw-r--r--  lib/Target/AArch64/InstPrinter/AArch64InstPrinter.cpp | 281
-rw-r--r--  lib/Target/AArch64/InstPrinter/AArch64InstPrinter.h | 10
-rw-r--r--  lib/Target/AArch64/MCTargetDesc/AArch64AsmBackend.cpp | 92
-rw-r--r--  lib/Target/AArch64/MCTargetDesc/AArch64ELFStreamer.cpp | 23
-rw-r--r--  lib/Target/AArch64/MCTargetDesc/AArch64MCTargetDesc.cpp | 11
-rw-r--r--  lib/Target/AArch64/MCTargetDesc/AArch64MachObjectWriter.cpp | 30
-rw-r--r--  lib/Target/AArch64/Utils/AArch64BaseInfo.h | 78
-rw-r--r--  lib/Target/AMDGPU/AMDGPU.h | 95
-rw-r--r--  lib/Target/AMDGPU/AMDGPU.td | 108
-rw-r--r--  lib/Target/AMDGPU/AMDGPUAliasAnalysis.cpp | 147
-rw-r--r--  lib/Target/AMDGPU/AMDGPUAliasAnalysis.h | 102
-rw-r--r--  lib/Target/AMDGPU/AMDGPUAlwaysInlinePass.cpp | 14
-rw-r--r--  lib/Target/AMDGPU/AMDGPUAnnotateKernelFeatures.cpp | 52
-rw-r--r--  lib/Target/AMDGPU/AMDGPUAnnotateUniformValues.cpp | 6
-rw-r--r--  lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp | 212
-rw-r--r--  lib/Target/AMDGPU/AMDGPUAsmPrinter.h | 123
-rw-r--r--  lib/Target/AMDGPU/AMDGPUCallLowering.cpp | 134
-rw-r--r--  lib/Target/AMDGPU/AMDGPUCallLowering.h | 10
-rw-r--r--  lib/Target/AMDGPU/AMDGPUCallingConv.td | 30
-rw-r--r--  lib/Target/AMDGPU/AMDGPUCodeGenPrepare.cpp | 120
-rw-r--r--  lib/Target/AMDGPU/AMDGPUFrameLowering.cpp | 36
-rw-r--r--  lib/Target/AMDGPU/AMDGPUFrameLowering.h | 3
-rw-r--r--  lib/Target/AMDGPU/AMDGPUGenRegisterBankInfo.def | 62
-rw-r--r--  lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp | 151
-rw-r--r--  lib/Target/AMDGPU/AMDGPUISelLowering.cpp | 446
-rw-r--r--  lib/Target/AMDGPU/AMDGPUISelLowering.h | 62
-rw-r--r--  lib/Target/AMDGPU/AMDGPUInstrInfo.cpp | 3
-rw-r--r--  lib/Target/AMDGPU/AMDGPUInstrInfo.h | 4
-rw-r--r--  lib/Target/AMDGPU/AMDGPUInstrInfo.td | 74
-rw-r--r--  lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp | 424
-rw-r--r--  lib/Target/AMDGPU/AMDGPUInstructionSelector.h | 67
-rw-r--r--  lib/Target/AMDGPU/AMDGPUInstructions.td | 138
-rw-r--r--  lib/Target/AMDGPU/AMDGPUIntrinsicInfo.cpp | 13
-rw-r--r--  lib/Target/AMDGPU/AMDGPUIntrinsics.td | 17
-rw-r--r--  lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp | 62
-rw-r--r--  lib/Target/AMDGPU/AMDGPULegalizerInfo.h | 30
-rw-r--r--  lib/Target/AMDGPU/AMDGPULowerIntrinsics.cpp | 160
-rw-r--r--  lib/Target/AMDGPU/AMDGPUMCInstLower.cpp | 33
-rw-r--r--  lib/Target/AMDGPU/AMDGPUMachineFunction.cpp | 18
-rw-r--r--  lib/Target/AMDGPU/AMDGPUMachineFunction.h | 14
-rw-r--r--  lib/Target/AMDGPU/AMDGPUPTNote.h | 5
-rw-r--r--  lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp | 135
-rw-r--r--  lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp | 230
-rw-r--r--  lib/Target/AMDGPU/AMDGPURegisterBankInfo.h | 65
-rw-r--r--  lib/Target/AMDGPU/AMDGPURegisterBanks.td | 16
-rw-r--r--  lib/Target/AMDGPU/AMDGPURegisterInfo.h | 3
-rw-r--r--  lib/Target/AMDGPU/AMDGPURuntimeMetadata.h | 193
-rw-r--r--  lib/Target/AMDGPU/AMDGPUSubtarget.cpp | 253
-rw-r--r--  lib/Target/AMDGPU/AMDGPUSubtarget.h | 293
-rw-r--r--  lib/Target/AMDGPU/AMDGPUTargetMachine.cpp | 230
-rw-r--r--  lib/Target/AMDGPU/AMDGPUTargetMachine.h | 14
-rw-r--r--  lib/Target/AMDGPU/AMDGPUTargetObjectFile.cpp | 4
-rw-r--r--  lib/Target/AMDGPU/AMDGPUTargetObjectFile.h | 1
-rw-r--r--  lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp | 241
-rw-r--r--  lib/Target/AMDGPU/AMDGPUTargetTransformInfo.h | 24
-rw-r--r--  lib/Target/AMDGPU/AMDGPUUnifyDivergentExitNodes.cpp | 225
-rw-r--r--  lib/Target/AMDGPU/AMDGPUUnifyMetadata.cpp | 27
-rw-r--r--  lib/Target/AMDGPU/AMDILCFGStructurizer.cpp | 105
-rw-r--r--  lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp | 856
-rw-r--r--  lib/Target/AMDGPU/BUFInstructions.td | 10
-rw-r--r--  lib/Target/AMDGPU/CMakeLists.txt | 15
-rw-r--r--  lib/Target/AMDGPU/DSInstructions.td | 174
-rw-r--r--  lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.cpp | 39
-rw-r--r--  lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.h | 2
-rw-r--r--  lib/Target/AMDGPU/EvergreenInstructions.td | 119
-rw-r--r--  lib/Target/AMDGPU/FLATInstructions.td | 12
-rw-r--r--  lib/Target/AMDGPU/GCNHazardRecognizer.cpp | 109
-rw-r--r--  lib/Target/AMDGPU/GCNHazardRecognizer.h | 3
-rw-r--r--  lib/Target/AMDGPU/GCNIterativeScheduler.cpp | 528
-rw-r--r--  lib/Target/AMDGPU/GCNIterativeScheduler.h | 118
-rw-r--r--  lib/Target/AMDGPU/GCNMinRegStrategy.cpp | 266
-rw-r--r--  lib/Target/AMDGPU/GCNRegPressure.cpp | 355
-rw-r--r--  lib/Target/AMDGPU/GCNRegPressure.h | 170
-rw-r--r--  lib/Target/AMDGPU/GCNSchedStrategy.cpp | 344
-rw-r--r--  lib/Target/AMDGPU/GCNSchedStrategy.h | 62
-rw-r--r--  lib/Target/AMDGPU/InstPrinter/AMDGPUInstPrinter.cpp | 129
-rw-r--r--  lib/Target/AMDGPU/InstPrinter/AMDGPUInstPrinter.h | 10
-rw-r--r--  lib/Target/AMDGPU/LLVMBuild.txt | 2
-rw-r--r--  lib/Target/AMDGPU/MCTargetDesc/AMDGPUAsmBackend.cpp | 19
-rw-r--r--  lib/Target/AMDGPU/MCTargetDesc/AMDGPUCodeObjectMetadata.h | 422
-rw-r--r--  lib/Target/AMDGPU/MCTargetDesc/AMDGPUCodeObjectMetadataStreamer.cpp | 625
-rw-r--r--  lib/Target/AMDGPU/MCTargetDesc/AMDGPUCodeObjectMetadataStreamer.h | 99
-rw-r--r--  lib/Target/AMDGPU/MCTargetDesc/AMDGPUELFObjectWriter.cpp | 14
-rw-r--r--  lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCTargetDesc.h | 6
-rw-r--r--  lib/Target/AMDGPU/MCTargetDesc/AMDGPURuntimeMD.cpp | 408
-rw-r--r--  lib/Target/AMDGPU/MCTargetDesc/AMDGPURuntimeMD.h | 26
-rw-r--r--  lib/Target/AMDGPU/MCTargetDesc/AMDGPUTargetStreamer.cpp | 72
-rw-r--r--  lib/Target/AMDGPU/MCTargetDesc/AMDGPUTargetStreamer.h | 32
-rw-r--r--  lib/Target/AMDGPU/MCTargetDesc/CMakeLists.txt | 7
-rw-r--r--  lib/Target/AMDGPU/MCTargetDesc/SIMCCodeEmitter.cpp | 30
-rw-r--r--  lib/Target/AMDGPU/MIMGInstructions.td | 100
-rw-r--r--  lib/Target/AMDGPU/Processors.td | 7
-rw-r--r--  lib/Target/AMDGPU/R600ControlFlowFinalizer.cpp | 48
-rw-r--r--  lib/Target/AMDGPU/R600EmitClauseMarkers.cpp | 49
-rw-r--r--  lib/Target/AMDGPU/R600FrameLowering.cpp | 36
-rw-r--r--  lib/Target/AMDGPU/R600FrameLowering.h | 2
-rw-r--r--  lib/Target/AMDGPU/R600ISelLowering.cpp | 122
-rw-r--r--  lib/Target/AMDGPU/R600InstrInfo.cpp | 64
-rw-r--r--  lib/Target/AMDGPU/R600InstrInfo.h | 6
-rw-r--r--  lib/Target/AMDGPU/R600Instructions.td | 10
-rw-r--r--  lib/Target/AMDGPU/SIAnnotateControlFlow.cpp | 165
-rw-r--r--  lib/Target/AMDGPU/SIDefines.h | 33
-rw-r--r--  lib/Target/AMDGPU/SIFixSGPRCopies.cpp | 35
-rw-r--r--  lib/Target/AMDGPU/SIFixVGPRCopies.cpp | 72
-rw-r--r--  lib/Target/AMDGPU/SIFoldOperands.cpp | 275
-rw-r--r--  lib/Target/AMDGPU/SIFrameLowering.cpp | 138
-rw-r--r--  lib/Target/AMDGPU/SIFrameLowering.h | 5
-rw-r--r--  lib/Target/AMDGPU/SIISelLowering.cpp | 1780
-rw-r--r--  lib/Target/AMDGPU/SIISelLowering.h | 44
-rw-r--r--  lib/Target/AMDGPU/SIInsertSkips.cpp | 66
-rw-r--r--  lib/Target/AMDGPU/SIInsertWaitcnts.cpp | 1863
-rw-r--r--  lib/Target/AMDGPU/SIInsertWaits.cpp | 104
-rw-r--r--  lib/Target/AMDGPU/SIInstrFormats.td | 19
-rw-r--r--  lib/Target/AMDGPU/SIInstrInfo.cpp | 455
-rw-r--r--  lib/Target/AMDGPU/SIInstrInfo.h | 86
-rw-r--r--  lib/Target/AMDGPU/SIInstrInfo.td | 332
-rw-r--r--  lib/Target/AMDGPU/SIInstructions.td | 294
-rw-r--r--  lib/Target/AMDGPU/SIIntrinsics.td | 158
-rw-r--r--  lib/Target/AMDGPU/SILoadStoreOptimizer.cpp | 376
-rw-r--r--  lib/Target/AMDGPU/SILowerControlFlow.cpp | 67
-rw-r--r--  lib/Target/AMDGPU/SILowerI1Copies.cpp | 24
-rw-r--r--  lib/Target/AMDGPU/SIMachineFunctionInfo.cpp | 135
-rw-r--r--  lib/Target/AMDGPU/SIMachineFunctionInfo.h | 56
-rw-r--r--  lib/Target/AMDGPU/SIMachineScheduler.cpp | 251
-rw-r--r--  lib/Target/AMDGPU/SIMachineScheduler.h | 44
-rw-r--r--  lib/Target/AMDGPU/SIPeepholeSDWA.cpp | 713
-rw-r--r--  lib/Target/AMDGPU/SIRegisterInfo.cpp | 414
-rw-r--r--  lib/Target/AMDGPU/SIRegisterInfo.h | 106
-rw-r--r--  lib/Target/AMDGPU/SIRegisterInfo.td | 37
-rw-r--r--  lib/Target/AMDGPU/SISchedule.td | 5
-rw-r--r--  lib/Target/AMDGPU/SIShrinkInstructions.cpp | 8
-rw-r--r--  lib/Target/AMDGPU/SMInstructions.td | 10
-rw-r--r--  lib/Target/AMDGPU/SOPInstructions.td | 66
-rw-r--r--  lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp | 448
-rw-r--r--  lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h | 217
-rw-r--r--  lib/Target/AMDGPU/Utils/AMDKernelCodeTInfo.h | 2
-rw-r--r--  lib/Target/AMDGPU/VOP1Instructions.td | 107
-rw-r--r--  lib/Target/AMDGPU/VOP2Instructions.td | 40
-rw-r--r--  lib/Target/AMDGPU/VOP3Instructions.td | 92
-rw-r--r--  lib/Target/AMDGPU/VOP3PInstructions.td | 82
-rw-r--r--  lib/Target/AMDGPU/VOPCInstructions.td | 8
-rw-r--r--  lib/Target/AMDGPU/VOPInstructions.td | 80
-rw-r--r--  lib/Target/ARM/A15SDOptimizer.cpp | 24
-rw-r--r--  lib/Target/ARM/ARM.h | 13
-rw-r--r--  lib/Target/ARM/ARM.td | 64
-rw-r--r--  lib/Target/ARM/ARMAsmPrinter.cpp | 15
-rw-r--r--  lib/Target/ARM/ARMBaseInstrInfo.cpp | 832
-rw-r--r--  lib/Target/ARM/ARMBaseInstrInfo.h | 52
-rw-r--r--  lib/Target/ARM/ARMBaseRegisterInfo.cpp | 43
-rw-r--r--  lib/Target/ARM/ARMBaseRegisterInfo.h | 17
-rw-r--r--  lib/Target/ARM/ARMBasicBlockInfo.h | 21
-rw-r--r--  lib/Target/ARM/ARMCallLowering.cpp | 323
-rw-r--r--  lib/Target/ARM/ARMCallLowering.h | 10
-rw-r--r--  lib/Target/ARM/ARMComputeBlockSize.cpp | 10
-rw-r--r--  lib/Target/ARM/ARMConstantIslandPass.cpp | 175
-rw-r--r--  lib/Target/ARM/ARMConstantPoolValue.cpp | 10
-rw-r--r--  lib/Target/ARM/ARMConstantPoolValue.h | 16
-rw-r--r--  lib/Target/ARM/ARMExpandPseudoInsts.cpp | 255
-rw-r--r--  lib/Target/ARM/ARMFastISel.cpp | 172
-rw-r--r--  lib/Target/ARM/ARMFeatures.h | 4
-rw-r--r--  lib/Target/ARM/ARMFrameLowering.cpp | 471
-rw-r--r--  lib/Target/ARM/ARMISelDAGToDAG.cpp | 278
-rw-r--r--  lib/Target/ARM/ARMISelLowering.cpp | 1520
-rw-r--r--  lib/Target/ARM/ARMISelLowering.h | 29
-rw-r--r--  lib/Target/ARM/ARMInstrFormats.td | 24
-rw-r--r--  lib/Target/ARM/ARMInstrInfo.cpp | 9
-rw-r--r--  lib/Target/ARM/ARMInstrInfo.td | 364
-rw-r--r--  lib/Target/ARM/ARMInstrNEON.td | 11
-rw-r--r--  lib/Target/ARM/ARMInstrThumb.td | 141
-rw-r--r--  lib/Target/ARM/ARMInstrThumb2.td | 562
-rw-r--r--  lib/Target/ARM/ARMInstrVFP.td | 280
-rw-r--r--  lib/Target/ARM/ARMInstructionSelector.cpp | 270
-rw-r--r--  lib/Target/ARM/ARMInstructionSelector.h | 13
-rw-r--r--  lib/Target/ARM/ARMLegalizerInfo.cpp | 40
-rw-r--r--  lib/Target/ARM/ARMLegalizerInfo.h | 4
-rw-r--r--  lib/Target/ARM/ARMLoadStoreOptimizer.cpp | 152
-rw-r--r--  lib/Target/ARM/ARMMCInstLower.cpp | 26
-rw-r--r--  lib/Target/ARM/ARMMachineFunctionInfo.cpp | 9
-rw-r--r--  lib/Target/ARM/ARMMachineFunctionInfo.h | 69
-rw-r--r--  lib/Target/ARM/ARMRegisterBankInfo.cpp | 208
-rw-r--r--  lib/Target/ARM/ARMRegisterBankInfo.h | 13
-rw-r--r--  lib/Target/ARM/ARMRegisterBanks.td | 14
-rw-r--r--  lib/Target/ARM/ARMSchedule.td | 58
-rw-r--r--  lib/Target/ARM/ARMScheduleA9.td | 43
-rw-r--r--  lib/Target/ARM/ARMScheduleR52.td | 104
-rw-r--r--  lib/Target/ARM/ARMScheduleSwift.td | 52
-rw-r--r--  lib/Target/ARM/ARMSelectionDAGInfo.cpp | 39
-rw-r--r--  lib/Target/ARM/ARMSubtarget.cpp | 37
-rw-r--r--  lib/Target/ARM/ARMSubtarget.h | 108
-rw-r--r--  lib/Target/ARM/ARMTargetMachine.cpp | 101
-rw-r--r--  lib/Target/ARM/ARMTargetMachine.h | 20
-rw-r--r--  lib/Target/ARM/ARMTargetObjectFile.cpp | 11
-rw-r--r--  lib/Target/ARM/ARMTargetObjectFile.h | 12
-rw-r--r--  lib/Target/ARM/ARMTargetTransformInfo.cpp | 20
-rw-r--r--  lib/Target/ARM/ARMTargetTransformInfo.h | 12
-rw-r--r--  lib/Target/ARM/AsmParser/ARMAsmParser.cpp | 387
-rw-r--r--  lib/Target/ARM/CMakeLists.txt | 1
-rw-r--r--  lib/Target/ARM/Disassembler/ARMDisassembler.cpp | 139
-rw-r--r--  lib/Target/ARM/InstPrinter/ARMInstPrinter.cpp | 9
-rw-r--r--  lib/Target/ARM/InstPrinter/ARMInstPrinter.h | 2
-rw-r--r--  lib/Target/ARM/MCTargetDesc/ARMAsmBackend.cpp | 68
-rw-r--r--  lib/Target/ARM/MCTargetDesc/ARMAsmBackend.h | 4
-rw-r--r--  lib/Target/ARM/MCTargetDesc/ARMBaseInfo.h | 6
-rw-r--r--  lib/Target/ARM/MCTargetDesc/ARMELFObjectWriter.cpp | 52
-rw-r--r--  lib/Target/ARM/MCTargetDesc/ARMELFStreamer.cpp | 188
-rw-r--r--  lib/Target/ARM/MCTargetDesc/ARMMCCodeEmitter.cpp | 53
-rw-r--r--  lib/Target/ARM/MCTargetDesc/ARMMCTargetDesc.cpp | 30
-rw-r--r--  lib/Target/ARM/MCTargetDesc/ARMMachORelocationInfo.cpp | 13
-rw-r--r--  lib/Target/ARM/MCTargetDesc/ARMTargetStreamer.cpp | 8
-rw-r--r--  lib/Target/ARM/MCTargetDesc/ARMUnwindOpAsm.cpp | 21
-rw-r--r--  lib/Target/ARM/MCTargetDesc/ARMUnwindOpAsm.h | 21
-rw-r--r--  lib/Target/ARM/MCTargetDesc/ARMWinCOFFObjectWriter.cpp | 15
-rw-r--r--  lib/Target/ARM/Thumb1FrameLowering.cpp | 141
-rw-r--r--  lib/Target/ARM/Thumb1InstrInfo.cpp | 47
-rw-r--r--  lib/Target/ARM/Thumb2ITBlockPass.cpp | 21
-rw-r--r--  lib/Target/ARM/Thumb2InstrInfo.cpp | 71
-rw-r--r--  lib/Target/ARM/Thumb2SizeReduction.cpp | 90
-rw-r--r--  lib/Target/ARM/ThumbRegisterInfo.cpp | 48
-rw-r--r--  lib/Target/AVR/AVRAsmPrinter.cpp | 3
-rw-r--r--  lib/Target/AVR/AVRExpandPseudoInsts.cpp | 17
-rw-r--r--  lib/Target/AVR/AVRISelLowering.cpp | 31
-rw-r--r--  lib/Target/AVR/AVRInstrInfo.td | 4
-rw-r--r--  lib/Target/AVR/AVRInstrumentFunctions.cpp | 2
-rw-r--r--  lib/Target/AVR/AVRMCInstLower.cpp | 2
-rw-r--r--  lib/Target/AVR/MCTargetDesc/AVRAsmBackend.cpp | 2
-rw-r--r--  lib/Target/AVR/MCTargetDesc/AVRAsmBackend.h | 2
-rw-r--r--  lib/Target/AVR/MCTargetDesc/AVRELFStreamer.cpp | 2
-rw-r--r--  lib/Target/AVR/MCTargetDesc/AVRMCAsmInfo.cpp | 1
-rw-r--r--  lib/Target/AVR/MCTargetDesc/AVRMCCodeEmitter.cpp | 1
-rw-r--r--  lib/Target/AVR/MCTargetDesc/AVRMCCodeEmitter.h | 2
-rw-r--r--  lib/Target/BPF/BPFISelDAGToDAG.cpp | 6
-rw-r--r--  lib/Target/BPF/BPFISelLowering.cpp | 18
-rw-r--r--  lib/Target/BPF/BPFInstrInfo.td | 1
-rw-r--r--  lib/Target/BPF/BPFMCInstLower.cpp | 10
-rw-r--r--  lib/Target/BPF/BPFMCInstLower.h | 1
-rw-r--r--  lib/Target/BPF/BPFRegisterInfo.cpp | 28
-rw-r--r--  lib/Target/BPF/MCTargetDesc/BPFAsmBackend.cpp | 6
-rw-r--r--  lib/Target/CMakeLists.txt | 6
-rw-r--r--  lib/Target/Hexagon/AsmParser/HexagonAsmParser.cpp | 367
-rw-r--r--  lib/Target/Hexagon/BitTracker.cpp | 16
-rw-r--r--  lib/Target/Hexagon/BitTracker.h | 3
-rw-r--r--  lib/Target/Hexagon/CMakeLists.txt | 5
-rw-r--r--  lib/Target/Hexagon/Disassembler/HexagonDisassembler.cpp | 1332
-rw-r--r--  lib/Target/Hexagon/Hexagon.td | 40
-rw-r--r--  lib/Target/Hexagon/HexagonAsmPrinter.cpp | 238
-rw-r--r--  lib/Target/Hexagon/HexagonBitSimplify.cpp | 389
-rw-r--r--  lib/Target/Hexagon/HexagonBitTracker.cpp | 16
-rw-r--r--  lib/Target/Hexagon/HexagonBlockRanges.cpp | 77
-rw-r--r--  lib/Target/Hexagon/HexagonCallingConv.td | 35
-rw-r--r--  lib/Target/Hexagon/HexagonCommonGEP.cpp | 14
-rw-r--r--  lib/Target/Hexagon/HexagonCopyToCombine.cpp | 28
-rw-r--r--  lib/Target/Hexagon/HexagonDepArch.h | 10
-rw-r--r--  lib/Target/Hexagon/HexagonDepArch.td | 19
-rw-r--r--  lib/Target/Hexagon/HexagonDepDecoders.h | 64
-rw-r--r--  lib/Target/Hexagon/HexagonDepITypes.h | 53
-rw-r--r--  lib/Target/Hexagon/HexagonDepITypes.td | 48
-rw-r--r--  lib/Target/Hexagon/HexagonDepInstrFormats.td | 4182
-rw-r--r--  lib/Target/Hexagon/HexagonDepInstrInfo.td | 45573
-rw-r--r--  lib/Target/Hexagon/HexagonDepMappings.td | 654
-rw-r--r--  lib/Target/Hexagon/HexagonDepOperands.td | 132
-rw-r--r--  lib/Target/Hexagon/HexagonEarlyIfConv.cpp | 205
-rw-r--r--  lib/Target/Hexagon/HexagonExpandCondsets.cpp | 48
-rw-r--r--  lib/Target/Hexagon/HexagonFixupHwLoops.cpp | 2
-rw-r--r--  lib/Target/Hexagon/HexagonFrameLowering.cpp | 52
-rw-r--r--  lib/Target/Hexagon/HexagonGenExtract.cpp | 11
-rw-r--r--  lib/Target/Hexagon/HexagonGenInsert.cpp | 13
-rw-r--r--  lib/Target/Hexagon/HexagonGenMux.cpp | 6
-rw-r--r--  lib/Target/Hexagon/HexagonHardwareLoops.cpp | 26
-rw-r--r--  lib/Target/Hexagon/HexagonIICHVX.td | 102
-rw-r--r--  lib/Target/Hexagon/HexagonIICScalar.td | 164
-rw-r--r--  lib/Target/Hexagon/HexagonISelDAGToDAG.cpp | 320
-rw-r--r--  lib/Target/Hexagon/HexagonISelLowering.cpp | 170
-rw-r--r--  lib/Target/Hexagon/HexagonISelLowering.h | 11
-rw-r--r--  lib/Target/Hexagon/HexagonInstrAlias.td | 652
-rw-r--r--  lib/Target/Hexagon/HexagonInstrEnc.td | 1019
-rw-r--r--  lib/Target/Hexagon/HexagonInstrFormats.td | 169
-rw-r--r--  lib/Target/Hexagon/HexagonInstrFormatsV4.td | 22
-rw-r--r--  lib/Target/Hexagon/HexagonInstrFormatsV60.td | 22
-rw-r--r--  lib/Target/Hexagon/HexagonInstrInfo.cpp | 225
-rw-r--r--  lib/Target/Hexagon/HexagonInstrInfo.h | 4
-rw-r--r--  lib/Target/Hexagon/HexagonInstrInfo.td | 4799
-rw-r--r--  lib/Target/Hexagon/HexagonInstrInfoV3.td | 215
-rw-r--r--  lib/Target/Hexagon/HexagonInstrInfoV4.td | 3301
-rw-r--r--  lib/Target/Hexagon/HexagonInstrInfoV5.td | 497
-rw-r--r--  lib/Target/Hexagon/HexagonInstrInfoV60.td | 2068
-rw-r--r--  lib/Target/Hexagon/HexagonInstrInfoVector.td | 69
-rw-r--r--  lib/Target/Hexagon/HexagonIntrinsics.td | 19
-rw-r--r--  lib/Target/Hexagon/HexagonIntrinsicsV60.td | 2
-rw-r--r--  lib/Target/Hexagon/HexagonIsetDx.td | 728
-rw-r--r--  lib/Target/Hexagon/HexagonLoopIdiomRecognition.cpp | 2338
-rw-r--r--  lib/Target/Hexagon/HexagonMCInstLower.cpp | 7
-rw-r--r--  lib/Target/Hexagon/HexagonMachineScheduler.cpp | 4
-rw-r--r--  lib/Target/Hexagon/HexagonMapAsm2IntrinV62.gen.td | 204
-rw-r--r--  lib/Target/Hexagon/HexagonNewValueJump.cpp | 2
-rw-r--r--  lib/Target/Hexagon/HexagonOperands.td | 319
-rw-r--r--  lib/Target/Hexagon/HexagonOptAddrMode.cpp | 70
-rw-r--r--  lib/Target/Hexagon/HexagonPatterns.td | 147
-rw-r--r--  lib/Target/Hexagon/HexagonPseudo.td | 537
-rw-r--r--  lib/Target/Hexagon/HexagonRDFOpt.cpp | 2
-rw-r--r--  lib/Target/Hexagon/HexagonRegisterInfo.cpp | 55
-rw-r--r--  lib/Target/Hexagon/HexagonRegisterInfo.h | 3
-rw-r--r--  lib/Target/Hexagon/HexagonRegisterInfo.td | 109
-rw-r--r--  lib/Target/Hexagon/HexagonSchedule.td | 8
-rw-r--r--  lib/Target/Hexagon/HexagonScheduleV4.td | 12
-rw-r--r--  lib/Target/Hexagon/HexagonScheduleV55.td | 11
-rw-r--r--  lib/Target/Hexagon/HexagonScheduleV60.td | 13
-rw-r--r--  lib/Target/Hexagon/HexagonScheduleV62.td | 129
-rw-r--r--  lib/Target/Hexagon/HexagonSelectionDAGInfo.cpp | 11
-rw-r--r--  lib/Target/Hexagon/HexagonSplitDouble.cpp | 7
-rw-r--r--  lib/Target/Hexagon/HexagonSubtarget.cpp | 1
-rw-r--r--  lib/Target/Hexagon/HexagonSubtarget.h | 7
-rw-r--r--  lib/Target/Hexagon/HexagonSystemInst.td | 134
-rw-r--r--  lib/Target/Hexagon/HexagonTargetMachine.cpp | 28
-rw-r--r--  lib/Target/Hexagon/HexagonTargetMachine.h | 1
-rw-r--r--  lib/Target/Hexagon/HexagonVLIWPacketizer.cpp | 63
-rw-r--r--  lib/Target/Hexagon/HexagonVLIWPacketizer.h | 4
-rw-r--r--  lib/Target/Hexagon/LLVMBuild.txt | 1
-rw-r--r--  lib/Target/Hexagon/MCTargetDesc/HexagonAsmBackend.cpp | 224
-rw-r--r--  lib/Target/Hexagon/MCTargetDesc/HexagonBaseInfo.h | 129
-rw-r--r--  lib/Target/Hexagon/MCTargetDesc/HexagonInstPrinter.cpp | 40
-rw-r--r--  lib/Target/Hexagon/MCTargetDesc/HexagonInstPrinter.h | 8
-rw-r--r--  lib/Target/Hexagon/MCTargetDesc/HexagonMCAsmInfo.cpp | 3
-rw-r--r--  lib/Target/Hexagon/MCTargetDesc/HexagonMCChecker.cpp | 89
-rw-r--r--  lib/Target/Hexagon/MCTargetDesc/HexagonMCChecker.h | 4
-rw-r--r--  lib/Target/Hexagon/MCTargetDesc/HexagonMCCodeEmitter.cpp | 299
-rw-r--r--  lib/Target/Hexagon/MCTargetDesc/HexagonMCCodeEmitter.h | 11
-rw-r--r--  lib/Target/Hexagon/MCTargetDesc/HexagonMCCompound.cpp | 22
-rw-r--r--  lib/Target/Hexagon/MCTargetDesc/HexagonMCDuplexInfo.cpp | 45
-rw-r--r--  lib/Target/Hexagon/MCTargetDesc/HexagonMCELFStreamer.cpp | 66
-rw-r--r--  lib/Target/Hexagon/MCTargetDesc/HexagonMCELFStreamer.h | 15
-rw-r--r--  lib/Target/Hexagon/MCTargetDesc/HexagonMCExpr.cpp | 46
-rw-r--r--  lib/Target/Hexagon/MCTargetDesc/HexagonMCInstrInfo.cpp | 343
-rw-r--r--  lib/Target/Hexagon/MCTargetDesc/HexagonMCInstrInfo.h | 76
-rw-r--r--  lib/Target/Hexagon/MCTargetDesc/HexagonMCShuffler.cpp | 83
-rw-r--r--  lib/Target/Hexagon/MCTargetDesc/HexagonMCShuffler.h | 18
-rw-r--r--  lib/Target/Hexagon/MCTargetDesc/HexagonMCTargetDesc.cpp | 214
-rw-r--r--  lib/Target/Hexagon/MCTargetDesc/HexagonMCTargetDesc.h | 20
-rw-r--r--  lib/Target/Hexagon/MCTargetDesc/HexagonShuffler.cpp | 382
-rw-r--r--  lib/Target/Hexagon/MCTargetDesc/HexagonShuffler.h | 28
-rw-r--r--  lib/Target/Hexagon/RDFCopy.cpp | 72
-rw-r--r--  lib/Target/Hexagon/RDFCopy.h | 11
-rw-r--r--  lib/Target/Hexagon/RDFDeadCode.cpp | 12
-rw-r--r--  lib/Target/Hexagon/RDFGraph.cpp | 405
-rw-r--r--  lib/Target/Hexagon/RDFGraph.h | 159
-rw-r--r--  lib/Target/Hexagon/RDFLiveness.cpp | 291
-rw-r--r--  lib/Target/Hexagon/RDFLiveness.h | 32
-rw-r--r--  lib/Target/Hexagon/RDFRegisters.cpp | 368
-rw-r--r--  lib/Target/Hexagon/RDFRegisters.h | 209
-rw-r--r--  lib/Target/Lanai/AsmParser/LanaiAsmParser.cpp | 2
-rw-r--r--  lib/Target/Lanai/InstPrinter/CMakeLists.txt | 2
-rw-r--r--  lib/Target/Lanai/InstPrinter/LLVMBuild.txt | 2
-rw-r--r--  lib/Target/Lanai/LLVMBuild.txt | 2
-rw-r--r--  lib/Target/Lanai/LanaiInstrInfo.cpp | 4
-rw-r--r--  lib/Target/Lanai/LanaiMCInstLower.cpp | 2
-rw-r--r--  lib/Target/Lanai/MCTargetDesc/LLVMBuild.txt | 2
-rw-r--r--  lib/Target/Lanai/MCTargetDesc/LanaiAsmBackend.cpp | 4
-rw-r--r--  lib/Target/Lanai/MCTargetDesc/LanaiMCCodeEmitter.cpp | 6
-rw-r--r--  lib/Target/MSP430/MSP430BranchSelector.cpp | 4
-rw-r--r--  lib/Target/MSP430/MSP430CallingConv.td | 8
-rw-r--r--  lib/Target/MSP430/MSP430ISelDAGToDAG.cpp | 4
-rw-r--r--  lib/Target/MSP430/MSP430ISelLowering.cpp | 87
-rw-r--r--  lib/Target/MSP430/MSP430ISelLowering.h | 6
-rw-r--r--  lib/Target/MSP430/MSP430MCInstLower.cpp | 2
-rw-r--r--  lib/Target/MSP430/MSP430MachineFunctionInfo.h | 10
-rw-r--r--  lib/Target/Mips/AsmParser/MipsAsmParser.cpp | 766
-rw-r--r--  lib/Target/Mips/Disassembler/MipsDisassembler.cpp | 87
-rw-r--r--  lib/Target/Mips/MCTargetDesc/MipsABIFlagsSection.cpp | 12
-rw-r--r--  lib/Target/Mips/MCTargetDesc/MipsABIFlagsSection.h | 39
-rw-r--r--  lib/Target/Mips/MCTargetDesc/MipsAsmBackend.cpp | 74
-rw-r--r--  lib/Target/Mips/MCTargetDesc/MipsAsmBackend.h | 7
-rw-r--r--  lib/Target/Mips/MCTargetDesc/MipsELFObjectWriter.cpp | 68
-rw-r--r--  lib/Target/Mips/MCTargetDesc/MipsELFStreamer.cpp | 8
-rw-r--r--  lib/Target/Mips/MCTargetDesc/MipsELFStreamer.h | 15
-rw-r--r--  lib/Target/Mips/MCTargetDesc/MipsMCAsmInfo.cpp | 4
-rw-r--r--  lib/Target/Mips/MCTargetDesc/MipsMCCodeEmitter.cpp | 60
-rw-r--r--  lib/Target/Mips/MCTargetDesc/MipsMCCodeEmitter.h | 27
-rw-r--r--  lib/Target/Mips/MCTargetDesc/MipsMCExpr.cpp | 8
-rw-r--r--  lib/Target/Mips/MCTargetDesc/MipsMCExpr.h | 6
-rw-r--r--  lib/Target/Mips/MCTargetDesc/MipsNaClELFStreamer.cpp | 16
-rw-r--r--  lib/Target/Mips/MCTargetDesc/MipsOptionRecord.cpp | 10
-rw-r--r--  lib/Target/Mips/MCTargetDesc/MipsTargetStreamer.cpp | 40
-rw-r--r--  lib/Target/Mips/MicroMips64r6InstrInfo.td | 28
-rw-r--r--  lib/Target/Mips/MicroMipsInstrInfo.td | 33
-rw-r--r--  lib/Target/Mips/Mips.td | 2
-rw-r--r--  lib/Target/Mips/Mips16HardFloat.cpp | 18
-rw-r--r--  lib/Target/Mips/Mips16InstrInfo.td | 2
-rw-r--r--  lib/Target/Mips/Mips32r6InstrInfo.td | 6
-rw-r--r--  lib/Target/Mips/Mips64InstrInfo.td | 264
-rw-r--r--  lib/Target/Mips/MipsAsmPrinter.cpp | 173
-rw-r--r--  lib/Target/Mips/MipsAsmPrinter.h | 16
-rw-r--r--  lib/Target/Mips/MipsConstantIslandPass.cpp | 119
-rw-r--r--  lib/Target/Mips/MipsDelaySlotFiller.cpp | 58
-rw-r--r--  lib/Target/Mips/MipsFastISel.cpp | 106
-rw-r--r--  lib/Target/Mips/MipsHazardSchedule.cpp | 27
-rw-r--r--  lib/Target/Mips/MipsISelLowering.cpp | 190
-rw-r--r--  lib/Target/Mips/MipsISelLowering.h | 54
-rw-r--r--  lib/Target/Mips/MipsInstrInfo.cpp | 32
-rw-r--r--  lib/Target/Mips/MipsInstrInfo.h | 3
-rw-r--r--  lib/Target/Mips/MipsInstrInfo.td | 243
-rw-r--r--  lib/Target/Mips/MipsLongBranch.cpp | 44
-rw-r--r--  lib/Target/Mips/MipsMachineFunction.cpp | 13
-rw-r--r--  lib/Target/Mips/MipsMachineFunction.h | 34
-rw-r--r--  lib/Target/Mips/MipsOptionRecord.h | 17
-rw-r--r--  lib/Target/Mips/MipsOs16.cpp | 2
-rw-r--r--  lib/Target/Mips/MipsRegisterInfo.td | 39
-rw-r--r--  lib/Target/Mips/MipsSEFrameLowering.cpp | 43
-rw-r--r--  lib/Target/Mips/MipsSEFrameLowering.h | 9
-rw-r--r--  lib/Target/Mips/MipsSEISelDAGToDAG.cpp | 241
-rw-r--r--  lib/Target/Mips/MipsSEISelLowering.cpp | 18
-rw-r--r--  lib/Target/Mips/MipsSEInstrInfo.cpp | 15
-rw-r--r--  lib/Target/Mips/MipsSubtarget.cpp | 7
-rw-r--r--  lib/Target/Mips/MipsSubtarget.h | 9
-rw-r--r--  lib/Target/Mips/MipsTargetMachine.cpp | 42
-rw-r--r--  lib/Target/Mips/MipsTargetMachine.h | 19
-rw-r--r--  lib/Target/NVPTX/CMakeLists.txt | 1
-rw-r--r--  lib/Target/NVPTX/InstPrinter/NVPTXInstPrinter.cpp | 12
-rw-r--r--  lib/Target/NVPTX/LLVMBuild.txt | 2
-rw-r--r--  lib/Target/NVPTX/NVPTX.h | 5
-rw-r--r--  lib/Target/NVPTX/NVPTXAsmPrinter.cpp | 42
-rw-r--r--  lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp | 2449
-rw-r--r--  lib/Target/NVPTX/NVPTXISelDAGToDAG.h | 4
-rw-r--r--  lib/Target/NVPTX/NVPTXISelLowering.cpp | 1694
-rw-r--r--  lib/Target/NVPTX/NVPTXISelLowering.h | 34
-rw-r--r--  lib/Target/NVPTX/NVPTXImageOptimizer.cpp | 4
-rw-r--r--  lib/Target/NVPTX/NVPTXInstrInfo.cpp | 5
-rw-r--r--  lib/Target/NVPTX/NVPTXInstrInfo.td | 530
-rw-r--r--  lib/Target/NVPTX/NVPTXIntrinsics.td | 626
-rw-r--r--  lib/Target/NVPTX/NVPTXLowerAggrCopies.cpp | 223
-rw-r--r--  lib/Target/NVPTX/NVPTXLowerArgs.cpp | 3
-rw-r--r--  lib/Target/NVPTX/NVPTXMCExpr.cpp | 7
-rw-r--r--  lib/Target/NVPTX/NVPTXMCExpr.h | 10
-rw-r--r--  lib/Target/NVPTX/NVPTXPeephole.cpp | 2
-rw-r--r--  lib/Target/NVPTX/NVPTXRegisterInfo.cpp | 52
-rw-r--r--  lib/Target/NVPTX/NVPTXRegisterInfo.td | 4
-rw-r--r--  lib/Target/NVPTX/NVPTXSection.h | 2
-rw-r--r--  lib/Target/NVPTX/NVPTXSubtarget.cpp | 9
-rw-r--r--  lib/Target/NVPTX/NVPTXSubtarget.h | 2
-rw-r--r--  lib/Target/NVPTX/NVPTXTargetMachine.cpp | 15
-rw-r--r--  lib/Target/NVPTX/NVPTXTargetMachine.h | 3
-rw-r--r--  lib/Target/NVPTX/NVPTXTargetTransformInfo.h | 4
-rw-r--r--  lib/Target/NVPTX/NVVMReflect.cpp | 62
-rw-r--r--  lib/Target/PowerPC/CMakeLists.txt | 1
-rw-r--r--  lib/Target/PowerPC/MCTargetDesc/PPCAsmBackend.cpp | 2
-rw-r--r--  lib/Target/PowerPC/MCTargetDesc/PPCMCCodeEmitter.cpp | 32
-rw-r--r--  lib/Target/PowerPC/MCTargetDesc/PPCMCTargetDesc.cpp | 34
-rw-r--r--  lib/Target/PowerPC/MCTargetDesc/PPCMCTargetDesc.h | 9
-rw-r--r--  lib/Target/PowerPC/PPC.h | 2
-rw-r--r--  lib/Target/PowerPC/PPCAsmPrinter.cpp | 98
-rw-r--r--  lib/Target/PowerPC/PPCBranchSelector.cpp | 2
-rw-r--r--  lib/Target/PowerPC/PPCCTRLoops.cpp | 88
-rw-r--r--  lib/Target/PowerPC/PPCExpandISEL.cpp | 458
-rw-r--r--  lib/Target/PowerPC/PPCFrameLowering.cpp | 39
-rw-r--r--  lib/Target/PowerPC/PPCISelDAGToDAG.cpp | 109
-rw-r--r--  lib/Target/PowerPC/PPCISelLowering.cpp | 427
-rw-r--r--  lib/Target/PowerPC/PPCISelLowering.h | 71
-rw-r--r--  lib/Target/PowerPC/PPCInstr64Bit.td | 10
-rw-r--r--  lib/Target/PowerPC/PPCInstrAltivec.td | 8
-rw-r--r--  lib/Target/PowerPC/PPCInstrInfo.cpp | 31
-rw-r--r--  lib/Target/PowerPC/PPCInstrInfo.h | 2
-rw-r--r--  lib/Target/PowerPC/PPCInstrInfo.td | 29
-rw-r--r--  lib/Target/PowerPC/PPCInstrVSX.td | 28
-rw-r--r--  lib/Target/PowerPC/PPCLoopPreIncPrep.cpp | 6
-rw-r--r--  lib/Target/PowerPC/PPCMCInstLower.cpp | 2
-rw-r--r--  lib/Target/PowerPC/PPCMIPeephole.cpp | 20
-rw-r--r--  lib/Target/PowerPC/PPCMachineFunctionInfo.cpp | 5
-rw-r--r--  lib/Target/PowerPC/PPCMachineFunctionInfo.h | 73
-rw-r--r--  lib/Target/PowerPC/PPCRegisterInfo.cpp | 73
-rw-r--r--  lib/Target/PowerPC/PPCScheduleP8.td | 2
-rw-r--r--  lib/Target/PowerPC/PPCSubtarget.cpp | 24
-rw-r--r--  lib/Target/PowerPC/PPCSubtarget.h | 6
-rw-r--r--  lib/Target/PowerPC/PPCTargetMachine.cpp | 43
-rw-r--r--  lib/Target/PowerPC/PPCTargetMachine.h | 1
-rw-r--r--  lib/Target/PowerPC/PPCTargetStreamer.h | 14
-rw-r--r--  lib/Target/PowerPC/PPCTargetTransformInfo.cpp | 14
-rw-r--r--  lib/Target/PowerPC/PPCTargetTransformInfo.h | 8
-rw-r--r--  lib/Target/PowerPC/PPCVSXCopy.cpp | 4
-rw-r--r--  lib/Target/PowerPC/PPCVSXSwapRemoval.cpp | 59
-rw-r--r--  lib/Target/RISCV/MCTargetDesc/RISCVAsmBackend.cpp | 4
-rw-r--r--  lib/Target/RISCV/MCTargetDesc/RISCVMCTargetDesc.cpp | 5
-rw-r--r--  lib/Target/RISCV/RISCVInstrFormats.td | 3
-rw-r--r--  lib/Target/RISCV/RISCVTargetMachine.cpp | 6
-rw-r--r--  lib/Target/Sparc/AsmParser/SparcAsmParser.cpp | 56
-rw-r--r--  lib/Target/Sparc/MCTargetDesc/SparcAsmBackend.cpp | 3
-rw-r--r--  lib/Target/Sparc/MCTargetDesc/SparcMCAsmInfo.cpp | 5
-rw-r--r--  lib/Target/Sparc/MCTargetDesc/SparcMCAsmInfo.h | 8
-rw-r--r--  lib/Target/Sparc/MCTargetDesc/SparcMCCodeEmitter.cpp | 37
-rw-r--r--  lib/Target/Sparc/SparcFrameLowering.cpp | 11
-rw-r--r--  lib/Target/Sparc/SparcISelLowering.cpp | 5
-rw-r--r--  lib/Target/Sparc/SparcISelLowering.h | 1
-rw-r--r--  lib/Target/SystemZ/AsmParser/SystemZAsmParser.cpp | 23
-rw-r--r--  lib/Target/SystemZ/Disassembler/SystemZDisassembler.cpp | 8
-rw-r--r--  lib/Target/SystemZ/InstPrinter/SystemZInstPrinter.cpp | 7
-rw-r--r--  lib/Target/SystemZ/InstPrinter/SystemZInstPrinter.h | 5
-rw-r--r--  lib/Target/SystemZ/MCTargetDesc/SystemZMCAsmBackend.cpp | 4
-rw-r--r--  lib/Target/SystemZ/MCTargetDesc/SystemZMCCodeEmitter.cpp | 25
-rw-r--r--  lib/Target/SystemZ/MCTargetDesc/SystemZMCObjectWriter.cpp | 15
-rw-r--r--  lib/Target/SystemZ/SystemZElimCompare.cpp | 65
-rw-r--r--  lib/Target/SystemZ/SystemZISelLowering.cpp | 115
-rw-r--r--  lib/Target/SystemZ/SystemZISelLowering.h | 1
-rw-r--r--  lib/Target/SystemZ/SystemZInstrInfo.cpp | 135
-rw-r--r--  lib/Target/SystemZ/SystemZInstrInfo.h | 31
-rw-r--r--  lib/Target/SystemZ/SystemZInstrVector.td | 40
-rw-r--r--  lib/Target/SystemZ/SystemZLongBranch.cpp | 83
-rw-r--r--  lib/Target/SystemZ/SystemZMachineScheduler.h | 26
-rw-r--r--  lib/Target/SystemZ/SystemZScheduleZ13.td | 2
-rw-r--r--  lib/Target/SystemZ/SystemZShortenInst.cpp | 8
-rw-r--r--  lib/Target/SystemZ/SystemZTargetMachine.cpp | 27
-rw-r--r--  lib/Target/SystemZ/SystemZTargetMachine.h | 17
-rw-r--r--  lib/Target/SystemZ/SystemZTargetTransformInfo.cpp | 549
-rw-r--r--  lib/Target/SystemZ/SystemZTargetTransformInfo.h | 28
-rw-r--r--  lib/Target/TargetLoweringObjectFile.cpp | 5
-rw-r--r--  lib/Target/TargetMachine.cpp | 11
-rw-r--r--  lib/Target/WebAssembly/CMakeLists.txt | 2
-rw-r--r--  lib/Target/WebAssembly/Disassembler/WebAssemblyDisassembler.cpp | 85
-rw-r--r--lib/Target/WebAssembly/InstPrinter/WebAssemblyInstPrinter.cpp14
-rw-r--r--lib/Target/WebAssembly/InstPrinter/WebAssemblyInstPrinter.h2
-rw-r--r--lib/Target/WebAssembly/MCTargetDesc/CMakeLists.txt1
-rw-r--r--lib/Target/WebAssembly/MCTargetDesc/WebAssemblyAsmBackend.cpp107
-rw-r--r--lib/Target/WebAssembly/MCTargetDesc/WebAssemblyFixupKinds.h31
-rw-r--r--lib/Target/WebAssembly/MCTargetDesc/WebAssemblyMCAsmInfo.cpp34
-rw-r--r--lib/Target/WebAssembly/MCTargetDesc/WebAssemblyMCAsmInfo.h9
-rw-r--r--lib/Target/WebAssembly/MCTargetDesc/WebAssemblyMCCodeEmitter.cpp47
-rw-r--r--lib/Target/WebAssembly/MCTargetDesc/WebAssemblyMCTargetDesc.cpp24
-rw-r--r--lib/Target/WebAssembly/MCTargetDesc/WebAssemblyMCTargetDesc.h54
-rw-r--r--lib/Target/WebAssembly/MCTargetDesc/WebAssemblyTargetStreamer.cpp166
-rw-r--r--lib/Target/WebAssembly/MCTargetDesc/WebAssemblyTargetStreamer.h43
-rw-r--r--lib/Target/WebAssembly/MCTargetDesc/WebAssemblyWasmObjectWriter.cpp92
-rw-r--r--lib/Target/WebAssembly/README.txt21
-rw-r--r--lib/Target/WebAssembly/WebAssembly.h1
-rw-r--r--lib/Target/WebAssembly/WebAssemblyAsmPrinter.cpp107
-rw-r--r--lib/Target/WebAssembly/WebAssemblyAsmPrinter.h77
-rw-r--r--lib/Target/WebAssembly/WebAssemblyCFGSort.cpp277
-rw-r--r--lib/Target/WebAssembly/WebAssemblyCFGStackify.cpp237
-rw-r--r--lib/Target/WebAssembly/WebAssemblyCallIndirectFixup.cpp25
-rw-r--r--lib/Target/WebAssembly/WebAssemblyExplicitLocals.cpp66
-rw-r--r--lib/Target/WebAssembly/WebAssemblyFastISel.cpp30
-rw-r--r--lib/Target/WebAssembly/WebAssemblyFixFunctionBitcasts.cpp7
-rw-r--r--lib/Target/WebAssembly/WebAssemblyFrameLowering.cpp98
-rw-r--r--lib/Target/WebAssembly/WebAssemblyISelLowering.cpp8
-rw-r--r--lib/Target/WebAssembly/WebAssemblyISelLowering.h2
-rw-r--r--lib/Target/WebAssembly/WebAssemblyInstrCall.td12
-rw-r--r--lib/Target/WebAssembly/WebAssemblyInstrControl.td5
-rw-r--r--lib/Target/WebAssembly/WebAssemblyInstrFloat.td4
-rw-r--r--lib/Target/WebAssembly/WebAssemblyInstrInfo.cpp6
-rw-r--r--lib/Target/WebAssembly/WebAssemblyInstrInfo.td18
-rw-r--r--lib/Target/WebAssembly/WebAssemblyInstrMemory.td4
-rw-r--r--lib/Target/WebAssembly/WebAssemblyLowerBrUnless.cpp2
-rw-r--r--lib/Target/WebAssembly/WebAssemblyLowerEmscriptenEHSjLj.cpp35
-rw-r--r--lib/Target/WebAssembly/WebAssemblyMCInstLower.cpp130
-rw-r--r--lib/Target/WebAssembly/WebAssemblyMCInstLower.h6
-rw-r--r--lib/Target/WebAssembly/WebAssemblyMachineFunctionInfo.h2
-rw-r--r--lib/Target/WebAssembly/WebAssemblyOptimizeReturned.cpp2
-rw-r--r--lib/Target/WebAssembly/WebAssemblyPeephole.cpp44
-rw-r--r--lib/Target/WebAssembly/WebAssemblyRegStackify.cpp34
-rw-r--r--lib/Target/WebAssembly/WebAssemblyRuntimeLibcallSignatures.cpp1302
-rw-r--r--lib/Target/WebAssembly/WebAssemblyRuntimeLibcallSignatures.h37
-rw-r--r--lib/Target/WebAssembly/WebAssemblyStoreResults.cpp2
-rw-r--r--lib/Target/WebAssembly/WebAssemblyTargetMachine.cpp26
-rw-r--r--lib/Target/WebAssembly/WebAssemblyTargetObjectFile.cpp10
-rw-r--r--lib/Target/WebAssembly/WebAssemblyTargetObjectFile.h8
-rw-r--r--lib/Target/WebAssembly/WebAssemblyUtilities.cpp26
-rw-r--r--lib/Target/WebAssembly/WebAssemblyUtilities.h9
-rw-r--r--lib/Target/X86/AsmParser/X86AsmInstrumentation.cpp67
-rw-r--r--lib/Target/X86/AsmParser/X86AsmInstrumentation.h12
-rw-r--r--lib/Target/X86/AsmParser/X86AsmParser.cpp134
-rw-r--r--lib/Target/X86/AsmParser/X86Operand.h24
-rw-r--r--lib/Target/X86/CMakeLists.txt10
-rw-r--r--lib/Target/X86/Disassembler/X86Disassembler.cpp159
-rw-r--r--lib/Target/X86/Disassembler/X86DisassemblerDecoder.cpp37
-rw-r--r--lib/Target/X86/Disassembler/X86DisassemblerDecoderCommon.h81
-rw-r--r--lib/Target/X86/InstPrinter/X86ATTInstPrinter.cpp25
-rw-r--r--lib/Target/X86/InstPrinter/X86ATTInstPrinter.h7
-rw-r--r--lib/Target/X86/InstPrinter/X86InstComments.cpp2
-rw-r--r--lib/Target/X86/InstPrinter/X86IntelInstPrinter.cpp10
-rw-r--r--lib/Target/X86/InstPrinter/X86IntelInstPrinter.h4
-rw-r--r--lib/Target/X86/MCTargetDesc/X86AsmBackend.cpp2
-rw-r--r--lib/Target/X86/MCTargetDesc/X86BaseInfo.h7
-rw-r--r--lib/Target/X86/MCTargetDesc/X86ELFObjectWriter.cpp27
-rw-r--r--lib/Target/X86/MCTargetDesc/X86MCCodeEmitter.cpp40
-rw-r--r--lib/Target/X86/MCTargetDesc/X86WinCOFFObjectWriter.cpp27
-rw-r--r--lib/Target/X86/X86.h6
-rw-r--r--lib/Target/X86/X86.td52
-rw-r--r--lib/Target/X86/X86AsmPrinter.h4
-rw-r--r--lib/Target/X86/X86CallFrameOptimization.cpp75
-rw-r--r--lib/Target/X86/X86CallLowering.cpp183
-rw-r--r--lib/Target/X86/X86CallLowering.h8
-rw-r--r--lib/Target/X86/X86CallingConv.td2
-rwxr-xr-xlib/Target/X86/X86EvexToVex.cpp40
-rw-r--r--lib/Target/X86/X86ExpandPseudo.cpp26
-rw-r--r--lib/Target/X86/X86FastISel.cpp109
-rw-r--r--lib/Target/X86/X86FixupBWInsts.cpp53
-rw-r--r--lib/Target/X86/X86FixupLEAs.cpp18
-rw-r--r--lib/Target/X86/X86FrameLowering.cpp117
-rw-r--r--lib/Target/X86/X86FrameLowering.h3
-rw-r--r--lib/Target/X86/X86GenRegisterBankInfo.def104
-rw-r--r--lib/Target/X86/X86ISelDAGToDAG.cpp117
-rw-r--r--lib/Target/X86/X86ISelLowering.cpp4020
-rw-r--r--lib/Target/X86/X86ISelLowering.h61
-rw-r--r--lib/Target/X86/X86Instr3DNow.td22
-rw-r--r--lib/Target/X86/X86InstrAVX512.td1290
-rw-r--r--lib/Target/X86/X86InstrBuilder.h2
-rw-r--r--lib/Target/X86/X86InstrCMovSetCC.td6
-rw-r--r--lib/Target/X86/X86InstrCompiler.td20
-rw-r--r--lib/Target/X86/X86InstrControl.td31
-rw-r--r--lib/Target/X86/X86InstrFMA.td16
-rw-r--r--lib/Target/X86/X86InstrFMA3Info.cpp5
-rw-r--r--lib/Target/X86/X86InstrFMA3Info.h11
-rw-r--r--lib/Target/X86/X86InstrFPStack.td16
-rw-r--r--lib/Target/X86/X86InstrFormats.td10
-rw-r--r--lib/Target/X86/X86InstrFragmentsSIMD.td105
-rw-r--r--lib/Target/X86/X86InstrInfo.cpp1565
-rw-r--r--lib/Target/X86/X86InstrInfo.h46
-rw-r--r--lib/Target/X86/X86InstrInfo.td64
-rw-r--r--lib/Target/X86/X86InstrMMX.td39
-rw-r--r--lib/Target/X86/X86InstrMPX.td10
-rw-r--r--lib/Target/X86/X86InstrSSE.td1266
-rw-r--r--lib/Target/X86/X86InstrShiftRotate.td82
-rw-r--r--lib/Target/X86/X86InstrSystem.td25
-rw-r--r--lib/Target/X86/X86InstrTSX.td10
-rwxr-xr-xlib/Target/X86/X86InstrTablesInfo.h1162
-rw-r--r--lib/Target/X86/X86InstrVMX.td20
-rw-r--r--lib/Target/X86/X86InstrXOP.td195
-rw-r--r--lib/Target/X86/X86InstructionSelector.cpp516
-rw-r--r--lib/Target/X86/X86InterleavedAccess.cpp3
-rw-r--r--lib/Target/X86/X86IntrinsicsInfo.h147
-rw-r--r--lib/Target/X86/X86LegalizerInfo.cpp142
-rw-r--r--lib/Target/X86/X86LegalizerInfo.h43
-rw-r--r--lib/Target/X86/X86MCInstLower.cpp96
-rw-r--r--lib/Target/X86/X86MachineFunctionInfo.cpp8
-rw-r--r--lib/Target/X86/X86MacroFusion.cpp271
-rw-r--r--lib/Target/X86/X86MacroFusion.h30
-rw-r--r--lib/Target/X86/X86OptimizeLEAs.cpp18
-rw-r--r--lib/Target/X86/X86RegisterBankInfo.cpp243
-rw-r--r--lib/Target/X86/X86RegisterBankInfo.h81
-rw-r--r--lib/Target/X86/X86RegisterBanks.td17
-rw-r--r--lib/Target/X86/X86RegisterInfo.cpp10
-rw-r--r--lib/Target/X86/X86RegisterInfo.td36
-rw-r--r--lib/Target/X86/X86Schedule.td1
-rw-r--r--lib/Target/X86/X86SelectionDAGInfo.cpp10
-rw-r--r--lib/Target/X86/X86ShuffleDecodeConstantPool.cpp78
-rw-r--r--lib/Target/X86/X86Subtarget.cpp32
-rw-r--r--lib/Target/X86/X86Subtarget.h71
-rw-r--r--lib/Target/X86/X86TargetMachine.cpp121
-rw-r--r--lib/Target/X86/X86TargetMachine.h16
-rw-r--r--lib/Target/X86/X86TargetTransformInfo.cpp106
-rw-r--r--lib/Target/X86/X86TargetTransformInfo.h19
-rw-r--r--lib/Target/X86/X86VZeroUpper.cpp118
-rw-r--r--lib/Target/XCore/InstPrinter/XCoreInstPrinter.cpp6
-rw-r--r--lib/Target/XCore/InstPrinter/XCoreInstPrinter.h6
-rw-r--r--lib/Target/XCore/MCTargetDesc/XCoreMCTargetDesc.cpp16
-rw-r--r--lib/Target/XCore/MCTargetDesc/XCoreMCTargetDesc.h8
-rw-r--r--lib/Target/XCore/XCoreFrameLowering.cpp2
-rw-r--r--lib/Target/XCore/XCoreISelLowering.cpp3
-rw-r--r--lib/Target/XCore/XCoreISelLowering.h1
-rw-r--r--lib/Target/XCore/XCoreMachineFunctionInfo.h44
-rw-r--r--lib/Target/XCore/XCoreSelectionDAGInfo.cpp10
-rw-r--r--lib/Target/XCore/XCoreTargetMachine.cpp18
-rw-r--r--lib/Target/XCore/XCoreTargetMachine.h9
-rw-r--r--lib/Transforms/Coroutines/CoroElide.cpp5
-rw-r--r--lib/Transforms/Coroutines/CoroFrame.cpp40
-rw-r--r--lib/Transforms/Coroutines/CoroInstr.h5
-rw-r--r--lib/Transforms/Coroutines/CoroSplit.cpp58
-rw-r--r--lib/Transforms/Coroutines/Coroutines.cpp6
-rw-r--r--lib/Transforms/IPO/ArgumentPromotion.cpp1309
-rw-r--r--lib/Transforms/IPO/ConstantMerge.cpp26
-rw-r--r--lib/Transforms/IPO/CrossDSOCFI.cpp7
-rw-r--r--lib/Transforms/IPO/DeadArgumentElimination.cpp152
-rw-r--r--lib/Transforms/IPO/FunctionAttrs.cpp150
-rw-r--r--lib/Transforms/IPO/FunctionImport.cpp110
-rw-r--r--lib/Transforms/IPO/GlobalDCE.cpp152
-rw-r--r--lib/Transforms/IPO/GlobalOpt.cpp10
-rw-r--r--lib/Transforms/IPO/GlobalSplit.cpp11
-rw-r--r--lib/Transforms/IPO/IPConstantPropagation.cpp8
-rw-r--r--lib/Transforms/IPO/InlineSimple.cpp13
-rw-r--r--lib/Transforms/IPO/Inliner.cpp205
-rw-r--r--lib/Transforms/IPO/LowerTypeTests.cpp308
-rw-r--r--lib/Transforms/IPO/MergeFunctions.cpp299
-rw-r--r--lib/Transforms/IPO/PartialInlining.cpp2
-rw-r--r--lib/Transforms/IPO/PassManagerBuilder.cpp87
-rw-r--r--lib/Transforms/IPO/SampleProfile.cpp271
-rw-r--r--lib/Transforms/IPO/StripSymbols.cpp24
-rw-r--r--lib/Transforms/IPO/ThinLTOBitcodeWriter.cpp261
-rw-r--r--lib/Transforms/IPO/WholeProgramDevirt.cpp764
-rw-r--r--lib/Transforms/InstCombine/InstCombineAddSub.cpp135
-rw-r--r--lib/Transforms/InstCombine/InstCombineAndOrXor.cpp1192
-rw-r--r--lib/Transforms/InstCombine/InstCombineCalls.cpp1148
-rw-r--r--lib/Transforms/InstCombine/InstCombineCasts.cpp169
-rw-r--r--lib/Transforms/InstCombine/InstCombineCompares.cpp212
-rw-r--r--lib/Transforms/InstCombine/InstCombineInternal.h61
-rw-r--r--lib/Transforms/InstCombine/InstCombineLoadStoreAlloca.cpp145
-rw-r--r--lib/Transforms/InstCombine/InstCombineMulDivRem.cpp58
-rw-r--r--lib/Transforms/InstCombine/InstCombinePHI.cpp6
-rw-r--r--lib/Transforms/InstCombine/InstCombineSelect.cpp86
-rw-r--r--lib/Transforms/InstCombine/InstCombineShifts.cpp703
-rw-r--r--lib/Transforms/InstCombine/InstCombineSimplifyDemanded.cpp562
-rw-r--r--lib/Transforms/InstCombine/InstCombineVectorOps.cpp44
-rw-r--r--lib/Transforms/InstCombine/InstructionCombining.cpp285
-rw-r--r--lib/Transforms/Instrumentation/AddressSanitizer.cpp156
-rw-r--r--lib/Transforms/Instrumentation/DataFlowSanitizer.cpp78
-rw-r--r--lib/Transforms/Instrumentation/EfficiencySanitizer.cpp22
-rw-r--r--lib/Transforms/Instrumentation/IndirectCallPromotion.cpp506
-rw-r--r--lib/Transforms/Instrumentation/InstrProfiling.cpp170
-rw-r--r--lib/Transforms/Instrumentation/Instrumentation.cpp1
-rw-r--r--lib/Transforms/Instrumentation/MemorySanitizer.cpp103
-rw-r--r--lib/Transforms/Instrumentation/PGOInstrumentation.cpp353
-rw-r--r--lib/Transforms/Instrumentation/SanitizerCoverage.cpp100
-rw-r--r--lib/Transforms/Instrumentation/ThreadSanitizer.cpp54
-rw-r--r--lib/Transforms/ObjCARC/ARCRuntimeEntryPoints.h14
-rw-r--r--lib/Transforms/ObjCARC/ObjCARCContract.cpp1
-rw-r--r--lib/Transforms/ObjCARC/ObjCARCOpts.cpp92
-rw-r--r--lib/Transforms/Scalar/ADCE.cpp43
-rw-r--r--lib/Transforms/Scalar/AlignmentFromAssumptions.cpp12
-rw-r--r--lib/Transforms/Scalar/BDCE.cpp4
-rw-r--r--lib/Transforms/Scalar/CMakeLists.txt2
-rw-r--r--lib/Transforms/Scalar/ConstantHoisting.cpp21
-rw-r--r--lib/Transforms/Scalar/CorrelatedValuePropagation.cpp25
-rw-r--r--lib/Transforms/Scalar/DCE.cpp9
-rw-r--r--lib/Transforms/Scalar/DeadStoreElimination.cpp62
-rw-r--r--lib/Transforms/Scalar/EarlyCSE.cpp92
-rw-r--r--lib/Transforms/Scalar/Float2Int.cpp11
-rw-r--r--lib/Transforms/Scalar/GVN.cpp498
-rw-r--r--lib/Transforms/Scalar/GVNHoist.cpp89
-rw-r--r--lib/Transforms/Scalar/GuardWidening.cpp11
-rw-r--r--lib/Transforms/Scalar/IndVarSimplify.cpp19
-rw-r--r--lib/Transforms/Scalar/InductiveRangeCheckElimination.cpp44
-rw-r--r--lib/Transforms/Scalar/InferAddressSpaces.cpp (renamed from lib/Target/NVPTX/NVPTXInferAddressSpaces.cpp)568
-rw-r--r--lib/Transforms/Scalar/JumpThreading.cpp293
-rw-r--r--lib/Transforms/Scalar/LICM.cpp170
-rw-r--r--lib/Transforms/Scalar/LoadCombine.cpp19
-rw-r--r--lib/Transforms/Scalar/LoopDeletion.cpp127
-rw-r--r--lib/Transforms/Scalar/LoopDistribute.cpp46
-rw-r--r--lib/Transforms/Scalar/LoopIdiomRecognize.cpp8
-rw-r--r--lib/Transforms/Scalar/LoopInstSimplify.cpp4
-rw-r--r--lib/Transforms/Scalar/LoopInterchange.cpp2
-rw-r--r--lib/Transforms/Scalar/LoopLoadElimination.cpp105
-rw-r--r--lib/Transforms/Scalar/LoopPassManager.cpp7
-rw-r--r--lib/Transforms/Scalar/LoopPredication.cpp282
-rw-r--r--lib/Transforms/Scalar/LoopRotation.cpp49
-rw-r--r--lib/Transforms/Scalar/LoopSimplifyCFG.cpp1
-rw-r--r--lib/Transforms/Scalar/LoopSink.cpp40
-rw-r--r--lib/Transforms/Scalar/LoopStrengthReduce.cpp432
-rw-r--r--lib/Transforms/Scalar/LoopUnrollPass.cpp172
-rw-r--r--lib/Transforms/Scalar/LoopUnswitch.cpp301
-rw-r--r--lib/Transforms/Scalar/LowerExpectIntrinsic.cpp4
-rw-r--r--lib/Transforms/Scalar/MemCpyOptimizer.cpp147
-rw-r--r--lib/Transforms/Scalar/MergedLoadStoreMotion.cpp179
-rw-r--r--lib/Transforms/Scalar/NaryReassociate.cpp12
-rw-r--r--lib/Transforms/Scalar/NewGVN.cpp2750
-rw-r--r--lib/Transforms/Scalar/PartiallyInlineLibCalls.cpp12
-rw-r--r--lib/Transforms/Scalar/Reassociate.cpp13
-rw-r--r--lib/Transforms/Scalar/RewriteStatepointsForGC.cpp59
-rw-r--r--lib/Transforms/Scalar/SCCP.cpp171
-rw-r--r--lib/Transforms/Scalar/SROA.cpp24
-rw-r--r--lib/Transforms/Scalar/Scalar.cpp9
-rw-r--r--lib/Transforms/Scalar/Scalarizer.cpp19
-rw-r--r--lib/Transforms/Scalar/SimplifyCFGPass.cpp76
-rw-r--r--lib/Transforms/Scalar/Sink.cpp12
-rw-r--r--lib/Transforms/Utils/AddDiscriminators.cpp24
-rw-r--r--lib/Transforms/Utils/BasicBlockUtils.cpp9
-rw-r--r--lib/Transforms/Utils/BuildLibCalls.cpp448
-rw-r--r--lib/Transforms/Utils/BypassSlowDivision.cpp532
-rw-r--r--lib/Transforms/Utils/CMakeLists.txt4
-rw-r--r--lib/Transforms/Utils/CloneFunction.cpp65
-rw-r--r--lib/Transforms/Utils/CloneModule.cpp13
-rw-r--r--lib/Transforms/Utils/CodeExtractor.cpp19
-rw-r--r--lib/Transforms/Utils/DemoteRegToStack.cpp17
-rw-r--r--lib/Transforms/Utils/Evaluator.cpp3
-rw-r--r--lib/Transforms/Utils/FunctionComparator.cpp8
-rw-r--r--lib/Transforms/Utils/FunctionImportUtils.cpp14
-rw-r--r--lib/Transforms/Utils/GlobalStatus.cpp21
-rw-r--r--lib/Transforms/Utils/ImportedFunctionsInliningStatistics.cpp2
-rw-r--r--lib/Transforms/Utils/InlineFunction.cpp152
-rw-r--r--lib/Transforms/Utils/LCSSA.cpp84
-rw-r--r--lib/Transforms/Utils/LibCallsShrinkWrap.cpp182
-rw-r--r--lib/Transforms/Utils/Local.cpp148
-rw-r--r--lib/Transforms/Utils/LoopSimplify.cpp40
-rw-r--r--lib/Transforms/Utils/LoopUnroll.cpp167
-rw-r--r--lib/Transforms/Utils/LoopUnrollPeel.cpp98
-rw-r--r--lib/Transforms/Utils/LoopUnrollRuntime.cpp52
-rw-r--r--lib/Transforms/Utils/LoopUtils.cpp25
-rw-r--r--lib/Transforms/Utils/LowerMemIntrinsics.cpp231
-rw-r--r--lib/Transforms/Utils/LowerSwitch.cpp8
-rw-r--r--lib/Transforms/Utils/Mem2Reg.cpp7
-rw-r--r--lib/Transforms/Utils/MetaRenamer.cpp17
-rw-r--r--lib/Transforms/Utils/ModuleUtils.cpp23
-rw-r--r--lib/Transforms/Utils/PredicateInfo.cpp782
-rw-r--r--lib/Transforms/Utils/PromoteMemoryToRegister.cpp93
-rw-r--r--lib/Transforms/Utils/SSAUpdater.cpp27
-rw-r--r--lib/Transforms/Utils/SimplifyCFG.cpp171
-rw-r--r--lib/Transforms/Utils/SimplifyIndVar.cpp42
-rw-r--r--lib/Transforms/Utils/SimplifyInstructions.cpp22
-rw-r--r--lib/Transforms/Utils/SimplifyLibCalls.cpp399
-rw-r--r--lib/Transforms/Utils/Utils.cpp3
-rw-r--r--lib/Transforms/Utils/VNCoercion.cpp482
-rw-r--r--lib/Transforms/Utils/ValueMapper.cpp1
-rw-r--r--lib/Transforms/Vectorize/BBVectorize.cpp77
-rw-r--r--lib/Transforms/Vectorize/LoadStoreVectorizer.cpp13
-rw-r--r--lib/Transforms/Vectorize/LoopVectorize.cpp2965
-rw-r--r--lib/Transforms/Vectorize/SLPVectorizer.cpp846
-rw-r--r--lib/XRay/CMakeLists.txt3
-rw-r--r--lib/XRay/InstrumentationMap.cpp198
-rw-r--r--lib/XRay/Trace.cpp373
1492 files changed, 159590 insertions, 64524 deletions
diff --git a/lib/Analysis/AliasAnalysis.cpp b/lib/Analysis/AliasAnalysis.cpp
index 84da76be98bb..4c29aeaa622f 100644
--- a/lib/Analysis/AliasAnalysis.cpp
+++ b/lib/Analysis/AliasAnalysis.cpp
@@ -332,8 +332,8 @@ FunctionModRefBehavior AAResults::getModRefBehavior(const Function *F) {
ModRefInfo AAResults::getModRefInfo(const LoadInst *L,
const MemoryLocation &Loc) {
- // Be conservative in the face of volatile/atomic.
- if (!L->isUnordered())
+ // Be conservative in the face of atomic.
+ if (isStrongerThan(L->getOrdering(), AtomicOrdering::Unordered))
return MRI_ModRef;
// If the load address doesn't alias the given address, it doesn't read
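A note on the new guard: it replaces a volatile-or-atomic check with an ordering-strength comparison, so a volatile but non-atomic load can still get a precise mod/ref answer. A minimal standalone sketch of the comparison against Unordered (hypothetical names, not the LLVM API; LLVM's real isStrongerThan consults a table because Acquire and Release are incomparable):

#include <cassert>

enum class AtomicOrdering { NotAtomic, Unordered, Monotonic, Acquire,
                            Release, AcquireRelease, SequentiallyConsistent };

// Comparing against Unordered is safe to do numerically here, since every
// ordering above it in this enum really is stronger than Unordered.
static bool isStrongerThanUnordered(AtomicOrdering AO) {
  return static_cast<int>(AO) > static_cast<int>(AtomicOrdering::Unordered);
}

int main() {
  // A volatile load with no atomic ordering is NotAtomic: not stronger than
  // Unordered, so the analysis no longer has to be conservative about it.
  assert(!isStrongerThanUnordered(AtomicOrdering::NotAtomic));
  assert(isStrongerThanUnordered(AtomicOrdering::Acquire));
}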
@@ -347,8 +347,8 @@ ModRefInfo AAResults::getModRefInfo(const LoadInst *L,
ModRefInfo AAResults::getModRefInfo(const StoreInst *S,
const MemoryLocation &Loc) {
- // Be conservative in the face of volatile/atomic.
- if (!S->isUnordered())
+ // Be conservative in the face of atomic.
+ if (isStrongerThan(S->getOrdering(), AtomicOrdering::Unordered))
return MRI_ModRef;
if (Loc.Ptr) {
@@ -367,6 +367,14 @@ ModRefInfo AAResults::getModRefInfo(const StoreInst *S,
return MRI_Mod;
}
+ModRefInfo AAResults::getModRefInfo(const FenceInst *S, const MemoryLocation &Loc) {
+ // If we know that the location is a constant memory location, the fence
+ // cannot modify this location.
+ if (Loc.Ptr && pointsToConstantMemory(Loc))
+ return MRI_Ref;
+ return MRI_ModRef;
+}
+
ModRefInfo AAResults::getModRefInfo(const VAArgInst *V,
const MemoryLocation &Loc) {
@@ -689,7 +697,7 @@ AAResults llvm::createLegacyPMAAResults(Pass &P, Function &F,
bool llvm::isNoAliasCall(const Value *V) {
if (auto CS = ImmutableCallSite(V))
- return CS.paramHasAttr(0, Attribute::NoAlias);
+ return CS.hasRetAttr(Attribute::NoAlias);
return false;
}
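Both this hunk and the BasicAliasAnalysis changes below retire the old convention where attribute index 0 meant the return value and parameter i had to be queried at i + 1. A small sketch of that encoding and the unambiguous accessors that hide it (hypothetical types, not the LLVM AttributeList API):

#include <bitset>
#include <cassert>
#include <vector>

enum Attr { NoAlias, ReadOnly, AttrCount };

struct CallSiteAttrs {
  // Slot 0 holds return-value attributes; parameter i lives in slot i + 1.
  // This is the encoding behind the old paramHasAttr(0, ...) and
  // paramHasAttr(ArgIdx + 1, ...) call sites that the hunks retire.
  std::vector<std::bitset<AttrCount>> Slots;

  bool hasRetAttr(Attr A) const { return Slots[0].test(A); }
  bool paramHasAttr(unsigned ArgIdx, Attr A) const {
    return Slots[ArgIdx + 1].test(A); // the off-by-one lives here, once
  }
};

int main() {
  CallSiteAttrs CS{std::vector<std::bitset<AttrCount>>(3)};
  CS.Slots[0].set(NoAlias);  // return value is noalias
  CS.Slots[2].set(ReadOnly); // parameter with 0-based index 1 is readonly
  assert(CS.hasRetAttr(NoAlias));
  assert(CS.paramHasAttr(1, ReadOnly));
}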
diff --git a/lib/Analysis/AliasSetTracker.cpp b/lib/Analysis/AliasSetTracker.cpp
index 701b0e1a5925..16b711a69ec3 100644
--- a/lib/Analysis/AliasSetTracker.cpp
+++ b/lib/Analysis/AliasSetTracker.cpp
@@ -199,9 +199,10 @@ bool AliasSet::aliasesPointer(const Value *Ptr, uint64_t Size,
// Check the unknown instructions...
if (!UnknownInsts.empty()) {
for (unsigned i = 0, e = UnknownInsts.size(); i != e; ++i)
- if (AA.getModRefInfo(UnknownInsts[i],
- MemoryLocation(Ptr, Size, AAInfo)) != MRI_NoModRef)
- return true;
+ if (auto *Inst = getUnknownInst(i))
+ if (AA.getModRefInfo(Inst, MemoryLocation(Ptr, Size, AAInfo)) !=
+ MRI_NoModRef)
+ return true;
}
return false;
@@ -217,10 +218,12 @@ bool AliasSet::aliasesUnknownInst(const Instruction *Inst,
return false;
for (unsigned i = 0, e = UnknownInsts.size(); i != e; ++i) {
- ImmutableCallSite C1(getUnknownInst(i)), C2(Inst);
- if (!C1 || !C2 || AA.getModRefInfo(C1, C2) != MRI_NoModRef ||
- AA.getModRefInfo(C2, C1) != MRI_NoModRef)
- return true;
+    if (auto *UnknownInst = getUnknownInst(i)) {
+      ImmutableCallSite C1(UnknownInst), C2(Inst);
+ if (!C1 || !C2 || AA.getModRefInfo(C1, C2) != MRI_NoModRef ||
+ AA.getModRefInfo(C2, C1) != MRI_NoModRef)
+ return true;
+ }
}
for (iterator I = begin(), E = end(); I != E; ++I)
@@ -471,7 +474,8 @@ void AliasSetTracker::add(const AliasSetTracker &AST) {
// If there are any call sites in the alias set, add them to this AST.
for (unsigned i = 0, e = AS.UnknownInsts.size(); i != e; ++i)
- add(AS.UnknownInsts[i]);
+ if (auto *Inst = AS.getUnknownInst(i))
+ add(Inst);
// Loop over all of the pointers in this alias set.
for (AliasSet::iterator ASI = AS.begin(), E = AS.end(); ASI != E; ++ASI) {
@@ -489,19 +493,6 @@ void AliasSetTracker::add(const AliasSetTracker &AST) {
// dangling pointers to deleted instructions.
//
void AliasSetTracker::deleteValue(Value *PtrVal) {
- // If this is a call instruction, remove the callsite from the appropriate
- // AliasSet (if present).
- if (Instruction *Inst = dyn_cast<Instruction>(PtrVal)) {
- if (Inst->mayReadOrWriteMemory()) {
- // Scan all the alias sets to see if this call site is contained.
- for (iterator I = begin(), E = end(); I != E;) {
- iterator Cur = I++;
- if (!Cur->Forward)
- Cur->removeUnknownInst(*this, Inst);
- }
- }
- }
-
// First, look up the PointerRec for this pointer.
PointerMapType::iterator I = PointerMap.find_as(PtrVal);
if (I == PointerMap.end()) return; // Noop
@@ -633,7 +624,8 @@ void AliasSet::print(raw_ostream &OS) const {
OS << "\n " << UnknownInsts.size() << " Unknown instructions: ";
for (unsigned i = 0, e = UnknownInsts.size(); i != e; ++i) {
if (i) OS << ", ";
- UnknownInsts[i]->printAsOperand(OS);
+ if (auto *I = getUnknownInst(i))
+ I->printAsOperand(OS);
}
}
OS << "\n";
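The pattern behind these hunks: UnknownInsts now holds weak value handles, so entries null out when an instruction is deleted, and every traversal re-checks getUnknownInst(i) for null. That is also why deleteValue no longer needs to scan the alias sets. A standalone analogue using std::weak_ptr in place of WeakVH:

#include <cassert>
#include <memory>
#include <vector>

struct Instruction { int Id; };

struct Tracker {
  std::vector<std::weak_ptr<Instruction>> UnknownInsts;

  std::shared_ptr<Instruction> getUnknownInst(unsigned i) const {
    return UnknownInsts[i].lock(); // null if the instruction was deleted
  }

  int countLive() const {
    int N = 0;
    for (unsigned i = 0, e = UnknownInsts.size(); i != e; ++i)
      if (auto Inst = getUnknownInst(i)) // skip nulled-out entries
        ++N;
    return N;
  }
};

int main() {
  auto A = std::make_shared<Instruction>(Instruction{1});
  auto B = std::make_shared<Instruction>(Instruction{2});
  Tracker T;
  T.UnknownInsts = {A, B};
  B.reset(); // "delete" one instruction; no eager tracker update needed
  assert(T.countLive() == 1);
}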
diff --git a/lib/Analysis/Analysis.cpp b/lib/Analysis/Analysis.cpp
index 0e7cf402cdb5..0e0b5c92a918 100644
--- a/lib/Analysis/Analysis.cpp
+++ b/lib/Analysis/Analysis.cpp
@@ -57,6 +57,7 @@ void llvm::initializeAnalysis(PassRegistry &Registry) {
initializeLazyBranchProbabilityInfoPassPass(Registry);
initializeLazyBlockFrequencyInfoPassPass(Registry);
initializeLazyValueInfoWrapperPassPass(Registry);
+ initializeLazyValueInfoPrinterPass(Registry);
initializeLintPass(Registry);
initializeLoopInfoWrapperPassPass(Registry);
initializeMemDepPrinterPass(Registry);
@@ -78,6 +79,8 @@ void llvm::initializeAnalysis(PassRegistry &Registry) {
initializeTypeBasedAAWrapperPassPass(Registry);
initializeScopedNoAliasAAWrapperPassPass(Registry);
initializeLCSSAVerificationPassPass(Registry);
+ initializeMemorySSAWrapperPassPass(Registry);
+ initializeMemorySSAPrinterLegacyPassPass(Registry);
}
void LLVMInitializeAnalysis(LLVMPassRegistryRef R) {
diff --git a/lib/Analysis/AssumptionCache.cpp b/lib/Analysis/AssumptionCache.cpp
index 5851594700a4..1fae94724487 100644
--- a/lib/Analysis/AssumptionCache.cpp
+++ b/lib/Analysis/AssumptionCache.cpp
@@ -24,6 +24,11 @@
using namespace llvm;
using namespace llvm::PatternMatch;
+static cl::opt<bool>
+ VerifyAssumptionCache("verify-assumption-cache", cl::Hidden,
+ cl::desc("Enable verification of assumption cache"),
+ cl::init(false));
+
SmallVector<WeakVH, 1> &AssumptionCache::getOrInsertAffectedValues(Value *V) {
// Try using find_as first to avoid creating extra value handles just for the
// purpose of doing the lookup.
@@ -47,9 +52,11 @@ void AssumptionCache::updateAffectedValues(CallInst *CI) {
} else if (auto *I = dyn_cast<Instruction>(V)) {
Affected.push_back(I);
- if (I->getOpcode() == Instruction::BitCast ||
- I->getOpcode() == Instruction::PtrToInt) {
- auto *Op = I->getOperand(0);
+ // Peek through unary operators to find the source of the condition.
+ Value *Op;
+ if (match(I, m_BitCast(m_Value(Op))) ||
+ match(I, m_PtrToInt(m_Value(Op))) ||
+ match(I, m_Not(m_Value(Op)))) {
if (isa<Instruction>(Op) || isa<Argument>(Op))
Affected.push_back(Op);
}
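The rewritten block generalizes the old two-opcode check: besides the value an assumption names directly, the value under a value-preserving wrapper (bitcast, ptrtoint, and now a 'not') is recorded as affected too. A toy sketch of the peek-through step over a hypothetical node type (LLVM's actual code uses PatternMatch matchers):

#include <cassert>
#include <vector>

enum class Op { Leaf, BitCast, PtrToInt, Not, Add };

struct Node {
  Op Opcode;
  Node *Operand; // single operand for the unary wrappers, else null
};

static void collectAffected(Node *V, std::vector<Node *> &Affected) {
  Affected.push_back(V);
  switch (V->Opcode) {
  case Op::BitCast:
  case Op::PtrToInt:
  case Op::Not:
    Affected.push_back(V->Operand); // the wrapped value is affected as well
    break;
  default:
    break;
  }
}

int main() {
  Node X{Op::Leaf, nullptr};
  Node NotX{Op::Not, &X};
  std::vector<Node *> Affected;
  collectAffected(&NotX, Affected);
  assert(Affected.size() == 2 && Affected[1] == &X);
}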
@@ -229,7 +236,13 @@ AssumptionCache &AssumptionCacheTracker::getAssumptionCache(Function &F) {
}
void AssumptionCacheTracker::verifyAnalysis() const {
-#ifndef NDEBUG
+ // FIXME: In the long term the verifier should not be controllable with a
+ // flag. We should either fix all passes to correctly update the assumption
+ // cache and enable the verifier unconditionally or somehow arrange for the
+ // assumption list to be updated automatically by passes.
+ if (!VerifyAssumptionCache)
+ return;
+
SmallPtrSet<const CallInst *, 4> AssumptionSet;
for (const auto &I : AssumptionCaches) {
for (auto &VH : I.second->assumptions())
@@ -238,11 +251,10 @@ void AssumptionCacheTracker::verifyAnalysis() const {
for (const BasicBlock &B : cast<Function>(*I.first))
for (const Instruction &II : B)
- if (match(&II, m_Intrinsic<Intrinsic::assume>()))
- assert(AssumptionSet.count(cast<CallInst>(&II)) &&
- "Assumption in scanned function not in cache");
+ if (match(&II, m_Intrinsic<Intrinsic::assume>()) &&
+ !AssumptionSet.count(cast<CallInst>(&II)))
+ report_fatal_error("Assumption in scanned function not in cache");
}
-#endif
}
AssumptionCacheTracker::AssumptionCacheTracker() : ImmutablePass(ID) {
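The verifier change swaps a debug-only assert for an opt-in runtime check: verification now runs whenever the flag is set, even in release builds, and failures abort with a fatal error instead of an assert. A standalone sketch of that scheme, with a plain global standing in for the cl::opt:

#include <cstdio>
#include <cstdlib>
#include <set>

static bool VerifyAssumptionCache = false; // stands in for the new cl::opt

static void reportFatalError(const char *Msg) {
  std::fprintf(stderr, "fatal error: %s\n", Msg);
  std::exit(1);
}

// Scanned holds the assume calls found by walking the function; Cached is
// what the assumption cache believes it has registered.
static void verify(const std::set<int> &Cached, const std::set<int> &Scanned) {
  if (!VerifyAssumptionCache)
    return; // opt-in at runtime, no longer tied to NDEBUG
  for (int A : Scanned)
    if (!Cached.count(A))
      reportFatalError("Assumption in scanned function not in cache");
}

int main() {
  VerifyAssumptionCache = true;
  verify({1, 2, 3}, {1, 2}); // every scanned assumption is cached: OK
  // verify({1}, {1, 2});    // would abort: 2 was scanned but never cached
}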
diff --git a/lib/Analysis/BasicAliasAnalysis.cpp b/lib/Analysis/BasicAliasAnalysis.cpp
index c8d057949493..09582cf9a71d 100644
--- a/lib/Analysis/BasicAliasAnalysis.cpp
+++ b/lib/Analysis/BasicAliasAnalysis.cpp
@@ -127,7 +127,9 @@ static uint64_t getObjectSize(const Value *V, const DataLayout &DL,
const TargetLibraryInfo &TLI,
bool RoundToAlign = false) {
uint64_t Size;
- if (getObjectSize(V, Size, DL, &TLI, RoundToAlign))
+ ObjectSizeOpts Opts;
+ Opts.RoundToAlign = RoundToAlign;
+ if (getObjectSize(V, Size, DL, &TLI, Opts))
return Size;
return MemoryLocation::UnknownSize;
}
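Replacing the trailing bool with ObjectSizeOpts is the usual options-struct migration: call sites set a named field instead of passing a bare true, and new knobs can be added later without touching every caller. A sketch under illustrative names (not the exact ObjectSizeOpts definition):

#include <cstdint>

struct ObjectSizeOpts {
  bool RoundToAlign = false;
  bool NullIsUnknownSize = false;
};

uint64_t getObjectSize(const void *V, const ObjectSizeOpts &Opts) {
  uint64_t Size = 16; // stand-in for the real computation
  if (Opts.RoundToAlign)
    Size = (Size + 7) & ~uint64_t(7); // round up to an 8-byte boundary
  return Size;
}

int main() {
  ObjectSizeOpts Opts;
  Opts.RoundToAlign = true; // self-documenting, unlike getObjectSize(V, true)
  return getObjectSize(nullptr, Opts) == 16 ? 0 : 1;
}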
@@ -635,7 +637,7 @@ FunctionModRefBehavior BasicAAResult::getModRefBehavior(const Function *F) {
/// Returns true if this is a writeonly (i.e Mod only) parameter.
static bool isWriteOnlyParam(ImmutableCallSite CS, unsigned ArgIdx,
const TargetLibraryInfo &TLI) {
- if (CS.paramHasAttr(ArgIdx + 1, Attribute::WriteOnly))
+ if (CS.paramHasAttr(ArgIdx, Attribute::WriteOnly))
return true;
// We can bound the aliasing properties of memset_pattern16 just as we can
@@ -644,9 +646,9 @@ static bool isWriteOnlyParam(ImmutableCallSite CS, unsigned ArgIdx,
// whenever possible.
// FIXME Consider handling this in InferFunctionAttr.cpp together with other
// attributes.
- LibFunc::Func F;
+ LibFunc F;
if (CS.getCalledFunction() && TLI.getLibFunc(*CS.getCalledFunction(), F) &&
- F == LibFunc::memset_pattern16 && TLI.has(F))
+ F == LibFunc_memset_pattern16 && TLI.has(F))
if (ArgIdx == 0)
return true;
@@ -664,10 +666,10 @@ ModRefInfo BasicAAResult::getArgModRefInfo(ImmutableCallSite CS,
if (isWriteOnlyParam(CS, ArgIdx, TLI))
return MRI_Mod;
- if (CS.paramHasAttr(ArgIdx + 1, Attribute::ReadOnly))
+ if (CS.paramHasAttr(ArgIdx, Attribute::ReadOnly))
return MRI_Ref;
- if (CS.paramHasAttr(ArgIdx + 1, Attribute::ReadNone))
+ if (CS.paramHasAttr(ArgIdx, Attribute::ReadNone))
return MRI_NoModRef;
return AAResultBase::getArgModRefInfo(CS, ArgIdx);
@@ -749,7 +751,11 @@ ModRefInfo BasicAAResult::getModRefInfo(ImmutableCallSite CS,
// as an argument, and itself doesn't capture it.
if (!isa<Constant>(Object) && CS.getInstruction() != Object &&
isNonEscapingLocalObject(Object)) {
- bool PassedAsArg = false;
+
+    // Optimistically assume that the call doesn't touch Object, and check
+    // this assumption in the loop below.
+ ModRefInfo Result = MRI_NoModRef;
+
unsigned OperandNo = 0;
for (auto CI = CS.data_operands_begin(), CE = CS.data_operands_end();
CI != CE; ++CI, ++OperandNo) {
@@ -761,20 +767,38 @@ ModRefInfo BasicAAResult::getModRefInfo(ImmutableCallSite CS,
OperandNo < CS.getNumArgOperands() && !CS.isByValArgument(OperandNo)))
continue;
+      // The call doesn't access memory through this operand, so we don't
+      // care whether it aliases Object.
+ if (CS.doesNotAccessMemory(OperandNo))
+ continue;
+
// If this is a no-capture pointer argument, see if we can tell that it
- // is impossible to alias the pointer we're checking. If not, we have to
- // assume that the call could touch the pointer, even though it doesn't
- // escape.
+ // is impossible to alias the pointer we're checking.
AliasResult AR =
getBestAAResults().alias(MemoryLocation(*CI), MemoryLocation(Object));
- if (AR) {
- PassedAsArg = true;
- break;
+
+      // Operand doesn't alias 'Object'; continue looking for other aliases.
+ if (AR == NoAlias)
+ continue;
+      // Operand aliases 'Object', but the call doesn't modify it. Strengthen
+      // the initial assumption and keep looking in case there are more aliases.
+ if (CS.onlyReadsMemory(OperandNo)) {
+ Result = static_cast<ModRefInfo>(Result | MRI_Ref);
+ continue;
+ }
+      // Operand aliases 'Object', but the call only writes into it.
+ if (CS.doesNotReadMemory(OperandNo)) {
+ Result = static_cast<ModRefInfo>(Result | MRI_Mod);
+ continue;
}
+      // This operand aliases 'Object', and the call both reads and writes it.
+ Result = MRI_ModRef;
+ break;
}
- if (!PassedAsArg)
- return MRI_NoModRef;
+    // Return early if we improved the mod/ref information.
+ if (Result != MRI_ModRef)
+ return Result;
}
// If the CallSite is to malloc or calloc, we can assume that it doesn't
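The rewritten loop above is a small lattice computation: start from the optimistic bottom element MRI_NoModRef, OR in MRI_Ref or MRI_Mod for each aliasing operand, and bail out once the top element MRI_ModRef is reached. A minimal standalone model of that accumulation (simplified operand flags, not the real AA query):

#include <cassert>
#include <vector>

enum ModRefInfo { MRI_NoModRef = 0, MRI_Ref = 1, MRI_Mod = 2, MRI_ModRef = 3 };

struct Operand {
  bool AliasesObject;
  bool OnlyReads;
  bool OnlyWrites;
};

ModRefInfo computeModRef(const std::vector<Operand> &Ops) {
  ModRefInfo Result = MRI_NoModRef; // optimistic starting point
  for (const Operand &Op : Ops) {
    if (!Op.AliasesObject)
      continue; // this operand cannot touch the object
    if (Op.OnlyReads)
      Result = static_cast<ModRefInfo>(Result | MRI_Ref);
    else if (Op.OnlyWrites)
      Result = static_cast<ModRefInfo>(Result | MRI_Mod);
    else
      return MRI_ModRef; // read-write operand: no better answer possible
  }
  return Result;
}

int main() {
  assert(computeModRef({{true, true, false}, {false, false, false}}) == MRI_Ref);
  assert(computeModRef({{true, true, false}, {true, false, true}}) == MRI_ModRef);
}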
diff --git a/lib/Analysis/BlockFrequencyInfo.cpp b/lib/Analysis/BlockFrequencyInfo.cpp
index 4cdbe4d0fcf6..07a2a9229fd5 100644
--- a/lib/Analysis/BlockFrequencyInfo.cpp
+++ b/lib/Analysis/BlockFrequencyInfo.cpp
@@ -26,7 +26,6 @@ using namespace llvm;
#define DEBUG_TYPE "block-freq"
-#ifndef NDEBUG
static cl::opt<GVDAGType> ViewBlockFreqPropagationDAG(
"view-block-freq-propagation-dags", cl::Hidden,
cl::desc("Pop up a window to show a dag displaying how block "
@@ -55,8 +54,29 @@ cl::opt<unsigned>
"is no less than the max frequency of the "
"function multiplied by this percent."));
+// Command line option to turn on CFG dot dump after profile annotation.
+cl::opt<bool>
+ PGOViewCounts("pgo-view-counts", cl::init(false), cl::Hidden,
+ cl::desc("A boolean option to show CFG dag with "
+ "block profile counts and branch probabilities "
+ "right after PGO profile annotation step. The "
+ "profile counts are computed using branch "
+ "probabilities from the runtime profile data and "
+ "block frequency propagation algorithm. To view "
+ "the raw counts from the profile, use option "
+ "-pgo-view-raw-counts instead. To limit graph "
+ "display to only one function, use filtering option "
+ "-view-bfi-func-name."));
+
namespace llvm {
+static GVDAGType getGVDT() {
+  // When -pgo-view-counts is in effect, render the DAG with profile counts.
+ if (PGOViewCounts)
+ return GVDT_Count;
+ return ViewBlockFreqPropagationDAG;
+}
+
template <>
struct GraphTraits<BlockFrequencyInfo *> {
typedef const BasicBlock *NodeRef;
@@ -89,8 +109,7 @@ struct DOTGraphTraits<BlockFrequencyInfo *> : public BFIDOTGTraitsBase {
std::string getNodeLabel(const BasicBlock *Node,
const BlockFrequencyInfo *Graph) {
- return BFIDOTGTraitsBase::getNodeLabel(Node, Graph,
- ViewBlockFreqPropagationDAG);
+ return BFIDOTGTraitsBase::getNodeLabel(Node, Graph, getGVDT());
}
std::string getNodeAttributes(const BasicBlock *Node,
@@ -107,7 +126,6 @@ struct DOTGraphTraits<BlockFrequencyInfo *> : public BFIDOTGTraitsBase {
};
} // end namespace llvm
-#endif
BlockFrequencyInfo::BlockFrequencyInfo() {}
@@ -132,19 +150,26 @@ BlockFrequencyInfo &BlockFrequencyInfo::operator=(BlockFrequencyInfo &&RHS) {
// template instantiated which is not available in the header.
BlockFrequencyInfo::~BlockFrequencyInfo() {}
+bool BlockFrequencyInfo::invalidate(Function &F, const PreservedAnalyses &PA,
+ FunctionAnalysisManager::Invalidator &) {
+ // Check whether the analysis, all analyses on functions, or the function's
+ // CFG have been preserved.
+ auto PAC = PA.getChecker<BlockFrequencyAnalysis>();
+ return !(PAC.preserved() || PAC.preservedSet<AllAnalysesOn<Function>>() ||
+ PAC.preservedSet<CFGAnalyses>());
+}
+
void BlockFrequencyInfo::calculate(const Function &F,
const BranchProbabilityInfo &BPI,
const LoopInfo &LI) {
if (!BFI)
BFI.reset(new ImplType);
BFI->calculate(F, BPI, LI);
-#ifndef NDEBUG
if (ViewBlockFreqPropagationDAG != GVDT_None &&
(ViewBlockFreqFuncName.empty() ||
F.getName().equals(ViewBlockFreqFuncName))) {
view();
}
-#endif
}
BlockFrequency BlockFrequencyInfo::getBlockFreq(const BasicBlock *BB) const {
@@ -171,16 +196,32 @@ void BlockFrequencyInfo::setBlockFreq(const BasicBlock *BB, uint64_t Freq) {
BFI->setBlockFreq(BB, Freq);
}
+void BlockFrequencyInfo::setBlockFreqAndScale(
+ const BasicBlock *ReferenceBB, uint64_t Freq,
+ SmallPtrSetImpl<BasicBlock *> &BlocksToScale) {
+ assert(BFI && "Expected analysis to be available");
+ // Use 128 bits APInt to avoid overflow.
+ APInt NewFreq(128, Freq);
+ APInt OldFreq(128, BFI->getBlockFreq(ReferenceBB).getFrequency());
+ APInt BBFreq(128, 0);
+ for (auto *BB : BlocksToScale) {
+ BBFreq = BFI->getBlockFreq(BB).getFrequency();
+ // Multiply first by NewFreq and then divide by OldFreq
+ // to minimize loss of precision.
+ BBFreq *= NewFreq;
+ // udiv is an expensive operation in the general case. If this ends up being
+ // a hot spot, one of the options proposed in
+ // https://reviews.llvm.org/D28535#650071 could be used to avoid this.
+ BBFreq = BBFreq.udiv(OldFreq);
+ BFI->setBlockFreq(BB, BBFreq.getLimitedValue());
+ }
+ BFI->setBlockFreq(ReferenceBB, Freq);
+}
+
/// Pop up a ghostview window with the current block frequency propagation
/// rendered using dot.
void BlockFrequencyInfo::view() const {
-// This code is only for debugging.
-#ifndef NDEBUG
ViewGraph(const_cast<BlockFrequencyInfo *>(this), "BlockFrequencyDAGs");
-#else
- errs() << "BlockFrequencyInfo::view is only available in debug builds on "
- "systems with Graphviz or gv!\n";
-#endif // NDEBUG
}
const Function *BlockFrequencyInfo::getFunction() const {
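The scaling in setBlockFreqAndScale multiplies before dividing and does the arithmetic in 128 bits, so the product of two 64-bit frequencies cannot overflow. A standalone sketch of the same math, using unsigned __int128 (a GCC/Clang extension on 64-bit targets) as a stand-in for the 128-bit APInt:

#include <cassert>
#include <cstdint>

// Multiply first, divide second, with a 128-bit intermediate so
// BBFreq * NewFreq cannot wrap even when both are large 64-bit counts.
static uint64_t scaleFreq(uint64_t BBFreq, uint64_t NewFreq, uint64_t OldFreq) {
  unsigned __int128 Wide = BBFreq;
  Wide *= NewFreq;
  Wide /= OldFreq; // the udiv the in-tree comment calls out as expensive
  return static_cast<uint64_t>(Wide);
}

int main() {
  // The product here is 2^80; a 64-bit intermediate would overflow.
  assert(scaleFreq(1ull << 40, 1ull << 40, 1ull << 40) == (1ull << 40));
  assert(scaleFreq(300, 50, 100) == 150); // reference frequency halved
}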
diff --git a/lib/Analysis/BlockFrequencyInfoImpl.cpp b/lib/Analysis/BlockFrequencyInfoImpl.cpp
index 9850e02fca22..e5d8c3347c16 100644
--- a/lib/Analysis/BlockFrequencyInfoImpl.cpp
+++ b/lib/Analysis/BlockFrequencyInfoImpl.cpp
@@ -28,7 +28,9 @@ ScaledNumber<uint64_t> BlockMass::toScaled() const {
return ScaledNumber<uint64_t>(getMass() + 1, -64);
}
+#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
LLVM_DUMP_METHOD void BlockMass::dump() const { print(dbgs()); }
+#endif
static char getHexDigit(int N) {
assert(N < 16);
diff --git a/lib/Analysis/BranchProbabilityInfo.cpp b/lib/Analysis/BranchProbabilityInfo.cpp
index 3eabb780398c..5935dec15c70 100644
--- a/lib/Analysis/BranchProbabilityInfo.cpp
+++ b/lib/Analysis/BranchProbabilityInfo.cpp
@@ -108,11 +108,9 @@ static const uint32_t IH_TAKEN_WEIGHT = 1024 * 1024 - 1;
/// instruction. This is essentially never taken.
static const uint32_t IH_NONTAKEN_WEIGHT = 1;
-/// \brief Calculate edge weights for successors lead to unreachable.
-///
-/// Predict that a successor which leads necessarily to an
-/// unreachable-terminated block as extremely unlikely.
-bool BranchProbabilityInfo::calcUnreachableHeuristics(const BasicBlock *BB) {
+/// \brief Add \p BB to PostDominatedByUnreachable set if applicable.
+void
+BranchProbabilityInfo::updatePostDominatedByUnreachable(const BasicBlock *BB) {
const TerminatorInst *TI = BB->getTerminator();
if (TI->getNumSuccessors() == 0) {
if (isa<UnreachableInst>(TI) ||
@@ -122,38 +120,86 @@ bool BranchProbabilityInfo::calcUnreachableHeuristics(const BasicBlock *BB) {
// never execute.
BB->getTerminatingDeoptimizeCall())
PostDominatedByUnreachable.insert(BB);
- return false;
+ return;
+ }
+
+  // If the terminator is an InvokeInst, check only the normal destination
+  // block, as the unwind edge of an InvokeInst is very unlikely to be taken.
+ if (auto *II = dyn_cast<InvokeInst>(TI)) {
+ if (PostDominatedByUnreachable.count(II->getNormalDest()))
+ PostDominatedByUnreachable.insert(BB);
+ return;
}
+ for (auto *I : successors(BB))
+    // If any successor is not post-dominated, then BB is not either.
+ if (!PostDominatedByUnreachable.count(I))
+ return;
+
+ PostDominatedByUnreachable.insert(BB);
+}
+
+/// \brief Add \p BB to PostDominatedByColdCall set if applicable.
+void
+BranchProbabilityInfo::updatePostDominatedByColdCall(const BasicBlock *BB) {
+ assert(!PostDominatedByColdCall.count(BB));
+ const TerminatorInst *TI = BB->getTerminator();
+ if (TI->getNumSuccessors() == 0)
+ return;
+
+  // If all successors are post-dominated, then so is BB.
+ if (llvm::all_of(successors(BB), [&](const BasicBlock *SuccBB) {
+ return PostDominatedByColdCall.count(SuccBB);
+ })) {
+ PostDominatedByColdCall.insert(BB);
+ return;
+ }
+
+  // If the terminator is an InvokeInst, check only the normal destination
+  // block, as the unwind edge of an InvokeInst is very unlikely to be taken.
+ if (auto *II = dyn_cast<InvokeInst>(TI))
+ if (PostDominatedByColdCall.count(II->getNormalDest())) {
+ PostDominatedByColdCall.insert(BB);
+ return;
+ }
+
+ // Otherwise, if the block itself contains a cold function, add it to the
+ // set of blocks post-dominated by a cold call.
+ for (auto &I : *BB)
+ if (const CallInst *CI = dyn_cast<CallInst>(&I))
+ if (CI->hasFnAttr(Attribute::Cold)) {
+ PostDominatedByColdCall.insert(BB);
+ return;
+ }
+}
+
+/// \brief Calculate edge weights for successors that lead to unreachable blocks.
+///
+/// Predict that a successor which necessarily leads to an
+/// unreachable-terminated block is taken extremely rarely.
+bool BranchProbabilityInfo::calcUnreachableHeuristics(const BasicBlock *BB) {
+ const TerminatorInst *TI = BB->getTerminator();
+ if (TI->getNumSuccessors() == 0)
+ return false;
+
SmallVector<unsigned, 4> UnreachableEdges;
SmallVector<unsigned, 4> ReachableEdges;
- for (succ_const_iterator I = succ_begin(BB), E = succ_end(BB); I != E; ++I) {
+ for (succ_const_iterator I = succ_begin(BB), E = succ_end(BB); I != E; ++I)
if (PostDominatedByUnreachable.count(*I))
UnreachableEdges.push_back(I.getSuccessorIndex());
else
ReachableEdges.push_back(I.getSuccessorIndex());
- }
-
- // If all successors are in the set of blocks post-dominated by unreachable,
- // this block is too.
- if (UnreachableEdges.size() == TI->getNumSuccessors())
- PostDominatedByUnreachable.insert(BB);
// Skip probabilities if this block has a single successor or if all were
// reachable.
if (TI->getNumSuccessors() == 1 || UnreachableEdges.empty())
return false;
- // If the terminator is an InvokeInst, check only the normal destination block
- // as the unwind edge of InvokeInst is also very unlikely taken.
- if (auto *II = dyn_cast<InvokeInst>(TI))
- if (PostDominatedByUnreachable.count(II->getNormalDest())) {
- PostDominatedByUnreachable.insert(BB);
- // Return false here so that edge weights for InvokeInst could be decided
- // in calcInvokeHeuristics().
- return false;
- }
+  // Return false here so that edge weights for InvokeInst can be decided
+  // in calcInvokeHeuristics().
+ if (isa<InvokeInst>(TI))
+ return false;
if (ReachableEdges.empty()) {
BranchProbability Prob(1, UnreachableEdges.size());
@@ -263,31 +309,10 @@ bool BranchProbabilityInfo::calcColdCallHeuristics(const BasicBlock *BB) {
else
NormalEdges.push_back(I.getSuccessorIndex());
- // If all successors are in the set of blocks post-dominated by cold calls,
- // this block is in the set post-dominated by cold calls.
- if (ColdEdges.size() == TI->getNumSuccessors())
- PostDominatedByColdCall.insert(BB);
- else {
- // Otherwise, if the block itself contains a cold function, add it to the
- // set of blocks postdominated by a cold call.
- assert(!PostDominatedByColdCall.count(BB));
- for (BasicBlock::const_iterator I = BB->begin(), E = BB->end(); I != E; ++I)
- if (const CallInst *CI = dyn_cast<CallInst>(I))
- if (CI->hasFnAttr(Attribute::Cold)) {
- PostDominatedByColdCall.insert(BB);
- break;
- }
- }
-
- if (auto *II = dyn_cast<InvokeInst>(TI)) {
- // If the terminator is an InvokeInst, consider only the normal destination
- // block.
- if (PostDominatedByColdCall.count(II->getNormalDest()))
- PostDominatedByColdCall.insert(BB);
- // Return false here so that edge weights for InvokeInst could be decided
- // in calcInvokeHeuristics().
+  // Return false here so that edge weights for InvokeInst can be decided
+  // in calcInvokeHeuristics().
+ if (isa<InvokeInst>(TI))
return false;
- }
// Skip probabilities if this block has a single successor.
if (TI->getNumSuccessors() == 1 || ColdEdges.empty())
@@ -671,6 +696,8 @@ void BranchProbabilityInfo::calculate(const Function &F, const LoopInfo &LI) {
// the successors of a block iteratively.
for (auto BB : post_order(&F.getEntryBlock())) {
DEBUG(dbgs() << "Computing probabilities for " << BB->getName() << "\n");
+ updatePostDominatedByUnreachable(BB);
+ updatePostDominatedByColdCall(BB);
if (calcUnreachableHeuristics(BB))
continue;
if (calcMetadataWeights(BB))
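The refactoring above separates set maintenance from the heuristics: because calculate() walks blocks in post-order, every successor is processed before its predecessor, so updatePostDominatedByUnreachable can use a simple all-successors membership test. A toy illustration with integers standing in for basic blocks:

#include <cassert>
#include <set>
#include <vector>

int main() {
  // CFG: 0 -> {1, 2}, 1 -> {3}, 2 -> {3}, 3 -> {} (3 ends in unreachable).
  std::vector<std::vector<int>> Succs = {{1, 2}, {3}, {3}, {}};
  std::set<int> PostDominatedByUnreachable;

  // A valid post-order for this CFG: every successor before its predecessor.
  for (int BB : {3, 1, 2, 0}) {
    if (Succs[BB].empty()) {
      PostDominatedByUnreachable.insert(BB); // terminator is unreachable here
      continue;
    }
    bool All = true;
    for (int S : Succs[BB])
      All = All && PostDominatedByUnreachable.count(S) != 0;
    if (All)
      PostDominatedByUnreachable.insert(BB);
  }
  // Every block funnels into the unreachable-terminated block 3.
  assert(PostDominatedByUnreachable.size() == 4);
}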
diff --git a/lib/Analysis/CFLAndersAliasAnalysis.cpp b/lib/Analysis/CFLAndersAliasAnalysis.cpp
index e48ff230f43c..ddd5123d0eff 100644
--- a/lib/Analysis/CFLAndersAliasAnalysis.cpp
+++ b/lib/Analysis/CFLAndersAliasAnalysis.cpp
@@ -307,7 +307,7 @@ class CFLAndersAAResult::FunctionInfo {
public:
FunctionInfo(const Function &, const SmallVectorImpl<Value *> &,
- const ReachabilitySet &, AliasAttrMap);
+ const ReachabilitySet &, const AliasAttrMap &);
bool mayAlias(const Value *, uint64_t, const Value *, uint64_t) const;
const AliasSummary &getAliasSummary() const { return Summary; }
@@ -470,7 +470,7 @@ static void populateExternalAttributes(
CFLAndersAAResult::FunctionInfo::FunctionInfo(
const Function &Fn, const SmallVectorImpl<Value *> &RetVals,
- const ReachabilitySet &ReachSet, AliasAttrMap AMap) {
+ const ReachabilitySet &ReachSet, const AliasAttrMap &AMap) {
populateAttrMap(AttrMap, AMap);
populateExternalAttributes(Summary.RetParamAttributes, Fn, RetVals, AMap);
populateAliasMap(AliasMap, ReachSet);
diff --git a/lib/Analysis/CGSCCPassManager.cpp b/lib/Analysis/CGSCCPassManager.cpp
index 054bdc45ad67..9d4521221f47 100644
--- a/lib/Analysis/CGSCCPassManager.cpp
+++ b/lib/Analysis/CGSCCPassManager.cpp
@@ -117,6 +117,7 @@ bool CGSCCAnalysisManagerModuleProxy::Result::invalidate(
PA.allAnalysesInSetPreserved<AllAnalysesOn<LazyCallGraph::SCC>>();
// Ok, we have a graph, so we can propagate the invalidation down into it.
+ G->buildRefSCCs();
for (auto &RC : G->postorder_ref_sccs())
for (auto &C : RC) {
Optional<PreservedAnalyses> InnerPA;
@@ -273,9 +274,9 @@ LazyCallGraph::SCC &llvm::updateCGAndAnalysisManagerForFunctionPass(
// demoted edges.
SmallVector<Constant *, 16> Worklist;
SmallPtrSet<Constant *, 16> Visited;
- SmallPtrSet<Function *, 16> RetainedEdges;
- SmallSetVector<Function *, 4> PromotedRefTargets;
- SmallSetVector<Function *, 4> DemotedCallTargets;
+ SmallPtrSet<Node *, 16> RetainedEdges;
+ SmallSetVector<Node *, 4> PromotedRefTargets;
+ SmallSetVector<Node *, 4> DemotedCallTargets;
// First walk the function and handle all called functions. We do this first
// because if there is a single call edge, whether there are ref edges is
@@ -284,7 +285,8 @@ LazyCallGraph::SCC &llvm::updateCGAndAnalysisManagerForFunctionPass(
if (auto CS = CallSite(&I))
if (Function *Callee = CS.getCalledFunction())
if (Visited.insert(Callee).second && !Callee->isDeclaration()) {
- const Edge *E = N.lookup(*Callee);
+ Node &CalleeN = *G.lookup(*Callee);
+ Edge *E = N->lookup(CalleeN);
// FIXME: We should really handle adding new calls. While it will
// make downstream usage more complex, there is no fundamental
// limitation and it will allow passes within the CGSCC to be a bit
@@ -293,9 +295,9 @@ LazyCallGraph::SCC &llvm::updateCGAndAnalysisManagerForFunctionPass(
assert(E && "No function transformations should introduce *new* "
"call edges! Any new calls should be modeled as "
"promoted existing ref edges!");
- RetainedEdges.insert(Callee);
+ RetainedEdges.insert(&CalleeN);
if (!E->isCall())
- PromotedRefTargets.insert(Callee);
+ PromotedRefTargets.insert(&CalleeN);
}
// Now walk all references.
@@ -306,24 +308,25 @@ LazyCallGraph::SCC &llvm::updateCGAndAnalysisManagerForFunctionPass(
Worklist.push_back(C);
LazyCallGraph::visitReferences(Worklist, Visited, [&](Function &Referee) {
- const Edge *E = N.lookup(Referee);
+ Node &RefereeN = *G.lookup(Referee);
+ Edge *E = N->lookup(RefereeN);
// FIXME: Similarly to new calls, we also currently preclude
// introducing new references. See above for details.
assert(E && "No function transformations should introduce *new* ref "
"edges! Any new ref edges would require IPO which "
"function passes aren't allowed to do!");
- RetainedEdges.insert(&Referee);
+ RetainedEdges.insert(&RefereeN);
if (E->isCall())
- DemotedCallTargets.insert(&Referee);
+ DemotedCallTargets.insert(&RefereeN);
});
// First remove all of the edges that are no longer present in this function.
// We have to build a list of dead targets first and then remove them as the
// data structures will all be invalidated by removing them.
SmallVector<PointerIntPair<Node *, 1, Edge::Kind>, 4> DeadTargets;
- for (Edge &E : N)
- if (!RetainedEdges.count(&E.getFunction()))
- DeadTargets.push_back({E.getNode(), E.getKind()});
+ for (Edge &E : *N)
+ if (!RetainedEdges.count(&E.getNode()))
+ DeadTargets.push_back({&E.getNode(), E.getKind()});
for (auto DeadTarget : DeadTargets) {
Node &TargetN = *DeadTarget.getPointer();
bool IsCall = DeadTarget.getInt() == Edge::Call;
@@ -397,9 +400,8 @@ LazyCallGraph::SCC &llvm::updateCGAndAnalysisManagerForFunctionPass(
// Next demote all the call edges that are now ref edges. This helps make
// the SCCs small which should minimize the work below as we don't want to
// form cycles that this would break.
- for (Function *RefTarget : DemotedCallTargets) {
- Node &TargetN = *G.lookup(*RefTarget);
- SCC &TargetC = *G.lookupSCC(TargetN);
+ for (Node *RefTarget : DemotedCallTargets) {
+ SCC &TargetC = *G.lookupSCC(*RefTarget);
RefSCC &TargetRC = TargetC.getOuterRefSCC();
// The easy case is when the target RefSCC is not this RefSCC. This is
@@ -407,10 +409,10 @@ LazyCallGraph::SCC &llvm::updateCGAndAnalysisManagerForFunctionPass(
if (&TargetRC != RC) {
assert(RC->isAncestorOf(TargetRC) &&
"Cannot potentially form RefSCC cycles here!");
- RC->switchOutgoingEdgeToRef(N, TargetN);
+ RC->switchOutgoingEdgeToRef(N, *RefTarget);
if (DebugLogging)
dbgs() << "Switch outgoing call edge to a ref edge from '" << N
- << "' to '" << TargetN << "'\n";
+ << "' to '" << *RefTarget << "'\n";
continue;
}
@@ -418,7 +420,7 @@ LazyCallGraph::SCC &llvm::updateCGAndAnalysisManagerForFunctionPass(
// some SCCs.
if (C != &TargetC) {
// For separate SCCs this is trivial.
- RC->switchTrivialInternalEdgeToRef(N, TargetN);
+ RC->switchTrivialInternalEdgeToRef(N, *RefTarget);
continue;
}
@@ -430,14 +432,13 @@ LazyCallGraph::SCC &llvm::updateCGAndAnalysisManagerForFunctionPass(
// structure is changed.
AM.invalidate(*C, PreservedAnalyses::none());
// Now update the call graph.
- C = incorporateNewSCCRange(RC->switchInternalEdgeToRef(N, TargetN), G,
- N, C, AM, UR, DebugLogging);
+ C = incorporateNewSCCRange(RC->switchInternalEdgeToRef(N, *RefTarget), G, N,
+ C, AM, UR, DebugLogging);
}
// Now promote ref edges into call edges.
- for (Function *CallTarget : PromotedRefTargets) {
- Node &TargetN = *G.lookup(*CallTarget);
- SCC &TargetC = *G.lookupSCC(TargetN);
+ for (Node *CallTarget : PromotedRefTargets) {
+ SCC &TargetC = *G.lookupSCC(*CallTarget);
RefSCC &TargetRC = TargetC.getOuterRefSCC();
// The easy case is when the target RefSCC is not this RefSCC. This is
@@ -445,22 +446,22 @@ LazyCallGraph::SCC &llvm::updateCGAndAnalysisManagerForFunctionPass(
if (&TargetRC != RC) {
assert(RC->isAncestorOf(TargetRC) &&
"Cannot potentially form RefSCC cycles here!");
- RC->switchOutgoingEdgeToCall(N, TargetN);
+ RC->switchOutgoingEdgeToCall(N, *CallTarget);
if (DebugLogging)
dbgs() << "Switch outgoing ref edge to a call edge from '" << N
- << "' to '" << TargetN << "'\n";
+ << "' to '" << *CallTarget << "'\n";
continue;
}
if (DebugLogging)
dbgs() << "Switch an internal ref edge to a call edge from '" << N
- << "' to '" << TargetN << "'\n";
+ << "' to '" << *CallTarget << "'\n";
// Otherwise we are switching an internal ref edge to a call edge. This
// may merge away some SCCs, and we add those to the UpdateResult. We also
// need to make sure to update the worklist in the event SCCs have moved
// before the current one in the post-order sequence.
auto InitialSCCIndex = RC->find(*C) - RC->begin();
- auto InvalidatedSCCs = RC->switchInternalEdgeToCall(N, TargetN);
+ auto InvalidatedSCCs = RC->switchInternalEdgeToCall(N, *CallTarget);
if (!InvalidatedSCCs.empty()) {
C = &TargetC;
assert(G.lookupSCC(N) == C && "Failed to update current SCC!");
diff --git a/lib/Analysis/CMakeLists.txt b/lib/Analysis/CMakeLists.txt
index d53364373d7b..161709a48466 100644
--- a/lib/Analysis/CMakeLists.txt
+++ b/lib/Analysis/CMakeLists.txt
@@ -53,6 +53,8 @@ add_llvm_library(LLVMAnalysis
MemoryBuiltins.cpp
MemoryDependenceAnalysis.cpp
MemoryLocation.cpp
+ MemorySSA.cpp
+ MemorySSAUpdater.cpp
ModuleDebugInfoPrinter.cpp
ModuleSummaryAnalysis.cpp
ObjCARCAliasAnalysis.cpp
diff --git a/lib/Analysis/CallGraph.cpp b/lib/Analysis/CallGraph.cpp
index 458b7bfae959..6942176ae6ae 100644
--- a/lib/Analysis/CallGraph.cpp
+++ b/lib/Analysis/CallGraph.cpp
@@ -125,8 +125,9 @@ void CallGraph::print(raw_ostream &OS) const {
CN->print(OS);
}
-LLVM_DUMP_METHOD
-void CallGraph::dump() const { print(dbgs()); }
+#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
+LLVM_DUMP_METHOD void CallGraph::dump() const { print(dbgs()); }
+#endif
// removeFunctionFromModule - Unlink the function from this module, returning
// it. Because this removes the function from the module, the call graph node
@@ -194,8 +195,9 @@ void CallGraphNode::print(raw_ostream &OS) const {
OS << '\n';
}
-LLVM_DUMP_METHOD
-void CallGraphNode::dump() const { print(dbgs()); }
+#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
+LLVM_DUMP_METHOD void CallGraphNode::dump() const { print(dbgs()); }
+#endif
/// removeCallEdgeFor - This method removes the edge in the node for the
/// specified call site. Note that this method takes linear time, so it
@@ -307,8 +309,10 @@ void CallGraphWrapperPass::print(raw_ostream &OS, const Module *) const {
G->print(OS);
}
+#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
LLVM_DUMP_METHOD
void CallGraphWrapperPass::dump() const { print(dbgs(), nullptr); }
+#endif
namespace {
struct CallGraphPrinterLegacyPass : public ModulePass {
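These hunks standardize the dump() convention: the method is compiled in for asserts-enabled builds, and release builds can opt back in by defining LLVM_ENABLE_DUMP, instead of shipping dead dump code by default. A minimal illustration of the gating:

#include <cstdio>

struct CallGraphLike {
  void print() const { std::puts("call graph"); }

#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
  // Present in asserts builds, or when LLVM_ENABLE_DUMP is defined.
  void dump() const { print(); }
#endif
};

int main() {
  CallGraphLike CG;
  CG.print();
#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
  CG.dump(); // only callable when the method was compiled in
#endif
}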
diff --git a/lib/Analysis/CallGraphSCCPass.cpp b/lib/Analysis/CallGraphSCCPass.cpp
index 9cef78144150..ea70f5752c61 100644
--- a/lib/Analysis/CallGraphSCCPass.cpp
+++ b/lib/Analysis/CallGraphSCCPass.cpp
@@ -609,16 +609,28 @@ namespace {
}
bool runOnSCC(CallGraphSCC &SCC) override {
- Out << Banner;
+ auto PrintBannerOnce = [&] () {
+ static bool BannerPrinted = false;
+ if (BannerPrinted)
+ return;
+ Out << Banner;
+ BannerPrinted = true;
+ };
for (CallGraphNode *CGN : SCC) {
if (CGN->getFunction()) {
- if (isFunctionInPrintList(CGN->getFunction()->getName()))
+ if (isFunctionInPrintList(CGN->getFunction()->getName())) {
+ PrintBannerOnce();
CGN->getFunction()->print(Out);
- } else
+ }
+ } else if (llvm::isFunctionInPrintList("*")) {
+ PrintBannerOnce();
Out << "\nPrinting <null> Function\n";
+ }
}
return false;
}
+
+ StringRef getPassName() const override { return "Print CallGraph IR"; }
};
} // end anonymous namespace.
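The runOnSCC change defers the banner through a latch-style lambda, so it is printed only if some function actually matches the print filter. A standalone version of the pattern; note that the hunk above latches through a function-local static, so its banner prints once per process rather than once per call, as this sketch does:

#include <iostream>
#include <string>
#include <vector>

static void printSelected(const std::vector<std::string> &Funcs,
                          const std::string &Filter, std::ostream &Out) {
  bool BannerPrinted = false;
  auto PrintBannerOnce = [&] {
    if (BannerPrinted)
      return;
    Out << "*** IR Dump ***\n";
    BannerPrinted = true;
  };
  for (const std::string &F : Funcs)
    if (Filter.empty() || F == Filter) {
      PrintBannerOnce(); // banner appears only if something matches
      Out << F << "\n";
    }
}

int main() {
  printSelected({"foo", "bar"}, "bar", std::cout);     // banner, then bar
  printSelected({"foo", "bar"}, "missing", std::cout); // prints nothing
}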
diff --git a/lib/Analysis/ConstantFolding.cpp b/lib/Analysis/ConstantFolding.cpp
index 73867279abe4..14176dac2104 100644
--- a/lib/Analysis/ConstantFolding.cpp
+++ b/lib/Analysis/ConstantFolding.cpp
@@ -1058,8 +1058,8 @@ ConstantFoldConstantImpl(const Constant *C, const DataLayout &DL,
if (It == FoldedOps.end()) {
if (auto *FoldedC =
ConstantFoldConstantImpl(NewC, DL, TLI, FoldedOps)) {
- NewC = FoldedC;
FoldedOps.insert({NewC, FoldedC});
+ NewC = FoldedC;
} else {
FoldedOps.insert({NewC, NewC});
}
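The two reordered lines fix a subtle caching bug: the map entry must be inserted while NewC still names the original constant; otherwise the cache records {FoldedC, FoldedC} and later lookups of the original miss. A standalone reconstruction of the corrected pattern (integer "constants" and a trivial fold step as stand-ins):

#include <cassert>
#include <map>

static int foldOnce(int C) { return C / 2; } // stand-in for one fold step

static int foldCached(int NewC, std::map<int, int> &FoldedOps) {
  auto It = FoldedOps.find(NewC);
  if (It == FoldedOps.end()) {
    int FoldedC = foldOnce(NewC);
    FoldedOps.insert({NewC, FoldedC}); // insert first, keyed by the original
    NewC = FoldedC;                    // ...then advance to the folded value
  } else {
    NewC = It->second;
  }
  return NewC;
}

int main() {
  std::map<int, int> FoldedOps;
  assert(foldCached(8, FoldedOps) == 4);
  assert(FoldedOps.count(8) == 1); // the original constant is the cache key
}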
@@ -1401,7 +1401,7 @@ bool llvm::canConstantFoldCallTo(const Function *F) {
return true;
default:
return false;
- case 0: break;
+ case Intrinsic::not_intrinsic: break;
}
if (!F->hasName())
@@ -1518,9 +1518,9 @@ Constant *ConstantFoldSSEConvertToInt(const APFloat &Val, bool roundTowardZero,
bool isExact = false;
APFloat::roundingMode mode = roundTowardZero? APFloat::rmTowardZero
: APFloat::rmNearestTiesToEven;
- APFloat::opStatus status = Val.convertToInteger(&UIntVal, ResultWidth,
- /*isSigned=*/true, mode,
- &isExact);
+ APFloat::opStatus status =
+ Val.convertToInteger(makeMutableArrayRef(UIntVal), ResultWidth,
+ /*isSigned=*/true, mode, &isExact);
if (status != APFloat::opOK &&
(!roundTowardZero || status != APFloat::opInexact))
return nullptr;
@@ -1630,6 +1630,8 @@ Constant *ConstantFoldScalarCall(StringRef Name, unsigned IntrinsicID, Type *Ty,
return ConstantFoldFP(sin, V, Ty);
case Intrinsic::cos:
return ConstantFoldFP(cos, V, Ty);
+ case Intrinsic::sqrt:
+ return ConstantFoldFP(sqrt, V, Ty);
}
if (!TLI)
@@ -1637,87 +1639,74 @@ Constant *ConstantFoldScalarCall(StringRef Name, unsigned IntrinsicID, Type *Ty,
switch (Name[0]) {
case 'a':
- if ((Name == "acos" && TLI->has(LibFunc::acos)) ||
- (Name == "acosf" && TLI->has(LibFunc::acosf)))
+ if ((Name == "acos" && TLI->has(LibFunc_acos)) ||
+ (Name == "acosf" && TLI->has(LibFunc_acosf)))
return ConstantFoldFP(acos, V, Ty);
- else if ((Name == "asin" && TLI->has(LibFunc::asin)) ||
- (Name == "asinf" && TLI->has(LibFunc::asinf)))
+ else if ((Name == "asin" && TLI->has(LibFunc_asin)) ||
+ (Name == "asinf" && TLI->has(LibFunc_asinf)))
return ConstantFoldFP(asin, V, Ty);
- else if ((Name == "atan" && TLI->has(LibFunc::atan)) ||
- (Name == "atanf" && TLI->has(LibFunc::atanf)))
+ else if ((Name == "atan" && TLI->has(LibFunc_atan)) ||
+ (Name == "atanf" && TLI->has(LibFunc_atanf)))
return ConstantFoldFP(atan, V, Ty);
break;
case 'c':
- if ((Name == "ceil" && TLI->has(LibFunc::ceil)) ||
- (Name == "ceilf" && TLI->has(LibFunc::ceilf)))
+ if ((Name == "ceil" && TLI->has(LibFunc_ceil)) ||
+ (Name == "ceilf" && TLI->has(LibFunc_ceilf)))
return ConstantFoldFP(ceil, V, Ty);
- else if ((Name == "cos" && TLI->has(LibFunc::cos)) ||
- (Name == "cosf" && TLI->has(LibFunc::cosf)))
+ else if ((Name == "cos" && TLI->has(LibFunc_cos)) ||
+ (Name == "cosf" && TLI->has(LibFunc_cosf)))
return ConstantFoldFP(cos, V, Ty);
- else if ((Name == "cosh" && TLI->has(LibFunc::cosh)) ||
- (Name == "coshf" && TLI->has(LibFunc::coshf)))
+ else if ((Name == "cosh" && TLI->has(LibFunc_cosh)) ||
+ (Name == "coshf" && TLI->has(LibFunc_coshf)))
return ConstantFoldFP(cosh, V, Ty);
break;
case 'e':
- if ((Name == "exp" && TLI->has(LibFunc::exp)) ||
- (Name == "expf" && TLI->has(LibFunc::expf)))
+ if ((Name == "exp" && TLI->has(LibFunc_exp)) ||
+ (Name == "expf" && TLI->has(LibFunc_expf)))
return ConstantFoldFP(exp, V, Ty);
- if ((Name == "exp2" && TLI->has(LibFunc::exp2)) ||
- (Name == "exp2f" && TLI->has(LibFunc::exp2f)))
+ if ((Name == "exp2" && TLI->has(LibFunc_exp2)) ||
+ (Name == "exp2f" && TLI->has(LibFunc_exp2f)))
// Constant fold exp2(x) as pow(2,x) in case the host doesn't have a
// C99 library.
return ConstantFoldBinaryFP(pow, 2.0, V, Ty);
break;
case 'f':
- if ((Name == "fabs" && TLI->has(LibFunc::fabs)) ||
- (Name == "fabsf" && TLI->has(LibFunc::fabsf)))
+ if ((Name == "fabs" && TLI->has(LibFunc_fabs)) ||
+ (Name == "fabsf" && TLI->has(LibFunc_fabsf)))
return ConstantFoldFP(fabs, V, Ty);
- else if ((Name == "floor" && TLI->has(LibFunc::floor)) ||
- (Name == "floorf" && TLI->has(LibFunc::floorf)))
+ else if ((Name == "floor" && TLI->has(LibFunc_floor)) ||
+ (Name == "floorf" && TLI->has(LibFunc_floorf)))
return ConstantFoldFP(floor, V, Ty);
break;
case 'l':
- if ((Name == "log" && V > 0 && TLI->has(LibFunc::log)) ||
- (Name == "logf" && V > 0 && TLI->has(LibFunc::logf)))
+ if ((Name == "log" && V > 0 && TLI->has(LibFunc_log)) ||
+ (Name == "logf" && V > 0 && TLI->has(LibFunc_logf)))
return ConstantFoldFP(log, V, Ty);
- else if ((Name == "log10" && V > 0 && TLI->has(LibFunc::log10)) ||
- (Name == "log10f" && V > 0 && TLI->has(LibFunc::log10f)))
+ else if ((Name == "log10" && V > 0 && TLI->has(LibFunc_log10)) ||
+ (Name == "log10f" && V > 0 && TLI->has(LibFunc_log10f)))
return ConstantFoldFP(log10, V, Ty);
- else if (IntrinsicID == Intrinsic::sqrt &&
- (Ty->isHalfTy() || Ty->isFloatTy() || Ty->isDoubleTy())) {
- if (V >= -0.0)
- return ConstantFoldFP(sqrt, V, Ty);
- else {
- // Unlike the sqrt definitions in C/C++, POSIX, and IEEE-754 - which
- // all guarantee or favor returning NaN - the square root of a
- // negative number is not defined for the LLVM sqrt intrinsic.
- // This is because the intrinsic should only be emitted in place of
- // libm's sqrt function when using "no-nans-fp-math".
- return UndefValue::get(Ty);
- }
- }
break;
case 'r':
- if ((Name == "round" && TLI->has(LibFunc::round)) ||
- (Name == "roundf" && TLI->has(LibFunc::roundf)))
+ if ((Name == "round" && TLI->has(LibFunc_round)) ||
+ (Name == "roundf" && TLI->has(LibFunc_roundf)))
return ConstantFoldFP(round, V, Ty);
case 's':
- if ((Name == "sin" && TLI->has(LibFunc::sin)) ||
- (Name == "sinf" && TLI->has(LibFunc::sinf)))
+ if ((Name == "sin" && TLI->has(LibFunc_sin)) ||
+ (Name == "sinf" && TLI->has(LibFunc_sinf)))
return ConstantFoldFP(sin, V, Ty);
- else if ((Name == "sinh" && TLI->has(LibFunc::sinh)) ||
- (Name == "sinhf" && TLI->has(LibFunc::sinhf)))
+ else if ((Name == "sinh" && TLI->has(LibFunc_sinh)) ||
+ (Name == "sinhf" && TLI->has(LibFunc_sinhf)))
return ConstantFoldFP(sinh, V, Ty);
- else if ((Name == "sqrt" && V >= 0 && TLI->has(LibFunc::sqrt)) ||
- (Name == "sqrtf" && V >= 0 && TLI->has(LibFunc::sqrtf)))
+ else if ((Name == "sqrt" && V >= 0 && TLI->has(LibFunc_sqrt)) ||
+ (Name == "sqrtf" && V >= 0 && TLI->has(LibFunc_sqrtf)))
return ConstantFoldFP(sqrt, V, Ty);
break;
case 't':
- if ((Name == "tan" && TLI->has(LibFunc::tan)) ||
- (Name == "tanf" && TLI->has(LibFunc::tanf)))
+ if ((Name == "tan" && TLI->has(LibFunc_tan)) ||
+ (Name == "tanf" && TLI->has(LibFunc_tanf)))
return ConstantFoldFP(tan, V, Ty);
- else if ((Name == "tanh" && TLI->has(LibFunc::tanh)) ||
- (Name == "tanhf" && TLI->has(LibFunc::tanhf)))
+ else if ((Name == "tanh" && TLI->has(LibFunc_tanh)) ||
+ (Name == "tanhf" && TLI->has(LibFunc_tanhf)))
return ConstantFoldFP(tanh, V, Ty);
break;
default:
@@ -1779,7 +1768,8 @@ Constant *ConstantFoldScalarCall(StringRef Name, unsigned IntrinsicID, Type *Ty,
}
if (isa<UndefValue>(Operands[0])) {
- if (IntrinsicID == Intrinsic::bswap)
+ if (IntrinsicID == Intrinsic::bswap ||
+ IntrinsicID == Intrinsic::bitreverse)
return Operands[0];
return nullptr;
}
@@ -1822,14 +1812,14 @@ Constant *ConstantFoldScalarCall(StringRef Name, unsigned IntrinsicID, Type *Ty,
if (!TLI)
return nullptr;
- if ((Name == "pow" && TLI->has(LibFunc::pow)) ||
- (Name == "powf" && TLI->has(LibFunc::powf)))
+ if ((Name == "pow" && TLI->has(LibFunc_pow)) ||
+ (Name == "powf" && TLI->has(LibFunc_powf)))
return ConstantFoldBinaryFP(pow, Op1V, Op2V, Ty);
- if ((Name == "fmod" && TLI->has(LibFunc::fmod)) ||
- (Name == "fmodf" && TLI->has(LibFunc::fmodf)))
+ if ((Name == "fmod" && TLI->has(LibFunc_fmod)) ||
+ (Name == "fmodf" && TLI->has(LibFunc_fmodf)))
return ConstantFoldBinaryFP(fmod, Op1V, Op2V, Ty);
- if ((Name == "atan2" && TLI->has(LibFunc::atan2)) ||
- (Name == "atan2f" && TLI->has(LibFunc::atan2f)))
+ if ((Name == "atan2" && TLI->has(LibFunc_atan2)) ||
+ (Name == "atan2f" && TLI->has(LibFunc_atan2f)))
return ConstantFoldBinaryFP(atan2, Op1V, Op2V, Ty);
} else if (auto *Op2C = dyn_cast<ConstantInt>(Operands[1])) {
if (IntrinsicID == Intrinsic::powi && Ty->isHalfTy())
@@ -2022,7 +2012,7 @@ bool llvm::isMathLibCallNoop(CallSite CS, const TargetLibraryInfo *TLI) {
if (!F)
return false;
- LibFunc::Func Func;
+ LibFunc Func;
if (!TLI || !TLI->getLibFunc(*F, Func))
return false;
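
The LibFunc churn throughout this file is mechanical. My reconstruction of the two spellings (illustrative, not copied from the headers): the nested LibFunc::Func enum becomes a single top-level enum whose enumerators carry a LibFunc_ prefix, so declarations shrink from LibFunc::Func to plain LibFunc:

// Before: a namespace wrapping an inner enum, used as LibFunc::acos and
// declared as LibFunc::Func F.
namespace before {
namespace LibFunc {
enum Func { acos, acosf /*, ... */ };
}
}

// After: one flat enum, used as LibFunc_acos and declared as LibFunc F.
namespace after {
enum LibFunc { LibFunc_acos, LibFunc_acosf /*, ... */ };
}
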
@@ -2030,20 +2020,20 @@ bool llvm::isMathLibCallNoop(CallSite CS, const TargetLibraryInfo *TLI) {
if (ConstantFP *OpC = dyn_cast<ConstantFP>(CS.getArgOperand(0))) {
const APFloat &Op = OpC->getValueAPF();
switch (Func) {
- case LibFunc::logl:
- case LibFunc::log:
- case LibFunc::logf:
- case LibFunc::log2l:
- case LibFunc::log2:
- case LibFunc::log2f:
- case LibFunc::log10l:
- case LibFunc::log10:
- case LibFunc::log10f:
+ case LibFunc_logl:
+ case LibFunc_log:
+ case LibFunc_logf:
+ case LibFunc_log2l:
+ case LibFunc_log2:
+ case LibFunc_log2f:
+ case LibFunc_log10l:
+ case LibFunc_log10:
+ case LibFunc_log10f:
return Op.isNaN() || (!Op.isZero() && !Op.isNegative());
- case LibFunc::expl:
- case LibFunc::exp:
- case LibFunc::expf:
+ case LibFunc_expl:
+ case LibFunc_exp:
+ case LibFunc_expf:
// FIXME: These boundaries are slightly conservative.
if (OpC->getType()->isDoubleTy())
return Op.compare(APFloat(-745.0)) != APFloat::cmpLessThan &&
@@ -2053,9 +2043,9 @@ bool llvm::isMathLibCallNoop(CallSite CS, const TargetLibraryInfo *TLI) {
Op.compare(APFloat(88.0f)) != APFloat::cmpGreaterThan;
break;
- case LibFunc::exp2l:
- case LibFunc::exp2:
- case LibFunc::exp2f:
+ case LibFunc_exp2l:
+ case LibFunc_exp2:
+ case LibFunc_exp2f:
// FIXME: These boundaries are slightly conservative.
if (OpC->getType()->isDoubleTy())
return Op.compare(APFloat(-1074.0)) != APFloat::cmpLessThan &&
@@ -2065,17 +2055,17 @@ bool llvm::isMathLibCallNoop(CallSite CS, const TargetLibraryInfo *TLI) {
Op.compare(APFloat(127.0f)) != APFloat::cmpGreaterThan;
break;
- case LibFunc::sinl:
- case LibFunc::sin:
- case LibFunc::sinf:
- case LibFunc::cosl:
- case LibFunc::cos:
- case LibFunc::cosf:
+ case LibFunc_sinl:
+ case LibFunc_sin:
+ case LibFunc_sinf:
+ case LibFunc_cosl:
+ case LibFunc_cos:
+ case LibFunc_cosf:
return !Op.isInfinity();
- case LibFunc::tanl:
- case LibFunc::tan:
- case LibFunc::tanf: {
+ case LibFunc_tanl:
+ case LibFunc_tan:
+ case LibFunc_tanf: {
// FIXME: Stop using the host math library.
// FIXME: The computation isn't done in the right precision.
Type *Ty = OpC->getType();
@@ -2086,23 +2076,23 @@ bool llvm::isMathLibCallNoop(CallSite CS, const TargetLibraryInfo *TLI) {
break;
}
- case LibFunc::asinl:
- case LibFunc::asin:
- case LibFunc::asinf:
- case LibFunc::acosl:
- case LibFunc::acos:
- case LibFunc::acosf:
+ case LibFunc_asinl:
+ case LibFunc_asin:
+ case LibFunc_asinf:
+ case LibFunc_acosl:
+ case LibFunc_acos:
+ case LibFunc_acosf:
return Op.compare(APFloat(Op.getSemantics(), "-1")) !=
APFloat::cmpLessThan &&
Op.compare(APFloat(Op.getSemantics(), "1")) !=
APFloat::cmpGreaterThan;
- case LibFunc::sinh:
- case LibFunc::cosh:
- case LibFunc::sinhf:
- case LibFunc::coshf:
- case LibFunc::sinhl:
- case LibFunc::coshl:
+ case LibFunc_sinh:
+ case LibFunc_cosh:
+ case LibFunc_sinhf:
+ case LibFunc_coshf:
+ case LibFunc_sinhl:
+ case LibFunc_coshl:
// FIXME: These boundaries are slightly conservative.
if (OpC->getType()->isDoubleTy())
return Op.compare(APFloat(-710.0)) != APFloat::cmpLessThan &&
@@ -2112,9 +2102,9 @@ bool llvm::isMathLibCallNoop(CallSite CS, const TargetLibraryInfo *TLI) {
Op.compare(APFloat(89.0f)) != APFloat::cmpGreaterThan;
break;
- case LibFunc::sqrtl:
- case LibFunc::sqrt:
- case LibFunc::sqrtf:
+ case LibFunc_sqrtl:
+ case LibFunc_sqrt:
+ case LibFunc_sqrtf:
return Op.isNaN() || Op.isZero() || !Op.isNegative();
// FIXME: Add more functions: sqrt_finite, atanh, expm1, log1p,
@@ -2133,9 +2123,9 @@ bool llvm::isMathLibCallNoop(CallSite CS, const TargetLibraryInfo *TLI) {
const APFloat &Op1 = Op1C->getValueAPF();
switch (Func) {
- case LibFunc::powl:
- case LibFunc::pow:
- case LibFunc::powf: {
+ case LibFunc_powl:
+ case LibFunc_pow:
+ case LibFunc_powf: {
// FIXME: Stop using the host math library.
// FIXME: The computation isn't done in the right precision.
Type *Ty = Op0C->getType();
@@ -2149,9 +2139,9 @@ bool llvm::isMathLibCallNoop(CallSite CS, const TargetLibraryInfo *TLI) {
break;
}
- case LibFunc::fmodl:
- case LibFunc::fmod:
- case LibFunc::fmodf:
+ case LibFunc_fmodl:
+ case LibFunc_fmod:
+ case LibFunc_fmodf:
return Op0.isNaN() || Op1.isNaN() ||
(!Op0.isInfinity() && !Op1.isZero());
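
The exp/exp2 windows above look arbitrary but fall out of double's limits: exp stays finite while x <= log(DBL_MAX) ~ 709.78, and at x = -745 the result still rounds up to the smallest denormal (log of that denormal is ~ -744.44), which is why the FIXMEs call [-745, 709] slightly conservative. A quick host-side check, assuming the usual libm:

#include <cmath>
#include <cstdio>
#include <limits>

int main() {
  // The true saturation points that the [-745, 709] window approximates.
  std::printf("log(max double)   = %.4f\n",
              std::log(std::numeric_limits<double>::max()));        // 709.7827
  std::printf("log(min denormal) = %.4f\n",
              std::log(std::numeric_limits<double>::denorm_min())); // -744.4401
  std::printf("exp(-745)         = %g\n", std::exp(-745.0)); // still nonzero
}
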
diff --git a/lib/Analysis/CostModel.cpp b/lib/Analysis/CostModel.cpp
index 6b77397956cd..32bfea58bf9d 100644
--- a/lib/Analysis/CostModel.cpp
+++ b/lib/Analysis/CostModel.cpp
@@ -447,25 +447,25 @@ unsigned CostModelAnalysis::getInstructionCost(const Instruction *I) const {
case Instruction::Select: {
const SelectInst *SI = cast<SelectInst>(I);
Type *CondTy = SI->getCondition()->getType();
- return TTI->getCmpSelInstrCost(I->getOpcode(), I->getType(), CondTy);
+ return TTI->getCmpSelInstrCost(I->getOpcode(), I->getType(), CondTy, I);
}
case Instruction::ICmp:
case Instruction::FCmp: {
Type *ValTy = I->getOperand(0)->getType();
- return TTI->getCmpSelInstrCost(I->getOpcode(), ValTy);
+ return TTI->getCmpSelInstrCost(I->getOpcode(), ValTy, I->getType(), I);
}
case Instruction::Store: {
const StoreInst *SI = cast<StoreInst>(I);
Type *ValTy = SI->getValueOperand()->getType();
return TTI->getMemoryOpCost(I->getOpcode(), ValTy,
- SI->getAlignment(),
- SI->getPointerAddressSpace());
+ SI->getAlignment(),
+ SI->getPointerAddressSpace(), I);
}
case Instruction::Load: {
const LoadInst *LI = cast<LoadInst>(I);
return TTI->getMemoryOpCost(I->getOpcode(), I->getType(),
- LI->getAlignment(),
- LI->getPointerAddressSpace());
+ LI->getAlignment(),
+ LI->getPointerAddressSpace(), I);
}
case Instruction::ZExt:
case Instruction::SExt:
@@ -481,7 +481,7 @@ unsigned CostModelAnalysis::getInstructionCost(const Instruction *I) const {
case Instruction::BitCast:
case Instruction::AddrSpaceCast: {
Type *SrcTy = I->getOperand(0)->getType();
- return TTI->getCastInstrCost(I->getOpcode(), I->getType(), SrcTy);
+ return TTI->getCastInstrCost(I->getOpcode(), I->getType(), SrcTy, I);
}
case Instruction::ExtractElement: {
const ExtractElementInst * EEI = cast<ExtractElementInst>(I);
@@ -542,9 +542,7 @@ unsigned CostModelAnalysis::getInstructionCost(const Instruction *I) const {
}
case Instruction::Call:
if (const IntrinsicInst *II = dyn_cast<IntrinsicInst>(I)) {
- SmallVector<Value *, 4> Args;
- for (unsigned J = 0, JE = II->getNumArgOperands(); J != JE; ++J)
- Args.push_back(II->getArgOperand(J));
+ SmallVector<Value *, 4> Args(II->arg_operands());
FastMathFlags FMF;
if (auto *FPMO = dyn_cast<FPMathOperator>(II))
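
The argument-gathering change above relies on SmallVector's range constructor: arg_operands() returns an iterator_range, which can seed the vector directly instead of an index-and-push_back loop. A standalone sketch, assuming LLVM's ADT headers:

#include "llvm/ADT/SmallVector.h"
#include "llvm/ADT/iterator_range.h"
#include <iterator>

int countFour() {
  int Buf[] = {1, 2, 3, 4};
  // One-shot construction from any iterator range, mirroring
  // SmallVector<Value *, 4> Args(II->arg_operands());
  llvm::SmallVector<int, 4> Args(
      llvm::make_range(std::begin(Buf), std::end(Buf)));
  return static_cast<int>(Args.size()); // 4
}
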
diff --git a/lib/Analysis/DemandedBits.cpp b/lib/Analysis/DemandedBits.cpp
index 688c1db534c1..151c0b0e6c93 100644
--- a/lib/Analysis/DemandedBits.cpp
+++ b/lib/Analysis/DemandedBits.cpp
@@ -110,6 +110,9 @@ void DemandedBits::determineLiveOperandBits(
// the output.
AB = AOut.byteSwap();
break;
+ case Intrinsic::bitreverse:
+ AB = AOut.reverseBits();
+ break;
case Intrinsic::ctlz:
if (OperandNo == 0) {
// We need some output bits, so we need all bits of the
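
The new bitreverse case is the mirror-image transfer function: if bit k of the output is demanded, bit Width-1-k of the input is, so the alive mask is just the reversed output mask (what APInt::reverseBits computes). A toy check on 32 bits:

#include <cstdint>
#include <cstdio>

// Plain-C++ stand-in for APInt::reverseBits.
static uint32_t reverse32(uint32_t X) {
  uint32_t R = 0;
  for (int I = 0; I < 32; ++I)
    R |= ((X >> I) & 1u) << (31 - I);
  return R;
}

int main() {
  uint32_t AOut = 0x0000000Fu;   // caller only uses the low 4 output bits
  uint32_t AB = reverse32(AOut); // so only the top 4 input bits are alive
  std::printf("alive input mask: 0x%08X\n", AB); // 0xF0000000
}
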
diff --git a/lib/Analysis/DependenceAnalysis.cpp b/lib/Analysis/DependenceAnalysis.cpp
index a332a07ce864..a4672efeedd6 100644
--- a/lib/Analysis/DependenceAnalysis.cpp
+++ b/lib/Analysis/DependenceAnalysis.cpp
@@ -385,9 +385,9 @@ void DependenceInfo::Constraint::setAny(ScalarEvolution *NewSE) {
Kind = Any;
}
-
+#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
// For debugging purposes. Dumps the constraint out to OS.
-void DependenceInfo::Constraint::dump(raw_ostream &OS) const {
+LLVM_DUMP_METHOD void DependenceInfo::Constraint::dump(raw_ostream &OS) const {
if (isEmpty())
OS << " Empty\n";
else if (isAny())
@@ -403,6 +403,7 @@ void DependenceInfo::Constraint::dump(raw_ostream &OS) const {
else
llvm_unreachable("unknown constraint type in Constraint::dump");
}
+#endif
// Updates X with the intersection
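
The guard added here is LLVM's standard dump idiom: the method exists only in asserts builds (or when LLVM_ENABLE_DUMP is defined), and LLVM_DUMP_METHOD marks it noinline/used so the symbol survives for debugger invocation. A standalone sketch with a stand-in attribute macro (clang/gcc syntax assumed):

#include <cstdio>

#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
#define DUMP_METHOD __attribute__((noinline, used)) // stand-in for LLVM_DUMP_METHOD
#endif

struct Constraint {
  bool Empty = true;
#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
  DUMP_METHOD void dump() const {
    std::fputs(Empty ? " Empty\n" : " Distance\n", stderr);
  }
#endif
};

int main() {
  Constraint C;
#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
  C.dump(); // compiled out of release binaries entirely
#endif
}
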
diff --git a/lib/Analysis/DominanceFrontier.cpp b/lib/Analysis/DominanceFrontier.cpp
index 15856c3f8b7a..5b6e2d0476e4 100644
--- a/lib/Analysis/DominanceFrontier.cpp
+++ b/lib/Analysis/DominanceFrontier.cpp
@@ -56,6 +56,16 @@ LLVM_DUMP_METHOD void DominanceFrontierWrapperPass::dump() const {
}
#endif
+/// Handle invalidation explicitly.
+bool DominanceFrontier::invalidate(Function &F, const PreservedAnalyses &PA,
+ FunctionAnalysisManager::Invalidator &) {
+ // Check whether the analysis, all analyses on functions, or the function's
+ // CFG have been preserved.
+ auto PAC = PA.getChecker<DominanceFrontierAnalysis>();
+ return !(PAC.preserved() || PAC.preservedSet<AllAnalysesOn<Function>>() ||
+ PAC.preservedSet<CFGAnalyses>());
+}
+
AnalysisKey DominanceFrontierAnalysis::Key;
DominanceFrontier DominanceFrontierAnalysis::run(Function &F,
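
For the producer side of this contract, a hypothetical new-PM pass (the name is mine) that rewrites instructions but never edits the CFG would report preserveSet<CFGAnalyses>(), and the invalidate() above then returns false so the cached frontier is kept:

#include "llvm/IR/PassManager.h"

// Hypothetical illustration only: a transform that does in-place
// instruction rewrites and therefore leaves every CFG-only analysis valid.
struct NoCFGChangesPass : llvm::PassInfoMixin<NoCFGChangesPass> {
  llvm::PreservedAnalyses run(llvm::Function &F,
                              llvm::FunctionAnalysisManager &) {
    // ... rewrite instructions in F; no blocks or edges are touched ...
    llvm::PreservedAnalyses PA;
    PA.preserveSet<llvm::CFGAnalyses>(); // keeps DominanceFrontier cached
    return PA;
  }
};
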
diff --git a/lib/Analysis/IVUsers.cpp b/lib/Analysis/IVUsers.cpp
index a661b0101e6a..fde805a5fde5 100644
--- a/lib/Analysis/IVUsers.cpp
+++ b/lib/Analysis/IVUsers.cpp
@@ -76,9 +76,8 @@ static bool isInteresting(const SCEV *S, const Instruction *I, const Loop *L,
// An add is interesting if exactly one of its operands is interesting.
if (const SCEVAddExpr *Add = dyn_cast<SCEVAddExpr>(S)) {
bool AnyInterestingYet = false;
- for (SCEVAddExpr::op_iterator OI = Add->op_begin(), OE = Add->op_end();
- OI != OE; ++OI)
- if (isInteresting(*OI, I, L, SE, LI)) {
+ for (const auto *Op : Add->operands())
+ if (isInteresting(Op, I, L, SE, LI)) {
if (AnyInterestingYet)
return false;
AnyInterestingYet = true;
@@ -118,6 +117,50 @@ static bool isSimplifiedLoopNest(BasicBlock *BB, const DominatorTree *DT,
return true;
}
+/// IVUseShouldUsePostIncValue - We have discovered a "User" of an IV expression
+/// and now we need to decide whether the user should use the preinc or post-inc
+/// value. If this user should use the post-inc version of the IV, return true.
+///
+/// Choosing wrong here can break dominance properties (if we choose to use the
+/// post-inc value when we cannot) or it can end up adding extra live-ranges to
+/// the loop, resulting in reg-reg copies (if we use the pre-inc value when we
+/// should use the post-inc value).
+static bool IVUseShouldUsePostIncValue(Instruction *User, Value *Operand,
+ const Loop *L, DominatorTree *DT) {
+ // If the user is in the loop, use the preinc value.
+ if (L->contains(User))
+ return false;
+
+ BasicBlock *LatchBlock = L->getLoopLatch();
+ if (!LatchBlock)
+ return false;
+
+ // Ok, the user is outside of the loop. If it is dominated by the latch
+ // block, use the post-inc value.
+ if (DT->dominates(LatchBlock, User->getParent()))
+ return true;
+
+ // There is one case we have to be careful of: PHI nodes. These little guys
+ // can live in blocks that are not dominated by the latch block, but (since
+ // their uses occur in the predecessor block, not the block the PHI lives in)
+ // should still use the post-inc value. Check for this case now.
+ PHINode *PN = dyn_cast<PHINode>(User);
+ if (!PN || !Operand)
+ return false; // not a phi, not dominated by latch block.
+
+ // Look at all of the uses of Operand by the PHI node. If any use corresponds
+ // to a block that is not dominated by the latch block, give up and use the
+ // preincremented value.
+ for (unsigned i = 0, e = PN->getNumIncomingValues(); i != e; ++i)
+ if (PN->getIncomingValue(i) == Operand &&
+ !DT->dominates(LatchBlock, PN->getIncomingBlock(i)))
+ return false;
+
+ // Okay, all uses of Operand by PN are in predecessor blocks that really are
+ // dominated by the latch block. Use the post-incremented value.
+ return true;
+}
+
/// AddUsersImpl - Inspect the specified instruction. If it is a
/// reducible SCEV, recursively add its users to the IVUsesByStride set and
/// return true. Otherwise, return false.
@@ -208,10 +251,26 @@ bool IVUsers::AddUsersImpl(Instruction *I,
// The regular return value here is discarded; instead of recording
// it, we just recompute it when we need it.
const SCEV *OriginalISE = ISE;
- ISE = TransformForPostIncUse(NormalizeAutodetect,
- ISE, User, I,
- NewUse.PostIncLoops,
- *SE, *DT);
+
+ auto NormalizePred = [&](const SCEVAddRecExpr *AR) {
+ // We only allow affine AddRecs to be normalized, otherwise we would not
+ // be able to correctly denormalize.
+ // e.g. {1,+,3,+,2} == {-2,+,1,+,2} + {3,+,2}
+ // Normalized form: {-2,+,1,+,2}
+ // Denormalized form: {1,+,3,+,2}
+ //
+ // However, denormalization would use a different step expression than
+ // normalization (see getPostIncExpr), generating the wrong final
+ // expression: {-2,+,1,+,2} + {1,+,2} => {-1,+,3,+,2}
+ auto *L = AR->getLoop();
+ bool Result =
+ AR->isAffine() && IVUseShouldUsePostIncValue(User, I, L, DT);
+ if (Result)
+ NewUse.PostIncLoops.insert(L);
+ return Result;
+ };
+
+ ISE = normalizeForPostIncUseIf(ISE, NormalizePred, *SE);
// PostIncNormalization effectively simplifies the expression under
// pre-increment assumptions. Those assumptions (no wrapping) might not
@@ -219,8 +278,7 @@ bool IVUsers::AddUsersImpl(Instruction *I,
// transformation is invertible.
if (OriginalISE != ISE) {
const SCEV *DenormalizedISE =
- TransformForPostIncUse(Denormalize, ISE, User, I,
- NewUse.PostIncLoops, *SE, *DT);
+ denormalizeForPostIncUse(ISE, NewUse.PostIncLoops, *SE);
// If we normalized the expression, but denormalization doesn't give the
// original one, discard this user.
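
The affine-only restriction in the comment can be checked numerically. Writing a chrec {A,+,B,+,C} at iteration N in binomial form as A + B*N + C*N*(N-1)/2, the quadratic example folds back correctly with step {3,+,2} but not with the step {1,+,2} that denormalization would use:

#include <cstdio>

// {A,+,B,+,C} evaluated at iteration N (binomial form).
static long chrec3(long A, long B, long C, long N) {
  return A + B * N + C * N * (N - 1) / 2;
}
// {A,+,B} evaluated at iteration N.
static long chrec2(long A, long B, long N) { return A + B * N; }

int main() {
  for (long N = 0; N < 5; ++N) {
    long Orig = chrec3(1, 3, 2, N);                    // {1,+,3,+,2}
    long Norm = chrec3(-2, 1, 2, N) + chrec2(3, 2, N); // {-2,+,1,+,2} + {3,+,2}
    long Bad = chrec3(-2, 1, 2, N) + chrec2(1, 2, N);  // wrong step {1,+,2}
    std::printf("N=%ld orig=%ld norm=%ld bad=%ld\n", N, Orig, Norm, Bad);
  }
  // Orig == Norm everywhere; Bad matches {-1,+,3,+,2} instead, exactly as
  // the comment above predicts.
}
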
@@ -338,11 +396,8 @@ const SCEV *IVUsers::getReplacementExpr(const IVStrideUse &IU) const {
/// getExpr - Return the expression for the use.
const SCEV *IVUsers::getExpr(const IVStrideUse &IU) const {
- return
- TransformForPostIncUse(Normalize, getReplacementExpr(IU),
- IU.getUser(), IU.getOperandValToReplace(),
- const_cast<PostIncLoopSet &>(IU.getPostIncLoops()),
- *SE, *DT);
+ return normalizeForPostIncUse(getReplacementExpr(IU), IU.getPostIncLoops(),
+ *SE);
}
static const SCEVAddRecExpr *findAddRecForLoop(const SCEV *S, const Loop *L) {
@@ -353,9 +408,8 @@ static const SCEVAddRecExpr *findAddRecForLoop(const SCEV *S, const Loop *L) {
}
if (const SCEVAddExpr *Add = dyn_cast<SCEVAddExpr>(S)) {
- for (SCEVAddExpr::op_iterator I = Add->op_begin(), E = Add->op_end();
- I != E; ++I)
- if (const SCEVAddRecExpr *AR = findAddRecForLoop(*I, L))
+ for (const auto *Op : Add->operands())
+ if (const SCEVAddRecExpr *AR = findAddRecForLoop(Op, L))
return AR;
return nullptr;
}
diff --git a/lib/Analysis/InlineCost.cpp b/lib/Analysis/InlineCost.cpp
index 4109049ecabc..1f8dec2aed80 100644
--- a/lib/Analysis/InlineCost.cpp
+++ b/lib/Analysis/InlineCost.cpp
@@ -18,6 +18,7 @@
#include "llvm/ADT/SmallVector.h"
#include "llvm/ADT/Statistic.h"
#include "llvm/Analysis/AssumptionCache.h"
+#include "llvm/Analysis/BlockFrequencyInfo.h"
#include "llvm/Analysis/CodeMetrics.h"
#include "llvm/Analysis/ConstantFolding.h"
#include "llvm/Analysis/InstructionSimplify.h"
@@ -48,6 +49,11 @@ static cl::opt<int> HintThreshold(
"inlinehint-threshold", cl::Hidden, cl::init(325),
cl::desc("Threshold for inlining functions with inline hint"));
+static cl::opt<int>
+ ColdCallSiteThreshold("inline-cold-callsite-threshold", cl::Hidden,
+ cl::init(45),
+ cl::desc("Threshold for inlining cold callsites"));
+
// We introduce this threshold to help performance of instrumentation-based
// PGO before we actually hook up the inliner with analysis passes such as
// BPI and BFI.
@@ -72,12 +78,18 @@ class CallAnalyzer : public InstVisitor<CallAnalyzer, bool> {
/// Getter for the cache of @llvm.assume intrinsics.
std::function<AssumptionCache &(Function &)> &GetAssumptionCache;
+ /// Getter for BlockFrequencyInfo
+ Optional<function_ref<BlockFrequencyInfo &(Function &)>> &GetBFI;
+
/// Profile summary information.
ProfileSummaryInfo *PSI;
/// The called function.
Function &F;
+ // Cache the DataLayout since we use it a lot.
+ const DataLayout &DL;
+
/// The candidate callsite being analyzed. Please do not use this to do
/// analysis in the caller function; we want the inline cost query to be
/// easily cacheable. Instead, use the cover function paramHasAttr.
@@ -133,9 +145,11 @@ class CallAnalyzer : public InstVisitor<CallAnalyzer, bool> {
void disableSROA(Value *V);
void accumulateSROACost(DenseMap<Value *, int>::iterator CostIt,
int InstructionCost);
- bool isGEPOffsetConstant(GetElementPtrInst &GEP);
+ bool isGEPFree(GetElementPtrInst &GEP);
bool accumulateGEPOffset(GEPOperator &GEP, APInt &Offset);
bool simplifyCallSite(Function *F, CallSite CS);
+ template <typename Callable>
+ bool simplifyInstruction(Instruction &I, Callable Evaluate);
ConstantInt *stripAndComputeInBoundsConstantOffsets(Value *&V);
/// Return true if the given argument to the function being considered for
@@ -202,9 +216,11 @@ class CallAnalyzer : public InstVisitor<CallAnalyzer, bool> {
public:
CallAnalyzer(const TargetTransformInfo &TTI,
std::function<AssumptionCache &(Function &)> &GetAssumptionCache,
+ Optional<function_ref<BlockFrequencyInfo &(Function &)>> &GetBFI,
ProfileSummaryInfo *PSI, Function &Callee, CallSite CSArg,
const InlineParams &Params)
- : TTI(TTI), GetAssumptionCache(GetAssumptionCache), PSI(PSI), F(Callee),
+ : TTI(TTI), GetAssumptionCache(GetAssumptionCache), GetBFI(GetBFI),
+ PSI(PSI), F(Callee), DL(F.getParent()->getDataLayout()),
CandidateCS(CSArg), Params(Params), Threshold(Params.DefaultThreshold),
Cost(0), IsCallerRecursive(false), IsRecursiveCall(false),
ExposesReturnsTwice(false), HasDynamicAlloca(false),
@@ -286,23 +302,11 @@ void CallAnalyzer::accumulateSROACost(DenseMap<Value *, int>::iterator CostIt,
SROACostSavings += InstructionCost;
}
-/// \brief Check whether a GEP's indices are all constant.
-///
-/// Respects any simplified values known during the analysis of this callsite.
-bool CallAnalyzer::isGEPOffsetConstant(GetElementPtrInst &GEP) {
- for (User::op_iterator I = GEP.idx_begin(), E = GEP.idx_end(); I != E; ++I)
- if (!isa<Constant>(*I) && !SimplifiedValues.lookup(*I))
- return false;
-
- return true;
-}
-
/// \brief Accumulate a constant GEP offset into an APInt if possible.
///
/// Returns false if unable to compute the offset for any reason. Respects any
/// simplified values known during the analysis of this callsite.
bool CallAnalyzer::accumulateGEPOffset(GEPOperator &GEP, APInt &Offset) {
- const DataLayout &DL = F.getParent()->getDataLayout();
unsigned IntPtrWidth = DL.getPointerSizeInBits();
assert(IntPtrWidth == Offset.getBitWidth());
@@ -331,13 +335,27 @@ bool CallAnalyzer::accumulateGEPOffset(GEPOperator &GEP, APInt &Offset) {
return true;
}
+/// \brief Use TTI to check whether a GEP is free.
+///
+/// Respects any simplified values known during the analysis of this callsite.
+bool CallAnalyzer::isGEPFree(GetElementPtrInst &GEP) {
+ SmallVector<Value *, 4> Indices;
+ for (User::op_iterator I = GEP.idx_begin(), E = GEP.idx_end(); I != E; ++I)
+ if (Constant *SimpleOp = SimplifiedValues.lookup(*I))
+ Indices.push_back(SimpleOp);
+ else
+ Indices.push_back(*I);
+ return TargetTransformInfo::TCC_Free ==
+ TTI.getGEPCost(GEP.getSourceElementType(), GEP.getPointerOperand(),
+ Indices);
+}
+
bool CallAnalyzer::visitAlloca(AllocaInst &I) {
// Check whether inlining will turn a dynamic alloca into a static
// alloca and handle that case.
if (I.isArrayAllocation()) {
Constant *Size = SimplifiedValues.lookup(I.getArraySize());
if (auto *AllocSize = dyn_cast_or_null<ConstantInt>(Size)) {
- const DataLayout &DL = F.getParent()->getDataLayout();
Type *Ty = I.getAllocatedType();
AllocatedSize = SaturatingMultiplyAdd(
AllocSize->getLimitedValue(), DL.getTypeAllocSize(Ty), AllocatedSize);
@@ -347,7 +365,6 @@ bool CallAnalyzer::visitAlloca(AllocaInst &I) {
// Accumulate the allocated size.
if (I.isStaticAlloca()) {
- const DataLayout &DL = F.getParent()->getDataLayout();
Type *Ty = I.getAllocatedType();
AllocatedSize = SaturatingAdd(DL.getTypeAllocSize(Ty), AllocatedSize);
}
@@ -396,7 +413,7 @@ bool CallAnalyzer::visitGetElementPtr(GetElementPtrInst &I) {
// Non-constant GEPs aren't folded, and disable SROA.
if (SROACandidate)
disableSROA(CostIt);
- return false;
+ return isGEPFree(I);
}
// Add the result as a new mapping to Base + Offset.
@@ -411,7 +428,15 @@ bool CallAnalyzer::visitGetElementPtr(GetElementPtrInst &I) {
}
}
- if (isGEPOffsetConstant(I)) {
+ // Lambda to check whether a GEP's indices are all constant.
+ auto IsGEPOffsetConstant = [&](GetElementPtrInst &GEP) {
+ for (User::op_iterator I = GEP.idx_begin(), E = GEP.idx_end(); I != E; ++I)
+ if (!isa<Constant>(*I) && !SimplifiedValues.lookup(*I))
+ return false;
+ return true;
+ };
+
+ if (IsGEPOffsetConstant(I)) {
if (SROACandidate)
SROAArgValues[&I] = SROAArg;
@@ -422,19 +447,36 @@ bool CallAnalyzer::visitGetElementPtr(GetElementPtrInst &I) {
// Variable GEPs will require math and will disable SROA.
if (SROACandidate)
disableSROA(CostIt);
- return false;
+ return isGEPFree(I);
+}
+
+/// Simplify \p I if its operands are constants and update SimplifiedValues.
+/// \p Evaluate is a callable specific to the instruction type that evaluates
+/// the instruction when all the operands are constants.
+template <typename Callable>
+bool CallAnalyzer::simplifyInstruction(Instruction &I, Callable Evaluate) {
+ SmallVector<Constant *, 2> COps;
+ for (Value *Op : I.operands()) {
+ Constant *COp = dyn_cast<Constant>(Op);
+ if (!COp)
+ COp = SimplifiedValues.lookup(Op);
+ if (!COp)
+ return false;
+ COps.push_back(COp);
+ }
+ auto *C = Evaluate(COps);
+ if (!C)
+ return false;
+ SimplifiedValues[&I] = C;
+ return true;
}
bool CallAnalyzer::visitBitCast(BitCastInst &I) {
// Propagate constants through bitcasts.
- Constant *COp = dyn_cast<Constant>(I.getOperand(0));
- if (!COp)
- COp = SimplifiedValues.lookup(I.getOperand(0));
- if (COp)
- if (Constant *C = ConstantExpr::getBitCast(COp, I.getType())) {
- SimplifiedValues[&I] = C;
- return true;
- }
+ if (simplifyInstruction(I, [&](SmallVectorImpl<Constant *> &COps) {
+ return ConstantExpr::getBitCast(COps[0], I.getType());
+ }))
+ return true;
// Track base/offsets through casts
std::pair<Value *, APInt> BaseAndOffset =
@@ -455,19 +497,14 @@ bool CallAnalyzer::visitBitCast(BitCastInst &I) {
bool CallAnalyzer::visitPtrToInt(PtrToIntInst &I) {
// Propagate constants through ptrtoint.
- Constant *COp = dyn_cast<Constant>(I.getOperand(0));
- if (!COp)
- COp = SimplifiedValues.lookup(I.getOperand(0));
- if (COp)
- if (Constant *C = ConstantExpr::getPtrToInt(COp, I.getType())) {
- SimplifiedValues[&I] = C;
- return true;
- }
+ if (simplifyInstruction(I, [&](SmallVectorImpl<Constant *> &COps) {
+ return ConstantExpr::getPtrToInt(COps[0], I.getType());
+ }))
+ return true;
// Track base/offset pairs when converted to a plain integer provided the
// integer is large enough to represent the pointer.
unsigned IntegerSize = I.getType()->getScalarSizeInBits();
- const DataLayout &DL = F.getParent()->getDataLayout();
if (IntegerSize >= DL.getPointerSizeInBits()) {
std::pair<Value *, APInt> BaseAndOffset =
ConstantOffsetPtrs.lookup(I.getOperand(0));
@@ -492,20 +529,15 @@ bool CallAnalyzer::visitPtrToInt(PtrToIntInst &I) {
bool CallAnalyzer::visitIntToPtr(IntToPtrInst &I) {
// Propagate constants through inttoptr.
- Constant *COp = dyn_cast<Constant>(I.getOperand(0));
- if (!COp)
- COp = SimplifiedValues.lookup(I.getOperand(0));
- if (COp)
- if (Constant *C = ConstantExpr::getIntToPtr(COp, I.getType())) {
- SimplifiedValues[&I] = C;
- return true;
- }
+ if (simplifyInstruction(I, [&](SmallVectorImpl<Constant *> &COps) {
+ return ConstantExpr::getIntToPtr(COps[0], I.getType());
+ }))
+ return true;
// Track base/offset pairs when round-tripped through a pointer without
// modifications provided the integer is not too large.
Value *Op = I.getOperand(0);
unsigned IntegerSize = Op->getType()->getScalarSizeInBits();
- const DataLayout &DL = F.getParent()->getDataLayout();
if (IntegerSize <= DL.getPointerSizeInBits()) {
std::pair<Value *, APInt> BaseAndOffset = ConstantOffsetPtrs.lookup(Op);
if (BaseAndOffset.first)
@@ -523,14 +555,10 @@ bool CallAnalyzer::visitIntToPtr(IntToPtrInst &I) {
bool CallAnalyzer::visitCastInst(CastInst &I) {
// Propagate constants through casts.
- Constant *COp = dyn_cast<Constant>(I.getOperand(0));
- if (!COp)
- COp = SimplifiedValues.lookup(I.getOperand(0));
- if (COp)
- if (Constant *C = ConstantExpr::getCast(I.getOpcode(), COp, I.getType())) {
- SimplifiedValues[&I] = C;
- return true;
- }
+ if (simplifyInstruction(I, [&](SmallVectorImpl<Constant *> &COps) {
+ return ConstantExpr::getCast(I.getOpcode(), COps[0], I.getType());
+ }))
+ return true;
// Disable SROA in the face of arbitrary casts we don't whitelist elsewhere.
disableSROA(I.getOperand(0));
@@ -540,16 +568,10 @@ bool CallAnalyzer::visitCastInst(CastInst &I) {
bool CallAnalyzer::visitUnaryInstruction(UnaryInstruction &I) {
Value *Operand = I.getOperand(0);
- Constant *COp = dyn_cast<Constant>(Operand);
- if (!COp)
- COp = SimplifiedValues.lookup(Operand);
- if (COp) {
- const DataLayout &DL = F.getParent()->getDataLayout();
- if (Constant *C = ConstantFoldInstOperands(&I, COp, DL)) {
- SimplifiedValues[&I] = C;
- return true;
- }
- }
+ if (simplifyInstruction(I, [&](SmallVectorImpl<Constant *> &COps) {
+ return ConstantFoldInstOperands(&I, COps[0], DL);
+ }))
+ return true;
// Disable any SROA on the argument to arbitrary unary operators.
disableSROA(Operand);
@@ -558,8 +580,7 @@ bool CallAnalyzer::visitUnaryInstruction(UnaryInstruction &I) {
}
bool CallAnalyzer::paramHasAttr(Argument *A, Attribute::AttrKind Attr) {
- unsigned ArgNo = A->getArgNo();
- return CandidateCS.paramHasAttr(ArgNo + 1, Attr);
+ return CandidateCS.paramHasAttr(A->getArgNo(), Attr);
}
bool CallAnalyzer::isKnownNonNullInCallee(Value *V) {
@@ -642,16 +663,21 @@ void CallAnalyzer::updateThreshold(CallSite CS, Function &Callee) {
if (Callee.hasFnAttribute(Attribute::InlineHint))
Threshold = MaxIfValid(Threshold, Params.HintThreshold);
if (PSI) {
- uint64_t TotalWeight;
- if (CS.getInstruction()->extractProfTotalWeight(TotalWeight) &&
- PSI->isHotCount(TotalWeight)) {
- Threshold = MaxIfValid(Threshold, Params.HotCallSiteThreshold);
+ BlockFrequencyInfo *CallerBFI = GetBFI ? &((*GetBFI)(*Caller)) : nullptr;
+ if (PSI->isHotCallSite(CS, CallerBFI)) {
+ DEBUG(dbgs() << "Hot callsite.\n");
+ Threshold = Params.HotCallSiteThreshold.getValue();
} else if (PSI->isFunctionEntryHot(&Callee)) {
+ DEBUG(dbgs() << "Hot callee.\n");
// If callsite hotness can not be determined, we may still know
// that the callee is hot and treat it as a weaker hint for threshold
// increase.
Threshold = MaxIfValid(Threshold, Params.HintThreshold);
+ } else if (PSI->isColdCallSite(CS, CallerBFI)) {
+ DEBUG(dbgs() << "Cold callsite.\n");
+ Threshold = MinIfValid(Threshold, Params.ColdCallSiteThreshold);
} else if (PSI->isFunctionEntryCold(&Callee)) {
+ DEBUG(dbgs() << "Cold callee.\n");
Threshold = MinIfValid(Threshold, Params.ColdThreshold);
}
}
@@ -665,20 +691,10 @@ void CallAnalyzer::updateThreshold(CallSite CS, Function &Callee) {
bool CallAnalyzer::visitCmpInst(CmpInst &I) {
Value *LHS = I.getOperand(0), *RHS = I.getOperand(1);
// First try to handle simplified comparisons.
- if (!isa<Constant>(LHS))
- if (Constant *SimpleLHS = SimplifiedValues.lookup(LHS))
- LHS = SimpleLHS;
- if (!isa<Constant>(RHS))
- if (Constant *SimpleRHS = SimplifiedValues.lookup(RHS))
- RHS = SimpleRHS;
- if (Constant *CLHS = dyn_cast<Constant>(LHS)) {
- if (Constant *CRHS = dyn_cast<Constant>(RHS))
- if (Constant *C =
- ConstantExpr::getCompare(I.getPredicate(), CLHS, CRHS)) {
- SimplifiedValues[&I] = C;
- return true;
- }
- }
+ if (simplifyInstruction(I, [&](SmallVectorImpl<Constant *> &COps) {
+ return ConstantExpr::getCompare(I.getPredicate(), COps[0], COps[1]);
+ }))
+ return true;
if (I.getOpcode() == Instruction::FCmp)
return false;
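
All of these visitors now funnel through the simplifyInstruction template introduced above: collect a constant for every operand (directly or via the SimplifiedValues cache), hand them to a per-opcode Evaluate callable, and record the result. A toy reconstruction of the shape (the null-result check is elided; ints stand in for Value/Constant):

#include <cstdio>
#include <map>
#include <vector>

static std::map<int, long> SimplifiedValues; // value-id -> known constant

template <typename Callable>
bool simplifyToy(int InstId, const std::vector<int> &OpIds, Callable Evaluate) {
  std::vector<long> COps;
  for (int Op : OpIds) {
    auto It = SimplifiedValues.find(Op);
    if (It == SimplifiedValues.end())
      return false; // some operand isn't known-constant: give up
    COps.push_back(It->second);
  }
  SimplifiedValues[InstId] = Evaluate(COps); // cache for later users
  return true;
}

int main() {
  SimplifiedValues[1] = 6;
  SimplifiedValues[2] = 7;
  if (simplifyToy(3, {1, 2}, [](const std::vector<long> &C) {
        return C[0] * C[1]; // the "mul" evaluate callable
      }))
    std::printf("folded to %ld\n", SimplifiedValues[3]); // 42
}
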
@@ -756,24 +772,18 @@ bool CallAnalyzer::visitSub(BinaryOperator &I) {
bool CallAnalyzer::visitBinaryOperator(BinaryOperator &I) {
Value *LHS = I.getOperand(0), *RHS = I.getOperand(1);
- const DataLayout &DL = F.getParent()->getDataLayout();
- if (!isa<Constant>(LHS))
- if (Constant *SimpleLHS = SimplifiedValues.lookup(LHS))
- LHS = SimpleLHS;
- if (!isa<Constant>(RHS))
- if (Constant *SimpleRHS = SimplifiedValues.lookup(RHS))
- RHS = SimpleRHS;
- Value *SimpleV = nullptr;
- if (auto FI = dyn_cast<FPMathOperator>(&I))
- SimpleV =
- SimplifyFPBinOp(I.getOpcode(), LHS, RHS, FI->getFastMathFlags(), DL);
- else
- SimpleV = SimplifyBinOp(I.getOpcode(), LHS, RHS, DL);
+ auto Evaluate = [&](SmallVectorImpl<Constant *> &COps) {
+ Value *SimpleV = nullptr;
+ if (auto FI = dyn_cast<FPMathOperator>(&I))
+ SimpleV = SimplifyFPBinOp(I.getOpcode(), COps[0], COps[1],
+ FI->getFastMathFlags(), DL);
+ else
+ SimpleV = SimplifyBinOp(I.getOpcode(), COps[0], COps[1], DL);
+ return dyn_cast_or_null<Constant>(SimpleV);
+ };
- if (Constant *C = dyn_cast_or_null<Constant>(SimpleV)) {
- SimplifiedValues[&I] = C;
+ if (simplifyInstruction(I, Evaluate))
return true;
- }
// Disable any SROA on arguments to arbitrary, unsimplified binary operators.
disableSROA(LHS);
@@ -814,13 +824,10 @@ bool CallAnalyzer::visitStore(StoreInst &I) {
bool CallAnalyzer::visitExtractValue(ExtractValueInst &I) {
// Constant folding for extract value is trivial.
- Constant *C = dyn_cast<Constant>(I.getAggregateOperand());
- if (!C)
- C = SimplifiedValues.lookup(I.getAggregateOperand());
- if (C) {
- SimplifiedValues[&I] = ConstantExpr::getExtractValue(C, I.getIndices());
+ if (simplifyInstruction(I, [&](SmallVectorImpl<Constant *> &COps) {
+ return ConstantExpr::getExtractValue(COps[0], I.getIndices());
+ }))
return true;
- }
// SROA can look through these but give them a cost.
return false;
@@ -828,17 +835,12 @@ bool CallAnalyzer::visitExtractValue(ExtractValueInst &I) {
bool CallAnalyzer::visitInsertValue(InsertValueInst &I) {
// Constant folding for insert value is trivial.
- Constant *AggC = dyn_cast<Constant>(I.getAggregateOperand());
- if (!AggC)
- AggC = SimplifiedValues.lookup(I.getAggregateOperand());
- Constant *InsertedC = dyn_cast<Constant>(I.getInsertedValueOperand());
- if (!InsertedC)
- InsertedC = SimplifiedValues.lookup(I.getInsertedValueOperand());
- if (AggC && InsertedC) {
- SimplifiedValues[&I] =
- ConstantExpr::getInsertValue(AggC, InsertedC, I.getIndices());
+ if (simplifyInstruction(I, [&](SmallVectorImpl<Constant *> &COps) {
+ return ConstantExpr::getInsertValue(/*AggregateOperand*/ COps[0],
+ /*InsertedValueOperand*/ COps[1],
+ I.getIndices());
+ }))
return true;
- }
// SROA can look through these but give them a cost.
return false;
@@ -959,7 +961,8 @@ bool CallAnalyzer::visitCallSite(CallSite CS) {
// out. Pretend to inline the function, with a custom threshold.
auto IndirectCallParams = Params;
IndirectCallParams.DefaultThreshold = InlineConstants::IndirectCallThreshold;
- CallAnalyzer CA(TTI, GetAssumptionCache, PSI, *F, CS, IndirectCallParams);
+ CallAnalyzer CA(TTI, GetAssumptionCache, GetBFI, PSI, *F, CS,
+ IndirectCallParams);
if (CA.analyzeCall(CS)) {
// We were able to inline the indirect call! Subtract the cost from the
// threshold to get the bonus we want to apply, but don't go below zero.
@@ -1006,8 +1009,8 @@ bool CallAnalyzer::visitSwitchInst(SwitchInst &SI) {
// does not (yet) fire.
SmallPtrSet<BasicBlock *, 8> SuccessorBlocks;
SuccessorBlocks.insert(SI.getDefaultDest());
- for (auto I = SI.case_begin(), E = SI.case_end(); I != E; ++I)
- SuccessorBlocks.insert(I.getCaseSuccessor());
+ for (auto Case : SI.cases())
+ SuccessorBlocks.insert(Case.getCaseSuccessor());
// Add cost corresponding to the number of distinct destinations. The first
// we model as free because of fallthrough.
Cost += (SuccessorBlocks.size() - 1) * InlineConstants::InstrCost;
@@ -1098,19 +1101,10 @@ bool CallAnalyzer::analyzeBlock(BasicBlock *BB,
// is expensive or the function has the "use-soft-float" attribute, this may
// eventually become a library call. Treat the cost as such.
if (I->getType()->isFloatingPointTy()) {
- bool hasSoftFloatAttr = false;
-
// If the function has the "use-soft-float" attribute, mark it as
// expensive.
- if (F.hasFnAttribute("use-soft-float")) {
- Attribute Attr = F.getFnAttribute("use-soft-float");
- StringRef Val = Attr.getValueAsString();
- if (Val == "true")
- hasSoftFloatAttr = true;
- }
-
if (TTI.getFPOpCost(I->getType()) == TargetTransformInfo::TCC_Expensive ||
- hasSoftFloatAttr)
+ (F.getFnAttribute("use-soft-float").getValueAsString() == "true"))
Cost += InlineConstants::CallPenalty;
}
@@ -1155,7 +1149,6 @@ ConstantInt *CallAnalyzer::stripAndComputeInBoundsConstantOffsets(Value *&V) {
if (!V->getType()->isPointerTy())
return nullptr;
- const DataLayout &DL = F.getParent()->getDataLayout();
unsigned IntPtrWidth = DL.getPointerSizeInBits();
APInt Offset = APInt::getNullValue(IntPtrWidth);
@@ -1212,7 +1205,6 @@ bool CallAnalyzer::analyzeCall(CallSite CS) {
FiftyPercentVectorBonus = 3 * Threshold / 2;
TenPercentVectorBonus = 3 * Threshold / 4;
- const DataLayout &DL = F.getParent()->getDataLayout();
// Track whether the post-inlining function would have more than one basic
// block. A single basic block is often intended for inlining. Balloon the
@@ -1371,7 +1363,7 @@ bool CallAnalyzer::analyzeCall(CallSite CS) {
Value *Cond = SI->getCondition();
if (ConstantInt *SimpleCond =
dyn_cast_or_null<ConstantInt>(SimplifiedValues.lookup(Cond))) {
- BBWorklist.insert(SI->findCaseValue(SimpleCond).getCaseSuccessor());
+ BBWorklist.insert(SI->findCaseValue(SimpleCond)->getCaseSuccessor());
continue;
}
}
@@ -1430,13 +1422,6 @@ LLVM_DUMP_METHOD void CallAnalyzer::dump() {
}
#endif
-/// \brief Test that two functions either have or have not the given attribute
-/// at the same time.
-template <typename AttrKind>
-static bool attributeMatches(Function *F1, Function *F2, AttrKind Attr) {
- return F1->getFnAttribute(Attr) == F2->getFnAttribute(Attr);
-}
-
/// \brief Test that there are no attribute conflicts between Caller and Callee
/// that prevent inlining.
static bool functionsHaveCompatibleAttributes(Function *Caller,
@@ -1449,15 +1434,17 @@ static bool functionsHaveCompatibleAttributes(Function *Caller,
InlineCost llvm::getInlineCost(
CallSite CS, const InlineParams &Params, TargetTransformInfo &CalleeTTI,
std::function<AssumptionCache &(Function &)> &GetAssumptionCache,
+ Optional<function_ref<BlockFrequencyInfo &(Function &)>> GetBFI,
ProfileSummaryInfo *PSI) {
return getInlineCost(CS, CS.getCalledFunction(), Params, CalleeTTI,
- GetAssumptionCache, PSI);
+ GetAssumptionCache, GetBFI, PSI);
}
InlineCost llvm::getInlineCost(
CallSite CS, Function *Callee, const InlineParams &Params,
TargetTransformInfo &CalleeTTI,
std::function<AssumptionCache &(Function &)> &GetAssumptionCache,
+ Optional<function_ref<BlockFrequencyInfo &(Function &)>> GetBFI,
ProfileSummaryInfo *PSI) {
// Cannot inline indirect calls.
@@ -1492,7 +1479,8 @@ InlineCost llvm::getInlineCost(
DEBUG(llvm::dbgs() << " Analyzing call of " << Callee->getName()
<< "...\n");
- CallAnalyzer CA(CalleeTTI, GetAssumptionCache, PSI, *Callee, CS, Params);
+ CallAnalyzer CA(CalleeTTI, GetAssumptionCache, GetBFI, PSI, *Callee, CS,
+ Params);
bool ShouldInline = CA.analyzeCall(CS);
DEBUG(CA.dump());
@@ -1565,6 +1553,9 @@ InlineParams llvm::getInlineParams(int Threshold) {
// Set the HotCallSiteThreshold knob from the -hot-callsite-threshold.
Params.HotCallSiteThreshold = HotCallSiteThreshold;
+ // Set the ColdCallSiteThreshold knob from the -inline-cold-callsite-threshold.
+ Params.ColdCallSiteThreshold = ColdCallSiteThreshold;
+
  // Set the OptMinSizeThreshold and OptSizeThreshold params only if the
// -inlinehint-threshold commandline option is not explicitly given. If that
diff --git a/lib/Analysis/InstructionSimplify.cpp b/lib/Analysis/InstructionSimplify.cpp
index 796e6e444980..e12f640394e6 100644
--- a/lib/Analysis/InstructionSimplify.cpp
+++ b/lib/Analysis/InstructionSimplify.cpp
@@ -24,6 +24,7 @@
#include "llvm/Analysis/CaptureTracking.h"
#include "llvm/Analysis/ConstantFolding.h"
#include "llvm/Analysis/MemoryBuiltins.h"
+#include "llvm/Analysis/OptimizationDiagnosticInfo.h"
#include "llvm/Analysis/ValueTracking.h"
#include "llvm/Analysis/VectorUtils.h"
#include "llvm/IR/ConstantRange.h"
@@ -140,10 +141,9 @@ static bool ValueDominatesPHI(Value *V, PHINode *P, const DominatorTree *DT) {
/// given by OpcodeToExpand, while "A" corresponds to LHS and "B op' C" to RHS.
/// Also performs the transform "(A op' B) op C" -> "(A op C) op' (B op C)".
/// Returns the simplified value, or null if no simplification was performed.
-static Value *ExpandBinOp(unsigned Opcode, Value *LHS, Value *RHS,
- unsigned OpcToExpand, const Query &Q,
+static Value *ExpandBinOp(Instruction::BinaryOps Opcode, Value *LHS, Value *RHS,
+ Instruction::BinaryOps OpcodeToExpand, const Query &Q,
unsigned MaxRecurse) {
- Instruction::BinaryOps OpcodeToExpand = (Instruction::BinaryOps)OpcToExpand;
// Recursion is always used, so bail out at once if we already hit the limit.
if (!MaxRecurse--)
return nullptr;
@@ -199,9 +199,9 @@ static Value *ExpandBinOp(unsigned Opcode, Value *LHS, Value *RHS,
/// Generic simplifications for associative binary operations.
/// Returns the simpler value, or null if none was found.
-static Value *SimplifyAssociativeBinOp(unsigned Opc, Value *LHS, Value *RHS,
- const Query &Q, unsigned MaxRecurse) {
- Instruction::BinaryOps Opcode = (Instruction::BinaryOps)Opc;
+static Value *SimplifyAssociativeBinOp(Instruction::BinaryOps Opcode,
+ Value *LHS, Value *RHS, const Query &Q,
+ unsigned MaxRecurse) {
assert(Instruction::isAssociative(Opcode) && "Not an associative operation!");
// Recursion is always used, so bail out at once if we already hit the limit.
@@ -298,8 +298,9 @@ static Value *SimplifyAssociativeBinOp(unsigned Opc, Value *LHS, Value *RHS,
/// try to simplify the binop by seeing whether evaluating it on both branches
/// of the select results in the same value. Returns the common value if so,
/// otherwise returns null.
-static Value *ThreadBinOpOverSelect(unsigned Opcode, Value *LHS, Value *RHS,
- const Query &Q, unsigned MaxRecurse) {
+static Value *ThreadBinOpOverSelect(Instruction::BinaryOps Opcode, Value *LHS,
+ Value *RHS, const Query &Q,
+ unsigned MaxRecurse) {
// Recursion is always used, so bail out at once if we already hit the limit.
if (!MaxRecurse--)
return nullptr;
@@ -451,8 +452,9 @@ static Value *ThreadCmpOverSelect(CmpInst::Predicate Pred, Value *LHS,
/// try to simplify the binop by seeing whether evaluating it on the incoming
/// phi values yields the same result for every value. If so returns the common
/// value, otherwise returns null.
-static Value *ThreadBinOpOverPHI(unsigned Opcode, Value *LHS, Value *RHS,
- const Query &Q, unsigned MaxRecurse) {
+static Value *ThreadBinOpOverPHI(Instruction::BinaryOps Opcode, Value *LHS,
+ Value *RHS, const Query &Q,
+ unsigned MaxRecurse) {
// Recursion is always used, so bail out at once if we already hit the limit.
if (!MaxRecurse--)
return nullptr;
@@ -527,17 +529,26 @@ static Value *ThreadCmpOverPHI(CmpInst::Predicate Pred, Value *LHS, Value *RHS,
return CommonValue;
}
+static Constant *foldOrCommuteConstant(Instruction::BinaryOps Opcode,
+ Value *&Op0, Value *&Op1,
+ const Query &Q) {
+ if (auto *CLHS = dyn_cast<Constant>(Op0)) {
+ if (auto *CRHS = dyn_cast<Constant>(Op1))
+ return ConstantFoldBinaryOpOperands(Opcode, CLHS, CRHS, Q.DL);
+
+ // Canonicalize the constant to the RHS if this is a commutative operation.
+ if (Instruction::isCommutative(Opcode))
+ std::swap(Op0, Op1);
+ }
+ return nullptr;
+}
+
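
foldOrCommuteConstant also centralizes the canonicalization half of the old boilerplate: when only one operand is constant and the opcode commutes, the constant moves to the RHS so every later pattern only has to inspect Op1. A toy sketch of that shape:

#include <cstdio>
#include <utility>

struct Opnd {
  bool IsConst;
  long Val;
};

// Fold if both operands are constant (addition stands in for the real
// constant folder); otherwise move a lone constant rightward when the
// operation commutes. Returns true only when a fold happened.
static bool foldOrCommute(bool Commutative, Opnd &Op0, Opnd &Op1, long &Out) {
  if (Op0.IsConst) {
    if (Op1.IsConst) {
      Out = Op0.Val + Op1.Val;
      return true;
    }
    if (Commutative)
      std::swap(Op0, Op1); // canonicalize: constant goes to the RHS
  }
  return false;
}

int main() {
  Opnd A{true, 5}, B{false, 0};
  long Out;
  if (!foldOrCommute(/*Commutative=*/true, A, B, Out))
    std::printf("constant now on RHS: %s\n", B.IsConst ? "yes" : "no");
}
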
/// Given operands for an Add, see if we can fold the result.
/// If not, this returns null.
static Value *SimplifyAddInst(Value *Op0, Value *Op1, bool isNSW, bool isNUW,
const Query &Q, unsigned MaxRecurse) {
- if (Constant *CLHS = dyn_cast<Constant>(Op0)) {
- if (Constant *CRHS = dyn_cast<Constant>(Op1))
- return ConstantFoldBinaryOpOperands(Instruction::Add, CLHS, CRHS, Q.DL);
-
- // Canonicalize the constant to the RHS.
- std::swap(Op0, Op1);
- }
+ if (Constant *C = foldOrCommuteConstant(Instruction::Add, Op0, Op1, Q))
+ return C;
// X + undef -> undef
if (match(Op1, m_Undef()))
@@ -556,12 +567,20 @@ static Value *SimplifyAddInst(Value *Op0, Value *Op1, bool isNSW, bool isNUW,
return Y;
// X + ~X -> -1 since ~X = -X-1
+ Type *Ty = Op0->getType();
if (match(Op0, m_Not(m_Specific(Op1))) ||
match(Op1, m_Not(m_Specific(Op0))))
- return Constant::getAllOnesValue(Op0->getType());
+ return Constant::getAllOnesValue(Ty);
+
+ // add nsw/nuw (xor Y, signbit), signbit --> Y
+ // The no-wrapping add guarantees that the top bit will be set by the add.
+ // Therefore, the xor must be clearing the already set sign bit of Y.
+ if ((isNSW || isNUW) && match(Op1, m_SignBit()) &&
+ match(Op0, m_Xor(m_Value(Y), m_SignBit())))
+ return Y;
/// i1 add -> xor.
- if (MaxRecurse && Op0->getType()->isIntegerTy(1))
+ if (MaxRecurse && Op0->getType()->getScalarType()->isIntegerTy(1))
if (Value *V = SimplifyXorInst(Op0, Op1, Q, MaxRecurse-1))
return V;
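
A sanity check on the new signbit fold: flipping the sign bit is the same as adding it modulo 2^W, so (Y ^ SignBit) + SignBit == Y + 2*SignBit == Y for every W-bit Y, and the nsw/nuw requirement then guarantees (per the comment) that the add really set a bit the xor had cleared. Exhaustive 8-bit verification:

#include <cstdio>

int main() {
  for (unsigned Y = 0; Y < 256; ++Y) {
    unsigned R = ((Y ^ 0x80u) + 0x80u) & 0xFFu; // 8-bit (xor SB) + SB
    if (R != Y) {
      std::printf("mismatch at Y=%u\n", Y);
      return 1;
    }
  }
  std::puts("(Y ^ 0x80) + 0x80 == Y for all 8-bit Y");
}
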
@@ -665,9 +684,8 @@ static Constant *computePointerDifference(const DataLayout &DL, Value *LHS,
/// If not, this returns null.
static Value *SimplifySubInst(Value *Op0, Value *Op1, bool isNSW, bool isNUW,
const Query &Q, unsigned MaxRecurse) {
- if (Constant *CLHS = dyn_cast<Constant>(Op0))
- if (Constant *CRHS = dyn_cast<Constant>(Op1))
- return ConstantFoldBinaryOpOperands(Instruction::Sub, CLHS, CRHS, Q.DL);
+ if (Constant *C = foldOrCommuteConstant(Instruction::Sub, Op0, Op1, Q))
+ return C;
// X - undef -> undef
// undef - X -> undef
@@ -692,7 +710,7 @@ static Value *SimplifySubInst(Value *Op0, Value *Op1, bool isNSW, bool isNUW,
APInt KnownZero(BitWidth, 0);
APInt KnownOne(BitWidth, 0);
computeKnownBits(Op1, KnownZero, KnownOne, Q.DL, 0, Q.AC, Q.CxtI, Q.DT);
- if (KnownZero == ~APInt::getSignBit(BitWidth)) {
+ if (KnownZero.isMaxSignedValue()) {
// Op1 is either 0 or the minimum signed value. If the sub is NSW, then
// Op1 must be 0 because negating the minimum signed value is undefined.
if (isNSW)
@@ -779,7 +797,7 @@ static Value *SimplifySubInst(Value *Op0, Value *Op1, bool isNSW, bool isNUW,
return ConstantExpr::getIntegerCast(Result, Op0->getType(), true);
// i1 sub -> xor.
- if (MaxRecurse && Op0->getType()->isIntegerTy(1))
+ if (MaxRecurse && Op0->getType()->getScalarType()->isIntegerTy(1))
if (Value *V = SimplifyXorInst(Op0, Op1, Q, MaxRecurse-1))
return V;
@@ -807,13 +825,8 @@ Value *llvm::SimplifySubInst(Value *Op0, Value *Op1, bool isNSW, bool isNUW,
/// returns null.
static Value *SimplifyFAddInst(Value *Op0, Value *Op1, FastMathFlags FMF,
const Query &Q, unsigned MaxRecurse) {
- if (Constant *CLHS = dyn_cast<Constant>(Op0)) {
- if (Constant *CRHS = dyn_cast<Constant>(Op1))
- return ConstantFoldBinaryOpOperands(Instruction::FAdd, CLHS, CRHS, Q.DL);
-
- // Canonicalize the constant to the RHS.
- std::swap(Op0, Op1);
- }
+ if (Constant *C = foldOrCommuteConstant(Instruction::FAdd, Op0, Op1, Q))
+ return C;
// fadd X, -0 ==> X
if (match(Op1, m_NegZero()))
@@ -846,10 +859,8 @@ static Value *SimplifyFAddInst(Value *Op0, Value *Op1, FastMathFlags FMF,
/// returns null.
static Value *SimplifyFSubInst(Value *Op0, Value *Op1, FastMathFlags FMF,
const Query &Q, unsigned MaxRecurse) {
- if (Constant *CLHS = dyn_cast<Constant>(Op0)) {
- if (Constant *CRHS = dyn_cast<Constant>(Op1))
- return ConstantFoldBinaryOpOperands(Instruction::FSub, CLHS, CRHS, Q.DL);
- }
+ if (Constant *C = foldOrCommuteConstant(Instruction::FSub, Op0, Op1, Q))
+ return C;
// fsub X, 0 ==> X
if (match(Op1, m_Zero()))
@@ -878,40 +889,28 @@ static Value *SimplifyFSubInst(Value *Op0, Value *Op1, FastMathFlags FMF,
}
/// Given the operands for an FMul, see if we can fold the result
-static Value *SimplifyFMulInst(Value *Op0, Value *Op1,
- FastMathFlags FMF,
- const Query &Q,
- unsigned MaxRecurse) {
- if (Constant *CLHS = dyn_cast<Constant>(Op0)) {
- if (Constant *CRHS = dyn_cast<Constant>(Op1))
- return ConstantFoldBinaryOpOperands(Instruction::FMul, CLHS, CRHS, Q.DL);
-
- // Canonicalize the constant to the RHS.
- std::swap(Op0, Op1);
- }
+static Value *SimplifyFMulInst(Value *Op0, Value *Op1, FastMathFlags FMF,
+ const Query &Q, unsigned MaxRecurse) {
+ if (Constant *C = foldOrCommuteConstant(Instruction::FMul, Op0, Op1, Q))
+ return C;
- // fmul X, 1.0 ==> X
- if (match(Op1, m_FPOne()))
- return Op0;
+ // fmul X, 1.0 ==> X
+ if (match(Op1, m_FPOne()))
+ return Op0;
- // fmul nnan nsz X, 0 ==> 0
- if (FMF.noNaNs() && FMF.noSignedZeros() && match(Op1, m_AnyZero()))
- return Op1;
+ // fmul nnan nsz X, 0 ==> 0
+ if (FMF.noNaNs() && FMF.noSignedZeros() && match(Op1, m_AnyZero()))
+ return Op1;
- return nullptr;
+ return nullptr;
}
/// Given operands for a Mul, see if we can fold the result.
/// If not, this returns null.
static Value *SimplifyMulInst(Value *Op0, Value *Op1, const Query &Q,
unsigned MaxRecurse) {
- if (Constant *CLHS = dyn_cast<Constant>(Op0)) {
- if (Constant *CRHS = dyn_cast<Constant>(Op1))
- return ConstantFoldBinaryOpOperands(Instruction::Mul, CLHS, CRHS, Q.DL);
-
- // Canonicalize the constant to the RHS.
- std::swap(Op0, Op1);
- }
+ if (Constant *C = foldOrCommuteConstant(Instruction::Mul, Op0, Op1, Q))
+ return C;
// X * undef -> 0
if (match(Op1, m_Undef()))
@@ -932,7 +931,7 @@ static Value *SimplifyMulInst(Value *Op0, Value *Op1, const Query &Q,
return X;
// i1 mul -> and.
- if (MaxRecurse && Op0->getType()->isIntegerTy(1))
+ if (MaxRecurse && Op0->getType()->getScalarType()->isIntegerTy(1))
if (Value *V = SimplifyAndInst(Op0, Op1, Q, MaxRecurse-1))
return V;
@@ -998,43 +997,68 @@ Value *llvm::SimplifyMulInst(Value *Op0, Value *Op1, const DataLayout &DL,
RecursionLimit);
}
-/// Given operands for an SDiv or UDiv, see if we can fold the result.
-/// If not, this returns null.
-static Value *SimplifyDiv(Instruction::BinaryOps Opcode, Value *Op0, Value *Op1,
- const Query &Q, unsigned MaxRecurse) {
- if (Constant *C0 = dyn_cast<Constant>(Op0))
- if (Constant *C1 = dyn_cast<Constant>(Op1))
- return ConstantFoldBinaryOpOperands(Opcode, C0, C1, Q.DL);
-
- bool isSigned = Opcode == Instruction::SDiv;
+/// Check for common or similar folds of integer division or integer remainder.
+static Value *simplifyDivRem(Value *Op0, Value *Op1, bool IsDiv) {
+ Type *Ty = Op0->getType();
// X / undef -> undef
+ // X % undef -> undef
if (match(Op1, m_Undef()))
return Op1;
- // X / 0 -> undef, we don't need to preserve faults!
+ // X / 0 -> undef
+ // X % 0 -> undef
+ // We don't need to preserve faults!
if (match(Op1, m_Zero()))
- return UndefValue::get(Op1->getType());
+ return UndefValue::get(Ty);
+
+ // If any element of a constant divisor vector is zero, the whole op is undef.
+ auto *Op1C = dyn_cast<Constant>(Op1);
+ if (Op1C && Ty->isVectorTy()) {
+ unsigned NumElts = Ty->getVectorNumElements();
+ for (unsigned i = 0; i != NumElts; ++i) {
+ Constant *Elt = Op1C->getAggregateElement(i);
+ if (Elt && Elt->isNullValue())
+ return UndefValue::get(Ty);
+ }
+ }
// undef / X -> 0
+ // undef % X -> 0
if (match(Op0, m_Undef()))
- return Constant::getNullValue(Op0->getType());
+ return Constant::getNullValue(Ty);
- // 0 / X -> 0, we don't need to preserve faults!
+ // 0 / X -> 0
+ // 0 % X -> 0
if (match(Op0, m_Zero()))
return Op0;
+ // X / X -> 1
+ // X % X -> 0
+ if (Op0 == Op1)
+ return IsDiv ? ConstantInt::get(Ty, 1) : Constant::getNullValue(Ty);
+
// X / 1 -> X
- if (match(Op1, m_One()))
- return Op0;
+ // X % 1 -> 0
+ // If this is a boolean op (single-bit element type), we can't have
+ // division-by-zero or remainder-by-zero, so assume the divisor is 1.
+ if (match(Op1, m_One()) || Ty->getScalarType()->isIntegerTy(1))
+ return IsDiv ? Op0 : Constant::getNullValue(Ty);
- if (Op0->getType()->isIntegerTy(1))
- // It can't be division by zero, hence it must be division by one.
- return Op0;
+ return nullptr;
+}
- // X / X -> 1
- if (Op0 == Op1)
- return ConstantInt::get(Op0->getType(), 1);
+/// Given operands for an SDiv or UDiv, see if we can fold the result.
+/// If not, this returns null.
+static Value *SimplifyDiv(Instruction::BinaryOps Opcode, Value *Op0, Value *Op1,
+ const Query &Q, unsigned MaxRecurse) {
+ if (Constant *C = foldOrCommuteConstant(Opcode, Op0, Op1, Q))
+ return C;
+
+ if (Value *V = simplifyDivRem(Op0, Op1, true))
+ return V;
+
+ bool isSigned = Opcode == Instruction::SDiv;
// (X * Y) / Y -> X if the multiplication does not overflow.
Value *X = nullptr, *Y = nullptr;
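
The boolean case in simplifyDivRem deserves the one-line justification it hints at: a 1-bit divisor is either 0 or 1, and dividing by 0 already yields undef, so in every defined execution the divisor is 1, giving X udiv Y == X and X urem Y == 0. Enumerated:

#include <cstdio>

int main() {
  // A 1-bit divisor with defined behavior can only be 1.
  for (int X = 0; X <= 1; ++X) {
    int Y = 1;
    std::printf("X=%d: X/Y=%d (==X), X%%Y=%d (==0)\n", X, X / Y, X % Y);
  }
}
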
@@ -1129,6 +1153,9 @@ Value *llvm::SimplifyUDivInst(Value *Op0, Value *Op1, const DataLayout &DL,
static Value *SimplifyFDivInst(Value *Op0, Value *Op1, FastMathFlags FMF,
const Query &Q, unsigned) {
+ if (Constant *C = foldOrCommuteConstant(Instruction::FDiv, Op0, Op1, Q))
+ return C;
+
// undef / X -> undef (the undef could be a snan).
if (match(Op0, m_Undef()))
return Op0;
@@ -1178,37 +1205,11 @@ Value *llvm::SimplifyFDivInst(Value *Op0, Value *Op1, FastMathFlags FMF,
/// If not, this returns null.
static Value *SimplifyRem(Instruction::BinaryOps Opcode, Value *Op0, Value *Op1,
const Query &Q, unsigned MaxRecurse) {
- if (Constant *C0 = dyn_cast<Constant>(Op0))
- if (Constant *C1 = dyn_cast<Constant>(Op1))
- return ConstantFoldBinaryOpOperands(Opcode, C0, C1, Q.DL);
-
- // X % undef -> undef
- if (match(Op1, m_Undef()))
- return Op1;
-
- // undef % X -> 0
- if (match(Op0, m_Undef()))
- return Constant::getNullValue(Op0->getType());
-
- // 0 % X -> 0, we don't need to preserve faults!
- if (match(Op0, m_Zero()))
- return Op0;
+ if (Constant *C = foldOrCommuteConstant(Opcode, Op0, Op1, Q))
+ return C;
- // X % 0 -> undef, we don't need to preserve faults!
- if (match(Op1, m_Zero()))
- return UndefValue::get(Op0->getType());
-
- // X % 1 -> 0
- if (match(Op1, m_One()))
- return Constant::getNullValue(Op0->getType());
-
- if (Op0->getType()->isIntegerTy(1))
- // It can't be remainder by zero, hence it must be remainder by one.
- return Constant::getNullValue(Op0->getType());
-
- // X % X -> 0
- if (Op0 == Op1)
- return Constant::getNullValue(Op0->getType());
+ if (Value *V = simplifyDivRem(Op0, Op1, false))
+ return V;
// (X % Y) % Y -> X % Y
if ((Opcode == Instruction::SRem &&
@@ -1279,7 +1280,10 @@ Value *llvm::SimplifyURemInst(Value *Op0, Value *Op1, const DataLayout &DL,
}
static Value *SimplifyFRemInst(Value *Op0, Value *Op1, FastMathFlags FMF,
- const Query &, unsigned) {
+ const Query &Q, unsigned) {
+ if (Constant *C = foldOrCommuteConstant(Instruction::FRem, Op0, Op1, Q))
+ return C;
+
// undef % X -> undef (the undef could be a snan).
if (match(Op0, m_Undef()))
return Op0;
@@ -1335,11 +1339,10 @@ static bool isUndefShift(Value *Amount) {
/// Given operands for an Shl, LShr or AShr, see if we can fold the result.
/// If not, this returns null.
-static Value *SimplifyShift(unsigned Opcode, Value *Op0, Value *Op1,
- const Query &Q, unsigned MaxRecurse) {
- if (Constant *C0 = dyn_cast<Constant>(Op0))
- if (Constant *C1 = dyn_cast<Constant>(Op1))
- return ConstantFoldBinaryOpOperands(Opcode, C0, C1, Q.DL);
+static Value *SimplifyShift(Instruction::BinaryOps Opcode, Value *Op0,
+ Value *Op1, const Query &Q, unsigned MaxRecurse) {
+ if (Constant *C = foldOrCommuteConstant(Opcode, Op0, Op1, Q))
+ return C;
// 0 shift by X -> 0
if (match(Op0, m_Zero()))
@@ -1386,8 +1389,8 @@ static Value *SimplifyShift(unsigned Opcode, Value *Op0, Value *Op1,
/// \brief Given operands for an Shl, LShr or AShr, see if we can
/// fold the result. If not, this returns null.
-static Value *SimplifyRightShift(unsigned Opcode, Value *Op0, Value *Op1,
- bool isExact, const Query &Q,
+static Value *SimplifyRightShift(Instruction::BinaryOps Opcode, Value *Op0,
+ Value *Op1, bool isExact, const Query &Q,
unsigned MaxRecurse) {
if (Value *V = SimplifyShift(Opcode, Op0, Op1, Q, MaxRecurse))
return V;
@@ -1636,13 +1639,8 @@ static Value *SimplifyAndOfICmps(ICmpInst *Op0, ICmpInst *Op1) {
/// If not, this returns null.
static Value *SimplifyAndInst(Value *Op0, Value *Op1, const Query &Q,
unsigned MaxRecurse) {
- if (Constant *CLHS = dyn_cast<Constant>(Op0)) {
- if (Constant *CRHS = dyn_cast<Constant>(Op1))
- return ConstantFoldBinaryOpOperands(Instruction::And, CLHS, CRHS, Q.DL);
-
- // Canonicalize the constant to the RHS.
- std::swap(Op0, Op1);
- }
+ if (Constant *C = foldOrCommuteConstant(Instruction::And, Op0, Op1, Q))
+ return C;
// X & undef -> 0
if (match(Op1, m_Undef()))
@@ -1838,13 +1836,8 @@ static Value *SimplifyOrOfICmps(ICmpInst *Op0, ICmpInst *Op1) {
/// If not, this returns null.
static Value *SimplifyOrInst(Value *Op0, Value *Op1, const Query &Q,
unsigned MaxRecurse) {
- if (Constant *CLHS = dyn_cast<Constant>(Op0)) {
- if (Constant *CRHS = dyn_cast<Constant>(Op1))
- return ConstantFoldBinaryOpOperands(Instruction::Or, CLHS, CRHS, Q.DL);
-
- // Canonicalize the constant to the RHS.
- std::swap(Op0, Op1);
- }
+ if (Constant *C = foldOrCommuteConstant(Instruction::Or, Op0, Op1, Q))
+ return C;
// X | undef -> -1
if (match(Op1, m_Undef()))
@@ -1971,13 +1964,8 @@ Value *llvm::SimplifyOrInst(Value *Op0, Value *Op1, const DataLayout &DL,
/// If not, this returns null.
static Value *SimplifyXorInst(Value *Op0, Value *Op1, const Query &Q,
unsigned MaxRecurse) {
- if (Constant *CLHS = dyn_cast<Constant>(Op0)) {
- if (Constant *CRHS = dyn_cast<Constant>(Op1))
- return ConstantFoldBinaryOpOperands(Instruction::Xor, CLHS, CRHS, Q.DL);
-
- // Canonicalize the constant to the RHS.
- std::swap(Op0, Op1);
- }
+ if (Constant *C = foldOrCommuteConstant(Instruction::Xor, Op0, Op1, Q))
+ return C;
// A ^ undef -> undef
if (match(Op1, m_Undef()))
@@ -2377,6 +2365,163 @@ static Value *simplifyICmpWithZero(CmpInst::Predicate Pred, Value *LHS,
return nullptr;
}
+/// Many binary operators with a constant operand have an easy-to-compute
+/// range of outputs. This can be used to fold a comparison to always true or
+/// always false.
+static void setLimitsForBinOp(BinaryOperator &BO, APInt &Lower, APInt &Upper) {
+ unsigned Width = Lower.getBitWidth();
+ const APInt *C;
+ switch (BO.getOpcode()) {
+ case Instruction::Add:
+ if (match(BO.getOperand(1), m_APInt(C)) && *C != 0) {
+ // FIXME: If we have both nuw and nsw, we should reduce the range further.
+ if (BO.hasNoUnsignedWrap()) {
+ // 'add nuw x, C' produces [C, UINT_MAX].
+ Lower = *C;
+ } else if (BO.hasNoSignedWrap()) {
+ if (C->isNegative()) {
+ // 'add nsw x, -C' produces [SINT_MIN, SINT_MAX - C].
+ Lower = APInt::getSignedMinValue(Width);
+ Upper = APInt::getSignedMaxValue(Width) + *C + 1;
+ } else {
+ // 'add nsw x, +C' produces [SINT_MIN + C, SINT_MAX].
+ Lower = APInt::getSignedMinValue(Width) + *C;
+ Upper = APInt::getSignedMaxValue(Width) + 1;
+ }
+ }
+ }
+ break;
+
+ case Instruction::And:
+ if (match(BO.getOperand(1), m_APInt(C)))
+ // 'and x, C' produces [0, C].
+ Upper = *C + 1;
+ break;
+
+ case Instruction::Or:
+ if (match(BO.getOperand(1), m_APInt(C)))
+ // 'or x, C' produces [C, UINT_MAX].
+ Lower = *C;
+ break;
+
+ case Instruction::AShr:
+ if (match(BO.getOperand(1), m_APInt(C)) && C->ult(Width)) {
+ // 'ashr x, C' produces [INT_MIN >> C, INT_MAX >> C].
+ Lower = APInt::getSignedMinValue(Width).ashr(*C);
+ Upper = APInt::getSignedMaxValue(Width).ashr(*C) + 1;
+ } else if (match(BO.getOperand(0), m_APInt(C))) {
+ unsigned ShiftAmount = Width - 1;
+ if (*C != 0 && BO.isExact())
+ ShiftAmount = C->countTrailingZeros();
+ if (C->isNegative()) {
+ // 'ashr C, x' produces [C, C >> (Width-1)]
+ Lower = *C;
+ Upper = C->ashr(ShiftAmount) + 1;
+ } else {
+ // 'ashr C, x' produces [C >> (Width-1), C]
+ Lower = C->ashr(ShiftAmount);
+ Upper = *C + 1;
+ }
+ }
+ break;
+
+ case Instruction::LShr:
+ if (match(BO.getOperand(1), m_APInt(C)) && C->ult(Width)) {
+ // 'lshr x, C' produces [0, UINT_MAX >> C].
+ Upper = APInt::getAllOnesValue(Width).lshr(*C) + 1;
+ } else if (match(BO.getOperand(0), m_APInt(C))) {
+ // 'lshr C, x' produces [C >> (Width-1), C].
+ unsigned ShiftAmount = Width - 1;
+ if (*C != 0 && BO.isExact())
+ ShiftAmount = C->countTrailingZeros();
+ Lower = C->lshr(ShiftAmount);
+ Upper = *C + 1;
+ }
+ break;
+
+ case Instruction::Shl:
+ if (match(BO.getOperand(0), m_APInt(C))) {
+ if (BO.hasNoUnsignedWrap()) {
+ // 'shl nuw C, x' produces [C, C << CLZ(C)]
+ Lower = *C;
+ Upper = Lower.shl(Lower.countLeadingZeros()) + 1;
+ } else if (BO.hasNoSignedWrap()) { // TODO: What if both nuw+nsw?
+ if (C->isNegative()) {
+ // 'shl nsw C, x' produces [C << CLO(C)-1, C]
+ unsigned ShiftAmount = C->countLeadingOnes() - 1;
+ Lower = C->shl(ShiftAmount);
+ Upper = *C + 1;
+ } else {
+ // 'shl nsw C, x' produces [C, C << CLZ(C)-1]
+ unsigned ShiftAmount = C->countLeadingZeros() - 1;
+ Lower = *C;
+ Upper = C->shl(ShiftAmount) + 1;
+ }
+ }
+ }
+ break;
+
+ case Instruction::SDiv:
+ if (match(BO.getOperand(1), m_APInt(C))) {
+ APInt IntMin = APInt::getSignedMinValue(Width);
+ APInt IntMax = APInt::getSignedMaxValue(Width);
+ if (C->isAllOnesValue()) {
+ // 'sdiv x, -1' produces [INT_MIN + 1, INT_MAX]
+ // (INT_MIN itself is excluded because INT_MIN / -1 overflows).
+ Lower = IntMin + 1;
+ Upper = IntMax + 1;
+ } else if (C->countLeadingZeros() < Width - 1) {
+ // 'sdiv x, C' produces [INT_MIN / C, INT_MAX / C]
+ // where C != -1 and C != 0 and C != 1
+ Lower = IntMin.sdiv(*C);
+ Upper = IntMax.sdiv(*C);
+ if (Lower.sgt(Upper))
+ std::swap(Lower, Upper);
+ Upper = Upper + 1;
+ assert(Upper != Lower && "Upper part of range has wrapped!");
+ }
+ } else if (match(BO.getOperand(0), m_APInt(C))) {
+ if (C->isMinSignedValue()) {
+ // 'sdiv INT_MIN, x' produces [INT_MIN, INT_MIN / -2].
+ Lower = *C;
+ Upper = Lower.lshr(1) + 1;
+ } else {
+ // 'sdiv C, x' produces [-|C|, |C|].
+ Upper = C->abs() + 1;
+ Lower = (-Upper) + 1;
+ }
+ }
+ break;
+
+ case Instruction::UDiv:
+ if (match(BO.getOperand(1), m_APInt(C)) && *C != 0) {
+ // 'udiv x, C' produces [0, UINT_MAX / C].
+ Upper = APInt::getMaxValue(Width).udiv(*C) + 1;
+ } else if (match(BO.getOperand(0), m_APInt(C))) {
+ // 'udiv C, x' produces [0, C].
+ Upper = *C + 1;
+ }
+ break;
+
+ case Instruction::SRem:
+ if (match(BO.getOperand(1), m_APInt(C))) {
+ // 'srem x, C' produces (-|C|, |C|).
+ Upper = C->abs();
+ Lower = (-Upper) + 1;
+ }
+ break;
+
+ case Instruction::URem:
+ if (match(BO.getOperand(1), m_APInt(C)))
+ // 'urem x, C' produces [0, C).
+ Upper = *C;
+ break;
+
+ default:
+ break;
+ }
+}
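+
+// For illustration (hypothetical IR), given:
+//   %r = urem i32 %x, 42
+//   %c = icmp ult i32 %r, 42
+// the URem case above yields the range [0, 42) for %r, which is contained in
+// the range where 'ult 42' holds, so the compare folds to 'true' below.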
+
static Value *simplifyICmpWithConstant(CmpInst::Predicate Pred, Value *LHS,
Value *RHS) {
const APInt *C;
@@ -2390,114 +2535,12 @@ static Value *simplifyICmpWithConstant(CmpInst::Predicate Pred, Value *LHS,
if (RHS_CR.isFullSet())
return ConstantInt::getTrue(GetCompareTy(RHS));
- // Many binary operators with constant RHS have easy to compute constant
- // range. Use them to check whether the comparison is a tautology.
+ // Compute the range of possible values for the LHS when it is a binary
+ // operator with a constant operand.
unsigned Width = C->getBitWidth();
APInt Lower = APInt(Width, 0);
APInt Upper = APInt(Width, 0);
- const APInt *C2;
- if (match(LHS, m_URem(m_Value(), m_APInt(C2)))) {
- // 'urem x, C2' produces [0, C2).
- Upper = *C2;
- } else if (match(LHS, m_SRem(m_Value(), m_APInt(C2)))) {
- // 'srem x, C2' produces (-|C2|, |C2|).
- Upper = C2->abs();
- Lower = (-Upper) + 1;
- } else if (match(LHS, m_UDiv(m_APInt(C2), m_Value()))) {
- // 'udiv C2, x' produces [0, C2].
- Upper = *C2 + 1;
- } else if (match(LHS, m_UDiv(m_Value(), m_APInt(C2)))) {
- // 'udiv x, C2' produces [0, UINT_MAX / C2].
- APInt NegOne = APInt::getAllOnesValue(Width);
- if (*C2 != 0)
- Upper = NegOne.udiv(*C2) + 1;
- } else if (match(LHS, m_SDiv(m_APInt(C2), m_Value()))) {
- if (C2->isMinSignedValue()) {
- // 'sdiv INT_MIN, x' produces [INT_MIN, INT_MIN / -2].
- Lower = *C2;
- Upper = Lower.lshr(1) + 1;
- } else {
- // 'sdiv C2, x' produces [-|C2|, |C2|].
- Upper = C2->abs() + 1;
- Lower = (-Upper) + 1;
- }
- } else if (match(LHS, m_SDiv(m_Value(), m_APInt(C2)))) {
- APInt IntMin = APInt::getSignedMinValue(Width);
- APInt IntMax = APInt::getSignedMaxValue(Width);
- if (C2->isAllOnesValue()) {
- // 'sdiv x, -1' produces [INT_MIN + 1, INT_MAX]
- // where C2 != -1 and C2 != 0 and C2 != 1
- Lower = IntMin + 1;
- Upper = IntMax + 1;
- } else if (C2->countLeadingZeros() < Width - 1) {
- // 'sdiv x, C2' produces [INT_MIN / C2, INT_MAX / C2]
- // where C2 != -1 and C2 != 0 and C2 != 1
- Lower = IntMin.sdiv(*C2);
- Upper = IntMax.sdiv(*C2);
- if (Lower.sgt(Upper))
- std::swap(Lower, Upper);
- Upper = Upper + 1;
- assert(Upper != Lower && "Upper part of range has wrapped!");
- }
- } else if (match(LHS, m_NUWShl(m_APInt(C2), m_Value()))) {
- // 'shl nuw C2, x' produces [C2, C2 << CLZ(C2)]
- Lower = *C2;
- Upper = Lower.shl(Lower.countLeadingZeros()) + 1;
- } else if (match(LHS, m_NSWShl(m_APInt(C2), m_Value()))) {
- if (C2->isNegative()) {
- // 'shl nsw C2, x' produces [C2 << CLO(C2)-1, C2]
- unsigned ShiftAmount = C2->countLeadingOnes() - 1;
- Lower = C2->shl(ShiftAmount);
- Upper = *C2 + 1;
- } else {
- // 'shl nsw C2, x' produces [C2, C2 << CLZ(C2)-1]
- unsigned ShiftAmount = C2->countLeadingZeros() - 1;
- Lower = *C2;
- Upper = C2->shl(ShiftAmount) + 1;
- }
- } else if (match(LHS, m_LShr(m_Value(), m_APInt(C2)))) {
- // 'lshr x, C2' produces [0, UINT_MAX >> C2].
- APInt NegOne = APInt::getAllOnesValue(Width);
- if (C2->ult(Width))
- Upper = NegOne.lshr(*C2) + 1;
- } else if (match(LHS, m_LShr(m_APInt(C2), m_Value()))) {
- // 'lshr C2, x' produces [C2 >> (Width-1), C2].
- unsigned ShiftAmount = Width - 1;
- if (*C2 != 0 && cast<BinaryOperator>(LHS)->isExact())
- ShiftAmount = C2->countTrailingZeros();
- Lower = C2->lshr(ShiftAmount);
- Upper = *C2 + 1;
- } else if (match(LHS, m_AShr(m_Value(), m_APInt(C2)))) {
- // 'ashr x, C2' produces [INT_MIN >> C2, INT_MAX >> C2].
- APInt IntMin = APInt::getSignedMinValue(Width);
- APInt IntMax = APInt::getSignedMaxValue(Width);
- if (C2->ult(Width)) {
- Lower = IntMin.ashr(*C2);
- Upper = IntMax.ashr(*C2) + 1;
- }
- } else if (match(LHS, m_AShr(m_APInt(C2), m_Value()))) {
- unsigned ShiftAmount = Width - 1;
- if (*C2 != 0 && cast<BinaryOperator>(LHS)->isExact())
- ShiftAmount = C2->countTrailingZeros();
- if (C2->isNegative()) {
- // 'ashr C2, x' produces [C2, C2 >> (Width-1)]
- Lower = *C2;
- Upper = C2->ashr(ShiftAmount) + 1;
- } else {
- // 'ashr C2, x' produces [C2 >> (Width-1), C2]
- Lower = C2->ashr(ShiftAmount);
- Upper = *C2 + 1;
- }
- } else if (match(LHS, m_Or(m_Value(), m_APInt(C2)))) {
- // 'or x, C2' produces [C2, UINT_MAX].
- Lower = *C2;
- } else if (match(LHS, m_And(m_Value(), m_APInt(C2)))) {
- // 'and x, C2' produces [0, C2].
- Upper = *C2 + 1;
- } else if (match(LHS, m_NUWAdd(m_Value(), m_APInt(C2)))) {
- // 'add nuw x, C2' produces [C2, UINT_MAX].
- Lower = *C2;
- }
+ if (auto *BO = dyn_cast<BinaryOperator>(LHS))
+ setLimitsForBinOp(*BO, Lower, Upper);
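+ // If no case in setLimitsForBinOp matched, Lower and Upper are both still
+ // zero, and the Lower != Upper test below falls back to a full range.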
ConstantRange LHS_CR =
Lower != Upper ? ConstantRange(Lower, Upper) : ConstantRange(Width, true);
@@ -3064,8 +3107,8 @@ static Value *SimplifyICmpInst(unsigned Predicate, Value *LHS, Value *RHS,
// If both operands have range metadata, use the metadata
// to simplify the comparison.
if (isa<Instruction>(RHS) && isa<Instruction>(LHS)) {
- auto RHS_Instr = dyn_cast<Instruction>(RHS);
- auto LHS_Instr = dyn_cast<Instruction>(LHS);
+ auto RHS_Instr = cast<Instruction>(RHS);
+ auto LHS_Instr = cast<Instruction>(LHS);
if (RHS_Instr->getMetadata(LLVMContext::MD_range) &&
LHS_Instr->getMetadata(LLVMContext::MD_range)) {
@@ -4039,6 +4082,62 @@ Value *llvm::SimplifyCastInst(unsigned CastOpc, Value *Op, Type *Ty,
RecursionLimit);
}
+static Value *SimplifyShuffleVectorInst(Value *Op0, Value *Op1, Constant *Mask,
+ Type *RetTy, const Query &Q,
+ unsigned MaxRecurse) {
+ Type *InVecTy = Op0->getType();
+ unsigned MaskNumElts = Mask->getType()->getVectorNumElements();
+ unsigned InVecNumElts = InVecTy->getVectorNumElements();
+
+ auto *Op0Const = dyn_cast<Constant>(Op0);
+ auto *Op1Const = dyn_cast<Constant>(Op1);
+
+ // If all operands are constant, constant fold the shuffle.
+ if (Op0Const && Op1Const)
+ return ConstantFoldShuffleVectorInstruction(Op0Const, Op1Const, Mask);
+
+ // If only one of the operands is constant, constant fold the shuffle if the
+ // mask does not select elements from the variable operand.
+ bool MaskSelects0 = false, MaskSelects1 = false;
+ for (unsigned i = 0; i != MaskNumElts; ++i) {
+ int Idx = ShuffleVectorInst::getMaskValue(Mask, i);
+ if (Idx == -1)
+ continue;
+ if ((unsigned)Idx < InVecNumElts)
+ MaskSelects0 = true;
+ else
+ MaskSelects1 = true;
+ }
+ if (!MaskSelects0 && Op1Const)
+ return ConstantFoldShuffleVectorInstruction(UndefValue::get(InVecTy),
+ Op1Const, Mask);
+ if (!MaskSelects1 && Op0Const)
+ return ConstantFoldShuffleVectorInstruction(Op0Const,
+ UndefValue::get(InVecTy), Mask);
+
+ // A shuffle of a splat is always the splat itself. This is legal only if the
+ // shuffle's result type is the same as the input vectors' type.
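+ // For example (illustrative IR): if %s = shufflevector %v, undef,
+ // zeroinitializer, then any reshuffle of %s that selects lanes only from %s
+ // produces %s again, since every lane of %s is identical.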
+ if (auto *OpShuf = dyn_cast<ShuffleVectorInst>(Op0))
+ if (!MaskSelects1 && RetTy == InVecTy &&
+ OpShuf->getMask()->getSplatValue())
+ return Op0;
+ if (auto *OpShuf = dyn_cast<ShuffleVectorInst>(Op1))
+ if (!MaskSelects0 && RetTy == InVecTy &&
+ OpShuf->getMask()->getSplatValue())
+ return Op1;
+
+ return nullptr;
+}
+
+/// Given operands for a ShuffleVectorInst, fold the result or return null.
+Value *llvm::SimplifyShuffleVectorInst(
+ Value *Op0, Value *Op1, Constant *Mask, Type *RetTy,
+ const DataLayout &DL, const TargetLibraryInfo *TLI, const DominatorTree *DT,
+ AssumptionCache *AC, const Instruction *CxtI) {
+ return ::SimplifyShuffleVectorInst(
+ Op0, Op1, Mask, RetTy, Query(DL, TLI, DT, AC, CxtI), RecursionLimit);
+}
+
//=== Helper functions for higher up the class hierarchy.
/// Given operands for a BinaryOperator, see if we can fold the result.
@@ -4047,61 +4146,43 @@ static Value *SimplifyBinOp(unsigned Opcode, Value *LHS, Value *RHS,
const Query &Q, unsigned MaxRecurse) {
switch (Opcode) {
case Instruction::Add:
- return SimplifyAddInst(LHS, RHS, /*isNSW*/false, /*isNUW*/false,
- Q, MaxRecurse);
+ return SimplifyAddInst(LHS, RHS, false, false, Q, MaxRecurse);
case Instruction::FAdd:
return SimplifyFAddInst(LHS, RHS, FastMathFlags(), Q, MaxRecurse);
-
case Instruction::Sub:
- return SimplifySubInst(LHS, RHS, /*isNSW*/false, /*isNUW*/false,
- Q, MaxRecurse);
+ return SimplifySubInst(LHS, RHS, false, false, Q, MaxRecurse);
case Instruction::FSub:
return SimplifyFSubInst(LHS, RHS, FastMathFlags(), Q, MaxRecurse);
-
- case Instruction::Mul: return SimplifyMulInst (LHS, RHS, Q, MaxRecurse);
+ case Instruction::Mul:
+ return SimplifyMulInst(LHS, RHS, Q, MaxRecurse);
case Instruction::FMul:
- return SimplifyFMulInst (LHS, RHS, FastMathFlags(), Q, MaxRecurse);
- case Instruction::SDiv: return SimplifySDivInst(LHS, RHS, Q, MaxRecurse);
- case Instruction::UDiv: return SimplifyUDivInst(LHS, RHS, Q, MaxRecurse);
+ return SimplifyFMulInst(LHS, RHS, FastMathFlags(), Q, MaxRecurse);
+ case Instruction::SDiv:
+ return SimplifySDivInst(LHS, RHS, Q, MaxRecurse);
+ case Instruction::UDiv:
+ return SimplifyUDivInst(LHS, RHS, Q, MaxRecurse);
case Instruction::FDiv:
- return SimplifyFDivInst(LHS, RHS, FastMathFlags(), Q, MaxRecurse);
- case Instruction::SRem: return SimplifySRemInst(LHS, RHS, Q, MaxRecurse);
- case Instruction::URem: return SimplifyURemInst(LHS, RHS, Q, MaxRecurse);
+ return SimplifyFDivInst(LHS, RHS, FastMathFlags(), Q, MaxRecurse);
+ case Instruction::SRem:
+ return SimplifySRemInst(LHS, RHS, Q, MaxRecurse);
+ case Instruction::URem:
+ return SimplifyURemInst(LHS, RHS, Q, MaxRecurse);
case Instruction::FRem:
- return SimplifyFRemInst(LHS, RHS, FastMathFlags(), Q, MaxRecurse);
+ return SimplifyFRemInst(LHS, RHS, FastMathFlags(), Q, MaxRecurse);
case Instruction::Shl:
- return SimplifyShlInst(LHS, RHS, /*isNSW*/false, /*isNUW*/false,
- Q, MaxRecurse);
+ return SimplifyShlInst(LHS, RHS, false, false, Q, MaxRecurse);
case Instruction::LShr:
- return SimplifyLShrInst(LHS, RHS, /*isExact*/false, Q, MaxRecurse);
+ return SimplifyLShrInst(LHS, RHS, false, Q, MaxRecurse);
case Instruction::AShr:
- return SimplifyAShrInst(LHS, RHS, /*isExact*/false, Q, MaxRecurse);
- case Instruction::And: return SimplifyAndInst(LHS, RHS, Q, MaxRecurse);
- case Instruction::Or: return SimplifyOrInst (LHS, RHS, Q, MaxRecurse);
- case Instruction::Xor: return SimplifyXorInst(LHS, RHS, Q, MaxRecurse);
+ return SimplifyAShrInst(LHS, RHS, false, Q, MaxRecurse);
+ case Instruction::And:
+ return SimplifyAndInst(LHS, RHS, Q, MaxRecurse);
+ case Instruction::Or:
+ return SimplifyOrInst(LHS, RHS, Q, MaxRecurse);
+ case Instruction::Xor:
+ return SimplifyXorInst(LHS, RHS, Q, MaxRecurse);
default:
- if (Constant *CLHS = dyn_cast<Constant>(LHS))
- if (Constant *CRHS = dyn_cast<Constant>(RHS))
- return ConstantFoldBinaryOpOperands(Opcode, CLHS, CRHS, Q.DL);
-
- // If the operation is associative, try some generic simplifications.
- if (Instruction::isAssociative(Opcode))
- if (Value *V = SimplifyAssociativeBinOp(Opcode, LHS, RHS, Q, MaxRecurse))
- return V;
-
- // If the operation is with the result of a select instruction check whether
- // operating on either branch of the select always yields the same value.
- if (isa<SelectInst>(LHS) || isa<SelectInst>(RHS))
- if (Value *V = ThreadBinOpOverSelect(Opcode, LHS, RHS, Q, MaxRecurse))
- return V;
-
- // If the operation is with the result of a phi instruction, check whether
- // operating on all incoming values of the phi always yields the same value.
- if (isa<PHINode>(LHS) || isa<PHINode>(RHS))
- if (Value *V = ThreadBinOpOverPHI(Opcode, LHS, RHS, Q, MaxRecurse))
- return V;
-
- return nullptr;
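+ // All BinaryOps opcodes are handled by the cases above; the generic
+ // constant folding and select/phi threading now live in the per-opcode
+ // helpers, so any other opcode here is a caller error.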
+ llvm_unreachable("Unexpected opcode");
}
}
@@ -4267,6 +4348,7 @@ static Value *SimplifyIntrinsic(Function *F, IterTy ArgBegin, IterTy ArgEnd,
case Intrinsic::fabs: {
if (SignBitMustBeZero(*ArgBegin, Q.TLI))
return *ArgBegin;
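+ // Otherwise return null explicitly rather than falling through into the
+ // default case.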
+ return nullptr;
}
default:
return nullptr;
@@ -4396,7 +4478,8 @@ Value *llvm::SimplifyCall(Value *V, ArrayRef<Value *> Args,
/// If not, this returns null.
Value *llvm::SimplifyInstruction(Instruction *I, const DataLayout &DL,
const TargetLibraryInfo *TLI,
- const DominatorTree *DT, AssumptionCache *AC) {
+ const DominatorTree *DT, AssumptionCache *AC,
+ OptimizationRemarkEmitter *ORE) {
Value *Result;
switch (I->getOpcode()) {
@@ -4522,6 +4605,13 @@ Value *llvm::SimplifyInstruction(Instruction *I, const DataLayout &DL,
EEI->getVectorOperand(), EEI->getIndexOperand(), DL, TLI, DT, AC, I);
break;
}
+ case Instruction::ShuffleVector: {
+ auto *SVI = cast<ShuffleVectorInst>(I);
+ Result = SimplifyShuffleVectorInst(SVI->getOperand(0), SVI->getOperand(1),
+ SVI->getMask(), SVI->getType(), DL, TLI,
+ DT, AC, I);
+ break;
+ }
case Instruction::PHI:
Result = SimplifyPHINode(cast<PHINode>(I), Query(DL, TLI, DT, AC, I));
break;
@@ -4537,6 +4627,10 @@ Value *llvm::SimplifyInstruction(Instruction *I, const DataLayout &DL,
Result = SimplifyCastInst(I->getOpcode(), I->getOperand(0), I->getType(),
DL, TLI, DT, AC, I);
break;
+ case Instruction::Alloca:
+ // There are no simplifications for Alloca, and it cannot be constant folded.
+ Result = nullptr;
+ break;
}
// In general, it is possible for computeKnownBits to determine all bits in a
@@ -4545,7 +4639,7 @@ Value *llvm::SimplifyInstruction(Instruction *I, const DataLayout &DL,
unsigned BitWidth = I->getType()->getScalarSizeInBits();
APInt KnownZero(BitWidth, 0);
APInt KnownOne(BitWidth, 0);
- computeKnownBits(I, KnownZero, KnownOne, DL, /*Depth*/0, AC, I, DT);
+ computeKnownBits(I, KnownZero, KnownOne, DL, /*Depth*/0, AC, I, DT, ORE);
if ((KnownZero | KnownOne).isAllOnesValue())
Result = ConstantInt::get(I->getType(), KnownOne);
}
diff --git a/lib/Analysis/IteratedDominanceFrontier.cpp b/lib/Analysis/IteratedDominanceFrontier.cpp
index d1374acd963e..2a736ec0379c 100644
--- a/lib/Analysis/IteratedDominanceFrontier.cpp
+++ b/lib/Analysis/IteratedDominanceFrontier.cpp
@@ -64,10 +64,7 @@ void IDFCalculator<NodeTy>::calculate(
BasicBlock *BB = Node->getBlock();
// Succ is the successor in the direction we are calculating IDF, so it is
// successor for IDF, and predecessor for Reverse IDF.
- for (auto SuccIter = GraphTraits<NodeTy>::child_begin(BB),
- End = GraphTraits<NodeTy>::child_end(BB);
- SuccIter != End; ++SuccIter) {
- BasicBlock *Succ = *SuccIter;
+ for (auto *Succ : children<NodeTy>(BB)) {
DomTreeNode *SuccNode = DT.getNode(Succ);
// Quickly skip all CFG edges that are also dominator tree edges instead
diff --git a/lib/Analysis/LazyBlockFrequencyInfo.cpp b/lib/Analysis/LazyBlockFrequencyInfo.cpp
index 596b6fc1afb5..a8178ecc0a24 100644
--- a/lib/Analysis/LazyBlockFrequencyInfo.cpp
+++ b/lib/Analysis/LazyBlockFrequencyInfo.cpp
@@ -9,7 +9,7 @@
//
// This is an alternative analysis pass to BlockFrequencyInfoWrapperPass. The
// difference is that with this pass the block frequencies are not computed when
-// the analysis pass is executed but rather when the BFI results is explicitly
+// the analysis pass is executed but rather when the BFI result is explicitly
// requested by the analysis client.
//
//===----------------------------------------------------------------------===//
diff --git a/lib/Analysis/LazyCallGraph.cpp b/lib/Analysis/LazyCallGraph.cpp
index f7cf8c6729f2..eef56815f2e0 100644
--- a/lib/Analysis/LazyCallGraph.cpp
+++ b/lib/Analysis/LazyCallGraph.cpp
@@ -18,26 +18,50 @@
#include "llvm/IR/PassManager.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/GraphWriter.h"
+#include <utility>
using namespace llvm;
#define DEBUG_TYPE "lcg"
+void LazyCallGraph::EdgeSequence::insertEdgeInternal(Node &TargetN,
+ Edge::Kind EK) {
+ EdgeIndexMap.insert({&TargetN, Edges.size()});
+ Edges.emplace_back(TargetN, EK);
+}
+
+void LazyCallGraph::EdgeSequence::setEdgeKind(Node &TargetN, Edge::Kind EK) {
+ Edges[EdgeIndexMap.find(&TargetN)->second].setKind(EK);
+}
+
+bool LazyCallGraph::EdgeSequence::removeEdgeInternal(Node &TargetN) {
+ auto IndexMapI = EdgeIndexMap.find(&TargetN);
+ if (IndexMapI == EdgeIndexMap.end())
+ return false;
+
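+ // Null out the edge rather than erasing it so that the indices of the
+ // remaining edges recorded in EdgeIndexMap stay valid.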
+ Edges[IndexMapI->second] = Edge();
+ EdgeIndexMap.erase(IndexMapI);
+ return true;
+}
+
static void addEdge(SmallVectorImpl<LazyCallGraph::Edge> &Edges,
- DenseMap<Function *, int> &EdgeIndexMap, Function &F,
- LazyCallGraph::Edge::Kind EK) {
- if (!EdgeIndexMap.insert({&F, Edges.size()}).second)
+ DenseMap<LazyCallGraph::Node *, int> &EdgeIndexMap,
+ LazyCallGraph::Node &N, LazyCallGraph::Edge::Kind EK) {
+ if (!EdgeIndexMap.insert({&N, Edges.size()}).second)
return;
- DEBUG(dbgs() << " Added callable function: " << F.getName() << "\n");
- Edges.emplace_back(LazyCallGraph::Edge(F, EK));
+ DEBUG(dbgs() << " Added callable function: " << N.getName() << "\n");
+ Edges.emplace_back(LazyCallGraph::Edge(N, EK));
}
-LazyCallGraph::Node::Node(LazyCallGraph &G, Function &F)
- : G(&G), F(F), DFSNumber(0), LowLink(0) {
- DEBUG(dbgs() << " Adding functions called by '" << F.getName()
+LazyCallGraph::EdgeSequence &LazyCallGraph::Node::populateSlow() {
+ assert(!Edges && "Must not have already populated the edges for this node!");
+
+ DEBUG(dbgs() << " Adding functions called by '" << getName()
<< "' to the graph.\n");
+ Edges = EdgeSequence();
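+ // From here on the node reports itself as populated; the scan below fills
+ // in the actual edges.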
+
SmallVector<Constant *, 16> Worklist;
SmallPtrSet<Function *, 4> Callees;
SmallPtrSet<Constant *, 16> Visited;
@@ -58,14 +82,15 @@ LazyCallGraph::Node::Node(LazyCallGraph &G, Function &F)
// alias. Then a test of the address of the weak function against the new
// strong definition's address would be an effective way to determine the
// safety of optimizing a direct call edge.
- for (BasicBlock &BB : F)
+ for (BasicBlock &BB : *F)
for (Instruction &I : BB) {
if (auto CS = CallSite(&I))
if (Function *Callee = CS.getCalledFunction())
if (!Callee->isDeclaration())
if (Callees.insert(Callee).second) {
Visited.insert(Callee);
- addEdge(Edges, EdgeIndexMap, *Callee, LazyCallGraph::Edge::Call);
+ addEdge(Edges->Edges, Edges->EdgeIndexMap, G->get(*Callee),
+ LazyCallGraph::Edge::Call);
}
for (Value *Op : I.operand_values())
@@ -78,50 +103,33 @@ LazyCallGraph::Node::Node(LazyCallGraph &G, Function &F)
// function containing) operands to all of the instructions in the function.
// Process them (recursively) collecting every function found.
visitReferences(Worklist, Visited, [&](Function &F) {
- addEdge(Edges, EdgeIndexMap, F, LazyCallGraph::Edge::Ref);
+ addEdge(Edges->Edges, Edges->EdgeIndexMap, G->get(F),
+ LazyCallGraph::Edge::Ref);
});
-}
-
-void LazyCallGraph::Node::insertEdgeInternal(Function &Target, Edge::Kind EK) {
- if (Node *N = G->lookup(Target))
- return insertEdgeInternal(*N, EK);
-
- EdgeIndexMap.insert({&Target, Edges.size()});
- Edges.emplace_back(Target, EK);
-}
-void LazyCallGraph::Node::insertEdgeInternal(Node &TargetN, Edge::Kind EK) {
- EdgeIndexMap.insert({&TargetN.getFunction(), Edges.size()});
- Edges.emplace_back(TargetN, EK);
+ return *Edges;
}
-void LazyCallGraph::Node::setEdgeKind(Function &TargetF, Edge::Kind EK) {
- Edges[EdgeIndexMap.find(&TargetF)->second].setKind(EK);
+void LazyCallGraph::Node::replaceFunction(Function &NewF) {
+ assert(F != &NewF && "Must not replace a function with itself!");
+ F = &NewF;
}
-void LazyCallGraph::Node::removeEdgeInternal(Function &Target) {
- auto IndexMapI = EdgeIndexMap.find(&Target);
- assert(IndexMapI != EdgeIndexMap.end() &&
- "Target not in the edge set for this caller?");
-
- Edges[IndexMapI->second] = Edge();
- EdgeIndexMap.erase(IndexMapI);
-}
-
-void LazyCallGraph::Node::dump() const {
+#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
+LLVM_DUMP_METHOD void LazyCallGraph::Node::dump() const {
dbgs() << *this << '\n';
}
+#endif
-LazyCallGraph::LazyCallGraph(Module &M) : NextDFSNumber(0) {
+LazyCallGraph::LazyCallGraph(Module &M) {
DEBUG(dbgs() << "Building CG for module: " << M.getModuleIdentifier()
<< "\n");
for (Function &F : M)
- if (!F.isDeclaration() && !F.hasLocalLinkage())
- if (EntryIndexMap.insert({&F, EntryEdges.size()}).second) {
- DEBUG(dbgs() << " Adding '" << F.getName()
- << "' to entry set of the graph.\n");
- EntryEdges.emplace_back(F, Edge::Ref);
- }
+ if (!F.isDeclaration() && !F.hasLocalLinkage()) {
+ DEBUG(dbgs() << " Adding '" << F.getName()
+ << "' to entry set of the graph.\n");
+ addEdge(EntryEdges.Edges, EntryEdges.EdgeIndexMap, get(F), Edge::Ref);
+ }
// Now add entry nodes for functions reachable via initializers to globals.
SmallVector<Constant *, 16> Worklist;
@@ -134,21 +142,15 @@ LazyCallGraph::LazyCallGraph(Module &M) : NextDFSNumber(0) {
DEBUG(dbgs() << " Adding functions referenced by global initializers to the "
"entry set.\n");
visitReferences(Worklist, Visited, [&](Function &F) {
- addEdge(EntryEdges, EntryIndexMap, F, LazyCallGraph::Edge::Ref);
+ addEdge(EntryEdges.Edges, EntryEdges.EdgeIndexMap, get(F),
+ LazyCallGraph::Edge::Ref);
});
-
- for (const Edge &E : EntryEdges)
- RefSCCEntryNodes.push_back(&E.getFunction());
}
LazyCallGraph::LazyCallGraph(LazyCallGraph &&G)
: BPA(std::move(G.BPA)), NodeMap(std::move(G.NodeMap)),
- EntryEdges(std::move(G.EntryEdges)),
- EntryIndexMap(std::move(G.EntryIndexMap)), SCCBPA(std::move(G.SCCBPA)),
- SCCMap(std::move(G.SCCMap)), LeafRefSCCs(std::move(G.LeafRefSCCs)),
- DFSStack(std::move(G.DFSStack)),
- RefSCCEntryNodes(std::move(G.RefSCCEntryNodes)),
- NextDFSNumber(G.NextDFSNumber) {
+ EntryEdges(std::move(G.EntryEdges)), SCCBPA(std::move(G.SCCBPA)),
+ SCCMap(std::move(G.SCCMap)), LeafRefSCCs(std::move(G.LeafRefSCCs)) {
updateGraphPtrs();
}
@@ -156,20 +158,18 @@ LazyCallGraph &LazyCallGraph::operator=(LazyCallGraph &&G) {
BPA = std::move(G.BPA);
NodeMap = std::move(G.NodeMap);
EntryEdges = std::move(G.EntryEdges);
- EntryIndexMap = std::move(G.EntryIndexMap);
SCCBPA = std::move(G.SCCBPA);
SCCMap = std::move(G.SCCMap);
LeafRefSCCs = std::move(G.LeafRefSCCs);
- DFSStack = std::move(G.DFSStack);
- RefSCCEntryNodes = std::move(G.RefSCCEntryNodes);
- NextDFSNumber = G.NextDFSNumber;
updateGraphPtrs();
return *this;
}
-void LazyCallGraph::SCC::dump() const {
+#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
+LLVM_DUMP_METHOD void LazyCallGraph::SCC::dump() const {
dbgs() << *this << '\n';
}
+#endif
#ifndef NDEBUG
void LazyCallGraph::SCC::verify() {
@@ -184,8 +184,8 @@ void LazyCallGraph::SCC::verify() {
"Must set DFS numbers to -1 when adding a node to an SCC!");
assert(N->LowLink == -1 &&
"Must set low link to -1 when adding a node to an SCC!");
- for (Edge &E : *N)
- assert(E.getNode() && "Can't have an edge to a raw function!");
+ for (Edge &E : **N)
+ assert(E.getNode() && "Can't have an unpopulated node!");
}
}
#endif
@@ -195,10 +195,9 @@ bool LazyCallGraph::SCC::isParentOf(const SCC &C) const {
return false;
for (Node &N : *this)
- for (Edge &E : N.calls())
- if (Node *CalleeN = E.getNode())
- if (OuterRefSCC->G->lookupSCC(*CalleeN) == &C)
- return true;
+ for (Edge &E : N->calls())
+ if (OuterRefSCC->G->lookupSCC(E.getNode()) == &C)
+ return true;
// No edges found.
return false;
@@ -218,11 +217,8 @@ bool LazyCallGraph::SCC::isAncestorOf(const SCC &TargetC) const {
do {
const SCC &C = *Worklist.pop_back_val();
for (Node &N : C)
- for (Edge &E : N.calls()) {
- Node *CalleeN = E.getNode();
- if (!CalleeN)
- continue;
- SCC *CalleeC = G.lookupSCC(*CalleeN);
+ for (Edge &E : N->calls()) {
+ SCC *CalleeC = G.lookupSCC(E.getNode());
if (!CalleeC)
continue;
@@ -243,9 +239,11 @@ bool LazyCallGraph::SCC::isAncestorOf(const SCC &TargetC) const {
LazyCallGraph::RefSCC::RefSCC(LazyCallGraph &G) : G(&G) {}
-void LazyCallGraph::RefSCC::dump() const {
+#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
+LLVM_DUMP_METHOD void LazyCallGraph::RefSCC::dump() const {
dbgs() << *this << '\n';
}
+#endif
#ifndef NDEBUG
void LazyCallGraph::RefSCC::verify() {
@@ -279,10 +277,10 @@ void LazyCallGraph::RefSCC::verify() {
for (int i = 0, Size = SCCs.size(); i < Size; ++i) {
SCC &SourceSCC = *SCCs[i];
for (Node &N : SourceSCC)
- for (Edge &E : N) {
+ for (Edge &E : *N) {
if (!E.isCall())
continue;
- SCC &TargetSCC = *G->lookupSCC(*E.getNode());
+ SCC &TargetSCC = *G->lookupSCC(E.getNode());
if (&TargetSCC.getOuterRefSCC() == this) {
assert(SCCIndices.find(&TargetSCC)->second <= i &&
"Edge between SCCs violates post-order relationship.");
@@ -299,8 +297,8 @@ void LazyCallGraph::RefSCC::verify() {
auto HasConnectingEdge = [&] {
for (SCC &C : *ParentRC)
for (Node &N : C)
- for (Edge &E : N)
- if (G->lookupRefSCC(*E.getNode()) == this)
+ for (Edge &E : *N)
+ if (G->lookupRefSCC(E.getNode()) == this)
return true;
return false;
};
@@ -461,7 +459,7 @@ updatePostorderSequenceForEdgeInsertion(
SmallVector<LazyCallGraph::SCC *, 1>
LazyCallGraph::RefSCC::switchInternalEdgeToCall(Node &SourceN, Node &TargetN) {
- assert(!SourceN[TargetN].isCall() && "Must start with a ref edge!");
+ assert(!(*SourceN)[TargetN].isCall() && "Must start with a ref edge!");
SmallVector<SCC *, 1> DeletedSCCs;
#ifndef NDEBUG
@@ -477,7 +475,7 @@ LazyCallGraph::RefSCC::switchInternalEdgeToCall(Node &SourceN, Node &TargetN) {
// If the two nodes are already part of the same SCC, we're also done as
// we've just added more connectivity.
if (&SourceSCC == &TargetSCC) {
- SourceN.setEdgeKind(TargetN.getFunction(), Edge::Call);
+ SourceN->setEdgeKind(TargetN, Edge::Call);
return DeletedSCCs;
}
@@ -490,7 +488,7 @@ LazyCallGraph::RefSCC::switchInternalEdgeToCall(Node &SourceN, Node &TargetN) {
int SourceIdx = SCCIndices[&SourceSCC];
int TargetIdx = SCCIndices[&TargetSCC];
if (TargetIdx < SourceIdx) {
- SourceN.setEdgeKind(TargetN.getFunction(), Edge::Call);
+ SourceN->setEdgeKind(TargetN, Edge::Call);
return DeletedSCCs;
}
@@ -504,11 +502,9 @@ LazyCallGraph::RefSCC::switchInternalEdgeToCall(Node &SourceN, Node &TargetN) {
ConnectedSet.insert(&SourceSCC);
auto IsConnected = [&](SCC &C) {
for (Node &N : C)
- for (Edge &E : N.calls()) {
- assert(E.getNode() && "Must have formed a node within an SCC!");
- if (ConnectedSet.count(G->lookupSCC(*E.getNode())))
+ for (Edge &E : N->calls())
+ if (ConnectedSet.count(G->lookupSCC(E.getNode())))
return true;
- }
return false;
};
@@ -535,11 +531,10 @@ LazyCallGraph::RefSCC::switchInternalEdgeToCall(Node &SourceN, Node &TargetN) {
do {
SCC &C = *Worklist.pop_back_val();
for (Node &N : C)
- for (Edge &E : N) {
- assert(E.getNode() && "Must have formed a node within an SCC!");
+ for (Edge &E : *N) {
if (!E.isCall())
continue;
- SCC &EdgeC = *G->lookupSCC(*E.getNode());
+ SCC &EdgeC = *G->lookupSCC(E.getNode());
if (&EdgeC.getOuterRefSCC() != this)
// Not in this RefSCC...
continue;
@@ -565,7 +560,7 @@ LazyCallGraph::RefSCC::switchInternalEdgeToCall(Node &SourceN, Node &TargetN) {
// new cycles. We're done.
if (MergeRange.begin() == MergeRange.end()) {
// Now that the SCC structure is finalized, flip the kind to call.
- SourceN.setEdgeKind(TargetN.getFunction(), Edge::Call);
+ SourceN->setEdgeKind(TargetN, Edge::Call);
return DeletedSCCs;
}
@@ -600,7 +595,7 @@ LazyCallGraph::RefSCC::switchInternalEdgeToCall(Node &SourceN, Node &TargetN) {
SCCIndices[C] -= IndexOffset;
// Now that the SCC structure is finalized, flip the kind to call.
- SourceN.setEdgeKind(TargetN.getFunction(), Edge::Call);
+ SourceN->setEdgeKind(TargetN, Edge::Call);
// And we're done!
return DeletedSCCs;
@@ -608,7 +603,7 @@ LazyCallGraph::RefSCC::switchInternalEdgeToCall(Node &SourceN, Node &TargetN) {
void LazyCallGraph::RefSCC::switchTrivialInternalEdgeToRef(Node &SourceN,
Node &TargetN) {
- assert(SourceN[TargetN].isCall() && "Must start with a call edge!");
+ assert((*SourceN)[TargetN].isCall() && "Must start with a call edge!");
#ifndef NDEBUG
// In a debug build, verify the RefSCC is valid to start with and when this
@@ -625,12 +620,12 @@ void LazyCallGraph::RefSCC::switchTrivialInternalEdgeToRef(Node &SourceN,
"Source and Target must be in separate SCCs for this to be trivial!");
// Set the edge kind.
- SourceN.setEdgeKind(TargetN.getFunction(), Edge::Ref);
+ SourceN->setEdgeKind(TargetN, Edge::Ref);
}
iterator_range<LazyCallGraph::RefSCC::iterator>
LazyCallGraph::RefSCC::switchInternalEdgeToRef(Node &SourceN, Node &TargetN) {
- assert(SourceN[TargetN].isCall() && "Must start with a call edge!");
+ assert((*SourceN)[TargetN].isCall() && "Must start with a call edge!");
#ifndef NDEBUG
// In a debug build, verify the RefSCC is valid to start with and when this
@@ -650,7 +645,7 @@ LazyCallGraph::RefSCC::switchInternalEdgeToRef(Node &SourceN, Node &TargetN) {
"full CG update.");
// Set the edge kind.
- SourceN.setEdgeKind(TargetN.getFunction(), Edge::Ref);
+ SourceN->setEdgeKind(TargetN, Edge::Ref);
// Otherwise we are removing a call edge from a single SCC. This may break
// the cycle. In order to compute the new set of SCCs, we need to do a small
@@ -665,7 +660,7 @@ LazyCallGraph::RefSCC::switchInternalEdgeToRef(Node &SourceN, Node &TargetN) {
// etc.
SCC &OldSCC = TargetSCC;
- SmallVector<std::pair<Node *, call_edge_iterator>, 16> DFSStack;
+ SmallVector<std::pair<Node *, EdgeSequence::call_iterator>, 16> DFSStack;
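+ // Note that a populated Node dereferences to its EdgeSequence, which is why
+ // the DFS below writes (*N)->call_begin() rather than N->call_begin().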
SmallVector<Node *, 16> PendingSCCStack;
SmallVector<SCC *, 4> NewSCCs;
@@ -706,14 +701,14 @@ LazyCallGraph::RefSCC::switchInternalEdgeToRef(Node &SourceN, Node &TargetN) {
RootN->DFSNumber = RootN->LowLink = 1;
int NextDFSNumber = 2;
- DFSStack.push_back({RootN, RootN->call_begin()});
+ DFSStack.push_back({RootN, (*RootN)->call_begin()});
do {
Node *N;
- call_edge_iterator I;
+ EdgeSequence::call_iterator I;
std::tie(N, I) = DFSStack.pop_back_val();
- auto E = N->call_end();
+ auto E = (*N)->call_end();
while (I != E) {
- Node &ChildN = *I->getNode();
+ Node &ChildN = I->getNode();
if (ChildN.DFSNumber == 0) {
// We haven't yet visited this child, so descend, pushing the current
// node onto the stack.
@@ -723,8 +718,8 @@ LazyCallGraph::RefSCC::switchInternalEdgeToRef(Node &SourceN, Node &TargetN) {
"Found a node with 0 DFS number but already in an SCC!");
ChildN.DFSNumber = ChildN.LowLink = NextDFSNumber++;
N = &ChildN;
- I = N->call_begin();
- E = N->call_end();
+ I = (*N)->call_begin();
+ E = (*N)->call_end();
continue;
}
@@ -817,17 +812,19 @@ LazyCallGraph::RefSCC::switchInternalEdgeToRef(Node &SourceN, Node &TargetN) {
void LazyCallGraph::RefSCC::switchOutgoingEdgeToCall(Node &SourceN,
Node &TargetN) {
- assert(!SourceN[TargetN].isCall() && "Must start with a ref edge!");
+ assert(!(*SourceN)[TargetN].isCall() && "Must start with a ref edge!");
assert(G->lookupRefSCC(SourceN) == this && "Source must be in this RefSCC.");
assert(G->lookupRefSCC(TargetN) != this &&
"Target must not be in this RefSCC.");
+#ifdef EXPENSIVE_CHECKS
assert(G->lookupRefSCC(TargetN)->isDescendantOf(*this) &&
"Target must be a descendant of the Source.");
+#endif
// Edges between RefSCCs are the same regardless of call or ref, so we can
// just flip the edge here.
- SourceN.setEdgeKind(TargetN.getFunction(), Edge::Call);
+ SourceN->setEdgeKind(TargetN, Edge::Call);
#ifndef NDEBUG
// Check that the RefSCC is still valid.
@@ -837,17 +834,19 @@ void LazyCallGraph::RefSCC::switchOutgoingEdgeToCall(Node &SourceN,
void LazyCallGraph::RefSCC::switchOutgoingEdgeToRef(Node &SourceN,
Node &TargetN) {
- assert(SourceN[TargetN].isCall() && "Must start with a call edge!");
+ assert((*SourceN)[TargetN].isCall() && "Must start with a call edge!");
assert(G->lookupRefSCC(SourceN) == this && "Source must be in this RefSCC.");
assert(G->lookupRefSCC(TargetN) != this &&
"Target must not be in this RefSCC.");
+#ifdef EXPENSIVE_CHECKS
assert(G->lookupRefSCC(TargetN)->isDescendantOf(*this) &&
"Target must be a descendant of the Source.");
+#endif
// Edges between RefSCCs are the same regardless of call or ref, so we can
// just flip the edge here.
- SourceN.setEdgeKind(TargetN.getFunction(), Edge::Ref);
+ SourceN->setEdgeKind(TargetN, Edge::Ref);
#ifndef NDEBUG
// Check that the RefSCC is still valid.
@@ -860,7 +859,7 @@ void LazyCallGraph::RefSCC::insertInternalRefEdge(Node &SourceN,
assert(G->lookupRefSCC(SourceN) == this && "Source must be in this RefSCC.");
assert(G->lookupRefSCC(TargetN) == this && "Target must be in this RefSCC.");
- SourceN.insertEdgeInternal(TargetN, Edge::Ref);
+ SourceN->insertEdgeInternal(TargetN, Edge::Ref);
#ifndef NDEBUG
// Check that the RefSCC is still valid.
@@ -871,14 +870,16 @@ void LazyCallGraph::RefSCC::insertInternalRefEdge(Node &SourceN,
void LazyCallGraph::RefSCC::insertOutgoingEdge(Node &SourceN, Node &TargetN,
Edge::Kind EK) {
// First insert it into the caller.
- SourceN.insertEdgeInternal(TargetN, EK);
+ SourceN->insertEdgeInternal(TargetN, EK);
assert(G->lookupRefSCC(SourceN) == this && "Source must be in this RefSCC.");
RefSCC &TargetC = *G->lookupRefSCC(TargetN);
assert(&TargetC != this && "Target must not be in this RefSCC.");
+#ifdef EXPENSIVE_CHECKS
assert(TargetC.isDescendantOf(*this) &&
"Target must be a descendant of the Source.");
+#endif
// The only change required is to add this SCC to the parent set of the
// callee.
@@ -895,8 +896,10 @@ LazyCallGraph::RefSCC::insertIncomingRefEdge(Node &SourceN, Node &TargetN) {
assert(G->lookupRefSCC(TargetN) == this && "Target must be in this RefSCC.");
RefSCC &SourceC = *G->lookupRefSCC(SourceN);
assert(&SourceC != this && "Source must not be in this RefSCC.");
+#ifdef EXPENSIVE_CHECKS
assert(SourceC.isDescendantOf(*this) &&
"Source must be a descendant of the Target.");
+#endif
SmallVector<RefSCC *, 1> DeletedRefSCCs;
@@ -951,9 +954,8 @@ LazyCallGraph::RefSCC::insertIncomingRefEdge(Node &SourceN, Node &TargetN) {
RefSCC &RC = *Worklist.pop_back_val();
for (SCC &C : RC)
for (Node &N : C)
- for (Edge &E : N) {
- assert(E.getNode() && "Must have formed a node!");
- RefSCC &EdgeRC = *G->lookupRefSCC(*E.getNode());
+ for (Edge &E : *N) {
+ RefSCC &EdgeRC = *G->lookupRefSCC(E.getNode());
if (G->getRefSCCIndex(EdgeRC) <= SourceIdx)
// Not in the postorder sequence between source and target.
continue;
@@ -1003,10 +1005,8 @@ LazyCallGraph::RefSCC::insertIncomingRefEdge(Node &SourceN, Node &TargetN) {
SCCIndices[&InnerC] = SCCIndex++;
for (Node &N : InnerC) {
G->SCCMap[&N] = &InnerC;
- for (Edge &E : N) {
- assert(E.getNode() &&
- "Cannot have a null node within a visited SCC!");
- RefSCC &ChildRC = *G->lookupRefSCC(*E.getNode());
+ for (Edge &E : *N) {
+ RefSCC &ChildRC = *G->lookupRefSCC(E.getNode());
if (MergeSet.count(&ChildRC))
continue;
ChildRC.Parents.erase(RC);
@@ -1042,7 +1042,7 @@ LazyCallGraph::RefSCC::insertIncomingRefEdge(Node &SourceN, Node &TargetN) {
// At this point we have a merged RefSCC with a post-order SCCs list, just
// connect the nodes to form the new edge.
- SourceN.insertEdgeInternal(TargetN, Edge::Ref);
+ SourceN->insertEdgeInternal(TargetN, Edge::Ref);
// We return the list of SCCs which were merged so that callers can
// invalidate any data they have associated with those SCCs. Note that these
@@ -1069,15 +1069,16 @@ void LazyCallGraph::RefSCC::removeOutgoingEdge(Node &SourceN, Node &TargetN) {
#endif
// First remove it from the node.
- SourceN.removeEdgeInternal(TargetN.getFunction());
+ bool Removed = SourceN->removeEdgeInternal(TargetN);
+ (void)Removed;
+ assert(Removed && "Target not in the edge set for this caller?");
bool HasOtherEdgeToChildRC = false;
bool HasOtherChildRC = false;
for (SCC *InnerC : SCCs) {
for (Node &N : *InnerC) {
- for (Edge &E : N) {
- assert(E.getNode() && "Cannot have a missing node in a visited SCC!");
- RefSCC &OtherChildRC = *G->lookupRefSCC(*E.getNode());
+ for (Edge &E : *N) {
+ RefSCC &OtherChildRC = *G->lookupRefSCC(E.getNode());
if (&OtherChildRC == &TargetRC) {
HasOtherEdgeToChildRC = true;
break;
@@ -1116,7 +1117,7 @@ void LazyCallGraph::RefSCC::removeOutgoingEdge(Node &SourceN, Node &TargetN) {
SmallVector<LazyCallGraph::RefSCC *, 1>
LazyCallGraph::RefSCC::removeInternalRefEdge(Node &SourceN, Node &TargetN) {
- assert(!SourceN[TargetN].isCall() &&
+ assert(!(*SourceN)[TargetN].isCall() &&
"Cannot remove a call edge, it must first be made a ref edge");
#ifndef NDEBUG
@@ -1127,7 +1128,9 @@ LazyCallGraph::RefSCC::removeInternalRefEdge(Node &SourceN, Node &TargetN) {
#endif
// First remove the actual edge.
- SourceN.removeEdgeInternal(TargetN.getFunction());
+ bool Removed = SourceN->removeEdgeInternal(TargetN);
+ (void)Removed;
+ assert(Removed && "Target not in the edge set for this caller?");
// We return a list of the resulting *new* RefSCCs in post-order.
SmallVector<RefSCC *, 1> Result;
@@ -1186,7 +1189,7 @@ LazyCallGraph::RefSCC::removeInternalRefEdge(Node &SourceN, Node &TargetN) {
PostOrderMapping[&N] = Number;
};
- SmallVector<std::pair<Node *, edge_iterator>, 4> DFSStack;
+ SmallVector<std::pair<Node *, EdgeSequence::iterator>, 4> DFSStack;
SmallVector<Node *, 4> PendingRefSCCStack;
do {
assert(DFSStack.empty() &&
@@ -1205,18 +1208,18 @@ LazyCallGraph::RefSCC::removeInternalRefEdge(Node &SourceN, Node &TargetN) {
RootN->DFSNumber = RootN->LowLink = 1;
int NextDFSNumber = 2;
- DFSStack.push_back({RootN, RootN->begin()});
+ DFSStack.push_back({RootN, (*RootN)->begin()});
do {
Node *N;
- edge_iterator I;
+ EdgeSequence::iterator I;
std::tie(N, I) = DFSStack.pop_back_val();
- auto E = N->end();
+ auto E = (*N)->end();
assert(N->DFSNumber != 0 && "We should always assign a DFS number "
"before processing a node.");
while (I != E) {
- Node &ChildN = I->getNode(*G);
+ Node &ChildN = I->getNode();
if (ChildN.DFSNumber == 0) {
// Mark that we should start at this child when next this node is the
// top of the stack. We don't start at the next child to ensure this
@@ -1226,8 +1229,8 @@ LazyCallGraph::RefSCC::removeInternalRefEdge(Node &SourceN, Node &TargetN) {
// Continue, resetting to the child node.
ChildN.LowLink = ChildN.DFSNumber = NextDFSNumber++;
N = &ChildN;
- I = ChildN.begin();
- E = ChildN.end();
+ I = ChildN->begin();
+ E = ChildN->end();
continue;
}
if (ChildN.DFSNumber == -1) {
@@ -1382,9 +1385,8 @@ LazyCallGraph::RefSCC::removeInternalRefEdge(Node &SourceN, Node &TargetN) {
#endif
for (SCC *C : SCCs)
for (Node &N : *C) {
- for (Edge &E : N) {
- assert(E.getNode() && "Cannot have a missing node in a visited SCC!");
- RefSCC &ChildRC = *G->lookupRefSCC(*E.getNode());
+ for (Edge &E : *N) {
+ RefSCC &ChildRC = *G->lookupRefSCC(E.getNode());
if (&ChildRC == this)
continue;
ChildRC.Parents.insert(this);
@@ -1408,9 +1410,8 @@ LazyCallGraph::RefSCC::removeInternalRefEdge(Node &SourceN, Node &TargetN) {
for (RefSCC *ParentRC : OldParents)
for (SCC &ParentC : *ParentRC)
for (Node &ParentN : ParentC)
- for (Edge &E : ParentN) {
- assert(E.getNode() && "Cannot have a missing node in a visited SCC!");
- RefSCC &RC = *G->lookupRefSCC(*E.getNode());
+ for (Edge &E : *ParentN) {
+ RefSCC &RC = *G->lookupRefSCC(E.getNode());
if (&RC != ParentRC)
RC.Parents.insert(ParentRC);
}
@@ -1448,8 +1449,10 @@ void LazyCallGraph::RefSCC::handleTrivialEdgeInsertion(Node &SourceN,
return;
}
+#ifdef EXPENSIVE_CHECKS
assert(TargetRC.isDescendantOf(*this) &&
"Target must be a descendant of the Source.");
+#endif
// The only change required is to add this RefSCC to the parent set of the
// target. This is a set and so idempotent if the edge already existed.
TargetRC.Parents.insert(this);
@@ -1461,25 +1464,29 @@ void LazyCallGraph::RefSCC::insertTrivialCallEdge(Node &SourceN,
// Check that the RefSCC is still valid when we finish.
auto ExitVerifier = make_scope_exit([this] { verify(); });
- // Check that we aren't breaking some invariants of the SCC graph.
+#ifdef EXPENSIVE_CHECKS
+ // Check that we aren't breaking some invariants of the SCC graph. Note that
+ // this is quadratic in the number of edges in the call graph!
SCC &SourceC = *G->lookupSCC(SourceN);
SCC &TargetC = *G->lookupSCC(TargetN);
if (&SourceC != &TargetC)
assert(SourceC.isAncestorOf(TargetC) &&
"Call edge is not trivial in the SCC graph!");
-#endif
+#endif // EXPENSIVE_CHECKS
+#endif // NDEBUG
+
// First insert it into the source or find the existing edge.
- auto InsertResult = SourceN.EdgeIndexMap.insert(
- {&TargetN.getFunction(), SourceN.Edges.size()});
+ auto InsertResult =
+ SourceN->EdgeIndexMap.insert({&TargetN, SourceN->Edges.size()});
if (!InsertResult.second) {
// Already an edge, just update it.
- Edge &E = SourceN.Edges[InsertResult.first->second];
+ Edge &E = SourceN->Edges[InsertResult.first->second];
if (E.isCall())
return; // Nothing to do!
E.setKind(Edge::Call);
} else {
// Create the new edge.
- SourceN.Edges.emplace_back(TargetN, Edge::Call);
+ SourceN->Edges.emplace_back(TargetN, Edge::Call);
}
// Now that we have the edge, handle the graph fallout.
@@ -1491,39 +1498,75 @@ void LazyCallGraph::RefSCC::insertTrivialRefEdge(Node &SourceN, Node &TargetN) {
// Check that the RefSCC is still valid when we finish.
auto ExitVerifier = make_scope_exit([this] { verify(); });
+#ifdef EXPENSIVE_CHECKS
// Check that we aren't breaking some invariants of the RefSCC graph.
RefSCC &SourceRC = *G->lookupRefSCC(SourceN);
RefSCC &TargetRC = *G->lookupRefSCC(TargetN);
if (&SourceRC != &TargetRC)
assert(SourceRC.isAncestorOf(TargetRC) &&
"Ref edge is not trivial in the RefSCC graph!");
-#endif
+#endif // EXPENSIVE_CHECKS
+#endif // NDEBUG
+
// First insert it into the source or find the existing edge.
- auto InsertResult = SourceN.EdgeIndexMap.insert(
- {&TargetN.getFunction(), SourceN.Edges.size()});
+ auto InsertResult =
+ SourceN->EdgeIndexMap.insert({&TargetN, SourceN->Edges.size()});
if (!InsertResult.second)
// Already an edge, we're done.
return;
// Create the new edge.
- SourceN.Edges.emplace_back(TargetN, Edge::Ref);
+ SourceN->Edges.emplace_back(TargetN, Edge::Ref);
// Now that we have the edge, handle the graph fallout.
handleTrivialEdgeInsertion(SourceN, TargetN);
}
-void LazyCallGraph::insertEdge(Node &SourceN, Function &Target, Edge::Kind EK) {
- assert(SCCMap.empty() && DFSStack.empty() &&
+void LazyCallGraph::RefSCC::replaceNodeFunction(Node &N, Function &NewF) {
+ Function &OldF = N.getFunction();
+
+#ifndef NDEBUG
+ // Check that the RefSCC is still valid when we finish.
+ auto ExitVerifier = make_scope_exit([this] { verify(); });
+
+ assert(G->lookupRefSCC(N) == this &&
+ "Cannot replace the function of a node outside this RefSCC.");
+
+ assert(G->NodeMap.find(&NewF) == G->NodeMap.end() &&
+ "Must not have already walked the new function!'");
+
+ // It is important that this replacement not introduce graph changes so we
+ // insist that the caller has already removed every use of the original
+ // function and that all uses of the new function correspond to existing
+ // edges in the graph. The common and expected way to use this is when
+ // replacing the function itself in the IR without changing the call graph
+ // shape and just updating the analysis based on that.
+ assert(&OldF != &NewF && "Cannot replace a function with itself!");
+ assert(OldF.use_empty() &&
+ "Must have moved all uses from the old function to the new!");
+#endif
+
+ N.replaceFunction(NewF);
+
+ // Update various call graph maps.
+ G->NodeMap.erase(&OldF);
+ G->NodeMap[&NewF] = &N;
+}
+
+void LazyCallGraph::insertEdge(Node &SourceN, Node &TargetN, Edge::Kind EK) {
+ assert(SCCMap.empty() &&
"This method cannot be called after SCCs have been formed!");
- return SourceN.insertEdgeInternal(Target, EK);
+ return SourceN->insertEdgeInternal(TargetN, EK);
}
-void LazyCallGraph::removeEdge(Node &SourceN, Function &Target) {
- assert(SCCMap.empty() && DFSStack.empty() &&
+void LazyCallGraph::removeEdge(Node &SourceN, Node &TargetN) {
+ assert(SCCMap.empty() &&
"This method cannot be called after SCCs have been formed!");
- return SourceN.removeEdgeInternal(Target);
+ bool Removed = SourceN->removeEdgeInternal(TargetN);
+ (void)Removed;
+ assert(Removed && "Target not in the edge set for this caller?");
}
void LazyCallGraph::removeDeadFunction(Function &F) {
@@ -1532,19 +1575,6 @@ void LazyCallGraph::removeDeadFunction(Function &F) {
assert(F.use_empty() &&
"This routine should only be called on trivially dead functions!");
- auto EII = EntryIndexMap.find(&F);
- if (EII != EntryIndexMap.end()) {
- EntryEdges[EII->second] = Edge();
- EntryIndexMap.erase(EII);
- }
-
- // It's safe to just remove un-visited functions from the RefSCC entry list.
- // FIXME: This is a linear operation which could become hot and benefit from
- // an index map.
- auto RENI = find(RefSCCEntryNodes, &F);
- if (RENI != RefSCCEntryNodes.end())
- RefSCCEntryNodes.erase(RENI);
-
auto NI = NodeMap.find(&F);
if (NI == NodeMap.end())
// Not in the graph at all!
@@ -1553,22 +1583,16 @@ void LazyCallGraph::removeDeadFunction(Function &F) {
Node &N = *NI->second;
NodeMap.erase(NI);
- if (SCCMap.empty() && DFSStack.empty()) {
- // No SCC walk has begun, so removing this is fine and there is nothing
+ // Remove this from the entry edges if present.
+ EntryEdges.removeEdgeInternal(N);
+
+ if (SCCMap.empty()) {
+ // No SCCs have been formed, so removing this is fine and there is nothing
// else necessary at this point but clearing out the node.
N.clear();
return;
}
- // Check that we aren't going to break the DFS walk.
- assert(all_of(DFSStack,
- [&N](const std::pair<Node *, edge_iterator> &Element) {
- return Element.first != &N;
- }) &&
- "Tried to remove a function currently in the DFS stack!");
- assert(find(PendingRefSCCStack, &N) == PendingRefSCCStack.end() &&
- "Tried to remove a function currently pending to add to a RefSCC!");
-
// Cannot remove a function which has yet to be visited in the DFS walk, so
// if we have a node at all then we must have an SCC and RefSCC.
auto CI = SCCMap.find(&N);
@@ -1583,13 +1607,19 @@ void LazyCallGraph::removeDeadFunction(Function &F) {
// Validate these properties first.
assert(C.size() == 1 && "Dead functions must be in a singular SCC");
assert(RC.size() == 1 && "Dead functions must be in a singular RefSCC");
- assert(RC.Parents.empty() && "Cannot have parents of a dead RefSCC!");
+
+ // Clean up any remaining reference edges. Note that we walk an unordered set
+ // here but are just removing and so the order doesn't matter.
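+ // (An unpopulated parent node converts to false and has no edge list to
+ // update.)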
+ for (RefSCC &ParentRC : RC.parents())
+ for (SCC &ParentC : ParentRC)
+ for (Node &ParentN : ParentC)
+ if (ParentN)
+ ParentN->removeEdgeInternal(N);
// Now remove this RefSCC from any parents sets and the leaf list.
- for (Edge &E : N)
- if (Node *TargetN = E.getNode())
- if (RefSCC *TargetRC = lookupRefSCC(*TargetN))
- TargetRC->Parents.erase(&RC);
+ for (Edge &E : *N)
+ if (RefSCC *TargetRC = lookupRefSCC(E.getNode()))
+ TargetRC->Parents.erase(&RC);
// FIXME: This is a linear operation which could become hot and benefit from
// an index map.
auto LRI = find(LeafRefSCCs, &RC);
@@ -1622,15 +1652,14 @@ void LazyCallGraph::updateGraphPtrs() {
{
SmallVector<Node *, 16> Worklist;
for (Edge &E : EntryEdges)
- if (Node *EntryN = E.getNode())
- Worklist.push_back(EntryN);
+ Worklist.push_back(&E.getNode());
while (!Worklist.empty()) {
- Node *N = Worklist.pop_back_val();
- N->G = this;
- for (Edge &E : N->Edges)
- if (Node *TargetN = E.getNode())
- Worklist.push_back(TargetN);
+ Node &N = *Worklist.pop_back_val();
+ N.G = this;
+ if (N)
+ for (Edge &E : *N)
+ Worklist.push_back(&E.getNode());
}
}
@@ -1647,34 +1676,18 @@ void LazyCallGraph::updateGraphPtrs() {
}
}
-/// Build the internal SCCs for a RefSCC from a sequence of nodes.
-///
-/// Appends the SCCs to the provided vector and updates the map with their
-/// indices. Both the vector and map must be empty when passed into this
-/// routine.
-void LazyCallGraph::buildSCCs(RefSCC &RC, node_stack_range Nodes) {
- assert(RC.SCCs.empty() && "Already built SCCs!");
- assert(RC.SCCIndices.empty() && "Already mapped SCC indices!");
-
- for (Node *N : Nodes) {
- assert(N->LowLink >= (*Nodes.begin())->LowLink &&
- "We cannot have a low link in an SCC lower than its root on the "
- "stack!");
+template <typename RootsT, typename GetBeginT, typename GetEndT,
+ typename GetNodeT, typename FormSCCCallbackT>
+void LazyCallGraph::buildGenericSCCs(RootsT &&Roots, GetBeginT &&GetBegin,
+ GetEndT &&GetEnd, GetNodeT &&GetNode,
+ FormSCCCallbackT &&FormSCC) {
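+ // Deduce the edge iterator type from the GetBegin callable so that a single
+ // implementation serves both the ref-edge walk and the call-edge-only walk.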
+ typedef decltype(GetBegin(std::declval<Node &>())) EdgeItT;
- // This node will go into the next RefSCC, clear out its DFS and low link
- // as we scan.
- N->DFSNumber = N->LowLink = 0;
- }
-
- // Each RefSCC contains a DAG of the call SCCs. To build these, we do
- // a direct walk of the call edges using Tarjan's algorithm. We reuse the
- // internal storage as we won't need it for the outer graph's DFS any longer.
-
- SmallVector<std::pair<Node *, call_edge_iterator>, 16> DFSStack;
+ SmallVector<std::pair<Node *, EdgeItT>, 16> DFSStack;
SmallVector<Node *, 16> PendingSCCStack;
// Scan down the stack and DFS across the call edges.
- for (Node *RootN : Nodes) {
+ for (Node *RootN : Roots) {
assert(DFSStack.empty() &&
"Cannot begin a new root with a non-empty DFS stack!");
assert(PendingSCCStack.empty() &&
@@ -1690,25 +1703,23 @@ void LazyCallGraph::buildSCCs(RefSCC &RC, node_stack_range Nodes) {
RootN->DFSNumber = RootN->LowLink = 1;
int NextDFSNumber = 2;
- DFSStack.push_back({RootN, RootN->call_begin()});
+ DFSStack.push_back({RootN, GetBegin(*RootN)});
do {
Node *N;
- call_edge_iterator I;
+ EdgeItT I;
std::tie(N, I) = DFSStack.pop_back_val();
- auto E = N->call_end();
+ auto E = GetEnd(*N);
while (I != E) {
- Node &ChildN = *I->getNode();
+ Node &ChildN = GetNode(I);
if (ChildN.DFSNumber == 0) {
// We haven't yet visited this child, so descend, pushing the current
// node onto the stack.
DFSStack.push_back({N, I});
- assert(!lookupSCC(ChildN) &&
- "Found a node with 0 DFS number but already in an SCC!");
ChildN.DFSNumber = ChildN.LowLink = NextDFSNumber++;
N = &ChildN;
- I = N->call_begin();
- E = N->call_end();
+ I = GetBegin(*N);
+ E = GetEnd(*N);
continue;
}
@@ -1750,20 +1761,93 @@ void LazyCallGraph::buildSCCs(RefSCC &RC, node_stack_range Nodes) {
}));
// Form a new SCC out of these nodes and then clear them off our pending
// stack.
- RC.SCCs.push_back(createSCC(RC, SCCNodes));
- for (Node &N : *RC.SCCs.back()) {
- N.DFSNumber = N.LowLink = -1;
- SCCMap[&N] = RC.SCCs.back();
- }
+ FormSCC(SCCNodes);
PendingSCCStack.erase(SCCNodes.end().base(), PendingSCCStack.end());
} while (!DFSStack.empty());
}
+}
+
+/// Build the internal SCCs for a RefSCC from a sequence of nodes.
+///
+/// Appends the SCCs to the provided vector and updates the map with their
+/// indices. Both the vector and map must be empty when passed into this
+/// routine.
+void LazyCallGraph::buildSCCs(RefSCC &RC, node_stack_range Nodes) {
+ assert(RC.SCCs.empty() && "Already built SCCs!");
+ assert(RC.SCCIndices.empty() && "Already mapped SCC indices!");
+
+ for (Node *N : Nodes) {
+ assert(N->LowLink >= (*Nodes.begin())->LowLink &&
+ "We cannot have a low link in an SCC lower than its root on the "
+ "stack!");
+
+ // This node will go into the next RefSCC, clear out its DFS and low link
+ // as we scan.
+ N->DFSNumber = N->LowLink = 0;
+ }
+
+ // Each RefSCC contains a DAG of the call SCCs. To build these, we do
+ // a direct walk of the call edges using Tarjan's algorithm. We reuse the
+ // internal storage as we won't need it for the outer graph's DFS any longer.
+ buildGenericSCCs(
+ Nodes, [](Node &N) { return N->call_begin(); },
+ [](Node &N) { return N->call_end(); },
+ [](EdgeSequence::call_iterator I) -> Node & { return I->getNode(); },
+ [this, &RC](node_stack_range Nodes) {
+ RC.SCCs.push_back(createSCC(RC, Nodes));
+ for (Node &N : *RC.SCCs.back()) {
+ N.DFSNumber = N.LowLink = -1;
+ SCCMap[&N] = RC.SCCs.back();
+ }
+ });
// Wire up the SCC indices.
for (int i = 0, Size = RC.SCCs.size(); i < Size; ++i)
RC.SCCIndices[RC.SCCs[i]] = i;
}
+void LazyCallGraph::buildRefSCCs() {
+ if (EntryEdges.empty() || !PostOrderRefSCCs.empty())
+ // RefSCCs are either non-existent or already built!
+ return;
+
+ assert(RefSCCIndices.empty() && "Already mapped RefSCC indices!");
+
+ SmallVector<Node *, 16> Roots;
+ for (Edge &E : *this)
+ Roots.push_back(&E.getNode());
+
+  // The roots will be popped off a stack, so use reverse to get a less
+  // surprising order. This doesn't change any of the semantics anywhere.
+ std::reverse(Roots.begin(), Roots.end());
+
+ buildGenericSCCs(
+ Roots,
+ [](Node &N) {
+ // We need to populate each node as we begin to walk its edges.
+ N.populate();
+ return N->begin();
+ },
+ [](Node &N) { return N->end(); },
+ [](EdgeSequence::iterator I) -> Node & { return I->getNode(); },
+ [this](node_stack_range Nodes) {
+ RefSCC *NewRC = createRefSCC(*this);
+ buildSCCs(*NewRC, Nodes);
+ connectRefSCC(*NewRC);
+
+      // Push the new RefSCC into the postorder list and remember its position
+ // in the index map.
+ bool Inserted =
+ RefSCCIndices.insert({NewRC, PostOrderRefSCCs.size()}).second;
+ (void)Inserted;
+ assert(Inserted && "Cannot already have this RefSCC in the index map!");
+ PostOrderRefSCCs.push_back(NewRC);
+#ifndef NDEBUG
+ NewRC->verify();
+#endif
+ });
+}
+
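
The two Tarjan walks above now share one engine: buildGenericSCCs takes callables for edge enumeration (GetBegin/GetEnd/GetNode) and for consuming each completed SCC (FormSCC). A minimal self-contained sketch of the same shape, with plain ints for nodes; the real code is iterative with an explicit DFS stack, while this sketch recurses for brevity:

#include <algorithm>
#include <functional>
#include <vector>

// Sketch: Tarjan's SCC algorithm parameterized the same way as
// buildGenericSCCs -- over edge enumeration (GetSuccs) and a callback run
// once per completed SCC (FormSCC). All names here are illustrative.
template <typename GetSuccsT, typename FormSCCT>
void genericSCCs(int NumNodes, GetSuccsT GetSuccs, FormSCCT FormSCC) {
  std::vector<int> DFSNum(NumNodes, 0), LowLink(NumNodes, 0);
  std::vector<int> Pending; // pending-SCC stack
  std::vector<bool> OnStack(NumNodes, false);
  int NextDFS = 1;

  std::function<void(int)> Visit = [&](int N) {
    DFSNum[N] = LowLink[N] = NextDFS++;
    Pending.push_back(N);
    OnStack[N] = true;
    for (int Child : GetSuccs(N)) {
      if (DFSNum[Child] == 0) {
        Visit(Child); // unvisited: descend
        LowLink[N] = std::min(LowLink[N], LowLink[Child]);
      } else if (OnStack[Child]) {
        LowLink[N] = std::min(LowLink[N], DFSNum[Child]);
      }
    }
    if (LowLink[N] == DFSNum[N]) { // N roots an SCC: pop it off
      std::vector<int> SCC;
      int M;
      do {
        M = Pending.back();
        Pending.pop_back();
        OnStack[M] = false;
        SCC.push_back(M);
      } while (M != N);
      FormSCC(SCC); // SCCs are delivered in postorder
    }
  };

  for (int N = 0; N != NumNodes; ++N)
    if (DFSNum[N] == 0)
      Visit(N);
}

buildSCCs instantiates the engine with the call-edge iterators and an SCC-forming callback; buildRefSCCs instantiates it with the full edge sequence (populating each node lazily in GetBegin) and a callback that creates, builds, and connects a RefSCC.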
// FIXME: We should move callers of this to embed the parent linking and leaf
// tracking into their DFS in order to remove a full walk of all edges.
void LazyCallGraph::connectRefSCC(RefSCC &RC) {
@@ -1773,10 +1857,8 @@ void LazyCallGraph::connectRefSCC(RefSCC &RC) {
bool IsLeaf = true;
for (SCC &C : RC)
for (Node &N : C)
- for (Edge &E : N) {
- assert(E.getNode() &&
- "Cannot have a missing node in a visited part of the graph!");
- RefSCC &ChildRC = *lookupRefSCC(*E.getNode());
+ for (Edge &E : *N) {
+ RefSCC &ChildRC = *lookupRefSCC(E.getNode());
if (&ChildRC == &RC)
continue;
ChildRC.Parents.insert(&RC);
@@ -1788,113 +1870,13 @@ void LazyCallGraph::connectRefSCC(RefSCC &RC) {
LeafRefSCCs.push_back(&RC);
}
-bool LazyCallGraph::buildNextRefSCCInPostOrder() {
- if (DFSStack.empty()) {
- Node *N;
- do {
- // If we've handled all candidate entry nodes to the SCC forest, we're
- // done.
- if (RefSCCEntryNodes.empty())
- return false;
-
- N = &get(*RefSCCEntryNodes.pop_back_val());
- } while (N->DFSNumber != 0);
-
- // Found a new root, begin the DFS here.
- N->LowLink = N->DFSNumber = 1;
- NextDFSNumber = 2;
- DFSStack.push_back({N, N->begin()});
- }
-
- for (;;) {
- Node *N;
- edge_iterator I;
- std::tie(N, I) = DFSStack.pop_back_val();
-
- assert(N->DFSNumber > 0 && "We should always assign a DFS number "
- "before placing a node onto the stack.");
-
- auto E = N->end();
- while (I != E) {
- Node &ChildN = I->getNode(*this);
- if (ChildN.DFSNumber == 0) {
- // We haven't yet visited this child, so descend, pushing the current
- // node onto the stack.
- DFSStack.push_back({N, N->begin()});
-
- assert(!SCCMap.count(&ChildN) &&
- "Found a node with 0 DFS number but already in an SCC!");
- ChildN.LowLink = ChildN.DFSNumber = NextDFSNumber++;
- N = &ChildN;
- I = N->begin();
- E = N->end();
- continue;
- }
-
- // If the child has already been added to some child component, it
- // couldn't impact the low-link of this parent because it isn't
- // connected, and thus its low-link isn't relevant so skip it.
- if (ChildN.DFSNumber == -1) {
- ++I;
- continue;
- }
-
- // Track the lowest linked child as the lowest link for this node.
- assert(ChildN.LowLink > 0 && "Must have a positive low-link number!");
- if (ChildN.LowLink < N->LowLink)
- N->LowLink = ChildN.LowLink;
-
- // Move to the next edge.
- ++I;
- }
-
- // We've finished processing N and its descendents, put it on our pending
- // SCC stack to eventually get merged into an SCC of nodes.
- PendingRefSCCStack.push_back(N);
-
- // If this node is linked to some lower entry, continue walking up the
- // stack.
- if (N->LowLink != N->DFSNumber) {
- assert(!DFSStack.empty() &&
- "We never found a viable root for an SCC to pop off!");
- continue;
- }
-
- // Otherwise, form a new RefSCC from the top of the pending node stack.
- int RootDFSNumber = N->DFSNumber;
- // Find the range of the node stack by walking down until we pass the
- // root DFS number.
- auto RefSCCNodes = node_stack_range(
- PendingRefSCCStack.rbegin(),
- find_if(reverse(PendingRefSCCStack), [RootDFSNumber](const Node *N) {
- return N->DFSNumber < RootDFSNumber;
- }));
- // Form a new RefSCC out of these nodes and then clear them off our pending
- // stack.
- RefSCC *NewRC = createRefSCC(*this);
- buildSCCs(*NewRC, RefSCCNodes);
- connectRefSCC(*NewRC);
- PendingRefSCCStack.erase(RefSCCNodes.end().base(),
- PendingRefSCCStack.end());
-
- // Push the new node into the postorder list and return true indicating we
- // successfully grew the postorder sequence by one.
- bool Inserted =
- RefSCCIndices.insert({NewRC, PostOrderRefSCCs.size()}).second;
- (void)Inserted;
- assert(Inserted && "Cannot already have this RefSCC in the index map!");
- PostOrderRefSCCs.push_back(NewRC);
- return true;
- }
-}
-
AnalysisKey LazyCallGraphAnalysis::Key;
LazyCallGraphPrinterPass::LazyCallGraphPrinterPass(raw_ostream &OS) : OS(OS) {}
static void printNode(raw_ostream &OS, LazyCallGraph::Node &N) {
OS << " Edges in function: " << N.getFunction().getName() << "\n";
- for (const LazyCallGraph::Edge &E : N)
+ for (LazyCallGraph::Edge &E : N.populate())
OS << " " << (E.isCall() ? "call" : "ref ") << " -> "
<< E.getFunction().getName() << "\n";
@@ -1929,6 +1911,7 @@ PreservedAnalyses LazyCallGraphPrinterPass::run(Module &M,
for (Function &F : M)
printNode(OS, G.get(F));
+ G.buildRefSCCs();
for (LazyCallGraph::RefSCC &C : G.postorder_ref_sccs())
printRefSCC(OS, C);
@@ -1941,7 +1924,7 @@ LazyCallGraphDOTPrinterPass::LazyCallGraphDOTPrinterPass(raw_ostream &OS)
static void printNodeDOT(raw_ostream &OS, LazyCallGraph::Node &N) {
std::string Name = "\"" + DOT::EscapeString(N.getFunction().getName()) + "\"";
- for (const LazyCallGraph::Edge &E : N) {
+ for (LazyCallGraph::Edge &E : N.populate()) {
OS << " " << Name << " -> \""
<< DOT::EscapeString(E.getFunction().getName()) << "\"";
if (!E.isCall()) // It is a ref edge.
diff --git a/lib/Analysis/LazyValueInfo.cpp b/lib/Analysis/LazyValueInfo.cpp
index d442310476cf..ad01f7f2f215 100644
--- a/lib/Analysis/LazyValueInfo.cpp
+++ b/lib/Analysis/LazyValueInfo.cpp
@@ -19,6 +19,7 @@
#include "llvm/Analysis/ConstantFolding.h"
#include "llvm/Analysis/TargetLibraryInfo.h"
#include "llvm/Analysis/ValueTracking.h"
+#include "llvm/IR/AssemblyAnnotationWriter.h"
#include "llvm/IR/CFG.h"
#include "llvm/IR/ConstantRange.h"
#include "llvm/IR/Constants.h"
@@ -31,6 +32,7 @@
#include "llvm/IR/PatternMatch.h"
#include "llvm/IR/ValueHandle.h"
#include "llvm/Support/Debug.h"
+#include "llvm/Support/FormattedStream.h"
#include "llvm/Support/raw_ostream.h"
#include <map>
#include <stack>
@@ -39,6 +41,10 @@ using namespace PatternMatch;
#define DEBUG_TYPE "lazy-value-info"
+// This is the number of worklist items we will process to try to discover an
+// answer for a given value.
+static const unsigned MaxProcessedPerValue = 500;
+
char LazyValueInfoWrapperPass::ID = 0;
INITIALIZE_PASS_BEGIN(LazyValueInfoWrapperPass, "lazy-value-info",
"Lazy Value Information Analysis", false, true)
@@ -358,6 +364,7 @@ namespace {
/// This is the cache kept by LazyValueInfo which
/// maintains information about queries across the clients' queries.
class LazyValueInfoCache {
+ friend class LazyValueInfoAnnotatedWriter;
/// This is all of the cached block information for exactly one Value*.
/// The entries are sorted by the BasicBlock* of the
/// entries, allowing us to do a lookup with a binary search.
@@ -366,22 +373,23 @@ namespace {
struct ValueCacheEntryTy {
ValueCacheEntryTy(Value *V, LazyValueInfoCache *P) : Handle(V, P) {}
LVIValueHandle Handle;
- SmallDenseMap<AssertingVH<BasicBlock>, LVILatticeVal, 4> BlockVals;
+ SmallDenseMap<PoisoningVH<BasicBlock>, LVILatticeVal, 4> BlockVals;
};
- /// This is all of the cached information for all values,
- /// mapped from Value* to key information.
- DenseMap<Value *, std::unique_ptr<ValueCacheEntryTy>> ValueCache;
-
/// This tracks, on a per-block basis, the set of values that are
/// over-defined at the end of that block.
- typedef DenseMap<AssertingVH<BasicBlock>, SmallPtrSet<Value *, 4>>
+ typedef DenseMap<PoisoningVH<BasicBlock>, SmallPtrSet<Value *, 4>>
OverDefinedCacheTy;
- OverDefinedCacheTy OverDefinedCache;
-
/// Keep track of all blocks that we have ever seen, so we
/// don't spend time removing unused blocks from our caches.
- DenseSet<AssertingVH<BasicBlock> > SeenBlocks;
+ DenseSet<PoisoningVH<BasicBlock> > SeenBlocks;
+
+ protected:
+ /// This is all of the cached information for all values,
+ /// mapped from Value* to key information.
+ DenseMap<Value *, std::unique_ptr<ValueCacheEntryTy>> ValueCache;
+ OverDefinedCacheTy OverDefinedCache;
+
public:
void insertResult(Value *Val, BasicBlock *BB, const LVILatticeVal &Result) {
@@ -435,6 +443,7 @@ namespace {
return BBI->second;
}
+ void printCache(Function &F, raw_ostream &OS);
/// clear - Empty the cache.
void clear() {
SeenBlocks.clear();
@@ -458,16 +467,71 @@ namespace {
};
}
+
+namespace {
+
+  /// An assembly annotator class to print LazyValueInfoCache information in
+  /// comments.
+ class LazyValueInfoAnnotatedWriter : public AssemblyAnnotationWriter {
+ const LazyValueInfoCache* LVICache;
+
+ public:
+ LazyValueInfoAnnotatedWriter(const LazyValueInfoCache *L) : LVICache(L) {}
+
+ virtual void emitBasicBlockStartAnnot(const BasicBlock *BB,
+ formatted_raw_ostream &OS) {
+ auto ODI = LVICache->OverDefinedCache.find(const_cast<BasicBlock*>(BB));
+ if (ODI == LVICache->OverDefinedCache.end())
+ return;
+ OS << "; OverDefined values for block are: \n";
+ for (auto *V : ODI->second)
+ OS << ";" << *V << "\n";
+
+      // Check whether lattice values are defined for the function's arguments.
+ auto *F = const_cast<Function *>(BB->getParent());
+ for (auto &Arg : F->args()) {
+ auto VI = LVICache->ValueCache.find_as(&Arg);
+ if (VI == LVICache->ValueCache.end())
+ continue;
+ auto BBI = VI->second->BlockVals.find(const_cast<BasicBlock *>(BB));
+ if (BBI != VI->second->BlockVals.end())
+ OS << "; CachedLatticeValue for: '" << *VI->first << "' is: '"
+ << BBI->second << "'\n";
+ }
+ }
+
+ virtual void emitInstructionAnnot(const Instruction *I,
+ formatted_raw_ostream &OS) {
+
+ auto VI = LVICache->ValueCache.find_as(const_cast<Instruction *>(I));
+ if (VI == LVICache->ValueCache.end())
+ return;
+ OS << "; CachedLatticeValues for: '" << *VI->first << "'\n";
+ for (auto &BV : VI->second->BlockVals) {
+ OS << "; at beginning of BasicBlock: '";
+ BV.first->printAsOperand(OS, false);
+ OS << "' LatticeVal: '" << BV.second << "' \n";
+ }
+ }
+};
+}
+
+void LazyValueInfoCache::printCache(Function &F, raw_ostream &OS) {
+ LazyValueInfoAnnotatedWriter Writer(this);
+ F.print(OS, &Writer);
+}
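
printCache relies on the AssemblyAnnotationWriter hook: override the emit callbacks and hand the writer to Function::print, which interleaves the annotations with the printed IR. A minimal sketch of the pattern with a trivial, purely illustrative annotation:

#include "llvm/IR/AssemblyAnnotationWriter.h"
#include "llvm/IR/Function.h"
#include "llvm/Support/FormattedStream.h"
#include "llvm/Support/raw_ostream.h"
using namespace llvm;

// Sketch: emit a comment before every basic block and instruction.
struct CountingAnnotator : AssemblyAnnotationWriter {
  void emitBasicBlockStartAnnot(const BasicBlock *BB,
                                formatted_raw_ostream &OS) override {
    OS << "; block '" << BB->getName() << "'\n";
  }
  void emitInstructionAnnot(const Instruction *I,
                            formatted_raw_ostream &OS) override {
    OS << "; " << I->getNumUses() << " use(s)\n";
  }
};

void printAnnotated(Function &F) {
  CountingAnnotator Annot;
  F.print(errs(), &Annot); // annotations appear inline with the IR
}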
+
void LazyValueInfoCache::eraseValue(Value *V) {
- SmallVector<AssertingVH<BasicBlock>, 4> ToErase;
- for (auto &I : OverDefinedCache) {
- SmallPtrSetImpl<Value *> &ValueSet = I.second;
+ for (auto I = OverDefinedCache.begin(), E = OverDefinedCache.end(); I != E;) {
+ // Copy and increment the iterator immediately so we can erase behind
+ // ourselves.
+ auto Iter = I++;
+ SmallPtrSetImpl<Value *> &ValueSet = Iter->second;
ValueSet.erase(V);
if (ValueSet.empty())
- ToErase.push_back(I.first);
+ OverDefinedCache.erase(Iter);
}
- for (auto &BB : ToErase)
- OverDefinedCache.erase(BB);
ValueCache.erase(V);
}
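
The rewritten eraseValue drops the ToErase side vector and erases during iteration instead; DenseMap::erase only tombstones the bucket, so iterators to other elements stay valid, and copying then advancing the iterator before erasing is sufficient. The idiom in isolation:

#include "llvm/ADT/DenseMap.h"

// Sketch: remove entries whose value hit zero while walking the map. We
// advance past the doomed entry before erasing it, exactly as above.
void pruneZeros(llvm::DenseMap<int, int> &M) {
  for (auto I = M.begin(), E = M.end(); I != E;) {
    auto Iter = I++; // copy, then advance, so we can erase behind ourselves
    if (Iter->second == 0)
      M.erase(Iter);
  }
}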
@@ -480,7 +544,7 @@ void LVIValueHandle::deleted() {
void LazyValueInfoCache::eraseBlock(BasicBlock *BB) {
// Shortcut if we have never seen this block.
- DenseSet<AssertingVH<BasicBlock> >::iterator I = SeenBlocks.find(BB);
+ DenseSet<PoisoningVH<BasicBlock> >::iterator I = SeenBlocks.find(BB);
if (I == SeenBlocks.end())
return;
SeenBlocks.erase(I);
@@ -563,7 +627,7 @@ namespace {
/// This stack holds the state of the value solver during a query.
/// It basically emulates the callstack of the naive
/// recursive value lookup process.
- std::stack<std::pair<BasicBlock*, Value*> > BlockValueStack;
+ SmallVector<std::pair<BasicBlock*, Value*>, 8> BlockValueStack;
/// Keeps track of which block-value pairs are in BlockValueStack.
DenseSet<std::pair<BasicBlock*, Value*> > BlockValueSet;
@@ -576,7 +640,7 @@ namespace {
DEBUG(dbgs() << "PUSH: " << *BV.second << " in " << BV.first->getName()
<< "\n");
- BlockValueStack.push(BV);
+ BlockValueStack.push_back(BV);
return true;
}
@@ -629,6 +693,11 @@ namespace {
TheCache.clear();
}
+  /// Print the LazyValueInfoCache.
+ void printCache(Function &F, raw_ostream &OS) {
+ TheCache.printCache(F, OS);
+ }
+
/// This is part of the update interface to inform the cache
/// that a block has been deleted.
void eraseBlock(BasicBlock *BB) {
@@ -646,24 +715,50 @@ namespace {
} // end anonymous namespace
void LazyValueInfoImpl::solve() {
+ SmallVector<std::pair<BasicBlock *, Value *>, 8> StartingStack(
+ BlockValueStack.begin(), BlockValueStack.end());
+
+ unsigned processedCount = 0;
while (!BlockValueStack.empty()) {
- std::pair<BasicBlock*, Value*> &e = BlockValueStack.top();
+ processedCount++;
+    // Abort if we have to process too many values to get a result for this one.
+    // Because the overdefined cache is currently per-block (to avoid
+    // naming-related issues, i.e., so it can give different results for the
+    // same name in different blocks), overdefined results don't get cached
+    // globally, which in turn means we will often try to rediscover the same
+    // overdefined result again and again. Once something like PredicateInfo
+    // is used in LVI or CVP, we should be able to make the overdefined cache
+    // global and remove this throttle.
+ if (processedCount > MaxProcessedPerValue) {
+ DEBUG(dbgs() << "Giving up on stack because we are getting too deep\n");
+      // Mark the original queries as overdefined so we don't retry them.
+ while (!StartingStack.empty()) {
+ std::pair<BasicBlock *, Value *> &e = StartingStack.back();
+ TheCache.insertResult(e.second, e.first,
+ LVILatticeVal::getOverdefined());
+ StartingStack.pop_back();
+ }
+ BlockValueSet.clear();
+ BlockValueStack.clear();
+ return;
+ }
+ std::pair<BasicBlock *, Value *> e = BlockValueStack.back();
assert(BlockValueSet.count(e) && "Stack value should be in BlockValueSet!");
if (solveBlockValue(e.second, e.first)) {
// The work item was completely processed.
- assert(BlockValueStack.top() == e && "Nothing should have been pushed!");
+ assert(BlockValueStack.back() == e && "Nothing should have been pushed!");
assert(TheCache.hasCachedValueInfo(e.second, e.first) &&
"Result should be in cache!");
DEBUG(dbgs() << "POP " << *e.second << " in " << e.first->getName()
<< " = " << TheCache.getCachedValueInfo(e.second, e.first) << "\n");
- BlockValueStack.pop();
+ BlockValueStack.pop_back();
BlockValueSet.erase(e);
} else {
// More work needs to be done before revisiting.
- assert(BlockValueStack.top() != e && "Stack should have been pushed!");
+ assert(BlockValueStack.back() != e && "Stack should have been pushed!");
}
}
}
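
The new solve() is a worklist with a fuse: each iteration either retires the top item or pushes its prerequisites, and once the budget is exceeded the queries that started the walk are force-cached as overdefined so the same work is not redone on the next query. Reduced to its skeleton (names here are hypothetical, not the LVI types):

#include <vector>

// Sketch of solve()'s shape. Step() stands in for solveBlockValue (returns
// true when the top item is fully solved, pushes prerequisites otherwise);
// CacheConservative() stands in for caching an overdefined result.
template <typename Item, typename StepT, typename CacheT>
void solveBudgeted(std::vector<Item> &Stack, unsigned Budget, StepT Step,
                   CacheT CacheConservative) {
  std::vector<Item> Starting(Stack.begin(), Stack.end());
  unsigned Processed = 0;
  while (!Stack.empty()) {
    if (++Processed > Budget) {
      // Budget exhausted: pessimize only the queries that started this
      // walk, then drop all in-flight work.
      for (Item &I : Starting)
        CacheConservative(I);
      Stack.clear();
      return;
    }
    Item Top = Stack.back();
    if (Step(Top, Stack)) // solved: nothing new was pushed, retire it
      Stack.pop_back();
    // Otherwise Step pushed dependencies; revisit Top after they resolve.
  }
}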
@@ -839,13 +934,19 @@ bool LazyValueInfoImpl::solveBlockValueNonLocal(LVILatticeVal &BBLV,
}
// Loop over all of our predecessors, merging what we know from them into
- // result.
- bool EdgesMissing = false;
+  // result. If we encounter an unexplored predecessor, we eagerly explore it
+  // in a depth-first manner. In practice, this has the effect of discovering
+  // paths we can't analyze eagerly without spending compile time analyzing
+  // other paths. This heuristic benefits from the fact that predecessors are
+  // frequently arranged such that dominating ones come first and we quickly
+  // find a path to function entry. TODO: We should consider explicitly
+  // canonicalizing to make this true rather than relying on this happy
+  // accident.
for (pred_iterator PI = pred_begin(BB), E = pred_end(BB); PI != E; ++PI) {
LVILatticeVal EdgeResult;
- EdgesMissing |= !getEdgeValue(Val, *PI, BB, EdgeResult);
- if (EdgesMissing)
- continue;
+ if (!getEdgeValue(Val, *PI, BB, EdgeResult))
+ // Explore that input, then return here
+ return false;
Result.mergeIn(EdgeResult, DL);
@@ -866,8 +967,6 @@ bool LazyValueInfoImpl::solveBlockValueNonLocal(LVILatticeVal &BBLV,
return true;
}
}
- if (EdgesMissing)
- return false;
// Return the merged value, which is more precise than 'overdefined'.
assert(!Result.isOverdefined());
@@ -880,8 +979,8 @@ bool LazyValueInfoImpl::solveBlockValuePHINode(LVILatticeVal &BBLV,
LVILatticeVal Result; // Start Undefined.
// Loop over all of our predecessors, merging what we know from them into
- // result.
- bool EdgesMissing = false;
+ // result. See the comment about the chosen traversal order in
+ // solveBlockValueNonLocal; the same reasoning applies here.
for (unsigned i = 0, e = PN->getNumIncomingValues(); i != e; ++i) {
BasicBlock *PhiBB = PN->getIncomingBlock(i);
Value *PhiVal = PN->getIncomingValue(i);
@@ -889,9 +988,9 @@ bool LazyValueInfoImpl::solveBlockValuePHINode(LVILatticeVal &BBLV,
// Note that we can provide PN as the context value to getEdgeValue, even
// though the results will be cached, because PN is the value being used as
// the cache key in the caller.
- EdgesMissing |= !getEdgeValue(PhiVal, PhiBB, BB, EdgeResult, PN);
- if (EdgesMissing)
- continue;
+ if (!getEdgeValue(PhiVal, PhiBB, BB, EdgeResult, PN))
+ // Explore that input, then return here
+ return false;
Result.mergeIn(EdgeResult, DL);
@@ -905,8 +1004,6 @@ bool LazyValueInfoImpl::solveBlockValuePHINode(LVILatticeVal &BBLV,
return true;
}
}
- if (EdgesMissing)
- return false;
// Return the merged value, which is more precise than 'overdefined'.
assert(!Result.isOverdefined() && "Possible PHI in entry block?");
@@ -1333,14 +1430,14 @@ static bool getEdgeValueLocal(Value *Val, BasicBlock *BBFrom,
unsigned BitWidth = Val->getType()->getIntegerBitWidth();
ConstantRange EdgesVals(BitWidth, DefaultCase/*isFullSet*/);
- for (SwitchInst::CaseIt i : SI->cases()) {
- ConstantRange EdgeVal(i.getCaseValue()->getValue());
+ for (auto Case : SI->cases()) {
+ ConstantRange EdgeVal(Case.getCaseValue()->getValue());
if (DefaultCase) {
// It is possible that the default destination is the destination of
// some cases. There is no need to perform difference for those cases.
- if (i.getCaseSuccessor() != BBTo)
+ if (Case.getCaseSuccessor() != BBTo)
EdgesVals = EdgesVals.difference(EdgeVal);
- } else if (i.getCaseSuccessor() == BBTo)
+ } else if (Case.getCaseSuccessor() == BBTo)
EdgesVals = EdgesVals.unionWith(EdgeVal);
}
Result = LVILatticeVal::getRange(std::move(EdgesVals));
@@ -1352,8 +1449,8 @@ static bool getEdgeValueLocal(Value *Val, BasicBlock *BBFrom,
/// \brief Compute the value of Val on the edge BBFrom -> BBTo or the value at
/// the basic block if the edge does not constrain Val.
bool LazyValueInfoImpl::getEdgeValue(Value *Val, BasicBlock *BBFrom,
- BasicBlock *BBTo, LVILatticeVal &Result,
- Instruction *CxtI) {
+ BasicBlock *BBTo, LVILatticeVal &Result,
+ Instruction *CxtI) {
// If already a constant, there is nothing to compute.
if (Constant *VC = dyn_cast<Constant>(Val)) {
Result = LVILatticeVal::get(VC);
@@ -1503,6 +1600,18 @@ void LazyValueInfo::releaseMemory() {
}
}
+bool LazyValueInfo::invalidate(Function &F, const PreservedAnalyses &PA,
+ FunctionAnalysisManager::Invalidator &Inv) {
+  // We need to invalidate if we have either failed to preserve this analysis's
+  // result directly or if any of its dependencies have been invalidated.
+ auto PAC = PA.getChecker<LazyValueAnalysis>();
+ if (!(PAC.preserved() || PAC.preservedSet<AllAnalysesOn<Function>>()) ||
+ (DT && Inv.invalidate<DominatorTreeAnalysis>(F, PA)))
+ return true;
+
+ return false;
+}
+
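This is the new pass manager's standard invalidation idiom: a cached result survives only if its own analysis was preserved (directly or via a preserved set) and everything it depends on, here the dominator tree, also survived. The same skeleton for a hypothetical analysis:

#include "llvm/IR/Dominators.h"
#include "llvm/IR/Function.h"
#include "llvm/IR/PassManager.h"
using namespace llvm;

// Sketch: invalidate() for a result holding a DominatorTree pointer.
// "MyAnalysis"/"MyResult" are hypothetical names, not LLVM API.
struct MyResult {
  DominatorTree *DT = nullptr;
  bool invalidate(Function &F, const PreservedAnalyses &PA,
                  FunctionAnalysisManager::Invalidator &Inv);
};

struct MyAnalysis : AnalysisInfoMixin<MyAnalysis> {
  using Result = MyResult;
  static AnalysisKey Key;
  Result run(Function &, FunctionAnalysisManager &) { return {}; }
};
AnalysisKey MyAnalysis::Key;

bool MyResult::invalidate(Function &F, const PreservedAnalyses &PA,
                          FunctionAnalysisManager::Invalidator &Inv) {
  // Survive only if we were preserved and the dominator tree we cached
  // was not itself invalidated.
  auto PAC = PA.getChecker<MyAnalysis>();
  bool Preserved =
      PAC.preserved() || PAC.preservedSet<AllAnalysesOn<Function>>();
  return !Preserved || (DT && Inv.invalidate<DominatorTreeAnalysis>(F, PA));
}
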
void LazyValueInfoWrapperPass::releaseMemory() { Info.releaseMemory(); }
LazyValueInfo LazyValueAnalysis::run(Function &F, FunctionAnalysisManager &FAM) {
@@ -1510,7 +1619,7 @@ LazyValueInfo LazyValueAnalysis::run(Function &F, FunctionAnalysisManager &FAM)
auto &TLI = FAM.getResult<TargetLibraryAnalysis>(F);
auto *DT = FAM.getCachedResult<DominatorTreeAnalysis>(F);
- return LazyValueInfo(&AC, &TLI, DT);
+ return LazyValueInfo(&AC, &F.getParent()->getDataLayout(), &TLI, DT);
}
/// Returns true if we can statically tell that this value will never be a
@@ -1780,3 +1889,40 @@ void LazyValueInfo::eraseBlock(BasicBlock *BB) {
getImpl(PImpl, AC, &DL, DT).eraseBlock(BB);
}
}
+
+
+void LazyValueInfo::printCache(Function &F, raw_ostream &OS) {
+ if (PImpl) {
+ getImpl(PImpl, AC, DL, DT).printCache(F, OS);
+ }
+}
+
+namespace {
+// Printer class for LazyValueInfo results.
+class LazyValueInfoPrinter : public FunctionPass {
+public:
+ static char ID; // Pass identification, replacement for typeid
+ LazyValueInfoPrinter() : FunctionPass(ID) {
+ initializeLazyValueInfoPrinterPass(*PassRegistry::getPassRegistry());
+ }
+
+ void getAnalysisUsage(AnalysisUsage &AU) const override {
+ AU.setPreservesAll();
+ AU.addRequired<LazyValueInfoWrapperPass>();
+ }
+
+ bool runOnFunction(Function &F) override {
+ dbgs() << "LVI for function '" << F.getName() << "':\n";
+ auto &LVI = getAnalysis<LazyValueInfoWrapperPass>().getLVI();
+ LVI.printCache(F, dbgs());
+ return false;
+ }
+};
+}
+
+char LazyValueInfoPrinter::ID = 0;
+INITIALIZE_PASS_BEGIN(LazyValueInfoPrinter, "print-lazy-value-info",
+ "Lazy Value Info Printer Pass", false, false)
+INITIALIZE_PASS_DEPENDENCY(LazyValueInfoWrapperPass)
+INITIALIZE_PASS_END(LazyValueInfoPrinter, "print-lazy-value-info",
+ "Lazy Value Info Printer Pass", false, false)
diff --git a/lib/Analysis/Loads.cpp b/lib/Analysis/Loads.cpp
index e46541e6538d..96799a459bfc 100644
--- a/lib/Analysis/Loads.cpp
+++ b/lib/Analysis/Loads.cpp
@@ -312,21 +312,26 @@ Value *llvm::FindAvailableLoadedValue(LoadInst *Load,
BasicBlock *ScanBB,
BasicBlock::iterator &ScanFrom,
unsigned MaxInstsToScan,
- AliasAnalysis *AA, bool *IsLoadCSE) {
- if (MaxInstsToScan == 0)
- MaxInstsToScan = ~0U;
-
- Value *Ptr = Load->getPointerOperand();
- Type *AccessTy = Load->getType();
-
- // We can never remove a volatile load
- if (Load->isVolatile())
- return nullptr;
-
- // Anything stronger than unordered is currently unimplemented.
+ AliasAnalysis *AA, bool *IsLoad,
+ unsigned *NumScanedInst) {
+  // Don't CSE a load that is volatile or has ordering stronger than unordered.
if (!Load->isUnordered())
return nullptr;
+ return FindAvailablePtrLoadStore(
+ Load->getPointerOperand(), Load->getType(), Load->isAtomic(), ScanBB,
+ ScanFrom, MaxInstsToScan, AA, IsLoad, NumScanedInst);
+}
+
+Value *llvm::FindAvailablePtrLoadStore(Value *Ptr, Type *AccessTy,
+ bool AtLeastAtomic, BasicBlock *ScanBB,
+ BasicBlock::iterator &ScanFrom,
+ unsigned MaxInstsToScan,
+ AliasAnalysis *AA, bool *IsLoadCSE,
+ unsigned *NumScanedInst) {
+ if (MaxInstsToScan == 0)
+ MaxInstsToScan = ~0U;
+
const DataLayout &DL = ScanBB->getModule()->getDataLayout();
// Try to get the store size for the type.
@@ -344,6 +349,9 @@ Value *llvm::FindAvailableLoadedValue(LoadInst *Load,
// Restore ScanFrom to expected value in case next test succeeds
ScanFrom++;
+ if (NumScanedInst)
+ ++(*NumScanedInst);
+
// Don't scan huge blocks.
if (MaxInstsToScan-- == 0)
return nullptr;
@@ -359,7 +367,7 @@ Value *llvm::FindAvailableLoadedValue(LoadInst *Load,
// We can value forward from an atomic to a non-atomic, but not the
// other way around.
- if (LI->isAtomic() < Load->isAtomic())
+ if (LI->isAtomic() < AtLeastAtomic)
return nullptr;
if (IsLoadCSE)
@@ -378,7 +386,7 @@ Value *llvm::FindAvailableLoadedValue(LoadInst *Load,
// We can value forward from an atomic to a non-atomic, but not the
// other way around.
- if (SI->isAtomic() < Load->isAtomic())
+ if (SI->isAtomic() < AtLeastAtomic)
return nullptr;
if (IsLoadCSE)
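
The split lets callers that do not yet have a LoadInst (e.g. when deciding whether a speculative load would be available) drive the scan with just a pointer and type, while the original entry point keeps its shape. A hedged usage sketch of the public entry point as declared in this hunk; the scan budget of 6 is arbitrary:

#include "llvm/Analysis/AliasAnalysis.h"
#include "llvm/Analysis/Loads.h"
#include "llvm/IR/Instructions.h"
using namespace llvm;

// Sketch: try to reuse an earlier load/store that already produced the
// value `Load` would read, scanning backwards within the load's block.
Value *tryForward(LoadInst *Load, AliasAnalysis *AA) {
  BasicBlock::iterator ScanFrom = Load->getIterator();
  bool IsLoadCSE = false;
  unsigned NumScanned = 0;
  if (Value *V = FindAvailableLoadedValue(Load, Load->getParent(), ScanFrom,
                                          /*MaxInstsToScan=*/6, AA, &IsLoadCSE,
                                          &NumScanned))
    return V; // IsLoadCSE says whether the source was another load
  return nullptr;
}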
diff --git a/lib/Analysis/LoopAccessAnalysis.cpp b/lib/Analysis/LoopAccessAnalysis.cpp
index bf8007213097..4ba12583ff83 100644
--- a/lib/Analysis/LoopAccessAnalysis.cpp
+++ b/lib/Analysis/LoopAccessAnalysis.cpp
@@ -135,21 +135,6 @@ bool VectorizerParams::isInterleaveForced() {
return ::VectorizationInterleave.getNumOccurrences() > 0;
}
-void LoopAccessReport::emitAnalysis(const LoopAccessReport &Message,
- const Loop *TheLoop, const char *PassName,
- OptimizationRemarkEmitter &ORE) {
- DebugLoc DL = TheLoop->getStartLoc();
- const Value *V = TheLoop->getHeader();
- if (const Instruction *I = Message.getInstr()) {
- // If there is no debug location attached to the instruction, revert back to
- // using the loop's.
- if (I->getDebugLoc())
- DL = I->getDebugLoc();
- V = I->getParent();
- }
- ORE.emitOptimizationRemarkAnalysis(PassName, DL, V, Message.str());
-}
-
Value *llvm::stripIntegerCast(Value *V) {
if (auto *CI = dyn_cast<CastInst>(V))
if (CI->getOperand(0)->getType()->isIntegerTy())
@@ -172,11 +157,6 @@ const SCEV *llvm::replaceSymbolicStrideSCEV(PredicatedScalarEvolution &PSE,
// Strip casts.
StrideVal = stripIntegerCast(StrideVal);
- // Replace symbolic stride by one.
- Value *One = ConstantInt::get(StrideVal->getType(), 1);
- ValueToValueMap RewriteMap;
- RewriteMap[StrideVal] = One;
-
ScalarEvolution *SE = PSE.getSE();
const auto *U = cast<SCEVUnknown>(SE->getSCEV(StrideVal));
const auto *CT =
@@ -518,7 +498,7 @@ class AccessAnalysis {
public:
/// \brief Read or write access location.
typedef PointerIntPair<Value *, 1, bool> MemAccessInfo;
- typedef SmallPtrSet<MemAccessInfo, 8> MemAccessInfoSet;
+ typedef SmallVector<MemAccessInfo, 8> MemAccessInfoList;
AccessAnalysis(const DataLayout &Dl, AliasAnalysis *AA, LoopInfo *LI,
MemoryDepChecker::DepCandidates &DA,
@@ -570,7 +550,7 @@ public:
DepChecker.clearDependences();
}
- MemAccessInfoSet &getDependenciesToCheck() { return CheckDeps; }
+ MemAccessInfoList &getDependenciesToCheck() { return CheckDeps; }
private:
typedef SetVector<MemAccessInfo> PtrAccessSet;
@@ -584,8 +564,8 @@ private:
const DataLayout &DL;
- /// Set of accesses that need a further dependence check.
- MemAccessInfoSet CheckDeps;
+ /// List of accesses that need a further dependence check.
+ MemAccessInfoList CheckDeps;
/// Set of pointers that are read only.
SmallPtrSet<Value*, 16> ReadOnlyPtr;
@@ -842,7 +822,7 @@ void AccessAnalysis::processMemAccesses() {
// there is no other write to the ptr - this is an optimization to
// catch "a[i] = a[i] + " without having to do a dependence check).
if ((IsWrite || IsReadOnlyPtr) && SetHasWrite) {
- CheckDeps.insert(Access);
+ CheckDeps.push_back(Access);
IsRTCheckAnalysisNeeded = true;
}
@@ -1205,6 +1185,73 @@ bool MemoryDepChecker::couldPreventStoreLoadForward(uint64_t Distance,
return false;
}
+/// Given a non-constant (unknown) dependence-distance \p Dist between two
+/// memory accesses that have the same stride, whose absolute value is given
+/// in \p Stride, and that have the same type size \p TypeByteSize,
+/// in a loop whose backedge-taken count is \p BackedgeTakenCount, check if
+/// it is possible to prove statically that the dependence distance is larger
+/// than the range that the accesses will travel through the execution of
+/// the loop. If so, return true; otherwise return false. This is useful, for
+/// example, in loops such as the following (PR31098):
+/// for (i = 0; i < D; ++i) {
+/// = out[i];
+/// out[i+D] =
+/// }
+static bool isSafeDependenceDistance(const DataLayout &DL, ScalarEvolution &SE,
+ const SCEV &BackedgeTakenCount,
+ const SCEV &Dist, uint64_t Stride,
+ uint64_t TypeByteSize) {
+
+ // If we can prove that
+ // (**) |Dist| > BackedgeTakenCount * Step
+ // where Step is the absolute stride of the memory accesses in bytes,
+ // then there is no dependence.
+ //
+  // Rationale:
+  // We basically want to check if the absolute distance (|Dist/Step|)
+  // is >= the loop iteration count (or > BackedgeTakenCount).
+  // This is equivalent to the Strong SIV Test (Practical Dependence Testing,
+  // Section 4.2.1). Note that for vectorization it is sufficient to prove
+  // that the dependence distance is >= VF; this is checked elsewhere.
+  // But in some cases we can prune unknown dependence distances early, even
+  // before selecting the VF, and without a runtime test, by comparing the
+  // distance against the loop iteration count. Since the vectorized code
+  // will be executed only if LoopCount >= VF, proving distance >= LoopCount
+  // also guarantees that distance >= VF.
+  //
+ const uint64_t ByteStride = Stride * TypeByteSize;
+ const SCEV *Step = SE.getConstant(BackedgeTakenCount.getType(), ByteStride);
+ const SCEV *Product = SE.getMulExpr(&BackedgeTakenCount, Step);
+
+ const SCEV *CastedDist = &Dist;
+ const SCEV *CastedProduct = Product;
+ uint64_t DistTypeSize = DL.getTypeAllocSize(Dist.getType());
+ uint64_t ProductTypeSize = DL.getTypeAllocSize(Product->getType());
+
+  // The dependence distance can be positive/negative, so we sign extend Dist;
+  // the multiplication of the absolute stride in bytes and the
+  // BackedgeTakenCount is non-negative, so we zero extend Product.
+ if (DistTypeSize > ProductTypeSize)
+ CastedProduct = SE.getZeroExtendExpr(Product, Dist.getType());
+ else
+ CastedDist = SE.getNoopOrSignExtend(&Dist, Product->getType());
+
+ // Is Dist - (BackedgeTakenCount * Step) > 0 ?
+ // (If so, then we have proven (**) because |Dist| >= Dist)
+ const SCEV *Minus = SE.getMinusSCEV(CastedDist, CastedProduct);
+ if (SE.isKnownPositive(Minus))
+ return true;
+
+ // Second try: Is -Dist - (BackedgeTakenCount * Step) > 0 ?
+ // (If so, then we have proven (**) because |Dist| >= -1*Dist)
+ const SCEV *NegDist = SE.getNegativeSCEV(CastedDist);
+ Minus = SE.getMinusSCEV(NegDist, CastedProduct);
+ if (SE.isKnownPositive(Minus))
+ return true;
+
+ return false;
+}
+
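Concretely, for the loop in the comment above with i32 elements and D = 8: Dist is 8*4 = 32 bytes, BackedgeTakenCount is 7, and the byte stride is 4, so 32 > 7*4 = 28 proves the accesses independent. The (**) predicate with plain integers standing in for the SCEV expressions:

#include <cstdint>
#include <cstdlib>

// Sketch of the (**) check with concrete integers in place of SCEVs: no
// dependence if |Dist| > BackedgeTakenCount * Stride * TypeByteSize.
bool safeDistance(int64_t Dist, uint64_t BackedgeTakenCount, uint64_t Stride,
                  uint64_t TypeByteSize) {
  uint64_t Span = BackedgeTakenCount * Stride * TypeByteSize;
  return static_cast<uint64_t>(std::llabs(Dist)) > Span;
}
// E.g. D = 8 with i32 elements: safeDistance(32, 7, 1, 4) is 32 > 28, so the
// two accesses can never overlap during the loop's execution.
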
/// \brief Check the dependence for two accesses with the same stride \p Stride.
/// \p Distance is the positive distance and \p TypeByteSize is type size in
/// bytes.
@@ -1292,21 +1339,26 @@ MemoryDepChecker::isDependent(const MemAccessInfo &A, unsigned AIdx,
return Dependence::Unknown;
}
+ Type *ATy = APtr->getType()->getPointerElementType();
+ Type *BTy = BPtr->getType()->getPointerElementType();
+ auto &DL = InnermostLoop->getHeader()->getModule()->getDataLayout();
+ uint64_t TypeByteSize = DL.getTypeAllocSize(ATy);
+ uint64_t Stride = std::abs(StrideAPtr);
const SCEVConstant *C = dyn_cast<SCEVConstant>(Dist);
if (!C) {
+ if (TypeByteSize == DL.getTypeAllocSize(BTy) &&
+ isSafeDependenceDistance(DL, *(PSE.getSE()),
+ *(PSE.getBackedgeTakenCount()), *Dist, Stride,
+ TypeByteSize))
+ return Dependence::NoDep;
+
DEBUG(dbgs() << "LAA: Dependence because of non-constant distance\n");
ShouldRetryWithRuntimeCheck = true;
return Dependence::Unknown;
}
- Type *ATy = APtr->getType()->getPointerElementType();
- Type *BTy = BPtr->getType()->getPointerElementType();
- auto &DL = InnermostLoop->getHeader()->getModule()->getDataLayout();
- uint64_t TypeByteSize = DL.getTypeAllocSize(ATy);
-
const APInt &Val = C->getAPInt();
int64_t Distance = Val.getSExtValue();
- uint64_t Stride = std::abs(StrideAPtr);
// Attempt to prove strided accesses independent.
if (std::abs(Distance) > 0 && Stride > 1 && ATy == BTy &&
@@ -1427,12 +1479,14 @@ MemoryDepChecker::isDependent(const MemAccessInfo &A, unsigned AIdx,
}
bool MemoryDepChecker::areDepsSafe(DepCandidates &AccessSets,
- MemAccessInfoSet &CheckDeps,
+ MemAccessInfoList &CheckDeps,
const ValueToValueMap &Strides) {
MaxSafeDepDistBytes = -1;
- while (!CheckDeps.empty()) {
- MemAccessInfo CurAccess = *CheckDeps.begin();
+ SmallPtrSet<MemAccessInfo, 8> Visited;
+ for (MemAccessInfo CurAccess : CheckDeps) {
+ if (Visited.count(CurAccess))
+ continue;
// Get the relevant memory access set.
EquivalenceClasses<MemAccessInfo>::iterator I =
@@ -1446,7 +1500,7 @@ bool MemoryDepChecker::areDepsSafe(DepCandidates &AccessSets,
// Check every access pair.
while (AI != AE) {
- CheckDeps.erase(*AI);
+ Visited.insert(*AI);
EquivalenceClasses<MemAccessInfo>::member_iterator OI = std::next(AI);
while (OI != AE) {
// Check every accessing instruction pair in program order.
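
Switching CheckDeps from a SmallPtrSet to a SmallVector plus a separate Visited set keeps the de-duplication but makes the checks run in insertion order rather than pointer order, and replaces erase-while-draining with a mark. The pattern in isolation (Visit is a stand-in for the equivalence-class walk):

#include "llvm/ADT/SmallPtrSet.h"
#include "llvm/ADT/SmallVector.h"

// Sketch: process each unique item in first-insertion order. Duplicates in
// the list are harmless; the set filters them at visit time, and Visit may
// mark further items (e.g. a whole equivalence class) as already handled.
template <typename T, typename VisitT>
void visitInOrder(const llvm::SmallVectorImpl<T *> &Worklist, VisitT Visit) {
  llvm::SmallPtrSet<T *, 8> Visited;
  for (T *Item : Worklist) {
    if (!Visited.insert(Item).second)
      continue; // already handled earlier in the walk
    Visit(Item, Visited);
  }
}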
@@ -1885,7 +1939,10 @@ expandBounds(const RuntimePointerChecking::CheckingPtrGroup *CG, Loop *TheLoop,
Value *NewPtr = (Inst && TheLoop->contains(Inst))
? Exp.expandCodeFor(Sc, PtrArithTy, Loc)
: Ptr;
- return {NewPtr, NewPtr};
+ // We must return a half-open range, which means incrementing Sc.
+ const SCEV *ScPlusOne = SE->getAddExpr(Sc, SE->getOne(PtrArithTy));
+ Value *NewPtrPlusOne = Exp.expandCodeFor(ScPlusOne, PtrArithTy, Loc);
+ return {NewPtr, NewPtrPlusOne};
} else {
Value *Start = nullptr, *End = nullptr;
DEBUG(dbgs() << "LAA: Adding RT check for range:\n");
diff --git a/lib/Analysis/LoopAnalysisManager.cpp b/lib/Analysis/LoopAnalysisManager.cpp
index 5be3ee341c9c..e4a0f90b2f71 100644
--- a/lib/Analysis/LoopAnalysisManager.cpp
+++ b/lib/Analysis/LoopAnalysisManager.cpp
@@ -31,24 +31,10 @@ bool LoopAnalysisManagerFunctionProxy::Result::invalidate(
FunctionAnalysisManager::Invalidator &Inv) {
// First compute the sequence of IR units covered by this proxy. We will want
// to visit this in postorder, but because this is a tree structure we can do
- // this by building a preorder sequence and walking it in reverse.
- SmallVector<Loop *, 4> PreOrderLoops, PreOrderWorklist;
- // Note that we want to walk the roots in reverse order because we will end
- // up reversing the preorder sequence. However, it happens that the loop nest
- // roots are in reverse order within the LoopInfo object. So we just walk
- // forward here.
- // FIXME: If we change the order of LoopInfo we will want to add a reverse
- // here.
- for (Loop *RootL : *LI) {
- assert(PreOrderWorklist.empty() &&
- "Must start with an empty preorder walk worklist.");
- PreOrderWorklist.push_back(RootL);
- do {
- Loop *L = PreOrderWorklist.pop_back_val();
- PreOrderWorklist.append(L->begin(), L->end());
- PreOrderLoops.push_back(L);
- } while (!PreOrderWorklist.empty());
- }
+  // this by building a preorder sequence and walking it backwards. We also
+  // want siblings in forward program order to match the LoopPassManager, so
+  // we get the preorder with siblings reversed.
+ SmallVector<Loop *, 4> PreOrderLoops = LI->getLoopsInReverseSiblingPreorder();
// If this proxy or the loop info is going to be invalidated, we also need
// to clear all the keys coming from that analysis. We also completely blow
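
getLoopsInReverseSiblingPreorder packages exactly the walk deleted above: popping a stack reverses sibling order once, so appending children in forward order yields a preorder with siblings reversed, and traversing that sequence backwards gives the postorder with siblings in program order. A toy version of the worklist trick:

#include <vector>

// Sketch: preorder with reversed siblings via an explicit worklist. SubLoops
// stands in for a loop's children; everything here is a toy stand-in.
struct ToyLoop {
  std::vector<ToyLoop *> SubLoops;
};

std::vector<ToyLoop *>
reverseSiblingPreorder(const std::vector<ToyLoop *> &Roots) {
  std::vector<ToyLoop *> Out, Worklist;
  for (ToyLoop *Root : Roots) {
    Worklist.push_back(Root);
    while (!Worklist.empty()) {
      ToyLoop *L = Worklist.back();
      Worklist.pop_back();
      // Children pushed in forward order pop in reverse order.
      Worklist.insert(Worklist.end(), L->SubLoops.begin(), L->SubLoops.end());
      Out.push_back(L);
    }
  }
  return Out; // walk this backwards for a sibling-forward postorder
}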
@@ -145,7 +131,6 @@ LoopAnalysisManagerFunctionProxy::run(Function &F,
PreservedAnalyses llvm::getLoopPassPreservedAnalyses() {
PreservedAnalyses PA;
- PA.preserve<AssumptionAnalysis>();
PA.preserve<DominatorTreeAnalysis>();
PA.preserve<LoopAnalysis>();
PA.preserve<LoopAnalysisManagerFunctionProxy>();
diff --git a/lib/Analysis/LoopInfo.cpp b/lib/Analysis/LoopInfo.cpp
index f449ce94d57c..ff68810abb82 100644
--- a/lib/Analysis/LoopInfo.cpp
+++ b/lib/Analysis/LoopInfo.cpp
@@ -40,9 +40,9 @@ template class llvm::LoopInfoBase<BasicBlock, Loop>;
// Always verify loopinfo if expensive checking is enabled.
#ifdef EXPENSIVE_CHECKS
-static bool VerifyLoopInfo = true;
+bool llvm::VerifyLoopInfo = true;
#else
-static bool VerifyLoopInfo = false;
+bool llvm::VerifyLoopInfo = false;
#endif
static cl::opt<bool,true>
VerifyLoopInfoX("verify-loop-info", cl::location(VerifyLoopInfo),
@@ -211,9 +211,11 @@ bool Loop::isSafeToClone() const {
MDNode *Loop::getLoopID() const {
MDNode *LoopID = nullptr;
- if (isLoopSimplifyForm()) {
- LoopID = getLoopLatch()->getTerminator()->getMetadata(LLVMContext::MD_loop);
+ if (BasicBlock *Latch = getLoopLatch()) {
+ LoopID = Latch->getTerminator()->getMetadata(LLVMContext::MD_loop);
} else {
+ assert(!getLoopLatch() &&
+ "The loop should have no single latch at this point");
// Go through each predecessor of the loop header and check the
// terminator for the metadata.
BasicBlock *H = getHeader();
@@ -248,11 +250,12 @@ void Loop::setLoopID(MDNode *LoopID) const {
assert(LoopID->getNumOperands() > 0 && "Loop ID needs at least one operand");
assert(LoopID->getOperand(0) == LoopID && "Loop ID should refer to itself");
- if (isLoopSimplifyForm()) {
- getLoopLatch()->getTerminator()->setMetadata(LLVMContext::MD_loop, LoopID);
+ if (BasicBlock *Latch = getLoopLatch()) {
+ Latch->getTerminator()->setMetadata(LLVMContext::MD_loop, LoopID);
return;
}
+  assert(!getLoopLatch() &&
+         "The loop should have no single latch at this point");
BasicBlock *H = getHeader();
for (BasicBlock *BB : this->blocks()) {
TerminatorInst *TI = BB->getTerminator();
@@ -610,6 +613,15 @@ LoopInfo::LoopInfo(const DominatorTreeBase<BasicBlock> &DomTree) {
analyze(DomTree);
}
+bool LoopInfo::invalidate(Function &F, const PreservedAnalyses &PA,
+ FunctionAnalysisManager::Invalidator &) {
+ // Check whether the analysis, all analyses on functions, or the function's
+ // CFG have been preserved.
+ auto PAC = PA.getChecker<LoopAnalysis>();
+ return !(PAC.preserved() || PAC.preservedSet<AllAnalysesOn<Function>>() ||
+ PAC.preservedSet<CFGAnalyses>());
+}
+
void LoopInfo::markAsRemoved(Loop *Unloop) {
assert(!Unloop->isInvalid() && "Loop has already been removed");
Unloop->invalidate();
diff --git a/lib/Analysis/LoopPass.cpp b/lib/Analysis/LoopPass.cpp
index 3f4a07942154..0b5f6266e373 100644
--- a/lib/Analysis/LoopPass.cpp
+++ b/lib/Analysis/LoopPass.cpp
@@ -54,6 +54,8 @@ public:
}
return false;
}
+
+ StringRef getPassName() const override { return "Print Loop IR"; }
};
char PrintLoopPassWrapper::ID = 0;
diff --git a/lib/Analysis/MemoryBuiltins.cpp b/lib/Analysis/MemoryBuiltins.cpp
index 2d8274040d39..b8c444904723 100644
--- a/lib/Analysis/MemoryBuiltins.cpp
+++ b/lib/Analysis/MemoryBuiltins.cpp
@@ -50,30 +50,30 @@ struct AllocFnsTy {
// FIXME: certain users need more information. E.g., SimplifyLibCalls needs to
// know which functions are nounwind, noalias, nocapture parameters, etc.
-static const std::pair<LibFunc::Func, AllocFnsTy> AllocationFnData[] = {
- {LibFunc::malloc, {MallocLike, 1, 0, -1}},
- {LibFunc::valloc, {MallocLike, 1, 0, -1}},
- {LibFunc::Znwj, {OpNewLike, 1, 0, -1}}, // new(unsigned int)
- {LibFunc::ZnwjRKSt9nothrow_t, {MallocLike, 2, 0, -1}}, // new(unsigned int, nothrow)
- {LibFunc::Znwm, {OpNewLike, 1, 0, -1}}, // new(unsigned long)
- {LibFunc::ZnwmRKSt9nothrow_t, {MallocLike, 2, 0, -1}}, // new(unsigned long, nothrow)
- {LibFunc::Znaj, {OpNewLike, 1, 0, -1}}, // new[](unsigned int)
- {LibFunc::ZnajRKSt9nothrow_t, {MallocLike, 2, 0, -1}}, // new[](unsigned int, nothrow)
- {LibFunc::Znam, {OpNewLike, 1, 0, -1}}, // new[](unsigned long)
- {LibFunc::ZnamRKSt9nothrow_t, {MallocLike, 2, 0, -1}}, // new[](unsigned long, nothrow)
- {LibFunc::msvc_new_int, {OpNewLike, 1, 0, -1}}, // new(unsigned int)
- {LibFunc::msvc_new_int_nothrow, {MallocLike, 2, 0, -1}}, // new(unsigned int, nothrow)
- {LibFunc::msvc_new_longlong, {OpNewLike, 1, 0, -1}}, // new(unsigned long long)
- {LibFunc::msvc_new_longlong_nothrow, {MallocLike, 2, 0, -1}}, // new(unsigned long long, nothrow)
- {LibFunc::msvc_new_array_int, {OpNewLike, 1, 0, -1}}, // new[](unsigned int)
- {LibFunc::msvc_new_array_int_nothrow, {MallocLike, 2, 0, -1}}, // new[](unsigned int, nothrow)
- {LibFunc::msvc_new_array_longlong, {OpNewLike, 1, 0, -1}}, // new[](unsigned long long)
- {LibFunc::msvc_new_array_longlong_nothrow, {MallocLike, 2, 0, -1}}, // new[](unsigned long long, nothrow)
- {LibFunc::calloc, {CallocLike, 2, 0, 1}},
- {LibFunc::realloc, {ReallocLike, 2, 1, -1}},
- {LibFunc::reallocf, {ReallocLike, 2, 1, -1}},
- {LibFunc::strdup, {StrDupLike, 1, -1, -1}},
- {LibFunc::strndup, {StrDupLike, 2, 1, -1}}
+static const std::pair<LibFunc, AllocFnsTy> AllocationFnData[] = {
+ {LibFunc_malloc, {MallocLike, 1, 0, -1}},
+ {LibFunc_valloc, {MallocLike, 1, 0, -1}},
+ {LibFunc_Znwj, {OpNewLike, 1, 0, -1}}, // new(unsigned int)
+ {LibFunc_ZnwjRKSt9nothrow_t, {MallocLike, 2, 0, -1}}, // new(unsigned int, nothrow)
+ {LibFunc_Znwm, {OpNewLike, 1, 0, -1}}, // new(unsigned long)
+ {LibFunc_ZnwmRKSt9nothrow_t, {MallocLike, 2, 0, -1}}, // new(unsigned long, nothrow)
+ {LibFunc_Znaj, {OpNewLike, 1, 0, -1}}, // new[](unsigned int)
+ {LibFunc_ZnajRKSt9nothrow_t, {MallocLike, 2, 0, -1}}, // new[](unsigned int, nothrow)
+ {LibFunc_Znam, {OpNewLike, 1, 0, -1}}, // new[](unsigned long)
+ {LibFunc_ZnamRKSt9nothrow_t, {MallocLike, 2, 0, -1}}, // new[](unsigned long, nothrow)
+ {LibFunc_msvc_new_int, {OpNewLike, 1, 0, -1}}, // new(unsigned int)
+ {LibFunc_msvc_new_int_nothrow, {MallocLike, 2, 0, -1}}, // new(unsigned int, nothrow)
+ {LibFunc_msvc_new_longlong, {OpNewLike, 1, 0, -1}}, // new(unsigned long long)
+ {LibFunc_msvc_new_longlong_nothrow, {MallocLike, 2, 0, -1}}, // new(unsigned long long, nothrow)
+ {LibFunc_msvc_new_array_int, {OpNewLike, 1, 0, -1}}, // new[](unsigned int)
+ {LibFunc_msvc_new_array_int_nothrow, {MallocLike, 2, 0, -1}}, // new[](unsigned int, nothrow)
+ {LibFunc_msvc_new_array_longlong, {OpNewLike, 1, 0, -1}}, // new[](unsigned long long)
+ {LibFunc_msvc_new_array_longlong_nothrow, {MallocLike, 2, 0, -1}}, // new[](unsigned long long, nothrow)
+ {LibFunc_calloc, {CallocLike, 2, 0, 1}},
+ {LibFunc_realloc, {ReallocLike, 2, 1, -1}},
+ {LibFunc_reallocf, {ReallocLike, 2, 1, -1}},
+ {LibFunc_strdup, {StrDupLike, 1, -1, -1}},
+ {LibFunc_strndup, {StrDupLike, 2, 1, -1}}
// TODO: Handle "int posix_memalign(void **, size_t, size_t)"
};
@@ -106,12 +106,12 @@ getAllocationDataForFunction(const Function *Callee, AllocType AllocTy,
const TargetLibraryInfo *TLI) {
// Make sure that the function is available.
StringRef FnName = Callee->getName();
- LibFunc::Func TLIFn;
+ LibFunc TLIFn;
if (!TLI || !TLI->getLibFunc(FnName, TLIFn) || !TLI->has(TLIFn))
return None;
const auto *Iter = find_if(
- AllocationFnData, [TLIFn](const std::pair<LibFunc::Func, AllocFnsTy> &P) {
+ AllocationFnData, [TLIFn](const std::pair<LibFunc, AllocFnsTy> &P) {
return P.first == TLIFn;
});
@@ -183,7 +183,7 @@ static Optional<AllocFnsTy> getAllocationSize(const Value *V,
static bool hasNoAliasAttr(const Value *V, bool LookThroughBitCast) {
ImmutableCallSite CS(LookThroughBitCast ? V->stripPointerCasts() : V);
- return CS && CS.paramHasAttr(AttributeSet::ReturnIndex, Attribute::NoAlias);
+ return CS && CS.hasRetAttr(Attribute::NoAlias);
}
@@ -333,33 +333,33 @@ const CallInst *llvm::isFreeCall(const Value *I, const TargetLibraryInfo *TLI) {
return nullptr;
StringRef FnName = Callee->getName();
- LibFunc::Func TLIFn;
+ LibFunc TLIFn;
if (!TLI || !TLI->getLibFunc(FnName, TLIFn) || !TLI->has(TLIFn))
return nullptr;
unsigned ExpectedNumParams;
- if (TLIFn == LibFunc::free ||
- TLIFn == LibFunc::ZdlPv || // operator delete(void*)
- TLIFn == LibFunc::ZdaPv || // operator delete[](void*)
- TLIFn == LibFunc::msvc_delete_ptr32 || // operator delete(void*)
- TLIFn == LibFunc::msvc_delete_ptr64 || // operator delete(void*)
- TLIFn == LibFunc::msvc_delete_array_ptr32 || // operator delete[](void*)
- TLIFn == LibFunc::msvc_delete_array_ptr64) // operator delete[](void*)
+ if (TLIFn == LibFunc_free ||
+ TLIFn == LibFunc_ZdlPv || // operator delete(void*)
+ TLIFn == LibFunc_ZdaPv || // operator delete[](void*)
+ TLIFn == LibFunc_msvc_delete_ptr32 || // operator delete(void*)
+ TLIFn == LibFunc_msvc_delete_ptr64 || // operator delete(void*)
+ TLIFn == LibFunc_msvc_delete_array_ptr32 || // operator delete[](void*)
+ TLIFn == LibFunc_msvc_delete_array_ptr64) // operator delete[](void*)
ExpectedNumParams = 1;
- else if (TLIFn == LibFunc::ZdlPvj || // delete(void*, uint)
- TLIFn == LibFunc::ZdlPvm || // delete(void*, ulong)
- TLIFn == LibFunc::ZdlPvRKSt9nothrow_t || // delete(void*, nothrow)
- TLIFn == LibFunc::ZdaPvj || // delete[](void*, uint)
- TLIFn == LibFunc::ZdaPvm || // delete[](void*, ulong)
- TLIFn == LibFunc::ZdaPvRKSt9nothrow_t || // delete[](void*, nothrow)
- TLIFn == LibFunc::msvc_delete_ptr32_int || // delete(void*, uint)
- TLIFn == LibFunc::msvc_delete_ptr64_longlong || // delete(void*, ulonglong)
- TLIFn == LibFunc::msvc_delete_ptr32_nothrow || // delete(void*, nothrow)
- TLIFn == LibFunc::msvc_delete_ptr64_nothrow || // delete(void*, nothrow)
- TLIFn == LibFunc::msvc_delete_array_ptr32_int || // delete[](void*, uint)
- TLIFn == LibFunc::msvc_delete_array_ptr64_longlong || // delete[](void*, ulonglong)
- TLIFn == LibFunc::msvc_delete_array_ptr32_nothrow || // delete[](void*, nothrow)
- TLIFn == LibFunc::msvc_delete_array_ptr64_nothrow) // delete[](void*, nothrow)
+ else if (TLIFn == LibFunc_ZdlPvj || // delete(void*, uint)
+ TLIFn == LibFunc_ZdlPvm || // delete(void*, ulong)
+ TLIFn == LibFunc_ZdlPvRKSt9nothrow_t || // delete(void*, nothrow)
+ TLIFn == LibFunc_ZdaPvj || // delete[](void*, uint)
+ TLIFn == LibFunc_ZdaPvm || // delete[](void*, ulong)
+ TLIFn == LibFunc_ZdaPvRKSt9nothrow_t || // delete[](void*, nothrow)
+ TLIFn == LibFunc_msvc_delete_ptr32_int || // delete(void*, uint)
+ TLIFn == LibFunc_msvc_delete_ptr64_longlong || // delete(void*, ulonglong)
+ TLIFn == LibFunc_msvc_delete_ptr32_nothrow || // delete(void*, nothrow)
+ TLIFn == LibFunc_msvc_delete_ptr64_nothrow || // delete(void*, nothrow)
+ TLIFn == LibFunc_msvc_delete_array_ptr32_int || // delete[](void*, uint)
+ TLIFn == LibFunc_msvc_delete_array_ptr64_longlong || // delete[](void*, ulonglong)
+ TLIFn == LibFunc_msvc_delete_array_ptr32_nothrow || // delete[](void*, nothrow)
+ TLIFn == LibFunc_msvc_delete_array_ptr64_nothrow) // delete[](void*, nothrow)
ExpectedNumParams = 2;
else
return nullptr;
@@ -394,10 +394,8 @@ static APInt getSizeWithOverflow(const SizeOffsetType &Data) {
/// If RoundToAlign is true, then Size is rounded up to the alignment of allocas,
/// byval arguments, and global variables.
bool llvm::getObjectSize(const Value *Ptr, uint64_t &Size, const DataLayout &DL,
- const TargetLibraryInfo *TLI, bool RoundToAlign,
- llvm::ObjSizeMode Mode) {
- ObjectSizeOffsetVisitor Visitor(DL, TLI, Ptr->getContext(),
- RoundToAlign, Mode);
+ const TargetLibraryInfo *TLI, ObjectSizeOpts Opts) {
+ ObjectSizeOffsetVisitor Visitor(DL, TLI, Ptr->getContext(), Opts);
SizeOffsetType Data = Visitor.compute(const_cast<Value*>(Ptr));
if (!Visitor.bothKnown(Data))
return false;
@@ -414,19 +412,23 @@ ConstantInt *llvm::lowerObjectSizeCall(IntrinsicInst *ObjectSize,
"ObjectSize must be a call to llvm.objectsize!");
bool MaxVal = cast<ConstantInt>(ObjectSize->getArgOperand(1))->isZero();
- ObjSizeMode Mode;
+ ObjectSizeOpts EvalOptions;
// Unless we have to fold this to something, try to be as accurate as
// possible.
if (MustSucceed)
- Mode = MaxVal ? ObjSizeMode::Max : ObjSizeMode::Min;
+ EvalOptions.EvalMode =
+ MaxVal ? ObjectSizeOpts::Mode::Max : ObjectSizeOpts::Mode::Min;
else
- Mode = ObjSizeMode::Exact;
+ EvalOptions.EvalMode = ObjectSizeOpts::Mode::Exact;
+
+ EvalOptions.NullIsUnknownSize =
+ cast<ConstantInt>(ObjectSize->getArgOperand(2))->isOne();
// FIXME: Does it make sense to just return a failure value if the size won't
// fit in the output and `!MustSucceed`?
uint64_t Size;
auto *ResultType = cast<IntegerType>(ObjectSize->getType());
- if (getObjectSize(ObjectSize->getArgOperand(0), Size, DL, TLI, false, Mode) &&
+ if (getObjectSize(ObjectSize->getArgOperand(0), Size, DL, TLI, EvalOptions) &&
isUIntN(ResultType->getBitWidth(), Size))
return ConstantInt::get(ResultType, Size);
@@ -443,7 +445,7 @@ STATISTIC(ObjectVisitorLoad,
APInt ObjectSizeOffsetVisitor::align(APInt Size, uint64_t Align) {
- if (RoundToAlign && Align)
+ if (Options.RoundToAlign && Align)
return APInt(IntTyBits, alignTo(Size.getZExtValue(), Align));
return Size;
}
@@ -451,9 +453,8 @@ APInt ObjectSizeOffsetVisitor::align(APInt Size, uint64_t Align) {
ObjectSizeOffsetVisitor::ObjectSizeOffsetVisitor(const DataLayout &DL,
const TargetLibraryInfo *TLI,
LLVMContext &Context,
- bool RoundToAlign,
- ObjSizeMode Mode)
- : DL(DL), TLI(TLI), RoundToAlign(RoundToAlign), Mode(Mode) {
+ ObjectSizeOpts Options)
+ : DL(DL), TLI(TLI), Options(Options) {
// Pointer size must be rechecked for each object visited since it could have
// a different address space.
}
@@ -596,7 +597,9 @@ SizeOffsetType ObjectSizeOffsetVisitor::visitCallSite(CallSite CS) {
}
SizeOffsetType
-ObjectSizeOffsetVisitor::visitConstantPointerNull(ConstantPointerNull&) {
+ObjectSizeOffsetVisitor::visitConstantPointerNull(ConstantPointerNull& CPN) {
+ if (Options.NullIsUnknownSize && CPN.getType()->getAddressSpace() == 0)
+ return unknown();
return std::make_pair(Zero, Zero);
}
@@ -663,12 +666,12 @@ SizeOffsetType ObjectSizeOffsetVisitor::visitSelectInst(SelectInst &I) {
if (TrueResult == FalseResult) {
return TrueSide;
}
- if (Mode == ObjSizeMode::Min) {
+ if (Options.EvalMode == ObjectSizeOpts::Mode::Min) {
if (TrueResult.slt(FalseResult))
return TrueSide;
return FalseSide;
}
- if (Mode == ObjSizeMode::Max) {
+ if (Options.EvalMode == ObjectSizeOpts::Mode::Max) {
if (TrueResult.sgt(FalseResult))
return TrueSide;
return FalseSide;
@@ -719,7 +722,10 @@ SizeOffsetEvalType ObjectSizeOffsetEvaluator::compute(Value *V) {
}
SizeOffsetEvalType ObjectSizeOffsetEvaluator::compute_(Value *V) {
- ObjectSizeOffsetVisitor Visitor(DL, TLI, Context, RoundToAlign);
+ ObjectSizeOpts ObjSizeOptions;
+ ObjSizeOptions.RoundToAlign = RoundToAlign;
+
+ ObjectSizeOffsetVisitor Visitor(DL, TLI, Context, ObjSizeOptions);
SizeOffsetType Const = Visitor.compute(V);
if (Visitor.bothKnown(Const))
return std::make_pair(ConstantInt::get(Context, Const.first),
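
With ObjectSizeOpts, call sites set named fields rather than threading positional booleans and a mode enum through every signature. A hedged sketch of a caller using the getObjectSize overload from this hunk; all option fields shown appear in the diff:

#include "llvm/Analysis/MemoryBuiltins.h"
using namespace llvm;

// Sketch: ask for a conservative (minimum) size, rounded to alignment,
// treating null in address space 0 as an unknown-size object.
bool minObjectSize(const Value *Ptr, const DataLayout &DL,
                   const TargetLibraryInfo *TLI, uint64_t &Size) {
  ObjectSizeOpts Opts;
  Opts.EvalMode = ObjectSizeOpts::Mode::Min;
  Opts.RoundToAlign = true;
  Opts.NullIsUnknownSize = true;
  return getObjectSize(Ptr, Size, DL, TLI, Opts);
}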
diff --git a/lib/Analysis/MemoryLocation.cpp b/lib/Analysis/MemoryLocation.cpp
index a0ae72f1415f..9db6c499129a 100644
--- a/lib/Analysis/MemoryLocation.cpp
+++ b/lib/Analysis/MemoryLocation.cpp
@@ -142,9 +142,9 @@ MemoryLocation MemoryLocation::getForArgument(ImmutableCallSite CS,
// for memcpy/memset. This is particularly important because the
// LoopIdiomRecognizer likes to turn loops into calls to memset_pattern16
// whenever possible.
- LibFunc::Func F;
+ LibFunc F;
if (CS.getCalledFunction() && TLI.getLibFunc(*CS.getCalledFunction(), F) &&
- F == LibFunc::memset_pattern16 && TLI.has(F)) {
+ F == LibFunc_memset_pattern16 && TLI.has(F)) {
assert((ArgIdx == 0 || ArgIdx == 1) &&
"Invalid argument index for memset_pattern16");
if (ArgIdx == 1)
diff --git a/lib/Transforms/Utils/MemorySSA.cpp b/lib/Analysis/MemorySSA.cpp
index 1ce4225f09cc..910170561abf 100644
--- a/lib/Transforms/Utils/MemorySSA.cpp
+++ b/lib/Analysis/MemorySSA.cpp
@@ -10,7 +10,7 @@
// This file implements the MemorySSA class.
//
//===----------------------------------------------------------------===//
-#include "llvm/Transforms/Utils/MemorySSA.h"
+#include "llvm/Analysis/MemorySSA.h"
#include "llvm/ADT/DenseMap.h"
#include "llvm/ADT/DenseSet.h"
#include "llvm/ADT/DepthFirstIterator.h"
@@ -44,10 +44,6 @@
#define DEBUG_TYPE "memoryssa"
using namespace llvm;
-STATISTIC(NumClobberCacheLookups, "Number of Memory SSA version cache lookups");
-STATISTIC(NumClobberCacheHits, "Number of Memory SSA version cache hits");
-STATISTIC(NumClobberCacheInserts, "Number of MemorySSA version cache inserts");
-
INITIALIZE_PASS_BEGIN(MemorySSAWrapperPass, "memoryssa", "Memory SSA", false,
true)
INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass)
@@ -145,8 +141,8 @@ public:
private:
union {
- ImmutableCallSite CS;
- MemoryLocation Loc;
+ ImmutableCallSite CS;
+ MemoryLocation Loc;
};
};
}
@@ -218,12 +214,16 @@ static bool instructionClobbersQuery(MemoryDef *MD,
AliasAnalysis &AA) {
Instruction *DefInst = MD->getMemoryInst();
assert(DefInst && "Defining instruction not actually an instruction");
+ ImmutableCallSite UseCS(UseInst);
if (const IntrinsicInst *II = dyn_cast<IntrinsicInst>(DefInst)) {
// These intrinsics will show up as affecting memory, but they are just
// markers.
switch (II->getIntrinsicID()) {
case Intrinsic::lifetime_start:
+ if (UseCS)
+ return false;
+ return AA.isMustAlias(MemoryLocation(II->getArgOperand(1)), UseLoc);
case Intrinsic::lifetime_end:
case Intrinsic::invariant_start:
case Intrinsic::invariant_end:
@@ -234,7 +234,6 @@ static bool instructionClobbersQuery(MemoryDef *MD,
}
}
- ImmutableCallSite UseCS(UseInst);
if (UseCS) {
ModRefInfo I = AA.getModRefInfo(DefInst, UseCS);
return I != MRI_NoModRef;
@@ -269,8 +268,8 @@ static bool instructionClobbersQuery(MemoryDef *MD, const MemoryUseOrDef *MU,
}
// Return true when MD may alias MU, return false otherwise.
-bool defClobbersUseOrDef(MemoryDef *MD, const MemoryUseOrDef *MU,
- AliasAnalysis &AA) {
+bool MemorySSAUtil::defClobbersUseOrDef(MemoryDef *MD, const MemoryUseOrDef *MU,
+ AliasAnalysis &AA) {
return instructionClobbersQuery(MD, MU, MemoryLocOrCall(MU), AA);
}
}
@@ -302,7 +301,6 @@ static bool lifetimeEndsAt(MemoryDef *MD, const MemoryLocation &Loc,
Instruction *Inst = MD->getMemoryInst();
if (IntrinsicInst *II = dyn_cast<IntrinsicInst>(Inst)) {
switch (II->getIntrinsicID()) {
- case Intrinsic::lifetime_start:
case Intrinsic::lifetime_end:
return AA.isMustAlias(MemoryLocation(II->getArgOperand(1)), Loc);
default:
@@ -320,95 +318,8 @@ static bool isUseTriviallyOptimizableToLiveOnEntry(AliasAnalysis &AA,
// FIXME: We should handle invariant groups, as well. It's a bit harder,
// because we need to pay close attention to invariant group barriers.
return isa<LoadInst>(I) && (I->getMetadata(LLVMContext::MD_invariant_load) ||
- AA.pointsToConstantMemory(I));
-}
-
-/// Cache for our caching MemorySSA walker.
-class WalkerCache {
- DenseMap<ConstMemoryAccessPair, MemoryAccess *> Accesses;
- DenseMap<const MemoryAccess *, MemoryAccess *> Calls;
-
-public:
- MemoryAccess *lookup(const MemoryAccess *MA, const MemoryLocation &Loc,
- bool IsCall) const {
- ++NumClobberCacheLookups;
- MemoryAccess *R = IsCall ? Calls.lookup(MA) : Accesses.lookup({MA, Loc});
- if (R)
- ++NumClobberCacheHits;
- return R;
- }
-
- bool insert(const MemoryAccess *MA, MemoryAccess *To,
- const MemoryLocation &Loc, bool IsCall) {
- // This is fine for Phis, since there are times where we can't optimize
- // them. Making a def its own clobber is never correct, though.
- assert((MA != To || isa<MemoryPhi>(MA)) &&
- "Something can't clobber itself!");
-
- ++NumClobberCacheInserts;
- bool Inserted;
- if (IsCall)
- Inserted = Calls.insert({MA, To}).second;
- else
- Inserted = Accesses.insert({{MA, Loc}, To}).second;
-
- return Inserted;
- }
-
- bool remove(const MemoryAccess *MA, const MemoryLocation &Loc, bool IsCall) {
- return IsCall ? Calls.erase(MA) : Accesses.erase({MA, Loc});
- }
-
- void clear() {
- Accesses.clear();
- Calls.clear();
- }
-
- bool contains(const MemoryAccess *MA) const {
- for (auto &P : Accesses)
- if (P.first.first == MA || P.second == MA)
- return true;
- for (auto &P : Calls)
- if (P.first == MA || P.second == MA)
- return true;
- return false;
- }
-};
-
-/// Walks the defining uses of MemoryDefs. Stops after we hit something that has
-/// no defining use (e.g. a MemoryPhi or liveOnEntry). Note that, when comparing
-/// against a null def_chain_iterator, this will compare equal only after
-/// walking said Phi/liveOnEntry.
-struct def_chain_iterator
- : public iterator_facade_base<def_chain_iterator, std::forward_iterator_tag,
- MemoryAccess *> {
- def_chain_iterator() : MA(nullptr) {}
- def_chain_iterator(MemoryAccess *MA) : MA(MA) {}
-
- MemoryAccess *operator*() const { return MA; }
-
- def_chain_iterator &operator++() {
- // N.B. liveOnEntry has a null defining access.
- if (auto *MUD = dyn_cast<MemoryUseOrDef>(MA))
- MA = MUD->getDefiningAccess();
- else
- MA = nullptr;
- return *this;
- }
-
- bool operator==(const def_chain_iterator &O) const { return MA == O.MA; }
-
-private:
- MemoryAccess *MA;
-};
-
-static iterator_range<def_chain_iterator>
-def_chain(MemoryAccess *MA, MemoryAccess *UpTo = nullptr) {
-#ifdef EXPENSIVE_CHECKS
- assert((!UpTo || find(def_chain(MA), UpTo) != def_chain_iterator()) &&
- "UpTo isn't in the def chain!");
-#endif
- return make_range(def_chain_iterator(MA), def_chain_iterator(UpTo));
+                               AA.pointsToConstantMemory(
+                                   cast<LoadInst>(I)->getPointerOperand()));
}
/// Verifies that `Start` is clobbered by `ClobberAt`, and that nothing
@@ -512,91 +423,24 @@ class ClobberWalker {
const MemorySSA &MSSA;
AliasAnalysis &AA;
DominatorTree &DT;
- WalkerCache &WC;
UpwardsMemoryQuery *Query;
- bool UseCache;
// Phi optimization bookkeeping
SmallVector<DefPath, 32> Paths;
DenseSet<ConstMemoryAccessPair> VisitedPhis;
- DenseMap<const BasicBlock *, MemoryAccess *> WalkTargetCache;
-
- void setUseCache(bool Use) { UseCache = Use; }
- bool shouldIgnoreCache() const {
- // UseCache will only be false when we're debugging, or when expensive
- // checks are enabled. In either case, we don't care deeply about speed.
- return LLVM_UNLIKELY(!UseCache);
- }
-
- void addCacheEntry(const MemoryAccess *What, MemoryAccess *To,
- const MemoryLocation &Loc) const {
-// EXPENSIVE_CHECKS because most of these queries are redundant.
-#ifdef EXPENSIVE_CHECKS
- assert(MSSA.dominates(To, What));
-#endif
- if (shouldIgnoreCache())
- return;
- WC.insert(What, To, Loc, Query->IsCall);
- }
-
- MemoryAccess *lookupCache(const MemoryAccess *MA, const MemoryLocation &Loc) {
- return shouldIgnoreCache() ? nullptr : WC.lookup(MA, Loc, Query->IsCall);
- }
-
- void cacheDefPath(const DefPath &DN, MemoryAccess *Target) const {
- if (shouldIgnoreCache())
- return;
-
- for (MemoryAccess *MA : def_chain(DN.First, DN.Last))
- addCacheEntry(MA, Target, DN.Loc);
-
- // DefPaths only express the path we walked. So, DN.Last could either be a
- // thing we want to cache, or not.
- if (DN.Last != Target)
- addCacheEntry(DN.Last, Target, DN.Loc);
- }
/// Find the nearest def or phi that `From` can legally be optimized to.
- ///
- /// FIXME: Deduplicate this with MSSA::findDominatingDef. Ideally, MSSA should
- /// keep track of this information for us, and allow us O(1) lookups of this
- /// info.
- MemoryAccess *getWalkTarget(const MemoryPhi *From) {
+ const MemoryAccess *getWalkTarget(const MemoryPhi *From) const {
assert(From->getNumOperands() && "Phi with no operands?");
BasicBlock *BB = From->getBlock();
- auto At = WalkTargetCache.find(BB);
- if (At != WalkTargetCache.end())
- return At->second;
-
- SmallVector<const BasicBlock *, 8> ToCache;
- ToCache.push_back(BB);
-
MemoryAccess *Result = MSSA.getLiveOnEntryDef();
DomTreeNode *Node = DT.getNode(BB);
while ((Node = Node->getIDom())) {
- auto At = WalkTargetCache.find(BB);
- if (At != WalkTargetCache.end()) {
- Result = At->second;
- break;
- }
-
- auto *Accesses = MSSA.getBlockAccesses(Node->getBlock());
- if (Accesses) {
- auto Iter = find_if(reverse(*Accesses), [](const MemoryAccess &MA) {
- return !isa<MemoryUse>(MA);
- });
- if (Iter != Accesses->rend()) {
- Result = const_cast<MemoryAccess *>(&*Iter);
- break;
- }
- }
-
- ToCache.push_back(Node->getBlock());
+ auto *Defs = MSSA.getBlockDefs(Node->getBlock());
+ if (Defs)
+ return &*Defs->rbegin();
}
-
- for (const BasicBlock *BB : ToCache)
- WalkTargetCache.insert({BB, Result});
return Result;
}
@@ -606,7 +450,6 @@ class ClobberWalker {
/// both.
MemoryAccess *Result;
bool IsKnownClobber;
- bool FromCache;
};
/// Walk to the next Phi or Clobber in the def chain starting at Desc.Last.
@@ -614,29 +457,25 @@ class ClobberWalker {
/// StopAt.
///
/// This does not test for whether StopAt is a clobber
- UpwardsWalkResult walkToPhiOrClobber(DefPath &Desc,
- MemoryAccess *StopAt = nullptr) {
+ UpwardsWalkResult
+ walkToPhiOrClobber(DefPath &Desc,
+ const MemoryAccess *StopAt = nullptr) const {
assert(!isa<MemoryUse>(Desc.Last) && "Uses don't exist in my world");
for (MemoryAccess *Current : def_chain(Desc.Last)) {
Desc.Last = Current;
if (Current == StopAt)
- return {Current, false, false};
+ return {Current, false};
if (auto *MD = dyn_cast<MemoryDef>(Current))
if (MSSA.isLiveOnEntryDef(MD) ||
instructionClobbersQuery(MD, Desc.Loc, Query->Inst, AA))
- return {MD, true, false};
-
- // Cache checks must be done last, because if Current is a clobber, the
- // cache will contain the clobber for Current.
- if (MemoryAccess *MA = lookupCache(Current, Desc.Loc))
- return {MA, true, true};
+ return {MD, true};
}
assert(isa<MemoryPhi>(Desc.Last) &&
"Ended at a non-clobber that's not a phi?");
- return {Desc.Last, false, false};
+ return {Desc.Last, false};
}
void addSearches(MemoryPhi *Phi, SmallVectorImpl<ListIndex> &PausedSearches,
@@ -666,7 +505,7 @@ class ClobberWalker {
/// If this returns None, NewPaused is a vector of searches that terminated
/// at StopWhere. Otherwise, NewPaused is left in an unspecified state.
Optional<TerminatedPath>
- getBlockingAccess(MemoryAccess *StopWhere,
+ getBlockingAccess(const MemoryAccess *StopWhere,
SmallVectorImpl<ListIndex> &PausedSearches,
SmallVectorImpl<ListIndex> &NewPaused,
SmallVectorImpl<TerminatedPath> &Terminated) {
@@ -701,11 +540,11 @@ class ClobberWalker {
UpwardsWalkResult Res = walkToPhiOrClobber(Node, /*StopAt=*/StopWhere);
if (Res.IsKnownClobber) {
- assert(Res.Result != StopWhere || Res.FromCache);
+ assert(Res.Result != StopWhere);
// We hit a clobber when walking. If it doesn't dominate StopWhere, that's
// a failure.
TerminatedPath Term{Res.Result, PathIndex};
- if (!Res.FromCache || !MSSA.dominates(Res.Result, StopWhere))
+ if (!MSSA.dominates(Res.Result, StopWhere))
return Term;
// Otherwise, it's a valid thing to potentially optimize to.
@@ -830,7 +669,7 @@ class ClobberWalker {
assert(!MSSA.isLiveOnEntryDef(Current) &&
"liveOnEntry wasn't treated as a clobber?");
- MemoryAccess *Target = getWalkTarget(Current);
+ const auto *Target = getWalkTarget(Current);
// If a TerminatedPath doesn't dominate Target, then it wasn't a legal
// optimization for the prior phi.
assert(all_of(TerminatedPaths, [&](const TerminatedPath &P) {
@@ -842,8 +681,6 @@ class ClobberWalker {
// For the moment, this is fine, since we do nothing with blocker info.
if (Optional<TerminatedPath> Blocker = getBlockingAccess(
Target, PausedSearches, NewPaused, TerminatedPaths)) {
- // Cache our work on the blocking node, since we know that's correct.
- cacheDefPath(Paths[Blocker->LastNode], Blocker->Clobber);
// Find the node we started at. We can't search based on N->Last, since
// we may have gone around a loop with a different MemoryLocation.
@@ -908,7 +745,7 @@ class ClobberWalker {
// If we couldn't find the dominating phi/liveOnEntry in the above loop,
// do it now.
if (!DefChainEnd)
- for (MemoryAccess *MA : def_chain(Target))
+ for (auto *MA : def_chain(const_cast<MemoryAccess *>(Target)))
DefChainEnd = MA;
// If any of the terminated paths don't dominate the phi we'll try to
@@ -946,35 +783,6 @@ class ClobberWalker {
}
}
- /// Caches everything in an OptznResult.
- void cacheOptResult(const OptznResult &R) {
- if (R.OtherClobbers.empty()) {
- // If we're not going to be caching OtherClobbers, don't bother with
- // marking visited/etc.
- for (const DefPath &N : const_def_path(R.PrimaryClobber.LastNode))
- cacheDefPath(N, R.PrimaryClobber.Clobber);
- return;
- }
-
- // PrimaryClobber is our answer. If we can cache anything back, we need to
- // stop caching when we visit PrimaryClobber.
- SmallBitVector Visited(Paths.size());
- for (const DefPath &N : const_def_path(R.PrimaryClobber.LastNode)) {
- Visited[defPathIndex(N)] = true;
- cacheDefPath(N, R.PrimaryClobber.Clobber);
- }
-
- for (const TerminatedPath &P : R.OtherClobbers) {
- for (const DefPath &N : const_def_path(P.LastNode)) {
- ListIndex NIndex = defPathIndex(N);
- if (Visited[NIndex])
- break;
- Visited[NIndex] = true;
- cacheDefPath(N, P.Clobber);
- }
- }
- }
-
void verifyOptResult(const OptznResult &R) const {
assert(all_of(R.OtherClobbers, [&](const TerminatedPath &P) {
return MSSA.dominates(P.Clobber, R.PrimaryClobber.Clobber);
@@ -987,17 +795,14 @@ class ClobberWalker {
}
public:
- ClobberWalker(const MemorySSA &MSSA, AliasAnalysis &AA, DominatorTree &DT,
- WalkerCache &WC)
- : MSSA(MSSA), AA(AA), DT(DT), WC(WC), UseCache(true) {}
+ ClobberWalker(const MemorySSA &MSSA, AliasAnalysis &AA, DominatorTree &DT)
+ : MSSA(MSSA), AA(AA), DT(DT) {}
- void reset() { WalkTargetCache.clear(); }
+ void reset() {}
/// Finds the nearest clobber for the given query, optimizing phis if
/// possible.
- MemoryAccess *findClobber(MemoryAccess *Start, UpwardsMemoryQuery &Q,
- bool UseWalkerCache = true) {
- setUseCache(UseWalkerCache);
+ MemoryAccess *findClobber(MemoryAccess *Start, UpwardsMemoryQuery &Q) {
Query = &Q;
MemoryAccess *Current = Start;
@@ -1012,13 +817,11 @@ public:
UpwardsWalkResult WalkResult = walkToPhiOrClobber(FirstDesc);
MemoryAccess *Result;
if (WalkResult.IsKnownClobber) {
- cacheDefPath(FirstDesc, WalkResult.Result);
Result = WalkResult.Result;
} else {
OptznResult OptRes = tryOptimizePhi(cast<MemoryPhi>(FirstDesc.Last),
Current, Q.StartingLoc);
verifyOptResult(OptRes);
- cacheOptResult(OptRes);
resetPhiOptznState();
Result = OptRes.PrimaryClobber.Clobber;
}
@@ -1049,41 +852,10 @@ struct RenamePassData {
} // anonymous namespace
namespace llvm {
-/// \brief A MemorySSAWalker that does AA walks and caching of lookups to
-/// disambiguate accesses.
-///
-/// FIXME: The current implementation of this can take quadratic space in rare
-/// cases. This can be fixed, but it is something to note until it is fixed.
-///
-/// In order to trigger this behavior, you need to store to N distinct locations
-/// (that AA can prove don't alias), perform M stores to other memory
-/// locations that AA can prove don't alias any of the initial N locations, and
-/// then load from all of the N locations. In this case, we insert M cache
-/// entries for each of the N loads.
-///
-/// For example:
-/// define i32 @foo() {
-/// %a = alloca i32, align 4
-/// %b = alloca i32, align 4
-/// store i32 0, i32* %a, align 4
-/// store i32 0, i32* %b, align 4
-///
-/// ; Insert M stores to other memory that doesn't alias %a or %b here
-///
-/// %c = load i32, i32* %a, align 4 ; Caches M entries in
-/// ; CachedUpwardsClobberingAccess for the
-/// ; MemoryLocation %a
-/// %d = load i32, i32* %b, align 4 ; Caches M entries in
-/// ; CachedUpwardsClobberingAccess for the
-/// ; MemoryLocation %b
-///
-/// ; For completeness' sake, loading %a or %b again would not cache *another*
-/// ; M entries.
-/// %r = add i32 %c, %d
-/// ret i32 %r
-/// }
+/// \brief A MemorySSAWalker that does AA walks to disambiguate accesses. It
+/// no longer does caching on its own, but the name has been retained for the
+/// moment.
class MemorySSA::CachingWalker final : public MemorySSAWalker {
- WalkerCache Cache;
ClobberWalker Walker;
bool AutoResetWalker;
@@ -1104,10 +876,7 @@ public:
/// answer a clobber query.
void setAutoResetWalker(bool AutoReset) { AutoResetWalker = AutoReset; }
- /// Drop the walker's persistent data structures. At the moment, this means
- /// "drop the walker's cache of BasicBlocks ->
- /// earliest-MemoryAccess-we-can-optimize-to". This is necessary if we're
- /// going to have DT updates, if we remove MemoryAccesses, etc.
+ /// Drop the walker's persistent data structures.
void resetClobberWalker() { Walker.reset(); }
void verify(const MemorySSA *MSSA) override {
@@ -1116,18 +885,37 @@ public:
}
};
+void MemorySSA::renameSuccessorPhis(BasicBlock *BB, MemoryAccess *IncomingVal,
+ bool RenameAllUses) {
+ // Pass through values to our successors
+ for (const BasicBlock *S : successors(BB)) {
+ auto It = PerBlockAccesses.find(S);
+ // Rename the phi nodes in our successor block
+ if (It == PerBlockAccesses.end() || !isa<MemoryPhi>(It->second->front()))
+ continue;
+ AccessList *Accesses = It->second.get();
+ auto *Phi = cast<MemoryPhi>(&Accesses->front());
+ if (RenameAllUses) {
+ int PhiIndex = Phi->getBasicBlockIndex(BB);
+ assert(PhiIndex != -1 && "Incomplete phi during partial rename");
+ Phi->setIncomingValue(PhiIndex, IncomingVal);
+ } else
+ Phi->addIncoming(IncomingVal, BB);
+ }
+}
+
/// \brief Rename a single basic block into MemorySSA form.
/// Uses the standard SSA renaming algorithm.
/// \returns The new incoming value.
-MemoryAccess *MemorySSA::renameBlock(BasicBlock *BB,
- MemoryAccess *IncomingVal) {
+MemoryAccess *MemorySSA::renameBlock(BasicBlock *BB, MemoryAccess *IncomingVal,
+ bool RenameAllUses) {
auto It = PerBlockAccesses.find(BB);
// Skip most processing if the list is empty.
if (It != PerBlockAccesses.end()) {
AccessList *Accesses = It->second.get();
for (MemoryAccess &L : *Accesses) {
if (MemoryUseOrDef *MUD = dyn_cast<MemoryUseOrDef>(&L)) {
- if (MUD->getDefiningAccess() == nullptr)
+ if (MUD->getDefiningAccess() == nullptr || RenameAllUses)
MUD->setDefiningAccess(IncomingVal);
if (isa<MemoryDef>(&L))
IncomingVal = &L;
@@ -1136,18 +924,6 @@ MemoryAccess *MemorySSA::renameBlock(BasicBlock *BB,
}
}
}
-
- // Pass through values to our successors
- for (const BasicBlock *S : successors(BB)) {
- auto It = PerBlockAccesses.find(S);
- // Rename the phi nodes in our successor block
- if (It == PerBlockAccesses.end() || !isa<MemoryPhi>(It->second->front()))
- continue;
- AccessList *Accesses = It->second.get();
- auto *Phi = cast<MemoryPhi>(&Accesses->front());
- Phi->addIncoming(IncomingVal, BB);
- }
-
return IncomingVal;
}
@@ -1156,11 +932,19 @@ MemoryAccess *MemorySSA::renameBlock(BasicBlock *BB,
/// We walk the dominator tree in preorder, renaming accesses, and then filling
/// in phi nodes in our successors.
void MemorySSA::renamePass(DomTreeNode *Root, MemoryAccess *IncomingVal,
- SmallPtrSet<BasicBlock *, 16> &Visited) {
+ SmallPtrSetImpl<BasicBlock *> &Visited,
+ bool SkipVisited, bool RenameAllUses) {
SmallVector<RenamePassData, 32> WorkStack;
- IncomingVal = renameBlock(Root->getBlock(), IncomingVal);
+ // Skip everything if we already renamed this block and we are skipping.
+ // Note: You can't sink this into the if, because we need it to occur
+ // regardless of whether we skip blocks or not.
+ bool AlreadyVisited = !Visited.insert(Root->getBlock()).second;
+ if (SkipVisited && AlreadyVisited)
+ return;
+
+ IncomingVal = renameBlock(Root->getBlock(), IncomingVal, RenameAllUses);
+ renameSuccessorPhis(Root->getBlock(), IncomingVal, RenameAllUses);
WorkStack.push_back({Root, Root->begin(), IncomingVal});
- Visited.insert(Root->getBlock());
while (!WorkStack.empty()) {
DomTreeNode *Node = WorkStack.back().DTN;
@@ -1173,20 +957,25 @@ void MemorySSA::renamePass(DomTreeNode *Root, MemoryAccess *IncomingVal,
DomTreeNode *Child = *ChildIt;
++WorkStack.back().ChildIt;
BasicBlock *BB = Child->getBlock();
- Visited.insert(BB);
- IncomingVal = renameBlock(BB, IncomingVal);
+ // Note: You can't sink this into the if, because we need it to occur
+ // regardless of whether we skip blocks or not.
+ AlreadyVisited = !Visited.insert(BB).second;
+ if (SkipVisited && AlreadyVisited) {
+ // We already visited this during our renaming, which can happen when
+ // being asked to rename multiple blocks. Figure out the incoming val,
+ // which is the last def.
+ // Incoming value can only change if there is a block def, and in that
+ // case, it's the last block def in the list.
+ if (auto *BlockDefs = getWritableBlockDefs(BB))
+ IncomingVal = &*BlockDefs->rbegin();
+ } else
+ IncomingVal = renameBlock(BB, IncomingVal, RenameAllUses);
+ renameSuccessorPhis(BB, IncomingVal, RenameAllUses);
WorkStack.push_back({Child, Child->begin(), IncomingVal});
}
}
}
-/// \brief Compute dominator levels, used by the phi insertion algorithm above.
-void MemorySSA::computeDomLevels(DenseMap<DomTreeNode *, unsigned> &DomLevels) {
- for (auto DFI = df_begin(DT->getRootNode()), DFE = df_end(DT->getRootNode());
- DFI != DFE; ++DFI)
- DomLevels[*DFI] = DFI.getPathLength() - 1;
-}
-
/// \brief This handles unreachable block accesses by deleting phi nodes in
/// unreachable blocks, and marking all other unreachable MemoryAccess's as
/// being uses of the live on entry definition.
@@ -1247,6 +1036,13 @@ MemorySSA::AccessList *MemorySSA::getOrCreateAccessList(const BasicBlock *BB) {
Res.first->second = make_unique<AccessList>();
return Res.first->second.get();
}
+MemorySSA::DefsList *MemorySSA::getOrCreateDefsList(const BasicBlock *BB) {
+ auto Res = PerBlockDefs.insert(std::make_pair(BB, nullptr));
+
+ if (Res.second)
+ Res.first->second = make_unique<DefsList>();
+ return Res.first->second.get();
+}
/// This class is a batch walker of all MemoryUse's in the program, and points
/// their defining access at the thing that actually clobbers them. Because it
@@ -1315,7 +1111,10 @@ void MemorySSA::OptimizeUses::optimizeUsesInBlock(
// Pop everything that doesn't dominate the current block off the stack,
// increment the PopEpoch to account for this.
- while (!VersionStack.empty()) {
+ while (true) {
+ assert(
+ !VersionStack.empty() &&
+ "Version stack should have liveOnEntry sentinel dominating everything");
BasicBlock *BackBlock = VersionStack.back()->getBlock();
if (DT->dominates(BackBlock, BB))
break;
@@ -1323,6 +1122,7 @@ void MemorySSA::OptimizeUses::optimizeUsesInBlock(
VersionStack.pop_back();
++PopEpoch;
}
+
for (MemoryAccess &MA : *Accesses) {
auto *MU = dyn_cast<MemoryUse>(&MA);
if (!MU) {
@@ -1443,20 +1243,13 @@ void MemorySSA::OptimizeUses::optimizeUsesInBlock(
/// Optimize uses to point to their actual clobbering definitions.
void MemorySSA::OptimizeUses::optimizeUses() {
-
- // We perform a non-recursive top-down dominator tree walk
- struct StackInfo {
- const DomTreeNode *Node;
- DomTreeNode::const_iterator Iter;
- };
-
SmallVector<MemoryAccess *, 16> VersionStack;
- SmallVector<StackInfo, 16> DomTreeWorklist;
DenseMap<MemoryLocOrCall, MemlocStackInfo> LocStackInfo;
VersionStack.push_back(MSSA->getLiveOnEntryDef());
unsigned long StackEpoch = 1;
unsigned long PopEpoch = 1;
+ // We perform a non-recursive top-down dominator tree walk.
for (const auto *DomNode : depth_first(DT->getRootNode()))
optimizeUsesInBlock(DomNode->getBlock(), StackEpoch, PopEpoch, VersionStack,
LocStackInfo);
@@ -1477,14 +1270,8 @@ void MemorySSA::placePHINodes(
});
// Now place MemoryPhi nodes.
- for (auto &BB : IDFBlocks) {
- // Insert phi node
- AccessList *Accesses = getOrCreateAccessList(BB);
- MemoryPhi *Phi = new MemoryPhi(BB->getContext(), BB, NextID++);
- ValueToMemoryAccess[BB] = Phi;
- // Phi's always are placed at the front of the block.
- Accesses->push_front(Phi);
- }
+ for (auto &BB : IDFBlocks)
+ createMemoryPhi(BB);
}
void MemorySSA::buildMemorySSA() {
@@ -1511,15 +1298,21 @@ void MemorySSA::buildMemorySSA() {
BBNumbers[&B] = NextBBNum++;
bool InsertIntoDef = false;
AccessList *Accesses = nullptr;
+ DefsList *Defs = nullptr;
for (Instruction &I : B) {
MemoryUseOrDef *MUD = createNewAccess(&I);
if (!MUD)
continue;
- InsertIntoDef |= isa<MemoryDef>(MUD);
if (!Accesses)
Accesses = getOrCreateAccessList(&B);
Accesses->push_back(MUD);
+ if (isa<MemoryDef>(MUD)) {
+ InsertIntoDef = true;
+ if (!Defs)
+ Defs = getOrCreateDefsList(&B);
+ Defs->push_back(*MUD);
+ }
}
if (InsertIntoDef)
DefiningBlocks.insert(&B);
@@ -1558,14 +1351,94 @@ MemorySSA::CachingWalker *MemorySSA::getWalkerImpl() {
return Walker.get();
}
+// This is a helper function used by the creation routines. It places NewAccess
+// into the access and defs lists for a given basic block, at the given
+// insertion point.
+void MemorySSA::insertIntoListsForBlock(MemoryAccess *NewAccess,
+ const BasicBlock *BB,
+ InsertionPlace Point) {
+ auto *Accesses = getOrCreateAccessList(BB);
+ if (Point == Beginning) {
+ // If it's a phi node, it goes first, otherwise, it goes after any phi
+ // nodes.
+ if (isa<MemoryPhi>(NewAccess)) {
+ Accesses->push_front(NewAccess);
+ auto *Defs = getOrCreateDefsList(BB);
+ Defs->push_front(*NewAccess);
+ } else {
+ auto AI = find_if_not(
+ *Accesses, [](const MemoryAccess &MA) { return isa<MemoryPhi>(MA); });
+ Accesses->insert(AI, NewAccess);
+ if (!isa<MemoryUse>(NewAccess)) {
+ auto *Defs = getOrCreateDefsList(BB);
+ auto DI = find_if_not(
+ *Defs, [](const MemoryAccess &MA) { return isa<MemoryPhi>(MA); });
+ Defs->insert(DI, *NewAccess);
+ }
+ }
+ } else {
+ Accesses->push_back(NewAccess);
+ if (!isa<MemoryUse>(NewAccess)) {
+ auto *Defs = getOrCreateDefsList(BB);
+ Defs->push_back(*NewAccess);
+ }
+ }
+ BlockNumberingValid.erase(BB);
+}
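+// For instance (illustrative): inserting a non-phi MemoryDef at Beginning of
+// a block whose access list is [Phi, Use, Def] yields [Phi, NewDef, Use, Def]
+// there and [Phi, NewDef, Def] in the defs list, since non-phi accesses are
+// placed after any phis but before everything else.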
+
+void MemorySSA::insertIntoListsBefore(MemoryAccess *What, const BasicBlock *BB,
+ AccessList::iterator InsertPt) {
+ auto *Accesses = getWritableBlockAccesses(BB);
+ bool WasEnd = InsertPt == Accesses->end();
+ Accesses->insert(AccessList::iterator(InsertPt), What);
+ if (!isa<MemoryUse>(What)) {
+ auto *Defs = getOrCreateDefsList(BB);
+ // If we got asked to insert at the end, we have an easy job, just shove it
+ // at the end. If we got asked to insert before an existing def, we also get
+ // an iterator. If we got asked to insert before a use, we have to hunt for
+ // the next def.
+ if (WasEnd) {
+ Defs->push_back(*What);
+ } else if (isa<MemoryDef>(InsertPt)) {
+ Defs->insert(InsertPt->getDefsIterator(), *What);
+ } else {
+ while (InsertPt != Accesses->end() && !isa<MemoryDef>(InsertPt))
+ ++InsertPt;
+ // Either we found a def, or we are inserting at the end
+ if (InsertPt == Accesses->end())
+ Defs->push_back(*What);
+ else
+ Defs->insert(InsertPt->getDefsIterator(), *What);
+ }
+ }
+ BlockNumberingValid.erase(BB);
+}
+
+// Move What before Where in the IR. The end result is that What will belong to
+// the right lists and have the right Block set, but will not otherwise be
+// correct. It will not have the right defining access, and if it is a def,
+// things below it will not properly be updated.
+void MemorySSA::moveTo(MemoryUseOrDef *What, BasicBlock *BB,
+ AccessList::iterator Where) {
+ // Keep it in the lookup tables, remove from the lists
+ removeFromLists(What, false);
+ What->setBlock(BB);
+ insertIntoListsBefore(What, BB, Where);
+}
+
+void MemorySSA::moveTo(MemoryUseOrDef *What, BasicBlock *BB,
+ InsertionPlace Point) {
+ removeFromLists(What, false);
+ What->setBlock(BB);
+ insertIntoListsForBlock(What, BB, Point);
+}
+
MemoryPhi *MemorySSA::createMemoryPhi(BasicBlock *BB) {
assert(!getMemoryAccess(BB) && "MemoryPhi already exists for this BB");
- AccessList *Accesses = getOrCreateAccessList(BB);
MemoryPhi *Phi = new MemoryPhi(BB->getContext(), BB, NextID++);
- ValueToMemoryAccess[BB] = Phi;
// Phi's always are placed at the front of the block.
- Accesses->push_front(Phi);
- BlockNumberingValid.erase(BB);
+ insertIntoListsForBlock(Phi, BB, Beginning);
+ ValueToMemoryAccess[BB] = Phi;
return Phi;
}
@@ -1580,72 +1453,19 @@ MemoryUseOrDef *MemorySSA::createDefinedAccess(Instruction *I,
return NewAccess;
}
-MemoryAccess *MemorySSA::createMemoryAccessInBB(Instruction *I,
- MemoryAccess *Definition,
- const BasicBlock *BB,
- InsertionPlace Point) {
- MemoryUseOrDef *NewAccess = createDefinedAccess(I, Definition);
- auto *Accesses = getOrCreateAccessList(BB);
- if (Point == Beginning) {
- // It goes after any phi nodes
- auto AI = find_if(
- *Accesses, [](const MemoryAccess &MA) { return !isa<MemoryPhi>(MA); });
-
- Accesses->insert(AI, NewAccess);
- } else {
- Accesses->push_back(NewAccess);
- }
- BlockNumberingValid.erase(BB);
- return NewAccess;
-}
-
-MemoryUseOrDef *MemorySSA::createMemoryAccessBefore(Instruction *I,
- MemoryAccess *Definition,
- MemoryUseOrDef *InsertPt) {
- assert(I->getParent() == InsertPt->getBlock() &&
- "New and old access must be in the same block");
- MemoryUseOrDef *NewAccess = createDefinedAccess(I, Definition);
- auto *Accesses = getOrCreateAccessList(InsertPt->getBlock());
- Accesses->insert(AccessList::iterator(InsertPt), NewAccess);
- BlockNumberingValid.erase(InsertPt->getBlock());
- return NewAccess;
-}
-
-MemoryUseOrDef *MemorySSA::createMemoryAccessAfter(Instruction *I,
- MemoryAccess *Definition,
- MemoryAccess *InsertPt) {
- assert(I->getParent() == InsertPt->getBlock() &&
- "New and old access must be in the same block");
- MemoryUseOrDef *NewAccess = createDefinedAccess(I, Definition);
- auto *Accesses = getOrCreateAccessList(InsertPt->getBlock());
- Accesses->insertAfter(AccessList::iterator(InsertPt), NewAccess);
- BlockNumberingValid.erase(InsertPt->getBlock());
- return NewAccess;
-}
-
-void MemorySSA::spliceMemoryAccessAbove(MemoryDef *Where,
- MemoryUseOrDef *What) {
- assert(What != getLiveOnEntryDef() &&
- Where != getLiveOnEntryDef() && "Can't splice (above) LOE.");
- assert(dominates(Where, What) && "Only upwards splices are permitted.");
-
- if (Where == What)
- return;
- if (isa<MemoryDef>(What)) {
- // TODO: possibly use removeMemoryAccess' more efficient RAUW
- What->replaceAllUsesWith(What->getDefiningAccess());
- What->setDefiningAccess(Where->getDefiningAccess());
- Where->setDefiningAccess(What);
+// Return true if the instruction has ordering constraints.
+// Note specifically that this only considers stores and loads
+// because others are still considered ModRef by getModRefInfo.
+static inline bool isOrdered(const Instruction *I) {
+ if (auto *SI = dyn_cast<StoreInst>(I)) {
+ if (!SI->isUnordered())
+ return true;
+ } else if (auto *LI = dyn_cast<LoadInst>(I)) {
+ if (!LI->isUnordered())
+ return true;
}
- AccessList *Src = getWritableBlockAccesses(What->getBlock());
- AccessList *Dest = getWritableBlockAccesses(Where->getBlock());
- Dest->splice(AccessList::iterator(Where), *Src, What);
-
- BlockNumberingValid.erase(What->getBlock());
- if (What->getBlock() != Where->getBlock())
- BlockNumberingValid.erase(Where->getBlock());
+ return false;
}
-
/// \brief Helper function to create new memory accesses
MemoryUseOrDef *MemorySSA::createNewAccess(Instruction *I) {
// The assume intrinsic has a control dependency which we model by claiming
@@ -1658,7 +1478,15 @@ MemoryUseOrDef *MemorySSA::createNewAccess(Instruction *I) {
// Find out what effect this instruction has on memory.
ModRefInfo ModRef = AA->getModRefInfo(I);
- bool Def = bool(ModRef & MRI_Mod);
+ // The isOrdered check is used to ensure that volatiles end up as defs
+ // (atomics end up as ModRef right now anyway). Until we separate the
+ // ordering chain from the memory chain, this enables people to see at least
+ // some relative ordering to volatiles. Note that getClobberingMemoryAccess
+ // will still give an answer that bypasses other volatile loads. TODO:
+ // Separate memory aliasing and ordering into two different chains so that we
+ // can precisely represent both "what memory will this read/write/is clobbered
+ // by" and "what instructions can I move this past".
+ bool Def = bool(ModRef & MRI_Mod) || isOrdered(I);
bool Use = bool(ModRef & MRI_Ref);
// It's possible for an instruction to not modify memory at all. During
@@ -1678,33 +1506,6 @@ MemoryUseOrDef *MemorySSA::createNewAccess(Instruction *I) {
return MUD;
}
-MemoryAccess *MemorySSA::findDominatingDef(BasicBlock *UseBlock,
- enum InsertionPlace Where) {
- // Handle the initial case
- if (Where == Beginning)
- // The only thing that could define us at the beginning is a phi node
- if (MemoryPhi *Phi = getMemoryAccess(UseBlock))
- return Phi;
-
- DomTreeNode *CurrNode = DT->getNode(UseBlock);
- // Need to be defined by our dominator
- if (Where == Beginning)
- CurrNode = CurrNode->getIDom();
- Where = End;
- while (CurrNode) {
- auto It = PerBlockAccesses.find(CurrNode->getBlock());
- if (It != PerBlockAccesses.end()) {
- auto &Accesses = It->second;
- for (MemoryAccess &RA : reverse(*Accesses)) {
- if (isa<MemoryDef>(RA) || isa<MemoryPhi>(RA))
- return &RA;
- }
- }
- CurrNode = CurrNode->getIDom();
- }
- return LiveOnEntryDef.get();
-}
-
/// \brief Returns true if \p Replacer dominates \p Replacee .
bool MemorySSA::dominatesUse(const MemoryAccess *Replacer,
const MemoryAccess *Replacee) const {
@@ -1722,24 +1523,7 @@ bool MemorySSA::dominatesUse(const MemoryAccess *Replacer,
return true;
}
-/// \brief If all arguments of a MemoryPHI are defined by the same incoming
-/// argument, return that argument.
-static MemoryAccess *onlySingleValue(MemoryPhi *MP) {
- MemoryAccess *MA = nullptr;
-
- for (auto &Arg : MP->operands()) {
- if (!MA)
- MA = cast<MemoryAccess>(Arg);
- else if (MA != Arg)
- return nullptr;
- }
- return MA;
-}
-
/// \brief Properly remove \p MA from all of MemorySSA's lookup tables.
-///
-/// Because of the way the intrusive list and use lists work, it is important to
-/// do removal in the right order.
void MemorySSA::removeFromLookups(MemoryAccess *MA) {
assert(MA->use_empty() &&
"Trying to remove memory access that still has uses");
@@ -1760,69 +1544,46 @@ void MemorySSA::removeFromLookups(MemoryAccess *MA) {
auto VMA = ValueToMemoryAccess.find(MemoryInst);
if (VMA->second == MA)
ValueToMemoryAccess.erase(VMA);
+}
+/// \brief Properly remove \p MA from all of MemorySSA's lists.
+///
+/// Because of the way the intrusive list and use lists work, it is important to
+/// do removal in the right order.
+/// ShouldDelete defaults to true, and will cause the memory access to also be
+/// deleted, not just removed.
+void MemorySSA::removeFromLists(MemoryAccess *MA, bool ShouldDelete) {
+ // The access list owns the reference, so we erase it from the non-owning list
+ // first.
+ if (!isa<MemoryUse>(MA)) {
+ auto DefsIt = PerBlockDefs.find(MA->getBlock());
+ std::unique_ptr<DefsList> &Defs = DefsIt->second;
+ Defs->remove(*MA);
+ if (Defs->empty())
+ PerBlockDefs.erase(DefsIt);
+ }
+
+ // The erase call here will delete it. If we don't want it deleted, we call
+ // remove instead.
auto AccessIt = PerBlockAccesses.find(MA->getBlock());
std::unique_ptr<AccessList> &Accesses = AccessIt->second;
- Accesses->erase(MA);
+ if (ShouldDelete)
+ Accesses->erase(MA);
+ else
+ Accesses->remove(MA);
+
if (Accesses->empty())
PerBlockAccesses.erase(AccessIt);
}
-void MemorySSA::removeMemoryAccess(MemoryAccess *MA) {
- assert(!isLiveOnEntryDef(MA) && "Trying to remove the live on entry def");
- // We can only delete phi nodes if they have no uses, or we can replace all
- // uses with a single definition.
- MemoryAccess *NewDefTarget = nullptr;
- if (MemoryPhi *MP = dyn_cast<MemoryPhi>(MA)) {
- // Note that it is sufficient to know that all edges of the phi node have
- // the same argument. If they do, by the definition of dominance frontiers
- // (which we used to place this phi), that argument must dominate this phi,
- // and thus, must dominate the phi's uses, and so we will not hit the assert
- // below.
- NewDefTarget = onlySingleValue(MP);
- assert((NewDefTarget || MP->use_empty()) &&
- "We can't delete this memory phi");
- } else {
- NewDefTarget = cast<MemoryUseOrDef>(MA)->getDefiningAccess();
- }
-
- // Re-point the uses at our defining access
- if (!MA->use_empty()) {
- // Reset optimized on users of this store, and reset the uses.
- // A few notes:
- // 1. This is a slightly modified version of RAUW to avoid walking the
- // uses twice here.
- // 2. If we wanted to be complete, we would have to reset the optimized
- // flags on users of phi nodes if doing the below makes a phi node have all
- // the same arguments. Instead, we prefer users to removeMemoryAccess those
- // phi nodes, because doing it here would be N^3.
- if (MA->hasValueHandle())
- ValueHandleBase::ValueIsRAUWd(MA, NewDefTarget);
- // Note: We assume MemorySSA is not used in metadata since it's not really
- // part of the IR.
-
- while (!MA->use_empty()) {
- Use &U = *MA->use_begin();
- if (MemoryUse *MU = dyn_cast<MemoryUse>(U.getUser()))
- MU->resetOptimized();
- U.set(NewDefTarget);
- }
- }
-
- // The call below to erase will destroy MA, so we can't change the order we
- // are doing things here
- removeFromLookups(MA);
-}
-
void MemorySSA::print(raw_ostream &OS) const {
MemorySSAAnnotatedWriter Writer(this);
F.print(OS, &Writer);
}
-void MemorySSA::dump() const {
- MemorySSAAnnotatedWriter Writer(this);
- F.print(dbgs(), &Writer);
-}
+#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
+LLVM_DUMP_METHOD void MemorySSA::dump() const { print(dbgs()); }
+#endif
void MemorySSA::verifyMemorySSA() const {
verifyDefUses(F);
@@ -1838,26 +1599,41 @@ void MemorySSA::verifyOrdering(Function &F) const {
// lists think, as well as the order in the blocks vs the order in the access
// lists.
SmallVector<MemoryAccess *, 32> ActualAccesses;
+ SmallVector<MemoryAccess *, 32> ActualDefs;
for (BasicBlock &B : F) {
const AccessList *AL = getBlockAccesses(&B);
+ const auto *DL = getBlockDefs(&B);
MemoryAccess *Phi = getMemoryAccess(&B);
- if (Phi)
+ if (Phi) {
ActualAccesses.push_back(Phi);
+ ActualDefs.push_back(Phi);
+ }
+
for (Instruction &I : B) {
MemoryAccess *MA = getMemoryAccess(&I);
- assert((!MA || AL) && "We have memory affecting instructions "
- "in this block but they are not in the "
- "access list");
- if (MA)
+ assert((!MA || (AL && (isa<MemoryUse>(MA) || DL))) &&
+ "We have memory affecting instructions "
+ "in this block but they are not in the "
+ "access list or defs list");
+ if (MA) {
ActualAccesses.push_back(MA);
+ if (isa<MemoryDef>(MA))
+ ActualDefs.push_back(MA);
+ }
}
// Either we hit the assert, really have no accesses, or we have both
- // accesses and an access list
- if (!AL)
+ // accesses and an access list.
+ // Same with defs.
+ if (!AL && !DL)
continue;
assert(AL->size() == ActualAccesses.size() &&
"We don't have the same number of accesses in the block as on the "
"access list");
+ assert((DL || ActualDefs.size() == 0) &&
+ "Either we should have a defs list, or we should have no defs");
+ assert((!DL || DL->size() == ActualDefs.size()) &&
+ "We don't have the same number of defs in the block as on the "
+ "def list");
auto ALI = AL->begin();
auto AAI = ActualAccesses.begin();
while (ALI != AL->end() && AAI != ActualAccesses.end()) {
@@ -1866,6 +1642,16 @@ void MemorySSA::verifyOrdering(Function &F) const {
++AAI;
}
ActualAccesses.clear();
+ if (DL) {
+ auto DLI = DL->begin();
+ auto ADI = ActualDefs.begin();
+ while (DLI != DL->end() && ADI != ActualDefs.end()) {
+ assert(&*DLI == *ADI && "Not the same defs in the same order");
+ ++DLI;
+ ++ADI;
+ }
+ }
+ ActualDefs.clear();
}
}
@@ -2066,8 +1852,11 @@ void MemoryUse::print(raw_ostream &OS) const {
}
void MemoryAccess::dump() const {
+// Cannot completely remove virtual function even in release mode.
+#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
print(dbgs());
dbgs() << "\n";
+#endif
}
char MemorySSAPrinterLegacyPass::ID = 0;
@@ -2145,35 +1934,13 @@ MemorySSAWalker::MemorySSAWalker(MemorySSA *M) : MSSA(M) {}
MemorySSA::CachingWalker::CachingWalker(MemorySSA *M, AliasAnalysis *A,
DominatorTree *D)
- : MemorySSAWalker(M), Walker(*M, *A, *D, Cache), AutoResetWalker(true) {}
+ : MemorySSAWalker(M), Walker(*M, *A, *D), AutoResetWalker(true) {}
MemorySSA::CachingWalker::~CachingWalker() {}
void MemorySSA::CachingWalker::invalidateInfo(MemoryAccess *MA) {
- // TODO: We can do much better cache invalidation with differently stored
- // caches. For now, for MemoryUses, we simply remove them
- // from the cache, and kill the entire call/non-call cache for everything
- // else. The problem is for phis or defs, currently we'd need to follow use
- // chains down and invalidate anything below us in the chain that currently
- // terminates at this access.
-
- // See if this is a MemoryUse, if so, just remove the cached info. MemoryUse
- // is by definition never a barrier, so nothing in the cache could point to
- // this use. In that case, we only need invalidate the info for the use
- // itself.
-
- if (MemoryUse *MU = dyn_cast<MemoryUse>(MA)) {
- UpwardsMemoryQuery Q(MU->getMemoryInst(), MU);
- Cache.remove(MU, Q.StartingLoc, Q.IsCall);
- MU->resetOptimized();
- } else {
- // If it is not a use, the best we can do right now is destroy the cache.
- Cache.clear();
- }
-
-#ifdef EXPENSIVE_CHECKS
- verifyRemoved(MA);
-#endif
+ if (auto *MUD = dyn_cast<MemoryUseOrDef>(MA))
+ MUD->resetOptimized();
}
/// \brief Walk the use-def chains starting at \p MA and find
@@ -2184,8 +1951,7 @@ MemoryAccess *MemorySSA::CachingWalker::getClobberingMemoryAccess(
MemoryAccess *StartingAccess, UpwardsMemoryQuery &Q) {
MemoryAccess *New = Walker.findClobber(StartingAccess, Q);
#ifdef EXPENSIVE_CHECKS
- MemoryAccess *NewNoCache =
- Walker.findClobber(StartingAccess, Q, /*UseWalkerCache=*/false);
+ MemoryAccess *NewNoCache = Walker.findClobber(StartingAccess, Q);
assert(NewNoCache == New && "Cache made us hand back a different result?");
#endif
if (AutoResetWalker)
@@ -2215,9 +1981,6 @@ MemoryAccess *MemorySSA::CachingWalker::getClobberingMemoryAccess(
Q.Inst = I;
Q.IsCall = false;
- if (auto *CacheResult = Cache.lookup(StartingUseOrDef, Loc, Q.IsCall))
- return CacheResult;
-
// Unlike the other function, do not walk to the def of a def, because we are
// handed something we already believe is the clobbering access.
MemoryAccess *DefiningAccess = isa<MemoryUse>(StartingUseOrDef)
@@ -2242,9 +2005,9 @@ MemorySSA::CachingWalker::getClobberingMemoryAccess(MemoryAccess *MA) {
// If this is an already optimized use or def, return the optimized result.
// Note: Currently, we do not store the optimized def result because we'd need
// a separate field, since we can't use it as the defining access.
- if (MemoryUse *MU = dyn_cast<MemoryUse>(StartingAccess))
- if (MU->isOptimized())
- return MU->getDefiningAccess();
+ if (auto *MUD = dyn_cast<MemoryUseOrDef>(StartingAccess))
+ if (MUD->isOptimized())
+ return MUD->getOptimized();
const Instruction *I = StartingAccess->getMemoryInst();
UpwardsMemoryQuery Q(I, StartingAccess);
@@ -2254,14 +2017,10 @@ MemorySSA::CachingWalker::getClobberingMemoryAccess(MemoryAccess *MA) {
if (!Q.IsCall && I->isFenceLike())
return StartingAccess;
- if (auto *CacheResult = Cache.lookup(StartingAccess, Q.StartingLoc, Q.IsCall))
- return CacheResult;
-
if (isUseTriviallyOptimizableToLiveOnEntry(*MSSA->AA, I)) {
MemoryAccess *LiveOnEntry = MSSA->getLiveOnEntryDef();
- Cache.insert(StartingAccess, LiveOnEntry, Q.StartingLoc, Q.IsCall);
- if (MemoryUse *MU = dyn_cast<MemoryUse>(StartingAccess))
- MU->setDefiningAccess(LiveOnEntry, true);
+ if (auto *MUD = dyn_cast<MemoryUseOrDef>(StartingAccess))
+ MUD->setOptimized(LiveOnEntry);
return LiveOnEntry;
}
@@ -2278,17 +2037,12 @@ MemorySSA::CachingWalker::getClobberingMemoryAccess(MemoryAccess *MA) {
DEBUG(dbgs() << *DefiningAccess << "\n");
DEBUG(dbgs() << "Final Memory SSA clobber for " << *I << " is ");
DEBUG(dbgs() << *Result << "\n");
- if (MemoryUse *MU = dyn_cast<MemoryUse>(StartingAccess))
- MU->setDefiningAccess(Result, true);
+ if (auto *MUD = dyn_cast<MemoryUseOrDef>(StartingAccess))
+ MUD->setOptimized(Result);
return Result;
}
-// Verify that MA doesn't exist in any of the caches.
-void MemorySSA::CachingWalker::verifyRemoved(MemoryAccess *MA) {
- assert(!Cache.contains(MA) && "Found removed MemoryAccess in cache.");
-}
-
MemoryAccess *
DoNothingMemorySSAWalker::getClobberingMemoryAccess(MemoryAccess *MA) {
if (auto *Use = dyn_cast<MemoryUseOrDef>(MA))
diff --git a/lib/Analysis/MemorySSAUpdater.cpp b/lib/Analysis/MemorySSAUpdater.cpp
new file mode 100644
index 000000000000..c63677fe5502
--- /dev/null
+++ b/lib/Analysis/MemorySSAUpdater.cpp
@@ -0,0 +1,494 @@
+//===-- MemorySSAUpdater.cpp - Memory SSA Updater--------------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------===//
+//
+// This file implements the MemorySSAUpdater class.
+//
+//===----------------------------------------------------------------===//
+#include "llvm/Analysis/MemorySSAUpdater.h"
+#include "llvm/ADT/STLExtras.h"
+#include "llvm/ADT/SmallPtrSet.h"
+#include "llvm/ADT/SmallSet.h"
+#include "llvm/IR/DataLayout.h"
+#include "llvm/IR/Dominators.h"
+#include "llvm/IR/GlobalVariable.h"
+#include "llvm/IR/IRBuilder.h"
+#include "llvm/IR/IntrinsicInst.h"
+#include "llvm/IR/LLVMContext.h"
+#include "llvm/IR/Metadata.h"
+#include "llvm/IR/Module.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/FormattedStream.h"
+#include "llvm/Analysis/MemorySSA.h"
+#include <algorithm>
+
+#define DEBUG_TYPE "memoryssa"
+using namespace llvm;
+namespace llvm {
+// This is the marker algorithm from "Simple and Efficient Construction of
+// Static Single Assignment Form".
+// The simple, non-marker algorithm places phi nodes at every join point.
+// Here, we place markers, and only place phi nodes if they end up necessary.
+// They are only necessary if they break a cycle (IE we recursively visit
+// ourselves again), or we discover, while getting the value of the operands,
+// that there are two or more definitions needing to be merged.
+// This will still leave non-minimal form in the case of irreducible control
+// flow, where phi nodes may be in cycles with themselves but be unnecessary.
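+//
+// Illustrative sketch (block names hypothetical): given a loop
+//   entry -> header; header -> body; body -> header; header -> exit
+// asking for the previous def at the end of `body` recurses into `header`,
+// whose predecessors include `body` itself. Reaching `header` again while it
+// is still marked visited inserts a MemoryPhi there to break the cycle; if
+// its operands later all turn out identical, tryRemoveTrivialPhi cleans it
+// up.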
+MemoryAccess *MemorySSAUpdater::getPreviousDefRecursive(BasicBlock *BB) {
+ // Single predecessor case, just recurse, we can only have one definition.
+ if (BasicBlock *Pred = BB->getSinglePredecessor()) {
+ return getPreviousDefFromEnd(Pred);
+ } else if (VisitedBlocks.count(BB)) {
+ // We hit our node again, meaning we had a cycle, we must insert a phi
+ // node to break it so we have an operand. The only case this will
+ // insert useless phis is if we have irreducible control flow.
+ return MSSA->createMemoryPhi(BB);
+ } else if (VisitedBlocks.insert(BB).second) {
+ // Mark us visited so we can detect a cycle
+ SmallVector<MemoryAccess *, 8> PhiOps;
+
+ // Recurse to get the values in our predecessors for placement of a
+ // potential phi node. This will insert phi nodes if we cycle in order to
+ // break the cycle and have an operand.
+ for (auto *Pred : predecessors(BB))
+ PhiOps.push_back(getPreviousDefFromEnd(Pred));
+
+ // Now try to simplify the ops to avoid placing a phi.
+ // This may return null if we never created a phi yet; that's okay.
+ MemoryPhi *Phi = dyn_cast_or_null<MemoryPhi>(MSSA->getMemoryAccess(BB));
+ bool PHIExistsButNeedsUpdate = false;
+ // See if the existing phi operands match what we need.
+ // Unlike normal SSA, we only allow one phi node per block, so we can't just
+ // create a new one.
+ if (Phi && Phi->getNumOperands() != 0)
+ if (!std::equal(Phi->op_begin(), Phi->op_end(), PhiOps.begin())) {
+ PHIExistsButNeedsUpdate = true;
+ }
+
+ // See if we can avoid the phi by simplifying it.
+ auto *Result = tryRemoveTrivialPhi(Phi, PhiOps);
+ // If we couldn't simplify, we may have to create a phi
+ if (Result == Phi) {
+ if (!Phi)
+ Phi = MSSA->createMemoryPhi(BB);
+
+ // These will have been filled in by the recursive read we did above.
+ if (PHIExistsButNeedsUpdate) {
+ std::copy(PhiOps.begin(), PhiOps.end(), Phi->op_begin());
+ std::copy(pred_begin(BB), pred_end(BB), Phi->block_begin());
+ } else {
+ unsigned i = 0;
+ for (auto *Pred : predecessors(BB))
+ Phi->addIncoming(PhiOps[i++], Pred);
+ }
+
+ Result = Phi;
+ }
+ if (MemoryPhi *MP = dyn_cast<MemoryPhi>(Result))
+ InsertedPHIs.push_back(MP);
+ // Set ourselves up for the next variable by resetting visited state.
+ VisitedBlocks.erase(BB);
+ return Result;
+ }
+ llvm_unreachable("Should have hit one of the three cases above");
+}
+
+// This starts at the memory access, and goes backwards in the block to find the
+// previous definition. If a definition is not found in the block of the access,
+// it continues globally, creating phi nodes to ensure we have a single
+// definition.
+MemoryAccess *MemorySSAUpdater::getPreviousDef(MemoryAccess *MA) {
+ auto *LocalResult = getPreviousDefInBlock(MA);
+
+ return LocalResult ? LocalResult : getPreviousDefRecursive(MA->getBlock());
+}
+
+// This starts at the memory access, and goes backwards in the block to find
+// the previous definition. If the definition is not found in the block of the
+// access, it returns nullptr.
+MemoryAccess *MemorySSAUpdater::getPreviousDefInBlock(MemoryAccess *MA) {
+ auto *Defs = MSSA->getWritableBlockDefs(MA->getBlock());
+
+ // It's possible there are no defs, or we got handed the first def to start.
+ if (Defs) {
+ // If this is a def, we can just use the def iterators.
+ if (!isa<MemoryUse>(MA)) {
+ auto Iter = MA->getReverseDefsIterator();
+ ++Iter;
+ if (Iter != Defs->rend())
+ return &*Iter;
+ } else {
+ // Otherwise, we have to walk the all-access iterator.
+ auto Iter = MA->getReverseIterator();
+ ++Iter;
+ while (&*Iter != &*Defs->begin()) {
+ if (!isa<MemoryUse>(*Iter))
+ return &*Iter;
+ --Iter;
+ }
+ // At this point it must be pointing at the first def
+ assert(&*Iter == &*Defs->begin() &&
+ "Should have hit first def walking backwards");
+ return &*Iter;
+ }
+ }
+ return nullptr;
+}
+
+// This starts at the end of the block
+MemoryAccess *MemorySSAUpdater::getPreviousDefFromEnd(BasicBlock *BB) {
+ auto *Defs = MSSA->getWritableBlockDefs(BB);
+
+ if (Defs)
+ return &*Defs->rbegin();
+
+ return getPreviousDefRecursive(BB);
+}
+// Recurse over a set of phi uses to eliminate the trivial ones
+MemoryAccess *MemorySSAUpdater::recursePhi(MemoryAccess *Phi) {
+ if (!Phi)
+ return nullptr;
+ TrackingVH<MemoryAccess> Res(Phi);
+ SmallVector<TrackingVH<Value>, 8> Uses;
+ std::copy(Phi->user_begin(), Phi->user_end(), std::back_inserter(Uses));
+ for (auto &U : Uses) {
+ if (MemoryPhi *UsePhi = dyn_cast<MemoryPhi>(&*U)) {
+ auto OperRange = UsePhi->operands();
+ tryRemoveTrivialPhi(UsePhi, OperRange);
+ }
+ }
+ return Res;
+}
+
+// Eliminate trivial phis.
+// Phis are trivial if they are defined either by themselves, or by all the
+// same argument.
+// IE phi(a, a) or b = phi(a, b) or c = phi(a, a, c)
+// We recursively try to remove them.
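+// Illustrative chain (hypothetical values): replacing b = phi(a, b) with a
+// rewrites a user c = phi(b, b) into c = phi(a, a), which is itself trivial;
+// recursePhi revisits such users so the simplification propagates.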
+template <class RangeType>
+MemoryAccess *MemorySSAUpdater::tryRemoveTrivialPhi(MemoryPhi *Phi,
+ RangeType &Operands) {
+ // Detect equal or self arguments
+ MemoryAccess *Same = nullptr;
+ for (auto &Op : Operands) {
+ // If the same or self, good so far
+ if (Op == Phi || Op == Same)
+ continue;
+ // not the same, return the phi since it's not eliminatable by us
+ if (Same)
+ return Phi;
+ Same = cast<MemoryAccess>(Op);
+ }
+ // Never found a non-self reference, the phi is undef
+ if (Same == nullptr)
+ return MSSA->getLiveOnEntryDef();
+ if (Phi) {
+ Phi->replaceAllUsesWith(Same);
+ removeMemoryAccess(Phi);
+ }
+
+ // We should only end up recursing in case we replaced something, in which
+ // case, we may have made other Phis trivial.
+ return recursePhi(Same);
+}
+
+void MemorySSAUpdater::insertUse(MemoryUse *MU) {
+ InsertedPHIs.clear();
+ MU->setDefiningAccess(getPreviousDef(MU));
+ // Unlike for defs, there is no extra work to do. Because uses do not create
+ // new may-defs, there are only two cases:
+ //
+ // 1. There was a def already below us, and therefore, we should not have
+ // created a phi node because it was already needed for the def.
+ //
+ // 2. There is no def below us, and therefore, there is no extra renaming work
+ // to do.
+}
+
+// Set every incoming edge {BB, MP->getBlock()} of MemoryPhi MP to NewDef.
+void setMemoryPhiValueForBlock(MemoryPhi *MP, const BasicBlock *BB,
+ MemoryAccess *NewDef) {
+ // Replace every phi operand whose incoming block is BB with the new
+ // defining access.
+ int i = MP->getBasicBlockIndex(BB);
+ assert(i != -1 && "Should have found the basic block in the phi");
+ // We can't just compare i against getNumOperands since one is signed and the
+ // other not. So use it to index into the block iterator.
+ for (auto BBIter = MP->block_begin() + i; BBIter != MP->block_end();
+ ++BBIter) {
+ if (*BBIter != BB)
+ break;
+ MP->setIncomingValue(i, NewDef);
+ ++i;
+ }
+}
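+
+// For example (hypothetical CFG): a switch in BB with two cases branching to
+// MP's block produces two phi entries whose incoming block is BB; the loop
+// above rewrites both of them to NewDef, which is why it keeps scanning while
+// the incoming block is still BB.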
+
+// A brief description of the algorithm:
+// First, we compute what should define the new def, using the SSA
+// construction algorithm.
+// Then, we update the defs below us (and any new phi nodes) in the graph to
+// point to the correct new defs, to ensure we only have one variable, and no
+// disconnected stores.
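+//
+// Illustrative same-block case (hypothetical defs list [D1, D2]): inserting a
+// new def MD between D1 and D2 makes D1 the defining access of MD, and every
+// MemoryDef/MemoryPhi user of D1 (here D2) is retargeted to MD. MemoryUses
+// are deliberately left alone below; they can be renamed afterwards when
+// RenameUses is set.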
+void MemorySSAUpdater::insertDef(MemoryDef *MD, bool RenameUses) {
+ InsertedPHIs.clear();
+
+ // See if we had a local def, and if not, go hunting.
+ MemoryAccess *DefBefore = getPreviousDefInBlock(MD);
+ bool DefBeforeSameBlock = DefBefore != nullptr;
+ if (!DefBefore)
+ DefBefore = getPreviousDefRecursive(MD->getBlock());
+
+ // There is a def before us, which means we can replace any store/phi uses
+ // of that thing with us, since we are in the way of whatever was there
+ // before.
+ // We now become the defining access for that def's MemoryDef and
+ // MemoryPhi users.
+ if (DefBeforeSameBlock) {
+ for (auto UI = DefBefore->use_begin(), UE = DefBefore->use_end();
+ UI != UE;) {
+ Use &U = *UI++;
+ // Leave the uses alone
+ if (isa<MemoryUse>(U.getUser()))
+ continue;
+ U.set(MD);
+ }
+ }
+
+ // and that def is now our defining access.
+ // We change them in this order otherwise we will appear in the use list
+ // above and reset ourselves.
+ MD->setDefiningAccess(DefBefore);
+
+ SmallVector<MemoryAccess *, 8> FixupList(InsertedPHIs.begin(),
+ InsertedPHIs.end());
+ if (!DefBeforeSameBlock) {
+ // If there was a local def before us, we must have the same effect it
+ // did. Because every may-def is the same, any phis/etc we would create, it
+ // would also have created. If there was no local def before us, we
+ // performed a global update, and have to search all successors and make
+ // sure we update the first def in each of them (following all paths until
+ // we hit the first def along each path). This may also insert phi nodes.
+ // TODO: There are other cases we can skip this work, such as when we have a
+ // single successor, and only used a straight line of single pred blocks
+ // backwards to find the def. To make that work, we'd have to track whether
+ // getDefRecursive only ever used the single predecessor case. These types
+ // of paths also only exist in between CFG simplifications.
+ FixupList.push_back(MD);
+ }
+
+ while (!FixupList.empty()) {
+ unsigned StartingPHISize = InsertedPHIs.size();
+ fixupDefs(FixupList);
+ FixupList.clear();
+ // Put any new phis on the fixup list, and process them
+ FixupList.append(InsertedPHIs.end() - StartingPHISize, InsertedPHIs.end());
+ }
+ // Now that all fixups are done, rename all uses if we are asked.
+ if (RenameUses) {
+ SmallPtrSet<BasicBlock *, 16> Visited;
+ BasicBlock *StartBlock = MD->getBlock();
+ // We are guaranteed there is a def in the block, because we just got it
+ // handed to us in this function.
+ MemoryAccess *FirstDef = &*MSSA->getWritableBlockDefs(StartBlock)->begin();
+ // Convert to incoming value if it's a memorydef. A phi *is* already an
+ // incoming value.
+ if (auto *MD = dyn_cast<MemoryDef>(FirstDef))
+ FirstDef = MD->getDefiningAccess();
+
+ MSSA->renamePass(MD->getBlock(), FirstDef, Visited);
+ // We just inserted a phi into this block, so the incoming value will become
+ // the phi anyway, so it does not matter what we pass.
+ for (auto *MP : InsertedPHIs)
+ MSSA->renamePass(MP->getBlock(), nullptr, Visited);
+ }
+}
+
+void MemorySSAUpdater::fixupDefs(const SmallVectorImpl<MemoryAccess *> &Vars) {
+ SmallPtrSet<const BasicBlock *, 8> Seen;
+ SmallVector<const BasicBlock *, 16> Worklist;
+ for (auto *NewDef : Vars) {
+ // First, see if there is a local def after the operand.
+ auto *Defs = MSSA->getWritableBlockDefs(NewDef->getBlock());
+ auto DefIter = NewDef->getDefsIterator();
+
+ // If there is a local def after us, we only have to rename that.
+ if (++DefIter != Defs->end()) {
+ cast<MemoryDef>(DefIter)->setDefiningAccess(NewDef);
+ continue;
+ }
+
+ // Otherwise, we need to search down through the CFG.
+ // For each of our successors, handle it directly if there is a phi, or
+ // place it on the fixup worklist.
+ for (const auto *S : successors(NewDef->getBlock())) {
+ if (auto *MP = MSSA->getMemoryAccess(S))
+ setMemoryPhiValueForBlock(MP, NewDef->getBlock(), NewDef);
+ else
+ Worklist.push_back(S);
+ }
+
+ while (!Worklist.empty()) {
+ const BasicBlock *FixupBlock = Worklist.back();
+ Worklist.pop_back();
+
+ // Get the first def in the block that isn't a phi node.
+ if (auto *Defs = MSSA->getWritableBlockDefs(FixupBlock)) {
+ auto *FirstDef = &*Defs->begin();
+ // The loop above and below should have taken care of phi nodes
+ assert(!isa<MemoryPhi>(FirstDef) &&
+ "Should have already handled phi nodes!");
+ // We are now this def's defining access, make sure we actually dominate
+ // it
+ assert(MSSA->dominates(NewDef, FirstDef) &&
+ "Should have dominated the new access");
+
+ // This may insert new phi nodes, because we are not guaranteed the
+ // block we are processing has a single pred, and depending on where the
+ // store was inserted, it may require phi nodes below it.
+ cast<MemoryDef>(FirstDef)->setDefiningAccess(getPreviousDef(FirstDef));
+ return;
+ }
+ // We didn't find a def, so we must continue.
+ for (const auto *S : successors(FixupBlock)) {
+ // If there is a phi node, handle it.
+ // Otherwise, put the block on the worklist
+ if (auto *MP = MSSA->getMemoryAccess(S))
+ setMemoryPhiValueForBlock(MP, FixupBlock, NewDef);
+ else {
+ // If we cycle, we should have ended up at a phi node that we already
+ // processed. FIXME: Double check this
+ if (!Seen.insert(S).second)
+ continue;
+ Worklist.push_back(S);
+ }
+ }
+ }
+ }
+}
+
+// Move What before Where in the MemorySSA IR.
+template <class WhereType>
+void MemorySSAUpdater::moveTo(MemoryUseOrDef *What, BasicBlock *BB,
+ WhereType Where) {
+ // Replace all our users with our defining access.
+ What->replaceAllUsesWith(What->getDefiningAccess());
+
+ // Let MemorySSA take care of moving it around in the lists.
+ MSSA->moveTo(What, BB, Where);
+
+ // Now reinsert it into the IR and do whatever fixups needed.
+ if (auto *MD = dyn_cast<MemoryDef>(What))
+ insertDef(MD);
+ else
+ insertUse(cast<MemoryUse>(What));
+}
+
+// Move What before Where in the MemorySSA IR.
+void MemorySSAUpdater::moveBefore(MemoryUseOrDef *What, MemoryUseOrDef *Where) {
+ moveTo(What, Where->getBlock(), Where->getIterator());
+}
+
+// Move What after Where in the MemorySSA IR.
+void MemorySSAUpdater::moveAfter(MemoryUseOrDef *What, MemoryUseOrDef *Where) {
+ moveTo(What, Where->getBlock(), ++Where->getIterator());
+}
+
+void MemorySSAUpdater::moveToPlace(MemoryUseOrDef *What, BasicBlock *BB,
+ MemorySSA::InsertionPlace Where) {
+ return moveTo(What, BB, Where);
+}
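+
+// Illustrative usage (hypothetical caller): a pass hoisting a store SI out of
+// a loop body into the preheader could keep MemorySSA in sync with
+//   Updater.moveToPlace(cast<MemoryUseOrDef>(MSSA.getMemoryAccess(SI)),
+//                       Preheader, MemorySSA::End);
+// after moving the instruction itself.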
+
+/// \brief If all arguments of a MemoryPHI are defined by the same incoming
+/// argument, return that argument.
+static MemoryAccess *onlySingleValue(MemoryPhi *MP) {
+ MemoryAccess *MA = nullptr;
+
+ for (auto &Arg : MP->operands()) {
+ if (!MA)
+ MA = cast<MemoryAccess>(Arg);
+ else if (MA != Arg)
+ return nullptr;
+ }
+ return MA;
+}
+void MemorySSAUpdater::removeMemoryAccess(MemoryAccess *MA) {
+ assert(!MSSA->isLiveOnEntryDef(MA) &&
+ "Trying to remove the live on entry def");
+ // We can only delete phi nodes if they have no uses, or we can replace all
+ // uses with a single definition.
+ MemoryAccess *NewDefTarget = nullptr;
+ if (MemoryPhi *MP = dyn_cast<MemoryPhi>(MA)) {
+ // Note that it is sufficient to know that all edges of the phi node have
+ // the same argument. If they do, by the definition of dominance frontiers
+ // (which we used to place this phi), that argument must dominate this phi,
+ // and thus, must dominate the phi's uses, and so we will not hit the assert
+ // below.
+ NewDefTarget = onlySingleValue(MP);
+ assert((NewDefTarget || MP->use_empty()) &&
+ "We can't delete this memory phi");
+ } else {
+ NewDefTarget = cast<MemoryUseOrDef>(MA)->getDefiningAccess();
+ }
+
+ // Re-point the uses at our defining access
+ if (!isa<MemoryUse>(MA) && !MA->use_empty()) {
+ // Reset optimized on users of this store, and reset the uses.
+ // A few notes:
+ // 1. This is a slightly modified version of RAUW to avoid walking the
+ // uses twice here.
+ // 2. If we wanted to be complete, we would have to reset the optimized
+ // flags on users of phi nodes if doing the below makes a phi node have all
+ // the same arguments. Instead, we prefer users to removeMemoryAccess those
+ // phi nodes, because doing it here would be N^3.
+ if (MA->hasValueHandle())
+ ValueHandleBase::ValueIsRAUWd(MA, NewDefTarget);
+ // Note: We assume MemorySSA is not used in metadata since it's not really
+ // part of the IR.
+
+ while (!MA->use_empty()) {
+ Use &U = *MA->use_begin();
+ if (auto *MUD = dyn_cast<MemoryUseOrDef>(U.getUser()))
+ MUD->resetOptimized();
+ U.set(NewDefTarget);
+ }
+ }
+
+ // The call below to erase will destroy MA, so we can't change the order we
+ // are doing things here
+ MSSA->removeFromLookups(MA);
+ MSSA->removeFromLists(MA);
+}
+
+MemoryAccess *MemorySSAUpdater::createMemoryAccessInBB(
+ Instruction *I, MemoryAccess *Definition, const BasicBlock *BB,
+ MemorySSA::InsertionPlace Point) {
+ MemoryUseOrDef *NewAccess = MSSA->createDefinedAccess(I, Definition);
+ MSSA->insertIntoListsForBlock(NewAccess, BB, Point);
+ return NewAccess;
+}
+
+MemoryUseOrDef *MemorySSAUpdater::createMemoryAccessBefore(
+ Instruction *I, MemoryAccess *Definition, MemoryUseOrDef *InsertPt) {
+ assert(I->getParent() == InsertPt->getBlock() &&
+ "New and old access must be in the same block");
+ MemoryUseOrDef *NewAccess = MSSA->createDefinedAccess(I, Definition);
+ MSSA->insertIntoListsBefore(NewAccess, InsertPt->getBlock(),
+ InsertPt->getIterator());
+ return NewAccess;
+}
+
+MemoryUseOrDef *MemorySSAUpdater::createMemoryAccessAfter(
+ Instruction *I, MemoryAccess *Definition, MemoryAccess *InsertPt) {
+ assert(I->getParent() == InsertPt->getBlock() &&
+ "New and old access must be in the same block");
+ MemoryUseOrDef *NewAccess = MSSA->createDefinedAccess(I, Definition);
+ MSSA->insertIntoListsBefore(NewAccess, InsertPt->getBlock(),
+ ++InsertPt->getIterator());
+ return NewAccess;
+}
+
+} // namespace llvm
diff --git a/lib/Analysis/ModuleSummaryAnalysis.cpp b/lib/Analysis/ModuleSummaryAnalysis.cpp
index f5ba637e58e2..f6d9a73e4e9a 100644
--- a/lib/Analysis/ModuleSummaryAnalysis.cpp
+++ b/lib/Analysis/ModuleSummaryAnalysis.cpp
@@ -28,7 +28,7 @@
#include "llvm/IR/InstIterator.h"
#include "llvm/IR/IntrinsicInst.h"
#include "llvm/IR/ValueSymbolTable.h"
-#include "llvm/Object/IRObjectFile.h"
+#include "llvm/Object/ModuleSymbolTable.h"
#include "llvm/Pass.h"
using namespace llvm;
@@ -84,6 +84,92 @@ static bool isNonRenamableLocal(const GlobalValue &GV) {
return GV.hasSection() && GV.hasLocalLinkage();
}
+/// Determine whether this call has all constant integer arguments (excluding
+/// "this") and summarize it to VCalls or ConstVCalls as appropriate.
+static void addVCallToSet(DevirtCallSite Call, GlobalValue::GUID Guid,
+ SetVector<FunctionSummary::VFuncId> &VCalls,
+ SetVector<FunctionSummary::ConstVCall> &ConstVCalls) {
+ std::vector<uint64_t> Args;
+ // Start from the second argument to skip the "this" pointer.
+ for (auto &Arg : make_range(Call.CS.arg_begin() + 1, Call.CS.arg_end())) {
+ auto *CI = dyn_cast<ConstantInt>(Arg);
+ if (!CI || CI->getBitWidth() > 64) {
+ VCalls.insert({Guid, Call.Offset});
+ return;
+ }
+ Args.push_back(CI->getZExtValue());
+ }
+ ConstVCalls.insert({{Guid, Call.Offset}, std::move(Args)});
+}
+
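The classification in addVCallToSet is all-or-nothing: a single non-constant or wider-than-64-bit argument demotes the whole call to the plain VCalls set. The same decision over toy argument records, as a standalone sketch (the types here are made up, not LLVM's):

#include <cstdint>
#include <optional>
#include <vector>

// Toy stand-in for one call argument after the "this" pointer.
struct Arg {
  bool IsConstInt;     // is the argument a constant integer?
  unsigned BitWidth;   // its integer width, if so
  uint64_t ZExtValue;  // its zero-extended value, if so
};

// Returns the argument vector for a ConstVCall, or nullopt if the call
// must be recorded as a plain VCall instead.
std::optional<std::vector<uint64_t>> classify(const std::vector<Arg> &Args) {
  std::vector<uint64_t> Out;
  for (const Arg &A : Args) {
    if (!A.IsConstInt || A.BitWidth > 64)
      return std::nullopt;
    Out.push_back(A.ZExtValue);
  }
  return Out;
}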
+/// If this intrinsic call requires that we add information to the function
+/// summary, do so via the non-constant reference arguments.
+static void addIntrinsicToSummary(
+ const CallInst *CI, SetVector<GlobalValue::GUID> &TypeTests,
+ SetVector<FunctionSummary::VFuncId> &TypeTestAssumeVCalls,
+ SetVector<FunctionSummary::VFuncId> &TypeCheckedLoadVCalls,
+ SetVector<FunctionSummary::ConstVCall> &TypeTestAssumeConstVCalls,
+ SetVector<FunctionSummary::ConstVCall> &TypeCheckedLoadConstVCalls) {
+ switch (CI->getCalledFunction()->getIntrinsicID()) {
+ case Intrinsic::type_test: {
+ auto *TypeMDVal = cast<MetadataAsValue>(CI->getArgOperand(1));
+ auto *TypeId = dyn_cast<MDString>(TypeMDVal->getMetadata());
+ if (!TypeId)
+ break;
+ GlobalValue::GUID Guid = GlobalValue::getGUID(TypeId->getString());
+
+ // Produce a summary from type.test intrinsics. We only summarize type.test
+ // intrinsics that are used other than by an llvm.assume intrinsic.
+ // Intrinsics that are assumed are relevant only to the devirtualization
+ // pass, not the type test lowering pass.
+ bool HasNonAssumeUses = llvm::any_of(CI->uses(), [](const Use &CIU) {
+ auto *AssumeCI = dyn_cast<CallInst>(CIU.getUser());
+ if (!AssumeCI)
+ return true;
+ Function *F = AssumeCI->getCalledFunction();
+ return !F || F->getIntrinsicID() != Intrinsic::assume;
+ });
+ if (HasNonAssumeUses)
+ TypeTests.insert(Guid);
+
+ SmallVector<DevirtCallSite, 4> DevirtCalls;
+ SmallVector<CallInst *, 4> Assumes;
+ findDevirtualizableCallsForTypeTest(DevirtCalls, Assumes, CI);
+ for (auto &Call : DevirtCalls)
+ addVCallToSet(Call, Guid, TypeTestAssumeVCalls,
+ TypeTestAssumeConstVCalls);
+
+ break;
+ }
+
+ case Intrinsic::type_checked_load: {
+ auto *TypeMDVal = cast<MetadataAsValue>(CI->getArgOperand(2));
+ auto *TypeId = dyn_cast<MDString>(TypeMDVal->getMetadata());
+ if (!TypeId)
+ break;
+ GlobalValue::GUID Guid = GlobalValue::getGUID(TypeId->getString());
+
+ SmallVector<DevirtCallSite, 4> DevirtCalls;
+ SmallVector<Instruction *, 4> LoadedPtrs;
+ SmallVector<Instruction *, 4> Preds;
+ bool HasNonCallUses = false;
+ findDevirtualizableCallsForTypeCheckedLoad(DevirtCalls, LoadedPtrs, Preds,
+ HasNonCallUses, CI);
+ // Any non-call uses of the result of llvm.type.checked.load will
+ // prevent us from optimizing away the llvm.type.test.
+ if (HasNonCallUses)
+ TypeTests.insert(Guid);
+ for (auto &Call : DevirtCalls)
+ addVCallToSet(Call, Guid, TypeCheckedLoadVCalls,
+ TypeCheckedLoadConstVCalls);
+
+ break;
+ }
+ default:
+ break;
+ }
+}
+
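The HasNonAssumeUses test above is what keeps devirtualization-only type tests out of the lowering pass's summary. A reduced sketch of the same predicate over a toy list of users:

#include <algorithm>
#include <vector>

enum class UserKind { AssumeIntrinsic, Other };

// True if any user is something other than an llvm.assume call, i.e. the
// type test must survive for the type-test lowering pass.
bool hasNonAssumeUses(const std::vector<UserKind> &Users) {
  return std::any_of(Users.begin(), Users.end(),
                     [](UserKind K) { return K != UserKind::AssumeIntrinsic; });
}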
static void
computeFunctionSummary(ModuleSummaryIndex &Index, const Module &M,
const Function &F, BlockFrequencyInfo *BFI,
@@ -99,6 +185,10 @@ computeFunctionSummary(ModuleSummaryIndex &Index, const Module &M,
MapVector<ValueInfo, CalleeInfo> CallGraphEdges;
SetVector<ValueInfo> RefEdges;
SetVector<GlobalValue::GUID> TypeTests;
+ SetVector<FunctionSummary::VFuncId> TypeTestAssumeVCalls,
+ TypeCheckedLoadVCalls;
+ SetVector<FunctionSummary::ConstVCall> TypeTestAssumeConstVCalls,
+ TypeCheckedLoadConstVCalls;
ICallPromotionAnalysis ICallAnalysis;
bool HasInlineAsmMaybeReferencingInternal = false;
@@ -133,29 +223,15 @@ computeFunctionSummary(ModuleSummaryIndex &Index, const Module &M,
// Check if this is a direct call to a known function or a known
// intrinsic, or an indirect call with profile data.
if (CalledFunction) {
- if (CalledFunction->isIntrinsic()) {
- if (CalledFunction->getIntrinsicID() != Intrinsic::type_test)
- continue;
- // Produce a summary from type.test intrinsics. We only summarize
- // type.test intrinsics that are used other than by an llvm.assume
- // intrinsic. Intrinsics that are assumed are relevant only to the
- // devirtualization pass, not the type test lowering pass.
- bool HasNonAssumeUses = llvm::any_of(CI->uses(), [](const Use &CIU) {
- auto *AssumeCI = dyn_cast<CallInst>(CIU.getUser());
- if (!AssumeCI)
- return true;
- Function *F = AssumeCI->getCalledFunction();
- return !F || F->getIntrinsicID() != Intrinsic::assume;
- });
- if (HasNonAssumeUses) {
- auto *TypeMDVal = cast<MetadataAsValue>(CI->getArgOperand(1));
- if (auto *TypeId = dyn_cast<MDString>(TypeMDVal->getMetadata()))
- TypeTests.insert(GlobalValue::getGUID(TypeId->getString()));
- }
+ if (CI && CalledFunction->isIntrinsic()) {
+ addIntrinsicToSummary(
+ CI, TypeTests, TypeTestAssumeVCalls, TypeCheckedLoadVCalls,
+ TypeTestAssumeConstVCalls, TypeCheckedLoadConstVCalls);
+ continue;
}
// We should have named any anonymous globals
assert(CalledFunction->hasName());
- auto ScaledCount = BFI ? BFI->getBlockProfileCount(&BB) : None;
+ auto ScaledCount = ProfileSummaryInfo::getProfileCount(&I, BFI);
auto Hotness = ScaledCount ? getHotness(ScaledCount.getValue(), PSI)
: CalleeInfo::HotnessType::Unknown;
@@ -183,6 +259,11 @@ computeFunctionSummary(ModuleSummaryIndex &Index, const Module &M,
}
}
+ // Explicitly add hot edges to enforce importing for designated GUIDs for
+ // sample PGO, to enable the same inlines as the profiled optimized binary.
+ for (auto &I : F.getImportGUIDs())
+ CallGraphEdges[I].updateHotness(CalleeInfo::HotnessType::Hot);
+
bool NonRenamableLocal = isNonRenamableLocal(F);
bool NotEligibleForImport =
NonRenamableLocal || HasInlineAsmMaybeReferencingInternal ||
@@ -193,7 +274,10 @@ computeFunctionSummary(ModuleSummaryIndex &Index, const Module &M,
/* LiveRoot = */ false);
auto FuncSummary = llvm::make_unique<FunctionSummary>(
Flags, NumInsts, RefEdges.takeVector(), CallGraphEdges.takeVector(),
- TypeTests.takeVector());
+ TypeTests.takeVector(), TypeTestAssumeVCalls.takeVector(),
+ TypeCheckedLoadVCalls.takeVector(),
+ TypeTestAssumeConstVCalls.takeVector(),
+ TypeCheckedLoadConstVCalls.takeVector());
if (NonRenamableLocal)
CantBePromoted.insert(F.getGUID());
Index.addGlobalValueSummary(F.getName(), std::move(FuncSummary));
@@ -326,9 +410,8 @@ ModuleSummaryIndex llvm::buildModuleSummaryIndex(
// be listed on the llvm.used or llvm.compiler.used global and marked as
// referenced from there.
ModuleSymbolTable::CollectAsmSymbols(
- Triple(M.getTargetTriple()), M.getModuleInlineAsm(),
- [&M, &Index, &CantBePromoted](StringRef Name,
- object::BasicSymbolRef::Flags Flags) {
+ M, [&M, &Index, &CantBePromoted](StringRef Name,
+ object::BasicSymbolRef::Flags Flags) {
// Symbols not marked as Weak or Global are local definitions.
if (Flags & (object::BasicSymbolRef::SF_Weak |
object::BasicSymbolRef::SF_Global))
@@ -347,7 +430,11 @@ ModuleSummaryIndex llvm::buildModuleSummaryIndex(
llvm::make_unique<FunctionSummary>(
GVFlags, 0, ArrayRef<ValueInfo>{},
ArrayRef<FunctionSummary::EdgeTy>{},
- ArrayRef<GlobalValue::GUID>{});
+ ArrayRef<GlobalValue::GUID>{},
+ ArrayRef<FunctionSummary::VFuncId>{},
+ ArrayRef<FunctionSummary::VFuncId>{},
+ ArrayRef<FunctionSummary::ConstVCall>{},
+ ArrayRef<FunctionSummary::ConstVCall>{});
Index.addGlobalValueSummary(Name, std::move(Summary));
} else {
std::unique_ptr<GlobalVarSummary> Summary =
@@ -364,6 +451,12 @@ ModuleSummaryIndex llvm::buildModuleSummaryIndex(
auto &Summary = GlobalList.second[0];
bool AllRefsCanBeExternallyReferenced =
llvm::all_of(Summary->refs(), [&](const ValueInfo &VI) {
+ // If a global value definition references an unnamed global,
+ // be conservative. They're valid IR so we don't want to crash
+ // when we encounter any of them, but they're infrequent enough
+ // that we don't bother optimizing them.
+ if (!VI.getValue()->hasName())
+ return false;
return !CantBePromoted.count(VI.getValue()->getGUID());
});
if (!AllRefsCanBeExternallyReferenced) {
diff --git a/lib/Analysis/OptimizationDiagnosticInfo.cpp b/lib/Analysis/OptimizationDiagnosticInfo.cpp
index fa8b07d61b01..73245981b022 100644
--- a/lib/Analysis/OptimizationDiagnosticInfo.cpp
+++ b/lib/Analysis/OptimizationDiagnosticInfo.cpp
@@ -23,14 +23,14 @@
using namespace llvm;
-OptimizationRemarkEmitter::OptimizationRemarkEmitter(Function *F)
+OptimizationRemarkEmitter::OptimizationRemarkEmitter(const Function *F)
: F(F), BFI(nullptr) {
if (!F->getContext().getDiagnosticHotnessRequested())
return;
// First create a dominator tree.
DominatorTree DT;
- DT.recalculate(*F);
+ DT.recalculate(*const_cast<Function *>(F));
// Generate LoopInfo from it.
LoopInfo LI;
@@ -45,6 +45,18 @@ OptimizationRemarkEmitter::OptimizationRemarkEmitter(Function *F)
BFI = OwnedBFI.get();
}
+bool OptimizationRemarkEmitter::invalidate(
+ Function &F, const PreservedAnalyses &PA,
+ FunctionAnalysisManager::Invalidator &Inv) {
+ // This analysis has no state and so can be trivially preserved, but it
+ // needs a fresh view of BFI if it was constructed with one.
+ if (BFI && Inv.invalidate<BlockFrequencyAnalysis>(F, PA))
+ return true;
+
+ // Otherwise this analysis result remains valid.
+ return false;
+}
+
Optional<uint64_t> OptimizationRemarkEmitter::computeHotness(const Value *V) {
if (!BFI)
return None;
@@ -55,53 +67,59 @@ Optional<uint64_t> OptimizationRemarkEmitter::computeHotness(const Value *V) {
namespace llvm {
namespace yaml {
-template <> struct MappingTraits<DiagnosticInfoOptimizationBase *> {
- static void mapping(IO &io, DiagnosticInfoOptimizationBase *&OptDiag) {
- assert(io.outputting() && "input not yet implemented");
+void MappingTraits<DiagnosticInfoOptimizationBase *>::mapping(
+ IO &io, DiagnosticInfoOptimizationBase *&OptDiag) {
+ assert(io.outputting() && "input not yet implemented");
+
+ if (io.mapTag("!Passed",
+ (OptDiag->getKind() == DK_OptimizationRemark ||
+ OptDiag->getKind() == DK_MachineOptimizationRemark)))
+ ;
+ else if (io.mapTag(
+ "!Missed",
+ (OptDiag->getKind() == DK_OptimizationRemarkMissed ||
+ OptDiag->getKind() == DK_MachineOptimizationRemarkMissed)))
+ ;
+ else if (io.mapTag(
+ "!Analysis",
+ (OptDiag->getKind() == DK_OptimizationRemarkAnalysis ||
+ OptDiag->getKind() == DK_MachineOptimizationRemarkAnalysis)))
+ ;
+ else if (io.mapTag("!AnalysisFPCommute",
+ OptDiag->getKind() ==
+ DK_OptimizationRemarkAnalysisFPCommute))
+ ;
+ else if (io.mapTag("!AnalysisAliasing",
+ OptDiag->getKind() ==
+ DK_OptimizationRemarkAnalysisAliasing))
+ ;
+ else if (io.mapTag("!Failure", OptDiag->getKind() == DK_OptimizationFailure))
+ ;
+ else
+ llvm_unreachable("Unknown remark type");
- if (io.mapTag("!Passed", OptDiag->getKind() == DK_OptimizationRemark))
- ;
- else if (io.mapTag("!Missed",
- OptDiag->getKind() == DK_OptimizationRemarkMissed))
- ;
- else if (io.mapTag("!Analysis",
- OptDiag->getKind() == DK_OptimizationRemarkAnalysis))
- ;
- else if (io.mapTag("!AnalysisFPCommute",
- OptDiag->getKind() ==
- DK_OptimizationRemarkAnalysisFPCommute))
- ;
- else if (io.mapTag("!AnalysisAliasing",
- OptDiag->getKind() ==
- DK_OptimizationRemarkAnalysisAliasing))
- ;
- else
- llvm_unreachable("todo");
-
- // These are read-only for now.
- DebugLoc DL = OptDiag->getDebugLoc();
- StringRef FN = GlobalValue::getRealLinkageName(
- OptDiag->getFunction().getName());
-
- StringRef PassName(OptDiag->PassName);
- io.mapRequired("Pass", PassName);
- io.mapRequired("Name", OptDiag->RemarkName);
- if (!io.outputting() || DL)
- io.mapOptional("DebugLoc", DL);
- io.mapRequired("Function", FN);
- io.mapOptional("Hotness", OptDiag->Hotness);
- io.mapOptional("Args", OptDiag->Args);
- }
-};
+ // These are read-only for now.
+ DiagnosticLocation DL = OptDiag->getLocation();
+ StringRef FN =
+ GlobalValue::getRealLinkageName(OptDiag->getFunction().getName());
+
+ StringRef PassName(OptDiag->PassName);
+ io.mapRequired("Pass", PassName);
+ io.mapRequired("Name", OptDiag->RemarkName);
+ if (!io.outputting() || DL.isValid())
+ io.mapOptional("DebugLoc", DL);
+ io.mapRequired("Function", FN);
+ io.mapOptional("Hotness", OptDiag->Hotness);
+ io.mapOptional("Args", OptDiag->Args);
+}
-template <> struct MappingTraits<DebugLoc> {
- static void mapping(IO &io, DebugLoc &DL) {
+template <> struct MappingTraits<DiagnosticLocation> {
+ static void mapping(IO &io, DiagnosticLocation &DL) {
assert(io.outputting() && "input not yet implemented");
- auto *Scope = cast<DIScope>(DL.getScope());
- StringRef File = Scope->getFilename();
+ StringRef File = DL.getFilename();
unsigned Line = DL.getLine();
- unsigned Col = DL.getCol();
+ unsigned Col = DL.getColumn();
io.mapRequired("File", File);
io.mapRequired("Line", Line);
@@ -116,8 +134,8 @@ template <> struct MappingTraits<DiagnosticInfoOptimizationBase::Argument> {
static void mapping(IO &io, DiagnosticInfoOptimizationBase::Argument &A) {
assert(io.outputting() && "input not yet implemented");
io.mapRequired(A.Key.data(), A.Val);
- if (A.DLoc)
- io.mapOptional("DebugLoc", A.DLoc);
+ if (A.Loc.isValid())
+ io.mapOptional("DebugLoc", A.Loc);
}
};
@@ -127,18 +145,20 @@ template <> struct MappingTraits<DiagnosticInfoOptimizationBase::Argument> {
LLVM_YAML_IS_SEQUENCE_VECTOR(DiagnosticInfoOptimizationBase::Argument)
void OptimizationRemarkEmitter::computeHotness(
- DiagnosticInfoOptimizationBase &OptDiag) {
- Value *V = OptDiag.getCodeRegion();
+ DiagnosticInfoIROptimization &OptDiag) {
+ const Value *V = OptDiag.getCodeRegion();
if (V)
OptDiag.setHotness(computeHotness(V));
}
-void OptimizationRemarkEmitter::emit(DiagnosticInfoOptimizationBase &OptDiag) {
+void OptimizationRemarkEmitter::emit(
+ DiagnosticInfoOptimizationBase &OptDiagBase) {
+ auto &OptDiag = cast<DiagnosticInfoIROptimization>(OptDiagBase);
computeHotness(OptDiag);
yaml::Output *Out = F->getContext().getDiagnosticsOutputFile();
if (Out) {
- auto *P = &const_cast<DiagnosticInfoOptimizationBase &>(OptDiag);
+ auto *P = const_cast<DiagnosticInfoOptimizationBase *>(&OptDiagBase);
*Out << P;
}
// FIXME: now that IsVerbose is part of DI, filtering for this will be moved
@@ -147,72 +167,6 @@ void OptimizationRemarkEmitter::emit(DiagnosticInfoOptimizationBase &OptDiag) {
F->getContext().diagnose(OptDiag);
}
-void OptimizationRemarkEmitter::emitOptimizationRemark(const char *PassName,
- const DebugLoc &DLoc,
- const Value *V,
- const Twine &Msg) {
- LLVMContext &Ctx = F->getContext();
- Ctx.diagnose(OptimizationRemark(PassName, *F, DLoc, Msg, computeHotness(V)));
-}
-
-void OptimizationRemarkEmitter::emitOptimizationRemark(const char *PassName,
- Loop *L,
- const Twine &Msg) {
- emitOptimizationRemark(PassName, L->getStartLoc(), L->getHeader(), Msg);
-}
-
-void OptimizationRemarkEmitter::emitOptimizationRemarkMissed(
- const char *PassName, const DebugLoc &DLoc, const Value *V,
- const Twine &Msg, bool IsVerbose) {
- LLVMContext &Ctx = F->getContext();
- if (!IsVerbose || shouldEmitVerbose())
- Ctx.diagnose(
- OptimizationRemarkMissed(PassName, *F, DLoc, Msg, computeHotness(V)));
-}
-
-void OptimizationRemarkEmitter::emitOptimizationRemarkMissed(
- const char *PassName, Loop *L, const Twine &Msg, bool IsVerbose) {
- emitOptimizationRemarkMissed(PassName, L->getStartLoc(), L->getHeader(), Msg,
- IsVerbose);
-}
-
-void OptimizationRemarkEmitter::emitOptimizationRemarkAnalysis(
- const char *PassName, const DebugLoc &DLoc, const Value *V,
- const Twine &Msg, bool IsVerbose) {
- LLVMContext &Ctx = F->getContext();
- if (!IsVerbose || shouldEmitVerbose())
- Ctx.diagnose(
- OptimizationRemarkAnalysis(PassName, *F, DLoc, Msg, computeHotness(V)));
-}
-
-void OptimizationRemarkEmitter::emitOptimizationRemarkAnalysis(
- const char *PassName, Loop *L, const Twine &Msg, bool IsVerbose) {
- emitOptimizationRemarkAnalysis(PassName, L->getStartLoc(), L->getHeader(),
- Msg, IsVerbose);
-}
-
-void OptimizationRemarkEmitter::emitOptimizationRemarkAnalysisFPCommute(
- const char *PassName, const DebugLoc &DLoc, const Value *V,
- const Twine &Msg) {
- LLVMContext &Ctx = F->getContext();
- Ctx.diagnose(OptimizationRemarkAnalysisFPCommute(PassName, *F, DLoc, Msg,
- computeHotness(V)));
-}
-
-void OptimizationRemarkEmitter::emitOptimizationRemarkAnalysisAliasing(
- const char *PassName, const DebugLoc &DLoc, const Value *V,
- const Twine &Msg) {
- LLVMContext &Ctx = F->getContext();
- Ctx.diagnose(OptimizationRemarkAnalysisAliasing(PassName, *F, DLoc, Msg,
- computeHotness(V)));
-}
-
-void OptimizationRemarkEmitter::emitOptimizationRemarkAnalysisAliasing(
- const char *PassName, Loop *L, const Twine &Msg) {
- emitOptimizationRemarkAnalysisAliasing(PassName, L->getStartLoc(),
- L->getHeader(), Msg);
-}
-
OptimizationRemarkEmitterWrapperPass::OptimizationRemarkEmitterWrapperPass()
: FunctionPass(ID) {
initializeOptimizationRemarkEmitterWrapperPassPass(
diff --git a/lib/Analysis/PostDominators.cpp b/lib/Analysis/PostDominators.cpp
index cb9438a2f928..1caf151546d9 100644
--- a/lib/Analysis/PostDominators.cpp
+++ b/lib/Analysis/PostDominators.cpp
@@ -31,6 +31,15 @@ char PostDominatorTreeWrapperPass::ID = 0;
INITIALIZE_PASS(PostDominatorTreeWrapperPass, "postdomtree",
"Post-Dominator Tree Construction", true, true)
+bool PostDominatorTree::invalidate(Function &F, const PreservedAnalyses &PA,
+ FunctionAnalysisManager::Invalidator &) {
+ // Check whether the analysis, all analyses on functions, or the function's
+ // CFG have been preserved.
+ auto PAC = PA.getChecker<PostDominatorTreeAnalysis>();
+ return !(PAC.preserved() || PAC.preservedSet<AllAnalysesOn<Function>>() ||
+ PAC.preservedSet<CFGAnalyses>());
+}
+
bool PostDominatorTreeWrapperPass::runOnFunction(Function &F) {
DT.recalculate(F);
return false;
diff --git a/lib/Analysis/ProfileSummaryInfo.cpp b/lib/Analysis/ProfileSummaryInfo.cpp
index 16d3614c14c6..1a53a8ed4283 100644
--- a/lib/Analysis/ProfileSummaryInfo.cpp
+++ b/lib/Analysis/ProfileSummaryInfo.cpp
@@ -12,9 +12,10 @@
//
//===----------------------------------------------------------------------===//
-#include "llvm/Analysis/BlockFrequencyInfo.h"
#include "llvm/Analysis/ProfileSummaryInfo.h"
+#include "llvm/Analysis/BlockFrequencyInfo.h"
#include "llvm/IR/BasicBlock.h"
+#include "llvm/IR/CallSite.h"
#include "llvm/IR/Metadata.h"
#include "llvm/IR/Module.h"
#include "llvm/IR/ProfileSummary.h"
@@ -55,22 +56,40 @@ static uint64_t getMinCountForPercentile(SummaryEntryVector &DS,
// The profile summary metadata may be attached either by the frontend or by
// any backend passes (IR level instrumentation, for example). This method
// checks if the Summary is null and if so checks if the summary metadata is now
-// available in the module and parses it to get the Summary object.
-void ProfileSummaryInfo::computeSummary() {
+// available in the module and parses it to get the Summary object. Returns true
+// if a valid Summary is available.
+bool ProfileSummaryInfo::computeSummary() {
if (Summary)
- return;
+ return true;
auto *SummaryMD = M.getProfileSummary();
if (!SummaryMD)
- return;
+ return false;
Summary.reset(ProfileSummary::getFromMD(SummaryMD));
+ return true;
+}
+
+Optional<uint64_t>
+ProfileSummaryInfo::getProfileCount(const Instruction *Inst,
+ BlockFrequencyInfo *BFI) {
+ if (!Inst)
+ return None;
+ assert((isa<CallInst>(Inst) || isa<InvokeInst>(Inst)) &&
+ "We can only get profile count for call/invoke instruction.");
+ // Check if there is a profile metadata on the instruction. If it is present,
+ // determine hotness solely based on that.
+ uint64_t TotalCount;
+ if (Inst->extractProfTotalWeight(TotalCount))
+ return TotalCount;
+ if (BFI)
+ return BFI->getBlockProfileCount(Inst->getParent());
+ return None;
}
/// Returns true if the function's entry is hot. If it returns false, it
/// either means it is not hot or it is unknown whether it is hot or not (for
/// example, no profile data is available).
bool ProfileSummaryInfo::isFunctionEntryHot(const Function *F) {
- computeSummary();
- if (!F || !Summary)
+ if (!F || !computeSummary())
return false;
auto FunctionCount = F->getEntryCount();
// FIXME: The heuristic used below for determining hotness is based on
@@ -79,17 +98,53 @@ bool ProfileSummaryInfo::isFunctionEntryHot(const Function *F) {
return FunctionCount && isHotCount(FunctionCount.getValue());
}
+/// Returns true if the function's entry or total call edge count is hot.
+/// If it returns false, it either means it is not hot or it is unknown
+/// whether it is hot or not (for example, no profile data is available).
+bool ProfileSummaryInfo::isFunctionHotInCallGraph(const Function *F) {
+ if (!F || !computeSummary())
+ return false;
+ if (auto FunctionCount = F->getEntryCount())
+ if (isHotCount(FunctionCount.getValue()))
+ return true;
+
+ uint64_t TotalCallCount = 0;
+ for (const auto &BB : *F)
+ for (const auto &I : BB)
+ if (isa<CallInst>(I) || isa<InvokeInst>(I))
+ if (auto CallCount = getProfileCount(&I, nullptr))
+ TotalCallCount += CallCount.getValue();
+ return isHotCount(TotalCallCount);
+}
+
+/// Returns true if the function's entry and total call edge count is cold.
+/// If it returns false, it either means it is not cold or it is unknown
+/// whether it is cold or not (for example, no profile data is available).
+bool ProfileSummaryInfo::isFunctionColdInCallGraph(const Function *F) {
+ if (!F || !computeSummary())
+ return false;
+ if (auto FunctionCount = F->getEntryCount())
+ if (!isColdCount(FunctionCount.getValue()))
+ return false;
+
+ uint64_t TotalCallCount = 0;
+ for (const auto &BB : *F)
+ for (const auto &I : BB)
+ if (isa<CallInst>(I) || isa<InvokeInst>(I))
+ if (auto CallCount = getProfileCount(&I, nullptr))
+ TotalCallCount += CallCount.getValue();
+ return isColdCount(TotalCallCount);
+}
+
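The two call-graph variants above sum every call and invoke count and compare both the entry count and that total against the summary thresholds. A reduced model of the hot case, substituting a plain numeric cutoff for isHotCount:

#include <cstdint>
#include <vector>

// Hot if the entry count is hot, or if the call counts sum to a hot total.
// HotThreshold is a stand-in for the detailed-summary-derived cutoff.
bool functionHotInCallGraph(uint64_t EntryCount,
                            const std::vector<uint64_t> &CallCounts,
                            uint64_t HotThreshold) {
  if (EntryCount >= HotThreshold)
    return true;
  uint64_t TotalCallCount = 0;
  for (uint64_t C : CallCounts)
    TotalCallCount += C;
  return TotalCallCount >= HotThreshold;
}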
/// Returns true if the function's entry is cold. If it returns false, it
/// either means it is not cold or it is unknown whether it is cold or not (for
/// example, no profile data is available).
bool ProfileSummaryInfo::isFunctionEntryCold(const Function *F) {
- computeSummary();
if (!F)
return false;
- if (F->hasFnAttribute(Attribute::Cold)) {
+ if (F->hasFnAttribute(Attribute::Cold))
return true;
- }
- if (!Summary)
+ if (!computeSummary())
return false;
auto FunctionCount = F->getEntryCount();
// FIXME: The heuristic used below for determining coldness is based on
@@ -100,9 +155,7 @@ bool ProfileSummaryInfo::isFunctionEntryCold(const Function *F) {
/// Compute the hot and cold thresholds.
void ProfileSummaryInfo::computeThresholds() {
- if (!Summary)
- computeSummary();
- if (!Summary)
+ if (!computeSummary())
return;
auto &DetailedSummary = Summary->getDetailedSummary();
HotCountThreshold =
@@ -125,20 +178,25 @@ bool ProfileSummaryInfo::isColdCount(uint64_t C) {
bool ProfileSummaryInfo::isHotBB(const BasicBlock *B, BlockFrequencyInfo *BFI) {
auto Count = BFI->getBlockProfileCount(B);
- if (Count && isHotCount(*Count))
- return true;
- // Use extractProfTotalWeight to get BB count.
- // For Sample PGO, BFI may not provide accurate BB count due to errors
- // magnified during sample count propagation. This serves as a backup plan
- // to ensure all hot BB will not be missed.
- // The query currently has false positives as branch instruction cloning does
- // not update/scale branch weights. Unlike false negatives, this will not cause
- // performance problem.
- uint64_t TotalCount;
- if (B->getTerminator()->extractProfTotalWeight(TotalCount) &&
- isHotCount(TotalCount))
- return true;
- return false;
+ return Count && isHotCount(*Count);
+}
+
+bool ProfileSummaryInfo::isColdBB(const BasicBlock *B,
+ BlockFrequencyInfo *BFI) {
+ auto Count = BFI->getBlockProfileCount(B);
+ return Count && isColdCount(*Count);
+}
+
+bool ProfileSummaryInfo::isHotCallSite(const CallSite &CS,
+ BlockFrequencyInfo *BFI) {
+ auto C = getProfileCount(CS.getInstruction(), BFI);
+ return C && isHotCount(*C);
+}
+
+bool ProfileSummaryInfo::isColdCallSite(const CallSite &CS,
+ BlockFrequencyInfo *BFI) {
+ auto C = getProfileCount(CS.getInstruction(), BFI);
+ return C && isColdCount(*C);
}
INITIALIZE_PASS(ProfileSummaryInfoWrapperPass, "profile-summary-info",
diff --git a/lib/Analysis/RegionInfo.cpp b/lib/Analysis/RegionInfo.cpp
index 8c084ddd2266..63ef8d28d44a 100644
--- a/lib/Analysis/RegionInfo.cpp
+++ b/lib/Analysis/RegionInfo.cpp
@@ -83,6 +83,15 @@ RegionInfo::~RegionInfo() {
}
+bool RegionInfo::invalidate(Function &F, const PreservedAnalyses &PA,
+ FunctionAnalysisManager::Invalidator &) {
+ // Check whether the analysis, all analyses on functions, or the function's
+ // CFG have been preserved.
+ auto PAC = PA.getChecker<RegionInfoAnalysis>();
+ return !(PAC.preserved() || PAC.preservedSet<AllAnalysesOn<Function>>() ||
+ PAC.preservedSet<CFGAnalyses>());
+}
+
void RegionInfo::updateStatistics(Region *R) {
++numRegions;
diff --git a/lib/Analysis/RegionPass.cpp b/lib/Analysis/RegionPass.cpp
index 7358aa6810a1..82107cb18025 100644
--- a/lib/Analysis/RegionPass.cpp
+++ b/lib/Analysis/RegionPass.cpp
@@ -206,6 +206,8 @@ public:
return false;
}
+
+ StringRef getPassName() const override { return "Print Region IR"; }
};
char PrintRegionPass::ID = 0;
diff --git a/lib/Analysis/ScalarEvolution.cpp b/lib/Analysis/ScalarEvolution.cpp
index ed328f12c463..ca32cf3c7c34 100644
--- a/lib/Analysis/ScalarEvolution.cpp
+++ b/lib/Analysis/ScalarEvolution.cpp
@@ -127,16 +127,35 @@ static cl::opt<unsigned> MulOpsInlineThreshold(
cl::desc("Threshold for inlining multiplication operands into a SCEV"),
cl::init(1000));
+static cl::opt<unsigned> AddOpsInlineThreshold(
+ "scev-addops-inline-threshold", cl::Hidden,
+ cl::desc("Threshold for inlining addition operands into a SCEV"),
+ cl::init(500));
+
static cl::opt<unsigned> MaxSCEVCompareDepth(
"scalar-evolution-max-scev-compare-depth", cl::Hidden,
cl::desc("Maximum depth of recursive SCEV complexity comparisons"),
cl::init(32));
+static cl::opt<unsigned> MaxSCEVOperationsImplicationDepth(
+ "scalar-evolution-max-scev-operations-implication-depth", cl::Hidden,
+ cl::desc("Maximum depth of recursive SCEV operations implication analysis"),
+ cl::init(2));
+
static cl::opt<unsigned> MaxValueCompareDepth(
"scalar-evolution-max-value-compare-depth", cl::Hidden,
cl::desc("Maximum depth of recursive value complexity comparisons"),
cl::init(2));
+static cl::opt<unsigned>
+ MaxAddExprDepth("scalar-evolution-max-addexpr-depth", cl::Hidden,
+ cl::desc("Maximum depth of recursive AddExpr"),
+ cl::init(32));
+
+static cl::opt<unsigned> MaxConstantEvolvingDepth(
+ "scalar-evolution-max-constant-evolving-depth", cl::Hidden,
+ cl::desc("Maximum depth of recursive constant evolving"), cl::init(32));
+
//===----------------------------------------------------------------------===//
// SCEV class definitions
//===----------------------------------------------------------------------===//
@@ -145,11 +164,12 @@ static cl::opt<unsigned> MaxValueCompareDepth(
// Implementation of the SCEV class.
//
-LLVM_DUMP_METHOD
-void SCEV::dump() const {
+#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
+LLVM_DUMP_METHOD void SCEV::dump() const {
print(dbgs());
dbgs() << '\n';
}
+#endif
void SCEV::print(raw_ostream &OS) const {
switch (static_cast<SCEVTypes>(getSCEVType())) {
@@ -2095,7 +2115,8 @@ StrengthenNoWrapFlags(ScalarEvolution *SE, SCEVTypes Type,
/// Get a canonical add expression, or something simpler if possible.
const SCEV *ScalarEvolution::getAddExpr(SmallVectorImpl<const SCEV *> &Ops,
- SCEV::NoWrapFlags Flags) {
+ SCEV::NoWrapFlags Flags,
+ unsigned Depth) {
assert(!(Flags & ~(SCEV::FlagNUW | SCEV::FlagNSW)) &&
"only nuw or nsw allowed");
assert(!Ops.empty() && "Cannot get empty add!");
@@ -2134,6 +2155,10 @@ const SCEV *ScalarEvolution::getAddExpr(SmallVectorImpl<const SCEV *> &Ops,
if (Ops.size() == 1) return Ops[0];
}
+ // Limit the recursion depth.
+ if (Depth > MaxAddExprDepth)
+ return getOrCreateAddExpr(Ops, Flags);
+
// Okay, check to see if the same value occurs in the operand list more than
// once. If so, merge them together into a multiply expression. Since we
// sorted the list, these values are required to be adjacent.
@@ -2205,7 +2230,7 @@ const SCEV *ScalarEvolution::getAddExpr(SmallVectorImpl<const SCEV *> &Ops,
}
if (Ok) {
// Evaluate the expression in the larger type.
- const SCEV *Fold = getAddExpr(LargeOps, Flags);
+ const SCEV *Fold = getAddExpr(LargeOps, Flags, Depth + 1);
// If it folds to something simple, use it. Otherwise, don't.
if (isa<SCEVConstant>(Fold) || isa<SCEVUnknown>(Fold))
return getTruncateExpr(Fold, DstType);
@@ -2220,6 +2245,9 @@ const SCEV *ScalarEvolution::getAddExpr(SmallVectorImpl<const SCEV *> &Ops,
if (Idx < Ops.size()) {
bool DeletedAdd = false;
while (const SCEVAddExpr *Add = dyn_cast<SCEVAddExpr>(Ops[Idx])) {
+ if (Ops.size() > AddOpsInlineThreshold ||
+ Add->getNumOperands() > AddOpsInlineThreshold)
+ break;
// If we have an add, expand the add operands onto the end of the operands
// list.
Ops.erase(Ops.begin()+Idx);
@@ -2231,7 +2259,7 @@ const SCEV *ScalarEvolution::getAddExpr(SmallVectorImpl<const SCEV *> &Ops,
// and they are not necessarily sorted. Recurse to resort and resimplify
// any operands we just acquired.
if (DeletedAdd)
- return getAddExpr(Ops);
+ return getAddExpr(Ops, SCEV::FlagAnyWrap, Depth + 1);
}
// Skip over the add expression until we get to a multiply.
@@ -2266,13 +2294,14 @@ const SCEV *ScalarEvolution::getAddExpr(SmallVectorImpl<const SCEV *> &Ops,
Ops.push_back(getConstant(AccumulatedConstant));
for (auto &MulOp : MulOpLists)
if (MulOp.first != 0)
- Ops.push_back(getMulExpr(getConstant(MulOp.first),
- getAddExpr(MulOp.second)));
+ Ops.push_back(getMulExpr(
+ getConstant(MulOp.first),
+ getAddExpr(MulOp.second, SCEV::FlagAnyWrap, Depth + 1)));
if (Ops.empty())
return getZero(Ty);
if (Ops.size() == 1)
return Ops[0];
- return getAddExpr(Ops);
+ return getAddExpr(Ops, SCEV::FlagAnyWrap, Depth + 1);
}
}
@@ -2297,8 +2326,8 @@ const SCEV *ScalarEvolution::getAddExpr(SmallVectorImpl<const SCEV *> &Ops,
MulOps.append(Mul->op_begin()+MulOp+1, Mul->op_end());
InnerMul = getMulExpr(MulOps);
}
- const SCEV *One = getOne(Ty);
- const SCEV *AddOne = getAddExpr(One, InnerMul);
+ SmallVector<const SCEV *, 2> TwoOps = {getOne(Ty), InnerMul};
+ const SCEV *AddOne = getAddExpr(TwoOps, SCEV::FlagAnyWrap, Depth + 1);
const SCEV *OuterMul = getMulExpr(AddOne, MulOpSCEV);
if (Ops.size() == 2) return OuterMul;
if (AddOp < Idx) {
@@ -2309,7 +2338,7 @@ const SCEV *ScalarEvolution::getAddExpr(SmallVectorImpl<const SCEV *> &Ops,
Ops.erase(Ops.begin()+AddOp-1);
}
Ops.push_back(OuterMul);
- return getAddExpr(Ops);
+ return getAddExpr(Ops, SCEV::FlagAnyWrap, Depth + 1);
}
// Check this multiply against other multiplies being added together.
@@ -2337,13 +2366,15 @@ const SCEV *ScalarEvolution::getAddExpr(SmallVectorImpl<const SCEV *> &Ops,
MulOps.append(OtherMul->op_begin()+OMulOp+1, OtherMul->op_end());
InnerMul2 = getMulExpr(MulOps);
}
- const SCEV *InnerMulSum = getAddExpr(InnerMul1,InnerMul2);
+ SmallVector<const SCEV *, 2> TwoOps = {InnerMul1, InnerMul2};
+ const SCEV *InnerMulSum =
+ getAddExpr(TwoOps, SCEV::FlagAnyWrap, Depth + 1);
const SCEV *OuterMul = getMulExpr(MulOpSCEV, InnerMulSum);
if (Ops.size() == 2) return OuterMul;
Ops.erase(Ops.begin()+Idx);
Ops.erase(Ops.begin()+OtherMulIdx-1);
Ops.push_back(OuterMul);
- return getAddExpr(Ops);
+ return getAddExpr(Ops, SCEV::FlagAnyWrap, Depth + 1);
}
}
}
@@ -2379,7 +2410,7 @@ const SCEV *ScalarEvolution::getAddExpr(SmallVectorImpl<const SCEV *> &Ops,
// This follows from the fact that the no-wrap flags on the outer add
// expression are applicable on the 0th iteration, when the add recurrence
// will be equal to its start value.
- AddRecOps[0] = getAddExpr(LIOps, Flags);
+ AddRecOps[0] = getAddExpr(LIOps, Flags, Depth + 1);
// Build the new addrec. Propagate the NUW and NSW flags if both the
// outer add and the inner addrec are guaranteed to have no overflow.
@@ -2396,7 +2427,7 @@ const SCEV *ScalarEvolution::getAddExpr(SmallVectorImpl<const SCEV *> &Ops,
Ops[i] = NewRec;
break;
}
- return getAddExpr(Ops);
+ return getAddExpr(Ops, SCEV::FlagAnyWrap, Depth + 1);
}
// Okay, if there weren't any loop invariants to be folded, check to see if
@@ -2420,14 +2451,15 @@ const SCEV *ScalarEvolution::getAddExpr(SmallVectorImpl<const SCEV *> &Ops,
OtherAddRec->op_end());
break;
}
- AddRecOps[i] = getAddExpr(AddRecOps[i],
- OtherAddRec->getOperand(i));
+ SmallVector<const SCEV *, 2> TwoOps = {
+ AddRecOps[i], OtherAddRec->getOperand(i)};
+ AddRecOps[i] = getAddExpr(TwoOps, SCEV::FlagAnyWrap, Depth + 1);
}
Ops.erase(Ops.begin() + OtherIdx); --OtherIdx;
}
// Step size has changed, so we cannot guarantee no self-wraparound.
Ops[Idx] = getAddRecExpr(AddRecOps, AddRecLoop, SCEV::FlagAnyWrap);
- return getAddExpr(Ops);
+ return getAddExpr(Ops, SCEV::FlagAnyWrap, Depth + 1);
}
// Otherwise couldn't fold anything into this recurrence. Move onto the
@@ -2436,18 +2468,24 @@ const SCEV *ScalarEvolution::getAddExpr(SmallVectorImpl<const SCEV *> &Ops,
// Okay, it looks like we really DO need an add expr. Check to see if we
// already have one, otherwise create a new one.
+ return getOrCreateAddExpr(Ops, Flags);
+}
+
+const SCEV *
+ScalarEvolution::getOrCreateAddExpr(SmallVectorImpl<const SCEV *> &Ops,
+ SCEV::NoWrapFlags Flags) {
FoldingSetNodeID ID;
ID.AddInteger(scAddExpr);
for (unsigned i = 0, e = Ops.size(); i != e; ++i)
ID.AddPointer(Ops[i]);
void *IP = nullptr;
SCEVAddExpr *S =
- static_cast<SCEVAddExpr *>(UniqueSCEVs.FindNodeOrInsertPos(ID, IP));
+ static_cast<SCEVAddExpr *>(UniqueSCEVs.FindNodeOrInsertPos(ID, IP));
if (!S) {
const SCEV **O = SCEVAllocator.Allocate<const SCEV *>(Ops.size());
std::uninitialized_copy(Ops.begin(), Ops.end(), O);
- S = new (SCEVAllocator) SCEVAddExpr(ID.Intern(SCEVAllocator),
- O, Ops.size());
+ S = new (SCEVAllocator)
+ SCEVAddExpr(ID.Intern(SCEVAllocator), O, Ops.size());
UniqueSCEVs.InsertNode(S, IP);
}
S->setNoWrapFlags(Flags);
@@ -2889,7 +2927,7 @@ const SCEV *ScalarEvolution::getUDivExactExpr(const SCEV *LHS,
// end of this file for inspiration.
const SCEVMulExpr *Mul = dyn_cast<SCEVMulExpr>(LHS);
- if (!Mul)
+ if (!Mul || !Mul->hasNoUnsignedWrap())
return getUDivExpr(LHS, RHS);
if (const SCEVConstant *RHSCst = dyn_cast<SCEVConstant>(RHS)) {
@@ -3385,6 +3423,10 @@ Type *ScalarEvolution::getEffectiveSCEVType(Type *Ty) const {
return getDataLayout().getIntPtrType(Ty);
}
+Type *ScalarEvolution::getWiderType(Type *T1, Type *T2) const {
+ return getTypeSizeInBits(T1) >= getTypeSizeInBits(T2) ? T1 : T2;
+}
+
const SCEV *ScalarEvolution::getCouldNotCompute() {
return CouldNotCompute.get();
}
@@ -4409,8 +4451,7 @@ const SCEV *ScalarEvolution::createNodeForGEP(GEPOperator *GEP) {
return getGEPExpr(GEP, IndexExprs);
}
-uint32_t
-ScalarEvolution::GetMinTrailingZeros(const SCEV *S) {
+uint32_t ScalarEvolution::GetMinTrailingZerosImpl(const SCEV *S) {
if (const SCEVConstant *C = dyn_cast<SCEVConstant>(S))
return C->getAPInt().countTrailingZeros();
@@ -4420,14 +4461,16 @@ ScalarEvolution::GetMinTrailingZeros(const SCEV *S) {
if (const SCEVZeroExtendExpr *E = dyn_cast<SCEVZeroExtendExpr>(S)) {
uint32_t OpRes = GetMinTrailingZeros(E->getOperand());
- return OpRes == getTypeSizeInBits(E->getOperand()->getType()) ?
- getTypeSizeInBits(E->getType()) : OpRes;
+ return OpRes == getTypeSizeInBits(E->getOperand()->getType())
+ ? getTypeSizeInBits(E->getType())
+ : OpRes;
}
if (const SCEVSignExtendExpr *E = dyn_cast<SCEVSignExtendExpr>(S)) {
uint32_t OpRes = GetMinTrailingZeros(E->getOperand());
- return OpRes == getTypeSizeInBits(E->getOperand()->getType()) ?
- getTypeSizeInBits(E->getType()) : OpRes;
+ return OpRes == getTypeSizeInBits(E->getOperand()->getType())
+ ? getTypeSizeInBits(E->getType())
+ : OpRes;
}
if (const SCEVAddExpr *A = dyn_cast<SCEVAddExpr>(S)) {
@@ -4444,8 +4487,8 @@ ScalarEvolution::GetMinTrailingZeros(const SCEV *S) {
uint32_t BitWidth = getTypeSizeInBits(M->getType());
for (unsigned i = 1, e = M->getNumOperands();
SumOpRes != BitWidth && i != e; ++i)
- SumOpRes = std::min(SumOpRes + GetMinTrailingZeros(M->getOperand(i)),
- BitWidth);
+ SumOpRes =
+ std::min(SumOpRes + GetMinTrailingZeros(M->getOperand(i)), BitWidth);
return SumOpRes;
}
@@ -4486,6 +4529,17 @@ ScalarEvolution::GetMinTrailingZeros(const SCEV *S) {
return 0;
}
+uint32_t ScalarEvolution::GetMinTrailingZeros(const SCEV *S) {
+ auto I = MinTrailingZerosCache.find(S);
+ if (I != MinTrailingZerosCache.end())
+ return I->second;
+
+ uint32_t Result = GetMinTrailingZerosImpl(S);
+ auto InsertPair = MinTrailingZerosCache.insert({S, Result});
+ assert(InsertPair.second && "Should insert a new key");
+ return InsertPair.first->second;
+}
+
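GetMinTrailingZeros is now a thin memoizing wrapper around the recursive worker, and recursive calls inside the worker route back through the cache. The same shape on a self-contained toy recursion:

#include <cstdint>
#include <unordered_map>

static uint32_t trailingZerosImpl(uint64_t V);
static std::unordered_map<uint64_t, uint32_t> Cache;

// Cached wrapper, mirroring GetMinTrailingZeros over GetMinTrailingZerosImpl.
static uint32_t trailingZeros(uint64_t V) {
  auto It = Cache.find(V);
  if (It != Cache.end())
    return It->second;
  uint32_t Result = trailingZerosImpl(V);
  return Cache.insert({V, Result}).first->second;
}

static uint32_t trailingZerosImpl(uint64_t V) {
  if (V == 0)
    return 64;
  // Recurse through the cached entry point, as the SCEV code does.
  return (V & 1) ? 0 : 1 + trailingZeros(V >> 1);
}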
/// Helper method to assign a range to V from metadata present in the IR.
static Optional<ConstantRange> GetRangeFromMetadata(Value *V) {
if (Instruction *I = dyn_cast<Instruction>(V))
@@ -4668,6 +4722,77 @@ ScalarEvolution::getRange(const SCEV *S,
return setRange(S, SignHint, ConservativeResult);
}
+// Given a StartRange, Step and MaxBECount for an expression, compute a range
+// of values that the expression can take. Initially, the expression has a
+// value from StartRange and then is changed by Step up to MaxBECount times.
+// The Signed argument defines whether we treat Step as signed or unsigned.
+static ConstantRange getRangeForAffineARHelper(APInt Step,
+ ConstantRange StartRange,
+ APInt MaxBECount,
+ unsigned BitWidth, bool Signed) {
+ // If either Step or MaxBECount is 0, then the expression won't change, and we
+ // just need to return the initial range.
+ if (Step == 0 || MaxBECount == 0)
+ return StartRange;
+
+ // If we don't know anything about the initial value (i.e. StartRange is
+ // FullRange), then we don't know anything about the final range either.
+ // Return FullRange.
+ if (StartRange.isFullSet())
+ return ConstantRange(BitWidth, /* isFullSet = */ true);
+
+ // If Step is signed and negative, then we use its absolute value, but we also
+ // note that we're moving in the opposite direction.
+ bool Descending = Signed && Step.isNegative();
+
+ if (Signed)
+ // This is correct even for INT_SMIN. Let's look at i8 to illustrate this:
+ // abs(INT_SMIN) = abs(-128) = abs(0x80) = -0x80 = 0x80 = 128.
+ // These equations hold true due to the well-defined wrap-around behavior of
+ // APInt.
+ Step = Step.abs();
+
+ // Check if Offset (Step * MaxBECount) exceeds the full span of BitWidth.
+ // If it does, the expression is guaranteed to overflow.
+ if (APInt::getMaxValue(StartRange.getBitWidth()).udiv(Step).ult(MaxBECount))
+ return ConstantRange(BitWidth, /* isFullSet = */ true);
+
+ // Offset is the maximum amount by which the expression can change. The
+ // check above guarantees that this multiplication doesn't overflow.
+ APInt Offset = Step * MaxBECount;
+
+ // Minimum value of the final range will match the minimal value of StartRange
+ // if the expression is increasing and will be decreased by Offset otherwise.
+ // Maximum value of the final range will match the maximal value of StartRange
+ // if the expression is decreasing and will be increased by Offset otherwise.
+ APInt StartLower = StartRange.getLower();
+ APInt StartUpper = StartRange.getUpper() - 1;
+ APInt MovedBoundary =
+ Descending ? (StartLower - Offset) : (StartUpper + Offset);
+
+ // It's possible that the new minimum/maximum value will fall into the initial
+ // range (due to wrap around). This means that the expression can take any
+ // value in this bitwidth, and we have to return full range.
+ if (StartRange.contains(MovedBoundary))
+ return ConstantRange(BitWidth, /* isFullSet = */ true);
+
+ APInt NewLower, NewUpper;
+ if (Descending) {
+ NewLower = MovedBoundary;
+ NewUpper = StartUpper;
+ } else {
+ NewLower = StartLower;
+ NewUpper = MovedBoundary;
+ }
+
+ // If we end up with full range, return a proper full range.
+ if (NewLower == NewUpper + 1)
+ return ConstantRange(BitWidth, /* isFullSet = */ true);
+
+ // No overflow detected, return the [NewLower, NewUpper + 1) range.
+ return ConstantRange(NewLower, NewUpper + 1);
+}
+
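A worked instance of the helper's arithmetic on i8, with hypothetical numbers and an unsigned step: StartRange = [0, 4), Step = 3, MaxBECount = 10. The overflow guard passes (255 / 3 = 85 >= 10), so the range simply widens by Offset = 30:

#include <cassert>
#include <cstdint>

int main() {
  const unsigned StartLower = 0, StartUpper = 4; // StartRange = [0, 4)
  const unsigned Step = 3, MaxBECount = 10;
  // Overflow guard: Step * MaxBECount cannot wrap an 8-bit value.
  assert(255u / Step >= MaxBECount);
  unsigned Offset = Step * MaxBECount;           // 30
  // Ascending case: the lower bound stays, the upper bound moves by Offset.
  unsigned NewLower = StartLower;
  unsigned NewUpper = (StartUpper - 1) + Offset; // 3 + 30 = 33
  assert(NewLower == 0 && NewUpper + 1 == 34);   // result range [0, 34)
  return 0;
}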
ConstantRange ScalarEvolution::getRangeForAffineAR(const SCEV *Start,
const SCEV *Step,
const SCEV *MaxBECount,
@@ -4676,60 +4801,30 @@ ConstantRange ScalarEvolution::getRangeForAffineAR(const SCEV *Start,
getTypeSizeInBits(MaxBECount->getType()) <= BitWidth &&
"Precondition!");
- ConstantRange Result(BitWidth, /* isFullSet = */ true);
-
- // Check for overflow. This must be done with ConstantRange arithmetic
- // because we could be called from within the ScalarEvolution overflow
- // checking code.
-
MaxBECount = getNoopOrZeroExtend(MaxBECount, Start->getType());
ConstantRange MaxBECountRange = getUnsignedRange(MaxBECount);
- ConstantRange ZExtMaxBECountRange = MaxBECountRange.zextOrTrunc(BitWidth * 2);
+ APInt MaxBECountValue = MaxBECountRange.getUnsignedMax();
+ // First, consider step signed.
+ ConstantRange StartSRange = getSignedRange(Start);
ConstantRange StepSRange = getSignedRange(Step);
- ConstantRange SExtStepSRange = StepSRange.sextOrTrunc(BitWidth * 2);
-
- ConstantRange StartURange = getUnsignedRange(Start);
- ConstantRange EndURange =
- StartURange.add(MaxBECountRange.multiply(StepSRange));
-
- // Check for unsigned overflow.
- ConstantRange ZExtStartURange = StartURange.zextOrTrunc(BitWidth * 2);
- ConstantRange ZExtEndURange = EndURange.zextOrTrunc(BitWidth * 2);
- if (ZExtStartURange.add(ZExtMaxBECountRange.multiply(SExtStepSRange)) ==
- ZExtEndURange) {
- APInt Min = APIntOps::umin(StartURange.getUnsignedMin(),
- EndURange.getUnsignedMin());
- APInt Max = APIntOps::umax(StartURange.getUnsignedMax(),
- EndURange.getUnsignedMax());
- bool IsFullRange = Min.isMinValue() && Max.isMaxValue();
- if (!IsFullRange)
- Result =
- Result.intersectWith(ConstantRange(Min, Max + 1));
- }
- ConstantRange StartSRange = getSignedRange(Start);
- ConstantRange EndSRange =
- StartSRange.add(MaxBECountRange.multiply(StepSRange));
-
- // Check for signed overflow. This must be done with ConstantRange
- // arithmetic because we could be called from within the ScalarEvolution
- // overflow checking code.
- ConstantRange SExtStartSRange = StartSRange.sextOrTrunc(BitWidth * 2);
- ConstantRange SExtEndSRange = EndSRange.sextOrTrunc(BitWidth * 2);
- if (SExtStartSRange.add(ZExtMaxBECountRange.multiply(SExtStepSRange)) ==
- SExtEndSRange) {
- APInt Min =
- APIntOps::smin(StartSRange.getSignedMin(), EndSRange.getSignedMin());
- APInt Max =
- APIntOps::smax(StartSRange.getSignedMax(), EndSRange.getSignedMax());
- bool IsFullRange = Min.isMinSignedValue() && Max.isMaxSignedValue();
- if (!IsFullRange)
- Result =
- Result.intersectWith(ConstantRange(Min, Max + 1));
- }
+ // If Step can be both positive and negative, we need to find ranges for the
+ // maximum absolute step values in both directions and union them.
+ ConstantRange SR =
+ getRangeForAffineARHelper(StepSRange.getSignedMin(), StartSRange,
+ MaxBECountValue, BitWidth, /* Signed = */ true);
+ SR = SR.unionWith(getRangeForAffineARHelper(StepSRange.getSignedMax(),
+ StartSRange, MaxBECountValue,
+ BitWidth, /* Signed = */ true));
- return Result;
+ // Next, consider step unsigned.
+ ConstantRange UR = getRangeForAffineARHelper(
+ getUnsignedRange(Step).getUnsignedMax(), getUnsignedRange(Start),
+ MaxBECountValue, BitWidth, /* Signed = */ false);
+
+ // Finally, intersect signed and unsigned ranges.
+ return SR.intersectWith(UR);
}
ConstantRange ScalarEvolution::getRangeViaFactoring(const SCEV *Start,
@@ -5148,12 +5243,27 @@ const SCEV *ScalarEvolution::createSCEV(Value *V) {
APInt EffectiveMask =
APInt::getLowBitsSet(BitWidth, BitWidth - LZ - TZ).shl(TZ);
if ((LZ != 0 || TZ != 0) && !((~A & ~KnownZero) & EffectiveMask)) {
- const SCEV *MulCount = getConstant(ConstantInt::get(
- getContext(), APInt::getOneBitSet(BitWidth, TZ)));
+ const SCEV *MulCount = getConstant(APInt::getOneBitSet(BitWidth, TZ));
+ const SCEV *LHS = getSCEV(BO->LHS);
+ const SCEV *ShiftedLHS = nullptr;
+ if (auto *LHSMul = dyn_cast<SCEVMulExpr>(LHS)) {
+ if (auto *OpC = dyn_cast<SCEVConstant>(LHSMul->getOperand(0))) {
+ // For an expression like (x * 8) & 8, simplify the multiply.
+ unsigned MulZeros = OpC->getAPInt().countTrailingZeros();
+ unsigned GCD = std::min(MulZeros, TZ);
+ APInt DivAmt = APInt::getOneBitSet(BitWidth, TZ - GCD);
+ SmallVector<const SCEV*, 4> MulOps;
+ MulOps.push_back(getConstant(OpC->getAPInt().lshr(GCD)));
+ MulOps.append(LHSMul->op_begin() + 1, LHSMul->op_end());
+ auto *NewMul = getMulExpr(MulOps, LHSMul->getNoWrapFlags());
+ ShiftedLHS = getUDivExpr(NewMul, getConstant(DivAmt));
+ }
+ }
+ if (!ShiftedLHS)
+ ShiftedLHS = getUDivExpr(LHS, MulCount);
return getMulExpr(
getZeroExtendExpr(
- getTruncateExpr(
- getUDivExactExpr(getSCEV(BO->LHS), MulCount),
+ getTruncateExpr(ShiftedLHS,
IntegerType::get(getContext(), BitWidth - LZ - TZ)),
BO->LHS->getType()),
MulCount);
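For the "(x * 8) & 8" shape the new branch simplifies, the masked product depends only on x's low bit. A quick exhaustive property check of that identity over 16-bit values (this checks the arithmetic fact, not the SCEV rewrite itself):

#include <cassert>
#include <cstdint>

int main() {
  // Bit 3 of X * 8 is bit 0 of X, so masking with 8 isolates X's low bit.
  for (uint32_t X = 0; X < (1u << 16); ++X)
    assert(((X * 8) & 8) == ((X & 1) * 8));
  return 0;
}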
@@ -5211,7 +5321,7 @@ const SCEV *ScalarEvolution::createSCEV(Value *V) {
// If C is a low-bits mask, the zero extend is serving to
// mask off the high bits. Complement the operand and
// re-apply the zext.
- if (APIntOps::isMask(Z0TySize, CI->getValue()))
+ if (CI->getValue().isMask(Z0TySize))
return getZeroExtendExpr(getNotSCEV(Z0), UTy);
// If C is a single bit, it may be in the sign-bit position
@@ -5255,28 +5365,55 @@ const SCEV *ScalarEvolution::createSCEV(Value *V) {
break;
case Instruction::AShr:
- // For a two-shift sext-inreg, use sext(trunc(x)) as the SCEV expression.
- if (ConstantInt *CI = dyn_cast<ConstantInt>(BO->RHS))
- if (Operator *L = dyn_cast<Operator>(BO->LHS))
- if (L->getOpcode() == Instruction::Shl &&
- L->getOperand(1) == BO->RHS) {
- uint64_t BitWidth = getTypeSizeInBits(BO->LHS->getType());
-
- // If the shift count is not less than the bitwidth, the result of
- // the shift is undefined. Don't try to analyze it, because the
- // resolution chosen here may differ from the resolution chosen in
- // other parts of the compiler.
- if (CI->getValue().uge(BitWidth))
- break;
+ // AShr X, C, where C is a constant.
+ ConstantInt *CI = dyn_cast<ConstantInt>(BO->RHS);
+ if (!CI)
+ break;
+
+ Type *OuterTy = BO->LHS->getType();
+ uint64_t BitWidth = getTypeSizeInBits(OuterTy);
+ // If the shift count is not less than the bitwidth, the result of
+ // the shift is undefined. Don't try to analyze it, because the
+ // resolution chosen here may differ from the resolution chosen in
+ // other parts of the compiler.
+ if (CI->getValue().uge(BitWidth))
+ break;
- uint64_t Amt = BitWidth - CI->getZExtValue();
- if (Amt == BitWidth)
- return getSCEV(L->getOperand(0)); // shift by zero --> noop
+ if (CI->isNullValue())
+ return getSCEV(BO->LHS); // shift by zero --> noop
+
+ uint64_t AShrAmt = CI->getZExtValue();
+ Type *TruncTy = IntegerType::get(getContext(), BitWidth - AShrAmt);
+
+ Operator *L = dyn_cast<Operator>(BO->LHS);
+ if (L && L->getOpcode() == Instruction::Shl) {
+ // X = Shl A, n
+ // Y = AShr X, m
+ // Both n and m are constant.
+
+ const SCEV *ShlOp0SCEV = getSCEV(L->getOperand(0));
+ if (L->getOperand(1) == BO->RHS)
+ // For a two-shift sext-inreg, i.e. n = m,
+ // use sext(trunc(x)) as the SCEV expression.
+ return getSignExtendExpr(
+ getTruncateExpr(ShlOp0SCEV, TruncTy), OuterTy);
+
+ ConstantInt *ShlAmtCI = dyn_cast<ConstantInt>(L->getOperand(1));
+ if (ShlAmtCI && ShlAmtCI->getValue().ult(BitWidth)) {
+ uint64_t ShlAmt = ShlAmtCI->getZExtValue();
+ if (ShlAmt > AShrAmt) {
+ // When n > m, use sext(mul(trunc(x), 2^(n-m))) as the SCEV
+ // expression. We already checked that ShlAmt < BitWidth, so
+ // the multiplier, 1 << (ShlAmt - AShrAmt), fits into TruncTy as
+ // ShlAmt - AShrAmt < BitWidth - AShrAmt.
+ APInt Mul = APInt::getOneBitSet(BitWidth - AShrAmt,
+ ShlAmt - AShrAmt);
return getSignExtendExpr(
- getTruncateExpr(getSCEV(L->getOperand(0)),
- IntegerType::get(getContext(), Amt)),
- BO->LHS->getType());
+ getMulExpr(getTruncateExpr(ShlOp0SCEV, TruncTy),
+ getConstant(Mul)), OuterTy);
}
+ }
+ }
break;
}
}
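The n > m case above can be sanity-checked exhaustively on i8. With n = 5 and m = 3, the claim is that (x << 5) >>s 3 equals sext(mul(trunc(x, i5), 2^(5-3)), i8); the sketch emulates the narrow i5 operations with masking and relies on the usual two's-complement narrowing (the concrete values are hypothetical):

#include <cassert>
#include <cstdint>

static int8_t viaShifts(int8_t X) {
  uint8_t Shl = (uint8_t)((uint8_t)X << 5); // shl wraps modulo 2^8
  return (int8_t)Shl >> 3;                  // arithmetic shift right
}

static int8_t viaSCEVForm(int8_t X) {
  uint8_t T = (uint8_t)X & 0x1F;            // trunc to i5
  T = (uint8_t)(T * 4) & 0x1F;              // mul by 2^(5-3), still in i5
  return (int8_t)(uint8_t)(T << 3) >> 3;    // sext i5 -> i8
}

int main() {
  for (int V = -128; V < 128; ++V)
    assert(viaShifts((int8_t)V) == viaSCEVForm((int8_t)V));
  return 0;
}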
@@ -5348,7 +5485,7 @@ static unsigned getConstantTripCount(const SCEVConstant *ExitCount) {
return ((unsigned)ExitConst->getZExtValue()) + 1;
}
-unsigned ScalarEvolution::getSmallConstantTripCount(Loop *L) {
+unsigned ScalarEvolution::getSmallConstantTripCount(const Loop *L) {
if (BasicBlock *ExitingBB = L->getExitingBlock())
return getSmallConstantTripCount(L, ExitingBB);
@@ -5356,7 +5493,7 @@ unsigned ScalarEvolution::getSmallConstantTripCount(Loop *L) {
return 0;
}
-unsigned ScalarEvolution::getSmallConstantTripCount(Loop *L,
+unsigned ScalarEvolution::getSmallConstantTripCount(const Loop *L,
BasicBlock *ExitingBlock) {
assert(ExitingBlock && "Must pass a non-null exiting block!");
assert(L->isLoopExiting(ExitingBlock) &&
@@ -5366,13 +5503,13 @@ unsigned ScalarEvolution::getSmallConstantTripCount(Loop *L,
return getConstantTripCount(ExitCount);
}
-unsigned ScalarEvolution::getSmallConstantMaxTripCount(Loop *L) {
+unsigned ScalarEvolution::getSmallConstantMaxTripCount(const Loop *L) {
const auto *MaxExitCount =
dyn_cast<SCEVConstant>(getMaxBackedgeTakenCount(L));
return getConstantTripCount(MaxExitCount);
}
-unsigned ScalarEvolution::getSmallConstantTripMultiple(Loop *L) {
+unsigned ScalarEvolution::getSmallConstantTripMultiple(const Loop *L) {
if (BasicBlock *ExitingBB = L->getExitingBlock())
return getSmallConstantTripMultiple(L, ExitingBB);
@@ -5393,7 +5530,7 @@ unsigned ScalarEvolution::getSmallConstantTripMultiple(Loop *L) {
/// As explained in the comments for getSmallConstantTripCount, this assumes
/// that control exits the loop via ExitingBlock.
unsigned
-ScalarEvolution::getSmallConstantTripMultiple(Loop *L,
+ScalarEvolution::getSmallConstantTripMultiple(const Loop *L,
BasicBlock *ExitingBlock) {
assert(ExitingBlock && "Must pass a non-null exiting block!");
assert(L->isLoopExiting(ExitingBlock) &&
@@ -5403,17 +5540,16 @@ ScalarEvolution::getSmallConstantTripMultiple(Loop *L,
return 1;
// Get the trip count from the BE count by adding 1.
- const SCEV *TCMul = getAddExpr(ExitCount, getOne(ExitCount->getType()));
- // FIXME: SCEV distributes multiplication as V1*C1 + V2*C1. We could attempt
- // to factor simple cases.
- if (const SCEVMulExpr *Mul = dyn_cast<SCEVMulExpr>(TCMul))
- TCMul = Mul->getOperand(0);
-
- const SCEVConstant *MulC = dyn_cast<SCEVConstant>(TCMul);
- if (!MulC)
- return 1;
+ const SCEV *TCExpr = getAddExpr(ExitCount, getOne(ExitCount->getType()));
- ConstantInt *Result = MulC->getValue();
+ const SCEVConstant *TC = dyn_cast<SCEVConstant>(TCExpr);
+ if (!TC)
+ // Attempt to factor more general cases. Return the greatest power-of-two
+ // divisor; even if the trip count computation overflowed, the expression
+ // is still divisible by the value returned.
+ return 1U << std::min((uint32_t)31, GetMinTrailingZeros(TCExpr));
+
+ ConstantInt *Result = TC->getValue();
// Guard against huge trip counts (this requires checking
// for zero to handle the case where the trip count == -1 and the
@@ -5428,7 +5564,8 @@ ScalarEvolution::getSmallConstantTripMultiple(Loop *L,
/// Get the expression for the number of loop iterations for which this loop is
/// guaranteed not to exit via ExitingBlock. Otherwise return
/// SCEVCouldNotCompute.
-const SCEV *ScalarEvolution::getExitCount(Loop *L, BasicBlock *ExitingBlock) {
+const SCEV *ScalarEvolution::getExitCount(const Loop *L,
+ BasicBlock *ExitingBlock) {
return getBackedgeTakenInfo(L).getExact(ExitingBlock, this);
}
@@ -6408,7 +6545,10 @@ static bool canConstantEvolve(Instruction *I, const Loop *L) {
/// recursing through each instruction operand until reaching a loop header phi.
static PHINode *
getConstantEvolvingPHIOperands(Instruction *UseInst, const Loop *L,
- DenseMap<Instruction *, PHINode *> &PHIMap) {
+ DenseMap<Instruction *, PHINode *> &PHIMap,
+ unsigned Depth) {
+ if (Depth > MaxConstantEvolvingDepth)
+ return nullptr;
// Otherwise, we can evaluate this instruction if all of its operands are
// constant or derived from a PHI node themselves.
@@ -6428,7 +6568,7 @@ getConstantEvolvingPHIOperands(Instruction *UseInst, const Loop *L,
if (!P) {
// Recurse and memoize the results, whether a phi is found or not.
// This recursive call invalidates pointers into PHIMap.
- P = getConstantEvolvingPHIOperands(OpInst, L, PHIMap);
+ P = getConstantEvolvingPHIOperands(OpInst, L, PHIMap, Depth + 1);
PHIMap[OpInst] = P;
}
if (!P)
@@ -6455,7 +6595,7 @@ static PHINode *getConstantEvolvingPHI(Value *V, const Loop *L) {
// Record non-constant instructions contained by the loop.
DenseMap<Instruction *, PHINode *> PHIMap;
- return getConstantEvolvingPHIOperands(I, L, PHIMap);
+ return getConstantEvolvingPHIOperands(I, L, PHIMap, 0);
}
/// EvaluateExpression - Given an expression that passes the
@@ -7014,10 +7154,10 @@ const SCEV *ScalarEvolution::getSCEVAtScope(Value *V, const Loop *L) {
/// A and B isn't important.
///
/// If the equation does not have a solution, SCEVCouldNotCompute is returned.
-static const SCEV *SolveLinEquationWithOverflow(const APInt &A, const APInt &B,
+static const SCEV *SolveLinEquationWithOverflow(const APInt &A, const SCEV *B,
ScalarEvolution &SE) {
uint32_t BW = A.getBitWidth();
- assert(BW == B.getBitWidth() && "Bit widths must be the same.");
+ assert(BW == SE.getTypeSizeInBits(B->getType()) &&
+        "Bit widths must be the same.");
assert(A != 0 && "A must be non-zero.");
// 1. D = gcd(A, N)
@@ -7031,7 +7171,7 @@ static const SCEV *SolveLinEquationWithOverflow(const APInt &A, const APInt &B,
//
// B is divisible by D if and only if the multiplicity of prime factor 2 for B
// is not less than multiplicity of this prime factor for D.
- if (B.countTrailingZeros() < Mult2)
+ if (SE.GetMinTrailingZeros(B) < Mult2)
return SE.getCouldNotCompute();
// 3. Compute I: the multiplicative inverse of (A / D) in arithmetic
@@ -7049,9 +7189,8 @@ static const SCEV *SolveLinEquationWithOverflow(const APInt &A, const APInt &B,
// I * (B / D) mod (N / D)
// To simplify the computation, we factor out the divide by D:
// (I * B mod N) / D
- APInt Result = (I * B).lshr(Mult2);
-
- return SE.getConstant(Result);
+ const SCEV *D = SE.getConstant(APInt::getOneBitSet(BW, Mult2));
+ return SE.getUDivExactExpr(SE.getMulExpr(B, SE.getConstant(I)), D);
}
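The solver's modular arithmetic, checked on a small concrete case: solve 6 * X = 4 (mod 16). Mult2 = 1 since A = 2 * 3, B has at least one trailing zero so a solution exists, and the answer comes out as (I * B mod N) / D, matching the comment above (all numbers hypothetical):

#include <cassert>
#include <cstdint>

int main() {
  const uint32_t N = 16;               // 2^W with W = 4
  const uint32_t A = 6, B = 4;
  const uint32_t Mult2 = 1;            // trailing zeros of A, so D = 2
  const uint32_t AD = A >> Mult2;      // A / D = 3, odd and thus invertible
  const uint32_t I = 11;               // 3 * 11 = 33 = 1 (mod 16)
  assert((AD * I) % N == 1);
  uint32_t X = ((I * B) % N) >> Mult2; // (I * B mod N) / D = 12 / 2 = 6
  assert((A * X) % N == B);            // 6 * 6 = 36 = 4 (mod 16)
  return 0;
}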
/// Find the roots of the quadratic equation for the given quadratic chrec
@@ -7082,7 +7221,7 @@ SolveQuadraticEquation(const SCEVAddRecExpr *AddRec, ScalarEvolution &SE) {
// Convert from chrec coefficients to polynomial coefficients AX^2+BX+C
// The B coefficient is M-N/2
APInt B(M);
- B -= sdiv(N,Two);
+ B -= N.sdiv(Two);
// The A coefficient is N/2
APInt A(N.sdiv(Two));
@@ -7233,62 +7372,6 @@ ScalarEvolution::howFarToZero(const SCEV *V, const Loop *L, bool ControlsExit,
return ExitLimit(Distance, getConstant(MaxBECount), false, Predicates);
}
- // As a special case, handle the instance where Step is a positive power of
- // two. In this case, determining whether Step divides Distance evenly can be
- // done by counting and comparing the number of trailing zeros of Step and
- // Distance.
- if (!CountDown) {
- const APInt &StepV = StepC->getAPInt();
- // StepV.isPowerOf2() returns true if StepV is an positive power of two. It
- // also returns true if StepV is maximally negative (eg, INT_MIN), but that
- // case is not handled as this code is guarded by !CountDown.
- if (StepV.isPowerOf2() &&
- GetMinTrailingZeros(Distance) >= StepV.countTrailingZeros()) {
- // Here we've constrained the equation to be of the form
- //
- // 2^(N + k) * Distance' = (StepV == 2^N) * X (mod 2^W) ... (0)
- //
- // where we're operating on a W bit wide integer domain and k is
- // non-negative. The smallest unsigned solution for X is the trip count.
- //
- // (0) is equivalent to:
- //
- // 2^(N + k) * Distance' - 2^N * X = L * 2^W
- // <=> 2^N(2^k * Distance' - X) = L * 2^(W - N) * 2^N
- // <=> 2^k * Distance' - X = L * 2^(W - N)
- // <=> 2^k * Distance' = L * 2^(W - N) + X ... (1)
- //
- // The smallest X satisfying (1) is unsigned remainder of dividing the LHS
- // by 2^(W - N).
- //
- // <=> X = 2^k * Distance' URem 2^(W - N) ... (2)
- //
- // E.g. say we're solving
- //
- // 2 * Val = 2 * X (in i8) ... (3)
- //
- // then from (2), we get X = Val URem i8 128 (k = 0 in this case).
- //
- // Note: It is tempting to solve (3) by setting X = Val, but Val is not
- // necessarily the smallest unsigned value of X that satisfies (3).
- // E.g. if Val is i8 -127 then the smallest value of X that satisfies (3)
- // is i8 1, not i8 -127
-
- const auto *ModuloResult = getUDivExactExpr(Distance, Step);
-
- // Since SCEV does not have a URem node, we construct one using a truncate
- // and a zero extend.
-
- unsigned NarrowWidth = StepV.getBitWidth() - StepV.countTrailingZeros();
- auto *NarrowTy = IntegerType::get(getContext(), NarrowWidth);
- auto *WideTy = Distance->getType();
-
- const SCEV *Limit =
- getZeroExtendExpr(getTruncateExpr(ModuloResult, NarrowTy), WideTy);
- return ExitLimit(Limit, Limit, false, Predicates);
- }
- }
-
// If the condition controls loop exit (the loop exits only if the expression
// is true) and the addition is no-wrap we can use unsigned divide to
// compute the backedge count. In this case, the step may not divide the
@@ -7301,13 +7384,10 @@ ScalarEvolution::howFarToZero(const SCEV *V, const Loop *L, bool ControlsExit,
return ExitLimit(Exact, Exact, false, Predicates);
}
- // Then, try to solve the above equation provided that Start is constant.
- if (const SCEVConstant *StartC = dyn_cast<SCEVConstant>(Start)) {
- const SCEV *E = SolveLinEquationWithOverflow(
- StepC->getValue()->getValue(), -StartC->getValue()->getValue(), *this);
- return ExitLimit(E, E, false, Predicates);
- }
- return getCouldNotCompute();
+ // Solve the general equation.
+ const SCEV *E = SolveLinEquationWithOverflow(
+ StepC->getAPInt(), getNegativeSCEV(Start), *this);
+ return ExitLimit(E, E, false, Predicates);
}
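
A concrete instance of the equation being solved (an illustrative example, not from the patch): for {-10,+,2}<%L> over i8, howFarToZero solves 2 * X == 10 (mod 256), and SolveLinEquationWithOverflow returns X = 5, the iteration at which the addrec first reaches zero.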
ScalarEvolution::ExitLimit
@@ -8488,19 +8568,161 @@ static bool IsKnownPredicateViaMinOrMax(ScalarEvolution &SE,
llvm_unreachable("covered switch fell through?!");
}
+bool ScalarEvolution::isImpliedViaOperations(ICmpInst::Predicate Pred,
+ const SCEV *LHS, const SCEV *RHS,
+ const SCEV *FoundLHS,
+ const SCEV *FoundRHS,
+ unsigned Depth) {
+ assert(getTypeSizeInBits(LHS->getType()) ==
+ getTypeSizeInBits(RHS->getType()) &&
+ "LHS and RHS have different sizes?");
+ assert(getTypeSizeInBits(FoundLHS->getType()) ==
+ getTypeSizeInBits(FoundRHS->getType()) &&
+ "FoundLHS and FoundRHS have different sizes?");
+ // We want to avoid spending too much compile time analyzing overly large trees.
+ if (Depth > MaxSCEVOperationsImplicationDepth)
+ return false;
+ // We only want to work with the ICMP_SGT comparison so far.
+ // TODO: Extend to ICMP_UGT?
+ if (Pred == ICmpInst::ICMP_SLT) {
+ Pred = ICmpInst::ICMP_SGT;
+ std::swap(LHS, RHS);
+ std::swap(FoundLHS, FoundRHS);
+ }
+ if (Pred != ICmpInst::ICMP_SGT)
+ return false;
+
+ auto GetOpFromSExt = [&](const SCEV *S) {
+ if (auto *Ext = dyn_cast<SCEVSignExtendExpr>(S))
+ return Ext->getOperand();
+ // TODO: If S is a SCEVConstant then you can cheaply "strip" the sext off
+ // the constant in some cases.
+ return S;
+ };
+
+ // Acquire values from extensions.
+ auto *OrigFoundLHS = FoundLHS;
+ LHS = GetOpFromSExt(LHS);
+ FoundLHS = GetOpFromSExt(FoundLHS);
+
+ // Whether the SGT predicate can be proved trivially or using the found context.
+ auto IsSGTViaContext = [&](const SCEV *S1, const SCEV *S2) {
+ return isKnownViaSimpleReasoning(ICmpInst::ICMP_SGT, S1, S2) ||
+ isImpliedViaOperations(ICmpInst::ICMP_SGT, S1, S2, OrigFoundLHS,
+ FoundRHS, Depth + 1);
+ };
+
+ if (auto *LHSAddExpr = dyn_cast<SCEVAddExpr>(LHS)) {
+ // We want to avoid creating any new non-constant SCEVs. Since we are going
+ // to compare the operands to RHS, we must be certain that no size
+ // extensions are needed, so we decline all cases where the types of LHS
+ // and RHS have different sizes.
+ // TODO: Maybe try to get RHS from sext to catch more cases?
+ if (getTypeSizeInBits(LHS->getType()) != getTypeSizeInBits(RHS->getType()))
+ return false;
+
+ // Should not overflow.
+ if (!LHSAddExpr->hasNoSignedWrap())
+ return false;
+
+ auto *LL = LHSAddExpr->getOperand(0);
+ auto *LR = LHSAddExpr->getOperand(1);
+ auto *MinusOne = getNegativeSCEV(getOne(RHS->getType()));
+
+ // Checks that S1 >= 0 && S2 > RHS, trivially or using the found context.
+ auto IsSumGreaterThanRHS = [&](const SCEV *S1, const SCEV *S2) {
+ return IsSGTViaContext(S1, MinusOne) && IsSGTViaContext(S2, RHS);
+ };
+ // Try to prove the following rules:
+ // (LHS = LL + LR) && (LL >= 0) && (LR > RHS) => (LHS > RHS).
+ // (LHS = LL + LR) && (LR >= 0) && (LL > RHS) => (LHS > RHS).
+ if (IsSumGreaterThanRHS(LL, LR) || IsSumGreaterThanRHS(LR, LL))
+ return true;
+ } else if (auto *LHSUnknownExpr = dyn_cast<SCEVUnknown>(LHS)) {
+ Value *LL, *LR;
+ // FIXME: Once we have SDiv implemented, we can get rid of this matching.
+ using namespace llvm::PatternMatch;
+ if (match(LHSUnknownExpr->getValue(), m_SDiv(m_Value(LL), m_Value(LR)))) {
+ // Rules for division.
+ // We are going to perform some comparisons with Denominator and its
+ // derivative expressions. In the general case, creating a SCEV for it may
+ // lead to a complex analysis of the entire graph, and in particular it can
+ // request trip count recalculation for the same loop, which would then be
+ // cached as SCEVCouldNotCompute to avoid infinite recursion. To avoid
+ // this, we only create SCEVs that are constants in this section. So we
+ // bail out if Denominator is not a constant.
+ if (!isa<ConstantInt>(LR))
+ return false;
+
+ auto *Denominator = cast<SCEVConstant>(getSCEV(LR));
+
+ // We want to make sure that LHS = FoundLHS / Denominator. If so, then a
+ // SCEV for the numerator already exists and matches FoundLHS.
+ auto *Numerator = getExistingSCEV(LL);
+ if (!Numerator || Numerator->getType() != FoundLHS->getType())
+ return false;
+
+ // Make sure that the numerator matches FoundLHS and that the denominator
+ // is positive.
+ if (!HasSameValue(Numerator, FoundLHS) || !isKnownPositive(Denominator))
+ return false;
+
+ auto *DTy = Denominator->getType();
+ auto *FRHSTy = FoundRHS->getType();
+ if (DTy->isPointerTy() != FRHSTy->isPointerTy())
+ // One of the types is a pointer and the other one is not. We cannot
+ // extend them properly to a wider type, so we just reject this case.
+ // TODO: Usage of getEffectiveSCEVType for DTy, FRHSTy etc should help
+ // to avoid this check.
+ return false;
+
+ // Given that:
+ // FoundLHS > FoundRHS, LHS = FoundLHS / Denominator, Denominator > 0.
+ auto *WTy = getWiderType(DTy, FRHSTy);
+ auto *DenominatorExt = getNoopOrSignExtend(Denominator, WTy);
+ auto *FoundRHSExt = getNoopOrSignExtend(FoundRHS, WTy);
+
+ // Try to prove the following rule:
+ // (FoundRHS > Denominator - 2) && (RHS <= 0) => (LHS > RHS).
+ // For example, given that FoundLHS > 2, FoundLHS is at least 3. If we
+ // divide it by a Denominator < 4, we will get at least 1.
+ auto *DenomMinusTwo = getMinusSCEV(DenominatorExt, getConstant(WTy, 2));
+ if (isKnownNonPositive(RHS) &&
+ IsSGTViaContext(FoundRHSExt, DenomMinusTwo))
+ return true;
+
+ // Try to prove the following rule:
+ // (FoundRHS > -1 - Denominator) && (RHS < 0) => (LHS > RHS).
+ // For example, given that FoundLHS > -3, FoundLHS is at least -2. If we
+ // divide it by Denominator > 2, then:
+ // 1. If FoundLHS is negative, the result is 0.
+ // 2. If FoundLHS is non-negative, the result is non-negative.
+ // Either way, the result is non-negative.
+ auto *MinusOne = getNegativeSCEV(getOne(WTy));
+ auto *NegDenomMinusOne = getMinusSCEV(MinusOne, DenominatorExt);
+ if (isKnownNegative(RHS) &&
+ IsSGTViaContext(FoundRHSExt, NegDenomMinusOne))
+ return true;
+ }
+ }
+
+ return false;
+}
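
The two division rules are easy to spot-check numerically. A hedged brute-force sketch (standalone C++, not part of the patch) over small signed values, using the same preconditions as the code above (FoundLHS > FoundRHS, Denominator > 0):

    #include <cassert>

    int main() {
      for (int Den = 1; Den <= 50; ++Den)
        for (int FoundLHS = -100; FoundLHS <= 100; ++FoundLHS)
          for (int FoundRHS = -100; FoundRHS <= 100; ++FoundRHS) {
            if (FoundLHS <= FoundRHS)
              continue;                  // context requires FoundLHS > FoundRHS
            int LHS = FoundLHS / Den;    // C++ '/' truncates like SCEV sdiv
            if (FoundRHS > Den - 2)
              assert(LHS >= 1);          // rule 1: LHS > any RHS <= 0
            if (FoundRHS > -1 - Den)
              assert(LHS >= 0);          // rule 2: LHS > any RHS < 0
          }
      return 0;
    }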
+
+bool
+ScalarEvolution::isKnownViaSimpleReasoning(ICmpInst::Predicate Pred,
+ const SCEV *LHS, const SCEV *RHS) {
+ return isKnownPredicateViaConstantRanges(Pred, LHS, RHS) ||
+ IsKnownPredicateViaMinOrMax(*this, Pred, LHS, RHS) ||
+ IsKnownPredicateViaAddRecStart(*this, Pred, LHS, RHS) ||
+ isKnownPredicateViaNoOverflow(Pred, LHS, RHS);
+}
+
bool
ScalarEvolution::isImpliedCondOperandsHelper(ICmpInst::Predicate Pred,
const SCEV *LHS, const SCEV *RHS,
const SCEV *FoundLHS,
const SCEV *FoundRHS) {
- auto IsKnownPredicateFull =
- [this](ICmpInst::Predicate Pred, const SCEV *LHS, const SCEV *RHS) {
- return isKnownPredicateViaConstantRanges(Pred, LHS, RHS) ||
- IsKnownPredicateViaMinOrMax(*this, Pred, LHS, RHS) ||
- IsKnownPredicateViaAddRecStart(*this, Pred, LHS, RHS) ||
- isKnownPredicateViaNoOverflow(Pred, LHS, RHS);
- };
-
switch (Pred) {
default: llvm_unreachable("Unexpected ICmpInst::Predicate value!");
case ICmpInst::ICMP_EQ:
@@ -8510,30 +8732,34 @@ ScalarEvolution::isImpliedCondOperandsHelper(ICmpInst::Predicate Pred,
break;
case ICmpInst::ICMP_SLT:
case ICmpInst::ICMP_SLE:
- if (IsKnownPredicateFull(ICmpInst::ICMP_SLE, LHS, FoundLHS) &&
- IsKnownPredicateFull(ICmpInst::ICMP_SGE, RHS, FoundRHS))
+ if (isKnownViaSimpleReasoning(ICmpInst::ICMP_SLE, LHS, FoundLHS) &&
+ isKnownViaSimpleReasoning(ICmpInst::ICMP_SGE, RHS, FoundRHS))
return true;
break;
case ICmpInst::ICMP_SGT:
case ICmpInst::ICMP_SGE:
- if (IsKnownPredicateFull(ICmpInst::ICMP_SGE, LHS, FoundLHS) &&
- IsKnownPredicateFull(ICmpInst::ICMP_SLE, RHS, FoundRHS))
+ if (isKnownViaSimpleReasoning(ICmpInst::ICMP_SGE, LHS, FoundLHS) &&
+ isKnownViaSimpleReasoning(ICmpInst::ICMP_SLE, RHS, FoundRHS))
return true;
break;
case ICmpInst::ICMP_ULT:
case ICmpInst::ICMP_ULE:
- if (IsKnownPredicateFull(ICmpInst::ICMP_ULE, LHS, FoundLHS) &&
- IsKnownPredicateFull(ICmpInst::ICMP_UGE, RHS, FoundRHS))
+ if (isKnownViaSimpleReasoning(ICmpInst::ICMP_ULE, LHS, FoundLHS) &&
+ isKnownViaSimpleReasoning(ICmpInst::ICMP_UGE, RHS, FoundRHS))
return true;
break;
case ICmpInst::ICMP_UGT:
case ICmpInst::ICMP_UGE:
- if (IsKnownPredicateFull(ICmpInst::ICMP_UGE, LHS, FoundLHS) &&
- IsKnownPredicateFull(ICmpInst::ICMP_ULE, RHS, FoundRHS))
+ if (isKnownViaSimpleReasoning(ICmpInst::ICMP_UGE, LHS, FoundLHS) &&
+ isKnownViaSimpleReasoning(ICmpInst::ICMP_ULE, RHS, FoundRHS))
return true;
break;
}
+ // Maybe it can be proved via operations?
+ if (isImpliedViaOperations(Pred, LHS, RHS, FoundLHS, FoundRHS))
+ return true;
+
return false;
}
@@ -9524,6 +9750,7 @@ ScalarEvolution::ScalarEvolution(ScalarEvolution &&Arg)
ValueExprMap(std::move(Arg.ValueExprMap)),
PendingLoopPredicates(std::move(Arg.PendingLoopPredicates)),
WalkingBEDominatingConds(false), ProvingSplitPredicate(false),
+ MinTrailingZerosCache(std::move(Arg.MinTrailingZerosCache)),
BackedgeTakenCounts(std::move(Arg.BackedgeTakenCounts)),
PredicatedBackedgeTakenCounts(
std::move(Arg.PredicatedBackedgeTakenCounts)),
@@ -9621,6 +9848,13 @@ static void PrintLoopInfo(raw_ostream &OS, ScalarEvolution *SE,
OS << "Unpredictable predicated backedge-taken count. ";
}
OS << "\n";
+
+ if (SE->hasLoopInvariantBackedgeTakenCount(L)) {
+ OS << "Loop ";
+ L->getHeader()->printAsOperand(OS, /*PrintType=*/false);
+ OS << ": ";
+ OS << "Trip multiple is " << SE->getSmallConstantTripMultiple(L) << "\n";
+ }
}
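
With this addition, the ScalarEvolution analysis printout for a loop whose backedge-taken count is loop-invariant also reports something like "Loop %header: Trip multiple is 4" when every execution is known to run a multiple of 4 iterations (the multiple is 1 when nothing better is known).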
static StringRef loopDispositionToStr(ScalarEvolution::LoopDisposition LD) {
@@ -9929,6 +10163,7 @@ void ScalarEvolution::forgetMemoizedResults(const SCEV *S) {
SignedRanges.erase(S);
ExprValueMap.erase(S);
HasRecMap.erase(S);
+ MinTrailingZerosCache.erase(S);
auto RemoveSCEVFromBackedgeMap =
[S, this](DenseMap<const Loop *, BackedgeTakenInfo> &Map) {
diff --git a/lib/Analysis/ScalarEvolutionExpander.cpp b/lib/Analysis/ScalarEvolutionExpander.cpp
index d15a7dbd20e6..6dd10441c4cb 100644
--- a/lib/Analysis/ScalarEvolutionExpander.cpp
+++ b/lib/Analysis/ScalarEvolutionExpander.cpp
@@ -1268,8 +1268,7 @@ Value *SCEVExpander::expandAddRecExprLiterally(const SCEVAddRecExpr *S) {
if (PostIncLoops.count(L)) {
PostIncLoopSet Loops;
Loops.insert(L);
- Normalized = cast<SCEVAddRecExpr>(TransformForPostIncUse(
- Normalize, S, nullptr, nullptr, Loops, SE, SE.DT));
+ Normalized = cast<SCEVAddRecExpr>(normalizeForPostIncUse(S, Loops, SE));
}
// Strip off any non-loop-dominating component from the addrec start.
diff --git a/lib/Analysis/ScalarEvolutionNormalization.cpp b/lib/Analysis/ScalarEvolutionNormalization.cpp
index c1f9503816ee..2aaa4c1ae117 100644
--- a/lib/Analysis/ScalarEvolutionNormalization.cpp
+++ b/lib/Analysis/ScalarEvolutionNormalization.cpp
@@ -12,243 +12,100 @@
//
//===----------------------------------------------------------------------===//
-#include "llvm/IR/Dominators.h"
#include "llvm/Analysis/LoopInfo.h"
#include "llvm/Analysis/ScalarEvolutionExpressions.h"
#include "llvm/Analysis/ScalarEvolutionNormalization.h"
using namespace llvm;
-/// IVUseShouldUsePostIncValue - We have discovered a "User" of an IV expression
-/// and now we need to decide whether the user should use the preinc or post-inc
-/// value. If this user should use the post-inc version of the IV, return true.
-///
-/// Choosing wrong here can break dominance properties (if we choose to use the
-/// post-inc value when we cannot) or it can end up adding extra live-ranges to
-/// the loop, resulting in reg-reg copies (if we use the pre-inc value when we
-/// should use the post-inc value).
-static bool IVUseShouldUsePostIncValue(Instruction *User, Value *Operand,
- const Loop *L, DominatorTree *DT) {
- // If the user is in the loop, use the preinc value.
- if (L->contains(User)) return false;
-
- BasicBlock *LatchBlock = L->getLoopLatch();
- if (!LatchBlock)
- return false;
-
- // Ok, the user is outside of the loop. If it is dominated by the latch
- // block, use the post-inc value.
- if (DT->dominates(LatchBlock, User->getParent()))
- return true;
-
- // There is one case we have to be careful of: PHI nodes. These little guys
- // can live in blocks that are not dominated by the latch block, but (since
- // their uses occur in the predecessor block, not the block the PHI lives in)
- // should still use the post-inc value. Check for this case now.
- PHINode *PN = dyn_cast<PHINode>(User);
- if (!PN || !Operand) return false; // not a phi, not dominated by latch block.
-
- // Look at all of the uses of Operand by the PHI node. If any use corresponds
- // to a block that is not dominated by the latch block, give up and use the
- // preincremented value.
- for (unsigned i = 0, e = PN->getNumIncomingValues(); i != e; ++i)
- if (PN->getIncomingValue(i) == Operand &&
- !DT->dominates(LatchBlock, PN->getIncomingBlock(i)))
- return false;
-
- // Okay, all uses of Operand by PN are in predecessor blocks that really are
- // dominated by the latch block. Use the post-incremented value.
- return true;
-}
+/// TransformKind - Different types of transformations that
+/// NormalizeDenormalizeRewriter can do.
+enum TransformKind {
+ /// Normalize - Normalize according to the given loops.
+ Normalize,
+ /// Denormalize - Perform the inverse transform on the expression with the
+ /// given loop set.
+ Denormalize
+};
namespace {
-
-/// Hold the state used during post-inc expression transformation, including a
-/// map of transformed expressions.
-class PostIncTransform {
- TransformKind Kind;
- PostIncLoopSet &Loops;
- ScalarEvolution &SE;
- DominatorTree &DT;
-
- DenseMap<const SCEV*, const SCEV*> Transformed;
-
-public:
- PostIncTransform(TransformKind kind, PostIncLoopSet &loops,
- ScalarEvolution &se, DominatorTree &dt):
- Kind(kind), Loops(loops), SE(se), DT(dt) {}
-
- const SCEV *TransformSubExpr(const SCEV *S, Instruction *User,
- Value *OperandValToReplace);
-
-protected:
- const SCEV *TransformImpl(const SCEV *S, Instruction *User,
- Value *OperandValToReplace);
+struct NormalizeDenormalizeRewriter
+ : public SCEVRewriteVisitor<NormalizeDenormalizeRewriter> {
+ const TransformKind Kind;
+
+ // NB! Pred is a function_ref. Storing it here is okay only because
+ // we're careful about the lifetime of NormalizeDenormalizeRewriter.
+ const NormalizePredTy Pred;
+
+ NormalizeDenormalizeRewriter(TransformKind Kind, NormalizePredTy Pred,
+ ScalarEvolution &SE)
+ : SCEVRewriteVisitor<NormalizeDenormalizeRewriter>(SE), Kind(Kind),
+ Pred(Pred) {}
+ const SCEV *visitAddRecExpr(const SCEVAddRecExpr *Expr);
};
-
} // namespace
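
The "NB!" above is worth illustrating: function_ref is non-owning, so a stored one is safe only while the underlying callable lives. A minimal hedged sketch (illustrative only, not LLVM code):

    #include "llvm/ADT/STLExtras.h"

    struct Holder {
      llvm::function_ref<bool(int)> Pred; // non-owning view, like the rewriter
    };

    bool allPositive(int X) {
      auto L = [](int V) { return V > 0; };
      Holder H{L};      // fine: L outlives H within this scope
      return H.Pred(X); // OK
    }
    // Returning H, or constructing it from a temporary lambda, would leave
    // Pred dangling -- NormalizeDenormalizeRewriter avoids this by being a
    // short-lived local in the normalize/denormalize entry points below.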
-/// Implement post-inc transformation for all valid expression types.
-const SCEV *PostIncTransform::
-TransformImpl(const SCEV *S, Instruction *User, Value *OperandValToReplace) {
-
- if (const SCEVCastExpr *X = dyn_cast<SCEVCastExpr>(S)) {
- const SCEV *O = X->getOperand();
- const SCEV *N = TransformSubExpr(O, User, OperandValToReplace);
- if (O != N)
- switch (S->getSCEVType()) {
- case scZeroExtend: return SE.getZeroExtendExpr(N, S->getType());
- case scSignExtend: return SE.getSignExtendExpr(N, S->getType());
- case scTruncate: return SE.getTruncateExpr(N, S->getType());
- default: llvm_unreachable("Unexpected SCEVCastExpr kind!");
- }
- return S;
- }
-
- if (const SCEVAddRecExpr *AR = dyn_cast<SCEVAddRecExpr>(S)) {
- // An addrec. This is the interesting part.
- SmallVector<const SCEV *, 8> Operands;
- const Loop *L = AR->getLoop();
- // The addrec conceptually uses its operands at loop entry.
- Instruction *LUser = &L->getHeader()->front();
- // Transform each operand.
- for (SCEVNAryExpr::op_iterator I = AR->op_begin(), E = AR->op_end();
- I != E; ++I) {
- Operands.push_back(TransformSubExpr(*I, LUser, nullptr));
+const SCEV *
+NormalizeDenormalizeRewriter::visitAddRecExpr(const SCEVAddRecExpr *AR) {
+ SmallVector<const SCEV *, 8> Operands;
+
+ transform(AR->operands(), std::back_inserter(Operands),
+ [&](const SCEV *Op) { return visit(Op); });
+
+ // Conservatively use AnyWrap until/unless we need FlagNW.
+ const SCEV *Result =
+ SE.getAddRecExpr(Operands, AR->getLoop(), SCEV::FlagAnyWrap);
+ switch (Kind) {
+ case Normalize:
+ // We want to normalize the step expression, because otherwise we might not
+ // be able to denormalize back to the original expression.
+ //
+ // Here is an example of what will happen if we don't normalize the step:
+ // ORIGINAL ISE:
+ // {(100 /u {1,+,1}<%bb16>),+,(100 /u {1,+,1}<%bb16>)}<%bb25>
+ // NORMALIZED ISE:
+ // {((-1 * (100 /u {1,+,1}<%bb16>)) + (100 /u {0,+,1}<%bb16>)),+,
+ // (100 /u {0,+,1}<%bb16>)}<%bb25>
+ // DENORMALIZED BACK ISE:
+ // {((2 * (100 /u {1,+,1}<%bb16>)) + (-1 * (100 /u {2,+,1}<%bb16>))),+,
+ // (100 /u {1,+,1}<%bb16>)}<%bb25>
+ // Note that the initial value changes after normalization +
+ // denormalization, which isn't correct.
+ if (Pred(AR)) {
+ const SCEV *TransformedStep = visit(AR->getStepRecurrence(SE));
+ Result = SE.getMinusSCEV(Result, TransformedStep);
}
- // Conservatively use AnyWrap until/unless we need FlagNW.
- const SCEV *Result = SE.getAddRecExpr(Operands, L, SCEV::FlagAnyWrap);
- switch (Kind) {
- case NormalizeAutodetect:
- // Normalize this SCEV by subtracting the expression for the final step.
- // We only allow affine AddRecs to be normalized, otherwise we would not
- // be able to correctly denormalize.
- // e.g. {1,+,3,+,2} == {-2,+,1,+,2} + {3,+,2}
- // Normalized form: {-2,+,1,+,2}
- // Denormalized form: {1,+,3,+,2}
- //
- // However, denormalization would use a different step expression than
- // normalization (see getPostIncExpr), generating the wrong final
- // expression: {-2,+,1,+,2} + {1,+,2} => {-1,+,3,+,2}
- if (AR->isAffine() &&
- IVUseShouldUsePostIncValue(User, OperandValToReplace, L, &DT)) {
- const SCEV *TransformedStep =
- TransformSubExpr(AR->getStepRecurrence(SE),
- User, OperandValToReplace);
- Result = SE.getMinusSCEV(Result, TransformedStep);
- Loops.insert(L);
- }
-#if 0
- // This assert is conceptually correct, but ScalarEvolution currently
- // sometimes fails to canonicalize two equal SCEVs to exactly the same
- // form. It's possibly a pessimization when this happens, but it isn't a
- // correctness problem, so disable this assert for now.
- assert(S == TransformSubExpr(Result, User, OperandValToReplace) &&
- "SCEV normalization is not invertible!");
-#endif
- break;
- case Normalize:
- // We want to normalize step expression, because otherwise we might not be
- // able to denormalize to the original expression.
- //
- // Here is an example what will happen if we don't normalize step:
- // ORIGINAL ISE:
- // {(100 /u {1,+,1}<%bb16>),+,(100 /u {1,+,1}<%bb16>)}<%bb25>
- // NORMALIZED ISE:
- // {((-1 * (100 /u {1,+,1}<%bb16>)) + (100 /u {0,+,1}<%bb16>)),+,
- // (100 /u {0,+,1}<%bb16>)}<%bb25>
- // DENORMALIZED BACK ISE:
- // {((2 * (100 /u {1,+,1}<%bb16>)) + (-1 * (100 /u {2,+,1}<%bb16>))),+,
- // (100 /u {1,+,1}<%bb16>)}<%bb25>
- // Note that the initial value changes after normalization +
- // denormalization, which isn't correct.
- if (Loops.count(L)) {
- const SCEV *TransformedStep =
- TransformSubExpr(AR->getStepRecurrence(SE),
- User, OperandValToReplace);
- Result = SE.getMinusSCEV(Result, TransformedStep);
- }
-#if 0
- // See the comment on the assert above.
- assert(S == TransformSubExpr(Result, User, OperandValToReplace) &&
- "SCEV normalization is not invertible!");
-#endif
- break;
- case Denormalize:
- // Here we want to normalize step expressions for the same reasons, as
- // stated above.
- if (Loops.count(L)) {
- const SCEV *TransformedStep =
- TransformSubExpr(AR->getStepRecurrence(SE),
- User, OperandValToReplace);
- Result = SE.getAddExpr(Result, TransformedStep);
- }
- break;
+ break;
+ case Denormalize:
+ // Here we want to normalize step expressions for the same reasons as
+ // stated above.
+ if (Pred(AR)) {
+ const SCEV *TransformedStep = visit(AR->getStepRecurrence(SE));
+ Result = SE.getAddExpr(Result, TransformedStep);
}
- return Result;
- }
-
- if (const SCEVNAryExpr *X = dyn_cast<SCEVNAryExpr>(S)) {
- SmallVector<const SCEV *, 8> Operands;
- bool Changed = false;
- // Transform each operand.
- for (SCEVNAryExpr::op_iterator I = X->op_begin(), E = X->op_end();
- I != E; ++I) {
- const SCEV *O = *I;
- const SCEV *N = TransformSubExpr(O, User, OperandValToReplace);
- Changed |= N != O;
- Operands.push_back(N);
- }
- // If any operand actually changed, return a transformed result.
- if (Changed)
- switch (S->getSCEVType()) {
- case scAddExpr: return SE.getAddExpr(Operands);
- case scMulExpr: return SE.getMulExpr(Operands);
- case scSMaxExpr: return SE.getSMaxExpr(Operands);
- case scUMaxExpr: return SE.getUMaxExpr(Operands);
- default: llvm_unreachable("Unexpected SCEVNAryExpr kind!");
- }
- return S;
- }
-
- if (const SCEVUDivExpr *X = dyn_cast<SCEVUDivExpr>(S)) {
- const SCEV *LO = X->getLHS();
- const SCEV *RO = X->getRHS();
- const SCEV *LN = TransformSubExpr(LO, User, OperandValToReplace);
- const SCEV *RN = TransformSubExpr(RO, User, OperandValToReplace);
- if (LO != LN || RO != RN)
- return SE.getUDivExpr(LN, RN);
- return S;
+ break;
}
-
- llvm_unreachable("Unexpected SCEV kind!");
+ return Result;
}
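
Concretely (an illustrative example in the chrec notation used above): normalizing the affine addrec {S,+,X}<%L> when Pred accepts %L subtracts the step, giving {(S - X),+,X}<%L>, and denormalizing adds it back. The step itself is rewritten by visit() first, which is what keeps the round trip exact when the step mentions another addrec, as in the /u example above.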
-/// Manage recursive transformation across an expression DAG. Revisiting
-/// expressions would lead to exponential recursion.
-const SCEV *PostIncTransform::
-TransformSubExpr(const SCEV *S, Instruction *User, Value *OperandValToReplace) {
-
- if (isa<SCEVConstant>(S) || isa<SCEVUnknown>(S))
- return S;
-
- const SCEV *Result = Transformed.lookup(S);
- if (Result)
- return Result;
+const SCEV *llvm::normalizeForPostIncUse(const SCEV *S,
+ const PostIncLoopSet &Loops,
+ ScalarEvolution &SE) {
+ auto Pred = [&](const SCEVAddRecExpr *AR) {
+ return Loops.count(AR->getLoop());
+ };
+ return NormalizeDenormalizeRewriter(Normalize, Pred, SE).visit(S);
+}
- Result = TransformImpl(S, User, OperandValToReplace);
- Transformed[S] = Result;
- return Result;
+const SCEV *llvm::normalizeForPostIncUseIf(const SCEV *S, NormalizePredTy Pred,
+ ScalarEvolution &SE) {
+ return NormalizeDenormalizeRewriter(Normalize, Pred, SE).visit(S);
}
-/// Top level driver for transforming an expression DAG into its requested
-/// post-inc form (either "Normalized" or "Denormalized").
-const SCEV *llvm::TransformForPostIncUse(TransformKind Kind,
- const SCEV *S,
- Instruction *User,
- Value *OperandValToReplace,
- PostIncLoopSet &Loops,
- ScalarEvolution &SE,
- DominatorTree &DT) {
- PostIncTransform Transform(Kind, Loops, SE, DT);
- return Transform.TransformSubExpr(S, User, OperandValToReplace);
+const SCEV *llvm::denormalizeForPostIncUse(const SCEV *S,
+ const PostIncLoopSet &Loops,
+ ScalarEvolution &SE) {
+ auto Pred = [&](const SCEVAddRecExpr *AR) {
+ return Loops.count(AR->getLoop());
+ };
+ return NormalizeDenormalizeRewriter(Denormalize, Pred, SE).visit(S);
}
diff --git a/lib/Analysis/SparsePropagation.cpp b/lib/Analysis/SparsePropagation.cpp
index 79dc84e25533..470f4bee1e0a 100644
--- a/lib/Analysis/SparsePropagation.cpp
+++ b/lib/Analysis/SparsePropagation.cpp
@@ -195,7 +195,7 @@ void SparseSolver::getFeasibleSuccessors(TerminatorInst &TI,
Succs.assign(TI.getNumSuccessors(), true);
return;
}
- SwitchInst::CaseIt Case = SI.findCaseValue(cast<ConstantInt>(C));
+ SwitchInst::CaseHandle Case = *SI.findCaseValue(cast<ConstantInt>(C));
Succs[Case.getSuccessorIndex()] = true;
}
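
The change reflects SwitchInst's newer case API: findCaseValue now returns an iterator whose dereference yields a CaseHandle. A minimal sketch of the shape (assuming an in-scope SwitchInst &SI and ConstantInt *C):

    auto CaseIt = SI.findCaseValue(C);            // iterator (CaseIt)
    SwitchInst::CaseHandle Case = *CaseIt;        // handle to the matched case
    unsigned SuccIdx = Case.getSuccessorIndex();  // index into successors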
diff --git a/lib/Analysis/TargetLibraryInfo.cpp b/lib/Analysis/TargetLibraryInfo.cpp
index 112118ab77eb..be734fa91425 100644
--- a/lib/Analysis/TargetLibraryInfo.cpp
+++ b/lib/Analysis/TargetLibraryInfo.cpp
@@ -82,24 +82,24 @@ static void initialize(TargetLibraryInfoImpl &TLI, const Triple &T,
if (T.getArch() == Triple::r600 ||
T.getArch() == Triple::amdgcn) {
- TLI.setUnavailable(LibFunc::ldexp);
- TLI.setUnavailable(LibFunc::ldexpf);
- TLI.setUnavailable(LibFunc::ldexpl);
- TLI.setUnavailable(LibFunc::exp10);
- TLI.setUnavailable(LibFunc::exp10f);
- TLI.setUnavailable(LibFunc::exp10l);
- TLI.setUnavailable(LibFunc::log10);
- TLI.setUnavailable(LibFunc::log10f);
- TLI.setUnavailable(LibFunc::log10l);
+ TLI.setUnavailable(LibFunc_ldexp);
+ TLI.setUnavailable(LibFunc_ldexpf);
+ TLI.setUnavailable(LibFunc_ldexpl);
+ TLI.setUnavailable(LibFunc_exp10);
+ TLI.setUnavailable(LibFunc_exp10f);
+ TLI.setUnavailable(LibFunc_exp10l);
+ TLI.setUnavailable(LibFunc_log10);
+ TLI.setUnavailable(LibFunc_log10f);
+ TLI.setUnavailable(LibFunc_log10l);
}
 // There are no library implementations of memcpy and memset for AMD GPUs and
// these can be difficult to lower in the backend.
if (T.getArch() == Triple::r600 ||
T.getArch() == Triple::amdgcn) {
- TLI.setUnavailable(LibFunc::memcpy);
- TLI.setUnavailable(LibFunc::memset);
- TLI.setUnavailable(LibFunc::memset_pattern16);
+ TLI.setUnavailable(LibFunc_memcpy);
+ TLI.setUnavailable(LibFunc_memset);
+ TLI.setUnavailable(LibFunc_memset_pattern16);
return;
}
@@ -107,21 +107,21 @@ static void initialize(TargetLibraryInfoImpl &TLI, const Triple &T,
// All versions of watchOS support it.
if (T.isMacOSX()) {
if (T.isMacOSXVersionLT(10, 5))
- TLI.setUnavailable(LibFunc::memset_pattern16);
+ TLI.setUnavailable(LibFunc_memset_pattern16);
} else if (T.isiOS()) {
if (T.isOSVersionLT(3, 0))
- TLI.setUnavailable(LibFunc::memset_pattern16);
+ TLI.setUnavailable(LibFunc_memset_pattern16);
} else if (!T.isWatchOS()) {
- TLI.setUnavailable(LibFunc::memset_pattern16);
+ TLI.setUnavailable(LibFunc_memset_pattern16);
}
if (!hasSinCosPiStret(T)) {
- TLI.setUnavailable(LibFunc::sinpi);
- TLI.setUnavailable(LibFunc::sinpif);
- TLI.setUnavailable(LibFunc::cospi);
- TLI.setUnavailable(LibFunc::cospif);
- TLI.setUnavailable(LibFunc::sincospi_stret);
- TLI.setUnavailable(LibFunc::sincospif_stret);
+ TLI.setUnavailable(LibFunc_sinpi);
+ TLI.setUnavailable(LibFunc_sinpif);
+ TLI.setUnavailable(LibFunc_cospi);
+ TLI.setUnavailable(LibFunc_cospif);
+ TLI.setUnavailable(LibFunc_sincospi_stret);
+ TLI.setUnavailable(LibFunc_sincospif_stret);
}
if (T.isMacOSX() && T.getArch() == Triple::x86 &&
@@ -131,179 +131,179 @@ static void initialize(TargetLibraryInfoImpl &TLI, const Triple &T,
// has a $UNIX2003 suffix. The two implementations are identical except
// for the return value in some edge cases. However, we don't want to
// generate code that depends on the old symbols.
- TLI.setAvailableWithName(LibFunc::fwrite, "fwrite$UNIX2003");
- TLI.setAvailableWithName(LibFunc::fputs, "fputs$UNIX2003");
+ TLI.setAvailableWithName(LibFunc_fwrite, "fwrite$UNIX2003");
+ TLI.setAvailableWithName(LibFunc_fputs, "fputs$UNIX2003");
}
// iprintf and friends are only available on XCore and TCE.
if (T.getArch() != Triple::xcore && T.getArch() != Triple::tce) {
- TLI.setUnavailable(LibFunc::iprintf);
- TLI.setUnavailable(LibFunc::siprintf);
- TLI.setUnavailable(LibFunc::fiprintf);
+ TLI.setUnavailable(LibFunc_iprintf);
+ TLI.setUnavailable(LibFunc_siprintf);
+ TLI.setUnavailable(LibFunc_fiprintf);
}
if (T.isOSWindows() && !T.isOSCygMing()) {
// Win32 does not support long double
- TLI.setUnavailable(LibFunc::acosl);
- TLI.setUnavailable(LibFunc::asinl);
- TLI.setUnavailable(LibFunc::atanl);
- TLI.setUnavailable(LibFunc::atan2l);
- TLI.setUnavailable(LibFunc::ceill);
- TLI.setUnavailable(LibFunc::copysignl);
- TLI.setUnavailable(LibFunc::cosl);
- TLI.setUnavailable(LibFunc::coshl);
- TLI.setUnavailable(LibFunc::expl);
- TLI.setUnavailable(LibFunc::fabsf); // Win32 and Win64 both lack fabsf
- TLI.setUnavailable(LibFunc::fabsl);
- TLI.setUnavailable(LibFunc::floorl);
- TLI.setUnavailable(LibFunc::fmaxl);
- TLI.setUnavailable(LibFunc::fminl);
- TLI.setUnavailable(LibFunc::fmodl);
- TLI.setUnavailable(LibFunc::frexpl);
- TLI.setUnavailable(LibFunc::ldexpf);
- TLI.setUnavailable(LibFunc::ldexpl);
- TLI.setUnavailable(LibFunc::logl);
- TLI.setUnavailable(LibFunc::modfl);
- TLI.setUnavailable(LibFunc::powl);
- TLI.setUnavailable(LibFunc::sinl);
- TLI.setUnavailable(LibFunc::sinhl);
- TLI.setUnavailable(LibFunc::sqrtl);
- TLI.setUnavailable(LibFunc::tanl);
- TLI.setUnavailable(LibFunc::tanhl);
+ TLI.setUnavailable(LibFunc_acosl);
+ TLI.setUnavailable(LibFunc_asinl);
+ TLI.setUnavailable(LibFunc_atanl);
+ TLI.setUnavailable(LibFunc_atan2l);
+ TLI.setUnavailable(LibFunc_ceill);
+ TLI.setUnavailable(LibFunc_copysignl);
+ TLI.setUnavailable(LibFunc_cosl);
+ TLI.setUnavailable(LibFunc_coshl);
+ TLI.setUnavailable(LibFunc_expl);
+ TLI.setUnavailable(LibFunc_fabsf); // Win32 and Win64 both lack fabsf
+ TLI.setUnavailable(LibFunc_fabsl);
+ TLI.setUnavailable(LibFunc_floorl);
+ TLI.setUnavailable(LibFunc_fmaxl);
+ TLI.setUnavailable(LibFunc_fminl);
+ TLI.setUnavailable(LibFunc_fmodl);
+ TLI.setUnavailable(LibFunc_frexpl);
+ TLI.setUnavailable(LibFunc_ldexpf);
+ TLI.setUnavailable(LibFunc_ldexpl);
+ TLI.setUnavailable(LibFunc_logl);
+ TLI.setUnavailable(LibFunc_modfl);
+ TLI.setUnavailable(LibFunc_powl);
+ TLI.setUnavailable(LibFunc_sinl);
+ TLI.setUnavailable(LibFunc_sinhl);
+ TLI.setUnavailable(LibFunc_sqrtl);
+ TLI.setUnavailable(LibFunc_tanl);
+ TLI.setUnavailable(LibFunc_tanhl);
// Win32 only has C89 math
- TLI.setUnavailable(LibFunc::acosh);
- TLI.setUnavailable(LibFunc::acoshf);
- TLI.setUnavailable(LibFunc::acoshl);
- TLI.setUnavailable(LibFunc::asinh);
- TLI.setUnavailable(LibFunc::asinhf);
- TLI.setUnavailable(LibFunc::asinhl);
- TLI.setUnavailable(LibFunc::atanh);
- TLI.setUnavailable(LibFunc::atanhf);
- TLI.setUnavailable(LibFunc::atanhl);
- TLI.setUnavailable(LibFunc::cbrt);
- TLI.setUnavailable(LibFunc::cbrtf);
- TLI.setUnavailable(LibFunc::cbrtl);
- TLI.setUnavailable(LibFunc::exp2);
- TLI.setUnavailable(LibFunc::exp2f);
- TLI.setUnavailable(LibFunc::exp2l);
- TLI.setUnavailable(LibFunc::expm1);
- TLI.setUnavailable(LibFunc::expm1f);
- TLI.setUnavailable(LibFunc::expm1l);
- TLI.setUnavailable(LibFunc::log2);
- TLI.setUnavailable(LibFunc::log2f);
- TLI.setUnavailable(LibFunc::log2l);
- TLI.setUnavailable(LibFunc::log1p);
- TLI.setUnavailable(LibFunc::log1pf);
- TLI.setUnavailable(LibFunc::log1pl);
- TLI.setUnavailable(LibFunc::logb);
- TLI.setUnavailable(LibFunc::logbf);
- TLI.setUnavailable(LibFunc::logbl);
- TLI.setUnavailable(LibFunc::nearbyint);
- TLI.setUnavailable(LibFunc::nearbyintf);
- TLI.setUnavailable(LibFunc::nearbyintl);
- TLI.setUnavailable(LibFunc::rint);
- TLI.setUnavailable(LibFunc::rintf);
- TLI.setUnavailable(LibFunc::rintl);
- TLI.setUnavailable(LibFunc::round);
- TLI.setUnavailable(LibFunc::roundf);
- TLI.setUnavailable(LibFunc::roundl);
- TLI.setUnavailable(LibFunc::trunc);
- TLI.setUnavailable(LibFunc::truncf);
- TLI.setUnavailable(LibFunc::truncl);
+ TLI.setUnavailable(LibFunc_acosh);
+ TLI.setUnavailable(LibFunc_acoshf);
+ TLI.setUnavailable(LibFunc_acoshl);
+ TLI.setUnavailable(LibFunc_asinh);
+ TLI.setUnavailable(LibFunc_asinhf);
+ TLI.setUnavailable(LibFunc_asinhl);
+ TLI.setUnavailable(LibFunc_atanh);
+ TLI.setUnavailable(LibFunc_atanhf);
+ TLI.setUnavailable(LibFunc_atanhl);
+ TLI.setUnavailable(LibFunc_cbrt);
+ TLI.setUnavailable(LibFunc_cbrtf);
+ TLI.setUnavailable(LibFunc_cbrtl);
+ TLI.setUnavailable(LibFunc_exp2);
+ TLI.setUnavailable(LibFunc_exp2f);
+ TLI.setUnavailable(LibFunc_exp2l);
+ TLI.setUnavailable(LibFunc_expm1);
+ TLI.setUnavailable(LibFunc_expm1f);
+ TLI.setUnavailable(LibFunc_expm1l);
+ TLI.setUnavailable(LibFunc_log2);
+ TLI.setUnavailable(LibFunc_log2f);
+ TLI.setUnavailable(LibFunc_log2l);
+ TLI.setUnavailable(LibFunc_log1p);
+ TLI.setUnavailable(LibFunc_log1pf);
+ TLI.setUnavailable(LibFunc_log1pl);
+ TLI.setUnavailable(LibFunc_logb);
+ TLI.setUnavailable(LibFunc_logbf);
+ TLI.setUnavailable(LibFunc_logbl);
+ TLI.setUnavailable(LibFunc_nearbyint);
+ TLI.setUnavailable(LibFunc_nearbyintf);
+ TLI.setUnavailable(LibFunc_nearbyintl);
+ TLI.setUnavailable(LibFunc_rint);
+ TLI.setUnavailable(LibFunc_rintf);
+ TLI.setUnavailable(LibFunc_rintl);
+ TLI.setUnavailable(LibFunc_round);
+ TLI.setUnavailable(LibFunc_roundf);
+ TLI.setUnavailable(LibFunc_roundl);
+ TLI.setUnavailable(LibFunc_trunc);
+ TLI.setUnavailable(LibFunc_truncf);
+ TLI.setUnavailable(LibFunc_truncl);
// Win32 provides some C99 math with mangled names
- TLI.setAvailableWithName(LibFunc::copysign, "_copysign");
+ TLI.setAvailableWithName(LibFunc_copysign, "_copysign");
if (T.getArch() == Triple::x86) {
// Win32 on x86 implements single-precision math functions as macros
- TLI.setUnavailable(LibFunc::acosf);
- TLI.setUnavailable(LibFunc::asinf);
- TLI.setUnavailable(LibFunc::atanf);
- TLI.setUnavailable(LibFunc::atan2f);
- TLI.setUnavailable(LibFunc::ceilf);
- TLI.setUnavailable(LibFunc::copysignf);
- TLI.setUnavailable(LibFunc::cosf);
- TLI.setUnavailable(LibFunc::coshf);
- TLI.setUnavailable(LibFunc::expf);
- TLI.setUnavailable(LibFunc::floorf);
- TLI.setUnavailable(LibFunc::fminf);
- TLI.setUnavailable(LibFunc::fmaxf);
- TLI.setUnavailable(LibFunc::fmodf);
- TLI.setUnavailable(LibFunc::logf);
- TLI.setUnavailable(LibFunc::log10f);
- TLI.setUnavailable(LibFunc::modff);
- TLI.setUnavailable(LibFunc::powf);
- TLI.setUnavailable(LibFunc::sinf);
- TLI.setUnavailable(LibFunc::sinhf);
- TLI.setUnavailable(LibFunc::sqrtf);
- TLI.setUnavailable(LibFunc::tanf);
- TLI.setUnavailable(LibFunc::tanhf);
+ TLI.setUnavailable(LibFunc_acosf);
+ TLI.setUnavailable(LibFunc_asinf);
+ TLI.setUnavailable(LibFunc_atanf);
+ TLI.setUnavailable(LibFunc_atan2f);
+ TLI.setUnavailable(LibFunc_ceilf);
+ TLI.setUnavailable(LibFunc_copysignf);
+ TLI.setUnavailable(LibFunc_cosf);
+ TLI.setUnavailable(LibFunc_coshf);
+ TLI.setUnavailable(LibFunc_expf);
+ TLI.setUnavailable(LibFunc_floorf);
+ TLI.setUnavailable(LibFunc_fminf);
+ TLI.setUnavailable(LibFunc_fmaxf);
+ TLI.setUnavailable(LibFunc_fmodf);
+ TLI.setUnavailable(LibFunc_logf);
+ TLI.setUnavailable(LibFunc_log10f);
+ TLI.setUnavailable(LibFunc_modff);
+ TLI.setUnavailable(LibFunc_powf);
+ TLI.setUnavailable(LibFunc_sinf);
+ TLI.setUnavailable(LibFunc_sinhf);
+ TLI.setUnavailable(LibFunc_sqrtf);
+ TLI.setUnavailable(LibFunc_tanf);
+ TLI.setUnavailable(LibFunc_tanhf);
}
 // Win32 does *not* provide these functions, but they are
// generally available on POSIX-compliant systems:
- TLI.setUnavailable(LibFunc::access);
- TLI.setUnavailable(LibFunc::bcmp);
- TLI.setUnavailable(LibFunc::bcopy);
- TLI.setUnavailable(LibFunc::bzero);
- TLI.setUnavailable(LibFunc::chmod);
- TLI.setUnavailable(LibFunc::chown);
- TLI.setUnavailable(LibFunc::closedir);
- TLI.setUnavailable(LibFunc::ctermid);
- TLI.setUnavailable(LibFunc::fdopen);
- TLI.setUnavailable(LibFunc::ffs);
- TLI.setUnavailable(LibFunc::fileno);
- TLI.setUnavailable(LibFunc::flockfile);
- TLI.setUnavailable(LibFunc::fseeko);
- TLI.setUnavailable(LibFunc::fstat);
- TLI.setUnavailable(LibFunc::fstatvfs);
- TLI.setUnavailable(LibFunc::ftello);
- TLI.setUnavailable(LibFunc::ftrylockfile);
- TLI.setUnavailable(LibFunc::funlockfile);
- TLI.setUnavailable(LibFunc::getc_unlocked);
- TLI.setUnavailable(LibFunc::getitimer);
- TLI.setUnavailable(LibFunc::getlogin_r);
- TLI.setUnavailable(LibFunc::getpwnam);
- TLI.setUnavailable(LibFunc::gettimeofday);
- TLI.setUnavailable(LibFunc::htonl);
- TLI.setUnavailable(LibFunc::htons);
- TLI.setUnavailable(LibFunc::lchown);
- TLI.setUnavailable(LibFunc::lstat);
- TLI.setUnavailable(LibFunc::memccpy);
- TLI.setUnavailable(LibFunc::mkdir);
- TLI.setUnavailable(LibFunc::ntohl);
- TLI.setUnavailable(LibFunc::ntohs);
- TLI.setUnavailable(LibFunc::open);
- TLI.setUnavailable(LibFunc::opendir);
- TLI.setUnavailable(LibFunc::pclose);
- TLI.setUnavailable(LibFunc::popen);
- TLI.setUnavailable(LibFunc::pread);
- TLI.setUnavailable(LibFunc::pwrite);
- TLI.setUnavailable(LibFunc::read);
- TLI.setUnavailable(LibFunc::readlink);
- TLI.setUnavailable(LibFunc::realpath);
- TLI.setUnavailable(LibFunc::rmdir);
- TLI.setUnavailable(LibFunc::setitimer);
- TLI.setUnavailable(LibFunc::stat);
- TLI.setUnavailable(LibFunc::statvfs);
- TLI.setUnavailable(LibFunc::stpcpy);
- TLI.setUnavailable(LibFunc::stpncpy);
- TLI.setUnavailable(LibFunc::strcasecmp);
- TLI.setUnavailable(LibFunc::strncasecmp);
- TLI.setUnavailable(LibFunc::times);
- TLI.setUnavailable(LibFunc::uname);
- TLI.setUnavailable(LibFunc::unlink);
- TLI.setUnavailable(LibFunc::unsetenv);
- TLI.setUnavailable(LibFunc::utime);
- TLI.setUnavailable(LibFunc::utimes);
- TLI.setUnavailable(LibFunc::write);
+ TLI.setUnavailable(LibFunc_access);
+ TLI.setUnavailable(LibFunc_bcmp);
+ TLI.setUnavailable(LibFunc_bcopy);
+ TLI.setUnavailable(LibFunc_bzero);
+ TLI.setUnavailable(LibFunc_chmod);
+ TLI.setUnavailable(LibFunc_chown);
+ TLI.setUnavailable(LibFunc_closedir);
+ TLI.setUnavailable(LibFunc_ctermid);
+ TLI.setUnavailable(LibFunc_fdopen);
+ TLI.setUnavailable(LibFunc_ffs);
+ TLI.setUnavailable(LibFunc_fileno);
+ TLI.setUnavailable(LibFunc_flockfile);
+ TLI.setUnavailable(LibFunc_fseeko);
+ TLI.setUnavailable(LibFunc_fstat);
+ TLI.setUnavailable(LibFunc_fstatvfs);
+ TLI.setUnavailable(LibFunc_ftello);
+ TLI.setUnavailable(LibFunc_ftrylockfile);
+ TLI.setUnavailable(LibFunc_funlockfile);
+ TLI.setUnavailable(LibFunc_getc_unlocked);
+ TLI.setUnavailable(LibFunc_getitimer);
+ TLI.setUnavailable(LibFunc_getlogin_r);
+ TLI.setUnavailable(LibFunc_getpwnam);
+ TLI.setUnavailable(LibFunc_gettimeofday);
+ TLI.setUnavailable(LibFunc_htonl);
+ TLI.setUnavailable(LibFunc_htons);
+ TLI.setUnavailable(LibFunc_lchown);
+ TLI.setUnavailable(LibFunc_lstat);
+ TLI.setUnavailable(LibFunc_memccpy);
+ TLI.setUnavailable(LibFunc_mkdir);
+ TLI.setUnavailable(LibFunc_ntohl);
+ TLI.setUnavailable(LibFunc_ntohs);
+ TLI.setUnavailable(LibFunc_open);
+ TLI.setUnavailable(LibFunc_opendir);
+ TLI.setUnavailable(LibFunc_pclose);
+ TLI.setUnavailable(LibFunc_popen);
+ TLI.setUnavailable(LibFunc_pread);
+ TLI.setUnavailable(LibFunc_pwrite);
+ TLI.setUnavailable(LibFunc_read);
+ TLI.setUnavailable(LibFunc_readlink);
+ TLI.setUnavailable(LibFunc_realpath);
+ TLI.setUnavailable(LibFunc_rmdir);
+ TLI.setUnavailable(LibFunc_setitimer);
+ TLI.setUnavailable(LibFunc_stat);
+ TLI.setUnavailable(LibFunc_statvfs);
+ TLI.setUnavailable(LibFunc_stpcpy);
+ TLI.setUnavailable(LibFunc_stpncpy);
+ TLI.setUnavailable(LibFunc_strcasecmp);
+ TLI.setUnavailable(LibFunc_strncasecmp);
+ TLI.setUnavailable(LibFunc_times);
+ TLI.setUnavailable(LibFunc_uname);
+ TLI.setUnavailable(LibFunc_unlink);
+ TLI.setUnavailable(LibFunc_unsetenv);
+ TLI.setUnavailable(LibFunc_utime);
+ TLI.setUnavailable(LibFunc_utimes);
+ TLI.setUnavailable(LibFunc_write);
 // Win32 does *not* provide these functions, but they are
// specified by C99:
- TLI.setUnavailable(LibFunc::atoll);
- TLI.setUnavailable(LibFunc::frexpf);
- TLI.setUnavailable(LibFunc::llabs);
+ TLI.setUnavailable(LibFunc_atoll);
+ TLI.setUnavailable(LibFunc_frexpf);
+ TLI.setUnavailable(LibFunc_llabs);
}
switch (T.getOS()) {
@@ -311,28 +311,28 @@ static void initialize(TargetLibraryInfoImpl &TLI, const Triple &T,
// exp10 and exp10f are not available on OS X until 10.9 and iOS until 7.0
// and their names are __exp10 and __exp10f. exp10l is not available on
// OS X or iOS.
- TLI.setUnavailable(LibFunc::exp10l);
+ TLI.setUnavailable(LibFunc_exp10l);
if (T.isMacOSXVersionLT(10, 9)) {
- TLI.setUnavailable(LibFunc::exp10);
- TLI.setUnavailable(LibFunc::exp10f);
+ TLI.setUnavailable(LibFunc_exp10);
+ TLI.setUnavailable(LibFunc_exp10f);
} else {
- TLI.setAvailableWithName(LibFunc::exp10, "__exp10");
- TLI.setAvailableWithName(LibFunc::exp10f, "__exp10f");
+ TLI.setAvailableWithName(LibFunc_exp10, "__exp10");
+ TLI.setAvailableWithName(LibFunc_exp10f, "__exp10f");
}
break;
case Triple::IOS:
case Triple::TvOS:
case Triple::WatchOS:
- TLI.setUnavailable(LibFunc::exp10l);
+ TLI.setUnavailable(LibFunc_exp10l);
if (!T.isWatchOS() && (T.isOSVersionLT(7, 0) ||
(T.isOSVersionLT(9, 0) &&
(T.getArch() == Triple::x86 ||
T.getArch() == Triple::x86_64)))) {
- TLI.setUnavailable(LibFunc::exp10);
- TLI.setUnavailable(LibFunc::exp10f);
+ TLI.setUnavailable(LibFunc_exp10);
+ TLI.setUnavailable(LibFunc_exp10f);
} else {
- TLI.setAvailableWithName(LibFunc::exp10, "__exp10");
- TLI.setAvailableWithName(LibFunc::exp10f, "__exp10f");
+ TLI.setAvailableWithName(LibFunc_exp10, "__exp10");
+ TLI.setAvailableWithName(LibFunc_exp10f, "__exp10f");
}
break;
case Triple::Linux:
@@ -344,9 +344,9 @@ static void initialize(TargetLibraryInfoImpl &TLI, const Triple &T,
// Fall through to disable all of them.
LLVM_FALLTHROUGH;
default:
- TLI.setUnavailable(LibFunc::exp10);
- TLI.setUnavailable(LibFunc::exp10f);
- TLI.setUnavailable(LibFunc::exp10l);
+ TLI.setUnavailable(LibFunc_exp10);
+ TLI.setUnavailable(LibFunc_exp10f);
+ TLI.setUnavailable(LibFunc_exp10l);
}
// ffsl is available on at least Darwin, Mac OS X, iOS, FreeBSD, and
@@ -364,7 +364,7 @@ static void initialize(TargetLibraryInfoImpl &TLI, const Triple &T,
case Triple::Linux:
break;
default:
- TLI.setUnavailable(LibFunc::ffsl);
+ TLI.setUnavailable(LibFunc_ffsl);
}
// ffsll is available on at least FreeBSD and Linux (GLIBC):
@@ -380,7 +380,7 @@ static void initialize(TargetLibraryInfoImpl &TLI, const Triple &T,
case Triple::Linux:
break;
default:
- TLI.setUnavailable(LibFunc::ffsll);
+ TLI.setUnavailable(LibFunc_ffsll);
}
// The following functions are available on at least FreeBSD:
@@ -388,30 +388,30 @@ static void initialize(TargetLibraryInfoImpl &TLI, const Triple &T,
// http://svn.freebsd.org/base/head/lib/libc/string/flsl.c
// http://svn.freebsd.org/base/head/lib/libc/string/flsll.c
if (!T.isOSFreeBSD()) {
- TLI.setUnavailable(LibFunc::fls);
- TLI.setUnavailable(LibFunc::flsl);
- TLI.setUnavailable(LibFunc::flsll);
+ TLI.setUnavailable(LibFunc_fls);
+ TLI.setUnavailable(LibFunc_flsl);
+ TLI.setUnavailable(LibFunc_flsll);
}
// The following functions are available on at least Linux:
if (!T.isOSLinux()) {
- TLI.setUnavailable(LibFunc::dunder_strdup);
- TLI.setUnavailable(LibFunc::dunder_strtok_r);
- TLI.setUnavailable(LibFunc::dunder_isoc99_scanf);
- TLI.setUnavailable(LibFunc::dunder_isoc99_sscanf);
- TLI.setUnavailable(LibFunc::under_IO_getc);
- TLI.setUnavailable(LibFunc::under_IO_putc);
- TLI.setUnavailable(LibFunc::memalign);
- TLI.setUnavailable(LibFunc::fopen64);
- TLI.setUnavailable(LibFunc::fseeko64);
- TLI.setUnavailable(LibFunc::fstat64);
- TLI.setUnavailable(LibFunc::fstatvfs64);
- TLI.setUnavailable(LibFunc::ftello64);
- TLI.setUnavailable(LibFunc::lstat64);
- TLI.setUnavailable(LibFunc::open64);
- TLI.setUnavailable(LibFunc::stat64);
- TLI.setUnavailable(LibFunc::statvfs64);
- TLI.setUnavailable(LibFunc::tmpfile64);
+ TLI.setUnavailable(LibFunc_dunder_strdup);
+ TLI.setUnavailable(LibFunc_dunder_strtok_r);
+ TLI.setUnavailable(LibFunc_dunder_isoc99_scanf);
+ TLI.setUnavailable(LibFunc_dunder_isoc99_sscanf);
+ TLI.setUnavailable(LibFunc_under_IO_getc);
+ TLI.setUnavailable(LibFunc_under_IO_putc);
+ TLI.setUnavailable(LibFunc_memalign);
+ TLI.setUnavailable(LibFunc_fopen64);
+ TLI.setUnavailable(LibFunc_fseeko64);
+ TLI.setUnavailable(LibFunc_fstat64);
+ TLI.setUnavailable(LibFunc_fstatvfs64);
+ TLI.setUnavailable(LibFunc_ftello64);
+ TLI.setUnavailable(LibFunc_lstat64);
+ TLI.setUnavailable(LibFunc_open64);
+ TLI.setUnavailable(LibFunc_stat64);
+ TLI.setUnavailable(LibFunc_statvfs64);
+ TLI.setUnavailable(LibFunc_tmpfile64);
}
// As currently implemented in clang, NVPTX code has no standard library to
@@ -427,9 +427,9 @@ static void initialize(TargetLibraryInfoImpl &TLI, const Triple &T,
// optimizations, so this situation should be fixed.
if (T.isNVPTX()) {
TLI.disableAllFunctions();
- TLI.setAvailable(LibFunc::nvvm_reflect);
+ TLI.setAvailable(LibFunc_nvvm_reflect);
} else {
- TLI.setUnavailable(LibFunc::nvvm_reflect);
+ TLI.setUnavailable(LibFunc_nvvm_reflect);
}
TLI.addVectorizableFunctionsFromVecLib(ClVectorLibrary);
@@ -500,9 +500,9 @@ static StringRef sanitizeFunctionName(StringRef funcName) {
}
bool TargetLibraryInfoImpl::getLibFunc(StringRef funcName,
- LibFunc::Func &F) const {
+ LibFunc &F) const {
StringRef const *Start = &StandardNames[0];
- StringRef const *End = &StandardNames[LibFunc::NumLibFuncs];
+ StringRef const *End = &StandardNames[NumLibFuncs];
funcName = sanitizeFunctionName(funcName);
if (funcName.empty())
@@ -513,14 +513,14 @@ bool TargetLibraryInfoImpl::getLibFunc(StringRef funcName,
return LHS < RHS;
});
if (I != End && *I == funcName) {
- F = (LibFunc::Func)(I - Start);
+ F = (LibFunc)(I - Start);
return true;
}
return false;
}
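
A hedged sketch of how a caller typically consumes this (TLI and Callee are assumed to be in scope; not part of the patch):

    LibFunc F; // note: a plain top-level enum after this change
    if (TLI.getLibFunc(Callee->getName(), F) && TLI.has(F)) {
      // F is e.g. LibFunc_memcpy; availability was checked per-target above.
    }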
bool TargetLibraryInfoImpl::isValidProtoForLibFunc(const FunctionType &FTy,
- LibFunc::Func F,
+ LibFunc F,
const DataLayout *DL) const {
LLVMContext &Ctx = FTy.getContext();
Type *PCharTy = Type::getInt8PtrTy(Ctx);
@@ -531,504 +531,660 @@ bool TargetLibraryInfoImpl::isValidProtoForLibFunc(const FunctionType &FTy,
unsigned NumParams = FTy.getNumParams();
switch (F) {
- case LibFunc::strlen:
+ case LibFunc_strlen:
return (NumParams == 1 && FTy.getParamType(0)->isPointerTy() &&
FTy.getReturnType()->isIntegerTy());
- case LibFunc::strchr:
- case LibFunc::strrchr:
+ case LibFunc_strchr:
+ case LibFunc_strrchr:
return (NumParams == 2 && FTy.getReturnType()->isPointerTy() &&
FTy.getParamType(0) == FTy.getReturnType() &&
FTy.getParamType(1)->isIntegerTy());
- case LibFunc::strtol:
- case LibFunc::strtod:
- case LibFunc::strtof:
- case LibFunc::strtoul:
- case LibFunc::strtoll:
- case LibFunc::strtold:
- case LibFunc::strtoull:
+ case LibFunc_strtol:
+ case LibFunc_strtod:
+ case LibFunc_strtof:
+ case LibFunc_strtoul:
+ case LibFunc_strtoll:
+ case LibFunc_strtold:
+ case LibFunc_strtoull:
return ((NumParams == 2 || NumParams == 3) &&
FTy.getParamType(0)->isPointerTy() &&
FTy.getParamType(1)->isPointerTy());
- case LibFunc::strcat:
+ case LibFunc_strcat:
return (NumParams == 2 && FTy.getReturnType()->isPointerTy() &&
FTy.getParamType(0) == FTy.getReturnType() &&
FTy.getParamType(1) == FTy.getReturnType());
- case LibFunc::strncat:
+ case LibFunc_strncat:
return (NumParams == 3 && FTy.getReturnType()->isPointerTy() &&
FTy.getParamType(0) == FTy.getReturnType() &&
FTy.getParamType(1) == FTy.getReturnType() &&
FTy.getParamType(2)->isIntegerTy());
- case LibFunc::strcpy_chk:
- case LibFunc::stpcpy_chk:
+ case LibFunc_strcpy_chk:
+ case LibFunc_stpcpy_chk:
--NumParams;
if (!IsSizeTTy(FTy.getParamType(NumParams)))
return false;
LLVM_FALLTHROUGH;
- case LibFunc::strcpy:
- case LibFunc::stpcpy:
+ case LibFunc_strcpy:
+ case LibFunc_stpcpy:
return (NumParams == 2 && FTy.getReturnType() == FTy.getParamType(0) &&
FTy.getParamType(0) == FTy.getParamType(1) &&
FTy.getParamType(0) == PCharTy);
- case LibFunc::strncpy_chk:
- case LibFunc::stpncpy_chk:
+ case LibFunc_strncpy_chk:
+ case LibFunc_stpncpy_chk:
--NumParams;
if (!IsSizeTTy(FTy.getParamType(NumParams)))
return false;
LLVM_FALLTHROUGH;
- case LibFunc::strncpy:
- case LibFunc::stpncpy:
+ case LibFunc_strncpy:
+ case LibFunc_stpncpy:
return (NumParams == 3 && FTy.getReturnType() == FTy.getParamType(0) &&
FTy.getParamType(0) == FTy.getParamType(1) &&
FTy.getParamType(0) == PCharTy &&
FTy.getParamType(2)->isIntegerTy());
- case LibFunc::strxfrm:
+ case LibFunc_strxfrm:
return (NumParams == 3 && FTy.getParamType(0)->isPointerTy() &&
FTy.getParamType(1)->isPointerTy());
- case LibFunc::strcmp:
+ case LibFunc_strcmp:
return (NumParams == 2 && FTy.getReturnType()->isIntegerTy(32) &&
FTy.getParamType(0)->isPointerTy() &&
FTy.getParamType(0) == FTy.getParamType(1));
- case LibFunc::strncmp:
+ case LibFunc_strncmp:
return (NumParams == 3 && FTy.getReturnType()->isIntegerTy(32) &&
FTy.getParamType(0)->isPointerTy() &&
FTy.getParamType(0) == FTy.getParamType(1) &&
FTy.getParamType(2)->isIntegerTy());
- case LibFunc::strspn:
- case LibFunc::strcspn:
+ case LibFunc_strspn:
+ case LibFunc_strcspn:
return (NumParams == 2 && FTy.getParamType(0)->isPointerTy() &&
FTy.getParamType(0) == FTy.getParamType(1) &&
FTy.getReturnType()->isIntegerTy());
- case LibFunc::strcoll:
- case LibFunc::strcasecmp:
- case LibFunc::strncasecmp:
+ case LibFunc_strcoll:
+ case LibFunc_strcasecmp:
+ case LibFunc_strncasecmp:
return (NumParams >= 2 && FTy.getParamType(0)->isPointerTy() &&
FTy.getParamType(1)->isPointerTy());
- case LibFunc::strstr:
+ case LibFunc_strstr:
return (NumParams == 2 && FTy.getReturnType()->isPointerTy() &&
FTy.getParamType(0)->isPointerTy() &&
FTy.getParamType(1)->isPointerTy());
- case LibFunc::strpbrk:
+ case LibFunc_strpbrk:
return (NumParams == 2 && FTy.getParamType(0)->isPointerTy() &&
FTy.getReturnType() == FTy.getParamType(0) &&
FTy.getParamType(0) == FTy.getParamType(1));
- case LibFunc::strtok:
- case LibFunc::strtok_r:
+ case LibFunc_strtok:
+ case LibFunc_strtok_r:
return (NumParams >= 2 && FTy.getParamType(1)->isPointerTy());
- case LibFunc::scanf:
- case LibFunc::setbuf:
- case LibFunc::setvbuf:
+ case LibFunc_scanf:
+ case LibFunc_setbuf:
+ case LibFunc_setvbuf:
return (NumParams >= 1 && FTy.getParamType(0)->isPointerTy());
- case LibFunc::strdup:
- case LibFunc::strndup:
+ case LibFunc_strdup:
+ case LibFunc_strndup:
return (NumParams >= 1 && FTy.getReturnType()->isPointerTy() &&
FTy.getParamType(0)->isPointerTy());
- case LibFunc::sscanf:
- case LibFunc::stat:
- case LibFunc::statvfs:
- case LibFunc::sprintf:
+ case LibFunc_sscanf:
+ case LibFunc_stat:
+ case LibFunc_statvfs:
+ case LibFunc_siprintf:
+ case LibFunc_sprintf:
return (NumParams >= 2 && FTy.getParamType(0)->isPointerTy() &&
FTy.getParamType(1)->isPointerTy());
- case LibFunc::snprintf:
+ case LibFunc_snprintf:
return (NumParams == 3 && FTy.getParamType(0)->isPointerTy() &&
FTy.getParamType(2)->isPointerTy());
- case LibFunc::setitimer:
+ case LibFunc_setitimer:
return (NumParams == 3 && FTy.getParamType(1)->isPointerTy() &&
FTy.getParamType(2)->isPointerTy());
- case LibFunc::system:
+ case LibFunc_system:
return (NumParams == 1 && FTy.getParamType(0)->isPointerTy());
- case LibFunc::malloc:
+ case LibFunc_malloc:
return (NumParams == 1 && FTy.getReturnType()->isPointerTy());
- case LibFunc::memcmp:
- return (NumParams == 3 && FTy.getParamType(0)->isPointerTy() &&
- FTy.getParamType(1)->isPointerTy() &&
- FTy.getReturnType()->isIntegerTy(32));
+ case LibFunc_memcmp:
+ return (NumParams == 3 && FTy.getReturnType()->isIntegerTy(32) &&
+ FTy.getParamType(0)->isPointerTy() &&
+ FTy.getParamType(1)->isPointerTy());
- case LibFunc::memchr:
- case LibFunc::memrchr:
- return (NumParams == 3 && FTy.getParamType(0)->isPointerTy() &&
+ case LibFunc_memchr:
+ case LibFunc_memrchr:
+ return (NumParams == 3 && FTy.getReturnType()->isPointerTy() &&
+ FTy.getReturnType() == FTy.getParamType(0) &&
FTy.getParamType(1)->isIntegerTy(32) &&
- FTy.getParamType(2)->isIntegerTy() &&
- FTy.getReturnType()->isPointerTy());
- case LibFunc::modf:
- case LibFunc::modff:
- case LibFunc::modfl:
+ IsSizeTTy(FTy.getParamType(2)));
+ case LibFunc_modf:
+ case LibFunc_modff:
+ case LibFunc_modfl:
return (NumParams >= 2 && FTy.getParamType(1)->isPointerTy());
- case LibFunc::memcpy_chk:
- case LibFunc::memmove_chk:
+ case LibFunc_memcpy_chk:
+ case LibFunc_memmove_chk:
--NumParams;
if (!IsSizeTTy(FTy.getParamType(NumParams)))
return false;
LLVM_FALLTHROUGH;
- case LibFunc::memcpy:
- case LibFunc::mempcpy:
- case LibFunc::memmove:
+ case LibFunc_memcpy:
+ case LibFunc_mempcpy:
+ case LibFunc_memmove:
return (NumParams == 3 && FTy.getReturnType() == FTy.getParamType(0) &&
FTy.getParamType(0)->isPointerTy() &&
FTy.getParamType(1)->isPointerTy() &&
IsSizeTTy(FTy.getParamType(2)));
- case LibFunc::memset_chk:
+ case LibFunc_memset_chk:
--NumParams;
if (!IsSizeTTy(FTy.getParamType(NumParams)))
return false;
LLVM_FALLTHROUGH;
- case LibFunc::memset:
+ case LibFunc_memset:
return (NumParams == 3 && FTy.getReturnType() == FTy.getParamType(0) &&
FTy.getParamType(0)->isPointerTy() &&
FTy.getParamType(1)->isIntegerTy() &&
IsSizeTTy(FTy.getParamType(2)));
- case LibFunc::memccpy:
+ case LibFunc_memccpy:
return (NumParams >= 2 && FTy.getParamType(1)->isPointerTy());
- case LibFunc::memalign:
+ case LibFunc_memalign:
return (FTy.getReturnType()->isPointerTy());
- case LibFunc::realloc:
- return (NumParams == 2 && FTy.getParamType(0)->isPointerTy() &&
- FTy.getReturnType()->isPointerTy());
- case LibFunc::read:
+ case LibFunc_realloc:
+ case LibFunc_reallocf:
+ return (NumParams == 2 && FTy.getReturnType() == PCharTy &&
+ FTy.getParamType(0) == FTy.getReturnType() &&
+ IsSizeTTy(FTy.getParamType(1)));
+ case LibFunc_read:
return (NumParams == 3 && FTy.getParamType(1)->isPointerTy());
- case LibFunc::rewind:
- case LibFunc::rmdir:
- case LibFunc::remove:
- case LibFunc::realpath:
+ case LibFunc_rewind:
+ case LibFunc_rmdir:
+ case LibFunc_remove:
+ case LibFunc_realpath:
return (NumParams >= 1 && FTy.getParamType(0)->isPointerTy());
- case LibFunc::rename:
+ case LibFunc_rename:
return (NumParams >= 2 && FTy.getParamType(0)->isPointerTy() &&
FTy.getParamType(1)->isPointerTy());
- case LibFunc::readlink:
+ case LibFunc_readlink:
return (NumParams >= 2 && FTy.getParamType(0)->isPointerTy() &&
FTy.getParamType(1)->isPointerTy());
- case LibFunc::write:
+ case LibFunc_write:
return (NumParams == 3 && FTy.getParamType(1)->isPointerTy());
- case LibFunc::bcopy:
- case LibFunc::bcmp:
+ case LibFunc_bcopy:
+ case LibFunc_bcmp:
return (NumParams == 3 && FTy.getParamType(0)->isPointerTy() &&
FTy.getParamType(1)->isPointerTy());
- case LibFunc::bzero:
+ case LibFunc_bzero:
return (NumParams == 2 && FTy.getParamType(0)->isPointerTy());
- case LibFunc::calloc:
+ case LibFunc_calloc:
return (NumParams == 2 && FTy.getReturnType()->isPointerTy());
- case LibFunc::atof:
- case LibFunc::atoi:
- case LibFunc::atol:
- case LibFunc::atoll:
- case LibFunc::ferror:
- case LibFunc::getenv:
- case LibFunc::getpwnam:
- case LibFunc::pclose:
- case LibFunc::perror:
- case LibFunc::printf:
- case LibFunc::puts:
- case LibFunc::uname:
- case LibFunc::under_IO_getc:
- case LibFunc::unlink:
- case LibFunc::unsetenv:
+ case LibFunc_atof:
+ case LibFunc_atoi:
+ case LibFunc_atol:
+ case LibFunc_atoll:
+ case LibFunc_ferror:
+ case LibFunc_getenv:
+ case LibFunc_getpwnam:
+ case LibFunc_iprintf:
+ case LibFunc_pclose:
+ case LibFunc_perror:
+ case LibFunc_printf:
+ case LibFunc_puts:
+ case LibFunc_uname:
+ case LibFunc_under_IO_getc:
+ case LibFunc_unlink:
+ case LibFunc_unsetenv:
return (NumParams == 1 && FTy.getParamType(0)->isPointerTy());
- case LibFunc::chmod:
- case LibFunc::chown:
- case LibFunc::clearerr:
- case LibFunc::closedir:
- case LibFunc::ctermid:
- case LibFunc::fclose:
- case LibFunc::feof:
- case LibFunc::fflush:
- case LibFunc::fgetc:
- case LibFunc::fileno:
- case LibFunc::flockfile:
- case LibFunc::free:
- case LibFunc::fseek:
- case LibFunc::fseeko64:
- case LibFunc::fseeko:
- case LibFunc::fsetpos:
- case LibFunc::ftell:
- case LibFunc::ftello64:
- case LibFunc::ftello:
- case LibFunc::ftrylockfile:
- case LibFunc::funlockfile:
- case LibFunc::getc:
- case LibFunc::getc_unlocked:
- case LibFunc::getlogin_r:
- case LibFunc::mkdir:
- case LibFunc::mktime:
- case LibFunc::times:
+ case LibFunc_access:
+ case LibFunc_chmod:
+ case LibFunc_chown:
+ case LibFunc_clearerr:
+ case LibFunc_closedir:
+ case LibFunc_ctermid:
+ case LibFunc_fclose:
+ case LibFunc_feof:
+ case LibFunc_fflush:
+ case LibFunc_fgetc:
+ case LibFunc_fileno:
+ case LibFunc_flockfile:
+ case LibFunc_free:
+ case LibFunc_fseek:
+ case LibFunc_fseeko64:
+ case LibFunc_fseeko:
+ case LibFunc_fsetpos:
+ case LibFunc_ftell:
+ case LibFunc_ftello64:
+ case LibFunc_ftello:
+ case LibFunc_ftrylockfile:
+ case LibFunc_funlockfile:
+ case LibFunc_getc:
+ case LibFunc_getc_unlocked:
+ case LibFunc_getlogin_r:
+ case LibFunc_mkdir:
+ case LibFunc_mktime:
+ case LibFunc_times:
return (NumParams != 0 && FTy.getParamType(0)->isPointerTy());
- case LibFunc::access:
- return (NumParams == 2 && FTy.getParamType(0)->isPointerTy());
- case LibFunc::fopen:
+ case LibFunc_fopen:
return (NumParams == 2 && FTy.getReturnType()->isPointerTy() &&
FTy.getParamType(0)->isPointerTy() &&
FTy.getParamType(1)->isPointerTy());
- case LibFunc::fdopen:
+ case LibFunc_fdopen:
return (NumParams == 2 && FTy.getReturnType()->isPointerTy() &&
FTy.getParamType(1)->isPointerTy());
- case LibFunc::fputc:
- case LibFunc::fstat:
- case LibFunc::frexp:
- case LibFunc::frexpf:
- case LibFunc::frexpl:
- case LibFunc::fstatvfs:
+ case LibFunc_fputc:
+ case LibFunc_fstat:
+ case LibFunc_frexp:
+ case LibFunc_frexpf:
+ case LibFunc_frexpl:
+ case LibFunc_fstatvfs:
return (NumParams == 2 && FTy.getParamType(1)->isPointerTy());
- case LibFunc::fgets:
+ case LibFunc_fgets:
return (NumParams == 3 && FTy.getParamType(0)->isPointerTy() &&
FTy.getParamType(2)->isPointerTy());
- case LibFunc::fread:
+ case LibFunc_fread:
return (NumParams == 4 && FTy.getParamType(0)->isPointerTy() &&
FTy.getParamType(3)->isPointerTy());
- case LibFunc::fwrite:
+ case LibFunc_fwrite:
return (NumParams == 4 && FTy.getReturnType()->isIntegerTy() &&
FTy.getParamType(0)->isPointerTy() &&
FTy.getParamType(1)->isIntegerTy() &&
FTy.getParamType(2)->isIntegerTy() &&
FTy.getParamType(3)->isPointerTy());
- case LibFunc::fputs:
+ case LibFunc_fputs:
return (NumParams >= 2 && FTy.getParamType(0)->isPointerTy() &&
FTy.getParamType(1)->isPointerTy());
- case LibFunc::fscanf:
- case LibFunc::fprintf:
- return (NumParams >= 2 && FTy.getParamType(0)->isPointerTy() &&
+ case LibFunc_fscanf:
+ case LibFunc_fiprintf:
+ case LibFunc_fprintf:
+ return (NumParams >= 2 && FTy.getReturnType()->isIntegerTy() &&
+ FTy.getParamType(0)->isPointerTy() &&
FTy.getParamType(1)->isPointerTy());
- case LibFunc::fgetpos:
+ case LibFunc_fgetpos:
return (NumParams >= 2 && FTy.getParamType(0)->isPointerTy() &&
FTy.getParamType(1)->isPointerTy());
- case LibFunc::gets:
- case LibFunc::getchar:
- case LibFunc::getitimer:
+ case LibFunc_getchar:
+ return (NumParams == 0 && FTy.getReturnType()->isIntegerTy());
+ case LibFunc_gets:
+ return (NumParams == 1 && FTy.getParamType(0) == PCharTy);
+ case LibFunc_getitimer:
return (NumParams == 2 && FTy.getParamType(1)->isPointerTy());
- case LibFunc::ungetc:
+ case LibFunc_ungetc:
return (NumParams == 2 && FTy.getParamType(1)->isPointerTy());
- case LibFunc::utime:
- case LibFunc::utimes:
+ case LibFunc_utime:
+ case LibFunc_utimes:
return (NumParams == 2 && FTy.getParamType(0)->isPointerTy() &&
FTy.getParamType(1)->isPointerTy());
- case LibFunc::putc:
+ case LibFunc_putc:
return (NumParams == 2 && FTy.getParamType(1)->isPointerTy());
- case LibFunc::pread:
- case LibFunc::pwrite:
+ case LibFunc_pread:
+ case LibFunc_pwrite:
return (NumParams == 4 && FTy.getParamType(1)->isPointerTy());
- case LibFunc::popen:
+ case LibFunc_popen:
return (NumParams == 2 && FTy.getReturnType()->isPointerTy() &&
FTy.getParamType(0)->isPointerTy() &&
FTy.getParamType(1)->isPointerTy());
- case LibFunc::vscanf:
+ case LibFunc_vscanf:
return (NumParams == 2 && FTy.getParamType(1)->isPointerTy());
- case LibFunc::vsscanf:
+ case LibFunc_vsscanf:
return (NumParams == 3 && FTy.getParamType(1)->isPointerTy() &&
FTy.getParamType(2)->isPointerTy());
- case LibFunc::vfscanf:
+ case LibFunc_vfscanf:
return (NumParams == 3 && FTy.getParamType(1)->isPointerTy() &&
FTy.getParamType(2)->isPointerTy());
- case LibFunc::valloc:
+ case LibFunc_valloc:
return (FTy.getReturnType()->isPointerTy());
- case LibFunc::vprintf:
+ case LibFunc_vprintf:
return (NumParams == 2 && FTy.getParamType(0)->isPointerTy());
- case LibFunc::vfprintf:
- case LibFunc::vsprintf:
+ case LibFunc_vfprintf:
+ case LibFunc_vsprintf:
return (NumParams == 3 && FTy.getParamType(0)->isPointerTy() &&
FTy.getParamType(1)->isPointerTy());
- case LibFunc::vsnprintf:
+ case LibFunc_vsnprintf:
return (NumParams == 4 && FTy.getParamType(0)->isPointerTy() &&
FTy.getParamType(2)->isPointerTy());
- case LibFunc::open:
+ case LibFunc_open:
return (NumParams >= 2 && FTy.getParamType(0)->isPointerTy());
- case LibFunc::opendir:
+ case LibFunc_opendir:
return (NumParams == 1 && FTy.getReturnType()->isPointerTy() &&
FTy.getParamType(0)->isPointerTy());
- case LibFunc::tmpfile:
+ case LibFunc_tmpfile:
return (FTy.getReturnType()->isPointerTy());
- case LibFunc::htonl:
- case LibFunc::htons:
- case LibFunc::ntohl:
- case LibFunc::ntohs:
- case LibFunc::lstat:
+ case LibFunc_htonl:
+ case LibFunc_ntohl:
+ return (NumParams == 1 && FTy.getReturnType()->isIntegerTy(32) &&
+ FTy.getReturnType() == FTy.getParamType(0));
+ case LibFunc_htons:
+ case LibFunc_ntohs:
+ return (NumParams == 1 && FTy.getReturnType()->isIntegerTy(16) &&
+ FTy.getReturnType() == FTy.getParamType(0));
+ case LibFunc_lstat:
return (NumParams == 2 && FTy.getParamType(0)->isPointerTy() &&
FTy.getParamType(1)->isPointerTy());
- case LibFunc::lchown:
+ case LibFunc_lchown:
return (NumParams == 3 && FTy.getParamType(0)->isPointerTy());
- case LibFunc::qsort:
+ case LibFunc_qsort:
return (NumParams == 4 && FTy.getParamType(3)->isPointerTy());
- case LibFunc::dunder_strdup:
- case LibFunc::dunder_strndup:
+ case LibFunc_dunder_strdup:
+ case LibFunc_dunder_strndup:
return (NumParams >= 1 && FTy.getReturnType()->isPointerTy() &&
FTy.getParamType(0)->isPointerTy());
- case LibFunc::dunder_strtok_r:
+ case LibFunc_dunder_strtok_r:
return (NumParams == 3 && FTy.getParamType(1)->isPointerTy());
- case LibFunc::under_IO_putc:
+ case LibFunc_under_IO_putc:
return (NumParams == 2 && FTy.getParamType(1)->isPointerTy());
- case LibFunc::dunder_isoc99_scanf:
+ case LibFunc_dunder_isoc99_scanf:
return (NumParams >= 1 && FTy.getParamType(0)->isPointerTy());
- case LibFunc::stat64:
- case LibFunc::lstat64:
- case LibFunc::statvfs64:
+ case LibFunc_stat64:
+ case LibFunc_lstat64:
+ case LibFunc_statvfs64:
return (NumParams == 2 && FTy.getParamType(0)->isPointerTy() &&
FTy.getParamType(1)->isPointerTy());
- case LibFunc::dunder_isoc99_sscanf:
+ case LibFunc_dunder_isoc99_sscanf:
return (NumParams >= 2 && FTy.getParamType(0)->isPointerTy() &&
FTy.getParamType(1)->isPointerTy());
- case LibFunc::fopen64:
+ case LibFunc_fopen64:
return (NumParams == 2 && FTy.getReturnType()->isPointerTy() &&
FTy.getParamType(0)->isPointerTy() &&
FTy.getParamType(1)->isPointerTy());
- case LibFunc::tmpfile64:
+ case LibFunc_tmpfile64:
return (FTy.getReturnType()->isPointerTy());
- case LibFunc::fstat64:
- case LibFunc::fstatvfs64:
+ case LibFunc_fstat64:
+ case LibFunc_fstatvfs64:
return (NumParams == 2 && FTy.getParamType(1)->isPointerTy());
- case LibFunc::open64:
+ case LibFunc_open64:
return (NumParams >= 2 && FTy.getParamType(0)->isPointerTy());
- case LibFunc::gettimeofday:
+ case LibFunc_gettimeofday:
return (NumParams == 2 && FTy.getParamType(0)->isPointerTy() &&
FTy.getParamType(1)->isPointerTy());
- case LibFunc::Znwj: // new(unsigned int);
- case LibFunc::Znwm: // new(unsigned long);
- case LibFunc::Znaj: // new[](unsigned int);
- case LibFunc::Znam: // new[](unsigned long);
- case LibFunc::msvc_new_int: // new(unsigned int);
- case LibFunc::msvc_new_longlong: // new(unsigned long long);
- case LibFunc::msvc_new_array_int: // new[](unsigned int);
- case LibFunc::msvc_new_array_longlong: // new[](unsigned long long);
- return (NumParams == 1);
-
- case LibFunc::memset_pattern16:
+ // new(unsigned int);
+ case LibFunc_Znwj:
+ // new(unsigned long);
+ case LibFunc_Znwm:
+ // new[](unsigned int);
+ case LibFunc_Znaj:
+ // new[](unsigned long);
+ case LibFunc_Znam:
+ // new(unsigned int);
+ case LibFunc_msvc_new_int:
+ // new(unsigned long long);
+ case LibFunc_msvc_new_longlong:
+ // new[](unsigned int);
+ case LibFunc_msvc_new_array_int:
+ // new[](unsigned long long);
+ case LibFunc_msvc_new_array_longlong:
+ return (NumParams == 1 && FTy.getReturnType()->isPointerTy());
+
+ // new(unsigned int, nothrow);
+ case LibFunc_ZnwjRKSt9nothrow_t:
+ // new(unsigned long, nothrow);
+ case LibFunc_ZnwmRKSt9nothrow_t:
+ // new[](unsigned int, nothrow);
+ case LibFunc_ZnajRKSt9nothrow_t:
+ // new[](unsigned long, nothrow);
+ case LibFunc_ZnamRKSt9nothrow_t:
+ // new(unsigned int, nothrow);
+ case LibFunc_msvc_new_int_nothrow:
+ // new(unsigned long long, nothrow);
+ case LibFunc_msvc_new_longlong_nothrow:
+ // new[](unsigned int, nothrow);
+ case LibFunc_msvc_new_array_int_nothrow:
+ // new[](unsigned long long, nothrow);
+ case LibFunc_msvc_new_array_longlong_nothrow:
+ return (NumParams == 2 && FTy.getReturnType()->isPointerTy());
+
+ // void operator delete[](void*);
+ case LibFunc_ZdaPv:
+ // void operator delete(void*);
+ case LibFunc_ZdlPv:
+ // void operator delete[](void*);
+ case LibFunc_msvc_delete_array_ptr32:
+ // void operator delete[](void*);
+ case LibFunc_msvc_delete_array_ptr64:
+ // void operator delete(void*);
+ case LibFunc_msvc_delete_ptr32:
+ // void operator delete(void*);
+ case LibFunc_msvc_delete_ptr64:
+ return (NumParams == 1 && FTy.getParamType(0)->isPointerTy());
+
+ // void operator delete[](void*, nothrow);
+ case LibFunc_ZdaPvRKSt9nothrow_t:
+ // void operator delete[](void*, unsigned int);
+ case LibFunc_ZdaPvj:
+ // void operator delete[](void*, unsigned long);
+ case LibFunc_ZdaPvm:
+ // void operator delete(void*, nothrow);
+ case LibFunc_ZdlPvRKSt9nothrow_t:
+ // void operator delete(void*, unsigned int);
+ case LibFunc_ZdlPvj:
+ // void operator delete(void*, unsigned long);
+ case LibFunc_ZdlPvm:
+ // void operator delete[](void*, unsigned int);
+ case LibFunc_msvc_delete_array_ptr32_int:
+ // void operator delete[](void*, nothrow);
+ case LibFunc_msvc_delete_array_ptr32_nothrow:
+ // void operator delete[](void*, unsigned long long);
+ case LibFunc_msvc_delete_array_ptr64_longlong:
+ // void operator delete[](void*, nothrow);
+ case LibFunc_msvc_delete_array_ptr64_nothrow:
+ // void operator delete(void*, unsigned int);
+ case LibFunc_msvc_delete_ptr32_int:
+ // void operator delete(void*, nothrow);
+ case LibFunc_msvc_delete_ptr32_nothrow:
+ // void operator delete(void*, unsigned long long);
+ case LibFunc_msvc_delete_ptr64_longlong:
+ // void operator delete(void*, nothrow);
+ case LibFunc_msvc_delete_ptr64_nothrow:
+ return (NumParams == 2 && FTy.getParamType(0)->isPointerTy());
+
+ case LibFunc_memset_pattern16:
return (!FTy.isVarArg() && NumParams == 3 &&
- isa<PointerType>(FTy.getParamType(0)) &&
- isa<PointerType>(FTy.getParamType(1)) &&
- isa<IntegerType>(FTy.getParamType(2)));
-
- // int __nvvm_reflect(const char *);
- case LibFunc::nvvm_reflect:
- return (NumParams == 1 && isa<PointerType>(FTy.getParamType(0)));
-
- case LibFunc::sin:
- case LibFunc::sinf:
- case LibFunc::sinl:
- case LibFunc::cos:
- case LibFunc::cosf:
- case LibFunc::cosl:
- case LibFunc::tan:
- case LibFunc::tanf:
- case LibFunc::tanl:
- case LibFunc::exp:
- case LibFunc::expf:
- case LibFunc::expl:
- case LibFunc::exp2:
- case LibFunc::exp2f:
- case LibFunc::exp2l:
- case LibFunc::log:
- case LibFunc::logf:
- case LibFunc::logl:
- case LibFunc::log10:
- case LibFunc::log10f:
- case LibFunc::log10l:
- case LibFunc::log2:
- case LibFunc::log2f:
- case LibFunc::log2l:
- case LibFunc::fabs:
- case LibFunc::fabsf:
- case LibFunc::fabsl:
- case LibFunc::floor:
- case LibFunc::floorf:
- case LibFunc::floorl:
- case LibFunc::ceil:
- case LibFunc::ceilf:
- case LibFunc::ceill:
- case LibFunc::trunc:
- case LibFunc::truncf:
- case LibFunc::truncl:
- case LibFunc::rint:
- case LibFunc::rintf:
- case LibFunc::rintl:
- case LibFunc::nearbyint:
- case LibFunc::nearbyintf:
- case LibFunc::nearbyintl:
- case LibFunc::round:
- case LibFunc::roundf:
- case LibFunc::roundl:
- case LibFunc::sqrt:
- case LibFunc::sqrtf:
- case LibFunc::sqrtl:
+ FTy.getParamType(0)->isPointerTy() &&
+ FTy.getParamType(1)->isPointerTy() &&
+ FTy.getParamType(2)->isIntegerTy());
+
+ case LibFunc_cxa_guard_abort:
+ case LibFunc_cxa_guard_acquire:
+ case LibFunc_cxa_guard_release:
+ case LibFunc_nvvm_reflect:
+ return (NumParams == 1 && FTy.getParamType(0)->isPointerTy());
+
+ case LibFunc_sincospi_stret:
+ case LibFunc_sincospif_stret:
+ return (NumParams == 1 && FTy.getParamType(0)->isFloatingPointTy());
+
+ case LibFunc_acos:
+ case LibFunc_acosf:
+ case LibFunc_acosh:
+ case LibFunc_acoshf:
+ case LibFunc_acoshl:
+ case LibFunc_acosl:
+ case LibFunc_asin:
+ case LibFunc_asinf:
+ case LibFunc_asinh:
+ case LibFunc_asinhf:
+ case LibFunc_asinhl:
+ case LibFunc_asinl:
+ case LibFunc_atan:
+ case LibFunc_atanf:
+ case LibFunc_atanh:
+ case LibFunc_atanhf:
+ case LibFunc_atanhl:
+ case LibFunc_atanl:
+ case LibFunc_cbrt:
+ case LibFunc_cbrtf:
+ case LibFunc_cbrtl:
+ case LibFunc_ceil:
+ case LibFunc_ceilf:
+ case LibFunc_ceill:
+ case LibFunc_cos:
+ case LibFunc_cosf:
+ case LibFunc_cosh:
+ case LibFunc_coshf:
+ case LibFunc_coshl:
+ case LibFunc_cosl:
+ case LibFunc_exp10:
+ case LibFunc_exp10f:
+ case LibFunc_exp10l:
+ case LibFunc_exp2:
+ case LibFunc_exp2f:
+ case LibFunc_exp2l:
+ case LibFunc_exp:
+ case LibFunc_expf:
+ case LibFunc_expl:
+ case LibFunc_expm1:
+ case LibFunc_expm1f:
+ case LibFunc_expm1l:
+ case LibFunc_fabs:
+ case LibFunc_fabsf:
+ case LibFunc_fabsl:
+ case LibFunc_floor:
+ case LibFunc_floorf:
+ case LibFunc_floorl:
+ case LibFunc_log10:
+ case LibFunc_log10f:
+ case LibFunc_log10l:
+ case LibFunc_log1p:
+ case LibFunc_log1pf:
+ case LibFunc_log1pl:
+ case LibFunc_log2:
+ case LibFunc_log2f:
+ case LibFunc_log2l:
+ case LibFunc_log:
+ case LibFunc_logb:
+ case LibFunc_logbf:
+ case LibFunc_logbl:
+ case LibFunc_logf:
+ case LibFunc_logl:
+ case LibFunc_nearbyint:
+ case LibFunc_nearbyintf:
+ case LibFunc_nearbyintl:
+ case LibFunc_rint:
+ case LibFunc_rintf:
+ case LibFunc_rintl:
+ case LibFunc_round:
+ case LibFunc_roundf:
+ case LibFunc_roundl:
+ case LibFunc_sin:
+ case LibFunc_sinf:
+ case LibFunc_sinh:
+ case LibFunc_sinhf:
+ case LibFunc_sinhl:
+ case LibFunc_sinl:
+ case LibFunc_sqrt:
+ case LibFunc_sqrt_finite:
+ case LibFunc_sqrtf:
+ case LibFunc_sqrtf_finite:
+ case LibFunc_sqrtl:
+ case LibFunc_sqrtl_finite:
+ case LibFunc_tan:
+ case LibFunc_tanf:
+ case LibFunc_tanh:
+ case LibFunc_tanhf:
+ case LibFunc_tanhl:
+ case LibFunc_tanl:
+ case LibFunc_trunc:
+ case LibFunc_truncf:
+ case LibFunc_truncl:
return (NumParams == 1 && FTy.getReturnType()->isFloatingPointTy() &&
FTy.getReturnType() == FTy.getParamType(0));
- case LibFunc::fmin:
- case LibFunc::fminf:
- case LibFunc::fminl:
- case LibFunc::fmax:
- case LibFunc::fmaxf:
- case LibFunc::fmaxl:
- case LibFunc::copysign:
- case LibFunc::copysignf:
- case LibFunc::copysignl:
- case LibFunc::pow:
- case LibFunc::powf:
- case LibFunc::powl:
+ case LibFunc_atan2:
+ case LibFunc_atan2f:
+ case LibFunc_atan2l:
+ case LibFunc_fmin:
+ case LibFunc_fminf:
+ case LibFunc_fminl:
+ case LibFunc_fmax:
+ case LibFunc_fmaxf:
+ case LibFunc_fmaxl:
+ case LibFunc_fmod:
+ case LibFunc_fmodf:
+ case LibFunc_fmodl:
+ case LibFunc_copysign:
+ case LibFunc_copysignf:
+ case LibFunc_copysignl:
+ case LibFunc_pow:
+ case LibFunc_powf:
+ case LibFunc_powl:
return (NumParams == 2 && FTy.getReturnType()->isFloatingPointTy() &&
FTy.getReturnType() == FTy.getParamType(0) &&
FTy.getReturnType() == FTy.getParamType(1));
- case LibFunc::ffs:
- case LibFunc::ffsl:
- case LibFunc::ffsll:
- case LibFunc::fls:
- case LibFunc::flsl:
- case LibFunc::flsll:
+ case LibFunc_ldexp:
+ case LibFunc_ldexpf:
+ case LibFunc_ldexpl:
+ return (NumParams == 2 && FTy.getReturnType()->isFloatingPointTy() &&
+ FTy.getReturnType() == FTy.getParamType(0) &&
+ FTy.getParamType(1)->isIntegerTy(32));
+
+ case LibFunc_ffs:
+ case LibFunc_ffsl:
+ case LibFunc_ffsll:
+ case LibFunc_fls:
+ case LibFunc_flsl:
+ case LibFunc_flsll:
return (NumParams == 1 && FTy.getReturnType()->isIntegerTy(32) &&
FTy.getParamType(0)->isIntegerTy());
- case LibFunc::isdigit:
- case LibFunc::isascii:
- case LibFunc::toascii:
+ case LibFunc_isdigit:
+ case LibFunc_isascii:
+ case LibFunc_toascii:
+ case LibFunc_putchar:
return (NumParams == 1 && FTy.getReturnType()->isIntegerTy(32) &&
FTy.getReturnType() == FTy.getParamType(0));
- case LibFunc::abs:
- case LibFunc::labs:
- case LibFunc::llabs:
+ case LibFunc_abs:
+ case LibFunc_labs:
+ case LibFunc_llabs:
return (NumParams == 1 && FTy.getReturnType()->isIntegerTy() &&
FTy.getReturnType() == FTy.getParamType(0));
- case LibFunc::cxa_atexit:
+ case LibFunc_cxa_atexit:
return (NumParams == 3 && FTy.getReturnType()->isIntegerTy() &&
FTy.getParamType(0)->isPointerTy() &&
FTy.getParamType(1)->isPointerTy() &&
FTy.getParamType(2)->isPointerTy());
- case LibFunc::sinpi:
- case LibFunc::cospi:
+ case LibFunc_sinpi:
+ case LibFunc_cospi:
return (NumParams == 1 && FTy.getReturnType()->isDoubleTy() &&
FTy.getReturnType() == FTy.getParamType(0));
- case LibFunc::sinpif:
- case LibFunc::cospif:
+ case LibFunc_sinpif:
+ case LibFunc_cospif:
return (NumParams == 1 && FTy.getReturnType()->isFloatTy() &&
FTy.getReturnType() == FTy.getParamType(0));
- default:
- // Assume the other functions are correct.
- // FIXME: It'd be really nice to cover them all.
- return true;
+ case LibFunc_strnlen:
+ return (NumParams == 2 && FTy.getReturnType() == FTy.getParamType(1) &&
+ FTy.getParamType(0) == PCharTy &&
+ FTy.getParamType(1) == SizeTTy);
+
+ case LibFunc_posix_memalign:
+ return (NumParams == 3 && FTy.getReturnType()->isIntegerTy(32) &&
+ FTy.getParamType(0)->isPointerTy() &&
+ FTy.getParamType(1) == SizeTTy && FTy.getParamType(2) == SizeTTy);
+
+ case LibFunc::NumLibFuncs:
+ break;
}
+
+ llvm_unreachable("Invalid libfunc");
}
bool TargetLibraryInfoImpl::getLibFunc(const Function &FDecl,
- LibFunc::Func &F) const {
+ LibFunc &F) const {
const DataLayout *DL =
FDecl.getParent() ? &FDecl.getParent()->getDataLayout() : nullptr;
return getLibFunc(FDecl.getName(), F) &&
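An aside for readers tracking the rename: a minimal caller-side sketch after this patch, assuming a Function and a precomputed TargetLibraryInfo (isPrintfCall is a hypothetical helper, not part of the change):

#include "llvm/Analysis/TargetLibraryInfo.h"
#include "llvm/IR/Function.h"
using namespace llvm;

// The enumerators moved from LibFunc::printf to LibFunc_printf, and the
// out-parameter is now the plain LibFunc enum instead of LibFunc::Func.
static bool isPrintfCall(const Function &F, const TargetLibraryInfo &TLI) {
  LibFunc TheLibFunc;
  return TLI.getLibFunc(F, TheLibFunc) && TheLibFunc == LibFunc_printf;
}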
diff --git a/lib/Analysis/TargetTransformInfo.cpp b/lib/Analysis/TargetTransformInfo.cpp
index 5c0d1aac1b98..d73b1a128031 100644
--- a/lib/Analysis/TargetTransformInfo.cpp
+++ b/lib/Analysis/TargetTransformInfo.cpp
@@ -97,6 +97,10 @@ bool TargetTransformInfo::isSourceOfDivergence(const Value *V) const {
return TTIImpl->isSourceOfDivergence(V);
}
+unsigned TargetTransformInfo::getFlatAddressSpace() const {
+ return TTIImpl->getFlatAddressSpace();
+}
+
bool TargetTransformInfo::isLoweredToCall(const Function *F) const {
return TTIImpl->isLoweredToCall(F);
}
@@ -182,6 +186,21 @@ bool TargetTransformInfo::shouldBuildLookupTablesForConstant(Constant *C) const
return TTIImpl->shouldBuildLookupTablesForConstant(C);
}
+unsigned TargetTransformInfo::
+getScalarizationOverhead(Type *Ty, bool Insert, bool Extract) const {
+ return TTIImpl->getScalarizationOverhead(Ty, Insert, Extract);
+}
+
+unsigned TargetTransformInfo::
+getOperandsScalarizationOverhead(ArrayRef<const Value *> Args,
+ unsigned VF) const {
+ return TTIImpl->getOperandsScalarizationOverhead(Args, VF);
+}
+
+bool TargetTransformInfo::supportsEfficientVectorElementLoadStore() const {
+ return TTIImpl->supportsEfficientVectorElementLoadStore();
+}
+
bool TargetTransformInfo::enableAggressiveInterleaving(bool LoopHasReductions) const {
return TTIImpl->enableAggressiveInterleaving(LoopHasReductions);
}
@@ -254,6 +273,12 @@ unsigned TargetTransformInfo::getRegisterBitWidth(bool Vector) const {
return TTIImpl->getRegisterBitWidth(Vector);
}
+bool TargetTransformInfo::shouldConsiderAddressTypePromotion(
+ const Instruction &I, bool &AllowPromotionWithoutCommonHeader) const {
+ return TTIImpl->shouldConsiderAddressTypePromotion(
+ I, AllowPromotionWithoutCommonHeader);
+}
+
unsigned TargetTransformInfo::getCacheLineSize() const {
return TTIImpl->getCacheLineSize();
}
@@ -293,8 +318,10 @@ int TargetTransformInfo::getShuffleCost(ShuffleKind Kind, Type *Ty, int Index,
}
int TargetTransformInfo::getCastInstrCost(unsigned Opcode, Type *Dst,
- Type *Src) const {
- int Cost = TTIImpl->getCastInstrCost(Opcode, Dst, Src);
+ Type *Src, const Instruction *I) const {
+ assert ((I == nullptr || I->getOpcode() == Opcode) &&
+ "Opcode should reflect passed instruction.");
+ int Cost = TTIImpl->getCastInstrCost(Opcode, Dst, Src, I);
assert(Cost >= 0 && "TTI should not produce negative costs!");
return Cost;
}
@@ -314,8 +341,10 @@ int TargetTransformInfo::getCFInstrCost(unsigned Opcode) const {
}
int TargetTransformInfo::getCmpSelInstrCost(unsigned Opcode, Type *ValTy,
- Type *CondTy) const {
- int Cost = TTIImpl->getCmpSelInstrCost(Opcode, ValTy, CondTy);
+ Type *CondTy, const Instruction *I) const {
+ assert ((I == nullptr || I->getOpcode() == Opcode) &&
+ "Opcode should reflect passed instruction.");
+ int Cost = TTIImpl->getCmpSelInstrCost(Opcode, ValTy, CondTy, I);
assert(Cost >= 0 && "TTI should not produce negative costs!");
return Cost;
}
@@ -329,8 +358,11 @@ int TargetTransformInfo::getVectorInstrCost(unsigned Opcode, Type *Val,
int TargetTransformInfo::getMemoryOpCost(unsigned Opcode, Type *Src,
unsigned Alignment,
- unsigned AddressSpace) const {
- int Cost = TTIImpl->getMemoryOpCost(Opcode, Src, Alignment, AddressSpace);
+ unsigned AddressSpace,
+ const Instruction *I) const {
+ assert ((I == nullptr || I->getOpcode() == Opcode) &&
+ "Opcode should reflect passed instruction.");
+ int Cost = TTIImpl->getMemoryOpCost(Opcode, Src, Alignment, AddressSpace, I);
assert(Cost >= 0 && "TTI should not produce negative costs!");
return Cost;
}
@@ -363,17 +395,17 @@ int TargetTransformInfo::getInterleavedMemoryOpCost(
}
int TargetTransformInfo::getIntrinsicInstrCost(Intrinsic::ID ID, Type *RetTy,
- ArrayRef<Type *> Tys,
- FastMathFlags FMF) const {
- int Cost = TTIImpl->getIntrinsicInstrCost(ID, RetTy, Tys, FMF);
+ ArrayRef<Type *> Tys, FastMathFlags FMF,
+ unsigned ScalarizationCostPassed) const {
+ int Cost = TTIImpl->getIntrinsicInstrCost(ID, RetTy, Tys, FMF,
+ ScalarizationCostPassed);
assert(Cost >= 0 && "TTI should not produce negative costs!");
return Cost;
}
int TargetTransformInfo::getIntrinsicInstrCost(Intrinsic::ID ID, Type *RetTy,
- ArrayRef<Value *> Args,
- FastMathFlags FMF) const {
- int Cost = TTIImpl->getIntrinsicInstrCost(ID, RetTy, Args, FMF);
+ ArrayRef<Value *> Args, FastMathFlags FMF, unsigned VF) const {
+ int Cost = TTIImpl->getIntrinsicInstrCost(ID, RetTy, Args, FMF, VF);
assert(Cost >= 0 && "TTI should not produce negative costs!");
return Cost;
}
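A sketch of the extended cost queries, assuming a TargetTransformInfo and a CastInst obtained elsewhere; the trailing instruction argument is optional, and passing nullptr keeps the old behavior:

#include "llvm/Analysis/TargetTransformInfo.h"
#include "llvm/IR/Instructions.h"
using namespace llvm;

// The concrete instruction lets targets inspect its context; the assert in
// getCastInstrCost checks that the opcode matches the instruction.
static int costOfCast(const TargetTransformInfo &TTI, const CastInst *Cast) {
  return TTI.getCastInstrCost(Cast->getOpcode(), Cast->getDestTy(),
                              Cast->getSrcTy(), Cast);
}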
diff --git a/lib/Analysis/TypeMetadataUtils.cpp b/lib/Analysis/TypeMetadataUtils.cpp
index f56754167360..6871e4887c9e 100644
--- a/lib/Analysis/TypeMetadataUtils.cpp
+++ b/lib/Analysis/TypeMetadataUtils.cpp
@@ -39,7 +39,7 @@ findCallsAtConstantOffset(SmallVectorImpl<DevirtCallSite> &DevirtCalls,
// Search for virtual calls that load from VPtr and add them to DevirtCalls.
static void
-findLoadCallsAtConstantOffset(Module *M,
+findLoadCallsAtConstantOffset(const Module *M,
SmallVectorImpl<DevirtCallSite> &DevirtCalls,
Value *VPtr, int64_t Offset) {
for (const Use &U : VPtr->uses()) {
@@ -62,10 +62,10 @@ findLoadCallsAtConstantOffset(Module *M,
void llvm::findDevirtualizableCallsForTypeTest(
SmallVectorImpl<DevirtCallSite> &DevirtCalls,
- SmallVectorImpl<CallInst *> &Assumes, CallInst *CI) {
+ SmallVectorImpl<CallInst *> &Assumes, const CallInst *CI) {
assert(CI->getCalledFunction()->getIntrinsicID() == Intrinsic::type_test);
- Module *M = CI->getParent()->getParent()->getParent();
+ const Module *M = CI->getParent()->getParent()->getParent();
// Find llvm.assume intrinsics for this llvm.type.test call.
for (const Use &CIU : CI->uses()) {
@@ -86,7 +86,8 @@ void llvm::findDevirtualizableCallsForTypeTest(
void llvm::findDevirtualizableCallsForTypeCheckedLoad(
SmallVectorImpl<DevirtCallSite> &DevirtCalls,
SmallVectorImpl<Instruction *> &LoadedPtrs,
- SmallVectorImpl<Instruction *> &Preds, bool &HasNonCallUses, CallInst *CI) {
+ SmallVectorImpl<Instruction *> &Preds, bool &HasNonCallUses,
+ const CallInst *CI) {
assert(CI->getCalledFunction()->getIntrinsicID() ==
Intrinsic::type_checked_load);
@@ -96,7 +97,7 @@ void llvm::findDevirtualizableCallsForTypeCheckedLoad(
return;
}
- for (Use &U : CI->uses()) {
+ for (const Use &U : CI->uses()) {
auto CIU = U.getUser();
if (auto EVI = dyn_cast<ExtractValueInst>(CIU)) {
if (EVI->getNumIndices() == 1 && EVI->getIndices()[0] == 0) {
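With the entry points now taking const pointers, clients holding only const IR can call them without a const_cast; a hedged sketch, assuming CI is a call to llvm.type.test:

#include "llvm/ADT/SmallVector.h"
#include "llvm/Analysis/TypeMetadataUtils.h"
#include "llvm/IR/Instructions.h"
using namespace llvm;

// Collect devirtualizable call sites and the llvm.assume users of a
// const llvm.type.test call.
static void collect(const CallInst *CI) {
  SmallVector<DevirtCallSite, 4> DevirtCalls;
  SmallVector<CallInst *, 4> Assumes;
  findDevirtualizableCallsForTypeTest(DevirtCalls, Assumes, CI);
}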
diff --git a/lib/Analysis/ValueTracking.cpp b/lib/Analysis/ValueTracking.cpp
index b79370baad10..d871e83f222a 100644
--- a/lib/Analysis/ValueTracking.cpp
+++ b/lib/Analysis/ValueTracking.cpp
@@ -20,6 +20,7 @@
#include "llvm/Analysis/MemoryBuiltins.h"
#include "llvm/Analysis/Loads.h"
#include "llvm/Analysis/LoopInfo.h"
+#include "llvm/Analysis/OptimizationDiagnosticInfo.h"
#include "llvm/Analysis/VectorUtils.h"
#include "llvm/IR/CallSite.h"
#include "llvm/IR/ConstantRange.h"
@@ -76,6 +77,9 @@ struct Query {
AssumptionCache *AC;
const Instruction *CxtI;
const DominatorTree *DT;
+ // Unlike the other analyses, this may be a nullptr because not all clients
+ // provide it currently.
+ OptimizationRemarkEmitter *ORE;
/// Set of assumptions that should be excluded from further queries.
/// This is because of the potential for mutual recursion to cause
@@ -90,11 +94,12 @@ struct Query {
unsigned NumExcluded;
Query(const DataLayout &DL, AssumptionCache *AC, const Instruction *CxtI,
- const DominatorTree *DT)
- : DL(DL), AC(AC), CxtI(CxtI), DT(DT), NumExcluded(0) {}
+ const DominatorTree *DT, OptimizationRemarkEmitter *ORE = nullptr)
+ : DL(DL), AC(AC), CxtI(CxtI), DT(DT), ORE(ORE), NumExcluded(0) {}
Query(const Query &Q, const Value *NewExcl)
- : DL(Q.DL), AC(Q.AC), CxtI(Q.CxtI), DT(Q.DT), NumExcluded(Q.NumExcluded) {
+ : DL(Q.DL), AC(Q.AC), CxtI(Q.CxtI), DT(Q.DT), ORE(Q.ORE),
+ NumExcluded(Q.NumExcluded) {
Excluded = Q.Excluded;
Excluded[NumExcluded++] = NewExcl;
assert(NumExcluded <= Excluded.size());
@@ -131,9 +136,10 @@ static void computeKnownBits(const Value *V, APInt &KnownZero, APInt &KnownOne,
void llvm::computeKnownBits(const Value *V, APInt &KnownZero, APInt &KnownOne,
const DataLayout &DL, unsigned Depth,
AssumptionCache *AC, const Instruction *CxtI,
- const DominatorTree *DT) {
+ const DominatorTree *DT,
+ OptimizationRemarkEmitter *ORE) {
::computeKnownBits(V, KnownZero, KnownOne, Depth,
- Query(DL, AC, safeCxtI(V, CxtI), DT));
+ Query(DL, AC, safeCxtI(V, CxtI), DT, ORE));
}
bool llvm::haveNoCommonBitsSet(const Value *LHS, const Value *RHS,
@@ -249,30 +255,6 @@ static void computeKnownBitsAddSub(bool Add, const Value *Op0, const Value *Op1,
APInt &KnownZero, APInt &KnownOne,
APInt &KnownZero2, APInt &KnownOne2,
unsigned Depth, const Query &Q) {
- if (!Add) {
- if (const ConstantInt *CLHS = dyn_cast<ConstantInt>(Op0)) {
- // We know that the top bits of C-X are clear if X contains less bits
- // than C (i.e. no wrap-around can happen). For example, 20-X is
- // positive if we can prove that X is >= 0 and < 16.
- if (!CLHS->getValue().isNegative()) {
- unsigned BitWidth = KnownZero.getBitWidth();
- unsigned NLZ = (CLHS->getValue()+1).countLeadingZeros();
- // NLZ can't be BitWidth with no sign bit
- APInt MaskV = APInt::getHighBitsSet(BitWidth, NLZ+1);
- computeKnownBits(Op1, KnownZero2, KnownOne2, Depth + 1, Q);
-
- // If all of the MaskV bits are known to be zero, then we know the
- // output top bits are zero, because we now know that the output is
- // from [0-C].
- if ((KnownZero2 & MaskV) == MaskV) {
- unsigned NLZ2 = CLHS->getValue().countLeadingZeros();
- // Top bits known zero.
- KnownZero = APInt::getHighBitsSet(BitWidth, NLZ2);
- }
- }
- }
- }
-
unsigned BitWidth = KnownZero.getBitWidth();
// If an initial sequence of bits in the result is not needed, the
@@ -282,11 +264,11 @@ static void computeKnownBitsAddSub(bool Add, const Value *Op0, const Value *Op1,
computeKnownBits(Op1, KnownZero2, KnownOne2, Depth + 1, Q);
// Carry in a 1 for a subtract, rather than a 0.
- APInt CarryIn(BitWidth, 0);
+ uint64_t CarryIn = 0;
if (!Add) {
// Sum = LHS + ~RHS + 1
std::swap(KnownZero2, KnownOne2);
- CarryIn.setBit(0);
+ CarryIn = 1;
}
APInt PossibleSumZero = ~LHSKnownZero + ~KnownZero2 + CarryIn;
@@ -315,11 +297,11 @@ static void computeKnownBitsAddSub(bool Add, const Value *Op0, const Value *Op1,
// Adding two non-negative numbers, or subtracting a negative number from
// a non-negative one, can't wrap into negative.
if (LHSKnownZero.isNegative() && KnownZero2.isNegative())
- KnownZero |= APInt::getSignBit(BitWidth);
+ KnownZero.setSignBit();
// Adding two negative numbers, or subtracting a non-negative number from
// a negative one, can't wrap into non-negative.
else if (LHSKnownOne.isNegative() && KnownOne2.isNegative())
- KnownOne |= APInt::getSignBit(BitWidth);
+ KnownOne.setSignBit();
}
}
}
@@ -370,8 +352,9 @@ static void computeKnownBitsMul(const Value *Op0, const Value *Op1, bool NSW,
TrailZ = std::min(TrailZ, BitWidth);
LeadZ = std::min(LeadZ, BitWidth);
- KnownZero = APInt::getLowBitsSet(BitWidth, TrailZ) |
- APInt::getHighBitsSet(BitWidth, LeadZ);
+ KnownZero.clearAllBits();
+ KnownZero.setLowBits(TrailZ);
+ KnownZero.setHighBits(LeadZ);
// Only make use of no-wrap flags if we failed to compute the sign bit
// directly. This matters if the multiplication always overflows, in
@@ -379,9 +362,9 @@ static void computeKnownBitsMul(const Value *Op0, const Value *Op1, bool NSW,
// though as the program is invoking undefined behaviour we can choose
// whatever we like here.
if (isKnownNonNegative && !KnownOne.isNegative())
- KnownZero.setBit(BitWidth - 1);
+ KnownZero.setSignBit();
else if (isKnownNegative && !KnownZero.isNegative())
- KnownOne.setBit(BitWidth - 1);
+ KnownOne.setSignBit();
}
void llvm::computeKnownBitsFromRangeMetadata(const MDNode &Ranges,
@@ -553,6 +536,13 @@ static void computeKnownBitsFromAssume(const Value *V, APInt &KnownZero,
KnownOne.setAllBits();
return;
}
+ if (match(Arg, m_Not(m_Specific(V))) &&
+ isValidAssumeForContext(I, Q.CxtI, Q.DT)) {
+ assert(BitWidth == 1 && "assume operand is not i1?");
+ KnownZero.setAllBits();
+ KnownOne.clearAllBits();
+ return;
+ }
// The remaining tests are all recursive, so bail out if we hit the limit.
if (Depth == MaxDepth)
@@ -719,7 +709,7 @@ static void computeKnownBitsFromAssume(const Value *V, APInt &KnownZero,
if (RHSKnownZero.isNegative()) {
// We know that the sign bit is zero.
- KnownZero |= APInt::getSignBit(BitWidth);
+ KnownZero.setSignBit();
}
// assume(v >_s c) where c is at least -1.
} else if (match(Arg, m_ICmp(Pred, m_V, m_Value(A))) &&
@@ -730,7 +720,7 @@ static void computeKnownBitsFromAssume(const Value *V, APInt &KnownZero,
if (RHSKnownOne.isAllOnesValue() || RHSKnownZero.isNegative()) {
// We know that the sign bit is zero.
- KnownZero |= APInt::getSignBit(BitWidth);
+ KnownZero.setSignBit();
}
// assume(v <=_s c) where c is negative
} else if (match(Arg, m_ICmp(Pred, m_V, m_Value(A))) &&
@@ -741,7 +731,7 @@ static void computeKnownBitsFromAssume(const Value *V, APInt &KnownZero,
if (RHSKnownOne.isNegative()) {
// We know that the sign bit is one.
- KnownOne |= APInt::getSignBit(BitWidth);
+ KnownOne.setSignBit();
}
// assume(v <_s c) where c is non-positive
} else if (match(Arg, m_ICmp(Pred, m_V, m_Value(A))) &&
@@ -752,7 +742,7 @@ static void computeKnownBitsFromAssume(const Value *V, APInt &KnownZero,
if (RHSKnownZero.isAllOnesValue() || RHSKnownOne.isNegative()) {
// We know that the sign bit is one.
- KnownOne |= APInt::getSignBit(BitWidth);
+ KnownOne.setSignBit();
}
// assume(v <=_u c)
} else if (match(Arg, m_ICmp(Pred, m_V, m_Value(A))) &&
@@ -762,8 +752,7 @@ static void computeKnownBitsFromAssume(const Value *V, APInt &KnownZero,
computeKnownBits(A, RHSKnownZero, RHSKnownOne, Depth+1, Query(Q, I));
// Whatever high bits in c are zero are known to be zero.
- KnownZero |=
- APInt::getHighBitsSet(BitWidth, RHSKnownZero.countLeadingOnes());
+ KnownZero.setHighBits(RHSKnownZero.countLeadingOnes());
// assume(v <_u c)
} else if (match(Arg, m_ICmp(Pred, m_V, m_Value(A))) &&
Pred == ICmpInst::ICMP_ULT &&
@@ -774,11 +763,27 @@ static void computeKnownBitsFromAssume(const Value *V, APInt &KnownZero,
// Whatever high bits in c are zero are known to be zero (if c is a power
// of 2, then one more).
if (isKnownToBeAPowerOfTwo(A, false, Depth + 1, Query(Q, I)))
- KnownZero |=
- APInt::getHighBitsSet(BitWidth, RHSKnownZero.countLeadingOnes()+1);
+ KnownZero.setHighBits(RHSKnownZero.countLeadingOnes()+1);
else
- KnownZero |=
- APInt::getHighBitsSet(BitWidth, RHSKnownZero.countLeadingOnes());
+ KnownZero.setHighBits(RHSKnownZero.countLeadingOnes());
+ }
+ }
+
+ // If assumptions conflict with each other or previous known bits, then we
+ // have a logical fallacy. It's possible that the assumption is not reachable,
+ // so this isn't a real bug. On the other hand, the program may have undefined
+ // behavior, or we might have a bug in the compiler. We can't assert/crash, so
+ // clear out the known bits, try to warn the user, and hope for the best.
+ if ((KnownZero & KnownOne) != 0) {
+ KnownZero.clearAllBits();
+ KnownOne.clearAllBits();
+
+ if (Q.ORE) {
+ auto *CxtI = const_cast<Instruction *>(Q.CxtI);
+ OptimizationRemarkAnalysis ORA("value-tracking", "BadAssumption", CxtI);
+ Q.ORE->emit(ORA << "Detected conflicting code assumptions. Program may "
+ "have undefined behavior, or compiler may have "
+ "internal error.");
}
}
}
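A worked example of the conflict test above, with assumed values: assume(x == 1) followed by assume(x == 2) on an i8 accumulates known bits that overlap, which is exactly what (KnownZero & KnownOne) != 0 detects:

#include "llvm/ADT/APInt.h"
using namespace llvm;

static bool hasConflict() {
  APInt KnownZero(8, 0xFE | 0xFD); // zeros implied by x == 1, then x == 2
  APInt KnownOne(8, 0x01 | 0x02);  // ones implied by x == 1, then x == 2
  return (KnownZero & KnownOne) != 0; // true: bits 0 and 1 contradict
}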
@@ -817,6 +822,14 @@ static void computeKnownBitsFromShiftOperator(
computeKnownBits(I->getOperand(1), KnownZero, KnownOne, Depth + 1, Q);
+ // If the shift amount could be greater than or equal to the bit-width of the LHS, the
+ // value could be undef, so we don't know anything about it.
+ if ((~KnownZero).uge(BitWidth)) {
+ KnownZero.clearAllBits();
+ KnownOne.clearAllBits();
+ return;
+ }
+
// Note: We cannot use KnownZero.getLimitedValue() here, because if
// BitWidth > 64 and any upper bits are known, we'll end up returning the
// limit value (which implies all bits are known).
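A numeric illustration of the early-out added above, with assumed known bits: even when the high nibble of an i8 shift amount is known zero, the amount can still be as large as 15, which reaches the bit width, so nothing can be concluded:

#include "llvm/ADT/APInt.h"
using namespace llvm;

static bool mustGiveUp() {
  APInt AmtKnownZero(8, 0xF0);   // only the top four bits known zero
  return (~AmtKnownZero).uge(8); // max possible amount is 0x0F, i.e. 15 >= 8
}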
@@ -905,14 +918,15 @@ static void computeKnownBitsFromOperator(const Operator *I, APInt &KnownZero,
// TODO: This could be generalized to clearing any bit set in y where the
// following bit is known to be unset in y.
Value *Y = nullptr;
- if (match(I->getOperand(0), m_Add(m_Specific(I->getOperand(1)),
- m_Value(Y))) ||
- match(I->getOperand(1), m_Add(m_Specific(I->getOperand(0)),
- m_Value(Y)))) {
- APInt KnownZero3(BitWidth, 0), KnownOne3(BitWidth, 0);
- computeKnownBits(Y, KnownZero3, KnownOne3, Depth + 1, Q);
- if (KnownOne3.countTrailingOnes() > 0)
- KnownZero |= APInt::getLowBitsSet(BitWidth, 1);
+ if (!KnownZero[0] && !KnownOne[0] &&
+ (match(I->getOperand(0), m_Add(m_Specific(I->getOperand(1)),
+ m_Value(Y))) ||
+ match(I->getOperand(1), m_Add(m_Specific(I->getOperand(0)),
+ m_Value(Y))))) {
+ KnownZero2.clearAllBits(); KnownOne2.clearAllBits();
+ computeKnownBits(Y, KnownZero2, KnownOne2, Depth + 1, Q);
+ if (KnownOne2.countTrailingOnes() > 0)
+ KnownZero.setBit(0);
}
break;
}
@@ -934,7 +948,7 @@ static void computeKnownBitsFromOperator(const Operator *I, APInt &KnownZero,
APInt KnownZeroOut = (KnownZero & KnownZero2) | (KnownOne & KnownOne2);
// Output known-1 are known to be set if set in only one of the LHS, RHS.
KnownOne = (KnownZero & KnownOne2) | (KnownOne & KnownZero2);
- KnownZero = KnownZeroOut;
+ KnownZero = std::move(KnownZeroOut);
break;
}
case Instruction::Mul: {
@@ -958,15 +972,11 @@ static void computeKnownBitsFromOperator(const Operator *I, APInt &KnownZero,
LeadZ = std::min(BitWidth,
LeadZ + BitWidth - RHSUnknownLeadingOnes - 1);
- KnownZero = APInt::getHighBitsSet(BitWidth, LeadZ);
+ KnownZero.setHighBits(LeadZ);
break;
}
case Instruction::Select: {
- computeKnownBits(I->getOperand(2), KnownZero, KnownOne, Depth + 1, Q);
- computeKnownBits(I->getOperand(1), KnownZero2, KnownOne2, Depth + 1, Q);
-
- const Value *LHS;
- const Value *RHS;
+ const Value *LHS, *RHS;
SelectPatternFlavor SPF = matchSelectPattern(I, LHS, RHS).Flavor;
if (SelectPatternResult::isMinOrMax(SPF)) {
computeKnownBits(RHS, KnownZero, KnownOne, Depth + 1, Q);
@@ -980,23 +990,23 @@ static void computeKnownBitsFromOperator(const Operator *I, APInt &KnownZero,
unsigned MaxHighZeros = 0;
if (SPF == SPF_SMAX) {
// If both sides are negative, the result is negative.
- if (KnownOne[BitWidth - 1] && KnownOne2[BitWidth - 1])
+ if (KnownOne.isNegative() && KnownOne2.isNegative())
// We can derive a lower bound on the result by taking the max of the
// leading one bits.
MaxHighOnes =
std::max(KnownOne.countLeadingOnes(), KnownOne2.countLeadingOnes());
// If either side is non-negative, the result is non-negative.
- else if (KnownZero[BitWidth - 1] || KnownZero2[BitWidth - 1])
+ else if (KnownZero.isNegative() || KnownZero2.isNegative())
MaxHighZeros = 1;
} else if (SPF == SPF_SMIN) {
// If both sides are non-negative, the result is non-negative.
- if (KnownZero[BitWidth - 1] && KnownZero2[BitWidth - 1])
+ if (KnownZero.isNegative() && KnownZero2.isNegative())
// We can derive an upper bound on the result by taking the max of the
// leading zero bits.
MaxHighZeros = std::max(KnownZero.countLeadingOnes(),
KnownZero2.countLeadingOnes());
// If either side is negative, the result is negative.
- else if (KnownOne[BitWidth - 1] || KnownOne2[BitWidth - 1])
+ else if (KnownOne.isNegative() || KnownOne2.isNegative())
MaxHighOnes = 1;
} else if (SPF == SPF_UMAX) {
// We can derive a lower bound on the result by taking the max of the
@@ -1014,9 +1024,9 @@ static void computeKnownBitsFromOperator(const Operator *I, APInt &KnownZero,
KnownOne &= KnownOne2;
KnownZero &= KnownZero2;
if (MaxHighOnes > 0)
- KnownOne |= APInt::getHighBitsSet(BitWidth, MaxHighOnes);
+ KnownOne.setHighBits(MaxHighOnes);
if (MaxHighZeros > 0)
- KnownZero |= APInt::getHighBitsSet(BitWidth, MaxHighZeros);
+ KnownZero.setHighBits(MaxHighZeros);
break;
}
case Instruction::FPTrunc:
@@ -1047,7 +1057,7 @@ static void computeKnownBitsFromOperator(const Operator *I, APInt &KnownZero,
KnownOne = KnownOne.zextOrTrunc(BitWidth);
// Any top bits are known to be zero.
if (BitWidth > SrcBitWidth)
- KnownZero |= APInt::getHighBitsSet(BitWidth, BitWidth - SrcBitWidth);
+ KnownZero.setBitsFrom(SrcBitWidth);
break;
}
case Instruction::BitCast: {
@@ -1068,35 +1078,29 @@ static void computeKnownBitsFromOperator(const Operator *I, APInt &KnownZero,
KnownZero = KnownZero.trunc(SrcBitWidth);
KnownOne = KnownOne.trunc(SrcBitWidth);
computeKnownBits(I->getOperand(0), KnownZero, KnownOne, Depth + 1, Q);
- KnownZero = KnownZero.zext(BitWidth);
- KnownOne = KnownOne.zext(BitWidth);
-
// If the sign bit of the input is known set or clear, then we know the
// top bits of the result.
- if (KnownZero[SrcBitWidth-1]) // Input sign bit known zero
- KnownZero |= APInt::getHighBitsSet(BitWidth, BitWidth - SrcBitWidth);
- else if (KnownOne[SrcBitWidth-1]) // Input sign bit known set
- KnownOne |= APInt::getHighBitsSet(BitWidth, BitWidth - SrcBitWidth);
+ KnownZero = KnownZero.sext(BitWidth);
+ KnownOne = KnownOne.sext(BitWidth);
break;
}
case Instruction::Shl: {
// (shl X, C1) & C2 == 0 iff (X & C2 >>u C1) == 0
bool NSW = cast<OverflowingBinaryOperator>(I)->hasNoSignedWrap();
- auto KZF = [BitWidth, NSW](const APInt &KnownZero, unsigned ShiftAmt) {
- APInt KZResult =
- (KnownZero << ShiftAmt) |
- APInt::getLowBitsSet(BitWidth, ShiftAmt); // Low bits known 0.
+ auto KZF = [NSW](const APInt &KnownZero, unsigned ShiftAmt) {
+ APInt KZResult = KnownZero << ShiftAmt;
+ KZResult.setLowBits(ShiftAmt); // Low bits known 0.
// If this shift has "nsw" keyword, then the result is either a poison
// value or has the same sign bit as the first operand.
if (NSW && KnownZero.isNegative())
- KZResult.setBit(BitWidth - 1);
+ KZResult.setSignBit();
return KZResult;
};
- auto KOF = [BitWidth, NSW](const APInt &KnownOne, unsigned ShiftAmt) {
+ auto KOF = [NSW](const APInt &KnownOne, unsigned ShiftAmt) {
APInt KOResult = KnownOne << ShiftAmt;
if (NSW && KnownOne.isNegative())
- KOResult.setBit(BitWidth - 1);
+ KOResult.setSignBit();
return KOResult;
};
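Why the explicit high-bit fixup in the SExt case above could be dropped: APInt::sext on the known-bit masks replicates a known sign bit by itself. A standalone sketch with assumed values:

#include "llvm/ADT/APInt.h"
using namespace llvm;

static APInt sextKnownOnes() {
  APInt KnownOne(8, 0x80);  // input sign bit known one
  return KnownOne.sext(32); // 0xFFFFFF80: bits 8..31 become known one
}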
@@ -1108,13 +1112,13 @@ static void computeKnownBitsFromOperator(const Operator *I, APInt &KnownZero,
case Instruction::LShr: {
// (ushr X, C1) & C2 == 0 iff (-1 >> C1) & C2 == 0
auto KZF = [BitWidth](const APInt &KnownZero, unsigned ShiftAmt) {
- return APIntOps::lshr(KnownZero, ShiftAmt) |
+ return KnownZero.lshr(ShiftAmt) |
// High bits known zero.
APInt::getHighBitsSet(BitWidth, ShiftAmt);
};
- auto KOF = [BitWidth](const APInt &KnownOne, unsigned ShiftAmt) {
- return APIntOps::lshr(KnownOne, ShiftAmt);
+ auto KOF = [](const APInt &KnownOne, unsigned ShiftAmt) {
+ return KnownOne.lshr(ShiftAmt);
};
computeKnownBitsFromShiftOperator(I, KnownZero, KnownOne,
@@ -1124,12 +1128,12 @@ static void computeKnownBitsFromOperator(const Operator *I, APInt &KnownZero,
}
case Instruction::AShr: {
// (ashr X, C1) & C2 == 0 iff (-1 >> C1) & C2 == 0
- auto KZF = [BitWidth](const APInt &KnownZero, unsigned ShiftAmt) {
- return APIntOps::ashr(KnownZero, ShiftAmt);
+ auto KZF = [](const APInt &KnownZero, unsigned ShiftAmt) {
+ return KnownZero.ashr(ShiftAmt);
};
- auto KOF = [BitWidth](const APInt &KnownOne, unsigned ShiftAmt) {
- return APIntOps::ashr(KnownOne, ShiftAmt);
+ auto KOF = [](const APInt &KnownOne, unsigned ShiftAmt) {
+ return KnownOne.ashr(ShiftAmt);
};
computeKnownBitsFromShiftOperator(I, KnownZero, KnownOne,
@@ -1165,12 +1169,12 @@ static void computeKnownBitsFromOperator(const Operator *I, APInt &KnownZero,
// If the first operand is non-negative or has all low bits zero, then
// the upper bits are all zero.
- if (KnownZero2[BitWidth-1] || ((KnownZero2 & LowBits) == LowBits))
+ if (KnownZero2.isNegative() || ((KnownZero2 & LowBits) == LowBits))
KnownZero |= ~LowBits;
// If the first operand is negative and not all low bits are zero, then
// the upper bits are all one.
- if (KnownOne2[BitWidth-1] && ((KnownOne2 & LowBits) != 0))
+ if (KnownOne2.isNegative() && ((KnownOne2 & LowBits) != 0))
KnownOne |= ~LowBits;
assert((KnownZero & KnownOne) == 0 && "Bits known to be one AND zero?");
@@ -1185,7 +1189,7 @@ static void computeKnownBitsFromOperator(const Operator *I, APInt &KnownZero,
Q);
// If it's known zero, our sign bit is also zero.
if (LHSKnownZero.isNegative())
- KnownZero.setBit(BitWidth - 1);
+ KnownZero.setSignBit();
}
break;
@@ -1209,7 +1213,8 @@ static void computeKnownBitsFromOperator(const Operator *I, APInt &KnownZero,
unsigned Leaders = std::max(KnownZero.countLeadingOnes(),
KnownZero2.countLeadingOnes());
KnownOne.clearAllBits();
- KnownZero = APInt::getHighBitsSet(BitWidth, Leaders);
+ KnownZero.clearAllBits();
+ KnownZero.setHighBits(Leaders);
break;
}
@@ -1220,7 +1225,7 @@ static void computeKnownBitsFromOperator(const Operator *I, APInt &KnownZero,
Align = Q.DL.getABITypeAlignment(AI->getAllocatedType());
if (Align > 0)
- KnownZero = APInt::getLowBitsSet(BitWidth, countTrailingZeros(Align));
+ KnownZero.setLowBits(countTrailingZeros(Align));
break;
}
case Instruction::GetElementPtr: {
@@ -1267,7 +1272,7 @@ static void computeKnownBitsFromOperator(const Operator *I, APInt &KnownZero,
}
}
- KnownZero = APInt::getLowBitsSet(BitWidth, TrailZ);
+ KnownZero.setLowBits(TrailZ);
break;
}
case Instruction::PHI: {
@@ -1308,9 +1313,8 @@ static void computeKnownBitsFromOperator(const Operator *I, APInt &KnownZero,
APInt KnownZero3(KnownZero), KnownOne3(KnownOne);
computeKnownBits(L, KnownZero3, KnownOne3, Depth + 1, Q);
- KnownZero = APInt::getLowBitsSet(
- BitWidth, std::min(KnownZero2.countTrailingOnes(),
- KnownZero3.countTrailingOnes()));
+ KnownZero.setLowBits(std::min(KnownZero2.countTrailingOnes(),
+ KnownZero3.countTrailingOnes()));
if (DontImproveNonNegativePhiBits)
break;
@@ -1328,24 +1332,24 @@ static void computeKnownBitsFromOperator(const Operator *I, APInt &KnownZero,
// (add negative, negative) --> negative
if (Opcode == Instruction::Add) {
if (KnownZero2.isNegative() && KnownZero3.isNegative())
- KnownZero.setBit(BitWidth - 1);
+ KnownZero.setSignBit();
else if (KnownOne2.isNegative() && KnownOne3.isNegative())
- KnownOne.setBit(BitWidth - 1);
+ KnownOne.setSignBit();
}
// (sub nsw non-negative, negative) --> non-negative
// (sub nsw negative, non-negative) --> negative
else if (Opcode == Instruction::Sub && LL == I) {
if (KnownZero2.isNegative() && KnownOne3.isNegative())
- KnownZero.setBit(BitWidth - 1);
+ KnownZero.setSignBit();
else if (KnownOne2.isNegative() && KnownZero3.isNegative())
- KnownOne.setBit(BitWidth - 1);
+ KnownOne.setSignBit();
}
// (mul nsw non-negative, non-negative) --> non-negative
else if (Opcode == Instruction::Mul && KnownZero2.isNegative() &&
KnownZero3.isNegative())
- KnownZero.setBit(BitWidth - 1);
+ KnownZero.setSignBit();
}
break;
@@ -1364,8 +1368,8 @@ static void computeKnownBitsFromOperator(const Operator *I, APInt &KnownZero,
if (dyn_cast_or_null<UndefValue>(P->hasConstantValue()))
break;
- KnownZero = APInt::getAllOnesValue(BitWidth);
- KnownOne = APInt::getAllOnesValue(BitWidth);
+ KnownZero.setAllBits();
+ KnownOne.setAllBits();
for (Value *IncValue : P->incoming_values()) {
// Skip direct self references.
if (IncValue == P) continue;
@@ -1400,6 +1404,11 @@ static void computeKnownBitsFromOperator(const Operator *I, APInt &KnownZero,
if (const IntrinsicInst *II = dyn_cast<IntrinsicInst>(I)) {
switch (II->getIntrinsicID()) {
default: break;
+ case Intrinsic::bitreverse:
+ computeKnownBits(I->getOperand(0), KnownZero2, KnownOne2, Depth + 1, Q);
+ KnownZero |= KnownZero2.reverseBits();
+ KnownOne |= KnownOne2.reverseBits();
+ break;
case Intrinsic::bswap:
computeKnownBits(I->getOperand(0), KnownZero2, KnownOne2, Depth + 1, Q);
KnownZero |= KnownZero2.byteSwap();
@@ -1411,7 +1420,7 @@ static void computeKnownBitsFromOperator(const Operator *I, APInt &KnownZero,
// If this call is undefined for 0, the result will be less than 2^n.
if (II->getArgOperand(1) == ConstantInt::getTrue(II->getContext()))
LowBits -= 1;
- KnownZero |= APInt::getHighBitsSet(BitWidth, BitWidth - LowBits);
+ KnownZero.setBitsFrom(LowBits);
break;
}
case Intrinsic::ctpop: {
@@ -1419,17 +1428,14 @@ static void computeKnownBitsFromOperator(const Operator *I, APInt &KnownZero,
// We can bound the space the count needs. Also, bits known to be zero
// can't contribute to the population.
unsigned BitsPossiblySet = BitWidth - KnownZero2.countPopulation();
- unsigned LeadingZeros =
- APInt(BitWidth, BitsPossiblySet).countLeadingZeros();
- assert(LeadingZeros <= BitWidth);
- KnownZero |= APInt::getHighBitsSet(BitWidth, LeadingZeros);
- KnownOne &= ~KnownZero;
+ unsigned LowBits = Log2_32(BitsPossiblySet)+1;
+ KnownZero.setBitsFrom(LowBits);
// TODO: we could bound KnownOne using the lower bound on the number
// of bits which might be set provided by popcnt KnownOne2.
break;
}
case Intrinsic::x86_sse42_crc32_64_64:
- KnownZero |= APInt::getHighBitsSet(64, 32);
+ KnownZero.setBitsFrom(32);
break;
}
}
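The tighter ctpop bound in plain arithmetic, with assumed inputs: if 24 of 32 input bits are known zero, at most 8 bits can be set, the count fits in Log2_32(8) + 1 = 4 bits, and setBitsFrom(4) marks everything above as known zero:

#include "llvm/Support/MathExtras.h"

static unsigned ctpopLowBits() {
  unsigned BitsPossiblySet = 32 - 24;        // at most 8 bits can be set
  return llvm::Log2_32(BitsPossiblySet) + 1; // = 4, so bits [4, 32) are zero
}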
@@ -1502,6 +1508,7 @@ void computeKnownBits(const Value *V, APInt &KnownZero, APInt &KnownOne,
KnownZero.getBitWidth() == BitWidth &&
KnownOne.getBitWidth() == BitWidth &&
"V, KnownOne and KnownZero should have same BitWidth");
+ (void)BitWidth;
const APInt *C;
if (match(V, m_APInt(C))) {
@@ -1513,7 +1520,7 @@ void computeKnownBits(const Value *V, APInt &KnownZero, APInt &KnownOne,
// Null and aggregate-zero are all-zeros.
if (isa<ConstantPointerNull>(V) || isa<ConstantAggregateZero>(V)) {
KnownOne.clearAllBits();
- KnownZero = APInt::getAllOnesValue(BitWidth);
+ KnownZero.setAllBits();
return;
}
// Handle a constant vector by taking the intersection of the known bits of
@@ -1582,7 +1589,7 @@ void computeKnownBits(const Value *V, APInt &KnownZero, APInt &KnownOne,
if (V->getType()->isPointerTy()) {
unsigned Align = V->getPointerAlignment(Q.DL);
if (Align)
- KnownZero |= APInt::getLowBitsSet(BitWidth, countTrailingZeros(Align));
+ KnownZero.setLowBits(countTrailingZeros(Align));
}
// computeKnownBitsFromAssume strictly refines KnownZero and
@@ -1607,8 +1614,8 @@ void ComputeSignBit(const Value *V, bool &KnownZero, bool &KnownOne,
APInt ZeroBits(BitWidth, 0);
APInt OneBits(BitWidth, 0);
computeKnownBits(V, ZeroBits, OneBits, Depth, Q);
- KnownOne = OneBits[BitWidth - 1];
- KnownZero = ZeroBits[BitWidth - 1];
+ KnownOne = OneBits.isNegative();
+ KnownZero = ZeroBits.isNegative();
}
/// Return true if the given value is known to have exactly one
@@ -1788,10 +1795,12 @@ static bool rangeMetadataExcludesValue(const MDNode* Ranges, const APInt& Value)
return true;
}
-/// Return true if the given value is known to be non-zero when defined.
-/// For vectors return true if every element is known to be non-zero when
-/// defined. Supports values with integer or pointer type and vectors of
-/// integers.
+/// Return true if the given value is known to be non-zero when defined. For
+/// vectors, return true if every element is known to be non-zero when
+/// defined. For pointers, if the context instruction and dominator tree are
+/// specified, perform context-sensitive analysis and return true if the
+/// pointer couldn't possibly be null at the specified instruction.
+/// Supports values with integer or pointer type and vectors of integers.
bool isKnownNonZero(const Value *V, unsigned Depth, const Query &Q) {
if (auto *C = dyn_cast<Constant>(V)) {
if (C->isNullValue())
@@ -1834,7 +1843,7 @@ bool isKnownNonZero(const Value *V, unsigned Depth, const Query &Q) {
// Check for pointer simplifications.
if (V->getType()->isPointerTy()) {
- if (isKnownNonNull(V))
+ if (isKnownNonNullAt(V, Q.CxtI, Q.DT))
return true;
if (const GEPOperator *GEP = dyn_cast<GEPOperator>(V))
if (isGEPKnownNonNull(GEP, Depth, Q))
@@ -2075,13 +2084,29 @@ static unsigned computeNumSignBitsVectorConstant(const Value *V,
return MinSignBits;
}
+static unsigned ComputeNumSignBitsImpl(const Value *V, unsigned Depth,
+ const Query &Q);
+
+static unsigned ComputeNumSignBits(const Value *V, unsigned Depth,
+ const Query &Q) {
+ unsigned Result = ComputeNumSignBitsImpl(V, Depth, Q);
+ assert(Result > 0 && "At least one sign bit needs to be present!");
+ return Result;
+}
+
/// Return the number of times the sign bit of the register is replicated into
/// the other bits. We know that at least 1 bit is always equal to the sign bit
/// (itself), but other cases can give us information. For example, immediately
/// after an "ashr X, 2", we know that the top 3 bits are all equal to each
/// other, so we return 3. For vectors, return the number of sign bits for the
/// vector element with the minimum number of known sign bits.
-unsigned ComputeNumSignBits(const Value *V, unsigned Depth, const Query &Q) {
+static unsigned ComputeNumSignBitsImpl(const Value *V, unsigned Depth,
+ const Query &Q) {
+
+ // We return the minimum number of sign bits that are guaranteed to be present
+ // in V, so for undef we have to conservatively return 1. We don't have the
+ // same behavior for poison though -- that's a FIXME today.
+
unsigned TyBits = Q.DL.getTypeSizeInBits(V->getType()->getScalarType());
unsigned Tmp, Tmp2;
unsigned FirstAnswer = 1;
@@ -2157,7 +2182,10 @@ unsigned ComputeNumSignBits(const Value *V, unsigned Depth, const Query &Q) {
// ashr X, C -> adds C sign bits. Vectors too.
const APInt *ShAmt;
if (match(U->getOperand(1), m_APInt(ShAmt))) {
- Tmp += ShAmt->getZExtValue();
+ unsigned ShAmtLimited = ShAmt->getZExtValue();
+ if (ShAmtLimited >= TyBits)
+ break; // Bad shift.
+ Tmp += ShAmtLimited;
if (Tmp > TyBits) Tmp = TyBits;
}
return Tmp;
@@ -2436,7 +2464,7 @@ Intrinsic::ID llvm::getIntrinsicForCallSite(ImmutableCallSite ICS,
if (!TLI)
return Intrinsic::not_intrinsic;
- LibFunc::Func Func;
+ LibFunc Func;
// We're going to make assumptions on the semantics of the functions, check
// that the target knows that it's available in this environment and it does
// not have local linkage.
@@ -2451,81 +2479,81 @@ Intrinsic::ID llvm::getIntrinsicForCallSite(ImmutableCallSite ICS,
switch (Func) {
default:
break;
- case LibFunc::sin:
- case LibFunc::sinf:
- case LibFunc::sinl:
+ case LibFunc_sin:
+ case LibFunc_sinf:
+ case LibFunc_sinl:
return Intrinsic::sin;
- case LibFunc::cos:
- case LibFunc::cosf:
- case LibFunc::cosl:
+ case LibFunc_cos:
+ case LibFunc_cosf:
+ case LibFunc_cosl:
return Intrinsic::cos;
- case LibFunc::exp:
- case LibFunc::expf:
- case LibFunc::expl:
+ case LibFunc_exp:
+ case LibFunc_expf:
+ case LibFunc_expl:
return Intrinsic::exp;
- case LibFunc::exp2:
- case LibFunc::exp2f:
- case LibFunc::exp2l:
+ case LibFunc_exp2:
+ case LibFunc_exp2f:
+ case LibFunc_exp2l:
return Intrinsic::exp2;
- case LibFunc::log:
- case LibFunc::logf:
- case LibFunc::logl:
+ case LibFunc_log:
+ case LibFunc_logf:
+ case LibFunc_logl:
return Intrinsic::log;
- case LibFunc::log10:
- case LibFunc::log10f:
- case LibFunc::log10l:
+ case LibFunc_log10:
+ case LibFunc_log10f:
+ case LibFunc_log10l:
return Intrinsic::log10;
- case LibFunc::log2:
- case LibFunc::log2f:
- case LibFunc::log2l:
+ case LibFunc_log2:
+ case LibFunc_log2f:
+ case LibFunc_log2l:
return Intrinsic::log2;
- case LibFunc::fabs:
- case LibFunc::fabsf:
- case LibFunc::fabsl:
+ case LibFunc_fabs:
+ case LibFunc_fabsf:
+ case LibFunc_fabsl:
return Intrinsic::fabs;
- case LibFunc::fmin:
- case LibFunc::fminf:
- case LibFunc::fminl:
+ case LibFunc_fmin:
+ case LibFunc_fminf:
+ case LibFunc_fminl:
return Intrinsic::minnum;
- case LibFunc::fmax:
- case LibFunc::fmaxf:
- case LibFunc::fmaxl:
+ case LibFunc_fmax:
+ case LibFunc_fmaxf:
+ case LibFunc_fmaxl:
return Intrinsic::maxnum;
- case LibFunc::copysign:
- case LibFunc::copysignf:
- case LibFunc::copysignl:
+ case LibFunc_copysign:
+ case LibFunc_copysignf:
+ case LibFunc_copysignl:
return Intrinsic::copysign;
- case LibFunc::floor:
- case LibFunc::floorf:
- case LibFunc::floorl:
+ case LibFunc_floor:
+ case LibFunc_floorf:
+ case LibFunc_floorl:
return Intrinsic::floor;
- case LibFunc::ceil:
- case LibFunc::ceilf:
- case LibFunc::ceill:
+ case LibFunc_ceil:
+ case LibFunc_ceilf:
+ case LibFunc_ceill:
return Intrinsic::ceil;
- case LibFunc::trunc:
- case LibFunc::truncf:
- case LibFunc::truncl:
+ case LibFunc_trunc:
+ case LibFunc_truncf:
+ case LibFunc_truncl:
return Intrinsic::trunc;
- case LibFunc::rint:
- case LibFunc::rintf:
- case LibFunc::rintl:
+ case LibFunc_rint:
+ case LibFunc_rintf:
+ case LibFunc_rintl:
return Intrinsic::rint;
- case LibFunc::nearbyint:
- case LibFunc::nearbyintf:
- case LibFunc::nearbyintl:
+ case LibFunc_nearbyint:
+ case LibFunc_nearbyintf:
+ case LibFunc_nearbyintl:
return Intrinsic::nearbyint;
- case LibFunc::round:
- case LibFunc::roundf:
- case LibFunc::roundl:
+ case LibFunc_round:
+ case LibFunc_roundf:
+ case LibFunc_roundl:
return Intrinsic::round;
- case LibFunc::pow:
- case LibFunc::powf:
- case LibFunc::powl:
+ case LibFunc_pow:
+ case LibFunc_powf:
+ case LibFunc_powl:
return Intrinsic::pow;
- case LibFunc::sqrt:
- case LibFunc::sqrtf:
- case LibFunc::sqrtl:
+ case LibFunc_sqrt:
+ case LibFunc_sqrtf:
+ case LibFunc_sqrtl:
if (ICS->hasNoNaNs())
return Intrinsic::sqrt;
return Intrinsic::not_intrinsic;
@@ -2590,6 +2618,11 @@ static bool cannotBeOrderedLessThanZeroImpl(const Value *V,
const TargetLibraryInfo *TLI,
bool SignBitOnly,
unsigned Depth) {
+ // TODO: This function does not do the right thing when SignBitOnly is true
+ // and we're lowering to a hypothetical IEEE 754-compliant-but-evil platform
+ // which flips the sign bits of NaNs. See
+ // https://llvm.org/bugs/show_bug.cgi?id=31702.
+
if (const ConstantFP *CFP = dyn_cast<ConstantFP>(V)) {
return !CFP->getValueAPF().isNegative() ||
(!SignBitOnly && CFP->getValueAPF().isZero());
@@ -2633,7 +2666,8 @@ static bool cannotBeOrderedLessThanZeroImpl(const Value *V,
return cannotBeOrderedLessThanZeroImpl(I->getOperand(0), TLI, SignBitOnly,
Depth + 1);
case Instruction::Call:
- Intrinsic::ID IID = getIntrinsicForCallSite(cast<CallInst>(I), TLI);
+ const auto *CI = cast<CallInst>(I);
+ Intrinsic::ID IID = getIntrinsicForCallSite(CI, TLI);
switch (IID) {
default:
break;
@@ -2650,16 +2684,37 @@ static bool cannotBeOrderedLessThanZeroImpl(const Value *V,
case Intrinsic::exp:
case Intrinsic::exp2:
case Intrinsic::fabs:
- case Intrinsic::sqrt:
return true;
+
+ case Intrinsic::sqrt:
+ // sqrt(x) is always >= -0 or NaN. Moreover, sqrt(x) == -0 iff x == -0.
+ if (!SignBitOnly)
+ return true;
+ return CI->hasNoNaNs() && (CI->hasNoSignedZeros() ||
+ CannotBeNegativeZero(CI->getOperand(0), TLI));
+
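The sqrt edge case described above can be checked with plain libm (standard C++ only, independent of this patch):

#include <cmath>
#include <cstdio>

int main() {
  double R = std::sqrt(-0.0);           // IEEE 754: sqrt(-0) is -0
  std::printf("%d\n", std::signbit(R)); // prints 1: sign bit set, yet not < 0
}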
case Intrinsic::powi:
- if (ConstantInt *CI = dyn_cast<ConstantInt>(I->getOperand(1))) {
+ if (ConstantInt *Exponent = dyn_cast<ConstantInt>(I->getOperand(1))) {
// powi(x,n) is non-negative if n is even.
- if (CI->getBitWidth() <= 64 && CI->getSExtValue() % 2u == 0)
+ if (Exponent->getBitWidth() <= 64 && Exponent->getSExtValue() % 2u == 0)
return true;
}
+ // TODO: This is not correct. Given that exp is an integer, here are the
+ // ways that pow can return a negative value:
+ //
+ // pow(x, exp) --> negative if exp is odd and x is negative.
+ // pow(-0, exp) --> -inf if exp is negative odd.
+ // pow(-0, exp) --> -0 if exp is positive odd.
+ // pow(-inf, exp) --> -0 if exp is negative odd.
+ // pow(-inf, exp) --> -inf if exp is positive odd.
+ //
+ // Therefore, if !SignBitOnly, we can return true if x >= +0 or x is NaN,
+ // but we must return false if x == -0. Unfortunately we do not currently
+ // have a way of expressing this constraint. See details in
+ // https://llvm.org/bugs/show_bug.cgi?id=31702.
return cannotBeOrderedLessThanZeroImpl(I->getOperand(0), TLI, SignBitOnly,
Depth + 1);
+
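The corner cases this TODO enumerates can likewise be confirmed numerically with plain libm (standard C++ only):

#include <cmath>
#include <cstdio>

int main() {
  std::printf("%g\n", std::pow(-0.0, 3.0));  // -0: positive odd exponent
  std::printf("%g\n", std::pow(-0.0, -3.0)); // -inf: negative odd exponent
  std::printf("%g\n", std::pow(-2.0, 3.0));  // -8: odd exponent, negative base
}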
case Intrinsic::fma:
case Intrinsic::fmuladd:
// x*x+y is non-negative if y is non-negative.
@@ -3150,6 +3205,9 @@ Value *llvm::GetUnderlyingObject(Value *V, const DataLayout &DL,
if (GA->isInterposable())
return V;
V = GA->getAliasee();
+ } else if (isa<AllocaInst>(V)) {
+ // An alloca can't be further simplified.
+ return V;
} else {
if (auto CS = CallSite(V))
if (Value *RV = CS.getReturnedArgOperand()) {
@@ -3327,6 +3385,10 @@ bool llvm::isSafeToSpeculativelyExecute(const Value *V,
case Intrinsic::rint:
case Intrinsic::round:
return true;
+ // These intrinsics do not correspond to any libm function, and
+ // do not set errno.
+ case Intrinsic::powi:
+ return true;
// TODO: are convert_{from,to}_fp16 safe?
// TODO: can we list target-specific intrinsics here?
default: break;
@@ -3406,6 +3468,16 @@ static bool isKnownNonNullFromDominatingCondition(const Value *V,
if (NumUsesExplored >= DomConditionsMaxUses)
break;
NumUsesExplored++;
+
+ // If the value is used as an argument to a call or invoke, then argument
+ // attributes may provide an answer about null-ness.
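+  // For example, given 'declare void @f(i8* nonnull)', a dominating call
+  // 'call void @f(i8* %p)' implies %p is non-null at CtxI (illustrative IR).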
+ if (auto CS = ImmutableCallSite(U))
+ if (auto *CalledFunc = CS.getCalledFunction())
+ for (const Argument &Arg : CalledFunc->args())
+ if (CS.getArgOperand(Arg.getArgNo()) == V &&
+ Arg.hasNonNullAttr() && DT->dominates(CS.getInstruction(), CtxI))
+ return true;
+
// Consider only compare instructions uniquely controlling a branch
CmpInst::Predicate Pred;
if (!match(const_cast<User *>(U),
@@ -3683,6 +3755,8 @@ bool llvm::isGuaranteedToTransferExecutionToSuccessor(const Instruction *I) {
return false;
if (isa<ReturnInst>(I))
return false;
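+  // An 'unreachable' terminator has no successors to transfer execution to.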
+ if (isa<UnreachableInst>(I))
+ return false;
// Calls can throw, or contain an infinite loop, or kill the process.
if (auto CS = ImmutableCallSite(I)) {
@@ -3731,79 +3805,33 @@ bool llvm::isGuaranteedToExecuteForEveryIteration(const Instruction *I,
bool llvm::propagatesFullPoison(const Instruction *I) {
switch (I->getOpcode()) {
- case Instruction::Add:
- case Instruction::Sub:
- case Instruction::Xor:
- case Instruction::Trunc:
- case Instruction::BitCast:
- case Instruction::AddrSpaceCast:
- // These operations all propagate poison unconditionally. Note that poison
- // is not any particular value, so xor or subtraction of poison with
- // itself still yields poison, not zero.
- return true;
-
- case Instruction::AShr:
- case Instruction::SExt:
- // For these operations, one bit of the input is replicated across
- // multiple output bits. A replicated poison bit is still poison.
- return true;
-
- case Instruction::Shl: {
- // Left shift *by* a poison value is poison. The number of
- // positions to shift is unsigned, so no negative values are
- // possible there. Left shift by zero places preserves poison. So
- // it only remains to consider left shift of poison by a positive
- // number of places.
- //
- // A left shift by a positive number of places leaves the lowest order bit
- // non-poisoned. However, if such a shift has a no-wrap flag, then we can
- // make the poison operand violate that flag, yielding a fresh full-poison
- // value.
- auto *OBO = cast<OverflowingBinaryOperator>(I);
- return OBO->hasNoUnsignedWrap() || OBO->hasNoSignedWrap();
- }
-
- case Instruction::Mul: {
- // A multiplication by zero yields a non-poison zero result, so we need to
- // rule out zero as an operand. Conservatively, multiplication by a
- // non-zero constant is not multiplication by zero.
- //
- // Multiplication by a non-zero constant can leave some bits
- // non-poisoned. For example, a multiplication by 2 leaves the lowest
- // order bit unpoisoned. So we need to consider that.
- //
- // Multiplication by 1 preserves poison. If the multiplication has a
- // no-wrap flag, then we can make the poison operand violate that flag
- // when multiplied by any integer other than 0 and 1.
- auto *OBO = cast<OverflowingBinaryOperator>(I);
- if (OBO->hasNoUnsignedWrap() || OBO->hasNoSignedWrap()) {
- for (Value *V : OBO->operands()) {
- if (auto *CI = dyn_cast<ConstantInt>(V)) {
- // A ConstantInt cannot yield poison, so we can assume that it is
- // the other operand that is poison.
- return !CI->isZero();
- }
- }
- }
- return false;
- }
+ case Instruction::Add:
+ case Instruction::Sub:
+ case Instruction::Xor:
+ case Instruction::Trunc:
+ case Instruction::BitCast:
+ case Instruction::AddrSpaceCast:
+ case Instruction::Mul:
+ case Instruction::Shl:
+ case Instruction::GetElementPtr:
+ // These operations all propagate poison unconditionally. Note that poison
+ // is not any particular value, so xor or subtraction of poison with
+ // itself still yields poison, not zero.
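+    // For example, if %p is poison, then both 'xor i32 %p, %p' and
+    // 'sub i32 %p, %p' are poison rather than 0 (illustrative IR).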
+ return true;
- case Instruction::ICmp:
- // Comparing poison with any value yields poison. This is why, for
- // instance, x s< (x +nsw 1) can be folded to true.
- return true;
+ case Instruction::AShr:
+ case Instruction::SExt:
+ // For these operations, one bit of the input is replicated across
+ // multiple output bits. A replicated poison bit is still poison.
+ return true;
- case Instruction::GetElementPtr:
- // A GEP implicitly represents a sequence of additions, subtractions,
- // truncations, sign extensions and multiplications. The multiplications
- // are by the non-zero sizes of some set of types, so we do not have to be
- // concerned with multiplication by zero. If the GEP is in-bounds, then
- // these operations are implicitly no-signed-wrap so poison is propagated
- // by the arguments above for Add, Sub, Trunc, SExt and Mul.
- return cast<GEPOperator>(I)->isInBounds();
+ case Instruction::ICmp:
+ // Comparing poison with any value yields poison. This is why, for
+ // instance, x s< (x +nsw 1) can be folded to true.
+ return true;
- default:
- return false;
+ default:
+ return false;
}
}
@@ -3906,6 +3934,37 @@ static SelectPatternResult matchMinMax(CmpInst::Predicate Pred,
Value *CmpLHS, Value *CmpRHS,
Value *TrueVal, Value *FalseVal,
Value *&LHS, Value *&RHS) {
+ // Assume success. If there's no match, callers should not use these anyway.
+ LHS = TrueVal;
+ RHS = FalseVal;
+
+ // Recognize variations of:
+ // CLAMP(v,l,h) ==> ((v) < (l) ? (l) : ((v) > (h) ? (h) : (v)))
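+  // For instance, with C1 = 0 and C2 = 255 the first pattern below matches
+  //   (X <s 0) ? 0 : SMIN(X, 255)
+  // and recognizes it as SMAX(SMIN(X, 255), 0), i.e. a clamp to [0, 255].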
+ const APInt *C1;
+ if (CmpRHS == TrueVal && match(CmpRHS, m_APInt(C1))) {
+ const APInt *C2;
+
+ // (X <s C1) ? C1 : SMIN(X, C2) ==> SMAX(SMIN(X, C2), C1)
+ if (match(FalseVal, m_SMin(m_Specific(CmpLHS), m_APInt(C2))) &&
+ C1->slt(*C2) && Pred == CmpInst::ICMP_SLT)
+ return {SPF_SMAX, SPNB_NA, false};
+
+ // (X >s C1) ? C1 : SMAX(X, C2) ==> SMIN(SMAX(X, C2), C1)
+ if (match(FalseVal, m_SMax(m_Specific(CmpLHS), m_APInt(C2))) &&
+ C1->sgt(*C2) && Pred == CmpInst::ICMP_SGT)
+ return {SPF_SMIN, SPNB_NA, false};
+
+ // (X <u C1) ? C1 : UMIN(X, C2) ==> UMAX(UMIN(X, C2), C1)
+ if (match(FalseVal, m_UMin(m_Specific(CmpLHS), m_APInt(C2))) &&
+ C1->ult(*C2) && Pred == CmpInst::ICMP_ULT)
+ return {SPF_UMAX, SPNB_NA, false};
+
+ // (X >u C1) ? C1 : UMAX(X, C2) ==> UMIN(UMAX(X, C2), C1)
+ if (match(FalseVal, m_UMax(m_Specific(CmpLHS), m_APInt(C2))) &&
+ C1->ugt(*C2) && Pred == CmpInst::ICMP_UGT)
+ return {SPF_UMIN, SPNB_NA, false};
+ }
+
if (Pred != CmpInst::ICMP_SGT && Pred != CmpInst::ICMP_SLT)
return {SPF_UNKNOWN, SPNB_NA, false};
@@ -3913,23 +3972,16 @@ static SelectPatternResult matchMinMax(CmpInst::Predicate Pred,
// (X >s Y) ? 0 : Z ==> (Z >s 0) ? 0 : Z ==> SMIN(Z, 0)
// (X <s Y) ? 0 : Z ==> (Z <s 0) ? 0 : Z ==> SMAX(Z, 0)
if (match(TrueVal, m_Zero()) &&
- match(FalseVal, m_NSWSub(m_Specific(CmpLHS), m_Specific(CmpRHS)))) {
- LHS = TrueVal;
- RHS = FalseVal;
+ match(FalseVal, m_NSWSub(m_Specific(CmpLHS), m_Specific(CmpRHS))))
return {Pred == CmpInst::ICMP_SGT ? SPF_SMIN : SPF_SMAX, SPNB_NA, false};
- }
// Z = X -nsw Y
// (X >s Y) ? Z : 0 ==> (Z >s 0) ? Z : 0 ==> SMAX(Z, 0)
// (X <s Y) ? Z : 0 ==> (Z <s 0) ? Z : 0 ==> SMIN(Z, 0)
if (match(FalseVal, m_Zero()) &&
- match(TrueVal, m_NSWSub(m_Specific(CmpLHS), m_Specific(CmpRHS)))) {
- LHS = TrueVal;
- RHS = FalseVal;
+ match(TrueVal, m_NSWSub(m_Specific(CmpLHS), m_Specific(CmpRHS))))
return {Pred == CmpInst::ICMP_SGT ? SPF_SMAX : SPF_SMIN, SPNB_NA, false};
- }
- const APInt *C1;
if (!match(CmpRHS, m_APInt(C1)))
return {SPF_UNKNOWN, SPNB_NA, false};
@@ -3940,41 +3992,29 @@ static SelectPatternResult matchMinMax(CmpInst::Predicate Pred,
// Is the sign bit set?
// (X <s 0) ? X : MAXVAL ==> (X >u MAXVAL) ? X : MAXVAL ==> UMAX
// (X <s 0) ? MAXVAL : X ==> (X >u MAXVAL) ? MAXVAL : X ==> UMIN
- if (Pred == CmpInst::ICMP_SLT && *C1 == 0 && C2->isMaxSignedValue()) {
- LHS = TrueVal;
- RHS = FalseVal;
+ if (Pred == CmpInst::ICMP_SLT && *C1 == 0 && C2->isMaxSignedValue())
return {CmpLHS == TrueVal ? SPF_UMAX : SPF_UMIN, SPNB_NA, false};
- }
// Is the sign bit clear?
// (X >s -1) ? MINVAL : X ==> (X <u MINVAL) ? MINVAL : X ==> UMAX
// (X >s -1) ? X : MINVAL ==> (X <u MINVAL) ? X : MINVAL ==> UMIN
if (Pred == CmpInst::ICMP_SGT && C1->isAllOnesValue() &&
- C2->isMinSignedValue()) {
- LHS = TrueVal;
- RHS = FalseVal;
+ C2->isMinSignedValue())
return {CmpLHS == FalseVal ? SPF_UMAX : SPF_UMIN, SPNB_NA, false};
- }
}
// Look through 'not' ops to find disguised signed min/max.
// (X >s C) ? ~X : ~C ==> (~X <s ~C) ? ~X : ~C ==> SMIN(~X, ~C)
// (X <s C) ? ~X : ~C ==> (~X >s ~C) ? ~X : ~C ==> SMAX(~X, ~C)
if (match(TrueVal, m_Not(m_Specific(CmpLHS))) &&
- match(FalseVal, m_APInt(C2)) && ~(*C1) == *C2) {
- LHS = TrueVal;
- RHS = FalseVal;
+ match(FalseVal, m_APInt(C2)) && ~(*C1) == *C2)
return {Pred == CmpInst::ICMP_SGT ? SPF_SMIN : SPF_SMAX, SPNB_NA, false};
- }
// (X >s C) ? ~C : ~X ==> (~X <s ~C) ? ~C : ~X ==> SMAX(~C, ~X)
// (X <s C) ? ~C : ~X ==> (~X >s ~C) ? ~C : ~X ==> SMIN(~C, ~X)
if (match(FalseVal, m_Not(m_Specific(CmpLHS))) &&
- match(TrueVal, m_APInt(C2)) && ~(*C1) == *C2) {
- LHS = TrueVal;
- RHS = FalseVal;
+ match(TrueVal, m_APInt(C2)) && ~(*C1) == *C2)
return {Pred == CmpInst::ICMP_SGT ? SPF_SMAX : SPF_SMIN, SPNB_NA, false};
- }
return {SPF_UNKNOWN, SPNB_NA, false};
}
@@ -4101,58 +4141,64 @@ static SelectPatternResult matchSelectPattern(CmpInst::Predicate Pred,
static Value *lookThroughCast(CmpInst *CmpI, Value *V1, Value *V2,
Instruction::CastOps *CastOp) {
- CastInst *CI = dyn_cast<CastInst>(V1);
- Constant *C = dyn_cast<Constant>(V2);
- if (!CI)
+ auto *Cast1 = dyn_cast<CastInst>(V1);
+ if (!Cast1)
return nullptr;
- *CastOp = CI->getOpcode();
-
- if (auto *CI2 = dyn_cast<CastInst>(V2)) {
- // If V1 and V2 are both the same cast from the same type, we can look
- // through V1.
- if (CI2->getOpcode() == CI->getOpcode() &&
- CI2->getSrcTy() == CI->getSrcTy())
- return CI2->getOperand(0);
- return nullptr;
- } else if (!C) {
+
+ *CastOp = Cast1->getOpcode();
+ Type *SrcTy = Cast1->getSrcTy();
+ if (auto *Cast2 = dyn_cast<CastInst>(V2)) {
+ // If V1 and V2 are both the same cast from the same type, look through V1.
+ if (*CastOp == Cast2->getOpcode() && SrcTy == Cast2->getSrcTy())
+ return Cast2->getOperand(0);
return nullptr;
}
- Constant *CastedTo = nullptr;
-
- if (isa<ZExtInst>(CI) && CmpI->isUnsigned())
- CastedTo = ConstantExpr::getTrunc(C, CI->getSrcTy());
-
- if (isa<SExtInst>(CI) && CmpI->isSigned())
- CastedTo = ConstantExpr::getTrunc(C, CI->getSrcTy(), true);
-
- if (isa<TruncInst>(CI))
- CastedTo = ConstantExpr::getIntegerCast(C, CI->getSrcTy(), CmpI->isSigned());
-
- if (isa<FPTruncInst>(CI))
- CastedTo = ConstantExpr::getFPExtend(C, CI->getSrcTy(), true);
-
- if (isa<FPExtInst>(CI))
- CastedTo = ConstantExpr::getFPTrunc(C, CI->getSrcTy(), true);
-
- if (isa<FPToUIInst>(CI))
- CastedTo = ConstantExpr::getUIToFP(C, CI->getSrcTy(), true);
-
- if (isa<FPToSIInst>(CI))
- CastedTo = ConstantExpr::getSIToFP(C, CI->getSrcTy(), true);
-
- if (isa<UIToFPInst>(CI))
- CastedTo = ConstantExpr::getFPToUI(C, CI->getSrcTy(), true);
+ auto *C = dyn_cast<Constant>(V2);
+ if (!C)
+ return nullptr;
- if (isa<SIToFPInst>(CI))
- CastedTo = ConstantExpr::getFPToSI(C, CI->getSrcTy(), true);
+ Constant *CastedTo = nullptr;
+ switch (*CastOp) {
+ case Instruction::ZExt:
+ if (CmpI->isUnsigned())
+ CastedTo = ConstantExpr::getTrunc(C, SrcTy);
+ break;
+ case Instruction::SExt:
+ if (CmpI->isSigned())
+ CastedTo = ConstantExpr::getTrunc(C, SrcTy, true);
+ break;
+ case Instruction::Trunc:
+ CastedTo = ConstantExpr::getIntegerCast(C, SrcTy, CmpI->isSigned());
+ break;
+ case Instruction::FPTrunc:
+ CastedTo = ConstantExpr::getFPExtend(C, SrcTy, true);
+ break;
+ case Instruction::FPExt:
+ CastedTo = ConstantExpr::getFPTrunc(C, SrcTy, true);
+ break;
+ case Instruction::FPToUI:
+ CastedTo = ConstantExpr::getUIToFP(C, SrcTy, true);
+ break;
+ case Instruction::FPToSI:
+ CastedTo = ConstantExpr::getSIToFP(C, SrcTy, true);
+ break;
+ case Instruction::UIToFP:
+ CastedTo = ConstantExpr::getFPToUI(C, SrcTy, true);
+ break;
+ case Instruction::SIToFP:
+ CastedTo = ConstantExpr::getFPToSI(C, SrcTy, true);
+ break;
+ default:
+ break;
+ }
if (!CastedTo)
return nullptr;
- Constant *CastedBack =
- ConstantExpr::getCast(CI->getOpcode(), CastedTo, C->getType(), true);
// Make sure the cast doesn't lose any information.
+ Constant *CastedBack =
+ ConstantExpr::getCast(*CastOp, CastedTo, C->getType(), true);
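+  // E.g. for 'zext i8 %x to i32' compared against i32 300: the trunc to i8
+  // yields 44, and zext'ing 44 back gives 44 != 300, so we must bail out.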
if (CastedBack != C)
return nullptr;
diff --git a/lib/Analysis/VectorUtils.cpp b/lib/Analysis/VectorUtils.cpp
index 7e598f435ff5..722f17a8067e 100644
--- a/lib/Analysis/VectorUtils.cpp
+++ b/lib/Analysis/VectorUtils.cpp
@@ -488,3 +488,88 @@ Instruction *llvm::propagateMetadata(Instruction *Inst, ArrayRef<Value *> VL) {
return Inst;
}
+
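+/// Create an interleave shuffle mask. E.g. for VF = 4 and NumVecs = 2 this
+/// returns <0, 4, 1, 5, 2, 6, 3, 7>, which interleaves the lanes of two
+/// four-element vectors.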
+Constant *llvm::createInterleaveMask(IRBuilder<> &Builder, unsigned VF,
+ unsigned NumVecs) {
+ SmallVector<Constant *, 16> Mask;
+ for (unsigned i = 0; i < VF; i++)
+ for (unsigned j = 0; j < NumVecs; j++)
+ Mask.push_back(Builder.getInt32(j * VF + i));
+
+ return ConstantVector::get(Mask);
+}
+
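+/// Create a stride shuffle mask. E.g. for Start = 0, Stride = 2 and VF = 4
+/// this returns <0, 2, 4, 6>, selecting every Stride-th lane starting at
+/// Start.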
+Constant *llvm::createStrideMask(IRBuilder<> &Builder, unsigned Start,
+ unsigned Stride, unsigned VF) {
+ SmallVector<Constant *, 16> Mask;
+ for (unsigned i = 0; i < VF; i++)
+ Mask.push_back(Builder.getInt32(Start + i * Stride));
+
+ return ConstantVector::get(Mask);
+}
+
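+/// Create a sequential shuffle mask padded with undefs. E.g. for Start = 0,
+/// NumInts = 4 and NumUndefs = 4 this returns
+/// <0, 1, 2, 3, undef, undef, undef, undef>.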
+Constant *llvm::createSequentialMask(IRBuilder<> &Builder, unsigned Start,
+ unsigned NumInts, unsigned NumUndefs) {
+ SmallVector<Constant *, 16> Mask;
+ for (unsigned i = 0; i < NumInts; i++)
+ Mask.push_back(Builder.getInt32(Start + i));
+
+ Constant *Undef = UndefValue::get(Builder.getInt32Ty());
+ for (unsigned i = 0; i < NumUndefs; i++)
+ Mask.push_back(Undef);
+
+ return ConstantVector::get(Mask);
+}
+
+/// A helper function for concatenating vectors. This function concatenates two
+/// vectors having the same element type. If the second vector has fewer
+/// elements than the first, it is padded with undefs.
+static Value *concatenateTwoVectors(IRBuilder<> &Builder, Value *V1,
+ Value *V2) {
+ VectorType *VecTy1 = dyn_cast<VectorType>(V1->getType());
+ VectorType *VecTy2 = dyn_cast<VectorType>(V2->getType());
+ assert(VecTy1 && VecTy2 &&
+ VecTy1->getScalarType() == VecTy2->getScalarType() &&
+ "Expect two vectors with the same element type");
+
+ unsigned NumElts1 = VecTy1->getNumElements();
+ unsigned NumElts2 = VecTy2->getNumElements();
+  assert(NumElts1 >= NumElts2 &&
+         "Expect the first vector to have at least as many elements");
+
+ if (NumElts1 > NumElts2) {
+ // Extend with UNDEFs.
+ Constant *ExtMask =
+ createSequentialMask(Builder, 0, NumElts2, NumElts1 - NumElts2);
+ V2 = Builder.CreateShuffleVector(V2, UndefValue::get(VecTy2), ExtMask);
+ }
+
+ Constant *Mask = createSequentialMask(Builder, 0, NumElts1 + NumElts2, 0);
+ return Builder.CreateShuffleVector(V1, V2, Mask);
+}
+
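+/// Concatenate a list of vectors into a single wide vector by pairwise
+/// concatenation until one value remains. Only the last input may have fewer
+/// elements; concatenateTwoVectors pads it with undefs first.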
+Value *llvm::concatenateVectors(IRBuilder<> &Builder, ArrayRef<Value *> Vecs) {
+ unsigned NumVecs = Vecs.size();
+ assert(NumVecs > 1 && "Should be at least two vectors");
+
+ SmallVector<Value *, 8> ResList;
+ ResList.append(Vecs.begin(), Vecs.end());
+ do {
+ SmallVector<Value *, 8> TmpList;
+ for (unsigned i = 0; i < NumVecs - 1; i += 2) {
+ Value *V0 = ResList[i], *V1 = ResList[i + 1];
+ assert((V0->getType() == V1->getType() || i == NumVecs - 2) &&
+ "Only the last vector may have a different type");
+
+ TmpList.push_back(concatenateTwoVectors(Builder, V0, V1));
+ }
+
+ // Push the last vector if the total number of vectors is odd.
+ if (NumVecs % 2 != 0)
+ TmpList.push_back(ResList[NumVecs - 1]);
+
+ ResList = TmpList;
+ NumVecs = ResList.size();
+ } while (NumVecs > 1);
+
+ return ResList[0];
+}
diff --git a/lib/AsmParser/LLLexer.cpp b/lib/AsmParser/LLLexer.cpp
index 752942fc9fcc..49a8ce4bed0b 100644
--- a/lib/AsmParser/LLLexer.cpp
+++ b/lib/AsmParser/LLLexer.cpp
@@ -548,6 +548,7 @@ lltok::Kind LLLexer::LexIdentifier() {
KEYWORD(ninf);
KEYWORD(nsz);
KEYWORD(arcp);
+ KEYWORD(contract);
KEYWORD(fast);
KEYWORD(nuw);
KEYWORD(nsw);
diff --git a/lib/AsmParser/LLParser.cpp b/lib/AsmParser/LLParser.cpp
index 4cd986e143b6..58ea9296afda 100644
--- a/lib/AsmParser/LLParser.cpp
+++ b/lib/AsmParser/LLParser.cpp
@@ -130,10 +130,9 @@ bool LLParser::ValidateEndOfModule() {
B.merge(NumberedAttrBuilders[Attr]);
if (Function *Fn = dyn_cast<Function>(V)) {
- AttributeSet AS = Fn->getAttributes();
- AttrBuilder FnAttrs(AS.getFnAttributes(), AttributeSet::FunctionIndex);
- AS = AS.removeAttributes(Context, AttributeSet::FunctionIndex,
- AS.getFnAttributes());
+ AttributeList AS = Fn->getAttributes();
+ AttrBuilder FnAttrs(AS.getFnAttributes());
+ AS = AS.removeAttributes(Context, AttributeList::FunctionIndex);
FnAttrs.merge(B);
@@ -144,32 +143,27 @@ bool LLParser::ValidateEndOfModule() {
FnAttrs.removeAttribute(Attribute::Alignment);
}
- AS = AS.addAttributes(Context, AttributeSet::FunctionIndex,
- AttributeSet::get(Context,
- AttributeSet::FunctionIndex,
- FnAttrs));
+ AS = AS.addAttributes(
+ Context, AttributeList::FunctionIndex,
+ AttributeList::get(Context, AttributeList::FunctionIndex, FnAttrs));
Fn->setAttributes(AS);
} else if (CallInst *CI = dyn_cast<CallInst>(V)) {
- AttributeSet AS = CI->getAttributes();
- AttrBuilder FnAttrs(AS.getFnAttributes(), AttributeSet::FunctionIndex);
- AS = AS.removeAttributes(Context, AttributeSet::FunctionIndex,
- AS.getFnAttributes());
+ AttributeList AS = CI->getAttributes();
+ AttrBuilder FnAttrs(AS.getFnAttributes());
+ AS = AS.removeAttributes(Context, AttributeList::FunctionIndex);
FnAttrs.merge(B);
- AS = AS.addAttributes(Context, AttributeSet::FunctionIndex,
- AttributeSet::get(Context,
- AttributeSet::FunctionIndex,
- FnAttrs));
+ AS = AS.addAttributes(
+ Context, AttributeList::FunctionIndex,
+ AttributeList::get(Context, AttributeList::FunctionIndex, FnAttrs));
CI->setAttributes(AS);
} else if (InvokeInst *II = dyn_cast<InvokeInst>(V)) {
- AttributeSet AS = II->getAttributes();
- AttrBuilder FnAttrs(AS.getFnAttributes(), AttributeSet::FunctionIndex);
- AS = AS.removeAttributes(Context, AttributeSet::FunctionIndex,
- AS.getFnAttributes());
+ AttributeList AS = II->getAttributes();
+ AttrBuilder FnAttrs(AS.getFnAttributes());
+ AS = AS.removeAttributes(Context, AttributeList::FunctionIndex);
FnAttrs.merge(B);
- AS = AS.addAttributes(Context, AttributeSet::FunctionIndex,
- AttributeSet::get(Context,
- AttributeSet::FunctionIndex,
- FnAttrs));
+ AS = AS.addAttributes(
+ Context, AttributeList::FunctionIndex,
+ AttributeList::get(Context, AttributeList::FunctionIndex, FnAttrs));
II->setAttributes(AS);
} else {
llvm_unreachable("invalid object with forward attribute group reference");
@@ -1855,6 +1849,34 @@ bool LLParser::ParseOptionalCommaAlign(unsigned &Alignment,
return false;
}
+/// ParseOptionalCommaAddrSpace
+/// ::=
+/// ::= ',' addrspace(1)
+///
+/// This returns with AteExtraComma set to true if it ate an excess comma at the
+/// end.
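+///
+/// For example, this accepts the trailing ", addrspace(5)" in
+/// "%p = alloca i32, align 4, addrspace(5)" (illustrative syntax).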
+bool LLParser::ParseOptionalCommaAddrSpace(unsigned &AddrSpace,
+ LocTy &Loc,
+ bool &AteExtraComma) {
+ AteExtraComma = false;
+ while (EatIfPresent(lltok::comma)) {
+ // Metadata at the end is an early exit.
+ if (Lex.getKind() == lltok::MetadataVar) {
+ AteExtraComma = true;
+ return false;
+ }
+
+ Loc = Lex.getLoc();
+ if (Lex.getKind() != lltok::kw_addrspace)
+ return Error(Lex.getLoc(), "expected metadata or 'addrspace'");
+
+ if (ParseOptionalAddrSpace(AddrSpace))
+ return true;
+ }
+
+ return false;
+}
+
bool LLParser::parseAllocSizeArguments(unsigned &BaseSizeArg,
Optional<unsigned> &HowManyArg) {
Lex.Lex();
@@ -2098,7 +2120,6 @@ bool LLParser::ParseParameterList(SmallVectorImpl<ParamInfo> &ArgList,
if (ParseToken(lltok::lparen, "expected '(' in call"))
return true;
- unsigned AttrIndex = 1;
while (Lex.getKind() != lltok::rparen) {
// If this isn't the first argument, we need a comma.
if (!ArgList.empty() &&
@@ -2132,9 +2153,8 @@ bool LLParser::ParseParameterList(SmallVectorImpl<ParamInfo> &ArgList,
if (ParseOptionalParamAttrs(ArgAttrs) || ParseValue(ArgTy, V, PFS))
return true;
}
- ArgList.push_back(ParamInfo(ArgLoc, V, AttributeSet::get(V->getContext(),
- AttrIndex++,
- ArgAttrs)));
+ ArgList.push_back(ParamInfo(
+ ArgLoc, V, AttributeSet::get(V->getContext(), ArgAttrs)));
}
if (IsMustTailCall && InVarArgsFunc)
@@ -2239,9 +2259,8 @@ bool LLParser::ParseArgumentList(SmallVectorImpl<ArgInfo> &ArgList,
if (!FunctionType::isValidArgumentType(ArgTy))
return Error(TypeLoc, "invalid type for function argument");
- unsigned AttrIndex = 1;
- ArgList.emplace_back(TypeLoc, ArgTy, AttributeSet::get(ArgTy->getContext(),
- AttrIndex++, Attrs),
+ ArgList.emplace_back(TypeLoc, ArgTy,
+ AttributeSet::get(ArgTy->getContext(), Attrs),
std::move(Name));
while (EatIfPresent(lltok::comma)) {
@@ -2268,10 +2287,9 @@ bool LLParser::ParseArgumentList(SmallVectorImpl<ArgInfo> &ArgList,
if (!ArgTy->isFirstClassType())
return Error(TypeLoc, "invalid type for function argument");
- ArgList.emplace_back(
- TypeLoc, ArgTy,
- AttributeSet::get(ArgTy->getContext(), AttrIndex++, Attrs),
- std::move(Name));
+ ArgList.emplace_back(TypeLoc, ArgTy,
+ AttributeSet::get(ArgTy->getContext(), Attrs),
+ std::move(Name));
}
}
@@ -2295,7 +2313,7 @@ bool LLParser::ParseFunctionType(Type *&Result) {
for (unsigned i = 0, e = ArgList.size(); i != e; ++i) {
if (!ArgList[i].Name.empty())
return Error(ArgList[i].Loc, "argument name invalid in function type");
- if (ArgList[i].Attrs.hasAttributes(i + 1))
+ if (ArgList[i].Attrs.hasAttributes())
return Error(ArgList[i].Loc,
"argument attributes invalid in function type");
}
@@ -3908,7 +3926,8 @@ bool LLParser::ParseDIBasicType(MDNode *&Result, bool IsDistinct) {
/// ParseDIDerivedType:
/// ::= !DIDerivedType(tag: DW_TAG_pointer_type, name: "int", file: !0,
/// line: 7, scope: !1, baseType: !2, size: 32,
-/// align: 32, offset: 0, flags: 0, extraData: !3)
+/// align: 32, offset: 0, flags: 0, extraData: !3,
+/// dwarfAddressSpace: 3)
bool LLParser::ParseDIDerivedType(MDNode *&Result, bool IsDistinct) {
#define VISIT_MD_FIELDS(OPTIONAL, REQUIRED) \
REQUIRED(tag, DwarfTagField, ); \
@@ -3921,14 +3940,20 @@ bool LLParser::ParseDIDerivedType(MDNode *&Result, bool IsDistinct) {
OPTIONAL(align, MDUnsignedField, (0, UINT32_MAX)); \
OPTIONAL(offset, MDUnsignedField, (0, UINT64_MAX)); \
OPTIONAL(flags, DIFlagField, ); \
- OPTIONAL(extraData, MDField, );
+ OPTIONAL(extraData, MDField, ); \
+ OPTIONAL(dwarfAddressSpace, MDUnsignedField, (UINT32_MAX, UINT32_MAX));
PARSE_MD_FIELDS();
#undef VISIT_MD_FIELDS
+ Optional<unsigned> DWARFAddressSpace;
+ if (dwarfAddressSpace.Val != UINT32_MAX)
+ DWARFAddressSpace = dwarfAddressSpace.Val;
+
Result = GET_OR_DISTINCT(DIDerivedType,
(Context, tag.Val, name.Val, file.Val, line.Val,
scope.Val, baseType.Val, size.Val, align.Val,
- offset.Val, flags.Val, extraData.Val));
+ offset.Val, DWARFAddressSpace, flags.Val,
+ extraData.Val));
return false;
}
@@ -4029,7 +4054,8 @@ bool LLParser::ParseDICompileUnit(MDNode *&Result, bool IsDistinct) {
OPTIONAL(imports, MDField, ); \
OPTIONAL(macros, MDField, ); \
OPTIONAL(dwoId, MDUnsignedField, ); \
- OPTIONAL(splitDebugInlining, MDBoolField, = true);
+ OPTIONAL(splitDebugInlining, MDBoolField, = true); \
+ OPTIONAL(debugInfoForProfiling, MDBoolField, = false);
PARSE_MD_FIELDS();
#undef VISIT_MD_FIELDS
@@ -4037,7 +4063,7 @@ bool LLParser::ParseDICompileUnit(MDNode *&Result, bool IsDistinct) {
Context, language.Val, file.Val, producer.Val, isOptimized.Val, flags.Val,
runtimeVersion.Val, splitDebugFilename.Val, emissionKind.Val, enums.Val,
retainedTypes.Val, globals.Val, imports.Val, macros.Val, dwoId.Val,
- splitDebugInlining.Val);
+ splitDebugInlining.Val, debugInfoForProfiling.Val);
return false;
}
@@ -4589,6 +4615,9 @@ bool LLParser::parseConstantValue(Type *Ty, Constant *&C) {
C = cast<Constant>(V);
return false;
}
+ case ValID::t_Null:
+ C = Constant::getNullValue(Ty);
+ return false;
default:
return Error(Loc, "expected a constant value");
}
@@ -4735,25 +4764,14 @@ bool LLParser::ParseFunctionHeader(Function *&Fn, bool isDefine) {
std::vector<Type*> ParamTypeList;
SmallVector<AttributeSet, 8> Attrs;
- if (RetAttrs.hasAttributes())
- Attrs.push_back(AttributeSet::get(RetType->getContext(),
- AttributeSet::ReturnIndex,
- RetAttrs));
-
for (unsigned i = 0, e = ArgList.size(); i != e; ++i) {
ParamTypeList.push_back(ArgList[i].Ty);
- if (ArgList[i].Attrs.hasAttributes(i + 1)) {
- AttrBuilder B(ArgList[i].Attrs, i + 1);
- Attrs.push_back(AttributeSet::get(RetType->getContext(), i + 1, B));
- }
+ Attrs.push_back(ArgList[i].Attrs);
}
- if (FuncAttrs.hasAttributes())
- Attrs.push_back(AttributeSet::get(RetType->getContext(),
- AttributeSet::FunctionIndex,
- FuncAttrs));
-
- AttributeSet PAL = AttributeSet::get(Context, Attrs);
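+  // The AttributeList factory takes the function, return and per-argument
+  // attribute sets separately instead of index-tagged AttributeSets.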
+ AttributeList PAL =
+ AttributeList::get(Context, AttributeSet::get(Context, FuncAttrs),
+ AttributeSet::get(Context, RetAttrs), Attrs);
if (PAL.hasAttribute(1, Attribute::StructRet) && !RetType->isVoidTy())
return Error(RetTypeLoc, "functions with 'sret' argument must return void");
@@ -5363,13 +5381,8 @@ bool LLParser::ParseInvoke(Instruction *&Inst, PerFunctionState &PFS) {
return true;
// Set up the Attribute for the function.
- SmallVector<AttributeSet, 8> Attrs;
- if (RetAttrs.hasAttributes())
- Attrs.push_back(AttributeSet::get(RetType->getContext(),
- AttributeSet::ReturnIndex,
- RetAttrs));
-
- SmallVector<Value*, 8> Args;
+ SmallVector<Value *, 8> Args;
+ SmallVector<AttributeSet, 8> ArgAttrs;
// Loop through FunctionType's arguments and ensure they are specified
// correctly. Also, gather any parameter attributes.
@@ -5387,26 +5400,19 @@ bool LLParser::ParseInvoke(Instruction *&Inst, PerFunctionState &PFS) {
return Error(ArgList[i].Loc, "argument is not of expected type '" +
getTypeString(ExpectedTy) + "'");
Args.push_back(ArgList[i].V);
- if (ArgList[i].Attrs.hasAttributes(i + 1)) {
- AttrBuilder B(ArgList[i].Attrs, i + 1);
- Attrs.push_back(AttributeSet::get(RetType->getContext(), i + 1, B));
- }
+ ArgAttrs.push_back(ArgList[i].Attrs);
}
if (I != E)
return Error(CallLoc, "not enough parameters specified for call");
- if (FnAttrs.hasAttributes()) {
- if (FnAttrs.hasAlignmentAttr())
- return Error(CallLoc, "invoke instructions may not have an alignment");
-
- Attrs.push_back(AttributeSet::get(RetType->getContext(),
- AttributeSet::FunctionIndex,
- FnAttrs));
- }
+ if (FnAttrs.hasAlignmentAttr())
+ return Error(CallLoc, "invoke instructions may not have an alignment");
// Finish off the Attribute and check them
- AttributeSet PAL = AttributeSet::get(Context, Attrs);
+ AttributeList PAL =
+ AttributeList::get(Context, AttributeSet::get(Context, FnAttrs),
+ AttributeSet::get(Context, RetAttrs), ArgAttrs);
InvokeInst *II =
InvokeInst::Create(Ty, Callee, NormalBB, UnwindBB, Args, BundleList);
@@ -5968,10 +5974,6 @@ bool LLParser::ParseCall(Instruction *&Inst, PerFunctionState &PFS,
// Set up the Attribute for the function.
SmallVector<AttributeSet, 8> Attrs;
- if (RetAttrs.hasAttributes())
- Attrs.push_back(AttributeSet::get(RetType->getContext(),
- AttributeSet::ReturnIndex,
- RetAttrs));
SmallVector<Value*, 8> Args;
@@ -5991,26 +5993,19 @@ bool LLParser::ParseCall(Instruction *&Inst, PerFunctionState &PFS,
return Error(ArgList[i].Loc, "argument is not of expected type '" +
getTypeString(ExpectedTy) + "'");
Args.push_back(ArgList[i].V);
- if (ArgList[i].Attrs.hasAttributes(i + 1)) {
- AttrBuilder B(ArgList[i].Attrs, i + 1);
- Attrs.push_back(AttributeSet::get(RetType->getContext(), i + 1, B));
- }
+ Attrs.push_back(ArgList[i].Attrs);
}
if (I != E)
return Error(CallLoc, "not enough parameters specified for call");
- if (FnAttrs.hasAttributes()) {
- if (FnAttrs.hasAlignmentAttr())
- return Error(CallLoc, "call instructions may not have an alignment");
-
- Attrs.push_back(AttributeSet::get(RetType->getContext(),
- AttributeSet::FunctionIndex,
- FnAttrs));
- }
+ if (FnAttrs.hasAlignmentAttr())
+ return Error(CallLoc, "call instructions may not have an alignment");
// Finish off the Attribute and check them
- AttributeSet PAL = AttributeSet::get(Context, Attrs);
+ AttributeList PAL =
+ AttributeList::get(Context, AttributeSet::get(Context, FnAttrs),
+ AttributeSet::get(Context, RetAttrs), Attrs);
CallInst *CI = CallInst::Create(Ty, Callee, Args, BundleList);
CI->setTailCallKind(TCK);
@@ -6032,8 +6027,9 @@ bool LLParser::ParseCall(Instruction *&Inst, PerFunctionState &PFS,
/// (',' 'align' i32)?
int LLParser::ParseAlloc(Instruction *&Inst, PerFunctionState &PFS) {
Value *Size = nullptr;
- LocTy SizeLoc, TyLoc;
+ LocTy SizeLoc, TyLoc, ASLoc;
unsigned Alignment = 0;
+ unsigned AddrSpace = 0;
Type *Ty = nullptr;
bool IsInAlloca = EatIfPresent(lltok::kw_inalloca);
@@ -6047,12 +6043,21 @@ int LLParser::ParseAlloc(Instruction *&Inst, PerFunctionState &PFS) {
bool AteExtraComma = false;
if (EatIfPresent(lltok::comma)) {
if (Lex.getKind() == lltok::kw_align) {
- if (ParseOptionalAlignment(Alignment)) return true;
+ if (ParseOptionalAlignment(Alignment))
+ return true;
+ if (ParseOptionalCommaAddrSpace(AddrSpace, ASLoc, AteExtraComma))
+ return true;
+ } else if (Lex.getKind() == lltok::kw_addrspace) {
+ ASLoc = Lex.getLoc();
+ if (ParseOptionalAddrSpace(AddrSpace))
+ return true;
} else if (Lex.getKind() == lltok::MetadataVar) {
AteExtraComma = true;
} else {
if (ParseTypeAndValue(Size, SizeLoc, PFS) ||
- ParseOptionalCommaAlign(Alignment, AteExtraComma))
+ ParseOptionalCommaAlign(Alignment, AteExtraComma) ||
+ (!AteExtraComma &&
+ ParseOptionalCommaAddrSpace(AddrSpace, ASLoc, AteExtraComma)))
return true;
}
}
@@ -6060,7 +6065,14 @@ int LLParser::ParseAlloc(Instruction *&Inst, PerFunctionState &PFS) {
if (Size && !Size->getType()->isIntegerTy())
return Error(SizeLoc, "element count must have integer type");
- AllocaInst *AI = new AllocaInst(Ty, Size, Alignment);
+ const DataLayout &DL = M->getDataLayout();
+ unsigned AS = DL.getAllocaAddrSpace();
+ if (AS != AddrSpace) {
+ // TODO: In the future it should be possible to specify addrspace per-alloca.
+ return Error(ASLoc, "address space must match datalayout");
+ }
+
+ AllocaInst *AI = new AllocaInst(Ty, AS, Size, Alignment);
AI->setUsedWithInAlloca(IsInAlloca);
AI->setSwiftError(IsSwiftError);
Inst = AI;
diff --git a/lib/AsmParser/LLParser.h b/lib/AsmParser/LLParser.h
index 16d4e8b5baa0..4616c2e86947 100644
--- a/lib/AsmParser/LLParser.h
+++ b/lib/AsmParser/LLParser.h
@@ -193,6 +193,10 @@ namespace llvm {
case lltok::kw_ninf: FMF.setNoInfs(); Lex.Lex(); continue;
case lltok::kw_nsz: FMF.setNoSignedZeros(); Lex.Lex(); continue;
case lltok::kw_arcp: FMF.setAllowReciprocal(); Lex.Lex(); continue;
+ case lltok::kw_contract:
+ FMF.setAllowContract(true);
+ Lex.Lex();
+ continue;
default: return FMF;
}
return FMF;
@@ -242,6 +246,8 @@ namespace llvm {
bool ParseOrdering(AtomicOrdering &Ordering);
bool ParseOptionalStackAlignment(unsigned &Alignment);
bool ParseOptionalCommaAlign(unsigned &Alignment, bool &AteExtraComma);
+ bool ParseOptionalCommaAddrSpace(unsigned &AddrSpace, LocTy &Loc,
+ bool &AteExtraComma);
bool ParseOptionalCommaInAlloca(bool &IsInAlloca);
bool parseAllocSizeArguments(unsigned &ElemSizeArg,
Optional<unsigned> &HowManyArg);
@@ -393,7 +399,7 @@ namespace llvm {
Value *V;
AttributeSet Attrs;
ParamInfo(LocTy loc, Value *v, AttributeSet attrs)
- : Loc(loc), V(v), Attrs(attrs) {}
+ : Loc(loc), V(v), Attrs(attrs) {}
};
bool ParseParameterList(SmallVectorImpl<ParamInfo> &ArgList,
PerFunctionState &PFS,
@@ -447,7 +453,7 @@ namespace llvm {
AttributeSet Attrs;
std::string Name;
ArgInfo(LocTy L, Type *ty, AttributeSet Attr, const std::string &N)
- : Loc(L), Ty(ty), Attrs(Attr), Name(N) {}
+ : Loc(L), Ty(ty), Attrs(Attr), Name(N) {}
};
bool ParseArgumentList(SmallVectorImpl<ArgInfo> &ArgList, bool &isVarArg);
bool ParseFunctionHeader(Function *&Fn, bool isDefine);
diff --git a/lib/AsmParser/LLToken.h b/lib/AsmParser/LLToken.h
index 048aeee90b35..33f8e63daa05 100644
--- a/lib/AsmParser/LLToken.h
+++ b/lib/AsmParser/LLToken.h
@@ -98,6 +98,7 @@ enum Kind {
kw_ninf,
kw_nsz,
kw_arcp,
+ kw_contract,
kw_fast,
kw_nuw,
kw_nsw,
diff --git a/lib/Bitcode/Reader/BitcodeReader.cpp b/lib/Bitcode/Reader/BitcodeReader.cpp
index a46e49ccde83..24ab7e9a950c 100644
--- a/lib/Bitcode/Reader/BitcodeReader.cpp
+++ b/lib/Bitcode/Reader/BitcodeReader.cpp
@@ -379,6 +379,8 @@ protected:
BitstreamBlockInfo BlockInfo;
BitstreamCursor Stream;
+ Expected<unsigned> parseVersionRecord(ArrayRef<uint64_t> Record);
+
bool readBlockInfo();
// Contains an arbitrary and optional string identifying the bitcode producer
@@ -395,6 +397,16 @@ Error BitcodeReaderBase::error(const Twine &Message) {
return ::error(FullMsg);
}
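+
+/// Parse a VERSION record. Only module versions 0 and 1 are currently
+/// understood; callers use the returned version to decide, for example,
+/// whether relative value IDs are in use.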
+Expected<unsigned>
+BitcodeReaderBase::parseVersionRecord(ArrayRef<uint64_t> Record) {
+ if (Record.size() < 1)
+ return error("Invalid record");
+ unsigned ModuleVersion = Record[0];
+ if (ModuleVersion > 1)
+ return error("Invalid value");
+ return ModuleVersion;
+}
+
class BitcodeReader : public BitcodeReaderBase, public GVMaterializer {
LLVMContext &Context;
Module *TheModule = nullptr;
@@ -405,6 +417,9 @@ class BitcodeReader : public BitcodeReaderBase, public GVMaterializer {
bool SeenValueSymbolTable = false;
uint64_t VSTOffset = 0;
+ std::vector<std::string> SectionTable;
+ std::vector<std::string> GCTable;
+
std::vector<Type*> TypeList;
BitcodeReaderValueList ValueList;
Optional<MetadataLoader> MDLoader;
@@ -419,10 +434,10 @@ class BitcodeReader : public BitcodeReaderBase, public GVMaterializer {
/// The set of attributes by index. Index zero in the file is for null, and
/// is thus not represented here. As such all indices are off by one.
- std::vector<AttributeSet> MAttributes;
+ std::vector<AttributeList> MAttributes;
/// The set of attribute groups.
- std::map<unsigned, AttributeSet> MAttributeGroups;
+ std::map<unsigned, AttributeList> MAttributeGroups;
/// While parsing a function body, this is a list of the basic blocks for the
/// function.
@@ -520,10 +535,10 @@ private:
return FunctionBBs[ID];
}
- AttributeSet getAttributes(unsigned i) const {
+ AttributeList getAttributes(unsigned i) const {
if (i-1 < MAttributes.size())
return MAttributes[i-1];
- return AttributeSet();
+ return AttributeList();
}
/// Read a value/type pair out of the specified record from slot 'Slot'.
@@ -598,6 +613,13 @@ private:
Error parseAlignmentValue(uint64_t Exponent, unsigned &Alignment);
Error parseAttrKind(uint64_t Code, Attribute::AttrKind *Kind);
Error parseModule(uint64_t ResumeBit, bool ShouldLazyLoadMetadata = false);
+
+ Error parseComdatRecord(ArrayRef<uint64_t> Record);
+ Error parseGlobalVarRecord(ArrayRef<uint64_t> Record);
+ Error parseFunctionRecord(ArrayRef<uint64_t> Record);
+ Error parseGlobalIndirectSymbolRecord(unsigned BitCode,
+ ArrayRef<uint64_t> Record);
+
Error parseAttributeBlock();
Error parseAttributeGroupBlock();
Error parseTypeTable();
@@ -971,6 +993,8 @@ static FastMathFlags getDecodedFastMathFlags(unsigned Val) {
FMF.setNoSignedZeros();
if (0 != (Val & FastMathFlags::AllowReciprocal))
FMF.setAllowReciprocal();
+ if (0 != (Val & FastMathFlags::AllowContract))
+ FMF.setAllowContract(true);
return FMF;
}
@@ -1132,7 +1156,7 @@ Error BitcodeReader::parseAttributeBlock() {
SmallVector<uint64_t, 64> Record;
- SmallVector<AttributeSet, 8> Attrs;
+ SmallVector<AttributeList, 8> Attrs;
// Read all the records.
while (true) {
@@ -1162,10 +1186,10 @@ Error BitcodeReader::parseAttributeBlock() {
for (unsigned i = 0, e = Record.size(); i != e; i += 2) {
AttrBuilder B;
decodeLLVMAttributesForBitcode(B, Record[i+1]);
- Attrs.push_back(AttributeSet::get(Context, Record[i], B));
+ Attrs.push_back(AttributeList::get(Context, Record[i], B));
}
- MAttributes.push_back(AttributeSet::get(Context, Attrs));
+ MAttributes.push_back(AttributeList::get(Context, Attrs));
Attrs.clear();
break;
}
@@ -1173,7 +1197,7 @@ Error BitcodeReader::parseAttributeBlock() {
for (unsigned i = 0, e = Record.size(); i != e; ++i)
Attrs.push_back(MAttributeGroups[Record[i]]);
- MAttributes.push_back(AttributeSet::get(Context, Attrs));
+ MAttributes.push_back(AttributeList::get(Context, Attrs));
Attrs.clear();
break;
}
@@ -1391,7 +1415,7 @@ Error BitcodeReader::parseAttributeGroupBlock() {
}
}
- MAttributeGroups[GrpID] = AttributeSet::get(Context, Idx, B);
+ MAttributeGroups[GrpID] = AttributeList::get(Context, Idx, B);
break;
}
}
@@ -1794,22 +1818,16 @@ Error BitcodeReader::parseValueSymbolTable(uint64_t Offset) {
return Err;
Value *V = ValOrErr.get();
- auto *GO = dyn_cast<GlobalObject>(V);
- if (!GO) {
- // If this is an alias, need to get the actual Function object
- // it aliases, in order to set up the DeferredFunctionInfo entry below.
- auto *GA = dyn_cast<GlobalAlias>(V);
- if (GA)
- GO = GA->getBaseObject();
- assert(GO);
- }
+ auto *F = dyn_cast<Function>(V);
+ // Ignore function offsets emitted for aliases of functions in older
+ // versions of LLVM.
+ if (!F)
+ break;
// Note that we subtract 1 here because the offset is relative to one word
// before the start of the identification or module block, which was
// historically always the start of the regular bitcode header.
uint64_t FuncWordOffset = Record[1] - 1;
- Function *F = dyn_cast<Function>(GO);
- assert(F);
uint64_t FuncBitOffset = FuncWordOffset * 32;
DeferredFunctionInfo[F] = FuncBitOffset + FuncBitcodeOffsetDelta;
// Set the LastFunctionBlockBit to point to the last function block.
@@ -2607,6 +2625,246 @@ bool BitcodeReaderBase::readBlockInfo() {
return false;
}
+Error BitcodeReader::parseComdatRecord(ArrayRef<uint64_t> Record) {
+ // [selection_kind, name]
+ if (Record.size() < 2)
+ return error("Invalid record");
+ Comdat::SelectionKind SK = getDecodedComdatSelectionKind(Record[0]);
+ std::string Name;
+ unsigned ComdatNameSize = Record[1];
+ Name.reserve(ComdatNameSize);
+ for (unsigned i = 0; i != ComdatNameSize; ++i)
+ Name += (char)Record[2 + i];
+ Comdat *C = TheModule->getOrInsertComdat(Name);
+ C->setSelectionKind(SK);
+ ComdatList.push_back(C);
+ return Error::success();
+}
+
+Error BitcodeReader::parseGlobalVarRecord(ArrayRef<uint64_t> Record) {
+ // [pointer type, isconst, initid, linkage, alignment, section,
+ // visibility, threadlocal, unnamed_addr, externally_initialized,
+ // dllstorageclass, comdat]
+ if (Record.size() < 6)
+ return error("Invalid record");
+ Type *Ty = getTypeByID(Record[0]);
+ if (!Ty)
+ return error("Invalid record");
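+  // Record[1] packs three fields: bit 0 is 'isconst' and bit 1 says whether
+  // an explicit address space follows in bits 2 and up. E.g. a value of 11
+  // (0b1011) decodes to a constant global in address space 2.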
+ bool isConstant = Record[1] & 1;
+ bool explicitType = Record[1] & 2;
+ unsigned AddressSpace;
+ if (explicitType) {
+ AddressSpace = Record[1] >> 2;
+ } else {
+ if (!Ty->isPointerTy())
+ return error("Invalid type for value");
+ AddressSpace = cast<PointerType>(Ty)->getAddressSpace();
+ Ty = cast<PointerType>(Ty)->getElementType();
+ }
+
+ uint64_t RawLinkage = Record[3];
+ GlobalValue::LinkageTypes Linkage = getDecodedLinkage(RawLinkage);
+ unsigned Alignment;
+ if (Error Err = parseAlignmentValue(Record[4], Alignment))
+ return Err;
+ std::string Section;
+ if (Record[5]) {
+ if (Record[5] - 1 >= SectionTable.size())
+ return error("Invalid ID");
+ Section = SectionTable[Record[5] - 1];
+ }
+ GlobalValue::VisibilityTypes Visibility = GlobalValue::DefaultVisibility;
+ // Local linkage must have default visibility.
+ if (Record.size() > 6 && !GlobalValue::isLocalLinkage(Linkage))
+ // FIXME: Change to an error if non-default in 4.0.
+ Visibility = getDecodedVisibility(Record[6]);
+
+ GlobalVariable::ThreadLocalMode TLM = GlobalVariable::NotThreadLocal;
+ if (Record.size() > 7)
+ TLM = getDecodedThreadLocalMode(Record[7]);
+
+ GlobalValue::UnnamedAddr UnnamedAddr = GlobalValue::UnnamedAddr::None;
+ if (Record.size() > 8)
+ UnnamedAddr = getDecodedUnnamedAddrType(Record[8]);
+
+ bool ExternallyInitialized = false;
+ if (Record.size() > 9)
+ ExternallyInitialized = Record[9];
+
+ GlobalVariable *NewGV =
+ new GlobalVariable(*TheModule, Ty, isConstant, Linkage, nullptr, "",
+ nullptr, TLM, AddressSpace, ExternallyInitialized);
+ NewGV->setAlignment(Alignment);
+ if (!Section.empty())
+ NewGV->setSection(Section);
+ NewGV->setVisibility(Visibility);
+ NewGV->setUnnamedAddr(UnnamedAddr);
+
+ if (Record.size() > 10)
+ NewGV->setDLLStorageClass(getDecodedDLLStorageClass(Record[10]));
+ else
+ upgradeDLLImportExportLinkage(NewGV, RawLinkage);
+
+ ValueList.push_back(NewGV);
+
+ // Remember which value to use for the global initializer.
+ if (unsigned InitID = Record[2])
+ GlobalInits.push_back(std::make_pair(NewGV, InitID - 1));
+
+ if (Record.size() > 11) {
+ if (unsigned ComdatID = Record[11]) {
+ if (ComdatID > ComdatList.size())
+ return error("Invalid global variable comdat ID");
+ NewGV->setComdat(ComdatList[ComdatID - 1]);
+ }
+ } else if (hasImplicitComdat(RawLinkage)) {
+ NewGV->setComdat(reinterpret_cast<Comdat *>(1));
+ }
+ return Error::success();
+}
+
+Error BitcodeReader::parseFunctionRecord(ArrayRef<uint64_t> Record) {
+ // [type, callingconv, isproto, linkage, paramattr, alignment, section,
+ // visibility, gc, unnamed_addr, prologuedata, dllstorageclass, comdat,
+ // prefixdata]
+ if (Record.size() < 8)
+ return error("Invalid record");
+ Type *Ty = getTypeByID(Record[0]);
+ if (!Ty)
+ return error("Invalid record");
+ if (auto *PTy = dyn_cast<PointerType>(Ty))
+ Ty = PTy->getElementType();
+ auto *FTy = dyn_cast<FunctionType>(Ty);
+ if (!FTy)
+ return error("Invalid type for value");
+ auto CC = static_cast<CallingConv::ID>(Record[1]);
+ if (CC & ~CallingConv::MaxID)
+ return error("Invalid calling convention ID");
+
+ Function *Func =
+ Function::Create(FTy, GlobalValue::ExternalLinkage, "", TheModule);
+
+ Func->setCallingConv(CC);
+ bool isProto = Record[2];
+ uint64_t RawLinkage = Record[3];
+ Func->setLinkage(getDecodedLinkage(RawLinkage));
+ Func->setAttributes(getAttributes(Record[4]));
+
+ unsigned Alignment;
+ if (Error Err = parseAlignmentValue(Record[5], Alignment))
+ return Err;
+ Func->setAlignment(Alignment);
+ if (Record[6]) {
+ if (Record[6] - 1 >= SectionTable.size())
+ return error("Invalid ID");
+ Func->setSection(SectionTable[Record[6] - 1]);
+ }
+ // Local linkage must have default visibility.
+ if (!Func->hasLocalLinkage())
+ // FIXME: Change to an error if non-default in 4.0.
+ Func->setVisibility(getDecodedVisibility(Record[7]));
+ if (Record.size() > 8 && Record[8]) {
+ if (Record[8] - 1 >= GCTable.size())
+ return error("Invalid ID");
+ Func->setGC(GCTable[Record[8] - 1]);
+ }
+ GlobalValue::UnnamedAddr UnnamedAddr = GlobalValue::UnnamedAddr::None;
+ if (Record.size() > 9)
+ UnnamedAddr = getDecodedUnnamedAddrType(Record[9]);
+ Func->setUnnamedAddr(UnnamedAddr);
+ if (Record.size() > 10 && Record[10] != 0)
+ FunctionPrologues.push_back(std::make_pair(Func, Record[10] - 1));
+
+ if (Record.size() > 11)
+ Func->setDLLStorageClass(getDecodedDLLStorageClass(Record[11]));
+ else
+ upgradeDLLImportExportLinkage(Func, RawLinkage);
+
+ if (Record.size() > 12) {
+ if (unsigned ComdatID = Record[12]) {
+ if (ComdatID > ComdatList.size())
+ return error("Invalid function comdat ID");
+ Func->setComdat(ComdatList[ComdatID - 1]);
+ }
+ } else if (hasImplicitComdat(RawLinkage)) {
+ Func->setComdat(reinterpret_cast<Comdat *>(1));
+ }
+
+ if (Record.size() > 13 && Record[13] != 0)
+ FunctionPrefixes.push_back(std::make_pair(Func, Record[13] - 1));
+
+ if (Record.size() > 14 && Record[14] != 0)
+ FunctionPersonalityFns.push_back(std::make_pair(Func, Record[14] - 1));
+
+ ValueList.push_back(Func);
+
+ // If this is a function with a body, remember the prototype we are
+ // creating now, so that we can match up the body with them later.
+ if (!isProto) {
+ Func->setIsMaterializable(true);
+ FunctionsWithBodies.push_back(Func);
+ DeferredFunctionInfo[Func] = 0;
+ }
+ return Error::success();
+}
+
+Error BitcodeReader::parseGlobalIndirectSymbolRecord(
+ unsigned BitCode, ArrayRef<uint64_t> Record) {
+ // ALIAS_OLD: [alias type, aliasee val#, linkage]
+ // ALIAS: [alias type, addrspace, aliasee val#, linkage, visibility,
+ // dllstorageclass]
+ // IFUNC: [alias type, addrspace, aliasee val#, linkage,
+ // visibility, dllstorageclass]
+ bool NewRecord = BitCode != bitc::MODULE_CODE_ALIAS_OLD;
+ if (Record.size() < (3 + (unsigned)NewRecord))
+ return error("Invalid record");
+ unsigned OpNum = 0;
+ Type *Ty = getTypeByID(Record[OpNum++]);
+ if (!Ty)
+ return error("Invalid record");
+
+ unsigned AddrSpace;
+ if (!NewRecord) {
+ auto *PTy = dyn_cast<PointerType>(Ty);
+ if (!PTy)
+ return error("Invalid type for value");
+ Ty = PTy->getElementType();
+ AddrSpace = PTy->getAddressSpace();
+ } else {
+ AddrSpace = Record[OpNum++];
+ }
+
+ auto Val = Record[OpNum++];
+ auto Linkage = Record[OpNum++];
+ GlobalIndirectSymbol *NewGA;
+ if (BitCode == bitc::MODULE_CODE_ALIAS ||
+ BitCode == bitc::MODULE_CODE_ALIAS_OLD)
+ NewGA = GlobalAlias::create(Ty, AddrSpace, getDecodedLinkage(Linkage), "",
+ TheModule);
+ else
+ NewGA = GlobalIFunc::create(Ty, AddrSpace, getDecodedLinkage(Linkage), "",
+ nullptr, TheModule);
+  // Old bitcode files didn't have a visibility field.
+ // Local linkage must have default visibility.
+ if (OpNum != Record.size()) {
+ auto VisInd = OpNum++;
+ if (!NewGA->hasLocalLinkage())
+ // FIXME: Change to an error if non-default in 4.0.
+ NewGA->setVisibility(getDecodedVisibility(Record[VisInd]));
+ }
+ if (OpNum != Record.size())
+ NewGA->setDLLStorageClass(getDecodedDLLStorageClass(Record[OpNum++]));
+ else
+ upgradeDLLImportExportLinkage(NewGA, Linkage);
+ if (OpNum != Record.size())
+ NewGA->setThreadLocalMode(getDecodedThreadLocalMode(Record[OpNum++]));
+ if (OpNum != Record.size())
+ NewGA->setUnnamedAddr(getDecodedUnnamedAddrType(Record[OpNum++]));
+ ValueList.push_back(NewGA);
+ IndirectSymbolInits.push_back(std::make_pair(NewGA, Val));
+ return Error::success();
+}
+
Error BitcodeReader::parseModule(uint64_t ResumeBit,
bool ShouldLazyLoadMetadata) {
if (ResumeBit)
@@ -2615,8 +2873,6 @@ Error BitcodeReader::parseModule(uint64_t ResumeBit,
return error("Invalid record");
SmallVector<uint64_t, 64> Record;
- std::vector<std::string> SectionTable;
- std::vector<std::string> GCTable;
// Read all the records for this module.
while (true) {
@@ -2762,21 +3018,11 @@ Error BitcodeReader::parseModule(uint64_t ResumeBit,
auto BitCode = Stream.readRecord(Entry.ID, Record);
switch (BitCode) {
default: break; // Default behavior, ignore unknown content.
- case bitc::MODULE_CODE_VERSION: { // VERSION: [version#]
- if (Record.size() < 1)
- return error("Invalid record");
- // Only version #0 and #1 are supported so far.
- unsigned module_version = Record[0];
- switch (module_version) {
- default:
- return error("Invalid value");
- case 0:
- UseRelativeIDs = false;
- break;
- case 1:
- UseRelativeIDs = true;
- break;
- }
+ case bitc::MODULE_CODE_VERSION: {
+ Expected<unsigned> VersionOrErr = parseVersionRecord(Record);
+ if (!VersionOrErr)
+ return VersionOrErr.takeError();
+ UseRelativeIDs = *VersionOrErr >= 1;
break;
}
case bitc::MODULE_CODE_TRIPLE: { // TRIPLE: [strchr x N]
@@ -2822,249 +3068,28 @@ Error BitcodeReader::parseModule(uint64_t ResumeBit,
GCTable.push_back(S);
break;
}
- case bitc::MODULE_CODE_COMDAT: { // COMDAT: [selection_kind, name]
- if (Record.size() < 2)
- return error("Invalid record");
- Comdat::SelectionKind SK = getDecodedComdatSelectionKind(Record[0]);
- unsigned ComdatNameSize = Record[1];
- std::string ComdatName;
- ComdatName.reserve(ComdatNameSize);
- for (unsigned i = 0; i != ComdatNameSize; ++i)
- ComdatName += (char)Record[2 + i];
- Comdat *C = TheModule->getOrInsertComdat(ComdatName);
- C->setSelectionKind(SK);
- ComdatList.push_back(C);
- break;
- }
- // GLOBALVAR: [pointer type, isconst, initid,
- // linkage, alignment, section, visibility, threadlocal,
- // unnamed_addr, externally_initialized, dllstorageclass,
- // comdat]
+ case bitc::MODULE_CODE_COMDAT: {
+ if (Error Err = parseComdatRecord(Record))
+ return Err;
+ break;
+ }
case bitc::MODULE_CODE_GLOBALVAR: {
- if (Record.size() < 6)
- return error("Invalid record");
- Type *Ty = getTypeByID(Record[0]);
- if (!Ty)
- return error("Invalid record");
- bool isConstant = Record[1] & 1;
- bool explicitType = Record[1] & 2;
- unsigned AddressSpace;
- if (explicitType) {
- AddressSpace = Record[1] >> 2;
- } else {
- if (!Ty->isPointerTy())
- return error("Invalid type for value");
- AddressSpace = cast<PointerType>(Ty)->getAddressSpace();
- Ty = cast<PointerType>(Ty)->getElementType();
- }
-
- uint64_t RawLinkage = Record[3];
- GlobalValue::LinkageTypes Linkage = getDecodedLinkage(RawLinkage);
- unsigned Alignment;
- if (Error Err = parseAlignmentValue(Record[4], Alignment))
+ if (Error Err = parseGlobalVarRecord(Record))
return Err;
- std::string Section;
- if (Record[5]) {
- if (Record[5]-1 >= SectionTable.size())
- return error("Invalid ID");
- Section = SectionTable[Record[5]-1];
- }
- GlobalValue::VisibilityTypes Visibility = GlobalValue::DefaultVisibility;
- // Local linkage must have default visibility.
- if (Record.size() > 6 && !GlobalValue::isLocalLinkage(Linkage))
- // FIXME: Change to an error if non-default in 4.0.
- Visibility = getDecodedVisibility(Record[6]);
-
- GlobalVariable::ThreadLocalMode TLM = GlobalVariable::NotThreadLocal;
- if (Record.size() > 7)
- TLM = getDecodedThreadLocalMode(Record[7]);
-
- GlobalValue::UnnamedAddr UnnamedAddr = GlobalValue::UnnamedAddr::None;
- if (Record.size() > 8)
- UnnamedAddr = getDecodedUnnamedAddrType(Record[8]);
-
- bool ExternallyInitialized = false;
- if (Record.size() > 9)
- ExternallyInitialized = Record[9];
-
- GlobalVariable *NewGV =
- new GlobalVariable(*TheModule, Ty, isConstant, Linkage, nullptr, "", nullptr,
- TLM, AddressSpace, ExternallyInitialized);
- NewGV->setAlignment(Alignment);
- if (!Section.empty())
- NewGV->setSection(Section);
- NewGV->setVisibility(Visibility);
- NewGV->setUnnamedAddr(UnnamedAddr);
-
- if (Record.size() > 10)
- NewGV->setDLLStorageClass(getDecodedDLLStorageClass(Record[10]));
- else
- upgradeDLLImportExportLinkage(NewGV, RawLinkage);
-
- ValueList.push_back(NewGV);
-
- // Remember which value to use for the global initializer.
- if (unsigned InitID = Record[2])
- GlobalInits.push_back(std::make_pair(NewGV, InitID-1));
-
- if (Record.size() > 11) {
- if (unsigned ComdatID = Record[11]) {
- if (ComdatID > ComdatList.size())
- return error("Invalid global variable comdat ID");
- NewGV->setComdat(ComdatList[ComdatID - 1]);
- }
- } else if (hasImplicitComdat(RawLinkage)) {
- NewGV->setComdat(reinterpret_cast<Comdat *>(1));
- }
-
break;
}
- // FUNCTION: [type, callingconv, isproto, linkage, paramattr,
- // alignment, section, visibility, gc, unnamed_addr,
- // prologuedata, dllstorageclass, comdat, prefixdata]
case bitc::MODULE_CODE_FUNCTION: {
- if (Record.size() < 8)
- return error("Invalid record");
- Type *Ty = getTypeByID(Record[0]);
- if (!Ty)
- return error("Invalid record");
- if (auto *PTy = dyn_cast<PointerType>(Ty))
- Ty = PTy->getElementType();
- auto *FTy = dyn_cast<FunctionType>(Ty);
- if (!FTy)
- return error("Invalid type for value");
- auto CC = static_cast<CallingConv::ID>(Record[1]);
- if (CC & ~CallingConv::MaxID)
- return error("Invalid calling convention ID");
-
- Function *Func = Function::Create(FTy, GlobalValue::ExternalLinkage,
- "", TheModule);
-
- Func->setCallingConv(CC);
- bool isProto = Record[2];
- uint64_t RawLinkage = Record[3];
- Func->setLinkage(getDecodedLinkage(RawLinkage));
- Func->setAttributes(getAttributes(Record[4]));
-
- unsigned Alignment;
- if (Error Err = parseAlignmentValue(Record[5], Alignment))
+ if (Error Err = parseFunctionRecord(Record))
return Err;
- Func->setAlignment(Alignment);
- if (Record[6]) {
- if (Record[6]-1 >= SectionTable.size())
- return error("Invalid ID");
- Func->setSection(SectionTable[Record[6]-1]);
- }
- // Local linkage must have default visibility.
- if (!Func->hasLocalLinkage())
- // FIXME: Change to an error if non-default in 4.0.
- Func->setVisibility(getDecodedVisibility(Record[7]));
- if (Record.size() > 8 && Record[8]) {
- if (Record[8]-1 >= GCTable.size())
- return error("Invalid ID");
- Func->setGC(GCTable[Record[8] - 1]);
- }
- GlobalValue::UnnamedAddr UnnamedAddr = GlobalValue::UnnamedAddr::None;
- if (Record.size() > 9)
- UnnamedAddr = getDecodedUnnamedAddrType(Record[9]);
- Func->setUnnamedAddr(UnnamedAddr);
- if (Record.size() > 10 && Record[10] != 0)
- FunctionPrologues.push_back(std::make_pair(Func, Record[10]-1));
-
- if (Record.size() > 11)
- Func->setDLLStorageClass(getDecodedDLLStorageClass(Record[11]));
- else
- upgradeDLLImportExportLinkage(Func, RawLinkage);
-
- if (Record.size() > 12) {
- if (unsigned ComdatID = Record[12]) {
- if (ComdatID > ComdatList.size())
- return error("Invalid function comdat ID");
- Func->setComdat(ComdatList[ComdatID - 1]);
- }
- } else if (hasImplicitComdat(RawLinkage)) {
- Func->setComdat(reinterpret_cast<Comdat *>(1));
- }
-
- if (Record.size() > 13 && Record[13] != 0)
- FunctionPrefixes.push_back(std::make_pair(Func, Record[13]-1));
-
- if (Record.size() > 14 && Record[14] != 0)
- FunctionPersonalityFns.push_back(std::make_pair(Func, Record[14] - 1));
-
- ValueList.push_back(Func);
-
- // If this is a function with a body, remember the prototype we are
- // creating now, so that we can match up the body with them later.
- if (!isProto) {
- Func->setIsMaterializable(true);
- FunctionsWithBodies.push_back(Func);
- DeferredFunctionInfo[Func] = 0;
- }
break;
}
- // ALIAS: [alias type, addrspace, aliasee val#, linkage]
- // ALIAS: [alias type, addrspace, aliasee val#, linkage, visibility, dllstorageclass]
- // IFUNC: [alias type, addrspace, aliasee val#, linkage, visibility, dllstorageclass]
case bitc::MODULE_CODE_IFUNC:
case bitc::MODULE_CODE_ALIAS:
case bitc::MODULE_CODE_ALIAS_OLD: {
- bool NewRecord = BitCode != bitc::MODULE_CODE_ALIAS_OLD;
- if (Record.size() < (3 + (unsigned)NewRecord))
- return error("Invalid record");
- unsigned OpNum = 0;
- Type *Ty = getTypeByID(Record[OpNum++]);
- if (!Ty)
- return error("Invalid record");
-
- unsigned AddrSpace;
- if (!NewRecord) {
- auto *PTy = dyn_cast<PointerType>(Ty);
- if (!PTy)
- return error("Invalid type for value");
- Ty = PTy->getElementType();
- AddrSpace = PTy->getAddressSpace();
- } else {
- AddrSpace = Record[OpNum++];
- }
-
- auto Val = Record[OpNum++];
- auto Linkage = Record[OpNum++];
- GlobalIndirectSymbol *NewGA;
- if (BitCode == bitc::MODULE_CODE_ALIAS ||
- BitCode == bitc::MODULE_CODE_ALIAS_OLD)
- NewGA = GlobalAlias::create(Ty, AddrSpace, getDecodedLinkage(Linkage),
- "", TheModule);
- else
- NewGA = GlobalIFunc::create(Ty, AddrSpace, getDecodedLinkage(Linkage),
- "", nullptr, TheModule);
- // Old bitcode files didn't have visibility field.
- // Local linkage must have default visibility.
- if (OpNum != Record.size()) {
- auto VisInd = OpNum++;
- if (!NewGA->hasLocalLinkage())
- // FIXME: Change to an error if non-default in 4.0.
- NewGA->setVisibility(getDecodedVisibility(Record[VisInd]));
- }
- if (OpNum != Record.size())
- NewGA->setDLLStorageClass(getDecodedDLLStorageClass(Record[OpNum++]));
- else
- upgradeDLLImportExportLinkage(NewGA, Linkage);
- if (OpNum != Record.size())
- NewGA->setThreadLocalMode(getDecodedThreadLocalMode(Record[OpNum++]));
- if (OpNum != Record.size())
- NewGA->setUnnamedAddr(getDecodedUnnamedAddrType(Record[OpNum++]));
- ValueList.push_back(NewGA);
- IndirectSymbolInits.push_back(std::make_pair(NewGA, Val));
- break;
- }
- /// MODULE_CODE_PURGEVALS: [numvals]
- case bitc::MODULE_CODE_PURGEVALS:
- // Trim down the value list to the specified size.
- if (Record.size() < 1 || Record[0] > ValueList.size())
- return error("Invalid record");
- ValueList.shrinkTo(Record[0]);
+ if (Error Err = parseGlobalIndirectSymbolRecord(BitCode, Record))
+ return Err;
break;
+ }
/// MODULE_CODE_VSTOFFSET: [offset]
case bitc::MODULE_CODE_VSTOFFSET:
if (Record.size() < 1)
@@ -3840,7 +3865,7 @@ Error BitcodeReader::parseFunctionBody(Function *F) {
if (Record.size() < 4)
return error("Invalid record");
unsigned OpNum = 0;
- AttributeSet PAL = getAttributes(Record[OpNum++]);
+ AttributeList PAL = getAttributes(Record[OpNum++]);
unsigned CCInfo = Record[OpNum++];
BasicBlock *NormalBB = getBasicBlock(Record[OpNum++]);
BasicBlock *UnwindBB = getBasicBlock(Record[OpNum++]);
@@ -4017,7 +4042,12 @@ Error BitcodeReader::parseFunctionBody(Function *F) {
}
if (!Ty || !Size)
return error("Invalid record");
- AllocaInst *AI = new AllocaInst(Ty, Size, Align);
+
+ // FIXME: Make this an optional field.
+ const DataLayout &DL = TheModule->getDataLayout();
+ unsigned AS = DL.getAllocaAddrSpace();
+
+ AllocaInst *AI = new AllocaInst(Ty, AS, Size, Align);
AI->setUsedWithInAlloca(InAlloca);
AI->setSwiftError(SwiftError);
I = AI;
@@ -4225,7 +4255,7 @@ Error BitcodeReader::parseFunctionBody(Function *F) {
return error("Invalid record");
unsigned OpNum = 0;
- AttributeSet PAL = getAttributes(Record[OpNum++]);
+ AttributeList PAL = getAttributes(Record[OpNum++]);
unsigned CCInfo = Record[OpNum++];
FastMathFlags FMF;
@@ -4753,33 +4783,13 @@ Error ModuleSummaryIndexBitcodeReader::parseModule(StringRef ModulePath) {
// was historically always the start of the regular bitcode header.
VSTOffset = Record[0] - 1;
break;
- // GLOBALVAR: [pointer type, isconst, initid,
- // linkage, alignment, section, visibility, threadlocal,
- // unnamed_addr, externally_initialized, dllstorageclass,
- // comdat]
- case bitc::MODULE_CODE_GLOBALVAR: {
- if (Record.size() < 6)
- return error("Invalid record");
- uint64_t RawLinkage = Record[3];
- GlobalValue::LinkageTypes Linkage = getDecodedLinkage(RawLinkage);
- ValueIdToLinkageMap[ValueId++] = Linkage;
- break;
- }
- // FUNCTION: [type, callingconv, isproto, linkage, paramattr,
- // alignment, section, visibility, gc, unnamed_addr,
- // prologuedata, dllstorageclass, comdat, prefixdata]
- case bitc::MODULE_CODE_FUNCTION: {
- if (Record.size() < 8)
- return error("Invalid record");
- uint64_t RawLinkage = Record[3];
- GlobalValue::LinkageTypes Linkage = getDecodedLinkage(RawLinkage);
- ValueIdToLinkageMap[ValueId++] = Linkage;
- break;
- }
- // ALIAS: [alias type, addrspace, aliasee val#, linkage, visibility,
- // dllstorageclass]
+ // GLOBALVAR: [pointer type, isconst, initid, linkage, ...]
+ // FUNCTION: [type, callingconv, isproto, linkage, ...]
+ // ALIAS: [alias type, addrspace, aliasee val#, linkage, ...]
+ case bitc::MODULE_CODE_GLOBALVAR:
+ case bitc::MODULE_CODE_FUNCTION:
case bitc::MODULE_CODE_ALIAS: {
- if (Record.size() < 6)
+ if (Record.size() <= 3)
return error("Invalid record");
uint64_t RawLinkage = Record[3];
GlobalValue::LinkageTypes Linkage = getDecodedLinkage(RawLinkage);
@@ -4846,8 +4856,17 @@ Error ModuleSummaryIndexBitcodeReader::parseEntireSummary(
// Keep around the last seen summary to be used when we see an optional
// "OriginalName" attachement.
GlobalValueSummary *LastSeenSummary = nullptr;
+ GlobalValue::GUID LastSeenGUID = 0;
bool Combined = false;
+
+  // We can expect to see any number of type ID information records before
+  // each function summary record; these variables store the information
+  // collected so far so that it can be used to create the summary object.
std::vector<GlobalValue::GUID> PendingTypeTests;
+ std::vector<FunctionSummary::VFuncId> PendingTypeTestAssumeVCalls,
+ PendingTypeCheckedLoadVCalls;
+ std::vector<FunctionSummary::ConstVCall> PendingTypeTestAssumeConstVCalls,
+ PendingTypeCheckedLoadConstVCalls;
while (true) {
BitstreamEntry Entry = Stream.advanceSkippingSubblocks();
@@ -4914,8 +4933,15 @@ Error ModuleSummaryIndexBitcodeReader::parseEntireSummary(
IsOldProfileFormat, HasProfile);
auto FS = llvm::make_unique<FunctionSummary>(
Flags, InstCount, std::move(Refs), std::move(Calls),
- std::move(PendingTypeTests));
+ std::move(PendingTypeTests), std::move(PendingTypeTestAssumeVCalls),
+ std::move(PendingTypeCheckedLoadVCalls),
+ std::move(PendingTypeTestAssumeConstVCalls),
+ std::move(PendingTypeCheckedLoadConstVCalls));
PendingTypeTests.clear();
+ PendingTypeTestAssumeVCalls.clear();
+ PendingTypeCheckedLoadVCalls.clear();
+ PendingTypeTestAssumeConstVCalls.clear();
+ PendingTypeCheckedLoadConstVCalls.clear();
auto GUID = getGUIDFromValueId(ValueID);
FS->setModulePath(TheIndex.addModulePath(ModulePath, 0)->first());
FS->setOriginalName(GUID.second);
@@ -4989,9 +5015,17 @@ Error ModuleSummaryIndexBitcodeReader::parseEntireSummary(
GlobalValue::GUID GUID = getGUIDFromValueId(ValueID).first;
auto FS = llvm::make_unique<FunctionSummary>(
Flags, InstCount, std::move(Refs), std::move(Edges),
- std::move(PendingTypeTests));
+ std::move(PendingTypeTests), std::move(PendingTypeTestAssumeVCalls),
+ std::move(PendingTypeCheckedLoadVCalls),
+ std::move(PendingTypeTestAssumeConstVCalls),
+ std::move(PendingTypeCheckedLoadConstVCalls));
PendingTypeTests.clear();
+ PendingTypeTestAssumeVCalls.clear();
+ PendingTypeCheckedLoadVCalls.clear();
+ PendingTypeTestAssumeConstVCalls.clear();
+ PendingTypeCheckedLoadConstVCalls.clear();
LastSeenSummary = FS.get();
+ LastSeenGUID = GUID;
FS->setModulePath(ModuleIdMap[ModuleId]);
TheIndex.addGlobalValueSummary(GUID, std::move(FS));
Combined = true;
@@ -5018,6 +5052,7 @@ Error ModuleSummaryIndexBitcodeReader::parseEntireSummary(
AS->setAliasee(AliaseeInModule);
GlobalValue::GUID GUID = getGUIDFromValueId(ValueID).first;
+ LastSeenGUID = GUID;
TheIndex.addGlobalValueSummary(GUID, std::move(AS));
Combined = true;
break;
@@ -5034,6 +5069,7 @@ Error ModuleSummaryIndexBitcodeReader::parseEntireSummary(
LastSeenSummary = FS.get();
FS->setModulePath(ModuleIdMap[ModuleId]);
GlobalValue::GUID GUID = getGUIDFromValueId(ValueID).first;
+ LastSeenGUID = GUID;
TheIndex.addGlobalValueSummary(GUID, std::move(FS));
Combined = true;
break;
@@ -5044,8 +5080,10 @@ Error ModuleSummaryIndexBitcodeReader::parseEntireSummary(
if (!LastSeenSummary)
return error("Name attachment that does not follow a combined record");
LastSeenSummary->setOriginalName(OriginalName);
+ TheIndex.addOriginalName(LastSeenGUID, OriginalName);
// Reset the LastSeenSummary
LastSeenSummary = nullptr;
+ LastSeenGUID = 0;
break;
}
case bitc::FS_TYPE_TESTS: {
@@ -5054,6 +5092,28 @@ Error ModuleSummaryIndexBitcodeReader::parseEntireSummary(
Record.end());
break;
}
+ case bitc::FS_TYPE_TEST_ASSUME_VCALLS: {
+ assert(PendingTypeTestAssumeVCalls.empty());
+ for (unsigned I = 0; I != Record.size(); I += 2)
+ PendingTypeTestAssumeVCalls.push_back({Record[I], Record[I+1]});
+ break;
+ }
+ case bitc::FS_TYPE_CHECKED_LOAD_VCALLS: {
+ assert(PendingTypeCheckedLoadVCalls.empty());
+ for (unsigned I = 0; I != Record.size(); I += 2)
+ PendingTypeCheckedLoadVCalls.push_back({Record[I], Record[I+1]});
+ break;
+ }
+ case bitc::FS_TYPE_TEST_ASSUME_CONST_VCALL: {
+ PendingTypeTestAssumeConstVCalls.push_back(
+ {{Record[0], Record[1]}, {Record.begin() + 2, Record.end()}});
+ break;
+ }
+ case bitc::FS_TYPE_CHECKED_LOAD_CONST_VCALL: {
+ PendingTypeCheckedLoadConstVCalls.push_back(
+ {{Record[0], Record[1]}, {Record.begin() + 2, Record.end()}});
+ break;
+ }
}
}
llvm_unreachable("Exit infinite loop");
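The FS_TYPE_*_VCALLS records handled at the end of the loop are flat lists of (GUID, offset) pairs. A self-contained sketch of the decode step with plain std types standing in for the LLVM ones; unlike the loop above, it also stops cleanly on an odd-length record instead of reading past the end:

#include <cstdint>
#include <vector>

struct VFuncId {
  uint64_t GUID;
  uint64_t Offset;
};

static std::vector<VFuncId>
decodeVFuncIds(const std::vector<uint64_t> &Record) {
  std::vector<VFuncId> Out;
  Out.reserve(Record.size() / 2);
  // Stop one short of the end so a truncated record cannot overrun.
  for (size_t I = 0; I + 1 < Record.size(); I += 2)
    Out.push_back({Record[I], Record[I + 1]});
  return Out;
}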
diff --git a/lib/Bitcode/Reader/MetadataLoader.cpp b/lib/Bitcode/Reader/MetadataLoader.cpp
index b89f5be4a369..274dfe89cce5 100644
--- a/lib/Bitcode/Reader/MetadataLoader.cpp
+++ b/lib/Bitcode/Reader/MetadataLoader.cpp
@@ -358,6 +358,9 @@ class PlaceholderQueue {
std::deque<DistinctMDOperandPlaceholder> PHs;
public:
+ ~PlaceholderQueue() {
+ assert(empty() && "PlaceholderQueue hasn't been flushed before being destroyed");
+ }
bool empty() { return PHs.empty(); }
DistinctMDOperandPlaceholder &getPlaceholderOp(unsigned ID);
void flush(BitcodeReaderMetadataList &MetadataList);
@@ -457,7 +460,7 @@ class MetadataLoader::MetadataLoaderImpl {
PlaceholderQueue &Placeholders, StringRef Blob,
unsigned &NextMetadataNo);
Error parseMetadataStrings(ArrayRef<uint64_t> Record, StringRef Blob,
- std::function<void(StringRef)> CallBack);
+ function_ref<void(StringRef)> CallBack);
Error parseGlobalObjectAttachment(GlobalObject &GO,
ArrayRef<uint64_t> Record);
Error parseMetadataKindRecord(SmallVectorImpl<uint64_t> &Record);
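Swapping std::function for llvm::function_ref in the signature above drops type erasure and a possible heap allocation: function_ref is a non-owning, trivially copyable view of a callable, which suits a callback that never outlives the call. A small sketch (names are illustrative):

#include "llvm/ADT/STLExtras.h" // llvm::function_ref
#include "llvm/ADT/StringRef.h"

// The callee only borrows the callable for the duration of the call.
static void forEachString(llvm::function_ref<void(llvm::StringRef)> CallBack) {
  CallBack("alpha");
  CallBack("beta");
}

static size_t totalLength() {
  size_t Total = 0;
  forEachString([&](llvm::StringRef S) { Total += S.size(); });
  return Total;
}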
@@ -520,7 +523,7 @@ public:
bool IsImporting)
: MetadataList(TheModule.getContext()), ValueList(ValueList),
Stream(Stream), Context(TheModule.getContext()), TheModule(TheModule),
- getTypeByID(getTypeByID), IsImporting(IsImporting) {}
+ getTypeByID(std::move(getTypeByID)), IsImporting(IsImporting) {}
Error parseMetadata(bool ModuleLevel);
@@ -564,7 +567,7 @@ public:
void shrinkTo(unsigned N) { MetadataList.shrinkTo(N); }
};
-Error error(const Twine &Message) {
+static Error error(const Twine &Message) {
return make_error<StringError>(
Message, make_error_code(BitcodeError::CorruptedBitcode));
}
@@ -1107,9 +1110,15 @@ Error MetadataLoader::MetadataLoaderImpl::parseOneMetadata(
break;
}
case bitc::METADATA_DERIVED_TYPE: {
- if (Record.size() != 12)
+ if (Record.size() < 12 || Record.size() > 13)
return error("Invalid record");
+ // DWARF address space is encoded as N->getDWARFAddressSpace() + 1. 0 means
+ // that there is no DWARF address space associated with DIDerivedType.
+ Optional<unsigned> DWARFAddressSpace;
+ if (Record.size() > 12 && Record[12])
+ DWARFAddressSpace = Record[12] - 1;
+
IsDistinct = Record[0];
DINode::DIFlags Flags = static_cast<DINode::DIFlags>(Record[10]);
MetadataList.assignValue(
@@ -1118,7 +1127,8 @@ Error MetadataLoader::MetadataLoaderImpl::parseOneMetadata(
getMDOrNull(Record[3]), Record[4],
getDITypeRefOrNull(Record[5]),
getDITypeRefOrNull(Record[6]), Record[7], Record[8],
- Record[9], Flags, getDITypeRefOrNull(Record[11]))),
+ Record[9], DWARFAddressSpace, Flags,
+ getDITypeRefOrNull(Record[11]))),
NextMetadataNo);
NextMetadataNo++;
break;
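The +1 bias noted in the comment keeps the field a plain integer while reserving 0 for "no DWARF address space". A self-contained round-trip sketch (std::optional stands in for the llvm::Optional used in the tree at this point):

#include <cstdint>
#include <optional>

static uint64_t encodeDWARFAddressSpace(std::optional<unsigned> AS) {
  return AS ? uint64_t(*AS) + 1 : 0; // 0 means "absent"
}

static std::optional<unsigned> decodeDWARFAddressSpace(uint64_t Field) {
  if (Field == 0)
    return std::nullopt;
  return static_cast<unsigned>(Field - 1);
}
// decode(encode(X)) == X for any X, and decode(0) is empty.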
@@ -1240,7 +1250,7 @@ Error MetadataLoader::MetadataLoaderImpl::parseOneMetadata(
break;
}
case bitc::METADATA_COMPILE_UNIT: {
- if (Record.size() < 14 || Record.size() > 17)
+ if (Record.size() < 14 || Record.size() > 18)
return error("Invalid record");
// Ignore Record[0], which indicates whether this compile unit is
@@ -1253,7 +1263,8 @@ Error MetadataLoader::MetadataLoaderImpl::parseOneMetadata(
getMDOrNull(Record[12]), getMDOrNull(Record[13]),
Record.size() <= 15 ? nullptr : getMDOrNull(Record[15]),
Record.size() <= 14 ? 0 : Record[14],
- Record.size() <= 16 ? true : Record[16]);
+ Record.size() <= 16 ? true : Record[16],
+ Record.size() <= 17 ? false : Record[17]);
MetadataList.assignValue(CU, NextMetadataNo);
NextMetadataNo++;
@@ -1433,6 +1444,7 @@ Error MetadataLoader::MetadataLoaderImpl::parseOneMetadata(
} else if (Version == 0) {
// Upgrade old metadata, which stored a global variable reference or a
// ConstantInt here.
+ NeedUpgradeToDIGlobalVariableExpression = true;
Metadata *Expr = getMDOrNull(Record[9]);
uint32_t AlignInBits = 0;
if (Record.size() > 11) {
@@ -1463,8 +1475,6 @@ Error MetadataLoader::MetadataLoaderImpl::parseOneMetadata(
DIGlobalVariableExpression *DGVE = nullptr;
if (Attach || Expr)
DGVE = DIGlobalVariableExpression::getDistinct(Context, DGV, Expr);
- else
- NeedUpgradeToDIGlobalVariableExpression = true;
if (Attach)
Attach->addDebugInfo(DGVE);
@@ -1485,7 +1495,7 @@ Error MetadataLoader::MetadataLoaderImpl::parseOneMetadata(
bool HasAlignment = Record[0] & 2;
// 2nd field used to be an artificial tag, either DW_TAG_auto_variable or
// DW_TAG_arg_variable, if we have alignment flag encoded it means, that
- // this is newer version of record which doesn't have artifical tag.
+ // this is newer version of record which doesn't have artificial tag.
bool HasTag = !HasAlignment && Record.size() > 8;
DINode::DIFlags Flags = static_cast<DINode::DIFlags>(Record[7 + HasTag]);
uint32_t AlignInBits = 0;
@@ -1611,7 +1621,7 @@ Error MetadataLoader::MetadataLoaderImpl::parseOneMetadata(
Error MetadataLoader::MetadataLoaderImpl::parseMetadataStrings(
ArrayRef<uint64_t> Record, StringRef Blob,
- std::function<void(StringRef)> CallBack) {
+ function_ref<void(StringRef)> CallBack) {
// All the MDStrings in the block are emitted together in a single
// record. The strings are concatenated and stored in a blob along with
// their sizes.
@@ -1808,8 +1818,8 @@ MetadataLoader::MetadataLoader(BitstreamCursor &Stream, Module &TheModule,
BitcodeReaderValueList &ValueList,
bool IsImporting,
std::function<Type *(unsigned)> getTypeByID)
- : Pimpl(llvm::make_unique<MetadataLoaderImpl>(Stream, TheModule, ValueList,
- getTypeByID, IsImporting)) {}
+ : Pimpl(llvm::make_unique<MetadataLoaderImpl>(
+ Stream, TheModule, ValueList, std::move(getTypeByID), IsImporting)) {}
Error MetadataLoader::parseMetadata(bool ModuleLevel) {
return Pimpl->parseMetadata(ModuleLevel);
diff --git a/lib/Bitcode/Writer/BitcodeWriter.cpp b/lib/Bitcode/Writer/BitcodeWriter.cpp
index ebb2022551f7..043441bac4de 100644
--- a/lib/Bitcode/Writer/BitcodeWriter.cpp
+++ b/lib/Bitcode/Writer/BitcodeWriter.cpp
@@ -108,6 +108,14 @@ class ModuleBitcodeWriter : public BitcodeWriterBase {
/// True if a module hash record should be written.
bool GenerateHash;
+  /// If non-null and GenerateHash is true, the resulting hash is written
+  /// into ModHash. If GenerateHash is false, the value pointed to is used
+  /// as the hash instead of being computed from the generated bitcode.
+  /// This can be used to give a minimized bitcode file, used just for the
+  /// thin link, the same module hash as the regular full bitcode that will
+  /// be used in the backend.
+ ModuleHash *ModHash;
+
/// The start bit of the identification block.
uint64_t BitcodeStartBit;
@@ -124,10 +132,12 @@ public:
/// writing to the provided \p Buffer.
ModuleBitcodeWriter(const Module *M, SmallVectorImpl<char> &Buffer,
BitstreamWriter &Stream, bool ShouldPreserveUseListOrder,
- const ModuleSummaryIndex *Index, bool GenerateHash)
+ const ModuleSummaryIndex *Index, bool GenerateHash,
+ ModuleHash *ModHash = nullptr)
: BitcodeWriterBase(Stream), Buffer(Buffer), M(*M),
VE(*M, ShouldPreserveUseListOrder), Index(Index),
- GenerateHash(GenerateHash), BitcodeStartBit(Stream.GetCurrentBitNo()) {
+ GenerateHash(GenerateHash), ModHash(ModHash),
+ BitcodeStartBit(Stream.GetCurrentBitNo()) {
// Assign ValueIds to any callee values in the index that came from
// indirect call profiles and were recorded as a GUID not a Value*
// (which would have been assigned an ID by the ValueEnumerator).
@@ -466,7 +476,6 @@ public:
void write();
private:
- void writeIndex();
void writeModStrings();
void writeCombinedValueSymbolTable();
void writeCombinedGlobalValueSummary();
@@ -709,22 +718,22 @@ static uint64_t getAttrKindEncoding(Attribute::AttrKind Kind) {
}
void ModuleBitcodeWriter::writeAttributeGroupTable() {
- const std::vector<AttributeSet> &AttrGrps = VE.getAttributeGroups();
+ const std::vector<AttributeList> &AttrGrps = VE.getAttributeGroups();
if (AttrGrps.empty()) return;
Stream.EnterSubblock(bitc::PARAMATTR_GROUP_BLOCK_ID, 3);
SmallVector<uint64_t, 64> Record;
for (unsigned i = 0, e = AttrGrps.size(); i != e; ++i) {
- AttributeSet AS = AttrGrps[i];
+ AttributeList AS = AttrGrps[i];
for (unsigned i = 0, e = AS.getNumSlots(); i != e; ++i) {
- AttributeSet A = AS.getSlotAttributes(i);
+ AttributeList A = AS.getSlotAttributes(i);
Record.push_back(VE.getAttributeGroupID(A));
Record.push_back(AS.getSlotIndex(i));
- for (AttributeSet::iterator I = AS.begin(0), E = AS.end(0);
- I != E; ++I) {
+ for (AttributeList::iterator I = AS.begin(0), E = AS.end(0); I != E;
+ ++I) {
Attribute Attr = *I;
if (Attr.isEnumAttribute()) {
Record.push_back(0);
@@ -756,14 +765,14 @@ void ModuleBitcodeWriter::writeAttributeGroupTable() {
}
void ModuleBitcodeWriter::writeAttributeTable() {
- const std::vector<AttributeSet> &Attrs = VE.getAttributes();
+ const std::vector<AttributeList> &Attrs = VE.getAttributes();
if (Attrs.empty()) return;
Stream.EnterSubblock(bitc::PARAMATTR_BLOCK_ID, 3);
SmallVector<uint64_t, 64> Record;
for (unsigned i = 0, e = Attrs.size(); i != e; ++i) {
- const AttributeSet &A = Attrs[i];
+ const AttributeList &A = Attrs[i];
for (unsigned i = 0, e = A.getNumSlots(); i != e; ++i)
Record.push_back(VE.getAttributeGroupID(A.getSlotAttributes(i)));
@@ -1326,6 +1335,8 @@ static uint64_t getOptimizationFlags(const Value *V) {
Flags |= FastMathFlags::NoSignedZeros;
if (FPMO->hasAllowReciprocal())
Flags |= FastMathFlags::AllowReciprocal;
+ if (FPMO->hasAllowContract())
+ Flags |= FastMathFlags::AllowContract;
}
return Flags;
@@ -1473,6 +1484,13 @@ void ModuleBitcodeWriter::writeDIDerivedType(const DIDerivedType *N,
Record.push_back(N->getFlags());
Record.push_back(VE.getMetadataOrNullID(N->getExtraData()));
+ // DWARF address space is encoded as N->getDWARFAddressSpace() + 1. 0 means
+ // that there is no DWARF address space associated with DIDerivedType.
+ if (const auto &DWARFAddressSpace = N->getDWARFAddressSpace())
+ Record.push_back(*DWARFAddressSpace + 1);
+ else
+ Record.push_back(0);
+
Stream.EmitRecord(bitc::METADATA_DERIVED_TYPE, Record, Abbrev);
Record.clear();
}
@@ -1549,6 +1567,7 @@ void ModuleBitcodeWriter::writeDICompileUnit(const DICompileUnit *N,
Record.push_back(N->getDWOId());
Record.push_back(VE.getMetadataOrNullID(N->getMacros().get()));
Record.push_back(N->getSplitDebugInlining());
+ Record.push_back(N->getDebugInfoForProfiling());
Stream.EmitRecord(bitc::METADATA_COMPILE_UNIT, Record, Abbrev);
Record.clear();
@@ -2559,7 +2578,7 @@ void ModuleBitcodeWriter::writeInstruction(const Instruction &I,
Vals.push_back(VE.getTypeID(SI.getCondition()->getType()));
pushValue(SI.getCondition(), InstID, Vals);
Vals.push_back(VE.getValueID(SI.getDefaultDest()));
- for (SwitchInst::ConstCaseIt Case : SI.cases()) {
+ for (auto Case : SI.cases()) {
Vals.push_back(VE.getValueID(Case.getCaseValue()));
Vals.push_back(VE.getValueID(Case.getCaseSuccessor()));
}
@@ -2905,13 +2924,6 @@ void ModuleBitcodeWriter::writeValueSymbolTable(
NameVals.push_back(VE.getValueID(Name.getValue()));
Function *F = dyn_cast<Function>(Name.getValue());
- if (!F) {
- // If value is an alias, need to get the aliased base object to
- // see if it is a function.
- auto *GA = dyn_cast<GlobalAlias>(Name.getValue());
- if (GA && GA->getBaseObject())
- F = dyn_cast<Function>(GA->getBaseObject());
- }
// VST_CODE_ENTRY: [valueid, namechar x N]
// VST_CODE_FNENTRY: [valueid, funcoffset, namechar x N]
@@ -3367,6 +3379,49 @@ void IndexBitcodeWriter::writeModStrings() {
Stream.ExitBlock();
}
+/// Write the function type metadata related records that need to appear before
+/// a function summary entry (whether per-module or combined).
+static void writeFunctionTypeMetadataRecords(BitstreamWriter &Stream,
+ FunctionSummary *FS) {
+ if (!FS->type_tests().empty())
+ Stream.EmitRecord(bitc::FS_TYPE_TESTS, FS->type_tests());
+
+ SmallVector<uint64_t, 64> Record;
+
+ auto WriteVFuncIdVec = [&](uint64_t Ty,
+ ArrayRef<FunctionSummary::VFuncId> VFs) {
+ if (VFs.empty())
+ return;
+ Record.clear();
+ for (auto &VF : VFs) {
+ Record.push_back(VF.GUID);
+ Record.push_back(VF.Offset);
+ }
+ Stream.EmitRecord(Ty, Record);
+ };
+
+ WriteVFuncIdVec(bitc::FS_TYPE_TEST_ASSUME_VCALLS,
+ FS->type_test_assume_vcalls());
+ WriteVFuncIdVec(bitc::FS_TYPE_CHECKED_LOAD_VCALLS,
+ FS->type_checked_load_vcalls());
+
+ auto WriteConstVCallVec = [&](uint64_t Ty,
+ ArrayRef<FunctionSummary::ConstVCall> VCs) {
+ for (auto &VC : VCs) {
+ Record.clear();
+ Record.push_back(VC.VFunc.GUID);
+ Record.push_back(VC.VFunc.Offset);
+ Record.insert(Record.end(), VC.Args.begin(), VC.Args.end());
+ Stream.EmitRecord(Ty, Record);
+ }
+ };
+
+ WriteConstVCallVec(bitc::FS_TYPE_TEST_ASSUME_CONST_VCALL,
+ FS->type_test_assume_const_vcalls());
+ WriteConstVCallVec(bitc::FS_TYPE_CHECKED_LOAD_CONST_VCALL,
+ FS->type_checked_load_const_vcalls());
+}
+
// Helper to emit a single function summary record.
void ModuleBitcodeWriter::writePerModuleFunctionSummaryRecord(
SmallVector<uint64_t, 64> &NameVals, GlobalValueSummary *Summary,
@@ -3375,8 +3430,7 @@ void ModuleBitcodeWriter::writePerModuleFunctionSummaryRecord(
NameVals.push_back(ValueID);
FunctionSummary *FS = cast<FunctionSummary>(Summary);
- if (!FS->type_tests().empty())
- Stream.EmitRecord(bitc::FS_TYPE_TESTS, FS->type_tests());
+ writeFunctionTypeMetadataRecords(Stream, FS);
NameVals.push_back(getEncodedGVSummaryFlags(FS->flags()));
NameVals.push_back(FS->instCount());
@@ -3636,8 +3690,7 @@ void IndexBitcodeWriter::writeCombinedGlobalValueSummary() {
}
auto *FS = cast<FunctionSummary>(S);
- if (!FS->type_tests().empty())
- Stream.EmitRecord(bitc::FS_TYPE_TESTS, FS->type_tests());
+ writeFunctionTypeMetadataRecords(Stream, FS);
NameVals.push_back(ValueId);
NameVals.push_back(Index.getModuleId(FS->modulePath()));
@@ -3659,9 +3712,16 @@ void IndexBitcodeWriter::writeCombinedGlobalValueSummary() {
for (auto &EI : FS->calls()) {
// If this GUID doesn't have a value id, it doesn't have a function
// summary and we don't need to record any calls to it.
- if (!hasValueId(EI.first.getGUID()))
- continue;
- NameVals.push_back(getValueId(EI.first.getGUID()));
+ GlobalValue::GUID GUID = EI.first.getGUID();
+ if (!hasValueId(GUID)) {
+        // For SamplePGO, the indirect call targets for local functions have
+        // their original names annotated in the profile, so we retry with the
+        // GUID of the corresponding PGOFuncName.
+ GUID = Index.getGUIDFromOriginalID(GUID);
+ if (GUID == 0 || !hasValueId(GUID))
+ continue;
+ }
+ NameVals.push_back(getValueId(GUID));
if (HasProfileData)
NameVals.push_back(static_cast<uint8_t>(EI.second.Hotness));
}
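A hedged sketch of that fallback in isolation, using the ModuleSummaryIndex API the hunk relies on (the helper and its HasSummary parameter are illustrative):

#include "llvm/IR/GlobalValue.h"
#include "llvm/IR/ModuleSummaryIndex.h"
using namespace llvm;

// If the plain-name GUID has no summary, retry with the GUID recorded for
// the original (PGO) name; getGUIDFromOriginalID returns 0 when unmapped.
static GlobalValue::GUID resolveCalleeGUID(GlobalValue::GUID GUID,
                                           const ModuleSummaryIndex &Index,
                                           bool HasSummary) {
  if (HasSummary)
    return GUID;
  return Index.getGUIDFromOriginalID(GUID);
}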
@@ -3697,7 +3757,7 @@ void IndexBitcodeWriter::writeCombinedGlobalValueSummary() {
/// Create the "IDENTIFICATION_BLOCK_ID" containing a single string with the
/// current llvm version, and a record for the epoch number.
-void writeIdentificationBlock(BitstreamWriter &Stream) {
+static void writeIdentificationBlock(BitstreamWriter &Stream) {
Stream.EnterSubblock(bitc::IDENTIFICATION_BLOCK_ID, 5);
// Write the "user readable" string identifying the bitcode producer
@@ -3722,17 +3782,24 @@ void writeIdentificationBlock(BitstreamWriter &Stream) {
void ModuleBitcodeWriter::writeModuleHash(size_t BlockStartPos) {
// Emit the module's hash.
// MODULE_CODE_HASH: [5*i32]
- SHA1 Hasher;
- Hasher.update(ArrayRef<uint8_t>((const uint8_t *)&(Buffer)[BlockStartPos],
- Buffer.size() - BlockStartPos));
- StringRef Hash = Hasher.result();
- uint32_t Vals[5];
- for (int Pos = 0; Pos < 20; Pos += 4) {
- Vals[Pos / 4] = support::endian::read32be(Hash.data() + Pos);
- }
+ if (GenerateHash) {
+ SHA1 Hasher;
+ uint32_t Vals[5];
+ Hasher.update(ArrayRef<uint8_t>((const uint8_t *)&(Buffer)[BlockStartPos],
+ Buffer.size() - BlockStartPos));
+ StringRef Hash = Hasher.result();
+ for (int Pos = 0; Pos < 20; Pos += 4) {
+ Vals[Pos / 4] = support::endian::read32be(Hash.data() + Pos);
+ }
- // Emit the finished record.
- Stream.EmitRecord(bitc::MODULE_CODE_HASH, Vals);
+ // Emit the finished record.
+ Stream.EmitRecord(bitc::MODULE_CODE_HASH, Vals);
+
+ if (ModHash)
+ // Save the written hash value.
+ std::copy(std::begin(Vals), std::end(Vals), std::begin(*ModHash));
+ } else if (ModHash)
+ Stream.EmitRecord(bitc::MODULE_CODE_HASH, ArrayRef<uint32_t>(*ModHash));
}
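MODULE_CODE_HASH stores the 20-byte SHA-1 digest as five 32-bit words read big-endian. A self-contained sketch of that packing, equivalent to the support::endian::read32be loop above:

#include <cstdint>

// Pack a 20-byte digest into five big-endian 32-bit words.
static void packDigest(const uint8_t Digest[20], uint32_t Vals[5]) {
  for (int Pos = 0; Pos < 20; Pos += 4)
    Vals[Pos / 4] = (uint32_t(Digest[Pos]) << 24) |
                    (uint32_t(Digest[Pos + 1]) << 16) |
                    (uint32_t(Digest[Pos + 2]) << 8) |
                    uint32_t(Digest[Pos + 3]);
}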
void ModuleBitcodeWriter::write() {
@@ -3793,9 +3860,7 @@ void ModuleBitcodeWriter::write() {
writeValueSymbolTable(M.getValueSymbolTable(),
/* IsModuleLevel */ true, &FunctionToBitcodeIndex);
- if (GenerateHash) {
- writeModuleHash(BlockStartPos);
- }
+ writeModuleHash(BlockStartPos);
Stream.ExitBlock();
}
@@ -3886,9 +3951,10 @@ BitcodeWriter::~BitcodeWriter() = default;
void BitcodeWriter::writeModule(const Module *M,
bool ShouldPreserveUseListOrder,
const ModuleSummaryIndex *Index,
- bool GenerateHash) {
- ModuleBitcodeWriter ModuleWriter(
- M, Buffer, *Stream, ShouldPreserveUseListOrder, Index, GenerateHash);
+ bool GenerateHash, ModuleHash *ModHash) {
+ ModuleBitcodeWriter ModuleWriter(M, Buffer, *Stream,
+ ShouldPreserveUseListOrder, Index,
+ GenerateHash, ModHash);
ModuleWriter.write();
}
@@ -3897,7 +3963,7 @@ void BitcodeWriter::writeModule(const Module *M,
void llvm::WriteBitcodeToFile(const Module *M, raw_ostream &Out,
bool ShouldPreserveUseListOrder,
const ModuleSummaryIndex *Index,
- bool GenerateHash) {
+ bool GenerateHash, ModuleHash *ModHash) {
SmallVector<char, 0> Buffer;
Buffer.reserve(256*1024);
@@ -3908,7 +3974,8 @@ void llvm::WriteBitcodeToFile(const Module *M, raw_ostream &Out,
Buffer.insert(Buffer.begin(), BWH_HeaderSize, 0);
BitcodeWriter Writer(Buffer);
- Writer.writeModule(M, ShouldPreserveUseListOrder, Index, GenerateHash);
+ Writer.writeModule(M, ShouldPreserveUseListOrder, Index, GenerateHash,
+ ModHash);
if (TT.isOSDarwin() || TT.isOSBinFormatMachO())
emitDarwinBCHeaderAndTrailer(Buffer, TT);
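With ModHash threaded through, a caller can capture the hash while writing the full module and later stamp the identical hash into a minimized thin-link module. A hedged usage sketch against the signature above; the header path and the reuse of the same Index are assumptions:

#include "llvm/Bitcode/BitcodeWriter.h" // header name assumed for this era
#include "llvm/Support/raw_ostream.h"
using namespace llvm;

static void writeFullThenMinimized(const Module *M, const Module *MinM,
                                   const ModuleSummaryIndex *Index,
                                   raw_ostream &FullOS, raw_ostream &ThinOS) {
  ModuleHash Hash;
  // Compute the hash from the full bitcode and remember it.
  WriteBitcodeToFile(M, FullOS, /*ShouldPreserveUseListOrder=*/false, Index,
                     /*GenerateHash=*/true, &Hash);
  // Re-emit exactly that hash in the minimized module, without rehashing.
  WriteBitcodeToFile(MinM, ThinOS, /*ShouldPreserveUseListOrder=*/false, Index,
                     /*GenerateHash=*/false, &Hash);
}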
diff --git a/lib/Bitcode/Writer/ValueEnumerator.cpp b/lib/Bitcode/Writer/ValueEnumerator.cpp
index 5d5bfab58b81..3800d9abd429 100644
--- a/lib/Bitcode/Writer/ValueEnumerator.cpp
+++ b/lib/Bitcode/Writer/ValueEnumerator.cpp
@@ -432,12 +432,14 @@ unsigned ValueEnumerator::getValueID(const Value *V) const {
return I->second-1;
}
+#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
LLVM_DUMP_METHOD void ValueEnumerator::dump() const {
print(dbgs(), ValueMap, "Default");
dbgs() << '\n';
print(dbgs(), MetadataMap, "MetaData");
dbgs() << '\n';
}
+#endif
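This dump() gating pattern recurs throughout the patch (see the DIE.cpp hunks below): definitions exist only in asserts builds or when LLVM_ENABLE_DUMP is set, so release binaries shed the dead printing code. As it would look on a hypothetical class that already has a print() method:

#include "llvm/Support/Compiler.h" // LLVM_DUMP_METHOD
#include "llvm/Support/Debug.h"    // llvm::dbgs()

#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
LLVM_DUMP_METHOD void MyAnalysis::dump() const {
  print(llvm::dbgs()); // reuse the always-available print()
}
#endif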
void ValueEnumerator::print(raw_ostream &OS, const ValueMapType &Map,
const char *Name) const {
@@ -452,7 +454,8 @@ void ValueEnumerator::print(raw_ostream &OS, const ValueMapType &Map,
OS << "Value: " << V->getName();
else
OS << "Value: [null]\n";
- V->dump();
+ V->print(errs());
+ errs() << '\n';
OS << " Uses(" << std::distance(V->use_begin(),V->use_end()) << "):";
for (const Use &U : V->uses()) {
@@ -549,7 +552,7 @@ void ValueEnumerator::EnumerateFunctionLocalMetadata(
void ValueEnumerator::dropFunctionFromMetadata(
MetadataMapType::value_type &FirstMD) {
SmallVector<const MDNode *, 64> Worklist;
- auto push = [this, &Worklist](MetadataMapType::value_type &MD) {
+ auto push = [&Worklist](MetadataMapType::value_type &MD) {
auto &Entry = MD.second;
// Nothing to do if this metadata isn't tagged.
@@ -884,7 +887,7 @@ void ValueEnumerator::EnumerateOperandType(const Value *V) {
}
}
-void ValueEnumerator::EnumerateAttributes(AttributeSet PAL) {
+void ValueEnumerator::EnumerateAttributes(AttributeList PAL) {
if (PAL.isEmpty()) return; // null is always 0.
// Do a lookup.
@@ -897,7 +900,7 @@ void ValueEnumerator::EnumerateAttributes(AttributeSet PAL) {
// Do lookups for all attribute groups.
for (unsigned i = 0, e = PAL.getNumSlots(); i != e; ++i) {
- AttributeSet AS = PAL.getSlotAttributes(i);
+ AttributeList AS = PAL.getSlotAttributes(i);
unsigned &Entry = AttributeGroupMap[AS];
if (Entry == 0) {
AttributeGroups.push_back(AS);
diff --git a/lib/Bitcode/Writer/ValueEnumerator.h b/lib/Bitcode/Writer/ValueEnumerator.h
index a8d6cf965a4b..8a82aab29836 100644
--- a/lib/Bitcode/Writer/ValueEnumerator.h
+++ b/lib/Bitcode/Writer/ValueEnumerator.h
@@ -36,7 +36,7 @@ class LocalAsMetadata;
class MDNode;
class MDOperand;
class NamedMDNode;
-class AttributeSet;
+class AttributeList;
class ValueSymbolTable;
class MDSymbolTable;
class raw_ostream;
@@ -102,13 +102,13 @@ private:
bool ShouldPreserveUseListOrder;
- typedef DenseMap<AttributeSet, unsigned> AttributeGroupMapType;
+ typedef DenseMap<AttributeList, unsigned> AttributeGroupMapType;
AttributeGroupMapType AttributeGroupMap;
- std::vector<AttributeSet> AttributeGroups;
+ std::vector<AttributeList> AttributeGroups;
- typedef DenseMap<AttributeSet, unsigned> AttributeMapType;
+ typedef DenseMap<AttributeList, unsigned> AttributeMapType;
AttributeMapType AttributeMap;
- std::vector<AttributeSet> Attribute;
+ std::vector<AttributeList> Attribute;
/// GlobalBasicBlockIDs - This map memoizes the basic block ID's referenced by
/// the "getGlobalBasicBlockID" method.
@@ -166,14 +166,14 @@ public:
unsigned getInstructionID(const Instruction *I) const;
void setInstructionID(const Instruction *I);
- unsigned getAttributeID(AttributeSet PAL) const {
+ unsigned getAttributeID(AttributeList PAL) const {
if (PAL.isEmpty()) return 0; // Null maps to zero.
AttributeMapType::const_iterator I = AttributeMap.find(PAL);
assert(I != AttributeMap.end() && "Attribute not in ValueEnumerator!");
return I->second;
}
- unsigned getAttributeGroupID(AttributeSet PAL) const {
+ unsigned getAttributeGroupID(AttributeList PAL) const {
if (PAL.isEmpty()) return 0; // Null maps to zero.
AttributeGroupMapType::const_iterator I = AttributeGroupMap.find(PAL);
assert(I != AttributeGroupMap.end() && "Attribute not in ValueEnumerator!");
@@ -206,10 +206,8 @@ public:
const std::vector<const BasicBlock*> &getBasicBlocks() const {
return BasicBlocks;
}
- const std::vector<AttributeSet> &getAttributes() const {
- return Attribute;
- }
- const std::vector<AttributeSet> &getAttributeGroups() const {
+ const std::vector<AttributeList> &getAttributes() const { return Attribute; }
+ const std::vector<AttributeList> &getAttributeGroups() const {
return AttributeGroups;
}
@@ -283,7 +281,7 @@ private:
void EnumerateValue(const Value *V);
void EnumerateType(Type *T);
void EnumerateOperandType(const Value *V);
- void EnumerateAttributes(AttributeSet PAL);
+ void EnumerateAttributes(AttributeList PAL);
void EnumerateValueSymbolTable(const ValueSymbolTable &ST);
void EnumerateNamedMetadata(const Module &M);
diff --git a/lib/CodeGen/AggressiveAntiDepBreaker.cpp b/lib/CodeGen/AggressiveAntiDepBreaker.cpp
index bb908618b679..955524c2a676 100644
--- a/lib/CodeGen/AggressiveAntiDepBreaker.cpp
+++ b/lib/CodeGen/AggressiveAntiDepBreaker.cpp
@@ -163,9 +163,11 @@ void AggressiveAntiDepBreaker::StartBlock(MachineBasicBlock *BB) {
// callee-saved register that is not saved in the prolog.
const MachineFrameInfo &MFI = MF.getFrameInfo();
BitVector Pristine = MFI.getPristineRegs(MF);
- for (const MCPhysReg *I = TRI->getCalleeSavedRegs(&MF); *I; ++I) {
+ for (const MCPhysReg *I = MF.getRegInfo().getCalleeSavedRegs(); *I;
+ ++I) {
unsigned Reg = *I;
- if (!IsReturnBlock && !Pristine.test(Reg)) continue;
+ if (!IsReturnBlock && !(Pristine.test(Reg) || BB->isLiveIn(Reg)))
+ continue;
for (MCRegAliasIterator AI(Reg, TRI, true); AI.isValid(); ++AI) {
unsigned AliasReg = *AI;
State->UnionGroups(AliasReg, 0);
diff --git a/lib/CodeGen/Analysis.cpp b/lib/CodeGen/Analysis.cpp
index 79ecc4308fe7..09a37a77e9fb 100644
--- a/lib/CodeGen/Analysis.cpp
+++ b/lib/CodeGen/Analysis.cpp
@@ -516,10 +516,9 @@ bool llvm::attributesPermitTailCall(const Function *F, const Instruction *I,
bool &ADS = AllowDifferingSizes ? *AllowDifferingSizes : DummyADS;
ADS = true;
- AttrBuilder CallerAttrs(F->getAttributes(),
- AttributeSet::ReturnIndex);
+ AttrBuilder CallerAttrs(F->getAttributes(), AttributeList::ReturnIndex);
AttrBuilder CalleeAttrs(cast<CallInst>(I)->getAttributes(),
- AttributeSet::ReturnIndex);
+ AttributeList::ReturnIndex);
  // Noalias is completely benign as far as the calling convention goes; it
  // shouldn't affect whether the call is a tail call.
@@ -613,25 +612,6 @@ bool llvm::returnTypeIsEligibleForTailCall(const Function *F,
return true;
}
-bool llvm::canBeOmittedFromSymbolTable(const GlobalValue *GV) {
- if (!GV->hasLinkOnceODRLinkage())
- return false;
-
- // We assume that anyone who sets global unnamed_addr on a non-constant knows
- // what they're doing.
- if (GV->hasGlobalUnnamedAddr())
- return true;
-
- // If it is a non constant variable, it needs to be uniqued across shared
- // objects.
- if (const GlobalVariable *Var = dyn_cast<GlobalVariable>(GV)) {
- if (!Var->isConstant())
- return false;
- }
-
- return GV->hasAtLeastLocalUnnamedAddr();
-}
-
static void collectFuncletMembers(
DenseMap<const MachineBasicBlock *, int> &FuncletMembership, int Funclet,
const MachineBasicBlock *MBB) {
diff --git a/lib/CodeGen/AsmPrinter/AsmPrinter.cpp b/lib/CodeGen/AsmPrinter/AsmPrinter.cpp
index 24fdbfc901fd..6c18d56b8272 100644
--- a/lib/CodeGen/AsmPrinter/AsmPrinter.cpp
+++ b/lib/CodeGen/AsmPrinter/AsmPrinter.cpp
@@ -11,48 +11,102 @@
//
//===----------------------------------------------------------------------===//
-#include "llvm/CodeGen/AsmPrinter.h"
+#include "AsmPrinterHandler.h"
#include "CodeViewDebug.h"
#include "DwarfDebug.h"
#include "DwarfException.h"
#include "WinException.h"
+#include "llvm/ADT/APFloat.h"
+#include "llvm/ADT/APInt.h"
+#include "llvm/ADT/DenseMap.h"
+#include "llvm/ADT/SmallPtrSet.h"
+#include "llvm/ADT/SmallString.h"
+#include "llvm/ADT/SmallVector.h"
#include "llvm/ADT/Statistic.h"
+#include "llvm/ADT/STLExtras.h"
+#include "llvm/ADT/StringRef.h"
+#include "llvm/ADT/Triple.h"
+#include "llvm/ADT/Twine.h"
#include "llvm/Analysis/ConstantFolding.h"
+#include "llvm/Analysis/ObjectUtils.h"
#include "llvm/CodeGen/Analysis.h"
+#include "llvm/CodeGen/AsmPrinter.h"
+#include "llvm/CodeGen/GCMetadata.h"
#include "llvm/CodeGen/GCMetadataPrinter.h"
+#include "llvm/CodeGen/GCStrategy.h"
+#include "llvm/CodeGen/MachineBasicBlock.h"
#include "llvm/CodeGen/MachineConstantPool.h"
#include "llvm/CodeGen/MachineFrameInfo.h"
#include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/CodeGen/MachineInstr.h"
#include "llvm/CodeGen/MachineInstrBundle.h"
#include "llvm/CodeGen/MachineJumpTableInfo.h"
#include "llvm/CodeGen/MachineLoopInfo.h"
+#include "llvm/CodeGen/MachineMemOperand.h"
#include "llvm/CodeGen/MachineModuleInfoImpls.h"
+#include "llvm/CodeGen/MachineOperand.h"
+#include "llvm/CodeGen/MachineOptimizationRemarkEmitter.h"
+#include "llvm/IR/BasicBlock.h"
+#include "llvm/IR/Constant.h"
+#include "llvm/IR/Constants.h"
#include "llvm/IR/DataLayout.h"
-#include "llvm/IR/DebugInfo.h"
+#include "llvm/IR/DebugInfoMetadata.h"
+#include "llvm/IR/DerivedTypes.h"
+#include "llvm/IR/Function.h"
+#include "llvm/IR/GlobalAlias.h"
+#include "llvm/IR/GlobalIFunc.h"
+#include "llvm/IR/GlobalIndirectSymbol.h"
+#include "llvm/IR/GlobalObject.h"
+#include "llvm/IR/GlobalValue.h"
+#include "llvm/IR/GlobalVariable.h"
#include "llvm/IR/Mangler.h"
+#include "llvm/IR/Metadata.h"
#include "llvm/IR/Module.h"
#include "llvm/IR/Operator.h"
+#include "llvm/IR/Value.h"
#include "llvm/MC/MCAsmInfo.h"
#include "llvm/MC/MCContext.h"
+#include "llvm/MC/MCDirectives.h"
#include "llvm/MC/MCExpr.h"
#include "llvm/MC/MCInst.h"
#include "llvm/MC/MCSection.h"
#include "llvm/MC/MCSectionELF.h"
#include "llvm/MC/MCSectionMachO.h"
#include "llvm/MC/MCStreamer.h"
-#include "llvm/MC/MCSymbolELF.h"
+#include "llvm/MC/MCSubtargetInfo.h"
+#include "llvm/MC/MCSymbol.h"
+#include "llvm/MC/MCTargetOptions.h"
#include "llvm/MC/MCValue.h"
+#include "llvm/MC/SectionKind.h"
+#include "llvm/Pass.h"
+#include "llvm/Support/Casting.h"
+#include "llvm/Support/Compiler.h"
+#include "llvm/Support/Dwarf.h"
+#include "llvm/Support/ELF.h"
#include "llvm/Support/ErrorHandling.h"
#include "llvm/Support/Format.h"
#include "llvm/Support/MathExtras.h"
+#include "llvm/Support/raw_ostream.h"
#include "llvm/Support/TargetRegistry.h"
#include "llvm/Support/Timer.h"
#include "llvm/Target/TargetFrameLowering.h"
#include "llvm/Target/TargetInstrInfo.h"
#include "llvm/Target/TargetLowering.h"
#include "llvm/Target/TargetLoweringObjectFile.h"
+#include "llvm/Target/TargetMachine.h"
#include "llvm/Target/TargetRegisterInfo.h"
#include "llvm/Target/TargetSubtargetInfo.h"
+#include <algorithm>
+#include <cassert>
+#include <cinttypes>
+#include <cstdint>
+#include <limits>
+#include <memory>
+#include <string>
+#include <utility>
+#include <vector>
+
using namespace llvm;
#define DEBUG_TYPE "asm-printer"
@@ -69,6 +123,10 @@ static const char *const CodeViewLineTablesGroupDescription =
STATISTIC(EmittedInsts, "Number of machine instrs printed");
+static cl::opt<bool>
+ PrintSchedule("print-schedule", cl::Hidden, cl::init(false),
+ cl::desc("Print 'sched: [latency:throughput]' in .s output"));
+
char AsmPrinter::ID = 0;
typedef DenseMap<GCStrategy*, std::unique_ptr<GCMetadataPrinter>> gcp_map_type;
@@ -78,7 +136,6 @@ static gcp_map_type &getGCMap(void *&P) {
return *(gcp_map_type*)P;
}
-
/// getGVAlignmentLog2 - Return the alignment to use for the specified global
/// value in log2 form. This rounds up to the preferred alignment if possible
/// and legal.
@@ -107,16 +164,7 @@ static unsigned getGVAlignmentLog2(const GlobalValue *GV, const DataLayout &DL,
AsmPrinter::AsmPrinter(TargetMachine &tm, std::unique_ptr<MCStreamer> Streamer)
: MachineFunctionPass(ID), TM(tm), MAI(tm.getMCAsmInfo()),
- OutContext(Streamer->getContext()), OutStreamer(std::move(Streamer)),
- isCFIMoveForDebugging(false), LastMI(nullptr), LastFn(0), Counter(~0U) {
- DD = nullptr;
- MMI = nullptr;
- LI = nullptr;
- MF = nullptr;
- CurExceptionSym = CurrentFnSym = CurrentFnSymForSize = nullptr;
- CurrentFnBegin = nullptr;
- CurrentFnEnd = nullptr;
- GCMetadataPrinters = nullptr;
+ OutContext(Streamer->getContext()), OutStreamer(std::move(Streamer)) {
VerboseAsm = OutStreamer->isVerboseAsm();
}
@@ -171,6 +219,7 @@ void AsmPrinter::getAnalysisUsage(AnalysisUsage &AU) const {
AU.setPreservesAll();
MachineFunctionPass::getAnalysisUsage(AU);
AU.addRequired<MachineModuleInfo>();
+ AU.addRequired<MachineOptimizationRemarkEmitterPass>();
AU.addRequired<GCModuleInfo>();
if (isVerbose())
AU.addRequired<MachineLoopInfo>();
@@ -223,7 +272,7 @@ bool AsmPrinter::doInitialization(Module &M) {
// don't, this at least helps the user find where a global came from.
if (MAI->hasSingleParameterDotFile()) {
// .file "foo.c"
- OutStreamer->EmitFileDirective(M.getModuleIdentifier());
+ OutStreamer->EmitFileDirective(M.getSourceFileName());
}
GCModuleInfo *MI = getAnalysisIfAvailable<GCModuleInfo>();
@@ -571,7 +620,7 @@ void AsmPrinter::EmitGlobalVariable(const GlobalVariable *GV) {
///
/// \p Value - The value to emit.
/// \p Size - The size of the integer (in bytes) to emit.
-void AsmPrinter::EmitDebugValue(const MCExpr *Value,
+void AsmPrinter::EmitDebugThreadLocal(const MCExpr *Value,
unsigned Size) const {
OutStreamer->EmitValue(Value, Size);
}
@@ -602,8 +651,23 @@ void AsmPrinter::EmitFunctionHeader() {
}
// Emit the prefix data.
- if (F->hasPrefixData())
- EmitGlobalConstant(F->getParent()->getDataLayout(), F->getPrefixData());
+ if (F->hasPrefixData()) {
+ if (MAI->hasSubsectionsViaSymbols()) {
+ // Preserving prefix data on platforms which use subsections-via-symbols
+ // is a bit tricky. Here we introduce a symbol for the prefix data
+ // and use the .alt_entry attribute to mark the function's real entry point
+ // as an alternative entry point to the prefix-data symbol.
+ MCSymbol *PrefixSym = OutContext.createLinkerPrivateTempSymbol();
+ OutStreamer->EmitLabel(PrefixSym);
+
+ EmitGlobalConstant(F->getParent()->getDataLayout(), F->getPrefixData());
+
+ // Emit an .alt_entry directive for the actual function symbol.
+ OutStreamer->EmitSymbolAttribute(CurrentFnSym, MCSA_AltEntry);
+ } else {
+ EmitGlobalConstant(F->getParent()->getDataLayout(), F->getPrefixData());
+ }
+ }
// Emit the CurrentFnSym. This is a virtual function to allow targets to
// do their wild and crazy things as required.
@@ -660,7 +724,8 @@ void AsmPrinter::EmitFunctionEntryLabel() {
}
/// emitComments - Pretty-print comments for instructions.
-static void emitComments(const MachineInstr &MI, raw_ostream &CommentOS) {
+static void emitComments(const MachineInstr &MI, raw_ostream &CommentOS,
+ AsmPrinter *AP) {
const MachineFunction *MF = MI.getParent()->getParent();
const TargetInstrInfo *TII = MF->getSubtarget().getInstrInfo();
@@ -668,6 +733,7 @@ static void emitComments(const MachineInstr &MI, raw_ostream &CommentOS) {
int FI;
const MachineFrameInfo &MFI = MF->getFrameInfo();
+ bool Commented = false;
// We assume a single instruction only has a spill or reload, not
// both.
@@ -675,24 +741,39 @@ static void emitComments(const MachineInstr &MI, raw_ostream &CommentOS) {
if (TII->isLoadFromStackSlotPostFE(MI, FI)) {
if (MFI.isSpillSlotObjectIndex(FI)) {
MMO = *MI.memoperands_begin();
- CommentOS << MMO->getSize() << "-byte Reload\n";
+ CommentOS << MMO->getSize() << "-byte Reload";
+ Commented = true;
}
} else if (TII->hasLoadFromStackSlot(MI, MMO, FI)) {
- if (MFI.isSpillSlotObjectIndex(FI))
- CommentOS << MMO->getSize() << "-byte Folded Reload\n";
+ if (MFI.isSpillSlotObjectIndex(FI)) {
+ CommentOS << MMO->getSize() << "-byte Folded Reload";
+ Commented = true;
+ }
} else if (TII->isStoreToStackSlotPostFE(MI, FI)) {
if (MFI.isSpillSlotObjectIndex(FI)) {
MMO = *MI.memoperands_begin();
- CommentOS << MMO->getSize() << "-byte Spill\n";
+ CommentOS << MMO->getSize() << "-byte Spill";
+ Commented = true;
}
} else if (TII->hasStoreToStackSlot(MI, MMO, FI)) {
- if (MFI.isSpillSlotObjectIndex(FI))
- CommentOS << MMO->getSize() << "-byte Folded Spill\n";
+ if (MFI.isSpillSlotObjectIndex(FI)) {
+ CommentOS << MMO->getSize() << "-byte Folded Spill";
+ Commented = true;
+ }
}
// Check for spill-induced copies
- if (MI.getAsmPrinterFlag(MachineInstr::ReloadReuse))
- CommentOS << " Reload Reuse\n";
+ if (MI.getAsmPrinterFlag(MachineInstr::ReloadReuse)) {
+ Commented = true;
+ CommentOS << " Reload Reuse";
+ }
+
+ if (Commented && AP->EnablePrintSchedInfo)
+    // If any comment was added above and we also need a sched info comment,
+    // append it directly after that comment, with no newline between them.
+ CommentOS << " " << MF->getSubtarget().getSchedInfoStr(MI) << "\n";
+ else if (Commented)
+ CommentOS << "\n";
}
/// emitImplicitDef - This method emits the specified machine instruction
@@ -883,6 +964,7 @@ void AsmPrinter::EmitFunctionBody() {
// Print out code for the function.
bool HasAnyRealCode = false;
+ int NumInstsInFunction = 0;
for (auto &MBB : *MF) {
// Print a label for the basic block.
EmitBasicBlockStart(MBB);
@@ -892,7 +974,7 @@ void AsmPrinter::EmitFunctionBody() {
if (!MI.isPosition() && !MI.isImplicitDef() && !MI.isKill() &&
!MI.isDebugValue()) {
HasAnyRealCode = true;
- ++EmittedInsts;
+ ++NumInstsInFunction;
}
if (ShouldPrintDebugScopes) {
@@ -905,7 +987,7 @@ void AsmPrinter::EmitFunctionBody() {
}
if (isVerbose())
- emitComments(MI, OutStreamer->GetCommentOS());
+ emitComments(MI, OutStreamer->GetCommentOS(), this);
switch (MI.getOpcode()) {
case TargetOpcode::CFI_INSTRUCTION:
@@ -953,6 +1035,14 @@ void AsmPrinter::EmitFunctionBody() {
EmitBasicBlockEnd(MBB);
}
+ EmittedInsts += NumInstsInFunction;
+ MachineOptimizationRemarkAnalysis R(DEBUG_TYPE, "InstructionCount",
+ MF->getFunction()->getSubprogram(),
+ &MF->front());
+ R << ore::NV("NumInstructions", NumInstsInFunction)
+ << " instructions in function";
+ ORE->emit(R);
+
// If the function is empty and the object file uses .subsections_via_symbols,
// then we need to emit *something* to the function body to prevent the
// labels from collapsing together. Just emit a noop.
@@ -1238,7 +1328,7 @@ bool AsmPrinter::doFinalization(Module &M) {
break;
AliasStack.push_back(Cur);
}
- for (const GlobalAlias *AncestorAlias : reverse(AliasStack))
+ for (const GlobalAlias *AncestorAlias : llvm::reverse(AliasStack))
emitGlobalIndirectSymbol(M, *AncestorAlias);
AliasStack.clear();
}
@@ -1311,19 +1401,28 @@ void AsmPrinter::SetupMachineFunction(MachineFunction &MF) {
CurrentFnSymForSize = CurrentFnBegin;
}
+ ORE = &getAnalysis<MachineOptimizationRemarkEmitterPass>().getORE();
if (isVerbose())
LI = &getAnalysis<MachineLoopInfo>();
+
+ const TargetSubtargetInfo &STI = MF.getSubtarget();
+ EnablePrintSchedInfo = PrintSchedule.getNumOccurrences()
+ ? PrintSchedule
+ : STI.supportPrintSchedInfo();
}
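The EnablePrintSchedInfo initialization uses a common cl::opt idiom: an explicit occurrence on the command line always wins, and only in its absence does the subtarget's default apply. A minimal sketch of the idiom with an illustrative flag name:

#include "llvm/Support/CommandLine.h"

static llvm::cl::opt<bool>
    PrintSched("print-sched-example", llvm::cl::Hidden, llvm::cl::init(false),
               llvm::cl::desc("Illustrative flag for the override idiom"));

// getNumOccurrences() distinguishes "left at default" from "user said false".
static bool shouldPrintSchedInfo(bool SubtargetDefault) {
  return PrintSched.getNumOccurrences() ? PrintSched : SubtargetDefault;
}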
namespace {
+
// Keep track the alignment, constpool entries per Section.
struct SectionCPs {
MCSection *S;
unsigned Alignment;
SmallVector<unsigned, 4> CPEs;
+
SectionCPs(MCSection *s, unsigned a) : S(s), Alignment(a) {}
};
-}
+
+} // end anonymous namespace
/// EmitConstantPool - Print to the current output stream assembly
/// representations of the constants in the constant pool MCP. This is
@@ -1547,7 +1646,6 @@ void AsmPrinter::EmitJumpTableEntry(const MachineJumpTableInfo *MJTI,
OutStreamer->EmitValue(Value, EntrySize);
}
-
/// EmitSpecialLLVMGlobal - Check to see if the specified global is a
/// special global used by LLVM. If so, emit it and return true, otherwise
/// do nothing and return false.
@@ -1598,13 +1696,16 @@ void AsmPrinter::EmitLLVMUsedList(const ConstantArray *InitList) {
}
namespace {
+
struct Structor {
- Structor() : Priority(0), Func(nullptr), ComdatKey(nullptr) {}
- int Priority;
- llvm::Constant *Func;
- llvm::GlobalValue *ComdatKey;
+ int Priority = 0;
+ Constant *Func = nullptr;
+ GlobalValue *ComdatKey = nullptr;
+
+ Structor() = default;
};
-} // end namespace
+
+} // end anonymous namespace
/// EmitXXStructorList - Emit the ctor or dtor list taking into account the init
/// priority.
@@ -1653,8 +1754,11 @@ void AsmPrinter::EmitXXStructorList(const DataLayout &DL, const Constant *List,
const TargetLoweringObjectFile &Obj = getObjFileLowering();
const MCSymbol *KeySym = nullptr;
if (GlobalValue *GV = S.ComdatKey) {
- if (GV->hasAvailableExternallyLinkage())
- // If the associated variable is available_externally, some other TU
+ if (GV->isDeclarationForLinker())
+ // If the associated variable is not defined in this module
+ // (it might be available_externally, or have been an
+ // available_externally definition that was dropped by the
+ // EliminateAvailableExternally pass), some other TU
// will provide its dynamic initializer.
continue;
@@ -1931,7 +2035,6 @@ static int isRepeatedByteSequence(const ConstantDataSequential *V) {
return static_cast<uint8_t>(C); // Ensure 255 is not returned as -1.
}
-
/// isRepeatedByteSequence - Determine whether the given value is
/// composed of a repeated sequence of identical bytes and return the
/// byte value. If it is not a repeated sequence, return -1.
@@ -1972,7 +2075,6 @@ static int isRepeatedByteSequence(const Value *V, const DataLayout &DL) {
static void emitGlobalConstantDataSequential(const DataLayout &DL,
const ConstantDataSequential *CDS,
AsmPrinter &AP) {
-
// See if we can aggregate this into a .fill, if so, emit it as such.
int Value = isRepeatedByteSequence(CDS, DL);
if (Value != -1) {
@@ -2006,7 +2108,6 @@ static void emitGlobalConstantDataSequential(const DataLayout &DL,
CDS->getNumElements();
if (unsigned Padding = Size - EmittedSize)
AP.OutStreamer->EmitZeros(Padding);
-
}
static void emitGlobalConstantArray(const DataLayout &DL,
@@ -2420,8 +2521,6 @@ MCSymbol *AsmPrinter::GetExternalSymbolSymbol(StringRef Sym) const {
return OutContext.getOrCreateSymbol(NameStr);
}
-
-
/// PrintParentLoopComment - Print comments about parent loops of this one.
static void PrintParentLoopComment(raw_ostream &OS, const MachineLoop *Loop,
unsigned FunctionNumber) {
@@ -2486,7 +2585,6 @@ static void emitBasicBlockLoopComments(const MachineBasicBlock &MBB,
PrintChildLoopComment(OS, Loop, AP.getFunctionNumber());
}
-
/// EmitBasicBlockStart - This method prints the label for the specified
/// MachineBasicBlock, an alignment (if present) and a comment describing
/// it if appropriate.
@@ -2607,8 +2705,6 @@ isBlockOnlyReachableByFallthrough(const MachineBasicBlock *MBB) const {
return true;
}
-
-
GCMetadataPrinter *AsmPrinter::GetOrCreateGCPrinter(GCStrategy &S) {
if (!S.usesMetadata())
return nullptr;
@@ -2639,7 +2735,7 @@ GCMetadataPrinter *AsmPrinter::GetOrCreateGCPrinter(GCStrategy &S) {
}
/// Pin vtable to this file.
-AsmPrinterHandler::~AsmPrinterHandler() {}
+AsmPrinterHandler::~AsmPrinterHandler() = default;
void AsmPrinterHandler::markFunctionEnd() {}
@@ -2702,8 +2798,11 @@ void AsmPrinter::recordSled(MCSymbol *Sled, const MachineInstr &MI,
SledKind Kind) {
auto Fn = MI.getParent()->getParent()->getFunction();
auto Attr = Fn->getFnAttribute("function-instrument");
+ bool LogArgs = Fn->hasFnAttribute("xray-log-args");
bool AlwaysInstrument =
Attr.isStringAttribute() && Attr.getValueAsString() == "xray-always";
+ if (Kind == SledKind::FUNCTION_ENTER && LogArgs)
+ Kind = SledKind::LOG_ARGS_ENTER;
Sleds.emplace_back(
XRayFunctionEntry{ Sled, CurrentFnSym, Kind, AlwaysInstrument, Fn });
}
diff --git a/lib/CodeGen/AsmPrinter/AsmPrinterInlineAsm.cpp b/lib/CodeGen/AsmPrinter/AsmPrinterInlineAsm.cpp
index 57864e4e4d4f..683e622e3d53 100644
--- a/lib/CodeGen/AsmPrinter/AsmPrinterInlineAsm.cpp
+++ b/lib/CodeGen/AsmPrinter/AsmPrinterInlineAsm.cpp
@@ -40,25 +40,24 @@ using namespace llvm;
#define DEBUG_TYPE "asm-printer"
-namespace {
- struct SrcMgrDiagInfo {
- const MDNode *LocInfo;
- LLVMContext::InlineAsmDiagHandlerTy DiagHandler;
- void *DiagContext;
- };
-}
-
/// srcMgrDiagHandler - This callback is invoked when the SourceMgr for an
/// inline asm has an error in it. diagInfo is a pointer to the SrcMgrDiagInfo
/// struct above.
static void srcMgrDiagHandler(const SMDiagnostic &Diag, void *diagInfo) {
- SrcMgrDiagInfo *DiagInfo = static_cast<SrcMgrDiagInfo *>(diagInfo);
+ AsmPrinter::SrcMgrDiagInfo *DiagInfo =
+ static_cast<AsmPrinter::SrcMgrDiagInfo *>(diagInfo);
assert(DiagInfo && "Diagnostic context not passed down?");
+ // Look up a LocInfo for the buffer this diagnostic is coming from.
+ unsigned BufNum = DiagInfo->SrcMgr.FindBufferContainingLoc(Diag.getLoc());
+ const MDNode *LocInfo = nullptr;
+ if (BufNum > 0 && BufNum <= DiagInfo->LocInfos.size())
+ LocInfo = DiagInfo->LocInfos[BufNum-1];
+
// If the inline asm had metadata associated with it, pull out a location
// cookie corresponding to which line the error occurred on.
unsigned LocCookie = 0;
- if (const MDNode *LocInfo = DiagInfo->LocInfo) {
+ if (LocInfo) {
unsigned ErrorLine = Diag.getLineNo()-1;
if (ErrorLine >= LocInfo->getNumOperands())
ErrorLine = 0;
@@ -99,35 +98,39 @@ void AsmPrinter::EmitInlineAsm(StringRef Str, const MCSubtargetInfo &STI,
return;
}
- SourceMgr SrcMgr;
- SrcMgr.setIncludeDirs(MCOptions.IASSearchPaths);
+ if (!DiagInfo) {
+ DiagInfo = make_unique<SrcMgrDiagInfo>();
- SrcMgrDiagInfo DiagInfo;
-
- // If the current LLVMContext has an inline asm handler, set it in SourceMgr.
- LLVMContext &LLVMCtx = MMI->getModule()->getContext();
- bool HasDiagHandler = false;
- if (LLVMCtx.getInlineAsmDiagnosticHandler() != nullptr) {
- // If the source manager has an issue, we arrange for srcMgrDiagHandler
- // to be invoked, getting DiagInfo passed into it.
- DiagInfo.LocInfo = LocMDNode;
- DiagInfo.DiagHandler = LLVMCtx.getInlineAsmDiagnosticHandler();
- DiagInfo.DiagContext = LLVMCtx.getInlineAsmDiagnosticContext();
- SrcMgr.setDiagHandler(srcMgrDiagHandler, &DiagInfo);
- HasDiagHandler = true;
+ MCContext &Context = MMI->getContext();
+ Context.setInlineSourceManager(&DiagInfo->SrcMgr);
+
+ LLVMContext &LLVMCtx = MMI->getModule()->getContext();
+ if (LLVMCtx.getInlineAsmDiagnosticHandler()) {
+ DiagInfo->DiagHandler = LLVMCtx.getInlineAsmDiagnosticHandler();
+ DiagInfo->DiagContext = LLVMCtx.getInlineAsmDiagnosticContext();
+ DiagInfo->SrcMgr.setDiagHandler(srcMgrDiagHandler, DiagInfo.get());
+ }
}
+ SourceMgr &SrcMgr = DiagInfo->SrcMgr;
+ SrcMgr.setIncludeDirs(MCOptions.IASSearchPaths);
+
std::unique_ptr<MemoryBuffer> Buffer;
- if (isNullTerminated)
- Buffer = MemoryBuffer::getMemBuffer(Str, "<inline asm>");
- else
- Buffer = MemoryBuffer::getMemBufferCopy(Str, "<inline asm>");
+ // The inline asm source manager will outlive Str, so make a copy of the
+ // string for SourceMgr to own.
+ Buffer = MemoryBuffer::getMemBufferCopy(Str, "<inline asm>");
// Tell SrcMgr about this buffer, it takes ownership of the buffer.
- SrcMgr.AddNewSourceBuffer(std::move(Buffer), SMLoc());
+ unsigned BufNum = SrcMgr.AddNewSourceBuffer(std::move(Buffer), SMLoc());
+
+ // Store LocMDNode in DiagInfo, using BufNum as an identifier.
+ if (LocMDNode) {
+ DiagInfo->LocInfos.resize(BufNum);
+ DiagInfo->LocInfos[BufNum-1] = LocMDNode;
+ }
std::unique_ptr<MCAsmParser> Parser(
- createMCAsmParser(SrcMgr, OutContext, *OutStreamer, *MAI));
+ createMCAsmParser(SrcMgr, OutContext, *OutStreamer, *MAI, BufNum));
// We create a new MCInstrInfo here since we might be at the module level
// and not have a MachineFunction to initialize the TargetInstrInfo from and
@@ -151,7 +154,8 @@ void AsmPrinter::EmitInlineAsm(StringRef Str, const MCSubtargetInfo &STI,
int Res = Parser->Run(/*NoInitialTextSection*/ true,
/*NoFinalize*/ true);
emitInlineAsmEnd(STI, &TAP->getSTI());
- if (Res && !HasDiagHandler)
+
+ if (Res && !DiagInfo->DiagHandler)
report_fatal_error("Error parsing inline asm\n");
}
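The per-buffer location bookkeeping above boils down to a 1-based table keyed by SourceMgr buffer numbers. A self-contained sketch of that table; a void pointer stands in for const MDNode *:

#include <vector>

using LocMD = const void *; // stand-in for const MDNode *

struct LocInfoTable {
  std::vector<LocMD> LocInfos;

  // Buffer numbers handed out by SourceMgr are 1-based.
  void record(unsigned BufNum, LocMD MD) {
    if (!MD)
      return;
    if (LocInfos.size() < BufNum)
      LocInfos.resize(BufNum);
    LocInfos[BufNum - 1] = MD;
  }

  LocMD lookup(unsigned BufNum) const {
    if (BufNum == 0 || BufNum > LocInfos.size())
      return nullptr;
    return LocInfos[BufNum - 1];
  }
};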
diff --git a/lib/CodeGen/AsmPrinter/CodeViewDebug.cpp b/lib/CodeGen/AsmPrinter/CodeViewDebug.cpp
index 83440513225c..383b8cddb1a0 100644
--- a/lib/CodeGen/AsmPrinter/CodeViewDebug.cpp
+++ b/lib/CodeGen/AsmPrinter/CodeViewDebug.cpp
@@ -23,13 +23,13 @@
#include "llvm/DebugInfo/CodeView/TypeIndex.h"
#include "llvm/DebugInfo/CodeView/TypeRecord.h"
#include "llvm/DebugInfo/CodeView/TypeVisitorCallbacks.h"
-#include "llvm/DebugInfo/MSF/ByteStream.h"
-#include "llvm/DebugInfo/MSF/StreamReader.h"
#include "llvm/IR/Constants.h"
#include "llvm/MC/MCAsmInfo.h"
#include "llvm/MC/MCExpr.h"
#include "llvm/MC/MCSectionCOFF.h"
#include "llvm/MC/MCSymbol.h"
+#include "llvm/Support/BinaryByteStream.h"
+#include "llvm/Support/BinaryStreamReader.h"
#include "llvm/Support/COFF.h"
#include "llvm/Support/ScopedPrinter.h"
#include "llvm/Target/TargetFrameLowering.h"
@@ -38,7 +38,6 @@
using namespace llvm;
using namespace llvm::codeview;
-using namespace llvm::msf;
CodeViewDebug::CodeViewDebug(AsmPrinter *AP)
: DebugHandlerBase(AP), OS(*Asm->OutStreamer), Allocator(),
@@ -495,9 +494,9 @@ void CodeViewDebug::emitTypeInformation() {
// comments. The MSVC linker doesn't do much type record validation,
// so the first link of an invalid type record can succeed while
// subsequent links will fail with LNK1285.
- ByteStream Stream(Record);
+ BinaryByteStream Stream(Record, llvm::support::little);
CVTypeArray Types;
- StreamReader Reader(Stream);
+ BinaryStreamReader Reader(Stream);
Error E = Reader.readArray(Types, Reader.getLength());
if (!E) {
TypeVisitorCallbacks C;
@@ -948,10 +947,10 @@ void CodeViewDebug::collectVariableInfo(const DISubprogram *SP) {
// Handle fragments.
auto Fragment = DIExpr->getFragmentInfo();
- if (DIExpr && Fragment) {
+ if (Fragment) {
IsSubfield = true;
StructOffset = Fragment->OffsetInBits / 8;
- } else if (DIExpr && DIExpr->getNumElements() > 0) {
+ } else if (DIExpr->getNumElements() > 0) {
continue; // Ignore unrecognized exprs.
}
@@ -1014,14 +1013,7 @@ void CodeViewDebug::collectVariableInfo(const DISubprogram *SP) {
}
}
-void CodeViewDebug::beginFunction(const MachineFunction *MF) {
- assert(!CurFn && "Can't process two functions at once!");
-
- if (!Asm || !MMI->hasDebugInfo() || !MF->getFunction()->getSubprogram())
- return;
-
- DebugHandlerBase::beginFunction(MF);
-
+void CodeViewDebug::beginFunctionImpl(const MachineFunction *MF) {
const Function *GV = MF->getFunction();
assert(FnDebugInfo.count(GV) == false);
CurFn = &FnDebugInfo[GV];
@@ -1150,27 +1142,6 @@ TypeIndex CodeViewDebug::lowerTypeArray(const DICompositeType *Ty) {
uint64_t ElementSize = getBaseTypeSize(ElementTypeRef) / 8;
-
- // We want to assert that the element type multiplied by the array lengths
- // match the size of the overall array. However, if we don't have complete
- // type information for the base type, we can't make this assertion. This
- // happens if limited debug info is enabled in this case:
- // struct VTableOptzn { VTableOptzn(); virtual ~VTableOptzn(); };
- // VTableOptzn array[3];
- // The DICompositeType of VTableOptzn will have size zero, and the array will
- // have size 3 * sizeof(void*), and we should avoid asserting.
- //
- // There is a related bug in the front-end where an array of a structure,
- // which was declared as incomplete structure first, ends up not getting a
- // size assigned to it. (PR28303)
- // Example:
- // struct A(*p)[3];
- // struct A { int f; } a[3];
- bool PartiallyIncomplete = false;
- if (Ty->getSizeInBits() == 0 || ElementSize == 0) {
- PartiallyIncomplete = true;
- }
-
// Add subranges to array type.
DINodeArray Elements = Ty->getElements();
for (int i = Elements.size() - 1; i >= 0; --i) {
@@ -1185,16 +1156,14 @@ TypeIndex CodeViewDebug::lowerTypeArray(const DICompositeType *Ty) {
// Variable Length Array (VLA) has Count equal to '-1'.
// Replace with Count '1', assume it is the minimum VLA length.
// FIXME: Make front-end support VLA subrange and emit LF_DIMVARLU.
- if (Count == -1) {
+ if (Count == -1)
Count = 1;
- PartiallyIncomplete = true;
- }
// Update the element size and element type index for subsequent subranges.
ElementSize *= Count;
// If this is the outermost array, use the size from the array. It will be
- // more accurate if PartiallyIncomplete is true.
+ // more accurate if we had a VLA or an incomplete element type size.
uint64_t ArraySize =
(i == 0 && ElementSize == 0) ? Ty->getSizeInBits() / 8 : ElementSize;
@@ -1203,9 +1172,6 @@ TypeIndex CodeViewDebug::lowerTypeArray(const DICompositeType *Ty) {
ElementTypeIndex = TypeTable.writeKnownType(AR);
}
- (void)PartiallyIncomplete;
- assert(PartiallyIncomplete || ElementSize == (Ty->getSizeInBits() / 8));
-
return ElementTypeIndex;
}
@@ -2115,18 +2081,13 @@ void CodeViewDebug::emitLocalVariable(const LocalVariable &Var) {
}
}
-void CodeViewDebug::endFunction(const MachineFunction *MF) {
- if (!Asm || !CurFn) // We haven't created any debug info for this function.
- return;
-
+void CodeViewDebug::endFunctionImpl(const MachineFunction *MF) {
const Function *GV = MF->getFunction();
assert(FnDebugInfo.count(GV));
assert(CurFn == &FnDebugInfo[GV]);
collectVariableInfo(GV->getSubprogram());
- DebugHandlerBase::endFunction(MF);
-
// Don't emit anything if we don't have any line tables.
if (!CurFn->HaveLineInfo) {
FnDebugInfo.erase(GV);
diff --git a/lib/CodeGen/AsmPrinter/CodeViewDebug.h b/lib/CodeGen/AsmPrinter/CodeViewDebug.h
index 3dd4315e4c2f..343384c51772 100644
--- a/lib/CodeGen/AsmPrinter/CodeViewDebug.h
+++ b/lib/CodeGen/AsmPrinter/CodeViewDebug.h
@@ -299,6 +299,13 @@ class LLVM_LIBRARY_VISIBILITY CodeViewDebug : public DebugHandlerBase {
unsigned getPointerSizeInBytes();
+protected:
+ /// \brief Gather pre-function debug information.
+ void beginFunctionImpl(const MachineFunction *MF) override;
+
+ /// \brief Gather post-function debug information.
+ void endFunctionImpl(const MachineFunction *) override;
+
public:
CodeViewDebug(AsmPrinter *Asm);
@@ -307,12 +314,6 @@ public:
/// \brief Emit the COFF section that holds the line table information.
void endModule() override;
- /// \brief Gather pre-function debug information.
- void beginFunction(const MachineFunction *MF) override;
-
- /// \brief Gather post-function debug information.
- void endFunction(const MachineFunction *) override;
-
/// \brief Process beginning of an instruction.
void beginInstruction(const MachineInstr *MI) override;
};
diff --git a/lib/CodeGen/AsmPrinter/DIE.cpp b/lib/CodeGen/AsmPrinter/DIE.cpp
index 879918995472..b510e0ef36ac 100644
--- a/lib/CodeGen/AsmPrinter/DIE.cpp
+++ b/lib/CodeGen/AsmPrinter/DIE.cpp
@@ -42,6 +42,8 @@ void DIEAbbrevData::Profile(FoldingSetNodeID &ID) const {
// overloads. Otherwise MSVC 2010 thinks this call is ambiguous.
ID.AddInteger(unsigned(Attribute));
ID.AddInteger(unsigned(Form));
+ if (Form == dwarf::DW_FORM_implicit_const)
+ ID.AddInteger(Value);
}
//===----------------------------------------------------------------------===//
@@ -107,13 +109,20 @@ void DIEAbbrev::print(raw_ostream &O) {
O << " "
<< dwarf::AttributeString(Data[i].getAttribute())
<< " "
- << dwarf::FormEncodingString(Data[i].getForm())
- << '\n';
+ << dwarf::FormEncodingString(Data[i].getForm());
+
+ if (Data[i].getForm() == dwarf::DW_FORM_implicit_const)
+ O << " " << Data[i].getValue();
+
+ O << '\n';
}
}
-LLVM_DUMP_METHOD
-void DIEAbbrev::dump() { print(dbgs()); }
+#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
+LLVM_DUMP_METHOD void DIEAbbrev::dump() {
+ print(dbgs());
+}
+#endif
//===----------------------------------------------------------------------===//
// DIEAbbrevSet Implementation
@@ -249,10 +258,11 @@ void DIE::print(raw_ostream &O, unsigned IndentCount) const {
O << "\n";
}
-LLVM_DUMP_METHOD
-void DIE::dump() {
+#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
+LLVM_DUMP_METHOD void DIE::dump() {
print(dbgs());
}
+#endif
unsigned DIE::computeOffsetsAndAbbrevs(const AsmPrinter *AP,
DIEAbbrevSet &AbbrevSet,
@@ -340,10 +350,11 @@ void DIEValue::print(raw_ostream &O) const {
}
}
-LLVM_DUMP_METHOD
-void DIEValue::dump() const {
+#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
+LLVM_DUMP_METHOD void DIEValue::dump() const {
print(dbgs());
}
+#endif
//===----------------------------------------------------------------------===//
// DIEInteger Implementation
@@ -354,57 +365,42 @@ void DIEValue::dump() const {
void DIEInteger::EmitValue(const AsmPrinter *Asm, dwarf::Form Form) const {
switch (Form) {
case dwarf::DW_FORM_implicit_const:
- LLVM_FALLTHROUGH;
case dwarf::DW_FORM_flag_present:
// Emit something to keep the lines and comments in sync.
// FIXME: Is there a better way to do this?
Asm->OutStreamer->AddBlankLine();
return;
case dwarf::DW_FORM_flag:
- LLVM_FALLTHROUGH;
case dwarf::DW_FORM_ref1:
- LLVM_FALLTHROUGH;
case dwarf::DW_FORM_data1:
- LLVM_FALLTHROUGH;
+ case dwarf::DW_FORM_strx1:
+ case dwarf::DW_FORM_addrx1:
case dwarf::DW_FORM_ref2:
- LLVM_FALLTHROUGH;
case dwarf::DW_FORM_data2:
- LLVM_FALLTHROUGH;
+ case dwarf::DW_FORM_strx2:
+ case dwarf::DW_FORM_addrx2:
case dwarf::DW_FORM_strp:
- LLVM_FALLTHROUGH;
case dwarf::DW_FORM_ref4:
- LLVM_FALLTHROUGH;
case dwarf::DW_FORM_data4:
- LLVM_FALLTHROUGH;
+ case dwarf::DW_FORM_ref_sup4:
+ case dwarf::DW_FORM_strx4:
+ case dwarf::DW_FORM_addrx4:
case dwarf::DW_FORM_ref8:
- LLVM_FALLTHROUGH;
case dwarf::DW_FORM_ref_sig8:
- LLVM_FALLTHROUGH;
case dwarf::DW_FORM_data8:
- LLVM_FALLTHROUGH;
+ case dwarf::DW_FORM_ref_sup8:
case dwarf::DW_FORM_GNU_ref_alt:
- LLVM_FALLTHROUGH;
case dwarf::DW_FORM_GNU_strp_alt:
- LLVM_FALLTHROUGH;
case dwarf::DW_FORM_line_strp:
- LLVM_FALLTHROUGH;
case dwarf::DW_FORM_sec_offset:
- LLVM_FALLTHROUGH;
case dwarf::DW_FORM_strp_sup:
- LLVM_FALLTHROUGH;
- case dwarf::DW_FORM_ref_sup:
- LLVM_FALLTHROUGH;
case dwarf::DW_FORM_addr:
- LLVM_FALLTHROUGH;
case dwarf::DW_FORM_ref_addr:
Asm->OutStreamer->EmitIntValue(Integer, SizeOf(Asm, Form));
return;
case dwarf::DW_FORM_GNU_str_index:
- LLVM_FALLTHROUGH;
case dwarf::DW_FORM_GNU_addr_index:
- LLVM_FALLTHROUGH;
case dwarf::DW_FORM_ref_udata:
- LLVM_FALLTHROUGH;
case dwarf::DW_FORM_udata:
Asm->EmitULEB128(Integer);
return;
@@ -419,35 +415,41 @@ void DIEInteger::EmitValue(const AsmPrinter *Asm, dwarf::Form Form) const {
///
unsigned DIEInteger::SizeOf(const AsmPrinter *AP, dwarf::Form Form) const {
switch (Form) {
- case dwarf::DW_FORM_implicit_const: LLVM_FALLTHROUGH;
- case dwarf::DW_FORM_flag_present: return 0;
- case dwarf::DW_FORM_flag: LLVM_FALLTHROUGH;
- case dwarf::DW_FORM_ref1: LLVM_FALLTHROUGH;
- case dwarf::DW_FORM_data1: return sizeof(int8_t);
- case dwarf::DW_FORM_ref2: LLVM_FALLTHROUGH;
- case dwarf::DW_FORM_data2: return sizeof(int16_t);
- case dwarf::DW_FORM_ref4: LLVM_FALLTHROUGH;
- case dwarf::DW_FORM_data4: return sizeof(int32_t);
- case dwarf::DW_FORM_ref8: LLVM_FALLTHROUGH;
- case dwarf::DW_FORM_ref_sig8: LLVM_FALLTHROUGH;
- case dwarf::DW_FORM_data8: return sizeof(int64_t);
+ case dwarf::DW_FORM_implicit_const:
+ case dwarf::DW_FORM_flag_present:
+ return 0;
+ case dwarf::DW_FORM_flag:
+ case dwarf::DW_FORM_ref1:
+ case dwarf::DW_FORM_data1:
+ case dwarf::DW_FORM_strx1:
+ case dwarf::DW_FORM_addrx1:
+ return sizeof(int8_t);
+ case dwarf::DW_FORM_ref2:
+ case dwarf::DW_FORM_data2:
+ case dwarf::DW_FORM_strx2:
+ case dwarf::DW_FORM_addrx2:
+ return sizeof(int16_t);
+ case dwarf::DW_FORM_ref4:
+ case dwarf::DW_FORM_data4:
+ case dwarf::DW_FORM_ref_sup4:
+ case dwarf::DW_FORM_strx4:
+ case dwarf::DW_FORM_addrx4:
+ return sizeof(int32_t);
+ case dwarf::DW_FORM_ref8:
+ case dwarf::DW_FORM_ref_sig8:
+ case dwarf::DW_FORM_data8:
+ case dwarf::DW_FORM_ref_sup8:
+ return sizeof(int64_t);
case dwarf::DW_FORM_ref_addr:
if (AP->getDwarfVersion() == 2)
return AP->getPointerSize();
LLVM_FALLTHROUGH;
case dwarf::DW_FORM_strp:
- LLVM_FALLTHROUGH;
case dwarf::DW_FORM_GNU_ref_alt:
- LLVM_FALLTHROUGH;
case dwarf::DW_FORM_GNU_strp_alt:
- LLVM_FALLTHROUGH;
case dwarf::DW_FORM_line_strp:
- LLVM_FALLTHROUGH;
case dwarf::DW_FORM_sec_offset:
- LLVM_FALLTHROUGH;
case dwarf::DW_FORM_strp_sup:
- LLVM_FALLTHROUGH;
- case dwarf::DW_FORM_ref_sup:
switch (AP->OutStreamer->getContext().getDwarfFormat()) {
case dwarf::DWARF32:
return 4;
@@ -456,11 +458,8 @@ unsigned DIEInteger::SizeOf(const AsmPrinter *AP, dwarf::Form Form) const {
}
llvm_unreachable("Invalid DWARF format");
case dwarf::DW_FORM_GNU_str_index:
- LLVM_FALLTHROUGH;
case dwarf::DW_FORM_GNU_addr_index:
- LLVM_FALLTHROUGH;
case dwarf::DW_FORM_ref_udata:
- LLVM_FALLTHROUGH;
case dwarf::DW_FORM_udata:
return getULEB128Size(Integer);
case dwarf::DW_FORM_sdata:
@@ -484,7 +483,7 @@ void DIEInteger::print(raw_ostream &O) const {
/// EmitValue - Emit expression value.
///
void DIEExpr::EmitValue(const AsmPrinter *AP, dwarf::Form Form) const {
- AP->EmitDebugValue(Expr, SizeOf(AP, Form));
+ AP->EmitDebugThreadLocal(Expr, SizeOf(AP, Form));
}
/// SizeOf - Determine size of expression value in bytes.
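
The Profile() change at the top of this file matters because DW_FORM_implicit_const carries its constant in the abbreviation itself rather than in the DIE data, so the value has to participate in abbreviation uniquing. Here is a minimal sketch of why, using a plain tuple key as a hypothetical stand-in for FoldingSetNodeID; the attribute and form codes are illustrative.

    #include <cassert>
    #include <cstdint>
    #include <set>
    #include <tuple>

    // Hypothetical stand-in for an abbreviation entry: attribute, form, and
    // the implicit value (meaningful only for DW_FORM_implicit_const).
    struct AbbrevData {
      unsigned Attribute;
      unsigned Form;
      int64_t Value;
    };

    constexpr unsigned DW_FORM_implicit_const = 0x21;

    // Uniquing key: the value participates only for implicit_const, exactly
    // as the DIEAbbrevData::Profile change above arranges.
    std::tuple<unsigned, unsigned, int64_t> profileKey(const AbbrevData &D) {
      int64_t V = (D.Form == DW_FORM_implicit_const) ? D.Value : 0;
      return {D.Attribute, D.Form, V};
    }

    int main() {
      std::set<std::tuple<unsigned, unsigned, int64_t>> Unique;
      // Same attribute/form but different implicit constants must produce
      // two distinct abbreviations; without Value in the key they collapse.
      Unique.insert(profileKey({0x3a, DW_FORM_implicit_const, 1}));
      Unique.insert(profileKey({0x3a, DW_FORM_implicit_const, 2}));
      assert(Unique.size() == 2);
    }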
diff --git a/lib/CodeGen/AsmPrinter/DIEHash.cpp b/lib/CodeGen/AsmPrinter/DIEHash.cpp
index d8ecc7ccfb9b..8e3b88d0af0e 100644
--- a/lib/CodeGen/AsmPrinter/DIEHash.cpp
+++ b/lib/CodeGen/AsmPrinter/DIEHash.cpp
@@ -490,9 +490,9 @@ uint64_t DIEHash::computeCUSignature(const DIE &Die) {
Hash.final(Result);
// ... take the least significant 8 bytes and return those. Our MD5
- // implementation always returns its results in little endian, swap bytes
- // appropriately.
- return support::endian::read64le(Result + 8);
+ // implementation always returns its results in little endian, so we actually
+ // need the "high" word.
+ return Result.high();
}
/// This is based on the type signature computation given in section 7.27 of the
@@ -514,7 +514,7 @@ uint64_t DIEHash::computeTypeSignature(const DIE &Die) {
Hash.final(Result);
// ... take the least significant 8 bytes and return those. Our MD5
- // implementation always returns its results in little endian, swap bytes
- // appropriately.
- return support::endian::read64le(Result + 8);
+ // implementation always returns its results in little endian, so we actually
+ // need the "high" word.
+ return Result.high();
}
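
Both hunks above replace a manual little-endian read of bytes 8..15 with MD5Result::high(). The two are equivalent only because the digest buffer is laid out little-endian; below is a small self-contained check of that equivalence, with a made-up 16-byte digest, assuming high() denotes the second 8 bytes decoded little-endian.

    #include <cassert>
    #include <cstdint>

    // Decode 8 bytes starting at P as a little-endian 64-bit value -- what
    // support::endian::read64le(Result + 8) did before this change.
    uint64_t read64le(const uint8_t *P) {
      uint64_t V = 0;
      for (int i = 7; i >= 0; --i)
        V = (V << 8) | P[i];
      return V;
    }

    int main() {
      // Hypothetical 16-byte MD5 digest.
      uint8_t Digest[16];
      for (int i = 0; i < 16; ++i)
        Digest[i] = uint8_t(0x10 + i);
      // The "high" half is assumed to be the second 8 bytes of the buffer,
      // read little-endian -- i.e. exactly read64le(Digest + 8).
      uint64_t High = read64le(Digest + 8);
      assert(High == 0x1f1e1d1c1b1a1918ULL);
    }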
diff --git a/lib/CodeGen/AsmPrinter/DebugHandlerBase.cpp b/lib/CodeGen/AsmPrinter/DebugHandlerBase.cpp
index 94190981e88e..1d63e33a4d33 100644
--- a/lib/CodeGen/AsmPrinter/DebugHandlerBase.cpp
+++ b/lib/CodeGen/AsmPrinter/DebugHandlerBase.cpp
@@ -115,12 +115,35 @@ uint64_t DebugHandlerBase::getBaseTypeSize(const DITypeRef TyRef) {
return getBaseTypeSize(BaseType);
}
+bool hasDebugInfo(const MachineModuleInfo *MMI, const MachineFunction *MF) {
+ if (!MMI->hasDebugInfo())
+ return false;
+ auto *SP = MF->getFunction()->getSubprogram();
+ if (!SP)
+ return false;
+ assert(SP->getUnit());
+ auto EK = SP->getUnit()->getEmissionKind();
+ if (EK == DICompileUnit::NoDebug)
+ return false;
+ return true;
+}
+
void DebugHandlerBase::beginFunction(const MachineFunction *MF) {
+ assert(Asm);
+ PrevInstBB = nullptr;
+
+ if (!hasDebugInfo(MMI, MF)) {
+ skippedNonDebugFunction();
+ return;
+ }
+
// Grab the lexical scopes for the function, if we don't have any of those
// then we're not going to be able to do anything.
LScopes.initialize(*MF);
- if (LScopes.empty())
+ if (LScopes.empty()) {
+ beginFunctionImpl(MF);
return;
+ }
// Make sure that each lexical scope will have a begin/end label.
identifyScopeMarkers();
@@ -167,6 +190,7 @@ void DebugHandlerBase::beginFunction(const MachineFunction *MF) {
PrevInstLoc = DebugLoc();
PrevLabel = Asm->getFunctionBegin();
+ beginFunctionImpl(MF);
}
void DebugHandlerBase::beginInstruction(const MachineInstr *MI) {
@@ -228,6 +252,8 @@ void DebugHandlerBase::endInstruction() {
}
void DebugHandlerBase::endFunction(const MachineFunction *MF) {
+ if (hasDebugInfo(MMI, MF))
+ endFunctionImpl(MF);
DbgValues.clear();
LabelsBeforeInsn.clear();
LabelsAfterInsn.clear();
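
The beginFunction/endFunction changes above turn the base class into a template method: the has-debug-info gate lives in one place, and subclasses only implement the Impl hooks plus an optional skippedNonDebugFunction notification. A reduced sketch of that shape follows, with MachineFunction shrunk to a single bool purely for illustration.

    #include <iostream>

    struct MachineFunction { bool HasDebugInfo; };

    // Base class owns the gate; subclasses never re-check it.
    class DebugHandlerBase {
    public:
      void beginFunction(const MachineFunction *MF) {
        if (!MF->HasDebugInfo) {
          skippedNonDebugFunction();
          return;
        }
        // ...common setup (label maps, lexical scopes) would happen here...
        beginFunctionImpl(MF);
      }
      void endFunction(const MachineFunction *MF) {
        if (MF->HasDebugInfo)
          endFunctionImpl(MF);
        // ...common cleanup runs unconditionally, as in the real endFunction.
      }
    protected:
      virtual void beginFunctionImpl(const MachineFunction *MF) = 0;
      virtual void endFunctionImpl(const MachineFunction *MF) = 0;
      virtual void skippedNonDebugFunction() {}
    };

    class DwarfLikeHandler : public DebugHandlerBase {
      void beginFunctionImpl(const MachineFunction *) override {
        std::cout << "begin\n";
      }
      void endFunctionImpl(const MachineFunction *) override {
        std::cout << "end\n";
      }
      void skippedNonDebugFunction() override { std::cout << "skipped\n"; }
    };

    int main() {
      DwarfLikeHandler H;
      MachineFunction WithDI{true}, WithoutDI{false};
      H.beginFunction(&WithDI);    H.endFunction(&WithDI);    // begin / end
      H.beginFunction(&WithoutDI); H.endFunction(&WithoutDI); // skipped
    }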
diff --git a/lib/CodeGen/AsmPrinter/DebugHandlerBase.h b/lib/CodeGen/AsmPrinter/DebugHandlerBase.h
index c00fa189d94a..659a921e1fc5 100644
--- a/lib/CodeGen/AsmPrinter/DebugHandlerBase.h
+++ b/lib/CodeGen/AsmPrinter/DebugHandlerBase.h
@@ -80,6 +80,10 @@ protected:
LabelsAfterInsn.insert(std::make_pair(MI, nullptr));
}
+ virtual void beginFunctionImpl(const MachineFunction *MF) = 0;
+ virtual void endFunctionImpl(const MachineFunction *MF) = 0;
+ virtual void skippedNonDebugFunction() {}
+
// AsmPrinterHandler overrides.
public:
void beginInstruction(const MachineInstr *MI) override;
diff --git a/lib/CodeGen/AsmPrinter/DebugLocEntry.h b/lib/CodeGen/AsmPrinter/DebugLocEntry.h
index 36fb1507ddc6..a68e8cc6b4b3 100644
--- a/lib/CodeGen/AsmPrinter/DebugLocEntry.h
+++ b/lib/CodeGen/AsmPrinter/DebugLocEntry.h
@@ -76,7 +76,8 @@ public:
const DIExpression *getExpression() const { return Expression; }
friend bool operator==(const Value &, const Value &);
friend bool operator<(const Value &, const Value &);
- void dump() const {
+#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
+ LLVM_DUMP_METHOD void dump() const {
if (isLocation()) {
llvm::dbgs() << "Loc = { reg=" << Loc.getReg() << " ";
if (Loc.isIndirect())
@@ -90,6 +91,7 @@ public:
if (Expression)
Expression->dump();
}
+#endif
};
private:
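
The guard added around dump() here (and in DIE.cpp above) is the usual idiom for keeping dump methods out of release binaries while still allowing them via LLVM_ENABLE_DUMP. A tiny sketch of the pattern, with LLVM_DUMP_METHOD's noinline/used attributes omitted for brevity:

    #include <iostream>

    struct Entry {
      int Reg = 5;
    #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
      // Compiled into asserts builds, or anywhere LLVM_ENABLE_DUMP is set.
      void dump() const { std::cerr << "Loc = { reg=" << Reg << " }\n"; }
    #endif
    };

    int main() {
    #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
      Entry().dump(); // available for debugger use in such builds
    #endif
    }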
diff --git a/lib/CodeGen/AsmPrinter/DwarfCompileUnit.cpp b/lib/CodeGen/AsmPrinter/DwarfCompileUnit.cpp
index d904372af589..a550ff2fb90f 100644
--- a/lib/CodeGen/AsmPrinter/DwarfCompileUnit.cpp
+++ b/lib/CodeGen/AsmPrinter/DwarfCompileUnit.cpp
@@ -1,3 +1,16 @@
+//===-- llvm/CodeGen/DwarfCompileUnit.cpp - Dwarf Compile Units -----------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains support for constructing a dwarf compile unit.
+//
+//===----------------------------------------------------------------------===//
+
#include "DwarfCompileUnit.h"
#include "DwarfExpression.h"
#include "llvm/CodeGen/MachineFunction.h"
@@ -129,67 +142,72 @@ DIE *DwarfCompileUnit::getOrCreateGlobalVariableDIE(
bool addToAccelTable = false;
DIELoc *Loc = nullptr;
std::unique_ptr<DIEDwarfExpression> DwarfExpr;
- bool AllConstant = std::all_of(
- GlobalExprs.begin(), GlobalExprs.end(),
- [&](const GlobalExpr GE) {
- return GE.Expr && GE.Expr->isConstant();
- });
-
for (const auto &GE : GlobalExprs) {
const GlobalVariable *Global = GE.Var;
const DIExpression *Expr = GE.Expr;
+
// For compatibility with DWARF 3 and earlier,
// DW_AT_location(DW_OP_constu, X, DW_OP_stack_value) becomes
// DW_AT_const_value(X).
if (GlobalExprs.size() == 1 && Expr && Expr->isConstant()) {
+ addToAccelTable = true;
addConstantValue(*VariableDIE, /*Unsigned=*/true, Expr->getElement(1));
- // We cannot describe the location of dllimport'd variables: the
- // computation of their address requires loads from the IAT.
- } else if ((Global && !Global->hasDLLImportStorageClass()) || AllConstant) {
- if (!Loc) {
- Loc = new (DIEValueAllocator) DIELoc;
- DwarfExpr = llvm::make_unique<DIEDwarfExpression>(*Asm, *this, *Loc);
- }
+ break;
+ }
+
+ // We cannot describe the location of dllimport'd variables: the
+ // computation of their address requires loads from the IAT.
+ if (Global && Global->hasDLLImportStorageClass())
+ continue;
+
+ // Nothing to describe without address or constant.
+ if (!Global && (!Expr || !Expr->isConstant()))
+ continue;
+
+ if (!Loc) {
addToAccelTable = true;
- if (Global) {
- const MCSymbol *Sym = Asm->getSymbol(Global);
- if (Global->isThreadLocal()) {
- if (Asm->TM.Options.EmulatedTLS) {
- // TODO: add debug info for emulated thread local mode.
- } else {
- // FIXME: Make this work with -gsplit-dwarf.
- unsigned PointerSize = Asm->getDataLayout().getPointerSize();
- assert((PointerSize == 4 || PointerSize == 8) &&
- "Add support for other sizes if necessary");
- // Based on GCC's support for TLS:
- if (!DD->useSplitDwarf()) {
- // 1) Start with a constNu of the appropriate pointer size
- addUInt(*Loc, dwarf::DW_FORM_data1,
- PointerSize == 4 ? dwarf::DW_OP_const4u
- : dwarf::DW_OP_const8u);
- // 2) containing the (relocated) offset of the TLS variable
- // within the module's TLS block.
- addExpr(*Loc, dwarf::DW_FORM_udata,
- Asm->getObjFileLowering().getDebugThreadLocalSymbol(Sym));
- } else {
- addUInt(*Loc, dwarf::DW_FORM_data1, dwarf::DW_OP_GNU_const_index);
- addUInt(*Loc, dwarf::DW_FORM_udata,
- DD->getAddressPool().getIndex(Sym, /* TLS */ true));
- }
- // 3) followed by an OP to make the debugger do a TLS lookup.
+ Loc = new (DIEValueAllocator) DIELoc;
+ DwarfExpr = llvm::make_unique<DIEDwarfExpression>(*Asm, *this, *Loc);
+ }
+
+ if (Global) {
+ const MCSymbol *Sym = Asm->getSymbol(Global);
+ if (Global->isThreadLocal()) {
+ if (Asm->TM.Options.EmulatedTLS) {
+ // TODO: add debug info for emulated thread local mode.
+ } else {
+ // FIXME: Make this work with -gsplit-dwarf.
+ unsigned PointerSize = Asm->getDataLayout().getPointerSize();
+ assert((PointerSize == 4 || PointerSize == 8) &&
+ "Add support for other sizes if necessary");
+ // Based on GCC's support for TLS:
+ if (!DD->useSplitDwarf()) {
+ // 1) Start with a constNu of the appropriate pointer size
addUInt(*Loc, dwarf::DW_FORM_data1,
- DD->useGNUTLSOpcode() ? dwarf::DW_OP_GNU_push_tls_address
- : dwarf::DW_OP_form_tls_address);
+ PointerSize == 4 ? dwarf::DW_OP_const4u
+ : dwarf::DW_OP_const8u);
+ // 2) containing the (relocated) offset of the TLS variable
+ // within the module's TLS block.
+ addExpr(*Loc, dwarf::DW_FORM_udata,
+ Asm->getObjFileLowering().getDebugThreadLocalSymbol(Sym));
+ } else {
+ addUInt(*Loc, dwarf::DW_FORM_data1, dwarf::DW_OP_GNU_const_index);
+ addUInt(*Loc, dwarf::DW_FORM_udata,
+ DD->getAddressPool().getIndex(Sym, /* TLS */ true));
}
- } else {
- DD->addArangeLabel(SymbolCU(this, Sym));
- addOpAddress(*Loc, Sym);
+ // 3) followed by an OP to make the debugger do a TLS lookup.
+ addUInt(*Loc, dwarf::DW_FORM_data1,
+ DD->useGNUTLSOpcode() ? dwarf::DW_OP_GNU_push_tls_address
+ : dwarf::DW_OP_form_tls_address);
}
+ } else {
+ DD->addArangeLabel(SymbolCU(this, Sym));
+ addOpAddress(*Loc, Sym);
}
- if (Expr) {
- DwarfExpr->addFragmentOffset(Expr);
- DwarfExpr->AddExpression(Expr);
- }
+ }
+ if (Expr) {
+ DwarfExpr->addFragmentOffset(Expr);
+ DwarfExpr->addExpression(Expr);
}
}
if (Loc)
@@ -507,8 +525,8 @@ DIE *DwarfCompileUnit::constructVariableDIEImpl(const DbgVariable &DV,
DIEDwarfExpression DwarfExpr(*Asm, *this, *Loc);
// If there is an expression, emit raw unsigned bytes.
DwarfExpr.addFragmentOffset(Expr);
- DwarfExpr.AddUnsignedConstant(DVInsn->getOperand(0).getImm());
- DwarfExpr.AddExpression(Expr);
+ DwarfExpr.addUnsignedConstant(DVInsn->getOperand(0).getImm());
+ DwarfExpr.addExpression(Expr);
addBlock(*VariableDie, dwarf::DW_AT_location, DwarfExpr.finalize());
} else
addConstantValue(*VariableDie, DVInsn->getOperand(0), DV.getType());
@@ -532,9 +550,15 @@ DIE *DwarfCompileUnit::constructVariableDIEImpl(const DbgVariable &DV,
const TargetFrameLowering *TFI = Asm->MF->getSubtarget().getFrameLowering();
int Offset = TFI->getFrameIndexReference(*Asm->MF, Fragment.FI, FrameReg);
DwarfExpr.addFragmentOffset(Fragment.Expr);
- DwarfExpr.AddMachineRegIndirect(*Asm->MF->getSubtarget().getRegisterInfo(),
- FrameReg, Offset);
- DwarfExpr.AddExpression(Fragment.Expr);
+ SmallVector<uint64_t, 8> Ops;
+ Ops.push_back(dwarf::DW_OP_plus);
+ Ops.push_back(Offset);
+ Ops.push_back(dwarf::DW_OP_deref);
+ Ops.append(Fragment.Expr->elements_begin(), Fragment.Expr->elements_end());
+ DIExpressionCursor Expr(Ops);
+ DwarfExpr.addMachineRegExpression(
+ *Asm->MF->getSubtarget().getRegisterInfo(), Expr, FrameReg);
+ DwarfExpr.addExpression(std::move(Expr));
}
addBlock(*VariableDie, dwarf::DW_AT_location, DwarfExpr.finalize());
@@ -690,11 +714,14 @@ void DwarfCompileUnit::emitHeader(bool UseOffsets) {
Asm->OutStreamer->EmitLabel(LabelBegin);
}
- DwarfUnit::emitHeader(UseOffsets);
+ dwarf::UnitType UT = Skeleton ? dwarf::DW_UT_split_compile
+ : DD->useSplitDwarf() ? dwarf::DW_UT_skeleton
+ : dwarf::DW_UT_compile;
+ DwarfUnit::emitCommonHeader(UseOffsets, UT);
}
/// addGlobalName - Add a new global name to the compile unit.
-void DwarfCompileUnit::addGlobalName(StringRef Name, DIE &Die,
+void DwarfCompileUnit::addGlobalName(StringRef Name, const DIE &Die,
const DIScope *Context) {
if (includeMinimalInlineScopes())
return;
@@ -702,6 +729,18 @@ void DwarfCompileUnit::addGlobalName(StringRef Name, DIE &Die,
GlobalNames[FullName] = &Die;
}
+void DwarfCompileUnit::addGlobalNameForTypeUnit(StringRef Name,
+ const DIScope *Context) {
+ if (includeMinimalInlineScopes())
+ return;
+ std::string FullName = getParentContextString(Context) + Name.str();
+ // Insert, allowing the entry to remain as-is if it's already present
+ // This way the CU-level type DIE is preferred over the "can't describe this
+ // type as a unit offset because it's not really in the CU at all, it's only
+ // in a type unit"
+ GlobalNames.insert(std::make_pair(std::move(FullName), &getUnitDie()));
+}
+
/// Add a new global type to the unit.
void DwarfCompileUnit::addGlobalType(const DIType *Ty, const DIE &Die,
const DIScope *Context) {
@@ -711,6 +750,18 @@ void DwarfCompileUnit::addGlobalType(const DIType *Ty, const DIE &Die,
GlobalTypes[FullName] = &Die;
}
+void DwarfCompileUnit::addGlobalTypeUnitType(const DIType *Ty,
+ const DIScope *Context) {
+ if (includeMinimalInlineScopes())
+ return;
+ std::string FullName = getParentContextString(Context) + Ty->getName().str();
+ // Insert, allowing the entry to remain as-is if it's already present
+ // This way the CU-level type DIE is preferred over the "can't describe this
+ // type as a unit offset because it's not really in the CU at all, it's only
+ // in a type unit"
+ GlobalTypes.insert(std::make_pair(std::move(FullName), &getUnitDie()));
+}
+
/// addVariableAddress - Add DW_AT_location attribute for a
/// DbgVariable based on provided MachineLocation.
void DwarfCompileUnit::addVariableAddress(const DbgVariable &DV, DIE &Die,
@@ -727,22 +778,22 @@ void DwarfCompileUnit::addVariableAddress(const DbgVariable &DV, DIE &Die,
void DwarfCompileUnit::addAddress(DIE &Die, dwarf::Attribute Attribute,
const MachineLocation &Location) {
DIELoc *Loc = new (DIEValueAllocator) DIELoc;
- DIEDwarfExpression Expr(*Asm, *this, *Loc);
-
- bool validReg;
- if (Location.isReg())
- validReg = Expr.AddMachineReg(*Asm->MF->getSubtarget().getRegisterInfo(),
- Location.getReg());
- else
- validReg =
- Expr.AddMachineRegIndirect(*Asm->MF->getSubtarget().getRegisterInfo(),
- Location.getReg(), Location.getOffset());
+ DIEDwarfExpression DwarfExpr(*Asm, *this, *Loc);
- if (!validReg)
+ SmallVector<uint64_t, 8> Ops;
+ if (Location.isIndirect()) {
+ Ops.push_back(dwarf::DW_OP_plus);
+ Ops.push_back(Location.getOffset());
+ Ops.push_back(dwarf::DW_OP_deref);
+ }
+ DIExpressionCursor Cursor(Ops);
+ const TargetRegisterInfo &TRI = *Asm->MF->getSubtarget().getRegisterInfo();
+ if (!DwarfExpr.addMachineRegExpression(TRI, Cursor, Location.getReg()))
return;
+ DwarfExpr.addExpression(std::move(Cursor));
// Now attach the location information to the DIE.
- addBlock(Die, Attribute, Expr.finalize());
+ addBlock(Die, Attribute, DwarfExpr.finalize());
}
/// Start with the address based on the location provided, and generate the
@@ -754,23 +805,24 @@ void DwarfCompileUnit::addComplexAddress(const DbgVariable &DV, DIE &Die,
const MachineLocation &Location) {
DIELoc *Loc = new (DIEValueAllocator) DIELoc;
DIEDwarfExpression DwarfExpr(*Asm, *this, *Loc);
- const DIExpression *Expr = DV.getSingleExpression();
- DIExpressionCursor ExprCursor(Expr);
+ const DIExpression *DIExpr = DV.getSingleExpression();
+ DwarfExpr.addFragmentOffset(DIExpr);
+
+ SmallVector<uint64_t, 8> Ops;
+ if (Location.isIndirect()) {
+ Ops.push_back(dwarf::DW_OP_plus);
+ Ops.push_back(Location.getOffset());
+ Ops.push_back(dwarf::DW_OP_deref);
+ }
+ Ops.append(DIExpr->elements_begin(), DIExpr->elements_end());
+ DIExpressionCursor Cursor(Ops);
const TargetRegisterInfo &TRI = *Asm->MF->getSubtarget().getRegisterInfo();
- auto Reg = Location.getReg();
- DwarfExpr.addFragmentOffset(Expr);
- bool ValidReg =
- Location.getOffset()
- ? DwarfExpr.AddMachineRegIndirect(TRI, Reg, Location.getOffset())
- : DwarfExpr.AddMachineRegExpression(TRI, ExprCursor, Reg);
-
- if (!ValidReg)
+ if (!DwarfExpr.addMachineRegExpression(TRI, Cursor, Location.getReg()))
return;
-
- DwarfExpr.AddExpression(std::move(ExprCursor));
+ DwarfExpr.addExpression(std::move(Cursor));
// Now attach the location information to the DIE.
- addBlock(Die, Attribute, Loc);
+ addBlock(Die, Attribute, DwarfExpr.finalize());
}
/// Add a Dwarf loclistptr attribute data and value.
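
A recurring pattern in these hunks is that an indirect MachineLocation no longer goes through a dedicated AddMachineRegIndirect; instead a [DW_OP_plus, offset, DW_OP_deref] prefix is spliced ahead of the variable's DIExpression and the combined stream is handed to addMachineRegExpression/addExpression. A small sketch of just the splicing step, with illustrative opcode values:

    #include <cstdint>
    #include <iostream>
    #include <vector>

    constexpr uint64_t DW_OP_plus = 0x22, DW_OP_deref = 0x06;

    // Build the operation stream the cursor will consume: location prefix
    // first, then the variable's own expression elements.
    std::vector<uint64_t> buildOps(bool Indirect, int64_t Offset,
                                   const std::vector<uint64_t> &DIExprElts) {
      std::vector<uint64_t> Ops;
      if (Indirect) {
        Ops.push_back(DW_OP_plus);
        Ops.push_back(uint64_t(Offset));
        Ops.push_back(DW_OP_deref);
      }
      // Append the DIExpression, as the diff does with
      // elements_begin()/elements_end().
      Ops.insert(Ops.end(), DIExprElts.begin(), DIExprElts.end());
      return Ops;
    }

    int main() {
      // A variable 8 bytes past the base register, empty DIExpression.
      for (uint64_t Op : buildOps(/*Indirect=*/true, 8, {}))
        std::cout << std::hex << Op << " "; // 22 8 6
      std::cout << "\n";
    }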
diff --git a/lib/CodeGen/AsmPrinter/DwarfCompileUnit.h b/lib/CodeGen/AsmPrinter/DwarfCompileUnit.h
index a8025f1d1521..9a64b4b76b06 100644
--- a/lib/CodeGen/AsmPrinter/DwarfCompileUnit.h
+++ b/lib/CodeGen/AsmPrinter/DwarfCompileUnit.h
@@ -210,12 +210,19 @@ public:
}
/// Add a new global name to the compile unit.
- void addGlobalName(StringRef Name, DIE &Die, const DIScope *Context) override;
+ void addGlobalName(StringRef Name, const DIE &Die,
+ const DIScope *Context) override;
+
+ /// Add a new global name present in a type unit to this compile unit.
+ void addGlobalNameForTypeUnit(StringRef Name, const DIScope *Context);
/// Add a new global type to the compile unit.
void addGlobalType(const DIType *Ty, const DIE &Die,
const DIScope *Context) override;
+ /// Add a new global type present in a type unit to this compile unit.
+ void addGlobalTypeUnitType(const DIType *Ty, const DIScope *Context);
+
const StringMap<const DIE *> &getGlobalNames() const { return GlobalNames; }
const StringMap<const DIE *> &getGlobalTypes() const { return GlobalTypes; }
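
addGlobalNameForTypeUnit and addGlobalTypeUnitType rely on insert() leaving an existing entry untouched, so a real CU-level DIE recorded earlier always wins over the type-unit fallback that points at the CU DIE. A tiny model of that behavior, with std::map standing in for llvm::StringMap and dummy objects for the DIEs:

    #include <iostream>
    #include <map>
    #include <string>

    int main() {
      std::map<std::string, const void *> GlobalTypes;
      int TypeDIE, UnitDIE; // fake stand-ins for DIE objects
      GlobalTypes.insert({"ns::Foo", &TypeDIE}); // CU-level type DIE wins...
      GlobalTypes.insert({"ns::Foo", &UnitDIE}); // ...this no-ops, as intended
      std::cout << (GlobalTypes["ns::Foo"] == &TypeDIE) << "\n"; // 1
    }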
diff --git a/lib/CodeGen/AsmPrinter/DwarfDebug.cpp b/lib/CodeGen/AsmPrinter/DwarfDebug.cpp
index 91a3d0989cc5..5ce111309208 100644
--- a/lib/CodeGen/AsmPrinter/DwarfDebug.cpp
+++ b/lib/CodeGen/AsmPrinter/DwarfDebug.cpp
@@ -39,7 +39,6 @@
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/Dwarf.h"
-#include "llvm/Support/Endian.h"
#include "llvm/Support/ErrorHandling.h"
#include "llvm/Support/FormattedStream.h"
#include "llvm/Support/LEB128.h"
@@ -127,17 +126,17 @@ static const char *const DWARFGroupDescription = "DWARF Emission";
static const char *const DbgTimerName = "writer";
static const char *const DbgTimerDescription = "DWARF Debug Writer";
-void DebugLocDwarfExpression::EmitOp(uint8_t Op, const char *Comment) {
+void DebugLocDwarfExpression::emitOp(uint8_t Op, const char *Comment) {
BS.EmitInt8(
Op, Comment ? Twine(Comment) + " " + dwarf::OperationEncodingString(Op)
: dwarf::OperationEncodingString(Op));
}
-void DebugLocDwarfExpression::EmitSigned(int64_t Value) {
+void DebugLocDwarfExpression::emitSigned(int64_t Value) {
BS.EmitSLEB128(Value, Twine(Value));
}
-void DebugLocDwarfExpression::EmitUnsigned(uint64_t Value) {
+void DebugLocDwarfExpression::emitUnsigned(uint64_t Value) {
BS.EmitULEB128(Value, Twine(Value));
}
@@ -200,6 +199,12 @@ const DIType *DbgVariable::getType() const {
}
ArrayRef<DbgVariable::FrameIndexExpr> DbgVariable::getFrameIndexExprs() const {
+ if (FrameIndexExprs.size() == 1)
+ return FrameIndexExprs;
+
+ assert(all_of(FrameIndexExprs,
+ [](const FrameIndexExpr &A) { return A.Expr->isFragment(); }) &&
+ "multiple FI expressions without DW_OP_LLVM_fragment");
std::sort(FrameIndexExprs.begin(), FrameIndexExprs.end(),
[](const FrameIndexExpr &A, const FrameIndexExpr &B) -> bool {
return A.Expr->getFragmentInfo()->OffsetInBits <
@@ -418,7 +423,14 @@ DwarfDebug::constructDwarfCompileUnit(const DICompileUnit *DIUnit) {
Asm->OutStreamer->getContext().setMCLineTableCompilationDir(
NewCU.getUniqueID(), CompilationDir);
- NewCU.addString(Die, dwarf::DW_AT_producer, DIUnit->getProducer());
+ StringRef Producer = DIUnit->getProducer();
+ StringRef Flags = DIUnit->getFlags();
+ if (!Flags.empty()) {
+ std::string ProducerWithFlags = Producer.str() + " " + Flags.str();
+ NewCU.addString(Die, dwarf::DW_AT_producer, ProducerWithFlags);
+ } else
+ NewCU.addString(Die, dwarf::DW_AT_producer, Producer);
+
NewCU.addUInt(Die, dwarf::DW_AT_language, dwarf::DW_FORM_data2,
DIUnit->getSourceLanguage());
NewCU.addString(Die, dwarf::DW_AT_name, FN);
@@ -544,7 +556,6 @@ void DwarfDebug::beginModule() {
// The retained types array by design contains pointers to
// MDNodes rather than DIRefs. Unique them here.
if (DIType *RT = dyn_cast<DIType>(Ty))
- if (!RT->isExternalTypeRef())
// There is no point in force-emitting a forward declaration.
CU.getOrCreateTypeDIE(RT);
}
@@ -740,6 +751,7 @@ DbgVariable *DwarfDebug::getExistingAbstractVariable(InlinedVariable IV) {
void DwarfDebug::createAbstractVariable(const DILocalVariable *Var,
LexicalScope *Scope) {
+ assert(Scope && Scope->isAbstractScope());
auto AbsDbgVariable = make_unique<DbgVariable>(Var, /* IA */ nullptr);
InfoHolder.addScopeVariable(Scope, AbsDbgVariable.get());
AbstractVariables[Var] = std::move(AbsDbgVariable);
@@ -1137,20 +1149,9 @@ static DebugLoc findPrologueEndLoc(const MachineFunction *MF) {
// Gather pre-function debug information. Assumes being called immediately
// after the function entry point has been emitted.
-void DwarfDebug::beginFunction(const MachineFunction *MF) {
+void DwarfDebug::beginFunctionImpl(const MachineFunction *MF) {
CurFn = MF;
- // If there's no debug info for the function we're not going to do anything.
- if (!MMI->hasDebugInfo())
- return;
-
- auto DI = MF->getFunction()->getSubprogram();
- if (!DI)
- return;
-
- // Grab the lexical scopes for the function, if we don't have any of those
- // then we're not going to be able to do anything.
- DebugHandlerBase::beginFunction(MF);
if (LScopes.empty())
return;
@@ -1189,23 +1190,21 @@ void DwarfDebug::beginFunction(const MachineFunction *MF) {
}
}
+void DwarfDebug::skippedNonDebugFunction() {
+ // If we don't have a subprogram for this function then there will be a hole
+ // in the range information. Keep note of this by setting the previously used
+ // section to nullptr.
+ PrevCU = nullptr;
+ CurFn = nullptr;
+}
+
// Gather and emit post-function debug information.
-void DwarfDebug::endFunction(const MachineFunction *MF) {
+void DwarfDebug::endFunctionImpl(const MachineFunction *MF) {
+ const DISubprogram *SP = MF->getFunction()->getSubprogram();
+
assert(CurFn == MF &&
"endFunction should be called with the same function as beginFunction");
- const DISubprogram *SP = MF->getFunction()->getSubprogram();
- if (!MMI->hasDebugInfo() || !SP ||
- SP->getUnit()->getEmissionKind() == DICompileUnit::NoDebug) {
- // If we don't have a subprogram for this function then there will be a hole
- // in the range information. Keep note of this by setting the previously
- // used section to nullptr.
- PrevCU = nullptr;
- CurFn = nullptr;
- DebugHandlerBase::endFunction(MF);
- return;
- }
-
// Set DwarfDwarfCompileUnitID in MCContext to default value.
Asm->OutStreamer->getContext().setDwarfCompileUnitID(0);
@@ -1220,17 +1219,14 @@ void DwarfDebug::endFunction(const MachineFunction *MF) {
TheCU.addRange(RangeSpan(Asm->getFunctionBegin(), Asm->getFunctionEnd()));
// Under -gmlt, skip building the subprogram if there are no inlined
- // subroutines inside it.
- if (TheCU.getCUNode()->getEmissionKind() == DICompileUnit::LineTablesOnly &&
+ // subroutines inside it. But with -fdebug-info-for-profiling, the subprogram
+ // is still needed as we need its source location.
+ if (!TheCU.getCUNode()->getDebugInfoForProfiling() &&
+ TheCU.getCUNode()->getEmissionKind() == DICompileUnit::LineTablesOnly &&
LScopes.getAbstractScopesList().empty() && !IsDarwin) {
assert(InfoHolder.getScopeVariables().empty());
- assert(DbgValues.empty());
- // FIXME: This wouldn't be true in LTO with a -g (with inlining) CU followed
- // by a -gmlt CU. Add a test and remove this assertion.
- assert(AbstractVariables.empty());
PrevLabel = nullptr;
CurFn = nullptr;
- DebugHandlerBase::endFunction(MF);
return;
}
@@ -1266,7 +1262,6 @@ void DwarfDebug::endFunction(const MachineFunction *MF) {
InfoHolder.getScopeVariables().clear();
PrevLabel = nullptr;
CurFn = nullptr;
- DebugHandlerBase::endFunction(MF);
}
// Register a source line with debug info. Returns the unique label that was
@@ -1361,6 +1356,18 @@ void DwarfDebug::emitAccelTypes() {
/// computeIndexValue - Compute the gdb index value for the DIE and CU.
static dwarf::PubIndexEntryDescriptor computeIndexValue(DwarfUnit *CU,
const DIE *Die) {
+ // Entities that ended up only in a Type Unit reference the CU instead (since
+ // the pub entry has offsets within the CU there's no real offset that can be
+ // provided anyway). As it happens all such entities (namespaces and types,
+ // types only in C++ at that) are rendered as TYPE+EXTERNAL. If this turns out
+ // not to be true it would be necessary to persist this information from the
+ // point at which the entry is added to the index data structure - since by
+ // the time the index is built from that, the original type/namespace DIE in a
+ // type unit has already been destroyed so it can't be queried for properties
+ // like tag, etc.
+ if (Die->getTag() == dwarf::DW_TAG_compile_unit)
+ return dwarf::PubIndexEntryDescriptor(dwarf::GIEK_TYPE,
+ dwarf::GIEL_EXTERNAL);
dwarf::GDBIndexEntryLinkage Linkage = dwarf::GIEL_STATIC;
// We could have a specification DIE that has most of our knowledge,
@@ -1498,27 +1505,37 @@ static void emitDebugLocValue(const AsmPrinter &AP, const DIBasicType *BT,
ByteStreamer &Streamer,
const DebugLocEntry::Value &Value,
DwarfExpression &DwarfExpr) {
- DIExpressionCursor ExprCursor(Value.getExpression());
- DwarfExpr.addFragmentOffset(Value.getExpression());
+ auto *DIExpr = Value.getExpression();
+ DIExpressionCursor ExprCursor(DIExpr);
+ DwarfExpr.addFragmentOffset(DIExpr);
// Regular entry.
if (Value.isInt()) {
if (BT && (BT->getEncoding() == dwarf::DW_ATE_signed ||
BT->getEncoding() == dwarf::DW_ATE_signed_char))
- DwarfExpr.AddSignedConstant(Value.getInt());
+ DwarfExpr.addSignedConstant(Value.getInt());
else
- DwarfExpr.AddUnsignedConstant(Value.getInt());
+ DwarfExpr.addUnsignedConstant(Value.getInt());
} else if (Value.isLocation()) {
- MachineLocation Loc = Value.getLoc();
+ MachineLocation Location = Value.getLoc();
+
+ SmallVector<uint64_t, 8> Ops;
+ // FIXME: Should this condition be Location.isIndirect() instead?
+ if (Location.getOffset()) {
+ Ops.push_back(dwarf::DW_OP_plus);
+ Ops.push_back(Location.getOffset());
+ Ops.push_back(dwarf::DW_OP_deref);
+ }
+ Ops.append(DIExpr->elements_begin(), DIExpr->elements_end());
+ DIExpressionCursor Cursor(Ops);
const TargetRegisterInfo &TRI = *AP.MF->getSubtarget().getRegisterInfo();
- if (Loc.getOffset())
- DwarfExpr.AddMachineRegIndirect(TRI, Loc.getReg(), Loc.getOffset());
- else
- DwarfExpr.AddMachineRegExpression(TRI, ExprCursor, Loc.getReg());
+ if (!DwarfExpr.addMachineRegExpression(TRI, Cursor, Location.getReg()))
+ return;
+ return DwarfExpr.addExpression(std::move(Cursor));
} else if (Value.isConstantFP()) {
APInt RawBytes = Value.getConstantFP()->getValueAPF().bitcastToAPInt();
- DwarfExpr.AddUnsignedConstant(RawBytes);
+ DwarfExpr.addUnsignedConstant(RawBytes);
}
- DwarfExpr.AddExpression(std::move(ExprCursor));
+ DwarfExpr.addExpression(std::move(ExprCursor));
}
void DebugLocEntry::finalize(const AsmPrinter &AP,
@@ -1940,11 +1957,11 @@ uint64_t DwarfDebug::makeTypeSignature(StringRef Identifier) {
MD5 Hash;
Hash.update(Identifier);
// ... take the least significant 8 bytes and return those. Our MD5
- // implementation always returns its results in little endian, swap bytes
- // appropriately.
+ // implementation always returns its results in little endian, so we actually
+ // need the "high" word.
MD5::MD5Result Result;
Hash.final(Result);
- return support::endian::read64le(Result + 8);
+ return Result.high();
}
void DwarfDebug::addDwarfTypeUnitType(DwarfCompileUnit &CU,
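
The getFrameIndexExprs change above asserts that when a variable has several frame-index expressions they are all fragments, then orders them by fragment offset so pieces are emitted low to high. A reduced sketch of that ordering, with FrameIndexExpr shrunk to the two fields that matter here:

    #include <algorithm>
    #include <cstdint>
    #include <iostream>
    #include <vector>

    struct FrameIndexExpr {
      int FI;
      uint64_t OffsetInBits; // from the expression's DW_OP_LLVM_fragment
    };

    int main() {
      std::vector<FrameIndexExpr> FIEs = {{7, 64}, {3, 0}, {5, 32}};
      // Sort by fragment offset, mirroring the std::sort in the hunk.
      std::sort(FIEs.begin(), FIEs.end(),
                [](const FrameIndexExpr &A, const FrameIndexExpr &B) {
                  return A.OffsetInBits < B.OffsetInBits;
                });
      for (auto &FIE : FIEs)
        std::cout << FIE.FI << " "; // 3 5 7
      std::cout << "\n";
    }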
diff --git a/lib/CodeGen/AsmPrinter/DwarfDebug.h b/lib/CodeGen/AsmPrinter/DwarfDebug.h
index 253e3f06200e..8a96e7867b6e 100644
--- a/lib/CodeGen/AsmPrinter/DwarfDebug.h
+++ b/lib/CodeGen/AsmPrinter/DwarfDebug.h
@@ -89,7 +89,7 @@ public:
assert(!MInsn && "Already initialized?");
assert((!E || E->isValid()) && "Expected valid expression");
- assert(~FI && "Expected valid index");
+ assert(FI != INT_MAX && "Expected valid index");
FrameIndexExprs.push_back({FI, E});
}
@@ -448,6 +448,15 @@ class DwarfDebug : public DebugHandlerBase {
/// Collect variable information from the side table maintained by MF.
void collectVariableInfoFromMFTable(DenseSet<InlinedVariable> &P);
+protected:
+ /// Gather pre-function debug information.
+ void beginFunctionImpl(const MachineFunction *MF) override;
+
+ /// Gather and emit post-function debug information.
+ void endFunctionImpl(const MachineFunction *MF) override;
+
+ void skippedNonDebugFunction() override;
+
public:
//===--------------------------------------------------------------------===//
// Main entry points.
@@ -463,12 +472,6 @@ public:
/// Emit all Dwarf sections that should come after the content.
void endModule() override;
- /// Gather pre-function debug information.
- void beginFunction(const MachineFunction *MF) override;
-
- /// Gather and emit post-function debug information.
- void endFunction(const MachineFunction *MF) override;
-
/// Process beginning of an instruction.
void beginInstruction(const MachineInstr *MI) override;
diff --git a/lib/CodeGen/AsmPrinter/DwarfExpression.cpp b/lib/CodeGen/AsmPrinter/DwarfExpression.cpp
index 61b2c7e65842..debe88f3b1ee 100644
--- a/lib/CodeGen/AsmPrinter/DwarfExpression.cpp
+++ b/lib/CodeGen/AsmPrinter/DwarfExpression.cpp
@@ -22,79 +22,76 @@
using namespace llvm;
-void DwarfExpression::AddReg(int DwarfReg, const char *Comment) {
+void DwarfExpression::addReg(int DwarfReg, const char *Comment) {
assert(DwarfReg >= 0 && "invalid negative dwarf register number");
if (DwarfReg < 32) {
- EmitOp(dwarf::DW_OP_reg0 + DwarfReg, Comment);
+ emitOp(dwarf::DW_OP_reg0 + DwarfReg, Comment);
} else {
- EmitOp(dwarf::DW_OP_regx, Comment);
- EmitUnsigned(DwarfReg);
+ emitOp(dwarf::DW_OP_regx, Comment);
+ emitUnsigned(DwarfReg);
}
}
-void DwarfExpression::AddRegIndirect(int DwarfReg, int Offset, bool Deref) {
+void DwarfExpression::addBReg(int DwarfReg, int Offset) {
assert(DwarfReg >= 0 && "invalid negative dwarf register number");
if (DwarfReg < 32) {
- EmitOp(dwarf::DW_OP_breg0 + DwarfReg);
+ emitOp(dwarf::DW_OP_breg0 + DwarfReg);
} else {
- EmitOp(dwarf::DW_OP_bregx);
- EmitUnsigned(DwarfReg);
+ emitOp(dwarf::DW_OP_bregx);
+ emitUnsigned(DwarfReg);
}
- EmitSigned(Offset);
- if (Deref)
- EmitOp(dwarf::DW_OP_deref);
+ emitSigned(Offset);
}
-void DwarfExpression::AddOpPiece(unsigned SizeInBits, unsigned OffsetInBits) {
+void DwarfExpression::addFBReg(int Offset) {
+ emitOp(dwarf::DW_OP_fbreg);
+ emitSigned(Offset);
+}
+
+void DwarfExpression::addOpPiece(unsigned SizeInBits, unsigned OffsetInBits) {
if (!SizeInBits)
return;
const unsigned SizeOfByte = 8;
if (OffsetInBits > 0 || SizeInBits % SizeOfByte) {
- EmitOp(dwarf::DW_OP_bit_piece);
- EmitUnsigned(SizeInBits);
- EmitUnsigned(OffsetInBits);
+ emitOp(dwarf::DW_OP_bit_piece);
+ emitUnsigned(SizeInBits);
+ emitUnsigned(OffsetInBits);
} else {
- EmitOp(dwarf::DW_OP_piece);
+ emitOp(dwarf::DW_OP_piece);
unsigned ByteSize = SizeInBits / SizeOfByte;
- EmitUnsigned(ByteSize);
+ emitUnsigned(ByteSize);
}
this->OffsetInBits += SizeInBits;
}
-void DwarfExpression::AddShr(unsigned ShiftBy) {
- EmitOp(dwarf::DW_OP_constu);
- EmitUnsigned(ShiftBy);
- EmitOp(dwarf::DW_OP_shr);
+void DwarfExpression::addShr(unsigned ShiftBy) {
+ emitOp(dwarf::DW_OP_constu);
+ emitUnsigned(ShiftBy);
+ emitOp(dwarf::DW_OP_shr);
}
-bool DwarfExpression::AddMachineRegIndirect(const TargetRegisterInfo &TRI,
- unsigned MachineReg, int Offset) {
- if (isFrameRegister(TRI, MachineReg)) {
- // If variable offset is based in frame register then use fbreg.
- EmitOp(dwarf::DW_OP_fbreg);
- EmitSigned(Offset);
- return true;
- }
-
- int DwarfReg = TRI.getDwarfRegNum(MachineReg, false);
- if (DwarfReg < 0)
- return false;
-
- AddRegIndirect(DwarfReg, Offset);
- return true;
+void DwarfExpression::addAnd(unsigned Mask) {
+ emitOp(dwarf::DW_OP_constu);
+ emitUnsigned(Mask);
+ emitOp(dwarf::DW_OP_and);
}
-bool DwarfExpression::AddMachineReg(const TargetRegisterInfo &TRI,
+bool DwarfExpression::addMachineReg(const TargetRegisterInfo &TRI,
unsigned MachineReg, unsigned MaxSize) {
- if (!TRI.isPhysicalRegister(MachineReg))
+ if (!TRI.isPhysicalRegister(MachineReg)) {
+ if (isFrameRegister(TRI, MachineReg)) {
+ DwarfRegs.push_back({-1, 0, nullptr});
+ return true;
+ }
return false;
+ }
int Reg = TRI.getDwarfRegNum(MachineReg, false);
// If this is a valid register number, emit it.
if (Reg >= 0) {
- AddReg(Reg);
+ DwarfRegs.push_back({Reg, 0, nullptr});
return true;
}
@@ -106,7 +103,7 @@ bool DwarfExpression::AddMachineReg(const TargetRegisterInfo &TRI,
unsigned Idx = TRI.getSubRegIndex(*SR, MachineReg);
unsigned Size = TRI.getSubRegIdxSize(Idx);
unsigned RegOffset = TRI.getSubRegIdxOffset(Idx);
- AddReg(Reg, "super-register");
+ DwarfRegs.push_back({Reg, 0, "super-register"});
// Use a DW_OP_bit_piece to describe the sub-register.
setSubRegisterPiece(Size, RegOffset);
return true;
@@ -136,72 +133,101 @@ bool DwarfExpression::AddMachineReg(const TargetRegisterInfo &TRI,
// If this sub-register has a DWARF number and we haven't covered
// its range, emit a DWARF piece for it.
if (Reg >= 0 && Intersection.any()) {
- AddReg(Reg, "sub-register");
+ // Emit a piece for any gap in the coverage.
+ if (Offset > CurPos)
+ DwarfRegs.push_back({-1, Offset - CurPos, nullptr});
+ DwarfRegs.push_back(
+ {Reg, std::min<unsigned>(Size, MaxSize - Offset), "sub-register"});
if (Offset >= MaxSize)
break;
- // Emit a piece for the any gap in the coverage.
- if (Offset > CurPos)
- AddOpPiece(Offset - CurPos);
- AddOpPiece(std::min<unsigned>(Size, MaxSize - Offset));
- CurPos = Offset + Size;
// Mark it as emitted.
Coverage.set(Offset, Offset + Size);
+ CurPos = Offset + Size;
}
}
return CurPos;
}
-void DwarfExpression::AddStackValue() {
+void DwarfExpression::addStackValue() {
if (DwarfVersion >= 4)
- EmitOp(dwarf::DW_OP_stack_value);
+ emitOp(dwarf::DW_OP_stack_value);
}
-void DwarfExpression::AddSignedConstant(int64_t Value) {
- EmitOp(dwarf::DW_OP_consts);
- EmitSigned(Value);
- AddStackValue();
+void DwarfExpression::addSignedConstant(int64_t Value) {
+ emitOp(dwarf::DW_OP_consts);
+ emitSigned(Value);
+ addStackValue();
}
-void DwarfExpression::AddUnsignedConstant(uint64_t Value) {
- EmitOp(dwarf::DW_OP_constu);
- EmitUnsigned(Value);
- AddStackValue();
+void DwarfExpression::addUnsignedConstant(uint64_t Value) {
+ emitOp(dwarf::DW_OP_constu);
+ emitUnsigned(Value);
+ addStackValue();
}
-void DwarfExpression::AddUnsignedConstant(const APInt &Value) {
+void DwarfExpression::addUnsignedConstant(const APInt &Value) {
unsigned Size = Value.getBitWidth();
const uint64_t *Data = Value.getRawData();
// Chop it up into 64-bit pieces, because that's the maximum that
- // AddUnsignedConstant takes.
+ // addUnsignedConstant takes.
unsigned Offset = 0;
while (Offset < Size) {
- AddUnsignedConstant(*Data++);
+ addUnsignedConstant(*Data++);
if (Offset == 0 && Size <= 64)
break;
- AddOpPiece(std::min(Size-Offset, 64u), Offset);
+ addOpPiece(std::min(Size-Offset, 64u), Offset);
Offset += 64;
}
}
-bool DwarfExpression::AddMachineRegExpression(const TargetRegisterInfo &TRI,
+bool DwarfExpression::addMachineRegExpression(const TargetRegisterInfo &TRI,
DIExpressionCursor &ExprCursor,
unsigned MachineReg,
unsigned FragmentOffsetInBits) {
- if (!ExprCursor)
- return AddMachineReg(TRI, MachineReg);
+ auto Fragment = ExprCursor.getFragmentInfo();
+ if (!addMachineReg(TRI, MachineReg, Fragment ? Fragment->SizeInBits : ~1U))
+ return false;
- // Pattern-match combinations for which more efficient representations exist
- // first.
- bool ValidReg = false;
+ bool HasComplexExpression = false;
auto Op = ExprCursor.peek();
+ if (Op && Op->getOp() != dwarf::DW_OP_LLVM_fragment)
+ HasComplexExpression = true;
+
+ // If the register can only be described by a complex expression (i.e.,
+ // multiple subregisters) it doesn't safely compose with another complex
+ // expression. For example, it is not possible to apply a DW_OP_deref
+ // operation to multiple DW_OP_pieces.
+ if (HasComplexExpression && DwarfRegs.size() > 1) {
+ DwarfRegs.clear();
+ return false;
+ }
+
+ // Handle simple register locations.
+ if (!HasComplexExpression) {
+ for (auto &Reg : DwarfRegs) {
+ if (Reg.DwarfRegNo >= 0)
+ addReg(Reg.DwarfRegNo, Reg.Comment);
+ addOpPiece(Reg.Size);
+ }
+ DwarfRegs.clear();
+ return true;
+ }
+
+ assert(DwarfRegs.size() == 1);
+ auto Reg = DwarfRegs[0];
+ bool FBReg = isFrameRegister(TRI, MachineReg);
+ assert(Reg.Size == 0 && "subregister has same size as superregister");
+
+ // Pattern-match combinations for which more efficient representations exist.
switch (Op->getOp()) {
default: {
- auto Fragment = ExprCursor.getFragmentInfo();
- ValidReg = AddMachineReg(TRI, MachineReg,
- Fragment ? Fragment->SizeInBits : ~1U);
+ if (FBReg)
+ addFBReg(0);
+ else
+ addReg(Reg.DwarfRegNo, 0);
break;
}
case dwarf::DW_OP_plus:
@@ -210,28 +236,42 @@ bool DwarfExpression::AddMachineRegExpression(const TargetRegisterInfo &TRI,
// [DW_OP_reg,Offset,DW_OP_minus,DW_OP_deref] --> [DW_OP_breg,-Offset].
auto N = ExprCursor.peekNext();
if (N && N->getOp() == dwarf::DW_OP_deref) {
- unsigned Offset = Op->getArg(0);
- ValidReg = AddMachineRegIndirect(
- TRI, MachineReg, Op->getOp() == dwarf::DW_OP_plus ? Offset : -Offset);
+ int Offset = Op->getArg(0);
+ int SignedOffset = (Op->getOp() == dwarf::DW_OP_plus) ? Offset : -Offset;
+ if (FBReg)
+ addFBReg(SignedOffset);
+ else
+ addBReg(Reg.DwarfRegNo, SignedOffset);
+
ExprCursor.consume(2);
- } else
- ValidReg = AddMachineReg(TRI, MachineReg);
+ break;
+ }
+ addReg(Reg.DwarfRegNo, 0);
break;
}
case dwarf::DW_OP_deref:
// [DW_OP_reg,DW_OP_deref] --> [DW_OP_breg].
- ValidReg = AddMachineRegIndirect(TRI, MachineReg);
+ if (FBReg)
+ addFBReg(0);
+ else
+ addBReg(Reg.DwarfRegNo, 0);
ExprCursor.take();
break;
}
-
- return ValidReg;
+ DwarfRegs.clear();
+ return true;
}
-void DwarfExpression::AddExpression(DIExpressionCursor &&ExprCursor,
+void DwarfExpression::addExpression(DIExpressionCursor &&ExprCursor,
unsigned FragmentOffsetInBits) {
while (ExprCursor) {
auto Op = ExprCursor.take();
+
+ // If we need to mask out a subregister, do it now, unless the next
+ // operation would emit an OpPiece anyway.
+ if (SubRegisterSizeInBits && Op->getOp() != dwarf::DW_OP_LLVM_fragment)
+ maskSubRegister();
+
switch (Op->getOp()) {
case dwarf::DW_OP_LLVM_fragment: {
unsigned SizeInBits = Op->getArg(1);
@@ -241,39 +281,45 @@ void DwarfExpression::AddExpression(DIExpressionCursor &&ExprCursor,
// location.
assert(OffsetInBits >= FragmentOffset && "fragment offset not added?");
- // If \a AddMachineReg already emitted DW_OP_piece operations to represent
+ // If \a addMachineReg already emitted DW_OP_piece operations to represent
// a super-register by splicing together sub-registers, subtract the size
// of the pieces that was already emitted.
SizeInBits -= OffsetInBits - FragmentOffset;
- // If \a AddMachineReg requested a DW_OP_bit_piece to stencil out a
+ // If \a addMachineReg requested a DW_OP_bit_piece to stencil out a
// sub-register that is smaller than the current fragment's size, use it.
if (SubRegisterSizeInBits)
SizeInBits = std::min<unsigned>(SizeInBits, SubRegisterSizeInBits);
- AddOpPiece(SizeInBits, SubRegisterOffsetInBits);
+ addOpPiece(SizeInBits, SubRegisterOffsetInBits);
setSubRegisterPiece(0, 0);
break;
}
case dwarf::DW_OP_plus:
- EmitOp(dwarf::DW_OP_plus_uconst);
- EmitUnsigned(Op->getArg(0));
+ emitOp(dwarf::DW_OP_plus_uconst);
+ emitUnsigned(Op->getArg(0));
break;
case dwarf::DW_OP_minus:
// There is no OP_minus_uconst.
- EmitOp(dwarf::DW_OP_constu);
- EmitUnsigned(Op->getArg(0));
- EmitOp(dwarf::DW_OP_minus);
+ emitOp(dwarf::DW_OP_constu);
+ emitUnsigned(Op->getArg(0));
+ emitOp(dwarf::DW_OP_minus);
break;
case dwarf::DW_OP_deref:
- EmitOp(dwarf::DW_OP_deref);
+ emitOp(dwarf::DW_OP_deref);
break;
case dwarf::DW_OP_constu:
- EmitOp(dwarf::DW_OP_constu);
- EmitUnsigned(Op->getArg(0));
+ emitOp(dwarf::DW_OP_constu);
+ emitUnsigned(Op->getArg(0));
break;
case dwarf::DW_OP_stack_value:
- AddStackValue();
+ addStackValue();
+ break;
+ case dwarf::DW_OP_swap:
+ emitOp(dwarf::DW_OP_swap);
+ break;
+ case dwarf::DW_OP_xderef:
+ emitOp(dwarf::DW_OP_xderef);
break;
default:
llvm_unreachable("unhandled opcode found in expression");
@@ -281,9 +327,25 @@ void DwarfExpression::AddExpression(DIExpressionCursor &&ExprCursor,
}
}
+/// Add masking operations to stencil out a subregister.
+void DwarfExpression::maskSubRegister() {
+ assert(SubRegisterSizeInBits && "no subregister was registered");
+ if (SubRegisterOffsetInBits > 0)
+ addShr(SubRegisterOffsetInBits);
+ uint64_t Mask = (1ULL << (uint64_t)SubRegisterSizeInBits) - 1ULL;
+ addAnd(Mask);
+}
+
+
void DwarfExpression::finalize() {
- if (SubRegisterSizeInBits)
- AddOpPiece(SubRegisterSizeInBits, SubRegisterOffsetInBits);
+ assert(DwarfRegs.size() == 0 && "dwarf registers not emitted");
+ // Emit any outstanding DW_OP_piece operations to mask out subregisters.
+ if (SubRegisterSizeInBits == 0)
+ return;
+ // Don't emit a DW_OP_piece for a subregister at offset 0.
+ if (SubRegisterOffsetInBits == 0)
+ return;
+ addOpPiece(SubRegisterSizeInBits, SubRegisterOffsetInBits);
}
void DwarfExpression::addFragmentOffset(const DIExpression *Expr) {
@@ -294,6 +356,6 @@ void DwarfExpression::addFragmentOffset(const DIExpression *Expr) {
assert(FragmentOffset >= OffsetInBits &&
"overlapping or duplicate fragments");
if (FragmentOffset > OffsetInBits)
- AddOpPiece(FragmentOffset - OffsetInBits);
+ addOpPiece(FragmentOffset - OffsetInBits);
OffsetInBits = FragmentOffset;
}
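
The rewritten addMachineRegExpression keeps its old peepholes but now works from the collected DwarfRegs list: [reg, plus Off, deref] collapses to a breg with +Off, [reg, minus Off, deref] to a breg with -Off, and a bare deref to breg 0. A standalone model of just that pattern match; the real code emits through emitOp/emitSigned and consumes a DIExpressionCursor, and the breg/reg names below are printed rather than encoded.

    #include <cstdint>
    #include <iostream>
    #include <vector>

    enum Opcode { Plus, Minus, Deref, Other };
    struct ExprOp { Opcode Op; int64_t Arg; };

    void lowerRegExpression(int DwarfReg, std::vector<ExprOp> &Expr) {
      // [DW_OP_reg,Offset,DW_OP_plus/minus,DW_OP_deref] --> [DW_OP_breg,±Off]
      if (Expr.size() >= 2 && (Expr[0].Op == Plus || Expr[0].Op == Minus) &&
          Expr[1].Op == Deref) {
        int64_t Off = Expr[0].Op == Plus ? Expr[0].Arg : -Expr[0].Arg;
        std::cout << "DW_OP_breg" << DwarfReg << " " << Off << "\n";
        Expr.erase(Expr.begin(), Expr.begin() + 2); // cursor.consume(2)
        return;
      }
      // [DW_OP_reg,DW_OP_deref] --> [DW_OP_breg 0]
      if (!Expr.empty() && Expr[0].Op == Deref) {
        std::cout << "DW_OP_breg" << DwarfReg << " 0\n";
        Expr.erase(Expr.begin()); // cursor.take()
        return;
      }
      std::cout << "DW_OP_reg" << DwarfReg << "\n"; // no cheaper form exists
    }

    int main() {
      std::vector<ExprOp> E1 = {{Plus, 16}, {Deref, 0}};
      lowerRegExpression(6, E1); // DW_OP_breg6 16
      std::vector<ExprOp> E2 = {{Deref, 0}};
      lowerRegExpression(6, E2); // DW_OP_breg6 0
    }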
diff --git a/lib/CodeGen/AsmPrinter/DwarfExpression.h b/lib/CodeGen/AsmPrinter/DwarfExpression.h
index fd90fa05bc32..e8dc211eb3c2 100644
--- a/lib/CodeGen/AsmPrinter/DwarfExpression.h
+++ b/lib/CodeGen/AsmPrinter/DwarfExpression.h
@@ -84,9 +84,19 @@ public:
/// entry.
class DwarfExpression {
protected:
- unsigned DwarfVersion;
+ /// Holds information about all subregisters comprising a register location.
+ struct Register {
+ int DwarfRegNo;
+ unsigned Size;
+ const char *Comment;
+ };
+
+ /// The register location, if any.
+ SmallVector<Register, 2> DwarfRegs;
+
/// Current Fragment Offset in Bits.
uint64_t OffsetInBits = 0;
+ unsigned DwarfVersion;
/// Sometimes we need to add a DW_OP_bit_piece to describe a subregister.
unsigned SubRegisterSizeInBits = 0;
@@ -99,35 +109,54 @@ protected:
SubRegisterOffsetInBits = OffsetInBits;
}
-public:
- DwarfExpression(unsigned DwarfVersion) : DwarfVersion(DwarfVersion) {}
- virtual ~DwarfExpression() {};
-
- /// This needs to be called last to commit any pending changes.
- void finalize();
+ /// Add masking operations to stencil out a subregister.
+ void maskSubRegister();
/// Output a dwarf operand and an optional assembler comment.
- virtual void EmitOp(uint8_t Op, const char *Comment = nullptr) = 0;
+ virtual void emitOp(uint8_t Op, const char *Comment = nullptr) = 0;
/// Emit a raw signed value.
- virtual void EmitSigned(int64_t Value) = 0;
+ virtual void emitSigned(int64_t Value) = 0;
/// Emit a raw unsigned value.
- virtual void EmitUnsigned(uint64_t Value) = 0;
+ virtual void emitUnsigned(uint64_t Value) = 0;
/// Return whether the given machine register is the frame register in the
/// current function.
virtual bool isFrameRegister(const TargetRegisterInfo &TRI, unsigned MachineReg) = 0;
- /// Emit a dwarf register operation.
- void AddReg(int DwarfReg, const char *Comment = nullptr);
- /// Emit an (double-)indirect dwarf register operation.
- void AddRegIndirect(int DwarfReg, int Offset, bool Deref = false);
+ /// Emit a DW_OP_reg operation.
+ void addReg(int DwarfReg, const char *Comment = nullptr);
+ /// Emit a DW_OP_breg operation.
+ void addBReg(int DwarfReg, int Offset);
+ /// Emit DW_OP_fbreg <Offset>.
+ void addFBReg(int Offset);
+
+ /// Emit a partial DWARF register operation.
+ ///
+ /// \param MachineReg The register number.
+ /// \param MaxSize If the register must be composed from
+ /// sub-registers this is an upper bound
+ /// for how many bits the emitted DW_OP_piece
+ /// may cover.
+ ///
+ /// If size and offset are zero, an operation for the entire register is
+ /// emitted: Some targets do not provide a DWARF register number for every
+ /// register. If this is the case, this function will attempt to emit a DWARF
+ /// register by emitting a fragment of a super-register or by piecing together
+ /// multiple subregisters that alias the register.
+ ///
+ /// \return false if no DWARF register exists for MachineReg.
+ bool addMachineReg(const TargetRegisterInfo &TRI, unsigned MachineReg,
+ unsigned MaxSize = ~1U);
+
/// Emit a DW_OP_piece or DW_OP_bit_piece operation for a variable fragment.
/// \param OffsetInBits This is an optional offset into the location that
/// is at the top of the DWARF stack.
- void AddOpPiece(unsigned SizeInBits, unsigned OffsetInBits = 0);
+ void addOpPiece(unsigned SizeInBits, unsigned OffsetInBits = 0);
- /// Emit a shift-right dwarf expression.
- void AddShr(unsigned ShiftBy);
+ /// Emit a shift-right dwarf operation.
+ void addShr(unsigned ShiftBy);
+ /// Emit a bitwise and dwarf operation.
+ void addAnd(unsigned Mask);
/// Emit a DW_OP_stack_value, if supported.
///
@@ -140,37 +169,21 @@ public:
/// constant value, so the producers and consumers started to rely on
/// heuristics to disambiguate the value vs. location status of the
/// expression. See PR21176 for more details.
- void AddStackValue();
+ void addStackValue();
- /// Emit an indirect dwarf register operation for the given machine register.
- /// \return false if no DWARF register exists for MachineReg.
- bool AddMachineRegIndirect(const TargetRegisterInfo &TRI, unsigned MachineReg,
- int Offset = 0);
+ ~DwarfExpression() = default;
+public:
+ DwarfExpression(unsigned DwarfVersion) : DwarfVersion(DwarfVersion) {}
- /// Emit a partial DWARF register operation.
- ///
- /// \param MachineReg The register number.
- /// \param MaxSize If the register must be composed from
- /// sub-registers this is an upper bound
- /// for how many bits the emitted DW_OP_piece
- /// may cover.
- ///
- /// If size and offset are zero, an operation for the entire register is
- /// emitted: Some targets do not provide a DWARF register number for every
- /// register. If this is the case, this function will attempt to emit a DWARF
- /// register by emitting a fragment of a super-register or by piecing together
- /// multiple subregisters that alias the register.
- ///
- /// \return false if no DWARF register exists for MachineReg.
- bool AddMachineReg(const TargetRegisterInfo &TRI, unsigned MachineReg,
- unsigned MaxSize = ~1U);
+ /// This needs to be called last to commit any pending changes.
+ void finalize();
/// Emit a signed constant.
- void AddSignedConstant(int64_t Value);
+ void addSignedConstant(int64_t Value);
/// Emit an unsigned constant.
- void AddUnsignedConstant(uint64_t Value);
+ void addUnsignedConstant(uint64_t Value);
/// Emit an unsigned constant.
- void AddUnsignedConstant(const APInt &Value);
+ void addUnsignedConstant(const APInt &Value);
/// Emit a machine register location. As an optimization this may also consume
/// the prefix of a DwarfExpression if a more efficient representation for
@@ -181,7 +194,7 @@ public:
/// fragment inside the entire variable.
/// \return false if no DWARF register exists
/// for MachineReg.
- bool AddMachineRegExpression(const TargetRegisterInfo &TRI,
+ bool addMachineRegExpression(const TargetRegisterInfo &TRI,
DIExpressionCursor &Expr, unsigned MachineReg,
unsigned FragmentOffsetInBits = 0);
/// Emit all remaining operations in the DIExpressionCursor.
@@ -189,7 +202,7 @@ public:
/// \param FragmentOffsetInBits If this is one fragment out of multiple
/// locations, this is the offset of the
/// fragment inside the entire variable.
- void AddExpression(DIExpressionCursor &&Expr,
+ void addExpression(DIExpressionCursor &&Expr,
unsigned FragmentOffsetInBits = 0);
/// If applicable, emit an empty DW_OP_piece / DW_OP_bit_piece to advance to
@@ -198,33 +211,32 @@ public:
};
/// DwarfExpression implementation for .debug_loc entries.
-class DebugLocDwarfExpression : public DwarfExpression {
+class DebugLocDwarfExpression final : public DwarfExpression {
ByteStreamer &BS;
+ void emitOp(uint8_t Op, const char *Comment = nullptr) override;
+ void emitSigned(int64_t Value) override;
+ void emitUnsigned(uint64_t Value) override;
+ bool isFrameRegister(const TargetRegisterInfo &TRI,
+ unsigned MachineReg) override;
public:
DebugLocDwarfExpression(unsigned DwarfVersion, ByteStreamer &BS)
: DwarfExpression(DwarfVersion), BS(BS) {}
-
- void EmitOp(uint8_t Op, const char *Comment = nullptr) override;
- void EmitSigned(int64_t Value) override;
- void EmitUnsigned(uint64_t Value) override;
- bool isFrameRegister(const TargetRegisterInfo &TRI,
- unsigned MachineReg) override;
};
/// DwarfExpression implementation for singular DW_AT_location.
-class DIEDwarfExpression : public DwarfExpression {
+class DIEDwarfExpression final : public DwarfExpression {
const AsmPrinter &AP;
DwarfUnit &DU;
DIELoc &DIE;
-public:
- DIEDwarfExpression(const AsmPrinter &AP, DwarfUnit &DU, DIELoc &DIE);
- void EmitOp(uint8_t Op, const char *Comment = nullptr) override;
- void EmitSigned(int64_t Value) override;
- void EmitUnsigned(uint64_t Value) override;
+ void emitOp(uint8_t Op, const char *Comment = nullptr) override;
+ void emitSigned(int64_t Value) override;
+ void emitUnsigned(uint64_t Value) override;
bool isFrameRegister(const TargetRegisterInfo &TRI,
unsigned MachineReg) override;
+public:
+ DIEDwarfExpression(const AsmPrinter &AP, DwarfUnit &DU, DIELoc &DIE);
DIELoc *finalize() {
DwarfExpression::finalize();
return &DIE;
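
The maskSubRegister helper added in DwarfExpression.cpp stencils a subregister out of its super-register by shifting the covered bits down and ANDing with a width mask, instead of relying on DW_OP_bit_piece. An arithmetic model of the effect; the real code emits DW_OP_constu/DW_OP_shr/DW_OP_and operations onto the expression stack.

    #include <cassert>
    #include <cstdint>

    uint64_t maskSubRegister(uint64_t SuperRegValue, unsigned OffsetInBits,
                             unsigned SizeInBits) {
      assert(SizeInBits > 0 && SizeInBits < 64 && "model limited to <64 bits");
      if (OffsetInBits > 0)
        SuperRegValue >>= OffsetInBits;          // addShr(OffsetInBits)
      uint64_t Mask = (1ULL << SizeInBits) - 1ULL;
      return SuperRegValue & Mask;               // addAnd(Mask)
    }

    int main() {
      // A hypothetical 16-bit subregister at bit offset 16 of a 64-bit reg.
      assert(maskSubRegister(0xAABBCCDDEEFF1122ULL, 16, 16) == 0xEEFFULL);
    }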
diff --git a/lib/CodeGen/AsmPrinter/DwarfUnit.cpp b/lib/CodeGen/AsmPrinter/DwarfUnit.cpp
index 2a866c071f59..bad5b09553cd 100644
--- a/lib/CodeGen/AsmPrinter/DwarfUnit.cpp
+++ b/lib/CodeGen/AsmPrinter/DwarfUnit.cpp
@@ -54,15 +54,15 @@ DIEDwarfExpression::DIEDwarfExpression(const AsmPrinter &AP, DwarfUnit &DU,
: DwarfExpression(AP.getDwarfVersion()), AP(AP), DU(DU),
DIE(DIE) {}
-void DIEDwarfExpression::EmitOp(uint8_t Op, const char* Comment) {
+void DIEDwarfExpression::emitOp(uint8_t Op, const char* Comment) {
DU.addUInt(DIE, dwarf::DW_FORM_data1, Op);
}
-void DIEDwarfExpression::EmitSigned(int64_t Value) {
+void DIEDwarfExpression::emitSigned(int64_t Value) {
DU.addSInt(DIE, dwarf::DW_FORM_sdata, Value);
}
-void DIEDwarfExpression::EmitUnsigned(uint64_t Value) {
+void DIEDwarfExpression::emitUnsigned(uint64_t Value) {
DU.addUInt(DIE, dwarf::DW_FORM_udata, Value);
}
@@ -98,25 +98,35 @@ int64_t DwarfUnit::getDefaultLowerBound() const {
default:
break;
- case dwarf::DW_LANG_C89:
- case dwarf::DW_LANG_C99:
+ // The languages below have valid values in all DWARF versions.
case dwarf::DW_LANG_C:
+ case dwarf::DW_LANG_C89:
case dwarf::DW_LANG_C_plus_plus:
- case dwarf::DW_LANG_ObjC:
- case dwarf::DW_LANG_ObjC_plus_plus:
return 0;
case dwarf::DW_LANG_Fortran77:
case dwarf::DW_LANG_Fortran90:
- case dwarf::DW_LANG_Fortran95:
return 1;
- // The languages below have valid values only if the DWARF version >= 4.
+ // The languages below have valid values only if the DWARF version >= 3.
+ case dwarf::DW_LANG_C99:
+ case dwarf::DW_LANG_ObjC:
+ case dwarf::DW_LANG_ObjC_plus_plus:
+ if (DD->getDwarfVersion() >= 3)
+ return 0;
+ break;
+
+ case dwarf::DW_LANG_Fortran95:
+ if (DD->getDwarfVersion() >= 3)
+ return 1;
+ break;
+
+ // Starting with DWARF v4, all defined languages have valid values.
+ case dwarf::DW_LANG_D:
case dwarf::DW_LANG_Java:
case dwarf::DW_LANG_Python:
case dwarf::DW_LANG_UPC:
- case dwarf::DW_LANG_D:
- if (dwarf::DWARF_VERSION >= 4)
+ if (DD->getDwarfVersion() >= 4)
return 0;
break;
@@ -127,31 +137,33 @@ int64_t DwarfUnit::getDefaultLowerBound() const {
case dwarf::DW_LANG_Modula2:
case dwarf::DW_LANG_Pascal83:
case dwarf::DW_LANG_PLI:
- if (dwarf::DWARF_VERSION >= 4)
+ if (DD->getDwarfVersion() >= 4)
return 1;
break;
- // The languages below have valid values only if the DWARF version >= 5.
- case dwarf::DW_LANG_OpenCL:
- case dwarf::DW_LANG_Go:
- case dwarf::DW_LANG_Haskell:
+ // The languages below are new in DWARF v5.
+ case dwarf::DW_LANG_BLISS:
+ case dwarf::DW_LANG_C11:
case dwarf::DW_LANG_C_plus_plus_03:
case dwarf::DW_LANG_C_plus_plus_11:
+ case dwarf::DW_LANG_C_plus_plus_14:
+ case dwarf::DW_LANG_Dylan:
+ case dwarf::DW_LANG_Go:
+ case dwarf::DW_LANG_Haskell:
case dwarf::DW_LANG_OCaml:
+ case dwarf::DW_LANG_OpenCL:
+ case dwarf::DW_LANG_RenderScript:
case dwarf::DW_LANG_Rust:
- case dwarf::DW_LANG_C11:
case dwarf::DW_LANG_Swift:
- case dwarf::DW_LANG_Dylan:
- case dwarf::DW_LANG_C_plus_plus_14:
- if (dwarf::DWARF_VERSION >= 5)
+ if (DD->getDwarfVersion() >= 5)
return 0;
break;
- case dwarf::DW_LANG_Modula3:
- case dwarf::DW_LANG_Julia:
case dwarf::DW_LANG_Fortran03:
case dwarf::DW_LANG_Fortran08:
- if (dwarf::DWARF_VERSION >= 5)
+ case dwarf::DW_LANG_Julia:
+ case dwarf::DW_LANG_Modula3:
+ if (DD->getDwarfVersion() >= 5)
return 1;
break;
}
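Condensed, the rule after this hunk: C-family languages default to a lower bound of 0 and Fortran-family languages to 1, but a language code only yields a default from the DWARF version that introduced it, and that is now checked against the unit's actual version rather than the compile-time maximum dwarf::DWARF_VERSION. A hypothetical helper (not part of the patch) illustrating the behavior, with -1 meaning no default is known:

    // int64_t defaultLowerBound(dwarf::SourceLanguage Lang, unsigned Version);
    assert(defaultLowerBound(dwarf::DW_LANG_C89, 2) == 0);       // valid everywhere
    assert(defaultLowerBound(dwarf::DW_LANG_Fortran90, 2) == 1); // valid everywhere
    assert(defaultLowerBound(dwarf::DW_LANG_C99, 2) == -1);      // needs DWARF >= 3
    assert(defaultLowerBound(dwarf::DW_LANG_C99, 3) == 0);
    assert(defaultLowerBound(dwarf::DW_LANG_Rust, 5) == 0);      // new in DWARF v5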
@@ -285,13 +297,6 @@ void DwarfUnit::addDIETypeSignature(DIE &Die, uint64_t Signature) {
dwarf::DW_FORM_ref_sig8, DIEInteger(Signature));
}
-void DwarfUnit::addDIETypeSignature(DIE &Die, dwarf::Attribute Attribute,
- StringRef Identifier) {
- uint64_t Signature = DD->makeTypeSignature(Identifier);
- Die.addValue(DIEValueAllocator, Attribute, dwarf::DW_FORM_ref_sig8,
- DIEInteger(Signature));
-}
-
void DwarfUnit::addDIEEntry(DIE &Die, dwarf::Attribute Attribute,
DIEEntry Entry) {
const DIEUnit *CU = Die.getUnit();
@@ -465,50 +470,47 @@ void DwarfUnit::addBlockByrefAddress(const DbgVariable &DV, DIE &Die,
// Decode the original location, and use that as the start of the byref
// variable's location.
DIELoc *Loc = new (DIEValueAllocator) DIELoc;
- SmallVector<uint64_t, 6> DIExpr;
- DIEDwarfExpression Expr(*Asm, *this, *Loc);
-
- bool validReg;
- if (Location.isReg())
- validReg = Expr.AddMachineReg(*Asm->MF->getSubtarget().getRegisterInfo(),
- Location.getReg());
- else
- validReg =
- Expr.AddMachineRegIndirect(*Asm->MF->getSubtarget().getRegisterInfo(),
- Location.getReg(), Location.getOffset());
-
- if (!validReg)
- return;
+ DIEDwarfExpression DwarfExpr(*Asm, *this, *Loc);
+ SmallVector<uint64_t, 9> Ops;
+ if (Location.isIndirect()) {
+ Ops.push_back(dwarf::DW_OP_plus);
+ Ops.push_back(Location.getOffset());
+ Ops.push_back(dwarf::DW_OP_deref);
+ }
// If we started with a pointer to the __Block_byref... struct, then
// the first thing we need to do is dereference the pointer (DW_OP_deref).
if (isPointer)
- DIExpr.push_back(dwarf::DW_OP_deref);
+ Ops.push_back(dwarf::DW_OP_deref);
// Next add the offset for the '__forwarding' field:
// DW_OP_plus_uconst ForwardingFieldOffset. Note there's no point in
// adding the offset if it's 0.
if (forwardingFieldOffset > 0) {
- DIExpr.push_back(dwarf::DW_OP_plus);
- DIExpr.push_back(forwardingFieldOffset);
+ Ops.push_back(dwarf::DW_OP_plus);
+ Ops.push_back(forwardingFieldOffset);
}
// Now dereference the __forwarding field to get to the real __Block_byref
// struct: DW_OP_deref.
- DIExpr.push_back(dwarf::DW_OP_deref);
+ Ops.push_back(dwarf::DW_OP_deref);
// Now that we've got the real __Block_byref... struct, add the offset
// for the variable's field to get to the location of the actual variable:
// DW_OP_plus_uconst varFieldOffset. Again, don't add if it's 0.
if (varFieldOffset > 0) {
- DIExpr.push_back(dwarf::DW_OP_plus);
- DIExpr.push_back(varFieldOffset);
+ Ops.push_back(dwarf::DW_OP_plus);
+ Ops.push_back(varFieldOffset);
}
- Expr.AddExpression(makeArrayRef(DIExpr));
- Expr.finalize();
+
+ DIExpressionCursor Cursor(Ops);
+ const TargetRegisterInfo &TRI = *Asm->MF->getSubtarget().getRegisterInfo();
+ if (!DwarfExpr.addMachineRegExpression(TRI, Cursor, Location.getReg()))
+ return;
+ DwarfExpr.addExpression(std::move(Cursor));
// Now attach the location information to the DIE.
- addBlock(Die, Attribute, Loc);
+ addBlock(Die, Attribute, DwarfExpr.finalize());
}
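For a concrete picture of the buffer built above: with an indirect location at offset 4, isPointer set, forwardingFieldOffset 8 and varFieldOffset 16 (all values illustrative), the operands handed to the cursor are:

    SmallVector<uint64_t, 9> Ops = {
        dwarf::DW_OP_plus, 4,   // rebase the indirect location by its offset
        dwarf::DW_OP_deref,     // ...and load through it
        dwarf::DW_OP_deref,     // follow the pointer to the __Block_byref struct
        dwarf::DW_OP_plus, 8,   // offset of the '__forwarding' field
        dwarf::DW_OP_deref,     // follow __forwarding to the real struct
        dwarf::DW_OP_plus, 16,  // offset of the variable's field
    };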
/// Return true if type encoding is unsigned.
@@ -672,7 +674,7 @@ DIE *DwarfUnit::getOrCreateContextDIE(const DIScope *Context) {
return getDIE(Context);
}
-DIE *DwarfUnit::createTypeDIE(const DICompositeType *Ty) {
+DIE *DwarfTypeUnit::createTypeDIE(const DICompositeType *Ty) {
auto *Context = resolve(Ty->getScope());
DIE *ContextDIE = getOrCreateContextDIE(Context);
@@ -684,8 +686,7 @@ DIE *DwarfUnit::createTypeDIE(const DICompositeType *Ty) {
constructTypeDIE(TyDIE, cast<DICompositeType>(Ty));
- if (!Ty->isExternalTypeRef())
- updateAcceleratorTables(Context, Ty, TyDIE);
+ updateAcceleratorTables(Context, Ty, TyDIE);
return &TyDIE;
}
@@ -841,6 +842,13 @@ void DwarfUnit::constructTypeDIE(DIE &Buffer, const DIDerivedType *DTy) {
// Add source line info if available and TyDesc is not a forward declaration.
if (!DTy->isForwardDecl())
addSourceLine(Buffer, DTy);
+
+  // If the DWARF address space value is other than None, add it for pointer and
+ // reference types as DW_AT_address_class.
+ if (DTy->getDWARFAddressSpace() && (Tag == dwarf::DW_TAG_pointer_type ||
+ Tag == dwarf::DW_TAG_reference_type))
+ addUInt(Buffer, dwarf::DW_AT_address_class, dwarf::DW_FORM_data4,
+ DTy->getDWARFAddressSpace().getValue());
}
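As an example of what feeds this path, a pointer type in a non-default address space carries the value through its type metadata, roughly as below (illustrative IR; only the dwarfAddressSpace field matters here):

    // !1 = !DIDerivedType(tag: DW_TAG_pointer_type, baseType: !2,
    //                     size: 64, dwarfAddressSpace: 1)
    // ...which this hunk lowers to:
    //   DW_AT_address_class [DW_FORM_data4]  (0x00000001)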
void DwarfUnit::constructSubprogramArguments(DIE &Buffer, DITypeRefArray Args) {
@@ -892,13 +900,6 @@ void DwarfUnit::constructTypeDIE(DIE &Buffer, const DISubroutineType *CTy) {
}
void DwarfUnit::constructTypeDIE(DIE &Buffer, const DICompositeType *CTy) {
- if (CTy->isExternalTypeRef()) {
- StringRef Identifier = CTy->getIdentifier();
- assert(!Identifier.empty() && "external type ref without identifier");
- addFlag(Buffer, dwarf::DW_AT_declaration);
- return addDIETypeSignature(Buffer, dwarf::DW_AT_signature, Identifier);
- }
-
// Add name if not anonymous or intermediate type.
StringRef Name = CTy->getName();
@@ -1180,8 +1181,12 @@ bool DwarfUnit::applySubprogramDefinitionAttributes(const DISubprogram *SP,
}
void DwarfUnit::applySubprogramAttributes(const DISubprogram *SP, DIE &SPDie,
- bool Minimal) {
- if (!Minimal)
+ bool SkipSPAttributes) {
+  // If -fdebug-info-for-profiling is enabled, we need to emit the subprogram
+ // and its source location.
+ bool SkipSPSourceLocation = SkipSPAttributes &&
+ !CUNode->getDebugInfoForProfiling();
+ if (!SkipSPSourceLocation)
if (applySubprogramDefinitionAttributes(SP, SPDie))
return;
@@ -1189,12 +1194,13 @@ void DwarfUnit::applySubprogramAttributes(const DISubprogram *SP, DIE &SPDie,
if (!SP->getName().empty())
addString(SPDie, dwarf::DW_AT_name, SP->getName());
+ if (!SkipSPSourceLocation)
+ addSourceLine(SPDie, SP);
+
// Skip the rest of the attributes under -gmlt to save space.
- if (Minimal)
+ if (SkipSPAttributes)
return;
- addSourceLine(SPDie, SP);
-
// Add the prototype if we have a prototype and we have a C like
// language.
uint16_t Language = getLanguage();
@@ -1526,18 +1532,27 @@ DIE *DwarfUnit::getOrCreateStaticMemberDIE(const DIDerivedType *DT) {
return &StaticMemberDIE;
}
-void DwarfUnit::emitHeader(bool UseOffsets) {
+void DwarfUnit::emitCommonHeader(bool UseOffsets, dwarf::UnitType UT) {
// Emit size of content not including length itself
Asm->OutStreamer->AddComment("Length of Unit");
Asm->EmitInt32(getHeaderSize() + getUnitDie().getSize());
Asm->OutStreamer->AddComment("DWARF version number");
- Asm->EmitInt16(DD->getDwarfVersion());
- Asm->OutStreamer->AddComment("Offset Into Abbrev. Section");
+ unsigned Version = DD->getDwarfVersion();
+ Asm->EmitInt16(Version);
+
+ // DWARF v5 reorders the address size and adds a unit type.
+ if (Version >= 5) {
+ Asm->OutStreamer->AddComment("DWARF Unit Type");
+ Asm->EmitInt8(UT);
+ Asm->OutStreamer->AddComment("Address Size (in bytes)");
+ Asm->EmitInt8(Asm->getDataLayout().getPointerSize());
+ }
// We share one abbreviations table across all units so it's always at the
// start of the section. Use a relocatable offset where needed to ensure
// linking doesn't invalidate that offset.
+ Asm->OutStreamer->AddComment("Offset Into Abbrev. Section");
const TargetLoweringObjectFile &TLOF = Asm->getObjFileLowering();
if (UseOffsets)
Asm->EmitInt32(0);
@@ -1545,12 +1560,16 @@ void DwarfUnit::emitHeader(bool UseOffsets) {
Asm->emitDwarfSymbolReference(
TLOF.getDwarfAbbrevSection()->getBeginSymbol(), false);
- Asm->OutStreamer->AddComment("Address Size (in bytes)");
- Asm->EmitInt8(Asm->getDataLayout().getPointerSize());
+ if (Version <= 4) {
+ Asm->OutStreamer->AddComment("Address Size (in bytes)");
+ Asm->EmitInt8(Asm->getDataLayout().getPointerSize());
+ }
}
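Byte for byte, the two unit-header layouts this function now emits, after the initial length field and assuming DWARF32 offsets:

    // DWARF v2-v4:                    DWARF v5:
    //   uint16 version                  uint16 version
    //   uint32 debug_abbrev_offset      uint8  unit_type (DW_UT_*)
    //   uint8  address_size             uint8  address_size
    //                                   uint32 debug_abbrev_offset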
void DwarfTypeUnit::emitHeader(bool UseOffsets) {
- DwarfUnit::emitHeader(UseOffsets);
+ DwarfUnit::emitCommonHeader(UseOffsets,
+ DD->useSplitDwarf() ? dwarf::DW_UT_split_type
+ : dwarf::DW_UT_type);
Asm->OutStreamer->AddComment("Type Signature");
Asm->OutStreamer->EmitIntValue(TypeSignature, sizeof(TypeSignature));
Asm->OutStreamer->AddComment("Type DIE Offset");
@@ -1564,3 +1583,13 @@ bool DwarfTypeUnit::isDwoUnit() const {
// when split DWARF is being used.
return DD->useSplitDwarf();
}
+
+void DwarfTypeUnit::addGlobalName(StringRef Name, const DIE &Die,
+ const DIScope *Context) {
+ getCU().addGlobalNameForTypeUnit(Name, Context);
+}
+
+void DwarfTypeUnit::addGlobalType(const DIType *Ty, const DIE &Die,
+ const DIScope *Context) {
+ getCU().addGlobalTypeUnitType(Ty, Context);
+}
diff --git a/lib/CodeGen/AsmPrinter/DwarfUnit.h b/lib/CodeGen/AsmPrinter/DwarfUnit.h
index 8654d6f0caf4..d626ef920f95 100644
--- a/lib/CodeGen/AsmPrinter/DwarfUnit.h
+++ b/lib/CodeGen/AsmPrinter/DwarfUnit.h
@@ -124,12 +124,12 @@ public:
std::string getParentContextString(const DIScope *Context) const;
/// Add a new global name to the compile unit.
- virtual void addGlobalName(StringRef Name, DIE &Die, const DIScope *Context) {
- }
+ virtual void addGlobalName(StringRef Name, const DIE &Die,
+ const DIScope *Context) = 0;
/// Add a new global type to the compile unit.
virtual void addGlobalType(const DIType *Ty, const DIE &Die,
- const DIScope *Context) {}
+ const DIScope *Context) = 0;
/// Returns the DIE map slot for the specified debug variable.
///
@@ -198,9 +198,6 @@ public:
/// Add a type's DW_AT_signature and set the declaration flag.
void addDIETypeSignature(DIE &Die, uint64_t Signature);
- /// Add an attribute containing the type signature for a unique identifier.
- void addDIETypeSignature(DIE &Die, dwarf::Attribute Attribute,
- StringRef Identifier);
/// Add block data.
void addBlock(DIE &Die, dwarf::Attribute Attribute, DIELoc *Block);
@@ -256,15 +253,12 @@ public:
DIE *getOrCreateSubprogramDIE(const DISubprogram *SP, bool Minimal = false);
void applySubprogramAttributes(const DISubprogram *SP, DIE &SPDie,
- bool Minimal = false);
+ bool SkipSPAttributes = false);
/// Find existing DIE or create new DIE for the given type.
DIE *getOrCreateTypeDIE(const MDNode *N);
/// Get context owner's DIE.
- DIE *createTypeDIE(const DICompositeType *Ty);
-
- /// Get context owner's DIE.
DIE *getOrCreateContextDIE(const DIScope *Context);
/// Construct DIEs for types that contain vtables.
@@ -282,11 +276,13 @@ public:
virtual unsigned getHeaderSize() const {
return sizeof(int16_t) + // DWARF version number
sizeof(int32_t) + // Offset Into Abbrev. Section
- sizeof(int8_t); // Pointer Size (in bytes)
+ sizeof(int8_t) + // Pointer Size (in bytes)
+ (DD->getDwarfVersion() >= 5 ? sizeof(int8_t)
+ : 0); // DWARF v5 unit type
}
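Worked out, the sizes this returns, excluding the initial length field:

    //   DWARF v2-v4: 2 (version) + 4 (abbrev offset) + 1 (addr size)          = 7
    //   DWARF v5:    2 (version) + 4 (abbrev offset) + 1 (addr size) + 1 (UT) = 8
    // DwarfTypeUnit::getHeaderSize() adds 8 (type signature) + 4 (type DIE
    // offset) on top of this.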
/// Emit the header for this unit, not including the initial length field.
- virtual void emitHeader(bool UseOffsets);
+ virtual void emitHeader(bool UseOffsets) = 0;
virtual DwarfCompileUnit &getCU() = 0;
@@ -306,6 +302,14 @@ protected:
return Ref.resolve();
}
+ /// If this is a named finished type then include it in the list of types for
+ /// the accelerator tables.
+ void updateAcceleratorTables(const DIScope *Context, const DIType *Ty,
+ const DIE &TyDIE);
+
+ /// Emit the common part of the header for this unit.
+ void emitCommonHeader(bool UseOffsets, dwarf::UnitType UT);
+
private:
void constructTypeDIE(DIE &Buffer, const DIBasicType *BTy);
void constructTypeDIE(DIE &Buffer, const DIDerivedType *DTy);
@@ -330,11 +334,6 @@ private:
/// Set D as anonymous type for index which can be reused later.
void setIndexTyDie(DIE *D) { IndexTyDie = D; }
- /// If this is a named finished type then include it in the list of types for
- /// the accelerator tables.
- void updateAcceleratorTables(const DIScope *Context, const DIType *Ty,
- const DIE &TyDIE);
-
virtual bool isDwoUnit() const = 0;
};
@@ -354,12 +353,19 @@ public:
void setTypeSignature(uint64_t Signature) { TypeSignature = Signature; }
void setType(const DIE *Ty) { this->Ty = Ty; }
+ /// Get context owner's DIE.
+ DIE *createTypeDIE(const DICompositeType *Ty);
+
/// Emit the header for this unit, not including the initial length field.
void emitHeader(bool UseOffsets) override;
unsigned getHeaderSize() const override {
return DwarfUnit::getHeaderSize() + sizeof(uint64_t) + // Type Signature
sizeof(uint32_t); // Type DIE Offset
}
+ void addGlobalName(StringRef Name, const DIE &Die,
+ const DIScope *Context) override;
+ void addGlobalType(const DIType *Ty, const DIE &Die,
+ const DIScope *Context) override;
DwarfCompileUnit &getCU() override { return CU; }
};
} // end llvm namespace
diff --git a/lib/CodeGen/AsmPrinter/ErlangGCPrinter.cpp b/lib/CodeGen/AsmPrinter/ErlangGCPrinter.cpp
index 6a023b998b32..342efc3611c7 100644
--- a/lib/CodeGen/AsmPrinter/ErlangGCPrinter.cpp
+++ b/lib/CodeGen/AsmPrinter/ErlangGCPrinter.cpp
@@ -1,4 +1,4 @@
-//===-- ErlangGCPrinter.cpp - Erlang/OTP frametable emitter -----*- C++ -*-===//
+//===- ErlangGCPrinter.cpp - Erlang/OTP frametable emitter ----------------===//
//
// The LLVM Compiler Infrastructure
//
@@ -14,21 +14,19 @@
//===----------------------------------------------------------------------===//
#include "llvm/CodeGen/AsmPrinter.h"
+#include "llvm/CodeGen/GCMetadata.h"
#include "llvm/CodeGen/GCMetadataPrinter.h"
+#include "llvm/CodeGen/GCStrategy.h"
#include "llvm/CodeGen/GCs.h"
#include "llvm/IR/DataLayout.h"
#include "llvm/IR/Function.h"
-#include "llvm/IR/Instruction.h"
-#include "llvm/IR/IntrinsicInst.h"
-#include "llvm/IR/Metadata.h"
-#include "llvm/MC/MCAsmInfo.h"
+#include "llvm/IR/Module.h"
#include "llvm/MC/MCContext.h"
#include "llvm/MC/MCSectionELF.h"
#include "llvm/MC/MCStreamer.h"
#include "llvm/MC/MCSymbol.h"
-#include "llvm/Target/TargetLoweringObjectFile.h"
-#include "llvm/Target/TargetMachine.h"
-#include "llvm/Target/TargetSubtargetInfo.h"
+#include "llvm/Target/TargetLoweringObjectFile.h"
+#include "llvm/Support/ELF.h"
using namespace llvm;
@@ -38,13 +36,12 @@ class ErlangGCPrinter : public GCMetadataPrinter {
public:
void finishAssembly(Module &M, GCModuleInfo &Info, AsmPrinter &AP) override;
};
-}
+
+} // end anonymous namespace
static GCMetadataPrinterRegistry::Add<ErlangGCPrinter>
X("erlang", "erlang-compatible garbage collector");
-void llvm::linkErlangGCPrinter() {}
-
void ErlangGCPrinter::finishAssembly(Module &M, GCModuleInfo &Info,
AsmPrinter &AP) {
MCStreamer &OS = *AP.OutStreamer;
@@ -121,3 +118,5 @@ void ErlangGCPrinter::finishAssembly(Module &M, GCModuleInfo &Info,
}
}
}
+
+void llvm::linkErlangGCPrinter() {}
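Moving linkErlangGCPrinter() below the class is purely cosmetic; the function is an empty link-time anchor. A sketch of the consuming side, with a hypothetical helper name:

    #include "llvm/CodeGen/GCs.h"

    // Referencing the anchor pulls this object file into the link, which runs
    // the static GCMetadataPrinterRegistry::Add<ErlangGCPrinter> initializer
    // above and thereby registers the "erlang" printer.
    void initGCPrinters() { // hypothetical tool-side helper
      llvm::linkErlangGCPrinter();
    }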
diff --git a/lib/CodeGen/AsmPrinter/WinException.cpp b/lib/CodeGen/AsmPrinter/WinException.cpp
index 9d7c96a1b8ef..704f0ac2f191 100644
--- a/lib/CodeGen/AsmPrinter/WinException.cpp
+++ b/lib/CodeGen/AsmPrinter/WinException.cpp
@@ -68,7 +68,7 @@ void WinException::beginFunction(const MachineFunction *MF) {
const Function *F = MF->getFunction();
- shouldEmitMoves = Asm->needsSEHMoves();
+ shouldEmitMoves = Asm->needsSEHMoves() && MF->hasWinCFI();
const TargetLoweringObjectFile &TLOF = Asm->getObjFileLowering();
unsigned PerEncoding = TLOF.getPersonalityEncoding();
@@ -94,7 +94,7 @@ void WinException::beginFunction(const MachineFunction *MF) {
// If we're not using CFI, we don't want the CFI or the personality, but we
// might want EH tables if we had EH pads.
- if (!Asm->MAI->usesWindowsCFI() || (!MF->hasWinCFI() && !PerFn)) {
+ if (!Asm->MAI->usesWindowsCFI()) {
if (Per == EHPersonality::MSVC_X86SEH && !hasEHFunclets) {
// If this is 32-bit SEH and we don't have any funclets (really invokes),
// make sure we emit the parent offset label. Some unreferenced filter
diff --git a/lib/CodeGen/AtomicExpandPass.cpp b/lib/CodeGen/AtomicExpandPass.cpp
index bf5cf105a8f8..9c19a4fd3c3e 100644
--- a/lib/CodeGen/AtomicExpandPass.cpp
+++ b/lib/CodeGen/AtomicExpandPass.cpp
@@ -1532,7 +1532,7 @@ bool AtomicExpand::expandAtomicOpToLibcall(
Type *ResultTy;
SmallVector<Value *, 6> Args;
- AttributeSet Attr;
+ AttributeList Attr;
// 'size' argument.
if (!UseSizedLibcall) {
@@ -1593,7 +1593,7 @@ bool AtomicExpand::expandAtomicOpToLibcall(
// Now, the return type.
if (CASExpected) {
ResultTy = Type::getInt1Ty(Ctx);
- Attr = Attr.addAttribute(Ctx, AttributeSet::ReturnIndex, Attribute::ZExt);
+ Attr = Attr.addAttribute(Ctx, AttributeList::ReturnIndex, Attribute::ZExt);
} else if (HasResult && UseSizedLibcall)
ResultTy = SizedIntTy;
else
diff --git a/lib/CodeGen/BranchCoalescing.cpp b/lib/CodeGen/BranchCoalescing.cpp
new file mode 100644
index 000000000000..efdf300df850
--- /dev/null
+++ b/lib/CodeGen/BranchCoalescing.cpp
@@ -0,0 +1,758 @@
+//===-- CoalesceBranches.cpp - Coalesce blocks with the same condition ---===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+///
+/// \file
+/// Coalesce basic blocks guarded by the same branch condition into a single
+/// basic block.
+///
+//===----------------------------------------------------------------------===//
+
+#include "llvm/ADT/BitVector.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/CodeGen/MachineDominators.h"
+#include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/CodeGen/MachinePostDominators.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/CodeGen/Passes.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Target/TargetFrameLowering.h"
+#include "llvm/Target/TargetInstrInfo.h"
+#include "llvm/Target/TargetSubtargetInfo.h"
+
+using namespace llvm;
+
+#define DEBUG_TYPE "coal-branch"
+
+static cl::opt<cl::boolOrDefault>
+ EnableBranchCoalescing("enable-branch-coalesce", cl::Hidden,
+ cl::desc("enable coalescing of duplicate branches"));
+
+STATISTIC(NumBlocksCoalesced, "Number of blocks coalesced");
+STATISTIC(NumPHINotMoved, "Number of PHI Nodes that cannot be merged");
+STATISTIC(NumBlocksNotCoalesced, "Number of blocks not coalesced");
+
+//===----------------------------------------------------------------------===//
+// BranchCoalescing
+//===----------------------------------------------------------------------===//
+///
+/// Improve scheduling by coalescing branches that depend on the same condition.
+/// This pass looks for blocks that are guarded by the same branch condition
+/// and attempts to merge the blocks together. Such opportunities arise from
+/// the expansion of select statements in the IR.
+///
+/// For example, consider the following LLVM IR:
+///
+///     %test = icmp eq i32 %x, 0
+/// %tmp1 = select i1 %test, double %a, double 2.000000e-03
+/// %tmp2 = select i1 %test, double %b, double 5.000000e-03
+///
+/// This IR expands to the following machine code on PowerPC:
+///
+/// BB#0: derived from LLVM BB %entry
+/// Live Ins: %F1 %F3 %X6
+/// <SNIP1>
+/// %vreg0<def> = COPY %F1; F8RC:%vreg0
+/// %vreg5<def> = CMPLWI %vreg4<kill>, 0; CRRC:%vreg5 GPRC:%vreg4
+/// %vreg8<def> = LXSDX %ZERO8, %vreg7<kill>, %RM<imp-use>;
+/// mem:LD8[ConstantPool] F8RC:%vreg8 G8RC:%vreg7
+/// BCC 76, %vreg5, <BB#2>; CRRC:%vreg5
+/// Successors according to CFG: BB#1(?%) BB#2(?%)
+///
+/// BB#1: derived from LLVM BB %entry
+/// Predecessors according to CFG: BB#0
+/// Successors according to CFG: BB#2(?%)
+///
+/// BB#2: derived from LLVM BB %entry
+/// Predecessors according to CFG: BB#0 BB#1
+/// %vreg9<def> = PHI %vreg8, <BB#1>, %vreg0, <BB#0>;
+/// F8RC:%vreg9,%vreg8,%vreg0
+/// <SNIP2>
+/// BCC 76, %vreg5, <BB#4>; CRRC:%vreg5
+/// Successors according to CFG: BB#3(?%) BB#4(?%)
+///
+/// BB#3: derived from LLVM BB %entry
+/// Predecessors according to CFG: BB#2
+/// Successors according to CFG: BB#4(?%)
+///
+/// BB#4: derived from LLVM BB %entry
+/// Predecessors according to CFG: BB#2 BB#3
+/// %vreg13<def> = PHI %vreg12, <BB#3>, %vreg2, <BB#2>;
+/// F8RC:%vreg13,%vreg12,%vreg2
+/// <SNIP3>
+/// BLR8 %LR8<imp-use>, %RM<imp-use>, %F1<imp-use>
+///
+/// When this pattern is detected, branch coalescing will try to collapse
+/// it by moving code in BB#2 to BB#0 and/or BB#4 and removing BB#3.
+///
+/// If all conditions are met, the IR should collapse to:
+///
+/// BB#0: derived from LLVM BB %entry
+/// Live Ins: %F1 %F3 %X6
+/// <SNIP1>
+/// %vreg0<def> = COPY %F1; F8RC:%vreg0
+/// %vreg5<def> = CMPLWI %vreg4<kill>, 0; CRRC:%vreg5 GPRC:%vreg4
+/// %vreg8<def> = LXSDX %ZERO8, %vreg7<kill>, %RM<imp-use>;
+/// mem:LD8[ConstantPool] F8RC:%vreg8 G8RC:%vreg7
+/// <SNIP2>
+/// BCC 76, %vreg5, <BB#4>; CRRC:%vreg5
+/// Successors according to CFG: BB#1(0x2aaaaaaa / 0x80000000 = 33.33%)
+/// BB#4(0x55555554 / 0x80000000 = 66.67%)
+///
+/// BB#1: derived from LLVM BB %entry
+/// Predecessors according to CFG: BB#0
+/// Successors according to CFG: BB#4(0x40000000 / 0x80000000 = 50.00%)
+///
+/// BB#4: derived from LLVM BB %entry
+/// Predecessors according to CFG: BB#0 BB#1
+/// %vreg9<def> = PHI %vreg8, <BB#1>, %vreg0, <BB#0>;
+/// F8RC:%vreg9,%vreg8,%vreg0
+/// %vreg13<def> = PHI %vreg12, <BB#1>, %vreg2, <BB#0>;
+/// F8RC:%vreg13,%vreg12,%vreg2
+/// <SNIP3>
+/// BLR8 %LR8<imp-use>, %RM<imp-use>, %F1<imp-use>
+///
+/// Branch coalescing does not split blocks; it moves everything in the same
+/// direction, ensuring it does not break use/definition semantics.
+///
+/// PHI nodes and their corresponding use instructions are moved to the
+/// successor block if they have no uses within the successor block's PHI
+/// nodes. PHI node ordering cannot be assumed.
+///
+/// Non-PHI instructions can be moved up to the predecessor basic block or
+/// down into the successor basic block, following any PHI instructions.
+/// Whether an instruction moves up or down depends on whether the register(s)
+/// it defines are used in the current block or in any PHI instructions at the
+/// beginning of the successor block.
+
+namespace {
+
+class BranchCoalescing : public MachineFunctionPass {
+ struct CoalescingCandidateInfo {
+ MachineBasicBlock *BranchBlock; // Block containing the branch
+ MachineBasicBlock *BranchTargetBlock; // Block branched to
+ MachineBasicBlock *FallThroughBlock; // Fall-through if branch not taken
+ SmallVector<MachineOperand, 4> Cond;
+ bool MustMoveDown;
+ bool MustMoveUp;
+
+ CoalescingCandidateInfo();
+ void clear();
+ };
+
+ MachineDominatorTree *MDT;
+ MachinePostDominatorTree *MPDT;
+ const TargetInstrInfo *TII;
+ MachineRegisterInfo *MRI;
+
+ void initialize(MachineFunction &F);
+ bool canCoalesceBranch(CoalescingCandidateInfo &Cand);
+ bool identicalOperands(ArrayRef<MachineOperand> OperandList1,
+ ArrayRef<MachineOperand> OperandList2) const;
+ bool validateCandidates(CoalescingCandidateInfo &SourceRegion,
+ CoalescingCandidateInfo &TargetRegion) const;
+
+ static bool isBranchCoalescingEnabled() {
+ return EnableBranchCoalescing == cl::BOU_TRUE;
+ }
+
+public:
+ static char ID;
+
+ BranchCoalescing() : MachineFunctionPass(ID) {
+ initializeBranchCoalescingPass(*PassRegistry::getPassRegistry());
+ }
+
+ void getAnalysisUsage(AnalysisUsage &AU) const override {
+ AU.addRequired<MachineDominatorTree>();
+ AU.addRequired<MachinePostDominatorTree>();
+ MachineFunctionPass::getAnalysisUsage(AU);
+ }
+
+ StringRef getPassName() const override { return "Branch Coalescing"; }
+
+ bool mergeCandidates(CoalescingCandidateInfo &SourceRegion,
+ CoalescingCandidateInfo &TargetRegion);
+ bool canMoveToBeginning(const MachineInstr &MI,
+ const MachineBasicBlock &MBB) const;
+ bool canMoveToEnd(const MachineInstr &MI,
+ const MachineBasicBlock &MBB) const;
+ bool canMerge(CoalescingCandidateInfo &SourceRegion,
+ CoalescingCandidateInfo &TargetRegion) const;
+ void moveAndUpdatePHIs(MachineBasicBlock *SourceRegionMBB,
+ MachineBasicBlock *TargetRegionMBB);
+ bool runOnMachineFunction(MachineFunction &MF) override;
+};
+} // End anonymous namespace.
+
+char BranchCoalescing::ID = 0;
+char &llvm::BranchCoalescingID = BranchCoalescing::ID;
+
+INITIALIZE_PASS_BEGIN(BranchCoalescing, "branch-coalescing",
+ "Branch Coalescing", false, false)
+INITIALIZE_PASS_DEPENDENCY(MachineDominatorTree)
+INITIALIZE_PASS_DEPENDENCY(MachinePostDominatorTree)
+INITIALIZE_PASS_END(BranchCoalescing, "branch-coalescing", "Branch Coalescing",
+ false, false)
+
+BranchCoalescing::CoalescingCandidateInfo::CoalescingCandidateInfo()
+ : BranchBlock(nullptr), BranchTargetBlock(nullptr),
+ FallThroughBlock(nullptr), MustMoveDown(false), MustMoveUp(false) {}
+
+void BranchCoalescing::CoalescingCandidateInfo::clear() {
+ BranchBlock = nullptr;
+ BranchTargetBlock = nullptr;
+ FallThroughBlock = nullptr;
+ Cond.clear();
+ MustMoveDown = false;
+ MustMoveUp = false;
+}
+
+void BranchCoalescing::initialize(MachineFunction &MF) {
+ MDT = &getAnalysis<MachineDominatorTree>();
+ MPDT = &getAnalysis<MachinePostDominatorTree>();
+ TII = MF.getSubtarget().getInstrInfo();
+ MRI = &MF.getRegInfo();
+}
+
+///
+/// Analyze the branch statement of the given candidate to determine whether
+/// it can be coalesced. If the branch can be coalesced, the BranchTargetBlock
+/// and the FallThroughBlock are recorded in the specified Candidate.
+///
+///\param[in,out] Cand The coalescing candidate to analyze
+///\return true if and only if the branch can be coalesced, false otherwise
+///
+bool BranchCoalescing::canCoalesceBranch(CoalescingCandidateInfo &Cand) {
+ DEBUG(dbgs() << "Determine if branch block " << Cand.BranchBlock->getNumber()
+ << " can be coalesced:");
+ MachineBasicBlock *FalseMBB = nullptr;
+
+ if (TII->analyzeBranch(*Cand.BranchBlock, Cand.BranchTargetBlock, FalseMBB,
+ Cand.Cond)) {
+ DEBUG(dbgs() << "TII unable to Analyze Branch - skip\n");
+ return false;
+ }
+
+ for (auto &I : Cand.BranchBlock->terminators()) {
+ DEBUG(dbgs() << "Looking at terminator : " << I << "\n");
+ if (!I.isBranch())
+ continue;
+
+ if (I.getNumOperands() != I.getNumExplicitOperands()) {
+ DEBUG(dbgs() << "Terminator contains implicit operands - skip : " << I
+ << "\n");
+ return false;
+ }
+ }
+
+ if (Cand.BranchBlock->isEHPad() || Cand.BranchBlock->hasEHPadSuccessor()) {
+ DEBUG(dbgs() << "EH Pad - skip\n");
+ return false;
+ }
+
+ // For now only consider triangles (i.e., BranchTargetBlock is set,
+ // FalseMBB is null, and BranchTargetBlock is a successor to BranchBlock)
+ if (!Cand.BranchTargetBlock || FalseMBB ||
+ !Cand.BranchBlock->isSuccessor(Cand.BranchTargetBlock)) {
+ DEBUG(dbgs() << "Does not form a triangle - skip\n");
+ return false;
+ }
+
+ // Ensure there are only two successors
+ if (Cand.BranchBlock->succ_size() != 2) {
+ DEBUG(dbgs() << "Does not have 2 successors - skip\n");
+ return false;
+ }
+
+ // Sanity check - the block must be able to fall through
+ assert(Cand.BranchBlock->canFallThrough() &&
+ "Expecting the block to fall through!");
+
+ // We have already ensured there are exactly two successors to
+ // BranchBlock and that BranchTargetBlock is a successor to BranchBlock.
+ // Ensure the single fall-through block is empty.
+ MachineBasicBlock *Succ =
+ (*Cand.BranchBlock->succ_begin() == Cand.BranchTargetBlock)
+ ? *Cand.BranchBlock->succ_rbegin()
+ : *Cand.BranchBlock->succ_begin();
+
+ assert(Succ && "Expecting a valid fall-through block\n");
+
+ if (!Succ->empty()) {
+ DEBUG(dbgs() << "Fall-through block contains code -- skip\n");
+ return false;
+ }
+
+ if (!Succ->isSuccessor(Cand.BranchTargetBlock)) {
+ DEBUG(dbgs()
+ << "Successor of fall-through block is not the branch-taken block\n");
+ return false;
+ }
+
+ Cand.FallThroughBlock = Succ;
+ DEBUG(dbgs() << "Valid Candidate\n");
+ return true;
+}
+
+///
+/// Determine if the two operand lists are identical
+///
+/// \param[in] OpList1 operand list
+/// \param[in] OpList2 operand list
+/// \return true if and only if the operand lists are identical
+///
+bool BranchCoalescing::identicalOperands(
+ ArrayRef<MachineOperand> OpList1, ArrayRef<MachineOperand> OpList2) const {
+
+ if (OpList1.size() != OpList2.size()) {
+ DEBUG(dbgs() << "Operand list is different size\n");
+ return false;
+ }
+
+ for (unsigned i = 0; i < OpList1.size(); ++i) {
+ const MachineOperand &Op1 = OpList1[i];
+ const MachineOperand &Op2 = OpList2[i];
+
+ DEBUG(dbgs() << "Op1: " << Op1 << "\n"
+ << "Op2: " << Op2 << "\n");
+
+ if (Op1.isIdenticalTo(Op2)) {
+ DEBUG(dbgs() << "Op1 and Op2 are identical!\n");
+ continue;
+ }
+
+ // If the operands are not identical, but are registers, check to see if the
+ // definition of the register produces the same value. If they produce the
+ // same value, consider them to be identical.
+ if (Op1.isReg() && Op2.isReg() &&
+ TargetRegisterInfo::isVirtualRegister(Op1.getReg()) &&
+ TargetRegisterInfo::isVirtualRegister(Op2.getReg())) {
+ MachineInstr *Op1Def = MRI->getVRegDef(Op1.getReg());
+ MachineInstr *Op2Def = MRI->getVRegDef(Op2.getReg());
+ if (TII->produceSameValue(*Op1Def, *Op2Def, MRI)) {
+ DEBUG(dbgs() << "Op1Def: " << *Op1Def << " and " << *Op2Def
+ << " produce the same value!\n");
+ } else {
+ DEBUG(dbgs() << "Operands produce different values\n");
+ return false;
+ }
+ } else {
+ DEBUG(dbgs() << "The operands are not provably identical.\n");
+ return false;
+ }
+ }
+ return true;
+}
+
+///
+/// Moves ALL PHI instructions in SourceMBB to the beginning of TargetMBB
+/// and updates them to refer to the new block. PHI node ordering cannot be
+/// assumed, so it does not matter where in TargetMBB the PHI instructions
+/// are moved to.
+///
+/// \param[in] SourceMBB block to move PHI instructions from
+/// \param[in] TargetMBB block to move PHI instructions to
+///
+void BranchCoalescing::moveAndUpdatePHIs(MachineBasicBlock *SourceMBB,
+ MachineBasicBlock *TargetMBB) {
+
+ MachineBasicBlock::iterator MI = SourceMBB->begin();
+ MachineBasicBlock::iterator ME = SourceMBB->getFirstNonPHI();
+
+ if (MI == ME) {
+ DEBUG(dbgs() << "SourceMBB contains no PHI instructions.\n");
+ return;
+ }
+
+ // Update all PHI instructions in SourceMBB and move to top of TargetMBB
+ for (MachineBasicBlock::iterator Iter = MI; Iter != ME; ++Iter) {
+ MachineInstr &PHIInst = *Iter;
+ for (unsigned i = 2, e = PHIInst.getNumOperands() + 1; i != e; i += 2) {
+ MachineOperand &MO = PHIInst.getOperand(i);
+ if (MO.getMBB() == SourceMBB)
+ MO.setMBB(TargetMBB);
+ }
+ }
+ TargetMBB->splice(TargetMBB->begin(), SourceMBB, MI, ME);
+}
+
+///
+/// This function checks if MI can be moved to the beginning of TargetMBB,
+/// following its PHI instructions. An instruction can be moved to the
+/// beginning of TargetMBB if none of its definitions are used by PHI nodes
+/// within TargetMBB.
+///
+/// \param[in] MI the machine instruction to move.
+/// \param[in] TargetMBB the machine basic block to move to
+/// \return true if it is safe to move MI to beginning of TargetMBB,
+/// false otherwise.
+///
+bool BranchCoalescing::canMoveToBeginning(const MachineInstr &MI,
+ const MachineBasicBlock &TargetMBB) const {
+
+ DEBUG(dbgs() << "Checking if " << MI << " can move to beginning of "
+ << TargetMBB.getNumber() << "\n");
+
+ for (auto &Def : MI.defs()) { // For each register MI defines
+ for (auto &Use : MRI->use_instructions(Def.getReg())) {
+ if (Use.isPHI() && Use.getParent() == &TargetMBB) {
+ DEBUG(dbgs() << " *** used in a PHI -- cannot move ***\n");
+ return false;
+ }
+ }
+ }
+
+ DEBUG(dbgs() << " Safe to move to the beginning.\n");
+ return true;
+}
+
+///
+/// This function checks if MI can be moved to the end of TargetMBB,
+/// immediately before the first terminator. An instruction can be moved to
+/// the end of TargetMBB if no PHI node in MI's own block defines a value
+/// that MI uses.
+///
+/// \param[in] MI the machine instruction to move.
+/// \param[in] TargetMBB the machine basic block to move to
+/// \return true if it is safe to move MI to end of TargetMBB,
+/// false otherwise.
+///
+bool BranchCoalescing::canMoveToEnd(const MachineInstr &MI,
+ const MachineBasicBlock &TargetMBB) const {
+
+ DEBUG(dbgs() << "Checking if " << MI << " can move to end of "
+ << TargetMBB.getNumber() << "\n");
+
+ for (auto &Use : MI.uses()) {
+ if (Use.isReg() && TargetRegisterInfo::isVirtualRegister(Use.getReg())) {
+ MachineInstr *DefInst = MRI->getVRegDef(Use.getReg());
+ if (DefInst->isPHI() && DefInst->getParent() == MI.getParent()) {
+ DEBUG(dbgs() << " *** Cannot move this instruction ***\n");
+ return false;
+ } else {
+ DEBUG(dbgs() << " *** def is in another block -- safe to move!\n");
+ }
+ }
+ }
+
+ DEBUG(dbgs() << " Safe to move to the end.\n");
+ return true;
+}
+
+///
+/// This method checks that the two coalescing candidates follow the
+/// expected pattern required for coalescing.
+///
+/// \param[in] SourceRegion The candidate to move statements from
+/// \param[in] TargetRegion The candidate to move statements to
+/// \return true if all instructions in SourceRegion.BranchBlock can be merged
+/// into a block in TargetRegion; false otherwise.
+///
+bool BranchCoalescing::validateCandidates(
+ CoalescingCandidateInfo &SourceRegion,
+ CoalescingCandidateInfo &TargetRegion) const {
+
+ if (TargetRegion.BranchTargetBlock != SourceRegion.BranchBlock)
+ llvm_unreachable("Expecting SourceRegion to immediately follow TargetRegion");
+ else if (!MDT->dominates(TargetRegion.BranchBlock, SourceRegion.BranchBlock))
+ llvm_unreachable("Expecting TargetRegion to dominate SourceRegion");
+ else if (!MPDT->dominates(SourceRegion.BranchBlock, TargetRegion.BranchBlock))
+ llvm_unreachable("Expecting SourceRegion to post-dominate TargetRegion");
+ else if (!TargetRegion.FallThroughBlock->empty() ||
+ !SourceRegion.FallThroughBlock->empty())
+ llvm_unreachable("Expecting fall-through blocks to be empty");
+
+ return true;
+}
+
+///
+/// This method determines whether the two coalescing candidates can be merged.
+/// In order to be merged, every instruction must be able to either:
+/// 1. Move to the beginning of the SourceRegion.BranchTargetBlock; or
+/// 2. Move to the end of the TargetRegion.BranchBlock.
+/// Merging involves moving the instructions in the
+/// TargetRegion.BranchTargetBlock (also SourceRegion.BranchBlock).
+///
+/// This function first tries to move instructions from the
+/// TargetRegion.BranchTargetBlock down, to the beginning of the
+/// SourceRegion.BranchTargetBlock. This is not possible if any register defined
+/// in TargetRegion.BranchTargetBlock is used in a PHI node in the
+/// SourceRegion.BranchTargetBlock. In this case, check whether the statement
+/// can be moved up, to the end of the TargetRegion.BranchBlock (immediately
+/// before the branch statement). If it cannot move, then these blocks cannot
+/// be merged.
+///
+/// Note that there is no analysis for moving instructions past the fall-through
+/// blocks because they are confirmed to be empty. An assertion fires if they
+/// are not.
+///
+/// \param[in] SourceRegion The candidate to move statements from
+/// \param[in] TargetRegion The candidate to move statements to
+/// \return true if all instructions in SourceRegion.BranchBlock can be merged
+/// into a block in TargetRegion, false otherwise.
+///
+bool BranchCoalescing::canMerge(CoalescingCandidateInfo &SourceRegion,
+ CoalescingCandidateInfo &TargetRegion) const {
+ if (!validateCandidates(SourceRegion, TargetRegion))
+ return false;
+
+ // Walk through PHI nodes first and see if they force the merge into the
+ // SourceRegion.BranchTargetBlock.
+ for (MachineBasicBlock::iterator
+ I = SourceRegion.BranchBlock->instr_begin(),
+ E = SourceRegion.BranchBlock->getFirstNonPHI();
+ I != E; ++I) {
+ for (auto &Def : I->defs())
+ for (auto &Use : MRI->use_instructions(Def.getReg())) {
+ if (Use.isPHI() && Use.getParent() == SourceRegion.BranchTargetBlock) {
+ DEBUG(dbgs() << "PHI " << *I << " defines register used in another "
+ "PHI within branch target block -- can't merge\n");
+ NumPHINotMoved++;
+ return false;
+ }
+ if (Use.getParent() == SourceRegion.BranchBlock) {
+ DEBUG(dbgs() << "PHI " << *I
+ << " defines register used in this "
+ "block -- all must move down\n");
+ SourceRegion.MustMoveDown = true;
+ }
+ }
+ }
+
+ // Walk through the MI to see if they should be merged into
+ // TargetRegion.BranchBlock (up) or SourceRegion.BranchTargetBlock (down)
+ for (MachineBasicBlock::iterator
+ I = SourceRegion.BranchBlock->getFirstNonPHI(),
+ E = SourceRegion.BranchBlock->end();
+ I != E; ++I) {
+ if (!canMoveToBeginning(*I, *SourceRegion.BranchTargetBlock)) {
+ DEBUG(dbgs() << "Instruction " << *I
+ << " cannot move down - must move up!\n");
+ SourceRegion.MustMoveUp = true;
+ }
+ if (!canMoveToEnd(*I, *TargetRegion.BranchBlock)) {
+ DEBUG(dbgs() << "Instruction " << *I
+ << " cannot move up - must move down!\n");
+ SourceRegion.MustMoveDown = true;
+ }
+ }
+
+ return !(SourceRegion.MustMoveUp && SourceRegion.MustMoveDown);
+}
+
+/// Merge the instructions from SourceRegion.BranchBlock,
+/// SourceRegion.BranchTargetBlock, and SourceRegion.FallThroughBlock into
+/// TargetRegion.BranchBlock, TargetRegion.BranchTargetBlock and
+/// TargetRegion.FallThroughBlock respectively.
+///
+/// The successors for blocks in TargetRegion will be updated to use the
+/// successors from blocks in SourceRegion. Finally, the blocks in SourceRegion
+/// will be removed from the function.
+///
+/// A region consists of a BranchBlock, a FallThroughBlock, and a
+/// BranchTargetBlock. Branch coalescing works on patterns where the
+/// TargetRegion's BranchTargetBlock is also the SourceRegion's
+/// BranchBlock.
+///
+/// Before mergeCandidates:
+///
+/// +---------------------------+
+/// | TargetRegion.BranchBlock |
+/// +---------------------------+
+/// / |
+/// / +--------------------------------+
+/// | | TargetRegion.FallThroughBlock |
+/// \ +--------------------------------+
+/// \ |
+/// +----------------------------------+
+/// | TargetRegion.BranchTargetBlock |
+/// | SourceRegion.BranchBlock |
+/// +----------------------------------+
+/// / |
+/// / +--------------------------------+
+/// | | SourceRegion.FallThroughBlock |
+/// \ +--------------------------------+
+/// \ |
+/// +----------------------------------+
+/// | SourceRegion.BranchTargetBlock |
+/// +----------------------------------+
+///
+/// After mergeCandidates:
+///
+/// +-----------------------------+
+/// | TargetRegion.BranchBlock |
+/// | SourceRegion.BranchBlock |
+/// +-----------------------------+
+/// / |
+/// / +---------------------------------+
+/// | | TargetRegion.FallThroughBlock |
+/// | | SourceRegion.FallThroughBlock |
+/// \ +---------------------------------+
+/// \ |
+/// +----------------------------------+
+/// | SourceRegion.BranchTargetBlock |
+/// +----------------------------------+
+///
+/// \param[in] SourceRegion The candidate to move blocks from
+/// \param[in] TargetRegion The candidate to move blocks to
+///
+bool BranchCoalescing::mergeCandidates(CoalescingCandidateInfo &SourceRegion,
+ CoalescingCandidateInfo &TargetRegion) {
+
+ if (SourceRegion.MustMoveUp && SourceRegion.MustMoveDown)
+ llvm_unreachable("Cannot have both MustMoveDown and MustMoveUp set!");
+
+ if (!validateCandidates(SourceRegion, TargetRegion))
+ return false;
+
+ // Start the merging process by first handling the BranchBlock.
+ // Move any PHIs in SourceRegion.BranchBlock down to the branch-taken block
+ moveAndUpdatePHIs(SourceRegion.BranchBlock, SourceRegion.BranchTargetBlock);
+
+ // Move remaining instructions in SourceRegion.BranchBlock into
+ // TargetRegion.BranchBlock
+ MachineBasicBlock::iterator firstInstr =
+ SourceRegion.BranchBlock->getFirstNonPHI();
+ MachineBasicBlock::iterator lastInstr =
+ SourceRegion.BranchBlock->getFirstTerminator();
+
+ MachineBasicBlock *Source = SourceRegion.MustMoveDown
+ ? SourceRegion.BranchTargetBlock
+ : TargetRegion.BranchBlock;
+
+ MachineBasicBlock::iterator Target =
+ SourceRegion.MustMoveDown
+ ? SourceRegion.BranchTargetBlock->getFirstNonPHI()
+ : TargetRegion.BranchBlock->getFirstTerminator();
+
+ Source->splice(Target, SourceRegion.BranchBlock, firstInstr, lastInstr);
+
+ // Once PHI and instructions have been moved we need to clean up the
+ // control flow.
+
+ // Remove SourceRegion.FallThroughBlock before transferring successors of
+ // SourceRegion.BranchBlock to TargetRegion.BranchBlock.
+ SourceRegion.BranchBlock->removeSuccessor(SourceRegion.FallThroughBlock);
+ TargetRegion.BranchBlock->transferSuccessorsAndUpdatePHIs(
+ SourceRegion.BranchBlock);
+ // Update branch in TargetRegion.BranchBlock to jump to
+ // SourceRegion.BranchTargetBlock
+ // In this case, TargetRegion.BranchTargetBlock == SourceRegion.BranchBlock.
+ TargetRegion.BranchBlock->ReplaceUsesOfBlockWith(
+ SourceRegion.BranchBlock, SourceRegion.BranchTargetBlock);
+ // Remove the branch statement(s) in SourceRegion.BranchBlock
+ MachineBasicBlock::iterator I =
+ SourceRegion.BranchBlock->terminators().begin();
+ while (I != SourceRegion.BranchBlock->terminators().end()) {
+ MachineInstr &CurrInst = *I;
+ ++I;
+ if (CurrInst.isBranch())
+ CurrInst.eraseFromParent();
+ }
+
+ // Fall-through block should be empty since this is part of the condition
+ // to coalesce the branches.
+ assert(TargetRegion.FallThroughBlock->empty() &&
+ "FallThroughBlocks should be empty!");
+
+ // Transfer successor information and move PHIs down to the
+ // branch-taken block.
+ TargetRegion.FallThroughBlock->transferSuccessorsAndUpdatePHIs(
+ SourceRegion.FallThroughBlock);
+ TargetRegion.FallThroughBlock->removeSuccessor(SourceRegion.BranchBlock);
+
+ // Remove the blocks from the function.
+ assert(SourceRegion.BranchBlock->empty() &&
+ "Expecting branch block to be empty!");
+ SourceRegion.BranchBlock->eraseFromParent();
+
+ assert(SourceRegion.FallThroughBlock->empty() &&
+ "Expecting fall-through block to be empty!\n");
+ SourceRegion.FallThroughBlock->eraseFromParent();
+
+ NumBlocksCoalesced++;
+ return true;
+}
+
+bool BranchCoalescing::runOnMachineFunction(MachineFunction &MF) {
+
+ if (skipFunction(*MF.getFunction()) || MF.empty() ||
+ !isBranchCoalescingEnabled())
+ return false;
+
+ bool didSomething = false;
+
+ DEBUG(dbgs() << "******** Branch Coalescing ********\n");
+ initialize(MF);
+
+ DEBUG(dbgs() << "Function: "; MF.dump(); dbgs() << "\n");
+
+ CoalescingCandidateInfo Cand1, Cand2;
+ // Walk over blocks and find candidates to merge
+ // Continue trying to merge with the first candidate found, as long as merging
+ // is successful.
+ for (MachineBasicBlock &MBB : MF) {
+ bool MergedCandidates = false;
+ do {
+ MergedCandidates = false;
+ Cand1.clear();
+ Cand2.clear();
+
+ Cand1.BranchBlock = &MBB;
+
+ // If unable to coalesce the branch, then continue to next block
+ if (!canCoalesceBranch(Cand1))
+ break;
+
+ Cand2.BranchBlock = Cand1.BranchTargetBlock;
+ if (!canCoalesceBranch(Cand2))
+ break;
+
+ // Sanity check
+ // The branch-taken block of the second candidate should post-dominate the
+ // first candidate
+ assert(MPDT->dominates(Cand2.BranchTargetBlock, Cand1.BranchBlock) &&
+ "Branch-taken block should post-dominate first candidate");
+
+ if (!identicalOperands(Cand1.Cond, Cand2.Cond)) {
+ DEBUG(dbgs() << "Blocks " << Cand1.BranchBlock->getNumber() << " and "
+ << Cand2.BranchBlock->getNumber()
+ << " have different branches\n");
+ break;
+ }
+ if (!canMerge(Cand2, Cand1)) {
+ DEBUG(dbgs() << "Cannot merge blocks " << Cand1.BranchBlock->getNumber()
+ << " and " << Cand2.BranchBlock->getNumber() << "\n");
+ NumBlocksNotCoalesced++;
+ continue;
+ }
+ DEBUG(dbgs() << "Merging blocks " << Cand1.BranchBlock->getNumber()
+ << " and " << Cand1.BranchTargetBlock->getNumber() << "\n");
+ MergedCandidates = mergeCandidates(Cand2, Cand1);
+ if (MergedCandidates)
+ didSomething = true;
+
+ DEBUG(dbgs() << "Function after merging: "; MF.dump(); dbgs() << "\n");
+ } while (MergedCandidates);
+ }
+
+#ifndef NDEBUG
+ // Verify MF is still valid after branch coalescing
+ if (didSomething)
+ MF.verify(nullptr, "Error in code produced by branch coalescing");
+#endif // NDEBUG
+
+ DEBUG(dbgs() << "Finished Branch Coalescing\n");
+ return didSomething;
+}
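The pass is registered but inert unless -enable-branch-coalesce is passed (see isBranchCoalescingEnabled above). A sketch of how a target could also schedule it, assuming the standard TargetPassConfig override points; MyTargetPassConfig is hypothetical:

    void MyTargetPassConfig::addMachineSSAOptimization() {
      TargetPassConfig::addMachineSSAOptimization();
      // Runs on SSA-form machine IR; still gated by -enable-branch-coalesce.
      addPass(&llvm::BranchCoalescingID);
    }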
diff --git a/lib/CodeGen/BranchFolding.cpp b/lib/CodeGen/BranchFolding.cpp
index 6fba161033b0..2d01301402f0 100644
--- a/lib/CodeGen/BranchFolding.cpp
+++ b/lib/CodeGen/BranchFolding.cpp
@@ -32,6 +32,7 @@
#include "llvm/CodeGen/MachineRegisterInfo.h"
#include "llvm/CodeGen/Passes.h"
#include "llvm/CodeGen/TargetPassConfig.h"
+#include "llvm/IR/DebugInfoMetadata.h"
#include "llvm/IR/Function.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/Debug.h"
@@ -49,6 +50,7 @@ STATISTIC(NumDeadBlocks, "Number of dead blocks removed");
STATISTIC(NumBranchOpts, "Number of branches optimized");
STATISTIC(NumTailMerge , "Number of block tails merged");
STATISTIC(NumHoist , "Number of times common instructions are hoisted");
+STATISTIC(NumTailCalls, "Number of tail calls optimized");
static cl::opt<cl::boolOrDefault> FlagEnableTailMerge("enable-tail-merge",
cl::init(cl::BOU_UNSET), cl::Hidden);
@@ -123,8 +125,6 @@ BranchFolder::BranchFolder(bool defaultEnableTailMerge, bool CommonHoist,
}
}
-/// RemoveDeadBlock - Remove the specified dead machine basic block from the
-/// function, updating the CFG.
void BranchFolder::RemoveDeadBlock(MachineBasicBlock *MBB) {
assert(MBB->pred_empty() && "MBB must be dead!");
DEBUG(dbgs() << "\nRemoving MBB: " << *MBB);
@@ -144,9 +144,6 @@ void BranchFolder::RemoveDeadBlock(MachineBasicBlock *MBB) {
MLI->removeBlock(MBB);
}
-/// OptimizeFunction - Perhaps branch folding, tail merging and other
-/// CFG optimizations on the given function. Block placement changes the layout
-/// and may create new tail merging opportunities.
bool BranchFolder::OptimizeFunction(MachineFunction &MF,
const TargetInstrInfo *tii,
const TargetRegisterInfo *tri,
@@ -348,8 +345,6 @@ static unsigned ComputeCommonTailLength(MachineBasicBlock *MBB1,
return TailLen;
}
-/// ReplaceTailWithBranchTo - Delete the instruction OldInst and everything
-/// after it, replacing it with an unconditional branch to NewDest.
void BranchFolder::ReplaceTailWithBranchTo(MachineBasicBlock::iterator OldInst,
MachineBasicBlock *NewDest) {
TII->ReplaceTailWithBranchTo(OldInst, NewDest);
@@ -362,9 +357,6 @@ void BranchFolder::ReplaceTailWithBranchTo(MachineBasicBlock::iterator OldInst,
++NumTailMerge;
}
-/// SplitMBBAt - Given a machine basic block and an iterator into it, split the
-/// MBB so that the part before the iterator falls into the part starting at the
-/// iterator. This returns the new MBB.
MachineBasicBlock *BranchFolder::SplitMBBAt(MachineBasicBlock &CurMBB,
MachineBasicBlock::iterator BBI1,
const BasicBlock *BB) {
@@ -388,7 +380,7 @@ MachineBasicBlock *BranchFolder::SplitMBBAt(MachineBasicBlock &CurMBB,
NewMBB->splice(NewMBB->end(), &CurMBB, BBI1, CurMBB.end());
// NewMBB belongs to the same loop as CurMBB.
- if (MLI)
+ if (MLI)
if (MachineLoop *ML = MLI->getLoopFor(&CurMBB))
ML->addBasicBlockToLoop(NewMBB, MLI->getBase());
@@ -436,7 +428,7 @@ static void FixTail(MachineBasicBlock *CurMBB, MachineBasicBlock *SuccBB,
MachineFunction::iterator I = std::next(MachineFunction::iterator(CurMBB));
MachineBasicBlock *TBB = nullptr, *FBB = nullptr;
SmallVector<MachineOperand, 4> Cond;
- DebugLoc dl; // FIXME: this is nowhere
+ DebugLoc dl = CurMBB->findBranchDebugLoc();
if (I != MF->end() && !TII->analyzeBranch(*CurMBB, TBB, FBB, Cond, true)) {
MachineBasicBlock *NextBB = &*I;
if (TBB == NextBB && !Cond.empty() && !FBB) {
@@ -497,6 +489,15 @@ BranchFolder::MBFIWrapper::printBlockFreq(raw_ostream &OS,
return MBFI.printBlockFreq(OS, Freq);
}
+void BranchFolder::MBFIWrapper::view(const Twine &Name, bool isSimple) {
+ MBFI.view(Name, isSimple);
+}
+
+uint64_t BranchFolder::MBFIWrapper::getEntryFreq() const {
+ return MBFI.getEntryFreq();
+}
+
/// CountTerminators - Count the number of terminators in the given
/// block and set I to the position of the first non-terminator, if there
/// is one, or MBB->end() otherwise.
@@ -516,6 +517,17 @@ static unsigned CountTerminators(MachineBasicBlock *MBB,
return NumTerms;
}
+/// A block with no successors and no return probably ends in unreachable and
+/// is cold. Also consider a block that ends in an indirect branch to be a
+/// return block, since many targets use plain indirect branches to return.
+static bool blockEndsInUnreachable(const MachineBasicBlock *MBB) {
+ if (!MBB->succ_empty())
+ return false;
+ if (MBB->empty())
+ return true;
+ return !(MBB->back().isReturn() || MBB->back().isIndirectBranch());
+}
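The shape this targets, sketched in pseudo machine code (a call to a noreturn function leaves the block with no terminator and no successors):

    // bb.1:  ...spills...  CALL64pcrel32 @abort   ; no successors
    // bb.2:  ...spills...  CALL64pcrel32 @abort   ; identical tail
    // With I1/I2 at the block starts, the check above lets bb.2 fold into
    // bb.1 even though neither block ends in a return or indirect branch.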
+
/// ProfitableToMerge - Check if two machine basic blocks have a common tail
/// and decide if it would be profitable to merge those tails. Return the
/// length of the common tail and iterators to the first common instruction
@@ -570,6 +582,15 @@ ProfitableToMerge(MachineBasicBlock *MBB1, MachineBasicBlock *MBB2,
return true;
}
+ // If these are identical non-return blocks with no successors, merge them.
+ // Such blocks are typically cold calls to noreturn functions like abort, and
+ // are unlikely to become a fallthrough target after machine block placement.
+ // Tail merging these blocks is unlikely to create additional unconditional
+ // branches, and will reduce the size of this cold code.
+ if (I1 == MBB1->begin() && I2 == MBB2->begin() &&
+ blockEndsInUnreachable(MBB1) && blockEndsInUnreachable(MBB2))
+ return true;
+
// If one of the blocks can be completely merged and happens to be in
// a position where the other could fall through into it, merge any number
// of instructions, because it can be done without a branch.
@@ -579,6 +600,22 @@ ProfitableToMerge(MachineBasicBlock *MBB1, MachineBasicBlock *MBB2,
if (MBB2->isLayoutSuccessor(MBB1) && I1 == MBB1->begin())
return true;
+ // If both blocks are identical and end in a branch, merge them unless they
+ // both have a fallthrough predecessor and successor.
+ // We can only do this after block placement because it depends on whether
+ // there are fallthroughs, and we don't know until after layout.
+ if (AfterPlacement && I1 == MBB1->begin() && I2 == MBB2->begin()) {
+ auto BothFallThrough = [](MachineBasicBlock *MBB) {
+ if (MBB->succ_size() != 0 && !MBB->canFallThrough())
+ return false;
+ MachineFunction::iterator I(MBB);
+ MachineFunction *MF = MBB->getParent();
+ return (MBB != &*MF->begin()) && std::prev(I)->canFallThrough();
+ };
+ if (!BothFallThrough(MBB1) || !BothFallThrough(MBB2))
+ return true;
+ }
+
// If both blocks have an unconditional branch temporarily stripped out,
// count that as an additional common instruction for the following
// heuristics. This heuristic is only accurate for single-succ blocks, so to
@@ -604,16 +641,6 @@ ProfitableToMerge(MachineBasicBlock *MBB1, MachineBasicBlock *MBB2,
(I1 == MBB1->begin() || I2 == MBB2->begin());
}
-/// ComputeSameTails - Look through all the blocks in MergePotentials that have
-/// hash CurHash (guaranteed to match the last element). Build the vector
-/// SameTails of all those that have the (same) largest number of instructions
-/// in common of any pair of these blocks. SameTails entries contain an
-/// iterator into MergePotentials (from which the MachineBasicBlock can be
-/// found) and a MachineBasicBlock::iterator into that MBB indicating the
-/// instruction where the matching code sequence begins.
-/// Order of elements in SameTails is the reverse of the order in which
-/// those blocks appear in MergePotentials (where they are not necessarily
-/// consecutive).
unsigned BranchFolder::ComputeSameTails(unsigned CurHash,
unsigned MinCommonTailLength,
MachineBasicBlock *SuccBB,
@@ -650,8 +677,6 @@ unsigned BranchFolder::ComputeSameTails(unsigned CurHash,
return maxCommonTailLength;
}
-/// RemoveBlocksWithHash - Remove all blocks with hash CurHash from
-/// MergePotentials, restoring branches at ends of blocks as appropriate.
void BranchFolder::RemoveBlocksWithHash(unsigned CurHash,
MachineBasicBlock *SuccBB,
MachineBasicBlock *PredBB) {
@@ -671,8 +696,6 @@ void BranchFolder::RemoveBlocksWithHash(unsigned CurHash,
MergePotentials.erase(CurMPIter, MergePotentials.end());
}
-/// CreateCommonTailOnlyBlock - None of the blocks to be tail-merged consist
-/// only of the common tail. Create a block that does by splitting one.
bool BranchFolder::CreateCommonTailOnlyBlock(MachineBasicBlock *&PredBB,
MachineBasicBlock *SuccBB,
unsigned maxCommonTailLength,
@@ -723,6 +746,43 @@ bool BranchFolder::CreateCommonTailOnlyBlock(MachineBasicBlock *&PredBB,
return true;
}
+void BranchFolder::MergeCommonTailDebugLocs(unsigned commonTailIndex) {
+ MachineBasicBlock *MBB = SameTails[commonTailIndex].getBlock();
+
+ std::vector<MachineBasicBlock::iterator> NextCommonInsts(SameTails.size());
+ for (unsigned i = 0; i != SameTails.size(); ++i) {
+ if (i != commonTailIndex)
+ NextCommonInsts[i] = SameTails[i].getTailStartPos();
+ else {
+ assert(SameTails[i].getTailStartPos() == MBB->begin() &&
+ "MBB is not a common tail only block");
+ }
+ }
+
+ for (auto &MI : *MBB) {
+ if (MI.isDebugValue())
+ continue;
+ DebugLoc DL = MI.getDebugLoc();
+ for (unsigned int i = 0 ; i < NextCommonInsts.size() ; i++) {
+ if (i == commonTailIndex)
+ continue;
+
+ auto &Pos = NextCommonInsts[i];
+ assert(Pos != SameTails[i].getBlock()->end() &&
+ "Reached BB end within common tail");
+ while (Pos->isDebugValue()) {
+ ++Pos;
+ assert(Pos != SameTails[i].getBlock()->end() &&
+ "Reached BB end within common tail");
+ }
+ assert(MI.isIdenticalTo(*Pos) && "Expected matching MIIs!");
+ DL = DILocation::getMergedLocation(DL, Pos->getDebugLoc());
+ NextCommonInsts[i] = ++Pos;
+ }
+ MI.setDebugLoc(DL);
+ }
+}
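The practical effect, sketched: rather than wiping locations in the kept tail, each instruction's location is merged with its counterparts, and DILocation::getMergedLocation keeps a location the copies agree on while differing locations collapse to an artificial line-0 location (as implemented around this time; the exact merge policy lives in DILocation):

    // kept tail:   MOV32mr ... !dbg (line 10)
    // other copy:  MOV32mr ... !dbg (line 10)  -> merged: line 10 kept
    // kept tail:   ADD32ri ... !dbg (line 11)
    // other copy:  ADD32ri ... !dbg (line 42)  -> merged: line 0 (artificial)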
+
static void
mergeOperations(MachineBasicBlock::iterator MBBIStartPos,
MachineBasicBlock &MBBCommon) {
@@ -875,10 +935,8 @@ bool BranchFolder::TryTailMergeBlocks(MachineBasicBlock *SuccBB,
// Recompute common tail MBB's edge weights and block frequency.
setCommonTailEdgeWeights(*MBB);
- // Remove the original debug location from the common tail.
- for (auto &MI : *MBB)
- if (!MI.isDebugValue())
- MI.setDebugLoc(DebugLoc());
+ // Merge debug locations across identical instructions for common tail.
+ MergeCommonTailDebugLocs(commonTailIndex);
// MBB is common tail. Adjust all other BB's to jump to this one.
// Traversal must be forwards so erases work.
@@ -1043,7 +1101,7 @@ bool BranchFolder::TailMergeBlocks(MachineFunction &MF) {
// Remove the unconditional branch at the end, if any.
if (TBB && (Cond.empty() || FBB)) {
- DebugLoc dl; // FIXME: this is nowhere
+ DebugLoc dl = PBB->findBranchDebugLoc();
TII->removeBranch(*PBB);
if (!Cond.empty())
// reinsert conditional branch only, for now
@@ -1193,8 +1251,6 @@ static DebugLoc getBranchDebugLoc(MachineBasicBlock &MBB) {
return DebugLoc();
}
-/// OptimizeBlock - Analyze and optimize control flow related to the specified
-/// block. This is never called on the entry block.
bool BranchFolder::OptimizeBlock(MachineBasicBlock *MBB) {
bool MadeChange = false;
MachineFunction &MF = *MBB->getParent();
@@ -1386,6 +1442,42 @@ ReoptimizeBlock:
}
}
+ if (!IsEmptyBlock(MBB) && MBB->pred_size() == 1 &&
+ MF.getFunction()->optForSize()) {
+ // Changing "Jcc foo; foo: jmp bar;" into "Jcc bar;" might change the branch
+ // direction, thereby defeating careful block placement and regressing
+ // performance. Therefore, only consider this for optsize functions.
+ MachineInstr &TailCall = *MBB->getFirstNonDebugInstr();
+ if (TII->isUnconditionalTailCall(TailCall)) {
+ MachineBasicBlock *Pred = *MBB->pred_begin();
+ MachineBasicBlock *PredTBB = nullptr, *PredFBB = nullptr;
+ SmallVector<MachineOperand, 4> PredCond;
+ bool PredAnalyzable =
+ !TII->analyzeBranch(*Pred, PredTBB, PredFBB, PredCond, true);
+
+ if (PredAnalyzable && !PredCond.empty() && PredTBB == MBB) {
+ // The predecessor has a conditional branch to this block which consists
+ // of only a tail call. Try to fold the tail call into the conditional
+ // branch.
+ if (TII->canMakeTailCallConditional(PredCond, TailCall)) {
+ // TODO: It would be nice if analyzeBranch() could provide a pointer
+ // to the branch instruction so replaceBranchWithTailCall() doesn't
+ // have to search for it.
+ TII->replaceBranchWithTailCall(*Pred, PredCond, TailCall);
+ ++NumTailCalls;
+ Pred->removeSuccessor(MBB);
+ MadeChange = true;
+ return MadeChange;
+ }
+ }
+ // If the predecessor is falling through to this block, we could reverse
+ // the branch condition and fold the tail call into that. However, after
+ // that we might have to re-arrange the CFG to fall through to the other
+ // block and there is a high risk of regressing code size rather than
+ // improving it.
+ }
+ }
+
// Analyze the branch in the current block.
MachineBasicBlock *CurTBB = nullptr, *CurFBB = nullptr;
SmallVector<MachineOperand, 4> CurCond;
@@ -1599,8 +1691,6 @@ ReoptimizeBlock:
// Hoist Common Code
//===----------------------------------------------------------------------===//
-/// HoistCommonCode - Hoist common instruction sequences at the start of basic
-/// blocks to their common predecessor.
bool BranchFolder::HoistCommonCode(MachineFunction &MF) {
bool MadeChange = false;
for (MachineFunction::iterator I = MF.begin(), E = MF.end(); I != E; ) {
@@ -1734,9 +1824,6 @@ MachineBasicBlock::iterator findHoistingInsertPosAndDeps(MachineBasicBlock *MBB,
return PI;
}
-/// HoistCommonCodeInSuccs - If the successors of MBB has common instruction
-/// sequence at the start of the function, move the instructions before MBB
-/// terminator if it's legal.
bool BranchFolder::HoistCommonCodeInSuccs(MachineBasicBlock *MBB) {
MachineBasicBlock *TBB = nullptr, *FBB = nullptr;
SmallVector<MachineOperand, 4> Cond;
diff --git a/lib/CodeGen/BranchFolding.h b/lib/CodeGen/BranchFolding.h
index fc48e484292d..4852721eea10 100644
--- a/lib/CodeGen/BranchFolding.h
+++ b/lib/CodeGen/BranchFolding.h
@@ -37,6 +37,9 @@ namespace llvm {
// flag. Ignored for optsize.
unsigned MinCommonTailLength = 0);
+ /// Perform branch folding, tail merging and other CFG optimizations on the
+ /// given function. Block placement changes the layout and may create new
+ /// tail merging opportunities.
bool OptimizeFunction(MachineFunction &MF, const TargetInstrInfo *tii,
const TargetRegisterInfo *tri, MachineModuleInfo *mmi,
MachineLoopInfo *mli = nullptr,
@@ -122,6 +125,8 @@ namespace llvm {
const MachineBasicBlock *MBB) const;
raw_ostream &printBlockFreq(raw_ostream &OS,
const BlockFrequency Freq) const;
+ void view(const Twine &Name, bool isSimple = true);
+ uint64_t getEntryFreq() const;
private:
const MachineBlockFrequencyInfo &MBFI;
@@ -137,26 +142,64 @@ namespace llvm {
MachineBasicBlock* PredBB,
unsigned MinCommonTailLength);
void setCommonTailEdgeWeights(MachineBasicBlock &TailMBB);
+
+ /// Delete the instruction OldInst and everything after it, replacing it
+ /// with an unconditional branch to NewDest.
void ReplaceTailWithBranchTo(MachineBasicBlock::iterator OldInst,
MachineBasicBlock *NewDest);
+
+ /// Given a machine basic block and an iterator into it, split the MBB so
+ /// that the part before the iterator falls through into the part starting
+ /// at the iterator. This returns the new MBB.
MachineBasicBlock *SplitMBBAt(MachineBasicBlock &CurMBB,
MachineBasicBlock::iterator BBI1,
const BasicBlock *BB);
+
+ /// Look through all the blocks in MergePotentials that have hash CurHash
+ /// (guaranteed to match the last element). Build the vector SameTails of
+ /// all those that share the (same) largest number of trailing instructions
+ /// with another block in the group. SameTails entries contain an iterator into
+ /// MergePotentials (from which the MachineBasicBlock can be found) and a
+ /// MachineBasicBlock::iterator into that MBB indicating the instruction
+ /// where the matching code sequence begins. Order of elements in SameTails
+ /// is the reverse of the order in which those blocks appear in
+ /// MergePotentials (where they are not necessarily consecutive).
unsigned ComputeSameTails(unsigned CurHash, unsigned minCommonTailLength,
MachineBasicBlock *SuccBB,
MachineBasicBlock *PredBB);
+
+ /// Remove all blocks with hash CurHash from MergePotentials, restoring
+ /// branches at ends of blocks as appropriate.
void RemoveBlocksWithHash(unsigned CurHash, MachineBasicBlock* SuccBB,
MachineBasicBlock* PredBB);
+
+ /// None of the blocks to be tail-merged consist only of the common tail.
+ /// Create a block that does by splitting one.
bool CreateCommonTailOnlyBlock(MachineBasicBlock *&PredBB,
MachineBasicBlock *SuccBB,
unsigned maxCommonTailLength,
unsigned &commonTailIndex);
+
+ /// Create merged DebugLocs for the identical instructions across SameTails
+ /// and assign them to the instructions in the common tail.
+ void MergeCommonTailDebugLocs(unsigned commonTailIndex);
+
bool OptimizeBranches(MachineFunction &MF);
+
+ /// Analyze and optimize control flow related to the specified block. This
+ /// is never called on the entry block.
bool OptimizeBlock(MachineBasicBlock *MBB);
+
+ /// Remove the specified dead machine basic block from the function,
+ /// updating the CFG.
void RemoveDeadBlock(MachineBasicBlock *MBB);
+
+ /// Hoist common instruction sequences at the start of basic blocks to their
+ /// common predecessor.
bool HoistCommonCode(MachineFunction &MF);
+
+ /// If the successors of MBB have a common instruction sequence at the start
+ /// of their blocks, move these instructions before the MBB terminator if legal.
bool HoistCommonCodeInSuccs(MachineBasicBlock *MBB);
};
}
diff --git a/lib/CodeGen/BranchRelaxation.cpp b/lib/CodeGen/BranchRelaxation.cpp
index 8b27570a17f4..7af136941661 100644
--- a/lib/CodeGen/BranchRelaxation.cpp
+++ b/lib/CodeGen/BranchRelaxation.cpp
@@ -126,14 +126,16 @@ void BranchRelaxation::verify() {
#endif
}
+#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
/// print block size and offset information - debugging
-void BranchRelaxation::dumpBBs() {
+LLVM_DUMP_METHOD void BranchRelaxation::dumpBBs() {
for (auto &MBB : *MF) {
const BasicBlockInfo &BBI = BlockInfo[MBB.getNumber()];
dbgs() << format("BB#%u\toffset=%08x\t", MBB.getNumber(), BBI.Offset)
<< format("size=%#x\n", BBI.Size);
}
}
+#endif
/// scanFunction - Do the initial scan of the function, building up
/// information about each block.
diff --git a/lib/CodeGen/BuiltinGCs.cpp b/lib/CodeGen/BuiltinGCs.cpp
index ff7c99de0420..e4eab8c513d9 100644
--- a/lib/CodeGen/BuiltinGCs.cpp
+++ b/lib/CodeGen/BuiltinGCs.cpp
@@ -1,4 +1,4 @@
-//===-- BuiltinGCs.cpp - Boilerplate for our built in GC types --*- C++ -*-===//
+//===- BuiltinGCs.cpp - Boilerplate for our built in GC types -------------===//
//
// The LLVM Compiler Infrastructure
//
@@ -14,6 +14,8 @@
#include "llvm/CodeGen/GCs.h"
#include "llvm/CodeGen/GCStrategy.h"
+#include "llvm/IR/DerivedTypes.h"
+#include "llvm/Support/Casting.h"
using namespace llvm;
@@ -77,6 +79,7 @@ public:
UsesMetadata = false;
CustomRoots = false;
}
+
Optional<bool> isGCManagedPointer(const Type *Ty) const override {
// Method is only valid on pointer typed values.
const PointerType *PT = cast<PointerType>(Ty);
@@ -110,6 +113,7 @@ public:
UsesMetadata = false;
CustomRoots = false;
}
+
Optional<bool> isGCManagedPointer(const Type *Ty) const override {
// Method is only valid on pointer typed values.
const PointerType *PT = cast<PointerType>(Ty);
@@ -117,7 +121,8 @@ public:
return (1 == PT->getAddressSpace());
}
};
-}
+
+} // end anonymous namespace
// Register all the above so that they can be found at runtime. Note that
// these static initializers are important since the registration list is
diff --git a/lib/CodeGen/CMakeLists.txt b/lib/CodeGen/CMakeLists.txt
index 398ea88363b6..0912d9f68aff 100644
--- a/lib/CodeGen/CMakeLists.txt
+++ b/lib/CodeGen/CMakeLists.txt
@@ -4,6 +4,7 @@ add_llvm_library(LLVMCodeGen
Analysis.cpp
AtomicExpandPass.cpp
BasicTargetTransformInfo.cpp
+ BranchCoalescing.cpp
BranchFolding.cpp
BranchRelaxation.cpp
BuiltinGCs.cpp
@@ -23,6 +24,7 @@ add_llvm_library(LLVMCodeGen
ExpandISelPseudos.cpp
ExpandPostRAPseudos.cpp
FaultMaps.cpp
+ FEntryInserter.cpp
FuncletLayout.cpp
GCMetadata.cpp
GCMetadataPrinter.cpp
@@ -36,6 +38,7 @@ add_llvm_library(LLVMCodeGen
InterleavedAccessPass.cpp
IntrinsicLowering.cpp
LatencyPriorityQueue.cpp
+ LazyMachineBlockFrequencyInfo.cpp
LexicalScopes.cpp
LiveDebugValues.cpp
LiveDebugVariables.cpp
@@ -46,6 +49,7 @@ add_llvm_library(LLVMCodeGen
LiveRangeCalc.cpp
LiveRangeEdit.cpp
LiveRegMatrix.cpp
+ LiveRegUnits.cpp
LiveStackAnalysis.cpp
LiveVariables.cpp
LLVMTargetMachine.cpp
@@ -70,6 +74,8 @@ add_llvm_library(LLVMCodeGen
MachineLoopInfo.cpp
MachineModuleInfo.cpp
MachineModuleInfoImpls.cpp
+ MachineOptimizationRemarkEmitter.cpp
+ MachineOutliner.cpp
MachinePassRegistry.cpp
MachinePipeliner.cpp
MachinePostDominators.cpp
@@ -147,7 +153,7 @@ add_llvm_library(LLVMCodeGen
${LLVM_MAIN_INCLUDE_DIR}/llvm/CodeGen
${LLVM_MAIN_INCLUDE_DIR}/llvm/CodeGen/PBQP
- LINK_LIBS ${PTHREAD_LIB}
+ LINK_LIBS ${LLVM_PTHREAD_LIB}
DEPENDS
intrinsics_gen
diff --git a/lib/CodeGen/CallingConvLower.cpp b/lib/CodeGen/CallingConvLower.cpp
index 2e33f14c7ee3..7cad4d031169 100644
--- a/lib/CodeGen/CallingConvLower.cpp
+++ b/lib/CodeGen/CallingConvLower.cpp
@@ -30,8 +30,7 @@ using namespace llvm;
CCState::CCState(CallingConv::ID CC, bool isVarArg, MachineFunction &mf,
SmallVectorImpl<CCValAssign> &locs, LLVMContext &C)
: CallingConv(CC), IsVarArg(isVarArg), MF(mf),
- TRI(*MF.getSubtarget().getRegisterInfo()), Locs(locs), Context(C),
- CallOrPrologue(Unknown) {
+ TRI(*MF.getSubtarget().getRegisterInfo()), Locs(locs), Context(C) {
// No stack is used.
StackOffset = 0;
MaxStackArgAlign = 1;
diff --git a/lib/CodeGen/CodeGen.cpp b/lib/CodeGen/CodeGen.cpp
index 4cf9b138f10d..3fc12ccc3b60 100644
--- a/lib/CodeGen/CodeGen.cpp
+++ b/lib/CodeGen/CodeGen.cpp
@@ -21,6 +21,7 @@ using namespace llvm;
/// initializeCodeGen - Initialize all passes linked into the CodeGen library.
void llvm::initializeCodeGen(PassRegistry &Registry) {
initializeAtomicExpandPass(Registry);
+ initializeBranchCoalescingPass(Registry);
initializeBranchFolderPassPass(Registry);
initializeBranchRelaxationPass(Registry);
initializeCodeGenPreparePass(Registry);
@@ -31,12 +32,15 @@ void llvm::initializeCodeGen(PassRegistry &Registry) {
initializeEarlyIfConverterPass(Registry);
initializeExpandISelPseudosPass(Registry);
initializeExpandPostRAPass(Registry);
+ initializeFEntryInserterPass(Registry);
initializeFinalizeMachineBundlesPass(Registry);
initializeFuncletLayoutPass(Registry);
initializeGCMachineCodeAnalysisPass(Registry);
initializeGCModuleInfoPass(Registry);
initializeIfConverterPass(Registry);
+ initializeImplicitNullChecksPass(Registry);
initializeInterleavedAccessPass(Registry);
+ initializeLiveDebugValuesPass(Registry);
initializeLiveDebugVariablesPass(Registry);
initializeLiveIntervalsPass(Registry);
initializeLiveStacksPass(Registry);
@@ -47,7 +51,6 @@ void llvm::initializeCodeGen(PassRegistry &Registry) {
initializeMachineBlockPlacementPass(Registry);
initializeMachineBlockPlacementStatsPass(Registry);
initializeMachineCSEPass(Registry);
- initializeImplicitNullChecksPass(Registry);
initializeMachineCombinerPass(Registry);
initializeMachineCopyPropagationPass(Registry);
initializeMachineDominatorTreePass(Registry);
@@ -55,16 +58,18 @@ void llvm::initializeCodeGen(PassRegistry &Registry) {
initializeMachineLICMPass(Registry);
initializeMachineLoopInfoPass(Registry);
initializeMachineModuleInfoPass(Registry);
+ initializeMachineOptimizationRemarkEmitterPassPass(Registry);
+ initializeMachineOutlinerPass(Registry);
initializeMachinePipelinerPass(Registry);
initializeMachinePostDominatorTreePass(Registry);
+ initializeMachineRegionInfoPassPass(Registry);
initializeMachineSchedulerPass(Registry);
initializeMachineSinkingPass(Registry);
initializeMachineVerifierPassPass(Registry);
- initializeXRayInstrumentationPass(Registry);
- initializePatchableFunctionPass(Registry);
initializeOptimizePHIsPass(Registry);
initializePEIPass(Registry);
initializePHIEliminationPass(Registry);
+ initializePatchableFunctionPass(Registry);
initializePeepholeOptimizerPass(Registry);
initializePostMachineSchedulerPass(Registry);
initializePostRAHazardRecognizerPass(Registry);
@@ -74,12 +79,11 @@ void llvm::initializeCodeGen(PassRegistry &Registry) {
initializeRAGreedyPass(Registry);
initializeRegisterCoalescerPass(Registry);
initializeRenameIndependentSubregsPass(Registry);
+ initializeSafeStackPass(Registry);
initializeShrinkWrapPass(Registry);
initializeSlotIndexesPass(Registry);
initializeStackColoringPass(Registry);
initializeStackMapLivenessPass(Registry);
- initializeLiveDebugValuesPass(Registry);
- initializeSafeStackPass(Registry);
initializeStackProtectorPass(Registry);
initializeStackSlotColoringPass(Registry);
initializeTailDuplicatePassPass(Registry);
@@ -91,6 +95,7 @@ void llvm::initializeCodeGen(PassRegistry &Registry) {
initializeVirtRegMapPass(Registry);
initializeVirtRegRewriterPass(Registry);
initializeWinEHPreparePass(Registry);
+ initializeXRayInstrumentationPass(Registry);
}
void LLVMInitializeCodeGen(LLVMPassRegistryRef R) {
diff --git a/lib/CodeGen/CodeGenPrepare.cpp b/lib/CodeGen/CodeGenPrepare.cpp
index 934b470f13b5..2bdd189557b4 100644
--- a/lib/CodeGen/CodeGenPrepare.cpp
+++ b/lib/CodeGen/CodeGenPrepare.cpp
@@ -15,10 +15,12 @@
#include "llvm/CodeGen/Passes.h"
#include "llvm/ADT/DenseMap.h"
+#include "llvm/ADT/SetVector.h"
#include "llvm/ADT/SmallSet.h"
#include "llvm/ADT/Statistic.h"
#include "llvm/Analysis/BlockFrequencyInfo.h"
#include "llvm/Analysis/BranchProbabilityInfo.h"
+#include "llvm/Analysis/CFG.h"
#include "llvm/Analysis/InstructionSimplify.h"
#include "llvm/Analysis/LoopInfo.h"
#include "llvm/Analysis/ProfileSummaryInfo.h"
@@ -53,8 +55,10 @@
#include "llvm/Transforms/Utils/BasicBlockUtils.h"
#include "llvm/Transforms/Utils/BuildLibCalls.h"
#include "llvm/Transforms/Utils/BypassSlowDivision.h"
+#include "llvm/Transforms/Utils/Cloning.h"
#include "llvm/Transforms/Utils/Local.h"
#include "llvm/Transforms/Utils/SimplifyLibCalls.h"
+#include "llvm/Transforms/Utils/ValueMapper.h"
using namespace llvm;
using namespace llvm::PatternMatch;
@@ -77,7 +81,6 @@ STATISTIC(NumAndUses, "Number of uses of and mask instructions optimized");
STATISTIC(NumRetsDup, "Number of return instructions duplicated");
STATISTIC(NumDbgValueMoved, "Number of debug value instructions moved");
STATISTIC(NumSelectsExpanded, "Number of selects turned into branches");
-STATISTIC(NumAndCmpsMoved, "Number of and/cmp's pushed into branches");
STATISTIC(NumStoreExtractExposed, "Number of store(extractelement) exposed");
static cl::opt<bool> DisableBranchOpts(
@@ -93,7 +96,7 @@ static cl::opt<bool> DisableSelectToBranch(
cl::desc("Disable select to branch conversion."));
static cl::opt<bool> AddrSinkUsingGEPs(
- "addr-sink-using-gep", cl::Hidden, cl::init(false),
+ "addr-sink-using-gep", cl::Hidden, cl::init(true),
cl::desc("Address sinking in CGP using GEPs."));
static cl::opt<bool> EnableAndCmpSinking(
@@ -135,15 +138,24 @@ static cl::opt<bool> ForceSplitStore(
"force-split-store", cl::Hidden, cl::init(false),
cl::desc("Force store splitting no matter what the target query says."));
+static cl::opt<bool>
+EnableTypePromotionMerge("cgp-type-promotion-merge", cl::Hidden,
+ cl::desc("Enable merging of redundant sexts when one is dominating"
+ " the other."), cl::init(true));
+
namespace {
typedef SmallPtrSet<Instruction *, 16> SetOfInstrs;
typedef PointerIntPair<Type *, 1, bool> TypeIsSExt;
typedef DenseMap<Instruction *, TypeIsSExt> InstrToOrigTy;
+typedef SmallVector<Instruction *, 16> SExts;
+typedef DenseMap<Value *, SExts> ValueToSExts;
class TypePromotionTransaction;
class CodeGenPrepare : public FunctionPass {
const TargetMachine *TM;
+ const TargetSubtargetInfo *SubtargetInfo;
const TargetLowering *TLI;
+ const TargetRegisterInfo *TRI;
const TargetTransformInfo *TTI;
const TargetLibraryInfo *TLInfo;
const LoopInfo *LI;
@@ -165,6 +177,15 @@ class TypePromotionTransaction;
/// promotion for the current function.
InstrToOrigTy PromotedInsts;
+ /// Keep track of instructions removed during promotion.
+ SetOfInstrs RemovedInsts;
+
+ /// Keep track of sext chains based on their initial value.
+ DenseMap<Value *, Instruction *> SeenChainsForSExt;
+
+ /// Keep track of SExt promoted.
+ ValueToSExts ValToSExtendedUses;
+
/// True if CFG is modified in any way.
bool ModifiedDT;
@@ -206,7 +227,7 @@ class TypePromotionTransaction;
Type *AccessTy, unsigned AS);
bool optimizeInlineAsmInst(CallInst *CS);
bool optimizeCallInst(CallInst *CI, bool& ModifiedDT);
- bool moveExtToFormExtLoad(Instruction *&I);
+ bool optimizeExt(Instruction *&I);
bool optimizeExtUses(Instruction *I);
bool optimizeLoadExt(LoadInst *I);
bool optimizeSelectInst(SelectInst *SI);
@@ -215,13 +236,21 @@ class TypePromotionTransaction;
bool optimizeExtractElementInst(Instruction *Inst);
bool dupRetToEnableTailCallOpts(BasicBlock *BB);
bool placeDbgValues(Function &F);
- bool sinkAndCmp(Function &F);
- bool extLdPromotion(TypePromotionTransaction &TPT, LoadInst *&LI,
- Instruction *&Inst,
- const SmallVectorImpl<Instruction *> &Exts,
- unsigned CreatedInstCost);
+ bool canFormExtLd(const SmallVectorImpl<Instruction *> &MovedExts,
+ LoadInst *&LI, Instruction *&Inst, bool HasPromoted);
+ bool tryToPromoteExts(TypePromotionTransaction &TPT,
+ const SmallVectorImpl<Instruction *> &Exts,
+ SmallVectorImpl<Instruction *> &ProfitablyMovedExts,
+ unsigned CreatedInstsCost = 0);
+ bool mergeSExts(Function &F);
+ bool performAddressTypePromotion(
+ Instruction *&Inst,
+ bool AllowPromotionWithoutCommonHeader,
+ bool HasPromoted, TypePromotionTransaction &TPT,
+ SmallVectorImpl<Instruction *> &SpeculativelyMovedExts);
bool splitBranchCondition(Function &F);
bool simplifyOffsetableRelocate(Instruction &I);
+ bool splitIndirectCriticalEdges(Function &F);
};
}
@@ -250,8 +279,11 @@ bool CodeGenPrepare::runOnFunction(Function &F) {
BPI.reset();
ModifiedDT = false;
- if (TM)
- TLI = TM->getSubtargetImpl(F)->getTargetLowering();
+ if (TM) {
+ SubtargetInfo = TM->getSubtargetImpl(F);
+ TLI = SubtargetInfo->getTargetLowering();
+ TRI = SubtargetInfo->getRegisterInfo();
+ }
TLInfo = &getAnalysis<TargetLibraryInfoWrapperPass>().getTLI();
TTI = &getAnalysis<TargetTransformInfoWrapperPass>().getTTI(F);
LI = &getAnalysis<LoopInfoWrapperPass>().getLoopInfo();
@@ -260,9 +292,9 @@ bool CodeGenPrepare::runOnFunction(Function &F) {
if (ProfileGuidedSectionPrefix) {
ProfileSummaryInfo *PSI =
getAnalysis<ProfileSummaryInfoWrapperPass>().getPSI();
- if (PSI->isFunctionEntryHot(&F))
+ if (PSI->isFunctionHotInCallGraph(&F))
F.setSectionPrefix(".hot");
- else if (PSI->isFunctionEntryCold(&F))
+ else if (PSI->isFunctionColdInCallGraph(&F))
F.setSectionPrefix(".cold");
}
@@ -290,18 +322,19 @@ bool CodeGenPrepare::runOnFunction(Function &F) {
// find a node corresponding to the value.
EverMadeChange |= placeDbgValues(F);
- // If there is a mask, compare against zero, and branch that can be combined
- // into a single target instruction, push the mask and compare into branch
- // users. Do this before OptimizeBlock -> OptimizeInst ->
- // OptimizeCmpExpression, which perturbs the pattern being searched for.
- if (!DisableBranchOpts) {
- EverMadeChange |= sinkAndCmp(F);
+ if (!DisableBranchOpts)
EverMadeChange |= splitBranchCondition(F);
- }
+
+ // Split some critical edges where one of the sources is an indirect branch,
+ // to help generate sane code for PHIs involving such edges.
+ EverMadeChange |= splitIndirectCriticalEdges(F);
bool MadeChange = true;
while (MadeChange) {
MadeChange = false;
+ SeenChainsForSExt.clear();
+ ValToSExtendedUses.clear();
+ RemovedInsts.clear();
for (Function::iterator I = F.begin(); I != F.end(); ) {
BasicBlock *BB = &*I++;
bool ModifiedDTOnIteration = false;
@@ -311,6 +344,13 @@ bool CodeGenPrepare::runOnFunction(Function &F) {
if (ModifiedDTOnIteration)
break;
}
+ if (EnableTypePromotionMerge && !ValToSExtendedUses.empty())
+ MadeChange |= mergeSExts(F);
+
+ // Really free removed instructions during promotion.
+ for (Instruction *I : RemovedInsts)
+ delete I;
+
EverMadeChange |= MadeChange;
}
@@ -432,6 +472,154 @@ BasicBlock *CodeGenPrepare::findDestBlockOfMergeableEmptyBlock(BasicBlock *BB) {
return DestBB;
}
+// Return the unique indirectbr predecessor of a block. This may return null
+// even if such a predecessor exists, if it's not useful for splitting.
+// If a predecessor is found, OtherPreds will contain all other (non-indirectbr)
+// predecessors of BB.
+static BasicBlock *
+findIBRPredecessor(BasicBlock *BB, SmallVectorImpl<BasicBlock *> &OtherPreds) {
+ // If the block doesn't have any PHIs, we don't care about it, since there's
+ // no point in splitting it.
+ PHINode *PN = dyn_cast<PHINode>(BB->begin());
+ if (!PN)
+ return nullptr;
+
+ // Verify we have exactly one IBR predecessor.
+ // Conservatively bail out if one of the other predecessors is not a "regular"
+ // terminator (that is, not a switch or a br).
+ BasicBlock *IBB = nullptr;
+ for (unsigned Pred = 0, E = PN->getNumIncomingValues(); Pred != E; ++Pred) {
+ BasicBlock *PredBB = PN->getIncomingBlock(Pred);
+ TerminatorInst *PredTerm = PredBB->getTerminator();
+ switch (PredTerm->getOpcode()) {
+ case Instruction::IndirectBr:
+ if (IBB)
+ return nullptr;
+ IBB = PredBB;
+ break;
+ case Instruction::Br:
+ case Instruction::Switch:
+ OtherPreds.push_back(PredBB);
+ continue;
+ default:
+ return nullptr;
+ }
+ }
+
+ return IBB;
+}
+
+// Split critical edges where the source of the edge is an indirectbr
+// instruction. This isn't always possible, but we can handle some easy cases.
+// This is useful because MI is unable to split such critical edges,
+// which means it will not be able to sink instructions along those edges.
+// This is especially painful for indirect branches with many successors, where
+// we end up having to prepare all outgoing values in the origin block.
+//
+// Our normal algorithm for splitting critical edges requires us to update
+// the outgoing edges of the edge origin block, but for an indirectbr this
+// is hard, since it would require finding and updating the block addresses
+// the indirect branch uses. But if a block only has a single indirectbr
+// predecessor, with the others being regular branches, we can do it in a
+// different way.
+// Say we have A -> D, B -> D, I -> D where only I -> D is an indirectbr.
+// We can split D into D0 and D1, where D0 contains only the PHIs from D,
+// and D1 is the D block body. We can then duplicate D0 as D0A and D0B, and
+// create the following structure:
+// A -> D0A, B -> D0A, I -> D0B, D0A -> D1, D0B -> D1
+bool CodeGenPrepare::splitIndirectCriticalEdges(Function &F) {
+ // Check whether the function has any indirectbrs, and collect which blocks
+ // they may jump to. Since most functions don't have indirect branches,
+ // this lowers the common case's overhead to O(Blocks) instead of O(Edges).
+ SmallSetVector<BasicBlock *, 16> Targets;
+ for (auto &BB : F) {
+ auto *IBI = dyn_cast<IndirectBrInst>(BB.getTerminator());
+ if (!IBI)
+ continue;
+
+ for (unsigned Succ = 0, E = IBI->getNumSuccessors(); Succ != E; ++Succ)
+ Targets.insert(IBI->getSuccessor(Succ));
+ }
+
+ if (Targets.empty())
+ return false;
+
+ bool Changed = false;
+ for (BasicBlock *Target : Targets) {
+ SmallVector<BasicBlock *, 16> OtherPreds;
+ BasicBlock *IBRPred = findIBRPredecessor(Target, OtherPreds);
+ // If we did not find an indirectbr, or the indirectbr is the only
+ // incoming edge, this isn't the kind of edge we're looking for.
+ if (!IBRPred || OtherPreds.empty())
+ continue;
+
+ // Don't even think about ehpads/landingpads.
+ Instruction *FirstNonPHI = Target->getFirstNonPHI();
+ if (FirstNonPHI->isEHPad() || Target->isLandingPad())
+ continue;
+
+ BasicBlock *BodyBlock = Target->splitBasicBlock(FirstNonPHI, ".split");
+ // It's possible Target was its own successor through an indirectbr.
+ // In this case, the indirectbr now comes from BodyBlock.
+ if (IBRPred == Target)
+ IBRPred = BodyBlock;
+
+ // At this point Target only has PHIs, and BodyBlock has the rest of the
+ // block's body. Create a copy of Target that will be used by the "direct"
+ // preds.
+ ValueToValueMapTy VMap;
+ BasicBlock *DirectSucc = CloneBasicBlock(Target, VMap, ".clone", &F);
+
+ for (BasicBlock *Pred : OtherPreds)
+ Pred->getTerminator()->replaceUsesOfWith(Target, DirectSucc);
+
+ // Ok, now fix up the PHIs. We know the two blocks only have PHIs, and that
+ // they are clones, so the number of PHIs is the same.
+ // (a) Remove the edge coming from IBRPred from the "Direct" PHI
+ // (b) Leave that as the only edge in the "Indirect" PHI.
+ // (c) Merge the two in the body block.
+ BasicBlock::iterator Indirect = Target->begin(),
+ End = Target->getFirstNonPHI()->getIterator();
+ BasicBlock::iterator Direct = DirectSucc->begin();
+ BasicBlock::iterator MergeInsert = BodyBlock->getFirstInsertionPt();
+
+ assert(&*End == Target->getTerminator() &&
+ "Block was expected to only contain PHIs");
+
+ while (Indirect != End) {
+ PHINode *DirPHI = cast<PHINode>(Direct);
+ PHINode *IndPHI = cast<PHINode>(Indirect);
+
+ // Now, clean up - the direct block shouldn't get the indirect value,
+ // and vice versa.
+ DirPHI->removeIncomingValue(IBRPred);
+ Direct++;
+
+ // Advance the pointer here, to avoid invalidation issues when the old
+ // PHI is erased.
+ Indirect++;
+
+ PHINode *NewIndPHI = PHINode::Create(IndPHI->getType(), 1, "ind", IndPHI);
+ NewIndPHI->addIncoming(IndPHI->getIncomingValueForBlock(IBRPred),
+ IBRPred);
+
+ // Create a PHI in the body block, to merge the direct and indirect
+ // predecessors.
+ PHINode *MergePHI =
+ PHINode::Create(IndPHI->getType(), 2, "merge", &*MergeInsert);
+ MergePHI->addIncoming(NewIndPHI, Target);
+ MergePHI->addIncoming(DirPHI, DirectSucc);
+
+ IndPHI->replaceAllUsesWith(MergePHI);
+ IndPHI->eraseFromParent();
+ }
+
+ Changed = true;
+ }
+
+ return Changed;
+}
+
/// Eliminate blocks that contain only PHI nodes, debug info directives, and an
/// unconditional branch. Passes before isel (e.g. LSR/loopsimplify) often split
/// edges in ways that are non-optimal for isel. Start by eliminating these
@@ -1090,6 +1278,83 @@ static bool OptimizeCmpExpression(CmpInst *CI, const TargetLowering *TLI) {
return false;
}
+/// Duplicate and sink the given 'and' instruction into user blocks where it is
+/// used in a compare to allow isel to generate better code for targets where
+/// this operation can be combined.
+///
+/// Return true if any changes are made.
+static bool sinkAndCmp0Expression(Instruction *AndI,
+ const TargetLowering &TLI,
+ SetOfInstrs &InsertedInsts) {
+ // Double-check that we're not trying to optimize an instruction that was
+ // already optimized by some other part of this pass.
+ assert(!InsertedInsts.count(AndI) &&
+ "Attempting to optimize already optimized and instruction");
+ (void) InsertedInsts;
+
+ // Nothing to do for single use in same basic block.
+ if (AndI->hasOneUse() &&
+ AndI->getParent() == cast<Instruction>(*AndI->user_begin())->getParent())
+ return false;
+
+ // Try to avoid cases where sinking/duplicating is likely to increase register
+ // pressure.
+ if (!isa<ConstantInt>(AndI->getOperand(0)) &&
+ !isa<ConstantInt>(AndI->getOperand(1)) &&
+ AndI->getOperand(0)->hasOneUse() && AndI->getOperand(1)->hasOneUse())
+ return false;
+
+ for (auto *U : AndI->users()) {
+ Instruction *User = cast<Instruction>(U);
+
+ // Only sink 'and' masks that feed an icmp with 0.
+ if (!isa<ICmpInst>(User))
+ return false;
+
+ auto *CmpC = dyn_cast<ConstantInt>(User->getOperand(1));
+ if (!CmpC || !CmpC->isZero())
+ return false;
+ }
+
+ if (!TLI.isMaskAndCmp0FoldingBeneficial(*AndI))
+ return false;
+
+ DEBUG(dbgs() << "found 'and' feeding only icmp 0;\n");
+ DEBUG(AndI->getParent()->dump());
+
+ // Push the 'and' into the same block as the icmp 0. There should only be
+ // one (icmp (and, 0)) in each block, since CSE/GVN should have removed any
+ // others, so we don't need to keep track of which BBs we insert into.
+ for (Value::user_iterator UI = AndI->user_begin(), E = AndI->user_end();
+ UI != E; ) {
+ Use &TheUse = UI.getUse();
+ Instruction *User = cast<Instruction>(*UI);
+
+ // Preincrement use iterator so we don't invalidate it.
+ ++UI;
+
+ DEBUG(dbgs() << "sinking 'and' use: " << *User << "\n");
+
+ // Keep the 'and' in the same place if the use is already in the same block.
+ Instruction *InsertPt =
+ User->getParent() == AndI->getParent() ? AndI : User;
+ Instruction *InsertedAnd =
+ BinaryOperator::Create(Instruction::And, AndI->getOperand(0),
+ AndI->getOperand(1), "", InsertPt);
+ // Propagate the debug info.
+ InsertedAnd->setDebugLoc(AndI->getDebugLoc());
+
+ // Replace a use of the 'and' with a use of the new 'and'.
+ TheUse = InsertedAnd;
+ ++NumAndUses;
+ DEBUG(User->getParent()->dump());
+ }
+
+ // We removed all uses, nuke the and.
+ AndI->eraseFromParent();
+ return true;
+}
+
/// Check if the candidates could be combined with a shift instruction, which
/// includes:
/// 1. Truncate instruction
@@ -2028,16 +2293,15 @@ bool CodeGenPrepare::optimizeCallInst(CallInst *CI, bool& ModifiedDT) {
}
if (TLI) {
- // Unknown address space.
- // TODO: Target hook to pick which address space the intrinsic cares
- // about?
- unsigned AddrSpace = ~0u;
SmallVector<Value*, 2> PtrOps;
Type *AccessTy;
- if (TLI->GetAddrModeArguments(II, PtrOps, AccessTy, AddrSpace))
- while (!PtrOps.empty())
- if (optimizeMemoryInst(II, PtrOps.pop_back_val(), AccessTy, AddrSpace))
+ if (TLI->getAddrModeArguments(II, PtrOps, AccessTy))
+ while (!PtrOps.empty()) {
+ Value *PtrVal = PtrOps.pop_back_val();
+ unsigned AS = PtrVal->getType()->getPointerAddressSpace();
+ if (optimizeMemoryInst(II, PtrVal, AccessTy, AS))
return true;
+ }
}
}
@@ -2168,11 +2432,11 @@ bool CodeGenPrepare::dupRetToEnableTailCallOpts(BasicBlock *BB) {
// Conservatively require the attributes of the call to match those of the
// return. Ignore noalias because it doesn't affect the call sequence.
- AttributeSet CalleeAttrs = CS.getAttributes();
- if (AttrBuilder(CalleeAttrs, AttributeSet::ReturnIndex).
- removeAttribute(Attribute::NoAlias) !=
- AttrBuilder(CalleeAttrs, AttributeSet::ReturnIndex).
- removeAttribute(Attribute::NoAlias))
+ AttributeList CalleeAttrs = CS.getAttributes();
+ if (AttrBuilder(CalleeAttrs, AttributeList::ReturnIndex)
+ .removeAttribute(Attribute::NoAlias) !=
+ AttrBuilder(CalleeAttrs, AttributeList::ReturnIndex)
+ .removeAttribute(Attribute::NoAlias))
continue;
// Make sure the call instruction is followed by an unconditional branch to
@@ -2561,25 +2825,30 @@ class TypePromotionTransaction {
OperandsHider Hider;
/// Keep track of the uses replaced, if any.
UsesReplacer *Replacer;
+ /// Keep track of instructions removed.
+ SetOfInstrs &RemovedInsts;
public:
/// \brief Remove all references of \p Inst and optionally replace all its
/// uses with New.
+ /// \p RemovedInsts Keep track of the instructions removed by this Action.
/// \pre If !Inst->use_empty(), then New != nullptr
- InstructionRemover(Instruction *Inst, Value *New = nullptr)
+ InstructionRemover(Instruction *Inst, SetOfInstrs &RemovedInsts,
+ Value *New = nullptr)
: TypePromotionAction(Inst), Inserter(Inst), Hider(Inst),
- Replacer(nullptr) {
+ Replacer(nullptr), RemovedInsts(RemovedInsts) {
if (New)
Replacer = new UsesReplacer(Inst, New);
DEBUG(dbgs() << "Do: InstructionRemover: " << *Inst << "\n");
+ RemovedInsts.insert(Inst);
+ // The instructions removed here will be freed after completing
+ // optimizeBlock() for all blocks as we need to keep track of the
+ // removed instructions during promotion.
Inst->removeFromParent();
}
~InstructionRemover() override { delete Replacer; }
- /// \brief Really remove the instruction.
- void commit() override { delete Inst; }
-
/// \brief Resurrect the instruction and reassign it to the proper uses if
/// new value was provided when build this action.
void undo() override {
@@ -2588,6 +2857,7 @@ class TypePromotionTransaction {
if (Replacer)
Replacer->undo();
Hider.undo();
+ RemovedInsts.erase(Inst);
}
};
@@ -2596,6 +2866,10 @@ public:
/// The restoration point is a pointer to an action instead of an iterator
/// because the iterator may be invalidated but not the pointer.
typedef const TypePromotionAction *ConstRestorationPt;
+
+ TypePromotionTransaction(SetOfInstrs &RemovedInsts)
+ : RemovedInsts(RemovedInsts) {}
+
/// Commit every change made in that transaction.
void commit();
/// Undo all the changes made after the given point.
@@ -2627,6 +2901,7 @@ private:
/// The ordered list of actions made so far.
SmallVector<std::unique_ptr<TypePromotionAction>, 16> Actions;
typedef SmallVectorImpl<std::unique_ptr<TypePromotionAction>>::iterator CommitPt;
+ SetOfInstrs &RemovedInsts;
};
void TypePromotionTransaction::setOperand(Instruction *Inst, unsigned Idx,
@@ -2638,7 +2913,8 @@ void TypePromotionTransaction::setOperand(Instruction *Inst, unsigned Idx,
void TypePromotionTransaction::eraseInstruction(Instruction *Inst,
Value *NewVal) {
Actions.push_back(
- make_unique<TypePromotionTransaction::InstructionRemover>(Inst, NewVal));
+ make_unique<TypePromotionTransaction::InstructionRemover>(Inst,
+ RemovedInsts, NewVal));
}
void TypePromotionTransaction::replaceAllUsesWith(Instruction *Inst,
@@ -2705,8 +2981,8 @@ void TypePromotionTransaction::rollback(
/// This encapsulates the logic for matching the target-legal addressing modes.
class AddressingModeMatcher {
SmallVectorImpl<Instruction*> &AddrModeInsts;
- const TargetMachine &TM;
const TargetLowering &TLI;
+ const TargetRegisterInfo &TRI;
const DataLayout &DL;
/// AccessTy/MemoryInst - This is the type for the access (e.g. double) and
@@ -2731,14 +3007,14 @@ class AddressingModeMatcher {
bool IgnoreProfitability;
AddressingModeMatcher(SmallVectorImpl<Instruction *> &AMI,
- const TargetMachine &TM, Type *AT, unsigned AS,
+ const TargetLowering &TLI,
+ const TargetRegisterInfo &TRI,
+ Type *AT, unsigned AS,
Instruction *MI, ExtAddrMode &AM,
const SetOfInstrs &InsertedInsts,
InstrToOrigTy &PromotedInsts,
TypePromotionTransaction &TPT)
- : AddrModeInsts(AMI), TM(TM),
- TLI(*TM.getSubtargetImpl(*MI->getParent()->getParent())
- ->getTargetLowering()),
+ : AddrModeInsts(AMI), TLI(TLI), TRI(TRI),
DL(MI->getModule()->getDataLayout()), AccessTy(AT), AddrSpace(AS),
MemoryInst(MI), AddrMode(AM), InsertedInsts(InsertedInsts),
PromotedInsts(PromotedInsts), TPT(TPT) {
@@ -2756,13 +3032,15 @@ public:
static ExtAddrMode Match(Value *V, Type *AccessTy, unsigned AS,
Instruction *MemoryInst,
SmallVectorImpl<Instruction*> &AddrModeInsts,
- const TargetMachine &TM,
+ const TargetLowering &TLI,
+ const TargetRegisterInfo &TRI,
const SetOfInstrs &InsertedInsts,
InstrToOrigTy &PromotedInsts,
TypePromotionTransaction &TPT) {
ExtAddrMode Result;
- bool Success = AddressingModeMatcher(AddrModeInsts, TM, AccessTy, AS,
+ bool Success = AddressingModeMatcher(AddrModeInsts, TLI, TRI,
+ AccessTy, AS,
MemoryInst, Result, InsertedInsts,
PromotedInsts, TPT).matchAddr(V, 0);
(void)Success; assert(Success && "Couldn't select *anything*?");
@@ -3583,18 +3861,18 @@ bool AddressingModeMatcher::matchAddr(Value *Addr, unsigned Depth) {
/// Check to see if all uses of OpVal by the specified inline asm call are due
/// to memory operands. If so, return true, otherwise return false.
static bool IsOperandAMemoryOperand(CallInst *CI, InlineAsm *IA, Value *OpVal,
- const TargetMachine &TM) {
+ const TargetLowering &TLI,
+ const TargetRegisterInfo &TRI) {
const Function *F = CI->getParent()->getParent();
- const TargetLowering *TLI = TM.getSubtargetImpl(*F)->getTargetLowering();
- const TargetRegisterInfo *TRI = TM.getSubtargetImpl(*F)->getRegisterInfo();
TargetLowering::AsmOperandInfoVector TargetConstraints =
- TLI->ParseConstraints(F->getParent()->getDataLayout(), TRI,
+ TLI.ParseConstraints(F->getParent()->getDataLayout(), &TRI,
ImmutableCallSite(CI));
+
for (unsigned i = 0, e = TargetConstraints.size(); i != e; ++i) {
TargetLowering::AsmOperandInfo &OpInfo = TargetConstraints[i];
// Compute the constraint code and ConstraintType to use.
- TLI->ComputeConstraintToUse(OpInfo, SDValue());
+ TLI.ComputeConstraintToUse(OpInfo, SDValue());
// If this asm operand is our Value*, and if it isn't an indirect memory
// operand, we can't fold it!
@@ -3613,7 +3891,8 @@ static bool IsOperandAMemoryOperand(CallInst *CI, InlineAsm *IA, Value *OpVal,
static bool FindAllMemoryUses(
Instruction *I,
SmallVectorImpl<std::pair<Instruction *, unsigned>> &MemoryUses,
- SmallPtrSetImpl<Instruction *> &ConsideredInsts, const TargetMachine &TM) {
+ SmallPtrSetImpl<Instruction *> &ConsideredInsts,
+ const TargetLowering &TLI, const TargetRegisterInfo &TRI) {
// If we already considered this instruction, we're done.
if (!ConsideredInsts.insert(I).second)
return false;
@@ -3635,11 +3914,28 @@ static bool FindAllMemoryUses(
if (StoreInst *SI = dyn_cast<StoreInst>(UserI)) {
unsigned opNo = U.getOperandNo();
- if (opNo == 0) return true; // Storing addr, not into addr.
+ if (opNo != StoreInst::getPointerOperandIndex())
+ return true; // Storing addr, not into addr.
MemoryUses.push_back(std::make_pair(SI, opNo));
continue;
}
+ if (AtomicRMWInst *RMW = dyn_cast<AtomicRMWInst>(UserI)) {
+ unsigned opNo = U.getOperandNo();
+ if (opNo != AtomicRMWInst::getPointerOperandIndex())
+ return true; // Storing addr, not into addr.
+ MemoryUses.push_back(std::make_pair(RMW, opNo));
+ continue;
+ }
+
+ if (AtomicCmpXchgInst *CmpX = dyn_cast<AtomicCmpXchgInst>(UserI)) {
+ unsigned opNo = U.getOperandNo();
+ if (opNo != AtomicCmpXchgInst::getPointerOperandIndex())
+ return true; // Storing addr, not into addr.
+ MemoryUses.push_back(std::make_pair(CmpX, opNo));
+ continue;
+ }
+
if (CallInst *CI = dyn_cast<CallInst>(UserI)) {
// If this is a cold call, we can sink the addressing calculation into
// the cold path. See optimizeCallInst
@@ -3650,12 +3946,12 @@ static bool FindAllMemoryUses(
if (!IA) return true;
// If this is a memory operand, we're cool, otherwise bail out.
- if (!IsOperandAMemoryOperand(CI, IA, I, TM))
+ if (!IsOperandAMemoryOperand(CI, IA, I, TLI, TRI))
return true;
continue;
}
- if (FindAllMemoryUses(UserI, MemoryUses, ConsideredInsts, TM))
+ if (FindAllMemoryUses(UserI, MemoryUses, ConsideredInsts, TLI, TRI))
return true;
}
@@ -3743,7 +4039,7 @@ isProfitableToFoldIntoAddressingMode(Instruction *I, ExtAddrMode &AMBefore,
// the use is just a particularly nice way of sinking it.
SmallVector<std::pair<Instruction*,unsigned>, 16> MemoryUses;
SmallPtrSet<Instruction*, 16> ConsideredInsts;
- if (FindAllMemoryUses(I, MemoryUses, ConsideredInsts, TM))
+ if (FindAllMemoryUses(I, MemoryUses, ConsideredInsts, TLI, TRI))
return false; // Has a non-memory, non-foldable use!
// Now that we know that all uses of this instruction are part of a chain of
@@ -3775,7 +4071,8 @@ isProfitableToFoldIntoAddressingMode(Instruction *I, ExtAddrMode &AMBefore,
ExtAddrMode Result;
TypePromotionTransaction::ConstRestorationPt LastKnownGood =
TPT.getRestorationPoint();
- AddressingModeMatcher Matcher(MatchedAddrModeInsts, TM, AddressAccessTy, AS,
+ AddressingModeMatcher Matcher(MatchedAddrModeInsts, TLI, TRI,
+ AddressAccessTy, AS,
MemoryInst, Result, InsertedInsts,
PromotedInsts, TPT);
Matcher.IgnoreProfitability = true;
@@ -3844,7 +4141,7 @@ bool CodeGenPrepare::optimizeMemoryInst(Instruction *MemoryInst, Value *Addr,
bool IsNumUsesConsensusValid = false;
SmallVector<Instruction*, 16> AddrModeInsts;
ExtAddrMode AddrMode;
- TypePromotionTransaction TPT;
+ TypePromotionTransaction TPT(RemovedInsts);
TypePromotionTransaction::ConstRestorationPt LastKnownGood =
TPT.getRestorationPoint();
while (!worklist.empty()) {
@@ -3869,7 +4166,7 @@ bool CodeGenPrepare::optimizeMemoryInst(Instruction *MemoryInst, Value *Addr,
// addressing instructions might have.
SmallVector<Instruction*, 16> NewAddrModeInsts;
ExtAddrMode NewAddrMode = AddressingModeMatcher::Match(
- V, AccessTy, AddrSpace, MemoryInst, NewAddrModeInsts, *TM,
+ V, AccessTy, AddrSpace, MemoryInst, NewAddrModeInsts, *TLI, *TRI,
InsertedInsts, PromotedInsts, TPT);
// This check is broken into two cases with very similar code to avoid using
@@ -3935,11 +4232,10 @@ bool CodeGenPrepare::optimizeMemoryInst(Instruction *MemoryInst, Value *Addr,
DEBUG(dbgs() << "CGP: Reusing nonlocal addrmode: " << AddrMode << " for "
<< *MemoryInst << "\n");
if (SunkAddr->getType() != Addr->getType())
- SunkAddr = Builder.CreateBitCast(SunkAddr, Addr->getType());
+ SunkAddr = Builder.CreatePointerCast(SunkAddr, Addr->getType());
} else if (AddrSinkUsingGEPs ||
(!AddrSinkUsingGEPs.getNumOccurrences() && TM &&
- TM->getSubtargetImpl(*MemoryInst->getParent()->getParent())
- ->useAA())) {
+ SubtargetInfo->useAA())) {
// By default, we use the GEP-based method when AA is used later. This
// prevents new inttoptr/ptrtoint pairs from degrading AA capabilities.
DEBUG(dbgs() << "CGP: SINKING nonlocal addrmode: " << AddrMode << " for "
@@ -4042,7 +4338,7 @@ bool CodeGenPrepare::optimizeMemoryInst(Instruction *MemoryInst, Value *Addr,
// We need to add this separately from the scale above to help with
// SDAG consecutive load/store merging.
if (ResultPtr->getType() != I8PtrTy)
- ResultPtr = Builder.CreateBitCast(ResultPtr, I8PtrTy);
+ ResultPtr = Builder.CreatePointerCast(ResultPtr, I8PtrTy);
ResultPtr = Builder.CreateGEP(I8Ty, ResultPtr, ResultIndex, "sunkaddr");
}
@@ -4053,12 +4349,12 @@ bool CodeGenPrepare::optimizeMemoryInst(Instruction *MemoryInst, Value *Addr,
SunkAddr = ResultPtr;
} else {
if (ResultPtr->getType() != I8PtrTy)
- ResultPtr = Builder.CreateBitCast(ResultPtr, I8PtrTy);
+ ResultPtr = Builder.CreatePointerCast(ResultPtr, I8PtrTy);
SunkAddr = Builder.CreateGEP(I8Ty, ResultPtr, ResultIndex, "sunkaddr");
}
if (SunkAddr->getType() != Addr->getType())
- SunkAddr = Builder.CreateBitCast(SunkAddr, Addr->getType());
+ SunkAddr = Builder.CreatePointerCast(SunkAddr, Addr->getType());
}
} else {
DEBUG(dbgs() << "CGP: SINKING nonlocal addrmode: " << AddrMode << " for "
@@ -4185,14 +4481,14 @@ bool CodeGenPrepare::optimizeInlineAsmInst(CallInst *CS) {
return MadeChange;
}
-/// \brief Check if all the uses of \p Inst are equivalent (or free) zero or
+/// \brief Check if all the uses of \p Val are equivalent (or free) zero or
/// sign extensions.
-static bool hasSameExtUse(Instruction *Inst, const TargetLowering &TLI) {
- assert(!Inst->use_empty() && "Input must have at least one use");
- const Instruction *FirstUser = cast<Instruction>(*Inst->user_begin());
+static bool hasSameExtUse(Value *Val, const TargetLowering &TLI) {
+ assert(!Val->use_empty() && "Input must have at least one use");
+ const Instruction *FirstUser = cast<Instruction>(*Val->user_begin());
bool IsSExt = isa<SExtInst>(FirstUser);
Type *ExtTy = FirstUser->getType();
- for (const User *U : Inst->users()) {
+ for (const User *U : Val->users()) {
const Instruction *UI = cast<Instruction>(U);
if ((IsSExt && !isa<SExtInst>(UI)) || (!IsSExt && !isa<ZExtInst>(UI)))
return false;
@@ -4202,11 +4498,11 @@ static bool hasSameExtUse(Instruction *Inst, const TargetLowering &TLI) {
continue;
// If IsSExt is true, we are in this situation:
- // a = Inst
+ // a = Val
// b = sext ty1 a to ty2
// c = sext ty1 a to ty3
// Assuming ty2 is shorter than ty3, this could be turned into:
- // a = Inst
+ // a = Val
// b = sext ty1 a to ty2
// c = sext ty2 b to ty3
// However, the last sext is not free.
@@ -4233,51 +4529,44 @@ static bool hasSameExtUse(Instruction *Inst, const TargetLowering &TLI) {
return true;
}
-/// \brief Try to form ExtLd by promoting \p Exts until they reach a
-/// load instruction.
-/// If an ext(load) can be formed, it is returned via \p LI for the load
-/// and \p Inst for the extension.
-/// Otherwise LI == nullptr and Inst == nullptr.
-/// When some promotion happened, \p TPT contains the proper state to
-/// revert them.
+/// \brief Try to speculatively promote extensions in \p Exts and continue
+/// promoting through newly promoted operands recursively as far as doing so is
+/// profitable. Save extensions profitably moved up in \p ProfitablyMovedExts.
+/// When some promotion happened, \p TPT contains the proper state to revert
+/// them.
///
-/// \return true when promoting was necessary to expose the ext(load)
-/// opportunity, false otherwise.
-///
-/// Example:
-/// \code
-/// %ld = load i32* %addr
-/// %add = add nuw i32 %ld, 4
-/// %zext = zext i32 %add to i64
-/// \endcode
-/// =>
-/// \code
-/// %ld = load i32* %addr
-/// %zext = zext i32 %ld to i64
-/// %add = add nuw i64 %zext, 4
-/// \encode
-/// Thanks to the promotion, we can match zext(load i32*) to i64.
-bool CodeGenPrepare::extLdPromotion(TypePromotionTransaction &TPT,
- LoadInst *&LI, Instruction *&Inst,
- const SmallVectorImpl<Instruction *> &Exts,
- unsigned CreatedInstsCost = 0) {
- // Iterate over all the extensions to see if one form an ext(load).
+/// \return true if some promotion happened, false otherwise.
+bool CodeGenPrepare::tryToPromoteExts(
+ TypePromotionTransaction &TPT, const SmallVectorImpl<Instruction *> &Exts,
+ SmallVectorImpl<Instruction *> &ProfitablyMovedExts,
+ unsigned CreatedInstsCost) {
+ bool Promoted = false;
+
+ // Iterate over all the extensions to try to promote them.
for (auto I : Exts) {
- // Check if we directly have ext(load).
- if ((LI = dyn_cast<LoadInst>(I->getOperand(0)))) {
- Inst = I;
- // No promotion happened here.
- return false;
+ // Early check if we directly have ext(load).
+ if (isa<LoadInst>(I->getOperand(0))) {
+ ProfitablyMovedExts.push_back(I);
+ continue;
}
- // Check whether or not we want to do any promotion.
+
+ // Check whether or not we want to do any promotion. The reason we have
+ // this check inside the for loop is to catch the case where an extension
+ // is directly fed by a load, because in such a case the extension can be
+ // moved up without any promotion of its operands.
if (!TLI || !TLI->enableExtLdPromotion() || DisableExtLdPromotion)
- continue;
+ return false;
+
// Get the action to perform the promotion.
- TypePromotionHelper::Action TPH = TypePromotionHelper::getAction(
- I, InsertedInsts, *TLI, PromotedInsts);
+ TypePromotionHelper::Action TPH =
+ TypePromotionHelper::getAction(I, InsertedInsts, *TLI, PromotedInsts);
// Check if we can promote.
- if (!TPH)
+ if (!TPH) {
+ // Save the current extension as we cannot move up through its operand.
+ ProfitablyMovedExts.push_back(I);
continue;
+ }
+
// Save the current state.
TypePromotionTransaction::ConstRestorationPt LastKnownGood =
TPT.getRestorationPoint();
@@ -4297,110 +4586,293 @@ bool CodeGenPrepare::extLdPromotion(TypePromotionTransaction &TPT,
// one extension but leave one. However, we optimistically keep going,
// because the new extension may be removed too.
long long TotalCreatedInstsCost = CreatedInstsCost + NewCreatedInstsCost;
- TotalCreatedInstsCost -= ExtCost;
+ // FIXME: It would be possible to propagate a negative value instead of
+ // conservatively ceiling it to 0.
+ TotalCreatedInstsCost =
+ std::max((long long)0, (TotalCreatedInstsCost - ExtCost));
if (!StressExtLdPromotion &&
(TotalCreatedInstsCost > 1 ||
!isPromotedInstructionLegal(*TLI, *DL, PromotedVal))) {
- // The promotion is not profitable, rollback to the previous state.
+ // This promotion is not profitable; roll back to the previous state, and
+ // save the current extension in ProfitablyMovedExts as the latest
+ // speculative promotion turned out to be unprofitable.
TPT.rollback(LastKnownGood);
+ ProfitablyMovedExts.push_back(I);
+ continue;
+ }
+ // Continue promoting NewExts as far as doing so is profitable.
+ SmallVector<Instruction *, 2> NewlyMovedExts;
+ (void)tryToPromoteExts(TPT, NewExts, NewlyMovedExts, TotalCreatedInstsCost);
+ bool NewPromoted = false;
+ for (auto ExtInst : NewlyMovedExts) {
+ Instruction *MovedExt = cast<Instruction>(ExtInst);
+ Value *ExtOperand = MovedExt->getOperand(0);
+ // If we have reached a load, we need this extra profitability check
+ // as it could potentially be merged into an ext(load).
+ if (isa<LoadInst>(ExtOperand) &&
+ !(StressExtLdPromotion || NewCreatedInstsCost <= ExtCost ||
+ (ExtOperand->hasOneUse() || hasSameExtUse(ExtOperand, *TLI))))
+ continue;
+
+ ProfitablyMovedExts.push_back(MovedExt);
+ NewPromoted = true;
+ }
+
+ // If none of the speculative promotions for NewExts is profitable, roll back
+ // and save the current extension (I) as the last profitable extension.
+ if (!NewPromoted) {
+ TPT.rollback(LastKnownGood);
+ ProfitablyMovedExts.push_back(I);
continue;
}
// The promotion is profitable.
- // Check if it exposes an ext(load).
- (void)extLdPromotion(TPT, LI, Inst, NewExts, TotalCreatedInstsCost);
- if (LI && (StressExtLdPromotion || NewCreatedInstsCost <= ExtCost ||
- // If we have created a new extension, i.e., now we have two
- // extensions. We must make sure one of them is merged with
- // the load, otherwise we may degrade the code quality.
- (LI->hasOneUse() || hasSameExtUse(LI, *TLI))))
- // Promotion happened.
- return true;
- // If this does not help to expose an ext(load) then, rollback.
- TPT.rollback(LastKnownGood);
+ Promoted = true;
}
- // None of the extension can form an ext(load).
- LI = nullptr;
- Inst = nullptr;
- return false;
+ return Promoted;
}
-/// Move a zext or sext fed by a load into the same basic block as the load,
-/// unless conditions are unfavorable. This allows SelectionDAG to fold the
-/// extend into the load.
-/// \p I[in/out] the extension may be modified during the process if some
-/// promotions apply.
-///
-bool CodeGenPrepare::moveExtToFormExtLoad(Instruction *&I) {
- // ExtLoad formation infrastructure requires TLI to be effective.
- if (!TLI)
- return false;
+/// Merge redundant sexts when one dominates the other.
+bool CodeGenPrepare::mergeSExts(Function &F) {
+ DominatorTree DT(F);
+ bool Changed = false;
+ for (auto &Entry : ValToSExtendedUses) {
+ SExts &Insts = Entry.second;
+ SExts CurPts;
+ for (Instruction *Inst : Insts) {
+ if (RemovedInsts.count(Inst) || !isa<SExtInst>(Inst) ||
+ Inst->getOperand(0) != Entry.first)
+ continue;
+ bool inserted = false;
+ for (auto &Pt : CurPts) {
+ if (DT.dominates(Inst, Pt)) {
+ Pt->replaceAllUsesWith(Inst);
+ RemovedInsts.insert(Pt);
+ Pt->removeFromParent();
+ Pt = Inst;
+ inserted = true;
+ Changed = true;
+ break;
+ }
+ if (!DT.dominates(Pt, Inst))
+ // Give up if we need to merge in a common dominator as the
+ // experiments show it is not profitable.
+ continue;
+ Inst->replaceAllUsesWith(Pt);
+ RemovedInsts.insert(Inst);
+ Inst->removeFromParent();
+ inserted = true;
+ Changed = true;
+ break;
+ }
+ if (!inserted)
+ CurPts.push_back(Inst);
+ }
+ }
+ return Changed;
+}
- // Try to promote a chain of computation if it allows to form
- // an extended load.
- TypePromotionTransaction TPT;
- TypePromotionTransaction::ConstRestorationPt LastKnownGood =
- TPT.getRestorationPoint();
- SmallVector<Instruction *, 1> Exts;
- Exts.push_back(I);
- // Look for a load being extended.
- LoadInst *LI = nullptr;
- Instruction *OldExt = I;
- bool HasPromoted = extLdPromotion(TPT, LI, I, Exts);
- if (!LI || !I) {
- assert(!HasPromoted && !LI && "If we did not match any load instruction "
- "the code must remain the same");
- I = OldExt;
- return false;
+/// Return true if an ext(load) can be formed from an extension in
+/// \p MovedExts.
+bool CodeGenPrepare::canFormExtLd(
+ const SmallVectorImpl<Instruction *> &MovedExts, LoadInst *&LI,
+ Instruction *&Inst, bool HasPromoted) {
+ for (auto *MovedExtInst : MovedExts) {
+ if (isa<LoadInst>(MovedExtInst->getOperand(0))) {
+ LI = cast<LoadInst>(MovedExtInst->getOperand(0));
+ Inst = MovedExtInst;
+ break;
+ }
}
+ if (!LI)
+ return false;
// If they're already in the same block, there's nothing to do.
// Make the cheap checks first if we did not promote.
// If we promoted, we need to check if it is indeed profitable.
- if (!HasPromoted && LI->getParent() == I->getParent())
+ if (!HasPromoted && LI->getParent() == Inst->getParent())
return false;
- EVT VT = TLI->getValueType(*DL, I->getType());
+ EVT VT = TLI->getValueType(*DL, Inst->getType());
EVT LoadVT = TLI->getValueType(*DL, LI->getType());
// If the load has other users and the truncate is not free, this probably
// isn't worthwhile.
- if (!LI->hasOneUse() &&
- (TLI->isTypeLegal(LoadVT) || !TLI->isTypeLegal(VT)) &&
- !TLI->isTruncateFree(I->getType(), LI->getType())) {
- I = OldExt;
- TPT.rollback(LastKnownGood);
+ if (!LI->hasOneUse() && (TLI->isTypeLegal(LoadVT) || !TLI->isTypeLegal(VT)) &&
+ !TLI->isTruncateFree(Inst->getType(), LI->getType()))
return false;
- }
// Check whether the target supports casts folded into loads.
unsigned LType;
- if (isa<ZExtInst>(I))
+ if (isa<ZExtInst>(Inst))
LType = ISD::ZEXTLOAD;
else {
- assert(isa<SExtInst>(I) && "Unexpected ext type!");
+ assert(isa<SExtInst>(Inst) && "Unexpected ext type!");
LType = ISD::SEXTLOAD;
}
- if (!TLI->isLoadExtLegal(LType, VT, LoadVT)) {
- I = OldExt;
- TPT.rollback(LastKnownGood);
+
+ return TLI->isLoadExtLegal(LType, VT, LoadVT);
+}
+
+/// Move a zext or sext fed by a load into the same basic block as the load,
+/// unless conditions are unfavorable. This allows SelectionDAG to fold the
+/// extend into the load.
+///
+/// E.g.,
+/// \code
+/// %ld = load i32* %addr
+/// %add = add nuw i32 %ld, 4
+/// %zext = zext i32 %add to i64
+/// \endcode
+/// =>
+/// \code
+/// %ld = load i32* %addr
+/// %zext = zext i32 %ld to i64
+/// %add = add nuw i64 %zext, 4
+/// \endcode
+/// Note that the promotion of %add to i64 is done in tryToPromoteExts(), which
+/// allows us to match zext(load i32*) to i64.
+///
+/// Also, try to promote the computations used to obtain a sign extended
+/// value used into memory accesses.
+/// E.g.,
+/// \code
+/// a = add nsw i32 b, 3
+/// d = sext i32 a to i64
+/// e = getelementptr ..., i64 d
+/// \endcode
+/// =>
+/// \code
+/// f = sext i32 b to i64
+/// a = add nsw i64 f, 3
+/// e = getelementptr ..., i64 a
+/// \endcode
+///
+/// \p Inst[in/out] the extension may be modified during the process if some
+/// promotions apply.
+bool CodeGenPrepare::optimizeExt(Instruction *&Inst) {
+ // ExtLoad formation and address type promotion infrastructure requires TLI to
+ // be effective.
+ if (!TLI)
return false;
+
+ bool AllowPromotionWithoutCommonHeader = false;
+ // See if this is an interesting sext operation for address type promotion
+ // before trying to promote it, e.g., one with the right type that is used
+ // in memory accesses.
+ bool ATPConsiderable = TTI->shouldConsiderAddressTypePromotion(
+ *Inst, AllowPromotionWithoutCommonHeader);
+ TypePromotionTransaction TPT(RemovedInsts);
+ TypePromotionTransaction::ConstRestorationPt LastKnownGood =
+ TPT.getRestorationPoint();
+ SmallVector<Instruction *, 1> Exts;
+ SmallVector<Instruction *, 2> SpeculativelyMovedExts;
+ Exts.push_back(Inst);
+
+ bool HasPromoted = tryToPromoteExts(TPT, Exts, SpeculativelyMovedExts);
+
+ // Look for a load being extended.
+ LoadInst *LI = nullptr;
+ Instruction *ExtFedByLoad;
+
+ // Try to promote a chain of computation if it allows to form an extended
+ // load.
+ if (canFormExtLd(SpeculativelyMovedExts, LI, ExtFedByLoad, HasPromoted)) {
+ assert(LI && ExtFedByLoad && "Expect a valid load and extension");
+ TPT.commit();
+ // Move the extend into the same block as the load
+ ExtFedByLoad->removeFromParent();
+ ExtFedByLoad->insertAfter(LI);
+ // CGP does not check if the zext would be speculatively executed when moved
+ // to the same basic block as the load. Preserving its original location
+ // would pessimize the debugging experience, as well as negatively impact
+ // the quality of sample pgo. We don't want to use "line 0" as that has a
+ // size cost in the line-table section and logically the zext can be seen as
+ // part of the load. Therefore we conservatively reuse the same debug
+ // location for the load and the zext.
+ ExtFedByLoad->setDebugLoc(LI->getDebugLoc());
+ ++NumExtsMoved;
+ Inst = ExtFedByLoad;
+ return true;
}
- // Move the extend into the same block as the load, so that SelectionDAG
- // can fold it.
- TPT.commit();
- I->removeFromParent();
- I->insertAfter(LI);
- // CGP does not check if the zext would be speculatively executed when moved
- // to the same basic block as the load. Preserving its original location would
- // pessimize the debugging experience, as well as negatively impact the
- // quality of sample pgo. We don't want to use "line 0" as that has a
- // size cost in the line-table section and logically the zext can be seen as
- // part of the load. Therefore we conservatively reuse the same debug location
- // for the load and the zext.
- I->setDebugLoc(LI->getDebugLoc());
- ++NumExtsMoved;
- return true;
+ // Continue promoting the sexts if the target considers this address type
+ // promotion worthwhile.
+ if (ATPConsiderable &&
+ performAddressTypePromotion(Inst, AllowPromotionWithoutCommonHeader,
+ HasPromoted, TPT, SpeculativelyMovedExts))
+ return true;
+
+ TPT.rollback(LastKnownGood);
+ return false;
+}
+
+// Perform address type promotion if doing so is profitable.
+// If AllowPromotionWithoutCommonHeader == false, we should find other sext
+// instructions that sign-extend the same initial value. However, if
+// AllowPromotionWithoutCommonHeader == true, promoting the extension is
+// expected to be profitable on its own.
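+// For illustration (made-up IR, not taken from a test case): given two chains
+// \code
+//   %s1 = sext i32 %a to i64   ; first chain seen for header %a -> deferred
+//   ...
+//   %s2 = sext i32 %a to i64   ; second chain with the same header %a
+// \endcode
+// encountering %s2 makes both chains eligible for promotion.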
+bool CodeGenPrepare::performAddressTypePromotion(
+ Instruction *&Inst, bool AllowPromotionWithoutCommonHeader,
+ bool HasPromoted, TypePromotionTransaction &TPT,
+ SmallVectorImpl<Instruction *> &SpeculativelyMovedExts) {
+ bool Promoted = false;
+ SmallPtrSet<Instruction *, 1> UnhandledExts;
+ bool AllSeenFirst = true;
+ for (auto I : SpeculativelyMovedExts) {
+ Value *HeadOfChain = I->getOperand(0);
+ DenseMap<Value *, Instruction *>::iterator AlreadySeen =
+ SeenChainsForSExt.find(HeadOfChain);
+ // If there is an unhandled SExt which has the same header, try to promote
+ // it as well.
+ if (AlreadySeen != SeenChainsForSExt.end()) {
+ if (AlreadySeen->second != nullptr)
+ UnhandledExts.insert(AlreadySeen->second);
+ AllSeenFirst = false;
+ }
+ }
+
+ if (!AllSeenFirst || (AllowPromotionWithoutCommonHeader &&
+ SpeculativelyMovedExts.size() == 1)) {
+ TPT.commit();
+ if (HasPromoted)
+ Promoted = true;
+ for (auto I : SpeculativelyMovedExts) {
+ Value *HeadOfChain = I->getOperand(0);
+ SeenChainsForSExt[HeadOfChain] = nullptr;
+ ValToSExtendedUses[HeadOfChain].push_back(I);
+ }
+ // Update Inst since a promotion happened.
+ Inst = SpeculativelyMovedExts.pop_back_val();
+ } else {
+ // This is the first chain visited from this header; keep the current chain
+ // as unhandled. Defer its promotion until we encounter another SExt chain
+ // derived from the same header.
+ for (auto I : SpeculativelyMovedExts) {
+ Value *HeadOfChain = I->getOperand(0);
+ SeenChainsForSExt[HeadOfChain] = Inst;
+ }
+ return false;
+ }
+
+ if (!AllSeenFirst && !UnhandledExts.empty())
+ for (auto VisitedSExt : UnhandledExts) {
+ if (RemovedInsts.count(VisitedSExt))
+ continue;
+ TypePromotionTransaction TPT(RemovedInsts);
+ SmallVector<Instruction *, 1> Exts;
+ SmallVector<Instruction *, 2> Chains;
+ Exts.push_back(VisitedSExt);
+ bool HasPromoted = tryToPromoteExts(TPT, Exts, Chains);
+ TPT.commit();
+ if (HasPromoted)
+ Promoted = true;
+ for (auto I : Chains) {
+ Value *HeadOfChain = I->getOperand(0);
+ // Mark this as handled.
+ SeenChainsForSExt[HeadOfChain] = nullptr;
+ ValToSExtendedUses[HeadOfChain].push_back(I);
+ }
+ }
+ return Promoted;
}
bool CodeGenPrepare::optimizeExtUses(Instruction *I) {
@@ -4534,13 +5006,10 @@ bool CodeGenPrepare::optimizeLoadExt(LoadInst *Load) {
!(Load->getType()->isIntegerTy() || Load->getType()->isPointerTy()))
return false;
- // Skip loads we've already transformed or have no reason to transform.
- if (Load->hasOneUse()) {
- User *LoadUser = *Load->user_begin();
- if (cast<Instruction>(LoadUser)->getParent() == Load->getParent() &&
- !dyn_cast<PHINode>(LoadUser))
- return false;
- }
+ // Skip loads we've already transformed.
+ if (Load->hasOneUse() &&
+ InsertedInsts.count(cast<Instruction>(*Load->user_begin())))
+ return false;
// Look at all uses of Load, looking through phis, to determine how many bits
// of the loaded value are needed.
@@ -4620,7 +5089,7 @@ bool CodeGenPrepare::optimizeLoadExt(LoadInst *Load) {
//
// Also avoid hoisting if we didn't see any ands with the exact DemandBits
// mask, since these are the only ands that will be removed by isel.
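+  // (E.g., with ActiveBits == 8 we proceed only when DemandBits is exactly
+  // 0xFF; illustrative values.)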
- if (ActiveBits <= 1 || !APIntOps::isMask(ActiveBits, DemandBits) ||
+ if (ActiveBits <= 1 || !DemandBits.isMask(ActiveBits) ||
WidestAndBits != DemandBits)
return false;
@@ -4636,6 +5105,9 @@ bool CodeGenPrepare::optimizeLoadExt(LoadInst *Load) {
IRBuilder<> Builder(Load->getNextNode());
auto *NewAnd = dyn_cast<Instruction>(
Builder.CreateAnd(Load, ConstantInt::get(Ctx, DemandBits)));
+ // Mark this instruction as "inserted by CGP", so that other
+ // optimizations don't touch it.
+ InsertedInsts.insert(NewAnd);
// Replace all uses of load with new and (except for the use of load in the
// new and itself).
@@ -4985,7 +5457,7 @@ bool CodeGenPrepare::optimizeSwitchInst(SwitchInst *SI) {
auto *ExtInst = CastInst::Create(ExtType, Cond, NewType);
ExtInst->insertBefore(SI);
SI->setCondition(ExtInst);
- for (SwitchInst::CaseIt Case : SI->cases()) {
+ for (auto Case : SI->cases()) {
APInt NarrowConst = Case.getCaseValue()->getValue();
APInt WideConst = (ExtType == Instruction::ZExt) ?
NarrowConst.zext(RegWidth) : NarrowConst.sext(RegWidth);
@@ -5514,7 +5986,7 @@ bool CodeGenPrepare::optimizeInst(Instruction *I, bool& ModifiedDT) {
TargetLowering::TypeExpandInteger) {
return SinkCast(CI);
} else {
- bool MadeChange = moveExtToFormExtLoad(I);
+ bool MadeChange = optimizeExt(I);
return MadeChange | optimizeExtUses(I);
}
}
@@ -5548,8 +6020,24 @@ bool CodeGenPrepare::optimizeInst(Instruction *I, bool& ModifiedDT) {
return false;
}
+ if (AtomicRMWInst *RMW = dyn_cast<AtomicRMWInst>(I)) {
+ unsigned AS = RMW->getPointerAddressSpace();
+ return optimizeMemoryInst(I, RMW->getPointerOperand(),
+ RMW->getType(), AS);
+ }
+
+ if (AtomicCmpXchgInst *CmpX = dyn_cast<AtomicCmpXchgInst>(I)) {
+ unsigned AS = CmpX->getPointerAddressSpace();
+ return optimizeMemoryInst(I, CmpX->getPointerOperand(),
+ CmpX->getCompareOperand()->getType(), AS);
+ }
+
BinaryOperator *BinOp = dyn_cast<BinaryOperator>(I);
+ if (BinOp && (BinOp->getOpcode() == Instruction::And) &&
+ EnableAndCmpSinking && TLI)
+ return sinkAndCmp0Expression(BinOp, *TLI, InsertedInsts);
+
if (BinOp && (BinOp->getOpcode() == Instruction::AShr ||
BinOp->getOpcode() == Instruction::LShr)) {
ConstantInt *CI = dyn_cast<ConstantInt>(BinOp->getOperand(1));
@@ -5679,68 +6167,6 @@ bool CodeGenPrepare::placeDbgValues(Function &F) {
return MadeChange;
}
-// If there is a sequence that branches based on comparing a single bit
-// against zero that can be combined into a single instruction, and the
-// target supports folding these into a single instruction, sink the
-// mask and compare into the branch uses. Do this before OptimizeBlock ->
-// OptimizeInst -> OptimizeCmpExpression, which perturbs the pattern being
-// searched for.
-bool CodeGenPrepare::sinkAndCmp(Function &F) {
- if (!EnableAndCmpSinking)
- return false;
- if (!TLI || !TLI->isMaskAndBranchFoldingLegal())
- return false;
- bool MadeChange = false;
- for (BasicBlock &BB : F) {
- // Does this BB end with the following?
- // %andVal = and %val, #single-bit-set
- // %icmpVal = icmp %andResult, 0
- // br i1 %cmpVal label %dest1, label %dest2"
- BranchInst *Brcc = dyn_cast<BranchInst>(BB.getTerminator());
- if (!Brcc || !Brcc->isConditional())
- continue;
- ICmpInst *Cmp = dyn_cast<ICmpInst>(Brcc->getOperand(0));
- if (!Cmp || Cmp->getParent() != &BB)
- continue;
- ConstantInt *Zero = dyn_cast<ConstantInt>(Cmp->getOperand(1));
- if (!Zero || !Zero->isZero())
- continue;
- Instruction *And = dyn_cast<Instruction>(Cmp->getOperand(0));
- if (!And || And->getOpcode() != Instruction::And || And->getParent() != &BB)
- continue;
- ConstantInt* Mask = dyn_cast<ConstantInt>(And->getOperand(1));
- if (!Mask || !Mask->getUniqueInteger().isPowerOf2())
- continue;
- DEBUG(dbgs() << "found and; icmp ?,0; brcc\n"); DEBUG(BB.dump());
-
- // Push the "and; icmp" for any users that are conditional branches.
- // Since there can only be one branch use per BB, we don't need to keep
- // track of which BBs we insert into.
- for (Use &TheUse : Cmp->uses()) {
- // Find brcc use.
- BranchInst *BrccUser = dyn_cast<BranchInst>(TheUse);
- if (!BrccUser || !BrccUser->isConditional())
- continue;
- BasicBlock *UserBB = BrccUser->getParent();
- if (UserBB == &BB) continue;
- DEBUG(dbgs() << "found Brcc use\n");
-
- // Sink the "and; icmp" to use.
- MadeChange = true;
- BinaryOperator *NewAnd =
- BinaryOperator::CreateAnd(And->getOperand(0), And->getOperand(1), "",
- BrccUser);
- CmpInst *NewCmp =
- CmpInst::Create(Cmp->getOpcode(), Cmp->getPredicate(), NewAnd, Zero,
- "", BrccUser);
- TheUse = NewCmp;
- ++NumAndCmpsMoved;
- DEBUG(BrccUser->getParent()->dump());
- }
- }
- return MadeChange;
-}
-
/// \brief Scale down both weights to fit into uint32_t.
static void scaleWeights(uint64_t &NewTrue, uint64_t &NewFalse) {
uint64_t NewMax = (NewTrue > NewFalse) ? NewTrue : NewFalse;
diff --git a/lib/CodeGen/CountingFunctionInserter.cpp b/lib/CodeGen/CountingFunctionInserter.cpp
index 1e46a7a99e7e..7f7350f5fb5c 100644
--- a/lib/CodeGen/CountingFunctionInserter.cpp
+++ b/lib/CodeGen/CountingFunctionInserter.cpp
@@ -41,7 +41,7 @@ namespace {
Type *VoidTy = Type::getVoidTy(F.getContext());
Constant *CountingFn =
F.getParent()->getOrInsertFunction(CountingFunctionName,
- VoidTy, nullptr);
+ VoidTy);
CallInst::Create(CountingFn, "", &*F.begin()->getFirstInsertionPt());
return true;
}
diff --git a/lib/CodeGen/CriticalAntiDepBreaker.cpp b/lib/CodeGen/CriticalAntiDepBreaker.cpp
index 5d60c3055456..e1eeddf0816c 100644
--- a/lib/CodeGen/CriticalAntiDepBreaker.cpp
+++ b/lib/CodeGen/CriticalAntiDepBreaker.cpp
@@ -71,8 +71,11 @@ void CriticalAntiDepBreaker::StartBlock(MachineBasicBlock *BB) {
// callee-saved register that is not saved in the prolog.
const MachineFrameInfo &MFI = MF.getFrameInfo();
BitVector Pristine = MFI.getPristineRegs(MF);
- for (const MCPhysReg *I = TRI->getCalleeSavedRegs(&MF); *I; ++I) {
- if (!IsReturnBlock && !Pristine.test(*I)) continue;
+ for (const MCPhysReg *I = MF.getRegInfo().getCalleeSavedRegs(); *I; ++I) {
+ unsigned Reg = *I;
+ if (!IsReturnBlock && !(Pristine.test(Reg) || BB->isLiveIn(Reg)))
+ continue;
for (MCRegAliasIterator AI(*I, TRI, true); AI.isValid(); ++AI) {
unsigned Reg = *AI;
Classes[Reg] = reinterpret_cast<TargetRegisterClass *>(-1);
diff --git a/lib/CodeGen/DeadMachineInstructionElim.cpp b/lib/CodeGen/DeadMachineInstructionElim.cpp
index 17c229a216ae..7ac2e5445435 100644
--- a/lib/CodeGen/DeadMachineInstructionElim.cpp
+++ b/lib/CodeGen/DeadMachineInstructionElim.cpp
@@ -110,7 +110,7 @@ bool DeadMachineInstructionElim::runOnMachineFunction(MachineFunction &MF) {
// Start out assuming that reserved registers are live out of this block.
LivePhysRegs = MRI->getReservedRegs();
- // Add live-ins from sucessors to LivePhysRegs. Normally, physregs are not
+ // Add live-ins from successors to LivePhysRegs. Normally, physregs are not
// live across blocks, but some targets (x86) can have flags live out of a
// block.
for (MachineBasicBlock::succ_iterator S = MBB.succ_begin(),
diff --git a/lib/CodeGen/DetectDeadLanes.cpp b/lib/CodeGen/DetectDeadLanes.cpp
index a7ba694c144d..6f4ea1912cf4 100644
--- a/lib/CodeGen/DetectDeadLanes.cpp
+++ b/lib/CodeGen/DetectDeadLanes.cpp
@@ -441,7 +441,7 @@ LaneBitmask DetectDeadLanes::determineInitialUsedLanes(unsigned Reg) {
const TargetRegisterClass *DstRC = MRI->getRegClass(DefReg);
CrossCopy = isCrossCopy(*MRI, UseMI, DstRC, MO);
if (CrossCopy)
- DEBUG(dbgs() << "Copy accross incompatible classes: " << UseMI);
+ DEBUG(dbgs() << "Copy across incompatible classes: " << UseMI);
}
if (!CrossCopy)
diff --git a/lib/CodeGen/ExecutionDepsFix.cpp b/lib/CodeGen/ExecutionDepsFix.cpp
index 32c57e3e3705..e272d25047e6 100644
--- a/lib/CodeGen/ExecutionDepsFix.cpp
+++ b/lib/CodeGen/ExecutionDepsFix.cpp
@@ -6,21 +6,9 @@
// License. See LICENSE.TXT for details.
//
//===----------------------------------------------------------------------===//
-//
-// This file contains the execution dependency fix pass.
-//
-// Some X86 SSE instructions like mov, and, or, xor are available in different
-// variants for different operand types. These variant instructions are
-// equivalent, but on Nehalem and newer cpus there is extra latency
-// transferring data between integer and floating point domains. ARM cores
-// have similar issues when they are configured with both VFP and NEON
-// pipelines.
-//
-// This pass changes the variant instructions to minimize domain crossings.
-//
-//===----------------------------------------------------------------------===//
-#include "llvm/CodeGen/Passes.h"
+#include "llvm/CodeGen/ExecutionDepsFix.h"
+
#include "llvm/ADT/PostOrderIterator.h"
#include "llvm/ADT/iterator_range.h"
#include "llvm/CodeGen/LivePhysRegs.h"
@@ -35,193 +23,18 @@
using namespace llvm;
-#define DEBUG_TYPE "execution-fix"
-
-/// A DomainValue is a bit like LiveIntervals' ValNo, but it also keeps track
-/// of execution domains.
-///
-/// An open DomainValue represents a set of instructions that can still switch
-/// execution domain. Multiple registers may refer to the same open
-/// DomainValue - they will eventually be collapsed to the same execution
-/// domain.
-///
-/// A collapsed DomainValue represents a single register that has been forced
-/// into one of more execution domains. There is a separate collapsed
-/// DomainValue for each register, but it may contain multiple execution
-/// domains. A register value is initially created in a single execution
-/// domain, but if we were forced to pay the penalty of a domain crossing, we
-/// keep track of the fact that the register is now available in multiple
-/// domains.
-namespace {
-struct DomainValue {
- // Basic reference counting.
- unsigned Refs;
-
- // Bitmask of available domains. For an open DomainValue, it is the still
- // possible domains for collapsing. For a collapsed DomainValue it is the
- // domains where the register is available for free.
- unsigned AvailableDomains;
-
- // Pointer to the next DomainValue in a chain. When two DomainValues are
- // merged, Victim.Next is set to point to Victor, so old DomainValue
- // references can be updated by following the chain.
- DomainValue *Next;
-
- // Twiddleable instructions using or defining these registers.
- SmallVector<MachineInstr*, 8> Instrs;
-
- // A collapsed DomainValue has no instructions to twiddle - it simply keeps
- // track of the domains where the registers are already available.
- bool isCollapsed() const { return Instrs.empty(); }
-
- // Is domain available?
- bool hasDomain(unsigned domain) const {
- assert(domain <
- static_cast<unsigned>(std::numeric_limits<unsigned>::digits) &&
- "undefined behavior");
- return AvailableDomains & (1u << domain);
- }
-
- // Mark domain as available.
- void addDomain(unsigned domain) {
- AvailableDomains |= 1u << domain;
- }
-
- // Restrict to a single domain available.
- void setSingleDomain(unsigned domain) {
- AvailableDomains = 1u << domain;
- }
-
- // Return bitmask of domains that are available and in mask.
- unsigned getCommonDomains(unsigned mask) const {
- return AvailableDomains & mask;
- }
-
- // First domain available.
- unsigned getFirstDomain() const {
- return countTrailingZeros(AvailableDomains);
- }
-
- DomainValue() : Refs(0) { clear(); }
-
- // Clear this DomainValue and point to next which has all its data.
- void clear() {
- AvailableDomains = 0;
- Next = nullptr;
- Instrs.clear();
- }
-};
-}
-
-namespace {
-/// Information about a live register.
-struct LiveReg {
- /// Value currently in this register, or NULL when no value is being tracked.
- /// This counts as a DomainValue reference.
- DomainValue *Value;
-
- /// Instruction that defined this register, relative to the beginning of the
- /// current basic block. When a LiveReg is used to represent a live-out
- /// register, this value is relative to the end of the basic block, so it
- /// will be a negative number.
- int Def;
-};
-} // anonymous namespace
-
-namespace {
-class ExeDepsFix : public MachineFunctionPass {
- static char ID;
- SpecificBumpPtrAllocator<DomainValue> Allocator;
- SmallVector<DomainValue*,16> Avail;
-
- const TargetRegisterClass *const RC;
- MachineFunction *MF;
- const TargetInstrInfo *TII;
- const TargetRegisterInfo *TRI;
- RegisterClassInfo RegClassInfo;
- std::vector<SmallVector<int, 1>> AliasMap;
- const unsigned NumRegs;
- LiveReg *LiveRegs;
- typedef DenseMap<MachineBasicBlock*, LiveReg*> LiveOutMap;
- LiveOutMap LiveOuts;
-
- /// List of undefined register reads in this block in forward order.
- std::vector<std::pair<MachineInstr*, unsigned> > UndefReads;
-
- /// Storage for register unit liveness.
- LivePhysRegs LiveRegSet;
-
- /// Current instruction number.
- /// The first instruction in each basic block is 0.
- int CurInstr;
-
- /// True when the current block has a predecessor that hasn't been visited
- /// yet.
- bool SeenUnknownBackEdge;
-
-public:
- ExeDepsFix(const TargetRegisterClass *rc)
- : MachineFunctionPass(ID), RC(rc), NumRegs(RC->getNumRegs()) {}
-
- void getAnalysisUsage(AnalysisUsage &AU) const override {
- AU.setPreservesAll();
- MachineFunctionPass::getAnalysisUsage(AU);
- }
-
- bool runOnMachineFunction(MachineFunction &MF) override;
-
- MachineFunctionProperties getRequiredProperties() const override {
- return MachineFunctionProperties().set(
- MachineFunctionProperties::Property::NoVRegs);
- }
-
- StringRef getPassName() const override { return "Execution dependency fix"; }
-
-private:
- iterator_range<SmallVectorImpl<int>::const_iterator>
- regIndices(unsigned Reg) const;
-
- // DomainValue allocation.
- DomainValue *alloc(int domain = -1);
- DomainValue *retain(DomainValue *DV) {
- if (DV) ++DV->Refs;
- return DV;
- }
- void release(DomainValue*);
- DomainValue *resolve(DomainValue*&);
-
- // LiveRegs manipulations.
- void setLiveReg(int rx, DomainValue *DV);
- void kill(int rx);
- void force(int rx, unsigned domain);
- void collapse(DomainValue *dv, unsigned domain);
- bool merge(DomainValue *A, DomainValue *B);
-
- void enterBasicBlock(MachineBasicBlock*);
- void leaveBasicBlock(MachineBasicBlock*);
- void visitInstr(MachineInstr*);
- void processDefs(MachineInstr*, bool Kill);
- void visitSoftInstr(MachineInstr*, unsigned mask);
- void visitHardInstr(MachineInstr*, unsigned domain);
- void pickBestRegisterForUndef(MachineInstr *MI, unsigned OpIdx,
- unsigned Pref);
- bool shouldBreakDependence(MachineInstr*, unsigned OpIdx, unsigned Pref);
- void processUndefReads(MachineBasicBlock*);
-};
-}
-
-char ExeDepsFix::ID = 0;
+#define DEBUG_TYPE "execution-deps-fix"
/// Translate TRI register number to a list of indices into our smaller tables
/// of interesting registers.
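+/// (E.g., in the x86 xmm pass an alias query for %ymm0 yields the index of
+/// the overlapping %xmm0, while an uninteresting gpr yields an empty range;
+/// illustrative.)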
iterator_range<SmallVectorImpl<int>::const_iterator>
-ExeDepsFix::regIndices(unsigned Reg) const {
+ExecutionDepsFix::regIndices(unsigned Reg) const {
assert(Reg < AliasMap.size() && "Invalid register");
const auto &Entry = AliasMap[Reg];
return make_range(Entry.begin(), Entry.end());
}
-DomainValue *ExeDepsFix::alloc(int domain) {
+DomainValue *ExecutionDepsFix::alloc(int domain) {
DomainValue *dv = Avail.empty() ?
new(Allocator.Allocate()) DomainValue :
Avail.pop_back_val();
@@ -234,7 +47,7 @@ DomainValue *ExeDepsFix::alloc(int domain) {
/// Release a reference to DV. When the last reference is released,
/// collapse if needed.
-void ExeDepsFix::release(DomainValue *DV) {
+void ExecutionDepsFix::release(DomainValue *DV) {
while (DV) {
assert(DV->Refs && "Bad DomainValue");
if (--DV->Refs)
@@ -254,7 +67,7 @@ void ExeDepsFix::release(DomainValue *DV) {
/// Follow the chain of dead DomainValues until a live DomainValue is reached.
/// Update the referenced pointer when necessary.
-DomainValue *ExeDepsFix::resolve(DomainValue *&DVRef) {
+DomainValue *ExecutionDepsFix::resolve(DomainValue *&DVRef) {
DomainValue *DV = DVRef;
if (!DV || !DV->Next)
return DV;
@@ -271,7 +84,7 @@ DomainValue *ExeDepsFix::resolve(DomainValue *&DVRef) {
}
/// Set LiveRegs[rx] = dv, updating reference counts.
-void ExeDepsFix::setLiveReg(int rx, DomainValue *dv) {
+void ExecutionDepsFix::setLiveReg(int rx, DomainValue *dv) {
assert(unsigned(rx) < NumRegs && "Invalid index");
assert(LiveRegs && "Must enter basic block first.");
@@ -283,7 +96,7 @@ void ExeDepsFix::setLiveReg(int rx, DomainValue *dv) {
}
// Kill register rx, recycle or collapse any DomainValue.
-void ExeDepsFix::kill(int rx) {
+void ExecutionDepsFix::kill(int rx) {
assert(unsigned(rx) < NumRegs && "Invalid index");
assert(LiveRegs && "Must enter basic block first.");
if (!LiveRegs[rx].Value)
@@ -294,7 +107,7 @@ void ExeDepsFix::kill(int rx) {
}
/// Force register rx into domain.
-void ExeDepsFix::force(int rx, unsigned domain) {
+void ExecutionDepsFix::force(int rx, unsigned domain) {
assert(unsigned(rx) < NumRegs && "Invalid index");
assert(LiveRegs && "Must enter basic block first.");
if (DomainValue *dv = LiveRegs[rx].Value) {
@@ -317,7 +130,7 @@ void ExeDepsFix::force(int rx, unsigned domain) {
/// Collapse open DomainValue into given domain. If there are multiple
/// registers using dv, they each get a unique collapsed DomainValue.
-void ExeDepsFix::collapse(DomainValue *dv, unsigned domain) {
+void ExecutionDepsFix::collapse(DomainValue *dv, unsigned domain) {
assert(dv->hasDomain(domain) && "Cannot collapse");
// Collapse all the instructions.
@@ -333,7 +146,7 @@ void ExeDepsFix::collapse(DomainValue *dv, unsigned domain) {
}
/// All instructions and registers in B are moved to A, and B is released.
-bool ExeDepsFix::merge(DomainValue *A, DomainValue *B) {
+bool ExecutionDepsFix::merge(DomainValue *A, DomainValue *B) {
assert(!A->isCollapsed() && "Cannot merge into collapsed");
assert(!B->isCollapsed() && "Cannot merge from collapsed");
if (A == B)
@@ -359,10 +172,7 @@ bool ExeDepsFix::merge(DomainValue *A, DomainValue *B) {
}
/// Set up LiveRegs by merging predecessor live-out values.
-void ExeDepsFix::enterBasicBlock(MachineBasicBlock *MBB) {
- // Detect back-edges from predecessors we haven't processed yet.
- SeenUnknownBackEdge = false;
-
+void ExecutionDepsFix::enterBasicBlock(MachineBasicBlock *MBB) {
// Reset instruction counter in each basic block.
CurInstr = 0;
@@ -397,18 +207,21 @@ void ExeDepsFix::enterBasicBlock(MachineBasicBlock *MBB) {
// Try to coalesce live-out registers from predecessors.
for (MachineBasicBlock::const_pred_iterator pi = MBB->pred_begin(),
pe = MBB->pred_end(); pi != pe; ++pi) {
- LiveOutMap::const_iterator fi = LiveOuts.find(*pi);
- if (fi == LiveOuts.end()) {
- SeenUnknownBackEdge = true;
+ auto fi = MBBInfos.find(*pi);
+ assert(fi != MBBInfos.end() &&
+ "Should have pre-allocated MBBInfos for all MBBs");
+ LiveReg *Incoming = fi->second.OutRegs;
+ // Incoming is null if this is a backedge from a BB we haven't processed
+ // yet.
+ if (Incoming == nullptr) {
continue;
}
- assert(fi->second && "Can't have NULL entries");
for (unsigned rx = 0; rx != NumRegs; ++rx) {
// Use the most recent predecessor def for each register.
- LiveRegs[rx].Def = std::max(LiveRegs[rx].Def, fi->second[rx].Def);
+ LiveRegs[rx].Def = std::max(LiveRegs[rx].Def, Incoming[rx].Def);
- DomainValue *pdv = resolve(fi->second[rx].Value);
+ DomainValue *pdv = resolve(Incoming[rx].Value);
if (!pdv)
continue;
if (!LiveRegs[rx].Value) {
@@ -432,35 +245,34 @@ void ExeDepsFix::enterBasicBlock(MachineBasicBlock *MBB) {
force(rx, pdv->getFirstDomain());
}
}
- DEBUG(dbgs() << "BB#" << MBB->getNumber()
- << (SeenUnknownBackEdge ? ": incomplete\n" : ": all preds known\n"));
+ DEBUG(
+ dbgs() << "BB#" << MBB->getNumber()
+ << (!isBlockDone(MBB) ? ": incomplete\n" : ": all preds known\n"));
}
-void ExeDepsFix::leaveBasicBlock(MachineBasicBlock *MBB) {
+void ExecutionDepsFix::leaveBasicBlock(MachineBasicBlock *MBB) {
assert(LiveRegs && "Must enter basic block first.");
- // Save live registers at end of MBB - used by enterBasicBlock().
- // Also use LiveOuts as a visited set to detect back-edges.
- bool First = LiveOuts.insert(std::make_pair(MBB, LiveRegs)).second;
-
- if (First) {
- // LiveRegs was inserted in LiveOuts. Adjust all defs to be relative to
- // the end of this block instead of the beginning.
- for (unsigned i = 0, e = NumRegs; i != e; ++i)
- LiveRegs[i].Def -= CurInstr;
- } else {
- // Insertion failed, this must be the second pass.
+ LiveReg *OldOutRegs = MBBInfos[MBB].OutRegs;
+ // Save register clearances at end of MBB - used by enterBasicBlock().
+ MBBInfos[MBB].OutRegs = LiveRegs;
+
+ // While processing the basic block, we kept `Def` relative to the start
+ // of the basic block for convenience. However, future use of this information
+ // only cares about the clearance from the end of the block, so adjust
+ // everything to be relative to the end of the basic block.
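+ // (E.g., a register defined by instruction 3 of a 10-instruction block ends
+ // up with Def = 3 - 10 = -7, i.e., a clearance of 7 from the block end.)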
+ for (unsigned i = 0, e = NumRegs; i != e; ++i)
+ LiveRegs[i].Def -= CurInstr;
+ if (OldOutRegs) {
+ // This must be the second pass.
// Release all the DomainValues instead of keeping them.
for (unsigned i = 0, e = NumRegs; i != e; ++i)
- release(LiveRegs[i].Value);
- delete[] LiveRegs;
+ release(OldOutRegs[i].Value);
+ delete[] OldOutRegs;
}
LiveRegs = nullptr;
}
-void ExeDepsFix::visitInstr(MachineInstr *MI) {
- if (MI->isDebugValue())
- return;
-
+bool ExecutionDepsFix::visitInstr(MachineInstr *MI) {
// Update instructions with explicit execution domains.
std::pair<uint16_t, uint16_t> DomP = TII->getExecutionDomain(*MI);
if (DomP.first) {
@@ -470,16 +282,16 @@ void ExeDepsFix::visitInstr(MachineInstr *MI) {
visitHardInstr(MI, DomP.first);
}
- // Process defs to track register ages, and kill values clobbered by generic
- // instructions.
- processDefs(MI, !DomP.first);
+ return !DomP.first;
}
/// \brief Helps avoid false dependencies on undef registers by updating the
/// machine instructions' undef operand to use a register that the instruction
/// is truly dependent on, or use a register with clearance higher than Pref.
-void ExeDepsFix::pickBestRegisterForUndef(MachineInstr *MI, unsigned OpIdx,
- unsigned Pref) {
+/// Returns true if it was able to find a true dependency, thus not requiring
+/// a dependency breaking instruction regardless of clearance.
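+/// (A classic x86 case, for illustration, is VCVTSI2SS, whose first xmm
+/// source operand is undef: naming a recently written register there would
+/// stall on a false dependency.)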
+bool ExecutionDepsFix::pickBestRegisterForUndef(MachineInstr *MI,
+ unsigned OpIdx, unsigned Pref) {
MachineOperand &MO = MI->getOperand(OpIdx);
assert(MO.isUndef() && "Expected undef machine operand");
@@ -487,7 +299,7 @@ void ExeDepsFix::pickBestRegisterForUndef(MachineInstr *MI, unsigned OpIdx,
// Update only undef operands that are mapped to one register.
if (AliasMap[OriginalReg].size() != 1)
- return;
+ return false;
// Get the undef operand's register class
const TargetRegisterClass *OpRC =
@@ -502,7 +314,7 @@ void ExeDepsFix::pickBestRegisterForUndef(MachineInstr *MI, unsigned OpIdx,
// We found a true dependency - replace the undef register with the true
// dependency.
MO.setReg(CurrMO.getReg());
- return;
+ return true;
}
// Go over all registers in the register class and find the register with
@@ -527,12 +339,14 @@ void ExeDepsFix::pickBestRegisterForUndef(MachineInstr *MI, unsigned OpIdx,
// Update the operand if we found a register with better clearance.
if (MaxClearanceReg != OriginalReg)
MO.setReg(MaxClearanceReg);
+
+ return false;
}
/// \brief Return true if it makes sense to break dependence on a partial def
/// or undef use.
-bool ExeDepsFix::shouldBreakDependence(MachineInstr *MI, unsigned OpIdx,
- unsigned Pref) {
+bool ExecutionDepsFix::shouldBreakDependence(MachineInstr *MI, unsigned OpIdx,
+ unsigned Pref) {
unsigned reg = MI->getOperand(OpIdx).getReg();
for (int rx : regIndices(reg)) {
unsigned Clearance = CurInstr - LiveRegs[rx].Def;
@@ -542,14 +356,7 @@ bool ExeDepsFix::shouldBreakDependence(MachineInstr *MI, unsigned OpIdx,
DEBUG(dbgs() << ": Break dependency.\n");
continue;
}
- // The current clearance seems OK, but we may be ignoring a def from a
- // back-edge.
- if (!SeenUnknownBackEdge || Pref <= unsigned(CurInstr)) {
- DEBUG(dbgs() << ": OK .\n");
- return false;
- }
- // A def from an unprocessed back-edge may make us break this dependency.
- DEBUG(dbgs() << ": Wait for back-edge to resolve.\n");
+ DEBUG(dbgs() << ": OK .\n");
return false;
}
return true;
@@ -559,16 +366,22 @@ bool ExeDepsFix::shouldBreakDependence(MachineInstr *MI, unsigned OpIdx,
// If Kill is set, also kill off DomainValues clobbered by the defs.
//
// Also break dependencies on partial defs and undef uses.
-void ExeDepsFix::processDefs(MachineInstr *MI, bool Kill) {
+void ExecutionDepsFix::processDefs(MachineInstr *MI, bool breakDependency,
+ bool Kill) {
assert(!MI->isDebugValue() && "Won't process debug values");
// Break dependence on undef uses. Do this before updating LiveRegs below.
unsigned OpNum;
- unsigned Pref = TII->getUndefRegClearance(*MI, OpNum, TRI);
- if (Pref) {
- pickBestRegisterForUndef(MI, OpNum, Pref);
- if (shouldBreakDependence(MI, OpNum, Pref))
- UndefReads.push_back(std::make_pair(MI, OpNum));
+ if (breakDependency) {
+ unsigned Pref = TII->getUndefRegClearance(*MI, OpNum, TRI);
+ if (Pref) {
+ bool HadTrueDependency = pickBestRegisterForUndef(MI, OpNum, Pref);
+ // We don't need to bother trying to break a dependency if this
+ // instruction has a true dependency on that register through another
+ // operand - we'll have to wait for it to be available regardless.
+ if (!HadTrueDependency && shouldBreakDependence(MI, OpNum, Pref))
+ UndefReads.push_back(std::make_pair(MI, OpNum));
+ }
}
const MCInstrDesc &MCID = MI->getDesc();
for (unsigned i = 0,
@@ -584,11 +397,13 @@ void ExeDepsFix::processDefs(MachineInstr *MI, bool Kill) {
DEBUG(dbgs() << TRI->getName(RC->getRegister(rx)) << ":\t" << CurInstr
<< '\t' << *MI);
- // Check clearance before partial register updates.
- // Call breakDependence before setting LiveRegs[rx].Def.
- unsigned Pref = TII->getPartialRegUpdateClearance(*MI, i, TRI);
- if (Pref && shouldBreakDependence(MI, i, Pref))
- TII->breakPartialRegDependency(*MI, i, TRI);
+ if (breakDependency) {
+ // Check clearance before partial register updates.
+ // Call breakDependence before setting LiveRegs[rx].Def.
+ unsigned Pref = TII->getPartialRegUpdateClearance(*MI, i, TRI);
+ if (Pref && shouldBreakDependence(MI, i, Pref))
+ TII->breakPartialRegDependency(*MI, i, TRI);
+ }
// How many instructions since rx was last written?
LiveRegs[rx].Def = CurInstr;
@@ -607,7 +422,7 @@ void ExeDepsFix::processDefs(MachineInstr *MI, bool Kill) {
/// only do it on demand. Note that the occurrence of undefined register reads
/// that should be broken is very rare, but when they occur we may have many in
/// a single block.
-void ExeDepsFix::processUndefReads(MachineBasicBlock *MBB) {
+void ExecutionDepsFix::processUndefReads(MachineBasicBlock *MBB) {
if (UndefReads.empty())
return;
@@ -640,7 +455,7 @@ void ExeDepsFix::processUndefReads(MachineBasicBlock *MBB) {
// A hard instruction only works in one domain. All input registers will be
// forced into that domain.
-void ExeDepsFix::visitHardInstr(MachineInstr *mi, unsigned domain) {
+void ExecutionDepsFix::visitHardInstr(MachineInstr *mi, unsigned domain) {
// Collapse all uses.
for (unsigned i = mi->getDesc().getNumDefs(),
e = mi->getDesc().getNumOperands(); i != e; ++i) {
@@ -663,7 +478,7 @@ void ExeDepsFix::visitHardInstr(MachineInstr *mi, unsigned domain) {
}
// A soft instruction can be changed to work in other domains given by mask.
-void ExeDepsFix::visitSoftInstr(MachineInstr *mi, unsigned mask) {
+void ExecutionDepsFix::visitSoftInstr(MachineInstr *mi, unsigned mask) {
// Bitmask of available domains for this instruction after taking collapsed
// operands into account.
unsigned available = mask;
@@ -774,7 +589,34 @@ void ExeDepsFix::visitSoftInstr(MachineInstr *mi, unsigned mask) {
}
}
-bool ExeDepsFix::runOnMachineFunction(MachineFunction &mf) {
+void ExecutionDepsFix::processBasicBlock(MachineBasicBlock *MBB,
+ bool PrimaryPass) {
+ enterBasicBlock(MBB);
+ // If this block is not done, it makes little sense to make any decisions
+ // based on clearance information. We need to make a second pass anyway,
+ // and by then we'll have better information, so we can avoid doing the work
+ // to try and break dependencies now.
+ bool breakDependency = isBlockDone(MBB);
+ for (MachineInstr &MI : *MBB) {
+ if (!MI.isDebugValue()) {
+ bool Kill = false;
+ if (PrimaryPass)
+ Kill = visitInstr(&MI);
+ processDefs(&MI, breakDependency, Kill);
+ }
+ }
+ if (breakDependency)
+ processUndefReads(MBB);
+ leaveBasicBlock(MBB);
+}
+
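+// A block is "done" once its primary pass has run, every predecessor has been
+// processed, and as many predecessors have completed as had been processed
+// when the primary pass ran - i.e., the clearance information flowing into
+// this block is final.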
+bool ExecutionDepsFix::isBlockDone(MachineBasicBlock *MBB) {
+ return MBBInfos[MBB].PrimaryCompleted &&
+ MBBInfos[MBB].IncomingCompleted == MBBInfos[MBB].PrimaryIncoming &&
+ MBBInfos[MBB].IncomingProcessed == MBB->pred_size();
+}
+
+bool ExecutionDepsFix::runOnMachineFunction(MachineFunction &mf) {
if (skipFunction(*mf.getFunction()))
return false;
MF = &mf;
@@ -810,52 +652,104 @@ bool ExeDepsFix::runOnMachineFunction(MachineFunction &mf) {
AliasMap[*AI].push_back(i);
}
+ // Initialize the MBBInfos.
+ for (auto &MBB : mf) {
+ MBBInfo InitialInfo;
+ MBBInfos.insert(std::make_pair(&MBB, InitialInfo));
+ }
+
+ /*
+ * We want to visit every instruction in every basic block in order to update
+ * it's execution domain or break any false dependencies. However, for the
+ * dependency breaking, we need to know clearances from all predecessors
+ * (including any backedges). One way to do so would be to do two complete
+ * passes over all basic blocks/instructions, the first for recording
+ * clearances, the second to break the dependencies. However, for functions
+ * without backedges, or functions with a lot of straight-line code, and
+ * a small loop, that would be a lot of unnecessary work (since only the
+ * BBs that are part of the loop require two passes). As an example,
+ * consider the following loop.
+ *
+ *
+ * PH -> A -> B (xmm<Undef> -> xmm<Def>) -> C -> D -> EXIT
+ * ^ |
+ * +----------------------------------+
+ *
+ * The iteration order is as follows:
+ * Naive: PH A B C D A' B' C' D'
+ * Optimized: PH A B C A' B' C' D
+ *
+ * Note that we avoid processing D twice, because we can entirely process
+ * the predecessors before getting to D. We call a block that is ready
+ * for its second round of processing `done` (isBlockDone). Once we finish
+ * processing some block, we update the counters in MBBInfos and re-process
+ * any successors that are now done.
+ */
+
MachineBasicBlock *Entry = &*MF->begin();
ReversePostOrderTraversal<MachineBasicBlock*> RPOT(Entry);
- SmallVector<MachineBasicBlock*, 16> Loops;
+ SmallVector<MachineBasicBlock *, 4> Workqueue;
for (ReversePostOrderTraversal<MachineBasicBlock*>::rpo_iterator
MBBI = RPOT.begin(), MBBE = RPOT.end(); MBBI != MBBE; ++MBBI) {
MachineBasicBlock *MBB = *MBBI;
- enterBasicBlock(MBB);
- if (SeenUnknownBackEdge)
- Loops.push_back(MBB);
- for (MachineInstr &MI : *MBB)
- visitInstr(&MI);
- processUndefReads(MBB);
- leaveBasicBlock(MBB);
+ // N.B: IncomingProcessed and IncomingCompleted were already updated while
+ // processing this block's predecessors.
+ MBBInfos[MBB].PrimaryCompleted = true;
+ MBBInfos[MBB].PrimaryIncoming = MBBInfos[MBB].IncomingProcessed;
+ bool Primary = true;
+ Workqueue.push_back(MBB);
+ while (!Workqueue.empty()) {
+ MachineBasicBlock *ActiveMBB = &*Workqueue.back();
+ Workqueue.pop_back();
+ processBasicBlock(ActiveMBB, Primary);
+ bool Done = isBlockDone(ActiveMBB);
+ for (auto *Succ : ActiveMBB->successors()) {
+ if (!isBlockDone(Succ)) {
+ if (Primary) {
+ MBBInfos[Succ].IncomingProcessed++;
+ }
+ if (Done) {
+ MBBInfos[Succ].IncomingCompleted++;
+ }
+ if (isBlockDone(Succ)) {
+ Workqueue.push_back(Succ);
+ }
+ }
+ }
+ Primary = false;
+ }
}
- // Visit all the loop blocks again in order to merge DomainValues from
- // back-edges.
- for (MachineBasicBlock *MBB : Loops) {
- enterBasicBlock(MBB);
- for (MachineInstr &MI : *MBB)
- if (!MI.isDebugValue())
- processDefs(&MI, false);
- processUndefReads(MBB);
- leaveBasicBlock(MBB);
+ // We need to go through again and finalize any blocks that are not done yet.
+ // This is possible if blocks have dead predecessors, so we didn't visit them
+ // above.
+ for (ReversePostOrderTraversal<MachineBasicBlock *>::rpo_iterator
+ MBBI = RPOT.begin(),
+ MBBE = RPOT.end();
+ MBBI != MBBE; ++MBBI) {
+ MachineBasicBlock *MBB = *MBBI;
+ if (!isBlockDone(MBB)) {
+ processBasicBlock(MBB, false);
+ // Don't update successors here. We'll get to them anyway through this
+ // loop.
+ }
}
// Clear the LiveOuts vectors and collapse any remaining DomainValues.
for (ReversePostOrderTraversal<MachineBasicBlock*>::rpo_iterator
MBBI = RPOT.begin(), MBBE = RPOT.end(); MBBI != MBBE; ++MBBI) {
- LiveOutMap::const_iterator FI = LiveOuts.find(*MBBI);
- if (FI == LiveOuts.end() || !FI->second)
+ auto FI = MBBInfos.find(*MBBI);
+ if (FI == MBBInfos.end() || !FI->second.OutRegs)
continue;
for (unsigned i = 0, e = NumRegs; i != e; ++i)
- if (FI->second[i].Value)
- release(FI->second[i].Value);
- delete[] FI->second;
+ if (FI->second.OutRegs[i].Value)
+ release(FI->second.OutRegs[i].Value);
+ delete[] FI->second.OutRegs;
}
- LiveOuts.clear();
+ MBBInfos.clear();
UndefReads.clear();
Avail.clear();
Allocator.DestroyAll();
return false;
}
-
-FunctionPass *
-llvm::createExecutionDependencyFixPass(const TargetRegisterClass *RC) {
- return new ExeDepsFix(RC);
-}
diff --git a/lib/CodeGen/FEntryInserter.cpp b/lib/CodeGen/FEntryInserter.cpp
new file mode 100644
index 000000000000..0759bf6713e0
--- /dev/null
+++ b/lib/CodeGen/FEntryInserter.cpp
@@ -0,0 +1,55 @@
+//===-- FEntryInserter.cpp - Insert fentry calls -------------------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file edits function bodies to insert fentry calls.
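+//
+// E.g., on x86 a function carrying the "fentry-call"="true" attribute will
+// begin with a call to __fentry__ (illustrative):
+// \code
+//   foo:
+//     call __fentry__
+// \endcode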
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/Passes.h"
+#include "llvm/IR/Function.h"
+#include "llvm/IR/Module.h"
+#include "llvm/Target/TargetFrameLowering.h"
+#include "llvm/Target/TargetInstrInfo.h"
+#include "llvm/Target/TargetSubtargetInfo.h"
+
+using namespace llvm;
+
+namespace {
+struct FEntryInserter : public MachineFunctionPass {
+ static char ID; // Pass identification, replacement for typeid
+ FEntryInserter() : MachineFunctionPass(ID) {
+ initializeFEntryInserterPass(*PassRegistry::getPassRegistry());
+ }
+
+ bool runOnMachineFunction(MachineFunction &F) override;
+};
+}
+
+bool FEntryInserter::runOnMachineFunction(MachineFunction &MF) {
+ const std::string FEntryName =
+ MF.getFunction()->getFnAttribute("fentry-call").getValueAsString();
+ if (FEntryName != "true")
+ return false;
+
+ auto &FirstMBB = *MF.begin();
+ auto &FirstMI = *FirstMBB.begin();
+
+ auto *TII = MF.getSubtarget().getInstrInfo();
+ BuildMI(FirstMBB, FirstMI, FirstMI.getDebugLoc(),
+ TII->get(TargetOpcode::FENTRY_CALL));
+ return true;
+}
+
+char FEntryInserter::ID = 0;
+char &llvm::FEntryInserterID = FEntryInserter::ID;
+INITIALIZE_PASS(FEntryInserter, "fentry-insert", "Insert fentry calls", false,
+ false)
diff --git a/lib/CodeGen/FaultMaps.cpp b/lib/CodeGen/FaultMaps.cpp
index 2acafafdb9fc..43f364128978 100644
--- a/lib/CodeGen/FaultMaps.cpp
+++ b/lib/CodeGen/FaultMaps.cpp
@@ -1,4 +1,4 @@
-//===---------------------------- FaultMaps.cpp ---------------------------===//
+//===- FaultMaps.cpp ------------------------------------------------------===//
//
// The LLVM Compiler Infrastructure
//
@@ -7,14 +7,17 @@
//
//===----------------------------------------------------------------------===//
-#include "llvm/CodeGen/FaultMaps.h"
-
+#include "llvm/ADT/Twine.h"
#include "llvm/CodeGen/AsmPrinter.h"
+#include "llvm/CodeGen/FaultMaps.h"
#include "llvm/MC/MCContext.h"
#include "llvm/MC/MCExpr.h"
#include "llvm/MC/MCObjectFileInfo.h"
#include "llvm/MC/MCStreamer.h"
#include "llvm/Support/Debug.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/Format.h"
+#include "llvm/Support/raw_ostream.h"
using namespace llvm;
@@ -102,14 +105,16 @@ void FaultMaps::emitFunctionInfo(const MCSymbol *FnLabel,
}
}
-
const char *FaultMaps::faultTypeToString(FaultMaps::FaultKind FT) {
switch (FT) {
default:
llvm_unreachable("unhandled fault type!");
-
case FaultMaps::FaultingLoad:
return "FaultingLoad";
+ case FaultMaps::FaultingLoadStore:
+ return "FaultingLoadStore";
+ case FaultMaps::FaultingStore:
+ return "FaultingStore";
}
}
diff --git a/lib/CodeGen/GCStrategy.cpp b/lib/CodeGen/GCStrategy.cpp
index 31ab86fdf276..6be4c16c6301 100644
--- a/lib/CodeGen/GCStrategy.cpp
+++ b/lib/CodeGen/GCStrategy.cpp
@@ -1,4 +1,4 @@
-//===-- GCStrategy.cpp - Garbage Collector Description --------------------===//
+//===- GCStrategy.cpp - Garbage Collector Description ---------------------===//
//
// The LLVM Compiler Infrastructure
//
@@ -18,7 +18,4 @@ using namespace llvm;
LLVM_INSTANTIATE_REGISTRY(GCRegistry)
-GCStrategy::GCStrategy()
- : UseStatepoints(false), NeededSafePoints(0), CustomReadBarriers(false),
- CustomWriteBarriers(false), CustomRoots(false), InitRoots(true),
- UsesMetadata(false) {}
+GCStrategy::GCStrategy() = default;
diff --git a/lib/CodeGen/GlobalISel/CMakeLists.txt b/lib/CodeGen/GlobalISel/CMakeLists.txt
index 76ab5d36047e..03a8c4f5f909 100644
--- a/lib/CodeGen/GlobalISel/CMakeLists.txt
+++ b/lib/CodeGen/GlobalISel/CMakeLists.txt
@@ -22,7 +22,6 @@ else()
set(LLVM_OPTIONAL_SOURCES LLVMGlobalISel ${GLOBAL_ISEL_FILES})
endif()
-
# In LLVMBuild.txt files, it is not possible to mark a dependency to a
# library as optional. So instead, generate an empty library if we did
# not ask for it.
diff --git a/lib/CodeGen/GlobalISel/CallLowering.cpp b/lib/CodeGen/GlobalISel/CallLowering.cpp
index 13212212fa01..035a2ac78ed9 100644
--- a/lib/CodeGen/GlobalISel/CallLowering.cpp
+++ b/lib/CodeGen/GlobalISel/CallLowering.cpp
@@ -24,40 +24,42 @@
using namespace llvm;
bool CallLowering::lowerCall(
- MachineIRBuilder &MIRBuilder, const CallInst &CI, unsigned ResReg,
+ MachineIRBuilder &MIRBuilder, ImmutableCallSite CS, unsigned ResReg,
ArrayRef<unsigned> ArgRegs, std::function<unsigned()> GetCalleeReg) const {
- auto &DL = CI.getParent()->getParent()->getParent()->getDataLayout();
+ auto &DL = CS.getParent()->getParent()->getParent()->getDataLayout();
// First step is to marshall all the function's parameters into the correct
// physregs and memory locations. Gather the sequence of argument types that
// we'll pass to the assigner function.
SmallVector<ArgInfo, 8> OrigArgs;
unsigned i = 0;
- for (auto &Arg : CI.arg_operands()) {
- ArgInfo OrigArg{ArgRegs[i], Arg->getType(), ISD::ArgFlagsTy{}};
- setArgFlags(OrigArg, i + 1, DL, CI);
+ unsigned NumFixedArgs = CS.getFunctionType()->getNumParams();
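+  // For a variadic callee (e.g., printf), only the leading declared
+  // parameters are fixed; anything at or beyond NumFixedArgs is passed as a
+  // vararg.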
+ for (auto &Arg : CS.args()) {
+ ArgInfo OrigArg{ArgRegs[i], Arg->getType(), ISD::ArgFlagsTy{},
+ i < NumFixedArgs};
+ setArgFlags(OrigArg, i + 1, DL, CS);
OrigArgs.push_back(OrigArg);
++i;
}
MachineOperand Callee = MachineOperand::CreateImm(0);
- if (Function *F = CI.getCalledFunction())
+ if (const Function *F = CS.getCalledFunction())
Callee = MachineOperand::CreateGA(F, 0);
else
Callee = MachineOperand::CreateReg(GetCalleeReg(), false);
- ArgInfo OrigRet{ResReg, CI.getType(), ISD::ArgFlagsTy{}};
+ ArgInfo OrigRet{ResReg, CS.getType(), ISD::ArgFlagsTy{}};
if (!OrigRet.Ty->isVoidTy())
- setArgFlags(OrigRet, AttributeSet::ReturnIndex, DL, CI);
+ setArgFlags(OrigRet, AttributeList::ReturnIndex, DL, CS);
- return lowerCall(MIRBuilder, Callee, OrigRet, OrigArgs);
+ return lowerCall(MIRBuilder, CS.getCallingConv(), Callee, OrigRet, OrigArgs);
}
template <typename FuncInfoTy>
void CallLowering::setArgFlags(CallLowering::ArgInfo &Arg, unsigned OpIdx,
const DataLayout &DL,
const FuncInfoTy &FuncInfo) const {
- const AttributeSet &Attrs = FuncInfo.getAttributes();
+ const AttributeList &Attrs = FuncInfo.getAttributes();
if (Attrs.hasAttribute(OpIdx, Attribute::ZExt))
Arg.Flags.setZExt();
if (Attrs.hasAttribute(OpIdx, Attribute::SExt))
@@ -103,7 +105,6 @@ CallLowering::setArgFlags<CallInst>(CallLowering::ArgInfo &Arg, unsigned OpIdx,
const CallInst &FuncInfo) const;
bool CallLowering::handleAssignments(MachineIRBuilder &MIRBuilder,
- CCAssignFn *AssignFn,
ArrayRef<ArgInfo> Args,
ValueHandler &Handler) const {
MachineFunction &MF = MIRBuilder.getMF();
@@ -116,12 +117,20 @@ bool CallLowering::handleAssignments(MachineIRBuilder &MIRBuilder,
unsigned NumArgs = Args.size();
for (unsigned i = 0; i != NumArgs; ++i) {
MVT CurVT = MVT::getVT(Args[i].Ty);
- if (AssignFn(i, CurVT, CurVT, CCValAssign::Full, Args[i].Flags, CCInfo))
+ if (Handler.assignArg(i, CurVT, CurVT, CCValAssign::Full, Args[i], CCInfo))
return false;
}
- for (unsigned i = 0, e = Args.size(); i != e; ++i) {
- CCValAssign &VA = ArgLocs[i];
+ for (unsigned i = 0, e = Args.size(), j = 0; i != e; ++i, ++j) {
+ assert(j < ArgLocs.size() && "Skipped too many arg locs");
+
+ CCValAssign &VA = ArgLocs[j];
+ assert(VA.getValNo() == i && "Location doesn't correspond to current arg");
+
+ if (VA.needsCustom()) {
+ j += Handler.assignCustomValue(Args[i], makeArrayRef(ArgLocs).slice(j));
+ continue;
+ }
if (VA.isRegLoc())
Handler.assignValueToReg(Args[i].Reg, VA.getLocReg(), VA);
diff --git a/lib/CodeGen/GlobalISel/IRTranslator.cpp b/lib/CodeGen/GlobalISel/IRTranslator.cpp
index 89a042ffc477..766187378446 100644
--- a/lib/CodeGen/GlobalISel/IRTranslator.cpp
+++ b/lib/CodeGen/GlobalISel/IRTranslator.cpp
@@ -12,7 +12,10 @@
#include "llvm/CodeGen/GlobalISel/IRTranslator.h"
+#include "llvm/ADT/ScopeExit.h"
+#include "llvm/ADT/SmallSet.h"
#include "llvm/ADT/SmallVector.h"
+#include "llvm/Analysis/OptimizationDiagnosticInfo.h"
#include "llvm/CodeGen/GlobalISel/CallLowering.h"
#include "llvm/CodeGen/Analysis.h"
#include "llvm/CodeGen/MachineFunction.h"
@@ -21,11 +24,13 @@
#include "llvm/CodeGen/MachineRegisterInfo.h"
#include "llvm/CodeGen/TargetPassConfig.h"
#include "llvm/IR/Constant.h"
+#include "llvm/IR/DebugInfo.h"
#include "llvm/IR/Function.h"
#include "llvm/IR/GetElementPtrTypeIterator.h"
#include "llvm/IR/IntrinsicInst.h"
#include "llvm/IR/Type.h"
#include "llvm/IR/Value.h"
+#include "llvm/Target/TargetFrameLowering.h"
#include "llvm/Target/TargetIntrinsicInfo.h"
#include "llvm/Target/TargetLowering.h"
@@ -40,11 +45,21 @@ INITIALIZE_PASS_DEPENDENCY(TargetPassConfig)
INITIALIZE_PASS_END(IRTranslator, DEBUG_TYPE, "IRTranslator LLVM IR -> MI",
false, false)
-static void reportTranslationError(const Value &V, const Twine &Message) {
- std::string ErrStorage;
- raw_string_ostream Err(ErrStorage);
- Err << Message << ": " << V << '\n';
- report_fatal_error(Err.str());
+static void reportTranslationError(MachineFunction &MF,
+ const TargetPassConfig &TPC,
+ OptimizationRemarkEmitter &ORE,
+ OptimizationRemarkMissed &R) {
+ MF.getProperties().set(MachineFunctionProperties::Property::FailedISel);
+
+ // Print the function name explicitly if we don't have a debug location (which
+ // makes the diagnostic less useful) or if we're going to emit a raw error.
+ if (!R.getLocation().isValid() || TPC.isGlobalISelAbortEnabled())
+ R << (" (in function: " + MF.getName() + ")").str();
+
+ if (TPC.isGlobalISelAbortEnabled())
+ report_fatal_error(R.getMsg());
+ else
+ ORE.emit(R);
}
IRTranslator::IRTranslator() : MachineFunctionPass(ID), MRI(nullptr) {
@@ -59,28 +74,31 @@ void IRTranslator::getAnalysisUsage(AnalysisUsage &AU) const {
unsigned IRTranslator::getOrCreateVReg(const Value &Val) {
unsigned &ValReg = ValToVReg[&Val];
- // Check if this is the first time we see Val.
- if (!ValReg) {
- // Fill ValRegsSequence with the sequence of registers
- // we need to concat together to produce the value.
- assert(Val.getType()->isSized() &&
- "Don't know how to create an empty vreg");
- unsigned VReg = MRI->createGenericVirtualRegister(LLT{*Val.getType(), *DL});
- ValReg = VReg;
-
- if (auto CV = dyn_cast<Constant>(&Val)) {
- bool Success = translate(*CV, VReg);
- if (!Success) {
- if (!TPC->isGlobalISelAbortEnabled()) {
- MF->getProperties().set(
- MachineFunctionProperties::Property::FailedISel);
- return VReg;
- }
- reportTranslationError(Val, "unable to translate constant");
- }
+
+ if (ValReg)
+ return ValReg;
+
+ // Fill ValRegsSequence with the sequence of registers
+ // we need to concat together to produce the value.
+ assert(Val.getType()->isSized() &&
+ "Don't know how to create an empty vreg");
+ unsigned VReg =
+ MRI->createGenericVirtualRegister(getLLTForType(*Val.getType(), *DL));
+ ValReg = VReg;
+
+ if (auto CV = dyn_cast<Constant>(&Val)) {
+ bool Success = translate(*CV, VReg);
+ if (!Success) {
+ OptimizationRemarkMissed R("gisel-irtranslator", "GISelFailure",
+ MF->getFunction()->getSubprogram(),
+ &MF->getFunction()->getEntryBlock());
+ R << "unable to translate constant: " << ore::NV("Type", Val.getType());
+ reportTranslationError(*MF, *TPC, *ORE, R);
+ return VReg;
}
}
- return ValReg;
+
+ return VReg;
}
int IRTranslator::getOrCreateFrameIndex(const AllocaInst &AI) {
@@ -112,28 +130,27 @@ unsigned IRTranslator::getMemOpAlignment(const Instruction &I) {
} else if (const LoadInst *LI = dyn_cast<LoadInst>(&I)) {
Alignment = LI->getAlignment();
ValTy = LI->getType();
- } else if (!TPC->isGlobalISelAbortEnabled()) {
- MF->getProperties().set(
- MachineFunctionProperties::Property::FailedISel);
+ } else {
+ OptimizationRemarkMissed R("gisel-irtranslator", "", &I);
+ R << "unable to translate memop: " << ore::NV("Opcode", &I);
+ reportTranslationError(*MF, *TPC, *ORE, R);
return 1;
- } else
- llvm_unreachable("unhandled memory instruction");
+ }
return Alignment ? Alignment : DL->getABITypeAlignment(ValTy);
}
-MachineBasicBlock &IRTranslator::getOrCreateBB(const BasicBlock &BB) {
+MachineBasicBlock &IRTranslator::getMBB(const BasicBlock &BB) {
MachineBasicBlock *&MBB = BBToMBB[&BB];
- if (!MBB) {
- MBB = MF->CreateMachineBasicBlock(&BB);
- MF->push_back(MBB);
-
- if (BB.hasAddressTaken())
- MBB->setHasAddressTaken();
- }
+ assert(MBB && "BasicBlock was not encountered before");
return *MBB;
}
+void IRTranslator::addMachineCFGPred(CFGEdge Edge, MachineBasicBlock *NewPred) {
+ assert(NewPred && "new predecessor must be a real MachineBasicBlock");
+ MachinePreds[Edge].push_back(NewPred);
+}
+
bool IRTranslator::translateBinaryOp(unsigned Opcode, const User &U,
MachineIRBuilder &MIRBuilder) {
// FIXME: handle signed/unsigned wrapping flags.
@@ -149,6 +166,18 @@ bool IRTranslator::translateBinaryOp(unsigned Opcode, const User &U,
return true;
}
+bool IRTranslator::translateFSub(const User &U, MachineIRBuilder &MIRBuilder) {
+ // -0.0 - X --> G_FNEG
+ if (isa<Constant>(U.getOperand(0)) &&
+ U.getOperand(0) == ConstantFP::getZeroValueForNegation(U.getType())) {
+ MIRBuilder.buildInstr(TargetOpcode::G_FNEG)
+ .addDef(getOrCreateVReg(U))
+ .addUse(getOrCreateVReg(*U.getOperand(1)));
+ return true;
+ }
+ return translateBinaryOp(TargetOpcode::G_FSUB, U, MIRBuilder);
+}
+
bool IRTranslator::translateCompare(const User &U,
MachineIRBuilder &MIRBuilder) {
const CmpInst *CI = dyn_cast<CmpInst>(&U);
@@ -158,9 +187,14 @@ bool IRTranslator::translateCompare(const User &U,
CmpInst::Predicate Pred =
CI ? CI->getPredicate() : static_cast<CmpInst::Predicate>(
cast<ConstantExpr>(U).getPredicate());
-
if (CmpInst::isIntPredicate(Pred))
MIRBuilder.buildICmp(Pred, Res, Op0, Op1);
+ else if (Pred == CmpInst::FCMP_FALSE)
+ MIRBuilder.buildCopy(
+ Res, getOrCreateVReg(*Constant::getNullValue(CI->getType())));
+ else if (Pred == CmpInst::FCMP_TRUE)
+ MIRBuilder.buildCopy(
+ Res, getOrCreateVReg(*Constant::getAllOnesValue(CI->getType())));
else
MIRBuilder.buildFCmp(Pred, Res, Op0, Op1);
@@ -183,18 +217,21 @@ bool IRTranslator::translateBr(const User &U, MachineIRBuilder &MIRBuilder) {
// We want a G_BRCOND to the true BB followed by an unconditional branch.
unsigned Tst = getOrCreateVReg(*BrInst.getCondition());
const BasicBlock &TrueTgt = *cast<BasicBlock>(BrInst.getSuccessor(Succ++));
- MachineBasicBlock &TrueBB = getOrCreateBB(TrueTgt);
+ MachineBasicBlock &TrueBB = getMBB(TrueTgt);
MIRBuilder.buildBrCond(Tst, TrueBB);
}
const BasicBlock &BrTgt = *cast<BasicBlock>(BrInst.getSuccessor(Succ));
- MachineBasicBlock &TgtBB = getOrCreateBB(BrTgt);
- MIRBuilder.buildBr(TgtBB);
+ MachineBasicBlock &TgtBB = getMBB(BrTgt);
+ MachineBasicBlock &CurBB = MIRBuilder.getMBB();
+
+ // If the unconditional target is the layout successor, fallthrough.
+ if (!CurBB.isLayoutSuccessor(&TgtBB))
+ MIRBuilder.buildBr(TgtBB);
// Link successors.
- MachineBasicBlock &CurBB = MIRBuilder.getMBB();
for (const BasicBlock *Succ : BrInst.successors())
- CurBB.addSuccessor(&getOrCreateBB(*Succ));
+ CurBB.addSuccessor(&getMBB(*Succ));
return true;
}
@@ -209,30 +246,52 @@ bool IRTranslator::translateSwitch(const User &U,
const SwitchInst &SwInst = cast<SwitchInst>(U);
const unsigned SwCondValue = getOrCreateVReg(*SwInst.getCondition());
+ const BasicBlock *OrigBB = SwInst.getParent();
- LLT LLTi1 = LLT(*Type::getInt1Ty(U.getContext()), *DL);
+ LLT LLTi1 = getLLTForType(*Type::getInt1Ty(U.getContext()), *DL);
for (auto &CaseIt : SwInst.cases()) {
const unsigned CaseValueReg = getOrCreateVReg(*CaseIt.getCaseValue());
const unsigned Tst = MRI->createGenericVirtualRegister(LLTi1);
MIRBuilder.buildICmp(CmpInst::ICMP_EQ, Tst, CaseValueReg, SwCondValue);
- MachineBasicBlock &CurBB = MIRBuilder.getMBB();
- MachineBasicBlock &TrueBB = getOrCreateBB(*CaseIt.getCaseSuccessor());
+ MachineBasicBlock &CurMBB = MIRBuilder.getMBB();
+ const BasicBlock *TrueBB = CaseIt.getCaseSuccessor();
+ MachineBasicBlock &TrueMBB = getMBB(*TrueBB);
- MIRBuilder.buildBrCond(Tst, TrueBB);
- CurBB.addSuccessor(&TrueBB);
+ MIRBuilder.buildBrCond(Tst, TrueMBB);
+ CurMBB.addSuccessor(&TrueMBB);
+ addMachineCFGPred({OrigBB, TrueBB}, &CurMBB);
- MachineBasicBlock *FalseBB =
+ MachineBasicBlock *FalseMBB =
MF->CreateMachineBasicBlock(SwInst.getParent());
- MF->push_back(FalseBB);
- MIRBuilder.buildBr(*FalseBB);
- CurBB.addSuccessor(FalseBB);
+ // Insert the comparison blocks one after the other.
+ MF->insert(std::next(CurMBB.getIterator()), FalseMBB);
+ MIRBuilder.buildBr(*FalseMBB);
+ CurMBB.addSuccessor(FalseMBB);
- MIRBuilder.setMBB(*FalseBB);
+ MIRBuilder.setMBB(*FalseMBB);
}
// handle default case
- MachineBasicBlock &DefaultBB = getOrCreateBB(*SwInst.getDefaultDest());
- MIRBuilder.buildBr(DefaultBB);
- MIRBuilder.getMBB().addSuccessor(&DefaultBB);
+ const BasicBlock *DefaultBB = SwInst.getDefaultDest();
+ MachineBasicBlock &DefaultMBB = getMBB(*DefaultBB);
+ MIRBuilder.buildBr(DefaultMBB);
+ MachineBasicBlock &CurMBB = MIRBuilder.getMBB();
+ CurMBB.addSuccessor(&DefaultMBB);
+ addMachineCFGPred({OrigBB, DefaultBB}, &CurMBB);
+
+ return true;
+}
+
+bool IRTranslator::translateIndirectBr(const User &U,
+ MachineIRBuilder &MIRBuilder) {
+ const IndirectBrInst &BrInst = cast<IndirectBrInst>(U);
+
+ const unsigned Tgt = getOrCreateVReg(*BrInst.getAddress());
+ MIRBuilder.buildBrIndirect(Tgt);
+
+ // Link successors.
+ MachineBasicBlock &CurBB = MIRBuilder.getMBB();
+ for (const BasicBlock *Succ : BrInst.successors())
+ CurBB.addSuccessor(&getMBB(*Succ));
return true;
}
@@ -240,47 +299,38 @@ bool IRTranslator::translateSwitch(const User &U,
bool IRTranslator::translateLoad(const User &U, MachineIRBuilder &MIRBuilder) {
const LoadInst &LI = cast<LoadInst>(U);
- if (!TPC->isGlobalISelAbortEnabled() && LI.isAtomic())
- return false;
-
- assert(!LI.isAtomic() && "only non-atomic loads are supported at the moment");
auto Flags = LI.isVolatile() ? MachineMemOperand::MOVolatile
: MachineMemOperand::MONone;
Flags |= MachineMemOperand::MOLoad;
unsigned Res = getOrCreateVReg(LI);
unsigned Addr = getOrCreateVReg(*LI.getPointerOperand());
- LLT VTy{*LI.getType(), *DL}, PTy{*LI.getPointerOperand()->getType(), *DL};
+
MIRBuilder.buildLoad(
Res, Addr,
*MF->getMachineMemOperand(MachinePointerInfo(LI.getPointerOperand()),
Flags, DL->getTypeStoreSize(LI.getType()),
- getMemOpAlignment(LI)));
+ getMemOpAlignment(LI), AAMDNodes(), nullptr,
+ LI.getSynchScope(), LI.getOrdering()));
return true;
}
bool IRTranslator::translateStore(const User &U, MachineIRBuilder &MIRBuilder) {
const StoreInst &SI = cast<StoreInst>(U);
-
- if (!TPC->isGlobalISelAbortEnabled() && SI.isAtomic())
- return false;
-
- assert(!SI.isAtomic() && "only non-atomic stores supported at the moment");
auto Flags = SI.isVolatile() ? MachineMemOperand::MOVolatile
: MachineMemOperand::MONone;
Flags |= MachineMemOperand::MOStore;
unsigned Val = getOrCreateVReg(*SI.getValueOperand());
unsigned Addr = getOrCreateVReg(*SI.getPointerOperand());
- LLT VTy{*SI.getValueOperand()->getType(), *DL},
- PTy{*SI.getPointerOperand()->getType(), *DL};
MIRBuilder.buildStore(
Val, Addr,
*MF->getMachineMemOperand(
MachinePointerInfo(SI.getPointerOperand()), Flags,
DL->getTypeStoreSize(SI.getValueOperand()->getType()),
- getMemOpAlignment(SI)));
+ getMemOpAlignment(SI), AAMDNodes(), nullptr, SI.getSynchScope(),
+ SI.getOrdering()));
return true;
}
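// With the ordering and synch scope threaded into the MachineMemOperand,
// atomic loads and stores no longer bail out. Roughly, the IR
// `store atomic i32 %v, i32* %p seq_cst, align 4` now yields
//
//   G_STORE %v(s32), %p(p0) :: (store seq_cst 4 into %ir.p)
//
// (a sketch of the printed MIR); the atomicity lives entirely in the memory
// operand, not in the opcode.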
@@ -305,7 +355,7 @@ bool IRTranslator::translateExtractValue(const User &U,
uint64_t Offset = 8 * DL->getIndexedOffsetInType(Src->getType(), Indices);
unsigned Res = getOrCreateVReg(U);
- MIRBuilder.buildExtract(Res, Offset, getOrCreateVReg(*Src));
+ MIRBuilder.buildExtract(Res, getOrCreateVReg(*Src), Offset);
return true;
}
@@ -348,12 +398,18 @@ bool IRTranslator::translateSelect(const User &U,
bool IRTranslator::translateBitCast(const User &U,
MachineIRBuilder &MIRBuilder) {
- if (LLT{*U.getOperand(0)->getType(), *DL} == LLT{*U.getType(), *DL}) {
+ // If we're bitcasting to the source type, we can reuse the source vreg.
+ if (getLLTForType(*U.getOperand(0)->getType(), *DL) ==
+ getLLTForType(*U.getType(), *DL)) {
+ // Get the source vreg now, to avoid invalidating ValToVReg.
+ unsigned SrcReg = getOrCreateVReg(*U.getOperand(0));
unsigned &Reg = ValToVReg[&U];
+ // If we already assigned a vreg for this bitcast, we can't change that.
+ // Emit a copy to satisfy the users we already emitted.
if (Reg)
- MIRBuilder.buildCopy(Reg, getOrCreateVReg(*U.getOperand(0)));
+ MIRBuilder.buildCopy(Reg, SrcReg);
else
- Reg = getOrCreateVReg(*U.getOperand(0));
+ Reg = SrcReg;
return true;
}
return translateCast(TargetOpcode::G_BITCAST, U, MIRBuilder);
@@ -375,9 +431,10 @@ bool IRTranslator::translateGetElementPtr(const User &U,
Value &Op0 = *U.getOperand(0);
unsigned BaseReg = getOrCreateVReg(Op0);
- LLT PtrTy{*Op0.getType(), *DL};
- unsigned PtrSize = DL->getPointerSizeInBits(PtrTy.getAddressSpace());
- LLT OffsetTy = LLT::scalar(PtrSize);
+ Type *PtrIRTy = Op0.getType();
+ LLT PtrTy = getLLTForType(*PtrIRTy, *DL);
+ Type *OffsetIRTy = DL->getIntPtrType(PtrIRTy);
+ LLT OffsetTy = getLLTForType(*OffsetIRTy, *DL);
int64_t Offset = 0;
for (gep_type_iterator GTI = gep_type_begin(&U), E = gep_type_end(&U);
@@ -399,8 +456,8 @@ bool IRTranslator::translateGetElementPtr(const User &U,
if (Offset != 0) {
unsigned NewBaseReg = MRI->createGenericVirtualRegister(PtrTy);
- unsigned OffsetReg = MRI->createGenericVirtualRegister(OffsetTy);
- MIRBuilder.buildConstant(OffsetReg, Offset);
+ unsigned OffsetReg =
+ getOrCreateVReg(*ConstantInt::get(OffsetIRTy, Offset));
MIRBuilder.buildGEP(NewBaseReg, BaseReg, OffsetReg);
BaseReg = NewBaseReg;
@@ -408,8 +465,8 @@ bool IRTranslator::translateGetElementPtr(const User &U,
}
// N = N + Idx * ElementSize;
- unsigned ElementSizeReg = MRI->createGenericVirtualRegister(OffsetTy);
- MIRBuilder.buildConstant(ElementSizeReg, ElementSize);
+ unsigned ElementSizeReg =
+ getOrCreateVReg(*ConstantInt::get(OffsetIRTy, ElementSize));
unsigned IdxReg = getOrCreateVReg(*Idx);
if (MRI->getType(IdxReg) != OffsetTy) {
@@ -428,8 +485,7 @@ bool IRTranslator::translateGetElementPtr(const User &U,
}
if (Offset != 0) {
- unsigned OffsetReg = MRI->createGenericVirtualRegister(OffsetTy);
- MIRBuilder.buildConstant(OffsetReg, Offset);
+ unsigned OffsetReg = getOrCreateVReg(*ConstantInt::get(OffsetIRTy, Offset));
MIRBuilder.buildGEP(getOrCreateVReg(U), BaseReg, OffsetReg);
return true;
}
@@ -438,13 +494,12 @@ bool IRTranslator::translateGetElementPtr(const User &U,
return true;
}
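// The loop above computes the standard GEP address arithmetic,
//
//   addr = base + (folded constant offsets) + sum_i(Idx_i * ElemSize_i),
//
// emitting one G_MUL/G_GEP pair per variable index. A sketch for
// `getelementptr i32, i32* %p, i64 %n` (element size 4, names invented):
//
//   %off(s64) = G_MUL %n, 4
//   %addr(p0) = G_GEP %p, %off(s64)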
-bool IRTranslator::translateMemcpy(const CallInst &CI,
- MachineIRBuilder &MIRBuilder) {
- LLT SizeTy{*CI.getArgOperand(2)->getType(), *DL};
- if (cast<PointerType>(CI.getArgOperand(0)->getType())->getAddressSpace() !=
- 0 ||
- cast<PointerType>(CI.getArgOperand(1)->getType())->getAddressSpace() !=
- 0 ||
+bool IRTranslator::translateMemfunc(const CallInst &CI,
+ MachineIRBuilder &MIRBuilder,
+ unsigned ID) {
+ LLT SizeTy = getLLTForType(*CI.getArgOperand(2)->getType(), *DL);
+ Type *DstTy = CI.getArgOperand(0)->getType();
+ if (cast<PointerType>(DstTy)->getAddressSpace() != 0 ||
SizeTy.getSizeInBits() != DL->getPointerSizeInBits(0))
return false;
@@ -454,14 +509,32 @@ bool IRTranslator::translateMemcpy(const CallInst &CI,
Args.emplace_back(getOrCreateVReg(*Arg), Arg->getType());
}
- MachineOperand Callee = MachineOperand::CreateES("memcpy");
+ const char *Callee;
+ switch (ID) {
+ case Intrinsic::memmove:
+ case Intrinsic::memcpy: {
+ Type *SrcTy = CI.getArgOperand(1)->getType();
+ if (cast<PointerType>(SrcTy)->getAddressSpace() != 0)
+ return false;
+ Callee = ID == Intrinsic::memcpy ? "memcpy" : "memmove";
+ break;
+ }
+ case Intrinsic::memset:
+ Callee = "memset";
+ break;
+ default:
+ return false;
+ }
- return CLI->lowerCall(MIRBuilder, Callee,
+ return CLI->lowerCall(MIRBuilder, CI.getCallingConv(),
+ MachineOperand::CreateES(Callee),
CallLowering::ArgInfo(0, CI.getType()), Args);
}
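// Sketch of the resulting lowering for `llvm.memcpy` (argument names
// invented): the intrinsic becomes an ordinary libcall,
//
//   CLI->lowerCall(MIRBuilder, CI.getCallingConv(),
//                  MachineOperand::CreateES("memcpy"),
//                  /*Res=*/{0, void}, {dst, src, len});
//
// memmove/memset use "memmove"/"memset" instead. Pointers in a non-zero
// address space, or a size type narrower than a pointer, are rejected so
// the call can fall back to SelectionDAG.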
void IRTranslator::getStackGuard(unsigned DstReg,
MachineIRBuilder &MIRBuilder) {
+ const TargetRegisterInfo *TRI = MF->getSubtarget().getRegisterInfo();
+ MRI->setRegClass(DstReg, TRI->getPointerRegClass(*MF));
auto MIB = MIRBuilder.buildInstr(TargetOpcode::LOAD_STACK_GUARD);
MIB.addDef(DstReg);
@@ -482,7 +555,7 @@ void IRTranslator::getStackGuard(unsigned DstReg,
bool IRTranslator::translateOverflowIntrinsic(const CallInst &CI, unsigned Op,
MachineIRBuilder &MIRBuilder) {
- LLT Ty{*CI.getOperand(0)->getType(), *DL};
+ LLT Ty = getLLTForType(*CI.getOperand(0)->getType(), *DL);
LLT s1 = LLT::scalar(1);
unsigned Width = Ty.getSizeInBits();
unsigned Res = MRI->createGenericVirtualRegister(Ty);
@@ -494,8 +567,8 @@ bool IRTranslator::translateOverflowIntrinsic(const CallInst &CI, unsigned Op,
.addUse(getOrCreateVReg(*CI.getOperand(1)));
if (Op == TargetOpcode::G_UADDE || Op == TargetOpcode::G_USUBE) {
- unsigned Zero = MRI->createGenericVirtualRegister(s1);
- EntryBuilder.buildConstant(Zero, 0);
+ unsigned Zero = getOrCreateVReg(
+ *Constant::getNullValue(Type::getInt1Ty(CI.getContext())));
MIB.addUse(Zero);
}
@@ -508,12 +581,83 @@ bool IRTranslator::translateKnownIntrinsic(const CallInst &CI, Intrinsic::ID ID,
switch (ID) {
default:
break;
- case Intrinsic::dbg_declare:
- case Intrinsic::dbg_value:
- // FIXME: these obviously need to be supported properly.
- MF->getProperties().set(
- MachineFunctionProperties::Property::FailedISel);
+ case Intrinsic::lifetime_start:
+ case Intrinsic::lifetime_end:
+ // Stack coloring is not enabled at -O0 (the only level we care about for
+ // now) so we can drop these. Make sure someone notices when we start
+ // compiling at higher opt levels though.
+ if (MF->getTarget().getOptLevel() != CodeGenOpt::None)
+ return false;
+ return true;
+ case Intrinsic::dbg_declare: {
+ const DbgDeclareInst &DI = cast<DbgDeclareInst>(CI);
+ assert(DI.getVariable() && "Missing variable");
+
+ const Value *Address = DI.getAddress();
+ if (!Address || isa<UndefValue>(Address)) {
+ DEBUG(dbgs() << "Dropping debug info for " << DI << "\n");
+ return true;
+ }
+
+ assert(DI.getVariable()->isValidLocationForIntrinsic(
+ MIRBuilder.getDebugLoc()) &&
+ "Expected inlined-at fields to agree");
+ auto AI = dyn_cast<AllocaInst>(Address);
+ if (AI && AI->isStaticAlloca()) {
+ // Static allocas are tracked at the MF level, no need for DBG_VALUE
+ // instructions (in fact, they get ignored if they *do* exist).
+ MF->setVariableDbgInfo(DI.getVariable(), DI.getExpression(),
+ getOrCreateFrameIndex(*AI), DI.getDebugLoc());
+ } else
+ MIRBuilder.buildDirectDbgValue(getOrCreateVReg(*Address),
+ DI.getVariable(), DI.getExpression());
+ return true;
+ }
+ case Intrinsic::vaend:
+ // No target I know of cares about va_end. Certainly no in-tree target
+ // does. Simplest intrinsic ever!
return true;
+ case Intrinsic::vastart: {
+ auto &TLI = *MF->getSubtarget().getTargetLowering();
+ Value *Ptr = CI.getArgOperand(0);
+ unsigned ListSize = TLI.getVaListSizeInBits(*DL) / 8;
+
+ MIRBuilder.buildInstr(TargetOpcode::G_VASTART)
+ .addUse(getOrCreateVReg(*Ptr))
+ .addMemOperand(MF->getMachineMemOperand(
+ MachinePointerInfo(Ptr), MachineMemOperand::MOStore, ListSize, 0));
+ return true;
+ }
+ case Intrinsic::dbg_value: {
+ // This form of DBG_VALUE is target-independent.
+ const DbgValueInst &DI = cast<DbgValueInst>(CI);
+ const Value *V = DI.getValue();
+ assert(DI.getVariable()->isValidLocationForIntrinsic(
+ MIRBuilder.getDebugLoc()) &&
+ "Expected inlined-at fields to agree");
+ if (!V) {
+ // Currently the optimizer can produce this; insert an undef to
+ // help debugging. Probably the optimizer should not do this.
+ MIRBuilder.buildIndirectDbgValue(0, DI.getOffset(), DI.getVariable(),
+ DI.getExpression());
+ } else if (const auto *CI = dyn_cast<Constant>(V)) {
+ MIRBuilder.buildConstDbgValue(*CI, DI.getOffset(), DI.getVariable(),
+ DI.getExpression());
+ } else {
+ unsigned Reg = getOrCreateVReg(*V);
+ // FIXME: This does not handle register-indirect values at offset 0. The
+ // direct/indirect thing shouldn't really be handled by something as
+ // implicit as reg+noreg vs reg+imm in the first place, but it seems
+ // pretty baked in right now.
+ if (DI.getOffset() != 0)
+ MIRBuilder.buildIndirectDbgValue(Reg, DI.getOffset(), DI.getVariable(),
+ DI.getExpression());
+ else
+ MIRBuilder.buildDirectDbgValue(Reg, DI.getVariable(),
+ DI.getExpression());
+ }
+ return true;
+ }
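+ // Summary of the mapping above (a sketch, not exhaustive):
+ //   dbg.value(%v), offset == 0 -> direct DBG_VALUE of the vreg
+ //   dbg.value(%v), offset != 0 -> indirect DBG_VALUE (reg + imm offset)
+ //   dbg.value(constant)        -> DBG_VALUE of an immediate
+ //   dbg.value(null)            -> indirect DBG_VALUE of reg 0, as a marker
+ // The reg+noreg vs. reg+imm encoding is exactly what the FIXME complains
+ // about.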
case Intrinsic::uadd_with_overflow:
return translateOverflowIntrinsic(CI, TargetOpcode::G_UADDE, MIRBuilder);
case Intrinsic::sadd_with_overflow:
@@ -526,8 +670,16 @@ bool IRTranslator::translateKnownIntrinsic(const CallInst &CI, Intrinsic::ID ID,
return translateOverflowIntrinsic(CI, TargetOpcode::G_UMULO, MIRBuilder);
case Intrinsic::smul_with_overflow:
return translateOverflowIntrinsic(CI, TargetOpcode::G_SMULO, MIRBuilder);
+ case Intrinsic::pow:
+ MIRBuilder.buildInstr(TargetOpcode::G_FPOW)
+ .addDef(getOrCreateVReg(CI))
+ .addUse(getOrCreateVReg(*CI.getArgOperand(0)))
+ .addUse(getOrCreateVReg(*CI.getArgOperand(1)));
+ return true;
case Intrinsic::memcpy:
- return translateMemcpy(CI, MIRBuilder);
+ case Intrinsic::memmove:
+ case Intrinsic::memset:
+ return translateMemfunc(CI, MIRBuilder, ID);
case Intrinsic::eh_typeid_for: {
GlobalValue *GV = ExtractTypeInfo(CI.getArgOperand(0));
unsigned Reg = getOrCreateVReg(CI);
@@ -546,7 +698,7 @@ bool IRTranslator::translateKnownIntrinsic(const CallInst &CI, Intrinsic::ID ID,
getStackGuard(getOrCreateVReg(CI), MIRBuilder);
return true;
case Intrinsic::stackprotector: {
- LLT PtrTy{*CI.getArgOperand(0)->getType(), *DL};
+ LLT PtrTy = getLLTForType(*CI.getArgOperand(0)->getType(), *DL);
unsigned GuardVal = MRI->createGenericVirtualRegister(PtrTy);
getStackGuard(GuardVal, MIRBuilder);
@@ -564,18 +716,41 @@ bool IRTranslator::translateKnownIntrinsic(const CallInst &CI, Intrinsic::ID ID,
return false;
}
+bool IRTranslator::translateInlineAsm(const CallInst &CI,
+ MachineIRBuilder &MIRBuilder) {
+ const InlineAsm &IA = cast<InlineAsm>(*CI.getCalledValue());
+ if (!IA.getConstraintString().empty())
+ return false;
+
+ unsigned ExtraInfo = 0;
+ if (IA.hasSideEffects())
+ ExtraInfo |= InlineAsm::Extra_HasSideEffects;
+ if (IA.getDialect() == InlineAsm::AD_Intel)
+ ExtraInfo |= InlineAsm::Extra_AsmDialect;
+
+ MIRBuilder.buildInstr(TargetOpcode::INLINEASM)
+ .addExternalSymbol(IA.getAsmString().c_str())
+ .addImm(ExtraInfo);
+
+ return true;
+}
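+// Only bare inline asm with an empty constraint string is handled, e.g.
+// `call void asm sideeffect "nop", ""()`. Anything with operands or
+// constraints returns false so the call falls back to SelectionDAG.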
+
bool IRTranslator::translateCall(const User &U, MachineIRBuilder &MIRBuilder) {
const CallInst &CI = cast<CallInst>(U);
auto TII = MF->getTarget().getIntrinsicInfo();
const Function *F = CI.getCalledFunction();
+ if (CI.isInlineAsm())
+ return translateInlineAsm(CI, MIRBuilder);
+
if (!F || !F->isIntrinsic()) {
unsigned Res = CI.getType()->isVoidTy() ? 0 : getOrCreateVReg(CI);
SmallVector<unsigned, 8> Args;
for (auto &Arg: CI.arg_operands())
Args.push_back(getOrCreateVReg(*Arg));
- return CLI->lowerCall(MIRBuilder, CI, Res, Args, [&]() {
+ MF->getFrameInfo().setHasCalls(true);
+ return CLI->lowerCall(MIRBuilder, &CI, Res, Args, [&]() {
return getOrCreateVReg(*CI.getCalledValue());
});
}
@@ -594,10 +769,10 @@ bool IRTranslator::translateCall(const User &U, MachineIRBuilder &MIRBuilder) {
MIRBuilder.buildIntrinsic(ID, Res, !CI.doesNotAccessMemory());
for (auto &Arg : CI.arg_operands()) {
- if (ConstantInt *CI = dyn_cast<ConstantInt>(Arg))
- MIB.addImm(CI->getSExtValue());
- else
- MIB.addUse(getOrCreateVReg(*Arg));
+ // Some intrinsics take metadata parameters. Reject them.
+ if (isa<MetadataAsValue>(Arg))
+ return false;
+ MIB.addUse(getOrCreateVReg(*Arg));
}
return true;
}
@@ -610,7 +785,7 @@ bool IRTranslator::translateInvoke(const User &U,
const BasicBlock *ReturnBB = I.getSuccessor(0);
const BasicBlock *EHPadBB = I.getSuccessor(1);
- const Value *Callee(I.getCalledValue());
+ const Value *Callee = I.getCalledValue();
const Function *Fn = dyn_cast<Function>(Callee);
if (isa<InlineAsm>(Callee))
return false;
@@ -634,23 +809,24 @@ bool IRTranslator::translateInvoke(const User &U,
MIRBuilder.buildInstr(TargetOpcode::EH_LABEL).addSym(BeginSymbol);
unsigned Res = I.getType()->isVoidTy() ? 0 : getOrCreateVReg(I);
- SmallVector<CallLowering::ArgInfo, 8> Args;
+ SmallVector<unsigned, 8> Args;
for (auto &Arg: I.arg_operands())
- Args.emplace_back(getOrCreateVReg(*Arg), Arg->getType());
+ Args.push_back(getOrCreateVReg(*Arg));
- if (!CLI->lowerCall(MIRBuilder, MachineOperand::CreateGA(Fn, 0),
- CallLowering::ArgInfo(Res, I.getType()), Args))
+ if (!CLI->lowerCall(MIRBuilder, &I, Res, Args,
+ [&]() { return getOrCreateVReg(*I.getCalledValue()); }))
return false;
MCSymbol *EndSymbol = Context.createTempSymbol();
MIRBuilder.buildInstr(TargetOpcode::EH_LABEL).addSym(EndSymbol);
// FIXME: track probabilities.
- MachineBasicBlock &EHPadMBB = getOrCreateBB(*EHPadBB),
- &ReturnMBB = getOrCreateBB(*ReturnBB);
+ MachineBasicBlock &EHPadMBB = getMBB(*EHPadBB),
+ &ReturnMBB = getMBB(*ReturnBB);
MF->addInvoke(&EHPadMBB, BeginSymbol, EndSymbol);
MIRBuilder.getMBB().addSuccessor(&ReturnMBB);
MIRBuilder.getMBB().addSuccessor(&EHPadMBB);
+ MIRBuilder.buildBr(ReturnMBB);
return true;
}
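// The emitted block therefore looks roughly like:
//
//   EH_LABEL <begin>   ; start of the may-throw range
//   ...lowered call...
//   EH_LABEL <end>     ; end of the may-throw range
//   G_BR %bb.return    ; explicit branch; both successors are linked
//
// and MF->addInvoke records the label range against the landing pad so the
// exception tables can be emitted later.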
@@ -684,37 +860,158 @@ bool IRTranslator::translateLandingPad(const User &U,
MIRBuilder.buildInstr(TargetOpcode::EH_LABEL)
.addSym(MF->addLandingPad(&MBB));
+ LLT Ty = getLLTForType(*LP.getType(), *DL);
+ unsigned Undef = MRI->createGenericVirtualRegister(Ty);
+ MIRBuilder.buildUndef(Undef);
+
+ SmallVector<LLT, 2> Tys;
+ for (Type *Ty : cast<StructType>(LP.getType())->elements())
+ Tys.push_back(getLLTForType(*Ty, *DL));
+ assert(Tys.size() == 2 && "Only two-valued landingpads are supported");
+
// Mark exception register as live in.
- SmallVector<unsigned, 2> Regs;
- SmallVector<uint64_t, 2> Offsets;
- LLT p0 = LLT::pointer(0, DL->getPointerSizeInBits());
- if (unsigned Reg = TLI.getExceptionPointerRegister(PersonalityFn)) {
- unsigned VReg = MRI->createGenericVirtualRegister(p0);
- MIRBuilder.buildCopy(VReg, Reg);
- Regs.push_back(VReg);
- Offsets.push_back(0);
+ unsigned ExceptionReg = TLI.getExceptionPointerRegister(PersonalityFn);
+ if (!ExceptionReg)
+ return false;
+
+ MBB.addLiveIn(ExceptionReg);
+ unsigned VReg = MRI->createGenericVirtualRegister(Tys[0]),
+ Tmp = MRI->createGenericVirtualRegister(Ty);
+ MIRBuilder.buildCopy(VReg, ExceptionReg);
+ MIRBuilder.buildInsert(Tmp, Undef, VReg, 0);
+
+ unsigned SelectorReg = TLI.getExceptionSelectorRegister(PersonalityFn);
+ if (!SelectorReg)
+ return false;
+
+ MBB.addLiveIn(SelectorReg);
+
+ // N.b. the exception selector register always has pointer type and may not
+ // match the actual IR-level type in the landingpad so an extra cast is
+ // needed.
+ unsigned PtrVReg = MRI->createGenericVirtualRegister(Tys[0]);
+ MIRBuilder.buildCopy(PtrVReg, SelectorReg);
+
+ VReg = MRI->createGenericVirtualRegister(Tys[1]);
+ MIRBuilder.buildInstr(TargetOpcode::G_PTRTOINT).addDef(VReg).addUse(PtrVReg);
+ MIRBuilder.buildInsert(getOrCreateVReg(LP), Tmp, VReg,
+ Tys[0].getSizeInBits());
+ return true;
+}
+
+bool IRTranslator::translateAlloca(const User &U,
+ MachineIRBuilder &MIRBuilder) {
+ auto &AI = cast<AllocaInst>(U);
+
+ if (AI.isStaticAlloca()) {
+ unsigned Res = getOrCreateVReg(AI);
+ int FI = getOrCreateFrameIndex(AI);
+ MIRBuilder.buildFrameIndex(Res, FI);
+ return true;
+ }
+
+ // Now we're in the harder dynamic case.
+ Type *Ty = AI.getAllocatedType();
+ unsigned Align =
+ std::max((unsigned)DL->getPrefTypeAlignment(Ty), AI.getAlignment());
+
+ unsigned NumElts = getOrCreateVReg(*AI.getArraySize());
+
+ Type *IntPtrIRTy = DL->getIntPtrType(AI.getType());
+ LLT IntPtrTy = getLLTForType(*IntPtrIRTy, *DL);
+ if (MRI->getType(NumElts) != IntPtrTy) {
+ unsigned ExtElts = MRI->createGenericVirtualRegister(IntPtrTy);
+ MIRBuilder.buildZExtOrTrunc(ExtElts, NumElts);
+ NumElts = ExtElts;
}
- if (unsigned Reg = TLI.getExceptionSelectorRegister(PersonalityFn)) {
- unsigned VReg = MRI->createGenericVirtualRegister(p0);
- MIRBuilder.buildCopy(VReg, Reg);
- Regs.push_back(VReg);
- Offsets.push_back(p0.getSizeInBits());
+ unsigned AllocSize = MRI->createGenericVirtualRegister(IntPtrTy);
+ unsigned TySize =
+ getOrCreateVReg(*ConstantInt::get(IntPtrIRTy, -DL->getTypeAllocSize(Ty)));
+ MIRBuilder.buildMul(AllocSize, NumElts, TySize);
+
+ LLT PtrTy = getLLTForType(*AI.getType(), *DL);
+ auto &TLI = *MF->getSubtarget().getTargetLowering();
+ unsigned SPReg = TLI.getStackPointerRegisterToSaveRestore();
+
+ unsigned SPTmp = MRI->createGenericVirtualRegister(PtrTy);
+ MIRBuilder.buildCopy(SPTmp, SPReg);
+
+ unsigned AllocTmp = MRI->createGenericVirtualRegister(PtrTy);
+ MIRBuilder.buildGEP(AllocTmp, SPTmp, AllocSize);
+
+ // Handle alignment. We have to realign if the allocation granule was smaller
+ // than stack alignment, or the specific alloca requires more than stack
+ // alignment.
+ unsigned StackAlign =
+ MF->getSubtarget().getFrameLowering()->getStackAlignment();
+ Align = std::max(Align, StackAlign);
+ if (Align > StackAlign || DL->getTypeAllocSize(Ty) % StackAlign != 0) {
+ // Align the allocation down to the requested alignment by masking off
+ // the low bits of the bumped stack pointer. The stack grows down, so
+ // this can only grow the allocation, never shrink it into live data.
+ unsigned AlignedAlloc = MRI->createGenericVirtualRegister(PtrTy);
+ MIRBuilder.buildPtrMask(AlignedAlloc, AllocTmp, Log2_32(Align));
+ AllocTmp = AlignedAlloc;
}
- MIRBuilder.buildSequence(getOrCreateVReg(LP), Regs, Offsets);
+ MIRBuilder.buildCopy(SPReg, AllocTmp);
+ MIRBuilder.buildCopy(getOrCreateVReg(AI), AllocTmp);
+
+ MF->getFrameInfo().CreateVariableSizedObject(Align ? Align : 1, &AI);
+ assert(MF->getFrameInfo().hasVarSizedObjects());
return true;
}
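// Net effect for a dynamic `alloca Ty, iN %n` (a sketch; the stack is
// assumed to grow down, which is why TySize above is negated):
//
//   %size = %n * -alloc_size(Ty)     ; G_MUL
//   %sp   = copy of stack pointer
//   %tmp  = G_GEP %sp, %size         ; i.e. sp - n * alloc_size(Ty)
//   %tmp &= ~(Align - 1)             ; G_PTR_MASK, only when realigning
//   sp    = %tmp                     ; and %tmp is the alloca's value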
-bool IRTranslator::translateStaticAlloca(const AllocaInst &AI,
- MachineIRBuilder &MIRBuilder) {
- if (!TPC->isGlobalISelAbortEnabled() && !AI.isStaticAlloca())
- return false;
+bool IRTranslator::translateVAArg(const User &U, MachineIRBuilder &MIRBuilder) {
+ // FIXME: We may need more info about the type. Because of how LLT works,
+ // we're completely discarding the i64/double distinction here (amongst
+ // others). Fortunately the ABIs I know of where that matters don't use va_arg
+ // anyway but that's not guaranteed.
+ MIRBuilder.buildInstr(TargetOpcode::G_VAARG)
+ .addDef(getOrCreateVReg(U))
+ .addUse(getOrCreateVReg(*U.getOperand(0)))
+ .addImm(DL->getABITypeAlignment(U.getType()));
+ return true;
+}
- assert(AI.isStaticAlloca() && "only handle static allocas now");
- unsigned Res = getOrCreateVReg(AI);
- int FI = getOrCreateFrameIndex(AI);
- MIRBuilder.buildFrameIndex(Res, FI);
+bool IRTranslator::translateInsertElement(const User &U,
+ MachineIRBuilder &MIRBuilder) {
+ // If it is a <1 x Ty> vector, use the scalar directly: single-element
+ // vectors are not legal vector types in LLT.
+ if (U.getType()->getVectorNumElements() == 1) {
+ unsigned Elt = getOrCreateVReg(*U.getOperand(1));
+ ValToVReg[&U] = Elt;
+ return true;
+ }
+ MIRBuilder.buildInsertVectorElement(
+ getOrCreateVReg(U), getOrCreateVReg(*U.getOperand(0)),
+ getOrCreateVReg(*U.getOperand(1)), getOrCreateVReg(*U.getOperand(2)));
+ return true;
+}
+
+bool IRTranslator::translateExtractElement(const User &U,
+ MachineIRBuilder &MIRBuilder) {
+ // If it is a <1 x Ty> vector, use the scalar directly: single-element
+ // vectors are not legal vector types in LLT.
+ if (U.getOperand(0)->getType()->getVectorNumElements() == 1) {
+ unsigned Elt = getOrCreateVReg(*U.getOperand(0));
+ ValToVReg[&U] = Elt;
+ return true;
+ }
+ MIRBuilder.buildExtractVectorElement(getOrCreateVReg(U),
+ getOrCreateVReg(*U.getOperand(0)),
+ getOrCreateVReg(*U.getOperand(1)));
+ return true;
+}
+
+bool IRTranslator::translateShuffleVector(const User &U,
+ MachineIRBuilder &MIRBuilder) {
+ MIRBuilder.buildInstr(TargetOpcode::G_SHUFFLE_VECTOR)
+ .addDef(getOrCreateVReg(U))
+ .addUse(getOrCreateVReg(*U.getOperand(0)))
+ .addUse(getOrCreateVReg(*U.getOperand(1)))
+ .addUse(getOrCreateVReg(*U.getOperand(2)));
return true;
}
@@ -736,11 +1033,21 @@ void IRTranslator::finishPendingPhis() {
// won't create extra control flow here, otherwise we need to find the
// dominating predecessor here (or perhaps force the weirder IRTranslators
// to provide a simple boundary).
+ SmallSet<const BasicBlock *, 4> HandledPreds;
+
for (unsigned i = 0; i < PI->getNumIncomingValues(); ++i) {
- assert(BBToMBB[PI->getIncomingBlock(i)]->isSuccessor(MIB->getParent()) &&
- "I appear to have misunderstood Machine PHIs");
- MIB.addUse(getOrCreateVReg(*PI->getIncomingValue(i)));
- MIB.addMBB(BBToMBB[PI->getIncomingBlock(i)]);
+ auto IRPred = PI->getIncomingBlock(i);
+ if (HandledPreds.count(IRPred))
+ continue;
+
+ HandledPreds.insert(IRPred);
+ unsigned ValReg = getOrCreateVReg(*PI->getIncomingValue(i));
+ for (auto Pred : getMachinePredBBs({IRPred, PI->getParent()})) {
+ assert(Pred->isSuccessor(MIB->getParent()) &&
+ "incorrect CFG at MachineBasicBlock level");
+ MIB.addUse(ValReg);
+ MIB.addMBB(Pred);
+ }
}
}
}
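// The HandledPreds set matters because one IR edge can now expand into
// several machine edges (each switch case gets its own comparison block),
// while duplicate IR entries for the same predecessor must still yield one
// (value, MBB) operand pair each. E.g. `phi [%v, %bb1], [%v, %bb1]` adds
// %v once per machine predecessor of the bb1 edge, not twice.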
@@ -752,9 +1059,7 @@ bool IRTranslator::translate(const Instruction &Inst) {
case Instruction::OPCODE: return translate##OPCODE(Inst, CurBuilder);
#include "llvm/IR/Instruction.def"
default:
- if (!TPC->isGlobalISelAbortEnabled())
- return false;
- llvm_unreachable("unknown opcode");
+ return false;
}
}
@@ -764,25 +1069,43 @@ bool IRTranslator::translate(const Constant &C, unsigned Reg) {
else if (auto CF = dyn_cast<ConstantFP>(&C))
EntryBuilder.buildFConstant(Reg, *CF);
else if (isa<UndefValue>(C))
- EntryBuilder.buildInstr(TargetOpcode::IMPLICIT_DEF).addDef(Reg);
+ EntryBuilder.buildUndef(Reg);
else if (isa<ConstantPointerNull>(C))
EntryBuilder.buildConstant(Reg, 0);
else if (auto GV = dyn_cast<GlobalValue>(&C))
EntryBuilder.buildGlobalValue(Reg, GV);
- else if (auto CE = dyn_cast<ConstantExpr>(&C)) {
+ else if (auto CAZ = dyn_cast<ConstantAggregateZero>(&C)) {
+ if (!CAZ->getType()->isVectorTy())
+ return false;
+ // Return the scalar if it is a <1 x Ty> vector.
+ if (CAZ->getNumElements() == 1)
+ return translate(*CAZ->getElementValue(0u), Reg);
+ std::vector<unsigned> Ops;
+ for (unsigned i = 0; i < CAZ->getNumElements(); ++i) {
+ Constant &Elt = *CAZ->getElementValue(i);
+ Ops.push_back(getOrCreateVReg(Elt));
+ }
+ EntryBuilder.buildMerge(Reg, Ops);
+ } else if (auto CV = dyn_cast<ConstantDataVector>(&C)) {
+ // Return the scalar if it is a <1 x Ty> vector.
+ if (CV->getNumElements() == 1)
+ return translate(*CV->getElementAsConstant(0), Reg);
+ std::vector<unsigned> Ops;
+ for (unsigned i = 0; i < CV->getNumElements(); ++i) {
+ Constant &Elt = *CV->getElementAsConstant(i);
+ Ops.push_back(getOrCreateVReg(Elt));
+ }
+ EntryBuilder.buildMerge(Reg, Ops);
+ } else if (auto CE = dyn_cast<ConstantExpr>(&C)) {
switch(CE->getOpcode()) {
#define HANDLE_INST(NUM, OPCODE, CLASS) \
case Instruction::OPCODE: return translate##OPCODE(*CE, EntryBuilder);
#include "llvm/IR/Instruction.def"
default:
- if (!TPC->isGlobalISelAbortEnabled())
- return false;
- llvm_unreachable("unknown opcode");
+ return false;
}
- } else if (!TPC->isGlobalISelAbortEnabled())
+ } else
return false;
- else
- llvm_unreachable("unhandled constant kind");
return true;
}
@@ -793,7 +1116,7 @@ void IRTranslator::finalizeFunction() {
PendingPHIs.clear();
ValToVReg.clear();
FrameIndices.clear();
- Constants.clear();
+ MachinePreds.clear();
}
bool IRTranslator::runOnMachineFunction(MachineFunction &CurMF) {
@@ -807,85 +1130,101 @@ bool IRTranslator::runOnMachineFunction(MachineFunction &CurMF) {
MRI = &MF->getRegInfo();
DL = &F.getParent()->getDataLayout();
TPC = &getAnalysis<TargetPassConfig>();
+ ORE = make_unique<OptimizationRemarkEmitter>(&F);
assert(PendingPHIs.empty() && "stale PHIs");
- // Setup a separate basic-block for the arguments and constants, falling
- // through to the IR-level Function's entry block.
+ // Release the per-function state when we return, whether we succeeded or not.
+ auto FinalizeOnReturn = make_scope_exit([this]() { finalizeFunction(); });
+
+ // Set up a separate basic-block for the arguments and constants.
MachineBasicBlock *EntryBB = MF->CreateMachineBasicBlock();
MF->push_back(EntryBB);
- EntryBB->addSuccessor(&getOrCreateBB(F.front()));
EntryBuilder.setMBB(*EntryBB);
+ // Create all blocks, in IR order, to preserve the layout.
+ for (const BasicBlock &BB: F) {
+ auto *&MBB = BBToMBB[&BB];
+
+ MBB = MF->CreateMachineBasicBlock(&BB);
+ MF->push_back(MBB);
+
+ if (BB.hasAddressTaken())
+ MBB->setHasAddressTaken();
+ }
+
+ // Make our arguments/constants entry block fall through to the IR entry block.
+ EntryBB->addSuccessor(&getMBB(F.front()));
+
// Lower the actual args into this basic block.
SmallVector<unsigned, 8> VRegArgs;
for (const Argument &Arg: F.args())
VRegArgs.push_back(getOrCreateVReg(Arg));
- bool Succeeded = CLI->lowerFormalArguments(EntryBuilder, F, VRegArgs);
- if (!Succeeded) {
- if (!TPC->isGlobalISelAbortEnabled()) {
- MF->getProperties().set(
- MachineFunctionProperties::Property::FailedISel);
- finalizeFunction();
- return false;
- }
- report_fatal_error("Unable to lower arguments");
+ if (!CLI->lowerFormalArguments(EntryBuilder, F, VRegArgs)) {
+ OptimizationRemarkMissed R("gisel-irtranslator", "GISelFailure",
+ MF->getFunction()->getSubprogram(),
+ &MF->getFunction()->getEntryBlock());
+ R << "unable to lower arguments: " << ore::NV("Prototype", F.getType());
+ reportTranslationError(*MF, *TPC, *ORE, R);
+ return false;
}
// And translate the function!
for (const BasicBlock &BB: F) {
- MachineBasicBlock &MBB = getOrCreateBB(BB);
+ MachineBasicBlock &MBB = getMBB(BB);
// Set the insertion point of all the following translations to
// the end of this basic block.
CurBuilder.setMBB(MBB);
for (const Instruction &Inst: BB) {
- Succeeded &= translate(Inst);
- if (!Succeeded) {
- if (TPC->isGlobalISelAbortEnabled())
- reportTranslationError(Inst, "unable to translate instruction");
- MF->getProperties().set(
- MachineFunctionProperties::Property::FailedISel);
- break;
- }
- }
- }
-
- if (Succeeded) {
- finishPendingPhis();
-
- // Now that the MachineFrameInfo has been configured, no further changes to
- // the reserved registers are possible.
- MRI->freezeReservedRegs(*MF);
-
- // Merge the argument lowering and constants block with its single
- // successor, the LLVM-IR entry block. We want the basic block to
- // be maximal.
- assert(EntryBB->succ_size() == 1 &&
- "Custom BB used for lowering should have only one successor");
- // Get the successor of the current entry block.
- MachineBasicBlock &NewEntryBB = **EntryBB->succ_begin();
- assert(NewEntryBB.pred_size() == 1 &&
- "LLVM-IR entry block has a predecessor!?");
- // Move all the instruction from the current entry block to the
- // new entry block.
- NewEntryBB.splice(NewEntryBB.begin(), EntryBB, EntryBB->begin(),
- EntryBB->end());
-
- // Update the live-in information for the new entry block.
- for (const MachineBasicBlock::RegisterMaskPair &LiveIn : EntryBB->liveins())
- NewEntryBB.addLiveIn(LiveIn);
- NewEntryBB.sortUniqueLiveIns();
+ if (translate(Inst))
+ continue;
- // Get rid of the now empty basic block.
- EntryBB->removeSuccessor(&NewEntryBB);
- MF->remove(EntryBB);
+ std::string InstStrStorage;
+ raw_string_ostream InstStr(InstStrStorage);
+ InstStr << Inst;
- assert(&MF->front() == &NewEntryBB &&
- "New entry wasn't next in the list of basic block!");
+ OptimizationRemarkMissed R("gisel-irtranslator", "GISelFailure",
+ Inst.getDebugLoc(), &BB);
+ R << "unable to translate instruction: " << ore::NV("Opcode", &Inst)
+ << ": '" << InstStr.str() << "'";
+ reportTranslationError(*MF, *TPC, *ORE, R);
+ return false;
+ }
}
- finalizeFunction();
+ finishPendingPhis();
+
+ // Now that the MachineFrameInfo has been configured, no further changes to
+ // the reserved registers are possible.
+ MRI->freezeReservedRegs(*MF);
+
+ // Merge the argument lowering and constants block with its single
+ // successor, the LLVM-IR entry block. We want the basic block to
+ // be maximal.
+ assert(EntryBB->succ_size() == 1 &&
+ "Custom BB used for lowering should have only one successor");
+ // Get the successor of the current entry block.
+ MachineBasicBlock &NewEntryBB = **EntryBB->succ_begin();
+ assert(NewEntryBB.pred_size() == 1 &&
+ "LLVM-IR entry block has a predecessor!?");
+ // Move all the instructions from the current entry block to the
+ // new entry block.
+ NewEntryBB.splice(NewEntryBB.begin(), EntryBB, EntryBB->begin(),
+ EntryBB->end());
+
+ // Update the live-in information for the new entry block.
+ for (const MachineBasicBlock::RegisterMaskPair &LiveIn : EntryBB->liveins())
+ NewEntryBB.addLiveIn(LiveIn);
+ NewEntryBB.sortUniqueLiveIns();
+
+ // Get rid of the now empty basic block.
+ EntryBB->removeSuccessor(&NewEntryBB);
+ MF->remove(EntryBB);
+ MF->DeleteMachineBasicBlock(EntryBB);
+
+ assert(&MF->front() == &NewEntryBB &&
+ "New entry wasn't next in the list of basic block!");
return false;
}
diff --git a/lib/CodeGen/GlobalISel/InstructionSelect.cpp b/lib/CodeGen/GlobalISel/InstructionSelect.cpp
index 1d205cd6c9c8..26454c1ef00f 100644
--- a/lib/CodeGen/GlobalISel/InstructionSelect.cpp
+++ b/lib/CodeGen/GlobalISel/InstructionSelect.cpp
@@ -12,11 +12,15 @@
#include "llvm/CodeGen/GlobalISel/InstructionSelect.h"
#include "llvm/ADT/PostOrderIterator.h"
+#include "llvm/ADT/ScopeExit.h"
#include "llvm/ADT/Twine.h"
#include "llvm/CodeGen/GlobalISel/InstructionSelector.h"
#include "llvm/CodeGen/GlobalISel/LegalizerInfo.h"
+#include "llvm/CodeGen/GlobalISel/Utils.h"
+#include "llvm/CodeGen/MachineOptimizationRemarkEmitter.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
#include "llvm/CodeGen/TargetPassConfig.h"
+#include "llvm/IR/Constants.h"
#include "llvm/IR/Function.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/Debug.h"
@@ -44,17 +48,14 @@ void InstructionSelect::getAnalysisUsage(AnalysisUsage &AU) const {
MachineFunctionPass::getAnalysisUsage(AU);
}
-static void reportSelectionError(const MachineInstr *MI, const Twine &Message) {
- const MachineFunction &MF = *MI->getParent()->getParent();
- std::string ErrStorage;
- raw_string_ostream Err(ErrStorage);
- Err << Message << ":\nIn function: " << MF.getName() << '\n';
- if (MI)
- Err << *MI << '\n';
- report_fatal_error(Err.str());
-}
-
bool InstructionSelect::runOnMachineFunction(MachineFunction &MF) {
+ const MachineRegisterInfo &MRI = MF.getRegInfo();
+
+ // No matter what happens, whether we successfully select the function or not,
+ // nothing is going to use the vreg types after us. Make sure they disappear.
+ auto ClearVRegTypesOnReturn =
+ make_scope_exit([&]() { MRI.getVRegToType().clear(); });
+
// If the ISel pipeline failed, do not bother running that pass.
if (MF.getProperties().hasProperty(
MachineFunctionProperties::Property::FailedISel))
@@ -66,11 +67,12 @@ bool InstructionSelect::runOnMachineFunction(MachineFunction &MF) {
const InstructionSelector *ISel = MF.getSubtarget().getInstructionSelector();
assert(ISel && "Cannot work without InstructionSelector");
+ // An optimization remark emitter. Used to report failures.
+ MachineOptimizationRemarkEmitter MORE(MF, /*MBFI=*/nullptr);
+
// FIXME: freezeReservedRegs is now done in IRTranslator, but there are many
// other MF/MFI fields we need to initialize.
- const MachineRegisterInfo &MRI = MF.getRegInfo();
-
#ifndef NDEBUG
// Check that our input is fully legal: we require the function to have the
// Legalized property, so it should be.
@@ -80,17 +82,19 @@ bool InstructionSelect::runOnMachineFunction(MachineFunction &MF) {
// that it has the same layering problem, but we only use inline methods so
// end up not needing to link against the GlobalISel library.
if (const LegalizerInfo *MLI = MF.getSubtarget().getLegalizerInfo())
- for (const MachineBasicBlock &MBB : MF)
- for (const MachineInstr &MI : MBB)
- if (isPreISelGenericOpcode(MI.getOpcode()) && !MLI->isLegal(MI, MRI))
- reportSelectionError(&MI, "Instruction is not legal");
+ for (MachineBasicBlock &MBB : MF)
+ for (MachineInstr &MI : MBB)
+ if (isPreISelGenericOpcode(MI.getOpcode()) && !MLI->isLegal(MI, MRI)) {
+ reportGISelFailure(MF, TPC, MORE, "gisel-select",
+ "instruction is not legal", MI);
+ return false;
+ }
#endif
// FIXME: We could introduce new blocks and will need to fix the outer loop.
// Until then, keep track of the number of blocks to assert that we don't.
const size_t NumBlocks = MF.size();
- bool Failed = false;
for (MachineBasicBlock *MBB : post_order(&MF)) {
if (MBB->empty())
continue;
@@ -115,14 +119,19 @@ bool InstructionSelect::runOnMachineFunction(MachineFunction &MF) {
DEBUG(dbgs() << "Selecting: \n " << MI);
+ // We could have folded this instruction away already, making it dead.
+ // If so, erase it.
+ if (isTriviallyDead(MI, MRI)) {
+ DEBUG(dbgs() << "Is dead; erasing.\n");
+ MI.eraseFromParentAndMarkDBGValuesForRemoval();
+ continue;
+ }
+
if (!ISel->select(MI)) {
- if (TPC.isGlobalISelAbortEnabled())
- // FIXME: It would be nice to dump all inserted instructions. It's
- // not
- // obvious how, esp. considering select() can insert after MI.
- reportSelectionError(&MI, "Cannot select");
- Failed = true;
- break;
+ // FIXME: It would be nice to dump all inserted instructions. It's
+ // not obvious how, esp. considering select() can insert after MI.
+ reportGISelFailure(MF, TPC, MORE, "gisel-select", "cannot select", MI);
+ return false;
}
// Dump the range of instructions that MI expanded into.
@@ -142,33 +151,36 @@ bool InstructionSelect::runOnMachineFunction(MachineFunction &MF) {
for (auto &VRegToType : MRI.getVRegToType()) {
unsigned VReg = VRegToType.first;
auto *RC = MRI.getRegClassOrNull(VReg);
- auto *MI = MRI.def_instr_begin(VReg) == MRI.def_instr_end()
- ? nullptr
- : &*MRI.def_instr_begin(VReg);
- if (!RC) {
- if (TPC.isGlobalISelAbortEnabled())
- reportSelectionError(MI, "VReg as no regclass after selection");
- Failed = true;
- break;
- }
+ MachineInstr *MI = nullptr;
+ if (!MRI.def_empty(VReg))
+ MI = &*MRI.def_instr_begin(VReg);
+ else if (!MRI.use_empty(VReg))
+ MI = &*MRI.use_instr_begin(VReg);
+
+ if (MI && !RC) {
+ reportGISelFailure(MF, TPC, MORE, "gisel-select",
+ "VReg has no regclass after selection", *MI);
+ return false;
+ } else if (!RC)
+ continue;
if (VRegToType.second.isValid() &&
VRegToType.second.getSizeInBits() > (RC->getSize() * 8)) {
- if (TPC.isGlobalISelAbortEnabled())
- reportSelectionError(
- MI, "VReg has explicit size different from class size");
- Failed = true;
- break;
+ reportGISelFailure(MF, TPC, MORE, "gisel-select",
+ "VReg has explicit size different from class size",
+ *MI);
+ return false;
}
}
- MRI.getVRegToType().clear();
-
- if (!TPC.isGlobalISelAbortEnabled() && (Failed || MF.size() != NumBlocks)) {
- MF.getProperties().set(MachineFunctionProperties::Property::FailedISel);
+ if (MF.size() != NumBlocks) {
+ MachineOptimizationRemarkMissed R("gisel-select", "GISelFailure",
+ MF.getFunction()->getSubprogram(),
+ /*MBB=*/nullptr);
+ R << "inserting blocks is not supported yet";
+ reportGISelFailure(MF, TPC, MORE, R);
return false;
}
- assert(MF.size() == NumBlocks && "Inserting blocks is not supported yet");
// FIXME: Should we accurately track changes?
return true;
diff --git a/lib/CodeGen/GlobalISel/InstructionSelector.cpp b/lib/CodeGen/GlobalISel/InstructionSelector.cpp
index 5c34da0dc557..fb9d01ef8542 100644
--- a/lib/CodeGen/GlobalISel/InstructionSelector.cpp
+++ b/lib/CodeGen/GlobalISel/InstructionSelector.cpp
@@ -14,6 +14,8 @@
#include "llvm/CodeGen/GlobalISel/RegisterBankInfo.h"
#include "llvm/CodeGen/GlobalISel/Utils.h"
#include "llvm/CodeGen/MachineInstr.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/IR/Constants.h"
#include "llvm/Target/TargetInstrInfo.h"
#include "llvm/Target/TargetRegisterInfo.h"
@@ -55,6 +57,45 @@ bool InstructionSelector::constrainSelectedInstRegOperands(
// constrainOperandRegClass does that for us.
MO.setReg(constrainOperandRegClass(MF, TRI, MRI, TII, RBI, I, I.getDesc(),
Reg, OpI));
+
+ // Tie uses to defs as indicated in MCInstrDesc.
+ if (MO.isUse()) {
+ int DefIdx = I.getDesc().getOperandConstraint(OpI, MCOI::TIED_TO);
+ if (DefIdx != -1)
+ I.tieOperands(DefIdx, OpI);
+ }
}
return true;
}
+
+Optional<int64_t>
+InstructionSelector::getConstantVRegVal(unsigned VReg,
+ const MachineRegisterInfo &MRI) const {
+ MachineInstr *MI = MRI.getVRegDef(VReg);
+ if (MI->getOpcode() != TargetOpcode::G_CONSTANT)
+ return None;
+
+ if (MI->getOperand(1).isImm())
+ return MI->getOperand(1).getImm();
+
+ if (MI->getOperand(1).isCImm() &&
+ MI->getOperand(1).getCImm()->getBitWidth() <= 64)
+ return MI->getOperand(1).getCImm()->getSExtValue();
+
+ return None;
+}
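+// Typical use from a target's selector (a sketch; the operand index is
+// assumed):
+//
+//   if (auto Cst = getConstantVRegVal(I.getOperand(2).getReg(), MRI))
+//     if (*Cst == 0)
+//       /* select the compare-with-zero form */;
+//
+// Only G_CONSTANTs whose immediate fits in 64 bits are folded; wider CImms
+// and non-constant defs return None.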
+
+bool InstructionSelector::isOperandImmEqual(
+ const MachineOperand &MO, int64_t Value,
+ const MachineRegisterInfo &MRI) const {
+
+ if (MO.getReg())
+ if (auto VRegVal = getConstantVRegVal(MO.getReg(), MRI))
+ return *VRegVal == Value;
+ return false;
+}
+
+bool InstructionSelector::isObviouslySafeToFold(MachineInstr &MI) const {
+ return !MI.mayLoadOrStore() && !MI.hasUnmodeledSideEffects() &&
+ MI.implicit_operands().begin() == MI.implicit_operands().end();
+}
diff --git a/lib/CodeGen/GlobalISel/Legalizer.cpp b/lib/CodeGen/GlobalISel/Legalizer.cpp
index e86356880e99..657ddb307919 100644
--- a/lib/CodeGen/GlobalISel/Legalizer.cpp
+++ b/lib/CodeGen/GlobalISel/Legalizer.cpp
@@ -16,6 +16,8 @@
#include "llvm/CodeGen/GlobalISel/Legalizer.h"
#include "llvm/CodeGen/GlobalISel/LegalizerHelper.h"
#include "llvm/CodeGen/GlobalISel/Legalizer.h"
+#include "llvm/CodeGen/GlobalISel/Utils.h"
+#include "llvm/CodeGen/MachineOptimizationRemarkEmitter.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
#include "llvm/CodeGen/TargetPassConfig.h"
#include "llvm/Support/Debug.h"
@@ -92,10 +94,7 @@ bool Legalizer::combineExtracts(MachineInstr &MI, MachineRegisterInfo &MRI,
"unexpected physical register in G_SEQUENCE");
// Finally we can replace the uses.
- for (auto &Use : MRI.use_operands(ExtractReg)) {
- Changed = true;
- Use.setReg(OrigReg);
- }
+ MRI.replaceRegWith(ExtractReg, OrigReg);
}
if (AllDefsReplaced) {
@@ -114,6 +113,36 @@ bool Legalizer::combineExtracts(MachineInstr &MI, MachineRegisterInfo &MRI,
return Changed;
}
+bool Legalizer::combineMerges(MachineInstr &MI, MachineRegisterInfo &MRI,
+ const TargetInstrInfo &TII) {
+ if (MI.getOpcode() != TargetOpcode::G_UNMERGE_VALUES)
+ return false;
+
+ unsigned NumDefs = MI.getNumOperands() - 1;
+ unsigned SrcReg = MI.getOperand(NumDefs).getReg();
+ MachineInstr &MergeI = *MRI.def_instr_begin(SrcReg);
+ if (MergeI.getOpcode() != TargetOpcode::G_MERGE_VALUES)
+ return false;
+
+ if (MergeI.getNumOperands() - 1 != NumDefs)
+ return false;
+
+ // FIXME: is a COPY appropriate if the types mismatch? We know both registers
+ // are allocatable by now.
+ if (MRI.getType(MI.getOperand(0).getReg()) !=
+ MRI.getType(MergeI.getOperand(1).getReg()))
+ return false;
+
+ for (unsigned Idx = 0; Idx < NumDefs; ++Idx)
+ MRI.replaceRegWith(MI.getOperand(Idx).getReg(),
+ MergeI.getOperand(Idx + 1).getReg());
+
+ MI.eraseFromParent();
+ if (MRI.use_empty(MergeI.getOperand(0).getReg()))
+ MergeI.eraseFromParent();
+ return true;
+}
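+// Example of the cancellation this performs:
+//
+//   %x(s64) = G_MERGE_VALUES %lo(s32), %hi(s32)
+//   %a(s32), %b(s32) = G_UNMERGE_VALUES %x(s64)
+//
+// rewrites all uses of %a/%b to %lo/%hi and erases both instructions once
+// the merge is dead. Mismatched piece counts or piece types bail out.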
+
bool Legalizer::runOnMachineFunction(MachineFunction &MF) {
// If the ISel pipeline failed, do not bother running that pass.
if (MF.getProperties().hasProperty(
@@ -122,7 +151,7 @@ bool Legalizer::runOnMachineFunction(MachineFunction &MF) {
DEBUG(dbgs() << "Legalize Machine IR for: " << MF.getName() << '\n');
init(MF);
const TargetPassConfig &TPC = getAnalysis<TargetPassConfig>();
- const LegalizerInfo &LegalizerInfo = *MF.getSubtarget().getLegalizerInfo();
+ MachineOptimizationRemarkEmitter MORE(MF, /*MBFI=*/nullptr);
LegalizerHelper Helper(MF);
// FIXME: an instruction may need more than one pass before it is legal. For
@@ -142,27 +171,33 @@ bool Legalizer::runOnMachineFunction(MachineFunction &MF) {
// and are assumed to be legal.
if (!isPreISelGenericOpcode(MI->getOpcode()))
continue;
-
- auto Res = Helper.legalizeInstr(*MI, LegalizerInfo);
-
- // Error out if we couldn't legalize this instruction. We may want to fall
- // back to DAG ISel instead in the future.
- if (Res == LegalizerHelper::UnableToLegalize) {
- if (!TPC.isGlobalISelAbortEnabled()) {
- MF.getProperties().set(
- MachineFunctionProperties::Property::FailedISel);
- return false;
+ SmallVector<MachineInstr *, 4> WorkList;
+ Helper.MIRBuilder.recordInsertions(
+ [&](MachineInstr *MI) { WorkList.push_back(MI); });
+ WorkList.push_back(&*MI);
+
+ LegalizerHelper::LegalizeResult Res;
+ unsigned Idx = 0;
+ do {
+ Res = Helper.legalizeInstrStep(*WorkList[Idx]);
+ // Error out if we couldn't legalize this instruction. We may want to
+ // fall back to DAG ISel instead in the future.
+ if (Res == LegalizerHelper::UnableToLegalize) {
+ Helper.MIRBuilder.stopRecordingInsertions();
+ reportGISelFailure(MF, TPC, MORE, "gisel-legalize",
+ "unable to legalize instruction",
+ *WorkList[Idx]);
+ return false;
+ }
- std::string Msg;
- raw_string_ostream OS(Msg);
- OS << "unable to legalize instruction: ";
- MI->print(OS);
- report_fatal_error(OS.str());
- }
-
- Changed |= Res == LegalizerHelper::Legalized;
- }
+ Changed |= Res == LegalizerHelper::Legalized;
+ ++Idx;
+ } while (Idx < WorkList.size());
+ Helper.MIRBuilder.stopRecordingInsertions();
+ }
MachineRegisterInfo &MRI = MF.getRegInfo();
const TargetInstrInfo &TII = *MF.getSubtarget().getInstrInfo();
@@ -173,6 +208,7 @@ bool Legalizer::runOnMachineFunction(MachineFunction &MF) {
NextMI = std::next(MI);
Changed |= combineExtracts(*MI, MRI, TII);
+ Changed |= combineMerges(*MI, MRI, TII);
}
}
diff --git a/lib/CodeGen/GlobalISel/LegalizerHelper.cpp b/lib/CodeGen/GlobalISel/LegalizerHelper.cpp
index eb25b6ca268f..20358f7ee6c2 100644
--- a/lib/CodeGen/GlobalISel/LegalizerHelper.cpp
+++ b/lib/CodeGen/GlobalISel/LegalizerHelper.cpp
@@ -29,14 +29,13 @@
using namespace llvm;
LegalizerHelper::LegalizerHelper(MachineFunction &MF)
- : MRI(MF.getRegInfo()) {
+ : MRI(MF.getRegInfo()), LI(*MF.getSubtarget().getLegalizerInfo()) {
MIRBuilder.setMF(MF);
}
LegalizerHelper::LegalizeResult
-LegalizerHelper::legalizeInstrStep(MachineInstr &MI,
- const LegalizerInfo &LegalizerInfo) {
- auto Action = LegalizerInfo.getAction(MI, MRI);
+LegalizerHelper::legalizeInstrStep(MachineInstr &MI) {
+ auto Action = LI.getAction(MI, MRI);
switch (std::get<0>(Action)) {
case LegalizerInfo::Legal:
return AlreadyLegal;
@@ -50,46 +49,32 @@ LegalizerHelper::legalizeInstrStep(MachineInstr &MI,
return lower(MI, std::get<1>(Action), std::get<2>(Action));
case LegalizerInfo::FewerElements:
return fewerElementsVector(MI, std::get<1>(Action), std::get<2>(Action));
+ case LegalizerInfo::Custom:
+ return LI.legalizeCustom(MI, MRI, MIRBuilder) ? Legalized
+ : UnableToLegalize;
default:
return UnableToLegalize;
}
}
-LegalizerHelper::LegalizeResult
-LegalizerHelper::legalizeInstr(MachineInstr &MI,
- const LegalizerInfo &LegalizerInfo) {
- SmallVector<MachineInstr *, 4> WorkList;
- MIRBuilder.recordInsertions(
- [&](MachineInstr *MI) { WorkList.push_back(MI); });
- WorkList.push_back(&MI);
-
- bool Changed = false;
- LegalizeResult Res;
- unsigned Idx = 0;
- do {
- Res = legalizeInstrStep(*WorkList[Idx], LegalizerInfo);
- if (Res == UnableToLegalize) {
- MIRBuilder.stopRecordingInsertions();
- return UnableToLegalize;
- }
- Changed |= Res == Legalized;
- ++Idx;
- } while (Idx < WorkList.size());
-
- MIRBuilder.stopRecordingInsertions();
-
- return Changed ? Legalized : AlreadyLegal;
-}
-
void LegalizerHelper::extractParts(unsigned Reg, LLT Ty, int NumParts,
SmallVectorImpl<unsigned> &VRegs) {
- unsigned Size = Ty.getSizeInBits();
- SmallVector<uint64_t, 4> Indexes;
- for (int i = 0; i < NumParts; ++i) {
+ for (int i = 0; i < NumParts; ++i)
VRegs.push_back(MRI.createGenericVirtualRegister(Ty));
- Indexes.push_back(i * Size);
+ MIRBuilder.buildUnmerge(VRegs, Reg);
+}
+
+static RTLIB::Libcall getRTLibDesc(unsigned Opcode, unsigned Size) {
+ switch (Opcode) {
+ case TargetOpcode::G_FADD:
+ assert((Size == 32 || Size == 64) && "Unsupported size");
+ return Size == 64 ? RTLIB::ADD_F64 : RTLIB::ADD_F32;
+ case TargetOpcode::G_FREM:
+ return Size == 64 ? RTLIB::REM_F64 : RTLIB::REM_F32;
+ case TargetOpcode::G_FPOW:
+ return Size == 64 ? RTLIB::POW_F64 : RTLIB::POW_F32;
}
- MIRBuilder.buildExtract(VRegs, Indexes, Reg);
+ llvm_unreachable("Unknown libcall function");
}
LegalizerHelper::LegalizeResult
@@ -101,17 +86,19 @@ LegalizerHelper::libcall(MachineInstr &MI) {
switch (MI.getOpcode()) {
default:
return UnableToLegalize;
+ case TargetOpcode::G_FADD:
+ case TargetOpcode::G_FPOW:
case TargetOpcode::G_FREM: {
auto &Ctx = MIRBuilder.getMF().getFunction()->getContext();
Type *Ty = Size == 64 ? Type::getDoubleTy(Ctx) : Type::getFloatTy(Ctx);
auto &CLI = *MIRBuilder.getMF().getSubtarget().getCallLowering();
auto &TLI = *MIRBuilder.getMF().getSubtarget().getTargetLowering();
- const char *Name =
- TLI.getLibcallName(Size == 64 ? RTLIB::REM_F64 : RTLIB::REM_F32);
-
+ auto Libcall = getRTLibDesc(MI.getOpcode(), Size);
+ const char *Name = TLI.getLibcallName(Libcall);
+ MIRBuilder.getMF().getFrameInfo().setHasCalls(true);
CLI.lowerCall(
- MIRBuilder, MachineOperand::CreateES(Name),
- {MI.getOperand(0).getReg(), Ty},
+ MIRBuilder, TLI.getLibcallCallingConv(Libcall),
+ MachineOperand::CreateES(Name), {MI.getOperand(0).getReg(), Ty},
{{MI.getOperand(1).getReg(), Ty}, {MI.getOperand(2).getReg(), Ty}});
MI.eraseFromParent();
return Legalized;
@@ -125,19 +112,18 @@ LegalizerHelper::LegalizeResult LegalizerHelper::narrowScalar(MachineInstr &MI,
// FIXME: Don't know how to handle secondary types yet.
if (TypeIdx != 0)
return UnableToLegalize;
+
+ MIRBuilder.setInstr(MI);
+
switch (MI.getOpcode()) {
default:
return UnableToLegalize;
case TargetOpcode::G_ADD: {
// Expand in terms of carry-setting/consuming G_ADDE instructions.
- unsigned NarrowSize = NarrowTy.getSizeInBits();
int NumParts = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits() /
NarrowTy.getSizeInBits();
- MIRBuilder.setInstr(MI);
-
SmallVector<unsigned, 2> Src1Regs, Src2Regs, DstRegs;
- SmallVector<uint64_t, 2> Indexes;
extractParts(MI.getOperand(1).getReg(), NarrowTy, NumParts, Src1Regs);
extractParts(MI.getOperand(2).getReg(), NarrowTy, NumParts, Src2Regs);
@@ -152,11 +138,138 @@ LegalizerHelper::LegalizeResult LegalizerHelper::narrowScalar(MachineInstr &MI,
Src2Regs[i], CarryIn);
DstRegs.push_back(DstReg);
- Indexes.push_back(i * NarrowSize);
CarryIn = CarryOut;
}
unsigned DstReg = MI.getOperand(0).getReg();
- MIRBuilder.buildSequence(DstReg, DstRegs, Indexes);
+ MIRBuilder.buildMerge(DstReg, DstRegs);
+ MI.eraseFromParent();
+ return Legalized;
+ }
+ case TargetOpcode::G_INSERT: {
+ if (TypeIdx != 0)
+ return UnableToLegalize;
+
+ int64_t NarrowSize = NarrowTy.getSizeInBits();
+ int NumParts =
+ MRI.getType(MI.getOperand(0).getReg()).getSizeInBits() / NarrowSize;
+
+ SmallVector<unsigned, 2> SrcRegs, DstRegs;
+ SmallVector<uint64_t, 2> Indexes;
+ extractParts(MI.getOperand(1).getReg(), NarrowTy, NumParts, SrcRegs);
+
+ unsigned OpReg = MI.getOperand(2).getReg();
+ int64_t OpStart = MI.getOperand(3).getImm();
+ int64_t OpSize = MRI.getType(OpReg).getSizeInBits();
+ for (int i = 0; i < NumParts; ++i) {
+ unsigned DstStart = i * NarrowSize;
+
+ if (DstStart + NarrowSize <= OpStart || DstStart >= OpStart + OpSize) {
+ // No part of the insert affects this subregister, forward the original.
+ DstRegs.push_back(SrcRegs[i]);
+ continue;
+ } else if (DstStart == OpStart && NarrowTy == MRI.getType(OpReg)) {
+ // The entire subregister is defined by this insert, forward the new
+ // value.
+ DstRegs.push_back(OpReg);
+ continue;
+ }
+
+ // The insert only partially overlaps this part: compute where the
+ // overlapping segment starts in OpReg and in this destination part,
+ // and how many bits it covers.
+ int64_t ExtractOffset, InsertOffset, SegSize;
+ if (OpStart < DstStart) {
+ InsertOffset = 0;
+ ExtractOffset = DstStart - OpStart;
+ SegSize = std::min(NarrowSize, OpStart + OpSize - DstStart);
+ } else {
+ InsertOffset = OpStart - DstStart;
+ ExtractOffset = 0;
+ SegSize =
+ std::min(NarrowSize - InsertOffset, OpStart + OpSize - DstStart);
+ }
+
+ unsigned SegReg = OpReg;
+ if (ExtractOffset != 0 || SegSize != OpSize) {
+ // A genuine extract is needed.
+ SegReg = MRI.createGenericVirtualRegister(LLT::scalar(SegSize));
+ MIRBuilder.buildExtract(SegReg, OpReg, ExtractOffset);
+ }
+
+ unsigned DstReg = MRI.createGenericVirtualRegister(NarrowTy);
+ MIRBuilder.buildInsert(DstReg, SrcRegs[i], SegReg, InsertOffset);
+ DstRegs.push_back(DstReg);
+ }
+
+ assert(DstRegs.size() == (unsigned)NumParts && "not all parts covered");
+ MIRBuilder.buildMerge(MI.getOperand(0).getReg(), DstRegs);
+ MI.eraseFromParent();
+ return Legalized;
+ }
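+ // Worked example (assumed values): narrowing a 128-bit G_INSERT into
+ // 64-bit parts with OpStart = 48 and OpSize = 32, so Op covers bits
+ // [48, 80):
+ //   part 0 = bits [0, 64):   overlap [48, 64) -> ExtractOffset = 0,
+ //            InsertOffset = 48, SegSize = 16
+ //   part 1 = bits [64, 128): overlap [64, 80) -> ExtractOffset = 16,
+ //            InsertOffset = 0,  SegSize = 16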
+ case TargetOpcode::G_LOAD: {
+ unsigned NarrowSize = NarrowTy.getSizeInBits();
+ int NumParts =
+ MRI.getType(MI.getOperand(0).getReg()).getSizeInBits() / NarrowSize;
+ LLT NarrowPtrTy = LLT::pointer(
+ MRI.getType(MI.getOperand(1).getReg()).getAddressSpace(), NarrowSize);
+
+ SmallVector<unsigned, 2> DstRegs;
+ for (int i = 0; i < NumParts; ++i) {
+ unsigned DstReg = MRI.createGenericVirtualRegister(NarrowTy);
+ unsigned SrcReg = MRI.createGenericVirtualRegister(NarrowPtrTy);
+ unsigned Offset = MRI.createGenericVirtualRegister(LLT::scalar(64));
+
+ MIRBuilder.buildConstant(Offset, i * NarrowSize / 8);
+ MIRBuilder.buildGEP(SrcReg, MI.getOperand(1).getReg(), Offset);
+ // TODO: This is conservatively correct, but we probably want to split the
+ // memory operands in the future.
+ MIRBuilder.buildLoad(DstReg, SrcReg, **MI.memoperands_begin());
+
+ DstRegs.push_back(DstReg);
+ }
+ unsigned DstReg = MI.getOperand(0).getReg();
+ MIRBuilder.buildMerge(DstReg, DstRegs);
+ MI.eraseFromParent();
+ return Legalized;
+ }
+ case TargetOpcode::G_STORE: {
+ unsigned NarrowSize = NarrowTy.getSizeInBits();
+ int NumParts =
+ MRI.getType(MI.getOperand(0).getReg()).getSizeInBits() / NarrowSize;
+ LLT NarrowPtrTy = LLT::pointer(
+ MRI.getType(MI.getOperand(1).getReg()).getAddressSpace(), NarrowSize);
+
+ SmallVector<unsigned, 2> SrcRegs;
+ extractParts(MI.getOperand(0).getReg(), NarrowTy, NumParts, SrcRegs);
+
+ for (int i = 0; i < NumParts; ++i) {
+ unsigned DstReg = MRI.createGenericVirtualRegister(NarrowPtrTy);
+ unsigned Offset = MRI.createGenericVirtualRegister(LLT::scalar(64));
+ MIRBuilder.buildConstant(Offset, i * NarrowSize / 8);
+ MIRBuilder.buildGEP(DstReg, MI.getOperand(1).getReg(), Offset);
+ // TODO: This is conservatively correct, but we probably want to split the
+ // memory operands in the future.
+ MIRBuilder.buildStore(SrcRegs[i], DstReg, **MI.memoperands_begin());
+ }
+ MI.eraseFromParent();
+ return Legalized;
+ }
+ case TargetOpcode::G_CONSTANT: {
+ unsigned NarrowSize = NarrowTy.getSizeInBits();
+ int NumParts =
+ MRI.getType(MI.getOperand(0).getReg()).getSizeInBits() / NarrowSize;
+ const APInt &Cst = MI.getOperand(1).getCImm()->getValue();
+ LLVMContext &Ctx = MIRBuilder.getMF().getFunction()->getContext();
+
+ SmallVector<unsigned, 2> DstRegs;
+ for (int i = 0; i < NumParts; ++i) {
+ unsigned DstReg = MRI.createGenericVirtualRegister(NarrowTy);
+ ConstantInt *CI =
+ ConstantInt::get(Ctx, Cst.lshr(NarrowSize * i).trunc(NarrowSize));
+ MIRBuilder.buildConstant(DstReg, *CI);
+ DstRegs.push_back(DstReg);
+ }
+ unsigned DstReg = MI.getOperand(0).getReg();
+ MIRBuilder.buildMerge(DstReg, DstRegs);
MI.eraseFromParent();
return Legalized;
}
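// E.g. narrowing an s64 G_CONSTANT 0x1122334455667788 with NarrowTy = s32:
//
//   part 0 = Cst.lshr(0).trunc(32)  = 0x55667788   (low half)
//   part 1 = Cst.lshr(32).trunc(32) = 0x11223344   (high half)
//
// followed by a single G_MERGE_VALUES rebuilding the s64 value.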
@@ -175,7 +288,8 @@ LegalizerHelper::widenScalar(MachineInstr &MI, unsigned TypeIdx, LLT WideTy) {
case TargetOpcode::G_MUL:
case TargetOpcode::G_OR:
case TargetOpcode::G_XOR:
- case TargetOpcode::G_SUB: {
+ case TargetOpcode::G_SUB:
+ case TargetOpcode::G_SHL: {
// Perform operation at larger width (any extension is fine here, high bits
// don't affect the result) and then truncate the result back to the
// original type.
@@ -195,10 +309,13 @@ LegalizerHelper::widenScalar(MachineInstr &MI, unsigned TypeIdx, LLT WideTy) {
return Legalized;
}
case TargetOpcode::G_SDIV:
- case TargetOpcode::G_UDIV: {
- unsigned ExtOp = MI.getOpcode() == TargetOpcode::G_SDIV
- ? TargetOpcode::G_SEXT
- : TargetOpcode::G_ZEXT;
+ case TargetOpcode::G_UDIV:
+ case TargetOpcode::G_ASHR:
+ case TargetOpcode::G_LSHR: {
+ unsigned ExtOp = MI.getOpcode() == TargetOpcode::G_SDIV ||
+ MI.getOpcode() == TargetOpcode::G_ASHR
+ ? TargetOpcode::G_SEXT
+ : TargetOpcode::G_ZEXT;
unsigned LHSExt = MRI.createGenericVirtualRegister(WideTy);
MIRBuilder.buildInstr(ExtOp).addDef(LHSExt).addUse(
@@ -218,6 +335,85 @@ LegalizerHelper::widenScalar(MachineInstr &MI, unsigned TypeIdx, LLT WideTy) {
MI.eraseFromParent();
return Legalized;
}
+ case TargetOpcode::G_SELECT: {
+ if (TypeIdx != 0)
+ return UnableToLegalize;
+
+ // Perform operation at larger width (any extension is fine here, high bits
+ // don't affect the result) and then truncate the result back to the
+ // original type.
+ unsigned Src1Ext = MRI.createGenericVirtualRegister(WideTy);
+ unsigned Src2Ext = MRI.createGenericVirtualRegister(WideTy);
+ MIRBuilder.buildAnyExt(Src1Ext, MI.getOperand(2).getReg());
+ MIRBuilder.buildAnyExt(Src2Ext, MI.getOperand(3).getReg());
+
+ unsigned DstExt = MRI.createGenericVirtualRegister(WideTy);
+ MIRBuilder.buildInstr(TargetOpcode::G_SELECT)
+ .addDef(DstExt)
+ .addReg(MI.getOperand(1).getReg())
+ .addUse(Src1Ext)
+ .addUse(Src2Ext);
+
+ MIRBuilder.buildTrunc(MI.getOperand(0).getReg(), DstExt);
+ MI.eraseFromParent();
+ return Legalized;
+ }
+ case TargetOpcode::G_FPTOSI:
+ case TargetOpcode::G_FPTOUI: {
+ if (TypeIdx != 0)
+ return UnableToLegalize;
+
+ unsigned DstExt = MRI.createGenericVirtualRegister(WideTy);
+ MIRBuilder.buildInstr(MI.getOpcode())
+ .addDef(DstExt)
+ .addUse(MI.getOperand(1).getReg());
+
+ MIRBuilder.buildTrunc(MI.getOperand(0).getReg(), DstExt);
+ MI.eraseFromParent();
+ return Legalized;
+ }
+ case TargetOpcode::G_SITOFP:
+ case TargetOpcode::G_UITOFP: {
+ if (TypeIdx != 1)
+ return UnableToLegalize;
+
+ unsigned Src = MI.getOperand(1).getReg();
+ unsigned SrcExt = MRI.createGenericVirtualRegister(WideTy);
+
+ if (MI.getOpcode() == TargetOpcode::G_SITOFP) {
+ MIRBuilder.buildSExt(SrcExt, Src);
+ } else {
+ assert(MI.getOpcode() == TargetOpcode::G_UITOFP && "Unexpected conv op");
+ MIRBuilder.buildZExt(SrcExt, Src);
+ }
+
+ MIRBuilder.buildInstr(MI.getOpcode())
+ .addDef(MI.getOperand(0).getReg())
+ .addUse(SrcExt);
+
+ MI.eraseFromParent();
+ return Legalized;
+ }
+ case TargetOpcode::G_INSERT: {
+ if (TypeIdx != 0)
+ return UnableToLegalize;
+
+ unsigned Src = MI.getOperand(1).getReg();
+ unsigned SrcExt = MRI.createGenericVirtualRegister(WideTy);
+ MIRBuilder.buildAnyExt(SrcExt, Src);
+
+ unsigned DstExt = MRI.createGenericVirtualRegister(WideTy);
+ auto MIB = MIRBuilder.buildInsert(DstExt, SrcExt, MI.getOperand(2).getReg(),
+ MI.getOperand(3).getImm());
+ for (unsigned OpNum = 4; OpNum < MI.getNumOperands(); OpNum += 2) {
+ MIB.addReg(MI.getOperand(OpNum).getReg());
+ MIB.addImm(MI.getOperand(OpNum + 1).getImm());
+ }
+
+ MIRBuilder.buildTrunc(MI.getOperand(0).getReg(), DstExt);
+ MI.eraseFromParent();
+ return Legalized;
+ }
case TargetOpcode::G_LOAD: {
assert(alignTo(MRI.getType(MI.getOperand(0).getReg()).getSizeInBits(), 8) ==
WideTy.getSizeInBits() &&
@@ -231,12 +427,24 @@ LegalizerHelper::widenScalar(MachineInstr &MI, unsigned TypeIdx, LLT WideTy) {
return Legalized;
}
case TargetOpcode::G_STORE: {
- assert(alignTo(MRI.getType(MI.getOperand(0).getReg()).getSizeInBits(), 8) ==
- WideTy.getSizeInBits() &&
- "illegal to increase number of bytes modified by a store");
+ if (MRI.getType(MI.getOperand(0).getReg()) != LLT::scalar(1) ||
+ WideTy != LLT::scalar(8))
+ return UnableToLegalize;
+
+ auto &TLI = *MIRBuilder.getMF().getSubtarget().getTargetLowering();
+ auto Content = TLI.getBooleanContents(false, false);
+
+    unsigned ExtOp = TargetOpcode::G_ANYEXT;
+    if (Content == TargetLoweringBase::ZeroOrOneBooleanContent)
+      ExtOp = TargetOpcode::G_ZEXT;
+    else if (Content == TargetLoweringBase::ZeroOrNegativeOneBooleanContent)
+      ExtOp = TargetOpcode::G_SEXT;
unsigned SrcExt = MRI.createGenericVirtualRegister(WideTy);
- MIRBuilder.buildAnyExt(SrcExt, MI.getOperand(0).getReg());
+ MIRBuilder.buildInstr(ExtOp).addDef(SrcExt).addUse(
+ MI.getOperand(0).getReg());
MIRBuilder.buildStore(SrcExt, MI.getOperand(1).getReg(),
**MI.memoperands_begin());
MI.eraseFromParent();
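
To make the boolean-contents logic above concrete, here is a standalone C++ model of how the chosen extension determines the byte stored for an s1 value; the enum is illustrative, not the TargetLoweringBase one.

    #include <cassert>
    #include <cstdint>

    enum class BoolContent { ZeroOrOne, ZeroOrNegOne, Undefined };

    // Widening an s1 store to s8: the target's boolean encoding picks the
    // extension, which in turn picks the in-memory byte.
    static uint8_t widenBoolForStore(bool B, BoolContent C) {
      switch (C) {
      case BoolContent::ZeroOrOne:    // G_ZEXT: store 0 or 1
        return B ? 1 : 0;
      case BoolContent::ZeroOrNegOne: // G_SEXT: store 0 or 0xFF
        return B ? 0xFF : 0;
      case BoolContent::Undefined:    // G_ANYEXT: only bit 0 is meaningful
        return B ? 0xA1 : 0xA0;       // arbitrary upper bits
      }
      return 0;
    }

    int main() {
      assert(widenBoolForStore(true, BoolContent::ZeroOrNegOne) == 0xFF);
      assert((widenBoolForStore(true, BoolContent::Undefined) & 1) == 1);
    }
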
@@ -315,6 +523,83 @@ LegalizerHelper::lower(MachineInstr &MI, unsigned TypeIdx, LLT Ty) {
MI.eraseFromParent();
return Legalized;
}
+ case TargetOpcode::G_SMULO:
+ case TargetOpcode::G_UMULO: {
+ // Generate G_UMULH/G_SMULH to check for overflow and a normal G_MUL for the
+ // result.
+ unsigned Res = MI.getOperand(0).getReg();
+ unsigned Overflow = MI.getOperand(1).getReg();
+ unsigned LHS = MI.getOperand(2).getReg();
+ unsigned RHS = MI.getOperand(3).getReg();
+
+ MIRBuilder.buildMul(Res, LHS, RHS);
+
+ unsigned Opcode = MI.getOpcode() == TargetOpcode::G_SMULO
+ ? TargetOpcode::G_SMULH
+ : TargetOpcode::G_UMULH;
+
+ unsigned HiPart = MRI.createGenericVirtualRegister(Ty);
+ MIRBuilder.buildInstr(Opcode)
+ .addDef(HiPart)
+ .addUse(LHS)
+ .addUse(RHS);
+
+ unsigned Zero = MRI.createGenericVirtualRegister(Ty);
+ MIRBuilder.buildConstant(Zero, 0);
+ MIRBuilder.buildICmp(CmpInst::ICMP_NE, Overflow, HiPart, Zero);
+ MI.eraseFromParent();
+ return Legalized;
+ }
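
A standalone model of the unsigned case of this lowering: a 32x32 multiply overflows exactly when the high half of the full 64-bit product, which is what G_UMULH computes, is nonzero. The signed lowering above substitutes G_SMULH for the high half.

    #include <cassert>
    #include <cstdint>

    static bool umulo32(uint32_t LHS, uint32_t RHS, uint32_t &Res) {
      uint64_t Full = static_cast<uint64_t>(LHS) * RHS;
      Res = static_cast<uint32_t>(Full);                   // the G_MUL result
      uint32_t HiPart = static_cast<uint32_t>(Full >> 32); // G_UMULH
      return HiPart != 0;                                  // G_ICMP NE zero
    }

    int main() {
      uint32_t R;
      assert(!umulo32(1u << 15, 1u << 15, R) && R == (1u << 30));
      assert(umulo32(1u << 16, 1u << 16, R)); // 2^32 overflows, wraps to 0
    }
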
+ case TargetOpcode::G_FNEG: {
+ // TODO: Handle vector types once we are able to
+ // represent them.
+ if (Ty.isVector())
+ return UnableToLegalize;
+ unsigned Res = MI.getOperand(0).getReg();
+ Type *ZeroTy;
+ LLVMContext &Ctx = MIRBuilder.getMF().getFunction()->getContext();
+ switch (Ty.getSizeInBits()) {
+ case 16:
+ ZeroTy = Type::getHalfTy(Ctx);
+ break;
+ case 32:
+ ZeroTy = Type::getFloatTy(Ctx);
+ break;
+ case 64:
+ ZeroTy = Type::getDoubleTy(Ctx);
+ break;
+ default:
+ llvm_unreachable("unexpected floating-point type");
+ }
+ ConstantFP &ZeroForNegation =
+ *cast<ConstantFP>(ConstantFP::getZeroValueForNegation(ZeroTy));
+ unsigned Zero = MRI.createGenericVirtualRegister(Ty);
+ MIRBuilder.buildFConstant(Zero, ZeroForNegation);
+ MIRBuilder.buildInstr(TargetOpcode::G_FSUB)
+ .addDef(Res)
+ .addUse(Zero)
+ .addUse(MI.getOperand(1).getReg());
+ MI.eraseFromParent();
+ return Legalized;
+ }
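
A standalone numeric check of why the lowering subtracts from the zero returned by getZeroValueForNegation (negative zero for FP types) rather than plain +0.0: subtracting from +0.0 would give the wrong sign for a +0.0 input.

    #include <cassert>
    #include <cmath>

    static double fnegViaSub(double X) {
      return -0.0 - X; // the G_FSUB with a -0.0 G_FCONSTANT
    }

    int main() {
      assert(fnegViaSub(1.5) == -1.5);
      assert(std::signbit(fnegViaSub(0.0)));   // +0.0 -> -0.0
      assert(!std::signbit(fnegViaSub(-0.0))); // -0.0 -> +0.0
    }
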
+ case TargetOpcode::G_FSUB: {
+ // Lower (G_FSUB LHS, RHS) to (G_FADD LHS, (G_FNEG RHS)).
+ // First, check if G_FNEG is marked as Lower. If so, we may
+ // end up with an infinite loop as G_FSUB is used to legalize G_FNEG.
+ if (LI.getAction({G_FNEG, Ty}).first == LegalizerInfo::Lower)
+ return UnableToLegalize;
+ unsigned Res = MI.getOperand(0).getReg();
+ unsigned LHS = MI.getOperand(1).getReg();
+ unsigned RHS = MI.getOperand(2).getReg();
+ unsigned Neg = MRI.createGenericVirtualRegister(Ty);
+ MIRBuilder.buildInstr(TargetOpcode::G_FNEG).addDef(Neg).addUse(RHS);
+ MIRBuilder.buildInstr(TargetOpcode::G_FADD)
+ .addDef(Res)
+ .addUse(LHS)
+ .addUse(Neg);
+ MI.eraseFromParent();
+ return Legalized;
+ }
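
The guard above breaks the potential FSUB -> FNEG -> FSUB rewrite cycle. The rewrite itself is exact: IEEE subtraction is defined as addition of the negated operand, as this standalone check illustrates.

    #include <cassert>

    // a - b and a + (-b) are bit-identical for IEEE doubles, so lowering
    // G_FSUB to a G_FADD of a G_FNEG loses nothing.
    static double fsubViaFAdd(double A, double B) { return A + (-B); }

    int main() {
      assert(fsubViaFAdd(5.0, 3.0) == 5.0 - 3.0);
      assert(fsubViaFAdd(0.1, 0.3) == 0.1 - 0.3);
    }
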
}
}
@@ -335,7 +620,6 @@ LegalizerHelper::fewerElementsVector(MachineInstr &MI, unsigned TypeIdx,
MIRBuilder.setInstr(MI);
SmallVector<unsigned, 2> Src1Regs, Src2Regs, DstRegs;
- SmallVector<uint64_t, 2> Indexes;
extractParts(MI.getOperand(1).getReg(), NarrowTy, NumParts, Src1Regs);
extractParts(MI.getOperand(2).getReg(), NarrowTy, NumParts, Src2Regs);
@@ -343,10 +627,9 @@ LegalizerHelper::fewerElementsVector(MachineInstr &MI, unsigned TypeIdx,
unsigned DstReg = MRI.createGenericVirtualRegister(NarrowTy);
MIRBuilder.buildAdd(DstReg, Src1Regs[i], Src2Regs[i]);
DstRegs.push_back(DstReg);
- Indexes.push_back(i * NarrowSize);
}
- MIRBuilder.buildSequence(DstReg, DstRegs, Indexes);
+ MIRBuilder.buildMerge(DstReg, DstRegs);
MI.eraseFromParent();
return Legalized;
}
diff --git a/lib/CodeGen/GlobalISel/LegalizerInfo.cpp b/lib/CodeGen/GlobalISel/LegalizerInfo.cpp
index e49662075ed5..eaf4056e47ea 100644
--- a/lib/CodeGen/GlobalISel/LegalizerInfo.cpp
+++ b/lib/CodeGen/GlobalISel/LegalizerInfo.cpp
@@ -41,6 +41,8 @@ LegalizerInfo::LegalizerInfo() : TablesInitialized(false) {
DefaultActions[TargetOpcode::G_STORE] = NarrowScalar;
DefaultActions[TargetOpcode::G_BRCOND] = WidenScalar;
+ DefaultActions[TargetOpcode::G_INSERT] = NarrowScalar;
+ DefaultActions[TargetOpcode::G_FNEG] = Lower;
}
void LegalizerInfo::computeTables() {
@@ -71,28 +73,36 @@ LegalizerInfo::getAction(const InstrAspect &Aspect) const {
// These *have* to be implemented for now, they're the fundamental basis of
// how everything else is transformed.
- // Nothing is going to go well with types that aren't a power of 2 yet, so
- // don't even try because we might make things worse.
- if (!isPowerOf2_64(Aspect.Type.getSizeInBits()))
- return std::make_pair(Unsupported, LLT());
-
// FIXME: the long-term plan calls for expansion in terms of load/store (if
// they're not legal).
if (Aspect.Opcode == TargetOpcode::G_SEQUENCE ||
- Aspect.Opcode == TargetOpcode::G_EXTRACT)
+ Aspect.Opcode == TargetOpcode::G_EXTRACT ||
+ Aspect.Opcode == TargetOpcode::G_MERGE_VALUES ||
+ Aspect.Opcode == TargetOpcode::G_UNMERGE_VALUES)
return std::make_pair(Legal, Aspect.Type);
+ LLT Ty = Aspect.Type;
LegalizeAction Action = findInActions(Aspect);
+ // LegalizerHelper is not able to handle non-power-of-2 types right now, so do
+ // not try to legalize them unless they are marked as Legal or Custom.
+ // FIXME: This is a temporary hack until the general non-power-of-2
+ // legalization works.
+ if (!isPowerOf2_64(Ty.getSizeInBits()) &&
+ !(Action == Legal || Action == Custom))
+ return std::make_pair(Unsupported, LLT());
+
if (Action != NotFound)
return findLegalAction(Aspect, Action);
unsigned Opcode = Aspect.Opcode;
- LLT Ty = Aspect.Type;
if (!Ty.isVector()) {
auto DefaultAction = DefaultActions.find(Aspect.Opcode);
if (DefaultAction != DefaultActions.end() && DefaultAction->second == Legal)
return std::make_pair(Legal, Ty);
+ if (DefaultAction != DefaultActions.end() && DefaultAction->second == Lower)
+ return std::make_pair(Lower, Ty);
+
if (DefaultAction == DefaultActions.end() ||
DefaultAction->second != NarrowScalar)
return std::make_pair(Unsupported, LLT());
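
A standalone sketch of the gate added above: non-power-of-2 sizes only pass through when the target explicitly marked them Legal or Custom, and everything else is rejected before the helper can mishandle it. The enum is illustrative, not the LegalizerInfo one.

    #include <cassert>
    #include <cstdint>

    enum class Action { Legal, Custom, NarrowScalar, Unsupported };

    static bool isPow2(uint64_t Bits) {
      return Bits != 0 && (Bits & (Bits - 1)) == 0;
    }

    static Action gate(uint64_t SizeInBits, Action Found) {
      if (!isPow2(SizeInBits) && Found != Action::Legal &&
          Found != Action::Custom)
        return Action::Unsupported; // e.g. an s24 add with no explicit rule
      return Found;
    }

    int main() {
      assert(gate(24, Action::NarrowScalar) == Action::Unsupported);
      assert(gate(24, Action::Custom) == Action::Custom);
      assert(gate(32, Action::NarrowScalar) == Action::NarrowScalar);
    }
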
@@ -160,6 +170,7 @@ LLT LegalizerInfo::findLegalType(const InstrAspect &Aspect,
case Legal:
case Lower:
case Libcall:
+ case Custom:
return Aspect.Type;
case NarrowScalar: {
return findLegalType(Aspect,
@@ -180,3 +191,9 @@ LLT LegalizerInfo::findLegalType(const InstrAspect &Aspect,
}
}
}
+
+bool LegalizerInfo::legalizeCustom(MachineInstr &MI,
+ MachineRegisterInfo &MRI,
+ MachineIRBuilder &MIRBuilder) const {
+ return false;
+}
diff --git a/lib/CodeGen/GlobalISel/MachineIRBuilder.cpp b/lib/CodeGen/GlobalISel/MachineIRBuilder.cpp
index c04f6e4ae897..8d1a263395a0 100644
--- a/lib/CodeGen/GlobalISel/MachineIRBuilder.cpp
+++ b/lib/CodeGen/GlobalISel/MachineIRBuilder.cpp
@@ -15,6 +15,7 @@
#include "llvm/CodeGen/MachineInstr.h"
#include "llvm/CodeGen/MachineInstrBuilder.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/IR/DebugInfo.h"
#include "llvm/Target/TargetInstrInfo.h"
#include "llvm/Target/TargetOpcodes.h"
#include "llvm/Target/TargetSubtargetInfo.h"
@@ -54,7 +55,7 @@ void MachineIRBuilder::setInsertPt(MachineBasicBlock &MBB,
void MachineIRBuilder::recordInsertions(
std::function<void(MachineInstr *)> Inserted) {
- InsertedInstr = Inserted;
+ InsertedInstr = std::move(Inserted);
}
void MachineIRBuilder::stopRecordingInsertions() {
@@ -82,6 +83,70 @@ MachineInstrBuilder MachineIRBuilder::insertInstr(MachineInstrBuilder MIB) {
return MIB;
}
+MachineInstrBuilder MachineIRBuilder::buildDirectDbgValue(
+ unsigned Reg, const MDNode *Variable, const MDNode *Expr) {
+ assert(isa<DILocalVariable>(Variable) && "not a variable");
+ assert(cast<DIExpression>(Expr)->isValid() && "not an expression");
+ assert(cast<DILocalVariable>(Variable)->isValidLocationForIntrinsic(DL) &&
+ "Expected inlined-at fields to agree");
+ return buildInstr(TargetOpcode::DBG_VALUE)
+ .addReg(Reg, RegState::Debug)
+ .addReg(0, RegState::Debug)
+ .addMetadata(Variable)
+ .addMetadata(Expr);
+}
+
+MachineInstrBuilder MachineIRBuilder::buildIndirectDbgValue(
+ unsigned Reg, unsigned Offset, const MDNode *Variable, const MDNode *Expr) {
+ assert(isa<DILocalVariable>(Variable) && "not a variable");
+ assert(cast<DIExpression>(Expr)->isValid() && "not an expression");
+ assert(cast<DILocalVariable>(Variable)->isValidLocationForIntrinsic(DL) &&
+ "Expected inlined-at fields to agree");
+ return buildInstr(TargetOpcode::DBG_VALUE)
+ .addReg(Reg, RegState::Debug)
+ .addImm(Offset)
+ .addMetadata(Variable)
+ .addMetadata(Expr);
+}
+
+MachineInstrBuilder MachineIRBuilder::buildFIDbgValue(int FI,
+ const MDNode *Variable,
+ const MDNode *Expr) {
+ assert(isa<DILocalVariable>(Variable) && "not a variable");
+ assert(cast<DIExpression>(Expr)->isValid() && "not an expression");
+ assert(cast<DILocalVariable>(Variable)->isValidLocationForIntrinsic(DL) &&
+ "Expected inlined-at fields to agree");
+ return buildInstr(TargetOpcode::DBG_VALUE)
+ .addFrameIndex(FI)
+ .addImm(0)
+ .addMetadata(Variable)
+ .addMetadata(Expr);
+}
+
+MachineInstrBuilder MachineIRBuilder::buildConstDbgValue(const Constant &C,
+ unsigned Offset,
+ const MDNode *Variable,
+ const MDNode *Expr) {
+ assert(isa<DILocalVariable>(Variable) && "not a variable");
+ assert(cast<DIExpression>(Expr)->isValid() && "not an expression");
+ assert(cast<DILocalVariable>(Variable)->isValidLocationForIntrinsic(DL) &&
+ "Expected inlined-at fields to agree");
+ auto MIB = buildInstr(TargetOpcode::DBG_VALUE);
+ if (auto *CI = dyn_cast<ConstantInt>(&C)) {
+ if (CI->getBitWidth() > 64)
+ MIB.addCImm(CI);
+ else
+ MIB.addImm(CI->getZExtValue());
+ } else if (auto *CFP = dyn_cast<ConstantFP>(&C)) {
+ MIB.addFPImm(CFP);
+ } else {
+ // Insert %noreg if we didn't find a usable constant and had to drop it.
+ MIB.addReg(0U);
+ }
+
+ return MIB.addImm(Offset).addMetadata(Variable).addMetadata(Expr);
+}
+
MachineInstrBuilder MachineIRBuilder::buildFrameIndex(unsigned Res, int Idx) {
assert(MRI->getType(Res).isPointer() && "invalid operand type");
return buildInstr(TargetOpcode::G_FRAME_INDEX)
@@ -126,6 +191,17 @@ MachineInstrBuilder MachineIRBuilder::buildGEP(unsigned Res, unsigned Op0,
.addUse(Op1);
}
+MachineInstrBuilder MachineIRBuilder::buildPtrMask(unsigned Res, unsigned Op0,
+ uint32_t NumBits) {
+ assert(MRI->getType(Res).isPointer() &&
+ MRI->getType(Res) == MRI->getType(Op0) && "type mismatch");
+
+ return buildInstr(TargetOpcode::G_PTR_MASK)
+ .addDef(Res)
+ .addUse(Op0)
+ .addImm(NumBits);
+}
+
MachineInstrBuilder MachineIRBuilder::buildSub(unsigned Res, unsigned Op0,
unsigned Op1) {
assert((MRI->getType(Res).isScalar() || MRI->getType(Res).isVector()) &&
@@ -152,10 +228,27 @@ MachineInstrBuilder MachineIRBuilder::buildMul(unsigned Res, unsigned Op0,
.addUse(Op1);
}
+MachineInstrBuilder MachineIRBuilder::buildAnd(unsigned Res, unsigned Op0,
+ unsigned Op1) {
+ assert((MRI->getType(Res).isScalar() || MRI->getType(Res).isVector()) &&
+ "invalid operand type");
+ assert(MRI->getType(Res) == MRI->getType(Op0) &&
+ MRI->getType(Res) == MRI->getType(Op1) && "type mismatch");
+
+ return buildInstr(TargetOpcode::G_AND)
+ .addDef(Res)
+ .addUse(Op0)
+ .addUse(Op1);
+}
+
MachineInstrBuilder MachineIRBuilder::buildBr(MachineBasicBlock &Dest) {
return buildInstr(TargetOpcode::G_BR).addMBB(&Dest);
}
+MachineInstrBuilder MachineIRBuilder::buildBrIndirect(unsigned Tgt) {
+ return buildInstr(TargetOpcode::G_BRINDIRECT).addUse(Tgt);
+}
+
MachineInstrBuilder MachineIRBuilder::buildCopy(unsigned Res, unsigned Op) {
return buildInstr(TargetOpcode::COPY).addDef(Res).addUse(Op);
}
@@ -262,34 +355,56 @@ MachineInstrBuilder MachineIRBuilder::buildSExtOrTrunc(unsigned Res,
return buildInstr(Opcode).addDef(Res).addUse(Op);
}
-MachineInstrBuilder MachineIRBuilder::buildExtract(ArrayRef<unsigned> Results,
- ArrayRef<uint64_t> Indices,
- unsigned Src) {
-#ifndef NDEBUG
- assert(Results.size() == Indices.size() && "inconsistent number of regs");
- assert(!Results.empty() && "invalid trivial extract");
- assert(std::is_sorted(Indices.begin(), Indices.end()) &&
- "extract offsets must be in ascending order");
+MachineInstrBuilder MachineIRBuilder::buildZExtOrTrunc(unsigned Res,
+ unsigned Op) {
+ unsigned Opcode = TargetOpcode::COPY;
+ if (MRI->getType(Res).getSizeInBits() > MRI->getType(Op).getSizeInBits())
+ Opcode = TargetOpcode::G_ZEXT;
+ else if (MRI->getType(Res).getSizeInBits() < MRI->getType(Op).getSizeInBits())
+ Opcode = TargetOpcode::G_TRUNC;
- assert(MRI->getType(Src).isValid() && "invalid operand type");
- for (auto Res : Results)
- assert(MRI->getType(Res).isValid() && "invalid operand type");
-#endif
+ return buildInstr(Opcode).addDef(Res).addUse(Op);
+  return buildInstr(Opcode).addDef(Res).addUse(Op);
+}
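
The size comparison in buildZExtOrTrunc reduces to a three-way choice; a minimal standalone model:

    #include <cassert>

    enum class Op { Copy, ZExt, Trunc };

    static Op pickZExtOrTrunc(unsigned DstBits, unsigned SrcBits) {
      if (DstBits > SrcBits)
        return Op::ZExt;
      if (DstBits < SrcBits)
        return Op::Trunc;
      return Op::Copy; // same size: a plain COPY suffices
    }

    int main() {
      assert(pickZExtOrTrunc(64, 32) == Op::ZExt);
      assert(pickZExtOrTrunc(16, 32) == Op::Trunc);
      assert(pickZExtOrTrunc(32, 32) == Op::Copy);
    }
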
- auto MIB = BuildMI(getMF(), DL, getTII().get(TargetOpcode::G_EXTRACT));
- for (auto Res : Results)
- MIB.addDef(Res);
- MIB.addUse(Src);
+MachineInstrBuilder MachineIRBuilder::buildCast(unsigned Dst, unsigned Src) {
+ LLT SrcTy = MRI->getType(Src);
+ LLT DstTy = MRI->getType(Dst);
+ if (SrcTy == DstTy)
+ return buildCopy(Dst, Src);
+
+ unsigned Opcode;
+ if (SrcTy.isPointer() && DstTy.isScalar())
+ Opcode = TargetOpcode::G_PTRTOINT;
+ else if (DstTy.isPointer() && SrcTy.isScalar())
+ Opcode = TargetOpcode::G_INTTOPTR;
+ else {
+    assert(!SrcTy.isPointer() && !DstTy.isPointer() && "no G_ADDRCAST yet");
+ Opcode = TargetOpcode::G_BITCAST;
+ }
- for (auto Idx : Indices)
- MIB.addImm(Idx);
+ return buildInstr(Opcode).addDef(Dst).addUse(Src);
+}
- getMBB().insert(getInsertPt(), MIB);
- if (InsertedInstr)
- InsertedInstr(MIB);
+MachineInstrBuilder MachineIRBuilder::buildExtract(unsigned Res, unsigned Src,
+ uint64_t Index) {
+#ifndef NDEBUG
+ assert(MRI->getType(Src).isValid() && "invalid operand type");
+ assert(MRI->getType(Res).isValid() && "invalid operand type");
+ assert(Index + MRI->getType(Res).getSizeInBits() <=
+ MRI->getType(Src).getSizeInBits() &&
+ "extracting off end of register");
+#endif
- return MIB;
+ if (MRI->getType(Res).getSizeInBits() == MRI->getType(Src).getSizeInBits()) {
+    assert(Index == 0 && "extraction past the end of a register");
+ return buildCast(Res, Src);
+ }
+
+ return buildInstr(TargetOpcode::G_EXTRACT)
+ .addDef(Res)
+ .addUse(Src)
+ .addImm(Index);
}
MachineInstrBuilder
@@ -316,6 +431,64 @@ MachineIRBuilder::buildSequence(unsigned Res,
return MIB;
}
+MachineInstrBuilder MachineIRBuilder::buildUndef(unsigned Res) {
+ return buildInstr(TargetOpcode::IMPLICIT_DEF).addDef(Res);
+}
+
+MachineInstrBuilder MachineIRBuilder::buildMerge(unsigned Res,
+ ArrayRef<unsigned> Ops) {
+
+#ifndef NDEBUG
+ assert(!Ops.empty() && "invalid trivial sequence");
+ LLT Ty = MRI->getType(Ops[0]);
+ for (auto Reg : Ops)
+ assert(MRI->getType(Reg) == Ty && "type mismatch in input list");
+ assert(Ops.size() * MRI->getType(Ops[0]).getSizeInBits() ==
+ MRI->getType(Res).getSizeInBits() &&
+ "input operands do not cover output register");
+#endif
+
+ MachineInstrBuilder MIB = buildInstr(TargetOpcode::G_MERGE_VALUES);
+ MIB.addDef(Res);
+ for (unsigned i = 0; i < Ops.size(); ++i)
+ MIB.addUse(Ops[i]);
+ return MIB;
+}
+
+MachineInstrBuilder MachineIRBuilder::buildUnmerge(ArrayRef<unsigned> Res,
+ unsigned Op) {
+
+#ifndef NDEBUG
+ assert(!Res.empty() && "invalid trivial sequence");
+ LLT Ty = MRI->getType(Res[0]);
+ for (auto Reg : Res)
+ assert(MRI->getType(Reg) == Ty && "type mismatch in input list");
+ assert(Res.size() * MRI->getType(Res[0]).getSizeInBits() ==
+ MRI->getType(Op).getSizeInBits() &&
+         "destination registers do not cover the source register");
+#endif
+
+ MachineInstrBuilder MIB = buildInstr(TargetOpcode::G_UNMERGE_VALUES);
+ for (unsigned i = 0; i < Res.size(); ++i)
+ MIB.addDef(Res[i]);
+ MIB.addUse(Op);
+ return MIB;
+}
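
A standalone bit-level model of what the size assertions in buildMerge and buildUnmerge guarantee, assuming the LSB-first operand order (part 0 carries the low bits): the parts concatenate exactly into the wide register and split back losslessly.

    #include <cassert>
    #include <cstdint>

    static uint64_t merge(uint32_t Lo, uint32_t Hi) {
      return static_cast<uint64_t>(Hi) << 32 | Lo; // G_MERGE_VALUES
    }

    static void unmerge(uint64_t V, uint32_t &Lo, uint32_t &Hi) {
      Lo = static_cast<uint32_t>(V);       // G_UNMERGE_VALUES, part 0
      Hi = static_cast<uint32_t>(V >> 32); // part 1
    }

    int main() {
      uint32_t Lo, Hi;
      unmerge(merge(0xDEADBEEFu, 0x12345678u), Lo, Hi);
      assert(Lo == 0xDEADBEEFu && Hi == 0x12345678u);
    }
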
+
+MachineInstrBuilder MachineIRBuilder::buildInsert(unsigned Res, unsigned Src,
+ unsigned Op, unsigned Index) {
+ if (MRI->getType(Res).getSizeInBits() == MRI->getType(Op).getSizeInBits()) {
+ assert(Index == 0 && "insertion past the end of a register");
+ return buildCast(Res, Op);
+ }
+
+ return buildInstr(TargetOpcode::G_INSERT)
+ .addDef(Res)
+ .addUse(Src)
+ .addUse(Op)
+ .addImm(Index);
+}
+
MachineInstrBuilder MachineIRBuilder::buildIntrinsic(Intrinsic::ID ID,
unsigned Res,
bool HasSideEffects) {
@@ -395,9 +568,10 @@ MachineInstrBuilder MachineIRBuilder::buildSelect(unsigned Res, unsigned Tst,
if (ResTy.isScalar() || ResTy.isPointer())
assert(MRI->getType(Tst).isScalar() && "type mismatch");
else
- assert(MRI->getType(Tst).isVector() &&
- MRI->getType(Tst).getNumElements() ==
- MRI->getType(Op0).getNumElements() &&
+ assert((MRI->getType(Tst).isScalar() ||
+ (MRI->getType(Tst).isVector() &&
+ MRI->getType(Tst).getNumElements() ==
+ MRI->getType(Op0).getNumElements())) &&
"type mismatch");
#endif
@@ -408,6 +582,46 @@ MachineInstrBuilder MachineIRBuilder::buildSelect(unsigned Res, unsigned Tst,
.addUse(Op1);
}
+MachineInstrBuilder MachineIRBuilder::buildInsertVectorElement(unsigned Res,
+ unsigned Val,
+ unsigned Elt,
+ unsigned Idx) {
+#ifndef NDEBUG
+ LLT ResTy = MRI->getType(Res);
+ LLT ValTy = MRI->getType(Val);
+ LLT EltTy = MRI->getType(Elt);
+ LLT IdxTy = MRI->getType(Idx);
+ assert(ResTy.isVector() && ValTy.isVector() && "invalid operand type");
+ assert(EltTy.isScalar() && IdxTy.isScalar() && "invalid operand type");
+ assert(ResTy.getNumElements() == ValTy.getNumElements() && "type mismatch");
+ assert(ResTy.getElementType() == EltTy && "type mismatch");
+#endif
+
+ return buildInstr(TargetOpcode::G_INSERT_VECTOR_ELT)
+ .addDef(Res)
+ .addUse(Val)
+ .addUse(Elt)
+ .addUse(Idx);
+}
+
+MachineInstrBuilder MachineIRBuilder::buildExtractVectorElement(unsigned Res,
+ unsigned Val,
+ unsigned Idx) {
+#ifndef NDEBUG
+ LLT ResTy = MRI->getType(Res);
+ LLT ValTy = MRI->getType(Val);
+ LLT IdxTy = MRI->getType(Idx);
+ assert(ValTy.isVector() && "invalid operand type");
+ assert(ResTy.isScalar() && IdxTy.isScalar() && "invalid operand type");
+ assert(ValTy.getElementType() == ResTy && "type mismatch");
+#endif
+
+ return buildInstr(TargetOpcode::G_EXTRACT_VECTOR_ELT)
+ .addDef(Res)
+ .addUse(Val)
+ .addUse(Idx);
+}
+
void MachineIRBuilder::validateTruncExt(unsigned Dst, unsigned Src,
bool IsExtend) {
#ifndef NDEBUG
diff --git a/lib/CodeGen/GlobalISel/RegBankSelect.cpp b/lib/CodeGen/GlobalISel/RegBankSelect.cpp
index cc026ef27296..f935390a8d1b 100644
--- a/lib/CodeGen/GlobalISel/RegBankSelect.cpp
+++ b/lib/CodeGen/GlobalISel/RegBankSelect.cpp
@@ -14,6 +14,7 @@
#include "llvm/ADT/PostOrderIterator.h"
#include "llvm/CodeGen/GlobalISel/LegalizerInfo.h"
#include "llvm/CodeGen/GlobalISel/RegisterBank.h"
+#include "llvm/CodeGen/GlobalISel/Utils.h"
#include "llvm/CodeGen/MachineBlockFrequencyInfo.h"
#include "llvm/CodeGen/MachineBranchProbabilityInfo.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
@@ -71,6 +72,7 @@ void RegBankSelect::init(MachineFunction &MF) {
MBPI = nullptr;
}
MIRBuilder.setMF(MF);
+ MORE = make_unique<MachineOptimizationRemarkEmitter>(MF, MBFI);
}
void RegBankSelect::getAnalysisUsage(AnalysisUsage &AU) const {
@@ -585,18 +587,12 @@ bool RegBankSelect::runOnMachineFunction(MachineFunction &MF) {
// LegalizerInfo as it's currently in the separate GlobalISel library.
const MachineRegisterInfo &MRI = MF.getRegInfo();
if (const LegalizerInfo *MLI = MF.getSubtarget().getLegalizerInfo()) {
- for (const MachineBasicBlock &MBB : MF) {
- for (const MachineInstr &MI : MBB) {
+ for (MachineBasicBlock &MBB : MF) {
+ for (MachineInstr &MI : MBB) {
if (isPreISelGenericOpcode(MI.getOpcode()) && !MLI->isLegal(MI, MRI)) {
- if (!TPC->isGlobalISelAbortEnabled()) {
- MF.getProperties().set(
- MachineFunctionProperties::Property::FailedISel);
- return false;
- }
- std::string ErrStorage;
- raw_string_ostream Err(ErrStorage);
- Err << "Instruction is not legal: " << MI << '\n';
- report_fatal_error(Err.str());
+ reportGISelFailure(MF, *TPC, *MORE, "gisel-regbankselect",
+ "instruction is not legal", MI);
+ return false;
}
}
}
@@ -622,9 +618,8 @@ bool RegBankSelect::runOnMachineFunction(MachineFunction &MF) {
continue;
if (!assignInstr(MI)) {
- if (TPC->isGlobalISelAbortEnabled())
- report_fatal_error("Unable to map instruction");
- MF.getProperties().set(MachineFunctionProperties::Property::FailedISel);
+ reportGISelFailure(MF, *TPC, *MORE, "gisel-regbankselect",
+ "unable to map instruction", MI);
return false;
}
}
@@ -968,10 +963,12 @@ bool RegBankSelect::MappingCost::operator==(const MappingCost &Cost) const {
LocalFreq == Cost.LocalFreq;
}
-void RegBankSelect::MappingCost::dump() const {
+#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
+LLVM_DUMP_METHOD void RegBankSelect::MappingCost::dump() const {
print(dbgs());
dbgs() << '\n';
}
+#endif
void RegBankSelect::MappingCost::print(raw_ostream &OS) const {
if (*this == ImpossibleCost()) {
diff --git a/lib/CodeGen/GlobalISel/RegisterBank.cpp b/lib/CodeGen/GlobalISel/RegisterBank.cpp
index 49d676f11da6..940957d02152 100644
--- a/lib/CodeGen/GlobalISel/RegisterBank.cpp
+++ b/lib/CodeGen/GlobalISel/RegisterBank.cpp
@@ -19,10 +19,11 @@ using namespace llvm;
const unsigned RegisterBank::InvalidID = UINT_MAX;
-RegisterBank::RegisterBank(unsigned ID, const char *Name, unsigned Size,
- const uint32_t *CoveredClasses)
+RegisterBank::RegisterBank(
+ unsigned ID, const char *Name, unsigned Size,
+ const uint32_t *CoveredClasses, unsigned NumRegClasses)
: ID(ID), Name(Name), Size(Size) {
- ContainedRegClasses.resize(200);
+ ContainedRegClasses.resize(NumRegClasses);
ContainedRegClasses.setBitsInMask(CoveredClasses);
}
@@ -75,9 +76,11 @@ bool RegisterBank::operator==(const RegisterBank &OtherRB) const {
return &OtherRB == this;
}
+#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
LLVM_DUMP_METHOD void RegisterBank::dump(const TargetRegisterInfo *TRI) const {
print(dbgs(), /* IsForDebug */ true, TRI);
}
+#endif
void RegisterBank::print(raw_ostream &OS, bool IsForDebug,
const TargetRegisterInfo *TRI) const {
diff --git a/lib/CodeGen/GlobalISel/RegisterBankInfo.cpp b/lib/CodeGen/GlobalISel/RegisterBankInfo.cpp
index da5ab0b9fb7b..b2df2f159676 100644
--- a/lib/CodeGen/GlobalISel/RegisterBankInfo.cpp
+++ b/lib/CodeGen/GlobalISel/RegisterBankInfo.cpp
@@ -63,13 +63,6 @@ RegisterBankInfo::RegisterBankInfo(RegisterBank **RegBanks,
#endif // NDEBUG
}
-RegisterBankInfo::~RegisterBankInfo() {
- for (auto It : MapOfPartialMappings)
- delete It.second;
- for (auto It : MapOfValueMappings)
- delete It.second;
-}
-
bool RegisterBankInfo::verify(const TargetRegisterInfo &TRI) const {
#ifndef NDEBUG
for (unsigned Idx = 0, End = getNumRegBanks(); Idx != End; ++Idx) {
@@ -133,15 +126,26 @@ const TargetRegisterClass *RegisterBankInfo::constrainGenericRegister(
return &RC;
}
+/// Check whether or not \p MI should be treated like a copy
+/// for the mappings.
+/// Copy-like instructions are special for mapping because
+/// they don't have actual register constraints. Moreover,
+/// they sometimes have register classes assigned and we can
+/// just use that instead of failing to provide a generic mapping.
+static bool isCopyLike(const MachineInstr &MI) {
+ return MI.isCopy() || MI.isPHI() ||
+ MI.getOpcode() == TargetOpcode::REG_SEQUENCE;
+}
+
RegisterBankInfo::InstructionMapping
RegisterBankInfo::getInstrMappingImpl(const MachineInstr &MI) const {
// For copies we want to walk over the operands and try to find one
// that has a register bank since the instruction itself will not get
// us any constraint.
- bool isCopyLike = MI.isCopy() || MI.isPHI();
+ bool IsCopyLike = isCopyLike(MI);
  // For copy-like instructions, only the mapping of the definition
// is important. The rest is not constrained.
- unsigned NumOperandsForMapping = isCopyLike ? 1 : MI.getNumOperands();
+ unsigned NumOperandsForMapping = IsCopyLike ? 1 : MI.getNumOperands();
RegisterBankInfo::InstructionMapping Mapping(DefaultMappingID, /*Cost*/ 1,
/*OperandsMapping*/ nullptr,
@@ -175,7 +179,7 @@ RegisterBankInfo::getInstrMappingImpl(const MachineInstr &MI) const {
// For copy-like instruction, we want to reuse the register bank
// that is already set on Reg, if any, since those instructions do
// not have any constraints.
- const RegisterBank *CurRegBank = isCopyLike ? AltRegBank : nullptr;
+ const RegisterBank *CurRegBank = IsCopyLike ? AltRegBank : nullptr;
if (!CurRegBank) {
// If this is a target specific instruction, we can deduce
// the register bank from the encoding constraints.
@@ -184,7 +188,7 @@ RegisterBankInfo::getInstrMappingImpl(const MachineInstr &MI) const {
// All our attempts failed, give up.
CompleteMapping = false;
- if (!isCopyLike)
+ if (!IsCopyLike)
// MI does not carry enough information to guess the mapping.
return InstructionMapping();
continue;
@@ -192,7 +196,7 @@ RegisterBankInfo::getInstrMappingImpl(const MachineInstr &MI) const {
}
const ValueMapping *ValMapping =
&getValueMapping(0, getSizeInBits(Reg, MRI, TRI), *CurRegBank);
- if (isCopyLike) {
+ if (IsCopyLike) {
OperandsMapping[0] = ValMapping;
CompleteMapping = true;
break;
@@ -200,7 +204,7 @@ RegisterBankInfo::getInstrMappingImpl(const MachineInstr &MI) const {
OperandsMapping[OpIdx] = ValMapping;
}
- if (isCopyLike && !CompleteMapping)
+ if (IsCopyLike && !CompleteMapping)
// No way to deduce the type from what we have.
return InstructionMapping();
@@ -234,8 +238,8 @@ RegisterBankInfo::getPartialMapping(unsigned StartIdx, unsigned Length,
++NumPartialMappingsCreated;
- const PartialMapping *&PartMapping = MapOfPartialMappings[Hash];
- PartMapping = new PartialMapping{StartIdx, Length, RegBank};
+ auto &PartMapping = MapOfPartialMappings[Hash];
+ PartMapping = llvm::make_unique<PartialMapping>(StartIdx, Length, RegBank);
return *PartMapping;
}
@@ -268,8 +272,8 @@ RegisterBankInfo::getValueMapping(const PartialMapping *BreakDown,
++NumValueMappingsCreated;
- const ValueMapping *&ValMapping = MapOfValueMappings[Hash];
- ValMapping = new ValueMapping{BreakDown, NumBreakDowns};
+ auto &ValMapping = MapOfValueMappings[Hash];
+ ValMapping = llvm::make_unique<ValueMapping>(BreakDown, NumBreakDowns);
return *ValMapping;
}
@@ -282,9 +286,9 @@ RegisterBankInfo::getOperandsMapping(Iterator Begin, Iterator End) const {
// The addresses of the value mapping are unique.
// Therefore, we can use them directly to hash the operand mapping.
hash_code Hash = hash_combine_range(Begin, End);
- const auto &It = MapOfOperandsMappings.find(Hash);
- if (It != MapOfOperandsMappings.end())
- return It->second;
+ auto &Res = MapOfOperandsMappings[Hash];
+ if (Res)
+ return Res.get();
++NumOperandsMappingsCreated;
@@ -293,8 +297,7 @@ RegisterBankInfo::getOperandsMapping(Iterator Begin, Iterator End) const {
// mapping, because we use the pointer of the ValueMapping
// to hash and we expect them to uniquely identify an instance
// of value mapping.
- ValueMapping *&Res = MapOfOperandsMappings[Hash];
- Res = new ValueMapping[std::distance(Begin, End)];
+ Res = llvm::make_unique<ValueMapping[]>(std::distance(Begin, End));
unsigned Idx = 0;
for (Iterator It = Begin; It != End; ++It, ++Idx) {
const ValueMapping *ValMap = *It;
@@ -302,7 +305,7 @@ RegisterBankInfo::getOperandsMapping(Iterator Begin, Iterator End) const {
continue;
Res[Idx] = *ValMap;
}
- return Res;
+ return Res.get();
}
const RegisterBankInfo::ValueMapping *RegisterBankInfo::getOperandsMapping(
@@ -349,6 +352,7 @@ RegisterBankInfo::getInstrAlternativeMappings(const MachineInstr &MI) const {
void RegisterBankInfo::applyDefaultMapping(const OperandsMapper &OpdMapper) {
MachineInstr &MI = OpdMapper.getMI();
+ MachineRegisterInfo &MRI = OpdMapper.getMRI();
DEBUG(dbgs() << "Applying default-like mapping\n");
for (unsigned OpIdx = 0,
EndIdx = OpdMapper.getInstrMapping().getNumOperands();
@@ -359,6 +363,13 @@ void RegisterBankInfo::applyDefaultMapping(const OperandsMapper &OpdMapper) {
DEBUG(dbgs() << " is not a register, nothing to be done\n");
continue;
}
+ if (!MO.getReg()) {
+ DEBUG(dbgs() << " is %%noreg, nothing to be done\n");
+ continue;
+ }
+ assert(OpdMapper.getInstrMapping().getOperandMapping(OpIdx).NumBreakDowns !=
+ 0 &&
+ "Invalid mapping");
assert(OpdMapper.getInstrMapping().getOperandMapping(OpIdx).NumBreakDowns ==
1 &&
"This mapping is too complex for this function");
@@ -368,9 +379,25 @@ void RegisterBankInfo::applyDefaultMapping(const OperandsMapper &OpdMapper) {
DEBUG(dbgs() << " has not been repaired, nothing to be done\n");
continue;
}
- DEBUG(dbgs() << " changed, replace " << MO.getReg());
- MO.setReg(*NewRegs.begin());
- DEBUG(dbgs() << " with " << MO.getReg());
+ unsigned OrigReg = MO.getReg();
+ unsigned NewReg = *NewRegs.begin();
+ DEBUG(dbgs() << " changed, replace " << PrintReg(OrigReg, nullptr));
+ MO.setReg(NewReg);
+ DEBUG(dbgs() << " with " << PrintReg(NewReg, nullptr));
+
+ // The OperandsMapper creates plain scalar, we may have to fix that.
+ // Check if the types match and if not, fix that.
+ LLT OrigTy = MRI.getType(OrigReg);
+ LLT NewTy = MRI.getType(NewReg);
+ if (OrigTy != NewTy) {
+ assert(OrigTy.getSizeInBits() == NewTy.getSizeInBits() &&
+             "Types with different sizes cannot be handled by the default "
+ "mapping");
+ DEBUG(dbgs() << "\nChange type of new opd from " << NewTy << " to "
+ << OrigTy);
+ MRI.setType(NewReg, OrigTy);
+ }
+ DEBUG(dbgs() << '\n');
}
}
@@ -400,10 +427,12 @@ unsigned RegisterBankInfo::getSizeInBits(unsigned Reg,
//------------------------------------------------------------------------------
// Helper classes implementation.
//------------------------------------------------------------------------------
+#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
LLVM_DUMP_METHOD void RegisterBankInfo::PartialMapping::dump() const {
print(dbgs());
dbgs() << '\n';
}
+#endif
bool RegisterBankInfo::PartialMapping::verify() const {
assert(RegBank && "Register bank not set");
@@ -451,10 +480,12 @@ bool RegisterBankInfo::ValueMapping::verify(unsigned MeaningfulBitWidth) const {
return true;
}
+#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
LLVM_DUMP_METHOD void RegisterBankInfo::ValueMapping::dump() const {
print(dbgs());
dbgs() << '\n';
}
+#endif
void RegisterBankInfo::ValueMapping::print(raw_ostream &OS) const {
OS << "#BreakDown: " << NumBreakDowns << " ";
@@ -472,8 +503,7 @@ bool RegisterBankInfo::InstructionMapping::verify(
// Check that all the register operands are properly mapped.
// Check the constructor invariant.
// For PHI, we only care about mapping the definition.
- assert(NumOperands ==
- ((MI.isCopy() || MI.isPHI()) ? 1 : MI.getNumOperands()) &&
+ assert(NumOperands == (isCopyLike(MI) ? 1 : MI.getNumOperands()) &&
"NumOperands must match, see constructor");
assert(MI.getParent() && MI.getParent()->getParent() &&
"MI must be connected to a MachineFunction");
@@ -503,10 +533,12 @@ bool RegisterBankInfo::InstructionMapping::verify(
return true;
}
+#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
LLVM_DUMP_METHOD void RegisterBankInfo::InstructionMapping::dump() const {
print(dbgs());
dbgs() << '\n';
}
+#endif
void RegisterBankInfo::InstructionMapping::print(raw_ostream &OS) const {
OS << "ID: " << getID() << " Cost: " << getCost() << " Mapping: ";
@@ -576,6 +608,11 @@ void RegisterBankInfo::OperandsMapper::createVRegs(unsigned OpIdx) {
for (unsigned &NewVReg : NewVRegsForOpIdx) {
assert(PartMap != ValMapping.end() && "Out-of-bound access");
assert(NewVReg == 0 && "Register has already been created");
+    // The new registers are always bound to scalars of the right size.
+ // The actual type has to be set when the target does the mapping
+ // of the instruction.
+ // The rationale is that this generic code cannot guess how the
+ // target plans to split the input type.
NewVReg = MRI.createGenericVirtualRegister(LLT::scalar(PartMap->Length));
MRI.setRegBank(NewVReg, *PartMap->RegBank);
++PartMap;
@@ -619,10 +656,12 @@ RegisterBankInfo::OperandsMapper::getVRegs(unsigned OpIdx,
return Res;
}
+#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
LLVM_DUMP_METHOD void RegisterBankInfo::OperandsMapper::dump() const {
print(dbgs(), true);
dbgs() << '\n';
}
+#endif
void RegisterBankInfo::OperandsMapper::print(raw_ostream &OS,
bool ForDebug) const {
diff --git a/lib/CodeGen/GlobalISel/Utils.cpp b/lib/CodeGen/GlobalISel/Utils.cpp
index e50091833c26..606a59680a3d 100644
--- a/lib/CodeGen/GlobalISel/Utils.cpp
+++ b/lib/CodeGen/GlobalISel/Utils.cpp
@@ -11,10 +11,13 @@
//===----------------------------------------------------------------------===//
#include "llvm/CodeGen/GlobalISel/Utils.h"
+#include "llvm/ADT/Twine.h"
#include "llvm/CodeGen/GlobalISel/RegisterBankInfo.h"
#include "llvm/CodeGen/MachineInstr.h"
#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/MachineOptimizationRemarkEmitter.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/CodeGen/TargetPassConfig.h"
#include "llvm/Target/TargetInstrInfo.h"
#include "llvm/Target/TargetRegisterInfo.h"
@@ -43,3 +46,50 @@ unsigned llvm::constrainOperandRegClass(
return Reg;
}
+
+bool llvm::isTriviallyDead(const MachineInstr &MI,
+ const MachineRegisterInfo &MRI) {
+ // If we can move an instruction, we can remove it. Otherwise, it has
+ // a side-effect of some sort.
+ bool SawStore = false;
+ if (!MI.isSafeToMove(/*AA=*/nullptr, SawStore))
+ return false;
+
+ // Instructions without side-effects are dead iff they only define dead vregs.
+ for (auto &MO : MI.operands()) {
+ if (!MO.isReg() || !MO.isDef())
+ continue;
+
+ unsigned Reg = MO.getReg();
+ if (TargetRegisterInfo::isPhysicalRegister(Reg) ||
+ !MRI.use_nodbg_empty(Reg))
+ return false;
+ }
+ return true;
+}
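
A standalone model of the deadness rule just implemented: a movable (side-effect-free) instruction is dead exactly when every register it defines is virtual and has no non-debug uses.

    #include <cassert>
    #include <vector>

    struct Def { bool IsPhysical; unsigned NonDebugUses; };

    static bool isTriviallyDead(bool SafeToMove, const std::vector<Def> &Defs) {
      if (!SafeToMove) // stores, calls, other side effects: never dead
        return false;
      for (const Def &D : Defs)
        if (D.IsPhysical || D.NonDebugUses != 0)
          return false;
      return true;
    }

    int main() {
      assert(isTriviallyDead(true, {{false, 0}}));  // unused vreg def
      assert(!isTriviallyDead(true, {{false, 3}})); // vreg still used
      assert(!isTriviallyDead(true, {{true, 0}}));  // physreg def
    }
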
+
+void llvm::reportGISelFailure(MachineFunction &MF, const TargetPassConfig &TPC,
+ MachineOptimizationRemarkEmitter &MORE,
+ MachineOptimizationRemarkMissed &R) {
+ MF.getProperties().set(MachineFunctionProperties::Property::FailedISel);
+
+ // Print the function name explicitly if we don't have a debug location (which
+ // makes the diagnostic less useful) or if we're going to emit a raw error.
+ if (!R.getLocation().isValid() || TPC.isGlobalISelAbortEnabled())
+ R << (" (in function: " + MF.getName() + ")").str();
+
+ if (TPC.isGlobalISelAbortEnabled())
+ report_fatal_error(R.getMsg());
+ else
+ MORE.emit(R);
+}
+
+void llvm::reportGISelFailure(MachineFunction &MF, const TargetPassConfig &TPC,
+ MachineOptimizationRemarkEmitter &MORE,
+ const char *PassName, StringRef Msg,
+ const MachineInstr &MI) {
+ MachineOptimizationRemarkMissed R(PassName, "GISelFailure: ",
+ MI.getDebugLoc(), MI.getParent());
+ R << Msg << ": " << ore::MNV("Inst", MI);
+ reportGISelFailure(MF, TPC, MORE, R);
+}
diff --git a/lib/CodeGen/IfConversion.cpp b/lib/CodeGen/IfConversion.cpp
index b9f3d86eabd8..37fe41582333 100644
--- a/lib/CodeGen/IfConversion.cpp
+++ b/lib/CodeGen/IfConversion.cpp
@@ -588,19 +588,6 @@ bool IfConverter::ValidTriangle(BBInfo &TrueBBI, BBInfo &FalseBBI,
return TExit && TExit == FalseBBI.BB;
}
-/// Shrink the provided inclusive range by one instruction.
-/// If the range was one instruction (\p It == \p Begin), It is not modified,
-/// but \p Empty is set to true.
-static inline void shrinkInclusiveRange(
- MachineBasicBlock::iterator &Begin,
- MachineBasicBlock::iterator &It,
- bool &Empty) {
- if (It == Begin)
- Empty = true;
- else
- It--;
-}
-
/// Count duplicated instructions and move the iterators to show where they
/// are.
/// @param TIB True Iterator Begin
@@ -633,10 +620,8 @@ bool IfConverter::CountDuplicatedInstructions(
while (TIB != TIE && FIB != FIE) {
// Skip dbg_value instructions. These do not count.
TIB = skipDebugInstructionsForward(TIB, TIE);
- if(TIB == TIE)
- break;
FIB = skipDebugInstructionsForward(FIB, FIE);
- if(FIB == FIE)
+ if (TIB == TIE || FIB == FIE)
break;
if (!TIB->isIdenticalTo(*FIB))
break;
@@ -656,58 +641,42 @@ bool IfConverter::CountDuplicatedInstructions(
if (TIB == TIE || FIB == FIE)
return true;
// Now, in preparation for counting duplicate instructions at the ends of the
- // blocks, move the end iterators up past any branch instructions.
- --TIE;
- --FIE;
-
- // After this point TIB and TIE define an inclusive range, which means that
- // TIB == TIE is true when there is one more instruction to consider, not at
- // the end. Because we may not be able to go before TIB, we need a flag to
- // indicate a completely empty range.
- bool TEmpty = false, FEmpty = false;
-
- // Upon exit TIE and FIE will both point at the last non-shared instruction.
- // They need to be moved forward to point past the last non-shared
- // instruction if the range they delimit is non-empty.
- auto IncrementEndIteratorsOnExit = make_scope_exit([&]() {
- if (!TEmpty)
- ++TIE;
- if (!FEmpty)
- ++FIE;
- });
+ // blocks, switch to reverse_iterators. Note that getReverse() returns an
+ // iterator that points to the same instruction, unlike std::reverse_iterator.
+ // We have to do our own shifting so that we get the same range.
+ MachineBasicBlock::reverse_iterator RTIE = std::next(TIE.getReverse());
+ MachineBasicBlock::reverse_iterator RFIE = std::next(FIE.getReverse());
+ const MachineBasicBlock::reverse_iterator RTIB = std::next(TIB.getReverse());
+ const MachineBasicBlock::reverse_iterator RFIB = std::next(FIB.getReverse());
if (!TBB.succ_empty() || !FBB.succ_empty()) {
if (SkipUnconditionalBranches) {
- while (!TEmpty && TIE->isUnconditionalBranch())
- shrinkInclusiveRange(TIB, TIE, TEmpty);
- while (!FEmpty && FIE->isUnconditionalBranch())
- shrinkInclusiveRange(FIB, FIE, FEmpty);
+ while (RTIE != RTIB && RTIE->isUnconditionalBranch())
+ ++RTIE;
+ while (RFIE != RFIB && RFIE->isUnconditionalBranch())
+ ++RFIE;
}
}
- // If Dups1 includes all of a block, then don't count duplicate
- // instructions at the end of the blocks.
- if (TEmpty || FEmpty)
- return true;
-
// Count duplicate instructions at the ends of the blocks.
- while (!TEmpty && !FEmpty) {
+ while (RTIE != RTIB && RFIE != RFIB) {
// Skip dbg_value instructions. These do not count.
- TIE = skipDebugInstructionsBackward(TIE, TIB);
- FIE = skipDebugInstructionsBackward(FIE, FIB);
- TEmpty = TIE == TIB && TIE->isDebugValue();
- FEmpty = FIE == FIB && FIE->isDebugValue();
- if (TEmpty || FEmpty)
+ // Note that these are reverse iterators going forward.
+ RTIE = skipDebugInstructionsForward(RTIE, RTIB);
+ RFIE = skipDebugInstructionsForward(RFIE, RFIB);
+ if (RTIE == RTIB || RFIE == RFIB)
break;
- if (!TIE->isIdenticalTo(*FIE))
+ if (!RTIE->isIdenticalTo(*RFIE))
break;
// We have to verify that any branch instructions are the same, and then we
// don't count them toward the # of duplicate instructions.
- if (!TIE->isBranch())
+ if (!RTIE->isBranch())
++Dups2;
- shrinkInclusiveRange(TIB, TIE, TEmpty);
- shrinkInclusiveRange(FIB, FIE, FEmpty);
+ ++RTIE;
+ ++RFIE;
}
+ TIE = std::next(RTIE.getReverse());
+ FIE = std::next(RFIE.getReverse());
return true;
}
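
The getReverse() remark above is the crux of this rewrite. A standalone check of the standard-library half of the story: std::reverse_iterator dereferences to the element before its base, which is why converting the [TIB, TIE) bounds needs the extra std::next shift.

    #include <cassert>
    #include <iterator>
    #include <vector>

    int main() {
      std::vector<int> V{10, 20, 30, 40};
      auto It = V.begin() + 2;                  // points at 30
      std::vector<int>::reverse_iterator R(It); // dereferences to 20
      assert(*R == 20);
      assert(*std::prev(R.base()) == 20);       // base() undoes the shift
    }
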
@@ -741,25 +710,21 @@ bool IfConverter::RescanInstructions(
static void verifySameBranchInstructions(
MachineBasicBlock *MBB1,
MachineBasicBlock *MBB2) {
- MachineBasicBlock::iterator B1 = MBB1->begin();
- MachineBasicBlock::iterator B2 = MBB2->begin();
- MachineBasicBlock::iterator E1 = std::prev(MBB1->end());
- MachineBasicBlock::iterator E2 = std::prev(MBB2->end());
- bool Empty1 = false, Empty2 = false;
- while (!Empty1 && !Empty2) {
- E1 = skipDebugInstructionsBackward(E1, B1);
- E2 = skipDebugInstructionsBackward(E2, B2);
- Empty1 = E1 == B1 && E1->isDebugValue();
- Empty2 = E2 == B2 && E2->isDebugValue();
-
- if (Empty1 && Empty2)
+ const MachineBasicBlock::reverse_iterator B1 = MBB1->rend();
+ const MachineBasicBlock::reverse_iterator B2 = MBB2->rend();
+ MachineBasicBlock::reverse_iterator E1 = MBB1->rbegin();
+ MachineBasicBlock::reverse_iterator E2 = MBB2->rbegin();
+ while (E1 != B1 && E2 != B2) {
+    E1 = skipDebugInstructionsForward(E1, B1);
+    E2 = skipDebugInstructionsForward(E2, B2);
+ if (E1 == B1 && E2 == B2)
break;
- if (Empty1) {
+ if (E1 == B1) {
assert(!E2->isBranch() && "Branch mis-match, one block is empty.");
break;
}
- if (Empty2) {
+ if (E2 == B2) {
assert(!E1->isBranch() && "Branch mis-match, one block is empty.");
break;
}
@@ -769,8 +734,8 @@ static void verifySameBranchInstructions(
"Branch mis-match, branch instructions don't match.");
else
break;
- shrinkInclusiveRange(B1, E1, Empty1);
- shrinkInclusiveRange(B2, E2, Empty2);
+ ++E1;
+ ++E2;
}
}
#endif
@@ -2183,7 +2148,8 @@ void IfConverter::MergeBlocks(BBInfo &ToBBI, BBInfo &FromBBI, bool AddEdges) {
// unknown probabilities into known ones.
// FIXME: This usage is too tricky and in the future we would like to
// eliminate all unknown probabilities in MBB.
- ToBBI.BB->normalizeSuccProbs();
+ if (ToBBI.IsBrAnalyzable)
+ ToBBI.BB->normalizeSuccProbs();
SmallVector<MachineBasicBlock *, 4> FromSuccs(FromMBB.succ_begin(),
FromMBB.succ_end());
@@ -2263,7 +2229,8 @@ void IfConverter::MergeBlocks(BBInfo &ToBBI, BBInfo &FromBBI, bool AddEdges) {
// Normalize the probabilities of ToBBI.BB's successors with all adjustment
// we've done above.
- ToBBI.BB->normalizeSuccProbs();
+ if (ToBBI.IsBrAnalyzable && FromBBI.IsBrAnalyzable)
+ ToBBI.BB->normalizeSuccProbs();
ToBBI.Predicate.append(FromBBI.Predicate.begin(), FromBBI.Predicate.end());
FromBBI.Predicate.clear();
diff --git a/lib/CodeGen/ImplicitNullChecks.cpp b/lib/CodeGen/ImplicitNullChecks.cpp
index 9588dfb72058..920c2a372a9b 100644
--- a/lib/CodeGen/ImplicitNullChecks.cpp
+++ b/lib/CodeGen/ImplicitNullChecks.cpp
@@ -22,6 +22,7 @@
// With the help of a runtime that understands the .fault_maps section,
// faulting_load_op branches to throw_npe if executing movl (%r10), %esi incurs
// a page fault.
+// Store and LoadStore are also supported.
//
//===----------------------------------------------------------------------===//
@@ -29,6 +30,7 @@
#include "llvm/ADT/SmallVector.h"
#include "llvm/ADT/Statistic.h"
#include "llvm/Analysis/AliasAnalysis.h"
+#include "llvm/CodeGen/FaultMaps.h"
#include "llvm/CodeGen/Passes.h"
#include "llvm/CodeGen/MachineFunction.h"
#include "llvm/CodeGen/MachineMemOperand.h"
@@ -151,25 +153,44 @@ class ImplicitNullChecks : public MachineFunctionPass {
const TargetRegisterInfo *TRI = nullptr;
AliasAnalysis *AA = nullptr;
MachineModuleInfo *MMI = nullptr;
+ MachineFrameInfo *MFI = nullptr;
bool analyzeBlockForNullChecks(MachineBasicBlock &MBB,
SmallVectorImpl<NullCheck> &NullCheckList);
- MachineInstr *insertFaultingLoad(MachineInstr *LoadMI, MachineBasicBlock *MBB,
- MachineBasicBlock *HandlerMBB);
+ MachineInstr *insertFaultingInstr(MachineInstr *MI, MachineBasicBlock *MBB,
+ MachineBasicBlock *HandlerMBB);
void rewriteNullChecks(ArrayRef<NullCheck> NullCheckList);
- /// Is \p MI a memory operation that can be used to implicitly null check the
- /// value in \p PointerReg? \p PrevInsts is the set of instruction seen since
+ enum AliasResult {
+ AR_NoAlias,
+ AR_MayAlias,
+ AR_WillAliasEverything
+ };
+  /// Returns AR_NoAlias if the memory operation in \p MI does not alias with
+  /// \p PrevMI, AR_MayAlias if they may alias, and AR_WillAliasEverything if
+  /// they may alias and any further memory operation may alias with \p PrevMI.
+ AliasResult areMemoryOpsAliased(MachineInstr &MI, MachineInstr *PrevMI);
+
+ enum SuitabilityResult {
+ SR_Suitable,
+ SR_Unsuitable,
+ SR_Impossible
+ };
+  /// Return SR_Suitable if \p MI is a memory operation that can be used to
+  /// implicitly null check the value in \p PointerReg, SR_Unsuitable if
+  /// \p MI cannot be used to null check, and SR_Impossible if there is no
+  /// point in continuing the search because no later instruction can be
+  /// used either. \p PrevInsts is the set of instructions seen since
/// the explicit null check on \p PointerReg.
- bool isSuitableMemoryOp(MachineInstr &MI, unsigned PointerReg,
- ArrayRef<MachineInstr *> PrevInsts);
+ SuitabilityResult isSuitableMemoryOp(MachineInstr &MI, unsigned PointerReg,
+ ArrayRef<MachineInstr *> PrevInsts);
  /// Return true if \p FaultingMI can be hoisted from after the
  /// instructions in \p InstsSeenSoFar to before them. Set \p Dependence to a
  /// non-null value if we also need to (and legally can) hoist a dependency.
- bool canHoistLoadInst(MachineInstr *FaultingMI, unsigned PointerReg,
- ArrayRef<MachineInstr *> InstsSeenSoFar,
- MachineBasicBlock *NullSucc, MachineInstr *&Dependence);
+ bool canHoistInst(MachineInstr *FaultingMI, unsigned PointerReg,
+ ArrayRef<MachineInstr *> InstsSeenSoFar,
+ MachineBasicBlock *NullSucc, MachineInstr *&Dependence);
public:
static char ID;
@@ -193,7 +214,7 @@ public:
}
bool ImplicitNullChecks::canHandle(const MachineInstr *MI) {
- if (MI->isCall() || MI->mayStore() || MI->hasUnmodeledSideEffects())
+ if (MI->isCall() || MI->hasUnmodeledSideEffects())
return false;
auto IsRegMask = [](const MachineOperand &MO) { return MO.isRegMask(); };
(void)IsRegMask;
@@ -248,7 +269,7 @@ bool ImplicitNullChecks::canReorder(const MachineInstr *A,
unsigned RegB = MOB.getReg();
- if (TRI->regsOverlap(RegA, RegB))
+ if (TRI->regsOverlap(RegA, RegB) && (MOA.isDef() || MOB.isDef()))
return false;
}
}
@@ -260,6 +281,7 @@ bool ImplicitNullChecks::runOnMachineFunction(MachineFunction &MF) {
TII = MF.getSubtarget().getInstrInfo();
TRI = MF.getRegInfo().getTargetRegisterInfo();
MMI = &MF.getMMI();
+ MFI = &MF.getFrameInfo();
AA = &getAnalysis<AAResultsWrapperPass>().getAAResults();
SmallVector<NullCheck, 16> NullCheckList;
@@ -283,36 +305,91 @@ static bool AnyAliasLiveIn(const TargetRegisterInfo *TRI,
return false;
}
-bool ImplicitNullChecks::isSuitableMemoryOp(
- MachineInstr &MI, unsigned PointerReg, ArrayRef<MachineInstr *> PrevInsts) {
+ImplicitNullChecks::AliasResult
+ImplicitNullChecks::areMemoryOpsAliased(MachineInstr &MI,
+ MachineInstr *PrevMI) {
+  // If it is not a memory access, skip the check.
+ if (!(PrevMI->mayStore() || PrevMI->mayLoad()))
+ return AR_NoAlias;
+  // Two loads can never conflict with each other.
+ if (!(MI.mayStore() || PrevMI->mayStore()))
+ return AR_NoAlias;
+  // We lost the memory operand info, so conservatively assume aliasing. If
+  // the unknown access is a store, there is no point in continuing because
+  // we will never be able to prove anything against it.
+ if (MI.memoperands_empty())
+ return MI.mayStore() ? AR_WillAliasEverything : AR_MayAlias;
+ if (PrevMI->memoperands_empty())
+ return PrevMI->mayStore() ? AR_WillAliasEverything : AR_MayAlias;
+
+ for (MachineMemOperand *MMO1 : MI.memoperands()) {
+    // MMO1 should have a Value because it comes from the operation we'd
+    // like to use as the implicit null check.
+ assert(MMO1->getValue() && "MMO1 should have a Value!");
+ for (MachineMemOperand *MMO2 : PrevMI->memoperands()) {
+ if (const PseudoSourceValue *PSV = MMO2->getPseudoValue()) {
+ if (PSV->mayAlias(MFI))
+ return AR_MayAlias;
+ continue;
+ }
+ llvm::AliasResult AAResult = AA->alias(
+ MemoryLocation(MMO1->getValue(), MemoryLocation::UnknownSize,
+ MMO1->getAAInfo()),
+ MemoryLocation(MMO2->getValue(), MemoryLocation::UnknownSize,
+ MMO2->getAAInfo()));
+ if (AAResult != NoAlias)
+ return AR_MayAlias;
+ }
+ }
+ return AR_NoAlias;
+}
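
A standalone decision-table model of areMemoryOpsAliased, where the booleans stand in for the mayLoad/mayStore/memoperands_empty queries: loads never block each other, and an access with lost memoperand info is only fatal to the whole scan when it is a store.

    #include <cassert>

    enum class AR { NoAlias, MayAlias, WillAliasEverything };

    static AR classify(bool CurIsStore, bool CurInfoLost, bool PrevIsLoad,
                       bool PrevIsStore, bool PrevInfoLost) {
      if (!PrevIsLoad && !PrevIsStore)
        return AR::NoAlias; // previous instruction touches no memory
      if (!CurIsStore && !PrevIsStore)
        return AR::NoAlias; // load vs. load is harmless
      if (CurInfoLost)
        return CurIsStore ? AR::WillAliasEverything : AR::MayAlias;
      if (PrevInfoLost)
        return PrevIsStore ? AR::WillAliasEverything : AR::MayAlias;
      return AR::MayAlias; // would fall through to the real AA query
    }

    int main() {
      assert(classify(false, false, true, false, false) == AR::NoAlias);
      assert(classify(true, true, false, true, false) ==
             AR::WillAliasEverything);
      assert(classify(true, false, false, true, true) ==
             AR::WillAliasEverything);
    }
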
+
+ImplicitNullChecks::SuitabilityResult
+ImplicitNullChecks::isSuitableMemoryOp(MachineInstr &MI, unsigned PointerReg,
+ ArrayRef<MachineInstr *> PrevInsts) {
int64_t Offset;
unsigned BaseReg;
if (!TII->getMemOpBaseRegImmOfs(MI, BaseReg, Offset, TRI) ||
BaseReg != PointerReg)
- return false;
-
- // We want the load to be issued at a sane offset from PointerReg, so that
- // if PointerReg is null then the load reliably page faults.
- if (!(MI.mayLoad() && !MI.isPredicable() && Offset < PageSize))
- return false;
-
- // Finally, we need to make sure that the load instruction actually is
- // loading from PointerReg, and there isn't some re-definition of PointerReg
- // between the compare and the load.
+ return SR_Unsuitable;
+
+ // We want the mem access to be issued at a sane offset from PointerReg,
+ // so that if PointerReg is null then the access reliably page faults.
+ if (!((MI.mayLoad() || MI.mayStore()) && !MI.isPredicable() &&
+ Offset < PageSize))
+ return SR_Unsuitable;
+
+ // Finally, we need to make sure that the access instruction actually is
+ // accessing from PointerReg, and there isn't some re-definition of PointerReg
+ // between the compare and the memory access.
+  // If PointerReg was redefined earlier, there is no point in continuing:
+  // this condition will fail for every later instruction as well.
+ SuitabilityResult Suitable = SR_Suitable;
for (auto *PrevMI : PrevInsts)
- for (auto &PrevMO : PrevMI->operands())
- if (PrevMO.isReg() && PrevMO.getReg() &&
+ for (auto &PrevMO : PrevMI->operands()) {
+ if (PrevMO.isReg() && PrevMO.getReg() && PrevMO.isDef() &&
TRI->regsOverlap(PrevMO.getReg(), PointerReg))
- return false;
-
- return true;
+ return SR_Impossible;
+
+      // Check whether the current memory access aliases with a previous one.
+      // If we already know it aliases, there is no need to check again, but
+      // we keep checking the base pointer since that can still yield
+      // SR_Impossible.
+ if (Suitable == SR_Suitable) {
+ AliasResult AR = areMemoryOpsAliased(MI, PrevMI);
+ if (AR == AR_WillAliasEverything)
+ return SR_Impossible;
+ if (AR == AR_MayAlias)
+ Suitable = SR_Unsuitable;
+ }
+ }
+ return Suitable;
}
-bool ImplicitNullChecks::canHoistLoadInst(
- MachineInstr *FaultingMI, unsigned PointerReg,
- ArrayRef<MachineInstr *> InstsSeenSoFar, MachineBasicBlock *NullSucc,
- MachineInstr *&Dependence) {
+bool ImplicitNullChecks::canHoistInst(MachineInstr *FaultingMI,
+ unsigned PointerReg,
+ ArrayRef<MachineInstr *> InstsSeenSoFar,
+ MachineBasicBlock *NullSucc,
+ MachineInstr *&Dependence) {
auto DepResult = computeDependence(FaultingMI, InstsSeenSoFar);
if (!DepResult.CanReorder)
return false;
@@ -359,7 +436,8 @@ bool ImplicitNullChecks::canHoistLoadInst(
// The Dependency can't be re-defining the base register -- then we won't
// get the memory operation on the address we want. This is already
// checked in \c IsSuitableMemoryOp.
- assert(!TRI->regsOverlap(DependenceMO.getReg(), PointerReg) &&
+ assert(!(DependenceMO.isDef() &&
+ TRI->regsOverlap(DependenceMO.getReg(), PointerReg)) &&
"Should have been checked before!");
}
@@ -481,9 +559,11 @@ bool ImplicitNullChecks::analyzeBlockForNullChecks(
return false;
MachineInstr *Dependence;
- if (isSuitableMemoryOp(MI, PointerReg, InstsSeenSoFar) &&
- canHoistLoadInst(&MI, PointerReg, InstsSeenSoFar, NullSucc,
- Dependence)) {
+ SuitabilityResult SR = isSuitableMemoryOp(MI, PointerReg, InstsSeenSoFar);
+ if (SR == SR_Impossible)
+ return false;
+ if (SR == SR_Suitable &&
+ canHoistInst(&MI, PointerReg, InstsSeenSoFar, NullSucc, Dependence)) {
NullCheckList.emplace_back(&MI, MBP.ConditionDef, &MBB, NotNullSucc,
NullSucc, Dependence);
return true;
@@ -495,36 +575,42 @@ bool ImplicitNullChecks::analyzeBlockForNullChecks(
return false;
}
-/// Wrap a machine load instruction, LoadMI, into a FAULTING_LOAD_OP machine
-/// instruction. The FAULTING_LOAD_OP instruction does the same load as LoadMI
-/// (defining the same register), and branches to HandlerMBB if the load
-/// faults. The FAULTING_LOAD_OP instruction is inserted at the end of MBB.
-MachineInstr *
-ImplicitNullChecks::insertFaultingLoad(MachineInstr *LoadMI,
- MachineBasicBlock *MBB,
- MachineBasicBlock *HandlerMBB) {
+/// Wrap a machine instruction, MI, into a FAULTING machine instruction.
+/// The FAULTING instruction does the same load/store as MI
+/// (defining the same register), and branches to HandlerMBB if the mem access
+/// faults. The FAULTING instruction is inserted at the end of MBB.
+MachineInstr *ImplicitNullChecks::insertFaultingInstr(
+ MachineInstr *MI, MachineBasicBlock *MBB, MachineBasicBlock *HandlerMBB) {
const unsigned NoRegister = 0; // Guaranteed to be the NoRegister value for
// all targets.
DebugLoc DL;
- unsigned NumDefs = LoadMI->getDesc().getNumDefs();
+ unsigned NumDefs = MI->getDesc().getNumDefs();
assert(NumDefs <= 1 && "other cases unhandled!");
unsigned DefReg = NoRegister;
if (NumDefs != 0) {
- DefReg = LoadMI->defs().begin()->getReg();
- assert(std::distance(LoadMI->defs().begin(), LoadMI->defs().end()) == 1 &&
+ DefReg = MI->defs().begin()->getReg();
+ assert(std::distance(MI->defs().begin(), MI->defs().end()) == 1 &&
"expected exactly one def!");
}
- auto MIB = BuildMI(MBB, DL, TII->get(TargetOpcode::FAULTING_LOAD_OP), DefReg)
+ FaultMaps::FaultKind FK;
+ if (MI->mayLoad())
+ FK =
+ MI->mayStore() ? FaultMaps::FaultingLoadStore : FaultMaps::FaultingLoad;
+ else
+ FK = FaultMaps::FaultingStore;
+
+ auto MIB = BuildMI(MBB, DL, TII->get(TargetOpcode::FAULTING_OP), DefReg)
+ .addImm(FK)
.addMBB(HandlerMBB)
- .addImm(LoadMI->getOpcode());
+ .addImm(MI->getOpcode());
- for (auto &MO : LoadMI->uses())
- MIB.addOperand(MO);
+ for (auto &MO : MI->uses())
+ MIB.add(MO);
- MIB.setMemRefs(LoadMI->memoperands_begin(), LoadMI->memoperands_end());
+ MIB.setMemRefs(MI->memoperands_begin(), MI->memoperands_end());
return MIB;
}
@@ -545,18 +631,18 @@ void ImplicitNullChecks::rewriteNullChecks(
NC.getCheckBlock()->insert(NC.getCheckBlock()->end(), DepMI);
}
- // Insert a faulting load where the conditional branch was originally. We
- // check earlier ensures that this bit of code motion is legal. We do not
- // touch the successors list for any basic block since we haven't changed
- // control flow, we've just made it implicit.
- MachineInstr *FaultingLoad = insertFaultingLoad(
+    // Insert a faulting instruction where the conditional branch was
+    // originally. The check we did earlier ensures that this bit of code
+    // motion is legal. We do not touch the successors list for any basic
+    // block since we haven't changed control flow, we've just made it
+    // implicit.
+ MachineInstr *FaultingInstr = insertFaultingInstr(
NC.getMemOperation(), NC.getCheckBlock(), NC.getNullSucc());
// Now the values defined by MemOperation, if any, are live-in of
// the block of MemOperation.
- // The original load operation may define implicit-defs alongside
- // the loaded value.
+ // The original operation may define implicit-defs alongside
+ // the value.
MachineBasicBlock *MBB = NC.getMemOperation()->getParent();
- for (const MachineOperand &MO : FaultingLoad->operands()) {
+ for (const MachineOperand &MO : FaultingInstr->operands()) {
if (!MO.isReg() || !MO.isDef())
continue;
unsigned Reg = MO.getReg();
diff --git a/lib/CodeGen/InlineSpiller.cpp b/lib/CodeGen/InlineSpiller.cpp
index 3d81184f774a..a1cb0a0695bf 100644
--- a/lib/CodeGen/InlineSpiller.cpp
+++ b/lib/CodeGen/InlineSpiller.cpp
@@ -558,7 +558,7 @@ bool InlineSpiller::reMaterializeFor(LiveInterval &VirtReg, MachineInstr &MI) {
Edit->rematerializeAt(*MI.getParent(), MI, NewVReg, RM, TRI);
// We take the DebugLoc from MI, since OrigMI may be attributed to a
- // different source location.
+ // different source location.
auto *NewMI = LIS.getInstructionFromIndex(DefIdx);
NewMI->setDebugLoc(MI.getDebugLoc());
@@ -686,7 +686,8 @@ bool InlineSpiller::coalesceStackAccess(MachineInstr *MI, unsigned Reg) {
return true;
}
-#if !defined(NDEBUG)
+#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
+LLVM_DUMP_METHOD
// Dump the range of instructions from B to E with their slot indexes.
static void dumpMachineInstrRangeWithSlotIndex(MachineBasicBlock::iterator B,
MachineBasicBlock::iterator E,
diff --git a/lib/CodeGen/IntrinsicLowering.cpp b/lib/CodeGen/IntrinsicLowering.cpp
index afd24067ace7..c6cc909e25d3 100644
--- a/lib/CodeGen/IntrinsicLowering.cpp
+++ b/lib/CodeGen/IntrinsicLowering.cpp
@@ -115,21 +115,21 @@ void IntrinsicLowering::AddPrototypes(Module &M) {
Type::getInt8PtrTy(Context),
Type::getInt8PtrTy(Context),
Type::getInt8PtrTy(Context),
- DL.getIntPtrType(Context), nullptr);
+ DL.getIntPtrType(Context));
break;
case Intrinsic::memmove:
M.getOrInsertFunction("memmove",
Type::getInt8PtrTy(Context),
Type::getInt8PtrTy(Context),
Type::getInt8PtrTy(Context),
- DL.getIntPtrType(Context), nullptr);
+ DL.getIntPtrType(Context));
break;
case Intrinsic::memset:
M.getOrInsertFunction("memset",
Type::getInt8PtrTy(Context),
Type::getInt8PtrTy(Context),
Type::getInt32Ty(M.getContext()),
- DL.getIntPtrType(Context), nullptr);
+ DL.getIntPtrType(Context));
break;
case Intrinsic::sqrt:
EnsureFPIntrinsicsExist(M, F, "sqrtf", "sqrt", "sqrtl");
diff --git a/lib/CodeGen/LLVMBuild.txt b/lib/CodeGen/LLVMBuild.txt
index 86d3624a9d6e..07ea9dcaea7a 100644
--- a/lib/CodeGen/LLVMBuild.txt
+++ b/lib/CodeGen/LLVMBuild.txt
@@ -22,4 +22,4 @@ subdirectories = AsmPrinter SelectionDAG MIRParser GlobalISel
type = Library
name = CodeGen
parent = Libraries
-required_libraries = Analysis BitReader BitWriter Core MC Scalar Support Target TransformUtils
+required_libraries = Analysis BitReader BitWriter Core MC ProfileData Scalar Support Target TransformUtils
diff --git a/lib/CodeGen/LLVMTargetMachine.cpp b/lib/CodeGen/LLVMTargetMachine.cpp
index 26794e28020e..7b1706f0f4ba 100644
--- a/lib/CodeGen/LLVMTargetMachine.cpp
+++ b/lib/CodeGen/LLVMTargetMachine.cpp
@@ -42,8 +42,8 @@ static cl::opt<cl::boolOrDefault>
EnableFastISelOption("fast-isel", cl::Hidden,
cl::desc("Enable the \"fast\" instruction selector"));
-static cl::opt<bool>
- EnableGlobalISel("global-isel", cl::Hidden, cl::init(false),
+static cl::opt<cl::boolOrDefault>
+ EnableGlobalISel("global-isel", cl::Hidden,
cl::desc("Enable the \"global\" instruction selector"));
void LLVMTargetMachine::initAsmInfo() {
@@ -85,7 +85,7 @@ void LLVMTargetMachine::initAsmInfo() {
LLVMTargetMachine::LLVMTargetMachine(const Target &T,
StringRef DataLayoutString,
const Triple &TT, StringRef CPU,
- StringRef FS, TargetOptions Options,
+ StringRef FS, const TargetOptions &Options,
Reloc::Model RM, CodeModel::Model CM,
CodeGenOpt::Level OL)
: TargetMachine(T, DataLayoutString, TT, CPU, FS, Options) {
@@ -149,7 +149,9 @@ addPassesToGenerateCode(LLVMTargetMachine *TM, PassManagerBase &PM,
TM->setFastISel(true);
// Ask the target for an isel.
- if (LLVM_UNLIKELY(EnableGlobalISel)) {
+ // Enable GlobalISel if the target wants to, but allow that to be overridden.
+ if (EnableGlobalISel == cl::BOU_TRUE || (EnableGlobalISel == cl::BOU_UNSET &&
+ PassConfig->isGlobalISelEnabled())) {
if (PassConfig->addIRTranslator())
return nullptr;
@@ -172,11 +174,12 @@ addPassesToGenerateCode(LLVMTargetMachine *TM, PassManagerBase &PM,
// Pass to reset the MachineFunction if the ISel failed.
PM.add(createResetMachineFunctionPass(
- PassConfig->reportDiagnosticWhenGlobalISelFallback()));
+ PassConfig->reportDiagnosticWhenGlobalISelFallback(),
+ PassConfig->isGlobalISelAbortEnabled()));
// Provide a fallback path when we do not want to abort on
// not-yet-supported input.
- if (LLVM_UNLIKELY(!PassConfig->isGlobalISelAbortEnabled()) &&
+ if (!PassConfig->isGlobalISelAbortEnabled() &&
PassConfig->addInstSelector())
return nullptr;
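Side note on the option type change above: cl::opt<bool> cannot distinguish "not passed" from an explicit "=false", while cl::boolOrDefault can, which is what lets the target's preference win only when the user stays silent. A minimal tri-state sketch with a hypothetical flag name:

```cpp
#include "llvm/Support/CommandLine.h"
using namespace llvm;

static cl::opt<cl::boolOrDefault>
    EnableThing("enable-thing", cl::Hidden,
                cl::desc("Tri-state demo; unset defers to the default"));

// Resolve the three states: explicit true, explicit false, or unset.
static bool shouldEnableThing(bool TargetDefault) {
  if (EnableThing == cl::BOU_UNSET)
    return TargetDefault;             // user said nothing: target decides
  return EnableThing == cl::BOU_TRUE; // user's explicit choice wins
}
```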
diff --git a/lib/CodeGen/LazyMachineBlockFrequencyInfo.cpp b/lib/CodeGen/LazyMachineBlockFrequencyInfo.cpp
new file mode 100644
index 000000000000..996d40ca6e1e
--- /dev/null
+++ b/lib/CodeGen/LazyMachineBlockFrequencyInfo.cpp
@@ -0,0 +1,97 @@
+//===- LazyMachineBlockFrequencyInfo.cpp - Lazy Machine Block Frequency ---===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+/// \file
+/// This is an alternative analysis pass to MachineBlockFrequencyInfo. The
+/// difference is that with this pass the block frequencies are not computed
+/// when the analysis pass is executed but rather when the BFI result is
+/// explicitly requested by the analysis client.
+///
+//===----------------------------------------------------------------------===//
+
+#include "llvm/CodeGen/LazyMachineBlockFrequencyInfo.h"
+
+using namespace llvm;
+
+#define DEBUG_TYPE "lazy-machine-block-freq"
+
+INITIALIZE_PASS_BEGIN(LazyMachineBlockFrequencyInfoPass, DEBUG_TYPE,
+ "Lazy Machine Block Frequency Analysis", true, true)
+INITIALIZE_PASS_DEPENDENCY(MachineBranchProbabilityInfo)
+INITIALIZE_PASS_DEPENDENCY(MachineLoopInfo)
+INITIALIZE_PASS_END(LazyMachineBlockFrequencyInfoPass, DEBUG_TYPE,
+ "Lazy Machine Block Frequency Analysis", true, true)
+
+char LazyMachineBlockFrequencyInfoPass::ID = 0;
+
+LazyMachineBlockFrequencyInfoPass::LazyMachineBlockFrequencyInfoPass()
+ : MachineFunctionPass(ID) {
+ initializeLazyMachineBlockFrequencyInfoPassPass(
+ *PassRegistry::getPassRegistry());
+}
+
+void LazyMachineBlockFrequencyInfoPass::print(raw_ostream &OS,
+ const Module *M) const {
+ getBFI().print(OS, M);
+}
+
+void LazyMachineBlockFrequencyInfoPass::getAnalysisUsage(
+ AnalysisUsage &AU) const {
+ AU.addRequired<MachineBranchProbabilityInfo>();
+ AU.setPreservesAll();
+ MachineFunctionPass::getAnalysisUsage(AU);
+}
+
+void LazyMachineBlockFrequencyInfoPass::releaseMemory() {
+ OwnedMBFI.reset();
+ OwnedMLI.reset();
+ OwnedMDT.reset();
+}
+
+MachineBlockFrequencyInfo &
+LazyMachineBlockFrequencyInfoPass::calculateIfNotAvailable() const {
+ auto *MBFI = getAnalysisIfAvailable<MachineBlockFrequencyInfo>();
+ if (MBFI) {
+ DEBUG(dbgs() << "MachineBlockFrequencyInfo is available\n");
+ return *MBFI;
+ }
+
+ auto &MBPI = getAnalysis<MachineBranchProbabilityInfo>();
+ auto *MLI = getAnalysisIfAvailable<MachineLoopInfo>();
+ auto *MDT = getAnalysisIfAvailable<MachineDominatorTree>();
+ DEBUG(dbgs() << "Building MachineBlockFrequencyInfo on the fly\n");
+ DEBUG(if (MLI) dbgs() << "LoopInfo is available\n");
+
+ if (!MLI) {
+ DEBUG(dbgs() << "Building LoopInfo on the fly\n");
+ // First create a dominator tree.
+ DEBUG(if (MDT) dbgs() << "DominatorTree is available\n");
+
+ if (!MDT) {
+ DEBUG(dbgs() << "Building DominatorTree on the fly\n");
+ OwnedMDT = make_unique<MachineDominatorTree>();
+ OwnedMDT->getBase().recalculate(*MF);
+ MDT = OwnedMDT.get();
+ }
+
+ // Generate LoopInfo from it.
+ OwnedMLI = make_unique<MachineLoopInfo>();
+ OwnedMLI->getBase().analyze(MDT->getBase());
+ MLI = OwnedMLI.get();
+ }
+
+ OwnedMBFI = make_unique<MachineBlockFrequencyInfo>();
+ OwnedMBFI->calculate(*MF, MBPI, *MLI);
+ return *OwnedMBFI;
+}
+
+bool LazyMachineBlockFrequencyInfoPass::runOnMachineFunction(
+ MachineFunction &F) {
+ MF = &F;
+ return false;
+}
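A hypothetical consumer of the new pass, to show where the laziness pays off: requiring the pass is cheap, and block frequencies are only computed if getBFI() is actually reached (getBFI() is assumed here to forward to calculateIfNotAvailable(), as the print() method above suggests):

```cpp
// Sketch, not part of the patch. WorthOptimizing is a hypothetical gate.
void MyMachinePass::getAnalysisUsage(AnalysisUsage &AU) const {
  AU.addRequired<LazyMachineBlockFrequencyInfoPass>();
  AU.setPreservesAll();
  MachineFunctionPass::getAnalysisUsage(AU);
}

bool MyMachinePass::runOnMachineFunction(MachineFunction &MF) {
  if (!WorthOptimizing(MF)) // hypothetical cheap predicate
    return false;           // BFI is never computed on this path
  MachineBlockFrequencyInfo &MBFI =
      getAnalysis<LazyMachineBlockFrequencyInfoPass>().getBFI();
  for (MachineBasicBlock &MBB : MF)
    (void)MBFI.getBlockFreq(&MBB); // first use triggers the computation
  return false;
}
```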
diff --git a/lib/CodeGen/LexicalScopes.cpp b/lib/CodeGen/LexicalScopes.cpp
index 834ed5f06c94..275d84e2c185 100644
--- a/lib/CodeGen/LexicalScopes.cpp
+++ b/lib/CodeGen/LexicalScopes.cpp
@@ -14,14 +14,23 @@
//
//===----------------------------------------------------------------------===//
+#include "llvm/ADT/DenseMap.h"
+#include "llvm/ADT/SmallVector.h"
#include "llvm/CodeGen/LexicalScopes.h"
+#include "llvm/CodeGen/MachineBasicBlock.h"
#include "llvm/CodeGen/MachineFunction.h"
#include "llvm/CodeGen/MachineInstr.h"
-#include "llvm/IR/DebugInfo.h"
-#include "llvm/IR/Function.h"
+#include "llvm/IR/DebugInfoMetadata.h"
+#include "llvm/IR/Metadata.h"
+#include "llvm/Support/Casting.h"
+#include "llvm/Support/Compiler.h"
#include "llvm/Support/Debug.h"
-#include "llvm/Support/ErrorHandling.h"
-#include "llvm/Support/FormattedStream.h"
+#include "llvm/Support/raw_ostream.h"
+#include <cassert>
+#include <string>
+#include <tuple>
+#include <utility>
+
using namespace llvm;
#define DEBUG_TYPE "lexicalscopes"
@@ -38,6 +47,10 @@ void LexicalScopes::reset() {
/// initialize - Scan machine function and construct lexical scope nest.
void LexicalScopes::initialize(const MachineFunction &Fn) {
+ // Don't attempt any lexical scope creation for a NoDebug compile unit.
+ if (Fn.getFunction()->getSubprogram()->getUnit()->getEmissionKind() ==
+ DICompileUnit::NoDebug)
+ return;
reset();
MF = &Fn;
SmallVector<InsnRange, 4> MIRanges;
@@ -54,7 +67,6 @@ void LexicalScopes::initialize(const MachineFunction &Fn) {
void LexicalScopes::extractLexicalScopes(
SmallVectorImpl<InsnRange> &MIRanges,
DenseMap<const MachineInstr *, LexicalScope *> &MI2ScopeMap) {
-
// Scan each instruction and create scopes. First build working set of scopes.
for (const auto &MBB : *MF) {
const MachineInstr *RangeBeginMI = nullptr;
@@ -127,6 +139,10 @@ LexicalScope *LexicalScopes::findLexicalScope(const DILocation *DL) {
LexicalScope *LexicalScopes::getOrCreateLexicalScope(const DILocalScope *Scope,
const DILocation *IA) {
if (IA) {
+ // Skip scopes inlined from a NoDebug compile unit.
+ if (Scope->getSubprogram()->getUnit()->getEmissionKind() ==
+ DICompileUnit::NoDebug)
+ return getOrCreateLexicalScope(IA);
// Create an abstract scope for inlined function.
getOrCreateAbstractScope(Scope);
// Create an inlined scope for inlined function.
@@ -181,10 +197,9 @@ LexicalScopes::getOrCreateInlinedScope(const DILocalScope *Scope,
else
Parent = getOrCreateLexicalScope(InlinedAt);
- I = InlinedLexicalScopeMap.emplace(std::piecewise_construct,
- std::forward_as_tuple(P),
- std::forward_as_tuple(Parent, Scope,
- InlinedAt, false))
+ I = InlinedLexicalScopeMap
+ .emplace(std::piecewise_construct, std::forward_as_tuple(P),
+ std::forward_as_tuple(Parent, Scope, InlinedAt, false))
.first;
return &I->second;
}
@@ -241,7 +256,6 @@ void LexicalScopes::constructScopeNest(LexicalScope *Scope) {
void LexicalScopes::assignInstructionRanges(
SmallVectorImpl<InsnRange> &MIRanges,
DenseMap<const MachineInstr *, LexicalScope *> &MI2ScopeMap) {
-
LexicalScope *PrevLexicalScope = nullptr;
for (const auto &R : MIRanges) {
LexicalScope *S = MI2ScopeMap.lookup(R.first);
@@ -299,9 +313,8 @@ bool LexicalScopes::dominates(const DILocation *DL, MachineBasicBlock *MBB) {
return Result;
}
-/// dump - Print data structures.
-void LexicalScope::dump(unsigned Indent) const {
-#ifndef NDEBUG
+#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
+LLVM_DUMP_METHOD void LexicalScope::dump(unsigned Indent) const {
raw_ostream &err = dbgs();
err.indent(Indent);
err << "DFSIn: " << DFSIn << " DFSOut: " << DFSOut << "\n";
@@ -316,5 +329,5 @@ void LexicalScope::dump(unsigned Indent) const {
for (unsigned i = 0, e = Children.size(); i != e; ++i)
if (Children[i] != this)
Children[i]->dump(Indent + 2);
-#endif
}
+#endif
diff --git a/lib/CodeGen/LiveDebugValues.cpp b/lib/CodeGen/LiveDebugValues.cpp
index c945376560f7..f956974b1aaf 100644
--- a/lib/CodeGen/LiveDebugValues.cpp
+++ b/lib/CodeGen/LiveDebugValues.cpp
@@ -24,13 +24,16 @@
#include "llvm/ADT/Statistic.h"
#include "llvm/ADT/UniqueVector.h"
#include "llvm/CodeGen/LexicalScopes.h"
+#include "llvm/CodeGen/MachineFrameInfo.h"
#include "llvm/CodeGen/MachineFunction.h"
#include "llvm/CodeGen/MachineFunctionPass.h"
#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/MachineMemOperand.h"
#include "llvm/CodeGen/Passes.h"
#include "llvm/IR/DebugInfo.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/raw_ostream.h"
+#include "llvm/Target/TargetFrameLowering.h"
#include "llvm/Target/TargetInstrInfo.h"
#include "llvm/Target/TargetLowering.h"
#include "llvm/Target/TargetRegisterInfo.h"
@@ -61,6 +64,7 @@ class LiveDebugValues : public MachineFunctionPass {
private:
const TargetRegisterInfo *TRI;
const TargetInstrInfo *TII;
+ const TargetFrameLowering *TFI;
LexicalScopes LS;
/// Keeps track of lexical scopes associated with a user value's source
@@ -127,11 +131,13 @@ private:
if (int RegNo = isDbgValueDescribedByReg(MI)) {
Kind = RegisterKind;
Loc.RegisterLoc.RegNo = RegNo;
- uint64_t Offset =
+ int64_t Offset =
MI.isIndirectDebugValue() ? MI.getOperand(1).getImm() : 0;
// We don't support offsets larger than 4GiB here. They are
// slated to be replaced with DIExpressions anyway.
- if (Offset >= (1ULL << 32))
+ // With indirect debug values used for spill locations, Offset
+ // can be negative.
+ if (Offset == INT64_MIN || std::abs(Offset) >= (1LL << 32))
Kind = InvalidKind;
else
Loc.RegisterLoc.Offset = Offset;
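Two subtleties hide in the rewritten guard: the offset is now signed, and std::abs(INT64_MIN) is undefined behavior (the magnitude is not representable in int64_t), hence the explicit test. The predicate in isolation:

```cpp
#include <cstdint>
#include <cstdlib>

// True iff |Offset| fits in 32 bits. INT64_MIN is screened out first
// because std::abs(INT64_MIN) is undefined behavior.
static bool fitsInDbgValueOffset(int64_t Offset) {
  return Offset != INT64_MIN && std::abs(Offset) < (1LL << 32);
}
```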
@@ -150,7 +156,9 @@ private:
/// dominates MBB.
bool dominates(MachineBasicBlock &MBB) const { return UVS.dominates(&MBB); }
- void dump() const { MI.dump(); }
+#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
+ LLVM_DUMP_METHOD void dump() const { MI.dump(); }
+#endif
bool operator==(const VarLoc &Other) const {
return Var == Other.Var && Loc.Hash == Other.Loc.Hash;
@@ -167,6 +175,11 @@ private:
typedef UniqueVector<VarLoc> VarLocMap;
typedef SparseBitVector<> VarLocSet;
typedef SmallDenseMap<const MachineBasicBlock *, VarLocSet> VarLocInMBB;
+ struct SpillDebugPair {
+ MachineInstr *SpillInst;
+ MachineInstr *DebugInst;
+ };
+ typedef SmallVector<SpillDebugPair, 4> SpillMap;
/// This holds the working set of currently open ranges. For fast
/// access, this is done both as a set of VarLocIDs, and a map of
@@ -216,14 +229,21 @@ private:
}
};
+ bool isSpillInstruction(const MachineInstr &MI, MachineFunction *MF,
+ unsigned &Reg);
+ int extractSpillBaseRegAndOffset(const MachineInstr &MI, unsigned &Reg);
+
void transferDebugValue(const MachineInstr &MI, OpenRangesSet &OpenRanges,
VarLocMap &VarLocIDs);
+ void transferSpillInst(MachineInstr &MI, OpenRangesSet &OpenRanges,
+ VarLocMap &VarLocIDs, SpillMap &Spills);
void transferRegisterDef(MachineInstr &MI, OpenRangesSet &OpenRanges,
const VarLocMap &VarLocIDs);
bool transferTerminatorInst(MachineInstr &MI, OpenRangesSet &OpenRanges,
VarLocInMBB &OutLocs, const VarLocMap &VarLocIDs);
bool transfer(MachineInstr &MI, OpenRangesSet &OpenRanges,
- VarLocInMBB &OutLocs, VarLocMap &VarLocIDs);
+ VarLocInMBB &OutLocs, VarLocMap &VarLocIDs, SpillMap &Spills,
+ bool transferSpills);
bool join(MachineBasicBlock &MBB, VarLocInMBB &OutLocs, VarLocInMBB &InLocs,
const VarLocMap &VarLocIDs,
@@ -282,6 +302,7 @@ void LiveDebugValues::getAnalysisUsage(AnalysisUsage &AU) const {
// Debug Range Extension Implementation
//===----------------------------------------------------------------------===//
+#ifndef NDEBUG
void LiveDebugValues::printVarLocInMBB(const MachineFunction &MF,
const VarLocInMBB &V,
const VarLocMap &VarLocIDs,
@@ -300,6 +321,22 @@ void LiveDebugValues::printVarLocInMBB(const MachineFunction &MF,
}
Out << "\n";
}
+#endif
+
+/// Given a spill instruction, extract the register and offset used to
+/// address the spill location in a target-independent way.
+int LiveDebugValues::extractSpillBaseRegAndOffset(const MachineInstr &MI,
+ unsigned &Reg) {
+ assert(MI.hasOneMemOperand() &&
+ "Spill instruction does not have exactly one memory operand?");
+ auto MMOI = MI.memoperands_begin();
+ const PseudoSourceValue *PVal = (*MMOI)->getPseudoValue();
+ assert(PVal->kind() == PseudoSourceValue::FixedStack &&
+ "Inconsistent memory operand in spill instruction");
+ int FI = cast<FixedStackPseudoSourceValue>(PVal)->getFrameIndex();
+ const MachineBasicBlock *MBB = MI.getParent();
+ return TFI->getFrameIndexReference(*MBB->getParent(), FI, Reg);
+}
/// End all previous ranges related to @MI and start a new range from @MI
/// if it is a DBG_VALUE instr.
@@ -336,8 +373,12 @@ void LiveDebugValues::transferRegisterDef(MachineInstr &MI,
unsigned SP = TLI->getStackPointerRegisterToSaveRestore();
SparseBitVector<> KillSet;
for (const MachineOperand &MO : MI.operands()) {
+ // Determine whether the operand is a register def. Assume that call
+ // instructions never clobber SP, because some backends (e.g., AArch64)
+ // never list SP in the regmask.
if (MO.isReg() && MO.isDef() && MO.getReg() &&
- TRI->isPhysicalRegister(MO.getReg())) {
+ TRI->isPhysicalRegister(MO.getReg()) &&
+ !(MI.isCall() && MO.getReg() == SP)) {
// Remove ranges of all aliased registers.
for (MCRegAliasIterator RAI(MO.getReg(), TRI, true); RAI.isValid(); ++RAI)
for (unsigned ID : OpenRanges.getVarLocs())
@@ -358,6 +399,91 @@ void LiveDebugValues::transferRegisterDef(MachineInstr &MI,
OpenRanges.erase(KillSet, VarLocIDs);
}
+/// Decide if @MI is a spill instruction and return true if it is. We use two
+/// criteria to make this decision:
+/// - Is this instruction a store to a spill slot?
+/// - Is there a register operand that is both used and killed?
+/// TODO: Store optimization can fold spills into other stores (including
+/// other spills). We do not handle this yet (more than one memory operand).
+bool LiveDebugValues::isSpillInstruction(const MachineInstr &MI,
+ MachineFunction *MF, unsigned &Reg) {
+ const MachineFrameInfo &FrameInfo = MF->getFrameInfo();
+ int FI;
+ const MachineMemOperand *MMO;
+
+ // TODO: Handle multiple stores folded into one.
+ if (!MI.hasOneMemOperand())
+ return false;
+
+ // To identify a spill instruction, use the same criteria as in AsmPrinter.
+ if (!((TII->isStoreToStackSlotPostFE(MI, FI) ||
+ TII->hasStoreToStackSlot(MI, MMO, FI)) &&
+ FrameInfo.isSpillSlotObjectIndex(FI)))
+ return false;
+
+ // In a spill instruction generated by the InlineSpiller the spilled register
+ // has its kill flag set. Return false if we don't find such a register.
+ Reg = 0;
+ for (const MachineOperand &MO : MI.operands()) {
+ if (MO.isReg() && MO.isUse() && MO.isKill()) {
+ Reg = MO.getReg();
+ break;
+ }
+ }
+ return Reg != 0;
+}
+
+/// A spilled register may indicate that we have to end the current range of
+/// a variable and create a new one for the spill location.
+/// We don't want to insert any instructions in transfer(), so we just create
+/// the DBG_VALUE without inserting it and keep track of it in @Spills.
+/// It will be inserted into the BB when we're done iterating over the
+/// instructions.
+void LiveDebugValues::transferSpillInst(MachineInstr &MI,
+ OpenRangesSet &OpenRanges,
+ VarLocMap &VarLocIDs,
+ SpillMap &Spills) {
+ unsigned Reg;
+ MachineFunction *MF = MI.getParent()->getParent();
+ if (!isSpillInstruction(MI, MF, Reg))
+ return;
+
+ // Check if the register is the location of a debug value.
+ for (unsigned ID : OpenRanges.getVarLocs()) {
+ if (VarLocIDs[ID].isDescribedByReg() == Reg) {
+ DEBUG(dbgs() << "Spilling Register " << PrintReg(Reg, TRI) << '('
+ << VarLocIDs[ID].Var.getVar()->getName() << ")\n");
+
+ // Create a DBG_VALUE instruction to describe the Var in its spilled
+ // location, but don't insert it yet to avoid invalidating the
+ // iterator in our caller.
+ unsigned SpillBase;
+ int SpillOffset = extractSpillBaseRegAndOffset(MI, SpillBase);
+ const MachineInstr *DMI = &VarLocIDs[ID].MI;
+ MachineInstr *SpDMI =
+ BuildMI(*MF, DMI->getDebugLoc(), DMI->getDesc(), true, SpillBase, 0,
+ DMI->getDebugVariable(), DMI->getDebugExpression());
+ SpDMI->getOperand(1).setImm(SpillOffset);
+ DEBUG(dbgs() << "Creating DBG_VALUE inst for spill: ";
+ SpDMI->print(dbgs(), false, TII));
+
+ // The newly created DBG_VALUE instruction SpDMI must be inserted after
+ // MI. Keep track of the pairing.
+ SpillDebugPair MIP = {&MI, SpDMI};
+ Spills.push_back(MIP);
+
+ // End all previous ranges of Var.
+ OpenRanges.erase(VarLocIDs[ID].Var);
+
+ // Add the VarLoc to OpenRanges.
+ VarLoc VL(*SpDMI, LS);
+ unsigned SpillLocID = VarLocIDs.insert(VL);
+ OpenRanges.insert(SpillLocID, VL.Var);
+ return;
+ }
+ }
+}
+
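The pairing through SpillMap exists because transferSpillInst runs while its caller is iterating over the block, so inserting the new DBG_VALUE immediately would invalidate the iterator. Reduced to a pattern sketch, with maybeBuildSpillDbgValue standing in (hypothetically) for the BuildMI logic above:

```cpp
// Record insertions during the walk, replay them afterwards.
SmallVector<std::pair<MachineInstr *, MachineInstr *>, 4> Pending;
for (MachineInstr &MI : MBB)
  if (MachineInstr *DV = maybeBuildSpillDbgValue(MI)) // hypothetical helper
    Pending.push_back({&MI, DV}); // defer: keeps the iterator valid
for (auto &P : Pending)
  MBB.insertAfter(MachineBasicBlock::iterator(*P.first), P.second);
```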
/// Terminate all open ranges at the end of the current basic block.
bool LiveDebugValues::transferTerminatorInst(MachineInstr &MI,
OpenRangesSet &OpenRanges,
@@ -383,10 +509,13 @@ bool LiveDebugValues::transferTerminatorInst(MachineInstr &MI,
/// This routine creates OpenRanges and OutLocs.
bool LiveDebugValues::transfer(MachineInstr &MI, OpenRangesSet &OpenRanges,
- VarLocInMBB &OutLocs, VarLocMap &VarLocIDs) {
+ VarLocInMBB &OutLocs, VarLocMap &VarLocIDs,
+ SpillMap &Spills, bool transferSpills) {
bool Changed = false;
transferDebugValue(MI, OpenRanges, VarLocIDs);
transferRegisterDef(MI, OpenRanges, VarLocIDs);
+ if (transferSpills)
+ transferSpillInst(MI, OpenRanges, VarLocIDs, Spills);
Changed = transferTerminatorInst(MI, OpenRanges, OutLocs, VarLocIDs);
return Changed;
}
@@ -475,10 +604,11 @@ bool LiveDebugValues::ExtendRanges(MachineFunction &MF) {
bool OLChanged = false;
bool MBBJoined = false;
- VarLocMap VarLocIDs; // Map VarLoc<>unique ID for use in bitvectors.
+ VarLocMap VarLocIDs; // Map VarLoc<>unique ID for use in bitvectors.
OpenRangesSet OpenRanges; // Ranges that are open until end of bb.
- VarLocInMBB OutLocs; // Ranges that exist beyond bb.
- VarLocInMBB InLocs; // Ranges that are incoming after joining.
+ VarLocInMBB OutLocs; // Ranges that exist beyond bb.
+ VarLocInMBB InLocs; // Ranges that are incoming after joining.
+ SpillMap Spills; // DBG_VALUEs associated with spills.
DenseMap<unsigned int, MachineBasicBlock *> OrderToBB;
DenseMap<MachineBasicBlock *, unsigned int> BBToOrder;
@@ -490,9 +620,14 @@ bool LiveDebugValues::ExtendRanges(MachineFunction &MF) {
Pending;
// Initialize every mbb with OutLocs.
+ // We are not looking at any spill instructions during the initial pass
+ // over the BBs. The LiveDebugVariables pass has already created DBG_VALUE
+ // instructions for spills of registers that are known to hold user variables
+ // within the BB in which the spill occurs.
for (auto &MBB : MF)
for (auto &MI : MBB)
- transfer(MI, OpenRanges, OutLocs, VarLocIDs);
+ transfer(MI, OpenRanges, OutLocs, VarLocIDs, Spills,
+ /*transferSpills=*/false);
DEBUG(printVarLocInMBB(MF, OutLocs, VarLocIDs, "OutLocs after initialization",
dbgs()));
@@ -524,8 +659,18 @@ bool LiveDebugValues::ExtendRanges(MachineFunction &MF) {
if (MBBJoined) {
MBBJoined = false;
Changed = true;
+ // Now that we have started to extend ranges across BBs we need to
+ // examine spill instructions to see whether they spill registers that
+ // correspond to user variables.
for (auto &MI : *MBB)
- OLChanged |= transfer(MI, OpenRanges, OutLocs, VarLocIDs);
+ OLChanged |= transfer(MI, OpenRanges, OutLocs, VarLocIDs, Spills,
+ /*transferSpills=*/true);
+
+ // Add any DBG_VALUE instructions necessitated by spills.
+ for (auto &SP : Spills)
+ MBB->insertAfter(MachineBasicBlock::iterator(*SP.SpillInst),
+ SP.DebugInst);
+ Spills.clear();
DEBUG(printVarLocInMBB(MF, OutLocs, VarLocIDs,
"OutLocs after propagating", dbgs()));
@@ -559,6 +704,7 @@ bool LiveDebugValues::runOnMachineFunction(MachineFunction &MF) {
TRI = MF.getSubtarget().getRegisterInfo();
TII = MF.getSubtarget().getInstrInfo();
+ TFI = MF.getSubtarget().getFrameLowering();
LS.initialize(MF);
bool Changed = ExtendRanges(MF);
diff --git a/lib/CodeGen/LiveDebugVariables.cpp b/lib/CodeGen/LiveDebugVariables.cpp
index 0934d8cfeaa1..bcf7c8e99c7f 100644
--- a/lib/CodeGen/LiveDebugVariables.cpp
+++ b/lib/CodeGen/LiveDebugVariables.cpp
@@ -944,7 +944,7 @@ void UserValue::insertDebugValue(MachineBasicBlock *MBB, SlotIndex Idx,
IsIndirect, Loc.getReg(), offset, Variable, Expression);
else
BuildMI(*MBB, I, getDebugLoc(), TII.get(TargetOpcode::DBG_VALUE))
- .addOperand(Loc)
+ .add(Loc)
.addImm(offset)
.addMetadata(Variable)
.addMetadata(Expression);
@@ -1005,7 +1005,7 @@ bool LiveDebugVariables::doInitialization(Module &M) {
return Pass::doInitialization(M);
}
-#ifndef NDEBUG
+#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
LLVM_DUMP_METHOD void LiveDebugVariables::dump() {
if (pImpl)
static_cast<LDVImpl*>(pImpl)->print(dbgs());
diff --git a/lib/CodeGen/LiveInterval.cpp b/lib/CodeGen/LiveInterval.cpp
index 623af492fcd4..9ef9f238fdce 100644
--- a/lib/CodeGen/LiveInterval.cpp
+++ b/lib/CodeGen/LiveInterval.cpp
@@ -863,6 +863,37 @@ void LiveInterval::clearSubRanges() {
SubRanges = nullptr;
}
+void LiveInterval::refineSubRanges(BumpPtrAllocator &Allocator,
+ LaneBitmask LaneMask, std::function<void(LiveInterval::SubRange&)> Apply) {
+
+ LaneBitmask ToApply = LaneMask;
+ for (SubRange &SR : subranges()) {
+ LaneBitmask SRMask = SR.LaneMask;
+ LaneBitmask Matching = SRMask & LaneMask;
+ if (Matching.none())
+ continue;
+
+ SubRange *MatchingRange;
+ if (SRMask == Matching) {
+ // The subrange fits (it does not cover bits outside \p LaneMask).
+ MatchingRange = &SR;
+ } else {
+ // We have to split the subrange into a matching and non-matching part.
+ // Reduce lanemask of existing lane to non-matching part.
+ SR.LaneMask = SRMask & ~Matching;
+ // Create a new subrange for the matching part
+ MatchingRange = createSubRangeFrom(Allocator, Matching, SR);
+ }
+ Apply(*MatchingRange);
+ ToApply &= ~Matching;
+ }
+ // Create a new subrange if there are uncovered bits left.
+ if (ToApply.any()) {
+ SubRange *NewRange = createSubRange(Allocator, ToApply);
+ Apply(*NewRange);
+ }
+}
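The lane-mask bookkeeping above is easier to follow on bare integers. A toy model of the same refinement (lane masks as uint32_t, subranges as a vector); note it deliberately avoids revisiting ranges created during the loop, mirroring the real code's single pass:

```cpp
#include <cstdint>
#include <functional>
#include <vector>

// Toy model only: each "subrange" is reduced to its lane mask.
void refineToy(std::vector<uint32_t> &Ranges, uint32_t LaneMask,
               const std::function<void(uint32_t)> &Apply) {
  uint32_t ToApply = LaneMask;
  const size_t E = Ranges.size(); // don't revisit ranges added below
  for (size_t I = 0; I != E; ++I) {
    uint32_t Matching = Ranges[I] & LaneMask;
    if (!Matching)
      continue;
    if (Matching != Ranges[I]) {
      Ranges[I] &= ~Matching;     // non-matching part stays in place
      Ranges.push_back(Matching); // matching part becomes a new range
    }
    Apply(Matching);
    ToApply &= ~Matching;
  }
  if (ToApply) { // lanes no existing range covered
    Ranges.push_back(ToApply);
    Apply(ToApply);
  }
}
```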
+
unsigned LiveInterval::getSize() const {
unsigned Sum = 0;
for (const Segment &S : segments)
@@ -1032,6 +1063,7 @@ void LiveInterval::verify(const MachineRegisterInfo *MRI) const {
// When they exist, Spills.back().start <= LastStart,
// and WriteI[-1].start <= LastStart.
+#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
void LiveRangeUpdater::print(raw_ostream &OS) const {
if (!isDirty()) {
if (LR)
@@ -1058,6 +1090,7 @@ void LiveRangeUpdater::print(raw_ostream &OS) const {
LLVM_DUMP_METHOD void LiveRangeUpdater::dump() const {
print(errs());
}
+#endif
// Determine if A and B should be coalesced.
static inline bool coalescable(const LiveRange::Segment &A,
diff --git a/lib/CodeGen/LiveIntervalAnalysis.cpp b/lib/CodeGen/LiveIntervalAnalysis.cpp
index 70d34838b237..3f5b8e19d1f0 100644
--- a/lib/CodeGen/LiveIntervalAnalysis.cpp
+++ b/lib/CodeGen/LiveIntervalAnalysis.cpp
@@ -7,10 +7,10 @@
//
//===----------------------------------------------------------------------===//
//
-// This file implements the LiveInterval analysis pass which is used
-// by the Linear Scan Register allocator. This pass linearizes the
-// basic blocks of the function in DFS order and computes live intervals for
-// each virtual and physical register.
+/// \file This file implements the LiveInterval analysis pass which is used
+/// by the Linear Scan Register allocator. This pass linearizes the
+/// basic blocks of the function in DFS order and computes live intervals for
+/// each virtual and physical register.
//
//===----------------------------------------------------------------------===//
@@ -96,16 +96,14 @@ void LiveIntervals::releaseMemory() {
RegMaskBits.clear();
RegMaskBlocks.clear();
- for (unsigned i = 0, e = RegUnitRanges.size(); i != e; ++i)
- delete RegUnitRanges[i];
+ for (LiveRange *LR : RegUnitRanges)
+ delete LR;
RegUnitRanges.clear();
// Release VNInfo memory regions, VNInfo objects don't need to be dtor'd.
VNInfoAllocator.Reset();
}
-/// runOnMachineFunction - calculates LiveIntervals
-///
bool LiveIntervals::runOnMachineFunction(MachineFunction &fn) {
MF = &fn;
MRI = &MF->getRegInfo();
@@ -135,14 +133,13 @@ bool LiveIntervals::runOnMachineFunction(MachineFunction &fn) {
return true;
}
-/// print - Implement the dump method.
void LiveIntervals::print(raw_ostream &OS, const Module* ) const {
OS << "********** INTERVALS **********\n";
// Dump the regunits.
- for (unsigned i = 0, e = RegUnitRanges.size(); i != e; ++i)
- if (LiveRange *LR = RegUnitRanges[i])
- OS << PrintRegUnit(i, TRI) << ' ' << *LR << '\n';
+ for (unsigned Unit = 0, UnitE = RegUnitRanges.size(); Unit != UnitE; ++Unit)
+ if (LiveRange *LR = RegUnitRanges[Unit])
+ OS << PrintRegUnit(Unit, TRI) << ' ' << *LR << '\n';
// Dump the virtregs.
for (unsigned i = 0, e = MRI->getNumVirtRegs(); i != e; ++i) {
@@ -152,8 +149,8 @@ void LiveIntervals::print(raw_ostream &OS, const Module* ) const {
}
OS << "RegMasks:";
- for (unsigned i = 0, e = RegMaskSlots.size(); i != e; ++i)
- OS << ' ' << RegMaskSlots[i];
+ for (SlotIndex Idx : RegMaskSlots)
+ OS << ' ' << Idx;
OS << '\n';
printInstrs(OS);
@@ -165,7 +162,7 @@ void LiveIntervals::printInstrs(raw_ostream &OS) const {
}
#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
-void LiveIntervals::dumpInstrs() const {
+LLVM_DUMP_METHOD void LiveIntervals::dumpInstrs() const {
printInstrs(dbgs());
}
#endif
@@ -177,8 +174,7 @@ LiveInterval* LiveIntervals::createInterval(unsigned reg) {
}
-/// computeVirtRegInterval - Compute the live interval of a virtual register,
-/// based on defs and uses.
+/// Compute the live interval of a virtual register, based on defs and uses.
void LiveIntervals::computeVirtRegInterval(LiveInterval &LI) {
assert(LRCalc && "LRCalc not initialized.");
assert(LI.empty() && "Should only compute empty intervals.");
@@ -200,7 +196,7 @@ void LiveIntervals::computeRegMasks() {
RegMaskBlocks.resize(MF->getNumBlockIDs());
// Find all instructions with regmask operands.
- for (MachineBasicBlock &MBB : *MF) {
+ for (const MachineBasicBlock &MBB : *MF) {
std::pair<unsigned, unsigned> &RMB = RegMaskBlocks[MBB.getNumber()];
RMB.first = RegMaskSlots.size();
@@ -210,7 +206,7 @@ void LiveIntervals::computeRegMasks() {
RegMaskBits.push_back(Mask);
}
- for (MachineInstr &MI : MBB) {
+ for (const MachineInstr &MI : MBB) {
for (const MachineOperand &MO : MI.operands()) {
if (!MO.isRegMask())
continue;
@@ -245,9 +241,9 @@ void LiveIntervals::computeRegMasks() {
// interference.
//
-/// computeRegUnitInterval - Compute the live range of a register unit, based
-/// on the uses and defs of aliasing registers. The range should be empty,
-/// or contain only dead phi-defs from ABI blocks.
+/// Compute the live range of a register unit, based on the uses and defs of
+/// aliasing registers. The range should be empty, or contain only dead
+/// phi-defs from ABI blocks.
void LiveIntervals::computeRegUnitRange(LiveRange &LR, unsigned Unit) {
assert(LRCalc && "LRCalc not initialized.");
LRCalc->reset(MF, getSlotIndexes(), DomTree, &getVNInfoAllocator());
@@ -257,22 +253,30 @@ void LiveIntervals::computeRegUnitRange(LiveRange &LR, unsigned Unit) {
// may share super-registers. That's OK because createDeadDefs() is
// idempotent. It is very rare for a register unit to have multiple roots, so
// uniquing super-registers is probably not worthwhile.
- for (MCRegUnitRootIterator Roots(Unit, TRI); Roots.isValid(); ++Roots) {
- for (MCSuperRegIterator Supers(*Roots, TRI, /*IncludeSelf=*/true);
- Supers.isValid(); ++Supers) {
- if (!MRI->reg_empty(*Supers))
- LRCalc->createDeadDefs(LR, *Supers);
+ bool IsReserved = true;
+ for (MCRegUnitRootIterator Root(Unit, TRI); Root.isValid(); ++Root) {
+ for (MCSuperRegIterator Super(*Root, TRI, /*IncludeSelf=*/true);
+ Super.isValid(); ++Super) {
+ unsigned Reg = *Super;
+ if (!MRI->reg_empty(Reg))
+ LRCalc->createDeadDefs(LR, Reg);
+ // A register unit is considered reserved if all its roots and all their
+ // super registers are reserved.
+ if (!MRI->isReserved(Reg))
+ IsReserved = false;
}
}
// Now extend LR to reach all uses.
// Ignore uses of reserved registers. We only track defs of those.
- for (MCRegUnitRootIterator Roots(Unit, TRI); Roots.isValid(); ++Roots) {
- for (MCSuperRegIterator Supers(*Roots, TRI, /*IncludeSelf=*/true);
- Supers.isValid(); ++Supers) {
- unsigned Reg = *Supers;
- if (!MRI->isReserved(Reg) && !MRI->reg_empty(Reg))
- LRCalc->extendToUses(LR, Reg);
+ if (!IsReserved) {
+ for (MCRegUnitRootIterator Root(Unit, TRI); Root.isValid(); ++Root) {
+ for (MCSuperRegIterator Super(*Root, TRI, /*IncludeSelf=*/true);
+ Super.isValid(); ++Super) {
+ unsigned Reg = *Super;
+ if (!MRI->reg_empty(Reg))
+ LRCalc->extendToUses(LR, Reg);
+ }
}
}
@@ -281,11 +285,9 @@ void LiveIntervals::computeRegUnitRange(LiveRange &LR, unsigned Unit) {
LR.flushSegmentSet();
}
-
-/// computeLiveInRegUnits - Precompute the live ranges of any register units
-/// that are live-in to an ABI block somewhere. Register values can appear
-/// without a corresponding def when entering the entry block or a landing pad.
-///
+/// Precompute the live ranges of any register units that are live-in to an ABI
+/// block somewhere. Register values can appear without a corresponding def when
+/// entering the entry block or a landing pad.
void LiveIntervals::computeLiveInRegUnits() {
RegUnitRanges.resize(TRI->getNumRegUnits());
DEBUG(dbgs() << "Computing live-in reg-units in ABI blocks.\n");
@@ -294,18 +296,15 @@ void LiveIntervals::computeLiveInRegUnits() {
SmallVector<unsigned, 8> NewRanges;
// Check all basic blocks for live-ins.
- for (MachineFunction::const_iterator MFI = MF->begin(), MFE = MF->end();
- MFI != MFE; ++MFI) {
- const MachineBasicBlock *MBB = &*MFI;
-
+ for (const MachineBasicBlock &MBB : *MF) {
// We only care about ABI blocks: Entry + landing pads.
- if ((MFI != MF->begin() && !MBB->isEHPad()) || MBB->livein_empty())
+ if ((&MBB != &MF->front() && !MBB.isEHPad()) || MBB.livein_empty())
continue;
// Create phi-defs at Begin for all live-in registers.
- SlotIndex Begin = Indexes->getMBBStartIdx(MBB);
- DEBUG(dbgs() << Begin << "\tBB#" << MBB->getNumber());
- for (const auto &LI : MBB->liveins()) {
+ SlotIndex Begin = Indexes->getMBBStartIdx(&MBB);
+ DEBUG(dbgs() << Begin << "\tBB#" << MBB.getNumber());
+ for (const auto &LI : MBB.liveins()) {
for (MCRegUnitIterator Units(LI.PhysReg, TRI); Units.isValid(); ++Units) {
unsigned Unit = *Units;
LiveRange *LR = RegUnitRanges[Unit];
@@ -324,16 +323,13 @@ void LiveIntervals::computeLiveInRegUnits() {
DEBUG(dbgs() << "Created " << NewRanges.size() << " new intervals.\n");
// Compute the 'normal' part of the ranges.
- for (unsigned i = 0, e = NewRanges.size(); i != e; ++i) {
- unsigned Unit = NewRanges[i];
+ for (unsigned Unit : NewRanges)
computeRegUnitRange(*RegUnitRanges[Unit], Unit);
- }
}
-
static void createSegmentsForValues(LiveRange &LR,
- iterator_range<LiveInterval::vni_iterator> VNIs) {
- for (auto VNI : VNIs) {
+ iterator_range<LiveInterval::vni_iterator> VNIs) {
+ for (VNInfo *VNI : VNIs) {
if (VNI->isUnused())
continue;
SlotIndex Def = VNI->def;
@@ -349,7 +345,7 @@ static void extendSegmentsToUses(LiveRange &LR, const SlotIndexes &Indexes,
// Keep track of the PHIs that are in use.
SmallPtrSet<VNInfo*, 8> UsedPHIs;
// Blocks that have already been added to WorkList as live-out.
- SmallPtrSet<MachineBasicBlock*, 16> LiveOut;
+ SmallPtrSet<const MachineBasicBlock*, 16> LiveOut;
// Extend intervals to reach all uses in WorkList.
while (!WorkList.empty()) {
@@ -368,7 +364,7 @@ static void extendSegmentsToUses(LiveRange &LR, const SlotIndexes &Indexes,
!UsedPHIs.insert(VNI).second)
continue;
// The PHI is live, make sure the predecessors are live-out.
- for (auto &Pred : MBB->predecessors()) {
+ for (const MachineBasicBlock *Pred : MBB->predecessors()) {
if (!LiveOut.insert(Pred).second)
continue;
SlotIndex Stop = Indexes.getMBBEndIdx(Pred);
@@ -384,7 +380,7 @@ static void extendSegmentsToUses(LiveRange &LR, const SlotIndexes &Indexes,
LR.addSegment(LiveRange::Segment(BlockStart, Idx, VNI));
// Make sure VNI is live-out from the predecessors.
- for (auto &Pred : MBB->predecessors()) {
+ for (const MachineBasicBlock *Pred : MBB->predecessors()) {
if (!LiveOut.insert(Pred).second)
continue;
SlotIndex Stop = Indexes.getMBBEndIdx(Pred);
@@ -415,22 +411,20 @@ bool LiveIntervals::shrinkToUses(LiveInterval *li,
ShrinkToUsesWorkList WorkList;
// Visit all instructions reading li->reg.
- for (MachineRegisterInfo::reg_instr_iterator
- I = MRI->reg_instr_begin(li->reg), E = MRI->reg_instr_end();
- I != E; ) {
- MachineInstr *UseMI = &*(I++);
- if (UseMI->isDebugValue() || !UseMI->readsVirtualRegister(li->reg))
+ unsigned Reg = li->reg;
+ for (MachineInstr &UseMI : MRI->reg_instructions(Reg)) {
+ if (UseMI.isDebugValue() || !UseMI.readsVirtualRegister(Reg))
continue;
- SlotIndex Idx = getInstructionIndex(*UseMI).getRegSlot();
+ SlotIndex Idx = getInstructionIndex(UseMI).getRegSlot();
LiveQueryResult LRQ = li->Query(Idx);
VNInfo *VNI = LRQ.valueIn();
if (!VNI) {
// This shouldn't happen: readsVirtualRegister returns true, but there is
// no live value. It is likely caused by a target getting <undef> flags
// wrong.
- DEBUG(dbgs() << Idx << '\t' << *UseMI
+ DEBUG(dbgs() << Idx << '\t' << UseMI
<< "Warning: Instr claims to read non-existent value in "
- << *li << '\n');
+ << *li << '\n');
continue;
}
// Special case: An early-clobber tied operand reads and writes the
@@ -458,7 +452,7 @@ bool LiveIntervals::shrinkToUses(LiveInterval *li,
bool LiveIntervals::computeDeadValues(LiveInterval &LI,
SmallVectorImpl<MachineInstr*> *dead) {
bool MayHaveSplitComponents = false;
- for (auto VNI : LI.valnos) {
+ for (VNInfo *VNI : LI.valnos) {
if (VNI->isUnused())
continue;
SlotIndex Def = VNI->def;
@@ -548,7 +542,7 @@ void LiveIntervals::shrinkToUses(LiveInterval::SubRange &SR, unsigned Reg) {
SR.segments.swap(NewLR.segments);
// Remove dead PHI value numbers
- for (auto VNI : SR.valnos) {
+ for (VNInfo *VNI : SR.valnos) {
if (VNI->isUnused())
continue;
const LiveRange::Segment *Segment = SR.getSegmentContaining(VNI->def);
@@ -571,8 +565,8 @@ void LiveIntervals::extendToIndices(LiveRange &LR,
ArrayRef<SlotIndex> Undefs) {
assert(LRCalc && "LRCalc not initialized.");
LRCalc->reset(MF, getSlotIndexes(), DomTree, &getVNInfoAllocator());
- for (unsigned i = 0, e = Indices.size(); i != e; ++i)
- LRCalc->extend(LR, Indices[i], /*PhysReg=*/0, Undefs);
+ for (SlotIndex Idx : Indices)
+ LRCalc->extend(LR, Idx, /*PhysReg=*/0, Undefs);
}
void LiveIntervals::pruneValue(LiveRange &LR, SlotIndex Kill,
@@ -601,11 +595,9 @@ void LiveIntervals::pruneValue(LiveRange &LR, SlotIndex Kill,
// from each successor.
typedef df_iterator_default_set<MachineBasicBlock*,9> VisitedTy;
VisitedTy Visited;
- for (MachineBasicBlock::succ_iterator
- SuccI = KillMBB->succ_begin(), SuccE = KillMBB->succ_end();
- SuccI != SuccE; ++SuccI) {
+ for (MachineBasicBlock *Succ : KillMBB->successors()) {
for (df_ext_iterator<MachineBasicBlock*, VisitedTy>
- I = df_ext_begin(*SuccI, Visited), E = df_ext_end(*SuccI, Visited);
+ I = df_ext_begin(Succ, Visited), E = df_ext_end(Succ, Visited);
I != E;) {
MachineBasicBlock *MBB = *I;
@@ -657,9 +649,9 @@ void LiveIntervals::addKillFlags(const VirtRegMap *VRM) {
// Find the regunit intervals for the assigned register. They may overlap
// the virtual register live range, cancelling any kills.
RU.clear();
- for (MCRegUnitIterator Units(VRM->getPhys(Reg), TRI); Units.isValid();
- ++Units) {
- const LiveRange &RURange = getRegUnit(*Units);
+ for (MCRegUnitIterator Unit(VRM->getPhys(Reg), TRI); Unit.isValid();
+ ++Unit) {
+ const LiveRange &RURange = getRegUnit(*Unit);
if (RURange.empty())
continue;
RU.push_back(std::make_pair(&RURange, RURange.find(LI.begin()->end)));
@@ -802,9 +794,8 @@ LiveIntervals::hasPHIKill(const LiveInterval &LI, const VNInfo *VNI) const {
// Conservatively return true instead of scanning huge predecessor lists.
if (PHIMBB->pred_size() > 100)
return true;
- for (MachineBasicBlock::const_pred_iterator
- PI = PHIMBB->pred_begin(), PE = PHIMBB->pred_end(); PI != PE; ++PI)
- if (VNI == LI.getVNInfoBefore(Indexes->getMBBEndIdx(*PI)))
+ for (const MachineBasicBlock *Pred : PHIMBB->predecessors())
+ if (VNI == LI.getVNInfoBefore(Indexes->getMBBEndIdx(Pred)))
return true;
}
return false;
@@ -895,7 +886,7 @@ bool LiveIntervals::checkRegMaskInterference(LiveInterval &LI,
// IntervalUpdate class.
//===----------------------------------------------------------------------===//
-// HMEditor is a toolkit used by handleMove to trim or extend live intervals.
+/// Toolkit used by handleMove to trim or extend live intervals.
class LiveIntervals::HMEditor {
private:
LiveIntervals& LIS;
@@ -1241,10 +1232,12 @@ private:
LiveRange::iterator NewIdxIn = NewIdxOut;
assert(NewIdxIn == LR.find(NewIdx.getBaseIndex()));
const SlotIndex SplitPos = NewIdxDef;
+ OldIdxVNI = OldIdxIn->valno;
// Merge the OldIdxIn and OldIdxOut segments into OldIdxOut.
+ OldIdxOut->valno->def = OldIdxIn->start;
*OldIdxOut = LiveRange::Segment(OldIdxIn->start, OldIdxOut->end,
- OldIdxIn->valno);
+ OldIdxOut->valno);
// OldIdxIn and OldIdxVNI are now undef and can be overridden.
// We Slide [NewIdxIn, OldIdxIn) down one position.
// |- X0/NewIdxIn -| ... |- Xn-1 -||- Xn/OldIdxIn -||- OldIdxOut -|
@@ -1514,8 +1507,7 @@ LiveIntervals::repairIntervalsInRange(MachineBasicBlock *MBB,
}
}
- for (unsigned i = 0, e = OrigRegs.size(); i != e; ++i) {
- unsigned Reg = OrigRegs[i];
+ for (unsigned Reg : OrigRegs) {
if (!TargetRegisterInfo::isVirtualRegister(Reg))
continue;
@@ -1524,16 +1516,16 @@ LiveIntervals::repairIntervalsInRange(MachineBasicBlock *MBB,
if (!LI.hasAtLeastOneValue())
continue;
- for (LiveInterval::SubRange &S : LI.subranges()) {
+ for (LiveInterval::SubRange &S : LI.subranges())
repairOldRegInRange(Begin, End, endIdx, S, Reg, S.LaneMask);
- }
+
repairOldRegInRange(Begin, End, endIdx, LI, Reg);
}
}
void LiveIntervals::removePhysRegDefAt(unsigned Reg, SlotIndex Pos) {
- for (MCRegUnitIterator Units(Reg, TRI); Units.isValid(); ++Units) {
- if (LiveRange *LR = getCachedRegUnit(*Units))
+ for (MCRegUnitIterator Unit(Reg, TRI); Unit.isValid(); ++Unit) {
+ if (LiveRange *LR = getCachedRegUnit(*Unit))
if (VNInfo *VNI = LR->getVNInfoAt(Pos))
LR->removeValNo(VNI);
}
diff --git a/lib/CodeGen/LiveIntervalUnion.cpp b/lib/CodeGen/LiveIntervalUnion.cpp
index fc2f233f6d68..b4aa0dc326a5 100644
--- a/lib/CodeGen/LiveIntervalUnion.cpp
+++ b/lib/CodeGen/LiveIntervalUnion.cpp
@@ -1,4 +1,4 @@
-//===-- LiveIntervalUnion.cpp - Live interval union data structure --------===//
+//===- LiveIntervalUnion.cpp - Live interval union data structure ---------===//
//
// The LLVM Compiler Infrastructure
//
@@ -13,19 +13,19 @@
//
//===----------------------------------------------------------------------===//
-#include "llvm/CodeGen/LiveIntervalUnion.h"
-#include "llvm/ADT/STLExtras.h"
#include "llvm/ADT/SparseBitVector.h"
-#include "llvm/Support/Debug.h"
+#include "llvm/ADT/STLExtras.h"
+#include "llvm/CodeGen/LiveInterval.h"
+#include "llvm/CodeGen/LiveIntervalUnion.h"
#include "llvm/Support/raw_ostream.h"
#include "llvm/Target/TargetRegisterInfo.h"
-#include <algorithm>
+#include <cassert>
+#include <cstdlib>
using namespace llvm;
#define DEBUG_TYPE "regalloc"
-
// Merge a LiveInterval's segments. Guarantee no overlaps.
void LiveIntervalUnion::unify(LiveInterval &VirtReg, const LiveRange &Range) {
if (Range.empty())
@@ -64,7 +64,7 @@ void LiveIntervalUnion::extract(LiveInterval &VirtReg, const LiveRange &Range) {
LiveRange::const_iterator RegEnd = Range.end();
SegmentIter SegPos = Segments.find(RegPos->start);
- for (;;) {
+ while (true) {
assert(SegPos.value() == &VirtReg && "Inconsistent LiveInterval");
SegPos.erase();
if (!SegPos.valid())
@@ -126,25 +126,24 @@ collectInterferingVRegs(unsigned MaxInterferingRegs) {
CheckedFirstInterference = true;
// Quickly skip interference check for empty sets.
- if (VirtReg->empty() || LiveUnion->empty()) {
+ if (LR->empty() || LiveUnion->empty()) {
SeenAllInterferences = true;
return 0;
}
- // In most cases, the union will start before VirtReg.
- VirtRegI = VirtReg->begin();
+ // In most cases, the union will start before LR.
+ LRI = LR->begin();
LiveUnionI.setMap(LiveUnion->getMap());
- LiveUnionI.find(VirtRegI->start);
+ LiveUnionI.find(LRI->start);
}
- LiveInterval::iterator VirtRegEnd = VirtReg->end();
+ LiveRange::const_iterator LREnd = LR->end();
LiveInterval *RecentReg = nullptr;
while (LiveUnionI.valid()) {
- assert(VirtRegI != VirtRegEnd && "Reached end of VirtReg");
+ assert(LRI != LREnd && "Reached end of LR");
// Check for overlapping interference.
- while (VirtRegI->start < LiveUnionI.stop() &&
- VirtRegI->end > LiveUnionI.start()) {
+ while (LRI->start < LiveUnionI.stop() && LRI->end > LiveUnionI.start()) {
// This is an overlap, record the interfering register.
LiveInterval *VReg = LiveUnionI.value();
if (VReg != RecentReg && !isSeenInterference(VReg)) {
@@ -161,20 +160,20 @@ collectInterferingVRegs(unsigned MaxInterferingRegs) {
}
// The iterators are now not overlapping, LiveUnionI has been advanced
- // beyond VirtRegI.
- assert(VirtRegI->end <= LiveUnionI.start() && "Expected non-overlap");
+ // beyond LRI.
+ assert(LRI->end <= LiveUnionI.start() && "Expected non-overlap");
// Advance the iterator that ends first.
- VirtRegI = VirtReg->advanceTo(VirtRegI, LiveUnionI.start());
- if (VirtRegI == VirtRegEnd)
+ LRI = LR->advanceTo(LRI, LiveUnionI.start());
+ if (LRI == LREnd)
break;
// Detect overlap, handle above.
- if (VirtRegI->start < LiveUnionI.stop())
+ if (LRI->start < LiveUnionI.stop())
continue;
// Still not overlapping. Catch up LiveUnionI.
- LiveUnionI.advanceTo(VirtRegI->start);
+ LiveUnionI.advanceTo(LRI->start);
}
SeenAllInterferences = true;
return InterferingVRegs.size();
diff --git a/lib/CodeGen/LivePhysRegs.cpp b/lib/CodeGen/LivePhysRegs.cpp
index dcc41c1718a6..9f7d7cf54848 100644
--- a/lib/CodeGen/LivePhysRegs.cpp
+++ b/lib/CodeGen/LivePhysRegs.cpp
@@ -120,12 +120,11 @@ void LivePhysRegs::print(raw_ostream &OS) const {
OS << "\n";
}
-/// Dumps the currently live registers to the debug output.
-LLVM_DUMP_METHOD void LivePhysRegs::dump() const {
#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
+LLVM_DUMP_METHOD void LivePhysRegs::dump() const {
dbgs() << " " << *this;
-#endif
}
+#endif
bool LivePhysRegs::available(const MachineRegisterInfo &MRI,
unsigned Reg) const {
@@ -161,7 +160,9 @@ void LivePhysRegs::addBlockLiveIns(const MachineBasicBlock &MBB) {
static void addPristines(LivePhysRegs &LiveRegs, const MachineFunction &MF,
const MachineFrameInfo &MFI,
const TargetRegisterInfo &TRI) {
- for (const MCPhysReg *CSR = TRI.getCalleeSavedRegs(&MF); CSR && *CSR; ++CSR)
+ const MachineRegisterInfo &MRI = MF.getRegInfo();
+ for (const MCPhysReg *CSR = MRI.getCalleeSavedRegs(); CSR && *CSR; ++CSR)
LiveRegs.addReg(*CSR);
for (const CalleeSavedInfo &Info : MFI.getCalleeSavedInfo())
LiveRegs.removeReg(Info.getReg());
@@ -180,7 +181,8 @@ void LivePhysRegs::addLiveOuts(const MachineBasicBlock &MBB) {
if (MBB.isReturnBlock()) {
// The return block has no successors whose live-ins we could merge
// below. So instead we add the callee saved registers manually.
- for (const MCPhysReg *I = TRI->getCalleeSavedRegs(&MF); *I; ++I)
+ const MachineRegisterInfo &MRI = MF.getRegInfo();
+ for (const MCPhysReg *I = MRI.getCalleeSavedRegs(); *I; ++I)
addReg(*I);
} else {
addPristines(*this, MF, MFI, *TRI);
diff --git a/lib/CodeGen/LiveRangeCalc.cpp b/lib/CodeGen/LiveRangeCalc.cpp
index 012837608628..398066bf8903 100644
--- a/lib/CodeGen/LiveRangeCalc.cpp
+++ b/lib/CodeGen/LiveRangeCalc.cpp
@@ -75,34 +75,11 @@ void LiveRangeCalc::calculate(LiveInterval &LI, bool TrackSubRegs) {
LI.createSubRangeFrom(*Alloc, ClassMask, LI);
}
- LaneBitmask Mask = SubMask;
- for (LiveInterval::SubRange &S : LI.subranges()) {
- // A Mask for subregs common to the existing subrange and current def.
- LaneBitmask Common = S.LaneMask & Mask;
- if (Common.none())
- continue;
- LiveInterval::SubRange *CommonRange;
- // A Mask for subregs covered by the subrange but not the current def.
- LaneBitmask RM = S.LaneMask & ~Mask;
- if (RM.any()) {
- // Split the subrange S into two parts: one covered by the current
- // def (CommonRange), and the one not affected by it (updated S).
- S.LaneMask = RM;
- CommonRange = LI.createSubRangeFrom(*Alloc, Common, S);
- } else {
- assert(Common == S.LaneMask);
- CommonRange = &S;
- }
+ LI.refineSubRanges(*Alloc, SubMask,
+ [&MO, this](LiveInterval::SubRange &SR) {
if (MO.isDef())
- createDeadDef(*Indexes, *Alloc, *CommonRange, MO);
- Mask &= ~Common;
- }
- // Create a new SubRange for subregs we did not cover yet.
- if (Mask.any()) {
- LiveInterval::SubRange *NewRange = LI.createSubRange(*Alloc, Mask);
- if (MO.isDef())
- createDeadDef(*Indexes, *Alloc, *NewRange, MO);
- }
+ createDeadDef(*Indexes, *Alloc, SR, MO);
+ });
}
// Create the def in the main liverange. We do not have to do this if
@@ -289,8 +266,7 @@ bool LiveRangeCalc::isDefOnEntry(LiveRange &LR, ArrayRef<SlotIndex> Undefs,
if (UndefOnEntry[BN])
return false;
- auto MarkDefined =
- [this,BN,&DefOnEntry,&UndefOnEntry] (MachineBasicBlock &B) -> bool {
+ auto MarkDefined = [BN, &DefOnEntry](MachineBasicBlock &B) -> bool {
for (MachineBasicBlock *S : B.successors())
DefOnEntry[S->getNumber()] = true;
DefOnEntry[BN] = true;
@@ -311,7 +287,12 @@ bool LiveRangeCalc::isDefOnEntry(LiveRange &LR, ArrayRef<SlotIndex> Undefs,
return MarkDefined(B);
SlotIndex Begin, End;
std::tie(Begin, End) = Indexes->getMBBRange(&B);
- LiveRange::iterator UB = std::upper_bound(LR.begin(), LR.end(), End);
+ // Treat End as not belonging to B.
+ // If LR has a segment S that starts at the next block, i.e. [End, ...),
+ // std::upper_bound will return the segment following S. Instead,
+ // S should be treated as the first segment that does not overlap B.
+ LiveRange::iterator UB = std::upper_bound(LR.begin(), LR.end(),
+ End.getPrevSlot());
if (UB != LR.begin()) {
LiveRange::Segment &Seg = *std::prev(UB);
if (Seg.end > Begin) {
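The off-by-one the new comment describes can be reproduced with plain integers standing in for slot indexes: searching for End itself steps past a segment that starts exactly at End, while searching for the previous slot keeps that segment in view as the first non-overlapping one.

```cpp
#include <algorithm>
#include <cassert>
#include <vector>

int main() {
  std::vector<int> SegStarts = {0, 10, 20}; // segment start points
  int End = 20; // first index after block B, not itself part of B
  // upper_bound(End) lands past the segment starting at 20 ...
  auto PastIt = std::upper_bound(SegStarts.begin(), SegStarts.end(), End);
  // ... upper_bound(End - 1) lands on it, which is what the fix wants.
  auto AtIt = std::upper_bound(SegStarts.begin(), SegStarts.end(), End - 1);
  assert(PastIt - AtIt == 1);
  return 0;
}
```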
diff --git a/lib/CodeGen/LiveRangeEdit.cpp b/lib/CodeGen/LiveRangeEdit.cpp
index 7f1c69c0b4a2..92cca1a54951 100644
--- a/lib/CodeGen/LiveRangeEdit.cpp
+++ b/lib/CodeGen/LiveRangeEdit.cpp
@@ -37,6 +37,8 @@ LiveInterval &LiveRangeEdit::createEmptyIntervalFrom(unsigned OldReg) {
VRM->setIsSplitFromReg(VReg, VRM->getOriginal(OldReg));
}
LiveInterval &LI = LIS.createEmptyInterval(VReg);
+ if (Parent && !Parent->isSpillable())
+ LI.markNotSpillable();
// Create empty subranges if the OldReg's interval has them. Do not create
// the main range here---it will be constructed later after the subranges
// have been finalized.
@@ -52,6 +54,14 @@ unsigned LiveRangeEdit::createFrom(unsigned OldReg) {
if (VRM) {
VRM->setIsSplitFromReg(VReg, VRM->getOriginal(OldReg));
}
+ // FIXME: Getting the interval here actually computes it.
+ // In theory, this may not be what we want, but in practice
+ // the createEmptyIntervalFrom API is used when this is not
+ // the case. Generally speaking we just want to annotate the
+ // LiveInterval when it gets created but we cannot do that at
+ // the moment.
+ if (Parent && !Parent->isSpillable())
+ LIS.getInterval(VReg).markNotSpillable();
return VReg;
}
@@ -442,9 +452,6 @@ LiveRangeEdit::MRI_NoteNewVirtualRegister(unsigned VReg)
if (VRM)
VRM->grow();
- if (Parent && !Parent->isSpillable())
- LIS.getInterval(VReg).markNotSpillable();
-
NewRegs.push_back(VReg);
}
diff --git a/lib/CodeGen/LiveRegMatrix.cpp b/lib/CodeGen/LiveRegMatrix.cpp
index 7a51386aa9ca..882de1a3fad9 100644
--- a/lib/CodeGen/LiveRegMatrix.cpp
+++ b/lib/CodeGen/LiveRegMatrix.cpp
@@ -1,4 +1,4 @@
-//===-- LiveRegMatrix.cpp - Track register interference -------------------===//
+//===- LiveRegMatrix.cpp - Track register interference --------------------===//
//
// The LLVM Compiler Infrastructure
//
@@ -11,15 +11,22 @@
//
//===----------------------------------------------------------------------===//
-#include "llvm/CodeGen/LiveRegMatrix.h"
#include "RegisterCoalescer.h"
#include "llvm/ADT/Statistic.h"
+#include "llvm/CodeGen/LiveInterval.h"
#include "llvm/CodeGen/LiveIntervalAnalysis.h"
+#include "llvm/CodeGen/LiveRegMatrix.h"
#include "llvm/CodeGen/VirtRegMap.h"
+#include "llvm/CodeGen/LiveIntervalUnion.h"
+#include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/Pass.h"
+#include "llvm/MC/LaneBitmask.h"
+#include "llvm/MC/MCRegisterInfo.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/raw_ostream.h"
#include "llvm/Target/TargetRegisterInfo.h"
#include "llvm/Target/TargetSubtargetInfo.h"
+#include <cassert>
using namespace llvm;
@@ -36,8 +43,7 @@ INITIALIZE_PASS_DEPENDENCY(VirtRegMap)
INITIALIZE_PASS_END(LiveRegMatrix, "liveregmatrix",
"Live Register Matrix", false, false)
-LiveRegMatrix::LiveRegMatrix() : MachineFunctionPass(ID),
- UserTag(0), RegMaskTag(0), RegMaskVirtReg(0) {}
+LiveRegMatrix::LiveRegMatrix() : MachineFunctionPass(ID) {}
void LiveRegMatrix::getAnalysisUsage(AnalysisUsage &AU) const {
AU.setPreservesAll();
@@ -169,10 +175,10 @@ bool LiveRegMatrix::checkRegUnitInterference(LiveInterval &VirtReg,
return Result;
}
-LiveIntervalUnion::Query &LiveRegMatrix::query(LiveInterval &VirtReg,
+LiveIntervalUnion::Query &LiveRegMatrix::query(const LiveRange &LR,
unsigned RegUnit) {
LiveIntervalUnion::Query &Q = Queries[RegUnit];
- Q.init(UserTag, &VirtReg, &Matrix[RegUnit]);
+ Q.init(UserTag, LR, Matrix[RegUnit]);
return Q;
}
@@ -190,9 +196,12 @@ LiveRegMatrix::checkInterference(LiveInterval &VirtReg, unsigned PhysReg) {
return IK_RegUnit;
// Check the matrix for virtual register interference.
- for (MCRegUnitIterator Units(PhysReg, TRI); Units.isValid(); ++Units)
- if (query(VirtReg, *Units).checkInterference())
- return IK_VirtReg;
+ bool Interference = foreachUnit(TRI, VirtReg, PhysReg,
+ [&](unsigned Unit, const LiveRange &LR) {
+ return query(LR, Unit).checkInterference();
+ });
+ if (Interference)
+ return IK_VirtReg;
return IK_Free;
}
diff --git a/lib/CodeGen/LiveRegUnits.cpp b/lib/CodeGen/LiveRegUnits.cpp
new file mode 100644
index 000000000000..dff555f49565
--- /dev/null
+++ b/lib/CodeGen/LiveRegUnits.cpp
@@ -0,0 +1,126 @@
+//===- LiveRegUnits.cpp - Register Unit Set -------------------------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+/// \file This file implements the LiveRegUnits set.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/CodeGen/LiveRegUnits.h"
+#include "llvm/CodeGen/MachineBasicBlock.h"
+#include "llvm/CodeGen/MachineFrameInfo.h"
+#include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/CodeGen/MachineInstrBundle.h"
+#include "llvm/CodeGen/MachineOperand.h"
+#include "llvm/MC/MCRegisterInfo.h"
+#include "llvm/Target/TargetRegisterInfo.h"
+
+using namespace llvm;
+
+void LiveRegUnits::removeRegsNotPreserved(const uint32_t *RegMask) {
+ for (unsigned U = 0, E = TRI->getNumRegUnits(); U != E; ++U) {
+ for (MCRegUnitRootIterator RootReg(U, TRI); RootReg.isValid(); ++RootReg) {
+ if (MachineOperand::clobbersPhysReg(RegMask, *RootReg))
+ Units.reset(U);
+ }
+ }
+}
+
+void LiveRegUnits::addRegsInMask(const uint32_t *RegMask) {
+ for (unsigned U = 0, E = TRI->getNumRegUnits(); U != E; ++U) {
+ for (MCRegUnitRootIterator RootReg(U, TRI); RootReg.isValid(); ++RootReg) {
+ if (MachineOperand::clobbersPhysReg(RegMask, *RootReg))
+ Units.set(U);
+ }
+ }
+}
+
+void LiveRegUnits::stepBackward(const MachineInstr &MI) {
+ // Remove defined registers and regmask kills from the set.
+ for (ConstMIBundleOperands O(MI); O.isValid(); ++O) {
+ if (O->isReg()) {
+ if (!O->isDef())
+ continue;
+ unsigned Reg = O->getReg();
+ if (!TargetRegisterInfo::isPhysicalRegister(Reg))
+ continue;
+ removeReg(Reg);
+ } else if (O->isRegMask())
+ removeRegsNotPreserved(O->getRegMask());
+ }
+
+ // Add uses to the set.
+ for (ConstMIBundleOperands O(MI); O.isValid(); ++O) {
+ if (!O->isReg() || !O->readsReg())
+ continue;
+ unsigned Reg = O->getReg();
+ if (!TargetRegisterInfo::isPhysicalRegister(Reg))
+ continue;
+ addReg(Reg);
+ }
+}
+
+void LiveRegUnits::accumulateBackward(const MachineInstr &MI) {
+ // Add defs, uses and regmask clobbers to the set.
+ for (ConstMIBundleOperands O(MI); O.isValid(); ++O) {
+ if (O->isReg()) {
+ unsigned Reg = O->getReg();
+ if (!TargetRegisterInfo::isPhysicalRegister(Reg))
+ continue;
+ if (!O->isDef() && !O->readsReg())
+ continue;
+ addReg(Reg);
+ } else if (O->isRegMask())
+ addRegsInMask(O->getRegMask());
+ }
+}
+
+/// Add live-in registers of basic block \p MBB to \p LiveUnits.
+static void addLiveIns(LiveRegUnits &LiveUnits, const MachineBasicBlock &MBB) {
+ for (const auto &LI : MBB.liveins())
+ LiveUnits.addRegMasked(LI.PhysReg, LI.LaneMask);
+}
+
+static void addLiveOuts(LiveRegUnits &LiveUnits, const MachineBasicBlock &MBB) {
+ // To get the live-outs we simply merge the live-ins of all successors.
+ for (const MachineBasicBlock *Succ : MBB.successors())
+ addLiveIns(LiveUnits, *Succ);
+}
+
+/// Remove the callee-saved registers that were actually saved, as recorded
+/// in \p MFI, from \p LiveUnits; what remains of a full callee-saved set are
+/// the pristine registers.
+static void removeSavedRegs(LiveRegUnits &LiveUnits, const MachineFunction &MF,
+ const MachineFrameInfo &MFI,
+ const TargetRegisterInfo &TRI) {
+ for (const CalleeSavedInfo &Info : MFI.getCalleeSavedInfo())
+ LiveUnits.removeReg(Info.getReg());
+}
+
+void LiveRegUnits::addLiveOuts(const MachineBasicBlock &MBB) {
+ const MachineFunction &MF = *MBB.getParent();
+ const MachineFrameInfo &MFI = MF.getFrameInfo();
+ if (MFI.isCalleeSavedInfoValid()) {
+ for (const MCPhysReg *I = TRI->getCalleeSavedRegs(&MF); *I; ++I)
+ addReg(*I);
+ if (!MBB.isReturnBlock())
+ removeSavedRegs(*this, MF, MFI, *TRI);
+ }
+ ::addLiveOuts(*this, MBB);
+}
+
+void LiveRegUnits::addLiveIns(const MachineBasicBlock &MBB) {
+ const MachineFunction &MF = *MBB.getParent();
+ const MachineFrameInfo &MFI = MF.getFrameInfo();
+ if (MFI.isCalleeSavedInfoValid()) {
+ for (const MCPhysReg *I = TRI->getCalleeSavedRegs(&MF); *I; ++I)
+ addReg(*I);
+ if (&MBB != &MF.front())
+ removeSavedRegs(*this, MF, MFI, *TRI);
+ }
+ ::addLiveIns(*this, MBB);
+}
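A hedged usage sketch for the new set, assuming the available() query from the accompanying header: seed with the block's live-outs, then step backwards to the point of interest.

```cpp
// Sketch only. True if no unit of Reg is live just before Before.
static bool isRegFreeBefore(const MachineBasicBlock &MBB,
                            const MachineInstr &Before, unsigned Reg,
                            const TargetRegisterInfo &TRI) {
  LiveRegUnits Units(TRI);
  Units.addLiveOuts(MBB); // live-outs plus pristines, per the code above
  for (const MachineInstr &MI : make_range(MBB.rbegin(), MBB.rend())) {
    Units.stepBackward(MI); // Units now describes liveness just before MI
    if (&MI == &Before)
      break;
  }
  return Units.available(Reg); // assumed query from LiveRegUnits.h
}
```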
diff --git a/lib/CodeGen/LiveVariables.cpp b/lib/CodeGen/LiveVariables.cpp
index 269b990a3149..3568b0294ad9 100644
--- a/lib/CodeGen/LiveVariables.cpp
+++ b/lib/CodeGen/LiveVariables.cpp
@@ -64,8 +64,8 @@ LiveVariables::VarInfo::findKill(const MachineBasicBlock *MBB) const {
return nullptr;
}
-LLVM_DUMP_METHOD void LiveVariables::VarInfo::dump() const {
#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
+LLVM_DUMP_METHOD void LiveVariables::VarInfo::dump() const {
dbgs() << " Alive in blocks: ";
for (SparseBitVector<>::iterator I = AliveBlocks.begin(),
E = AliveBlocks.end(); I != E; ++I)
@@ -78,8 +78,8 @@ LLVM_DUMP_METHOD void LiveVariables::VarInfo::dump() const {
dbgs() << "\n #" << i << ": " << *Kills[i];
dbgs() << "\n";
}
-#endif
}
+#endif
/// getVarInfo - Get (possibly creating) a VarInfo object for the given vreg.
LiveVariables::VarInfo &LiveVariables::getVarInfo(unsigned RegIdx) {
diff --git a/lib/CodeGen/LowLevelType.cpp b/lib/CodeGen/LowLevelType.cpp
index d74b7306e0f4..c4b9068fa905 100644
--- a/lib/CodeGen/LowLevelType.cpp
+++ b/lib/CodeGen/LowLevelType.cpp
@@ -1,4 +1,4 @@
-//===-- llvm/CodeGen/GlobalISel/LowLevelType.cpp --------------------------===//
+//===-- llvm/CodeGen/LowLevelType.cpp -------------------------------------===//
//
// The LLVM Compiler Infrastructure
//
@@ -18,54 +18,21 @@
#include "llvm/Support/raw_ostream.h"
using namespace llvm;
-LLT::LLT(Type &Ty, const DataLayout &DL) {
+LLT llvm::getLLTForType(Type &Ty, const DataLayout &DL) {
if (auto VTy = dyn_cast<VectorType>(&Ty)) {
- SizeInBits = VTy->getElementType()->getPrimitiveSizeInBits();
- ElementsOrAddrSpace = VTy->getNumElements();
- Kind = ElementsOrAddrSpace == 1 ? Scalar : Vector;
+ auto NumElements = VTy->getNumElements();
+ auto ScalarSizeInBits = VTy->getElementType()->getPrimitiveSizeInBits();
+ if (NumElements == 1)
+ return LLT::scalar(ScalarSizeInBits);
+ return LLT::vector(NumElements, ScalarSizeInBits);
} else if (auto PTy = dyn_cast<PointerType>(&Ty)) {
- Kind = Pointer;
- SizeInBits = DL.getTypeSizeInBits(&Ty);
- ElementsOrAddrSpace = PTy->getAddressSpace();
+ return LLT::pointer(PTy->getAddressSpace(), DL.getTypeSizeInBits(&Ty));
} else if (Ty.isSized()) {
// Aggregates are no different from real scalars as far as GlobalISel is
// concerned.
- Kind = Scalar;
- SizeInBits = DL.getTypeSizeInBits(&Ty);
- ElementsOrAddrSpace = 1;
+ auto SizeInBits = DL.getTypeSizeInBits(&Ty);
assert(SizeInBits != 0 && "invalid zero-sized type");
- } else {
- Kind = Invalid;
- SizeInBits = ElementsOrAddrSpace = 0;
+ return LLT::scalar(SizeInBits);
}
-}
-
-LLT::LLT(MVT VT) {
- if (VT.isVector()) {
- SizeInBits = VT.getVectorElementType().getSizeInBits();
- ElementsOrAddrSpace = VT.getVectorNumElements();
- Kind = ElementsOrAddrSpace == 1 ? Scalar : Vector;
- } else if (VT.isValid()) {
- // Aggregates are no different from real scalars as far as GlobalISel is
- // concerned.
- Kind = Scalar;
- SizeInBits = VT.getSizeInBits();
- ElementsOrAddrSpace = 1;
- assert(SizeInBits != 0 && "invalid zero-sized type");
- } else {
- Kind = Invalid;
- SizeInBits = ElementsOrAddrSpace = 0;
- }
-}
-
-void LLT::print(raw_ostream &OS) const {
- if (isVector())
- OS << "<" << ElementsOrAddrSpace << " x s" << SizeInBits << ">";
- else if (isPointer())
- OS << "p" << getAddressSpace();
- else if (isValid()) {
- assert(isScalar() && "unexpected type");
- OS << "s" << getScalarSizeInBits();
- } else
- llvm_unreachable("trying to print an invalid type");
+ return LLT();
}
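For illustration (not part of the patch), the new free function maps IR types to low-level types as follows; the surrounding LLVMContext Ctx and DataLayout DL are assumed, on a target where DL reports 64-bit pointers:

    LLT S64 = getLLTForType(*Type::getInt64Ty(Ctx), DL);      // printed "s64"
    LLT V4S32 = getLLTForType(
        *VectorType::get(Type::getInt32Ty(Ctx), 4), DL);      // "<4 x s32>"
    LLT P0 = getLLTForType(*Type::getInt8PtrTy(Ctx), DL);     // "p0"
    LLT S16 = getLLTForType(
        *VectorType::get(Type::getInt16Ty(Ctx), 1), DL);      // <1 x s16> -> "s16"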
diff --git a/lib/CodeGen/MIRParser/MIParser.cpp b/lib/CodeGen/MIRParser/MIParser.cpp
index c8bed0890dd6..cac22af32956 100644
--- a/lib/CodeGen/MIRParser/MIParser.cpp
+++ b/lib/CodeGen/MIRParser/MIParser.cpp
@@ -41,8 +41,11 @@
using namespace llvm;
PerFunctionMIParsingState::PerFunctionMIParsingState(MachineFunction &MF,
- SourceMgr &SM, const SlotMapping &IRSlots)
- : MF(MF), SM(&SM), IRSlots(IRSlots) {
+ SourceMgr &SM, const SlotMapping &IRSlots,
+ const Name2RegClassMap &Names2RegClasses,
+ const Name2RegBankMap &Names2RegBanks)
+ : MF(MF), SM(&SM), IRSlots(IRSlots), Names2RegClasses(Names2RegClasses),
+ Names2RegBanks(Names2RegBanks) {
}
VRegInfo &PerFunctionMIParsingState::getVRegInfo(unsigned Num) {
@@ -139,6 +142,7 @@ public:
bool parseVirtualRegister(VRegInfo *&Info);
bool parseRegister(unsigned &Reg, VRegInfo *&VRegInfo);
bool parseRegisterFlag(unsigned &Flags);
+ bool parseRegisterClassOrBank(VRegInfo &RegInfo);
bool parseSubRegisterIndex(unsigned &SubReg);
bool parseRegisterTiedDefIndex(unsigned &TiedDefIdx);
bool parseRegisterOperand(MachineOperand &Dest,
@@ -172,6 +176,7 @@ public:
bool parseIntrinsicOperand(MachineOperand &Dest);
bool parsePredicateOperand(MachineOperand &Dest);
bool parseTargetIndexOperand(MachineOperand &Dest);
+ bool parseCustomRegisterMaskOperand(MachineOperand &Dest);
bool parseLiveoutRegisterMaskOperand(MachineOperand &Dest);
bool parseMachineOperand(MachineOperand &Dest,
Optional<unsigned> &TiedDefIdx);
@@ -184,6 +189,7 @@ public:
bool parseMemoryOperandFlag(MachineMemOperand::Flags &Flags);
bool parseMemoryPseudoSourceValue(const PseudoSourceValue *&PSV);
bool parseMachinePointerInfo(MachinePointerInfo &Dest);
+ bool parseOptionalAtomicOrdering(AtomicOrdering &Order);
bool parseMachineMemoryOperand(MachineMemOperand *&Dest);
private:
@@ -878,6 +884,66 @@ bool MIParser::parseRegister(unsigned &Reg, VRegInfo *&Info) {
}
}
+bool MIParser::parseRegisterClassOrBank(VRegInfo &RegInfo) {
+ if (Token.isNot(MIToken::Identifier) && Token.isNot(MIToken::underscore))
+ return error("expected '_', register class, or register bank name");
+ StringRef::iterator Loc = Token.location();
+ StringRef Name = Token.stringValue();
+
+ // Was it a register class?
+ auto RCNameI = PFS.Names2RegClasses.find(Name);
+ if (RCNameI != PFS.Names2RegClasses.end()) {
+ lex();
+ const TargetRegisterClass &RC = *RCNameI->getValue();
+
+ switch (RegInfo.Kind) {
+ case VRegInfo::UNKNOWN:
+ case VRegInfo::NORMAL:
+ RegInfo.Kind = VRegInfo::NORMAL;
+ if (RegInfo.Explicit && RegInfo.D.RC != &RC) {
+ const TargetRegisterInfo &TRI = *MF.getSubtarget().getRegisterInfo();
+ return error(Loc, Twine("conflicting register classes, previously: ") +
+ Twine(TRI.getRegClassName(RegInfo.D.RC)));
+ }
+ RegInfo.D.RC = &RC;
+ RegInfo.Explicit = true;
+ return false;
+
+ case VRegInfo::GENERIC:
+ case VRegInfo::REGBANK:
+ return error(Loc, "register class specification on generic register");
+ }
+ llvm_unreachable("Unexpected register kind");
+ }
+
+ // Should be a register bank or a generic register.
+ const RegisterBank *RegBank = nullptr;
+ if (Name != "_") {
+ auto RBNameI = PFS.Names2RegBanks.find(Name);
+ if (RBNameI == PFS.Names2RegBanks.end())
+ return error(Loc, "expected '_', register class, or register bank name");
+ RegBank = RBNameI->getValue();
+ }
+
+ lex();
+
+ switch (RegInfo.Kind) {
+ case VRegInfo::UNKNOWN:
+ case VRegInfo::GENERIC:
+ case VRegInfo::REGBANK:
+ RegInfo.Kind = RegBank ? VRegInfo::REGBANK : VRegInfo::GENERIC;
+ if (RegInfo.Explicit && RegInfo.D.RegBank != RegBank)
+ return error(Loc, "conflicting generic register banks");
+ RegInfo.D.RegBank = RegBank;
+ RegInfo.Explicit = true;
+ return false;
+
+ case VRegInfo::NORMAL:
+ return error(Loc, "register bank specification on normal register");
+ }
+ llvm_unreachable("Unexpected register kind");
+}
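Schematically, the syntax this accepts after a virtual register's colon looks like the lines below (register, class, and bank names are illustrative; banks such as gpr exist only on GlobalISel targets):

    %0:gr32 = COPY %edi        ; register class on a normal virtual register
    %1:gpr(s32) = COPY %w0     ; register bank on a generic virtual register
    %2:_(s32) = G_ADD %1, %1   ; '_' leaves the generic register bank-less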
+
bool MIParser::parseRegisterFlag(unsigned &Flags) {
const unsigned OldFlags = Flags;
switch (Token.kind()) {
@@ -1004,6 +1070,13 @@ bool MIParser::parseRegisterOperand(MachineOperand &Dest,
if (!TargetRegisterInfo::isVirtualRegister(Reg))
return error("subregister index expects a virtual register");
}
+ if (Token.is(MIToken::colon)) {
+ if (!TargetRegisterInfo::isVirtualRegister(Reg))
+ return error("register class specification expects a virtual register");
+ lex();
+ if (parseRegisterClassOrBank(*RegInfo))
+ return true;
+ }
MachineRegisterInfo &MRI = MF.getRegInfo();
if ((Flags & RegState::Define) == 0) {
if (consumeIfPresent(MIToken::lparen)) {
@@ -1598,6 +1671,35 @@ bool MIParser::parseTargetIndexOperand(MachineOperand &Dest) {
return false;
}
+bool MIParser::parseCustomRegisterMaskOperand(MachineOperand &Dest) {
+ assert(Token.stringValue() == "CustomRegMask" && "Expected a custom RegMask");
+ const TargetRegisterInfo *TRI = MF.getSubtarget().getRegisterInfo();
+ assert(TRI && "Expected target register info");
+ lex();
+ if (expectAndConsume(MIToken::lparen))
+ return true;
+
+ uint32_t *Mask = MF.allocateRegisterMask(TRI->getNumRegs());
+ while (true) {
+ if (Token.isNot(MIToken::NamedRegister))
+ return error("expected a named register");
+ unsigned Reg;
+ if (parseNamedRegister(Reg))
+ return true;
+ lex();
+ Mask[Reg / 32] |= 1U << (Reg % 32);
+ // TODO: Report an error if the same register is used more than once.
+ if (Token.isNot(MIToken::comma))
+ break;
+ lex();
+ }
+
+ if (expectAndConsume(MIToken::rparen))
+ return true;
+ Dest = MachineOperand::CreateRegMask(Mask);
+ return false;
+}
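A worked example of the word/bit indexing used above (illustration only):

    // Each uint32_t word of a register mask covers 32 registers, so register
    // number 37 lands in word 37 / 32 == 1 at bit 37 % 32 == 5.
    uint32_t Mask[2] = {0, 0};
    unsigned Reg = 37;
    Mask[Reg / 32] |= 1U << (Reg % 32);                   // Mask[1] == 0x20
    bool Preserved = Mask[Reg / 32] & (1U << (Reg % 32)); // true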
+
bool MIParser::parseLiveoutRegisterMaskOperand(MachineOperand &Dest) {
assert(Token.is(MIToken::kw_liveout));
const auto *TRI = MF.getSubtarget().getRegisterInfo();
@@ -1695,8 +1797,8 @@ bool MIParser::parseMachineOperand(MachineOperand &Dest,
Dest = MachineOperand::CreateRegMask(RegMask);
lex();
break;
- }
- LLVM_FALLTHROUGH;
+ } else
+ return parseCustomRegisterMaskOperand(Dest);
default:
// FIXME: Parse the MCSymbol machine operand.
return error("expected a machine operand");
@@ -1969,6 +2071,28 @@ bool MIParser::parseMachinePointerInfo(MachinePointerInfo &Dest) {
return false;
}
+bool MIParser::parseOptionalAtomicOrdering(AtomicOrdering &Order) {
+ Order = AtomicOrdering::NotAtomic;
+ if (Token.isNot(MIToken::Identifier))
+ return false;
+
+ Order = StringSwitch<AtomicOrdering>(Token.stringValue())
+ .Case("unordered", AtomicOrdering::Unordered)
+ .Case("monotonic", AtomicOrdering::Monotonic)
+ .Case("acquire", AtomicOrdering::Acquire)
+ .Case("release", AtomicOrdering::Release)
+ .Case("acq_rel", AtomicOrdering::AcquireRelease)
+ .Case("seq_cst", AtomicOrdering::SequentiallyConsistent)
+ .Default(AtomicOrdering::NotAtomic);
+
+ if (Order != AtomicOrdering::NotAtomic) {
+ lex();
+ return false;
+ }
+
+ return error("expected an atomic scope, ordering or a size integer literal");
+}
+
bool MIParser::parseMachineMemoryOperand(MachineMemOperand *&Dest) {
if (expectAndConsume(MIToken::lparen))
return true;
@@ -1986,6 +2110,21 @@ bool MIParser::parseMachineMemoryOperand(MachineMemOperand *&Dest) {
Flags |= MachineMemOperand::MOStore;
lex();
+ // Optional "singlethread" scope.
+ SynchronizationScope Scope = SynchronizationScope::CrossThread;
+ if (Token.is(MIToken::Identifier) && Token.stringValue() == "singlethread") {
+ Scope = SynchronizationScope::SingleThread;
+ lex();
+ }
+
+ // Up to two atomic orderings (cmpxchg provides guarantees on failure).
+ AtomicOrdering Order, FailureOrder;
+ if (parseOptionalAtomicOrdering(Order))
+ return true;
+
+ if (parseOptionalAtomicOrdering(FailureOrder))
+ return true;
+
if (Token.isNot(MIToken::IntegerLiteral))
return error("expected the size integer literal after memory operation");
uint64_t Size;
@@ -2040,8 +2179,8 @@ bool MIParser::parseMachineMemoryOperand(MachineMemOperand *&Dest) {
}
if (expectAndConsume(MIToken::rparen))
return true;
- Dest =
- MF.getMachineMemOperand(Ptr, Flags, Size, BaseAlignment, AAInfo, Range);
+ Dest = MF.getMachineMemOperand(Ptr, Flags, Size, BaseAlignment, AAInfo, Range,
+ Scope, Order, FailureOrder);
return false;
}
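Putting the grammar above together, a machine memory operand now admits forms like these (schematic; the trailing pointer text is abbreviated):

    (load 4 from %ir.p)                       ; non-atomic, as before
    (load singlethread acquire 4 from %ir.p)  ; scoped, acquire-ordered load
    (store release 4 into %ir.q)              ; atomic store
    (load store seq_cst monotonic 8 ...)      ; two orderings: success, failure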
diff --git a/lib/CodeGen/MIRParser/MIParser.h b/lib/CodeGen/MIRParser/MIParser.h
index 93a4d84ba62f..9b3879cf8377 100644
--- a/lib/CodeGen/MIRParser/MIParser.h
+++ b/lib/CodeGen/MIRParser/MIParser.h
@@ -45,11 +45,16 @@ struct VRegInfo {
unsigned PreferredReg = 0;
};
+typedef StringMap<const TargetRegisterClass*> Name2RegClassMap;
+typedef StringMap<const RegisterBank*> Name2RegBankMap;
+
struct PerFunctionMIParsingState {
BumpPtrAllocator Allocator;
MachineFunction &MF;
SourceMgr *SM;
const SlotMapping &IRSlots;
+ const Name2RegClassMap &Names2RegClasses;
+ const Name2RegBankMap &Names2RegBanks;
DenseMap<unsigned, MachineBasicBlock *> MBBSlots;
DenseMap<unsigned, VRegInfo*> VRegInfos;
@@ -59,7 +64,9 @@ struct PerFunctionMIParsingState {
DenseMap<unsigned, unsigned> JumpTableSlots;
PerFunctionMIParsingState(MachineFunction &MF, SourceMgr &SM,
- const SlotMapping &IRSlots);
+ const SlotMapping &IRSlots,
+ const Name2RegClassMap &Names2RegClasses,
+ const Name2RegBankMap &Names2RegBanks);
VRegInfo &getVRegInfo(unsigned VReg);
};
diff --git a/lib/CodeGen/MIRParser/MIRParser.cpp b/lib/CodeGen/MIRParser/MIRParser.cpp
index 3dff1147631b..a2773cccc5db 100644
--- a/lib/CodeGen/MIRParser/MIRParser.cpp
+++ b/lib/CodeGen/MIRParser/MIRParser.cpp
@@ -55,9 +55,9 @@ class MIRParserImpl {
StringMap<std::unique_ptr<yaml::MachineFunction>> Functions;
SlotMapping IRSlots;
/// Maps from register class names to register classes.
- StringMap<const TargetRegisterClass *> Names2RegClasses;
+ Name2RegClassMap Names2RegClasses;
/// Maps from register bank names to register banks.
- StringMap<const RegisterBank *> Names2RegBanks;
+ Name2RegBankMap Names2RegBanks;
public:
MIRParserImpl(std::unique_ptr<MemoryBuffer> Contents, StringRef Filename,
@@ -325,11 +325,15 @@ bool MIRParserImpl::initializeMachineFunction(MachineFunction &MF) {
return error(Twine("no machine function information for function '") +
MF.getName() + "' in the MIR file");
// TODO: Recreate the machine function.
+ initNames2RegClasses(MF);
+ initNames2RegBanks(MF);
const yaml::MachineFunction &YamlMF = *It->getValue();
if (YamlMF.Alignment)
MF.setAlignment(YamlMF.Alignment);
MF.setExposesReturnsTwice(YamlMF.ExposesReturnsTwice);
+ if (YamlMF.NoVRegs)
+ MF.getProperties().set(MachineFunctionProperties::Property::NoVRegs);
if (YamlMF.Legalized)
MF.getProperties().set(MachineFunctionProperties::Property::Legalized);
if (YamlMF.RegBankSelected)
@@ -338,7 +342,8 @@ bool MIRParserImpl::initializeMachineFunction(MachineFunction &MF) {
if (YamlMF.Selected)
MF.getProperties().set(MachineFunctionProperties::Property::Selected);
- PerFunctionMIParsingState PFS(MF, SM, IRSlots);
+ PerFunctionMIParsingState PFS(MF, SM, IRSlots, Names2RegClasses,
+ Names2RegBanks);
if (parseRegisterInfo(PFS, YamlMF))
return true;
if (!YamlMF.Constants.empty()) {
@@ -362,9 +367,6 @@ bool MIRParserImpl::initializeMachineFunction(MachineFunction &MF) {
}
PFS.SM = &SM;
- if (MF.empty())
- return error(Twine("machine function '") + Twine(MF.getName()) +
- "' requires at least one machine basic block in its body");
// Initialize the frame information after creating all the MBBs so that the
// MBB references in the frame information can be resolved.
if (initializeFrameInfo(PFS, YamlMF))
@@ -462,17 +464,19 @@ bool MIRParserImpl::parseRegisterInfo(PerFunctionMIParsingState &PFS,
RegInfo.addLiveIn(Reg, VReg);
}
- // Parse the callee saved register mask.
- BitVector CalleeSavedRegisterMask(RegInfo.getUsedPhysRegsMask().size());
- if (!YamlMF.CalleeSavedRegisters)
- return false;
- for (const auto &RegSource : YamlMF.CalleeSavedRegisters.getValue()) {
- unsigned Reg = 0;
- if (parseNamedRegisterReference(PFS, Reg, RegSource.Value, Error))
- return error(Error, RegSource.SourceRange);
- CalleeSavedRegisterMask[Reg] = true;
+  // Parse the callee saved registers (registers that the callee
+  // must preserve for its caller).
+ if (YamlMF.CalleeSavedRegisters) {
+ SmallVector<MCPhysReg, 16> CalleeSavedRegisters;
+ for (const auto &RegSource : YamlMF.CalleeSavedRegisters.getValue()) {
+ unsigned Reg = 0;
+ if (parseNamedRegisterReference(PFS, Reg, RegSource.Value, Error))
+ return error(Error, RegSource.SourceRange);
+ CalleeSavedRegisters.push_back(Reg);
+ }
+ RegInfo.setCalleeSavedRegs(CalleeSavedRegisters);
}
- RegInfo.setUsedPhysRegMask(CalleeSavedRegisterMask.flip());
+
return false;
}
@@ -505,14 +509,12 @@ bool MIRParserImpl::setupRegisterInfo(const PerFunctionMIParsingState &PFS,
}
// Compute MachineRegisterInfo::UsedPhysRegMask
- if (!YamlMF.CalleeSavedRegisters) {
- for (const MachineBasicBlock &MBB : MF) {
- for (const MachineInstr &MI : MBB) {
- for (const MachineOperand &MO : MI.operands()) {
- if (!MO.isRegMask())
- continue;
- MRI.addPhysRegsUsedFromRegMask(MO.getRegMask());
- }
+ for (const MachineBasicBlock &MBB : MF) {
+ for (const MachineInstr &MI : MBB) {
+ for (const MachineOperand &MO : MI.operands()) {
+ if (!MO.isRegMask())
+ continue;
+ MRI.addPhysRegsUsedFromRegMask(MO.getRegMask());
}
}
}
@@ -818,7 +820,6 @@ void MIRParserImpl::initNames2RegBanks(const MachineFunction &MF) {
const TargetRegisterClass *MIRParserImpl::getRegClass(const MachineFunction &MF,
StringRef Name) {
- initNames2RegClasses(MF);
auto RegClassInfo = Names2RegClasses.find(Name);
if (RegClassInfo == Names2RegClasses.end())
return nullptr;
@@ -827,7 +828,6 @@ const TargetRegisterClass *MIRParserImpl::getRegClass(const MachineFunction &MF,
const RegisterBank *MIRParserImpl::getRegBank(const MachineFunction &MF,
StringRef Name) {
- initNames2RegBanks(MF);
auto RegBankInfo = Names2RegBanks.find(Name);
if (RegBankInfo == Names2RegBanks.end())
return nullptr;
diff --git a/lib/CodeGen/MIRPrinter.cpp b/lib/CodeGen/MIRPrinter.cpp
index db87092177ca..6da174a53666 100644
--- a/lib/CodeGen/MIRPrinter.cpp
+++ b/lib/CodeGen/MIRPrinter.cpp
@@ -175,6 +175,8 @@ void MIRPrinter::print(const MachineFunction &MF) {
YamlMF.Alignment = MF.getAlignment();
YamlMF.ExposesReturnsTwice = MF.exposesReturnsTwice();
+ YamlMF.NoVRegs = MF.getProperties().hasProperty(
+ MachineFunctionProperties::Property::NoVRegs);
YamlMF.Legalized = MF.getProperties().hasProperty(
MachineFunctionProperties::Property::Legalized);
YamlMF.RegBankSelected = MF.getProperties().hasProperty(
@@ -205,6 +207,25 @@ void MIRPrinter::print(const MachineFunction &MF) {
Out << YamlMF;
}
+static void printCustomRegMask(const uint32_t *RegMask, raw_ostream &OS,
+ const TargetRegisterInfo *TRI) {
+ assert(RegMask && "Can't print an empty register mask");
+ OS << StringRef("CustomRegMask(");
+
+ bool IsRegInRegMaskFound = false;
+ for (int I = 0, E = TRI->getNumRegs(); I < E; I++) {
+ // Check whether the register is asserted in regmask.
+ if (RegMask[I / 32] & (1u << (I % 32))) {
+ if (IsRegInRegMaskFound)
+ OS << ',';
+ printReg(I, OS, TRI);
+ IsRegInRegMaskFound = true;
+ }
+ }
+
+ OS << ')';
+}
+
void MIRPrinter::convert(yaml::MachineFunction &MF,
const MachineRegisterInfo &RegInfo,
const TargetRegisterInfo *TRI) {
@@ -239,20 +260,18 @@ void MIRPrinter::convert(yaml::MachineFunction &MF,
printReg(I->second, LiveIn.VirtualRegister, TRI);
MF.LiveIns.push_back(LiveIn);
}
- // The used physical register mask is printed as an inverted callee saved
- // register mask.
- const BitVector &UsedPhysRegMask = RegInfo.getUsedPhysRegsMask();
- if (UsedPhysRegMask.none())
- return;
- std::vector<yaml::FlowStringValue> CalleeSavedRegisters;
- for (unsigned I = 0, E = UsedPhysRegMask.size(); I != E; ++I) {
- if (!UsedPhysRegMask[I]) {
+
+  // Print the callee saved registers.
+ if (RegInfo.isUpdatedCSRsInitialized()) {
+ const MCPhysReg *CalleeSavedRegs = RegInfo.getCalleeSavedRegs();
+ std::vector<yaml::FlowStringValue> CalleeSavedRegisters;
+ for (const MCPhysReg *I = CalleeSavedRegs; *I; ++I) {
yaml::FlowStringValue Reg;
- printReg(I, Reg, TRI);
+ printReg(*I, Reg, TRI);
CalleeSavedRegisters.push_back(Reg);
}
+ MF.CalleeSavedRegisters = CalleeSavedRegisters;
}
- MF.CalleeSavedRegisters = CalleeSavedRegisters;
}
void MIRPrinter::convert(ModuleSlotTracker &MST,
@@ -860,7 +879,7 @@ void MIPrinter::print(const MachineOperand &Op, const TargetRegisterInfo *TRI,
if (RegMaskInfo != RegisterMaskIds.end())
OS << StringRef(TRI->getRegMaskNames()[RegMaskInfo->second]).lower();
else
- llvm_unreachable("Can't print this machine register mask yet.");
+ printCustomRegMask(Op.getRegMask(), OS, TRI);
break;
}
case MachineOperand::MO_RegisterLiveOut: {
@@ -906,6 +925,9 @@ void MIPrinter::print(const MachineOperand &Op, const TargetRegisterInfo *TRI,
<< CmpInst::getPredicateName(Pred) << ')';
break;
}
+ case MachineOperand::MO_Placeholder:
+ OS << "<placeholder>";
+ break;
}
}
@@ -926,6 +948,15 @@ void MIPrinter::print(const MachineMemOperand &Op) {
assert(Op.isStore() && "Non load machine operand must be a store");
OS << "store ";
}
+
+ if (Op.getSynchScope() == SynchronizationScope::SingleThread)
+ OS << "singlethread ";
+
+ if (Op.getOrdering() != AtomicOrdering::NotAtomic)
+ OS << toIRString(Op.getOrdering()) << ' ';
+ if (Op.getFailureOrdering() != AtomicOrdering::NotAtomic)
+ OS << toIRString(Op.getFailureOrdering()) << ' ';
+
OS << Op.getSize();
if (const Value *Val = Op.getValue()) {
OS << (Op.isLoad() ? " from " : " into ");
diff --git a/lib/CodeGen/MachineBasicBlock.cpp b/lib/CodeGen/MachineBasicBlock.cpp
index 3869f976854d..06112723497b 100644
--- a/lib/CodeGen/MachineBasicBlock.cpp
+++ b/lib/CodeGen/MachineBasicBlock.cpp
@@ -23,6 +23,7 @@
#include "llvm/CodeGen/SlotIndexes.h"
#include "llvm/IR/BasicBlock.h"
#include "llvm/IR/DataLayout.h"
+#include "llvm/IR/DebugInfoMetadata.h"
#include "llvm/IR/ModuleSlotTracker.h"
#include "llvm/MC/MCAsmInfo.h"
#include "llvm/MC/MCContext.h"
@@ -148,8 +149,11 @@ MachineBasicBlock::iterator MachineBasicBlock::getFirstNonPHI() {
MachineBasicBlock::iterator
MachineBasicBlock::SkipPHIsAndLabels(MachineBasicBlock::iterator I) {
+ const TargetInstrInfo *TII = getParent()->getSubtarget().getInstrInfo();
+
iterator E = end();
- while (I != E && (I->isPHI() || I->isPosition()))
+ while (I != E && (I->isPHI() || I->isPosition() ||
+ TII->isBasicBlockPrologue(*I)))
++I;
// FIXME: This needs to change if we wish to bundle labels
// inside the bundle.
@@ -160,8 +164,11 @@ MachineBasicBlock::SkipPHIsAndLabels(MachineBasicBlock::iterator I) {
MachineBasicBlock::iterator
MachineBasicBlock::SkipPHIsLabelsAndDebug(MachineBasicBlock::iterator I) {
+ const TargetInstrInfo *TII = getParent()->getSubtarget().getInstrInfo();
+
iterator E = end();
- while (I != E && (I->isPHI() || I->isPosition() || I->isDebugValue()))
+ while (I != E && (I->isPHI() || I->isPosition() || I->isDebugValue() ||
+ TII->isBasicBlockPrologue(*I)))
++I;
// FIXME: This needs to change if we wish to bundle labels / dbg_values
// inside the bundle.
@@ -225,7 +232,7 @@ StringRef MachineBasicBlock::getName() const {
if (const BasicBlock *LBB = getBasicBlock())
return LBB->getName();
else
- return "(null)";
+ return StringRef("", 0);
}
/// Return a hopefully unique identifier for this block.
@@ -417,7 +424,7 @@ void MachineBasicBlock::updateTerminator() {
MachineBasicBlock *TBB = nullptr, *FBB = nullptr;
SmallVector<MachineOperand, 4> Cond;
- DebugLoc DL; // FIXME: this is nowhere
+ DebugLoc DL = findBranchDebugLoc();
bool B = TII->analyzeBranch(*this, TBB, FBB, Cond);
(void) B;
assert(!B && "UpdateTerminators requires analyzable predecessors!");
@@ -485,7 +492,7 @@ void MachineBasicBlock::updateTerminator() {
// FIXME: This does not seem like a reasonable pattern to support, but it
// has been seen in the wild coming out of degenerate ARM test cases.
TII->removeBranch(*this);
-
+
// Finally update the unconditional successor to be reached via a branch if
// it would not be reached by fallthrough.
if (!isLayoutSuccessor(TBB))
@@ -681,16 +688,16 @@ bool MachineBasicBlock::isLayoutSuccessor(const MachineBasicBlock *MBB) const {
return std::next(I) == MachineFunction::const_iterator(MBB);
}
-bool MachineBasicBlock::canFallThrough() {
+MachineBasicBlock *MachineBasicBlock::getFallThrough() {
MachineFunction::iterator Fallthrough = getIterator();
++Fallthrough;
// If FallthroughBlock is off the end of the function, it can't fall through.
if (Fallthrough == getParent()->end())
- return false;
+ return nullptr;
// If FallthroughBlock isn't a successor, no fallthrough is possible.
if (!isSuccessor(&*Fallthrough))
- return false;
+ return nullptr;
// Analyze the branches, if any, at the end of the block.
MachineBasicBlock *TBB = nullptr, *FBB = nullptr;
@@ -702,25 +709,31 @@ bool MachineBasicBlock::canFallThrough() {
// is possible. The isPredicated check is needed because this code can be
// called during IfConversion, where an instruction which is normally a
// Barrier is predicated and thus no longer an actual control barrier.
- return empty() || !back().isBarrier() || TII->isPredicated(back());
+ return (empty() || !back().isBarrier() || TII->isPredicated(back()))
+ ? &*Fallthrough
+ : nullptr;
}
// If there is no branch, control always falls through.
- if (!TBB) return true;
+ if (!TBB) return &*Fallthrough;
// If there is some explicit branch to the fallthrough block, it can obviously
// reach, even though the branch should get folded to fall through implicitly.
if (MachineFunction::iterator(TBB) == Fallthrough ||
MachineFunction::iterator(FBB) == Fallthrough)
- return true;
+ return &*Fallthrough;
// If it's an unconditional branch to some block not the fall through, it
// doesn't fall through.
- if (Cond.empty()) return false;
+ if (Cond.empty()) return nullptr;
// Otherwise, if it is conditional and has no explicit false block, it falls
// through.
- return FBB == nullptr;
+ return (FBB == nullptr) ? &*Fallthrough : nullptr;
+}
+
+bool MachineBasicBlock::canFallThrough() {
+ return getFallThrough() != nullptr;
}
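As a usage sketch (hypothetical helper, not part of this patch), getFallThrough lets callers that previously re-derived the fallthrough block act on it directly; it combines naturally with findBranchDebugLoc added below:

    // Sketch only: make an implicit fallthrough explicit before reordering
    // blocks. The helper name and call site are illustrative.
    static void makeFallthroughExplicit(MachineBasicBlock &MBB,
                                        const TargetInstrInfo &TII) {
      if (MachineBasicBlock *FT = MBB.getFallThrough())
        TII.insertBranch(MBB, FT, /*FBB=*/nullptr, /*Cond=*/{},
                         MBB.findBranchDebugLoc());
    }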
MachineBasicBlock *MachineBasicBlock::SplitCriticalEdge(MachineBasicBlock *Succ,
@@ -1144,6 +1157,24 @@ MachineBasicBlock::findDebugLoc(instr_iterator MBBI) {
return {};
}
+/// Find and return the merged DebugLoc of the branch instructions of the block.
+/// Return UnknownLoc if there is none.
+DebugLoc
+MachineBasicBlock::findBranchDebugLoc() {
+ DebugLoc DL;
+ auto TI = getFirstTerminator();
+ while (TI != end() && !TI->isBranch())
+ ++TI;
+
+ if (TI != end()) {
+ DL = TI->getDebugLoc();
+ for (++TI ; TI != end() ; ++TI)
+ if (TI->isBranch())
+ DL = DILocation::getMergedLocation(DL, TI->getDebugLoc());
+ }
+ return DL;
+}
+
/// Return probability of the edge from this block to MBB.
BranchProbability
MachineBasicBlock::getSuccProbability(const_succ_iterator Succ) const {
diff --git a/lib/CodeGen/MachineBlockFrequencyInfo.cpp b/lib/CodeGen/MachineBlockFrequencyInfo.cpp
index 7d5124d30a04..9c7367b4c780 100644
--- a/lib/CodeGen/MachineBlockFrequencyInfo.cpp
+++ b/lib/CodeGen/MachineBlockFrequencyInfo.cpp
@@ -28,7 +28,6 @@ using namespace llvm;
#define DEBUG_TYPE "block-freq"
-#ifndef NDEBUG
static cl::opt<GVDAGType> ViewMachineBlockFreqPropagationDAG(
"view-machine-block-freq-propagation-dags", cl::Hidden,
@@ -43,10 +42,37 @@ static cl::opt<GVDAGType> ViewMachineBlockFreqPropagationDAG(
"integer fractional block frequency representation."),
clEnumValN(GVDT_Count, "count", "display a graph using the real "
"profile count if available.")));
+// Similar to the option above, but used to control BFI display only after the
+// MBP pass.
+cl::opt<GVDAGType> ViewBlockLayoutWithBFI(
+ "view-block-layout-with-bfi", cl::Hidden,
+ cl::desc(
+ "Pop up a window to show a dag displaying MBP layout and associated "
+ "block frequencies of the CFG."),
+ cl::values(clEnumValN(GVDT_None, "none", "do not display graphs."),
+ clEnumValN(GVDT_Fraction, "fraction",
+ "display a graph using the "
+ "fractional block frequency representation."),
+ clEnumValN(GVDT_Integer, "integer",
+ "display a graph using the raw "
+ "integer fractional block frequency representation."),
+ clEnumValN(GVDT_Count, "count",
+ "display a graph using the real "
+ "profile count if available.")));
+// Command line option to specify the name of the function for CFG dump
+// Defined in Analysis/BlockFrequencyInfo.cpp: -view-bfi-func-name=
extern cl::opt<std::string> ViewBlockFreqFuncName;
+// Command line option to specify hot frequency threshold.
+// Defined in Analysis/BlockFrequencyInfo.cpp: -view-hot-freq-perc=
extern cl::opt<unsigned> ViewHotFreqPercent;
+static GVDAGType getGVDT() {
+ if (ViewBlockLayoutWithBFI != GVDT_None)
+ return ViewBlockLayoutWithBFI;
+
+ return ViewMachineBlockFreqPropagationDAG;
+}
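Usage note: with this change, the post-layout view can be requested independently of the generic one, e.g. (illustrative invocation using the two flags named above):

    llc -view-block-layout-with-bfi=count -view-bfi-func-name=foo input.ll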
+
namespace llvm {
template <> struct GraphTraits<MachineBlockFrequencyInfo *> {
@@ -80,12 +106,32 @@ template <>
struct DOTGraphTraits<MachineBlockFrequencyInfo *>
: public MBFIDOTGraphTraitsBase {
explicit DOTGraphTraits(bool isSimple = false)
- : MBFIDOTGraphTraitsBase(isSimple) {}
+ : MBFIDOTGraphTraitsBase(isSimple), CurFunc(nullptr), LayoutOrderMap() {}
+
+ const MachineFunction *CurFunc;
+ DenseMap<const MachineBasicBlock *, int> LayoutOrderMap;
std::string getNodeLabel(const MachineBasicBlock *Node,
const MachineBlockFrequencyInfo *Graph) {
- return MBFIDOTGraphTraitsBase::getNodeLabel(
- Node, Graph, ViewMachineBlockFreqPropagationDAG);
+
+ int layout_order = -1;
+ // Attach additional ordering information if 'isSimple' is false.
+ if (!isSimple()) {
+ const MachineFunction *F = Node->getParent();
+ if (!CurFunc || F != CurFunc) {
+ if (CurFunc)
+ LayoutOrderMap.clear();
+
+ CurFunc = F;
+ int O = 0;
+ for (auto MBI = F->begin(); MBI != F->end(); ++MBI, ++O) {
+ LayoutOrderMap[&*MBI] = O;
+ }
+ }
+ layout_order = LayoutOrderMap[Node];
+ }
+ return MBFIDOTGraphTraitsBase::getNodeLabel(Node, Graph, getGVDT(),
+ layout_order);
}
std::string getNodeAttributes(const MachineBasicBlock *Node,
@@ -102,7 +148,6 @@ struct DOTGraphTraits<MachineBlockFrequencyInfo *>
};
} // end namespace llvm
-#endif
INITIALIZE_PASS_BEGIN(MachineBlockFrequencyInfo, "machine-block-freq",
"Machine Block Frequency Analysis", true, true)
@@ -127,20 +172,24 @@ void MachineBlockFrequencyInfo::getAnalysisUsage(AnalysisUsage &AU) const {
MachineFunctionPass::getAnalysisUsage(AU);
}
-bool MachineBlockFrequencyInfo::runOnMachineFunction(MachineFunction &F) {
- MachineBranchProbabilityInfo &MBPI =
- getAnalysis<MachineBranchProbabilityInfo>();
- MachineLoopInfo &MLI = getAnalysis<MachineLoopInfo>();
+void MachineBlockFrequencyInfo::calculate(
+ const MachineFunction &F, const MachineBranchProbabilityInfo &MBPI,
+ const MachineLoopInfo &MLI) {
if (!MBFI)
MBFI.reset(new ImplType);
MBFI->calculate(F, MBPI, MLI);
-#ifndef NDEBUG
if (ViewMachineBlockFreqPropagationDAG != GVDT_None &&
(ViewBlockFreqFuncName.empty() ||
F.getName().equals(ViewBlockFreqFuncName))) {
- view();
+ view("MachineBlockFrequencyDAGS." + F.getName());
}
-#endif
+}
+
+bool MachineBlockFrequencyInfo::runOnMachineFunction(MachineFunction &F) {
+ MachineBranchProbabilityInfo &MBPI =
+ getAnalysis<MachineBranchProbabilityInfo>();
+ MachineLoopInfo &MLI = getAnalysis<MachineLoopInfo>();
+ calculate(F, MBPI, MLI);
return false;
}
@@ -148,15 +197,9 @@ void MachineBlockFrequencyInfo::releaseMemory() { MBFI.reset(); }
/// Pop up a ghostview window with the current block frequency propagation
/// rendered using dot.
-void MachineBlockFrequencyInfo::view() const {
-// This code is only for debugging.
-#ifndef NDEBUG
- ViewGraph(const_cast<MachineBlockFrequencyInfo *>(this),
- "MachineBlockFrequencyDAGs");
-#else
- errs() << "MachineBlockFrequencyInfo::view is only available in debug builds "
- "on systems with Graphviz or gv!\n";
-#endif // NDEBUG
+void MachineBlockFrequencyInfo::view(const Twine &Name, bool isSimple) const {
+ // This code is only for debugging.
+ ViewGraph(const_cast<MachineBlockFrequencyInfo *>(this), Name, isSimple);
}
BlockFrequency
diff --git a/lib/CodeGen/MachineBlockPlacement.cpp b/lib/CodeGen/MachineBlockPlacement.cpp
index 40e3840e6b0b..4cfc128a8c1d 100644
--- a/lib/CodeGen/MachineBlockPlacement.cpp
+++ b/lib/CodeGen/MachineBlockPlacement.cpp
@@ -32,14 +32,15 @@
#include "llvm/ADT/SmallPtrSet.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/ADT/Statistic.h"
+#include "llvm/Analysis/BlockFrequencyInfoImpl.h"
#include "llvm/CodeGen/MachineBasicBlock.h"
#include "llvm/CodeGen/MachineBlockFrequencyInfo.h"
#include "llvm/CodeGen/MachineBranchProbabilityInfo.h"
-#include "llvm/CodeGen/MachineDominators.h"
#include "llvm/CodeGen/MachineFunction.h"
#include "llvm/CodeGen/MachineFunctionPass.h"
#include "llvm/CodeGen/MachineLoopInfo.h"
#include "llvm/CodeGen/MachineModuleInfo.h"
+#include "llvm/CodeGen/MachinePostDominators.h"
#include "llvm/CodeGen/TailDuplicator.h"
#include "llvm/Support/Allocator.h"
#include "llvm/Support/CommandLine.h"
@@ -49,6 +50,8 @@
#include "llvm/Target/TargetLowering.h"
#include "llvm/Target/TargetSubtargetInfo.h"
#include <algorithm>
+#include <functional>
+#include <utility>
using namespace llvm;
#define DEBUG_TYPE "block-placement"
@@ -82,19 +85,6 @@ static cl::opt<unsigned> ExitBlockBias(
// Definition:
// - Outlining: placement of a basic block outside the chain or hot path.
-static cl::opt<bool> OutlineOptionalBranches(
- "outline-optional-branches",
- cl::desc("Outlining optional branches will place blocks that are optional "
- "branches, i.e. branches with a common post dominator, outside "
- "the hot path or chain"),
- cl::init(false), cl::Hidden);
-
-static cl::opt<unsigned> OutlineOptionalThreshold(
- "outline-optional-threshold",
- cl::desc("Don't outline optional branches that are a single block with an "
- "instruction count below this threshold"),
- cl::init(4), cl::Hidden);
-
static cl::opt<unsigned> LoopToColdBlockRatio(
"loop-to-cold-block-ratio",
cl::desc("Outline loop blocks from loop chain if (frequency of loop) / "
@@ -136,20 +126,47 @@ BranchFoldPlacement("branch-fold-placement",
cl::init(true), cl::Hidden);
// Heuristic for tail duplication.
-static cl::opt<unsigned> TailDuplicatePlacementThreshold(
+static cl::opt<unsigned> TailDupPlacementThreshold(
"tail-dup-placement-threshold",
cl::desc("Instruction cutoff for tail duplication during layout. "
"Tail merging during layout is forced to have a threshold "
"that won't conflict."), cl::init(2),
cl::Hidden);
+// Cost-penalty heuristic for tail duplication.
+static cl::opt<unsigned> TailDupPlacementPenalty(
+ "tail-dup-placement-penalty",
+ cl::desc("Cost penalty for blocks that can avoid breaking CFG by copying. "
+ "Copying can increase fallthrough, but it also increases icache "
+ "pressure. This parameter controls the penalty to account for that. "
+ "Percent as integer."),
+ cl::init(2),
+ cl::Hidden);
+
+// Heuristic for triangle chains.
+static cl::opt<unsigned> TriangleChainCount(
+ "triangle-chain-count",
+ cl::desc("Number of triangle-shaped-CFG's that need to be in a row for the "
+ "triangle tail duplication heuristic to kick in. 0 to disable."),
+ cl::init(2),
+ cl::Hidden);
+
extern cl::opt<unsigned> StaticLikelyProb;
extern cl::opt<unsigned> ProfileLikelyProb;
+// Internal option used to control BFI display only after MBP pass.
+// Defined in CodeGen/MachineBlockFrequencyInfo.cpp:
+// -view-block-layout-with-bfi=
+extern cl::opt<GVDAGType> ViewBlockLayoutWithBFI;
+
+// Command line option to specify the name of the function for CFG dump
+// Defined in Analysis/BlockFrequencyInfo.cpp: -view-bfi-func-name=
+extern cl::opt<std::string> ViewBlockFreqFuncName;
+
namespace {
class BlockChain;
/// \brief Type for our function-wide basic block -> block chain mapping.
-typedef DenseMap<MachineBasicBlock *, BlockChain *> BlockToChainMapType;
+typedef DenseMap<const MachineBasicBlock *, BlockChain *> BlockToChainMapType;
}
namespace {
@@ -193,12 +210,15 @@ public:
/// \brief Iterator over blocks within the chain.
typedef SmallVectorImpl<MachineBasicBlock *>::iterator iterator;
+ typedef SmallVectorImpl<MachineBasicBlock *>::const_iterator const_iterator;
/// \brief Beginning of blocks within the chain.
iterator begin() { return Blocks.begin(); }
+ const_iterator begin() const { return Blocks.begin(); }
/// \brief End of blocks within the chain.
iterator end() { return Blocks.end(); }
+ const_iterator end() const { return Blocks.end(); }
bool remove(MachineBasicBlock* BB) {
for(iterator i = begin(); i != end(); ++i) {
@@ -264,12 +284,28 @@ public:
namespace {
class MachineBlockPlacement : public MachineFunctionPass {
/// \brief A typedef for a block filter set.
- typedef SmallSetVector<MachineBasicBlock *, 16> BlockFilterSet;
+ typedef SmallSetVector<const MachineBasicBlock *, 16> BlockFilterSet;
+
+  /// Pair struct containing basic block and taildup profitability.
+ struct BlockAndTailDupResult {
+ MachineBasicBlock *BB;
+ bool ShouldTailDup;
+ };
+
+  /// Struct containing an edge's weight and its source and destination blocks.
+ struct WeightedEdge {
+ BlockFrequency Weight;
+ MachineBasicBlock *Src;
+ MachineBasicBlock *Dest;
+ };
/// \brief work lists of blocks that are ready to be laid out
SmallVector<MachineBasicBlock *, 16> BlockWorkList;
SmallVector<MachineBasicBlock *, 16> EHPadWorkList;
+ /// Edges that have already been computed as optimal.
+ DenseMap<const MachineBasicBlock *, BlockAndTailDupResult> ComputedEdges;
+
/// \brief Machine Function
MachineFunction *F;
@@ -294,7 +330,7 @@ class MachineBlockPlacement : public MachineFunctionPass {
const TargetLoweringBase *TLI;
/// \brief A handle to the post dominator tree.
- MachineDominatorTree *MDT;
+ MachinePostDominatorTree *MPDT;
/// \brief Duplicator used to duplicate tails during placement.
///
@@ -303,10 +339,6 @@ class MachineBlockPlacement : public MachineFunctionPass {
/// must be done inline.
TailDuplicator TailDup;
- /// \brief A set of blocks that are unavoidably execute, i.e. they dominate
- /// all terminators of the MachineFunction.
- SmallPtrSet<MachineBasicBlock *, 4> UnavoidableBlocks;
-
/// \brief Allocator and owner of BlockChain structures.
///
/// We build BlockChains lazily while processing the loop structure of
@@ -322,7 +354,7 @@ class MachineBlockPlacement : public MachineFunctionPass {
/// BlockChain it participates in, if any. We use it to, among other things,
/// allow implicitly defining edges between chains as the existing edges
/// between basic blocks.
- DenseMap<MachineBasicBlock *, BlockChain *> BlockToChain;
+ DenseMap<const MachineBasicBlock *, BlockChain *> BlockToChain;
#ifndef NDEBUG
/// The set of basic blocks that have terminators that cannot be fully
@@ -334,75 +366,107 @@ class MachineBlockPlacement : public MachineFunctionPass {
/// Decrease the UnscheduledPredecessors count for all blocks in chain, and
/// if the count goes to 0, add them to the appropriate work list.
- void markChainSuccessors(BlockChain &Chain, MachineBasicBlock *LoopHeaderBB,
- const BlockFilterSet *BlockFilter = nullptr);
+ void markChainSuccessors(
+ const BlockChain &Chain, const MachineBasicBlock *LoopHeaderBB,
+ const BlockFilterSet *BlockFilter = nullptr);
/// Decrease the UnscheduledPredecessors count for a single block, and
/// if the count goes to 0, add them to the appropriate work list.
void markBlockSuccessors(
- BlockChain &Chain, MachineBasicBlock *BB, MachineBasicBlock *LoopHeaderBB,
+ const BlockChain &Chain, const MachineBasicBlock *BB,
+ const MachineBasicBlock *LoopHeaderBB,
const BlockFilterSet *BlockFilter = nullptr);
-
BranchProbability
- collectViableSuccessors(MachineBasicBlock *BB, BlockChain &Chain,
- const BlockFilterSet *BlockFilter,
- SmallVector<MachineBasicBlock *, 4> &Successors);
- bool shouldPredBlockBeOutlined(MachineBasicBlock *BB, MachineBasicBlock *Succ,
- BlockChain &Chain,
- const BlockFilterSet *BlockFilter,
- BranchProbability SuccProb,
- BranchProbability HotProb);
+ collectViableSuccessors(
+ const MachineBasicBlock *BB, const BlockChain &Chain,
+ const BlockFilterSet *BlockFilter,
+ SmallVector<MachineBasicBlock *, 4> &Successors);
+ bool shouldPredBlockBeOutlined(
+ const MachineBasicBlock *BB, const MachineBasicBlock *Succ,
+ const BlockChain &Chain, const BlockFilterSet *BlockFilter,
+ BranchProbability SuccProb, BranchProbability HotProb);
bool repeatedlyTailDuplicateBlock(
MachineBasicBlock *BB, MachineBasicBlock *&LPred,
- MachineBasicBlock *LoopHeaderBB,
+ const MachineBasicBlock *LoopHeaderBB,
BlockChain &Chain, BlockFilterSet *BlockFilter,
MachineFunction::iterator &PrevUnplacedBlockIt);
- bool maybeTailDuplicateBlock(MachineBasicBlock *BB, MachineBasicBlock *LPred,
- const BlockChain &Chain,
- BlockFilterSet *BlockFilter,
- MachineFunction::iterator &PrevUnplacedBlockIt,
- bool &DuplicatedToPred);
- bool
- hasBetterLayoutPredecessor(MachineBasicBlock *BB, MachineBasicBlock *Succ,
- BlockChain &SuccChain, BranchProbability SuccProb,
- BranchProbability RealSuccProb, BlockChain &Chain,
- const BlockFilterSet *BlockFilter);
- MachineBasicBlock *selectBestSuccessor(MachineBasicBlock *BB,
- BlockChain &Chain,
- const BlockFilterSet *BlockFilter);
- MachineBasicBlock *
- selectBestCandidateBlock(BlockChain &Chain,
- SmallVectorImpl<MachineBasicBlock *> &WorkList);
- MachineBasicBlock *
- getFirstUnplacedBlock(const BlockChain &PlacedChain,
- MachineFunction::iterator &PrevUnplacedBlockIt,
- const BlockFilterSet *BlockFilter);
+ bool maybeTailDuplicateBlock(
+ MachineBasicBlock *BB, MachineBasicBlock *LPred,
+ BlockChain &Chain, BlockFilterSet *BlockFilter,
+ MachineFunction::iterator &PrevUnplacedBlockIt,
+ bool &DuplicatedToPred);
+ bool hasBetterLayoutPredecessor(
+ const MachineBasicBlock *BB, const MachineBasicBlock *Succ,
+ const BlockChain &SuccChain, BranchProbability SuccProb,
+ BranchProbability RealSuccProb, const BlockChain &Chain,
+ const BlockFilterSet *BlockFilter);
+ BlockAndTailDupResult selectBestSuccessor(
+ const MachineBasicBlock *BB, const BlockChain &Chain,
+ const BlockFilterSet *BlockFilter);
+ MachineBasicBlock *selectBestCandidateBlock(
+ const BlockChain &Chain, SmallVectorImpl<MachineBasicBlock *> &WorkList);
+ MachineBasicBlock *getFirstUnplacedBlock(
+ const BlockChain &PlacedChain,
+ MachineFunction::iterator &PrevUnplacedBlockIt,
+ const BlockFilterSet *BlockFilter);
/// \brief Add a basic block to the work list if it is appropriate.
///
/// If the optional parameter BlockFilter is provided, only MBB
/// present in the set will be added to the worklist. If nullptr
/// is provided, no filtering occurs.
- void fillWorkLists(MachineBasicBlock *MBB,
+ void fillWorkLists(const MachineBasicBlock *MBB,
SmallPtrSetImpl<BlockChain *> &UpdatedPreds,
const BlockFilterSet *BlockFilter);
- void buildChain(MachineBasicBlock *BB, BlockChain &Chain,
+ void buildChain(const MachineBasicBlock *BB, BlockChain &Chain,
BlockFilterSet *BlockFilter = nullptr);
- MachineBasicBlock *findBestLoopTop(MachineLoop &L,
- const BlockFilterSet &LoopBlockSet);
- MachineBasicBlock *findBestLoopExit(MachineLoop &L,
- const BlockFilterSet &LoopBlockSet);
- BlockFilterSet collectLoopBlockSet(MachineLoop &L);
- void buildLoopChains(MachineLoop &L);
- void rotateLoop(BlockChain &LoopChain, MachineBasicBlock *ExitingBB,
- const BlockFilterSet &LoopBlockSet);
- void rotateLoopWithProfile(BlockChain &LoopChain, MachineLoop &L,
- const BlockFilterSet &LoopBlockSet);
- void collectMustExecuteBBs();
+ MachineBasicBlock *findBestLoopTop(
+ const MachineLoop &L, const BlockFilterSet &LoopBlockSet);
+ MachineBasicBlock *findBestLoopExit(
+ const MachineLoop &L, const BlockFilterSet &LoopBlockSet);
+ BlockFilterSet collectLoopBlockSet(const MachineLoop &L);
+ void buildLoopChains(const MachineLoop &L);
+ void rotateLoop(
+ BlockChain &LoopChain, const MachineBasicBlock *ExitingBB,
+ const BlockFilterSet &LoopBlockSet);
+ void rotateLoopWithProfile(
+ BlockChain &LoopChain, const MachineLoop &L,
+ const BlockFilterSet &LoopBlockSet);
void buildCFGChains();
void optimizeBranches();
void alignBlocks();
+ /// Returns true if a block should be tail-duplicated to increase fallthrough
+ /// opportunities.
+ bool shouldTailDuplicate(MachineBasicBlock *BB);
+ /// Check the edge frequencies to see if tail duplication will increase
+ /// fallthroughs.
+ bool isProfitableToTailDup(
+ const MachineBasicBlock *BB, const MachineBasicBlock *Succ,
+ BranchProbability AdjustedSumProb,
+ const BlockChain &Chain, const BlockFilterSet *BlockFilter);
+ /// Check for a trellis layout.
+ bool isTrellis(const MachineBasicBlock *BB,
+ const SmallVectorImpl<MachineBasicBlock *> &ViableSuccs,
+ const BlockChain &Chain, const BlockFilterSet *BlockFilter);
+ /// Get the best successor given a trellis layout.
+ BlockAndTailDupResult getBestTrellisSuccessor(
+ const MachineBasicBlock *BB,
+ const SmallVectorImpl<MachineBasicBlock *> &ViableSuccs,
+ BranchProbability AdjustedSumProb, const BlockChain &Chain,
+ const BlockFilterSet *BlockFilter);
+ /// Get the best pair of non-conflicting edges.
+ static std::pair<WeightedEdge, WeightedEdge> getBestNonConflictingEdges(
+ const MachineBasicBlock *BB,
+ MutableArrayRef<SmallVector<WeightedEdge, 8>> Edges);
+ /// Returns true if a block can tail duplicate into all unplaced
+ /// predecessors. Filters based on loop.
+ bool canTailDuplicateUnplacedPreds(
+ const MachineBasicBlock *BB, MachineBasicBlock *Succ,
+ const BlockChain &Chain, const BlockFilterSet *BlockFilter);
+ /// Find chains of triangles to tail-duplicate where a global analysis works,
+ /// but a local analysis would not find them.
+ void precomputeTriangleChains();
public:
static char ID; // Pass identification, replacement for typeid
@@ -415,7 +479,8 @@ public:
void getAnalysisUsage(AnalysisUsage &AU) const override {
AU.addRequired<MachineBranchProbabilityInfo>();
AU.addRequired<MachineBlockFrequencyInfo>();
- AU.addRequired<MachineDominatorTree>();
+ if (TailDupPlacement)
+ AU.addRequired<MachinePostDominatorTree>();
AU.addRequired<MachineLoopInfo>();
AU.addRequired<TargetPassConfig>();
MachineFunctionPass::getAnalysisUsage(AU);
@@ -429,7 +494,7 @@ INITIALIZE_PASS_BEGIN(MachineBlockPlacement, "block-placement",
"Branch Probability Basic Block Placement", false, false)
INITIALIZE_PASS_DEPENDENCY(MachineBranchProbabilityInfo)
INITIALIZE_PASS_DEPENDENCY(MachineBlockFrequencyInfo)
-INITIALIZE_PASS_DEPENDENCY(MachineDominatorTree)
+INITIALIZE_PASS_DEPENDENCY(MachinePostDominatorTree)
INITIALIZE_PASS_DEPENDENCY(MachineLoopInfo)
INITIALIZE_PASS_END(MachineBlockPlacement, "block-placement",
"Branch Probability Basic Block Placement", false, false)
@@ -438,7 +503,7 @@ INITIALIZE_PASS_END(MachineBlockPlacement, "block-placement",
/// \brief Helper to print the name of a MBB.
///
/// Only used by debug logging.
-static std::string getBlockName(MachineBasicBlock *BB) {
+static std::string getBlockName(const MachineBasicBlock *BB) {
std::string Result;
raw_string_ostream OS(Result);
OS << "BB#" << BB->getNumber();
@@ -455,7 +520,7 @@ static std::string getBlockName(MachineBasicBlock *BB) {
/// having one fewer active predecessor. It also adds any successors of this
/// chain which reach the zero-predecessor state to the appropriate worklist.
void MachineBlockPlacement::markChainSuccessors(
- BlockChain &Chain, MachineBasicBlock *LoopHeaderBB,
+ const BlockChain &Chain, const MachineBasicBlock *LoopHeaderBB,
const BlockFilterSet *BlockFilter) {
// Walk all the blocks in this chain, marking their successors as having
// a predecessor placed.
@@ -471,8 +536,8 @@ void MachineBlockPlacement::markChainSuccessors(
/// and was duplicated into the chain end, we need to redo markBlockSuccessors
/// for just that block.
void MachineBlockPlacement::markBlockSuccessors(
- BlockChain &Chain, MachineBasicBlock *MBB, MachineBasicBlock *LoopHeaderBB,
- const BlockFilterSet *BlockFilter) {
+ const BlockChain &Chain, const MachineBasicBlock *MBB,
+ const MachineBasicBlock *LoopHeaderBB, const BlockFilterSet *BlockFilter) {
// Add any successors for which this is the only un-placed in-loop
// predecessor to the worklist as a viable candidate for CFG-neutral
// placement. No subsequent placement of this block will violate the CFG
@@ -504,7 +569,8 @@ void MachineBlockPlacement::markBlockSuccessors(
/// the total branch probability of edges from \p BB to those
/// blocks.
BranchProbability MachineBlockPlacement::collectViableSuccessors(
- MachineBasicBlock *BB, BlockChain &Chain, const BlockFilterSet *BlockFilter,
+ const MachineBasicBlock *BB, const BlockChain &Chain,
+ const BlockFilterSet *BlockFilter,
SmallVector<MachineBasicBlock *, 4> &Successors) {
// Adjust edge probabilities by excluding edges pointing to blocks that is
// either not in BlockFilter or is already in the current chain. Consider the
@@ -561,46 +627,573 @@ getAdjustedProbability(BranchProbability OrigProb,
return SuccProb;
}
-/// When the option OutlineOptionalBranches is on, this method
-/// checks if the fallthrough candidate block \p Succ (of block
-/// \p BB) also has other unscheduled predecessor blocks which
-/// are also successors of \p BB (forming triangular shape CFG).
-/// If none of such predecessors are small, it returns true.
-/// The caller can choose to select \p Succ as the layout successors
-/// so that \p Succ's predecessors (optional branches) can be
-/// outlined.
-/// FIXME: fold this with more general layout cost analysis.
-bool MachineBlockPlacement::shouldPredBlockBeOutlined(
- MachineBasicBlock *BB, MachineBasicBlock *Succ, BlockChain &Chain,
- const BlockFilterSet *BlockFilter, BranchProbability SuccProb,
- BranchProbability HotProb) {
- if (!OutlineOptionalBranches)
+/// Check if \p BB has exactly the successors in \p Successors.
+static bool
+hasSameSuccessors(MachineBasicBlock &BB,
+ SmallPtrSetImpl<const MachineBasicBlock *> &Successors) {
+ if (BB.succ_size() != Successors.size())
return false;
- // If we outline optional branches, look whether Succ is unavoidable, i.e.
- // dominates all terminators of the MachineFunction. If it does, other
- // successors must be optional. Don't do this for cold branches.
- if (SuccProb > HotProb.getCompl() && UnavoidableBlocks.count(Succ) > 0) {
- for (MachineBasicBlock *Pred : Succ->predecessors()) {
- // Check whether there is an unplaced optional branch.
- if (Pred == Succ || (BlockFilter && !BlockFilter->count(Pred)) ||
- BlockToChain[Pred] == &Chain)
+ // We don't want to count self-loops
+ if (Successors.count(&BB))
+ return false;
+ for (MachineBasicBlock *Succ : BB.successors())
+ if (!Successors.count(Succ))
+ return false;
+ return true;
+}
+
+/// Check if a block should be tail duplicated to increase fallthrough
+/// opportunities.
+/// \p BB Block to check.
+bool MachineBlockPlacement::shouldTailDuplicate(MachineBasicBlock *BB) {
+ // Blocks with single successors don't create additional fallthrough
+ // opportunities. Don't duplicate them. TODO: When conditional exits are
+ // analyzable, allow them to be duplicated.
+ bool IsSimple = TailDup.isSimpleBB(BB);
+
+ if (BB->succ_size() == 1)
+ return false;
+ return TailDup.shouldTailDuplicate(IsSimple, *BB);
+}
+
+/// Compare two BlockFrequencies with a small penalty for \p A.
+/// In order to be conservative, we apply an X% penalty to account for
+/// increased icache pressure and static heuristics. For small frequencies
+/// we use only the numerators to improve accuracy. For simplicity, we assume
+/// the penalty is less than 100%.
+/// TODO(iteratee): Use 64-bit fixed point edge frequencies everywhere.
+static bool greaterWithBias(BlockFrequency A, BlockFrequency B,
+ uint64_t EntryFreq) {
+ BranchProbability ThresholdProb(TailDupPlacementPenalty, 100);
+ BlockFrequency Gain = A - B;
+ return (Gain / ThresholdProb).getFrequency() >= EntryFreq;
+}
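A worked instance of the comparison above (illustrative numbers, default penalty of 2):

    // ThresholdProb = 2/100, so (Gain / ThresholdProb) >= EntryFreq becomes
    // Gain * 50 >= EntryFreq: A must beat B by at least 2% of the entry
    // frequency. With EntryFreq = 1000:
    //   A = 515, B = 500 -> Gain = 15, 15 * 50 = 750  < 1000 -> not a win
    //   A = 525, B = 500 -> Gain = 25, 25 * 50 = 1250 >= 1000 -> a win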
+
+/// Check the edge frequencies to see if tail duplication will increase
+/// fallthroughs. It only makes sense to call this function when
+/// \p Succ would not be chosen otherwise. Tail duplication of \p Succ is
+/// always locally profitable if we would have picked \p Succ without
+/// considering duplication.
+bool MachineBlockPlacement::isProfitableToTailDup(
+ const MachineBasicBlock *BB, const MachineBasicBlock *Succ,
+ BranchProbability QProb,
+ const BlockChain &Chain, const BlockFilterSet *BlockFilter) {
+ // We need to do a probability calculation to make sure this is profitable.
+ // First: does succ have a successor that post-dominates? This affects the
+ // calculation. The 2 relevant cases are:
+ // BB BB
+ // | \Qout | \Qout
+ // P| C |P C
+ // = C' = C'
+ // | /Qin | /Qin
+ // | / | /
+ // Succ Succ
+ // / \ | \ V
+ // U/ =V |U \
+ // / \ = D
+ // D E | /
+ // | /
+ // |/
+ // PDom
+ // '=' : Branch taken for that CFG edge
+  // In the second case, placing Succ while duplicating it into C prevents the
+  // fallthrough of Succ into either D or PDom, because they now have C as an
+  // unplaced predecessor.
+
+ // Start by figuring out which case we fall into
+ MachineBasicBlock *PDom = nullptr;
+ SmallVector<MachineBasicBlock *, 4> SuccSuccs;
+ // Only scan the relevant successors
+ auto AdjustedSuccSumProb =
+ collectViableSuccessors(Succ, Chain, BlockFilter, SuccSuccs);
+ BranchProbability PProb = MBPI->getEdgeProbability(BB, Succ);
+ auto BBFreq = MBFI->getBlockFreq(BB);
+ auto SuccFreq = MBFI->getBlockFreq(Succ);
+ BlockFrequency P = BBFreq * PProb;
+ BlockFrequency Qout = BBFreq * QProb;
+ uint64_t EntryFreq = MBFI->getEntryFreq();
+ // If there are no more successors, it is profitable to copy, as it strictly
+ // increases fallthrough.
+ if (SuccSuccs.size() == 0)
+ return greaterWithBias(P, Qout, EntryFreq);
+
+ auto BestSuccSucc = BranchProbability::getZero();
+ // Find the PDom or the best Succ if no PDom exists.
+ for (MachineBasicBlock *SuccSucc : SuccSuccs) {
+ auto Prob = MBPI->getEdgeProbability(Succ, SuccSucc);
+ if (Prob > BestSuccSucc)
+ BestSuccSucc = Prob;
+ if (PDom == nullptr)
+ if (MPDT->dominates(SuccSucc, Succ)) {
+ PDom = SuccSucc;
+ break;
+ }
+ }
+ // For the comparisons, we need to know Succ's best incoming edge that isn't
+ // from BB.
+ auto SuccBestPred = BlockFrequency(0);
+ for (MachineBasicBlock *SuccPred : Succ->predecessors()) {
+ if (SuccPred == Succ || SuccPred == BB
+ || BlockToChain[SuccPred] == &Chain
+ || (BlockFilter && !BlockFilter->count(SuccPred)))
+ continue;
+ auto Freq = MBFI->getBlockFreq(SuccPred)
+ * MBPI->getEdgeProbability(SuccPred, Succ);
+ if (Freq > SuccBestPred)
+ SuccBestPred = Freq;
+ }
+ // Qin is Succ's best unplaced incoming edge that isn't BB
+ BlockFrequency Qin = SuccBestPred;
+ // If it doesn't have a post-dominating successor, here is the calculation:
+ // BB BB
+ // | \Qout | \
+ // P| C | =
+ // = C' | C
+ // | /Qin | |
+ // | / | C' (+Succ)
+ // Succ Succ /|
+ // / \ | \/ |
+ // U/ =V | == |
+ // / \ | / \|
+ // D E D E
+ // '=' : Branch taken for that CFG edge
+ // Cost in the first case is: P + V
+  // For this calculation, we always assume P > Qout. If Qout > P,
+  // the result of this function will be ignored at the caller.
+ // Let F = SuccFreq - Qin
+ // Cost in the second case is: Qout + min(Qin, F) * U + max(Qin, F) * V
+
+ if (PDom == nullptr || !Succ->isSuccessor(PDom)) {
+ BranchProbability UProb = BestSuccSucc;
+ BranchProbability VProb = AdjustedSuccSumProb - UProb;
+ BlockFrequency F = SuccFreq - Qin;
+ BlockFrequency V = SuccFreq * VProb;
+ BlockFrequency QinU = std::min(Qin, F) * UProb;
+ BlockFrequency BaseCost = P + V;
+ BlockFrequency DupCost = Qout + QinU + std::max(Qin, F) * VProb;
+ return greaterWithBias(BaseCost, DupCost, EntryFreq);
+ }
+ BranchProbability UProb = MBPI->getEdgeProbability(Succ, PDom);
+ BranchProbability VProb = AdjustedSuccSumProb - UProb;
+ BlockFrequency U = SuccFreq * UProb;
+ BlockFrequency V = SuccFreq * VProb;
+ BlockFrequency F = SuccFreq - Qin;
+ // If there is a post-dominating successor, here is the calculation:
+ // BB BB BB BB
+ // | \Qout | \ | \Qout | \
+ // |P C | = |P C | =
+ // = C' |P C = C' |P C
+ // | /Qin | | | /Qin | |
+ // | / | C' (+Succ) | / | C' (+Succ)
+ // Succ Succ /| Succ Succ /|
+ // | \ V | \/ | | \ V | \/ |
+ // |U \ |U /\ =? |U = |U /\ |
+ // = D = = =?| | D | = =|
+ // | / |/ D | / |/ D
+ // | / | / | = | /
+ // |/ | / |/ | =
+ // Dom Dom Dom Dom
+ // '=' : Branch taken for that CFG edge
+ // The cost for taken branches in the first case is P + U
+ // Let F = SuccFreq - Qin
+ // The cost in the second case (assuming independence), given the layout:
+ // BB, Succ, (C+Succ), D, Dom or the layout:
+ // BB, Succ, D, Dom, (C+Succ)
+ // is Qout + max(F, Qin) * U + min(F, Qin)
+ // compare P + U vs Qout + P * U + Qin.
+ //
+ // The 3rd and 4th cases cover when Dom would be chosen to follow Succ.
+ //
+ // For the 3rd case, the cost is P + 2 * V
+ // For the 4th case, the cost is Qout + min(Qin, F) * U + max(Qin, F) * V + V
+ // We choose 4 over 3 when (P + V) > Qout + min(Qin, F) * U + max(Qin, F) * V
+ if (UProb > AdjustedSuccSumProb / 2 &&
+ !hasBetterLayoutPredecessor(Succ, PDom, *BlockToChain[PDom], UProb, UProb,
+ Chain, BlockFilter))
+ // Cases 3 & 4
+ return greaterWithBias(
+ (P + V), (Qout + std::max(Qin, F) * VProb + std::min(Qin, F) * UProb),
+ EntryFreq);
+ // Cases 1 & 2
+ return greaterWithBias((P + U),
+ (Qout + std::min(Qin, F) * AdjustedSuccSumProb +
+ std::max(Qin, F) * UProb),
+ EntryFreq);
+}
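A worked instance of the no-post-dominator case above (illustrative numbers):

    // Let BBFreq = 100, PProb = 0.6, QProb = 0.4, SuccFreq = 100, Qin = 30,
    // UProb = 0.7, VProb = 0.3 (AdjustedSuccSumProb = 1). Then:
    //   P = 60, Qout = 40, V = 30, F = SuccFreq - Qin = 70
    //   BaseCost = P + V = 90
    //   DupCost  = Qout + min(Qin, F) * UProb + max(Qin, F) * VProb
    //            = 40 + 30 * 0.7 + 70 * 0.3 = 82
    // BaseCost exceeds DupCost, so duplication wins if the 8-unit gain also
    // clears the greaterWithBias threshold.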
+
+/// Check for a trellis layout. \p BB is the upper part of a trellis if its
+/// successors form the lower part of a trellis. A successor set S forms the
+/// lower part of a trellis if all of the predecessors of S are either in S or
+/// have all of S as successors. We ignore trellises where BB doesn't have 2
+/// successors because for fewer than 2, it's trivial, and for 3 or greater they
+/// are very uncommon and complex to compute optimally. Allowing edges within S
+/// is not strictly a trellis, but the same algorithm works, so we allow it.
+bool MachineBlockPlacement::isTrellis(
+ const MachineBasicBlock *BB,
+ const SmallVectorImpl<MachineBasicBlock *> &ViableSuccs,
+ const BlockChain &Chain, const BlockFilterSet *BlockFilter) {
+ // Technically BB could form a trellis with branching factor higher than 2.
+ // But that's extremely uncommon.
+ if (BB->succ_size() != 2 || ViableSuccs.size() != 2)
+ return false;
+
+ SmallPtrSet<const MachineBasicBlock *, 2> Successors(BB->succ_begin(),
+ BB->succ_end());
+ // To avoid reviewing the same predecessors twice.
+ SmallPtrSet<const MachineBasicBlock *, 8> SeenPreds;
+
+ for (MachineBasicBlock *Succ : ViableSuccs) {
+ int PredCount = 0;
+ for (auto SuccPred : Succ->predecessors()) {
+ // Allow triangle successors, but don't count them.
+ if (Successors.count(SuccPred)) {
+ // Make sure that it is actually a triangle.
+ for (MachineBasicBlock *CheckSucc : SuccPred->successors())
+ if (!Successors.count(CheckSucc))
+ return false;
continue;
- // Check whether the optional branch has exactly one BB.
- if (Pred->pred_size() > 1 || *Pred->pred_begin() != BB)
+ }
+ const BlockChain *PredChain = BlockToChain[SuccPred];
+ if (SuccPred == BB || (BlockFilter && !BlockFilter->count(SuccPred)) ||
+ PredChain == &Chain || PredChain == BlockToChain[Succ])
continue;
- // Check whether the optional branch is small.
- if (Pred->size() < OutlineOptionalThreshold)
+ ++PredCount;
+ // Perform the successor check only once.
+ if (!SeenPreds.insert(SuccPred).second)
+ continue;
+ if (!hasSameSuccessors(*SuccPred, Successors))
return false;
}
- return true;
- } else
+ // If one of the successors has only BB as a predecessor, it is not a
+ // trellis.
+ if (PredCount < 1)
+ return false;
+ }
+ return true;
+}
+
+/// Pick the highest total weight pair of edges that can both be laid out.
+/// The edges in \p Edges[0] are assumed to have a different destination than
+/// the edges in \p Edges[1]. Simple counting shows that the best pair is either
+/// the individual highest weight edges to the 2 different destinations, or in
+/// case of a conflict, one of them should be replaced with a 2nd best edge.
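+/// For example (illustrative weights): given edges into S1 of {A: 10, B: 7}
+/// and into S2 of {A: 9, C: 4}, the individually best edges conflict at A, so
+/// we compare (A->S1) + (C->S2) = 14 against (B->S1) + (A->S2) = 16 and keep
+/// the second pair.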
+std::pair<MachineBlockPlacement::WeightedEdge,
+ MachineBlockPlacement::WeightedEdge>
+MachineBlockPlacement::getBestNonConflictingEdges(
+ const MachineBasicBlock *BB,
+ MutableArrayRef<SmallVector<MachineBlockPlacement::WeightedEdge, 8>>
+ Edges) {
+ // Sort the edges, and then for each successor, find the best incoming
+ // predecessor. If the best incoming predecessors aren't the same,
+ // then that is clearly the best layout. If there is a conflict, one of the
+ // successors will have to fallthrough from the second best predecessor. We
+ // compare which combination is better overall.
+
+ // Sort for highest frequency.
+ auto Cmp = [](WeightedEdge A, WeightedEdge B) { return A.Weight > B.Weight; };
+
+ std::stable_sort(Edges[0].begin(), Edges[0].end(), Cmp);
+ std::stable_sort(Edges[1].begin(), Edges[1].end(), Cmp);
+ auto BestA = Edges[0].begin();
+ auto BestB = Edges[1].begin();
+ // Arrange for the correct answer to be in BestA and BestB
+ // If the 2 best edges don't conflict, the answer is already there.
+ if (BestA->Src == BestB->Src) {
+ // Compare the total fallthrough of (Best + Second Best) for both pairs
+ auto SecondBestA = std::next(BestA);
+ auto SecondBestB = std::next(BestB);
+ BlockFrequency BestAScore = BestA->Weight + SecondBestB->Weight;
+ BlockFrequency BestBScore = BestB->Weight + SecondBestA->Weight;
+ if (BestAScore < BestBScore)
+ BestA = SecondBestA;
+ else
+ BestB = SecondBestB;
+ }
+ // Arrange for the BB edge to be in BestA if it exists.
+ if (BestB->Src == BB)
+ std::swap(BestA, BestB);
+ return std::make_pair(*BestA, *BestB);
+}
+
+/// Get the best successor from \p BB based on \p BB being part of a trellis.
+/// We only handle trellises with 2 successors, so the algorithm is
+/// straightforward: Find the best pair of edges that don't conflict. We find
+/// the best incoming edge for each successor in the trellis. If those conflict,
+/// we consider which of them should be replaced with the second best.
+/// The two best edges are computed by getBestNonConflictingEdges; if one of
+/// them comes from \p BB, it is returned first and becomes the successor
+/// chosen here.
+MachineBlockPlacement::BlockAndTailDupResult
+MachineBlockPlacement::getBestTrellisSuccessor(
+ const MachineBasicBlock *BB,
+ const SmallVectorImpl<MachineBasicBlock *> &ViableSuccs,
+ BranchProbability AdjustedSumProb, const BlockChain &Chain,
+ const BlockFilterSet *BlockFilter) {
+
+ BlockAndTailDupResult Result = {nullptr, false};
+ SmallPtrSet<const MachineBasicBlock *, 4> Successors(BB->succ_begin(),
+ BB->succ_end());
+
+ // We assume size 2 because it's common. For general n, we would have to do
+ // the Hungarian algorithm, but it's not worth the complexity because more
+ // than 2 successors is fairly uncommon, and a trellis even more so.
+ if (Successors.size() != 2 || ViableSuccs.size() != 2)
+ return Result;
+
+ // Collect the edge frequencies of all edges that form the trellis.
+ SmallVector<WeightedEdge, 8> Edges[2];
+ int SuccIndex = 0;
+ for (auto Succ : ViableSuccs) {
+ for (MachineBasicBlock *SuccPred : Succ->predecessors()) {
+ // Skip any placed predecessors that are not BB
+ if (SuccPred != BB)
+ if ((BlockFilter && !BlockFilter->count(SuccPred)) ||
+ BlockToChain[SuccPred] == &Chain ||
+ BlockToChain[SuccPred] == BlockToChain[Succ])
+ continue;
+ BlockFrequency EdgeFreq = MBFI->getBlockFreq(SuccPred) *
+ MBPI->getEdgeProbability(SuccPred, Succ);
+ Edges[SuccIndex].push_back({EdgeFreq, SuccPred, Succ});
+ }
+ ++SuccIndex;
+ }
+
+ // Pick the best combination of 2 edges from all the edges in the trellis.
+ WeightedEdge BestA, BestB;
+ std::tie(BestA, BestB) = getBestNonConflictingEdges(BB, Edges);
+
+ if (BestA.Src != BB) {
+ // If we have a trellis, and BB doesn't have the best fallthrough edges,
+ // we shouldn't choose any successor. We've already looked and there's a
+ // better fallthrough edge for all the successors.
+ DEBUG(dbgs() << "Trellis, but not one of the chosen edges.\n");
+ return Result;
+ }
+
+ // Did we pick the triangle edge? If tail-duplication is profitable, do
+ // that instead. Otherwise merge the triangle edge now while we know it is
+ // optimal.
+ if (BestA.Dest == BestB.Src) {
+ // The edges are BB->Succ1->Succ2, and we're looking to see if BB->Succ2
+ // would be better.
+ MachineBasicBlock *Succ1 = BestA.Dest;
+ MachineBasicBlock *Succ2 = BestB.Dest;
+ // Check to see if tail-duplication would be profitable.
+ if (TailDupPlacement && shouldTailDuplicate(Succ2) &&
+ canTailDuplicateUnplacedPreds(BB, Succ2, Chain, BlockFilter) &&
+ isProfitableToTailDup(BB, Succ2, MBPI->getEdgeProbability(BB, Succ1),
+ Chain, BlockFilter)) {
+ DEBUG(BranchProbability Succ2Prob = getAdjustedProbability(
+ MBPI->getEdgeProbability(BB, Succ2), AdjustedSumProb);
+ dbgs() << " Selected: " << getBlockName(Succ2)
+ << ", probability: " << Succ2Prob << " (Tail Duplicate)\n");
+ Result.BB = Succ2;
+ Result.ShouldTailDup = true;
+ return Result;
+ }
+ }
+ // We have already computed the optimal edge for the other side of the
+ // trellis.
+ ComputedEdges[BestB.Src] = { BestB.Dest, false };
+
+ auto TrellisSucc = BestA.Dest;
+ DEBUG(BranchProbability SuccProb = getAdjustedProbability(
+ MBPI->getEdgeProbability(BB, TrellisSucc), AdjustedSumProb);
+ dbgs() << " Selected: " << getBlockName(TrellisSucc)
+ << ", probability: " << SuccProb << " (Trellis)\n");
+ Result.BB = TrellisSucc;
+ return Result;
+}
+
+/// When the option TailDupPlacement is on, this method checks if the
+/// fallthrough candidate block \p Succ (of block \p BB) can be tail-duplicated
+/// into all of its unplaced, unfiltered predecessors that are not BB.
+bool MachineBlockPlacement::canTailDuplicateUnplacedPreds(
+ const MachineBasicBlock *BB, MachineBasicBlock *Succ,
+ const BlockChain &Chain, const BlockFilterSet *BlockFilter) {
+ if (!shouldTailDuplicate(Succ))
return false;
+
+ // For CFG checking.
+ SmallPtrSet<const MachineBasicBlock *, 4> Successors(BB->succ_begin(),
+ BB->succ_end());
+ for (MachineBasicBlock *Pred : Succ->predecessors()) {
+ // Make sure all unplaced and unfiltered predecessors can be
+ // tail-duplicated into.
+ // Skip any blocks that are already placed or not in this loop.
+    if (Pred == BB || (BlockFilter && !BlockFilter->count(Pred)) ||
+        BlockToChain[Pred] == &Chain)
+ continue;
+ if (!TailDup.canTailDuplicate(Succ, Pred)) {
+ if (Successors.size() > 1 && hasSameSuccessors(*Pred, Successors))
+ // This will result in a trellis after tail duplication, so we don't
+ // need to copy Succ into this predecessor. In the presence
+ // of a trellis tail duplication can continue to be profitable.
+ // For example:
+ // A A
+ // |\ |\
+ // | \ | \
+ // | C | C+BB
+ // | / | |
+ // |/ | |
+ // BB => BB |
+ // |\ |\/|
+ // | \ |/\|
+ // | D | D
+ // | / | /
+ // |/ |/
+ // Succ Succ
+ //
+ // After BB was duplicated into C, the layout looks like the one on the
+ // right. BB and C now have the same successors. When considering
+ // whether Succ can be duplicated into all its unplaced predecessors, we
+ // ignore C.
+ // We can do this because C already has a profitable fallthrough, namely
+ // D. TODO(iteratee): ignore sufficiently cold predecessors for
+ // duplication and for this test.
+ //
+      // This allows trellises to be laid out in 2 separate chains
+      // (A,BB,Succ,...) and later (C,D,...). This is a reasonable heuristic
+ // because it allows the creation of 2 fallthrough paths with links
+ // between them, and we correctly identify the best layout for these
+ // CFGs. We want to extend trellises that the user created in addition
+ // to trellises created by tail-duplication, so we just look for the
+ // CFG.
+ continue;
+ return false;
+ }
+ }
+ return true;
+}
+
+/// Find chains of triangles where we believe it would be profitable to
+/// tail-duplicate them all, but a local analysis would not find them.
+/// There are 3 ways this can be profitable:
+/// 1) The post-dominators marked 50% are actually taken 55% of the time (this
+///    margin shrinks with longer chains)
+/// 2) The chains are statically correlated. Branch probabilities have a very
+/// U-shaped distribution.
+/// [http://nrs.harvard.edu/urn-3:HUL.InstRepos:24015805]
+/// If the branches in a chain are likely to be from the same side of the
+/// distribution as their predecessor, but are independent at runtime, this
+/// transformation is profitable. (Because the cost of being wrong is a small
+/// fixed cost, unlike the standard triangle layout where the cost of being
+/// wrong scales with the # of triangles.)
+/// 3) The chains are dynamically correlated. If the probability that a
+///    previous branch was taken positively influences whether the next branch
+///    will be taken, duplicating the chain is profitable, as in 2.
+/// We believe that 2 and 3 are common enough to justify the small margin in 1.
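+/// As a sketch of the shape being matched: A branches to B and C where C
+/// post-dominates A, and C branches to D and E where E post-dominates C; the
+/// chain records (A, C, E), and the edges A->C and C->E become pre-computed
+/// layout decisions.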
+void MachineBlockPlacement::precomputeTriangleChains() {
+ struct TriangleChain {
+ std::vector<MachineBasicBlock *> Edges;
+ TriangleChain(MachineBasicBlock *src, MachineBasicBlock *dst)
+ : Edges({src, dst}) {}
+
+ void append(MachineBasicBlock *dst) {
+ assert(getKey()->isSuccessor(dst) &&
+ "Attempting to append a block that is not a successor.");
+ Edges.push_back(dst);
+ }
+
+ unsigned count() const { return Edges.size() - 1; }
+
+ MachineBasicBlock *getKey() const {
+ return Edges.back();
+ }
+ };
+
+ if (TriangleChainCount == 0)
+ return;
+
+ DEBUG(dbgs() << "Pre-computing triangle chains.\n");
+ // Map from last block to the chain that contains it. This allows us to extend
+ // chains as we find new triangles.
+ DenseMap<const MachineBasicBlock *, TriangleChain> TriangleChainMap;
+ for (MachineBasicBlock &BB : *F) {
+ // If BB doesn't have 2 successors, it doesn't start a triangle.
+ if (BB.succ_size() != 2)
+ continue;
+ MachineBasicBlock *PDom = nullptr;
+ for (MachineBasicBlock *Succ : BB.successors()) {
+ if (!MPDT->dominates(Succ, &BB))
+ continue;
+ PDom = Succ;
+ break;
+ }
+ // If BB doesn't have a post-dominating successor, it doesn't form a
+ // triangle.
+ if (PDom == nullptr)
+ continue;
+ // If PDom has a hint that it is low probability, skip this triangle.
+ if (MBPI->getEdgeProbability(&BB, PDom) < BranchProbability(50, 100))
+ continue;
+ // If PDom isn't eligible for duplication, this isn't the kind of triangle
+ // we're looking for.
+ if (!shouldTailDuplicate(PDom))
+ continue;
+ bool CanTailDuplicate = true;
+  // If PDom can't be tail-duplicated into its non-BB predecessors, then this
+  // isn't the kind of triangle we're looking for.
+ for (MachineBasicBlock* Pred : PDom->predecessors()) {
+ if (Pred == &BB)
+ continue;
+ if (!TailDup.canTailDuplicate(PDom, Pred)) {
+ CanTailDuplicate = false;
+ break;
+ }
+ }
+ // If we can't tail-duplicate PDom to its predecessors, then skip this
+ // triangle.
+ if (!CanTailDuplicate)
+ continue;
+
+    // Now we have an interesting triangle. Insert it if it's not part of an
+    // existing chain.
+    // Note: This cannot be replaced with a call to insert() or emplace()
+    // because the find key is BB, but the insert/emplace key is PDom.
+ auto Found = TriangleChainMap.find(&BB);
+ // If it is, remove the chain from the map, grow it, and put it back in the
+ // map with the end as the new key.
+ if (Found != TriangleChainMap.end()) {
+ TriangleChain Chain = std::move(Found->second);
+ TriangleChainMap.erase(Found);
+ Chain.append(PDom);
+ TriangleChainMap.insert(std::make_pair(Chain.getKey(), std::move(Chain)));
+ } else {
+ auto InsertResult = TriangleChainMap.try_emplace(PDom, &BB, PDom);
+ assert(InsertResult.second && "Block seen twice.");
+ (void)InsertResult;
+ }
+ }
+
+ // Iterating over a DenseMap is safe here, because the only thing in the body
+ // of the loop is inserting into another DenseMap (ComputedEdges).
+ // ComputedEdges is never iterated, so this doesn't lead to non-determinism.
+ for (auto &ChainPair : TriangleChainMap) {
+ TriangleChain &Chain = ChainPair.second;
+    // Benchmarking has shown that, due to branch correlation, duplicating 2 or
+    // more triangles is profitable, despite the calculations assuming
+    // independence.
+ if (Chain.count() < TriangleChainCount)
+ continue;
+ MachineBasicBlock *dst = Chain.Edges.back();
+ Chain.Edges.pop_back();
+ for (MachineBasicBlock *src : reverse(Chain.Edges)) {
+ DEBUG(dbgs() << "Marking edge: " << getBlockName(src) << "->" <<
+ getBlockName(dst) << " as pre-computed based on triangles.\n");
+
+ auto InsertResult = ComputedEdges.insert({src, {dst, true}});
+ assert(InsertResult.second && "Block seen twice.");
+ (void)InsertResult;
+
+ dst = src;
+ }
+ }
}
// When profile is not present, return the StaticLikelyProb.
  // When profile is available, we need to handle the triangle-shaped CFG.
static BranchProbability getLayoutSuccessorProbThreshold(
- MachineBasicBlock *BB) {
+ const MachineBasicBlock *BB) {
if (!BB->getParent()->getFunction()->getEntryCount())
return BranchProbability(StaticLikelyProb, 100);
if (BB->succ_size() == 2) {
@@ -609,11 +1202,11 @@ static BranchProbability getLayoutSuccessorProbThreshold(
if (Succ1->isSuccessor(Succ2) || Succ2->isSuccessor(Succ1)) {
/* See case 1 below for the cost analysis. For BB->Succ to
* be taken with smaller cost, the following needs to hold:
- * Prob(BB->Succ) > 2* Prob(BB->Pred)
- * So the threshold T
- * T = 2 * (1-Prob(BB->Pred). Since T + Prob(BB->Pred) == 1,
- * We have T + T/2 = 1, i.e. T = 2/3. Also adding user specified
- * branch bias, we have
+ * Prob(BB->Succ) > 2 * Prob(BB->Pred)
+ * So the threshold T in the calculation below
+ * (1-T) * Prob(BB->Succ) > T * Prob(BB->Pred)
+       * So T / (1 - T) = 2, yielding T = 2/3.
+       * Also adding the user-specified branch bias, we have
+       * T = (2/3) * (ProfileLikelyProb/50)
+       *   = (2*ProfileLikelyProb)/150
*/
@@ -625,10 +1218,17 @@ static BranchProbability getLayoutSuccessorProbThreshold(
/// Checks to see if the layout candidate block \p Succ has a better layout
/// predecessor than \c BB. If yes, returns true.
+/// \p SuccProb: The probability adjusted for only the remaining blocks.
+///    Only used for logging.
+/// \p RealSuccProb: The unadjusted probability.
+/// \p Chain: The chain that BB belongs to and Succ is being considered for.
+/// \p BlockFilter: If non-null, the set of blocks that make up the loop being
+///    considered.
bool MachineBlockPlacement::hasBetterLayoutPredecessor(
- MachineBasicBlock *BB, MachineBasicBlock *Succ, BlockChain &SuccChain,
- BranchProbability SuccProb, BranchProbability RealSuccProb,
- BlockChain &Chain, const BlockFilterSet *BlockFilter) {
+ const MachineBasicBlock *BB, const MachineBasicBlock *Succ,
+ const BlockChain &SuccChain, BranchProbability SuccProb,
+ BranchProbability RealSuccProb, const BlockChain &Chain,
+ const BlockFilterSet *BlockFilter) {
// There isn't a better layout when there are no unscheduled predecessors.
if (SuccChain.UnscheduledPredecessors == 0)
@@ -734,11 +1334,12 @@ bool MachineBlockPlacement::hasBetterLayoutPredecessor(
// | Pred----| | S1----
// | | | |
// --(S1 or S2) ---Pred--
+ // |
+ // S2
//
// topo-cost = freq(S->Pred) + freq(BB->S1) + freq(BB->S2)
// + min(freq(Pred->S1), freq(Pred->S2))
// Non-topo-order cost:
- // In the worst case, S2 will not get laid out after Pred.
// non-topo-cost = 2 * freq(S->Pred) + freq(BB->S2).
// To be conservative, we can assume that min(freq(Pred->S1), freq(Pred->S2))
 // is 0. Then the non-topo layout is better when
@@ -756,13 +1357,15 @@ bool MachineBlockPlacement::hasBetterLayoutPredecessor(
for (MachineBasicBlock *Pred : Succ->predecessors()) {
if (Pred == Succ || BlockToChain[Pred] == &SuccChain ||
(BlockFilter && !BlockFilter->count(Pred)) ||
- BlockToChain[Pred] == &Chain)
+ BlockToChain[Pred] == &Chain ||
+        // This check is redundant except for lookahead. This function is
+ // called for lookahead by isProfitableToTailDup when BB hasn't been
+ // placed yet.
+ (Pred == BB))
continue;
// Do backward checking.
 // For all cases above, we need backward checking to filter out edges that
- // are not 'strongly' biased. With profile data available, the check is
- // mostly redundant for case 2 (when threshold prob is set at 50%) unless S
- // has more than two successors.
+ // are not 'strongly' biased.
// BB Pred
// \ /
// Succ
@@ -798,14 +1401,15 @@ bool MachineBlockPlacement::hasBetterLayoutPredecessor(
/// breaking CFG structure, but cave and break such structures in the case of
/// very hot successor edges.
///
-/// \returns The best successor block found, or null if none are viable.
-MachineBasicBlock *
-MachineBlockPlacement::selectBestSuccessor(MachineBasicBlock *BB,
- BlockChain &Chain,
- const BlockFilterSet *BlockFilter) {
+/// \returns The best successor block found, or null if none are viable, along
+/// with a boolean indicating if tail duplication is necessary.
+MachineBlockPlacement::BlockAndTailDupResult
+MachineBlockPlacement::selectBestSuccessor(
+ const MachineBasicBlock *BB, const BlockChain &Chain,
+ const BlockFilterSet *BlockFilter) {
const BranchProbability HotProb(StaticLikelyProb, 100);
- MachineBasicBlock *BestSucc = nullptr;
+ BlockAndTailDupResult BestSucc = { nullptr, false };
auto BestProb = BranchProbability::getZero();
SmallVector<MachineBasicBlock *, 4> Successors;
@@ -813,22 +1417,45 @@ MachineBlockPlacement::selectBestSuccessor(MachineBasicBlock *BB,
collectViableSuccessors(BB, Chain, BlockFilter, Successors);
DEBUG(dbgs() << "Selecting best successor for: " << getBlockName(BB) << "\n");
+
+  // If we already precomputed the best successor for BB, return that if still
+  // applicable.
+  auto FoundEdge = ComputedEdges.find(BB);
+  if (FoundEdge != ComputedEdges.end()) {
+    // Copy the result out before erasing; the iterator and its mapped value
+    // are invalid once the entry is removed.
+    BlockAndTailDupResult Precomputed = FoundEdge->second;
+    MachineBasicBlock *Succ = Precomputed.BB;
+    ComputedEdges.erase(FoundEdge);
+    BlockChain *SuccChain = BlockToChain[Succ];
+    if (BB->isSuccessor(Succ) && (!BlockFilter || BlockFilter->count(Succ)) &&
+        SuccChain != &Chain && Succ == *SuccChain->begin())
+      return Precomputed;
+ }
+
+  // If BB is part of a trellis, use the trellis to determine the optimal
+  // fallthrough edges.
+ if (isTrellis(BB, Successors, Chain, BlockFilter))
+ return getBestTrellisSuccessor(BB, Successors, AdjustedSumProb, Chain,
+ BlockFilter);
+
+ // For blocks with CFG violations, we may be able to lay them out anyway with
+ // tail-duplication. We keep this vector so we can perform the probability
+ // calculations the minimum number of times.
+ SmallVector<std::tuple<BranchProbability, MachineBasicBlock *>, 4>
+ DupCandidates;
for (MachineBasicBlock *Succ : Successors) {
auto RealSuccProb = MBPI->getEdgeProbability(BB, Succ);
BranchProbability SuccProb =
getAdjustedProbability(RealSuccProb, AdjustedSumProb);
- // This heuristic is off by default.
- if (shouldPredBlockBeOutlined(BB, Succ, Chain, BlockFilter, SuccProb,
- HotProb))
- return Succ;
-
BlockChain &SuccChain = *BlockToChain[Succ];
// Skip the edge \c BB->Succ if block \c Succ has a better layout
// predecessor that yields lower global cost.
if (hasBetterLayoutPredecessor(BB, Succ, SuccChain, SuccProb, RealSuccProb,
- Chain, BlockFilter))
+ Chain, BlockFilter)) {
+ // If tail duplication would make Succ profitable, place it.
+ if (TailDupPlacement && shouldTailDuplicate(Succ))
+ DupCandidates.push_back(std::make_tuple(SuccProb, Succ));
continue;
+ }
DEBUG(
dbgs() << " Candidate: " << getBlockName(Succ) << ", probability: "
@@ -836,17 +1463,48 @@ MachineBlockPlacement::selectBestSuccessor(MachineBasicBlock *BB,
<< (SuccChain.UnscheduledPredecessors != 0 ? " (CFG break)" : "")
<< "\n");
- if (BestSucc && BestProb >= SuccProb) {
+ if (BestSucc.BB && BestProb >= SuccProb) {
DEBUG(dbgs() << " Not the best candidate, continuing\n");
continue;
}
DEBUG(dbgs() << " Setting it as best candidate\n");
- BestSucc = Succ;
+ BestSucc.BB = Succ;
BestProb = SuccProb;
}
- if (BestSucc)
- DEBUG(dbgs() << " Selected: " << getBlockName(BestSucc) << "\n");
+ // Handle the tail duplication candidates in order of decreasing probability.
+ // Stop at the first one that is profitable. Also stop if they are less
+ // profitable than BestSucc. Position is important because we preserve it and
+ // prefer first best match. Here we aren't comparing in order, so we capture
+ // the position instead.
+  if (!DupCandidates.empty()) {
+ auto cmp =
+ [](const std::tuple<BranchProbability, MachineBasicBlock *> &a,
+ const std::tuple<BranchProbability, MachineBasicBlock *> &b) {
+ return std::get<0>(a) > std::get<0>(b);
+ };
+ std::stable_sort(DupCandidates.begin(), DupCandidates.end(), cmp);
+ }
+  for (auto &Tup : DupCandidates) {
+ BranchProbability DupProb;
+ MachineBasicBlock *Succ;
+ std::tie(DupProb, Succ) = Tup;
+ if (DupProb < BestProb)
+ break;
+    if (canTailDuplicateUnplacedPreds(BB, Succ, Chain, BlockFilter) &&
+        isProfitableToTailDup(BB, Succ, BestProb, Chain, BlockFilter)) {
+ DEBUG(
+ dbgs() << " Candidate: " << getBlockName(Succ) << ", probability: "
+ << DupProb
+ << " (Tail Duplicate)\n");
+ BestSucc.BB = Succ;
+ BestSucc.ShouldTailDup = true;
+ break;
+ }
+ }
+
+ if (BestSucc.BB)
+ DEBUG(dbgs() << " Selected: " << getBlockName(BestSucc.BB) << "\n");
return BestSucc;
}
@@ -862,7 +1520,7 @@ MachineBlockPlacement::selectBestSuccessor(MachineBasicBlock *BB,
///
/// \returns The best block found, or null if none are viable.
MachineBasicBlock *MachineBlockPlacement::selectBestCandidateBlock(
- BlockChain &Chain, SmallVectorImpl<MachineBasicBlock *> &WorkList) {
+ const BlockChain &Chain, SmallVectorImpl<MachineBasicBlock *> &WorkList) {
// Once we need to walk the worklist looking for a candidate, cleanup the
// worklist of already placed entries.
// FIXME: If this shows up on profiles, it could be folded (at the cost of
@@ -948,7 +1606,7 @@ MachineBasicBlock *MachineBlockPlacement::getFirstUnplacedBlock(
}
void MachineBlockPlacement::fillWorkLists(
- MachineBasicBlock *MBB,
+ const MachineBasicBlock *MBB,
SmallPtrSetImpl<BlockChain *> &UpdatedPreds,
const BlockFilterSet *BlockFilter = nullptr) {
BlockChain &Chain = *BlockToChain[MBB];
@@ -970,23 +1628,23 @@ void MachineBlockPlacement::fillWorkLists(
if (Chain.UnscheduledPredecessors != 0)
return;
- MBB = *Chain.begin();
- if (MBB->isEHPad())
- EHPadWorkList.push_back(MBB);
+ MachineBasicBlock *BB = *Chain.begin();
+ if (BB->isEHPad())
+ EHPadWorkList.push_back(BB);
else
- BlockWorkList.push_back(MBB);
+ BlockWorkList.push_back(BB);
}
void MachineBlockPlacement::buildChain(
- MachineBasicBlock *BB, BlockChain &Chain,
+ const MachineBasicBlock *HeadBB, BlockChain &Chain,
BlockFilterSet *BlockFilter) {
- assert(BB && "BB must not be null.\n");
- assert(BlockToChain[BB] == &Chain && "BlockToChainMap mis-match.\n");
+  assert(HeadBB && "HeadBB must not be null.\n");
+ assert(BlockToChain[HeadBB] == &Chain && "BlockToChainMap mis-match.\n");
MachineFunction::iterator PrevUnplacedBlockIt = F->begin();
- MachineBasicBlock *LoopHeaderBB = BB;
+ const MachineBasicBlock *LoopHeaderBB = HeadBB;
markChainSuccessors(Chain, LoopHeaderBB, BlockFilter);
- BB = *std::prev(Chain.end());
+ MachineBasicBlock *BB = *std::prev(Chain.end());
for (;;) {
assert(BB && "null block found at end of chain in loop.");
assert(BlockToChain[BB] == &Chain && "BlockToChainMap mis-match in loop.");
@@ -995,7 +1653,11 @@ void MachineBlockPlacement::buildChain(
// Look for the best viable successor if there is one to place immediately
// after this block.
- MachineBasicBlock *BestSucc = selectBestSuccessor(BB, Chain, BlockFilter);
+ auto Result = selectBestSuccessor(BB, Chain, BlockFilter);
+  MachineBasicBlock *BestSucc = Result.BB;
+ bool ShouldTailDup = Result.ShouldTailDup;
+ if (TailDupPlacement)
+ ShouldTailDup |= (BestSucc && shouldTailDuplicate(BestSucc));
// If an immediate successor isn't available, look for the best viable
// block among those we've identified as not violating the loop's CFG at
@@ -1016,7 +1678,7 @@ void MachineBlockPlacement::buildChain(
// Placement may have changed tail duplication opportunities.
// Check for that now.
- if (TailDupPlacement && BestSucc) {
+ if (TailDupPlacement && BestSucc && ShouldTailDup) {
// If the chosen successor was duplicated into all its predecessors,
// don't bother laying it out, just go round the loop again with BB as
// the chain end.
@@ -1052,7 +1714,7 @@ void MachineBlockPlacement::buildChain(
/// unconditional jump (for the backedge) rotating it in front of the loop
/// header is always profitable.
MachineBasicBlock *
-MachineBlockPlacement::findBestLoopTop(MachineLoop &L,
+MachineBlockPlacement::findBestLoopTop(const MachineLoop &L,
const BlockFilterSet &LoopBlockSet) {
// Placing the latch block before the header may introduce an extra branch
// that skips this block the first time the loop is executed, which we want
@@ -1116,7 +1778,7 @@ MachineBlockPlacement::findBestLoopTop(MachineLoop &L,
/// block to layout at the top of the loop. Typically this is done to maximize
/// fallthrough opportunities.
MachineBasicBlock *
-MachineBlockPlacement::findBestLoopExit(MachineLoop &L,
+MachineBlockPlacement::findBestLoopExit(const MachineLoop &L,
const BlockFilterSet &LoopBlockSet) {
// We don't want to layout the loop linearly in all cases. If the loop header
// is just a normal basic block in the loop, we want to look for what block
@@ -1235,7 +1897,7 @@ MachineBlockPlacement::findBestLoopExit(MachineLoop &L,
/// branches. For example, if the loop has fallthrough into its header and out
/// of its bottom already, don't rotate it.
void MachineBlockPlacement::rotateLoop(BlockChain &LoopChain,
- MachineBasicBlock *ExitingBB,
+ const MachineBasicBlock *ExitingBB,
const BlockFilterSet &LoopBlockSet) {
if (!ExitingBB)
return;
@@ -1285,7 +1947,8 @@ void MachineBlockPlacement::rotateLoop(BlockChain &LoopChain,
/// Therefore, the cost for a given rotation is the sum of costs listed above.
/// We select the best rotation with the smallest cost.
void MachineBlockPlacement::rotateLoopWithProfile(
- BlockChain &LoopChain, MachineLoop &L, const BlockFilterSet &LoopBlockSet) {
+ BlockChain &LoopChain, const MachineLoop &L,
+ const BlockFilterSet &LoopBlockSet) {
auto HeaderBB = L.getHeader();
auto HeaderIter = find(LoopChain, HeaderBB);
auto RotationPos = LoopChain.end();
@@ -1422,7 +2085,7 @@ void MachineBlockPlacement::rotateLoopWithProfile(
/// When profile data is available, exclude cold blocks from the returned set;
/// otherwise, collect all blocks in the loop.
MachineBlockPlacement::BlockFilterSet
-MachineBlockPlacement::collectLoopBlockSet(MachineLoop &L) {
+MachineBlockPlacement::collectLoopBlockSet(const MachineLoop &L) {
BlockFilterSet LoopBlockSet;
// Filter cold blocks off from LoopBlockSet when profile data is available.
@@ -1459,10 +2122,10 @@ MachineBlockPlacement::collectLoopBlockSet(MachineLoop &L) {
/// as much as possible. We can then stitch the chains together in a way which
/// both preserves the topological structure and minimizes taken conditional
/// branches.
-void MachineBlockPlacement::buildLoopChains(MachineLoop &L) {
+void MachineBlockPlacement::buildLoopChains(const MachineLoop &L) {
// First recurse through any nested loops, building chains for those inner
// loops.
- for (MachineLoop *InnerLoop : L)
+ for (const MachineLoop *InnerLoop : L)
buildLoopChains(*InnerLoop);
assert(BlockWorkList.empty());
@@ -1499,7 +2162,7 @@ void MachineBlockPlacement::buildLoopChains(MachineLoop &L) {
assert(LoopChain.UnscheduledPredecessors == 0);
UpdatedPreds.insert(&LoopChain);
- for (MachineBasicBlock *LoopBB : LoopBlockSet)
+ for (const MachineBasicBlock *LoopBB : LoopBlockSet)
fillWorkLists(LoopBB, UpdatedPreds, &LoopBlockSet);
buildChain(LoopTop, LoopChain, &LoopBlockSet);
@@ -1533,7 +2196,7 @@ void MachineBlockPlacement::buildLoopChains(MachineLoop &L) {
if (!LoopBlockSet.empty()) {
BadLoop = true;
- for (MachineBasicBlock *LoopBB : LoopBlockSet)
+ for (const MachineBasicBlock *LoopBB : LoopBlockSet)
dbgs() << "Loop contains blocks never placed into a chain!\n"
<< " Loop header: " << getBlockName(*L.block_begin()) << "\n"
<< " Chain header: " << getBlockName(*LoopChain.begin()) << "\n"
@@ -1546,31 +2209,6 @@ void MachineBlockPlacement::buildLoopChains(MachineLoop &L) {
EHPadWorkList.clear();
}
-/// When OutlineOpitonalBranches is on, this method collects BBs that
-/// dominates all terminator blocks of the function \p F.
-void MachineBlockPlacement::collectMustExecuteBBs() {
- if (OutlineOptionalBranches) {
- // Find the nearest common dominator of all of F's terminators.
- MachineBasicBlock *Terminator = nullptr;
- for (MachineBasicBlock &MBB : *F) {
- if (MBB.succ_size() == 0) {
- if (Terminator == nullptr)
- Terminator = &MBB;
- else
- Terminator = MDT->findNearestCommonDominator(Terminator, &MBB);
- }
- }
-
- // MBBs dominating this common dominator are unavoidable.
- UnavoidableBlocks.clear();
- for (MachineBasicBlock &MBB : *F) {
- if (MDT->dominates(&MBB, Terminator)) {
- UnavoidableBlocks.insert(&MBB);
- }
- }
- }
-}
-
void MachineBlockPlacement::buildCFGChains() {
// Ensure that every BB in the function has an associated chain to simplify
// the assumptions of the remaining algorithm.
@@ -1605,9 +2243,6 @@ void MachineBlockPlacement::buildCFGChains() {
}
}
- // Turned on with OutlineOptionalBranches option
- collectMustExecuteBBs();
-
// Build any loop-based chains.
PreferredLoopExit = nullptr;
for (MachineLoop *L : *MLI)
@@ -1839,7 +2474,7 @@ void MachineBlockPlacement::alignBlocks() {
/// @return true if \p BB was removed.
bool MachineBlockPlacement::repeatedlyTailDuplicateBlock(
MachineBasicBlock *BB, MachineBasicBlock *&LPred,
- MachineBasicBlock *LoopHeaderBB,
+ const MachineBasicBlock *LoopHeaderBB,
BlockChain &Chain, BlockFilterSet *BlockFilter,
MachineFunction::iterator &PrevUnplacedBlockIt) {
bool Removed, DuplicatedToLPred;
@@ -1901,21 +2536,16 @@ bool MachineBlockPlacement::repeatedlyTailDuplicateBlock(
/// \return - True if the block was duplicated into all preds and removed.
bool MachineBlockPlacement::maybeTailDuplicateBlock(
MachineBasicBlock *BB, MachineBasicBlock *LPred,
- const BlockChain &Chain, BlockFilterSet *BlockFilter,
+ BlockChain &Chain, BlockFilterSet *BlockFilter,
MachineFunction::iterator &PrevUnplacedBlockIt,
bool &DuplicatedToLPred) {
-
DuplicatedToLPred = false;
+ if (!shouldTailDuplicate(BB))
+ return false;
+
DEBUG(dbgs() << "Redoing tail duplication for Succ#"
<< BB->getNumber() << "\n");
- bool IsSimple = TailDup.isSimpleBB(BB);
- // Blocks with single successors don't create additional fallthrough
- // opportunities. Don't duplicate them. TODO: When conditional exits are
- // analyzable, allow them to be duplicated.
- if (!IsSimple && BB->succ_size() == 1)
- return false;
- if (!TailDup.shouldTailDuplicate(IsSimple, *BB))
- return false;
+
// This has to be a callback because none of it can be done after
// BB is deleted.
bool Removed = false;
@@ -1967,6 +2597,7 @@ bool MachineBlockPlacement::maybeTailDuplicateBlock(
llvm::function_ref<void(MachineBasicBlock*)>(RemovalCallback);
SmallVector<MachineBasicBlock *, 8> DuplicatedPreds;
+ bool IsSimple = TailDup.isSimpleBB(BB);
TailDup.tailDuplicateAndUpdate(IsSimple, BB, LPred,
&DuplicatedPreds, &RemovalCallbackRef);
@@ -2006,21 +2637,24 @@ bool MachineBlockPlacement::runOnMachineFunction(MachineFunction &MF) {
MLI = &getAnalysis<MachineLoopInfo>();
TII = MF.getSubtarget().getInstrInfo();
TLI = MF.getSubtarget().getTargetLowering();
- MDT = &getAnalysis<MachineDominatorTree>();
+ MPDT = nullptr;
// Initialize PreferredLoopExit to nullptr here since it may never be set if
// there are no MachineLoops.
PreferredLoopExit = nullptr;
+ assert(BlockToChain.empty());
+ assert(ComputedEdges.empty());
+
if (TailDupPlacement) {
- unsigned TailDupSize = TailDuplicatePlacementThreshold;
+ MPDT = &getAnalysis<MachinePostDominatorTree>();
+ unsigned TailDupSize = TailDupPlacementThreshold;
if (MF.getFunction()->optForSize())
TailDupSize = 1;
TailDup.initMF(MF, MBPI, /* LayoutMode */ true, TailDupSize);
+ precomputeTriangleChains();
}
- assert(BlockToChain.empty());
-
buildCFGChains();
// Changing the layout can create new tail merging opportunities.
@@ -2032,7 +2666,7 @@ bool MachineBlockPlacement::runOnMachineFunction(MachineFunction &MF) {
BranchFoldPlacement;
// No tail merging opportunities if the block number is less than four.
if (MF.size() > 3 && EnableTailMerge) {
- unsigned TailMergeSize = TailDuplicatePlacementThreshold + 1;
+ unsigned TailMergeSize = TailDupPlacementThreshold + 1;
BranchFolder BF(/*EnableTailMerge=*/true, /*CommonHoist=*/false, *MBFI,
*MBPI, TailMergeSize);
@@ -2041,8 +2675,10 @@ bool MachineBlockPlacement::runOnMachineFunction(MachineFunction &MF) {
/*AfterBlockPlacement=*/true)) {
// Redo the layout if tail merging creates/removes/moves blocks.
BlockToChain.clear();
- // Must redo the dominator tree if blocks were changed.
- MDT->runOnMachineFunction(MF);
+ ComputedEdges.clear();
+ // Must redo the post-dominator tree if blocks were changed.
+ if (MPDT)
+ MPDT->runOnMachineFunction(MF);
ChainAllocator.DestroyAll();
buildCFGChains();
}
@@ -2052,6 +2688,7 @@ bool MachineBlockPlacement::runOnMachineFunction(MachineFunction &MF) {
alignBlocks();
BlockToChain.clear();
+ ComputedEdges.clear();
ChainAllocator.DestroyAll();
if (AlignAllBlock)
@@ -2067,6 +2704,12 @@ bool MachineBlockPlacement::runOnMachineFunction(MachineFunction &MF) {
MBI->setAlignment(AlignAllNonFallThruBlocks);
}
}
+ if (ViewBlockLayoutWithBFI != GVDT_None &&
+ (ViewBlockFreqFuncName.empty() ||
+ F->getFunction()->getName().equals(ViewBlockFreqFuncName))) {
+ MBFI->view("MBP." + MF.getName(), false);
+ }
+
// We always return true as we have no way to track whether the final order
// differs from the original order.
diff --git a/lib/CodeGen/MachineCombiner.cpp b/lib/CodeGen/MachineCombiner.cpp
index 5beed5f5dd08..50e453e4067c 100644
--- a/lib/CodeGen/MachineCombiner.cpp
+++ b/lib/CodeGen/MachineCombiner.cpp
@@ -8,7 +8,7 @@
//===----------------------------------------------------------------------===//
//
// The machine combiner pass uses machine trace metrics to ensure the combined
-// instructions does not lengthen the critical path or the resource depth.
+// instructions do not lengthen the critical path or the resource depth.
//===----------------------------------------------------------------------===//
#define DEBUG_TYPE "machine-combiner"
@@ -135,7 +135,9 @@ MachineCombiner::getDepth(SmallVectorImpl<MachineInstr *> &InsInstrs,
// are tracked in the InstrIdxForVirtReg map depth is looked up in InstrDepth
for (auto *InstrPtr : InsInstrs) { // for each Use
unsigned IDepth = 0;
- DEBUG(dbgs() << "NEW INSTR "; InstrPtr->dump(TII); dbgs() << "\n";);
+ DEBUG(dbgs() << "NEW INSTR ";
+ InstrPtr->print(dbgs(), TII);
+ dbgs() << "\n";);
for (const MachineOperand &MO : InstrPtr->operands()) {
// Check for virtual register operand.
if (!(MO.isReg() && TargetRegisterInfo::isVirtualRegister(MO.getReg())))
@@ -352,6 +354,19 @@ bool MachineCombiner::doSubstitute(unsigned NewSize, unsigned OldSize) {
return false;
}
+static void insertDeleteInstructions(MachineBasicBlock *MBB, MachineInstr &MI,
+                                     SmallVectorImpl<MachineInstr *> &InsInstrs,
+                                     SmallVectorImpl<MachineInstr *> &DelInstrs,
+                                     MachineTraceMetrics *Traces) {
+ for (auto *InstrPtr : InsInstrs)
+ MBB->insert((MachineBasicBlock::iterator)&MI, InstrPtr);
+ for (auto *InstrPtr : DelInstrs)
+ InstrPtr->eraseFromParentAndMarkDBGValuesForRemoval();
+ ++NumInstCombined;
+ Traces->invalidate(MBB);
+ Traces->verifyAnalysis();
+}
+
/// Substitute a slow code sequence with a faster one by
/// evaluating instruction combining pattern.
/// The prototype of such a pattern is MUl + ADD -> MADD. Performs instruction
@@ -406,7 +421,6 @@ bool MachineCombiner::combineInstructions(MachineBasicBlock *MBB) {
DenseMap<unsigned, unsigned> InstrIdxForVirtReg;
if (!MinInstr)
MinInstr = Traces->getEnsemble(MachineTraceMetrics::TS_MinInstrCount);
- MachineTraceMetrics::Trace BlockTrace = MinInstr->getTrace(MBB);
Traces->verifyAnalysis();
TII->genAlternativeCodeSequence(MI, P, InsInstrs, DelInstrs,
InstrIdxForVirtReg);
@@ -426,23 +440,23 @@ bool MachineCombiner::combineInstructions(MachineBasicBlock *MBB) {
// fewer instructions OR
// the new sequence neither lengthens the critical path nor increases
// resource pressure.
- if (SubstituteAlways || doSubstitute(NewInstCount, OldInstCount) ||
- (improvesCriticalPathLen(MBB, &MI, BlockTrace, InsInstrs,
- DelInstrs, InstrIdxForVirtReg, P) &&
- preservesResourceLen(MBB, BlockTrace, InsInstrs, DelInstrs))) {
- for (auto *InstrPtr : InsInstrs)
- MBB->insert((MachineBasicBlock::iterator) &MI, InstrPtr);
- for (auto *InstrPtr : DelInstrs)
- InstrPtr->eraseFromParentAndMarkDBGValuesForRemoval();
-
- Changed = true;
- ++NumInstCombined;
-
- Traces->invalidate(MBB);
- Traces->verifyAnalysis();
+ if (SubstituteAlways || doSubstitute(NewInstCount, OldInstCount)) {
+ insertDeleteInstructions(MBB, MI, InsInstrs, DelInstrs, Traces);
// Eagerly stop after the first pattern fires.
+ Changed = true;
break;
} else {
+ // Calculating the trace metrics may be expensive,
+ // so only do this when necessary.
+ MachineTraceMetrics::Trace BlockTrace = MinInstr->getTrace(MBB);
+ if (improvesCriticalPathLen(MBB, &MI, BlockTrace, InsInstrs, DelInstrs,
+ InstrIdxForVirtReg, P) &&
+ preservesResourceLen(MBB, BlockTrace, InsInstrs, DelInstrs)) {
+ insertDeleteInstructions(MBB, MI, InsInstrs, DelInstrs, Traces);
+ // Eagerly stop after the first pattern fires.
+ Changed = true;
+ break;
+ }
// Cleanup instructions of the alternative code sequence. There is no
// use for them.
MachineFunction *MF = MBB->getParent();
diff --git a/lib/CodeGen/MachineCopyPropagation.cpp b/lib/CodeGen/MachineCopyPropagation.cpp
index 5de6dec29fb9..7312dc5e94bd 100644
--- a/lib/CodeGen/MachineCopyPropagation.cpp
+++ b/lib/CodeGen/MachineCopyPropagation.cpp
@@ -291,17 +291,9 @@ void MachineCopyPropagation::CopyPropagateBlock(MachineBasicBlock &MBB) {
if (MO.isDef()) {
Defs.push_back(Reg);
- } else {
+ continue;
+ } else if (MO.readsReg())
ReadRegister(Reg);
- }
- // Treat undef use like defs for copy propagation but not for
- // dead copy. We would need to do a liveness check to be sure the copy
- // is dead for undef uses.
- // The backends are allowed to do whatever they want with undef value
- // and we cannot be sure this register will not be rewritten to break
- // some false dependencies for the hardware for instance.
- if (MO.isUndef())
- Defs.push_back(Reg);
}
// The instruction has a register mask operand which means that it clobbers
diff --git a/lib/CodeGen/MachineDominators.cpp b/lib/CodeGen/MachineDominators.cpp
index 303a6a9263be..e3a6c51c47ad 100644
--- a/lib/CodeGen/MachineDominators.cpp
+++ b/lib/CodeGen/MachineDominators.cpp
@@ -49,32 +49,29 @@ void MachineDominatorTree::getAnalysisUsage(AnalysisUsage &AU) const {
bool MachineDominatorTree::runOnMachineFunction(MachineFunction &F) {
CriticalEdgesToSplit.clear();
NewBBs.clear();
+ DT.reset(new DominatorTreeBase<MachineBasicBlock>(false));
DT->recalculate(F);
-
return false;
}
MachineDominatorTree::MachineDominatorTree()
: MachineFunctionPass(ID) {
initializeMachineDominatorTreePass(*PassRegistry::getPassRegistry());
- DT = new DominatorTreeBase<MachineBasicBlock>(false);
-}
-
-MachineDominatorTree::~MachineDominatorTree() {
- delete DT;
}
void MachineDominatorTree::releaseMemory() {
- DT->releaseMemory();
+ CriticalEdgesToSplit.clear();
+ DT.reset(nullptr);
}
void MachineDominatorTree::verifyAnalysis() const {
- if (VerifyMachineDomInfo)
+ if (DT && VerifyMachineDomInfo)
verifyDomTree();
}
void MachineDominatorTree::print(raw_ostream &OS, const Module*) const {
- DT->print(OS);
+ if (DT)
+ DT->print(OS);
}
void MachineDominatorTree::applySplitCriticalEdges() const {
@@ -143,15 +140,18 @@ void MachineDominatorTree::applySplitCriticalEdges() const {
}
void MachineDominatorTree::verifyDomTree() const {
+ if (!DT)
+ return;
MachineFunction &F = *getRoot()->getParent();
- MachineDominatorTree OtherDT;
- OtherDT.DT->recalculate(F);
- if (compare(OtherDT)) {
+ DominatorTreeBase<MachineBasicBlock> OtherDT(false);
+ OtherDT.recalculate(F);
+ if (getRootNode()->getBlock() != OtherDT.getRootNode()->getBlock() ||
+ DT->compare(OtherDT)) {
errs() << "MachineDominatorTree is not up to date!\nComputed:\n";
- print(errs(), nullptr);
+ DT->print(errs());
errs() << "\nActual:\n";
- OtherDT.print(errs(), nullptr);
+ OtherDT.print(errs());
abort();
}
}
diff --git a/lib/CodeGen/MachineFunction.cpp b/lib/CodeGen/MachineFunction.cpp
index c1d5ea96cd17..c9767a25e908 100644
--- a/lib/CodeGen/MachineFunction.cpp
+++ b/lib/CodeGen/MachineFunction.cpp
@@ -169,6 +169,7 @@ void MachineFunction::clear() {
InstructionRecycler.clear(Allocator);
OperandRecycler.clear(Allocator);
BasicBlockRecycler.clear(Allocator);
+ VariableDbgInfos.clear();
if (RegInfo) {
RegInfo->~MachineRegisterInfo();
Allocator.Deallocate(RegInfo);
@@ -859,7 +860,9 @@ BitVector MachineFrameInfo::getPristineRegs(const MachineFunction &MF) const {
if (!isCalleeSavedInfoValid())
return BV;
- for (const MCPhysReg *CSR = TRI->getCalleeSavedRegs(&MF); CSR && *CSR; ++CSR)
+ const MachineRegisterInfo &MRI = MF.getRegInfo();
+  for (const MCPhysReg *CSR = MRI.getCalleeSavedRegs(); CSR && *CSR; ++CSR)
BV.set(*CSR);
// Saved CSRs are not pristine.
@@ -956,7 +959,7 @@ void MachineFrameInfo::print(const MachineFunction &MF, raw_ostream &OS) const{
}
#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
-void MachineFrameInfo::dump(const MachineFunction &MF) const {
+LLVM_DUMP_METHOD void MachineFrameInfo::dump(const MachineFunction &MF) const {
print(MF, dbgs());
}
#endif
diff --git a/lib/CodeGen/MachineInstr.cpp b/lib/CodeGen/MachineInstr.cpp
index 2f2e3b3d8e9f..c0a8b95ed8a0 100644
--- a/lib/CodeGen/MachineInstr.cpp
+++ b/lib/CodeGen/MachineInstr.cpp
@@ -262,8 +262,21 @@ bool MachineOperand::isIdenticalTo(const MachineOperand &Other) const {
return getBlockAddress() == Other.getBlockAddress() &&
getOffset() == Other.getOffset();
case MachineOperand::MO_RegisterMask:
- case MachineOperand::MO_RegisterLiveOut:
- return getRegMask() == Other.getRegMask();
+ case MachineOperand::MO_RegisterLiveOut: {
+ // Shallow compare of the two RegMasks
+ const uint32_t *RegMask = getRegMask();
+ const uint32_t *OtherRegMask = Other.getRegMask();
+ if (RegMask == OtherRegMask)
+ return true;
+
+ // Calculate the size of the RegMask
+ const MachineFunction *MF = getParent()->getParent()->getParent();
+ const TargetRegisterInfo *TRI = MF->getSubtarget().getRegisterInfo();
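+    // A register mask stores one bit per physical register, packed into
+    // 32-bit words, hence the round-up division by 32.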
+ unsigned RegMaskSize = (TRI->getNumRegs() + 31) / 32;
+
+ // Deep compare of the two RegMasks
+ return std::equal(RegMask, RegMask + RegMaskSize, OtherRegMask);
+ }
case MachineOperand::MO_MCSymbol:
return getMCSymbol() == Other.getMCSymbol();
case MachineOperand::MO_CFIIndex:
@@ -274,6 +287,8 @@ bool MachineOperand::isIdenticalTo(const MachineOperand &Other) const {
return getIntrinsicID() == Other.getIntrinsicID();
case MachineOperand::MO_Predicate:
return getPredicate() == Other.getPredicate();
+ case MachineOperand::MO_Placeholder:
+ return true;
}
llvm_unreachable("Invalid machine operand type");
}
@@ -322,6 +337,8 @@ hash_code llvm::hash_value(const MachineOperand &MO) {
return hash_combine(MO.getType(), MO.getTargetFlags(), MO.getIntrinsicID());
case MachineOperand::MO_Predicate:
return hash_combine(MO.getType(), MO.getTargetFlags(), MO.getPredicate());
+ case MachineOperand::MO_Placeholder:
+ return hash_combine();
}
llvm_unreachable("Invalid machine operand type");
}
@@ -403,6 +420,11 @@ void MachineOperand::print(raw_ostream &OS, ModuleSlotTracker &MST,
bool Unused;
APF.convert(APFloat::IEEEsingle(), APFloat::rmNearestTiesToEven, &Unused);
OS << "half " << APF.convertToFloat();
+ } else if (getFPImm()->getType()->isFP128Ty()) {
+    APFloat APF = getFPImm()->getValueAPF();
+    SmallString<16> Str;
+    APF.toString(Str);
+ OS << "quad " << Str;
} else {
OS << getFPImm()->getValueAPF().convertToDouble();
}
@@ -491,7 +513,11 @@ void MachineOperand::print(raw_ostream &OS, ModuleSlotTracker &MST,
auto Pred = static_cast<CmpInst::Predicate>(getPredicate());
OS << '<' << (CmpInst::isIntPredicate(Pred) ? "intpred" : "floatpred")
<< CmpInst::getPredicateName(Pred) << '>';
+ break;
}
+ case MachineOperand::MO_Placeholder:
+ OS << "<placeholder>";
+ break;
}
if (unsigned TF = getTargetFlags())
OS << "[TF=" << TF << ']';
@@ -1571,6 +1597,65 @@ bool MachineInstr::isSafeToMove(AliasAnalysis *AA, bool &SawStore) const {
return true;
}
+bool MachineInstr::mayAlias(AliasAnalysis *AA, MachineInstr &Other,
+ bool UseTBAA) {
+ const MachineFunction *MF = getParent()->getParent();
+ const TargetInstrInfo *TII = MF->getSubtarget().getInstrInfo();
+
+ // If neither instruction stores to memory, they can't alias in any
+ // meaningful way, even if they read from the same address.
+ if (!mayStore() && !Other.mayStore())
+ return false;
+
+ // Let the target decide if memory accesses cannot possibly overlap.
+ if (TII->areMemAccessesTriviallyDisjoint(*this, Other, AA))
+ return false;
+
+ if (!AA)
+ return true;
+
+ // FIXME: Need to handle multiple memory operands to support all targets.
+ if (!hasOneMemOperand() || !Other.hasOneMemOperand())
+ return true;
+
+ MachineMemOperand *MMOa = *memoperands_begin();
+ MachineMemOperand *MMOb = *Other.memoperands_begin();
+
+ if (!MMOa->getValue() || !MMOb->getValue())
+ return true;
+
+ // The following interface to AA is fashioned after DAGCombiner::isAlias
+ // and operates with MachineMemOperand offset with some important
+ // assumptions:
+ // - LLVM fundamentally assumes flat address spaces.
+ // - MachineOperand offset can *only* result from legalization and
+ // cannot affect queries other than the trivial case of overlap
+ // checking.
+ // - These offsets never wrap and never step outside
+ // of allocated objects.
+ // - There should never be any negative offsets here.
+ //
+ // FIXME: Modify API to hide this math from "user"
+ // FIXME: Even before we go to AA we can reason locally about some
+ // memory objects. It can save compile time, and possibly catch some
+ // corner cases not currently covered.
+
+  assert((MMOa->getOffset() >= 0) && "Negative MachineMemOperand offset");
+  assert((MMOb->getOffset() >= 0) && "Negative MachineMemOperand offset");
+
+ int64_t MinOffset = std::min(MMOa->getOffset(), MMOb->getOffset());
+ int64_t Overlapa = MMOa->getSize() + MMOa->getOffset() - MinOffset;
+ int64_t Overlapb = MMOb->getSize() + MMOb->getOffset() - MinOffset;
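+  // For example (illustrative values only): an 8-byte access at offset 16 and
+  // a 4-byte access at offset 0 give MinOffset = 0, Overlapa = 24, and
+  // Overlapb = 4, so each alias query below spans the full extent of its
+  // access measured from the common minimum offset.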
+
+ AliasResult AAResult =
+ AA->alias(MemoryLocation(MMOa->getValue(), Overlapa,
+ UseTBAA ? MMOa->getAAInfo() : AAMDNodes()),
+ MemoryLocation(MMOb->getValue(), Overlapb,
+ UseTBAA ? MMOb->getAAInfo() : AAMDNodes()));
+
+ return (AAResult != NoAlias);
+}
+
/// hasOrderedMemoryRef - Return true if this instruction may have an ordered
/// or volatile memory reference, or if the information describing the memory
/// reference is not available. Return false if it is known to have no ordered
@@ -1692,14 +1777,14 @@ void MachineInstr::copyImplicitOps(MachineFunction &MF,
}
}
-LLVM_DUMP_METHOD void MachineInstr::dump(const TargetInstrInfo *TII) const {
#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
+LLVM_DUMP_METHOD void MachineInstr::dump() const {
dbgs() << " ";
- print(dbgs(), false /* SkipOpers */, TII);
-#endif
+ print(dbgs());
}
+#endif
-void MachineInstr::print(raw_ostream &OS, bool SkipOpers,
+void MachineInstr::print(raw_ostream &OS, bool SkipOpers, bool SkipDebugLoc,
const TargetInstrInfo *TII) const {
const Module *M = nullptr;
if (const MachineBasicBlock *MBB = getParent())
@@ -1707,11 +1792,12 @@ void MachineInstr::print(raw_ostream &OS, bool SkipOpers,
M = MF->getFunction()->getParent();
ModuleSlotTracker MST(M);
- print(OS, MST, SkipOpers, TII);
+ print(OS, MST, SkipOpers, SkipDebugLoc, TII);
}
void MachineInstr::print(raw_ostream &OS, ModuleSlotTracker &MST,
- bool SkipOpers, const TargetInstrInfo *TII) const {
+ bool SkipOpers, bool SkipDebugLoc,
+ const TargetInstrInfo *TII) const {
// We can be a bit tidier if we know the MachineFunction.
const MachineFunction *MF = nullptr;
const TargetRegisterInfo *TRI = nullptr;
@@ -1987,6 +2073,8 @@ void MachineInstr::print(raw_ostream &OS, ModuleSlotTracker &MST,
}
if (isIndirectDebugValue())
OS << " indirect";
+ } else if (SkipDebugLoc) {
+ return;
} else if (debugLoc && MF) {
if (!HaveSemi)
OS << ";";
diff --git a/lib/CodeGen/MachineLoopInfo.cpp b/lib/CodeGen/MachineLoopInfo.cpp
index fdeaf7b71161..a9aa1d954e70 100644
--- a/lib/CodeGen/MachineLoopInfo.cpp
+++ b/lib/CodeGen/MachineLoopInfo.cpp
@@ -87,6 +87,22 @@ MachineBasicBlock *MachineLoop::findLoopControlBlock() {
return nullptr;
}
+DebugLoc MachineLoop::getStartLoc() const {
+ // Try the pre-header first.
+ if (MachineBasicBlock *PHeadMBB = getLoopPreheader())
+ if (const BasicBlock *PHeadBB = PHeadMBB->getBasicBlock())
+ if (DebugLoc DL = PHeadBB->getTerminator()->getDebugLoc())
+ return DL;
+
+  // If there is no pre-header, or its terminator carries no debug location,
+  // try the header.
+ if (MachineBasicBlock *HeadMBB = getHeader())
+ if (const BasicBlock *HeadBB = HeadMBB->getBasicBlock())
+ return HeadBB->getTerminator()->getDebugLoc();
+
+ return DebugLoc();
+}
+
MachineBasicBlock *
MachineLoopInfo::findLoopPreheader(MachineLoop *L,
bool SpeculativePreheader) const {
diff --git a/lib/CodeGen/MachineModuleInfo.cpp b/lib/CodeGen/MachineModuleInfo.cpp
index 6618857477ed..2f0f4297ef5c 100644
--- a/lib/CodeGen/MachineModuleInfo.cpp
+++ b/lib/CodeGen/MachineModuleInfo.cpp
@@ -306,6 +306,10 @@ public:
MMI.deleteMachineFunctionFor(F);
return true;
}
+
+ StringRef getPassName() const override {
+ return "Free MachineFunction";
+ }
};
char FreeMachineFunction::ID;
} // end anonymous namespace
diff --git a/lib/CodeGen/MachineModuleInfoImpls.cpp b/lib/CodeGen/MachineModuleInfoImpls.cpp
index 22d519e5d88f..4c81fd91cb82 100644
--- a/lib/CodeGen/MachineModuleInfoImpls.cpp
+++ b/lib/CodeGen/MachineModuleInfoImpls.cpp
@@ -23,6 +23,7 @@ using namespace llvm;
// Out of line virtual method.
void MachineModuleInfoMachO::anchor() {}
void MachineModuleInfoELF::anchor() {}
+void MachineModuleInfoWasm::anchor() {}
static int SortSymbolPair(const void *LHS, const void *RHS) {
typedef std::pair<MCSymbol*, MachineModuleInfoImpl::StubValueTy> PairTy;
diff --git a/lib/CodeGen/MachineOptimizationRemarkEmitter.cpp b/lib/CodeGen/MachineOptimizationRemarkEmitter.cpp
new file mode 100644
index 000000000000..6b6b5f2814a9
--- /dev/null
+++ b/lib/CodeGen/MachineOptimizationRemarkEmitter.cpp
@@ -0,0 +1,100 @@
+///===- MachineOptimizationRemarkEmitter.cpp - Opt Diagnostic -*- C++ -*---===//
+///
+/// The LLVM Compiler Infrastructure
+///
+/// This file is distributed under the University of Illinois Open Source
+/// License. See LICENSE.TXT for details.
+///
+///===---------------------------------------------------------------------===//
+/// \file
+/// Optimization diagnostic interfaces for machine passes. It's packaged as an
+/// analysis pass so that by using this service passes become dependent on MBFI
+/// as well. MBFI is used to compute the "hotness" of the diagnostic message.
+///
+///===---------------------------------------------------------------------===//
+
+#include "llvm/CodeGen/MachineOptimizationRemarkEmitter.h"
+#include "llvm/CodeGen/LazyMachineBlockFrequencyInfo.h"
+#include "llvm/CodeGen/MachineInstr.h"
+#include "llvm/IR/DebugInfo.h"
+#include "llvm/IR/DiagnosticInfo.h"
+#include "llvm/IR/LLVMContext.h"
+
+using namespace llvm;
+
+DiagnosticInfoMIROptimization::MachineArgument::MachineArgument(
+ StringRef MKey, const MachineInstr &MI)
+ : Argument() {
+ Key = MKey;
+
+ raw_string_ostream OS(Val);
+ MI.print(OS, /*SkipOpers=*/false, /*SkipDebugLoc=*/true);
+}
+
+Optional<uint64_t>
+MachineOptimizationRemarkEmitter::computeHotness(const MachineBasicBlock &MBB) {
+ if (!MBFI)
+ return None;
+
+ return MBFI->getBlockProfileCount(&MBB);
+}
+
+void MachineOptimizationRemarkEmitter::computeHotness(
+ DiagnosticInfoMIROptimization &Remark) {
+ const MachineBasicBlock *MBB = Remark.getBlock();
+ if (MBB)
+ Remark.setHotness(computeHotness(*MBB));
+}
+
+void MachineOptimizationRemarkEmitter::emit(
+ DiagnosticInfoOptimizationBase &OptDiagCommon) {
+ auto &OptDiag = cast<DiagnosticInfoMIROptimization>(OptDiagCommon);
+ computeHotness(OptDiag);
+
+ LLVMContext &Ctx = MF.getFunction()->getContext();
+ yaml::Output *Out = Ctx.getDiagnosticsOutputFile();
+ if (Out) {
+ auto *P = &const_cast<DiagnosticInfoOptimizationBase &>(OptDiagCommon);
+ *Out << P;
+ }
+ // FIXME: now that IsVerbose is part of DI, filtering for this will be moved
+ // from here to clang.
+ if (!OptDiag.isVerbose() || shouldEmitVerbose())
+ Ctx.diagnose(OptDiag);
+}
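+
+// Typical use from a machine pass (a sketch; the pass name, remark name, and
+// variables are illustrative, not part of this file):
+//   MachineOptimizationRemarkEmitter &ORE =
+//       getAnalysis<MachineOptimizationRemarkEmitterPass>().getORE();
+//   MachineOptimizationRemark R("my-pass", "Example", MI.getDebugLoc(),
+//                               MI.getParent());
+//   R << "example remark text";
+//   ORE.emit(R);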
+
+MachineOptimizationRemarkEmitterPass::MachineOptimizationRemarkEmitterPass()
+ : MachineFunctionPass(ID) {
+ initializeMachineOptimizationRemarkEmitterPassPass(
+ *PassRegistry::getPassRegistry());
+}
+
+bool MachineOptimizationRemarkEmitterPass::runOnMachineFunction(
+ MachineFunction &MF) {
+ MachineBlockFrequencyInfo *MBFI;
+
+ if (MF.getFunction()->getContext().getDiagnosticHotnessRequested())
+ MBFI = &getAnalysis<LazyMachineBlockFrequencyInfoPass>().getBFI();
+ else
+ MBFI = nullptr;
+
+ ORE = llvm::make_unique<MachineOptimizationRemarkEmitter>(MF, MBFI);
+ return false;
+}
+
+void MachineOptimizationRemarkEmitterPass::getAnalysisUsage(
+ AnalysisUsage &AU) const {
+ AU.addRequired<LazyMachineBlockFrequencyInfoPass>();
+ AU.setPreservesAll();
+ MachineFunctionPass::getAnalysisUsage(AU);
+}
+
+char MachineOptimizationRemarkEmitterPass::ID = 0;
+static const char ore_name[] = "Machine Optimization Remark Emitter";
+#define ORE_NAME "machine-opt-remark-emitter"
+
+INITIALIZE_PASS_BEGIN(MachineOptimizationRemarkEmitterPass, ORE_NAME, ore_name,
+ false, true)
+INITIALIZE_PASS_DEPENDENCY(LazyMachineBlockFrequencyInfoPass)
+INITIALIZE_PASS_END(MachineOptimizationRemarkEmitterPass, ORE_NAME, ore_name,
+ false, true)
diff --git a/lib/CodeGen/MachineOutliner.cpp b/lib/CodeGen/MachineOutliner.cpp
new file mode 100644
index 000000000000..581a8ad81149
--- /dev/null
+++ b/lib/CodeGen/MachineOutliner.cpp
@@ -0,0 +1,1251 @@
+//===---- MachineOutliner.cpp - Outline instructions -----------*- C++ -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+///
+/// \file
+/// Replaces repeated sequences of instructions with function calls.
+///
+/// This works by placing every instruction from every basic block in a
+/// suffix tree, and repeatedly querying that tree for repeated sequences of
+/// instructions. If a sequence of instructions appears often, then it ought
/// to be beneficial to pull it out into a function.
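+///
+/// For instance (an illustrative mapping, not from this file): if
+/// instructions are hashed to integers and two functions produce the
+/// sequences [1,2,3,7] and [9,1,2,3], the repeated subsequence [1,2,3] is a
+/// candidate for outlining.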
+///
+/// This was originally presented at the 2016 LLVM Developers' Meeting in the
+/// talk "Reducing Code Size Using Outlining". For a high-level overview of
+/// how this pass works, the talk is available on YouTube at
+///
+/// https://www.youtube.com/watch?v=yorld-WSOeU
+///
+/// The slides for the talk are available at
+///
+/// http://www.llvm.org/devmtg/2016-11/Slides/Paquette-Outliner.pdf
+///
+/// The talk provides an overview of how the outliner finds candidates and
+/// ultimately outlines them. It describes how the main data structure for this
+/// pass, the suffix tree, is queried and purged for candidates. It also gives
+/// a simplified suffix tree construction algorithm for suffix trees based off
+/// of the algorithm actually used here, Ukkonen's algorithm.
+///
+/// For the original RFC for this pass, please see
+///
+/// http://lists.llvm.org/pipermail/llvm-dev/2016-August/104170.html
+///
+/// For more information on the suffix tree data structure, please see
+/// https://www.cs.helsinki.fi/u/ukkonen/SuffixT1withFigs.pdf
+///
+//===----------------------------------------------------------------------===//
+#include "llvm/ADT/DenseMap.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/ADT/Twine.h"
+#include "llvm/CodeGen/MachineFrameInfo.h"
+#include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/MachineModuleInfo.h"
+#include "llvm/CodeGen/Passes.h"
+#include "llvm/IR/IRBuilder.h"
+#include "llvm/Support/Allocator.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/Target/TargetInstrInfo.h"
+#include "llvm/Target/TargetMachine.h"
+#include "llvm/Target/TargetRegisterInfo.h"
+#include "llvm/Target/TargetSubtargetInfo.h"
+#include <functional>
+#include <map>
+#include <sstream>
+#include <tuple>
+#include <vector>
+
+#define DEBUG_TYPE "machine-outliner"
+
+using namespace llvm;
+
+STATISTIC(NumOutlined, "Number of candidates outlined");
+STATISTIC(FunctionsCreated, "Number of functions created");
+
+namespace {
+
+/// \brief An individual sequence of instructions to be replaced with a call to
+/// an outlined function.
+struct Candidate {
+
+ /// Set to false if the candidate overlapped with another candidate.
+ bool InCandidateList = true;
+
+ /// The start index of this \p Candidate.
+ size_t StartIdx;
+
+ /// The number of instructions in this \p Candidate.
+ size_t Len;
+
+ /// The index of this \p Candidate's \p OutlinedFunction in the list of
+ /// \p OutlinedFunctions.
+ size_t FunctionIdx;
+
+ /// \brief The number of instructions that would be saved by outlining every
+ /// candidate of this type.
+ ///
+ /// This is a fixed value which is not updated during the candidate pruning
+ /// process. It is only used for deciding which candidate to keep if two
+ /// candidates overlap. The true benefit is stored in the OutlinedFunction
+ /// for a given candidate.
+ unsigned Benefit = 0;
+
+ Candidate(size_t StartIdx, size_t Len, size_t FunctionIdx)
+ : StartIdx(StartIdx), Len(Len), FunctionIdx(FunctionIdx) {}
+
+ Candidate() {}
+
+ /// \brief Used to sort \p Candidates by descending start index, so that
+ /// outlining one candidate never shifts the start and end indices of the
+ /// \p Candidates still to be outlined.
+ bool operator<(const Candidate &RHS) const { return StartIdx > RHS.StartIdx; }
+};
+
+/// \brief The information necessary to create an outlined function for some
+/// class of candidate.
+struct OutlinedFunction {
+
+ /// The actual outlined function created.
+ /// This is initialized after we go through and create the actual function.
+ MachineFunction *MF = nullptr;
+
+ /// A number assigned to this function which appears at the end of its name.
+ size_t Name;
+
+ /// The number of candidates for this OutlinedFunction.
+ size_t OccurrenceCount = 0;
+
+ /// \brief The sequence of integers corresponding to the instructions in this
+ /// function.
+ std::vector<unsigned> Sequence;
+
+ /// The number of instructions this function would save.
+ unsigned Benefit = 0;
+
+ /// \brief Set to true if candidates for this outlined function should be
+ /// replaced with tail calls to this OutlinedFunction.
+ bool IsTailCall = false;
+
+ OutlinedFunction(size_t Name, size_t OccurrenceCount,
+ const std::vector<unsigned> &Sequence,
+ unsigned Benefit, bool IsTailCall)
+ : Name(Name), OccurrenceCount(OccurrenceCount), Sequence(Sequence),
+ Benefit(Benefit), IsTailCall(IsTailCall)
+ {}
+};
+
+/// Represents an undefined index in the suffix tree.
+const size_t EmptyIdx = -1;
+
+/// A node in a suffix tree which represents a substring or suffix.
+///
+/// Each node has either no children or at least two children, with the root
+/// being an exception in the empty tree.
+///
+/// Children are represented as a map between unsigned integers and nodes. If
+/// a node N has a child M on unsigned integer k, then the mapping represented
+/// by N is a proper prefix of the mapping represented by M. Note that this,
+/// although similar to a trie is somewhat different: each node stores a full
+/// substring of the full mapping rather than a single character state.
+///
+/// Each internal node contains a pointer to the internal node representing
+/// the same string, but with the first character chopped off. This is stored
+/// in \p Link. Each leaf node stores the start index of its respective
+/// suffix in \p SuffixIdx.
+struct SuffixTreeNode {
+
+ /// The children of this node.
+ ///
+ /// A child existing on an unsigned integer implies that from the mapping
+ /// represented by the current node, there is a way to reach another
+ /// mapping by tacking that character on the end of the current string.
+ DenseMap<unsigned, SuffixTreeNode *> Children;
+
+ /// A flag set to false if the node has been pruned from the tree.
+ bool IsInTree = true;
+
+ /// The start index of this node's substring in the main string.
+ size_t StartIdx = EmptyIdx;
+
+ /// The end index of this node's substring in the main string.
+ ///
+ /// Every leaf node must have its \p EndIdx incremented at the end of every
+ /// step in the construction algorithm. To avoid having to update O(N)
+ /// nodes individually at the end of every step, the end index is stored
+ /// as a pointer.
+ size_t *EndIdx = nullptr;
+
+ /// For leaves, the start index of the suffix represented by this node.
+ ///
+ /// For all other nodes, this is ignored.
+ size_t SuffixIdx = EmptyIdx;
+
+ /// \brief For internal nodes, a pointer to the internal node representing
+ /// the same sequence with the first character chopped off.
+ ///
+ /// This has two major purposes in the suffix tree. The first is as a
+ /// shortcut in Ukkonen's construction algorithm. One of the things that
+ /// Ukkonen's algorithm does to achieve linear-time construction is
+ /// keep track of which node the next insert should be at. This makes each
+ /// insert O(1), and there are a total of O(N) inserts. The suffix link
+ /// helps with inserting children of internal nodes.
+ ///
+/// Say we add a child to an internal node with associated mapping S. The
+/// next insertion must be at the node representing S minus its first
+/// character.
+ /// This is given by the way that we iteratively build the tree in Ukkonen's
+ /// algorithm. The main idea is to look at the suffixes of each prefix in the
+ /// string, starting with the longest suffix of the prefix, and ending with
+ /// the shortest. Therefore, if we keep pointers between such nodes, we can
+ /// move to the next insertion point in O(1) time. If we don't, then we'd
+ /// have to query from the root, which takes O(N) time. This would make the
+ /// construction algorithm O(N^2) rather than O(N).
+ ///
+ /// The suffix link is also used during the tree pruning process to let us
+ /// quickly throw out a bunch of potential overlaps. Say we have a sequence
+ /// S we want to outline. Then each of its suffixes contribute to at least
+ /// one overlapping case. Therefore, we can follow the suffix links
+ /// starting at the node associated with S to the root and "delete" those
+ /// nodes, save for the root. For each candidate, this removes
+ /// O(|candidate|) overlaps from the search space. We don't actually
+ /// completely invalidate these nodes though; doing that is far too
+ /// aggressive. Consider the following pathological string:
+ ///
+ /// 1 2 3 1 2 3 2 3 2 3 2 3 2 3 2 3 2 3
+ ///
+ /// If we, for the sake of example, outlined 1 2 3, then we would throw
+ /// out all instances of 2 3. This isn't desirable. To get around this,
+ /// when we visit a link node, we decrement its occurrence count by the
+ /// number of sequences we outlined in the current step. In the pathological
+ /// example, the 2 3 node would have an occurrence count of 8, while the
+ /// 1 2 3 node would have an occurrence count of 2. Thus, the 2 3 node
+ /// would survive to the next round allowing us to outline the extra
+ /// instances of 2 3.
+ SuffixTreeNode *Link = nullptr;
+
+ /// The parent of this node. Every node except for the root has a parent.
+ SuffixTreeNode *Parent = nullptr;
+
+ /// The number of times this node's string appears in the tree.
+ ///
+ /// This is equal to the number of leaf children of the string. It represents
+ /// the number of suffixes that the node's string is a prefix of.
+ size_t OccurrenceCount = 0;
+
+ /// The length of the string formed by concatenating the edge labels from the
+ /// root to this node.
+ size_t ConcatLen = 0;
+
+ /// Returns true if this node is a leaf.
+ bool isLeaf() const { return SuffixIdx != EmptyIdx; }
+
+ /// Returns true if this node is the root of its owning \p SuffixTree.
+ bool isRoot() const { return StartIdx == EmptyIdx; }
+
+ /// Return the number of elements in the substring associated with this node.
+ size_t size() const {
+
+ // Is it the root? If so, it's the empty string so return 0.
+ if (isRoot())
+ return 0;
+
+ assert(*EndIdx != EmptyIdx && "EndIdx is undefined!");
+
+ // Size = the number of elements in the string.
+ // For example, [0 1 2 3] has length 4, not 3. 3-0 = 3, so we have 3-0+1.
+ return *EndIdx - StartIdx + 1;
+ }
+
+ SuffixTreeNode(size_t StartIdx, size_t *EndIdx, SuffixTreeNode *Link,
+ SuffixTreeNode *Parent)
+ : StartIdx(StartIdx), EndIdx(EndIdx), Link(Link), Parent(Parent) {}
+
+ SuffixTreeNode() {}
+};
+
+/// A data structure for fast substring queries.
+///
+/// Suffix trees represent the suffixes of their input strings in their leaves.
+/// A suffix tree is a type of compressed trie structure where each node
+/// represents an entire substring rather than a single character. Each leaf
+/// of the tree is a suffix.
+///
+/// A suffix tree can be seen as a type of state machine where each state is a
+/// substring of the full string. The tree is structured so that, for a string
+/// of length N, there are exactly N leaves in the tree. This structure allows
+/// us to quickly find repeated substrings of the input string.
+///
+/// In this implementation, a "string" is a vector of unsigned integers.
+/// These integers may result from hashing some data type. A suffix tree can
+/// contain one or many strings, which can then be queried as one large
+/// string.
+///
+/// The suffix tree is implemented using Ukkonen's algorithm for linear-time
+/// suffix tree construction. Ukkonen's algorithm is explained in more detail
+/// in the paper by Esko Ukkonen, "On-line construction of suffix trees". The
+/// paper is available at
+///
+/// https://www.cs.helsinki.fi/u/ukkonen/SuffixT1withFigs.pdf
+class SuffixTree {
+private:
+ /// Each element is an integer representing an instruction in the module.
+ ArrayRef<unsigned> Str;
+
+ /// Maintains each node in the tree.
+ SpecificBumpPtrAllocator<SuffixTreeNode> NodeAllocator;
+
+ /// The root of the suffix tree.
+ ///
+ /// The root represents the empty string. It is maintained by the
+ /// \p NodeAllocator like every other node in the tree.
+ SuffixTreeNode *Root = nullptr;
+
+ /// Stores each leaf node in the tree.
+ ///
+ /// This is used for finding outlining candidates.
+ std::vector<SuffixTreeNode *> LeafVector;
+
+ /// Maintains the end indices of the internal nodes in the tree.
+ ///
+ /// Each internal node is guaranteed to never have its end index change
+ /// during the construction algorithm; however, leaves must be updated at
+ /// every step. Therefore, we need to store leaf end indices by reference
+ /// to avoid updating O(N) leaves at every step of construction. Thus,
+ /// every internal node must be allocated its own end index.
+ BumpPtrAllocator InternalEndIdxAllocator;
+
+ /// The end index of each leaf in the tree.
+ size_t LeafEndIdx = -1;
+
+ /// \brief Helper struct which keeps track of the next insertion point in
+ /// Ukkonen's algorithm.
+ struct ActiveState {
+ /// The next node to insert at.
+ SuffixTreeNode *Node;
+
+ /// The index of the first character in the substring currently being added.
+ size_t Idx = EmptyIdx;
+
+ /// The length of the substring we have to add at the current step.
+ size_t Len = 0;
+ };
+
+ /// \brief The point the next insertion will take place at in the
+ /// construction algorithm.
+ ActiveState Active;
+
+ /// Allocate a leaf node and add it to the tree.
+ ///
+ /// \param Parent The parent of this node.
+ /// \param StartIdx The start index of this node's associated string.
+ /// \param Edge The label on the edge leaving \p Parent to this node.
+ ///
+ /// \returns A pointer to the allocated leaf node.
+ SuffixTreeNode *insertLeaf(SuffixTreeNode &Parent, size_t StartIdx,
+ unsigned Edge) {
+
+ assert(StartIdx <= LeafEndIdx && "String can't start after it ends!");
+
+ SuffixTreeNode *N = new (NodeAllocator.Allocate()) SuffixTreeNode(StartIdx,
+ &LeafEndIdx,
+ nullptr,
+ &Parent);
+ Parent.Children[Edge] = N;
+
+ return N;
+ }
+
+ /// Allocate an internal node and add it to the tree.
+ ///
+ /// \param Parent The parent of this node. Only null when allocating the root.
+ /// \param StartIdx The start index of this node's associated string.
+ /// \param EndIdx The end index of this node's associated string.
+ /// \param Edge The label on the edge leaving \p Parent to this node.
+ ///
+ /// \returns A pointer to the allocated internal node.
+ SuffixTreeNode *insertInternalNode(SuffixTreeNode *Parent, size_t StartIdx,
+ size_t EndIdx, unsigned Edge) {
+
+ assert(StartIdx <= EndIdx && "String can't start after it ends!");
+ assert(!(!Parent && StartIdx != EmptyIdx) &&
+ "Non-root internal nodes must have parents!");
+
+ size_t *E = new (InternalEndIdxAllocator) size_t(EndIdx);
+ SuffixTreeNode *N = new (NodeAllocator.Allocate()) SuffixTreeNode(StartIdx,
+ E,
+ Root,
+ Parent);
+ if (Parent)
+ Parent->Children[Edge] = N;
+
+ return N;
+ }
+
+ /// \brief Set the suffix indices of the leaves to the start indices of their
+ /// respective suffixes. Also stores each leaf in \p LeafVector at its
+ /// respective suffix index.
+ ///
+ /// \param[in] CurrNode The node currently being visited.
+ /// \param CurrIdx The current index of the string being visited.
+ void setSuffixIndices(SuffixTreeNode &CurrNode, size_t CurrIdx) {
+
+ bool IsLeaf = CurrNode.Children.size() == 0 && !CurrNode.isRoot();
+
+ // Store the length of the concatenation of all strings from the root to
+ // this node.
+ if (!CurrNode.isRoot()) {
+ if (CurrNode.ConcatLen == 0)
+ CurrNode.ConcatLen = CurrNode.size();
+
+ if (CurrNode.Parent)
+ CurrNode.ConcatLen += CurrNode.Parent->ConcatLen;
+ }
+
+ // Traverse the tree depth-first.
+ for (auto &ChildPair : CurrNode.Children) {
+ assert(ChildPair.second && "Node had a null child!");
+ setSuffixIndices(*ChildPair.second,
+ CurrIdx + ChildPair.second->size());
+ }
+
+ // Is this node a leaf?
+ if (IsLeaf) {
+ // If yes, give it a suffix index and bump its parent's occurrence count.
+ CurrNode.SuffixIdx = Str.size() - CurrIdx;
+ assert(CurrNode.Parent && "CurrNode had no parent!");
+ CurrNode.Parent->OccurrenceCount++;
+
+ // Store the leaf in the leaf vector for pruning later.
+ LeafVector[CurrNode.SuffixIdx] = &CurrNode;
+ }
+ }
+
+ /// \brief Construct the suffix tree for the prefix of the input ending at
+ /// \p EndIdx.
+ ///
+ /// Used to construct the full suffix tree iteratively. At the end of each
+ /// step, the constructed suffix tree is either a valid suffix tree, or a
+ /// suffix tree with implicit suffixes. At the end of the final step, the
+ /// suffix tree is a valid tree.
+ ///
+ /// \param EndIdx The end index of the current prefix in the main string.
+ /// \param SuffixesToAdd The number of suffixes that must be added
+ /// to complete the suffix tree at the current phase.
+ ///
+ /// \returns The number of suffixes that have not been added at the end of
+ /// this step.
+ unsigned extend(size_t EndIdx, size_t SuffixesToAdd) {
+ SuffixTreeNode *NeedsLink = nullptr;
+
+ while (SuffixesToAdd > 0) {
+
+ // Are we waiting to add anything other than just the last character?
+ if (Active.Len == 0) {
+ // If not, then say the active index is the end index.
+ Active.Idx = EndIdx;
+ }
+
+ assert(Active.Idx <= EndIdx && "Start index can't be after end index!");
+
+ // The first character in the current substring we're looking at.
+ unsigned FirstChar = Str[Active.Idx];
+
+ // Have we inserted anything starting with FirstChar at the current node?
+ if (Active.Node->Children.count(FirstChar) == 0) {
+ // If not, then we can just insert a leaf and move to the next step.
+ insertLeaf(*Active.Node, EndIdx, FirstChar);
+
+ // The active node is an internal node, and we visited it, so it must
+ // need a link if it doesn't have one.
+ if (NeedsLink) {
+ NeedsLink->Link = Active.Node;
+ NeedsLink = nullptr;
+ }
+ } else {
+ // There's a match with FirstChar, so look for the point in the tree to
+ // insert a new node.
+ SuffixTreeNode *NextNode = Active.Node->Children[FirstChar];
+
+ size_t SubstringLen = NextNode->size();
+
+ // Is the current suffix we're trying to insert longer than the size of
+ // the child we want to move to?
+ if (Active.Len >= SubstringLen) {
+ // If yes, then consume the characters we've seen and move to the next
+ // node.
+ Active.Idx += SubstringLen;
+ Active.Len -= SubstringLen;
+ Active.Node = NextNode;
+ continue;
+ }
+
+ // Otherwise, the suffix we're trying to insert must end partway along
+ // the edge to the next node.
+ unsigned LastChar = Str[EndIdx];
+
+ // Is the string we're trying to insert a substring of the next node?
+ if (Str[NextNode->StartIdx + Active.Len] == LastChar) {
+ // If yes, then we're done for this step. Remember our insertion point
+ // and move to the next end index. At this point, we have an implicit
+ // suffix tree.
+ if (NeedsLink && !Active.Node->isRoot()) {
+ NeedsLink->Link = Active.Node;
+ NeedsLink = nullptr;
+ }
+
+ Active.Len++;
+ break;
+ }
+
+ // The string we're trying to insert isn't a substring of the next node,
+ // but matches up to a point. Split the node.
+ //
+ // For example, say we ended our search at a node n and we're trying to
+ // insert ABD. Then we'll create a new node s for AB, reduce n to just
+ // representing C, and insert a new leaf node l to represent D. This
+ // allows us to ensure that if n was a leaf, it remains a leaf.
+ //
+ //      | ABC  ---split--->  | AB
+ //      n                    s
+ //                         C / \ D
+ //                         n    l
+
+ // The node s from the diagram
+ SuffixTreeNode *SplitNode =
+ insertInternalNode(Active.Node,
+ NextNode->StartIdx,
+ NextNode->StartIdx + Active.Len - 1,
+ FirstChar);
+
+ // Insert the new node representing the new substring into the tree as
+ // a child of the split node. This is the node l from the diagram.
+ insertLeaf(*SplitNode, EndIdx, LastChar);
+
+ // Make the old node a child of the split node and update its start
+ // index. This is the node n from the diagram.
+ NextNode->StartIdx += Active.Len;
+ NextNode->Parent = SplitNode;
+ SplitNode->Children[Str[NextNode->StartIdx]] = NextNode;
+
+ // SplitNode is an internal node, update the suffix link.
+ if (NeedsLink)
+ NeedsLink->Link = SplitNode;
+
+ NeedsLink = SplitNode;
+ }
+
+ // We've added something new to the tree, so there's one less suffix to
+ // add.
+ SuffixesToAdd--;
+
+ if (Active.Node->isRoot()) {
+ if (Active.Len > 0) {
+ Active.Len--;
+ Active.Idx = EndIdx - SuffixesToAdd + 1;
+ }
+ } else {
+ // Start the next phase at the next smallest suffix.
+ Active.Node = Active.Node->Link;
+ }
+ }
+
+ return SuffixesToAdd;
+ }
+
+public:
+
+ /// Find all repeated substrings that satisfy \p BenefitFn.
+ ///
+ /// If a substring appears at least twice, then it must be represented by
+ /// an internal node which appears in at least two suffixes. Each suffix is
+ /// represented by a leaf node. To do this, we visit each internal node in
+ /// the tree, using the leaf children of each internal node. If an internal
+ /// node represents a beneficial substring, then we use each of its leaf
+ /// children to find the locations of its substring.
+ ///
+ /// \param[out] CandidateList Filled with candidates representing each
+ /// beneficial substring.
+ /// \param[out] FunctionList Filled with a list of \p OutlinedFunctions,
+ /// one for each type of candidate.
+ /// \param BenefitFn Returns the benefit of outlining a candidate; a result
+ /// of zero rejects the candidate.
+ ///
+ /// \returns The length of the longest candidate found.
+ size_t findCandidates(std::vector<Candidate> &CandidateList,
+ std::vector<OutlinedFunction> &FunctionList,
+ const std::function<unsigned(SuffixTreeNode &, size_t, unsigned)>
+ &BenefitFn) {
+
+ CandidateList.clear();
+ FunctionList.clear();
+ size_t FnIdx = 0;
+ size_t MaxLen = 0;
+
+ for (SuffixTreeNode* Leaf : LeafVector) {
+ assert(Leaf && "Leaves in LeafVector cannot be null!");
+ if (!Leaf->IsInTree)
+ continue;
+
+ assert(Leaf->Parent && "All leaves must have parents!");
+ SuffixTreeNode &Parent = *(Leaf->Parent);
+
+ // If it doesn't appear enough, or we already outlined from it, skip it.
+ if (Parent.OccurrenceCount < 2 || Parent.isRoot() || !Parent.IsInTree)
+ continue;
+
+ size_t StringLen = Leaf->ConcatLen - Leaf->size();
+
+ // How many instructions would outlining this string save?
+ unsigned Benefit = BenefitFn(Parent,
+ StringLen, Str[Leaf->SuffixIdx + StringLen - 1]);
+
+ // If it's not beneficial, skip it.
+ if (Benefit < 1)
+ continue;
+
+ if (StringLen > MaxLen)
+ MaxLen = StringLen;
+
+ unsigned OccurrenceCount = 0;
+ for (auto &ChildPair : Parent.Children) {
+ SuffixTreeNode *M = ChildPair.second;
+
+ // Is it a leaf? If so, we have an occurrence of this candidate.
+ if (M && M->IsInTree && M->isLeaf()) {
+ OccurrenceCount++;
+ CandidateList.emplace_back(M->SuffixIdx, StringLen, FnIdx);
+ CandidateList.back().Benefit = Benefit;
+ M->IsInTree = false;
+ }
+ }
+
+ // Save the function for the new candidate sequence.
+ std::vector<unsigned> CandidateSequence;
+ for (unsigned i = Leaf->SuffixIdx; i < Leaf->SuffixIdx + StringLen; i++)
+ CandidateSequence.push_back(Str[i]);
+
+ FunctionList.emplace_back(FnIdx, OccurrenceCount, CandidateSequence,
+ Benefit, false);
+
+ // Move to the next function.
+ FnIdx++;
+ Parent.IsInTree = false;
+ }
+
+ return MaxLen;
+ }
+
+ /// Construct a suffix tree from a sequence of unsigned integers.
+ ///
+ /// \param Str The string to construct the suffix tree for.
+ SuffixTree(const std::vector<unsigned> &Str) : Str(Str) {
+ Root = insertInternalNode(nullptr, EmptyIdx, EmptyIdx, 0);
+ Root->IsInTree = true;
+ Active.Node = Root;
+ LeafVector = std::vector<SuffixTreeNode*>(Str.size());
+
+ // Keep track of the number of suffixes we have to add of the current
+ // prefix.
+ size_t SuffixesToAdd = 0;
+ Active.Node = Root;
+
+ // Construct the suffix tree iteratively on each prefix of the string.
+ // PfxEndIdx is the end index of the current prefix.
+ // End is one past the last element in the string.
+ for (size_t PfxEndIdx = 0, End = Str.size(); PfxEndIdx < End; PfxEndIdx++) {
+ SuffixesToAdd++;
+ LeafEndIdx = PfxEndIdx; // Extend each of the leaves.
+ SuffixesToAdd = extend(PfxEndIdx, SuffixesToAdd);
+ }
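+
+ // Construction sketch (conceptual): for Str = [1, 2, 1, 3], the first
+ // two phases insert leaves for "1" and "2" from the root. In the third
+ // phase the suffix "1" already exists implicitly along the edge labeled
+ // "1 2 ...", so that phase ends early with Active.Len = 1 and one
+ // suffix left over. The final phase splits that edge after "1", hangs a
+ // new leaf for the suffix "1 3" off the split node, and adds the leaf
+ // "3" from the root, leaving one leaf per suffix. The unique sentinel
+ // appended to each block (see InstructionMapper below) guarantees that
+ // no suffix stays implicit once construction finishes.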
+
+ // Set the suffix indices of each leaf.
+ assert(Root && "Root node can't be nullptr!");
+ setSuffixIndices(*Root, 0);
+ }
+};
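+
+// A minimal usage sketch for the class above (illustrative only; the real
+// query is built in buildCandidateList below). The values 91 and 92 stand in
+// for the unique block-terminating sentinels the pass appends; without them
+// some suffixes would stay implicit and never reach LeafVector.
+//
+//   std::vector<unsigned> Toy = {1, 2, 3, 91, 1, 2, 3, 92};
+//   SuffixTree ST(Toy);
+//   std::vector<Candidate> CL;
+//   std::vector<OutlinedFunction> FL;
+//   size_t MaxLen = ST.findCandidates(CL, FL,
+//       [](SuffixTreeNode &Parent, size_t StringLen, unsigned) {
+//         return StringLen >= 2 ? (unsigned)StringLen : 0u;
+//       });
+//
+// This reports the repeated sequences (1 2 3 and its suffix 2 3) along with
+// their start indices, and MaxLen comes back as 3.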
+
+/// \brief Maps \p MachineInstrs to unsigned integers and stores the mappings.
+struct InstructionMapper {
+
+ /// \brief The next available integer to assign to a \p MachineInstr that
+ /// cannot be outlined.
+ ///
+ /// Set to -3 for compatibility with \p DenseMapInfo<unsigned>.
+ unsigned IllegalInstrNumber = -3;
+
+ /// \brief The next available integer to assign to a \p MachineInstr that can
+ /// be outlined.
+ unsigned LegalInstrNumber = 0;
+
+ /// Correspondence from \p MachineInstrs to unsigned integers.
+ DenseMap<MachineInstr *, unsigned, MachineInstrExpressionTrait>
+ InstructionIntegerMap;
+
+ /// Correspondence from unsigned integers to \p MachineInstrs.
+ /// Inverse of \p InstructionIntegerMap.
+ DenseMap<unsigned, MachineInstr *> IntegerInstructionMap;
+
+ /// The vector of unsigned integers that the module is mapped to.
+ std::vector<unsigned> UnsignedVec;
+
+ /// \brief Stores the location of the instruction associated with the integer
+ /// at index i in \p UnsignedVec for each index i.
+ std::vector<MachineBasicBlock::iterator> InstrList;
+
+ /// \brief Maps \p *It to a legal integer.
+ ///
+ /// Updates \p InstrList, \p UnsignedVec, \p InstructionIntegerMap,
+ /// \p IntegerInstructionMap, and \p LegalInstrNumber.
+ ///
+ /// \returns The integer that \p *It was mapped to.
+ unsigned mapToLegalUnsigned(MachineBasicBlock::iterator &It) {
+
+ // Get the integer for this instruction or give it the current
+ // LegalInstrNumber.
+ InstrList.push_back(It);
+ MachineInstr &MI = *It;
+ bool WasInserted;
+ DenseMap<MachineInstr *, unsigned, MachineInstrExpressionTrait>::iterator
+ ResultIt;
+ std::tie(ResultIt, WasInserted) =
+ InstructionIntegerMap.insert(std::make_pair(&MI, LegalInstrNumber));
+ unsigned MINumber = ResultIt->second;
+
+ // There was an insertion.
+ if (WasInserted) {
+ LegalInstrNumber++;
+ IntegerInstructionMap.insert(std::make_pair(MINumber, &MI));
+ }
+
+ UnsignedVec.push_back(MINumber);
+
+ // Make sure we don't overflow or use any integers reserved by the DenseMap.
+ if (LegalInstrNumber >= IllegalInstrNumber)
+ report_fatal_error("Instruction mapping overflow!");
+
+ assert(LegalInstrNumber != DenseMapInfo<unsigned>::getEmptyKey()
+ && "Tried to assign DenseMap tombstone or empty key to instruction.");
+ assert(LegalInstrNumber != DenseMapInfo<unsigned>::getTombstoneKey()
+ && "Tried to assign DenseMap tombstone or empty key to instruction.");
+
+ return MINumber;
+ }
+
+ /// Maps \p *It to an illegal integer.
+ ///
+ /// Updates \p InstrList, \p UnsignedVec, and \p IllegalInstrNumber.
+ ///
+ /// \returns The integer that \p *It was mapped to.
+ unsigned mapToIllegalUnsigned(MachineBasicBlock::iterator &It) {
+ unsigned MINumber = IllegalInstrNumber;
+
+ InstrList.push_back(It);
+ UnsignedVec.push_back(IllegalInstrNumber);
+ IllegalInstrNumber--;
+
+ assert(LegalInstrNumber < IllegalInstrNumber &&
+ "Instruction mapping overflow!");
+
+ assert(IllegalInstrNumber !=
+ DenseMapInfo<unsigned>::getEmptyKey() &&
+ "IllegalInstrNumber cannot be DenseMap tombstone or empty key!");
+
+ assert(IllegalInstrNumber !=
+ DenseMapInfo<unsigned>::getTombstoneKey() &&
+ "IllegalInstrNumber cannot be DenseMap tombstone or empty key!");
+
+ return MINumber;
+ }
+
+ /// \brief Transforms a \p MachineBasicBlock into a \p vector of \p unsigneds
+ /// and appends it to \p UnsignedVec and \p InstrList.
+ ///
+ /// Two instructions are assigned the same integer if they are identical.
+ /// If an instruction is deemed unsafe to outline, then it will be assigned
+ /// a unique integer. The resulting mapping is placed into a suffix tree and
+ /// queried for candidates.
+ ///
+ /// \param MBB The \p MachineBasicBlock to be translated into integers.
+ /// \param TRI \p TargetRegisterInfo for the module.
+ /// \param TII \p TargetInstrInfo for the module.
+ void convertToUnsignedVec(MachineBasicBlock &MBB,
+ const TargetRegisterInfo &TRI,
+ const TargetInstrInfo &TII) {
+ for (MachineBasicBlock::iterator It = MBB.begin(), Et = MBB.end(); It != Et;
+ It++) {
+
+ // Keep track of where this instruction is in the module.
+ switch(TII.getOutliningType(*It)) {
+ case TargetInstrInfo::MachineOutlinerInstrType::Illegal:
+ mapToIllegalUnsigned(It);
+ break;
+
+ case TargetInstrInfo::MachineOutlinerInstrType::Legal:
+ mapToLegalUnsigned(It);
+ break;
+
+ case TargetInstrInfo::MachineOutlinerInstrType::Invisible:
+ break;
+ }
+ }
+
+ // After every insertion is done, uniquely terminate this part of the
+ // "string". This makes sure we won't match across basic block or function
+ // boundaries since the "end" is encoded uniquely and thus appears in no
+ // repeated substring.
+ InstrList.push_back(MBB.end());
+ UnsignedVec.push_back(IllegalInstrNumber);
+ IllegalInstrNumber--;
+ }
+
+ InstructionMapper() {
+ // Make sure that the implementation of DenseMapInfo<unsigned> hasn't
+ // changed.
+ assert(DenseMapInfo<unsigned>::getEmptyKey() == (unsigned)-1 &&
+ "DenseMapInfo<unsigned>'s empty key isn't -1!");
+ assert(DenseMapInfo<unsigned>::getTombstoneKey() == (unsigned)-2 &&
+ "DenseMapInfo<unsigned>'s tombstone key isn't -2!");
+ }
+};
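+
+// Mapping sketch (conceptual values): if a block's instructions hash as
+// A, B, A, U, A, B, where U is unsafe to outline, convertToUnsignedVec
+// produces, writing the unsigned values as signed for readability,
+//
+//   UnsignedVec = [0, 1, 0, -3, 0, 1, -4]
+//
+// Identical legal instructions share a number counting up from 0, while the
+// unsafe instruction and the block-terminating sentinel each consume a fresh
+// number counting down from -3, so they can never be part of a repeated
+// substring.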
+
+/// \brief An interprocedural pass which finds repeated sequences of
+/// instructions and replaces them with calls to functions.
+///
+/// Each instruction is mapped to an unsigned integer and placed in a string.
+/// The resulting mapping is then placed in a \p SuffixTree. The \p SuffixTree
+/// is then repeatedly queried for repeated sequences of instructions. Each
+/// non-overlapping repeated sequence is then placed in its own
+/// \p MachineFunction and each instance is then replaced with a call to that
+/// function.
+struct MachineOutliner : public ModulePass {
+
+ static char ID;
+
+ StringRef getPassName() const override { return "Machine Outliner"; }
+
+ void getAnalysisUsage(AnalysisUsage &AU) const override {
+ AU.addRequired<MachineModuleInfo>();
+ AU.addPreserved<MachineModuleInfo>();
+ AU.setPreservesAll();
+ ModulePass::getAnalysisUsage(AU);
+ }
+
+ MachineOutliner() : ModulePass(ID) {
+ initializeMachineOutlinerPass(*PassRegistry::getPassRegistry());
+ }
+
+ /// \brief Replace the sequences of instructions represented by the
+ /// \p Candidates in \p CandidateList with calls to \p MachineFunctions
+ /// described in \p FunctionList.
+ ///
+ /// \param M The module we are outlining from.
+ /// \param CandidateList A list of candidates to be outlined.
+ /// \param FunctionList A list of functions to be inserted into the module.
+ /// \param Mapper Contains the instruction mappings for the module.
+ bool outline(Module &M, const ArrayRef<Candidate> &CandidateList,
+ std::vector<OutlinedFunction> &FunctionList,
+ InstructionMapper &Mapper);
+
+ /// Creates a function for \p OF and inserts it into the module.
+ MachineFunction *createOutlinedFunction(Module &M, const OutlinedFunction &OF,
+ InstructionMapper &Mapper);
+
+ /// Find potential outlining candidates and store them in \p CandidateList.
+ ///
+ /// For each type of potential candidate, also build an \p OutlinedFunction
+ /// struct containing the information to build the function for that
+ /// candidate.
+ ///
+ /// \param[out] CandidateList Filled with outlining candidates for the module.
+ /// \param[out] FunctionList Filled with functions corresponding to each type
+ /// of \p Candidate.
+ /// \param ST The suffix tree for the module.
+ /// \param TII TargetInstrInfo for the module.
+ ///
+ /// \returns The length of the longest candidate found. 0 if there are none.
+ unsigned buildCandidateList(std::vector<Candidate> &CandidateList,
+ std::vector<OutlinedFunction> &FunctionList,
+ SuffixTree &ST,
+ InstructionMapper &Mapper,
+ const TargetInstrInfo &TII);
+
+ /// \brief Remove any overlapping candidates that weren't handled by the
+ /// suffix tree's pruning method.
+ ///
+ /// Pruning from the suffix tree doesn't necessarily remove all overlaps.
+ /// If a short candidate is chosen for outlining, and then a longer
+ /// candidate which has that short candidate as a suffix is chosen, the
+ /// tree's pruning method will not find the overlap. Thus, we need to prune
+ /// before outlining as well.
+ ///
+ /// \param[in,out] CandidateList A list of outlining candidates.
+ /// \param[in,out] FunctionList A list of functions to be outlined.
+ /// \param MaxCandidateLen The length of the longest candidate.
+ /// \param TII TargetInstrInfo for the module.
+ void pruneOverlaps(std::vector<Candidate> &CandidateList,
+ std::vector<OutlinedFunction> &FunctionList,
+ unsigned MaxCandidateLen,
+ const TargetInstrInfo &TII);
+
+ /// Construct a suffix tree on the instructions in \p M and outline repeated
+ /// strings from that tree.
+ bool runOnModule(Module &M) override;
+};
+
+} // Anonymous namespace.
+
+char MachineOutliner::ID = 0;
+
+namespace llvm {
+ModulePass *createMachineOutlinerPass() { return new MachineOutliner(); }
+}
+
+INITIALIZE_PASS(MachineOutliner, "machine-outliner",
+ "Machine Function Outliner", false, false)
+
+void MachineOutliner::pruneOverlaps(std::vector<Candidate> &CandidateList,
+ std::vector<OutlinedFunction> &FunctionList,
+ unsigned MaxCandidateLen,
+ const TargetInstrInfo &TII) {
+ // TODO: Experiment with interval trees or other interval-checking structures
+ // to lower the time complexity of this function.
+ // TODO: Can we do better than the simple greedy choice?
+ // Check for overlaps in the range.
+ // This is O(MaxCandidateLen * CandidateList.size()).
+ for (auto It = CandidateList.begin(), Et = CandidateList.end(); It != Et;
+ It++) {
+ Candidate &C1 = *It;
+ OutlinedFunction &F1 = FunctionList[C1.FunctionIdx];
+
+ // If we removed this candidate, skip it.
+ if (!C1.InCandidateList)
+ continue;
+
+ // Is it still worth it to outline C1?
+ if (F1.Benefit < 1 || F1.OccurrenceCount < 2) {
+ assert(F1.OccurrenceCount > 0 &&
+ "Can't remove OutlinedFunction with no occurrences!");
+ F1.OccurrenceCount--;
+ C1.InCandidateList = false;
+ continue;
+ }
+
+ // The minimum start index of any candidate that could overlap with this
+ // one.
+ unsigned FarthestPossibleIdx = 0;
+
+ // Either the index is 0, or it's at most MaxCandidateLen indices away.
+ if (C1.StartIdx > MaxCandidateLen)
+ FarthestPossibleIdx = C1.StartIdx - MaxCandidateLen;
+
+ // Compare against the candidates in the list that start at index
+ // FarthestPossibleIdx or later; anything starting earlier is too far
+ // away to overlap C1. There are at most MaxCandidateLen of these.
+ for (auto Sit = It + 1; Sit != Et; Sit++) {
+ Candidate &C2 = *Sit;
+ OutlinedFunction &F2 = FunctionList[C2.FunctionIdx];
+
+ // Is this candidate too far away to overlap?
+ if (C2.StartIdx < FarthestPossibleIdx)
+ break;
+
+ // Did we already remove this candidate in a previous step?
+ if (!C2.InCandidateList)
+ continue;
+
+ // Is the function beneficial to outline?
+ if (F2.OccurrenceCount < 2 || F2.Benefit < 1) {
+ // If not, remove this candidate and move to the next one.
+ assert(F2.OccurrenceCount > 0 &&
+ "Can't remove OutlinedFunction with no occurrences!");
+ F2.OccurrenceCount--;
+ C2.InCandidateList = false;
+ continue;
+ }
+
+ size_t C2End = C2.StartIdx + C2.Len - 1;
+
+ // Do C1 and C2 overlap?
+ //
+ // Not overlapping:
+ // High indices... [C1End ... C1Start][C2End ... C2Start] ...Low indices
+ //
+ // We sorted our candidate list so C2Start <= C1Start. We know that
+ // C2End > C2Start since each candidate has length >= 2. Therefore, all
+ // we have to check is whether C2End < C1Start; if it is, the two don't
+ // overlap.
+ if (C2End < C1.StartIdx)
+ continue;
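+
+ // For example (hypothetical indices): with C2.StartIdx = 10 and
+ // C2.Len = 3, C2End is 12, so a C1 starting at index 12 overlaps C2,
+ // while a C1 starting at index 13 does not.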
+
+ // C1 and C2 overlap.
+ // We need to choose the better of the two.
+ //
+ // Approximate this by picking the one which would have saved us the
+ // most instructions before any pruning.
+ if (C1.Benefit >= C2.Benefit) {
+
+ // C1 is better, so remove C2 and update C2's OutlinedFunction to
+ // reflect the removal.
+ assert(F2.OccurrenceCount > 0 &&
+ "Can't remove OutlinedFunction with no occurrences!");
+ F2.OccurrenceCount--;
+ F2.Benefit = TII.getOutliningBenefit(F2.Sequence.size(),
+ F2.OccurrenceCount,
+ F2.IsTailCall
+ );
+
+ C2.InCandidateList = false;
+
+ DEBUG (
+ dbgs() << "- Removed C2. \n";
+ dbgs() << "--- Num fns left for C2: " << F2.OccurrenceCount << "\n";
+ dbgs() << "--- C2's benefit: " << F2.Benefit << "\n";
+ );
+
+ } else {
+ // C2 is better, so remove C1 and update C1's OutlinedFunction to
+ // reflect the removal.
+ assert(F1.OccurrenceCount > 0 &&
+ "Can't remove OutlinedFunction with no occurrences!");
+ F1.OccurrenceCount--;
+ F1.Benefit = TII.getOutliningBenefit(F1.Sequence.size(),
+ F1.OccurrenceCount,
+ F1.IsTailCall
+ );
+ C1.InCandidateList = false;
+
+ DEBUG (
+ dbgs() << "- Removed C1. \n";
+ dbgs() << "--- Num fns left for C1: " << F1.OccurrenceCount << "\n";
+ dbgs() << "--- C1's benefit: " << F1.Benefit << "\n";
+ );
+
+ // C1 is out, so we don't have to compare it against anyone else.
+ break;
+ }
+ }
+ }
+}
+
+unsigned
+MachineOutliner::buildCandidateList(std::vector<Candidate> &CandidateList,
+ std::vector<OutlinedFunction> &FunctionList,
+ SuffixTree &ST,
+ InstructionMapper &Mapper,
+ const TargetInstrInfo &TII) {
+
+ std::vector<unsigned> CandidateSequence; // Current outlining candidate.
+ size_t MaxCandidateLen = 0; // Length of the longest candidate.
+
+ // Benefit function used when querying the suffix tree. This allows the
+ // target to define more fine-grained types of things to outline without
+ // putting target-specific info in the suffix tree.
+ auto BenefitFn = [&TII, &Mapper](const SuffixTreeNode &Curr,
+ size_t StringLen, unsigned EndVal) {
+
+ // The root represents the empty string.
+ if (Curr.isRoot())
+ return 0u;
+
+ // Is this long enough to outline?
+ // TODO: Let the target decide how "long" a string is in terms of the
+ // sizes of the instructions in the string. For example, if a call
+ // instruction is smaller than a single instruction in the string, we
+ // should outline that single instruction.
+ if (StringLen < 2)
+ return 0u;
+
+ size_t Occurrences = Curr.OccurrenceCount;
+
+ // Anything we want to outline has to appear at least twice.
+ if (Occurrences < 2)
+ return 0u;
+
+ // Check if the last instruction in the sequence is a return.
+ MachineInstr *LastInstr =
+ Mapper.IntegerInstructionMap[EndVal];
+ assert(LastInstr && "Last instruction in sequence was unmapped!");
+
+ // The only way a terminator could be mapped as legal is if it was safe to
+ // tail call.
+ bool IsTailCall = LastInstr->isTerminator();
+ return TII.getOutliningBenefit(StringLen, Occurrences, IsTailCall);
+ };
+
+ MaxCandidateLen = ST.findCandidates(CandidateList, FunctionList, BenefitFn);
+
+ for (auto &OF : FunctionList)
+ OF.IsTailCall = Mapper.
+ IntegerInstructionMap[OF.Sequence.back()]->isTerminator();
+
+ // Sort the candidates in descending order by start index. This simplifies
+ // the outlining process: candidates are removed from the mapping back to
+ // front, so cutting one out never shifts the indices of the candidates
+ // that remain to be outlined.
+ std::stable_sort(CandidateList.begin(), CandidateList.end());
+
+ return MaxCandidateLen;
+}
+
+MachineFunction *
+MachineOutliner::createOutlinedFunction(Module &M, const OutlinedFunction &OF,
+ InstructionMapper &Mapper) {
+
+ // Create the function name. This should be unique. For now, just append
+ // the number assigned to this function to a common prefix.
+ std::ostringstream NameStream;
+ NameStream << "OUTLINED_FUNCTION" << "_" << OF.Name;
+
+ // Create the function using an IR-level function.
+ LLVMContext &C = M.getContext();
+ Function *F = dyn_cast<Function>(
+ M.getOrInsertFunction(NameStream.str(), Type::getVoidTy(C)));
+ assert(F && "Function was null!");
+
+ // NOTE: If this is linkonceodr, then we can take advantage of linker deduping
+ // which gives us better results when we outline from linkonceodr functions.
+ F->setLinkage(GlobalValue::PrivateLinkage);
+ F->setUnnamedAddr(GlobalValue::UnnamedAddr::Global);
+
+ BasicBlock *EntryBB = BasicBlock::Create(C, "entry", F);
+ IRBuilder<> Builder(EntryBB);
+ Builder.CreateRetVoid();
+
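+ // The resulting IR is only a minimal shell (a sketch of its textual form):
+ //
+ //   define private void @OUTLINED_FUNCTION_<N>() unnamed_addr {
+ //   entry:
+ //     ret void
+ //   }
+ //
+ // The real body exists only at the MachineFunction level; the IR function
+ // gives the MachineModuleInfo machinery something to attach it to.
+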
+ MachineModuleInfo &MMI = getAnalysis<MachineModuleInfo>();
+ MachineFunction &MF = MMI.getMachineFunction(*F);
+ MachineBasicBlock &MBB = *MF.CreateMachineBasicBlock();
+ const TargetSubtargetInfo &STI = MF.getSubtarget();
+ const TargetInstrInfo &TII = *STI.getInstrInfo();
+
+ // Insert the new function into the module.
+ MF.insert(MF.begin(), &MBB);
+
+ TII.insertOutlinerPrologue(MBB, MF, OF.IsTailCall);
+
+ // Copy over the instructions for the function using the integer mappings in
+ // its sequence.
+ for (unsigned Str : OF.Sequence) {
+ MachineInstr *NewMI =
+ MF.CloneMachineInstr(Mapper.IntegerInstructionMap.find(Str)->second);
+ NewMI->dropMemRefs();
+
+ // Don't keep debug information for outlined instructions.
+ // FIXME: This means outlined functions are currently undebuggable.
+ NewMI->setDebugLoc(DebugLoc());
+ MBB.insert(MBB.end(), NewMI);
+ }
+
+ TII.insertOutlinerEpilogue(MBB, MF, OF.IsTailCall);
+
+ return &MF;
+}
+
+bool MachineOutliner::outline(Module &M,
+ const ArrayRef<Candidate> &CandidateList,
+ std::vector<OutlinedFunction> &FunctionList,
+ InstructionMapper &Mapper) {
+
+ bool OutlinedSomething = false;
+
+ // Replace the candidates with calls to their respective outlined functions.
+ for (const Candidate &C : CandidateList) {
+
+ // Was the candidate removed during pruneOverlaps?
+ if (!C.InCandidateList)
+ continue;
+
+ // If not, then look at its OutlinedFunction.
+ OutlinedFunction &OF = FunctionList[C.FunctionIdx];
+
+ // Was its OutlinedFunction made unbeneficial during pruneOverlaps?
+ if (OF.OccurrenceCount < 2 || OF.Benefit < 1)
+ continue;
+
+ // If not, then outline it.
+ assert(C.StartIdx < Mapper.InstrList.size() && "Candidate out of bounds!");
+ MachineBasicBlock *MBB = (*Mapper.InstrList[C.StartIdx]).getParent();
+ MachineBasicBlock::iterator StartIt = Mapper.InstrList[C.StartIdx];
+ unsigned EndIdx = C.StartIdx + C.Len - 1;
+
+ assert(EndIdx < Mapper.InstrList.size() && "Candidate out of bounds!");
+ MachineBasicBlock::iterator EndIt = Mapper.InstrList[EndIdx];
+ assert(EndIt != MBB->end() && "EndIt out of bounds!");
+
+ EndIt++; // Erase needs one past the end index.
+
+ // Does this candidate have a function yet?
+ if (!OF.MF) {
+ OF.MF = createOutlinedFunction(M, OF, Mapper);
+ FunctionsCreated++;
+ }
+
+ MachineFunction *MF = OF.MF;
+ const TargetSubtargetInfo &STI = MF->getSubtarget();
+ const TargetInstrInfo &TII = *STI.getInstrInfo();
+
+ // Insert a call to the new function and erase the old sequence.
+ TII.insertOutlinedCall(M, *MBB, StartIt, *MF, OF.IsTailCall);
+ StartIt = Mapper.InstrList[C.StartIdx];
+ MBB->erase(StartIt, EndIt);
+
+ OutlinedSomething = true;
+
+ // Statistics.
+ NumOutlined++;
+ }
+
+ DEBUG (
+ dbgs() << "OutlinedSomething = " << OutlinedSomething << "\n";
+ );
+
+ return OutlinedSomething;
+}
+
+bool MachineOutliner::runOnModule(Module &M) {
+
+ // Is there anything in the module at all?
+ if (M.empty())
+ return false;
+
+ MachineModuleInfo &MMI = getAnalysis<MachineModuleInfo>();
+ const TargetSubtargetInfo &STI = MMI.getMachineFunction(*M.begin())
+ .getSubtarget();
+ const TargetRegisterInfo *TRI = STI.getRegisterInfo();
+ const TargetInstrInfo *TII = STI.getInstrInfo();
+
+ InstructionMapper Mapper;
+
+ // Build instruction mappings for each function in the module.
+ for (Function &F : M) {
+ MachineFunction &MF = MMI.getMachineFunction(F);
+
+ // Is the function empty? Safe to outline from?
+ if (F.empty() || !TII->isFunctionSafeToOutlineFrom(MF))
+ continue;
+
+ // If it is, look at each MachineBasicBlock in the function.
+ for (MachineBasicBlock &MBB : MF) {
+
+ // Is there anything in MBB?
+ if (MBB.empty())
+ continue;
+
+ // If yes, map it.
+ Mapper.convertToUnsignedVec(MBB, *TRI, *TII);
+ }
+ }
+
+ // Construct a suffix tree, use it to find candidates, and then outline them.
+ SuffixTree ST(Mapper.UnsignedVec);
+ std::vector<Candidate> CandidateList;
+ std::vector<OutlinedFunction> FunctionList;
+
+ // Find all of the outlining candidates.
+ unsigned MaxCandidateLen =
+ buildCandidateList(CandidateList, FunctionList, ST, Mapper, *TII);
+
+ // Remove candidates that overlap with other candidates.
+ pruneOverlaps(CandidateList, FunctionList, MaxCandidateLen, *TII);
+
+ // Outline each of the candidates and return true if something was outlined.
+ return outline(M, CandidateList, FunctionList, Mapper);
+}
diff --git a/lib/CodeGen/MachinePipeliner.cpp b/lib/CodeGen/MachinePipeliner.cpp
index 43a18099d39a..d06c38cf4ed8 100644
--- a/lib/CodeGen/MachinePipeliner.cpp
+++ b/lib/CodeGen/MachinePipeliner.cpp
@@ -552,7 +552,9 @@ public:
os << "\n";
}
- void dump() const { print(dbgs()); }
+#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
+ LLVM_DUMP_METHOD void dump() const { print(dbgs()); }
+#endif
};
/// This class repesents the scheduled code. The main data structure is a
@@ -593,7 +595,7 @@ private:
/// Virtual register information.
MachineRegisterInfo &MRI;
- DFAPacketizer *Resources;
+ std::unique_ptr<DFAPacketizer> Resources;
public:
SMSchedule(MachineFunction *mf)
@@ -604,13 +606,6 @@ public:
InitiationInterval = 0;
}
- ~SMSchedule() {
- ScheduledInstrs.clear();
- InstrToCycle.clear();
- RegToStageDiff.clear();
- delete Resources;
- }
-
void reset() {
ScheduledInstrs.clear();
InstrToCycle.clear();
@@ -738,7 +733,7 @@ bool MachinePipeliner::runOnMachineFunction(MachineFunction &mf) {
return false;
if (mf.getFunction()->getAttributes().hasAttribute(
- AttributeSet::FunctionIndex, Attribute::OptimizeForSize) &&
+ AttributeList::FunctionIndex, Attribute::OptimizeForSize) &&
!EnableSWPOptSize.getPosition())
return false;
@@ -960,7 +955,7 @@ static void getPhiRegs(MachineInstr &Phi, MachineBasicBlock *Loop,
for (unsigned i = 1, e = Phi.getNumOperands(); i != e; i += 2)
if (Phi.getOperand(i + 1).getMBB() != Loop)
InitVal = Phi.getOperand(i).getReg();
- else if (Phi.getOperand(i + 1).getMBB() == Loop)
+ else
LoopVal = Phi.getOperand(i).getReg();
assert(InitVal != 0 && LoopVal != 0 && "Unexpected Phi structure.");
@@ -2514,7 +2509,7 @@ void SwingSchedulerDAG::generateExistingPhis(
MachineBasicBlock *KernelBB, SMSchedule &Schedule, ValueMapTy *VRMap,
InstrMapTy &InstrMap, unsigned LastStageNum, unsigned CurStageNum,
bool IsLast) {
- // Compute the stage number for the inital value of the Phi, which
+ // Compute the stage number for the initial value of the Phi, which
// comes from the prolog. The prolog to use depends on to which kernel/
// epilog that we're adding the Phi.
unsigned PrologStage = 0;
@@ -3480,7 +3475,7 @@ bool SwingSchedulerDAG::isLoopCarriedOrder(SUnit *Source, const SDep &Dep,
// increment value to determine if the accesses may be loop carried.
if (OffsetS >= OffsetD)
return OffsetS + AccessSizeS > DeltaS;
- else if (OffsetS < OffsetD)
+ else
return OffsetD + AccessSizeD > DeltaD;
return true;
@@ -3980,5 +3975,7 @@ void SMSchedule::print(raw_ostream &os) const {
}
}
+#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
/// Utility function used for debugging to print the schedule.
-void SMSchedule::dump() const { print(dbgs()); }
+LLVM_DUMP_METHOD void SMSchedule::dump() const { print(dbgs()); }
+#endif
diff --git a/lib/CodeGen/MachineRegionInfo.cpp b/lib/CodeGen/MachineRegionInfo.cpp
index fc32183c7f63..71ad4e6aa7f5 100644
--- a/lib/CodeGen/MachineRegionInfo.cpp
+++ b/lib/CodeGen/MachineRegionInfo.cpp
@@ -1,10 +1,9 @@
-
#include "llvm/CodeGen/MachineRegionInfo.h"
#include "llvm/ADT/Statistic.h"
#include "llvm/Analysis/RegionInfoImpl.h"
#include "llvm/CodeGen/MachinePostDominators.h"
-#define DEBUG_TYPE "region"
+#define DEBUG_TYPE "machine-region-info"
using namespace llvm;
@@ -86,6 +85,9 @@ bool MachineRegionInfoPass::runOnMachineFunction(MachineFunction &F) {
auto DF = &getAnalysis<MachineDominanceFrontier>();
RI.recalculate(F, DT, PDT, DF);
+
+ DEBUG(RI.dump());
+
return false;
}
@@ -103,9 +105,10 @@ void MachineRegionInfoPass::verifyAnalysis() const {
void MachineRegionInfoPass::getAnalysisUsage(AnalysisUsage &AU) const {
AU.setPreservesAll();
- AU.addRequiredTransitive<DominatorTreeWrapperPass>();
- AU.addRequired<PostDominatorTreeWrapperPass>();
- AU.addRequired<DominanceFrontierWrapperPass>();
+ AU.addRequired<MachineDominatorTree>();
+ AU.addRequired<MachinePostDominatorTree>();
+ AU.addRequired<MachineDominanceFrontier>();
+ MachineFunctionPass::getAnalysisUsage(AU);
}
void MachineRegionInfoPass::print(raw_ostream &OS, const Module *) const {
@@ -119,14 +122,15 @@ LLVM_DUMP_METHOD void MachineRegionInfoPass::dump() const {
#endif
char MachineRegionInfoPass::ID = 0;
+char &MachineRegionInfoPassID = MachineRegionInfoPass::ID;
-INITIALIZE_PASS_BEGIN(MachineRegionInfoPass, "regions",
- "Detect single entry single exit regions", true, true)
+INITIALIZE_PASS_BEGIN(MachineRegionInfoPass, DEBUG_TYPE,
+ "Detect single entry single exit regions", true, true)
INITIALIZE_PASS_DEPENDENCY(MachineDominatorTree)
INITIALIZE_PASS_DEPENDENCY(MachinePostDominatorTree)
INITIALIZE_PASS_DEPENDENCY(MachineDominanceFrontier)
-INITIALIZE_PASS_END(MachineRegionInfoPass, "regions",
- "Detect single entry single exit regions", true, true)
+INITIALIZE_PASS_END(MachineRegionInfoPass, DEBUG_TYPE,
+ "Detect single entry single exit regions", true, true)
// Create methods available outside of this file, to use them
// "include/llvm/LinkAllPasses.h". Otherwise the pass would be deleted by
diff --git a/lib/CodeGen/MachineRegisterInfo.cpp b/lib/CodeGen/MachineRegisterInfo.cpp
index 242cb0b80953..128910f8eb2a 100644
--- a/lib/CodeGen/MachineRegisterInfo.cpp
+++ b/lib/CodeGen/MachineRegisterInfo.cpp
@@ -1,4 +1,4 @@
-//===-- lib/Codegen/MachineRegisterInfo.cpp -------------------------------===//
+//===- lib/Codegen/MachineRegisterInfo.cpp --------------------------------===//
//
// The LLVM Compiler Infrastructure
//
@@ -11,13 +11,27 @@
//
//===----------------------------------------------------------------------===//
-#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/ADT/iterator_range.h"
+#include "llvm/CodeGen/LowLevelType.h"
+#include "llvm/CodeGen/MachineBasicBlock.h"
+#include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/CodeGen/MachineInstr.h"
#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/MachineOperand.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/IR/Attributes.h"
+#include "llvm/IR/DebugLoc.h"
#include "llvm/IR/Function.h"
-#include "llvm/Support/raw_os_ostream.h"
+#include "llvm/MC/MCRegisterInfo.h"
+#include "llvm/Support/Casting.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/Compiler.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/raw_ostream.h"
#include "llvm/Target/TargetInstrInfo.h"
-#include "llvm/Target/TargetMachine.h"
+#include "llvm/Target/TargetRegisterInfo.h"
#include "llvm/Target/TargetSubtargetInfo.h"
+#include <cassert>
using namespace llvm;
@@ -28,9 +42,9 @@ static cl::opt<bool> EnableSubRegLiveness("enable-subreg-liveness", cl::Hidden,
void MachineRegisterInfo::Delegate::anchor() {}
MachineRegisterInfo::MachineRegisterInfo(MachineFunction *MF)
- : MF(MF), TheDelegate(nullptr),
- TracksSubRegLiveness(MF->getSubtarget().enableSubRegLiveness() &&
- EnableSubRegLiveness) {
+ : MF(MF), TracksSubRegLiveness(MF->getSubtarget().enableSubRegLiveness() &&
+ EnableSubRegLiveness),
+ IsUpdatedCSRsInitialized(false) {
unsigned NumRegs = getTargetRegisterInfo()->getNumRegs();
VRegInfo.reserve(256);
RegAllocHints.reserve(256);
@@ -444,8 +458,8 @@ LaneBitmask MachineRegisterInfo::getMaxLaneMaskForVReg(unsigned Reg) const {
return TRC.getLaneMask();
}
-#ifndef NDEBUG
-void MachineRegisterInfo::dumpUses(unsigned Reg) const {
+#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
+LLVM_DUMP_METHOD void MachineRegisterInfo::dumpUses(unsigned Reg) const {
for (MachineInstr &I : use_instructions(Reg))
I.dump();
}
@@ -543,3 +557,47 @@ bool MachineRegisterInfo::isPhysRegUsed(unsigned PhysReg) const {
}
return false;
}
+
+void MachineRegisterInfo::disableCalleeSavedRegister(unsigned Reg) {
+
+ const TargetRegisterInfo *TRI = getTargetRegisterInfo();
+ assert(Reg && (Reg < TRI->getNumRegs()) &&
+ "Trying to disable an invalid register");
+
+ if (!IsUpdatedCSRsInitialized) {
+ const MCPhysReg *CSR = TRI->getCalleeSavedRegs(MF);
+ for (const MCPhysReg *I = CSR; *I; ++I)
+ UpdatedCSRs.push_back(*I);
+
+ // Zero value represents the end of the register list
+ // (no more registers should be pushed).
+ UpdatedCSRs.push_back(0);
+
+ IsUpdatedCSRsInitialized = true;
+ }
+
+ // Remove the register (and its aliases from the list).
+ for (MCRegAliasIterator AI(Reg, TRI, true); AI.isValid(); ++AI)
+ UpdatedCSRs.erase(std::remove(UpdatedCSRs.begin(), UpdatedCSRs.end(), *AI),
+ UpdatedCSRs.end());
+}
+
+const MCPhysReg *MachineRegisterInfo::getCalleeSavedRegs() const {
+ if (IsUpdatedCSRsInitialized)
+ return UpdatedCSRs.data();
+
+ return getTargetRegisterInfo()->getCalleeSavedRegs(MF);
+}
+
+void MachineRegisterInfo::setCalleeSavedRegs(ArrayRef<MCPhysReg> CSRs) {
+ if (IsUpdatedCSRsInitialized)
+ UpdatedCSRs.clear();
+
+ for (MCPhysReg Reg : CSRs)
+ UpdatedCSRs.push_back(Reg);
+
+ // Zero value represents the end of the register list
+ // (no more registers should be pushed).
+ UpdatedCSRs.push_back(0);
+ IsUpdatedCSRsInitialized = true;
+}
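+
+// Usage sketch (hypothetical register): a pass that wants a physical
+// register treated as no longer callee-saved in this function could do
+//
+//   MRI.disableCalleeSavedRegister(SomePhysReg);
+//   const MCPhysReg *CSRs = MRI.getCalleeSavedRegs(); // zero-terminated
+//
+// after which the returned list omits SomePhysReg and all of its aliases.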
diff --git a/lib/CodeGen/MachineScheduler.cpp b/lib/CodeGen/MachineScheduler.cpp
index e06bc517fa91..41e161f71e53 100644
--- a/lib/CodeGen/MachineScheduler.cpp
+++ b/lib/CodeGen/MachineScheduler.cpp
@@ -12,30 +12,67 @@
//
//===----------------------------------------------------------------------===//
-#include "llvm/CodeGen/MachineScheduler.h"
+#include "llvm/ADT/ArrayRef.h"
+#include "llvm/ADT/BitVector.h"
+#include "llvm/ADT/DenseMap.h"
+#include "llvm/ADT/iterator_range.h"
#include "llvm/ADT/PriorityQueue.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/ADT/STLExtras.h"
#include "llvm/Analysis/AliasAnalysis.h"
+#include "llvm/CodeGen/LiveInterval.h"
#include "llvm/CodeGen/LiveIntervalAnalysis.h"
+#include "llvm/CodeGen/MachineBasicBlock.h"
#include "llvm/CodeGen/MachineDominators.h"
+#include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/CodeGen/MachineInstr.h"
#include "llvm/CodeGen/MachineLoopInfo.h"
+#include "llvm/CodeGen/MachineOperand.h"
+#include "llvm/CodeGen/MachinePassRegistry.h"
+#include "llvm/CodeGen/RegisterPressure.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/CodeGen/MachineScheduler.h"
+#include "llvm/CodeGen/MachineValueType.h"
#include "llvm/CodeGen/Passes.h"
#include "llvm/CodeGen/RegisterClassInfo.h"
+#include "llvm/CodeGen/ScheduleDAG.h"
+#include "llvm/CodeGen/ScheduleDAGInstrs.h"
+#include "llvm/CodeGen/ScheduleDAGMutation.h"
#include "llvm/CodeGen/ScheduleDFS.h"
#include "llvm/CodeGen/ScheduleHazardRecognizer.h"
+#include "llvm/CodeGen/SlotIndexes.h"
#include "llvm/CodeGen/TargetPassConfig.h"
+#include "llvm/CodeGen/TargetSchedule.h"
+#include "llvm/MC/LaneBitmask.h"
+#include "llvm/Pass.h"
#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/Compiler.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/ErrorHandling.h"
#include "llvm/Support/GraphWriter.h"
#include "llvm/Support/raw_ostream.h"
#include "llvm/Target/TargetInstrInfo.h"
+#include "llvm/Target/TargetLowering.h"
+#include "llvm/Target/TargetRegisterInfo.h"
+#include "llvm/Target/TargetSubtargetInfo.h"
+#include <algorithm>
+#include <cassert>
+#include <cstdint>
+#include <iterator>
+#include <limits>
+#include <memory>
+#include <string>
+#include <tuple>
+#include <utility>
+#include <vector>
using namespace llvm;
#define DEBUG_TYPE "misched"
namespace llvm {
+
cl::opt<bool> ForceTopDown("misched-topdown", cl::Hidden,
cl::desc("Force top-down list scheduling"));
cl::opt<bool> ForceBottomUp("misched-bottomup", cl::Hidden,
@@ -43,7 +80,8 @@ cl::opt<bool> ForceBottomUp("misched-bottomup", cl::Hidden,
cl::opt<bool>
DumpCriticalPathLength("misched-dcpl", cl::Hidden,
cl::desc("Print critical path length to stdout"));
-}
+
+} // end namespace llvm
#ifndef NDEBUG
static cl::opt<bool> ViewMISchedDAGs("view-misched-dags", cl::Hidden,
@@ -80,10 +118,6 @@ static cl::opt<bool> EnableMemOpCluster("misched-cluster", cl::Hidden,
cl::desc("Enable memop clustering."),
cl::init(true));
-// Experimental heuristics
-static cl::opt<bool> EnableMacroFusion("misched-fusion", cl::Hidden,
- cl::desc("Enable scheduling for macro fusion."), cl::init(true));
-
static cl::opt<bool> VerifyScheduling("verify-misched", cl::Hidden,
cl::desc("Verify machine instrs before and after machine scheduling"));
@@ -92,14 +126,14 @@ static const unsigned MinSubtreeSize = 8;
// Pin the vtables to this file.
void MachineSchedStrategy::anchor() {}
+
void ScheduleDAGMutation::anchor() {}
//===----------------------------------------------------------------------===//
// Machine Instruction Scheduling Pass and Registry
//===----------------------------------------------------------------------===//
-MachineSchedContext::MachineSchedContext():
- MF(nullptr), MLI(nullptr), MDT(nullptr), PassConfig(nullptr), AA(nullptr), LIS(nullptr) {
+MachineSchedContext::MachineSchedContext() {
RegClassInfo = new RegisterClassInfo();
}
@@ -108,6 +142,7 @@ MachineSchedContext::~MachineSchedContext() {
}
namespace {
+
/// Base class for a machine scheduler class that can run at any point.
class MachineSchedulerBase : public MachineSchedContext,
public MachineFunctionPass {
@@ -149,7 +184,8 @@ public:
protected:
ScheduleDAGInstrs *createPostMachineScheduler();
};
-} // namespace
+
+} // end anonymous namespace
char MachineScheduler::ID = 0;
@@ -158,6 +194,7 @@ char &llvm::MachineSchedulerID = MachineScheduler::ID;
INITIALIZE_PASS_BEGIN(MachineScheduler, "machine-scheduler",
"Machine Instruction Scheduler", false, false)
INITIALIZE_PASS_DEPENDENCY(AAResultsWrapperPass)
+INITIALIZE_PASS_DEPENDENCY(MachineLoopInfo)
INITIALIZE_PASS_DEPENDENCY(SlotIndexes)
INITIALIZE_PASS_DEPENDENCY(LiveIntervals)
INITIALIZE_PASS_END(MachineScheduler, "machine-scheduler",
@@ -211,7 +248,7 @@ static ScheduleDAGInstrs *useDefaultMachineSched(MachineSchedContext *C) {
/// MachineSchedOpt allows command line selection of the scheduler.
static cl::opt<MachineSchedRegistry::ScheduleDAGCtor, false,
- RegisterPassParser<MachineSchedRegistry> >
+ RegisterPassParser<MachineSchedRegistry>>
MachineSchedOpt("misched",
cl::init(&useDefaultMachineSched), cl::Hidden,
cl::desc("Machine instruction scheduler to use"));
@@ -448,7 +485,7 @@ void MachineSchedulerBase::scheduleRegions(ScheduleDAGInstrs &Scheduler,
// instruction stream until we find the nearest boundary.
unsigned NumRegionInstrs = 0;
MachineBasicBlock::iterator I = RegionEnd;
- for (;I != MBB->begin(); --I) {
+ for (; I != MBB->begin(); --I) {
MachineInstr &MI = *std::prev(I);
if (isSchedBoundary(&MI, &*MBB, MF, TII))
break;
@@ -504,13 +541,14 @@ void MachineSchedulerBase::print(raw_ostream &O, const Module* m) const {
// unimplemented
}
-LLVM_DUMP_METHOD
-void ReadyQueue::dump() {
+#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
+LLVM_DUMP_METHOD void ReadyQueue::dump() {
dbgs() << "Queue " << Name << ": ";
for (unsigned i = 0, e = Queue.size(); i < e; ++i)
dbgs() << Queue[i]->NodeNum << " ";
dbgs() << "\n";
}
+#endif
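
This hunk is the first of several in the diff that apply the same dump-method convention: the body is compiled only when asserts are on or LLVM_ENABLE_DUMP is defined, and LLVM_DUMP_METHOD keeps the otherwise-unreferenced helper from being stripped. A self-contained sketch of the pattern (DUMP_METHOD and ENABLE_DUMP are stand-ins for the LLVM macros, not the real ones):

#include <cstdio>

// Stand-in for LLVM_DUMP_METHOD: stops the compiler from discarding an
// apparently unused debug helper so it stays callable from a debugger.
#if defined(__GNUC__)
#define DUMP_METHOD __attribute__((noinline, used))
#else
#define DUMP_METHOD
#endif

struct QueueSketch {
  int Vals[3] = {1, 2, 3};

#if !defined(NDEBUG) || defined(ENABLE_DUMP)
  // Compiled only in asserts builds, or when dumps are forced on.
  DUMP_METHOD void dump() const {
    for (int V : Vals)
      std::printf("%d ", V);
    std::printf("\n");
  }
#endif
};
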
//===----------------------------------------------------------------------===//
// ScheduleDAGMI - Basic machine instruction scheduling. This is
@@ -519,8 +557,7 @@ void ReadyQueue::dump() {
// ===----------------------------------------------------------------------===/
// Provide a vtable anchor.
-ScheduleDAGMI::~ScheduleDAGMI() {
-}
+ScheduleDAGMI::~ScheduleDAGMI() = default;
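
The "= default" rewrite here keeps the idiom intact: an out-of-line destructor definition still serves as the vtable anchor for the class, it just no longer needs an empty body. A minimal sketch (hypothetical names):

struct BaseSketch {
  virtual ~BaseSketch();    // declared in the header
  virtual void run() = 0;
};

// Defined in exactly one .cpp: the vtable is emitted here, same as with
// an empty-braces body, but with less noise.
BaseSketch::~BaseSketch() = default;
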
bool ScheduleDAGMI::canAddEdge(SUnit *SuccSU, SUnit *PredSU) {
return SuccSU == &ExitSU || !Topo.IsReachable(PredSU, SuccSU);
@@ -825,7 +862,7 @@ void ScheduleDAGMI::placeDebugValues() {
RegionBegin = FirstDbgValue;
}
- for (std::vector<std::pair<MachineInstr *, MachineInstr *> >::iterator
+ for (std::vector<std::pair<MachineInstr *, MachineInstr *>>::iterator
DI = DbgValues.end(), DE = DbgValues.begin(); DI != DE; --DI) {
std::pair<MachineInstr *, MachineInstr *> P = *std::prev(DI);
MachineInstr *DbgValue = P.first;
@@ -841,7 +878,7 @@ void ScheduleDAGMI::placeDebugValues() {
}
#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
-void ScheduleDAGMI::dumpSchedule() const {
+LLVM_DUMP_METHOD void ScheduleDAGMI::dumpSchedule() const {
for (MachineBasicBlock::iterator MI = begin(), ME = end(); MI != ME; ++MI) {
if (SUnit *SU = getSUnit(&(*MI)))
SU->dump(this);
@@ -1012,7 +1049,7 @@ updateScheduledPressure(const SUnit *SU,
++CritIdx;
if (CritIdx != CritEnd && RegionCriticalPSets[CritIdx].getPSet() == ID) {
if ((int)NewMaxPressure[ID] > RegionCriticalPSets[CritIdx].getUnitInc()
- && NewMaxPressure[ID] <= INT16_MAX)
+ && NewMaxPressure[ID] <= (unsigned)std::numeric_limits<int16_t>::max())
RegionCriticalPSets[CritIdx].setUnitInc(NewMaxPressure[ID]);
}
unsigned Limit = RegClassInfo->getRegPressureSetLimit(ID);
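
The replacement above swaps the INT16_MAX macro for std::numeric_limits, and the cast to unsigned keeps the comparison within one signedness, since NewMaxPressure[ID] is unsigned. A minimal standalone sketch of the same guard (function name hypothetical):

#include <cstdint>
#include <limits>

bool fitsInInt16(unsigned Pressure) {
  // Cast the signed limit to unsigned before comparing, as in the hunk
  // above; a mixed-signedness comparison draws -Wsign-compare and
  // implicitly converts the signed operand.
  return Pressure <= (unsigned)std::numeric_limits<int16_t>::max();
}
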
@@ -1136,6 +1173,12 @@ void ScheduleDAGMILive::schedule() {
dbgs() << " Pressure Diff : ";
getPressureDiff(&SU).dump(*TRI);
}
+ dbgs() << " Single Issue : ";
+ if (SchedModel.mustBeginGroup(SU.getInstr()) &&
+ SchedModel.mustEndGroup(SU.getInstr()))
+ dbgs() << "true;";
+ else
+ dbgs() << "false;";
dbgs() << '\n';
}
if (ExitSU.getInstr() != nullptr)
@@ -1396,6 +1439,7 @@ void ScheduleDAGMILive::scheduleMI(SUnit *SU, bool IsTopNode) {
//===----------------------------------------------------------------------===//
namespace {
+
/// \brief Post-process the DAG to create cluster edges between neighboring
/// loads or between neighboring stores.
class BaseMemOpClusterMutation : public ScheduleDAGMutation {
@@ -1403,6 +1447,7 @@ class BaseMemOpClusterMutation : public ScheduleDAGMutation {
SUnit *SU;
unsigned BaseReg;
int64_t Offset;
+
MemOpInfo(SUnit *su, unsigned reg, int64_t ofs)
: SU(su), BaseReg(reg), Offset(ofs) {}
@@ -1439,25 +1484,26 @@ public:
LoadClusterMutation(const TargetInstrInfo *tii, const TargetRegisterInfo *tri)
: BaseMemOpClusterMutation(tii, tri, true) {}
};
-} // anonymous
+
+} // end anonymous namespace
namespace llvm {
std::unique_ptr<ScheduleDAGMutation>
createLoadClusterDAGMutation(const TargetInstrInfo *TII,
const TargetRegisterInfo *TRI) {
- return EnableMemOpCluster ? make_unique<LoadClusterMutation>(TII, TRI)
+ return EnableMemOpCluster ? llvm::make_unique<LoadClusterMutation>(TII, TRI)
: nullptr;
}
std::unique_ptr<ScheduleDAGMutation>
createStoreClusterDAGMutation(const TargetInstrInfo *TII,
const TargetRegisterInfo *TRI) {
- return EnableMemOpCluster ? make_unique<StoreClusterMutation>(TII, TRI)
+ return EnableMemOpCluster ? llvm::make_unique<StoreClusterMutation>(TII, TRI)
: nullptr;
}
-} // namespace llvm
+} // end namespace llvm
void BaseMemOpClusterMutation::clusterNeighboringMemOps(
ArrayRef<SUnit *> MemOps, ScheduleDAGMI *DAG) {
@@ -1543,80 +1589,11 @@ void BaseMemOpClusterMutation::apply(ScheduleDAGInstrs *DAGInstrs) {
}
//===----------------------------------------------------------------------===//
-// MacroFusion - DAG post-processing to encourage fusion of macro ops.
-//===----------------------------------------------------------------------===//
-
-namespace {
-/// \brief Post-process the DAG to create cluster edges between instructions
-/// that may be fused by the processor into a single operation.
-class MacroFusion : public ScheduleDAGMutation {
- const TargetInstrInfo &TII;
-public:
- MacroFusion(const TargetInstrInfo &TII)
- : TII(TII) {}
-
- void apply(ScheduleDAGInstrs *DAGInstrs) override;
-};
-} // anonymous
-
-namespace llvm {
-
-std::unique_ptr<ScheduleDAGMutation>
-createMacroFusionDAGMutation(const TargetInstrInfo *TII) {
- return EnableMacroFusion ? make_unique<MacroFusion>(*TII) : nullptr;
-}
-
-} // namespace llvm
-
-/// \brief Callback from DAG postProcessing to create cluster edges to encourage
-/// fused operations.
-void MacroFusion::apply(ScheduleDAGInstrs *DAGInstrs) {
- ScheduleDAGMI *DAG = static_cast<ScheduleDAGMI*>(DAGInstrs);
-
- // For now, assume targets can only fuse with the branch.
- SUnit &ExitSU = DAG->ExitSU;
- MachineInstr *Branch = ExitSU.getInstr();
- if (!Branch)
- return;
-
- for (SDep &PredDep : ExitSU.Preds) {
- if (PredDep.isWeak())
- continue;
- SUnit &SU = *PredDep.getSUnit();
- MachineInstr &Pred = *SU.getInstr();
- if (!TII.shouldScheduleAdjacent(Pred, *Branch))
- continue;
-
- // Create a single weak edge from SU to ExitSU. The only effect is to cause
- // bottom-up scheduling to heavily prioritize the clustered SU. There is no
- // need to copy predecessor edges from ExitSU to SU, since top-down
- // scheduling cannot prioritize ExitSU anyway. To defer top-down scheduling
- // of SU, we could create an artificial edge from the deepest root, but it
- // hasn't been needed yet.
- bool Success = DAG->addEdge(&ExitSU, SDep(&SU, SDep::Cluster));
- (void)Success;
- assert(Success && "No DAG nodes should be reachable from ExitSU");
-
- // Adjust latency of data deps between the nodes.
- for (SDep &PredDep : ExitSU.Preds) {
- if (PredDep.getSUnit() == &SU)
- PredDep.setLatency(0);
- }
- for (SDep &SuccDep : SU.Succs) {
- if (SuccDep.getSUnit() == &ExitSU)
- SuccDep.setLatency(0);
- }
-
- DEBUG(dbgs() << "Macro Fuse SU(" << SU.NodeNum << ")\n");
- break;
- }
-}
-
-//===----------------------------------------------------------------------===//
// CopyConstrain - DAG post-processing to encourage copy elimination.
//===----------------------------------------------------------------------===//
namespace {
+
/// \brief Post-process the DAG to create weak edges from all uses of a copy to
/// the one use that defines the copy's source vreg, most likely an induction
/// variable increment.
@@ -1626,6 +1603,7 @@ class CopyConstrain : public ScheduleDAGMutation {
// RegionEndIdx is the slot index of the last non-debug instruction in the
// scheduling region. So we may have RegionBeginIdx == RegionEndIdx.
SlotIndex RegionEndIdx;
+
public:
CopyConstrain(const TargetInstrInfo *, const TargetRegisterInfo *) {}
@@ -1634,17 +1612,18 @@ public:
protected:
void constrainLocalCopy(SUnit *CopySU, ScheduleDAGMILive *DAG);
};
-} // anonymous
+
+} // end anonymous namespace
namespace llvm {
std::unique_ptr<ScheduleDAGMutation>
createCopyConstrainDAGMutation(const TargetInstrInfo *TII,
- const TargetRegisterInfo *TRI) {
- return make_unique<CopyConstrain>(TII, TRI);
+ const TargetRegisterInfo *TRI) {
+ return llvm::make_unique<CopyConstrain>(TII, TRI);
}
-} // namespace llvm
+} // end namespace llvm
/// constrainLocalCopy handles two possibilities:
/// 1) Local src:
@@ -1836,7 +1815,7 @@ void SchedBoundary::reset() {
CheckPending = false;
CurrCycle = 0;
CurrMOps = 0;
- MinReadyCycle = UINT_MAX;
+ MinReadyCycle = std::numeric_limits<unsigned>::max();
ExpectedLatency = 0;
DependentLatency = 0;
RetiredMOps = 0;
@@ -1937,12 +1916,22 @@ bool SchedBoundary::checkHazard(SUnit *SU) {
&& HazardRec->getHazardType(SU) != ScheduleHazardRecognizer::NoHazard) {
return true;
}
+
unsigned uops = SchedModel->getNumMicroOps(SU->getInstr());
if ((CurrMOps > 0) && (CurrMOps + uops > SchedModel->getIssueWidth())) {
DEBUG(dbgs() << " SU(" << SU->NodeNum << ") uops="
<< SchedModel->getNumMicroOps(SU->getInstr()) << '\n');
return true;
}
+
+ if (CurrMOps > 0 &&
+ ((isTop() && SchedModel->mustBeginGroup(SU->getInstr())) ||
+ (!isTop() && SchedModel->mustEndGroup(SU->getInstr())))) {
+ DEBUG(dbgs() << " hazard: SU(" << SU->NodeNum << ") must "
+ << (isTop()? "begin" : "end") << " group\n");
+ return true;
+ }
+
if (SchedModel->hasInstrSchedModel() && SU->hasReservedResource) {
const MCSchedClassDesc *SC = DAG->getSchedClass(SU);
for (TargetSchedModel::ProcResIter
@@ -2039,7 +2028,8 @@ void SchedBoundary::releaseNode(SUnit *SU, unsigned ReadyCycle) {
/// Move the boundary of scheduled code by one cycle.
void SchedBoundary::bumpCycle(unsigned NextCycle) {
if (SchedModel->getMicroOpBufferSize() == 0) {
- assert(MinReadyCycle < UINT_MAX && "MinReadyCycle uninitialized");
+ assert(MinReadyCycle < std::numeric_limits<unsigned>::max() &&
+ "MinReadyCycle uninitialized");
if (MinReadyCycle > NextCycle)
NextCycle = MinReadyCycle;
}
@@ -2237,6 +2227,18 @@ void SchedBoundary::bumpNode(SUnit *SU) {
// one cycle. Since we commonly reach the max MOps here, opportunistically
// bump the cycle to avoid uselessly checking everything in the readyQ.
CurrMOps += IncMOps;
+
+ // Bump the cycle count for issue group constraints.
+ // This must be done after NextCycle has been adjusted for all other stalls.
+ // Calling bumpCycle(X) will reduce CurrMOps by one issue group and set
+ // CurrCycle to X.
+ if ((isTop() && SchedModel->mustEndGroup(SU->getInstr())) ||
+ (!isTop() && SchedModel->mustBeginGroup(SU->getInstr()))) {
+ DEBUG(dbgs() << " Bump cycle to "
+ << (isTop() ? "end" : "begin") << " group\n");
+ bumpCycle(++NextCycle);
+ }
+
while (CurrMOps >= SchedModel->getIssueWidth()) {
DEBUG(dbgs() << " *** Max MOps " << CurrMOps
<< " at cycle " << CurrCycle << '\n');
@@ -2250,7 +2252,7 @@ void SchedBoundary::bumpNode(SUnit *SU) {
void SchedBoundary::releasePending() {
// If the available queue is empty, it is safe to reset MinReadyCycle.
if (Available.empty())
- MinReadyCycle = UINT_MAX;
+ MinReadyCycle = std::numeric_limits<unsigned>::max();
// Check to see if any of the pending instructions are ready to issue. If
// so, add them to the available queue.
@@ -2323,10 +2325,10 @@ SUnit *SchedBoundary::pickOnlyChoice() {
return nullptr;
}
-#ifndef NDEBUG
+#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
// This is useful information to dump after bumpNode.
// Note that the Queue contents are more useful before pickNodeFromQueue.
-void SchedBoundary::dumpScheduledState() {
+LLVM_DUMP_METHOD void SchedBoundary::dumpScheduledState() {
unsigned ResFactor;
unsigned ResCount;
if (ZoneCritResIdx) {
@@ -2666,11 +2668,14 @@ void GenericScheduler::initPolicy(MachineBasicBlock::iterator Begin,
}
void GenericScheduler::dumpPolicy() {
+ // Cannot completely remove the virtual function even in release mode.
+#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
dbgs() << "GenericScheduler RegionPolicy: "
<< " ShouldTrackPressure=" << RegionPolicy.ShouldTrackPressure
<< " OnlyTopDown=" << RegionPolicy.OnlyTopDown
<< " OnlyBottomUp=" << RegionPolicy.OnlyBottomUp
<< "\n";
+#endif
}
/// Set IsAcyclicLatencyLimited if the acyclic path is longer than the cyclic
@@ -2724,7 +2729,7 @@ void GenericScheduler::registerRoots() {
errs() << "Critical Path(GS-RR ): " << Rem.CriticalPath << " \n";
}
- if (EnableCyclicPath) {
+ if (EnableCyclicPath && SchedModel->getMicroOpBufferSize() > 0) {
Rem.CyclicCritPath = DAG->computeCyclicCriticalPath();
checkAcyclicLatency();
}
@@ -3106,7 +3111,6 @@ SUnit *GenericScheduler::pickNode(bool &IsTopNode) {
}
void GenericScheduler::reschedulePhysRegCopies(SUnit *SU, bool isTop) {
-
MachineBasicBlock::iterator InsertPos = SU->getInstr();
if (!isTop)
++InsertPos;
@@ -3154,7 +3158,8 @@ void GenericScheduler::schedNode(SUnit *SU, bool IsTopNode) {
/// Create the standard converging machine scheduler. This will be used as the
/// default scheduler if the target does not set a default.
ScheduleDAGMILive *llvm::createGenericSchedLive(MachineSchedContext *C) {
- ScheduleDAGMILive *DAG = new ScheduleDAGMILive(C, make_unique<GenericScheduler>(C));
+ ScheduleDAGMILive *DAG =
+ new ScheduleDAGMILive(C, llvm::make_unique<GenericScheduler>(C));
// Register DAG post-processors.
//
// FIXME: extend the mutation API to allow earlier mutations to instantiate
@@ -3195,7 +3200,6 @@ void PostGenericScheduler::initialize(ScheduleDAGMI *Dag) {
}
}
-
void PostGenericScheduler::registerRoots() {
Rem.CriticalPath = DAG->ExitSU.getDepth();
@@ -3302,7 +3306,7 @@ void PostGenericScheduler::schedNode(SUnit *SU, bool IsTopNode) {
}
ScheduleDAGMI *llvm::createGenericSchedPostRA(MachineSchedContext *C) {
- return new ScheduleDAGMI(C, make_unique<PostGenericScheduler>(C),
+ return new ScheduleDAGMI(C, llvm::make_unique<PostGenericScheduler>(C),
/*RemoveKillFlags=*/true);
}
@@ -3311,14 +3315,14 @@ ScheduleDAGMI *llvm::createGenericSchedPostRA(MachineSchedContext *C) {
//===----------------------------------------------------------------------===//
namespace {
+
/// \brief Order nodes by the ILP metric.
struct ILPOrder {
- const SchedDFSResult *DFSResult;
- const BitVector *ScheduledTrees;
+ const SchedDFSResult *DFSResult = nullptr;
+ const BitVector *ScheduledTrees = nullptr;
bool MaximizeILP;
- ILPOrder(bool MaxILP)
- : DFSResult(nullptr), ScheduledTrees(nullptr), MaximizeILP(MaxILP) {}
+ ILPOrder(bool MaxILP) : MaximizeILP(MaxILP) {}
/// \brief Apply a less-than relation on node priority.
///
@@ -3347,12 +3351,13 @@ struct ILPOrder {
/// \brief Schedule based on the ILP metric.
class ILPScheduler : public MachineSchedStrategy {
- ScheduleDAGMILive *DAG;
+ ScheduleDAGMILive *DAG = nullptr;
ILPOrder Cmp;
std::vector<SUnit*> ReadyQ;
+
public:
- ILPScheduler(bool MaximizeILP): DAG(nullptr), Cmp(MaximizeILP) {}
+ ILPScheduler(bool MaximizeILP) : Cmp(MaximizeILP) {}
void initialize(ScheduleDAGMI *dag) override {
assert(dag->hasVRegLiveness() && "ILPScheduler needs vreg liveness");
@@ -3405,14 +3410,16 @@ public:
std::push_heap(ReadyQ.begin(), ReadyQ.end(), Cmp);
}
};
-} // namespace
+
+} // end anonymous namespace
static ScheduleDAGInstrs *createILPMaxScheduler(MachineSchedContext *C) {
- return new ScheduleDAGMILive(C, make_unique<ILPScheduler>(true));
+ return new ScheduleDAGMILive(C, llvm::make_unique<ILPScheduler>(true));
}
static ScheduleDAGInstrs *createILPMinScheduler(MachineSchedContext *C) {
- return new ScheduleDAGMILive(C, make_unique<ILPScheduler>(false));
+ return new ScheduleDAGMILive(C, llvm::make_unique<ILPScheduler>(false));
}
+
static MachineSchedRegistry ILPMaxRegistry(
"ilpmax", "Schedule bottom-up for max ILP", createILPMaxScheduler);
static MachineSchedRegistry ILPMinRegistry(
@@ -3424,6 +3431,7 @@ static MachineSchedRegistry ILPMinRegistry(
#ifndef NDEBUG
namespace {
+
/// Apply a less-than relation on the node order, which corresponds to the
/// instruction order prior to scheduling. IsReverse implements greater-than.
template<bool IsReverse>
@@ -3444,11 +3452,12 @@ class InstructionShuffler : public MachineSchedStrategy {
// Using a less-than relation (SUnitOrder<false>) for the TopQ priority
// gives nodes with a higher number higher priority causing the latest
// instructions to be scheduled first.
- PriorityQueue<SUnit*, std::vector<SUnit*>, SUnitOrder<false> >
+ PriorityQueue<SUnit*, std::vector<SUnit*>, SUnitOrder<false>>
TopQ;
// When scheduling bottom-up, use greater-than as the queue priority.
- PriorityQueue<SUnit*, std::vector<SUnit*>, SUnitOrder<true> >
+ PriorityQueue<SUnit*, std::vector<SUnit*>, SUnitOrder<true>>
BottomQ;
+
public:
InstructionShuffler(bool alternate, bool topdown)
: IsAlternating(alternate), IsTopDown(topdown) {}
@@ -3492,15 +3501,18 @@ public:
BottomQ.push(SU);
}
};
-} // namespace
+
+} // end anonymous namespace
static ScheduleDAGInstrs *createInstructionShuffler(MachineSchedContext *C) {
bool Alternate = !ForceTopDown && !ForceBottomUp;
bool TopDown = !ForceBottomUp;
assert((TopDown || !ForceTopDown) &&
"-misched-topdown incompatible with -misched-bottomup");
- return new ScheduleDAGMILive(C, make_unique<InstructionShuffler>(Alternate, TopDown));
+ return new ScheduleDAGMILive(
+ C, llvm::make_unique<InstructionShuffler>(Alternate, TopDown));
}
+
static MachineSchedRegistry ShufflerRegistry(
"shuffle", "Shuffle machine instructions alternating directions",
createInstructionShuffler);
@@ -3518,8 +3530,7 @@ template<> struct GraphTraits<
template<>
struct DOTGraphTraits<ScheduleDAGMI*> : public DefaultDOTGraphTraits {
-
- DOTGraphTraits (bool isSimple=false) : DefaultDOTGraphTraits(isSimple) {}
+ DOTGraphTraits(bool isSimple = false) : DefaultDOTGraphTraits(isSimple) {}
static std::string getGraphName(const ScheduleDAG *G) {
return G->MF.getName();
@@ -3576,7 +3587,8 @@ struct DOTGraphTraits<ScheduleDAGMI*> : public DefaultDOTGraphTraits {
return Str;
}
};
-} // namespace llvm
+
+} // end namespace llvm
#endif // NDEBUG
/// viewGraph - Pop up a ghostview window with the reachable parts of the DAG
diff --git a/lib/CodeGen/MachineTraceMetrics.cpp b/lib/CodeGen/MachineTraceMetrics.cpp
index ef7e525e8165..998a9645e68b 100644
--- a/lib/CodeGen/MachineTraceMetrics.cpp
+++ b/lib/CodeGen/MachineTraceMetrics.cpp
@@ -1,4 +1,4 @@
-//===- lib/CodeGen/MachineTraceMetrics.cpp ----------------------*- C++ -*-===//
+//===- lib/CodeGen/MachineTraceMetrics.cpp --------------------------------===//
//
// The LLVM Compiler Infrastructure
//
@@ -7,21 +7,35 @@
//
//===----------------------------------------------------------------------===//
-#include "llvm/CodeGen/MachineTraceMetrics.h"
+#include "llvm/ADT/ArrayRef.h"
+#include "llvm/ADT/DenseMap.h"
+#include "llvm/ADT/Optional.h"
#include "llvm/ADT/PostOrderIterator.h"
+#include "llvm/ADT/SmallPtrSet.h"
+#include "llvm/ADT/SmallVector.h"
#include "llvm/ADT/SparseSet.h"
#include "llvm/CodeGen/MachineBasicBlock.h"
#include "llvm/CodeGen/MachineBranchProbabilityInfo.h"
+#include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/CodeGen/MachineInstr.h"
#include "llvm/CodeGen/MachineLoopInfo.h"
+#include "llvm/CodeGen/MachineOperand.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
-#include "llvm/CodeGen/Passes.h"
-#include "llvm/MC/MCSubtargetInfo.h"
+#include "llvm/CodeGen/MachineTraceMetrics.h"
+#include "llvm/MC/MCRegisterInfo.h"
+#include "llvm/Pass.h"
#include "llvm/Support/Debug.h"
+#include "llvm/Support/ErrorHandling.h"
#include "llvm/Support/Format.h"
#include "llvm/Support/raw_ostream.h"
#include "llvm/Target/TargetInstrInfo.h"
#include "llvm/Target/TargetRegisterInfo.h"
#include "llvm/Target/TargetSubtargetInfo.h"
+#include <algorithm>
+#include <cassert>
+#include <iterator>
+#include <tuple>
+#include <utility>
using namespace llvm;
@@ -37,9 +51,7 @@ INITIALIZE_PASS_DEPENDENCY(MachineLoopInfo)
INITIALIZE_PASS_END(MachineTraceMetrics,
"machine-trace-metrics", "Machine Trace Metrics", false, true)
-MachineTraceMetrics::MachineTraceMetrics()
- : MachineFunctionPass(ID), MF(nullptr), TII(nullptr), TRI(nullptr),
- MRI(nullptr), Loops(nullptr) {
+MachineTraceMetrics::MachineTraceMetrics() : MachineFunctionPass(ID) {
std::fill(std::begin(Ensembles), std::end(Ensembles), nullptr);
}
@@ -137,7 +149,6 @@ MachineTraceMetrics::getProcResourceCycles(unsigned MBBNum) const {
return makeArrayRef(ProcResourceCycles.data() + MBBNum * PRKinds, PRKinds);
}
-
//===----------------------------------------------------------------------===//
// Ensemble utility functions
//===----------------------------------------------------------------------===//
@@ -151,7 +162,7 @@ MachineTraceMetrics::Ensemble::Ensemble(MachineTraceMetrics *ct)
}
// Virtual destructor serves as an anchor.
-MachineTraceMetrics::Ensemble::~Ensemble() {}
+MachineTraceMetrics::Ensemble::~Ensemble() = default;
const MachineLoop*
MachineTraceMetrics::Ensemble::getLoopFor(const MachineBasicBlock *MBB) const {
@@ -297,6 +308,7 @@ static bool isExitingLoop(const MachineLoop *From, const MachineLoop *To) {
// MinInstrCountEnsemble - Pick the trace that executes the least number of
// instructions.
namespace {
+
class MinInstrCountEnsemble : public MachineTraceMetrics::Ensemble {
const char *getName() const override { return "MinInstr"; }
const MachineBasicBlock *pickTracePred(const MachineBasicBlock*) override;
@@ -306,7 +318,8 @@ public:
MinInstrCountEnsemble(MachineTraceMetrics *mtm)
: MachineTraceMetrics::Ensemble(mtm) {}
};
-}
+
+} // end anonymous namespace
// Select the preferred predecessor for MBB.
const MachineBasicBlock*
@@ -409,25 +422,30 @@ void MachineTraceMetrics::verifyAnalysis() const {
// revisit blocks.
namespace {
+
struct LoopBounds {
MutableArrayRef<MachineTraceMetrics::TraceBlockInfo> Blocks;
SmallPtrSet<const MachineBasicBlock*, 8> Visited;
const MachineLoopInfo *Loops;
- bool Downward;
+ bool Downward = false;
+
LoopBounds(MutableArrayRef<MachineTraceMetrics::TraceBlockInfo> blocks,
- const MachineLoopInfo *loops)
- : Blocks(blocks), Loops(loops), Downward(false) {}
+ const MachineLoopInfo *loops) : Blocks(blocks), Loops(loops) {}
};
-}
+
+} // end anonymous namespace
// Specialize po_iterator_storage in order to prune the post-order traversal so
// it is limited to the current loop and doesn't traverse the loop back edges.
namespace llvm {
+
template<>
class po_iterator_storage<LoopBounds, true> {
LoopBounds &LB;
+
public:
po_iterator_storage(LoopBounds &lb) : LB(lb) {}
+
void finishPostorder(const MachineBasicBlock*) {}
bool insertEdge(Optional<const MachineBasicBlock *> From,
@@ -452,7 +470,8 @@ public:
return LB.Visited.insert(To).second;
}
};
-}
+
+} // end namespace llvm
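
The specialization above gives the post-order walker a veto over edges: insertEdge() returns false for edges that leave the current loop or traverse a back edge, so blocks beyond them are never visited. A generic sketch of the same idea with an explicit edge filter (stand-in types, not the po_iterator API):

#include <functional>
#include <set>
#include <vector>

// Post-order walk over a successor graph with a pluggable edge filter;
// rejected edges are simply never crossed, pruning the traversal.
void postOrder(int N, const std::vector<std::vector<int>> &Succs,
               const std::function<bool(int, int)> &AcceptEdge,
               std::set<int> &Visited, std::vector<int> &Out) {
  Visited.insert(N);
  for (int S : Succs[N])
    if (!Visited.count(S) && AcceptEdge(N, S))
      postOrder(S, Succs, AcceptEdge, Visited, Out);
  Out.push_back(N);   // emit N after all accepted successors
}
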
/// Compute the trace through MBB.
void MachineTraceMetrics::Ensemble::computeTrace(const MachineBasicBlock *MBB) {
@@ -603,6 +622,7 @@ void MachineTraceMetrics::Ensemble::verify() const {
// A data dependency is represented as a defining MI and operand numbers on the
// defining and using MI.
namespace {
+
struct DataDep {
const MachineInstr *DefMI;
unsigned DefOp;
@@ -622,7 +642,8 @@ struct DataDep {
assert((++DefI).atEnd() && "Register has multiple defs");
}
};
-}
+
+} // end anonymous namespace
// Get the input data dependencies that must be ready before UseMI can issue.
// Return true if UseMI has any physreg operands.
@@ -678,17 +699,19 @@ static void getPHIDeps(const MachineInstr &UseMI,
// direction instructions are scanned, it could be the operand that defined the
// regunit, or the highest operand to read the regunit.
namespace {
+
struct LiveRegUnit {
unsigned RegUnit;
- unsigned Cycle;
- const MachineInstr *MI;
- unsigned Op;
+ unsigned Cycle = 0;
+ const MachineInstr *MI = nullptr;
+ unsigned Op = 0;
unsigned getSparseSetIndex() const { return RegUnit; }
- LiveRegUnit(unsigned RU) : RegUnit(RU), Cycle(0), MI(nullptr), Op(0) {}
+ LiveRegUnit(unsigned RU) : RegUnit(RU) {}
};
-}
+
+} // end anonymous namespace
// Identify physreg dependencies for UseMI, and update the live regunit
// tracking set when scanning instructions downwards.
@@ -922,7 +945,6 @@ static unsigned updatePhysDepsUpwards(const MachineInstr &MI, unsigned Height,
return Height;
}
-
typedef DenseMap<const MachineInstr *, unsigned> MIHeightMap;
// Push the height of DefMI upwards if required to match UseMI.
diff --git a/lib/CodeGen/MachineVerifier.cpp b/lib/CodeGen/MachineVerifier.cpp
index a98139f9e5af..d392c044bd71 100644
--- a/lib/CodeGen/MachineVerifier.cpp
+++ b/lib/CodeGen/MachineVerifier.cpp
@@ -260,8 +260,8 @@ namespace {
static char ID; // Pass ID, replacement for typeid
const std::string Banner;
- MachineVerifierPass(const std::string &banner = nullptr)
- : MachineFunctionPass(ID), Banner(banner) {
+ MachineVerifierPass(std::string banner = std::string())
+ : MachineFunctionPass(ID), Banner(std::move(banner)) {
initializeMachineVerifierPassPass(*PassRegistry::getPassRegistry());
}
@@ -528,7 +528,8 @@ void MachineVerifier::visitMachineFunctionBefore() {
lastIndex = SlotIndex();
regsReserved = MRI->getReservedRegs();
- markReachable(&MF->front());
+ if (!MF->empty())
+ markReachable(&MF->front());
// Build a set of the basic blocks in the function.
FunctionBlocks.clear();
@@ -548,7 +549,8 @@ void MachineVerifier::visitMachineFunctionBefore() {
// Check that the register use lists are sane.
MRI->verifyUseLists();
- verifyStackFrame();
+ if (!MF->empty())
+ verifyStackFrame();
}
// Does iterator point to a and b as the first two elements?
@@ -572,7 +574,7 @@ MachineVerifier::visitMachineBasicBlockBefore(const MachineBasicBlock *MBB) {
for (const auto &LI : MBB->liveins()) {
if (isAllocatable(LI.PhysReg) && !MBB->isEHPad() &&
MBB->getIterator() != MBB->getParent()->begin()) {
- report("MBB has allocable live-in, but isn't entry or landing-pad.", MBB);
+ report("MBB has allocatable live-in, but isn't entry or landing-pad.", MBB);
}
}
}
@@ -908,6 +910,14 @@ void MachineVerifier::visitMachineInstrBefore(const MachineInstr *MI) {
}
}
+ // Generic loads and stores must have a single MachineMemOperand
+ // describing that access.
+ if ((MI->getOpcode() == TargetOpcode::G_LOAD ||
+ MI->getOpcode() == TargetOpcode::G_STORE) &&
+ !MI->hasOneMemOperand())
+ report("Generic instruction accessing memory must have one mem operand",
+ MI);
+
StringRef ErrorInfo;
if (!TII->verifyInstruction(*MI, ErrorInfo))
report(ErrorInfo.data(), MI);
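
The new verifier rule above can be read in isolation: any generic load or store must carry exactly one MachineMemOperand describing its access. A toy model of the check (stand-in types, not the verifier's real interfaces):

enum OpcodeSketch { G_LOAD, G_STORE, G_ADD };

struct MISketch {          // hypothetical MachineInstr stand-in
  OpcodeSketch Op;
  int NumMemOperands;
};

// Returns false where the verifier would report "Generic instruction
// accessing memory must have one mem operand".
bool verifyOneMemOperand(const MISketch &MI) {
  if ((MI.Op == G_LOAD || MI.Op == G_STORE) && MI.NumMemOperands != 1)
    return false;
  return true;
}
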
@@ -2047,23 +2057,14 @@ void MachineVerifier::verifyStackFrame() {
// Update stack state by checking contents of MBB.
for (const auto &I : *MBB) {
if (I.getOpcode() == FrameSetupOpcode) {
- // The first operand of a FrameOpcode should be i32.
- int Size = I.getOperand(0).getImm();
- assert(Size >= 0 &&
- "Value should be non-negative in FrameSetup and FrameDestroy.\n");
-
if (BBState.ExitIsSetup)
report("FrameSetup is after another FrameSetup", &I);
- BBState.ExitValue -= Size;
+ BBState.ExitValue -= TII->getFrameSize(I);
BBState.ExitIsSetup = true;
}
if (I.getOpcode() == FrameDestroyOpcode) {
- // The first operand of a FrameOpcode should be i32.
- int Size = I.getOperand(0).getImm();
- assert(Size >= 0 &&
- "Value should be non-negative in FrameSetup and FrameDestroy.\n");
-
+ int Size = TII->getFrameSize(I);
if (!BBState.ExitIsSetup)
report("FrameDestroy is not after a FrameSetup", &I);
int AbsSPAdj = BBState.ExitValue < 0 ? -BBState.ExitValue :
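
This hunk and the PrologEpilogInserter hunks below replace open-coded getOperand(0).getImm() reads with a TII->getFrameSize() call, so the operand layout of the frame pseudos is assumed in one place. A sketch of what such an accessor centralizes (hypothetical stand-in type, not the TargetInstrInfo signature):

#include <cassert>
#include <cstdint>

struct FrameInstrSketch {     // hypothetical MachineInstr stand-in
  int64_t FrameSizeImm;       // immediate operand 0 of the frame pseudo
};

// One accessor instead of N open-coded reads: callers no longer repeat
// the operand index or the non-negativity assertion.
inline int64_t getFrameSize(const FrameInstrSketch &I) {
  assert(I.FrameSizeImm >= 0 && "frame size must be non-negative");
  return I.FrameSizeImm;
}
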
diff --git a/lib/CodeGen/PatchableFunction.cpp b/lib/CodeGen/PatchableFunction.cpp
index ad9166f1ed23..00e72971a01e 100644
--- a/lib/CodeGen/PatchableFunction.cpp
+++ b/lib/CodeGen/PatchableFunction.cpp
@@ -75,7 +75,7 @@ bool PatchableFunction::runOnMachineFunction(MachineFunction &MF) {
.addImm(FirstActualI->getOpcode());
for (auto &MO : FirstActualI->operands())
- MIB.addOperand(MO);
+ MIB.add(MO);
FirstActualI->eraseFromParent();
MF.ensureAlignment(4);
diff --git a/lib/CodeGen/PostRASchedulerList.cpp b/lib/CodeGen/PostRASchedulerList.cpp
index 6081916a6a82..61dccdde8f1d 100644
--- a/lib/CodeGen/PostRASchedulerList.cpp
+++ b/lib/CodeGen/PostRASchedulerList.cpp
@@ -253,7 +253,7 @@ void SchedulePostRATDList::exitRegion() {
#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
/// dumpSchedule - dump the scheduled Sequence.
-void SchedulePostRATDList::dumpSchedule() const {
+LLVM_DUMP_METHOD void SchedulePostRATDList::dumpSchedule() const {
for (unsigned i = 0, e = Sequence.size(); i != e; i++) {
if (SUnit *SU = Sequence[i])
SU->dump(this);
diff --git a/lib/CodeGen/PrologEpilogInserter.cpp b/lib/CodeGen/PrologEpilogInserter.cpp
index 5fca7fa5536b..1354009794cb 100644
--- a/lib/CodeGen/PrologEpilogInserter.cpp
+++ b/lib/CodeGen/PrologEpilogInserter.cpp
@@ -265,11 +265,8 @@ void PEI::calculateCallFrameInfo(MachineFunction &Fn) {
std::vector<MachineBasicBlock::iterator> FrameSDOps;
for (MachineFunction::iterator BB = Fn.begin(), E = Fn.end(); BB != E; ++BB)
for (MachineBasicBlock::iterator I = BB->begin(); I != BB->end(); ++I)
- if (I->getOpcode() == FrameSetupOpcode ||
- I->getOpcode() == FrameDestroyOpcode) {
- assert(I->getNumOperands() >= 1 && "Call Frame Setup/Destroy Pseudo"
- " instructions should have a single immediate argument!");
- unsigned Size = I->getOperand(0).getImm();
+ if (TII.isFrameInstr(*I)) {
+ unsigned Size = TII.getFrameSize(*I);
if (Size > MaxCallFrameSize) MaxCallFrameSize = Size;
AdjustsStack = true;
FrameSDOps.push_back(I);
@@ -336,7 +333,7 @@ static void assignCalleeSavedSpillSlots(MachineFunction &F,
return;
const TargetRegisterInfo *RegInfo = F.getSubtarget().getRegisterInfo();
- const MCPhysReg *CSRegs = RegInfo->getCalleeSavedRegs(&F);
+ const MCPhysReg *CSRegs = F.getRegInfo().getCalleeSavedRegs();
std::vector<CalleeSavedInfo> CSI;
for (unsigned i = 0; CSRegs[i]; ++i) {
@@ -1049,8 +1046,6 @@ void PEI::replaceFrameIndices(MachineBasicBlock *BB, MachineFunction &Fn,
const TargetInstrInfo &TII = *Fn.getSubtarget().getInstrInfo();
const TargetRegisterInfo &TRI = *Fn.getSubtarget().getRegisterInfo();
const TargetFrameLowering *TFI = Fn.getSubtarget().getFrameLowering();
- unsigned FrameSetupOpcode = TII.getCallFrameSetupOpcode();
- unsigned FrameDestroyOpcode = TII.getCallFrameDestroyOpcode();
if (RS && FrameIndexEliminationScavenging)
RS->enterBasicBlock(*BB);
@@ -1059,11 +1054,9 @@ void PEI::replaceFrameIndices(MachineBasicBlock *BB, MachineFunction &Fn,
for (MachineBasicBlock::iterator I = BB->begin(); I != BB->end(); ) {
- if (I->getOpcode() == FrameSetupOpcode ||
- I->getOpcode() == FrameDestroyOpcode) {
- InsideCallSequence = (I->getOpcode() == FrameSetupOpcode);
+ if (TII.isFrameInstr(*I)) {
+ InsideCallSequence = TII.isFrameSetup(*I);
SPAdj += TII.getSPAdjust(*I);
-
I = TFI->eliminateCallFramePseudoInstr(Fn, *BB, I);
continue;
}
@@ -1237,4 +1230,6 @@ doScavengeFrameVirtualRegs(MachineFunction &MF, RegScavenger *RS) {
++I;
}
}
+
+ MF.getProperties().set(MachineFunctionProperties::Property::NoVRegs);
}
diff --git a/lib/CodeGen/PseudoSourceValue.cpp b/lib/CodeGen/PseudoSourceValue.cpp
index 804a4c3dad66..b29e62bf1aa3 100644
--- a/lib/CodeGen/PseudoSourceValue.cpp
+++ b/lib/CodeGen/PseudoSourceValue.cpp
@@ -29,7 +29,10 @@ PseudoSourceValue::PseudoSourceValue(PSVKind Kind) : Kind(Kind) {}
PseudoSourceValue::~PseudoSourceValue() {}
void PseudoSourceValue::printCustom(raw_ostream &O) const {
- O << PSVNames[Kind];
+ if (Kind < TargetCustom)
+ O << PSVNames[Kind];
+ else
+ O << "TargetCustom" << Kind;
}
bool PseudoSourceValue::isConstant(const MachineFrameInfo *) const {
diff --git a/lib/CodeGen/RegAllocBasic.cpp b/lib/CodeGen/RegAllocBasic.cpp
index a558e371ad4c..a87fed3a687e 100644
--- a/lib/CodeGen/RegAllocBasic.cpp
+++ b/lib/CodeGen/RegAllocBasic.cpp
@@ -176,8 +176,6 @@ bool RABasic::spillInterferences(LiveInterval &VirtReg, unsigned PhysReg,
for (MCRegUnitIterator Units(PhysReg, TRI); Units.isValid(); ++Units) {
LiveIntervalUnion::Query &Q = Matrix->query(VirtReg, *Units);
Q.collectInterferingVRegs();
- if (Q.seenUnspillableVReg())
- return false;
for (unsigned i = Q.interferingVRegs().size(); i; --i) {
LiveInterval *Intf = Q.interferingVRegs()[i - 1];
if (!Intf->isSpillable() || Intf->weight > VirtReg.weight)
diff --git a/lib/CodeGen/RegAllocGreedy.cpp b/lib/CodeGen/RegAllocGreedy.cpp
index c47cfb1b986f..06500289c971 100644
--- a/lib/CodeGen/RegAllocGreedy.cpp
+++ b/lib/CodeGen/RegAllocGreedy.cpp
@@ -29,8 +29,10 @@
#include "llvm/CodeGen/LiveStackAnalysis.h"
#include "llvm/CodeGen/MachineBlockFrequencyInfo.h"
#include "llvm/CodeGen/MachineDominators.h"
+#include "llvm/CodeGen/MachineFrameInfo.h"
#include "llvm/CodeGen/MachineFunctionPass.h"
#include "llvm/CodeGen/MachineLoopInfo.h"
+#include "llvm/CodeGen/MachineOptimizationRemarkEmitter.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
#include "llvm/CodeGen/Passes.h"
#include "llvm/CodeGen/RegAllocRegistry.h"
@@ -125,6 +127,7 @@ class RAGreedy : public MachineFunctionPass,
MachineBlockFrequencyInfo *MBFI;
MachineDominatorTree *DomTree;
MachineLoopInfo *Loops;
+ MachineOptimizationRemarkEmitter *ORE;
EdgeBundles *Bundles;
SpillPlacement *SpillPlacer;
LiveDebugVariables *DebugVars;
@@ -419,6 +422,20 @@ private:
void collectHintInfo(unsigned, HintsInfo &);
bool isUnusedCalleeSavedReg(unsigned PhysReg) const;
+
+ /// Compute and report the number of spills and reloads for a loop.
+ void reportNumberOfSpillsReloads(MachineLoop *L, unsigned &Reloads,
+ unsigned &FoldedReloads, unsigned &Spills,
+ unsigned &FoldedSpills);
+
+ /// Report the number of spills and reloads for each loop.
+ void reportNumberOfSpillsReloads() {
+ for (MachineLoop *L : *Loops) {
+ unsigned Reloads, FoldedReloads, Spills, FoldedSpills;
+ reportNumberOfSpillsReloads(L, Reloads, FoldedReloads, Spills,
+ FoldedSpills);
+ }
+ }
};
} // end anonymous namespace
@@ -439,6 +456,7 @@ INITIALIZE_PASS_DEPENDENCY(VirtRegMap)
INITIALIZE_PASS_DEPENDENCY(LiveRegMatrix)
INITIALIZE_PASS_DEPENDENCY(EdgeBundles)
INITIALIZE_PASS_DEPENDENCY(SpillPlacement)
+INITIALIZE_PASS_DEPENDENCY(MachineOptimizationRemarkEmitterPass)
INITIALIZE_PASS_END(RAGreedy, "greedy",
"Greedy Register Allocator", false, false)
@@ -490,6 +508,7 @@ void RAGreedy::getAnalysisUsage(AnalysisUsage &AU) const {
AU.addPreserved<LiveRegMatrix>();
AU.addRequired<EdgeBundles>();
AU.addRequired<SpillPlacement>();
+ AU.addRequired<MachineOptimizationRemarkEmitterPass>();
MachineFunctionPass::getAnalysisUsage(AU);
}
@@ -679,7 +698,7 @@ unsigned RAGreedy::canReassign(LiveInterval &VirtReg, unsigned PrevReg) {
MCRegUnitIterator Units(PhysReg, TRI);
for (; Units.isValid(); ++Units) {
// Instantiate a "subquery", not to be confused with the Queries array.
- LiveIntervalUnion::Query subQ(&VirtReg, &Matrix->getLiveUnions()[*Units]);
+ LiveIntervalUnion::Query subQ(VirtReg, Matrix->getLiveUnions()[*Units]);
if (subQ.checkInterference())
break;
}
@@ -830,7 +849,11 @@ void RAGreedy::evictInterference(LiveInterval &VirtReg, unsigned PhysReg,
SmallVector<LiveInterval*, 8> Intfs;
for (MCRegUnitIterator Units(PhysReg, TRI); Units.isValid(); ++Units) {
LiveIntervalUnion::Query &Q = Matrix->query(VirtReg, *Units);
- assert(Q.seenAllInterferences() && "Didn't check all interfererences.");
+ // We usually have the interfering VRegs cached, so collectInterferingVRegs()
+ // should be fast. We may need to recalculate when different physregs
+ // overlap the same register unit, since different SubRanges may then have
+ // been queried against it.
+ Q.collectInterferingVRegs();
ArrayRef<LiveInterval*> IVR = Q.interferingVRegs();
Intfs.append(IVR.begin(), IVR.end());
}
@@ -2611,6 +2634,69 @@ unsigned RAGreedy::selectOrSplitImpl(LiveInterval &VirtReg,
return 0;
}
+void RAGreedy::reportNumberOfSpillsReloads(MachineLoop *L, unsigned &Reloads,
+ unsigned &FoldedReloads,
+ unsigned &Spills,
+ unsigned &FoldedSpills) {
+ Reloads = 0;
+ FoldedReloads = 0;
+ Spills = 0;
+ FoldedSpills = 0;
+
+ // Sum up the spill and reloads in subloops.
+ for (MachineLoop *SubLoop : *L) {
+ unsigned SubReloads;
+ unsigned SubFoldedReloads;
+ unsigned SubSpills;
+ unsigned SubFoldedSpills;
+
+ reportNumberOfSpillsReloads(SubLoop, SubReloads, SubFoldedReloads,
+ SubSpills, SubFoldedSpills);
+ Reloads += SubReloads;
+ FoldedReloads += SubFoldedReloads;
+ Spills += SubSpills;
+ FoldedSpills += SubFoldedSpills;
+ }
+
+ const MachineFrameInfo &MFI = MF->getFrameInfo();
+ const TargetInstrInfo *TII = MF->getSubtarget().getInstrInfo();
+ int FI;
+
+ for (MachineBasicBlock *MBB : L->getBlocks())
+ // Handle blocks that were not included in subloops.
+ if (Loops->getLoopFor(MBB) == L)
+ for (MachineInstr &MI : *MBB) {
+ const MachineMemOperand *MMO;
+
+ if (TII->isLoadFromStackSlot(MI, FI) && MFI.isSpillSlotObjectIndex(FI))
+ ++Reloads;
+ else if (TII->hasLoadFromStackSlot(MI, MMO, FI) &&
+ MFI.isSpillSlotObjectIndex(FI))
+ ++FoldedReloads;
+ else if (TII->isStoreToStackSlot(MI, FI) &&
+ MFI.isSpillSlotObjectIndex(FI))
+ ++Spills;
+ else if (TII->hasStoreToStackSlot(MI, MMO, FI) &&
+ MFI.isSpillSlotObjectIndex(FI))
+ ++FoldedSpills;
+ }
+
+ if (Reloads || FoldedReloads || Spills || FoldedSpills) {
+ using namespace ore;
+ MachineOptimizationRemarkMissed R(DEBUG_TYPE, "LoopSpillReload",
+ L->getStartLoc(), L->getHeader());
+ if (Spills)
+ R << NV("NumSpills", Spills) << " spills ";
+ if (FoldedSpills)
+ R << NV("NumFoldedSpills", FoldedSpills) << " folded spills ";
+ if (Reloads)
+ R << NV("NumReloads", Reloads) << " reloads ";
+ if (FoldedReloads)
+ R << NV("NumFoldedReloads", FoldedReloads) << " folded reloads ";
+ ORE->emit(R << "generated in loop");
+ }
+}
+
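
reportNumberOfSpillsReloads recurses into subloops first and folds their counts into the parent, then scans only the blocks the current loop owns directly, so no block is counted twice. A minimal sketch of that accumulation shape (hypothetical loop-tree type, counts reduced to one number):

#include <vector>

struct LoopSketch {                   // hypothetical loop-tree node
  std::vector<LoopSketch *> SubLoops;
  unsigned OwnSpills = 0;             // spills in blocks owned directly
};

// Post-order accumulation: a loop's total includes all of its subloops.
unsigned totalSpills(const LoopSketch *L) {
  unsigned N = L->OwnSpills;
  for (const LoopSketch *Sub : L->SubLoops)
    N += totalSpills(Sub);
  return N;
}
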
bool RAGreedy::runOnMachineFunction(MachineFunction &mf) {
DEBUG(dbgs() << "********** GREEDY REGISTER ALLOCATION **********\n"
<< "********** Function: " << mf.getName() << '\n');
@@ -2633,6 +2719,7 @@ bool RAGreedy::runOnMachineFunction(MachineFunction &mf) {
Indexes = &getAnalysis<SlotIndexes>();
MBFI = &getAnalysis<MachineBlockFrequencyInfo>();
DomTree = &getAnalysis<MachineDominatorTree>();
+ ORE = &getAnalysis<MachineOptimizationRemarkEmitterPass>().getORE();
SpillerInstance.reset(createInlineSpiller(*this, *MF, *VRM));
Loops = &getAnalysis<MachineLoopInfo>();
Bundles = &getAnalysis<EdgeBundles>();
@@ -2658,6 +2745,7 @@ bool RAGreedy::runOnMachineFunction(MachineFunction &mf) {
allocatePhysRegs();
tryHintsRecoloring();
postOptimization();
+ reportNumberOfSpillsReloads();
releaseMemory();
return true;
diff --git a/lib/CodeGen/RegAllocPBQP.cpp b/lib/CodeGen/RegAllocPBQP.cpp
index 101b30bf3b65..3b5964eef55e 100644
--- a/lib/CodeGen/RegAllocPBQP.cpp
+++ b/lib/CodeGen/RegAllocPBQP.cpp
@@ -1,4 +1,4 @@
-//===------ RegAllocPBQP.cpp ---- PBQP Register Allocator -------*- C++ -*-===//
+//===- RegAllocPBQP.cpp ---- PBQP Register Allocator ----------------------===//
//
// The LLVM Compiler Infrastructure
//
@@ -29,34 +29,61 @@
//
//===----------------------------------------------------------------------===//
-#include "llvm/CodeGen/RegAllocPBQP.h"
#include "RegisterCoalescer.h"
#include "Spiller.h"
+#include "llvm/ADT/ArrayRef.h"
+#include "llvm/ADT/BitVector.h"
+#include "llvm/ADT/DenseMap.h"
+#include "llvm/ADT/DenseSet.h"
+#include "llvm/ADT/SmallPtrSet.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/ADT/STLExtras.h"
+#include "llvm/ADT/StringRef.h"
#include "llvm/Analysis/AliasAnalysis.h"
#include "llvm/CodeGen/CalcSpillWeights.h"
+#include "llvm/CodeGen/LiveInterval.h"
#include "llvm/CodeGen/LiveIntervalAnalysis.h"
#include "llvm/CodeGen/LiveRangeEdit.h"
#include "llvm/CodeGen/LiveStackAnalysis.h"
#include "llvm/CodeGen/MachineBlockFrequencyInfo.h"
#include "llvm/CodeGen/MachineDominators.h"
+#include "llvm/CodeGen/MachineFunction.h"
#include "llvm/CodeGen/MachineFunctionPass.h"
#include "llvm/CodeGen/MachineLoopInfo.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/CodeGen/PBQP/Graph.h"
+#include "llvm/CodeGen/PBQP/Solution.h"
+#include "llvm/CodeGen/PBQPRAConstraint.h"
+#include "llvm/CodeGen/RegAllocPBQP.h"
#include "llvm/CodeGen/RegAllocRegistry.h"
+#include "llvm/CodeGen/SlotIndexes.h"
#include "llvm/CodeGen/VirtRegMap.h"
+#include "llvm/IR/Function.h"
#include "llvm/IR/Module.h"
+#include "llvm/MC/MCRegisterInfo.h"
+#include "llvm/Pass.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/Compiler.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/FileSystem.h"
#include "llvm/Support/Printable.h"
#include "llvm/Support/raw_ostream.h"
-#include "llvm/Target/TargetInstrInfo.h"
+#include "llvm/Target/TargetRegisterInfo.h"
#include "llvm/Target/TargetSubtargetInfo.h"
+#include <algorithm>
+#include <cassert>
+#include <cstddef>
#include <limits>
+#include <map>
#include <memory>
#include <queue>
#include <set>
#include <sstream>
+#include <string>
+#include <system_error>
+#include <tuple>
#include <vector>
+#include <utility>
using namespace llvm;
@@ -86,7 +113,6 @@ namespace {
/// Programming problems.
class RegAllocPBQP : public MachineFunctionPass {
public:
-
static char ID;
/// Construct a PBQP register allocator.
@@ -113,7 +139,6 @@ public:
}
private:
-
typedef std::map<const LiveInterval*, unsigned> LI2NodeMap;
typedef std::vector<const LiveInterval*> Node2LIMap;
typedef std::vector<unsigned> AllowedSet;
@@ -187,7 +212,6 @@ public:
/// @brief Add interference edges between overlapping vregs.
class Interference : public PBQPRAConstraint {
private:
-
typedef const PBQP::RegAlloc::AllowedRegVector* AllowedRegVecPtr;
typedef std::pair<AllowedRegVecPtr, AllowedRegVecPtr> IKey;
typedef DenseMap<IKey, PBQPRAGraph::MatrixPtr> IMatrixCache;
@@ -276,7 +300,6 @@ private:
}
public:
-
void apply(PBQPRAGraph &G) override {
// The following is loosely based on the linear scan algorithm introduced in
// "Linear Scan Register Allocation" by Poletto and Sarkar. This version
@@ -363,7 +386,6 @@ public:
}
private:
-
// Create an Interference edge and add it to the graph, unless it is
// a null matrix, meaning the nodes' allowed registers do not have any
// interference. This case occurs frequently between integer and floating
@@ -372,7 +394,6 @@ private:
bool createInterferenceEdge(PBQPRAGraph &G,
PBQPRAGraph::NodeId NId, PBQPRAGraph::NodeId MId,
IMatrixCache &C) {
-
const TargetRegisterInfo &TRI =
*G.getMetadata().MF.getSubtarget().getRegisterInfo();
const auto &NRegs = G.getNodeMetadata(NId).getAllowedRegs();
@@ -409,7 +430,6 @@ private:
}
};
-
class Coalescing : public PBQPRAConstraint {
public:
void apply(PBQPRAGraph &G) override {
@@ -421,7 +441,6 @@ public:
// gives the Ok.
for (const auto &MBB : MF) {
for (const auto &MI : MBB) {
-
// Skip not-coalescable or already coalesced copies.
if (!CP.setRegisters(&MI) || CP.getSrcReg() == CP.getDstReg())
continue;
@@ -479,7 +498,6 @@ public:
}
private:
-
void addVirtRegCoalesce(
PBQPRAGraph::RawMatrix &CostMat,
const PBQPRAGraph::NodeMetadata::AllowedRegVector &Allowed1,
@@ -496,14 +514,15 @@ private:
}
}
}
-
};
-} // End anonymous namespace.
+} // end anonymous namespace
// Out-of-line destructor/anchor for PBQPRAConstraint.
-PBQPRAConstraint::~PBQPRAConstraint() {}
+PBQPRAConstraint::~PBQPRAConstraint() = default;
+
void PBQPRAConstraint::anchor() {}
+
void PBQPRAConstraintList::anchor() {}
void RegAllocPBQP::getAnalysisUsage(AnalysisUsage &au) const {
@@ -554,7 +573,7 @@ void RegAllocPBQP::findVRegIntervalsToAlloc(const MachineFunction &MF,
static bool isACalleeSavedRegister(unsigned reg, const TargetRegisterInfo &TRI,
const MachineFunction &MF) {
- const MCPhysReg *CSR = TRI.getCalleeSavedRegs(&MF);
+ const MCPhysReg *CSR = MF.getRegInfo().getCalleeSavedRegs();
for (unsigned i = 0; CSR[i] != 0; ++i)
if (TRI.regsOverlap(reg, CSR[i]))
return true;
@@ -777,7 +796,6 @@ bool RegAllocPBQP::runOnMachineFunction(MachineFunction &MF) {
// If there are non-empty intervals allocate them using pbqp.
if (!VRegsToAlloc.empty()) {
-
const TargetSubtargetInfo &Subtarget = MF.getSubtarget();
std::unique_ptr<PBQPRAConstraintList> ConstraintsRoot =
llvm::make_unique<PBQPRAConstraintList>();
@@ -840,7 +858,8 @@ static Printable PrintNodeInfo(PBQP::RegAlloc::PBQPRAGraph::NodeId NId,
});
}
-void PBQP::RegAlloc::PBQPRAGraph::dump(raw_ostream &OS) const {
+#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
+LLVM_DUMP_METHOD void PBQP::RegAlloc::PBQPRAGraph::dump(raw_ostream &OS) const {
for (auto NId : nodeIds()) {
const Vector &Costs = getNodeCosts(NId);
assert(Costs.getLength() != 0 && "Empty vector in graph.");
@@ -861,7 +880,10 @@ void PBQP::RegAlloc::PBQPRAGraph::dump(raw_ostream &OS) const {
}
}
-LLVM_DUMP_METHOD void PBQP::RegAlloc::PBQPRAGraph::dump() const { dump(dbgs()); }
+LLVM_DUMP_METHOD void PBQP::RegAlloc::PBQPRAGraph::dump() const {
+ dump(dbgs());
+}
+#endif
void PBQP::RegAlloc::PBQPRAGraph::printDot(raw_ostream &OS) const {
OS << "graph {\n";
diff --git a/lib/CodeGen/RegUsageInfoCollector.cpp b/lib/CodeGen/RegUsageInfoCollector.cpp
index ece44c28e9ed..855aa37ff3c3 100644
--- a/lib/CodeGen/RegUsageInfoCollector.cpp
+++ b/lib/CodeGen/RegUsageInfoCollector.cpp
@@ -103,9 +103,27 @@ bool RegUsageInfoCollector::runOnMachineFunction(MachineFunction &MF) {
DEBUG(dbgs() << "Clobbered Registers: ");
- for (unsigned PReg = 1, PRegE = TRI->getNumRegs(); PReg < PRegE; ++PReg)
- if (MRI->isPhysRegModified(PReg, true))
- RegMask[PReg / 32] &= ~(1u << PReg % 32);
+ const BitVector &UsedPhysRegsMask = MRI->getUsedPhysRegsMask();
+ auto SetRegAsDefined = [&RegMask] (unsigned Reg) {
+ RegMask[Reg / 32] &= ~(1u << Reg % 32);
+ };
+ // Scan all the physical registers. When a register is defined in the current
+ // function, mark it and all its aliasing registers as defined in the regmask.
+ for (unsigned PReg = 1, PRegE = TRI->getNumRegs(); PReg < PRegE; ++PReg) {
+ // If a register is in the UsedPhysRegsMask set, mark it as defined.
+ // All its aliases will also be in the set, so we can skip marking
+ // the aliases here.
+ if (UsedPhysRegsMask.test(PReg)) {
+ SetRegAsDefined(PReg);
+ continue;
+ }
+ // If a register is defined by an instruction, mark it as defined together
+ // with all its aliases.
+ if (!MRI->def_empty(PReg)) {
+ for (MCRegAliasIterator AI(PReg, TRI, true); AI.isValid(); ++AI)
+ SetRegAsDefined(*AI);
+ }
+ }
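
The regmask being edited here packs one bit per physical register into 32-bit words: register R maps to word R / 32, bit R % 32, and clearing the bit marks R as clobbered. A self-contained illustration of the indexing, mirroring the SetRegAsDefined lambda above:

#include <cstdint>
#include <vector>

// Clear the bit for Reg, marking it clobbered (defined) in the mask.
void setRegAsDefined(std::vector<uint32_t> &RegMask, unsigned Reg) {
  RegMask[Reg / 32] &= ~(1u << (Reg % 32));
}

// A set bit means the register is preserved across the call.
bool isPreserved(const std::vector<uint32_t> &RegMask, unsigned Reg) {
  return (RegMask[Reg / 32] >> (Reg % 32)) & 1u;
}
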
if (!TargetFrameLowering::isSafeForNoCSROpt(F)) {
const uint32_t *CallPreservedMask =
diff --git a/lib/CodeGen/RegisterClassInfo.cpp b/lib/CodeGen/RegisterClassInfo.cpp
index 178fa18ac5a6..82a3bd9a0bd1 100644
--- a/lib/CodeGen/RegisterClassInfo.cpp
+++ b/lib/CodeGen/RegisterClassInfo.cpp
@@ -1,4 +1,4 @@
-//===-- RegisterClassInfo.cpp - Dynamic Register Class Info ---------------===//
+//===- RegisterClassInfo.cpp - Dynamic Register Class Info ----------------===//
//
// The LLVM Compiler Infrastructure
//
@@ -14,12 +14,22 @@
//
//===----------------------------------------------------------------------===//
-#include "llvm/CodeGen/RegisterClassInfo.h"
+#include "llvm/ADT/ArrayRef.h"
+#include "llvm/ADT/BitVector.h"
+#include "llvm/ADT/SmallVector.h"
#include "llvm/CodeGen/MachineFunction.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/CodeGen/RegisterClassInfo.h"
+#include "llvm/MC/MCRegisterInfo.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/raw_ostream.h"
+#include "llvm/Target/TargetFrameLowering.h"
+#include "llvm/Target/TargetRegisterInfo.h"
+#include "llvm/Target/TargetSubtargetInfo.h"
+#include <algorithm>
+#include <cassert>
+#include <cstdint>
using namespace llvm;
@@ -29,8 +39,7 @@ static cl::opt<unsigned>
StressRA("stress-regalloc", cl::Hidden, cl::init(0), cl::value_desc("N"),
cl::desc("Limit all regclasses to N registers"));
-RegisterClassInfo::RegisterClassInfo()
- : Tag(0), MF(nullptr), TRI(nullptr), CalleeSaved(nullptr) {}
+RegisterClassInfo::RegisterClassInfo() = default;
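
Several constructors in this diff shrink the same way: default member initializers in the class definition take over from the constructor init list, and the constructor itself becomes "= default". A minimal sketch of the pattern (hypothetical names):

struct ContextSketch {
  void *MF = nullptr;     // default member initializers replace the
  void *TRI = nullptr;    // long constructor init list, so the
  unsigned Tag = 0;       // handwritten constructor can disappear:
  bool Dirty = false;     // ContextSketch() = default; suffices.
};
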
void RegisterClassInfo::runOnMachineFunction(const MachineFunction &mf) {
bool Update = false;
@@ -48,18 +57,20 @@ void RegisterClassInfo::runOnMachineFunction(const MachineFunction &mf) {
// Does this MF have different CSRs?
assert(TRI && "no register info set");
- const MCPhysReg *CSR = TRI->getCalleeSavedRegs(MF);
- if (Update || CSR != CalleeSaved) {
- // Build a CSRNum map. Every CSR alias gets an entry pointing to the last
+
+ // Get the callee saved registers.
+ const MCPhysReg *CSR = MF->getRegInfo().getCalleeSavedRegs();
+ if (Update || CSR != CalleeSavedRegs) {
+ // Build a CSRAlias map. Every CSR alias saves the last
// overlapping CSR.
- CSRNum.clear();
- CSRNum.resize(TRI->getNumRegs(), 0);
- for (unsigned N = 0; unsigned Reg = CSR[N]; ++N)
- for (MCRegAliasIterator AI(Reg, TRI, true); AI.isValid(); ++AI)
- CSRNum[*AI] = N + 1; // 0 means no CSR, 1 means CalleeSaved[0], ...
+ CalleeSavedAliases.resize(TRI->getNumRegs(), 0);
+ for (const MCPhysReg *I = CSR; *I; ++I)
+ for (MCRegAliasIterator AI(*I, TRI, true); AI.isValid(); ++AI)
+ CalleeSavedAliases[*AI] = *I;
+
Update = true;
}
- CalleeSaved = CSR;
+ CalleeSavedRegs = CSR;
// Different reserved registers?
const BitVector &RR = MF->getRegInfo().getReservedRegs();
@@ -103,7 +114,7 @@ void RegisterClassInfo::compute(const TargetRegisterClass *RC) const {
unsigned Cost = TRI->getCostPerUse(PhysReg);
MinCost = std::min(MinCost, Cost);
- if (CSRNum[PhysReg])
+ if (CalleeSavedAliases[PhysReg])
// PhysReg aliases a CSR, save it for later.
CSRAlias.push_back(PhysReg);
else {
@@ -114,7 +125,7 @@ void RegisterClassInfo::compute(const TargetRegisterClass *RC) const {
}
}
RCI.NumRegs = N + CSRAlias.size();
- assert (RCI.NumRegs <= NumRegs && "Allocation order larger than regclass");
+ assert(RCI.NumRegs <= NumRegs && "Allocation order larger than regclass");
// CSR aliases go after the volatile registers, preserve the target's order.
for (unsigned i = 0, e = CSRAlias.size(); i != e; ++i) {
@@ -156,9 +167,8 @@ void RegisterClassInfo::compute(const TargetRegisterClass *RC) const {
unsigned RegisterClassInfo::computePSetLimit(unsigned Idx) const {
const TargetRegisterClass *RC = nullptr;
unsigned NumRCUnits = 0;
- for (TargetRegisterInfo::regclass_iterator
- RI = TRI->regclass_begin(), RE = TRI->regclass_end(); RI != RE; ++RI) {
- const int *PSetID = TRI->getRegClassPressureSets(*RI);
+ for (const TargetRegisterClass *C : TRI->regclasses()) {
+ const int *PSetID = TRI->getRegClassPressureSets(C);
for (; *PSetID != -1; ++PSetID) {
if ((unsigned)*PSetID == Idx)
break;
@@ -168,9 +178,9 @@ unsigned RegisterClassInfo::computePSetLimit(unsigned Idx) const {
// Found a register class that counts against this pressure set.
// For efficiency, only compute the set order for the largest set.
- unsigned NUnits = TRI->getRegClassWeight(*RI).WeightLimit;
+ unsigned NUnits = TRI->getRegClassWeight(C).WeightLimit;
if (!RC || NUnits > NumRCUnits) {
- RC = *RI;
+ RC = C;
NumRCUnits = NUnits;
}
}
diff --git a/lib/CodeGen/RegisterCoalescer.cpp b/lib/CodeGen/RegisterCoalescer.cpp
index 4bb3c229afc5..bf44ee8453b6 100644
--- a/lib/CodeGen/RegisterCoalescer.cpp
+++ b/lib/CodeGen/RegisterCoalescer.cpp
@@ -22,6 +22,7 @@
#include "llvm/CodeGen/LiveRangeEdit.h"
#include "llvm/CodeGen/MachineFrameInfo.h"
#include "llvm/CodeGen/MachineInstr.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
#include "llvm/CodeGen/MachineLoopInfo.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
#include "llvm/CodeGen/Passes.h"
@@ -189,6 +190,9 @@ namespace {
/// This returns true if an interval was modified.
bool removeCopyByCommutingDef(const CoalescerPair &CP,MachineInstr *CopyMI);
+ /// We found a copy which can be moved to its less frequent predecessor.
+ bool removePartialRedundancy(const CoalescerPair &CP, MachineInstr &CopyMI);
+
/// If the source of a copy is defined by a
/// trivial computation, replace the copy by rematerializing the definition.
bool reMaterializeTrivialDef(const CoalescerPair &CP, MachineInstr *CopyMI,
@@ -811,42 +815,14 @@ bool RegisterCoalescer::removeCopyByCommutingDef(const CoalescerPair &CP,
VNInfo *ASubValNo = SA.getVNInfoAt(AIdx);
assert(ASubValNo != nullptr);
- LaneBitmask AMask = SA.LaneMask;
- for (LiveInterval::SubRange &SB : IntB.subranges()) {
- LaneBitmask BMask = SB.LaneMask;
- LaneBitmask Common = BMask & AMask;
- if (Common.none())
- continue;
-
- DEBUG( dbgs() << "\t\tCopy_Merge " << PrintLaneMask(BMask)
- << " into " << PrintLaneMask(Common) << '\n');
- LaneBitmask BRest = BMask & ~AMask;
- LiveInterval::SubRange *CommonRange;
- if (BRest.any()) {
- SB.LaneMask = BRest;
- DEBUG(dbgs() << "\t\tReduce Lane to " << PrintLaneMask(BRest)
- << '\n');
- // Duplicate SubRange for newly merged common stuff.
- CommonRange = IntB.createSubRangeFrom(Allocator, Common, SB);
- } else {
- // We van reuse the L SubRange.
- SB.LaneMask = Common;
- CommonRange = &SB;
- }
- LiveRange RangeCopy(SB, Allocator);
-
- VNInfo *BSubValNo = CommonRange->getVNInfoAt(CopyIdx);
- assert(BSubValNo->def == CopyIdx);
- BSubValNo->def = ASubValNo->def;
- addSegmentsWithValNo(*CommonRange, BSubValNo, SA, ASubValNo);
- AMask &= ~BMask;
- }
- if (AMask.any()) {
- DEBUG(dbgs() << "\t\tNew Lane " << PrintLaneMask(AMask) << '\n');
- LiveRange *NewRange = IntB.createSubRange(Allocator, AMask);
- VNInfo *BSubValNo = NewRange->getNextValue(CopyIdx, Allocator);
- addSegmentsWithValNo(*NewRange, BSubValNo, SA, ASubValNo);
- }
+ IntB.refineSubRanges(Allocator, SA.LaneMask,
+ [&Allocator,&SA,CopyIdx,ASubValNo](LiveInterval::SubRange &SR) {
+ VNInfo *BSubValNo = SR.empty()
+ ? SR.getNextValue(CopyIdx, Allocator)
+ : SR.getVNInfoAt(CopyIdx);
+ assert(BSubValNo != nullptr);
+ addSegmentsWithValNo(SR, BSubValNo, SA, ASubValNo);
+ });
}
}
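
The lambda above replaces roughly thirty lines of manual lane-mask splitting with a refineSubRanges-style callback. A rough, simplified sketch of what such an API does, reduced to lane masks only (stand-in types; the real method also copies live ranges and takes an allocator):

#include <cstdint>
#include <functional>
#include <vector>

struct SubRangeSketch { uint64_t LaneMask; };

// Split each existing subrange against Mask and hand every piece that
// lies inside Mask to Apply; lanes with no subrange yet get a new one.
void refineSubRangesSketch(std::vector<SubRangeSketch> &Ranges, uint64_t Mask,
                           const std::function<void(SubRangeSketch &)> &Apply) {
  uint64_t Rest = Mask;
  const size_t E = Ranges.size();        // new splits are appended
  for (size_t I = 0; I != E; ++I) {
    uint64_t Common = Ranges[I].LaneMask & Mask;
    if (!Common)
      continue;
    uint64_t Remainder = Ranges[I].LaneMask & ~Mask;
    if (Remainder) {                     // split off the untouched lanes
      Ranges[I].LaneMask = Remainder;
      Ranges.push_back({Common});
      Apply(Ranges.back());
    } else {                             // whole subrange is inside Mask
      Apply(Ranges[I]);
    }
    Rest &= ~Common;
  }
  if (Rest) {                            // lanes no subrange covered yet
    Ranges.push_back({Rest});
    Apply(Ranges.back());
  }
}
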
@@ -861,6 +837,184 @@ bool RegisterCoalescer::removeCopyByCommutingDef(const CoalescerPair &CP,
return true;
}
+/// For copy B = A in BB2, if A is defined by A = B in BB0 which is a
+/// predecessor of BB2, and if B is not redefined on the way from A = B
+/// in BB0 to B = A in BB2, then B = A in BB2 is partially redundant if the
+/// execution goes through the path from BB0 to BB2. We may move B = A
+/// to the predecessor that lacks such a reversed copy.
+/// So we will transform the program from:
+/// BB0:
+/// A = B; BB1:
+/// ... ...
+/// / \ /
+/// BB2:
+/// ...
+/// B = A;
+///
+/// to:
+///
+/// BB0: BB1:
+/// A = B; ...
+/// ... B = A;
+/// / \ /
+/// BB2:
+/// ...
+///
+/// A special case is when BB0 and BB2 are the same BB which is the only
+/// BB in a loop:
+/// BB1:
+/// ...
+/// BB0/BB2: ----
+/// B = A; |
+/// ... |
+/// A = B; |
+/// |-------
+/// |
+/// We may hoist B = A from BB0/BB2 to BB1.
+///
+/// The major preconditions for correctly removing such partial
+/// redundancy include:
+/// 1. A in B = A in BB2 is defined by a PHI in BB2, and one operand of
+/// the PHI is defined by the reversed copy A = B in BB0.
+/// 2. No B is referenced from the start of BB2 to B = A.
+/// 3. No B is defined from A = B to the end of BB0.
+/// 4. BB1 has only one successor.
+///
+/// 2 and 4 implicitly ensure B is not live at the end of BB1.
+/// 4 guarantees BB2 is hotter than BB1, so we can only move a copy to a
+/// colder place, which not only prevents an endless loop but also makes
+/// sure the movement of the copy is beneficial.
+bool RegisterCoalescer::removePartialRedundancy(const CoalescerPair &CP,
+ MachineInstr &CopyMI) {
+ assert(!CP.isPhys());
+ if (!CopyMI.isFullCopy())
+ return false;
+
+ MachineBasicBlock &MBB = *CopyMI.getParent();
+ if (MBB.isEHPad())
+ return false;
+
+ if (MBB.pred_size() != 2)
+ return false;
+
+ LiveInterval &IntA =
+ LIS->getInterval(CP.isFlipped() ? CP.getDstReg() : CP.getSrcReg());
+ LiveInterval &IntB =
+ LIS->getInterval(CP.isFlipped() ? CP.getSrcReg() : CP.getDstReg());
+
+ // A is defined by PHI at the entry of MBB.
+ SlotIndex CopyIdx = LIS->getInstructionIndex(CopyMI).getRegSlot(true);
+ VNInfo *AValNo = IntA.getVNInfoAt(CopyIdx);
+ assert(AValNo && !AValNo->isUnused() && "COPY source not live");
+ if (!AValNo->isPHIDef())
+ return false;
+
+ // No B is referenced before CopyMI in MBB.
+ if (IntB.overlaps(LIS->getMBBStartIdx(&MBB), CopyIdx))
+ return false;
+
+ // MBB has two predecessors: one contains A = B so no copy will be inserted
+ // for it. The other one will have a copy moved from MBB.
+ bool FoundReverseCopy = false;
+ MachineBasicBlock *CopyLeftBB = nullptr;
+ for (MachineBasicBlock *Pred : MBB.predecessors()) {
+ VNInfo *PVal = IntA.getVNInfoBefore(LIS->getMBBEndIdx(Pred));
+ MachineInstr *DefMI = LIS->getInstructionFromIndex(PVal->def);
+ if (!DefMI || !DefMI->isFullCopy()) {
+ CopyLeftBB = Pred;
+ continue;
+ }
+ // Check that DefMI is a reverse copy and that it is in block Pred.
+ if (DefMI->getOperand(0).getReg() != IntA.reg ||
+ DefMI->getOperand(1).getReg() != IntB.reg ||
+ DefMI->getParent() != Pred) {
+ CopyLeftBB = Pred;
+ continue;
+ }
+ // If there is any other def of B after DefMI and before the end of Pred,
+ // we need to keep the copy of B = A at the end of Pred if we remove
+ // B = A from MBB.
+ bool ValB_Changed = false;
+ for (auto VNI : IntB.valnos) {
+ if (VNI->isUnused())
+ continue;
+ if (PVal->def < VNI->def && VNI->def < LIS->getMBBEndIdx(Pred)) {
+ ValB_Changed = true;
+ break;
+ }
+ }
+ if (ValB_Changed) {
+ CopyLeftBB = Pred;
+ continue;
+ }
+ FoundReverseCopy = true;
+ }
+
+ // If no reverse copy is found in predecessors, nothing to do.
+ if (!FoundReverseCopy)
+ return false;
+
+ // If CopyLeftBB is nullptr, every predecessor of MBB contains a reverse
+ // copy, and CopyMI can be removed trivially provided IntA/IntB are updated.
+ // If CopyLeftBB is not nullptr, move CopyMI from MBB to CopyLeftBB and
+ // update IntA/IntB.
+ //
+  // If CopyLeftBB is not nullptr, ensure CopyLeftBB has a single successor
+  // so that MBB is at least as hot as CopyLeftBB.
+ if (CopyLeftBB && CopyLeftBB->succ_size() > 1)
+ return false;
+
+  // Now it is OK to move the copy.
+ if (CopyLeftBB) {
+ DEBUG(dbgs() << "\tremovePartialRedundancy: Move the copy to BB#"
+ << CopyLeftBB->getNumber() << '\t' << CopyMI);
+
+ // Insert new copy to CopyLeftBB.
+ auto InsPos = CopyLeftBB->getFirstTerminator();
+ MachineInstr *NewCopyMI = BuildMI(*CopyLeftBB, InsPos, CopyMI.getDebugLoc(),
+ TII->get(TargetOpcode::COPY), IntB.reg)
+ .addReg(IntA.reg);
+ SlotIndex NewCopyIdx =
+ LIS->InsertMachineInstrInMaps(*NewCopyMI).getRegSlot();
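+  // The new copy starts out as a dead def; extendToIndices() below grows
+  // IntB again so the uses previously fed by the removed copy reach it.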
+ IntB.createDeadDef(NewCopyIdx, LIS->getVNInfoAllocator());
+ for (LiveInterval::SubRange &SR : IntB.subranges())
+ SR.createDeadDef(NewCopyIdx, LIS->getVNInfoAllocator());
+ } else {
+ DEBUG(dbgs() << "\tremovePartialRedundancy: Remove the copy from BB#"
+ << MBB.getNumber() << '\t' << CopyMI);
+ }
+
+ // Remove CopyMI.
+  // Note: it is fine to remove the copy before updating the live-ranges;
+  // while updating them we only look at slot indices and never go back
+  // to the instruction itself.
+ LIS->RemoveMachineInstrFromMaps(CopyMI);
+ CopyMI.eraseFromParent();
+
+ // Update the liveness.
+ SmallVector<SlotIndex, 8> EndPoints;
+ VNInfo *BValNo = IntB.Query(CopyIdx).valueOutOrDead();
+ LIS->pruneValue(*static_cast<LiveRange *>(&IntB), CopyIdx.getRegSlot(),
+ &EndPoints);
+ BValNo->markUnused();
+ // Extend IntB to the EndPoints of its original live interval.
+ LIS->extendToIndices(IntB, EndPoints);
+
+ // Now, do the same for its subranges.
+ for (LiveInterval::SubRange &SR : IntB.subranges()) {
+ EndPoints.clear();
+    VNInfo *SubBValNo = SR.Query(CopyIdx).valueOutOrDead();
+    assert(SubBValNo && "All sublanes should be live");
+    LIS->pruneValue(SR, CopyIdx.getRegSlot(), &EndPoints);
+    SubBValNo->markUnused();
+ LIS->extendToIndices(SR, EndPoints);
+ }
+
+ // Finally, update the live-range of IntA.
+ shrinkToUses(&IntA);
+ return true;
+}
+
/// Returns true if @p MI defines the full vreg @p Reg, as opposed to just
/// defining a subregister.
static bool definesFullReg(const MachineInstr &MI, unsigned Reg) {
@@ -1290,7 +1444,7 @@ void RegisterCoalescer::updateRegDefsUses(unsigned SrcReg,
// If SrcReg wasn't read, it may still be the case that DstReg is live-in
// because SrcReg is a sub-register.
- if (DstInt && !Reads && SubIdx)
+ if (DstInt && !Reads && SubIdx && !UseMI->isDebugValue())
Reads = DstInt->liveAt(LIS->getInstructionIndex(*UseMI));
// Replace SrcReg with DstReg in all UseMI operands.
@@ -1486,6 +1640,12 @@ bool RegisterCoalescer::joinCopy(MachineInstr *CopyMI, bool &Again) {
}
}
+  // See if we can partially eliminate the copy by moving it to one of the
+  // predecessors.
+ if (!CP.isPartial() && !CP.isPhys())
+ if (removePartialRedundancy(CP, *CopyMI))
+ return true;
+
// Otherwise, we are unable to join the intervals.
DEBUG(dbgs() << "\tInterference!\n");
Again = true; // May be possible to coalesce later.
@@ -1583,6 +1743,14 @@ bool RegisterCoalescer::joinReservedPhysReg(CoalescerPair &CP) {
return false;
}
}
+
+ // We must also check for overlaps with regmask clobbers.
+ BitVector RegMaskUsable;
+ if (LIS->checkRegMaskInterference(RHS, RegMaskUsable) &&
+ !RegMaskUsable.test(DstReg)) {
+ DEBUG(dbgs() << "\t\tRegMask interference\n");
+ return false;
+ }
}
// Skip any value computations, we are not adding new values to the
@@ -1636,14 +1804,6 @@ bool RegisterCoalescer::joinReservedPhysReg(CoalescerPair &CP) {
DEBUG(dbgs() << "\t\tInterference (read): " << *MI);
return false;
}
-
- // We must also check for clobbers caused by regmasks.
- for (const auto &MO : MI->operands()) {
- if (MO.isRegMask() && MO.clobbersPhysReg(DstReg)) {
- DEBUG(dbgs() << "\t\tInterference (regmask clobber): " << *MI);
- return false;
- }
- }
}
}
@@ -2738,39 +2898,16 @@ void RegisterCoalescer::mergeSubRangeInto(LiveInterval &LI,
LaneBitmask LaneMask,
CoalescerPair &CP) {
BumpPtrAllocator &Allocator = LIS->getVNInfoAllocator();
- for (LiveInterval::SubRange &R : LI.subranges()) {
- LaneBitmask RMask = R.LaneMask;
- // LaneMask of subregisters common to subrange R and ToMerge.
- LaneBitmask Common = RMask & LaneMask;
- // There is nothing to do without common subregs.
- if (Common.none())
- continue;
-
- DEBUG(dbgs() << "\t\tCopy+Merge " << PrintLaneMask(RMask) << " into "
- << PrintLaneMask(Common) << '\n');
- // LaneMask of subregisters contained in the R range but not in ToMerge,
- // they have to split into their own subrange.
- LaneBitmask LRest = RMask & ~LaneMask;
- LiveInterval::SubRange *CommonRange;
- if (LRest.any()) {
- R.LaneMask = LRest;
- DEBUG(dbgs() << "\t\tReduce Lane to " << PrintLaneMask(LRest) << '\n');
- // Duplicate SubRange for newly merged common stuff.
- CommonRange = LI.createSubRangeFrom(Allocator, Common, R);
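+  // refineSubRanges() splits the existing subranges so that LaneMask is
+  // covered exactly and calls the callback on each subrange lying inside
+  // LaneMask; subranges created for previously uncovered lanes arrive empty.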
+ LI.refineSubRanges(Allocator, LaneMask,
+ [this,&Allocator,&ToMerge,&CP](LiveInterval::SubRange &SR) {
+ if (SR.empty()) {
+ SR.assign(ToMerge, Allocator);
} else {
- // Reuse the existing range.
- R.LaneMask = Common;
- CommonRange = &R;
+      // joinSubRegRanges() destroys the merged range, so we need a copy.
+ LiveRange RangeCopy(ToMerge, Allocator);
+ joinSubRegRanges(SR, RangeCopy, SR.LaneMask, CP);
}
- LiveRange RangeCopy(ToMerge, Allocator);
- joinSubRegRanges(*CommonRange, RangeCopy, Common, CP);
- LaneMask &= ~RMask;
- }
-
- if (LaneMask.any()) {
- DEBUG(dbgs() << "\t\tNew Lane " << PrintLaneMask(LaneMask) << '\n');
- LI.createSubRangeFrom(Allocator, LaneMask, ToMerge);
- }
+ });
}
bool RegisterCoalescer::joinVirtRegs(CoalescerPair &CP) {
diff --git a/lib/CodeGen/RegisterPressure.cpp b/lib/CodeGen/RegisterPressure.cpp
index fc84aebb14d7..c726edc88b41 100644
--- a/lib/CodeGen/RegisterPressure.cpp
+++ b/lib/CodeGen/RegisterPressure.cpp
@@ -1,4 +1,4 @@
-//===-- RegisterPressure.cpp - Dynamic Register Pressure ------------------===//
+//===- RegisterPressure.cpp - Dynamic Register Pressure -------------------===//
//
// The LLVM Compiler Infrastructure
//
@@ -12,13 +12,37 @@
//
//===----------------------------------------------------------------------===//
-#include "llvm/CodeGen/RegisterPressure.h"
+#include "llvm/ADT/ArrayRef.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/ADT/STLExtras.h"
#include "llvm/CodeGen/LiveInterval.h"
#include "llvm/CodeGen/LiveIntervalAnalysis.h"
+#include "llvm/CodeGen/MachineBasicBlock.h"
+#include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/CodeGen/MachineInstr.h"
+#include "llvm/CodeGen/MachineInstrBundle.h"
+#include "llvm/CodeGen/MachineOperand.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
#include "llvm/CodeGen/RegisterClassInfo.h"
+#include "llvm/CodeGen/RegisterPressure.h"
+#include "llvm/CodeGen/SlotIndexes.h"
+#include "llvm/MC/LaneBitmask.h"
+#include "llvm/MC/MCRegisterInfo.h"
+#include "llvm/Support/Compiler.h"
#include "llvm/Support/Debug.h"
+#include "llvm/Support/ErrorHandling.h"
#include "llvm/Support/raw_ostream.h"
+#include "llvm/Target/TargetRegisterInfo.h"
+#include "llvm/Target/TargetSubtargetInfo.h"
+#include <algorithm>
+#include <cassert>
+#include <cstdint>
+#include <cstdlib>
+#include <cstring>
+#include <iterator>
+#include <limits>
+#include <utility>
+#include <vector>
using namespace llvm;
@@ -52,6 +76,7 @@ static void decreaseSetPressure(std::vector<unsigned> &CurrSetPressure,
}
}
+#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
LLVM_DUMP_METHOD
void llvm::dumpRegSetPressure(ArrayRef<unsigned> SetPressure,
const TargetRegisterInfo *TRI) {
@@ -97,6 +122,7 @@ void RegPressureTracker::dump() const {
P.dump(TRI);
}
+LLVM_DUMP_METHOD
void PressureDiff::dump(const TargetRegisterInfo &TRI) const {
const char *sep = "";
for (const PressureChange &Change : *this) {
@@ -108,6 +134,7 @@ void PressureDiff::dump(const TargetRegisterInfo &TRI) const {
}
dbgs() << '\n';
}
+#endif
void RegPressureTracker::increaseRegPressure(unsigned RegUnit,
LaneBitmask PreviousMask,
@@ -264,7 +291,6 @@ bool RegPressureTracker::isBottomClosed() const {
MachineBasicBlock::const_iterator());
}
-
SlotIndex RegPressureTracker::getCurrSlot() const {
MachineBasicBlock::const_iterator IdxPos =
skipDebugInstructionsForward(CurrPos, MBB->end());
@@ -328,7 +354,7 @@ void RegPressureTracker::initLiveThru(const RegPressureTracker &RPTracker) {
static LaneBitmask getRegLanes(ArrayRef<RegisterMaskPair> RegUnits,
unsigned RegUnit) {
- auto I = find_if(RegUnits, [RegUnit](const RegisterMaskPair Other) {
+ auto I = llvm::find_if(RegUnits, [RegUnit](const RegisterMaskPair Other) {
return Other.RegUnit == RegUnit;
});
if (I == RegUnits.end())
@@ -340,7 +366,7 @@ static void addRegLanes(SmallVectorImpl<RegisterMaskPair> &RegUnits,
RegisterMaskPair Pair) {
unsigned RegUnit = Pair.RegUnit;
assert(Pair.LaneMask.any());
- auto I = find_if(RegUnits, [RegUnit](const RegisterMaskPair Other) {
+ auto I = llvm::find_if(RegUnits, [RegUnit](const RegisterMaskPair Other) {
return Other.RegUnit == RegUnit;
});
if (I == RegUnits.end()) {
@@ -352,7 +378,7 @@ static void addRegLanes(SmallVectorImpl<RegisterMaskPair> &RegUnits,
static void setRegZero(SmallVectorImpl<RegisterMaskPair> &RegUnits,
unsigned RegUnit) {
- auto I = find_if(RegUnits, [RegUnit](const RegisterMaskPair Other) {
+ auto I = llvm::find_if(RegUnits, [RegUnit](const RegisterMaskPair Other) {
return Other.RegUnit == RegUnit;
});
if (I == RegUnits.end()) {
@@ -366,7 +392,7 @@ static void removeRegLanes(SmallVectorImpl<RegisterMaskPair> &RegUnits,
RegisterMaskPair Pair) {
unsigned RegUnit = Pair.RegUnit;
assert(Pair.LaneMask.any());
- auto I = find_if(RegUnits, [RegUnit](const RegisterMaskPair Other) {
+ auto I = llvm::find_if(RegUnits, [RegUnit](const RegisterMaskPair Other) {
return Other.RegUnit == RegUnit;
});
if (I != RegUnits.end()) {
@@ -423,6 +449,8 @@ namespace {
///
/// FIXME: always ignore tied opers
class RegisterOperandsCollector {
+ friend class llvm::RegisterOperands;
+
RegisterOperands &RegOpers;
const TargetRegisterInfo &TRI;
const MachineRegisterInfo &MRI;
@@ -517,11 +545,9 @@ class RegisterOperandsCollector {
addRegLanes(RegUnits, RegisterMaskPair(*Units, LaneBitmask::getAll()));
}
}
-
- friend class llvm::RegisterOperands;
};
-} // namespace
+} // end anonymous namespace
void RegisterOperands::collect(const MachineInstr &MI,
const TargetRegisterInfo &TRI,
@@ -674,7 +700,7 @@ void RegPressureTracker::discoverLiveInOrOut(RegisterMaskPair Pair,
assert(Pair.LaneMask.any());
unsigned RegUnit = Pair.RegUnit;
- auto I = find_if(LiveInOrOut, [RegUnit](const RegisterMaskPair &Other) {
+ auto I = llvm::find_if(LiveInOrOut, [RegUnit](const RegisterMaskPair &Other) {
return Other.RegUnit == RegUnit;
});
LaneBitmask PrevMask;
@@ -772,9 +798,10 @@ void RegPressureTracker::recede(const RegisterOperands &RegOpers,
if (!TrackLaneMasks) {
addRegLanes(*LiveUses, RegisterMaskPair(Reg, NewMask));
} else {
- auto I = find_if(*LiveUses, [Reg](const RegisterMaskPair Other) {
- return Other.RegUnit == Reg;
- });
+ auto I =
+ llvm::find_if(*LiveUses, [Reg](const RegisterMaskPair Other) {
+ return Other.RegUnit == Reg;
+ });
bool IsRedef = I != LiveUses->end();
if (IsRedef) {
// ignore re-defs here...
@@ -1154,7 +1181,7 @@ getUpwardPressureDelta(const MachineInstr *MI, /*const*/ PressureDiff &PDiff,
if (CritIdx != CritEnd && CriticalPSets[CritIdx].getPSet() == PSetID) {
int CritInc = (int)MNew - (int)CriticalPSets[CritIdx].getUnitInc();
- if (CritInc > 0 && CritInc <= INT16_MAX) {
+ if (CritInc > 0 && CritInc <= std::numeric_limits<int16_t>::max()) {
Delta.CriticalMax = PressureChange(PSetID);
Delta.CriticalMax.setUnitInc(CritInc);
}
diff --git a/lib/CodeGen/RegisterScavenging.cpp b/lib/CodeGen/RegisterScavenging.cpp
index fdf741fd58f7..6392136fa290 100644
--- a/lib/CodeGen/RegisterScavenging.cpp
+++ b/lib/CodeGen/RegisterScavenging.cpp
@@ -1,4 +1,4 @@
-//===-- RegisterScavenging.cpp - Machine register scavenging --------------===//
+//===- RegisterScavenging.cpp - Machine register scavenging ---------------===//
//
// The LLVM Compiler Infrastructure
//
@@ -15,28 +15,32 @@
//
//===----------------------------------------------------------------------===//
+#include "llvm/ADT/BitVector.h"
+#include "llvm/ADT/SmallVector.h"
#include "llvm/CodeGen/RegisterScavenging.h"
#include "llvm/CodeGen/MachineBasicBlock.h"
#include "llvm/CodeGen/MachineFrameInfo.h"
#include "llvm/CodeGen/MachineFunction.h"
#include "llvm/CodeGen/MachineInstr.h"
-#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/CodeGen/MachineOperand.h"
+#include "llvm/MC/MCRegisterInfo.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/ErrorHandling.h"
#include "llvm/Support/raw_ostream.h"
#include "llvm/Target/TargetInstrInfo.h"
#include "llvm/Target/TargetRegisterInfo.h"
#include "llvm/Target/TargetSubtargetInfo.h"
+#include <cassert>
+#include <iterator>
+#include <limits>
+#include <string>
+
using namespace llvm;
#define DEBUG_TYPE "reg-scavenging"
void RegScavenger::setRegUsed(unsigned Reg, LaneBitmask LaneMask) {
- for (MCRegUnitMaskIterator RUI(Reg, TRI); RUI.isValid(); ++RUI) {
- LaneBitmask UnitMask = (*RUI).second;
- if (UnitMask.none() || (LaneMask & UnitMask).any())
- RegUnitsAvailable.reset((*RUI).first);
- }
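+  // Mark only the register units of Reg whose unit mask overlaps LaneMask;
+  // units with an empty unit mask are always marked.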
+ LiveUnits.addRegMasked(Reg, LaneMask);
}
void RegScavenger::init(MachineBasicBlock &MBB) {
@@ -44,6 +48,7 @@ void RegScavenger::init(MachineBasicBlock &MBB) {
TII = MF.getSubtarget().getInstrInfo();
TRI = MF.getSubtarget().getRegisterInfo();
MRI = &MF.getRegInfo();
+ LiveUnits.init(*TRI);
assert((NumRegUnits == 0 || NumRegUnits == TRI->getNumRegUnits()) &&
"Target changed?");
@@ -51,7 +56,6 @@ void RegScavenger::init(MachineBasicBlock &MBB) {
// Self-initialize.
if (!this->MBB) {
NumRegUnits = TRI->getNumRegUnits();
- RegUnitsAvailable.resize(NumRegUnits);
KillRegUnits.resize(NumRegUnits);
DefRegUnits.resize(NumRegUnits);
TmpRegUnits.resize(NumRegUnits);
@@ -64,32 +68,17 @@ void RegScavenger::init(MachineBasicBlock &MBB) {
I->Restore = nullptr;
}
- // All register units start out unused.
- RegUnitsAvailable.set();
-
- // Pristine CSRs are not available.
- BitVector PR = MF.getFrameInfo().getPristineRegs(MF);
- for (int I = PR.find_first(); I>0; I = PR.find_next(I))
- setRegUsed(I);
-
Tracking = false;
}
-void RegScavenger::setLiveInsUsed(const MachineBasicBlock &MBB) {
- for (const auto &LI : MBB.liveins())
- setRegUsed(LI.PhysReg, LI.LaneMask);
-}
-
void RegScavenger::enterBasicBlock(MachineBasicBlock &MBB) {
init(MBB);
- setLiveInsUsed(MBB);
+ LiveUnits.addLiveIns(MBB);
}
void RegScavenger::enterBasicBlockEnd(MachineBasicBlock &MBB) {
init(MBB);
- // Merge live-ins of successors to get live-outs.
- for (const MachineBasicBlock *Succ : MBB.successors())
- setLiveInsUsed(*Succ);
+ LiveUnits.addLiveOuts(MBB);
  // Move the internal iterator to the last instruction of the block.
if (MBB.begin() != MBB.end()) {
@@ -263,36 +252,7 @@ void RegScavenger::backward() {
assert(Tracking && "Must be tracking to determine kills and defs");
const MachineInstr &MI = *MBBI;
- // Defined or clobbered registers are available now.
- for (const MachineOperand &MO : MI.operands()) {
- if (MO.isRegMask()) {
- for (unsigned RU = 0, RUEnd = TRI->getNumRegUnits(); RU != RUEnd;
- ++RU) {
- for (MCRegUnitRootIterator RURI(RU, TRI); RURI.isValid(); ++RURI) {
- if (MO.clobbersPhysReg(*RURI)) {
- RegUnitsAvailable.set(RU);
- break;
- }
- }
- }
- } else if (MO.isReg() && MO.isDef()) {
- unsigned Reg = MO.getReg();
- if (!Reg || TargetRegisterInfo::isVirtualRegister(Reg) ||
- isReserved(Reg))
- continue;
- addRegUnits(RegUnitsAvailable, Reg);
- }
- }
- // Mark read registers as unavailable.
- for (const MachineOperand &MO : MI.uses()) {
- if (MO.isReg() && MO.readsReg()) {
- unsigned Reg = MO.getReg();
- if (!Reg || TargetRegisterInfo::isVirtualRegister(Reg) ||
- isReserved(Reg))
- continue;
- removeRegUnits(RegUnitsAvailable, Reg);
- }
- }
+ LiveUnits.stepBackward(MI);
if (MBBI == MBB->begin()) {
MBBI = MachineBasicBlock::iterator(nullptr);
@@ -302,12 +262,9 @@ void RegScavenger::backward() {
}
bool RegScavenger::isRegUsed(unsigned Reg, bool includeReserved) const {
- if (includeReserved && isReserved(Reg))
- return true;
- for (MCRegUnitIterator RUI(Reg, TRI); RUI.isValid(); ++RUI)
- if (!RegUnitsAvailable.test(*RUI))
- return true;
- return false;
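+  // Reserved registers are reported as used only if includeReserved is set.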
+ if (isReserved(Reg))
+ return includeReserved;
+ return !LiveUnits.available(Reg);
}
unsigned RegScavenger::FindUnusedReg(const TargetRegisterClass *RC) const {
@@ -441,7 +398,7 @@ unsigned RegScavenger::scavengeRegister(const TargetRegisterClass *RC,
unsigned NeedSize = RC->getSize();
unsigned NeedAlign = RC->getAlignment();
- unsigned SI = Scavenged.size(), Diff = UINT_MAX;
+ unsigned SI = Scavenged.size(), Diff = std::numeric_limits<unsigned>::max();
int FIB = MFI.getObjectIndexBegin(), FIE = MFI.getObjectIndexEnd();
for (unsigned I = 0; I < Scavenged.size(); ++I) {
if (Scavenged[I].Reg != 0)
diff --git a/lib/CodeGen/ResetMachineFunctionPass.cpp b/lib/CodeGen/ResetMachineFunctionPass.cpp
index 451964199ba5..3e259927ac5c 100644
--- a/lib/CodeGen/ResetMachineFunctionPass.cpp
+++ b/lib/CodeGen/ResetMachineFunctionPass.cpp
@@ -30,17 +30,23 @@ namespace {
/// Tells whether or not this pass should emit a fallback
/// diagnostic when it resets a function.
bool EmitFallbackDiag;
+ /// Whether we should abort immediately instead of resetting the function.
+ bool AbortOnFailedISel;
public:
static char ID; // Pass identification, replacement for typeid
- ResetMachineFunction(bool EmitFallbackDiag = false)
- : MachineFunctionPass(ID), EmitFallbackDiag(EmitFallbackDiag) {}
+ ResetMachineFunction(bool EmitFallbackDiag = false,
+ bool AbortOnFailedISel = false)
+ : MachineFunctionPass(ID), EmitFallbackDiag(EmitFallbackDiag),
+ AbortOnFailedISel(AbortOnFailedISel) {}
StringRef getPassName() const override { return "ResetMachineFunction"; }
bool runOnMachineFunction(MachineFunction &MF) override {
if (MF.getProperties().hasProperty(
MachineFunctionProperties::Property::FailedISel)) {
+ if (AbortOnFailedISel)
+ report_fatal_error("Instruction selection failed");
DEBUG(dbgs() << "Reseting: " << MF.getName() << '\n');
++NumFunctionsReset;
MF.reset();
@@ -62,6 +68,7 @@ INITIALIZE_PASS(ResetMachineFunction, DEBUG_TYPE,
"reset machine function if ISel failed", false, false)
MachineFunctionPass *
-llvm::createResetMachineFunctionPass(bool EmitFallbackDiag = false) {
- return new ResetMachineFunction(EmitFallbackDiag);
+llvm::createResetMachineFunctionPass(bool EmitFallbackDiag = false,
+ bool AbortOnFailedISel = false) {
+ return new ResetMachineFunction(EmitFallbackDiag, AbortOnFailedISel);
}
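+
+// Illustrative use (hypothetical pipeline setup, not part of this patch):
+// abort outright rather than resetting the function for a fallback run:
+//   PM.add(createResetMachineFunctionPass(/*EmitFallbackDiag=*/false,
+//                                         /*AbortOnFailedISel=*/true));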
diff --git a/lib/CodeGen/SafeStack.cpp b/lib/CodeGen/SafeStack.cpp
index 2b82df293c14..fa68411284e7 100644
--- a/lib/CodeGen/SafeStack.cpp
+++ b/lib/CodeGen/SafeStack.cpp
@@ -451,7 +451,7 @@ void SafeStack::checkStackGuard(IRBuilder<> &IRB, Function &F, ReturnInst &RI,
IRBuilder<> IRBFail(CheckTerm);
// FIXME: respect -fsanitize-trap / -ftrap-function here?
Constant *StackChkFail = F.getParent()->getOrInsertFunction(
- "__stack_chk_fail", IRB.getVoidTy(), nullptr);
+ "__stack_chk_fail", IRB.getVoidTy());
IRBFail.CreateCall(StackChkFail, {});
}
diff --git a/lib/CodeGen/SafeStackColoring.cpp b/lib/CodeGen/SafeStackColoring.cpp
index 7fbeaddb38e8..09289f947dc9 100644
--- a/lib/CodeGen/SafeStackColoring.cpp
+++ b/lib/CodeGen/SafeStackColoring.cpp
@@ -236,6 +236,7 @@ void StackColoring::calculateLiveIntervals() {
}
}
+#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
LLVM_DUMP_METHOD void StackColoring::dumpAllocas() {
dbgs() << "Allocas:\n";
for (unsigned AllocaNo = 0; AllocaNo < NumAllocas; ++AllocaNo)
@@ -262,6 +263,7 @@ LLVM_DUMP_METHOD void StackColoring::dumpLiveRanges() {
dbgs() << " " << AllocaNo << ": " << Range << "\n";
}
}
+#endif
void StackColoring::run() {
DEBUG(dumpAllocas());
diff --git a/lib/CodeGen/ScheduleDAG.cpp b/lib/CodeGen/ScheduleDAG.cpp
index 427d95268c74..dc72ac073258 100644
--- a/lib/CodeGen/ScheduleDAG.cpp
+++ b/lib/CodeGen/ScheduleDAG.cpp
@@ -1,4 +1,4 @@
-//===---- ScheduleDAG.cpp - Implement the ScheduleDAG class ---------------===//
+//===- ScheduleDAG.cpp - Implement the ScheduleDAG class ------------------===//
//
// The LLVM Compiler Infrastructure
//
@@ -7,22 +7,32 @@
//
//===----------------------------------------------------------------------===//
//
-// This implements the ScheduleDAG class, which is a base class used by
-// scheduling implementation classes.
+/// \file Implements the ScheduleDAG class, which is a base class used by
+/// scheduling implementation classes.
//
//===----------------------------------------------------------------------===//
+#include "llvm/ADT/iterator_range.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/ADT/STLExtras.h"
+#include "llvm/CodeGen/MachineFunction.h"
#include "llvm/CodeGen/ScheduleDAG.h"
#include "llvm/CodeGen/ScheduleHazardRecognizer.h"
#include "llvm/CodeGen/SelectionDAGNodes.h"
#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/Compiler.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/raw_ostream.h"
#include "llvm/Target/TargetInstrInfo.h"
-#include "llvm/Target/TargetMachine.h"
#include "llvm/Target/TargetRegisterInfo.h"
#include "llvm/Target/TargetSubtargetInfo.h"
-#include <climits>
+#include <algorithm>
+#include <cassert>
+#include <iterator>
+#include <limits>
+#include <utility>
+#include <vector>
+
using namespace llvm;
#define DEBUG_TYPE "pre-RA-sched"
@@ -33,58 +43,52 @@ static cl::opt<bool> StressSchedOpt(
cl::desc("Stress test instruction scheduling"));
#endif
-void SchedulingPriorityQueue::anchor() { }
+void SchedulingPriorityQueue::anchor() {}
ScheduleDAG::ScheduleDAG(MachineFunction &mf)
: TM(mf.getTarget()), TII(mf.getSubtarget().getInstrInfo()),
TRI(mf.getSubtarget().getRegisterInfo()), MF(mf),
- MRI(mf.getRegInfo()), EntrySU(), ExitSU() {
+ MRI(mf.getRegInfo()) {
#ifndef NDEBUG
StressSched = StressSchedOpt;
#endif
}
-ScheduleDAG::~ScheduleDAG() {}
+ScheduleDAG::~ScheduleDAG() = default;
-/// Clear the DAG state (e.g. between scheduling regions).
void ScheduleDAG::clearDAG() {
SUnits.clear();
EntrySU = SUnit();
ExitSU = SUnit();
}
-/// getInstrDesc helper to handle SDNodes.
const MCInstrDesc *ScheduleDAG::getNodeDesc(const SDNode *Node) const {
if (!Node || !Node->isMachineOpcode()) return nullptr;
return &TII->get(Node->getMachineOpcode());
}
-/// addPred - This adds the specified edge as a pred of the current node if
-/// not already. It also adds the current node as a successor of the
-/// specified node.
bool SUnit::addPred(const SDep &D, bool Required) {
// If this node already has this dependence, don't add a redundant one.
- for (SmallVectorImpl<SDep>::iterator I = Preds.begin(), E = Preds.end();
- I != E; ++I) {
+ for (SDep &PredDep : Preds) {
// Zero-latency weak edges may be added purely for heuristic ordering. Don't
// add them if another kind of edge already exists.
- if (!Required && I->getSUnit() == D.getSUnit())
+ if (!Required && PredDep.getSUnit() == D.getSUnit())
return false;
- if (I->overlaps(D)) {
- // Extend the latency if needed. Equivalent to removePred(I) + addPred(D).
- if (I->getLatency() < D.getLatency()) {
- SUnit *PredSU = I->getSUnit();
+ if (PredDep.overlaps(D)) {
+ // Extend the latency if needed. Equivalent to
+ // removePred(PredDep) + addPred(D).
+ if (PredDep.getLatency() < D.getLatency()) {
+ SUnit *PredSU = PredDep.getSUnit();
// Find the corresponding successor in N.
- SDep ForwardD = *I;
+ SDep ForwardD = PredDep;
ForwardD.setSUnit(this);
- for (SmallVectorImpl<SDep>::iterator II = PredSU->Succs.begin(),
- EE = PredSU->Succs.end(); II != EE; ++II) {
- if (*II == ForwardD) {
- II->setLatency(D.getLatency());
+ for (SDep &SuccDep : PredSU->Succs) {
+ if (SuccDep == ForwardD) {
+ SuccDep.setLatency(D.getLatency());
break;
}
}
- I->setLatency(D.getLatency());
+ PredDep.setLatency(D.getLatency());
}
return false;
}
@@ -95,8 +99,10 @@ bool SUnit::addPred(const SDep &D, bool Required) {
SUnit *N = D.getSUnit();
// Update the bookkeeping.
if (D.getKind() == SDep::Data) {
- assert(NumPreds < UINT_MAX && "NumPreds will overflow!");
- assert(N->NumSuccs < UINT_MAX && "NumSuccs will overflow!");
+ assert(NumPreds < std::numeric_limits<unsigned>::max() &&
+ "NumPreds will overflow!");
+ assert(N->NumSuccs < std::numeric_limits<unsigned>::max() &&
+ "NumSuccs will overflow!");
++NumPreds;
++N->NumSuccs;
}
@@ -105,7 +111,8 @@ bool SUnit::addPred(const SDep &D, bool Required) {
++WeakPredsLeft;
}
else {
- assert(NumPredsLeft < UINT_MAX && "NumPredsLeft will overflow!");
+ assert(NumPredsLeft < std::numeric_limits<unsigned>::max() &&
+ "NumPredsLeft will overflow!");
++NumPredsLeft;
}
}
@@ -114,7 +121,8 @@ bool SUnit::addPred(const SDep &D, bool Required) {
++N->WeakSuccsLeft;
}
else {
- assert(N->NumSuccsLeft < UINT_MAX && "NumSuccsLeft will overflow!");
+ assert(N->NumSuccsLeft < std::numeric_limits<unsigned>::max() &&
+ "NumSuccsLeft will overflow!");
++N->NumSuccsLeft;
}
}
@@ -127,51 +135,46 @@ bool SUnit::addPred(const SDep &D, bool Required) {
return true;
}
-/// removePred - This removes the specified edge as a pred of the current
-/// node if it exists. It also removes the current node as a successor of
-/// the specified node.
void SUnit::removePred(const SDep &D) {
// Find the matching predecessor.
- for (SmallVectorImpl<SDep>::iterator I = Preds.begin(), E = Preds.end();
- I != E; ++I)
- if (*I == D) {
- // Find the corresponding successor in N.
- SDep P = D;
- P.setSUnit(this);
- SUnit *N = D.getSUnit();
- SmallVectorImpl<SDep>::iterator Succ = find(N->Succs, P);
- assert(Succ != N->Succs.end() && "Mismatching preds / succs lists!");
- N->Succs.erase(Succ);
- Preds.erase(I);
- // Update the bookkeeping.
- if (P.getKind() == SDep::Data) {
- assert(NumPreds > 0 && "NumPreds will underflow!");
- assert(N->NumSuccs > 0 && "NumSuccs will underflow!");
- --NumPreds;
- --N->NumSuccs;
- }
- if (!N->isScheduled) {
- if (D.isWeak())
- --WeakPredsLeft;
- else {
- assert(NumPredsLeft > 0 && "NumPredsLeft will underflow!");
- --NumPredsLeft;
- }
- }
- if (!isScheduled) {
- if (D.isWeak())
- --N->WeakSuccsLeft;
- else {
- assert(N->NumSuccsLeft > 0 && "NumSuccsLeft will underflow!");
- --N->NumSuccsLeft;
- }
- }
- if (P.getLatency() != 0) {
- this->setDepthDirty();
- N->setHeightDirty();
- }
- return;
+ SmallVectorImpl<SDep>::iterator I = llvm::find(Preds, D);
+ if (I == Preds.end())
+ return;
+ // Find the corresponding successor in N.
+ SDep P = D;
+ P.setSUnit(this);
+ SUnit *N = D.getSUnit();
+ SmallVectorImpl<SDep>::iterator Succ = llvm::find(N->Succs, P);
+ assert(Succ != N->Succs.end() && "Mismatching preds / succs lists!");
+ N->Succs.erase(Succ);
+ Preds.erase(I);
+ // Update the bookkeeping.
+ if (P.getKind() == SDep::Data) {
+ assert(NumPreds > 0 && "NumPreds will underflow!");
+ assert(N->NumSuccs > 0 && "NumSuccs will underflow!");
+ --NumPreds;
+ --N->NumSuccs;
+ }
+ if (!N->isScheduled) {
+ if (D.isWeak())
+ --WeakPredsLeft;
+ else {
+ assert(NumPredsLeft > 0 && "NumPredsLeft will underflow!");
+ --NumPredsLeft;
}
+ }
+ if (!isScheduled) {
+ if (D.isWeak())
+ --N->WeakSuccsLeft;
+ else {
+ assert(N->NumSuccsLeft > 0 && "NumSuccsLeft will underflow!");
+ --N->NumSuccsLeft;
+ }
+ }
+ if (P.getLatency() != 0) {
+ this->setDepthDirty();
+ N->setHeightDirty();
+ }
}
void SUnit::setDepthDirty() {
@@ -181,9 +184,8 @@ void SUnit::setDepthDirty() {
do {
SUnit *SU = WorkList.pop_back_val();
SU->isDepthCurrent = false;
- for (SUnit::const_succ_iterator I = SU->Succs.begin(),
- E = SU->Succs.end(); I != E; ++I) {
- SUnit *SuccSU = I->getSUnit();
+ for (SDep &SuccDep : SU->Succs) {
+ SUnit *SuccSU = SuccDep.getSUnit();
if (SuccSU->isDepthCurrent)
WorkList.push_back(SuccSU);
}
@@ -197,18 +199,14 @@ void SUnit::setHeightDirty() {
do {
SUnit *SU = WorkList.pop_back_val();
SU->isHeightCurrent = false;
- for (SUnit::const_pred_iterator I = SU->Preds.begin(),
- E = SU->Preds.end(); I != E; ++I) {
- SUnit *PredSU = I->getSUnit();
+ for (SDep &PredDep : SU->Preds) {
+ SUnit *PredSU = PredDep.getSUnit();
if (PredSU->isHeightCurrent)
WorkList.push_back(PredSU);
}
} while (!WorkList.empty());
}
-/// setDepthToAtLeast - Update this node's successors to reflect the
-/// fact that this node's depth just increased.
-///
void SUnit::setDepthToAtLeast(unsigned NewDepth) {
if (NewDepth <= getDepth())
return;
@@ -217,9 +215,6 @@ void SUnit::setDepthToAtLeast(unsigned NewDepth) {
isDepthCurrent = true;
}
-/// setHeightToAtLeast - Update this node's predecessors to reflect the
-/// fact that this node's height just increased.
-///
void SUnit::setHeightToAtLeast(unsigned NewHeight) {
if (NewHeight <= getHeight())
return;
@@ -228,8 +223,7 @@ void SUnit::setHeightToAtLeast(unsigned NewHeight) {
isHeightCurrent = true;
}
-/// ComputeDepth - Calculate the maximal path from the node to the exit.
-///
+/// Calculates the maximal path from the node to the entry.
void SUnit::ComputeDepth() {
SmallVector<SUnit*, 8> WorkList;
WorkList.push_back(this);
@@ -238,12 +232,11 @@ void SUnit::ComputeDepth() {
bool Done = true;
unsigned MaxPredDepth = 0;
- for (SUnit::const_pred_iterator I = Cur->Preds.begin(),
- E = Cur->Preds.end(); I != E; ++I) {
- SUnit *PredSU = I->getSUnit();
+ for (const SDep &PredDep : Cur->Preds) {
+ SUnit *PredSU = PredDep.getSUnit();
if (PredSU->isDepthCurrent)
MaxPredDepth = std::max(MaxPredDepth,
- PredSU->Depth + I->getLatency());
+ PredSU->Depth + PredDep.getLatency());
else {
Done = false;
WorkList.push_back(PredSU);
@@ -261,8 +254,7 @@ void SUnit::ComputeDepth() {
} while (!WorkList.empty());
}
-/// ComputeHeight - Calculate the maximal path from the node to the entry.
-///
+/// Calculates the maximal path from the node to the exit.
void SUnit::ComputeHeight() {
SmallVector<SUnit*, 8> WorkList;
WorkList.push_back(this);
@@ -271,12 +263,11 @@ void SUnit::ComputeHeight() {
bool Done = true;
unsigned MaxSuccHeight = 0;
- for (SUnit::const_succ_iterator I = Cur->Succs.begin(),
- E = Cur->Succs.end(); I != E; ++I) {
- SUnit *SuccSU = I->getSUnit();
+ for (const SDep &SuccDep : Cur->Succs) {
+ SUnit *SuccSU = SuccDep.getSUnit();
if (SuccSU->isHeightCurrent)
MaxSuccHeight = std::max(MaxSuccHeight,
- SuccSU->Height + I->getLatency());
+ SuccSU->Height + SuccDep.getLatency());
else {
Done = false;
WorkList.push_back(SuccSU);
@@ -310,6 +301,7 @@ void SUnit::biasCriticalPath() {
}
#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
+LLVM_DUMP_METHOD
void SUnit::print(raw_ostream &OS, const ScheduleDAG *DAG) const {
if (this == &DAG->ExitSU)
OS << "ExitSU";
@@ -319,15 +311,13 @@ void SUnit::print(raw_ostream &OS, const ScheduleDAG *DAG) const {
OS << "SU(" << NodeNum << ")";
}
-/// SUnit - Scheduling unit. It's an wrapper around either a single SDNode or
-/// a group of nodes flagged together.
-void SUnit::dump(const ScheduleDAG *G) const {
+LLVM_DUMP_METHOD void SUnit::dump(const ScheduleDAG *G) const {
print(dbgs(), G);
dbgs() << ": ";
G->dumpNode(this);
}
-void SUnit::dumpAll(const ScheduleDAG *G) const {
+LLVM_DUMP_METHOD void SUnit::dumpAll(const ScheduleDAG *G) const {
dump(G);
dbgs() << " # preds left : " << NumPredsLeft << "\n";
@@ -343,41 +333,39 @@ void SUnit::dumpAll(const ScheduleDAG *G) const {
if (Preds.size() != 0) {
dbgs() << " Predecessors:\n";
- for (SUnit::const_succ_iterator I = Preds.begin(), E = Preds.end();
- I != E; ++I) {
+    for (const SDep &PredDep : Preds) {
dbgs() << " ";
- switch (I->getKind()) {
+      switch (PredDep.getKind()) {
case SDep::Data: dbgs() << "data "; break;
case SDep::Anti: dbgs() << "anti "; break;
case SDep::Output: dbgs() << "out "; break;
case SDep::Order: dbgs() << "ord "; break;
}
- I->getSUnit()->print(dbgs(), G);
- if (I->isArtificial())
+      PredDep.getSUnit()->print(dbgs(), G);
+      if (PredDep.isArtificial())
dbgs() << " *";
- dbgs() << ": Latency=" << I->getLatency();
- if (I->isAssignedRegDep())
- dbgs() << " Reg=" << PrintReg(I->getReg(), G->TRI);
+ dbgs() << ": Latency=" << SuccDep.getLatency();
+ if (SuccDep.isAssignedRegDep())
+ dbgs() << " Reg=" << PrintReg(SuccDep.getReg(), G->TRI);
dbgs() << "\n";
}
}
if (Succs.size() != 0) {
dbgs() << " Successors:\n";
- for (SUnit::const_succ_iterator I = Succs.begin(), E = Succs.end();
- I != E; ++I) {
+ for (const SDep &SuccDep : Succs) {
dbgs() << " ";
- switch (I->getKind()) {
+ switch (SuccDep.getKind()) {
case SDep::Data: dbgs() << "data "; break;
case SDep::Anti: dbgs() << "anti "; break;
case SDep::Output: dbgs() << "out "; break;
case SDep::Order: dbgs() << "ord "; break;
}
- I->getSUnit()->print(dbgs(), G);
- if (I->isArtificial())
+ SuccDep.getSUnit()->print(dbgs(), G);
+ if (SuccDep.isArtificial())
dbgs() << " *";
- dbgs() << ": Latency=" << I->getLatency();
- if (I->isAssignedRegDep())
- dbgs() << " Reg=" << PrintReg(I->getReg(), G->TRI);
+ dbgs() << ": Latency=" << SuccDep.getLatency();
+ if (SuccDep.isAssignedRegDep())
+ dbgs() << " Reg=" << PrintReg(SuccDep.getReg(), G->TRI);
dbgs() << "\n";
}
}
@@ -385,47 +373,44 @@ void SUnit::dumpAll(const ScheduleDAG *G) const {
#endif
#ifndef NDEBUG
-/// VerifyScheduledDAG - Verify that all SUnits were scheduled and that
-/// their state is consistent. Return the number of scheduled nodes.
-///
unsigned ScheduleDAG::VerifyScheduledDAG(bool isBottomUp) {
bool AnyNotSched = false;
unsigned DeadNodes = 0;
- for (unsigned i = 0, e = SUnits.size(); i != e; ++i) {
- if (!SUnits[i].isScheduled) {
- if (SUnits[i].NumPreds == 0 && SUnits[i].NumSuccs == 0) {
+  for (const SUnit &SU : SUnits) {
+    if (!SU.isScheduled) {
+      if (SU.NumPreds == 0 && SU.NumSuccs == 0) {
++DeadNodes;
continue;
}
if (!AnyNotSched)
dbgs() << "*** Scheduling failed! ***\n";
- SUnits[i].dump(this);
+      SU.dump(this);
dbgs() << "has not been scheduled!\n";
AnyNotSched = true;
}
- if (SUnits[i].isScheduled &&
- (isBottomUp ? SUnits[i].getHeight() : SUnits[i].getDepth()) >
- unsigned(INT_MAX)) {
+    if (SU.isScheduled &&
+        (isBottomUp ? SU.getHeight() : SU.getDepth()) >
+          unsigned(std::numeric_limits<int>::max())) {
if (!AnyNotSched)
dbgs() << "*** Scheduling failed! ***\n";
- SUnits[i].dump(this);
+      SU.dump(this);
dbgs() << "has an unexpected "
<< (isBottomUp ? "Height" : "Depth") << " value!\n";
AnyNotSched = true;
}
if (isBottomUp) {
- if (SUnits[i].NumSuccsLeft != 0) {
+      if (SU.NumSuccsLeft != 0) {
if (!AnyNotSched)
dbgs() << "*** Scheduling failed! ***\n";
- SUnits[i].dump(this);
+        SU.dump(this);
dbgs() << "has successors left!\n";
AnyNotSched = true;
}
} else {
- if (SUnits[i].NumPredsLeft != 0) {
+      if (SU.NumPredsLeft != 0) {
if (!AnyNotSched)
dbgs() << "*** Scheduling failed! ***\n";
- SUnits[i].dump(this);
+      SU.dump(this);
dbgs() << "has predecessors left!\n";
AnyNotSched = true;
}
@@ -436,36 +421,33 @@ unsigned ScheduleDAG::VerifyScheduledDAG(bool isBottomUp) {
}
#endif
-/// InitDAGTopologicalSorting - create the initial topological
-/// ordering from the DAG to be scheduled.
-///
-/// The idea of the algorithm is taken from
-/// "Online algorithms for managing the topological order of
-/// a directed acyclic graph" by David J. Pearce and Paul H.J. Kelly
-/// This is the MNR algorithm, which was first introduced by
-/// A. Marchetti-Spaccamela, U. Nanni and H. Rohnert in
-/// "Maintaining a topological order under edge insertions".
-///
-/// Short description of the algorithm:
-///
-/// Topological ordering, ord, of a DAG maps each node to a topological
-/// index so that for all edges X->Y it is the case that ord(X) < ord(Y).
-///
-/// This means that if there is a path from the node X to the node Z,
-/// then ord(X) < ord(Z).
-///
-/// This property can be used to check for reachability of nodes:
-/// if Z is reachable from X, then an insertion of the edge Z->X would
-/// create a cycle.
-///
-/// The algorithm first computes a topological ordering for the DAG by
-/// initializing the Index2Node and Node2Index arrays and then tries to keep
-/// the ordering up-to-date after edge insertions by reordering the DAG.
-///
-/// On insertion of the edge X->Y, the algorithm first marks by calling DFS
-/// the nodes reachable from Y, and then shifts them using Shift to lie
-/// immediately after X in Index2Node.
void ScheduleDAGTopologicalSort::InitDAGTopologicalSorting() {
+ // The idea of the algorithm is taken from
+ // "Online algorithms for managing the topological order of
+ // a directed acyclic graph" by David J. Pearce and Paul H.J. Kelly
+ // This is the MNR algorithm, which was first introduced by
+ // A. Marchetti-Spaccamela, U. Nanni and H. Rohnert in
+ // "Maintaining a topological order under edge insertions".
+ //
+ // Short description of the algorithm:
+ //
+ // Topological ordering, ord, of a DAG maps each node to a topological
+ // index so that for all edges X->Y it is the case that ord(X) < ord(Y).
+ //
+ // This means that if there is a path from the node X to the node Z,
+ // then ord(X) < ord(Z).
+ //
+ // This property can be used to check for reachability of nodes:
+ // if Z is reachable from X, then an insertion of the edge Z->X would
+ // create a cycle.
+ //
+ // The algorithm first computes a topological ordering for the DAG by
+ // initializing the Index2Node and Node2Index arrays and then tries to keep
+ // the ordering up-to-date after edge insertions by reordering the DAG.
+ //
+ // On insertion of the edge X->Y, the algorithm first marks by calling DFS
+ // the nodes reachable from Y, and then shifts them using Shift to lie
+ // immediately after X in Index2Node.
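+  //
+  // For example (illustrative only): with edges A->B and A->C, the
+  // ordering ord = {A:0, C:1, B:2} is valid. Inserting C->B preserves it,
+  // whereas inserting B->A would require ord(B) < ord(A), contradicting
+  // ord(A) < ord(B), i.e. the new edge would close a cycle.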
unsigned DAGSize = SUnits.size();
std::vector<SUnit*> WorkList;
WorkList.reserve(DAGSize);
@@ -476,18 +458,17 @@ void ScheduleDAGTopologicalSort::InitDAGTopologicalSorting() {
// Initialize the data structures.
if (ExitSU)
WorkList.push_back(ExitSU);
- for (unsigned i = 0, e = DAGSize; i != e; ++i) {
- SUnit *SU = &SUnits[i];
- int NodeNum = SU->NodeNum;
- unsigned Degree = SU->Succs.size();
+ for (SUnit &SU : SUnits) {
+ int NodeNum = SU.NodeNum;
+ unsigned Degree = SU.Succs.size();
// Temporarily use the Node2Index array as scratch space for degree counts.
Node2Index[NodeNum] = Degree;
// Is it a node without dependencies?
if (Degree == 0) {
- assert(SU->Succs.empty() && "SUnit should have no successors");
+ assert(SU.Succs.empty() && "SUnit should have no successors");
// Collect leaf nodes.
- WorkList.push_back(SU);
+ WorkList.push_back(&SU);
}
}
@@ -497,9 +478,8 @@ void ScheduleDAGTopologicalSort::InitDAGTopologicalSorting() {
WorkList.pop_back();
if (SU->NodeNum < DAGSize)
Allocate(SU->NodeNum, --Id);
- for (SUnit::const_pred_iterator I = SU->Preds.begin(), E = SU->Preds.end();
- I != E; ++I) {
- SUnit *SU = I->getSUnit();
+ for (const SDep &PredDep : SU->Preds) {
+ SUnit *SU = PredDep.getSUnit();
if (SU->NodeNum < DAGSize && !--Node2Index[SU->NodeNum])
// If all dependencies of the node are processed already,
// then the node can be computed now.
@@ -511,19 +491,15 @@ void ScheduleDAGTopologicalSort::InitDAGTopologicalSorting() {
#ifndef NDEBUG
// Check correctness of the ordering
- for (unsigned i = 0, e = DAGSize; i != e; ++i) {
- SUnit *SU = &SUnits[i];
- for (SUnit::const_pred_iterator I = SU->Preds.begin(), E = SU->Preds.end();
- I != E; ++I) {
- assert(Node2Index[SU->NodeNum] > Node2Index[I->getSUnit()->NodeNum] &&
+ for (SUnit &SU : SUnits) {
+ for (const SDep &PD : SU.Preds) {
+ assert(Node2Index[SU.NodeNum] > Node2Index[PD.getSUnit()->NodeNum] &&
"Wrong topological sorting");
}
}
#endif
}
-/// AddPred - Updates the topological ordering to accommodate an edge
-/// to be added from SUnit X to SUnit Y.
void ScheduleDAGTopologicalSort::AddPred(SUnit *Y, SUnit *X) {
int UpperBound, LowerBound;
LowerBound = Node2Index[Y->NodeNum];
@@ -540,16 +516,10 @@ void ScheduleDAGTopologicalSort::AddPred(SUnit *Y, SUnit *X) {
}
}
-/// RemovePred - Updates the topological ordering to accommodate an
-/// an edge to be removed from the specified node N from the predecessors
-/// of the current node M.
void ScheduleDAGTopologicalSort::RemovePred(SUnit *M, SUnit *N) {
// InitDAGTopologicalSorting();
}
-/// DFS - Make a DFS traversal to mark all nodes reachable from SU and mark
-/// all nodes affected by the edge insertion. These nodes will later get new
-/// topological indexes by means of the Shift method.
void ScheduleDAGTopologicalSort::DFS(const SUnit *SU, int UpperBound,
bool &HasLoop) {
std::vector<const SUnit*> WorkList;
@@ -560,8 +530,9 @@ void ScheduleDAGTopologicalSort::DFS(const SUnit *SU, int UpperBound,
SU = WorkList.back();
WorkList.pop_back();
Visited.set(SU->NodeNum);
- for (int I = SU->Succs.size()-1; I >= 0; --I) {
- unsigned s = SU->Succs[I].getSUnit()->NodeNum;
+ for (const SDep &SuccDep
+ : make_range(SU->Succs.rbegin(), SU->Succs.rend())) {
+ unsigned s = SuccDep.getSUnit()->NodeNum;
// Edges to non-SUnits are allowed but ignored (e.g. ExitSU).
if (s >= Node2Index.size())
continue;
@@ -571,14 +542,93 @@ void ScheduleDAGTopologicalSort::DFS(const SUnit *SU, int UpperBound,
}
// Visit successors if not already and in affected region.
if (!Visited.test(s) && Node2Index[s] < UpperBound) {
- WorkList.push_back(SU->Succs[I].getSUnit());
+ WorkList.push_back(SuccDep.getSUnit());
}
}
} while (!WorkList.empty());
}
-/// Shift - Renumber the nodes so that the topological ordering is
-/// preserved.
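+/// Returns the vector of node numbers lying on any path from StartSU to
+/// TargetSU (excluding the endpoints themselves), computed by intersecting
+/// a forward walk from StartSU with a backward walk from TargetSU, both
+/// bounded by the nodes' topological indices. Sets Success to false if
+/// TargetSU is not reachable from StartSU.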
+std::vector<int> ScheduleDAGTopologicalSort::GetSubGraph(const SUnit &StartSU,
+ const SUnit &TargetSU,
+ bool &Success) {
+ std::vector<const SUnit*> WorkList;
+ int LowerBound = Node2Index[StartSU.NodeNum];
+ int UpperBound = Node2Index[TargetSU.NodeNum];
+ bool Found = false;
+ BitVector VisitedBack;
+ std::vector<int> Nodes;
+
+ if (LowerBound > UpperBound) {
+ Success = false;
+ return Nodes;
+ }
+
+ WorkList.reserve(SUnits.size());
+ Visited.reset();
+
+ // Starting from StartSU, visit all successors up
+ // to UpperBound.
+ WorkList.push_back(&StartSU);
+ do {
+ const SUnit *SU = WorkList.back();
+ WorkList.pop_back();
+ for (int I = SU->Succs.size()-1; I >= 0; --I) {
+ const SUnit *Succ = SU->Succs[I].getSUnit();
+ unsigned s = Succ->NodeNum;
+ // Edges to non-SUnits are allowed but ignored (e.g. ExitSU).
+ if (Succ->isBoundaryNode())
+ continue;
+ if (Node2Index[s] == UpperBound) {
+ Found = true;
+ continue;
+ }
+ // Visit successors if not already and in affected region.
+ if (!Visited.test(s) && Node2Index[s] < UpperBound) {
+ Visited.set(s);
+ WorkList.push_back(Succ);
+ }
+ }
+ } while (!WorkList.empty());
+
+ if (!Found) {
+ Success = false;
+ return Nodes;
+ }
+
+ WorkList.clear();
+ VisitedBack.resize(SUnits.size());
+ Found = false;
+
+ // Starting from TargetSU, visit all predecessors up
+ // to LowerBound. SUs that are visited by the two
+ // passes are added to Nodes.
+ WorkList.push_back(&TargetSU);
+ do {
+ const SUnit *SU = WorkList.back();
+ WorkList.pop_back();
+ for (int I = SU->Preds.size()-1; I >= 0; --I) {
+ const SUnit *Pred = SU->Preds[I].getSUnit();
+ unsigned s = Pred->NodeNum;
+ // Edges to non-SUnits are allowed but ignored (e.g. EntrySU).
+ if (Pred->isBoundaryNode())
+ continue;
+ if (Node2Index[s] == LowerBound) {
+ Found = true;
+ continue;
+ }
+ if (!VisitedBack.test(s) && Visited.test(s)) {
+ VisitedBack.set(s);
+ WorkList.push_back(Pred);
+ Nodes.push_back(s);
+ }
+ }
+ } while (!WorkList.empty());
+
+ assert(Found && "Error in SUnit Graph!");
+ Success = true;
+ return Nodes;
+}
+
void ScheduleDAGTopologicalSort::Shift(BitVector& Visited, int LowerBound,
int UpperBound) {
std::vector<int> L;
@@ -598,28 +648,23 @@ void ScheduleDAGTopologicalSort::Shift(BitVector& Visited, int LowerBound,
}
}
- for (unsigned j = 0; j < L.size(); ++j) {
- Allocate(L[j], i - shift);
+ for (unsigned LI : L) {
+ Allocate(LI, i - shift);
i = i + 1;
}
}
-
-/// WillCreateCycle - Returns true if adding an edge to TargetSU from SU will
-/// create a cycle. If so, it is not safe to call AddPred(TargetSU, SU).
bool ScheduleDAGTopologicalSort::WillCreateCycle(SUnit *TargetSU, SUnit *SU) {
// Is SU reachable from TargetSU via successor edges?
if (IsReachable(SU, TargetSU))
return true;
- for (SUnit::pred_iterator
- I = TargetSU->Preds.begin(), E = TargetSU->Preds.end(); I != E; ++I)
- if (I->isAssignedRegDep() &&
- IsReachable(SU, I->getSUnit()))
+ for (const SDep &PredDep : TargetSU->Preds)
+ if (PredDep.isAssignedRegDep() &&
+ IsReachable(SU, PredDep.getSUnit()))
return true;
return false;
}
-/// IsReachable - Checks if SU is reachable from TargetSU.
bool ScheduleDAGTopologicalSort::IsReachable(const SUnit *SU,
const SUnit *TargetSU) {
// If insertion of the edge SU->TargetSU would create a cycle
@@ -637,7 +682,6 @@ bool ScheduleDAGTopologicalSort::IsReachable(const SUnit *SU,
return HasLoop;
}
-/// Allocate - assign the topological index to the node n.
void ScheduleDAGTopologicalSort::Allocate(int n, int index) {
Node2Index[n] = index;
Index2Node[index] = n;
@@ -647,4 +691,4 @@ ScheduleDAGTopologicalSort::
ScheduleDAGTopologicalSort(std::vector<SUnit> &sunits, SUnit *exitsu)
: SUnits(sunits), ExitSU(exitsu) {}
-ScheduleHazardRecognizer::~ScheduleHazardRecognizer() {}
+ScheduleHazardRecognizer::~ScheduleHazardRecognizer() = default;
diff --git a/lib/CodeGen/ScheduleDAGInstrs.cpp b/lib/CodeGen/ScheduleDAGInstrs.cpp
index 611c5a71bd5a..18823b74c47f 100644
--- a/lib/CodeGen/ScheduleDAGInstrs.cpp
+++ b/lib/CodeGen/ScheduleDAGInstrs.cpp
@@ -7,8 +7,8 @@
//
//===----------------------------------------------------------------------===//
//
-// This implements the ScheduleDAGInstrs class, which implements re-scheduling
-// of MachineInstrs.
+/// \file This implements the ScheduleDAGInstrs class, which handles
+/// re-scheduling of MachineInstrs.
//
//===----------------------------------------------------------------------===//
@@ -101,8 +101,8 @@ ScheduleDAGInstrs::ScheduleDAGInstrs(MachineFunction &mf,
SchedModel.init(ST.getSchedModel(), &ST, TII);
}
-/// getUnderlyingObjectFromInt - This is the function that does the work of
-/// looking through basic ptrtoint+arithmetic+inttoptr sequences.
+/// This is the function that does the work of looking through basic
+/// ptrtoint+arithmetic+inttoptr sequences.
static const Value *getUnderlyingObjectFromInt(const Value *V) {
do {
if (const Operator *U = dyn_cast<Operator>(V)) {
@@ -129,8 +129,8 @@ static const Value *getUnderlyingObjectFromInt(const Value *V) {
} while (1);
}
-/// getUnderlyingObjects - This is a wrapper around GetUnderlyingObjects
-/// and adds support for basic ptrtoint+arithmetic+inttoptr sequences.
+/// This is a wrapper around GetUnderlyingObjects and adds support for basic
+/// ptrtoint+arithmetic+inttoptr sequences.
static void getUnderlyingObjects(const Value *V,
SmallVectorImpl<Value *> &Objects,
const DataLayout &DL) {
@@ -158,9 +158,8 @@ static void getUnderlyingObjects(const Value *V,
} while (!Working.empty());
}
-/// getUnderlyingObjectsForInstr - If this machine instr has memory reference
-/// information and it can be tracked to a normal reference to a known
-/// object, return the Value for that object.
+/// If this machine instr has memory reference information and it can be tracked
+/// to a normal reference to a known object, return the Value for that object.
static void getUnderlyingObjectsForInstr(const MachineInstr *MI,
const MachineFrameInfo &MFI,
UnderlyingObjectsVector &Objects,
@@ -216,10 +215,6 @@ void ScheduleDAGInstrs::finishBlock() {
BB = nullptr;
}
-/// Initialize the DAG and common scheduler state for the current scheduling
-/// region. This does not actually create the DAG, only clears it. The
-/// scheduling driver may call BuildSchedGraph multiple times per scheduling
-/// region.
void ScheduleDAGInstrs::enterRegion(MachineBasicBlock *bb,
MachineBasicBlock::iterator begin,
MachineBasicBlock::iterator end,
@@ -230,20 +225,10 @@ void ScheduleDAGInstrs::enterRegion(MachineBasicBlock *bb,
NumRegionInstrs = regioninstrs;
}
-/// Close the current scheduling region. Don't clear any state in case the
-/// driver wants to refer to the previous scheduling region.
void ScheduleDAGInstrs::exitRegion() {
// Nothing to do.
}
-/// addSchedBarrierDeps - Add dependencies from instructions in the current
-/// list of instructions being scheduled to scheduling barrier by adding
-/// the exit SU to the register defs and use list. This is because we want to
-/// make sure instructions which define registers that are either used by
-/// the terminator or are live-out are properly scheduled. This is
-/// especially important when the definition latency of the return value(s)
-/// are too high to be hidden by the branch or when the liveout registers
-/// used by instructions in the fallthrough block.
void ScheduleDAGInstrs::addSchedBarrierDeps() {
MachineInstr *ExitMI = RegionEnd != BB->end() ? &*RegionEnd : nullptr;
ExitSU.setInstr(ExitMI);
@@ -271,7 +256,7 @@ void ScheduleDAGInstrs::addSchedBarrierDeps() {
}
}
-/// MO is an operand of SU's instruction that defines a physical register. Add
+/// MO is an operand of SU's instruction that defines a physical register. Adds
/// data dependencies from SU to any uses of the physical register.
void ScheduleDAGInstrs::addPhysRegDataDeps(SUnit *SU, unsigned OperIdx) {
const MachineOperand &MO = SU->getInstr()->getOperand(OperIdx);
@@ -313,9 +298,9 @@ void ScheduleDAGInstrs::addPhysRegDataDeps(SUnit *SU, unsigned OperIdx) {
}
}
-/// addPhysRegDeps - Add register dependencies (data, anti, and output) from
-/// this SUnit to following instructions in the same scheduling region that
-/// depend the physical register referenced at OperIdx.
+/// \brief Adds register dependencies (data, anti, and output) from this SUnit
+/// to following instructions in the same scheduling region that depend on the
+/// physical register referenced at OperIdx.
void ScheduleDAGInstrs::addPhysRegDeps(SUnit *SU, unsigned OperIdx) {
MachineInstr *MI = SU->getInstr();
MachineOperand &MO = MI->getOperand(OperIdx);
@@ -406,9 +391,9 @@ LaneBitmask ScheduleDAGInstrs::getLaneMaskForMO(const MachineOperand &MO) const
return TRI->getSubRegIndexLaneMask(SubReg);
}
-/// addVRegDefDeps - Add register output and data dependencies from this SUnit
-/// to instructions that occur later in the same scheduling region if they read
-/// from or write to the virtual register defined at OperIdx.
+/// Adds register output and data dependencies from this SUnit to instructions
+/// that occur later in the same scheduling region if they read from or write to
+/// the virtual register defined at OperIdx.
///
/// TODO: Hoist loop induction variable increments. This has to be
/// reevaluated. Generally, IV scheduling should be done before coalescing.
@@ -515,10 +500,10 @@ void ScheduleDAGInstrs::addVRegDefDeps(SUnit *SU, unsigned OperIdx) {
CurrentVRegDefs.insert(VReg2SUnit(Reg, LaneMask, SU));
}
-/// addVRegUseDeps - Add a register data dependency if the instruction that
-/// defines the virtual register used at OperIdx is mapped to an SUnit. Add a
-/// register antidependency from this SUnit to instructions that occur later in
-/// the same scheduling region if they write the virtual register.
+/// \brief Adds a register data dependency if the instruction that defines the
+/// virtual register used at OperIdx is mapped to an SUnit. Adds a register
+/// antidependency from this SUnit to instructions that occur later in the same
+/// scheduling region if they write the virtual register.
///
/// TODO: Handle ExitSU "uses" properly.
void ScheduleDAGInstrs::addVRegUseDeps(SUnit *SU, unsigned OperIdx) {
@@ -545,87 +530,25 @@ void ScheduleDAGInstrs::addVRegUseDeps(SUnit *SU, unsigned OperIdx) {
}
}
-/// Return true if MI is an instruction we are unable to reason about
+/// Returns true if MI is an instruction we are unable to reason about
/// (like a call or something with unmodeled side effects).
static inline bool isGlobalMemoryObject(AliasAnalysis *AA, MachineInstr *MI) {
return MI->isCall() || MI->hasUnmodeledSideEffects() ||
(MI->hasOrderedMemoryRef() && !MI->isDereferenceableInvariantLoad(AA));
}
-/// This returns true if the two MIs need a chain edge between them.
-/// This is called on normal stores and loads.
-static bool MIsNeedChainEdge(AliasAnalysis *AA, const MachineFrameInfo *MFI,
- const DataLayout &DL, MachineInstr *MIa,
- MachineInstr *MIb) {
- const MachineFunction *MF = MIa->getParent()->getParent();
- const TargetInstrInfo *TII = MF->getSubtarget().getInstrInfo();
-
- assert ((MIa->mayStore() || MIb->mayStore()) &&
- "Dependency checked between two loads");
-
- // Let the target decide if memory accesses cannot possibly overlap.
- if (TII->areMemAccessesTriviallyDisjoint(*MIa, *MIb, AA))
- return false;
-
- // To this point analysis is generic. From here on we do need AA.
- if (!AA)
- return true;
-
- // FIXME: Need to handle multiple memory operands to support all targets.
- if (!MIa->hasOneMemOperand() || !MIb->hasOneMemOperand())
- return true;
-
- MachineMemOperand *MMOa = *MIa->memoperands_begin();
- MachineMemOperand *MMOb = *MIb->memoperands_begin();
-
- if (!MMOa->getValue() || !MMOb->getValue())
- return true;
-
- // The following interface to AA is fashioned after DAGCombiner::isAlias
- // and operates with MachineMemOperand offset with some important
- // assumptions:
- // - LLVM fundamentally assumes flat address spaces.
- // - MachineOperand offset can *only* result from legalization and
- // cannot affect queries other than the trivial case of overlap
- // checking.
- // - These offsets never wrap and never step outside
- // of allocated objects.
- // - There should never be any negative offsets here.
- //
- // FIXME: Modify API to hide this math from "user"
- // FIXME: Even before we go to AA we can reason locally about some
- // memory objects. It can save compile time, and possibly catch some
- // corner cases not currently covered.
-
- assert ((MMOa->getOffset() >= 0) && "Negative MachineMemOperand offset");
- assert ((MMOb->getOffset() >= 0) && "Negative MachineMemOperand offset");
-
- int64_t MinOffset = std::min(MMOa->getOffset(), MMOb->getOffset());
- int64_t Overlapa = MMOa->getSize() + MMOa->getOffset() - MinOffset;
- int64_t Overlapb = MMOb->getSize() + MMOb->getOffset() - MinOffset;
-
- AliasResult AAResult =
- AA->alias(MemoryLocation(MMOa->getValue(), Overlapa,
- UseTBAA ? MMOa->getAAInfo() : AAMDNodes()),
- MemoryLocation(MMOb->getValue(), Overlapb,
- UseTBAA ? MMOb->getAAInfo() : AAMDNodes()));
-
- return (AAResult != NoAlias);
-}
-
-/// Check whether two objects need a chain edge and add it if needed.
void ScheduleDAGInstrs::addChainDependency (SUnit *SUa, SUnit *SUb,
unsigned Latency) {
- if (MIsNeedChainEdge(AAForDep, &MFI, MF.getDataLayout(), SUa->getInstr(),
- SUb->getInstr())) {
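+  // MachineInstr::mayAlias performs the target's trivially-disjoint check
+  // followed by the AA-based memory-operand overlap query.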
+ if (SUa->getInstr()->mayAlias(AAForDep, *SUb->getInstr(), UseTBAA)) {
SDep Dep(SUa, SDep::MayAliasMem);
Dep.setLatency(Latency);
SUb->addPred(Dep);
}
}
-/// Create an SUnit for each real instruction, numbered in top-down topological
-/// order. The instruction order A < B, implies that no edge exists from B to A.
+/// \brief Creates an SUnit for each real instruction, numbered in top-down
+/// topological order. The instruction order A < B implies that no edge exists
+/// from B to A.
///
/// Map each real instruction to its SUnit.
///
@@ -682,14 +605,13 @@ void ScheduleDAGInstrs::initSUnits() {
}
class ScheduleDAGInstrs::Value2SUsMap : public MapVector<ValueType, SUList> {
-
/// Current total number of SUs in map.
unsigned NumNodes;
/// 1 for loads, 0 for stores. (see comment in SUList)
unsigned TrueMemOrderLatency;
-public:
+public:
Value2SUsMap(unsigned lat = 0) : NumNodes(0), TrueMemOrderLatency(lat) {}
/// To keep NumNodes up to date, insert() is used instead of
@@ -697,8 +619,8 @@ public:
ValueType &operator[](const SUList &Key) {
llvm_unreachable("Don't use. Use insert() instead."); };
- /// Add SU to the SUList of V. If Map grows huge, reduce its size
- /// by calling reduce().
+ /// Adds SU to the SUList of V. If Map grows huge, reduce its size by calling
+ /// reduce().
void inline insert(SUnit *SU, ValueType V) {
MapVector::operator[](V).push_back(SU);
NumNodes++;
@@ -723,7 +645,7 @@ public:
unsigned inline size() const { return NumNodes; }
- /// Count the number of SUs in this map after a reduction.
+ /// Counts the number of SUs in this map after a reduction.
void reComputeSize(void) {
NumNodes = 0;
for (auto &I : *this)
@@ -797,9 +719,6 @@ void ScheduleDAGInstrs::insertBarrierChain(Value2SUsMap &map) {
map.reComputeSize();
}
-/// If RegPressure is non-null, compute register pressure as a side effect. The
-/// DAG builder is an efficient place to do it because it already visits
-/// operands.
void ScheduleDAGInstrs::buildSchedGraph(AliasAnalysis *AA,
RegPressureTracker *RPTracker,
PressureDiffs *PDiffs,
@@ -1088,10 +1007,6 @@ void ScheduleDAGInstrs::Value2SUsMap::dump() {
}
}
-/// Reduce maps in FIFO order, by N SUs. This is better than turning
-/// every Nth memory SU into BarrierChain in buildSchedGraph(), since
-/// it avoids unnecessary edges between seen SUs above the new
-/// BarrierChain, and those below it.
void ScheduleDAGInstrs::reduceHugeMemNodeMaps(Value2SUsMap &stores,
Value2SUsMap &loads, unsigned N) {
DEBUG(dbgs() << "Before reduction:\nStoring SUnits:\n";
@@ -1142,7 +1057,6 @@ void ScheduleDAGInstrs::reduceHugeMemNodeMaps(Value2SUsMap &stores,
loads.dump());
}
-/// \brief Initialize register live-range state for updating kills.
void ScheduleDAGInstrs::startBlockForKills(MachineBasicBlock *BB) {
// Start with no live registers.
LiveRegs.reset();
@@ -1178,32 +1092,35 @@ static void toggleBundleKillFlag(MachineInstr *MI, unsigned Reg,
if ((--End)->addRegisterKilled(Reg, TRI, /* addIfNotFound= */ false))
return;
} else
- (--End)->clearRegisterKills(Reg, TRI);
+ (--End)->clearRegisterKills(Reg, TRI);
}
}
-bool ScheduleDAGInstrs::toggleKillFlag(MachineInstr *MI, MachineOperand &MO) {
+void ScheduleDAGInstrs::toggleKillFlag(MachineInstr &MI, MachineOperand &MO) {
+ if (MO.isDebug())
+ return;
+
// Setting kill flag...
if (!MO.isKill()) {
MO.setIsKill(true);
- toggleBundleKillFlag(MI, MO.getReg(), true, TRI);
- return false;
+ toggleBundleKillFlag(&MI, MO.getReg(), true, TRI);
+ return;
}
// If MO itself is live, clear the kill flag...
if (LiveRegs.test(MO.getReg())) {
MO.setIsKill(false);
- toggleBundleKillFlag(MI, MO.getReg(), false, TRI);
- return false;
+ toggleBundleKillFlag(&MI, MO.getReg(), false, TRI);
+ return;
}
// If any subreg of MO is live, then create an imp-def for that
// subreg and keep MO marked as killed.
MO.setIsKill(false);
- toggleBundleKillFlag(MI, MO.getReg(), false, TRI);
+ toggleBundleKillFlag(&MI, MO.getReg(), false, TRI);
bool AllDead = true;
const unsigned SuperReg = MO.getReg();
- MachineInstrBuilder MIB(MF, MI);
+ MachineInstrBuilder MIB(MF, &MI);
for (MCSubRegIterator SubRegs(SuperReg, TRI); SubRegs.isValid(); ++SubRegs) {
if (LiveRegs.test(*SubRegs)) {
MIB.addReg(*SubRegs, RegState::ImplicitDefine);
@@ -1213,13 +1130,12 @@ bool ScheduleDAGInstrs::toggleKillFlag(MachineInstr *MI, MachineOperand &MO) {
if(AllDead) {
MO.setIsKill(true);
- toggleBundleKillFlag(MI, MO.getReg(), true, TRI);
+ toggleBundleKillFlag(&MI, MO.getReg(), true, TRI);
}
- return false;
}
-// FIXME: Reuse the LivePhysRegs utility for this.
void ScheduleDAGInstrs::fixupKills(MachineBasicBlock *MBB) {
+ // FIXME: Reuse the LivePhysRegs utility for this.
DEBUG(dbgs() << "Fixup kills for BB#" << MBB->getNumber() << '\n');
LiveRegs.resize(TRI->getNumRegs());
@@ -1289,7 +1205,7 @@ void ScheduleDAGInstrs::fixupKills(MachineBasicBlock *MBB) {
if (MO.isKill() != kill) {
DEBUG(dbgs() << "Fixing " << MO << " in ");
- toggleKillFlag(&MI, MO);
+ toggleKillFlag(MI, MO);
DEBUG(MI.dump());
DEBUG({
if (MI.getOpcode() == TargetOpcode::BUNDLE) {
@@ -1319,6 +1235,7 @@ void ScheduleDAGInstrs::fixupKills(MachineBasicBlock *MBB) {
}
void ScheduleDAGInstrs::dumpNode(const SUnit *SU) const {
+ // Cannot completely remove virtual function even in release mode.
#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
SU->getInstr()->dump();
#endif
@@ -1347,7 +1264,7 @@ std::string ScheduleDAGInstrs::getDAGName() const {
//===----------------------------------------------------------------------===//
namespace llvm {
-/// \brief Internal state used to compute SchedDFSResult.
+/// Internal state used to compute SchedDFSResult.
class SchedDFSImpl {
SchedDFSResult &R;
@@ -1358,8 +1275,8 @@ class SchedDFSImpl {
struct RootData {
unsigned NodeID;
- unsigned ParentNodeID; // Parent node (member of the parent subtree).
- unsigned SubInstrCount; // Instr count in this tree only, not children.
+ unsigned ParentNodeID; ///< Parent node (member of the parent subtree).
+ unsigned SubInstrCount; ///< Instr count in this tree only, not children.
RootData(unsigned id): NodeID(id),
ParentNodeID(SchedDFSResult::InvalidSubtreeID),
@@ -1375,7 +1292,7 @@ public:
RootSet.setUniverse(R.DFSNodeData.size());
}
- /// Return true if this node been visited by the DFS traversal.
+ /// Returns true if this node has been visited by the DFS traversal.
///
/// During visitPostorderNode the Node's SubtreeID is assigned to the Node
/// ID. Later, SubtreeID is updated but remains valid.
@@ -1384,7 +1301,7 @@ public:
!= SchedDFSResult::InvalidSubtreeID;
}
- /// Initialize this node's instruction count. We don't need to flag the node
+ /// Initializes this node's instruction count. We don't need to flag the node
/// visited until visitPostorder because the DAG cannot have cycles.
void visitPreorder(const SUnit *SU) {
R.DFSNodeData[SU->NodeNum].InstrCount =
@@ -1433,8 +1350,8 @@ public:
RootSet[SU->NodeNum] = RData;
}
- /// Called once for each tree edge after calling visitPostOrderNode on the
- /// predecessor. Increment the parent node's instruction count and
+ /// \brief Called once for each tree edge after calling visitPostOrderNode on
+ /// the predecessor. Increments the parent node's instruction count and
/// preemptively joins this subtree to its parent's if it is small enough.
void visitPostorderEdge(const SDep &PredDep, const SUnit *Succ) {
R.DFSNodeData[Succ->NodeNum].InstrCount
@@ -1442,13 +1359,13 @@ public:
joinPredSubtree(PredDep, Succ);
}
- /// Add a connection for cross edges.
+ /// Adds a connection for cross edges.
void visitCrossEdge(const SDep &PredDep, const SUnit *Succ) {
ConnectionPairs.push_back(std::make_pair(PredDep.getSUnit(), Succ));
}
- /// Set each node's subtree ID to the representative ID and record connections
- /// between trees.
+ /// Sets each node's subtree ID to the representative ID and records
+ /// connections between trees.
void finalize() {
SubtreeClasses.compress();
R.DFSTreeData.resize(SubtreeClasses.getNumClasses());
@@ -1484,8 +1401,8 @@ public:
}
protected:
- /// Join the predecessor subtree with the successor that is its DFS
- /// parent. Apply some heuristics before joining.
+ /// Joins the predecessor subtree with the successor that is its DFS parent.
+ /// Applies some heuristics before joining.
bool joinPredSubtree(const SDep &PredDep, const SUnit *Succ,
bool CheckLimit = true) {
assert(PredDep.getKind() == SDep::Data && "Subtrees are for data edges");
@@ -1531,10 +1448,10 @@ protected:
} while (FromTree != SchedDFSResult::InvalidSubtreeID);
}
};
-} // namespace llvm
+} // end namespace llvm
namespace {
-/// \brief Manage the stack used by a reverse depth-first search over the DAG.
+/// Manages the stack used by a reverse depth-first search over the DAG.
class SchedDAGReverseDFS {
std::vector<std::pair<const SUnit*, SUnit::const_pred_iterator> > DFSStack;
public:
@@ -1569,7 +1486,7 @@ static bool hasDataSucc(const SUnit *SU) {
return false;
}
-/// Compute an ILP metric for all nodes in the subDAG reachable via depth-first
+/// Computes an ILP metric for all nodes in the subDAG reachable via depth-first
/// search from this root.
void SchedDFSResult::compute(ArrayRef<SUnit> SUnits) {
if (!IsBottomUp)
@@ -1626,8 +1543,8 @@ void SchedDFSResult::scheduleTree(unsigned SubtreeID) {
}
}
-LLVM_DUMP_METHOD
-void ILPValue::print(raw_ostream &OS) const {
+#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
+LLVM_DUMP_METHOD void ILPValue::print(raw_ostream &OS) const {
OS << InstrCount << " / " << Length << " = ";
if (!Length)
OS << "BADILP";
@@ -1635,8 +1552,7 @@ void ILPValue::print(raw_ostream &OS) const {
OS << format("%g", ((double)InstrCount / Length));
}
-LLVM_DUMP_METHOD
-void ILPValue::dump() const {
+LLVM_DUMP_METHOD void ILPValue::dump() const {
dbgs() << *this << '\n';
}
@@ -1648,4 +1564,5 @@ raw_ostream &operator<<(raw_ostream &OS, const ILPValue &Val) {
return OS;
}
-} // namespace llvm
+} // end namespace llvm
+#endif
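
For reference, the ILP metric printed above is simply the subtree's instruction count over its critical-path length, with a zero-length path reported as BADILP. A tiny self-contained mirror of that printing logic:

    #include <cstdio>

    // The metric ILPValue prints: instructions per unit of critical-path
    // length, with a zero-length path reported as BADILP.
    struct ILP {
      unsigned InstrCount, Length;
      void print() const {
        if (!Length)
          std::printf("%u / %u = BADILP\n", InstrCount, Length);
        else
          std::printf("%u / %u = %g\n", InstrCount, Length,
                      (double)InstrCount / Length);
      }
    };

    int main() {
      ILP{8, 4}.print(); // 8 / 4 = 2
      ILP{5, 0}.print(); // 5 / 0 = BADILP
    }
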
diff --git a/lib/CodeGen/ScoreboardHazardRecognizer.cpp b/lib/CodeGen/ScoreboardHazardRecognizer.cpp
index 83bc1ba7beb9..b3d83d5313af 100644
--- a/lib/CodeGen/ScoreboardHazardRecognizer.cpp
+++ b/lib/CodeGen/ScoreboardHazardRecognizer.cpp
@@ -1,4 +1,4 @@
-//===----- ScoreboardHazardRecognizer.cpp - Scheduler Support -------------===//
+//===- ScoreboardHazardRecognizer.cpp - Scheduler Support -----------------===//
//
// The LLVM Compiler Infrastructure
//
@@ -15,11 +15,13 @@
#include "llvm/CodeGen/ScoreboardHazardRecognizer.h"
#include "llvm/CodeGen/ScheduleDAG.h"
+#include "llvm/MC/MCInstrDesc.h"
#include "llvm/MC/MCInstrItineraries.h"
+#include "llvm/Support/Compiler.h"
#include "llvm/Support/Debug.h"
-#include "llvm/Support/ErrorHandling.h"
#include "llvm/Support/raw_ostream.h"
#include "llvm/Target/TargetInstrInfo.h"
+#include <cassert>
using namespace llvm;
@@ -29,8 +31,7 @@ ScoreboardHazardRecognizer::ScoreboardHazardRecognizer(
const InstrItineraryData *II, const ScheduleDAG *SchedDAG,
const char *ParentDebugType)
: ScheduleHazardRecognizer(), DebugType(ParentDebugType), ItinData(II),
- DAG(SchedDAG), IssueWidth(0), IssueCount(0) {
-
+ DAG(SchedDAG) {
// Determine the maximum depth of any itinerary. This determines the depth of
// the scoreboard. We always make the scoreboard at least 1 cycle deep to
// avoid dealing with the boundary condition.
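
The constructor keeps the scoreboard at least one cycle deep to sidestep that boundary condition, and (in the code following this hunk) rounds the depth up to a power of two so the per-cycle index can be masked rather than taken modulo. A simplified sketch of such a cyclic scoreboard; the class shape is illustrative, not the actual recognizer:

    #include <cstdint>
    #include <vector>

    // Cyclic scoreboard: one bitmask of reserved functional units per cycle.
    class MiniScoreboard {
      std::vector<uint32_t> Rows; // size is a power of two
      unsigned Head = 0;          // index of the current cycle

    public:
      explicit MiniScoreboard(unsigned MinDepth) {
        unsigned Depth = 1; // always at least 1 cycle deep
        while (Depth < MinDepth)
          Depth <<= 1; // power of two, so indexing can use a mask
        Rows.assign(Depth, 0);
      }
      uint32_t &operator[](unsigned Cycle) {
        return Rows[(Head + Cycle) & (Rows.size() - 1)];
      }
      void advance() { // retire the current cycle and rotate
        (*this)[0] = 0;
        Head = (Head + 1) & (Rows.size() - 1);
      }
    };
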
diff --git a/lib/CodeGen/SelectionDAG/DAGCombiner.cpp b/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
index 2c7bffe76503..4d468551ae24 100644
--- a/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
+++ b/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
@@ -53,10 +53,6 @@ STATISTIC(SlicedLoads, "Number of load sliced");
namespace {
static cl::opt<bool>
- CombinerAA("combiner-alias-analysis", cl::Hidden,
- cl::desc("Enable DAG combiner alias-analysis heuristics"));
-
- static cl::opt<bool>
CombinerGlobalAA("combiner-global-alias-analysis", cl::Hidden,
cl::desc("Enable DAG combiner's use of IR alias analysis"));
@@ -133,6 +129,9 @@ namespace {
/// Add to the worklist making sure its instance is at the back (next to be
/// processed).
void AddToWorklist(SDNode *N) {
+ assert(N->getOpcode() != ISD::DELETED_NODE &&
+ "Deleted Node added to Worklist");
+
// Skip handle nodes as they can't usefully be combined and confuse the
// zero-use deletion strategy.
if (N->getOpcode() == ISD::HANDLENODE)
@@ -177,6 +176,7 @@ namespace {
void CommitTargetLoweringOpt(const TargetLowering::TargetLoweringOpt &TLO);
private:
+ unsigned MaximumLegalStoreInBits;
/// Check the specified integer node value to see if it can be simplified or
/// if things it uses can be simplified by bit propagation.
@@ -232,9 +232,12 @@ namespace {
SDValue visitTokenFactor(SDNode *N);
SDValue visitMERGE_VALUES(SDNode *N);
SDValue visitADD(SDNode *N);
+ SDValue visitADDLike(SDValue N0, SDValue N1, SDNode *LocReference);
SDValue visitSUB(SDNode *N);
SDValue visitADDC(SDNode *N);
+ SDValue visitUADDO(SDNode *N);
SDValue visitSUBC(SDNode *N);
+ SDValue visitUSUBO(SDNode *N);
SDValue visitADDE(SDNode *N);
SDValue visitSUBE(SDNode *N);
SDValue visitMUL(SDNode *N);
@@ -259,6 +262,7 @@ namespace {
SDValue visitSRA(SDNode *N);
SDValue visitSRL(SDNode *N);
SDValue visitRotate(SDNode *N);
+ SDValue visitABS(SDNode *N);
SDValue visitBSWAP(SDNode *N);
SDValue visitBITREVERSE(SDNode *N);
SDValue visitCTLZ(SDNode *N);
@@ -274,6 +278,7 @@ namespace {
SDValue visitSIGN_EXTEND(SDNode *N);
SDValue visitZERO_EXTEND(SDNode *N);
SDValue visitANY_EXTEND(SDNode *N);
+ SDValue visitAssertZext(SDNode *N);
SDValue visitSIGN_EXTEND_INREG(SDNode *N);
SDValue visitSIGN_EXTEND_VECTOR_INREG(SDNode *N);
SDValue visitZERO_EXTEND_VECTOR_INREG(SDNode *N);
@@ -336,6 +341,7 @@ namespace {
SDValue visitShiftByConstant(SDNode *N, ConstantSDNode *Amt);
SDValue foldSelectOfConstants(SDNode *N);
+ SDValue foldBinOpIntoSelect(SDNode *BO);
bool SimplifySelectOps(SDNode *SELECT, SDValue LHS, SDValue RHS);
SDValue SimplifyBinOpWithSameOpcodeHands(SDNode *N);
SDValue SimplifySelect(const SDLoc &DL, SDValue N0, SDValue N1, SDValue N2);
@@ -344,6 +350,8 @@ namespace {
bool NotExtCompare = false);
SDValue foldSelectCCToShiftAnd(const SDLoc &DL, SDValue N0, SDValue N1,
SDValue N2, SDValue N3, ISD::CondCode CC);
+ SDValue foldLogicOfSetCCs(bool IsAnd, SDValue N0, SDValue N1,
+ const SDLoc &DL);
SDValue SimplifySetCC(EVT VT, SDValue N0, SDValue N1, ISD::CondCode Cond,
const SDLoc &DL, bool foldBooleans = true);
@@ -377,6 +385,7 @@ namespace {
unsigned PosOpcode, unsigned NegOpcode,
const SDLoc &DL);
SDNode *MatchRotate(SDValue LHS, SDValue RHS, const SDLoc &DL);
+ SDValue MatchLoadCombine(SDNode *N);
SDValue ReduceLoadWidth(SDNode *N);
SDValue ReduceLoadOpStoreWidth(SDNode *N);
SDValue splitMergedValStore(StoreSDNode *ST);
@@ -384,9 +393,9 @@ namespace {
SDValue reduceBuildVecExtToExtBuildVec(SDNode *N);
SDValue reduceBuildVecConvertToConvertBuildVec(SDNode *N);
SDValue reduceBuildVecToShuffle(SDNode *N);
- SDValue createBuildVecShuffle(SDLoc DL, SDNode *N, ArrayRef<int> VectorMask,
- SDValue VecIn1, SDValue VecIn2,
- unsigned LeftIdx);
+ SDValue createBuildVecShuffle(const SDLoc &DL, SDNode *N,
+ ArrayRef<int> VectorMask, SDValue VecIn1,
+ SDValue VecIn2, unsigned LeftIdx);
SDValue GetDemandedBits(SDValue V, const APInt &Mask);
@@ -416,15 +425,12 @@ namespace {
/// Holds a pointer to an LSBaseSDNode as well as information on where it
/// is located in a sequence of memory operations connected by a chain.
struct MemOpLink {
- MemOpLink (LSBaseSDNode *N, int64_t Offset, unsigned Seq):
- MemNode(N), OffsetFromBase(Offset), SequenceNum(Seq) { }
+ MemOpLink(LSBaseSDNode *N, int64_t Offset)
+ : MemNode(N), OffsetFromBase(Offset) {}
// Ptr to the mem node.
LSBaseSDNode *MemNode;
// Offset from the base ptr.
int64_t OffsetFromBase;
- // What is the sequence number of this mem node.
- // Lowest mem operand in the DAG starts at zero.
- unsigned SequenceNum;
};
/// This is a helper function for visitMUL to check the profitability
@@ -435,12 +441,6 @@ namespace {
SDValue &AddNode,
SDValue &ConstNode);
- /// This is a helper function for MergeStoresOfConstantsOrVecElts. Returns a
- /// constant build_vector of the stored constant values in Stores.
- SDValue getMergedConstantVectorStore(SelectionDAG &DAG, const SDLoc &SL,
- ArrayRef<MemOpLink> Stores,
- SmallVectorImpl<SDValue> &Chains,
- EVT Ty) const;
/// This is a helper function for visitAND and visitZERO_EXTEND. Returns
/// true if the (and (load x) c) pattern matches an extload. ExtVT returns
@@ -451,34 +451,35 @@ namespace {
EVT LoadResultTy, EVT &ExtVT, EVT &LoadedVT,
bool &NarrowLoad);
+ /// Helper function for MergeConsecutiveStores which merges the
+ /// component store chains.
+ SDValue getMergeStoreChains(SmallVectorImpl<MemOpLink> &StoreNodes,
+ unsigned NumStores);
+
/// This is a helper function for MergeConsecutiveStores. When the source
/// elements of the consecutive stores are all constants or all extracted
/// vector elements, try to merge them into one larger store.
- /// \return number of stores that were merged into a merged store (always
- /// a prefix of \p StoreNode).
- bool MergeStoresOfConstantsOrVecElts(
- SmallVectorImpl<MemOpLink> &StoreNodes, EVT MemVT, unsigned NumStores,
- bool IsConstantSrc, bool UseVector);
+ /// \return True if a merged store was created.
+ bool MergeStoresOfConstantsOrVecElts(SmallVectorImpl<MemOpLink> &StoreNodes,
+ EVT MemVT, unsigned NumStores,
+ bool IsConstantSrc, bool UseVector);
/// This is a helper function for MergeConsecutiveStores.
/// Stores that may be merged are placed in StoreNodes.
- /// Loads that may alias with those stores are placed in AliasLoadNodes.
- void getStoreMergeAndAliasCandidates(
- StoreSDNode* St, SmallVectorImpl<MemOpLink> &StoreNodes,
- SmallVectorImpl<LSBaseSDNode*> &AliasLoadNodes);
+ void getStoreMergeCandidates(StoreSDNode *St,
+ SmallVectorImpl<MemOpLink> &StoreNodes);
/// Helper function for MergeConsecutiveStores. Checks if
/// Candidate stores have indirect dependency through their
/// operands. \return True if safe to merge
bool checkMergeStoreCandidatesForDependencies(
- SmallVectorImpl<MemOpLink> &StoreNodes);
+ SmallVectorImpl<MemOpLink> &StoreNodes, unsigned NumStores);
/// Merge consecutive store operations into a wide store.
/// This optimization uses wide integers or vectors when possible.
/// \return True if stores were merged.
- bool MergeConsecutiveStores(StoreSDNode *N,
- SmallVectorImpl<MemOpLink> &StoreNodes);
+ bool MergeConsecutiveStores(StoreSDNode *N);
/// \brief Try to transform a truncation where C is a constant:
/// (trunc (and X, C)) -> (and (trunc X), (trunc C))
@@ -493,6 +494,13 @@ namespace {
: DAG(D), TLI(D.getTargetLoweringInfo()), Level(BeforeLegalizeTypes),
OptLevel(OL), LegalOperations(false), LegalTypes(false), AA(A) {
ForCodeSize = DAG.getMachineFunction().getFunction()->optForSize();
+
+ MaximumLegalStoreInBits = 0;
+ for (MVT VT : MVT::all_valuetypes())
+ if (EVT(VT).isSimple() && VT != MVT::Other &&
+ TLI.isTypeLegal(EVT(VT)) &&
+ VT.getSizeInBits() >= MaximumLegalStoreInBits)
+ MaximumLegalStoreInBits = VT.getSizeInBits();
}
/// Runs the dag combiner on all nodes in the work list
@@ -607,10 +615,16 @@ static char isNegatibleForFree(SDValue Op, bool LegalOperations,
switch (Op.getOpcode()) {
default: return false;
- case ISD::ConstantFP:
- // Don't invert constant FP values after legalize. The negated constant
- // isn't necessarily legal.
- return LegalOperations ? 0 : 1;
+ case ISD::ConstantFP: {
+ if (!LegalOperations)
+ return 1;
+
+ // Don't invert constant FP values after legalization unless the target says
+ // the negated constant is legal.
+ EVT VT = Op.getValueType();
+ return TLI.isOperationLegal(ISD::ConstantFP, VT) ||
+ TLI.isFPImmLegal(neg(cast<ConstantFPSDNode>(Op)->getValueAPF()), VT);
+ }
case ISD::FADD:
// FIXME: determine better conditions for this xform.
if (!Options->UnsafeFPMath) return 0;
@@ -629,7 +643,8 @@ static char isNegatibleForFree(SDValue Op, bool LegalOperations,
Depth + 1);
case ISD::FSUB:
// We can't turn -(A-B) into B-A when we honor signed zeros.
- if (!Options->UnsafeFPMath && !Op.getNode()->getFlags()->hasNoSignedZeros())
+ if (!Options->NoSignedZerosFPMath &&
+ !Op.getNode()->getFlags()->hasNoSignedZeros())
return 0;
// fold (fneg (fsub A, B)) -> (fsub B, A)
@@ -1079,37 +1094,36 @@ SDValue DAGCombiner::PromoteIntBinOp(SDValue Op) {
if (TLI.IsDesirableToPromoteOp(Op, PVT)) {
assert(PVT != VT && "Don't know what type to promote to!");
+ DEBUG(dbgs() << "\nPromoting "; Op.getNode()->dump(&DAG));
+
bool Replace0 = false;
SDValue N0 = Op.getOperand(0);
SDValue NN0 = PromoteOperand(N0, PVT, Replace0);
- if (!NN0.getNode())
- return SDValue();
bool Replace1 = false;
SDValue N1 = Op.getOperand(1);
- SDValue NN1;
- if (N0 == N1)
- NN1 = NN0;
- else {
- NN1 = PromoteOperand(N1, PVT, Replace1);
- if (!NN1.getNode())
- return SDValue();
- }
+ SDValue NN1 = PromoteOperand(N1, PVT, Replace1);
+ SDLoc DL(Op);
- AddToWorklist(NN0.getNode());
- if (NN1.getNode())
- AddToWorklist(NN1.getNode());
+ SDValue RV =
+ DAG.getNode(ISD::TRUNCATE, DL, VT, DAG.getNode(Opc, DL, PVT, NN0, NN1));
- if (Replace0)
+ // Now replace instances of N0 and N1 with the promoted values.
+ if (Replace0 && N0 && N0.getOpcode() != ISD::DELETED_NODE && NN0 &&
+ NN0.getOpcode() != ISD::DELETED_NODE) {
+ AddToWorklist(NN0.getNode());
ReplaceLoadWithPromotedLoad(N0.getNode(), NN0.getNode());
- if (Replace1)
+ }
+
+ if (Replace1 && N1 && N1.getOpcode() != ISD::DELETED_NODE && NN1 &&
+ NN1.getOpcode() != ISD::DELETED_NODE) {
+ AddToWorklist(NN1.getNode());
ReplaceLoadWithPromotedLoad(N1.getNode(), NN1.getNode());
+ }
- DEBUG(dbgs() << "\nPromoting ";
- Op.getNode()->dump(&DAG));
- SDLoc DL(Op);
- return DAG.getNode(ISD::TRUNCATE, DL, VT,
- DAG.getNode(Opc, DL, PVT, NN0, NN1));
+ // Deal with Op being deleted.
+ if (Op && Op.getOpcode() != ISD::DELETED_NODE)
+ return RV;
}
return SDValue();
}
@@ -1137,26 +1151,32 @@ SDValue DAGCombiner::PromoteIntShiftOp(SDValue Op) {
if (TLI.IsDesirableToPromoteOp(Op, PVT)) {
assert(PVT != VT && "Don't know what type to promote to!");
+ DEBUG(dbgs() << "\nPromoting "; Op.getNode()->dump(&DAG));
+
bool Replace = false;
SDValue N0 = Op.getOperand(0);
+ SDValue N1 = Op.getOperand(1);
if (Opc == ISD::SRA)
- N0 = SExtPromoteOperand(Op.getOperand(0), PVT);
+ N0 = SExtPromoteOperand(N0, PVT);
else if (Opc == ISD::SRL)
- N0 = ZExtPromoteOperand(Op.getOperand(0), PVT);
+ N0 = ZExtPromoteOperand(N0, PVT);
else
N0 = PromoteOperand(N0, PVT, Replace);
+
if (!N0.getNode())
return SDValue();
+ SDLoc DL(Op);
+ SDValue RV =
+ DAG.getNode(ISD::TRUNCATE, DL, VT, DAG.getNode(Opc, DL, PVT, N0, N1));
+
AddToWorklist(N0.getNode());
if (Replace)
ReplaceLoadWithPromotedLoad(Op.getOperand(0).getNode(), N0.getNode());
- DEBUG(dbgs() << "\nPromoting ";
- Op.getNode()->dump(&DAG));
- SDLoc DL(Op);
- return DAG.getNode(ISD::TRUNCATE, DL, VT,
- DAG.getNode(Opc, DL, PVT, N0, Op.getOperand(1)));
+ // Deal with Op being deleted.
+ if (Op && Op.getOpcode() != ISD::DELETED_NODE)
+ return RV;
}
return SDValue();
}
@@ -1361,8 +1381,7 @@ void DAGCombiner::Run(CombineLevel AtLevel) {
else {
assert(N->getValueType(0) == RV.getValueType() &&
N->getNumValues() == 1 && "Type mismatch");
- SDValue OpV = RV;
- DAG.ReplaceAllUsesWith(N, &OpV);
+ DAG.ReplaceAllUsesWith(N, &RV);
}
// Push the new node and any users onto the worklist
@@ -1389,7 +1408,9 @@ SDValue DAGCombiner::visit(SDNode *N) {
case ISD::ADD: return visitADD(N);
case ISD::SUB: return visitSUB(N);
case ISD::ADDC: return visitADDC(N);
+ case ISD::UADDO: return visitUADDO(N);
case ISD::SUBC: return visitSUBC(N);
+ case ISD::USUBO: return visitUSUBO(N);
case ISD::ADDE: return visitADDE(N);
case ISD::SUBE: return visitSUBE(N);
case ISD::MUL: return visitMUL(N);
@@ -1415,6 +1436,7 @@ SDValue DAGCombiner::visit(SDNode *N) {
case ISD::SRL: return visitSRL(N);
case ISD::ROTR:
case ISD::ROTL: return visitRotate(N);
+ case ISD::ABS: return visitABS(N);
case ISD::BSWAP: return visitBSWAP(N);
case ISD::BITREVERSE: return visitBITREVERSE(N);
case ISD::CTLZ: return visitCTLZ(N);
@@ -1430,6 +1452,7 @@ SDValue DAGCombiner::visit(SDNode *N) {
case ISD::SIGN_EXTEND: return visitSIGN_EXTEND(N);
case ISD::ZERO_EXTEND: return visitZERO_EXTEND(N);
case ISD::ANY_EXTEND: return visitANY_EXTEND(N);
+ case ISD::AssertZext: return visitAssertZext(N);
case ISD::SIGN_EXTEND_INREG: return visitSIGN_EXTEND_INREG(N);
case ISD::SIGN_EXTEND_VECTOR_INREG: return visitSIGN_EXTEND_VECTOR_INREG(N);
case ISD::ZERO_EXTEND_VECTOR_INREG: return visitZERO_EXTEND_VECTOR_INREG(N);
@@ -1574,7 +1597,7 @@ SDValue DAGCombiner::visitTokenFactor(SDNode *N) {
}
SmallVector<SDNode *, 8> TFs; // List of token factors to visit.
- SmallVector<SDValue, 8> Ops; // Ops for replacing token factor.
+ SmallVector<SDValue, 8> Ops; // Ops for replacing token factor.
SmallPtrSet<SDNode*, 16> SeenOps;
bool Changed = false; // If we should replace this token factor.
@@ -1618,6 +1641,86 @@ SDValue DAGCombiner::visitTokenFactor(SDNode *N) {
}
}
+ // Remove nodes that are chained to another node in the list. Do so by
+ // walking up chains breadth-first, stopping when we've seen another
+ // operand. In general we must climb to the EntryNode, but we can exit
+ // early if we find all remaining work is associated with just one operand,
+ // as no further pruning is possible.
+
+ // List of nodes to search through and original Ops from which they originate.
+ SmallVector<std::pair<SDNode *, unsigned>, 8> Worklist;
+ SmallVector<unsigned, 8> OpWorkCount; // Count of work for each Op.
+ SmallPtrSet<SDNode *, 16> SeenChains;
+ bool DidPruneOps = false;
+
+ unsigned NumLeftToConsider = 0;
+ for (const SDValue &Op : Ops) {
+ Worklist.push_back(std::make_pair(Op.getNode(), NumLeftToConsider++));
+ OpWorkCount.push_back(1);
+ }
+
+ auto AddToWorklist = [&](unsigned CurIdx, SDNode *Op, unsigned OpNumber) {
+ // If this is an Op, we can remove the op from the list. Re-mark any
+ // search associated with it as coming from the current OpNumber.
+ if (SeenOps.count(Op) != 0) {
+ Changed = true;
+ DidPruneOps = true;
+ unsigned OrigOpNumber = 0;
+ while (OrigOpNumber < Ops.size() && Ops[OrigOpNumber].getNode() != Op)
+ OrigOpNumber++;
+ assert((OrigOpNumber != Ops.size()) &&
+ "expected to find TokenFactor Operand");
+ // Re-mark worklist from OrigOpNumber to OpNumber
+ for (unsigned i = CurIdx + 1; i < Worklist.size(); ++i) {
+ if (Worklist[i].second == OrigOpNumber) {
+ Worklist[i].second = OpNumber;
+ }
+ }
+ OpWorkCount[OpNumber] += OpWorkCount[OrigOpNumber];
+ OpWorkCount[OrigOpNumber] = 0;
+ NumLeftToConsider--;
+ }
+ // Add if it's a new chain
+ if (SeenChains.insert(Op).second) {
+ OpWorkCount[OpNumber]++;
+ Worklist.push_back(std::make_pair(Op, OpNumber));
+ }
+ };
+
+ for (unsigned i = 0; i < Worklist.size() && i < 1024; ++i) {
+ // We need to consider at least 2 Ops to do any pruning.
+ if (NumLeftToConsider <= 1)
+ break;
+ auto CurNode = Worklist[i].first;
+ auto CurOpNumber = Worklist[i].second;
+ assert((OpWorkCount[CurOpNumber] > 0) &&
+ "Node should not appear in worklist");
+ switch (CurNode->getOpcode()) {
+ case ISD::EntryToken:
+ // Hitting EntryToken is the only way for the search to terminate without
+ // hitting another operand's search. Prevents us from marking this operand
+ // as considered.
+ NumLeftToConsider++;
+ break;
+ case ISD::TokenFactor:
+ for (const SDValue &Op : CurNode->op_values())
+ AddToWorklist(i, Op.getNode(), CurOpNumber);
+ break;
+ case ISD::CopyFromReg:
+ case ISD::CopyToReg:
+ AddToWorklist(i, CurNode->getOperand(0).getNode(), CurOpNumber);
+ break;
+ default:
+ if (auto *MemNode = dyn_cast<MemSDNode>(CurNode))
+ AddToWorklist(i, MemNode->getChain().getNode(), CurOpNumber);
+ break;
+ }
+ OpWorkCount[CurOpNumber]--;
+ if (OpWorkCount[CurOpNumber] == 0)
+ NumLeftToConsider--;
+ }
+
SDValue Result;
// If we've changed things around then replace token factor.
@@ -1626,15 +1729,22 @@ SDValue DAGCombiner::visitTokenFactor(SDNode *N) {
// The entry token is the only possible outcome.
Result = DAG.getEntryNode();
} else {
- // New and improved token factor.
- Result = DAG.getNode(ISD::TokenFactor, SDLoc(N), MVT::Other, Ops);
+ if (DidPruneOps) {
+ SmallVector<SDValue, 8> PrunedOps;
+ // Keep only the ops whose chains were not reached from another op.
+ for (const SDValue &Op : Ops) {
+ if (SeenChains.count(Op.getNode()) == 0)
+ PrunedOps.push_back(Op);
+ }
+ Result = DAG.getNode(ISD::TokenFactor, SDLoc(N), MVT::Other, PrunedOps);
+ } else {
+ Result = DAG.getNode(ISD::TokenFactor, SDLoc(N), MVT::Other, Ops);
+ }
}
- // Add users to worklist if AA is enabled, since it may introduce
- // a lot of new chained token factors while removing memory deps.
- bool UseAA = CombinerAA.getNumOccurrences() > 0 ? CombinerAA
- : DAG.getSubtarget().useAA();
- return CombineTo(N, Result, UseAA /*add to worklist*/);
+ // Add users to worklist, since we may introduce a lot of new
+ // chained token factors while removing memory deps.
+ return CombineTo(N, Result, true /*add to worklist*/);
}
return Result;
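
The pruning loop added above walks chains upward from every TokenFactor operand and drops any operand that another operand's chain already reaches. Stripped of the worklist-sharing and early-exit machinery, the idea reduces to a reachability test over chain edges; a sketch with a hypothetical Node type:

    #include <cstdio>
    #include <set>
    #include <vector>

    struct Node {
      std::vector<Node *> ChainOps; // nodes this node is chained after
    };

    // Drop every operand of a token factor that is reachable by walking
    // chains upward from a different operand.
    std::vector<Node *> pruneTokenFactorOps(const std::vector<Node *> &Ops) {
      std::set<Node *> OpSet(Ops.begin(), Ops.end());
      std::set<Node *> Seen, Redundant;
      for (Node *Op : Ops) {
        std::vector<Node *> Worklist(Op->ChainOps);
        for (size_t I = 0; I < Worklist.size(); ++I) {
          Node *N = Worklist[I];
          if (OpSet.count(N))
            Redundant.insert(N); // some other op's chain covers N
          if (Seen.insert(N).second) // expand each node only once
            Worklist.insert(Worklist.end(), N->ChainOps.begin(),
                            N->ChainOps.end());
        }
      }
      std::vector<Node *> Pruned;
      for (Node *Op : Ops)
        if (!Redundant.count(Op))
          Pruned.push_back(Op);
      return Pruned;
    }

    int main() {
      Node A, B{{&A}}; // B is chained after A
      std::printf("%zu\n", pruneTokenFactorOps({&A, &B}).size()); // 1: A pruned
    }
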
@@ -1664,6 +1774,60 @@ static ConstantSDNode *getAsNonOpaqueConstant(SDValue N) {
return Const != nullptr && !Const->isOpaque() ? Const : nullptr;
}
+SDValue DAGCombiner::foldBinOpIntoSelect(SDNode *BO) {
+ auto BinOpcode = BO->getOpcode();
+ assert((BinOpcode == ISD::ADD || BinOpcode == ISD::SUB ||
+ BinOpcode == ISD::MUL || BinOpcode == ISD::SDIV ||
+ BinOpcode == ISD::UDIV || BinOpcode == ISD::SREM ||
+ BinOpcode == ISD::UREM || BinOpcode == ISD::AND ||
+ BinOpcode == ISD::OR || BinOpcode == ISD::XOR ||
+ BinOpcode == ISD::SHL || BinOpcode == ISD::SRL ||
+ BinOpcode == ISD::SRA || BinOpcode == ISD::FADD ||
+ BinOpcode == ISD::FSUB || BinOpcode == ISD::FMUL ||
+ BinOpcode == ISD::FDIV || BinOpcode == ISD::FREM) &&
+ "Unexpected binary operator");
+
+ // Bail out if any constants are opaque because we can't constant fold those.
+ SDValue C1 = BO->getOperand(1);
+ if (!isConstantOrConstantVector(C1, true) &&
+ !isConstantFPBuildVectorOrConstantFP(C1))
+ return SDValue();
+
+ // Don't do this unless the old select is going away. We want to eliminate the
+ // binary operator, not replace a binop with a select.
+ // TODO: Handle ISD::SELECT_CC.
+ SDValue Sel = BO->getOperand(0);
+ if (Sel.getOpcode() != ISD::SELECT || !Sel.hasOneUse())
+ return SDValue();
+
+ SDValue CT = Sel.getOperand(1);
+ if (!isConstantOrConstantVector(CT, true) &&
+ !isConstantFPBuildVectorOrConstantFP(CT))
+ return SDValue();
+
+ SDValue CF = Sel.getOperand(2);
+ if (!isConstantOrConstantVector(CF, true) &&
+ !isConstantFPBuildVectorOrConstantFP(CF))
+ return SDValue();
+
+ // We have a select-of-constants followed by a binary operator with a
+ // constant. Eliminate the binop by pulling the constant math into the select.
+ // Example: add (select Cond, CT, CF), C1 --> select Cond, CT + C1, CF + C1
+ EVT VT = Sel.getValueType();
+ SDLoc DL(Sel);
+ SDValue NewCT = DAG.getNode(BinOpcode, DL, VT, CT, C1);
+ assert((NewCT.isUndef() || isConstantOrConstantVector(NewCT) ||
+ isConstantFPBuildVectorOrConstantFP(NewCT)) &&
+ "Failed to constant fold a binop with constant operands");
+
+ SDValue NewCF = DAG.getNode(BinOpcode, DL, VT, CF, C1);
+ assert((NewCF.isUndef() || isConstantOrConstantVector(NewCF) ||
+ isConstantFPBuildVectorOrConstantFP(NewCF)) &&
+ "Failed to constant fold a binop with constant operands");
+
+ return DAG.getSelect(DL, VT, Sel.getOperand(0), NewCT, NewCF);
+}
+
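
foldBinOpIntoSelect trades a binop feeding off a one-use select-of-constants for a select of two folded constants, e.g. add (select Cond, 3, 5), 1 becomes select Cond, 4, 6, and the binop disappears. The equivalence, spelled out at the source level (the constants are arbitrary):

    #include <cassert>

    // add (select Cond, CT, CF), C1  -->  select Cond, CT + C1, CF + C1
    static int before(bool Cond) { return (Cond ? 3 : 5) + 1; }
    static int after(bool Cond) { return Cond ? 4 : 6; } // arms pre-folded

    int main() {
      for (bool C : {false, true})
        assert(before(C) == after(C));
    }
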
SDValue DAGCombiner::visitADD(SDNode *N) {
SDValue N0 = N->getOperand(0);
SDValue N1 = N->getOperand(1);
@@ -1712,6 +1876,9 @@ SDValue DAGCombiner::visitADD(SDNode *N) {
}
}
+ if (SDValue NewSel = foldBinOpIntoSelect(N))
+ return NewSel;
+
// reassociate add
if (SDValue RADD = ReassociateOps(ISD::ADD, DL, N0, N1))
return RADD;
@@ -1774,6 +1941,19 @@ SDValue DAGCombiner::visitADD(SDNode *N) {
VT.isInteger() && DAG.haveNoCommonBitsSet(N0, N1))
return DAG.getNode(ISD::OR, DL, VT, N0, N1);
+ if (SDValue Combined = visitADDLike(N0, N1, N))
+ return Combined;
+
+ if (SDValue Combined = visitADDLike(N1, N0, N))
+ return Combined;
+
+ return SDValue();
+}
+
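
Factoring visitADD's commutative folds into visitADDLike, called once per operand order, lets the mirrored copy of each pattern (such as the deleted shl-of-sub case below) go away. The identity behind that particular fold, checked on wrapping unsigned arithmetic:

    #include <cassert>
    #include <cstdint>

    // add x, shl(0 - y, n)  ==  sub x, shl(y, n)   (mod 2^32)
    int main() {
      for (uint32_t x : {0u, 7u, 0xFFFFFFFFu})
        for (uint32_t y : {1u, 13u, 0x80000000u})
          for (unsigned n : {0u, 3u, 31u})
            assert(x + ((0u - y) << n) == x - (y << n));
    }
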
+SDValue DAGCombiner::visitADDLike(SDValue N0, SDValue N1, SDNode *LocReference) {
+ EVT VT = N0.getValueType();
+ SDLoc DL(LocReference);
+
// fold (add x, shl(0 - y, n)) -> sub(x, shl(y, n))
if (N1.getOpcode() == ISD::SHL && N1.getOperand(0).getOpcode() == ISD::SUB &&
isNullConstantOrNullSplatConstant(N1.getOperand(0).getOperand(0)))
@@ -1781,12 +1961,6 @@ SDValue DAGCombiner::visitADD(SDNode *N) {
DAG.getNode(ISD::SHL, DL, VT,
N1.getOperand(0).getOperand(1),
N1.getOperand(1)));
- if (N0.getOpcode() == ISD::SHL && N0.getOperand(0).getOpcode() == ISD::SUB &&
- isNullConstantOrNullSplatConstant(N0.getOperand(0).getOperand(0)))
- return DAG.getNode(ISD::SUB, DL, VT, N1,
- DAG.getNode(ISD::SHL, DL, VT,
- N0.getOperand(0).getOperand(1),
- N0.getOperand(1)));
if (N1.getOpcode() == ISD::AND) {
SDValue AndOp0 = N1.getOperand(0);
@@ -1797,7 +1971,7 @@ SDValue DAGCombiner::visitADD(SDNode *N) {
// and similar xforms where the inner op is either ~0 or 0.
if (NumSignBits == DestBits &&
isOneConstantOrOneSplatConstant(N1->getOperand(1)))
- return DAG.getNode(ISD::SUB, DL, VT, N->getOperand(0), AndOp0);
+ return DAG.getNode(ISD::SUB, DL, VT, N0, AndOp0);
}
// add (sext i1), X -> sub X, (zext i1)
@@ -1825,39 +1999,61 @@ SDValue DAGCombiner::visitADDC(SDNode *N) {
SDValue N0 = N->getOperand(0);
SDValue N1 = N->getOperand(1);
EVT VT = N0.getValueType();
+ SDLoc DL(N);
// If the flag result is dead, turn this into an ADD.
if (!N->hasAnyUseOfValue(1))
- return CombineTo(N, DAG.getNode(ISD::ADD, SDLoc(N), VT, N0, N1),
- DAG.getNode(ISD::CARRY_FALSE,
- SDLoc(N), MVT::Glue));
+ return CombineTo(N, DAG.getNode(ISD::ADD, DL, VT, N0, N1),
+ DAG.getNode(ISD::CARRY_FALSE, DL, MVT::Glue));
// canonicalize constant to RHS.
ConstantSDNode *N0C = dyn_cast<ConstantSDNode>(N0);
ConstantSDNode *N1C = dyn_cast<ConstantSDNode>(N1);
if (N0C && !N1C)
- return DAG.getNode(ISD::ADDC, SDLoc(N), N->getVTList(), N1, N0);
+ return DAG.getNode(ISD::ADDC, DL, N->getVTList(), N1, N0);
// fold (addc x, 0) -> x + no carry out
if (isNullConstant(N1))
return CombineTo(N, N0, DAG.getNode(ISD::CARRY_FALSE,
- SDLoc(N), MVT::Glue));
+ DL, MVT::Glue));
- // fold (addc a, b) -> (or a, b), CARRY_FALSE iff a and b share no bits.
- APInt LHSZero, LHSOne;
- APInt RHSZero, RHSOne;
- DAG.computeKnownBits(N0, LHSZero, LHSOne);
+ // If it cannot overflow, transform into an add.
+ if (DAG.computeOverflowKind(N0, N1) == SelectionDAG::OFK_Never)
+ return CombineTo(N, DAG.getNode(ISD::ADD, DL, VT, N0, N1),
+ DAG.getNode(ISD::CARRY_FALSE, DL, MVT::Glue));
- if (LHSZero.getBoolValue()) {
- DAG.computeKnownBits(N1, RHSZero, RHSOne);
+ return SDValue();
+}
- // If all possibly-set bits on the LHS are clear on the RHS, return an OR.
- // If all possibly-set bits on the RHS are clear on the LHS, return an OR.
- if ((RHSZero & ~LHSZero) == ~LHSZero || (LHSZero & ~RHSZero) == ~RHSZero)
- return CombineTo(N, DAG.getNode(ISD::OR, SDLoc(N), VT, N0, N1),
- DAG.getNode(ISD::CARRY_FALSE,
- SDLoc(N), MVT::Glue));
- }
+SDValue DAGCombiner::visitUADDO(SDNode *N) {
+ SDValue N0 = N->getOperand(0);
+ SDValue N1 = N->getOperand(1);
+ EVT VT = N0.getValueType();
+ if (VT.isVector())
+ return SDValue();
+
+ EVT CarryVT = N->getValueType(1);
+ SDLoc DL(N);
+
+ // If the flag result is dead, turn this into an ADD.
+ if (!N->hasAnyUseOfValue(1))
+ return CombineTo(N, DAG.getNode(ISD::ADD, DL, VT, N0, N1),
+ DAG.getUNDEF(CarryVT));
+
+ // canonicalize constant to RHS.
+ ConstantSDNode *N0C = dyn_cast<ConstantSDNode>(N0);
+ ConstantSDNode *N1C = dyn_cast<ConstantSDNode>(N1);
+ if (N0C && !N1C)
+ return DAG.getNode(ISD::UADDO, DL, N->getVTList(), N1, N0);
+
+ // fold (uaddo x, 0) -> x + no carry out
+ if (isNullConstant(N1))
+ return CombineTo(N, N0, DAG.getConstant(0, DL, CarryVT));
+
+ // If it cannot overflow, transform into an add.
+ if (DAG.computeOverflowKind(N0, N1) == SelectionDAG::OFK_Never)
+ return CombineTo(N, DAG.getNode(ISD::ADD, DL, VT, N0, N1),
+ DAG.getConstant(0, DL, CarryVT));
return SDValue();
}
@@ -1920,6 +2116,9 @@ SDValue DAGCombiner::visitSUB(SDNode *N) {
N1.getNode());
}
+ if (SDValue NewSel = foldBinOpIntoSelect(N))
+ return NewSel;
+
ConstantSDNode *N1C = getAsNonOpaqueConstant(N1);
// fold (sub x, c) -> (add x, -c)
@@ -2066,6 +2265,38 @@ SDValue DAGCombiner::visitSUBC(SDNode *N) {
return SDValue();
}
+SDValue DAGCombiner::visitUSUBO(SDNode *N) {
+ SDValue N0 = N->getOperand(0);
+ SDValue N1 = N->getOperand(1);
+ EVT VT = N0.getValueType();
+ if (VT.isVector())
+ return SDValue();
+
+ EVT CarryVT = N->getValueType(1);
+ SDLoc DL(N);
+
+ // If the flag result is dead, turn this into a SUB.
+ if (!N->hasAnyUseOfValue(1))
+ return CombineTo(N, DAG.getNode(ISD::SUB, DL, VT, N0, N1),
+ DAG.getUNDEF(CarryVT));
+
+ // fold (usubo x, x) -> 0 + no borrow
+ if (N0 == N1)
+ return CombineTo(N, DAG.getConstant(0, DL, VT),
+ DAG.getConstant(0, DL, CarryVT));
+
+ // fold (usubo x, 0) -> x + no borrow
+ if (isNullConstant(N1))
+ return CombineTo(N, N0, DAG.getConstant(0, DL, CarryVT));
+
+ // Canonicalize (usubo -1, x) -> ~x, i.e. (xor x, -1) + no borrow
+ if (isAllOnesConstant(N0))
+ return CombineTo(N, DAG.getNode(ISD::XOR, DL, VT, N1, N0),
+ DAG.getConstant(0, DL, CarryVT));
+
+ return SDValue();
+}
+
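
The new visitUADDO/visitUSUBO folds lean on plain unsigned wrap-around semantics: x+0 never carries, x-x and x-0 never borrow, and -1 - x never borrows and equals ~x (hence the xor rewrite). A quick source-level check using the GCC/Clang overflow builtins:

    #include <cassert>
    #include <cstdint>

    int main() {
      for (uint32_t x : {0u, 1u, 0x7FFFFFFFu, 0xFFFFFFFFu}) {
        uint32_t r;
        assert(!__builtin_add_overflow(x, 0u, &r) && r == x); // uaddo x, 0
        assert(!__builtin_sub_overflow(x, x, &r) && r == 0u); // usubo x, x
        assert(!__builtin_sub_overflow(x, 0u, &r) && r == x); // usubo x, 0
        // usubo -1, x: never borrows, and the result is ~x (xor x, -1).
        assert(!__builtin_sub_overflow(0xFFFFFFFFu, x, &r) &&
               r == (x ^ 0xFFFFFFFFu));
      }
    }
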
SDValue DAGCombiner::visitSUBE(SDNode *N) {
SDValue N0 = N->getOperand(0);
SDValue N1 = N->getOperand(1);
@@ -2131,6 +2362,10 @@ SDValue DAGCombiner::visitMUL(SDNode *N) {
// fold (mul x, 1) -> x
if (N1IsConst && ConstValue1 == 1 && IsFullSplat)
return N0;
+
+ if (SDValue NewSel = foldBinOpIntoSelect(N))
+ return NewSel;
+
// fold (mul x, -1) -> 0-x
if (N1IsConst && ConstValue1.isAllOnesValue()) {
SDLoc DL(N);
@@ -2297,6 +2532,23 @@ SDValue DAGCombiner::useDivRem(SDNode *Node) {
return combined;
}
+static SDValue simplifyDivRem(SDNode *N, SelectionDAG &DAG) {
+ SDValue N0 = N->getOperand(0);
+ SDValue N1 = N->getOperand(1);
+ EVT VT = N->getValueType(0);
+ SDLoc DL(N);
+
+ if (DAG.isUndef(N->getOpcode(), {N0, N1}))
+ return DAG.getUNDEF(VT);
+
+ // undef / X -> 0
+ // undef % X -> 0
+ if (N0.isUndef())
+ return DAG.getConstant(0, DL, VT);
+
+ return SDValue();
+}
+
SDValue DAGCombiner::visitSDIV(SDNode *N) {
SDValue N0 = N->getOperand(0);
SDValue N1 = N->getOperand(1);
@@ -2319,8 +2571,13 @@ SDValue DAGCombiner::visitSDIV(SDNode *N) {
return N0;
// fold (sdiv X, -1) -> 0-X
if (N1C && N1C->isAllOnesValue())
- return DAG.getNode(ISD::SUB, DL, VT,
- DAG.getConstant(0, DL, VT), N0);
+ return DAG.getNode(ISD::SUB, DL, VT, DAG.getConstant(0, DL, VT), N0);
+
+ if (SDValue V = simplifyDivRem(N, DAG))
+ return V;
+
+ if (SDValue NewSel = foldBinOpIntoSelect(N))
+ return NewSel;
// If we know the sign bits of both operands are zero, strength reduce to a
// udiv instead. Handles (X&15) /s 4 -> X&15 >> 2
@@ -2372,7 +2629,7 @@ SDValue DAGCombiner::visitSDIV(SDNode *N) {
// If integer divide is expensive and we satisfy the requirements, emit an
// alternate sequence. Targets may check function attributes for size/speed
// trade-offs.
- AttributeSet Attr = DAG.getMachineFunction().getFunction()->getAttributes();
+ AttributeList Attr = DAG.getMachineFunction().getFunction()->getAttributes();
if (N1C && !TLI.isIntDivCheap(N->getValueType(0), Attr))
if (SDValue Op = BuildSDIV(N))
return Op;
@@ -2384,13 +2641,6 @@ SDValue DAGCombiner::visitSDIV(SDNode *N) {
if (SDValue DivRem = useDivRem(N))
return DivRem;
- // undef / X -> 0
- if (N0.isUndef())
- return DAG.getConstant(0, DL, VT);
- // X / undef -> undef
- if (N1.isUndef())
- return N1;
-
return SDValue();
}
@@ -2414,6 +2664,12 @@ SDValue DAGCombiner::visitUDIV(SDNode *N) {
N0C, N1C))
return Folded;
+ if (SDValue V = simplifyDivRem(N, DAG))
+ return V;
+
+ if (SDValue NewSel = foldBinOpIntoSelect(N))
+ return NewSel;
+
// fold (udiv x, (1 << c)) -> x >>u c
if (isConstantOrConstantVector(N1, /*NoOpaques*/ true) &&
DAG.isKnownToBeAPowerOfTwo(N1)) {
@@ -2444,7 +2700,7 @@ SDValue DAGCombiner::visitUDIV(SDNode *N) {
}
// fold (udiv x, c) -> alternate
- AttributeSet Attr = DAG.getMachineFunction().getFunction()->getAttributes();
+ AttributeList Attr = DAG.getMachineFunction().getFunction()->getAttributes();
if (N1C && !TLI.isIntDivCheap(N->getValueType(0), Attr))
if (SDValue Op = BuildUDIV(N))
return Op;
@@ -2456,13 +2712,6 @@ SDValue DAGCombiner::visitUDIV(SDNode *N) {
if (SDValue DivRem = useDivRem(N))
return DivRem;
- // undef / X -> 0
- if (N0.isUndef())
- return DAG.getConstant(0, DL, VT);
- // X / undef -> undef
- if (N1.isUndef())
- return N1;
-
return SDValue();
}
@@ -2482,32 +2731,35 @@ SDValue DAGCombiner::visitREM(SDNode *N) {
if (SDValue Folded = DAG.FoldConstantArithmetic(Opcode, DL, VT, N0C, N1C))
return Folded;
+ if (SDValue V = simplifyDivRem(N, DAG))
+ return V;
+
+ if (SDValue NewSel = foldBinOpIntoSelect(N))
+ return NewSel;
+
if (isSigned) {
// If we know the sign bits of both operands are zero, strength reduce to a
// urem instead. Handles (X & 0x0FFFFFFF) %s 16 -> X&15
if (DAG.SignBitIsZero(N1) && DAG.SignBitIsZero(N0))
return DAG.getNode(ISD::UREM, DL, VT, N0, N1);
} else {
- // fold (urem x, pow2) -> (and x, pow2-1)
+ SDValue NegOne = DAG.getAllOnesConstant(DL, VT);
if (DAG.isKnownToBeAPowerOfTwo(N1)) {
- APInt NegOne = APInt::getAllOnesValue(VT.getScalarSizeInBits());
- SDValue Add =
- DAG.getNode(ISD::ADD, DL, VT, N1, DAG.getConstant(NegOne, DL, VT));
+ // fold (urem x, pow2) -> (and x, pow2-1)
+ SDValue Add = DAG.getNode(ISD::ADD, DL, VT, N1, NegOne);
AddToWorklist(Add.getNode());
return DAG.getNode(ISD::AND, DL, VT, N0, Add);
}
- // fold (urem x, (shl pow2, y)) -> (and x, (add (shl pow2, y), -1))
if (N1.getOpcode() == ISD::SHL &&
DAG.isKnownToBeAPowerOfTwo(N1.getOperand(0))) {
- APInt NegOne = APInt::getAllOnesValue(VT.getScalarSizeInBits());
- SDValue Add =
- DAG.getNode(ISD::ADD, DL, VT, N1, DAG.getConstant(NegOne, DL, VT));
+ // fold (urem x, (shl pow2, y)) -> (and x, (add (shl pow2, y), -1))
+ SDValue Add = DAG.getNode(ISD::ADD, DL, VT, N1, NegOne);
AddToWorklist(Add.getNode());
return DAG.getNode(ISD::AND, DL, VT, N0, Add);
}
}
- AttributeSet Attr = DAG.getMachineFunction().getFunction()->getAttributes();
+ AttributeList Attr = DAG.getMachineFunction().getFunction()->getAttributes();
// If X/C can be simplified by the division-by-constant logic, lower
// X%C to the equivalent of X-X/C*C.
@@ -2536,13 +2788,6 @@ SDValue DAGCombiner::visitREM(SDNode *N) {
if (SDValue DivRem = useDivRem(N))
return DivRem.getValue(1);
- // undef % X -> 0
- if (N0.isUndef())
- return DAG.getConstant(0, DL, VT);
- // X % undef -> undef
- if (N1.isUndef())
- return N1;
-
return SDValue();
}
@@ -2932,95 +3177,139 @@ SDValue DAGCombiner::SimplifyBinOpWithSameOpcodeHands(SDNode *N) {
return SDValue();
}
+/// Try to make (and/or setcc (LL, LR), setcc (RL, RR)) more efficient.
+SDValue DAGCombiner::foldLogicOfSetCCs(bool IsAnd, SDValue N0, SDValue N1,
+ const SDLoc &DL) {
+ SDValue LL, LR, RL, RR, N0CC, N1CC;
+ if (!isSetCCEquivalent(N0, LL, LR, N0CC) ||
+ !isSetCCEquivalent(N1, RL, RR, N1CC))
+ return SDValue();
+
+ assert(N0.getValueType() == N1.getValueType() &&
+ "Unexpected operand types for bitwise logic op");
+ assert(LL.getValueType() == LR.getValueType() &&
+ RL.getValueType() == RR.getValueType() &&
+ "Unexpected operand types for setcc");
+
+ // If we're here post-legalization or the logic op type is not i1, the logic
+ // op type must match a setcc result type. Also, all folds require new
+ // operations on the left and right operands, so those types must match.
+ EVT VT = N0.getValueType();
+ EVT OpVT = LL.getValueType();
+ if (LegalOperations || VT != MVT::i1)
+ if (VT != getSetCCResultType(OpVT))
+ return SDValue();
+ if (OpVT != RL.getValueType())
+ return SDValue();
+
+ ISD::CondCode CC0 = cast<CondCodeSDNode>(N0CC)->get();
+ ISD::CondCode CC1 = cast<CondCodeSDNode>(N1CC)->get();
+ bool IsInteger = OpVT.isInteger();
+ if (LR == RR && CC0 == CC1 && IsInteger) {
+ bool IsZero = isNullConstantOrNullSplatConstant(LR);
+ bool IsNeg1 = isAllOnesConstantOrAllOnesSplatConstant(LR);
+
+ // All bits clear?
+ bool AndEqZero = IsAnd && CC1 == ISD::SETEQ && IsZero;
+ // All sign bits clear?
+ bool AndGtNeg1 = IsAnd && CC1 == ISD::SETGT && IsNeg1;
+ // Any bits set?
+ bool OrNeZero = !IsAnd && CC1 == ISD::SETNE && IsZero;
+ // Any sign bits set?
+ bool OrLtZero = !IsAnd && CC1 == ISD::SETLT && IsZero;
+
+ // (and (seteq X, 0), (seteq Y, 0)) --> (seteq (or X, Y), 0)
+ // (and (setgt X, -1), (setgt Y, -1)) --> (setgt (or X, Y), -1)
+ // (or (setne X, 0), (setne Y, 0)) --> (setne (or X, Y), 0)
+ // (or (setlt X, 0), (setlt Y, 0)) --> (setlt (or X, Y), 0)
+ if (AndEqZero || AndGtNeg1 || OrNeZero || OrLtZero) {
+ SDValue Or = DAG.getNode(ISD::OR, SDLoc(N0), OpVT, LL, RL);
+ AddToWorklist(Or.getNode());
+ return DAG.getSetCC(DL, VT, Or, LR, CC1);
+ }
+
+ // All bits set?
+ bool AndEqNeg1 = IsAnd && CC1 == ISD::SETEQ && IsNeg1;
+ // All sign bits set?
+ bool AndLtZero = IsAnd && CC1 == ISD::SETLT && IsZero;
+ // Any bits clear?
+ bool OrNeNeg1 = !IsAnd && CC1 == ISD::SETNE && IsNeg1;
+ // Any sign bits clear?
+ bool OrGtNeg1 = !IsAnd && CC1 == ISD::SETGT && IsNeg1;
+
+ // (and (seteq X, -1), (seteq Y, -1)) --> (seteq (and X, Y), -1)
+ // (and (setlt X, 0), (setlt Y, 0)) --> (setlt (and X, Y), 0)
+ // (or (setne X, -1), (setne Y, -1)) --> (setne (and X, Y), -1)
+ // (or (setgt X, -1), (setgt Y, -1)) --> (setgt (and X, Y), -1)
+ if (AndEqNeg1 || AndLtZero || OrNeNeg1 || OrGtNeg1) {
+ SDValue And = DAG.getNode(ISD::AND, SDLoc(N0), OpVT, LL, RL);
+ AddToWorklist(And.getNode());
+ return DAG.getSetCC(DL, VT, And, LR, CC1);
+ }
+ }
+
+ // TODO: What is the 'or' equivalent of this fold?
+ // (and (setne X, 0), (setne X, -1)) --> (setuge (add X, 1), 2)
+ if (IsAnd && LL == RL && CC0 == CC1 && IsInteger && CC0 == ISD::SETNE &&
+ ((isNullConstant(LR) && isAllOnesConstant(RR)) ||
+ (isAllOnesConstant(LR) && isNullConstant(RR)))) {
+ SDValue One = DAG.getConstant(1, DL, OpVT);
+ SDValue Two = DAG.getConstant(2, DL, OpVT);
+ SDValue Add = DAG.getNode(ISD::ADD, SDLoc(N0), OpVT, LL, One);
+ AddToWorklist(Add.getNode());
+ return DAG.getSetCC(DL, VT, Add, Two, ISD::SETUGE);
+ }
+
+ // Try more general transforms if the predicates match and the only user of
+ // the compares is the 'and' or 'or'.
+ if (IsInteger && TLI.convertSetCCLogicToBitwiseLogic(OpVT) && CC0 == CC1 &&
+ N0.hasOneUse() && N1.hasOneUse()) {
+ // and (seteq A, B), (seteq C, D) --> seteq (or (xor A, B), (xor C, D)), 0
+ // or (setne A, B), (setne C, D) --> setne (or (xor A, B), (xor C, D)), 0
+ if ((IsAnd && CC1 == ISD::SETEQ) || (!IsAnd && CC1 == ISD::SETNE)) {
+ SDValue XorL = DAG.getNode(ISD::XOR, SDLoc(N0), OpVT, LL, LR);
+ SDValue XorR = DAG.getNode(ISD::XOR, SDLoc(N1), OpVT, RL, RR);
+ SDValue Or = DAG.getNode(ISD::OR, DL, OpVT, XorL, XorR);
+ SDValue Zero = DAG.getConstant(0, DL, OpVT);
+ return DAG.getSetCC(DL, VT, Or, Zero, CC1);
+ }
+ }
+
+ // Canonicalize equivalent operands to LL == RL.
+ if (LL == RR && LR == RL) {
+ CC1 = ISD::getSetCCSwappedOperands(CC1);
+ std::swap(RL, RR);
+ }
+
+ // (and (setcc X, Y, CC0), (setcc X, Y, CC1)) --> (setcc X, Y, NewCC)
+ // (or (setcc X, Y, CC0), (setcc X, Y, CC1)) --> (setcc X, Y, NewCC)
+ if (LL == RL && LR == RR) {
+ ISD::CondCode NewCC = IsAnd ? ISD::getSetCCAndOperation(CC0, CC1, IsInteger)
+ : ISD::getSetCCOrOperation(CC0, CC1, IsInteger);
+ if (NewCC != ISD::SETCC_INVALID &&
+ (!LegalOperations ||
+ (TLI.isCondCodeLegal(NewCC, LL.getSimpleValueType()) &&
+ TLI.isOperationLegal(ISD::SETCC, OpVT))))
+ return DAG.getSetCC(DL, VT, LL, LR, NewCC);
+ }
+
+ return SDValue();
+}
+
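
Every row in the fold tables above is a boolean identity over integer compares; for instance (X==0 && Y==0) holds exactly when (X|Y)==0, and (X<0 || Y<0) exactly when (X|Y)<0. An exhaustive check over a small signed domain (assumes two's complement integers, as SelectionDAG does):

    #include <cassert>

    int main() {
      for (int x = -2; x <= 2; ++x)
        for (int y = -2; y <= 2; ++y) {
          assert(((x == 0) && (y == 0)) == ((x | y) == 0));    // AndEqZero
          assert(((x > -1) && (y > -1)) == ((x | y) > -1));    // AndGtNeg1
          assert(((x != 0) || (y != 0)) == ((x | y) != 0));    // OrNeZero
          assert(((x < 0) || (y < 0)) == ((x | y) < 0));       // OrLtZero
          assert(((x == -1) && (y == -1)) == ((x & y) == -1)); // AndEqNeg1
          assert(((x < 0) && (y < 0)) == ((x & y) < 0));       // AndLtZero
          assert(((x != -1) || (y != -1)) == ((x & y) != -1)); // OrNeNeg1
          assert(((x > -1) || (y > -1)) == ((x & y) > -1));    // OrGtNeg1
          // (and (setne X, 0), (setne X, -1)) --> (setuge (add X, 1), 2)
          assert(((x != 0) && (x != -1)) == ((unsigned)x + 1u >= 2u));
        }
    }
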
/// This contains all DAGCombine rules which reduce two values combined by
/// an And operation to a single value. This makes them reusable in the context
/// of visitSELECT(). Rules involving constants are not included as
/// visitSELECT() already handles those cases.
-SDValue DAGCombiner::visitANDLike(SDValue N0, SDValue N1,
- SDNode *LocReference) {
+SDValue DAGCombiner::visitANDLike(SDValue N0, SDValue N1, SDNode *N) {
EVT VT = N1.getValueType();
+ SDLoc DL(N);
// fold (and x, undef) -> 0
if (N0.isUndef() || N1.isUndef())
- return DAG.getConstant(0, SDLoc(LocReference), VT);
- // fold (and (setcc x), (setcc y)) -> (setcc (and x, y))
- SDValue LL, LR, RL, RR, CC0, CC1;
- if (isSetCCEquivalent(N0, LL, LR, CC0) && isSetCCEquivalent(N1, RL, RR, CC1)){
- ISD::CondCode Op0 = cast<CondCodeSDNode>(CC0)->get();
- ISD::CondCode Op1 = cast<CondCodeSDNode>(CC1)->get();
-
- if (LR == RR && isa<ConstantSDNode>(LR) && Op0 == Op1 &&
- LL.getValueType().isInteger()) {
- // fold (and (seteq X, 0), (seteq Y, 0)) -> (seteq (or X, Y), 0)
- if (isNullConstant(LR) && Op1 == ISD::SETEQ) {
- EVT CCVT = getSetCCResultType(LR.getValueType());
- if (VT == CCVT || (!LegalOperations && VT == MVT::i1)) {
- SDValue ORNode = DAG.getNode(ISD::OR, SDLoc(N0),
- LR.getValueType(), LL, RL);
- AddToWorklist(ORNode.getNode());
- return DAG.getSetCC(SDLoc(LocReference), VT, ORNode, LR, Op1);
- }
- }
- if (isAllOnesConstant(LR)) {
- // fold (and (seteq X, -1), (seteq Y, -1)) -> (seteq (and X, Y), -1)
- if (Op1 == ISD::SETEQ) {
- EVT CCVT = getSetCCResultType(LR.getValueType());
- if (VT == CCVT || (!LegalOperations && VT == MVT::i1)) {
- SDValue ANDNode = DAG.getNode(ISD::AND, SDLoc(N0),
- LR.getValueType(), LL, RL);
- AddToWorklist(ANDNode.getNode());
- return DAG.getSetCC(SDLoc(LocReference), VT, ANDNode, LR, Op1);
- }
- }
- // fold (and (setgt X, -1), (setgt Y, -1)) -> (setgt (or X, Y), -1)
- if (Op1 == ISD::SETGT) {
- EVT CCVT = getSetCCResultType(LR.getValueType());
- if (VT == CCVT || (!LegalOperations && VT == MVT::i1)) {
- SDValue ORNode = DAG.getNode(ISD::OR, SDLoc(N0),
- LR.getValueType(), LL, RL);
- AddToWorklist(ORNode.getNode());
- return DAG.getSetCC(SDLoc(LocReference), VT, ORNode, LR, Op1);
- }
- }
- }
- }
- // Simplify (and (setne X, 0), (setne X, -1)) -> (setuge (add X, 1), 2)
- if (LL == RL && isa<ConstantSDNode>(LR) && isa<ConstantSDNode>(RR) &&
- Op0 == Op1 && LL.getValueType().isInteger() &&
- Op0 == ISD::SETNE && ((isNullConstant(LR) && isAllOnesConstant(RR)) ||
- (isAllOnesConstant(LR) && isNullConstant(RR)))) {
- EVT CCVT = getSetCCResultType(LL.getValueType());
- if (VT == CCVT || (!LegalOperations && VT == MVT::i1)) {
- SDLoc DL(N0);
- SDValue ADDNode = DAG.getNode(ISD::ADD, DL, LL.getValueType(),
- LL, DAG.getConstant(1, DL,
- LL.getValueType()));
- AddToWorklist(ADDNode.getNode());
- return DAG.getSetCC(SDLoc(LocReference), VT, ADDNode,
- DAG.getConstant(2, DL, LL.getValueType()),
- ISD::SETUGE);
- }
- }
- // canonicalize equivalent to ll == rl
- if (LL == RR && LR == RL) {
- Op1 = ISD::getSetCCSwappedOperands(Op1);
- std::swap(RL, RR);
- }
- if (LL == RL && LR == RR) {
- bool isInteger = LL.getValueType().isInteger();
- ISD::CondCode Result = ISD::getSetCCAndOperation(Op0, Op1, isInteger);
- if (Result != ISD::SETCC_INVALID &&
- (!LegalOperations ||
- (TLI.isCondCodeLegal(Result, LL.getSimpleValueType()) &&
- TLI.isOperationLegal(ISD::SETCC, LL.getValueType())))) {
- EVT CCVT = getSetCCResultType(LL.getValueType());
- if (N0.getValueType() == CCVT ||
- (!LegalOperations && N0.getValueType() == MVT::i1))
- return DAG.getSetCC(SDLoc(LocReference), N0.getValueType(),
- LL, LR, Result);
- }
- }
- }
+ return DAG.getConstant(0, DL, VT);
+
+ if (SDValue V = foldLogicOfSetCCs(true, N0, N1, DL))
+ return V;
if (N0.getOpcode() == ISD::ADD && N1.getOpcode() == ISD::SRL &&
VT.getSizeInBits() <= 64) {
@@ -3037,13 +3326,13 @@ SDValue DAGCombiner::visitANDLike(SDValue N0, SDValue N1,
if (DAG.MaskedValueIsZero(N0.getOperand(1), Mask)) {
ADDC |= Mask;
if (TLI.isLegalAddImmediate(ADDC.getSExtValue())) {
- SDLoc DL(N0);
+ SDLoc DL0(N0);
SDValue NewAdd =
- DAG.getNode(ISD::ADD, DL, VT,
+ DAG.getNode(ISD::ADD, DL0, VT,
N0.getOperand(0), DAG.getConstant(ADDC, DL, VT));
CombineTo(N0.getNode(), NewAdd);
// Return N so it doesn't get rechecked!
- return SDValue(LocReference, 0);
+ return SDValue(N, 0);
}
}
}
@@ -3068,7 +3357,7 @@ SDValue DAGCombiner::visitANDLike(SDValue N0, SDValue N1,
unsigned MaskBits = AndMask.countTrailingOnes();
EVT HalfVT = EVT::getIntegerVT(*DAG.getContext(), Size / 2);
- if (APIntOps::isMask(AndMask) &&
+ if (AndMask.isMask() &&
// Required bits must not span the two halves of the integer and
// must fit in the half size type.
(ShiftBits + MaskBits <= Size / 2) &&
@@ -3108,7 +3397,7 @@ bool DAGCombiner::isAndLoadExtLoad(ConstantSDNode *AndC, LoadSDNode *LoadN,
bool &NarrowLoad) {
uint32_t ActiveBits = AndC->getAPIntValue().getActiveBits();
- if (ActiveBits == 0 || !APIntOps::isMask(ActiveBits, AndC->getAPIntValue()))
+ if (ActiveBits == 0 || !AndC->getAPIntValue().isMask(ActiveBits))
return false;
ExtVT = EVT::getIntegerVT(*DAG.getContext(), ActiveBits);
@@ -3191,6 +3480,10 @@ SDValue DAGCombiner::visitAND(SDNode *N) {
if (N1C && DAG.MaskedValueIsZero(SDValue(N, 0),
APInt::getAllOnesValue(BitWidth)))
return DAG.getConstant(0, SDLoc(N), VT);
+
+ if (SDValue NewSel = foldBinOpIntoSelect(N))
+ return NewSel;
+
// reassociate and
if (SDValue RAND = ReassociateOps(ISD::AND, SDLoc(N), N0, N1))
return RAND;
@@ -3299,6 +3592,10 @@ SDValue DAGCombiner::visitAND(SDNode *N) {
// If the load type was an EXTLOAD, convert to ZEXTLOAD in order to
// preserve semantics once we get rid of the AND.
SDValue NewLoad(Load, 0);
+
+ // Fold the AND away. NewLoad may get replaced immediately.
+ CombineTo(N, (N0.getNode() == Load) ? NewLoad : N0);
+
if (Load->getExtensionType() == ISD::EXTLOAD) {
NewLoad = DAG.getLoad(Load->getAddressingMode(), ISD::ZEXTLOAD,
Load->getValueType(0), SDLoc(Load),
@@ -3316,10 +3613,6 @@ SDValue DAGCombiner::visitAND(SDNode *N) {
}
}
- // Fold the AND away, taking care not to fold to the old load node if we
- // replaced it.
- CombineTo(N, (N0.getNode() == Load) ? NewLoad : N0);
-
return SDValue(N, 0); // Return N so it doesn't get rechecked!
}
}
@@ -3723,65 +4016,16 @@ SDValue DAGCombiner::MatchBSwapHWord(SDNode *N, SDValue N0, SDValue N1) {
/// This contains all DAGCombine rules which reduce two values combined by
/// an Or operation to a single value \see visitANDLike().
-SDValue DAGCombiner::visitORLike(SDValue N0, SDValue N1, SDNode *LocReference) {
+SDValue DAGCombiner::visitORLike(SDValue N0, SDValue N1, SDNode *N) {
EVT VT = N1.getValueType();
+ SDLoc DL(N);
+
// fold (or x, undef) -> -1
- if (!LegalOperations &&
- (N0.isUndef() || N1.isUndef())) {
- EVT EltVT = VT.isVector() ? VT.getVectorElementType() : VT;
- return DAG.getConstant(APInt::getAllOnesValue(EltVT.getSizeInBits()),
- SDLoc(LocReference), VT);
- }
- // fold (or (setcc x), (setcc y)) -> (setcc (or x, y))
- SDValue LL, LR, RL, RR, CC0, CC1;
- if (isSetCCEquivalent(N0, LL, LR, CC0) && isSetCCEquivalent(N1, RL, RR, CC1)){
- ISD::CondCode Op0 = cast<CondCodeSDNode>(CC0)->get();
- ISD::CondCode Op1 = cast<CondCodeSDNode>(CC1)->get();
-
- if (LR == RR && Op0 == Op1 && LL.getValueType().isInteger()) {
- // fold (or (setne X, 0), (setne Y, 0)) -> (setne (or X, Y), 0)
- // fold (or (setlt X, 0), (setlt Y, 0)) -> (setne (or X, Y), 0)
- if (isNullConstant(LR) && (Op1 == ISD::SETNE || Op1 == ISD::SETLT)) {
- EVT CCVT = getSetCCResultType(LR.getValueType());
- if (VT == CCVT || (!LegalOperations && VT == MVT::i1)) {
- SDValue ORNode = DAG.getNode(ISD::OR, SDLoc(LR),
- LR.getValueType(), LL, RL);
- AddToWorklist(ORNode.getNode());
- return DAG.getSetCC(SDLoc(LocReference), VT, ORNode, LR, Op1);
- }
- }
- // fold (or (setne X, -1), (setne Y, -1)) -> (setne (and X, Y), -1)
- // fold (or (setgt X, -1), (setgt Y -1)) -> (setgt (and X, Y), -1)
- if (isAllOnesConstant(LR) && (Op1 == ISD::SETNE || Op1 == ISD::SETGT)) {
- EVT CCVT = getSetCCResultType(LR.getValueType());
- if (VT == CCVT || (!LegalOperations && VT == MVT::i1)) {
- SDValue ANDNode = DAG.getNode(ISD::AND, SDLoc(LR),
- LR.getValueType(), LL, RL);
- AddToWorklist(ANDNode.getNode());
- return DAG.getSetCC(SDLoc(LocReference), VT, ANDNode, LR, Op1);
- }
- }
- }
- // canonicalize equivalent to ll == rl
- if (LL == RR && LR == RL) {
- Op1 = ISD::getSetCCSwappedOperands(Op1);
- std::swap(RL, RR);
- }
- if (LL == RL && LR == RR) {
- bool isInteger = LL.getValueType().isInteger();
- ISD::CondCode Result = ISD::getSetCCOrOperation(Op0, Op1, isInteger);
- if (Result != ISD::SETCC_INVALID &&
- (!LegalOperations ||
- (TLI.isCondCodeLegal(Result, LL.getSimpleValueType()) &&
- TLI.isOperationLegal(ISD::SETCC, LL.getValueType())))) {
- EVT CCVT = getSetCCResultType(LL.getValueType());
- if (N0.getValueType() == CCVT ||
- (!LegalOperations && N0.getValueType() == MVT::i1))
- return DAG.getSetCC(SDLoc(LocReference), N0.getValueType(),
- LL, LR, Result);
- }
- }
- }
+ if (!LegalOperations && (N0.isUndef() || N1.isUndef()))
+ return DAG.getAllOnesConstant(DL, VT);
+
+ if (SDValue V = foldLogicOfSetCCs(false, N0, N1, DL))
+ return V;
// (or (and X, C1), (and Y, C2)) -> (and (or X, Y), C3) if possible.
if (N0.getOpcode() == ISD::AND && N1.getOpcode() == ISD::AND &&
@@ -3802,7 +4046,6 @@ SDValue DAGCombiner::visitORLike(SDValue N0, SDValue N1, SDNode *LocReference) {
DAG.MaskedValueIsZero(N1.getOperand(0), LHSMask&~RHSMask)) {
SDValue X = DAG.getNode(ISD::OR, SDLoc(N0), VT,
N0.getOperand(0), N1.getOperand(0));
- SDLoc DL(LocReference);
return DAG.getNode(ISD::AND, DL, VT, X,
DAG.getConstant(LHSMask | RHSMask, DL, VT));
}
@@ -3818,7 +4061,7 @@ SDValue DAGCombiner::visitORLike(SDValue N0, SDValue N1, SDNode *LocReference) {
(N0.getNode()->hasOneUse() || N1.getNode()->hasOneUse())) {
SDValue X = DAG.getNode(ISD::OR, SDLoc(N0), VT,
N0.getOperand(1), N1.getOperand(1));
- return DAG.getNode(ISD::AND, SDLoc(LocReference), VT, N0.getOperand(0), X);
+ return DAG.getNode(ISD::AND, DL, VT, N0.getOperand(0), X);
}
return SDValue();
@@ -3847,14 +4090,10 @@ SDValue DAGCombiner::visitOR(SDNode *N) {
// fold (or x, -1) -> -1, vector edition
if (ISD::isBuildVectorAllOnes(N0.getNode()))
// do not return N0, because undef node may exist in N0
- return DAG.getConstant(
- APInt::getAllOnesValue(N0.getScalarValueSizeInBits()), SDLoc(N),
- N0.getValueType());
+ return DAG.getAllOnesConstant(SDLoc(N), N0.getValueType());
if (ISD::isBuildVectorAllOnes(N1.getNode()))
// do not return N1, because undef node may exist in N1
- return DAG.getConstant(
- APInt::getAllOnesValue(N1.getScalarValueSizeInBits()), SDLoc(N),
- N1.getValueType());
+ return DAG.getAllOnesConstant(SDLoc(N), N1.getValueType());
// fold (or (shuf A, V_0, MA), (shuf B, V_0, MB)) -> (shuf A, B, Mask)
// Do this only if the resulting shuffle is legal.
@@ -3867,7 +4106,7 @@ SDValue DAGCombiner::visitOR(SDNode *N) {
bool ZeroN10 = ISD::isBuildVectorAllZeros(N1.getOperand(0).getNode());
bool ZeroN11 = ISD::isBuildVectorAllZeros(N1.getOperand(1).getNode());
// Ensure both shuffles have a zero input.
- if ((ZeroN00 || ZeroN01) && (ZeroN10 || ZeroN11)) {
+ if ((ZeroN00 != ZeroN01) && (ZeroN10 != ZeroN11)) {
assert((!ZeroN00 || !ZeroN01) && "Both inputs zero!");
assert((!ZeroN10 || !ZeroN11) && "Both inputs zero!");
const ShuffleVectorSDNode *SV0 = cast<ShuffleVectorSDNode>(N0);
@@ -3939,6 +4178,10 @@ SDValue DAGCombiner::visitOR(SDNode *N) {
// fold (or x, -1) -> -1
if (isAllOnesConstant(N1))
return N1;
+
+ if (SDValue NewSel = foldBinOpIntoSelect(N))
+ return NewSel;
+
// fold (or x, c) -> c iff (x & ~c) == 0
if (N1C && DAG.MaskedValueIsZero(N0, ~N1C->getAPIntValue()))
return N1;
@@ -3956,7 +4199,7 @@ SDValue DAGCombiner::visitOR(SDNode *N) {
if (SDValue ROR = ReassociateOps(ISD::OR, SDLoc(N), N0, N1))
return ROR;
// Canonicalize (or (and X, c1), c2) -> (and (or X, c2), c1|c2)
- // iff (c1 & c2) == 0.
+ // iff (c1 & c2) != 0.
if (N1C && N0.getOpcode() == ISD::AND && N0.getNode()->hasOneUse() &&
isa<ConstantSDNode>(N0.getOperand(1))) {
ConstantSDNode *C1 = cast<ConstantSDNode>(N0.getOperand(1));
@@ -3978,6 +4221,9 @@ SDValue DAGCombiner::visitOR(SDNode *N) {
if (SDNode *Rot = MatchRotate(N0, N1, SDLoc(N)))
return SDValue(Rot, 0);
+ if (SDValue Load = MatchLoadCombine(N))
+ return Load;
+
// Simplify the operands using demanded-bits information.
if (!VT.isVector() &&
SimplifyDemandedBits(SDValue(N, 0)))
@@ -4190,8 +4436,7 @@ SDNode *DAGCombiner::MatchRotate(SDValue LHS, SDValue RHS, const SDLoc &DL) {
// If there is an AND of either shifted operand, apply it to the result.
if (LHSMask.getNode() || RHSMask.getNode()) {
- APInt AllBits = APInt::getAllOnesValue(EltSizeInBits);
- SDValue Mask = DAG.getConstant(AllBits, DL, VT);
+ SDValue Mask = DAG.getAllOnesConstant(DL, VT);
if (LHSMask.getNode()) {
APInt RHSBits = APInt::getLowBitsSet(EltSizeInBits, LShVal);
@@ -4349,6 +4594,299 @@ struct BaseIndexOffset {
};
} // namespace
+namespace {
+/// Represents the known origin of an individual byte in a load combine
+/// pattern. The value of the byte is either constant zero or comes from memory.
+struct ByteProvider {
+ // For constant zero providers Load is set to nullptr. For memory providers
+ // Load represents the node which loads the byte from memory.
+ // ByteOffset is the offset of the byte in the value produced by the load.
+ LoadSDNode *Load;
+ unsigned ByteOffset;
+
+ ByteProvider() : Load(nullptr), ByteOffset(0) {}
+
+ static ByteProvider getMemory(LoadSDNode *Load, unsigned ByteOffset) {
+ return ByteProvider(Load, ByteOffset);
+ }
+ static ByteProvider getConstantZero() { return ByteProvider(nullptr, 0); }
+
+ bool isConstantZero() const { return !Load; }
+ bool isMemory() const { return Load; }
+
+ bool operator==(const ByteProvider &Other) const {
+ return Other.Load == Load && Other.ByteOffset == ByteOffset;
+ }
+
+private:
+ ByteProvider(LoadSDNode *Load, unsigned ByteOffset)
+ : Load(Load), ByteOffset(ByteOffset) {}
+};
+
+/// Recursively traverses the expression calculating the origin of the requested
+/// byte of the given value. Returns None if the provider can't be calculated.
+///
+/// For all values except the root of the expression, verifies that the value
+/// has exactly one use; if it does not, returns None. This way, whenever the
+/// origin of the byte is returned, it is guaranteed that the values which
+/// contribute to the byte are not used outside of this expression.
+///
+/// Because the parts of the expression are not allowed to have more than one
+/// use this function iterates over trees, not DAGs. So it never visits the same
+/// node more than once.
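+///
+/// For example (illustrative, not taken from the patch itself): requesting
+/// byte 1 of (or (zext i8 L0 to i32), (shl (zext i8 L1 to i32), 8)) yields
+/// the provider {L1, 0}; the SHL maps byte 1 of the OR to byte 0 of its
+/// operand, and the ZEXT forwards the query to the load L1.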
+const Optional<ByteProvider> calculateByteProvider(SDValue Op, unsigned Index,
+ unsigned Depth,
+ bool Root = false) {
+  // A typical i64-by-i8 pattern requires recursion up to a depth of 8 calls.
+ if (Depth == 10)
+ return None;
+
+ if (!Root && !Op.hasOneUse())
+ return None;
+
+ assert(Op.getValueType().isScalarInteger() && "can't handle other types");
+ unsigned BitWidth = Op.getValueSizeInBits();
+ if (BitWidth % 8 != 0)
+ return None;
+ unsigned ByteWidth = BitWidth / 8;
+ assert(Index < ByteWidth && "invalid index requested");
+ (void) ByteWidth;
+
+ switch (Op.getOpcode()) {
+ case ISD::OR: {
+ auto LHS = calculateByteProvider(Op->getOperand(0), Index, Depth + 1);
+ if (!LHS)
+ return None;
+ auto RHS = calculateByteProvider(Op->getOperand(1), Index, Depth + 1);
+ if (!RHS)
+ return None;
+
+ if (LHS->isConstantZero())
+ return RHS;
+ if (RHS->isConstantZero())
+ return LHS;
+ return None;
+ }
+ case ISD::SHL: {
+ auto ShiftOp = dyn_cast<ConstantSDNode>(Op->getOperand(1));
+ if (!ShiftOp)
+ return None;
+
+ uint64_t BitShift = ShiftOp->getZExtValue();
+ if (BitShift % 8 != 0)
+ return None;
+ uint64_t ByteShift = BitShift / 8;
+
+ return Index < ByteShift
+ ? ByteProvider::getConstantZero()
+ : calculateByteProvider(Op->getOperand(0), Index - ByteShift,
+ Depth + 1);
+ }
+ case ISD::ANY_EXTEND:
+ case ISD::SIGN_EXTEND:
+ case ISD::ZERO_EXTEND: {
+ SDValue NarrowOp = Op->getOperand(0);
+ unsigned NarrowBitWidth = NarrowOp.getScalarValueSizeInBits();
+ if (NarrowBitWidth % 8 != 0)
+ return None;
+ uint64_t NarrowByteWidth = NarrowBitWidth / 8;
+
+ if (Index >= NarrowByteWidth)
+ return Op.getOpcode() == ISD::ZERO_EXTEND
+ ? Optional<ByteProvider>(ByteProvider::getConstantZero())
+ : None;
+ return calculateByteProvider(NarrowOp, Index, Depth + 1);
+ }
+ case ISD::BSWAP:
+ return calculateByteProvider(Op->getOperand(0), ByteWidth - Index - 1,
+ Depth + 1);
+ case ISD::LOAD: {
+ auto L = cast<LoadSDNode>(Op.getNode());
+ if (L->isVolatile() || L->isIndexed())
+ return None;
+
+ unsigned NarrowBitWidth = L->getMemoryVT().getSizeInBits();
+ if (NarrowBitWidth % 8 != 0)
+ return None;
+ uint64_t NarrowByteWidth = NarrowBitWidth / 8;
+
+ if (Index >= NarrowByteWidth)
+ return L->getExtensionType() == ISD::ZEXTLOAD
+ ? Optional<ByteProvider>(ByteProvider::getConstantZero())
+ : None;
+ return ByteProvider::getMemory(L, Index);
+ }
+ }
+
+ return None;
+}
+} // namespace
+
+/// Match a pattern where a wide-type scalar value is loaded by several narrow
+/// loads and combined by shifts and ors. Fold it into a single load, or into a
+/// load plus a BSWAP if the target supports it.
+///
+/// Assuming little endian target:
+/// i8 *a = ...
+/// i32 val = a[0] | (a[1] << 8) | (a[2] << 16) | (a[3] << 24)
+/// =>
+/// i32 val = *((i32)a)
+///
+/// i8 *a = ...
+/// i32 val = (a[0] << 24) | (a[1] << 16) | (a[2] << 8) | a[3]
+/// =>
+/// i32 val = BSWAP(*((i32)a))
+///
+/// TODO: This rule matches complex patterns with OR node roots and doesn't
+/// interact well with the worklist mechanism. When a part of the pattern is
+/// updated (e.g. one of the loads), its direct users are put into the worklist,
+/// but the root node of the pattern which triggers the load combine is not
+/// necessarily a direct user of the changed node. For example, once the address
+/// of the t28 load is reassociated, the load combine won't be triggered:
+/// t25: i32 = add t4, Constant:i32<2>
+/// t26: i64 = sign_extend t25
+/// t27: i64 = add t2, t26
+/// t28: i8,ch = load<LD1[%tmp9]> t0, t27, undef:i64
+/// t29: i32 = zero_extend t28
+/// t32: i32 = shl t29, Constant:i8<8>
+/// t33: i32 = or t23, t32
+/// As a possible fix visitLoad can check if the load can be a part of a load
+/// combine pattern and add corresponding OR roots to the worklist.
+SDValue DAGCombiner::MatchLoadCombine(SDNode *N) {
+ assert(N->getOpcode() == ISD::OR &&
+ "Can only match load combining against OR nodes");
+
+ // Handles simple types only
+ EVT VT = N->getValueType(0);
+ if (VT != MVT::i16 && VT != MVT::i32 && VT != MVT::i64)
+ return SDValue();
+ unsigned ByteWidth = VT.getSizeInBits() / 8;
+
+ const TargetLowering &TLI = DAG.getTargetLoweringInfo();
+  // Before legalization we can introduce overly wide illegal loads, which will
+  // later be split into legal-sized loads. This lets us combine i64-by-i8 load
+  // patterns into a pair of i32 loads on 32-bit targets.
+ if (LegalOperations && !TLI.isOperationLegal(ISD::LOAD, VT))
+ return SDValue();
+
+  std::function<unsigned(unsigned, unsigned)> LittleEndianByteAt =
+      [](unsigned BW, unsigned i) { return i; };
+  std::function<unsigned(unsigned, unsigned)> BigEndianByteAt =
+      [](unsigned BW, unsigned i) { return BW - i - 1; };
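+  // For instance, in a 4-byte value, byte 0 lives at memory offset 0 on a
+  // little-endian target but at memory offset 3 on a big-endian one.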
+
+ bool IsBigEndianTarget = DAG.getDataLayout().isBigEndian();
+ auto MemoryByteOffset = [&] (ByteProvider P) {
+ assert(P.isMemory() && "Must be a memory byte provider");
+ unsigned LoadBitWidth = P.Load->getMemoryVT().getSizeInBits();
+ assert(LoadBitWidth % 8 == 0 &&
+ "can only analyze providers for individual bytes not bit");
+ unsigned LoadByteWidth = LoadBitWidth / 8;
+ return IsBigEndianTarget
+ ? BigEndianByteAt(LoadByteWidth, P.ByteOffset)
+ : LittleEndianByteAt(LoadByteWidth, P.ByteOffset);
+ };
+
+ Optional<BaseIndexOffset> Base;
+ SDValue Chain;
+
+ SmallSet<LoadSDNode *, 8> Loads;
+ Optional<ByteProvider> FirstByteProvider;
+ int64_t FirstOffset = INT64_MAX;
+
+  // Check if all the bytes of the OR we are looking at are loaded from the same
+  // base address. Collect byte offsets from the Base address in ByteOffsets.
+ SmallVector<int64_t, 4> ByteOffsets(ByteWidth);
+ for (unsigned i = 0; i < ByteWidth; i++) {
+ auto P = calculateByteProvider(SDValue(N, 0), i, 0, /*Root=*/true);
+ if (!P || !P->isMemory()) // All the bytes must be loaded from memory
+ return SDValue();
+
+ LoadSDNode *L = P->Load;
+ assert(L->hasNUsesOfValue(1, 0) && !L->isVolatile() && !L->isIndexed() &&
+ "Must be enforced by calculateByteProvider");
+ assert(L->getOffset().isUndef() && "Unindexed load must have undef offset");
+
+ // All loads must share the same chain
+ SDValue LChain = L->getChain();
+ if (!Chain)
+ Chain = LChain;
+ else if (Chain != LChain)
+ return SDValue();
+
+ // Loads must share the same base address
+ BaseIndexOffset Ptr = BaseIndexOffset::match(L->getBasePtr(), DAG);
+ if (!Base)
+ Base = Ptr;
+ else if (!Base->equalBaseIndex(Ptr))
+ return SDValue();
+
+ // Calculate the offset of the current byte from the base address
+ int64_t ByteOffsetFromBase = Ptr.Offset + MemoryByteOffset(*P);
+ ByteOffsets[i] = ByteOffsetFromBase;
+
+ // Remember the first byte load
+ if (ByteOffsetFromBase < FirstOffset) {
+ FirstByteProvider = P;
+ FirstOffset = ByteOffsetFromBase;
+ }
+
+ Loads.insert(L);
+ }
+ assert(Loads.size() > 0 && "All the bytes of the value must be loaded from "
+ "memory, so there must be at least one load which produces the value");
+ assert(Base && "Base address of the accessed memory location must be set");
+ assert(FirstOffset != INT64_MAX && "First byte offset must be set");
+
+  // Check if the bytes of the OR we are looking at match either a big or a
+  // little endian value load.
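+  // E.g. byte offsets {0,1,2,3} relative to FirstOffset match a little-endian
+  // load, while {3,2,1,0} match a big-endian one.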
+ bool BigEndian = true, LittleEndian = true;
+ for (unsigned i = 0; i < ByteWidth; i++) {
+ int64_t CurrentByteOffset = ByteOffsets[i] - FirstOffset;
+ LittleEndian &= CurrentByteOffset == LittleEndianByteAt(ByteWidth, i);
+ BigEndian &= CurrentByteOffset == BigEndianByteAt(ByteWidth, i);
+ if (!BigEndian && !LittleEndian)
+ return SDValue();
+ }
+  assert((BigEndian != LittleEndian) && "should be either big or little endian");
+ assert(FirstByteProvider && "must be set");
+
+  // Ensure that the first byte is loaded from offset zero within the first
+  // load, so the combined value can be loaded from the first load's address.
+ if (MemoryByteOffset(*FirstByteProvider) != 0)
+ return SDValue();
+ LoadSDNode *FirstLoad = FirstByteProvider->Load;
+
+  // The node we are looking at matches the pattern; check whether we can
+  // replace it with a single load, plus a bswap if needed.
+
+  // If the load needs a byte swap, check that the target supports it.
+ bool NeedsBswap = IsBigEndianTarget != BigEndian;
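+  // A swap is needed exactly when the matched byte order disagrees with the
+  // target's endianness, e.g. a big-endian pattern on a little-endian target.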
+
+  // Before legalization we can introduce illegal bswaps, which will later be
+  // converted into an explicit bswap sequence. This way we end up with a single
+  // load and byte shuffling instead of several loads each followed by shuffling.
+ if (NeedsBswap && LegalOperations && !TLI.isOperationLegal(ISD::BSWAP, VT))
+ return SDValue();
+
+ // Check that a load of the wide type is both allowed and fast on the target
+ bool Fast = false;
+ bool Allowed = TLI.allowsMemoryAccess(*DAG.getContext(), DAG.getDataLayout(),
+ VT, FirstLoad->getAddressSpace(),
+ FirstLoad->getAlignment(), &Fast);
+ if (!Allowed || !Fast)
+ return SDValue();
+
+ SDValue NewLoad =
+ DAG.getLoad(VT, SDLoc(N), Chain, FirstLoad->getBasePtr(),
+ FirstLoad->getPointerInfo(), FirstLoad->getAlignment());
+
+ // Transfer chain users from old loads to the new load.
+ for (LoadSDNode *L : Loads)
+ DAG.ReplaceAllUsesOfValueWith(SDValue(L, 1), SDValue(NewLoad.getNode(), 1));
+
+ return NeedsBswap ? DAG.getNode(ISD::BSWAP, SDLoc(N), VT, NewLoad) : NewLoad;
+}
+
SDValue DAGCombiner::visitXOR(SDNode *N) {
SDValue N0 = N->getOperand(0);
SDValue N1 = N->getOperand(1);
@@ -4386,6 +4924,10 @@ SDValue DAGCombiner::visitXOR(SDNode *N) {
// fold (xor x, 0) -> x
if (isNullConstant(N1))
return N0;
+
+ if (SDValue NewSel = foldBinOpIntoSelect(N))
+ return NewSel;
+
// reassociate xor
if (SDValue RXOR = ReassociateOps(ISD::XOR, SDLoc(N), N0, N1))
return RXOR;
@@ -4403,9 +4945,9 @@ SDValue DAGCombiner::visitXOR(SDNode *N) {
default:
llvm_unreachable("Unhandled SetCC Equivalent!");
case ISD::SETCC:
- return DAG.getSetCC(SDLoc(N), VT, LHS, RHS, NotCC);
+ return DAG.getSetCC(SDLoc(N0), VT, LHS, RHS, NotCC);
case ISD::SELECT_CC:
- return DAG.getSelectCC(SDLoc(N), LHS, RHS, N0.getOperand(2),
+ return DAG.getSelectCC(SDLoc(N0), LHS, RHS, N0.getOperand(2),
N0.getOperand(3), NotCC);
}
}
@@ -4470,6 +5012,17 @@ SDValue DAGCombiner::visitXOR(SDNode *N) {
N01C->getAPIntValue(), DL, VT));
}
}
+
+ // fold Y = sra (X, size(X)-1); xor (add (X, Y), Y) -> (abs X)
+ unsigned OpSizeInBits = VT.getScalarSizeInBits();
+ if (N0.getOpcode() == ISD::ADD && N0.getOperand(1) == N1 &&
+ N1.getOpcode() == ISD::SRA && N1.getOperand(0) == N0.getOperand(0) &&
+ TLI.isOperationLegalOrCustom(ISD::ABS, VT)) {
+ if (ConstantSDNode *C = isConstOrConstSplat(N1.getOperand(1)))
+ if (C->getAPIntValue() == (OpSizeInBits - 1))
+ return DAG.getNode(ISD::ABS, SDLoc(N), VT, N0.getOperand(0));
+ }
+
// fold (xor x, x) -> 0
if (N0 == N1)
return tryFoldToZero(SDLoc(N), TLI, VT, DAG, LegalOperations, LegalTypes);
@@ -4673,6 +5226,10 @@ SDValue DAGCombiner::visitSHL(SDNode *N) {
// fold (shl undef, x) -> 0
if (N0.isUndef())
return DAG.getConstant(0, SDLoc(N), VT);
+
+ if (SDValue NewSel = foldBinOpIntoSelect(N))
+ return NewSel;
+
// if (shl x, c) is known to be zero, return 0
if (DAG.MaskedValueIsZero(SDValue(N, 0),
APInt::getAllOnesValue(OpSizeInBits)))
@@ -4808,9 +5365,8 @@ SDValue DAGCombiner::visitSHL(SDNode *N) {
// fold (shl (sra x, c1), c1) -> (and x, (shl -1, c1))
if (N0.getOpcode() == ISD::SRA && N1 == N0.getOperand(1) &&
isConstantOrConstantVector(N1, /* No Opaques */ true)) {
- unsigned BitSize = VT.getScalarSizeInBits();
SDLoc DL(N);
- SDValue AllBits = DAG.getConstant(APInt::getAllOnesValue(BitSize), DL, VT);
+ SDValue AllBits = DAG.getAllOnesConstant(DL, VT);
SDValue HiBitsMask = DAG.getNode(ISD::SHL, DL, VT, AllBits, N1);
return DAG.getNode(ISD::AND, DL, VT, N0.getOperand(0), HiBitsMask);
}
@@ -4877,6 +5433,10 @@ SDValue DAGCombiner::visitSRA(SDNode *N) {
// fold (sra x, 0) -> x
if (N1C && N1C->isNullValue())
return N0;
+
+ if (SDValue NewSel = foldBinOpIntoSelect(N))
+ return NewSel;
+
// fold (sra (shl x, c1), c1) -> sext_inreg for some c1 and target supports
// sext_inreg.
if (N1C && N0.getOpcode() == ISD::SHL && N1 == N0.getOperand(1)) {
@@ -5024,6 +5584,10 @@ SDValue DAGCombiner::visitSRL(SDNode *N) {
// fold (srl x, 0) -> x
if (N1C && N1C->isNullValue())
return N0;
+
+ if (SDValue NewSel = foldBinOpIntoSelect(N))
+ return NewSel;
+
// if (srl x, c) is known to be zero, return 0
if (N1C && DAG.MaskedValueIsZero(SDValue(N, 0),
APInt::getAllOnesValue(OpSizeInBits)))
@@ -5074,9 +5638,8 @@ SDValue DAGCombiner::visitSRL(SDNode *N) {
if (N0.getOpcode() == ISD::SHL && N0.getOperand(1) == N1 &&
isConstantOrConstantVector(N1, /* NoOpaques */ true)) {
SDLoc DL(N);
- APInt AllBits = APInt::getAllOnesValue(N0.getScalarValueSizeInBits());
SDValue Mask =
- DAG.getNode(ISD::SRL, DL, VT, DAG.getConstant(AllBits, DL, VT), N1);
+ DAG.getNode(ISD::SRL, DL, VT, DAG.getAllOnesConstant(DL, VT), N1);
AddToWorklist(Mask.getNode());
return DAG.getNode(ISD::AND, DL, VT, N0.getOperand(0), Mask);
}
@@ -5202,6 +5765,22 @@ SDValue DAGCombiner::visitSRL(SDNode *N) {
return SDValue();
}
+SDValue DAGCombiner::visitABS(SDNode *N) {
+ SDValue N0 = N->getOperand(0);
+ EVT VT = N->getValueType(0);
+
+ // fold (abs c1) -> c2
+ if (DAG.isConstantIntBuildVectorOrConstantInt(N0))
+ return DAG.getNode(ISD::ABS, SDLoc(N), VT, N0);
+ // fold (abs (abs x)) -> (abs x)
+ if (N0.getOpcode() == ISD::ABS)
+ return N0;
+  // fold (abs x) -> x iff x is known non-negative
+ if (DAG.SignBitIsZero(N0))
+ return N0;
+ return SDValue();
+}
+
SDValue DAGCombiner::visitBSWAP(SDNode *N) {
SDValue N0 = N->getOperand(0);
EVT VT = N->getValueType(0);
@@ -5217,7 +5796,11 @@ SDValue DAGCombiner::visitBSWAP(SDNode *N) {
SDValue DAGCombiner::visitBITREVERSE(SDNode *N) {
SDValue N0 = N->getOperand(0);
+ EVT VT = N->getValueType(0);
+ // fold (bitreverse c1) -> c2
+ if (DAG.isConstantIntBuildVectorOrConstantInt(N0))
+ return DAG.getNode(ISD::BITREVERSE, SDLoc(N), VT, N0);
// fold (bitreverse (bitreverse x)) -> x
if (N0.getOpcode() == ISD::BITREVERSE)
return N0.getOperand(0);
@@ -5311,7 +5894,6 @@ static SDValue combineMinNumMaxNum(const SDLoc &DL, EVT VT, SDValue LHS,
}
}
-// TODO: We should handle other cases of selecting between {-1,0,1} here.
SDValue DAGCombiner::foldSelectOfConstants(SDNode *N) {
SDValue Cond = N->getOperand(0);
SDValue N1 = N->getOperand(1);
@@ -5320,6 +5902,67 @@ SDValue DAGCombiner::foldSelectOfConstants(SDNode *N) {
EVT CondVT = Cond.getValueType();
SDLoc DL(N);
+ if (!VT.isInteger())
+ return SDValue();
+
+ auto *C1 = dyn_cast<ConstantSDNode>(N1);
+ auto *C2 = dyn_cast<ConstantSDNode>(N2);
+ if (!C1 || !C2)
+ return SDValue();
+
+ // Only do this before legalization to avoid conflicting with target-specific
+ // transforms in the other direction (create a select from a zext/sext). There
+ // is also a target-independent combine here in DAGCombiner in the other
+ // direction for (select Cond, -1, 0) when the condition is not i1.
+ if (CondVT == MVT::i1 && !LegalOperations) {
+ if (C1->isNullValue() && C2->isOne()) {
+ // select Cond, 0, 1 --> zext (!Cond)
+ SDValue NotCond = DAG.getNOT(DL, Cond, MVT::i1);
+ if (VT != MVT::i1)
+ NotCond = DAG.getNode(ISD::ZERO_EXTEND, DL, VT, NotCond);
+ return NotCond;
+ }
+ if (C1->isNullValue() && C2->isAllOnesValue()) {
+ // select Cond, 0, -1 --> sext (!Cond)
+ SDValue NotCond = DAG.getNOT(DL, Cond, MVT::i1);
+ if (VT != MVT::i1)
+ NotCond = DAG.getNode(ISD::SIGN_EXTEND, DL, VT, NotCond);
+ return NotCond;
+ }
+ if (C1->isOne() && C2->isNullValue()) {
+ // select Cond, 1, 0 --> zext (Cond)
+ if (VT != MVT::i1)
+ Cond = DAG.getNode(ISD::ZERO_EXTEND, DL, VT, Cond);
+ return Cond;
+ }
+ if (C1->isAllOnesValue() && C2->isNullValue()) {
+ // select Cond, -1, 0 --> sext (Cond)
+ if (VT != MVT::i1)
+ Cond = DAG.getNode(ISD::SIGN_EXTEND, DL, VT, Cond);
+ return Cond;
+ }
+
+ // For any constants that differ by 1, we can transform the select into an
+ // extend and add. Use a target hook because some targets may prefer to
+ // transform in the other direction.
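+    // E.g. (select Cond, 4, 3) becomes (add (zext Cond), 3), which is 4 when
+    // Cond is true and 3 otherwise.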
+ if (TLI.convertSelectOfConstantsToMath()) {
+ if (C1->getAPIntValue() - 1 == C2->getAPIntValue()) {
+ // select Cond, C1, C1-1 --> add (zext Cond), C1-1
+ if (VT != MVT::i1)
+ Cond = DAG.getNode(ISD::ZERO_EXTEND, DL, VT, Cond);
+ return DAG.getNode(ISD::ADD, DL, VT, Cond, N2);
+ }
+ if (C1->getAPIntValue() + 1 == C2->getAPIntValue()) {
+ // select Cond, C1, C1+1 --> add (sext Cond), C1+1
+ if (VT != MVT::i1)
+ Cond = DAG.getNode(ISD::SIGN_EXTEND, DL, VT, Cond);
+ return DAG.getNode(ISD::ADD, DL, VT, Cond, N2);
+ }
+ }
+
+ return SDValue();
+ }
+
// fold (select Cond, 0, 1) -> (xor Cond, 1)
// We can't do this reliably if integer based booleans have different contents
// to floating point based booleans. This is because we can't tell whether we
@@ -5329,15 +5972,14 @@ SDValue DAGCombiner::foldSelectOfConstants(SDNode *N) {
// undiscoverable (or not reasonably discoverable). For example, it could be
// in another basic block or it could require searching a complicated
// expression.
- if (VT.isInteger() &&
- (CondVT == MVT::i1 || (CondVT.isInteger() &&
- TLI.getBooleanContents(false, true) ==
- TargetLowering::ZeroOrOneBooleanContent &&
- TLI.getBooleanContents(false, false) ==
- TargetLowering::ZeroOrOneBooleanContent)) &&
- isNullConstant(N1) && isOneConstant(N2)) {
- SDValue NotCond = DAG.getNode(ISD::XOR, DL, CondVT, Cond,
- DAG.getConstant(1, DL, CondVT));
+ if (CondVT.isInteger() &&
+ TLI.getBooleanContents(false, true) ==
+ TargetLowering::ZeroOrOneBooleanContent &&
+ TLI.getBooleanContents(false, false) ==
+ TargetLowering::ZeroOrOneBooleanContent &&
+ C1->isNullValue() && C2->isOne()) {
+ SDValue NotCond =
+ DAG.getNode(ISD::XOR, DL, CondVT, Cond, DAG.getConstant(1, DL, CondVT));
if (VT.bitsEq(CondVT))
return NotCond;
return DAG.getZExtOrTrunc(NotCond, DL, VT);
@@ -5847,7 +6489,7 @@ SDValue DAGCombiner::visitMLOAD(SDNode *N) {
ISD::NON_EXTLOAD, MLD->isExpandingLoad());
Ptr = TLI.IncrementMemoryAddress(Ptr, MaskLo, DL, LoMemVT, DAG,
- MLD->isExpandingLoad());
+ MLD->isExpandingLoad());
MMO = DAG.getMachineFunction().
getMachineMemOperand(MLD->getPointerInfo(),
@@ -5921,34 +6563,6 @@ SDValue DAGCombiner::visitVSELECT(SDNode *N) {
if (SimplifySelectOps(N, N1, N2))
return SDValue(N, 0); // Don't revisit N.
- // If the VSELECT result requires splitting and the mask is provided by a
- // SETCC, then split both nodes and its operands before legalization. This
- // prevents the type legalizer from unrolling SETCC into scalar comparisons
- // and enables future optimizations (e.g. min/max pattern matching on X86).
- if (N0.getOpcode() == ISD::SETCC) {
- EVT VT = N->getValueType(0);
-
- // Check if any splitting is required.
- if (TLI.getTypeAction(*DAG.getContext(), VT) !=
- TargetLowering::TypeSplitVector)
- return SDValue();
-
- SDValue Lo, Hi, CCLo, CCHi, LL, LH, RL, RH;
- std::tie(CCLo, CCHi) = SplitVSETCC(N0.getNode(), DAG);
- std::tie(LL, LH) = DAG.SplitVectorOperand(N, 1);
- std::tie(RL, RH) = DAG.SplitVectorOperand(N, 2);
-
- Lo = DAG.getNode(N->getOpcode(), DL, LL.getValueType(), CCLo, LL, RL);
- Hi = DAG.getNode(N->getOpcode(), DL, LH.getValueType(), CCHi, LH, RH);
-
- // Add the new VSELECT nodes to the work list in case they need to be split
- // again.
- AddToWorklist(Lo.getNode());
- AddToWorklist(Hi.getNode());
-
- return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, Lo, Hi);
- }
-
// Fold (vselect (build_vector all_ones), N1, N2) -> N1
if (ISD::isBuildVectorAllOnes(N0.getNode()))
return N1;
@@ -6258,6 +6872,9 @@ SDValue DAGCombiner::CombineExtLoad(SDNode *N) {
SDValue NewChain = DAG.getNode(ISD::TokenFactor, DL, MVT::Other, Chains);
SDValue NewValue = DAG.getNode(ISD::CONCAT_VECTORS, DL, DstVT, Loads);
+  // Simplify the new TokenFactor.
+ AddToWorklist(NewChain.getNode());
+
CombineTo(N, NewValue);
// Replace uses of the original load (before extension)
@@ -6273,6 +6890,7 @@ SDValue DAGCombiner::CombineExtLoad(SDNode *N) {
SDValue DAGCombiner::visitSIGN_EXTEND(SDNode *N) {
SDValue N0 = N->getOperand(0);
EVT VT = N->getValueType(0);
+ SDLoc DL(N);
if (SDNode *Res = tryToFoldExtendOfConstant(N, TLI, DAG, LegalTypes,
LegalOperations))
@@ -6281,8 +6899,7 @@ SDValue DAGCombiner::visitSIGN_EXTEND(SDNode *N) {
// fold (sext (sext x)) -> (sext x)
// fold (sext (aext x)) -> (sext x)
if (N0.getOpcode() == ISD::SIGN_EXTEND || N0.getOpcode() == ISD::ANY_EXTEND)
- return DAG.getNode(ISD::SIGN_EXTEND, SDLoc(N), VT,
- N0.getOperand(0));
+ return DAG.getNode(ISD::SIGN_EXTEND, DL, VT, N0.getOperand(0));
if (N0.getOpcode() == ISD::TRUNCATE) {
// fold (sext (truncate (load x))) -> (sext (smaller load x))
@@ -6314,12 +6931,12 @@ SDValue DAGCombiner::visitSIGN_EXTEND(SDNode *N) {
// Op is i32, Mid is i8, and Dest is i64. If Op has more than 24 sign
// bits, just sext from i32.
if (NumSignBits > OpBits-MidBits)
- return DAG.getNode(ISD::SIGN_EXTEND, SDLoc(N), VT, Op);
+ return DAG.getNode(ISD::SIGN_EXTEND, DL, VT, Op);
} else {
// Op is i64, Mid is i8, and Dest is i32. If Op has more than 56 sign
// bits, just truncate to i32.
if (NumSignBits > OpBits-MidBits)
- return DAG.getNode(ISD::TRUNCATE, SDLoc(N), VT, Op);
+ return DAG.getNode(ISD::TRUNCATE, DL, VT, Op);
}
// fold (sext (truncate x)) -> (sextinreg x).
@@ -6329,7 +6946,7 @@ SDValue DAGCombiner::visitSIGN_EXTEND(SDNode *N) {
Op = DAG.getNode(ISD::ANY_EXTEND, SDLoc(N0), VT, Op);
else if (OpBits > DestBits)
Op = DAG.getNode(ISD::TRUNCATE, SDLoc(N0), VT, Op);
- return DAG.getNode(ISD::SIGN_EXTEND_INREG, SDLoc(N), VT, Op,
+ return DAG.getNode(ISD::SIGN_EXTEND_INREG, DL, VT, Op,
DAG.getValueType(N0.getValueType()));
}
}
@@ -6349,16 +6966,14 @@ SDValue DAGCombiner::visitSIGN_EXTEND(SDNode *N) {
DoXform &= TLI.isVectorLoadExtDesirable(SDValue(N, 0));
if (DoXform) {
LoadSDNode *LN0 = cast<LoadSDNode>(N0);
- SDValue ExtLoad = DAG.getExtLoad(ISD::SEXTLOAD, SDLoc(N), VT,
- LN0->getChain(),
+ SDValue ExtLoad = DAG.getExtLoad(ISD::SEXTLOAD, DL, VT, LN0->getChain(),
LN0->getBasePtr(), N0.getValueType(),
LN0->getMemOperand());
CombineTo(N, ExtLoad);
SDValue Trunc = DAG.getNode(ISD::TRUNCATE, SDLoc(N0),
N0.getValueType(), ExtLoad);
CombineTo(N0.getNode(), Trunc, ExtLoad.getValue(1));
- ExtendSetCCUses(SetCCs, Trunc, ExtLoad, SDLoc(N),
- ISD::SIGN_EXTEND);
+ ExtendSetCCUses(SetCCs, Trunc, ExtLoad, DL, ISD::SIGN_EXTEND);
return SDValue(N, 0); // Return N so it doesn't get rechecked!
}
}
@@ -6376,8 +6991,7 @@ SDValue DAGCombiner::visitSIGN_EXTEND(SDNode *N) {
EVT MemVT = LN0->getMemoryVT();
if ((!LegalOperations && !LN0->isVolatile()) ||
TLI.isLoadExtLegal(ISD::SEXTLOAD, VT, MemVT)) {
- SDValue ExtLoad = DAG.getExtLoad(ISD::SEXTLOAD, SDLoc(N), VT,
- LN0->getChain(),
+ SDValue ExtLoad = DAG.getExtLoad(ISD::SEXTLOAD, DL, VT, LN0->getChain(),
LN0->getBasePtr(), MemVT,
LN0->getMemOperand());
CombineTo(N, ExtLoad);
@@ -6411,7 +7025,6 @@ SDValue DAGCombiner::visitSIGN_EXTEND(SDNode *N) {
LN0->getMemOperand());
APInt Mask = cast<ConstantSDNode>(N0.getOperand(1))->getAPIntValue();
Mask = Mask.sext(VT.getSizeInBits());
- SDLoc DL(N);
SDValue And = DAG.getNode(N0.getOpcode(), DL, VT,
ExtLoad, DAG.getConstant(Mask, DL, VT));
SDValue Trunc = DAG.getNode(ISD::TRUNCATE,
@@ -6419,24 +7032,27 @@ SDValue DAGCombiner::visitSIGN_EXTEND(SDNode *N) {
N0.getOperand(0).getValueType(), ExtLoad);
CombineTo(N, And);
CombineTo(N0.getOperand(0).getNode(), Trunc, ExtLoad.getValue(1));
- ExtendSetCCUses(SetCCs, Trunc, ExtLoad, DL,
- ISD::SIGN_EXTEND);
+ ExtendSetCCUses(SetCCs, Trunc, ExtLoad, DL, ISD::SIGN_EXTEND);
return SDValue(N, 0); // Return N so it doesn't get rechecked!
}
}
}
if (N0.getOpcode() == ISD::SETCC) {
- EVT N0VT = N0.getOperand(0).getValueType();
+ SDValue N00 = N0.getOperand(0);
+ SDValue N01 = N0.getOperand(1);
+ ISD::CondCode CC = cast<CondCodeSDNode>(N0.getOperand(2))->get();
+ EVT N00VT = N0.getOperand(0).getValueType();
+
// sext(setcc) -> sext_in_reg(vsetcc) for vectors.
// Only do this before legalize for now.
if (VT.isVector() && !LegalOperations &&
- TLI.getBooleanContents(N0VT) ==
+ TLI.getBooleanContents(N00VT) ==
TargetLowering::ZeroOrNegativeOneBooleanContent) {
// On some architectures (such as SSE/NEON/etc) the SETCC result type is
// of the same size as the compared operands. Only optimize sext(setcc())
// if this is the case.
- EVT SVT = getSetCCResultType(N0VT);
+ EVT SVT = getSetCCResultType(N00VT);
// We know that the # elements of the results is the same as the
// # elements of the compare (and the # elements of the compare result
@@ -6444,19 +7060,15 @@ SDValue DAGCombiner::visitSIGN_EXTEND(SDNode *N) {
// we know that the element size of the sext'd result matches the
// element size of the compare operands.
if (VT.getSizeInBits() == SVT.getSizeInBits())
- return DAG.getSetCC(SDLoc(N), VT, N0.getOperand(0),
- N0.getOperand(1),
- cast<CondCodeSDNode>(N0.getOperand(2))->get());
+ return DAG.getSetCC(DL, VT, N00, N01, CC);
// If the desired elements are smaller or larger than the source
- // elements we can use a matching integer vector type and then
- // truncate/sign extend
- EVT MatchingVectorType = N0VT.changeVectorElementTypeToInteger();
- if (SVT == MatchingVectorType) {
- SDValue VsetCC = DAG.getSetCC(SDLoc(N), MatchingVectorType,
- N0.getOperand(0), N0.getOperand(1),
- cast<CondCodeSDNode>(N0.getOperand(2))->get());
- return DAG.getSExtOrTrunc(VsetCC, SDLoc(N), VT);
+ // elements, we can use a matching integer vector type and then
+ // truncate/sign extend.
+ EVT MatchingVecType = N00VT.changeVectorElementTypeToInteger();
+ if (SVT == MatchingVecType) {
+ SDValue VsetCC = DAG.getSetCC(DL, MatchingVecType, N00, N01, CC);
+ return DAG.getSExtOrTrunc(VsetCC, DL, VT);
}
}
@@ -6465,36 +7077,30 @@ SDValue DAGCombiner::visitSIGN_EXTEND(SDNode *N) {
// getBooleanContents().
unsigned SetCCWidth = N0.getScalarValueSizeInBits();
- SDLoc DL(N);
// To determine the "true" side of the select, we need to know the high bit
// of the value returned by the setcc if it evaluates to true.
// If the type of the setcc is i1, then the true case of the select is just
// sext(i1 1), that is, -1.
// If the type of the setcc is larger (say, i8) then the value of the high
- // bit depends on getBooleanContents(). So, ask TLI for a real "true" value
+ // bit depends on getBooleanContents(), so ask TLI for a real "true" value
// of the appropriate width.
- SDValue ExtTrueVal =
- (SetCCWidth == 1)
- ? DAG.getConstant(APInt::getAllOnesValue(VT.getScalarSizeInBits()),
- DL, VT)
- : TLI.getConstTrueVal(DAG, VT, DL);
-
- if (SDValue SCC = SimplifySelectCC(
- DL, N0.getOperand(0), N0.getOperand(1), ExtTrueVal,
- DAG.getConstant(0, DL, VT),
- cast<CondCodeSDNode>(N0.getOperand(2))->get(), true))
+ SDValue ExtTrueVal = (SetCCWidth == 1) ? DAG.getAllOnesConstant(DL, VT)
+ : TLI.getConstTrueVal(DAG, VT, DL);
+ SDValue Zero = DAG.getConstant(0, DL, VT);
+ if (SDValue SCC =
+ SimplifySelectCC(DL, N00, N01, ExtTrueVal, Zero, CC, true))
return SCC;
if (!VT.isVector()) {
- EVT SetCCVT = getSetCCResultType(N0.getOperand(0).getValueType());
- if (!LegalOperations ||
- TLI.isOperationLegal(ISD::SETCC, N0.getOperand(0).getValueType())) {
- SDLoc DL(N);
- ISD::CondCode CC = cast<CondCodeSDNode>(N0.getOperand(2))->get();
- SDValue SetCC =
- DAG.getSetCC(DL, SetCCVT, N0.getOperand(0), N0.getOperand(1), CC);
- return DAG.getSelect(DL, VT, SetCC, ExtTrueVal,
- DAG.getConstant(0, DL, VT));
+ EVT SetCCVT = getSetCCResultType(N00VT);
+ // Don't do this transform for i1 because there's a select transform
+ // that would reverse it.
+      // TODO: Should we avoid this transform entirely without a target hook,
+      // given that a sext is likely cheaper than a select?
+ if (SetCCVT.getScalarSizeInBits() != 1 &&
+ (!LegalOperations || TLI.isOperationLegal(ISD::SETCC, N00VT))) {
+ SDValue SetCC = DAG.getSetCC(DL, SetCCVT, N00, N01, CC);
+ return DAG.getSelect(DL, VT, SetCC, ExtTrueVal, Zero);
}
}
}
@@ -6502,7 +7108,7 @@ SDValue DAGCombiner::visitSIGN_EXTEND(SDNode *N) {
// fold (sext x) -> (zext x) if the sign bit is known zero.
if ((!LegalOperations || TLI.isOperationLegal(ISD::ZERO_EXTEND, VT)) &&
DAG.SignBitIsZero(N0))
- return DAG.getNode(ISD::ZERO_EXTEND, SDLoc(N), VT, N0);
+ return DAG.getNode(ISD::ZERO_EXTEND, DL, VT, N0);
return SDValue();
}
@@ -6677,13 +7283,14 @@ SDValue DAGCombiner::visitZERO_EXTEND(SDNode *N) {
LN0->getChain(),
LN0->getBasePtr(), N0.getValueType(),
LN0->getMemOperand());
- CombineTo(N, ExtLoad);
+
SDValue Trunc = DAG.getNode(ISD::TRUNCATE, SDLoc(N0),
N0.getValueType(), ExtLoad);
CombineTo(N0.getNode(), Trunc, ExtLoad.getValue(1));
ExtendSetCCUses(SetCCs, Trunc, ExtLoad, SDLoc(N),
ISD::ZERO_EXTEND);
+ CombineTo(N, ExtLoad);
return SDValue(N, 0); // Return N so it doesn't get rechecked!
}
}
@@ -6991,9 +7598,25 @@ SDValue DAGCombiner::visitANY_EXTEND(SDNode *N) {
return SDValue();
}
+SDValue DAGCombiner::visitAssertZext(SDNode *N) {
+ SDValue N0 = N->getOperand(0);
+ SDValue N1 = N->getOperand(1);
+ EVT EVT = cast<VTSDNode>(N1)->getVT();
+
+ // fold (assertzext (assertzext x, vt), vt) -> (assertzext x, vt)
+ if (N0.getOpcode() == ISD::AssertZext &&
+ EVT == cast<VTSDNode>(N0.getOperand(1))->getVT())
+ return N0;
+
+ return SDValue();
+}
+
/// See if the specified operand can be simplified with the knowledge that only
/// the bits specified by Mask are used. If so, return the simpler operand,
/// otherwise return a null SDValue.
+///
+/// (This exists alongside SimplifyDemandedBits because GetDemandedBits can
+/// simplify nodes with multiple uses more aggressively.)
SDValue DAGCombiner::GetDemandedBits(SDValue V, const APInt &Mask) {
switch (V.getOpcode()) {
default: break;
@@ -7029,6 +7652,14 @@ SDValue DAGCombiner::GetDemandedBits(SDValue V, const APInt &Mask) {
return DAG.getNode(ISD::SRL, SDLoc(V), V.getValueType(),
SimplifyLHS, V.getOperand(1));
}
+ break;
+ case ISD::AND: {
+ // X & -1 -> X (ignoring bits which aren't demanded).
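+    // E.g. with Mask == 0xFF, (and X, 0x12FF) simplifies to X because the AND
+    // constant covers every demanded bit.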
+ ConstantSDNode *AndVal = isConstOrConstSplat(V.getOperand(1));
+ if (AndVal && (AndVal->getAPIntValue() & Mask) == Mask)
+ return V.getOperand(0);
+ break;
+ }
}
return SDValue();
}
@@ -7244,6 +7875,16 @@ SDValue DAGCombiner::visitSIGN_EXTEND_INREG(SDNode *N) {
return DAG.getNode(ISD::SIGN_EXTEND, SDLoc(N), VT, N00, N1);
}
+  // fold (sext_in_reg (*_extend_vector_inreg x)) -> (sext_vector_inreg x)
+ if ((N0.getOpcode() == ISD::ANY_EXTEND_VECTOR_INREG ||
+ N0.getOpcode() == ISD::SIGN_EXTEND_VECTOR_INREG ||
+ N0.getOpcode() == ISD::ZERO_EXTEND_VECTOR_INREG) &&
+ N0.getOperand(0).getScalarValueSizeInBits() == EVTBits) {
+ if (!LegalOperations ||
+ TLI.isOperationLegal(ISD::SIGN_EXTEND_VECTOR_INREG, VT))
+ return DAG.getSignExtendVectorInReg(N0.getOperand(0), SDLoc(N), VT);
+ }
+
// fold (sext_in_reg (zext x)) -> (sext x)
// iff we are extending the source sign bit.
if (N0.getOpcode() == ISD::ZERO_EXTEND) {
@@ -7254,7 +7895,7 @@ SDValue DAGCombiner::visitSIGN_EXTEND_INREG(SDNode *N) {
}
// fold (sext_in_reg x) -> (zext_in_reg x) if the sign bit is known zero.
- if (DAG.MaskedValueIsZero(N0, APInt::getBitsSet(VTBits, EVTBits-1, EVTBits)))
+ if (DAG.MaskedValueIsZero(N0, APInt::getOneBitSet(VTBits, EVTBits - 1)))
return DAG.getZeroExtendInReg(N0, SDLoc(N), EVT.getScalarType());
// fold operands of sext_in_reg based on knowledge that the top bits are not
@@ -7496,6 +8137,7 @@ SDValue DAGCombiner::visitTRUNCATE(SDNode *N) {
VT.getSizeInBits())))
return DAG.getNode(ISD::TRUNCATE, SDLoc(N), VT, Shorter);
}
+
// fold (truncate (load x)) -> (smaller load x)
// fold (truncate (srl (load x), c)) -> (smaller load (x+c/evtbits))
if (!LegalTypes || TLI.isTypeDesirableForOp(N0.getOpcode(), VT)) {
@@ -7517,6 +8159,7 @@ SDValue DAGCombiner::visitTRUNCATE(SDNode *N) {
}
}
}
+
// fold (trunc (concat ... x ...)) -> (concat ..., (trunc x), ...)),
// where ... are all 'undef'.
if (N0.getOpcode() == ISD::CONCAT_VECTORS && !LegalTypes) {
@@ -7582,6 +8225,18 @@ SDValue DAGCombiner::visitTRUNCATE(SDNode *N) {
SimplifyDemandedBits(SDValue(N, 0)))
return SDValue(N, 0);
+  // fold (trunc (adde X, Y, Carry)) -> (adde (trunc X), (trunc Y), Carry),
+  // when the adde's carry output is not used.
+ if (N0.getOpcode() == ISD::ADDE && N0.hasOneUse() &&
+ !N0.getNode()->hasAnyUseOfValue(1) &&
+ (!LegalOperations || TLI.isOperationLegal(ISD::ADDE, VT))) {
+ SDLoc SL(N);
+ auto X = DAG.getNode(ISD::TRUNCATE, SL, VT, N0.getOperand(0));
+ auto Y = DAG.getNode(ISD::TRUNCATE, SL, VT, N0.getOperand(1));
+ return DAG.getNode(ISD::ADDE, SL, DAG.getVTList(VT, MVT::Glue),
+ X, Y, N0.getOperand(2));
+ }
+
return SDValue();
}
@@ -7672,6 +8327,9 @@ SDValue DAGCombiner::visitBITCAST(SDNode *N) {
SDValue N0 = N->getOperand(0);
EVT VT = N->getValueType(0);
+ if (N0.isUndef())
+ return DAG.getUNDEF(VT);
+
// If the input is a BUILD_VECTOR with all constant elements, fold this now.
// Only do this before legalize, since afterward the target may be depending
// on the bitconvert.
@@ -8040,6 +8698,11 @@ ConstantFoldBITCASTofBUILD_VECTOR(SDNode *BV, EVT DstEltVT) {
return DAG.getBuildVector(VT, DL, Ops);
}
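+
+/// Returns true if floating-point contraction is allowed on this node, i.e.
+/// it carries either the 'contract' or the unsafe-algebra fast-math flag.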
+static bool isContractable(SDNode *N) {
+ SDNodeFlags F = cast<BinaryWithFlagsSDNode>(N)->Flags;
+ return F.hasAllowContract() || F.hasUnsafeAlgebra();
+}
+
/// Try to perform FMA combining on a given FADD node.
SDValue DAGCombiner::visitFADDForFMACombine(SDNode *N) {
SDValue N0 = N->getOperand(0);
@@ -8048,24 +8711,27 @@ SDValue DAGCombiner::visitFADDForFMACombine(SDNode *N) {
SDLoc SL(N);
const TargetOptions &Options = DAG.getTarget().Options;
- bool AllowFusion =
- (Options.AllowFPOpFusion == FPOpFusion::Fast || Options.UnsafeFPMath);
// Floating-point multiply-add with intermediate rounding.
bool HasFMAD = (LegalOperations && TLI.isOperationLegal(ISD::FMAD, VT));
// Floating-point multiply-add without intermediate rounding.
bool HasFMA =
- AllowFusion && TLI.isFMAFasterThanFMulAndFAdd(VT) &&
+ TLI.isFMAFasterThanFMulAndFAdd(VT) &&
(!LegalOperations || TLI.isOperationLegalOrCustom(ISD::FMA, VT));
// No valid opcode, do not combine.
if (!HasFMAD && !HasFMA)
return SDValue();
+ bool AllowFusionGlobally = (Options.AllowFPOpFusion == FPOpFusion::Fast ||
+ Options.UnsafeFPMath || HasFMAD);
+ // If the addition is not contractable, do not combine.
+ if (!AllowFusionGlobally && !isContractable(N))
+ return SDValue();
+
const SelectionDAGTargetInfo *STI = DAG.getSubtarget().getSelectionDAGInfo();
- ;
- if (AllowFusion && STI && STI->generateFMAsInMachineCombiner(OptLevel))
+ if (STI && STI->generateFMAsInMachineCombiner(OptLevel))
return SDValue();
// Always prefer FMAD to FMA for precision.
@@ -8073,35 +8739,39 @@ SDValue DAGCombiner::visitFADDForFMACombine(SDNode *N) {
bool Aggressive = TLI.enableAggressiveFMAFusion(VT);
bool LookThroughFPExt = TLI.isFPExtFree(VT);
+  // Returns true if the node is an FMUL and contractable, either due to the
+  // global flags or its own SDNodeFlags.
+ auto isContractableFMUL = [AllowFusionGlobally](SDValue N) {
+ if (N.getOpcode() != ISD::FMUL)
+ return false;
+ return AllowFusionGlobally || isContractable(N.getNode());
+ };
// If we have two choices trying to fold (fadd (fmul u, v), (fmul x, y)),
// prefer to fold the multiply with fewer uses.
- if (Aggressive && N0.getOpcode() == ISD::FMUL &&
- N1.getOpcode() == ISD::FMUL) {
+ if (Aggressive && isContractableFMUL(N0) && isContractableFMUL(N1)) {
if (N0.getNode()->use_size() > N1.getNode()->use_size())
std::swap(N0, N1);
}
// fold (fadd (fmul x, y), z) -> (fma x, y, z)
- if (N0.getOpcode() == ISD::FMUL &&
- (Aggressive || N0->hasOneUse())) {
+ if (isContractableFMUL(N0) && (Aggressive || N0->hasOneUse())) {
return DAG.getNode(PreferredFusedOpcode, SL, VT,
N0.getOperand(0), N0.getOperand(1), N1);
}
// fold (fadd x, (fmul y, z)) -> (fma y, z, x)
// Note: Commutes FADD operands.
- if (N1.getOpcode() == ISD::FMUL &&
- (Aggressive || N1->hasOneUse())) {
+ if (isContractableFMUL(N1) && (Aggressive || N1->hasOneUse())) {
return DAG.getNode(PreferredFusedOpcode, SL, VT,
N1.getOperand(0), N1.getOperand(1), N0);
}
// Look through FP_EXTEND nodes to do more combining.
- if (AllowFusion && LookThroughFPExt) {
+ if (LookThroughFPExt) {
// fold (fadd (fpext (fmul x, y)), z) -> (fma (fpext x), (fpext y), z)
if (N0.getOpcode() == ISD::FP_EXTEND) {
SDValue N00 = N0.getOperand(0);
- if (N00.getOpcode() == ISD::FMUL)
+ if (isContractableFMUL(N00))
return DAG.getNode(PreferredFusedOpcode, SL, VT,
DAG.getNode(ISD::FP_EXTEND, SL, VT,
N00.getOperand(0)),
@@ -8113,7 +8783,7 @@ SDValue DAGCombiner::visitFADDForFMACombine(SDNode *N) {
// Note: Commutes FADD operands.
if (N1.getOpcode() == ISD::FP_EXTEND) {
SDValue N10 = N1.getOperand(0);
- if (N10.getOpcode() == ISD::FMUL)
+ if (isContractableFMUL(N10))
return DAG.getNode(PreferredFusedOpcode, SL, VT,
DAG.getNode(ISD::FP_EXTEND, SL, VT,
N10.getOperand(0)),
@@ -8154,7 +8824,7 @@ SDValue DAGCombiner::visitFADDForFMACombine(SDNode *N) {
N0));
}
- if (AllowFusion && LookThroughFPExt) {
+ if (LookThroughFPExt) {
// fold (fadd (fma x, y, (fpext (fmul u, v))), z)
// -> (fma x, y, (fma (fpext u), (fpext v), z))
auto FoldFAddFMAFPExtFMul = [&] (
@@ -8169,7 +8839,7 @@ SDValue DAGCombiner::visitFADDForFMACombine(SDNode *N) {
SDValue N02 = N0.getOperand(2);
if (N02.getOpcode() == ISD::FP_EXTEND) {
SDValue N020 = N02.getOperand(0);
- if (N020.getOpcode() == ISD::FMUL)
+ if (isContractableFMUL(N020))
return FoldFAddFMAFPExtFMul(N0.getOperand(0), N0.getOperand(1),
N020.getOperand(0), N020.getOperand(1),
N1);
@@ -8195,7 +8865,7 @@ SDValue DAGCombiner::visitFADDForFMACombine(SDNode *N) {
SDValue N00 = N0.getOperand(0);
if (N00.getOpcode() == PreferredFusedOpcode) {
SDValue N002 = N00.getOperand(2);
- if (N002.getOpcode() == ISD::FMUL)
+ if (isContractableFMUL(N002))
return FoldFAddFPExtFMAFMul(N00.getOperand(0), N00.getOperand(1),
N002.getOperand(0), N002.getOperand(1),
N1);
@@ -8208,7 +8878,7 @@ SDValue DAGCombiner::visitFADDForFMACombine(SDNode *N) {
SDValue N12 = N1.getOperand(2);
if (N12.getOpcode() == ISD::FP_EXTEND) {
SDValue N120 = N12.getOperand(0);
- if (N120.getOpcode() == ISD::FMUL)
+ if (isContractableFMUL(N120))
return FoldFAddFMAFPExtFMul(N1.getOperand(0), N1.getOperand(1),
N120.getOperand(0), N120.getOperand(1),
N0);
@@ -8224,7 +8894,7 @@ SDValue DAGCombiner::visitFADDForFMACombine(SDNode *N) {
SDValue N10 = N1.getOperand(0);
if (N10.getOpcode() == PreferredFusedOpcode) {
SDValue N102 = N10.getOperand(2);
- if (N102.getOpcode() == ISD::FMUL)
+ if (isContractableFMUL(N102))
return FoldFAddFPExtFMAFMul(N10.getOperand(0), N10.getOperand(1),
N102.getOperand(0), N102.getOperand(1),
N0);
@@ -8244,23 +8914,26 @@ SDValue DAGCombiner::visitFSUBForFMACombine(SDNode *N) {
SDLoc SL(N);
const TargetOptions &Options = DAG.getTarget().Options;
- bool AllowFusion =
- (Options.AllowFPOpFusion == FPOpFusion::Fast || Options.UnsafeFPMath);
-
// Floating-point multiply-add with intermediate rounding.
bool HasFMAD = (LegalOperations && TLI.isOperationLegal(ISD::FMAD, VT));
// Floating-point multiply-add without intermediate rounding.
bool HasFMA =
- AllowFusion && TLI.isFMAFasterThanFMulAndFAdd(VT) &&
+ TLI.isFMAFasterThanFMulAndFAdd(VT) &&
(!LegalOperations || TLI.isOperationLegalOrCustom(ISD::FMA, VT));
// No valid opcode, do not combine.
if (!HasFMAD && !HasFMA)
return SDValue();
+ bool AllowFusionGlobally = (Options.AllowFPOpFusion == FPOpFusion::Fast ||
+ Options.UnsafeFPMath || HasFMAD);
+ // If the subtraction is not contractable, do not combine.
+ if (!AllowFusionGlobally && !isContractable(N))
+ return SDValue();
+
const SelectionDAGTargetInfo *STI = DAG.getSubtarget().getSelectionDAGInfo();
- if (AllowFusion && STI && STI->generateFMAsInMachineCombiner(OptLevel))
+ if (STI && STI->generateFMAsInMachineCombiner(OptLevel))
return SDValue();
// Always prefer FMAD to FMA for precision.
@@ -8268,9 +8941,16 @@ SDValue DAGCombiner::visitFSUBForFMACombine(SDNode *N) {
bool Aggressive = TLI.enableAggressiveFMAFusion(VT);
bool LookThroughFPExt = TLI.isFPExtFree(VT);
+  // Returns true if the node is an FMUL and contractable, either due to the
+  // global flags or its own SDNodeFlags.
+ auto isContractableFMUL = [AllowFusionGlobally](SDValue N) {
+ if (N.getOpcode() != ISD::FMUL)
+ return false;
+ return AllowFusionGlobally || isContractable(N.getNode());
+ };
+
// fold (fsub (fmul x, y), z) -> (fma x, y, (fneg z))
- if (N0.getOpcode() == ISD::FMUL &&
- (Aggressive || N0->hasOneUse())) {
+ if (isContractableFMUL(N0) && (Aggressive || N0->hasOneUse())) {
return DAG.getNode(PreferredFusedOpcode, SL, VT,
N0.getOperand(0), N0.getOperand(1),
DAG.getNode(ISD::FNEG, SL, VT, N1));
@@ -8278,16 +8958,14 @@ SDValue DAGCombiner::visitFSUBForFMACombine(SDNode *N) {
// fold (fsub x, (fmul y, z)) -> (fma (fneg y), z, x)
// Note: Commutes FSUB operands.
- if (N1.getOpcode() == ISD::FMUL &&
- (Aggressive || N1->hasOneUse()))
+ if (isContractableFMUL(N1) && (Aggressive || N1->hasOneUse()))
return DAG.getNode(PreferredFusedOpcode, SL, VT,
DAG.getNode(ISD::FNEG, SL, VT,
N1.getOperand(0)),
N1.getOperand(1), N0);
// fold (fsub (fneg (fmul, x, y)), z) -> (fma (fneg x), y, (fneg z))
- if (N0.getOpcode() == ISD::FNEG &&
- N0.getOperand(0).getOpcode() == ISD::FMUL &&
+ if (N0.getOpcode() == ISD::FNEG && isContractableFMUL(N0.getOperand(0)) &&
(Aggressive || (N0->hasOneUse() && N0.getOperand(0).hasOneUse()))) {
SDValue N00 = N0.getOperand(0).getOperand(0);
SDValue N01 = N0.getOperand(0).getOperand(1);
@@ -8297,12 +8975,12 @@ SDValue DAGCombiner::visitFSUBForFMACombine(SDNode *N) {
}
// Look through FP_EXTEND nodes to do more combining.
- if (AllowFusion && LookThroughFPExt) {
+ if (LookThroughFPExt) {
// fold (fsub (fpext (fmul x, y)), z)
// -> (fma (fpext x), (fpext y), (fneg z))
if (N0.getOpcode() == ISD::FP_EXTEND) {
SDValue N00 = N0.getOperand(0);
- if (N00.getOpcode() == ISD::FMUL)
+ if (isContractableFMUL(N00))
return DAG.getNode(PreferredFusedOpcode, SL, VT,
DAG.getNode(ISD::FP_EXTEND, SL, VT,
N00.getOperand(0)),
@@ -8316,7 +8994,7 @@ SDValue DAGCombiner::visitFSUBForFMACombine(SDNode *N) {
// Note: Commutes FSUB operands.
if (N1.getOpcode() == ISD::FP_EXTEND) {
SDValue N10 = N1.getOperand(0);
- if (N10.getOpcode() == ISD::FMUL)
+ if (isContractableFMUL(N10))
return DAG.getNode(PreferredFusedOpcode, SL, VT,
DAG.getNode(ISD::FNEG, SL, VT,
DAG.getNode(ISD::FP_EXTEND, SL, VT,
@@ -8336,7 +9014,7 @@ SDValue DAGCombiner::visitFSUBForFMACombine(SDNode *N) {
SDValue N00 = N0.getOperand(0);
if (N00.getOpcode() == ISD::FNEG) {
SDValue N000 = N00.getOperand(0);
- if (N000.getOpcode() == ISD::FMUL) {
+ if (isContractableFMUL(N000)) {
return DAG.getNode(ISD::FNEG, SL, VT,
DAG.getNode(PreferredFusedOpcode, SL, VT,
DAG.getNode(ISD::FP_EXTEND, SL, VT,
@@ -8358,7 +9036,7 @@ SDValue DAGCombiner::visitFSUBForFMACombine(SDNode *N) {
SDValue N00 = N0.getOperand(0);
if (N00.getOpcode() == ISD::FP_EXTEND) {
SDValue N000 = N00.getOperand(0);
- if (N000.getOpcode() == ISD::FMUL) {
+ if (isContractableFMUL(N000)) {
return DAG.getNode(ISD::FNEG, SL, VT,
DAG.getNode(PreferredFusedOpcode, SL, VT,
DAG.getNode(ISD::FP_EXTEND, SL, VT,
@@ -8378,10 +9056,9 @@ SDValue DAGCombiner::visitFSUBForFMACombine(SDNode *N) {
// -> (fma x, y (fma u, v, (fneg z)))
// FIXME: The UnsafeAlgebra flag should be propagated to FMA/FMAD, but FMF
// are currently only supported on binary nodes.
- if (Options.UnsafeFPMath &&
- N0.getOpcode() == PreferredFusedOpcode &&
- N0.getOperand(2).getOpcode() == ISD::FMUL &&
- N0->hasOneUse() && N0.getOperand(2)->hasOneUse()) {
+ if (Options.UnsafeFPMath && N0.getOpcode() == PreferredFusedOpcode &&
+ isContractableFMUL(N0.getOperand(2)) && N0->hasOneUse() &&
+ N0.getOperand(2)->hasOneUse()) {
return DAG.getNode(PreferredFusedOpcode, SL, VT,
N0.getOperand(0), N0.getOperand(1),
DAG.getNode(PreferredFusedOpcode, SL, VT,
@@ -8395,9 +9072,8 @@ SDValue DAGCombiner::visitFSUBForFMACombine(SDNode *N) {
// -> (fma (fneg y), z, (fma (fneg u), v, x))
// FIXME: The UnsafeAlgebra flag should be propagated to FMA/FMAD, but FMF
// are currently only supported on binary nodes.
- if (Options.UnsafeFPMath &&
- N1.getOpcode() == PreferredFusedOpcode &&
- N1.getOperand(2).getOpcode() == ISD::FMUL) {
+ if (Options.UnsafeFPMath && N1.getOpcode() == PreferredFusedOpcode &&
+ isContractableFMUL(N1.getOperand(2))) {
SDValue N20 = N1.getOperand(2).getOperand(0);
SDValue N21 = N1.getOperand(2).getOperand(1);
return DAG.getNode(PreferredFusedOpcode, SL, VT,
@@ -8410,14 +9086,14 @@ SDValue DAGCombiner::visitFSUBForFMACombine(SDNode *N) {
N21, N0));
}
- if (AllowFusion && LookThroughFPExt) {
+ if (LookThroughFPExt) {
// fold (fsub (fma x, y, (fpext (fmul u, v))), z)
// -> (fma x, y (fma (fpext u), (fpext v), (fneg z)))
if (N0.getOpcode() == PreferredFusedOpcode) {
SDValue N02 = N0.getOperand(2);
if (N02.getOpcode() == ISD::FP_EXTEND) {
SDValue N020 = N02.getOperand(0);
- if (N020.getOpcode() == ISD::FMUL)
+ if (isContractableFMUL(N020))
return DAG.getNode(PreferredFusedOpcode, SL, VT,
N0.getOperand(0), N0.getOperand(1),
DAG.getNode(PreferredFusedOpcode, SL, VT,
@@ -8440,7 +9116,7 @@ SDValue DAGCombiner::visitFSUBForFMACombine(SDNode *N) {
SDValue N00 = N0.getOperand(0);
if (N00.getOpcode() == PreferredFusedOpcode) {
SDValue N002 = N00.getOperand(2);
- if (N002.getOpcode() == ISD::FMUL)
+ if (isContractableFMUL(N002))
return DAG.getNode(PreferredFusedOpcode, SL, VT,
DAG.getNode(ISD::FP_EXTEND, SL, VT,
N00.getOperand(0)),
@@ -8461,7 +9137,7 @@ SDValue DAGCombiner::visitFSUBForFMACombine(SDNode *N) {
if (N1.getOpcode() == PreferredFusedOpcode &&
N1.getOperand(2).getOpcode() == ISD::FP_EXTEND) {
SDValue N120 = N1.getOperand(2).getOperand(0);
- if (N120.getOpcode() == ISD::FMUL) {
+ if (isContractableFMUL(N120)) {
SDValue N1200 = N120.getOperand(0);
SDValue N1201 = N120.getOperand(1);
return DAG.getNode(PreferredFusedOpcode, SL, VT,
@@ -8488,7 +9164,7 @@ SDValue DAGCombiner::visitFSUBForFMACombine(SDNode *N) {
SDValue N100 = N1.getOperand(0).getOperand(0);
SDValue N101 = N1.getOperand(0).getOperand(1);
SDValue N102 = N1.getOperand(0).getOperand(2);
- if (N102.getOpcode() == ISD::FMUL) {
+ if (isContractableFMUL(N102)) {
SDValue N1020 = N102.getOperand(0);
SDValue N1021 = N102.getOperand(1);
return DAG.getNode(PreferredFusedOpcode, SL, VT,
@@ -8624,6 +9300,9 @@ SDValue DAGCombiner::visitFADD(SDNode *N) {
if (N0CFP && !N1CFP)
return DAG.getNode(ISD::FADD, DL, VT, N1, N0, Flags);
+ if (SDValue NewSel = foldBinOpIntoSelect(N))
+ return NewSel;
+
// fold (fadd A, (fneg B)) -> (fsub A, B)
if ((!LegalOperations || TLI.isOperationLegalOrCustom(ISD::FSUB, VT)) &&
isNegatibleForFree(N1, LegalOperations, TLI, &Options) == 2)
@@ -8637,7 +9316,7 @@ SDValue DAGCombiner::visitFADD(SDNode *N) {
GetNegatedExpression(N0, DAG, LegalOperations), Flags);
// FIXME: Auto-upgrade the target/function-level option.
- if (Options.UnsafeFPMath || N->getFlags()->hasNoSignedZeros()) {
+ if (Options.NoSignedZerosFPMath || N->getFlags()->hasNoSignedZeros()) {
// fold (fadd A, 0) -> A
if (ConstantFPSDNode *N1C = isConstOrConstSplatFP(N1))
if (N1C->isZero())
@@ -8771,13 +9450,16 @@ SDValue DAGCombiner::visitFSUB(SDNode *N) {
if (N0CFP && N1CFP)
return DAG.getNode(ISD::FSUB, DL, VT, N0, N1, Flags);
+ if (SDValue NewSel = foldBinOpIntoSelect(N))
+ return NewSel;
+
// fold (fsub A, (fneg B)) -> (fadd A, B)
if (isNegatibleForFree(N1, LegalOperations, TLI, &Options))
return DAG.getNode(ISD::FADD, DL, VT, N0,
GetNegatedExpression(N1, DAG, LegalOperations), Flags);
// FIXME: Auto-upgrade the target/function-level option.
- if (Options.UnsafeFPMath || N->getFlags()->hasNoSignedZeros()) {
+ if (Options.NoSignedZerosFPMath || N->getFlags()->hasNoSignedZeros()) {
// (fsub 0, B) -> -B
if (N0CFP && N0CFP->isZero()) {
if (isNegatibleForFree(N1, LegalOperations, TLI, &Options))
@@ -8850,6 +9532,9 @@ SDValue DAGCombiner::visitFMUL(SDNode *N) {
if (N1CFP && N1CFP->isExactlyValue(1.0))
return N0;
+ if (SDValue NewSel = foldBinOpIntoSelect(N))
+ return NewSel;
+
if (Options.UnsafeFPMath) {
// fold (fmul A, 0) -> 0
if (N1CFP && N1CFP->isZero())
@@ -9104,6 +9789,9 @@ SDValue DAGCombiner::visitFDIV(SDNode *N) {
if (N0CFP && N1CFP)
return DAG.getNode(ISD::FDIV, SDLoc(N), VT, N0, N1, Flags);
+ if (SDValue NewSel = foldBinOpIntoSelect(N))
+ return NewSel;
+
if (Options.UnsafeFPMath) {
// fold (fdiv X, c2) -> fmul X, 1/c2 if losing precision is acceptable.
if (N1CFP) {
@@ -9207,6 +9895,9 @@ SDValue DAGCombiner::visitFREM(SDNode *N) {
return DAG.getNode(ISD::FREM, SDLoc(N), VT, N0, N1,
&cast<BinaryWithFlagsSDNode>(N)->Flags);
+ if (SDValue NewSel = foldBinOpIntoSelect(N))
+ return NewSel;
+
return SDValue();
}
@@ -10361,7 +11052,7 @@ SDValue DAGCombiner::visitLOAD(SDNode *N) {
dbgs() << "\n");
WorklistRemover DeadNodes(*this);
DAG.ReplaceAllUsesOfValueWith(SDValue(N, 1), Chain);
-
+ AddUsersToWorklist(Chain.getNode());
if (N->use_empty())
deleteAndRecombine(N);
@@ -10414,7 +11105,7 @@ SDValue DAGCombiner::visitLOAD(SDNode *N) {
StoreSDNode *PrevST = cast<StoreSDNode>(Chain);
if (PrevST->getBasePtr() == Ptr &&
PrevST->getValue().getValueType() == N->getValueType(0))
- return CombineTo(N, Chain.getOperand(1), Chain);
+ return CombineTo(N, PrevST->getOperand(1), Chain);
}
}
@@ -10432,14 +11123,7 @@ SDValue DAGCombiner::visitLOAD(SDNode *N) {
}
}
- bool UseAA = CombinerAA.getNumOccurrences() > 0 ? CombinerAA
- : DAG.getSubtarget().useAA();
-#ifndef NDEBUG
- if (CombinerAAOnlyFunc.getNumOccurrences() &&
- CombinerAAOnlyFunc != DAG.getMachineFunction().getName())
- UseAA = false;
-#endif
- if (UseAA && LD->isUnindexed()) {
+ if (LD->isUnindexed()) {
// Walk up chain skipping non-aliasing memory nodes.
SDValue BetterChain = FindBetterChain(N, Chain);
@@ -11021,6 +11705,7 @@ bool DAGCombiner::SliceUpLoad(SDNode *N) {
SDValue Chain = DAG.getNode(ISD::TokenFactor, SDLoc(LD), MVT::Other,
ArgChains);
DAG.ReplaceAllUsesOfValueWith(SDValue(N, 1), Chain);
+ AddToWorklist(Chain.getNode());
return true;
}
@@ -11414,18 +12099,24 @@ bool DAGCombiner::isMulAddWithConstProfitable(SDNode *MulNode,
return false;
}
-SDValue DAGCombiner::getMergedConstantVectorStore(
- SelectionDAG &DAG, const SDLoc &SL, ArrayRef<MemOpLink> Stores,
- SmallVectorImpl<SDValue> &Chains, EVT Ty) const {
- SmallVector<SDValue, 8> BuildVector;
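+// Builds a TokenFactor over the input chains of the stores being merged,
+// skipping any chain that is itself one of the merged stores so the new chain
+// does not depend on the stores it replaces.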
+SDValue DAGCombiner::getMergeStoreChains(SmallVectorImpl<MemOpLink> &StoreNodes,
+ unsigned NumStores) {
+ SmallVector<SDValue, 8> Chains;
+ SmallPtrSet<const SDNode *, 8> Visited;
+ SDLoc StoreDL(StoreNodes[0].MemNode);
+
+ for (unsigned i = 0; i < NumStores; ++i) {
+ Visited.insert(StoreNodes[i].MemNode);
+ }
- for (unsigned I = 0, E = Ty.getVectorNumElements(); I != E; ++I) {
- StoreSDNode *St = cast<StoreSDNode>(Stores[I].MemNode);
- Chains.push_back(St->getChain());
- BuildVector.push_back(St->getValue());
+  // Don't include a store's chain when the chain node is itself one of the
+  // stores being merged; the TokenFactor must not depend on replaced stores.
+ for (unsigned i = 0; i < NumStores; ++i) {
+ if (Visited.count(StoreNodes[i].MemNode->getChain().getNode()) == 0)
+ Chains.push_back(StoreNodes[i].MemNode->getChain());
}
- return DAG.getBuildVector(Ty, SL, BuildVector);
+  assert(Chains.size() > 0 && "Must gather at least one store chain");
+ return DAG.getNode(ISD::TokenFactor, StoreDL, MVT::Other, Chains);
}
bool DAGCombiner::MergeStoresOfConstantsOrVecElts(
@@ -11436,22 +12127,8 @@ bool DAGCombiner::MergeStoresOfConstantsOrVecElts(
return false;
int64_t ElementSizeBytes = MemVT.getSizeInBits() / 8;
- LSBaseSDNode *FirstInChain = StoreNodes[0].MemNode;
- unsigned LatestNodeUsed = 0;
-
- for (unsigned i=0; i < NumStores; ++i) {
- // Find a chain for the new wide-store operand. Notice that some
- // of the store nodes that we found may not be selected for inclusion
- // in the wide store. The chain we use needs to be the chain of the
- // latest store node which is *used* and replaced by the wide store.
- if (StoreNodes[i].SequenceNum < StoreNodes[LatestNodeUsed].SequenceNum)
- LatestNodeUsed = i;
- }
-
- SmallVector<SDValue, 8> Chains;
// The latest Node in the DAG.
- LSBaseSDNode *LatestOp = StoreNodes[LatestNodeUsed].MemNode;
SDLoc DL(StoreNodes[0].MemNode);
SDValue StoredVal;
@@ -11467,7 +12144,18 @@ bool DAGCombiner::MergeStoresOfConstantsOrVecElts(
assert(TLI.isTypeLegal(Ty) && "Illegal vector store");
if (IsConstantSrc) {
- StoredVal = getMergedConstantVectorStore(DAG, DL, StoreNodes, Chains, Ty);
+ SmallVector<SDValue, 8> BuildVector;
+ for (unsigned I = 0, E = Ty.getVectorNumElements(); I != E; ++I) {
+ StoreSDNode *St = cast<StoreSDNode>(StoreNodes[I].MemNode);
+ SDValue Val = St->getValue();
+ if (MemVT.getScalarType().isInteger())
+ if (auto *CFP = dyn_cast<ConstantFPSDNode>(St->getValue()))
+ Val = DAG.getConstant(
+ (uint32_t)CFP->getValueAPF().bitcastToAPInt().getZExtValue(),
+ SDLoc(CFP), MemVT);
+ BuildVector.push_back(Val);
+ }
+ StoredVal = DAG.getBuildVector(Ty, DL, BuildVector);
} else {
SmallVector<SDValue, 8> Ops;
for (unsigned i = 0; i < NumStores; ++i) {
@@ -11477,7 +12165,6 @@ bool DAGCombiner::MergeStoresOfConstantsOrVecElts(
if (Val.getValueType() != MemVT)
return false;
Ops.push_back(Val);
- Chains.push_back(St->getChain());
}
// Build the extracted vector elements back into a vector.
@@ -11497,7 +12184,6 @@ bool DAGCombiner::MergeStoresOfConstantsOrVecElts(
for (unsigned i = 0; i < NumStores; ++i) {
unsigned Idx = IsLE ? (NumStores - 1 - i) : i;
StoreSDNode *St = cast<StoreSDNode>(StoreNodes[Idx].MemNode);
- Chains.push_back(St->getChain());
SDValue Val = St->getValue();
StoreInt <<= ElementSizeBytes * 8;
@@ -11515,54 +12201,27 @@ bool DAGCombiner::MergeStoresOfConstantsOrVecElts(
StoredVal = DAG.getConstant(StoreInt, DL, StoreTy);
}
- assert(!Chains.empty());
-
- SDValue NewChain = DAG.getNode(ISD::TokenFactor, DL, MVT::Other, Chains);
+ LSBaseSDNode *FirstInChain = StoreNodes[0].MemNode;
+ SDValue NewChain = getMergeStoreChains(StoreNodes, NumStores);
SDValue NewStore = DAG.getStore(NewChain, DL, StoredVal,
FirstInChain->getBasePtr(),
FirstInChain->getPointerInfo(),
FirstInChain->getAlignment());
- bool UseAA = CombinerAA.getNumOccurrences() > 0 ? CombinerAA
- : DAG.getSubtarget().useAA();
- if (UseAA) {
- // Replace all merged stores with the new store.
- for (unsigned i = 0; i < NumStores; ++i)
- CombineTo(StoreNodes[i].MemNode, NewStore);
- } else {
- // Replace the last store with the new store.
- CombineTo(LatestOp, NewStore);
- // Erase all other stores.
- for (unsigned i = 0; i < NumStores; ++i) {
- if (StoreNodes[i].MemNode == LatestOp)
- continue;
- StoreSDNode *St = cast<StoreSDNode>(StoreNodes[i].MemNode);
- // ReplaceAllUsesWith will replace all uses that existed when it was
- // called, but graph optimizations may cause new ones to appear. For
- // example, the case in pr14333 looks like
- //
- // St's chain -> St -> another store -> X
- //
- // And the only difference from St to the other store is the chain.
- // When we change it's chain to be St's chain they become identical,
- // get CSEed and the net result is that X is now a use of St.
- // Since we know that St is redundant, just iterate.
- while (!St->use_empty())
- DAG.ReplaceAllUsesWith(SDValue(St, 0), St->getChain());
- deleteAndRecombine(St);
- }
- }
+ // Replace all merged stores with the new store.
+ for (unsigned i = 0; i < NumStores; ++i)
+ CombineTo(StoreNodes[i].MemNode, NewStore);
- StoreNodes.erase(StoreNodes.begin() + NumStores, StoreNodes.end());
+ AddToWorklist(NewChain.getNode());
return true;
}
-void DAGCombiner::getStoreMergeAndAliasCandidates(
- StoreSDNode* St, SmallVectorImpl<MemOpLink> &StoreNodes,
- SmallVectorImpl<LSBaseSDNode*> &AliasLoadNodes) {
+void DAGCombiner::getStoreMergeCandidates(
+ StoreSDNode *St, SmallVectorImpl<MemOpLink> &StoreNodes) {
// This holds the base pointer, index, and the offset in bytes from the base
// pointer.
BaseIndexOffset BasePtr = BaseIndexOffset::match(St->getBasePtr(), DAG);
+ EVT MemVT = St->getMemoryVT();
// We must have a base and an offset.
if (!BasePtr.Base.getNode())
@@ -11572,104 +12231,71 @@ void DAGCombiner::getStoreMergeAndAliasCandidates(
if (BasePtr.Base.isUndef())
return;
- // Walk up the chain and look for nodes with offsets from the same
- // base pointer. Stop when reaching an instruction with a different kind
- // or instruction which has a different base pointer.
- EVT MemVT = St->getMemoryVT();
- unsigned Seq = 0;
- StoreSDNode *Index = St;
-
-
- bool UseAA = CombinerAA.getNumOccurrences() > 0 ? CombinerAA
- : DAG.getSubtarget().useAA();
-
- if (UseAA) {
- // Look at other users of the same chain. Stores on the same chain do not
- // alias. If combiner-aa is enabled, non-aliasing stores are canonicalized
- // to be on the same chain, so don't bother looking at adjacent chains.
-
- SDValue Chain = St->getChain();
- for (auto I = Chain->use_begin(), E = Chain->use_end(); I != E; ++I) {
- if (StoreSDNode *OtherST = dyn_cast<StoreSDNode>(*I)) {
- if (I.getOperandNo() != 0)
- continue;
-
- if (OtherST->isVolatile() || OtherST->isIndexed())
- continue;
-
- if (OtherST->getMemoryVT() != MemVT)
- continue;
-
- BaseIndexOffset Ptr = BaseIndexOffset::match(OtherST->getBasePtr(), DAG);
-
- if (Ptr.equalBaseIndex(BasePtr))
- StoreNodes.push_back(MemOpLink(OtherST, Ptr.Offset, Seq++));
- }
- }
-
- return;
- }
-
- while (Index) {
- // If the chain has more than one use, then we can't reorder the mem ops.
- if (Index != St && !SDValue(Index, 0)->hasOneUse())
- break;
-
- // Find the base pointer and offset for this memory node.
- BaseIndexOffset Ptr = BaseIndexOffset::match(Index->getBasePtr(), DAG);
-
- // Check that the base pointer is the same as the original one.
- if (!Ptr.equalBaseIndex(BasePtr))
- break;
-
- // The memory operands must not be volatile.
- if (Index->isVolatile() || Index->isIndexed())
- break;
-
- // No truncation.
- if (Index->isTruncatingStore())
- break;
-
- // The stored memory type must be the same.
- if (Index->getMemoryVT() != MemVT)
- break;
-
- // We do not allow under-aligned stores in order to prevent
- // overriding stores. NOTE: this is a bad hack. Alignment SHOULD
- // be irrelevant here; what MATTERS is that we not move memory
- // operations that potentially overlap past each-other.
- if (Index->getAlignment() < MemVT.getStoreSize())
- break;
-
- // We found a potential memory operand to merge.
- StoreNodes.push_back(MemOpLink(Index, Ptr.Offset, Seq++));
-
- // Find the next memory operand in the chain. If the next operand in the
- // chain is a store then move up and continue the scan with the next
- // memory operand. If the next operand is a load save it and use alias
- // information to check if it interferes with anything.
- SDNode *NextInChain = Index->getChain().getNode();
- while (1) {
- if (StoreSDNode *STn = dyn_cast<StoreSDNode>(NextInChain)) {
- // We found a store node. Use it for the next iteration.
- Index = STn;
- break;
- } else if (LoadSDNode *Ldn = dyn_cast<LoadSDNode>(NextInChain)) {
- if (Ldn->isVolatile()) {
- Index = nullptr;
- break;
+ bool IsLoadSrc = isa<LoadSDNode>(St->getValue());
+ bool IsConstantSrc = isa<ConstantSDNode>(St->getValue()) ||
+ isa<ConstantFPSDNode>(St->getValue());
+ bool IsExtractVecSrc =
+ (St->getValue().getOpcode() == ISD::EXTRACT_VECTOR_ELT ||
+ St->getValue().getOpcode() == ISD::EXTRACT_SUBVECTOR);
+ auto CandidateMatch = [&](StoreSDNode *Other, BaseIndexOffset &Ptr) -> bool {
+ if (Other->isVolatile() || Other->isIndexed())
+ return false;
+ // We can merge constant floats to equivalent integers
+ if (Other->getMemoryVT() != MemVT)
+ if (!(MemVT.isInteger() && MemVT.bitsEq(Other->getMemoryVT()) &&
+ isa<ConstantFPSDNode>(Other->getValue())))
+ return false;
+ if (IsLoadSrc)
+ if (!isa<LoadSDNode>(Other->getValue()))
+ return false;
+ if (IsConstantSrc)
+ if (!(isa<ConstantSDNode>(Other->getValue()) ||
+ isa<ConstantFPSDNode>(Other->getValue())))
+ return false;
+ if (IsExtractVecSrc)
+ if (!(Other->getValue().getOpcode() == ISD::EXTRACT_VECTOR_ELT ||
+ Other->getValue().getOpcode() == ISD::EXTRACT_SUBVECTOR))
+ return false;
+ Ptr = BaseIndexOffset::match(Other->getBasePtr(), DAG);
+ return (Ptr.equalBaseIndex(BasePtr));
+ };
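+ // For example, if St is a non-volatile i32 store of the constant 42,
+ // CandidateMatch accepts another non-volatile, unindexed i32 store of a
+ // ConstantSDNode or ConstantFPSDNode (constant floats can be stored as
+ // equivalent integers) with a matching base pointer, and rejects stores
+ // whose value is loaded or extracted.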
+ // We are looking for a root node that is an ancestor of all mergeable
+ // stores. We search up through a load, to our root, and then down
+ // through all children. For instance, we will find Store{1,2,3} if
+ // St is Store1, Store2 or Store3 and the root is not a load, which is
+ // always true for non-volatile ops. TODO: Expand the search to find
+ // all valid candidates through multiple layers of loads.
+ //
+ // Root
+ // |-------|-------|
+ // Load Load Store3
+ // | |
+ // Store1 Store2
+ //
+ // FIXME: We should be able to climb and
+ // descend TokenFactors to find candidates as well.
+
+ SDNode *RootNode = (St->getChain()).getNode();
+
+ if (LoadSDNode *Ldn = dyn_cast<LoadSDNode>(RootNode)) {
+ RootNode = Ldn->getChain().getNode();
+ for (auto I = RootNode->use_begin(), E = RootNode->use_end(); I != E; ++I)
+ if (I.getOperandNo() == 0 && isa<LoadSDNode>(*I)) // walk down chain
+ for (auto I2 = (*I)->use_begin(), E2 = (*I)->use_end(); I2 != E2; ++I2)
+ if (I2.getOperandNo() == 0)
+ if (StoreSDNode *OtherST = dyn_cast<StoreSDNode>(*I2)) {
+ BaseIndexOffset Ptr;
+ if (CandidateMatch(OtherST, Ptr))
+ StoreNodes.push_back(MemOpLink(OtherST, Ptr.Offset));
+ }
+ } else
+ for (auto I = RootNode->use_begin(), E = RootNode->use_end(); I != E; ++I)
+ if (I.getOperandNo() == 0)
+ if (StoreSDNode *OtherST = dyn_cast<StoreSDNode>(*I)) {
+ BaseIndexOffset Ptr;
+ if (CandidateMatch(OtherST, Ptr))
+ StoreNodes.push_back(MemOpLink(OtherST, Ptr.Offset));
}
-
- // Save the load node for later. Continue the scan.
- AliasLoadNodes.push_back(Ldn);
- NextInChain = Ldn->getChain().getNode();
- continue;
- } else {
- Index = nullptr;
- break;
- }
- }
- }
}
// We need to check that merging these stores does not cause a loop
@@ -11678,31 +12304,34 @@ void DAGCombiner::getStoreMergeAndAliasCandidates(
// through the chain). Check in parallel by searching up from
// non-chain operands of candidates.
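// For instance, if one candidate's stored value comes from a load whose
// chain passes through another candidate store, replacing both stores with
// a single merged store would make that store a predecessor of itself.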
bool DAGCombiner::checkMergeStoreCandidatesForDependencies(
- SmallVectorImpl<MemOpLink> &StoreNodes) {
+ SmallVectorImpl<MemOpLink> &StoreNodes, unsigned NumStores) {
SmallPtrSet<const SDNode *, 16> Visited;
SmallVector<const SDNode *, 8> Worklist;
// search ops of store candidates
- for (unsigned i = 0; i < StoreNodes.size(); ++i) {
+ for (unsigned i = 0; i < NumStores; ++i) {
SDNode *n = StoreNodes[i].MemNode;
// Potential loops may happen only through non-chain operands
for (unsigned j = 1; j < n->getNumOperands(); ++j)
Worklist.push_back(n->getOperand(j).getNode());
}
// search through DAG. We can stop early if we find a storenode
- for (unsigned i = 0; i < StoreNodes.size(); ++i) {
+ for (unsigned i = 0; i < NumStores; ++i) {
if (SDNode::hasPredecessorHelper(StoreNodes[i].MemNode, Visited, Worklist))
return false;
}
return true;
}
-bool DAGCombiner::MergeConsecutiveStores(
- StoreSDNode* St, SmallVectorImpl<MemOpLink> &StoreNodes) {
+bool DAGCombiner::MergeConsecutiveStores(StoreSDNode *St) {
if (OptLevel == CodeGenOpt::None)
return false;
EVT MemVT = St->getMemoryVT();
int64_t ElementSizeBytes = MemVT.getSizeInBits() / 8;
+
+ if (MemVT.getSizeInBits() * 2 > MaximumLegalStoreInBits)
+ return false;
+
bool NoVectors = DAG.getMachineFunction().getFunction()->hasFnAttribute(
Attribute::NoImplicitFloat);
@@ -11731,145 +12360,137 @@ bool DAGCombiner::MergeConsecutiveStores(
if (MemVT.isVector() && IsLoadSrc)
return false;
- // Only look at ends of store sequences.
- SDValue Chain = SDValue(St, 0);
- if (Chain->hasOneUse() && Chain->use_begin()->getOpcode() == ISD::STORE)
- return false;
-
- // Save the LoadSDNodes that we find in the chain.
- // We need to make sure that these nodes do not interfere with
- // any of the store nodes.
- SmallVector<LSBaseSDNode*, 8> AliasLoadNodes;
-
- getStoreMergeAndAliasCandidates(St, StoreNodes, AliasLoadNodes);
+ SmallVector<MemOpLink, 8> StoreNodes;
+ // Find potential store merge candidates by searching through chain sub-DAG
+ getStoreMergeCandidates(St, StoreNodes);
// Check if there is anything to merge.
if (StoreNodes.size() < 2)
return false;
- // only do dependence check in AA case
- bool UseAA = CombinerAA.getNumOccurrences() > 0 ? CombinerAA
- : DAG.getSubtarget().useAA();
- if (UseAA && !checkMergeStoreCandidatesForDependencies(StoreNodes))
- return false;
-
// Sort the memory operands according to their distance from the
- // base pointer. As a secondary criteria: make sure stores coming
- // later in the code come first in the list. This is important for
- // the non-UseAA case, because we're merging stores into the FINAL
- // store along a chain which potentially contains aliasing stores.
- // Thus, if there are multiple stores to the same address, the last
- // one can be considered for merging but not the others.
+ // base pointer.
std::sort(StoreNodes.begin(), StoreNodes.end(),
[](MemOpLink LHS, MemOpLink RHS) {
- return LHS.OffsetFromBase < RHS.OffsetFromBase ||
- (LHS.OffsetFromBase == RHS.OffsetFromBase &&
- LHS.SequenceNum < RHS.SequenceNum);
- });
+ return LHS.OffsetFromBase < RHS.OffsetFromBase;
+ });
// Scan the memory operations on the chain and find the first non-consecutive
// store memory address.
- unsigned LastConsecutiveStore = 0;
+ unsigned NumConsecutiveStores = 0;
int64_t StartAddress = StoreNodes[0].OffsetFromBase;
- for (unsigned i = 0, e = StoreNodes.size(); i < e; ++i) {
- // Check that the addresses are consecutive starting from the second
- // element in the list of stores.
- if (i > 0) {
- int64_t CurrAddress = StoreNodes[i].OffsetFromBase;
- if (CurrAddress - StartAddress != (ElementSizeBytes * i))
- break;
- }
-
- // Check if this store interferes with any of the loads that we found.
- // If we find a load that alias with this store. Stop the sequence.
- if (any_of(AliasLoadNodes, [&](LSBaseSDNode *Ldn) {
- return isAlias(Ldn, StoreNodes[i].MemNode);
- }))
+ // Check that the addresses are consecutive starting from the second
+ // element in the list of stores.
+ for (unsigned i = 1, e = StoreNodes.size(); i < e; ++i) {
+ int64_t CurrAddress = StoreNodes[i].OffsetFromBase;
+ if (CurrAddress - StartAddress != (ElementSizeBytes * i))
break;
-
- // Mark this node as useful.
- LastConsecutiveStore = i;
+ NumConsecutiveStores = i + 1;
}
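+ // E.g. with 4-byte elements at offsets 0, 4, 8 and 16 from the base,
+ // the scan keeps the first three stores and stops at the gap before 16.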
+ if (NumConsecutiveStores < 2)
+ return false;
+
+ // Check that we can merge these candidates without causing a cycle
+ if (!checkMergeStoreCandidatesForDependencies(StoreNodes, NumConsecutiveStores))
+ return false;
+
+
// The node with the lowest store address.
- LSBaseSDNode *FirstInChain = StoreNodes[0].MemNode;
- unsigned FirstStoreAS = FirstInChain->getAddressSpace();
- unsigned FirstStoreAlign = FirstInChain->getAlignment();
LLVMContext &Context = *DAG.getContext();
const DataLayout &DL = DAG.getDataLayout();
// Store the constants into memory as one consecutive store.
if (IsConstantSrc) {
- unsigned LastLegalType = 0;
- unsigned LastLegalVectorType = 0;
- bool NonZero = false;
- for (unsigned i=0; i<LastConsecutiveStore+1; ++i) {
- StoreSDNode *St = cast<StoreSDNode>(StoreNodes[i].MemNode);
- SDValue StoredVal = St->getValue();
-
- if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(StoredVal)) {
- NonZero |= !C->isNullValue();
- } else if (ConstantFPSDNode *C = dyn_cast<ConstantFPSDNode>(StoredVal)) {
- NonZero |= !C->getConstantFPValue()->isNullValue();
- } else {
- // Non-constant.
- break;
- }
+ bool RV = false;
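+ // Merge as many leading consecutive stores as the legal types allow,
+ // then retry on the remainder. E.g. on a target whose widest legal
+ // integer type is i32, eight consecutive one-byte constant stores can
+ // be merged into two i32 stores over two iterations of this loop.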
+ while (NumConsecutiveStores > 1) {
+ LSBaseSDNode *FirstInChain = StoreNodes[0].MemNode;
+ unsigned FirstStoreAS = FirstInChain->getAddressSpace();
+ unsigned FirstStoreAlign = FirstInChain->getAlignment();
+ unsigned LastLegalType = 0;
+ unsigned LastLegalVectorType = 0;
+ bool NonZero = false;
+ for (unsigned i = 0; i < NumConsecutiveStores; ++i) {
+ StoreSDNode *ST = cast<StoreSDNode>(StoreNodes[i].MemNode);
+ SDValue StoredVal = ST->getValue();
+
+ if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(StoredVal)) {
+ NonZero |= !C->isNullValue();
+ } else if (ConstantFPSDNode *C =
+ dyn_cast<ConstantFPSDNode>(StoredVal)) {
+ NonZero |= !C->getConstantFPValue()->isNullValue();
+ } else {
+ // Non-constant.
+ break;
+ }
- // Find a legal type for the constant store.
- unsigned SizeInBits = (i+1) * ElementSizeBytes * 8;
- EVT StoreTy = EVT::getIntegerVT(Context, SizeInBits);
- bool IsFast;
- if (TLI.isTypeLegal(StoreTy) &&
- TLI.allowsMemoryAccess(Context, DL, StoreTy, FirstStoreAS,
- FirstStoreAlign, &IsFast) && IsFast) {
- LastLegalType = i+1;
- // Or check whether a truncstore is legal.
- } else if (TLI.getTypeAction(Context, StoreTy) ==
- TargetLowering::TypePromoteInteger) {
- EVT LegalizedStoredValueTy =
- TLI.getTypeToTransformTo(Context, StoredVal.getValueType());
- if (TLI.isTruncStoreLegal(LegalizedStoredValueTy, StoreTy) &&
- TLI.allowsMemoryAccess(Context, DL, LegalizedStoredValueTy,
- FirstStoreAS, FirstStoreAlign, &IsFast) &&
+ // Find a legal type for the constant store.
+ unsigned SizeInBits = (i + 1) * ElementSizeBytes * 8;
+ EVT StoreTy = EVT::getIntegerVT(Context, SizeInBits);
+ bool IsFast = false;
+ if (TLI.isTypeLegal(StoreTy) &&
+ TLI.allowsMemoryAccess(Context, DL, StoreTy, FirstStoreAS,
+ FirstStoreAlign, &IsFast) &&
IsFast) {
LastLegalType = i + 1;
+ // Or check whether a truncstore is legal.
+ } else if (TLI.getTypeAction(Context, StoreTy) ==
+ TargetLowering::TypePromoteInteger) {
+ EVT LegalizedStoredValueTy =
+ TLI.getTypeToTransformTo(Context, StoredVal.getValueType());
+ if (TLI.isTruncStoreLegal(LegalizedStoredValueTy, StoreTy) &&
+ TLI.allowsMemoryAccess(Context, DL, LegalizedStoredValueTy,
+ FirstStoreAS, FirstStoreAlign, &IsFast) &&
+ IsFast) {
+ LastLegalType = i + 1;
+ }
}
- }
- // We only use vectors if the constant is known to be zero or the target
- // allows it and the function is not marked with the noimplicitfloat
- // attribute.
- if ((!NonZero || TLI.storeOfVectorConstantIsCheap(MemVT, i+1,
- FirstStoreAS)) &&
- !NoVectors) {
- // Find a legal type for the vector store.
- EVT Ty = EVT::getVectorVT(Context, MemVT, i+1);
- if (TLI.isTypeLegal(Ty) &&
- TLI.allowsMemoryAccess(Context, DL, Ty, FirstStoreAS,
- FirstStoreAlign, &IsFast) && IsFast)
- LastLegalVectorType = i + 1;
+ // We only use vectors if the constant is known to be zero or the target
+ // allows it and the function is not marked with the noimplicitfloat
+ // attribute.
+ if ((!NonZero ||
+ TLI.storeOfVectorConstantIsCheap(MemVT, i + 1, FirstStoreAS)) &&
+ !NoVectors) {
+ // Find a legal type for the vector store.
+ EVT Ty = EVT::getVectorVT(Context, MemVT, i + 1);
+ if (TLI.isTypeLegal(Ty) && TLI.canMergeStoresTo(Ty) &&
+ TLI.allowsMemoryAccess(Context, DL, Ty, FirstStoreAS,
+ FirstStoreAlign, &IsFast) &&
+ IsFast)
+ LastLegalVectorType = i + 1;
+ }
}
- }
- // Check if we found a legal integer type to store.
- if (LastLegalType == 0 && LastLegalVectorType == 0)
- return false;
+ // Check if we found a legal integer type that creates a meaningful merge.
+ if (LastLegalType < 2 && LastLegalVectorType < 2)
+ break;
- bool UseVector = (LastLegalVectorType > LastLegalType) && !NoVectors;
- unsigned NumElem = UseVector ? LastLegalVectorType : LastLegalType;
+ bool UseVector = (LastLegalVectorType > LastLegalType) && !NoVectors;
+ unsigned NumElem = (UseVector) ? LastLegalVectorType : LastLegalType;
- return MergeStoresOfConstantsOrVecElts(StoreNodes, MemVT, NumElem,
- true, UseVector);
+ bool Merged = MergeStoresOfConstantsOrVecElts(StoreNodes, MemVT, NumElem,
+ true, UseVector);
+ if (!Merged)
+ break;
+ // Remove merged stores for next iteration.
+ StoreNodes.erase(StoreNodes.begin(), StoreNodes.begin() + NumElem);
+ RV = true;
+ NumConsecutiveStores -= NumElem;
+ }
+ return RV;
}
// When extracting multiple vector elements, try to store them
// in one vector store rather than a sequence of scalar stores.
if (IsExtractVecSrc) {
+ LSBaseSDNode *FirstInChain = StoreNodes[0].MemNode;
+ unsigned FirstStoreAS = FirstInChain->getAddressSpace();
+ unsigned FirstStoreAlign = FirstInChain->getAlignment();
unsigned NumStoresToMerge = 0;
bool IsVec = MemVT.isVector();
- for (unsigned i = 0; i < LastConsecutiveStore + 1; ++i) {
+ for (unsigned i = 0; i < NumConsecutiveStores; ++i) {
StoreSDNode *St = cast<StoreSDNode>(StoreNodes[i].MemNode);
unsigned StoreValOpcode = St->getValue().getOpcode();
// This restriction could be loosened.
@@ -11909,7 +12530,7 @@ bool DAGCombiner::MergeConsecutiveStores(
// Find acceptable loads. Loads need to have the same chain (token factor),
// must not be zext, volatile, indexed, and they must be consecutive.
BaseIndexOffset LdBasePtr;
- for (unsigned i=0; i<LastConsecutiveStore+1; ++i) {
+ for (unsigned i = 0; i < NumConsecutiveStores; ++i) {
StoreSDNode *St = cast<StoreSDNode>(StoreNodes[i].MemNode);
LoadSDNode *Ld = dyn_cast<LoadSDNode>(St->getValue());
if (!Ld) break;
@@ -11942,7 +12563,7 @@ bool DAGCombiner::MergeConsecutiveStores(
}
// We found a potential memory operand to merge.
- LoadNodes.push_back(MemOpLink(Ld, LdPtr.Offset, 0));
+ LoadNodes.push_back(MemOpLink(Ld, LdPtr.Offset));
}
if (LoadNodes.size() < 2)
@@ -11954,7 +12575,9 @@ bool DAGCombiner::MergeConsecutiveStores(
if (LoadNodes.size() == 2 && TLI.hasPairedLoad(MemVT, RequiredAlignment) &&
St->getAlignment() >= RequiredAlignment)
return false;
-
+ LSBaseSDNode *FirstInChain = StoreNodes[0].MemNode;
+ unsigned FirstStoreAS = FirstInChain->getAddressSpace();
+ unsigned FirstStoreAlign = FirstInChain->getAlignment();
LoadSDNode *FirstLoad = cast<LoadSDNode>(LoadNodes[0].MemNode);
unsigned FirstLoadAS = FirstLoad->getAddressSpace();
unsigned FirstLoadAlign = FirstLoad->getAlignment();
@@ -12023,31 +12646,12 @@ bool DAGCombiner::MergeConsecutiveStores(
// We add +1 here because the LastXXX variables refer to location while
// the NumElem refers to array/index size.
- unsigned NumElem = std::min(LastConsecutiveStore, LastConsecutiveLoad) + 1;
+ unsigned NumElem = std::min(NumConsecutiveStores, LastConsecutiveLoad + 1);
NumElem = std::min(LastLegalType, NumElem);
if (NumElem < 2)
return false;
- // Collect the chains from all merged stores.
- SmallVector<SDValue, 8> MergeStoreChains;
- MergeStoreChains.push_back(StoreNodes[0].MemNode->getChain());
-
- // The latest Node in the DAG.
- unsigned LatestNodeUsed = 0;
- for (unsigned i=1; i<NumElem; ++i) {
- // Find a chain for the new wide-store operand. Notice that some
- // of the store nodes that we found may not be selected for inclusion
- // in the wide store. The chain we use needs to be the chain of the
- // latest store node which is *used* and replaced by the wide store.
- if (StoreNodes[i].SequenceNum < StoreNodes[LatestNodeUsed].SequenceNum)
- LatestNodeUsed = i;
-
- MergeStoreChains.push_back(StoreNodes[i].MemNode->getChain());
- }
-
- LSBaseSDNode *LatestOp = StoreNodes[LatestNodeUsed].MemNode;
-
// Find if it is better to use vectors or integers to load and store
// to memory.
EVT JointMemOpVT;
@@ -12067,8 +12671,9 @@ bool DAGCombiner::MergeConsecutiveStores(
FirstLoad->getBasePtr(),
FirstLoad->getPointerInfo(), FirstLoadAlign);
- SDValue NewStoreChain =
- DAG.getNode(ISD::TokenFactor, StoreDL, MVT::Other, MergeStoreChains);
+ SDValue NewStoreChain = getMergeStoreChains(StoreNodes, NumElem);
+
+ AddToWorklist(NewStoreChain.getNode());
SDValue NewStore =
DAG.getStore(NewStoreChain, StoreDL, NewLoad, FirstInChain->getBasePtr(),
@@ -12081,25 +12686,9 @@ bool DAGCombiner::MergeConsecutiveStores(
SDValue(NewLoad.getNode(), 1));
}
- if (UseAA) {
- // Replace the all stores with the new store.
- for (unsigned i = 0; i < NumElem; ++i)
- CombineTo(StoreNodes[i].MemNode, NewStore);
- } else {
- // Replace the last store with the new store.
- CombineTo(LatestOp, NewStore);
- // Erase all other stores.
- for (unsigned i = 0; i < NumElem; ++i) {
- // Remove all Store nodes.
- if (StoreNodes[i].MemNode == LatestOp)
- continue;
- StoreSDNode *St = cast<StoreSDNode>(StoreNodes[i].MemNode);
- DAG.ReplaceAllUsesOfValueWith(SDValue(St, 0), St->getChain());
- deleteAndRecombine(St);
- }
- }
-
- StoreNodes.erase(StoreNodes.begin() + NumElem, StoreNodes.end());
+ // Replace all of the merged stores with the new store.
+ for (unsigned i = 0; i < NumElem; ++i)
+ CombineTo(StoreNodes[i].MemNode, NewStore);
return true;
}
@@ -12256,19 +12845,7 @@ SDValue DAGCombiner::visitSTORE(SDNode *N) {
if (SDValue NewST = TransformFPLoadStorePair(N))
return NewST;
- bool UseAA = CombinerAA.getNumOccurrences() > 0 ? CombinerAA
- : DAG.getSubtarget().useAA();
-#ifndef NDEBUG
- if (CombinerAAOnlyFunc.getNumOccurrences() &&
- CombinerAAOnlyFunc != DAG.getMachineFunction().getName())
- UseAA = false;
-#endif
- if (UseAA && ST->isUnindexed()) {
- // FIXME: We should do this even without AA enabled. AA will just allow
- // FindBetterChain to work in more situations. The problem with this is that
- // any combine that expects memory operations to be on consecutive chains
- // first needs to be updated to look for users of the same chain.
-
+ if (ST->isUnindexed()) {
// Walk up chain skipping non-aliasing memory nodes, on this store and any
// adjacent stores.
if (findBetterNeighborChains(ST)) {
@@ -12302,8 +12879,15 @@ SDValue DAGCombiner::visitSTORE(SDNode *N) {
if (SimplifyDemandedBits(
Value,
APInt::getLowBitsSet(Value.getScalarValueSizeInBits(),
- ST->getMemoryVT().getScalarSizeInBits())))
+ ST->getMemoryVT().getScalarSizeInBits()))) {
+ // Re-visit the store if anything changed and the store hasn't been merged
+ // with another node (in which case N is deleted). SimplifyDemandedBits
+ // will add Value's node back to the worklist if necessary, but we also
+ // need to re-visit the Store node itself.
+ if (N->getOpcode() != ISD::DELETED_NODE)
+ AddToWorklist(N);
return SDValue(N, 0);
+ }
}
// If this is a load followed by a store to the same location, then the store
@@ -12347,15 +12931,12 @@ SDValue DAGCombiner::visitSTORE(SDNode *N) {
// There can be multiple store sequences on the same chain.
// Keep trying to merge store sequences until we are unable to do so
// or until we merge the last store on the chain.
- SmallVector<MemOpLink, 8> StoreNodes;
- bool Changed = MergeConsecutiveStores(ST, StoreNodes);
+ bool Changed = MergeConsecutiveStores(ST);
if (!Changed) break;
-
- if (any_of(StoreNodes,
- [ST](const MemOpLink &Link) { return Link.MemNode == ST; })) {
- // ST has been merged and no longer exists.
+ // Return N; the merge only uses CombineTo, so no further worklist
+ // cleanup is necessary.
+ if (N->getOpcode() == ISD::DELETED_NODE || !isa<StoreSDNode>(N))
return SDValue(N, 0);
- }
}
}
@@ -12364,7 +12945,7 @@ SDValue DAGCombiner::visitSTORE(SDNode *N) {
// Make sure to do this only after attempting to merge stores in order to
// avoid changing the types of some subset of stores due to visit order,
// preventing their merging.
- if (isa<ConstantFPSDNode>(Value)) {
+ if (isa<ConstantFPSDNode>(ST->getValue())) {
if (SDValue NewSt = replaceStoreOfFPConstant(ST))
return NewSt;
}
@@ -12493,10 +13074,6 @@ SDValue DAGCombiner::visitINSERT_VECTOR_ELT(SDNode *N) {
EVT VT = InVec.getValueType();
- // If we can't generate a legal BUILD_VECTOR, exit
- if (LegalOperations && !TLI.isOperationLegal(ISD::BUILD_VECTOR, VT))
- return SDValue();
-
// Check that we know which element is being inserted
if (!isa<ConstantSDNode>(EltNo))
return SDValue();
@@ -12523,6 +13100,10 @@ SDValue DAGCombiner::visitINSERT_VECTOR_ELT(SDNode *N) {
}
}
+ // If we can't generate a legal BUILD_VECTOR, exit
+ if (LegalOperations && !TLI.isOperationLegal(ISD::BUILD_VECTOR, VT))
+ return SDValue();
+
// Check that the operand is a BUILD_VECTOR (or UNDEF, which can essentially
// be converted to a BUILD_VECTOR). Fill in the Ops vector with the
// vector elements.
@@ -12544,11 +13125,7 @@ SDValue DAGCombiner::visitINSERT_VECTOR_ELT(SDNode *N) {
// All the operands of BUILD_VECTOR must have the same type;
// we enforce that here.
EVT OpVT = Ops[0].getValueType();
- if (InVal.getValueType() != OpVT)
- InVal = OpVT.bitsGT(InVal.getValueType()) ?
- DAG.getNode(ISD::ANY_EXTEND, DL, OpVT, InVal) :
- DAG.getNode(ISD::TRUNCATE, DL, OpVT, InVal);
- Ops[Elt] = InVal;
+ Ops[Elt] = OpVT.isInteger() ? DAG.getAnyExtOrTrunc(InVal, DL, OpVT) : InVal;
}
// Return the new vector
@@ -12568,6 +13145,11 @@ SDValue DAGCombiner::ReplaceExtractVectorEltOfLoadWithNarrowedLoad(
if (NewAlign > Align || !TLI.isOperationLegalOrCustom(ISD::LOAD, VecEltVT))
return SDValue();
+ ISD::LoadExtType ExtTy = ResultVT.bitsGT(VecEltVT) ?
+ ISD::NON_EXTLOAD : ISD::EXTLOAD;
+ if (!TLI.shouldReduceLoadWidth(OriginalLoad, ExtTy, VecEltVT))
+ return SDValue();
+
Align = NewAlign;
SDValue NewPtr = OriginalLoad->getBasePtr();
@@ -12639,6 +13221,9 @@ SDValue DAGCombiner::visitEXTRACT_VECTOR_ELT(SDNode *N) {
EVT VT = InVec.getValueType();
EVT NVT = N->getValueType(0);
+ if (InVec.isUndef())
+ return DAG.getUNDEF(NVT);
+
if (InVec.getOpcode() == ISD::SCALAR_TO_VECTOR) {
// Check if the result type doesn't match the inserted element type. A
// SCALAR_TO_VECTOR may truncate the inserted element and the
@@ -13022,7 +13607,7 @@ SDValue DAGCombiner::reduceBuildVecConvertToConvertBuildVec(SDNode *N) {
return DAG.getNode(Opcode, DL, VT, BV);
}
-SDValue DAGCombiner::createBuildVecShuffle(SDLoc DL, SDNode *N,
+SDValue DAGCombiner::createBuildVecShuffle(const SDLoc &DL, SDNode *N,
ArrayRef<int> VectorMask,
SDValue VecIn1, SDValue VecIn2,
unsigned LeftIdx) {
@@ -13300,6 +13885,35 @@ SDValue DAGCombiner::visitBUILD_VECTOR(SDNode *N) {
if (ISD::allOperandsUndef(N))
return DAG.getUNDEF(VT);
+ // Check if we can express the BUILD_VECTOR via a subvector extract.
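+ // E.g. (v2i32 build_vector (extract_vector_elt V, 2),
+ // (extract_vector_elt V, 3)) with V : v4i32 becomes
+ // (v2i32 extract_subvector V, 2), and an in-order rebuild of all of V's
+ // elements folds to V itself.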
+ if (!LegalTypes && (N->getNumOperands() > 1)) {
+ SDValue Op0 = N->getOperand(0);
+ auto checkElem = [&](SDValue Op) -> uint64_t {
+ if ((Op.getOpcode() == ISD::EXTRACT_VECTOR_ELT) &&
+ (Op0.getOperand(0) == Op.getOperand(0)))
+ if (auto CNode = dyn_cast<ConstantSDNode>(Op.getOperand(1)))
+ return CNode->getZExtValue();
+ return -1;
+ };
+
+ int Offset = checkElem(Op0);
+ for (unsigned i = 0; i < N->getNumOperands(); ++i) {
+ if (Offset + i != checkElem(N->getOperand(i))) {
+ Offset = -1;
+ break;
+ }
+ }
+
+ if ((Offset == 0) &&
+ (Op0.getOperand(0).getValueType() == N->getValueType(0)))
+ return Op0.getOperand(0);
+ if ((Offset != -1) &&
+ ((Offset % N->getValueType(0).getVectorNumElements()) ==
+ 0)) // IDX must be multiple of output size.
+ return DAG.getNode(ISD::EXTRACT_SUBVECTOR, SDLoc(N), N->getValueType(0),
+ Op0.getOperand(0), Op0.getOperand(1));
+ }
+
if (SDValue V = reduceBuildVecExtToExtBuildVec(N))
return V;
@@ -13491,8 +14105,11 @@ SDValue DAGCombiner::visitCONCAT_VECTORS(SDNode *N) {
if (!SclTy.isFloatingPoint() && !SclTy.isInteger())
return SDValue();
- EVT NVT = EVT::getVectorVT(*DAG.getContext(), SclTy,
- VT.getSizeInBits() / SclTy.getSizeInBits());
+ unsigned VNTNumElms = VT.getSizeInBits() / SclTy.getSizeInBits();
+ if (VNTNumElms < 2)
+ return SDValue();
+
+ EVT NVT = EVT::getVectorVT(*DAG.getContext(), SclTy, VNTNumElms);
if (!TLI.isTypeLegal(NVT) || !TLI.isTypeLegal(Scalar.getValueType()))
return SDValue();
@@ -13611,15 +14228,19 @@ SDValue DAGCombiner::visitEXTRACT_SUBVECTOR(SDNode* N) {
EVT NVT = N->getValueType(0);
SDValue V = N->getOperand(0);
- if (V->getOpcode() == ISD::CONCAT_VECTORS) {
- // Combine:
- // (extract_subvec (concat V1, V2, ...), i)
- // Into:
- // Vi if possible
- // Only operand 0 is checked as 'concat' assumes all inputs of the same
- // type.
- if (V->getOperand(0).getValueType() != NVT)
- return SDValue();
+ // Extract from UNDEF is UNDEF.
+ if (V.isUndef())
+ return DAG.getUNDEF(NVT);
+
+ // Combine:
+ // (extract_subvec (concat V1, V2, ...), i)
+ // Into:
+ // Vi if possible
+ // Only operand 0 is checked as 'concat' assumes all inputs of the same
+ // type.
+ if (V->getOpcode() == ISD::CONCAT_VECTORS &&
+ isa<ConstantSDNode>(N->getOperand(1)) &&
+ V->getOperand(0).getValueType() == NVT) {
unsigned Idx = N->getConstantOperandVal(1);
unsigned NumElems = NVT.getVectorNumElements();
assert((Idx % NumElems) == 0 &&
@@ -13633,19 +14254,16 @@ SDValue DAGCombiner::visitEXTRACT_SUBVECTOR(SDNode* N) {
if (V->getOpcode() == ISD::INSERT_SUBVECTOR) {
// Handle only simple case where vector being inserted and vector
- // being extracted are of same type, and are half size of larger vectors.
- EVT BigVT = V->getOperand(0).getValueType();
+ // being extracted are of same size.
EVT SmallVT = V->getOperand(1).getValueType();
- if (!NVT.bitsEq(SmallVT) || NVT.getSizeInBits()*2 != BigVT.getSizeInBits())
+ if (!NVT.bitsEq(SmallVT))
return SDValue();
- // Only handle cases where both indexes are constants with the same type.
+ // Only handle cases where both indexes are constants.
ConstantSDNode *ExtIdx = dyn_cast<ConstantSDNode>(N->getOperand(1));
ConstantSDNode *InsIdx = dyn_cast<ConstantSDNode>(V->getOperand(2));
- if (InsIdx && ExtIdx &&
- InsIdx->getValueType(0).getSizeInBits() <= 64 &&
- ExtIdx->getValueType(0).getSizeInBits() <= 64) {
+ if (InsIdx && ExtIdx) {
// Combine:
// (extract_subvec (insert_subvec V1, V2, InsIdx), ExtIdx)
// Into:
@@ -13892,6 +14510,113 @@ static SDValue combineShuffleOfScalars(ShuffleVectorSDNode *SVN,
return DAG.getBuildVector(VT, SDLoc(SVN), Ops);
}
+// Match shuffles that can be converted to any_vector_extend_in_reg.
+// This is often generated during legalization.
+// e.g. v4i32 <0,u,1,u> -> (v2i64 any_vector_extend_in_reg(v4i32 src))
+// TODO Add support for ZERO_EXTEND_VECTOR_INREG when we have a test case.
+SDValue combineShuffleToVectorExtend(ShuffleVectorSDNode *SVN,
+ SelectionDAG &DAG,
+ const TargetLowering &TLI,
+ bool LegalOperations) {
+ EVT VT = SVN->getValueType(0);
+ bool IsBigEndian = DAG.getDataLayout().isBigEndian();
+
+ // TODO Add support for big-endian when we have a test case.
+ if (!VT.isInteger() || IsBigEndian)
+ return SDValue();
+
+ unsigned NumElts = VT.getVectorNumElements();
+ unsigned EltSizeInBits = VT.getScalarSizeInBits();
+ ArrayRef<int> Mask = SVN->getMask();
+ SDValue N0 = SVN->getOperand(0);
+
+ // shuffle<0,-1,1,-1> == (v2i64 anyextend_vector_inreg(v4i32))
+ auto isAnyExtend = [&Mask, &NumElts](unsigned Scale) {
+ for (unsigned i = 0; i != NumElts; ++i) {
+ if (Mask[i] < 0)
+ continue;
+ if ((i % Scale) == 0 && Mask[i] == (int)(i / Scale))
+ continue;
+ return false;
+ }
+ return true;
+ };
+
+ // Attempt to match a '*_extend_vector_inreg' shuffle; we just search for
+ // power-of-2 extensions as they are the most likely.
+ for (unsigned Scale = 2; Scale < NumElts; Scale *= 2) {
+ if (!isAnyExtend(Scale))
+ continue;
+
+ EVT OutSVT = EVT::getIntegerVT(*DAG.getContext(), EltSizeInBits * Scale);
+ EVT OutVT = EVT::getVectorVT(*DAG.getContext(), OutSVT, NumElts / Scale);
+ if (!LegalOperations ||
+ TLI.isOperationLegalOrCustom(ISD::ANY_EXTEND_VECTOR_INREG, OutVT))
+ return DAG.getBitcast(VT,
+ DAG.getAnyExtendVectorInReg(N0, SDLoc(SVN), OutVT));
+ }
+
+ return SDValue();
+}
+
+// Detect 'truncate_vector_inreg' style shuffles that pack the lower parts of
+// each source element of a large type into the lowest elements of a smaller
+// destination type. This is often generated during legalization.
+// If the source node itself was a '*_extend_vector_inreg' node then we
+// should be able to remove it.
+SDValue combineTruncationShuffle(ShuffleVectorSDNode *SVN, SelectionDAG &DAG) {
+ EVT VT = SVN->getValueType(0);
+ bool IsBigEndian = DAG.getDataLayout().isBigEndian();
+
+ // TODO Add support for big-endian when we have a test case.
+ if (!VT.isInteger() || IsBigEndian)
+ return SDValue();
+
+ SDValue N0 = SVN->getOperand(0);
+ while (N0.getOpcode() == ISD::BITCAST)
+ N0 = N0.getOperand(0);
+
+ unsigned Opcode = N0.getOpcode();
+ if (Opcode != ISD::ANY_EXTEND_VECTOR_INREG &&
+ Opcode != ISD::SIGN_EXTEND_VECTOR_INREG &&
+ Opcode != ISD::ZERO_EXTEND_VECTOR_INREG)
+ return SDValue();
+
+ SDValue N00 = N0.getOperand(0);
+ ArrayRef<int> Mask = SVN->getMask();
+ unsigned NumElts = VT.getVectorNumElements();
+ unsigned EltSizeInBits = VT.getScalarSizeInBits();
+ unsigned ExtSrcSizeInBits = N00.getScalarValueSizeInBits();
+
+ // (v4i32 truncate_vector_inreg(v2i64)) == shuffle<0,2,-1,-1>
+ // (v8i16 truncate_vector_inreg(v4i32)) == shuffle<0,2,4,6,-1,-1,-1,-1>
+ // (v8i16 truncate_vector_inreg(v2i64)) == shuffle<0,4,-1,-1,-1,-1,-1,-1>
+ auto isTruncate = [&Mask, &NumElts](unsigned Scale) {
+ for (unsigned i = 0; i != NumElts; ++i) {
+ if (Mask[i] < 0)
+ continue;
+ if ((i * Scale) < NumElts && Mask[i] == (int)(i * Scale))
+ continue;
+ return false;
+ }
+ return true;
+ };
+
+ // At the moment we just handle the case where we've truncated back to the
+ // same size as before the extension.
+ // TODO: handle more extension/truncation cases as cases arise.
+ if (EltSizeInBits != ExtSrcSizeInBits)
+ return SDValue();
+
+ // Attempt to match a 'truncate_vector_inreg' shuffle; we just search for
+ // power-of-2 truncations as they are the most likely.
+ for (unsigned Scale = 2; Scale < NumElts; Scale *= 2)
+ if (isTruncate(Scale))
+ return DAG.getBitcast(VT, N00);
+
+ return SDValue();
+}
+
SDValue DAGCombiner::visitVECTOR_SHUFFLE(SDNode *N) {
EVT VT = N->getValueType(0);
unsigned NumElts = VT.getVectorNumElements();
@@ -13996,6 +14721,14 @@ SDValue DAGCombiner::visitVECTOR_SHUFFLE(SDNode *N) {
if (SDValue S = simplifyShuffleOperands(SVN, N0, N1, DAG))
return S;
+ // Match shuffles that can be converted to any_vector_extend_in_reg.
+ if (SDValue V = combineShuffleToVectorExtend(SVN, DAG, TLI, LegalOperations))
+ return V;
+
+ // Combine "truncate_vector_in_reg" style shuffles.
+ if (SDValue V = combineTruncationShuffle(SVN, DAG))
+ return V;
+
if (N0.getOpcode() == ISD::CONCAT_VECTORS &&
Level < AfterLegalizeVectorOps &&
(N1.isUndef() ||
@@ -14253,6 +14986,16 @@ SDValue DAGCombiner::visitINSERT_SUBVECTOR(SDNode *N) {
SDValue N1 = N->getOperand(1);
SDValue N2 = N->getOperand(2);
+ // If inserting an UNDEF, just return the original vector.
+ if (N1.isUndef())
+ return N0;
+
+ // If this is an insert of an extracted vector into an undef vector, we can
+ // just use the input to the extract.
+ if (N0.isUndef() && N1.getOpcode() == ISD::EXTRACT_SUBVECTOR &&
+ N1.getOperand(1) == N2 && N1.getOperand(0).getValueType() == VT)
+ return N1.getOperand(0);
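+ // E.g. (insert_subvector undef:v8i32, (extract_subvector V:v8i32, 4), 4)
+ // folds to V: the defined lanes match V and the remaining lanes are undef.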
+
// Combine INSERT_SUBVECTORs where we are inserting to the same index.
// INSERT_SUBVECTOR( INSERT_SUBVECTOR( Vec, SubOld, Idx ), SubNew, Idx )
// --> INSERT_SUBVECTOR( Vec, SubNew, Idx )
@@ -14262,26 +15005,39 @@ SDValue DAGCombiner::visitINSERT_SUBVECTOR(SDNode *N) {
return DAG.getNode(ISD::INSERT_SUBVECTOR, SDLoc(N), VT, N0.getOperand(0),
N1, N2);
- if (N0.getValueType() != N1.getValueType())
+ if (!isa<ConstantSDNode>(N2))
return SDValue();
+ unsigned InsIdx = cast<ConstantSDNode>(N2)->getZExtValue();
+
+ // Canonicalize insert_subvector dag nodes.
+ // Example:
+ // (insert_subvector (insert_subvector A, B, Idx0), C, Idx1)
+ // -> (insert_subvector (insert_subvector A, C, Idx1), B, Idx0)
+ // when Idx1 < Idx0, so the smaller index ends up innermost.
+ if (N0.getOpcode() == ISD::INSERT_SUBVECTOR && N0.hasOneUse() &&
+ N1.getValueType() == N0.getOperand(1).getValueType() &&
+ isa<ConstantSDNode>(N0.getOperand(2))) {
+ unsigned OtherIdx = cast<ConstantSDNode>(N0.getOperand(2))->getZExtValue();
+ if (InsIdx < OtherIdx) {
+ // Swap nodes.
+ SDValue NewOp = DAG.getNode(ISD::INSERT_SUBVECTOR, SDLoc(N), VT,
+ N0.getOperand(0), N1, N2);
+ AddToWorklist(NewOp.getNode());
+ return DAG.getNode(ISD::INSERT_SUBVECTOR, SDLoc(N0.getNode()),
+ VT, NewOp, N0.getOperand(1), N0.getOperand(2));
+ }
+ }
+
// If the input vector is a concatenation, and the insert replaces
- // one of the halves, we can optimize into a single concat_vectors.
- if (N0.getOpcode() == ISD::CONCAT_VECTORS && N0->getNumOperands() == 2 &&
- N2.getOpcode() == ISD::Constant) {
- APInt InsIdx = cast<ConstantSDNode>(N2)->getAPIntValue();
+ // one of the pieces, we can optimize into a single concat_vectors.
+ if (N0.getOpcode() == ISD::CONCAT_VECTORS && N0.hasOneUse() &&
+ N0.getOperand(0).getValueType() == N1.getValueType()) {
+ unsigned Factor = N1.getValueType().getVectorNumElements();
- // Lower half: fold (insert_subvector (concat_vectors X, Y), Z) ->
- // (concat_vectors Z, Y)
- if (InsIdx == 0)
- return DAG.getNode(ISD::CONCAT_VECTORS, SDLoc(N), VT, N1,
- N0.getOperand(1));
+ SmallVector<SDValue, 8> Ops(N0->op_begin(), N0->op_end());
+ Ops[cast<ConstantSDNode>(N2)->getZExtValue() / Factor] = N1;
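+ // E.g. inserting X:v2f64 at index 4 into (concat_vectors A, B, C, D) of
+ // v2f64 pieces overwrites Ops[4 / 2], giving (concat_vectors A, B, X, D).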
- // Upper half: fold (insert_subvector (concat_vectors X, Y), Z) ->
- // (concat_vectors X, Z)
- if (InsIdx == VT.getVectorNumElements() / 2)
- return DAG.getNode(ISD::CONCAT_VECTORS, SDLoc(N), VT, N0.getOperand(0),
- N1);
+ return DAG.getNode(ISD::CONCAT_VECTORS, SDLoc(N), VT, Ops);
}
return SDValue();
@@ -15257,7 +16013,7 @@ static bool FindBaseOffset(SDValue Ptr, SDValue &Base, int64_t &Offset,
if (Base.getOpcode() == ISD::ADD) {
if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Base.getOperand(1))) {
Base = Base.getOperand(0);
- Offset += C->getZExtValue();
+ Offset += C->getSExtValue();
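+ // Use the sign-extended value so that a negative offset,
+ // e.g. (add %p, -4), folds into Offset correctly.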
}
}
@@ -15454,6 +16210,12 @@ void DAGCombiner::GatherAllAliases(SDNode *N, SDValue OriginalChain,
++Depth;
break;
+ case ISD::CopyFromReg:
+ // Forward past CopyFromReg.
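+ // It reads a register, not memory, so it cannot alias any store.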
+ Chains.push_back(Chain.getOperand(0));
+ ++Depth;
+ break;
+
default:
// For all other instructions we will just have to take what we can get.
Aliases.push_back(Chain);
@@ -15482,6 +16244,18 @@ SDValue DAGCombiner::FindBetterChain(SDNode *N, SDValue OldChain) {
return DAG.getNode(ISD::TokenFactor, SDLoc(N), MVT::Other, Aliases);
}
+// This function tries to collect a bunch of potentially interesting
+// nodes to improve the chains of, all at once. This might seem
+// redundant, as this function gets called when visiting every store
+// node, so why not let the work be done on each store as it's visited?
+//
+// I believe this is mainly important because MergeConsecutiveStores
+// is unable to deal with merging stores of different sizes, so unless
+// we improve the chains of all the potential candidates up-front
+// before running MergeConsecutiveStores, it might only see some of
+// the nodes that will eventually be candidates, and then not be able
+// to go from a partially-merged state to the desired final
+// fully-merged state.
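+//
+// For example, a group of adjacent stores of mixed widths may share one
+// chain; improving every candidate's chain here, before any merging, lets
+// MergeConsecutiveStores later see the whole group rather than a fragment.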
bool DAGCombiner::findBetterNeighborChains(StoreSDNode *St) {
// This holds the base pointer, index, and the offset in bytes from the base
// pointer.
@@ -15517,10 +16291,8 @@ bool DAGCombiner::findBetterNeighborChains(StoreSDNode *St) {
if (!Ptr.equalBaseIndex(BasePtr))
break;
- // Find the next memory operand in the chain. If the next operand in the
- // chain is a store then move up and continue the scan with the next
- // memory operand. If the next operand is a load save it and use alias
- // information to check if it interferes with anything.
+ // Walk up the chain to find the next store node, ignoring any
+ // intermediate loads. Any other kind of node will halt the loop.
SDNode *NextInChain = Index->getChain().getNode();
while (true) {
if (StoreSDNode *STn = dyn_cast<StoreSDNode>(NextInChain)) {
@@ -15539,9 +16311,14 @@ bool DAGCombiner::findBetterNeighborChains(StoreSDNode *St) {
Index = nullptr;
break;
}
- }
+ } // end while
}
+ // At this point, ChainedStores lists all of the Store nodes
+ // reachable by iterating up through chain nodes matching the above
+ // conditions. For each such store identified, try to find an
+ // earlier chain to attach the store to that won't violate the
+ // required ordering.
bool MadeChangeToSt = false;
SmallVector<std::pair<StoreSDNode *, SDValue>, 8> BetterChains;
diff --git a/lib/CodeGen/SelectionDAG/FastISel.cpp b/lib/CodeGen/SelectionDAG/FastISel.cpp
index e2f33bb433ba..0584ab9f60d1 100644
--- a/lib/CodeGen/SelectionDAG/FastISel.cpp
+++ b/lib/CodeGen/SelectionDAG/FastISel.cpp
@@ -1,4 +1,4 @@
-//===-- FastISel.cpp - Implementation of the FastISel class ---------------===//
+//===- FastISel.cpp - Implementation of the FastISel class ----------------===//
//
// The LLVM Compiler Infrastructure
//
@@ -39,35 +39,76 @@
//
//===----------------------------------------------------------------------===//
+#include "llvm/ADT/APFloat.h"
+#include "llvm/ADT/APSInt.h"
+#include "llvm/ADT/DenseMap.h"
#include "llvm/ADT/Optional.h"
+#include "llvm/ADT/SmallPtrSet.h"
+#include "llvm/ADT/SmallString.h"
+#include "llvm/ADT/SmallVector.h"
#include "llvm/ADT/Statistic.h"
#include "llvm/Analysis/BranchProbabilityInfo.h"
-#include "llvm/Analysis/Loads.h"
#include "llvm/Analysis/TargetLibraryInfo.h"
#include "llvm/CodeGen/Analysis.h"
#include "llvm/CodeGen/FastISel.h"
#include "llvm/CodeGen/FunctionLoweringInfo.h"
+#include "llvm/CodeGen/ISDOpcodes.h"
+#include "llvm/CodeGen/MachineBasicBlock.h"
#include "llvm/CodeGen/MachineFrameInfo.h"
+#include "llvm/CodeGen/MachineInstr.h"
#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/MachineMemOperand.h"
#include "llvm/CodeGen/MachineModuleInfo.h"
+#include "llvm/CodeGen/MachineOperand.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/CodeGen/MachineValueType.h"
#include "llvm/CodeGen/StackMaps.h"
+#include "llvm/CodeGen/ValueTypes.h"
+#include "llvm/IR/Argument.h"
+#include "llvm/IR/Attributes.h"
+#include "llvm/IR/BasicBlock.h"
+#include "llvm/IR/CallSite.h"
+#include "llvm/IR/CallingConv.h"
+#include "llvm/IR/Constant.h"
+#include "llvm/IR/Constants.h"
#include "llvm/IR/DataLayout.h"
#include "llvm/IR/DebugInfo.h"
+#include "llvm/IR/DebugLoc.h"
+#include "llvm/IR/DerivedTypes.h"
#include "llvm/IR/Function.h"
#include "llvm/IR/GetElementPtrTypeIterator.h"
-#include "llvm/IR/GlobalVariable.h"
+#include "llvm/IR/GlobalValue.h"
+#include "llvm/IR/InlineAsm.h"
+#include "llvm/IR/InstrTypes.h"
+#include "llvm/IR/Instruction.h"
#include "llvm/IR/Instructions.h"
#include "llvm/IR/IntrinsicInst.h"
+#include "llvm/IR/LLVMContext.h"
#include "llvm/IR/Mangler.h"
+#include "llvm/IR/Metadata.h"
#include "llvm/IR/Operator.h"
+#include "llvm/IR/Type.h"
+#include "llvm/IR/User.h"
+#include "llvm/IR/Value.h"
+#include "llvm/MC/MCContext.h"
+#include "llvm/MC/MCInstrDesc.h"
+#include "llvm/MC/MCRegisterInfo.h"
+#include "llvm/Support/Casting.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/MathExtras.h"
#include "llvm/Support/raw_ostream.h"
#include "llvm/Target/TargetInstrInfo.h"
#include "llvm/Target/TargetLowering.h"
#include "llvm/Target/TargetMachine.h"
+#include "llvm/Target/TargetOptions.h"
#include "llvm/Target/TargetSubtargetInfo.h"
+#include <algorithm>
+#include <cassert>
+#include <cstdint>
+#include <iterator>
+#include <utility>
+
using namespace llvm;
#define DEBUG_TYPE "isel"
@@ -78,21 +119,6 @@ STATISTIC(NumFastIselSuccessTarget, "Number of insts selected by "
"target-specific selector");
STATISTIC(NumFastIselDead, "Number of dead insts removed on failure");
-void FastISel::ArgListEntry::setAttributes(ImmutableCallSite *CS,
- unsigned AttrIdx) {
- IsSExt = CS->paramHasAttr(AttrIdx, Attribute::SExt);
- IsZExt = CS->paramHasAttr(AttrIdx, Attribute::ZExt);
- IsInReg = CS->paramHasAttr(AttrIdx, Attribute::InReg);
- IsSRet = CS->paramHasAttr(AttrIdx, Attribute::StructRet);
- IsNest = CS->paramHasAttr(AttrIdx, Attribute::Nest);
- IsByVal = CS->paramHasAttr(AttrIdx, Attribute::ByVal);
- IsInAlloca = CS->paramHasAttr(AttrIdx, Attribute::InAlloca);
- IsReturned = CS->paramHasAttr(AttrIdx, Attribute::Returned);
- IsSwiftSelf = CS->paramHasAttr(AttrIdx, Attribute::SwiftSelf);
- IsSwiftError = CS->paramHasAttr(AttrIdx, Attribute::SwiftError);
- Alignment = CS->getParamAlignment(AttrIdx);
-}
-
/// Set the current block to which generated machine instructions will be
/// appended, and clear the local CSE map.
void FastISel::startNewBlock() {
@@ -231,17 +257,13 @@ unsigned FastISel::materializeConstant(const Value *V, MVT VT) {
// Try to emit the constant by using an integer constant with a cast.
const APFloat &Flt = CF->getValueAPF();
EVT IntVT = TLI.getPointerTy(DL);
-
- uint64_t x[2];
uint32_t IntBitWidth = IntVT.getSizeInBits();
+ APSInt SIntVal(IntBitWidth, /*isUnsigned=*/false);
bool isExact;
- (void)Flt.convertToInteger(x, IntBitWidth, /*isSigned=*/true,
- APFloat::rmTowardZero, &isExact);
+ (void)Flt.convertToInteger(SIntVal, APFloat::rmTowardZero, &isExact);
if (isExact) {
- APInt IntVal(IntBitWidth, x);
-
unsigned IntegerReg =
- getRegForValue(ConstantInt::get(V->getContext(), IntVal));
+ getRegForValue(ConstantInt::get(V->getContext(), SIntVal));
if (IntegerReg != 0)
Reg = fastEmit_r(IntVT.getSimpleVT(), VT, ISD::SINT_TO_FP, IntegerReg,
/*Kill=*/false);
@@ -646,7 +668,7 @@ bool FastISel::selectStackmap(const CallInst *I) {
MachineInstrBuilder MIB = BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
TII.get(TargetOpcode::STACKMAP));
for (auto const &MO : Ops)
- MIB.addOperand(MO);
+ MIB.add(MO);
// Issue CALLSEQ_END
unsigned AdjStackUp = TII.getCallFrameDestroyOpcode();
@@ -672,10 +694,8 @@ bool FastISel::lowerCallOperands(const CallInst *CI, unsigned ArgIdx,
Args.reserve(NumArgs);
// Populate the argument list.
- // Attributes for args start at offset 1, after the return attribute.
ImmutableCallSite CS(CI);
- for (unsigned ArgI = ArgIdx, ArgE = ArgIdx + NumArgs, AttrI = ArgIdx + 1;
- ArgI != ArgE; ++ArgI) {
+ for (unsigned ArgI = ArgIdx, ArgE = ArgIdx + NumArgs; ArgI != ArgE; ++ArgI) {
Value *V = CI->getOperand(ArgI);
assert(!V->getType()->isEmptyTy() && "Empty type passed to intrinsic.");
@@ -683,7 +703,7 @@ bool FastISel::lowerCallOperands(const CallInst *CI, unsigned ArgIdx,
ArgListEntry Entry;
Entry.Val = V;
Entry.Ty = V->getType();
- Entry.setAttributes(&CS, AttrI);
+ Entry.setAttributes(&CS, ArgIdx);
Args.push_back(Entry);
}
@@ -826,7 +846,7 @@ bool FastISel::selectPatchpoint(const CallInst *I) {
TII.get(TargetOpcode::PATCHPOINT));
for (auto &MO : Ops)
- MIB.addOperand(MO);
+ MIB.add(MO);
MIB->setPhysRegsDeadExcept(CLI.InRegs, TRI);
@@ -841,9 +861,9 @@ bool FastISel::selectPatchpoint(const CallInst *I) {
return true;
}
-/// Returns an AttributeSet representing the attributes applied to the return
+/// Returns an AttributeList representing the attributes applied to the return
/// value of the given call.
-static AttributeSet getReturnAttrs(FastISel::CallLoweringInfo &CLI) {
+static AttributeList getReturnAttrs(FastISel::CallLoweringInfo &CLI) {
SmallVector<Attribute::AttrKind, 2> Attrs;
if (CLI.RetSExt)
Attrs.push_back(Attribute::SExt);
@@ -852,8 +872,8 @@ static AttributeSet getReturnAttrs(FastISel::CallLoweringInfo &CLI) {
if (CLI.IsInReg)
Attrs.push_back(Attribute::InReg);
- return AttributeSet::get(CLI.RetTy->getContext(), AttributeSet::ReturnIndex,
- Attrs);
+ return AttributeList::get(CLI.RetTy->getContext(), AttributeList::ReturnIndex,
+ Attrs);
}
bool FastISel::lowerCallTo(const CallInst *CI, const char *SymName,
@@ -885,9 +905,10 @@ bool FastISel::lowerCallTo(const CallInst *CI, MCSymbol *Symbol,
ArgListEntry Entry;
Entry.Val = V;
Entry.Ty = V->getType();
- Entry.setAttributes(&CS, ArgI + 1);
+ Entry.setAttributes(&CS, ArgI);
Args.push_back(Entry);
}
+ TLI.markLibCallAttributes(MF, CS.getCallingConv(), Args);
CallLoweringInfo CLI;
CLI.setCallee(RetTy, FTy, Symbol, std::move(Args), CS, NumArgs);
@@ -1021,7 +1042,7 @@ bool FastISel::lowerCall(const CallInst *CI) {
Entry.Ty = V->getType();
// Skip the first return-type Attribute to get to params.
- Entry.setAttributes(&CS, i - CS.arg_begin() + 1);
+ Entry.setAttributes(&CS, i - CS.arg_begin());
Args.push_back(Entry);
}
@@ -1149,7 +1170,7 @@ bool FastISel::selectIntrinsicCall(const IntrinsicInst *II) {
} else
BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
TII.get(TargetOpcode::DBG_VALUE))
- .addOperand(*Op)
+ .add(*Op)
.addImm(0)
.addMetadata(DI->getVariable())
.addMetadata(DI->getExpression());
@@ -1362,7 +1383,7 @@ bool FastISel::selectInstruction(const Instruction *I) {
if (const auto *Call = dyn_cast<CallInst>(I)) {
const Function *F = Call->getCalledFunction();
- LibFunc::Func Func;
+ LibFunc Func;
// As a special case, don't handle calls to builtin library functions that
// may be translated directly to target instructions.
@@ -1665,7 +1686,7 @@ FastISel::FastISel(FunctionLoweringInfo &FuncInfo,
TRI(*MF->getSubtarget().getRegisterInfo()), LibInfo(LibInfo),
SkipTargetIndependentISel(SkipTargetIndependentISel) {}
-FastISel::~FastISel() {}
+FastISel::~FastISel() = default;
bool FastISel::fastLowerArguments() { return false; }
diff --git a/lib/CodeGen/SelectionDAG/InstrEmitter.cpp b/lib/CodeGen/SelectionDAG/InstrEmitter.cpp
index 4a9042cfb3f4..e85d1951e3ae 100644
--- a/lib/CodeGen/SelectionDAG/InstrEmitter.cpp
+++ b/lib/CodeGen/SelectionDAG/InstrEmitter.cpp
@@ -235,7 +235,6 @@ void InstrEmitter::CreateVirtualRegisters(SDNode *Node,
if (II.OpInfo[i].isOptionalDef()) {
// Optional def must be a physical register.
- unsigned NumResults = CountResults(Node);
VRBase = cast<RegisterSDNode>(Node->getOperand(i-NumResults))->getReg();
assert(TargetRegisterInfo::isPhysicalRegister(VRBase));
MIB.addReg(VRBase, RegState::Define);
diff --git a/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp b/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp
index b0028252836a..fc7cd020fe2e 100644
--- a/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp
+++ b/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp
@@ -1192,8 +1192,11 @@ SDValue SelectionDAGLegalize::ExpandExtractFromVectorThroughStack(SDValue Op) {
// If the index is dependent on the store we will introduce a cycle when
// creating the load (the load uses the index, and by replacing the chain
- // we will make the index dependent on the load).
- if (SDNode::hasPredecessorHelper(ST, Visited, Worklist))
+ // we will make the index dependent on the load). Also, the store might be
+ // dependent on the extractelement and introduce a cycle when creating
+ // the load.
+ if (SDNode::hasPredecessorHelper(ST, Visited, Worklist) ||
+ ST->hasPredecessor(Op.getNode()))
continue;
StackPtr = ST->getBasePtr();
@@ -1909,8 +1912,8 @@ SDValue SelectionDAGLegalize::ExpandLibCall(RTLIB::Libcall LC, SDNode *Node,
Type *ArgTy = ArgVT.getTypeForEVT(*DAG.getContext());
Entry.Node = Op;
Entry.Ty = ArgTy;
- Entry.isSExt = isSigned;
- Entry.isZExt = !isSigned;
+ Entry.IsSExt = isSigned;
+ Entry.IsZExt = !isSigned;
Args.push_back(Entry);
}
SDValue Callee = DAG.getExternalSymbol(TLI.getLibcallName(LC),
@@ -1935,9 +1938,13 @@ SDValue SelectionDAGLegalize::ExpandLibCall(RTLIB::Libcall LC, SDNode *Node,
InChain = TCChain;
TargetLowering::CallLoweringInfo CLI(DAG);
- CLI.setDebugLoc(SDLoc(Node)).setChain(InChain)
- .setCallee(TLI.getLibcallCallingConv(LC), RetTy, Callee, std::move(Args))
- .setTailCall(isTailCall).setSExtResult(isSigned).setZExtResult(!isSigned);
+ CLI.setDebugLoc(SDLoc(Node))
+ .setChain(InChain)
+ .setLibCallee(TLI.getLibcallCallingConv(LC), RetTy, Callee,
+ std::move(Args))
+ .setTailCall(isTailCall)
+ .setSExtResult(isSigned)
+ .setZExtResult(!isSigned);
std::pair<SDValue, SDValue> CallInfo = TLI.LowerCallTo(CLI);
@@ -1960,8 +1967,8 @@ SDValue SelectionDAGLegalize::ExpandLibCall(RTLIB::Libcall LC, EVT RetVT,
for (unsigned i = 0; i != NumOps; ++i) {
Entry.Node = Ops[i];
Entry.Ty = Entry.Node.getValueType().getTypeForEVT(*DAG.getContext());
- Entry.isSExt = isSigned;
- Entry.isZExt = !isSigned;
+ Entry.IsSExt = isSigned;
+ Entry.IsZExt = !isSigned;
Args.push_back(Entry);
}
SDValue Callee = DAG.getExternalSymbol(TLI.getLibcallName(LC),
@@ -1970,9 +1977,12 @@ SDValue SelectionDAGLegalize::ExpandLibCall(RTLIB::Libcall LC, EVT RetVT,
Type *RetTy = RetVT.getTypeForEVT(*DAG.getContext());
TargetLowering::CallLoweringInfo CLI(DAG);
- CLI.setDebugLoc(dl).setChain(DAG.getEntryNode())
- .setCallee(TLI.getLibcallCallingConv(LC), RetTy, Callee, std::move(Args))
- .setSExtResult(isSigned).setZExtResult(!isSigned);
+ CLI.setDebugLoc(dl)
+ .setChain(DAG.getEntryNode())
+ .setLibCallee(TLI.getLibcallCallingConv(LC), RetTy, Callee,
+ std::move(Args))
+ .setSExtResult(isSigned)
+ .setZExtResult(!isSigned);
std::pair<SDValue,SDValue> CallInfo = TLI.LowerCallTo(CLI);
@@ -1994,8 +2004,8 @@ SelectionDAGLegalize::ExpandChainLibCall(RTLIB::Libcall LC,
Type *ArgTy = ArgVT.getTypeForEVT(*DAG.getContext());
Entry.Node = Node->getOperand(i);
Entry.Ty = ArgTy;
- Entry.isSExt = isSigned;
- Entry.isZExt = !isSigned;
+ Entry.IsSExt = isSigned;
+ Entry.IsZExt = !isSigned;
Args.push_back(Entry);
}
SDValue Callee = DAG.getExternalSymbol(TLI.getLibcallName(LC),
@@ -2004,9 +2014,12 @@ SelectionDAGLegalize::ExpandChainLibCall(RTLIB::Libcall LC,
Type *RetTy = Node->getValueType(0).getTypeForEVT(*DAG.getContext());
TargetLowering::CallLoweringInfo CLI(DAG);
- CLI.setDebugLoc(SDLoc(Node)).setChain(InChain)
- .setCallee(TLI.getLibcallCallingConv(LC), RetTy, Callee, std::move(Args))
- .setSExtResult(isSigned).setZExtResult(!isSigned);
+ CLI.setDebugLoc(SDLoc(Node))
+ .setChain(InChain)
+ .setLibCallee(TLI.getLibcallCallingConv(LC), RetTy, Callee,
+ std::move(Args))
+ .setSExtResult(isSigned)
+ .setZExtResult(!isSigned);
std::pair<SDValue, SDValue> CallInfo = TLI.LowerCallTo(CLI);
@@ -2081,8 +2094,8 @@ SelectionDAGLegalize::ExpandDivRemLibCall(SDNode *Node,
Type *ArgTy = ArgVT.getTypeForEVT(*DAG.getContext());
Entry.Node = Op;
Entry.Ty = ArgTy;
- Entry.isSExt = isSigned;
- Entry.isZExt = !isSigned;
+ Entry.IsSExt = isSigned;
+ Entry.IsZExt = !isSigned;
Args.push_back(Entry);
}
@@ -2090,8 +2103,8 @@ SelectionDAGLegalize::ExpandDivRemLibCall(SDNode *Node,
SDValue FIPtr = DAG.CreateStackTemporary(RetVT);
Entry.Node = FIPtr;
Entry.Ty = RetTy->getPointerTo();
- Entry.isSExt = isSigned;
- Entry.isZExt = !isSigned;
+ Entry.IsSExt = isSigned;
+ Entry.IsZExt = !isSigned;
Args.push_back(Entry);
SDValue Callee = DAG.getExternalSymbol(TLI.getLibcallName(LC),
@@ -2099,9 +2112,12 @@ SelectionDAGLegalize::ExpandDivRemLibCall(SDNode *Node,
SDLoc dl(Node);
TargetLowering::CallLoweringInfo CLI(DAG);
- CLI.setDebugLoc(dl).setChain(InChain)
- .setCallee(TLI.getLibcallCallingConv(LC), RetTy, Callee, std::move(Args))
- .setSExtResult(isSigned).setZExtResult(!isSigned);
+ CLI.setDebugLoc(dl)
+ .setChain(InChain)
+ .setLibCallee(TLI.getLibcallCallingConv(LC), RetTy, Callee,
+ std::move(Args))
+ .setSExtResult(isSigned)
+ .setZExtResult(!isSigned);
std::pair<SDValue, SDValue> CallInfo = TLI.LowerCallTo(CLI);
@@ -2185,24 +2201,24 @@ SelectionDAGLegalize::ExpandSinCosLibCall(SDNode *Node,
// Pass the argument.
Entry.Node = Node->getOperand(0);
Entry.Ty = RetTy;
- Entry.isSExt = false;
- Entry.isZExt = false;
+ Entry.IsSExt = false;
+ Entry.IsZExt = false;
Args.push_back(Entry);
// Pass the return address of sin.
SDValue SinPtr = DAG.CreateStackTemporary(RetVT);
Entry.Node = SinPtr;
Entry.Ty = RetTy->getPointerTo();
- Entry.isSExt = false;
- Entry.isZExt = false;
+ Entry.IsSExt = false;
+ Entry.IsZExt = false;
Args.push_back(Entry);
// Also pass the return address of the cos.
SDValue CosPtr = DAG.CreateStackTemporary(RetVT);
Entry.Node = CosPtr;
Entry.Ty = RetTy->getPointerTo();
- Entry.isSExt = false;
- Entry.isZExt = false;
+ Entry.IsSExt = false;
+ Entry.IsZExt = false;
Args.push_back(Entry);
SDValue Callee = DAG.getExternalSymbol(TLI.getLibcallName(LC),
@@ -2210,9 +2226,9 @@ SelectionDAGLegalize::ExpandSinCosLibCall(SDNode *Node,
SDLoc dl(Node);
TargetLowering::CallLoweringInfo CLI(DAG);
- CLI.setDebugLoc(dl).setChain(InChain)
- .setCallee(TLI.getLibcallCallingConv(LC),
- Type::getVoidTy(*DAG.getContext()), Callee, std::move(Args));
+ CLI.setDebugLoc(dl).setChain(InChain).setLibCallee(
+ TLI.getLibcallCallingConv(LC), Type::getVoidTy(*DAG.getContext()), Callee,
+ std::move(Args));
std::pair<SDValue, SDValue> CallInfo = TLI.LowerCallTo(CLI);
@@ -2529,12 +2545,12 @@ SDValue SelectionDAGLegalize::ExpandBITREVERSE(SDValue Op, const SDLoc &dl) {
APInt MaskHi4(Sz, 0), MaskHi2(Sz, 0), MaskHi1(Sz, 0);
APInt MaskLo4(Sz, 0), MaskLo2(Sz, 0), MaskLo1(Sz, 0);
for (unsigned J = 0; J != Sz; J += 8) {
- MaskHi4 = MaskHi4.Or(APInt(Sz, 0xF0ull << J));
- MaskLo4 = MaskLo4.Or(APInt(Sz, 0x0Full << J));
- MaskHi2 = MaskHi2.Or(APInt(Sz, 0xCCull << J));
- MaskLo2 = MaskLo2.Or(APInt(Sz, 0x33ull << J));
- MaskHi1 = MaskHi1.Or(APInt(Sz, 0xAAull << J));
- MaskLo1 = MaskLo1.Or(APInt(Sz, 0x55ull << J));
+ MaskHi4 = MaskHi4 | (0xF0ull << J);
+ MaskLo4 = MaskLo4 | (0x0Full << J);
+ MaskHi2 = MaskHi2 | (0xCCull << J);
+ MaskLo2 = MaskLo2 | (0x33ull << J);
+ MaskHi1 = MaskHi1 | (0xAAull << J);
+ MaskLo1 = MaskLo1 | (0x55ull << J);
}
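The mask pairs built in this loop are the classic divide-and-conquer bit-reversal constants: 0xF0/0x0F swap nibbles, 0xCC/0x33 swap bit pairs, 0xAA/0x55 swap adjacent bits, and the BSWAP below handles byte order. A standalone C++ sketch of the same stages on one 32-bit word (illustrative only, not LLVM code; __builtin_bswap32 assumes GCC or Clang):

```cpp
#include <cstdint>
#include <cstdio>

// Reverse the bits of a 32-bit word using the same mask stages the expansion
// above builds per byte: swap byte order, then nibbles, bit pairs, and bits.
static uint32_t reverseBits32(uint32_t V) {
  V = __builtin_bswap32(V);                                 // byte order
  V = ((V & 0xF0F0F0F0u) >> 4) | ((V & 0x0F0F0F0Fu) << 4);  // nibbles
  V = ((V & 0xCCCCCCCCu) >> 2) | ((V & 0x33333333u) << 2);  // bit pairs
  V = ((V & 0xAAAAAAAAu) >> 1) | ((V & 0x55555555u) << 1);  // single bits
  return V;
}

int main() {
  printf("%08x\n", reverseBits32(0x00000001u)); // prints 80000000
}
```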
// BSWAP if the type is wider than a single byte.
@@ -3091,7 +3107,7 @@ bool SelectionDAGLegalize::ExpandNode(SDNode *Node) {
TLI.getVectorIdxTy(DAG.getDataLayout()))));
}
- Tmp1 = DAG.getNode(ISD::BUILD_VECTOR, dl, VT, Ops);
+ Tmp1 = DAG.getBuildVector(VT, dl, Ops);
// We may have changed the BUILD_VECTOR type. Cast it back to the Node type.
Tmp1 = DAG.getNode(ISD::BITCAST, dl, Node->getValueType(0), Tmp1);
Results.push_back(Tmp1);
@@ -3790,8 +3806,8 @@ bool SelectionDAGLegalize::ExpandNode(SDNode *Node) {
Scalars.push_back(DAG.getNode(Node->getOpcode(), dl,
VT.getScalarType(), Ex, Sh));
}
- SDValue Result =
- DAG.getNode(ISD::BUILD_VECTOR, dl, Node->getValueType(0), Scalars);
+
+ SDValue Result = DAG.getBuildVector(Node->getValueType(0), dl, Scalars);
ReplaceNode(SDValue(Node, 0), Result);
break;
}
@@ -3830,10 +3846,11 @@ void SelectionDAGLegalize::ConvertNodeToLibcall(SDNode *Node) {
TargetLowering::CallLoweringInfo CLI(DAG);
CLI.setDebugLoc(dl)
.setChain(Node->getOperand(0))
- .setCallee(CallingConv::C, Type::getVoidTy(*DAG.getContext()),
- DAG.getExternalSymbol("__sync_synchronize",
- TLI.getPointerTy(DAG.getDataLayout())),
- std::move(Args));
+ .setLibCallee(
+ CallingConv::C, Type::getVoidTy(*DAG.getContext()),
+ DAG.getExternalSymbol("__sync_synchronize",
+ TLI.getPointerTy(DAG.getDataLayout())),
+ std::move(Args));
std::pair<SDValue, SDValue> CallResult = TLI.LowerCallTo(CLI);
@@ -3870,10 +3887,10 @@ void SelectionDAGLegalize::ConvertNodeToLibcall(SDNode *Node) {
TargetLowering::CallLoweringInfo CLI(DAG);
CLI.setDebugLoc(dl)
.setChain(Node->getOperand(0))
- .setCallee(CallingConv::C, Type::getVoidTy(*DAG.getContext()),
- DAG.getExternalSymbol("abort",
- TLI.getPointerTy(DAG.getDataLayout())),
- std::move(Args));
+ .setLibCallee(CallingConv::C, Type::getVoidTy(*DAG.getContext()),
+ DAG.getExternalSymbol(
+ "abort", TLI.getPointerTy(DAG.getDataLayout())),
+ std::move(Args));
std::pair<SDValue, SDValue> CallResult = TLI.LowerCallTo(CLI);
Results.push_back(CallResult.second);
@@ -4424,8 +4441,7 @@ void SelectionDAGLegalize::PromoteNode(SDNode *Node) {
NewOps.push_back(Elt);
}
- SDValue NewVec = DAG.getNode(ISD::BUILD_VECTOR, SL, MidVT, NewOps);
-
+ SDValue NewVec = DAG.getBuildVector(MidVT, SL, NewOps);
Results.push_back(DAG.getNode(ISD::BITCAST, SL, EltVT, NewVec));
break;
}
diff --git a/lib/CodeGen/SelectionDAG/LegalizeFloatTypes.cpp b/lib/CodeGen/SelectionDAG/LegalizeFloatTypes.cpp
index 72b56d84d945..6f2b1b94ce46 100644
--- a/lib/CodeGen/SelectionDAG/LegalizeFloatTypes.cpp
+++ b/lib/CodeGen/SelectionDAG/LegalizeFloatTypes.cpp
@@ -459,7 +459,7 @@ SDValue DAGTypeLegalizer::SoftenFloatRes_FP_EXTEND(SDNode *N) {
if (Op.getValueType() == MVT::f16 && N->getValueType(0) != MVT::f32) {
Op = DAG.getNode(ISD::FP_EXTEND, SDLoc(N), MVT::f32, Op);
if (getTypeAction(MVT::f32) == TargetLowering::TypeSoftenFloat)
- SoftenFloatResult(Op.getNode(), 0);
+ AddToWorklist(Op.getNode());
}
if (getTypeAction(Op.getValueType()) == TargetLowering::TypePromoteFloat) {
@@ -472,8 +472,6 @@ SDValue DAGTypeLegalizer::SoftenFloatRes_FP_EXTEND(SDNode *N) {
}
RTLIB::Libcall LC = RTLIB::getFPEXT(Op.getValueType(), N->getValueType(0));
- if (getTypeAction(Op.getValueType()) == TargetLowering::TypeSoftenFloat)
- Op = GetSoftenedFloat(Op);
assert(LC != RTLIB::UNKNOWN_LIBCALL && "Unsupported FP_EXTEND!");
return TLI.makeLibCall(DAG, LC, NVT, Op, false, SDLoc(N)).first;
}
@@ -1054,15 +1052,15 @@ void DAGTypeLegalizer::ExpandFloatResult(SDNode *N, unsigned ResNo) {
void DAGTypeLegalizer::ExpandFloatRes_ConstantFP(SDNode *N, SDValue &Lo,
SDValue &Hi) {
EVT NVT = TLI.getTypeToTransformTo(*DAG.getContext(), N->getValueType(0));
- assert(NVT.getSizeInBits() == integerPartWidth &&
+ assert(NVT.getSizeInBits() == 64 &&
"Do not know how to expand this float constant!");
APInt C = cast<ConstantFPSDNode>(N)->getValueAPF().bitcastToAPInt();
SDLoc dl(N);
Lo = DAG.getConstantFP(APFloat(DAG.EVTToAPFloatSemantics(NVT),
- APInt(integerPartWidth, C.getRawData()[1])),
+ APInt(64, C.getRawData()[1])),
dl, NVT);
Hi = DAG.getConstantFP(APFloat(DAG.EVTToAPFloatSemantics(NVT),
- APInt(integerPartWidth, C.getRawData()[0])),
+ APInt(64, C.getRawData()[0])),
dl, NVT);
}
diff --git a/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp b/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp
index dc436ce04514..85068e890756 100644
--- a/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp
+++ b/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp
@@ -690,7 +690,7 @@ SDValue DAGTypeLegalizer::PromoteIntRes_TRUNCATE(SDNode *N) {
case TargetLowering::TypePromoteInteger:
Res = GetPromotedInteger(InOp);
break;
- case TargetLowering::TypeSplitVector:
+ case TargetLowering::TypeSplitVector: {
EVT InVT = InOp.getValueType();
assert(InVT.isVector() && "Cannot split scalar types");
unsigned NumElts = InVT.getVectorNumElements();
@@ -709,6 +709,26 @@ SDValue DAGTypeLegalizer::PromoteIntRes_TRUNCATE(SDNode *N) {
return DAG.getNode(ISD::CONCAT_VECTORS, dl, NVT, EOp1, EOp2);
}
+ case TargetLowering::TypeWidenVector: {
+ SDValue WideInOp = GetWidenedVector(InOp);
+
+ // Truncate widened InOp.
+ unsigned NumElem = WideInOp.getValueType().getVectorNumElements();
+ EVT TruncVT = EVT::getVectorVT(*DAG.getContext(),
+ N->getValueType(0).getScalarType(), NumElem);
+ SDValue WideTrunc = DAG.getNode(ISD::TRUNCATE, dl, TruncVT, WideInOp);
+
+      // Zero-extend so that the elements are of the same type as those of NVT.
+ EVT ExtVT = EVT::getVectorVT(*DAG.getContext(), NVT.getVectorElementType(),
+ NumElem);
+ SDValue WideExt = DAG.getNode(ISD::ZERO_EXTEND, dl, ExtVT, WideTrunc);
+
+ // Extract the low NVT subvector.
+ MVT IdxTy = TLI.getVectorIdxTy(DAG.getDataLayout());
+ SDValue ZeroIdx = DAG.getConstant(0, dl, IdxTy);
+ return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, NVT, WideExt, ZeroIdx);
+ }
+ }
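A hypothetical shape walk through the new TypeWidenVector path, assuming the node truncates v4i16 to v4i8, the promoted result type NVT is v4i32, and the input was widened to v8i16, sketched here with plain arrays standing in for vector lanes (illustrative only, not LLVM code):

```cpp
#include <array>
#include <cstdint>
#include <cstdio>

int main() {
  std::array<uint16_t, 8> WideInOp = {1, 2, 3, 4, 0, 0, 0, 0};

  // Step 1: truncate the widened input to TruncVT = v8i8.
  std::array<uint8_t, 8> WideTrunc;
  for (int i = 0; i < 8; ++i) WideTrunc[i] = uint8_t(WideInOp[i]);

  // Step 2: zero-extend to ExtVT = v8i32 so elements match NVT's i32.
  std::array<uint32_t, 8> WideExt;
  for (int i = 0; i < 8; ++i) WideExt[i] = WideTrunc[i];

  // Step 3: extract the low NVT = v4i32 subvector at index 0.
  for (int i = 0; i < 4; ++i) printf("%u ", WideExt[i]); // prints 1 2 3 4
}
```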
// Truncate to NVT instead of VT
return DAG.getNode(ISD::TRUNCATE, dl, NVT, Res);
@@ -1089,6 +1109,10 @@ SDValue DAGTypeLegalizer::PromoteIntOp_SELECT(SDNode *N, unsigned OpNo) {
SDValue Cond = N->getOperand(0);
EVT OpTy = N->getOperand(1).getValueType();
+ if (N->getOpcode() == ISD::VSELECT)
+ if (SDValue Res = WidenVSELECTAndMask(N))
+ return Res;
+
// Promote all the way up to the canonical SetCC type.
EVT OpVT = N->getOpcode() == ISD::SELECT ? OpTy.getScalarType() : OpTy;
Cond = PromoteTargetBoolean(Cond, OpVT);
@@ -2586,24 +2610,25 @@ void DAGTypeLegalizer::ExpandIntRes_XMULO(SDNode *N,
Type *ArgTy = ArgVT.getTypeForEVT(*DAG.getContext());
Entry.Node = Op;
Entry.Ty = ArgTy;
- Entry.isSExt = true;
- Entry.isZExt = false;
+ Entry.IsSExt = true;
+ Entry.IsZExt = false;
Args.push_back(Entry);
}
// Also pass the address of the overflow check.
Entry.Node = Temp;
Entry.Ty = PtrTy->getPointerTo();
- Entry.isSExt = true;
- Entry.isZExt = false;
+ Entry.IsSExt = true;
+ Entry.IsZExt = false;
Args.push_back(Entry);
SDValue Func = DAG.getExternalSymbol(TLI.getLibcallName(LC), PtrVT);
TargetLowering::CallLoweringInfo CLI(DAG);
- CLI.setDebugLoc(dl).setChain(Chain)
- .setCallee(TLI.getLibcallCallingConv(LC), RetTy, Func, std::move(Args))
- .setSExtResult();
+ CLI.setDebugLoc(dl)
+ .setChain(Chain)
+ .setLibCallee(TLI.getLibcallCallingConv(LC), RetTy, Func, std::move(Args))
+ .setSExtResult();
std::pair<SDValue, SDValue> CallInfo = TLI.LowerCallTo(CLI);
diff --git a/lib/CodeGen/SelectionDAG/LegalizeTypes.cpp b/lib/CodeGen/SelectionDAG/LegalizeTypes.cpp
index cf19d75676cd..0a2b680e1c66 100644
--- a/lib/CodeGen/SelectionDAG/LegalizeTypes.cpp
+++ b/lib/CodeGen/SelectionDAG/LegalizeTypes.cpp
@@ -199,8 +199,7 @@ bool DAGTypeLegalizer::run() {
// non-leaves.
for (SDNode &Node : DAG.allnodes()) {
if (Node.getNumOperands() == 0) {
- Node.setNodeId(ReadyToProcess);
- Worklist.push_back(&Node);
+ AddToWorklist(&Node);
} else {
Node.setNodeId(Unanalyzed);
}
@@ -331,6 +330,12 @@ ScanOperands:
// to the worklist etc.
if (NeedsReanalyzing) {
assert(N->getNodeId() == ReadyToProcess && "Node ID recalculated?");
+
+      // Remove any result values from SoftenedFloats as N will be revisited.
+ for (unsigned i = 0, NumResults = N->getNumValues(); i < NumResults; ++i)
+ SoftenedFloats.erase(SDValue(N, i));
+
N->setNodeId(NewNode);
// Recompute the NodeId and correct processed operands, adding the node to
// the worklist if ready.
@@ -749,6 +754,8 @@ void DAGTypeLegalizer::ReplaceValueWith(SDValue From, SDValue To) {
// new uses of From due to CSE. If this happens, replace the new uses of
// From with To.
} while (!From.use_empty());
+
+ SoftenedFloats.erase(From);
}
void DAGTypeLegalizer::SetPromotedInteger(SDValue Op, SDValue Result) {
@@ -1077,8 +1084,8 @@ DAGTypeLegalizer::ExpandChainLibCall(RTLIB::Libcall LC, SDNode *Node,
Type *ArgTy = ArgVT.getTypeForEVT(*DAG.getContext());
Entry.Node = Node->getOperand(i);
Entry.Ty = ArgTy;
- Entry.isSExt = isSigned;
- Entry.isZExt = !isSigned;
+ Entry.IsSExt = isSigned;
+ Entry.IsZExt = !isSigned;
Args.push_back(Entry);
}
SDValue Callee = DAG.getExternalSymbol(TLI.getLibcallName(LC),
@@ -1087,9 +1094,12 @@ DAGTypeLegalizer::ExpandChainLibCall(RTLIB::Libcall LC, SDNode *Node,
Type *RetTy = Node->getValueType(0).getTypeForEVT(*DAG.getContext());
TargetLowering::CallLoweringInfo CLI(DAG);
- CLI.setDebugLoc(SDLoc(Node)).setChain(InChain)
- .setCallee(TLI.getLibcallCallingConv(LC), RetTy, Callee, std::move(Args))
- .setSExtResult(isSigned).setZExtResult(!isSigned);
+ CLI.setDebugLoc(SDLoc(Node))
+ .setChain(InChain)
+ .setLibCallee(TLI.getLibcallCallingConv(LC), RetTy, Callee,
+ std::move(Args))
+ .setSExtResult(isSigned)
+ .setZExtResult(!isSigned);
std::pair<SDValue, SDValue> CallInfo = TLI.LowerCallTo(CLI);
diff --git a/lib/CodeGen/SelectionDAG/LegalizeTypes.h b/lib/CodeGen/SelectionDAG/LegalizeTypes.h
index ec55662d75c0..80c939700518 100644
--- a/lib/CodeGen/SelectionDAG/LegalizeTypes.h
+++ b/lib/CodeGen/SelectionDAG/LegalizeTypes.h
@@ -191,6 +191,11 @@ private:
void SplitInteger(SDValue Op, EVT LoVT, EVT HiVT,
SDValue &Lo, SDValue &Hi);
+ void AddToWorklist(SDNode *N) {
+ N->setNodeId(ReadyToProcess);
+ Worklist.push_back(N);
+ }
+
//===--------------------------------------------------------------------===//
// Integer Promotion Support: LegalizeIntegerTypes.cpp
//===--------------------------------------------------------------------===//
@@ -597,6 +602,7 @@ private:
SDValue ScalarizeVecRes_TernaryOp(SDNode *N);
SDValue ScalarizeVecRes_UnaryOp(SDNode *N);
SDValue ScalarizeVecRes_InregOp(SDNode *N);
+ SDValue ScalarizeVecRes_VecInregOp(SDNode *N);
SDValue ScalarizeVecRes_BITCAST(SDNode *N);
SDValue ScalarizeVecRes_BUILD_VECTOR(SDNode *N);
@@ -672,6 +678,7 @@ private:
SDValue SplitVecOp_BITCAST(SDNode *N);
SDValue SplitVecOp_EXTRACT_SUBVECTOR(SDNode *N);
SDValue SplitVecOp_EXTRACT_VECTOR_ELT(SDNode *N);
+ SDValue SplitVecOp_ExtVecInRegOp(SDNode *N);
SDValue SplitVecOp_STORE(StoreSDNode *N, unsigned OpNo);
SDValue SplitVecOp_MSTORE(MaskedStoreSDNode *N, unsigned OpNo);
SDValue SplitVecOp_MSCATTER(MaskedScatterSDNode *N, unsigned OpNo);
@@ -713,6 +720,7 @@ private:
SDValue WidenVecRes_MGATHER(MaskedGatherSDNode* N);
SDValue WidenVecRes_SCALAR_TO_VECTOR(SDNode* N);
SDValue WidenVecRes_SELECT(SDNode* N);
+ SDValue WidenVSELECTAndMask(SDNode *N);
SDValue WidenVecRes_SELECT_CC(SDNode* N);
SDValue WidenVecRes_SETCC(SDNode* N);
SDValue WidenVecRes_UNDEF(SDNode *N);
@@ -782,6 +790,13 @@ private:
/// By default, the vector will be widened with undefined values.
SDValue ModifyToType(SDValue InOp, EVT NVT, bool FillWithZeroes = false);
+  /// Return a mask of vector type MaskVT to replace InMask. Also adjust the
+  /// new mask from MaskVT to ToMaskVT, if needed, with a vector extension or
+  /// truncation.
+ SDValue convertMask(SDValue InMask, EVT MaskVT, EVT ToMaskVT);
+
+ /// Get the target mask VT, and widen if needed.
+ EVT getSETCCWidenedResultTy(SDValue SetCC);
+
//===--------------------------------------------------------------------===//
// Generic Splitting: LegalizeTypesGeneric.cpp
//===--------------------------------------------------------------------===//
diff --git a/lib/CodeGen/SelectionDAG/LegalizeTypesGeneric.cpp b/lib/CodeGen/SelectionDAG/LegalizeTypesGeneric.cpp
index 3682c32460c6..c02b8960b36c 100644
--- a/lib/CodeGen/SelectionDAG/LegalizeTypesGeneric.cpp
+++ b/lib/CodeGen/SelectionDAG/LegalizeTypesGeneric.cpp
@@ -512,8 +512,24 @@ void DAGTypeLegalizer::SplitRes_MERGE_VALUES(SDNode *N, unsigned ResNo,
GetSplitOp(Op, Lo, Hi);
}
-void DAGTypeLegalizer::SplitRes_SELECT(SDNode *N, SDValue &Lo,
- SDValue &Hi) {
+static std::pair<SDValue, SDValue> SplitVSETCC(const SDNode *N,
+ SelectionDAG &DAG) {
+ SDLoc DL(N);
+ EVT LoVT, HiVT;
+ std::tie(LoVT, HiVT) = DAG.GetSplitDestVTs(N->getValueType(0));
+
+ // Split the inputs.
+ SDValue Lo, Hi, LL, LH, RL, RH;
+ std::tie(LL, LH) = DAG.SplitVectorOperand(N, 0);
+ std::tie(RL, RH) = DAG.SplitVectorOperand(N, 1);
+
+ Lo = DAG.getNode(N->getOpcode(), DL, LoVT, LL, RL, N->getOperand(2));
+ Hi = DAG.getNode(N->getOpcode(), DL, HiVT, LH, RH, N->getOperand(2));
+
+ return std::make_pair(Lo, Hi);
+}
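Why SplitVSETCC is sound, sketched on plain arrays: SETCC is purely element-wise, so comparing the two halves independently yields the same lanes as comparing the whole vector (illustrative C++, not LLVM code):

```cpp
#include <array>
#include <cstdint>

// A lane-wise "v4" compare equals its two "v2" half-compares concatenated.
int main() {
  std::array<int32_t, 4> L = {1, 5, 3, 7}, R = {2, 4, 3, 9};
  std::array<int32_t, 4> Whole, Halves;
  for (int i = 0; i < 4; ++i) Whole[i] = L[i] < R[i] ? -1 : 0;
  for (int i = 0; i < 2; ++i) Halves[i] = L[i] < R[i] ? -1 : 0; // Lo half
  for (int i = 2; i < 4; ++i) Halves[i] = L[i] < R[i] ? -1 : 0; // Hi half
  return Whole == Halves ? 0 : 1;
}
```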
+
+void DAGTypeLegalizer::SplitRes_SELECT(SDNode *N, SDValue &Lo, SDValue &Hi) {
SDValue LL, LH, RL, RH, CL, CH;
SDLoc dl(N);
GetSplitOp(N->getOperand(1), LL, LH);
@@ -522,9 +538,16 @@ void DAGTypeLegalizer::SplitRes_SELECT(SDNode *N, SDValue &Lo,
SDValue Cond = N->getOperand(0);
CL = CH = Cond;
if (Cond.getValueType().isVector()) {
+ if (SDValue Res = WidenVSELECTAndMask(N))
+ std::tie(CL, CH) = DAG.SplitVector(Res->getOperand(0), dl);
+ // It seems to improve code to generate two narrow SETCCs as opposed to
+ // splitting a wide result vector.
+ else if (Cond.getOpcode() == ISD::SETCC)
+ std::tie(CL, CH) = SplitVSETCC(Cond.getNode(), DAG);
// Check if there are already split versions of the vector available and
// use those instead of splitting the mask operand again.
- if (getTypeAction(Cond.getValueType()) == TargetLowering::TypeSplitVector)
+ else if (getTypeAction(Cond.getValueType()) ==
+ TargetLowering::TypeSplitVector)
GetSplitVector(Cond, CL, CH);
else
std::tie(CL, CH) = DAG.SplitVector(Cond, dl);
diff --git a/lib/CodeGen/SelectionDAG/LegalizeVectorOps.cpp b/lib/CodeGen/SelectionDAG/LegalizeVectorOps.cpp
index d4fa20f35274..5f167f8de1cf 100644
--- a/lib/CodeGen/SelectionDAG/LegalizeVectorOps.cpp
+++ b/lib/CodeGen/SelectionDAG/LegalizeVectorOps.cpp
@@ -105,6 +105,7 @@ class VectorLegalizer {
SDValue ExpandLoad(SDValue Op);
SDValue ExpandStore(SDValue Op);
SDValue ExpandFNEG(SDValue Op);
+ SDValue ExpandFSUB(SDValue Op);
SDValue ExpandBITREVERSE(SDValue Op);
SDValue ExpandCTLZ(SDValue Op);
SDValue ExpandCTTZ_ZERO_UNDEF(SDValue Op);
@@ -621,8 +622,7 @@ SDValue VectorLegalizer::ExpandLoad(SDValue Op) {
}
NewChain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, LoadChains);
- Value = DAG.getNode(ISD::BUILD_VECTOR, dl,
- Op.getNode()->getValueType(0), Vals);
+ Value = DAG.getBuildVector(Op.getNode()->getValueType(0), dl, Vals);
} else {
SDValue Scalarized = TLI.scalarizeVectorLoad(LD, DAG);
@@ -692,6 +692,8 @@ SDValue VectorLegalizer::Expand(SDValue Op) {
return ExpandUINT_TO_FLOAT(Op);
case ISD::FNEG:
return ExpandFNEG(Op);
+ case ISD::FSUB:
+ return ExpandFSUB(Op);
case ISD::SETCC:
return UnrollVSETCC(Op);
case ISD::BITREVERSE:
@@ -720,8 +722,6 @@ SDValue VectorLegalizer::ExpandSELECT(SDValue Op) {
assert(VT.isVector() && !Mask.getValueType().isVector()
&& Op1.getValueType() == Op2.getValueType() && "Invalid type");
- unsigned NumElem = VT.getVectorNumElements();
-
// If we can't even use the basic vector operations of
// AND,OR,XOR, we will have to scalarize the op.
// Notice that the operation may be 'promoted' which means that it is
@@ -745,8 +745,7 @@ SDValue VectorLegalizer::ExpandSELECT(SDValue Op) {
DAG.getConstant(0, DL, BitTy));
// Broadcast the mask so that the entire vector is all ones or all zeros.
- SmallVector<SDValue, 8> Ops(NumElem, Mask);
- Mask = DAG.getNode(ISD::BUILD_VECTOR, DL, MaskTy, Ops);
+ Mask = DAG.getSplatBuildVector(MaskTy, DL, Mask);
// Bitcast the operands to be the same type as the mask.
// This is needed when we select between FP types because
@@ -1025,6 +1024,18 @@ SDValue VectorLegalizer::ExpandFNEG(SDValue Op) {
return DAG.UnrollVectorOp(Op.getNode());
}
+SDValue VectorLegalizer::ExpandFSUB(SDValue Op) {
+ // For floating-point values, (a-b) is the same as a+(-b). If FNEG is legal,
+ // we can defer this to operation legalization where it will be lowered as
+ // a+(-b).
+ EVT VT = Op.getValueType();
+ if (TLI.isOperationLegalOrCustom(ISD::FNEG, VT) &&
+ TLI.isOperationLegalOrCustom(ISD::FADD, VT))
+ return Op; // Defer to LegalizeDAG
+
+ return DAG.UnrollVectorOp(Op.getNode());
+}
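The identity this expansion relies on (for IEEE floats, negation only flips the sign bit, so a - b and a + (-b) give bit-identical results) can be checked directly (illustrative only):

```cpp
#include <cassert>

// a - b == a + (-b) for IEEE floating point: FNEG is exact, so the
// FSUB -> FADD(FNEG) rewrite loses no precision.
int main() {
  float a = 1.5f, b = 0.25f;
  assert(a - b == a + (-b));
}
```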
+
SDValue VectorLegalizer::ExpandCTLZ(SDValue Op) {
EVT VT = Op.getValueType();
unsigned NumBitsPerElt = VT.getScalarSizeInBits();
@@ -1102,7 +1113,7 @@ SDValue VectorLegalizer::UnrollVSETCC(SDValue Op) {
(EltVT.getSizeInBits()), dl, EltVT),
DAG.getConstant(0, dl, EltVT));
}
- return DAG.getNode(ISD::BUILD_VECTOR, dl, VT, Ops);
+ return DAG.getBuildVector(VT, dl, Ops);
}
}
diff --git a/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp b/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp
index 6906f67ebacb..78fddb5ce8f5 100644
--- a/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp
+++ b/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp
@@ -65,6 +65,11 @@ void DAGTypeLegalizer::ScalarizeVectorResult(SDNode *N, unsigned ResNo) {
case ISD::SETCC: R = ScalarizeVecRes_SETCC(N); break;
case ISD::UNDEF: R = ScalarizeVecRes_UNDEF(N); break;
case ISD::VECTOR_SHUFFLE: R = ScalarizeVecRes_VECTOR_SHUFFLE(N); break;
+ case ISD::ANY_EXTEND_VECTOR_INREG:
+ case ISD::SIGN_EXTEND_VECTOR_INREG:
+ case ISD::ZERO_EXTEND_VECTOR_INREG:
+ R = ScalarizeVecRes_VecInregOp(N);
+ break;
case ISD::ANY_EXTEND:
case ISD::BITREVERSE:
case ISD::BSWAP:
@@ -97,6 +102,7 @@ void DAGTypeLegalizer::ScalarizeVectorResult(SDNode *N, unsigned ResNo) {
case ISD::TRUNCATE:
case ISD::UINT_TO_FP:
case ISD::ZERO_EXTEND:
+ case ISD::FCANONICALIZE:
R = ScalarizeVecRes_UnaryOp(N);
break;
@@ -257,6 +263,34 @@ SDValue DAGTypeLegalizer::ScalarizeVecRes_InregOp(SDNode *N) {
LHS, DAG.getValueType(ExtVT));
}
+SDValue DAGTypeLegalizer::ScalarizeVecRes_VecInregOp(SDNode *N) {
+ SDLoc DL(N);
+ SDValue Op = N->getOperand(0);
+
+ EVT OpVT = Op.getValueType();
+ EVT OpEltVT = OpVT.getVectorElementType();
+ EVT EltVT = N->getValueType(0).getVectorElementType();
+
+ if (getTypeAction(OpVT) == TargetLowering::TypeScalarizeVector) {
+ Op = GetScalarizedVector(Op);
+ } else {
+ Op = DAG.getNode(
+ ISD::EXTRACT_VECTOR_ELT, DL, OpEltVT, Op,
+ DAG.getConstant(0, DL, TLI.getVectorIdxTy(DAG.getDataLayout())));
+ }
+
+ switch (N->getOpcode()) {
+ case ISD::ANY_EXTEND_VECTOR_INREG:
+ return DAG.getNode(ISD::ANY_EXTEND, DL, EltVT, Op);
+ case ISD::SIGN_EXTEND_VECTOR_INREG:
+ return DAG.getNode(ISD::SIGN_EXTEND, DL, EltVT, Op);
+ case ISD::ZERO_EXTEND_VECTOR_INREG:
+ return DAG.getNode(ISD::ZERO_EXTEND, DL, EltVT, Op);
+ }
+
+ llvm_unreachable("Illegal extend_vector_inreg opcode");
+}
+
SDValue DAGTypeLegalizer::ScalarizeVecRes_SCALAR_TO_VECTOR(SDNode *N) {
// If the operand is wider than the vector element type then it is implicitly
// truncated. Make that explicit here.
@@ -486,7 +520,7 @@ SDValue DAGTypeLegalizer::ScalarizeVecOp_CONCAT_VECTORS(SDNode *N) {
SmallVector<SDValue, 8> Ops(N->getNumOperands());
for (unsigned i = 0, e = N->getNumOperands(); i < e; ++i)
Ops[i] = GetScalarizedVector(N->getOperand(i));
- return DAG.getNode(ISD::BUILD_VECTOR, SDLoc(N), N->getValueType(0), Ops);
+ return DAG.getBuildVector(N->getValueType(0), SDLoc(N), Ops);
}
/// If the input is a vector that needs to be scalarized, it must be <1 x ty>,
@@ -637,6 +671,7 @@ void DAGTypeLegalizer::SplitVectorResult(SDNode *N, unsigned ResNo) {
case ISD::SINT_TO_FP:
case ISD::TRUNCATE:
case ISD::UINT_TO_FP:
+ case ISD::FCANONICALIZE:
SplitVecRes_UnaryOp(N, Lo, Hi);
break;
@@ -781,10 +816,10 @@ void DAGTypeLegalizer::SplitVecRes_BUILD_VECTOR(SDNode *N, SDValue &Lo,
std::tie(LoVT, HiVT) = DAG.GetSplitDestVTs(N->getValueType(0));
unsigned LoNumElts = LoVT.getVectorNumElements();
SmallVector<SDValue, 8> LoOps(N->op_begin(), N->op_begin()+LoNumElts);
- Lo = DAG.getNode(ISD::BUILD_VECTOR, dl, LoVT, LoOps);
+ Lo = DAG.getBuildVector(LoVT, dl, LoOps);
SmallVector<SDValue, 8> HiOps(N->op_begin()+LoNumElts, N->op_end());
- Hi = DAG.getNode(ISD::BUILD_VECTOR, dl, HiVT, HiOps);
+ Hi = DAG.getBuildVector(HiVT, dl, HiOps);
}
void DAGTypeLegalizer::SplitVecRes_CONCAT_VECTORS(SDNode *N, SDValue &Lo,
@@ -928,7 +963,12 @@ void DAGTypeLegalizer::SplitVecRes_ExtVecInRegOp(SDNode *N, SDValue &Lo,
SDLoc dl(N);
SDValue InLo, InHi;
- GetSplitVector(N0, InLo, InHi);
+
+ if (getTypeAction(N0.getValueType()) == TargetLowering::TypeSplitVector)
+ GetSplitVector(N0, InLo, InHi);
+ else
+ std::tie(InLo, InHi) = DAG.SplitVectorOperand(N, 0);
+
EVT InLoVT = InLo.getValueType();
unsigned InNumElements = InLoVT.getVectorNumElements();
@@ -1372,7 +1412,7 @@ void DAGTypeLegalizer::SplitVecRes_VECTOR_SHUFFLE(ShuffleVectorSDNode *N,
}
// Construct the Lo/Hi output using a BUILD_VECTOR.
- Output = DAG.getNode(ISD::BUILD_VECTOR, dl, NewVT, SVOps);
+ Output = DAG.getBuildVector(NewVT, dl, SVOps);
} else if (InputUsed[0] == -1U) {
// No input vectors were used! The result is undefined.
Output = DAG.getUNDEF(NewVT);
@@ -1466,8 +1506,15 @@ bool DAGTypeLegalizer::SplitVectorOperand(SDNode *N, unsigned OpNo) {
case ISD::ZERO_EXTEND:
case ISD::ANY_EXTEND:
case ISD::FTRUNC:
+ case ISD::FCANONICALIZE:
Res = SplitVecOp_UnaryOp(N);
break;
+
+ case ISD::ANY_EXTEND_VECTOR_INREG:
+ case ISD::SIGN_EXTEND_VECTOR_INREG:
+ case ISD::ZERO_EXTEND_VECTOR_INREG:
+ Res = SplitVecOp_ExtVecInRegOp(N);
+ break;
}
}
@@ -1615,7 +1662,7 @@ SDValue DAGTypeLegalizer::SplitVecOp_EXTRACT_VECTOR_ELT(SDNode *N) {
EltVT = MVT::i8;
VecVT = EVT::getVectorVT(*DAG.getContext(), EltVT,
VecVT.getVectorNumElements());
- Vec = DAG.getNode(ISD::BUILD_VECTOR, dl, VecVT, ElementOps);
+ Vec = DAG.getBuildVector(VecVT, dl, ElementOps);
}
// Store the vector to the stack.
@@ -1629,6 +1676,16 @@ SDValue DAGTypeLegalizer::SplitVecOp_EXTRACT_VECTOR_ELT(SDNode *N) {
MachinePointerInfo(), EltVT);
}
+SDValue DAGTypeLegalizer::SplitVecOp_ExtVecInRegOp(SDNode *N) {
+ SDValue Lo, Hi;
+
+  // *_EXTEND_VECTOR_INREG nodes only reference the lower half of the input, so
+ // splitting the result has the same effect as splitting the input operand.
+ SplitVecRes_ExtVecInRegOp(N, Lo, Hi);
+
+ return DAG.getNode(ISD::CONCAT_VECTORS, SDLoc(N), N->getValueType(0), Lo, Hi);
+}
+
SDValue DAGTypeLegalizer::SplitVecOp_MGATHER(MaskedGatherSDNode *MGT,
unsigned OpNo) {
EVT LoVT, HiVT;
@@ -1881,7 +1938,7 @@ SDValue DAGTypeLegalizer::SplitVecOp_CONCAT_VECTORS(SDNode *N) {
}
}
- return DAG.getNode(ISD::BUILD_VECTOR, DL, N->getValueType(0), Elts);
+ return DAG.getBuildVector(N->getValueType(0), DL, Elts);
}
SDValue DAGTypeLegalizer::SplitVecOp_TruncateHelper(SDNode *N) {
@@ -2323,6 +2380,15 @@ SDValue DAGTypeLegalizer::WidenVecRes_Convert(SDNode *N) {
return DAG.getNode(Opcode, DL, WidenVT, InOp);
return DAG.getNode(Opcode, DL, WidenVT, InOp, N->getOperand(1), Flags);
}
+ if (WidenVT.getSizeInBits() == InVT.getSizeInBits()) {
+    // If both input and result vector types are of the same width, extend
+ // operations should be done with SIGN/ZERO_EXTEND_VECTOR_INREG, which
+ // accepts fewer elements in the result than in the input.
+ if (Opcode == ISD::SIGN_EXTEND)
+ return DAG.getSignExtendVectorInReg(InOp, DL, WidenVT);
+ if (Opcode == ISD::ZERO_EXTEND)
+ return DAG.getZeroExtendVectorInReg(InOp, DL, WidenVT);
+ }
}
if (TLI.isTypeLegal(InWidenVT)) {
@@ -2375,7 +2441,7 @@ SDValue DAGTypeLegalizer::WidenVecRes_Convert(SDNode *N) {
for (; i < WidenNumElts; ++i)
Ops[i] = UndefVal;
- return DAG.getNode(ISD::BUILD_VECTOR, DL, WidenVT, Ops);
+ return DAG.getBuildVector(WidenVT, DL, Ops);
}
SDValue DAGTypeLegalizer::WidenVecRes_EXTEND_VECTOR_INREG(SDNode *N) {
@@ -2430,7 +2496,7 @@ SDValue DAGTypeLegalizer::WidenVecRes_EXTEND_VECTOR_INREG(SDNode *N) {
while (Ops.size() != WidenNumElts)
Ops.push_back(DAG.getUNDEF(WidenSVT));
- return DAG.getNode(ISD::BUILD_VECTOR, DL, WidenVT, Ops);
+ return DAG.getBuildVector(WidenVT, DL, Ops);
}
SDValue DAGTypeLegalizer::WidenVecRes_FCOPYSIGN(SDNode *N) {
@@ -2593,7 +2659,7 @@ SDValue DAGTypeLegalizer::WidenVecRes_BUILD_VECTOR(SDNode *N) {
assert(WidenNumElts >= NumElts && "Shrinking vector instead of widening!");
NewOps.append(WidenNumElts - NumElts, DAG.getUNDEF(EltVT));
- return DAG.getNode(ISD::BUILD_VECTOR, dl, WidenVT, NewOps);
+ return DAG.getBuildVector(WidenVT, dl, NewOps);
}
SDValue DAGTypeLegalizer::WidenVecRes_CONCAT_VECTORS(SDNode *N) {
@@ -2663,7 +2729,7 @@ SDValue DAGTypeLegalizer::WidenVecRes_CONCAT_VECTORS(SDNode *N) {
SDValue UndefVal = DAG.getUNDEF(EltVT);
for (; Idx < WidenNumElts; ++Idx)
Ops[Idx] = UndefVal;
- return DAG.getNode(ISD::BUILD_VECTOR, dl, WidenVT, Ops);
+ return DAG.getBuildVector(WidenVT, dl, Ops);
}
SDValue DAGTypeLegalizer::WidenVecRes_EXTRACT_SUBVECTOR(SDNode *N) {
@@ -2704,7 +2770,7 @@ SDValue DAGTypeLegalizer::WidenVecRes_EXTRACT_SUBVECTOR(SDNode *N) {
SDValue UndefVal = DAG.getUNDEF(EltVT);
for (; i < WidenNumElts; ++i)
Ops[i] = UndefVal;
- return DAG.getNode(ISD::BUILD_VECTOR, dl, WidenVT, Ops);
+ return DAG.getBuildVector(WidenVT, dl, Ops);
}
SDValue DAGTypeLegalizer::WidenVecRes_INSERT_VECTOR_ELT(SDNode *N) {
@@ -2814,6 +2880,212 @@ SDValue DAGTypeLegalizer::WidenVecRes_SCALAR_TO_VECTOR(SDNode *N) {
WidenVT, N->getOperand(0));
}
+// Return true if this is a node that could have two SETCCs as operands.
+static inline bool isLogicalMaskOp(unsigned Opcode) {
+ switch (Opcode) {
+ case ISD::AND:
+ case ISD::OR:
+ case ISD::XOR:
+ return true;
+ }
+ return false;
+}
+
+// This is used just for the assert in convertMask(). Check that this is
+// either a SETCC or a SETCC previously handled by convertMask().
+#ifndef NDEBUG
+static inline bool isSETCCorConvertedSETCC(SDValue N) {
+ if (N.getOpcode() == ISD::EXTRACT_SUBVECTOR)
+ N = N.getOperand(0);
+ else if (N.getOpcode() == ISD::CONCAT_VECTORS) {
+ for (unsigned i = 1; i < N->getNumOperands(); ++i)
+ if (!N->getOperand(i)->isUndef())
+ return false;
+ N = N.getOperand(0);
+ }
+
+ if (N.getOpcode() == ISD::TRUNCATE)
+ N = N.getOperand(0);
+ else if (N.getOpcode() == ISD::SIGN_EXTEND)
+ N = N.getOperand(0);
+
+ return (N.getOpcode() == ISD::SETCC);
+}
+#endif
+
+// Return a mask of vector type MaskVT to replace InMask. Also adjust the new
+// mask from MaskVT to ToMaskVT, if needed, with a vector extension or
+// truncation.
+SDValue DAGTypeLegalizer::convertMask(SDValue InMask, EVT MaskVT,
+ EVT ToMaskVT) {
+ LLVMContext &Ctx = *DAG.getContext();
+
+  // Currently, only a SETCC or an AND/OR/XOR of two SETCCs is handled.
+ unsigned InMaskOpc = InMask->getOpcode();
+ assert((InMaskOpc == ISD::SETCC ||
+ (isLogicalMaskOp(InMaskOpc) &&
+ isSETCCorConvertedSETCC(InMask->getOperand(0)) &&
+ isSETCCorConvertedSETCC(InMask->getOperand(1)))) &&
+ "Unexpected mask argument.");
+
+ // Make a new Mask node, with a legal result VT.
+ SmallVector<SDValue, 4> Ops;
+ for (unsigned i = 0; i < InMask->getNumOperands(); ++i)
+ Ops.push_back(InMask->getOperand(i));
+ SDValue Mask = DAG.getNode(InMaskOpc, SDLoc(InMask), MaskVT, Ops);
+
+ // If MaskVT has smaller or bigger elements than ToMaskVT, a vector sign
+ // extend or truncate is needed.
+ unsigned MaskScalarBits = MaskVT.getScalarSizeInBits();
+ unsigned ToMaskScalBits = ToMaskVT.getScalarSizeInBits();
+ if (MaskScalarBits < ToMaskScalBits) {
+ EVT ExtVT = EVT::getVectorVT(Ctx, ToMaskVT.getVectorElementType(),
+ MaskVT.getVectorNumElements());
+ Mask = DAG.getNode(ISD::SIGN_EXTEND, SDLoc(Mask), ExtVT, Mask);
+ } else if (MaskScalarBits > ToMaskScalBits) {
+ EVT TruncVT = EVT::getVectorVT(Ctx, ToMaskVT.getVectorElementType(),
+ MaskVT.getVectorNumElements());
+ Mask = DAG.getNode(ISD::TRUNCATE, SDLoc(Mask), TruncVT, Mask);
+ }
+
+ assert(Mask->getValueType(0).getScalarSizeInBits() ==
+ ToMaskVT.getScalarSizeInBits() &&
+ "Mask should have the right element size by now.");
+
+ // Adjust Mask to the right number of elements.
+ unsigned CurrMaskNumEls = Mask->getValueType(0).getVectorNumElements();
+ if (CurrMaskNumEls > ToMaskVT.getVectorNumElements()) {
+ MVT IdxTy = TLI.getVectorIdxTy(DAG.getDataLayout());
+ SDValue ZeroIdx = DAG.getConstant(0, SDLoc(Mask), IdxTy);
+ Mask = DAG.getNode(ISD::EXTRACT_SUBVECTOR, SDLoc(Mask), ToMaskVT, Mask,
+ ZeroIdx);
+ } else if (CurrMaskNumEls < ToMaskVT.getVectorNumElements()) {
+ unsigned NumSubVecs = (ToMaskVT.getVectorNumElements() / CurrMaskNumEls);
+ EVT SubVT = Mask->getValueType(0);
+ SmallVector<SDValue, 16> SubConcatOps(NumSubVecs);
+ SubConcatOps[0] = Mask;
+ for (unsigned i = 1; i < NumSubVecs; ++i)
+ SubConcatOps[i] = DAG.getUNDEF(SubVT);
+ Mask =
+ DAG.getNode(ISD::CONCAT_VECTORS, SDLoc(Mask), ToMaskVT, SubConcatOps);
+ }
+
+ assert((Mask->getValueType(0) == ToMaskVT) &&
+ "A mask of ToMaskVT should have been produced by now.");
+
+ return Mask;
+}
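convertMask's two fixups, emulated on plain arrays under an assumed scenario: an all-ones/zeros v4i8 mask that must become v8i32 is first sign-extended lane-wise, then padded to the wider element count (the CONCAT_VECTORS-with-UNDEF step; zeros stand in for undef here). Illustrative only, not LLVM code:

```cpp
#include <array>
#include <cstdint>
#include <cstdio>

int main() {
  std::array<int8_t, 4> In = {-1, 0, -1, 0};   // lanes are all-ones or zero

  std::array<int32_t, 4> Ext;                  // SIGN_EXTEND: i8 -> i32
  for (int i = 0; i < 4; ++i) Ext[i] = In[i];

  std::array<int32_t, 8> Out{};                // concat with "undef" tail
  for (int i = 0; i < 4; ++i) Out[i] = Ext[i];

  for (int32_t V : Out) printf("%d ", V);      // prints -1 0 -1 0 0 0 0 0
}
```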
+
+// Get the target mask VT, and widen if needed.
+EVT DAGTypeLegalizer::getSETCCWidenedResultTy(SDValue SetCC) {
+ assert(SetCC->getOpcode() == ISD::SETCC);
+ LLVMContext &Ctx = *DAG.getContext();
+ EVT MaskVT = getSetCCResultType(SetCC->getOperand(0).getValueType());
+ if (getTypeAction(MaskVT) == TargetLowering::TypeWidenVector)
+ MaskVT = TLI.getTypeToTransformTo(Ctx, MaskVT);
+ return MaskVT;
+}
+
+// This method tries to handle VSELECT and its mask by legalizing operands
+// (which may require widening) and if needed adjusting the mask vector type
+// to match that of the VSELECT. Without it, many cases end up with
+// scalarization of the SETCC, with many unnecessary instructions.
+SDValue DAGTypeLegalizer::WidenVSELECTAndMask(SDNode *N) {
+ LLVMContext &Ctx = *DAG.getContext();
+ SDValue Cond = N->getOperand(0);
+
+ if (N->getOpcode() != ISD::VSELECT)
+ return SDValue();
+
+ if (Cond->getOpcode() != ISD::SETCC && !isLogicalMaskOp(Cond->getOpcode()))
+ return SDValue();
+
+  // If this is a split VSELECT that was previously handled, do nothing.
+ if (Cond->getValueType(0).getScalarSizeInBits() != 1)
+ return SDValue();
+
+ EVT VSelVT = N->getValueType(0);
+ // Only handle vector types which are a power of 2.
+  // Only handle vector types whose size in bits is a power of 2.
+ return SDValue();
+
+ // Don't touch if this will be scalarized.
+ EVT FinalVT = VSelVT;
+ while (getTypeAction(FinalVT) == TargetLowering::TypeSplitVector)
+ FinalVT = EVT::getVectorVT(Ctx, FinalVT.getVectorElementType(),
+ FinalVT.getVectorNumElements() / 2);
+ if (FinalVT.getVectorNumElements() == 1)
+ return SDValue();
+
+ // If there is support for an i1 vector mask, don't touch.
+ if (Cond.getOpcode() == ISD::SETCC) {
+ EVT SetCCOpVT = Cond->getOperand(0).getValueType();
+ while (TLI.getTypeAction(Ctx, SetCCOpVT) != TargetLowering::TypeLegal)
+ SetCCOpVT = TLI.getTypeToTransformTo(Ctx, SetCCOpVT);
+ EVT SetCCResVT = getSetCCResultType(SetCCOpVT);
+ if (SetCCResVT.getScalarSizeInBits() == 1)
+ return SDValue();
+ }
+
+ // Get the VT and operands for VSELECT, and widen if needed.
+ SDValue VSelOp1 = N->getOperand(1);
+ SDValue VSelOp2 = N->getOperand(2);
+ if (getTypeAction(VSelVT) == TargetLowering::TypeWidenVector) {
+ VSelVT = TLI.getTypeToTransformTo(Ctx, VSelVT);
+ VSelOp1 = GetWidenedVector(VSelOp1);
+ VSelOp2 = GetWidenedVector(VSelOp2);
+ }
+
+ // The mask of the VSELECT should have integer elements.
+ EVT ToMaskVT = VSelVT;
+ if (!ToMaskVT.getScalarType().isInteger())
+ ToMaskVT = ToMaskVT.changeVectorElementTypeToInteger();
+
+ SDValue Mask;
+ if (Cond->getOpcode() == ISD::SETCC) {
+ EVT MaskVT = getSETCCWidenedResultTy(Cond);
+ Mask = convertMask(Cond, MaskVT, ToMaskVT);
+ } else if (isLogicalMaskOp(Cond->getOpcode()) &&
+ Cond->getOperand(0).getOpcode() == ISD::SETCC &&
+ Cond->getOperand(1).getOpcode() == ISD::SETCC) {
+ // Cond is (AND/OR/XOR (SETCC, SETCC))
+ SDValue SETCC0 = Cond->getOperand(0);
+ SDValue SETCC1 = Cond->getOperand(1);
+ EVT VT0 = getSETCCWidenedResultTy(SETCC0);
+ EVT VT1 = getSETCCWidenedResultTy(SETCC1);
+ unsigned ScalarBits0 = VT0.getScalarSizeInBits();
+ unsigned ScalarBits1 = VT1.getScalarSizeInBits();
+ unsigned ScalarBits_ToMask = ToMaskVT.getScalarSizeInBits();
+ EVT MaskVT;
+ // If the two SETCCs have different VTs, either extend/truncate one of
+ // them to the other "towards" ToMaskVT, or truncate one and extend the
+ // other to ToMaskVT.
+ if (ScalarBits0 != ScalarBits1) {
+ EVT NarrowVT = ((ScalarBits0 < ScalarBits1) ? VT0 : VT1);
+ EVT WideVT = ((NarrowVT == VT0) ? VT1 : VT0);
+ if (ScalarBits_ToMask >= WideVT.getScalarSizeInBits())
+ MaskVT = WideVT;
+ else if (ScalarBits_ToMask <= NarrowVT.getScalarSizeInBits())
+ MaskVT = NarrowVT;
+ else
+ MaskVT = ToMaskVT;
+ } else
+ // If the two SETCCs have the same VT, don't change it.
+ MaskVT = VT0;
+
+ // Make new SETCCs and logical nodes.
+ SETCC0 = convertMask(SETCC0, VT0, MaskVT);
+ SETCC1 = convertMask(SETCC1, VT1, MaskVT);
+ Cond = DAG.getNode(Cond->getOpcode(), SDLoc(Cond), MaskVT, SETCC0, SETCC1);
+
+ // Convert the logical op for VSELECT if needed.
+ Mask = convertMask(Cond, MaskVT, ToMaskVT);
+ } else
+ return SDValue();
+
+ return DAG.getNode(ISD::VSELECT, SDLoc(N), VSelVT, Mask, VSelOp1, VSelOp2);
+}
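The end state WidenVSELECTAndMask arranges: mask and data vectors of the same width, so the select is a plain lane-wise pick. A sketch assuming a v4f32 VSELECT whose mask was converted to v4i32 all-ones/zero lanes (illustrative only):

```cpp
#include <array>
#include <cstdint>
#include <cstdio>

int main() {
  std::array<int32_t, 4> Mask = {-1, 0, 0, -1};
  std::array<float, 4> A = {1, 2, 3, 4}, B = {9, 8, 7, 6}, R;
  for (int i = 0; i < 4; ++i) R[i] = Mask[i] ? A[i] : B[i];
  for (float V : R) printf("%g ", V);          // prints 1 8 7 4
}
```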
+
SDValue DAGTypeLegalizer::WidenVecRes_SELECT(SDNode *N) {
EVT WidenVT = TLI.getTypeToTransformTo(*DAG.getContext(), N->getValueType(0));
unsigned WidenNumElts = WidenVT.getVectorNumElements();
@@ -2821,6 +3093,9 @@ SDValue DAGTypeLegalizer::WidenVecRes_SELECT(SDNode *N) {
SDValue Cond1 = N->getOperand(0);
EVT CondVT = Cond1.getValueType();
if (CondVT.isVector()) {
+ if (SDValue Res = WidenVSELECTAndMask(N))
+ return Res;
+
EVT CondEltVT = CondVT.getVectorElementType();
EVT CondWidenVT = EVT::getVectorVT(*DAG.getContext(),
CondEltVT, WidenNumElts);
@@ -3093,7 +3368,7 @@ SDValue DAGTypeLegalizer::WidenVecOp_Convert(SDNode *N) {
ISD::EXTRACT_VECTOR_ELT, dl, InEltVT, InOp,
DAG.getConstant(i, dl, TLI.getVectorIdxTy(DAG.getDataLayout()))));
- return DAG.getNode(ISD::BUILD_VECTOR, dl, VT, Ops);
+ return DAG.getBuildVector(VT, dl, Ops);
}
SDValue DAGTypeLegalizer::WidenVecOp_BITCAST(SDNode *N) {
@@ -3144,7 +3419,7 @@ SDValue DAGTypeLegalizer::WidenVecOp_CONCAT_VECTORS(SDNode *N) {
ISD::EXTRACT_VECTOR_ELT, dl, EltVT, InOp,
DAG.getConstant(j, dl, TLI.getVectorIdxTy(DAG.getDataLayout())));
}
- return DAG.getNode(ISD::BUILD_VECTOR, dl, VT, Ops);
+ return DAG.getBuildVector(VT, dl, Ops);
}
SDValue DAGTypeLegalizer::WidenVecOp_EXTRACT_SUBVECTOR(SDNode *N) {
@@ -3565,10 +3840,9 @@ DAGTypeLegalizer::GenWidenVectorExtLoads(SmallVectorImpl<SDValue> &LdChain,
for (; i != WidenNumElts; ++i)
Ops[i] = UndefVal;
- return DAG.getNode(ISD::BUILD_VECTOR, dl, WidenVT, Ops);
+ return DAG.getBuildVector(WidenVT, dl, Ops);
}
-
void DAGTypeLegalizer::GenWidenVectorStores(SmallVectorImpl<SDValue> &StChain,
StoreSDNode *ST) {
// The strategy assumes that we can efficiently store power-of-two widths.
@@ -3737,5 +4011,5 @@ SDValue DAGTypeLegalizer::ModifyToType(SDValue InOp, EVT NVT,
DAG.getUNDEF(EltVT);
for ( ; Idx < WidenNumElts; ++Idx)
Ops[Idx] = FillVal;
- return DAG.getNode(ISD::BUILD_VECTOR, dl, NVT, Ops);
+ return DAG.getBuildVector(NVT, dl, Ops);
}
diff --git a/lib/CodeGen/SelectionDAG/ResourcePriorityQueue.cpp b/lib/CodeGen/SelectionDAG/ResourcePriorityQueue.cpp
index ded8e68fcbce..a1d70ab6f036 100644
--- a/lib/CodeGen/SelectionDAG/ResourcePriorityQueue.cpp
+++ b/lib/CodeGen/SelectionDAG/ResourcePriorityQueue.cpp
@@ -57,10 +57,8 @@ ResourcePriorityQueue::ResourcePriorityQueue(SelectionDAGISel *IS)
RegPressure.resize(NumRC);
std::fill(RegLimit.begin(), RegLimit.end(), 0);
std::fill(RegPressure.begin(), RegPressure.end(), 0);
- for (TargetRegisterInfo::regclass_iterator I = TRI->regclass_begin(),
- E = TRI->regclass_end();
- I != E; ++I)
- RegLimit[(*I)->getID()] = TRI->getRegPressureLimit(*I, *IS->MF);
+ for (const TargetRegisterClass *RC : TRI->regclasses())
+ RegLimit[RC->getID()] = TRI->getRegPressureLimit(RC, *IS->MF);
ParallelLiveRanges = 0;
HorizontalVerticalBalance = 0;
@@ -364,16 +362,11 @@ int ResourcePriorityQueue::regPressureDelta(SUnit *SU, bool RawPressure) {
return RegBalance;
if (RawPressure) {
- for (TargetRegisterInfo::regclass_iterator I = TRI->regclass_begin(),
- E = TRI->regclass_end(); I != E; ++I) {
- const TargetRegisterClass *RC = *I;
+ for (const TargetRegisterClass *RC : TRI->regclasses())
RegBalance += rawRegPressureDelta(SU, RC->getID());
- }
}
else {
- for (TargetRegisterInfo::regclass_iterator I = TRI->regclass_begin(),
- E = TRI->regclass_end(); I != E; ++I) {
- const TargetRegisterClass *RC = *I;
+ for (const TargetRegisterClass *RC : TRI->regclasses()) {
if ((RegPressure[RC->getID()] +
rawRegPressureDelta(SU, RC->getID()) > 0) &&
(RegPressure[RC->getID()] +
diff --git a/lib/CodeGen/SelectionDAG/ScheduleDAGRRList.cpp b/lib/CodeGen/SelectionDAG/ScheduleDAGRRList.cpp
index 3549ccd9e345..e923e30e5037 100644
--- a/lib/CodeGen/SelectionDAG/ScheduleDAGRRList.cpp
+++ b/lib/CodeGen/SelectionDAG/ScheduleDAGRRList.cpp
@@ -422,11 +422,9 @@ static bool IsChainDependent(SDNode *Outer, SDNode *Inner,
}
// Check for a lowered CALLSEQ_BEGIN or CALLSEQ_END.
if (N->isMachineOpcode()) {
- if (N->getMachineOpcode() ==
- (unsigned)TII->getCallFrameDestroyOpcode()) {
+ if (N->getMachineOpcode() == TII->getCallFrameDestroyOpcode()) {
++NestLevel;
- } else if (N->getMachineOpcode() ==
- (unsigned)TII->getCallFrameSetupOpcode()) {
+ } else if (N->getMachineOpcode() == TII->getCallFrameSetupOpcode()) {
if (NestLevel == 0)
return false;
--NestLevel;
@@ -480,12 +478,10 @@ FindCallSeqStart(SDNode *N, unsigned &NestLevel, unsigned &MaxNest,
}
// Check for a lowered CALLSEQ_BEGIN or CALLSEQ_END.
if (N->isMachineOpcode()) {
- if (N->getMachineOpcode() ==
- (unsigned)TII->getCallFrameDestroyOpcode()) {
+ if (N->getMachineOpcode() == TII->getCallFrameDestroyOpcode()) {
++NestLevel;
MaxNest = std::max(MaxNest, NestLevel);
- } else if (N->getMachineOpcode() ==
- (unsigned)TII->getCallFrameSetupOpcode()) {
+ } else if (N->getMachineOpcode() == TII->getCallFrameSetupOpcode()) {
assert(NestLevel != 0);
--NestLevel;
if (NestLevel == 0)
@@ -550,7 +546,7 @@ void ScheduleDAGRRList::ReleasePredecessors(SUnit *SU) {
if (!LiveRegDefs[CallResource])
for (SDNode *Node = SU->getNode(); Node; Node = Node->getGluedNode())
if (Node->isMachineOpcode() &&
- Node->getMachineOpcode() == (unsigned)TII->getCallFrameDestroyOpcode()) {
+ Node->getMachineOpcode() == TII->getCallFrameDestroyOpcode()) {
unsigned NestLevel = 0;
unsigned MaxNest = 0;
SDNode *N = FindCallSeqStart(Node, NestLevel, MaxNest, TII);
@@ -755,7 +751,7 @@ void ScheduleDAGRRList::ScheduleNodeBottomUp(SUnit *SU) {
for (const SDNode *SUNode = SU->getNode(); SUNode;
SUNode = SUNode->getGluedNode()) {
if (SUNode->isMachineOpcode() &&
- SUNode->getMachineOpcode() == (unsigned)TII->getCallFrameSetupOpcode()) {
+ SUNode->getMachineOpcode() == TII->getCallFrameSetupOpcode()) {
assert(NumLiveRegs > 0 && "NumLiveRegs is already zero!");
--NumLiveRegs;
LiveRegDefs[CallResource] = nullptr;
@@ -826,7 +822,7 @@ void ScheduleDAGRRList::UnscheduleNodeBottomUp(SUnit *SU) {
for (const SDNode *SUNode = SU->getNode(); SUNode;
SUNode = SUNode->getGluedNode()) {
if (SUNode->isMachineOpcode() &&
- SUNode->getMachineOpcode() == (unsigned)TII->getCallFrameSetupOpcode()) {
+ SUNode->getMachineOpcode() == TII->getCallFrameSetupOpcode()) {
++NumLiveRegs;
LiveRegDefs[CallResource] = SU;
LiveRegGens[CallResource] = CallSeqEndForStart[SU];
@@ -839,7 +835,7 @@ void ScheduleDAGRRList::UnscheduleNodeBottomUp(SUnit *SU) {
for (const SDNode *SUNode = SU->getNode(); SUNode;
SUNode = SUNode->getGluedNode()) {
if (SUNode->isMachineOpcode() &&
- SUNode->getMachineOpcode() == (unsigned)TII->getCallFrameDestroyOpcode()) {
+ SUNode->getMachineOpcode() == TII->getCallFrameDestroyOpcode()) {
assert(NumLiveRegs > 0 && "NumLiveRegs is already zero!");
--NumLiveRegs;
LiveRegDefs[CallResource] = nullptr;
@@ -1305,7 +1301,8 @@ DelayForLiveRegsBottomUp(SUnit *SU, SmallVectorImpl<unsigned> &LRegs) {
// If we're in the middle of scheduling a call, don't begin scheduling
// another call. Also, don't allow any physical registers to be live across
// the call.
- if (Node->getMachineOpcode() == (unsigned)TII->getCallFrameDestroyOpcode()) {
+ if ((Node->getMachineOpcode() == TII->getCallFrameDestroyOpcode()) ||
+ (Node->getMachineOpcode() == TII->getCallFrameSetupOpcode())) {
// Check the special calling-sequence resource.
unsigned CallResource = TRI->getNumRegs();
if (LiveRegDefs[CallResource]) {
@@ -1659,9 +1656,8 @@ public:
RegPressure.resize(NumRC);
std::fill(RegLimit.begin(), RegLimit.end(), 0);
std::fill(RegPressure.begin(), RegPressure.end(), 0);
- for (TargetRegisterInfo::regclass_iterator I = TRI->regclass_begin(),
- E = TRI->regclass_end(); I != E; ++I)
- RegLimit[(*I)->getID()] = tri->getRegPressureLimit(*I, MF);
+ for (const TargetRegisterClass *RC : TRI->regclasses())
+ RegLimit[RC->getID()] = tri->getRegPressureLimit(RC, MF);
}
}
@@ -1788,7 +1784,7 @@ public:
}
#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
- void dump(ScheduleDAG *DAG) const override {
+ LLVM_DUMP_METHOD void dump(ScheduleDAG *DAG) const override {
// Emulate pop() without clobbering NodeQueueIds.
std::vector<SUnit*> DumpQueue = Queue;
SF DumpPicker = Picker;
@@ -1924,19 +1920,17 @@ unsigned RegReductionPQBase::getNodePriority(const SUnit *SU) const {
// Register Pressure Tracking
//===----------------------------------------------------------------------===//
-void RegReductionPQBase::dumpRegPressure() const {
#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
- for (TargetRegisterInfo::regclass_iterator I = TRI->regclass_begin(),
- E = TRI->regclass_end(); I != E; ++I) {
- const TargetRegisterClass *RC = *I;
+LLVM_DUMP_METHOD void RegReductionPQBase::dumpRegPressure() const {
+ for (const TargetRegisterClass *RC : TRI->regclasses()) {
unsigned Id = RC->getID();
unsigned RP = RegPressure[Id];
if (!RP) continue;
DEBUG(dbgs() << TRI->getRegClassName(RC) << ": " << RP << " / "
<< RegLimit[Id] << '\n');
}
-#endif
}
+#endif
bool RegReductionPQBase::HighRegPressure(const SUnit *SU) const {
if (!TLI)
@@ -2092,7 +2086,7 @@ void RegReductionPQBase::scheduledNode(SUnit *SU) {
RegPressure[RCId] -= Cost;
}
}
- dumpRegPressure();
+ DEBUG(dumpRegPressure());
}
void RegReductionPQBase::unscheduledNode(SUnit *SU) {
@@ -2172,7 +2166,7 @@ void RegReductionPQBase::unscheduledNode(SUnit *SU) {
}
}
- dumpRegPressure();
+ DEBUG(dumpRegPressure());
}
//===----------------------------------------------------------------------===//
diff --git a/lib/CodeGen/SelectionDAG/ScheduleDAGSDNodes.cpp b/lib/CodeGen/SelectionDAG/ScheduleDAGSDNodes.cpp
index 3be622f8c179..3c8526ebb702 100644
--- a/lib/CodeGen/SelectionDAG/ScheduleDAGSDNodes.cpp
+++ b/lib/CodeGen/SelectionDAG/ScheduleDAGSDNodes.cpp
@@ -650,6 +650,7 @@ void ScheduleDAGSDNodes::computeOperandLatency(SDNode *Def, SDNode *Use,
}
void ScheduleDAGSDNodes::dumpNode(const SUnit *SU) const {
+  // Cannot completely remove the virtual function, even in release mode.
#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
if (!SU->getNode()) {
dbgs() << "PHYS REG COPY\n";
@@ -704,8 +705,8 @@ ProcessSDDbgValues(SDNode *N, SelectionDAG *DAG, InstrEmitter &Emitter,
if (!N->getHasDebugValue())
return;
- // Opportunistically insert immediate dbg_value uses, i.e. those with source
- // order number right after the N.
+ // Opportunistically insert immediate dbg_value uses, i.e. those with the same
+ // source order number as N.
MachineBasicBlock *BB = Emitter.getBlock();
MachineBasicBlock::iterator InsertPos = Emitter.getInsertPos();
ArrayRef<SDDbgValue*> DVs = DAG->GetDbgValues(N);
@@ -713,7 +714,7 @@ ProcessSDDbgValues(SDNode *N, SelectionDAG *DAG, InstrEmitter &Emitter,
if (DVs[i]->isInvalidated())
continue;
unsigned DVOrder = DVs[i]->getOrder();
- if (!Order || DVOrder == ++Order) {
+ if (!Order || DVOrder == Order) {
MachineInstr *DbgMI = Emitter.EmitDbgValue(DVs[i], VRBaseMap);
if (DbgMI) {
Orders.push_back(std::make_pair(DVOrder, DbgMI));
@@ -835,8 +836,7 @@ EmitSchedule(MachineBasicBlock::iterator &InsertPos) {
GluedNodes.push_back(N);
while (!GluedNodes.empty()) {
SDNode *N = GluedNodes.back();
- Emitter.EmitNode(GluedNodes.back(), SU->OrigNode != SU, SU->isCloned,
- VRBaseMap);
+ Emitter.EmitNode(N, SU->OrigNode != SU, SU->isCloned, VRBaseMap);
// Remember the source order of the inserted instruction.
if (HasDbg)
ProcessSourceNode(N, DAG, Emitter, VRBaseMap, Orders, Seen);
diff --git a/lib/CodeGen/SelectionDAG/SelectionDAG.cpp b/lib/CodeGen/SelectionDAG/SelectionDAG.cpp
index e225ba8703b7..003ea5030bfc 100644
--- a/lib/CodeGen/SelectionDAG/SelectionDAG.cpp
+++ b/lib/CodeGen/SelectionDAG/SelectionDAG.cpp
@@ -289,28 +289,28 @@ static int isSignedOp(ISD::CondCode Opcode) {
}
ISD::CondCode ISD::getSetCCOrOperation(ISD::CondCode Op1, ISD::CondCode Op2,
- bool isInteger) {
- if (isInteger && (isSignedOp(Op1) | isSignedOp(Op2)) == 3)
+ bool IsInteger) {
+ if (IsInteger && (isSignedOp(Op1) | isSignedOp(Op2)) == 3)
// Cannot fold a signed integer setcc with an unsigned integer setcc.
return ISD::SETCC_INVALID;
unsigned Op = Op1 | Op2; // Combine all of the condition bits.
- // If the N and U bits get set then the resultant comparison DOES suddenly
- // care about orderedness, and is true when ordered.
+ // If the N and U bits get set, then the resultant comparison DOES suddenly
+ // care about orderedness, and it is true when ordered.
if (Op > ISD::SETTRUE2)
Op &= ~16; // Clear the U bit if the N bit is set.
// Canonicalize illegal integer setcc's.
- if (isInteger && Op == ISD::SETUNE) // e.g. SETUGT | SETULT
+ if (IsInteger && Op == ISD::SETUNE) // e.g. SETUGT | SETULT
Op = ISD::SETNE;
return ISD::CondCode(Op);
}
ISD::CondCode ISD::getSetCCAndOperation(ISD::CondCode Op1, ISD::CondCode Op2,
- bool isInteger) {
- if (isInteger && (isSignedOp(Op1) | isSignedOp(Op2)) == 3)
+ bool IsInteger) {
+ if (IsInteger && (isSignedOp(Op1) | isSignedOp(Op2)) == 3)
// Cannot fold a signed setcc with an unsigned setcc.
return ISD::SETCC_INVALID;
@@ -318,7 +318,7 @@ ISD::CondCode ISD::getSetCCAndOperation(ISD::CondCode Op1, ISD::CondCode Op2,
ISD::CondCode Result = ISD::CondCode(Op1 & Op2);
// Canonicalize illegal integer setcc's.
- if (isInteger) {
+ if (IsInteger) {
switch (Result) {
default: break;
case ISD::SETUO : Result = ISD::SETFALSE; break; // SETUGT & SETULT
@@ -871,11 +871,13 @@ SelectionDAG::SelectionDAG(const TargetMachine &tm, CodeGenOpt::Level OL)
DbgInfo = new SDDbgInfo();
}
-void SelectionDAG::init(MachineFunction &mf) {
- MF = &mf;
+void SelectionDAG::init(MachineFunction &NewMF,
+ OptimizationRemarkEmitter &NewORE) {
+ MF = &NewMF;
+ ORE = &NewORE;
TLI = getSubtarget().getTargetLowering();
TSI = getSubtarget().getSelectionDAGInfo();
- Context = &mf.getFunction()->getContext();
+ Context = &MF->getFunction()->getContext();
}
SelectionDAG::~SelectionDAG() {
@@ -1994,8 +1996,6 @@ void SelectionDAG::computeKnownBits(SDValue Op, APInt &KnownZero,
/// them in the KnownZero/KnownOne bitsets. The DemandedElts argument allows
/// us to only collect the known bits that are shared by the requested vector
/// elements.
-/// TODO: We only support DemandedElts on a few opcodes so far, the remainder
-/// should be added when they become necessary.
void SelectionDAG::computeKnownBits(SDValue Op, APInt &KnownZero,
APInt &KnownOne, const APInt &DemandedElts,
unsigned Depth) const {
@@ -2251,10 +2251,9 @@ void SelectionDAG::computeKnownBits(SDValue Op, APInt &KnownZero,
KnownZero2.countLeadingOnes(),
BitWidth) - BitWidth;
- TrailZ = std::min(TrailZ, BitWidth);
- LeadZ = std::min(LeadZ, BitWidth);
- KnownZero = APInt::getLowBitsSet(BitWidth, TrailZ) |
- APInt::getHighBitsSet(BitWidth, LeadZ);
+ KnownZero.clearAllBits();
+ KnownZero.setLowBits(std::min(TrailZ, BitWidth));
+ KnownZero.setHighBits(std::min(LeadZ, BitWidth));
break;
}
case ISD::UDIV: {
@@ -2272,7 +2271,7 @@ void SelectionDAG::computeKnownBits(SDValue Op, APInt &KnownZero,
LeadZ = std::min(BitWidth,
LeadZ + BitWidth - RHSUnknownLeadingOnes - 1);
- KnownZero = APInt::getHighBitsSet(BitWidth, LeadZ);
+ KnownZero.setHighBits(LeadZ);
break;
}
case ISD::SELECT:
@@ -2297,10 +2296,6 @@ void SelectionDAG::computeKnownBits(SDValue Op, APInt &KnownZero,
KnownOne &= KnownOne2;
KnownZero &= KnownZero2;
break;
- case ISD::SADDO:
- case ISD::UADDO:
- case ISD::SSUBO:
- case ISD::USUBO:
case ISD::SMULO:
case ISD::UMULO:
if (Op.getResNo() != 1)
@@ -2312,14 +2307,14 @@ void SelectionDAG::computeKnownBits(SDValue Op, APInt &KnownZero,
if (TLI->getBooleanContents(Op.getValueType().isVector(), false) ==
TargetLowering::ZeroOrOneBooleanContent &&
BitWidth > 1)
- KnownZero |= APInt::getHighBitsSet(BitWidth, BitWidth - 1);
+ KnownZero.setBitsFrom(1);
break;
case ISD::SETCC:
// If we know the result of a setcc has the top bits zero, use this info.
if (TLI->getBooleanContents(Op.getOperand(0).getValueType()) ==
TargetLowering::ZeroOrOneBooleanContent &&
BitWidth > 1)
- KnownZero |= APInt::getHighBitsSet(BitWidth, BitWidth - 1);
+ KnownZero.setBitsFrom(1);
break;
case ISD::SHL:
if (const APInt *ShAmt = getValidShiftAmountConstant(Op)) {
@@ -2328,7 +2323,7 @@ void SelectionDAG::computeKnownBits(SDValue Op, APInt &KnownZero,
KnownZero = KnownZero << *ShAmt;
KnownOne = KnownOne << *ShAmt;
// Low bits are known zero.
- KnownZero |= APInt::getLowBitsSet(BitWidth, ShAmt->getZExtValue());
+ KnownZero.setLowBits(ShAmt->getZExtValue());
}
break;
case ISD::SRL:
@@ -2338,8 +2333,7 @@ void SelectionDAG::computeKnownBits(SDValue Op, APInt &KnownZero,
KnownZero = KnownZero.lshr(*ShAmt);
KnownOne = KnownOne.lshr(*ShAmt);
// High bits are known zero.
- APInt HighBits = APInt::getHighBitsSet(BitWidth, ShAmt->getZExtValue());
- KnownZero |= HighBits;
+ KnownZero.setHighBits(ShAmt->getZExtValue());
}
break;
case ISD::SRA:
@@ -2350,13 +2344,12 @@ void SelectionDAG::computeKnownBits(SDValue Op, APInt &KnownZero,
KnownOne = KnownOne.lshr(*ShAmt);
// If we know the value of the sign bit, then we know it is copied across
// the high bits by the shift amount.
- APInt HighBits = APInt::getHighBitsSet(BitWidth, ShAmt->getZExtValue());
APInt SignBit = APInt::getSignBit(BitWidth);
SignBit = SignBit.lshr(*ShAmt); // Adjust to where it is now in the mask.
if (KnownZero.intersects(SignBit)) {
- KnownZero |= HighBits; // New bits are known zero.
+ KnownZero.setHighBits(ShAmt->getZExtValue());// New bits are known zero.
} else if (KnownOne.intersects(SignBit)) {
- KnownOne |= HighBits; // New bits are known one.
+ KnownOne.setHighBits(ShAmt->getZExtValue()); // New bits are known one.
}
}
break;
@@ -2401,9 +2394,7 @@ void SelectionDAG::computeKnownBits(SDValue Op, APInt &KnownZero,
case ISD::CTLZ:
case ISD::CTLZ_ZERO_UNDEF:
case ISD::CTPOP: {
- unsigned LowBits = Log2_32(BitWidth)+1;
- KnownZero = APInt::getHighBitsSet(BitWidth, BitWidth - LowBits);
- KnownOne.clearAllBits();
+ KnownZero.setBitsFrom(Log2_32(BitWidth)+1);
break;
}
case ISD::LOAD: {
@@ -2412,26 +2403,39 @@ void SelectionDAG::computeKnownBits(SDValue Op, APInt &KnownZero,
if (ISD::isZEXTLoad(Op.getNode()) && Op.getResNo() == 0) {
EVT VT = LD->getMemoryVT();
unsigned MemBits = VT.getScalarSizeInBits();
- KnownZero |= APInt::getHighBitsSet(BitWidth, BitWidth - MemBits);
+ KnownZero.setBitsFrom(MemBits);
} else if (const MDNode *Ranges = LD->getRanges()) {
if (LD->getExtensionType() == ISD::NON_EXTLOAD)
computeKnownBitsFromRangeMetadata(*Ranges, KnownZero, KnownOne);
}
break;
}
+ case ISD::ZERO_EXTEND_VECTOR_INREG: {
+ EVT InVT = Op.getOperand(0).getValueType();
+ unsigned InBits = InVT.getScalarSizeInBits();
+ KnownZero = KnownZero.trunc(InBits);
+ KnownOne = KnownOne.trunc(InBits);
+ computeKnownBits(Op.getOperand(0), KnownZero, KnownOne,
+ DemandedElts.zext(InVT.getVectorNumElements()),
+ Depth + 1);
+ KnownZero = KnownZero.zext(BitWidth);
+ KnownOne = KnownOne.zext(BitWidth);
+ KnownZero.setBitsFrom(InBits);
+ break;
+ }
case ISD::ZERO_EXTEND: {
EVT InVT = Op.getOperand(0).getValueType();
unsigned InBits = InVT.getScalarSizeInBits();
- APInt NewBits = APInt::getHighBitsSet(BitWidth, BitWidth - InBits);
KnownZero = KnownZero.trunc(InBits);
KnownOne = KnownOne.trunc(InBits);
computeKnownBits(Op.getOperand(0), KnownZero, KnownOne, DemandedElts,
Depth + 1);
KnownZero = KnownZero.zext(BitWidth);
KnownOne = KnownOne.zext(BitWidth);
- KnownZero |= NewBits;
+ KnownZero.setBitsFrom(InBits);
break;
}
+ // TODO ISD::SIGN_EXTEND_VECTOR_INREG
case ISD::SIGN_EXTEND: {
EVT InVT = Op.getOperand(0).getValueType();
unsigned InBits = InVT.getScalarSizeInBits();
@@ -2478,10 +2482,21 @@ void SelectionDAG::computeKnownBits(SDValue Op, APInt &KnownZero,
}
case ISD::FGETSIGN:
// All bits are zero except the low bit.
- KnownZero = APInt::getHighBitsSet(BitWidth, BitWidth - 1);
+ KnownZero.setBitsFrom(1);
break;
-
- case ISD::SUB: {
+ case ISD::USUBO:
+ case ISD::SSUBO:
+ if (Op.getResNo() == 1) {
+ // If we know the result of a setcc has the top bits zero, use this info.
+ if (TLI->getBooleanContents(Op.getOperand(0).getValueType()) ==
+ TargetLowering::ZeroOrOneBooleanContent &&
+ BitWidth > 1)
+ KnownZero.setBitsFrom(1);
+ break;
+ }
+ LLVM_FALLTHROUGH;
+ case ISD::SUB:
+ case ISD::SUBC: {
if (ConstantSDNode *CLHS = isConstOrConstSplat(Op.getOperand(0))) {
// We know that the top bits of C-X are clear if X contains less bits
// than C (i.e. no wrap-around can happen). For example, 20-X is
@@ -2499,13 +2514,40 @@ void SelectionDAG::computeKnownBits(SDValue Op, APInt &KnownZero,
if ((KnownZero2 & MaskV) == MaskV) {
unsigned NLZ2 = CLHS->getAPIntValue().countLeadingZeros();
// Top bits known zero.
- KnownZero = APInt::getHighBitsSet(BitWidth, NLZ2);
+ KnownZero.setHighBits(NLZ2);
}
}
}
- LLVM_FALLTHROUGH;
+
+  // If low bits are known to be zero in both operands, then we know they are
+ // going to be 0 in the result. Both addition and complement operations
+ // preserve the low zero bits.
+ computeKnownBits(Op.getOperand(0), KnownZero2, KnownOne2, DemandedElts,
+ Depth + 1);
+ unsigned KnownZeroLow = KnownZero2.countTrailingOnes();
+ if (KnownZeroLow == 0)
+ break;
+
+ computeKnownBits(Op.getOperand(1), KnownZero2, KnownOne2, DemandedElts,
+ Depth + 1);
+ KnownZeroLow = std::min(KnownZeroLow,
+ KnownZero2.countTrailingOnes());
+ KnownZero.setBits(0, KnownZeroLow);
+ break;
}
+ case ISD::UADDO:
+ case ISD::SADDO:
+ if (Op.getResNo() == 1) {
+ // If we know the result of a setcc has the top bits zero, use this info.
+ if (TLI->getBooleanContents(Op.getOperand(0).getValueType()) ==
+ TargetLowering::ZeroOrOneBooleanContent &&
+ BitWidth > 1)
+ KnownZero.setBitsFrom(1);
+ break;
+ }
+ LLVM_FALLTHROUGH;
case ISD::ADD:
+ case ISD::ADDC:
case ISD::ADDE: {
// Output known-0 bits are known if clear or set in both the low clear bits
// common to both LHS & RHS. For example, 8+(X<<3) is known to have the
@@ -2526,19 +2568,19 @@ void SelectionDAG::computeKnownBits(SDValue Op, APInt &KnownZero,
KnownZeroLow = std::min(KnownZeroLow,
KnownZero2.countTrailingOnes());
- if (Opcode == ISD::ADD) {
- KnownZero |= APInt::getLowBitsSet(BitWidth, KnownZeroLow);
- if (KnownZeroHigh > 1)
- KnownZero |= APInt::getHighBitsSet(BitWidth, KnownZeroHigh - 1);
+ if (Opcode == ISD::ADDE) {
+ // With ADDE, a carry bit may be added in, so we can only use this
+ // information if we know (at least) that the low two bits are clear.
+ // We then return to the caller that the low bit is unknown but that
+ // other bits are known zero.
+ if (KnownZeroLow >= 2)
+ KnownZero.setBits(1, KnownZeroLow);
break;
}
- // With ADDE, a carry bit may be added in, so we can only use this
- // information if we know (at least) that the low two bits are clear. We
- // then return to the caller that the low bit is unknown but that other bits
- // are known zero.
- if (KnownZeroLow >= 2) // ADDE
- KnownZero |= APInt::getBitsSet(BitWidth, 1, KnownZeroLow);
+ KnownZero.setLowBits(KnownZeroLow);
+ if (KnownZeroHigh > 1)
+ KnownZero.setHighBits(KnownZeroHigh - 1);
break;
}
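The ADDE restriction above in concrete numbers: with two low zero bits in both operands and an incoming carry of at most 1, only bit 0 of the sum can be perturbed, so bit 1 stays known zero (illustrative check):

```cpp
#include <cstdint>

int main() {
  uint32_t A = 8, B = 12;          // low two bits zero in both operands
  for (uint32_t Carry = 0; Carry <= 1; ++Carry) {
    uint32_t S = A + B + Carry;
    if (S & 2) return 1;           // bit 1 must stay known zero
  }
  return 0;
}
```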
case ISD::SREM:
@@ -2591,7 +2633,8 @@ void SelectionDAG::computeKnownBits(SDValue Op, APInt &KnownZero,
uint32_t Leaders = std::max(KnownZero.countLeadingOnes(),
KnownZero2.countLeadingOnes());
KnownOne.clearAllBits();
- KnownZero = APInt::getHighBitsSet(BitWidth, Leaders);
+ KnownZero.clearAllBits();
+ KnownZero.setHighBits(Leaders);
break;
}
case ISD::EXTRACT_ELEMENT: {
@@ -2673,6 +2716,13 @@ void SelectionDAG::computeKnownBits(SDValue Op, APInt &KnownZero,
}
break;
}
+ case ISD::BITREVERSE: {
+ computeKnownBits(Op.getOperand(0), KnownZero2, KnownOne2, DemandedElts,
+ Depth + 1);
+ KnownZero = KnownZero2.reverseBits();
+ KnownOne = KnownOne2.reverseBits();
+ break;
+ }
case ISD::BSWAP: {
computeKnownBits(Op.getOperand(0), KnownZero2, KnownOne2, DemandedElts,
Depth + 1);
@@ -2680,12 +2730,62 @@ void SelectionDAG::computeKnownBits(SDValue Op, APInt &KnownZero,
KnownOne = KnownOne2.byteSwap();
break;
}
- case ISD::SMIN:
- case ISD::SMAX:
- case ISD::UMIN:
+ case ISD::ABS: {
+ computeKnownBits(Op.getOperand(0), KnownZero2, KnownOne2, DemandedElts,
+ Depth + 1);
+
+ // If the source's MSB is zero then we know the rest of the bits already.
+ if (KnownZero2[BitWidth - 1]) {
+ KnownZero = KnownZero2;
+ KnownOne = KnownOne2;
+ break;
+ }
+
+  // We only know that the absolute value's MSB will be zero iff there is
+  // a set bit that isn't the sign bit (otherwise it could be INT_MIN).
+ KnownOne2.clearBit(BitWidth - 1);
+ if (KnownOne2.getBoolValue()) {
+ KnownZero = APInt::getSignBit(BitWidth);
+ break;
+ }
+ break;
+ }
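The INT_MIN caveat above reproduces in miniature at 8 bits; a hedged sketch (the helper is illustrative only):

#include <cstdint>

// Negating the minimum value wraps back to itself, so abs() alone
// cannot be assumed to clear the sign bit.
static uint8_t wrappingAbs8(int8_t X) {
  return X < 0 ? (uint8_t)(0u - (uint8_t)X) : (uint8_t)X;
}
// wrappingAbs8(-128) == 0x80: the MSB is still set. A known one bit
// anywhere below the sign bit rules out -128, which is exactly what
// the KnownOne2 test above checks before claiming a zero sign bit.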
+ case ISD::UMIN: {
+ computeKnownBits(Op.getOperand(0), KnownZero, KnownOne, DemandedElts,
+ Depth + 1);
+ computeKnownBits(Op.getOperand(1), KnownZero2, KnownOne2, DemandedElts,
+ Depth + 1);
+
+  // UMIN - the result will have at least the maximum of the inputs' known
+  // leading zero bits.
+ unsigned LeadZero = KnownZero.countLeadingOnes();
+ LeadZero = std::max(LeadZero, KnownZero2.countLeadingOnes());
+
+ KnownZero &= KnownZero2;
+ KnownOne &= KnownOne2;
+ KnownZero.setHighBits(LeadZero);
+ break;
+ }
case ISD::UMAX: {
computeKnownBits(Op.getOperand(0), KnownZero, KnownOne, DemandedElts,
Depth + 1);
+ computeKnownBits(Op.getOperand(1), KnownZero2, KnownOne2, DemandedElts,
+ Depth + 1);
+
+  // UMAX - the result will have at least the maximum of the inputs' known
+  // leading one bits.
+ unsigned LeadOne = KnownOne.countLeadingOnes();
+ LeadOne = std::max(LeadOne, KnownOne2.countLeadingOnes());
+
+ KnownZero &= KnownZero2;
+ KnownOne &= KnownOne2;
+ KnownOne.setHighBits(LeadOne);
+ break;
+ }
+ case ISD::SMIN:
+ case ISD::SMAX: {
+ computeKnownBits(Op.getOperand(0), KnownZero, KnownOne, DemandedElts,
+ Depth + 1);
// If we don't know any bits, early out.
if (!KnownOne && !KnownZero)
break;
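The soundness argument for the new UMIN/UMAX bounds: the unsigned minimum is no larger than either input, so it inherits the better of their known leading zeros, and dually the maximum inherits the better known leading ones. A standalone sketch of the UMAX side (illustrative helper name):

#include "llvm/ADT/APInt.h"
#include <algorithm>
using namespace llvm;

// Leading one bits guaranteed in umax(A, B): the result is >= the
// larger input, so it keeps at least that input's leading ones, which
// the plain intersection KnownOne & KnownOne2 would lose.
static unsigned umaxLeadingOnes(const APInt &AOne, const APInt &BOne) {
  return std::max(AOne.countLeadingOnes(), BOne.countLeadingOnes());
}
// E.g. at 8 bits, AOne = 0x80 and BOne = 0xC0 give two known leading
// ones, while the intersection alone would keep only one.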
@@ -2699,7 +2799,7 @@ void SelectionDAG::computeKnownBits(SDValue Op, APInt &KnownZero,
case ISD::TargetFrameIndex:
if (unsigned Align = InferPtrAlignment(Op)) {
// The low bits are known zero if the pointer is aligned.
- KnownZero = APInt::getLowBitsSet(BitWidth, Log2_32(Align));
+ KnownZero.setLowBits(Log2_32(Align));
break;
}
break;
@@ -2712,13 +2812,48 @@ void SelectionDAG::computeKnownBits(SDValue Op, APInt &KnownZero,
case ISD::INTRINSIC_W_CHAIN:
case ISD::INTRINSIC_VOID:
// Allow the target to implement this method for its nodes.
- TLI->computeKnownBitsForTargetNode(Op, KnownZero, KnownOne, *this, Depth);
+ TLI->computeKnownBitsForTargetNode(Op, KnownZero, KnownOne, DemandedElts,
+ *this, Depth);
break;
}
assert((KnownZero & KnownOne) == 0 && "Bits known to be one AND zero?");
}
+SelectionDAG::OverflowKind SelectionDAG::computeOverflowKind(SDValue N0,
+ SDValue N1) const {
+  // X + 0 never overflows
+ if (isNullConstant(N1))
+ return OFK_Never;
+
+ APInt N1Zero, N1One;
+ computeKnownBits(N1, N1Zero, N1One);
+ if (N1Zero.getBoolValue()) {
+ APInt N0Zero, N0One;
+ computeKnownBits(N0, N0Zero, N0One);
+
+ bool overflow;
+ (~N0Zero).uadd_ov(~N1Zero, overflow);
+ if (!overflow)
+ return OFK_Never;
+ }
+
+  // mulhi + 1 never overflows
+ if (N0.getOpcode() == ISD::UMUL_LOHI && N0.getResNo() == 1 &&
+ (~N1Zero & 0x01) == ~N1Zero)
+ return OFK_Never;
+
+ if (N1.getOpcode() == ISD::UMUL_LOHI && N1.getResNo() == 1) {
+ APInt N0Zero, N0One;
+ computeKnownBits(N0, N0Zero, N0One);
+
+ if ((~N0Zero & 0x01) == ~N0Zero)
+ return OFK_Never;
+ }
+
+ return OFK_Sometime;
+}
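The core test in computeOverflowKind can be restated on its own: an unsigned add cannot wrap if even the largest values consistent with the known-zero masks add without carry-out. A hedged standalone version (the helper name is mine):

#include "llvm/ADT/APInt.h"
using namespace llvm;

// ~AZero is the maximum value A can take once its known-zero bits are
// forced to zero; likewise for B. If those maxima add without
// overflow, no pair of concrete values can overflow either.
static bool uaddNeverOverflows(const APInt &AZero, const APInt &BZero) {
  bool Overflow;
  (~AZero).uadd_ov(~BZero, Overflow);
  return !Overflow;
}

The mulhi special case rests on a separate fact: the high half of an NxN->2N unsigned multiply is at most 2^N - 2, so adding 1 to it can never wrap.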
+
bool SelectionDAG::isKnownToBeAPowerOfTwo(SDValue Val) const {
EVT OpVT = Val.getValueType();
unsigned BitWidth = OpVT.getScalarSizeInBits();
@@ -2745,7 +2880,7 @@ bool SelectionDAG::isKnownToBeAPowerOfTwo(SDValue Val) const {
// Are all operands of a build vector constant powers of two?
if (Val.getOpcode() == ISD::BUILD_VECTOR)
- if (llvm::all_of(Val->ops(), [this, BitWidth](SDValue E) {
+ if (llvm::all_of(Val->ops(), [BitWidth](SDValue E) {
if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(E))
return C->getAPIntValue().zextOrTrunc(BitWidth).isPowerOf2();
return false;
@@ -2764,6 +2899,15 @@ bool SelectionDAG::isKnownToBeAPowerOfTwo(SDValue Val) const {
unsigned SelectionDAG::ComputeNumSignBits(SDValue Op, unsigned Depth) const {
EVT VT = Op.getValueType();
+ APInt DemandedElts = VT.isVector()
+ ? APInt::getAllOnesValue(VT.getVectorNumElements())
+ : APInt(1, 1);
+ return ComputeNumSignBits(Op, DemandedElts, Depth);
+}
+
+unsigned SelectionDAG::ComputeNumSignBits(SDValue Op, const APInt &DemandedElts,
+ unsigned Depth) const {
+ EVT VT = Op.getValueType();
assert(VT.isInteger() && "Invalid VT!");
unsigned VTBits = VT.getScalarSizeInBits();
unsigned Tmp, Tmp2;
@@ -2772,6 +2916,9 @@ unsigned SelectionDAG::ComputeNumSignBits(SDValue Op, unsigned Depth) const {
if (Depth == 6)
return 1; // Limit search depth.
+ if (!DemandedElts)
+ return 1; // No demanded elts, better to assume we don't know anything.
+
switch (Op.getOpcode()) {
default: break;
case ISD::AssertSext:
@@ -2786,7 +2933,28 @@ unsigned SelectionDAG::ComputeNumSignBits(SDValue Op, unsigned Depth) const {
return Val.getNumSignBits();
}
+ case ISD::BUILD_VECTOR:
+ Tmp = VTBits;
+ for (unsigned i = 0, e = Op.getNumOperands(); (i < e) && (Tmp > 1); ++i) {
+ if (!DemandedElts[i])
+ continue;
+
+ SDValue SrcOp = Op.getOperand(i);
+ Tmp2 = ComputeNumSignBits(Op.getOperand(i), Depth + 1);
+
+      // BUILD_VECTOR can implicitly truncate sources, so we must handle this.
+ if (SrcOp.getValueSizeInBits() != VTBits) {
+ assert(SrcOp.getValueSizeInBits() > VTBits &&
+ "Expected BUILD_VECTOR implicit truncation");
+ unsigned ExtraBits = SrcOp.getValueSizeInBits() - VTBits;
+ Tmp2 = (Tmp2 > ExtraBits ? Tmp2 - ExtraBits : 1);
+ }
+ Tmp = std::min(Tmp, Tmp2);
+ }
+ return Tmp;
+
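The implicit-truncation adjustment in the BUILD_VECTOR case is plain arithmetic on bit counts; sketched standalone (illustrative helper):

// Sign bits surviving an implicit truncation from SrcBits down to
// DstBits, given SignBits known at the source width: truncation drops
// the top ExtraBits, each of which consumed one known sign bit.
static unsigned truncatedSignBits(unsigned SignBits, unsigned SrcBits,
                                  unsigned DstBits) {
  unsigned ExtraBits = SrcBits - DstBits;
  return SignBits > ExtraBits ? SignBits - ExtraBits : 1;
}
// An i32 operand with 20 sign bits feeding a v4i16 build vector keeps
// truncatedSignBits(20, 32, 16) == 4 sign bits in its lane.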
case ISD::SIGN_EXTEND:
+ case ISD::SIGN_EXTEND_VECTOR_INREG:
Tmp = VTBits - Op.getOperand(0).getScalarValueSizeInBits();
return ComputeNumSignBits(Op.getOperand(0), Depth+1) + Tmp;
@@ -2799,7 +2967,7 @@ unsigned SelectionDAG::ComputeNumSignBits(SDValue Op, unsigned Depth) const {
return std::max(Tmp, Tmp2);
case ISD::SRA:
- Tmp = ComputeNumSignBits(Op.getOperand(0), Depth+1);
+ Tmp = ComputeNumSignBits(Op.getOperand(0), DemandedElts, Depth+1);
// SRA X, C -> adds C sign bits.
if (ConstantSDNode *C = isConstOrConstSplat(Op.getOperand(1))) {
APInt ShiftVal = C->getAPIntValue();
@@ -2887,6 +3055,7 @@ unsigned SelectionDAG::ComputeNumSignBits(SDValue Op, unsigned Depth) const {
}
break;
case ISD::ADD:
+ case ISD::ADDC:
// Add can have at most one carry bit. Thus we know that the output
// is, at worst, one more bit than the inputs.
Tmp = ComputeNumSignBits(Op.getOperand(0), Depth+1);
@@ -2961,19 +3130,63 @@ unsigned SelectionDAG::ComputeNumSignBits(SDValue Op, unsigned Depth) const {
    // result. Otherwise it gives either a negative or a greater-than-bitwidth result
return std::max(std::min(KnownSign - rIndex * BitWidth, BitWidth), 0);
}
+ case ISD::INSERT_VECTOR_ELT: {
+ SDValue InVec = Op.getOperand(0);
+ SDValue InVal = Op.getOperand(1);
+ SDValue EltNo = Op.getOperand(2);
+ unsigned NumElts = InVec.getValueType().getVectorNumElements();
+
+ ConstantSDNode *CEltNo = dyn_cast<ConstantSDNode>(EltNo);
+ if (CEltNo && CEltNo->getAPIntValue().ult(NumElts)) {
+ // If we know the element index, split the demand between the
+ // source vector and the inserted element.
+ unsigned EltIdx = CEltNo->getZExtValue();
+
+ // If we demand the inserted element then get its sign bits.
+ Tmp = UINT_MAX;
+ if (DemandedElts[EltIdx])
+ Tmp = ComputeNumSignBits(InVal, Depth + 1);
+
+ // If we demand the source vector then get its sign bits, and determine
+ // the minimum.
+ APInt VectorElts = DemandedElts;
+ VectorElts.clearBit(EltIdx);
+ if (!!VectorElts) {
+ Tmp2 = ComputeNumSignBits(InVec, VectorElts, Depth + 1);
+ Tmp = std::min(Tmp, Tmp2);
+ }
+ } else {
+ // Unknown element index, so ignore DemandedElts and demand them all.
+ Tmp = ComputeNumSignBits(InVec, Depth + 1);
+ Tmp2 = ComputeNumSignBits(InVal, Depth + 1);
+ Tmp = std::min(Tmp, Tmp2);
+ }
+ assert(Tmp <= VTBits && "Failed to determine minimum sign bits");
+ return Tmp;
+ }
case ISD::EXTRACT_VECTOR_ELT: {
- // At the moment we keep this simple and skip tracking the specific
- // element. This way we get the lowest common denominator for all elements
- // of the vector.
- // TODO: get information for given vector element
+ SDValue InVec = Op.getOperand(0);
+ SDValue EltNo = Op.getOperand(1);
+ EVT VecVT = InVec.getValueType();
const unsigned BitWidth = Op.getValueSizeInBits();
const unsigned EltBitWidth = Op.getOperand(0).getScalarValueSizeInBits();
+ const unsigned NumSrcElts = VecVT.getVectorNumElements();
+
// If BitWidth > EltBitWidth the value is anyext:ed, and we do not know
// anything about sign bits. But if the sizes match we can derive knowledge
// about sign bits from the vector operand.
- if (BitWidth == EltBitWidth)
- return ComputeNumSignBits(Op.getOperand(0), Depth+1);
- break;
+ if (BitWidth != EltBitWidth)
+ break;
+
+    // If we know the element index, just demand that vector element; otherwise
+    // (unknown element index) ignore DemandedElts and demand them all.
+ APInt DemandedSrcElts = APInt::getAllOnesValue(NumSrcElts);
+ ConstantSDNode *ConstEltNo = dyn_cast<ConstantSDNode>(EltNo);
+ if (ConstEltNo && ConstEltNo->getAPIntValue().ult(NumSrcElts))
+ DemandedSrcElts =
+ APInt::getOneBitSet(NumSrcElts, ConstEltNo->getZExtValue());
+
+ return ComputeNumSignBits(InVec, DemandedSrcElts, Depth + 1);
}
case ISD::EXTRACT_SUBVECTOR:
return ComputeNumSignBits(Op.getOperand(0), Depth + 1);
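The demanded-elements masks threaded through this hunk all follow the pattern visible in EXTRACT_VECTOR_ELT; a sketch of that mask construction (helper name is illustrative):

#include "llvm/ADT/APInt.h"
using namespace llvm;

// Demanded lanes when extracting from a NumSrcElts-wide vector: a
// known in-range index demands exactly one lane; an unknown or
// out-of-range index must conservatively demand them all.
static APInt demandedForExtract(unsigned NumSrcElts, const APInt *EltIdx) {
  if (EltIdx && EltIdx->ult(NumSrcElts))
    return APInt::getOneBitSet(NumSrcElts, EltIdx->getZExtValue());
  return APInt::getAllOnesValue(NumSrcElts);
}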
@@ -3008,14 +3221,16 @@ unsigned SelectionDAG::ComputeNumSignBits(SDValue Op, unsigned Depth) const {
Op.getOpcode() == ISD::INTRINSIC_WO_CHAIN ||
Op.getOpcode() == ISD::INTRINSIC_W_CHAIN ||
Op.getOpcode() == ISD::INTRINSIC_VOID) {
- unsigned NumBits = TLI->ComputeNumSignBitsForTargetNode(Op, *this, Depth);
- if (NumBits > 1) FirstAnswer = std::max(FirstAnswer, NumBits);
+ unsigned NumBits =
+ TLI->ComputeNumSignBitsForTargetNode(Op, DemandedElts, *this, Depth);
+ if (NumBits > 1)
+ FirstAnswer = std::max(FirstAnswer, NumBits);
}
// Finally, if we can prove that the top bits of the result are 0's or 1's,
// use this information.
APInt KnownZero, KnownOne;
- computeKnownBits(Op, KnownZero, KnownOne, Depth);
+ computeKnownBits(Op, KnownZero, KnownOne, DemandedElts, Depth);
APInt Mask;
if (KnownZero.isNegative()) { // sign bit is 0
@@ -3054,6 +3269,9 @@ bool SelectionDAG::isKnownNeverNaN(SDValue Op) const {
if (getTarget().Options.NoNaNsFPMath)
return true;
+ if (const BinaryWithFlagsSDNode *BF = dyn_cast<BinaryWithFlagsSDNode>(Op))
+ return BF->Flags.hasNoNaNs();
+
// If the value is a constant, we can obviously see if it is a NaN or not.
if (const ConstantFPSDNode *C = dyn_cast<ConstantFPSDNode>(Op))
return !C->getValueAPF().isNaN();
@@ -3206,6 +3424,12 @@ SDValue SelectionDAG::getNode(unsigned Opcode, const SDLoc &DL, EVT VT,
if (VT == MVT::f128 && C->getValueType(0) == MVT::i128)
return getConstantFP(APFloat(APFloat::IEEEquad(), Val), DL, VT);
break;
+ case ISD::ABS:
+ return getConstant(Val.abs(), DL, VT, C->isTargetOpcode(),
+ C->isOpaque());
+ case ISD::BITREVERSE:
+ return getConstant(Val.reverseBits(), DL, VT, C->isTargetOpcode(),
+ C->isOpaque());
case ISD::BSWAP:
return getConstant(Val.byteSwap(), DL, VT, C->isTargetOpcode(),
C->isOpaque());
@@ -3220,6 +3444,17 @@ SDValue SelectionDAG::getNode(unsigned Opcode, const SDLoc &DL, EVT VT,
case ISD::CTTZ_ZERO_UNDEF:
return getConstant(Val.countTrailingZeros(), DL, VT, C->isTargetOpcode(),
C->isOpaque());
+ case ISD::FP16_TO_FP: {
+ bool Ignored;
+ APFloat FPV(APFloat::IEEEhalf(),
+ (Val.getBitWidth() == 16) ? Val : Val.trunc(16));
+
+ // This can return overflow, underflow, or inexact; we don't care.
+ // FIXME need to be more flexible about rounding mode.
+ (void)FPV.convert(EVTToAPFloatSemantics(VT),
+ APFloat::rmNearestTiesToEven, &Ignored);
+ return getConstantFP(FPV, DL, VT);
+ }
}
}
@@ -3261,17 +3496,14 @@ SDValue SelectionDAG::getNode(unsigned Opcode, const SDLoc &DL, EVT VT,
}
case ISD::FP_TO_SINT:
case ISD::FP_TO_UINT: {
- integerPart x[2];
bool ignored;
- static_assert(integerPartWidth >= 64, "APFloat parts too small!");
+ APSInt IntVal(VT.getSizeInBits(), Opcode == ISD::FP_TO_UINT);
// FIXME need to be more flexible about rounding mode.
- APFloat::opStatus s = V.convertToInteger(x, VT.getSizeInBits(),
- Opcode==ISD::FP_TO_SINT,
- APFloat::rmTowardZero, &ignored);
- if (s==APFloat::opInvalidOp) // inexact is OK, in fact usual
+ APFloat::opStatus s =
+ V.convertToInteger(IntVal, APFloat::rmTowardZero, &ignored);
+ if (s == APFloat::opInvalidOp) // inexact is OK, in fact usual
break;
- APInt api(VT.getSizeInBits(), x);
- return getConstant(api, DL, VT);
+ return getConstant(IntVal, DL, VT);
}
case ISD::BITCAST:
if (VT == MVT::i16 && C->getValueType(0) == MVT::f16)
@@ -3281,6 +3513,14 @@ SDValue SelectionDAG::getNode(unsigned Opcode, const SDLoc &DL, EVT VT,
else if (VT == MVT::i64 && C->getValueType(0) == MVT::f64)
return getConstant(V.bitcastToAPInt().getZExtValue(), DL, VT);
break;
+ case ISD::FP_TO_FP16: {
+ bool Ignored;
+ // This can return overflow, underflow, or inexact; we don't care.
+ // FIXME need to be more flexible about rounding mode.
+ (void)V.convert(APFloat::IEEEhalf(),
+ APFloat::rmNearestTiesToEven, &Ignored);
+ return getConstant(V.bitcastToAPInt(), DL, VT);
+ }
}
}
@@ -3303,6 +3543,8 @@ SDValue SelectionDAG::getNode(unsigned Opcode, const SDLoc &DL, EVT VT,
case ISD::TRUNCATE:
case ISD::UINT_TO_FP:
case ISD::SINT_TO_FP:
+ case ISD::ABS:
+ case ISD::BITREVERSE:
case ISD::BSWAP:
case ISD::CTLZ:
case ISD::CTLZ_ZERO_UNDEF:
@@ -3420,6 +3662,12 @@ SDValue SelectionDAG::getNode(unsigned Opcode, const SDLoc &DL, EVT VT,
if (OpOpcode == ISD::UNDEF)
return getUNDEF(VT);
break;
+ case ISD::ABS:
+ assert(VT.isInteger() && VT == Operand.getValueType() &&
+ "Invalid ABS!");
+ if (OpOpcode == ISD::UNDEF)
+ return getUNDEF(VT);
+ break;
case ISD::BSWAP:
assert(VT.isInteger() && VT == Operand.getValueType() &&
"Invalid BSWAP!");
@@ -3569,6 +3817,30 @@ SDValue SelectionDAG::FoldSymbolOffset(unsigned Opcode, EVT VT,
GA->getOffset() + uint64_t(Offset));
}
+bool SelectionDAG::isUndef(unsigned Opcode, ArrayRef<SDValue> Ops) {
+ switch (Opcode) {
+ case ISD::SDIV:
+ case ISD::UDIV:
+ case ISD::SREM:
+ case ISD::UREM: {
+ // If a divisor is zero/undef or any element of a divisor vector is
+ // zero/undef, the whole op is undef.
+ assert(Ops.size() == 2 && "Div/rem should have 2 operands");
+ SDValue Divisor = Ops[1];
+ if (Divisor.isUndef() || isNullConstant(Divisor))
+ return true;
+
+ return ISD::isBuildVectorOfConstantSDNodes(Divisor.getNode()) &&
+ any_of(Divisor->op_values(),
+ [](SDValue V) { return V.isUndef() || isNullConstant(V); });
+ // TODO: Handle signed overflow.
+ }
+ // TODO: Handle oversized shifts.
+ default:
+ return false;
+ }
+}
+
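The folds this enables are the classic division hazards; the predicate restated over plain lane values (the helper is illustrative):

#include "llvm/ADT/ArrayRef.h"
#include "llvm/ADT/STLExtras.h"
#include <cstdint>

// sdiv/udiv/srem/urem by zero is undefined, and for a constant vector
// divisor a single zero (or undef) lane taints the whole operation,
// mirroring the any_of over Divisor->op_values() above.
static bool divRemFoldsToUndef(llvm::ArrayRef<uint64_t> DivisorLanes) {
  return llvm::any_of(DivisorLanes,
                      [](uint64_t Lane) { return Lane == 0; });
}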
SDValue SelectionDAG::FoldConstantArithmetic(unsigned Opcode, const SDLoc &DL,
EVT VT, SDNode *Cst1,
SDNode *Cst2) {
@@ -3578,6 +3850,9 @@ SDValue SelectionDAG::FoldConstantArithmetic(unsigned Opcode, const SDLoc &DL,
if (Opcode >= ISD::BUILTIN_OP_END)
return SDValue();
+ if (isUndef(Opcode, {SDValue(Cst1, 0), SDValue(Cst2, 0)}))
+ return getUNDEF(VT);
+
// Handle the case of two scalars.
if (const ConstantSDNode *Scalar1 = dyn_cast<ConstantSDNode>(Cst1)) {
if (const ConstantSDNode *Scalar2 = dyn_cast<ConstantSDNode>(Cst2)) {
@@ -3645,6 +3920,9 @@ SDValue SelectionDAG::FoldConstantVectorArithmetic(unsigned Opcode,
if (Opcode >= ISD::BUILTIN_OP_END)
return SDValue();
+ if (isUndef(Opcode, Ops))
+ return getUNDEF(VT);
+
// We can only fold vectors - maybe merge with FoldConstantArithmetic someday?
if (!VT.isVector())
return SDValue();
@@ -3676,7 +3954,7 @@ SDValue SelectionDAG::FoldConstantVectorArithmetic(unsigned Opcode,
// Find legal integer scalar type for constant promotion and
// ensure that its scalar size is at least as large as source.
EVT LegalSVT = VT.getScalarType();
- if (LegalSVT.isInteger()) {
+ if (NewNodesMustHaveLegalTypes && LegalSVT.isInteger()) {
LegalSVT = TLI->getTypeToTransformTo(*getContext(), LegalSVT);
if (LegalSVT.bitsLT(VT.getScalarType()))
return SDValue();
@@ -3910,35 +4188,31 @@ SDValue SelectionDAG::getNode(unsigned Opcode, const SDLoc &DL, EVT VT,
assert(EVT.bitsLE(VT) && "Not extending!");
if (EVT == VT) return N1; // Not actually extending
- auto SignExtendInReg = [&](APInt Val) {
+ auto SignExtendInReg = [&](APInt Val, llvm::EVT ConstantVT) {
unsigned FromBits = EVT.getScalarSizeInBits();
Val <<= Val.getBitWidth() - FromBits;
Val = Val.ashr(Val.getBitWidth() - FromBits);
- return getConstant(Val, DL, VT.getScalarType());
+ return getConstant(Val, DL, ConstantVT);
};
if (N1C) {
const APInt &Val = N1C->getAPIntValue();
- return SignExtendInReg(Val);
+ return SignExtendInReg(Val, VT);
}
if (ISD::isBuildVectorOfConstantSDNodes(N1.getNode())) {
SmallVector<SDValue, 8> Ops;
+ llvm::EVT OpVT = N1.getOperand(0).getValueType();
for (int i = 0, e = VT.getVectorNumElements(); i != e; ++i) {
SDValue Op = N1.getOperand(i);
if (Op.isUndef()) {
- Ops.push_back(getUNDEF(VT.getScalarType()));
+ Ops.push_back(getUNDEF(OpVT));
continue;
}
- if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) {
- APInt Val = C->getAPIntValue();
- Val = Val.zextOrTrunc(VT.getScalarSizeInBits());
- Ops.push_back(SignExtendInReg(Val));
- continue;
- }
- break;
+ ConstantSDNode *C = cast<ConstantSDNode>(Op);
+ APInt Val = C->getAPIntValue();
+ Ops.push_back(SignExtendInReg(Val, OpVT));
}
- if (Ops.size() == VT.getVectorNumElements())
- return getBuildVector(VT, DL, Ops);
+ return getBuildVector(VT, DL, Ops);
}
break;
}
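The shl/ashr pair inside SignExtendInReg is the standard in-register sign extension; a 32-bit scalar sketch of the same computation:

#include <cstdint>

// Sign-extend the low FromBits of V within 32 bits: shift the narrow
// value's sign bit up to the MSB, then arithmetic-shift it back down.
static int32_t signExtendInReg32(uint32_t V, unsigned FromBits) {
  unsigned Shift = 32 - FromBits;
  return (int32_t)(V << Shift) >> Shift;
}
// signExtendInReg32(0xAB, 8) == -85 (0xFFFFFFAB): bit 7 of the 8-bit
// value is replicated across the upper 24 bits.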
@@ -4040,6 +4314,19 @@ SDValue SelectionDAG::getNode(unsigned Opcode, const SDLoc &DL, EVT VT,
if (VT.getSimpleVT() == N1.getSimpleValueType())
return N1;
+ // EXTRACT_SUBVECTOR of an UNDEF is an UNDEF.
+ if (N1.isUndef())
+ return getUNDEF(VT);
+
+ // EXTRACT_SUBVECTOR of CONCAT_VECTOR can be simplified if the pieces of
+ // the concat have the same type as the extract.
+ if (N2C && N1.getOpcode() == ISD::CONCAT_VECTORS &&
+ N1.getNumOperands() > 0 &&
+ VT == N1.getOperand(0).getValueType()) {
+ unsigned Factor = VT.getVectorNumElements();
+ return N1.getOperand(N2C->getZExtValue() / Factor);
+ }
+
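The CONCAT_VECTORS fold relies on the DAG invariant that a subvector index is a multiple of the extracted width, so integer division recovers the concat operand directly. A sketch of the index arithmetic (assuming that invariant; the helper name is mine):

// extract_subvector (concat_vectors A, B), Idx with Factor lanes per
// piece selects operand Idx / Factor: e.g. extracting v4i32 at index 4
// from concat(v4i32 A, v4i32 B) yields B.
static unsigned concatPieceIndex(unsigned Idx, unsigned Factor) {
  return Idx / Factor;
}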
// EXTRACT_SUBVECTOR of INSERT_SUBVECTOR is often created
// during shuffle legalization.
if (N1.getOpcode() == ISD::INSERT_SUBVECTOR && N2 == N1.getOperand(2) &&
@@ -4943,11 +5230,11 @@ SDValue SelectionDAG::getMemcpy(SDValue Chain, const SDLoc &dl, SDValue Dst,
TargetLowering::CallLoweringInfo CLI(*this);
CLI.setDebugLoc(dl)
.setChain(Chain)
- .setCallee(TLI->getLibcallCallingConv(RTLIB::MEMCPY),
- Dst.getValueType().getTypeForEVT(*getContext()),
- getExternalSymbol(TLI->getLibcallName(RTLIB::MEMCPY),
- TLI->getPointerTy(getDataLayout())),
- std::move(Args))
+ .setLibCallee(TLI->getLibcallCallingConv(RTLIB::MEMCPY),
+ Dst.getValueType().getTypeForEVT(*getContext()),
+ getExternalSymbol(TLI->getLibcallName(RTLIB::MEMCPY),
+ TLI->getPointerTy(getDataLayout())),
+ std::move(Args))
.setDiscardResult()
.setTailCall(isTailCall);
@@ -5004,11 +5291,11 @@ SDValue SelectionDAG::getMemmove(SDValue Chain, const SDLoc &dl, SDValue Dst,
TargetLowering::CallLoweringInfo CLI(*this);
CLI.setDebugLoc(dl)
.setChain(Chain)
- .setCallee(TLI->getLibcallCallingConv(RTLIB::MEMMOVE),
- Dst.getValueType().getTypeForEVT(*getContext()),
- getExternalSymbol(TLI->getLibcallName(RTLIB::MEMMOVE),
- TLI->getPointerTy(getDataLayout())),
- std::move(Args))
+ .setLibCallee(TLI->getLibcallCallingConv(RTLIB::MEMMOVE),
+ Dst.getValueType().getTypeForEVT(*getContext()),
+ getExternalSymbol(TLI->getLibcallName(RTLIB::MEMMOVE),
+ TLI->getPointerTy(getDataLayout())),
+ std::move(Args))
.setDiscardResult()
.setTailCall(isTailCall);
@@ -5066,11 +5353,11 @@ SDValue SelectionDAG::getMemset(SDValue Chain, const SDLoc &dl, SDValue Dst,
TargetLowering::CallLoweringInfo CLI(*this);
CLI.setDebugLoc(dl)
.setChain(Chain)
- .setCallee(TLI->getLibcallCallingConv(RTLIB::MEMSET),
- Dst.getValueType().getTypeForEVT(*getContext()),
- getExternalSymbol(TLI->getLibcallName(RTLIB::MEMSET),
- TLI->getPointerTy(getDataLayout())),
- std::move(Args))
+ .setLibCallee(TLI->getLibcallCallingConv(RTLIB::MEMSET),
+ Dst.getValueType().getTypeForEVT(*getContext()),
+ getExternalSymbol(TLI->getLibcallName(RTLIB::MEMSET),
+ TLI->getPointerTy(getDataLayout())),
+ std::move(Args))
.setDiscardResult()
.setTailCall(isTailCall);
@@ -7049,6 +7336,21 @@ bool SDNode::isOnlyUserOf(const SDNode *N) const {
return Seen;
}
+/// Return true if the only users of N are contained in Nodes.
+bool SDNode::areOnlyUsersOf(ArrayRef<const SDNode *> Nodes, const SDNode *N) {
+ bool Seen = false;
+ for (SDNode::use_iterator I = N->use_begin(), E = N->use_end(); I != E; ++I) {
+ SDNode *User = *I;
+ if (llvm::any_of(Nodes,
+ [&User](const SDNode *Node) { return User == Node; }))
+ Seen = true;
+ else
+ return false;
+ }
+
+ return Seen;
+}
+
/// isOperand - Return true if this node is an operand of N.
///
bool SDValue::isOperandOf(const SDNode *N) const {
@@ -7070,21 +7372,39 @@ bool SDNode::isOperandOf(const SDNode *N) const {
/// side-effecting instructions on any chain path. In practice, this looks
/// through token factors and non-volatile loads. In order to remain efficient,
/// this only looks a couple of nodes in, it does not do an exhaustive search.
+///
+/// Note that we only need to examine chains when we're searching for
+/// side-effects; SelectionDAG requires that all side-effects are represented
+/// by chains, even if another operand would force a specific ordering. This
+/// constraint is necessary to allow transformations like splitting loads.
bool SDValue::reachesChainWithoutSideEffects(SDValue Dest,
- unsigned Depth) const {
+ unsigned Depth) const {
if (*this == Dest) return true;
  // Don't search too deeply; we just want to be able to see through
  // TokenFactors etc.
if (Depth == 0) return false;
- // If this is a token factor, all inputs to the TF happen in parallel. If any
- // of the operands of the TF does not reach dest, then we cannot do the xform.
+ // If this is a token factor, all inputs to the TF happen in parallel.
if (getOpcode() == ISD::TokenFactor) {
- for (unsigned i = 0, e = getNumOperands(); i != e; ++i)
- if (!getOperand(i).reachesChainWithoutSideEffects(Dest, Depth-1))
- return false;
- return true;
+ // First, try a shallow search.
+ if (is_contained((*this)->ops(), Dest)) {
+ // We found the chain we want as an operand of this TokenFactor.
+ // Essentially, we reach the chain without side-effects if we could
+ // serialize the TokenFactor into a simple chain of operations with
+ // Dest as the last operation. This is automatically true if the
+ // chain has one use: there are no other ordering constraints.
+ // If the chain has more than one use, we give up: some other
+ // use of Dest might force a side-effect between Dest and the current
+ // node.
+ if (Dest.hasOneUse())
+ return true;
+ }
+ // Next, try a deep search: check whether every operand of the TokenFactor
+ // reaches Dest.
+ return all_of((*this)->ops(), [=](SDValue Op) {
+ return Op.reachesChainWithoutSideEffects(Dest, Depth - 1);
+ });
}
// Loads don't have side effects, look through them.
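The two-tier search is easiest to see on concrete DAG shapes (purely illustrative):

// Shallow case: TokenFactor(Dest, OtherChain) where Dest.hasOneUse().
//   Dest is a direct operand and nothing else uses it, so no side
//   effect can be ordered between Dest and this node; answer true
//   without inspecting OtherChain at all.
// Deep case: otherwise, every operand of the TokenFactor must itself
//   reach Dest without side effects, as the code required before this
//   change.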
@@ -7102,11 +7422,6 @@ bool SDNode::hasPredecessor(const SDNode *N) const {
return hasPredecessorHelper(N, Visited, Worklist);
}
-uint64_t SDNode::getConstantOperandVal(unsigned Num) const {
- assert(Num < NumOperands && "Invalid child # of SDNode!");
- return cast<ConstantSDNode>(OperandList[Num])->getZExtValue();
-}
-
const SDNodeFlags *SDNode::getFlags() const {
if (auto *FlagsNode = dyn_cast<BinaryWithFlagsSDNode>(this))
return &FlagsNode->Flags;
@@ -7377,13 +7692,13 @@ bool BuildVectorSDNode::isConstantSplat(APInt &SplatValue,
unsigned BitPos = j * EltBitSize;
if (OpVal.isUndef())
- SplatUndef |= APInt::getBitsSet(sz, BitPos, BitPos + EltBitSize);
+ SplatUndef.setBits(BitPos, BitPos + EltBitSize);
else if (ConstantSDNode *CN = dyn_cast<ConstantSDNode>(OpVal))
- SplatValue |= CN->getAPIntValue().zextOrTrunc(EltBitSize).
- zextOrTrunc(sz) << BitPos;
+ SplatValue.insertBits(CN->getAPIntValue().zextOrTrunc(EltBitSize),
+ BitPos);
else if (ConstantFPSDNode *CN = dyn_cast<ConstantFPSDNode>(OpVal))
- SplatValue |= CN->getValueAPF().bitcastToAPInt().zextOrTrunc(sz) <<BitPos;
- else
+ SplatValue.insertBits(CN->getValueAPF().bitcastToAPInt(), BitPos);
+ else
return false;
}
diff --git a/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp b/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp
index 996c95bd5f07..8708f58f1e63 100644
--- a/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp
+++ b/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp
@@ -84,10 +84,6 @@ LimitFPPrecision("limit-float-precision",
cl::location(LimitFloatPrecision),
cl::init(0));
-static cl::opt<bool>
-EnableFMFInDAG("enable-fmf-dag", cl::init(true), cl::Hidden,
- cl::desc("Enable fast-math-flags for DAG nodes"));
-
/// Minimum jump table density for normal functions.
static cl::opt<unsigned>
JumpTableDensity("jump-table-density", cl::init(10), cl::Hidden,
@@ -634,10 +630,6 @@ RegsForValue::RegsForValue(LLVMContext &Context, const TargetLowering &TLI,
}
}
-/// getCopyFromRegs - Emit a series of CopyFromReg nodes that copies from
-/// this value and returns the result as a ValueVT value. This uses
-/// Chain/Flag as the input and updates them for the output Chain/Flag.
-/// If the Flag pointer is NULL, no flag is used.
SDValue RegsForValue::getCopyFromRegs(SelectionDAG &DAG,
FunctionLoweringInfo &FuncInfo,
const SDLoc &dl, SDValue &Chain,
@@ -739,10 +731,6 @@ SDValue RegsForValue::getCopyFromRegs(SelectionDAG &DAG,
return DAG.getNode(ISD::MERGE_VALUES, dl, DAG.getVTList(ValueVTs), Values);
}
-/// getCopyToRegs - Emit a series of CopyToReg nodes that copies the
-/// specified value into the registers specified by this object. This uses
-/// Chain/Flag as the input and updates them for the output Chain/Flag.
-/// If the Flag pointer is NULL, no flag is used.
void RegsForValue::getCopyToRegs(SDValue Val, SelectionDAG &DAG,
const SDLoc &dl, SDValue &Chain, SDValue *Flag,
const Value *V,
@@ -796,9 +784,6 @@ void RegsForValue::getCopyToRegs(SDValue Val, SelectionDAG &DAG,
Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, Chains);
}
-/// AddInlineAsmOperands - Add this value to the specified inlineasm node
-/// operand list. This adds the code marker and includes the number of
-/// values added into it.
void RegsForValue::AddInlineAsmOperands(unsigned Code, bool HasMatching,
unsigned MatchingIdx, const SDLoc &dl,
SelectionDAG &DAG,
@@ -850,12 +835,6 @@ void SelectionDAGBuilder::init(GCFunctionInfo *gfi, AliasAnalysis &aa,
LPadToCallSiteMap.clear();
}
-/// clear - Clear out the current SelectionDAG and the associated
-/// state and prepare this SelectionDAGBuilder object to be used
-/// for a new block. This doesn't clear out information about
-/// additional blocks that are needed to complete switch lowering
-/// or PHI node updating; that information is cleared out as it is
-/// consumed.
void SelectionDAGBuilder::clear() {
NodeMap.clear();
UnusedArgNodeMap.clear();
@@ -867,21 +846,10 @@ void SelectionDAGBuilder::clear() {
StatepointLowering.clear();
}
-/// clearDanglingDebugInfo - Clear the dangling debug information
-/// map. This function is separated from the clear so that debug
-/// information that is dangling in a basic block can be properly
-/// resolved in a different basic block. This allows the
-/// SelectionDAG to resolve dangling debug information attached
-/// to PHI nodes.
void SelectionDAGBuilder::clearDanglingDebugInfo() {
DanglingDebugInfoMap.clear();
}
-/// getRoot - Return the current virtual root of the Selection DAG,
-/// flushing any PendingLoad items. This must be done before emitting
-/// a store or any other node that may need to be ordered after any
-/// prior load instructions.
-///
SDValue SelectionDAGBuilder::getRoot() {
if (PendingLoads.empty())
return DAG.getRoot();
@@ -901,10 +869,6 @@ SDValue SelectionDAGBuilder::getRoot() {
return Root;
}
-/// getControlRoot - Similar to getRoot, but instead of flushing all the
-/// PendingLoad items, flush all the PendingExports items. It is necessary
-/// to do this before emitting a terminator instruction.
-///
SDValue SelectionDAGBuilder::getControlRoot() {
SDValue Root = DAG.getRoot();
@@ -937,7 +901,9 @@ void SelectionDAGBuilder::visit(const Instruction &I) {
HandlePHINodesInSuccessorBlocks(I.getParent());
}
- ++SDNodeOrder;
+ // Increase the SDNodeOrder if dealing with a non-debug instruction.
+ if (!isa<DbgInfoIntrinsic>(I))
+ ++SDNodeOrder;
CurInst = &I;
@@ -1403,16 +1369,16 @@ void SelectionDAGBuilder::visitRet(const ReturnInst &I) {
const Function *F = I.getParent()->getParent();
ISD::NodeType ExtendKind = ISD::ANY_EXTEND;
- if (F->getAttributes().hasAttribute(AttributeSet::ReturnIndex,
+ if (F->getAttributes().hasAttribute(AttributeList::ReturnIndex,
Attribute::SExt))
ExtendKind = ISD::SIGN_EXTEND;
- else if (F->getAttributes().hasAttribute(AttributeSet::ReturnIndex,
+ else if (F->getAttributes().hasAttribute(AttributeList::ReturnIndex,
Attribute::ZExt))
ExtendKind = ISD::ZERO_EXTEND;
LLVMContext &Context = F->getContext();
- bool RetInReg = F->getAttributes().hasAttribute(AttributeSet::ReturnIndex,
- Attribute::InReg);
+ bool RetInReg = F->getAttributes().hasAttribute(
+ AttributeList::ReturnIndex, Attribute::InReg);
for (unsigned j = 0; j != NumValues; ++j) {
EVT VT = ValueVTs[j];
@@ -1582,7 +1548,8 @@ SelectionDAGBuilder::EmitBranchForMergedCondition(const Value *Cond,
MachineBasicBlock *CurBB,
MachineBasicBlock *SwitchBB,
BranchProbability TProb,
- BranchProbability FProb) {
+ BranchProbability FProb,
+ bool InvertCond) {
const BasicBlock *BB = CurBB->getBasicBlock();
// If the leaf of the tree is a comparison, merge the condition into
@@ -1596,10 +1563,14 @@ SelectionDAGBuilder::EmitBranchForMergedCondition(const Value *Cond,
isExportableFromCurrentBlock(BOp->getOperand(1), BB))) {
ISD::CondCode Condition;
if (const ICmpInst *IC = dyn_cast<ICmpInst>(Cond)) {
- Condition = getICmpCondCode(IC->getPredicate());
+ ICmpInst::Predicate Pred =
+ InvertCond ? IC->getInversePredicate() : IC->getPredicate();
+ Condition = getICmpCondCode(Pred);
} else {
const FCmpInst *FC = cast<FCmpInst>(Cond);
- Condition = getFCmpCondCode(FC->getPredicate());
+ FCmpInst::Predicate Pred =
+ InvertCond ? FC->getInversePredicate() : FC->getPredicate();
+ Condition = getFCmpCondCode(Pred);
if (TM.Options.NoNaNsFPMath)
Condition = getFCmpCodeWithoutNaN(Condition);
}
@@ -1612,7 +1583,8 @@ SelectionDAGBuilder::EmitBranchForMergedCondition(const Value *Cond,
}
// Create a CaseBlock record representing this branch.
- CaseBlock CB(ISD::SETEQ, Cond, ConstantInt::getTrue(*DAG.getContext()),
+ ISD::CondCode Opc = InvertCond ? ISD::SETNE : ISD::SETEQ;
+ CaseBlock CB(Opc, Cond, ConstantInt::getTrue(*DAG.getContext()),
nullptr, TBB, FBB, CurBB, TProb, FProb);
SwitchCases.push_back(CB);
}
@@ -1625,16 +1597,44 @@ void SelectionDAGBuilder::FindMergedConditions(const Value *Cond,
MachineBasicBlock *SwitchBB,
Instruction::BinaryOps Opc,
BranchProbability TProb,
- BranchProbability FProb) {
- // If this node is not part of the or/and tree, emit it as a branch.
+ BranchProbability FProb,
+ bool InvertCond) {
+  // Skip over nodes that are not part of the tree, remembering to invert the
+  // op and operands at the next level.
+ if (BinaryOperator::isNot(Cond) && Cond->hasOneUse()) {
+ const Value *CondOp = BinaryOperator::getNotArgument(Cond);
+ if (InBlock(CondOp, CurBB->getBasicBlock())) {
+ FindMergedConditions(CondOp, TBB, FBB, CurBB, SwitchBB, Opc, TProb, FProb,
+ !InvertCond);
+ return;
+ }
+ }
+
const Instruction *BOp = dyn_cast<Instruction>(Cond);
+ // Compute the effective opcode for Cond, taking into account whether it needs
+ // to be inverted, e.g.
+ // and (not (or A, B)), C
+ // gets lowered as
+ // and (and (not A, not B), C)
+ unsigned BOpc = 0;
+ if (BOp) {
+ BOpc = BOp->getOpcode();
+ if (InvertCond) {
+ if (BOpc == Instruction::And)
+ BOpc = Instruction::Or;
+ else if (BOpc == Instruction::Or)
+ BOpc = Instruction::And;
+ }
+ }
+
+ // If this node is not part of the or/and tree, emit it as a branch.
if (!BOp || !(isa<BinaryOperator>(BOp) || isa<CmpInst>(BOp)) ||
- (unsigned)BOp->getOpcode() != Opc || !BOp->hasOneUse() ||
+ BOpc != Opc || !BOp->hasOneUse() ||
BOp->getParent() != CurBB->getBasicBlock() ||
!InBlock(BOp->getOperand(0), CurBB->getBasicBlock()) ||
!InBlock(BOp->getOperand(1), CurBB->getBasicBlock())) {
EmitBranchForMergedCondition(Cond, TBB, FBB, CurBB, SwitchBB,
- TProb, FProb);
+ TProb, FProb, InvertCond);
return;
}
@@ -1669,14 +1669,14 @@ void SelectionDAGBuilder::FindMergedConditions(const Value *Cond,
auto NewFalseProb = TProb / 2 + FProb;
// Emit the LHS condition.
FindMergedConditions(BOp->getOperand(0), TBB, TmpBB, CurBB, SwitchBB, Opc,
- NewTrueProb, NewFalseProb);
+ NewTrueProb, NewFalseProb, InvertCond);
// Normalize A/2 and B to get A/(1+B) and 2B/(1+B).
SmallVector<BranchProbability, 2> Probs{TProb / 2, FProb};
BranchProbability::normalizeProbabilities(Probs.begin(), Probs.end());
// Emit the RHS condition into TmpBB.
FindMergedConditions(BOp->getOperand(1), TBB, FBB, TmpBB, SwitchBB, Opc,
- Probs[0], Probs[1]);
+ Probs[0], Probs[1], InvertCond);
} else {
assert(Opc == Instruction::And && "Unknown merge op!");
// Codegen X & Y as:
@@ -1702,14 +1702,14 @@ void SelectionDAGBuilder::FindMergedConditions(const Value *Cond,
auto NewFalseProb = FProb / 2;
// Emit the LHS condition.
FindMergedConditions(BOp->getOperand(0), TmpBB, FBB, CurBB, SwitchBB, Opc,
- NewTrueProb, NewFalseProb);
+ NewTrueProb, NewFalseProb, InvertCond);
// Normalize A and B/2 to get 2A/(1+A) and B/(1+A).
SmallVector<BranchProbability, 2> Probs{TProb, FProb / 2};
BranchProbability::normalizeProbabilities(Probs.begin(), Probs.end());
// Emit the RHS condition into TmpBB.
FindMergedConditions(BOp->getOperand(1), TBB, FBB, TmpBB, SwitchBB, Opc,
- Probs[0], Probs[1]);
+ Probs[0], Probs[1], InvertCond);
}
}
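The InvertCond threading is De Morgan's law applied lazily: the not is never materialized; its subtree swaps and/or and each leaf compare is emitted with the inverse predicate. A sketch of the opcode flip (the helper name is mine):

#include "llvm/IR/Instruction.h"
using namespace llvm;

// Effective opcode of a boolean operator once an enclosing 'not' has
// been pushed onto it: not(or A, B) == and(not A, not B), and
// symmetrically for 'and'.
static unsigned opcodeUnderNot(unsigned Opc) {
  if (Opc == Instruction::And)
    return Instruction::Or;
  if (Opc == Instruction::Or)
    return Instruction::And;
  return Opc;
}

Leaves that are not compares fall back to a case block testing against true with SETEQ, or SETNE when inverted, as in EmitBranchForMergedCondition above.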
@@ -1793,7 +1793,8 @@ void SelectionDAGBuilder::visitBr(const BranchInst &I) {
FindMergedConditions(BOp, Succ0MBB, Succ1MBB, BrMBB, BrMBB,
Opcode,
getEdgeProbability(BrMBB, Succ0MBB),
- getEdgeProbability(BrMBB, Succ1MBB));
+ getEdgeProbability(BrMBB, Succ1MBB),
+ /*InvertCond=*/false);
// If the compares in later blocks need to use values not currently
// exported from this block, export them now. This block should always
// be the first entry.
@@ -2027,7 +2028,7 @@ void SelectionDAGBuilder::visitSPDescriptorParent(StackProtectorDescriptor &SPD,
Entry.Node = StackSlot;
Entry.Ty = FnTy->getParamType(0);
if (Fn->hasAttribute(1, Attribute::AttrKind::InReg))
- Entry.isInReg = true;
+ Entry.IsInReg = true;
Args.push_back(Entry);
TargetLowering::CallLoweringInfo CLI(DAG);
@@ -2581,13 +2582,13 @@ void SelectionDAGBuilder::visitBinary(const User &I, unsigned OpCode) {
Flags.setNoSignedWrap(nsw);
Flags.setNoUnsignedWrap(nuw);
Flags.setVectorReduction(vec_redux);
- if (EnableFMFInDAG) {
- Flags.setAllowReciprocal(FMF.allowReciprocal());
- Flags.setNoInfs(FMF.noInfs());
- Flags.setNoNaNs(FMF.noNaNs());
- Flags.setNoSignedZeros(FMF.noSignedZeros());
- Flags.setUnsafeAlgebra(FMF.unsafeAlgebra());
- }
+ Flags.setAllowReciprocal(FMF.allowReciprocal());
+ Flags.setAllowContract(FMF.allowContract());
+ Flags.setNoInfs(FMF.noInfs());
+ Flags.setNoNaNs(FMF.noNaNs());
+ Flags.setNoSignedZeros(FMF.noSignedZeros());
+ Flags.setUnsafeAlgebra(FMF.unsafeAlgebra());
+
SDValue BinNodeValue = DAG.getNode(OpCode, getCurSDLoc(), Op1.getValueType(),
Op1, Op2, &Flags);
setValue(&I, BinNodeValue);
@@ -2914,7 +2915,7 @@ void SelectionDAGBuilder::visitBitCast(const User &I) {
DestVT, N)); // convert types.
// Check if the original LLVM IR Operand was a ConstantInt, because getValue()
// might fold any kind of constant expression to an integer constant and that
- // is not what we are looking for. Only regcognize a bitcast of a genuine
+ // is not what we are looking for. Only recognize a bitcast of a genuine
// constant integer as an opaque constant.
else if(ConstantInt *C = dyn_cast<ConstantInt>(I.getOperand(0)))
setValue(&I, DAG.getConstant(C->getValue(), dl, DestVT, /*isTarget=*/false,
@@ -3067,14 +3068,10 @@ void SelectionDAGBuilder::visitShuffleVector(const User &I) {
if (SrcNumElts > MaskNumElts) {
// Analyze the access pattern of the vector to see if we can extract
- // two subvectors and do the shuffle. The analysis is done by calculating
- // the range of elements the mask access on both vectors.
- int MinRange[2] = { static_cast<int>(SrcNumElts),
- static_cast<int>(SrcNumElts)};
- int MaxRange[2] = {-1, -1};
-
- for (unsigned i = 0; i != MaskNumElts; ++i) {
- int Idx = Mask[i];
+ // two subvectors and do the shuffle.
+ int StartIdx[2] = { -1, -1 }; // StartIdx to extract from
+ bool CanExtract = true;
+ for (int Idx : Mask) {
unsigned Input = 0;
if (Idx < 0)
continue;
@@ -3083,41 +3080,28 @@ void SelectionDAGBuilder::visitShuffleVector(const User &I) {
Input = 1;
Idx -= SrcNumElts;
}
- if (Idx > MaxRange[Input])
- MaxRange[Input] = Idx;
- if (Idx < MinRange[Input])
- MinRange[Input] = Idx;
- }
-
- // Check if the access is smaller than the vector size and can we find
- // a reasonable extract index.
- int RangeUse[2] = { -1, -1 }; // 0 = Unused, 1 = Extract, -1 = Can not
- // Extract.
- int StartIdx[2]; // StartIdx to extract from
- for (unsigned Input = 0; Input < 2; ++Input) {
- if (MinRange[Input] >= (int)SrcNumElts && MaxRange[Input] < 0) {
- RangeUse[Input] = 0; // Unused
- StartIdx[Input] = 0;
- continue;
- }
- // Find a good start index that is a multiple of the mask length. Then
- // see if the rest of the elements are in range.
- StartIdx[Input] = (MinRange[Input]/MaskNumElts)*MaskNumElts;
- if (MaxRange[Input] - StartIdx[Input] < (int)MaskNumElts &&
- StartIdx[Input] + MaskNumElts <= SrcNumElts)
- RangeUse[Input] = 1; // Extract from a multiple of the mask length.
+ // If all the indices come from the same MaskNumElts sized portion of
+ // the sources we can use extract. Also make sure the extract wouldn't
+ // extract past the end of the source.
+ int NewStartIdx = alignDown(Idx, MaskNumElts);
+ if (NewStartIdx + MaskNumElts > SrcNumElts ||
+ (StartIdx[Input] >= 0 && StartIdx[Input] != NewStartIdx))
+ CanExtract = false;
+ // Make sure we always update StartIdx as we use it to track if all
+ // elements are undef.
+ StartIdx[Input] = NewStartIdx;
}
- if (RangeUse[0] == 0 && RangeUse[1] == 0) {
+ if (StartIdx[0] < 0 && StartIdx[1] < 0) {
setValue(&I, DAG.getUNDEF(VT)); // Vectors are not used.
return;
}
- if (RangeUse[0] >= 0 && RangeUse[1] >= 0) {
+ if (CanExtract) {
// Extract appropriate subvector and generate a vector shuffle
for (unsigned Input = 0; Input < 2; ++Input) {
SDValue &Src = Input == 0 ? Src1 : Src2;
- if (RangeUse[Input] == 0)
+ if (StartIdx[Input] < 0)
Src = DAG.getUNDEF(VT);
else {
Src = DAG.getNode(
@@ -3128,16 +3112,12 @@ void SelectionDAGBuilder::visitShuffleVector(const User &I) {
}
// Calculate new mask.
- SmallVector<int, 8> MappedOps;
- for (unsigned i = 0; i != MaskNumElts; ++i) {
- int Idx = Mask[i];
- if (Idx >= 0) {
- if (Idx < (int)SrcNumElts)
- Idx -= StartIdx[0];
- else
- Idx -= SrcNumElts + StartIdx[1] - MaskNumElts;
- }
- MappedOps.push_back(Idx);
+ SmallVector<int, 8> MappedOps(Mask.begin(), Mask.end());
+ for (int &Idx : MappedOps) {
+ if (Idx >= (int)SrcNumElts)
+ Idx -= SrcNumElts + StartIdx[1] - MaskNumElts;
+ else if (Idx >= 0)
+ Idx -= StartIdx[0];
}
setValue(&I, DAG.getVectorShuffle(VT, DL, Src1, Src2, MappedOps));
@@ -3151,8 +3131,7 @@ void SelectionDAGBuilder::visitShuffleVector(const User &I) {
EVT EltVT = VT.getVectorElementType();
EVT IdxVT = TLI.getVectorIdxTy(DAG.getDataLayout());
SmallVector<SDValue,8> Ops;
- for (unsigned i = 0; i != MaskNumElts; ++i) {
- int Idx = Mask[i];
+ for (int Idx : Mask) {
SDValue Res;
if (Idx < 0) {
@@ -3281,7 +3260,7 @@ void SelectionDAGBuilder::visitGetElementPtr(const User &I) {
// N = N + Offset
uint64_t Offset = DL->getStructLayout(StTy)->getElementOffset(Field);
- // In an inbouds GEP with an offset that is nonnegative even when
+ // In an inbounds GEP with an offset that is nonnegative even when
// interpreted as signed, assume there is no unsigned overflow.
SDNodeFlags Flags;
if (int64_t(Offset) >= 0 && cast<GEPOperator>(I).isInBounds())
@@ -4752,7 +4731,7 @@ bool SelectionDAGBuilder::EmitFuncArgumentDbgValue(
else
FuncInfo.ArgDbgValues.push_back(
BuildMI(MF, DL, TII->get(TargetOpcode::DBG_VALUE))
- .addOperand(*Op)
+ .add(*Op)
.addImm(Offset)
.addMetadata(Variable)
.addMetadata(Expr));
@@ -4764,7 +4743,7 @@ bool SelectionDAGBuilder::EmitFuncArgumentDbgValue(
SDDbgValue *SelectionDAGBuilder::getDbgValue(SDValue N,
DILocalVariable *Variable,
DIExpression *Expr, int64_t Offset,
- DebugLoc dl,
+ const DebugLoc &dl,
unsigned DbgSDNodeOrder) {
SDDbgValue *SDV;
auto *FISDN = dyn_cast<FrameIndexSDNode>(N.getNode());
@@ -4794,9 +4773,9 @@ SDDbgValue *SelectionDAGBuilder::getDbgValue(SDValue N,
# define setjmp_undefined_for_msvc
#endif
-/// visitIntrinsicCall - Lower the call to the specified intrinsic function. If
-/// we want to emit this as a call to a named external function, return the name
-/// otherwise lower it and return null.
+/// Lower the call to the specified intrinsic function. If we want to emit this
+/// as a call to a named external function, return the name. Otherwise, lower it
+/// and return null.
const char *
SelectionDAGBuilder::visitIntrinsicCall(const CallInst &I, unsigned Intrinsic) {
const TargetLowering &TLI = DAG.getTargetLoweringInfo();
@@ -4929,14 +4908,12 @@ SelectionDAGBuilder::visitIntrinsicCall(const CallInst &I, unsigned Intrinsic) {
report_fatal_error("Unsupported element size");
TargetLowering::CallLoweringInfo CLI(DAG);
- CLI.setDebugLoc(sdl)
- .setChain(getRoot())
- .setCallee(TLI.getLibcallCallingConv(LibraryCall),
- Type::getVoidTy(*DAG.getContext()),
- DAG.getExternalSymbol(
- TLI.getLibcallName(LibraryCall),
- TLI.getPointerTy(DAG.getDataLayout())),
- std::move(Args));
+ CLI.setDebugLoc(sdl).setChain(getRoot()).setLibCallee(
+ TLI.getLibcallCallingConv(LibraryCall),
+ Type::getVoidTy(*DAG.getContext()),
+ DAG.getExternalSymbol(TLI.getLibcallName(LibraryCall),
+ TLI.getPointerTy(DAG.getDataLayout())),
+ std::move(Args));
std::pair<SDValue, SDValue> CallResult = TLI.LowerCallTo(CLI);
DAG.setRoot(CallResult.second);
@@ -5301,6 +5278,13 @@ SelectionDAGBuilder::visitIntrinsicCall(const CallInst &I, unsigned Intrinsic) {
getValue(I.getArgOperand(1)),
getValue(I.getArgOperand(2))));
return nullptr;
+ case Intrinsic::experimental_constrained_fadd:
+ case Intrinsic::experimental_constrained_fsub:
+ case Intrinsic::experimental_constrained_fmul:
+ case Intrinsic::experimental_constrained_fdiv:
+ case Intrinsic::experimental_constrained_frem:
+ visitConstrainedFPIntrinsic(I, Intrinsic);
+ return nullptr;
case Intrinsic::fmuladd: {
EVT VT = TLI.getValueType(DAG.getDataLayout(), I.getType());
if (TM.Options.AllowFPOpFusion != FPOpFusion::Strict &&
@@ -5537,7 +5521,7 @@ SelectionDAGBuilder::visitIntrinsicCall(const CallInst &I, unsigned Intrinsic) {
case Intrinsic::trap: {
StringRef TrapFuncName =
I.getAttributes()
- .getAttribute(AttributeSet::FunctionIndex, "trap-func-name")
+ .getAttribute(AttributeList::FunctionIndex, "trap-func-name")
.getValueAsString();
if (TrapFuncName.empty()) {
ISD::NodeType Op = (Intrinsic == Intrinsic::trap) ?
@@ -5548,7 +5532,7 @@ SelectionDAGBuilder::visitIntrinsicCall(const CallInst &I, unsigned Intrinsic) {
TargetLowering::ArgListTy Args;
TargetLowering::CallLoweringInfo CLI(DAG);
- CLI.setDebugLoc(sdl).setChain(getRoot()).setCallee(
+ CLI.setDebugLoc(sdl).setChain(getRoot()).setLibCallee(
CallingConv::C, I.getType(),
DAG.getExternalSymbol(TrapFuncName.data(),
TLI.getPointerTy(DAG.getDataLayout())),
@@ -5749,6 +5733,46 @@ SelectionDAGBuilder::visitIntrinsicCall(const CallInst &I, unsigned Intrinsic) {
}
}
+void SelectionDAGBuilder::visitConstrainedFPIntrinsic(const CallInst &I,
+ unsigned Intrinsic) {
+ SDLoc sdl = getCurSDLoc();
+ unsigned Opcode;
+ switch (Intrinsic) {
+ default: llvm_unreachable("Impossible intrinsic"); // Can't reach here.
+ case Intrinsic::experimental_constrained_fadd:
+ Opcode = ISD::STRICT_FADD;
+ break;
+ case Intrinsic::experimental_constrained_fsub:
+ Opcode = ISD::STRICT_FSUB;
+ break;
+ case Intrinsic::experimental_constrained_fmul:
+ Opcode = ISD::STRICT_FMUL;
+ break;
+ case Intrinsic::experimental_constrained_fdiv:
+ Opcode = ISD::STRICT_FDIV;
+ break;
+ case Intrinsic::experimental_constrained_frem:
+ Opcode = ISD::STRICT_FREM;
+ break;
+ }
+ const TargetLowering &TLI = DAG.getTargetLoweringInfo();
+ SDValue Chain = getRoot();
+ SDValue Ops[3] = { Chain, getValue(I.getArgOperand(0)),
+ getValue(I.getArgOperand(1)) };
+ SmallVector<EVT, 4> ValueVTs;
+ ComputeValueVTs(TLI, DAG.getDataLayout(), I.getType(), ValueVTs);
+ ValueVTs.push_back(MVT::Other); // Out chain
+
+ SDVTList VTs = DAG.getVTList(ValueVTs);
+ SDValue Result = DAG.getNode(Opcode, sdl, VTs, Ops);
+
+ assert(Result.getNode()->getNumValues() == 2);
+ SDValue OutChain = Result.getValue(1);
+ DAG.setRoot(OutChain);
+ SDValue FPResult = Result.getValue(0);
+ setValue(&I, FPResult);
+}
+
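The shape of the node this emits is worth spelling out; the IR line is illustrative only:

// %r = call double @llvm.experimental.constrained.fadd.f64(...)
//   lowers to: STRICT_FADD {f64, ch} (InChain, LHS, RHS)
// Result value 0 is the FP value handed to setValue(); result value 1
// is the out-chain installed as the new DAG root, which keeps the
// operation ordered against other side-effecting nodes, including
// accesses to the FP environment.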
std::pair<SDValue, SDValue>
SelectionDAGBuilder::lowerInvokable(TargetLowering::CallLoweringInfo &CLI,
const BasicBlock *EHPadBB) {
@@ -5827,7 +5851,6 @@ void SelectionDAGBuilder::LowerCallTo(ImmutableCallSite CS, SDValue Callee,
Type *RetTy = CS.getType();
TargetLowering::ArgListTy Args;
- TargetLowering::ArgListEntry Entry;
Args.reserve(CS.arg_size());
const Value *SwiftErrorVal = nullptr;
@@ -5843,6 +5866,7 @@ void SelectionDAGBuilder::LowerCallTo(ImmutableCallSite CS, SDValue Callee,
for (ImmutableCallSite::arg_iterator i = CS.arg_begin(), e = CS.arg_end();
i != e; ++i) {
+ TargetLowering::ArgListEntry Entry;
const Value *V = *i;
// Skip empty types
@@ -5852,11 +5876,10 @@ void SelectionDAGBuilder::LowerCallTo(ImmutableCallSite CS, SDValue Callee,
SDValue ArgNode = getValue(V);
Entry.Node = ArgNode; Entry.Ty = V->getType();
- // Skip the first return-type Attribute to get to params.
- Entry.setAttributes(&CS, i - CS.arg_begin() + 1);
+ Entry.setAttributes(&CS, i - CS.arg_begin());
// Use swifterror virtual register as input to the call.
- if (Entry.isSwiftError && TLI.supportSwiftError()) {
+ if (Entry.IsSwiftError && TLI.supportSwiftError()) {
SwiftErrorVal = V;
// We find the virtual register for the actual swifterror argument.
// Instead of using the Value, we use the virtual register instead.
@@ -5869,7 +5892,7 @@ void SelectionDAGBuilder::LowerCallTo(ImmutableCallSite CS, SDValue Callee,
// If we have an explicit sret argument that is an Instruction, (i.e., it
// might point to function-local memory), we can't meaningfully tail-call.
- if (Entry.isSRet && isa<Instruction>(V))
+ if (Entry.IsSRet && isa<Instruction>(V))
isTailCall = false;
}
@@ -5912,8 +5935,7 @@ void SelectionDAGBuilder::LowerCallTo(ImmutableCallSite CS, SDValue Callee,
}
}
-/// IsOnlyUsedInZeroEqualityComparison - Return true if it only matters that the
-/// value is equal or not-equal to zero.
+/// Return true if it only matters that the value is equal or not-equal to zero.
static bool IsOnlyUsedInZeroEqualityComparison(const Value *V) {
for (const User *U : V->users()) {
if (const ICmpInst *IC = dyn_cast<ICmpInst>(U))
@@ -5928,13 +5950,17 @@ static bool IsOnlyUsedInZeroEqualityComparison(const Value *V) {
}
static SDValue getMemCmpLoad(const Value *PtrVal, MVT LoadVT,
- Type *LoadTy,
SelectionDAGBuilder &Builder) {
// Check to see if this load can be trivially constant folded, e.g. if the
// input is from a string literal.
if (const Constant *LoadInput = dyn_cast<Constant>(PtrVal)) {
// Cast pointer to the type we really want to load.
+ Type *LoadTy =
+ Type::getIntNTy(PtrVal->getContext(), LoadVT.getScalarSizeInBits());
+ if (LoadVT.isVector())
+ LoadTy = VectorType::get(LoadTy, LoadVT.getVectorNumElements());
+
LoadInput = ConstantExpr::getBitCast(const_cast<Constant *>(LoadInput),
PointerType::getUnqual(LoadTy));
@@ -5967,8 +5993,8 @@ static SDValue getMemCmpLoad(const Value *PtrVal, MVT LoadVT,
return LoadVal;
}
-/// processIntegerCallValue - Record the value for an instruction that
-/// produces an integer result, converting the type where necessary.
+/// Record the value for an instruction that produces an integer result,
+/// converting the type where necessary.
void SelectionDAGBuilder::processIntegerCallValue(const Instruction &I,
SDValue Value,
bool IsSigned) {
@@ -5981,20 +6007,13 @@ void SelectionDAGBuilder::processIntegerCallValue(const Instruction &I,
setValue(&I, Value);
}
-/// visitMemCmpCall - See if we can lower a call to memcmp in an optimized form.
-/// If so, return true and lower it, otherwise return false and it will be
-/// lowered like a normal call.
+/// See if we can lower a memcmp call into an optimized form. If so, return
+/// true and lower it. Otherwise return false, and it will be lowered like a
+/// normal call.
+/// The caller already checked that \p I calls the appropriate LibFunc with a
+/// correct prototype.
bool SelectionDAGBuilder::visitMemCmpCall(const CallInst &I) {
- // Verify that the prototype makes sense. int memcmp(void*,void*,size_t)
- if (I.getNumArgOperands() != 3)
- return false;
-
const Value *LHS = I.getArgOperand(0), *RHS = I.getArgOperand(1);
- if (!LHS->getType()->isPointerTy() || !RHS->getType()->isPointerTy() ||
- !I.getArgOperand(2)->getType()->isIntegerTy() ||
- !I.getType()->isIntegerTy())
- return false;
-
const Value *Size = I.getArgOperand(2);
const ConstantInt *CSize = dyn_cast<ConstantInt>(Size);
if (CSize && CSize->getZExtValue() == 0) {
@@ -6005,11 +6024,9 @@ bool SelectionDAGBuilder::visitMemCmpCall(const CallInst &I) {
}
const SelectionDAGTargetInfo &TSI = DAG.getSelectionDAGInfo();
- std::pair<SDValue, SDValue> Res =
- TSI.EmitTargetCodeForMemcmp(DAG, getCurSDLoc(), DAG.getRoot(),
- getValue(LHS), getValue(RHS), getValue(Size),
- MachinePointerInfo(LHS),
- MachinePointerInfo(RHS));
+ std::pair<SDValue, SDValue> Res = TSI.EmitTargetCodeForMemcmp(
+ DAG, getCurSDLoc(), DAG.getRoot(), getValue(LHS), getValue(RHS),
+ getValue(Size), MachinePointerInfo(LHS), MachinePointerInfo(RHS));
if (Res.first.getNode()) {
processIntegerCallValue(I, Res.first, true);
PendingLoads.push_back(Res.second);
@@ -6018,88 +6035,79 @@ bool SelectionDAGBuilder::visitMemCmpCall(const CallInst &I) {
// memcmp(S1,S2,2) != 0 -> (*(short*)LHS != *(short*)RHS) != 0
// memcmp(S1,S2,4) != 0 -> (*(int*)LHS != *(int*)RHS) != 0
- if (CSize && IsOnlyUsedInZeroEqualityComparison(&I)) {
- bool ActuallyDoIt = true;
- MVT LoadVT;
- Type *LoadTy;
- switch (CSize->getZExtValue()) {
- default:
- LoadVT = MVT::Other;
- LoadTy = nullptr;
- ActuallyDoIt = false;
- break;
- case 2:
- LoadVT = MVT::i16;
- LoadTy = Type::getInt16Ty(CSize->getContext());
- break;
- case 4:
- LoadVT = MVT::i32;
- LoadTy = Type::getInt32Ty(CSize->getContext());
- break;
- case 8:
- LoadVT = MVT::i64;
- LoadTy = Type::getInt64Ty(CSize->getContext());
- break;
- /*
- case 16:
- LoadVT = MVT::v4i32;
- LoadTy = Type::getInt32Ty(CSize->getContext());
- LoadTy = VectorType::get(LoadTy, 4);
- break;
- */
- }
-
- // This turns into unaligned loads. We only do this if the target natively
- // supports the MVT we'll be loading or if it is small enough (<= 4) that
- // we'll only produce a small number of byte loads.
+ if (!CSize || !IsOnlyUsedInZeroEqualityComparison(&I))
+ return false;
- // Require that we can find a legal MVT, and only do this if the target
- // supports unaligned loads of that type. Expanding into byte loads would
- // bloat the code.
+ // If the target has a fast compare for the given size, it will return a
+ // preferred load type for that size. Require that the load VT is legal and
+ // that the target supports unaligned loads of that type. Otherwise, return
+ // INVALID.
+ auto hasFastLoadsAndCompare = [&](unsigned NumBits) {
const TargetLowering &TLI = DAG.getTargetLoweringInfo();
- if (ActuallyDoIt && CSize->getZExtValue() > 4) {
- unsigned DstAS = LHS->getType()->getPointerAddressSpace();
- unsigned SrcAS = RHS->getType()->getPointerAddressSpace();
+ MVT LVT = TLI.hasFastEqualityCompare(NumBits);
+ if (LVT != MVT::INVALID_SIMPLE_VALUE_TYPE) {
// TODO: Handle 5 byte compare as 4-byte + 1 byte.
// TODO: Handle 8 byte compare on x86-32 as two 32-bit loads.
// TODO: Check alignment of src and dest ptrs.
- if (!TLI.isTypeLegal(LoadVT) ||
- !TLI.allowsMisalignedMemoryAccesses(LoadVT, SrcAS) ||
- !TLI.allowsMisalignedMemoryAccesses(LoadVT, DstAS))
- ActuallyDoIt = false;
+ unsigned DstAS = LHS->getType()->getPointerAddressSpace();
+ unsigned SrcAS = RHS->getType()->getPointerAddressSpace();
+ if (!TLI.isTypeLegal(LVT) ||
+ !TLI.allowsMisalignedMemoryAccesses(LVT, SrcAS) ||
+ !TLI.allowsMisalignedMemoryAccesses(LVT, DstAS))
+ LVT = MVT::INVALID_SIMPLE_VALUE_TYPE;
}
- if (ActuallyDoIt) {
- SDValue LHSVal = getMemCmpLoad(LHS, LoadVT, LoadTy, *this);
- SDValue RHSVal = getMemCmpLoad(RHS, LoadVT, LoadTy, *this);
+ return LVT;
+ };
- SDValue Res = DAG.getSetCC(getCurSDLoc(), MVT::i1, LHSVal, RHSVal,
- ISD::SETNE);
- processIntegerCallValue(I, Res, false);
- return true;
- }
+ // This turns into unaligned loads. We only do this if the target natively
+ // supports the MVT we'll be loading or if it is small enough (<= 4) that
+ // we'll only produce a small number of byte loads.
+ MVT LoadVT;
+ unsigned NumBitsToCompare = CSize->getZExtValue() * 8;
+ switch (NumBitsToCompare) {
+ default:
+ return false;
+ case 16:
+ LoadVT = MVT::i16;
+ break;
+ case 32:
+ LoadVT = MVT::i32;
+ break;
+ case 64:
+ case 128:
+ case 256:
+ LoadVT = hasFastLoadsAndCompare(NumBitsToCompare);
+ break;
}
+ if (LoadVT == MVT::INVALID_SIMPLE_VALUE_TYPE)
+ return false;
- return false;
+ SDValue LoadL = getMemCmpLoad(LHS, LoadVT, *this);
+ SDValue LoadR = getMemCmpLoad(RHS, LoadVT, *this);
+
+ // Bitcast to a wide integer type if the loads are vectors.
+ if (LoadVT.isVector()) {
+ EVT CmpVT = EVT::getIntegerVT(LHS->getContext(), LoadVT.getSizeInBits());
+ LoadL = DAG.getBitcast(CmpVT, LoadL);
+ LoadR = DAG.getBitcast(CmpVT, LoadR);
+ }
+
+ SDValue Cmp = DAG.getSetCC(getCurSDLoc(), MVT::i1, LoadL, LoadR, ISD::SETNE);
+ processIntegerCallValue(I, Cmp, false);
+ return true;
}
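Restated as portable C++, here is what the rewritten path emits for a 4-byte memcmp used only in a zero-equality test (memcpy stands in for the misaligned loads the target must allow; the helper is illustrative):

#include <cstdint>
#include <cstring>

// memcmp(P, Q, 4) != 0 lowered as one wide load per side plus a single
// SETNE instead of a libcall.
static bool memcmp4NotEqual(const void *P, const void *Q) {
  uint32_t L, R;
  std::memcpy(&L, P, 4);
  std::memcpy(&R, Q, 4);
  return L != R;
}

Sizes of 8, 16, and 32 bytes additionally go through hasFastEqualityCompare, letting the target veto the transform or pick a vector type, which is then bitcast to a wide integer for the compare.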
-/// visitMemChrCall -- See if we can lower a memchr call into an optimized
-/// form. If so, return true and lower it, otherwise return false and it
-/// will be lowered like a normal call.
+/// See if we can lower a memchr call into an optimized form. If so, return
+/// true and lower it. Otherwise return false, and it will be lowered like a
+/// normal call.
+/// The caller already checked that \p I calls the appropriate LibFunc with a
+/// correct prototype.
bool SelectionDAGBuilder::visitMemChrCall(const CallInst &I) {
- // Verify that the prototype makes sense. void *memchr(void *, int, size_t)
- if (I.getNumArgOperands() != 3)
- return false;
-
const Value *Src = I.getArgOperand(0);
const Value *Char = I.getArgOperand(1);
const Value *Length = I.getArgOperand(2);
- if (!Src->getType()->isPointerTy() ||
- !Char->getType()->isIntegerTy() ||
- !Length->getType()->isIntegerTy() ||
- !I.getType()->isPointerTy())
- return false;
const SelectionDAGTargetInfo &TSI = DAG.getSelectionDAGInfo();
std::pair<SDValue, SDValue> Res =
@@ -6115,15 +6123,12 @@ bool SelectionDAGBuilder::visitMemChrCall(const CallInst &I) {
return false;
}
-///
-/// visitMemPCpyCall -- lower a mempcpy call as a memcpy followed by code to
-/// to adjust the dst pointer by the size of the copied memory.
+/// See if we can lower a mempcpy call into an optimized form. If so, return
+/// true and lower it. Otherwise return false, and it will be lowered like a
+/// normal call.
+/// The caller already checked that \p I calls the appropriate LibFunc with a
+/// correct prototype.
bool SelectionDAGBuilder::visitMemPCpyCall(const CallInst &I) {
-
- // Verify argument count: void *mempcpy(void *, const void *, size_t)
- if (I.getNumArgOperands() != 3)
- return false;
-
SDValue Dst = getValue(I.getArgOperand(0));
SDValue Src = getValue(I.getArgOperand(1));
SDValue Size = getValue(I.getArgOperand(2));
@@ -6158,19 +6163,13 @@ bool SelectionDAGBuilder::visitMemPCpyCall(const CallInst &I) {
return true;
}
-/// visitStrCpyCall -- See if we can lower a strcpy or stpcpy call into an
-/// optimized form. If so, return true and lower it, otherwise return false
-/// and it will be lowered like a normal call.
+/// See if we can lower a strcpy call into an optimized form. If so, return
+/// true and lower it, otherwise return false and it will be lowered like a
+/// normal call.
+/// The caller already checked that \p I calls the appropriate LibFunc with a
+/// correct prototype.
bool SelectionDAGBuilder::visitStrCpyCall(const CallInst &I, bool isStpcpy) {
- // Verify that the prototype makes sense. char *strcpy(char *, char *)
- if (I.getNumArgOperands() != 2)
- return false;
-
const Value *Arg0 = I.getArgOperand(0), *Arg1 = I.getArgOperand(1);
- if (!Arg0->getType()->isPointerTy() ||
- !Arg1->getType()->isPointerTy() ||
- !I.getType()->isPointerTy())
- return false;
const SelectionDAGTargetInfo &TSI = DAG.getSelectionDAGInfo();
std::pair<SDValue, SDValue> Res =
@@ -6187,19 +6186,13 @@ bool SelectionDAGBuilder::visitStrCpyCall(const CallInst &I, bool isStpcpy) {
return false;
}
-/// visitStrCmpCall - See if we can lower a call to strcmp in an optimized form.
-/// If so, return true and lower it, otherwise return false and it will be
-/// lowered like a normal call.
+/// See if we can lower a strcmp call into an optimized form. If so, return
+/// true and lower it, otherwise return false and it will be lowered like a
+/// normal call.
+/// The caller already checked that \p I calls the appropriate LibFunc with a
+/// correct prototype.
bool SelectionDAGBuilder::visitStrCmpCall(const CallInst &I) {
- // Verify that the prototype makes sense. int strcmp(void*,void*)
- if (I.getNumArgOperands() != 2)
- return false;
-
const Value *Arg0 = I.getArgOperand(0), *Arg1 = I.getArgOperand(1);
- if (!Arg0->getType()->isPointerTy() ||
- !Arg1->getType()->isPointerTy() ||
- !I.getType()->isIntegerTy())
- return false;
const SelectionDAGTargetInfo &TSI = DAG.getSelectionDAGInfo();
std::pair<SDValue, SDValue> Res =
@@ -6216,17 +6209,13 @@ bool SelectionDAGBuilder::visitStrCmpCall(const CallInst &I) {
return false;
}
-/// visitStrLenCall -- See if we can lower a strlen call into an optimized
-/// form. If so, return true and lower it, otherwise return false and it
-/// will be lowered like a normal call.
+/// See if we can lower a strlen call into an optimized form. If so, return
+/// true and lower it, otherwise return false and it will be lowered like a
+/// normal call.
+/// The caller already checked that \p I calls the appropriate LibFunc with a
+/// correct prototype.
bool SelectionDAGBuilder::visitStrLenCall(const CallInst &I) {
- // Verify that the prototype makes sense. size_t strlen(char *)
- if (I.getNumArgOperands() != 1)
- return false;
-
const Value *Arg0 = I.getArgOperand(0);
- if (!Arg0->getType()->isPointerTy() || !I.getType()->isIntegerTy())
- return false;
const SelectionDAGTargetInfo &TSI = DAG.getSelectionDAGInfo();
std::pair<SDValue, SDValue> Res =
@@ -6241,19 +6230,13 @@ bool SelectionDAGBuilder::visitStrLenCall(const CallInst &I) {
return false;
}
-/// visitStrNLenCall -- See if we can lower a strnlen call into an optimized
-/// form. If so, return true and lower it, otherwise return false and it
-/// will be lowered like a normal call.
+/// See if we can lower a strnlen call into an optimized form. If so, return
+/// true and lower it, otherwise return false and it will be lowered like a
+/// normal call.
+/// The caller already checked that \p I calls the appropriate LibFunc with a
+/// correct prototype.
bool SelectionDAGBuilder::visitStrNLenCall(const CallInst &I) {
- // Verify that the prototype makes sense. size_t strnlen(char *, size_t)
- if (I.getNumArgOperands() != 2)
- return false;
-
const Value *Arg0 = I.getArgOperand(0), *Arg1 = I.getArgOperand(1);
- if (!Arg0->getType()->isPointerTy() ||
- !Arg1->getType()->isIntegerTy() ||
- !I.getType()->isIntegerTy())
- return false;
const SelectionDAGTargetInfo &TSI = DAG.getSelectionDAGInfo();
std::pair<SDValue, SDValue> Res =
@@ -6269,16 +6252,15 @@ bool SelectionDAGBuilder::visitStrNLenCall(const CallInst &I) {
return false;
}
-/// visitUnaryFloatCall - If a call instruction is a unary floating-point
-/// operation (as expected), translate it to an SDNode with the specified opcode
-/// and return true.
+/// See if we can lower a unary floating-point operation into an SDNode with
+/// the specified Opcode. If so, return true and lower it, otherwise return
+/// false and it will be lowered like a normal call.
+/// The caller already checked that \p I calls the appropriate LibFunc with a
+/// correct prototype.
bool SelectionDAGBuilder::visitUnaryFloatCall(const CallInst &I,
unsigned Opcode) {
- // Sanity check that it really is a unary floating-point call.
- if (I.getNumArgOperands() != 1 ||
- !I.getArgOperand(0)->getType()->isFloatingPointTy() ||
- I.getType() != I.getArgOperand(0)->getType() ||
- !I.onlyReadsMemory())
+ // We already checked this call's prototype; verify it doesn't modify errno.
+ if (!I.onlyReadsMemory())
return false;
SDValue Tmp = getValue(I.getArgOperand(0));
@@ -6286,17 +6268,15 @@ bool SelectionDAGBuilder::visitUnaryFloatCall(const CallInst &I,
return true;
}
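
The onlyReadsMemory() check that replaces the old prototype test is load-bearing: a libm call that may set errno observably writes memory and has to stay a call. A small illustration in ordinary C++ (not patch code), assuming default math-errno semantics:

    #include <cerrno>
    #include <cmath>

    // Without -fno-math-errno, sqrt of a negative value sets errno to EDOM,
    // so the call writes memory and cannot be folded to a pure ISD::FSQRT
    // node; only calls the frontend marked read-only qualify.
    double sqrt_errno_demo() {
      errno = 0;
      double r = std::sqrt(-1.0);  // NaN; errno becomes EDOM
      return errno == EDOM ? 0.0 : r;
    }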
-/// visitBinaryFloatCall - If a call instruction is a binary floating-point
-/// operation (as expected), translate it to an SDNode with the specified opcode
-/// and return true.
+/// See if we can lower a binary floating-point operation into an SDNode with
+/// the specified Opcode. If so, return true and lower it. Otherwise return
+/// false, and it will be lowered like a normal call.
+/// The caller already checked that \p I calls the appropriate LibFunc with a
+/// correct prototype.
bool SelectionDAGBuilder::visitBinaryFloatCall(const CallInst &I,
unsigned Opcode) {
- // Sanity check that it really is a binary floating-point call.
- if (I.getNumArgOperands() != 2 ||
- !I.getArgOperand(0)->getType()->isFloatingPointTy() ||
- I.getType() != I.getArgOperand(0)->getType() ||
- I.getType() != I.getArgOperand(1)->getType() ||
- !I.onlyReadsMemory())
+ // We already checked this call's prototype; verify it doesn't modify errno.
+ if (!I.onlyReadsMemory())
return false;
SDValue Tmp0 = getValue(I.getArgOperand(0));
@@ -6336,20 +6316,18 @@ void SelectionDAGBuilder::visitCall(const CallInst &I) {
// Check for well-known libc/libm calls. If the function is internal, it
// can't be a library call. Don't do the check if marked as nobuiltin for
// some reason.
- LibFunc::Func Func;
+ LibFunc Func;
if (!I.isNoBuiltin() && !F->hasLocalLinkage() && F->hasName() &&
- LibInfo->getLibFunc(F->getName(), Func) &&
+ LibInfo->getLibFunc(*F, Func) &&
LibInfo->hasOptimizedCodeGen(Func)) {
switch (Func) {
default: break;
- case LibFunc::copysign:
- case LibFunc::copysignf:
- case LibFunc::copysignl:
- if (I.getNumArgOperands() == 2 && // Basic sanity checks.
- I.getArgOperand(0)->getType()->isFloatingPointTy() &&
- I.getType() == I.getArgOperand(0)->getType() &&
- I.getType() == I.getArgOperand(1)->getType() &&
- I.onlyReadsMemory()) {
+ case LibFunc_copysign:
+ case LibFunc_copysignf:
+ case LibFunc_copysignl:
+ // We already checked this call's prototype; verify it doesn't modify
+ // errno.
+ if (I.onlyReadsMemory()) {
SDValue LHS = getValue(I.getArgOperand(0));
SDValue RHS = getValue(I.getArgOperand(1));
setValue(&I, DAG.getNode(ISD::FCOPYSIGN, getCurSDLoc(),
@@ -6357,122 +6335,122 @@ void SelectionDAGBuilder::visitCall(const CallInst &I) {
return;
}
break;
- case LibFunc::fabs:
- case LibFunc::fabsf:
- case LibFunc::fabsl:
+ case LibFunc_fabs:
+ case LibFunc_fabsf:
+ case LibFunc_fabsl:
if (visitUnaryFloatCall(I, ISD::FABS))
return;
break;
- case LibFunc::fmin:
- case LibFunc::fminf:
- case LibFunc::fminl:
+ case LibFunc_fmin:
+ case LibFunc_fminf:
+ case LibFunc_fminl:
if (visitBinaryFloatCall(I, ISD::FMINNUM))
return;
break;
- case LibFunc::fmax:
- case LibFunc::fmaxf:
- case LibFunc::fmaxl:
+ case LibFunc_fmax:
+ case LibFunc_fmaxf:
+ case LibFunc_fmaxl:
if (visitBinaryFloatCall(I, ISD::FMAXNUM))
return;
break;
- case LibFunc::sin:
- case LibFunc::sinf:
- case LibFunc::sinl:
+ case LibFunc_sin:
+ case LibFunc_sinf:
+ case LibFunc_sinl:
if (visitUnaryFloatCall(I, ISD::FSIN))
return;
break;
- case LibFunc::cos:
- case LibFunc::cosf:
- case LibFunc::cosl:
+ case LibFunc_cos:
+ case LibFunc_cosf:
+ case LibFunc_cosl:
if (visitUnaryFloatCall(I, ISD::FCOS))
return;
break;
- case LibFunc::sqrt:
- case LibFunc::sqrtf:
- case LibFunc::sqrtl:
- case LibFunc::sqrt_finite:
- case LibFunc::sqrtf_finite:
- case LibFunc::sqrtl_finite:
+ case LibFunc_sqrt:
+ case LibFunc_sqrtf:
+ case LibFunc_sqrtl:
+ case LibFunc_sqrt_finite:
+ case LibFunc_sqrtf_finite:
+ case LibFunc_sqrtl_finite:
if (visitUnaryFloatCall(I, ISD::FSQRT))
return;
break;
- case LibFunc::floor:
- case LibFunc::floorf:
- case LibFunc::floorl:
+ case LibFunc_floor:
+ case LibFunc_floorf:
+ case LibFunc_floorl:
if (visitUnaryFloatCall(I, ISD::FFLOOR))
return;
break;
- case LibFunc::nearbyint:
- case LibFunc::nearbyintf:
- case LibFunc::nearbyintl:
+ case LibFunc_nearbyint:
+ case LibFunc_nearbyintf:
+ case LibFunc_nearbyintl:
if (visitUnaryFloatCall(I, ISD::FNEARBYINT))
return;
break;
- case LibFunc::ceil:
- case LibFunc::ceilf:
- case LibFunc::ceill:
+ case LibFunc_ceil:
+ case LibFunc_ceilf:
+ case LibFunc_ceill:
if (visitUnaryFloatCall(I, ISD::FCEIL))
return;
break;
- case LibFunc::rint:
- case LibFunc::rintf:
- case LibFunc::rintl:
+ case LibFunc_rint:
+ case LibFunc_rintf:
+ case LibFunc_rintl:
if (visitUnaryFloatCall(I, ISD::FRINT))
return;
break;
- case LibFunc::round:
- case LibFunc::roundf:
- case LibFunc::roundl:
+ case LibFunc_round:
+ case LibFunc_roundf:
+ case LibFunc_roundl:
if (visitUnaryFloatCall(I, ISD::FROUND))
return;
break;
- case LibFunc::trunc:
- case LibFunc::truncf:
- case LibFunc::truncl:
+ case LibFunc_trunc:
+ case LibFunc_truncf:
+ case LibFunc_truncl:
if (visitUnaryFloatCall(I, ISD::FTRUNC))
return;
break;
- case LibFunc::log2:
- case LibFunc::log2f:
- case LibFunc::log2l:
+ case LibFunc_log2:
+ case LibFunc_log2f:
+ case LibFunc_log2l:
if (visitUnaryFloatCall(I, ISD::FLOG2))
return;
break;
- case LibFunc::exp2:
- case LibFunc::exp2f:
- case LibFunc::exp2l:
+ case LibFunc_exp2:
+ case LibFunc_exp2f:
+ case LibFunc_exp2l:
if (visitUnaryFloatCall(I, ISD::FEXP2))
return;
break;
- case LibFunc::memcmp:
+ case LibFunc_memcmp:
if (visitMemCmpCall(I))
return;
break;
- case LibFunc::mempcpy:
+ case LibFunc_mempcpy:
if (visitMemPCpyCall(I))
return;
break;
- case LibFunc::memchr:
+ case LibFunc_memchr:
if (visitMemChrCall(I))
return;
break;
- case LibFunc::strcpy:
+ case LibFunc_strcpy:
if (visitStrCpyCall(I, false))
return;
break;
- case LibFunc::stpcpy:
+ case LibFunc_stpcpy:
if (visitStrCpyCall(I, true))
return;
break;
- case LibFunc::strcmp:
+ case LibFunc_strcmp:
if (visitStrCmpCall(I))
return;
break;
- case LibFunc::strlen:
+ case LibFunc_strlen:
if (visitStrLenCall(I))
return;
break;
- case LibFunc::strnlen:
+ case LibFunc_strnlen:
if (visitStrNLenCall(I))
return;
break;
@@ -7361,7 +7339,7 @@ void SelectionDAGBuilder::populateCallLoweringInfo(
// Populate the argument list.
// Attributes for args start at offset 1, after the return attribute.
- for (unsigned ArgI = ArgIdx, ArgE = ArgIdx + NumArgs, AttrI = ArgIdx + 1;
+ for (unsigned ArgI = ArgIdx, ArgE = ArgIdx + NumArgs;
ArgI != ArgE; ++ArgI) {
const Value *V = CS->getOperand(ArgI);
@@ -7370,7 +7348,7 @@ void SelectionDAGBuilder::populateCallLoweringInfo(
TargetLowering::ArgListEntry Entry;
Entry.Node = getValue(V);
Entry.Ty = V->getType();
- Entry.setAttributes(&CS, AttrI);
+ Entry.setAttributes(&CS, ArgIdx);
Args.push_back(Entry);
}
@@ -7631,9 +7609,9 @@ void SelectionDAGBuilder::visitPatchpoint(ImmutableCallSite CS,
FuncInfo.MF->getFrameInfo().setHasPatchPoint();
}
-/// Returns an AttributeSet representing the attributes applied to the return
+/// Returns an AttributeList representing the attributes applied to the return
/// value of the given call.
-static AttributeSet getReturnAttrs(TargetLowering::CallLoweringInfo &CLI) {
+static AttributeList getReturnAttrs(TargetLowering::CallLoweringInfo &CLI) {
SmallVector<Attribute::AttrKind, 2> Attrs;
if (CLI.RetSExt)
Attrs.push_back(Attribute::SExt);
@@ -7642,8 +7620,8 @@ static AttributeSet getReturnAttrs(TargetLowering::CallLoweringInfo &CLI) {
if (CLI.IsInReg)
Attrs.push_back(Attribute::InReg);
- return AttributeSet::get(CLI.RetTy->getContext(), AttributeSet::ReturnIndex,
- Attrs);
+ return AttributeList::get(CLI.RetTy->getContext(), AttributeList::ReturnIndex,
+ Attrs);
}
/// TargetLowering::LowerCallTo - This is the default LowerCallTo
@@ -7683,15 +7661,15 @@ TargetLowering::LowerCallTo(TargetLowering::CallLoweringInfo &CLI) const {
ArgListEntry Entry;
Entry.Node = DemoteStackSlot;
Entry.Ty = StackSlotPtrType;
- Entry.isSExt = false;
- Entry.isZExt = false;
- Entry.isInReg = false;
- Entry.isSRet = true;
- Entry.isNest = false;
- Entry.isByVal = false;
- Entry.isReturned = false;
- Entry.isSwiftSelf = false;
- Entry.isSwiftError = false;
+ Entry.IsSExt = false;
+ Entry.IsZExt = false;
+ Entry.IsInReg = false;
+ Entry.IsSRet = true;
+ Entry.IsNest = false;
+ Entry.IsByVal = false;
+ Entry.IsReturned = false;
+ Entry.IsSwiftSelf = false;
+ Entry.IsSwiftError = false;
Entry.Alignment = Align;
CLI.getArgs().insert(CLI.getArgs().begin(), Entry);
CLI.RetTy = Type::getVoidTy(CLI.RetTy->getContext());
@@ -7724,7 +7702,7 @@ TargetLowering::LowerCallTo(TargetLowering::CallLoweringInfo &CLI) const {
ArgListTy &Args = CLI.getArgs();
if (supportSwiftError()) {
for (unsigned i = 0, e = Args.size(); i != e; ++i) {
- if (Args[i].isSwiftError) {
+ if (Args[i].IsSwiftError) {
ISD::InputArg MyFlags;
MyFlags.VT = getPointerTy(DL);
MyFlags.ArgVT = EVT(getPointerTy(DL));
@@ -7741,7 +7719,7 @@ TargetLowering::LowerCallTo(TargetLowering::CallLoweringInfo &CLI) const {
SmallVector<EVT, 4> ValueVTs;
ComputeValueVTs(*this, DL, Args[i].Ty, ValueVTs);
Type *FinalType = Args[i].Ty;
- if (Args[i].isByVal)
+ if (Args[i].IsByVal)
FinalType = cast<PointerType>(Args[i].Ty)->getElementType();
bool NeedsRegBlock = functionArgumentNeedsConsecutiveRegisters(
FinalType, CLI.CallConv, CLI.IsVarArg);
@@ -7754,11 +7732,11 @@ TargetLowering::LowerCallTo(TargetLowering::CallLoweringInfo &CLI) const {
ISD::ArgFlagsTy Flags;
unsigned OriginalAlignment = DL.getABITypeAlignment(ArgTy);
- if (Args[i].isZExt)
+ if (Args[i].IsZExt)
Flags.setZExt();
- if (Args[i].isSExt)
+ if (Args[i].IsSExt)
Flags.setSExt();
- if (Args[i].isInReg) {
+ if (Args[i].IsInReg) {
// If we are using vectorcall calling convention, a structure that is
// passed InReg - is surely an HVA
if (CLI.CallConv == CallingConv::X86_VectorCall &&
@@ -7771,15 +7749,15 @@ TargetLowering::LowerCallTo(TargetLowering::CallLoweringInfo &CLI) const {
// Set InReg Flag
Flags.setInReg();
}
- if (Args[i].isSRet)
+ if (Args[i].IsSRet)
Flags.setSRet();
- if (Args[i].isSwiftSelf)
+ if (Args[i].IsSwiftSelf)
Flags.setSwiftSelf();
- if (Args[i].isSwiftError)
+ if (Args[i].IsSwiftError)
Flags.setSwiftError();
- if (Args[i].isByVal)
+ if (Args[i].IsByVal)
Flags.setByVal();
- if (Args[i].isInAlloca) {
+ if (Args[i].IsInAlloca) {
Flags.setInAlloca();
// Set the byval flag for CCAssignFn callbacks that don't know about
// inalloca. This way we can know how many bytes we should've allocated
@@ -7788,7 +7766,7 @@ TargetLowering::LowerCallTo(TargetLowering::CallLoweringInfo &CLI) const {
// in the various CC lowering callbacks.
Flags.setByVal();
}
- if (Args[i].isByVal || Args[i].isInAlloca) {
+ if (Args[i].IsByVal || Args[i].IsInAlloca) {
PointerType *Ty = cast<PointerType>(Args[i].Ty);
Type *ElementTy = Ty->getElementType();
Flags.setByValSize(DL.getTypeAllocSize(ElementTy));
@@ -7801,7 +7779,7 @@ TargetLowering::LowerCallTo(TargetLowering::CallLoweringInfo &CLI) const {
FrameAlign = getByValTypeAlignment(ElementTy, DL);
Flags.setByValAlign(FrameAlign);
}
- if (Args[i].isNest)
+ if (Args[i].IsNest)
Flags.setNest();
if (NeedsRegBlock)
Flags.setInConsecutiveRegs();
@@ -7812,13 +7790,13 @@ TargetLowering::LowerCallTo(TargetLowering::CallLoweringInfo &CLI) const {
SmallVector<SDValue, 4> Parts(NumParts);
ISD::NodeType ExtendKind = ISD::ANY_EXTEND;
- if (Args[i].isSExt)
+ if (Args[i].IsSExt)
ExtendKind = ISD::SIGN_EXTEND;
- else if (Args[i].isZExt)
+ else if (Args[i].IsZExt)
ExtendKind = ISD::ZERO_EXTEND;
// Conservatively only handle 'returned' on non-vectors for now
- if (Args[i].isReturned && !Op.getValueType().isVector()) {
+ if (Args[i].IsReturned && !Op.getValueType().isVector()) {
assert(CLI.RetTy == Args[i].Ty && RetTys.size() == NumValues &&
"unexpected use of 'returned'");
// Before passing 'returned' to the target lowering code, ensure that
@@ -7832,9 +7810,9 @@ TargetLowering::LowerCallTo(TargetLowering::CallLoweringInfo &CLI) const {
// parameter extension method is not compatible with the return
// extension method
if ((NumParts * PartVT.getSizeInBits() == VT.getSizeInBits()) ||
- (ExtendKind != ISD::ANY_EXTEND &&
- CLI.RetSExt == Args[i].isSExt && CLI.RetZExt == Args[i].isZExt))
- Flags.setReturned();
+ (ExtendKind != ISD::ANY_EXTEND && CLI.RetSExt == Args[i].IsSExt &&
+ CLI.RetZExt == Args[i].IsZExt))
+ Flags.setReturned();
}
getCopyToParts(CLI.DAG, CLI.DL, Op, &Parts[0], NumParts, PartVT,
@@ -8010,6 +7988,173 @@ static bool isOnlyUsedInEntryBlock(const Argument *A, bool FastISel) {
return true;
}
+typedef DenseMap<const Argument *,
+ std::pair<const AllocaInst *, const StoreInst *>>
+ ArgCopyElisionMapTy;
+
+/// Scan the entry block of the function in FuncInfo for arguments that look
+/// like copies into a local alloca. Record any copied arguments in
+/// ArgCopyElisionCandidates.
+static void
+findArgumentCopyElisionCandidates(const DataLayout &DL,
+ FunctionLoweringInfo *FuncInfo,
+ ArgCopyElisionMapTy &ArgCopyElisionCandidates) {
+ // Record the state of every static alloca used in the entry block. Argument
+ // allocas are all used in the entry block, so we need approximately as many
+ // entries as we have arguments.
+ enum StaticAllocaInfo { Unknown, Clobbered, Elidable };
+ SmallDenseMap<const AllocaInst *, StaticAllocaInfo, 8> StaticAllocas;
+ unsigned NumArgs = FuncInfo->Fn->arg_size();
+ StaticAllocas.reserve(NumArgs * 2);
+
+ auto GetInfoIfStaticAlloca = [&](const Value *V) -> StaticAllocaInfo * {
+ if (!V)
+ return nullptr;
+ V = V->stripPointerCasts();
+ const auto *AI = dyn_cast<AllocaInst>(V);
+ if (!AI || !AI->isStaticAlloca() || !FuncInfo->StaticAllocaMap.count(AI))
+ return nullptr;
+ auto Iter = StaticAllocas.insert({AI, Unknown});
+ return &Iter.first->second;
+ };
+
+ // Look for stores of arguments to static allocas. Look through bitcasts and
+ // GEPs to handle type coercions, as long as the alloca is fully initialized
+ // by the store. Any non-store use of an alloca escapes it and any subsequent
+ // unanalyzed store might write it.
+ // FIXME: Handle structs initialized with multiple stores.
+ for (const Instruction &I : FuncInfo->Fn->getEntryBlock()) {
+ // Look for stores, and handle non-store uses conservatively.
+ const auto *SI = dyn_cast<StoreInst>(&I);
+ if (!SI) {
+ // We will look through cast uses, so ignore them completely.
+ if (I.isCast())
+ continue;
+ // Ignore debug info intrinsics, they don't escape or store to allocas.
+ if (isa<DbgInfoIntrinsic>(I))
+ continue;
+ // This is an unknown instruction. Assume it escapes or writes to all
+ // static alloca operands.
+ for (const Use &U : I.operands()) {
+ if (StaticAllocaInfo *Info = GetInfoIfStaticAlloca(U))
+ *Info = StaticAllocaInfo::Clobbered;
+ }
+ continue;
+ }
+
+ // If the stored value is a static alloca, mark it as escaped.
+ if (StaticAllocaInfo *Info = GetInfoIfStaticAlloca(SI->getValueOperand()))
+ *Info = StaticAllocaInfo::Clobbered;
+
+ // Check if the destination is a static alloca.
+ const Value *Dst = SI->getPointerOperand()->stripPointerCasts();
+ StaticAllocaInfo *Info = GetInfoIfStaticAlloca(Dst);
+ if (!Info)
+ continue;
+ const AllocaInst *AI = cast<AllocaInst>(Dst);
+
+ // Skip allocas that have been initialized or clobbered.
+ if (*Info != StaticAllocaInfo::Unknown)
+ continue;
+
+ // Check if the stored value is an argument, and that this store fully
+ // initializes the alloca. Don't elide copies from the same argument twice.
+ const Value *Val = SI->getValueOperand()->stripPointerCasts();
+ const auto *Arg = dyn_cast<Argument>(Val);
+ if (!Arg || Arg->hasInAllocaAttr() || Arg->hasByValAttr() ||
+ Arg->getType()->isEmptyTy() ||
+ DL.getTypeStoreSize(Arg->getType()) !=
+ DL.getTypeAllocSize(AI->getAllocatedType()) ||
+ ArgCopyElisionCandidates.count(Arg)) {
+ *Info = StaticAllocaInfo::Clobbered;
+ continue;
+ }
+
+ DEBUG(dbgs() << "Found argument copy elision candidate: " << *AI << '\n');
+
+ // Mark this alloca and store for argument copy elision.
+ *Info = StaticAllocaInfo::Elidable;
+ ArgCopyElisionCandidates.insert({Arg, {AI, SI}});
+
+ // Stop scanning if we've seen all arguments. This will happen early in -O0
+ // builds, which is useful, because -O0 builds have large entry blocks and
+ // many allocas.
+ if (ArgCopyElisionCandidates.size() == NumArgs)
+ break;
+ }
+}
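+
The shape this scan recognizes is the ordinary -O0 pattern of an argument spilled to its own alloca at function entry. As a sketch (illustrative source, not from the patch):

    // At -O0 the frontend emits an entry-block "store i64 %x, i64* %x.addr";
    // when %x arrives on the caller's stack (e.g. 32-bit x86), the candidate
    // recorded here lets %x.addr become the incoming stack slot itself,
    // eliding the copy.
    long long f(long long x) { return x; }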
+
+/// Try to elide argument copies from memory into a local alloca. Succeeds if
+/// ArgVal is a load from a suitable fixed stack object.
+static void tryToElideArgumentCopy(
+ FunctionLoweringInfo *FuncInfo, SmallVectorImpl<SDValue> &Chains,
+ DenseMap<int, int> &ArgCopyElisionFrameIndexMap,
+ SmallPtrSetImpl<const Instruction *> &ElidedArgCopyInstrs,
+ ArgCopyElisionMapTy &ArgCopyElisionCandidates, const Argument &Arg,
+ SDValue ArgVal, bool &ArgHasUses) {
+ // Check if this is a load from a fixed stack object.
+ auto *LNode = dyn_cast<LoadSDNode>(ArgVal);
+ if (!LNode)
+ return;
+ auto *FINode = dyn_cast<FrameIndexSDNode>(LNode->getBasePtr().getNode());
+ if (!FINode)
+ return;
+
+ // Check that the fixed stack object is the right size and alignment.
+ // Look at the alignment that the user wrote on the alloca instead of looking
+ // at the stack object.
+ auto ArgCopyIter = ArgCopyElisionCandidates.find(&Arg);
+ assert(ArgCopyIter != ArgCopyElisionCandidates.end());
+ const AllocaInst *AI = ArgCopyIter->second.first;
+ int FixedIndex = FINode->getIndex();
+ int &AllocaIndex = FuncInfo->StaticAllocaMap[AI];
+ int OldIndex = AllocaIndex;
+ MachineFrameInfo &MFI = FuncInfo->MF->getFrameInfo();
+ if (MFI.getObjectSize(FixedIndex) != MFI.getObjectSize(OldIndex)) {
+ DEBUG(dbgs() << " argument copy elision failed due to bad fixed stack "
+ "object size\n");
+ return;
+ }
+ unsigned RequiredAlignment = AI->getAlignment();
+ if (!RequiredAlignment) {
+ RequiredAlignment = FuncInfo->MF->getDataLayout().getABITypeAlignment(
+ AI->getAllocatedType());
+ }
+ if (MFI.getObjectAlignment(FixedIndex) < RequiredAlignment) {
+ DEBUG(dbgs() << " argument copy elision failed: alignment of alloca "
+ "greater than stack argument alignment ("
+ << RequiredAlignment << " vs "
+ << MFI.getObjectAlignment(FixedIndex) << ")\n");
+ return;
+ }
+
+ // Perform the elision. Delete the old stack object and replace its only use
+ // in the variable info map. Mark the stack object as mutable.
+ DEBUG({
+ dbgs() << "Eliding argument copy from " << Arg << " to " << *AI << '\n'
+ << " Replacing frame index " << OldIndex << " with " << FixedIndex
+ << '\n';
+ });
+ MFI.RemoveStackObject(OldIndex);
+ MFI.setIsImmutableObjectIndex(FixedIndex, false);
+ AllocaIndex = FixedIndex;
+ ArgCopyElisionFrameIndexMap.insert({OldIndex, FixedIndex});
+ Chains.push_back(ArgVal.getValue(1));
+
+ // Avoid emitting code for the store implementing the copy.
+ const StoreInst *SI = ArgCopyIter->second.second;
+ ElidedArgCopyInstrs.insert(SI);
+
+ // Check for uses of the argument again so that we can avoid exporting ArgVal
+  // if it isn't used by anything other than the store.
+ for (const Value *U : Arg.users()) {
+ if (U != SI) {
+ ArgHasUses = true;
+ break;
+ }
+ }
+}
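+
The alignment rule used above (an explicit alignment on the alloca wins, otherwise the ABI alignment of the allocated type) can be stated in isolation, with a hypothetical helper name:

    #include "llvm/IR/DataLayout.h"
    #include "llvm/IR/Instructions.h"

    // Illustrative restatement of the patch's alignment requirement for
    // reusing a fixed stack object as the alloca's storage.
    static unsigned requiredArgCopyAlignment(const llvm::AllocaInst *AI,
                                             const llvm::DataLayout &DL) {
      if (unsigned A = AI->getAlignment())
        return A;  // alignment the user wrote on the alloca
      return DL.getABITypeAlignment(AI->getAllocatedType());
    }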
+
void SelectionDAGISel::LowerArguments(const Function &F) {
SelectionDAG &DAG = SDB->DAG;
SDLoc dl = SDB->getCurSDLoc();
@@ -8032,15 +8177,21 @@ void SelectionDAGISel::LowerArguments(const Function &F) {
Ins.push_back(RetArg);
}
+ // Look for stores of arguments to static allocas. Mark such arguments with a
+ // flag to ask the target to give us the memory location of that argument if
+ // available.
+ ArgCopyElisionMapTy ArgCopyElisionCandidates;
+ findArgumentCopyElisionCandidates(DL, FuncInfo, ArgCopyElisionCandidates);
+
// Set up the incoming argument description vector.
- unsigned Idx = 1;
- for (Function::const_arg_iterator I = F.arg_begin(), E = F.arg_end();
- I != E; ++I, ++Idx) {
+ unsigned Idx = 0;
+ for (const Argument &Arg : F.args()) {
+ ++Idx;
SmallVector<EVT, 4> ValueVTs;
- ComputeValueVTs(*TLI, DAG.getDataLayout(), I->getType(), ValueVTs);
- bool isArgValueUsed = !I->use_empty();
+ ComputeValueVTs(*TLI, DAG.getDataLayout(), Arg.getType(), ValueVTs);
+ bool isArgValueUsed = !Arg.use_empty();
unsigned PartBase = 0;
- Type *FinalType = I->getType();
+ Type *FinalType = Arg.getType();
if (F.getAttributes().hasAttribute(Idx, Attribute::ByVal))
FinalType = cast<PointerType>(FinalType)->getElementType();
bool NeedsRegBlock = TLI->functionArgumentNeedsConsecutiveRegisters(
@@ -8060,7 +8211,7 @@ void SelectionDAGISel::LowerArguments(const Function &F) {
// If we are using vectorcall calling convention, a structure that is
// passed InReg - is surely an HVA
if (F.getCallingConv() == CallingConv::X86_VectorCall &&
- isa<StructType>(I->getType())) {
+ isa<StructType>(Arg.getType())) {
// The first value of a structure is marked
if (0 == Value)
Flags.setHvaStart();
@@ -8092,7 +8243,7 @@ void SelectionDAGISel::LowerArguments(const Function &F) {
Flags.setByVal();
}
if (Flags.isByVal() || Flags.isInAlloca()) {
- PointerType *Ty = cast<PointerType>(I->getType());
+ PointerType *Ty = cast<PointerType>(Arg.getType());
Type *ElementTy = Ty->getElementType();
Flags.setByValSize(DL.getTypeAllocSize(ElementTy));
// For ByVal, alignment should be passed from FE. BE will guess if
@@ -8109,6 +8260,8 @@ void SelectionDAGISel::LowerArguments(const Function &F) {
if (NeedsRegBlock)
Flags.setInConsecutiveRegs();
Flags.setOrigAlign(OriginalAlignment);
+ if (ArgCopyElisionCandidates.count(&Arg))
+ Flags.setCopyElisionCandidate();
MVT RegisterVT = TLI->getRegisterType(*CurDAG->getContext(), VT);
unsigned NumRegs = TLI->getNumRegisters(*CurDAG->getContext(), VT);
@@ -8155,7 +8308,7 @@ void SelectionDAGISel::LowerArguments(const Function &F) {
// Set up the argument values.
unsigned i = 0;
- Idx = 1;
+ Idx = 0;
if (!FuncInfo->CanLowerReturn) {
// Create a virtual register for the sret pointer, and put in a copy
// from the sret argument into it.
@@ -8181,25 +8334,39 @@ void SelectionDAGISel::LowerArguments(const Function &F) {
++i;
}
- for (Function::const_arg_iterator I = F.arg_begin(), E = F.arg_end(); I != E;
- ++I, ++Idx) {
+ SmallVector<SDValue, 4> Chains;
+ DenseMap<int, int> ArgCopyElisionFrameIndexMap;
+ for (const Argument &Arg : F.args()) {
+ ++Idx;
SmallVector<SDValue, 4> ArgValues;
SmallVector<EVT, 4> ValueVTs;
- ComputeValueVTs(*TLI, DAG.getDataLayout(), I->getType(), ValueVTs);
+ ComputeValueVTs(*TLI, DAG.getDataLayout(), Arg.getType(), ValueVTs);
unsigned NumValues = ValueVTs.size();
+ if (NumValues == 0)
+ continue;
+
+ bool ArgHasUses = !Arg.use_empty();
+
+ // Elide the copying store if the target loaded this argument from a
+ // suitable fixed stack object.
+ if (Ins[i].Flags.isCopyElisionCandidate()) {
+ tryToElideArgumentCopy(FuncInfo, Chains, ArgCopyElisionFrameIndexMap,
+ ElidedArgCopyInstrs, ArgCopyElisionCandidates, Arg,
+ InVals[i], ArgHasUses);
+ }
// If this argument is unused then remember its value. It is used to generate
// debugging information.
bool isSwiftErrorArg =
TLI->supportSwiftError() &&
F.getAttributes().hasAttribute(Idx, Attribute::SwiftError);
- if (I->use_empty() && NumValues && !isSwiftErrorArg) {
- SDB->setUnusedArgValue(&*I, InVals[i]);
+ if (!ArgHasUses && !isSwiftErrorArg) {
+ SDB->setUnusedArgValue(&Arg, InVals[i]);
// Also remember any frame index for use in FastISel.
if (FrameIndexSDNode *FI =
dyn_cast<FrameIndexSDNode>(InVals[i].getNode()))
- FuncInfo->setArgumentFrameIndex(&*I, FI->getIndex());
+ FuncInfo->setArgumentFrameIndex(&Arg, FI->getIndex());
}
for (unsigned Val = 0; Val != NumValues; ++Val) {
@@ -8210,16 +8377,15 @@ void SelectionDAGISel::LowerArguments(const Function &F) {
// Even an apparent 'unused' swifterror argument needs to be returned. So
// we do generate a copy for it that can be used on return from the
// function.
- if (!I->use_empty() || isSwiftErrorArg) {
+ if (ArgHasUses || isSwiftErrorArg) {
Optional<ISD::NodeType> AssertOp;
if (F.getAttributes().hasAttribute(Idx, Attribute::SExt))
AssertOp = ISD::AssertSext;
else if (F.getAttributes().hasAttribute(Idx, Attribute::ZExt))
AssertOp = ISD::AssertZext;
- ArgValues.push_back(getCopyFromParts(DAG, dl, &InVals[i],
- NumParts, PartVT, VT,
- nullptr, AssertOp));
+ ArgValues.push_back(getCopyFromParts(DAG, dl, &InVals[i], NumParts,
+ PartVT, VT, nullptr, AssertOp));
}
i += NumParts;
@@ -8232,18 +8398,18 @@ void SelectionDAGISel::LowerArguments(const Function &F) {
// Note down frame index.
if (FrameIndexSDNode *FI =
dyn_cast<FrameIndexSDNode>(ArgValues[0].getNode()))
- FuncInfo->setArgumentFrameIndex(&*I, FI->getIndex());
+ FuncInfo->setArgumentFrameIndex(&Arg, FI->getIndex());
SDValue Res = DAG.getMergeValues(makeArrayRef(ArgValues.data(), NumValues),
SDB->getCurSDLoc());
- SDB->setValue(&*I, Res);
+ SDB->setValue(&Arg, Res);
if (!TM.Options.EnableFastISel && Res.getOpcode() == ISD::BUILD_PAIR) {
if (LoadSDNode *LNode =
dyn_cast<LoadSDNode>(Res.getOperand(0).getNode()))
if (FrameIndexSDNode *FI =
dyn_cast<FrameIndexSDNode>(LNode->getBasePtr().getNode()))
- FuncInfo->setArgumentFrameIndex(&*I, FI->getIndex());
+ FuncInfo->setArgumentFrameIndex(&Arg, FI->getIndex());
}
// Update the SwiftErrorVRegDefMap.
@@ -8263,18 +8429,36 @@ void SelectionDAGISel::LowerArguments(const Function &F) {
// uses with vregs.
unsigned Reg = cast<RegisterSDNode>(Res.getOperand(1))->getReg();
if (TargetRegisterInfo::isVirtualRegister(Reg)) {
- FuncInfo->ValueMap[&*I] = Reg;
+ FuncInfo->ValueMap[&Arg] = Reg;
continue;
}
}
- if (!isOnlyUsedInEntryBlock(&*I, TM.Options.EnableFastISel)) {
- FuncInfo->InitializeRegForValue(&*I);
- SDB->CopyToExportRegsIfNeeded(&*I);
+ if (!isOnlyUsedInEntryBlock(&Arg, TM.Options.EnableFastISel)) {
+ FuncInfo->InitializeRegForValue(&Arg);
+ SDB->CopyToExportRegsIfNeeded(&Arg);
}
}
+ if (!Chains.empty()) {
+ Chains.push_back(NewRoot);
+ NewRoot = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, Chains);
+ }
+
+ DAG.setRoot(NewRoot);
+
assert(i == InVals.size() && "Argument register count mismatch!");
+ // If any argument copy elisions occurred and we have debug info, update the
+ // stale frame indices used in the dbg.declare variable info table.
+  MachineFunction::VariableDbgInfoMapTy &DbgDeclareInfo =
+      MF->getVariableDbgInfo();
+ if (!DbgDeclareInfo.empty() && !ArgCopyElisionFrameIndexMap.empty()) {
+ for (MachineFunction::VariableDbgInfo &VI : DbgDeclareInfo) {
+ auto I = ArgCopyElisionFrameIndexMap.find(VI.Slot);
+ if (I != ArgCopyElisionFrameIndexMap.end())
+ VI.Slot = I->second;
+ }
+ }
+
// Finally, if the target has anything special to do, allow it to do so.
EmitFunctionEntryCode();
}
diff --git a/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.h b/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.h
index abde8a89befc..c6acc09b6602 100644
--- a/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.h
+++ b/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.h
@@ -616,33 +616,27 @@ public:
void init(GCFunctionInfo *gfi, AliasAnalysis &aa,
const TargetLibraryInfo *li);
- /// clear - Clear out the current SelectionDAG and the associated
- /// state and prepare this SelectionDAGBuilder object to be used
- /// for a new block. This doesn't clear out information about
- /// additional blocks that are needed to complete switch lowering
- /// or PHI node updating; that information is cleared out as it is
- /// consumed.
+ /// Clear out the current SelectionDAG and the associated state and prepare
+ /// this SelectionDAGBuilder object to be used for a new block. This doesn't
+ /// clear out information about additional blocks that are needed to complete
+ /// switch lowering or PHI node updating; that information is cleared out as
+ /// it is consumed.
void clear();
- /// clearDanglingDebugInfo - Clear the dangling debug information
- /// map. This function is separated from the clear so that debug
- /// information that is dangling in a basic block can be properly
- /// resolved in a different basic block. This allows the
- /// SelectionDAG to resolve dangling debug information attached
- /// to PHI nodes.
+ /// Clear the dangling debug information map. This function is separated from
+ /// the clear so that debug information that is dangling in a basic block can
+ /// be properly resolved in a different basic block. This allows the
+ /// SelectionDAG to resolve dangling debug information attached to PHI nodes.
void clearDanglingDebugInfo();
- /// getRoot - Return the current virtual root of the Selection DAG,
- /// flushing any PendingLoad items. This must be done before emitting
- /// a store or any other node that may need to be ordered after any
- /// prior load instructions.
- ///
+ /// Return the current virtual root of the Selection DAG, flushing any
+ /// PendingLoad items. This must be done before emitting a store or any other
+ /// node that may need to be ordered after any prior load instructions.
SDValue getRoot();
- /// getControlRoot - Similar to getRoot, but instead of flushing all the
- /// PendingLoad items, flush all the PendingExports items. It is necessary
- /// to do this before emitting a terminator instruction.
- ///
+ /// Similar to getRoot, but instead of flushing all the PendingLoad items,
+ /// flush all the PendingExports items. It is necessary to do this before
+ /// emitting a terminator instruction.
SDValue getControlRoot();
SDLoc getCurSDLoc() const {
@@ -688,12 +682,13 @@ public:
MachineBasicBlock *FBB, MachineBasicBlock *CurBB,
MachineBasicBlock *SwitchBB,
Instruction::BinaryOps Opc, BranchProbability TW,
- BranchProbability FW);
+ BranchProbability FW, bool InvertCond);
void EmitBranchForMergedCondition(const Value *Cond, MachineBasicBlock *TBB,
MachineBasicBlock *FBB,
MachineBasicBlock *CurBB,
MachineBasicBlock *SwitchBB,
- BranchProbability TW, BranchProbability FW);
+ BranchProbability TW, BranchProbability FW,
+ bool InvertCond);
bool ShouldEmitAsBranches(const std::vector<CaseBlock> &Cases);
bool isExportableFromCurrentBlock(const Value *V, const BasicBlock *FromBB);
void CopyToExportRegsIfNeeded(const Value *V);
@@ -900,6 +895,7 @@ private:
void visitInlineAsm(ImmutableCallSite CS);
const char *visitIntrinsicCall(const CallInst &I, unsigned Intrinsic);
void visitTargetIntrinsic(const CallInst &I, unsigned Intrinsic);
+ void visitConstrainedFPIntrinsic(const CallInst &I, unsigned Intrinsic);
void visitVAStart(const CallInst &I);
void visitVAArg(const VAArgInst &I);
@@ -944,8 +940,8 @@ private:
/// Return the appropriate SDDbgValue based on N.
SDDbgValue *getDbgValue(SDValue N, DILocalVariable *Variable,
- DIExpression *Expr, int64_t Offset, DebugLoc dl,
- unsigned DbgSDNodeOrder);
+ DIExpression *Expr, int64_t Offset,
+ const DebugLoc &dl, unsigned DbgSDNodeOrder);
};
/// RegsForValue - This struct represents the registers (physical or virtual)
@@ -958,26 +954,23 @@ private:
/// type.
///
struct RegsForValue {
- /// ValueVTs - The value types of the values, which may not be legal, and
+ /// The value types of the values, which may not be legal, and
/// may need be promoted or synthesized from one or more registers.
- ///
SmallVector<EVT, 4> ValueVTs;
- /// RegVTs - The value types of the registers. This is the same size as
- /// ValueVTs and it records, for each value, what the type of the assigned
- /// register or registers are. (Individual values are never synthesized
- /// from more than one type of register.)
+ /// The value types of the registers. This is the same size as ValueVTs and it
+ /// records, for each value, what the type of the assigned register or
+  /// registers is. (Individual values are never synthesized from more than one
+ /// type of register.)
///
/// With virtual registers, the contents of RegVTs is redundant with TLI's
/// getRegisterType member function, however when with physical registers
/// it is necessary to have a separate record of the types.
- ///
SmallVector<MVT, 4> RegVTs;
- /// Regs - This list holds the registers assigned to the values.
+ /// This list holds the registers assigned to the values.
/// Each legal or promoted value requires one register, and each
/// expanded value requires multiple registers.
- ///
SmallVector<unsigned, 4> Regs;
RegsForValue();
@@ -987,33 +980,33 @@ struct RegsForValue {
RegsForValue(LLVMContext &Context, const TargetLowering &TLI,
const DataLayout &DL, unsigned Reg, Type *Ty);
- /// append - Add the specified values to this one.
+ /// Add the specified values to this one.
void append(const RegsForValue &RHS) {
ValueVTs.append(RHS.ValueVTs.begin(), RHS.ValueVTs.end());
RegVTs.append(RHS.RegVTs.begin(), RHS.RegVTs.end());
Regs.append(RHS.Regs.begin(), RHS.Regs.end());
}
- /// getCopyFromRegs - Emit a series of CopyFromReg nodes that copies from
- /// this value and returns the result as a ValueVTs value. This uses
- /// Chain/Flag as the input and updates them for the output Chain/Flag.
- /// If the Flag pointer is NULL, no flag is used.
+  /// Emit a series of CopyFromReg nodes that copy from this value and return
+ /// the result as a ValueVTs value. This uses Chain/Flag as the input and
+ /// updates them for the output Chain/Flag. If the Flag pointer is NULL, no
+ /// flag is used.
SDValue getCopyFromRegs(SelectionDAG &DAG, FunctionLoweringInfo &FuncInfo,
const SDLoc &dl, SDValue &Chain, SDValue *Flag,
const Value *V = nullptr) const;
- /// getCopyToRegs - Emit a series of CopyToReg nodes that copies the specified
- /// value into the registers specified by this object. This uses Chain/Flag
- /// as the input and updates them for the output Chain/Flag. If the Flag
- /// pointer is nullptr, no flag is used. If V is not nullptr, then it is used
- /// in printing better diagnostic messages on error.
+ /// Emit a series of CopyToReg nodes that copies the specified value into the
+ /// registers specified by this object. This uses Chain/Flag as the input and
+ /// updates them for the output Chain/Flag. If the Flag pointer is nullptr, no
+ /// flag is used. If V is not nullptr, then it is used in printing better
+ /// diagnostic messages on error.
void getCopyToRegs(SDValue Val, SelectionDAG &DAG, const SDLoc &dl,
SDValue &Chain, SDValue *Flag, const Value *V = nullptr,
ISD::NodeType PreferredExtendType = ISD::ANY_EXTEND) const;
- /// AddInlineAsmOperands - Add this value to the specified inlineasm node
- /// operand list. This adds the code marker, matching input operand index
- /// (if applicable), and includes the number of values added into it.
+ /// Add this value to the specified inlineasm node operand list. This adds the
+ /// code marker, matching input operand index (if applicable), and includes
+ /// the number of values added into it.
void AddInlineAsmOperands(unsigned Kind, bool HasMatching,
unsigned MatchingIdx, const SDLoc &dl,
SelectionDAG &DAG, std::vector<SDValue> &Ops) const;
diff --git a/lib/CodeGen/SelectionDAG/SelectionDAGDumper.cpp b/lib/CodeGen/SelectionDAG/SelectionDAGDumper.cpp
index 0faaad8a21b7..488c60a28ffb 100644
--- a/lib/CodeGen/SelectionDAG/SelectionDAGDumper.cpp
+++ b/lib/CodeGen/SelectionDAG/SelectionDAGDumper.cpp
@@ -300,6 +300,7 @@ std::string SDNode::getOperationName(const SelectionDAG *G) const {
case ISD::GET_DYNAMIC_AREA_OFFSET: return "get.dynamic.area.offset";
// Bit manipulation
+ case ISD::ABS: return "abs";
case ISD::BITREVERSE: return "bitreverse";
case ISD::BSWAP: return "bswap";
case ISD::CTPOP: return "ctpop";
@@ -366,11 +367,13 @@ static Printable PrintNodeId(const SDNode &Node) {
});
}
+#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
LLVM_DUMP_METHOD void SDNode::dump() const { dump(nullptr); }
-void SDNode::dump(const SelectionDAG *G) const {
+LLVM_DUMP_METHOD void SDNode::dump(const SelectionDAG *G) const {
print(dbgs(), G);
dbgs() << '\n';
}
+#endif
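
The preprocessor gate here is the standard LLVM idiom for keeping dump methods out of release binaries while still allowing them via LLVM_ENABLE_DUMP; in its minimal form (hypothetical class name):

    #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
    // Compiled only in asserts builds or when dumps are explicitly enabled,
    // so release binaries carry no dead dump code.
    LLVM_DUMP_METHOD void MyNode::dump() const { print(llvm::dbgs()); }
    #endif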
void SDNode::print_types(raw_ostream &OS, const SelectionDAG *G) const {
for (unsigned i = 0, e = getNumValues(); i != e; ++i) {
@@ -416,7 +419,7 @@ void SDNode::print_details(raw_ostream &OS, const SelectionDAG *G) const {
OS << '<' << CSDN->getValueAPF().convertToDouble() << '>';
else {
OS << "<APFloat(";
- CSDN->getValueAPF().bitcastToAPInt().dump();
+ CSDN->getValueAPF().bitcastToAPInt().print(OS, false);
OS << ")>";
}
} else if (const GlobalAddressSDNode *GADN =
@@ -566,6 +569,7 @@ static bool shouldPrintInline(const SDNode &Node) {
return Node.getNumOperands() == 0;
}
+#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
static void DumpNodes(const SDNode *N, unsigned indent, const SelectionDAG *G) {
for (const SDValue &Op : N->op_values()) {
if (shouldPrintInline(*Op.getNode()))
@@ -592,6 +596,7 @@ LLVM_DUMP_METHOD void SelectionDAG::dump() const {
if (getRoot().getNode()) DumpNodes(getRoot().getNode(), 2, this);
dbgs() << "\n\n";
}
+#endif
void SDNode::printr(raw_ostream &OS, const SelectionDAG *G) const {
OS << PrintNodeId(*this) << ": ";
@@ -618,6 +623,7 @@ static bool printOperand(raw_ostream &OS, const SelectionDAG *G,
}
}
+#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
typedef SmallPtrSet<const SDNode *, 32> VisitedSDNodeSet;
static void DumpNodesr(raw_ostream &OS, const SDNode *N, unsigned indent,
const SelectionDAG *G, VisitedSDNodeSet &once) {
@@ -646,15 +652,16 @@ static void DumpNodesr(raw_ostream &OS, const SDNode *N, unsigned indent,
DumpNodesr(OS, Op.getNode(), indent+2, G, once);
}
-void SDNode::dumpr() const {
+LLVM_DUMP_METHOD void SDNode::dumpr() const {
VisitedSDNodeSet once;
DumpNodesr(dbgs(), this, 0, nullptr, once);
}
-void SDNode::dumpr(const SelectionDAG *G) const {
+LLVM_DUMP_METHOD void SDNode::dumpr(const SelectionDAG *G) const {
VisitedSDNodeSet once;
DumpNodesr(dbgs(), this, 0, G, once);
}
+#endif
static void printrWithDepthHelper(raw_ostream &OS, const SDNode *N,
const SelectionDAG *G, unsigned depth,
@@ -688,14 +695,17 @@ void SDNode::printrFull(raw_ostream &OS, const SelectionDAG *G) const {
printrWithDepth(OS, G, 10);
}
+#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
+LLVM_DUMP_METHOD
void SDNode::dumprWithDepth(const SelectionDAG *G, unsigned depth) const {
printrWithDepth(dbgs(), G, depth);
}
-void SDNode::dumprFull(const SelectionDAG *G) const {
+LLVM_DUMP_METHOD void SDNode::dumprFull(const SelectionDAG *G) const {
// Don't print impossibly deep things.
dumprWithDepth(G, 10);
}
+#endif
void SDNode::print(raw_ostream &OS, const SelectionDAG *G) const {
printr(OS, G);
diff --git a/lib/CodeGen/SelectionDAG/SelectionDAGISel.cpp b/lib/CodeGen/SelectionDAG/SelectionDAGISel.cpp
index 64e6c221229b..e21204dbb966 100644
--- a/lib/CodeGen/SelectionDAG/SelectionDAGISel.cpp
+++ b/lib/CodeGen/SelectionDAG/SelectionDAGISel.cpp
@@ -11,40 +11,65 @@
//
//===----------------------------------------------------------------------===//
-#include "llvm/CodeGen/SelectionDAG.h"
#include "ScheduleDAGSDNodes.h"
#include "SelectionDAGBuilder.h"
+#include "llvm/ADT/APInt.h"
+#include "llvm/ADT/DenseMap.h"
+#include "llvm/ADT/None.h"
#include "llvm/ADT/PostOrderIterator.h"
+#include "llvm/ADT/SmallPtrSet.h"
+#include "llvm/ADT/SmallSet.h"
+#include "llvm/ADT/SmallVector.h"
#include "llvm/ADT/Statistic.h"
+#include "llvm/ADT/STLExtras.h"
+#include "llvm/ADT/StringRef.h"
#include "llvm/Analysis/AliasAnalysis.h"
#include "llvm/Analysis/BranchProbabilityInfo.h"
#include "llvm/Analysis/CFG.h"
-#include "llvm/Analysis/EHPersonalities.h"
+#include "llvm/Analysis/OptimizationDiagnosticInfo.h"
#include "llvm/Analysis/TargetLibraryInfo.h"
#include "llvm/CodeGen/FastISel.h"
#include "llvm/CodeGen/FunctionLoweringInfo.h"
#include "llvm/CodeGen/GCMetadata.h"
-#include "llvm/CodeGen/GCStrategy.h"
+#include "llvm/CodeGen/MachineBasicBlock.h"
#include "llvm/CodeGen/MachineFrameInfo.h"
#include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/CodeGen/MachineInstr.h"
#include "llvm/CodeGen/MachineInstrBuilder.h"
-#include "llvm/CodeGen/MachineModuleInfo.h"
+#include "llvm/CodeGen/MachineMemOperand.h"
+#include "llvm/CodeGen/MachineOperand.h"
+#include "llvm/CodeGen/MachinePassRegistry.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
-#include "llvm/CodeGen/ScheduleHazardRecognizer.h"
+#include "llvm/CodeGen/MachineValueType.h"
#include "llvm/CodeGen/SchedulerRegistry.h"
+#include "llvm/CodeGen/SelectionDAG.h"
#include "llvm/CodeGen/SelectionDAGISel.h"
+#include "llvm/CodeGen/SelectionDAGNodes.h"
#include "llvm/CodeGen/StackProtector.h"
-#include "llvm/CodeGen/WinEHFuncInfo.h"
+#include "llvm/CodeGen/ValueTypes.h"
+#include "llvm/IR/BasicBlock.h"
#include "llvm/IR/Constants.h"
-#include "llvm/IR/DebugInfo.h"
+#include "llvm/IR/DebugInfoMetadata.h"
+#include "llvm/IR/DebugLoc.h"
+#include "llvm/IR/DiagnosticInfo.h"
#include "llvm/IR/Function.h"
#include "llvm/IR/InlineAsm.h"
+#include "llvm/IR/InstrTypes.h"
+#include "llvm/IR/Instruction.h"
#include "llvm/IR/Instructions.h"
#include "llvm/IR/IntrinsicInst.h"
#include "llvm/IR/Intrinsics.h"
-#include "llvm/IR/LLVMContext.h"
-#include "llvm/IR/Module.h"
-#include "llvm/MC/MCAsmInfo.h"
+#include "llvm/IR/Metadata.h"
+#include "llvm/IR/Type.h"
+#include "llvm/IR/User.h"
+#include "llvm/MC/MCInstrDesc.h"
+#include "llvm/MC/MCRegisterInfo.h"
+#include "llvm/Pass.h"
+#include "llvm/Support/BranchProbability.h"
+#include "llvm/Support/Casting.h"
+#include "llvm/Support/CodeGen.h"
+#include "llvm/Support/CommandLine.h"
#include "llvm/Support/Compiler.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/ErrorHandling.h"
@@ -59,6 +84,13 @@
#include "llvm/Target/TargetSubtargetInfo.h"
#include "llvm/Transforms/Utils/BasicBlockUtils.h"
#include <algorithm>
+#include <cassert>
+#include <cstdint>
+#include <iterator>
+#include <memory>
+#include <string>
+#include <utility>
+#include <vector>
using namespace llvm;
@@ -73,104 +105,6 @@ STATISTIC(NumEntryBlocks, "Number of entry blocks encountered");
STATISTIC(NumFastIselFailLowerArguments,
"Number of entry blocks where fast isel failed to lower arguments");
-#ifndef NDEBUG
-static cl::opt<bool>
-EnableFastISelVerbose2("fast-isel-verbose2", cl::Hidden,
- cl::desc("Enable extra verbose messages in the \"fast\" "
- "instruction selector"));
-
- // Terminators
-STATISTIC(NumFastIselFailRet,"Fast isel fails on Ret");
-STATISTIC(NumFastIselFailBr,"Fast isel fails on Br");
-STATISTIC(NumFastIselFailSwitch,"Fast isel fails on Switch");
-STATISTIC(NumFastIselFailIndirectBr,"Fast isel fails on IndirectBr");
-STATISTIC(NumFastIselFailInvoke,"Fast isel fails on Invoke");
-STATISTIC(NumFastIselFailResume,"Fast isel fails on Resume");
-STATISTIC(NumFastIselFailUnreachable,"Fast isel fails on Unreachable");
-
- // Standard binary operators...
-STATISTIC(NumFastIselFailAdd,"Fast isel fails on Add");
-STATISTIC(NumFastIselFailFAdd,"Fast isel fails on FAdd");
-STATISTIC(NumFastIselFailSub,"Fast isel fails on Sub");
-STATISTIC(NumFastIselFailFSub,"Fast isel fails on FSub");
-STATISTIC(NumFastIselFailMul,"Fast isel fails on Mul");
-STATISTIC(NumFastIselFailFMul,"Fast isel fails on FMul");
-STATISTIC(NumFastIselFailUDiv,"Fast isel fails on UDiv");
-STATISTIC(NumFastIselFailSDiv,"Fast isel fails on SDiv");
-STATISTIC(NumFastIselFailFDiv,"Fast isel fails on FDiv");
-STATISTIC(NumFastIselFailURem,"Fast isel fails on URem");
-STATISTIC(NumFastIselFailSRem,"Fast isel fails on SRem");
-STATISTIC(NumFastIselFailFRem,"Fast isel fails on FRem");
-
- // Logical operators...
-STATISTIC(NumFastIselFailAnd,"Fast isel fails on And");
-STATISTIC(NumFastIselFailOr,"Fast isel fails on Or");
-STATISTIC(NumFastIselFailXor,"Fast isel fails on Xor");
-
- // Memory instructions...
-STATISTIC(NumFastIselFailAlloca,"Fast isel fails on Alloca");
-STATISTIC(NumFastIselFailLoad,"Fast isel fails on Load");
-STATISTIC(NumFastIselFailStore,"Fast isel fails on Store");
-STATISTIC(NumFastIselFailAtomicCmpXchg,"Fast isel fails on AtomicCmpXchg");
-STATISTIC(NumFastIselFailAtomicRMW,"Fast isel fails on AtomicRWM");
-STATISTIC(NumFastIselFailFence,"Fast isel fails on Frence");
-STATISTIC(NumFastIselFailGetElementPtr,"Fast isel fails on GetElementPtr");
-
- // Convert instructions...
-STATISTIC(NumFastIselFailTrunc,"Fast isel fails on Trunc");
-STATISTIC(NumFastIselFailZExt,"Fast isel fails on ZExt");
-STATISTIC(NumFastIselFailSExt,"Fast isel fails on SExt");
-STATISTIC(NumFastIselFailFPTrunc,"Fast isel fails on FPTrunc");
-STATISTIC(NumFastIselFailFPExt,"Fast isel fails on FPExt");
-STATISTIC(NumFastIselFailFPToUI,"Fast isel fails on FPToUI");
-STATISTIC(NumFastIselFailFPToSI,"Fast isel fails on FPToSI");
-STATISTIC(NumFastIselFailUIToFP,"Fast isel fails on UIToFP");
-STATISTIC(NumFastIselFailSIToFP,"Fast isel fails on SIToFP");
-STATISTIC(NumFastIselFailIntToPtr,"Fast isel fails on IntToPtr");
-STATISTIC(NumFastIselFailPtrToInt,"Fast isel fails on PtrToInt");
-STATISTIC(NumFastIselFailBitCast,"Fast isel fails on BitCast");
-
- // Other instructions...
-STATISTIC(NumFastIselFailICmp,"Fast isel fails on ICmp");
-STATISTIC(NumFastIselFailFCmp,"Fast isel fails on FCmp");
-STATISTIC(NumFastIselFailPHI,"Fast isel fails on PHI");
-STATISTIC(NumFastIselFailSelect,"Fast isel fails on Select");
-STATISTIC(NumFastIselFailCall,"Fast isel fails on Call");
-STATISTIC(NumFastIselFailShl,"Fast isel fails on Shl");
-STATISTIC(NumFastIselFailLShr,"Fast isel fails on LShr");
-STATISTIC(NumFastIselFailAShr,"Fast isel fails on AShr");
-STATISTIC(NumFastIselFailVAArg,"Fast isel fails on VAArg");
-STATISTIC(NumFastIselFailExtractElement,"Fast isel fails on ExtractElement");
-STATISTIC(NumFastIselFailInsertElement,"Fast isel fails on InsertElement");
-STATISTIC(NumFastIselFailShuffleVector,"Fast isel fails on ShuffleVector");
-STATISTIC(NumFastIselFailExtractValue,"Fast isel fails on ExtractValue");
-STATISTIC(NumFastIselFailInsertValue,"Fast isel fails on InsertValue");
-STATISTIC(NumFastIselFailLandingPad,"Fast isel fails on LandingPad");
-
-// Intrinsic instructions...
-STATISTIC(NumFastIselFailIntrinsicCall, "Fast isel fails on Intrinsic call");
-STATISTIC(NumFastIselFailSAddWithOverflow,
- "Fast isel fails on sadd.with.overflow");
-STATISTIC(NumFastIselFailUAddWithOverflow,
- "Fast isel fails on uadd.with.overflow");
-STATISTIC(NumFastIselFailSSubWithOverflow,
- "Fast isel fails on ssub.with.overflow");
-STATISTIC(NumFastIselFailUSubWithOverflow,
- "Fast isel fails on usub.with.overflow");
-STATISTIC(NumFastIselFailSMulWithOverflow,
- "Fast isel fails on smul.with.overflow");
-STATISTIC(NumFastIselFailUMulWithOverflow,
- "Fast isel fails on umul.with.overflow");
-STATISTIC(NumFastIselFailFrameaddress, "Fast isel fails on Frameaddress");
-STATISTIC(NumFastIselFailSqrt, "Fast isel fails on sqrt call");
-STATISTIC(NumFastIselFailStackMap, "Fast isel fails on StackMap call");
-STATISTIC(NumFastIselFailPatchPoint, "Fast isel fails on PatchPoint call");
-#endif
-
-static cl::opt<bool>
-EnableFastISelVerbose("fast-isel-verbose", cl::Hidden,
- cl::desc("Enable verbose messages in the \"fast\" "
- "instruction selector"));
static cl::opt<int> EnableFastISelAbort(
"fast-isel-abort", cl::Hidden,
cl::desc("Enable abort calls when \"fast\" instruction selection "
@@ -179,6 +113,11 @@ static cl::opt<int> EnableFastISelAbort(
"abort for argument lowering, and 3 will never fallback "
"to SelectionDAG."));
+static cl::opt<bool> EnableFastISelFallbackReport(
+ "fast-isel-report-on-fallback", cl::Hidden,
+ cl::desc("Emit a diagnostic when \"fast\" instruction selection "
+ "falls back to SelectionDAG."));
+
static cl::opt<bool>
UseMBPI("use-mbpi",
cl::desc("use Machine Branch Probability Info"),
@@ -238,7 +177,7 @@ MachinePassRegistry RegisterScheduler::Registry;
///
//===---------------------------------------------------------------------===//
static cl::opt<RegisterScheduler::FunctionPassCtor, false,
- RegisterPassParser<RegisterScheduler> >
+ RegisterPassParser<RegisterScheduler>>
ISHeuristic("pre-RA-sched",
cl::init(&createDefaultScheduler), cl::Hidden,
cl::desc("Instruction schedulers available (before register"
@@ -249,6 +188,7 @@ defaultListDAGScheduler("default", "Best scheduler for the target",
createDefaultScheduler);
namespace llvm {
+
//===--------------------------------------------------------------------===//
/// \brief This class is used by SelectionDAGISel to temporarily override
/// the optimization level on a per-function basis.
@@ -318,6 +258,7 @@ namespace llvm {
"Unknown sched type!");
return createILPListDAGScheduler(IS, OptLevel);
}
+
} // end namespace llvm
// EmitInstrWithCustomInserter - This method should be implemented by targets
@@ -431,8 +372,6 @@ bool SelectionDAGISel::runOnMachineFunction(MachineFunction &mf) {
MachineFunctionProperties::Property::Selected))
return false;
// Do some sanity-checking on the command-line options.
- assert((!EnableFastISelVerbose || TM.Options.EnableFastISel) &&
- "-fast-isel-verbose requires -fast-isel");
assert((!EnableFastISelAbort || TM.Options.EnableFastISel) &&
"-fast-isel-abort > 0 requires -fast-isel");
@@ -457,12 +396,13 @@ bool SelectionDAGISel::runOnMachineFunction(MachineFunction &mf) {
AA = &getAnalysis<AAResultsWrapperPass>().getAAResults();
LibInfo = &getAnalysis<TargetLibraryInfoWrapperPass>().getTLI();
GFI = Fn.hasGC() ? &getAnalysis<GCModuleInfo>().getFunctionInfo(Fn) : nullptr;
+ ORE = make_unique<OptimizationRemarkEmitter>(&Fn);
DEBUG(dbgs() << "\n\n\n=== " << Fn.getName() << "\n");
SplitCriticalSideEffectEdges(const_cast<Function &>(Fn));
- CurDAG->init(*MF);
+ CurDAG->init(*MF, *ORE);
FuncInfo->set(Fn, *MF, CurDAG);
if (UseMBPI && OptLevel != CodeGenOpt::None)
@@ -502,6 +442,10 @@ bool SelectionDAGISel::runOnMachineFunction(MachineFunction &mf) {
TLI->initializeSplitCSR(EntryMBB);
SelectAllBasicBlocks(Fn);
+ if (FastISelFailed && EnableFastISelFallbackReport) {
+ DiagnosticInfoISelFallback DiagFallback(Fn);
+ Fn.getContext().diagnose(DiagFallback);
+ }
// If the first basic block in the function has live ins that need to be
// copied into vregs, emit the copies into the top of the block before
@@ -628,7 +572,7 @@ bool SelectionDAGISel::runOnMachineFunction(MachineFunction &mf) {
unsigned To = I->second;
// If To is also scheduled to be replaced, find what its ultimate
// replacement is.
- for (;;) {
+ while (true) {
DenseMap<unsigned, unsigned>::iterator J = FuncInfo->RegFixups.find(To);
if (J == E) break;
To = J->second;
@@ -666,13 +610,30 @@ bool SelectionDAGISel::runOnMachineFunction(MachineFunction &mf) {
return true;
}
+static void reportFastISelFailure(MachineFunction &MF,
+ OptimizationRemarkEmitter &ORE,
+ OptimizationRemarkMissed &R,
+ bool ShouldAbort) {
+ // Print the function name explicitly if we don't have a debug location (which
+ // makes the diagnostic less useful) or if we're going to emit a raw error.
+ if (!R.getLocation().isValid() || ShouldAbort)
+ R << (" (in function: " + MF.getName() + ")").str();
+
+ if (ShouldAbort)
+ report_fatal_error(R.getMsg());
+
+ ORE.emit(R);
+}
+
void SelectionDAGISel::SelectBasicBlock(BasicBlock::const_iterator Begin,
BasicBlock::const_iterator End,
bool &HadTailCall) {
// Lower the instructions. If a call is emitted as a tail call, cease emitting
// nodes for this block.
- for (BasicBlock::const_iterator I = Begin; I != End && !SDB->HasTailCall; ++I)
- SDB->visit(*I);
+  for (BasicBlock::const_iterator I = Begin; I != End && !SDB->HasTailCall;
+       ++I) {
+ if (!ElidedArgCopyInstrs.count(&*I))
+ SDB->visit(*I);
+ }
// Make sure the root of the DAG is up-to-date.
CurDAG->setRoot(SDB->getControlRoot());
@@ -731,6 +692,10 @@ void SelectionDAGISel::CodeGenAndEmitDAG() {
int BlockNumber = -1;
(void)BlockNumber;
bool MatchFilterBB = false; (void)MatchFilterBB;
+
+ // Pre-type legalization allow creation of any node types.
+ CurDAG->NewNodesMustHaveLegalTypes = false;
+
#ifndef NDEBUG
MatchFilterBB = (FilterDAGBasicBlockName.empty() ||
FilterDAGBasicBlockName ==
@@ -777,6 +742,7 @@ void SelectionDAGISel::CodeGenAndEmitDAG() {
DEBUG(dbgs() << "Type-legalized selection DAG: BB#" << BlockNumber
<< " '" << BlockName << "'\n"; CurDAG->dump());
+ // Only allow creation of legal node types.
CurDAG->NewNodesMustHaveLegalTypes = true;
if (Changed) {
@@ -802,12 +768,18 @@ void SelectionDAGISel::CodeGenAndEmitDAG() {
}
if (Changed) {
+ DEBUG(dbgs() << "Vector-legalized selection DAG: BB#" << BlockNumber
+ << " '" << BlockName << "'\n"; CurDAG->dump());
+
{
NamedRegionTimer T("legalize_types2", "Type Legalization 2", GroupName,
GroupDescription, TimePassesIsEnabled);
CurDAG->LegalizeTypes();
}
+ DEBUG(dbgs() << "Vector/type-legalized selection DAG: BB#" << BlockNumber
+ << " '" << BlockName << "'\n"; CurDAG->dump());
+
if (ViewDAGCombineLT && MatchFilterBB)
CurDAG->viewGraph("dag-combine-lv input for " + BlockName);
@@ -907,10 +879,12 @@ void SelectionDAGISel::CodeGenAndEmitDAG() {
}
namespace {
+
/// ISelUpdater - helper class to handle updates of the instruction selection
/// graph.
class ISelUpdater : public SelectionDAG::DAGUpdateListener {
SelectionDAG::allnodes_iterator &ISelPosition;
+
public:
ISelUpdater(SelectionDAG &DAG, SelectionDAG::allnodes_iterator &isp)
: SelectionDAG::DAGUpdateListener(DAG), ISelPosition(isp) {}
@@ -923,8 +897,53 @@ public:
++ISelPosition;
}
};
+
} // end anonymous namespace
+static bool isStrictFPOp(SDNode *Node, unsigned &NewOpc) {
+ unsigned OrigOpc = Node->getOpcode();
+ switch (OrigOpc) {
+ case ISD::STRICT_FADD: NewOpc = ISD::FADD; return true;
+ case ISD::STRICT_FSUB: NewOpc = ISD::FSUB; return true;
+ case ISD::STRICT_FMUL: NewOpc = ISD::FMUL; return true;
+ case ISD::STRICT_FDIV: NewOpc = ISD::FDIV; return true;
+ case ISD::STRICT_FREM: NewOpc = ISD::FREM; return true;
+ default: return false;
+ }
+}
+
+SDNode* SelectionDAGISel::MutateStrictFPToFP(SDNode *Node, unsigned NewOpc) {
+ assert(((Node->getOpcode() == ISD::STRICT_FADD && NewOpc == ISD::FADD) ||
+ (Node->getOpcode() == ISD::STRICT_FSUB && NewOpc == ISD::FSUB) ||
+ (Node->getOpcode() == ISD::STRICT_FMUL && NewOpc == ISD::FMUL) ||
+ (Node->getOpcode() == ISD::STRICT_FDIV && NewOpc == ISD::FDIV) ||
+ (Node->getOpcode() == ISD::STRICT_FREM && NewOpc == ISD::FREM)) &&
+ "Unexpected StrictFP opcode!");
+
+ // We're taking this node out of the chain, so we need to re-link things.
+ SDValue InputChain = Node->getOperand(0);
+ SDValue OutputChain = SDValue(Node, 1);
+ CurDAG->ReplaceAllUsesOfValueWith(OutputChain, InputChain);
+
+ SDVTList VTs = CurDAG->getVTList(Node->getOperand(1).getValueType());
+ SDValue Ops[2] = { Node->getOperand(1), Node->getOperand(2) };
+ SDNode *Res = CurDAG->MorphNodeTo(Node, NewOpc, VTs, Ops);
+
+ // MorphNodeTo can operate in two ways: if an existing node with the
+ // specified operands exists, it can just return it. Otherwise, it
+ // updates the node in place to have the requested operands.
+ if (Res == Node) {
+ // If we updated the node in place, reset the node ID. To the isel,
+ // this should be just like a newly allocated machine node.
+ Res->setNodeId(-1);
+ } else {
+ CurDAG->ReplaceAllUsesWith(Node, Res);
+ CurDAG->RemoveDeadNode(Node);
+ }
+
+ return Res;
+}
+
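A minimal standalone sketch of the chain re-linking step above, using a toy node representation rather than the real SelectionDAG API (all types and names here are illustrative, not LLVM's):

#include <cstdio>
#include <vector>

// Toy stand-ins for SDNode/SDValue: each operand records which node and
// which result number it reads. Result 1 of the strict op is its chain.
struct Node;
struct Use { Node *N; unsigned ResNo; };
struct Node { const char *Name; std::vector<Use> Ops; };

// Redirect every operand in the graph reading (From, ResNo) to To.
static void replaceAllUsesOfValueWith(std::vector<Node *> &Graph, Node *From,
                                      unsigned ResNo, Use To) {
  for (Node *N : Graph)
    for (Use &U : N->Ops)
      if (U.N == From && U.ResNo == ResNo)
        U = To;
}

int main() {
  Node Entry{"EntryChain", {}};
  Node Strict{"STRICT_FADD", {{&Entry, 0}}}; // Ops[0] is the incoming chain.
  Node Ret{"RET", {{&Strict, 1}}};           // chained after the strict op
  std::vector<Node *> Graph{&Entry, &Strict, &Ret};

  // Forward users of the strict op's chain result to its input chain, as
  // MutateStrictFPToFP does before morphing the node.
  replaceAllUsesOfValueWith(Graph, &Strict, 1, Strict.Ops[0]);
  std::printf("RET is now chained to %s\n", Ret.Ops[0].N->Name);
}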
void SelectionDAGISel::DoInstructionSelection() {
DEBUG(dbgs() << "===== Instruction selection begins: BB#"
<< FuncInfo->MBB->getNumber()
@@ -960,7 +979,23 @@ void SelectionDAGISel::DoInstructionSelection() {
if (Node->use_empty())
continue;
+ // When we are using non-default rounding modes or FP exception behavior
+ // FP operations are represented by StrictFP pseudo-operations. They
+ // need to be simplified here so that the target-specific instruction
+ // selectors know how to handle them.
+ //
+ // If the current node is a strict FP pseudo-op, the isStrictFPOp()
+ // function will provide the corresponding normal FP opcode to which the
+ // node should be mutated.
+ unsigned NormalFPOpc = ISD::UNDEF;
+ bool IsStrictFPOp = isStrictFPOp(Node, NormalFPOpc);
+ if (IsStrictFPOp)
+ Node = MutateStrictFPToFP(Node, NormalFPOpc);
+
Select(Node);
+
+ // FIXME: Add code here to attach an implicit def and use of
+ // target-specific FP environment registers.
}
CurDAG->setRoot(Dummy.getValue());
@@ -1046,116 +1081,6 @@ static bool isFoldedOrDeadInstruction(const Instruction *I,
!FuncInfo->isExportedInst(I); // Exported instrs must be computed.
}
-#ifndef NDEBUG
-// Collect per Instruction statistics for fast-isel misses. Only those
-// instructions that cause the bail are accounted for. It does not account for
-// instructions higher in the block. Thus, summing the per instructions stats
-// will not add up to what is reported by NumFastIselFailures.
-static void collectFailStats(const Instruction *I) {
- switch (I->getOpcode()) {
- default: assert (0 && "<Invalid operator> ");
-
- // Terminators
- case Instruction::Ret: NumFastIselFailRet++; return;
- case Instruction::Br: NumFastIselFailBr++; return;
- case Instruction::Switch: NumFastIselFailSwitch++; return;
- case Instruction::IndirectBr: NumFastIselFailIndirectBr++; return;
- case Instruction::Invoke: NumFastIselFailInvoke++; return;
- case Instruction::Resume: NumFastIselFailResume++; return;
- case Instruction::Unreachable: NumFastIselFailUnreachable++; return;
-
- // Standard binary operators...
- case Instruction::Add: NumFastIselFailAdd++; return;
- case Instruction::FAdd: NumFastIselFailFAdd++; return;
- case Instruction::Sub: NumFastIselFailSub++; return;
- case Instruction::FSub: NumFastIselFailFSub++; return;
- case Instruction::Mul: NumFastIselFailMul++; return;
- case Instruction::FMul: NumFastIselFailFMul++; return;
- case Instruction::UDiv: NumFastIselFailUDiv++; return;
- case Instruction::SDiv: NumFastIselFailSDiv++; return;
- case Instruction::FDiv: NumFastIselFailFDiv++; return;
- case Instruction::URem: NumFastIselFailURem++; return;
- case Instruction::SRem: NumFastIselFailSRem++; return;
- case Instruction::FRem: NumFastIselFailFRem++; return;
-
- // Logical operators...
- case Instruction::And: NumFastIselFailAnd++; return;
- case Instruction::Or: NumFastIselFailOr++; return;
- case Instruction::Xor: NumFastIselFailXor++; return;
-
- // Memory instructions...
- case Instruction::Alloca: NumFastIselFailAlloca++; return;
- case Instruction::Load: NumFastIselFailLoad++; return;
- case Instruction::Store: NumFastIselFailStore++; return;
- case Instruction::AtomicCmpXchg: NumFastIselFailAtomicCmpXchg++; return;
- case Instruction::AtomicRMW: NumFastIselFailAtomicRMW++; return;
- case Instruction::Fence: NumFastIselFailFence++; return;
- case Instruction::GetElementPtr: NumFastIselFailGetElementPtr++; return;
-
- // Convert instructions...
- case Instruction::Trunc: NumFastIselFailTrunc++; return;
- case Instruction::ZExt: NumFastIselFailZExt++; return;
- case Instruction::SExt: NumFastIselFailSExt++; return;
- case Instruction::FPTrunc: NumFastIselFailFPTrunc++; return;
- case Instruction::FPExt: NumFastIselFailFPExt++; return;
- case Instruction::FPToUI: NumFastIselFailFPToUI++; return;
- case Instruction::FPToSI: NumFastIselFailFPToSI++; return;
- case Instruction::UIToFP: NumFastIselFailUIToFP++; return;
- case Instruction::SIToFP: NumFastIselFailSIToFP++; return;
- case Instruction::IntToPtr: NumFastIselFailIntToPtr++; return;
- case Instruction::PtrToInt: NumFastIselFailPtrToInt++; return;
- case Instruction::BitCast: NumFastIselFailBitCast++; return;
-
- // Other instructions...
- case Instruction::ICmp: NumFastIselFailICmp++; return;
- case Instruction::FCmp: NumFastIselFailFCmp++; return;
- case Instruction::PHI: NumFastIselFailPHI++; return;
- case Instruction::Select: NumFastIselFailSelect++; return;
- case Instruction::Call: {
- if (auto const *Intrinsic = dyn_cast<IntrinsicInst>(I)) {
- switch (Intrinsic->getIntrinsicID()) {
- default:
- NumFastIselFailIntrinsicCall++; return;
- case Intrinsic::sadd_with_overflow:
- NumFastIselFailSAddWithOverflow++; return;
- case Intrinsic::uadd_with_overflow:
- NumFastIselFailUAddWithOverflow++; return;
- case Intrinsic::ssub_with_overflow:
- NumFastIselFailSSubWithOverflow++; return;
- case Intrinsic::usub_with_overflow:
- NumFastIselFailUSubWithOverflow++; return;
- case Intrinsic::smul_with_overflow:
- NumFastIselFailSMulWithOverflow++; return;
- case Intrinsic::umul_with_overflow:
- NumFastIselFailUMulWithOverflow++; return;
- case Intrinsic::frameaddress:
- NumFastIselFailFrameaddress++; return;
- case Intrinsic::sqrt:
- NumFastIselFailSqrt++; return;
- case Intrinsic::experimental_stackmap:
- NumFastIselFailStackMap++; return;
- case Intrinsic::experimental_patchpoint_void: // fall-through
- case Intrinsic::experimental_patchpoint_i64:
- NumFastIselFailPatchPoint++; return;
- }
- }
- NumFastIselFailCall++;
- return;
- }
- case Instruction::Shl: NumFastIselFailShl++; return;
- case Instruction::LShr: NumFastIselFailLShr++; return;
- case Instruction::AShr: NumFastIselFailAShr++; return;
- case Instruction::VAArg: NumFastIselFailVAArg++; return;
- case Instruction::ExtractElement: NumFastIselFailExtractElement++; return;
- case Instruction::InsertElement: NumFastIselFailInsertElement++; return;
- case Instruction::ShuffleVector: NumFastIselFailShuffleVector++; return;
- case Instruction::ExtractValue: NumFastIselFailExtractValue++; return;
- case Instruction::InsertValue: NumFastIselFailInsertValue++; return;
- case Instruction::LandingPad: NumFastIselFailLandingPad++; return;
- }
-}
-#endif // NDEBUG
-
/// Set up SwiftErrorVals by going through the function. If the function has
/// swifterror argument, it will be the first entry.
static void setupSwiftErrorVals(const Function &Fn, const TargetLowering *TLI,
@@ -1190,9 +1115,9 @@ static void setupSwiftErrorVals(const Function &Fn, const TargetLowering *TLI,
}
static void createSwiftErrorEntriesInEntryBlock(FunctionLoweringInfo *FuncInfo,
+ FastISel *FastIS,
const TargetLowering *TLI,
const TargetInstrInfo *TII,
- const BasicBlock *LLVMBB,
SelectionDAGBuilder *SDB) {
if (!TLI->supportSwiftError())
return;
@@ -1202,22 +1127,27 @@ static void createSwiftErrorEntriesInEntryBlock(FunctionLoweringInfo *FuncInfo,
if (FuncInfo->SwiftErrorVals.empty())
return;
- if (pred_begin(LLVMBB) == pred_end(LLVMBB)) {
- auto &DL = FuncInfo->MF->getDataLayout();
- auto const *RC = TLI->getRegClassFor(TLI->getPointerTy(DL));
- for (const auto *SwiftErrorVal : FuncInfo->SwiftErrorVals) {
- // We will always generate a copy from the argument. It is always used at
- // least by the 'return' of the swifterror.
- if (FuncInfo->SwiftErrorArg && FuncInfo->SwiftErrorArg == SwiftErrorVal)
- continue;
- unsigned VReg = FuncInfo->MF->getRegInfo().createVirtualRegister(RC);
- // Assign Undef to Vreg. We construct MI directly to make sure it works
- // with FastISel.
- BuildMI(*FuncInfo->MBB, FuncInfo->MBB->getFirstNonPHI(),
- SDB->getCurDebugLoc(), TII->get(TargetOpcode::IMPLICIT_DEF),
- VReg);
- FuncInfo->setCurrentSwiftErrorVReg(FuncInfo->MBB, SwiftErrorVal, VReg);
- }
+ assert(FuncInfo->MBB == &*FuncInfo->MF->begin() &&
+ "expected to insert into entry block");
+ auto &DL = FuncInfo->MF->getDataLayout();
+ auto const *RC = TLI->getRegClassFor(TLI->getPointerTy(DL));
+ for (const auto *SwiftErrorVal : FuncInfo->SwiftErrorVals) {
+ // We will always generate a copy from the argument. It is always used at
+ // least by the 'return' of the swifterror.
+ if (FuncInfo->SwiftErrorArg && FuncInfo->SwiftErrorArg == SwiftErrorVal)
+ continue;
+ unsigned VReg = FuncInfo->MF->getRegInfo().createVirtualRegister(RC);
+ // Assign Undef to Vreg. We construct MI directly to make sure it works
+ // with FastISel.
+ BuildMI(*FuncInfo->MBB, FuncInfo->MBB->getFirstNonPHI(),
+ SDB->getCurDebugLoc(), TII->get(TargetOpcode::IMPLICIT_DEF),
+ VReg);
+
+ // Keep FastIS informed about the value we just inserted.
+ if (FastIS)
+ FastIS->setLastLocalValue(&*std::prev(FuncInfo->InsertPt));
+
+ FuncInfo->setCurrentSwiftErrorVReg(FuncInfo->MBB, SwiftErrorVal, VReg);
}
}
@@ -1340,6 +1270,7 @@ static void propagateSwiftErrorVRegs(FunctionLoweringInfo *FuncInfo) {
}
void SelectionDAGISel::SelectAllBasicBlocks(const Function &Fn) {
+ FastISelFailed = false;
// Initialize the Fast-ISel state, if needed.
FastISel *FastIS = nullptr;
if (TM.Options.EnableFastISel)
@@ -1347,12 +1278,53 @@ void SelectionDAGISel::SelectAllBasicBlocks(const Function &Fn) {
setupSwiftErrorVals(Fn, TLI, FuncInfo);
- // Iterate over all basic blocks in the function.
ReversePostOrderTraversal<const Function*> RPOT(&Fn);
- for (ReversePostOrderTraversal<const Function*>::rpo_iterator
- I = RPOT.begin(), E = RPOT.end(); I != E; ++I) {
- const BasicBlock *LLVMBB = *I;
+ // Lower arguments up front. An RPO iteration always visits the entry block
+ // first.
+ assert(*RPOT.begin() == &Fn.getEntryBlock());
+ ++NumEntryBlocks;
+
+ // Set up FuncInfo for ISel. Entry blocks never have PHIs.
+ FuncInfo->MBB = FuncInfo->MBBMap[&Fn.getEntryBlock()];
+ FuncInfo->InsertPt = FuncInfo->MBB->begin();
+
+ if (!FastIS) {
+ LowerArguments(Fn);
+ } else {
+ // See if fast isel can lower the arguments.
+ FastIS->startNewBlock();
+ if (!FastIS->lowerArguments()) {
+ FastISelFailed = true;
+ // Fast isel failed to lower these arguments
+ ++NumFastIselFailLowerArguments;
+
+ OptimizationRemarkMissed R("sdagisel", "FastISelFailure",
+ Fn.getSubprogram(),
+ &Fn.getEntryBlock());
+ R << "FastISel didn't lower all arguments: "
+ << ore::NV("Prototype", Fn.getType());
+ reportFastISelFailure(*MF, *ORE, R, EnableFastISelAbort > 1);
+
+ // Use SelectionDAG argument lowering
+ LowerArguments(Fn);
+ CurDAG->setRoot(SDB->getControlRoot());
+ SDB->clear();
+ CodeGenAndEmitDAG();
+ }
+
+ // If we inserted any instructions at the beginning, make a note of
+ // where they are, so we can be sure to emit subsequent instructions
+ // after them.
+ if (FuncInfo->InsertPt != FuncInfo->MBB->begin())
+ FastIS->setLastLocalValue(&*std::prev(FuncInfo->InsertPt));
+ else
+ FastIS->setLastLocalValue(nullptr);
+ }
+ createSwiftErrorEntriesInEntryBlock(FuncInfo, FastIS, TLI, TII, SDB);
+
+ // Iterate over all basic blocks in the function.
+ for (const BasicBlock *LLVMBB : RPOT) {
if (OptLevel != CodeGenOpt::None) {
bool AllPredsVisited = true;
for (const_pred_iterator PI = pred_begin(LLVMBB), PE = pred_end(LLVMBB);
@@ -1384,8 +1356,9 @@ void SelectionDAGISel::SelectAllBasicBlocks(const Function &Fn) {
FuncInfo->MBB = FuncInfo->MBBMap[LLVMBB];
if (!FuncInfo->MBB)
continue; // Some blocks like catchpads have no code or MBB.
- FuncInfo->InsertPt = FuncInfo->MBB->getFirstNonPHI();
- createSwiftErrorEntriesInEntryBlock(FuncInfo, TLI, TII, LLVMBB, SDB);
+
+ // Insert new instructions after any phi or argument setup code.
+ FuncInfo->InsertPt = FuncInfo->MBB->end();
// Setup an EH landing-pad block.
FuncInfo->ExceptionPointerVirtReg = 0;
@@ -1396,35 +1369,8 @@ void SelectionDAGISel::SelectAllBasicBlocks(const Function &Fn) {
// Before doing SelectionDAG ISel, see if FastISel has been requested.
if (FastIS) {
- FastIS->startNewBlock();
-
- // Emit code for any incoming arguments. This must happen before
- // beginning FastISel on the entry block.
- if (LLVMBB == &Fn.getEntryBlock()) {
- ++NumEntryBlocks;
-
- // Lower any arguments needed in this block if this is the entry block.
- if (!FastIS->lowerArguments()) {
- // Fast isel failed to lower these arguments
- ++NumFastIselFailLowerArguments;
- if (EnableFastISelAbort > 1)
- report_fatal_error("FastISel didn't lower all arguments");
-
- // Use SelectionDAG argument lowering
- LowerArguments(Fn);
- CurDAG->setRoot(SDB->getControlRoot());
- SDB->clear();
- CodeGenAndEmitDAG();
- }
-
- // If we inserted any instructions at the beginning, make a note of
- // where they are, so we can be sure to emit subsequent instructions
- // after them.
- if (FuncInfo->InsertPt != FuncInfo->MBB->begin())
- FastIS->setLastLocalValue(&*std::prev(FuncInfo->InsertPt));
- else
- FastIS->setLastLocalValue(nullptr);
- }
+ if (LLVMBB != &Fn.getEntryBlock())
+ FastIS->startNewBlock();
unsigned NumFastIselRemaining = std::distance(Begin, End);
// Do FastISel on as many instructions as possible.
@@ -1432,7 +1378,8 @@ void SelectionDAGISel::SelectAllBasicBlocks(const Function &Fn) {
const Instruction *Inst = &*std::prev(BI);
// If we no longer require this instruction, skip it.
- if (isFoldedOrDeadInstruction(Inst, FuncInfo)) {
+ if (isFoldedOrDeadInstruction(Inst, FuncInfo) ||
+ ElidedArgCopyInstrs.count(Inst)) {
--NumFastIselRemaining;
continue;
}
@@ -1443,6 +1390,7 @@ void SelectionDAGISel::SelectAllBasicBlocks(const Function &Fn) {
// Try to select the instruction with FastISel.
if (FastIS->selectInstruction(Inst)) {
--NumFastIselRemaining;
++NumFastIselSuccess;
// If fast isel succeeded, skip over all the folded instructions, and
@@ -1465,22 +1413,22 @@ void SelectionDAGISel::SelectAllBasicBlocks(const Function &Fn) {
continue;
}
-#ifndef NDEBUG
- if (EnableFastISelVerbose2)
- collectFailStats(Inst);
-#endif
-
+ FastISelFailed = true;
+
// Then handle certain instructions as single-LLVM-Instruction blocks.
if (isa<CallInst>(Inst)) {
+ OptimizationRemarkMissed R("sdagisel", "FastISelFailure",
+ Inst->getDebugLoc(), LLVMBB);
+
+ R << "FastISel missed call";
+
+ if (R.isEnabled() || EnableFastISelAbort) {
+ std::string InstStrStorage;
+ raw_string_ostream InstStr(InstStrStorage);
+ InstStr << *Inst;
- if (EnableFastISelVerbose || EnableFastISelAbort) {
- dbgs() << "FastISel missed call: ";
- Inst->dump();
+ R << ": " << InstStr.str();
}
- if (EnableFastISelAbort > 2)
- // FastISel selector couldn't handle something and bailed.
- // For the purpose of debugging, just abort.
- report_fatal_error("FastISel didn't select the entire block");
+
+ reportFastISelFailure(*MF, *ORE, R, EnableFastISelAbort > 2);
if (!Inst->getType()->isVoidTy() && !Inst->getType()->isTokenTy() &&
!Inst->use_empty()) {
@@ -1509,35 +1457,35 @@ void SelectionDAGISel::SelectAllBasicBlocks(const Function &Fn) {
continue;
}
+ OptimizationRemarkMissed R("sdagisel", "FastISelFailure",
+ Inst->getDebugLoc(), LLVMBB);
+
bool ShouldAbort = EnableFastISelAbort;
- if (EnableFastISelVerbose || EnableFastISelAbort) {
- if (isa<TerminatorInst>(Inst)) {
- // Use a different message for terminator misses.
- dbgs() << "FastISel missed terminator: ";
- // Don't abort unless for terminator unless the level is really high
- ShouldAbort = (EnableFastISelAbort > 2);
- } else {
- dbgs() << "FastISel miss: ";
- }
- Inst->dump();
+ if (isa<TerminatorInst>(Inst)) {
+ // Use a different message for terminator misses.
+ R << "FastISel missed terminator";
+ // Don't abort for terminators unless the level is really high.
+ ShouldAbort = (EnableFastISelAbort > 2);
+ } else {
+ R << "FastISel missed";
+ }
+
+ if (R.isEnabled() || EnableFastISelAbort) {
+ std::string InstStrStorage;
+ raw_string_ostream InstStr(InstStrStorage);
+ InstStr << *Inst;
+ R << ": " << InstStr.str();
}
- if (ShouldAbort)
- // FastISel selector couldn't handle something and bailed.
- // For the purpose of debugging, just abort.
- report_fatal_error("FastISel didn't select the entire block");
+
+ reportFastISelFailure(*MF, *ORE, R, ShouldAbort);
NumFastIselFailures += NumFastIselRemaining;
break;
}
FastIS->recomputeInsertPt();
- } else {
- // Lower any arguments needed in this block if this is the entry block.
- if (LLVMBB == &Fn.getEntryBlock()) {
- ++NumEntryBlocks;
- LowerArguments(Fn);
- }
}
+
if (getAnalysis<StackProtector>().shouldEmitSDCheck(*LLVMBB)) {
bool FunctionBasedInstrumentation =
TLI->getSSPStackGuardCheck(*Fn.getParent());
@@ -1556,10 +1504,17 @@ void SelectionDAGISel::SelectAllBasicBlocks(const Function &Fn) {
// block.
bool HadTailCall;
SelectBasicBlock(Begin, BI, HadTailCall);
+
+ // But if FastISel was run, we already selected some of the block.
+ // If we emitted a tail-call, we need to delete any previously emitted
+ // instruction that follows it.
+ if (FastIS && HadTailCall && FuncInfo->InsertPt != FuncInfo->MBB->end())
+ FastIS->removeDeadCode(FuncInfo->InsertPt, FuncInfo->MBB->end());
}
FinishBasicBlock();
FuncInfo->PHINodesToUpdate.clear();
+ ElidedArgCopyInstrs.clear();
}
propagateSwiftErrorVRegs(FuncInfo);
@@ -2177,7 +2132,6 @@ bool SelectionDAGISel::IsLegalToFold(SDValue N, SDNode *U, SDNode *Root,
IgnoreChains = false;
}
-
SmallPtrSet<SDNode*, 16> Visited;
return !findNonImmUse(Root, N.getNode(), U, Root, Visited, IgnoreChains);
}
@@ -2554,7 +2508,7 @@ MorphNode(SDNode *Node, unsigned TargetOpc, SDVTList VTList,
LLVM_ATTRIBUTE_ALWAYS_INLINE static inline bool
CheckSame(const unsigned char *MatcherTable, unsigned &MatcherIndex,
SDValue N,
- const SmallVectorImpl<std::pair<SDValue, SDNode*> > &RecordedNodes) {
+ const SmallVectorImpl<std::pair<SDValue, SDNode*>> &RecordedNodes) {
// Accept if it is exactly the same as a previously recorded node.
unsigned RecNo = MatcherTable[MatcherIndex++];
assert(RecNo < RecordedNodes.size() && "Invalid CheckSame");
@@ -2564,9 +2518,9 @@ CheckSame(const unsigned char *MatcherTable, unsigned &MatcherIndex,
/// CheckChildSame - Implements OP_CheckChildXSame.
LLVM_ATTRIBUTE_ALWAYS_INLINE static inline bool
CheckChildSame(const unsigned char *MatcherTable, unsigned &MatcherIndex,
- SDValue N,
- const SmallVectorImpl<std::pair<SDValue, SDNode*> > &RecordedNodes,
- unsigned ChildNo) {
+ SDValue N,
+ const SmallVectorImpl<std::pair<SDValue, SDNode*>> &RecordedNodes,
+ unsigned ChildNo) {
if (ChildNo >= N.getNumOperands())
return false; // Match fails if out of range child #.
return ::CheckSame(MatcherTable, MatcherIndex, N.getOperand(ChildNo),
@@ -2688,7 +2642,7 @@ static unsigned IsPredicateKnownToFail(const unsigned char *Table,
unsigned Index, SDValue N,
bool &Result,
const SelectionDAGISel &SDISel,
- SmallVectorImpl<std::pair<SDValue, SDNode*> > &RecordedNodes) {
+ SmallVectorImpl<std::pair<SDValue, SDNode*>> &RecordedNodes) {
switch (Table[Index++]) {
default:
Result = false;
@@ -2756,6 +2710,7 @@ static unsigned IsPredicateKnownToFail(const unsigned char *Table,
}
namespace {
+
struct MatchScope {
/// FailIndex - If this match fails, this is the index to continue with.
unsigned FailIndex;
@@ -2785,6 +2740,7 @@ class MatchStateUpdater : public SelectionDAG::DAGUpdateListener
SDNode **NodeToMatch;
SmallVectorImpl<std::pair<SDValue, SDNode *>> &RecordedNodes;
SmallVectorImpl<MatchScope> &MatchScopes;
+
public:
MatchStateUpdater(SelectionDAG &DAG, SDNode **NodeToMatch,
SmallVectorImpl<std::pair<SDValue, SDNode *>> &RN,
@@ -2816,6 +2772,7 @@ public:
J.setNode(E);
}
};
+
} // end anonymous namespace
void SelectionDAGISel::SelectCodeCommon(SDNode *NodeToMatch,
@@ -2921,7 +2878,7 @@ void SelectionDAGISel::SelectCodeCommon(SDNode *NodeToMatch,
// with an OPC_SwitchOpcode instruction. Populate the table now, since this
// is the first time we're selecting an instruction.
unsigned Idx = 1;
- while (1) {
+ while (true) {
// Get the size of this case.
unsigned CaseSize = MatcherTable[Idx++];
if (CaseSize & 128)
@@ -2942,7 +2899,7 @@ void SelectionDAGISel::SelectCodeCommon(SDNode *NodeToMatch,
MatcherIndex = OpcodeOffset[N.getOpcode()];
}
- while (1) {
+ while (true) {
assert(MatcherIndex < TableSize && "Invalid index");
#ifndef NDEBUG
unsigned CurrentOpcodeIndex = MatcherIndex;
@@ -2957,7 +2914,7 @@ void SelectionDAGISel::SelectCodeCommon(SDNode *NodeToMatch,
// immediately fail, don't even bother pushing a scope for them.
unsigned FailIndex;
- while (1) {
+ while (true) {
unsigned NumToSkip = MatcherTable[MatcherIndex++];
if (NumToSkip & 128)
NumToSkip = GetVBR(NumToSkip, MatcherTable, MatcherIndex);
@@ -3118,7 +3075,7 @@ void SelectionDAGISel::SelectCodeCommon(SDNode *NodeToMatch,
unsigned CurNodeOpcode = N.getOpcode();
unsigned SwitchStart = MatcherIndex-1; (void)SwitchStart;
unsigned CaseSize;
- while (1) {
+ while (true) {
// Get the size of this case.
CaseSize = MatcherTable[MatcherIndex++];
if (CaseSize & 128)
@@ -3149,7 +3106,7 @@ void SelectionDAGISel::SelectCodeCommon(SDNode *NodeToMatch,
MVT CurNodeVT = N.getSimpleValueType();
unsigned SwitchStart = MatcherIndex-1; (void)SwitchStart;
unsigned CaseSize;
- while (1) {
+ while (true) {
// Get the size of this case.
CaseSize = MatcherTable[MatcherIndex++];
if (CaseSize & 128)
@@ -3215,7 +3172,7 @@ void SelectionDAGISel::SelectCodeCommon(SDNode *NodeToMatch,
// a single use.
bool HasMultipleUses = false;
for (unsigned i = 1, e = NodeStack.size()-1; i != e; ++i)
- if (!NodeStack[i].hasOneUse()) {
+ if (!NodeStack[i].getNode()->hasOneUse()) {
HasMultipleUses = true;
break;
}
@@ -3381,6 +3338,15 @@ void SelectionDAGISel::SelectCodeCommon(SDNode *NodeToMatch,
RecordedNodes.push_back(std::pair<SDValue,SDNode*>(Res, nullptr));
continue;
}
+ case OPC_Coverage: {
+ // This is emitted right before MorphNode/EmitNode.
+ // So it should be safe to assume that this node has been selected.
+ unsigned index = MatcherTable[MatcherIndex++];
+ index |= (MatcherTable[MatcherIndex++] << 8);
+ dbgs() << "COVERED: " << getPatternForIndex(index) << "\n";
+ dbgs() << "INCLUDED: " << getIncludePathForIndex(index) << "\n";
+ continue;
+ }
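The two matcher-table bytes above form one 16-bit pattern index, low byte first. A quick illustration of the decoding (table contents made up):

#include <cstdio>

int main() {
  const unsigned char MatcherTable[] = {0x34, 0x12}; // low byte, high byte
  unsigned Index = MatcherTable[0] | (MatcherTable[1] << 8);
  std::printf("pattern index = 0x%X\n", Index); // prints 0x1234
}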
case OPC_EmitNode: case OPC_MorphNodeTo:
case OPC_EmitNode0: case OPC_EmitNode1: case OPC_EmitNode2:
@@ -3473,7 +3439,6 @@ void SelectionDAGISel::SelectCodeCommon(SDNode *NodeToMatch,
RecordedNodes.push_back(std::pair<SDValue,SDNode*>(SDValue(Res, i),
nullptr));
}
-
} else {
assert(NodeToMatch->getOpcode() != ISD::DELETED_NODE &&
"NodeToMatch was removed partway through selection");
@@ -3610,7 +3575,7 @@ void SelectionDAGISel::SelectCodeCommon(SDNode *NodeToMatch,
// find a case to check.
DEBUG(dbgs() << " Match failed at index " << CurrentOpcodeIndex << "\n");
++NumDAGIselRetries;
- while (1) {
+ while (true) {
if (MatchScopes.empty()) {
CannotYetSelect(NodeToMatch);
return;
diff --git a/lib/CodeGen/SelectionDAG/TargetLowering.cpp b/lib/CodeGen/SelectionDAG/TargetLowering.cpp
index 690f0d2c8082..2756e276c6a9 100644
--- a/lib/CodeGen/SelectionDAG/TargetLowering.cpp
+++ b/lib/CodeGen/SelectionDAG/TargetLowering.cpp
@@ -55,14 +55,15 @@ bool TargetLowering::isInTailCallPosition(SelectionDAG &DAG, SDNode *Node,
// Conservatively require the attributes of the call to match those of
// the return. Ignore noalias because it doesn't affect the call sequence.
- AttributeSet CallerAttrs = F->getAttributes();
- if (AttrBuilder(CallerAttrs, AttributeSet::ReturnIndex)
- .removeAttribute(Attribute::NoAlias).hasAttributes())
+ AttributeList CallerAttrs = F->getAttributes();
+ if (AttrBuilder(CallerAttrs, AttributeList::ReturnIndex)
+ .removeAttribute(Attribute::NoAlias)
+ .hasAttributes())
return false;
// It's not safe to eliminate the sign / zero extension of the return value.
- if (CallerAttrs.hasAttribute(AttributeSet::ReturnIndex, Attribute::ZExt) ||
- CallerAttrs.hasAttribute(AttributeSet::ReturnIndex, Attribute::SExt))
+ if (CallerAttrs.hasAttribute(AttributeList::ReturnIndex, Attribute::ZExt) ||
+ CallerAttrs.hasAttribute(AttributeList::ReturnIndex, Attribute::SExt))
return false;
// Check if the only use is a function return node.
@@ -96,19 +97,20 @@ bool TargetLowering::parametersInCSRMatch(const MachineRegisterInfo &MRI,
/// \brief Set CallLoweringInfo attribute flags based on a call instruction
/// and called function attributes.
-void TargetLowering::ArgListEntry::setAttributes(ImmutableCallSite *CS,
- unsigned AttrIdx) {
- isSExt = CS->paramHasAttr(AttrIdx, Attribute::SExt);
- isZExt = CS->paramHasAttr(AttrIdx, Attribute::ZExt);
- isInReg = CS->paramHasAttr(AttrIdx, Attribute::InReg);
- isSRet = CS->paramHasAttr(AttrIdx, Attribute::StructRet);
- isNest = CS->paramHasAttr(AttrIdx, Attribute::Nest);
- isByVal = CS->paramHasAttr(AttrIdx, Attribute::ByVal);
- isInAlloca = CS->paramHasAttr(AttrIdx, Attribute::InAlloca);
- isReturned = CS->paramHasAttr(AttrIdx, Attribute::Returned);
- isSwiftSelf = CS->paramHasAttr(AttrIdx, Attribute::SwiftSelf);
- isSwiftError = CS->paramHasAttr(AttrIdx, Attribute::SwiftError);
- Alignment = CS->getParamAlignment(AttrIdx);
+void TargetLoweringBase::ArgListEntry::setAttributes(ImmutableCallSite *CS,
+ unsigned ArgIdx) {
+ IsSExt = CS->paramHasAttr(ArgIdx, Attribute::SExt);
+ IsZExt = CS->paramHasAttr(ArgIdx, Attribute::ZExt);
+ IsInReg = CS->paramHasAttr(ArgIdx, Attribute::InReg);
+ IsSRet = CS->paramHasAttr(ArgIdx, Attribute::StructRet);
+ IsNest = CS->paramHasAttr(ArgIdx, Attribute::Nest);
+ IsByVal = CS->paramHasAttr(ArgIdx, Attribute::ByVal);
+ IsInAlloca = CS->paramHasAttr(ArgIdx, Attribute::InAlloca);
+ IsReturned = CS->paramHasAttr(ArgIdx, Attribute::Returned);
+ IsSwiftSelf = CS->paramHasAttr(ArgIdx, Attribute::SwiftSelf);
+ IsSwiftError = CS->paramHasAttr(ArgIdx, Attribute::SwiftError);
+ // FIXME: getParamAlignment is off by one from argument index.
+ Alignment = CS->getParamAlignment(ArgIdx + 1);
}
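The FIXME above exists because attribute lists number the return value as slot 0 and the first parameter as slot 1, while ArgIdx here is zero-based; hence the +1. A toy illustration of the index mapping only, not the real API:

#include <cstdio>

int main() {
  // AttributeList slots: 0 = return value, 1..N = IR arguments 0..N-1.
  for (unsigned ArgIdx = 0; ArgIdx < 3; ++ArgIdx)
    std::printf("IR argument %u -> attribute slot %u\n", ArgIdx, ArgIdx + 1);
}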
/// Generate a libcall taking the given operands as arguments and returning a
@@ -125,8 +127,8 @@ TargetLowering::makeLibCall(SelectionDAG &DAG, RTLIB::Libcall LC, EVT RetVT,
for (SDValue Op : Ops) {
Entry.Node = Op;
Entry.Ty = Entry.Node.getValueType().getTypeForEVT(*DAG.getContext());
- Entry.isSExt = shouldSignExtendTypeInLibCall(Op.getValueType(), isSigned);
- Entry.isZExt = !shouldSignExtendTypeInLibCall(Op.getValueType(), isSigned);
+ Entry.IsSExt = shouldSignExtendTypeInLibCall(Op.getValueType(), isSigned);
+ Entry.IsZExt = !shouldSignExtendTypeInLibCall(Op.getValueType(), isSigned);
Args.push_back(Entry);
}
@@ -138,10 +140,13 @@ TargetLowering::makeLibCall(SelectionDAG &DAG, RTLIB::Libcall LC, EVT RetVT,
Type *RetTy = RetVT.getTypeForEVT(*DAG.getContext());
TargetLowering::CallLoweringInfo CLI(DAG);
bool signExtend = shouldSignExtendTypeInLibCall(RetVT, isSigned);
- CLI.setDebugLoc(dl).setChain(DAG.getEntryNode())
- .setCallee(getLibcallCallingConv(LC), RetTy, Callee, std::move(Args))
- .setNoReturn(doesNotReturn).setDiscardResult(!isReturnValueUsed)
- .setSExtResult(signExtend).setZExtResult(!signExtend);
+ CLI.setDebugLoc(dl)
+ .setChain(DAG.getEntryNode())
+ .setLibCallee(getLibcallCallingConv(LC), RetTy, Callee, std::move(Args))
+ .setNoReturn(doesNotReturn)
+ .setDiscardResult(!isReturnValueUsed)
+ .setSExtResult(signExtend)
+ .setZExtResult(!signExtend);
return LowerCallTo(CLI);
}
@@ -334,34 +339,35 @@ TargetLowering::isOffsetFoldingLegal(const GlobalAddressSDNode *GA) const {
// Optimization Methods
//===----------------------------------------------------------------------===//
-/// Check to see if the specified operand of the specified instruction is a
-/// constant integer. If so, check to see if there are any bits set in the
-/// constant that are not demanded. If so, shrink the constant and return true.
-bool TargetLowering::TargetLoweringOpt::ShrinkDemandedConstant(SDValue Op,
- const APInt &Demanded) {
- SDLoc dl(Op);
+/// If the specified instruction has a constant integer operand and there are
+/// bits set in that constant that are not demanded, then clear those bits and
+/// return true.
+bool TargetLowering::TargetLoweringOpt::ShrinkDemandedConstant(
+ SDValue Op, const APInt &Demanded) {
+ SDLoc DL(Op);
+ unsigned Opcode = Op.getOpcode();
// FIXME: ISD::SELECT, ISD::SELECT_CC
- switch (Op.getOpcode()) {
- default: break;
+ switch (Opcode) {
+ default:
+ break;
case ISD::XOR:
case ISD::AND:
case ISD::OR: {
- ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op.getOperand(1));
- if (!C) return false;
+ auto *Op1C = dyn_cast<ConstantSDNode>(Op.getOperand(1));
+ if (!Op1C)
+ return false;
- if (Op.getOpcode() == ISD::XOR &&
- (C->getAPIntValue() | (~Demanded)).isAllOnesValue())
+ // If this is a 'not' op, don't touch it because that's a canonical form.
+ const APInt &C = Op1C->getAPIntValue();
+ if (Opcode == ISD::XOR && (C | ~Demanded).isAllOnesValue())
return false;
- // if we can expand it to have all bits set, do it
- if (C->getAPIntValue().intersects(~Demanded)) {
+ if (C.intersects(~Demanded)) {
EVT VT = Op.getValueType();
- SDValue New = DAG.getNode(Op.getOpcode(), dl, VT, Op.getOperand(0),
- DAG.getConstant(Demanded &
- C->getAPIntValue(),
- dl, VT));
- return CombineTo(Op, New);
+ SDValue NewC = DAG.getConstant(Demanded & C, DL, VT);
+ SDValue NewOp = DAG.getNode(Opcode, DL, VT, Op.getOperand(0), NewC);
+ return CombineTo(Op, NewOp);
}
break;
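A worked miniature of the transform, with plain 64-bit masks standing in for APInt (illustrative only):

#include <cstdint>
#include <cstdio>

// If the constant has bits set outside the demanded mask, clear them.
// For XOR, a constant that is all-ones over the demanded bits acts as a
// 'not' and is deliberately left alone (canonical form).
static bool shrinkDemandedConstant(uint64_t &C, uint64_t Demanded, bool IsXor) {
  if (IsXor && (C | ~Demanded) == ~0ULL)
    return false;
  if ((C & ~Demanded) == 0)
    return false; // no undemanded bit is set
  C &= Demanded;
  return true;
}

int main() {
  uint64_t C = 0xFF; // e.g. (and x, 0xFF) where only the low 4 bits matter
  bool Changed = shrinkDemandedConstant(C, /*Demanded=*/0xF, /*IsXor=*/false);
  std::printf("changed=%d, constant is now 0x%llX\n", Changed,
              (unsigned long long)C); // changed=1, 0xF
}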
@@ -470,6 +476,21 @@ TargetLowering::TargetLoweringOpt::SimplifyDemandedBits(SDNode *User,
return true;
}
+bool TargetLowering::SimplifyDemandedBits(SDValue Op, APInt &DemandedMask,
+ DAGCombinerInfo &DCI) const {
+
+ SelectionDAG &DAG = DCI.DAG;
+ TargetLoweringOpt TLO(DAG, !DCI.isBeforeLegalize(),
+ !DCI.isBeforeLegalizeOps());
+ APInt KnownZero, KnownOne;
+
+ bool Simplified = SimplifyDemandedBits(Op, DemandedMask, KnownZero, KnownOne,
+ TLO);
+ if (Simplified)
+ DCI.CommitTargetLoweringOpt(TLO);
+ return Simplified;
+}
+
/// Look at Op. At this point, we know that only the DemandedMask bits of the
/// result of Op are ever used downstream. If we can use this information to
/// simplify Op, create a new simplified DAG node and return true, returning the
@@ -711,8 +732,8 @@ bool TargetLowering::SimplifyDemandedBits(SDValue Op,
}
}
- KnownZero = KnownZeroOut;
- KnownOne = KnownOneOut;
+ KnownZero = std::move(KnownZeroOut);
+ KnownOne = std::move(KnownOneOut);
break;
case ISD::SELECT:
if (SimplifyDemandedBits(Op.getOperand(2), NewMask, KnownZero,
@@ -750,6 +771,33 @@ bool TargetLowering::SimplifyDemandedBits(SDValue Op,
KnownOne &= KnownOne2;
KnownZero &= KnownZero2;
break;
+ case ISD::SETCC: {
+ SDValue Op0 = Op.getOperand(0);
+ SDValue Op1 = Op.getOperand(1);
+ ISD::CondCode CC = cast<CondCodeSDNode>(Op.getOperand(2))->get();
+ // If (1) we only need the sign-bit, (2) the setcc operands are the same
+ // width as the setcc result, and (3) the result of a setcc conforms to 0 or
+ // -1, we may be able to bypass the setcc.
+ if (NewMask.isSignBit() && Op0.getScalarValueSizeInBits() == BitWidth &&
+ getBooleanContents(Op.getValueType()) ==
+ BooleanContent::ZeroOrNegativeOneBooleanContent) {
+ // If we're testing X < 0, then this compare isn't needed - just use X!
+ // FIXME: We're limiting to integer types here, but this should also work
+ // if we don't care about FP signed-zero. The use of SETLT with FP means
+ // that we don't care about NaNs.
+ if (CC == ISD::SETLT && Op1.getValueType().isInteger() &&
+ (isNullConstant(Op1) || ISD::isBuildVectorAllZeros(Op1.getNode())))
+ return TLO.CombineTo(Op, Op0);
+
+ // TODO: Should we check for other forms of sign-bit comparisons?
+ // Examples: X <= -1, X >= 0
+ }
+ if (getBooleanContents(Op0.getValueType()) ==
+ TargetLowering::ZeroOrOneBooleanContent &&
+ BitWidth > 1)
+ KnownZero.setBitsFrom(1);
+ break;
+ }
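A small demonstration of why the bypass is sound under zero-or-negative-one booleans: the materialized compare result and X always agree on the sign bit (plain C++, illustrative only):

#include <cstdint>
#include <cstdio>

int main() {
  for (int32_t X : {-7, -1, 0, 1, 42}) {
    // ZeroOrNegativeOneBooleanContent: setlt(X, 0) materializes as -1 or 0.
    int32_t SetCC = (X < 0) ? -1 : 0;
    std::printf("X=%3d  signbit(setcc)=%u  signbit(X)=%u\n", X,
                (uint32_t)SetCC >> 31, (uint32_t)X >> 31); // always equal
  }
}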
case ISD::SHL:
if (ConstantSDNode *SA = dyn_cast<ConstantSDNode>(Op.getOperand(1))) {
unsigned ShAmt = SA->getZExtValue();
@@ -834,7 +882,7 @@ bool TargetLowering::SimplifyDemandedBits(SDValue Op,
KnownZero <<= SA->getZExtValue();
KnownOne <<= SA->getZExtValue();
// low bits known zero.
- KnownZero |= APInt::getLowBitsSet(BitWidth, SA->getZExtValue());
+ KnownZero.setLowBits(SA->getZExtValue());
}
break;
case ISD::SRL:
@@ -853,7 +901,7 @@ bool TargetLowering::SimplifyDemandedBits(SDValue Op,
// If the shift is exact, then it does demand the low bits (and knows that
// they are zero).
if (cast<BinaryWithFlagsSDNode>(Op)->Flags.hasExact())
- InDemandedMask |= APInt::getLowBitsSet(BitWidth, ShAmt);
+ InDemandedMask.setLowBits(ShAmt);
// If this is ((X << C1) >>u ShAmt), see if we can simplify this into a
// single shift. We can do this if the top bits (which are shifted out)
@@ -884,8 +932,7 @@ bool TargetLowering::SimplifyDemandedBits(SDValue Op,
KnownZero = KnownZero.lshr(ShAmt);
KnownOne = KnownOne.lshr(ShAmt);
- APInt HighBits = APInt::getHighBitsSet(BitWidth, ShAmt);
- KnownZero |= HighBits; // High bits known zero.
+ KnownZero.setHighBits(ShAmt); // High bits known zero.
}
break;
case ISD::SRA:
@@ -911,7 +958,7 @@ bool TargetLowering::SimplifyDemandedBits(SDValue Op,
// If the shift is exact, then it does demand the low bits (and knows that
// they are zero).
if (cast<BinaryWithFlagsSDNode>(Op)->Flags.hasExact())
- InDemandedMask |= APInt::getLowBitsSet(BitWidth, ShAmt);
+ InDemandedMask.setLowBits(ShAmt);
// If any of the demanded bits are produced by the sign extension, we also
// demand the input sign bit.
@@ -1075,7 +1122,7 @@ bool TargetLowering::SimplifyDemandedBits(SDValue Op,
EVT InVT = Op.getOperand(0).getValueType();
unsigned InBits = InVT.getScalarSizeInBits();
APInt InMask = APInt::getLowBitsSet(BitWidth, InBits);
- APInt InSignBit = APInt::getBitsSet(BitWidth, InBits - 1, InBits);
+ APInt InSignBit = APInt::getOneBitSet(BitWidth, InBits - 1);
APInt NewBits = ~InMask & NewMask;
// If none of the top bits are demanded, convert this into an any_extend.
@@ -1191,7 +1238,7 @@ bool TargetLowering::SimplifyDemandedBits(SDValue Op,
return true;
assert((KnownZero & KnownOne) == 0 && "Bits known to be one AND zero?");
- KnownZero |= ~InMask & NewMask;
+ KnownZero |= ~InMask;
break;
}
case ISD::BITCAST:
@@ -1281,6 +1328,7 @@ bool TargetLowering::SimplifyDemandedBits(SDValue Op,
void TargetLowering::computeKnownBitsForTargetNode(const SDValue Op,
APInt &KnownZero,
APInt &KnownOne,
+ const APInt &DemandedElts,
const SelectionDAG &DAG,
unsigned Depth) const {
assert((Op.getOpcode() >= ISD::BUILTIN_OP_END ||
@@ -1295,6 +1343,7 @@ void TargetLowering::computeKnownBitsForTargetNode(const SDValue Op,
/// This method can be implemented by targets that want to expose additional
/// information about sign bits to the DAG Combiner.
unsigned TargetLowering::ComputeNumSignBitsForTargetNode(SDValue Op,
+ const APInt &,
const SelectionDAG &,
unsigned Depth) const {
assert((Op.getOpcode() >= ISD::BUILTIN_OP_END ||
@@ -2050,6 +2099,16 @@ SDValue TargetLowering::SimplifySetCC(EVT VT, SDValue N0, SDValue N1,
if (Cond == ISD::SETO || Cond == ISD::SETUO)
return DAG.getSetCC(dl, VT, N0, N0, Cond);
+ // setcc (fneg x), C -> setcc swap(pred) x, -C
+ if (N0.getOpcode() == ISD::FNEG) {
+ ISD::CondCode SwapCond = ISD::getSetCCSwappedOperands(Cond);
+ if (DCI.isBeforeLegalizeOps() ||
+ isCondCodeLegal(SwapCond, N0.getSimpleValueType())) {
+ SDValue NegN1 = DAG.getNode(ISD::FNEG, dl, N0.getValueType(), N1);
+ return DAG.getSetCC(dl, VT, N0.getOperand(0), NegN1, SwapCond);
+ }
+ }
+
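A quick check of the identity behind the fold above: for IEEE doubles, (-x < c) and (x > -c) agree on every input, including NaNs and signed zeros (illustrative only):

#include <cmath>
#include <cstdio>

int main() {
  const double Tests[][2] = {
      {1.5, 2.0}, {-3.0, 2.0}, {0.0, -0.0}, {NAN, 1.0}, {1.0, NAN}};
  for (const auto &T : Tests) {
    double X = T[0], C = T[1];
    bool Before = (-X < C); // setcc (fneg x), C, setlt
    bool After = (X > -C);  // setcc x, (fneg C), setgt -- swapped predicate
    std::printf("x=%g c=%g  before=%d after=%d\n", X, C, Before, After);
  }
}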
// If the condition is not legal, see if we can find an equivalent one
// which is legal.
if (!isCondCodeLegal(Cond, N0.getSimpleValueType())) {
@@ -2470,10 +2529,7 @@ TargetLowering::getRegForInlineAsmConstraint(const TargetRegisterInfo *RI,
std::make_pair(0u, static_cast<const TargetRegisterClass*>(nullptr));
// Figure out which register class contains this reg.
- for (TargetRegisterInfo::regclass_iterator RCI = RI->regclass_begin(),
- E = RI->regclass_end(); RCI != E; ++RCI) {
- const TargetRegisterClass *RC = *RCI;
-
+ for (const TargetRegisterClass *RC : RI->regclasses()) {
// If none of the value types for this register class are valid, we
// can't use it. For example, 64-bit reg classes on 32-bit targets.
if (!isLegalRC(RC))
@@ -2933,7 +2989,7 @@ static SDValue BuildExactSDIV(const TargetLowering &TLI, SDValue Op1, APInt d,
SDValue TargetLowering::BuildSDIVPow2(SDNode *N, const APInt &Divisor,
SelectionDAG &DAG,
std::vector<SDNode *> *Created) const {
- AttributeSet Attr = DAG.getMachineFunction().getFunction()->getAttributes();
+ AttributeList Attr = DAG.getMachineFunction().getFunction()->getAttributes();
const TargetLowering &TLI = DAG.getTargetLoweringInfo();
if (TLI.isIntDivCheap(N->getValueType(0), Attr))
return SDValue(N,0); // Lower SDIV as SDIV
@@ -3808,7 +3864,7 @@ SDValue TargetLowering::LowerToTLSEmulatedModel(const GlobalAddressSDNode *GA,
TargetLowering::CallLoweringInfo CLI(DAG);
CLI.setDebugLoc(dl).setChain(DAG.getEntryNode());
- CLI.setCallee(CallingConv::C, VoidPtrType, EmuTlsGetAddr, std::move(Args));
+ CLI.setLibCallee(CallingConv::C, VoidPtrType, EmuTlsGetAddr, std::move(Args));
std::pair<SDValue, SDValue> CallResult = LowerCallTo(CLI);
// TLSADDR will be codegen'ed as call. Inform MFI that function has calls.
diff --git a/lib/CodeGen/SjLjEHPrepare.cpp b/lib/CodeGen/SjLjEHPrepare.cpp
index 209bbe54ea23..ab578df4069d 100644
--- a/lib/CodeGen/SjLjEHPrepare.cpp
+++ b/lib/CodeGen/SjLjEHPrepare.cpp
@@ -64,6 +64,7 @@ public:
private:
bool setupEntryBlockAndCallSites(Function &F);
+ bool undoSwiftErrorSelect(Function &F);
void substituteLPadValues(LandingPadInst *LPI, Value *ExnVal, Value *SelVal);
Value *setupFunctionContext(Function &F, ArrayRef<LandingPadInst *> LPads);
void lowerIncomingArguments(Function &F);
@@ -174,8 +175,8 @@ Value *SjLjEHPrepare::setupFunctionContext(Function &F,
// because the value needs to be added to the global context list.
auto &DL = F.getParent()->getDataLayout();
unsigned Align = DL.getPrefTypeAlignment(FunctionContextTy);
- FuncCtx = new AllocaInst(FunctionContextTy, nullptr, Align, "fn_context",
- &EntryBB->front());
+ FuncCtx = new AllocaInst(FunctionContextTy, DL.getAllocaAddrSpace(),
+ nullptr, Align, "fn_context", &EntryBB->front());
// Fill in the function context structure.
for (LandingPadInst *LPI : LPads) {
@@ -458,14 +459,33 @@ bool SjLjEHPrepare::setupEntryBlockAndCallSites(Function &F) {
return true;
}
+bool SjLjEHPrepare::undoSwiftErrorSelect(Function &F) {
+ // We have inserted dummy copies 'select true, arg, undef' in the entry block
+ // for arguments to simplify this pass.
+ // swifterror arguments cannot be used in this way. Undo the select for the
+ // swifterror argument.
+ for (auto &AI : F.args()) {
+ if (AI.isSwiftError()) {
+ assert(AI.hasOneUse() && "Must have converted the argument to a select");
+ auto *Select = dyn_cast<SelectInst>(AI.use_begin()->getUser());
+ assert(Select && "There must be single select user");
+ auto *OrigSwiftError = cast<Argument>(Select->getTrueValue());
+ Select->replaceAllUsesWith(OrigSwiftError);
+ Select->eraseFromParent();
+ return true;
+ }
+ }
+ return false;
+}
+
bool SjLjEHPrepare::runOnFunction(Function &F) {
Module &M = *F.getParent();
RegisterFn = M.getOrInsertFunction(
"_Unwind_SjLj_Register", Type::getVoidTy(M.getContext()),
- PointerType::getUnqual(FunctionContextTy), nullptr);
+ PointerType::getUnqual(FunctionContextTy));
UnregisterFn = M.getOrInsertFunction(
"_Unwind_SjLj_Unregister", Type::getVoidTy(M.getContext()),
- PointerType::getUnqual(FunctionContextTy), nullptr);
+ PointerType::getUnqual(FunctionContextTy));
FrameAddrFn = Intrinsic::getDeclaration(&M, Intrinsic::frameaddress);
StackAddrFn = Intrinsic::getDeclaration(&M, Intrinsic::stacksave);
StackRestoreFn = Intrinsic::getDeclaration(&M, Intrinsic::stackrestore);
@@ -476,5 +496,7 @@ bool SjLjEHPrepare::runOnFunction(Function &F) {
FuncCtxFn = Intrinsic::getDeclaration(&M, Intrinsic::eh_sjlj_functioncontext);
bool Res = setupEntryBlockAndCallSites(F);
+ if (Res)
+ Res |= undoSwiftErrorSelect(F);
return Res;
}
diff --git a/lib/CodeGen/SlotIndexes.cpp b/lib/CodeGen/SlotIndexes.cpp
index dba103e9bfb1..bc2a1d09056b 100644
--- a/lib/CodeGen/SlotIndexes.cpp
+++ b/lib/CodeGen/SlotIndexes.cpp
@@ -103,6 +103,48 @@ bool SlotIndexes::runOnMachineFunction(MachineFunction &fn) {
return false;
}
+void SlotIndexes::removeMachineInstrFromMaps(MachineInstr &MI) {
+ assert(!MI.isBundledWithPred() &&
+ "Use removeSingleMachineInstrFromMaps() instread");
+ Mi2IndexMap::iterator mi2iItr = mi2iMap.find(&MI);
+ if (mi2iItr == mi2iMap.end())
+ return;
+
+ SlotIndex MIIndex = mi2iItr->second;
+ IndexListEntry &MIEntry = *MIIndex.listEntry();
+ assert(MIEntry.getInstr() == &MI && "Instruction indexes broken.");
+ mi2iMap.erase(mi2iItr);
+ // FIXME: Eventually we want to actually delete these indexes.
+ MIEntry.setInstr(nullptr);
+}
+
+void SlotIndexes::removeSingleMachineInstrFromMaps(MachineInstr &MI) {
+ Mi2IndexMap::iterator mi2iItr = mi2iMap.find(&MI);
+ if (mi2iItr == mi2iMap.end())
+ return;
+
+ SlotIndex MIIndex = mi2iItr->second;
+ IndexListEntry &MIEntry = *MIIndex.listEntry();
+ assert(MIEntry.getInstr() == &MI && "Instruction indexes broken.");
+ mi2iMap.erase(mi2iItr);
+
+ // When removing the first instruction of a bundle, update the mapping to
+ // the next instruction.
+ if (MI.isBundledWithSucc()) {
+ // Only the first instruction of a bundle should have an index assigned.
+ assert(!MI.isBundledWithPred() && "Should have first bundle isntruction");
+
+ MachineBasicBlock::instr_iterator Next = std::next(MI.getIterator());
+ MachineInstr &NextMI = *Next;
+ MIEntry.setInstr(&NextMI);
+ mi2iMap.insert(std::make_pair(&NextMI, MIIndex));
+ return;
+ } else {
+ // FIXME: Eventually we want to actually delete these indexes.
+ MIEntry.setInstr(nullptr);
+ }
+}
+
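In miniature, the bundle case above just moves the slot mapping from the removed head to its bundled successor (toy data structures, not the real SlotIndexes API):

#include <cstdio>
#include <map>

struct Instr { const char *Name; Instr *BundledSucc; };

int main() {
  Instr Tail{"tail", nullptr};
  Instr Head{"head", &Tail}; // only the bundle head carries an index
  std::map<Instr *, unsigned> Mi2Index{{&Head, 16}};

  // removeSingleMachineInstrFromMaps(Head), reduced to its essence:
  unsigned Idx = Mi2Index[&Head];
  Mi2Index.erase(&Head);
  if (Head.BundledSucc)
    Mi2Index[Head.BundledSucc] = Idx; // the bundle keeps its slot index

  std::printf("tail is now at index %u\n", Mi2Index[&Tail]); // 16
}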
void SlotIndexes::renumberIndexes() {
// Renumber updates the index of every element of the index list.
DEBUG(dbgs() << "\n*** Renumbering SlotIndexes ***\n");
diff --git a/lib/CodeGen/SplitKit.cpp b/lib/CodeGen/SplitKit.cpp
index 1c6a84e53944..3a50aaa69985 100644
--- a/lib/CodeGen/SplitKit.cpp
+++ b/lib/CodeGen/SplitKit.cpp
@@ -23,6 +23,7 @@
#include "llvm/CodeGen/MachineRegisterInfo.h"
#include "llvm/CodeGen/VirtRegMap.h"
#include "llvm/Support/Debug.h"
+#include "llvm/Support/MathExtras.h"
#include "llvm/Support/raw_ostream.h"
#include "llvm/Target/TargetInstrInfo.h"
#include "llvm/Target/TargetMachine.h"
@@ -487,12 +488,126 @@ void SplitEditor::forceRecompute(unsigned RegIdx, const VNInfo *ParentVNI) {
VFP = ValueForcePair(nullptr, true);
}
+SlotIndex SplitEditor::buildSingleSubRegCopy(unsigned FromReg, unsigned ToReg,
+ MachineBasicBlock &MBB, MachineBasicBlock::iterator InsertBefore,
+ unsigned SubIdx, LiveInterval &DestLI, bool Late, SlotIndex Def) {
+ const MCInstrDesc &Desc = TII.get(TargetOpcode::COPY);
+ bool FirstCopy = !Def.isValid();
+ MachineInstr *CopyMI = BuildMI(MBB, InsertBefore, DebugLoc(), Desc)
+ .addReg(ToReg, RegState::Define | getUndefRegState(FirstCopy)
+ | getInternalReadRegState(!FirstCopy), SubIdx)
+ .addReg(FromReg, 0, SubIdx);
+
+ BumpPtrAllocator &Allocator = LIS.getVNInfoAllocator();
+ if (FirstCopy) {
+ SlotIndexes &Indexes = *LIS.getSlotIndexes();
+ Def = Indexes.insertMachineInstrInMaps(*CopyMI, Late).getRegSlot();
+ } else {
+ CopyMI->bundleWithPred();
+ }
+ LaneBitmask LaneMask = TRI.getSubRegIndexLaneMask(SubIdx);
+ DestLI.refineSubRanges(Allocator, LaneMask,
+ [Def, &Allocator](LiveInterval::SubRange& SR) {
+ SR.createDeadDef(Def, Allocator);
+ });
+ return Def;
+}
+
+SlotIndex SplitEditor::buildCopy(unsigned FromReg, unsigned ToReg,
+ LaneBitmask LaneMask, MachineBasicBlock &MBB,
+ MachineBasicBlock::iterator InsertBefore, bool Late, unsigned RegIdx) {
+ const MCInstrDesc &Desc = TII.get(TargetOpcode::COPY);
+ if (LaneMask.all() || LaneMask == MRI.getMaxLaneMaskForVReg(FromReg)) {
+ // The full vreg is copied.
+ MachineInstr *CopyMI =
+ BuildMI(MBB, InsertBefore, DebugLoc(), Desc, ToReg).addReg(FromReg);
+ SlotIndexes &Indexes = *LIS.getSlotIndexes();
+ return Indexes.insertMachineInstrInMaps(*CopyMI, Late).getRegSlot();
+ }
+
+ // Only a subset of lanes needs to be copied. The following is a simple
+ // heuristic to construct a sequence of COPYs. We could add a target
+ // specific callback if this turns out to be suboptimal.
+ LiveInterval &DestLI = LIS.getInterval(Edit->get(RegIdx));
+
+ // First pass: Try to find a perfectly matching subregister index. If none
+ // exists find the one covering the most lanemask bits.
+ SmallVector<unsigned, 8> PossibleIndexes;
+ unsigned BestIdx = 0;
+ unsigned BestCover = 0;
+ const TargetRegisterClass *RC = MRI.getRegClass(FromReg);
+ assert(RC == MRI.getRegClass(ToReg) && "Should have same reg class");
+ for (unsigned Idx = 1, E = TRI.getNumSubRegIndices(); Idx < E; ++Idx) {
+ // Is this index even compatible with the given class?
+ if (TRI.getSubClassWithSubReg(RC, Idx) != RC)
+ continue;
+ LaneBitmask SubRegMask = TRI.getSubRegIndexLaneMask(Idx);
+ // Early exit if we found a perfect match.
+ if (SubRegMask == LaneMask) {
+ BestIdx = Idx;
+ break;
+ }
+
+ // The index must not cover any lanes outside \p LaneMask.
+ if ((SubRegMask & ~LaneMask).any())
+ continue;
+
+ unsigned PopCount = countPopulation(SubRegMask.getAsInteger());
+ PossibleIndexes.push_back(Idx);
+ if (PopCount > BestCover) {
+ BestCover = PopCount;
+ BestIdx = Idx;
+ }
+ }
+
+ // Abort if we cannot possibly implement the COPY with the given indexes.
+ if (BestIdx == 0)
+ report_fatal_error("Impossible to implement partial COPY");
+
+ SlotIndex Def = buildSingleSubRegCopy(FromReg, ToReg, MBB, InsertBefore,
+ BestIdx, DestLI, Late, SlotIndex());
+
+ // Greedy heuristic: keep iterating, picking the best covering subreg index
+ // each time.
+ LaneBitmask LanesLeft =
+ LaneMask & ~(TRI.getSubRegIndexLaneMask(BestIdx));
+ while (LanesLeft.any()) {
+ unsigned BestIdx = 0;
+ int BestCover = INT_MIN;
+ for (unsigned Idx : PossibleIndexes) {
+ LaneBitmask SubRegMask = TRI.getSubRegIndexLaneMask(Idx);
+ // Early exit if we found a perfect match.
+ if (SubRegMask == LanesLeft) {
+ BestIdx = Idx;
+ break;
+ }
+
+ // Try to cover as many of the remaining lanes as possible while using
+ // as few of the already covered lanes as possible.
+ int Cover = countPopulation((SubRegMask & LanesLeft).getAsInteger())
+ - countPopulation((SubRegMask & ~LanesLeft).getAsInteger());
+ if (Cover > BestCover) {
+ BestCover = Cover;
+ BestIdx = Idx;
+ }
+ }
+
+ if (BestIdx == 0)
+ report_fatal_error("Impossible to implement partial COPY");
+
+ buildSingleSubRegCopy(FromReg, ToReg, MBB, InsertBefore, BestIdx,
+ DestLI, Late, Def);
+ LanesLeft &= ~TRI.getSubRegIndexLaneMask(BestIdx);
+ }
+
+ return Def;
+}
+
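The scoring rule in the second loop (newly covered lanes minus lanes outside the remaining set) is easy to exercise in isolation; a standalone sketch with made-up lane masks rather than the real TRI tables:

#include <bitset>
#include <climits>
#include <cstdint>
#include <cstdio>
#include <vector>

static int popcount(uint32_t V) { return (int)std::bitset<32>(V).count(); }

int main() {
  uint32_t LanesLeft = 0b1111; // lanes still to copy
  const std::vector<uint32_t> Candidates{0b0011, 0b0110, 0b1100};
  while (LanesLeft) {
    uint32_t Best = 0;
    int BestCover = INT_MIN;
    for (uint32_t M : Candidates) {
      int Cover = popcount(M & LanesLeft) - popcount(M & ~LanesLeft);
      if (Cover > BestCover) { BestCover = Cover; Best = M; }
    }
    if ((Best & LanesLeft) == 0)
      break; // cannot make progress; the real code reports a fatal error
    std::printf("emit COPY for lanes 0x%X\n", Best); // 0x3, then 0xC
    LanesLeft &= ~Best;
  }
}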
VNInfo *SplitEditor::defFromParent(unsigned RegIdx,
VNInfo *ParentVNI,
SlotIndex UseIdx,
MachineBasicBlock &MBB,
MachineBasicBlock::iterator I) {
- MachineInstr *CopyMI = nullptr;
SlotIndex Def;
LiveInterval *LI = &LIS.getInterval(Edit->get(RegIdx));
@@ -505,24 +620,29 @@ VNInfo *SplitEditor::defFromParent(unsigned RegIdx,
LiveInterval &OrigLI = LIS.getInterval(Original);
VNInfo *OrigVNI = OrigLI.getVNInfoAt(UseIdx);
+ unsigned Reg = LI->reg;
bool DidRemat = false;
if (OrigVNI) {
LiveRangeEdit::Remat RM(ParentVNI);
RM.OrigMI = LIS.getInstructionFromIndex(OrigVNI->def);
if (Edit->canRematerializeAt(RM, OrigVNI, UseIdx, true)) {
- Def = Edit->rematerializeAt(MBB, I, LI->reg, RM, TRI, Late);
+ Def = Edit->rematerializeAt(MBB, I, Reg, RM, TRI, Late);
++NumRemats;
DidRemat = true;
}
}
if (!DidRemat) {
- // Can't remat, just insert a copy from parent.
- CopyMI = BuildMI(MBB, I, DebugLoc(), TII.get(TargetOpcode::COPY), LI->reg)
- .addReg(Edit->getReg());
- Def = LIS.getSlotIndexes()
- ->insertMachineInstrInMaps(*CopyMI, Late)
- .getRegSlot();
+ LaneBitmask LaneMask;
+ if (LI->hasSubRanges()) {
+ LaneMask = LaneBitmask::getNone();
+ for (LiveInterval::SubRange &S : LI->subranges())
+ LaneMask |= S.LaneMask;
+ } else {
+ LaneMask = LaneBitmask::getAll();
+ }
+
++NumCopies;
+ Def = buildCopy(Edit->getReg(), Reg, LaneMask, MBB, I, Late, RegIdx);
}
// Define the value in Reg.
diff --git a/lib/CodeGen/SplitKit.h b/lib/CodeGen/SplitKit.h
index a75738aaf446..9d409e924a3d 100644
--- a/lib/CodeGen/SplitKit.h
+++ b/lib/CodeGen/SplitKit.h
@@ -405,6 +405,17 @@ private:
/// deleteRematVictims - Delete defs that are dead after rematerializing.
void deleteRematVictims();
+ /// Add a copy instruction copying \p FromReg to \p ToReg before
+ /// \p InsertBefore. This can be invoked with a \p LaneMask which may make it
+ /// necessary to construct a sequence of copies to cover it exactly.
+ SlotIndex buildCopy(unsigned FromReg, unsigned ToReg, LaneBitmask LaneMask,
+ MachineBasicBlock &MBB, MachineBasicBlock::iterator InsertBefore,
+ bool Late, unsigned RegIdx);
+
+ SlotIndex buildSingleSubRegCopy(unsigned FromReg, unsigned ToReg,
+ MachineBasicBlock &MB, MachineBasicBlock::iterator InsertBefore,
+ unsigned SubIdx, LiveInterval &DestLI, bool Late, SlotIndex PrevCopy);
+
public:
/// Create a new SplitEditor for editing the LiveInterval analyzed by SA.
/// Newly created intervals will be appended to newIntervals.
diff --git a/lib/CodeGen/StackColoring.cpp b/lib/CodeGen/StackColoring.cpp
index 89c4b574f17f..f51d959a089a 100644
--- a/lib/CodeGen/StackColoring.cpp
+++ b/lib/CodeGen/StackColoring.cpp
@@ -23,7 +23,6 @@
#include "llvm/ADT/BitVector.h"
#include "llvm/ADT/DepthFirstIterator.h"
-#include "llvm/ADT/PostOrderIterator.h"
#include "llvm/ADT/SetVector.h"
#include "llvm/ADT/SmallPtrSet.h"
#include "llvm/ADT/Statistic.h"
@@ -385,14 +384,13 @@ void StackColoring::getAnalysisUsage(AnalysisUsage &AU) const {
MachineFunctionPass::getAnalysisUsage(AU);
}
-#ifndef NDEBUG
-
+#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
LLVM_DUMP_METHOD void StackColoring::dumpBV(const char *tag,
const BitVector &BV) const {
- DEBUG(dbgs() << tag << " : { ");
+ dbgs() << tag << " : { ";
for (unsigned I = 0, E = BV.size(); I != E; ++I)
- DEBUG(dbgs() << BV.test(I) << " ");
- DEBUG(dbgs() << "}\n");
+ dbgs() << BV.test(I) << " ";
+ dbgs() << "}\n";
}
LLVM_DUMP_METHOD void StackColoring::dumpBB(MachineBasicBlock *MBB) const {
@@ -408,20 +406,19 @@ LLVM_DUMP_METHOD void StackColoring::dumpBB(MachineBasicBlock *MBB) const {
LLVM_DUMP_METHOD void StackColoring::dump() const {
for (MachineBasicBlock *MBB : depth_first(MF)) {
- DEBUG(dbgs() << "Inspecting block #" << MBB->getNumber() << " ["
- << MBB->getName() << "]\n");
- DEBUG(dumpBB(MBB));
+ dbgs() << "Inspecting block #" << MBB->getNumber() << " ["
+ << MBB->getName() << "]\n";
+ dumpBB(MBB);
}
}
LLVM_DUMP_METHOD void StackColoring::dumpIntervals() const {
for (unsigned I = 0, E = Intervals.size(); I != E; ++I) {
- DEBUG(dbgs() << "Interval[" << I << "]:\n");
- DEBUG(Intervals[I]->dump());
+ dbgs() << "Interval[" << I << "]:\n";
+ Intervals[I]->dump();
}
}
-
-#endif // not NDEBUG
+#endif
static inline int getStartOrEndSlot(const MachineInstr &MI)
{
@@ -570,9 +567,8 @@ unsigned StackColoring::collectMarkers(unsigned NumSlot)
// Step 2: compute begin/end sets for each block
- // NOTE: We use a reverse-post-order iteration to ensure that we obtain a
- // deterministic numbering, and because we'll need a post-order iteration
- // later for solving the liveness dataflow problem.
+ // NOTE: We use a depth-first iteration to ensure that we obtain a
+ // deterministic numbering.
for (MachineBasicBlock *MBB : depth_first(MF)) {
// Assign a serial number to this basic block.
diff --git a/lib/CodeGen/StackMaps.cpp b/lib/CodeGen/StackMaps.cpp
index 9b7dd400fc92..1a8ec5bff322 100644
--- a/lib/CodeGen/StackMaps.cpp
+++ b/lib/CodeGen/StackMaps.cpp
@@ -1,4 +1,4 @@
-//===---------------------------- StackMaps.cpp ---------------------------===//
+//===- StackMaps.cpp ------------------------------------------------------===//
//
// The LLVM Compiler Infrastructure
//
@@ -7,23 +7,34 @@
//
//===----------------------------------------------------------------------===//
-#include "llvm/CodeGen/StackMaps.h"
+#include "llvm/ADT/DenseMapInfo.h"
+#include "llvm/ADT/STLExtras.h"
+#include "llvm/ADT/Twine.h"
#include "llvm/CodeGen/AsmPrinter.h"
#include "llvm/CodeGen/MachineFrameInfo.h"
#include "llvm/CodeGen/MachineFunction.h"
#include "llvm/CodeGen/MachineInstr.h"
+#include "llvm/CodeGen/MachineOperand.h"
+#include "llvm/CodeGen/StackMaps.h"
#include "llvm/IR/DataLayout.h"
#include "llvm/MC/MCContext.h"
#include "llvm/MC/MCExpr.h"
#include "llvm/MC/MCObjectFileInfo.h"
-#include "llvm/MC/MCSectionMachO.h"
+#include "llvm/MC/MCRegisterInfo.h"
#include "llvm/MC/MCStreamer.h"
#include "llvm/Support/CommandLine.h"
-#include "llvm/Target/TargetMachine.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/MathExtras.h"
+#include "llvm/Support/raw_ostream.h"
#include "llvm/Target/TargetOpcodes.h"
#include "llvm/Target/TargetRegisterInfo.h"
#include "llvm/Target/TargetSubtargetInfo.h"
+#include <algorithm>
+#include <cassert>
+#include <cstdint>
#include <iterator>
+#include <utility>
using namespace llvm;
@@ -276,7 +287,8 @@ StackMaps::parseRegisterLiveOutMask(const uint32_t *Mask) const {
}
LiveOuts.erase(
- remove_if(LiveOuts, [](const LiveOutReg &LO) { return LO.Reg == 0; }),
+ llvm::remove_if(LiveOuts,
+ [](const LiveOutReg &LO) { return LO.Reg == 0; }),
LiveOuts.end());
return LiveOuts;
@@ -286,7 +298,6 @@ void StackMaps::recordStackMapOpers(const MachineInstr &MI, uint64_t ID,
MachineInstr::const_mop_iterator MOI,
MachineInstr::const_mop_iterator MOE,
bool recordResult) {
-
MCContext &OutContext = AP.OutStreamer->getContext();
MCSymbol *MILabel = OutContext.createTempSymbol();
AP.OutStreamer->EmitLabel(MILabel);
@@ -378,6 +389,7 @@ void StackMaps::recordPatchPoint(const MachineInstr &MI) {
}
#endif
}
+
void StackMaps::recordStatepoint(const MachineInstr &MI) {
assert(MI.getOpcode() == TargetOpcode::STATEPOINT && "expected statepoint");
diff --git a/lib/CodeGen/StackProtector.cpp b/lib/CodeGen/StackProtector.cpp
index c2c010a29d44..a8aafe78748d 100644
--- a/lib/CodeGen/StackProtector.cpp
+++ b/lib/CodeGen/StackProtector.cpp
@@ -1,4 +1,4 @@
-//===-- StackProtector.cpp - Stack Protector Insertion --------------------===//
+//===- StackProtector.cpp - Stack Protector Insertion ---------------------===//
//
// The LLVM Compiler Infrastructure
//
@@ -14,30 +14,38 @@
//
//===----------------------------------------------------------------------===//
-#include "llvm/CodeGen/StackProtector.h"
#include "llvm/ADT/SmallPtrSet.h"
#include "llvm/ADT/Statistic.h"
#include "llvm/Analysis/BranchProbabilityInfo.h"
#include "llvm/Analysis/EHPersonalities.h"
-#include "llvm/Analysis/ValueTracking.h"
+#include "llvm/Analysis/OptimizationDiagnosticInfo.h"
#include "llvm/CodeGen/Passes.h"
+#include "llvm/CodeGen/StackProtector.h"
#include "llvm/IR/Attributes.h"
+#include "llvm/IR/BasicBlock.h"
#include "llvm/IR/Constants.h"
#include "llvm/IR/DataLayout.h"
#include "llvm/IR/DebugInfo.h"
+#include "llvm/IR/DebugLoc.h"
#include "llvm/IR/DerivedTypes.h"
#include "llvm/IR/Function.h"
-#include "llvm/IR/GlobalValue.h"
-#include "llvm/IR/GlobalVariable.h"
#include "llvm/IR/IRBuilder.h"
+#include "llvm/IR/Instruction.h"
#include "llvm/IR/Instructions.h"
-#include "llvm/IR/IntrinsicInst.h"
#include "llvm/IR/Intrinsics.h"
#include "llvm/IR/MDBuilder.h"
#include "llvm/IR/Module.h"
+#include "llvm/IR/Type.h"
+#include "llvm/IR/User.h"
+#include "llvm/Pass.h"
+#include "llvm/Support/Casting.h"
#include "llvm/Support/CommandLine.h"
+#include "llvm/Target/TargetLowering.h"
+#include "llvm/Target/TargetMachine.h"
+#include "llvm/Target/TargetOptions.h"
#include "llvm/Target/TargetSubtargetInfo.h"
-#include <cstdlib>
+#include <utility>
+
using namespace llvm;
#define DEBUG_TYPE "stack-protector"
@@ -51,7 +59,7 @@ static cl::opt<bool> EnableSelectionDAGSP("enable-selectiondag-sp",
char StackProtector::ID = 0;
INITIALIZE_TM_PASS(StackProtector, "stack-protector", "Insert stack protectors",
- false, true)
+ false, true)
FunctionPass *llvm::createStackProtectorPass(const TargetMachine *TM) {
return new StackProtector(TM);
@@ -222,7 +230,16 @@ bool StackProtector::RequiresStackProtector() {
if (F->hasFnAttribute(Attribute::SafeStack))
return false;
+ // We are constructing the OptimizationRemarkEmitter on the fly rather than
+ // using the analysis pass to avoid building DominatorTree and LoopInfo,
+ // which are not available this late in the IR pipeline.
+ OptimizationRemarkEmitter ORE(F);
+
if (F->hasFnAttribute(Attribute::StackProtectReq)) {
+ ORE.emit(OptimizationRemark(DEBUG_TYPE, "StackProtectorRequested", F)
+ << "Stack protection applied to function "
+ << ore::NV("Function", F)
+ << " due to a function attribute or command-line switch");
NeedsProtector = true;
Strong = true; // Use the same heuristic as strong to determine SSPLayout
} else if (F->hasFnAttribute(Attribute::StackProtectStrong))
@@ -236,20 +253,29 @@ bool StackProtector::RequiresStackProtector() {
for (const Instruction &I : BB) {
if (const AllocaInst *AI = dyn_cast<AllocaInst>(&I)) {
if (AI->isArrayAllocation()) {
+ OptimizationRemark Remark(DEBUG_TYPE, "StackProtectorAllocaOrArray",
+ &I);
+ Remark
+ << "Stack protection applied to function "
+ << ore::NV("Function", F)
+ << " due to a call to alloca or use of a variable length array";
if (const auto *CI = dyn_cast<ConstantInt>(AI->getArraySize())) {
if (CI->getLimitedValue(SSPBufferSize) >= SSPBufferSize) {
// A call to alloca with size >= SSPBufferSize requires
// stack protectors.
Layout.insert(std::make_pair(AI, SSPLK_LargeArray));
+ ORE.emit(Remark);
NeedsProtector = true;
} else if (Strong) {
// Require protectors for all alloca calls in strong mode.
Layout.insert(std::make_pair(AI, SSPLK_SmallArray));
+ ORE.emit(Remark);
NeedsProtector = true;
}
} else {
// A call to alloca with a variable size requires protectors.
Layout.insert(std::make_pair(AI, SSPLK_LargeArray));
+ ORE.emit(Remark);
NeedsProtector = true;
}
continue;
@@ -259,6 +285,11 @@ bool StackProtector::RequiresStackProtector() {
if (ContainsProtectableArray(AI->getAllocatedType(), IsLarge, Strong)) {
Layout.insert(std::make_pair(AI, IsLarge ? SSPLK_LargeArray
: SSPLK_SmallArray));
+ ORE.emit(OptimizationRemark(DEBUG_TYPE, "StackProtectorBuffer", &I)
+ << "Stack protection applied to function "
+ << ore::NV("Function", F)
+ << " due to a stack allocated buffer or struct containing a "
+ "buffer");
NeedsProtector = true;
continue;
}
@@ -266,6 +297,11 @@ bool StackProtector::RequiresStackProtector() {
if (Strong && HasAddressTaken(AI)) {
++NumAddrTaken;
Layout.insert(std::make_pair(AI, SSPLK_AddrOf));
+ ORE.emit(
+ OptimizationRemark(DEBUG_TYPE, "StackProtectorAddressTaken", &I)
+ << "Stack protection applied to function "
+ << ore::NV("Function", F)
+ << " due to the address of a local variable being taken");
NeedsProtector = true;
}
}
@@ -448,13 +484,13 @@ BasicBlock *StackProtector::CreateFailBB() {
Constant *StackChkFail =
M->getOrInsertFunction("__stack_smash_handler",
Type::getVoidTy(Context),
- Type::getInt8PtrTy(Context), nullptr);
+ Type::getInt8PtrTy(Context));
B.CreateCall(StackChkFail, B.CreateGlobalStringPtr(F->getName(), "SSH"));
} else {
Constant *StackChkFail =
- M->getOrInsertFunction("__stack_chk_fail", Type::getVoidTy(Context),
- nullptr);
+ M->getOrInsertFunction("__stack_chk_fail", Type::getVoidTy(Context));
+
B.CreateCall(StackChkFail, {});
}
B.CreateUnreachable();
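The remark-emission pattern above can be reused by any pass that has a Function in hand; a minimal sketch using only the API the hunks themselves show (the pass name and remark name below are illustrative placeholders):

    // Build an OptimizationRemarkEmitter directly, without the analysis pass,
    // and stream a named value into the remark.
    OptimizationRemarkEmitter ORE(F);
    ORE.emit(OptimizationRemark("my-pass", "ExampleRemark", F)
             << "decision made for " << ore::NV("Function", F));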
diff --git a/lib/CodeGen/TailDuplicator.cpp b/lib/CodeGen/TailDuplicator.cpp
index 7709236bbaa8..d2414200e9d5 100644
--- a/lib/CodeGen/TailDuplicator.cpp
+++ b/lib/CodeGen/TailDuplicator.cpp
@@ -725,6 +725,7 @@ bool TailDuplicator::duplicateSimpleBB(
if (PredTBB == NextBB && PredFBB == nullptr)
PredTBB = nullptr;
+ auto DL = PredBB->findBranchDebugLoc();
TII->removeBranch(*PredBB);
if (!PredBB->isSuccessor(NewTarget))
@@ -735,7 +736,7 @@ bool TailDuplicator::duplicateSimpleBB(
}
if (PredTBB)
- TII->insertBranch(*PredBB, PredTBB, PredFBB, PredCond, DebugLoc());
+ TII->insertBranch(*PredBB, PredTBB, PredFBB, PredCond, DL);
TDBBs.push_back(PredBB);
}
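The point of this change is to keep the branch's source location alive across removal and re-insertion; a minimal sketch of the pattern, restating only the calls visible in the hunk:

    // Capture the debug location before the branch is deleted, then reuse it
    // when the replacement branch is inserted.
    DebugLoc DL = PredBB->findBranchDebugLoc();
    TII->removeBranch(*PredBB);
    TII->insertBranch(*PredBB, PredTBB, PredFBB, PredCond, DL);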
diff --git a/lib/CodeGen/TargetFrameLoweringImpl.cpp b/lib/CodeGen/TargetFrameLoweringImpl.cpp
index f082add8c7dd..e5def6752e07 100644
--- a/lib/CodeGen/TargetFrameLoweringImpl.cpp
+++ b/lib/CodeGen/TargetFrameLoweringImpl.cpp
@@ -73,7 +73,7 @@ void TargetFrameLowering::determineCalleeSaves(MachineFunction &MF,
return;
// Get the callee saved register list...
- const MCPhysReg *CSRegs = TRI.getCalleeSavedRegs(&MF);
+ const MCPhysReg *CSRegs = MF.getRegInfo().getCalleeSavedRegs();
// Early exit if there are no callee saved registers.
if (!CSRegs || CSRegs[0] == 0)
diff --git a/lib/CodeGen/TargetInstrInfo.cpp b/lib/CodeGen/TargetInstrInfo.cpp
index 01f91b96b58a..711144a34743 100644
--- a/lib/CodeGen/TargetInstrInfo.cpp
+++ b/lib/CodeGen/TargetInstrInfo.cpp
@@ -470,7 +470,7 @@ static MachineInstr *foldPatchpoint(MachineFunction &MF, MachineInstr &MI,
// No need to fold return, the meta data, and function arguments
for (unsigned i = 0; i < StartIdx; ++i)
- MIB.addOperand(MI.getOperand(i));
+ MIB.add(MI.getOperand(i));
for (unsigned i = StartIdx; i < MI.getNumOperands(); ++i) {
MachineOperand &MO = MI.getOperand(i);
@@ -490,7 +490,7 @@ static MachineInstr *foldPatchpoint(MachineFunction &MF, MachineInstr &MI,
MIB.addImm(SpillOffset);
}
else
- MIB.addOperand(MO);
+ MIB.add(MO);
}
return NewMI;
}
@@ -941,12 +941,10 @@ int TargetInstrInfo::getSPAdjust(const MachineInstr &MI) const {
unsigned FrameSetupOpcode = getCallFrameSetupOpcode();
unsigned FrameDestroyOpcode = getCallFrameDestroyOpcode();
- if (MI.getOpcode() != FrameSetupOpcode &&
- MI.getOpcode() != FrameDestroyOpcode)
+ if (!isFrameInstr(MI))
return 0;
- int SPAdj = MI.getOperand(0).getImm();
- SPAdj = TFI->alignSPAdjust(SPAdj);
+ int SPAdj = TFI->alignSPAdjust(getFrameSize(MI));
if ((!StackGrowsDown && MI.getOpcode() == FrameSetupOpcode) ||
(StackGrowsDown && MI.getOpcode() == FrameDestroyOpcode))
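With this rewrite, callers no longer compare opcodes against the call-frame pseudos by hand; a minimal sketch of the intended query idiom, assuming a MachineInstr MI and the isFrameInstr/getSPAdjust helpers used above:

    // Query the stack-pointer adjustment of a call-frame pseudo instruction.
    if (TII->isFrameInstr(MI)) {          // call-frame setup/destroy pseudo
      int SPAdj = TII->getSPAdjust(MI);   // frame size, aligned by the target
      (void)SPAdj;
    }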
diff --git a/lib/CodeGen/TargetLoweringBase.cpp b/lib/CodeGen/TargetLoweringBase.cpp
index 003311b157fc..27630a3055cb 100644
--- a/lib/CodeGen/TargetLoweringBase.cpp
+++ b/lib/CodeGen/TargetLoweringBase.cpp
@@ -838,7 +838,6 @@ TargetLoweringBase::TargetLoweringBase(const TargetMachine &tm) : TM(tm) {
HasExtractBitsInsn = false;
JumpIsExpensive = JumpIsExpensiveOverride;
PredictableSelectIsExpensive = false;
- MaskAndBranchFoldingIsLegal = false;
EnableExtLdPromotion = false;
HasFloatingPointExceptions = true;
StackPointerRegisterToSaveRestore = 0;
@@ -851,7 +850,7 @@ TargetLoweringBase::TargetLoweringBase(const TargetMachine &tm) : TM(tm) {
MinFunctionAlignment = 0;
PrefFunctionAlignment = 0;
PrefLoopAlignment = 0;
- GatherAllAliasesMaxDepth = 6;
+ GatherAllAliasesMaxDepth = 18;
MinStackArgumentAlignment = 1;
// TODO: the default will be switched to 0 in the next commit, along
// with the Target-specific changes necessary.
@@ -901,6 +900,7 @@ void TargetLoweringBase::initActions() {
setOperationAction(ISD::SMAX, VT, Expand);
setOperationAction(ISD::UMIN, VT, Expand);
setOperationAction(ISD::UMAX, VT, Expand);
+ setOperationAction(ISD::ABS, VT, Expand);
// Overflow operations default to expand
setOperationAction(ISD::SADDO, VT, Expand);
@@ -1227,7 +1227,7 @@ TargetLoweringBase::emitPatchPoint(MachineInstr &InitialMI,
// Copy operands before the frame-index.
for (unsigned i = 0; i < OperIdx; ++i)
- MIB.addOperand(MI->getOperand(i));
+ MIB.add(MI->getOperand(i));
// Add frame index operands recognized by stackmaps.cpp
if (MFI.isStatepointSpillSlotObjectIndex(FI)) {
// indirect-mem-ref tag, size, #FI, offset.
@@ -1237,18 +1237,18 @@ TargetLoweringBase::emitPatchPoint(MachineInstr &InitialMI,
assert(MI->getOpcode() == TargetOpcode::STATEPOINT && "sanity");
MIB.addImm(StackMaps::IndirectMemRefOp);
MIB.addImm(MFI.getObjectSize(FI));
- MIB.addOperand(MI->getOperand(OperIdx));
+ MIB.add(MI->getOperand(OperIdx));
MIB.addImm(0);
} else {
// direct-mem-ref tag, #FI, offset.
// Used by patchpoint, and direct alloca arguments to statepoints
MIB.addImm(StackMaps::DirectMemRefOp);
- MIB.addOperand(MI->getOperand(OperIdx));
+ MIB.add(MI->getOperand(OperIdx));
MIB.addImm(0);
}
// Copy the operands after the frame index.
for (unsigned i = OperIdx + 1; i != MI->getNumOperands(); ++i)
- MIB.addOperand(MI->getOperand(i));
+ MIB.add(MI->getOperand(i));
// Inherit previous memory operands.
MIB->setMemRefs(MI->memoperands_begin(), MI->memoperands_end());
@@ -1589,7 +1589,7 @@ unsigned TargetLoweringBase::getVectorTypeBreakdown(LLVMContext &Context, EVT VT
/// type of the given function. This does not require a DAG or a return value,
/// and is suitable for use before any DAGs for the function are constructed.
/// TODO: Move this out of TargetLowering.cpp.
-void llvm::GetReturnInfo(Type *ReturnType, AttributeSet attr,
+void llvm::GetReturnInfo(Type *ReturnType, AttributeList attr,
SmallVectorImpl<ISD::OutputArg> &Outs,
const TargetLowering &TLI, const DataLayout &DL) {
SmallVector<EVT, 4> ValueVTs;
@@ -1601,9 +1601,9 @@ void llvm::GetReturnInfo(Type *ReturnType, AttributeSet attr,
EVT VT = ValueVTs[j];
ISD::NodeType ExtendKind = ISD::ANY_EXTEND;
- if (attr.hasAttribute(AttributeSet::ReturnIndex, Attribute::SExt))
+ if (attr.hasAttribute(AttributeList::ReturnIndex, Attribute::SExt))
ExtendKind = ISD::SIGN_EXTEND;
- else if (attr.hasAttribute(AttributeSet::ReturnIndex, Attribute::ZExt))
+ else if (attr.hasAttribute(AttributeList::ReturnIndex, Attribute::ZExt))
ExtendKind = ISD::ZERO_EXTEND;
// FIXME: C calling convention requires the return type to be promoted to
@@ -1621,13 +1621,13 @@ void llvm::GetReturnInfo(Type *ReturnType, AttributeSet attr,
// 'inreg' on function refers to return value
ISD::ArgFlagsTy Flags = ISD::ArgFlagsTy();
- if (attr.hasAttribute(AttributeSet::ReturnIndex, Attribute::InReg))
+ if (attr.hasAttribute(AttributeList::ReturnIndex, Attribute::InReg))
Flags.setInReg();
// Propagate extension type if any
- if (attr.hasAttribute(AttributeSet::ReturnIndex, Attribute::SExt))
+ if (attr.hasAttribute(AttributeList::ReturnIndex, Attribute::SExt))
Flags.setSExt();
- else if (attr.hasAttribute(AttributeSet::ReturnIndex, Attribute::ZExt))
+ else if (attr.hasAttribute(AttributeList::ReturnIndex, Attribute::ZExt))
Flags.setZExt();
for (unsigned i = 0; i < NumParts; ++i)
@@ -1818,7 +1818,7 @@ Value *TargetLoweringBase::getSafeStackPointerLocation(IRBuilder<> &IRB) const {
Module *M = IRB.GetInsertBlock()->getParent()->getParent();
Type *StackPtrTy = Type::getInt8PtrTy(M->getContext());
Value *Fn = M->getOrInsertFunction("__safestack_pointer_address",
- StackPtrTy->getPointerTo(0), nullptr);
+ StackPtrTy->getPointerTo(0));
return IRB.CreateCall(Fn);
}
@@ -1918,11 +1918,7 @@ void TargetLoweringBase::setMaximumJumpTableSize(unsigned Val) {
/// override the target defaults.
static StringRef getRecipEstimateForFunc(MachineFunction &MF) {
const Function *F = MF.getFunction();
- StringRef RecipAttrName = "reciprocal-estimates";
- if (!F->hasFnAttribute(RecipAttrName))
- return StringRef();
-
- return F->getFnAttribute(RecipAttrName).getValueAsString();
+ return F->getFnAttribute("reciprocal-estimates").getValueAsString();
}
/// Construct a string for the given reciprocal operation of the given type.
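The simplification works because Function::getFnAttribute returns an empty Attribute when the key is absent, so getValueAsString() yields an empty StringRef and the explicit hasFnAttribute guard was redundant. A minimal sketch:

    // An absent string attribute reads back as "", so a single call suffices.
    StringRef Recips =
        F->getFnAttribute("reciprocal-estimates").getValueAsString();
    if (Recips.empty()) {
      // No per-function override; fall back to the target defaults.
    }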
diff --git a/lib/CodeGen/TargetLoweringObjectFileImpl.cpp b/lib/CodeGen/TargetLoweringObjectFileImpl.cpp
index eb2a28f574a5..34892680aceb 100644
--- a/lib/CodeGen/TargetLoweringObjectFileImpl.cpp
+++ b/lib/CodeGen/TargetLoweringObjectFileImpl.cpp
@@ -1,4 +1,4 @@
-//===-- llvm/CodeGen/TargetLoweringObjectFileImpl.cpp - Object File Info --===//
+//===- llvm/CodeGen/TargetLoweringObjectFileImpl.cpp - Object File Info ---===//
//
// The LLVM Compiler Infrastructure
//
@@ -12,36 +12,52 @@
//
//===----------------------------------------------------------------------===//
-#include "llvm/CodeGen/TargetLoweringObjectFileImpl.h"
#include "llvm/ADT/SmallString.h"
+#include "llvm/ADT/SmallVector.h"
#include "llvm/ADT/StringExtras.h"
+#include "llvm/ADT/StringRef.h"
#include "llvm/ADT/Triple.h"
+#include "llvm/CodeGen/MachineModuleInfo.h"
#include "llvm/CodeGen/MachineModuleInfoImpls.h"
+#include "llvm/CodeGen/TargetLoweringObjectFileImpl.h"
+#include "llvm/IR/Comdat.h"
#include "llvm/IR/Constants.h"
#include "llvm/IR/DataLayout.h"
#include "llvm/IR/DerivedTypes.h"
#include "llvm/IR/Function.h"
+#include "llvm/IR/GlobalAlias.h"
+#include "llvm/IR/GlobalObject.h"
+#include "llvm/IR/GlobalValue.h"
#include "llvm/IR/GlobalVariable.h"
#include "llvm/IR/Mangler.h"
+#include "llvm/IR/Metadata.h"
#include "llvm/IR/Module.h"
+#include "llvm/IR/Type.h"
#include "llvm/MC/MCAsmInfo.h"
#include "llvm/MC/MCContext.h"
#include "llvm/MC/MCExpr.h"
#include "llvm/MC/MCSectionCOFF.h"
#include "llvm/MC/MCSectionELF.h"
#include "llvm/MC/MCSectionMachO.h"
+#include "llvm/MC/MCSectionWasm.h"
#include "llvm/MC/MCStreamer.h"
+#include "llvm/MC/MCSymbol.h"
#include "llvm/MC/MCSymbolELF.h"
#include "llvm/MC/MCValue.h"
+#include "llvm/MC/SectionKind.h"
#include "llvm/ProfileData/InstrProf.h"
+#include "llvm/Support/Casting.h"
+#include "llvm/Support/CodeGen.h"
#include "llvm/Support/COFF.h"
#include "llvm/Support/Dwarf.h"
#include "llvm/Support/ELF.h"
#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/MachO.h"
#include "llvm/Support/raw_ostream.h"
-#include "llvm/Target/TargetLowering.h"
#include "llvm/Target/TargetMachine.h"
-#include "llvm/Target/TargetSubtargetInfo.h"
+#include <cassert>
+#include <string>
+
using namespace llvm;
using namespace dwarf;
@@ -53,10 +69,10 @@ MCSymbol *TargetLoweringObjectFileELF::getCFIPersonalitySymbol(
const GlobalValue *GV, const TargetMachine &TM,
MachineModuleInfo *MMI) const {
unsigned Encoding = getPersonalityEncoding();
- if ((Encoding & 0x80) == dwarf::DW_EH_PE_indirect)
+ if ((Encoding & 0x80) == DW_EH_PE_indirect)
return getContext().getOrCreateSymbol(StringRef("DW.ref.") +
TM.getSymbol(GV)->getName());
- if ((Encoding & 0x70) == dwarf::DW_EH_PE_absptr)
+ if ((Encoding & 0x70) == DW_EH_PE_absptr)
return TM.getSymbol(GV);
report_fatal_error("We do not support this DWARF encoding yet!");
}
@@ -86,8 +102,7 @@ void TargetLoweringObjectFileELF::emitPersonalityValue(
const MCExpr *TargetLoweringObjectFileELF::getTTypeGlobalReference(
const GlobalValue *GV, unsigned Encoding, const TargetMachine &TM,
MachineModuleInfo *MMI, MCStreamer &Streamer) const {
-
- if (Encoding & dwarf::DW_EH_PE_indirect) {
+ if (Encoding & DW_EH_PE_indirect) {
MachineModuleInfoELF &ELFMMI = MMI->getObjFileInfo<MachineModuleInfoELF>();
MCSymbol *SSym = getSymbolWithGlobalValueBase(GV, ".DW.stub", TM);
@@ -102,7 +117,7 @@ const MCExpr *TargetLoweringObjectFileELF::getTTypeGlobalReference(
return TargetLoweringObjectFile::
getTTypeReference(MCSymbolRefExpr::create(SSym, getContext()),
- Encoding & ~dwarf::DW_EH_PE_indirect, Streamer);
+ Encoding & ~DW_EH_PE_indirect, Streamer);
}
return TargetLoweringObjectFile::getTTypeGlobalReference(GV, Encoding, TM,
@@ -117,8 +132,9 @@ getELFKindForNamedSection(StringRef Name, SectionKind K) {
// section(".eh_frame") gcc will produce:
//
// .section .eh_frame,"a",@progbits
-
- if (Name == getInstrProfCoverageSectionName(false))
+
+ if (Name == getInstrProfSectionName(IPSK_covmap, Triple::ELF,
+ /*AddSegmentInfo=*/false))
return SectionKind::getMetadata();
if (Name.empty() || Name[0] != '.') return K;
@@ -149,7 +165,6 @@ getELFKindForNamedSection(StringRef Name, SectionKind K) {
return K;
}
-
static unsigned getELFSectionType(StringRef Name, SectionKind K) {
// Use SHT_NOTE for section whose name starts with ".note" to allow
// emitting ELF notes from C variable declaration.
@@ -211,6 +226,20 @@ static const Comdat *getELFComdat(const GlobalValue *GV) {
return C;
}
+static const MCSymbolELF *getAssociatedSymbol(const GlobalObject *GO,
+ const TargetMachine &TM) {
+ MDNode *MD = GO->getMetadata(LLVMContext::MD_associated);
+ if (!MD)
+ return nullptr;
+
+ auto *VM = dyn_cast<ValueAsMetadata>(MD->getOperand(0));
+ if (!VM)
+ report_fatal_error("MD_associated operand is not ValueAsMetadata");
+
+ GlobalObject *OtherGO = dyn_cast<GlobalObject>(VM->getValue());
+ return OtherGO ? dyn_cast<MCSymbolELF>(TM.getSymbol(OtherGO)) : nullptr;
+}
+
MCSection *TargetLoweringObjectFileELF::getExplicitSectionGlobal(
const GlobalObject *GO, SectionKind Kind, const TargetMachine &TM) const {
StringRef SectionName = GO->getSection();
@@ -224,9 +253,23 @@ MCSection *TargetLoweringObjectFileELF::getExplicitSectionGlobal(
Group = C->getName();
Flags |= ELF::SHF_GROUP;
}
- return getContext().getELFSection(SectionName,
- getELFSectionType(SectionName, Kind), Flags,
- /*EntrySize=*/0, Group);
+
+ // A section can have at most one associated section. Put each global with
+ // MD_associated in a unique section.
+ unsigned UniqueID = MCContext::GenericSectionID;
+ const MCSymbolELF *AssociatedSymbol = getAssociatedSymbol(GO, TM);
+ if (AssociatedSymbol) {
+ UniqueID = NextUniqueID++;
+ Flags |= ELF::SHF_LINK_ORDER;
+ }
+
+ MCSectionELF *Section = getContext().getELFSection(
+ SectionName, getELFSectionType(SectionName, Kind), Flags,
+ /*EntrySize=*/0, Group, UniqueID, AssociatedSymbol);
+ // Make sure that we did not get some other section with incompatible sh_link.
+ // This should not be possible due to the UniqueID code above.
+ assert(Section->getAssociatedSymbol() == AssociatedSymbol);
+ return Section;
}
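For a global to reach this path it must carry !associated metadata whose single operand wraps another GlobalObject; a hedged sketch of IR-building code that sets this up (variable names are illustrative):

    // Tie GV to Target so GV is emitted into a unique SHF_LINK_ORDER section
    // whose sh_link refers to Target's section.
    LLVMContext &Ctx = M.getContext();
    Metadata *Ops[] = {ValueAsMetadata::get(Target)}; // Target: GlobalObject*
    GV->setMetadata(LLVMContext::MD_associated, MDNode::get(Ctx, Ops));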
/// Return the section prefix name used by options FunctionsSections and
@@ -248,11 +291,10 @@ static StringRef getSectionPrefixForGlobal(SectionKind Kind) {
return ".data.rel.ro";
}
-static MCSectionELF *
-selectELFSectionForGlobal(MCContext &Ctx, const GlobalObject *GO,
- SectionKind Kind, Mangler &Mang,
- const TargetMachine &TM, bool EmitUniqueSection,
- unsigned Flags, unsigned *NextUniqueID) {
+static MCSectionELF *selectELFSectionForGlobal(
+ MCContext &Ctx, const GlobalObject *GO, SectionKind Kind, Mangler &Mang,
+ const TargetMachine &TM, bool EmitUniqueSection, unsigned Flags,
+ unsigned *NextUniqueID, const MCSymbolELF *AssociatedSymbol) {
unsigned EntrySize = 0;
if (Kind.isMergeableCString()) {
if (Kind.isMergeable2ByteCString()) {
@@ -319,7 +361,7 @@ selectELFSectionForGlobal(MCContext &Ctx, const GlobalObject *GO,
if (Kind.isExecuteOnly())
UniqueID = 0;
return Ctx.getELFSection(Name, getELFSectionType(Name, Kind), Flags,
- EntrySize, Group, UniqueID);
+ EntrySize, Group, UniqueID, AssociatedSymbol);
}
MCSection *TargetLoweringObjectFileELF::SelectSectionForGlobal(
@@ -337,8 +379,17 @@ MCSection *TargetLoweringObjectFileELF::SelectSectionForGlobal(
}
EmitUniqueSection |= GO->hasComdat();
- return selectELFSectionForGlobal(getContext(), GO, Kind, getMangler(), TM,
- EmitUniqueSection, Flags, &NextUniqueID);
+ const MCSymbolELF *AssociatedSymbol = getAssociatedSymbol(GO, TM);
+ if (AssociatedSymbol) {
+ EmitUniqueSection = true;
+ Flags |= ELF::SHF_LINK_ORDER;
+ }
+
+ MCSectionELF *Section = selectELFSectionForGlobal(
+ getContext(), GO, Kind, getMangler(), TM, EmitUniqueSection, Flags,
+ &NextUniqueID, AssociatedSymbol);
+ assert(Section->getAssociatedSymbol() == AssociatedSymbol);
+ return Section;
}
MCSection *TargetLoweringObjectFileELF::getSectionForJumpTable(
@@ -351,8 +402,9 @@ MCSection *TargetLoweringObjectFileELF::getSectionForJumpTable(
return ReadOnlySection;
return selectELFSectionForGlobal(getContext(), &F, SectionKind::getReadOnly(),
- getMangler(), TM, EmitUniqueSection, ELF::SHF_ALLOC,
- &NextUniqueID);
+ getMangler(), TM, EmitUniqueSection,
+ ELF::SHF_ALLOC, &NextUniqueID,
+ /* AssociatedSymbol */ nullptr);
}
bool TargetLoweringObjectFileELF::shouldPutJumpTableInFunctionSection(
@@ -723,7 +775,7 @@ const MCExpr *TargetLoweringObjectFileMachO::getTTypeGlobalReference(
return TargetLoweringObjectFile::
getTTypeReference(MCSymbolRefExpr::create(SSym, getContext()),
- Encoding & ~dwarf::DW_EH_PE_indirect, Streamer);
+ Encoding & ~DW_EH_PE_indirect, Streamer);
}
return TargetLoweringObjectFile::getTTypeGlobalReference(GV, Encoding, TM,
@@ -1122,33 +1174,110 @@ MCSection *TargetLoweringObjectFileCOFF::getStaticDtorSection(
void TargetLoweringObjectFileCOFF::emitLinkerFlagsForGlobal(
raw_ostream &OS, const GlobalValue *GV) const {
- if (!GV->hasDLLExportStorageClass() || GV->isDeclaration())
- return;
+ emitLinkerFlagsForGlobalCOFF(OS, GV, getTargetTriple(), getMangler());
+}
- const Triple &TT = getTargetTriple();
+//===----------------------------------------------------------------------===//
+// Wasm
+//===----------------------------------------------------------------------===//
- if (TT.isKnownWindowsMSVCEnvironment())
- OS << " /EXPORT:";
- else
- OS << " -export:";
-
- if (TT.isWindowsGNUEnvironment() || TT.isWindowsCygwinEnvironment()) {
- std::string Flag;
- raw_string_ostream FlagOS(Flag);
- getMangler().getNameWithPrefix(FlagOS, GV, false);
- FlagOS.flush();
- if (Flag[0] == GV->getParent()->getDataLayout().getGlobalPrefix())
- OS << Flag.substr(1);
- else
- OS << Flag;
- } else {
- getMangler().getNameWithPrefix(OS, GV, false);
+static const Comdat *getWasmComdat(const GlobalValue *GV) {
+ const Comdat *C = GV->getComdat();
+ if (!C)
+ return nullptr;
+
+ if (C->getSelectionKind() != Comdat::Any)
+ report_fatal_error("Wasm COMDATs only support SelectionKind::Any, '" +
+ C->getName() + "' cannot be lowered.");
+
+ return C;
+}
+
+MCSection *TargetLoweringObjectFileWasm::getExplicitSectionGlobal(
+ const GlobalObject *GO, SectionKind Kind, const TargetMachine &TM) const {
+ llvm_unreachable("getExplicitSectionGlobal not yet implemented");
+ return nullptr;
+}
+
+static MCSectionWasm *
+selectWasmSectionForGlobal(MCContext &Ctx, const GlobalObject *GO,
+ SectionKind Kind, Mangler &Mang,
+ const TargetMachine &TM, bool EmitUniqueSection,
+ unsigned Flags, unsigned *NextUniqueID) {
+ StringRef Group = "";
+ if (getWasmComdat(GO))
+ llvm_unreachable("comdat not yet supported for wasm");
+
+ bool UniqueSectionNames = TM.getUniqueSectionNames();
+ SmallString<128> Name = getSectionPrefixForGlobal(Kind);
+
+ if (const auto *F = dyn_cast<Function>(GO)) {
+ const auto &OptionalPrefix = F->getSectionPrefix();
+ if (OptionalPrefix)
+ Name += *OptionalPrefix;
}
- if (!GV->getValueType()->isFunctionTy()) {
- if (TT.isKnownWindowsMSVCEnvironment())
- OS << ",DATA";
- else
- OS << ",data";
+ if (EmitUniqueSection && UniqueSectionNames) {
+ Name.push_back('.');
+ TM.getNameWithPrefix(Name, GO, Mang, true);
+ }
+ unsigned UniqueID = MCContext::GenericSectionID;
+ if (EmitUniqueSection && !UniqueSectionNames) {
+ UniqueID = *NextUniqueID;
+ (*NextUniqueID)++;
}
+ return Ctx.getWasmSection(Name, /*Type=*/0, Flags,
+ Group, UniqueID);
+}
+
+MCSection *TargetLoweringObjectFileWasm::SelectSectionForGlobal(
+ const GlobalObject *GO, SectionKind Kind, const TargetMachine &TM) const {
+
+ if (Kind.isCommon())
+ report_fatal_error("mergeable sections not supported yet on wasm");
+
+ // If we have -ffunction-section or -fdata-section then we should emit the
+ // global value to a uniqued section specifically for it.
+ bool EmitUniqueSection = false;
+ if (Kind.isText())
+ EmitUniqueSection = TM.getFunctionSections();
+ else
+ EmitUniqueSection = TM.getDataSections();
+ EmitUniqueSection |= GO->hasComdat();
+
+ return selectWasmSectionForGlobal(getContext(), GO, Kind, getMangler(), TM,
+ EmitUniqueSection, /*Flags=*/0,
+ &NextUniqueID);
+}
+
+bool TargetLoweringObjectFileWasm::shouldPutJumpTableInFunctionSection(
+ bool UsesLabelDifference, const Function &F) const {
+ // We can always create relative relocations, so use another section
+ // that can be marked non-executable.
+ return false;
+}
+
+const MCExpr *TargetLoweringObjectFileWasm::lowerRelativeReference(
+ const GlobalValue *LHS, const GlobalValue *RHS,
+ const TargetMachine &TM) const {
+ // We may only use a PLT-relative relocation to refer to unnamed_addr
+ // functions.
+ if (!LHS->hasGlobalUnnamedAddr() || !LHS->getValueType()->isFunctionTy())
+ return nullptr;
+
+ // Basic sanity checks.
+ if (LHS->getType()->getPointerAddressSpace() != 0 ||
+ RHS->getType()->getPointerAddressSpace() != 0 || LHS->isThreadLocal() ||
+ RHS->isThreadLocal())
+ return nullptr;
+
+ return MCBinaryExpr::createSub(
+ MCSymbolRefExpr::create(TM.getSymbol(LHS), MCSymbolRefExpr::VK_None,
+ getContext()),
+ MCSymbolRefExpr::create(TM.getSymbol(RHS), getContext()), getContext());
+}
+
+void
+TargetLoweringObjectFileWasm::InitializeWasm() {
+ // TODO: Initialize StaticCtorSection and StaticDtorSection.
}
diff --git a/lib/CodeGen/TargetOptionsImpl.cpp b/lib/CodeGen/TargetOptionsImpl.cpp
index b6da8e0aa60d..c20d5ab814f8 100644
--- a/lib/CodeGen/TargetOptionsImpl.cpp
+++ b/lib/CodeGen/TargetOptionsImpl.cpp
@@ -34,14 +34,6 @@ bool TargetOptions::DisableFramePointerElim(const MachineFunction &MF) const {
return false;
}
-/// LessPreciseFPMAD - This flag return true when -enable-fp-mad option
-/// is specified on the command line. When this flag is off(default), the
-/// code generator is not allowed to generate mad (multiply add) if the
-/// result is "less precise" than doing those operations individually.
-bool TargetOptions::LessPreciseFPMAD() const {
- return UnsafeFPMath || LessPreciseFPMADOption;
-}
-
/// HonorSignDependentRoundingFPMath - Return true if the codegen must assume
/// that the rounding mode of the FPU can change from its default.
bool TargetOptions::HonorSignDependentRoundingFPMath() const {
diff --git a/lib/CodeGen/TargetPassConfig.cpp b/lib/CodeGen/TargetPassConfig.cpp
index e7ea2b4563f9..150195f5f85b 100644
--- a/lib/CodeGen/TargetPassConfig.cpp
+++ b/lib/CodeGen/TargetPassConfig.cpp
@@ -92,6 +92,9 @@ static cl::opt<bool> VerifyMachineCode("verify-machineinstrs", cl::Hidden,
cl::desc("Verify generated machine code"),
cl::init(false),
cl::ZeroOrMore);
+static cl::opt<bool> EnableMachineOutliner("enable-machine-outliner",
+ cl::Hidden,
+ cl::desc("Enable machine outliner"));
static cl::opt<std::string>
PrintMachineInstrs("print-machineinstrs", cl::ValueOptional,
@@ -261,7 +264,8 @@ TargetPassConfig::~TargetPassConfig() {
TargetPassConfig::TargetPassConfig(TargetMachine *tm, PassManagerBase &pm)
: ImmutablePass(ID), PM(&pm), Started(true), Stopped(false),
AddingMachinePasses(false), TM(tm), Impl(nullptr), Initialized(false),
- DisableVerify(false), EnableTailMerge(true) {
+ DisableVerify(false), EnableTailMerge(true),
+ RequireCodeGenSCCOrder(false) {
Impl = new PassConfigImpl();
@@ -279,6 +283,9 @@ TargetPassConfig::TargetPassConfig(TargetMachine *tm, PassManagerBase &pm)
if (StringRef(PrintMachineInstrs.getValue()).equals(""))
TM->Options.PrintMachineCode = true;
+
+ if (TM->Options.EnableIPRA)
+ setRequiresCodeGenSCCOrder();
}
CodeGenOpt::Level TargetPassConfig::getOptLevel() const {
@@ -531,7 +538,7 @@ void TargetPassConfig::addISelPrepare() {
addPreISel();
// Force codegen to run according to the callgraph.
- if (TM->Options.EnableIPRA)
+ if (requiresCodeGenSCCOrder())
addPass(new DummyCGSCCPass);
// Add both the safe stack and the stack protection passes: each of them will
@@ -668,9 +675,15 @@ void TargetPassConfig::addMachinePasses() {
addPass(&StackMapLivenessID, false);
addPass(&LiveDebugValuesID, false);
+ // Insert before XRay Instrumentation.
+ addPass(&FEntryInserterID, false);
+
addPass(&XRayInstrumentationID, false);
addPass(&PatchableFunctionID, false);
+ if (EnableMachineOutliner)
+ PM->add(createMachineOutlinerPass());
+
AddingMachinePasses = false;
}
@@ -704,6 +717,10 @@ void TargetPassConfig::addMachineSSAOptimization() {
addPass(&MachineLICMID, false);
addPass(&MachineCSEID, false);
+
+ // Coalesce basic blocks with the same branch condition
+ addPass(&BranchCoalescingID);
+
addPass(&MachineSinkingID);
addPass(&PeepholeOptimizerID);
@@ -730,7 +747,7 @@ MachinePassRegistry RegisterRegAlloc::Registry;
/// A dummy default pass factory indicates whether the register allocator is
/// overridden on the command line.
-LLVM_DEFINE_ONCE_FLAG(InitializeDefaultRegisterAllocatorFlag);
+static llvm::once_flag InitializeDefaultRegisterAllocatorFlag;
static FunctionPass *useDefaultRegisterAllocator() { return nullptr; }
static RegisterRegAlloc
defaultRegAlloc("default",
@@ -903,6 +920,11 @@ void TargetPassConfig::addBlockPlacement() {
//===---------------------------------------------------------------------===//
/// GlobalISel Configuration
//===---------------------------------------------------------------------===//
+
+bool TargetPassConfig::isGlobalISelEnabled() const {
+ return false;
+}
+
bool TargetPassConfig::isGlobalISelAbortEnabled() const {
return EnableGlobalISelAbort == 1;
}
diff --git a/lib/CodeGen/TargetRegisterInfo.cpp b/lib/CodeGen/TargetRegisterInfo.cpp
index cd50c5b6571d..66cdad278e8d 100644
--- a/lib/CodeGen/TargetRegisterInfo.cpp
+++ b/lib/CodeGen/TargetRegisterInfo.cpp
@@ -155,8 +155,7 @@ TargetRegisterInfo::getMinimalPhysRegClass(unsigned reg, MVT VT) const {
// Pick the most sub register class of the right type that contains
// this physreg.
const TargetRegisterClass* BestRC = nullptr;
- for (regclass_iterator I = regclass_begin(), E = regclass_end(); I != E; ++I){
- const TargetRegisterClass* RC = *I;
+ for (const TargetRegisterClass* RC : regclasses()) {
if ((VT == MVT::Other || RC->hasType(VT)) && RC->contains(reg) &&
(!BestRC || BestRC->hasSubClass(RC)))
BestRC = RC;
@@ -185,10 +184,9 @@ BitVector TargetRegisterInfo::getAllocatableSet(const MachineFunction &MF,
if (SubClass)
getAllocatableSetForRC(MF, SubClass, Allocatable);
} else {
- for (TargetRegisterInfo::regclass_iterator I = regclass_begin(),
- E = regclass_end(); I != E; ++I)
- if ((*I)->isAllocatable())
- getAllocatableSetForRC(MF, *I, Allocatable);
+ for (const TargetRegisterClass *C : regclasses())
+ if (C->isAllocatable())
+ getAllocatableSetForRC(MF, C, Allocatable);
}
// Mask out the reserved registers
@@ -415,9 +413,9 @@ bool TargetRegisterInfo::regmaskSubsetEqual(const uint32_t *mask0,
}
#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
-void
-TargetRegisterInfo::dumpReg(unsigned Reg, unsigned SubRegIndex,
- const TargetRegisterInfo *TRI) {
+LLVM_DUMP_METHOD
+void TargetRegisterInfo::dumpReg(unsigned Reg, unsigned SubRegIndex,
+ const TargetRegisterInfo *TRI) {
dbgs() << PrintReg(Reg, TRI, SubRegIndex) << "\n";
}
#endif
diff --git a/lib/CodeGen/TargetSchedule.cpp b/lib/CodeGen/TargetSchedule.cpp
index 83e52d335354..0df34ce43112 100644
--- a/lib/CodeGen/TargetSchedule.cpp
+++ b/lib/CodeGen/TargetSchedule.cpp
@@ -1,4 +1,4 @@
-//===-- llvm/Target/TargetSchedule.cpp - Sched Machine Model ----*- C++ -*-===//
+//===- llvm/Target/TargetSchedule.cpp - Sched Machine Model ---------------===//
//
// The LLVM Compiler Infrastructure
//
@@ -12,12 +12,22 @@
//
//===----------------------------------------------------------------------===//
+#include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/CodeGen/MachineInstr.h"
+#include "llvm/CodeGen/MachineOperand.h"
#include "llvm/CodeGen/TargetSchedule.h"
+#include "llvm/MC/MCInstrDesc.h"
+#include "llvm/MC/MCInstrItineraries.h"
+#include "llvm/MC/MCSchedule.h"
#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/ErrorHandling.h"
#include "llvm/Support/raw_ostream.h"
#include "llvm/Target/TargetInstrInfo.h"
#include "llvm/Target/TargetRegisterInfo.h"
#include "llvm/Target/TargetSubtargetInfo.h"
+#include <algorithm>
+#include <cassert>
+#include <cstdint>
using namespace llvm;
@@ -37,13 +47,14 @@ bool TargetSchedModel::hasInstrItineraries() const {
static unsigned gcd(unsigned Dividend, unsigned Divisor) {
// Dividend and Divisor will be naturally swapped as needed.
- while(Divisor) {
+ while (Divisor) {
unsigned Rem = Dividend % Divisor;
Dividend = Divisor;
Divisor = Rem;
};
return Dividend;
}
+
static unsigned lcm(unsigned A, unsigned B) {
unsigned LCM = (uint64_t(A) * B) / gcd(A, B);
assert((LCM >= A && LCM >= B) && "LCM overflow");
@@ -73,6 +84,29 @@ void TargetSchedModel::init(const MCSchedModel &sm,
}
}
+/// Returns true only if the instruction is specified as single-issue.
+bool TargetSchedModel::mustBeginGroup(const MachineInstr *MI,
+ const MCSchedClassDesc *SC) const {
+ if (hasInstrSchedModel()) {
+ if (!SC)
+ SC = resolveSchedClass(MI);
+ if (SC->isValid())
+ return SC->BeginGroup;
+ }
+ return false;
+}
+
+bool TargetSchedModel::mustEndGroup(const MachineInstr *MI,
+ const MCSchedClassDesc *SC) const {
+ if (hasInstrSchedModel()) {
+ if (!SC)
+ SC = resolveSchedClass(MI);
+ if (SC->isValid())
+ return SC->EndGroup;
+ }
+ return false;
+}
+
unsigned TargetSchedModel::getNumMicroOps(const MachineInstr *MI,
const MCSchedClassDesc *SC) const {
if (hasInstrItineraries()) {
@@ -100,7 +134,6 @@ static unsigned capLatency(int Cycles) {
/// evaluation of predicates that depend on instruction operands or flags.
const MCSchedClassDesc *TargetSchedModel::
resolveSchedClass(const MachineInstr *MI) const {
-
// Get the definition's scheduling class descriptor from this machine model.
unsigned SchedClass = MI->getDesc().getSchedClass();
const MCSchedClassDesc *SCDesc = SchedModel.getSchedClassDesc(SchedClass);
@@ -244,7 +277,11 @@ unsigned TargetSchedModel::computeInstrLatency(unsigned Opcode) const {
if (SCDesc->isValid() && !SCDesc->isVariant())
return computeInstrLatency(*SCDesc);
- llvm_unreachable("No MI sched latency");
+ if (SCDesc->isValid()) {
+ assert(!SCDesc->isVariant() && "No MI sched latency: SCDesc->isVariant()");
+ return computeInstrLatency(*SCDesc);
+ }
+ return 0;
}
unsigned
@@ -298,3 +335,68 @@ computeOutputLatency(const MachineInstr *DefMI, unsigned DefOperIdx,
}
return 0;
}
+
+static Optional<double>
+getRTroughputFromItineraries(unsigned schedClass,
+ const InstrItineraryData *IID) {
+ double Unknown = std::numeric_limits<double>::infinity();
+ double Throughput = Unknown;
+
+ for (const InstrStage *IS = IID->beginStage(schedClass),
+ *E = IID->endStage(schedClass);
+ IS != E; ++IS) {
+ unsigned Cycles = IS->getCycles();
+ if (!Cycles)
+ continue;
+ Throughput =
+ std::min(Throughput, countPopulation(IS->getUnits()) * 1.0 / Cycles);
+ }
+ // Callers want the reciprocal throughput, so return the inverse.
+ return 1 / Throughput;
+}
+
+static Optional<double>
+getRTroughputFromInstrSchedModel(const MCSchedClassDesc *SCDesc,
+ const TargetSubtargetInfo *STI,
+ const MCSchedModel &SchedModel) {
+ double Unknown = std::numeric_limits<double>::infinity();
+ double Throughput = Unknown;
+
+ for (const MCWriteProcResEntry *WPR = STI->getWriteProcResBegin(SCDesc),
+ *WEnd = STI->getWriteProcResEnd(SCDesc);
+ WPR != WEnd; ++WPR) {
+ unsigned Cycles = WPR->Cycles;
+ if (!Cycles)
+ return Optional<double>();
+
+ unsigned NumUnits =
+ SchedModel.getProcResource(WPR->ProcResourceIdx)->NumUnits;
+ Throughput = std::min(Throughput, NumUnits * 1.0 / Cycles);
+ }
+ // Callers want the reciprocal throughput, so return the inverse.
+ return 1 / Throughput;
+}
+
+Optional<double>
+TargetSchedModel::computeInstrRThroughput(const MachineInstr *MI) const {
+ if (hasInstrItineraries())
+ return getRTroughputFromItineraries(MI->getDesc().getSchedClass(),
+ getInstrItineraries());
+ if (hasInstrSchedModel())
+ return getRTroughputFromInstrSchedModel(resolveSchedClass(MI), STI,
+ SchedModel);
+ return Optional<double>();
+}
+
+Optional<double>
+TargetSchedModel::computeInstrRThroughput(unsigned Opcode) const {
+ unsigned SchedClass = TII->get(Opcode).getSchedClass();
+ if (hasInstrItineraries())
+ return getRTroughputFromItineraries(SchedClass, getInstrItineraries());
+ if (hasInstrSchedModel()) {
+ const MCSchedClassDesc *SCDesc = SchedModel.getSchedClassDesc(SchedClass);
+ if (SCDesc->isValid() && !SCDesc->isVariant())
+ return getRTroughputFromInstrSchedModel(SCDesc, STI, SchedModel);
+ }
+ return Optional<double>();
+}
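Both helpers compute the same quantity: for every resource the instruction occupies, the issue rate is NumUnits / Cycles, the bottleneck is the minimum rate across resources, and the returned value is its reciprocal. A worked example with illustrative numbers:

    // Two multiply units busy 3 cycles each and one store unit busy 1 cycle:
    // rates are 2/3 and 1/1; the bottleneck is min(2/3, 1) = 2/3, so the
    // reported reciprocal throughput is 1 / (2/3) = 1.5 cycles per instruction.
    double Throughput = std::min(2.0 / 3.0, 1.0 / 1.0);
    double RThroughput = 1.0 / Throughput; // 1.5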
diff --git a/lib/CodeGen/TargetSubtargetInfo.cpp b/lib/CodeGen/TargetSubtargetInfo.cpp
index c74707d95b9e..0a444e0fff07 100644
--- a/lib/CodeGen/TargetSubtargetInfo.cpp
+++ b/lib/CodeGen/TargetSubtargetInfo.cpp
@@ -11,6 +11,9 @@
//
//===----------------------------------------------------------------------===//
+#include "llvm/CodeGen/MachineInstr.h"
+#include "llvm/CodeGen/TargetSchedule.h"
+#include "llvm/Support/raw_ostream.h"
#include "llvm/Target/TargetSubtargetInfo.h"
using namespace llvm;
@@ -52,3 +55,46 @@ bool TargetSubtargetInfo::enablePostRAScheduler() const {
bool TargetSubtargetInfo::useAA() const {
return false;
}
+
+static std::string createSchedInfoStr(unsigned Latency,
+ Optional<double> RThroughput) {
+ static const char *SchedPrefix = " sched: [";
+ std::string Comment;
+ raw_string_ostream CS(Comment);
+ if (Latency > 0 && RThroughput.hasValue())
+ CS << SchedPrefix << Latency << format(":%2.2f", RThroughput.getValue())
+ << "]";
+ else if (Latency > 0)
+ CS << SchedPrefix << Latency << ":?]";
+ else if (RThroughput.hasValue())
+ CS << SchedPrefix << "?:" << RThroughput.getValue() << "]";
+ CS.flush();
+ return Comment;
+}
+
+/// Returns a string representation of the scheduler comment.
+std::string TargetSubtargetInfo::getSchedInfoStr(const MachineInstr &MI) const {
+ if (MI.isPseudo() || MI.isTerminator())
+ return std::string();
+ // We don't cache TSchedModel because it depends on TargetInstrInfo,
+ // which can change during compilation.
+ TargetSchedModel TSchedModel;
+ TSchedModel.init(getSchedModel(), this, getInstrInfo());
+ unsigned Latency = TSchedModel.computeInstrLatency(&MI);
+ Optional<double> RThroughput = TSchedModel.computeInstrRThroughput(&MI);
+ return createSchedInfoStr(Latency, RThroughput);
+}
+
+/// Returns a string representation of the scheduler comment.
+std::string TargetSubtargetInfo::getSchedInfoStr(MCInst const &MCI) const {
+ // We don't cache TSchedModel because it depends on TargetInstrInfo,
+ // which can change during compilation.
+ TargetSchedModel TSchedModel;
+ TSchedModel.init(getSchedModel(), this, getInstrInfo());
+ if (!TSchedModel.hasInstrSchedModel())
+ return std::string();
+ unsigned Latency = TSchedModel.computeInstrLatency(MCI.getOpcode());
+ Optional<double> RThroughput =
+ TSchedModel.computeInstrRThroughput(MCI.getOpcode());
+ return createSchedInfoStr(Latency, RThroughput);
+}
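Given the formatting in createSchedInfoStr, the comment attached to an instruction takes one of three shapes depending on which numbers are known; a sketch of the expected output (values illustrative):

    std::string Note = STI.getSchedInfoStr(MI);
    //   latency 4, rthroughput 0.50    ->  " sched: [4:0.50]"
    //   latency 4, rthroughput unknown ->  " sched: [4:?]"
    //   latency unknown, known rthroughput -> " sched: [?:<value>]", where the
    //   value uses raw_ostream's default double formatting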
diff --git a/lib/CodeGen/TwoAddressInstructionPass.cpp b/lib/CodeGen/TwoAddressInstructionPass.cpp
index 0f1b2ed994b7..75359fe3c0ea 100644
--- a/lib/CodeGen/TwoAddressInstructionPass.cpp
+++ b/lib/CodeGen/TwoAddressInstructionPass.cpp
@@ -905,7 +905,7 @@ rescheduleMIBelowKill(MachineBasicBlock::iterator &mi,
++End;
}
- // Check if the reschedule will not break depedencies.
+ // Check if the reschedule will not break dependencies.
unsigned NumVisited = 0;
MachineBasicBlock::iterator KillPos = KillMI;
++KillPos;
@@ -1785,7 +1785,7 @@ eliminateRegSequence(MachineBasicBlock::iterator &MBBI) {
MachineInstr *CopyMI = BuildMI(*MI.getParent(), MI, MI.getDebugLoc(),
TII->get(TargetOpcode::COPY))
.addReg(DstReg, RegState::Define, SubIdx)
- .addOperand(UseMO);
+ .add(UseMO);
// The first def needs an <undef> flag because there is no live register
// before it.
diff --git a/lib/CodeGen/VirtRegMap.cpp b/lib/CodeGen/VirtRegMap.cpp
index 0d506d646659..c8946010e9d1 100644
--- a/lib/CodeGen/VirtRegMap.cpp
+++ b/lib/CodeGen/VirtRegMap.cpp
@@ -167,6 +167,7 @@ class VirtRegRewriter : public MachineFunctionPass {
bool readsUndefSubreg(const MachineOperand &MO) const;
void addLiveInsForSubRanges(const LiveInterval &LI, unsigned PhysReg) const;
void handleIdentityCopy(MachineInstr &MI) const;
+ void expandCopyBundle(MachineInstr &MI) const;
public:
static char ID;
@@ -367,11 +368,41 @@ void VirtRegRewriter::handleIdentityCopy(MachineInstr &MI) const {
}
if (Indexes)
- Indexes->removeMachineInstrFromMaps(MI);
- MI.eraseFromParent();
+ Indexes->removeSingleMachineInstrFromMaps(MI);
+ MI.eraseFromBundle();
DEBUG(dbgs() << " deleted.\n");
}
+/// The live-range splitting logic sometimes produces bundles of copies when
+/// subregisters are involved. Expand these into a sequence of copy
+/// instructions after processing the last one in the bundle. This does not
+/// update LiveIntervals, which are no longer needed for these instructions.
+void VirtRegRewriter::expandCopyBundle(MachineInstr &MI) const {
+ if (!MI.isCopy())
+ return;
+
+ if (MI.isBundledWithPred() && !MI.isBundledWithSucc()) {
+ // Only do this when the complete bundle is made out of COPYs.
+ MachineBasicBlock &MBB = *MI.getParent();
+ for (MachineBasicBlock::reverse_instr_iterator I =
+ std::next(MI.getReverseIterator()), E = MBB.instr_rend();
+ I != E && I->isBundledWithSucc(); ++I) {
+ if (!I->isCopy())
+ return;
+ }
+
+ for (MachineBasicBlock::reverse_instr_iterator I = MI.getReverseIterator();
+ I->isBundledWithPred(); ) {
+ MachineInstr &MI = *I;
+ ++I;
+
+ MI.unbundleFromPred();
+ if (Indexes)
+ Indexes->insertMachineInstrInMaps(MI);
+ }
+ }
+}
+
void VirtRegRewriter::rewrite() {
bool NoSubRegLiveness = !MRI->subRegLivenessEnabled();
SmallVector<unsigned, 8> SuperDeads;
@@ -431,12 +462,14 @@ void VirtRegRewriter::rewrite() {
}
}
- // The <def,undef> flag only makes sense for sub-register defs, and
- // we are substituting a full physreg. An <imp-use,kill> operand
- // from the SuperKills list will represent the partial read of the
- // super-register.
- if (MO.isDef())
+ // The <def,undef> and <def,internal> flags only make sense for
+ // sub-register defs, and we are substituting a full physreg. An
+ // <imp-use,kill> operand from the SuperKills list will represent the
+ // partial read of the super-register.
+ if (MO.isDef()) {
MO.setIsUndef(false);
+ MO.setIsInternalRead(false);
+ }
// PhysReg operands cannot have subregister indexes.
PhysReg = TRI->getSubReg(PhysReg, SubReg);
@@ -461,6 +494,8 @@ void VirtRegRewriter::rewrite() {
DEBUG(dbgs() << "> " << *MI);
+ expandCopyBundle(*MI);
+
// We can remove identity copies right now.
handleIdentityCopy(*MI);
}
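The unbundling loop above is easy to misread because the iterator must advance before the bundle links are mutated; a restated sketch with the shadowed name removed:

    // Walk the bundle from its last instruction toward the front, detach each
    // copy from its predecessor, and give it back its own slot index.
    for (auto I = MI.getReverseIterator(); I->isBundledWithPred();) {
      MachineInstr &Cur = *I;
      ++I;                     // advance first; unbundling edits the links
      Cur.unbundleFromPred();
      if (Indexes)
        Indexes->insertMachineInstrInMaps(Cur);
    }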
diff --git a/lib/CodeGen/WinEHPrepare.cpp b/lib/CodeGen/WinEHPrepare.cpp
index 568720c66e55..ae07e8b2fa03 100644
--- a/lib/CodeGen/WinEHPrepare.cpp
+++ b/lib/CodeGen/WinEHPrepare.cpp
@@ -86,6 +86,7 @@ private:
// All fields are reset by runOnFunction.
EHPersonality Personality = EHPersonality::Unknown;
+ const DataLayout *DL = nullptr;
DenseMap<BasicBlock *, ColorVector> BlockColors;
MapVector<BasicBlock *, std::vector<BasicBlock *>> FuncletBlocks;
};
@@ -111,6 +112,7 @@ bool WinEHPrepare::runOnFunction(Function &Fn) {
if (!isFuncletEHPersonality(Personality))
return false;
+ DL = &Fn.getParent()->getDataLayout();
return prepareExplicitEH(Fn);
}
@@ -1070,7 +1072,7 @@ AllocaInst *WinEHPrepare::insertPHILoads(PHINode *PN, Function &F) {
if (!isa<TerminatorInst>(EHPad)) {
// If the EHPad isn't a terminator, then we can insert a load in this block
// that will dominate all uses.
- SpillSlot = new AllocaInst(PN->getType(), nullptr,
+ SpillSlot = new AllocaInst(PN->getType(), DL->getAllocaAddrSpace(), nullptr,
Twine(PN->getName(), ".wineh.spillslot"),
&F.getEntryBlock().front());
Value *V = new LoadInst(SpillSlot, Twine(PN->getName(), ".wineh.reload"),
@@ -1157,7 +1159,7 @@ void WinEHPrepare::replaceUseWithLoad(Value *V, Use &U, AllocaInst *&SpillSlot,
Function &F) {
// Lazily create the spill slot.
if (!SpillSlot)
- SpillSlot = new AllocaInst(V->getType(), nullptr,
+ SpillSlot = new AllocaInst(V->getType(), DL->getAllocaAddrSpace(), nullptr,
Twine(V->getName(), ".wineh.spillslot"),
&F.getEntryBlock().front());
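The AllocaInst constructor now takes the address space explicitly, which is why the pass caches the DataLayout; a minimal sketch of creating a spill slot under the signature used above (names illustrative):

    // Create a stack slot in the module's alloca address space.
    const DataLayout &DL = F.getParent()->getDataLayout();
    AllocaInst *Slot = new AllocaInst(Ty, DL.getAllocaAddrSpace(),
                                      /*ArraySize=*/nullptr, "spill.slot",
                                      &F.getEntryBlock().front());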
diff --git a/lib/CodeGen/XRayInstrumentation.cpp b/lib/CodeGen/XRayInstrumentation.cpp
index 63bd762eeb2b..7d2848bdc13b 100644
--- a/lib/CodeGen/XRayInstrumentation.cpp
+++ b/lib/CodeGen/XRayInstrumentation.cpp
@@ -81,7 +81,7 @@ void XRayInstrumentation::replaceRetWithPatchableRet(MachineFunction &MF,
auto MIB = BuildMI(MBB, T, T.getDebugLoc(), TII->get(Opc))
.addImm(T.getOpcode());
for (auto &MO : T.operands())
- MIB.addOperand(MO);
+ MIB.add(MO);
Terminators.push_back(&T);
}
}
@@ -157,6 +157,11 @@ bool XRayInstrumentation::runOnMachineFunction(MachineFunction &MF) {
case Triple::ArchType::arm:
case Triple::ArchType::thumb:
case Triple::ArchType::aarch64:
+ case Triple::ArchType::ppc64le:
+ case Triple::ArchType::mips:
+ case Triple::ArchType::mipsel:
+ case Triple::ArchType::mips64:
+ case Triple::ArchType::mips64el:
// For the architectures which don't have a single return instruction
prependRetWithPatchableExit(MF, TII);
break;
diff --git a/lib/DebugInfo/CodeView/CMakeLists.txt b/lib/DebugInfo/CodeView/CMakeLists.txt
index f9bff86b41c8..6e9214d72adc 100644
--- a/lib/DebugInfo/CodeView/CMakeLists.txt
+++ b/lib/DebugInfo/CodeView/CMakeLists.txt
@@ -5,16 +5,17 @@ add_llvm_library(LLVMDebugInfoCodeView
CVTypeDumper.cpp
CVTypeVisitor.cpp
EnumTables.cpp
+ Formatters.cpp
Line.cpp
ModuleSubstream.cpp
ModuleSubstreamVisitor.cpp
RecordSerialization.cpp
SymbolRecordMapping.cpp
SymbolDumper.cpp
+ SymbolSerializer.cpp
TypeDatabase.cpp
TypeDatabaseVisitor.cpp
TypeDumpVisitor.cpp
- TypeRecord.cpp
TypeRecordMapping.cpp
TypeSerializer.cpp
TypeStreamMerger.cpp
diff --git a/lib/DebugInfo/CodeView/CVSymbolVisitor.cpp b/lib/DebugInfo/CodeView/CVSymbolVisitor.cpp
index 75cfd0dd184e..4c78caf03477 100644
--- a/lib/DebugInfo/CodeView/CVSymbolVisitor.cpp
+++ b/lib/DebugInfo/CodeView/CVSymbolVisitor.cpp
@@ -11,20 +11,11 @@
#include "llvm/DebugInfo/CodeView/CodeViewError.h"
#include "llvm/DebugInfo/CodeView/SymbolVisitorCallbacks.h"
-#include "llvm/DebugInfo/MSF/ByteStream.h"
+#include "llvm/Support/BinaryByteStream.h"
using namespace llvm;
using namespace llvm::codeview;
-template <typename T>
-static Error takeObject(ArrayRef<uint8_t> &Data, const T *&Res) {
- if (Data.size() < sizeof(*Res))
- return llvm::make_error<CodeViewError>(cv_error_code::insufficient_buffer);
- Res = reinterpret_cast<const T *>(Data.data());
- Data = Data.drop_front(sizeof(*Res));
- return Error::success();
-}
-
CVSymbolVisitor::CVSymbolVisitor(SymbolVisitorCallbacks &Callbacks)
: Callbacks(Callbacks) {}
diff --git a/lib/DebugInfo/CodeView/CVTypeDumper.cpp b/lib/DebugInfo/CodeView/CVTypeDumper.cpp
index fcd239cce0dd..bcc8218d9446 100644
--- a/lib/DebugInfo/CodeView/CVTypeDumper.cpp
+++ b/lib/DebugInfo/CodeView/CVTypeDumper.cpp
@@ -14,7 +14,7 @@
#include "llvm/DebugInfo/CodeView/TypeDeserializer.h"
#include "llvm/DebugInfo/CodeView/TypeRecord.h"
#include "llvm/DebugInfo/CodeView/TypeVisitorCallbackPipeline.h"
-#include "llvm/DebugInfo/MSF/ByteStream.h"
+#include "llvm/Support/BinaryByteStream.h"
using namespace llvm;
using namespace llvm::codeview;
@@ -28,6 +28,8 @@ Error CVTypeDumper::dump(const CVType &Record, TypeVisitorCallbacks &Dumper) {
Pipeline.addCallbackToPipeline(Dumper);
CVTypeVisitor Visitor(Pipeline);
+ if (Handler)
+ Visitor.addTypeServerHandler(*Handler);
CVType RecordCopy = Record;
if (auto EC = Visitor.visitTypeRecord(RecordCopy))
@@ -45,6 +47,8 @@ Error CVTypeDumper::dump(const CVTypeArray &Types,
Pipeline.addCallbackToPipeline(Dumper);
CVTypeVisitor Visitor(Pipeline);
+ if (Handler)
+ Visitor.addTypeServerHandler(*Handler);
if (auto EC = Visitor.visitTypeStream(Types))
return EC;
@@ -52,9 +56,9 @@ Error CVTypeDumper::dump(const CVTypeArray &Types,
}
Error CVTypeDumper::dump(ArrayRef<uint8_t> Data, TypeVisitorCallbacks &Dumper) {
- msf::ByteStream Stream(Data);
+ BinaryByteStream Stream(Data, llvm::support::little);
CVTypeArray Types;
- msf::StreamReader Reader(Stream);
+ BinaryStreamReader Reader(Stream);
if (auto EC = Reader.readArray(Types, Reader.getLength()))
return EC;
diff --git a/lib/DebugInfo/CodeView/CVTypeVisitor.cpp b/lib/DebugInfo/CodeView/CVTypeVisitor.cpp
index 5171e24f3aac..0069ee3cc904 100644
--- a/lib/DebugInfo/CodeView/CVTypeVisitor.cpp
+++ b/lib/DebugInfo/CodeView/CVTypeVisitor.cpp
@@ -10,9 +10,14 @@
#include "llvm/DebugInfo/CodeView/CVTypeVisitor.h"
#include "llvm/DebugInfo/CodeView/CodeViewError.h"
+#include "llvm/DebugInfo/CodeView/TypeDatabase.h"
+#include "llvm/DebugInfo/CodeView/TypeDatabaseVisitor.h"
#include "llvm/DebugInfo/CodeView/TypeDeserializer.h"
+#include "llvm/DebugInfo/CodeView/TypeRecordMapping.h"
+#include "llvm/DebugInfo/CodeView/TypeServerHandler.h"
#include "llvm/DebugInfo/CodeView/TypeVisitorCallbackPipeline.h"
-#include "llvm/DebugInfo/MSF/ByteStream.h"
+#include "llvm/Support/BinaryByteStream.h"
+#include "llvm/Support/BinaryStreamReader.h"
using namespace llvm;
using namespace llvm::codeview;
@@ -21,7 +26,8 @@ CVTypeVisitor::CVTypeVisitor(TypeVisitorCallbacks &Callbacks)
: Callbacks(Callbacks) {}
template <typename T>
-static Error visitKnownRecord(CVType &Record, TypeVisitorCallbacks &Callbacks) {
+static Error visitKnownRecord(CVTypeVisitor &Visitor, CVType &Record,
+ TypeVisitorCallbacks &Callbacks) {
TypeRecordKind RK = static_cast<TypeRecordKind>(Record.Type);
T KnownRecord(RK);
if (auto EC = Callbacks.visitKnownRecord(Record, KnownRecord))
@@ -39,7 +45,58 @@ static Error visitKnownMember(CVMemberRecord &Record,
return Error::success();
}
+static Expected<TypeServer2Record> deserializeTypeServerRecord(CVType &Record) {
+ class StealTypeServerVisitor : public TypeVisitorCallbacks {
+ public:
+ explicit StealTypeServerVisitor(TypeServer2Record &TR) : TR(TR) {}
+
+ Error visitKnownRecord(CVType &CVR, TypeServer2Record &Record) override {
+ TR = Record;
+ return Error::success();
+ }
+
+ private:
+ TypeServer2Record &TR;
+ };
+
+ TypeServer2Record R(TypeRecordKind::TypeServer2);
+ TypeDeserializer Deserializer;
+ StealTypeServerVisitor Thief(R);
+ TypeVisitorCallbackPipeline Pipeline;
+ Pipeline.addCallbackToPipeline(Deserializer);
+ Pipeline.addCallbackToPipeline(Thief);
+ CVTypeVisitor Visitor(Pipeline);
+ if (auto EC = Visitor.visitTypeRecord(Record))
+ return std::move(EC);
+
+ return R;
+}
+
+void CVTypeVisitor::addTypeServerHandler(TypeServerHandler &Handler) {
+ Handlers.push_back(&Handler);
+}
+
Error CVTypeVisitor::visitTypeRecord(CVType &Record) {
+ if (Record.Type == TypeLeafKind::LF_TYPESERVER2 && !Handlers.empty()) {
+ auto TS = deserializeTypeServerRecord(Record);
+ if (!TS)
+ return TS.takeError();
+
+ for (auto Handler : Handlers) {
+ auto ExpectedResult = Handler->handle(*TS, Callbacks);
+ // If there was an error, return the error.
+ if (!ExpectedResult)
+ return ExpectedResult.takeError();
+
+ // If the handler processed the record, return success.
+ if (*ExpectedResult)
+ return Error::success();
+
+ // Otherwise keep searching for a handler, eventually falling out and
+ // using the default record handler.
+ }
+ }
+
if (auto EC = Callbacks.visitTypeBegin(Record))
return EC;
@@ -50,7 +107,7 @@ Error CVTypeVisitor::visitTypeRecord(CVType &Record) {
break;
#define TYPE_RECORD(EnumName, EnumVal, Name) \
case EnumName: { \
- if (auto EC = visitKnownRecord<Name##Record>(Record, Callbacks)) \
+ if (auto EC = visitKnownRecord<Name##Record>(*this, Record, Callbacks)) \
return EC; \
break; \
}
@@ -109,7 +166,15 @@ Error CVTypeVisitor::visitTypeStream(const CVTypeArray &Types) {
return Error::success();
}
-Error CVTypeVisitor::visitFieldListMemberStream(msf::StreamReader Reader) {
+Error CVTypeVisitor::visitTypeStream(CVTypeRange Types) {
+ for (auto I : Types) {
+ if (auto EC = visitTypeRecord(I))
+ return EC;
+ }
+ return Error::success();
+}
+
+Error CVTypeVisitor::visitFieldListMemberStream(BinaryStreamReader Reader) {
FieldListDeserializer Deserializer(Reader);
TypeVisitorCallbackPipeline Pipeline;
Pipeline.addCallbackToPipeline(Deserializer);
@@ -130,7 +195,7 @@ Error CVTypeVisitor::visitFieldListMemberStream(msf::StreamReader Reader) {
}
Error CVTypeVisitor::visitFieldListMemberStream(ArrayRef<uint8_t> Data) {
- msf::ByteStream S(Data);
- msf::StreamReader SR(S);
+ BinaryByteStream S(Data, llvm::support::little);
+ BinaryStreamReader SR(S);
return visitFieldListMemberStream(SR);
}
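Handlers registered via addTypeServerHandler are consulted in order: an error aborts the visit, true means the record was consumed, and false falls through to the next handler and finally to the default path. A hedged sketch of a minimal handler (the class is hypothetical; its signature is inferred from the call site above):

    // A do-nothing handler that always defers to the next one in the chain.
    class NullTypeServerHandler : public TypeServerHandler {
    public:
      Expected<bool> handle(TypeServer2Record &TS,
                            TypeVisitorCallbacks &Callbacks) override {
        return false; // not handled here
      }
    };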
diff --git a/lib/DebugInfo/CodeView/CodeViewError.cpp b/lib/DebugInfo/CodeView/CodeViewError.cpp
index 55c10c076eef..8de266b836b4 100644
--- a/lib/DebugInfo/CodeView/CodeViewError.cpp
+++ b/lib/DebugInfo/CodeView/CodeViewError.cpp
@@ -31,6 +31,8 @@ public:
"bytes.";
case cv_error_code::corrupt_record:
return "The CodeView record is corrupted.";
+ case cv_error_code::no_records:
+ return "There are no records";
case cv_error_code::operation_unsupported:
return "The requested operation is not supported.";
case cv_error_code::unknown_member_record:
diff --git a/lib/DebugInfo/CodeView/CodeViewRecordIO.cpp b/lib/DebugInfo/CodeView/CodeViewRecordIO.cpp
index 9bd85cf9dc68..282e3103adc9 100644
--- a/lib/DebugInfo/CodeView/CodeViewRecordIO.cpp
+++ b/lib/DebugInfo/CodeView/CodeViewRecordIO.cpp
@@ -10,8 +10,8 @@
#include "llvm/DebugInfo/CodeView/CodeViewRecordIO.h"
#include "llvm/DebugInfo/CodeView/CodeView.h"
#include "llvm/DebugInfo/CodeView/RecordSerialization.h"
-#include "llvm/DebugInfo/MSF/StreamReader.h"
-#include "llvm/DebugInfo/MSF/StreamWriter.h"
+#include "llvm/Support/BinaryStreamReader.h"
+#include "llvm/Support/BinaryStreamWriter.h"
using namespace llvm;
using namespace llvm::codeview;
@@ -145,10 +145,10 @@ Error CodeViewRecordIO::mapStringZ(StringRef &Value) {
if (isWriting()) {
// Truncate if we attempt to write too much.
StringRef S = Value.take_front(maxFieldLength() - 1);
- if (auto EC = Writer->writeZeroString(S))
+ if (auto EC = Writer->writeCString(S))
return EC;
} else {
- if (auto EC = Reader->readZeroString(Value))
+ if (auto EC = Reader->readCString(Value))
return EC;
}
return Error::success();
@@ -176,7 +176,7 @@ Error CodeViewRecordIO::mapStringZVectorZ(std::vector<StringRef> &Value) {
if (auto EC = mapStringZ(V))
return EC;
}
- if (auto EC = Writer->writeInteger(uint8_t(0)))
+ if (auto EC = Writer->writeInteger<uint8_t>(0))
return EC;
} else {
StringRef S;
@@ -194,22 +194,22 @@ Error CodeViewRecordIO::mapStringZVectorZ(std::vector<StringRef> &Value) {
Error CodeViewRecordIO::writeEncodedSignedInteger(const int64_t &Value) {
assert(Value < 0 && "Encoded integer is not signed!");
if (Value >= std::numeric_limits<int8_t>::min()) {
- if (auto EC = Writer->writeInteger(static_cast<uint16_t>(LF_CHAR)))
+ if (auto EC = Writer->writeInteger<uint16_t>(LF_CHAR))
return EC;
- if (auto EC = Writer->writeInteger(static_cast<int8_t>(Value)))
+ if (auto EC = Writer->writeInteger<int8_t>(Value))
return EC;
} else if (Value >= std::numeric_limits<int16_t>::min()) {
- if (auto EC = Writer->writeInteger(static_cast<uint16_t>(LF_SHORT)))
+ if (auto EC = Writer->writeInteger<uint16_t>(LF_SHORT))
return EC;
- if (auto EC = Writer->writeInteger(static_cast<int16_t>(Value)))
+ if (auto EC = Writer->writeInteger<int16_t>(Value))
return EC;
} else if (Value >= std::numeric_limits<int32_t>::min()) {
- if (auto EC = Writer->writeInteger(static_cast<uint16_t>(LF_LONG)))
+ if (auto EC = Writer->writeInteger<uint16_t>(LF_LONG))
return EC;
- if (auto EC = Writer->writeInteger(static_cast<int32_t>(Value)))
+ if (auto EC = Writer->writeInteger<int32_t>(Value))
return EC;
} else {
- if (auto EC = Writer->writeInteger(static_cast<uint16_t>(LF_QUADWORD)))
+ if (auto EC = Writer->writeInteger<uint16_t>(LF_QUADWORD))
return EC;
if (auto EC = Writer->writeInteger(Value))
return EC;
@@ -219,20 +219,20 @@ Error CodeViewRecordIO::writeEncodedSignedInteger(const int64_t &Value) {
Error CodeViewRecordIO::writeEncodedUnsignedInteger(const uint64_t &Value) {
if (Value < LF_NUMERIC) {
- if (auto EC = Writer->writeInteger(static_cast<uint16_t>(Value)))
+ if (auto EC = Writer->writeInteger<uint16_t>(Value))
return EC;
} else if (Value <= std::numeric_limits<uint16_t>::max()) {
- if (auto EC = Writer->writeInteger(static_cast<uint16_t>(LF_USHORT)))
+ if (auto EC = Writer->writeInteger<uint16_t>(LF_USHORT))
return EC;
- if (auto EC = Writer->writeInteger(static_cast<uint16_t>(Value)))
+ if (auto EC = Writer->writeInteger<uint16_t>(Value))
return EC;
} else if (Value <= std::numeric_limits<uint32_t>::max()) {
- if (auto EC = Writer->writeInteger(static_cast<uint16_t>(LF_ULONG)))
+ if (auto EC = Writer->writeInteger<uint16_t>(LF_ULONG))
return EC;
- if (auto EC = Writer->writeInteger(static_cast<uint32_t>(Value)))
+ if (auto EC = Writer->writeInteger<uint32_t>(Value))
return EC;
} else {
- if (auto EC = Writer->writeInteger(static_cast<uint16_t>(LF_UQUADWORD)))
+ if (auto EC = Writer->writeInteger<uint16_t>(LF_UQUADWORD))
return EC;
if (auto EC = Writer->writeInteger(Value))
return EC;
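Taken together, the two functions above implement the CodeView numeric-leaf encoding: values below LF_NUMERIC are stored inline, larger ones get a width marker followed by the value. A sketch of the resulting layouts, with hypothetical values:

    // writeEncodedUnsignedInteger:
    //   17     -> [0x0011]                       (value < LF_NUMERIC, stored directly)
    //   40000  -> [LF_USHORT][uint16_t 40000]
    //   70000  -> [LF_ULONG][uint32_t 70000]
    //   2^40   -> [LF_UQUADWORD][uint64_t 2^40]
    // writeEncodedSignedInteger (Value is always negative, per the assert):
    //   -5     -> [LF_CHAR][int8_t -5]
    //   -300   -> [LF_SHORT][int16_t -300]
    //   -70000 -> [LF_LONG][int32_t -70000]
    //   else   -> [LF_QUADWORD][int64_t Value]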
diff --git a/lib/DebugInfo/CodeView/Formatters.cpp b/lib/DebugInfo/CodeView/Formatters.cpp
new file mode 100644
index 000000000000..ef00bd8570fa
--- /dev/null
+++ b/lib/DebugInfo/CodeView/Formatters.cpp
@@ -0,0 +1,37 @@
+//===- Formatters.cpp -------------------------------------------*- C++ -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/DebugInfo/CodeView/Formatters.h"
+
+using namespace llvm;
+using namespace llvm::codeview;
+using namespace llvm::codeview::detail;
+
+GuidAdapter::GuidAdapter(StringRef Guid)
+ : FormatAdapter(makeArrayRef(Guid.bytes_begin(), Guid.bytes_end())) {}
+
+GuidAdapter::GuidAdapter(ArrayRef<uint8_t> Guid)
+ : FormatAdapter(std::move(Guid)) {}
+
+void GuidAdapter::format(llvm::raw_ostream &Stream, StringRef Style) {
+ static const char *Lookup = "0123456789ABCDEF";
+
+ assert(Item.size() == 16 && "Expected 16-byte GUID");
+ Stream << "{";
+ for (int i = 0; i < 16;) {
+ uint8_t Byte = Item[i];
+ uint8_t HighNibble = (Byte >> 4) & 0xF;
+ uint8_t LowNibble = Byte & 0xF;
+ Stream << Lookup[HighNibble] << Lookup[LowNibble];
+ ++i;
+ if (i >= 4 && i <= 10 && i % 2 == 0)
+ Stream << "-";
+ }
+ Stream << "}";
+}
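The dash placement above (after the 4th, 6th, 8th, and 10th bytes) yields the conventional 8-4-4-4-12 GUID grouping. A minimal usage sketch with hypothetical bytes, going through the fmt_guid adapter this file backs (see its use in TypeDumpVisitor.cpp later in this diff):

    // fmt_guid and formatv come from Formatters.h and FormatVariadic.h.
    uint8_t Bytes[16] = {0x3F, 0x25, 0x04, 0xE0, 0x4F, 0x89, 0x41, 0xD3,
                         0x9A, 0x0C, 0x03, 0x05, 0xE8, 0x2C, 0x33, 0x01};
    // Prints "{3F2504E0-4F89-41D3-9A0C-0305E82C3301}".
    llvm::outs() << llvm::formatv("{0}", fmt_guid(llvm::makeArrayRef(Bytes, 16)));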
diff --git a/lib/DebugInfo/CodeView/ModuleSubstream.cpp b/lib/DebugInfo/CodeView/ModuleSubstream.cpp
index 768ebaa1c980..69a7c59116cf 100644
--- a/lib/DebugInfo/CodeView/ModuleSubstream.cpp
+++ b/lib/DebugInfo/CodeView/ModuleSubstream.cpp
@@ -9,22 +9,20 @@
#include "llvm/DebugInfo/CodeView/ModuleSubstream.h"
-#include "llvm/DebugInfo/MSF/StreamReader.h"
+#include "llvm/Support/BinaryStreamReader.h"
using namespace llvm;
using namespace llvm::codeview;
-using namespace llvm::msf;
ModuleSubstream::ModuleSubstream() : Kind(ModuleSubstreamKind::None) {}
-ModuleSubstream::ModuleSubstream(ModuleSubstreamKind Kind,
- ReadableStreamRef Data)
+ModuleSubstream::ModuleSubstream(ModuleSubstreamKind Kind, BinaryStreamRef Data)
: Kind(Kind), Data(Data) {}
-Error ModuleSubstream::initialize(ReadableStreamRef Stream,
+Error ModuleSubstream::initialize(BinaryStreamRef Stream,
ModuleSubstream &Info) {
const ModuleSubsectionHeader *Header;
- StreamReader Reader(Stream);
+ BinaryStreamReader Reader(Stream);
if (auto EC = Reader.readObject(Header))
return EC;
@@ -42,4 +40,4 @@ uint32_t ModuleSubstream::getRecordLength() const {
ModuleSubstreamKind ModuleSubstream::getSubstreamKind() const { return Kind; }
-ReadableStreamRef ModuleSubstream::getRecordData() const { return Data; }
+BinaryStreamRef ModuleSubstream::getRecordData() const { return Data; }
diff --git a/lib/DebugInfo/CodeView/ModuleSubstreamVisitor.cpp b/lib/DebugInfo/CodeView/ModuleSubstreamVisitor.cpp
index 524793277980..e490a78cadbc 100644
--- a/lib/DebugInfo/CodeView/ModuleSubstreamVisitor.cpp
+++ b/lib/DebugInfo/CodeView/ModuleSubstreamVisitor.cpp
@@ -8,54 +8,52 @@
//===----------------------------------------------------------------------===//
#include "llvm/DebugInfo/CodeView/ModuleSubstreamVisitor.h"
-#include "llvm/DebugInfo/MSF/StreamReader.h"
-#include "llvm/DebugInfo/MSF/StreamRef.h"
+#include "llvm/Support/BinaryStreamReader.h"
+#include "llvm/Support/BinaryStreamRef.h"
using namespace llvm;
using namespace llvm::codeview;
-using namespace llvm::msf;
-Error IModuleSubstreamVisitor::visitSymbols(ReadableStreamRef Data) {
+Error IModuleSubstreamVisitor::visitSymbols(BinaryStreamRef Data) {
return visitUnknown(ModuleSubstreamKind::Symbols, Data);
}
-Error IModuleSubstreamVisitor::visitLines(ReadableStreamRef Data,
+Error IModuleSubstreamVisitor::visitLines(BinaryStreamRef Data,
const LineSubstreamHeader *Header,
const LineInfoArray &Lines) {
return visitUnknown(ModuleSubstreamKind::Lines, Data);
}
-Error IModuleSubstreamVisitor::visitStringTable(ReadableStreamRef Data) {
+Error IModuleSubstreamVisitor::visitStringTable(BinaryStreamRef Data) {
return visitUnknown(ModuleSubstreamKind::StringTable, Data);
}
Error IModuleSubstreamVisitor::visitFileChecksums(
- ReadableStreamRef Data, const FileChecksumArray &Checksums) {
+ BinaryStreamRef Data, const FileChecksumArray &Checksums) {
return visitUnknown(ModuleSubstreamKind::FileChecksums, Data);
}
-Error IModuleSubstreamVisitor::visitFrameData(ReadableStreamRef Data) {
+Error IModuleSubstreamVisitor::visitFrameData(BinaryStreamRef Data) {
return visitUnknown(ModuleSubstreamKind::FrameData, Data);
}
-Error IModuleSubstreamVisitor::visitInlineeLines(ReadableStreamRef Data) {
+Error IModuleSubstreamVisitor::visitInlineeLines(BinaryStreamRef Data) {
return visitUnknown(ModuleSubstreamKind::InlineeLines, Data);
}
-Error IModuleSubstreamVisitor::visitCrossScopeImports(ReadableStreamRef Data) {
+Error IModuleSubstreamVisitor::visitCrossScopeImports(BinaryStreamRef Data) {
return visitUnknown(ModuleSubstreamKind::CrossScopeExports, Data);
}
-Error IModuleSubstreamVisitor::visitCrossScopeExports(ReadableStreamRef Data) {
+Error IModuleSubstreamVisitor::visitCrossScopeExports(BinaryStreamRef Data) {
return visitUnknown(ModuleSubstreamKind::CrossScopeImports, Data);
}
-Error IModuleSubstreamVisitor::visitILLines(ReadableStreamRef Data) {
+Error IModuleSubstreamVisitor::visitILLines(BinaryStreamRef Data) {
return visitUnknown(ModuleSubstreamKind::ILLines, Data);
}
-Error IModuleSubstreamVisitor::visitFuncMDTokenMap(ReadableStreamRef Data) {
+Error IModuleSubstreamVisitor::visitFuncMDTokenMap(BinaryStreamRef Data) {
return visitUnknown(ModuleSubstreamKind::FuncMDTokenMap, Data);
}
-Error IModuleSubstreamVisitor::visitTypeMDTokenMap(ReadableStreamRef Data) {
+Error IModuleSubstreamVisitor::visitTypeMDTokenMap(BinaryStreamRef Data) {
return visitUnknown(ModuleSubstreamKind::TypeMDTokenMap, Data);
}
-Error IModuleSubstreamVisitor::visitMergedAssemblyInput(
- ReadableStreamRef Data) {
+Error IModuleSubstreamVisitor::visitMergedAssemblyInput(BinaryStreamRef Data) {
return visitUnknown(ModuleSubstreamKind::MergedAssemblyInput, Data);
}
-Error IModuleSubstreamVisitor::visitCoffSymbolRVA(ReadableStreamRef Data) {
+Error IModuleSubstreamVisitor::visitCoffSymbolRVA(BinaryStreamRef Data) {
return visitUnknown(ModuleSubstreamKind::CoffSymbolRVA, Data);
}
@@ -65,7 +63,7 @@ Error llvm::codeview::visitModuleSubstream(const ModuleSubstream &R,
case ModuleSubstreamKind::Symbols:
return V.visitSymbols(R.getRecordData());
case ModuleSubstreamKind::Lines: {
- StreamReader Reader(R.getRecordData());
+ BinaryStreamReader Reader(R.getRecordData());
const LineSubstreamHeader *Header;
if (auto EC = Reader.readObject(Header))
return EC;
@@ -78,7 +76,7 @@ Error llvm::codeview::visitModuleSubstream(const ModuleSubstream &R,
case ModuleSubstreamKind::StringTable:
return V.visitStringTable(R.getRecordData());
case ModuleSubstreamKind::FileChecksums: {
- StreamReader Reader(R.getRecordData());
+ BinaryStreamReader Reader(R.getRecordData());
FileChecksumArray Checksums;
if (auto EC = Reader.readArray(Checksums, Reader.bytesRemaining()))
return EC;
diff --git a/lib/DebugInfo/CodeView/RecordSerialization.cpp b/lib/DebugInfo/CodeView/RecordSerialization.cpp
index 6f29caa9bbfc..6446670f60d8 100644
--- a/lib/DebugInfo/CodeView/RecordSerialization.cpp
+++ b/lib/DebugInfo/CodeView/RecordSerialization.cpp
@@ -16,7 +16,7 @@
#include "llvm/ADT/APSInt.h"
#include "llvm/DebugInfo/CodeView/CodeViewError.h"
#include "llvm/DebugInfo/CodeView/TypeRecord.h"
-#include "llvm/DebugInfo/MSF/ByteStream.h"
+#include "llvm/Support/BinaryByteStream.h"
using namespace llvm;
using namespace llvm::codeview;
@@ -33,7 +33,7 @@ StringRef llvm::codeview::getBytesAsCString(ArrayRef<uint8_t> LeafData) {
return getBytesAsCharacters(LeafData).split('\0').first;
}
-Error llvm::codeview::consume(msf::StreamReader &Reader, APSInt &Num) {
+Error llvm::codeview::consume(BinaryStreamReader &Reader, APSInt &Num) {
// Used to avoid overload ambiguity on APInt constructor.
bool FalseVal = false;
uint16_t Short;
@@ -103,15 +103,15 @@ Error llvm::codeview::consume(msf::StreamReader &Reader, APSInt &Num) {
Error llvm::codeview::consume(StringRef &Data, APSInt &Num) {
ArrayRef<uint8_t> Bytes(Data.bytes_begin(), Data.bytes_end());
- msf::ByteStream S(Bytes);
- msf::StreamReader SR(S);
+ BinaryByteStream S(Bytes, llvm::support::little);
+ BinaryStreamReader SR(S);
auto EC = consume(SR, Num);
Data = Data.take_back(SR.bytesRemaining());
return EC;
}
/// Decode a numeric leaf value that is known to be a uint64_t.
-Error llvm::codeview::consume_numeric(msf::StreamReader &Reader,
+Error llvm::codeview::consume_numeric(BinaryStreamReader &Reader,
uint64_t &Num) {
APSInt N;
if (auto EC = consume(Reader, N))
@@ -123,27 +123,27 @@ Error llvm::codeview::consume_numeric(msf::StreamReader &Reader,
return Error::success();
}
-Error llvm::codeview::consume(msf::StreamReader &Reader, uint32_t &Item) {
+Error llvm::codeview::consume(BinaryStreamReader &Reader, uint32_t &Item) {
return Reader.readInteger(Item);
}
Error llvm::codeview::consume(StringRef &Data, uint32_t &Item) {
ArrayRef<uint8_t> Bytes(Data.bytes_begin(), Data.bytes_end());
- msf::ByteStream S(Bytes);
- msf::StreamReader SR(S);
+ BinaryByteStream S(Bytes, llvm::support::little);
+ BinaryStreamReader SR(S);
auto EC = consume(SR, Item);
Data = Data.take_back(SR.bytesRemaining());
return EC;
}
-Error llvm::codeview::consume(msf::StreamReader &Reader, int32_t &Item) {
+Error llvm::codeview::consume(BinaryStreamReader &Reader, int32_t &Item) {
return Reader.readInteger(Item);
}
-Error llvm::codeview::consume(msf::StreamReader &Reader, StringRef &Item) {
+Error llvm::codeview::consume(BinaryStreamReader &Reader, StringRef &Item) {
if (Reader.empty())
return make_error<CodeViewError>(cv_error_code::corrupt_record,
"Null terminated string buffer is empty!");
- return Reader.readZeroString(Item);
+ return Reader.readCString(Item);
}
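The StringRef overloads above share one pattern: wrap the bytes in a BinaryByteStream, consume through a BinaryStreamReader, then shrink Data to the unread tail via take_back(bytesRemaining()). A usage sketch with a hypothetical buffer:

    // Two-byte buffer holding the unsigned numeric leaf 18 (< LF_NUMERIC, so it
    // is stored directly as a little-endian uint16_t).
    StringRef Data("\x12\x00", 2);
    APSInt Num;
    if (auto EC = llvm::codeview::consume(Data, Num))
      return EC;   // hypothetical caller context
    // Num == 18, and Data is now empty: both bytes were consumed.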
diff --git a/lib/DebugInfo/CodeView/SymbolDumper.cpp b/lib/DebugInfo/CodeView/SymbolDumper.cpp
index fd54fba13c76..134471e81cac 100644
--- a/lib/DebugInfo/CodeView/SymbolDumper.cpp
+++ b/lib/DebugInfo/CodeView/SymbolDumper.cpp
@@ -468,8 +468,8 @@ Error CVSymbolDumperImpl::visitKnownRecord(CVSymbol &CVR,
for (auto &Annotation : InlineSite.annotations()) {
switch (Annotation.OpCode) {
case BinaryAnnotationsOpCode::Invalid:
- return llvm::make_error<CodeViewError>(
- "Invalid binary annotation opcode!");
+ W.printString("(Annotation Padding)");
+ break;
case BinaryAnnotationsOpCode::CodeOffset:
case BinaryAnnotationsOpCode::ChangeCodeOffset:
case BinaryAnnotationsOpCode::ChangeCodeLength:
diff --git a/lib/DebugInfo/CodeView/SymbolSerializer.cpp b/lib/DebugInfo/CodeView/SymbolSerializer.cpp
new file mode 100644
index 000000000000..251cc431f52b
--- /dev/null
+++ b/lib/DebugInfo/CodeView/SymbolSerializer.cpp
@@ -0,0 +1,52 @@
+//===- SymbolSerializer.cpp -------------------------------------*- C++ -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/DebugInfo/CodeView/SymbolSerializer.h"
+
+using namespace llvm;
+using namespace llvm::codeview;
+
+SymbolSerializer::SymbolSerializer(BumpPtrAllocator &Allocator)
+ : Storage(Allocator), RecordBuffer(MaxRecordLength), Stream(RecordBuffer, llvm::support::little),
+ Writer(Stream), Mapping(Writer) { }
+
+Error SymbolSerializer::visitSymbolBegin(CVSymbol &Record) {
+ assert(!CurrentSymbol.hasValue() && "Already in a symbol mapping!");
+
+ Writer.setOffset(0);
+
+ if (auto EC = writeRecordPrefix(Record.kind()))
+ return EC;
+
+ CurrentSymbol = Record.kind();
+ if (auto EC = Mapping.visitSymbolBegin(Record))
+ return EC;
+
+ return Error::success();
+}
+
+Error SymbolSerializer::visitSymbolEnd(CVSymbol &Record) {
+ assert(CurrentSymbol.hasValue() && "Not in a symbol mapping!");
+
+ if (auto EC = Mapping.visitSymbolEnd(Record))
+ return EC;
+
+ uint32_t RecordEnd = Writer.getOffset();
+ uint16_t Length = RecordEnd - 2;
+ Writer.setOffset(0);
+ if (auto EC = Writer.writeInteger(Length))
+ return EC;
+
+ uint8_t *StableStorage = Storage.Allocate<uint8_t>(RecordEnd);
+ ::memcpy(StableStorage, &RecordBuffer[0], RecordEnd);
+ Record.RecordData = ArrayRef<uint8_t>(StableStorage, RecordEnd);
+ CurrentSymbol.reset();
+
+ return Error::success();
+}
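The Length arithmetic in visitSymbolEnd follows the CodeView record layout: every symbol record begins with a uint16_t length that counts the bytes after the length field itself, so the serializer writes the prefix last, once the final size is known. A worked example with a hypothetical 24-byte record:

    // [Len:2][Kind:2][payload:20]   RecordEnd == 24 after the mapping finishes.
    // Writer.setOffset(0); writeInteger(Length) back-patches Len = 24 - 2 = 22,
    // i.e. the record size excluding the length field itself.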
diff --git a/lib/DebugInfo/CodeView/TypeDatabase.cpp b/lib/DebugInfo/CodeView/TypeDatabase.cpp
index c7f72551dc8b..f9ded6ce2a86 100644
--- a/lib/DebugInfo/CodeView/TypeDatabase.cpp
+++ b/lib/DebugInfo/CodeView/TypeDatabase.cpp
@@ -71,7 +71,7 @@ TypeIndex TypeDatabase::getNextTypeIndex() const {
}
/// Records the name of a type, and reserves its type index.
-void TypeDatabase::recordType(StringRef Name, CVType Data) {
+void TypeDatabase::recordType(StringRef Name, const CVType &Data) {
CVUDTNames.push_back(Name);
TypeRecords.push_back(Data);
}
@@ -106,6 +106,10 @@ StringRef TypeDatabase::getTypeName(TypeIndex Index) const {
return "<unknown UDT>";
}
+const CVType &TypeDatabase::getTypeRecord(TypeIndex Index) const {
+ return TypeRecords[Index.getIndex() - TypeIndex::FirstNonSimpleIndex];
+}
+
bool TypeDatabase::containsTypeIndex(TypeIndex Index) const {
uint32_t I = Index.getIndex() - TypeIndex::FirstNonSimpleIndex;
return I < CVUDTNames.size();
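The subtraction of TypeIndex::FirstNonSimpleIndex appears throughout this patch: type indices below 0x1000 denote builtin ("simple") types with no record of their own, so the record tables begin at 0x1000. For illustration:

    // FirstNonSimpleIndex == 0x1000, so:
    //   TypeIndex 0x1000 -> TypeRecords[0] / CVUDTNames[0]
    //   TypeIndex 0x1234 -> TypeRecords[0x234]
    // Simple indices (e.g. 0x0074, 32-bit int) never reach these tables.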
diff --git a/lib/DebugInfo/CodeView/TypeDatabaseVisitor.cpp b/lib/DebugInfo/CodeView/TypeDatabaseVisitor.cpp
index d9d563902182..c234afd2288b 100644
--- a/lib/DebugInfo/CodeView/TypeDatabaseVisitor.cpp
+++ b/lib/DebugInfo/CodeView/TypeDatabaseVisitor.cpp
@@ -83,6 +83,22 @@ Error TypeDatabaseVisitor::visitKnownRecord(CVType &CVR, ArgListRecord &Args) {
return Error::success();
}
+Error TypeDatabaseVisitor::visitKnownRecord(CVType &CVR,
+ StringListRecord &Strings) {
+ auto Indices = Strings.getIndices();
+ uint32_t Size = Indices.size();
+ SmallString<256> TypeName("\"");
+ for (uint32_t I = 0; I < Size; ++I) {
+ StringRef ArgTypeName = TypeDB.getTypeName(Indices[I]);
+ TypeName.append(ArgTypeName);
+ if (I + 1 != Size)
+ TypeName.append("\" \"");
+ }
+ TypeName.push_back('\"');
+ Name = TypeDB.saveTypeName(TypeName);
+ return Error::success();
+}
+
Error TypeDatabaseVisitor::visitKnownRecord(CVType &CVR, ClassRecord &Class) {
Name = Class.getName();
return Error::success();
@@ -283,6 +299,10 @@ Error TypeDatabaseVisitor::visitKnownRecord(CVType &CVR, BuildInfoRecord &BI) {
return Error::success();
}
+Error TypeDatabaseVisitor::visitKnownRecord(CVType &CVR, LabelRecord &R) {
+ return Error::success();
+}
+
Error TypeDatabaseVisitor::visitKnownMember(CVMemberRecord &CVR,
VFPtrRecord &VFP) {
return Error::success();
diff --git a/lib/DebugInfo/CodeView/TypeDumpVisitor.cpp b/lib/DebugInfo/CodeView/TypeDumpVisitor.cpp
index 033585ba8cc9..870d95221e7d 100644
--- a/lib/DebugInfo/CodeView/TypeDumpVisitor.cpp
+++ b/lib/DebugInfo/CodeView/TypeDumpVisitor.cpp
@@ -1,5 +1,4 @@
-//===-- TypeDumpVisitor.cpp - CodeView type info dumper -----------*- C++
-//-*-===//
+//===-- TypeDumpVisitor.cpp - CodeView type info dumper ----------*- C++-*-===//
//
// The LLVM Compiler Infrastructure
//
@@ -13,13 +12,15 @@
#include "llvm/ADT/SmallString.h"
#include "llvm/DebugInfo/CodeView/CVTypeDumper.h"
#include "llvm/DebugInfo/CodeView/CVTypeVisitor.h"
+#include "llvm/DebugInfo/CodeView/Formatters.h"
#include "llvm/DebugInfo/CodeView/TypeDatabase.h"
#include "llvm/DebugInfo/CodeView/TypeDatabaseVisitor.h"
#include "llvm/DebugInfo/CodeView/TypeDeserializer.h"
#include "llvm/DebugInfo/CodeView/TypeIndex.h"
#include "llvm/DebugInfo/CodeView/TypeRecord.h"
#include "llvm/DebugInfo/CodeView/TypeVisitorCallbackPipeline.h"
-#include "llvm/DebugInfo/MSF/ByteStream.h"
+#include "llvm/Support/BinaryByteStream.h"
+#include "llvm/Support/FormatVariadic.h"
#include "llvm/Support/ScopedPrinter.h"
using namespace llvm;
@@ -145,6 +146,10 @@ static const EnumEntry<uint8_t> FunctionOptionEnum[] = {
ENUM_ENTRY(FunctionOptions, ConstructorWithVirtualBases),
};
+static const EnumEntry<uint16_t> LabelTypeEnum[] = {
+ ENUM_ENTRY(LabelType, Near), ENUM_ENTRY(LabelType, Far),
+};
+
#undef ENUM_ENTRY
static StringRef getLeafTypeName(TypeLeafKind LT) {
@@ -163,9 +168,14 @@ void TypeDumpVisitor::printTypeIndex(StringRef FieldName, TypeIndex TI) const {
CVTypeDumper::printTypeIndex(*W, FieldName, TI, TypeDB);
}
+void TypeDumpVisitor::printItemIndex(StringRef FieldName, TypeIndex TI) const {
+ CVTypeDumper::printTypeIndex(*W, FieldName, TI, getSourceDB());
+}
+
Error TypeDumpVisitor::visitTypeBegin(CVType &Record) {
W->startLine() << getLeafTypeName(Record.Type);
- W->getOStream() << " (" << HexNumber(TypeDB.getNextTypeIndex().getIndex())
+ W->getOStream() << " ("
+ << HexNumber(getSourceDB().getNextTypeIndex().getIndex())
<< ")";
W->getOStream() << " {\n";
W->indent();
@@ -211,7 +221,7 @@ Error TypeDumpVisitor::visitKnownRecord(CVType &CVR,
}
Error TypeDumpVisitor::visitKnownRecord(CVType &CVR, StringIdRecord &String) {
- printTypeIndex("Id", String.getId());
+ printItemIndex("Id", String.getId());
W->printString("StringData", String.getString());
return Error::success();
}
@@ -227,6 +237,17 @@ Error TypeDumpVisitor::visitKnownRecord(CVType &CVR, ArgListRecord &Args) {
return Error::success();
}
+Error TypeDumpVisitor::visitKnownRecord(CVType &CVR, StringListRecord &Strs) {
+ auto Indices = Strs.getIndices();
+ uint32_t Size = Indices.size();
+ W->printNumber("NumStrings", Size);
+ ListScope Arguments(*W, "Strings");
+ for (uint32_t I = 0; I < Size; ++I) {
+ printTypeIndex("String", Indices[I]);
+ }
+ return Error::success();
+}
+
Error TypeDumpVisitor::visitKnownRecord(CVType &CVR, ClassRecord &Class) {
uint16_t Props = static_cast<uint16_t>(Class.getOptions());
W->printNumber("MemberCount", Class.getMemberCount());
@@ -329,14 +350,14 @@ Error TypeDumpVisitor::visitKnownRecord(CVType &CVR,
}
Error TypeDumpVisitor::visitKnownRecord(CVType &CVR, FuncIdRecord &Func) {
- printTypeIndex("ParentScope", Func.getParentScope());
+ printItemIndex("ParentScope", Func.getParentScope());
printTypeIndex("FunctionType", Func.getFunctionType());
W->printString("Name", Func.getName());
return Error::success();
}
Error TypeDumpVisitor::visitKnownRecord(CVType &CVR, TypeServer2Record &TS) {
- W->printBinary("Signature", TS.getGuid());
+ W->printString("Guid", formatv("{0}", fmt_guid(TS.getGuid())).str());
W->printNumber("Age", TS.getAge());
W->printString("Name", TS.getName());
return Error::success();
@@ -390,7 +411,7 @@ Error TypeDumpVisitor::visitKnownRecord(CVType &CVR,
Error TypeDumpVisitor::visitKnownRecord(CVType &CVR,
UdtSourceLineRecord &Line) {
printTypeIndex("UDT", Line.getUDT());
- printTypeIndex("SourceFile", Line.getSourceFile());
+ printItemIndex("SourceFile", Line.getSourceFile());
W->printNumber("LineNumber", Line.getLineNumber());
return Error::success();
}
@@ -398,7 +419,7 @@ Error TypeDumpVisitor::visitKnownRecord(CVType &CVR,
Error TypeDumpVisitor::visitKnownRecord(CVType &CVR,
UdtModSourceLineRecord &Line) {
printTypeIndex("UDT", Line.getUDT());
- printTypeIndex("SourceFile", Line.getSourceFile());
+ printItemIndex("SourceFile", Line.getSourceFile());
W->printNumber("LineNumber", Line.getLineNumber());
W->printNumber("Module", Line.getModule());
return Error::success();
@@ -409,7 +430,7 @@ Error TypeDumpVisitor::visitKnownRecord(CVType &CVR, BuildInfoRecord &Args) {
ListScope Arguments(*W, "Arguments");
for (auto Arg : Args.getArgs()) {
- printTypeIndex("ArgType", Arg);
+ printItemIndex("ArgType", Arg);
}
return Error::success();
}
@@ -530,3 +551,8 @@ Error TypeDumpVisitor::visitKnownMember(CVMemberRecord &CVR,
printTypeIndex("ContinuationIndex", Cont.getContinuationIndex());
return Error::success();
}
+
+Error TypeDumpVisitor::visitKnownRecord(CVType &CVR, LabelRecord &LR) {
+ W->printEnum("Mode", uint16_t(LR.Mode), makeArrayRef(LabelTypeEnum));
+ return Error::success();
+}
diff --git a/lib/DebugInfo/CodeView/TypeRecord.cpp b/lib/DebugInfo/CodeView/TypeRecord.cpp
deleted file mode 100644
index b951c068ca86..000000000000
--- a/lib/DebugInfo/CodeView/TypeRecord.cpp
+++ /dev/null
@@ -1,213 +0,0 @@
-//===-- TypeRecord.cpp ------------------------------------------*- C++ -*-===//
-//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
-//
-//===----------------------------------------------------------------------===//
-
-#include "llvm/DebugInfo/CodeView/TypeRecord.h"
-#include "llvm/DebugInfo/CodeView/RecordSerialization.h"
-#include "llvm/DebugInfo/CodeView/TypeIndex.h"
-#include "llvm/DebugInfo/MSF/ByteStream.h"
-#include "llvm/DebugInfo/MSF/StreamReader.h"
-
-using namespace llvm;
-using namespace llvm::codeview;
-
-//===----------------------------------------------------------------------===//
-// Type index remapping
-//===----------------------------------------------------------------------===//
-
-static bool remapIndex(ArrayRef<TypeIndex> IndexMap, TypeIndex &Idx) {
- // Simple types are unchanged.
- if (Idx.isSimple())
- return true;
- unsigned MapPos = Idx.getIndex() - TypeIndex::FirstNonSimpleIndex;
- if (MapPos < IndexMap.size()) {
- Idx = IndexMap[MapPos];
- return true;
- }
-
- // This type index is invalid. Remap this to "not translated by cvpack",
- // and return failure.
- Idx = TypeIndex(SimpleTypeKind::NotTranslated, SimpleTypeMode::Direct);
- return false;
-}
-
-bool ModifierRecord::remapTypeIndices(ArrayRef<TypeIndex> IndexMap) {
- return remapIndex(IndexMap, ModifiedType);
-}
-
-bool ProcedureRecord::remapTypeIndices(ArrayRef<TypeIndex> IndexMap) {
- bool Success = true;
- Success &= remapIndex(IndexMap, ReturnType);
- Success &= remapIndex(IndexMap, ArgumentList);
- return Success;
-}
-
-bool MemberFunctionRecord::remapTypeIndices(ArrayRef<TypeIndex> IndexMap) {
- bool Success = true;
- Success &= remapIndex(IndexMap, ReturnType);
- Success &= remapIndex(IndexMap, ClassType);
- Success &= remapIndex(IndexMap, ThisType);
- Success &= remapIndex(IndexMap, ArgumentList);
- return Success;
-}
-
-bool MemberFuncIdRecord::remapTypeIndices(ArrayRef<TypeIndex> IndexMap) {
- bool Success = true;
- Success &= remapIndex(IndexMap, ClassType);
- Success &= remapIndex(IndexMap, FunctionType);
- return Success;
-}
-
-bool ArgListRecord::remapTypeIndices(ArrayRef<TypeIndex> IndexMap) {
- bool Success = true;
- for (TypeIndex &Str : StringIndices)
- Success &= remapIndex(IndexMap, Str);
- return Success;
-}
-
-bool MemberPointerInfo::remapTypeIndices(ArrayRef<TypeIndex> IndexMap) {
- return remapIndex(IndexMap, ContainingType);
-}
-
-bool PointerRecord::remapTypeIndices(ArrayRef<TypeIndex> IndexMap) {
- bool Success = true;
- Success &= remapIndex(IndexMap, ReferentType);
- if (isPointerToMember())
- Success &= MemberInfo->remapTypeIndices(IndexMap);
- return Success;
-}
-
-bool NestedTypeRecord::remapTypeIndices(ArrayRef<TypeIndex> IndexMap) {
- return remapIndex(IndexMap, Type);
-}
-
-bool ArrayRecord::remapTypeIndices(ArrayRef<TypeIndex> IndexMap) {
- bool Success = true;
- Success &= remapIndex(IndexMap, ElementType);
- Success &= remapIndex(IndexMap, IndexType);
- return Success;
-}
-
-bool TagRecord::remapTypeIndices(ArrayRef<TypeIndex> IndexMap) {
- return remapIndex(IndexMap, FieldList);
-}
-
-bool ClassRecord::remapTypeIndices(ArrayRef<TypeIndex> IndexMap) {
- bool Success = true;
- Success &= TagRecord::remapTypeIndices(IndexMap);
- Success &= remapIndex(IndexMap, DerivationList);
- Success &= remapIndex(IndexMap, VTableShape);
- return Success;
-}
-
-bool EnumRecord::remapTypeIndices(ArrayRef<TypeIndex> IndexMap) {
- bool Success = true;
- Success &= TagRecord::remapTypeIndices(IndexMap);
- Success &= remapIndex(IndexMap, UnderlyingType);
- return Success;
-}
-
-bool BitFieldRecord::remapTypeIndices(ArrayRef<TypeIndex> IndexMap) {
- return remapIndex(IndexMap, Type);
-}
-
-bool VFTableShapeRecord::remapTypeIndices(ArrayRef<TypeIndex> IndexMap) {
- return true;
-}
-
-bool TypeServer2Record::remapTypeIndices(ArrayRef<TypeIndex> IndexMap) {
- return true;
-}
-
-bool StringIdRecord::remapTypeIndices(ArrayRef<TypeIndex> IndexMap) {
- return remapIndex(IndexMap, Id);
-}
-
-bool FuncIdRecord::remapTypeIndices(ArrayRef<TypeIndex> IndexMap) {
- bool Success = true;
- Success &= remapIndex(IndexMap, ParentScope);
- Success &= remapIndex(IndexMap, FunctionType);
- return Success;
-}
-
-bool UdtSourceLineRecord::remapTypeIndices(ArrayRef<TypeIndex> IndexMap) {
- bool Success = true;
- Success &= remapIndex(IndexMap, UDT);
- Success &= remapIndex(IndexMap, SourceFile);
- return Success;
-}
-
-bool UdtModSourceLineRecord::remapTypeIndices(ArrayRef<TypeIndex> IndexMap) {
- bool Success = true;
- Success &= remapIndex(IndexMap, UDT);
- Success &= remapIndex(IndexMap, SourceFile);
- return Success;
-}
-
-bool BuildInfoRecord::remapTypeIndices(ArrayRef<TypeIndex> IndexMap) {
- bool Success = true;
- for (TypeIndex &Arg : ArgIndices)
- Success &= remapIndex(IndexMap, Arg);
- return Success;
-}
-
-bool VFTableRecord::remapTypeIndices(ArrayRef<TypeIndex> IndexMap) {
- bool Success = true;
- Success &= remapIndex(IndexMap, CompleteClass);
- Success &= remapIndex(IndexMap, OverriddenVFTable);
- return Success;
-}
-
-bool OneMethodRecord::remapTypeIndices(ArrayRef<TypeIndex> IndexMap) {
- bool Success = true;
- Success &= remapIndex(IndexMap, Type);
- return Success;
-}
-
-bool MethodOverloadListRecord::remapTypeIndices(ArrayRef<TypeIndex> IndexMap) {
- bool Success = true;
- for (OneMethodRecord &Meth : Methods)
- if ((Success = Meth.remapTypeIndices(IndexMap)))
- return Success;
- return Success;
-}
-
-bool OverloadedMethodRecord::remapTypeIndices(ArrayRef<TypeIndex> IndexMap) {
- return remapIndex(IndexMap, MethodList);
-}
-
-bool DataMemberRecord::remapTypeIndices(ArrayRef<TypeIndex> IndexMap) {
- return remapIndex(IndexMap, Type);
-}
-
-bool StaticDataMemberRecord::remapTypeIndices(ArrayRef<TypeIndex> IndexMap) {
- return remapIndex(IndexMap, Type);
-}
-
-bool EnumeratorRecord::remapTypeIndices(ArrayRef<TypeIndex> IndexMap) {
- return true;
-}
-
-bool VFPtrRecord::remapTypeIndices(ArrayRef<TypeIndex> IndexMap) {
- return remapIndex(IndexMap, Type);
-}
-
-bool BaseClassRecord::remapTypeIndices(ArrayRef<TypeIndex> IndexMap) {
- return remapIndex(IndexMap, Type);
-}
-
-bool VirtualBaseClassRecord::remapTypeIndices(ArrayRef<TypeIndex> IndexMap) {
- bool Success = true;
- Success &= remapIndex(IndexMap, BaseType);
- Success &= remapIndex(IndexMap, VBPtrType);
- return Success;
-}
-
-bool ListContinuationRecord::remapTypeIndices(ArrayRef<TypeIndex> IndexMap) {
- return remapIndex(IndexMap, ContinuationIndex);
-}
diff --git a/lib/DebugInfo/CodeView/TypeRecordMapping.cpp b/lib/DebugInfo/CodeView/TypeRecordMapping.cpp
index f46e08d55429..114f6fd2897e 100644
--- a/lib/DebugInfo/CodeView/TypeRecordMapping.cpp
+++ b/lib/DebugInfo/CodeView/TypeRecordMapping.cpp
@@ -67,12 +67,9 @@ static Error mapNameAndUniqueName(CodeViewRecordIO &IO, StringRef &Name,
error(IO.mapStringZ(N));
error(IO.mapStringZ(U));
} else {
- size_t BytesNeeded = Name.size() + 1;
- StringRef N = Name;
- if (BytesNeeded > BytesLeft) {
- size_t BytesToDrop = std::min(N.size(), BytesToDrop);
- N = N.drop_back(BytesToDrop);
- }
+      // Cap the length of the string at however many bytes we have available,
+      // leaving one byte for the required null terminator.
+ auto N = StringRef(Name).take_front(BytesLeft - 1);
error(IO.mapStringZ(N));
}
} else {
@@ -174,6 +171,15 @@ Error TypeRecordMapping::visitKnownRecord(CVType &CVR,
Error TypeRecordMapping::visitKnownRecord(CVType &CVR, ArgListRecord &Record) {
error(IO.mapVectorN<uint32_t>(
+ Record.ArgIndices,
+ [](CodeViewRecordIO &IO, TypeIndex &N) { return IO.mapInteger(N); }));
+
+ return Error::success();
+}
+
+Error TypeRecordMapping::visitKnownRecord(CVType &CVR,
+ StringListRecord &Record) {
+ error(IO.mapVectorN<uint32_t>(
Record.StringIndices,
[](CodeViewRecordIO &IO, TypeIndex &N) { return IO.mapInteger(N); }));
@@ -368,6 +374,14 @@ Error TypeRecordMapping::visitKnownRecord(CVType &CVR,
Error TypeRecordMapping::visitKnownRecord(CVType &CVR,
TypeServer2Record &Record) {
+ error(IO.mapGuid(Record.Guid));
+ error(IO.mapInteger(Record.Age));
+ error(IO.mapStringZ(Record.Name));
+ return Error::success();
+}
+
+Error TypeRecordMapping::visitKnownRecord(CVType &CVR, LabelRecord &Record) {
+ error(IO.mapEnum(Record.Mode));
return Error::success();
}
diff --git a/lib/DebugInfo/CodeView/TypeSerializer.cpp b/lib/DebugInfo/CodeView/TypeSerializer.cpp
index f24fcff86274..fd4d1853fa54 100644
--- a/lib/DebugInfo/CodeView/TypeSerializer.cpp
+++ b/lib/DebugInfo/CodeView/TypeSerializer.cpp
@@ -9,7 +9,7 @@
#include "llvm/DebugInfo/CodeView/TypeSerializer.h"
-#include "llvm/DebugInfo/MSF/StreamWriter.h"
+#include "llvm/Support/BinaryStreamWriter.h"
#include <string.h>
@@ -85,7 +85,8 @@ TypeSerializer::addPadding(MutableArrayRef<uint8_t> Record) {
TypeSerializer::TypeSerializer(BumpPtrAllocator &Storage)
: RecordStorage(Storage), LastTypeIndex(),
- RecordBuffer(MaxRecordLength * 2), Stream(RecordBuffer), Writer(Stream),
+ RecordBuffer(MaxRecordLength * 2),
+ Stream(RecordBuffer, llvm::support::little), Writer(Stream),
Mapping(Writer) {
// RecordBuffer needs to be able to hold enough data so that if we are 1
// byte short of MaxRecordLen, and then we try to write MaxRecordLen bytes,
@@ -203,15 +204,15 @@ Error TypeSerializer::visitMemberEnd(CVMemberRecord &Record) {
uint8_t *SegmentBytes = RecordStorage.Allocate<uint8_t>(LengthWithSize);
auto SavedSegment = MutableArrayRef<uint8_t>(SegmentBytes, LengthWithSize);
- msf::MutableByteStream CS(SavedSegment);
- msf::StreamWriter CW(CS);
+ MutableBinaryByteStream CS(SavedSegment, llvm::support::little);
+ BinaryStreamWriter CW(CS);
if (auto EC = CW.writeBytes(CopyData))
return EC;
if (auto EC = CW.writeEnum(TypeLeafKind::LF_INDEX))
return EC;
- if (auto EC = CW.writeInteger(uint16_t(0)))
+ if (auto EC = CW.writeInteger<uint16_t>(0))
return EC;
- if (auto EC = CW.writeInteger(uint32_t(0xB0C0B0C0)))
+ if (auto EC = CW.writeInteger<uint32_t>(0xB0C0B0C0))
return EC;
FieldListSegments.push_back(SavedSegment);
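For orientation, the tail written above appends a continuation marker to a full field-list segment. In write order the layout is sketched below; 0xB0C0B0C0 reads as a deliberately recognizable placeholder for the continuation type index, presumably patched once the next segment's index is known (an inference from this hunk, not stated in it):

    // [copied member data][LF_INDEX : uint16][pad = 0 : uint16][0xB0C0B0C0 : uint32]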
diff --git a/lib/DebugInfo/CodeView/TypeStreamMerger.cpp b/lib/DebugInfo/CodeView/TypeStreamMerger.cpp
index ed6cf5743a12..aad20ae6dda1 100644
--- a/lib/DebugInfo/CodeView/TypeStreamMerger.cpp
+++ b/lib/DebugInfo/CodeView/TypeStreamMerger.cpp
@@ -52,12 +52,19 @@ namespace {
/// - If the type record already exists in the destination stream, discard it
/// and update the type index map to forward the source type index to the
/// existing destination type index.
+///
+/// As an additional complication, type stream merging actually produces two
+/// streams: an item (or IPI) stream and a type stream, as this is what is
+/// actually stored in the final PDB. We choose which records go where by
+/// looking at the record kind.
class TypeStreamMerger : public TypeVisitorCallbacks {
public:
- TypeStreamMerger(TypeTableBuilder &DestStream)
- : DestStream(DestStream), FieldListBuilder(DestStream) {
- assert(!hadError());
- }
+ TypeStreamMerger(TypeTableBuilder &DestIdStream,
+ TypeTableBuilder &DestTypeStream, TypeServerHandler *Handler)
+ : DestIdStream(DestIdStream), DestTypeStream(DestTypeStream),
+ FieldListBuilder(DestTypeStream), Handler(Handler) {}
+
+ static const TypeIndex Untranslated;
/// TypeVisitorCallbacks overrides.
#define TYPE_RECORD(EnumName, EnumVal, Name) \
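The IPI-versus-type-stream split described in the comment above is visible further down in this diff: the visitKnownRecord overloads route FuncId, MemberFuncId, StringId, StringList, BuildInfo, UdtSourceLine, and UdtModSourceLine records through writeIdRecord, and everything else through writeRecord. A hypothetical helper expressing the same routing by leaf kind:

    // Sketch only; the patch dispatches via overloads rather than a switch.
    static bool isIdRecord(TypeLeafKind K) {
      switch (K) {
      case LF_FUNC_ID: case LF_MFUNC_ID: case LF_STRING_ID:
      case LF_SUBSTR_LIST: case LF_BUILDINFO:
      case LF_UDT_SRC_LINE: case LF_UDT_MOD_SRC_LINE:
        return true;    // goes to the item (IPI) stream
      default:
        return false;   // goes to the type (TPI) stream
      }
    }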
@@ -74,42 +81,65 @@ public:
Error visitTypeEnd(CVType &Record) override;
Error visitMemberEnd(CVMemberRecord &Record) override;
- bool mergeStream(const CVTypeArray &Types);
+ Error mergeStream(const CVTypeArray &Types);
private:
+ void addMapping(TypeIndex Idx);
+
+ bool remapIndex(TypeIndex &Idx);
+
+ size_t slotForIndex(TypeIndex Idx) const {
+ assert(!Idx.isSimple() && "simple type indices have no slots");
+ return Idx.getIndex() - TypeIndex::FirstNonSimpleIndex;
+ }
+
+ Error errorCorruptRecord() const {
+ return llvm::make_error<CodeViewError>(cv_error_code::corrupt_record);
+ }
+
template <typename RecordType>
- Error visitKnownRecordImpl(RecordType &Record) {
- FoundBadTypeIndex |= !Record.remapTypeIndices(IndexMap);
- IndexMap.push_back(DestStream.writeKnownType(Record));
+ Error writeRecord(RecordType &R, bool RemapSuccess) {
+ TypeIndex DestIdx = Untranslated;
+ if (RemapSuccess)
+ DestIdx = DestTypeStream.writeKnownType(R);
+ addMapping(DestIdx);
return Error::success();
}
- Error visitKnownRecordImpl(FieldListRecord &Record) {
- CVTypeVisitor Visitor(*this);
-
- if (auto EC = Visitor.visitFieldListMemberStream(Record.Data))
- return EC;
+ template <typename RecordType>
+ Error writeIdRecord(RecordType &R, bool RemapSuccess) {
+ TypeIndex DestIdx = Untranslated;
+ if (RemapSuccess)
+ DestIdx = DestIdStream.writeKnownType(R);
+ addMapping(DestIdx);
return Error::success();
}
template <typename RecordType>
- Error visitKnownMemberRecordImpl(RecordType &Record) {
- FoundBadTypeIndex |= !Record.remapTypeIndices(IndexMap);
- FieldListBuilder.writeMemberType(Record);
+ Error writeMember(RecordType &R, bool RemapSuccess) {
+ if (RemapSuccess)
+ FieldListBuilder.writeMemberType(R);
+ else
+ HadUntranslatedMember = true;
return Error::success();
}
- bool hadError() { return FoundBadTypeIndex; }
+ Optional<Error> LastError;
+
+ bool IsSecondPass = false;
+
+ bool HadUntranslatedMember = false;
- bool FoundBadTypeIndex = false;
+ unsigned NumBadIndices = 0;
BumpPtrAllocator Allocator;
- TypeTableBuilder &DestStream;
+ TypeTableBuilder &DestIdStream;
+ TypeTableBuilder &DestTypeStream;
FieldListRecordBuilder FieldListBuilder;
+ TypeServerHandler *Handler;
- bool IsInFieldList{false};
- size_t BeginIndexMapSize = 0;
+ TypeIndex CurIndex{TypeIndex::FirstNonSimpleIndex};
/// Map from source type index to destination type index. Indexed by source
/// type index minus 0x1000.
@@ -118,70 +148,346 @@ private:
} // end anonymous namespace
+const TypeIndex TypeStreamMerger::Untranslated(SimpleTypeKind::NotTranslated);
+
Error TypeStreamMerger::visitTypeBegin(CVRecord<TypeLeafKind> &Rec) {
- if (Rec.Type == TypeLeafKind::LF_FIELDLIST) {
- assert(!IsInFieldList);
- IsInFieldList = true;
- FieldListBuilder.begin();
- } else
- BeginIndexMapSize = IndexMap.size();
return Error::success();
}
Error TypeStreamMerger::visitTypeEnd(CVRecord<TypeLeafKind> &Rec) {
- if (Rec.Type == TypeLeafKind::LF_FIELDLIST) {
- TypeIndex Index = FieldListBuilder.end();
- IndexMap.push_back(Index);
- IsInFieldList = false;
- }
+ CurIndex = TypeIndex(CurIndex.getIndex() + 1);
+ if (!IsSecondPass)
+ assert(IndexMap.size() == slotForIndex(CurIndex) &&
+ "visitKnownRecord should add one index map entry");
return Error::success();
}
Error TypeStreamMerger::visitMemberEnd(CVMemberRecord &Rec) {
- assert(IndexMap.size() == BeginIndexMapSize + 1);
return Error::success();
}
-#define TYPE_RECORD(EnumName, EnumVal, Name) \
- Error TypeStreamMerger::visitKnownRecord(CVType &CVR, \
- Name##Record &Record) { \
- return visitKnownRecordImpl(Record); \
+void TypeStreamMerger::addMapping(TypeIndex Idx) {
+ if (!IsSecondPass) {
+ assert(IndexMap.size() == slotForIndex(CurIndex) &&
+ "visitKnownRecord should add one index map entry");
+ IndexMap.push_back(Idx);
+ } else {
+ assert(slotForIndex(CurIndex) < IndexMap.size());
+ IndexMap[slotForIndex(CurIndex)] = Idx;
}
-#define TYPE_RECORD_ALIAS(EnumName, EnumVal, Name, AliasName)
-#define MEMBER_RECORD(EnumName, EnumVal, Name) \
- Error TypeStreamMerger::visitKnownMember(CVMemberRecord &CVR, \
- Name##Record &Record) { \
- return visitKnownMemberRecordImpl(Record); \
+}
+
+bool TypeStreamMerger::remapIndex(TypeIndex &Idx) {
+ // Simple types are unchanged.
+ if (Idx.isSimple())
+ return true;
+
+ // Check if this type index refers to a record we've already translated
+ // successfully. If it refers to a type later in the stream or a record we
+  // had to defer, defer it until a later pass.
+ unsigned MapPos = slotForIndex(Idx);
+ if (MapPos < IndexMap.size() && IndexMap[MapPos] != Untranslated) {
+ Idx = IndexMap[MapPos];
+ return true;
}
-#define MEMBER_RECORD_ALIAS(EnumName, EnumVal, Name, AliasName)
-#include "llvm/DebugInfo/CodeView/TypeRecords.def"
+
+ // If this is the second pass and this index isn't in the map, then it points
+ // outside the current type stream, and this is a corrupt record.
+ if (IsSecondPass && MapPos >= IndexMap.size()) {
+ // FIXME: Print a more useful error. We can give the current record and the
+    // index that we think it's pointing to.
+ LastError = joinErrors(std::move(*LastError), errorCorruptRecord());
+ }
+
+ ++NumBadIndices;
+
+ // This type index is invalid. Remap this to "not translated by cvpack",
+ // and return failure.
+ Idx = Untranslated;
+ return false;
+}
+
+//----------------------------------------------------------------------------//
+// Item records
+//----------------------------------------------------------------------------//
+
+Error TypeStreamMerger::visitKnownRecord(CVType &, FuncIdRecord &R) {
+ bool Success = true;
+ Success &= remapIndex(R.ParentScope);
+ Success &= remapIndex(R.FunctionType);
+ return writeIdRecord(R, Success);
+}
+
+Error TypeStreamMerger::visitKnownRecord(CVType &, MemberFuncIdRecord &R) {
+ bool Success = true;
+ Success &= remapIndex(R.ClassType);
+ Success &= remapIndex(R.FunctionType);
+ return writeIdRecord(R, Success);
+}
+
+Error TypeStreamMerger::visitKnownRecord(CVType &, StringIdRecord &R) {
+ return writeIdRecord(R, remapIndex(R.Id));
+}
+
+Error TypeStreamMerger::visitKnownRecord(CVType &, StringListRecord &R) {
+ bool Success = true;
+ for (TypeIndex &Str : R.StringIndices)
+ Success &= remapIndex(Str);
+ return writeIdRecord(R, Success);
+}
+
+Error TypeStreamMerger::visitKnownRecord(CVType &, BuildInfoRecord &R) {
+ bool Success = true;
+ for (TypeIndex &Arg : R.ArgIndices)
+ Success &= remapIndex(Arg);
+ return writeIdRecord(R, Success);
+}
+
+Error TypeStreamMerger::visitKnownRecord(CVType &, UdtSourceLineRecord &R) {
+ bool Success = true;
+ Success &= remapIndex(R.UDT);
+ Success &= remapIndex(R.SourceFile);
+ // FIXME: Translate UdtSourceLineRecord into UdtModSourceLineRecords in the
+ // IPI stream.
+ return writeIdRecord(R, Success);
+}
+
+Error TypeStreamMerger::visitKnownRecord(CVType &, UdtModSourceLineRecord &R) {
+ bool Success = true;
+ Success &= remapIndex(R.UDT);
+ Success &= remapIndex(R.SourceFile);
+ return writeIdRecord(R, Success);
+}
+
+//----------------------------------------------------------------------------//
+// Type records
+//----------------------------------------------------------------------------//
+
+Error TypeStreamMerger::visitKnownRecord(CVType &, ModifierRecord &R) {
+ return writeRecord(R, remapIndex(R.ModifiedType));
+}
+
+Error TypeStreamMerger::visitKnownRecord(CVType &, ProcedureRecord &R) {
+ bool Success = true;
+ Success &= remapIndex(R.ReturnType);
+ Success &= remapIndex(R.ArgumentList);
+ return writeRecord(R, Success);
+}
+
+Error TypeStreamMerger::visitKnownRecord(CVType &, MemberFunctionRecord &R) {
+ bool Success = true;
+ Success &= remapIndex(R.ReturnType);
+ Success &= remapIndex(R.ClassType);
+ Success &= remapIndex(R.ThisType);
+ Success &= remapIndex(R.ArgumentList);
+ return writeRecord(R, Success);
+}
+
+Error TypeStreamMerger::visitKnownRecord(CVType &Type, ArgListRecord &R) {
+ bool Success = true;
+ for (TypeIndex &Arg : R.ArgIndices)
+ Success &= remapIndex(Arg);
+ if (auto EC = writeRecord(R, Success))
+ return EC;
+ return Error::success();
+}
+
+Error TypeStreamMerger::visitKnownRecord(CVType &, PointerRecord &R) {
+ bool Success = true;
+ Success &= remapIndex(R.ReferentType);
+ if (R.isPointerToMember())
+ Success &= remapIndex(R.MemberInfo->ContainingType);
+ return writeRecord(R, Success);
+}
+
+Error TypeStreamMerger::visitKnownRecord(CVType &, ArrayRecord &R) {
+ bool Success = true;
+ Success &= remapIndex(R.ElementType);
+ Success &= remapIndex(R.IndexType);
+ return writeRecord(R, Success);
+}
+
+Error TypeStreamMerger::visitKnownRecord(CVType &, ClassRecord &R) {
+ bool Success = true;
+ Success &= remapIndex(R.FieldList);
+ Success &= remapIndex(R.DerivationList);
+ Success &= remapIndex(R.VTableShape);
+ return writeRecord(R, Success);
+}
+
+Error TypeStreamMerger::visitKnownRecord(CVType &, UnionRecord &R) {
+ return writeRecord(R, remapIndex(R.FieldList));
+}
+
+Error TypeStreamMerger::visitKnownRecord(CVType &, EnumRecord &R) {
+ bool Success = true;
+ Success &= remapIndex(R.FieldList);
+ Success &= remapIndex(R.UnderlyingType);
+ return writeRecord(R, Success);
+}
+
+Error TypeStreamMerger::visitKnownRecord(CVType &, BitFieldRecord &R) {
+ return writeRecord(R, remapIndex(R.Type));
+}
+
+Error TypeStreamMerger::visitKnownRecord(CVType &, VFTableShapeRecord &R) {
+ return writeRecord(R, true);
+}
+
+Error TypeStreamMerger::visitKnownRecord(CVType &, TypeServer2Record &R) {
+ return writeRecord(R, true);
+}
+
+Error TypeStreamMerger::visitKnownRecord(CVType &, LabelRecord &R) {
+ return writeRecord(R, true);
+}
+
+Error TypeStreamMerger::visitKnownRecord(CVType &, VFTableRecord &R) {
+ bool Success = true;
+ Success &= remapIndex(R.CompleteClass);
+ Success &= remapIndex(R.OverriddenVFTable);
+ return writeRecord(R, Success);
+}
+
+Error TypeStreamMerger::visitKnownRecord(CVType &,
+ MethodOverloadListRecord &R) {
+ bool Success = true;
+ for (OneMethodRecord &Meth : R.Methods)
+ Success &= remapIndex(Meth.Type);
+ return writeRecord(R, Success);
+}
+
+Error TypeStreamMerger::visitKnownRecord(CVType &, FieldListRecord &R) {
+ // Visit the members inside the field list.
+ HadUntranslatedMember = false;
+ FieldListBuilder.begin();
+ CVTypeVisitor Visitor(*this);
+ if (auto EC = Visitor.visitFieldListMemberStream(R.Data))
+ return EC;
+
+ // Write the record if we translated all field list members.
+ TypeIndex DestIdx = Untranslated;
+ if (!HadUntranslatedMember)
+ DestIdx = FieldListBuilder.end();
+ else
+ FieldListBuilder.reset();
+ addMapping(DestIdx);
+
+ return Error::success();
+}
+
+//----------------------------------------------------------------------------//
+// Member records
+//----------------------------------------------------------------------------//
+
+Error TypeStreamMerger::visitKnownMember(CVMemberRecord &,
+ NestedTypeRecord &R) {
+ return writeMember(R, remapIndex(R.Type));
+}
+
+Error TypeStreamMerger::visitKnownMember(CVMemberRecord &, OneMethodRecord &R) {
+ bool Success = true;
+ Success &= remapIndex(R.Type);
+ return writeMember(R, Success);
+}
+
+Error TypeStreamMerger::visitKnownMember(CVMemberRecord &,
+ OverloadedMethodRecord &R) {
+ return writeMember(R, remapIndex(R.MethodList));
+}
+
+Error TypeStreamMerger::visitKnownMember(CVMemberRecord &,
+ DataMemberRecord &R) {
+ return writeMember(R, remapIndex(R.Type));
+}
+
+Error TypeStreamMerger::visitKnownMember(CVMemberRecord &,
+ StaticDataMemberRecord &R) {
+ return writeMember(R, remapIndex(R.Type));
+}
+
+Error TypeStreamMerger::visitKnownMember(CVMemberRecord &,
+ EnumeratorRecord &R) {
+ return writeMember(R, true);
+}
+
+Error TypeStreamMerger::visitKnownMember(CVMemberRecord &, VFPtrRecord &R) {
+ return writeMember(R, remapIndex(R.Type));
+}
+
+Error TypeStreamMerger::visitKnownMember(CVMemberRecord &, BaseClassRecord &R) {
+ return writeMember(R, remapIndex(R.Type));
+}
+
+Error TypeStreamMerger::visitKnownMember(CVMemberRecord &,
+ VirtualBaseClassRecord &R) {
+ bool Success = true;
+ Success &= remapIndex(R.BaseType);
+ Success &= remapIndex(R.VBPtrType);
+ return writeMember(R, Success);
+}
+
+Error TypeStreamMerger::visitKnownMember(CVMemberRecord &,
+ ListContinuationRecord &R) {
+ return writeMember(R, remapIndex(R.ContinuationIndex));
+}
Error TypeStreamMerger::visitUnknownType(CVType &Rec) {
// We failed to translate a type. Translate this index as "not translated".
- IndexMap.push_back(
- TypeIndex(SimpleTypeKind::NotTranslated, SimpleTypeMode::Direct));
- return llvm::make_error<CodeViewError>(cv_error_code::corrupt_record);
+ addMapping(TypeIndex(SimpleTypeKind::NotTranslated));
+ return errorCorruptRecord();
}
-bool TypeStreamMerger::mergeStream(const CVTypeArray &Types) {
+Error TypeStreamMerger::mergeStream(const CVTypeArray &Types) {
assert(IndexMap.empty());
TypeVisitorCallbackPipeline Pipeline;
+ LastError = Error::success();
TypeDeserializer Deserializer;
Pipeline.addCallbackToPipeline(Deserializer);
Pipeline.addCallbackToPipeline(*this);
CVTypeVisitor Visitor(Pipeline);
+ if (Handler)
+ Visitor.addTypeServerHandler(*Handler);
+
+ if (auto EC = Visitor.visitTypeStream(Types))
+ return EC;
+
+ // If we found bad indices but no other errors, try doing another pass and see
+ // if we can resolve the indices that weren't in the map on the first pass.
+ // This may require multiple passes, but we should always make progress. MASM
+ // is the only known CodeView producer that makes type streams that aren't
+ // topologically sorted. The standard library contains MASM-produced objects,
+ // so this is important to handle correctly, but we don't have to be too
+ // efficient. MASM type streams are usually very small.
+ while (!*LastError && NumBadIndices > 0) {
+ unsigned BadIndicesRemaining = NumBadIndices;
+ IsSecondPass = true;
+ NumBadIndices = 0;
+ CurIndex = TypeIndex(TypeIndex::FirstNonSimpleIndex);
+ if (auto EC = Visitor.visitTypeStream(Types))
+ return EC;
- if (auto EC = Visitor.visitTypeStream(Types)) {
- consumeError(std::move(EC));
- return false;
+ assert(NumBadIndices <= BadIndicesRemaining &&
+ "second pass found more bad indices");
+ if (!*LastError && NumBadIndices == BadIndicesRemaining) {
+ return llvm::make_error<CodeViewError>(
+ cv_error_code::corrupt_record, "input type graph contains cycles");
+ }
}
+
IndexMap.clear();
- return !hadError();
+
+ Error Ret = std::move(*LastError);
+ LastError.reset();
+ return Ret;
}
-bool llvm::codeview::mergeTypeStreams(TypeTableBuilder &DestStream,
- const CVTypeArray &Types) {
- return TypeStreamMerger(DestStream).mergeStream(Types);
+Error llvm::codeview::mergeTypeStreams(TypeTableBuilder &DestIdStream,
+ TypeTableBuilder &DestTypeStream,
+ TypeServerHandler *Handler,
+ const CVTypeArray &Types) {
+ return TypeStreamMerger(DestIdStream, DestTypeStream, Handler)
+ .mergeStream(Types);
}
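The retry loop in mergeStream above is easiest to see with a tiny forward reference. Suppose, hypothetically, that the source stream holds two records and 0x1000 refers to 0x1001:

    // Pass 1: visiting 0x1000, remapIndex(0x1001) finds no map entry yet, so the
    //         record is written as Untranslated and NumBadIndices = 1. Visiting
    //         0x1001 succeeds and fills its IndexMap slot.
    // Pass 2: 0x1000 is revisited; 0x1001 now resolves, NumBadIndices drops to 0,
    //         and the loop exits. If a pass makes no progress (NumBadIndices stays
    //         equal to BadIndicesRemaining), the graph has a cycle and the
    //         "input type graph contains cycles" error above fires.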
diff --git a/lib/DebugInfo/DWARF/DWARFAbbreviationDeclaration.cpp b/lib/DebugInfo/DWARF/DWARFAbbreviationDeclaration.cpp
index 08bc74a81e9a..e7b4b777b43f 100644
--- a/lib/DebugInfo/DWARF/DWARFAbbreviationDeclaration.cpp
+++ b/lib/DebugInfo/DWARF/DWARFAbbreviationDeclaration.cpp
@@ -1,4 +1,4 @@
-//===-- DWARFAbbreviationDeclaration.cpp ----------------------------------===//
+//===- DWARFAbbreviationDeclaration.cpp -----------------------------------===//
//
// The LLVM Compiler Infrastructure
//
@@ -7,12 +7,18 @@
//
//===----------------------------------------------------------------------===//
+#include "llvm/ADT/None.h"
+#include "llvm/ADT/Optional.h"
#include "llvm/DebugInfo/DWARF/DWARFAbbreviationDeclaration.h"
#include "llvm/DebugInfo/DWARF/DWARFFormValue.h"
#include "llvm/DebugInfo/DWARF/DWARFUnit.h"
+#include "llvm/Support/DataExtractor.h"
#include "llvm/Support/Dwarf.h"
#include "llvm/Support/Format.h"
#include "llvm/Support/raw_ostream.h"
+#include <cstddef>
+#include <cstdint>
+
using namespace llvm;
using namespace dwarf;
@@ -86,7 +92,6 @@ DWARFAbbreviationDeclaration::extract(DataExtractor Data,
case DW_FORM_line_strp:
case DW_FORM_sec_offset:
case DW_FORM_strp_sup:
- case DW_FORM_ref_sup:
++FixedAttributeSize->NumDwarfOffsets;
break;
diff --git a/lib/DebugInfo/DWARF/DWARFAcceleratorTable.cpp b/lib/DebugInfo/DWARF/DWARFAcceleratorTable.cpp
index 7111ad3f9fc7..85e1eaedfc61 100644
--- a/lib/DebugInfo/DWARF/DWARFAcceleratorTable.cpp
+++ b/lib/DebugInfo/DWARF/DWARFAcceleratorTable.cpp
@@ -1,4 +1,4 @@
-//===--- DWARFAcceleratorTable.cpp ----------------------------------------===//
+//===- DWARFAcceleratorTable.cpp ------------------------------------------===//
//
// The LLVM Compiler Infrastructure
//
@@ -7,12 +7,19 @@
//
//===----------------------------------------------------------------------===//
+#include "llvm/ADT/SmallVector.h"
#include "llvm/DebugInfo/DWARF/DWARFAcceleratorTable.h"
+#include "llvm/DebugInfo/DWARF/DWARFFormValue.h"
+#include "llvm/DebugInfo/DWARF/DWARFRelocMap.h"
#include "llvm/Support/Dwarf.h"
+#include "llvm/Support/Compiler.h"
#include "llvm/Support/Format.h"
#include "llvm/Support/raw_ostream.h"
+#include <cstddef>
+#include <cstdint>
+#include <utility>
-namespace llvm {
+using namespace llvm;
bool DWARFAcceleratorTable::extract() {
uint32_t Offset = 0;
@@ -46,7 +53,7 @@ bool DWARFAcceleratorTable::extract() {
return true;
}
-void DWARFAcceleratorTable::dump(raw_ostream &OS) const {
+LLVM_DUMP_METHOD void DWARFAcceleratorTable::dump(raw_ostream &OS) const {
// Dump the header.
OS << "Magic = " << format("0x%08x", Hdr.Magic) << '\n'
<< "Version = " << format("0x%04x", Hdr.Version) << '\n'
@@ -131,4 +138,3 @@ void DWARFAcceleratorTable::dump(raw_ostream &OS) const {
}
}
}
-}
diff --git a/lib/DebugInfo/DWARF/DWARFCompileUnit.cpp b/lib/DebugInfo/DWARF/DWARFCompileUnit.cpp
index 948972f8f136..6e550f2e9ec9 100644
--- a/lib/DebugInfo/DWARF/DWARFCompileUnit.cpp
+++ b/lib/DebugInfo/DWARF/DWARFCompileUnit.cpp
@@ -18,8 +18,10 @@ using namespace llvm;
void DWARFCompileUnit::dump(raw_ostream &OS) {
OS << format("0x%08x", getOffset()) << ": Compile Unit:"
<< " length = " << format("0x%08x", getLength())
- << " version = " << format("0x%04x", getVersion())
- << " abbr_offset = " << format("0x%04x", getAbbreviations()->getOffset())
+ << " version = " << format("0x%04x", getVersion());
+ if (getVersion() >= 5)
+ OS << " unit_type = " << dwarf::UnitTypeString(getUnitType());
+ OS << " abbr_offset = " << format("0x%04x", getAbbreviations()->getOffset())
<< " addr_size = " << format("0x%02x", getAddressByteSize())
<< " (next unit at " << format("0x%08x", getNextUnitOffset())
<< ")\n";
diff --git a/lib/DebugInfo/DWARF/DWARFContext.cpp b/lib/DebugInfo/DWARF/DWARFContext.cpp
index 77f6f65ee131..cbce2dc89deb 100644
--- a/lib/DebugInfo/DWARF/DWARFContext.cpp
+++ b/lib/DebugInfo/DWARF/DWARFContext.cpp
@@ -1,4 +1,4 @@
-//===-- DWARFContext.cpp --------------------------------------------------===//
+//===- DWARFContext.cpp ---------------------------------------------------===//
//
// The LLVM Compiler Infrastructure
//
@@ -7,23 +7,45 @@
//
//===----------------------------------------------------------------------===//
-#include "llvm/DebugInfo/DWARF/DWARFContext.h"
#include "llvm/ADT/SmallString.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/ADT/STLExtras.h"
#include "llvm/ADT/StringSwitch.h"
+#include "llvm/ADT/StringRef.h"
#include "llvm/DebugInfo/DWARF/DWARFAcceleratorTable.h"
+#include "llvm/DebugInfo/DWARF/DWARFCompileUnit.h"
+#include "llvm/DebugInfo/DWARF/DWARFContext.h"
+#include "llvm/DebugInfo/DWARF/DWARFDebugAbbrev.h"
+#include "llvm/DebugInfo/DWARF/DWARFDebugAranges.h"
#include "llvm/DebugInfo/DWARF/DWARFDebugArangeSet.h"
+#include "llvm/DebugInfo/DWARF/DWARFDebugFrame.h"
+#include "llvm/DebugInfo/DWARF/DWARFDebugLine.h"
+#include "llvm/DebugInfo/DWARF/DWARFDebugLoc.h"
+#include "llvm/DebugInfo/DWARF/DWARFDebugMacro.h"
#include "llvm/DebugInfo/DWARF/DWARFDebugPubTable.h"
+#include "llvm/DebugInfo/DWARF/DWARFDebugRangeList.h"
+#include "llvm/DebugInfo/DWARF/DWARFDie.h"
+#include "llvm/DebugInfo/DWARF/DWARFFormValue.h"
+#include "llvm/DebugInfo/DWARF/DWARFGdbIndex.h"
+#include "llvm/DebugInfo/DWARF/DWARFSection.h"
#include "llvm/DebugInfo/DWARF/DWARFUnitIndex.h"
#include "llvm/Object/Decompressor.h"
#include "llvm/Object/MachO.h"
+#include "llvm/Object/ObjectFile.h"
#include "llvm/Object/RelocVisitor.h"
-#include "llvm/Support/Compression.h"
-#include "llvm/Support/Dwarf.h"
-#include "llvm/Support/ELF.h"
+#include "llvm/Support/Casting.h"
+#include "llvm/Support/DataExtractor.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/Error.h"
#include "llvm/Support/Format.h"
-#include "llvm/Support/Path.h"
+#include "llvm/Support/MemoryBuffer.h"
#include "llvm/Support/raw_ostream.h"
#include <algorithm>
+#include <cstdint>
+#include <string>
+#include <utility>
+#include <vector>
+
using namespace llvm;
using namespace dwarf;
using namespace object;
@@ -128,8 +150,7 @@ void DWARFContext::dump(raw_ostream &OS, DIDumpType DumpType, bool DumpEH,
auto CUDIE = CU->getUnitDIE();
if (!CUDIE)
continue;
- if (auto StmtOffset =
- CUDIE.getAttributeValueAsSectionOffset(DW_AT_stmt_list)) {
+ if (auto StmtOffset = toSectionOffset(CUDIE.find(DW_AT_stmt_list))) {
DataExtractor lineData(getLineSection().Data, isLittleEndian(),
savedAddressByteSize);
DWARFDebugLine::LineTable LineTable;
@@ -387,7 +408,7 @@ DWARFContext::getLineTableForUnit(DWARFUnit *U) {
if (!UnitDIE)
return nullptr;
- auto Offset = UnitDIE.getAttributeValueAsSectionOffset(DW_AT_stmt_list);
+ auto Offset = toSectionOffset(UnitDIE.find(DW_AT_stmt_list));
if (!Offset)
return nullptr; // No line table for this compile unit.
@@ -440,23 +461,32 @@ DWARFCompileUnit *DWARFContext::getCompileUnitForAddress(uint64_t Address) {
return getCompileUnitForOffset(CUOffset);
}
-static bool getFunctionNameForAddress(DWARFCompileUnit *CU, uint64_t Address,
- FunctionNameKind Kind,
- std::string &FunctionName) {
- if (Kind == FunctionNameKind::None)
- return false;
+static bool getFunctionNameAndStartLineForAddress(DWARFCompileUnit *CU,
+ uint64_t Address,
+ FunctionNameKind Kind,
+ std::string &FunctionName,
+ uint32_t &StartLine) {
// The address may correspond to instruction in some inlined function,
// so we have to build the chain of inlined functions and take the
- // name of the topmost function in it.SmallVectorImpl<DWARFDie> &InlinedChain
+ // name of the topmost function in it.
SmallVector<DWARFDie, 4> InlinedChain;
CU->getInlinedChainForAddress(Address, InlinedChain);
- if (InlinedChain.size() == 0)
+ if (InlinedChain.empty())
return false;
- if (const char *Name = InlinedChain[0].getSubroutineName(Kind)) {
+
+ const DWARFDie &DIE = InlinedChain[0];
+ bool FoundResult = false;
+ const char *Name = nullptr;
+ if (Kind != FunctionNameKind::None && (Name = DIE.getSubroutineName(Kind))) {
FunctionName = Name;
- return true;
+ FoundResult = true;
+ }
+ if (auto DeclLineResult = DIE.getDeclLine()) {
+ StartLine = DeclLineResult;
+ FoundResult = true;
}
- return false;
+
+ return FoundResult;
}
DILineInfo DWARFContext::getLineInfoForAddress(uint64_t Address,
@@ -466,7 +496,9 @@ DILineInfo DWARFContext::getLineInfoForAddress(uint64_t Address,
DWARFCompileUnit *CU = getCompileUnitForAddress(Address);
if (!CU)
return Result;
- getFunctionNameForAddress(CU, Address, Spec.FNKind, Result.FunctionName);
+ getFunctionNameAndStartLineForAddress(CU, Address, Spec.FNKind,
+ Result.FunctionName,
+ Result.StartLine);
if (Spec.FLIKind != FileLineInfoKind::None) {
if (const DWARFLineTable *LineTable = getLineTableForUnit(CU))
LineTable->getFileLineInfoForAddress(Address, CU->getCompilationDir(),
@@ -484,13 +516,16 @@ DWARFContext::getLineInfoForAddressRange(uint64_t Address, uint64_t Size,
return Lines;
std::string FunctionName = "<invalid>";
- getFunctionNameForAddress(CU, Address, Spec.FNKind, FunctionName);
+ uint32_t StartLine = 0;
+ getFunctionNameAndStartLineForAddress(CU, Address, Spec.FNKind, FunctionName,
+ StartLine);
// If the Specifier says we don't need FileLineInfo, just
// return the top-most function at the starting address.
if (Spec.FLIKind == FileLineInfoKind::None) {
DILineInfo Result;
Result.FunctionName = FunctionName;
+ Result.StartLine = StartLine;
Lines.push_back(std::make_pair(Address, Result));
return Lines;
}
@@ -511,6 +546,7 @@ DWARFContext::getLineInfoForAddressRange(uint64_t Address, uint64_t Size,
Result.FunctionName = FunctionName;
Result.Line = Row.Line;
Result.Column = Row.Column;
+ Result.StartLine = StartLine;
Lines.push_back(std::make_pair(Row.Address, Result));
}
@@ -550,6 +586,8 @@ DWARFContext::getInliningInfoForAddress(uint64_t Address,
// Get function name if necessary.
if (const char *Name = FunctionDIE.getSubroutineName(Spec.FNKind))
Frame.FunctionName = Name;
+ if (auto DeclLineResult = FunctionDIE.getDeclLine())
+ Frame.StartLine = DeclLineResult;
if (Spec.FLIKind != FileLineInfoKind::None) {
if (i == 0) {
// For the topmost frame, initialize the line table of this
@@ -578,6 +616,66 @@ DWARFContext::getInliningInfoForAddress(uint64_t Address,
return InliningInfo;
}
+static Error createError(const Twine &Reason, llvm::Error E) {
+ return make_error<StringError>(Reason + toString(std::move(E)),
+ inconvertibleErrorCode());
+}
+
+/// Returns the address of the symbol that the relocation is applied against.
+/// Used for further relocation computation. The symbol's section load address
+/// is taken into account if a LoadedObjectInfo interface is provided.
+static Expected<uint64_t> getSymbolAddress(const object::ObjectFile &Obj,
+ const RelocationRef &Reloc,
+ const LoadedObjectInfo *L) {
+ uint64_t Ret = 0;
+ object::section_iterator RSec = Obj.section_end();
+ object::symbol_iterator Sym = Reloc.getSymbol();
+
+ // First calculate the address of the symbol or section as it appears
+ // in the object file
+ if (Sym != Obj.symbol_end()) {
+ Expected<uint64_t> SymAddrOrErr = Sym->getAddress();
+ if (!SymAddrOrErr)
+ return createError("error: failed to compute symbol address: ",
+ SymAddrOrErr.takeError());
+
+ // Also remember what section this symbol is in for later
+ auto SectOrErr = Sym->getSection();
+ if (!SectOrErr)
+ return createError("error: failed to get symbol section: ",
+ SectOrErr.takeError());
+
+ RSec = *SectOrErr;
+ Ret = *SymAddrOrErr;
+ } else if (auto *MObj = dyn_cast<MachOObjectFile>(&Obj)) {
+ RSec = MObj->getRelocationSection(Reloc.getRawDataRefImpl());
+ Ret = RSec->getAddress();
+ }
+
+ // If we are given load addresses for the sections, we need to adjust:
+ // SymAddr = (Address of Symbol Or Section in File) -
+ // (Address of Section in File) +
+ // (Load Address of Section)
+ // RSec is now either the section being targeted or the section
+ // containing the symbol being targeted. In either case,
+ // we need to perform the same computation.
+ if (L && RSec != Obj.section_end())
+ if (uint64_t SectionLoadAddress = L->getSectionLoadAddress(*RSec))
+ Ret += SectionLoadAddress - RSec->getAddress();
+ return Ret;
+}
+
+static bool isRelocScattered(const object::ObjectFile &Obj,
+ const RelocationRef &Reloc) {
+ const MachOObjectFile *MachObj = dyn_cast<MachOObjectFile>(&Obj);
+ if (!MachObj)
+ return false;
+ // MachO also has relocations that point to sections and
+ // scattered relocations.
+ auto RelocInfo = MachObj->getRelocation(Reloc.getRawDataRefImpl());
+ return MachObj->isRelocationScattered(RelocInfo);
+}
+
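
A worked instance of the adjustment above (illustrative numbers only, not
from any real object file):

    //   symbol address in file   = 0x1040
    //   section address in file  = 0x1000
    //   section load address     = 0x7f001000
    //   adjusted SymAddr         = 0x1040 - 0x1000 + 0x7f001000 = 0x7f001040
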
DWARFContextInMemory::DWARFContextInMemory(const object::ObjectFile &Obj,
const LoadedObjectInfo *L)
: IsLittleEndian(Obj.isLittleEndian()),
@@ -618,40 +716,7 @@ DWARFContextInMemory::DWARFContextInMemory(const object::ObjectFile &Obj,
name = name.substr(
name.find_first_not_of("._z")); // Skip ".", "z" and "_" prefixes.
- StringRef *SectionData =
- StringSwitch<StringRef *>(name)
- .Case("debug_info", &InfoSection.Data)
- .Case("debug_abbrev", &AbbrevSection)
- .Case("debug_loc", &LocSection.Data)
- .Case("debug_line", &LineSection.Data)
- .Case("debug_aranges", &ARangeSection)
- .Case("debug_frame", &DebugFrameSection)
- .Case("eh_frame", &EHFrameSection)
- .Case("debug_str", &StringSection)
- .Case("debug_ranges", &RangeSection)
- .Case("debug_macinfo", &MacinfoSection)
- .Case("debug_pubnames", &PubNamesSection)
- .Case("debug_pubtypes", &PubTypesSection)
- .Case("debug_gnu_pubnames", &GnuPubNamesSection)
- .Case("debug_gnu_pubtypes", &GnuPubTypesSection)
- .Case("debug_info.dwo", &InfoDWOSection.Data)
- .Case("debug_abbrev.dwo", &AbbrevDWOSection)
- .Case("debug_loc.dwo", &LocDWOSection.Data)
- .Case("debug_line.dwo", &LineDWOSection.Data)
- .Case("debug_str.dwo", &StringDWOSection)
- .Case("debug_str_offsets.dwo", &StringOffsetDWOSection)
- .Case("debug_addr", &AddrSection)
- .Case("apple_names", &AppleNamesSection.Data)
- .Case("apple_types", &AppleTypesSection.Data)
- .Case("apple_namespaces", &AppleNamespacesSection.Data)
- .Case("apple_namespac", &AppleNamespacesSection.Data)
- .Case("apple_objc", &AppleObjCSection.Data)
- .Case("debug_cu_index", &CUIndexSection)
- .Case("debug_tu_index", &TUIndexSection)
- .Case("gdb_index", &GdbIndexSection)
- // Any more debug info sections go here.
- .Default(nullptr);
- if (SectionData) {
+ if (StringRef *SectionData = MapSectionToMember(name)) {
*SectionData = data;
if (name == "debug_ranges") {
// FIXME: Use the other dwo range section when we emit it.
@@ -716,73 +781,19 @@ DWARFContextInMemory::DWARFContextInMemory(const object::ObjectFile &Obj,
if (Section.relocation_begin() != Section.relocation_end()) {
uint64_t SectionSize = RelocatedSection->getSize();
for (const RelocationRef &Reloc : Section.relocations()) {
- uint64_t Address = Reloc.getOffset();
- uint64_t Type = Reloc.getType();
- uint64_t SymAddr = 0;
- uint64_t SectionLoadAddress = 0;
- object::symbol_iterator Sym = Reloc.getSymbol();
- object::section_iterator RSec = Obj.section_end();
-
- // First calculate the address of the symbol or section as it appears
- // in the objct file
- if (Sym != Obj.symbol_end()) {
- Expected<uint64_t> SymAddrOrErr = Sym->getAddress();
- if (!SymAddrOrErr) {
- std::string Buf;
- raw_string_ostream OS(Buf);
- logAllUnhandledErrors(SymAddrOrErr.takeError(), OS, "");
- OS.flush();
- errs() << "error: failed to compute symbol address: "
- << Buf << '\n';
- continue;
- }
- SymAddr = *SymAddrOrErr;
- // Also remember what section this symbol is in for later
- auto SectOrErr = Sym->getSection();
- if (!SectOrErr) {
- std::string Buf;
- raw_string_ostream OS(Buf);
- logAllUnhandledErrors(SectOrErr.takeError(), OS, "");
- OS.flush();
- errs() << "error: failed to get symbol section: "
- << Buf << '\n';
- continue;
- }
- RSec = *SectOrErr;
- } else if (auto *MObj = dyn_cast<MachOObjectFile>(&Obj)) {
- // MachO also has relocations that point to sections and
- // scattered relocations.
- auto RelocInfo = MObj->getRelocation(Reloc.getRawDataRefImpl());
- if (MObj->isRelocationScattered(RelocInfo)) {
- // FIXME: it's not clear how to correctly handle scattered
- // relocations.
- continue;
- } else {
- RSec = MObj->getRelocationSection(Reloc.getRawDataRefImpl());
- SymAddr = RSec->getAddress();
- }
- }
+ // FIXME: it's not clear how to correctly handle scattered
+ // relocations.
+ if (isRelocScattered(Obj, Reloc))
+ continue;
- // If we are given load addresses for the sections, we need to adjust:
- // SymAddr = (Address of Symbol Or Section in File) -
- // (Address of Section in File) +
- // (Load Address of Section)
- if (L != nullptr && RSec != Obj.section_end()) {
- // RSec is now either the section being targeted or the section
- // containing the symbol being targeted. In either case,
- // we need to perform the same computation.
- StringRef SecName;
- RSec->getName(SecName);
-// llvm::dbgs() << "Name: '" << SecName
-// << "', RSec: " << RSec->getRawDataRefImpl()
-// << ", Section: " << Section.getRawDataRefImpl() << "\n";
- SectionLoadAddress = L->getSectionLoadAddress(*RSec);
- if (SectionLoadAddress != 0)
- SymAddr += SectionLoadAddress - RSec->getAddress();
+ Expected<uint64_t> SymAddrOrErr = getSymbolAddress(Obj, Reloc, L);
+ if (!SymAddrOrErr) {
+ errs() << toString(SymAddrOrErr.takeError()) << '\n';
+ continue;
}
object::RelocVisitor V(Obj);
- object::RelocToApply R(V.visit(Type, Reloc, SymAddr));
+ object::RelocToApply R(V.visit(Reloc.getType(), Reloc, *SymAddrOrErr));
if (V.error()) {
SmallString<32> Name;
Reloc.getTypeName(Name);
@@ -790,7 +801,7 @@ DWARFContextInMemory::DWARFContextInMemory(const object::ObjectFile &Obj,
<< Name << "\n";
continue;
}
-
+ uint64_t Address = Reloc.getOffset();
if (Address + R.Width > SectionSize) {
errs() << "error: " << R.Width << "-byte relocation starting "
<< Address << " bytes into section " << name << " which is "
@@ -812,4 +823,49 @@ DWARFContextInMemory::DWARFContextInMemory(const object::ObjectFile &Obj,
}
}
-void DWARFContextInMemory::anchor() { }
+DWARFContextInMemory::DWARFContextInMemory(
+ const StringMap<std::unique_ptr<MemoryBuffer>> &Sections, uint8_t AddrSize,
+ bool isLittleEndian)
+ : IsLittleEndian(isLittleEndian), AddressSize(AddrSize) {
+ for (const auto &SecIt : Sections) {
+ if (StringRef *SectionData = MapSectionToMember(SecIt.first()))
+ *SectionData = SecIt.second->getBuffer();
+ }
+}
+
+StringRef *DWARFContextInMemory::MapSectionToMember(StringRef Name) {
+ return StringSwitch<StringRef *>(Name)
+ .Case("debug_info", &InfoSection.Data)
+ .Case("debug_abbrev", &AbbrevSection)
+ .Case("debug_loc", &LocSection.Data)
+ .Case("debug_line", &LineSection.Data)
+ .Case("debug_aranges", &ARangeSection)
+ .Case("debug_frame", &DebugFrameSection)
+ .Case("eh_frame", &EHFrameSection)
+ .Case("debug_str", &StringSection)
+ .Case("debug_ranges", &RangeSection)
+ .Case("debug_macinfo", &MacinfoSection)
+ .Case("debug_pubnames", &PubNamesSection)
+ .Case("debug_pubtypes", &PubTypesSection)
+ .Case("debug_gnu_pubnames", &GnuPubNamesSection)
+ .Case("debug_gnu_pubtypes", &GnuPubTypesSection)
+ .Case("debug_info.dwo", &InfoDWOSection.Data)
+ .Case("debug_abbrev.dwo", &AbbrevDWOSection)
+ .Case("debug_loc.dwo", &LocDWOSection.Data)
+ .Case("debug_line.dwo", &LineDWOSection.Data)
+ .Case("debug_str.dwo", &StringDWOSection)
+ .Case("debug_str_offsets.dwo", &StringOffsetDWOSection)
+ .Case("debug_addr", &AddrSection)
+ .Case("apple_names", &AppleNamesSection.Data)
+ .Case("apple_types", &AppleTypesSection.Data)
+ .Case("apple_namespaces", &AppleNamespacesSection.Data)
+ .Case("apple_namespac", &AppleNamespacesSection.Data)
+ .Case("apple_objc", &AppleObjCSection.Data)
+ .Case("debug_cu_index", &CUIndexSection)
+ .Case("debug_tu_index", &TUIndexSection)
+ .Case("gdb_index", &GdbIndexSection)
+ // Any more debug info sections go here.
+ .Default(nullptr);
+}
+
+void DWARFContextInMemory::anchor() {}
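
A minimal usage sketch of the new section-map constructor (hypothetical
setup; `InfoBytes` and `AbbrevBytes` are assumed StringRefs holding raw
section contents, the constructor and MapSectionToMember are the ones added
above):

    StringMap<std::unique_ptr<MemoryBuffer>> Sections;
    Sections["debug_info"] = MemoryBuffer::getMemBuffer(InfoBytes);
    Sections["debug_abbrev"] = MemoryBuffer::getMemBuffer(AbbrevBytes);
    // Build a DWARF context straight from named buffers, with no object
    // file involved; names are matched by MapSectionToMember.
    DWARFContextInMemory DICtx(Sections, /*AddrSize=*/8,
                               /*isLittleEndian=*/true);
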
diff --git a/lib/DebugInfo/DWARF/DWARFDebugAbbrev.cpp b/lib/DebugInfo/DWARF/DWARFDebugAbbrev.cpp
index e63e28997ed0..76dd2e4c21bc 100644
--- a/lib/DebugInfo/DWARF/DWARFDebugAbbrev.cpp
+++ b/lib/DebugInfo/DWARF/DWARFDebugAbbrev.cpp
@@ -1,4 +1,4 @@
-//===-- DWARFDebugAbbrev.cpp ----------------------------------------------===//
+//===- DWARFDebugAbbrev.cpp -----------------------------------------------===//
//
// The LLVM Compiler Infrastructure
//
@@ -10,6 +10,10 @@
#include "llvm/DebugInfo/DWARF/DWARFDebugAbbrev.h"
#include "llvm/Support/Format.h"
#include "llvm/Support/raw_ostream.h"
+#include <algorithm>
+#include <cinttypes>
+#include <cstdint>
+
using namespace llvm;
DWARFAbbreviationDeclarationSet::DWARFAbbreviationDeclarationSet() {
diff --git a/lib/DebugInfo/DWARF/DWARFDebugArangeSet.cpp b/lib/DebugInfo/DWARF/DWARFDebugArangeSet.cpp
index 67589cd01e55..ed5d726ae4e2 100644
--- a/lib/DebugInfo/DWARF/DWARFDebugArangeSet.cpp
+++ b/lib/DebugInfo/DWARF/DWARFDebugArangeSet.cpp
@@ -1,4 +1,4 @@
-//===-- DWARFDebugArangeSet.cpp -------------------------------------------===//
+//===- DWARFDebugArangeSet.cpp --------------------------------------------===//
//
// The LLVM Compiler Infrastructure
//
@@ -10,8 +10,11 @@
#include "llvm/DebugInfo/DWARF/DWARFDebugArangeSet.h"
#include "llvm/Support/Format.h"
#include "llvm/Support/raw_ostream.h"
-#include <algorithm>
#include <cassert>
+#include <cinttypes>
+#include <cstdint>
+#include <cstring>
+
using namespace llvm;
void DWARFDebugArangeSet::clear() {
diff --git a/lib/DebugInfo/DWARF/DWARFDebugAranges.cpp b/lib/DebugInfo/DWARF/DWARFDebugAranges.cpp
index 27a02c4c50d0..0cf71f530446 100644
--- a/lib/DebugInfo/DWARF/DWARFDebugAranges.cpp
+++ b/lib/DebugInfo/DWARF/DWARFDebugAranges.cpp
@@ -1,4 +1,4 @@
-//===-- DWARFDebugAranges.cpp -----------------------------------*- C++ -*-===//
+//===- DWARFDebugAranges.cpp ----------------------------------------------===//
//
// The LLVM Compiler Infrastructure
//
@@ -11,11 +11,13 @@
#include "llvm/DebugInfo/DWARF/DWARFCompileUnit.h"
#include "llvm/DebugInfo/DWARF/DWARFContext.h"
#include "llvm/DebugInfo/DWARF/DWARFDebugArangeSet.h"
-#include "llvm/Support/Format.h"
-#include "llvm/Support/raw_ostream.h"
+#include "llvm/Support/DataExtractor.h"
#include <algorithm>
#include <cassert>
+#include <cstdint>
#include <set>
+#include <vector>
+
using namespace llvm;
void DWARFDebugAranges::extract(DataExtractor DebugArangesData) {
@@ -81,7 +83,7 @@ void DWARFDebugAranges::construct() {
std::sort(Endpoints.begin(), Endpoints.end());
uint64_t PrevAddress = -1ULL;
for (const auto &E : Endpoints) {
- if (PrevAddress < E.Address && ValidCUs.size() > 0) {
+ if (PrevAddress < E.Address && !ValidCUs.empty()) {
// If the address range between two endpoints is described by some
// CU, first try to extend the last range in Aranges. If we can't
// do it, start a new range.
diff --git a/lib/DebugInfo/DWARF/DWARFDebugFrame.cpp b/lib/DebugInfo/DWARF/DWARFDebugFrame.cpp
index 32b8320e26c5..b55ed6a46849 100644
--- a/lib/DebugInfo/DWARF/DWARFDebugFrame.cpp
+++ b/lib/DebugInfo/DWARF/DWARFDebugFrame.cpp
@@ -1,4 +1,4 @@
-//===-- DWARFDebugFrame.h - Parsing of .debug_frame -------------*- C++ -*-===//
+//===- DWARFDebugFrame.h - Parsing of .debug_frame ------------------------===//
//
// The LLVM Compiler Infrastructure
//
@@ -7,7 +7,6 @@
//
//===----------------------------------------------------------------------===//
-#include "llvm/DebugInfo/DWARF/DWARFDebugFrame.h"
#include "llvm/ADT/ArrayRef.h"
#include "llvm/ADT/DenseMap.h"
#include "llvm/ADT/Optional.h"
@@ -15,6 +14,7 @@
#include "llvm/ADT/STLExtras.h"
#include "llvm/ADT/StringExtras.h"
#include "llvm/ADT/StringRef.h"
+#include "llvm/DebugInfo/DWARF/DWARFDebugFrame.h"
#include "llvm/Support/Casting.h"
#include "llvm/Support/Compiler.h"
#include "llvm/Support/DataExtractor.h"
@@ -465,8 +465,7 @@ void FrameEntry::dumpInstructions(raw_ostream &OS) const {
}
}
-DWARFDebugFrame::DWARFDebugFrame(bool IsEH) : IsEH(IsEH) {
-}
+DWARFDebugFrame::DWARFDebugFrame(bool IsEH) : IsEH(IsEH) {}
DWARFDebugFrame::~DWARFDebugFrame() = default;
@@ -485,17 +484,17 @@ static unsigned getSizeForEncoding(const DataExtractor &Data,
unsigned format = symbolEncoding & 0x0f;
switch (format) {
default: llvm_unreachable("Unknown Encoding");
- case dwarf::DW_EH_PE_absptr:
- case dwarf::DW_EH_PE_signed:
+ case DW_EH_PE_absptr:
+ case DW_EH_PE_signed:
return Data.getAddressSize();
- case dwarf::DW_EH_PE_udata2:
- case dwarf::DW_EH_PE_sdata2:
+ case DW_EH_PE_udata2:
+ case DW_EH_PE_sdata2:
return 2;
- case dwarf::DW_EH_PE_udata4:
- case dwarf::DW_EH_PE_sdata4:
+ case DW_EH_PE_udata4:
+ case DW_EH_PE_sdata4:
return 4;
- case dwarf::DW_EH_PE_udata8:
- case dwarf::DW_EH_PE_sdata8:
+ case DW_EH_PE_udata8:
+ case DW_EH_PE_sdata8:
return 8;
}
}
diff --git a/lib/DebugInfo/DWARF/DWARFDebugInfoEntry.cpp b/lib/DebugInfo/DWARF/DWARFDebugInfoEntry.cpp
index c487e1dca7c6..35f673c7acc6 100644
--- a/lib/DebugInfo/DWARF/DWARFDebugInfoEntry.cpp
+++ b/lib/DebugInfo/DWARF/DWARFDebugInfoEntry.cpp
@@ -1,4 +1,4 @@
-//===-- DWARFDebugInfoEntry.cpp -------------------------------------------===//
+//===- DWARFDebugInfoEntry.cpp --------------------------------------------===//
//
// The LLVM Compiler Infrastructure
//
@@ -7,20 +7,17 @@
//
//===----------------------------------------------------------------------===//
-#include "SyntaxHighlighting.h"
-#include "llvm/DebugInfo/DWARF/DWARFCompileUnit.h"
-#include "llvm/DebugInfo/DWARF/DWARFContext.h"
+#include "llvm/ADT/Optional.h"
#include "llvm/DebugInfo/DWARF/DWARFDebugAbbrev.h"
#include "llvm/DebugInfo/DWARF/DWARFDebugInfoEntry.h"
#include "llvm/DebugInfo/DWARF/DWARFFormValue.h"
-#include "llvm/Support/DataTypes.h"
-#include "llvm/Support/Debug.h"
-#include "llvm/Support/Dwarf.h"
-#include "llvm/Support/Format.h"
-#include "llvm/Support/raw_ostream.h"
+#include "llvm/DebugInfo/DWARF/DWARFUnit.h"
+#include "llvm/Support/DataExtractor.h"
+#include <cstddef>
+#include <cstdint>
+
using namespace llvm;
using namespace dwarf;
-using namespace syntax;
bool DWARFDebugInfoEntry::extractFast(const DWARFUnit &U,
uint32_t *OffsetPtr) {
@@ -28,6 +25,7 @@ bool DWARFDebugInfoEntry::extractFast(const DWARFUnit &U,
const uint32_t UEndOffset = U.getNextUnitOffset();
return extractFast(U, OffsetPtr, DebugInfoData, UEndOffset, 0);
}
+
bool DWARFDebugInfoEntry::extractFast(const DWARFUnit &U, uint32_t *OffsetPtr,
const DataExtractor &DebugInfoData,
uint32_t UEndOffset, uint32_t D) {
diff --git a/lib/DebugInfo/DWARF/DWARFDebugLine.cpp b/lib/DebugInfo/DWARF/DWARFDebugLine.cpp
index 494059461fd7..e4670519b797 100644
--- a/lib/DebugInfo/DWARF/DWARFDebugLine.cpp
+++ b/lib/DebugInfo/DWARF/DWARFDebugLine.cpp
@@ -1,4 +1,4 @@
-//===-- DWARFDebugLine.cpp ------------------------------------------------===//
+//===- DWARFDebugLine.cpp -------------------------------------------------===//
//
// The LLVM Compiler Infrastructure
//
@@ -7,14 +7,23 @@
//
//===----------------------------------------------------------------------===//
+#include "llvm/ADT/SmallString.h"
#include "llvm/DebugInfo/DWARF/DWARFDebugLine.h"
+#include "llvm/DebugInfo/DWARF/DWARFRelocMap.h"
#include "llvm/Support/Dwarf.h"
#include "llvm/Support/Format.h"
#include "llvm/Support/Path.h"
#include "llvm/Support/raw_ostream.h"
#include <algorithm>
+#include <cassert>
+#include <cinttypes>
+#include <cstdint>
+#include <cstdio>
+#include <utility>
+
using namespace llvm;
using namespace dwarf;
+
typedef DILineInfoSpecifier::FileLineInfoKind FileLineInfoKind;
DWARFDebugLine::Prologue::Prologue() { clear(); }
diff --git a/lib/DebugInfo/DWARF/DWARFDebugLoc.cpp b/lib/DebugInfo/DWARF/DWARFDebugLoc.cpp
index ae5b9d70a2eb..e2799ab2d243 100644
--- a/lib/DebugInfo/DWARF/DWARFDebugLoc.cpp
+++ b/lib/DebugInfo/DWARF/DWARFDebugLoc.cpp
@@ -1,4 +1,4 @@
-//===-- DWARFDebugLoc.cpp -------------------------------------------------===//
+//===- DWARFDebugLoc.cpp --------------------------------------------------===//
//
// The LLVM Compiler Infrastructure
//
@@ -7,10 +7,15 @@
//
//===----------------------------------------------------------------------===//
+#include "llvm/ADT/StringRef.h"
#include "llvm/DebugInfo/DWARF/DWARFDebugLoc.h"
+#include "llvm/DebugInfo/DWARF/DWARFRelocMap.h"
#include "llvm/Support/Dwarf.h"
#include "llvm/Support/Format.h"
#include "llvm/Support/raw_ostream.h"
+#include <algorithm>
+#include <cinttypes>
+#include <cstdint>
using namespace llvm;
@@ -71,7 +76,7 @@ void DWARFDebugLoc::parse(DataExtractor data, unsigned AddressSize) {
}
}
if (data.isValidOffset(Offset))
- llvm::errs() << "error: failed to consume entire .debug_loc section\n";
+ errs() << "error: failed to consume entire .debug_loc section\n";
}
void DWARFDebugLocDWO::parse(DataExtractor data) {
@@ -85,8 +90,8 @@ void DWARFDebugLocDWO::parse(DataExtractor data) {
data.getU8(&Offset))) != dwarf::DW_LLE_end_of_list) {
if (Kind != dwarf::DW_LLE_startx_length) {
- llvm::errs() << "error: dumping support for LLE of kind " << (int)Kind
- << " not implemented\n";
+ errs() << "error: dumping support for LLE of kind " << (int)Kind
+ << " not implemented\n";
return;
}
@@ -123,4 +128,3 @@ void DWARFDebugLocDWO::dump(raw_ostream &OS) const {
}
}
}
-
diff --git a/lib/DebugInfo/DWARF/DWARFDebugMacro.cpp b/lib/DebugInfo/DWARF/DWARFDebugMacro.cpp
index 7710a90b5e13..e0a9adde8e58 100644
--- a/lib/DebugInfo/DWARF/DWARFDebugMacro.cpp
+++ b/lib/DebugInfo/DWARF/DWARFDebugMacro.cpp
@@ -1,4 +1,4 @@
-//===-- DWARFDebugMacro.cpp -----------------------------------------------===//
+//===- DWARFDebugMacro.cpp ------------------------------------------------===//
//
// The LLVM Compiler Infrastructure
//
@@ -7,11 +7,11 @@
//
//===----------------------------------------------------------------------===//
-#include "llvm/DebugInfo/DWARF/DWARFDebugMacro.h"
#include "SyntaxHighlighting.h"
+#include "llvm/DebugInfo/DWARF/DWARFDebugMacro.h"
#include "llvm/Support/Dwarf.h"
-#include "llvm/Support/Format.h"
#include "llvm/Support/raw_ostream.h"
+#include <cstdint>
using namespace llvm;
using namespace dwarf;
diff --git a/lib/DebugInfo/DWARF/DWARFDebugPubTable.cpp b/lib/DebugInfo/DWARF/DWARFDebugPubTable.cpp
index 3c1fe93090c6..662e53d9d7e6 100644
--- a/lib/DebugInfo/DWARF/DWARFDebugPubTable.cpp
+++ b/lib/DebugInfo/DWARF/DWARFDebugPubTable.cpp
@@ -1,4 +1,4 @@
-//===-- DWARFDebugPubTable.cpp ---------------------------------------------===//
+//===- DWARFDebugPubTable.cpp ---------------------------------------------===//
//
// The LLVM Compiler Infrastructure
//
@@ -7,12 +7,16 @@
//
//===----------------------------------------------------------------------===//
+#include "llvm/ADT/StringRef.h"
#include "llvm/DebugInfo/DWARF/DWARFDebugPubTable.h"
+#include "llvm/Support/DataExtractor.h"
+#include "llvm/Support/Dwarf.h"
#include "llvm/Support/Format.h"
#include "llvm/Support/raw_ostream.h"
+#include <cstdint>
using namespace llvm;
-using namespace llvm::dwarf;
+using namespace dwarf;
DWARFDebugPubTable::DWARFDebugPubTable(StringRef Data, bool LittleEndian,
bool GnuStyle)
@@ -54,7 +58,7 @@ void DWARFDebugPubTable::dump(StringRef Name, raw_ostream &OS) const {
OS << format("0x%8.8x ", E.SecOffset);
if (GnuStyle) {
StringRef EntryLinkage =
- dwarf::GDBIndexEntryLinkageString(E.Descriptor.Linkage);
+ GDBIndexEntryLinkageString(E.Descriptor.Linkage);
StringRef EntryKind = dwarf::GDBIndexEntryKindString(E.Descriptor.Kind);
OS << format("%-8s", EntryLinkage.data()) << ' '
<< format("%-8s", EntryKind.data()) << ' ';
diff --git a/lib/DebugInfo/DWARF/DWARFDebugRangeList.cpp b/lib/DebugInfo/DWARF/DWARFDebugRangeList.cpp
index d5df6885f5e9..f1d82fda8c06 100644
--- a/lib/DebugInfo/DWARF/DWARFDebugRangeList.cpp
+++ b/lib/DebugInfo/DWARF/DWARFDebugRangeList.cpp
@@ -1,4 +1,4 @@
-//===-- DWARFDebugRangesList.cpp ------------------------------------------===//
+//===- DWARFDebugRangesList.cpp -------------------------------------------===//
//
// The LLVM Compiler Infrastructure
//
@@ -10,6 +10,9 @@
#include "llvm/DebugInfo/DWARF/DWARFDebugRangeList.h"
#include "llvm/Support/Format.h"
#include "llvm/Support/raw_ostream.h"
+#include <cinttypes>
+#include <cstdint>
+#include <utility>
using namespace llvm;
diff --git a/lib/DebugInfo/DWARF/DWARFDie.cpp b/lib/DebugInfo/DWARF/DWARFDie.cpp
index 89b83b11ab68..4308cc2e2639 100644
--- a/lib/DebugInfo/DWARF/DWARFDie.cpp
+++ b/lib/DebugInfo/DWARF/DWARFDie.cpp
@@ -1,4 +1,4 @@
-//===-- DWARFDie.cpp ------------------------------------------------------===//
+//===- DWARFDie.cpp -------------------------------------------------------===//
//
// The LLVM Compiler Infrastructure
//
@@ -7,25 +7,33 @@
//
//===----------------------------------------------------------------------===//
-#include "llvm/DebugInfo/DWARF/DWARFDie.h"
#include "SyntaxHighlighting.h"
-#include "llvm/DebugInfo/DWARF/DWARFCompileUnit.h"
+#include "llvm/ADT/None.h"
+#include "llvm/ADT/Optional.h"
+#include "llvm/ADT/StringRef.h"
+#include "llvm/DebugInfo/DWARF/DWARFAbbreviationDeclaration.h"
#include "llvm/DebugInfo/DWARF/DWARFContext.h"
-#include "llvm/DebugInfo/DWARF/DWARFDebugAbbrev.h"
-#include "llvm/DebugInfo/DWARF/DWARFDebugInfoEntry.h"
+#include "llvm/DebugInfo/DWARF/DWARFDebugRangeList.h"
+#include "llvm/DebugInfo/DWARF/DWARFDie.h"
#include "llvm/DebugInfo/DWARF/DWARFFormValue.h"
-#include "llvm/Support/DataTypes.h"
-#include "llvm/Support/Debug.h"
+#include "llvm/DebugInfo/DWARF/DWARFUnit.h"
+#include "llvm/Support/DataExtractor.h"
#include "llvm/Support/Dwarf.h"
#include "llvm/Support/Format.h"
+#include "llvm/Support/MathExtras.h"
#include "llvm/Support/raw_ostream.h"
+#include <algorithm>
+#include <cassert>
+#include <cinttypes>
+#include <cstdint>
+#include <string>
+#include <utility>
using namespace llvm;
using namespace dwarf;
using namespace syntax;
-namespace {
- static void dumpApplePropertyAttribute(raw_ostream &OS, uint64_t Val) {
+static void dumpApplePropertyAttribute(raw_ostream &OS, uint64_t Val) {
OS << " (";
do {
uint64_t Shift = countTrailingZeros(Val);
@@ -122,8 +130,6 @@ static void dumpAttribute(raw_ostream &OS, const DWARFDie &Die,
OS << ")\n";
}
-} // end anonymous namespace
-
bool DWARFDie::isSubprogramDIE() const {
return getTag() == DW_TAG_subprogram;
}
@@ -134,7 +140,7 @@ bool DWARFDie::isSubroutineDIE() const {
}
Optional<DWARFFormValue>
-DWARFDie::getAttributeValue(dwarf::Attribute Attr) const {
+DWARFDie::find(dwarf::Attribute Attr) const {
if (!isValid())
return None;
auto AbbrevDecl = getAbbreviationDeclarationPtr();
@@ -143,54 +149,41 @@ DWARFDie::getAttributeValue(dwarf::Attribute Attr) const {
return None;
}
-const char *DWARFDie::getAttributeValueAsString(dwarf::Attribute Attr,
- const char *FailValue) const {
- auto FormValue = getAttributeValue(Attr);
- if (!FormValue)
- return FailValue;
- Optional<const char *> Result = FormValue->getAsCString();
- return Result.hasValue() ? Result.getValue() : FailValue;
-}
-
-Optional<uint64_t>
-DWARFDie::getAttributeValueAsAddress(dwarf::Attribute Attr) const {
- if (auto FormValue = getAttributeValue(Attr))
- return FormValue->getAsAddress();
- return None;
-}
-
-Optional<int64_t>
-DWARFDie::getAttributeValueAsSignedConstant(dwarf::Attribute Attr) const {
- if (auto FormValue = getAttributeValue(Attr))
- return FormValue->getAsSignedConstant();
- return None;
-}
-
-Optional<uint64_t>
-DWARFDie::getAttributeValueAsUnsignedConstant(dwarf::Attribute Attr) const {
- if (auto FormValue = getAttributeValue(Attr))
- return FormValue->getAsUnsignedConstant();
- return None;
-}
-
-Optional<uint64_t>
-DWARFDie::getAttributeValueAsReference(dwarf::Attribute Attr) const {
- if (auto FormValue = getAttributeValue(Attr))
- return FormValue->getAsReference();
+Optional<DWARFFormValue>
+DWARFDie::find(ArrayRef<dwarf::Attribute> Attrs) const {
+ if (!isValid())
+ return None;
+ auto AbbrevDecl = getAbbreviationDeclarationPtr();
+ if (AbbrevDecl) {
+ for (auto Attr : Attrs) {
+ if (auto Value = AbbrevDecl->getAttributeValue(getOffset(), Attr, *U))
+ return Value;
+ }
+ }
return None;
}
-Optional<uint64_t>
-DWARFDie::getAttributeValueAsSectionOffset(dwarf::Attribute Attr) const {
- if (auto FormValue = getAttributeValue(Attr))
- return FormValue->getAsSectionOffset();
+Optional<DWARFFormValue>
+DWARFDie::findRecursively(ArrayRef<dwarf::Attribute> Attrs) const {
+ if (!isValid())
+ return None;
+ auto Die = *this;
+ if (auto Value = Die.find(Attrs))
+ return Value;
+ if (auto D = Die.getAttributeValueAsReferencedDie(DW_AT_abstract_origin))
+ Die = D;
+ if (auto Value = Die.find(Attrs))
+ return Value;
+ if (auto D = Die.getAttributeValueAsReferencedDie(DW_AT_specification))
+ Die = D;
+ if (auto Value = Die.find(Attrs))
+ return Value;
return None;
}
-
DWARFDie
DWARFDie::getAttributeValueAsReferencedDie(dwarf::Attribute Attr) const {
- auto SpecRef = getAttributeValueAsReference(Attr);
+ auto SpecRef = toReference(find(Attr));
if (SpecRef) {
auto SpecUnit = U->getUnitSection().getUnitForOffset(*SpecRef);
if (SpecUnit)
@@ -201,14 +194,11 @@ DWARFDie::getAttributeValueAsReferencedDie(dwarf::Attribute Attr) const {
Optional<uint64_t>
DWARFDie::getRangesBaseAttribute() const {
- auto Result = getAttributeValueAsSectionOffset(DW_AT_rnglists_base);
- if (Result)
- return Result;
- return getAttributeValueAsSectionOffset(DW_AT_GNU_ranges_base);
+ return toSectionOffset(find({DW_AT_rnglists_base, DW_AT_GNU_ranges_base}));
}
Optional<uint64_t> DWARFDie::getHighPC(uint64_t LowPC) const {
- if (auto FormValue = getAttributeValue(DW_AT_high_pc)) {
+ if (auto FormValue = find(DW_AT_high_pc)) {
if (auto Address = FormValue->getAsAddress()) {
// High PC is an address.
return Address;
@@ -222,7 +212,7 @@ Optional<uint64_t> DWARFDie::getHighPC(uint64_t LowPC) const {
}
bool DWARFDie::getLowAndHighPC(uint64_t &LowPC, uint64_t &HighPC) const {
- auto LowPcAddr = getAttributeValueAsAddress(DW_AT_low_pc);
+ auto LowPcAddr = toAddress(find(DW_AT_low_pc));
if (!LowPcAddr)
return false;
if (auto HighPcAddr = getHighPC(*LowPcAddr)) {
@@ -243,7 +233,7 @@ DWARFDie::getAddressRanges() const {
return DWARFAddressRangesVector(1, std::make_pair(LowPC, HighPC));
}
// Multiple ranges from .debug_ranges section.
- auto RangesOffset = getAttributeValueAsSectionOffset(DW_AT_ranges);
+ auto RangesOffset = toSectionOffset(find(DW_AT_ranges));
if (RangesOffset) {
DWARFDebugRangeList RangeList;
if (U->extractRangeList(*RangesOffset, RangeList))
@@ -284,33 +274,26 @@ const char *
DWARFDie::getName(DINameKind Kind) const {
if (!isValid() || Kind == DINameKind::None)
return nullptr;
- const char *name = nullptr;
// Try to get mangled name only if it was asked for.
if (Kind == DINameKind::LinkageName) {
- if ((name = getAttributeValueAsString(DW_AT_MIPS_linkage_name, nullptr)))
- return name;
- if ((name = getAttributeValueAsString(DW_AT_linkage_name, nullptr)))
- return name;
+ if (auto Name = dwarf::toString(findRecursively({DW_AT_MIPS_linkage_name,
+ DW_AT_linkage_name}), nullptr))
+ return Name;
}
- if ((name = getAttributeValueAsString(DW_AT_name, nullptr)))
- return name;
- // Try to get name from specification DIE.
- DWARFDie SpecDie = getAttributeValueAsReferencedDie(DW_AT_specification);
- if (SpecDie && (name = SpecDie.getName(Kind)))
- return name;
- // Try to get name from abstract origin DIE.
- DWARFDie AbsDie = getAttributeValueAsReferencedDie(DW_AT_abstract_origin);
- if (AbsDie && (name = AbsDie.getName(Kind)))
- return name;
+ if (auto Name = dwarf::toString(findRecursively(DW_AT_name), nullptr))
+ return Name;
return nullptr;
}
+uint64_t DWARFDie::getDeclLine() const {
+ return toUnsigned(findRecursively(DW_AT_decl_line), 0);
+}
+
void DWARFDie::getCallerFrame(uint32_t &CallFile, uint32_t &CallLine,
uint32_t &CallColumn) const {
- CallFile = getAttributeValueAsUnsignedConstant(DW_AT_call_file).getValueOr(0);
- CallLine = getAttributeValueAsUnsignedConstant(DW_AT_call_line).getValueOr(0);
- CallColumn =
- getAttributeValueAsUnsignedConstant(DW_AT_call_column).getValueOr(0);
+ CallFile = toUnsigned(find(DW_AT_call_file), 0);
+ CallLine = toUnsigned(find(DW_AT_call_line), 0);
+ CallColumn = toUnsigned(find(DW_AT_call_column), 0);
}
void DWARFDie::dump(raw_ostream &OS, unsigned RecurseDepth,
@@ -340,6 +323,12 @@ void DWARFDie::dump(raw_ostream &OS, unsigned RecurseDepth,
// Dump all data in the DIE for the attributes.
for (const auto &AttrSpec : AbbrevDecl->attributes()) {
+ if (AttrSpec.Form == DW_FORM_implicit_const) {
+        // We are dumping the .debug_info section; implicit_const attribute
+        // values are not actually stored here but in the .debug_abbrev
+        // section, so we just skip such attributes.
+ continue;
+ }
dumpAttribute(OS, *this, &offset, AttrSpec.Attr, AttrSpec.Form,
Indent);
}
@@ -361,7 +350,6 @@ void DWARFDie::dump(raw_ostream &OS, unsigned RecurseDepth,
}
}
-
void DWARFDie::getInlinedChainForAddress(
const uint64_t Address, SmallVectorImpl<DWARFDie> &InlinedChain) const {
if (isNULL())
@@ -399,3 +387,53 @@ DWARFDie DWARFDie::getSibling() const {
return U->getSibling(Die);
return DWARFDie();
}
+
+iterator_range<DWARFDie::attribute_iterator>
+DWARFDie::attributes() const {
+ return make_range(attribute_iterator(*this, false),
+ attribute_iterator(*this, true));
+}
+
+DWARFDie::attribute_iterator::attribute_iterator(DWARFDie D, bool End) :
+ Die(D), AttrValue(0), Index(0) {
+ auto AbbrDecl = Die.getAbbreviationDeclarationPtr();
+ assert(AbbrDecl && "Must have abbreviation declaration");
+ if (End) {
+ // This is the end iterator so we set the index to the attribute count.
+ Index = AbbrDecl->getNumAttributes();
+ } else {
+ // This is the begin iterator so we extract the value for this->Index.
+ AttrValue.Offset = D.getOffset() + AbbrDecl->getCodeByteSize();
+ updateForIndex(*AbbrDecl, 0);
+ }
+}
+
+void DWARFDie::attribute_iterator::updateForIndex(
+ const DWARFAbbreviationDeclaration &AbbrDecl, uint32_t I) {
+ Index = I;
+  // AbbrDecl must be valid before calling this function.
+ auto NumAttrs = AbbrDecl.getNumAttributes();
+ if (Index < NumAttrs) {
+ AttrValue.Attr = AbbrDecl.getAttrByIndex(Index);
+    // Advance the offset by the byte size of the previous attribute value.
+ AttrValue.Offset += AttrValue.ByteSize;
+ AttrValue.Value.setForm(AbbrDecl.getFormByIndex(Index));
+ uint32_t ParseOffset = AttrValue.Offset;
+ auto U = Die.getDwarfUnit();
+ assert(U && "Die must have valid DWARF unit");
+ bool b = AttrValue.Value.extractValue(U->getDebugInfoExtractor(),
+ &ParseOffset, U);
+ (void)b;
+ assert(b && "extractValue cannot fail on fully parsed DWARF");
+ AttrValue.ByteSize = ParseOffset - AttrValue.Offset;
+ } else {
+ assert(Index == NumAttrs && "Indexes should be [0, NumAttrs) only");
+ AttrValue.clear();
+ }
+}
+
+DWARFDie::attribute_iterator &DWARFDie::attribute_iterator::operator++() {
+ if (auto AbbrDecl = Die.getAbbreviationDeclarationPtr())
+ updateForIndex(*AbbrDecl, Index + 1);
+ return *this;
+}
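
A short sketch of the consolidated DWARFDie accessor API introduced above
(hypothetical `Die`; find/findRecursively/attributes and the dwarf::to*
helpers are the ones this patch adds or uses):

    // One getAttributeValueAs* method per result type becomes
    // find()/findRecursively() returning Optional<DWARFFormValue>,
    // decoded by dwarf::to* helpers that take an optional default.
    uint64_t DeclLine = toUnsigned(Die.findRecursively(DW_AT_decl_line), 0);
    auto LowPC = toAddress(Die.find(DW_AT_low_pc));   // Optional<uint64_t>
    auto RangesBase = toSectionOffset(
        Die.find({DW_AT_rnglists_base, DW_AT_GNU_ranges_base}));
    // The new attribute_iterator walks every attribute of the DIE; its
    // value exposes Attr, Value, Offset, and ByteSize as seen above.
    for (const auto &A : Die.attributes())
      (void)A;
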
diff --git a/lib/DebugInfo/DWARF/DWARFFormValue.cpp b/lib/DebugInfo/DWARF/DWARFFormValue.cpp
index dc9310dc4e89..6de57b999adc 100644
--- a/lib/DebugInfo/DWARF/DWARFFormValue.cpp
+++ b/lib/DebugInfo/DWARF/DWARFFormValue.cpp
@@ -1,4 +1,4 @@
-//===-- DWARFFormValue.cpp ------------------------------------------------===//
+//===- DWARFFormValue.cpp -------------------------------------------------===//
//
// The LLVM Compiler Infrastructure
//
@@ -9,16 +9,21 @@
#include "SyntaxHighlighting.h"
#include "llvm/ADT/ArrayRef.h"
+#include "llvm/ADT/None.h"
+#include "llvm/ADT/Optional.h"
#include "llvm/ADT/StringRef.h"
-#include "llvm/DebugInfo/DWARF/DWARFCompileUnit.h"
#include "llvm/DebugInfo/DWARF/DWARFContext.h"
#include "llvm/DebugInfo/DWARF/DWARFFormValue.h"
-#include "llvm/Support/Debug.h"
+#include "llvm/DebugInfo/DWARF/DWARFRelocMap.h"
+#include "llvm/DebugInfo/DWARF/DWARFUnit.h"
#include "llvm/Support/Dwarf.h"
+#include "llvm/Support/ErrorHandling.h"
#include "llvm/Support/Format.h"
#include "llvm/Support/raw_ostream.h"
-#include <cassert>
+#include <cinttypes>
+#include <cstdint>
#include <limits>
+
using namespace llvm;
using namespace dwarf;
using namespace syntax;
@@ -66,13 +71,16 @@ class FormSizeHelper {
public:
FormSizeHelper(uint16_t V, uint8_t A, llvm::dwarf::DwarfFormat F)
- : Version(V), AddrSize(A), Format(F) {}
+ : Version(V), AddrSize(A), Format(F) {}
+
uint8_t getAddressByteSize() const { return AddrSize; }
+
uint8_t getRefAddrByteSize() const {
if (Version == 2)
return AddrSize;
return getDwarfOffsetByteSize();
}
+
uint8_t getDwarfOffsetByteSize() const {
switch (Format) {
case dwarf::DwarfFormat::DWARF32:
@@ -120,14 +128,21 @@ static Optional<uint8_t> getFixedByteSize(dwarf::Form Form, const T *U) {
case DW_FORM_flag:
case DW_FORM_data1:
case DW_FORM_ref1:
+ case DW_FORM_strx1:
+ case DW_FORM_addrx1:
return 1;
case DW_FORM_data2:
case DW_FORM_ref2:
+ case DW_FORM_strx2:
+ case DW_FORM_addrx2:
return 2;
case DW_FORM_data4:
case DW_FORM_ref4:
+ case DW_FORM_ref_sup4:
+ case DW_FORM_strx4:
+ case DW_FORM_addrx4:
return 4;
case DW_FORM_strp:
@@ -136,7 +151,6 @@ static Optional<uint8_t> getFixedByteSize(dwarf::Form Form, const T *U) {
case DW_FORM_line_strp:
case DW_FORM_sec_offset:
case DW_FORM_strp_sup:
- case DW_FORM_ref_sup:
if (U)
return U->getDwarfOffsetByteSize();
return None;
@@ -144,6 +158,7 @@ static Optional<uint8_t> getFixedByteSize(dwarf::Form Form, const T *U) {
case DW_FORM_data8:
case DW_FORM_ref8:
case DW_FORM_ref_sig8:
+ case DW_FORM_ref_sup8:
return 8;
case DW_FORM_flag_present:
@@ -211,7 +226,14 @@ static bool skipFormValue(dwarf::Form Form, const DataExtractor &DebugInfoData,
case DW_FORM_ref4:
case DW_FORM_ref8:
case DW_FORM_ref_sig8:
- case DW_FORM_ref_sup:
+ case DW_FORM_ref_sup4:
+ case DW_FORM_ref_sup8:
+ case DW_FORM_strx1:
+ case DW_FORM_strx2:
+ case DW_FORM_strx4:
+ case DW_FORM_addrx1:
+ case DW_FORM_addrx2:
+ case DW_FORM_addrx4:
case DW_FORM_sec_offset:
case DW_FORM_strp:
case DW_FORM_strp_sup:
@@ -339,14 +361,21 @@ bool DWARFFormValue::extractValue(const DataExtractor &data,
case DW_FORM_data1:
case DW_FORM_ref1:
case DW_FORM_flag:
+ case DW_FORM_strx1:
+ case DW_FORM_addrx1:
Value.uval = data.getU8(offset_ptr);
break;
case DW_FORM_data2:
case DW_FORM_ref2:
+ case DW_FORM_strx2:
+ case DW_FORM_addrx2:
Value.uval = data.getU16(offset_ptr);
break;
case DW_FORM_data4:
- case DW_FORM_ref4: {
+ case DW_FORM_ref4:
+ case DW_FORM_ref_sup4:
+ case DW_FORM_strx4:
+ case DW_FORM_addrx4: {
Value.uval = data.getU32(offset_ptr);
if (!U)
break;
@@ -357,6 +386,7 @@ bool DWARFFormValue::extractValue(const DataExtractor &data,
}
case DW_FORM_data8:
case DW_FORM_ref8:
+ case DW_FORM_ref_sup8:
Value.uval = data.getU64(offset_ptr);
break;
case DW_FORM_sdata:
@@ -378,8 +408,7 @@ bool DWARFFormValue::extractValue(const DataExtractor &data,
case DW_FORM_GNU_ref_alt:
case DW_FORM_GNU_strp_alt:
case DW_FORM_line_strp:
- case DW_FORM_strp_sup:
- case DW_FORM_ref_sup: {
+ case DW_FORM_strp_sup: {
if (!U)
return false;
RelocAddrMap::const_iterator AI = U->getRelocMap()->find(*offset_ptr);
@@ -400,7 +429,9 @@ bool DWARFFormValue::extractValue(const DataExtractor &data,
Value.uval = data.getULEB128(offset_ptr);
break;
default:
- return false;
+ // DWARFFormValue::skipValue() will have caught this and caused all
+      // DWARF DIEs to fail to be parsed, so this code should not be reachable.
+ llvm_unreachable("unsupported form");
}
} while (indirect);
@@ -495,21 +526,18 @@ DWARFFormValue::dump(raw_ostream &OS) const {
case DW_FORM_sdata: OS << Value.sval; break;
case DW_FORM_udata: OS << Value.uval; break;
- case DW_FORM_strp: {
+ case DW_FORM_strp:
OS << format(" .debug_str[0x%8.8x] = ", (uint32_t)uvalue);
dumpString(OS);
break;
- }
- case DW_FORM_GNU_str_index: {
+ case DW_FORM_GNU_str_index:
OS << format(" indexed (%8.8x) string = ", (uint32_t)uvalue);
dumpString(OS);
break;
- }
- case DW_FORM_GNU_strp_alt: {
+ case DW_FORM_GNU_strp_alt:
OS << format("alt indirect string, offset: 0x%" PRIx64 "", uvalue);
dumpString(OS);
break;
- }
case DW_FORM_ref_addr:
OS << format("0x%016" PRIx64, uvalue);
break;
@@ -674,4 +702,3 @@ Optional<uint64_t> DWARFFormValue::getAsReferenceUVal() const {
return None;
return Value.uval;
}
-
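
A condensed restatement of the fixed sizes the two hunks above assign to the
new DWARF v5 forms (this mirrors the switch cases, it is not a separate API):

    //   DW_FORM_strx1 / DW_FORM_addrx1                     -> 1 byte
    //   DW_FORM_strx2 / DW_FORM_addrx2                     -> 2 bytes
    //   DW_FORM_strx4 / DW_FORM_addrx4 / DW_FORM_ref_sup4  -> 4 bytes
    //   DW_FORM_ref_sup8                                   -> 8 bytes
    // DW_FORM_ref_sup itself is dropped in favor of the sized variants.
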
diff --git a/lib/DebugInfo/DWARF/DWARFGdbIndex.cpp b/lib/DebugInfo/DWARF/DWARFGdbIndex.cpp
index ebb996162f1b..76354a9b1ddb 100644
--- a/lib/DebugInfo/DWARF/DWARFGdbIndex.cpp
+++ b/lib/DebugInfo/DWARF/DWARFGdbIndex.cpp
@@ -1,4 +1,4 @@
-//===-- DWARFGdbIndex.cpp -------------------------------------------------===//
+//===- DWARFGdbIndex.cpp --------------------------------------------------===//
//
// The LLVM Compiler Infrastructure
//
@@ -7,10 +7,16 @@
//
//===----------------------------------------------------------------------===//
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/ADT/StringRef.h"
#include "llvm/DebugInfo/DWARF/DWARFGdbIndex.h"
-#include "llvm/ADT/Twine.h"
-#include "llvm/Support/ErrorHandling.h"
#include "llvm/Support/Format.h"
+#include "llvm/Support/raw_ostream.h"
+#include <algorithm>
+#include <cassert>
+#include <cinttypes>
+#include <cstdint>
+#include <utility>
using namespace llvm;
diff --git a/lib/DebugInfo/DWARF/DWARFTypeUnit.cpp b/lib/DebugInfo/DWARF/DWARFTypeUnit.cpp
index 88fb20381f95..e0f819383289 100644
--- a/lib/DebugInfo/DWARF/DWARFTypeUnit.cpp
+++ b/lib/DebugInfo/DWARF/DWARFTypeUnit.cpp
@@ -1,4 +1,4 @@
-//===-- DWARFTypeUnit.cpp -------------------------------------------------===//
+//===- DWARFTypeUnit.cpp --------------------------------------------------===//
//
// The LLVM Compiler Infrastructure
//
@@ -7,11 +7,14 @@
//
//===----------------------------------------------------------------------===//
+#include "llvm/DebugInfo/DIContext.h"
+#include "llvm/DebugInfo/DWARF/DWARFDebugAbbrev.h"
+#include "llvm/DebugInfo/DWARF/DWARFDie.h"
#include "llvm/DebugInfo/DWARF/DWARFTypeUnit.h"
-#include "llvm/DebugInfo/DWARF/DWARFFormValue.h"
-#include "llvm/Support/Dwarf.h"
+#include "llvm/DebugInfo/DWARF/DWARFUnit.h"
#include "llvm/Support/Format.h"
#include "llvm/Support/raw_ostream.h"
+#include <cinttypes>
using namespace llvm;
@@ -26,22 +29,24 @@ bool DWARFTypeUnit::extractImpl(DataExtractor debug_info,
void DWARFTypeUnit::dump(raw_ostream &OS, bool SummarizeTypes) {
DWARFDie TD = getDIEForOffset(TypeOffset + getOffset());
- const char *Name = TD.getAttributeValueAsString(llvm::dwarf::DW_AT_name, "");
+ const char *Name = TD.getName(DINameKind::ShortName);
if (SummarizeTypes) {
OS << "name = '" << Name << "'"
- << " type_signature = " << format("0x%16" PRIx64, TypeHash)
+ << " type_signature = " << format("0x%016" PRIx64, TypeHash)
<< " length = " << format("0x%08x", getLength()) << '\n';
return;
}
OS << format("0x%08x", getOffset()) << ": Type Unit:"
<< " length = " << format("0x%08x", getLength())
- << " version = " << format("0x%04x", getVersion())
- << " abbr_offset = " << format("0x%04x", getAbbreviations()->getOffset())
+ << " version = " << format("0x%04x", getVersion());
+ if (getVersion() >= 5)
+ OS << " unit_type = " << dwarf::UnitTypeString(getUnitType());
+ OS << " abbr_offset = " << format("0x%04x", getAbbreviations()->getOffset())
<< " addr_size = " << format("0x%02x", getAddressByteSize())
<< " name = '" << Name << "'"
- << " type_signature = " << format("0x%16" PRIx64, TypeHash)
+ << " type_signature = " << format("0x%016" PRIx64, TypeHash)
<< " type_offset = " << format("0x%04x", TypeOffset)
<< " (next unit at " << format("0x%08x", getNextUnitOffset()) << ")\n";
diff --git a/lib/DebugInfo/DWARF/DWARFUnit.cpp b/lib/DebugInfo/DWARF/DWARFUnit.cpp
index ee2c569b0bce..4ee8e8f46d2e 100644
--- a/lib/DebugInfo/DWARF/DWARFUnit.cpp
+++ b/lib/DebugInfo/DWARF/DWARFUnit.cpp
@@ -13,6 +13,9 @@
#include "llvm/DebugInfo/DWARF/DWARFAbbreviationDeclaration.h"
#include "llvm/DebugInfo/DWARF/DWARFContext.h"
#include "llvm/DebugInfo/DWARF/DWARFDebugAbbrev.h"
+#include "llvm/DebugInfo/DWARF/DWARFDebugInfoEntry.h"
+#include "llvm/DebugInfo/DWARF/DWARFDie.h"
+#include "llvm/DebugInfo/DWARF/DWARFFormValue.h"
#include "llvm/DebugInfo/DWARF/DWARFUnit.h"
#include "llvm/Object/ObjectFile.h"
#include "llvm/Support/Casting.h"
@@ -20,12 +23,12 @@
#include "llvm/Support/Path.h"
#include <algorithm>
#include <cassert>
+#include <cstddef>
#include <cstdint>
#include <cstdio>
#include <vector>
-namespace llvm {
-
+using namespace llvm;
using namespace dwarf;
void DWARFUnitSectionBase::parse(DWARFContext &C, const DWARFSection &Section) {
@@ -87,7 +90,15 @@ bool DWARFUnit::getStringOffsetSectionItem(uint32_t Index,
bool DWARFUnit::extractImpl(DataExtractor debug_info, uint32_t *offset_ptr) {
Length = debug_info.getU32(offset_ptr);
Version = debug_info.getU16(offset_ptr);
- uint64_t AbbrOffset = debug_info.getU32(offset_ptr);
+ uint64_t AbbrOffset;
+ if (Version >= 5) {
+ UnitType = debug_info.getU8(offset_ptr);
+ AddrSize = debug_info.getU8(offset_ptr);
+ AbbrOffset = debug_info.getU32(offset_ptr);
+ } else {
+ AbbrOffset = debug_info.getU32(offset_ptr);
+ AddrSize = debug_info.getU8(offset_ptr);
+ }
if (IndexEntry) {
if (AbbrOffset)
return false;
@@ -99,7 +110,6 @@ bool DWARFUnit::extractImpl(DataExtractor debug_info, uint32_t *offset_ptr) {
return false;
AbbrOffset = AbbrEntry->Offset;
}
- AddrSize = debug_info.getU8(offset_ptr);
bool LengthOK = debug_info.isValidOffset(getNextUnitOffset() - 1);
bool VersionOK = DWARFContext::isSupportedVersion(Version);
@@ -151,11 +161,11 @@ void DWARFUnit::clear() {
}
const char *DWARFUnit::getCompilationDir() {
- return getUnitDIE().getAttributeValueAsString(DW_AT_comp_dir, nullptr);
+ return dwarf::toString(getUnitDIE().find(DW_AT_comp_dir), nullptr);
}
Optional<uint64_t> DWARFUnit::getDWOId() {
- return getUnitDIE().getAttributeValueAsUnsignedConstant(DW_AT_GNU_dwo_id);
+ return toUnsigned(getUnitDIE().find(DW_AT_GNU_dwo_id));
}
void DWARFUnit::extractDIEsToVector(
@@ -225,17 +235,11 @@ size_t DWARFUnit::extractDIEsIfNeeded(bool CUDieOnly) {
// If CU DIE was just parsed, copy several attribute values from it.
if (!HasCUDie) {
DWARFDie UnitDie = getUnitDIE();
- auto BaseAddr = UnitDie.getAttributeValueAsAddress(DW_AT_low_pc);
- if (!BaseAddr)
- BaseAddr = UnitDie.getAttributeValueAsAddress(DW_AT_entry_pc);
+ auto BaseAddr = toAddress(UnitDie.find({DW_AT_low_pc, DW_AT_entry_pc}));
if (BaseAddr)
setBaseAddress(*BaseAddr);
- AddrOffsetSectionBase =
- UnitDie.getAttributeValueAsSectionOffset(DW_AT_GNU_addr_base)
- .getValueOr(0);
- RangeSectionBase =
- UnitDie.getAttributeValueAsSectionOffset(DW_AT_rnglists_base)
- .getValueOr(0);
+ AddrOffsetSectionBase = toSectionOffset(UnitDie.find(DW_AT_GNU_addr_base), 0);
+ RangeSectionBase = toSectionOffset(UnitDie.find(DW_AT_rnglists_base), 0);
// Don't fall back to DW_AT_GNU_ranges_base: it should be ignored for
// skeleton CU DIE, so that DWARF users not aware of it are not broken.
}
@@ -243,8 +247,7 @@ size_t DWARFUnit::extractDIEsIfNeeded(bool CUDieOnly) {
return DieArray.size();
}
-DWARFUnit::DWOHolder::DWOHolder(StringRef DWOPath)
- : DWOU(nullptr) {
+DWARFUnit::DWOHolder::DWOHolder(StringRef DWOPath) {
auto Obj = object::ObjectFile::createObjectFile(DWOPath);
if (!Obj) {
// TODO: Actually report errors helpfully.
@@ -266,17 +269,16 @@ bool DWARFUnit::parseDWO() {
DWARFDie UnitDie = getUnitDIE();
if (!UnitDie)
return false;
- const char *DWOFileName =
- UnitDie.getAttributeValueAsString(DW_AT_GNU_dwo_name, nullptr);
+ auto DWOFileName = dwarf::toString(UnitDie.find(DW_AT_GNU_dwo_name));
if (!DWOFileName)
return false;
- const char *CompilationDir =
- UnitDie.getAttributeValueAsString(DW_AT_comp_dir, nullptr);
+ auto CompilationDir = dwarf::toString(UnitDie.find(DW_AT_comp_dir));
SmallString<16> AbsolutePath;
- if (sys::path::is_relative(DWOFileName) && CompilationDir != nullptr) {
- sys::path::append(AbsolutePath, CompilationDir);
+ if (sys::path::is_relative(*DWOFileName) && CompilationDir &&
+ *CompilationDir) {
+ sys::path::append(AbsolutePath, *CompilationDir);
}
- sys::path::append(AbsolutePath, DWOFileName);
+ sys::path::append(AbsolutePath, *DWOFileName);
DWO = llvm::make_unique<DWOHolder>(AbsolutePath);
DWARFUnit *DWOCU = DWO->getUnit();
// Verify that compile unit in .dwo file is valid.
@@ -374,8 +376,8 @@ DWARFUnit::getInlinedChainForAddress(uint64_t Address,
InlinedChain.clear();
}
-const DWARFUnitIndex &getDWARFUnitIndex(DWARFContext &Context,
- DWARFSectionKind Kind) {
+const DWARFUnitIndex &llvm::getDWARFUnitIndex(DWARFContext &Context,
+ DWARFSectionKind Kind) {
if (Kind == DW_SECT_INFO)
return Context.getCUIndex();
assert(Kind == DW_SECT_TYPES);
@@ -413,11 +415,10 @@ DWARFDie DWARFUnit::getSibling(const DWARFDebugInfoEntry *Die) {
return DWARFDie();
// Find the next DIE whose depth is the same as the Die's depth.
- for (size_t I=getDIEIndex(Die)+1, EndIdx = DieArray.size(); I<EndIdx; ++I) {
+ for (size_t I = getDIEIndex(Die) + 1, EndIdx = DieArray.size(); I < EndIdx;
+ ++I) {
if (DieArray[I].getDepth() == Depth)
return DWARFDie(this, &DieArray[I]);
}
return DWARFDie();
}
-
-} // end namespace llvm
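
The extractImpl() change above tracks the different field orders of the two
unit-header layouts; schematically, for the 32-bit DWARF format:

    // DWARF v4 and earlier:            DWARF v5:
    //   unit_length          (4)         unit_length          (4)
    //   version              (2)         version              (2)
    //   debug_abbrev_offset  (4)         unit_type            (1)
    //   address_size         (1)         address_size         (1)
    //                                    debug_abbrev_offset  (4)
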
diff --git a/lib/DebugInfo/DWARF/DWARFUnitIndex.cpp b/lib/DebugInfo/DWARF/DWARFUnitIndex.cpp
index 96b316957dfd..0981a4dfdfa5 100644
--- a/lib/DebugInfo/DWARF/DWARFUnitIndex.cpp
+++ b/lib/DebugInfo/DWARF/DWARFUnitIndex.cpp
@@ -1,4 +1,4 @@
-//===-- DWARFUnitIndex.cpp ------------------------------------------------===//
+//===- DWARFUnitIndex.cpp -------------------------------------------------===//
//
// The LLVM Compiler Infrastructure
//
@@ -7,12 +7,16 @@
//
//===----------------------------------------------------------------------===//
-#include "llvm/DebugInfo/DWARF/DWARFUnitIndex.h"
-
#include "llvm/ADT/StringRef.h"
+#include "llvm/ADT/STLExtras.h"
+#include "llvm/DebugInfo/DWARF/DWARFUnitIndex.h"
#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/Format.h"
+#include "llvm/Support/raw_ostream.h"
+#include <cinttypes>
+#include <cstdint>
-namespace llvm {
+using namespace llvm;
bool DWARFUnitIndex::Header::parse(DataExtractor IndexData,
uint32_t *OffsetPtr) {
@@ -152,6 +156,7 @@ DWARFUnitIndex::Entry::getOffset(DWARFSectionKind Sec) const {
return &Contributions[i];
return nullptr;
}
+
const DWARFUnitIndex::Entry::SectionContribution *
DWARFUnitIndex::Entry::getOffset() const {
return &Contributions[Index->InfoColumn];
@@ -165,4 +170,3 @@ DWARFUnitIndex::getFromOffset(uint32_t Offset) const {
return &Rows[i];
return nullptr;
}
-}
diff --git a/lib/DebugInfo/DWARF/SyntaxHighlighting.cpp b/lib/DebugInfo/DWARF/SyntaxHighlighting.cpp
index 4f561d062b12..d4f44e446954 100644
--- a/lib/DebugInfo/DWARF/SyntaxHighlighting.cpp
+++ b/lib/DebugInfo/DWARF/SyntaxHighlighting.cpp
@@ -1,4 +1,4 @@
-//===-- SyntaxHighlighting.cpp ----------------------------------*- C++ -*-===//
+//===- SyntaxHighlighting.cpp ---------------------------------------------===//
//
// The LLVM Compiler Infrastructure
//
@@ -9,6 +9,8 @@
#include "SyntaxHighlighting.h"
#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/raw_ostream.h"
+
using namespace llvm;
using namespace dwarf;
using namespace syntax;
@@ -18,16 +20,16 @@ static cl::opt<cl::boolOrDefault>
cl::desc("use colored syntax highlighting (default=autodetect)"),
cl::init(cl::BOU_UNSET));
-WithColor::WithColor(llvm::raw_ostream &OS, enum HighlightColor Type) : OS(OS) {
+WithColor::WithColor(raw_ostream &OS, enum HighlightColor Type) : OS(OS) {
// Detect color from terminal type unless the user passed the --color option.
if (UseColor == cl::BOU_UNSET ? OS.has_colors() : UseColor == cl::BOU_TRUE) {
switch (Type) {
- case Address: OS.changeColor(llvm::raw_ostream::YELLOW); break;
- case String: OS.changeColor(llvm::raw_ostream::GREEN); break;
- case Tag: OS.changeColor(llvm::raw_ostream::BLUE); break;
- case Attribute: OS.changeColor(llvm::raw_ostream::CYAN); break;
- case Enumerator: OS.changeColor(llvm::raw_ostream::MAGENTA); break;
- case Macro: OS.changeColor(llvm::raw_ostream::RED); break;
+ case Address: OS.changeColor(raw_ostream::YELLOW); break;
+ case String: OS.changeColor(raw_ostream::GREEN); break;
+ case Tag: OS.changeColor(raw_ostream::BLUE); break;
+ case Attribute: OS.changeColor(raw_ostream::CYAN); break;
+ case Enumerator: OS.changeColor(raw_ostream::MAGENTA); break;
+ case Macro: OS.changeColor(raw_ostream::RED); break;
}
}
}
diff --git a/lib/DebugInfo/DWARF/SyntaxHighlighting.h b/lib/DebugInfo/DWARF/SyntaxHighlighting.h
index 16e68351d5e1..277de973dbf0 100644
--- a/lib/DebugInfo/DWARF/SyntaxHighlighting.h
+++ b/lib/DebugInfo/DWARF/SyntaxHighlighting.h
@@ -1,4 +1,4 @@
-//===-- SyntaxHighlighting.h ------------------------------------*- C++ -*-===//
+//===- SyntaxHighlighting.h -------------------------------------*- C++ -*-===//
//
// The LLVM Compiler Infrastructure
//
@@ -10,9 +10,10 @@
#ifndef LLVM_LIB_DEBUGINFO_SYNTAXHIGHLIGHTING_H
#define LLVM_LIB_DEBUGINFO_SYNTAXHIGHLIGHTING_H
-#include "llvm/Support/raw_ostream.h"
-
namespace llvm {
+
+class raw_ostream;
+
namespace dwarf {
namespace syntax {
@@ -22,18 +23,20 @@ enum HighlightColor { Address, String, Tag, Attribute, Enumerator, Macro };
/// An RAII object that temporarily switches an output stream to a
/// specific color.
class WithColor {
- llvm::raw_ostream &OS;
+ raw_ostream &OS;
public:
/// To be used like this: WithColor(OS, syntax::String) << "text";
- WithColor(llvm::raw_ostream &OS, enum HighlightColor Type);
+ WithColor(raw_ostream &OS, enum HighlightColor Type);
~WithColor();
- llvm::raw_ostream& get() { return OS; }
- operator llvm::raw_ostream& () { return OS; }
+ raw_ostream& get() { return OS; }
+ operator raw_ostream& () { return OS; }
};
-}
-}
-}
-#endif
+} // end namespace syntax
+} // end namespace dwarf
+
+} // end namespace llvm
+
+#endif // LLVM_LIB_DEBUGINFO_SYNTAXHIGHLIGHTING_H
diff --git a/lib/DebugInfo/MSF/CMakeLists.txt b/lib/DebugInfo/MSF/CMakeLists.txt
index dcb2a8e0cc9c..6f38de336ee0 100644
--- a/lib/DebugInfo/MSF/CMakeLists.txt
+++ b/lib/DebugInfo/MSF/CMakeLists.txt
@@ -3,8 +3,6 @@ add_llvm_library(LLVMDebugInfoMSF
MSFBuilder.cpp
MSFCommon.cpp
MSFError.cpp
- StreamReader.cpp
- StreamWriter.cpp
ADDITIONAL_HEADER_DIRS
"${LLVM_MAIN_INCLUDE_DIR}/llvm/DebugInfo/MSF"
)
diff --git a/lib/DebugInfo/MSF/MappedBlockStream.cpp b/lib/DebugInfo/MSF/MappedBlockStream.cpp
index e52c88a5bfb8..57953cfa338e 100644
--- a/lib/DebugInfo/MSF/MappedBlockStream.cpp
+++ b/lib/DebugInfo/MSF/MappedBlockStream.cpp
@@ -11,8 +11,8 @@
#include "llvm/DebugInfo/MSF/IMSFFile.h"
#include "llvm/DebugInfo/MSF/MSFCommon.h"
-#include "llvm/DebugInfo/MSF/MSFError.h"
#include "llvm/DebugInfo/MSF/MSFStreamLayout.h"
+#include "llvm/Support/BinaryStreamError.h"
using namespace llvm;
using namespace llvm::msf;
@@ -47,22 +47,20 @@ static Interval intersect(const Interval &I1, const Interval &I2) {
MappedBlockStream::MappedBlockStream(uint32_t BlockSize, uint32_t NumBlocks,
const MSFStreamLayout &Layout,
- const ReadableStream &MsfData)
+ BinaryStreamRef MsfData)
: BlockSize(BlockSize), NumBlocks(NumBlocks), StreamLayout(Layout),
MsfData(MsfData) {}
std::unique_ptr<MappedBlockStream>
MappedBlockStream::createStream(uint32_t BlockSize, uint32_t NumBlocks,
const MSFStreamLayout &Layout,
- const ReadableStream &MsfData) {
+ BinaryStreamRef MsfData) {
return llvm::make_unique<MappedBlockStreamImpl<MappedBlockStream>>(
BlockSize, NumBlocks, Layout, MsfData);
}
-std::unique_ptr<MappedBlockStream>
-MappedBlockStream::createIndexedStream(const MSFLayout &Layout,
- const ReadableStream &MsfData,
- uint32_t StreamIndex) {
+std::unique_ptr<MappedBlockStream> MappedBlockStream::createIndexedStream(
+ const MSFLayout &Layout, BinaryStreamRef MsfData, uint32_t StreamIndex) {
assert(StreamIndex < Layout.StreamMap.size() && "Invalid stream index");
MSFStreamLayout SL;
SL.Blocks = Layout.StreamMap[StreamIndex];
@@ -73,7 +71,7 @@ MappedBlockStream::createIndexedStream(const MSFLayout &Layout,
std::unique_ptr<MappedBlockStream>
MappedBlockStream::createDirectoryStream(const MSFLayout &Layout,
- const ReadableStream &MsfData) {
+ BinaryStreamRef MsfData) {
MSFStreamLayout SL;
SL.Blocks = Layout.DirectoryBlocks;
SL.Length = Layout.SB->NumDirectoryBytes;
@@ -82,19 +80,17 @@ MappedBlockStream::createDirectoryStream(const MSFLayout &Layout,
std::unique_ptr<MappedBlockStream>
MappedBlockStream::createFpmStream(const MSFLayout &Layout,
- const ReadableStream &MsfData) {
+ BinaryStreamRef MsfData) {
MSFStreamLayout SL;
initializeFpmStreamLayout(Layout, SL);
return createStream(Layout.SB->BlockSize, Layout.SB->NumBlocks, SL, MsfData);
}
Error MappedBlockStream::readBytes(uint32_t Offset, uint32_t Size,
- ArrayRef<uint8_t> &Buffer) const {
+ ArrayRef<uint8_t> &Buffer) {
// Make sure we aren't trying to read beyond the end of the stream.
- if (Size > StreamLayout.Length)
- return make_error<MSFError>(msf_error_code::insufficient_buffer);
- if (Offset > StreamLayout.Length - Size)
- return make_error<MSFError>(msf_error_code::insufficient_buffer);
+ if (auto EC = checkOffset(Offset, Size))
+ return EC;
if (tryReadContiguously(Offset, Size, Buffer))
return Error::success();
@@ -168,11 +164,12 @@ Error MappedBlockStream::readBytes(uint32_t Offset, uint32_t Size,
return Error::success();
}
-Error MappedBlockStream::readLongestContiguousChunk(
- uint32_t Offset, ArrayRef<uint8_t> &Buffer) const {
+Error MappedBlockStream::readLongestContiguousChunk(uint32_t Offset,
+ ArrayRef<uint8_t> &Buffer) {
// Make sure we aren't trying to read beyond the end of the stream.
- if (Offset >= StreamLayout.Length)
- return make_error<MSFError>(msf_error_code::insufficient_buffer);
+ if (auto EC = checkOffset(Offset, 1))
+ return EC;
+
uint32_t First = Offset / BlockSize;
uint32_t Last = First;
@@ -197,10 +194,10 @@ Error MappedBlockStream::readLongestContiguousChunk(
return Error::success();
}
-uint32_t MappedBlockStream::getLength() const { return StreamLayout.Length; }
+uint32_t MappedBlockStream::getLength() { return StreamLayout.Length; }
bool MappedBlockStream::tryReadContiguously(uint32_t Offset, uint32_t Size,
- ArrayRef<uint8_t> &Buffer) const {
+ ArrayRef<uint8_t> &Buffer) {
if (Size == 0) {
Buffer = ArrayRef<uint8_t>();
return true;
@@ -241,15 +238,13 @@ bool MappedBlockStream::tryReadContiguously(uint32_t Offset, uint32_t Size,
}
Error MappedBlockStream::readBytes(uint32_t Offset,
- MutableArrayRef<uint8_t> Buffer) const {
+ MutableArrayRef<uint8_t> Buffer) {
uint32_t BlockNum = Offset / BlockSize;
uint32_t OffsetInBlock = Offset % BlockSize;
// Make sure we aren't trying to read beyond the end of the stream.
- if (Buffer.size() > StreamLayout.Length)
- return make_error<MSFError>(msf_error_code::insufficient_buffer);
- if (Offset > StreamLayout.Length - Buffer.size())
- return make_error<MSFError>(msf_error_code::insufficient_buffer);
+ if (auto EC = checkOffset(Offset, Buffer.size()))
+ return EC;
uint32_t BytesLeft = Buffer.size();
uint32_t BytesWritten = 0;
@@ -319,21 +314,21 @@ void MappedBlockStream::fixCacheAfterWrite(uint32_t Offset,
WritableMappedBlockStream::WritableMappedBlockStream(
uint32_t BlockSize, uint32_t NumBlocks, const MSFStreamLayout &Layout,
- const WritableStream &MsfData)
+ WritableBinaryStreamRef MsfData)
: ReadInterface(BlockSize, NumBlocks, Layout, MsfData),
WriteInterface(MsfData) {}
std::unique_ptr<WritableMappedBlockStream>
WritableMappedBlockStream::createStream(uint32_t BlockSize, uint32_t NumBlocks,
const MSFStreamLayout &Layout,
- const WritableStream &MsfData) {
+ WritableBinaryStreamRef MsfData) {
return llvm::make_unique<MappedBlockStreamImpl<WritableMappedBlockStream>>(
BlockSize, NumBlocks, Layout, MsfData);
}
std::unique_ptr<WritableMappedBlockStream>
WritableMappedBlockStream::createIndexedStream(const MSFLayout &Layout,
- const WritableStream &MsfData,
+ WritableBinaryStreamRef MsfData,
uint32_t StreamIndex) {
assert(StreamIndex < Layout.StreamMap.size() && "Invalid stream index");
MSFStreamLayout SL;
@@ -344,7 +339,7 @@ WritableMappedBlockStream::createIndexedStream(const MSFLayout &Layout,
std::unique_ptr<WritableMappedBlockStream>
WritableMappedBlockStream::createDirectoryStream(
- const MSFLayout &Layout, const WritableStream &MsfData) {
+ const MSFLayout &Layout, WritableBinaryStreamRef MsfData) {
MSFStreamLayout SL;
SL.Blocks = Layout.DirectoryBlocks;
SL.Length = Layout.SB->NumDirectoryBytes;
@@ -353,34 +348,31 @@ WritableMappedBlockStream::createDirectoryStream(
std::unique_ptr<WritableMappedBlockStream>
WritableMappedBlockStream::createFpmStream(const MSFLayout &Layout,
- const WritableStream &MsfData) {
+ WritableBinaryStreamRef MsfData) {
MSFStreamLayout SL;
initializeFpmStreamLayout(Layout, SL);
return createStream(Layout.SB->BlockSize, Layout.SB->NumBlocks, SL, MsfData);
}
Error WritableMappedBlockStream::readBytes(uint32_t Offset, uint32_t Size,
- ArrayRef<uint8_t> &Buffer) const {
+ ArrayRef<uint8_t> &Buffer) {
return ReadInterface.readBytes(Offset, Size, Buffer);
}
Error WritableMappedBlockStream::readLongestContiguousChunk(
- uint32_t Offset, ArrayRef<uint8_t> &Buffer) const {
+ uint32_t Offset, ArrayRef<uint8_t> &Buffer) {
return ReadInterface.readLongestContiguousChunk(Offset, Buffer);
}
-uint32_t WritableMappedBlockStream::getLength() const {
+uint32_t WritableMappedBlockStream::getLength() {
return ReadInterface.getLength();
}
Error WritableMappedBlockStream::writeBytes(uint32_t Offset,
- ArrayRef<uint8_t> Buffer) const {
+ ArrayRef<uint8_t> Buffer) {
// Make sure we aren't trying to write beyond the end of the stream.
- if (Buffer.size() > getStreamLength())
- return make_error<MSFError>(msf_error_code::insufficient_buffer);
-
- if (Offset > getStreamLayout().Length - Buffer.size())
- return make_error<MSFError>(msf_error_code::insufficient_buffer);
+ if (auto EC = checkOffset(Offset, Buffer.size()))
+ return EC;
uint32_t BlockNum = Offset / getBlockSize();
uint32_t OffsetInBlock = Offset % getBlockSize();
@@ -410,6 +402,4 @@ Error WritableMappedBlockStream::writeBytes(uint32_t Offset,
return Error::success();
}
-Error WritableMappedBlockStream::commit() const {
- return WriteInterface.commit();
-}
+Error WritableMappedBlockStream::commit() { return WriteInterface.commit(); }
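// The recurring pattern in this file: BinaryStreamRef and
// WritableBinaryStreamRef are small value types (stream pointer plus bounds),
// so they replace the old `const ReadableStream &` / `const WritableStream &`
// parameters, and the accessors drop their const qualifiers to match the new
// BinaryStream interface. A minimal sketch of building and slicing one,
// assuming llvm/Support/BinaryByteStream.h:
std::vector<uint8_t> Bytes(4096);
MutableBinaryByteStream Stream(Bytes, llvm::support::little);
WritableBinaryStreamRef Ref(Stream); // cheap value copy, shares the buffer
auto Sub = Ref.drop_front(64);       // views compose without copying data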
diff --git a/lib/DebugInfo/MSF/StreamReader.cpp b/lib/DebugInfo/MSF/StreamReader.cpp
deleted file mode 100644
index b85fd14a3b7f..000000000000
--- a/lib/DebugInfo/MSF/StreamReader.cpp
+++ /dev/null
@@ -1,156 +0,0 @@
-//===- StreamReader.cpp - Reads bytes and objects from a stream -----------===//
-//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
-//
-//===----------------------------------------------------------------------===//
-
-#include "llvm/DebugInfo/MSF/StreamReader.h"
-
-#include "llvm/DebugInfo/MSF/MSFError.h"
-#include "llvm/DebugInfo/MSF/StreamRef.h"
-
-using namespace llvm;
-using namespace llvm::msf;
-
-StreamReader::StreamReader(ReadableStreamRef S) : Stream(S), Offset(0) {}
-
-Error StreamReader::readLongestContiguousChunk(ArrayRef<uint8_t> &Buffer) {
- if (auto EC = Stream.readLongestContiguousChunk(Offset, Buffer))
- return EC;
- Offset += Buffer.size();
- return Error::success();
-}
-
-Error StreamReader::readBytes(ArrayRef<uint8_t> &Buffer, uint32_t Size) {
- if (auto EC = Stream.readBytes(Offset, Size, Buffer))
- return EC;
- Offset += Size;
- return Error::success();
-}
-
-Error StreamReader::readInteger(uint8_t &Dest) {
- const uint8_t *P;
- if (auto EC = readObject(P))
- return EC;
- Dest = *P;
- return Error::success();
-}
-
-Error StreamReader::readInteger(uint16_t &Dest) {
- const support::ulittle16_t *P;
- if (auto EC = readObject(P))
- return EC;
- Dest = *P;
- return Error::success();
-}
-
-Error StreamReader::readInteger(uint32_t &Dest) {
- const support::ulittle32_t *P;
- if (auto EC = readObject(P))
- return EC;
- Dest = *P;
- return Error::success();
-}
-
-Error StreamReader::readInteger(uint64_t &Dest) {
- const support::ulittle64_t *P;
- if (auto EC = readObject(P))
- return EC;
- Dest = *P;
- return Error::success();
-}
-
-Error StreamReader::readInteger(int8_t &Dest) {
- const int8_t *P;
- if (auto EC = readObject(P))
- return EC;
- Dest = *P;
- return Error::success();
-}
-
-Error StreamReader::readInteger(int16_t &Dest) {
- const support::little16_t *P;
- if (auto EC = readObject(P))
- return EC;
- Dest = *P;
- return Error::success();
-}
-
-Error StreamReader::readInteger(int32_t &Dest) {
- const support::little32_t *P;
- if (auto EC = readObject(P))
- return EC;
- Dest = *P;
- return Error::success();
-}
-
-Error StreamReader::readInteger(int64_t &Dest) {
- const support::little64_t *P;
- if (auto EC = readObject(P))
- return EC;
- Dest = *P;
- return Error::success();
-}
-
-Error StreamReader::readZeroString(StringRef &Dest) {
- uint32_t Length = 0;
- // First compute the length of the string by reading 1 byte at a time.
- uint32_t OriginalOffset = getOffset();
- const char *C;
- do {
- if (auto EC = readObject(C))
- return EC;
- if (*C != '\0')
- ++Length;
- } while (*C != '\0');
- // Now go back and request a reference for that many bytes.
- uint32_t NewOffset = getOffset();
- setOffset(OriginalOffset);
-
- ArrayRef<uint8_t> Data;
- if (auto EC = readBytes(Data, Length))
- return EC;
- Dest = StringRef(reinterpret_cast<const char *>(Data.begin()), Data.size());
-
- // Now set the offset back to where it was after we calculated the length.
- setOffset(NewOffset);
- return Error::success();
-}
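// The same two-pass scan (find the null terminator, then re-read that span)
// lives on in BinaryStreamReader::readCString, which the rest of this patch
// switches to. A minimal usage sketch, assuming
// llvm/Support/BinaryStreamReader.h:
BinaryStreamReader Reader(Stream);
StringRef Name;
if (auto EC = Reader.readCString(Name))
  return EC; // Name now covers the bytes up to, not including, the null.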
-
-Error StreamReader::readFixedString(StringRef &Dest, uint32_t Length) {
- ArrayRef<uint8_t> Bytes;
- if (auto EC = readBytes(Bytes, Length))
- return EC;
- Dest = StringRef(reinterpret_cast<const char *>(Bytes.begin()), Bytes.size());
- return Error::success();
-}
-
-Error StreamReader::readStreamRef(ReadableStreamRef &Ref) {
- return readStreamRef(Ref, bytesRemaining());
-}
-
-Error StreamReader::readStreamRef(ReadableStreamRef &Ref, uint32_t Length) {
- if (bytesRemaining() < Length)
- return make_error<MSFError>(msf_error_code::insufficient_buffer);
- Ref = Stream.slice(Offset, Length);
- Offset += Length;
- return Error::success();
-}
-
-Error StreamReader::skip(uint32_t Amount) {
- if (Amount > bytesRemaining())
- return make_error<MSFError>(msf_error_code::insufficient_buffer);
- Offset += Amount;
- return Error::success();
-}
-
-uint8_t StreamReader::peek() const {
- ArrayRef<uint8_t> Buffer;
- auto EC = Stream.readBytes(Offset, 1, Buffer);
- assert(!EC && "Cannot peek an empty buffer!");
- llvm::consumeError(std::move(EC));
- return Buffer[0];
-}
diff --git a/lib/DebugInfo/MSF/StreamWriter.cpp b/lib/DebugInfo/MSF/StreamWriter.cpp
deleted file mode 100644
index cdae7c5acc04..000000000000
--- a/lib/DebugInfo/MSF/StreamWriter.cpp
+++ /dev/null
@@ -1,98 +0,0 @@
-//===- StreamWriter.cpp - Writes bytes and objects to a stream ------------===//
-//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
-//
-//===----------------------------------------------------------------------===//
-
-#include "llvm/DebugInfo/MSF/StreamWriter.h"
-
-#include "llvm/DebugInfo/MSF/MSFError.h"
-#include "llvm/DebugInfo/MSF/StreamReader.h"
-#include "llvm/DebugInfo/MSF/StreamRef.h"
-
-using namespace llvm;
-using namespace llvm::msf;
-
-StreamWriter::StreamWriter(WritableStreamRef S) : Stream(S), Offset(0) {}
-
-Error StreamWriter::writeBytes(ArrayRef<uint8_t> Buffer) {
- if (auto EC = Stream.writeBytes(Offset, Buffer))
- return EC;
- Offset += Buffer.size();
- return Error::success();
-}
-
-Error StreamWriter::writeInteger(uint8_t Int) { return writeObject(Int); }
-
-Error StreamWriter::writeInteger(uint16_t Int) {
- return writeObject(support::ulittle16_t(Int));
-}
-
-Error StreamWriter::writeInteger(uint32_t Int) {
- return writeObject(support::ulittle32_t(Int));
-}
-
-Error StreamWriter::writeInteger(uint64_t Int) {
- return writeObject(support::ulittle64_t(Int));
-}
-
-Error StreamWriter::writeInteger(int8_t Int) { return writeObject(Int); }
-
-Error StreamWriter::writeInteger(int16_t Int) {
- return writeObject(support::little16_t(Int));
-}
-
-Error StreamWriter::writeInteger(int32_t Int) {
- return writeObject(support::little32_t(Int));
-}
-
-Error StreamWriter::writeInteger(int64_t Int) {
- return writeObject(support::little64_t(Int));
-}
-
-Error StreamWriter::writeZeroString(StringRef Str) {
- if (auto EC = writeFixedString(Str))
- return EC;
- if (auto EC = writeObject('\0'))
- return EC;
-
- return Error::success();
-}
-
-Error StreamWriter::writeFixedString(StringRef Str) {
- ArrayRef<uint8_t> Bytes(Str.bytes_begin(), Str.bytes_end());
- if (auto EC = Stream.writeBytes(Offset, Bytes))
- return EC;
-
- Offset += Str.size();
- return Error::success();
-}
-
-Error StreamWriter::writeStreamRef(ReadableStreamRef Ref) {
- if (auto EC = writeStreamRef(Ref, Ref.getLength()))
- return EC;
- // Don't increment Offset here, it is done by the overloaded call to
- // writeStreamRef.
- return Error::success();
-}
-
-Error StreamWriter::writeStreamRef(ReadableStreamRef Ref, uint32_t Length) {
- Ref = Ref.slice(0, Length);
-
- StreamReader SrcReader(Ref);
- // This is a bit tricky. If we just call readBytes, we are requiring that it
- // return us the entire stream as a contiguous buffer. For large streams this
- // will allocate a huge amount of space from the pool. Instead, iterate over
- // each contiguous chunk until we've consumed the entire stream.
- while (SrcReader.bytesRemaining() > 0) {
- ArrayRef<uint8_t> Chunk;
- if (auto EC = SrcReader.readLongestContiguousChunk(Chunk))
- return EC;
- if (auto EC = writeBytes(Chunk))
- return EC;
- }
- return Error::success();
-}
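// The chunked-copy loop above is the part worth keeping: it avoids
// materializing a large discontiguous stream as one contiguous allocation.
// The same pattern works against the new generic API; a minimal sketch
// assuming llvm/Support/BinaryStreamReader.h and BinaryStreamWriter.h:
Error copyStreamByChunks(BinaryStreamReader &Src, BinaryStreamWriter &Dest) {
  while (Src.bytesRemaining() > 0) {
    ArrayRef<uint8_t> Chunk;
    // Ask only for the longest run that is contiguous in the source.
    if (auto EC = Src.readLongestContiguousChunk(Chunk))
      return EC;
    if (auto EC = Dest.writeBytes(Chunk))
      return EC;
  }
  return Error::success();
}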
diff --git a/lib/DebugInfo/PDB/CMakeLists.txt b/lib/DebugInfo/PDB/CMakeLists.txt
index 599f01eaf74f..f87a0b0a72e2 100644
--- a/lib/DebugInfo/PDB/CMakeLists.txt
+++ b/lib/DebugInfo/PDB/CMakeLists.txt
@@ -27,31 +27,38 @@ if(LLVM_ENABLE_DIA_SDK)
set(LIBPDB_ADDITIONAL_HEADER_DIRS "${LLVM_MAIN_INCLUDE_DIR}/llvm/DebugInfo/PDB/DIA")
endif()
-add_pdb_impl_folder(Raw
- Raw/DbiStream.cpp
- Raw/DbiStreamBuilder.cpp
- Raw/EnumTables.cpp
- Raw/GlobalsStream.cpp
- Raw/GSI.cpp
- Raw/Hash.cpp
- Raw/InfoStream.cpp
- Raw/InfoStreamBuilder.cpp
- Raw/ModInfo.cpp
- Raw/ModStream.cpp
- Raw/NameHashTable.cpp
- Raw/NameMap.cpp
- Raw/NameMapBuilder.cpp
- Raw/PDBFile.cpp
- Raw/PDBFileBuilder.cpp
- Raw/PublicsStream.cpp
- Raw/RawError.cpp
- Raw/RawSession.cpp
- Raw/SymbolStream.cpp
- Raw/TpiHashing.cpp
- Raw/TpiStream.cpp
- Raw/TpiStreamBuilder.cpp)
+add_pdb_impl_folder(Native
+ Native/DbiStream.cpp
+ Native/DbiStreamBuilder.cpp
+ Native/EnumTables.cpp
+ Native/GlobalsStream.cpp
+ Native/GSI.cpp
+ Native/Hash.cpp
+ Native/HashTable.cpp
+ Native/InfoStream.cpp
+ Native/InfoStreamBuilder.cpp
+ Native/ModInfo.cpp
+ Native/ModInfoBuilder.cpp
+ Native/ModStream.cpp
+ Native/NativeCompilandSymbol.cpp
+ Native/NativeEnumModules.cpp
+ Native/NativeExeSymbol.cpp
+ Native/NativeRawSymbol.cpp
+ Native/NamedStreamMap.cpp
+ Native/NativeSession.cpp
+ Native/PDBFile.cpp
+ Native/PDBFileBuilder.cpp
+ Native/PDBTypeServerHandler.cpp
+ Native/PublicsStream.cpp
+ Native/RawError.cpp
+ Native/StringTable.cpp
+ Native/StringTableBuilder.cpp
+ Native/SymbolStream.cpp
+ Native/TpiHashing.cpp
+ Native/TpiStream.cpp
+ Native/TpiStreamBuilder.cpp)
-list(APPEND LIBPDB_ADDITIONAL_HEADER_DIRS "${LLVM_MAIN_INCLUDE_DIR}/llvm/DebugInfo/PDB/Raw")
+list(APPEND LIBPDB_ADDITIONAL_HEADER_DIRS "${LLVM_MAIN_INCLUDE_DIR}/llvm/DebugInfo/PDB/Native")
list(APPEND LIBPDB_ADDITIONAL_HEADER_DIRS "${LLVM_MAIN_INCLUDE_DIR}/llvm/DebugInfo/PDB")
add_llvm_library(LLVMDebugInfoPDB
@@ -94,6 +101,7 @@ add_llvm_library(LLVMDebugInfoPDB
PDBSymbolUnknown.cpp
PDBSymbolUsingNamespace.cpp
PDBSymDumper.cpp
+ UDTLayout.cpp
${PDB_IMPL_SOURCES}
ADDITIONAL_HEADER_DIRS
diff --git a/lib/DebugInfo/PDB/DIA/DIARawSymbol.cpp b/lib/DebugInfo/PDB/DIA/DIARawSymbol.cpp
index bba5b0f94dca..5e8c0bdc171d 100644
--- a/lib/DebugInfo/PDB/DIA/DIARawSymbol.cpp
+++ b/lib/DebugInfo/PDB/DIA/DIARawSymbol.cpp
@@ -10,9 +10,13 @@
#include "llvm/DebugInfo/PDB/DIA/DIARawSymbol.h"
#include "llvm/ADT/ArrayRef.h"
#include "llvm/ADT/STLExtras.h"
+#include "llvm/DebugInfo/CodeView/Formatters.h"
#include "llvm/DebugInfo/PDB/DIA/DIAEnumSymbols.h"
#include "llvm/DebugInfo/PDB/DIA/DIASession.h"
#include "llvm/DebugInfo/PDB/PDBExtras.h"
+#include "llvm/DebugInfo/PDB/PDBSymbolTypePointer.h"
+#include "llvm/DebugInfo/PDB/PDBSymbolTypeVTable.h"
+#include "llvm/DebugInfo/PDB/PDBSymbolTypeVTableShape.h"
#include "llvm/Support/ConvertUTF.h"
#include "llvm/Support/raw_ostream.h"
@@ -178,9 +182,10 @@ void DumpDIAValue(llvm::raw_ostream &OS, int Indent, StringRef Name,
}
namespace llvm {
-raw_ostream &operator<<(raw_ostream &OS, const GUID &Guid) {
- const PDB_UniqueId *Id = reinterpret_cast<const PDB_UniqueId *>(&Guid);
- OS << *Id;
+llvm::raw_ostream &operator<<(llvm::raw_ostream &OS, const GUID &G) {
+ StringRef GuidBytes(reinterpret_cast<const char *>(&G), sizeof(G));
+ codeview::detail::GuidAdapter A(GuidBytes);
+ A.format(OS, "");
return OS;
}
}
@@ -715,6 +720,18 @@ uint32_t DIARawSymbol::getVirtualTableShapeId() const {
return PrivateGetDIAValue(Symbol, &IDiaSymbol::get_virtualTableShapeId);
}
+std::unique_ptr<PDBSymbolTypeVTable>
+DIARawSymbol::getVirtualBaseTableType() const {
+ CComPtr<IDiaSymbol> TableType;
+ if (FAILED(Symbol->get_virtualBaseTableType(&TableType)) || !TableType)
+ return nullptr;
+
+ auto RawVT = llvm::make_unique<DIARawSymbol>(Session, TableType);
+ auto Pointer =
+ llvm::make_unique<PDBSymbolTypePointer>(Session, std::move(RawVT));
+ return unique_dyn_cast<PDBSymbolTypeVTable>(Pointer->getPointeeType());
+}
+
PDB_DataKind DIARawSymbol::getDataKind() const {
return PrivateGetDIAValue<DWORD, PDB_DataKind>(Symbol,
&IDiaSymbol::get_dataKind);
diff --git a/lib/DebugInfo/PDB/Raw/DbiStream.cpp b/lib/DebugInfo/PDB/Native/DbiStream.cpp
index 4f4a0cf65785..b9f53578d326 100644
--- a/lib/DebugInfo/PDB/Raw/DbiStream.cpp
+++ b/lib/DebugInfo/PDB/Native/DbiStream.cpp
@@ -7,21 +7,20 @@
//
//===----------------------------------------------------------------------===//
+#include "llvm/DebugInfo/PDB/Native/DbiStream.h"
#include "llvm/ADT/StringRef.h"
#include "llvm/DebugInfo/MSF/MappedBlockStream.h"
-#include "llvm/DebugInfo/MSF/StreamArray.h"
-#include "llvm/DebugInfo/MSF/StreamReader.h"
+#include "llvm/DebugInfo/PDB/Native/ISectionContribVisitor.h"
+#include "llvm/DebugInfo/PDB/Native/InfoStream.h"
+#include "llvm/DebugInfo/PDB/Native/ModInfo.h"
+#include "llvm/DebugInfo/PDB/Native/PDBFile.h"
+#include "llvm/DebugInfo/PDB/Native/RawConstants.h"
+#include "llvm/DebugInfo/PDB/Native/RawError.h"
+#include "llvm/DebugInfo/PDB/Native/RawTypes.h"
#include "llvm/DebugInfo/PDB/PDBTypes.h"
-#include "llvm/DebugInfo/PDB/Raw/DbiStream.h"
-#include "llvm/DebugInfo/PDB/Raw/ISectionContribVisitor.h"
-#include "llvm/DebugInfo/PDB/Raw/InfoStream.h"
-#include "llvm/DebugInfo/PDB/Raw/ModInfo.h"
-#include "llvm/DebugInfo/PDB/Raw/NameHashTable.h"
-#include "llvm/DebugInfo/PDB/Raw/PDBFile.h"
-#include "llvm/DebugInfo/PDB/Raw/RawConstants.h"
-#include "llvm/DebugInfo/PDB/Raw/RawError.h"
-#include "llvm/DebugInfo/PDB/Raw/RawTypes.h"
#include "llvm/Object/COFF.h"
+#include "llvm/Support/BinaryStreamArray.h"
+#include "llvm/Support/BinaryStreamReader.h"
#include "llvm/Support/Error.h"
#include <algorithm>
#include <cstddef>
@@ -35,7 +34,7 @@ using namespace llvm::support;
template <typename ContribType>
static Error loadSectionContribs(FixedStreamArray<ContribType> &Output,
- StreamReader &Reader) {
+ BinaryStreamReader &Reader) {
if (Reader.bytesRemaining() % sizeof(ContribType) != 0)
return make_error<RawError>(
raw_error_code::corrupt_file,
@@ -48,13 +47,12 @@ static Error loadSectionContribs(FixedStreamArray<ContribType> &Output,
}
DbiStream::DbiStream(PDBFile &File, std::unique_ptr<MappedBlockStream> Stream)
- : Pdb(File), Stream(std::move(Stream)), Header(nullptr) {
-}
+ : Pdb(File), Stream(std::move(Stream)), Header(nullptr) {}
DbiStream::~DbiStream() = default;
Error DbiStream::reload() {
- StreamReader Reader(*Stream);
+ BinaryStreamReader Reader(*Stream);
if (Stream->getLength() < sizeof(DbiStreamHeader))
return make_error<RawError>(raw_error_code::corrupt_file,
@@ -127,8 +125,8 @@ Error DbiStream::reload() {
return EC;
if (auto EC = Reader.readStreamRef(ECSubstream, Header->ECSubstreamSize))
return EC;
- if (auto EC = Reader.readArray(DbgStreams, Header->OptionalDbgHdrSize /
- sizeof(ulittle16_t)))
+ if (auto EC = Reader.readArray(
+ DbgStreams, Header->OptionalDbgHdrSize / sizeof(ulittle16_t)))
return EC;
if (auto EC = initializeSectionContributionData())
@@ -147,7 +145,7 @@ Error DbiStream::reload() {
"Found unexpected bytes in DBI Stream.");
if (ECSubstream.getLength() > 0) {
- StreamReader ECReader(ECSubstream);
+ BinaryStreamReader ECReader(ECSubstream);
if (auto EC = ECNames.load(ECReader))
return EC;
}
@@ -209,16 +207,16 @@ PDB_Machine DbiStream::getMachineType() const {
return static_cast<PDB_Machine>(Machine);
}
-msf::FixedStreamArray<object::coff_section> DbiStream::getSectionHeaders() {
+FixedStreamArray<object::coff_section> DbiStream::getSectionHeaders() {
return SectionHeaders;
}
-msf::FixedStreamArray<object::FpoData> DbiStream::getFpoRecords() {
+FixedStreamArray<object::FpoData> DbiStream::getFpoRecords() {
return FpoRecords;
}
ArrayRef<ModuleInfoEx> DbiStream::modules() const { return ModuleInfos; }
-msf::FixedStreamArray<SecMapEntry> DbiStream::getSectionMap() const {
+FixedStreamArray<SecMapEntry> DbiStream::getSectionMap() const {
return SectionMap;
}
@@ -237,7 +235,7 @@ Error DbiStream::initializeSectionContributionData() {
if (SecContrSubstream.getLength() == 0)
return Error::success();
- StreamReader SCReader(SecContrSubstream);
+ BinaryStreamReader SCReader(SecContrSubstream);
if (auto EC = SCReader.readEnum(SectionContribVersion))
return EC;
@@ -256,7 +254,7 @@ Error DbiStream::initializeModInfoArray() {
// Since each ModInfo in the stream is a variable length, we have to iterate
// them to know how many there actually are.
- StreamReader Reader(ModInfoSubstream);
+ BinaryStreamReader Reader(ModInfoSubstream);
VarStreamArray<ModInfo> ModInfoArray;
if (auto EC = Reader.readArray(ModInfoArray, ModInfoSubstream.getLength()))
@@ -286,7 +284,7 @@ Error DbiStream::initializeSectionHeadersData() {
"Corrupted section header stream.");
size_t NumSections = StreamLen / sizeof(object::coff_section);
- msf::StreamReader Reader(*SHS);
+ BinaryStreamReader Reader(*SHS);
if (auto EC = Reader.readArray(SectionHeaders, NumSections))
return make_error<RawError>(raw_error_code::corrupt_file,
"Could not read a bitmap.");
@@ -318,7 +316,7 @@ Error DbiStream::initializeFpoRecords() {
"Corrupted New FPO stream.");
size_t NumRecords = StreamLen / sizeof(object::FpoData);
- msf::StreamReader Reader(*FS);
+ BinaryStreamReader Reader(*FS);
if (auto EC = Reader.readArray(FpoRecords, NumRecords))
return make_error<RawError>(raw_error_code::corrupt_file,
"Corrupted New FPO stream.");
@@ -330,7 +328,7 @@ Error DbiStream::initializeSectionMapData() {
if (SecMapSubstream.getLength() == 0)
return Error::success();
- StreamReader SMReader(SecMapSubstream);
+ BinaryStreamReader SMReader(SecMapSubstream);
const SecMapHeader *Header;
if (auto EC = SMReader.readObject(Header))
return EC;
@@ -344,7 +342,7 @@ Error DbiStream::initializeFileInfo() {
return Error::success();
const FileInfoSubstreamHeader *FH;
- StreamReader FISR(FileInfoSubstream);
+ BinaryStreamReader FISR(FileInfoSubstream);
if (auto EC = FISR.readObject(FH))
return EC;
@@ -413,14 +411,14 @@ uint32_t DbiStream::getDebugStreamIndex(DbgHeaderType Type) const {
}
Expected<StringRef> DbiStream::getFileNameForIndex(uint32_t Index) const {
- StreamReader Names(NamesBuffer);
+ BinaryStreamReader Names(NamesBuffer);
if (Index >= FileNameOffsets.size())
return make_error<RawError>(raw_error_code::index_out_of_bounds);
uint32_t FileOffset = FileNameOffsets[Index];
Names.setOffset(FileOffset);
StringRef Name;
- if (auto EC = Names.readZeroString(Name))
+ if (auto EC = Names.readCString(Name))
return std::move(EC);
return Name;
}
diff --git a/lib/DebugInfo/PDB/Raw/DbiStreamBuilder.cpp b/lib/DebugInfo/PDB/Native/DbiStreamBuilder.cpp
index 1d5b8d693b1e..a203aea60fe7 100644
--- a/lib/DebugInfo/PDB/Raw/DbiStreamBuilder.cpp
+++ b/lib/DebugInfo/PDB/Native/DbiStreamBuilder.cpp
@@ -7,15 +7,16 @@
//
//===----------------------------------------------------------------------===//
-#include "llvm/DebugInfo/PDB/Raw/DbiStreamBuilder.h"
+#include "llvm/DebugInfo/PDB/Native/DbiStreamBuilder.h"
#include "llvm/ADT/ArrayRef.h"
#include "llvm/DebugInfo/MSF/MSFBuilder.h"
#include "llvm/DebugInfo/MSF/MappedBlockStream.h"
-#include "llvm/DebugInfo/MSF/StreamWriter.h"
-#include "llvm/DebugInfo/PDB/Raw/DbiStream.h"
-#include "llvm/DebugInfo/PDB/Raw/RawError.h"
+#include "llvm/DebugInfo/PDB/Native/DbiStream.h"
+#include "llvm/DebugInfo/PDB/Native/ModInfoBuilder.h"
+#include "llvm/DebugInfo/PDB/Native/RawError.h"
#include "llvm/Object/COFF.h"
+#include "llvm/Support/BinaryStreamWriter.h"
#include "llvm/Support/COFF.h"
using namespace llvm;
@@ -23,15 +24,13 @@ using namespace llvm::codeview;
using namespace llvm::msf;
using namespace llvm::pdb;
-namespace {
-class ModiSubstreamBuilder {};
-}
-
DbiStreamBuilder::DbiStreamBuilder(msf::MSFBuilder &Msf)
: Msf(Msf), Allocator(Msf.getAllocator()), Age(1), BuildNumber(0),
PdbDllVersion(0), PdbDllRbld(0), Flags(0), MachineType(PDB_Machine::x86),
Header(nullptr), DbgStreams((int)DbgHeaderType::Max) {}
+DbiStreamBuilder::~DbiStreamBuilder() = default;
+
void DbiStreamBuilder::setVersionHeader(PdbRaw_DbiVer V) { VerHeader = V; }
void DbiStreamBuilder::setAge(uint32_t A) { Age = A; }
@@ -75,39 +74,37 @@ uint32_t DbiStreamBuilder::calculateSerializedLength() const {
calculateSectionMapStreamSize() + calculateDbgStreamsSize();
}
-Error DbiStreamBuilder::addModuleInfo(StringRef ObjFile, StringRef Module) {
- auto Entry = llvm::make_unique<ModuleInfo>();
- ModuleInfo *M = Entry.get();
- Entry->Mod = Module;
- Entry->Obj = ObjFile;
- auto Result = ModuleInfos.insert(std::make_pair(Module, std::move(Entry)));
+Expected<ModInfoBuilder &>
+DbiStreamBuilder::addModuleInfo(StringRef ModuleName) {
+ uint32_t Index = ModiList.size();
+ auto MIB = llvm::make_unique<ModInfoBuilder>(ModuleName, Index, Msf);
+ auto M = MIB.get();
+ auto Result = ModiMap.insert(std::make_pair(ModuleName, std::move(MIB)));
+
if (!Result.second)
return make_error<RawError>(raw_error_code::duplicate_entry,
"The specified module already exists");
- ModuleInfoList.push_back(M);
- return Error::success();
+ ModiList.push_back(M);
+ return *M;
}
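// Hypothetical call site for the new interface (the builder variable and the
// file names are invented for illustration): instead of passing two strings
// up front, the caller now receives a ModInfoBuilder and attaches the object
// file name and source files to it directly.
Expected<ModInfoBuilder &> ExpectedMod = DbiBuilder.addModuleInfo("a.obj");
if (!ExpectedMod)
  return ExpectedMod.takeError();
ModInfoBuilder &Mod = *ExpectedMod;
Mod.setObjFileName("a.obj");
Mod.addSourceFile("/src/a.cpp");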
Error DbiStreamBuilder::addModuleSourceFile(StringRef Module, StringRef File) {
- auto ModIter = ModuleInfos.find(Module);
- if (ModIter == ModuleInfos.end())
+ auto ModIter = ModiMap.find(Module);
+ if (ModIter == ModiMap.end())
return make_error<RawError>(raw_error_code::no_entry,
"The specified module was not found");
uint32_t Index = SourceFileNames.size();
SourceFileNames.insert(std::make_pair(File, Index));
auto &ModEntry = *ModIter;
- ModEntry.second->SourceFiles.push_back(File);
+ ModEntry.second->addSourceFile(File);
return Error::success();
}
uint32_t DbiStreamBuilder::calculateModiSubstreamSize() const {
uint32_t Size = 0;
- for (const auto &M : ModuleInfoList) {
- Size += sizeof(ModuleInfoHeader);
- Size += M->Mod.size() + 1;
- Size += M->Obj.size() + 1;
- }
- return alignTo(Size, sizeof(uint32_t));
+ for (const auto &M : ModiList)
+ Size += M->calculateSerializedLength();
+ return Size;
}
uint32_t DbiStreamBuilder::calculateSectionContribsStreamSize() const {
@@ -127,11 +124,11 @@ uint32_t DbiStreamBuilder::calculateFileInfoSubstreamSize() const {
uint32_t Size = 0;
Size += sizeof(ulittle16_t); // NumModules
Size += sizeof(ulittle16_t); // NumSourceFiles
- Size += ModuleInfoList.size() * sizeof(ulittle16_t); // ModIndices
- Size += ModuleInfoList.size() * sizeof(ulittle16_t); // ModFileCounts
+ Size += ModiList.size() * sizeof(ulittle16_t); // ModIndices
+ Size += ModiList.size() * sizeof(ulittle16_t); // ModFileCounts
uint32_t NumFileInfos = 0;
- for (const auto &M : ModuleInfoList)
- NumFileInfos += M->SourceFiles.size();
+ for (const auto &M : ModiList)
+ NumFileInfos += M->source_files().size();
Size += NumFileInfos * sizeof(ulittle32_t); // FileNameOffsets
Size += calculateNamesBufferSize();
return alignTo(Size, sizeof(uint32_t));
@@ -149,43 +146,20 @@ uint32_t DbiStreamBuilder::calculateDbgStreamsSize() const {
return DbgStreams.size() * sizeof(uint16_t);
}
-Error DbiStreamBuilder::generateModiSubstream() {
- uint32_t Size = calculateModiSubstreamSize();
- auto Data = Allocator.Allocate<uint8_t>(Size);
-
- ModInfoBuffer = MutableByteStream(MutableArrayRef<uint8_t>(Data, Size));
-
- StreamWriter ModiWriter(ModInfoBuffer);
- for (const auto &M : ModuleInfoList) {
- ModuleInfoHeader Layout = {};
- Layout.ModDiStream = kInvalidStreamIndex;
- Layout.NumFiles = M->SourceFiles.size();
- if (auto EC = ModiWriter.writeObject(Layout))
- return EC;
- if (auto EC = ModiWriter.writeZeroString(M->Mod))
- return EC;
- if (auto EC = ModiWriter.writeZeroString(M->Obj))
- return EC;
- }
- if (ModiWriter.bytesRemaining() > sizeof(uint32_t))
- return make_error<RawError>(raw_error_code::invalid_format,
- "Unexpected bytes in Modi Stream Data");
- return Error::success();
-}
-
Error DbiStreamBuilder::generateFileInfoSubstream() {
uint32_t Size = calculateFileInfoSubstreamSize();
uint32_t NameSize = calculateNamesBufferSize();
auto Data = Allocator.Allocate<uint8_t>(Size);
uint32_t NamesOffset = Size - NameSize;
- FileInfoBuffer = MutableByteStream(MutableArrayRef<uint8_t>(Data, Size));
+ FileInfoBuffer = MutableBinaryByteStream(MutableArrayRef<uint8_t>(Data, Size),
+ llvm::support::little);
- WritableStreamRef MetadataBuffer =
- WritableStreamRef(FileInfoBuffer).keep_front(NamesOffset);
- StreamWriter MetadataWriter(MetadataBuffer);
+ WritableBinaryStreamRef MetadataBuffer =
+ WritableBinaryStreamRef(FileInfoBuffer).keep_front(NamesOffset);
+ BinaryStreamWriter MetadataWriter(MetadataBuffer);
- uint16_t ModiCount = std::min<uint32_t>(UINT16_MAX, ModuleInfos.size());
+ uint16_t ModiCount = std::min<uint32_t>(UINT16_MAX, ModiList.size());
uint16_t FileCount = std::min<uint32_t>(UINT16_MAX, SourceFileNames.size());
if (auto EC = MetadataWriter.writeInteger(ModiCount)) // NumModules
return EC;
@@ -195,8 +169,8 @@ Error DbiStreamBuilder::generateFileInfoSubstream() {
if (auto EC = MetadataWriter.writeInteger(I)) // Mod Indices
return EC;
}
- for (const auto MI : ModuleInfoList) {
- FileCount = static_cast<uint16_t>(MI->SourceFiles.size());
+ for (const auto &MI : ModiList) {
+ FileCount = static_cast<uint16_t>(MI->source_files().size());
if (auto EC = MetadataWriter.writeInteger(FileCount)) // Mod File Counts
return EC;
}
@@ -205,16 +179,16 @@ Error DbiStreamBuilder::generateFileInfoSubstream() {
// A side effect of this is that this will actually compute the various
// file name offsets, so we can then go back and write the FileNameOffsets
// array to the other substream.
- NamesBuffer = WritableStreamRef(FileInfoBuffer).drop_front(NamesOffset);
- StreamWriter NameBufferWriter(NamesBuffer);
+ NamesBuffer = WritableBinaryStreamRef(FileInfoBuffer).drop_front(NamesOffset);
+ BinaryStreamWriter NameBufferWriter(NamesBuffer);
for (auto &Name : SourceFileNames) {
Name.second = NameBufferWriter.getOffset();
- if (auto EC = NameBufferWriter.writeZeroString(Name.getKey()))
+ if (auto EC = NameBufferWriter.writeCString(Name.getKey()))
return EC;
}
- for (const auto MI : ModuleInfoList) {
- for (StringRef Name : MI->SourceFiles) {
+ for (const auto &MI : ModiList) {
+ for (StringRef Name : MI->source_files()) {
auto Result = SourceFileNames.find(Name);
if (Result == SourceFileNames.end())
return make_error<RawError>(raw_error_code::no_entry,
@@ -240,13 +214,13 @@ Error DbiStreamBuilder::finalize() {
if (Header)
return Error::success();
- DbiStreamHeader *H = Allocator.Allocate<DbiStreamHeader>();
+ for (auto &MI : ModiList)
+ MI->finalize();
- if (auto EC = generateModiSubstream())
- return EC;
if (auto EC = generateFileInfoSubstream())
return EC;
+ DbiStreamHeader *H = Allocator.Allocate<DbiStreamHeader>();
H->VersionHeader = *VerHeader;
H->VersionSignature = -1;
H->Age = Age;
@@ -258,7 +232,7 @@ Error DbiStreamBuilder::finalize() {
H->ECSubstreamSize = 0;
H->FileInfoSize = FileInfoBuffer.getLength();
- H->ModiSubstreamSize = ModInfoBuffer.getLength();
+ H->ModiSubstreamSize = calculateModiSubstreamSize();
H->OptionalDbgHdrSize = DbgStreams.size() * sizeof(uint16_t);
H->SecContrSubstreamSize = calculateSectionContribsStreamSize();
H->SectionMapSize = calculateSectionMapStreamSize();
@@ -273,6 +247,11 @@ Error DbiStreamBuilder::finalize() {
}
Error DbiStreamBuilder::finalizeMsfLayout() {
+ for (auto &MI : ModiList) {
+ if (auto EC = MI->finalizeMsfLayout())
+ return EC;
+ }
+
uint32_t Length = calculateSerializedLength();
if (auto EC = Msf.setStreamSize(StreamDBI, Length))
return EC;
@@ -358,19 +337,21 @@ std::vector<SecMapEntry> DbiStreamBuilder::createSectionMap(
}
Error DbiStreamBuilder::commit(const msf::MSFLayout &Layout,
- const msf::WritableStream &Buffer) {
+ WritableBinaryStreamRef MsfBuffer) {
if (auto EC = finalize())
return EC;
- auto InfoS =
- WritableMappedBlockStream::createIndexedStream(Layout, Buffer, StreamDBI);
+ auto DbiS = WritableMappedBlockStream::createIndexedStream(Layout, MsfBuffer,
+ StreamDBI);
- StreamWriter Writer(*InfoS);
+ BinaryStreamWriter Writer(*DbiS);
if (auto EC = Writer.writeObject(*Header))
return EC;
- if (auto EC = Writer.writeStreamRef(ModInfoBuffer))
- return EC;
+ for (auto &M : ModiList) {
+ if (auto EC = M->commit(Writer, Layout, MsfBuffer))
+ return EC;
+ }
if (!SectionContribs.empty()) {
if (auto EC = Writer.writeEnum(DbiSecContribVer60))
@@ -399,8 +380,8 @@ Error DbiStreamBuilder::commit(const msf::MSFLayout &Layout,
if (Stream.StreamNumber == kInvalidStreamIndex)
continue;
auto WritableStream = WritableMappedBlockStream::createIndexedStream(
- Layout, Buffer, Stream.StreamNumber);
- StreamWriter DbgStreamWriter(*WritableStream);
+ Layout, MsfBuffer, Stream.StreamNumber);
+ BinaryStreamWriter DbgStreamWriter(*WritableStream);
if (auto EC = DbgStreamWriter.writeArray(Stream.Data))
return EC;
}
diff --git a/lib/DebugInfo/PDB/Raw/EnumTables.cpp b/lib/DebugInfo/PDB/Native/EnumTables.cpp
index fc9270c69947..b3837dc72e5b 100644
--- a/lib/DebugInfo/PDB/Raw/EnumTables.cpp
+++ b/lib/DebugInfo/PDB/Native/EnumTables.cpp
@@ -7,8 +7,8 @@
//
//===----------------------------------------------------------------------===//
-#include "llvm/DebugInfo/PDB/Raw/EnumTables.h"
-#include "llvm/DebugInfo/PDB/Raw/RawConstants.h"
+#include "llvm/DebugInfo/PDB/Native/EnumTables.h"
+#include "llvm/DebugInfo/PDB/Native/RawConstants.h"
using namespace llvm;
using namespace llvm::pdb;
diff --git a/lib/DebugInfo/PDB/Raw/GSI.cpp b/lib/DebugInfo/PDB/Native/GSI.cpp
index 6ecbb5c8cfad..b219fe275f73 100644
--- a/lib/DebugInfo/PDB/Raw/GSI.cpp
+++ b/lib/DebugInfo/PDB/Native/GSI.cpp
@@ -9,10 +9,10 @@
#include "GSI.h"
-#include "llvm/DebugInfo/MSF/StreamArray.h"
-#include "llvm/DebugInfo/MSF/StreamReader.h"
-#include "llvm/DebugInfo/PDB/Raw/RawError.h"
-#include "llvm/DebugInfo/PDB/Raw/RawTypes.h"
+#include "llvm/DebugInfo/PDB/Native/RawError.h"
+#include "llvm/DebugInfo/PDB/Native/RawTypes.h"
+#include "llvm/Support/BinaryStreamArray.h"
+#include "llvm/Support/BinaryStreamReader.h"
#include "llvm/Support/Error.h"
@@ -28,9 +28,9 @@ static Error checkHashHdrVersion(const GSIHashHeader *HashHdr) {
return Error::success();
}
-Error readGSIHashBuckets(
- msf::FixedStreamArray<support::ulittle32_t> &HashBuckets,
- const GSIHashHeader *HashHdr, msf::StreamReader &Reader) {
+Error readGSIHashBuckets(FixedStreamArray<support::ulittle32_t> &HashBuckets,
+ const GSIHashHeader *HashHdr,
+ BinaryStreamReader &Reader) {
if (auto EC = checkHashHdrVersion(HashHdr))
return EC;
@@ -57,7 +57,7 @@ Error readGSIHashBuckets(
}
Error readGSIHashHeader(const GSIHashHeader *&HashHdr,
- msf::StreamReader &Reader) {
+ BinaryStreamReader &Reader) {
if (Reader.readObject(HashHdr))
return make_error<RawError>(raw_error_code::corrupt_file,
"Stream does not contain a GSIHashHeader.");
@@ -70,9 +70,9 @@ Error readGSIHashHeader(const GSIHashHeader *&HashHdr,
return Error::success();
}
-Error readGSIHashRecords(msf::FixedStreamArray<PSHashRecord> &HashRecords,
+Error readGSIHashRecords(FixedStreamArray<PSHashRecord> &HashRecords,
const GSIHashHeader *HashHdr,
- msf::StreamReader &Reader) {
+ BinaryStreamReader &Reader) {
if (auto EC = checkHashHdrVersion(HashHdr))
return EC;
diff --git a/lib/DebugInfo/PDB/Raw/GSI.h b/lib/DebugInfo/PDB/Native/GSI.h
index 82cebd946538..9e63bc83548f 100644
--- a/lib/DebugInfo/PDB/Raw/GSI.h
+++ b/lib/DebugInfo/PDB/Native/GSI.h
@@ -25,17 +25,15 @@
#ifndef LLVM_LIB_DEBUGINFO_PDB_RAW_GSI_H
#define LLVM_LIB_DEBUGINFO_PDB_RAW_GSI_H
-#include "llvm/DebugInfo/MSF/StreamArray.h"
-#include "llvm/DebugInfo/PDB/Raw/RawTypes.h"
+#include "llvm/DebugInfo/PDB/Native/RawTypes.h"
+#include "llvm/Support/BinaryStreamArray.h"
#include "llvm/Support/Endian.h"
#include "llvm/Support/Error.h"
namespace llvm {
-namespace msf {
-class StreamReader;
-}
+class BinaryStreamReader;
namespace pdb {
@@ -56,14 +54,14 @@ struct GSIHashHeader {
support::ulittle32_t NumBuckets;
};
-Error readGSIHashBuckets(
- msf::FixedStreamArray<support::ulittle32_t> &HashBuckets,
- const GSIHashHeader *HashHdr, msf::StreamReader &Reader);
+Error readGSIHashBuckets(FixedStreamArray<support::ulittle32_t> &HashBuckets,
+ const GSIHashHeader *HashHdr,
+ BinaryStreamReader &Reader);
Error readGSIHashHeader(const GSIHashHeader *&HashHdr,
- msf::StreamReader &Reader);
-Error readGSIHashRecords(msf::FixedStreamArray<PSHashRecord> &HashRecords,
+ BinaryStreamReader &Reader);
+Error readGSIHashRecords(FixedStreamArray<PSHashRecord> &HashRecords,
const GSIHashHeader *HashHdr,
- msf::StreamReader &Reader);
+ BinaryStreamReader &Reader);
}
}
diff --git a/lib/DebugInfo/PDB/Raw/GlobalsStream.cpp b/lib/DebugInfo/PDB/Native/GlobalsStream.cpp
index 31afc9200b1b..a2ee0f047c58 100644
--- a/lib/DebugInfo/PDB/Raw/GlobalsStream.cpp
+++ b/lib/DebugInfo/PDB/Native/GlobalsStream.cpp
@@ -7,9 +7,9 @@
//
//===----------------------------------------------------------------------===//
+#include "llvm/DebugInfo/PDB/Native/GlobalsStream.h"
#include "GSI.h"
-#include "llvm/DebugInfo/MSF/StreamReader.h"
-#include "llvm/DebugInfo/PDB/Raw/GlobalsStream.h"
+#include "llvm/Support/BinaryStreamReader.h"
#include "llvm/Support/Error.h"
#include <algorithm>
@@ -23,7 +23,7 @@ GlobalsStream::GlobalsStream(std::unique_ptr<MappedBlockStream> Stream)
GlobalsStream::~GlobalsStream() = default;
Error GlobalsStream::reload() {
- StreamReader Reader(*Stream);
+ BinaryStreamReader Reader(*Stream);
const GSIHashHeader *HashHdr;
if (auto EC = readGSIHashHeader(HashHdr, Reader))
diff --git a/lib/DebugInfo/PDB/Raw/Hash.cpp b/lib/DebugInfo/PDB/Native/Hash.cpp
index b9f685ec69d4..2ad3f55dc5c3 100644
--- a/lib/DebugInfo/PDB/Raw/Hash.cpp
+++ b/lib/DebugInfo/PDB/Native/Hash.cpp
@@ -7,7 +7,7 @@
//
//===----------------------------------------------------------------------===//
-#include "llvm/DebugInfo/PDB/Raw/Hash.h"
+#include "llvm/DebugInfo/PDB/Native/Hash.h"
#include "llvm/ADT/ArrayRef.h"
#include "llvm/Support/Endian.h"
diff --git a/lib/DebugInfo/PDB/Native/HashTable.cpp b/lib/DebugInfo/PDB/Native/HashTable.cpp
new file mode 100644
index 000000000000..ebf8c9c04db1
--- /dev/null
+++ b/lib/DebugInfo/PDB/Native/HashTable.cpp
@@ -0,0 +1,302 @@
+//===- HashTable.cpp - PDB Hash Table ---------------------------*- C++ -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/DebugInfo/PDB/Native/HashTable.h"
+
+#include "llvm/ADT/Optional.h"
+#include "llvm/ADT/SparseBitVector.h"
+#include "llvm/DebugInfo/PDB/Native/RawError.h"
+
+#include <assert.h>
+
+using namespace llvm;
+using namespace llvm::pdb;
+
+HashTable::HashTable() : HashTable(8) {}
+
+HashTable::HashTable(uint32_t Capacity) { Buckets.resize(Capacity); }
+
+Error HashTable::load(BinaryStreamReader &Stream) {
+ const Header *H;
+ if (auto EC = Stream.readObject(H))
+ return EC;
+ if (H->Capacity == 0)
+ return make_error<RawError>(raw_error_code::corrupt_file,
+ "Invalid Hash Table Capacity");
+ if (H->Size > maxLoad(H->Capacity))
+ return make_error<RawError>(raw_error_code::corrupt_file,
+ "Invalid Hash Table Size");
+
+ Buckets.resize(H->Capacity);
+
+ if (auto EC = readSparseBitVector(Stream, Present))
+ return EC;
+ if (Present.count() != H->Size)
+ return make_error<RawError>(raw_error_code::corrupt_file,
+ "Present bit vector does not match size!");
+
+ if (auto EC = readSparseBitVector(Stream, Deleted))
+ return EC;
+ if (Present.intersects(Deleted))
+ return make_error<RawError>(raw_error_code::corrupt_file,
+ "Present bit vector interesects deleted!");
+
+ for (uint32_t P : Present) {
+ if (auto EC = Stream.readInteger(Buckets[P].first))
+ return EC;
+ if (auto EC = Stream.readInteger(Buckets[P].second))
+ return EC;
+ }
+
+ return Error::success();
+}
+
+uint32_t HashTable::calculateSerializedLength() const {
+ uint32_t Size = sizeof(Header);
+
+ int NumBitsP = Present.find_last() + 1;
+ int NumBitsD = Deleted.find_last() + 1;
+
+ // Number of words in the Present bit set, followed by that many actual words.
+ Size += sizeof(uint32_t);
+ Size += alignTo(NumBitsP, sizeof(uint32_t));
+
+ // Number of words in the Deleted bit set, followed by that many actual words.
+ Size += sizeof(uint32_t);
+ Size += alignTo(NumBitsD, sizeof(uint32_t));
+
+ // One (Key, Value) pair for each present entry.
+ Size += 2 * sizeof(uint32_t) * size();
+
+ return Size;
+}
+
+Error HashTable::commit(BinaryStreamWriter &Writer) const {
+ Header H;
+ H.Size = size();
+ H.Capacity = capacity();
+ if (auto EC = Writer.writeObject(H))
+ return EC;
+
+ if (auto EC = writeSparseBitVector(Writer, Present))
+ return EC;
+
+ if (auto EC = writeSparseBitVector(Writer, Deleted))
+ return EC;
+
+ for (const auto &Entry : *this) {
+ if (auto EC = Writer.writeInteger(Entry.first))
+ return EC;
+ if (auto EC = Writer.writeInteger(Entry.second))
+ return EC;
+ }
+ return Error::success();
+}
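// Round-trip sketch of the format load()/commit() implement, written as the
// body of some Error-returning test function and assuming the byte-stream
// helpers from llvm/Support/BinaryByteStream.h:
HashTable Table;
Table.set(42, 7);
std::vector<uint8_t> Data(Table.calculateSerializedLength());
MutableBinaryByteStream OutStream(Data, llvm::support::little);
BinaryStreamWriter Writer(OutStream);
if (auto EC = Table.commit(Writer))
  return EC;
HashTable Loaded;
BinaryByteStream InStream(Data, llvm::support::little);
BinaryStreamReader Reader(InStream);
if (auto EC = Loaded.load(Reader))
  return EC;
assert(Loaded.get(42) == 7); // header, bit vectors, and pairs all round-trip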
+
+void HashTable::clear() {
+ Buckets.resize(8);
+ Present.clear();
+ Deleted.clear();
+}
+
+uint32_t HashTable::capacity() const { return Buckets.size(); }
+uint32_t HashTable::size() const { return Present.count(); }
+
+HashTableIterator HashTable::begin() const { return HashTableIterator(*this); }
+HashTableIterator HashTable::end() const {
+ return HashTableIterator(*this, 0, true);
+}
+
+HashTableIterator HashTable::find(uint32_t K) {
+ uint32_t H = K % capacity();
+ uint32_t I = H;
+ Optional<uint32_t> FirstUnused;
+ do {
+ if (isPresent(I)) {
+ if (Buckets[I].first == K)
+ return HashTableIterator(*this, I, false);
+ } else {
+ if (!FirstUnused)
+ FirstUnused = I;
+ // Insertion occurs via linear probing from the slot hint, and will be
+ // inserted at the first empty / deleted location. Therefore, if we are
+ // probing and find a location that is neither present nor deleted, then
+ // nothing must have EVER been inserted at this location, and thus it is
+ // not possible for a matching value to occur later.
+ if (!isDeleted(I))
+ break;
+ }
+ I = (I + 1) % capacity();
+ } while (I != H);
+
+ // The only way FirstUnused would not be set is if every single entry in the
+ // table were Present. But this would violate the load factor constraints
+ // that we impose, so it should never happen.
+ assert(FirstUnused);
+ return HashTableIterator(*this, *FirstUnused, true);
+}
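// Tombstone behavior in practice (capacity 8, so keys 3 and 11 share the
// same home slot): a Deleted slot does not terminate the probe, but a
// never-used slot does.
HashTable T(8);
T.set(3, 100);  // lands in bucket 3
T.set(11, 200); // 11 % 8 == 3 is taken, probes forward to bucket 4
T.remove(3);    // bucket 3 becomes Deleted, not empty
assert(T.get(11) == 200); // find(11) probes through the tombstone at 3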
+
+void HashTable::set(uint32_t K, uint32_t V) {
+ auto Entry = find(K);
+ if (Entry != end()) {
+ assert(isPresent(Entry.index()));
+ assert(Buckets[Entry.index()].first == K);
+ // We're updating, no need to do anything special.
+ Buckets[Entry.index()].second = V;
+ return;
+ }
+
+ auto &B = Buckets[Entry.index()];
+ assert(!isPresent(Entry.index()));
+ assert(Entry.isEnd());
+ B.first = K;
+ B.second = V;
+ Present.set(Entry.index());
+ Deleted.reset(Entry.index());
+
+ grow();
+
+ assert(find(K) != end());
+}
+
+void HashTable::remove(uint32_t K) {
+ auto Iter = find(K);
+ // It wasn't here to begin with, just exit.
+ if (Iter == end())
+ return;
+
+ assert(Present.test(Iter.index()));
+ assert(!Deleted.test(Iter.index()));
+ Deleted.set(Iter.index());
+ Present.reset(Iter.index());
+}
+
+uint32_t HashTable::get(uint32_t K) {
+ auto I = find(K);
+ assert(I != end());
+ return (*I).second;
+}
+
+uint32_t HashTable::maxLoad(uint32_t capacity) { return capacity * 2 / 3 + 1; }
+
+void HashTable::grow() {
+ uint32_t S = size();
+ if (S < maxLoad(capacity()))
+ return;
+ assert(capacity() != UINT32_MAX && "Can't grow Hash table!");
+
+ uint32_t NewCapacity =
+ (capacity() <= INT32_MAX) ? capacity() * 2 : UINT32_MAX;
+
+ // Growing requires rebuilding the table and re-hashing every item. Make a
+ // copy with a larger capacity, insert everything into the copy, then swap
+ // it in.
+ HashTable NewMap(NewCapacity);
+ for (auto I : Present) {
+ NewMap.set(Buckets[I].first, Buckets[I].second);
+ }
+
+ Buckets.swap(NewMap.Buckets);
+ std::swap(Present, NewMap.Present);
+ std::swap(Deleted, NewMap.Deleted);
+ assert(capacity() == NewCapacity);
+ assert(size() == S);
+}
+
+Error HashTable::readSparseBitVector(BinaryStreamReader &Stream,
+ SparseBitVector<> &V) {
+ uint32_t NumWords;
+ if (auto EC = Stream.readInteger(NumWords))
+ return joinErrors(
+ std::move(EC),
+ make_error<RawError>(raw_error_code::corrupt_file,
+ "Expected hash table number of words"));
+
+ for (uint32_t I = 0; I != NumWords; ++I) {
+ uint32_t Word;
+ if (auto EC = Stream.readInteger(Word))
+ return joinErrors(std::move(EC),
+ make_error<RawError>(raw_error_code::corrupt_file,
+ "Expected hash table word"));
+ for (unsigned Idx = 0; Idx < 32; ++Idx)
+ if (Word & (1U << Idx))
+ V.set((I * 32) + Idx);
+ }
+ return Error::success();
+}
+
+Error HashTable::writeSparseBitVector(BinaryStreamWriter &Writer,
+ SparseBitVector<> &Vec) {
+ int ReqBits = Vec.find_last() + 1;
+ uint32_t NumWords = alignTo(ReqBits, sizeof(uint32_t)) / sizeof(uint32_t);
+ if (auto EC = Writer.writeInteger(NumWords))
+ return joinErrors(
+ std::move(EC),
+ make_error<RawError>(raw_error_code::corrupt_file,
+ "Could not write linear map number of words"));
+
+ uint32_t Idx = 0;
+ for (uint32_t I = 0; I != NumWords; ++I) {
+ uint32_t Word = 0;
+ for (uint32_t WordIdx = 0; WordIdx < 32; ++WordIdx, ++Idx) {
+ if (Vec.test(Idx))
+ Word |= (1 << WordIdx);
+ }
+ if (auto EC = Writer.writeInteger(Word))
+ return joinErrors(std::move(EC), make_error<RawError>(
+ raw_error_code::corrupt_file,
+ "Could not write linear map word"));
+ }
+ return Error::success();
+}
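// A worked example of the word layout above, assuming
// llvm/ADT/SparseBitVector.h:
SparseBitVector<> V;
V.set(0);
V.set(33);
// find_last() == 33, so ReqBits == 34 and NumWords == alignTo(34, 4) / 4 == 9.
// Bit 0 lands in word 0 (0x1); bit 33 lands in word 1 at position 1 (0x2);
// the remaining seven words are written as zero. readSparseBitVector above
// reverses this exactly, so reader and writer stay in sync.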
+
+HashTableIterator::HashTableIterator(const HashTable &Map, uint32_t Index,
+ bool IsEnd)
+ : Map(&Map), Index(Index), IsEnd(IsEnd) {}
+
+HashTableIterator::HashTableIterator(const HashTable &Map) : Map(&Map) {
+ int I = Map.Present.find_first();
+ if (I == -1) {
+ Index = 0;
+ IsEnd = true;
+ } else {
+ Index = static_cast<uint32_t>(I);
+ IsEnd = false;
+ }
+}
+
+HashTableIterator &HashTableIterator::operator=(const HashTableIterator &R) {
+ Map = R.Map;
+ Index = R.Index;
+ IsEnd = R.IsEnd;
+ return *this;
+}
+
+bool HashTableIterator::operator==(const HashTableIterator &R) const {
+ if (IsEnd && R.IsEnd)
+ return true;
+ if (IsEnd != R.IsEnd)
+ return false;
+
+ return (Map == R.Map) && (Index == R.Index);
+}
+
+const std::pair<uint32_t, uint32_t> &HashTableIterator::operator*() const {
+ assert(Map->Present.test(Index));
+ return Map->Buckets[Index];
+}
+
+HashTableIterator &HashTableIterator::operator++() {
+ while (Index < Map->Buckets.size()) {
+ ++Index;
+ if (Map->Present.test(Index))
+ return *this;
+ }
+
+ IsEnd = true;
+ return *this;
+}
diff --git a/lib/DebugInfo/PDB/Native/InfoStream.cpp b/lib/DebugInfo/PDB/Native/InfoStream.cpp
new file mode 100644
index 000000000000..2a1d12e82390
--- /dev/null
+++ b/lib/DebugInfo/PDB/Native/InfoStream.cpp
@@ -0,0 +1,126 @@
+//===- InfoStream.cpp - PDB Info Stream (Stream 1) Access -------*- C++ -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/DebugInfo/PDB/Native/InfoStream.h"
+#include "llvm/ADT/BitVector.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/DebugInfo/PDB/Native/PDBFile.h"
+#include "llvm/DebugInfo/PDB/Native/RawConstants.h"
+#include "llvm/DebugInfo/PDB/Native/RawError.h"
+#include "llvm/DebugInfo/PDB/Native/RawTypes.h"
+#include "llvm/Support/BinaryStreamReader.h"
+#include "llvm/Support/BinaryStreamWriter.h"
+
+using namespace llvm;
+using namespace llvm::codeview;
+using namespace llvm::msf;
+using namespace llvm::pdb;
+
+InfoStream::InfoStream(std::unique_ptr<MappedBlockStream> Stream)
+ : Stream(std::move(Stream)) {}
+
+Error InfoStream::reload() {
+ BinaryStreamReader Reader(*Stream);
+
+ const InfoStreamHeader *H;
+ if (auto EC = Reader.readObject(H))
+ return joinErrors(
+ std::move(EC),
+ make_error<RawError>(raw_error_code::corrupt_file,
+ "PDB Stream does not contain a header."));
+
+ switch (H->Version) {
+ case PdbImplVC70:
+ case PdbImplVC80:
+ case PdbImplVC110:
+ case PdbImplVC140:
+ break;
+ default:
+ return make_error<RawError>(raw_error_code::corrupt_file,
+ "Unsupported PDB stream version.");
+ }
+
+ Version = H->Version;
+ Signature = H->Signature;
+ Age = H->Age;
+ Guid = H->Guid;
+
+ uint32_t Offset = Reader.getOffset();
+ if (auto EC = NamedStreams.load(Reader))
+ return EC;
+ uint32_t NewOffset = Reader.getOffset();
+ NamedStreamMapByteSize = NewOffset - Offset;
+
+ bool Stop = false;
+ while (!Stop && !Reader.empty()) {
+ PdbRaw_FeatureSig Sig;
+ if (auto EC = Reader.readEnum(Sig))
+ return EC;
+ // Since this value comes from a file, it's possible it holds some strange
+ // value that doesn't correspond to any known feature signature. We don't
+ // want to warn on
+ // -Wcovered-switch-default in this case, so switch on the integral value
+ // instead of the enumeration value.
+ switch (uint32_t(Sig)) {
+ case uint32_t(PdbRaw_FeatureSig::VC110):
+ // No other flags for VC110 PDB.
+ Stop = true;
+ LLVM_FALLTHROUGH;
+ case uint32_t(PdbRaw_FeatureSig::VC140):
+ Features |= PdbFeatureContainsIdStream;
+ break;
+ case uint32_t(PdbRaw_FeatureSig::NoTypeMerge):
+ Features |= PdbFeatureNoTypeMerging;
+ break;
+ case uint32_t(PdbRaw_FeatureSig::MinimalDebugInfo):
+ Features |= PdbFeatureMinimalDebugInfo;
+ break;
+ default:
+ continue;
+ }
+ FeatureSignatures.push_back(Sig);
+ }
+ return Error::success();
+}
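// A minimal consumer sketch for the flag mask assembled above (assuming the
// PdbRaw_Features values are single-bit flags, as the |= accumulation
// implies):
static bool containsIdStream(const InfoStream &Info) {
  return (Info.getFeatures() & PdbFeatureContainsIdStream) != 0;
}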
+
+uint32_t InfoStream::getStreamSize() const { return Stream->getLength(); }
+
+uint32_t InfoStream::getNamedStreamIndex(llvm::StringRef Name) const {
+ uint32_t Result;
+ if (!NamedStreams.get(Name, Result))
+ return 0;
+ return Result;
+}
+
+iterator_range<StringMapConstIterator<uint32_t>>
+InfoStream::named_streams() const {
+ return NamedStreams.entries();
+}
+
+PdbRaw_ImplVer InfoStream::getVersion() const {
+ return static_cast<PdbRaw_ImplVer>(Version);
+}
+
+uint32_t InfoStream::getSignature() const { return Signature; }
+
+uint32_t InfoStream::getAge() const { return Age; }
+
+PDB_UniqueId InfoStream::getGuid() const { return Guid; }
+
+uint32_t InfoStream::getNamedStreamMapByteSize() const {
+ return NamedStreamMapByteSize;
+}
+
+PdbRaw_Features InfoStream::getFeatures() const { return Features; }
+
+ArrayRef<PdbRaw_FeatureSig> InfoStream::getFeatureSignatures() const {
+ return FeatureSignatures;
+}
+
+const NamedStreamMap &InfoStream::getNamedStreams() const {
+ return NamedStreams;
+}
diff --git a/lib/DebugInfo/PDB/Raw/InfoStreamBuilder.cpp b/lib/DebugInfo/PDB/Native/InfoStreamBuilder.cpp
index 73fbf853b4f7..f019d410328a 100644
--- a/lib/DebugInfo/PDB/Raw/InfoStreamBuilder.cpp
+++ b/lib/DebugInfo/PDB/Native/InfoStreamBuilder.cpp
@@ -7,22 +7,26 @@
//
//===----------------------------------------------------------------------===//
-#include "llvm/DebugInfo/PDB/Raw/InfoStreamBuilder.h"
+#include "llvm/DebugInfo/PDB/Native/InfoStreamBuilder.h"
#include "llvm/DebugInfo/MSF/MSFBuilder.h"
#include "llvm/DebugInfo/MSF/MappedBlockStream.h"
-#include "llvm/DebugInfo/MSF/StreamWriter.h"
-#include "llvm/DebugInfo/PDB/Raw/InfoStream.h"
-#include "llvm/DebugInfo/PDB/Raw/RawError.h"
-#include "llvm/DebugInfo/PDB/Raw/RawTypes.h"
+#include "llvm/DebugInfo/PDB/Native/InfoStream.h"
+#include "llvm/DebugInfo/PDB/Native/NamedStreamMap.h"
+#include "llvm/DebugInfo/PDB/Native/PDBFileBuilder.h"
+#include "llvm/DebugInfo/PDB/Native/RawError.h"
+#include "llvm/DebugInfo/PDB/Native/RawTypes.h"
+#include "llvm/Support/BinaryStreamWriter.h"
using namespace llvm;
using namespace llvm::codeview;
using namespace llvm::msf;
using namespace llvm::pdb;
-InfoStreamBuilder::InfoStreamBuilder(msf::MSFBuilder &Msf)
- : Msf(Msf), Ver(PdbRaw_ImplVer::PdbImplVC70), Sig(-1), Age(0) {}
+InfoStreamBuilder::InfoStreamBuilder(msf::MSFBuilder &Msf,
+ NamedStreamMap &NamedStreams)
+ : Msf(Msf), Ver(PdbRaw_ImplVer::PdbImplVC70), Sig(-1), Age(0),
+ NamedStreams(NamedStreams) {}
void InfoStreamBuilder::setVersion(PdbRaw_ImplVer V) { Ver = V; }
@@ -32,26 +36,23 @@ void InfoStreamBuilder::setAge(uint32_t A) { Age = A; }
void InfoStreamBuilder::setGuid(PDB_UniqueId G) { Guid = G; }
-NameMapBuilder &InfoStreamBuilder::getNamedStreamsBuilder() {
- return NamedStreams;
-}
-
-uint32_t InfoStreamBuilder::calculateSerializedLength() const {
- return sizeof(InfoStreamHeader) + NamedStreams.calculateSerializedLength();
+void InfoStreamBuilder::addFeature(PdbRaw_FeatureSig Sig) {
+ Features.push_back(Sig);
}
Error InfoStreamBuilder::finalizeMsfLayout() {
- uint32_t Length = calculateSerializedLength();
+ uint32_t Length = sizeof(InfoStreamHeader) + NamedStreams.finalize() +
+ (Features.size() + 1) * sizeof(uint32_t);
if (auto EC = Msf.setStreamSize(StreamPDB, Length))
return EC;
return Error::success();
}
Error InfoStreamBuilder::commit(const msf::MSFLayout &Layout,
- const msf::WritableStream &Buffer) const {
+ WritableBinaryStreamRef Buffer) const {
auto InfoS =
WritableMappedBlockStream::createIndexedStream(Layout, Buffer, StreamPDB);
- StreamWriter Writer(*InfoS);
+ BinaryStreamWriter Writer(*InfoS);
InfoStreamHeader H;
H.Age = Age;
@@ -61,5 +62,13 @@ Error InfoStreamBuilder::commit(const msf::MSFLayout &Layout,
if (auto EC = Writer.writeObject(H))
return EC;
- return NamedStreams.commit(Writer);
+ if (auto EC = NamedStreams.commit(Writer))
+ return EC;
+ if (auto EC = Writer.writeInteger(0))
+ return EC;
+ for (auto E : Features) {
+ if (auto EC = Writer.writeEnum(E))
+ return EC;
+ }
+ return Error::success();
}
diff --git a/lib/DebugInfo/PDB/Raw/ModInfo.cpp b/lib/DebugInfo/PDB/Native/ModInfo.cpp
index b34d7700d036..1405286fd088 100644
--- a/lib/DebugInfo/PDB/Raw/ModInfo.cpp
+++ b/lib/DebugInfo/PDB/Native/ModInfo.cpp
@@ -7,16 +7,15 @@
//
//===----------------------------------------------------------------------===//
-#include "llvm/DebugInfo/MSF/StreamReader.h"
-#include "llvm/DebugInfo/PDB/Raw/ModInfo.h"
-#include "llvm/DebugInfo/PDB/Raw/RawTypes.h"
+#include "llvm/DebugInfo/PDB/Native/ModInfo.h"
+#include "llvm/DebugInfo/PDB/Native/RawTypes.h"
+#include "llvm/Support/BinaryStreamReader.h"
#include "llvm/Support/Endian.h"
#include "llvm/Support/Error.h"
#include "llvm/Support/MathExtras.h"
#include <cstdint>
using namespace llvm;
-using namespace llvm::msf;
using namespace llvm::pdb;
using namespace llvm::support;
@@ -26,15 +25,15 @@ ModInfo::ModInfo(const ModInfo &Info) = default;
ModInfo::~ModInfo() = default;
-Error ModInfo::initialize(ReadableStreamRef Stream, ModInfo &Info) {
- StreamReader Reader(Stream);
+Error ModInfo::initialize(BinaryStreamRef Stream, ModInfo &Info) {
+ BinaryStreamReader Reader(Stream);
if (auto EC = Reader.readObject(Info.Layout))
return EC;
- if (auto EC = Reader.readZeroString(Info.ModuleName))
+ if (auto EC = Reader.readCString(Info.ModuleName))
return EC;
- if (auto EC = Reader.readZeroString(Info.ObjFileName))
+ if (auto EC = Reader.readCString(Info.ObjFileName))
return EC;
return Error::success();
}
diff --git a/lib/DebugInfo/PDB/Native/ModInfoBuilder.cpp b/lib/DebugInfo/PDB/Native/ModInfoBuilder.cpp
new file mode 100644
index 000000000000..73c45a953520
--- /dev/null
+++ b/lib/DebugInfo/PDB/Native/ModInfoBuilder.cpp
@@ -0,0 +1,136 @@
+//===- ModInfoBuilder.cpp - PDB Module Info Stream Creation -----*- C++ -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/DebugInfo/PDB/Native/ModInfoBuilder.h"
+
+#include "llvm/ADT/ArrayRef.h"
+#include "llvm/DebugInfo/MSF/MSFBuilder.h"
+#include "llvm/DebugInfo/MSF/MSFCommon.h"
+#include "llvm/DebugInfo/MSF/MappedBlockStream.h"
+#include "llvm/DebugInfo/PDB/Native/ModInfo.h"
+#include "llvm/DebugInfo/PDB/Native/RawConstants.h"
+#include "llvm/DebugInfo/PDB/Native/RawError.h"
+#include "llvm/Support/BinaryItemStream.h"
+#include "llvm/Support/BinaryStreamWriter.h"
+#include "llvm/Support/COFF.h"
+
+using namespace llvm;
+using namespace llvm::codeview;
+using namespace llvm::msf;
+using namespace llvm::pdb;
+
+namespace llvm {
+template <> struct BinaryItemTraits<CVSymbol> {
+ static size_t length(const CVSymbol &Item) { return Item.RecordData.size(); }
+
+ static ArrayRef<uint8_t> bytes(const CVSymbol &Item) {
+ return Item.RecordData;
+ }
+};
+}
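// With this trait in scope, BinaryItemStream<CVSymbol> (from
// llvm/Support/BinaryItemStream.h) can present a vector of symbol records as
// a single stream without copying them; commit() below uses exactly this to
// splice the records into the module's debug stream.
BinaryItemStream<CVSymbol> SymStream(llvm::support::endianness::little);
SymStream.setItems(Symbols);            // ArrayRef<CVSymbol>
uint32_t Total = SymStream.getLength(); // sum of RecordData sizes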
+
+static uint32_t calculateDiSymbolStreamSize(uint32_t SymbolByteSize) {
+ uint32_t Size = sizeof(uint32_t); // Signature
+ Size += SymbolByteSize; // Symbol Data
+ Size += 0; // TODO: Layout.LineBytes
+ Size += 0; // TODO: Layout.C13Bytes
+ Size += sizeof(uint32_t); // GlobalRefs substream size (always 0)
+ Size += 0; // GlobalRefs substream bytes
+ return Size;
+}
+
+ModInfoBuilder::ModInfoBuilder(StringRef ModuleName, uint32_t ModIndex,
+ msf::MSFBuilder &Msf)
+ : MSF(Msf), ModuleName(ModuleName) {
+ Layout.Mod = ModIndex;
+}
+
+uint16_t ModInfoBuilder::getStreamIndex() const { return Layout.ModDiStream; }
+
+void ModInfoBuilder::setObjFileName(StringRef Name) { ObjFileName = Name; }
+
+void ModInfoBuilder::addSymbol(CVSymbol Symbol) {
+ Symbols.push_back(Symbol);
+ SymbolByteSize += Symbol.data().size();
+}
+
+void ModInfoBuilder::addSourceFile(StringRef Path) {
+ SourceFiles.push_back(Path);
+}
+
+uint32_t ModInfoBuilder::calculateSerializedLength() const {
+ uint32_t L = sizeof(Layout);
+ uint32_t M = ModuleName.size() + 1;
+ uint32_t O = ObjFileName.size() + 1;
+ return alignTo(L + M + O, sizeof(uint32_t));
+}
+
+void ModInfoBuilder::finalize() {
+ Layout.C13Bytes = 0;
+ Layout.FileNameOffs = 0; // TODO: Fix this
+ Layout.Flags = 0; // TODO: Fix this
+ Layout.LineBytes = 0;
+ (void)Layout.Mod; // Set in constructor
+ (void)Layout.ModDiStream; // Set in finalizeMsfLayout
+ Layout.NumFiles = SourceFiles.size();
+ Layout.PdbFilePathNI = 0;
+ Layout.SrcFileNameNI = 0;
+
+ // This value includes both the signature field as well as the record bytes
+ // from the symbol stream.
+ Layout.SymBytes = SymbolByteSize + sizeof(uint32_t);
+}
+
+Error ModInfoBuilder::finalizeMsfLayout() {
+ this->Layout.ModDiStream = kInvalidStreamIndex;
+ auto ExpectedSN = MSF.addStream(calculateDiSymbolStreamSize(SymbolByteSize));
+ if (!ExpectedSN)
+ return ExpectedSN.takeError();
+ Layout.ModDiStream = *ExpectedSN;
+ return Error::success();
+}
+
+Error ModInfoBuilder::commit(BinaryStreamWriter &ModiWriter,
+ const msf::MSFLayout &MsfLayout,
+ WritableBinaryStreamRef MsfBuffer) {
+ // We write the Modi record to the `ModiWriter`, but we additionally write its
+ // symbol stream to a brand new stream.
+ if (auto EC = ModiWriter.writeObject(Layout))
+ return EC;
+ if (auto EC = ModiWriter.writeCString(ModuleName))
+ return EC;
+ if (auto EC = ModiWriter.writeCString(ObjFileName))
+ return EC;
+ if (auto EC = ModiWriter.padToAlignment(sizeof(uint32_t)))
+ return EC;
+
+ if (Layout.ModDiStream != kInvalidStreamIndex) {
+ auto NS = WritableMappedBlockStream::createIndexedStream(
+ MsfLayout, MsfBuffer, Layout.ModDiStream);
+ WritableBinaryStreamRef Ref(*NS);
+ BinaryStreamWriter SymbolWriter(Ref);
+ // Write the symbols.
+ if (auto EC =
+ SymbolWriter.writeInteger<uint32_t>(COFF::DEBUG_SECTION_MAGIC))
+ return EC;
+ BinaryItemStream<CVSymbol> Records(llvm::support::endianness::little);
+ Records.setItems(Symbols);
+ BinaryStreamRef RecordsRef(Records);
+ if (auto EC = SymbolWriter.writeStreamRef(RecordsRef))
+ return EC;
+ // TODO: Write C11 Line data
+ // TODO: Write C13 Line data
+ // TODO: Figure out what GlobalRefs substream actually is and populate it.
+ if (auto EC = SymbolWriter.writeInteger<uint32_t>(0))
+ return EC;
+ if (SymbolWriter.bytesRemaining() > 0)
+ return make_error<RawError>(raw_error_code::stream_too_long);
+ }
+ return Error::success();
+}
diff --git a/lib/DebugInfo/PDB/Raw/ModStream.cpp b/lib/DebugInfo/PDB/Native/ModStream.cpp
index 0ffc5b7d44aa..08798cf0ed28 100644
--- a/lib/DebugInfo/PDB/Raw/ModStream.cpp
+++ b/lib/DebugInfo/PDB/Native/ModStream.cpp
@@ -7,15 +7,15 @@
//
//===----------------------------------------------------------------------===//
+#include "llvm/DebugInfo/PDB/Native/ModStream.h"
#include "llvm/ADT/iterator_range.h"
#include "llvm/DebugInfo/CodeView/SymbolRecord.h"
-#include "llvm/DebugInfo/MSF/StreamReader.h"
-#include "llvm/DebugInfo/MSF/StreamRef.h"
-#include "llvm/DebugInfo/PDB/Raw/ModInfo.h"
-#include "llvm/DebugInfo/PDB/Raw/ModStream.h"
-#include "llvm/DebugInfo/PDB/Raw/PDBFile.h"
-#include "llvm/DebugInfo/PDB/Raw/RawError.h"
-#include "llvm/DebugInfo/PDB/Raw/RawTypes.h"
+#include "llvm/DebugInfo/PDB/Native/ModInfo.h"
+#include "llvm/DebugInfo/PDB/Native/PDBFile.h"
+#include "llvm/DebugInfo/PDB/Native/RawError.h"
+#include "llvm/DebugInfo/PDB/Native/RawTypes.h"
+#include "llvm/Support/BinaryStreamReader.h"
+#include "llvm/Support/BinaryStreamRef.h"
#include "llvm/Support/Error.h"
#include <algorithm>
#include <cstdint>
@@ -31,7 +31,7 @@ ModStream::ModStream(const ModInfo &Module,
ModStream::~ModStream() = default;
Error ModStream::reload() {
- StreamReader Reader(*Stream);
+ BinaryStreamReader Reader(*Stream);
uint32_t SymbolSize = Mod.getSymbolDebugInfoByteSize();
uint32_t C11Size = Mod.getLineInfoByteSize();
@@ -41,7 +41,7 @@ Error ModStream::reload() {
return make_error<RawError>(raw_error_code::corrupt_file,
"Module has both C11 and C13 line info");
- ReadableStreamRef S;
+ BinaryStreamRef S;
if (auto EC = Reader.readInteger(Signature))
return EC;
@@ -53,7 +53,7 @@ Error ModStream::reload() {
if (auto EC = Reader.readStreamRef(C13LinesSubstream, C13Size))
return EC;
- StreamReader LineReader(C13LinesSubstream);
+ BinaryStreamReader LineReader(C13LinesSubstream);
if (auto EC = LineReader.readArray(LineInfo, LineReader.bytesRemaining()))
return EC;
diff --git a/lib/DebugInfo/PDB/Native/NamedStreamMap.cpp b/lib/DebugInfo/PDB/Native/NamedStreamMap.cpp
new file mode 100644
index 000000000000..c7ba32b82bc6
--- /dev/null
+++ b/lib/DebugInfo/PDB/Native/NamedStreamMap.cpp
@@ -0,0 +1,135 @@
+//===- NamedStreamMap.cpp - PDB Named Stream Map ----------------*- C++ -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/DebugInfo/PDB/Native/NamedStreamMap.h"
+
+#include "llvm/ADT/SparseBitVector.h"
+#include "llvm/ADT/StringMap.h"
+#include "llvm/ADT/StringRef.h"
+#include "llvm/ADT/iterator_range.h"
+#include "llvm/DebugInfo/PDB/Native/HashTable.h"
+#include "llvm/DebugInfo/PDB/Native/RawError.h"
+#include "llvm/Support/BinaryStreamReader.h"
+#include "llvm/Support/Error.h"
+#include <algorithm>
+#include <cstdint>
+
+using namespace llvm;
+using namespace llvm::pdb;
+
+NamedStreamMap::NamedStreamMap() = default;
+
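+// The named stream map is serialized as a 32-bit string buffer size, the
+// string buffer itself, and a hash table mapping each name's offset within
+// that buffer to its stream index. load() walks the hash table and rebuilds
+// the in-memory name-to-index StringMap.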
+Error NamedStreamMap::load(BinaryStreamReader &Stream) {
+ Mapping.clear();
+ FinalizedHashTable.clear();
+ FinalizedInfo.reset();
+
+ uint32_t StringBufferSize;
+ if (auto EC = Stream.readInteger(StringBufferSize))
+ return joinErrors(std::move(EC),
+ make_error<RawError>(raw_error_code::corrupt_file,
+ "Expected string buffer size"));
+
+ BinaryStreamRef StringsBuffer;
+ if (auto EC = Stream.readStreamRef(StringsBuffer, StringBufferSize))
+ return EC;
+
+ HashTable OffsetIndexMap;
+ if (auto EC = OffsetIndexMap.load(Stream))
+ return EC;
+
+ uint32_t NameOffset;
+ uint32_t NameIndex;
+ for (const auto &Entry : OffsetIndexMap) {
+ std::tie(NameOffset, NameIndex) = Entry;
+
+ // Compute the offset of the start of the string relative to the stream.
+ BinaryStreamReader NameReader(StringsBuffer);
+ NameReader.setOffset(NameOffset);
+ // Pump out our c-string from the stream.
+ StringRef Str;
+ if (auto EC = NameReader.readCString(Str))
+ return joinErrors(std::move(EC),
+ make_error<RawError>(raw_error_code::corrupt_file,
+ "Expected name map name"));
+
+ // Add this to a string-map from name to stream number.
+ Mapping.insert({Str, NameIndex});
+ }
+
+ return Error::success();
+}
+
+Error NamedStreamMap::commit(BinaryStreamWriter &Writer) const {
+ assert(FinalizedInfo.hasValue());
+
+ // The first field is the number of bytes of string data.
+ if (auto EC = Writer.writeInteger(FinalizedInfo->StringDataBytes))
+ return EC;
+
+ // Now all of the string data itself.
+ for (const auto &Item : Mapping) {
+ if (auto EC = Writer.writeCString(Item.getKey()))
+ return EC;
+ }
+
+ // And finally the Offset Index map.
+ if (auto EC = FinalizedHashTable.commit(Writer))
+ return EC;
+
+ return Error::success();
+}
+
+uint32_t NamedStreamMap::finalize() {
+ if (FinalizedInfo.hasValue())
+ return FinalizedInfo->SerializedLength;
+
+ // Build the finalized hash table.
+ FinalizedHashTable.clear();
+ FinalizedInfo.emplace();
+ for (const auto &Item : Mapping) {
+ FinalizedHashTable.set(FinalizedInfo->StringDataBytes, Item.getValue());
+ FinalizedInfo->StringDataBytes += Item.getKeyLength() + 1;
+ }
+
+ // Number of bytes of string data.
+ FinalizedInfo->SerializedLength += sizeof(support::ulittle32_t);
+ // Followed by that many actual bytes of string data.
+ FinalizedInfo->SerializedLength += FinalizedInfo->StringDataBytes;
+ // Followed by the mapping from Offset to Index.
+ FinalizedInfo->SerializedLength +=
+ FinalizedHashTable.calculateSerializedLength();
+ return FinalizedInfo->SerializedLength;
+}
+
+iterator_range<StringMapConstIterator<uint32_t>>
+NamedStreamMap::entries() const {
+ return make_range<StringMapConstIterator<uint32_t>>(Mapping.begin(),
+ Mapping.end());
+}
+
+uint32_t NamedStreamMap::size() const { return Mapping.size(); }
+
+bool NamedStreamMap::get(StringRef Stream, uint32_t &StreamNo) const {
+ auto Iter = Mapping.find(Stream);
+ if (Iter == Mapping.end())
+ return false;
+ StreamNo = Iter->second;
+ return true;
+}
+
+void NamedStreamMap::set(StringRef Stream, uint32_t StreamNo) {
+ FinalizedInfo.reset();
+ Mapping[Stream] = StreamNo;
+}
+
+void NamedStreamMap::remove(StringRef Stream) {
+ FinalizedInfo.reset();
+ Mapping.erase(Stream);
+}
diff --git a/lib/DebugInfo/PDB/Native/NativeCompilandSymbol.cpp b/lib/DebugInfo/PDB/Native/NativeCompilandSymbol.cpp
new file mode 100644
index 000000000000..9c0cc0bf8233
--- /dev/null
+++ b/lib/DebugInfo/PDB/Native/NativeCompilandSymbol.cpp
@@ -0,0 +1,43 @@
+//===- NativeCompilandSymbol.cpp - Native impl for compilands ---*- C++ -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/DebugInfo/PDB/Native/NativeCompilandSymbol.h"
+
+namespace llvm {
+namespace pdb {
+
+NativeCompilandSymbol::NativeCompilandSymbol(NativeSession &Session,
+ const ModuleInfoEx &MI)
+ : NativeRawSymbol(Session), Module(MI) {}
+
+PDB_SymType NativeCompilandSymbol::getSymTag() const {
+ return PDB_SymType::Compiland;
+}
+
+bool NativeCompilandSymbol::isEditAndContinueEnabled() const {
+ return Module.Info.hasECInfo();
+}
+
+uint32_t NativeCompilandSymbol::getLexicalParentId() const { return 0; }
+
+// The usage of getObjFileName for getLibraryName and getModuleName for getName
+// may seem backwards, but it is consistent with DIA, which is what this API
+// was modeled after. We may rename these methods later to try to eliminate
+// this potential confusion.
+
+std::string NativeCompilandSymbol::getLibraryName() const {
+ return Module.Info.getObjFileName();
+}
+
+std::string NativeCompilandSymbol::getName() const {
+ return Module.Info.getModuleName();
+}
+
+} // namespace pdb
+} // namespace llvm
diff --git a/lib/DebugInfo/PDB/Native/NativeEnumModules.cpp b/lib/DebugInfo/PDB/Native/NativeEnumModules.cpp
new file mode 100644
index 000000000000..7532110d005c
--- /dev/null
+++ b/lib/DebugInfo/PDB/Native/NativeEnumModules.cpp
@@ -0,0 +1,52 @@
+//==- NativeEnumModules.cpp - Native Symbol Enumerator impl ------*- C++ -*-==//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/DebugInfo/PDB/Native/NativeEnumModules.h"
+
+#include "llvm/DebugInfo/PDB/IPDBEnumChildren.h"
+#include "llvm/DebugInfo/PDB/Native/NativeCompilandSymbol.h"
+#include "llvm/DebugInfo/PDB/Native/NativeSession.h"
+#include "llvm/DebugInfo/PDB/PDBSymbol.h"
+#include "llvm/DebugInfo/PDB/PDBSymbolCompiland.h"
+
+namespace llvm {
+namespace pdb {
+
+NativeEnumModules::NativeEnumModules(NativeSession &PDBSession,
+ ArrayRef<ModuleInfoEx> Modules,
+ uint32_t Index)
+ : Session(PDBSession), Modules(Modules), Index(Index) {}
+
+uint32_t NativeEnumModules::getChildCount() const {
+ return static_cast<uint32_t>(Modules.size());
+}
+
+std::unique_ptr<PDBSymbol>
+NativeEnumModules::getChildAtIndex(uint32_t Index) const {
+ if (Index >= Modules.size())
+ return nullptr;
+ return std::unique_ptr<PDBSymbol>(new PDBSymbolCompiland(Session,
+ std::unique_ptr<IPDBRawSymbol>(
+ new NativeCompilandSymbol(Session, Modules[Index]))));
+}
+
+std::unique_ptr<PDBSymbol> NativeEnumModules::getNext() {
+ if (Index >= Modules.size())
+ return nullptr;
+ return getChildAtIndex(Index++);
+}
+
+void NativeEnumModules::reset() { Index = 0; }
+
+NativeEnumModules *NativeEnumModules::clone() const {
+ return new NativeEnumModules(Session, Modules, Index);
+}
+
+} // namespace pdb
+} // namespace llvm
diff --git a/lib/DebugInfo/PDB/Native/NativeExeSymbol.cpp b/lib/DebugInfo/PDB/Native/NativeExeSymbol.cpp
new file mode 100644
index 000000000000..ec2a4b87457c
--- /dev/null
+++ b/lib/DebugInfo/PDB/Native/NativeExeSymbol.cpp
@@ -0,0 +1,79 @@
+//===- NativeExeSymbol.cpp - native impl for PDBSymbolExe -------*- C++ -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/DebugInfo/PDB/Native/NativeExeSymbol.h"
+
+#include "llvm/DebugInfo/PDB/Native/DbiStream.h"
+#include "llvm/DebugInfo/PDB/Native/InfoStream.h"
+#include "llvm/DebugInfo/PDB/Native/NativeEnumModules.h"
+#include "llvm/DebugInfo/PDB/Native/PDBFile.h"
+
+namespace llvm {
+namespace pdb {
+
+NativeExeSymbol::NativeExeSymbol(NativeSession &Session)
+ : NativeRawSymbol(Session), File(Session.getPDBFile()) {}
+
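+// Only compiland (module) children can be enumerated so far; any other
+// symbol type falls through to the default case and yields no enumerator.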
+std::unique_ptr<IPDBEnumSymbols>
+NativeExeSymbol::findChildren(PDB_SymType Type) const {
+ switch (Type) {
+ case PDB_SymType::Compiland: {
+ auto Dbi = File.getPDBDbiStream();
+ if (Dbi) {
+ const auto Modules = Dbi->modules();
+ return std::unique_ptr<IPDBEnumSymbols>(
+ new NativeEnumModules(Session, Modules));
+ }
+ consumeError(Dbi.takeError());
+ break;
+ }
+ default:
+ break;
+ }
+ return nullptr;
+}
+
+uint32_t NativeExeSymbol::getAge() const {
+ auto IS = File.getPDBInfoStream();
+ if (IS)
+ return IS->getAge();
+ consumeError(IS.takeError());
+ return 0;
+}
+
+std::string NativeExeSymbol::getSymbolsFileName() const {
+ return File.getFilePath();
+}
+
+PDB_UniqueId NativeExeSymbol::getGuid() const {
+ auto IS = File.getPDBInfoStream();
+ if (IS)
+ return IS->getGuid();
+ consumeError(IS.takeError());
+ return PDB_UniqueId{{0}};
+}
+
+bool NativeExeSymbol::hasCTypes() const {
+ auto Dbi = File.getPDBDbiStream();
+ if (Dbi)
+ return Dbi->hasCTypes();
+ consumeError(Dbi.takeError());
+ return false;
+}
+
+bool NativeExeSymbol::hasPrivateSymbols() const {
+ auto Dbi = File.getPDBDbiStream();
+ if (Dbi)
+ return !Dbi->isStripped();
+ consumeError(Dbi.takeError());
+ return false;
+}
+
+} // namespace pdb
+} // namespace llvm
diff --git a/lib/DebugInfo/PDB/Native/NativeRawSymbol.cpp b/lib/DebugInfo/PDB/Native/NativeRawSymbol.cpp
new file mode 100644
index 000000000000..3aba35adb53f
--- /dev/null
+++ b/lib/DebugInfo/PDB/Native/NativeRawSymbol.cpp
@@ -0,0 +1,706 @@
+//===- NativeRawSymbol.cpp - Native impl for IPDBRawSymbol ------*- C++ -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/DebugInfo/PDB/Native/NativeRawSymbol.h"
+#include "llvm/ADT/ArrayRef.h"
+#include "llvm/ADT/STLExtras.h"
+#include "llvm/DebugInfo/PDB/IPDBEnumChildren.h"
+#include "llvm/DebugInfo/PDB/Native/NativeSession.h"
+#include "llvm/DebugInfo/PDB/PDBExtras.h"
+#include "llvm/DebugInfo/PDB/PDBSymbolTypeVTable.h"
+#include "llvm/DebugInfo/PDB/PDBSymbolTypeVTableShape.h"
+#include "llvm/Support/ConvertUTF.h"
+#include "llvm/Support/raw_ostream.h"
+
+using namespace llvm;
+using namespace llvm::pdb;
+
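+// NativeRawSymbol provides neutral default implementations (zero, false,
+// empty, or null) for the entire IPDBRawSymbol interface; concrete symbols
+// such as NativeExeSymbol and NativeCompilandSymbol override only the
+// queries they can actually answer.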
+NativeRawSymbol::NativeRawSymbol(NativeSession &PDBSession)
+ : Session(PDBSession) {}
+
+void NativeRawSymbol::dump(raw_ostream &OS, int Indent) const {}
+
+std::unique_ptr<IPDBEnumSymbols>
+NativeRawSymbol::findChildren(PDB_SymType Type) const {
+ return nullptr;
+}
+
+std::unique_ptr<IPDBEnumSymbols>
+NativeRawSymbol::findChildren(PDB_SymType Type, StringRef Name,
+ PDB_NameSearchFlags Flags) const {
+ return nullptr;
+}
+
+std::unique_ptr<IPDBEnumSymbols>
+NativeRawSymbol::findChildrenByRVA(PDB_SymType Type, StringRef Name,
+ PDB_NameSearchFlags Flags, uint32_t RVA) const {
+ return nullptr;
+}
+
+std::unique_ptr<IPDBEnumSymbols>
+NativeRawSymbol::findInlineFramesByRVA(uint32_t RVA) const {
+ return nullptr;
+}
+
+void NativeRawSymbol::getDataBytes(llvm::SmallVector<uint8_t, 32> &bytes) const {
+ bytes.clear();
+}
+
+PDB_MemberAccess NativeRawSymbol::getAccess() const {
+ return PDB_MemberAccess::Private;
+}
+
+uint32_t NativeRawSymbol::getAddressOffset() const {
+ return 0;
+}
+
+uint32_t NativeRawSymbol::getAddressSection() const {
+ return 0;
+}
+
+uint32_t NativeRawSymbol::getAge() const {
+ return 0;
+}
+
+uint32_t NativeRawSymbol::getArrayIndexTypeId() const {
+ return 0;
+}
+
+void NativeRawSymbol::getBackEndVersion(VersionInfo &Version) const {
+ Version.Major = 0;
+ Version.Minor = 0;
+ Version.Build = 0;
+ Version.QFE = 0;
+}
+
+uint32_t NativeRawSymbol::getBaseDataOffset() const {
+ return 0;
+}
+
+uint32_t NativeRawSymbol::getBaseDataSlot() const {
+ return 0;
+}
+
+uint32_t NativeRawSymbol::getBaseSymbolId() const {
+ return 0;
+}
+
+PDB_BuiltinType NativeRawSymbol::getBuiltinType() const {
+ return PDB_BuiltinType::None;
+}
+
+uint32_t NativeRawSymbol::getBitPosition() const {
+ return 0;
+}
+
+PDB_CallingConv NativeRawSymbol::getCallingConvention() const {
+ return PDB_CallingConv::FarStdCall;
+}
+
+uint32_t NativeRawSymbol::getClassParentId() const {
+ return 0;
+}
+
+std::string NativeRawSymbol::getCompilerName() const {
+  return "";
+}
+
+uint32_t NativeRawSymbol::getCount() const {
+ return 0;
+}
+
+uint32_t NativeRawSymbol::getCountLiveRanges() const {
+ return 0;
+}
+
+void NativeRawSymbol::getFrontEndVersion(VersionInfo &Version) const {
+ Version.Major = 0;
+ Version.Minor = 0;
+ Version.Build = 0;
+ Version.QFE = 0;
+}
+
+PDB_Lang NativeRawSymbol::getLanguage() const {
+ return PDB_Lang::Cobol;
+}
+
+uint32_t NativeRawSymbol::getLexicalParentId() const {
+ return 0;
+}
+
+std::string NativeRawSymbol::getLibraryName() const {
+ return "";
+}
+
+uint32_t NativeRawSymbol::getLiveRangeStartAddressOffset() const {
+ return 0;
+}
+
+uint32_t NativeRawSymbol::getLiveRangeStartAddressSection() const {
+ return 0;
+}
+
+uint32_t NativeRawSymbol::getLiveRangeStartRelativeVirtualAddress() const {
+ return 0;
+}
+
+codeview::RegisterId NativeRawSymbol::getLocalBasePointerRegisterId() const {
+ return codeview::RegisterId::EAX;
+}
+
+uint32_t NativeRawSymbol::getLowerBoundId() const {
+ return 0;
+}
+
+uint32_t NativeRawSymbol::getMemorySpaceKind() const {
+ return 0;
+}
+
+std::string NativeRawSymbol::getName() const {
+  return "";
+}
+
+uint32_t NativeRawSymbol::getNumberOfAcceleratorPointerTags() const {
+ return 0;
+}
+
+uint32_t NativeRawSymbol::getNumberOfColumns() const {
+ return 0;
+}
+
+uint32_t NativeRawSymbol::getNumberOfModifiers() const {
+ return 0;
+}
+
+uint32_t NativeRawSymbol::getNumberOfRegisterIndices() const {
+ return 0;
+}
+
+uint32_t NativeRawSymbol::getNumberOfRows() const {
+ return 0;
+}
+
+std::string NativeRawSymbol::getObjectFileName() const {
+ return "";
+}
+
+uint32_t NativeRawSymbol::getOemId() const {
+ return 0;
+}
+
+uint32_t NativeRawSymbol::getOemSymbolId() const {
+ return 0;
+}
+
+uint32_t NativeRawSymbol::getOffsetInUdt() const {
+ return 0;
+}
+
+PDB_Cpu NativeRawSymbol::getPlatform() const {
+ return PDB_Cpu::Intel8080;
+}
+
+uint32_t NativeRawSymbol::getRank() const {
+ return 0;
+}
+
+codeview::RegisterId NativeRawSymbol::getRegisterId() const {
+ return codeview::RegisterId::EAX;
+}
+
+uint32_t NativeRawSymbol::getRegisterType() const {
+ return 0;
+}
+
+uint32_t NativeRawSymbol::getRelativeVirtualAddress() const {
+ return 0;
+}
+
+uint32_t NativeRawSymbol::getSamplerSlot() const {
+ return 0;
+}
+
+uint32_t NativeRawSymbol::getSignature() const {
+ return 0;
+}
+
+uint32_t NativeRawSymbol::getSizeInUdt() const {
+ return 0;
+}
+
+uint32_t NativeRawSymbol::getSlot() const {
+ return 0;
+}
+
+std::string NativeRawSymbol::getSourceFileName() const {
+  return "";
+}
+
+uint32_t NativeRawSymbol::getStride() const {
+ return 0;
+}
+
+uint32_t NativeRawSymbol::getSubTypeId() const {
+ return 0;
+}
+
+std::string NativeRawSymbol::getSymbolsFileName() const { return ""; }
+
+uint32_t NativeRawSymbol::getSymIndexId() const {
+ return 0;
+}
+
+uint32_t NativeRawSymbol::getTargetOffset() const {
+ return 0;
+}
+
+uint32_t NativeRawSymbol::getTargetRelativeVirtualAddress() const {
+ return 0;
+}
+
+uint64_t NativeRawSymbol::getTargetVirtualAddress() const {
+ return 0;
+}
+
+uint32_t NativeRawSymbol::getTargetSection() const {
+ return 0;
+}
+
+uint32_t NativeRawSymbol::getTextureSlot() const {
+ return 0;
+}
+
+uint32_t NativeRawSymbol::getTimeStamp() const {
+ return 0;
+}
+
+uint32_t NativeRawSymbol::getToken() const {
+ return 0;
+}
+
+uint32_t NativeRawSymbol::getTypeId() const {
+ return 0;
+}
+
+uint32_t NativeRawSymbol::getUavSlot() const {
+ return 0;
+}
+
+std::string NativeRawSymbol::getUndecoratedName() const {
+  return "";
+}
+
+uint32_t NativeRawSymbol::getUnmodifiedTypeId() const {
+ return 0;
+}
+
+uint32_t NativeRawSymbol::getUpperBoundId() const {
+ return 0;
+}
+
+Variant NativeRawSymbol::getValue() const {
+ return Variant();
+}
+
+uint32_t NativeRawSymbol::getVirtualBaseDispIndex() const {
+ return 0;
+}
+
+uint32_t NativeRawSymbol::getVirtualBaseOffset() const {
+ return 0;
+}
+
+uint32_t NativeRawSymbol::getVirtualTableShapeId() const {
+ return 0;
+}
+
+std::unique_ptr<PDBSymbolTypeVTable>
+NativeRawSymbol::getVirtualBaseTableType() const {
+ return nullptr;
+}
+
+PDB_DataKind NativeRawSymbol::getDataKind() const {
+ return PDB_DataKind::Unknown;
+}
+
+PDB_SymType NativeRawSymbol::getSymTag() const {
+ return PDB_SymType::None;
+}
+
+PDB_UniqueId NativeRawSymbol::getGuid() const {
+ return PDB_UniqueId{{0}};
+}
+
+int32_t NativeRawSymbol::getOffset() const {
+ return 0;
+}
+
+int32_t NativeRawSymbol::getThisAdjust() const {
+ return 0;
+}
+
+int32_t NativeRawSymbol::getVirtualBasePointerOffset() const {
+ return 0;
+}
+
+PDB_LocType NativeRawSymbol::getLocationType() const {
+ return PDB_LocType::Null;
+}
+
+PDB_Machine NativeRawSymbol::getMachineType() const {
+ return PDB_Machine::Invalid;
+}
+
+codeview::ThunkOrdinal NativeRawSymbol::getThunkOrdinal() const {
+ return codeview::ThunkOrdinal::Standard;
+}
+
+uint64_t NativeRawSymbol::getLength() const {
+ return 0;
+}
+
+uint64_t NativeRawSymbol::getLiveRangeLength() const {
+ return 0;
+}
+
+uint64_t NativeRawSymbol::getVirtualAddress() const {
+ return 0;
+}
+
+PDB_UdtType NativeRawSymbol::getUdtKind() const {
+ return PDB_UdtType::Struct;
+}
+
+bool NativeRawSymbol::hasConstructor() const {
+ return false;
+}
+
+bool NativeRawSymbol::hasCustomCallingConvention() const {
+ return false;
+}
+
+bool NativeRawSymbol::hasFarReturn() const {
+ return false;
+}
+
+bool NativeRawSymbol::isCode() const {
+ return false;
+}
+
+bool NativeRawSymbol::isCompilerGenerated() const {
+ return false;
+}
+
+bool NativeRawSymbol::isConstType() const {
+ return false;
+}
+
+bool NativeRawSymbol::isEditAndContinueEnabled() const {
+ return false;
+}
+
+bool NativeRawSymbol::isFunction() const {
+ return false;
+}
+
+bool NativeRawSymbol::getAddressTaken() const {
+ return false;
+}
+
+bool NativeRawSymbol::getNoStackOrdering() const {
+ return false;
+}
+
+bool NativeRawSymbol::hasAlloca() const {
+ return false;
+}
+
+bool NativeRawSymbol::hasAssignmentOperator() const {
+ return false;
+}
+
+bool NativeRawSymbol::hasCTypes() const {
+ return false;
+}
+
+bool NativeRawSymbol::hasCastOperator() const {
+ return false;
+}
+
+bool NativeRawSymbol::hasDebugInfo() const {
+ return false;
+}
+
+bool NativeRawSymbol::hasEH() const {
+ return false;
+}
+
+bool NativeRawSymbol::hasEHa() const {
+ return false;
+}
+
+bool NativeRawSymbol::hasInlAsm() const {
+ return false;
+}
+
+bool NativeRawSymbol::hasInlineAttribute() const {
+ return false;
+}
+
+bool NativeRawSymbol::hasInterruptReturn() const {
+ return false;
+}
+
+bool NativeRawSymbol::hasFramePointer() const {
+ return false;
+}
+
+bool NativeRawSymbol::hasLongJump() const {
+ return false;
+}
+
+bool NativeRawSymbol::hasManagedCode() const {
+ return false;
+}
+
+bool NativeRawSymbol::hasNestedTypes() const {
+ return false;
+}
+
+bool NativeRawSymbol::hasNoInlineAttribute() const {
+ return false;
+}
+
+bool NativeRawSymbol::hasNoReturnAttribute() const {
+ return false;
+}
+
+bool NativeRawSymbol::hasOptimizedCodeDebugInfo() const {
+ return false;
+}
+
+bool NativeRawSymbol::hasOverloadedOperator() const {
+ return false;
+}
+
+bool NativeRawSymbol::hasSEH() const {
+ return false;
+}
+
+bool NativeRawSymbol::hasSecurityChecks() const {
+ return false;
+}
+
+bool NativeRawSymbol::hasSetJump() const {
+ return false;
+}
+
+bool NativeRawSymbol::hasStrictGSCheck() const {
+ return false;
+}
+
+bool NativeRawSymbol::isAcceleratorGroupSharedLocal() const {
+ return false;
+}
+
+bool NativeRawSymbol::isAcceleratorPointerTagLiveRange() const {
+ return false;
+}
+
+bool NativeRawSymbol::isAcceleratorStubFunction() const {
+ return false;
+}
+
+bool NativeRawSymbol::isAggregated() const {
+ return false;
+}
+
+bool NativeRawSymbol::isIntroVirtualFunction() const {
+ return false;
+}
+
+bool NativeRawSymbol::isCVTCIL() const {
+ return false;
+}
+
+bool NativeRawSymbol::isConstructorVirtualBase() const {
+ return false;
+}
+
+bool NativeRawSymbol::isCxxReturnUdt() const {
+ return false;
+}
+
+bool NativeRawSymbol::isDataAligned() const {
+ return false;
+}
+
+bool NativeRawSymbol::isHLSLData() const {
+ return false;
+}
+
+bool NativeRawSymbol::isHotpatchable() const {
+ return false;
+}
+
+bool NativeRawSymbol::isIndirectVirtualBaseClass() const {
+ return false;
+}
+
+bool NativeRawSymbol::isInterfaceUdt() const {
+ return false;
+}
+
+bool NativeRawSymbol::isIntrinsic() const {
+ return false;
+}
+
+bool NativeRawSymbol::isLTCG() const {
+ return false;
+}
+
+bool NativeRawSymbol::isLocationControlFlowDependent() const {
+ return false;
+}
+
+bool NativeRawSymbol::isMSILNetmodule() const {
+ return false;
+}
+
+bool NativeRawSymbol::isMatrixRowMajor() const {
+ return false;
+}
+
+bool NativeRawSymbol::isManagedCode() const {
+ return false;
+}
+
+bool NativeRawSymbol::isMSILCode() const {
+ return false;
+}
+
+bool NativeRawSymbol::isMultipleInheritance() const {
+ return false;
+}
+
+bool NativeRawSymbol::isNaked() const {
+ return false;
+}
+
+bool NativeRawSymbol::isNested() const {
+ return false;
+}
+
+bool NativeRawSymbol::isOptimizedAway() const {
+ return false;
+}
+
+bool NativeRawSymbol::isPacked() const {
+ return false;
+}
+
+bool NativeRawSymbol::isPointerBasedOnSymbolValue() const {
+ return false;
+}
+
+bool NativeRawSymbol::isPointerToDataMember() const {
+ return false;
+}
+
+bool NativeRawSymbol::isPointerToMemberFunction() const {
+ return false;
+}
+
+bool NativeRawSymbol::isPureVirtual() const {
+ return false;
+}
+
+bool NativeRawSymbol::isRValueReference() const {
+ return false;
+}
+
+bool NativeRawSymbol::isRefUdt() const {
+ return false;
+}
+
+bool NativeRawSymbol::isReference() const {
+ return false;
+}
+
+bool NativeRawSymbol::isRestrictedType() const {
+ return false;
+}
+
+bool NativeRawSymbol::isReturnValue() const {
+ return false;
+}
+
+bool NativeRawSymbol::isSafeBuffers() const {
+ return false;
+}
+
+bool NativeRawSymbol::isScoped() const {
+ return false;
+}
+
+bool NativeRawSymbol::isSdl() const {
+ return false;
+}
+
+bool NativeRawSymbol::isSingleInheritance() const {
+ return false;
+}
+
+bool NativeRawSymbol::isSplitted() const {
+ return false;
+}
+
+bool NativeRawSymbol::isStatic() const {
+ return false;
+}
+
+bool NativeRawSymbol::hasPrivateSymbols() const {
+ return false;
+}
+
+bool NativeRawSymbol::isUnalignedType() const {
+ return false;
+}
+
+bool NativeRawSymbol::isUnreached() const {
+ return false;
+}
+
+bool NativeRawSymbol::isValueUdt() const {
+ return false;
+}
+
+bool NativeRawSymbol::isVirtual() const {
+ return false;
+}
+
+bool NativeRawSymbol::isVirtualBaseClass() const {
+ return false;
+}
+
+bool NativeRawSymbol::isVirtualInheritance() const {
+ return false;
+}
+
+bool NativeRawSymbol::isVolatileType() const {
+ return false;
+}
+
+bool NativeRawSymbol::wasInlined() const {
+ return false;
+}
+
+std::string NativeRawSymbol::getUnused() const {
+ return "";
+}
diff --git a/lib/DebugInfo/PDB/Native/NativeSession.cpp b/lib/DebugInfo/PDB/Native/NativeSession.cpp
new file mode 100644
index 000000000000..7e6843bceb7d
--- /dev/null
+++ b/lib/DebugInfo/PDB/Native/NativeSession.cpp
@@ -0,0 +1,146 @@
+//===- NativeSession.cpp - Native implementation of IPDBSession -*- C++ -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/DebugInfo/PDB/Native/NativeSession.h"
+
+#include "llvm/ADT/STLExtras.h"
+#include "llvm/DebugInfo/PDB/GenericError.h"
+#include "llvm/DebugInfo/PDB/IPDBEnumChildren.h"
+#include "llvm/DebugInfo/PDB/IPDBSourceFile.h"
+#include "llvm/DebugInfo/PDB/Native/NativeExeSymbol.h"
+#include "llvm/DebugInfo/PDB/Native/PDBFile.h"
+#include "llvm/DebugInfo/PDB/Native/RawError.h"
+#include "llvm/DebugInfo/PDB/PDBSymbolCompiland.h"
+#include "llvm/DebugInfo/PDB/PDBSymbolExe.h"
+#include "llvm/Support/Allocator.h"
+#include "llvm/Support/BinaryByteStream.h"
+#include "llvm/Support/Error.h"
+#include "llvm/Support/ErrorOr.h"
+#include "llvm/Support/MemoryBuffer.h"
+#include <algorithm>
+#include <memory>
+
+using namespace llvm;
+using namespace llvm::msf;
+using namespace llvm::pdb;
+
+NativeSession::NativeSession(std::unique_ptr<PDBFile> PdbFile,
+ std::unique_ptr<BumpPtrAllocator> Allocator)
+ : Pdb(std::move(PdbFile)), Allocator(std::move(Allocator)) {}
+
+NativeSession::~NativeSession() = default;
+
+Error NativeSession::createFromPdb(StringRef Path,
+ std::unique_ptr<IPDBSession> &Session) {
+ ErrorOr<std::unique_ptr<MemoryBuffer>> ErrorOrBuffer =
+ MemoryBuffer::getFileOrSTDIN(Path, /*FileSize=*/-1,
+ /*RequiresNullTerminator=*/false);
+ if (!ErrorOrBuffer)
+ return make_error<GenericError>(generic_error_code::invalid_path);
+
+ std::unique_ptr<MemoryBuffer> Buffer = std::move(*ErrorOrBuffer);
+ auto Stream = llvm::make_unique<MemoryBufferByteStream>(
+ std::move(Buffer), llvm::support::little);
+
+ auto Allocator = llvm::make_unique<BumpPtrAllocator>();
+ auto File = llvm::make_unique<PDBFile>(Path, std::move(Stream), *Allocator);
+ if (auto EC = File->parseFileHeaders())
+ return EC;
+ if (auto EC = File->parseStreamData())
+ return EC;
+
+ Session =
+ llvm::make_unique<NativeSession>(std::move(File), std::move(Allocator));
+
+ return Error::success();
+}
+
+Error NativeSession::createFromExe(StringRef Path,
+ std::unique_ptr<IPDBSession> &Session) {
+ return make_error<RawError>(raw_error_code::feature_unsupported);
+}
+
+uint64_t NativeSession::getLoadAddress() const { return 0; }
+
+void NativeSession::setLoadAddress(uint64_t Address) {}
+
+std::unique_ptr<PDBSymbolExe> NativeSession::getGlobalScope() const {
+ auto RawSymbol =
+ llvm::make_unique<NativeExeSymbol>(const_cast<NativeSession &>(*this));
+ auto PdbSymbol(PDBSymbol::create(*this, std::move(RawSymbol)));
+ std::unique_ptr<PDBSymbolExe> ExeSymbol(
+ static_cast<PDBSymbolExe *>(PdbSymbol.release()));
+ return ExeSymbol;
+}
+
+std::unique_ptr<PDBSymbol>
+NativeSession::getSymbolById(uint32_t SymbolId) const {
+ return nullptr;
+}
+
+std::unique_ptr<PDBSymbol>
+NativeSession::findSymbolByAddress(uint64_t Address, PDB_SymType Type) const {
+ return nullptr;
+}
+
+std::unique_ptr<IPDBEnumLineNumbers>
+NativeSession::findLineNumbers(const PDBSymbolCompiland &Compiland,
+ const IPDBSourceFile &File) const {
+ return nullptr;
+}
+
+std::unique_ptr<IPDBEnumLineNumbers>
+NativeSession::findLineNumbersByAddress(uint64_t Address,
+ uint32_t Length) const {
+ return nullptr;
+}
+
+std::unique_ptr<IPDBEnumSourceFiles>
+NativeSession::findSourceFiles(const PDBSymbolCompiland *Compiland,
+ StringRef Pattern,
+ PDB_NameSearchFlags Flags) const {
+ return nullptr;
+}
+
+std::unique_ptr<IPDBSourceFile>
+NativeSession::findOneSourceFile(const PDBSymbolCompiland *Compiland,
+ StringRef Pattern,
+ PDB_NameSearchFlags Flags) const {
+ return nullptr;
+}
+
+std::unique_ptr<IPDBEnumChildren<PDBSymbolCompiland>>
+NativeSession::findCompilandsForSourceFile(StringRef Pattern,
+ PDB_NameSearchFlags Flags) const {
+ return nullptr;
+}
+
+std::unique_ptr<PDBSymbolCompiland>
+NativeSession::findOneCompilandForSourceFile(StringRef Pattern,
+ PDB_NameSearchFlags Flags) const {
+ return nullptr;
+}
+
+std::unique_ptr<IPDBEnumSourceFiles> NativeSession::getAllSourceFiles() const {
+ return nullptr;
+}
+
+std::unique_ptr<IPDBEnumSourceFiles> NativeSession::getSourceFilesForCompiland(
+ const PDBSymbolCompiland &Compiland) const {
+ return nullptr;
+}
+
+std::unique_ptr<IPDBSourceFile>
+NativeSession::getSourceFileById(uint32_t FileId) const {
+ return nullptr;
+}
+
+std::unique_ptr<IPDBEnumDataStreams> NativeSession::getDebugStreams() const {
+ return nullptr;
+}
diff --git a/lib/DebugInfo/PDB/Raw/PDBFile.cpp b/lib/DebugInfo/PDB/Native/PDBFile.cpp
index 53491518b8c7..943e7fa13ab7 100644
--- a/lib/DebugInfo/PDB/Raw/PDBFile.cpp
+++ b/lib/DebugInfo/PDB/Native/PDBFile.cpp
@@ -7,24 +7,25 @@
//
//===----------------------------------------------------------------------===//
+#include "llvm/DebugInfo/PDB/Native/PDBFile.h"
#include "llvm/ADT/ArrayRef.h"
#include "llvm/ADT/STLExtras.h"
-#include "llvm/DebugInfo/MSF/MappedBlockStream.h"
#include "llvm/DebugInfo/MSF/MSFCommon.h"
-#include "llvm/DebugInfo/MSF/StreamArray.h"
-#include "llvm/DebugInfo/MSF/StreamInterface.h"
-#include "llvm/DebugInfo/MSF/StreamReader.h"
-#include "llvm/DebugInfo/PDB/Raw/DbiStream.h"
-#include "llvm/DebugInfo/PDB/Raw/GlobalsStream.h"
-#include "llvm/DebugInfo/PDB/Raw/InfoStream.h"
-#include "llvm/DebugInfo/PDB/Raw/NameHashTable.h"
-#include "llvm/DebugInfo/PDB/Raw/PDBFile.h"
-#include "llvm/DebugInfo/PDB/Raw/PublicsStream.h"
-#include "llvm/DebugInfo/PDB/Raw/RawError.h"
-#include "llvm/DebugInfo/PDB/Raw/SymbolStream.h"
-#include "llvm/DebugInfo/PDB/Raw/TpiStream.h"
+#include "llvm/DebugInfo/MSF/MappedBlockStream.h"
+#include "llvm/DebugInfo/PDB/Native/DbiStream.h"
+#include "llvm/DebugInfo/PDB/Native/GlobalsStream.h"
+#include "llvm/DebugInfo/PDB/Native/InfoStream.h"
+#include "llvm/DebugInfo/PDB/Native/PublicsStream.h"
+#include "llvm/DebugInfo/PDB/Native/RawError.h"
+#include "llvm/DebugInfo/PDB/Native/StringTable.h"
+#include "llvm/DebugInfo/PDB/Native/SymbolStream.h"
+#include "llvm/DebugInfo/PDB/Native/TpiStream.h"
+#include "llvm/Support/BinaryStream.h"
+#include "llvm/Support/BinaryStreamArray.h"
+#include "llvm/Support/BinaryStreamReader.h"
#include "llvm/Support/Endian.h"
#include "llvm/Support/Error.h"
+#include "llvm/Support/Path.h"
#include <algorithm>
#include <cassert>
#include <cstdint>
@@ -38,12 +39,18 @@ namespace {
typedef FixedStreamArray<support::ulittle32_t> ulittle_array;
} // end anonymous namespace
-PDBFile::PDBFile(std::unique_ptr<ReadableStream> PdbFileBuffer,
+PDBFile::PDBFile(StringRef Path, std::unique_ptr<BinaryStream> PdbFileBuffer,
BumpPtrAllocator &Allocator)
- : Allocator(Allocator), Buffer(std::move(PdbFileBuffer)) {}
+ : FilePath(Path), Allocator(Allocator), Buffer(std::move(PdbFileBuffer)) {}
PDBFile::~PDBFile() = default;
+StringRef PDBFile::getFilePath() const { return FilePath; }
+
+StringRef PDBFile::getFileDirectory() const {
+ return sys::path::parent_path(FilePath);
+}
+
uint32_t PDBFile::getBlockSize() const { return ContainerLayout.SB->BlockSize; }
uint32_t PDBFile::getFreeBlockMapBlock() const {
@@ -106,7 +113,7 @@ Error PDBFile::setBlockData(uint32_t BlockIndex, uint32_t Offset,
}
Error PDBFile::parseFileHeaders() {
- StreamReader Reader(*Buffer);
+ BinaryStreamReader Reader(*Buffer);
// Initialize SB.
const msf::SuperBlock *SB = nullptr;
@@ -140,7 +147,7 @@ Error PDBFile::parseFileHeaders() {
// See the function fpmPn() for more information:
// https://github.com/Microsoft/microsoft-pdb/blob/master/PDB/msf/msf.cpp#L489
auto FpmStream = MappedBlockStream::createFpmStream(ContainerLayout, *Buffer);
- StreamReader FpmReader(*FpmStream);
+ BinaryStreamReader FpmReader(*FpmStream);
ArrayRef<uint8_t> FpmBytes;
if (auto EC = FpmReader.readBytes(FpmBytes,
msf::getFullFpmByteSize(ContainerLayout)))
@@ -178,7 +185,7 @@ Error PDBFile::parseStreamData() {
// subclass of IPDBStreamData which only accesses the fields that have already
// been parsed, we can avoid this and reuse MappedBlockStream.
auto DS = MappedBlockStream::createDirectoryStream(ContainerLayout, *Buffer);
- StreamReader Reader(*DS);
+ BinaryStreamReader Reader(*DS);
if (auto EC = Reader.readInteger(NumStreams))
return EC;
@@ -229,7 +236,8 @@ Expected<GlobalsStream &> PDBFile::getPDBGlobalsStream() {
auto GlobalS = safelyCreateIndexedStream(
ContainerLayout, *Buffer, DbiS->getGlobalSymbolStreamIndex());
- if (!GlobalS) return GlobalS.takeError();
+ if (!GlobalS)
+ return GlobalS.takeError();
auto TempGlobals = llvm::make_unique<GlobalsStream>(std::move(*GlobalS));
if (auto EC = TempGlobals->reload())
return std::move(EC);
@@ -241,7 +249,8 @@ Expected<GlobalsStream &> PDBFile::getPDBGlobalsStream() {
Expected<InfoStream &> PDBFile::getPDBInfoStream() {
if (!Info) {
auto InfoS = safelyCreateIndexedStream(ContainerLayout, *Buffer, StreamPDB);
- if (!InfoS) return InfoS.takeError();
+ if (!InfoS)
+ return InfoS.takeError();
auto TempInfo = llvm::make_unique<InfoStream>(std::move(*InfoS));
if (auto EC = TempInfo->reload())
return std::move(EC);
@@ -253,7 +262,8 @@ Expected<InfoStream &> PDBFile::getPDBInfoStream() {
Expected<DbiStream &> PDBFile::getPDBDbiStream() {
if (!Dbi) {
auto DbiS = safelyCreateIndexedStream(ContainerLayout, *Buffer, StreamDBI);
- if (!DbiS) return DbiS.takeError();
+ if (!DbiS)
+ return DbiS.takeError();
auto TempDbi = llvm::make_unique<DbiStream>(*this, std::move(*DbiS));
if (auto EC = TempDbi->reload())
return std::move(EC);
@@ -265,7 +275,8 @@ Expected<DbiStream &> PDBFile::getPDBDbiStream() {
Expected<TpiStream &> PDBFile::getPDBTpiStream() {
if (!Tpi) {
auto TpiS = safelyCreateIndexedStream(ContainerLayout, *Buffer, StreamTPI);
- if (!TpiS) return TpiS.takeError();
+ if (!TpiS)
+ return TpiS.takeError();
auto TempTpi = llvm::make_unique<TpiStream>(*this, std::move(*TpiS));
if (auto EC = TempTpi->reload())
return std::move(EC);
@@ -277,7 +288,8 @@ Expected<TpiStream &> PDBFile::getPDBTpiStream() {
Expected<TpiStream &> PDBFile::getPDBIpiStream() {
if (!Ipi) {
auto IpiS = safelyCreateIndexedStream(ContainerLayout, *Buffer, StreamIPI);
- if (!IpiS) return IpiS.takeError();
+ if (!IpiS)
+ return IpiS.takeError();
auto TempIpi = llvm::make_unique<TpiStream>(*this, std::move(*IpiS));
if (auto EC = TempIpi->reload())
return std::move(EC);
@@ -294,7 +306,8 @@ Expected<PublicsStream &> PDBFile::getPDBPublicsStream() {
auto PublicS = safelyCreateIndexedStream(
ContainerLayout, *Buffer, DbiS->getPublicSymbolStreamIndex());
- if (!PublicS) return PublicS.takeError();
+ if (!PublicS)
+ return PublicS.takeError();
auto TempPublics =
llvm::make_unique<PublicsStream>(*this, std::move(*PublicS));
if (auto EC = TempPublics->reload())
@@ -313,7 +326,8 @@ Expected<SymbolStream &> PDBFile::getPDBSymbolStream() {
uint32_t SymbolStreamNum = DbiS->getSymRecordStreamIndex();
auto SymbolS =
safelyCreateIndexedStream(ContainerLayout, *Buffer, SymbolStreamNum);
- if (!SymbolS) return SymbolS.takeError();
+ if (!SymbolS)
+ return SymbolS.takeError();
auto TempSymbols = llvm::make_unique<SymbolStream>(std::move(*SymbolS));
if (auto EC = TempSymbols->reload())
@@ -323,8 +337,8 @@ Expected<SymbolStream &> PDBFile::getPDBSymbolStream() {
return *Symbols;
}
-Expected<NameHashTable &> PDBFile::getStringTable() {
- if (!StringTable || !StringTableStream) {
+Expected<StringTable &> PDBFile::getStringTable() {
+ if (!Strings || !StringTableStream) {
auto IS = getPDBInfoStream();
if (!IS)
return IS.takeError();
@@ -333,23 +347,25 @@ Expected<NameHashTable &> PDBFile::getStringTable() {
auto NS =
safelyCreateIndexedStream(ContainerLayout, *Buffer, NameStreamIndex);
- if (!NS) return NS.takeError();
+ if (!NS)
+ return NS.takeError();
- StreamReader Reader(**NS);
- auto N = llvm::make_unique<NameHashTable>();
+ BinaryStreamReader Reader(**NS);
+ auto N = llvm::make_unique<StringTable>();
if (auto EC = N->load(Reader))
return std::move(EC);
- StringTable = std::move(N);
+ Strings = std::move(N);
StringTableStream = std::move(*NS);
}
- return *StringTable;
+ return *Strings;
}
bool PDBFile::hasPDBDbiStream() const { return StreamDBI < getNumStreams(); }
bool PDBFile::hasPDBGlobalsStream() {
auto DbiS = getPDBDbiStream();
- if (!DbiS) return false;
+ if (!DbiS)
+ return false;
return DbiS->getGlobalSymbolStreamIndex() < getNumStreams();
}
@@ -359,13 +375,15 @@ bool PDBFile::hasPDBIpiStream() const { return StreamIPI < getNumStreams(); }
bool PDBFile::hasPDBPublicsStream() {
auto DbiS = getPDBDbiStream();
- if (!DbiS) return false;
+ if (!DbiS)
+ return false;
return DbiS->getPublicSymbolStreamIndex() < getNumStreams();
}
bool PDBFile::hasPDBSymbolStream() {
auto DbiS = getPDBDbiStream();
- if (!DbiS) return false;
+ if (!DbiS)
+ return false;
return DbiS->getSymRecordStreamIndex() < getNumStreams();
}
@@ -373,18 +391,19 @@ bool PDBFile::hasPDBTpiStream() const { return StreamTPI < getNumStreams(); }
bool PDBFile::hasStringTable() {
auto IS = getPDBInfoStream();
- if (!IS) return false;
+ if (!IS)
+ return false;
return IS->getNamedStreamIndex("/names") < getNumStreams();
}
-/// Wrapper around MappedBlockStream::createIndexedStream()
-/// that checks if a stream with that index actually exists.
-/// If it does not, the return value will have an MSFError with
-/// code msf_error_code::no_stream. Else, the return value will
-/// contain the stream returned by createIndexedStream().
-Expected<std::unique_ptr<MappedBlockStream>> PDBFile::safelyCreateIndexedStream(
- const MSFLayout &Layout, const ReadableStream &MsfData,
- uint32_t StreamIndex) const {
+/// Wrapper around MappedBlockStream::createIndexedStream() that checks if a
+/// stream with that index actually exists. If it does not, the return value
+/// will have an MSFError with code msf_error_code::no_stream. Else, the return
+/// value will contain the stream returned by createIndexedStream().
+Expected<std::unique_ptr<MappedBlockStream>>
+PDBFile::safelyCreateIndexedStream(const MSFLayout &Layout,
+ BinaryStreamRef MsfData,
+ uint32_t StreamIndex) const {
if (StreamIndex >= getNumStreams())
return make_error<RawError>(raw_error_code::no_stream);
return MappedBlockStream::createIndexedStream(Layout, MsfData, StreamIndex);
diff --git a/lib/DebugInfo/PDB/Raw/PDBFileBuilder.cpp b/lib/DebugInfo/PDB/Native/PDBFileBuilder.cpp
index 6fec0e32a8ae..b3c84903bc7e 100644
--- a/lib/DebugInfo/PDB/Raw/PDBFileBuilder.cpp
+++ b/lib/DebugInfo/PDB/Native/PDBFileBuilder.cpp
@@ -7,21 +7,22 @@
//
//===----------------------------------------------------------------------===//
-#include "llvm/DebugInfo/PDB/Raw/PDBFileBuilder.h"
+#include "llvm/DebugInfo/PDB/Native/PDBFileBuilder.h"
#include "llvm/ADT/BitVector.h"
#include "llvm/DebugInfo/MSF/MSFBuilder.h"
-#include "llvm/DebugInfo/MSF/StreamInterface.h"
-#include "llvm/DebugInfo/MSF/StreamWriter.h"
#include "llvm/DebugInfo/PDB/GenericError.h"
-#include "llvm/DebugInfo/PDB/Raw/DbiStream.h"
-#include "llvm/DebugInfo/PDB/Raw/DbiStreamBuilder.h"
-#include "llvm/DebugInfo/PDB/Raw/InfoStream.h"
-#include "llvm/DebugInfo/PDB/Raw/InfoStreamBuilder.h"
-#include "llvm/DebugInfo/PDB/Raw/RawError.h"
-#include "llvm/DebugInfo/PDB/Raw/TpiStream.h"
-#include "llvm/DebugInfo/PDB/Raw/TpiStreamBuilder.h"
+#include "llvm/DebugInfo/PDB/Native/DbiStream.h"
+#include "llvm/DebugInfo/PDB/Native/DbiStreamBuilder.h"
+#include "llvm/DebugInfo/PDB/Native/InfoStream.h"
+#include "llvm/DebugInfo/PDB/Native/InfoStreamBuilder.h"
+#include "llvm/DebugInfo/PDB/Native/RawError.h"
+#include "llvm/DebugInfo/PDB/Native/StringTableBuilder.h"
+#include "llvm/DebugInfo/PDB/Native/TpiStream.h"
+#include "llvm/DebugInfo/PDB/Native/TpiStreamBuilder.h"
+#include "llvm/Support/BinaryStream.h"
+#include "llvm/Support/BinaryStreamWriter.h"
using namespace llvm;
using namespace llvm::codeview;
@@ -44,7 +45,7 @@ MSFBuilder &PDBFileBuilder::getMsfBuilder() { return *Msf; }
InfoStreamBuilder &PDBFileBuilder::getInfoBuilder() {
if (!Info)
- Info = llvm::make_unique<InfoStreamBuilder>(*Msf);
+ Info = llvm::make_unique<InfoStreamBuilder>(*Msf, NamedStreams);
return *Info;
}
@@ -66,7 +67,26 @@ TpiStreamBuilder &PDBFileBuilder::getIpiBuilder() {
return *Ipi;
}
-Expected<msf::MSFLayout> PDBFileBuilder::finalizeMsfLayout() const {
+StringTableBuilder &PDBFileBuilder::getStringTableBuilder() { return Strings; }
+
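+// Reserves an MSF stream of the given size and records its index in the
+// named stream map under Name.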
+Error PDBFileBuilder::addNamedStream(StringRef Name, uint32_t Size) {
+ auto ExpectedStream = Msf->addStream(Size);
+ if (!ExpectedStream)
+ return ExpectedStream.takeError();
+ NamedStreams.set(Name, *ExpectedStream);
+ return Error::success();
+}
+
+Expected<msf::MSFLayout> PDBFileBuilder::finalizeMsfLayout() {
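+  // The string table and the fixed named streams need stream indices before
+  // the MSF layout is frozen, so reserve them first.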
+ uint32_t StringTableSize = Strings.finalize();
+
+ if (auto EC = addNamedStream("/names", StringTableSize))
+ return std::move(EC);
+ if (auto EC = addNamedStream("/LinkInfo", 0))
+ return std::move(EC);
+ if (auto EC = addNamedStream("/src/headerblock", 0))
+ return std::move(EC);
+
if (Info) {
if (auto EC = Info->finalizeMsfLayout())
return std::move(EC);
@@ -98,8 +118,9 @@ Error PDBFileBuilder::commit(StringRef Filename) {
if (OutFileOrError.getError())
return llvm::make_error<pdb::GenericError>(generic_error_code::invalid_path,
Filename);
- FileBufferByteStream Buffer(std::move(*OutFileOrError));
- StreamWriter Writer(Buffer);
+ FileBufferByteStream Buffer(std::move(*OutFileOrError),
+ llvm::support::little);
+ BinaryStreamWriter Writer(Buffer);
if (auto EC = Writer.writeObject(*Layout.SB))
return EC;
@@ -111,9 +132,8 @@ Error PDBFileBuilder::commit(StringRef Filename) {
auto DirStream =
WritableMappedBlockStream::createDirectoryStream(Layout, Buffer);
- StreamWriter DW(*DirStream);
- if (auto EC =
- DW.writeInteger(static_cast<uint32_t>(Layout.StreamSizes.size())))
+ BinaryStreamWriter DW(*DirStream);
+ if (auto EC = DW.writeInteger<uint32_t>(Layout.StreamSizes.size()))
return EC;
if (auto EC = DW.writeArray(Layout.StreamSizes))
@@ -124,6 +144,16 @@ Error PDBFileBuilder::commit(StringRef Filename) {
return EC;
}
+ uint32_t StringTableStreamNo = 0;
+ if (!NamedStreams.get("/names", StringTableStreamNo))
+ return llvm::make_error<pdb::RawError>(raw_error_code::no_stream);
+
+ auto NS = WritableMappedBlockStream::createIndexedStream(Layout, Buffer,
+ StringTableStreamNo);
+ BinaryStreamWriter NSWriter(*NS);
+ if (auto EC = Strings.commit(NSWriter))
+ return EC;
+
if (Info) {
if (auto EC = Info->commit(Layout, Buffer))
return EC;
diff --git a/lib/DebugInfo/PDB/Native/PDBTypeServerHandler.cpp b/lib/DebugInfo/PDB/Native/PDBTypeServerHandler.cpp
new file mode 100644
index 000000000000..629f3e80b0ed
--- /dev/null
+++ b/lib/DebugInfo/PDB/Native/PDBTypeServerHandler.cpp
@@ -0,0 +1,119 @@
+//===- PDBTypeServerHandler.cpp ---------------------------------*- C++ -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+// Handles CodeView LF_TYPESERVER2 records by attempting to locate a matching
+// PDB file, then loading the PDB file and visiting all types from the
+// referenced PDB using the original supplied visitor.
+//
+// The net effect of this is that when visiting a PDB containing a TypeServer
+// record, the TypeServer record is "replaced" with all of the records in
+// the referenced PDB file. If a single instance of PDBTypeServerHandler
+// encounters the same TypeServer multiple times (for example, when it is
+// reused across visitations of distinct object files or PDB files), it will
+// either revisit all the records again or simply consume the record and do
+// nothing, depending on the RevisitAlways setting.
+//===----------------------------------------------------------------------===//
+
+#include "llvm/DebugInfo/PDB/Native/PDBTypeServerHandler.h"
+
+#include "llvm/DebugInfo/CodeView/CodeViewError.h"
+#include "llvm/DebugInfo/PDB/GenericError.h"
+#include "llvm/DebugInfo/PDB/Native/InfoStream.h"
+#include "llvm/DebugInfo/PDB/Native/NativeSession.h"
+#include "llvm/DebugInfo/PDB/Native/PDBFile.h"
+#include "llvm/DebugInfo/PDB/Native/TpiStream.h"
+#include "llvm/DebugInfo/PDB/PDB.h"
+#include "llvm/Support/FileSystem.h"
+#include "llvm/Support/Path.h"
+
+using namespace llvm;
+using namespace llvm::codeview;
+using namespace llvm::pdb;
+
+static void ignoreErrors(Error EC) {
+ llvm::handleAllErrors(std::move(EC), [&](ErrorInfoBase &EIB) {});
+}
+
+PDBTypeServerHandler::PDBTypeServerHandler(bool RevisitAlways)
+ : RevisitAlways(RevisitAlways) {}
+
+void PDBTypeServerHandler::addSearchPath(StringRef Path) {
+ if (Path.empty() || !sys::fs::is_directory(Path))
+ return;
+
+ SearchPaths.push_back(Path);
+}
+
+Expected<bool>
+PDBTypeServerHandler::handleInternal(PDBFile &File,
+ TypeVisitorCallbacks &Callbacks) {
+ auto ExpectedTpi = File.getPDBTpiStream();
+ if (!ExpectedTpi)
+ return ExpectedTpi.takeError();
+ CVTypeVisitor Visitor(Callbacks);
+
+ if (auto EC = Visitor.visitTypeStream(ExpectedTpi->types(nullptr)))
+ return std::move(EC);
+
+ return true;
+}
+
+Expected<bool> PDBTypeServerHandler::handle(TypeServer2Record &TS,
+ TypeVisitorCallbacks &Callbacks) {
+ if (Session) {
+ // If we've already handled this TypeServer and we only want to handle each
+ // TypeServer once, consume the record without doing anything.
+ if (!RevisitAlways)
+ return true;
+
+ return handleInternal(Session->getPDBFile(), Callbacks);
+ }
+
+ StringRef File = sys::path::filename(TS.Name);
+ if (File.empty())
+ return make_error<CodeViewError>(
+ cv_error_code::corrupt_record,
+ "TypeServer2Record does not contain filename!");
+
+ for (auto Path : SearchPaths) {
+ sys::path::append(Path, File);
+ if (!sys::fs::exists(Path))
+ continue;
+
+ std::unique_ptr<IPDBSession> ThisSession;
+ if (auto EC = loadDataForPDB(PDB_ReaderType::Native, Path, ThisSession)) {
+      // It is not an error if this PDB fails to load; it just means that it
+      // doesn't match, and we should continue searching.
+ ignoreErrors(std::move(EC));
+ continue;
+ }
+
+ std::unique_ptr<NativeSession> NS(
+ static_cast<NativeSession *>(ThisSession.release()));
+    PDBFile &PdbFile = NS->getPDBFile();
+    auto ExpectedInfo = PdbFile.getPDBInfoStream();
+ // All PDB Files should have an Info stream.
+ if (!ExpectedInfo)
+ return ExpectedInfo.takeError();
+
+    // Just because a file with a matching name was found and it was an actual
+    // PDB file doesn't mean it matches. For it to match, the InfoStream's
+    // GUID must match the GUID specified in the TypeServer2 record.
+ ArrayRef<uint8_t> GuidBytes(ExpectedInfo->getGuid().Guid);
+ StringRef GuidStr(reinterpret_cast<const char *>(GuidBytes.begin()),
+ GuidBytes.size());
+ if (GuidStr != TS.Guid)
+ continue;
+
+ Session = std::move(NS);
+    return handleInternal(PdbFile, Callbacks);
+ }
+
+ // We couldn't find a matching PDB, so let it be handled by someone else.
+ return false;
+}
diff --git a/lib/DebugInfo/PDB/Raw/PublicsStream.cpp b/lib/DebugInfo/PDB/Native/PublicsStream.cpp
index b31f605a078c..58202577672a 100644
--- a/lib/DebugInfo/PDB/Raw/PublicsStream.cpp
+++ b/lib/DebugInfo/PDB/Native/PublicsStream.cpp
@@ -22,15 +22,15 @@
//
//===----------------------------------------------------------------------===//
+#include "llvm/DebugInfo/PDB/Native/PublicsStream.h"
#include "GSI.h"
#include "llvm/ADT/iterator_range.h"
#include "llvm/DebugInfo/CodeView/SymbolRecord.h"
#include "llvm/DebugInfo/MSF/MappedBlockStream.h"
-#include "llvm/DebugInfo/MSF/StreamReader.h"
-#include "llvm/DebugInfo/PDB/Raw/PDBFile.h"
-#include "llvm/DebugInfo/PDB/Raw/PublicsStream.h"
-#include "llvm/DebugInfo/PDB/Raw/RawError.h"
-#include "llvm/DebugInfo/PDB/Raw/SymbolStream.h"
+#include "llvm/DebugInfo/PDB/Native/PDBFile.h"
+#include "llvm/DebugInfo/PDB/Native/RawError.h"
+#include "llvm/DebugInfo/PDB/Native/SymbolStream.h"
+#include "llvm/Support/BinaryStreamReader.h"
#include "llvm/Support/Endian.h"
#include "llvm/Support/Error.h"
#include <algorithm>
@@ -69,7 +69,7 @@ uint32_t PublicsStream::getAddrMap() const { return Header->AddrMap; }
// we skip over the hash table which we believe contains information about
// public symbols.
Error PublicsStream::reload() {
- StreamReader Reader(*Stream);
+ BinaryStreamReader Reader(*Stream);
// Check stream size.
if (Reader.bytesRemaining() < sizeof(HeaderInfo) + sizeof(GSIHashHeader))
diff --git a/lib/DebugInfo/PDB/Raw/RawError.cpp b/lib/DebugInfo/PDB/Native/RawError.cpp
index f4a5057509eb..548289fff3df 100644
--- a/lib/DebugInfo/PDB/Raw/RawError.cpp
+++ b/lib/DebugInfo/PDB/Native/RawError.cpp
@@ -1,4 +1,4 @@
-#include "llvm/DebugInfo/PDB/Raw/RawError.h"
+#include "llvm/DebugInfo/PDB/Native/RawError.h"
#include "llvm/Support/ErrorHandling.h"
#include "llvm/Support/ManagedStatic.h"
@@ -38,6 +38,8 @@ public:
return "The entry does not exist.";
case raw_error_code::not_writable:
return "The PDB does not support writing.";
+ case raw_error_code::stream_too_long:
+ return "The stream was longer than expected.";
case raw_error_code::invalid_tpi_hash:
return "The Type record has an invalid hash value.";
}
diff --git a/lib/DebugInfo/PDB/Raw/NameHashTable.cpp b/lib/DebugInfo/PDB/Native/StringTable.cpp
index 84cccb354bd8..7e28389b8383 100644
--- a/lib/DebugInfo/PDB/Raw/NameHashTable.cpp
+++ b/lib/DebugInfo/PDB/Native/StringTable.cpp
@@ -1,4 +1,4 @@
-//===- NameHashTable.cpp - PDB Name Hash Table ------------------*- C++ -*-===//
+//===- StringTable.cpp - PDB String Table -----------------------*- C++ -*-===//
//
// The LLVM Compiler Infrastructure
//
@@ -7,33 +7,29 @@
//
//===----------------------------------------------------------------------===//
-#include "llvm/DebugInfo/PDB/Raw/NameHashTable.h"
+#include "llvm/DebugInfo/PDB/Native/StringTable.h"
#include "llvm/ADT/ArrayRef.h"
-#include "llvm/DebugInfo/MSF/StreamReader.h"
-#include "llvm/DebugInfo/PDB/Raw/Hash.h"
-#include "llvm/DebugInfo/PDB/Raw/RawError.h"
+#include "llvm/DebugInfo/PDB/Native/Hash.h"
+#include "llvm/DebugInfo/PDB/Native/RawError.h"
+#include "llvm/DebugInfo/PDB/Native/RawTypes.h"
+#include "llvm/Support/BinaryStreamReader.h"
#include "llvm/Support/Endian.h"
using namespace llvm;
-using namespace llvm::msf;
using namespace llvm::support;
using namespace llvm::pdb;
-NameHashTable::NameHashTable() : Signature(0), HashVersion(0), NameCount(0) {}
+StringTable::StringTable() {}
-Error NameHashTable::load(StreamReader &Stream) {
- struct Header {
- support::ulittle32_t Signature;
- support::ulittle32_t HashVersion;
- support::ulittle32_t ByteSize;
- };
+Error StringTable::load(BinaryStreamReader &Stream) {
+ ByteSize = Stream.getLength();
- const Header *H;
+ const StringTableHeader *H;
if (auto EC = Stream.readObject(H))
return EC;
- if (H->Signature != 0xEFFEEFFE)
+ if (H->Signature != StringTableSignature)
return make_error<RawError>(raw_error_code::corrupt_file,
"Invalid hash table signature");
if (H->HashVersion != 1 && H->HashVersion != 2)
@@ -62,10 +58,19 @@ Error NameHashTable::load(StreamReader &Stream) {
if (auto EC = Stream.readInteger(NameCount))
return EC;
+
+ if (Stream.bytesRemaining() > 0)
+ return make_error<RawError>(raw_error_code::stream_too_long,
+ "Unexpected bytes found in string table");
+
return Error::success();
}
-StringRef NameHashTable::getStringForID(uint32_t ID) const {
+uint32_t StringTable::getByteSize() const {
+ return ByteSize;
+}
+
+StringRef StringTable::getStringForID(uint32_t ID) const {
if (ID == IDs[0])
return StringRef();
@@ -73,14 +78,14 @@ StringRef NameHashTable::getStringForID(uint32_t ID) const {
// the starting offset of the string we're looking for. So just seek to
// the desired offset and read a null-terminated string from that offset.
StringRef Result;
- StreamReader NameReader(NamesBuffer);
+ BinaryStreamReader NameReader(NamesBuffer);
NameReader.setOffset(ID);
- if (auto EC = NameReader.readZeroString(Result))
+ if (auto EC = NameReader.readCString(Result))
consumeError(std::move(EC));
return Result;
}
-uint32_t NameHashTable::getIDForString(StringRef Str) const {
+uint32_t StringTable::getIDForString(StringRef Str) const {
uint32_t Hash = (HashVersion == 1) ? hashStringV1(Str) : hashStringV2(Str);
size_t Count = IDs.size();
uint32_t Start = Hash % Count;
@@ -99,6 +104,6 @@ uint32_t NameHashTable::getIDForString(StringRef Str) const {
return IDs[0];
}
-FixedStreamArray<support::ulittle32_t> NameHashTable::name_ids() const {
+FixedStreamArray<support::ulittle32_t> StringTable::name_ids() const {
return IDs;
}
diff --git a/lib/DebugInfo/PDB/Native/StringTableBuilder.cpp b/lib/DebugInfo/PDB/Native/StringTableBuilder.cpp
new file mode 100644
index 000000000000..e0f8370ab608
--- /dev/null
+++ b/lib/DebugInfo/PDB/Native/StringTableBuilder.cpp
@@ -0,0 +1,102 @@
+//===- StringTableBuilder.cpp - PDB String Table ----------------*- C++ -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/DebugInfo/PDB/Native/StringTableBuilder.h"
+#include "llvm/ADT/ArrayRef.h"
+#include "llvm/DebugInfo/PDB/Native/Hash.h"
+#include "llvm/DebugInfo/PDB/Native/RawTypes.h"
+#include "llvm/Support/BinaryStreamWriter.h"
+#include "llvm/Support/Endian.h"
+
+using namespace llvm;
+using namespace llvm::support;
+using namespace llvm::support::endian;
+using namespace llvm::pdb;
+
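+// Returns the offset of S within the string buffer. Inserting a string that
+// is already present returns its existing offset without growing the table.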
+uint32_t StringTableBuilder::insert(StringRef S) {
+ auto P = Strings.insert({S, StringSize});
+
+ // If a given string didn't exist in the string table, we want to increment
+ // the string table size.
+ if (P.second)
+ StringSize += S.size() + 1; // +1 for '\0'
+ return P.first->second;
+}
+
+static uint32_t computeBucketCount(uint32_t NumStrings) {
+ // The /names stream is basically an on-disk open-addressing hash table.
+ // Hash collisions are resolved by linear probing. We cannot make
+ // utilization 100% because it will make the linear probing extremely
+ // slow. But lower utilization wastes disk space. As a reasonable
+ // load factor, we choose 80%. We need +1 because slot 0 is reserved.
+ return (NumStrings + 1) * 1.25;
+}
+
+uint32_t StringTableBuilder::finalize() {
+ uint32_t Size = 0;
+ Size += sizeof(StringTableHeader);
+ Size += StringSize;
+ Size += sizeof(uint32_t); // Hash table begins with 4-byte size field.
+
+ uint32_t BucketCount = computeBucketCount(Strings.size());
+ Size += BucketCount * sizeof(uint32_t);
+
+ Size +=
+ sizeof(uint32_t); // The /names stream ends with the number of strings.
+ return Size;
+}
+
+Error StringTableBuilder::commit(BinaryStreamWriter &Writer) const {
+ // Write a header
+ StringTableHeader H;
+ H.Signature = StringTableSignature;
+ H.HashVersion = 1;
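+ // Hash version 1 corresponds to hashStringV1, which is also used to place
+ // the bucket entries written below.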
+ H.ByteSize = StringSize;
+ if (auto EC = Writer.writeObject(H))
+ return EC;
+
+ // Write a string table.
+ uint32_t StringStart = Writer.getOffset();
+ for (auto Pair : Strings) {
+ StringRef S = Pair.first;
+ uint32_t Offset = Pair.second;
+ Writer.setOffset(StringStart + Offset);
+ if (auto EC = Writer.writeCString(S))
+ return EC;
+ }
+ Writer.setOffset(StringStart + StringSize);
+
+ // Write a hash table.
+ uint32_t BucketCount = computeBucketCount(Strings.size());
+ if (auto EC = Writer.writeInteger(BucketCount))
+ return EC;
+ std::vector<ulittle32_t> Buckets(BucketCount);
+
+ for (auto Pair : Strings) {
+ StringRef S = Pair.first;
+ uint32_t Offset = Pair.second;
+ uint32_t Hash = hashStringV1(S);
+
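+ // Probe linearly from the hash position until a free, non-reserved slot
+ // is found.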
+ for (uint32_t I = 0; I != BucketCount; ++I) {
+ uint32_t Slot = (Hash + I) % BucketCount;
+ if (Slot == 0)
+ continue; // Skip reserved slot
+ if (Buckets[Slot] != 0)
+ continue;
+ Buckets[Slot] = Offset;
+ break;
+ }
+ }
+
+ if (auto EC = Writer.writeArray(ArrayRef<ulittle32_t>(Buckets)))
+ return EC;
+ if (auto EC = Writer.writeInteger(static_cast<uint32_t>(Strings.size())))
+ return EC;
+ return Error::success();
+}
diff --git a/lib/DebugInfo/PDB/Raw/SymbolStream.cpp b/lib/DebugInfo/PDB/Native/SymbolStream.cpp
index 2f3ac3497f39..9e9ebd11495b 100644
--- a/lib/DebugInfo/PDB/Raw/SymbolStream.cpp
+++ b/lib/DebugInfo/PDB/Native/SymbolStream.cpp
@@ -7,16 +7,15 @@
//
//===----------------------------------------------------------------------===//
-#include "llvm/DebugInfo/PDB/Raw/SymbolStream.h"
+#include "llvm/DebugInfo/PDB/Native/SymbolStream.h"
#include "llvm/DebugInfo/CodeView/CodeView.h"
#include "llvm/DebugInfo/CodeView/TypeRecord.h"
#include "llvm/DebugInfo/MSF/MappedBlockStream.h"
-#include "llvm/DebugInfo/MSF/StreamReader.h"
-#include "llvm/DebugInfo/PDB/Raw/PDBFile.h"
-#include "llvm/DebugInfo/PDB/Raw/RawConstants.h"
-#include "llvm/DebugInfo/PDB/Raw/RawError.h"
-
+#include "llvm/DebugInfo/PDB/Native/PDBFile.h"
+#include "llvm/DebugInfo/PDB/Native/RawConstants.h"
+#include "llvm/DebugInfo/PDB/Native/RawError.h"
+#include "llvm/Support/BinaryStreamReader.h"
#include "llvm/Support/Endian.h"
using namespace llvm;
@@ -30,7 +29,7 @@ SymbolStream::SymbolStream(std::unique_ptr<MappedBlockStream> Stream)
SymbolStream::~SymbolStream() {}
Error SymbolStream::reload() {
- StreamReader Reader(*Stream);
+ BinaryStreamReader Reader(*Stream);
if (auto EC = Reader.readArray(SymbolRecords, Stream->getLength()))
return EC;
diff --git a/lib/DebugInfo/PDB/Raw/TpiHashing.cpp b/lib/DebugInfo/PDB/Native/TpiHashing.cpp
index 6c3ddb3d57af..16904a5a27ed 100644
--- a/lib/DebugInfo/PDB/Raw/TpiHashing.cpp
+++ b/lib/DebugInfo/PDB/Native/TpiHashing.cpp
@@ -7,10 +7,10 @@
//
//===----------------------------------------------------------------------===//
-#include "llvm/DebugInfo/PDB/Raw/TpiHashing.h"
+#include "llvm/DebugInfo/PDB/Native/TpiHashing.h"
-#include "llvm/DebugInfo/PDB/Raw/Hash.h"
-#include "llvm/DebugInfo/PDB/Raw/RawError.h"
+#include "llvm/DebugInfo/PDB/Native/Hash.h"
+#include "llvm/DebugInfo/PDB/Native/RawError.h"
using namespace llvm;
using namespace llvm::codeview;
diff --git a/lib/DebugInfo/PDB/Raw/TpiStream.cpp b/lib/DebugInfo/PDB/Native/TpiStream.cpp
index a1167cd98454..5fef3edf8c2d 100644
--- a/lib/DebugInfo/PDB/Raw/TpiStream.cpp
+++ b/lib/DebugInfo/PDB/Native/TpiStream.cpp
@@ -7,19 +7,20 @@
//
//===----------------------------------------------------------------------===//
+#include "llvm/DebugInfo/PDB/Native/TpiStream.h"
#include "llvm/ADT/iterator_range.h"
#include "llvm/DebugInfo/CodeView/CVTypeVisitor.h"
#include "llvm/DebugInfo/CodeView/TypeDeserializer.h"
#include "llvm/DebugInfo/CodeView/TypeRecord.h"
#include "llvm/DebugInfo/CodeView/TypeVisitorCallbackPipeline.h"
#include "llvm/DebugInfo/MSF/MappedBlockStream.h"
-#include "llvm/DebugInfo/MSF/StreamReader.h"
-#include "llvm/DebugInfo/PDB/Raw/PDBFile.h"
-#include "llvm/DebugInfo/PDB/Raw/RawConstants.h"
-#include "llvm/DebugInfo/PDB/Raw/RawError.h"
-#include "llvm/DebugInfo/PDB/Raw/RawTypes.h"
-#include "llvm/DebugInfo/PDB/Raw/TpiHashing.h"
-#include "llvm/DebugInfo/PDB/Raw/TpiStream.h"
+#include "llvm/DebugInfo/PDB/Native/PDBFile.h"
+#include "llvm/DebugInfo/PDB/Native/PDBTypeServerHandler.h"
+#include "llvm/DebugInfo/PDB/Native/RawConstants.h"
+#include "llvm/DebugInfo/PDB/Native/RawError.h"
+#include "llvm/DebugInfo/PDB/Native/RawTypes.h"
+#include "llvm/DebugInfo/PDB/Native/TpiHashing.h"
+#include "llvm/Support/BinaryStreamReader.h"
#include "llvm/Support/Endian.h"
#include "llvm/Support/Error.h"
#include <algorithm>
@@ -53,7 +54,7 @@ Error TpiStream::verifyHashValues() {
}
Error TpiStream::reload() {
- StreamReader Reader(*Stream);
+ BinaryStreamReader Reader(*Stream);
if (Reader.bytesRemaining() < sizeof(TpiStreamHeader))
return make_error<RawError>(raw_error_code::corrupt_file,
@@ -92,11 +93,12 @@ Error TpiStream::reload() {
auto HS = MappedBlockStream::createIndexedStream(
Pdb.getMsfLayout(), Pdb.getMsfBuffer(), Header->HashStreamIndex);
- StreamReader HSR(*HS);
+ BinaryStreamReader HSR(*HS);
+ // There should be a hash value for every type record, or no hashes at all.
uint32_t NumHashValues =
Header->HashValueBuffer.Length / sizeof(ulittle32_t);
- if (NumHashValues != NumTypeRecords())
+ if (NumHashValues != NumTypeRecords() && NumHashValues != 0)
return make_error<RawError>(
raw_error_code::corrupt_file,
"TPI hash count does not match with the number of type records.");
@@ -113,18 +115,19 @@ Error TpiStream::reload() {
if (auto EC = HSR.readArray(TypeIndexOffsets, NumTypeIndexOffsets))
return EC;
- HSR.setOffset(Header->HashAdjBuffer.Off);
- uint32_t NumHashAdjustments =
- Header->HashAdjBuffer.Length / sizeof(TypeIndexOffset);
- if (auto EC = HSR.readArray(HashAdjustments, NumHashAdjustments))
- return EC;
+ if (Header->HashAdjBuffer.Length > 0) {
+ HSR.setOffset(Header->HashAdjBuffer.Off);
+ if (auto EC = HashAdjusters.load(HSR))
+ return EC;
+ }
HashStream = std::move(HS);
// The TPI hash table is a parallel array to the type records.
// Verify that the hash values match the type records.
- if (auto EC = verifyHashValues())
- return EC;
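+ // Hash values are optional; only verify them when the producer emitted any.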
+ if (NumHashValues > 0)
+ if (auto EC = verifyHashValues())
+ return EC;
}
return Error::success();
@@ -154,23 +157,17 @@ uint16_t TpiStream::getTypeHashStreamAuxIndex() const {
uint32_t TpiStream::NumHashBuckets() const { return Header->NumHashBuckets; }
uint32_t TpiStream::getHashKeySize() const { return Header->HashKeySize; }
-FixedStreamArray<support::ulittle32_t>
-TpiStream::getHashValues() const {
+FixedStreamArray<support::ulittle32_t> TpiStream::getHashValues() const {
return HashValues;
}
-FixedStreamArray<TypeIndexOffset>
-TpiStream::getTypeIndexOffsets() const {
+FixedStreamArray<TypeIndexOffset> TpiStream::getTypeIndexOffsets() const {
return TypeIndexOffsets;
}
-FixedStreamArray<TypeIndexOffset>
-TpiStream::getHashAdjustments() const {
- return HashAdjustments;
-}
+HashTable &TpiStream::getHashAdjusters() { return HashAdjusters; }
-iterator_range<CVTypeArray::Iterator>
-TpiStream::types(bool *HadError) const {
+CVTypeRange TpiStream::types(bool *HadError) const {
return make_range(TypeRecords.begin(HadError), TypeRecords.end());
}
diff --git a/lib/DebugInfo/PDB/Raw/TpiStreamBuilder.cpp b/lib/DebugInfo/PDB/Native/TpiStreamBuilder.cpp
index c769321f18c1..375c35b11145 100644
--- a/lib/DebugInfo/PDB/Raw/TpiStreamBuilder.cpp
+++ b/lib/DebugInfo/PDB/Native/TpiStreamBuilder.cpp
@@ -7,22 +7,22 @@
//
//===----------------------------------------------------------------------===//
+#include "llvm/DebugInfo/PDB/Native/TpiStreamBuilder.h"
#include "llvm/ADT/ArrayRef.h"
#include "llvm/ADT/STLExtras.h"
#include "llvm/DebugInfo/CodeView/TypeIndex.h"
#include "llvm/DebugInfo/CodeView/TypeRecord.h"
-#include "llvm/DebugInfo/MSF/ByteStream.h"
#include "llvm/DebugInfo/MSF/MSFBuilder.h"
#include "llvm/DebugInfo/MSF/MappedBlockStream.h"
-#include "llvm/DebugInfo/MSF/StreamArray.h"
-#include "llvm/DebugInfo/MSF/StreamReader.h"
-#include "llvm/DebugInfo/MSF/StreamWriter.h"
-#include "llvm/DebugInfo/PDB/Raw/PDBFile.h"
-#include "llvm/DebugInfo/PDB/Raw/RawError.h"
-#include "llvm/DebugInfo/PDB/Raw/RawTypes.h"
-#include "llvm/DebugInfo/PDB/Raw/TpiStream.h"
-#include "llvm/DebugInfo/PDB/Raw/TpiStreamBuilder.h"
+#include "llvm/DebugInfo/PDB/Native/PDBFile.h"
+#include "llvm/DebugInfo/PDB/Native/RawError.h"
+#include "llvm/DebugInfo/PDB/Native/RawTypes.h"
+#include "llvm/DebugInfo/PDB/Native/TpiStream.h"
#include "llvm/Support/Allocator.h"
+#include "llvm/Support/BinaryByteStream.h"
+#include "llvm/Support/BinaryStreamArray.h"
+#include "llvm/Support/BinaryStreamReader.h"
+#include "llvm/Support/BinaryStreamWriter.h"
#include "llvm/Support/Endian.h"
#include "llvm/Support/Error.h"
#include <algorithm>
@@ -43,9 +43,22 @@ void TpiStreamBuilder::setVersionHeader(PdbRaw_TpiVer Version) {
VerHeader = Version;
}
-void TpiStreamBuilder::addTypeRecord(const codeview::CVType &Record) {
+void TpiStreamBuilder::addTypeRecord(ArrayRef<uint8_t> Record,
+ Optional<uint32_t> Hash) {
+ // If we just crossed an 8KB threshold, add a type index offset.
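+ // Readers use these (TypeIndex, Offset) pairs to seek close to a record
+ // by index instead of scanning the stream from the beginning.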
+ size_t NewSize = TypeRecordBytes + Record.size();
+ constexpr size_t EightKB = 8 * 1024;
+ if (NewSize / EightKB > TypeRecordBytes / EightKB || TypeRecords.empty()) {
+ TypeIndexOffsets.push_back(
+ {codeview::TypeIndex(codeview::TypeIndex::FirstNonSimpleIndex +
+ TypeRecords.size()),
+ ulittle32_t(TypeRecordBytes)});
+ }
+ TypeRecordBytes = NewSize;
+
TypeRecords.push_back(Record);
- TypeRecordStream.setItems(TypeRecords);
+ if (Hash)
+ TypeHashes.push_back(*Hash);
}
Error TpiStreamBuilder::finalize() {
@@ -55,13 +68,12 @@ Error TpiStreamBuilder::finalize() {
TpiStreamHeader *H = Allocator.Allocate<TpiStreamHeader>();
uint32_t Count = TypeRecords.size();
- uint32_t HashBufferSize = calculateHashBufferSize();
H->Version = *VerHeader;
H->HeaderSize = sizeof(TpiStreamHeader);
H->TypeIndexBegin = codeview::TypeIndex::FirstNonSimpleIndex;
H->TypeIndexEnd = H->TypeIndexBegin + Count;
- H->TypeRecordBytes = TypeRecordStream.getLength();
+ H->TypeRecordBytes = TypeRecordBytes;
H->HashStreamIndex = HashStreamIndex;
H->HashAuxStreamIndex = kInvalidStreamIndex;
@@ -72,24 +84,32 @@ Error TpiStreamBuilder::finalize() {
// the `HashStreamIndex` field of the `TpiStreamHeader`. Therefore, the data
// begins at offset 0 of this independent stream.
H->HashValueBuffer.Off = 0;
- H->HashValueBuffer.Length = HashBufferSize;
+ H->HashValueBuffer.Length = calculateHashBufferSize();
+
+ // We never write any hash adjustments into our PDBs, so this buffer is an
+ // offset with zero length.
H->HashAdjBuffer.Off = H->HashValueBuffer.Off + H->HashValueBuffer.Length;
H->HashAdjBuffer.Length = 0;
+
H->IndexOffsetBuffer.Off = H->HashAdjBuffer.Off + H->HashAdjBuffer.Length;
- H->IndexOffsetBuffer.Length = 0;
+ H->IndexOffsetBuffer.Length = calculateIndexOffsetSize();
Header = H;
return Error::success();
}
-uint32_t TpiStreamBuilder::calculateSerializedLength() const {
- return sizeof(TpiStreamHeader) + TypeRecordStream.getLength();
+uint32_t TpiStreamBuilder::calculateSerializedLength() {
+ return sizeof(TpiStreamHeader) + TypeRecordBytes;
}
uint32_t TpiStreamBuilder::calculateHashBufferSize() const {
- if (TypeRecords.empty() || !TypeRecords[0].Hash.hasValue())
- return 0;
- return TypeRecords.size() * sizeof(ulittle32_t);
+ assert((TypeRecords.size() == TypeHashes.size() || TypeHashes.empty()) &&
+ "either all or no type records should have hashes");
+ return TypeHashes.size() * sizeof(ulittle32_t);
+}
+
+uint32_t TpiStreamBuilder::calculateIndexOffsetSize() const {
+ return TypeIndexOffsets.size() * sizeof(TypeIndexOffset);
}
Error TpiStreamBuilder::finalizeMsfLayout() {
@@ -97,48 +117,60 @@ Error TpiStreamBuilder::finalizeMsfLayout() {
if (auto EC = Msf.setStreamSize(Idx, Length))
return EC;
- uint32_t HashBufferSize = calculateHashBufferSize();
+ uint32_t HashStreamSize =
+ calculateHashBufferSize() + calculateIndexOffsetSize();
- if (HashBufferSize == 0)
+ if (HashStreamSize == 0)
return Error::success();
- auto ExpectedIndex = Msf.addStream(HashBufferSize);
+ auto ExpectedIndex = Msf.addStream(HashStreamSize);
if (!ExpectedIndex)
return ExpectedIndex.takeError();
HashStreamIndex = *ExpectedIndex;
- ulittle32_t *H = Allocator.Allocate<ulittle32_t>(TypeRecords.size());
- MutableArrayRef<ulittle32_t> HashBuffer(H, TypeRecords.size());
- for (uint32_t I = 0; I < TypeRecords.size(); ++I) {
- HashBuffer[I] = *TypeRecords[I].Hash % MinTpiHashBuckets;
+ if (!TypeHashes.empty()) {
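+ // On-disk hash values are stored pre-reduced modulo the bucket count.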
+ ulittle32_t *H = Allocator.Allocate<ulittle32_t>(TypeHashes.size());
+ MutableArrayRef<ulittle32_t> HashBuffer(H, TypeHashes.size());
+ for (uint32_t I = 0; I < TypeHashes.size(); ++I) {
+ HashBuffer[I] = TypeHashes[I] % MinTpiHashBuckets;
+ }
+ ArrayRef<uint8_t> Bytes(
+ reinterpret_cast<const uint8_t *>(HashBuffer.data()),
+ calculateHashBufferSize());
+ HashValueStream =
+ llvm::make_unique<BinaryByteStream>(Bytes, llvm::support::little);
}
- ArrayRef<uint8_t> Bytes(reinterpret_cast<const uint8_t *>(HashBuffer.data()),
- HashBufferSize);
- HashValueStream = llvm::make_unique<ByteStream>(Bytes);
return Error::success();
}
Error TpiStreamBuilder::commit(const msf::MSFLayout &Layout,
- const msf::WritableStream &Buffer) {
+ WritableBinaryStreamRef Buffer) {
if (auto EC = finalize())
return EC;
auto InfoS =
WritableMappedBlockStream::createIndexedStream(Layout, Buffer, Idx);
- StreamWriter Writer(*InfoS);
+ BinaryStreamWriter Writer(*InfoS);
if (auto EC = Writer.writeObject(*Header))
return EC;
- auto RecordArray = VarStreamArray<codeview::CVType>(TypeRecordStream);
- if (auto EC = Writer.writeArray(RecordArray))
- return EC;
+ for (auto Rec : TypeRecords)
+ if (auto EC = Writer.writeBytes(Rec))
+ return EC;
if (HashStreamIndex != kInvalidStreamIndex) {
auto HVS = WritableMappedBlockStream::createIndexedStream(Layout, Buffer,
HashStreamIndex);
- StreamWriter HW(*HVS);
- if (auto EC = HW.writeStreamRef(*HashValueStream))
- return EC;
+ BinaryStreamWriter HW(*HVS);
+ if (HashValueStream) {
+ if (auto EC = HW.writeStreamRef(*HashValueStream))
+ return EC;
+ }
+
+ for (auto &IndexOffset : TypeIndexOffsets) {
+ if (auto EC = HW.writeObject(IndexOffset))
+ return EC;
+ }
}
return Error::success();
diff --git a/lib/DebugInfo/PDB/PDB.cpp b/lib/DebugInfo/PDB/PDB.cpp
index 0d720591b81d..7e3acc1165f3 100644
--- a/lib/DebugInfo/PDB/PDB.cpp
+++ b/lib/DebugInfo/PDB/PDB.cpp
@@ -17,7 +17,7 @@
#if LLVM_ENABLE_DIA_SDK
#include "llvm/DebugInfo/PDB/DIA/DIASession.h"
#endif
-#include "llvm/DebugInfo/PDB/Raw/RawSession.h"
+#include "llvm/DebugInfo/PDB/Native/NativeSession.h"
#include "llvm/Support/ErrorHandling.h"
#include "llvm/Support/ManagedStatic.h"
@@ -27,8 +27,8 @@ using namespace llvm::pdb;
Error llvm::pdb::loadDataForPDB(PDB_ReaderType Type, StringRef Path,
std::unique_ptr<IPDBSession> &Session) {
// Create the correct concrete instance type based on the value of Type.
- if (Type == PDB_ReaderType::Raw)
- return RawSession::createFromPdb(Path, Session);
+ if (Type == PDB_ReaderType::Native)
+ return NativeSession::createFromPdb(Path, Session);
#if LLVM_ENABLE_DIA_SDK
return DIASession::createFromPdb(Path, Session);
@@ -40,8 +40,8 @@ Error llvm::pdb::loadDataForPDB(PDB_ReaderType Type, StringRef Path,
Error llvm::pdb::loadDataForEXE(PDB_ReaderType Type, StringRef Path,
std::unique_ptr<IPDBSession> &Session) {
// Create the correct concrete instance type based on the value of Type.
- if (Type == PDB_ReaderType::Raw)
- return RawSession::createFromExe(Path, Session);
+ if (Type == PDB_ReaderType::Native)
+ return NativeSession::createFromExe(Path, Session);
#if LLVM_ENABLE_DIA_SDK
return DIASession::createFromExe(Path, Session);
diff --git a/lib/DebugInfo/PDB/PDBExtras.cpp b/lib/DebugInfo/PDB/PDBExtras.cpp
index b7eee6e53941..dc22a30facab 100644
--- a/lib/DebugInfo/PDB/PDBExtras.cpp
+++ b/lib/DebugInfo/PDB/PDBExtras.cpp
@@ -10,6 +10,7 @@
#include "llvm/DebugInfo/PDB/PDBExtras.h"
#include "llvm/ADT/ArrayRef.h"
+#include "llvm/DebugInfo/CodeView/Formatters.h"
using namespace llvm;
using namespace llvm::pdb;
@@ -259,6 +260,12 @@ raw_ostream &llvm::pdb::operator<<(raw_ostream &OS,
return OS;
}
+raw_ostream &llvm::pdb::operator<<(raw_ostream &OS, const PDB_UniqueId &Guid) {
+ codeview::detail::GuidAdapter A(Guid.Guid);
+ A.format(OS, "");
+ return OS;
+}
+
raw_ostream &llvm::pdb::operator<<(raw_ostream &OS, const PDB_UdtType &Type) {
switch (Type) {
CASE_OUTPUT_ENUM_CLASS_STR(PDB_UdtType, Class, "class", OS)
@@ -269,25 +276,6 @@ raw_ostream &llvm::pdb::operator<<(raw_ostream &OS, const PDB_UdtType &Type) {
return OS;
}
-raw_ostream &llvm::pdb::operator<<(raw_ostream &OS, const PDB_UniqueId &Id) {
- static const char *Lookup = "0123456789ABCDEF";
-
- static_assert(sizeof(PDB_UniqueId) == 16, "Expected 16-byte GUID");
- ArrayRef<uint8_t> GuidBytes(reinterpret_cast<const uint8_t*>(&Id), 16);
- OS << "{";
- for (int i=0; i < 16;) {
- uint8_t Byte = GuidBytes[i];
- uint8_t HighNibble = (Byte >> 4) & 0xF;
- uint8_t LowNibble = Byte & 0xF;
- OS << Lookup[HighNibble] << Lookup[LowNibble];
- ++i;
- if (i>=4 && i<=10 && i%2==0)
- OS << "-";
- }
- OS << "}";
- return OS;
-}
-
raw_ostream &llvm::pdb::operator<<(raw_ostream &OS,
const PDB_Machine &Machine) {
switch (Machine) {
diff --git a/lib/DebugInfo/PDB/PDBSymbol.cpp b/lib/DebugInfo/PDB/PDBSymbol.cpp
index 633e11aacf12..74010c2dd7dd 100644
--- a/lib/DebugInfo/PDB/PDBSymbol.cpp
+++ b/lib/DebugInfo/PDB/PDBSymbol.cpp
@@ -10,6 +10,7 @@
#include "llvm/DebugInfo/PDB/PDBSymbol.h"
#include "llvm/DebugInfo/PDB/IPDBEnumChildren.h"
#include "llvm/DebugInfo/PDB/IPDBRawSymbol.h"
+#include "llvm/DebugInfo/PDB/IPDBSession.h"
#include "llvm/DebugInfo/PDB/PDBExtras.h"
#include "llvm/DebugInfo/PDB/PDBSymbolAnnotation.h"
#include "llvm/DebugInfo/PDB/PDBSymbolBlock.h"
@@ -53,6 +54,9 @@ PDBSymbol::PDBSymbol(const IPDBSession &PDBSession,
std::unique_ptr<IPDBRawSymbol> Symbol)
: Session(PDBSession), RawSymbol(std::move(Symbol)) {}
+PDBSymbol::PDBSymbol(PDBSymbol &Symbol)
+ : Session(Symbol.Session), RawSymbol(std::move(Symbol.RawSymbol)) {}
+
PDBSymbol::~PDBSymbol() = default;
#define FACTORY_SYMTAG_CASE(Tag, Type) \
@@ -99,16 +103,30 @@ PDBSymbol::create(const IPDBSession &PDBSession,
}
}
-#define TRY_DUMP_TYPE(Type) \
- if (const Type *DerivedThis = dyn_cast<Type>(this)) \
- Dumper.dump(OS, Indent, *DerivedThis);
-
-#define ELSE_TRY_DUMP_TYPE(Type, Dumper) else TRY_DUMP_TYPE(Type, Dumper)
-
void PDBSymbol::defaultDump(raw_ostream &OS, int Indent) const {
RawSymbol->dump(OS, Indent);
}
+void PDBSymbol::dumpProperties() const {
+ outs() << "\n";
+ defaultDump(outs(), 0);
+ outs().flush();
+}
+
+void PDBSymbol::dumpChildStats() const {
+ TagStats Stats;
+ getChildStats(Stats);
+ outs() << "\n";
+ for (auto &Stat : Stats) {
+ outs() << Stat.first << ": " << Stat.second << "\n";
+ }
+ outs().flush();
+}
+
+std::unique_ptr<PDBSymbol> PDBSymbol::clone() const {
+ return Session.getSymbolById(getSymIndexId());
+}
+
PDB_SymType PDBSymbol::getSymTag() const { return RawSymbol->getSymTag(); }
uint32_t PDBSymbol::getSymIndexId() const { return RawSymbol->getSymIndexId(); }
@@ -141,6 +159,8 @@ PDBSymbol::findInlineFramesByRVA(uint32_t RVA) const {
std::unique_ptr<IPDBEnumSymbols>
PDBSymbol::getChildStats(TagStats &Stats) const {
std::unique_ptr<IPDBEnumSymbols> Result(findAllChildren());
+ if (!Result)
+ return nullptr;
Stats.clear();
while (auto Child = Result->getNext()) {
++Stats[Child->getSymTag()];
@@ -148,3 +168,7 @@ PDBSymbol::getChildStats(TagStats &Stats) const {
Result->reset();
return Result;
}
+
+std::unique_ptr<PDBSymbol> PDBSymbol::getSymbolByIdHelper(uint32_t Id) const {
+ return Session.getSymbolById(Id);
+}
diff --git a/lib/DebugInfo/PDB/PDBSymbolAnnotation.cpp b/lib/DebugInfo/PDB/PDBSymbolAnnotation.cpp
index cdb167b6191c..3648272e1d0e 100644
--- a/lib/DebugInfo/PDB/PDBSymbolAnnotation.cpp
+++ b/lib/DebugInfo/PDB/PDBSymbolAnnotation.cpp
@@ -18,7 +18,9 @@ using namespace llvm::pdb;
PDBSymbolAnnotation::PDBSymbolAnnotation(const IPDBSession &PDBSession,
std::unique_ptr<IPDBRawSymbol> Symbol)
- : PDBSymbol(PDBSession, std::move(Symbol)) {}
+ : PDBSymbol(PDBSession, std::move(Symbol)) {
+ assert(RawSymbol->getSymTag() == PDB_SymType::Annotation);
+}
void PDBSymbolAnnotation::dump(PDBSymDumper &Dumper) const {
Dumper.dump(*this);
diff --git a/lib/DebugInfo/PDB/PDBSymbolBlock.cpp b/lib/DebugInfo/PDB/PDBSymbolBlock.cpp
index fd5dc9427abf..7385d3ba1489 100644
--- a/lib/DebugInfo/PDB/PDBSymbolBlock.cpp
+++ b/lib/DebugInfo/PDB/PDBSymbolBlock.cpp
@@ -19,6 +19,8 @@ using namespace llvm::pdb;
PDBSymbolBlock::PDBSymbolBlock(const IPDBSession &PDBSession,
std::unique_ptr<IPDBRawSymbol> Symbol)
- : PDBSymbol(PDBSession, std::move(Symbol)) {}
+ : PDBSymbol(PDBSession, std::move(Symbol)) {
+ assert(RawSymbol->getSymTag() == PDB_SymType::Block);
+}
void PDBSymbolBlock::dump(PDBSymDumper &Dumper) const { Dumper.dump(*this); }
diff --git a/lib/DebugInfo/PDB/PDBSymbolCompiland.cpp b/lib/DebugInfo/PDB/PDBSymbolCompiland.cpp
index ebff08846cac..854cf42d1bae 100644
--- a/lib/DebugInfo/PDB/PDBSymbolCompiland.cpp
+++ b/lib/DebugInfo/PDB/PDBSymbolCompiland.cpp
@@ -19,7 +19,9 @@ using namespace llvm::pdb;
PDBSymbolCompiland::PDBSymbolCompiland(const IPDBSession &PDBSession,
std::unique_ptr<IPDBRawSymbol> Symbol)
- : PDBSymbol(PDBSession, std::move(Symbol)) {}
+ : PDBSymbol(PDBSession, std::move(Symbol)) {
+ assert(RawSymbol->getSymTag() == PDB_SymType::Compiland);
+}
void PDBSymbolCompiland::dump(PDBSymDumper &Dumper) const {
Dumper.dump(*this);
diff --git a/lib/DebugInfo/PDB/PDBSymbolCompilandDetails.cpp b/lib/DebugInfo/PDB/PDBSymbolCompilandDetails.cpp
index 6dbd5228f2cd..e08450e0ad0c 100644
--- a/lib/DebugInfo/PDB/PDBSymbolCompilandDetails.cpp
+++ b/lib/DebugInfo/PDB/PDBSymbolCompilandDetails.cpp
@@ -19,7 +19,9 @@ using namespace llvm::pdb;
PDBSymbolCompilandDetails::PDBSymbolCompilandDetails(
const IPDBSession &PDBSession, std::unique_ptr<IPDBRawSymbol> Symbol)
- : PDBSymbol(PDBSession, std::move(Symbol)) {}
+ : PDBSymbol(PDBSession, std::move(Symbol)) {
+ assert(RawSymbol->getSymTag() == PDB_SymType::CompilandDetails);
+}
void PDBSymbolCompilandDetails::dump(PDBSymDumper &Dumper) const {
Dumper.dump(*this);
diff --git a/lib/DebugInfo/PDB/PDBSymbolCompilandEnv.cpp b/lib/DebugInfo/PDB/PDBSymbolCompilandEnv.cpp
index 9c7f0b1be56f..2f1c43666ae5 100644
--- a/lib/DebugInfo/PDB/PDBSymbolCompilandEnv.cpp
+++ b/lib/DebugInfo/PDB/PDBSymbolCompilandEnv.cpp
@@ -20,7 +20,9 @@ using namespace llvm::pdb;
PDBSymbolCompilandEnv::PDBSymbolCompilandEnv(
const IPDBSession &PDBSession, std::unique_ptr<IPDBRawSymbol> Symbol)
- : PDBSymbol(PDBSession, std::move(Symbol)) {}
+ : PDBSymbol(PDBSession, std::move(Symbol)) {
+ assert(RawSymbol->getSymTag() == PDB_SymType::CompilandEnv);
+}
std::string PDBSymbolCompilandEnv::getValue() const {
Variant Value = RawSymbol->getValue();
diff --git a/lib/DebugInfo/PDB/PDBSymbolCustom.cpp b/lib/DebugInfo/PDB/PDBSymbolCustom.cpp
index 0ea387a0eabb..9ec20bb62d75 100644
--- a/lib/DebugInfo/PDB/PDBSymbolCustom.cpp
+++ b/lib/DebugInfo/PDB/PDBSymbolCustom.cpp
@@ -20,7 +20,9 @@ using namespace llvm::pdb;
PDBSymbolCustom::PDBSymbolCustom(const IPDBSession &PDBSession,
std::unique_ptr<IPDBRawSymbol> CustomSymbol)
- : PDBSymbol(PDBSession, std::move(CustomSymbol)) {}
+ : PDBSymbol(PDBSession, std::move(CustomSymbol)) {
+ assert(RawSymbol->getSymTag() == PDB_SymType::Custom);
+}
void PDBSymbolCustom::getDataBytes(llvm::SmallVector<uint8_t, 32> &bytes) {
RawSymbol->getDataBytes(bytes);
diff --git a/lib/DebugInfo/PDB/PDBSymbolData.cpp b/lib/DebugInfo/PDB/PDBSymbolData.cpp
index 62bb6f3f41e2..60026689c6f1 100644
--- a/lib/DebugInfo/PDB/PDBSymbolData.cpp
+++ b/lib/DebugInfo/PDB/PDBSymbolData.cpp
@@ -19,10 +19,8 @@ using namespace llvm::pdb;
PDBSymbolData::PDBSymbolData(const IPDBSession &PDBSession,
std::unique_ptr<IPDBRawSymbol> DataSymbol)
- : PDBSymbol(PDBSession, std::move(DataSymbol)) {}
-
-std::unique_ptr<PDBSymbol> PDBSymbolData::getType() const {
- return Session.getSymbolById(getTypeId());
+ : PDBSymbol(PDBSession, std::move(DataSymbol)) {
+ assert(RawSymbol->getSymTag() == PDB_SymType::Data);
}
void PDBSymbolData::dump(PDBSymDumper &Dumper) const { Dumper.dump(*this); }
diff --git a/lib/DebugInfo/PDB/PDBSymbolExe.cpp b/lib/DebugInfo/PDB/PDBSymbolExe.cpp
index 60101c168a79..7417167b61ad 100644
--- a/lib/DebugInfo/PDB/PDBSymbolExe.cpp
+++ b/lib/DebugInfo/PDB/PDBSymbolExe.cpp
@@ -10,6 +10,7 @@
#include "llvm/DebugInfo/PDB/PDBSymbolExe.h"
#include "llvm/DebugInfo/PDB/PDBSymDumper.h"
+#include "llvm/DebugInfo/PDB/PDBSymbolTypePointer.h"
#include <utility>
@@ -18,6 +19,18 @@ using namespace llvm::pdb;
PDBSymbolExe::PDBSymbolExe(const IPDBSession &PDBSession,
std::unique_ptr<IPDBRawSymbol> Symbol)
- : PDBSymbol(PDBSession, std::move(Symbol)) {}
+ : PDBSymbol(PDBSession, std::move(Symbol)) {
+ assert(RawSymbol->getSymTag() == PDB_SymType::Exe);
+}
void PDBSymbolExe::dump(PDBSymDumper &Dumper) const { Dumper.dump(*this); }
+
+uint32_t PDBSymbolExe::getPointerByteSize() const {
+ auto Pointer = findOneChild<PDBSymbolTypePointer>();
+ if (Pointer)
+ return Pointer->getLength();
+
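+ // No pointer type was found; infer the pointer size from the machine type.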
+ if (getMachineType() == PDB_Machine::x86)
+ return 4;
+ return 8;
+}
diff --git a/lib/DebugInfo/PDB/PDBSymbolFunc.cpp b/lib/DebugInfo/PDB/PDBSymbolFunc.cpp
index 35251c0cc1c1..0734a1f8314a 100644
--- a/lib/DebugInfo/PDB/PDBSymbolFunc.cpp
+++ b/lib/DebugInfo/PDB/PDBSymbolFunc.cpp
@@ -85,10 +85,8 @@ private:
PDBSymbolFunc::PDBSymbolFunc(const IPDBSession &PDBSession,
std::unique_ptr<IPDBRawSymbol> Symbol)
- : PDBSymbol(PDBSession, std::move(Symbol)) {}
-
-std::unique_ptr<PDBSymbolTypeFunctionSig> PDBSymbolFunc::getSignature() const {
- return Session.getConcreteSymbolById<PDBSymbolTypeFunctionSig>(getTypeId());
+ : PDBSymbol(PDBSession, std::move(Symbol)) {
+ assert(RawSymbol->getSymTag() == PDB_SymType::Function);
}
std::unique_ptr<IPDBEnumChildren<PDBSymbolData>>
@@ -96,8 +94,15 @@ PDBSymbolFunc::getArguments() const {
return llvm::make_unique<FunctionArgEnumerator>(Session, *this);
}
-std::unique_ptr<PDBSymbolTypeUDT> PDBSymbolFunc::getClassParent() const {
- return Session.getConcreteSymbolById<PDBSymbolTypeUDT>(getClassParentId());
-}
-
void PDBSymbolFunc::dump(PDBSymDumper &Dumper) const { Dumper.dump(*this); }
+
+bool PDBSymbolFunc::isDestructor() const {
+ std::string Name = getName();
+ if (Name.empty())
+ return false;
+ if (Name[0] == '~')
+ return true;
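+ // MSVC names the vector deleting destructor __vecDelDtor.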
+ if (Name == "__vecDelDtor")
+ return true;
+ return false;
+}
diff --git a/lib/DebugInfo/PDB/PDBSymbolFuncDebugEnd.cpp b/lib/DebugInfo/PDB/PDBSymbolFuncDebugEnd.cpp
index 77e996f651df..482c95e3a850 100644
--- a/lib/DebugInfo/PDB/PDBSymbolFuncDebugEnd.cpp
+++ b/lib/DebugInfo/PDB/PDBSymbolFuncDebugEnd.cpp
@@ -19,7 +19,9 @@ using namespace llvm::pdb;
PDBSymbolFuncDebugEnd::PDBSymbolFuncDebugEnd(
const IPDBSession &PDBSession, std::unique_ptr<IPDBRawSymbol> Symbol)
- : PDBSymbol(PDBSession, std::move(Symbol)) {}
+ : PDBSymbol(PDBSession, std::move(Symbol)) {
+ assert(RawSymbol->getSymTag() == PDB_SymType::FuncDebugEnd);
+}
void PDBSymbolFuncDebugEnd::dump(PDBSymDumper &Dumper) const {
Dumper.dump(*this);
diff --git a/lib/DebugInfo/PDB/PDBSymbolFuncDebugStart.cpp b/lib/DebugInfo/PDB/PDBSymbolFuncDebugStart.cpp
index 9c653879176b..ae23c7619e2a 100644
--- a/lib/DebugInfo/PDB/PDBSymbolFuncDebugStart.cpp
+++ b/lib/DebugInfo/PDB/PDBSymbolFuncDebugStart.cpp
@@ -19,7 +19,9 @@ using namespace llvm::pdb;
PDBSymbolFuncDebugStart::PDBSymbolFuncDebugStart(
const IPDBSession &PDBSession, std::unique_ptr<IPDBRawSymbol> Symbol)
- : PDBSymbol(PDBSession, std::move(Symbol)) {}
+ : PDBSymbol(PDBSession, std::move(Symbol)) {
+ assert(RawSymbol->getSymTag() == PDB_SymType::FuncDebugStart);
+}
void PDBSymbolFuncDebugStart::dump(PDBSymDumper &Dumper) const {
Dumper.dump(*this);
diff --git a/lib/DebugInfo/PDB/PDBSymbolLabel.cpp b/lib/DebugInfo/PDB/PDBSymbolLabel.cpp
index d2cfd11c35e4..a67a20d8e352 100644
--- a/lib/DebugInfo/PDB/PDBSymbolLabel.cpp
+++ b/lib/DebugInfo/PDB/PDBSymbolLabel.cpp
@@ -18,6 +18,8 @@ using namespace llvm::pdb;
PDBSymbolLabel::PDBSymbolLabel(const IPDBSession &PDBSession,
std::unique_ptr<IPDBRawSymbol> Symbol)
- : PDBSymbol(PDBSession, std::move(Symbol)) {}
+ : PDBSymbol(PDBSession, std::move(Symbol)) {
+ assert(RawSymbol->getSymTag() == PDB_SymType::Label);
+}
void PDBSymbolLabel::dump(PDBSymDumper &Dumper) const { Dumper.dump(*this); }
diff --git a/lib/DebugInfo/PDB/PDBSymbolPublicSymbol.cpp b/lib/DebugInfo/PDB/PDBSymbolPublicSymbol.cpp
index 97d668740818..87bb4044216b 100644
--- a/lib/DebugInfo/PDB/PDBSymbolPublicSymbol.cpp
+++ b/lib/DebugInfo/PDB/PDBSymbolPublicSymbol.cpp
@@ -19,7 +19,9 @@ using namespace llvm::pdb;
PDBSymbolPublicSymbol::PDBSymbolPublicSymbol(
const IPDBSession &PDBSession, std::unique_ptr<IPDBRawSymbol> Symbol)
- : PDBSymbol(PDBSession, std::move(Symbol)) {}
+ : PDBSymbol(PDBSession, std::move(Symbol)) {
+ assert(RawSymbol->getSymTag() == PDB_SymType::PublicSymbol);
+}
void PDBSymbolPublicSymbol::dump(PDBSymDumper &Dumper) const {
Dumper.dump(*this);
diff --git a/lib/DebugInfo/PDB/PDBSymbolThunk.cpp b/lib/DebugInfo/PDB/PDBSymbolThunk.cpp
index ef8897d12af4..b2648197f9cc 100644
--- a/lib/DebugInfo/PDB/PDBSymbolThunk.cpp
+++ b/lib/DebugInfo/PDB/PDBSymbolThunk.cpp
@@ -18,6 +18,8 @@ using namespace llvm::pdb;
PDBSymbolThunk::PDBSymbolThunk(const IPDBSession &PDBSession,
std::unique_ptr<IPDBRawSymbol> Symbol)
- : PDBSymbol(PDBSession, std::move(Symbol)) {}
+ : PDBSymbol(PDBSession, std::move(Symbol)) {
+ assert(RawSymbol->getSymTag() == PDB_SymType::Thunk);
+}
void PDBSymbolThunk::dump(PDBSymDumper &Dumper) const { Dumper.dump(*this); }
diff --git a/lib/DebugInfo/PDB/PDBSymbolTypeArray.cpp b/lib/DebugInfo/PDB/PDBSymbolTypeArray.cpp
index c010cc5d7678..a8054a42d866 100644
--- a/lib/DebugInfo/PDB/PDBSymbolTypeArray.cpp
+++ b/lib/DebugInfo/PDB/PDBSymbolTypeArray.cpp
@@ -19,12 +19,14 @@ using namespace llvm::pdb;
PDBSymbolTypeArray::PDBSymbolTypeArray(const IPDBSession &PDBSession,
std::unique_ptr<IPDBRawSymbol> Symbol)
- : PDBSymbol(PDBSession, std::move(Symbol)) {}
-
-std::unique_ptr<PDBSymbol> PDBSymbolTypeArray::getElementType() const {
- return Session.getSymbolById(getTypeId());
+ : PDBSymbol(PDBSession, std::move(Symbol)) {
+ assert(RawSymbol->getSymTag() == PDB_SymType::ArrayType);
}
void PDBSymbolTypeArray::dump(PDBSymDumper &Dumper) const {
Dumper.dump(*this);
}
+
+void PDBSymbolTypeArray::dumpRight(PDBSymDumper &Dumper) const {
+ Dumper.dumpRight(*this);
+}
diff --git a/lib/DebugInfo/PDB/PDBSymbolTypeBaseClass.cpp b/lib/DebugInfo/PDB/PDBSymbolTypeBaseClass.cpp
index 382c397b24d2..0ee18d471624 100644
--- a/lib/DebugInfo/PDB/PDBSymbolTypeBaseClass.cpp
+++ b/lib/DebugInfo/PDB/PDBSymbolTypeBaseClass.cpp
@@ -19,7 +19,9 @@ using namespace llvm::pdb;
PDBSymbolTypeBaseClass::PDBSymbolTypeBaseClass(
const IPDBSession &PDBSession, std::unique_ptr<IPDBRawSymbol> Symbol)
- : PDBSymbol(PDBSession, std::move(Symbol)) {}
+ : PDBSymbol(PDBSession, std::move(Symbol)) {
+ assert(RawSymbol->getSymTag() == PDB_SymType::BaseClass);
+}
void PDBSymbolTypeBaseClass::dump(PDBSymDumper &Dumper) const {
Dumper.dump(*this);
diff --git a/lib/DebugInfo/PDB/PDBSymbolTypeBuiltin.cpp b/lib/DebugInfo/PDB/PDBSymbolTypeBuiltin.cpp
index e5d65bf5d1fd..0bf563af7df5 100644
--- a/lib/DebugInfo/PDB/PDBSymbolTypeBuiltin.cpp
+++ b/lib/DebugInfo/PDB/PDBSymbolTypeBuiltin.cpp
@@ -18,7 +18,9 @@ using namespace llvm::pdb;
PDBSymbolTypeBuiltin::PDBSymbolTypeBuiltin(
const IPDBSession &PDBSession, std::unique_ptr<IPDBRawSymbol> Symbol)
- : PDBSymbol(PDBSession, std::move(Symbol)) {}
+ : PDBSymbol(PDBSession, std::move(Symbol)) {
+ assert(RawSymbol->getSymTag() == PDB_SymType::BuiltinType);
+}
void PDBSymbolTypeBuiltin::dump(PDBSymDumper &Dumper) const {
Dumper.dump(*this);
diff --git a/lib/DebugInfo/PDB/PDBSymbolTypeCustom.cpp b/lib/DebugInfo/PDB/PDBSymbolTypeCustom.cpp
index 1d80c97f9ede..f617d8d0c2df 100644
--- a/lib/DebugInfo/PDB/PDBSymbolTypeCustom.cpp
+++ b/lib/DebugInfo/PDB/PDBSymbolTypeCustom.cpp
@@ -19,7 +19,9 @@ using namespace llvm::pdb;
PDBSymbolTypeCustom::PDBSymbolTypeCustom(const IPDBSession &PDBSession,
std::unique_ptr<IPDBRawSymbol> Symbol)
- : PDBSymbol(PDBSession, std::move(Symbol)) {}
+ : PDBSymbol(PDBSession, std::move(Symbol)) {
+ assert(RawSymbol->getSymTag() == PDB_SymType::CustomType);
+}
void PDBSymbolTypeCustom::dump(PDBSymDumper &Dumper) const {
Dumper.dump(*this);
diff --git a/lib/DebugInfo/PDB/PDBSymbolTypeDimension.cpp b/lib/DebugInfo/PDB/PDBSymbolTypeDimension.cpp
index 535d97dcd21e..68ba87c1cdf8 100644
--- a/lib/DebugInfo/PDB/PDBSymbolTypeDimension.cpp
+++ b/lib/DebugInfo/PDB/PDBSymbolTypeDimension.cpp
@@ -20,7 +20,9 @@ using namespace llvm::pdb;
PDBSymbolTypeDimension::PDBSymbolTypeDimension(
const IPDBSession &PDBSession, std::unique_ptr<IPDBRawSymbol> Symbol)
- : PDBSymbol(PDBSession, std::move(Symbol)) {}
+ : PDBSymbol(PDBSession, std::move(Symbol)) {
+ assert(RawSymbol->getSymTag() == PDB_SymType::Dimension);
+}
void PDBSymbolTypeDimension::dump(PDBSymDumper &Dumper) const {
Dumper.dump(*this);
diff --git a/lib/DebugInfo/PDB/PDBSymbolTypeEnum.cpp b/lib/DebugInfo/PDB/PDBSymbolTypeEnum.cpp
index 788f2b732aaa..2addea072c88 100644
--- a/lib/DebugInfo/PDB/PDBSymbolTypeEnum.cpp
+++ b/lib/DebugInfo/PDB/PDBSymbolTypeEnum.cpp
@@ -21,15 +21,8 @@ using namespace llvm::pdb;
PDBSymbolTypeEnum::PDBSymbolTypeEnum(const IPDBSession &PDBSession,
std::unique_ptr<IPDBRawSymbol> Symbol)
- : PDBSymbol(PDBSession, std::move(Symbol)) {}
-
-std::unique_ptr<PDBSymbolTypeUDT> PDBSymbolTypeEnum::getClassParent() const {
- return Session.getConcreteSymbolById<PDBSymbolTypeUDT>(getClassParentId());
-}
-
-std::unique_ptr<PDBSymbolTypeBuiltin>
-PDBSymbolTypeEnum::getUnderlyingType() const {
- return Session.getConcreteSymbolById<PDBSymbolTypeBuiltin>(getTypeId());
+ : PDBSymbol(PDBSession, std::move(Symbol)) {
+ assert(RawSymbol->getSymTag() == PDB_SymType::Enum);
}
void PDBSymbolTypeEnum::dump(PDBSymDumper &Dumper) const { Dumper.dump(*this); }
diff --git a/lib/DebugInfo/PDB/PDBSymbolTypeFriend.cpp b/lib/DebugInfo/PDB/PDBSymbolTypeFriend.cpp
index 5831baebb993..ec27985e91d1 100644
--- a/lib/DebugInfo/PDB/PDBSymbolTypeFriend.cpp
+++ b/lib/DebugInfo/PDB/PDBSymbolTypeFriend.cpp
@@ -19,7 +19,9 @@ using namespace llvm::pdb;
PDBSymbolTypeFriend::PDBSymbolTypeFriend(const IPDBSession &PDBSession,
std::unique_ptr<IPDBRawSymbol> Symbol)
- : PDBSymbol(PDBSession, std::move(Symbol)) {}
+ : PDBSymbol(PDBSession, std::move(Symbol)) {
+ assert(RawSymbol->getSymTag() == PDB_SymType::Friend);
+}
void PDBSymbolTypeFriend::dump(PDBSymDumper &Dumper) const {
Dumper.dump(*this);
diff --git a/lib/DebugInfo/PDB/PDBSymbolTypeFunctionArg.cpp b/lib/DebugInfo/PDB/PDBSymbolTypeFunctionArg.cpp
index c6f586db9e57..4d5cd63f6857 100644
--- a/lib/DebugInfo/PDB/PDBSymbolTypeFunctionArg.cpp
+++ b/lib/DebugInfo/PDB/PDBSymbolTypeFunctionArg.cpp
@@ -18,7 +18,9 @@ using namespace llvm::pdb;
PDBSymbolTypeFunctionArg::PDBSymbolTypeFunctionArg(
const IPDBSession &PDBSession, std::unique_ptr<IPDBRawSymbol> Symbol)
- : PDBSymbol(PDBSession, std::move(Symbol)) {}
+ : PDBSymbol(PDBSession, std::move(Symbol)) {
+ assert(RawSymbol->getSymTag() == PDB_SymType::FunctionArg);
+}
void PDBSymbolTypeFunctionArg::dump(PDBSymDumper &Dumper) const {
Dumper.dump(*this);
diff --git a/lib/DebugInfo/PDB/PDBSymbolTypeFunctionSig.cpp b/lib/DebugInfo/PDB/PDBSymbolTypeFunctionSig.cpp
index 057ae260885f..473529d1b043 100644
--- a/lib/DebugInfo/PDB/PDBSymbolTypeFunctionSig.cpp
+++ b/lib/DebugInfo/PDB/PDBSymbolTypeFunctionSig.cpp
@@ -68,10 +68,8 @@ private:
PDBSymbolTypeFunctionSig::PDBSymbolTypeFunctionSig(
const IPDBSession &PDBSession, std::unique_ptr<IPDBRawSymbol> Symbol)
- : PDBSymbol(PDBSession, std::move(Symbol)) {}
-
-std::unique_ptr<PDBSymbol> PDBSymbolTypeFunctionSig::getReturnType() const {
- return Session.getSymbolById(getTypeId());
+ : PDBSymbol(PDBSession, std::move(Symbol)) {
+ assert(RawSymbol->getSymTag() == PDB_SymType::FunctionSig);
}
std::unique_ptr<IPDBEnumSymbols>
@@ -79,13 +77,10 @@ PDBSymbolTypeFunctionSig::getArguments() const {
return llvm::make_unique<FunctionArgEnumerator>(Session, *this);
}
-std::unique_ptr<PDBSymbol> PDBSymbolTypeFunctionSig::getClassParent() const {
- uint32_t ClassId = getClassParentId();
- if (ClassId == 0)
- return nullptr;
- return Session.getSymbolById(ClassId);
-}
-
void PDBSymbolTypeFunctionSig::dump(PDBSymDumper &Dumper) const {
Dumper.dump(*this);
}
+
+void PDBSymbolTypeFunctionSig::dumpRight(PDBSymDumper &Dumper) const {
+ Dumper.dumpRight(*this);
+}
diff --git a/lib/DebugInfo/PDB/PDBSymbolTypeManaged.cpp b/lib/DebugInfo/PDB/PDBSymbolTypeManaged.cpp
index 072d2cfd42fb..86e0ec4f8565 100644
--- a/lib/DebugInfo/PDB/PDBSymbolTypeManaged.cpp
+++ b/lib/DebugInfo/PDB/PDBSymbolTypeManaged.cpp
@@ -19,7 +19,9 @@ using namespace llvm::pdb;
PDBSymbolTypeManaged::PDBSymbolTypeManaged(
const IPDBSession &PDBSession, std::unique_ptr<IPDBRawSymbol> Symbol)
- : PDBSymbol(PDBSession, std::move(Symbol)) {}
+ : PDBSymbol(PDBSession, std::move(Symbol)) {
+ assert(RawSymbol->getSymTag() == PDB_SymType::ManagedType);
+}
void PDBSymbolTypeManaged::dump(PDBSymDumper &Dumper) const {
Dumper.dump(*this);
diff --git a/lib/DebugInfo/PDB/PDBSymbolTypePointer.cpp b/lib/DebugInfo/PDB/PDBSymbolTypePointer.cpp
index 699771450a5d..69819811d61f 100644
--- a/lib/DebugInfo/PDB/PDBSymbolTypePointer.cpp
+++ b/lib/DebugInfo/PDB/PDBSymbolTypePointer.cpp
@@ -19,12 +19,14 @@ using namespace llvm::pdb;
PDBSymbolTypePointer::PDBSymbolTypePointer(
const IPDBSession &PDBSession, std::unique_ptr<IPDBRawSymbol> Symbol)
- : PDBSymbol(PDBSession, std::move(Symbol)) {}
-
-std::unique_ptr<PDBSymbol> PDBSymbolTypePointer::getPointeeType() const {
- return Session.getSymbolById(getTypeId());
+ : PDBSymbol(PDBSession, std::move(Symbol)) {
+ assert(RawSymbol->getSymTag() == PDB_SymType::PointerType);
}
void PDBSymbolTypePointer::dump(PDBSymDumper &Dumper) const {
Dumper.dump(*this);
}
+
+void PDBSymbolTypePointer::dumpRight(PDBSymDumper &Dumper) const {
+ Dumper.dumpRight(*this);
+}
diff --git a/lib/DebugInfo/PDB/PDBSymbolTypeTypedef.cpp b/lib/DebugInfo/PDB/PDBSymbolTypeTypedef.cpp
index 0f283b9e21a4..102b540e0fef 100644
--- a/lib/DebugInfo/PDB/PDBSymbolTypeTypedef.cpp
+++ b/lib/DebugInfo/PDB/PDBSymbolTypeTypedef.cpp
@@ -18,7 +18,9 @@ using namespace llvm::pdb;
PDBSymbolTypeTypedef::PDBSymbolTypeTypedef(
const IPDBSession &PDBSession, std::unique_ptr<IPDBRawSymbol> Symbol)
- : PDBSymbol(PDBSession, std::move(Symbol)) {}
+ : PDBSymbol(PDBSession, std::move(Symbol)) {
+ assert(RawSymbol->getSymTag() == PDB_SymType::Typedef);
+}
void PDBSymbolTypeTypedef::dump(PDBSymDumper &Dumper) const {
Dumper.dump(*this);
diff --git a/lib/DebugInfo/PDB/PDBSymbolTypeUDT.cpp b/lib/DebugInfo/PDB/PDBSymbolTypeUDT.cpp
index c71838cc7a6f..15dc15352165 100644
--- a/lib/DebugInfo/PDB/PDBSymbolTypeUDT.cpp
+++ b/lib/DebugInfo/PDB/PDBSymbolTypeUDT.cpp
@@ -9,7 +9,15 @@
#include "llvm/DebugInfo/PDB/PDBSymbolTypeUDT.h"
+#include "llvm/DebugInfo/PDB/IPDBSession.h"
#include "llvm/DebugInfo/PDB/PDBSymDumper.h"
+#include "llvm/DebugInfo/PDB/PDBSymbol.h"
+#include "llvm/DebugInfo/PDB/PDBSymbolData.h"
+#include "llvm/DebugInfo/PDB/PDBSymbolExe.h"
+#include "llvm/DebugInfo/PDB/PDBSymbolTypeBaseClass.h"
+#include "llvm/DebugInfo/PDB/PDBSymbolTypeVTable.h"
+#include "llvm/DebugInfo/PDB/PDBSymbolTypeVTableShape.h"
+#include "llvm/DebugInfo/PDB/UDTLayout.h"
#include <utility>
@@ -18,6 +26,8 @@ using namespace llvm::pdb;
PDBSymbolTypeUDT::PDBSymbolTypeUDT(const IPDBSession &PDBSession,
std::unique_ptr<IPDBRawSymbol> Symbol)
- : PDBSymbol(PDBSession, std::move(Symbol)) {}
+ : PDBSymbol(PDBSession, std::move(Symbol)) {
+ assert(RawSymbol->getSymTag() == PDB_SymType::UDT);
+}
void PDBSymbolTypeUDT::dump(PDBSymDumper &Dumper) const { Dumper.dump(*this); }
diff --git a/lib/DebugInfo/PDB/PDBSymbolTypeVTable.cpp b/lib/DebugInfo/PDB/PDBSymbolTypeVTable.cpp
index 6b76db5912ce..9a21855f57f0 100644
--- a/lib/DebugInfo/PDB/PDBSymbolTypeVTable.cpp
+++ b/lib/DebugInfo/PDB/PDBSymbolTypeVTable.cpp
@@ -18,7 +18,9 @@ using namespace llvm::pdb;
PDBSymbolTypeVTable::PDBSymbolTypeVTable(const IPDBSession &PDBSession,
std::unique_ptr<IPDBRawSymbol> Symbol)
- : PDBSymbol(PDBSession, std::move(Symbol)) {}
+ : PDBSymbol(PDBSession, std::move(Symbol)) {
+ assert(RawSymbol->getSymTag() == PDB_SymType::VTable);
+}
void PDBSymbolTypeVTable::dump(PDBSymDumper &Dumper) const {
Dumper.dump(*this);
diff --git a/lib/DebugInfo/PDB/PDBSymbolTypeVTableShape.cpp b/lib/DebugInfo/PDB/PDBSymbolTypeVTableShape.cpp
index ef509d64bf60..a516a4d2c429 100644
--- a/lib/DebugInfo/PDB/PDBSymbolTypeVTableShape.cpp
+++ b/lib/DebugInfo/PDB/PDBSymbolTypeVTableShape.cpp
@@ -19,7 +19,9 @@ using namespace llvm::pdb;
PDBSymbolTypeVTableShape::PDBSymbolTypeVTableShape(
const IPDBSession &PDBSession, std::unique_ptr<IPDBRawSymbol> Symbol)
- : PDBSymbol(PDBSession, std::move(Symbol)) {}
+ : PDBSymbol(PDBSession, std::move(Symbol)) {
+ assert(RawSymbol->getSymTag() == PDB_SymType::VTableShape);
+}
void PDBSymbolTypeVTableShape::dump(PDBSymDumper &Dumper) const {
Dumper.dump(*this);
diff --git a/lib/DebugInfo/PDB/PDBSymbolUsingNamespace.cpp b/lib/DebugInfo/PDB/PDBSymbolUsingNamespace.cpp
index 6a62d554f42c..020aec9e98a8 100644
--- a/lib/DebugInfo/PDB/PDBSymbolUsingNamespace.cpp
+++ b/lib/DebugInfo/PDB/PDBSymbolUsingNamespace.cpp
@@ -19,7 +19,9 @@ using namespace llvm::pdb;
PDBSymbolUsingNamespace::PDBSymbolUsingNamespace(
const IPDBSession &PDBSession, std::unique_ptr<IPDBRawSymbol> Symbol)
- : PDBSymbol(PDBSession, std::move(Symbol)) {}
+ : PDBSymbol(PDBSession, std::move(Symbol)) {
+ assert(RawSymbol->getSymTag() == PDB_SymType::UsingNamespace);
+}
void PDBSymbolUsingNamespace::dump(PDBSymDumper &Dumper) const {
Dumper.dump(*this);
diff --git a/lib/DebugInfo/PDB/Raw/InfoStream.cpp b/lib/DebugInfo/PDB/Raw/InfoStream.cpp
deleted file mode 100644
index f19535d11806..000000000000
--- a/lib/DebugInfo/PDB/Raw/InfoStream.cpp
+++ /dev/null
@@ -1,77 +0,0 @@
-//===- InfoStream.cpp - PDB Info Stream (Stream 1) Access -------*- C++ -*-===//
-//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
-//
-//===----------------------------------------------------------------------===//
-
-#include "llvm/DebugInfo/PDB/Raw/InfoStream.h"
-#include "llvm/ADT/BitVector.h"
-#include "llvm/ADT/SmallVector.h"
-#include "llvm/DebugInfo/MSF/StreamReader.h"
-#include "llvm/DebugInfo/MSF/StreamWriter.h"
-#include "llvm/DebugInfo/PDB/Raw/PDBFile.h"
-#include "llvm/DebugInfo/PDB/Raw/RawConstants.h"
-#include "llvm/DebugInfo/PDB/Raw/RawError.h"
-#include "llvm/DebugInfo/PDB/Raw/RawTypes.h"
-
-using namespace llvm;
-using namespace llvm::codeview;
-using namespace llvm::msf;
-using namespace llvm::pdb;
-
-InfoStream::InfoStream(std::unique_ptr<MappedBlockStream> Stream)
- : Stream(std::move(Stream)) {}
-
-Error InfoStream::reload() {
- StreamReader Reader(*Stream);
-
- const InfoStreamHeader *H;
- if (auto EC = Reader.readObject(H))
- return joinErrors(
- std::move(EC),
- make_error<RawError>(raw_error_code::corrupt_file,
- "PDB Stream does not contain a header."));
-
- switch (H->Version) {
- case PdbImplVC70:
- case PdbImplVC80:
- case PdbImplVC110:
- case PdbImplVC140:
- break;
- default:
- return make_error<RawError>(raw_error_code::corrupt_file,
- "Unsupported PDB stream version.");
- }
-
- Version = H->Version;
- Signature = H->Signature;
- Age = H->Age;
- Guid = H->Guid;
-
- return NamedStreams.load(Reader);
-}
-
-uint32_t InfoStream::getNamedStreamIndex(llvm::StringRef Name) const {
- uint32_t Result;
- if (!NamedStreams.tryGetValue(Name, Result))
- return 0;
- return Result;
-}
-
-iterator_range<StringMapConstIterator<uint32_t>>
-InfoStream::named_streams() const {
- return NamedStreams.entries();
-}
-
-PdbRaw_ImplVer InfoStream::getVersion() const {
- return static_cast<PdbRaw_ImplVer>(Version);
-}
-
-uint32_t InfoStream::getSignature() const { return Signature; }
-
-uint32_t InfoStream::getAge() const { return Age; }
-
-PDB_UniqueId InfoStream::getGuid() const { return Guid; }
diff --git a/lib/DebugInfo/PDB/Raw/NameMap.cpp b/lib/DebugInfo/PDB/Raw/NameMap.cpp
deleted file mode 100644
index 0f55f58da381..000000000000
--- a/lib/DebugInfo/PDB/Raw/NameMap.cpp
+++ /dev/null
@@ -1,163 +0,0 @@
-//===- NameMap.cpp - PDB Name Map -------------------------------*- C++ -*-===//
-//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
-//
-//===----------------------------------------------------------------------===//
-
-#include "llvm/ADT/SparseBitVector.h"
-#include "llvm/ADT/StringMap.h"
-#include "llvm/ADT/StringRef.h"
-#include "llvm/ADT/iterator_range.h"
-#include "llvm/DebugInfo/MSF/StreamReader.h"
-#include "llvm/DebugInfo/PDB/Raw/NameMap.h"
-#include "llvm/DebugInfo/PDB/Raw/RawError.h"
-#include "llvm/Support/Error.h"
-#include <algorithm>
-#include <cstdint>
-
-using namespace llvm;
-using namespace llvm::msf;
-using namespace llvm::pdb;
-
-NameMap::NameMap() = default;
-
-Error NameMap::load(StreamReader &Stream) {
- // This is some sort of weird string-set/hash table encoded in the stream.
- // It starts with the number of bytes in the table.
- uint32_t NumberOfBytes;
- if (auto EC = Stream.readInteger(NumberOfBytes))
- return joinErrors(std::move(EC),
- make_error<RawError>(raw_error_code::corrupt_file,
- "Expected name map length"));
- if (Stream.bytesRemaining() < NumberOfBytes)
- return make_error<RawError>(raw_error_code::corrupt_file,
- "Invalid name map length");
-
- // Following that field is the starting offset of strings in the name table.
- uint32_t StringsOffset = Stream.getOffset();
- Stream.setOffset(StringsOffset + NumberOfBytes);
-
- // This appears to be equivalent to the total number of strings *actually*
- // in the name table.
- uint32_t HashSize;
- if (auto EC = Stream.readInteger(HashSize))
- return joinErrors(std::move(EC),
- make_error<RawError>(raw_error_code::corrupt_file,
- "Expected name map hash size"));
-
- // This appears to be an upper bound on the number of strings in the name
- // table.
- uint32_t MaxNumberOfStrings;
- if (auto EC = Stream.readInteger(MaxNumberOfStrings))
- return joinErrors(std::move(EC),
- make_error<RawError>(raw_error_code::corrupt_file,
- "Expected name map max strings"));
-
- if (MaxNumberOfStrings > (UINT32_MAX / sizeof(uint32_t)))
- return make_error<RawError>(raw_error_code::corrupt_file,
- "Implausible number of strings");
-
- const uint32_t MaxNumberOfWords = UINT32_MAX / (sizeof(uint32_t) * 8);
-
- // This appears to be a hash table which uses bitfields to determine whether
- // or not a bucket is 'present'.
- uint32_t NumPresentWords;
- if (auto EC = Stream.readInteger(NumPresentWords))
- return joinErrors(std::move(EC),
- make_error<RawError>(raw_error_code::corrupt_file,
- "Expected name map num words"));
-
- if (NumPresentWords > MaxNumberOfWords)
- return make_error<RawError>(raw_error_code::corrupt_file,
- "Number of present words is too large");
-
- SparseBitVector<> Present;
- for (uint32_t I = 0; I != NumPresentWords; ++I) {
- uint32_t Word;
- if (auto EC = Stream.readInteger(Word))
- return joinErrors(std::move(EC),
- make_error<RawError>(raw_error_code::corrupt_file,
- "Expected name map word"));
- for (unsigned Idx = 0; Idx < 32; ++Idx)
- if (Word & (1U << Idx))
- Present.set((I * 32) + Idx);
- }
-
- // This appears to be a hash table which uses bitfields to determine whether
- // or not a bucket is 'deleted'.
- uint32_t NumDeletedWords;
- if (auto EC = Stream.readInteger(NumDeletedWords))
- return joinErrors(
- std::move(EC),
- make_error<RawError>(raw_error_code::corrupt_file,
- "Expected name map num deleted words"));
-
- if (NumDeletedWords > MaxNumberOfWords)
- return make_error<RawError>(raw_error_code::corrupt_file,
- "Number of deleted words is too large");
-
- SparseBitVector<> Deleted;
- for (uint32_t I = 0; I != NumDeletedWords; ++I) {
- uint32_t Word;
- if (auto EC = Stream.readInteger(Word))
- return joinErrors(std::move(EC),
- make_error<RawError>(raw_error_code::corrupt_file,
- "Expected name map word"));
- for (unsigned Idx = 0; Idx < 32; ++Idx)
- if (Word & (1U << Idx))
- Deleted.set((I * 32) + Idx);
- }
-
- for (unsigned I : Present) {
- // For all present entries, dump out their mapping.
- (void)I;
-
- // This appears to be an offset relative to the start of the strings.
- // It tells us where the null-terminated string begins.
- uint32_t NameOffset;
- if (auto EC = Stream.readInteger(NameOffset))
- return joinErrors(std::move(EC),
- make_error<RawError>(raw_error_code::corrupt_file,
- "Expected name map name offset"));
-
- // This appears to be a stream number into the stream directory.
- uint32_t NameIndex;
- if (auto EC = Stream.readInteger(NameIndex))
- return joinErrors(std::move(EC),
- make_error<RawError>(raw_error_code::corrupt_file,
- "Expected name map name index"));
-
- // Compute the offset of the start of the string relative to the stream.
- uint32_t StringOffset = StringsOffset + NameOffset;
- uint32_t OldOffset = Stream.getOffset();
- // Pump out our c-string from the stream.
- StringRef Str;
- Stream.setOffset(StringOffset);
- if (auto EC = Stream.readZeroString(Str))
- return joinErrors(std::move(EC),
- make_error<RawError>(raw_error_code::corrupt_file,
- "Expected name map name"));
-
- Stream.setOffset(OldOffset);
- // Add this to a string-map from name to stream number.
- Mapping.insert({Str, NameIndex});
- }
-
- return Error::success();
-}
-
-iterator_range<StringMapConstIterator<uint32_t>> NameMap::entries() const {
- return make_range<StringMapConstIterator<uint32_t>>(Mapping.begin(),
- Mapping.end());
-}
-
-bool NameMap::tryGetValue(StringRef Name, uint32_t &Value) const {
- auto Iter = Mapping.find(Name);
- if (Iter == Mapping.end())
- return false;
- Value = Iter->second;
- return true;
-}
diff --git a/lib/DebugInfo/PDB/Raw/NameMapBuilder.cpp b/lib/DebugInfo/PDB/Raw/NameMapBuilder.cpp
deleted file mode 100644
index f570d5931b0f..000000000000
--- a/lib/DebugInfo/PDB/Raw/NameMapBuilder.cpp
+++ /dev/null
@@ -1,108 +0,0 @@
-//===- NameMapBuilder.cpp - PDB Name Map Builder ----------------*- C++ -*-===//
-//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
-//
-//===----------------------------------------------------------------------===//
-
-#include "llvm/ADT/STLExtras.h"
-#include "llvm/DebugInfo/MSF/StreamWriter.h"
-#include "llvm/DebugInfo/PDB/Raw/NameMap.h"
-#include "llvm/DebugInfo/PDB/Raw/NameMapBuilder.h"
-#include "llvm/Support/Endian.h"
-#include "llvm/Support/Error.h"
-#include <algorithm>
-#include <cstdint>
-
-using namespace llvm;
-using namespace llvm::pdb;
-
-NameMapBuilder::NameMapBuilder() = default;
-
-void NameMapBuilder::addMapping(StringRef Name, uint32_t Mapping) {
- StringDataBytes += Name.size() + 1;
- Map.insert({Name, Mapping});
-}
-
-Expected<std::unique_ptr<NameMap>> NameMapBuilder::build() {
- auto Result = llvm::make_unique<NameMap>();
- Result->Mapping = Map;
- return std::move(Result);
-}
-
-uint32_t NameMapBuilder::calculateSerializedLength() const {
- uint32_t TotalLength = 0;
-
- TotalLength += sizeof(support::ulittle32_t); // StringDataBytes value
- TotalLength += StringDataBytes; // actual string data
-
- TotalLength += sizeof(support::ulittle32_t); // Hash Size
- TotalLength += sizeof(support::ulittle32_t); // Max Number of Strings
- TotalLength += sizeof(support::ulittle32_t); // Num Present Words
- // One bitmask word for each present entry
- TotalLength += Map.size() * sizeof(support::ulittle32_t);
- TotalLength += sizeof(support::ulittle32_t); // Num Deleted Words
-
- // For each present word, which we are treating as equivalent to the number of
- // entries in the table, we have a pair of integers. An offset into the
- // string data, and a corresponding stream number.
- TotalLength += Map.size() * 2 * sizeof(support::ulittle32_t);
-
- return TotalLength;
-}
-
-Error NameMapBuilder::commit(msf::StreamWriter &Writer) const {
- // The first field is the number of bytes of string data. So add
- // up the length of all strings plus a null terminator for each
- // one.
- uint32_t NumBytes = 0;
- for (auto B = Map.begin(), E = Map.end(); B != E; ++B) {
- NumBytes += B->getKeyLength() + 1;
- }
-
- if (auto EC = Writer.writeInteger(NumBytes)) // Number of bytes of string data
- return EC;
- // Now all of the string data itself.
- for (auto B = Map.begin(), E = Map.end(); B != E; ++B) {
- if (auto EC = Writer.writeZeroString(B->getKey()))
- return EC;
- }
-
- if (auto EC = Writer.writeInteger(Map.size())) // Hash Size
- return EC;
-
- if (auto EC = Writer.writeInteger(Map.size())) // Max Number of Strings
- return EC;
-
- if (auto EC = Writer.writeInteger(Map.size())) // Num Present Words
- return EC;
-
- // For each entry in the mapping, write a bit mask which represents a bucket
- // to store it in. We don't use this, so the value we write isn't important
- // to us, it just has to be there.
- for (auto B = Map.begin(), E = Map.end(); B != E; ++B) {
- if (auto EC = Writer.writeInteger(1U))
- return EC;
- }
-
- if (auto EC = Writer.writeInteger(0U)) // Num Deleted Words
- return EC;
-
- // Mappings of each word.
- uint32_t OffsetSoFar = 0;
- for (auto B = Map.begin(), E = Map.end(); B != E; ++B) {
- // This is a list of key value pairs where the key is the offset into the
- // strings buffer, and the value is a stream number. Write each pair.
- if (auto EC = Writer.writeInteger(OffsetSoFar))
- return EC;
-
- if (auto EC = Writer.writeInteger(B->second))
- return EC;
-
- OffsetSoFar += B->getKeyLength() + 1;
- }
-
- return Error::success();
-}
diff --git a/lib/DebugInfo/PDB/Raw/RawSession.cpp b/lib/DebugInfo/PDB/Raw/RawSession.cpp
deleted file mode 100644
index cd3a2064c717..000000000000
--- a/lib/DebugInfo/PDB/Raw/RawSession.cpp
+++ /dev/null
@@ -1,136 +0,0 @@
-//===- RawSession.cpp - Raw implementation of IPDBSession -------*- C++ -*-===//
-//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
-//
-//===----------------------------------------------------------------------===//
-
-#include "llvm/ADT/STLExtras.h"
-#include "llvm/DebugInfo/MSF/ByteStream.h"
-#include "llvm/DebugInfo/PDB/GenericError.h"
-#include "llvm/DebugInfo/PDB/IPDBEnumChildren.h"
-#include "llvm/DebugInfo/PDB/IPDBSourceFile.h"
-#include "llvm/DebugInfo/PDB/PDBSymbolCompiland.h"
-#include "llvm/DebugInfo/PDB/PDBSymbolExe.h"
-#include "llvm/DebugInfo/PDB/Raw/PDBFile.h"
-#include "llvm/DebugInfo/PDB/Raw/RawError.h"
-#include "llvm/DebugInfo/PDB/Raw/RawSession.h"
-#include "llvm/Support/Allocator.h"
-#include "llvm/Support/Error.h"
-#include "llvm/Support/ErrorOr.h"
-#include "llvm/Support/MemoryBuffer.h"
-#include <algorithm>
-#include <memory>
-
-using namespace llvm;
-using namespace llvm::msf;
-using namespace llvm::pdb;
-
-RawSession::RawSession(std::unique_ptr<PDBFile> PdbFile,
- std::unique_ptr<BumpPtrAllocator> Allocator)
- : Pdb(std::move(PdbFile)), Allocator(std::move(Allocator)) {}
-
-RawSession::~RawSession() = default;
-
-Error RawSession::createFromPdb(StringRef Path,
- std::unique_ptr<IPDBSession> &Session) {
- ErrorOr<std::unique_ptr<MemoryBuffer>> ErrorOrBuffer =
- MemoryBuffer::getFileOrSTDIN(Path, /*FileSize=*/-1,
- /*RequiresNullTerminator=*/false);
- if (!ErrorOrBuffer)
- return make_error<GenericError>(generic_error_code::invalid_path);
-
- std::unique_ptr<MemoryBuffer> Buffer = std::move(*ErrorOrBuffer);
- auto Stream = llvm::make_unique<MemoryBufferByteStream>(std::move(Buffer));
-
- auto Allocator = llvm::make_unique<BumpPtrAllocator>();
- auto File = llvm::make_unique<PDBFile>(std::move(Stream), *Allocator);
- if (auto EC = File->parseFileHeaders())
- return EC;
- if (auto EC = File->parseStreamData())
- return EC;
-
- Session =
- llvm::make_unique<RawSession>(std::move(File), std::move(Allocator));
-
- return Error::success();
-}
-
-Error RawSession::createFromExe(StringRef Path,
- std::unique_ptr<IPDBSession> &Session) {
- return make_error<RawError>(raw_error_code::feature_unsupported);
-}
-
-uint64_t RawSession::getLoadAddress() const { return 0; }
-
-void RawSession::setLoadAddress(uint64_t Address) {}
-
-std::unique_ptr<PDBSymbolExe> RawSession::getGlobalScope() const {
- return nullptr;
-}
-
-std::unique_ptr<PDBSymbol> RawSession::getSymbolById(uint32_t SymbolId) const {
- return nullptr;
-}
-
-std::unique_ptr<PDBSymbol>
-RawSession::findSymbolByAddress(uint64_t Address, PDB_SymType Type) const {
- return nullptr;
-}
-
-std::unique_ptr<IPDBEnumLineNumbers>
-RawSession::findLineNumbers(const PDBSymbolCompiland &Compiland,
- const IPDBSourceFile &File) const {
- return nullptr;
-}
-
-std::unique_ptr<IPDBEnumLineNumbers>
-RawSession::findLineNumbersByAddress(uint64_t Address, uint32_t Length) const {
- return nullptr;
-}
-
-std::unique_ptr<IPDBEnumSourceFiles>
-RawSession::findSourceFiles(const PDBSymbolCompiland *Compiland,
- StringRef Pattern,
- PDB_NameSearchFlags Flags) const {
- return nullptr;
-}
-
-std::unique_ptr<IPDBSourceFile>
-RawSession::findOneSourceFile(const PDBSymbolCompiland *Compiland,
- StringRef Pattern,
- PDB_NameSearchFlags Flags) const {
- return nullptr;
-}
-
-std::unique_ptr<IPDBEnumChildren<PDBSymbolCompiland>>
-RawSession::findCompilandsForSourceFile(StringRef Pattern,
- PDB_NameSearchFlags Flags) const {
- return nullptr;
-}
-
-std::unique_ptr<PDBSymbolCompiland>
-RawSession::findOneCompilandForSourceFile(StringRef Pattern,
- PDB_NameSearchFlags Flags) const {
- return nullptr;
-}
-
-std::unique_ptr<IPDBEnumSourceFiles> RawSession::getAllSourceFiles() const {
- return nullptr;
-}
-
-std::unique_ptr<IPDBEnumSourceFiles> RawSession::getSourceFilesForCompiland(
- const PDBSymbolCompiland &Compiland) const {
- return nullptr;
-}
-
-std::unique_ptr<IPDBSourceFile>
-RawSession::getSourceFileById(uint32_t FileId) const {
- return nullptr;
-}
-
-std::unique_ptr<IPDBEnumDataStreams> RawSession::getDebugStreams() const {
- return nullptr;
-}
diff --git a/lib/DebugInfo/PDB/UDTLayout.cpp b/lib/DebugInfo/PDB/UDTLayout.cpp
new file mode 100644
index 000000000000..61cef093d4ce
--- /dev/null
+++ b/lib/DebugInfo/PDB/UDTLayout.cpp
@@ -0,0 +1,335 @@
+//===- UDTLayout.cpp --------------------------------------------*- C++ -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/DebugInfo/PDB/UDTLayout.h"
+
+#include "llvm/ADT/STLExtras.h"
+#include "llvm/DebugInfo/PDB/IPDBSession.h"
+#include "llvm/DebugInfo/PDB/PDBSymbol.h"
+#include "llvm/DebugInfo/PDB/PDBSymbolData.h"
+#include "llvm/DebugInfo/PDB/PDBSymbolExe.h"
+#include "llvm/DebugInfo/PDB/PDBSymbolFunc.h"
+#include "llvm/DebugInfo/PDB/PDBSymbolTypeBaseClass.h"
+#include "llvm/DebugInfo/PDB/PDBSymbolTypePointer.h"
+#include "llvm/DebugInfo/PDB/PDBSymbolTypeUDT.h"
+#include "llvm/DebugInfo/PDB/PDBSymbolTypeVTable.h"
+
+#include <utility>
+
+using namespace llvm;
+using namespace llvm::pdb;
+
+static std::unique_ptr<PDBSymbol> getSymbolType(const PDBSymbol &Symbol) {
+ const IPDBSession &Session = Symbol.getSession();
+ const IPDBRawSymbol &RawSymbol = Symbol.getRawSymbol();
+ uint32_t TypeId = RawSymbol.getTypeId();
+ return Session.getSymbolById(TypeId);
+}
+
+static uint32_t getTypeLength(const PDBSymbol &Symbol) {
+ auto SymbolType = getSymbolType(Symbol);
+ const IPDBRawSymbol &RawType = SymbolType->getRawSymbol();
+
+ return RawType.getLength();
+}
+
+StorageItemBase::StorageItemBase(const UDTLayoutBase &Parent,
+ const PDBSymbol &Symbol,
+ const std::string &Name,
+ uint32_t OffsetInParent, uint32_t Size)
+ : Parent(Parent), Symbol(Symbol), Name(Name),
+ OffsetInParent(OffsetInParent), SizeOf(Size) {
+ UsedBytes.resize(SizeOf, true);
+}
+
+uint32_t StorageItemBase::deepPaddingSize() const {
+ // sizeof(Field) - sizeof(typeof(Field)) is trailing padding.
+ return SizeOf - getTypeLength(Symbol);
+}
+
+DataMemberLayoutItem::DataMemberLayoutItem(
+ const UDTLayoutBase &Parent, std::unique_ptr<PDBSymbolData> DataMember)
+ : StorageItemBase(Parent, *DataMember, DataMember->getName(),
+ DataMember->getOffset(), getTypeLength(*DataMember)),
+ DataMember(std::move(DataMember)) {
+ auto Type = this->DataMember->getType();
+ if (auto UDT = unique_dyn_cast<PDBSymbolTypeUDT>(Type)) {
+ // UDT data members might have padding in between fields, but otherwise
+ // a member should occupy its entire storage.
+ UsedBytes.resize(SizeOf, false);
+ UdtLayout = llvm::make_unique<ClassLayout>(std::move(UDT));
+ }
+}
+
+const PDBSymbolData &DataMemberLayoutItem::getDataMember() {
+  return *cast<PDBSymbolData>(&Symbol);
+}
+
+bool DataMemberLayoutItem::hasUDTLayout() const { return UdtLayout != nullptr; }
+
+const ClassLayout &DataMemberLayoutItem::getUDTLayout() const {
+ return *UdtLayout;
+}
+
+uint32_t DataMemberLayoutItem::deepPaddingSize() const {
+ uint32_t Result = StorageItemBase::deepPaddingSize();
+ if (UdtLayout)
+ Result += UdtLayout->deepPaddingSize();
+ return Result;
+}
+
+VTableLayoutItem::VTableLayoutItem(const UDTLayoutBase &Parent,
+ std::unique_ptr<PDBSymbolTypeVTable> VTable)
+ : StorageItemBase(Parent, *VTable, "<vtbl>", 0, getTypeLength(*VTable)),
+ VTable(std::move(VTable)) {
+ auto VTableType = cast<PDBSymbolTypePointer>(this->VTable->getType());
+ ElementSize = VTableType->getLength();
+
+ Shape =
+ unique_dyn_cast<PDBSymbolTypeVTableShape>(VTableType->getPointeeType());
+ if (Shape)
+ VTableFuncs.resize(Shape->getCount());
+}
+
+UDTLayoutBase::UDTLayoutBase(const PDBSymbol &Symbol, const std::string &Name,
+ uint32_t Size)
+ : SymbolBase(Symbol), Name(Name), SizeOf(Size) {
+ UsedBytes.resize(Size);
+ ChildrenPerByte.resize(Size);
+ initializeChildren(Symbol);
+}
+
+ClassLayout::ClassLayout(const PDBSymbolTypeUDT &UDT)
+ : UDTLayoutBase(UDT, UDT.getName(), UDT.getLength()), UDT(UDT) {}
+
+ClassLayout::ClassLayout(std::unique_ptr<PDBSymbolTypeUDT> UDT)
+ : ClassLayout(*UDT) {
+ OwnedStorage = std::move(UDT);
+}
+
+BaseClassLayout::BaseClassLayout(const UDTLayoutBase &Parent,
+ std::unique_ptr<PDBSymbolTypeBaseClass> Base)
+ : UDTLayoutBase(*Base, Base->getName(), Base->getLength()),
+ StorageItemBase(Parent, *Base, Base->getName(), Base->getOffset(),
+ Base->getLength()),
+ Base(std::move(Base)) {
+ IsVirtualBase = this->Base->isVirtualBaseClass();
+}
+
+uint32_t UDTLayoutBase::shallowPaddingSize() const {
+ return UsedBytes.size() - UsedBytes.count();
+}
+
+uint32_t UDTLayoutBase::deepPaddingSize() const {
+ uint32_t Result = shallowPaddingSize();
+ for (auto &Child : ChildStorage)
+ Result += Child->deepPaddingSize();
+ return Result;
+}
+
+void UDTLayoutBase::initializeChildren(const PDBSymbol &Sym) {
+  // Handle bases first, then VTables, then data members, then functions,
+  // then everything else.  This ordering is necessary
+ // so that bases and vtables get initialized before any functions which
+ // may override them.
+
+ UniquePtrVector<PDBSymbolTypeBaseClass> Bases;
+ UniquePtrVector<PDBSymbolTypeVTable> VTables;
+ UniquePtrVector<PDBSymbolData> Members;
+ auto Children = Sym.findAllChildren();
+ while (auto Child = Children->getNext()) {
+ if (auto Base = unique_dyn_cast<PDBSymbolTypeBaseClass>(Child)) {
+ if (Base->isVirtualBaseClass())
+ VirtualBases.push_back(std::move(Base));
+ else
+ Bases.push_back(std::move(Base));
+ }
+
+ else if (auto Data = unique_dyn_cast<PDBSymbolData>(Child)) {
+ if (Data->getDataKind() == PDB_DataKind::Member)
+ Members.push_back(std::move(Data));
+ else
+ Other.push_back(std::move(Child));
+ } else if (auto VT = unique_dyn_cast<PDBSymbolTypeVTable>(Child))
+ VTables.push_back(std::move(VT));
+ else if (auto Func = unique_dyn_cast<PDBSymbolFunc>(Child))
+ Funcs.push_back(std::move(Func));
+ else
+ Other.push_back(std::move(Child));
+ }
+
+ for (auto &Base : Bases) {
+ auto BL = llvm::make_unique<BaseClassLayout>(*this, std::move(Base));
+ BaseClasses.push_back(BL.get());
+
+ addChildToLayout(std::move(BL));
+ }
+
+ for (auto &VT : VTables) {
+ auto VTLayout = llvm::make_unique<VTableLayoutItem>(*this, std::move(VT));
+
+ VTable = VTLayout.get();
+
+ addChildToLayout(std::move(VTLayout));
+ continue;
+ }
+
+ for (auto &Data : Members) {
+ auto DM = llvm::make_unique<DataMemberLayoutItem>(*this, std::move(Data));
+
+ addChildToLayout(std::move(DM));
+ }
+
+ for (auto &Func : Funcs) {
+ if (!Func->isVirtual())
+ continue;
+
+ if (Func->isIntroVirtualFunction())
+ addVirtualIntro(*Func);
+ else
+ addVirtualOverride(*Func);
+ }
+}
+
+void UDTLayoutBase::addVirtualIntro(PDBSymbolFunc &Func) {
+ // Kind of a hack, but we prefer the more common destructor name that people
+  // are familiar with, e.g. ~ClassName.  It seems there are always both, and
+ // the vector deleting destructor overwrites the nice destructor, so just
+ // ignore the vector deleting destructor.
+ if (Func.getName() == "__vecDelDtor")
+ return;
+
+ if (!VTable) {
+ // FIXME: Handle this. What's most likely happening is we have an intro
+ // virtual in a derived class where the base also has an intro virtual.
+ // In this case the vtable lives in the base. What we really need is
+ // for each UDTLayoutBase to contain a list of all its vtables, and
+ // then propagate this list up the hierarchy so that derived classes have
+ // direct access to their bases' vtables.
+ return;
+ }
+
+ uint32_t Stride = VTable->getElementSize();
+
+ uint32_t Index = Func.getVirtualBaseOffset();
+ assert(Index % Stride == 0);
+ Index /= Stride;
+
+ VTable->setFunction(Index, Func);
+}
+
+VTableLayoutItem *UDTLayoutBase::findVTableAtOffset(uint32_t RelativeOffset) {
+ if (VTable && VTable->getOffsetInParent() == RelativeOffset)
+ return VTable;
+ for (auto Base : BaseClasses) {
+ uint32_t Begin = Base->getOffsetInParent();
+ uint32_t End = Begin + Base->getSize();
+ if (RelativeOffset < Begin || RelativeOffset >= End)
+ continue;
+
+ return Base->findVTableAtOffset(RelativeOffset - Begin);
+ }
+
+ return nullptr;
+}
+
+void UDTLayoutBase::addVirtualOverride(PDBSymbolFunc &Func) {
+ auto Signature = Func.getSignature();
+ auto ThisAdjust = Signature->getThisAdjust();
+ // ThisAdjust tells us which VTable we're looking for. Specifically, it's
+ // the offset into the current class of the VTable we're looking for. So
+ // look through the base hierarchy until we find one such that
+ // AbsoluteOffset(VT) == ThisAdjust
+ VTableLayoutItem *VT = findVTableAtOffset(ThisAdjust);
+ if (!VT) {
+ // FIXME: There really should be a vtable here. If there's not it probably
+ // means that the vtable is in a virtual base, which we don't yet support.
+ assert(!VirtualBases.empty());
+ return;
+ }
+ int32_t OverrideIndex = -1;
+ // Now we've found the VTable. Func will not have a virtual base offset set,
+ // so instead we need to compare names and signatures. We iterate each item
+  // in the VTable.  All items should already have non-null entries because
+  // they were initialized by the intro virtuals, which are guaranteed to come
+  // first.
+ for (auto ItemAndIndex : enumerate(VT->funcs())) {
+ auto Item = ItemAndIndex.value();
+ assert(Item);
+ // If the name doesn't match, this isn't an override. Note that it's ok
+    // for the return type not to match (e.g. covariant return).
+ if (Item->getName() != Func.getName()) {
+ if (Item->isDestructor() && Func.isDestructor()) {
+ OverrideIndex = ItemAndIndex.index();
+ break;
+ }
+ continue;
+ }
+ // Now make sure it's the right overload. Get the signature of the existing
+ // vtable method and make sure it has the same arglist and the same cv-ness.
+ auto ExistingSig = Item->getSignature();
+ if (ExistingSig->isConstType() != Signature->isConstType())
+ continue;
+ if (ExistingSig->isVolatileType() != Signature->isVolatileType())
+ continue;
+
+    // Now compare arguments.  Using the raw bytes of the PDB this would be
+    // trivial because there is an ArgListId and they should be identical.
+    // But DIA doesn't expose this, so the best we can do is iterate each
+    // argument and confirm that each one is identical.
+ if (ExistingSig->getCount() != Signature->getCount())
+ continue;
+ bool IsMatch = true;
+ auto ExistingEnumerator = ExistingSig->getArguments();
+ auto NewEnumerator = Signature->getArguments();
+ for (uint32_t I = 0; I < ExistingEnumerator->getChildCount(); ++I) {
+ auto ExistingArg = ExistingEnumerator->getNext();
+ auto NewArg = NewEnumerator->getNext();
+ if (ExistingArg->getSymIndexId() != NewArg->getSymIndexId()) {
+ IsMatch = false;
+ break;
+ }
+ }
+ if (!IsMatch)
+ continue;
+
+ // It's a match! Stick the new function into the VTable.
+ OverrideIndex = ItemAndIndex.index();
+ break;
+ }
+ if (OverrideIndex == -1) {
+ // FIXME: This is probably due to one of the other FIXMEs in this file.
+ return;
+ }
+ VT->setFunction(OverrideIndex, Func);
+}
+
+void UDTLayoutBase::addChildToLayout(std::unique_ptr<StorageItemBase> Child) {
+ uint32_t Begin = Child->getOffsetInParent();
+ uint32_t End = Begin + Child->getSize();
+ // Due to the empty base optimization, End might point outside the bounds of
+ // the parent class. If that happens, just clamp the value.
+ End = std::min(End, getClassSize());
+
+ UsedBytes.set(Begin, End);
+  // Use a separate index so Begin still holds the child's start offset when
+  // we compute the sorted insertion point below.
+  for (uint32_t Byte = Begin; Byte != End; ++Byte)
+    ChildrenPerByte[Byte].push_back(Child.get());
+
+ auto Loc = std::upper_bound(
+ ChildStorage.begin(), ChildStorage.end(), Begin,
+ [](uint32_t Off, const std::unique_ptr<StorageItemBase> &Item) {
+ return Off < Item->getOffsetInParent();
+ });
+
+ ChildStorage.insert(Loc, std::move(Child));
+} \ No newline at end of file
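
The padding queries above reduce to bit counting over UsedBytes. A minimal
standalone sketch of the same arithmetic, using llvm::BitVector directly
(shallowPadding and the 16-byte layout are illustrative, not part of this
patch):

#include "llvm/ADT/BitVector.h"
#include <cstdint>

// Shallow padding is the number of bytes of a record that no child (base,
// vtable pointer, or data member) claims.
static uint32_t shallowPadding(const llvm::BitVector &UsedBytes) {
  return UsedBytes.size() - UsedBytes.count();
}

int main() {
  // Hypothetical 16-byte struct: an 8-byte member at offset 0 and a 4-byte
  // member at offset 12 leave bytes 8..11 as padding.
  llvm::BitVector UsedBytes(16);
  UsedBytes.set(0, 8);   // marks [0, 8)
  UsedBytes.set(12, 16); // marks [12, 16)
  return shallowPadding(UsedBytes) == 4 ? 0 : 1;
}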
diff --git a/lib/DebugInfo/Symbolize/DIPrinter.cpp b/lib/DebugInfo/Symbolize/DIPrinter.cpp
index be5c603a38ef..c1e2536d6e20 100644
--- a/lib/DebugInfo/Symbolize/DIPrinter.cpp
+++ b/lib/DebugInfo/Symbolize/DIPrinter.cpp
@@ -78,8 +78,18 @@ void DIPrinter::print(const DILineInfo &Info, bool Inlined) {
std::string Filename = Info.FileName;
if (Filename == kDILineInfoBadString)
Filename = kBadString;
- OS << Filename << ":" << Info.Line << ":" << Info.Column << "\n";
- printContext(Filename, Info.Line);
+ if (!Verbose) {
+ OS << Filename << ":" << Info.Line << ":" << Info.Column << "\n";
+ printContext(Filename, Info.Line);
+ return;
+ }
+ OS << " Filename: " << Filename << "\n";
+ if (Info.StartLine)
+ OS << "Function start line: " << Info.StartLine << "\n";
+ OS << " Line: " << Info.Line << "\n";
+ OS << " Column: " << Info.Column << "\n";
+ if (Info.Discriminator)
+ OS << " Discriminator: " << Info.Discriminator << "\n";
}
DIPrinter &DIPrinter::operator<<(const DILineInfo &Info) {
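
For reference, the verbose branch above prints one labelled field per line.
For a hypothetical location (foo.cpp, function starting at line 10, hit at
12:5, no discriminator) it would emit:

  Filename: foo.cpp
Function start line: 10
  Line: 12
  Column: 5

The StartLine and Discriminator lines are printed only when nonzero.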
diff --git a/lib/DebugInfo/Symbolize/SymbolizableObjectFile.cpp b/lib/DebugInfo/Symbolize/SymbolizableObjectFile.cpp
index f6940080089f..f672680cb9ea 100644
--- a/lib/DebugInfo/Symbolize/SymbolizableObjectFile.cpp
+++ b/lib/DebugInfo/Symbolize/SymbolizableObjectFile.cpp
@@ -1,4 +1,4 @@
-//===-- SymbolizableObjectFile.cpp ----------------------------------------===//
+//===- SymbolizableObjectFile.cpp -----------------------------------------===//
//
// The LLVM Compiler Infrastructure
//
@@ -12,15 +12,29 @@
//===----------------------------------------------------------------------===//
#include "SymbolizableObjectFile.h"
+#include "llvm/ADT/STLExtras.h"
+#include "llvm/ADT/StringRef.h"
+#include "llvm/ADT/Triple.h"
+#include "llvm/DebugInfo/DWARF/DWARFContext.h"
+#include "llvm/DebugInfo/Symbolize/SymbolizableModule.h"
#include "llvm/Object/COFF.h"
+#include "llvm/Object/ObjectFile.h"
#include "llvm/Object/SymbolSize.h"
+#include "llvm/Support/COFF.h"
+#include "llvm/Support/Casting.h"
#include "llvm/Support/DataExtractor.h"
-#include "llvm/DebugInfo/DWARF/DWARFContext.h"
-
-namespace llvm {
-namespace symbolize {
+#include "llvm/Support/Error.h"
+#include <algorithm>
+#include <cstdint>
+#include <memory>
+#include <string>
+#include <system_error>
+#include <utility>
+#include <vector>
+using namespace llvm;
using namespace object;
+using namespace symbolize;
static DILineInfoSpecifier
getDILineInfoSpecifier(FunctionNameKind FNKind) {
@@ -73,14 +87,17 @@ SymbolizableObjectFile::SymbolizableObjectFile(ObjectFile *Obj,
: Module(Obj), DebugInfoContext(std::move(DICtx)) {}
namespace {
+
struct OffsetNamePair {
uint32_t Offset;
StringRef Name;
+
bool operator<(const OffsetNamePair &R) const {
return Offset < R.Offset;
}
};
-}
+
+} // end anonymous namespace
std::error_code SymbolizableObjectFile::addCoffExportSymbols(
const COFFObjectFile *CoffObj) {
@@ -147,7 +164,7 @@ std::error_code SymbolizableObjectFile::addSymbol(const SymbolRef &Symbol,
return errorToErrorCode(SymbolNameOrErr.takeError());
StringRef SymbolName = *SymbolNameOrErr;
// Mach-O symbol table names have leading underscore, skip it.
- if (Module->isMachO() && SymbolName.size() > 0 && SymbolName[0] == '_')
+ if (Module->isMachO() && !SymbolName.empty() && SymbolName[0] == '_')
SymbolName = SymbolName.drop_front();
// FIXME: If a function has alias, there are two entries in symbol table
// with same address size. Make sure we choose the correct one.
@@ -252,7 +269,3 @@ DIGlobal SymbolizableObjectFile::symbolizeData(uint64_t ModuleOffset) const {
Res.Size);
return Res;
}
-
-} // namespace symbolize
-} // namespace llvm
-
diff --git a/lib/DebugInfo/Symbolize/SymbolizableObjectFile.h b/lib/DebugInfo/Symbolize/SymbolizableObjectFile.h
index 8583b6a36e63..216cca8de4f5 100644
--- a/lib/DebugInfo/Symbolize/SymbolizableObjectFile.h
+++ b/lib/DebugInfo/Symbolize/SymbolizableObjectFile.h
@@ -1,4 +1,4 @@
-//===-- SymbolizableObjectFile.h -------------------------------- C++ -----===//
+//===- SymbolizableObjectFile.h ---------------------------------*- C++ -*-===//
//
// The LLVM Compiler Infrastructure
//
@@ -13,14 +13,20 @@
#ifndef LLVM_LIB_DEBUGINFO_SYMBOLIZE_SYMBOLIZABLEOBJECTFILE_H
#define LLVM_LIB_DEBUGINFO_SYMBOLIZE_SYMBOLIZABLEOBJECTFILE_H
+#include "llvm/ADT/StringRef.h"
+#include "llvm/DebugInfo/DIContext.h"
#include "llvm/DebugInfo/Symbolize/SymbolizableModule.h"
+#include "llvm/Support/ErrorOr.h"
+#include <cstdint>
#include <map>
+#include <memory>
+#include <string>
+#include <system_error>
namespace llvm {
+
class DataExtractor;
-}
-namespace llvm {
namespace symbolize {
class SymbolizableObjectFile : public SymbolizableModule {
@@ -65,6 +71,7 @@ private:
// If size is 0, assume that symbol occupies the whole memory range up to
// the following symbol.
uint64_t Size;
+
friend bool operator<(const SymbolDesc &s1, const SymbolDesc &s2) {
return s1.Addr < s2.Addr;
}
@@ -76,7 +83,8 @@ private:
std::unique_ptr<DIContext> DICtx);
};
-} // namespace symbolize
-} // namespace llvm
+} // end namespace symbolize
+
+} // end namespace llvm
-#endif // LLVM_LIB_DEBUGINFO_SYMBOLIZE_SYMBOLIZABLEOBJECTFILE_H
+#endif // LLVM_LIB_DEBUGINFO_SYMBOLIZE_SYMBOLIZABLEOBJECTFILE_H
diff --git a/lib/Demangle/ItaniumDemangle.cpp b/lib/Demangle/ItaniumDemangle.cpp
index 097b6ca2e083..49dbe74d25df 100644
--- a/lib/Demangle/ItaniumDemangle.cpp
+++ b/lib/Demangle/ItaniumDemangle.cpp
@@ -36,6 +36,12 @@ enum {
success
};
+enum {
+ CV_const = (1 << 0),
+ CV_volatile = (1 << 1),
+ CV_restrict = (1 << 2),
+};
+
template <class C>
static const char *parse_type(const char *first, const char *last, C &db);
template <class C>
@@ -436,15 +442,15 @@ static const char *parse_cv_qualifiers(const char *first, const char *last,
cv = 0;
if (first != last) {
if (*first == 'r') {
- cv |= 4;
+ cv |= CV_restrict;
++first;
}
if (*first == 'V') {
- cv |= 2;
+ cv |= CV_volatile;
++first;
}
if (*first == 'K') {
- cv |= 1;
+ cv |= CV_const;
++first;
}
}
@@ -1396,7 +1402,8 @@ static const char *parse_function_type(const char *first, const char *last,
int ref_qual = 0;
while (true) {
if (t == last) {
- db.names.pop_back();
+ if (!db.names.empty())
+ db.names.pop_back();
return first;
}
if (*t == 'E') {
@@ -1663,27 +1670,30 @@ static const char *parse_type(const char *first, const char *last, C &db) {
db.subs.emplace_back();
for (size_t k = k0; k < k1; ++k) {
if (is_function) {
- size_t p = db.names[k].second.size();
- if (db.names[k].second[p - 2] == '&')
- p -= 3;
- else if (db.names[k].second.back() == '&')
+ auto &name = db.names[k].second;
+ size_t p = name.size();
+
+ if (name[p - 2] == '&' && name[p - 1] == '&')
p -= 2;
- if (cv & 1) {
- db.names[k].second.insert(p, " const");
+ else if (name.back() == '&')
+ p -= 1;
+
+ if (cv & CV_const) {
+ name.insert(p, " const");
p += 6;
}
- if (cv & 2) {
- db.names[k].second.insert(p, " volatile");
+ if (cv & CV_volatile) {
+ name.insert(p, " volatile");
p += 9;
}
- if (cv & 4)
- db.names[k].second.insert(p, " restrict");
+ if (cv & CV_restrict)
+ name.insert(p, " restrict");
} else {
- if (cv & 1)
+ if (cv & CV_const)
db.names[k].first.append(" const");
- if (cv & 2)
+ if (cv & CV_volatile)
db.names[k].first.append(" volatile");
- if (cv & 4)
+ if (cv & CV_restrict)
db.names[k].first.append(" restrict");
}
db.subs.back().push_back(db.names[k]);
@@ -3826,6 +3836,8 @@ static const char *parse_call_offset(const char *first, const char *last) {
// ::= GV <object name> # Guard variable for one-time
// initialization
// # No <type>
+// ::= TW <object name> # Thread-local wrapper
+// ::= TH <object name> # Thread-local initialization
// extension ::= TC <first type> <number> _ <second type> # construction
// vtable for second-in-first
// extension ::= GR <object name> # reference temporary for object
@@ -3919,6 +3931,27 @@ static const char *parse_special_name(const char *first, const char *last,
}
}
break;
+ case 'W':
+ // TW <object name> # Thread-local wrapper
+ t = parse_name(first + 2, last, db);
+ if (t != first + 2) {
+ if (db.names.empty())
+ return first;
+ db.names.back().first.insert(0, "thread-local wrapper routine for ");
+ first = t;
+ }
+ break;
+ case 'H':
+ // TH <object name> # Thread-local initialization
+ t = parse_name(first + 2, last, db);
+ if (t != first + 2) {
+ if (db.names.empty())
+ return first;
+ db.names.back().first.insert(
+ 0, "thread-local initialization routine for ");
+ first = t;
+ }
+ break;
default:
// T <call-offset> <base encoding>
{
@@ -4074,11 +4107,11 @@ static const char *parse_encoding(const char *first, const char *last, C &db) {
if (db.names.empty())
return first;
db.names.back().first += ')';
- if (cv & 1)
+ if (cv & CV_const)
db.names.back().first.append(" const");
- if (cv & 2)
+ if (cv & CV_volatile)
db.names.back().first.append(" volatile");
- if (cv & 4)
+ if (cv & CV_restrict)
db.names.back().first.append(" restrict");
if (ref == 1)
db.names.back().first.append(" &");
@@ -4225,20 +4258,11 @@ char *llvm::itaniumDemangle(const char *mangled_name, char *buf, size_t *n,
*status = invalid_args;
return nullptr;
}
-
- size_t len = std::strlen(mangled_name);
- if (len < 2 || strncmp(mangled_name, "_Z", 2)) {
- if (len < 4 || strncmp(mangled_name, "___Z", 4)) {
- if (status)
- *status = invalid_mangled_name;
- return nullptr;
- }
- }
-
size_t internal_size = buf != nullptr ? *n : 0;
Db db;
db.template_param.emplace_back();
int internal_status = success;
+ size_t len = std::strlen(mangled_name);
demangle(mangled_name, mangled_name + len, db, internal_status);
if (internal_status == success && db.fix_forward_references &&
!db.template_param.empty() && !db.template_param.front().empty()) {
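
The new 'TW'/'TH' cases can be exercised through the public entry point. A
small sketch, assuming the usual cxa_demangle-style contract (status 0 on
success, caller frees the returned buffer); _ZTW1x is the thread-local
wrapper for a variable x:

#include "llvm/Demangle/Demangle.h"
#include <cstdio>
#include <cstdlib>

int main() {
  int Status = 0;
  // With this patch the 'TW' special name is recognized.
  char *Demangled = llvm::itaniumDemangle("_ZTW1x", nullptr, nullptr, &Status);
  if (Status == 0 && Demangled)
    std::printf("%s\n", Demangled); // thread-local wrapper routine for x
  std::free(Demangled);
  return 0;
}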
diff --git a/lib/ExecutionEngine/ExecutionEngine.cpp b/lib/ExecutionEngine/ExecutionEngine.cpp
index b4bed325f491..2ee72f9a8c16 100644
--- a/lib/ExecutionEngine/ExecutionEngine.cpp
+++ b/lib/ExecutionEngine/ExecutionEngine.cpp
@@ -515,7 +515,7 @@ ExecutionEngine *EngineBuilder::create(TargetMachine *TM) {
// to the function tells DynamicLibrary to load the program, not a library.
if (sys::DynamicLibrary::LoadLibraryPermanently(nullptr, ErrorStr))
return nullptr;
-
+
// If the user specified a memory manager but didn't specify which engine to
// create, we assume they only want the JIT, and we fail if they only want
// the interpreter.
@@ -616,7 +616,7 @@ GenericValue ExecutionEngine::getConstantValue(const Constant *C) {
for (unsigned int i = 0; i < elemNum; ++i) {
Type *ElemTy = STy->getElementType(i);
if (ElemTy->isIntegerTy())
- Result.AggregateVal[i].IntVal =
+ Result.AggregateVal[i].IntVal =
APInt(ElemTy->getPrimitiveSizeInBits(), 0);
else if (ElemTy->isAggregateType()) {
const Constant *ElemUndef = UndefValue::get(ElemTy);
@@ -727,7 +727,7 @@ GenericValue ExecutionEngine::getConstantValue(const Constant *C) {
APFloat apf = APFloat(APFloat::x87DoubleExtended(), GV.IntVal);
uint64_t v;
bool ignored;
- (void)apf.convertToInteger(&v, BitWidth,
+ (void)apf.convertToInteger(makeMutableArrayRef(v), BitWidth,
CE->getOpcode()==Instruction::FPToSI,
APFloat::rmTowardZero, &ignored);
GV.IntVal = v; // endian?
@@ -979,7 +979,7 @@ GenericValue ExecutionEngine::getConstantValue(const Constant *C) {
// Check if vector holds integers.
if (ElemTy->isIntegerTy()) {
if (CAZ) {
- GenericValue intZero;
+ GenericValue intZero;
intZero.IntVal = APInt(ElemTy->getScalarSizeInBits(), 0ull);
std::fill(Result.AggregateVal.begin(), Result.AggregateVal.end(),
intZero);
@@ -1079,7 +1079,7 @@ void ExecutionEngine::StoreValueToMemory(const GenericValue &Val,
*(((float*)Ptr)+i) = Val.AggregateVal[i].FloatVal;
if (cast<VectorType>(Ty)->getElementType()->isIntegerTy()) {
unsigned numOfBytes =(Val.AggregateVal[i].IntVal.getBitWidth()+7)/8;
- StoreIntToMemory(Val.AggregateVal[i].IntVal,
+ StoreIntToMemory(Val.AggregateVal[i].IntVal,
(uint8_t*)Ptr + numOfBytes*i, numOfBytes);
}
}
@@ -1186,7 +1186,7 @@ void ExecutionEngine::InitializeMemory(const Constant *Init, void *Addr) {
DEBUG(Init->dump());
if (isa<UndefValue>(Init))
return;
-
+
if (const ConstantVector *CP = dyn_cast<ConstantVector>(Init)) {
unsigned ElementSize =
getDataLayout().getTypeAllocSize(CP->getType()->getElementType());
@@ -1194,12 +1194,12 @@ void ExecutionEngine::InitializeMemory(const Constant *Init, void *Addr) {
InitializeMemory(CP->getOperand(i), (char*)Addr+i*ElementSize);
return;
}
-
+
if (isa<ConstantAggregateZero>(Init)) {
memset(Addr, 0, (size_t)getDataLayout().getTypeAllocSize(Init->getType()));
return;
}
-
+
if (const ConstantArray *CPA = dyn_cast<ConstantArray>(Init)) {
unsigned ElementSize =
getDataLayout().getTypeAllocSize(CPA->getType()->getElementType());
@@ -1207,7 +1207,7 @@ void ExecutionEngine::InitializeMemory(const Constant *Init, void *Addr) {
InitializeMemory(CPA->getOperand(i), (char*)Addr+i*ElementSize);
return;
}
-
+
if (const ConstantStruct *CPS = dyn_cast<ConstantStruct>(Init)) {
const StructLayout *SL =
getDataLayout().getStructLayout(cast<StructType>(CPS->getType()));
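
The convertToInteger change above tracks an APFloat API that now takes a
MutableArrayRef<integerPart> rather than a raw pointer; makeMutableArrayRef(v)
wraps the single uint64_t as a one-element buffer. The wrapper idiom in
isolation (toy example, unrelated to the interpreter):

#include "llvm/ADT/ArrayRef.h"
#include <cstdint>

int main() {
  uint64_t V = 0;
  // One-element mutable view over V: data() == &V, size() == 1.
  llvm::MutableArrayRef<uint64_t> Ref = llvm::makeMutableArrayRef(V);
  Ref[0] = 42;
  return V == 42 ? 0 : 1;
}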
diff --git a/lib/ExecutionEngine/ExecutionEngineBindings.cpp b/lib/ExecutionEngine/ExecutionEngineBindings.cpp
index 1d7c6e714ed0..e956dbebaffe 100644
--- a/lib/ExecutionEngine/ExecutionEngineBindings.cpp
+++ b/lib/ExecutionEngine/ExecutionEngineBindings.cpp
@@ -188,7 +188,7 @@ LLVMBool LLVMCreateMCJITCompilerForModule(
for (auto &F : *Mod) {
auto Attrs = F.getAttributes();
StringRef Value(options.NoFramePointerElim ? "true" : "false");
- Attrs = Attrs.addAttribute(F.getContext(), AttributeSet::FunctionIndex,
+ Attrs = Attrs.addAttribute(F.getContext(), AttributeList::FunctionIndex,
"no-frame-pointer-elim", Value);
F.setAttributes(Attrs);
}
diff --git a/lib/ExecutionEngine/IntelJITEvents/CMakeLists.txt b/lib/ExecutionEngine/IntelJITEvents/CMakeLists.txt
index 3b8c4b973e68..e6c33b2ecc2a 100644
--- a/lib/ExecutionEngine/IntelJITEvents/CMakeLists.txt
+++ b/lib/ExecutionEngine/IntelJITEvents/CMakeLists.txt
@@ -4,7 +4,7 @@ if( HAVE_LIBDL )
set(LLVM_INTEL_JIT_LIBS ${CMAKE_DL_LIBS})
endif()
-set(LLVM_INTEL_JIT_LIBS ${PTHREAD_LIB} ${LLVM_INTEL_JIT_LIBS})
+set(LLVM_INTEL_JIT_LIBS ${LLVM_PTHREAD_LIB} ${LLVM_INTEL_JIT_LIBS})
add_llvm_library(LLVMIntelJITEvents
diff --git a/lib/ExecutionEngine/Interpreter/Execution.cpp b/lib/ExecutionEngine/Interpreter/Execution.cpp
index 923f6e7147db..e29e9fc2c702 100644
--- a/lib/ExecutionEngine/Interpreter/Execution.cpp
+++ b/lib/ExecutionEngine/Interpreter/Execution.cpp
@@ -899,10 +899,10 @@ void Interpreter::visitSwitchInst(SwitchInst &I) {
// Check to see if any of the cases match...
BasicBlock *Dest = nullptr;
- for (SwitchInst::CaseIt i = I.case_begin(), e = I.case_end(); i != e; ++i) {
- GenericValue CaseVal = getOperandValue(i.getCaseValue(), SF);
+ for (auto Case : I.cases()) {
+ GenericValue CaseVal = getOperandValue(Case.getCaseValue(), SF);
if (executeICMP_EQ(CondVal, CaseVal, ElTy).IntVal != 0) {
- Dest = cast<BasicBlock>(i.getCaseSuccessor());
+ Dest = cast<BasicBlock>(Case.getCaseSuccessor());
break;
}
}
diff --git a/lib/ExecutionEngine/Orc/CMakeLists.txt b/lib/ExecutionEngine/Orc/CMakeLists.txt
index 685e882e4a83..f83e002c758f 100644
--- a/lib/ExecutionEngine/Orc/CMakeLists.txt
+++ b/lib/ExecutionEngine/Orc/CMakeLists.txt
@@ -6,6 +6,7 @@ add_llvm_library(LLVMOrcJIT
OrcCBindings.cpp
OrcError.cpp
OrcMCJITReplacement.cpp
+ RPCUtils.cpp
ADDITIONAL_HEADER_DIRS
${LLVM_MAIN_INCLUDE_DIR}/llvm/ExecutionEngine/Orc
diff --git a/lib/ExecutionEngine/Orc/OrcCBindingsStack.h b/lib/ExecutionEngine/Orc/OrcCBindingsStack.h
index a74fae775ac4..a79dd844bf4f 100644
--- a/lib/ExecutionEngine/Orc/OrcCBindingsStack.h
+++ b/lib/ExecutionEngine/Orc/OrcCBindingsStack.h
@@ -16,7 +16,7 @@
#include "llvm/ExecutionEngine/Orc/CompileUtils.h"
#include "llvm/ExecutionEngine/Orc/ExecutionUtils.h"
#include "llvm/ExecutionEngine/Orc/IRCompileLayer.h"
-#include "llvm/ExecutionEngine/Orc/ObjectLinkingLayer.h"
+#include "llvm/ExecutionEngine/Orc/RTDyldObjectLinkingLayer.h"
#include "llvm/IR/LLVMContext.h"
#include "llvm/Support/Error.h"
@@ -30,7 +30,7 @@ DEFINE_SIMPLE_CONVERSION_FUNCTIONS(TargetMachine, LLVMTargetMachineRef)
class OrcCBindingsStack {
public:
typedef orc::JITCompileCallbackManager CompileCallbackMgr;
- typedef orc::ObjectLinkingLayer<> ObjLayerT;
+ typedef orc::RTDyldObjectLinkingLayer<> ObjLayerT;
typedef orc::IRCompileLayer<ObjLayerT> CompileLayerT;
typedef orc::CompileOnDemandLayer<CompileLayerT, CompileCallbackMgr>
CODLayerT;
diff --git a/lib/ExecutionEngine/Orc/OrcError.cpp b/lib/ExecutionEngine/Orc/OrcError.cpp
index c531fe369920..9e70c4ac1dbf 100644
--- a/lib/ExecutionEngine/Orc/OrcError.cpp
+++ b/lib/ExecutionEngine/Orc/OrcError.cpp
@@ -39,14 +39,19 @@ public:
return "Remote indirect stubs owner does not exist";
case OrcErrorCode::RemoteIndirectStubsOwnerIdAlreadyInUse:
return "Remote indirect stubs owner Id already in use";
+ case OrcErrorCode::RPCConnectionClosed:
+ return "RPC connection closed";
+ case OrcErrorCode::RPCCouldNotNegotiateFunction:
+ return "Could not negotiate RPC function";
case OrcErrorCode::RPCResponseAbandoned:
return "RPC response abandoned";
case OrcErrorCode::UnexpectedRPCCall:
return "Unexpected RPC call";
case OrcErrorCode::UnexpectedRPCResponse:
return "Unexpected RPC response";
- case OrcErrorCode::UnknownRPCFunction:
- return "Unknown RPC function";
+ case OrcErrorCode::UnknownErrorCodeFromRemote:
+ return "Unknown error returned from remote RPC function "
+ "(Use StringError to get error message)";
}
llvm_unreachable("Unhandled error code");
}
@@ -58,10 +63,10 @@ static ManagedStatic<OrcErrorCategory> OrcErrCat;
namespace llvm {
namespace orc {
-Error orcError(OrcErrorCode ErrCode) {
+std::error_code orcError(OrcErrorCode ErrCode) {
typedef std::underlying_type<OrcErrorCode>::type UT;
- return errorCodeToError(
- std::error_code(static_cast<UT>(ErrCode), *OrcErrCat));
+ return std::error_code(static_cast<UT>(ErrCode), *OrcErrCat);
}
+
}
}
diff --git a/lib/ExecutionEngine/Orc/OrcMCJITReplacement.h b/lib/ExecutionEngine/Orc/OrcMCJITReplacement.h
index af70960a1f92..a5100a56bcf1 100644
--- a/lib/ExecutionEngine/Orc/OrcMCJITReplacement.h
+++ b/lib/ExecutionEngine/Orc/OrcMCJITReplacement.h
@@ -24,7 +24,7 @@
#include "llvm/ExecutionEngine/Orc/CompileUtils.h"
#include "llvm/ExecutionEngine/Orc/IRCompileLayer.h"
#include "llvm/ExecutionEngine/Orc/LazyEmittingLayer.h"
-#include "llvm/ExecutionEngine/Orc/ObjectLinkingLayer.h"
+#include "llvm/ExecutionEngine/Orc/RTDyldObjectLinkingLayer.h"
#include "llvm/IR/Function.h"
#include "llvm/IR/Mangler.h"
#include "llvm/Object/Archive.h"
@@ -315,7 +315,7 @@ private:
NotifyObjectLoadedT(OrcMCJITReplacement &M) : M(M) {}
template <typename ObjListT>
- void operator()(ObjectLinkingLayerBase::ObjSetHandleT H,
+ void operator()(RTDyldObjectLinkingLayerBase::ObjSetHandleT H,
const ObjListT &Objects,
const LoadedObjInfoListT &Infos) const {
M.UnfinalizedSections[H] = std::move(M.SectionsAllocatedSinceLastLoad);
@@ -344,7 +344,7 @@ private:
public:
NotifyFinalizedT(OrcMCJITReplacement &M) : M(M) {}
- void operator()(ObjectLinkingLayerBase::ObjSetHandleT H) {
+ void operator()(RTDyldObjectLinkingLayerBase::ObjSetHandleT H) {
M.UnfinalizedSections.erase(H);
}
@@ -361,7 +361,7 @@ private:
return MangledName;
}
- typedef ObjectLinkingLayer<NotifyObjectLoadedT> ObjectLayerT;
+ typedef RTDyldObjectLinkingLayer<NotifyObjectLoadedT> ObjectLayerT;
typedef IRCompileLayer<ObjectLayerT> CompileLayerT;
typedef LazyEmittingLayer<CompileLayerT> LazyEmitLayerT;
diff --git a/lib/ExecutionEngine/Orc/RPCUtils.cpp b/lib/ExecutionEngine/Orc/RPCUtils.cpp
new file mode 100644
index 000000000000..2a7ab5ca8180
--- /dev/null
+++ b/lib/ExecutionEngine/Orc/RPCUtils.cpp
@@ -0,0 +1,55 @@
+//===--------------- RPCUtils.cpp - RPCUtils implementation ---------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// RPCUtils implementation.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/ExecutionEngine/Orc/RPCUtils.h"
+
+char llvm::orc::rpc::RPCFatalError::ID = 0;
+char llvm::orc::rpc::ConnectionClosed::ID = 0;
+char llvm::orc::rpc::ResponseAbandoned::ID = 0;
+char llvm::orc::rpc::CouldNotNegotiate::ID = 0;
+
+namespace llvm {
+namespace orc {
+namespace rpc {
+
+std::error_code ConnectionClosed::convertToErrorCode() const {
+ return orcError(OrcErrorCode::RPCConnectionClosed);
+}
+
+void ConnectionClosed::log(raw_ostream &OS) const {
+ OS << "RPC connection already closed";
+}
+
+std::error_code ResponseAbandoned::convertToErrorCode() const {
+ return orcError(OrcErrorCode::RPCResponseAbandoned);
+}
+
+void ResponseAbandoned::log(raw_ostream &OS) const {
+ OS << "RPC response abandoned";
+}
+
+CouldNotNegotiate::CouldNotNegotiate(std::string Signature)
+ : Signature(std::move(Signature)) {}
+
+std::error_code CouldNotNegotiate::convertToErrorCode() const {
+ return orcError(OrcErrorCode::RPCCouldNotNegotiateFunction);
+}
+
+void CouldNotNegotiate::log(raw_ostream &OS) const {
+ OS << "Could not negotiate RPC function " << Signature;
+}
+
+} // end namespace rpc
+} // end namespace orc
+} // end namespace llvm
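
The three error classes above follow the standard llvm::ErrorInfo recipe: a
static ID whose address identifies the type at runtime, a log() override, and
convertToErrorCode() for interop with std::error_code. A condensed sketch with
a hypothetical class name:

#include "llvm/Support/Error.h"

using namespace llvm;

class MyRPCError : public ErrorInfo<MyRPCError> {
public:
  static char ID; // address serves as the runtime type id
  void log(raw_ostream &OS) const override { OS << "my RPC error"; }
  std::error_code convertToErrorCode() const override {
    return inconvertibleErrorCode();
  }
};

char MyRPCError::ID = 0;

// Usage: Error E = make_error<MyRPCError>();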
diff --git a/lib/ExecutionEngine/RuntimeDyld/RuntimeDyld.cpp b/lib/ExecutionEngine/RuntimeDyld/RuntimeDyld.cpp
index 63b56f725209..df9d2ceba329 100644
--- a/lib/ExecutionEngine/RuntimeDyld/RuntimeDyld.cpp
+++ b/lib/ExecutionEngine/RuntimeDyld/RuntimeDyld.cpp
@@ -443,7 +443,7 @@ Error RuntimeDyldImpl::computeTotalAllocSize(const ObjectFile &Obj,
SI != SE; ++SI) {
const SectionRef &Section = *SI;
- bool IsRequired = isRequiredForExecution(Section);
+ bool IsRequired = isRequiredForExecution(Section) || ProcessAllSections;
// Consider only the sections that are required to be loaded for execution
if (IsRequired) {
@@ -484,6 +484,14 @@ Error RuntimeDyldImpl::computeTotalAllocSize(const ObjectFile &Obj,
}
}
+  // Compute the Global Offset Table size.  If it is not zero, we also update
+  // the alignment, which is equal to the size of a single GOT entry.
+ if (unsigned GotSize = computeGOTSize(Obj)) {
+ RWSectionSizes.push_back(GotSize);
+ RWDataAlign = std::max<uint32_t>(RWDataAlign, getGOTEntrySize());
+ }
+
// Compute the size of all common symbols
uint64_t CommonSize = 0;
uint32_t CommonAlign = 1;
@@ -518,6 +526,24 @@ Error RuntimeDyldImpl::computeTotalAllocSize(const ObjectFile &Obj,
return Error::success();
}
+// compute GOT size
+unsigned RuntimeDyldImpl::computeGOTSize(const ObjectFile &Obj) {
+ size_t GotEntrySize = getGOTEntrySize();
+ if (!GotEntrySize)
+ return 0;
+
+ size_t GotSize = 0;
+ for (section_iterator SI = Obj.section_begin(), SE = Obj.section_end();
+ SI != SE; ++SI) {
+ for (const RelocationRef &Reloc : SI->relocations())
+ if (relocationNeedsGot(Reloc))
+ GotSize += GotEntrySize;
+ }
+
+ return GotSize;
+}
+
// compute stub buffer size for the given section
unsigned RuntimeDyldImpl::computeSectionStubBufSize(const ObjectFile &Obj,
const SectionRef &Section) {
@@ -677,7 +703,7 @@ RuntimeDyldImpl::emitSection(const ObjectFile &Obj,
unsigned Alignment = (unsigned)Alignment64 & 0xffffffffL;
unsigned PaddingSize = 0;
unsigned StubBufSize = 0;
- bool IsRequired = isRequiredForExecution(Section);
+ bool IsRequired = isRequiredForExecution(Section) || ProcessAllSections;
bool IsVirtual = Section.isVirtual();
bool IsZeroInit = isZeroInit(Section);
bool IsReadOnly = isReadOnlyData(Section);
diff --git a/lib/ExecutionEngine/RuntimeDyld/RuntimeDyldELF.cpp b/lib/ExecutionEngine/RuntimeDyld/RuntimeDyldELF.cpp
index 05615d3cc6cf..f780137d0874 100644
--- a/lib/ExecutionEngine/RuntimeDyld/RuntimeDyldELF.cpp
+++ b/lib/ExecutionEngine/RuntimeDyld/RuntimeDyldELF.cpp
@@ -272,6 +272,8 @@ void RuntimeDyldELF::resolveX86_64Relocation(const SectionEntry &Section,
default:
llvm_unreachable("Relocation type not implemented yet!");
break;
+ case ELF::R_X86_64_NONE:
+ break;
case ELF::R_X86_64_64: {
support::ulittle64_t::ref(Section.getAddressWithOffset(Offset)) =
Value + Addend;
@@ -419,6 +421,18 @@ void RuntimeDyldELF::resolveAArch64Relocation(const SectionEntry &Section,
// from bits 11:0 of X
or32AArch64Imm(TargetPtr, Value + Addend);
break;
+ case ELF::R_AARCH64_LDST8_ABS_LO12_NC:
+ // Operation: S + A
+ // Immediate goes in bits 21:10 of LD/ST instruction, taken
+ // from bits 11:0 of X
+ or32AArch64Imm(TargetPtr, getBits(Value + Addend, 0, 11));
+ break;
+ case ELF::R_AARCH64_LDST16_ABS_LO12_NC:
+ // Operation: S + A
+ // Immediate goes in bits 21:10 of LD/ST instruction, taken
+ // from bits 11:1 of X
+ or32AArch64Imm(TargetPtr, getBits(Value + Addend, 1, 11));
+ break;
case ELF::R_AARCH64_LDST32_ABS_LO12_NC:
// Operation: S + A
// Immediate goes in bits 21:10 of LD/ST instruction, taken
@@ -431,6 +445,12 @@ void RuntimeDyldELF::resolveAArch64Relocation(const SectionEntry &Section,
// from bits 11:3 of X
or32AArch64Imm(TargetPtr, getBits(Value + Addend, 3, 11));
break;
+ case ELF::R_AARCH64_LDST128_ABS_LO12_NC:
+ // Operation: S + A
+ // Immediate goes in bits 21:10 of LD/ST instruction, taken
+ // from bits 11:4 of X
+ or32AArch64Imm(TargetPtr, getBits(Value + Addend, 4, 11));
+ break;
}
}
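
The LDST8/16/128 cases above differ only in how many low bits of the target
address are implied by the access size: the scaled 12-bit immediate drops 1
bit for 2-byte accesses, 4 bits for 16-byte accesses, and so on. A sketch of
the bit extraction, mirroring the inclusive bit ranges named in the comments
(getBitsSketch is illustrative, not the helper used above):

#include <cassert>
#include <cstdint>

// Bits [Low, High] of Val, inclusive.
static uint64_t getBitsSketch(uint64_t Val, unsigned Low, unsigned High) {
  return (Val >> Low) & ((1ULL << (High - Low + 1)) - 1);
}

int main() {
  // A 16-byte access at byte offset 0x250 encodes 0x250 >> 4 == 0x25.
  assert(getBitsSketch(0x250, 4, 11) == 0x25);
  return 0;
}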
@@ -900,7 +920,7 @@ uint32_t RuntimeDyldELF::getMatchingLoRelocation(uint32_t RelType,
}
// Sometimes we don't need to create thunk for a branch.
-// This typically happens when branch target is located
+// This typically happens when branch target is located
// in the same object file. In such case target is either
// a weak symbol or symbol in a different executable section.
// This function checks if branch target is located in the
@@ -941,6 +961,61 @@ bool RuntimeDyldELF::resolveAArch64ShortBranch(
return true;
}
+void RuntimeDyldELF::resolveAArch64Branch(unsigned SectionID,
+ const RelocationValueRef &Value,
+ relocation_iterator RelI,
+ StubMap &Stubs) {
+
+ DEBUG(dbgs() << "\t\tThis is an AArch64 branch relocation.");
+ SectionEntry &Section = Sections[SectionID];
+
+ uint64_t Offset = RelI->getOffset();
+ unsigned RelType = RelI->getType();
+ // Look for an existing stub.
+ StubMap::const_iterator i = Stubs.find(Value);
+ if (i != Stubs.end()) {
+ resolveRelocation(Section, Offset,
+ (uint64_t)Section.getAddressWithOffset(i->second),
+ RelType, 0);
+ DEBUG(dbgs() << " Stub function found\n");
+ } else if (!resolveAArch64ShortBranch(SectionID, RelI, Value)) {
+ // Create a new stub function.
+ DEBUG(dbgs() << " Create a new stub function\n");
+ Stubs[Value] = Section.getStubOffset();
+ uint8_t *StubTargetAddr = createStubFunction(
+ Section.getAddressWithOffset(Section.getStubOffset()));
+
+ RelocationEntry REmovz_g3(SectionID, StubTargetAddr - Section.getAddress(),
+ ELF::R_AARCH64_MOVW_UABS_G3, Value.Addend);
+ RelocationEntry REmovk_g2(SectionID,
+ StubTargetAddr - Section.getAddress() + 4,
+ ELF::R_AARCH64_MOVW_UABS_G2_NC, Value.Addend);
+ RelocationEntry REmovk_g1(SectionID,
+ StubTargetAddr - Section.getAddress() + 8,
+ ELF::R_AARCH64_MOVW_UABS_G1_NC, Value.Addend);
+ RelocationEntry REmovk_g0(SectionID,
+ StubTargetAddr - Section.getAddress() + 12,
+ ELF::R_AARCH64_MOVW_UABS_G0_NC, Value.Addend);
+
+ if (Value.SymbolName) {
+ addRelocationForSymbol(REmovz_g3, Value.SymbolName);
+ addRelocationForSymbol(REmovk_g2, Value.SymbolName);
+ addRelocationForSymbol(REmovk_g1, Value.SymbolName);
+ addRelocationForSymbol(REmovk_g0, Value.SymbolName);
+ } else {
+ addRelocationForSection(REmovz_g3, Value.SectionID);
+ addRelocationForSection(REmovk_g2, Value.SectionID);
+ addRelocationForSection(REmovk_g1, Value.SectionID);
+ addRelocationForSection(REmovk_g0, Value.SectionID);
+ }
+ resolveRelocation(Section, Offset,
+ reinterpret_cast<uint64_t>(Section.getAddressWithOffset(
+ Section.getStubOffset())),
+ RelType, 0);
+ Section.advanceStubOffset(getMaxStubSize());
+ }
+}
+
Expected<relocation_iterator>
RuntimeDyldELF::processRelocationRef(
unsigned SectionID, relocation_iterator RelI, const ObjectFile &O,
@@ -1035,55 +1110,22 @@ RuntimeDyldELF::processRelocationRef(
DEBUG(dbgs() << "\t\tSectionID: " << SectionID << " Offset: " << Offset
<< "\n");
- if ((Arch == Triple::aarch64 || Arch == Triple::aarch64_be) &&
- (RelType == ELF::R_AARCH64_CALL26 || RelType == ELF::R_AARCH64_JUMP26)) {
- // This is an AArch64 branch relocation, need to use a stub function.
- DEBUG(dbgs() << "\t\tThis is an AArch64 branch relocation.");
- SectionEntry &Section = Sections[SectionID];
-
- // Look for an existing stub.
- StubMap::const_iterator i = Stubs.find(Value);
- if (i != Stubs.end()) {
- resolveRelocation(Section, Offset,
- (uint64_t)Section.getAddressWithOffset(i->second),
- RelType, 0);
- DEBUG(dbgs() << " Stub function found\n");
- } else if (!resolveAArch64ShortBranch(SectionID, RelI, Value)) {
- // Create a new stub function.
- DEBUG(dbgs() << " Create a new stub function\n");
- Stubs[Value] = Section.getStubOffset();
- uint8_t *StubTargetAddr = createStubFunction(
- Section.getAddressWithOffset(Section.getStubOffset()));
-
- RelocationEntry REmovz_g3(SectionID,
- StubTargetAddr - Section.getAddress(),
- ELF::R_AARCH64_MOVW_UABS_G3, Value.Addend);
- RelocationEntry REmovk_g2(SectionID, StubTargetAddr -
- Section.getAddress() + 4,
- ELF::R_AARCH64_MOVW_UABS_G2_NC, Value.Addend);
- RelocationEntry REmovk_g1(SectionID, StubTargetAddr -
- Section.getAddress() + 8,
- ELF::R_AARCH64_MOVW_UABS_G1_NC, Value.Addend);
- RelocationEntry REmovk_g0(SectionID, StubTargetAddr -
- Section.getAddress() + 12,
- ELF::R_AARCH64_MOVW_UABS_G0_NC, Value.Addend);
-
- if (Value.SymbolName) {
- addRelocationForSymbol(REmovz_g3, Value.SymbolName);
- addRelocationForSymbol(REmovk_g2, Value.SymbolName);
- addRelocationForSymbol(REmovk_g1, Value.SymbolName);
- addRelocationForSymbol(REmovk_g0, Value.SymbolName);
- } else {
- addRelocationForSection(REmovz_g3, Value.SectionID);
- addRelocationForSection(REmovk_g2, Value.SectionID);
- addRelocationForSection(REmovk_g1, Value.SectionID);
- addRelocationForSection(REmovk_g0, Value.SectionID);
- }
- resolveRelocation(Section, Offset,
- reinterpret_cast<uint64_t>(Section.getAddressWithOffset(
- Section.getStubOffset())),
- RelType, 0);
- Section.advanceStubOffset(getMaxStubSize());
+  if (Arch == Triple::aarch64 || Arch == Triple::aarch64_be) {
+ if (RelType == ELF::R_AARCH64_CALL26 || RelType == ELF::R_AARCH64_JUMP26) {
+ resolveAArch64Branch(SectionID, Value, RelI, Stubs);
+ } else if (RelType == ELF::R_AARCH64_ADR_GOT_PAGE) {
+      // Create a new GOT entry or find an existing one.  If a GOT entry has
+      // to be created, we also emit an ABS64 relocation for it.
+ uint64_t GOTOffset = findOrAllocGOTEntry(Value, ELF::R_AARCH64_ABS64);
+ resolveGOTOffsetRelocation(SectionID, Offset, GOTOffset + Addend,
+ ELF::R_AARCH64_ADR_PREL_PG_HI21);
+ } else if (RelType == ELF::R_AARCH64_LD64_GOT_LO12_NC) {
+ uint64_t GOTOffset = findOrAllocGOTEntry(Value, ELF::R_AARCH64_ABS64);
+ resolveGOTOffsetRelocation(SectionID, Offset, GOTOffset + Addend,
+ ELF::R_AARCH64_LDST64_ABS_LO12_NC);
+ } else {
+ processSimpleRelocation(SectionID, Offset, RelType, Value);
}
} else if (Arch == Triple::arm) {
if (RelType == ELF::R_ARM_PC24 || RelType == ELF::R_ARM_CALL ||
@@ -1232,7 +1274,7 @@ RuntimeDyldELF::processRelocationRef(
if (i != GOTSymbolOffsets.end())
RE.SymOffset = i->second;
else {
- RE.SymOffset = allocateGOTEntries(SectionID, 1);
+ RE.SymOffset = allocateGOTEntries(1);
GOTSymbolOffsets[TargetName] = RE.SymOffset;
}
}
@@ -1489,14 +1531,15 @@ RuntimeDyldELF::processRelocationRef(
Section.advanceStubOffset(getMaxStubSize());
// Allocate a GOT Entry
- uint64_t GOTOffset = allocateGOTEntries(SectionID, 1);
+ uint64_t GOTOffset = allocateGOTEntries(1);
// The load of the GOT address has an addend of -4
- resolveGOTOffsetRelocation(SectionID, StubOffset + 2, GOTOffset - 4);
+ resolveGOTOffsetRelocation(SectionID, StubOffset + 2, GOTOffset - 4,
+ ELF::R_X86_64_PC32);
// Fill in the value of the symbol we're targeting into the GOT
addRelocationForSymbol(
- computeGOTOffsetRE(SectionID, GOTOffset, 0, ELF::R_X86_64_64),
+ computeGOTOffsetRE(GOTOffset, 0, ELF::R_X86_64_64),
Value.SymbolName);
}
@@ -1511,11 +1554,13 @@ RuntimeDyldELF::processRelocationRef(
} else if (RelType == ELF::R_X86_64_GOTPCREL ||
RelType == ELF::R_X86_64_GOTPCRELX ||
RelType == ELF::R_X86_64_REX_GOTPCRELX) {
- uint64_t GOTOffset = allocateGOTEntries(SectionID, 1);
- resolveGOTOffsetRelocation(SectionID, Offset, GOTOffset + Addend);
+ uint64_t GOTOffset = allocateGOTEntries(1);
+ resolveGOTOffsetRelocation(SectionID, Offset, GOTOffset + Addend,
+ ELF::R_X86_64_PC32);
// Fill in the value of the symbol we're targeting into the GOT
- RelocationEntry RE = computeGOTOffsetRE(SectionID, GOTOffset, Value.Offset, ELF::R_X86_64_64);
+ RelocationEntry RE =
+ computeGOTOffsetRE(GOTOffset, Value.Offset, ELF::R_X86_64_64);
if (Value.SymbolName)
addRelocationForSymbol(RE, Value.SymbolName);
else
@@ -1573,9 +1618,7 @@ size_t RuntimeDyldELF::getGOTEntrySize() {
return Result;
}
-uint64_t RuntimeDyldELF::allocateGOTEntries(unsigned SectionID, unsigned no)
-{
- (void)SectionID; // The GOT Section is the same for all section in the object file
+uint64_t RuntimeDyldELF::allocateGOTEntries(unsigned no) {
if (GOTSectionID == 0) {
GOTSectionID = Sections.size();
// Reserve a section id. We'll allocate the section later
@@ -1587,17 +1630,38 @@ uint64_t RuntimeDyldELF::allocateGOTEntries(unsigned SectionID, unsigned no)
return StartOffset;
}
-void RuntimeDyldELF::resolveGOTOffsetRelocation(unsigned SectionID, uint64_t Offset, uint64_t GOTOffset)
-{
+uint64_t RuntimeDyldELF::findOrAllocGOTEntry(const RelocationValueRef &Value,
+ unsigned GOTRelType) {
+ auto E = GOTOffsetMap.insert({Value, 0});
+ if (E.second) {
+ uint64_t GOTOffset = allocateGOTEntries(1);
+
+ // Create relocation for newly created GOT entry
+ RelocationEntry RE =
+ computeGOTOffsetRE(GOTOffset, Value.Offset, GOTRelType);
+ if (Value.SymbolName)
+ addRelocationForSymbol(RE, Value.SymbolName);
+ else
+ addRelocationForSection(RE, Value.SectionID);
+
+ E.first->second = GOTOffset;
+ }
+
+ return E.first->second;
+}
+
+void RuntimeDyldELF::resolveGOTOffsetRelocation(unsigned SectionID,
+ uint64_t Offset,
+ uint64_t GOTOffset,
+ uint32_t Type) {
// Fill in the relative address of the GOT Entry into the stub
- RelocationEntry GOTRE(SectionID, Offset, ELF::R_X86_64_PC32, GOTOffset);
+ RelocationEntry GOTRE(SectionID, Offset, Type, GOTOffset);
addRelocationForSection(GOTRE, GOTSectionID);
}
-RelocationEntry RuntimeDyldELF::computeGOTOffsetRE(unsigned SectionID, uint64_t GOTOffset, uint64_t SymbolOffset,
- uint32_t Type)
-{
- (void)SectionID; // The GOT Section is the same for all section in the object file
+RelocationEntry RuntimeDyldELF::computeGOTOffsetRE(uint64_t GOTOffset,
+ uint64_t SymbolOffset,
+ uint32_t Type) {
return RelocationEntry(GOTSectionID, GOTOffset, Type, SymbolOffset);
}
@@ -1663,6 +1727,19 @@ bool RuntimeDyldELF::isCompatibleFile(const object::ObjectFile &Obj) const {
return Obj.isELF();
}
+bool RuntimeDyldELF::relocationNeedsGot(const RelocationRef &R) const {
+ unsigned RelTy = R.getType();
+ if (Arch == Triple::aarch64 || Arch == Triple::aarch64_be)
+ return RelTy == ELF::R_AARCH64_ADR_GOT_PAGE ||
+ RelTy == ELF::R_AARCH64_LD64_GOT_LO12_NC;
+
+ if (Arch == Triple::x86_64)
+ return RelTy == ELF::R_X86_64_GOTPCREL ||
+ RelTy == ELF::R_X86_64_GOTPCRELX ||
+ RelTy == ELF::R_X86_64_REX_GOTPCRELX;
+ return false;
+}
+
bool RuntimeDyldELF::relocationNeedsStub(const RelocationRef &R) const {
if (Arch != Triple::x86_64)
return true; // Conservative answer
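
findOrAllocGOTEntry above leans on the map::insert probe: insert returns an
{iterator, inserted} pair, and the bool is true only for the first request, so
each distinct relocation value is given exactly one GOT slot. The idiom in
isolation (toy int key standing in for RelocationValueRef, 8-byte entries
assumed):

#include <cstdint>
#include <map>

static uint64_t NextGOTOffset = 0;

static uint64_t findOrAlloc(std::map<int, uint64_t> &GOTOffsetMap, int Key) {
  auto E = GOTOffsetMap.insert({Key, 0});
  if (E.second) {            // newly inserted -> allocate a fresh entry
    E.first->second = NextGOTOffset;
    NextGOTOffset += 8;      // one GOT entry
  }
  return E.first->second;
}

int main() {
  std::map<int, uint64_t> M;
  // Two relocations against the same value share one slot.
  return (findOrAlloc(M, 7) == findOrAlloc(M, 7)) ? 0 : 1;
}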
diff --git a/lib/ExecutionEngine/RuntimeDyld/RuntimeDyldELF.h b/lib/ExecutionEngine/RuntimeDyld/RuntimeDyldELF.h
index d1867d091fe2..498979705b77 100644
--- a/lib/ExecutionEngine/RuntimeDyld/RuntimeDyldELF.h
+++ b/lib/ExecutionEngine/RuntimeDyld/RuntimeDyldELF.h
@@ -43,6 +43,9 @@ class RuntimeDyldELF : public RuntimeDyldImpl {
bool resolveAArch64ShortBranch(unsigned SectionID, relocation_iterator RelI,
const RelocationValueRef &Value);
+ void resolveAArch64Branch(unsigned SectionID, const RelocationValueRef &Value,
+ relocation_iterator RelI, StubMap &Stubs);
+
void resolveARMRelocation(const SectionEntry &Section, uint64_t Offset,
uint32_t Value, uint32_t Type, int32_t Addend);
@@ -88,24 +91,26 @@ class RuntimeDyldELF : public RuntimeDyldImpl {
ObjSectionToIDMap &LocalSections,
RelocationValueRef &Rel);
protected:
- size_t getGOTEntrySize();
+ size_t getGOTEntrySize() override;
private:
SectionEntry &getSection(unsigned SectionID) { return Sections[SectionID]; }
  // Allocate 'no' GOT entries; the GOT is shared by all sections in the object.
- uint64_t allocateGOTEntries(unsigned SectionID, unsigned no);
+ uint64_t allocateGOTEntries(unsigned no);
+
+  // Find the GOT entry corresponding to the relocation, or create a new one.
+ uint64_t findOrAllocGOTEntry(const RelocationValueRef &Value,
+ unsigned GOTRelType);
  // Resolve the relative address of GOTOffset in Section ID and place
// it at the given Offset
void resolveGOTOffsetRelocation(unsigned SectionID, uint64_t Offset,
- uint64_t GOTOffset);
+ uint64_t GOTOffset, uint32_t Type);
// For a GOT entry referenced from SectionID, compute a relocation entry
// that will place the final resolved value in the GOT slot
- RelocationEntry computeGOTOffsetRE(unsigned SectionID,
- uint64_t GOTOffset,
- uint64_t SymbolOffset,
+ RelocationEntry computeGOTOffsetRE(uint64_t GOTOffset, uint64_t SymbolOffset,
unsigned Type);
// Compute the address in memory where we can find the placeholder
@@ -146,6 +151,10 @@ private:
SmallVector<SID, 2> UnregisteredEHFrameSections;
SmallVector<SID, 2> RegisteredEHFrameSections;
+ // Map between GOT relocation value and corresponding GOT offset
+ std::map<RelocationValueRef, uint64_t> GOTOffsetMap;
+
+ bool relocationNeedsGot(const RelocationRef &R) const override;
bool relocationNeedsStub(const RelocationRef &R) const override;
public:
diff --git a/lib/ExecutionEngine/RuntimeDyld/RuntimeDyldImpl.h b/lib/ExecutionEngine/RuntimeDyld/RuntimeDyldImpl.h
index 279d0de2da76..f5cc883d98fd 100644
--- a/lib/ExecutionEngine/RuntimeDyld/RuntimeDyldImpl.h
+++ b/lib/ExecutionEngine/RuntimeDyld/RuntimeDyldImpl.h
@@ -213,7 +213,7 @@ public:
}
};
-/// @brief Symbol info for RuntimeDyld.
+/// @brief Symbol info for RuntimeDyld.
class SymbolTableEntry {
public:
SymbolTableEntry()
@@ -426,6 +426,9 @@ protected:
uint64_t &RODataSize, uint32_t &RODataAlign,
uint64_t &RWDataSize, uint32_t &RWDataAlign);
+ // \brief Compute GOT size
+ unsigned computeGOTSize(const ObjectFile &Obj);
+
// \brief Compute the stub buffer size required for a section
unsigned computeSectionStubBufSize(const ObjectFile &Obj,
const SectionRef &Section);
@@ -433,6 +436,14 @@ protected:
// \brief Implementation of the generic part of the loadObject algorithm.
Expected<ObjSectionToIDMap> loadObjectImpl(const object::ObjectFile &Obj);
+  // \brief Return the size of a Global Offset Table (GOT) entry
+ virtual size_t getGOTEntrySize() { return 0; }
+
+ // \brief Return true if the relocation R may require allocating a GOT entry.
+ virtual bool relocationNeedsGot(const RelocationRef &R) const {
+ return false;
+ }
+
// \brief Return true if the relocation R may require allocating a stub.
virtual bool relocationNeedsStub(const RelocationRef &R) const {
return true; // Conservative answer
diff --git a/lib/Fuzzer/CMakeLists.txt b/lib/Fuzzer/CMakeLists.txt
index 70bd017bae6b..59cef04cdece 100644
--- a/lib/Fuzzer/CMakeLists.txt
+++ b/lib/Fuzzer/CMakeLists.txt
@@ -12,8 +12,9 @@ if( LLVM_USE_SANITIZE_COVERAGE )
FuzzerCrossOver.cpp
FuzzerDriver.cpp
FuzzerExtFunctionsDlsym.cpp
+ FuzzerExtFunctionsDlsymWin.cpp
FuzzerExtFunctionsWeak.cpp
- FuzzerExtFunctionsWeakAlias.cpp
+ FuzzerExtraCounters.cpp
FuzzerIO.cpp
FuzzerIOPosix.cpp
FuzzerIOWindows.cpp
@@ -21,6 +22,8 @@ if( LLVM_USE_SANITIZE_COVERAGE )
FuzzerMerge.cpp
FuzzerMutate.cpp
FuzzerSHA1.cpp
+ FuzzerShmemPosix.cpp
+ FuzzerShmemWindows.cpp
FuzzerTracePC.cpp
FuzzerTraceState.cpp
FuzzerUtil.cpp
@@ -32,12 +35,12 @@ if( LLVM_USE_SANITIZE_COVERAGE )
add_library(LLVMFuzzerNoMain STATIC
$<TARGET_OBJECTS:LLVMFuzzerNoMainObjects>
)
- target_link_libraries(LLVMFuzzerNoMain ${PTHREAD_LIB})
+ target_link_libraries(LLVMFuzzerNoMain ${LLVM_PTHREAD_LIB})
add_library(LLVMFuzzer STATIC
FuzzerMain.cpp
$<TARGET_OBJECTS:LLVMFuzzerNoMainObjects>
)
- target_link_libraries(LLVMFuzzer ${PTHREAD_LIB})
+ target_link_libraries(LLVMFuzzer ${LLVM_PTHREAD_LIB})
if( LLVM_INCLUDE_TESTS )
add_subdirectory(test)
diff --git a/lib/Fuzzer/FuzzerCorpus.h b/lib/Fuzzer/FuzzerCorpus.h
index 468d5e5ddc70..0f0573994a03 100644
--- a/lib/Fuzzer/FuzzerCorpus.h
+++ b/lib/Fuzzer/FuzzerCorpus.h
@@ -37,8 +37,8 @@ struct InputInfo {
};
class InputCorpus {
+ static const size_t kFeatureSetSize = 1 << 21;
public:
- static const size_t kFeatureSetSize = 1 << 16;
InputCorpus(const std::string &OutputCorpus) : OutputCorpus(OutputCorpus) {
memset(InputSizesPerFeature, 0, sizeof(InputSizesPerFeature));
memset(SmallestElementPerFeature, 0, sizeof(SmallestElementPerFeature));
@@ -68,7 +68,8 @@ class InputCorpus {
}
bool empty() const { return Inputs.empty(); }
const Unit &operator[] (size_t Idx) const { return Inputs[Idx]->U; }
- void AddToCorpus(const Unit &U, size_t NumFeatures, bool MayDeleteFile = false) {
+ void AddToCorpus(const Unit &U, size_t NumFeatures,
+ bool MayDeleteFile = false) {
assert(!U.empty());
uint8_t Hash[kSHA1NumBytes];
if (FeatureDebug)
@@ -82,7 +83,7 @@ class InputCorpus {
II.MayDeleteFile = MayDeleteFile;
memcpy(II.Sha1, Hash, kSHA1NumBytes);
UpdateCorpusDistribution();
- ValidateFeatureSet();
+ // ValidateFeatureSet();
}
bool HasUnit(const Unit &U) { return Hashes.count(Hash(U)); }
@@ -97,7 +98,7 @@ class InputCorpus {
// Hypothesis: units added to the corpus last are more likely to be
// interesting. This function gives more weight to the more recent units.
size_t ChooseUnitIdxToMutate(Random &Rand) {
- size_t Idx = static_cast<size_t>(CorpusDistribution(Rand.Get_mt19937()));
+ size_t Idx = static_cast<size_t>(CorpusDistribution(Rand));
assert(Idx < Inputs.size());
return Idx;
}
@@ -132,7 +133,7 @@ class InputCorpus {
Printf("EVICTED %zd\n", Idx);
}
- bool AddFeature(size_t Idx, uint32_t NewSize, bool Shrink) {
+ void AddFeature(size_t Idx, uint32_t NewSize, bool Shrink) {
assert(NewSize);
Idx = Idx % kFeatureSetSize;
uint32_t OldSize = GetFeature(Idx);
@@ -144,23 +145,20 @@ class InputCorpus {
II.NumFeatures--;
if (II.NumFeatures == 0)
DeleteInput(OldIdx);
+ } else {
+ NumAddedFeatures++;
}
+ NumUpdatedFeatures++;
if (FeatureDebug)
Printf("ADD FEATURE %zd sz %d\n", Idx, NewSize);
SmallestElementPerFeature[Idx] = Inputs.size();
InputSizesPerFeature[Idx] = NewSize;
CountingFeatures = true;
- return true;
}
- return false;
}
- size_t NumFeatures() const {
- size_t Res = 0;
- for (size_t i = 0; i < kFeatureSetSize; i++)
- Res += GetFeature(i) != 0;
- return Res;
- }
+ size_t NumFeatures() const { return NumAddedFeatures; }
+ size_t NumFeatureUpdates() const { return NumUpdatedFeatures; }
void ResetFeatureSet() {
assert(Inputs.empty());
@@ -213,6 +211,8 @@ private:
std::vector<InputInfo*> Inputs;
bool CountingFeatures = false;
+ size_t NumAddedFeatures = 0;
+ size_t NumUpdatedFeatures = 0;
uint32_t InputSizesPerFeature[kFeatureSetSize];
uint32_t SmallestElementPerFeature[kFeatureSetSize];
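
Sizing note on the kFeatureSetSize bump above: at 1 << 21 entries, each of the
two uint32_t tables (InputSizesPerFeature and SmallestElementPerFeature) grows
to 2^21 * 4 bytes = 8 MiB, i.e. 16 MiB per corpus, up from 512 KiB total at
the old 1 << 16.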
diff --git a/lib/Fuzzer/FuzzerDefs.h b/lib/Fuzzer/FuzzerDefs.h
index 0f5b8a7cf211..bd1827508002 100644
--- a/lib/Fuzzer/FuzzerDefs.h
+++ b/lib/Fuzzer/FuzzerDefs.h
@@ -47,8 +47,30 @@
#ifdef __clang__ // avoid gcc warning.
# define ATTRIBUTE_NO_SANITIZE_MEMORY __attribute__((no_sanitize("memory")))
+# define ALWAYS_INLINE __attribute__((always_inline))
#else
# define ATTRIBUTE_NO_SANITIZE_MEMORY
+# define ALWAYS_INLINE
+#endif // __clang__
+
+#define ATTRIBUTE_NO_SANITIZE_ADDRESS __attribute__((no_sanitize_address))
+
+#if defined(__has_feature)
+# if __has_feature(address_sanitizer)
+# define ATTRIBUTE_NO_SANITIZE_ALL ATTRIBUTE_NO_SANITIZE_ADDRESS
+# elif __has_feature(memory_sanitizer)
+# define ATTRIBUTE_NO_SANITIZE_ALL ATTRIBUTE_NO_SANITIZE_MEMORY
+# else
+# define ATTRIBUTE_NO_SANITIZE_ALL
+# endif
+#else
+# define ATTRIBUTE_NO_SANITIZE_ALL
+#endif
+
+#if LIBFUZZER_WINDOWS
+#define ATTRIBUTE_INTERFACE __declspec(dllexport)
+#else
+#define ATTRIBUTE_INTERFACE __attribute__((visibility("default")))
#endif
namespace fuzzer {
@@ -74,9 +96,10 @@ typedef int (*UserCallback)(const uint8_t *Data, size_t Size);
int FuzzerDriver(int *argc, char ***argv, UserCallback Callback);
-struct ScopedDoingMyOwnMemmem {
- ScopedDoingMyOwnMemmem();
- ~ScopedDoingMyOwnMemmem();
+struct ScopedDoingMyOwnMemOrStr {
+ ScopedDoingMyOwnMemOrStr() { DoingMyOwnMemOrStr++; }
+ ~ScopedDoingMyOwnMemOrStr() { DoingMyOwnMemOrStr--; }
+ static int DoingMyOwnMemOrStr;
};
inline uint8_t Bswap(uint8_t x) { return x; }
@@ -84,6 +107,10 @@ inline uint16_t Bswap(uint16_t x) { return __builtin_bswap16(x); }
inline uint32_t Bswap(uint32_t x) { return __builtin_bswap32(x); }
inline uint64_t Bswap(uint64_t x) { return __builtin_bswap64(x); }
+uint8_t *ExtraCountersBegin();
+uint8_t *ExtraCountersEnd();
+void ClearExtraCounters();
+
} // namespace fuzzer
#endif // LLVM_FUZZER_DEFS_H
diff --git a/lib/Fuzzer/FuzzerDictionary.h b/lib/Fuzzer/FuzzerDictionary.h
index eba0eabb6838..84cee87b8971 100644
--- a/lib/Fuzzer/FuzzerDictionary.h
+++ b/lib/Fuzzer/FuzzerDictionary.h
@@ -20,8 +20,9 @@
namespace fuzzer {
// A simple POD sized array of bytes.
-template <size_t kMaxSize> class FixedWord {
+template <size_t kMaxSizeT> class FixedWord {
public:
+ static const size_t kMaxSize = kMaxSizeT;
FixedWord() {}
FixedWord(const uint8_t *B, uint8_t S) { Set(B, S); }
@@ -32,10 +33,12 @@ public:
}
bool operator==(const FixedWord<kMaxSize> &w) const {
+ ScopedDoingMyOwnMemOrStr scoped_doing_my_own_mem_or_str;
return Size == w.Size && 0 == memcmp(Data, w.Data, Size);
}
bool operator<(const FixedWord<kMaxSize> &w) const {
+ ScopedDoingMyOwnMemOrStr scoped_doing_my_own_mem_or_str;
if (Size != w.Size)
return Size < w.Size;
return memcmp(Data, w.Data, Size) < 0;
@@ -50,7 +53,7 @@ private:
uint8_t Data[kMaxSize];
};
-typedef FixedWord<27> Word; // 28 bytes.
+typedef FixedWord<64> Word;
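// Sketch, assuming the layout above (one uint8_t size field plus a
// kMaxSize-byte payload): sizeof(Word) grows from 28 to 65 bytes, and the
// new kMaxSizeT/kMaxSize indirection lets callers read the capacity back:
//   static_assert(Word::kMaxSize == 64, "dictionary words up to 64 bytes");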
class DictionaryEntry {
public:
diff --git a/lib/Fuzzer/FuzzerDriver.cpp b/lib/Fuzzer/FuzzerDriver.cpp
index 2bbcb25275e4..0fb83ca64de6 100644
--- a/lib/Fuzzer/FuzzerDriver.cpp
+++ b/lib/Fuzzer/FuzzerDriver.cpp
@@ -15,6 +15,7 @@
#include "FuzzerIO.h"
#include "FuzzerMutate.h"
#include "FuzzerRandom.h"
+#include "FuzzerShmem.h"
#include "FuzzerTracePC.h"
#include <algorithm>
#include <atomic>
@@ -277,7 +278,19 @@ static bool AllInputsAreFiles() {
return true;
}
-int MinimizeCrashInput(const std::vector<std::string> &Args) {
+static std::string GetDedupTokenFromFile(const std::string &Path) {
+ auto S = FileToString(Path);
+ auto Beg = S.find("DEDUP_TOKEN:");
+ if (Beg == std::string::npos)
+ return "";
+ auto End = S.find('\n', Beg);
+ if (End == std::string::npos)
+ return "";
+ return S.substr(Beg, End - Beg);
+}
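// Worked example (log content assumed): if the child's log contains the line
//   DEDUP_TOKEN: heap-use-after-free in Foo::Bar
// the helper returns exactly "DEDUP_TOKEN: heap-use-after-free in Foo::Bar"
// (prefix kept, trailing newline dropped); an empty result means no token,
// and the minimization loop below only continues while both tokens match.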
+
+int MinimizeCrashInput(const std::vector<std::string> &Args,
+ const FuzzingOptions &Options) {
if (Inputs->size() != 1) {
Printf("ERROR: -minimize_crash should be given one input file\n");
exit(1);
@@ -294,19 +307,18 @@ int MinimizeCrashInput(const std::vector<std::string> &Args) {
"INFO: defaulting to -max_total_time=600\n");
BaseCmd += " -max_total_time=600";
}
- // BaseCmd += " > /dev/null 2>&1 ";
+
+ auto LogFilePath = DirPlusFile(
+ TmpDir(), "libFuzzerTemp." + std::to_string(GetPid()) + ".txt");
+ auto LogFileRedirect = " > " + LogFilePath + " 2>&1 ";
std::string CurrentFilePath = InputFilePath;
while (true) {
Unit U = FileToVector(CurrentFilePath);
- if (U.size() < 2) {
- Printf("CRASH_MIN: '%s' is small enough\n", CurrentFilePath.c_str());
- return 0;
- }
Printf("CRASH_MIN: minimizing crash input: '%s' (%zd bytes)\n",
CurrentFilePath.c_str(), U.size());
- auto Cmd = BaseCmd + " " + CurrentFilePath;
+ auto Cmd = BaseCmd + " " + CurrentFilePath + LogFileRedirect;
Printf("CRASH_MIN: executing: %s\n", Cmd.c_str());
int ExitCode = ExecuteCommand(Cmd);
@@ -317,12 +329,19 @@ int MinimizeCrashInput(const std::vector<std::string> &Args) {
Printf("CRASH_MIN: '%s' (%zd bytes) caused a crash. Will try to minimize "
"it further\n",
CurrentFilePath.c_str(), U.size());
-
- std::string ArtifactPath = "minimized-from-" + Hash(U);
+ auto DedupToken1 = GetDedupTokenFromFile(LogFilePath);
+ if (!DedupToken1.empty())
+ Printf("CRASH_MIN: DedupToken1: %s\n", DedupToken1.c_str());
+
+ std::string ArtifactPath =
+ Flags.exact_artifact_path
+ ? Flags.exact_artifact_path
+ : Options.ArtifactPrefix + "minimized-from-" + Hash(U);
Cmd += " -minimize_crash_internal_step=1 -exact_artifact_path=" +
ArtifactPath;
Printf("CRASH_MIN: executing: %s\n", Cmd.c_str());
ExitCode = ExecuteCommand(Cmd);
+ CopyFileToErr(LogFilePath);
if (ExitCode == 0) {
if (Flags.exact_artifact_path) {
CurrentFilePath = Flags.exact_artifact_path;
@@ -330,11 +349,26 @@ int MinimizeCrashInput(const std::vector<std::string> &Args) {
}
Printf("CRASH_MIN: failed to minimize beyond %s (%d bytes), exiting\n",
CurrentFilePath.c_str(), U.size());
- return 0;
+ break;
}
+ auto DedupToken2 = GetDedupTokenFromFile(LogFilePath);
+ if (!DedupToken2.empty())
+ Printf("CRASH_MIN: DedupToken2: %s\n", DedupToken2.c_str());
+
+ if (DedupToken1 != DedupToken2) {
+ if (Flags.exact_artifact_path) {
+ CurrentFilePath = Flags.exact_artifact_path;
+ WriteToFile(U, CurrentFilePath);
+ }
+ Printf("CRASH_MIN: mismatch in dedup tokens"
+ " (looks like a different bug). Won't minimize further\n");
+ break;
+ }
+
CurrentFilePath = ArtifactPath;
- Printf("\n\n\n\n\n\n*********************************\n");
+ Printf("*********************************\n");
}
+ RemoveFile(LogFilePath);
return 0;
}
@@ -342,8 +376,11 @@ int MinimizeCrashInputInternalStep(Fuzzer *F, InputCorpus *Corpus) {
assert(Inputs->size() == 1);
std::string InputFilePath = Inputs->at(0);
Unit U = FileToVector(InputFilePath);
- assert(U.size() > 2);
Printf("INFO: Starting MinimizeCrashInputInternalStep: %zd\n", U.size());
+ if (U.size() < 2) {
+ Printf("INFO: The input is small enough, exiting\n");
+ exit(0);
+ }
Corpus->AddToCorpus(U, 0);
F->SetMaxInputLen(U.size());
F->SetMaxMutationLen(U.size() - 1);
@@ -353,24 +390,94 @@ int MinimizeCrashInputInternalStep(Fuzzer *F, InputCorpus *Corpus) {
return 0;
}
+int AnalyzeDictionary(Fuzzer *F, const std::vector<Unit>& Dict,
+ UnitVector& Corpus) {
+ Printf("Started dictionary minimization (up to %d tests)\n",
+ Dict.size() * Corpus.size() * 2);
+
+ // Scores and usage count for each dictionary unit.
+ std::vector<int> Scores(Dict.size());
+ std::vector<int> Usages(Dict.size());
+
+ std::vector<size_t> InitialFeatures;
+ std::vector<size_t> ModifiedFeatures;
+ for (auto &C : Corpus) {
+ // Get coverage for the testcase without modifications.
+ F->ExecuteCallback(C.data(), C.size());
+ InitialFeatures.clear();
+ TPC.CollectFeatures([&](size_t Feature) -> bool {
+ InitialFeatures.push_back(Feature);
+ return true;
+ });
+
+ for (size_t i = 0; i < Dict.size(); ++i) {
+ auto Data = C;
+ auto StartPos = std::search(Data.begin(), Data.end(),
+ Dict[i].begin(), Dict[i].end());
+ // Skip the dictionary unit if the testcase does not contain it.
+ if (StartPos == Data.end())
+ continue;
+
+ ++Usages[i];
+ while (StartPos != Data.end()) {
+ // Replace all occurrences of the dictionary unit in the testcase.
+ auto EndPos = StartPos + Dict[i].size();
+ for (auto It = StartPos; It != EndPos; ++It)
+ *It ^= 0xFF;
+
+ StartPos = std::search(EndPos, Data.end(),
+ Dict[i].begin(), Dict[i].end());
+ }
+
+ // Get coverage for testcase with masked occurrences of dictionary unit.
+ F->ExecuteCallback(Data.data(), Data.size());
+ ModifiedFeatures.clear();
+ TPC.CollectFeatures([&](size_t Feature) -> bool {
+ ModifiedFeatures.push_back(Feature);
+ return true;
+ });
+
+ if (InitialFeatures == ModifiedFeatures)
+ --Scores[i];
+ else
+ Scores[i] += 2;
+ }
+ }
+
+ Printf("###### Useless dictionary elements. ######\n");
+ for (size_t i = 0; i < Dict.size(); ++i) {
+ // Dictionary units with positive score are treated as useful ones.
+ if (Scores[i] > 0)
+ continue;
+
+ Printf("\"");
+ PrintASCII(Dict[i].data(), Dict[i].size(), "\"");
+ Printf(" # Score: %d, Used: %d\n", Scores[i], Usages[i]);
+ }
+ Printf("###### End of useless dictionary elements. ######\n");
+ return 0;
+}
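// Worked scoring example (values assumed): a dictionary unit found in 3
// corpus inputs whose masking changes coverage for 2 of them scores
// 2*2 + (-1) = 3 and is kept; a unit whose masking never changes coverage
// ends at -Usages and lands in the useless-elements list above.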
+
int FuzzerDriver(int *argc, char ***argv, UserCallback Callback) {
using namespace fuzzer;
assert(argc && argv && "Argument pointers cannot be nullptr");
+ std::string Argv0((*argv)[0]);
EF = new ExternalFunctions();
if (EF->LLVMFuzzerInitialize)
EF->LLVMFuzzerInitialize(argc, argv);
const std::vector<std::string> Args(*argv, *argv + *argc);
assert(!Args.empty());
ProgName = new std::string(Args[0]);
+ if (Argv0 != *ProgName) {
+ Printf("ERROR: argv[0] has been modified in LLVMFuzzerInitialize\n");
+ exit(1);
+ }
ParseFlags(Args);
if (Flags.help) {
PrintHelp();
return 0;
}
- if (Flags.minimize_crash)
- return MinimizeCrashInput(Args);
-
if (Flags.close_fd_mask & 2)
DupAndCloseStderr();
if (Flags.close_fd_mask & 1)
@@ -401,7 +508,6 @@ int FuzzerDriver(int *argc, char ***argv, UserCallback Callback) {
Options.MutateDepth = Flags.mutate_depth;
Options.UseCounters = Flags.use_counters;
Options.UseIndirCalls = Flags.use_indir_calls;
- Options.UseMemcmp = Flags.use_memcmp;
Options.UseMemmem = Flags.use_memmem;
Options.UseCmp = Flags.use_cmp;
Options.UseValueProfile = Flags.use_value_profile;
@@ -471,9 +577,37 @@ int FuzzerDriver(int *argc, char ***argv, UserCallback Callback) {
Options.HandleXfsz = Flags.handle_xfsz;
SetSignalHandler(Options);
+ if (Flags.minimize_crash)
+ return MinimizeCrashInput(Args, Options);
+
if (Flags.minimize_crash_internal_step)
return MinimizeCrashInputInternalStep(F, Corpus);
+ if (auto Name = Flags.run_equivalence_server) {
+ SMR.Destroy(Name);
+ if (!SMR.Create(Name)) {
+ Printf("ERROR: can't create shared memory region\n");
+ return 1;
+ }
+ Printf("INFO: EQUIVALENCE SERVER UP\n");
+ while (true) {
+ SMR.WaitClient();
+ size_t Size = SMR.ReadByteArraySize();
+ SMR.WriteByteArray(nullptr, 0);
+ F->RunOne(SMR.GetByteArray(), Size);
+ SMR.PostServer();
+ }
+ return 0;
+ }
+
+ if (auto Name = Flags.use_equivalence_server) {
+ if (!SMR.Open(Name)) {
+ Printf("ERROR: can't open shared memory region\n");
+ return 1;
+ }
+ Printf("INFO: EQUIVALENCE CLIENT UP\n");
+ }
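// Assumed invocation sketch for this experimental mode: build two fuzz
// targets for the same API, then run
//   ./fuzzer_v1 -run_equivalence_server=eq1 &
//   ./fuzzer_v2 -use_equivalence_server=eq1 corpus/
// The client publishes each input through SMR before executing it; the
// server replays it, and Fuzzer::AnnounceOutput compares the two outputs.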
+
if (DoPlainRun) {
Options.SaveArtifacts = false;
int Runs = std::max(1, Flags.runs);
@@ -499,14 +633,12 @@ int FuzzerDriver(int *argc, char ***argv, UserCallback Callback) {
if (Flags.merge) {
if (Options.MaxLen == 0)
F->SetMaxInputLen(kMaxSaneLen);
- if (TPC.UsingTracePcGuard()) {
- if (Flags.merge_control_file)
- F->CrashResistantMergeInternalStep(Flags.merge_control_file);
- else
- F->CrashResistantMerge(Args, *Inputs);
- } else {
- F->Merge(*Inputs);
- }
+ if (Flags.merge_control_file)
+ F->CrashResistantMergeInternalStep(Flags.merge_control_file);
+ else
+ F->CrashResistantMerge(Args, *Inputs,
+ Flags.load_coverage_summary,
+ Flags.save_coverage_summary);
exit(0);
}
@@ -519,6 +651,19 @@ int FuzzerDriver(int *argc, char ***argv, UserCallback Callback) {
TemporaryMaxLen, /*ExitOnError=*/false);
}
+ if (Flags.analyze_dict) {
+ if (Dictionary.empty() || Inputs->empty()) {
+ Printf("ERROR: can't analyze dict without dict and corpus provided\n");
+ return 1;
+ }
+ if (AnalyzeDictionary(F, Dictionary, InitialCorpus)) {
+ Printf("Dictionary analysis failed\n");
+ exit(1);
+ }
+ Printf("Dictionary analysis suceeded\n");
+ exit(0);
+ }
+
if (Options.MaxLen == 0) {
size_t MaxLen = 0;
for (auto &U : InitialCorpus)
@@ -536,7 +681,7 @@ int FuzzerDriver(int *argc, char ***argv, UserCallback Callback) {
F->Loop();
if (Flags.verbosity)
- Printf("Done %d runs in %zd second(s)\n", F->getTotalNumberOfRuns(),
+ Printf("Done %zd runs in %zd second(s)\n", F->getTotalNumberOfRuns(),
F->secondsSinceProcessStartUp());
F->PrintFinalStats();
diff --git a/lib/Fuzzer/FuzzerExtFunctions.def b/lib/Fuzzer/FuzzerExtFunctions.def
index 61c72e4a209e..3bc5302c31c6 100644
--- a/lib/Fuzzer/FuzzerExtFunctions.def
+++ b/lib/Fuzzer/FuzzerExtFunctions.def
@@ -29,22 +29,18 @@ EXT_FUNC(LLVMFuzzerCustomCrossOver, size_t,
EXT_FUNC(__lsan_enable, void, (), false);
EXT_FUNC(__lsan_disable, void, (), false);
EXT_FUNC(__lsan_do_recoverable_leak_check, int, (), false);
-EXT_FUNC(__sanitizer_get_number_of_counters, size_t, (), false);
EXT_FUNC(__sanitizer_install_malloc_and_free_hooks, int,
(void (*malloc_hook)(const volatile void *, size_t),
void (*free_hook)(const volatile void *)),
false);
-EXT_FUNC(__sanitizer_get_total_unique_caller_callee_pairs, size_t, (), false);
-EXT_FUNC(__sanitizer_get_total_unique_coverage, size_t, (), true);
-EXT_FUNC(__sanitizer_print_memory_profile, int, (size_t), false);
+EXT_FUNC(__sanitizer_print_memory_profile, int, (size_t, size_t), false);
EXT_FUNC(__sanitizer_print_stack_trace, void, (), true);
EXT_FUNC(__sanitizer_symbolize_pc, void,
(void *, const char *fmt, char *out_buf, size_t out_buf_size), false);
EXT_FUNC(__sanitizer_get_module_and_offset_for_pc, int,
(void *pc, char *module_path,
size_t module_path_len,void **pc_offset), false);
-EXT_FUNC(__sanitizer_reset_coverage, void, (), true);
EXT_FUNC(__sanitizer_set_death_callback, void, (void (*)(void)), true);
EXT_FUNC(__sanitizer_set_report_fd, void, (void*), false);
-EXT_FUNC(__sanitizer_update_counter_bitset_and_clear_counters, uintptr_t,
- (uint8_t*), false);
+EXT_FUNC(__sanitizer_dump_coverage, void, (const uintptr_t *, uintptr_t),
+ false);
diff --git a/lib/Fuzzer/FuzzerExtFunctionsDlsymWin.cpp b/lib/Fuzzer/FuzzerExtFunctionsDlsymWin.cpp
new file mode 100644
index 000000000000..77521698c80a
--- /dev/null
+++ b/lib/Fuzzer/FuzzerExtFunctionsDlsymWin.cpp
@@ -0,0 +1,60 @@
+//===- FuzzerExtFunctionsDlsymWin.cpp - Interface to external functions ---===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+// Implementation using dynamic loading for Windows.
+//===----------------------------------------------------------------------===//
+#include "FuzzerDefs.h"
+#if LIBFUZZER_WINDOWS
+
+#include "FuzzerExtFunctions.h"
+#include "FuzzerIO.h"
+#include "Windows.h"
+#include "Psapi.h"
+
+namespace fuzzer {
+
+ExternalFunctions::ExternalFunctions() {
+ HMODULE Modules[1024];
+ DWORD BytesNeeded;
+ HANDLE CurrentProcess = GetCurrentProcess();
+
+ if (!EnumProcessModules(CurrentProcess, Modules, sizeof(Modules),
+ &BytesNeeded)) {
+ Printf("EnumProcessModules failed (error: %d).\n", GetLastError());
+ exit(1);
+ }
+
+ if (sizeof(Modules) < BytesNeeded) {
+ Printf("Error: the array is not big enough to hold all loaded modules.\n");
+ exit(1);
+ }
+
+ for (size_t i = 0; i < (BytesNeeded / sizeof(HMODULE)); i++)
+ {
+ FARPROC Fn;
+#define EXT_FUNC(NAME, RETURN_TYPE, FUNC_SIG, WARN) \
+ if (this->NAME == nullptr) { \
+ Fn = GetProcAddress(Modules[i], #NAME); \
+ if (Fn == nullptr) \
+ Fn = GetProcAddress(Modules[i], #NAME "__dll"); \
+ this->NAME = (decltype(ExternalFunctions::NAME)) Fn; \
+ }
+#include "FuzzerExtFunctions.def"
+#undef EXT_FUNC
+ }
+
+#define EXT_FUNC(NAME, RETURN_TYPE, FUNC_SIG, WARN) \
+ if (this->NAME == nullptr && WARN) \
+ Printf("WARNING: Failed to find function \"%s\".\n", #NAME);
+#include "FuzzerExtFunctions.def"
+#undef EXT_FUNC
+}
+
+} // namespace fuzzer
+
+#endif // LIBFUZZER_WINDOWS
diff --git a/lib/Fuzzer/FuzzerExtraCounters.cpp b/lib/Fuzzer/FuzzerExtraCounters.cpp
new file mode 100644
index 000000000000..07dbe0fdee76
--- /dev/null
+++ b/lib/Fuzzer/FuzzerExtraCounters.cpp
@@ -0,0 +1,41 @@
+//===- FuzzerExtraCounters.cpp - Extra coverage counters ------------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+// Extra coverage counters defined by user code.
+//===----------------------------------------------------------------------===//
+
+#include "FuzzerDefs.h"
+
+#if LIBFUZZER_LINUX
+__attribute__((weak)) extern uint8_t __start___libfuzzer_extra_counters;
+__attribute__((weak)) extern uint8_t __stop___libfuzzer_extra_counters;
+
+namespace fuzzer {
+uint8_t *ExtraCountersBegin() { return &__start___libfuzzer_extra_counters; }
+uint8_t *ExtraCountersEnd() { return &__stop___libfuzzer_extra_counters; }
+ATTRIBUTE_NO_SANITIZE_ALL
+void ClearExtraCounters() { // hand-written memset, don't asan-ify.
+ uintptr_t *Beg = reinterpret_cast<uintptr_t*>(ExtraCountersBegin());
+ uintptr_t *End = reinterpret_cast<uintptr_t*>(ExtraCountersEnd());
+ for (; Beg < End; Beg++) {
+ *Beg = 0;
+ __asm__ __volatile__("" : : : "memory");
+ }
+}
+
+} // namespace fuzzer
+
+#else
+// TODO: implement for other platforms.
+namespace fuzzer {
+uint8_t *ExtraCountersBegin() { return nullptr; }
+uint8_t *ExtraCountersEnd() { return nullptr; }
+void ClearExtraCounters() {}
+} // namespace fuzzer
+
+#endif
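// Sketch of the user side of this mechanism (array name and size are
// assumptions; the section name comes from the symbols above): placing an
// array in the __libfuzzer_extra_counters section makes the linker provide
// the __start/__stop symbols returned by ExtraCountersBegin/End on Linux.
__attribute__((section("__libfuzzer_extra_counters")))
static uint8_t MyExtraCounters[1024];
// Inside the target, e.g.: MyExtraCounters[CurrentStateId]++;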
diff --git a/lib/Fuzzer/FuzzerFlags.def b/lib/Fuzzer/FuzzerFlags.def
index 22aad353acec..28bf0ca8ce69 100644
--- a/lib/Fuzzer/FuzzerFlags.def
+++ b/lib/Fuzzer/FuzzerFlags.def
@@ -39,14 +39,19 @@ FUZZER_FLAG_INT(merge, 0, "If 1, the 2-nd, 3-rd, etc corpora will be "
"merged into the 1-st corpus. Only interesting units will be taken. "
"This flag can be used to minimize a corpus.")
FUZZER_FLAG_STRING(merge_control_file, "internal flag")
+FUZZER_FLAG_STRING(save_coverage_summary, "Experimental:"
+ " save coverage summary to a given file."
+ " Used with -merge=1")
+FUZZER_FLAG_STRING(load_coverage_summary, "Experimental:"
+ " load coverage summary from a given file."
+ " Treat this coverage as belonging to the first corpus. "
+ " Used with -merge=1")
FUZZER_FLAG_INT(minimize_crash, 0, "If 1, minimizes the provided"
" crash input. Use with -runs=N or -max_total_time=N to limit "
"the number attempts")
FUZZER_FLAG_INT(minimize_crash_internal_step, 0, "internal flag")
FUZZER_FLAG_INT(use_counters, 1, "Use coverage counters")
FUZZER_FLAG_INT(use_indir_calls, 1, "Use indirect caller-callee counters")
-FUZZER_FLAG_INT(use_memcmp, 1,
- "Use hints from intercepting memcmp, strcmp, etc")
FUZZER_FLAG_INT(use_memmem, 1,
"Use hints from intercepting memmem, strstr, etc")
FUZZER_FLAG_INT(use_value_profile, 0,
@@ -85,7 +90,7 @@ FUZZER_FLAG_INT(print_coverage, 0, "If 1, print coverage information at exit."
FUZZER_FLAG_INT(dump_coverage, 0, "If 1, dump coverage information at exit."
" Experimental, only with trace-pc-guard")
FUZZER_FLAG_INT(handle_segv, 1, "If 1, try to intercept SIGSEGV.")
-FUZZER_FLAG_INT(handle_bus, 1, "If 1, try to intercept SIGSEGV.")
+FUZZER_FLAG_INT(handle_bus, 1, "If 1, try to intercept SIGBUS.")
FUZZER_FLAG_INT(handle_abrt, 1, "If 1, try to intercept SIGABRT.")
FUZZER_FLAG_INT(handle_ill, 1, "If 1, try to intercept SIGILL.")
FUZZER_FLAG_INT(handle_fpe, 1, "If 1, try to intercept SIGFPE.")
@@ -108,6 +113,10 @@ FUZZER_FLAG_STRING(exit_on_item, "Exit if an item with a given sha1 sum"
" was added to the corpus. "
"Used primarily for testing libFuzzer itself.")
+FUZZER_FLAG_STRING(run_equivalence_server, "Experimental")
+FUZZER_FLAG_STRING(use_equivalence_server, "Experimental")
+FUZZER_FLAG_INT(analyze_dict, 0, "Experimental")
+
FUZZER_DEPRECATED_FLAG(exit_on_first)
FUZZER_DEPRECATED_FLAG(save_minimized_corpus)
FUZZER_DEPRECATED_FLAG(sync_command)
diff --git a/lib/Fuzzer/FuzzerIO.cpp b/lib/Fuzzer/FuzzerIO.cpp
index eda8e8772930..e3f609ed8a80 100644
--- a/lib/Fuzzer/FuzzerIO.cpp
+++ b/lib/Fuzzer/FuzzerIO.cpp
@@ -96,14 +96,15 @@ void DupAndCloseStderr() {
if (NewOutputFile) {
OutputFile = NewOutputFile;
if (EF->__sanitizer_set_report_fd)
- EF->__sanitizer_set_report_fd(reinterpret_cast<void *>(OutputFd));
- CloseFile(2);
+ EF->__sanitizer_set_report_fd(
+ reinterpret_cast<void *>(GetHandleFromFd(OutputFd)));
+ DiscardOutput(2);
}
}
}
void CloseStdout() {
- CloseFile(1);
+ DiscardOutput(1);
}
void Printf(const char *Fmt, ...) {
diff --git a/lib/Fuzzer/FuzzerIO.h b/lib/Fuzzer/FuzzerIO.h
index 15bfd3d34727..3b66a52d1a64 100644
--- a/lib/Fuzzer/FuzzerIO.h
+++ b/lib/Fuzzer/FuzzerIO.h
@@ -40,12 +40,17 @@ std::string DirName(const std::string &FileName);
// Returns path to a TmpDir.
std::string TmpDir();
+bool IsInterestingCoverageFile(const std::string &FileName);
+
void DupAndCloseStderr();
void CloseStdout();
void Printf(const char *Fmt, ...);
+// Print using raw syscalls, useful when printing at early init stages.
+void RawPrint(const char *Str);
+
// Platform specific functions:
bool IsFile(const std::string &Path);
@@ -62,6 +67,10 @@ int DuplicateFile(int Fd);
void RemoveFile(const std::string &Path);
+void DiscardOutput(int Fd);
+
+intptr_t GetHandleFromFd(int fd);
+
} // namespace fuzzer
#endif // LLVM_FUZZER_IO_H
diff --git a/lib/Fuzzer/FuzzerIOPosix.cpp b/lib/Fuzzer/FuzzerIOPosix.cpp
index 6d8edf6ff538..c5ebdbac467b 100644
--- a/lib/Fuzzer/FuzzerIOPosix.cpp
+++ b/lib/Fuzzer/FuzzerIOPosix.cpp
@@ -75,6 +75,18 @@ void RemoveFile(const std::string &Path) {
unlink(Path.c_str());
}
+void DiscardOutput(int Fd) {
+ FILE* Temp = fopen("/dev/null", "w");
+ if (!Temp)
+ return;
+ dup2(fileno(Temp), Fd);
+ fclose(Temp);
+}
+
+intptr_t GetHandleFromFd(int fd) {
+ return static_cast<intptr_t>(fd);
+}
+
std::string DirName(const std::string &FileName) {
char *Tmp = new char[FileName.size() + 1];
memcpy(Tmp, FileName.c_str(), FileName.size() + 1);
@@ -89,6 +101,23 @@ std::string TmpDir() {
return "/tmp";
}
+bool IsInterestingCoverageFile(const std::string &FileName) {
+ if (FileName.find("compiler-rt/lib/") != std::string::npos)
+ return false; // sanitizer internal.
+ if (FileName.find("/usr/lib/") != std::string::npos)
+ return false;
+ if (FileName.find("/usr/include/") != std::string::npos)
+ return false;
+ if (FileName == "<null>")
+ return false;
+ return true;
+}
+
+
+void RawPrint(const char *Str) {
+ write(2, Str, strlen(Str));
+}
+
} // namespace fuzzer
#endif // LIBFUZZER_POSIX
diff --git a/lib/Fuzzer/FuzzerIOWindows.cpp b/lib/Fuzzer/FuzzerIOWindows.cpp
index 056f0721a336..75d4e3a06071 100644
--- a/lib/Fuzzer/FuzzerIOWindows.cpp
+++ b/lib/Fuzzer/FuzzerIOWindows.cpp
@@ -89,8 +89,10 @@ void ListFilesInDirRecursive(const std::string &Dir, long *Epoch,
HANDLE FindHandle(FindFirstFileA(Path.c_str(), &FindInfo));
if (FindHandle == INVALID_HANDLE_VALUE)
{
- Printf("No file found in: %s.\n", Dir.c_str());
- return;
+ if (GetLastError() == ERROR_FILE_NOT_FOUND)
+ return;
+ Printf("No such directory: %s; exiting\n", Dir.c_str());
+ exit(1);
}
do {
@@ -139,6 +141,18 @@ void RemoveFile(const std::string &Path) {
_unlink(Path.c_str());
}
+void DiscardOutput(int Fd) {
+ FILE* Temp = fopen("nul", "w");
+ if (!Temp)
+ return;
+ _dup2(_fileno(Temp), Fd);
+ fclose(Temp);
+}
+
+intptr_t GetHandleFromFd(int fd) {
+ return _get_osfhandle(fd);
+}
+
static bool IsSeparator(char C) {
return C == '\\' || C == '/';
}
@@ -277,7 +291,32 @@ std::string DirName(const std::string &FileName) {
return FileName.substr(0, LocationLen + DirLen);
}
-std::string TmpDir() { return "TODO: implement TmpDir"; }
+std::string TmpDir() {
+ std::string Tmp;
+ Tmp.resize(MAX_PATH + 1);
+ DWORD Size = GetTempPathA(Tmp.size(), &Tmp[0]);
+ if (Size == 0) {
+ Printf("Couldn't get Tmp path.\n");
+ exit(1);
+ }
+ Tmp.resize(Size);
+ return Tmp;
+}
+
+bool IsInterestingCoverageFile(const std::string &FileName) {
+ if (FileName.find("Program Files") != std::string::npos)
+ return false;
+ if (FileName.find("compiler-rt\\lib\\") != std::string::npos)
+ return false; // sanitizer internal.
+ if (FileName == "<null>")
+ return false;
+ return true;
+}
+
+void RawPrint(const char *Str) {
+ // Not tested, may or may not work. Fix if needed.
+ Printf("%s", Str);
+}
} // namespace fuzzer
diff --git a/lib/Fuzzer/FuzzerInterface.h b/lib/Fuzzer/FuzzerInterface.h
index d47e20e3a2b9..c2c0a39843c0 100644
--- a/lib/Fuzzer/FuzzerInterface.h
+++ b/lib/Fuzzer/FuzzerInterface.h
@@ -55,7 +55,7 @@ size_t LLVMFuzzerCustomCrossOver(const uint8_t *Data1, size_t Size1,
unsigned int Seed);
// Experimental, may go away in future.
-// libFuzzer-provided function to be used inside LLVMFuzzerTestOneInput.
+// libFuzzer-provided function to be used inside LLVMFuzzerCustomMutator.
// Mutates raw data in [Data, Data+Size) inplace.
// Returns the new size, which is not greater than MaxSize.
size_t LLVMFuzzerMutate(uint8_t *Data, size_t Size, size_t MaxSize);
diff --git a/lib/Fuzzer/FuzzerInternal.h b/lib/Fuzzer/FuzzerInternal.h
index 0d2c7a78aca8..c26615631ecd 100644
--- a/lib/Fuzzer/FuzzerInternal.h
+++ b/lib/Fuzzer/FuzzerInternal.h
@@ -32,26 +32,6 @@ using namespace std::chrono;
class Fuzzer {
public:
- // Aggregates all available coverage measurements.
- struct Coverage {
- Coverage() { Reset(); }
-
- void Reset() {
- BlockCoverage = 0;
- CallerCalleeCoverage = 0;
- CounterBitmapBits = 0;
- CounterBitmap.clear();
- VPMap.Reset();
- }
-
- size_t BlockCoverage;
- size_t CallerCalleeCoverage;
- // Precalculated number of bits in CounterBitmap.
- size_t CounterBitmapBits;
- std::vector<uint8_t> CounterBitmap;
- ValueBitMap VPMap;
- };
-
Fuzzer(UserCallback CB, InputCorpus &Corpus, MutationDispatcher &MD,
FuzzingOptions Options);
~Fuzzer();
@@ -90,25 +70,23 @@ public:
// Merge Corpora[1:] into Corpora[0].
void Merge(const std::vector<std::string> &Corpora);
void CrashResistantMerge(const std::vector<std::string> &Args,
- const std::vector<std::string> &Corpora);
+ const std::vector<std::string> &Corpora,
+ const char *CoverageSummaryInputPathOrNull,
+ const char *CoverageSummaryOutputPathOrNull);
void CrashResistantMergeInternalStep(const std::string &ControlFilePath);
- // Returns a subset of 'Extra' that adds coverage to 'Initial'.
- UnitVector FindExtraUnits(const UnitVector &Initial, const UnitVector &Extra);
MutationDispatcher &GetMD() { return MD; }
void PrintFinalStats();
void SetMaxInputLen(size_t MaxInputLen);
void SetMaxMutationLen(size_t MaxMutationLen);
void RssLimitCallback();
- // Public for tests.
- void ResetCoverage();
-
bool InFuzzingThread() const { return IsMyThread; }
size_t GetCurrentUnitInFuzzingThead(const uint8_t **Data) const;
void TryDetectingAMemoryLeak(const uint8_t *Data, size_t Size,
bool DuringInitialCorpusExecution);
void HandleMalloc(size_t Size);
+ void AnnounceOutput(const uint8_t *Data, size_t Size);
private:
void AlarmCallback();
@@ -134,16 +112,10 @@ private:
// Stop tracing.
void StopTraceRecording();
- void SetDeathCallback();
static void StaticDeathCallback();
void DumpCurrentUnit(const char *Prefix);
void DeathCallback();
- void ResetEdgeCoverage();
- void ResetCounters();
- void PrepareCounters(Fuzzer::Coverage *C);
- bool RecordMaxCoverage(Fuzzer::Coverage *C);
-
void AllocateCurrentUnitData();
uint8_t *CurrentUnitData = nullptr;
std::atomic<size_t> CurrentUnitSize;
@@ -166,16 +138,11 @@ private:
long TimeOfLongestUnitInSeconds = 0;
long EpochOfLastReadOfOutputCorpus = 0;
- // Maximum recorded coverage.
- Coverage MaxCoverage;
-
size_t MaxInputLen = 0;
size_t MaxMutationLen = 0;
// Need to know our own thread.
static thread_local bool IsMyThread;
-
- bool InMergeMode = false;
};
}; // namespace fuzzer
diff --git a/lib/Fuzzer/FuzzerLoop.cpp b/lib/Fuzzer/FuzzerLoop.cpp
index 9f49d1557990..704092896eb6 100644
--- a/lib/Fuzzer/FuzzerLoop.cpp
+++ b/lib/Fuzzer/FuzzerLoop.cpp
@@ -14,6 +14,7 @@
#include "FuzzerIO.h"
#include "FuzzerMutate.h"
#include "FuzzerRandom.h"
+#include "FuzzerShmem.h"
#include "FuzzerTracePC.h"
#include <algorithm>
#include <cstring>
@@ -42,73 +43,11 @@ static const size_t kMaxUnitSizeToPrint = 256;
thread_local bool Fuzzer::IsMyThread;
-static void MissingExternalApiFunction(const char *FnName) {
- Printf("ERROR: %s is not defined. Exiting.\n"
- "Did you use -fsanitize-coverage=... to build your code?\n",
- FnName);
- exit(1);
-}
-
-#define CHECK_EXTERNAL_FUNCTION(fn) \
- do { \
- if (!(EF->fn)) \
- MissingExternalApiFunction(#fn); \
- } while (false)
+SharedMemoryRegion SMR;
// Only one Fuzzer per process.
static Fuzzer *F;
-void Fuzzer::ResetEdgeCoverage() {
- CHECK_EXTERNAL_FUNCTION(__sanitizer_reset_coverage);
- EF->__sanitizer_reset_coverage();
-}
-
-void Fuzzer::ResetCounters() {
- if (Options.UseCounters)
- EF->__sanitizer_update_counter_bitset_and_clear_counters(0);
-}
-
-void Fuzzer::PrepareCounters(Fuzzer::Coverage *C) {
- if (Options.UseCounters) {
- size_t NumCounters = EF->__sanitizer_get_number_of_counters();
- C->CounterBitmap.resize(NumCounters);
- }
-}
-
-// Records data to a maximum coverage tracker. Returns true if additional
-// coverage was discovered.
-bool Fuzzer::RecordMaxCoverage(Fuzzer::Coverage *C) {
- bool Res = false;
-
- uint64_t NewBlockCoverage = EF->__sanitizer_get_total_unique_coverage();
- if (NewBlockCoverage > C->BlockCoverage) {
- Res = true;
- C->BlockCoverage = NewBlockCoverage;
- }
-
- if (Options.UseIndirCalls &&
- EF->__sanitizer_get_total_unique_caller_callee_pairs) {
- uint64_t NewCallerCalleeCoverage =
- EF->__sanitizer_get_total_unique_caller_callee_pairs();
- if (NewCallerCalleeCoverage > C->CallerCalleeCoverage) {
- Res = true;
- C->CallerCalleeCoverage = NewCallerCalleeCoverage;
- }
- }
-
- if (Options.UseCounters) {
- uint64_t CounterDelta =
- EF->__sanitizer_update_counter_bitset_and_clear_counters(
- C->CounterBitmap.data());
- if (CounterDelta > 0) {
- Res = true;
- C->CounterBitmapBits += CounterDelta;
- }
- }
-
- return Res;
-}
-
// Leak detection is expensive, so we first check if there were more mallocs
// than frees (using the sanitizer malloc hooks) and only then try to call lsan.
struct MallocFreeTracer {
@@ -176,12 +115,12 @@ void Fuzzer::HandleMalloc(size_t Size) {
Fuzzer::Fuzzer(UserCallback CB, InputCorpus &Corpus, MutationDispatcher &MD,
FuzzingOptions Options)
: CB(CB), Corpus(Corpus), MD(MD), Options(Options) {
- SetDeathCallback();
+ if (EF->__sanitizer_set_death_callback)
+ EF->__sanitizer_set_death_callback(StaticDeathCallback);
InitializeTraceState();
assert(!F);
F = this;
TPC.ResetMaps();
- ResetCoverage();
IsMyThread = true;
if (Options.DetectLeaks && EF->__sanitizer_install_malloc_and_free_hooks)
EF->__sanitizer_install_malloc_and_free_hooks(MallocHook, FreeHook);
@@ -206,33 +145,12 @@ void Fuzzer::AllocateCurrentUnitData() {
CurrentUnitData = new uint8_t[MaxInputLen];
}
-void Fuzzer::SetDeathCallback() {
- CHECK_EXTERNAL_FUNCTION(__sanitizer_set_death_callback);
- EF->__sanitizer_set_death_callback(StaticDeathCallback);
-}
-
void Fuzzer::StaticDeathCallback() {
assert(F);
F->DeathCallback();
}
-static void WarnOnUnsuccessfullMerge(bool DoWarn) {
- if (!DoWarn) return;
- Printf(
- "***\n"
- "***\n"
- "***\n"
- "*** NOTE: merge did not succeed due to a failure on one of the inputs.\n"
- "*** You will need to filter out crashes from the corpus, e.g. like this:\n"
- "*** for f in WITH_CRASHES/*; do ./fuzzer $f && cp $f NO_CRASHES; done\n"
- "*** Future versions may have crash-resistant merge, stay tuned.\n"
- "***\n"
- "***\n"
- "***\n");
-}
-
void Fuzzer::DumpCurrentUnit(const char *Prefix) {
- WarnOnUnsuccessfullMerge(InMergeMode);
if (!CurrentUnitData) return; // Happens when running individual inputs.
MD.PrintMutationSequence();
Printf("; base unit: %s\n", Sha1ToString(BaseSha1).c_str());
@@ -293,7 +211,10 @@ void Fuzzer::InterruptCallback() {
NO_SANITIZE_MEMORY
void Fuzzer::AlarmCallback() {
assert(Options.UnitTimeoutSec > 0);
+ // On Windows, the alarm callback is executed by a different thread.
+#if !LIBFUZZER_WINDOWS
if (!InFuzzingThread()) return;
+#endif
if (!RunningCB)
return; // We have not started running units yet.
size_t Seconds =
@@ -323,7 +244,7 @@ void Fuzzer::RssLimitCallback() {
GetPid(), GetPeakRSSMb(), Options.RssLimitMb);
Printf(" To change the out-of-memory limit use -rss_limit_mb=<N>\n\n");
if (EF->__sanitizer_print_memory_profile)
- EF->__sanitizer_print_memory_profile(95);
+ EF->__sanitizer_print_memory_profile(95, 8);
DumpCurrentUnit("oom-");
Printf("SUMMARY: libFuzzer: out-of-memory\n");
PrintFinalStats();
@@ -338,26 +259,18 @@ void Fuzzer::PrintStats(const char *Where, const char *End, size_t Units) {
csvHeaderPrinted = true;
Printf("runs,block_cov,bits,cc_cov,corpus,execs_per_sec,tbms,reason\n");
}
- Printf("%zd,%zd,%zd,%zd,%zd,%zd,%s\n", TotalNumberOfRuns,
- MaxCoverage.BlockCoverage, MaxCoverage.CounterBitmapBits,
- MaxCoverage.CallerCalleeCoverage, Corpus.size(), ExecPerSec, Where);
+ Printf("%zd,%zd,%zd,%zd,%s\n", TotalNumberOfRuns,
+ TPC.GetTotalPCCoverage(),
+ Corpus.size(), ExecPerSec, Where);
}
if (!Options.Verbosity)
return;
Printf("#%zd\t%s", TotalNumberOfRuns, Where);
- if (MaxCoverage.BlockCoverage)
- Printf(" cov: %zd", MaxCoverage.BlockCoverage);
- if (size_t N = MaxCoverage.VPMap.GetNumBitsSinceLastMerge())
- Printf(" vp: %zd", N);
if (size_t N = TPC.GetTotalPCCoverage())
Printf(" cov: %zd", N);
- if (auto TB = MaxCoverage.CounterBitmapBits)
- Printf(" bits: %zd", TB);
if (size_t N = Corpus.NumFeatures())
Printf( " ft: %zd", N);
- if (MaxCoverage.CallerCalleeCoverage)
- Printf(" indir: %zd", MaxCoverage.CallerCalleeCoverage);
if (!Corpus.empty()) {
Printf(" corp: %zd", Corpus.NumActiveUnits());
if (size_t N = Corpus.SizeInBytes()) {
@@ -456,7 +369,7 @@ void Fuzzer::RereadOutputCorpus(size_t MaxSize) {
}
void Fuzzer::ShuffleCorpus(UnitVector *V) {
- std::random_shuffle(V->begin(), V->end(), MD.GetRand());
+ std::shuffle(V->begin(), V->end(), MD.GetRand());
if (Options.PreferSmall)
std::stable_sort(V->begin(), V->end(), [](const Unit &A, const Unit &B) {
return A.size() < B.size();
@@ -476,8 +389,6 @@ void Fuzzer::ShuffleAndMinimize(UnitVector *InitialCorpus) {
if (size_t NumFeatures = RunOne(U)) {
CheckExitOnSrcPosOrItem();
Corpus.AddToCorpus(U, NumFeatures);
- if (Options.Verbosity >= 2)
- Printf("NEW0: %zd L %zd\n", MaxCoverage.BlockCoverage, U.size());
}
TryDetectingAMemoryLeak(U.data(), U.size(),
/*DuringInitialCorpusExecution*/ true);
@@ -496,18 +407,11 @@ size_t Fuzzer::RunOne(const uint8_t *Data, size_t Size) {
ExecuteCallback(Data, Size);
- size_t Res = 0;
- if (size_t NumFeatures = TPC.CollectFeatures([&](size_t Feature) -> bool {
- return Corpus.AddFeature(Feature, Size, Options.Shrink);
- }))
- Res = NumFeatures;
-
- if (!TPC.UsingTracePcGuard()) {
- if (TPC.UpdateValueProfileMap(&MaxCoverage.VPMap))
- Res = 1;
- if (!Res && RecordMaxCoverage(&MaxCoverage))
- Res = 1;
- }
+ size_t NumUpdatesBefore = Corpus.NumFeatureUpdates();
+ TPC.CollectFeatures([&](size_t Feature) {
+ Corpus.AddFeature(Feature, Size, Options.Shrink);
+ });
+ size_t NumUpdatesAfter = Corpus.NumFeatureUpdates();
auto TimeOfUnit =
duration_cast<seconds>(UnitStopTime - UnitStartTime).count();
@@ -520,7 +424,7 @@ size_t Fuzzer::RunOne(const uint8_t *Data, size_t Size) {
Printf("Slowest unit: %zd s:\n", TimeOfLongestUnitInSeconds);
WriteUnitToFileWithPrefix({Data, Data + Size}, "slow-unit-");
}
- return Res;
+ return NumUpdatesAfter - NumUpdatesBefore;
}
size_t Fuzzer::GetCurrentUnitInFuzzingThead(const uint8_t **Data) const {
@@ -531,6 +435,8 @@ size_t Fuzzer::GetCurrentUnitInFuzzingThead(const uint8_t **Data) const {
void Fuzzer::ExecuteCallback(const uint8_t *Data, size_t Size) {
assert(InFuzzingThread());
+ if (SMR.IsClient())
+ SMR.WriteByteArray(Data, Size);
// We copy the contents of Unit into a separate heap buffer
// so that we reliably find buffer overflows in it.
uint8_t *DataCopy = new uint8_t[Size];
@@ -540,7 +446,6 @@ void Fuzzer::ExecuteCallback(const uint8_t *Data, size_t Size) {
CurrentUnitSize = Size;
AllocTracer.Start(Options.TraceMalloc);
UnitStartTime = system_clock::now();
- ResetCounters(); // Reset coverage right before the callback.
TPC.ResetMaps();
RunningCB = true;
int Res = CB(DataCopy, Size);
@@ -597,77 +502,6 @@ void Fuzzer::ReportNewCoverage(InputInfo *II, const Unit &U) {
TPC.PrintNewPCs();
}
-// Finds minimal number of units in 'Extra' that add coverage to 'Initial'.
-// We do it by actually executing the units, sometimes more than once,
-// because we may be using different coverage-like signals and the only
-// common thing between them is that we can say "this unit found new stuff".
-UnitVector Fuzzer::FindExtraUnits(const UnitVector &Initial,
- const UnitVector &Extra) {
- UnitVector Res = Extra;
- UnitVector Tmp;
- size_t OldSize = Res.size();
- for (int Iter = 0; Iter < 10; Iter++) {
- ShuffleCorpus(&Res);
- TPC.ResetMaps();
- Corpus.ResetFeatureSet();
- ResetCoverage();
-
- for (auto &U : Initial) {
- TPC.ResetMaps();
- RunOne(U);
- }
-
- Tmp.clear();
- for (auto &U : Res) {
- TPC.ResetMaps();
- if (RunOne(U))
- Tmp.push_back(U);
- }
-
- char Stat[7] = "MIN ";
- Stat[3] = '0' + Iter;
- PrintStats(Stat, "\n", Tmp.size());
-
- size_t NewSize = Tmp.size();
- assert(NewSize <= OldSize);
- Res.swap(Tmp);
-
- if (NewSize + 5 >= OldSize)
- break;
- OldSize = NewSize;
- }
- return Res;
-}
-
-void Fuzzer::Merge(const std::vector<std::string> &Corpora) {
- if (Corpora.size() <= 1) {
- Printf("Merge requires two or more corpus dirs\n");
- return;
- }
- InMergeMode = true;
- std::vector<std::string> ExtraCorpora(Corpora.begin() + 1, Corpora.end());
-
- assert(MaxInputLen > 0);
- UnitVector Initial, Extra;
- ReadDirToVectorOfUnits(Corpora[0].c_str(), &Initial, nullptr, MaxInputLen,
- true);
- for (auto &C : ExtraCorpora)
- ReadDirToVectorOfUnits(C.c_str(), &Extra, nullptr, MaxInputLen, true);
-
- if (!Initial.empty()) {
- Printf("=== Minimizing the initial corpus of %zd units\n", Initial.size());
- Initial = FindExtraUnits({}, Initial);
- }
-
- Printf("=== Merging extra %zd units\n", Extra.size());
- auto Res = FindExtraUnits(Initial, Extra);
-
- for (auto &U: Res)
- WriteToOutputCorpus(U);
-
- Printf("=== Merge: written %zd units\n", Res.size());
-}
-
// Tries detecting a memory leak on the particular input that we have just
// executed before calling this function.
void Fuzzer::TryDetectingAMemoryLeak(const uint8_t *Data, size_t Size,
@@ -762,12 +596,6 @@ void Fuzzer::MutateAndTestOne() {
}
}
-void Fuzzer::ResetCoverage() {
- ResetEdgeCoverage();
- MaxCoverage.Reset();
- PrepareCounters(&MaxCoverage);
-}
-
void Fuzzer::Loop() {
TPC.InitializePrintNewPCs();
system_clock::time_point LastCorpusReload = system_clock::now();
@@ -792,7 +620,7 @@ void Fuzzer::Loop() {
}
void Fuzzer::MinimizeCrashLoop(const Unit &U) {
- if (U.size() <= 2) return;
+ if (U.size() <= 1) return;
while (!TimedOut() && TotalNumberOfRuns < Options.MaxNumberOfRuns) {
MD.StartMutationSequence();
memcpy(CurrentUnitData, U.data(), U.size());
@@ -806,6 +634,29 @@ void Fuzzer::MinimizeCrashLoop(const Unit &U) {
}
}
+void Fuzzer::AnnounceOutput(const uint8_t *Data, size_t Size) {
+ if (SMR.IsServer()) {
+ SMR.WriteByteArray(Data, Size);
+ } else if (SMR.IsClient()) {
+ SMR.PostClient();
+ SMR.WaitServer();
+ size_t OtherSize = SMR.ReadByteArraySize();
+ uint8_t *OtherData = SMR.GetByteArray();
+ if (Size != OtherSize || memcmp(Data, OtherData, Size) != 0) {
+ size_t i = 0;
+ for (i = 0; i < Min(Size, OtherSize); i++)
+ if (Data[i] != OtherData[i])
+ break;
+ Printf("==%lu== ERROR: libFuzzer: equivalence-mismatch. Sizes: %zd %zd; "
+ "offset %zd\n", GetPid(), Size, OtherSize, i);
+ DumpCurrentUnit("mismatch-");
+ Printf("SUMMARY: libFuzzer: equivalence-mismatch\n");
+ PrintFinalStats();
+ _Exit(Options.ErrorExitCode);
+ }
+ }
+}
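// Handshake recap, reconstructed from the calls above and the server loop in
// FuzzerDriver: the client writes each input in ExecuteCallback, runs its own
// callback, posts the client semaphore, and waits; the server wakes, replays
// the input via RunOne, publishes its output through the server branch here,
// and posts back; the client then compares both outputs byte by byte and
// exits with ErrorExitCode on the first mismatch.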
+
} // namespace fuzzer
extern "C" {
@@ -814,4 +665,10 @@ size_t LLVMFuzzerMutate(uint8_t *Data, size_t Size, size_t MaxSize) {
assert(fuzzer::F);
return fuzzer::F->GetMD().DefaultMutate(Data, Size, MaxSize);
}
+
+// Experimental
+void LLVMFuzzerAnnounceOutput(const uint8_t *Data, size_t Size) {
+ assert(fuzzer::F);
+ fuzzer::F->AnnounceOutput(Data, Size);
+}
} // extern "C"
diff --git a/lib/Fuzzer/FuzzerMerge.cpp b/lib/Fuzzer/FuzzerMerge.cpp
index 9e559115680c..e66460c29e2f 100644
--- a/lib/Fuzzer/FuzzerMerge.cpp
+++ b/lib/Fuzzer/FuzzerMerge.cpp
@@ -17,6 +17,7 @@
#include <fstream>
#include <iterator>
+#include <set>
#include <sstream>
namespace fuzzer {
@@ -73,6 +74,7 @@ bool Merger::Parse(std::istream &IS, bool ParseCoverage) {
size_t ExpectedStartMarker = 0;
const size_t kInvalidStartMarker = -1;
size_t LastSeenStartMarker = kInvalidStartMarker;
+ std::vector<uint32_t> TmpFeatures;
while (std::getline(IS, Line, '\n')) {
std::istringstream ISS1(Line);
std::string Marker;
@@ -88,17 +90,17 @@ bool Merger::Parse(std::istream &IS, bool ParseCoverage) {
assert(ExpectedStartMarker < Files.size());
ExpectedStartMarker++;
} else if (Marker == "DONE") {
- // DONE FILE_SIZE COV1 COV2 COV3 ...
+ // DONE FILE_ID COV1 COV2 COV3 ...
size_t CurrentFileIdx = N;
if (CurrentFileIdx != LastSeenStartMarker)
return false;
LastSeenStartMarker = kInvalidStartMarker;
if (ParseCoverage) {
- auto &V = Files[CurrentFileIdx].Features;
- V.clear();
+ TmpFeatures.clear(); // use a vector from outer scope to avoid resizes.
while (ISS1 >> std::hex >> N)
- V.push_back(N);
- std::sort(V.begin(), V.end());
+ TmpFeatures.push_back(N);
+ std::sort(TmpFeatures.begin(), TmpFeatures.end());
+ Files[CurrentFileIdx].Features = TmpFeatures;
}
} else {
return false;
@@ -111,12 +113,20 @@ bool Merger::Parse(std::istream &IS, bool ParseCoverage) {
return true;
}
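// Assumed control-file shape, reconstructed from the parser above (feature
// ids are hex after DONE):
//   3                  <- total number of files
//   1                  <- NumFilesInFirstCorpus
//   fileA
//   fileB
//   fileC
//   STARTED 0 1024     <- file id, file size
//   DONE 0 11 22 33    <- file id, then its features
// A STARTED with no matching DONE marks the input that crashed the inner
// process, which is what makes the merge crash-resistant.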
+size_t Merger::ApproximateMemoryConsumption() const {
+ size_t Res = 0;
+ for (const auto &F: Files)
+ Res += sizeof(F) + F.Features.size() * sizeof(F.Features[0]);
+ return Res;
+}
+
// Decides which files need to be merged (add those to NewFiles).
// Returns the number of new features added.
-size_t Merger::Merge(std::vector<std::string> *NewFiles) {
+size_t Merger::Merge(const std::set<uint32_t> &InitialFeatures,
+ std::vector<std::string> *NewFiles) {
NewFiles->clear();
assert(NumFilesInFirstCorpus <= Files.size());
- std::set<uint32_t> AllFeatures;
+ std::set<uint32_t> AllFeatures(InitialFeatures);
// What features are in the initial corpus?
for (size_t i = 0; i < NumFilesInFirstCorpus; i++) {
@@ -158,6 +168,42 @@ size_t Merger::Merge(std::vector<std::string> *NewFiles) {
return AllFeatures.size() - InitialNumFeatures;
}
+void Merger::PrintSummary(std::ostream &OS) {
+ for (auto &File : Files) {
+ OS << std::hex;
+ OS << File.Name << " size: " << File.Size << " features: ";
+ for (auto Feature : File.Features)
+ OS << " " << Feature;
+ OS << "\n";
+ }
+}
+
+std::set<uint32_t> Merger::AllFeatures() const {
+ std::set<uint32_t> S;
+ for (auto &File : Files)
+ S.insert(File.Features.begin(), File.Features.end());
+ return S;
+}
+
+std::set<uint32_t> Merger::ParseSummary(std::istream &IS) {
+ std::string Line, Tmp;
+ std::set<uint32_t> Res;
+ while (std::getline(IS, Line, '\n')) {
+ size_t N;
+ std::istringstream ISS1(Line);
+ ISS1 >> Tmp; // Name
+ ISS1 >> Tmp; // size:
+ assert(Tmp == "size:" && "Corrupt summary file");
+ ISS1 >> std::hex;
+ ISS1 >> N; // File Size
+ ISS1 >> Tmp; // features:
+ assert(Tmp == "features:" && "Corrupt summary file");
+ while (ISS1 >> std::hex >> N)
+ Res.insert(N);
+ }
+ return Res;
+}
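// Example of one summary line as written by PrintSummary and re-read here
// (file name assumed; size and feature ids are hex):
//   corpus/unit-1 size: 2c features:  1f 2a 3b
// ParseSummary collects the union of all feature ids, which the caller then
// feeds to Merge() as already-covered InitialFeatures.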
+
// Inner process. May crash if the target crashes.
void Fuzzer::CrashResistantMergeInternalStep(const std::string &CFPath) {
Printf("MERGE-INNER: using the control file '%s'\n", CFPath.c_str());
@@ -208,7 +254,9 @@ void Fuzzer::CrashResistantMergeInternalStep(const std::string &CFPath) {
// Outer process. Does not call the target code and thus should not fail.
void Fuzzer::CrashResistantMerge(const std::vector<std::string> &Args,
- const std::vector<std::string> &Corpora) {
+ const std::vector<std::string> &Corpora,
+ const char *CoverageSummaryInputPathOrNull,
+ const char *CoverageSummaryOutputPathOrNull) {
if (Corpora.size() <= 1) {
Printf("Merge requires two or more corpus dirs\n");
return;
@@ -239,15 +287,21 @@ void Fuzzer::CrashResistantMerge(const std::vector<std::string> &Args,
// Execute the inner process until it passes.
// Every inner process should execute at least one input.
std::string BaseCmd = CloneArgsWithoutX(Args, "keep-all-flags");
+ bool Success = false;
for (size_t i = 1; i <= AllFiles.size(); i++) {
Printf("MERGE-OUTER: attempt %zd\n", i);
auto ExitCode =
ExecuteCommand(BaseCmd + " -merge_control_file=" + CFPath);
if (!ExitCode) {
Printf("MERGE-OUTER: succesfull in %zd attempt(s)\n", i);
+ Success = true;
break;
}
}
+ if (!Success) {
+ Printf("MERGE-OUTER: zero succesfull attempts, exiting\n");
+ exit(1);
+ }
// Read the control file and do the merge.
Merger M;
std::ifstream IF(CFPath);
@@ -256,8 +310,23 @@ void Fuzzer::CrashResistantMerge(const std::vector<std::string> &Args,
IF.seekg(0, IF.beg);
M.ParseOrExit(IF, true);
IF.close();
+ Printf("MERGE-OUTER: consumed %zdMb (%zdMb rss) to parse the control file\n",
+ M.ApproximateMemoryConsumption() >> 20, GetPeakRSSMb());
+ if (CoverageSummaryOutputPathOrNull) {
+ Printf("MERGE-OUTER: writing coverage summary for %zd files to %s\n",
+ M.Files.size(), CoverageSummaryOutputPathOrNull);
+ std::ofstream SummaryOut(CoverageSummaryOutputPathOrNull);
+ M.PrintSummary(SummaryOut);
+ }
std::vector<std::string> NewFiles;
- size_t NumNewFeatures = M.Merge(&NewFiles);
+ std::set<uint32_t> InitialFeatures;
+ if (CoverageSummaryInputPathOrNull) {
+ std::ifstream SummaryIn(CoverageSummaryInputPathOrNull);
+ InitialFeatures = M.ParseSummary(SummaryIn);
+ Printf("MERGE-OUTER: coverage summary loaded from %s, %zd features found\n",
+ CoverageSummaryInputPathOrNull, InitialFeatures.size());
+ }
+ size_t NumNewFeatures = M.Merge(InitialFeatures, &NewFiles);
Printf("MERGE-OUTER: %zd new files with %zd new features added\n",
NewFiles.size(), NumNewFeatures);
for (auto &F: NewFiles)
diff --git a/lib/Fuzzer/FuzzerMerge.h b/lib/Fuzzer/FuzzerMerge.h
index 8a2fe5d74f88..cf4a0863571d 100644
--- a/lib/Fuzzer/FuzzerMerge.h
+++ b/lib/Fuzzer/FuzzerMerge.h
@@ -43,7 +43,9 @@
#include "FuzzerDefs.h"
#include <istream>
+#include <ostream>
#include <set>
+#include <vector>
namespace fuzzer {
@@ -62,7 +64,15 @@ struct Merger {
bool Parse(std::istream &IS, bool ParseCoverage);
bool Parse(const std::string &Str, bool ParseCoverage);
void ParseOrExit(std::istream &IS, bool ParseCoverage);
- size_t Merge(std::vector<std::string> *NewFiles);
+ void PrintSummary(std::ostream &OS);
+ std::set<uint32_t> ParseSummary(std::istream &IS);
+ size_t Merge(const std::set<uint32_t> &InitialFeatures,
+ std::vector<std::string> *NewFiles);
+ size_t Merge(std::vector<std::string> *NewFiles) {
+ return Merge({}, NewFiles);
+ }
+ size_t ApproximateMemoryConsumption() const;
+ std::set<uint32_t> AllFeatures() const;
};
} // namespace fuzzer
diff --git a/lib/Fuzzer/FuzzerMutate.cpp b/lib/Fuzzer/FuzzerMutate.cpp
index 96a87b879d6f..cd846c7deec5 100644
--- a/lib/Fuzzer/FuzzerMutate.cpp
+++ b/lib/Fuzzer/FuzzerMutate.cpp
@@ -81,8 +81,8 @@ size_t MutationDispatcher::Mutate_CustomCrossOver(uint8_t *Data, size_t Size,
const Unit &Other = (*Corpus)[Idx];
if (Other.empty())
return 0;
- MutateInPlaceHere.resize(MaxSize);
- auto &U = MutateInPlaceHere;
+ CustomCrossOverInPlaceHere.resize(MaxSize);
+ auto &U = CustomCrossOverInPlaceHere;
size_t NewSize = EF->LLVMFuzzerCustomCrossOver(
Data, Size, Other.data(), Other.size(), U.data(), U.size(), Rand.Rand());
if (!NewSize)
@@ -94,21 +94,18 @@ size_t MutationDispatcher::Mutate_CustomCrossOver(uint8_t *Data, size_t Size,
size_t MutationDispatcher::Mutate_ShuffleBytes(uint8_t *Data, size_t Size,
size_t MaxSize) {
- if (Size > MaxSize) return 0;
- assert(Size);
+ if (Size > MaxSize || Size == 0) return 0;
size_t ShuffleAmount =
Rand(std::min(Size, (size_t)8)) + 1; // [1,8] and <= Size.
size_t ShuffleStart = Rand(Size - ShuffleAmount);
assert(ShuffleStart + ShuffleAmount <= Size);
- std::random_shuffle(Data + ShuffleStart, Data + ShuffleStart + ShuffleAmount,
- Rand);
+ std::shuffle(Data + ShuffleStart, Data + ShuffleStart + ShuffleAmount, Rand);
return Size;
}
size_t MutationDispatcher::Mutate_EraseBytes(uint8_t *Data, size_t Size,
size_t MaxSize) {
- assert(Size);
- if (Size == 1) return 0;
+ if (Size <= 1) return 0;
size_t N = Rand(Size / 2) + 1;
assert(N < Size);
size_t Idx = Rand(Size - N + 1);
@@ -200,28 +197,27 @@ size_t MutationDispatcher::ApplyDictionaryEntry(uint8_t *Data, size_t Size,
// It first tries to find one of the arguments (possibly swapped) in the
// input and if it succeeds it creates a DE with a position hint.
// Otherwise it creates a DE with one of the arguments w/o a position hint.
-template <class T>
DictionaryEntry MutationDispatcher::MakeDictionaryEntryFromCMP(
- T Arg1, T Arg2, const uint8_t *Data, size_t Size) {
- ScopedDoingMyOwnMemmem scoped_doing_my_own_memmem;
+ const void *Arg1, const void *Arg2,
+ const void *Arg1Mutation, const void *Arg2Mutation,
+ size_t ArgSize, const uint8_t *Data,
+ size_t Size) {
+ ScopedDoingMyOwnMemOrStr scoped_doing_my_own_mem_or_str;
bool HandleFirst = Rand.RandBool();
- T ExistingBytes, DesiredBytes;
+ const void *ExistingBytes, *DesiredBytes;
Word W;
const uint8_t *End = Data + Size;
for (int Arg = 0; Arg < 2; Arg++) {
ExistingBytes = HandleFirst ? Arg1 : Arg2;
- DesiredBytes = HandleFirst ? Arg2 : Arg1;
- DesiredBytes += Rand(-1, 1);
- if (Rand.RandBool()) ExistingBytes = Bswap(ExistingBytes);
- if (Rand.RandBool()) DesiredBytes = Bswap(DesiredBytes);
+ DesiredBytes = HandleFirst ? Arg2Mutation : Arg1Mutation;
HandleFirst = !HandleFirst;
- W.Set(reinterpret_cast<uint8_t*>(&DesiredBytes), sizeof(T));
+ W.Set(reinterpret_cast<const uint8_t*>(DesiredBytes), ArgSize);
const size_t kMaxNumPositions = 8;
size_t Positions[kMaxNumPositions];
size_t NumPositions = 0;
for (const uint8_t *Cur = Data;
Cur < End && NumPositions < kMaxNumPositions; Cur++) {
- Cur = (uint8_t *)SearchMemory(Cur, End - Cur, &ExistingBytes, sizeof(T));
+ Cur = (uint8_t *)SearchMemory(Cur, End - Cur, ExistingBytes, ArgSize);
if (!Cur) break;
Positions[NumPositions++] = Cur - Data;
}
@@ -232,21 +228,48 @@ DictionaryEntry MutationDispatcher::MakeDictionaryEntryFromCMP(
return DE;
}
+
+template <class T>
+DictionaryEntry MutationDispatcher::MakeDictionaryEntryFromCMP(
+ T Arg1, T Arg2, const uint8_t *Data, size_t Size) {
+ if (Rand.RandBool()) Arg1 = Bswap(Arg1);
+ if (Rand.RandBool()) Arg2 = Bswap(Arg2);
+ T Arg1Mutation = Arg1 + Rand(-1, 1);
+ T Arg2Mutation = Arg2 + Rand(-1, 1);
+ return MakeDictionaryEntryFromCMP(&Arg1, &Arg2, &Arg1Mutation, &Arg2Mutation,
+ sizeof(Arg1), Data, Size);
+}
+
+DictionaryEntry MutationDispatcher::MakeDictionaryEntryFromCMP(
+ const Word &Arg1, const Word &Arg2, const uint8_t *Data, size_t Size) {
+ return MakeDictionaryEntryFromCMP(Arg1.data(), Arg2.data(), Arg1.data(),
+ Arg2.data(), Arg1.size(), Data, Size);
+}
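// Worked example (values assumed): for a recorded comparison pair
// (0xDEADBEEF, 0xDEADBEF0), the template overload above may byte-swap each
// argument and nudge it by -1, 0, or +1; it then searches the input for the
// "existing" value and emits a dictionary entry carrying the "desired" value
// plus a position hint, steering execution toward the branch the comparison
// guarded.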
+
size_t MutationDispatcher::Mutate_AddWordFromTORC(
uint8_t *Data, size_t Size, size_t MaxSize) {
Word W;
DictionaryEntry DE;
- if (Rand.RandBool()) {
+ switch (Rand(3)) {
+ case 0: {
auto X = TPC.TORC8.Get(Rand.Rand());
DE = MakeDictionaryEntryFromCMP(X.A, X.B, Data, Size);
- } else {
+ } break;
+ case 1: {
auto X = TPC.TORC4.Get(Rand.Rand());
if ((X.A >> 16) == 0 && (X.B >> 16) == 0 && Rand.RandBool())
- DE = MakeDictionaryEntryFromCMP((uint16_t)X.A, (uint16_t)X.B, Data,
- Size);
+ DE = MakeDictionaryEntryFromCMP((uint16_t)X.A, (uint16_t)X.B, Data, Size);
else
DE = MakeDictionaryEntryFromCMP(X.A, X.B, Data, Size);
+ } break;
+ case 2: {
+ auto X = TPC.TORCW.Get(Rand.Rand());
+ DE = MakeDictionaryEntryFromCMP(X.A, X.B, Data, Size);
+ } break;
+ default:
+ assert(0);
}
+ if (!DE.GetW().size()) return 0;
Size = ApplyDictionaryEntry(Data, Size, MaxSize, DE);
if (!Size) return 0;
DictionaryEntry &DERef =
@@ -317,7 +340,7 @@ size_t MutationDispatcher::InsertPartOf(const uint8_t *From, size_t FromSize,
size_t MutationDispatcher::Mutate_CopyPart(uint8_t *Data, size_t Size,
size_t MaxSize) {
- if (Size > MaxSize) return 0;
+ if (Size > MaxSize || Size == 0) return 0;
if (Rand.RandBool())
return CopyPartOf(Data, Size, Data, Size);
else
@@ -413,9 +436,9 @@ size_t MutationDispatcher::Mutate_CrossOver(uint8_t *Data, size_t Size,
break;
case 1:
NewSize = InsertPartOf(O.data(), O.size(), U.data(), U.size(), MaxSize);
- if (NewSize)
- break;
- // LLVM_FALLTHROUGH;
+ if (!NewSize)
+ NewSize = CopyPartOf(O.data(), O.size(), U.data(), U.size());
+ break;
case 2:
NewSize = CopyPartOf(O.data(), O.size(), U.data(), U.size());
break;
@@ -437,6 +460,7 @@ void MutationDispatcher::RecordSuccessfulMutationSequence() {
for (auto DE : CurrentDictionaryEntrySequence) {
// PersistentAutoDictionary.AddWithSuccessCountOne(DE);
DE->IncSuccessCount();
+ assert(DE->GetW().size());
// Linear search is fine here as this happens seldom.
if (!PersistentAutoDictionary.ContainsWord(DE->GetW()))
PersistentAutoDictionary.push_back({DE->GetW(), 1});
@@ -451,6 +475,7 @@ void MutationDispatcher::PrintRecommendedDictionary() {
if (V.empty()) return;
Printf("###### Recommended dictionary. ######\n");
for (auto &DE: V) {
+ assert(DE.GetW().size());
Printf("\"");
PrintASCII(DE.GetW(), "\"");
Printf(" # Uses: %zd\n", DE.GetUseCount());
@@ -485,14 +510,6 @@ size_t MutationDispatcher::MutateImpl(uint8_t *Data, size_t Size,
size_t MaxSize,
const std::vector<Mutator> &Mutators) {
assert(MaxSize > 0);
- if (Size == 0) {
- for (size_t i = 0; i < Min(size_t(4), MaxSize); i++)
- Data[i] = RandCh(Rand);
- if (Options.OnlyASCII)
- ToASCII(Data, MaxSize);
- return MaxSize;
- }
- assert(Size > 0);
// Some mutations may fail (e.g. can't insert more bytes if Size == MaxSize),
// in which case they will return 0.
// Try several times before returning un-mutated data.
@@ -506,7 +523,8 @@ size_t MutationDispatcher::MutateImpl(uint8_t *Data, size_t Size,
return NewSize;
}
}
- return std::min(Size, MaxSize);
+ *Data = ' ';
+ return 1; // Fallback, should not happen frequently.
}
void MutationDispatcher::AddWordToManualDictionary(const Word &W) {
diff --git a/lib/Fuzzer/FuzzerMutate.h b/lib/Fuzzer/FuzzerMutate.h
index d3c0b0012468..8c8fb3fd74c7 100644
--- a/lib/Fuzzer/FuzzerMutate.h
+++ b/lib/Fuzzer/FuzzerMutate.h
@@ -14,6 +14,7 @@
#include "FuzzerDefs.h"
#include "FuzzerDictionary.h"
+#include "FuzzerOptions.h"
#include "FuzzerRandom.h"
namespace fuzzer {
@@ -113,9 +114,16 @@ private:
template <class T>
DictionaryEntry MakeDictionaryEntryFromCMP(T Arg1, T Arg2,
const uint8_t *Data, size_t Size);
+ DictionaryEntry MakeDictionaryEntryFromCMP(const Word &Arg1, const Word &Arg2,
+ const uint8_t *Data, size_t Size);
+ DictionaryEntry MakeDictionaryEntryFromCMP(const void *Arg1, const void *Arg2,
+ const void *Arg1Mutation,
+ const void *Arg2Mutation,
+ size_t ArgSize,
+ const uint8_t *Data, size_t Size);
Random &Rand;
- const FuzzingOptions &Options;
+ const FuzzingOptions Options;
// Dictionary provided by the user via -dict=DICT_FILE.
Dictionary ManualDictionary;
@@ -135,6 +143,9 @@ private:
const InputCorpus *Corpus = nullptr;
std::vector<uint8_t> MutateInPlaceHere;
+ // CustomCrossOver needs its own buffer as a custom implementation may call
+ // LLVMFuzzerMutate, which in turn may resize MutateInPlaceHere.
+ std::vector<uint8_t> CustomCrossOverInPlaceHere;
std::vector<Mutator> Mutators;
std::vector<Mutator> DefaultMutators;
diff --git a/lib/Fuzzer/FuzzerOptions.h b/lib/Fuzzer/FuzzerOptions.h
index 6f72205600b9..872def0326f0 100644
--- a/lib/Fuzzer/FuzzerOptions.h
+++ b/lib/Fuzzer/FuzzerOptions.h
@@ -1,4 +1,3 @@
-//===- FuzzerOptions.h - Internal header for the Fuzzer ---------*- C++ -* ===//
//
// The LLVM Compiler Infrastructure
//
@@ -29,7 +28,6 @@ struct FuzzingOptions {
int MutateDepth = 5;
bool UseCounters = false;
bool UseIndirCalls = true;
- bool UseMemcmp = true;
bool UseMemmem = true;
bool UseCmp = false;
bool UseValueProfile = false;
diff --git a/lib/Fuzzer/FuzzerRandom.h b/lib/Fuzzer/FuzzerRandom.h
index b1be0bb935fa..8a1aa3ef5fdc 100644
--- a/lib/Fuzzer/FuzzerRandom.h
+++ b/lib/Fuzzer/FuzzerRandom.h
@@ -15,10 +15,11 @@
#include <random>
namespace fuzzer {
-class Random {
+class Random : public std::mt19937 {
public:
- Random(unsigned int seed) : R(seed) {}
- size_t Rand() { return R(); }
+ Random(unsigned int seed) : std::mt19937(seed) {}
+ result_type operator()() { return this->std::mt19937::operator()(); }
+ size_t Rand() { return this->operator()(); }
size_t RandBool() { return Rand() % 2; }
size_t operator()(size_t n) { return n ? Rand() % n : 0; }
intptr_t operator()(intptr_t From, intptr_t To) {
@@ -26,9 +27,6 @@ class Random {
intptr_t RangeSize = To - From + 1;
return operator()(RangeSize) + From;
}
- std::mt19937 &Get_mt19937() { return R; }
- private:
- std::mt19937 R;
};
} // namespace fuzzer
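Deriving Random from std::mt19937 lets the fuzzer's generator be handed directly to standard algorithms that expect a UniformRandomBitGenerator, which the old Get_mt19937() accessor existed to work around. A sketch of that usage; the stand-in class below mirrors the patched one and is illustrative only:

#include <algorithm>
#include <cstddef>
#include <random>
#include <vector>

// Stand-in mirroring fuzzer::Random above.
class Random : public std::mt19937 {
public:
  Random(unsigned int seed) : std::mt19937(seed) {}
  size_t Rand() { return (*this)(); }
};

int main() {
  Random R(42);
  std::vector<int> V{1, 2, 3, 4, 5};
  std::shuffle(V.begin(), V.end(), R); // R satisfies the URBG requirements.
  return 0;
}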
diff --git a/lib/Fuzzer/FuzzerShmem.h b/lib/Fuzzer/FuzzerShmem.h
new file mode 100644
index 000000000000..53568e0acb69
--- /dev/null
+++ b/lib/Fuzzer/FuzzerShmem.h
@@ -0,0 +1,69 @@
+//===- FuzzerShmem.h - shared memory interface ------------------*- C++ -* ===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+// SharedMemoryRegion
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_FUZZER_SHMEM_H
+#define LLVM_FUZZER_SHMEM_H
+
+#include <algorithm>
+#include <cstring>
+#include <string>
+
+#include "FuzzerDefs.h"
+
+namespace fuzzer {
+
+class SharedMemoryRegion {
+ public:
+ bool Create(const char *Name);
+ bool Open(const char *Name);
+ bool Destroy(const char *Name);
+ uint8_t *GetData() { return Data; }
+  void PostServer() { Post(0); }
+  void WaitServer() { Wait(0); }
+  void PostClient() { Post(1); }
+  void WaitClient() { Wait(1); }
+
+ size_t WriteByteArray(const uint8_t *Bytes, size_t N) {
+ assert(N <= kShmemSize - sizeof(N));
+ memcpy(GetData(), &N, sizeof(N));
+ memcpy(GetData() + sizeof(N), Bytes, N);
+ assert(N == ReadByteArraySize());
+ return N;
+ }
+ size_t ReadByteArraySize() {
+ size_t Res;
+ memcpy(&Res, GetData(), sizeof(Res));
+ return Res;
+ }
+ uint8_t *GetByteArray() { return GetData() + sizeof(size_t); }
+
+ bool IsServer() const { return Data && IAmServer; }
+ bool IsClient() const { return Data && !IAmServer; }
+
+private:
+
+ static const size_t kShmemSize = 1 << 22;
+ bool IAmServer;
+ std::string Path(const char *Name);
+ std::string SemName(const char *Name, int Idx);
+ void Post(int Idx);
+ void Wait(int Idx);
+
+ bool Map(int fd);
+ uint8_t *Data = nullptr;
+ void *Semaphore[2];
+};
+
+extern SharedMemoryRegion SMR;
+
+} // namespace fuzzer
+
+#endif // LLVM_FUZZER_SHMEM_H
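The header above encodes one size-prefixed byte array plus two named semaphores for a ping-pong handshake. A hypothetical driver showing how the pieces fit together (the function names are illustrative, not part of the patch):

#include <cstdint>
#include <cstring>
#include "FuzzerShmem.h"

// Server side: publish a unit, wake the client, wait for its ack.
static void ServerSendUnit(fuzzer::SharedMemoryRegion &SMR,
                           const uint8_t *Bytes, size_t N) {
  SMR.WriteByteArray(Bytes, N); // Store N, then the bytes, into the region.
  SMR.PostClient();             // Post semaphore 1: data is ready.
  SMR.WaitServer();             // Block on semaphore 0 until acknowledged.
}

// Client side: wait for data, copy it out, acknowledge.
static size_t ClientReceiveUnit(fuzzer::SharedMemoryRegion &SMR, uint8_t *Out) {
  SMR.WaitClient();             // Block on semaphore 1 until data is ready.
  size_t N = SMR.ReadByteArraySize();
  memcpy(Out, SMR.GetByteArray(), N);
  SMR.PostServer();             // Post semaphore 0: region may be reused.
  return N;
}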
diff --git a/lib/Fuzzer/FuzzerShmemPosix.cpp b/lib/Fuzzer/FuzzerShmemPosix.cpp
new file mode 100644
index 000000000000..2723bdd86f48
--- /dev/null
+++ b/lib/Fuzzer/FuzzerShmemPosix.cpp
@@ -0,0 +1,103 @@
+//===- FuzzerShmemPosix.cpp - Posix shared memory ---------------*- C++ -* ===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+// SharedMemoryRegion
+//===----------------------------------------------------------------------===//
+#include "FuzzerDefs.h"
+#if LIBFUZZER_POSIX
+
+#include "FuzzerIO.h"
+#include "FuzzerShmem.h"
+
+#include <sys/types.h>
+#include <sys/stat.h>
+#include <errno.h>
+#include <fcntl.h>
+#include <sys/mman.h>
+#include <semaphore.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <unistd.h>
+
+namespace fuzzer {
+
+std::string SharedMemoryRegion::Path(const char *Name) {
+ return DirPlusFile(TmpDir(), Name);
+}
+
+std::string SharedMemoryRegion::SemName(const char *Name, int Idx) {
+ std::string Res(Name);
+ return Res + (char)('0' + Idx);
+}
+
+bool SharedMemoryRegion::Map(int fd) {
+ Data =
+ (uint8_t *)mmap(0, kShmemSize, PROT_WRITE | PROT_READ, MAP_SHARED, fd, 0);
+ if (Data == (uint8_t*)-1)
+ return false;
+ return true;
+}
+
+bool SharedMemoryRegion::Create(const char *Name) {
+ int fd = open(Path(Name).c_str(), O_CREAT | O_RDWR, 0777);
+ if (fd < 0) return false;
+ if (ftruncate(fd, kShmemSize) < 0) return false;
+ if (!Map(fd))
+ return false;
+ for (int i = 0; i < 2; i++) {
+ sem_unlink(SemName(Name, i).c_str());
+ Semaphore[i] = sem_open(SemName(Name, i).c_str(), O_CREAT, 0644, 0);
+ if (Semaphore[i] == (void *)-1)
+ return false;
+ }
+ IAmServer = true;
+ return true;
+}
+
+bool SharedMemoryRegion::Open(const char *Name) {
+ int fd = open(Path(Name).c_str(), O_RDWR);
+ if (fd < 0) return false;
+ struct stat stat_res;
+ if (0 != fstat(fd, &stat_res))
+ return false;
+ assert(stat_res.st_size == kShmemSize);
+ if (!Map(fd))
+ return false;
+ for (int i = 0; i < 2; i++) {
+ Semaphore[i] = sem_open(SemName(Name, i).c_str(), 0);
+ if (Semaphore[i] == (void *)-1)
+ return false;
+ }
+ IAmServer = false;
+ return true;
+}
+
+bool SharedMemoryRegion::Destroy(const char *Name) {
+ return 0 == unlink(Path(Name).c_str());
+}
+
+void SharedMemoryRegion::Post(int Idx) {
+ assert(Idx == 0 || Idx == 1);
+ sem_post((sem_t*)Semaphore[Idx]);
+}
+
+void SharedMemoryRegion::Wait(int Idx) {
+ assert(Idx == 0 || Idx == 1);
+ for (int i = 0; i < 10 && sem_wait((sem_t*)Semaphore[Idx]); i++) {
+ // sem_wait may fail if interrupted by a signal.
+ sleep(i);
+ if (i)
+ Printf("%s: sem_wait[%d] failed %s\n", i < 9 ? "WARNING" : "ERROR", i,
+ strerror(errno));
+ if (i == 9) abort();
+ }
+}
+
+} // namespace fuzzer
+
+#endif // LIBFUZZER_POSIX
diff --git a/lib/Fuzzer/FuzzerShmemWindows.cpp b/lib/Fuzzer/FuzzerShmemWindows.cpp
new file mode 100644
index 000000000000..6325b4b8e5b4
--- /dev/null
+++ b/lib/Fuzzer/FuzzerShmemWindows.cpp
@@ -0,0 +1,64 @@
+//===- FuzzerShmemWindows.cpp - Windows shared memory -----------*- C++ -* ===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+// SharedMemoryRegion
+//===----------------------------------------------------------------------===//
+#include "FuzzerDefs.h"
+#if LIBFUZZER_WINDOWS
+
+#include "FuzzerIO.h"
+#include "FuzzerShmem.h"
+
+#include <sys/types.h>
+#include <sys/stat.h>
+#include <fcntl.h>
+#include <stdio.h>
+
+namespace fuzzer {
+
+std::string SharedMemoryRegion::Path(const char *Name) {
+ return DirPlusFile(TmpDir(), Name);
+}
+
+std::string SharedMemoryRegion::SemName(const char *Name, int Idx) {
+ std::string Res(Name);
+ return Res + (char)('0' + Idx);
+}
+
+bool SharedMemoryRegion::Map(int fd) {
+ assert(0 && "UNIMPLEMENTED");
+ return false;
+}
+
+bool SharedMemoryRegion::Create(const char *Name) {
+ assert(0 && "UNIMPLEMENTED");
+ return false;
+}
+
+bool SharedMemoryRegion::Open(const char *Name) {
+ assert(0 && "UNIMPLEMENTED");
+ return false;
+}
+
+bool SharedMemoryRegion::Destroy(const char *Name) {
+ assert(0 && "UNIMPLEMENTED");
+ return false;
+}
+
+void SharedMemoryRegion::Post(int Idx) {
+ assert(0 && "UNIMPLEMENTED");
+}
+
+void SharedMemoryRegion::Wait(int Idx) {
+ Semaphore[1] = nullptr;
+ assert(0 && "UNIMPLEMENTED");
+}
+
+} // namespace fuzzer
+
+#endif // LIBFUZZER_WINDOWS
diff --git a/lib/Fuzzer/FuzzerTracePC.cpp b/lib/Fuzzer/FuzzerTracePC.cpp
index 39d6e6026210..ce0f7a47eee6 100644
--- a/lib/Fuzzer/FuzzerTracePC.cpp
+++ b/lib/Fuzzer/FuzzerTracePC.cpp
@@ -18,27 +18,37 @@
#include "FuzzerExtFunctions.h"
#include "FuzzerIO.h"
#include "FuzzerTracePC.h"
+#include "FuzzerUtil.h"
#include "FuzzerValueBitMap.h"
#include <map>
-#include <sanitizer/coverage_interface.h>
#include <set>
#include <sstream>
+// The coverage counters and PCs.
+// These are declared as global variables named "__sancov_*" to simplify
+// experiments with inlined instrumentation.
+alignas(64) ATTRIBUTE_INTERFACE
+uint8_t __sancov_trace_pc_guard_8bit_counters[fuzzer::TracePC::kNumPCs];
+
+ATTRIBUTE_INTERFACE
+uintptr_t __sancov_trace_pc_pcs[fuzzer::TracePC::kNumPCs];
+
namespace fuzzer {
TracePC TPC;
-void TracePC::HandleTrace(uint32_t *Guard, uintptr_t PC) {
- uint32_t Idx = *Guard;
- if (!Idx) return;
- PCs[Idx % kNumPCs] = PC;
- Counters[Idx % kNumCounters]++;
+uint8_t *TracePC::Counters() const {
+ return __sancov_trace_pc_guard_8bit_counters;
+}
+
+uintptr_t *TracePC::PCs() const {
+ return __sancov_trace_pc_pcs;
}
size_t TracePC::GetTotalPCCoverage() {
size_t Res = 0;
- for (size_t i = 1; i < GetNumPCs(); i++)
- if (PCs[i])
+ for (size_t i = 1, N = GetNumPCs(); i < N; i++)
+ if (PCs()[i])
Res++;
return Res;
}
@@ -46,8 +56,16 @@ size_t TracePC::GetTotalPCCoverage() {
void TracePC::HandleInit(uint32_t *Start, uint32_t *Stop) {
if (Start == Stop || *Start) return;
assert(NumModules < sizeof(Modules) / sizeof(Modules[0]));
- for (uint32_t *P = Start; P < Stop; P++)
- *P = ++NumGuards;
+ for (uint32_t *P = Start; P < Stop; P++) {
+ NumGuards++;
+ if (NumGuards == kNumPCs) {
+ RawPrint(
+ "WARNING: The binary has too many instrumented PCs.\n"
+ " You may want to reduce the size of the binary\n"
+ " for more efficient fuzzing and precise coverage data\n");
+ }
+ *P = NumGuards % kNumPCs;
+ }
Modules[NumModules].Start = Start;
Modules[NumModules].Stop = Stop;
NumModules++;
@@ -60,23 +78,12 @@ void TracePC::PrintModuleInfo() {
Printf("\n");
}
+ATTRIBUTE_NO_SANITIZE_ALL
void TracePC::HandleCallerCallee(uintptr_t Caller, uintptr_t Callee) {
const uintptr_t kBits = 12;
const uintptr_t kMask = (1 << kBits) - 1;
uintptr_t Idx = (Caller & kMask) | ((Callee & kMask) << kBits);
- HandleValueProfile(Idx);
-}
-
-static bool IsInterestingCoverageFile(std::string &File) {
- if (File.find("compiler-rt/lib/") != std::string::npos)
- return false; // sanitizer internal.
- if (File.find("/usr/lib/") != std::string::npos)
- return false;
- if (File.find("/usr/include/") != std::string::npos)
- return false;
- if (File == "<null>")
- return false;
- return true;
+ ValueProfileMap.AddValueModPrime(Idx);
}
void TracePC::InitializePrintNewPCs() {
@@ -84,16 +91,16 @@ void TracePC::InitializePrintNewPCs() {
assert(!PrintedPCs);
PrintedPCs = new std::set<uintptr_t>;
for (size_t i = 1; i < GetNumPCs(); i++)
- if (PCs[i])
- PrintedPCs->insert(PCs[i]);
+ if (PCs()[i])
+ PrintedPCs->insert(PCs()[i]);
}
void TracePC::PrintNewPCs() {
if (!DoPrintNewPCs) return;
assert(PrintedPCs);
for (size_t i = 1; i < GetNumPCs(); i++)
- if (PCs[i] && PrintedPCs->insert(PCs[i]).second)
- PrintPC("\tNEW_PC: %p %F %L\n", "\tNEW_PC: %p\n", PCs[i]);
+ if (PCs()[i] && PrintedPCs->insert(PCs()[i]).second)
+ PrintPC("\tNEW_PC: %p %F %L\n", "\tNEW_PC: %p\n", PCs()[i]);
}
void TracePC::PrintCoverage() {
@@ -110,20 +117,21 @@ void TracePC::PrintCoverage() {
CoveredLines;
Printf("COVERAGE:\n");
for (size_t i = 1; i < GetNumPCs(); i++) {
- if (!PCs[i]) continue;
- std::string FileStr = DescribePC("%s", PCs[i]);
+ uintptr_t PC = PCs()[i];
+ if (!PC) continue;
+ std::string FileStr = DescribePC("%s", PC);
if (!IsInterestingCoverageFile(FileStr)) continue;
- std::string FixedPCStr = DescribePC("%p", PCs[i]);
- std::string FunctionStr = DescribePC("%F", PCs[i]);
- std::string LineStr = DescribePC("%l", PCs[i]);
+ std::string FixedPCStr = DescribePC("%p", PC);
+ std::string FunctionStr = DescribePC("%F", PC);
+ std::string LineStr = DescribePC("%l", PC);
char ModulePathRaw[4096] = ""; // What's PATH_MAX in portable C++?
void *OffsetRaw = nullptr;
if (!EF->__sanitizer_get_module_and_offset_for_pc(
- reinterpret_cast<void *>(PCs[i]), ModulePathRaw,
+ reinterpret_cast<void *>(PC), ModulePathRaw,
sizeof(ModulePathRaw), &OffsetRaw))
continue;
std::string Module = ModulePathRaw;
- uintptr_t FixedPC = std::stol(FixedPCStr, 0, 16);
+ uintptr_t FixedPC = std::stoull(FixedPCStr, 0, 16);
uintptr_t PcOffset = reinterpret_cast<uintptr_t>(OffsetRaw);
ModuleOffsets[Module] = FixedPC - PcOffset;
CoveredPCsPerModule[Module].push_back(PcOffset);
@@ -154,8 +162,8 @@ void TracePC::PrintCoverage() {
Printf("MODULE_WITH_COVERAGE: %s\n", ModuleName.c_str());
// sancov does not yet fully support DSOs.
// std::string Cmd = "sancov -print-coverage-pcs " + ModuleName;
- std::string Cmd = "objdump -d " + ModuleName +
- " | grep 'call.*__sanitizer_cov_trace_pc_guard' | awk -F: '{print $1}'";
+ std::string Cmd = DisassembleCmd(ModuleName) + " | " +
+ SearchRegexCmd("call.*__sanitizer_cov_trace_pc_guard");
std::string SanCovOutput;
if (!ExecuteCommandAndReadOutput(Cmd, &SanCovOutput)) {
Printf("INFO: Command failed: %s\n", Cmd.c_str());
@@ -164,7 +172,11 @@ void TracePC::PrintCoverage() {
std::istringstream ISS(SanCovOutput);
std::string S;
while (std::getline(ISS, S, '\n')) {
- uintptr_t PcOffset = std::stol(S, 0, 16);
+ size_t PcOffsetEnd = S.find(':');
+ if (PcOffsetEnd == std::string::npos)
+ continue;
+ S.resize(PcOffsetEnd);
+ uintptr_t PcOffset = std::stoull(S, 0, 16);
if (!std::binary_search(CoveredOffsets.begin(), CoveredOffsets.end(),
PcOffset)) {
uintptr_t PC = ModuleOffset + PcOffset;
@@ -196,8 +208,19 @@ void TracePC::PrintCoverage() {
}
}
+inline ALWAYS_INLINE uintptr_t GetPreviousInstructionPc(uintptr_t PC) {
+  // TODO: this implementation is x86-only.
+  // See sanitizer_common's GetPreviousInstructionPc for a full implementation.
+ return PC - 1;
+}
+
void TracePC::DumpCoverage() {
- __sanitizer_dump_coverage(PCs, GetNumPCs());
+ if (EF->__sanitizer_dump_coverage) {
+ std::vector<uintptr_t> PCsCopy(GetNumPCs());
+ for (size_t i = 0; i < GetNumPCs(); i++)
+ PCsCopy[i] = PCs()[i] ? GetPreviousInstructionPc(PCs()[i]) : 0;
+ EF->__sanitizer_dump_coverage(PCsCopy.data(), PCsCopy.size());
+ }
}
// Value profile.
@@ -210,97 +233,118 @@ void TracePC::DumpCoverage() {
// For cmp instructions the interesting value is a XOR of the parameters.
// The interesting value is mixed up with the PC and is then added to the map.
-ATTRIBUTE_NO_SANITIZE_MEMORY
+ATTRIBUTE_NO_SANITIZE_ALL
void TracePC::AddValueForMemcmp(void *caller_pc, const void *s1, const void *s2,
- size_t n) {
+ size_t n, bool StopAtZero) {
if (!n) return;
- size_t Len = std::min(n, (size_t)32);
- const uint8_t *A1 = reinterpret_cast<const uint8_t *>(s1);
- const uint8_t *A2 = reinterpret_cast<const uint8_t *>(s2);
- size_t I = 0;
- for (; I < Len; I++)
- if (A1[I] != A2[I])
- break;
- size_t PC = reinterpret_cast<size_t>(caller_pc);
- size_t Idx = I;
- // if (I < Len)
- // Idx += __builtin_popcountl((A1[I] ^ A2[I])) - 1;
- TPC.HandleValueProfile((PC & 4095) | (Idx << 12));
-}
-
-ATTRIBUTE_NO_SANITIZE_MEMORY
-void TracePC::AddValueForStrcmp(void *caller_pc, const char *s1, const char *s2,
- size_t n) {
- if (!n) return;
- size_t Len = std::min(n, (size_t)32);
+ size_t Len = std::min(n, Word::GetMaxSize());
const uint8_t *A1 = reinterpret_cast<const uint8_t *>(s1);
const uint8_t *A2 = reinterpret_cast<const uint8_t *>(s2);
+ uint8_t B1[Word::kMaxSize];
+ uint8_t B2[Word::kMaxSize];
+ // Copy the data into locals in this non-msan-instrumented function
+ // to avoid msan complaining further.
+ size_t Hash = 0; // Compute some simple hash of both strings.
+ for (size_t i = 0; i < Len; i++) {
+ B1[i] = A1[i];
+ B2[i] = A2[i];
+ size_t T = B1[i];
+ Hash ^= (T << 8) | B2[i];
+ }
size_t I = 0;
for (; I < Len; I++)
- if (A1[I] != A2[I] || A1[I] == 0)
+ if (B1[I] != B2[I] || (StopAtZero && B1[I] == 0))
break;
size_t PC = reinterpret_cast<size_t>(caller_pc);
- size_t Idx = I;
- // if (I < Len && A1[I])
- // Idx += __builtin_popcountl((A1[I] ^ A2[I])) - 1;
- TPC.HandleValueProfile((PC & 4095) | (Idx << 12));
+ size_t Idx = (PC & 4095) | (I << 12);
+ ValueProfileMap.AddValue(Idx);
+ TORCW.Insert(Idx ^ Hash, Word(B1, Len), Word(B2, Len));
}
template <class T>
-ATTRIBUTE_TARGET_POPCNT
-#ifdef __clang__ // g++ can't handle this __attribute__ here :(
-__attribute__((always_inline))
-#endif // __clang__
-void TracePC::HandleCmp(void *PC, T Arg1, T Arg2) {
- uintptr_t PCuint = reinterpret_cast<uintptr_t>(PC);
+ATTRIBUTE_TARGET_POPCNT ALWAYS_INLINE
+ATTRIBUTE_NO_SANITIZE_ALL
+void TracePC::HandleCmp(uintptr_t PC, T Arg1, T Arg2) {
uint64_t ArgXor = Arg1 ^ Arg2;
- uint64_t ArgDistance = __builtin_popcountl(ArgXor) + 1; // [1,65]
- uintptr_t Idx = ((PCuint & 4095) + 1) * ArgDistance;
+ uint64_t ArgDistance = __builtin_popcountll(ArgXor) + 1; // [1,65]
+ uintptr_t Idx = ((PC & 4095) + 1) * ArgDistance;
if (sizeof(T) == 4)
TORC4.Insert(ArgXor, Arg1, Arg2);
else if (sizeof(T) == 8)
TORC8.Insert(ArgXor, Arg1, Arg2);
- HandleValueProfile(Idx);
+ ValueProfileMap.AddValue(Idx);
}
} // namespace fuzzer
extern "C" {
-__attribute__((visibility("default")))
+ATTRIBUTE_INTERFACE
+ATTRIBUTE_NO_SANITIZE_ALL
void __sanitizer_cov_trace_pc_guard(uint32_t *Guard) {
- uintptr_t PC = (uintptr_t)__builtin_return_address(0);
- fuzzer::TPC.HandleTrace(Guard, PC);
+ uintptr_t PC = reinterpret_cast<uintptr_t>(__builtin_return_address(0));
+ uint32_t Idx = *Guard;
+ __sancov_trace_pc_pcs[Idx] = PC;
+ __sancov_trace_pc_guard_8bit_counters[Idx]++;
+}
+
+// Best-effort support for -fsanitize-coverage=trace-pc, which is available
+// in both Clang and GCC.
+ATTRIBUTE_INTERFACE
+ATTRIBUTE_NO_SANITIZE_ALL
+void __sanitizer_cov_trace_pc() {
+ uintptr_t PC = reinterpret_cast<uintptr_t>(__builtin_return_address(0));
+ uintptr_t Idx = PC & (((uintptr_t)1 << fuzzer::TracePC::kTracePcBits) - 1);
+ __sancov_trace_pc_pcs[Idx] = PC;
+ __sancov_trace_pc_guard_8bit_counters[Idx]++;
}
-__attribute__((visibility("default")))
+ATTRIBUTE_INTERFACE
void __sanitizer_cov_trace_pc_guard_init(uint32_t *Start, uint32_t *Stop) {
fuzzer::TPC.HandleInit(Start, Stop);
}
-__attribute__((visibility("default")))
+ATTRIBUTE_INTERFACE
+ATTRIBUTE_NO_SANITIZE_ALL
void __sanitizer_cov_trace_pc_indir(uintptr_t Callee) {
- uintptr_t PC = (uintptr_t)__builtin_return_address(0);
+ uintptr_t PC = reinterpret_cast<uintptr_t>(__builtin_return_address(0));
fuzzer::TPC.HandleCallerCallee(PC, Callee);
}
-__attribute__((visibility("default")))
+ATTRIBUTE_INTERFACE
+ATTRIBUTE_NO_SANITIZE_ALL
+ATTRIBUTE_TARGET_POPCNT
void __sanitizer_cov_trace_cmp8(uint64_t Arg1, uint64_t Arg2) {
- fuzzer::TPC.HandleCmp(__builtin_return_address(0), Arg1, Arg2);
+ uintptr_t PC = reinterpret_cast<uintptr_t>(__builtin_return_address(0));
+ fuzzer::TPC.HandleCmp(PC, Arg1, Arg2);
}
-__attribute__((visibility("default")))
+
+ATTRIBUTE_INTERFACE
+ATTRIBUTE_NO_SANITIZE_ALL
+ATTRIBUTE_TARGET_POPCNT
void __sanitizer_cov_trace_cmp4(uint32_t Arg1, uint32_t Arg2) {
- fuzzer::TPC.HandleCmp(__builtin_return_address(0), Arg1, Arg2);
+ uintptr_t PC = reinterpret_cast<uintptr_t>(__builtin_return_address(0));
+ fuzzer::TPC.HandleCmp(PC, Arg1, Arg2);
}
-__attribute__((visibility("default")))
+
+ATTRIBUTE_INTERFACE
+ATTRIBUTE_NO_SANITIZE_ALL
+ATTRIBUTE_TARGET_POPCNT
void __sanitizer_cov_trace_cmp2(uint16_t Arg1, uint16_t Arg2) {
- fuzzer::TPC.HandleCmp(__builtin_return_address(0), Arg1, Arg2);
+ uintptr_t PC = reinterpret_cast<uintptr_t>(__builtin_return_address(0));
+ fuzzer::TPC.HandleCmp(PC, Arg1, Arg2);
}
-__attribute__((visibility("default")))
+
+ATTRIBUTE_INTERFACE
+ATTRIBUTE_NO_SANITIZE_ALL
+ATTRIBUTE_TARGET_POPCNT
void __sanitizer_cov_trace_cmp1(uint8_t Arg1, uint8_t Arg2) {
- fuzzer::TPC.HandleCmp(__builtin_return_address(0), Arg1, Arg2);
+ uintptr_t PC = reinterpret_cast<uintptr_t>(__builtin_return_address(0));
+ fuzzer::TPC.HandleCmp(PC, Arg1, Arg2);
}
-__attribute__((visibility("default")))
+ATTRIBUTE_INTERFACE
+ATTRIBUTE_NO_SANITIZE_ALL
+ATTRIBUTE_TARGET_POPCNT
void __sanitizer_cov_trace_switch(uint64_t Val, uint64_t *Cases) {
uint64_t N = Cases[0];
uint64_t ValSizeInBits = Cases[1];
@@ -308,7 +352,7 @@ void __sanitizer_cov_trace_switch(uint64_t Val, uint64_t *Cases) {
// Skip the most common and the most boring case.
if (Vals[N - 1] < 256 && Val < 256)
return;
- char *PC = (char*)__builtin_return_address(0);
+ uintptr_t PC = reinterpret_cast<uintptr_t>(__builtin_return_address(0));
size_t i;
uint64_t Token = 0;
for (i = 0; i < N; i++) {
@@ -325,17 +369,27 @@ void __sanitizer_cov_trace_switch(uint64_t Val, uint64_t *Cases) {
fuzzer::TPC.HandleCmp(PC + i, Token, (uint64_t)(0));
}
-__attribute__((visibility("default")))
+ATTRIBUTE_INTERFACE
+ATTRIBUTE_NO_SANITIZE_ALL
+ATTRIBUTE_TARGET_POPCNT
void __sanitizer_cov_trace_div4(uint32_t Val) {
- fuzzer::TPC.HandleCmp(__builtin_return_address(0), Val, (uint32_t)0);
+ uintptr_t PC = reinterpret_cast<uintptr_t>(__builtin_return_address(0));
+ fuzzer::TPC.HandleCmp(PC, Val, (uint32_t)0);
}
-__attribute__((visibility("default")))
+
+ATTRIBUTE_INTERFACE
+ATTRIBUTE_NO_SANITIZE_ALL
+ATTRIBUTE_TARGET_POPCNT
void __sanitizer_cov_trace_div8(uint64_t Val) {
- fuzzer::TPC.HandleCmp(__builtin_return_address(0), Val, (uint64_t)0);
+ uintptr_t PC = reinterpret_cast<uintptr_t>(__builtin_return_address(0));
+ fuzzer::TPC.HandleCmp(PC, Val, (uint64_t)0);
}
-__attribute__((visibility("default")))
+
+ATTRIBUTE_INTERFACE
+ATTRIBUTE_NO_SANITIZE_ALL
+ATTRIBUTE_TARGET_POPCNT
void __sanitizer_cov_trace_gep(uintptr_t Idx) {
- fuzzer::TPC.HandleCmp(__builtin_return_address(0), Idx, (uintptr_t)0);
+ uintptr_t PC = reinterpret_cast<uintptr_t>(__builtin_return_address(0));
+ fuzzer::TPC.HandleCmp(PC, Idx, (uintptr_t)0);
}
-
} // extern "C"
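A worked instance of the value-profile index computed by HandleCmp above (the constants match the patch; the inputs are illustrative):

#include <cassert>
#include <cstdint>

int main() {
  uint64_t Arg1 = 0x1234, Arg2 = 0x1235; // Comparison one bit away from equal.
  uint64_t ArgXor = Arg1 ^ Arg2;         // == 1
  // The popcount of the XOR counts the bits that still differ; +1 keeps the
  // distance in [1,65] so the multiplier never collapses to zero.
  uint64_t ArgDistance = __builtin_popcountll(ArgXor) + 1;
  uintptr_t PC = 0x4005d0;               // Hypothetical caller PC.
  uintptr_t Idx = ((PC & 4095) + 1) * ArgDistance;
  assert(ArgDistance == 2);
  assert(Idx == (0x5d0 + 1) * 2);        // Nearby arguments -> small indices.
  return 0;
}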
diff --git a/lib/Fuzzer/FuzzerTracePC.h b/lib/Fuzzer/FuzzerTracePC.h
index b6b26b6c9af8..6523fa06005c 100644
--- a/lib/Fuzzer/FuzzerTracePC.h
+++ b/lib/Fuzzer/FuzzerTracePC.h
@@ -13,7 +13,9 @@
#define LLVM_FUZZER_TRACE_PC
#include "FuzzerDefs.h"
+#include "FuzzerDictionary.h"
#include "FuzzerValueBitMap.h"
+
#include <set>
namespace fuzzer {
@@ -31,7 +33,8 @@ struct TableOfRecentCompares {
struct Pair {
T A, B;
};
- void Insert(size_t Idx, T Arg1, T Arg2) {
+ ATTRIBUTE_NO_SANITIZE_ALL
+ void Insert(size_t Idx, const T &Arg1, const T &Arg2) {
Idx = Idx % kSize;
Table[Idx].A = Arg1;
Table[Idx].B = Arg2;
@@ -44,25 +47,23 @@ struct TableOfRecentCompares {
class TracePC {
public:
- static const size_t kFeatureSetSize = ValueBitMap::kNumberOfItems;
+ static const size_t kNumPCs = 1 << 21;
+ // How many bits of PC are used from __sanitizer_cov_trace_pc.
+ static const size_t kTracePcBits = 18;
- void HandleTrace(uint32_t *guard, uintptr_t PC);
void HandleInit(uint32_t *start, uint32_t *stop);
void HandleCallerCallee(uintptr_t Caller, uintptr_t Callee);
- void HandleValueProfile(size_t Value) { ValueProfileMap.AddValue(Value); }
- template <class T> void HandleCmp(void *PC, T Arg1, T Arg2);
+ template <class T> void HandleCmp(uintptr_t PC, T Arg1, T Arg2);
size_t GetTotalPCCoverage();
void SetUseCounters(bool UC) { UseCounters = UC; }
void SetUseValueProfile(bool VP) { UseValueProfile = VP; }
void SetPrintNewPCs(bool P) { DoPrintNewPCs = P; }
- template <class Callback> size_t CollectFeatures(Callback CB);
- bool UpdateValueProfileMap(ValueBitMap *MaxValueProfileMap) {
- return UseValueProfile && MaxValueProfileMap->MergeFrom(ValueProfileMap);
- }
+ template <class Callback> void CollectFeatures(Callback CB) const;
void ResetMaps() {
ValueProfileMap.Reset();
- memset(Counters, 0, sizeof(Counters));
+ memset(Counters(), 0, GetNumPCs());
+ ClearExtraCounters();
}
void UpdateFeatureSet(size_t CurrentElementIdx, size_t CurrentElementSize);
@@ -74,22 +75,20 @@ class TracePC {
void DumpCoverage();
void AddValueForMemcmp(void *caller_pc, const void *s1, const void *s2,
- size_t n);
- void AddValueForStrcmp(void *caller_pc, const char *s1, const char *s2,
- size_t n);
-
- bool UsingTracePcGuard() const {return NumModules; }
+ size_t n, bool StopAtZero);
- static const size_t kTORCSize = 1 << 5;
- TableOfRecentCompares<uint32_t, kTORCSize> TORC4;
- TableOfRecentCompares<uint64_t, kTORCSize> TORC8;
+ TableOfRecentCompares<uint32_t, 32> TORC4;
+ TableOfRecentCompares<uint64_t, 32> TORC8;
+ TableOfRecentCompares<Word, 32> TORCW;
void PrintNewPCs();
void InitializePrintNewPCs();
- size_t GetNumPCs() const { return Min(kNumPCs, NumGuards + 1); }
+ size_t GetNumPCs() const {
+ return NumGuards == 0 ? (1 << kTracePcBits) : Min(kNumPCs, NumGuards + 1);
+ }
uintptr_t GetPC(size_t Idx) {
assert(Idx < GetNumPCs());
- return PCs[Idx];
+ return PCs()[Idx];
}
private:
@@ -105,51 +104,55 @@ private:
size_t NumModules; // linker-initialized.
size_t NumGuards; // linker-initialized.
- static const size_t kNumCounters = 1 << 14;
- alignas(8) uint8_t Counters[kNumCounters];
-
- static const size_t kNumPCs = 1 << 24;
- uintptr_t PCs[kNumPCs];
+ uint8_t *Counters() const;
+ uintptr_t *PCs() const;
std::set<uintptr_t> *PrintedPCs;
ValueBitMap ValueProfileMap;
};
-template <class Callback>
-size_t TracePC::CollectFeatures(Callback CB) {
- if (!UsingTracePcGuard()) return 0;
- size_t Res = 0;
- const size_t Step = 8;
- assert(reinterpret_cast<uintptr_t>(Counters) % Step == 0);
- size_t N = Min(kNumCounters, NumGuards + 1);
- N = (N + Step - 1) & ~(Step - 1); // Round up.
- for (size_t Idx = 0; Idx < N; Idx += Step) {
- uint64_t Bundle = *reinterpret_cast<uint64_t*>(&Counters[Idx]);
- if (!Bundle) continue;
- for (size_t i = Idx; i < Idx + Step; i++) {
- uint8_t Counter = (Bundle >> ((i - Idx) * 8)) & 0xff;
- if (!Counter) continue;
- Counters[i] = 0;
- unsigned Bit = 0;
- /**/ if (Counter >= 128) Bit = 7;
- else if (Counter >= 32) Bit = 6;
- else if (Counter >= 16) Bit = 5;
- else if (Counter >= 8) Bit = 4;
- else if (Counter >= 4) Bit = 3;
- else if (Counter >= 3) Bit = 2;
- else if (Counter >= 2) Bit = 1;
- size_t Feature = (i * 8 + Bit);
- if (CB(Feature))
- Res++;
- }
- }
+template <class Callback> // void Callback(size_t Idx, uint8_t Value);
+ATTRIBUTE_NO_SANITIZE_ALL
+void ForEachNonZeroByte(const uint8_t *Begin, const uint8_t *End,
+ size_t FirstFeature, Callback Handle8bitCounter) {
+ typedef uintptr_t LargeType;
+ const size_t Step = sizeof(LargeType) / sizeof(uint8_t);
+ assert(!(reinterpret_cast<uintptr_t>(Begin) % 64));
+ for (auto P = Begin; P < End; P += Step)
+ if (LargeType Bundle = *reinterpret_cast<const LargeType *>(P))
+ for (size_t I = 0; I < Step; I++, Bundle >>= 8)
+ if (uint8_t V = Bundle & 0xff)
+ Handle8bitCounter(FirstFeature + P - Begin + I, V);
+}
+
+template <class Callback> // bool Callback(size_t Feature)
+ATTRIBUTE_NO_SANITIZE_ALL
+__attribute__((noinline))
+void TracePC::CollectFeatures(Callback HandleFeature) const {
+ uint8_t *Counters = this->Counters();
+ size_t N = GetNumPCs();
+ auto Handle8bitCounter = [&](size_t Idx, uint8_t Counter) {
+ assert(Counter);
+ unsigned Bit = 0;
+ /**/ if (Counter >= 128) Bit = 7;
+ else if (Counter >= 32) Bit = 6;
+ else if (Counter >= 16) Bit = 5;
+ else if (Counter >= 8) Bit = 4;
+ else if (Counter >= 4) Bit = 3;
+ else if (Counter >= 3) Bit = 2;
+ else if (Counter >= 2) Bit = 1;
+ HandleFeature(Idx * 8 + Bit);
+ };
+
+ ForEachNonZeroByte(Counters, Counters + N, 0, Handle8bitCounter);
+ ForEachNonZeroByte(ExtraCountersBegin(), ExtraCountersEnd(), N * 8,
+ Handle8bitCounter);
+
if (UseValueProfile)
ValueProfileMap.ForEach([&](size_t Idx) {
- if (CB(NumGuards * 8 + Idx))
- Res++;
+ HandleFeature(N * 8 + Idx);
});
- return Res;
}
extern TracePC TPC;
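The if/else ladder inside CollectFeatures buckets each 8-bit counter into one of eight feature bits on a roughly logarithmic scale, so hit counts of 1, 3, and 200 on the same edge yield distinct features. A standalone sketch of the mapping (the ladder is copied from above; the checks are illustrative):

#include <cassert>
#include <cstddef>
#include <cstdint>

static unsigned CounterToBit(uint8_t Counter) {
  unsigned Bit = 0;
  /**/ if (Counter >= 128) Bit = 7;
  else if (Counter >= 32)  Bit = 6;
  else if (Counter >= 16)  Bit = 5;
  else if (Counter >= 8)   Bit = 4;
  else if (Counter >= 4)   Bit = 3;
  else if (Counter >= 3)   Bit = 2;
  else if (Counter >= 2)   Bit = 1;
  return Bit;
}

int main() {
  assert(CounterToBit(1) == 0);   // Buckets: 1,2,3,4-7,8-15,16-31,32-127,128+.
  assert(CounterToBit(3) == 2);
  assert(CounterToBit(200) == 7);
  size_t Idx = 42;                // Feature index = Idx * 8 + bucket bit.
  assert(Idx * 8 + CounterToBit(5) == 339);
  return 0;
}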
diff --git a/lib/Fuzzer/FuzzerTraceState.cpp b/lib/Fuzzer/FuzzerTraceState.cpp
index 2ad9702fab0e..a486223d650c 100644
--- a/lib/Fuzzer/FuzzerTraceState.cpp
+++ b/lib/Fuzzer/FuzzerTraceState.cpp
@@ -13,7 +13,6 @@
#include "FuzzerInternal.h"
#include "FuzzerIO.h"
#include "FuzzerMutate.h"
-#include "FuzzerRandom.h"
#include "FuzzerTracePC.h"
#include <algorithm>
#include <cstring>
@@ -23,19 +22,10 @@
namespace fuzzer {
-// For now, very simple: put Size bytes of Data at position Pos.
-struct TraceBasedMutation {
- uint32_t Pos;
- Word W;
-};
-
// Declared as static globals for faster checks inside the hooks.
-static bool RecordingMemcmp = false;
static bool RecordingMemmem = false;
-static bool DoingMyOwnMemmem = false;
-ScopedDoingMyOwnMemmem::ScopedDoingMyOwnMemmem() { DoingMyOwnMemmem = true; }
-ScopedDoingMyOwnMemmem::~ScopedDoingMyOwnMemmem() { DoingMyOwnMemmem = false; }
+int ScopedDoingMyOwnMemOrStr::DoingMyOwnMemOrStr;
class TraceState {
public:
@@ -43,64 +33,21 @@ public:
const Fuzzer *F)
: MD(MD), Options(Options), F(F) {}
- void TraceMemcmpCallback(size_t CmpSize, const uint8_t *Data1,
- const uint8_t *Data2);
-
- int TryToAddDesiredData(const uint8_t *PresentData,
- const uint8_t *DesiredData, size_t DataSize);
-
void StartTraceRecording() {
- if (!Options.UseMemcmp)
+ if (!Options.UseMemmem)
return;
- RecordingMemcmp = Options.UseMemcmp;
- RecordingMemmem = Options.UseMemmem;
- NumMutations = 0;
+ RecordingMemmem = true;
InterestingWords.clear();
MD.ClearAutoDictionary();
}
void StopTraceRecording() {
- if (!RecordingMemcmp)
+ if (!RecordingMemmem)
return;
- RecordingMemcmp = false;
- for (size_t i = 0; i < NumMutations; i++) {
- auto &M = Mutations[i];
- if (Options.Verbosity >= 2) {
- AutoDictUnitCounts[M.W]++;
- AutoDictAdds++;
- if ((AutoDictAdds & (AutoDictAdds - 1)) == 0) {
- typedef std::pair<size_t, Word> CU;
- std::vector<CU> CountedUnits;
- for (auto &I : AutoDictUnitCounts)
- CountedUnits.push_back(std::make_pair(I.second, I.first));
- std::sort(CountedUnits.begin(), CountedUnits.end(),
- [](const CU &a, const CU &b) { return a.first > b.first; });
- Printf("AutoDict:\n");
- for (auto &I : CountedUnits) {
- Printf(" %zd ", I.first);
- PrintASCII(I.second.data(), I.second.size());
- Printf("\n");
- }
- }
- }
- MD.AddWordToAutoDictionary({M.W, M.Pos});
- }
for (auto &W : InterestingWords)
MD.AddWordToAutoDictionary({W});
}
- void AddMutation(uint32_t Pos, uint32_t Size, const uint8_t *Data) {
- if (NumMutations >= kMaxMutations) return;
- auto &M = Mutations[NumMutations++];
- M.Pos = Pos;
- M.W.Set(Data, Size);
- }
-
- void AddMutation(uint32_t Pos, uint32_t Size, uint64_t Data) {
- assert(Size <= sizeof(Data));
- AddMutation(Pos, Size, reinterpret_cast<uint8_t*>(&Data));
- }
-
void AddInterestingWord(const uint8_t *Data, size_t Size) {
if (!RecordingMemmem || !F->InFuzzingThread()) return;
if (Size <= 1) return;
@@ -110,75 +57,14 @@ public:
}
private:
- bool IsTwoByteData(uint64_t Data) {
- int64_t Signed = static_cast<int64_t>(Data);
- Signed >>= 16;
- return Signed == 0 || Signed == -1L;
- }
-
- // We don't want to create too many trace-based mutations as it is both
- // expensive and useless. So after some number of mutations is collected,
- // start rejecting some of them. The more there are mutations the more we
- // reject.
- bool WantToHandleOneMoreMutation() {
- const size_t FirstN = 64;
- // Gladly handle first N mutations.
- if (NumMutations <= FirstN) return true;
- size_t Diff = NumMutations - FirstN;
- size_t DiffLog = sizeof(long) * 8 - __builtin_clzl((long)Diff);
- assert(DiffLog > 0 && DiffLog < 64);
- bool WantThisOne = MD.GetRand()(1 << DiffLog) == 0; // 1 out of DiffLog.
- return WantThisOne;
- }
- static const size_t kMaxMutations = 1 << 16;
- size_t NumMutations;
- TraceBasedMutation Mutations[kMaxMutations];
// TODO: std::set is too inefficient, need to have a custom DS here.
std::set<Word> InterestingWords;
MutationDispatcher &MD;
const FuzzingOptions Options;
const Fuzzer *F;
- std::map<Word, size_t> AutoDictUnitCounts;
- size_t AutoDictAdds = 0;
};
-int TraceState::TryToAddDesiredData(const uint8_t *PresentData,
- const uint8_t *DesiredData,
- size_t DataSize) {
- if (NumMutations >= kMaxMutations || !WantToHandleOneMoreMutation()) return 0;
- ScopedDoingMyOwnMemmem scoped_doing_my_own_memmem;
- const uint8_t *UnitData;
- auto UnitSize = F->GetCurrentUnitInFuzzingThead(&UnitData);
- int Res = 0;
- const uint8_t *Beg = UnitData;
- const uint8_t *End = Beg + UnitSize;
- for (const uint8_t *Cur = Beg; Cur < End; Cur++) {
- Cur = (uint8_t *)SearchMemory(Cur, End - Cur, PresentData, DataSize);
- if (!Cur)
- break;
- size_t Pos = Cur - Beg;
- assert(Pos < UnitSize);
- AddMutation(Pos, DataSize, DesiredData);
- Res++;
- }
- return Res;
-}
-
-void TraceState::TraceMemcmpCallback(size_t CmpSize, const uint8_t *Data1,
- const uint8_t *Data2) {
- if (!RecordingMemcmp || !F->InFuzzingThread()) return;
- CmpSize = std::min(CmpSize, Word::GetMaxSize());
- int Added2 = TryToAddDesiredData(Data1, Data2, CmpSize);
- int Added1 = TryToAddDesiredData(Data2, Data1, CmpSize);
- if ((Added1 || Added2) && Options.Verbosity >= 3) {
- Printf("MemCmp Added %d%d: ", Added1, Added2);
- if (Added1) PrintASCII(Data1, CmpSize);
- if (Added2) PrintASCII(Data2, CmpSize);
- Printf("\n");
- }
-}
-
static TraceState *TS;
void Fuzzer::StartTraceRecording() {
@@ -192,7 +78,7 @@ void Fuzzer::StopTraceRecording() {
}
void Fuzzer::InitializeTraceState() {
- if (!Options.UseMemcmp) return;
+ if (!Options.UseMemmem) return;
TS = new TraceState(MD, Options, this);
}
@@ -202,10 +88,17 @@ static size_t InternalStrnlen(const char *S, size_t MaxLen) {
return Len;
}
+// Finds min of (strlen(S1), strlen(S2)).
+// Needed bacause one of these strings may actually be non-zero terminated.
+static size_t InternalStrnlen2(const char *S1, const char *S2) {
+ size_t Len = 0;
+ for (; S1[Len] && S2[Len]; Len++) {}
+ return Len;
+}
+
} // namespace fuzzer
using fuzzer::TS;
-using fuzzer::RecordingMemcmp;
extern "C" {
@@ -215,62 +108,72 @@ extern "C" {
#endif
#if LLVM_FUZZER_DEFINES_SANITIZER_WEAK_HOOOKS
+
+ATTRIBUTE_INTERFACE ATTRIBUTE_NO_SANITIZE_MEMORY
void __sanitizer_weak_hook_memcmp(void *caller_pc, const void *s1,
const void *s2, size_t n, int result) {
- fuzzer::TPC.AddValueForMemcmp(caller_pc, s1, s2, n);
- if (!RecordingMemcmp) return;
+ if (fuzzer::ScopedDoingMyOwnMemOrStr::DoingMyOwnMemOrStr) return;
if (result == 0) return; // No reason to mutate.
if (n <= 1) return; // Not interesting.
- TS->TraceMemcmpCallback(n, reinterpret_cast<const uint8_t *>(s1),
- reinterpret_cast<const uint8_t *>(s2));
+ fuzzer::TPC.AddValueForMemcmp(caller_pc, s1, s2, n, /*StopAtZero*/false);
}
+ATTRIBUTE_INTERFACE ATTRIBUTE_NO_SANITIZE_MEMORY
void __sanitizer_weak_hook_strncmp(void *caller_pc, const char *s1,
const char *s2, size_t n, int result) {
- fuzzer::TPC.AddValueForStrcmp(caller_pc, s1, s2, n);
- if (!RecordingMemcmp) return;
+ if (fuzzer::ScopedDoingMyOwnMemOrStr::DoingMyOwnMemOrStr) return;
if (result == 0) return; // No reason to mutate.
size_t Len1 = fuzzer::InternalStrnlen(s1, n);
size_t Len2 = fuzzer::InternalStrnlen(s2, n);
n = std::min(n, Len1);
n = std::min(n, Len2);
if (n <= 1) return; // Not interesting.
- TS->TraceMemcmpCallback(n, reinterpret_cast<const uint8_t *>(s1),
- reinterpret_cast<const uint8_t *>(s2));
+ fuzzer::TPC.AddValueForMemcmp(caller_pc, s1, s2, n, /*StopAtZero*/true);
}
+
+ATTRIBUTE_INTERFACE ATTRIBUTE_NO_SANITIZE_MEMORY
void __sanitizer_weak_hook_strcmp(void *caller_pc, const char *s1,
const char *s2, int result) {
- fuzzer::TPC.AddValueForStrcmp(caller_pc, s1, s2, 64);
- if (!RecordingMemcmp) return;
+ if (fuzzer::ScopedDoingMyOwnMemOrStr::DoingMyOwnMemOrStr) return;
if (result == 0) return; // No reason to mutate.
- size_t Len1 = strlen(s1);
- size_t Len2 = strlen(s2);
- size_t N = std::min(Len1, Len2);
+ size_t N = fuzzer::InternalStrnlen2(s1, s2);
if (N <= 1) return; // Not interesting.
- TS->TraceMemcmpCallback(N, reinterpret_cast<const uint8_t *>(s1),
- reinterpret_cast<const uint8_t *>(s2));
+ fuzzer::TPC.AddValueForMemcmp(caller_pc, s1, s2, N, /*StopAtZero*/true);
}
+ATTRIBUTE_INTERFACE ATTRIBUTE_NO_SANITIZE_MEMORY
void __sanitizer_weak_hook_strncasecmp(void *called_pc, const char *s1,
const char *s2, size_t n, int result) {
+ if (fuzzer::ScopedDoingMyOwnMemOrStr::DoingMyOwnMemOrStr) return;
return __sanitizer_weak_hook_strncmp(called_pc, s1, s2, n, result);
}
+
+ATTRIBUTE_INTERFACE ATTRIBUTE_NO_SANITIZE_MEMORY
void __sanitizer_weak_hook_strcasecmp(void *called_pc, const char *s1,
const char *s2, int result) {
+ if (fuzzer::ScopedDoingMyOwnMemOrStr::DoingMyOwnMemOrStr) return;
return __sanitizer_weak_hook_strcmp(called_pc, s1, s2, result);
}
+
+ATTRIBUTE_INTERFACE ATTRIBUTE_NO_SANITIZE_MEMORY
void __sanitizer_weak_hook_strstr(void *called_pc, const char *s1,
const char *s2, char *result) {
+ if (fuzzer::ScopedDoingMyOwnMemOrStr::DoingMyOwnMemOrStr) return;
TS->AddInterestingWord(reinterpret_cast<const uint8_t *>(s2), strlen(s2));
}
+
+ATTRIBUTE_INTERFACE ATTRIBUTE_NO_SANITIZE_MEMORY
void __sanitizer_weak_hook_strcasestr(void *called_pc, const char *s1,
const char *s2, char *result) {
+ if (fuzzer::ScopedDoingMyOwnMemOrStr::DoingMyOwnMemOrStr) return;
TS->AddInterestingWord(reinterpret_cast<const uint8_t *>(s2), strlen(s2));
}
+
+ATTRIBUTE_INTERFACE ATTRIBUTE_NO_SANITIZE_MEMORY
void __sanitizer_weak_hook_memmem(void *called_pc, const void *s1, size_t len1,
const void *s2, size_t len2, void *result) {
- if (fuzzer::DoingMyOwnMemmem) return;
+ if (fuzzer::ScopedDoingMyOwnMemOrStr::DoingMyOwnMemOrStr) return;
TS->AddInterestingWord(reinterpret_cast<const uint8_t *>(s2), len2);
}
diff --git a/lib/Fuzzer/FuzzerUtil.h b/lib/Fuzzer/FuzzerUtil.h
index 08058c56e4c5..f84fd9ef0fce 100644
--- a/lib/Fuzzer/FuzzerUtil.h
+++ b/lib/Fuzzer/FuzzerUtil.h
@@ -67,6 +67,10 @@ inline std::string CloneArgsWithoutX(const std::vector<std::string> &Args,
return CloneArgsWithoutX(Args, X, X);
}
+std::string DisassembleCmd(const std::string &FileName);
+
+std::string SearchRegexCmd(const std::string &Regex);
+
} // namespace fuzzer
#endif // LLVM_FUZZER_UTIL_H
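These two helpers let PrintCoverage build one portable pipeline: objdump piped into grep on POSIX, dumpbin piped into findstr on Windows. A sketch of the composition as used in FuzzerTracePC.cpp above (the wrapper name is illustrative):

#include <string>
#include "FuzzerUtil.h"

std::string BuildCoverageCmd(const std::string &Module) {
  // On POSIX this expands to:
  //   objdump -d <Module> | grep 'call.*__sanitizer_cov_trace_pc_guard'
  return fuzzer::DisassembleCmd(Module) + " | " +
         fuzzer::SearchRegexCmd("call.*__sanitizer_cov_trace_pc_guard");
}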
diff --git a/lib/Fuzzer/FuzzerUtilPosix.cpp b/lib/Fuzzer/FuzzerUtilPosix.cpp
index e8d48dc81a3b..0161309fbf86 100644
--- a/lib/Fuzzer/FuzzerUtilPosix.cpp
+++ b/lib/Fuzzer/FuzzerUtilPosix.cpp
@@ -118,6 +118,14 @@ const void *SearchMemory(const void *Data, size_t DataLen, const void *Patt,
return memmem(Data, DataLen, Patt, PattLen);
}
+std::string DisassembleCmd(const std::string &FileName) {
+ return "objdump -d " + FileName;
+}
+
+std::string SearchRegexCmd(const std::string &Regex) {
+ return "grep '" + Regex + "'";
+}
+
} // namespace fuzzer
#endif // LIBFUZZER_POSIX
diff --git a/lib/Fuzzer/FuzzerUtilWindows.cpp b/lib/Fuzzer/FuzzerUtilWindows.cpp
index 3ca1f2c8f562..08bb3cf3be15 100644
--- a/lib/Fuzzer/FuzzerUtilWindows.cpp
+++ b/lib/Fuzzer/FuzzerUtilWindows.cpp
@@ -28,7 +28,7 @@ namespace fuzzer {
static const FuzzingOptions* HandlerOpt = nullptr;
-LONG CALLBACK ExceptionHandler(PEXCEPTION_POINTERS ExceptionInfo) {
+static LONG CALLBACK ExceptionHandler(PEXCEPTION_POINTERS ExceptionInfo) {
switch (ExceptionInfo->ExceptionRecord->ExceptionCode) {
case EXCEPTION_ACCESS_VIOLATION:
case EXCEPTION_ARRAY_BOUNDS_EXCEEDED:
@@ -126,10 +126,7 @@ void SetSignalHandler(const FuzzingOptions& Options) {
if (Options.HandleSegv || Options.HandleBus || Options.HandleIll ||
Options.HandleFpe)
- if (!AddVectoredExceptionHandler(1, ExceptionHandler)) {
- Printf("libFuzzer: AddVectoredExceptionHandler failed.\n");
- exit(1);
- }
+ SetUnhandledExceptionFilter(ExceptionHandler);
if (Options.HandleAbrt)
if (SIG_ERR == signal(SIGABRT, CrashHandler)) {
@@ -178,6 +175,17 @@ const void *SearchMemory(const void *Data, size_t DataLen, const void *Patt,
return NULL;
}
+std::string DisassembleCmd(const std::string &FileName) {
+ if (ExecuteCommand("dumpbin /summary > nul") == 0)
+ return "dumpbin /disasm " + FileName;
+ Printf("libFuzzer: couldn't find tool to disassemble (dumpbin)\n");
+ exit(1);
+}
+
+std::string SearchRegexCmd(const std::string &Regex) {
+ return "findstr /r \"" + Regex + "\"";
+}
+
} // namespace fuzzer
#endif // LIBFUZZER_WINDOWS
diff --git a/lib/Fuzzer/FuzzerValueBitMap.h b/lib/Fuzzer/FuzzerValueBitMap.h
index 0692acd13ee3..8f7ff74300f4 100644
--- a/lib/Fuzzer/FuzzerValueBitMap.h
+++ b/lib/Fuzzer/FuzzerValueBitMap.h
@@ -18,19 +18,20 @@ namespace fuzzer {
// A bit map containing kMapSizeInWords bits.
struct ValueBitMap {
- static const size_t kMapSizeInBits = 65371; // Prime.
- static const size_t kMapSizeInBitsAligned = 65536; // 2^16
+ static const size_t kMapSizeInBits = 1 << 16;
+  static const size_t kMapPrimeMod = 65371;  // Largest prime < kMapSizeInBits.
static const size_t kBitsInWord = (sizeof(uintptr_t) * 8);
- static const size_t kMapSizeInWords = kMapSizeInBitsAligned / kBitsInWord;
+ static const size_t kMapSizeInWords = kMapSizeInBits / kBitsInWord;
public:
- static const size_t kNumberOfItems = kMapSizeInBits;
+
// Clears all bits.
void Reset() { memset(Map, 0, sizeof(Map)); }
// Computes a hash function of Value and sets the corresponding bit.
// Returns true if the bit was changed from 0 to 1.
+ ATTRIBUTE_NO_SANITIZE_ALL
inline bool AddValue(uintptr_t Value) {
- uintptr_t Idx = Value < kMapSizeInBits ? Value : Value % kMapSizeInBits;
+ uintptr_t Idx = Value % kMapSizeInBits;
uintptr_t WordIdx = Idx / kBitsInWord;
uintptr_t BitIdx = Idx % kBitsInWord;
uintptr_t Old = Map[WordIdx];
@@ -39,6 +40,11 @@ struct ValueBitMap {
return New != Old;
}
+ ATTRIBUTE_NO_SANITIZE_ALL
+ inline bool AddValueModPrime(uintptr_t Value) {
+ return AddValue(Value % kMapPrimeMod);
+ }
+
inline bool Get(uintptr_t Idx) {
assert(Idx < kMapSizeInBits);
uintptr_t WordIdx = Idx / kBitsInWord;
@@ -62,14 +68,15 @@ struct ValueBitMap {
Other.Map[i] = 0;
}
if (M)
- Res += __builtin_popcountl(M);
+ Res += __builtin_popcountll(M);
}
NumBits = Res;
return OldNumBits < NumBits;
}
template <class Callback>
- void ForEach(Callback CB) {
+ ATTRIBUTE_NO_SANITIZE_ALL
+ void ForEach(Callback CB) const {
for (size_t i = 0; i < kMapSizeInWords; i++)
if (uintptr_t M = Map[i])
for (size_t j = 0; j < sizeof(M) * 8; j++)
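Why AddValueModPrime exists alongside AddValue: reducing modulo 2^16 alone maps any power-of-two stride onto a handful of bits, while first reducing by the largest prime below 2^16 spreads such strides out. A small check of that property (constants as in the patch):

#include <cassert>
#include <cstdint>

int main() {
  const uintptr_t kMapSizeInBits = 1 << 16;
  const uintptr_t kMapPrimeMod = 65371;
  uintptr_t A = 0x10000, B = 0x20000;               // One 2^16 stride apart.
  assert(A % kMapSizeInBits == B % kMapSizeInBits); // Collide: both map to 0.
  assert(A % kMapPrimeMod != B % kMapPrimeMod);     // Spread: 165 vs. 330.
  return 0;
}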
diff --git a/lib/Fuzzer/afl/afl_driver.cpp b/lib/Fuzzer/afl/afl_driver.cpp
index fc9589552ba3..b3a54e57fceb 100644
--- a/lib/Fuzzer/afl/afl_driver.cpp
+++ b/lib/Fuzzer/afl/afl_driver.cpp
@@ -238,6 +238,13 @@ static void maybe_duplicate_stderr() {
}
}
+// Define LLVMFuzzerMutate to avoid link failures for targets that use it
+// with libFuzzer's LLVMFuzzerCustomMutator.
+extern "C" size_t LLVMFuzzerMutate(uint8_t *Data, size_t Size, size_t MaxSize) {
+ assert(false && "LLVMFuzzerMutate should not be called from afl_driver");
+ return 0;
+}
+
int main(int argc, char **argv) {
fprintf(stderr, "======================= INFO =========================\n"
"This binary is built for AFL-fuzz.\n"
diff --git a/lib/Fuzzer/build.sh b/lib/Fuzzer/build.sh
index 27c148ad43db..4556af5daf7d 100755
--- a/lib/Fuzzer/build.sh
+++ b/lib/Fuzzer/build.sh
@@ -1,7 +1,8 @@
#!/bin/bash
LIBFUZZER_SRC_DIR=$(dirname $0)
+CXX="${CXX:-clang}"
for f in $LIBFUZZER_SRC_DIR/*.cpp; do
- clang -g -O2 -fno-omit-frame-pointer -std=c++11 $f -c &
+ $CXX -g -O2 -fno-omit-frame-pointer -std=c++11 $f -c &
done
wait
rm -f libFuzzer.a
diff --git a/lib/Fuzzer/test/AbsNegAndConstant64Test.cpp b/lib/Fuzzer/test/AbsNegAndConstant64Test.cpp
index 577481431ae2..69b0d59fb8ef 100644
--- a/lib/Fuzzer/test/AbsNegAndConstant64Test.cpp
+++ b/lib/Fuzzer/test/AbsNegAndConstant64Test.cpp
@@ -14,7 +14,7 @@ extern "C" int LLVMFuzzerTestOneInput(const uint8_t *Data, size_t Size) {
uint64_t y;
memcpy(&x, Data, sizeof(x));
memcpy(&y, Data + sizeof(x), sizeof(y));
- if (labs(x) < 0 && y == 0xbaddcafedeadbeefUL) {
+ if (llabs(x) < 0 && y == 0xbaddcafedeadbeefULL) {
printf("BINGO; Found the target, exiting; x = 0x%lx y 0x%lx\n", x, y);
exit(1);
}
diff --git a/lib/Fuzzer/test/BadStrcmpTest.cpp b/lib/Fuzzer/test/BadStrcmpTest.cpp
new file mode 100644
index 000000000000..159cd7ea5f70
--- /dev/null
+++ b/lib/Fuzzer/test/BadStrcmpTest.cpp
@@ -0,0 +1,19 @@
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+
+// Test that we don't crash in case of bad strcmp params.
+#include <cstdint>
+#include <cstring>
+#include <cstddef>
+
+static volatile int Sink;
+
+extern "C" int LLVMFuzzerTestOneInput(const uint8_t *Data, size_t Size) {
+ if (Size != 10) return 0;
+ // Data is not zero-terminated, so this call is bad.
+  // Still, there are cases where such calls appear; see e.g.
+ // https://bugs.llvm.org/show_bug.cgi?id=32357
+ Sink = strcmp(reinterpret_cast<const char*>(Data), "123456789");
+ return 0;
+}
+
diff --git a/lib/Fuzzer/test/BogusInitializeTest.cpp b/lib/Fuzzer/test/BogusInitializeTest.cpp
new file mode 100644
index 000000000000..c7e81a5478b2
--- /dev/null
+++ b/lib/Fuzzer/test/BogusInitializeTest.cpp
@@ -0,0 +1,15 @@
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+
+// Make sure LLVMFuzzerInitialize does not change argv[0].
+#include <stddef.h>
+#include <stdint.h>
+
+extern "C" int LLVMFuzzerInitialize(int *argc, char ***argv) {
+ ***argv = 'X';
+ return 0;
+}
+
+extern "C" int LLVMFuzzerTestOneInput(const uint8_t *Data, size_t Size) {
+ return 0;
+}
diff --git a/lib/Fuzzer/test/CMakeLists.txt b/lib/Fuzzer/test/CMakeLists.txt
index c0457746a0e7..f72bc3909a3c 100644
--- a/lib/Fuzzer/test/CMakeLists.txt
+++ b/lib/Fuzzer/test/CMakeLists.txt
@@ -11,21 +11,35 @@ set(variables_to_filter
LIBFUZZER_FLAGS_BASE
)
foreach (VARNAME ${variables_to_filter})
- string(REPLACE " " ";" BUILD_FLAGS_AS_LIST "${${VARNAME}}")
- set(new_flags "")
- foreach (flag ${BUILD_FLAGS_AS_LIST})
- # NOTE: Use of XX here is to avoid a CMake warning due to CMP0054
- if (NOT ("XX${flag}" MATCHES "XX-O[0123s]"))
- set(new_flags "${new_flags} ${flag}")
- else()
- set(new_flags "${new_flags} -O0")
- endif()
- endforeach()
- set(${VARNAME} "${new_flags}")
+ string(REGEX REPLACE "([-/]O)[123s]" "\\10" ${VARNAME} "${${VARNAME}}")
endforeach()
# Enable the coverage instrumentation (it is disabled for the Fuzzer lib).
-set(CMAKE_CXX_FLAGS "${LIBFUZZER_FLAGS_BASE} -fsanitize-coverage=trace-pc-guard,indirect-calls,trace-cmp,trace-div,trace-gep -g")
+set(CMAKE_CXX_FLAGS "${LIBFUZZER_FLAGS_BASE} -fsanitize-coverage=trace-pc-guard,indirect-calls,trace-cmp,trace-div,trace-gep -gline-tables-only")
+
+if(MSVC)
+  # For tests, use the CRT specified for the release build
+  # (ASan doesn't support MDd and MTd)
+ if ("${LLVM_USE_CRT_RELEASE}" STREQUAL "")
+ set(CRT_FLAG " /MD ")
+ else()
+ set(CRT_FLAG " /${LLVM_USE_CRT_RELEASE} ")
+ endif()
+  # In order to use the sanitizers on Windows, we need to link against many
+  # runtime libraries, which depend on the kind of target being created
+  # (executable or DLL) and the C runtime library used (MT/MD).
+  # By default, cmake uses link.exe for linking, which fails because we don't
+  # specify the appropriate dependencies.
+  # As we don't want to handle every possible situation, which depends
+  # on the implementation of compiler-rt, the simplest option is to change
+  # the rules for linking executables and shared libraries to use the compiler
+  # instead of link.exe. Clang will honor the sanitizer flags and
+  # automatically provide the required libraries to the linker.
+ set(CMAKE_CXX_LINK_EXECUTABLE "<CMAKE_CXX_COMPILER> <FLAGS> ${CMAKE_CXX_FLAGS} ${CRT_FLAG} <OBJECTS> -o <TARGET> <LINK_LIBRARIES> /link <CMAKE_CXX_LINK_FLAGS> <LINK_FLAGS>")
+ set(CMAKE_CXX_CREATE_SHARED_LIBRARY "<CMAKE_CXX_COMPILER> ${CMAKE_CXX_FLAGS} ${CRT_FLAG} /LD <CMAKE_SHARED_LIBRARY_CXX_FLAGS> <CMAKE_SHARED_LIBRARY_CREATE_CXX_FLAGS> <CMAKE_SHARED_LIBRARY_SONAME_CXX_FLAG> <TARGET_SONAME> -o <TARGET> <OBJECTS> <LINK_LIBRARIES> /link <LINK_FLAGS>")
+endif()
+
+add_custom_target(TestBinaries)
# add_libfuzzer_test(<name>
# SOURCES source0.cpp [source1.cpp ...]
@@ -51,12 +65,9 @@ function(add_libfuzzer_test name)
PROPERTIES RUNTIME_OUTPUT_DIRECTORY
"${CMAKE_BINARY_DIR}/lib/Fuzzer/test"
)
- set(TestBinaries ${TestBinaries} LLVMFuzzer-${name} PARENT_SCOPE)
+ add_dependencies(TestBinaries LLVMFuzzer-${name})
endfunction()
-# Variable to keep track of all test targets
-set(TestBinaries)
-
###############################################################################
# Basic tests
###############################################################################
@@ -65,16 +76,23 @@ set(Tests
AbsNegAndConstantTest
AbsNegAndConstant64Test
AccumulateAllocationsTest
+ BadStrcmpTest
+ BogusInitializeTest
BufferOverflowOnInput
CallerCalleeTest
CounterTest
+ CustomCrossOverAndMutateTest
CustomCrossOverTest
CustomMutatorTest
+ CxxStringEqTest
DivTest
EmptyTest
+ EquivalenceATest
+ EquivalenceBTest
FourIndependentBranchesTest
FullCoverageSetTest
InitializeTest
+ Memcmp64BytesTest
MemcmpTest
LeakTest
LeakTimeoutTest
@@ -92,6 +110,7 @@ set(Tests
SimpleHashTest
SimpleTest
SimpleThreadedTest
+ SingleByteInputTest
SingleMemcmpTest
SingleStrcmpTest
SingleStrncmpTest
@@ -105,17 +124,19 @@ set(Tests
SwapCmpTest
SwitchTest
Switch2Test
+ TableLookupTest
ThreadedLeakTest
ThreadedTest
TimeoutTest
TimeoutEmptyTest
TraceMallocTest
+ TwoDifferentBugsTest
)
-if(APPLE)
- # LeakSanitizer is not supported on OSX right now
+if(APPLE OR MSVC)
+ # LeakSanitizer is not supported on OSX and Windows right now
set(HAS_LSAN 0)
- message(WARNING "LeakSanitizer is not supported on Apple platforms."
+ message(WARNING "LeakSanitizer is not supported."
" Building and running LibFuzzer LeakSanitizer tests is disabled."
)
else()
@@ -126,6 +147,17 @@ foreach(Test ${Tests})
add_libfuzzer_test(${Test} SOURCES ${Test}.cpp)
endforeach()
+function(test_export_symbol target symbol)
+ if(MSVC)
+ set_target_properties(LLVMFuzzer-${target} PROPERTIES LINK_FLAGS
+ "-export:${symbol}")
+ endif()
+endfunction()
+
+test_export_symbol(InitializeTest "LLVMFuzzerInitialize")
+test_export_symbol(BogusInitializeTest "LLVMFuzzerInitialize")
+test_export_symbol(CustomCrossOverTest "LLVMFuzzerCustomCrossOver")
+test_export_symbol(CustomMutatorTest "LLVMFuzzerCustomMutator")
###############################################################################
# Unit tests
@@ -150,13 +182,13 @@ target_include_directories(LLVMFuzzer-Unittest PRIVATE
"${LLVM_MAIN_SRC_DIR}/utils/unittest/googletest/include"
)
-set(TestBinaries ${TestBinaries} LLVMFuzzer-Unittest)
+add_dependencies(TestBinaries LLVMFuzzer-Unittest)
set_target_properties(LLVMFuzzer-Unittest
PROPERTIES RUNTIME_OUTPUT_DIRECTORY
"${CMAKE_CURRENT_BINARY_DIR}"
)
-set(TestBinaries ${TestBinaries} LLVMFuzzer-StandaloneInitializeTest)
+add_dependencies(TestBinaries LLVMFuzzer-StandaloneInitializeTest)
set_target_properties(LLVMFuzzer-StandaloneInitializeTest
PROPERTIES RUNTIME_OUTPUT_DIRECTORY
"${CMAKE_CURRENT_BINARY_DIR}"
@@ -170,6 +202,7 @@ include_directories(..)
# add_subdirectory(uninstrumented)
add_subdirectory(no-coverage)
+add_subdirectory(trace-pc)
add_subdirectory(ubsan)
add_library(LLVMFuzzer-DSO1 SHARED DSO1.cpp)
@@ -187,12 +220,22 @@ target_link_libraries(LLVMFuzzer-DSOTest
set_target_properties(LLVMFuzzer-DSOTest PROPERTIES RUNTIME_OUTPUT_DIRECTORY
"${CMAKE_BINARY_DIR}/lib/Fuzzer/test")
-set_target_properties(LLVMFuzzer-DSO1 PROPERTIES LIBRARY_OUTPUT_DIRECTORY
- "${CMAKE_BINARY_DIR}/lib/Fuzzer/lib")
-set_target_properties(LLVMFuzzer-DSO2 PROPERTIES LIBRARY_OUTPUT_DIRECTORY
- "${CMAKE_BINARY_DIR}/lib/Fuzzer/lib")
-set(TestBinaries ${TestBinaries} LLVMFuzzer-DSOTest)
+if(MSVC)
+ set_output_directory(LLVMFuzzer-DSO1
+ BINARY_DIR "${CMAKE_BINARY_DIR}/lib/Fuzzer/test"
+ LIBRARY_DIR "${CMAKE_BINARY_DIR}/lib/Fuzzer/test")
+ set_output_directory(LLVMFuzzer-DSO2
+ BINARY_DIR "${CMAKE_BINARY_DIR}/lib/Fuzzer/test"
+ LIBRARY_DIR "${CMAKE_BINARY_DIR}/lib/Fuzzer/test")
+else(MSVC)
+ set_output_directory(LLVMFuzzer-DSO1
+ LIBRARY_DIR "${CMAKE_BINARY_DIR}/lib/Fuzzer/lib")
+ set_output_directory(LLVMFuzzer-DSO2
+ LIBRARY_DIR "${CMAKE_BINARY_DIR}/lib/Fuzzer/lib")
+endif()
+
+add_dependencies(TestBinaries LLVMFuzzer-DSOTest)
###############################################################################
# Configure lit to run the tests
@@ -200,6 +243,10 @@ set(TestBinaries ${TestBinaries} LLVMFuzzer-DSOTest)
# Note this is done after declaring all tests so we can inform lit if any tests
# need to be disabled.
###############################################################################
+set(LIBFUZZER_POSIX 1)
+if (MSVC)
+ set(LIBFUZZER_POSIX 0)
+endif()
configure_lit_site_cfg(
${CMAKE_CURRENT_SOURCE_DIR}/lit.site.cfg.in
@@ -213,5 +260,11 @@ configure_lit_site_cfg(
add_lit_testsuite(check-fuzzer "Running Fuzzer tests"
${CMAKE_CURRENT_BINARY_DIR}
- DEPENDS ${TestBinaries} FileCheck not
+ DEPENDS TestBinaries
)
+
+# Don't add dependencies on Windows. The linker step would fail on Windows,
+# since cmake will use link.exe for linking and won't include compiler-rt libs.
+if(NOT MSVC)
+ add_dependencies(check-fuzzer FileCheck sancov not)
+endif()
diff --git a/lib/Fuzzer/test/CustomCrossOverAndMutateTest.cpp b/lib/Fuzzer/test/CustomCrossOverAndMutateTest.cpp
new file mode 100644
index 000000000000..74fc939534ca
--- /dev/null
+++ b/lib/Fuzzer/test/CustomCrossOverAndMutateTest.cpp
@@ -0,0 +1,34 @@
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+
+// Test that libFuzzer does not crash when LLVMFuzzerMutate called from
+// LLVMFuzzerCustomCrossOver.
+#include <algorithm>
+#include <cstddef>
+#include <cstdint>
+#include <cstdlib>
+#include <string.h>
+#include <string>
+#include <vector>
+
+#include "FuzzerInterface.h"
+
+static volatile int sink;
+
+extern "C" int LLVMFuzzerTestOneInput(const uint8_t *Data, size_t Size) {
+ std::string Str(reinterpret_cast<const char *>(Data), Size);
+ if (Size && Data[0] == '0')
+ sink++;
+ return 0;
+}
+
+extern "C" size_t LLVMFuzzerCustomCrossOver(const uint8_t *Data1, size_t Size1,
+ const uint8_t *Data2, size_t Size2,
+ uint8_t *Out, size_t MaxOutSize,
+ unsigned int Seed) {
+ std::vector<uint8_t> Buffer(MaxOutSize * 10);
+ LLVMFuzzerMutate(Buffer.data(), Buffer.size(), Buffer.size());
+ size_t Size = std::min(Size1, MaxOutSize);
+ memcpy(Out, Data1, Size);
+ return Size;
+}
diff --git a/lib/Fuzzer/test/CxxStringEqTest.cpp b/lib/Fuzzer/test/CxxStringEqTest.cpp
new file mode 100644
index 000000000000..e0e23c972ccb
--- /dev/null
+++ b/lib/Fuzzer/test/CxxStringEqTest.cpp
@@ -0,0 +1,25 @@
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+
+// Simple test for a fuzzer. Must find a specific string
+// used in std::string operator ==.
+#include <cstdint>
+#include <cstdlib>
+#include <cstddef>
+#include <string>
+#include <iostream>
+
+static volatile int Sink;
+
+extern "C" int LLVMFuzzerTestOneInput(const uint8_t *Data, size_t Size) {
+ std::string Str((const char*)Data, Size);
+ bool Eq = Str == "FooBar";
+ Sink = Str == "123456"; // Try to confuse the fuzzer
+ if (Eq) {
+ std::cout << "BINGO; Found the target, exiting\n";
+ std::cout.flush();
+ abort();
+ }
+ return 0;
+}
+
diff --git a/lib/Fuzzer/test/DSO1.cpp b/lib/Fuzzer/test/DSO1.cpp
index 4a293890f4b0..72a5ec4a0cde 100644
--- a/lib/Fuzzer/test/DSO1.cpp
+++ b/lib/Fuzzer/test/DSO1.cpp
@@ -2,7 +2,9 @@
// License. See LICENSE.TXT for details.
// Source code for a simple DSO.
-
+#ifdef _WIN32
+__declspec( dllexport )
+#endif
int DSO1(int a) {
if (a < 123456)
return 0;
diff --git a/lib/Fuzzer/test/DSO2.cpp b/lib/Fuzzer/test/DSO2.cpp
index 04b308d193ac..2967055dc227 100644
--- a/lib/Fuzzer/test/DSO2.cpp
+++ b/lib/Fuzzer/test/DSO2.cpp
@@ -2,7 +2,9 @@
// License. See LICENSE.TXT for details.
// Source code for a simple DSO.
-
+#ifdef _WIN32
+__declspec( dllexport )
+#endif
int DSO2(int a) {
if (a < 3598235)
return 0;
diff --git a/lib/Fuzzer/test/EquivalenceATest.cpp b/lib/Fuzzer/test/EquivalenceATest.cpp
new file mode 100644
index 000000000000..7d1ebb0f6a4a
--- /dev/null
+++ b/lib/Fuzzer/test/EquivalenceATest.cpp
@@ -0,0 +1,17 @@
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+#include <stddef.h>
+#include <stdint.h>
+#include <stdio.h>
+
+// Test for libFuzzer's "equivalence" fuzzing, part A.
+extern "C" void LLVMFuzzerAnnounceOutput(const uint8_t *Data, size_t Size);
+extern "C" int LLVMFuzzerTestOneInput(const uint8_t *Data, size_t Size) {
+ // fprintf(stderr, "A %zd\n", Size);
+ uint8_t Result[50];
+ if (Size > 50) Size = 50;
+ for (size_t i = 0; i < Size; i++)
+ Result[Size - i - 1] = Data[i];
+ LLVMFuzzerAnnounceOutput(Result, Size);
+ return 0;
+}
diff --git a/lib/Fuzzer/test/EquivalenceBTest.cpp b/lib/Fuzzer/test/EquivalenceBTest.cpp
new file mode 100644
index 000000000000..b1de208b57f6
--- /dev/null
+++ b/lib/Fuzzer/test/EquivalenceBTest.cpp
@@ -0,0 +1,27 @@
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+#include <stddef.h>
+#include <stdint.h>
+#include <stdio.h>
+
+// Test for libFuzzer's "equivalence" fuzzing, part B.
+extern "C" void LLVMFuzzerAnnounceOutput(const uint8_t *Data, size_t Size);
+extern "C" int LLVMFuzzerTestOneInput(const uint8_t *Data, size_t Size) {
+ // fprintf(stderr, "B %zd\n", Size);
+ uint8_t Result[50];
+ if (Size > 50) Size = 50;
+ for (size_t i = 0; i < Size; i++)
+ Result[Size - i - 1] = Data[i];
+
+ // Be a bit different from EquivalenceATest
+ if (Size > 10 && Data[5] == 'B' && Data[6] == 'C' && Data[7] == 'D') {
+ static int c;
+ if (!c)
+ fprintf(stderr, "ZZZZZZZ\n");
+ c = 1;
+ Result[2]++;
+ }
+
+ LLVMFuzzerAnnounceOutput(Result, Size);
+ return 0;
+}
diff --git a/lib/Fuzzer/test/FuzzerUnittest.cpp b/lib/Fuzzer/test/FuzzerUnittest.cpp
index 4992ef57b6ca..78ea874f2ce2 100644
--- a/lib/Fuzzer/test/FuzzerUnittest.cpp
+++ b/lib/Fuzzer/test/FuzzerUnittest.cpp
@@ -10,10 +10,12 @@
#include "FuzzerDictionary.h"
#include "FuzzerMerge.h"
#include "FuzzerMutate.h"
+#include "FuzzerTracePC.h"
#include "FuzzerRandom.h"
#include "gtest/gtest.h"
#include <memory>
#include <set>
+#include <sstream>
using namespace fuzzer;
@@ -584,15 +586,15 @@ TEST(FuzzerUtil, Base64) {
TEST(Corpus, Distribution) {
Random Rand(0);
- InputCorpus C("");
+ std::unique_ptr<InputCorpus> C(new InputCorpus(""));
size_t N = 10;
size_t TriesPerUnit = 1<<16;
for (size_t i = 0; i < N; i++)
- C.AddToCorpus(Unit{ static_cast<uint8_t>(i) }, 0);
+ C->AddToCorpus(Unit{ static_cast<uint8_t>(i) }, 0);
std::vector<size_t> Hist(N);
for (size_t i = 0; i < N * TriesPerUnit; i++) {
- Hist[C.ChooseUnitIdxToMutate(Rand)]++;
+ Hist[C->ChooseUnitIdxToMutate(Rand)]++;
}
for (size_t i = 0; i < N; i++) {
// A weak sanity check that every unit gets invoked.
@@ -636,7 +638,10 @@ static void Merge(const std::string &Input,
Merger M;
std::vector<std::string> NewFiles;
EXPECT_TRUE(M.Parse(Input, true));
+ std::stringstream SS;
+ M.PrintSummary(SS);
EXPECT_EQ(NumNewFeatures, M.Merge(&NewFiles));
+ EXPECT_EQ(M.AllFeatures(), M.ParseSummary(SS));
EQ(NewFiles, Result);
}
@@ -706,6 +711,16 @@ TEST(Merge, Good) {
EQ(M.Files[2].Features, {1, 3, 6});
EXPECT_EQ(3U, M.Merge(&NewFiles));
EQ(NewFiles, {"B"});
+
+ // Same as the above, but with InitialFeatures.
+ EXPECT_TRUE(M.Parse("2\n0\nB\nC\n"
+ "STARTED 0 1001\nDONE 0 4 5 6 \n"
+ "STARTED 1 1002\nDONE 1 6 1 3\n"
+ "", true));
+ EQ(M.Files[0].Features, {4, 5, 6});
+ EQ(M.Files[1].Features, {1, 3, 6});
+ EXPECT_EQ(3U, M.Merge({1, 2, 3}, &NewFiles));
+ EQ(NewFiles, {"B"});
}
TEST(Merge, Merge) {
@@ -736,3 +751,25 @@ TEST(Merge, Merge) {
"STARTED 3 1000\nDONE 3 1 \n",
{"B", "D"}, 3);
}
+
+TEST(Fuzzer, ForEachNonZeroByte) {
+ const size_t N = 64;
+ alignas(64) uint8_t Ar[N + 8] = {
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 1, 2, 0, 0, 0, 0, 0, 0,
+ 0, 0, 3, 0, 4, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 5, 0, 6, 0, 0,
+ 0, 0, 0, 0, 0, 0, 7, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 8,
+ 9, 9, 9, 9, 9, 9, 9, 9,
+ };
+ typedef std::vector<std::pair<size_t, uint8_t> > Vec;
+ Vec Res, Expected;
+ auto CB = [&](size_t Idx, uint8_t V) { Res.push_back({Idx, V}); };
+ ForEachNonZeroByte(Ar, Ar + N, 100, CB);
+ Expected = {{108, 1}, {109, 2}, {118, 3}, {120, 4},
+ {135, 5}, {137, 6}, {146, 7}, {163, 8}};
+ EXPECT_EQ(Res, Expected);
+}
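The expected pairs above follow from the offset argument: each nonzero byte at index I in [Ar, Ar + N) is reported as (100 + I, value), so the 1 at index 8 becomes {108, 1}, and the trailing 9s at indices 64..71 lie past Ar + N and are never visited. A scalar reference sketch of that behavior, assuming the parameter order used by the call above (the real ForEachNonZeroByte in FuzzerTracePC.h is an optimized implementation):

#include <cstddef>
#include <cstdint>

// Reference semantics only: visit every nonzero byte and report its
// index shifted by FirstFeature, together with the byte's value.
template <class Callback>
void ForEachNonZeroByteRef(const uint8_t *Begin, const uint8_t *End,
                           size_t FirstFeature, Callback CB) {
  for (const uint8_t *P = Begin; P < End; P++)
    if (*P)
      CB(FirstFeature + static_cast<size_t>(P - Begin), *P);
}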
diff --git a/lib/Fuzzer/test/LargeTest.cpp b/lib/Fuzzer/test/LargeTest.cpp
new file mode 100644
index 000000000000..83ed61971801
--- /dev/null
+++ b/lib/Fuzzer/test/LargeTest.cpp
@@ -0,0 +1,37 @@
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+
+// A fuzz target with lots of edges.
+#include <cstdint>
+#include <cstdlib>
+
+static inline void break_optimization(const void *arg) {
+ __asm__ __volatile__("" : : "r" (arg) : "memory");
+}
+
+#define A \
+ do { \
+ i++; \
+ c++; \
+ if (Data[(i + __LINE__) % Size] == (c % 256)) \
+ break_optimization(Data); \
+ else \
+ break_optimization(0); \
+ } while (0)
+
+// for (int i = 0, n = Data[(__LINE__ - 1) % Size] % 16; i < n; i++)
+
+#define B do{A; A; A; A; A; A; A; A; A; A; A; A; A; A; A; A; A; A; }while(0)
+#define C do{B; B; B; B; B; B; B; B; B; B; B; B; B; B; B; B; B; B; }while(0)
+#define D do{C; C; C; C; C; C; C; C; C; C; C; C; C; C; C; C; C; C; }while(0)
+#define E do{D; D; D; D; D; D; D; D; D; D; D; D; D; D; D; D; D; D; }while(0)
+
+volatile int sink;
+extern "C" int LLVMFuzzerTestOneInput(const uint8_t *Data, size_t Size) {
+ if (!Size) return 0;
+ int c = 0;
+ int i = 0;
+ D;
+ return 0;
+}
+
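For scale, the macro nesting above multiplies out as follows; each A is one data-dependent comparison, i.e. roughly one edge:

  A          = 1
  B = 18 * A = 18
  C = 18 * B = 324
  D = 18 * C = 5832    (the body of LLVMFuzzerTestOneInput above)
  E = 18 * D = 104976  (defined but not invoked)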
diff --git a/lib/Fuzzer/test/LoadTest.cpp b/lib/Fuzzer/test/LoadTest.cpp
index c1780d5c7bd9..eef16c7be51e 100644
--- a/lib/Fuzzer/test/LoadTest.cpp
+++ b/lib/Fuzzer/test/LoadTest.cpp
@@ -14,7 +14,7 @@ int array[kArraySize];
extern "C" int LLVMFuzzerTestOneInput(const uint8_t *Data, size_t Size) {
if (Size < 8) return 0;
- size_t a = 0;
+ uint64_t a = 0;
memcpy(&a, Data, 8);
Sink = array[a % (kArraySize + 1)];
return 0;
diff --git a/lib/Fuzzer/test/Memcmp64BytesTest.cpp b/lib/Fuzzer/test/Memcmp64BytesTest.cpp
new file mode 100644
index 000000000000..e81526b578a3
--- /dev/null
+++ b/lib/Fuzzer/test/Memcmp64BytesTest.cpp
@@ -0,0 +1,20 @@
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+
+// Simple test for a fuzzer. The fuzzer must find a particular string.
+#include <cassert>
+#include <cstring>
+#include <cstdint>
+#include <cstdio>
+#include <cstdlib>
+
+extern "C" int LLVMFuzzerTestOneInput(const uint8_t *Data, size_t Size) {
+ const char kString64Bytes[] =
+ "123456789 123456789 123456789 123456789 123456789 123456789 1234";
+ assert(sizeof(kString64Bytes) == 65);
+ if (Size >= 64 && memcmp(Data, kString64Bytes, 64) == 0) {
+ fprintf(stderr, "BINGO\n");
+ exit(1);
+ }
+ return 0;
+}
diff --git a/lib/Fuzzer/test/UninstrumentedTest.cpp b/lib/Fuzzer/test/NotinstrumentedTest.cpp
index ffe952c749d2..ffe952c749d2 100644
--- a/lib/Fuzzer/test/UninstrumentedTest.cpp
+++ b/lib/Fuzzer/test/NotinstrumentedTest.cpp
diff --git a/lib/Fuzzer/test/OutOfMemorySingleLargeMallocTest.cpp b/lib/Fuzzer/test/OutOfMemorySingleLargeMallocTest.cpp
index ea23a601aa23..316b7682b8e6 100644
--- a/lib/Fuzzer/test/OutOfMemorySingleLargeMallocTest.cpp
+++ b/lib/Fuzzer/test/OutOfMemorySingleLargeMallocTest.cpp
@@ -15,7 +15,7 @@ extern "C" int LLVMFuzzerTestOneInput(const uint8_t *Data, size_t Size) {
if (Size > 0 && Data[0] == 'H') {
if (Size > 1 && Data[1] == 'i') {
if (Size > 2 && Data[2] == '!') {
- size_t kSize = 0xff000000U;
+ size_t kSize = 0x20000000U;
char *p = new char[kSize];
SinkPtr = p;
delete [] p;
diff --git a/lib/Fuzzer/test/RepeatedMemcmp.cpp b/lib/Fuzzer/test/RepeatedMemcmp.cpp
index a327bbee7815..7377f65ed76d 100644
--- a/lib/Fuzzer/test/RepeatedMemcmp.cpp
+++ b/lib/Fuzzer/test/RepeatedMemcmp.cpp
@@ -8,13 +8,16 @@
#include <cstdlib>
extern "C" int LLVMFuzzerTestOneInput(const uint8_t *Data, size_t Size) {
- int Matches = 0;
- for (size_t i = 0; i + 2 < Size; i += 3) {
- const char *Pat = i % 2 ? "foo" : "bar";
- if (!memcmp(Data + i, Pat, 3))
- Matches++;
- }
- if (Matches > 20) {
+ int Matches1 = 0;
+ for (size_t i = 0; i + 2 < Size; i += 3)
+ if (!memcmp(Data + i, "foo", 3))
+ Matches1++;
+ int Matches2 = 0;
+ for (size_t i = 0; i + 2 < Size; i += 3)
+ if (!memcmp(Data + i, "bar", 3))
+ Matches2++;
+
+ if (Matches1 > 10 && Matches2 > 10) {
fprintf(stderr, "BINGO!\n");
exit(1);
}
diff --git a/lib/Fuzzer/test/SimpleCmpTest.cpp b/lib/Fuzzer/test/SimpleCmpTest.cpp
index 0220c30f9a6b..12b5cdda0660 100644
--- a/lib/Fuzzer/test/SimpleCmpTest.cpp
+++ b/lib/Fuzzer/test/SimpleCmpTest.cpp
@@ -26,12 +26,13 @@ extern "C" int LLVMFuzzerTestOneInput(const uint8_t *Data, size_t Size) {
memcpy(&y, Data + 8, 8); // 16
memcpy(&z, Data + 16, sizeof(z)); // 20
memcpy(&a, Data + 20, sizeof(a)); // 22
+ const bool k32bit = sizeof(void*) == 4;
- if (x > 1234567890 && PrintOnce(__LINE__) &&
- x < 1234567895 && PrintOnce(__LINE__) &&
+ if ((k32bit || x > 1234567890) && PrintOnce(__LINE__) &&
+ (k32bit || x < 1234567895) && PrintOnce(__LINE__) &&
a == 0x4242 && PrintOnce(__LINE__) &&
- y >= 987654321 && PrintOnce(__LINE__) &&
- y <= 987654325 && PrintOnce(__LINE__) &&
+ (k32bit || y >= 987654321) && PrintOnce(__LINE__) &&
+ (k32bit || y <= 987654325) && PrintOnce(__LINE__) &&
z < -10000 && PrintOnce(__LINE__) &&
z >= -10005 && PrintOnce(__LINE__) &&
z != -10003 && PrintOnce(__LINE__) &&
diff --git a/lib/Fuzzer/test/SingleByteInputTest.cpp b/lib/Fuzzer/test/SingleByteInputTest.cpp
new file mode 100644
index 000000000000..4ce819d230ce
--- /dev/null
+++ b/lib/Fuzzer/test/SingleByteInputTest.cpp
@@ -0,0 +1,17 @@
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+
+// Simple test for a fuzzer; a single byte is enough to make it crash.
+#include <cstdint>
+#include <cstdlib>
+#include <cstddef>
+#include <cstdio>
+
+extern "C" int LLVMFuzzerTestOneInput(const uint8_t *Data, size_t Size) {
+ if (Size > 0 && Data[Size/2] == 42) {
+ fprintf(stderr, "BINGO\n");
+ abort();
+ }
+ return 0;
+}
+
diff --git a/lib/Fuzzer/test/SingleStrcmpTest.cpp b/lib/Fuzzer/test/SingleStrcmpTest.cpp
index 73470b527eeb..48f481dfc51a 100644
--- a/lib/Fuzzer/test/SingleStrcmpTest.cpp
+++ b/lib/Fuzzer/test/SingleStrcmpTest.cpp
@@ -8,10 +8,14 @@
#include <cstdlib>
extern "C" int LLVMFuzzerTestOneInput(const uint8_t *Data, size_t Size) {
- char *S = (char*)Data;
- if (Size >= 7 && !strcmp(S, "qwerty")) {
- fprintf(stderr, "BINGO\n");
- exit(1);
+ if (Size >= 7) {
+ char Copy[7];
+ memcpy(Copy, Data, 6);
+ Copy[6] = 0;
+ if (!strcmp(Copy, "qwerty")) {
+ fprintf(stderr, "BINGO\n");
+ exit(1);
+ }
}
return 0;
}
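The fix above is an instance of a general rule for string functions in fuzz targets: strcmp requires a NUL-terminated argument, but Data is a raw, possibly unterminated buffer, so a bounded prefix is copied into a local buffer and terminated first. A hedged sketch of the pattern (PrefixEquals and its buffer size are illustrative, not part of the test):

#include <cstddef>
#include <cstdint>
#include <cstring>

// Compare the first strlen(Lit) bytes of Data against Lit via strcmp,
// after copying them into a NUL-terminated local buffer.
static bool PrefixEquals(const uint8_t *Data, size_t Size, const char *Lit) {
  size_t N = std::strlen(Lit);
  if (Size < N + 1 || N >= 64) return false;  // mirrors the test's Size >= 7 guard
  char Buf[64];
  std::memcpy(Buf, Data, N);
  Buf[N] = '\0';
  return std::strcmp(Buf, Lit) == 0;
}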
diff --git a/lib/Fuzzer/test/SingleStrncmpTest.cpp b/lib/Fuzzer/test/SingleStrncmpTest.cpp
index dbcc464b0a78..e5601da86329 100644
--- a/lib/Fuzzer/test/SingleStrncmpTest.cpp
+++ b/lib/Fuzzer/test/SingleStrncmpTest.cpp
@@ -9,7 +9,8 @@
extern "C" int LLVMFuzzerTestOneInput(const uint8_t *Data, size_t Size) {
char *S = (char*)Data;
- if (Size >= 6 && !strncmp(S, "qwerty", 6)) {
+ volatile auto Strncmp = &(strncmp); // Make sure strncmp is not inlined.
+ if (Size >= 6 && !Strncmp(S, "qwerty", 6)) {
fprintf(stderr, "BINGO\n");
exit(1);
}
diff --git a/lib/Fuzzer/test/SwapCmpTest.cpp b/lib/Fuzzer/test/SwapCmpTest.cpp
index f79db4ccf714..b90ac72c22c4 100644
--- a/lib/Fuzzer/test/SwapCmpTest.cpp
+++ b/lib/Fuzzer/test/SwapCmpTest.cpp
@@ -19,8 +19,9 @@ extern "C" int LLVMFuzzerTestOneInput(const uint8_t *Data, size_t Size) {
x = __builtin_bswap64(x);
y = __builtin_bswap32(y);
z = __builtin_bswap16(z);
+ const bool k32bit = sizeof(void*) == 4;
- if (x == 0x46555A5A5A5A5546ULL &&
+ if ((k32bit || x == 0x46555A5A5A5A5546ULL) &&
z == 0x4F4B &&
y == 0x66757A7A &&
true
diff --git a/lib/Fuzzer/test/TableLookupTest.cpp b/lib/Fuzzer/test/TableLookupTest.cpp
new file mode 100644
index 000000000000..f9d5610820ff
--- /dev/null
+++ b/lib/Fuzzer/test/TableLookupTest.cpp
@@ -0,0 +1,45 @@
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+
+// Make sure the fuzzer eventually finds all possible values of a variable
+// within a range.
+#include <cstring>
+#include <cstdint>
+#include <cstdio>
+#include <cstdlib>
+#include <cassert>
+#include <set>
+
+const size_t N = 1 << 12;
+
+// Define an array of counters that will be understood by libFuzzer
+// as extra coverage signal. The array must be:
+// * uint8_t
+// * aligned to 64 bytes
+// * in the section named __libfuzzer_extra_counters.
+// The target code may declare more than one such array.
+//
+// Use either `Counters[Idx] = 1` or `Counters[Idx]++;`
+// depending on whether multiple occurrences of the event 'Idx'
+// are important to distinguish from one occurrence.
+#ifdef __linux__
+alignas(64) __attribute__((section("__libfuzzer_extra_counters")))
+#endif
+static uint8_t Counters[N];
+
+extern "C" int LLVMFuzzerTestOneInput(const uint8_t *Data, size_t Size) {
+ static std::set<uint16_t> SeenIdx;
+ if (Size != 4) return 0;
+ uint32_t Idx;
+ memcpy(&Idx, Data, 4);
+ Idx %= N;
+ assert(Counters[Idx] == 0); // libFuzzer should reset these between the runs.
+ // Or Counters[Idx]=1 if we don't care how many times this happened.
+ Counters[Idx]++;
+ SeenIdx.insert(Idx);
+ if (SeenIdx.size() == N) {
+ fprintf(stderr, "BINGO: found all values\n");
+ abort();
+ }
+ return 0;
+}
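The comment block above also mentions the one-shot form `Counters[Idx] = 1`. A minimal sketch of that variant, for targets where only the fact that event Idx fired matters (EventSeen is a hypothetical second array; the comment notes a target may declare several):

#include <cstddef>
#include <cstdint>

#ifdef __linux__
alignas(64) __attribute__((section("__libfuzzer_extra_counters")))
#endif
static uint8_t EventSeen[256];

extern "C" int LLVMFuzzerTestOneInput(const uint8_t *Data, size_t Size) {
  if (Size == 0) return 0;
  EventSeen[Data[0]] = 1;  // one-shot: repeated hits add no new signal
  return 0;
}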
diff --git a/lib/Fuzzer/test/TwoDifferentBugsTest.cpp b/lib/Fuzzer/test/TwoDifferentBugsTest.cpp
new file mode 100644
index 000000000000..42c0d192ba86
--- /dev/null
+++ b/lib/Fuzzer/test/TwoDifferentBugsTest.cpp
@@ -0,0 +1,22 @@
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+
+// Simple test for a fuzzer. This test may trigger two different bugs.
+#include <cstdint>
+#include <cstdlib>
+#include <cstddef>
+#include <iostream>
+
+static volatile int *Null = 0;
+
+void Foo() { Null[1] = 0; }
+void Bar() { Null[2] = 0; }
+
+extern "C" int LLVMFuzzerTestOneInput(const uint8_t *Data, size_t Size) {
+ if (Size < 10 && Data[0] == 'H')
+ Foo();
+ if (Size >= 10 && Data[0] == 'H')
+ Bar();
+ return 0;
+}
+
diff --git a/lib/Fuzzer/test/afl-driver-extra-stats.test b/lib/Fuzzer/test/afl-driver-extra-stats.test
index 81e384e7dad2..1b0818e55ea5 100644
--- a/lib/Fuzzer/test/afl-driver-extra-stats.test
+++ b/lib/Fuzzer/test/afl-driver-extra-stats.test
@@ -1,3 +1,5 @@
+REQUIRES: posix
+
; Test that not specifying an extra stats file isn't broken.
RUN: unset AFL_DRIVER_EXTRA_STATS_FILENAME
RUN: AFLDriverTest
diff --git a/lib/Fuzzer/test/afl-driver-stderr.test b/lib/Fuzzer/test/afl-driver-stderr.test
index c0f9c8398c2a..e835acd4275b 100644
--- a/lib/Fuzzer/test/afl-driver-stderr.test
+++ b/lib/Fuzzer/test/afl-driver-stderr.test
@@ -1,3 +1,5 @@
+REQUIRES: posix
+
; Test that not specifying a stderr file isn't broken.
RUN: unset AFL_DRIVER_STDERR_DUPLICATE_FILENAME
RUN: AFLDriverTest
diff --git a/lib/Fuzzer/test/bad-strcmp.test b/lib/Fuzzer/test/bad-strcmp.test
new file mode 100644
index 000000000000..9a2f3742a5f4
--- /dev/null
+++ b/lib/Fuzzer/test/bad-strcmp.test
@@ -0,0 +1 @@
+RUN: LLVMFuzzer-BadStrcmpTest -runs=100000
diff --git a/lib/Fuzzer/test/coverage.test b/lib/Fuzzer/test/coverage.test
index fa11be502ef9..ff3fdff57a3d 100644
--- a/lib/Fuzzer/test/coverage.test
+++ b/lib/Fuzzer/test/coverage.test
@@ -1,9 +1,11 @@
+XFAIL: darwin
+
CHECK: COVERAGE:
CHECK-DAG: COVERED: {{.*}}in LLVMFuzzerTestOneInput {{.*}}NullDerefTest.cpp:13
CHECK-DAG: COVERED: {{.*}}in LLVMFuzzerTestOneInput {{.*}}NullDerefTest.cpp:14
CHECK-DAG: COVERED: {{.*}}in LLVMFuzzerTestOneInput {{.*}}NullDerefTest.cpp:16
CHECK-DAG: COVERED: {{.*}}in LLVMFuzzerTestOneInput {{.*}}NullDerefTest.cpp:19
-CHECK: COVERED_DIRS: {{.*}}lib/Fuzzer/test
+CHECK: COVERED_DIRS: {{.*}}lib{{[/\\]}}Fuzzer{{[/\\]}}test
RUN: not LLVMFuzzer-NullDerefTest -print_coverage=1 2>&1 | FileCheck %s
RUN: LLVMFuzzer-DSOTest -print_coverage=1 -runs=0 2>&1 | FileCheck %s --check-prefix=DSO
diff --git a/lib/Fuzzer/test/cxxstring.test b/lib/Fuzzer/test/cxxstring.test
new file mode 100644
index 000000000000..c60d7aee9686
--- /dev/null
+++ b/lib/Fuzzer/test/cxxstring.test
@@ -0,0 +1,2 @@
+RUN: not LLVMFuzzer-CxxStringEqTest -seed=1 -runs=1000000 2>&1 | FileCheck %s
+CHECK: BINGO
diff --git a/lib/Fuzzer/test/disable-leaks.test b/lib/Fuzzer/test/disable-leaks.test
new file mode 100644
index 000000000000..467b64ccc6f4
--- /dev/null
+++ b/lib/Fuzzer/test/disable-leaks.test
@@ -0,0 +1,4 @@
+REQUIRES: lsan
+RUN: LLVMFuzzer-AccumulateAllocationsTest -detect_leaks=1 -runs=100000 2>&1 | FileCheck %s --check-prefix=ACCUMULATE_ALLOCS
+ACCUMULATE_ALLOCS: INFO: libFuzzer disabled leak detection after every mutation
+
diff --git a/lib/Fuzzer/test/dump_coverage.test b/lib/Fuzzer/test/dump_coverage.test
index 9bd98daa3619..8acc8304fc60 100644
--- a/lib/Fuzzer/test/dump_coverage.test
+++ b/lib/Fuzzer/test/dump_coverage.test
@@ -1,16 +1,14 @@
-RUN: DIR=%t_workdir
-RUN: BUILD_DIR=$(pwd)
-RUN: rm -rf $DIR && mkdir -p $DIR && cd $DIR
-RUN: not $BUILD_DIR/LLVMFuzzer-NullDerefTest -dump_coverage=1 2>&1 | FileCheck %s
-RUN: $BUILD_DIR/LLVMFuzzer-DSOTest -dump_coverage=1 -runs=0 2>&1 | FileCheck %s --check-prefix=DSO
-RUN: not $BUILD_DIR/LLVMFuzzer-NullDerefTest -dump_coverage=0 2>&1 | FileCheck %s --check-prefix=NOCOV
-RUN: rm -rf $DIR
-
-
-CHECK: SanitizerCoverage: ./LLVMFuzzer-NullDerefTest.{{.*}}.sancov {{.*}} PCs written
-
-DSO: SanitizerCoverage: ./LLVMFuzzer-DSOTest.{{.*}}.sancov {{.*}} PCs written
-DSO-DAG: SanitizerCoverage: ./libLLVMFuzzer-DSO1.{{.*}}.sancov {{.*}} PCs written
-DSO-DAG: SanitizerCoverage: ./libLLVMFuzzer-DSO2.{{.*}}.sancov {{.*}} PCs written
+RUN: rm -rf %t_workdir && mkdir -p %t_workdir
+RUN: env ASAN_OPTIONS=coverage_dir='"%t_workdir"' not LLVMFuzzer-NullDerefTest -dump_coverage=1 2>&1 | FileCheck %s
+RUN: sancov -covered-functions LLVMFuzzer-NullDerefTest* %t_workdir/*.sancov | FileCheck %s --check-prefix=SANCOV
+RUN: env ASAN_OPTIONS=coverage_dir='"%t_workdir"' LLVMFuzzer-DSOTest -dump_coverage=1 -runs=0 2>&1 | FileCheck %s --check-prefix=DSO
+RUN: env ASAN_OPTIONS=coverage_dir='"%t_workdir"' not LLVMFuzzer-NullDerefTest -dump_coverage=0 2>&1 | FileCheck %s --check-prefix=NOCOV
+
+CHECK: SanitizerCoverage: {{.*}}LLVMFuzzer-NullDerefTest.{{.*}}.sancov {{.*}} PCs written
+SANCOV: LLVMFuzzerTestOneInput
+
+DSO: SanitizerCoverage: {{.*}}LLVMFuzzer-DSOTest.{{.*}}.sancov {{.*}} PCs written
+DSO-DAG: SanitizerCoverage: {{.*}}LLVMFuzzer-DSO1.{{.*}}.sancov {{.*}} PCs written
+DSO-DAG: SanitizerCoverage: {{.*}}LLVMFuzzer-DSO2.{{.*}}.sancov {{.*}} PCs written
NOCOV-NOT: SanitizerCoverage: {{.*}} PCs written
diff --git a/lib/Fuzzer/test/equivalence-signals.test b/lib/Fuzzer/test/equivalence-signals.test
new file mode 100644
index 000000000000..81a7f37602cc
--- /dev/null
+++ b/lib/Fuzzer/test/equivalence-signals.test
@@ -0,0 +1,9 @@
+REQUIRES: posix
+# Run EquivalenceATest against itself with a small timeout
+# to stress the signal handling and ensure that the shared-memory
+# channel tolerates the signals.
+
+RUN: LLVMFuzzer-EquivalenceATest -timeout=1 -run_equivalence_server=EQUIV_SIG_TEST & export APID=$!
+RUN: sleep 3
+RUN: LLVMFuzzer-EquivalenceATest -timeout=1 -use_equivalence_server=EQUIV_SIG_TEST -runs=500000 2>&1
+RUN: kill -9 $APID
diff --git a/lib/Fuzzer/test/equivalence.test b/lib/Fuzzer/test/equivalence.test
new file mode 100644
index 000000000000..015ba855c600
--- /dev/null
+++ b/lib/Fuzzer/test/equivalence.test
@@ -0,0 +1,8 @@
+REQUIRES: posix
+
+RUN: LLVMFuzzer-EquivalenceATest -run_equivalence_server=EQUIV_TEST & export APID=$!
+RUN: sleep 3
+RUN: not LLVMFuzzer-EquivalenceBTest -use_equivalence_server=EQUIV_TEST -max_len=4096 2>&1 | FileCheck %s
+CHECK: ERROR: libFuzzer: equivalence-mismatch. Sizes: {{.*}}; offset 2
+CHECK: SUMMARY: libFuzzer: equivalence-mismatch
+RUN: kill -9 $APID
diff --git a/lib/Fuzzer/test/extra-counters.test b/lib/Fuzzer/test/extra-counters.test
new file mode 100644
index 000000000000..61fce44784b7
--- /dev/null
+++ b/lib/Fuzzer/test/extra-counters.test
@@ -0,0 +1,6 @@
+REQUIRES: linux
+
+RUN: not LLVMFuzzer-TableLookupTest -print_final_stats=1 2>&1 | FileCheck %s
+CHECK: BINGO
+// Expecting >= 4096 new_units_added
+CHECK: stat::new_units_added:{{.*[4][0-9][0-9][0-9]}}
diff --git a/lib/Fuzzer/test/fuzzer-customcrossover.test b/lib/Fuzzer/test/fuzzer-customcrossover.test
index 28d39ce31dec..ccf8261af8ad 100644
--- a/lib/Fuzzer/test/fuzzer-customcrossover.test
+++ b/lib/Fuzzer/test/fuzzer-customcrossover.test
@@ -2,7 +2,7 @@ RUN: rm -rf %t/CustomCrossover
RUN: mkdir -p %t/CustomCrossover
RUN: echo "0123456789" > %t/CustomCrossover/digits
RUN: echo "abcdefghij" > %t/CustomCrossover/chars
-RUN: not LLVMFuzzer-CustomCrossOverTest -seed=1 -use_memcmp=0 -runs=100000 %t/CustomCrossover 2>&1 | FileCheck %s --check-prefix=LLVMFuzzerCustomCrossover
+RUN: not LLVMFuzzer-CustomCrossOverTest -seed=1 -runs=100000 %t/CustomCrossover 2>&1 | FileCheck %s --check-prefix=LLVMFuzzerCustomCrossover
RUN: rm -rf %t/CustomCrossover
LLVMFuzzerCustomCrossover: In LLVMFuzzerCustomCrossover
diff --git a/lib/Fuzzer/test/fuzzer-customcrossoverandmutate.test b/lib/Fuzzer/test/fuzzer-customcrossoverandmutate.test
new file mode 100644
index 000000000000..1e322ec0da63
--- /dev/null
+++ b/lib/Fuzzer/test/fuzzer-customcrossoverandmutate.test
@@ -0,0 +1 @@
+RUN: LLVMFuzzer-CustomCrossOverAndMutateTest -seed=1 -runs=100000
diff --git a/lib/Fuzzer/test/fuzzer-dirs.test b/lib/Fuzzer/test/fuzzer-dirs.test
index 63afe8dfcf9c..3de64f278f5d 100644
--- a/lib/Fuzzer/test/fuzzer-dirs.test
+++ b/lib/Fuzzer/test/fuzzer-dirs.test
@@ -5,9 +5,9 @@ RUN: echo b > %t/SUB1/SUB2/b
RUN: echo c > %t/SUB1/SUB2/SUB3/c
RUN: LLVMFuzzer-SimpleTest %t/SUB1 -runs=0 2>&1 | FileCheck %s --check-prefix=SUBDIRS
SUBDIRS: READ units: 3
-RUN: echo zzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzz > %t/SUB1/long
+RUN: echo -n zzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzz > %t/SUB1/long
RUN: LLVMFuzzer-SimpleTest %t/SUB1 -runs=0 2>&1 | FileCheck %s --check-prefix=LONG
-LONG: INFO: -max_len is not provided, using 94
+LONG: INFO: -max_len is not provided, using 93
RUN: rm -rf %t/SUB1
RUN: not LLVMFuzzer-SimpleTest NONEXISTENT_DIR 2>&1 | FileCheck %s --check-prefix=NONEXISTENT_DIR
diff --git a/lib/Fuzzer/test/fuzzer-jobs.test b/lib/Fuzzer/test/fuzzer-jobs.test
deleted file mode 100644
index 5bf8cfadfb75..000000000000
--- a/lib/Fuzzer/test/fuzzer-jobs.test
+++ /dev/null
@@ -1,29 +0,0 @@
-RUN: rm -rf %tmp
-RUN: mkdir %tmp && cd %tmp
-# Create a shared corpus directory
-RUN: rm -rf FuzzerJobsTestCORPUS
-RUN: mkdir FuzzerJobsTestCORPUS
-RUN: rm -f fuzz-{0,1}.log
-# Start fuzzer and in parallel check that the output files
-# that should be created exist.
-RUN: LLVMFuzzer-EmptyTest -max_total_time=4 -jobs=2 -workers=2 FuzzerJobsTestCORPUS > %t-fuzzer-jobs-test.log 2>&1 & export FUZZER_PID=$!
-# Wait a short while to give time for the child processes
-# to start fuzzing
-RUN: sleep 2
-# If the instances are running in parallel they should have created their log
-# files by now.
-RUN: ls fuzz-0.log
-RUN: ls fuzz-1.log
-# Wait for libfuzzer to finish.
-# This probably isn't portable but we need a way to block until
-# the fuzzer is done otherwise we might remove the files while
-# they are being used.
-RUN: while kill -0 ${FUZZER_PID}; do : ; done
-RUN: rm -f fuzz-{0,1}.log
-RUN: rm -rf FuzzerJobsTestCORPUS
-RUN: FileCheck -input-file=%t-fuzzer-jobs-test.log %s
-RUN: rm %t-fuzzer-jobs-test.log
-RUN: cd ../
-
-CHECK-DAG: Job 0 exited with exit code 0
-CHECK-DAG: Job 1 exited with exit code 0
diff --git a/lib/Fuzzer/test/fuzzer-leak.test b/lib/Fuzzer/test/fuzzer-leak.test
index 9cf5c743fff5..13e3ad740e6d 100644
--- a/lib/Fuzzer/test/fuzzer-leak.test
+++ b/lib/Fuzzer/test/fuzzer-leak.test
@@ -29,7 +29,5 @@ RUN: not LLVMFuzzer-LeakTimeoutTest -timeout=1 2>&1 | FileCheck %s --check-prefi
LEAK_TIMEOUT: ERROR: libFuzzer: timeout after
LEAK_TIMEOUT-NOT: LeakSanitizer
-RUN: LLVMFuzzer-AccumulateAllocationsTest -detect_leaks=1 -runs=100000 2>&1 | FileCheck %s --check-prefix=ACCUMULATE_ALLOCS
-ACCUMULATE_ALLOCS: INFO: libFuzzer disabled leak detection after every mutation
RUN: LLVMFuzzer-LeakTest -error_exitcode=0
diff --git a/lib/Fuzzer/test/fuzzer-oom.test b/lib/Fuzzer/test/fuzzer-oom.test
index 8caf649e9f04..e9d33552723e 100644
--- a/lib/Fuzzer/test/fuzzer-oom.test
+++ b/lib/Fuzzer/test/fuzzer-oom.test
@@ -1,10 +1,12 @@
+XFAIL: darwin
RUN: not LLVMFuzzer-OutOfMemoryTest -rss_limit_mb=300 2>&1 | FileCheck %s
+
CHECK: ERROR: libFuzzer: out-of-memory (used: {{.*}}; limit: 300Mb)
CHECK: Test unit written to ./oom-
SUMMARY: libFuzzer: out-of-memory
-RUN: not LLVMFuzzer-OutOfMemorySingleLargeMallocTest 2>&1 | FileCheck %s --check-prefix=SINGLE_LARGE_MALLOC
-SINGLE_LARGE_MALLOC: libFuzzer: out-of-memory (malloc(42{{.*}}))
+RUN: not LLVMFuzzer-OutOfMemorySingleLargeMallocTest -rss_limit_mb=300 2>&1 | FileCheck %s --check-prefix=SINGLE_LARGE_MALLOC
+SINGLE_LARGE_MALLOC: libFuzzer: out-of-memory (malloc(53{{.*}}))
SINGLE_LARGE_MALLOC: in LLVMFuzzerTestOneInput
# Check that -rss_limit_mb=0 means no limit.
diff --git a/lib/Fuzzer/test/fuzzer-segv.test b/lib/Fuzzer/test/fuzzer-segv.test
index 330f03bcc494..b9a6a5ce44ca 100644
--- a/lib/Fuzzer/test/fuzzer-segv.test
+++ b/lib/Fuzzer/test/fuzzer-segv.test
@@ -1,4 +1,4 @@
-RUN: ASAN_OPTIONS=handle_segv=0 not LLVMFuzzer-NullDerefTest 2>&1 | FileCheck %s --check-prefix=LIBFUZZER_OWN_SEGV_HANDLER
+RUN: env ASAN_OPTIONS=handle_segv=0 not LLVMFuzzer-NullDerefTest 2>&1 | FileCheck %s --check-prefix=LIBFUZZER_OWN_SEGV_HANDLER
LIBFUZZER_OWN_SEGV_HANDLER: == ERROR: libFuzzer: deadly signal
LIBFUZZER_OWN_SEGV_HANDLER: SUMMARY: libFuzzer: deadly signal
LIBFUZZER_OWN_SEGV_HANDLER: Test unit written to ./crash-
diff --git a/lib/Fuzzer/test/fuzzer-singleinputs.test b/lib/Fuzzer/test/fuzzer-singleinputs.test
index ca8403bff81f..500e5da8faa9 100644
--- a/lib/Fuzzer/test/fuzzer-singleinputs.test
+++ b/lib/Fuzzer/test/fuzzer-singleinputs.test
@@ -8,7 +8,7 @@ RUN: echo bbb > %tmp/SINGLE_INPUTS/bbb
RUN: LLVMFuzzer-SimpleTest %tmp/SINGLE_INPUTS/aaa %tmp/SINGLE_INPUTS/bbb 2>&1 | FileCheck %s --check-prefix=SINGLE_INPUTS
RUN: LLVMFuzzer-SimpleTest -max_len=2 %tmp/SINGLE_INPUTS/aaa %tmp/SINGLE_INPUTS/bbb 2>&1 | FileCheck %s --check-prefix=SINGLE_INPUTS
RUN: rm -rf %tmp/SINGLE_INPUTS
-SINGLE_INPUTS: LLVMFuzzer-SimpleTest: Running 2 inputs 1 time(s) each.
+SINGLE_INPUTS: LLVMFuzzer-SimpleTest{{.*}}: Running 2 inputs 1 time(s) each.
SINGLE_INPUTS: aaa in
SINGLE_INPUTS: bbb in
SINGLE_INPUTS: NOTE: fuzzing was not performed, you have only
diff --git a/lib/Fuzzer/test/fuzzer-traces-hooks.test b/lib/Fuzzer/test/fuzzer-traces-hooks.test
index 71fe6f2daf11..f93a8b7199e2 100644
--- a/lib/Fuzzer/test/fuzzer-traces-hooks.test
+++ b/lib/Fuzzer/test/fuzzer-traces-hooks.test
@@ -1,25 +1,17 @@
-// FIXME: Support sanitizer hooks for memcmp and strcmp need
-// to be implemented in the sanitizer runtime for platforms other
-// than linux
-REQUIRES: linux
+// FIXME: Support for sanitizer hooks for memcmp and strcmp needs to
+// be implemented in the sanitizer runtime for this test
+UNSUPPORTED: windows
CHECK: BINGO
-Done1000000: Done 1000000 runs in
-RUN: not LLVMFuzzer-MemcmpTest -seed=4294967295 -runs=100000 2>&1 | FileCheck %s
-RUN: LLVMFuzzer-MemcmpTest -use_memcmp=0 -seed=4294967295 -runs=1000000 2>&1 | FileCheck %s --check-prefix=Done1000000
+RUN: not LLVMFuzzer-MemcmpTest -seed=1 -runs=2000000 2>&1 | FileCheck %s
+RUN: not LLVMFuzzer-StrncmpTest -seed=1 -runs=2000000 2>&1 | FileCheck %s
+RUN: not LLVMFuzzer-StrcmpTest -seed=1 -runs=2000000 2>&1 | FileCheck %s
+RUN: not LLVMFuzzer-StrstrTest -seed=1 -runs=2000000 2>&1 | FileCheck %s
-RUN: not LLVMFuzzer-StrncmpTest -seed=2 -runs=100000 2>&1 | FileCheck %s
-RUN: LLVMFuzzer-StrncmpTest -use_memcmp=0 -seed=3 -runs=1000000 2>&1 | FileCheck %s --check-prefix=Done1000000
+RUN: not LLVMFuzzer-Memcmp64BytesTest -seed=1 -runs=1000000 2>&1 | FileCheck %s
-RUN: not LLVMFuzzer-StrcmpTest -seed=4 -runs=200000 2>&1 | FileCheck %s
-RUN: LLVMFuzzer-StrcmpTest -use_memcmp=0 -seed=5 -runs=1000000 2>&1 | FileCheck %s --check-prefix=Done1000000
-
-RUN: not LLVMFuzzer-StrstrTest -seed=6 -runs=200000 2>&1 | FileCheck %s
-RUN: LLVMFuzzer-StrstrTest -use_memmem=0 -seed=7 -runs=1000000 2>&1 | FileCheck %s --check-prefix=Done1000000
-
-RUN: LLVMFuzzer-RepeatedMemcmp -seed=10 -runs=100000 2>&1 | FileCheck %s --check-prefix=RECOMMENDED_DICT
+RUN: LLVMFuzzer-RepeatedMemcmp -seed=11 -runs=100000 2>&1 | FileCheck %s --check-prefix=RECOMMENDED_DICT
RECOMMENDED_DICT:###### Recommended dictionary. ######
RECOMMENDED_DICT-DAG: "foo"
RECOMMENDED_DICT-DAG: "bar"
RECOMMENDED_DICT:###### End of recommended dictionary. ######
-
diff --git a/lib/Fuzzer/test/fuzzer.test b/lib/Fuzzer/test/fuzzer.test
index 2f91c2195ca9..ff46d32b387d 100644
--- a/lib/Fuzzer/test/fuzzer.test
+++ b/lib/Fuzzer/test/fuzzer.test
@@ -11,7 +11,7 @@ MaxTotalTime: Done {{.*}} runs in {{.}} second(s)
RUN: not LLVMFuzzer-NullDerefTest 2>&1 | FileCheck %s --check-prefix=NullDerefTest
RUN: not LLVMFuzzer-NullDerefTest -close_fd_mask=3 2>&1 | FileCheck %s --check-prefix=NullDerefTest
-NullDerefTest: ERROR: AddressSanitizer: SEGV on unknown address
+NullDerefTest: ERROR: AddressSanitizer: {{SEGV|access-violation}} on unknown address
NullDerefTest: Test unit written to ./crash-
RUN: not LLVMFuzzer-NullDerefTest -artifact_prefix=ZZZ 2>&1 | FileCheck %s --check-prefix=NullDerefTestPrefix
NullDerefTestPrefix: Test unit written to ZZZcrash-
@@ -34,7 +34,7 @@ COUNTERS: BINGO
DISABLED: not LLVMFuzzer-UninstrumentedTest-Uninstrumented 2>&1 | FileCheck %s --check-prefix=UNINSTRUMENTED
UNINSTRUMENTED: ERROR: __sanitizer_set_death_callback is not defined. Exiting.
-RUN: not LLVMFuzzer-UninstrumentedTest-NoCoverage 2>&1 | FileCheck %s --check-prefix=NO_COVERAGE
+RUN: not LLVMFuzzer-NotinstrumentedTest-NoCoverage 2>&1 | FileCheck %s --check-prefix=NO_COVERAGE
NO_COVERAGE: ERROR: no interesting inputs were found. Is the code instrumented for coverage? Exiting
RUN: not LLVMFuzzer-BufferOverflowOnInput 2>&1 | FileCheck %s --check-prefix=OOB
@@ -51,7 +51,10 @@ RUN: LLVMFuzzer-SimpleTest -exit_on_src_pos=SimpleTest.cpp:17 2
RUN: LLVMFuzzer-ShrinkControlFlowTest -exit_on_src_pos=ShrinkControlFlowTest.cpp:23 2>&1 | FileCheck %s --check-prefix=EXIT_ON_SRC_POS
EXIT_ON_SRC_POS: INFO: found line matching '{{.*}}', exiting.
-RUN: ASAN_OPTIONS=strict_string_checks=1 not LLVMFuzzer-StrncmpOOBTest -seed=1 -runs=1000000 2>&1 | FileCheck %s --check-prefix=STRNCMP
+RUN: env ASAN_OPTIONS=strict_string_checks=1 not LLVMFuzzer-StrncmpOOBTest -seed=1 -runs=1000000 2>&1 | FileCheck %s --check-prefix=STRNCMP
STRNCMP: AddressSanitizer: heap-buffer-overflow
STRNCMP-NOT: __sanitizer_weak_hook_strncmp
STRNCMP: in LLVMFuzzerTestOneInput
+
+RUN: not LLVMFuzzer-BogusInitializeTest 2>&1 | FileCheck %s --check-prefix=BOGUS_INITIALIZE
+BOGUS_INITIALIZE: argv[0] has been modified in LLVMFuzzerInitialize
diff --git a/lib/Fuzzer/test/lit.cfg b/lib/Fuzzer/test/lit.cfg
index 745af0c38245..85c95b42d1ea 100644
--- a/lib/Fuzzer/test/lit.cfg
+++ b/lib/Fuzzer/test/lit.cfg
@@ -6,6 +6,23 @@ config.test_format = lit.formats.ShTest(True)
config.suffixes = ['.test']
config.test_source_root = os.path.dirname(__file__)
+# Choose between lit's internal shell pipeline runner and a real shell. If
+# LIT_USE_INTERNAL_SHELL is in the environment, we use that as an override.
+use_lit_shell = os.environ.get("LIT_USE_INTERNAL_SHELL")
+if use_lit_shell:
+ # 0 is external, "" is default, and everything else is internal.
+ execute_external = (use_lit_shell == "0")
+else:
+ # Otherwise we default to internal on Windows and external elsewhere, as
+ # bash on Windows is usually very slow.
+ execute_external = (not sys.platform in ['win32'])
+
+# testFormat: The test format to use to interpret tests.
+#
+# For now we require '&&' between commands, until they get globally killed and
+# the test runner updated.
+config.test_format = lit.formats.ShTest(execute_external)
+
# Tweak PATH to include llvm tools dir and current exec dir.
llvm_tools_dir = getattr(config, 'llvm_tools_dir', None)
if (not llvm_tools_dir) or (not os.path.exists(llvm_tools_dir)):
@@ -20,6 +37,15 @@ if config.has_lsan:
else:
lit_config.note('lsan feature unavailable')
+if sys.platform.startswith('win') or sys.platform.startswith('cygwin'):
+ config.available_features.add('windows')
+
+if sys.platform.startswith('darwin'):
+ config.available_features.add('darwin')
+
+if config.is_posix:
+ config.available_features.add('posix')
+
if sys.platform.startswith('linux'):
# Note the value of ``sys.platform`` is not consistent
# between python 2 and 3, hence the use of ``.startswith()``.
diff --git a/lib/Fuzzer/test/lit.site.cfg.in b/lib/Fuzzer/test/lit.site.cfg.in
index 03e86c487ca9..069f2b72c0d9 100644
--- a/lib/Fuzzer/test/lit.site.cfg.in
+++ b/lib/Fuzzer/test/lit.site.cfg.in
@@ -1,4 +1,5 @@
config.test_exec_root = "@CMAKE_CURRENT_BINARY_DIR@"
config.llvm_tools_dir = "@LLVM_TOOLS_DIR@"
config.has_lsan = True if @HAS_LSAN@ == 1 else False
+config.is_posix = @LIBFUZZER_POSIX@
lit_config.load_config(config, "@CMAKE_CURRENT_SOURCE_DIR@/lit.cfg")
diff --git a/lib/Fuzzer/test/merge-posix.test b/lib/Fuzzer/test/merge-posix.test
new file mode 100644
index 000000000000..47b90b986791
--- /dev/null
+++ b/lib/Fuzzer/test/merge-posix.test
@@ -0,0 +1,23 @@
+REQUIRES: posix
+
+RUN: rm -rf %tmp/T1 %tmp/T2
+RUN: mkdir -p %tmp/T1 %tmp/T2
+
+RUN: echo F..... > %tmp/T1/1
+RUN: echo .U.... > %tmp/T1/2
+RUN: echo ..Z... > %tmp/T1/3
+
+RUN: echo .....F > %tmp/T2/1
+RUN: echo ....U. > %tmp/T2/2
+RUN: echo ...Z.. > %tmp/T2/3
+RUN: echo ...Z.. > %tmp/T2/4
+RUN: echo ....E. > %tmp/T2/5
+RUN: echo .....R > %tmp/T2/6
+
+# Check that we report an error if the file size limit is exceeded
+RUN: (ulimit -f 1; not LLVMFuzzer-FullCoverageSetTest -merge=1 %tmp/T1 %tmp/T2 2>&1 | FileCheck %s --check-prefix=SIGXFSZ)
+SIGXFSZ: ERROR: libFuzzer: file size exceeded
+
+# Check that we honor TMPDIR
+RUN: TMPDIR=DIR_DOES_NOT_EXIST not LLVMFuzzer-FullCoverageSetTest -merge=1 %tmp/T1 %tmp/T2 2>&1 | FileCheck %s --check-prefix=TMPDIR
+TMPDIR: MERGE-OUTER: failed to write to the control file: DIR_DOES_NOT_EXIST/libFuzzerTemp
diff --git a/lib/Fuzzer/test/merge-summary.test b/lib/Fuzzer/test/merge-summary.test
new file mode 100644
index 000000000000..df9d62dec636
--- /dev/null
+++ b/lib/Fuzzer/test/merge-summary.test
@@ -0,0 +1,15 @@
+RUN: rm -rf %t/T1 %t/T2
+RUN: mkdir -p %t/T0 %t/T1 %t/T2
+RUN: echo ...Z.. > %t/T2/1
+RUN: echo ....E. > %t/T2/2
+RUN: echo .....R > %t/T2/3
+RUN: echo F..... > %t/T2/a
+RUN: echo .U.... > %t/T2/b
+RUN: echo ..Z... > %t/T2/c
+
+RUN: LLVMFuzzer-FullCoverageSetTest -merge=1 %t/T1 %t/T2 -save_coverage_summary=%t/SUMMARY 2>&1 | FileCheck %s --check-prefix=SAVE_SUMMARY
+SAVE_SUMMARY: MERGE-OUTER: writing coverage summary for 6 files to {{.*}}SUMMARY
+RUN: rm %t/T1/*
+RUN: LLVMFuzzer-FullCoverageSetTest -merge=1 %t/T1 %t/T2 -load_coverage_summary=%t/SUMMARY 2>&1 | FileCheck %s --check-prefix=LOAD_SUMMARY
+LOAD_SUMMARY: MERGE-OUTER: coverage summary loaded from {{.*}}SUMMAR
+LOAD_SUMMARY: MERGE-OUTER: 0 new files with 0 new features added
diff --git a/lib/Fuzzer/test/merge.test b/lib/Fuzzer/test/merge.test
index 5c7d30e41caa..e59da8c3e091 100644
--- a/lib/Fuzzer/test/merge.test
+++ b/lib/Fuzzer/test/merge.test
@@ -1,12 +1,13 @@
CHECK: BINGO
-RUN: rm -rf %tmp/T1 %tmp/T2
-RUN: mkdir -p %tmp/T1 %tmp/T2
-RUN: echo F..... > %tmp/T1/1
-RUN: echo .U.... > %tmp/T1/2
-RUN: echo ..Z... > %tmp/T1/3
+RUN: rm -rf %tmp/T0 %tmp/T1 %tmp/T2
+RUN: mkdir -p %tmp/T0 %tmp/T1 %tmp/T2
+RUN: echo F..... > %tmp/T0/1
+RUN: echo .U.... > %tmp/T0/2
+RUN: echo ..Z... > %tmp/T0/3
# T1 has 3 elements, T2 is empty.
+RUN: cp %tmp/T0/* %tmp/T1/
RUN: LLVMFuzzer-FullCoverageSetTest -merge=1 %tmp/T1 %tmp/T2 2>&1 | FileCheck %s --check-prefix=CHECK1
CHECK1: MERGE-OUTER: 3 files, 3 in the initial corpus
CHECK1: MERGE-OUTER: 0 new files with 0 new features added
@@ -29,13 +30,15 @@ CHECK3: MERGE-OUTER: 12 files, 6 in the initial corpus
CHECK3: MERGE-OUTER: 0 new files with 0 new features added
# Check that we respect max_len during the merge and don't crash.
-RUN: rm %tmp/T1/??*
+RUN: rm %tmp/T1/*
+RUN: cp %tmp/T0/* %tmp/T1/
RUN: echo looooooooong > %tmp/T2/looooooooong
RUN: LLVMFuzzer-FullCoverageSetTest -merge=1 %tmp/T1 %tmp/T2 -max_len=6 2>&1 | FileCheck %s --check-prefix=MAX_LEN
MAX_LEN: MERGE-OUTER: 3 new files
# Check that merge tolerates failures.
-RUN: rm %tmp/T1/??*
+RUN: rm %tmp/T1/*
+RUN: cp %tmp/T0/* %tmp/T1/
RUN: echo 'FUZZER' > %tmp/T2/FUZZER
RUN: LLVMFuzzer-FullCoverageSetTest -merge=1 %tmp/T1 %tmp/T2 2>&1 | FileCheck %s --check-prefix=MERGE_WITH_CRASH
MERGE_WITH_CRASH: MERGE-OUTER: succesfull in 2 attempt(s)
@@ -45,10 +48,6 @@ MERGE_WITH_CRASH: MERGE-OUTER: 3 new files
RUN: LLVMFuzzer-FullCoverageSetTest -merge=1 %tmp/T1 %tmp/T2 -max_len=5 2>&1 | FileCheck %s --check-prefix=MERGE_LEN5
MERGE_LEN5: MERGE-OUTER: succesfull in 1 attempt(s)
-# Check that we honor TMPDIR
-RUN: TMPDIR=DIR_DOES_NOT_EXIST not LLVMFuzzer-FullCoverageSetTest -merge=1 %tmp/T1 %tmp/T2 2>&1 | FileCheck %s --check-prefix=TMPDIR
-TMPDIR: MERGE-OUTER: failed to write to the control file: DIR_DOES_NOT_EXIST/libFuzzerTemp
-
-# Check that we can report an error if file size exceeded
-RUN: (ulimit -f 1; not LLVMFuzzer-FullCoverageSetTest -merge=1 %tmp/T1 %tmp/T2 2>&1 | FileCheck %s --check-prefix=SIGXFSZ)
-SIGXFSZ: ERROR: libFuzzer: file size exceeded
+RUN: rm -rf %tmp/T1/* %tmp/T2/*
+RUN: not LLVMFuzzer-FullCoverageSetTest -merge=1 %tmp/T1 %tmp/T2 2>&1 | FileCheck %s --check-prefix=EMPTY
+EMPTY: MERGE-OUTER: zero succesfull attempts, exiting
diff --git a/lib/Fuzzer/test/minimize_crash.test b/lib/Fuzzer/test/minimize_crash.test
index 7e5406598e4a..5643c6bacb09 100644
--- a/lib/Fuzzer/test/minimize_crash.test
+++ b/lib/Fuzzer/test/minimize_crash.test
@@ -1,6 +1,13 @@
RUN: echo 'Hi!rv349f34t3gg' > not_minimal_crash
RUN: LLVMFuzzer-NullDerefTest -minimize_crash=1 not_minimal_crash -max_total_time=2 2>&1 | FileCheck %s
-CHECK: CRASH_MIN: failed to minimize beyond minimized-from-{{.*}} (3 bytes), exiting
+CHECK: CRASH_MIN: failed to minimize beyond ./minimized-from-{{.*}} (3 bytes), exiting
RUN: LLVMFuzzer-NullDerefTest -minimize_crash=1 not_minimal_crash -max_total_time=2 -exact_artifact_path=exact_minimized_path 2>&1 | FileCheck %s --check-prefix=CHECK_EXACT
CHECK_EXACT: CRASH_MIN: failed to minimize beyond exact_minimized_path (3 bytes), exiting
RUN: rm not_minimal_crash minimized-from-* exact_minimized_path
+
+RUN: echo -n 'abcd*xyz' > not_minimal_crash
+RUN: LLVMFuzzer-SingleByteInputTest -minimize_crash=1 not_minimal_crash -exact_artifact_path=exact_minimized_path 2>&1 | FileCheck %s --check-prefix=MIN1
+MIN1: Test unit written to exact_minimized_path
+MIN1: Test unit written to exact_minimized_path
+MIN1: INFO: The input is small enough, exiting
+MIN1: CRASH_MIN: failed to minimize beyond exact_minimized_path (1 bytes), exiting
diff --git a/lib/Fuzzer/test/minimize_two_crashes.test b/lib/Fuzzer/test/minimize_two_crashes.test
new file mode 100644
index 000000000000..2358d8c2a92e
--- /dev/null
+++ b/lib/Fuzzer/test/minimize_two_crashes.test
@@ -0,0 +1,16 @@
+# Test that the minimizer stops when it sees a different bug.
+
+RUN: rm -rf %t && mkdir %t
+RUN: echo H12345678901234667888090 > %t/long_crash
+RUN: env ASAN_OPTIONS=dedup_token_length=3 LLVMFuzzer-TwoDifferentBugsTest -seed=1 -minimize_crash=1 %t/long_crash -exact_artifact_path=%t/result 2>&1 | FileCheck %s
+
+CHECK: DedupToken1: DEDUP_TOKEN: Bar
+CHECK: DedupToken2: DEDUP_TOKEN: Bar
+CHECK: DedupToken1: DEDUP_TOKEN: Bar
+CHECK: DedupToken2: DEDUP_TOKEN: Foo
+CHECK: CRASH_MIN: mismatch in dedup tokens
+
+RUN: not LLVMFuzzer-TwoDifferentBugsTest %t/result 2>&1 | FileCheck %s --check-prefix=VERIFY
+
+VERIFY: ERROR: AddressSanitizer:
+VERIFY: in Bar
diff --git a/lib/Fuzzer/test/no-coverage/CMakeLists.txt b/lib/Fuzzer/test/no-coverage/CMakeLists.txt
index d2f6f438ad79..52e7240333ee 100644
--- a/lib/Fuzzer/test/no-coverage/CMakeLists.txt
+++ b/lib/Fuzzer/test/no-coverage/CMakeLists.txt
@@ -5,7 +5,7 @@ set(CMAKE_CXX_FLAGS
"${LIBFUZZER_FLAGS_BASE} -fno-sanitize-coverage=edge,trace-cmp,indirect-calls,8bit-counters,trace-pc-guard")
set(NoCoverageTests
- UninstrumentedTest
+ NotinstrumentedTest
)
foreach(Test ${NoCoverageTests})
@@ -16,14 +16,14 @@ endforeach()
###############################################################################
# AFL Driver test
###############################################################################
+if(NOT MSVC)
+ add_executable(AFLDriverTest
+ ../AFLDriverTest.cpp ../../afl/afl_driver.cpp)
-add_executable(AFLDriverTest
- ../AFLDriverTest.cpp ../../afl/afl_driver.cpp)
+ set_target_properties(AFLDriverTest
+ PROPERTIES RUNTIME_OUTPUT_DIRECTORY
+ "${CMAKE_BINARY_DIR}/lib/Fuzzer/test"
+ )
-set_target_properties(AFLDriverTest
- PROPERTIES RUNTIME_OUTPUT_DIRECTORY
- "${CMAKE_BINARY_DIR}/lib/Fuzzer/test"
- )
-
-# Propagate value into parent directory
-set(TestBinaries ${TestBinaries} AFLDriverTest PARENT_SCOPE)
+ add_dependencies(TestBinaries AFLDriverTest)
+endif()
diff --git a/lib/Fuzzer/test/trace-malloc-2.test b/lib/Fuzzer/test/trace-malloc-2.test
new file mode 100644
index 000000000000..7719b650c791
--- /dev/null
+++ b/lib/Fuzzer/test/trace-malloc-2.test
@@ -0,0 +1,8 @@
+// FIXME: This test loops forever on darwin because it crashes
+// repeatedly while printing a stack trace
+UNSUPPORTED: darwin
+
+RUN: LLVMFuzzer-TraceMallocTest -seed=1 -trace_malloc=2 -runs=1000 2>&1 | FileCheck %s --check-prefix=TRACE2
+TRACE2-DAG: FREE[0]
+TRACE2-DAG: MALLOC[0]
+TRACE2-DAG: in LLVMFuzzerTestOneInput
diff --git a/lib/Fuzzer/test/trace-malloc.test b/lib/Fuzzer/test/trace-malloc.test
index c95147904d42..25694cc2de5c 100644
--- a/lib/Fuzzer/test/trace-malloc.test
+++ b/lib/Fuzzer/test/trace-malloc.test
@@ -3,8 +3,3 @@ CHECK-DAG: MallocFreeTracer: STOP 0 0 (same)
CHECK-DAG: MallocFreeTracer: STOP 0 1 (DIFFERENT)
CHECK-DAG: MallocFreeTracer: STOP 1 0 (DIFFERENT)
CHECK-DAG: MallocFreeTracer: STOP 1 1 (same)
-
-RUN: LLVMFuzzer-TraceMallocTest -seed=1 -trace_malloc=2 -runs=1000 2>&1 | FileCheck %s --check-prefix=TRACE2
-TRACE2-DAG: FREE[0]
-TRACE2-DAG: MALLOC[0]
-TRACE2-DAG: in LLVMFuzzerTestOneInput
diff --git a/lib/Fuzzer/test/trace-pc.test b/lib/Fuzzer/test/trace-pc.test
new file mode 100644
index 000000000000..3709677b71b6
--- /dev/null
+++ b/lib/Fuzzer/test/trace-pc.test
@@ -0,0 +1,2 @@
+CHECK: BINGO
+RUN: LLVMFuzzer-SimpleTest-TracePC -runs=100000 -seed=1 2>&1 | FileCheck %s
diff --git a/lib/Fuzzer/test/trace-pc/CMakeLists.txt b/lib/Fuzzer/test/trace-pc/CMakeLists.txt
new file mode 100644
index 000000000000..e800f82cc5dc
--- /dev/null
+++ b/lib/Fuzzer/test/trace-pc/CMakeLists.txt
@@ -0,0 +1,13 @@
+# These tests are not instrumented with the default coverage and don't
+# have the coverage runtime in the binary; they use bare trace-pc instead.
+
+set(CMAKE_CXX_FLAGS
+ "${LIBFUZZER_FLAGS_BASE} -fno-sanitize-coverage=edge,trace-cmp,indirect-calls,8bit-counters,trace-pc-guard -fsanitize-coverage=trace-pc")
+
+set(TracePCTests
+ SimpleTest
+ )
+
+foreach(Test ${TracePCTests})
+ add_libfuzzer_test(${Test}-TracePC SOURCES ../${Test}.cpp)
+endforeach()
diff --git a/lib/Fuzzer/test/ubsan/CMakeLists.txt b/lib/Fuzzer/test/ubsan/CMakeLists.txt
index 7a9eacdbe7df..55e0a118186b 100644
--- a/lib/Fuzzer/test/ubsan/CMakeLists.txt
+++ b/lib/Fuzzer/test/ubsan/CMakeLists.txt
@@ -10,6 +10,3 @@ set(UbsanTests
foreach(Test ${UbsanTests})
add_libfuzzer_test(${Test}-Ubsan SOURCES ../${Test}.cpp)
endforeach()
-
-# Propagate value into parent directory
-set(TestBinaries ${TestBinaries} PARENT_SCOPE)
diff --git a/lib/Fuzzer/test/ulimit.test b/lib/Fuzzer/test/ulimit.test
index a60636c351bd..c2faca13f728 100644
--- a/lib/Fuzzer/test/ulimit.test
+++ b/lib/Fuzzer/test/ulimit.test
@@ -1,2 +1,4 @@
+REQUIRES: posix
+
RUN: ulimit -s 1000
RUN: LLVMFuzzer-SimpleTest
diff --git a/lib/Fuzzer/test/uninstrumented/CMakeLists.txt b/lib/Fuzzer/test/uninstrumented/CMakeLists.txt
index 29b66e6e586a..f4ab59e5b18d 100644
--- a/lib/Fuzzer/test/uninstrumented/CMakeLists.txt
+++ b/lib/Fuzzer/test/uninstrumented/CMakeLists.txt
@@ -11,6 +11,3 @@ set(UninstrumentedTests
foreach(Test ${UninstrumentedTests})
add_libfuzzer_test(${Test}-Uninstrumented SOURCES ../${Test}.cpp)
endforeach()
-
-# Propagate value into parent directory
-set(TestBinaries ${TestBinaries} PARENT_SCOPE)
diff --git a/lib/Fuzzer/test/value-profile-div.test b/lib/Fuzzer/test/value-profile-div.test
index ba45e4129d30..b966a8916512 100644
--- a/lib/Fuzzer/test/value-profile-div.test
+++ b/lib/Fuzzer/test/value-profile-div.test
@@ -1,3 +1,3 @@
-CHECK: AddressSanitizer: FPE
+CHECK: AddressSanitizer: {{FPE|int-divide-by-zero}}
RUN: not LLVMFuzzer-DivTest -seed=1 -use_value_profile=1 -runs=10000000 2>&1 | FileCheck %s
diff --git a/lib/Fuzzer/test/value-profile-mem.test b/lib/Fuzzer/test/value-profile-mem.test
index 09d737dbe736..880b2692910a 100644
--- a/lib/Fuzzer/test/value-profile-mem.test
+++ b/lib/Fuzzer/test/value-profile-mem.test
@@ -1,2 +1,2 @@
CHECK: BINGO
-RUN: not LLVMFuzzer-SingleMemcmpTest -seed=1 -use_cmp=0 -use_memcmp=0 -use_value_profile=1 -runs=10000000 2>&1 | FileCheck %s
+RUN: not LLVMFuzzer-SingleMemcmpTest -seed=1 -use_cmp=0 -use_value_profile=1 -runs=10000000 2>&1 | FileCheck %s
diff --git a/lib/Fuzzer/test/value-profile-strcmp.test b/lib/Fuzzer/test/value-profile-strcmp.test
index 1e7ef9b45e96..7f1047594548 100644
--- a/lib/Fuzzer/test/value-profile-strcmp.test
+++ b/lib/Fuzzer/test/value-profile-strcmp.test
@@ -1,2 +1,2 @@
CHECK: BINGO
-RUN: not LLVMFuzzer-SingleStrcmpTest -seed=1 -use_cmp=0 -use_memcmp=0 -use_value_profile=1 -runs=10000000 2>&1 | FileCheck %s
+RUN: not LLVMFuzzer-SingleStrcmpTest -seed=1 -use_cmp=0 -use_value_profile=1 -runs=10000000 2>&1 | FileCheck %s
diff --git a/lib/Fuzzer/test/value-profile-strncmp.test b/lib/Fuzzer/test/value-profile-strncmp.test
index 650973180c06..84a74c4f0ad2 100644
--- a/lib/Fuzzer/test/value-profile-strncmp.test
+++ b/lib/Fuzzer/test/value-profile-strncmp.test
@@ -1,2 +1,2 @@
CHECK: BINGO
-RUN: not LLVMFuzzer-SingleStrncmpTest -seed=1 -use_cmp=0 -use_memcmp=0 -use_value_profile=1 -runs=10000000 2>&1 | FileCheck %s
+RUN: not LLVMFuzzer-SingleStrncmpTest -seed=1 -use_cmp=0 -use_value_profile=1 -runs=100000000 2>&1 | FileCheck %s
diff --git a/lib/IR/AsmWriter.cpp b/lib/IR/AsmWriter.cpp
index eecef9423f2e..d0b77e7218b9 100644
--- a/lib/IR/AsmWriter.cpp
+++ b/lib/IR/AsmWriter.cpp
@@ -21,6 +21,7 @@
#include "llvm/ADT/SmallString.h"
#include "llvm/ADT/StringExtras.h"
#include "llvm/IR/AssemblyAnnotationWriter.h"
+#include "llvm/IR/Attributes.h"
#include "llvm/IR/CFG.h"
#include "llvm/IR/CallingConv.h"
#include "llvm/IR/Constants.h"
@@ -832,7 +833,7 @@ void SlotTracker::processModule() {
// Add all the function attributes to the table.
// FIXME: Add attributes of other objects?
AttributeSet FnAttrs = F.getAttributes().getFnAttributes();
- if (FnAttrs.hasAttributes(AttributeSet::FunctionIndex))
+ if (FnAttrs.hasAttributes())
CreateAttributeSetSlot(FnAttrs);
}
@@ -867,15 +868,10 @@ void SlotTracker::processFunction() {
// We allow direct calls to any llvm.foo function here, because the
// target may not be linked into the optimizer.
- if (const CallInst *CI = dyn_cast<CallInst>(&I)) {
+ if (auto CS = ImmutableCallSite(&I)) {
// Add all the call attributes to the table.
- AttributeSet Attrs = CI->getAttributes().getFnAttributes();
- if (Attrs.hasAttributes(AttributeSet::FunctionIndex))
- CreateAttributeSetSlot(Attrs);
- } else if (const InvokeInst *II = dyn_cast<InvokeInst>(&I)) {
- // Add all the call attributes to the table.
- AttributeSet Attrs = II->getAttributes().getFnAttributes();
- if (Attrs.hasAttributes(AttributeSet::FunctionIndex))
+ AttributeSet Attrs = CS.getAttributes().getFnAttributes();
+ if (Attrs.hasAttributes())
CreateAttributeSetSlot(Attrs);
}
}
@@ -1016,8 +1012,7 @@ void SlotTracker::CreateMetadataSlot(const MDNode *N) {
}
void SlotTracker::CreateAttributeSetSlot(AttributeSet AS) {
- assert(AS.hasAttributes(AttributeSet::FunctionIndex) &&
- "Doesn't need a slot!");
+ assert(AS.hasAttributes() && "Doesn't need a slot!");
as_iterator I = asMap.find(AS);
if (I != asMap.end())
@@ -1073,6 +1068,8 @@ static void WriteOptimizationInfo(raw_ostream &Out, const User *U) {
Out << " nsz";
if (FPO->hasAllowReciprocal())
Out << " arcp";
+ if (FPO->hasAllowContract())
+ Out << " contract";
}
}
@@ -1614,6 +1611,9 @@ static void writeDIDerivedType(raw_ostream &Out, const DIDerivedType *N,
Printer.printInt("offset", N->getOffsetInBits());
Printer.printDIFlags("flags", N->getFlags());
Printer.printMetadata("extraData", N->getRawExtraData());
+ if (const auto &DWARFAddressSpace = N->getDWARFAddressSpace())
+ Printer.printInt("dwarfAddressSpace", *DWARFAddressSpace,
+ /* ShouldSkipZero */ false);
Out << ")";
}
@@ -1688,6 +1688,8 @@ static void writeDICompileUnit(raw_ostream &Out, const DICompileUnit *N,
Printer.printMetadata("macros", N->getRawMacros());
Printer.printInt("dwoId", N->getDWOId());
Printer.printBool("splitDebugInlining", N->getSplitDebugInlining(), true);
+ Printer.printBool("debugInfoForProfiling", N->getDebugInfoForProfiling(),
+ false);
Out << ")";
}
@@ -2083,7 +2085,8 @@ public:
void printModule(const Module *M);
void writeOperand(const Value *Op, bool PrintType);
- void writeParamOperand(const Value *Operand, AttributeSet Attrs,unsigned Idx);
+ void writeParamOperand(const Value *Operand, AttributeList Attrs,
+ unsigned Idx);
void writeOperandBundles(ImmutableCallSite CS);
void writeAtomic(AtomicOrdering Ordering, SynchronizationScope SynchScope);
void writeAtomicCmpXchg(AtomicOrdering SuccessOrdering,
@@ -2099,7 +2102,7 @@ public:
void printIndirectSymbol(const GlobalIndirectSymbol *GIS);
void printComdat(const Comdat *C);
void printFunction(const Function *F);
- void printArgument(const Argument *FA, AttributeSet Attrs, unsigned Idx);
+ void printArgument(const Argument *FA, AttributeList Attrs, unsigned Idx);
void printBasicBlock(const BasicBlock *BB);
void printInstructionLine(const Instruction &I);
void printInstruction(const Instruction &I);
@@ -2178,7 +2181,7 @@ void AssemblyWriter::writeAtomicCmpXchg(AtomicOrdering SuccessOrdering,
}
void AssemblyWriter::writeParamOperand(const Value *Operand,
- AttributeSet Attrs, unsigned Idx) {
+ AttributeList Attrs, unsigned Idx) {
if (!Operand) {
Out << "<null operand!>";
return;
@@ -2596,19 +2599,12 @@ void AssemblyWriter::printFunction(const Function *F) {
if (F->isMaterializable())
Out << "; Materializable\n";
- const AttributeSet &Attrs = F->getAttributes();
- if (Attrs.hasAttributes(AttributeSet::FunctionIndex)) {
+ const AttributeList &Attrs = F->getAttributes();
+ if (Attrs.hasAttributes(AttributeList::FunctionIndex)) {
AttributeSet AS = Attrs.getFnAttributes();
std::string AttrStr;
- unsigned Idx = 0;
- for (unsigned E = AS.getNumSlots(); Idx != E; ++Idx)
- if (AS.getSlotIndex(Idx) == AttributeSet::FunctionIndex)
- break;
-
- for (AttributeSet::iterator I = AS.begin(Idx), E = AS.end(Idx);
- I != E; ++I) {
- Attribute Attr = *I;
+ for (const Attribute &Attr : AS) {
if (!Attr.isStringAttribute()) {
if (!AttrStr.empty()) AttrStr += ' ';
AttrStr += Attr.getAsString();
@@ -2641,8 +2637,8 @@ void AssemblyWriter::printFunction(const Function *F) {
}
FunctionType *FT = F->getFunctionType();
- if (Attrs.hasAttributes(AttributeSet::ReturnIndex))
- Out << Attrs.getAsString(AttributeSet::ReturnIndex) << ' ';
+ if (Attrs.hasAttributes(AttributeList::ReturnIndex))
+ Out << Attrs.getAsString(AttributeList::ReturnIndex) << ' ';
TypePrinter.print(F->getReturnType(), Out);
Out << ' ';
WriteAsOperandInternal(Out, F, &TypePrinter, &Machine, F->getParent());
@@ -2681,7 +2677,7 @@ void AssemblyWriter::printFunction(const Function *F) {
StringRef UA = getUnnamedAddrEncoding(F->getUnnamedAddr());
if (!UA.empty())
Out << ' ' << UA;
- if (Attrs.hasAttributes(AttributeSet::FunctionIndex))
+ if (Attrs.hasAttributes(AttributeList::FunctionIndex))
Out << " #" << Machine.getAttributeGroupSlot(Attrs.getFnAttributes());
if (F->hasSection()) {
Out << " section \"";
@@ -2730,8 +2726,8 @@ void AssemblyWriter::printFunction(const Function *F) {
/// printArgument - This member is called for every argument that is passed into
/// the function. Simply print it out
///
-void AssemblyWriter::printArgument(const Argument *Arg,
- AttributeSet Attrs, unsigned Idx) {
+void AssemblyWriter::printArgument(const Argument *Arg, AttributeList Attrs,
+ unsigned Idx) {
// Output type...
TypePrinter.print(Arg->getType(), Out);
@@ -2901,12 +2897,11 @@ void AssemblyWriter::printInstruction(const Instruction &I) {
Out << ", ";
writeOperand(SI.getDefaultDest(), true);
Out << " [";
- for (SwitchInst::ConstCaseIt i = SI.case_begin(), e = SI.case_end();
- i != e; ++i) {
+ for (auto Case : SI.cases()) {
Out << "\n ";
- writeOperand(i.getCaseValue(), true);
+ writeOperand(Case.getCaseValue(), true);
Out << ", ";
- writeOperand(i.getCaseSuccessor(), true);
+ writeOperand(Case.getCaseSuccessor(), true);
}
Out << "\n ]";
} else if (isa<IndirectBrInst>(I)) {
@@ -3015,10 +3010,10 @@ void AssemblyWriter::printInstruction(const Instruction &I) {
Operand = CI->getCalledValue();
FunctionType *FTy = CI->getFunctionType();
Type *RetTy = FTy->getReturnType();
- const AttributeSet &PAL = CI->getAttributes();
+ const AttributeList &PAL = CI->getAttributes();
- if (PAL.hasAttributes(AttributeSet::ReturnIndex))
- Out << ' ' << PAL.getAsString(AttributeSet::ReturnIndex);
+ if (PAL.hasAttributes(AttributeList::ReturnIndex))
+ Out << ' ' << PAL.getAsString(AttributeList::ReturnIndex);
// If possible, print out the short form of the call instruction. We can
// only do this if the first argument is a pointer to a nonvararg function,
@@ -3043,7 +3038,7 @@ void AssemblyWriter::printInstruction(const Instruction &I) {
Out << ", ...";
Out << ')';
- if (PAL.hasAttributes(AttributeSet::FunctionIndex))
+ if (PAL.hasAttributes(AttributeList::FunctionIndex))
Out << " #" << Machine.getAttributeGroupSlot(PAL.getFnAttributes());
writeOperandBundles(CI);
@@ -3052,7 +3047,7 @@ void AssemblyWriter::printInstruction(const Instruction &I) {
Operand = II->getCalledValue();
FunctionType *FTy = II->getFunctionType();
Type *RetTy = FTy->getReturnType();
- const AttributeSet &PAL = II->getAttributes();
+ const AttributeList &PAL = II->getAttributes();
// Print the calling convention being used.
if (II->getCallingConv() != CallingConv::C) {
@@ -3060,8 +3055,8 @@ void AssemblyWriter::printInstruction(const Instruction &I) {
PrintCallingConv(II->getCallingConv(), Out);
}
- if (PAL.hasAttributes(AttributeSet::ReturnIndex))
- Out << ' ' << PAL.getAsString(AttributeSet::ReturnIndex);
+ if (PAL.hasAttributes(AttributeList::ReturnIndex))
+ Out << ' ' << PAL.getAsString(AttributeList::ReturnIndex);
// If possible, print out the short form of the invoke instruction. We can
// only do this if the first argument is a pointer to a nonvararg function,
@@ -3079,7 +3074,7 @@ void AssemblyWriter::printInstruction(const Instruction &I) {
}
Out << ')';
- if (PAL.hasAttributes(AttributeSet::FunctionIndex))
+ if (PAL.hasAttributes(AttributeList::FunctionIndex))
Out << " #" << Machine.getAttributeGroupSlot(PAL.getFnAttributes());
writeOperandBundles(II);
@@ -3109,6 +3104,12 @@ void AssemblyWriter::printInstruction(const Instruction &I) {
if (AI->getAlignment()) {
Out << ", align " << AI->getAlignment();
}
+
+ unsigned AddrSpace = AI->getType()->getAddressSpace();
+ if (AddrSpace != 0) {
+ Out << ", addrspace(" << AddrSpace << ')';
+ }
+
} else if (isa<CastInst>(I)) {
if (Operand) {
Out << ' ';
@@ -3242,7 +3243,7 @@ void AssemblyWriter::printMDNodeBody(const MDNode *Node) {
}
void AssemblyWriter::writeAllAttributeGroups() {
- std::vector<std::pair<AttributeSet, unsigned> > asVec;
+ std::vector<std::pair<AttributeSet, unsigned>> asVec;
asVec.resize(Machine.as_size());
for (SlotTracker::as_iterator I = Machine.as_begin(), E = Machine.as_end();
@@ -3251,7 +3252,7 @@ void AssemblyWriter::writeAllAttributeGroups() {
for (const auto &I : asVec)
Out << "attributes #" << I.second << " = { "
- << I.first.getAsString(AttributeSet::FunctionIndex, true) << " }\n";
+ << I.first.getAsString(true) << " }\n";
}
void AssemblyWriter::printUseListOrder(const UseListOrder &Order) {
@@ -3535,6 +3536,7 @@ void Metadata::print(raw_ostream &OS, ModuleSlotTracker &MST,
printMetadataImpl(OS, *this, MST, M, /* OnlyAsOperand */ false);
}
+#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
// Value::dump - allow easy printing of Values from the debugger.
LLVM_DUMP_METHOD
void Value::dump() const { print(dbgs(), /*IsForDebug=*/true); dbgs() << '\n'; }
@@ -3566,3 +3568,4 @@ void Metadata::dump(const Module *M) const {
print(dbgs(), M, /*IsForDebug=*/true);
dbgs() << '\n';
}
+#endif
diff --git a/lib/IR/AttributeImpl.h b/lib/IR/AttributeImpl.h
index d0d27101aa86..09f037365793 100644
--- a/lib/IR/AttributeImpl.h
+++ b/lib/IR/AttributeImpl.h
@@ -16,7 +16,6 @@
#ifndef LLVM_LIB_IR_ATTRIBUTEIMPL_H
#define LLVM_LIB_IR_ATTRIBUTEIMPL_H
-#include "AttributeSetNode.h"
#include "llvm/ADT/ArrayRef.h"
#include "llvm/ADT/FoldingSet.h"
#include "llvm/ADT/StringRef.h"
@@ -144,16 +143,74 @@ public:
StringRef getStringValue() const { return Val; }
};
-typedef std::pair<unsigned, AttributeSetNode *> IndexAttrPair;
+//===----------------------------------------------------------------------===//
+/// \class
+/// \brief This class represents a group of attributes that apply to one
+/// element: function, return type, or parameter.
+class AttributeSetNode final
+ : public FoldingSetNode,
+ private TrailingObjects<AttributeSetNode, Attribute> {
+ friend TrailingObjects;
+
+ /// Bitset with one bit for each available Attribute::AttrKind.
+ uint64_t AvailableAttrs;
+ unsigned NumAttrs; ///< Number of attributes in this node.
+
+ AttributeSetNode(ArrayRef<Attribute> Attrs);
+
+public:
+ // AttributeSetNode is uniqued; these should not be available.
+ AttributeSetNode(const AttributeSetNode &) = delete;
+ AttributeSetNode &operator=(const AttributeSetNode &) = delete;
+
+ void operator delete(void *p) { ::operator delete(p); }
+
+ static AttributeSetNode *get(LLVMContext &C, const AttrBuilder &B);
+
+ static AttributeSetNode *get(LLVMContext &C, ArrayRef<Attribute> Attrs);
+
+ /// \brief Return the number of attributes this AttributeSetNode contains.
+ unsigned getNumAttributes() const { return NumAttrs; }
+
+ bool hasAttribute(Attribute::AttrKind Kind) const {
+ return AvailableAttrs & ((uint64_t)1) << Kind;
+ }
+ bool hasAttribute(StringRef Kind) const;
+ bool hasAttributes() const { return NumAttrs != 0; }
+
+ Attribute getAttribute(Attribute::AttrKind Kind) const;
+ Attribute getAttribute(StringRef Kind) const;
+
+ unsigned getAlignment() const;
+ unsigned getStackAlignment() const;
+ uint64_t getDereferenceableBytes() const;
+ uint64_t getDereferenceableOrNullBytes() const;
+ std::pair<unsigned, Optional<unsigned>> getAllocSizeArgs() const;
+ std::string getAsString(bool InAttrGrp) const;
+
+ typedef const Attribute *iterator;
+ iterator begin() const { return getTrailingObjects<Attribute>(); }
+ iterator end() const { return begin() + NumAttrs; }
+
+ void Profile(FoldingSetNodeID &ID) const {
+ Profile(ID, makeArrayRef(begin(), end()));
+ }
+ static void Profile(FoldingSetNodeID &ID, ArrayRef<Attribute> AttrList) {
+ for (const auto &Attr : AttrList)
+ Attr.Profile(ID);
+ }
+};
+
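For orientation, a minimal usage sketch of the node API above (illustrative only; assumes an LLVMContext Ctx is in scope):

    AttrBuilder B;
    B.addAttribute(Attribute::NoUnwind);
    B.addAlignmentAttr(16);
    AttributeSetNode *Node = AttributeSetNode::get(Ctx, B);
    assert(Node->hasAttribute(Attribute::NoUnwind));
    assert(Node->getAlignment() == 16);
    assert(Node == AttributeSetNode::get(Ctx, B)); // nodes are uniqued per context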
+typedef std::pair<unsigned, AttributeSet> IndexAttrPair;
//===----------------------------------------------------------------------===//
/// \class
/// \brief This class represents a set of attributes that apply to the function,
/// return type, and parameters.
-class AttributeSetImpl final
+class AttributeListImpl final
: public FoldingSetNode,
- private TrailingObjects<AttributeSetImpl, IndexAttrPair> {
- friend class AttributeSet;
+ private TrailingObjects<AttributeListImpl, IndexAttrPair> {
+ friend class AttributeList;
friend TrailingObjects;
private:
@@ -166,52 +223,21 @@ private:
size_t numTrailingObjects(OverloadToken<IndexAttrPair>) { return NumSlots; }
/// \brief Return a pointer to the IndexAttrPair for the specified slot.
- const IndexAttrPair *getNode(unsigned Slot) const {
+ const IndexAttrPair *getSlotPair(unsigned Slot) const {
return getTrailingObjects<IndexAttrPair>() + Slot;
}
public:
- AttributeSetImpl(LLVMContext &C,
- ArrayRef<std::pair<unsigned, AttributeSetNode *>> Slots)
- : Context(C), NumSlots(Slots.size()), AvailableFunctionAttrs(0) {
- static_assert(Attribute::EndAttrKinds <=
- sizeof(AvailableFunctionAttrs) * CHAR_BIT,
- "Too many attributes");
-
-#ifndef NDEBUG
- if (Slots.size() >= 2) {
- for (const std::pair<unsigned, AttributeSetNode *> *i = Slots.begin() + 1,
- *e = Slots.end();
- i != e; ++i) {
- assert((i-1)->first <= i->first && "Attribute set not ordered!");
- }
- }
-#endif
- // There's memory after the node where we can store the entries in.
- std::copy(Slots.begin(), Slots.end(), getTrailingObjects<IndexAttrPair>());
-
- // Initialize AvailableFunctionAttrs summary bitset.
- if (NumSlots > 0) {
- static_assert(AttributeSet::FunctionIndex == ~0u,
- "FunctionIndex should be biggest possible index");
- const std::pair<unsigned, AttributeSetNode *> &Last = Slots.back();
- if (Last.first == AttributeSet::FunctionIndex) {
- const AttributeSetNode *Node = Last.second;
- for (Attribute I : *Node) {
- if (!I.isStringAttribute())
- AvailableFunctionAttrs |= ((uint64_t)1) << I.getKindAsEnum();
- }
- }
- }
- }
+ AttributeListImpl(LLVMContext &C,
+ ArrayRef<std::pair<unsigned, AttributeSet>> Slots);
// AttributeListImpl is uniqued; these should not be available.
- AttributeSetImpl(const AttributeSetImpl &) = delete;
- AttributeSetImpl &operator=(const AttributeSetImpl &) = delete;
+ AttributeListImpl(const AttributeListImpl &) = delete;
+ AttributeListImpl &operator=(const AttributeListImpl &) = delete;
void operator delete(void *p) { ::operator delete(p); }
- /// \brief Get the context that created this AttributeSetImpl.
+ /// \brief Get the context that created this AttributeListImpl.
LLVMContext &getContext() { return Context; }
/// \brief Return the number of slots used in this attribute list. This is
@@ -224,42 +250,35 @@ public:
/// attributes are applied to, not the index into the AttrNodes list where the
/// attributes reside.
unsigned getSlotIndex(unsigned Slot) const {
- return getNode(Slot)->first;
+ return getSlotPair(Slot)->first;
+ }
+
+ /// \brief Retrieve the attribute set node for the given "slot" in the
+ /// AttrNode list.
+ AttributeSet getSlotNode(unsigned Slot) const {
+ return getSlotPair(Slot)->second;
}
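The slot/index split above is the subtle part of this interface; an illustrative layout (hedged, not taken from the patch):

    // A list with attributes on parameter 2 and on the function has two slots:
    //   Slot 0 -> { 3,   <param set> }  // index 3 == ArgNo 2 + 1
    //   Slot 1 -> { ~0u, <fn set>    }  // AttributeList::FunctionIndex
    // getSlotIndex(1) == ~0u, while getSlotNode(1) is the function's set.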
/// \brief Retrieve the attributes for the given "slot" in the AttrNode list.
/// \p Slot is an index into the AttrNodes list, not the index of the
/// return/parameter/function to which the attributes apply.
- AttributeSet getSlotAttributes(unsigned Slot) const {
- return AttributeSet::get(Context, *getNode(Slot));
+ AttributeList getSlotAttributes(unsigned Slot) const {
+ return AttributeList::get(Context, *getSlotPair(Slot));
}
- /// \brief Retrieve the attribute set node for the given "slot" in the
- /// AttrNode list.
- AttributeSetNode *getSlotNode(unsigned Slot) const {
- return getNode(Slot)->second;
- }
-
- /// \brief Return true if the AttributeSetNode for the FunctionIndex has an
+ /// \brief Return true if the AttributeSet at the FunctionIndex has an
/// enum attribute of the given kind.
bool hasFnAttribute(Attribute::AttrKind Kind) const {
return AvailableFunctionAttrs & ((uint64_t)1) << Kind;
}
- typedef AttributeSetNode::iterator iterator;
- iterator begin(unsigned Slot) const { return getSlotNode(Slot)->begin(); }
- iterator end(unsigned Slot) const { return getSlotNode(Slot)->end(); }
+ typedef AttributeSet::iterator iterator;
+ iterator begin(unsigned Slot) const { return getSlotNode(Slot).begin(); }
+ iterator end(unsigned Slot) const { return getSlotNode(Slot).end(); }
- void Profile(FoldingSetNodeID &ID) const {
- Profile(ID, makeArrayRef(getNode(0), getNumSlots()));
- }
+ void Profile(FoldingSetNodeID &ID) const;
static void Profile(FoldingSetNodeID &ID,
- ArrayRef<std::pair<unsigned, AttributeSetNode*>> Nodes) {
- for (const auto &Node : Nodes) {
- ID.AddInteger(Node.first);
- ID.AddPointer(Node.second);
- }
- }
+ ArrayRef<std::pair<unsigned, AttributeSet>> Nodes);
void dump() const;
};
diff --git a/lib/IR/AttributeSetNode.h b/lib/IR/AttributeSetNode.h
deleted file mode 100644
index 23ce3713c20b..000000000000
--- a/lib/IR/AttributeSetNode.h
+++ /dev/null
@@ -1,106 +0,0 @@
-//===-- AttributeSetNode.h - AttributeSet Internal Node ---------*- C++ -*-===//
-//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
-//
-//===----------------------------------------------------------------------===//
-///
-/// \file
-/// \brief This file defines the node class used internally by AttributeSet.
-///
-//===----------------------------------------------------------------------===//
-
-#ifndef LLVM_IR_ATTRIBUTESETNODE_H
-#define LLVM_IR_ATTRIBUTESETNODE_H
-
-#include "llvm/ADT/ArrayRef.h"
-#include "llvm/ADT/FoldingSet.h"
-#include "llvm/ADT/Optional.h"
-#include "llvm/ADT/StringRef.h"
-#include "llvm/IR/Attributes.h"
-#include "llvm/Support/TrailingObjects.h"
-#include <algorithm>
-#include <climits>
-#include <cstdint>
-#include <string>
-#include <utility>
-
-namespace llvm {
-
-//===----------------------------------------------------------------------===//
-/// \class
-/// \brief This class represents a group of attributes that apply to one
-/// element: function, return type, or parameter.
-class AttributeSetNode final
- : public FoldingSetNode,
- private TrailingObjects<AttributeSetNode, Attribute> {
- friend TrailingObjects;
-
- unsigned NumAttrs; ///< Number of attributes in this node.
- /// Bitset with a bit for each available attribute Attribute::AttrKind.
- uint64_t AvailableAttrs;
-
- AttributeSetNode(ArrayRef<Attribute> Attrs)
- : NumAttrs(Attrs.size()), AvailableAttrs(0) {
- static_assert(Attribute::EndAttrKinds <= sizeof(AvailableAttrs) * CHAR_BIT,
- "Too many attributes for AvailableAttrs");
- // There's memory after the node where we can store the entries in.
- std::copy(Attrs.begin(), Attrs.end(), getTrailingObjects<Attribute>());
-
- for (Attribute I : *this) {
- if (!I.isStringAttribute()) {
- AvailableAttrs |= ((uint64_t)1) << I.getKindAsEnum();
- }
- }
- }
-
-public:
- // AttributesSetNode is uniqued, these should not be available.
- AttributeSetNode(const AttributeSetNode &) = delete;
- AttributeSetNode &operator=(const AttributeSetNode &) = delete;
-
- void operator delete(void *p) { ::operator delete(p); }
-
- static AttributeSetNode *get(LLVMContext &C, ArrayRef<Attribute> Attrs);
-
- static AttributeSetNode *get(AttributeSet AS, unsigned Index) {
- return AS.getAttributes(Index);
- }
-
- /// \brief Return the number of attributes this AttributeSet contains.
- unsigned getNumAttributes() const { return NumAttrs; }
-
- bool hasAttribute(Attribute::AttrKind Kind) const {
- return AvailableAttrs & ((uint64_t)1) << Kind;
- }
- bool hasAttribute(StringRef Kind) const;
- bool hasAttributes() const { return NumAttrs != 0; }
-
- Attribute getAttribute(Attribute::AttrKind Kind) const;
- Attribute getAttribute(StringRef Kind) const;
-
- unsigned getAlignment() const;
- unsigned getStackAlignment() const;
- uint64_t getDereferenceableBytes() const;
- uint64_t getDereferenceableOrNullBytes() const;
- std::pair<unsigned, Optional<unsigned>> getAllocSizeArgs() const;
- std::string getAsString(bool InAttrGrp) const;
-
- typedef const Attribute *iterator;
- iterator begin() const { return getTrailingObjects<Attribute>(); }
- iterator end() const { return begin() + NumAttrs; }
-
- void Profile(FoldingSetNodeID &ID) const {
- Profile(ID, makeArrayRef(begin(), end()));
- }
- static void Profile(FoldingSetNodeID &ID, ArrayRef<Attribute> AttrList) {
- for (const auto &Attr : AttrList)
- Attr.Profile(ID);
- }
-};
-
-} // end namespace llvm
-
-#endif // LLVM_IR_ATTRIBUTESETNODE_H
diff --git a/lib/IR/Attributes.cpp b/lib/IR/Attributes.cpp
index 1ec53cf1e1d6..2b7359dab807 100644
--- a/lib/IR/Attributes.cpp
+++ b/lib/IR/Attributes.cpp
@@ -1,4 +1,4 @@
-//===-- Attributes.cpp - Implement AttributesList -------------------------===//
+//===- Attributes.cpp - Implement AttributeList ---------------------------===//
//
// The LLVM Compiler Infrastructure
//
@@ -9,23 +9,38 @@
//
// \file
// \brief This file implements the Attribute, AttributeImpl, AttrBuilder,
-// AttributeSetImpl, and AttributeSet classes.
+// AttributeListImpl, and AttributeList classes.
//
//===----------------------------------------------------------------------===//
-#include "llvm/IR/Attributes.h"
-#include "llvm/IR/Function.h"
#include "AttributeImpl.h"
#include "LLVMContextImpl.h"
+#include "llvm/ADT/ArrayRef.h"
+#include "llvm/ADT/FoldingSet.h"
+#include "llvm/ADT/Optional.h"
+#include "llvm/ADT/SmallVector.h"
#include "llvm/ADT/STLExtras.h"
#include "llvm/ADT/StringExtras.h"
+#include "llvm/ADT/StringRef.h"
+#include "llvm/ADT/Twine.h"
+#include "llvm/IR/Attributes.h"
+#include "llvm/IR/Function.h"
+#include "llvm/IR/LLVMContext.h"
#include "llvm/IR/Type.h"
-#include "llvm/Support/Atomic.h"
+#include "llvm/Support/Compiler.h"
#include "llvm/Support/Debug.h"
-#include "llvm/Support/ManagedStatic.h"
-#include "llvm/Support/Mutex.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/MathExtras.h"
#include "llvm/Support/raw_ostream.h"
#include <algorithm>
+#include <cassert>
+#include <cstdint>
+#include <limits>
+#include <map>
+#include <string>
+#include <tuple>
+#include <utility>
+
using namespace llvm;
//===----------------------------------------------------------------------===//
@@ -411,9 +426,12 @@ bool Attribute::operator<(Attribute A) const {
//===----------------------------------------------------------------------===//
// Pin the vtables to this file.
-AttributeImpl::~AttributeImpl() {}
+AttributeImpl::~AttributeImpl() = default;
+
void EnumAttributeImpl::anchor() {}
+
void IntAttributeImpl::anchor() {}
+
void StringAttributeImpl::anchor() {}
bool AttributeImpl::hasAttribute(Attribute::AttrKind A) const {
@@ -473,9 +491,86 @@ bool AttributeImpl::operator<(const AttributeImpl &AI) const {
}
//===----------------------------------------------------------------------===//
+// AttributeSet Definition
+//===----------------------------------------------------------------------===//
+
+AttributeSet AttributeSet::get(LLVMContext &C, const AttrBuilder &B) {
+ return AttributeSet(AttributeSetNode::get(C, B));
+}
+
+AttributeSet AttributeSet::get(LLVMContext &C, ArrayRef<Attribute> Attrs) {
+ return AttributeSet(AttributeSetNode::get(C, Attrs));
+}
+
+unsigned AttributeSet::getNumAttributes() const {
+ return SetNode ? SetNode->getNumAttributes() : 0;
+}
+
+bool AttributeSet::hasAttribute(Attribute::AttrKind Kind) const {
+ return SetNode ? SetNode->hasAttribute(Kind) : false;
+}
+
+bool AttributeSet::hasAttribute(StringRef Kind) const {
+ return SetNode ? SetNode->hasAttribute(Kind) : false;
+}
+
+Attribute AttributeSet::getAttribute(Attribute::AttrKind Kind) const {
+ return SetNode ? SetNode->getAttribute(Kind) : Attribute();
+}
+
+Attribute AttributeSet::getAttribute(StringRef Kind) const {
+ return SetNode ? SetNode->getAttribute(Kind) : Attribute();
+}
+
+unsigned AttributeSet::getAlignment() const {
+ return SetNode ? SetNode->getAlignment() : 0;
+}
+
+unsigned AttributeSet::getStackAlignment() const {
+ return SetNode ? SetNode->getStackAlignment() : 0;
+}
+
+uint64_t AttributeSet::getDereferenceableBytes() const {
+ return SetNode ? SetNode->getDereferenceableBytes() : 0;
+}
+
+uint64_t AttributeSet::getDereferenceableOrNullBytes() const {
+ return SetNode ? SetNode->getDereferenceableOrNullBytes() : 0;
+}
+
+std::pair<unsigned, Optional<unsigned>> AttributeSet::getAllocSizeArgs() const {
+ return SetNode ? SetNode->getAllocSizeArgs()
+ : std::pair<unsigned, Optional<unsigned>>(0, 0);
+}
+
+std::string AttributeSet::getAsString(bool InAttrGrp) const {
+ return SetNode ? SetNode->getAsString(InAttrGrp) : "";
+}
+
+AttributeSet::iterator AttributeSet::begin() const {
+ return SetNode ? SetNode->begin() : nullptr;
+}
+
+AttributeSet::iterator AttributeSet::end() const {
+ return SetNode ? SetNode->end() : nullptr;
+}
+
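These accessors all follow the same null-object pattern: a default-constructed AttributeSet carries a null SetNode, and every query degrades to a neutral value, so callers never have to test for null first. A minimal sketch (assumes an LLVMContext Ctx):

    AttributeSet Empty;                          // SetNode == nullptr
    assert(!Empty.hasAttributes() && Empty.getAlignment() == 0);

    AttrBuilder B;
    B.addAttribute(Attribute::ReadOnly);
    AttributeSet RO = AttributeSet::get(Ctx, B);
    assert(RO.hasAttribute(Attribute::ReadOnly));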
+//===----------------------------------------------------------------------===//
// AttributeSetNode Definition
//===----------------------------------------------------------------------===//
+AttributeSetNode::AttributeSetNode(ArrayRef<Attribute> Attrs)
+ : AvailableAttrs(0), NumAttrs(Attrs.size()) {
+ // There's memory after the node where we can store the entries.
+ std::copy(Attrs.begin(), Attrs.end(), getTrailingObjects<Attribute>());
+
+ for (Attribute I : *this) {
+ if (!I.isStringAttribute()) {
+ AvailableAttrs |= ((uint64_t)1) << I.getKindAsEnum();
+ }
+ }
+}
+
AttributeSetNode *AttributeSetNode::get(LLVMContext &C,
ArrayRef<Attribute> Attrs) {
if (Attrs.empty())
@@ -504,10 +599,52 @@ AttributeSetNode *AttributeSetNode::get(LLVMContext &C,
pImpl->AttrsSetNodes.InsertNode(PA, InsertPoint);
}
- // Return the AttributesListNode that we found or created.
+ // Return the AttributeSetNode that we found or created.
return PA;
}
+AttributeSetNode *AttributeSetNode::get(LLVMContext &C, const AttrBuilder &B) {
+ // Add target-independent attributes.
+ SmallVector<Attribute, 8> Attrs;
+ for (Attribute::AttrKind Kind = Attribute::None;
+ Kind != Attribute::EndAttrKinds; Kind = Attribute::AttrKind(Kind + 1)) {
+ if (!B.contains(Kind))
+ continue;
+
+ Attribute Attr;
+ switch (Kind) {
+ case Attribute::Alignment:
+ Attr = Attribute::getWithAlignment(C, B.getAlignment());
+ break;
+ case Attribute::StackAlignment:
+ Attr = Attribute::getWithStackAlignment(C, B.getStackAlignment());
+ break;
+ case Attribute::Dereferenceable:
+ Attr = Attribute::getWithDereferenceableBytes(
+ C, B.getDereferenceableBytes());
+ break;
+ case Attribute::DereferenceableOrNull:
+ Attr = Attribute::getWithDereferenceableOrNullBytes(
+ C, B.getDereferenceableOrNullBytes());
+ break;
+ case Attribute::AllocSize: {
+ auto A = B.getAllocSizeArgs();
+ Attr = Attribute::getWithAllocSizeArgs(C, A.first, A.second);
+ break;
+ }
+ default:
+ Attr = Attribute::get(C, Kind);
+ }
+ Attrs.push_back(Attr);
+ }
+
+ // Add target-dependent (string) attributes.
+ for (const auto &TDA : B.td_attrs())
+ Attrs.emplace_back(Attribute::get(C, TDA.first, TDA.second));
+
+ return get(C, Attrs);
+}
+
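The switch above exists because a handful of kinds carry an integer payload and must be built through their dedicated getWith* constructor; every other kind is a bare enum. The two shapes, illustrated (hedged):

    Attribute Plain = Attribute::get(Ctx, Attribute::NoInline);       // enum only
    Attribute Deref = Attribute::getWithDereferenceableBytes(Ctx, 8); // payload 8
    assert(Plain.isEnumAttribute());
    assert(Deref.isIntAttribute() && Deref.getValueAsInt() == 8);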
bool AttributeSetNode::hasAttribute(StringRef Kind) const {
for (Attribute I : *this)
if (I.hasAttribute(Kind))
@@ -578,46 +715,106 @@ std::string AttributeSetNode::getAsString(bool InAttrGrp) const {
}
//===----------------------------------------------------------------------===//
-// AttributeSetImpl Definition
+// AttributeListImpl Definition
//===----------------------------------------------------------------------===//
-LLVM_DUMP_METHOD void AttributeSetImpl::dump() const {
- AttributeSet(const_cast<AttributeSetImpl *>(this)).dump();
+AttributeListImpl::AttributeListImpl(
+ LLVMContext &C, ArrayRef<std::pair<unsigned, AttributeSet>> Slots)
+ : Context(C), NumSlots(Slots.size()), AvailableFunctionAttrs(0) {
+#ifndef NDEBUG
+ assert(!Slots.empty() && "pointless AttributeListImpl");
+ if (Slots.size() >= 2) {
+ auto *PrevPair = &Slots.front();
+ for (auto &CurPair : Slots.drop_front()) {
+ assert(PrevPair->first <= CurPair.first && "Attribute set not ordered!");
+ PrevPair = &CurPair;
+ }
+ }
+#endif
+
+ // There's memory after the node where we can store the entries.
+ std::copy(Slots.begin(), Slots.end(), getTrailingObjects<IndexAttrPair>());
+
+ // Initialize AvailableFunctionAttrs summary bitset.
+ static_assert(Attribute::EndAttrKinds <=
+ sizeof(AvailableFunctionAttrs) * CHAR_BIT,
+ "Too many attributes");
+ static_assert(AttributeList::FunctionIndex == ~0u,
+ "FunctionIndex should be biggest possible index");
+ const auto &Last = Slots.back();
+ if (Last.first == AttributeList::FunctionIndex) {
+ AttributeSet Node = Last.second;
+ for (Attribute I : Node) {
+ if (!I.isStringAttribute())
+ AvailableFunctionAttrs |= ((uint64_t)1) << I.getKindAsEnum();
+ }
+ }
+}
+
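AvailableFunctionAttrs is a summary bitset computed once in this constructor so that hasFnAttribute can answer with a single bit test instead of walking the function slot; string attributes have no enum bit, which is why they are skipped. Roughly (names hypothetical):

    // Given some AttributeListImpl *Impl, Impl->hasFnAttribute(Kind)
    // reduces to:
    //   (AvailableFunctionAttrs & (uint64_t(1) << Kind)) != 0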
+void AttributeListImpl::Profile(FoldingSetNodeID &ID) const {
+ Profile(ID, makeArrayRef(getSlotPair(0), getNumSlots()));
+}
+
+void AttributeListImpl::Profile(
+ FoldingSetNodeID &ID, ArrayRef<std::pair<unsigned, AttributeSet>> Nodes) {
+ for (const auto &Node : Nodes) {
+ ID.AddInteger(Node.first);
+ ID.AddPointer(Node.second.SetNode);
+ }
}
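Profiling each slot as (index, node pointer) is sound because AttributeSetNode is itself uniqued per context, so pointer equality is set equality; whole lists therefore fold together too. A hedged consequence (assumes Ctx):

    AttrBuilder B;
    B.addAttribute(Attribute::NoUnwind);
    AttributeList A1 = AttributeList::get(Ctx, AttributeList::FunctionIndex, B);
    AttributeList A2 = AttributeList::get(Ctx, AttributeList::FunctionIndex, B);
    assert(A1 == A2); // same impl, found via FoldingSet lookup on Profile()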
+#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
+LLVM_DUMP_METHOD void AttributeListImpl::dump() const {
+ AttributeList(const_cast<AttributeListImpl *>(this)).dump();
+}
+#endif
+
//===----------------------------------------------------------------------===//
-// AttributeSet Construction and Mutation Methods
+// AttributeList Construction and Mutation Methods
//===----------------------------------------------------------------------===//
-AttributeSet
-AttributeSet::getImpl(LLVMContext &C,
- ArrayRef<std::pair<unsigned, AttributeSetNode*> > Attrs) {
+AttributeList AttributeList::getImpl(
+ LLVMContext &C, ArrayRef<std::pair<unsigned, AttributeSet>> Attrs) {
+ assert(!Attrs.empty() && "creating pointless AttributeList");
+#ifndef NDEBUG
+ unsigned LastIndex = 0;
+ bool IsFirst = true;
+ for (auto &&AttrPair : Attrs) {
+ assert((IsFirst || LastIndex < AttrPair.first) &&
+ "unsorted or duplicate AttributeList indices");
+ assert(AttrPair.second.hasAttributes() && "pointless AttributeList slot");
+ LastIndex = AttrPair.first;
+ IsFirst = false;
+ }
+#endif
+
LLVMContextImpl *pImpl = C.pImpl;
FoldingSetNodeID ID;
- AttributeSetImpl::Profile(ID, Attrs);
+ AttributeListImpl::Profile(ID, Attrs);
void *InsertPoint;
- AttributeSetImpl *PA = pImpl->AttrsLists.FindNodeOrInsertPos(ID, InsertPoint);
+ AttributeListImpl *PA =
+ pImpl->AttrsLists.FindNodeOrInsertPos(ID, InsertPoint);
// If we didn't find any existing attributes of the same shape then
// create a new one and insert it.
if (!PA) {
- // Coallocate entries after the AttributeSetImpl itself.
+ // Co-allocate entries after the AttributeListImpl itself.
void *Mem = ::operator new(
- AttributeSetImpl::totalSizeToAlloc<IndexAttrPair>(Attrs.size()));
- PA = new (Mem) AttributeSetImpl(C, Attrs);
+ AttributeListImpl::totalSizeToAlloc<IndexAttrPair>(Attrs.size()));
+ PA = new (Mem) AttributeListImpl(C, Attrs);
pImpl->AttrsLists.InsertNode(PA, InsertPoint);
}
// Return the AttributeList that we found or created.
- return AttributeSet(PA);
+ return AttributeList(PA);
}
-AttributeSet AttributeSet::get(LLVMContext &C,
- ArrayRef<std::pair<unsigned, Attribute> > Attrs){
+AttributeList
+AttributeList::get(LLVMContext &C,
+ ArrayRef<std::pair<unsigned, Attribute>> Attrs) {
// If there are no attributes then return an empty AttributeList.
if (Attrs.empty())
- return AttributeSet();
+ return AttributeList();
assert(std::is_sorted(Attrs.begin(), Attrs.end(),
[](const std::pair<unsigned, Attribute> &LHS,
@@ -632,8 +829,8 @@ AttributeSet AttributeSet::get(LLVMContext &C,
// Create a vector of (unsigned, AttributeSet) pairs from the attributes
// list.
- SmallVector<std::pair<unsigned, AttributeSetNode*>, 8> AttrPairVec;
- for (ArrayRef<std::pair<unsigned, Attribute> >::iterator I = Attrs.begin(),
+ SmallVector<std::pair<unsigned, AttributeSet>, 8> AttrPairVec;
+ for (ArrayRef<std::pair<unsigned, Attribute>>::iterator I = Attrs.begin(),
E = Attrs.end(); I != E; ) {
unsigned Index = I->first;
SmallVector<Attribute, 4> AttrVec;
@@ -642,103 +839,87 @@ AttributeSet AttributeSet::get(LLVMContext &C,
++I;
}
- AttrPairVec.emplace_back(Index, AttributeSetNode::get(C, AttrVec));
+ AttrPairVec.emplace_back(Index, AttributeSet::get(C, AttrVec));
}
return getImpl(C, AttrPairVec);
}
-AttributeSet AttributeSet::get(LLVMContext &C,
- ArrayRef<std::pair<unsigned,
- AttributeSetNode*> > Attrs) {
+AttributeList
+AttributeList::get(LLVMContext &C,
+ ArrayRef<std::pair<unsigned, AttributeSet>> Attrs) {
// If there are no attributes then return an empty AttributeList.
if (Attrs.empty())
- return AttributeSet();
+ return AttributeList();
return getImpl(C, Attrs);
}
-AttributeSet AttributeSet::get(LLVMContext &C, unsigned Index,
- const AttrBuilder &B) {
- if (!B.hasAttributes())
- return AttributeSet();
-
- // Add target-independent attributes.
- SmallVector<std::pair<unsigned, Attribute>, 8> Attrs;
- for (Attribute::AttrKind Kind = Attribute::None;
- Kind != Attribute::EndAttrKinds; Kind = Attribute::AttrKind(Kind + 1)) {
- if (!B.contains(Kind))
- continue;
-
- Attribute Attr;
- switch (Kind) {
- case Attribute::Alignment:
- Attr = Attribute::getWithAlignment(C, B.getAlignment());
- break;
- case Attribute::StackAlignment:
- Attr = Attribute::getWithStackAlignment(C, B.getStackAlignment());
- break;
- case Attribute::Dereferenceable:
- Attr = Attribute::getWithDereferenceableBytes(
- C, B.getDereferenceableBytes());
- break;
- case Attribute::DereferenceableOrNull:
- Attr = Attribute::getWithDereferenceableOrNullBytes(
- C, B.getDereferenceableOrNullBytes());
- break;
- case Attribute::AllocSize: {
- auto A = B.getAllocSizeArgs();
- Attr = Attribute::getWithAllocSizeArgs(C, A.first, A.second);
- break;
- }
- default:
- Attr = Attribute::get(C, Kind);
- }
- Attrs.emplace_back(Index, Attr);
+AttributeList AttributeList::get(LLVMContext &C, AttributeSet FnAttrs,
+ AttributeSet RetAttrs,
+ ArrayRef<AttributeSet> ArgAttrs) {
+ SmallVector<std::pair<unsigned, AttributeSet>, 8> AttrPairs;
+ if (RetAttrs.hasAttributes())
+ AttrPairs.emplace_back(ReturnIndex, RetAttrs);
+ size_t Index = 1;
+ for (AttributeSet AS : ArgAttrs) {
+ if (AS.hasAttributes())
+ AttrPairs.emplace_back(Index, AS);
+ ++Index;
}
+ if (FnAttrs.hasAttributes())
+ AttrPairs.emplace_back(FunctionIndex, FnAttrs);
+ if (AttrPairs.empty())
+ return AttributeList();
+ return getImpl(C, AttrPairs);
+}
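This overload fixes the canonical pair order: return index 0 first, then parameters biased to 1..N, then FunctionIndex last, with empty sets dropped. A usage sketch (illustrative; assumes an LLVMContext Ctx):

    AttrBuilder FnB, Arg0B;
    FnB.addAttribute(Attribute::NoUnwind);
    Arg0B.addAttribute(Attribute::NonNull);
    AttributeList AL = AttributeList::get(
        Ctx, AttributeSet::get(Ctx, FnB), AttributeSet(), // no return attrs
        {AttributeSet::get(Ctx, Arg0B)});
    assert(AL.hasFnAttribute(Attribute::NoUnwind));
    assert(AL.hasParamAttribute(0, Attribute::NonNull)); // stored at index 1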
- // Add target-dependent (string) attributes.
- for (const auto &TDA : B.td_attrs())
- Attrs.emplace_back(Index, Attribute::get(C, TDA.first, TDA.second));
-
- return get(C, Attrs);
+AttributeList AttributeList::get(LLVMContext &C, unsigned Index,
+ const AttrBuilder &B) {
+ if (!B.hasAttributes())
+ return AttributeList();
+ AttributeSet AS = AttributeSet::get(C, B);
+ std::pair<unsigned, AttributeSet> Arr[1] = {{Index, AS}};
+ return getImpl(C, Arr);
}
-AttributeSet AttributeSet::get(LLVMContext &C, unsigned Index,
- ArrayRef<Attribute::AttrKind> Kinds) {
+AttributeList AttributeList::get(LLVMContext &C, unsigned Index,
+ ArrayRef<Attribute::AttrKind> Kinds) {
SmallVector<std::pair<unsigned, Attribute>, 8> Attrs;
for (Attribute::AttrKind K : Kinds)
Attrs.emplace_back(Index, Attribute::get(C, K));
return get(C, Attrs);
}
-AttributeSet AttributeSet::get(LLVMContext &C, unsigned Index,
- ArrayRef<StringRef> Kinds) {
+AttributeList AttributeList::get(LLVMContext &C, unsigned Index,
+ ArrayRef<StringRef> Kinds) {
SmallVector<std::pair<unsigned, Attribute>, 8> Attrs;
for (StringRef K : Kinds)
Attrs.emplace_back(Index, Attribute::get(C, K));
return get(C, Attrs);
}
-AttributeSet AttributeSet::get(LLVMContext &C, ArrayRef<AttributeSet> Attrs) {
- if (Attrs.empty()) return AttributeSet();
+AttributeList AttributeList::get(LLVMContext &C,
+ ArrayRef<AttributeList> Attrs) {
+ if (Attrs.empty())
+ return AttributeList();
if (Attrs.size() == 1) return Attrs[0];
- SmallVector<std::pair<unsigned, AttributeSetNode*>, 8> AttrNodeVec;
- AttributeSetImpl *A0 = Attrs[0].pImpl;
+ SmallVector<std::pair<unsigned, AttributeSet>, 8> AttrNodeVec;
+ AttributeListImpl *A0 = Attrs[0].pImpl;
if (A0)
- AttrNodeVec.append(A0->getNode(0), A0->getNode(A0->getNumSlots()));
+ AttrNodeVec.append(A0->getSlotPair(0), A0->getSlotPair(A0->getNumSlots()));
// Copy all attributes from Attrs into AttrNodeVec while keeping AttrNodeVec
// ordered by index. Because we know that each list in Attrs is ordered by
// index we only need to merge each successive list in rather than doing a
// full sort.
for (unsigned I = 1, E = Attrs.size(); I != E; ++I) {
- AttributeSetImpl *AS = Attrs[I].pImpl;
- if (!AS) continue;
- SmallVector<std::pair<unsigned, AttributeSetNode *>, 8>::iterator
+ AttributeListImpl *ALI = Attrs[I].pImpl;
+ if (!ALI) continue;
+ SmallVector<std::pair<unsigned, AttributeSet>, 8>::iterator
ANVI = AttrNodeVec.begin(), ANVE;
- for (const IndexAttrPair *AI = AS->getNode(0),
- *AE = AS->getNode(AS->getNumSlots());
+ for (const IndexAttrPair *AI = ALI->getSlotPair(0),
+ *AE = ALI->getSlotPair(ALI->getNumSlots());
AI != AE; ++AI) {
ANVE = AttrNodeVec.end();
while (ANVI != ANVE && ANVI->first <= AI->first)
@@ -750,113 +931,123 @@ AttributeSet AttributeSet::get(LLVMContext &C, ArrayRef<AttributeSet> Attrs) {
return getImpl(C, AttrNodeVec);
}
-AttributeSet AttributeSet::addAttribute(LLVMContext &C, unsigned Index,
- Attribute::AttrKind Kind) const {
+AttributeList AttributeList::addAttribute(LLVMContext &C, unsigned Index,
+ Attribute::AttrKind Kind) const {
if (hasAttribute(Index, Kind)) return *this;
- return addAttributes(C, Index, AttributeSet::get(C, Index, Kind));
+ return addAttributes(C, Index, AttributeList::get(C, Index, Kind));
}
-AttributeSet AttributeSet::addAttribute(LLVMContext &C, unsigned Index,
- StringRef Kind, StringRef Value) const {
- llvm::AttrBuilder B;
+AttributeList AttributeList::addAttribute(LLVMContext &C, unsigned Index,
+ StringRef Kind,
+ StringRef Value) const {
+ AttrBuilder B;
B.addAttribute(Kind, Value);
- return addAttributes(C, Index, AttributeSet::get(C, Index, B));
+ return addAttributes(C, Index, AttributeList::get(C, Index, B));
}
-AttributeSet AttributeSet::addAttribute(LLVMContext &C,
- ArrayRef<unsigned> Indices,
- Attribute A) const {
+AttributeList AttributeList::addAttribute(LLVMContext &C,
+ ArrayRef<unsigned> Indices,
+ Attribute A) const {
+ assert(std::is_sorted(Indices.begin(), Indices.end()));
+
unsigned I = 0, E = pImpl ? pImpl->getNumSlots() : 0;
- auto IdxI = Indices.begin(), IdxE = Indices.end();
- SmallVector<AttributeSet, 4> AttrSet;
-
- while (I != E && IdxI != IdxE) {
- if (getSlotIndex(I) < *IdxI)
- AttrSet.emplace_back(getSlotAttributes(I++));
- else if (getSlotIndex(I) > *IdxI)
- AttrSet.emplace_back(AttributeSet::get(C, std::make_pair(*IdxI++, A)));
- else {
- AttrBuilder B(getSlotAttributes(I), *IdxI);
- B.addAttribute(A);
- AttrSet.emplace_back(AttributeSet::get(C, *IdxI, B));
+ SmallVector<IndexAttrPair, 4> AttrVec;
+ for (unsigned Index : Indices) {
+ // Add all attribute slots before the current index.
+ for (; I < E && getSlotIndex(I) < Index; ++I)
+ AttrVec.emplace_back(getSlotIndex(I), pImpl->getSlotNode(I));
+
+ // Add the attribute at this index. If we already have attributes at this
+ // index, merge them into a new set.
+ AttrBuilder B;
+ if (I < E && getSlotIndex(I) == Index) {
+ B.merge(AttrBuilder(pImpl->getSlotNode(I)));
++I;
- ++IdxI;
}
+ B.addAttribute(A);
+ AttrVec.emplace_back(Index, AttributeSet::get(C, B));
}
- while (I != E)
- AttrSet.emplace_back(getSlotAttributes(I++));
-
- while (IdxI != IdxE)
- AttrSet.emplace_back(AttributeSet::get(C, std::make_pair(*IdxI++, A)));
+ // Add remaining attributes.
+ for (; I < E; ++I)
+ AttrVec.emplace_back(getSlotIndex(I), pImpl->getSlotNode(I));
- return get(C, AttrSet);
+ return get(C, AttrVec);
}
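The rewrite above replaces the old three-loop splice with a single merge walk over the existing slots and the sorted Indices, rebuilding any collided slot through AttrBuilder. The intended call pattern, as a hedged sketch (AL and Ctx assumed):

    // Mark both parameters non-null in one rebuild (param i lives at index i+1).
    unsigned Idxs[] = {1, 2};
    AttributeList NN =
        AL.addAttribute(Ctx, Idxs, Attribute::get(Ctx, Attribute::NonNull));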
-AttributeSet AttributeSet::addAttributes(LLVMContext &C, unsigned Index,
- AttributeSet Attrs) const {
+AttributeList AttributeList::addAttributes(LLVMContext &C, unsigned Index,
+ AttributeList Attrs) const {
if (!pImpl) return Attrs;
if (!Attrs.pImpl) return *this;
+ return addAttributes(C, Index, Attrs.getAttributes(Index));
+}
+
+AttributeList AttributeList::addAttributes(LLVMContext &C, unsigned Index,
+ AttributeSet AS) const {
+ if (!AS.hasAttributes())
+ return *this;
+
#ifndef NDEBUG
// FIXME it is not obvious how this should work for alignment. For now, say
// we can't change a known alignment.
unsigned OldAlign = getParamAlignment(Index);
- unsigned NewAlign = Attrs.getParamAlignment(Index);
+ unsigned NewAlign = AS.getAlignment();
assert((!OldAlign || !NewAlign || OldAlign == NewAlign) &&
"Attempt to change alignment!");
#endif
- // Add the attribute slots before the one we're trying to add.
- SmallVector<AttributeSet, 4> AttrSet;
+ SmallVector<std::pair<unsigned, AttributeSet>, 4> AttrSet;
uint64_t NumAttrs = pImpl->getNumSlots();
- AttributeSet AS;
- uint64_t LastIndex = 0;
- for (unsigned I = 0, E = NumAttrs; I != E; ++I) {
- if (getSlotIndex(I) >= Index) {
- if (getSlotIndex(I) == Index) AS = getSlotAttributes(LastIndex++);
- break;
- }
- LastIndex = I + 1;
- AttrSet.push_back(getSlotAttributes(I));
- }
-
- // Now add the attribute into the correct slot. There may already be an
- // AttributeSet there.
- AttrBuilder B(AS, Index);
+ unsigned I;
- for (unsigned I = 0, E = Attrs.pImpl->getNumSlots(); I != E; ++I)
- if (Attrs.getSlotIndex(I) == Index) {
- for (AttributeSetImpl::iterator II = Attrs.pImpl->begin(I),
- IE = Attrs.pImpl->end(I); II != IE; ++II)
- B.addAttribute(*II);
+ // Add all the attribute slots before the one we need to merge.
+ for (I = 0; I < NumAttrs; ++I) {
+ if (getSlotIndex(I) >= Index)
break;
- }
+ AttrSet.emplace_back(getSlotIndex(I), pImpl->getSlotNode(I));
+ }
- AttrSet.push_back(AttributeSet::get(C, Index, B));
+ if (I < NumAttrs && getSlotIndex(I) == Index) {
+ // We need to merge two AttributeSets.
+ AttributeSet Merged = AttributeSet::get(
+ C, AttrBuilder(pImpl->getSlotNode(I)).merge(AttrBuilder(AS)));
+ AttrSet.emplace_back(Index, Merged);
+ ++I;
+ } else {
+ // Otherwise, there were no attributes at this position in the original
+ // list. Add the set as is.
+ AttrSet.emplace_back(Index, AS);
+ }
- // Add the remaining attribute slots.
- for (unsigned I = LastIndex, E = NumAttrs; I < E; ++I)
- AttrSet.push_back(getSlotAttributes(I));
+ // Add the remaining entries.
+ for (; I < NumAttrs; ++I)
+ AttrSet.emplace_back(getSlotIndex(I), pImpl->getSlotNode(I));
return get(C, AttrSet);
}
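The merge goes through AttrBuilder because an AttributeSet is immutable once uniqued: both sets are decomposed into one builder and re-uniqued as a fresh set. In isolation (hedged; Existing and Incoming are placeholder AttributeSets, using the AttrBuilder(AttributeSet) constructor added later in this file):

    AttrBuilder Merged(Existing);
    Merged.merge(AttrBuilder(Incoming));
    AttributeSet Combined = AttributeSet::get(Ctx, Merged);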
-AttributeSet AttributeSet::removeAttribute(LLVMContext &C, unsigned Index,
- Attribute::AttrKind Kind) const {
+AttributeList AttributeList::addAttributes(LLVMContext &C, unsigned Index,
+ const AttrBuilder &B) const {
+ return addAttributes(C, Index, AttributeSet::get(C, B));
+}
+
+AttributeList AttributeList::removeAttribute(LLVMContext &C, unsigned Index,
+ Attribute::AttrKind Kind) const {
if (!hasAttribute(Index, Kind)) return *this;
- return removeAttributes(C, Index, AttributeSet::get(C, Index, Kind));
+ return removeAttributes(C, Index, AttributeList::get(C, Index, Kind));
}
-AttributeSet AttributeSet::removeAttribute(LLVMContext &C, unsigned Index,
- StringRef Kind) const {
+AttributeList AttributeList::removeAttribute(LLVMContext &C, unsigned Index,
+ StringRef Kind) const {
if (!hasAttribute(Index, Kind)) return *this;
- return removeAttributes(C, Index, AttributeSet::get(C, Index, Kind));
+ return removeAttributes(C, Index, AttributeList::get(C, Index, Kind));
}
-AttributeSet AttributeSet::removeAttributes(LLVMContext &C, unsigned Index,
- AttributeSet Attrs) const {
- if (!pImpl) return AttributeSet();
+AttributeList AttributeList::removeAttributes(LLVMContext &C, unsigned Index,
+ AttributeList Attrs) const {
+ if (!pImpl)
+ return AttributeList();
if (!Attrs.pImpl) return *this;
// FIXME it is not obvious how this should work for alignment.
@@ -865,13 +1056,13 @@ AttributeSet AttributeSet::removeAttributes(LLVMContext &C, unsigned Index,
"Attempt to change alignment!");
// Add the attribute slots before the one we're trying to add.
- SmallVector<AttributeSet, 4> AttrSet;
+ SmallVector<AttributeList, 4> AttrSet;
uint64_t NumAttrs = pImpl->getNumSlots();
- AttributeSet AS;
+ AttributeList AL;
uint64_t LastIndex = 0;
for (unsigned I = 0, E = NumAttrs; I != E; ++I) {
if (getSlotIndex(I) >= Index) {
- if (getSlotIndex(I) == Index) AS = getSlotAttributes(LastIndex++);
+ if (getSlotIndex(I) == Index) AL = getSlotAttributes(LastIndex++);
break;
}
LastIndex = I + 1;
@@ -879,8 +1070,8 @@ AttributeSet AttributeSet::removeAttributes(LLVMContext &C, unsigned Index,
}
// Now remove the attribute from the correct slot. There may already be an
- // AttributeSet there.
- AttrBuilder B(AS, Index);
+ // AttributeList there.
+ AttrBuilder B(AL, Index);
for (unsigned I = 0, E = Attrs.pImpl->getNumSlots(); I != E; ++I)
if (Attrs.getSlotIndex(I) == Index) {
@@ -888,7 +1079,7 @@ AttributeSet AttributeSet::removeAttributes(LLVMContext &C, unsigned Index,
break;
}
- AttrSet.push_back(AttributeSet::get(C, Index, B));
+ AttrSet.push_back(AttributeList::get(C, Index, B));
// Add the remaining attribute slots.
for (unsigned I = LastIndex, E = NumAttrs; I < E; ++I)
@@ -897,22 +1088,23 @@ AttributeSet AttributeSet::removeAttributes(LLVMContext &C, unsigned Index,
return get(C, AttrSet);
}
-AttributeSet AttributeSet::removeAttributes(LLVMContext &C, unsigned Index,
- const AttrBuilder &Attrs) const {
- if (!pImpl) return AttributeSet();
+AttributeList AttributeList::removeAttributes(LLVMContext &C, unsigned Index,
+ const AttrBuilder &Attrs) const {
+ if (!pImpl)
+ return AttributeList();
// FIXME it is not obvious how this should work for alignment.
// For now, say we can't pass in alignment, which no current use does.
assert(!Attrs.hasAlignmentAttr() && "Attempt to change alignment!");
// Add the attribute slots before the one we're trying to add.
- SmallVector<AttributeSet, 4> AttrSet;
+ SmallVector<AttributeList, 4> AttrSet;
uint64_t NumAttrs = pImpl->getNumSlots();
- AttributeSet AS;
+ AttributeList AL;
uint64_t LastIndex = 0;
for (unsigned I = 0, E = NumAttrs; I != E; ++I) {
if (getSlotIndex(I) >= Index) {
- if (getSlotIndex(I) == Index) AS = getSlotAttributes(LastIndex++);
+ if (getSlotIndex(I) == Index) AL = getSlotAttributes(LastIndex++);
break;
}
LastIndex = I + 1;
@@ -920,11 +1112,11 @@ AttributeSet AttributeSet::removeAttributes(LLVMContext &C, unsigned Index,
}
// Now remove the attribute from the correct slot. There may already be an
- // AttributeSet there.
- AttrBuilder B(AS, Index);
+ // AttributeList there.
+ AttrBuilder B(AL, Index);
B.remove(Attrs);
- AttrSet.push_back(AttributeSet::get(C, Index, B));
+ AttrSet.push_back(AttributeList::get(C, Index, B));
// Add the remaining attribute slots.
for (unsigned I = LastIndex, E = NumAttrs; I < E; ++I)
@@ -933,94 +1125,96 @@ AttributeSet AttributeSet::removeAttributes(LLVMContext &C, unsigned Index,
return get(C, AttrSet);
}
-AttributeSet AttributeSet::addDereferenceableAttr(LLVMContext &C, unsigned Index,
- uint64_t Bytes) const {
- llvm::AttrBuilder B;
+AttributeList AttributeList::removeAttributes(LLVMContext &C,
+ unsigned WithoutIndex) const {
+ if (!pImpl)
+ return AttributeList();
+
+ SmallVector<std::pair<unsigned, AttributeSet>, 4> AttrSet;
+ for (unsigned I = 0, E = pImpl->getNumSlots(); I != E; ++I) {
+ unsigned Index = getSlotIndex(I);
+ if (Index != WithoutIndex)
+ AttrSet.push_back({Index, pImpl->getSlotNode(I)});
+ }
+ return get(C, AttrSet);
+}
+
+AttributeList AttributeList::addDereferenceableAttr(LLVMContext &C,
+ unsigned Index,
+ uint64_t Bytes) const {
+ AttrBuilder B;
B.addDereferenceableAttr(Bytes);
- return addAttributes(C, Index, AttributeSet::get(C, Index, B));
+ return addAttributes(C, Index, AttributeList::get(C, Index, B));
}
-AttributeSet AttributeSet::addDereferenceableOrNullAttr(LLVMContext &C,
- unsigned Index,
- uint64_t Bytes) const {
- llvm::AttrBuilder B;
+AttributeList
+AttributeList::addDereferenceableOrNullAttr(LLVMContext &C, unsigned Index,
+ uint64_t Bytes) const {
+ AttrBuilder B;
B.addDereferenceableOrNullAttr(Bytes);
- return addAttributes(C, Index, AttributeSet::get(C, Index, B));
+ return addAttributes(C, Index, AttributeList::get(C, Index, B));
}
-AttributeSet
-AttributeSet::addAllocSizeAttr(LLVMContext &C, unsigned Index,
- unsigned ElemSizeArg,
- const Optional<unsigned> &NumElemsArg) {
- llvm::AttrBuilder B;
+AttributeList
+AttributeList::addAllocSizeAttr(LLVMContext &C, unsigned Index,
+ unsigned ElemSizeArg,
+ const Optional<unsigned> &NumElemsArg) {
+ AttrBuilder B;
B.addAllocSizeAttr(ElemSizeArg, NumElemsArg);
- return addAttributes(C, Index, AttributeSet::get(C, Index, B));
+ return addAttributes(C, Index, AttributeList::get(C, Index, B));
}
//===----------------------------------------------------------------------===//
-// AttributeSet Accessor Methods
+// AttributeList Accessor Methods
//===----------------------------------------------------------------------===//
-LLVMContext &AttributeSet::getContext() const {
- return pImpl->getContext();
-}
+LLVMContext &AttributeList::getContext() const { return pImpl->getContext(); }
-AttributeSet AttributeSet::getParamAttributes(unsigned Index) const {
- return pImpl && hasAttributes(Index) ?
- AttributeSet::get(pImpl->getContext(),
- ArrayRef<std::pair<unsigned, AttributeSetNode*> >(
- std::make_pair(Index, getAttributes(Index)))) :
- AttributeSet();
+AttributeSet AttributeList::getParamAttributes(unsigned ArgNo) const {
+ return getAttributes(ArgNo + 1);
}
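The +1 bias is the entire index scheme in one line; spelled out (illustrative; AL assumed):

    //   ReturnIndex     == 0
    //   parameter ArgNo == ArgNo + 1
    //   FunctionIndex   == ~0u
    AttributeSet P0 = AL.getParamAttributes(0); // same as AL.getAttributes(1)
    AttributeSet Fn = AL.getFnAttributes();     // same as AL.getAttributes(~0u)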
-AttributeSet AttributeSet::getRetAttributes() const {
- return pImpl && hasAttributes(ReturnIndex) ?
- AttributeSet::get(pImpl->getContext(),
- ArrayRef<std::pair<unsigned, AttributeSetNode*> >(
- std::make_pair(ReturnIndex,
- getAttributes(ReturnIndex)))) :
- AttributeSet();
+AttributeSet AttributeList::getRetAttributes() const {
+ return getAttributes(ReturnIndex);
}
-AttributeSet AttributeSet::getFnAttributes() const {
- return pImpl && hasAttributes(FunctionIndex) ?
- AttributeSet::get(pImpl->getContext(),
- ArrayRef<std::pair<unsigned, AttributeSetNode*> >(
- std::make_pair(FunctionIndex,
- getAttributes(FunctionIndex)))) :
- AttributeSet();
+AttributeSet AttributeList::getFnAttributes() const {
+ return getAttributes(FunctionIndex);
}
-bool AttributeSet::hasAttribute(unsigned Index, Attribute::AttrKind Kind) const{
- AttributeSetNode *ASN = getAttributes(Index);
- return ASN && ASN->hasAttribute(Kind);
+bool AttributeList::hasAttribute(unsigned Index,
+ Attribute::AttrKind Kind) const {
+ return getAttributes(Index).hasAttribute(Kind);
}
-bool AttributeSet::hasAttribute(unsigned Index, StringRef Kind) const {
- AttributeSetNode *ASN = getAttributes(Index);
- return ASN && ASN->hasAttribute(Kind);
+bool AttributeList::hasAttribute(unsigned Index, StringRef Kind) const {
+ return getAttributes(Index).hasAttribute(Kind);
}
-bool AttributeSet::hasAttributes(unsigned Index) const {
- AttributeSetNode *ASN = getAttributes(Index);
- return ASN && ASN->hasAttributes();
+bool AttributeList::hasAttributes(unsigned Index) const {
+ return getAttributes(Index).hasAttributes();
}
-bool AttributeSet::hasFnAttribute(Attribute::AttrKind Kind) const {
+bool AttributeList::hasFnAttribute(Attribute::AttrKind Kind) const {
return pImpl && pImpl->hasFnAttribute(Kind);
}
-bool AttributeSet::hasFnAttribute(StringRef Kind) const {
- return hasAttribute(AttributeSet::FunctionIndex, Kind);
+bool AttributeList::hasFnAttribute(StringRef Kind) const {
+ return hasAttribute(AttributeList::FunctionIndex, Kind);
+}
+
+bool AttributeList::hasParamAttribute(unsigned ArgNo,
+ Attribute::AttrKind Kind) const {
+ return hasAttribute(ArgNo + 1, Kind);
}
-bool AttributeSet::hasAttrSomewhere(Attribute::AttrKind Attr,
- unsigned *Index) const {
+bool AttributeList::hasAttrSomewhere(Attribute::AttrKind Attr,
+ unsigned *Index) const {
if (!pImpl) return false;
for (unsigned I = 0, E = pImpl->getNumSlots(); I != E; ++I)
- for (AttributeSetImpl::iterator II = pImpl->begin(I),
- IE = pImpl->end(I); II != IE; ++II)
+ for (AttributeListImpl::iterator II = pImpl->begin(I), IE = pImpl->end(I);
+ II != IE; ++II)
if (II->hasAttribute(Attr)) {
if (Index) *Index = pImpl->getSlotIndex(I);
return true;
@@ -1029,93 +1223,85 @@ bool AttributeSet::hasAttrSomewhere(Attribute::AttrKind Attr,
return false;
}
-Attribute AttributeSet::getAttribute(unsigned Index,
- Attribute::AttrKind Kind) const {
- AttributeSetNode *ASN = getAttributes(Index);
- return ASN ? ASN->getAttribute(Kind) : Attribute();
+Attribute AttributeList::getAttribute(unsigned Index,
+ Attribute::AttrKind Kind) const {
+ return getAttributes(Index).getAttribute(Kind);
}
-Attribute AttributeSet::getAttribute(unsigned Index,
- StringRef Kind) const {
- AttributeSetNode *ASN = getAttributes(Index);
- return ASN ? ASN->getAttribute(Kind) : Attribute();
+Attribute AttributeList::getAttribute(unsigned Index, StringRef Kind) const {
+ return getAttributes(Index).getAttribute(Kind);
}
-unsigned AttributeSet::getParamAlignment(unsigned Index) const {
- AttributeSetNode *ASN = getAttributes(Index);
- return ASN ? ASN->getAlignment() : 0;
+unsigned AttributeList::getParamAlignment(unsigned Index) const {
+ return getAttributes(Index).getAlignment();
}
-unsigned AttributeSet::getStackAlignment(unsigned Index) const {
- AttributeSetNode *ASN = getAttributes(Index);
- return ASN ? ASN->getStackAlignment() : 0;
+unsigned AttributeList::getStackAlignment(unsigned Index) const {
+ return getAttributes(Index).getStackAlignment();
}
-uint64_t AttributeSet::getDereferenceableBytes(unsigned Index) const {
- AttributeSetNode *ASN = getAttributes(Index);
- return ASN ? ASN->getDereferenceableBytes() : 0;
+uint64_t AttributeList::getDereferenceableBytes(unsigned Index) const {
+ return getAttributes(Index).getDereferenceableBytes();
}
-uint64_t AttributeSet::getDereferenceableOrNullBytes(unsigned Index) const {
- AttributeSetNode *ASN = getAttributes(Index);
- return ASN ? ASN->getDereferenceableOrNullBytes() : 0;
+uint64_t AttributeList::getDereferenceableOrNullBytes(unsigned Index) const {
+ return getAttributes(Index).getDereferenceableOrNullBytes();
}
std::pair<unsigned, Optional<unsigned>>
-AttributeSet::getAllocSizeArgs(unsigned Index) const {
- AttributeSetNode *ASN = getAttributes(Index);
- return ASN ? ASN->getAllocSizeArgs() : std::make_pair(0u, Optional<unsigned>(0u));
+AttributeList::getAllocSizeArgs(unsigned Index) const {
+ return getAttributes(Index).getAllocSizeArgs();
}
-std::string AttributeSet::getAsString(unsigned Index, bool InAttrGrp) const {
- AttributeSetNode *ASN = getAttributes(Index);
- return ASN ? ASN->getAsString(InAttrGrp) : std::string("");
+std::string AttributeList::getAsString(unsigned Index, bool InAttrGrp) const {
+ return getAttributes(Index).getAsString(InAttrGrp);
}
-AttributeSetNode *AttributeSet::getAttributes(unsigned Index) const {
- if (!pImpl) return nullptr;
+AttributeSet AttributeList::getAttributes(unsigned Index) const {
+ if (!pImpl) return AttributeSet();
// Loop through to find the attribute node we want.
for (unsigned I = 0, E = pImpl->getNumSlots(); I != E; ++I)
if (pImpl->getSlotIndex(I) == Index)
return pImpl->getSlotNode(I);
- return nullptr;
+ return AttributeSet();
}
-AttributeSet::iterator AttributeSet::begin(unsigned Slot) const {
+AttributeList::iterator AttributeList::begin(unsigned Slot) const {
if (!pImpl)
return ArrayRef<Attribute>().begin();
return pImpl->begin(Slot);
}
-AttributeSet::iterator AttributeSet::end(unsigned Slot) const {
+AttributeList::iterator AttributeList::end(unsigned Slot) const {
if (!pImpl)
return ArrayRef<Attribute>().end();
return pImpl->end(Slot);
}
//===----------------------------------------------------------------------===//
-// AttributeSet Introspection Methods
+// AttributeList Introspection Methods
//===----------------------------------------------------------------------===//
-unsigned AttributeSet::getNumSlots() const {
+unsigned AttributeList::getNumSlots() const {
return pImpl ? pImpl->getNumSlots() : 0;
}
-unsigned AttributeSet::getSlotIndex(unsigned Slot) const {
+unsigned AttributeList::getSlotIndex(unsigned Slot) const {
assert(pImpl && Slot < pImpl->getNumSlots() &&
"Slot # out of range!");
return pImpl->getSlotIndex(Slot);
}
-AttributeSet AttributeSet::getSlotAttributes(unsigned Slot) const {
+AttributeList AttributeList::getSlotAttributes(unsigned Slot) const {
assert(pImpl && Slot < pImpl->getNumSlots() &&
"Slot # out of range!");
return pImpl->getSlotAttributes(Slot);
}
-LLVM_DUMP_METHOD void AttributeSet::dump() const {
+#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
+LLVM_DUMP_METHOD void AttributeList::dump() const {
dbgs() << "PAL[\n";
for (unsigned i = 0, e = getNumSlots(); i < e; ++i) {
@@ -1130,28 +1316,34 @@ LLVM_DUMP_METHOD void AttributeSet::dump() const {
dbgs() << "]\n";
}
+#endif
//===----------------------------------------------------------------------===//
// AttrBuilder Method Implementations
//===----------------------------------------------------------------------===//
-AttrBuilder::AttrBuilder(AttributeSet AS, unsigned Index)
- : Attrs(0), Alignment(0), StackAlignment(0), DerefBytes(0),
- DerefOrNullBytes(0), AllocSizeArgs(0) {
- AttributeSetImpl *pImpl = AS.pImpl;
+AttrBuilder::AttrBuilder(AttributeList AL, unsigned Index) {
+ AttributeListImpl *pImpl = AL.pImpl;
if (!pImpl) return;
for (unsigned I = 0, E = pImpl->getNumSlots(); I != E; ++I) {
if (pImpl->getSlotIndex(I) != Index) continue;
- for (AttributeSetImpl::iterator II = pImpl->begin(I),
- IE = pImpl->end(I); II != IE; ++II)
+ for (AttributeListImpl::iterator II = pImpl->begin(I), IE = pImpl->end(I);
+ II != IE; ++II)
addAttribute(*II);
break;
}
}
+AttrBuilder::AttrBuilder(AttributeSet AS) {
+ if (AS.hasAttributes()) {
+ for (const Attribute &A : AS)
+ addAttribute(A);
+ }
+}
+
void AttrBuilder::clear() {
Attrs.reset();
TargetDepAttrs.clear();
@@ -1213,7 +1405,7 @@ AttrBuilder &AttrBuilder::removeAttribute(Attribute::AttrKind Val) {
return *this;
}
-AttrBuilder &AttrBuilder::removeAttributes(AttributeSet A, uint64_t Index) {
+AttrBuilder &AttrBuilder::removeAttributes(AttributeList A, uint64_t Index) {
unsigned Slot = ~0U;
for (unsigned I = 0, E = A.getNumSlots(); I != E; ++I)
if (A.getSlotIndex(I) == Index) {
@@ -1221,9 +1413,10 @@ AttrBuilder &AttrBuilder::removeAttributes(AttributeSet A, uint64_t Index) {
break;
}
- assert(Slot != ~0U && "Couldn't find index in AttributeSet!");
+ assert(Slot != ~0U && "Couldn't find index in AttributeList!");
- for (AttributeSet::iterator I = A.begin(Slot), E = A.end(Slot); I != E; ++I) {
+ for (AttributeList::iterator I = A.begin(Slot), E = A.end(Slot); I != E;
+ ++I) {
Attribute Attr = *I;
if (Attr.isEnumAttribute() || Attr.isIntAttribute()) {
removeAttribute(Attr.getKindAsEnum());
@@ -1359,7 +1552,7 @@ bool AttrBuilder::overlaps(const AttrBuilder &B) const {
return true;
// Then check if any target dependent ones do.
- for (auto I : td_attrs())
+ for (const auto &I : td_attrs())
if (B.contains(I.first))
return true;
@@ -1374,7 +1567,7 @@ bool AttrBuilder::hasAttributes() const {
return !Attrs.none() || !TargetDepAttrs.empty();
}
-bool AttrBuilder::hasAttributes(AttributeSet A, uint64_t Index) const {
+bool AttrBuilder::hasAttributes(AttributeList A, uint64_t Index) const {
unsigned Slot = ~0U;
for (unsigned I = 0, E = A.getNumSlots(); I != E; ++I)
if (A.getSlotIndex(I) == Index) {
@@ -1384,7 +1577,8 @@ bool AttrBuilder::hasAttributes(AttributeSet A, uint64_t Index) const {
assert(Slot != ~0U && "Couldn't find the index!");
- for (AttributeSet::iterator I = A.begin(Slot), E = A.end(Slot); I != E; ++I) {
+ for (AttributeList::iterator I = A.begin(Slot), E = A.end(Slot); I != E;
+ ++I) {
Attribute Attr = *I;
if (Attr.isEnumAttribute() || Attr.isIntAttribute()) {
if (Attrs[I->getKindAsEnum()])
@@ -1485,16 +1679,15 @@ static void adjustCallerSSPLevel(Function &Caller, const Function &Callee) {
B.addAttribute(Attribute::StackProtect)
.addAttribute(Attribute::StackProtectStrong)
.addAttribute(Attribute::StackProtectReq);
- AttributeSet OldSSPAttr = AttributeSet::get(Caller.getContext(),
- AttributeSet::FunctionIndex,
- B);
+ AttributeList OldSSPAttr =
+ AttributeList::get(Caller.getContext(), AttributeList::FunctionIndex, B);
if (Callee.hasFnAttribute(Attribute::StackProtectReq)) {
- Caller.removeAttributes(AttributeSet::FunctionIndex, OldSSPAttr);
+ Caller.removeAttributes(AttributeList::FunctionIndex, OldSSPAttr);
Caller.addFnAttr(Attribute::StackProtectReq);
} else if (Callee.hasFnAttribute(Attribute::StackProtectStrong) &&
!Caller.hasFnAttribute(Attribute::StackProtectReq)) {
- Caller.removeAttributes(AttributeSet::FunctionIndex, OldSSPAttr);
+ Caller.removeAttributes(AttributeList::FunctionIndex, OldSSPAttr);
Caller.addFnAttr(Attribute::StackProtectStrong);
} else if (Callee.hasFnAttribute(Attribute::StackProtect) &&
!Caller.hasFnAttribute(Attribute::StackProtectReq) &&
@@ -1510,7 +1703,6 @@ bool AttributeFuncs::areInlineCompatible(const Function &Caller,
return hasCompatibleFnAttrs(Caller, Callee);
}
-
void AttributeFuncs::mergeAttributesForInlining(Function &Caller,
const Function &Callee) {
mergeFnAttrs(Caller, Callee);
diff --git a/lib/IR/AutoUpgrade.cpp b/lib/IR/AutoUpgrade.cpp
index e3a7bae02e0a..2897434a2b8d 100644
--- a/lib/IR/AutoUpgrade.cpp
+++ b/lib/IR/AutoUpgrade.cpp
@@ -14,6 +14,7 @@
//===----------------------------------------------------------------------===//
#include "llvm/IR/AutoUpgrade.h"
+#include "llvm/ADT/StringSwitch.h"
#include "llvm/IR/CFG.h"
#include "llvm/IR/CallSite.h"
#include "llvm/IR/Constants.h"
@@ -33,10 +34,10 @@ using namespace llvm;
static void rename(GlobalValue *GV) { GV->setName(GV->getName() + ".old"); }
-// Upgrade the declarations of the SSE4.1 functions whose arguments have
+// Upgrade the declarations of the SSE4.1 ptest intrinsics whose arguments have
// changed their type from v4f32 to v2i64.
-static bool UpgradeSSE41Function(Function* F, Intrinsic::ID IID,
- Function *&NewFn) {
+static bool UpgradePTESTIntrinsic(Function *F, Intrinsic::ID IID,
+ Function *&NewFn) {
// Check whether this is an old version of the function, which received
// v4f32 arguments.
Type *Arg0Type = F->getFunctionType()->getParamType(0);
@@ -65,6 +66,265 @@ static bool UpgradeX86IntrinsicsWith8BitMask(Function *F, Intrinsic::ID IID,
return true;
}
+static bool ShouldUpgradeX86Intrinsic(Function *F, StringRef Name) {
+ // All of the intrinsic matches below should be marked with the LLVM
+ // version that started autoupgrading them. At some point in the future we
+ // would like to use this information to remove upgrade code for some older
+ // intrinsics. It is currently undecided how we will determine that future
+ // point.
+ if (Name.startswith("sse2.pcmpeq.") || // Added in 3.1
+ Name.startswith("sse2.pcmpgt.") || // Added in 3.1
+ Name.startswith("avx2.pcmpeq.") || // Added in 3.1
+ Name.startswith("avx2.pcmpgt.") || // Added in 3.1
+ Name.startswith("avx512.mask.pcmpeq.") || // Added in 3.9
+ Name.startswith("avx512.mask.pcmpgt.") || // Added in 3.9
+ Name == "sse.add.ss" || // Added in 4.0
+ Name == "sse2.add.sd" || // Added in 4.0
+ Name == "sse.sub.ss" || // Added in 4.0
+ Name == "sse2.sub.sd" || // Added in 4.0
+ Name == "sse.mul.ss" || // Added in 4.0
+ Name == "sse2.mul.sd" || // Added in 4.0
+ Name == "sse.div.ss" || // Added in 4.0
+ Name == "sse2.div.sd" || // Added in 4.0
+ Name == "sse41.pmaxsb" || // Added in 3.9
+ Name == "sse2.pmaxs.w" || // Added in 3.9
+ Name == "sse41.pmaxsd" || // Added in 3.9
+ Name == "sse2.pmaxu.b" || // Added in 3.9
+ Name == "sse41.pmaxuw" || // Added in 3.9
+ Name == "sse41.pmaxud" || // Added in 3.9
+ Name == "sse41.pminsb" || // Added in 3.9
+ Name == "sse2.pmins.w" || // Added in 3.9
+ Name == "sse41.pminsd" || // Added in 3.9
+ Name == "sse2.pminu.b" || // Added in 3.9
+ Name == "sse41.pminuw" || // Added in 3.9
+ Name == "sse41.pminud" || // Added in 3.9
+ Name.startswith("avx512.mask.pshuf.b.") || // Added in 4.0
+ Name.startswith("avx2.pmax") || // Added in 3.9
+ Name.startswith("avx2.pmin") || // Added in 3.9
+ Name.startswith("avx512.mask.pmax") || // Added in 4.0
+ Name.startswith("avx512.mask.pmin") || // Added in 4.0
+ Name.startswith("avx2.vbroadcast") || // Added in 3.8
+ Name.startswith("avx2.pbroadcast") || // Added in 3.8
+ Name.startswith("avx.vpermil.") || // Added in 3.1
+ Name.startswith("sse2.pshuf") || // Added in 3.9
+ Name.startswith("avx512.pbroadcast") || // Added in 3.9
+ Name.startswith("avx512.mask.broadcast.s") || // Added in 3.9
+ Name.startswith("avx512.mask.movddup") || // Added in 3.9
+ Name.startswith("avx512.mask.movshdup") || // Added in 3.9
+ Name.startswith("avx512.mask.movsldup") || // Added in 3.9
+ Name.startswith("avx512.mask.pshuf.d.") || // Added in 3.9
+ Name.startswith("avx512.mask.pshufl.w.") || // Added in 3.9
+ Name.startswith("avx512.mask.pshufh.w.") || // Added in 3.9
+ Name.startswith("avx512.mask.shuf.p") || // Added in 4.0
+ Name.startswith("avx512.mask.vpermil.p") || // Added in 3.9
+ Name.startswith("avx512.mask.perm.df.") || // Added in 3.9
+ Name.startswith("avx512.mask.perm.di.") || // Added in 3.9
+ Name.startswith("avx512.mask.punpckl") || // Added in 3.9
+ Name.startswith("avx512.mask.punpckh") || // Added in 3.9
+ Name.startswith("avx512.mask.unpckl.") || // Added in 3.9
+ Name.startswith("avx512.mask.unpckh.") || // Added in 3.9
+ Name.startswith("avx512.mask.pand.") || // Added in 3.9
+ Name.startswith("avx512.mask.pandn.") || // Added in 3.9
+ Name.startswith("avx512.mask.por.") || // Added in 3.9
+ Name.startswith("avx512.mask.pxor.") || // Added in 3.9
+ Name.startswith("avx512.mask.and.") || // Added in 3.9
+ Name.startswith("avx512.mask.andn.") || // Added in 3.9
+ Name.startswith("avx512.mask.or.") || // Added in 3.9
+ Name.startswith("avx512.mask.xor.") || // Added in 3.9
+ Name.startswith("avx512.mask.padd.") || // Added in 4.0
+ Name.startswith("avx512.mask.psub.") || // Added in 4.0
+ Name.startswith("avx512.mask.pmull.") || // Added in 4.0
+ Name.startswith("avx512.mask.cvtdq2pd.") || // Added in 4.0
+ Name.startswith("avx512.mask.cvtudq2pd.") || // Added in 4.0
+ Name.startswith("avx512.mask.pmul.dq.") || // Added in 4.0
+ Name.startswith("avx512.mask.pmulu.dq.") || // Added in 4.0
+ Name.startswith("avx512.mask.packsswb.") || // Added in 5.0
+ Name.startswith("avx512.mask.packssdw.") || // Added in 5.0
+ Name.startswith("avx512.mask.packuswb.") || // Added in 5.0
+ Name.startswith("avx512.mask.packusdw.") || // Added in 5.0
+ Name == "avx512.mask.add.pd.128" || // Added in 4.0
+ Name == "avx512.mask.add.pd.256" || // Added in 4.0
+ Name == "avx512.mask.add.ps.128" || // Added in 4.0
+ Name == "avx512.mask.add.ps.256" || // Added in 4.0
+ Name == "avx512.mask.div.pd.128" || // Added in 4.0
+ Name == "avx512.mask.div.pd.256" || // Added in 4.0
+ Name == "avx512.mask.div.ps.128" || // Added in 4.0
+ Name == "avx512.mask.div.ps.256" || // Added in 4.0
+ Name == "avx512.mask.mul.pd.128" || // Added in 4.0
+ Name == "avx512.mask.mul.pd.256" || // Added in 4.0
+ Name == "avx512.mask.mul.ps.128" || // Added in 4.0
+ Name == "avx512.mask.mul.ps.256" || // Added in 4.0
+ Name == "avx512.mask.sub.pd.128" || // Added in 4.0
+ Name == "avx512.mask.sub.pd.256" || // Added in 4.0
+ Name == "avx512.mask.sub.ps.128" || // Added in 4.0
+ Name == "avx512.mask.sub.ps.256" || // Added in 4.0
+ Name == "avx512.mask.max.pd.128" || // Added in 5.0
+ Name == "avx512.mask.max.pd.256" || // Added in 5.0
+ Name == "avx512.mask.max.ps.128" || // Added in 5.0
+ Name == "avx512.mask.max.ps.256" || // Added in 5.0
+ Name == "avx512.mask.min.pd.128" || // Added in 5.0
+ Name == "avx512.mask.min.pd.256" || // Added in 5.0
+ Name == "avx512.mask.min.ps.128" || // Added in 5.0
+ Name == "avx512.mask.min.ps.256" || // Added in 5.0
+ Name.startswith("avx512.mask.vpermilvar.") || // Added in 4.0
+ Name.startswith("avx512.mask.psll.d") || // Added in 4.0
+ Name.startswith("avx512.mask.psll.q") || // Added in 4.0
+ Name.startswith("avx512.mask.psll.w") || // Added in 4.0
+ Name.startswith("avx512.mask.psra.d") || // Added in 4.0
+ Name.startswith("avx512.mask.psra.q") || // Added in 4.0
+ Name.startswith("avx512.mask.psra.w") || // Added in 4.0
+ Name.startswith("avx512.mask.psrl.d") || // Added in 4.0
+ Name.startswith("avx512.mask.psrl.q") || // Added in 4.0
+ Name.startswith("avx512.mask.psrl.w") || // Added in 4.0
+ Name.startswith("avx512.mask.pslli") || // Added in 4.0
+ Name.startswith("avx512.mask.psrai") || // Added in 4.0
+ Name.startswith("avx512.mask.psrli") || // Added in 4.0
+ Name.startswith("avx512.mask.psllv") || // Added in 4.0
+ Name.startswith("avx512.mask.psrav") || // Added in 4.0
+ Name.startswith("avx512.mask.psrlv") || // Added in 4.0
+ Name.startswith("sse41.pmovsx") || // Added in 3.8
+ Name.startswith("sse41.pmovzx") || // Added in 3.9
+ Name.startswith("avx2.pmovsx") || // Added in 3.9
+ Name.startswith("avx2.pmovzx") || // Added in 3.9
+ Name.startswith("avx512.mask.pmovsx") || // Added in 4.0
+ Name.startswith("avx512.mask.pmovzx") || // Added in 4.0
+ Name.startswith("avx512.mask.lzcnt.") || // Added in 5.0
+ Name == "sse2.cvtdq2pd" || // Added in 3.9
+ Name == "sse2.cvtps2pd" || // Added in 3.9
+ Name == "avx.cvtdq2.pd.256" || // Added in 3.9
+ Name == "avx.cvt.ps2.pd.256" || // Added in 3.9
+ Name.startswith("avx.vinsertf128.") || // Added in 3.7
+ Name == "avx2.vinserti128" || // Added in 3.7
+ Name.startswith("avx512.mask.insert") || // Added in 4.0
+ Name.startswith("avx.vextractf128.") || // Added in 3.7
+ Name == "avx2.vextracti128" || // Added in 3.7
+ Name.startswith("avx512.mask.vextract") || // Added in 4.0
+ Name.startswith("sse4a.movnt.") || // Added in 3.9
+ Name.startswith("avx.movnt.") || // Added in 3.2
+ Name.startswith("avx512.storent.") || // Added in 3.9
+ Name == "sse41.movntdqa" || // Added in 5.0
+ Name == "avx2.movntdqa" || // Added in 5.0
+ Name == "avx512.movntdqa" || // Added in 5.0
+ Name == "sse2.storel.dq" || // Added in 3.9
+ Name.startswith("sse.storeu.") || // Added in 3.9
+ Name.startswith("sse2.storeu.") || // Added in 3.9
+ Name.startswith("avx.storeu.") || // Added in 3.9
+ Name.startswith("avx512.mask.storeu.") || // Added in 3.9
+ Name.startswith("avx512.mask.store.p") || // Added in 3.9
+ Name.startswith("avx512.mask.store.b.") || // Added in 3.9
+ Name.startswith("avx512.mask.store.w.") || // Added in 3.9
+ Name.startswith("avx512.mask.store.d.") || // Added in 3.9
+ Name.startswith("avx512.mask.store.q.") || // Added in 3.9
+ Name.startswith("avx512.mask.loadu.") || // Added in 3.9
+ Name.startswith("avx512.mask.load.") || // Added in 3.9
+ Name == "sse42.crc32.64.8" || // Added in 3.4
+ Name.startswith("avx.vbroadcast.s") || // Added in 3.5
+ Name.startswith("avx512.mask.palignr.") || // Added in 3.9
+ Name.startswith("avx512.mask.valign.") || // Added in 4.0
+ Name.startswith("sse2.psll.dq") || // Added in 3.7
+ Name.startswith("sse2.psrl.dq") || // Added in 3.7
+ Name.startswith("avx2.psll.dq") || // Added in 3.7
+ Name.startswith("avx2.psrl.dq") || // Added in 3.7
+ Name.startswith("avx512.psll.dq") || // Added in 3.9
+ Name.startswith("avx512.psrl.dq") || // Added in 3.9
+ Name == "sse41.pblendw" || // Added in 3.7
+ Name.startswith("sse41.blendp") || // Added in 3.7
+ Name.startswith("avx.blend.p") || // Added in 3.7
+ Name == "avx2.pblendw" || // Added in 3.7
+ Name.startswith("avx2.pblendd.") || // Added in 3.7
+ Name.startswith("avx.vbroadcastf128") || // Added in 4.0
+ Name == "avx2.vbroadcasti128" || // Added in 3.7
+ Name == "xop.vpcmov" || // Added in 3.8
+ Name == "xop.vpcmov.256" || // Added in 5.0
+ Name.startswith("avx512.mask.move.s") || // Added in 4.0
+ Name.startswith("avx512.cvtmask2") || // Added in 5.0
+ (Name.startswith("xop.vpcom") && // Added in 3.2
+ F->arg_size() == 2))
+ return true;
+
+ return false;
+}
+
+static bool UpgradeX86IntrinsicFunction(Function *F, StringRef Name,
+ Function *&NewFn) {
+ // Only handle intrinsics that start with "x86.".
+ if (!Name.startswith("x86."))
+ return false;
+ // Remove "x86." prefix.
+ Name = Name.substr(4);
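+  // e.g. "x86.sse2.pcmpeq.b" is looked up below as "sse2.pcmpeq.b".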
+
+ if (ShouldUpgradeX86Intrinsic(F, Name)) {
+ NewFn = nullptr;
+ return true;
+ }
+
+ // SSE4.1 ptest functions may have an old signature.
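+  // (Historically these were declared with <4 x float> operands; the call
+  // rewrite in UpgradeIntrinsicCall bitcasts them to the current <2 x i64>
+  // signature.)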
+ if (Name.startswith("sse41.ptest")) { // Added in 3.2
+ if (Name.substr(11) == "c")
+ return UpgradePTESTIntrinsic(F, Intrinsic::x86_sse41_ptestc, NewFn);
+ if (Name.substr(11) == "z")
+ return UpgradePTESTIntrinsic(F, Intrinsic::x86_sse41_ptestz, NewFn);
+ if (Name.substr(11) == "nzc")
+ return UpgradePTESTIntrinsic(F, Intrinsic::x86_sse41_ptestnzc, NewFn);
+ }
+ // Several blend and other instructions with masks used the wrong number of
+ // bits.
+ if (Name == "sse41.insertps") // Added in 3.6
+ return UpgradeX86IntrinsicsWith8BitMask(F, Intrinsic::x86_sse41_insertps,
+ NewFn);
+ if (Name == "sse41.dppd") // Added in 3.6
+ return UpgradeX86IntrinsicsWith8BitMask(F, Intrinsic::x86_sse41_dppd,
+ NewFn);
+ if (Name == "sse41.dpps") // Added in 3.6
+ return UpgradeX86IntrinsicsWith8BitMask(F, Intrinsic::x86_sse41_dpps,
+ NewFn);
+ if (Name == "sse41.mpsadbw") // Added in 3.6
+ return UpgradeX86IntrinsicsWith8BitMask(F, Intrinsic::x86_sse41_mpsadbw,
+ NewFn);
+ if (Name == "avx.dp.ps.256") // Added in 3.6
+ return UpgradeX86IntrinsicsWith8BitMask(F, Intrinsic::x86_avx_dp_ps_256,
+ NewFn);
+ if (Name == "avx2.mpsadbw") // Added in 3.6
+ return UpgradeX86IntrinsicsWith8BitMask(F, Intrinsic::x86_avx2_mpsadbw,
+ NewFn);
+
+ // frcz.ss/sd may need to have an argument dropped. Added in 3.2
+ if (Name.startswith("xop.vfrcz.ss") && F->arg_size() == 2) {
+ rename(F);
+ NewFn = Intrinsic::getDeclaration(F->getParent(),
+ Intrinsic::x86_xop_vfrcz_ss);
+ return true;
+ }
+ if (Name.startswith("xop.vfrcz.sd") && F->arg_size() == 2) {
+ rename(F);
+ NewFn = Intrinsic::getDeclaration(F->getParent(),
+ Intrinsic::x86_xop_vfrcz_sd);
+ return true;
+ }
+ // Upgrade any XOP PERMIL2 index operand still using a float/double vector.
+ if (Name.startswith("xop.vpermil2")) { // Added in 3.9
+ auto Idx = F->getFunctionType()->getParamType(2);
+ if (Idx->isFPOrFPVectorTy()) {
+ rename(F);
+ unsigned IdxSize = Idx->getPrimitiveSizeInBits();
+ unsigned EltSize = Idx->getScalarSizeInBits();
+ Intrinsic::ID Permil2ID;
+ if (EltSize == 64 && IdxSize == 128)
+ Permil2ID = Intrinsic::x86_xop_vpermil2pd;
+ else if (EltSize == 32 && IdxSize == 128)
+ Permil2ID = Intrinsic::x86_xop_vpermil2ps;
+ else if (EltSize == 64 && IdxSize == 256)
+ Permil2ID = Intrinsic::x86_xop_vpermil2pd_256;
+ else
+ Permil2ID = Intrinsic::x86_xop_vpermil2ps_256;
+ NewFn = Intrinsic::getDeclaration(F->getParent(), Permil2ID);
+ return true;
+ }
+ }
+
+ return false;
+}
+
static bool UpgradeIntrinsicFunction1(Function *F, Function *&NewFn) {
assert(F && "Illegal to upgrade a non-existent Function.");
@@ -155,26 +415,31 @@ static bool UpgradeIntrinsicFunction1(Function *F, Function *&NewFn) {
}
break;
}
- case 'i': {
- if (Name.startswith("invariant.start")) {
+ case 'i':
+ case 'l': {
+ bool IsLifetimeStart = Name.startswith("lifetime.start");
+ if (IsLifetimeStart || Name.startswith("invariant.start")) {
+ Intrinsic::ID ID = IsLifetimeStart ?
+ Intrinsic::lifetime_start : Intrinsic::invariant_start;
auto Args = F->getFunctionType()->params();
Type* ObjectPtr[1] = {Args[1]};
- if (F->getName() !=
- Intrinsic::getName(Intrinsic::invariant_start, ObjectPtr)) {
+ if (F->getName() != Intrinsic::getName(ID, ObjectPtr)) {
rename(F);
- NewFn = Intrinsic::getDeclaration(
- F->getParent(), Intrinsic::invariant_start, ObjectPtr);
+ NewFn = Intrinsic::getDeclaration(F->getParent(), ID, ObjectPtr);
return true;
}
}
- if (Name.startswith("invariant.end")) {
+
+ bool IsLifetimeEnd = Name.startswith("lifetime.end");
+ if (IsLifetimeEnd || Name.startswith("invariant.end")) {
+ Intrinsic::ID ID = IsLifetimeEnd ?
+ Intrinsic::lifetime_end : Intrinsic::invariant_end;
+
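+      // lifetime.end is (i64 size, i8* ptr) while invariant.end is
+      // (start token, i64 size, i8* ptr), so the object pointer is operand
+      // 1 or 2 respectively.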
auto Args = F->getFunctionType()->params();
- Type* ObjectPtr[1] = {Args[2]};
- if (F->getName() !=
- Intrinsic::getName(Intrinsic::invariant_end, ObjectPtr)) {
+ Type* ObjectPtr[1] = {Args[IsLifetimeEnd ? 1 : 2]};
+ if (F->getName() != Intrinsic::getName(ID, ObjectPtr)) {
rename(F);
- NewFn = Intrinsic::getDeclaration(F->getParent(),
- Intrinsic::invariant_end, ObjectPtr);
+ NewFn = Intrinsic::getDeclaration(F->getParent(), ID, ObjectPtr);
return true;
}
}
@@ -204,16 +469,48 @@ static bool UpgradeIntrinsicFunction1(Function *F, Function *&NewFn) {
}
break;
}
+ case 'n': {
+ if (Name.startswith("nvvm.")) {
+ Name = Name.substr(5);
+
+ // The following nvvm intrinsics correspond exactly to an LLVM intrinsic.
+ Intrinsic::ID IID = StringSwitch<Intrinsic::ID>(Name)
+ .Cases("brev32", "brev64", Intrinsic::bitreverse)
+ .Case("clz.i", Intrinsic::ctlz)
+ .Case("popc.i", Intrinsic::ctpop)
+ .Default(Intrinsic::not_intrinsic);
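+      // e.g. "llvm.nvvm.brev32" maps directly to "llvm.bitreverse.i32"; the
+      // overload below is taken from the return type.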
+ if (IID != Intrinsic::not_intrinsic && F->arg_size() == 1) {
+ NewFn = Intrinsic::getDeclaration(F->getParent(), IID,
+ {F->getReturnType()});
+ return true;
+ }
+ // The following nvvm intrinsics correspond exactly to an LLVM idiom, but
+ // not to an intrinsic alone. We expand them in UpgradeIntrinsicCall.
+ //
+ // TODO: We could add lohi.i2d.
+ bool Expand = StringSwitch<bool>(Name)
+ .Cases("abs.i", "abs.ll", true)
+ .Cases("clz.ll", "popc.ll", "h2f", true)
+ .Cases("max.i", "max.ll", "max.ui", "max.ull", true)
+ .Cases("min.i", "min.ll", "min.ui", "min.ull", true)
+ .Default(false);
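+      // These have no single-intrinsic equivalent; e.g. "abs.i" is expanded
+      // in UpgradeIntrinsicCall into an icmp/select idiom.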
+ if (Expand) {
+ NewFn = nullptr;
+ return true;
+ }
+    }
+    break;
+  }
case 'o':
// We only need to change the name to match the mangling including the
// address space.
- if (F->arg_size() == 2 && Name.startswith("objectsize.")) {
+ if (Name.startswith("objectsize.")) {
Type *Tys[2] = { F->getReturnType(), F->arg_begin()->getType() };
- if (F->getName() != Intrinsic::getName(Intrinsic::objectsize, Tys)) {
+ if (F->arg_size() == 2 ||
+ F->getName() != Intrinsic::getName(Intrinsic::objectsize, Tys)) {
rename(F);
- NewFn = Intrinsic::getDeclaration(F->getParent(),
- Intrinsic::objectsize, Tys);
+ NewFn = Intrinsic::getDeclaration(F->getParent(), Intrinsic::objectsize,
+ Tys);
return true;
}
}
@@ -226,236 +523,15 @@ static bool UpgradeIntrinsicFunction1(Function *F, Function *&NewFn) {
}
break;
- case 'x': {
- bool IsX86 = Name.startswith("x86.");
- if (IsX86)
- Name = Name.substr(4);
-
- // All of the intrinsics matches below should be marked with which llvm
- // version started autoupgrading them. At some point in the future we would
- // like to use this information to remove upgrade code for some older
- // intrinsics. It is currently undecided how we will determine that future
- // point.
- if (IsX86 &&
- (Name.startswith("sse2.pcmpeq.") || // Added in 3.1
- Name.startswith("sse2.pcmpgt.") || // Added in 3.1
- Name.startswith("avx2.pcmpeq.") || // Added in 3.1
- Name.startswith("avx2.pcmpgt.") || // Added in 3.1
- Name.startswith("avx512.mask.pcmpeq.") || // Added in 3.9
- Name.startswith("avx512.mask.pcmpgt.") || // Added in 3.9
- Name == "sse.add.ss" || // Added in 4.0
- Name == "sse2.add.sd" || // Added in 4.0
- Name == "sse.sub.ss" || // Added in 4.0
- Name == "sse2.sub.sd" || // Added in 4.0
- Name == "sse.mul.ss" || // Added in 4.0
- Name == "sse2.mul.sd" || // Added in 4.0
- Name == "sse.div.ss" || // Added in 4.0
- Name == "sse2.div.sd" || // Added in 4.0
- Name == "sse41.pmaxsb" || // Added in 3.9
- Name == "sse2.pmaxs.w" || // Added in 3.9
- Name == "sse41.pmaxsd" || // Added in 3.9
- Name == "sse2.pmaxu.b" || // Added in 3.9
- Name == "sse41.pmaxuw" || // Added in 3.9
- Name == "sse41.pmaxud" || // Added in 3.9
- Name == "sse41.pminsb" || // Added in 3.9
- Name == "sse2.pmins.w" || // Added in 3.9
- Name == "sse41.pminsd" || // Added in 3.9
- Name == "sse2.pminu.b" || // Added in 3.9
- Name == "sse41.pminuw" || // Added in 3.9
- Name == "sse41.pminud" || // Added in 3.9
- Name.startswith("avx512.mask.pshuf.b.") || // Added in 4.0
- Name.startswith("avx2.pmax") || // Added in 3.9
- Name.startswith("avx2.pmin") || // Added in 3.9
- Name.startswith("avx512.mask.pmax") || // Added in 4.0
- Name.startswith("avx512.mask.pmin") || // Added in 4.0
- Name.startswith("avx2.vbroadcast") || // Added in 3.8
- Name.startswith("avx2.pbroadcast") || // Added in 3.8
- Name.startswith("avx.vpermil.") || // Added in 3.1
- Name.startswith("sse2.pshuf") || // Added in 3.9
- Name.startswith("avx512.pbroadcast") || // Added in 3.9
- Name.startswith("avx512.mask.broadcast.s") || // Added in 3.9
- Name.startswith("avx512.mask.movddup") || // Added in 3.9
- Name.startswith("avx512.mask.movshdup") || // Added in 3.9
- Name.startswith("avx512.mask.movsldup") || // Added in 3.9
- Name.startswith("avx512.mask.pshuf.d.") || // Added in 3.9
- Name.startswith("avx512.mask.pshufl.w.") || // Added in 3.9
- Name.startswith("avx512.mask.pshufh.w.") || // Added in 3.9
- Name.startswith("avx512.mask.shuf.p") || // Added in 4.0
- Name.startswith("avx512.mask.vpermil.p") || // Added in 3.9
- Name.startswith("avx512.mask.perm.df.") || // Added in 3.9
- Name.startswith("avx512.mask.perm.di.") || // Added in 3.9
- Name.startswith("avx512.mask.punpckl") || // Added in 3.9
- Name.startswith("avx512.mask.punpckh") || // Added in 3.9
- Name.startswith("avx512.mask.unpckl.") || // Added in 3.9
- Name.startswith("avx512.mask.unpckh.") || // Added in 3.9
- Name.startswith("avx512.mask.pand.") || // Added in 3.9
- Name.startswith("avx512.mask.pandn.") || // Added in 3.9
- Name.startswith("avx512.mask.por.") || // Added in 3.9
- Name.startswith("avx512.mask.pxor.") || // Added in 3.9
- Name.startswith("avx512.mask.and.") || // Added in 3.9
- Name.startswith("avx512.mask.andn.") || // Added in 3.9
- Name.startswith("avx512.mask.or.") || // Added in 3.9
- Name.startswith("avx512.mask.xor.") || // Added in 3.9
- Name.startswith("avx512.mask.padd.") || // Added in 4.0
- Name.startswith("avx512.mask.psub.") || // Added in 4.0
- Name.startswith("avx512.mask.pmull.") || // Added in 4.0
- Name.startswith("avx512.mask.cvtdq2pd.") || // Added in 4.0
- Name.startswith("avx512.mask.cvtudq2pd.") || // Added in 4.0
- Name.startswith("avx512.mask.pmul.dq.") || // Added in 4.0
- Name.startswith("avx512.mask.pmulu.dq.") || // Added in 4.0
- Name == "avx512.mask.add.pd.128" || // Added in 4.0
- Name == "avx512.mask.add.pd.256" || // Added in 4.0
- Name == "avx512.mask.add.ps.128" || // Added in 4.0
- Name == "avx512.mask.add.ps.256" || // Added in 4.0
- Name == "avx512.mask.div.pd.128" || // Added in 4.0
- Name == "avx512.mask.div.pd.256" || // Added in 4.0
- Name == "avx512.mask.div.ps.128" || // Added in 4.0
- Name == "avx512.mask.div.ps.256" || // Added in 4.0
- Name == "avx512.mask.mul.pd.128" || // Added in 4.0
- Name == "avx512.mask.mul.pd.256" || // Added in 4.0
- Name == "avx512.mask.mul.ps.128" || // Added in 4.0
- Name == "avx512.mask.mul.ps.256" || // Added in 4.0
- Name == "avx512.mask.sub.pd.128" || // Added in 4.0
- Name == "avx512.mask.sub.pd.256" || // Added in 4.0
- Name == "avx512.mask.sub.ps.128" || // Added in 4.0
- Name == "avx512.mask.sub.ps.256" || // Added in 4.0
- Name.startswith("avx512.mask.vpermilvar.") || // Added in 4.0
- Name.startswith("avx512.mask.psll.d") || // Added in 4.0
- Name.startswith("avx512.mask.psll.q") || // Added in 4.0
- Name.startswith("avx512.mask.psll.w") || // Added in 4.0
- Name.startswith("avx512.mask.psra.d") || // Added in 4.0
- Name.startswith("avx512.mask.psra.q") || // Added in 4.0
- Name.startswith("avx512.mask.psra.w") || // Added in 4.0
- Name.startswith("avx512.mask.psrl.d") || // Added in 4.0
- Name.startswith("avx512.mask.psrl.q") || // Added in 4.0
- Name.startswith("avx512.mask.psrl.w") || // Added in 4.0
- Name.startswith("avx512.mask.pslli") || // Added in 4.0
- Name.startswith("avx512.mask.psrai") || // Added in 4.0
- Name.startswith("avx512.mask.psrli") || // Added in 4.0
- Name.startswith("avx512.mask.psllv") || // Added in 4.0
- Name.startswith("avx512.mask.psrav") || // Added in 4.0
- Name.startswith("avx512.mask.psrlv") || // Added in 4.0
- Name.startswith("sse41.pmovsx") || // Added in 3.8
- Name.startswith("sse41.pmovzx") || // Added in 3.9
- Name.startswith("avx2.pmovsx") || // Added in 3.9
- Name.startswith("avx2.pmovzx") || // Added in 3.9
- Name.startswith("avx512.mask.pmovsx") || // Added in 4.0
- Name.startswith("avx512.mask.pmovzx") || // Added in 4.0
- Name == "sse2.cvtdq2pd" || // Added in 3.9
- Name == "sse2.cvtps2pd" || // Added in 3.9
- Name == "avx.cvtdq2.pd.256" || // Added in 3.9
- Name == "avx.cvt.ps2.pd.256" || // Added in 3.9
- Name.startswith("avx.vinsertf128.") || // Added in 3.7
- Name == "avx2.vinserti128" || // Added in 3.7
- Name.startswith("avx512.mask.insert") || // Added in 4.0
- Name.startswith("avx.vextractf128.") || // Added in 3.7
- Name == "avx2.vextracti128" || // Added in 3.7
- Name.startswith("avx512.mask.vextract") || // Added in 4.0
- Name.startswith("sse4a.movnt.") || // Added in 3.9
- Name.startswith("avx.movnt.") || // Added in 3.2
- Name.startswith("avx512.storent.") || // Added in 3.9
- Name == "sse2.storel.dq" || // Added in 3.9
- Name.startswith("sse.storeu.") || // Added in 3.9
- Name.startswith("sse2.storeu.") || // Added in 3.9
- Name.startswith("avx.storeu.") || // Added in 3.9
- Name.startswith("avx512.mask.storeu.") || // Added in 3.9
- Name.startswith("avx512.mask.store.p") || // Added in 3.9
- Name.startswith("avx512.mask.store.b.") || // Added in 3.9
- Name.startswith("avx512.mask.store.w.") || // Added in 3.9
- Name.startswith("avx512.mask.store.d.") || // Added in 3.9
- Name.startswith("avx512.mask.store.q.") || // Added in 3.9
- Name.startswith("avx512.mask.loadu.") || // Added in 3.9
- Name.startswith("avx512.mask.load.") || // Added in 3.9
- Name == "sse42.crc32.64.8" || // Added in 3.4
- Name.startswith("avx.vbroadcast.s") || // Added in 3.5
- Name.startswith("avx512.mask.palignr.") || // Added in 3.9
- Name.startswith("avx512.mask.valign.") || // Added in 4.0
- Name.startswith("sse2.psll.dq") || // Added in 3.7
- Name.startswith("sse2.psrl.dq") || // Added in 3.7
- Name.startswith("avx2.psll.dq") || // Added in 3.7
- Name.startswith("avx2.psrl.dq") || // Added in 3.7
- Name.startswith("avx512.psll.dq") || // Added in 3.9
- Name.startswith("avx512.psrl.dq") || // Added in 3.9
- Name == "sse41.pblendw" || // Added in 3.7
- Name.startswith("sse41.blendp") || // Added in 3.7
- Name.startswith("avx.blend.p") || // Added in 3.7
- Name == "avx2.pblendw" || // Added in 3.7
- Name.startswith("avx2.pblendd.") || // Added in 3.7
- Name.startswith("avx.vbroadcastf128") || // Added in 4.0
- Name == "avx2.vbroadcasti128" || // Added in 3.7
- Name == "xop.vpcmov" || // Added in 3.8
- Name.startswith("avx512.mask.move.s") || // Added in 4.0
- (Name.startswith("xop.vpcom") && // Added in 3.2
- F->arg_size() == 2))) {
- NewFn = nullptr;
- return true;
- }
- // SSE4.1 ptest functions may have an old signature.
- if (IsX86 && Name.startswith("sse41.ptest")) { // Added in 3.2
- if (Name.substr(11) == "c")
- return UpgradeSSE41Function(F, Intrinsic::x86_sse41_ptestc, NewFn);
- if (Name.substr(11) == "z")
- return UpgradeSSE41Function(F, Intrinsic::x86_sse41_ptestz, NewFn);
- if (Name.substr(11) == "nzc")
- return UpgradeSSE41Function(F, Intrinsic::x86_sse41_ptestnzc, NewFn);
- }
- // Several blend and other instructions with masks used the wrong number of
- // bits.
- if (IsX86 && Name == "sse41.insertps") // Added in 3.6
- return UpgradeX86IntrinsicsWith8BitMask(F, Intrinsic::x86_sse41_insertps,
- NewFn);
- if (IsX86 && Name == "sse41.dppd") // Added in 3.6
- return UpgradeX86IntrinsicsWith8BitMask(F, Intrinsic::x86_sse41_dppd,
- NewFn);
- if (IsX86 && Name == "sse41.dpps") // Added in 3.6
- return UpgradeX86IntrinsicsWith8BitMask(F, Intrinsic::x86_sse41_dpps,
- NewFn);
- if (IsX86 && Name == "sse41.mpsadbw") // Added in 3.6
- return UpgradeX86IntrinsicsWith8BitMask(F, Intrinsic::x86_sse41_mpsadbw,
- NewFn);
- if (IsX86 && Name == "avx.dp.ps.256") // Added in 3.6
- return UpgradeX86IntrinsicsWith8BitMask(F, Intrinsic::x86_avx_dp_ps_256,
- NewFn);
- if (IsX86 && Name == "avx2.mpsadbw") // Added in 3.6
- return UpgradeX86IntrinsicsWith8BitMask(F, Intrinsic::x86_avx2_mpsadbw,
- NewFn);
-
- // frcz.ss/sd may need to have an argument dropped. Added in 3.2
- if (IsX86 && Name.startswith("xop.vfrcz.ss") && F->arg_size() == 2) {
- rename(F);
- NewFn = Intrinsic::getDeclaration(F->getParent(),
- Intrinsic::x86_xop_vfrcz_ss);
- return true;
- }
- if (IsX86 && Name.startswith("xop.vfrcz.sd") && F->arg_size() == 2) {
- rename(F);
- NewFn = Intrinsic::getDeclaration(F->getParent(),
- Intrinsic::x86_xop_vfrcz_sd);
+ case 'x':
+ if (UpgradeX86IntrinsicFunction(F, Name, NewFn))
return true;
- }
- // Upgrade any XOP PERMIL2 index operand still using a float/double vector.
- if (IsX86 && Name.startswith("xop.vpermil2")) { // Added in 3.9
- auto Params = F->getFunctionType()->params();
- auto Idx = Params[2];
- if (Idx->getScalarType()->isFloatingPointTy()) {
- rename(F);
- unsigned IdxSize = Idx->getPrimitiveSizeInBits();
- unsigned EltSize = Idx->getScalarSizeInBits();
- Intrinsic::ID Permil2ID;
- if (EltSize == 64 && IdxSize == 128)
- Permil2ID = Intrinsic::x86_xop_vpermil2pd;
- else if (EltSize == 32 && IdxSize == 128)
- Permil2ID = Intrinsic::x86_xop_vpermil2ps;
- else if (EltSize == 64 && IdxSize == 256)
- Permil2ID = Intrinsic::x86_xop_vpermil2pd_256;
- else
- Permil2ID = Intrinsic::x86_xop_vpermil2ps_256;
- NewFn = Intrinsic::getDeclaration(F->getParent(), Permil2ID);
- return true;
- }
- }
- break;
}
+  // Remangle our intrinsic since we upgraded the mangling.
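+  // (For example, a declaration whose mangled type suffix no longer matches
+  // the current overload rules is redeclared under the canonical name.)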
+ auto Result = llvm::Intrinsic::remangleIntrinsicFunction(F);
+ if (Result != None) {
+ NewFn = Result.getValue();
+ return true;
}
// This may not belong here. This function is effectively being overloaded
@@ -733,6 +809,15 @@ static Value* upgradeMaskedMove(IRBuilder<> &Builder, CallInst &CI) {
return Builder.CreateInsertElement(A, Select, (uint64_t)0);
}
+
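+// Upgrade avx512.cvtmask2* by sign-extending each i1 mask bit into an
+// all-ones or all-zeros element of the result vector.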
+static Value* UpgradeMaskToInt(IRBuilder<> &Builder, CallInst &CI) {
+ Value* Op = CI.getArgOperand(0);
+ Type* ReturnOp = CI.getType();
+ unsigned NumElts = CI.getType()->getVectorNumElements();
+ Value *Mask = getX86MaskVec(Builder, Op, NumElts);
+ return Builder.CreateSExt(Mask, ReturnOp, "vpmovm2");
+}
+
/// Upgrade a call to an old intrinsic. All argument and return casting must be
/// provided to seamlessly integrate with existing context.
void llvm::UpgradeIntrinsicCall(CallInst *CI, Function *NewFn) {
@@ -753,6 +838,9 @@ void llvm::UpgradeIntrinsicCall(CallInst *CI, Function *NewFn) {
bool IsX86 = Name.startswith("x86.");
if (IsX86)
Name = Name.substr(4);
+ bool IsNVVM = Name.startswith("nvvm.");
+ if (IsNVVM)
+ Name = Name.substr(5);
if (IsX86 && Name.startswith("sse4a.movnt.")) {
Module *M = F->getParent();
@@ -838,18 +926,11 @@ void llvm::UpgradeIntrinsicCall(CallInst *CI, Function *NewFn) {
return;
}
- if (IsX86 && (Name.startswith("avx512.mask.storeu."))) {
- UpgradeMaskedStore(Builder, CI->getArgOperand(0), CI->getArgOperand(1),
- CI->getArgOperand(2), /*Aligned*/false);
-
- // Remove intrinsic.
- CI->eraseFromParent();
- return;
- }
-
- if (IsX86 && (Name.startswith("avx512.mask.store."))) {
+ if (IsX86 && (Name.startswith("avx512.mask.store"))) {
+ // "avx512.mask.storeu." or "avx512.mask.store."
+    bool Aligned = Name[17] != 'u'; // Unaligned form: "avx512.mask.storeu."
UpgradeMaskedStore(Builder, CI->getArgOperand(0), CI->getArgOperand(1),
- CI->getArgOperand(2), /*Aligned*/true);
+ CI->getArgOperand(2), Aligned);
// Remove intrinsic.
CI->eraseFromParent();
@@ -858,15 +939,12 @@ void llvm::UpgradeIntrinsicCall(CallInst *CI, Function *NewFn) {
Value *Rep;
// Upgrade packed integer vector compare intrinsics to compare instructions.
- if (IsX86 && (Name.startswith("sse2.pcmpeq.") ||
- Name.startswith("avx2.pcmpeq."))) {
- Rep = Builder.CreateICmpEQ(CI->getArgOperand(0), CI->getArgOperand(1),
- "pcmpeq");
- Rep = Builder.CreateSExt(Rep, CI->getType(), "");
- } else if (IsX86 && (Name.startswith("sse2.pcmpgt.") ||
- Name.startswith("avx2.pcmpgt."))) {
- Rep = Builder.CreateICmpSGT(CI->getArgOperand(0), CI->getArgOperand(1),
- "pcmpgt");
+ if (IsX86 && (Name.startswith("sse2.pcmp") ||
+ Name.startswith("avx2.pcmp"))) {
+ // "sse2.pcpmpeq." "sse2.pcmpgt." "avx2.pcmpeq." or "avx2.pcmpgt."
+ bool CmpEq = Name[9] == 'e';
+ Rep = Builder.CreateICmp(CmpEq ? ICmpInst::ICMP_EQ : ICmpInst::ICMP_SGT,
+ CI->getArgOperand(0), CI->getArgOperand(1));
Rep = Builder.CreateSExt(Rep, CI->getType(), "");
} else if (IsX86 && (Name == "sse.add.ss" || Name == "sse2.add.sd")) {
Type *I32Ty = Type::getInt32Ty(C);
@@ -904,10 +982,12 @@ void llvm::UpgradeIntrinsicCall(CallInst *CI, Function *NewFn) {
Rep = Builder.CreateInsertElement(CI->getArgOperand(0),
Builder.CreateFDiv(Elt0, Elt1),
ConstantInt::get(I32Ty, 0));
- } else if (IsX86 && Name.startswith("avx512.mask.pcmpeq.")) {
- Rep = upgradeMaskedCompare(Builder, *CI, ICmpInst::ICMP_EQ);
- } else if (IsX86 && Name.startswith("avx512.mask.pcmpgt.")) {
- Rep = upgradeMaskedCompare(Builder, *CI, ICmpInst::ICMP_SGT);
+ } else if (IsX86 && Name.startswith("avx512.mask.pcmp")) {
+ // "avx512.mask.pcmpeq." or "avx512.mask.pcmpgt."
+ bool CmpEq = Name[16] == 'e';
+ Rep = upgradeMaskedCompare(Builder, *CI,
+ CmpEq ? ICmpInst::ICMP_EQ
+ : ICmpInst::ICMP_SGT);
} else if (IsX86 && (Name == "sse41.pmaxsb" ||
Name == "sse2.pmaxs.w" ||
Name == "sse41.pmaxsd" ||
@@ -1019,15 +1099,11 @@ void llvm::UpgradeIntrinsicCall(CallInst *CI, Function *NewFn) {
Rep =
Builder.CreateCall(VPCOM, {CI->getArgOperand(0), CI->getArgOperand(1),
Builder.getInt8(Imm)});
- } else if (IsX86 && Name == "xop.vpcmov") {
- Value *Arg0 = CI->getArgOperand(0);
- Value *Arg1 = CI->getArgOperand(1);
+ } else if (IsX86 && Name.startswith("xop.vpcmov")) {
Value *Sel = CI->getArgOperand(2);
- unsigned NumElts = CI->getType()->getVectorNumElements();
- Constant *MinusOne = ConstantVector::getSplat(NumElts, Builder.getInt64(-1));
- Value *NotSel = Builder.CreateXor(Sel, MinusOne);
- Value *Sel0 = Builder.CreateAnd(Arg0, Sel);
- Value *Sel1 = Builder.CreateAnd(Arg1, NotSel);
+ Value *NotSel = Builder.CreateNot(Sel);
+ Value *Sel0 = Builder.CreateAnd(CI->getArgOperand(0), Sel);
+ Value *Sel1 = Builder.CreateAnd(CI->getArgOperand(1), NotSel);
Rep = Builder.CreateOr(Sel0, Sel1);
} else if (IsX86 && Name == "sse42.crc32.64.8") {
Function *CRC32 = Intrinsic::getDeclaration(F->getParent(),
@@ -1461,6 +1537,43 @@ void llvm::UpgradeIntrinsicCall(CallInst *CI, Function *NewFn) {
Rep = Builder.CreateFSub(CI->getArgOperand(0), CI->getArgOperand(1));
Rep = EmitX86Select(Builder, CI->getArgOperand(3), Rep,
CI->getArgOperand(2));
+ } else if (IsX86 && Name.startswith("avx512.mask.lzcnt.")) {
+ Rep = Builder.CreateCall(Intrinsic::getDeclaration(F->getParent(),
+ Intrinsic::ctlz,
+ CI->getType()),
+ { CI->getArgOperand(0), Builder.getInt1(false) });
+ Rep = EmitX86Select(Builder, CI->getArgOperand(2), Rep,
+ CI->getArgOperand(1));
+ } else if (IsX86 && (Name.startswith("avx512.mask.max.p") ||
+ Name.startswith("avx512.mask.min.p"))) {
+ bool IsMin = Name[13] == 'i';
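+    // Map back to the unmasked SSE/AVX max/min intrinsic for this vector
+    // and element width; the mask is applied with a select afterwards.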
+ VectorType *VecTy = cast<VectorType>(CI->getType());
+ unsigned VecWidth = VecTy->getPrimitiveSizeInBits();
+ unsigned EltWidth = VecTy->getScalarSizeInBits();
+ Intrinsic::ID IID;
+ if (!IsMin && VecWidth == 128 && EltWidth == 32)
+ IID = Intrinsic::x86_sse_max_ps;
+ else if (!IsMin && VecWidth == 128 && EltWidth == 64)
+ IID = Intrinsic::x86_sse2_max_pd;
+ else if (!IsMin && VecWidth == 256 && EltWidth == 32)
+ IID = Intrinsic::x86_avx_max_ps_256;
+ else if (!IsMin && VecWidth == 256 && EltWidth == 64)
+ IID = Intrinsic::x86_avx_max_pd_256;
+ else if (IsMin && VecWidth == 128 && EltWidth == 32)
+ IID = Intrinsic::x86_sse_min_ps;
+ else if (IsMin && VecWidth == 128 && EltWidth == 64)
+ IID = Intrinsic::x86_sse2_min_pd;
+ else if (IsMin && VecWidth == 256 && EltWidth == 32)
+ IID = Intrinsic::x86_avx_min_ps_256;
+ else if (IsMin && VecWidth == 256 && EltWidth == 64)
+ IID = Intrinsic::x86_avx_min_pd_256;
+ else
+ llvm_unreachable("Unexpected intrinsic");
+
+ Rep = Builder.CreateCall(Intrinsic::getDeclaration(F->getParent(), IID),
+ { CI->getArgOperand(0), CI->getArgOperand(1) });
+ Rep = EmitX86Select(Builder, CI->getArgOperand(3), Rep,
+ CI->getArgOperand(2));
} else if (IsX86 && Name.startswith("avx512.mask.pshuf.b.")) {
VectorType *VecTy = cast<VectorType>(CI->getType());
Intrinsic::ID IID;
@@ -1501,6 +1614,42 @@ void llvm::UpgradeIntrinsicCall(CallInst *CI, Function *NewFn) {
{ CI->getArgOperand(0), CI->getArgOperand(1) });
Rep = EmitX86Select(Builder, CI->getArgOperand(3), Rep,
CI->getArgOperand(2));
+ } else if (IsX86 && Name.startswith("avx512.mask.pack")) {
+ bool IsUnsigned = Name[16] == 'u';
+ bool IsDW = Name[18] == 'd';
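+    // Pick the unmasked pack intrinsic by signedness, source element width
+    // (words vs. dwords) and total vector width; the mask is applied below.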
+ VectorType *VecTy = cast<VectorType>(CI->getType());
+ Intrinsic::ID IID;
+ if (!IsUnsigned && !IsDW && VecTy->getPrimitiveSizeInBits() == 128)
+ IID = Intrinsic::x86_sse2_packsswb_128;
+ else if (!IsUnsigned && !IsDW && VecTy->getPrimitiveSizeInBits() == 256)
+ IID = Intrinsic::x86_avx2_packsswb;
+ else if (!IsUnsigned && !IsDW && VecTy->getPrimitiveSizeInBits() == 512)
+ IID = Intrinsic::x86_avx512_packsswb_512;
+ else if (!IsUnsigned && IsDW && VecTy->getPrimitiveSizeInBits() == 128)
+ IID = Intrinsic::x86_sse2_packssdw_128;
+ else if (!IsUnsigned && IsDW && VecTy->getPrimitiveSizeInBits() == 256)
+ IID = Intrinsic::x86_avx2_packssdw;
+ else if (!IsUnsigned && IsDW && VecTy->getPrimitiveSizeInBits() == 512)
+ IID = Intrinsic::x86_avx512_packssdw_512;
+ else if (IsUnsigned && !IsDW && VecTy->getPrimitiveSizeInBits() == 128)
+ IID = Intrinsic::x86_sse2_packuswb_128;
+ else if (IsUnsigned && !IsDW && VecTy->getPrimitiveSizeInBits() == 256)
+ IID = Intrinsic::x86_avx2_packuswb;
+ else if (IsUnsigned && !IsDW && VecTy->getPrimitiveSizeInBits() == 512)
+ IID = Intrinsic::x86_avx512_packuswb_512;
+ else if (IsUnsigned && IsDW && VecTy->getPrimitiveSizeInBits() == 128)
+ IID = Intrinsic::x86_sse41_packusdw;
+ else if (IsUnsigned && IsDW && VecTy->getPrimitiveSizeInBits() == 256)
+ IID = Intrinsic::x86_avx2_packusdw;
+ else if (IsUnsigned && IsDW && VecTy->getPrimitiveSizeInBits() == 512)
+ IID = Intrinsic::x86_avx512_packusdw_512;
+ else
+ llvm_unreachable("Unexpected intrinsic");
+
+ Rep = Builder.CreateCall(Intrinsic::getDeclaration(F->getParent(), IID),
+ { CI->getArgOperand(0), CI->getArgOperand(1) });
+ Rep = EmitX86Select(Builder, CI->getArgOperand(3), Rep,
+ CI->getArgOperand(2));
} else if (IsX86 && Name.startswith("avx512.mask.psll")) {
bool IsImmediate = Name[16] == 'i' ||
(Name.size() > 18 && Name[18] == 'i');
@@ -1705,6 +1854,8 @@ void llvm::UpgradeIntrinsicCall(CallInst *CI, Function *NewFn) {
Rep = UpgradeX86MaskedShift(Builder, *CI, IID);
} else if (IsX86 && Name.startswith("avx512.mask.move.s")) {
Rep = upgradeMaskedMove(Builder, *CI);
+ } else if (IsX86 && Name.startswith("avx512.cvtmask2")) {
+ Rep = UpgradeMaskToInt(Builder, *CI);
} else if (IsX86 && Name.startswith("avx512.mask.vpermilvar.")) {
Intrinsic::ID IID;
if (Name.endswith("ps.128"))
@@ -1727,6 +1878,64 @@ void llvm::UpgradeIntrinsicCall(CallInst *CI, Function *NewFn) {
{ CI->getArgOperand(0), CI->getArgOperand(1) });
Rep = EmitX86Select(Builder, CI->getArgOperand(3), Rep,
CI->getArgOperand(2));
+ } else if (IsX86 && Name.endswith(".movntdqa")) {
+ Module *M = F->getParent();
+ MDNode *Node = MDNode::get(
+ C, ConstantAsMetadata::get(ConstantInt::get(Type::getInt32Ty(C), 1)));
+
+ Value *Ptr = CI->getArgOperand(0);
+ VectorType *VTy = cast<VectorType>(CI->getType());
+
+    // Cast the pointer to point at the loaded vector type.
+ Value *BC =
+ Builder.CreateBitCast(Ptr, PointerType::getUnqual(VTy), "cast");
+ LoadInst *LI = Builder.CreateAlignedLoad(BC, VTy->getBitWidth() / 8);
+ LI->setMetadata(M->getMDKindID("nontemporal"), Node);
+ Rep = LI;
+ } else if (IsNVVM && (Name == "abs.i" || Name == "abs.ll")) {
+ Value *Arg = CI->getArgOperand(0);
+ Value *Neg = Builder.CreateNeg(Arg, "neg");
+ Value *Cmp = Builder.CreateICmpSGE(
+ Arg, llvm::Constant::getNullValue(Arg->getType()), "abs.cond");
+ Rep = Builder.CreateSelect(Cmp, Arg, Neg, "abs");
+ } else if (IsNVVM && (Name == "max.i" || Name == "max.ll" ||
+ Name == "max.ui" || Name == "max.ull")) {
+ Value *Arg0 = CI->getArgOperand(0);
+ Value *Arg1 = CI->getArgOperand(1);
+ Value *Cmp = Name.endswith(".ui") || Name.endswith(".ull")
+ ? Builder.CreateICmpUGE(Arg0, Arg1, "max.cond")
+ : Builder.CreateICmpSGE(Arg0, Arg1, "max.cond");
+ Rep = Builder.CreateSelect(Cmp, Arg0, Arg1, "max");
+ } else if (IsNVVM && (Name == "min.i" || Name == "min.ll" ||
+ Name == "min.ui" || Name == "min.ull")) {
+ Value *Arg0 = CI->getArgOperand(0);
+ Value *Arg1 = CI->getArgOperand(1);
+ Value *Cmp = Name.endswith(".ui") || Name.endswith(".ull")
+ ? Builder.CreateICmpULE(Arg0, Arg1, "min.cond")
+ : Builder.CreateICmpSLE(Arg0, Arg1, "min.cond");
+ Rep = Builder.CreateSelect(Cmp, Arg0, Arg1, "min");
+ } else if (IsNVVM && Name == "clz.ll") {
+    // llvm.nvvm.clz.ll returns an i32, but llvm.ctlz.i64 returns an i64.
+ Value *Arg = CI->getArgOperand(0);
+ Value *Ctlz = Builder.CreateCall(
+ Intrinsic::getDeclaration(F->getParent(), Intrinsic::ctlz,
+ {Arg->getType()}),
+ {Arg, Builder.getFalse()}, "ctlz");
+ Rep = Builder.CreateTrunc(Ctlz, Builder.getInt32Ty(), "ctlz.trunc");
+ } else if (IsNVVM && Name == "popc.ll") {
+    // llvm.nvvm.popc.ll returns an i32, but llvm.ctpop.i64 returns an i64.
+ Value *Arg = CI->getArgOperand(0);
+ Value *Popc = Builder.CreateCall(
+ Intrinsic::getDeclaration(F->getParent(), Intrinsic::ctpop,
+ {Arg->getType()}),
+ Arg, "ctpop");
+ Rep = Builder.CreateTrunc(Popc, Builder.getInt32Ty(), "ctpop.trunc");
+ } else if (IsNVVM && Name == "h2f") {
+ Rep = Builder.CreateCall(Intrinsic::getDeclaration(
+ F->getParent(), Intrinsic::convert_from_fp16,
+ {Builder.getFloatTy()}),
+ CI->getArgOperand(0), "h2f");
} else {
llvm_unreachable("Unknown function for CallInst upgrade.");
}
@@ -1737,13 +1946,16 @@ void llvm::UpgradeIntrinsicCall(CallInst *CI, Function *NewFn) {
return;
}
- std::string Name = CI->getName();
- if (!Name.empty())
- CI->setName(Name + ".old");
-
+ CallInst *NewCall = nullptr;
switch (NewFn->getIntrinsicID()) {
- default:
- llvm_unreachable("Unknown function for CallInst upgrade.");
+ default: {
+  // Handle a generic mangling change, but nothing else.
+ assert(
+ (CI->getCalledFunction()->getName() != NewFn->getName()) &&
+ "Unknown function for CallInst upgrade and isn't just a name change");
+ CI->setCalledFunction(NewFn);
+ return;
+ }
case Intrinsic::arm_neon_vld1:
case Intrinsic::arm_neon_vld2:
@@ -1761,43 +1973,43 @@ void llvm::UpgradeIntrinsicCall(CallInst *CI, Function *NewFn) {
case Intrinsic::arm_neon_vst4lane: {
SmallVector<Value *, 4> Args(CI->arg_operands().begin(),
CI->arg_operands().end());
- CI->replaceAllUsesWith(Builder.CreateCall(NewFn, Args));
- CI->eraseFromParent();
- return;
+ NewCall = Builder.CreateCall(NewFn, Args);
+ break;
}
case Intrinsic::bitreverse:
- CI->replaceAllUsesWith(Builder.CreateCall(NewFn, {CI->getArgOperand(0)}));
- CI->eraseFromParent();
- return;
+ NewCall = Builder.CreateCall(NewFn, {CI->getArgOperand(0)});
+ break;
case Intrinsic::ctlz:
case Intrinsic::cttz:
assert(CI->getNumArgOperands() == 1 &&
"Mismatch between function args and call args");
- CI->replaceAllUsesWith(Builder.CreateCall(
- NewFn, {CI->getArgOperand(0), Builder.getFalse()}, Name));
- CI->eraseFromParent();
- return;
-
- case Intrinsic::objectsize:
- CI->replaceAllUsesWith(Builder.CreateCall(
- NewFn, {CI->getArgOperand(0), CI->getArgOperand(1)}, Name));
- CI->eraseFromParent();
- return;
+ NewCall =
+ Builder.CreateCall(NewFn, {CI->getArgOperand(0), Builder.getFalse()});
+ break;
- case Intrinsic::ctpop: {
- CI->replaceAllUsesWith(Builder.CreateCall(NewFn, {CI->getArgOperand(0)}));
- CI->eraseFromParent();
- return;
+ case Intrinsic::objectsize: {
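+    // Old two-operand calls gain a third "null is unknown size" flag;
+    // passing false preserves the previous semantics.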
+ Value *NullIsUnknownSize = CI->getNumArgOperands() == 2
+ ? Builder.getFalse()
+ : CI->getArgOperand(2);
+ NewCall = Builder.CreateCall(
+ NewFn, {CI->getArgOperand(0), CI->getArgOperand(1), NullIsUnknownSize});
+ break;
}
+ case Intrinsic::ctpop:
+ NewCall = Builder.CreateCall(NewFn, {CI->getArgOperand(0)});
+ break;
+
+ case Intrinsic::convert_from_fp16:
+ NewCall = Builder.CreateCall(NewFn, {CI->getArgOperand(0)});
+ break;
+
case Intrinsic::x86_xop_vfrcz_ss:
case Intrinsic::x86_xop_vfrcz_sd:
- CI->replaceAllUsesWith(
- Builder.CreateCall(NewFn, {CI->getArgOperand(1)}, Name));
- CI->eraseFromParent();
- return;
+ NewCall = Builder.CreateCall(NewFn, {CI->getArgOperand(1)});
+ break;
case Intrinsic::x86_xop_vpermil2pd:
case Intrinsic::x86_xop_vpermil2ps:
@@ -1808,9 +2020,8 @@ void llvm::UpgradeIntrinsicCall(CallInst *CI, Function *NewFn) {
VectorType *FltIdxTy = cast<VectorType>(Args[2]->getType());
VectorType *IntIdxTy = VectorType::getInteger(FltIdxTy);
Args[2] = Builder.CreateBitCast(Args[2], IntIdxTy);
- CI->replaceAllUsesWith(Builder.CreateCall(NewFn, Args, Name));
- CI->eraseFromParent();
- return;
+ NewCall = Builder.CreateCall(NewFn, Args);
+ break;
}
case Intrinsic::x86_sse41_ptestc:
@@ -1832,10 +2043,8 @@ void llvm::UpgradeIntrinsicCall(CallInst *CI, Function *NewFn) {
Value *BC0 = Builder.CreateBitCast(Arg0, NewVecTy, "cast");
Value *BC1 = Builder.CreateBitCast(Arg1, NewVecTy, "cast");
- CallInst *NewCall = Builder.CreateCall(NewFn, {BC0, BC1}, Name);
- CI->replaceAllUsesWith(NewCall);
- CI->eraseFromParent();
- return;
+ NewCall = Builder.CreateCall(NewFn, {BC0, BC1});
+ break;
}
case Intrinsic::x86_sse41_insertps:
@@ -1851,17 +2060,13 @@ void llvm::UpgradeIntrinsicCall(CallInst *CI, Function *NewFn) {
// Replace the last argument with a trunc.
Args.back() = Builder.CreateTrunc(Args.back(), Type::getInt8Ty(C), "trunc");
-
- CallInst *NewCall = Builder.CreateCall(NewFn, Args);
- CI->replaceAllUsesWith(NewCall);
- CI->eraseFromParent();
- return;
+ NewCall = Builder.CreateCall(NewFn, Args);
+ break;
}
case Intrinsic::thread_pointer: {
- CI->replaceAllUsesWith(Builder.CreateCall(NewFn, {}));
- CI->eraseFromParent();
- return;
+ NewCall = Builder.CreateCall(NewFn, {});
+ break;
}
case Intrinsic::invariant_start:
@@ -1870,11 +2075,19 @@ void llvm::UpgradeIntrinsicCall(CallInst *CI, Function *NewFn) {
case Intrinsic::masked_store: {
SmallVector<Value *, 4> Args(CI->arg_operands().begin(),
CI->arg_operands().end());
- CI->replaceAllUsesWith(Builder.CreateCall(NewFn, Args));
- CI->eraseFromParent();
- return;
+ NewCall = Builder.CreateCall(NewFn, Args);
+ break;
+ }
}
+ assert(NewCall && "Should have either set this variable or returned through "
+ "the default case");
+ std::string Name = CI->getName();
+ if (!Name.empty()) {
+ CI->setName(Name + ".old");
+ NewCall->setName(Name);
}
+ CI->replaceAllUsesWith(NewCall);
+ CI->eraseFromParent();
}
void llvm::UpgradeCallsToIntrinsic(Function *F) {
diff --git a/lib/IR/BasicBlock.cpp b/lib/IR/BasicBlock.cpp
index 19e784923658..90ca21ab91f8 100644
--- a/lib/IR/BasicBlock.cpp
+++ b/lib/IR/BasicBlock.cpp
@@ -117,28 +117,19 @@ const Module *BasicBlock::getModule() const {
return getParent()->getParent();
}
-Module *BasicBlock::getModule() {
- return getParent()->getParent();
-}
-
-TerminatorInst *BasicBlock::getTerminator() {
- if (InstList.empty()) return nullptr;
- return dyn_cast<TerminatorInst>(&InstList.back());
-}
-
const TerminatorInst *BasicBlock::getTerminator() const {
if (InstList.empty()) return nullptr;
return dyn_cast<TerminatorInst>(&InstList.back());
}
-CallInst *BasicBlock::getTerminatingMustTailCall() {
+const CallInst *BasicBlock::getTerminatingMustTailCall() const {
if (InstList.empty())
return nullptr;
- ReturnInst *RI = dyn_cast<ReturnInst>(&InstList.back());
+ const ReturnInst *RI = dyn_cast<ReturnInst>(&InstList.back());
if (!RI || RI == &InstList.front())
return nullptr;
- Instruction *Prev = RI->getPrevNode();
+ const Instruction *Prev = RI->getPrevNode();
if (!Prev)
return nullptr;
@@ -162,7 +153,7 @@ CallInst *BasicBlock::getTerminatingMustTailCall() {
return nullptr;
}
-CallInst *BasicBlock::getTerminatingDeoptimizeCall() {
+const CallInst *BasicBlock::getTerminatingDeoptimizeCall() const {
if (InstList.empty())
return nullptr;
auto *RI = dyn_cast<ReturnInst>(&InstList.back());
@@ -177,22 +168,22 @@ CallInst *BasicBlock::getTerminatingDeoptimizeCall() {
return nullptr;
}
-Instruction* BasicBlock::getFirstNonPHI() {
- for (Instruction &I : *this)
+const Instruction* BasicBlock::getFirstNonPHI() const {
+ for (const Instruction &I : *this)
if (!isa<PHINode>(I))
return &I;
return nullptr;
}
-Instruction* BasicBlock::getFirstNonPHIOrDbg() {
- for (Instruction &I : *this)
+const Instruction* BasicBlock::getFirstNonPHIOrDbg() const {
+ for (const Instruction &I : *this)
if (!isa<PHINode>(I) && !isa<DbgInfoIntrinsic>(I))
return &I;
return nullptr;
}
-Instruction* BasicBlock::getFirstNonPHIOrDbgOrLifetime() {
- for (Instruction &I : *this) {
+const Instruction* BasicBlock::getFirstNonPHIOrDbgOrLifetime() const {
+ for (const Instruction &I : *this) {
if (isa<PHINode>(I) || isa<DbgInfoIntrinsic>(I))
continue;
@@ -206,12 +197,12 @@ Instruction* BasicBlock::getFirstNonPHIOrDbgOrLifetime() {
return nullptr;
}
-BasicBlock::iterator BasicBlock::getFirstInsertionPt() {
- Instruction *FirstNonPHI = getFirstNonPHI();
+BasicBlock::const_iterator BasicBlock::getFirstInsertionPt() const {
+ const Instruction *FirstNonPHI = getFirstNonPHI();
if (!FirstNonPHI)
return end();
- iterator InsertPt = FirstNonPHI->getIterator();
+ const_iterator InsertPt = FirstNonPHI->getIterator();
if (InsertPt->isEHPad()) ++InsertPt;
return InsertPt;
}
@@ -223,10 +214,10 @@ void BasicBlock::dropAllReferences() {
/// If this basic block has a single predecessor block,
/// return the block, otherwise return a null pointer.
-BasicBlock *BasicBlock::getSinglePredecessor() {
- pred_iterator PI = pred_begin(this), E = pred_end(this);
+const BasicBlock *BasicBlock::getSinglePredecessor() const {
+ const_pred_iterator PI = pred_begin(this), E = pred_end(this);
if (PI == E) return nullptr; // No preds.
- BasicBlock *ThePred = *PI;
+ const BasicBlock *ThePred = *PI;
++PI;
return (PI == E) ? ThePred : nullptr /*multiple preds*/;
}
@@ -236,10 +227,10 @@ BasicBlock *BasicBlock::getSinglePredecessor() {
/// Note that unique predecessor doesn't mean single edge, there can be
/// multiple edges from the unique predecessor to this block (for example
/// a switch statement with multiple cases having the same destination).
-BasicBlock *BasicBlock::getUniquePredecessor() {
- pred_iterator PI = pred_begin(this), E = pred_end(this);
+const BasicBlock *BasicBlock::getUniquePredecessor() const {
+ const_pred_iterator PI = pred_begin(this), E = pred_end(this);
if (PI == E) return nullptr; // No preds.
- BasicBlock *PredBB = *PI;
+ const BasicBlock *PredBB = *PI;
++PI;
for (;PI != E; ++PI) {
if (*PI != PredBB)
@@ -250,18 +241,18 @@ BasicBlock *BasicBlock::getUniquePredecessor() {
return PredBB;
}
-BasicBlock *BasicBlock::getSingleSuccessor() {
- succ_iterator SI = succ_begin(this), E = succ_end(this);
+const BasicBlock *BasicBlock::getSingleSuccessor() const {
+ succ_const_iterator SI = succ_begin(this), E = succ_end(this);
if (SI == E) return nullptr; // no successors
- BasicBlock *TheSucc = *SI;
+ const BasicBlock *TheSucc = *SI;
++SI;
return (SI == E) ? TheSucc : nullptr /* multiple successors */;
}
-BasicBlock *BasicBlock::getUniqueSuccessor() {
- succ_iterator SI = succ_begin(this), E = succ_end(this);
+const BasicBlock *BasicBlock::getUniqueSuccessor() const {
+ succ_const_iterator SI = succ_begin(this), E = succ_end(this);
if (SI == E) return nullptr; // No successors
- BasicBlock *SuccBB = *SI;
+ const BasicBlock *SuccBB = *SI;
++SI;
for (;SI != E; ++SI) {
if (*SI != SuccBB)
@@ -438,9 +429,6 @@ bool BasicBlock::isLandingPad() const {
}
/// Return the landingpad instruction associated with the landing pad.
-LandingPadInst *BasicBlock::getLandingPadInst() {
- return dyn_cast<LandingPadInst>(getFirstNonPHI());
-}
const LandingPadInst *BasicBlock::getLandingPadInst() const {
return dyn_cast<LandingPadInst>(getFirstNonPHI());
}
diff --git a/lib/IR/Comdat.cpp b/lib/IR/Comdat.cpp
index fc1b48d1c190..e27ecad0a884 100644
--- a/lib/IR/Comdat.cpp
+++ b/lib/IR/Comdat.cpp
@@ -1,4 +1,4 @@
-//===-- Comdat.cpp - Implement Metadata classes --------------------------===//
+//===- Comdat.cpp - Implement Comdats -------------------------------------===//
//
// The LLVM Compiler Infrastructure
//
@@ -11,12 +11,14 @@
//
//===----------------------------------------------------------------------===//
-#include "llvm/IR/Comdat.h"
#include "llvm/ADT/StringMap.h"
+#include "llvm/ADT/StringRef.h"
+#include "llvm/IR/Comdat.h"
+
using namespace llvm;
Comdat::Comdat(Comdat &&C) : Name(C.Name), SK(C.SK) {}
-Comdat::Comdat() : Name(nullptr), SK(Comdat::Any) {}
+Comdat::Comdat() = default;
StringRef Comdat::getName() const { return Name->first(); }
diff --git a/lib/IR/ConstantFold.cpp b/lib/IR/ConstantFold.cpp
index 098ff90a0a95..bba230677ebf 100644
--- a/lib/IR/ConstantFold.cpp
+++ b/lib/IR/ConstantFold.cpp
@@ -18,6 +18,7 @@
//===----------------------------------------------------------------------===//
#include "ConstantFold.h"
+#include "llvm/ADT/APSInt.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/IR/Constants.h"
#include "llvm/IR/DerivedTypes.h"
@@ -606,17 +607,15 @@ Constant *llvm::ConstantFoldCastInstruction(unsigned opc, Constant *V,
if (ConstantFP *FPC = dyn_cast<ConstantFP>(V)) {
const APFloat &V = FPC->getValueAPF();
bool ignored;
- uint64_t x[2];
uint32_t DestBitWidth = cast<IntegerType>(DestTy)->getBitWidth();
+ APSInt IntVal(DestBitWidth, opc == Instruction::FPToUI);
if (APFloat::opInvalidOp ==
- V.convertToInteger(x, DestBitWidth, opc==Instruction::FPToSI,
- APFloat::rmTowardZero, &ignored)) {
+ V.convertToInteger(IntVal, APFloat::rmTowardZero, &ignored)) {
// Undefined behavior invoked - the destination type can't represent
// the input constant.
return UndefValue::get(DestTy);
}
- APInt Val(DestBitWidth, x);
- return ConstantInt::get(FPC->getContext(), Val);
+ return ConstantInt::get(FPC->getContext(), IntVal);
}
return nullptr; // Can't fold.
case Instruction::IntToPtr: //always treated as unsigned
@@ -1209,10 +1208,15 @@ Constant *llvm::ConstantFoldBinaryInstruction(unsigned Opcode,
SmallVector<Constant*, 16> Result;
Type *Ty = IntegerType::get(VTy->getContext(), 32);
for (unsigned i = 0, e = VTy->getNumElements(); i != e; ++i) {
- Constant *LHS =
- ConstantExpr::getExtractElement(C1, ConstantInt::get(Ty, i));
- Constant *RHS =
- ConstantExpr::getExtractElement(C2, ConstantInt::get(Ty, i));
+ Constant *ExtractIdx = ConstantInt::get(Ty, i);
+ Constant *LHS = ConstantExpr::getExtractElement(C1, ExtractIdx);
+ Constant *RHS = ConstantExpr::getExtractElement(C2, ExtractIdx);
+
+ // If any element of a divisor vector is zero, the whole op is undef.
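+      // (e.g. folding udiv <2 x i32> <i32 8, i32 8>, <i32 2, i32 0>).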
+ if ((Opcode == Instruction::SDiv || Opcode == Instruction::UDiv ||
+ Opcode == Instruction::SRem || Opcode == Instruction::URem) &&
+ RHS->isNullValue())
+ return UndefValue::get(VTy);
Result.push_back(ConstantExpr::get(Opcode, LHS, RHS));
}
@@ -2231,7 +2235,8 @@ Constant *llvm::ConstantFoldGetElementPtr(Type *PointeeTy, Constant *C,
ConstantInt *Factor = ConstantInt::get(CI->getType(), NumElements);
NewIdxs[i] = ConstantExpr::getSRem(CI, Factor);
- Constant *PrevIdx = cast<Constant>(Idxs[i - 1]);
+        Constant *PrevIdx = NewIdxs[i - 1] ? NewIdxs[i - 1]
+                                           : cast<Constant>(Idxs[i - 1]);
Constant *Div = ConstantExpr::getSDiv(CI, Factor);
unsigned CommonExtendedWidth =
diff --git a/lib/IR/ConstantRange.cpp b/lib/IR/ConstantRange.cpp
index a85ad465317c..8dfd6c8036c4 100644
--- a/lib/IR/ConstantRange.cpp
+++ b/lib/IR/ConstantRange.cpp
@@ -40,10 +40,10 @@ ConstantRange::ConstantRange(uint32_t BitWidth, bool Full) {
/// Initialize a range to hold the single specified value.
///
-ConstantRange::ConstantRange(APIntMoveTy V)
+ConstantRange::ConstantRange(APInt V)
: Lower(std::move(V)), Upper(Lower + 1) {}
-ConstantRange::ConstantRange(APIntMoveTy L, APIntMoveTy U)
+ConstantRange::ConstantRange(APInt L, APInt U)
: Lower(std::move(L)), Upper(std::move(U)) {
assert(Lower.getBitWidth() == Upper.getBitWidth() &&
"ConstantRange with unequal bit widths");
@@ -272,6 +272,22 @@ APInt ConstantRange::getSetSize() const {
return (Upper - Lower).zext(getBitWidth()+1);
}
+/// isSizeStrictlySmallerThanOf - Compare the set size of this range with
+/// that of the range Other.
+/// This function is faster than comparing results of getSetSize for the two
+/// ranges, because we don't need to extend bitwidth of APInts we're operating
+/// with.
+///
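+/// For example, at equal bit width [0,4) is strictly smaller than [0,6);
+/// the comparison uses a same-width ult rather than a zext to width+1.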
+bool
+ConstantRange::isSizeStrictlySmallerThanOf(const ConstantRange &Other) const {
+ assert(getBitWidth() == Other.getBitWidth());
+ if (isFullSet())
+ return false;
+ if (Other.isFullSet())
+ return true;
+ return (Upper - Lower).ult(Other.Upper - Other.Lower);
+}
+
/// getUnsignedMax - Return the largest unsigned value contained in the
/// ConstantRange.
///
@@ -414,7 +430,7 @@ ConstantRange ConstantRange::intersectWith(const ConstantRange &CR) const {
if (CR.Upper.ule(Lower))
return ConstantRange(CR.Lower, Upper);
- if (getSetSize().ult(CR.getSetSize()))
+ if (isSizeStrictlySmallerThanOf(CR))
return *this;
return CR;
}
@@ -429,7 +445,7 @@ ConstantRange ConstantRange::intersectWith(const ConstantRange &CR) const {
if (CR.Upper.ult(Upper)) {
if (CR.Lower.ult(Upper)) {
- if (getSetSize().ult(CR.getSetSize()))
+ if (isSizeStrictlySmallerThanOf(CR))
return *this;
return CR;
}
@@ -445,7 +461,7 @@ ConstantRange ConstantRange::intersectWith(const ConstantRange &CR) const {
return ConstantRange(CR.Lower, Upper);
}
- if (getSetSize().ult(CR.getSetSize()))
+ if (isSizeStrictlySmallerThanOf(CR))
return *this;
return CR;
}
@@ -739,17 +755,16 @@ ConstantRange::add(const ConstantRange &Other) const {
if (isFullSet() || Other.isFullSet())
return ConstantRange(getBitWidth(), /*isFullSet=*/true);
- APInt Spread_X = getSetSize(), Spread_Y = Other.getSetSize();
APInt NewLower = getLower() + Other.getLower();
APInt NewUpper = getUpper() + Other.getUpper() - 1;
if (NewLower == NewUpper)
return ConstantRange(getBitWidth(), /*isFullSet=*/true);
ConstantRange X = ConstantRange(NewLower, NewUpper);
- if (X.getSetSize().ult(Spread_X) || X.getSetSize().ult(Spread_Y))
+ if (X.isSizeStrictlySmallerThanOf(*this) ||
+ X.isSizeStrictlySmallerThanOf(Other))
// We've wrapped, therefore, full set.
return ConstantRange(getBitWidth(), /*isFullSet=*/true);
-
return X;
}
@@ -773,17 +788,16 @@ ConstantRange::sub(const ConstantRange &Other) const {
if (isFullSet() || Other.isFullSet())
return ConstantRange(getBitWidth(), /*isFullSet=*/true);
- APInt Spread_X = getSetSize(), Spread_Y = Other.getSetSize();
APInt NewLower = getLower() - Other.getUpper() + 1;
APInt NewUpper = getUpper() - Other.getLower();
if (NewLower == NewUpper)
return ConstantRange(getBitWidth(), /*isFullSet=*/true);
ConstantRange X = ConstantRange(NewLower, NewUpper);
- if (X.getSetSize().ult(Spread_X) || X.getSetSize().ult(Spread_Y))
+ if (X.isSizeStrictlySmallerThanOf(*this) ||
+ X.isSizeStrictlySmallerThanOf(Other))
// We've wrapped, therefore, full set.
return ConstantRange(getBitWidth(), /*isFullSet=*/true);
-
return X;
}
@@ -837,7 +851,7 @@ ConstantRange::multiply(const ConstantRange &Other) const {
ConstantRange Result_sext(std::min(L, Compare), std::max(L, Compare) + 1);
ConstantRange SR = Result_sext.truncate(getBitWidth());
- return UR.getSetSize().ult(SR.getSetSize()) ? UR : SR;
+ return UR.isSizeStrictlySmallerThanOf(SR) ? UR : SR;
}
ConstantRange
@@ -996,11 +1010,13 @@ void ConstantRange::print(raw_ostream &OS) const {
OS << "[" << Lower << "," << Upper << ")";
}
+#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
/// dump - Allow printing from a debugger easily...
///
LLVM_DUMP_METHOD void ConstantRange::dump() const {
print(dbgs());
}
+#endif
ConstantRange llvm::getConstantRangeFromMetadata(const MDNode &Ranges) {
const unsigned NumRanges = Ranges.getNumOperands() / 2;
diff --git a/lib/IR/Constants.cpp b/lib/IR/Constants.cpp
index 533b9245277f..c5f93c9f4db0 100644
--- a/lib/IR/Constants.cpp
+++ b/lib/IR/Constants.cpp
@@ -1027,7 +1027,7 @@ Constant *ConstantVector::getImpl(ArrayRef<Constant*> V) {
return getSequenceIfElementsMatch<ConstantDataVector>(C, V);
// Otherwise, the element type isn't compatible with ConstantDataVector, or
- // the operand list constants a ConstantExpr or something else strange.
+ // the operand list contains a ConstantExpr or something else strange.
return nullptr;
}
diff --git a/lib/IR/Core.cpp b/lib/IR/Core.cpp
index 00bb476c0b3c..b5ed30b85c8a 100644
--- a/lib/IR/Core.cpp
+++ b/lib/IR/Core.cpp
@@ -16,7 +16,6 @@
#include "llvm/ADT/StringSwitch.h"
#include "llvm/Bitcode/BitcodeReader.h"
#include "llvm/IR/Attributes.h"
-#include "AttributeSetNode.h"
#include "llvm/IR/CallSite.h"
#include "llvm/IR/Constants.h"
#include "llvm/IR/DerivedTypes.h"
@@ -259,7 +258,8 @@ void LLVMSetTarget(LLVMModuleRef M, const char *Triple) {
}
void LLVMDumpModule(LLVMModuleRef M) {
- unwrap(M)->dump();
+ unwrap(M)->print(errs(), nullptr,
+ /*ShouldPreserveUseListOrder=*/false, /*IsForDebug=*/true);
}
LLVMBool LLVMPrintModuleToFile(LLVMModuleRef M, const char *Filename,
@@ -358,9 +358,11 @@ LLVMContextRef LLVMGetTypeContext(LLVMTypeRef Ty) {
return wrap(&unwrap(Ty)->getContext());
}
-void LLVMDumpType(LLVMTypeRef Ty) {
+#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
+LLVM_DUMP_METHOD void LLVMDumpType(LLVMTypeRef Ty) {
return unwrap(Ty)->dump();
}
+#endif
char *LLVMPrintTypeToString(LLVMTypeRef Ty) {
std::string buf;
@@ -640,8 +642,8 @@ void LLVMSetValueName(LLVMValueRef Val, const char *Name) {
unwrap(Val)->setName(Name);
}
-void LLVMDumpValue(LLVMValueRef Val) {
- unwrap(Val)->dump();
+LLVM_DUMP_METHOD void LLVMDumpValue(LLVMValueRef Val) {
+ unwrap(Val)->print(errs(), /*IsForDebug=*/true);
}
char* LLVMPrintValueToString(LLVMValueRef Val) {
@@ -1844,18 +1846,14 @@ void LLVMAddAttributeAtIndex(LLVMValueRef F, LLVMAttributeIndex Idx,
}
unsigned LLVMGetAttributeCountAtIndex(LLVMValueRef F, LLVMAttributeIndex Idx) {
- auto *ASN = AttributeSetNode::get(unwrap<Function>(F)->getAttributes(), Idx);
- if (!ASN)
- return 0;
- return ASN->getNumAttributes();
+ auto AS = unwrap<Function>(F)->getAttributes().getAttributes(Idx);
+ return AS.getNumAttributes();
}
void LLVMGetAttributesAtIndex(LLVMValueRef F, LLVMAttributeIndex Idx,
LLVMAttributeRef *Attrs) {
- auto *ASN = AttributeSetNode::get(unwrap<Function>(F)->getAttributes(), Idx);
- if (!ASN)
- return;
- for (auto A: make_range(ASN->begin(), ASN->end()))
+ auto AS = unwrap<Function>(F)->getAttributes().getAttributes(Idx);
+ for (auto A : AS)
*Attrs++ = wrap(A);
}
@@ -1885,12 +1883,12 @@ void LLVMRemoveStringAttributeAtIndex(LLVMValueRef F, LLVMAttributeIndex Idx,
void LLVMAddTargetDependentFunctionAttr(LLVMValueRef Fn, const char *A,
const char *V) {
Function *Func = unwrap<Function>(Fn);
- AttributeSet::AttrIndex Idx =
- AttributeSet::AttrIndex(AttributeSet::FunctionIndex);
+ AttributeList::AttrIndex Idx =
+ AttributeList::AttrIndex(AttributeList::FunctionIndex);
AttrBuilder B;
B.addAttribute(A, V);
- AttributeSet Set = AttributeSet::get(Func->getContext(), Idx, B);
+ AttributeList Set = AttributeList::get(Func->getContext(), Idx, B);
Func->addAttributes(Idx, Set);
}
@@ -1910,10 +1908,8 @@ void LLVMGetParams(LLVMValueRef FnRef, LLVMValueRef *ParamRefs) {
}
LLVMValueRef LLVMGetParam(LLVMValueRef FnRef, unsigned index) {
- Function::arg_iterator AI = unwrap<Function>(FnRef)->arg_begin();
- while (index --> 0)
- AI++;
- return wrap(&*AI);
+ Function *Fn = unwrap<Function>(FnRef);
+ return wrap(&Fn->arg_begin()[index]);
}
LLVMValueRef LLVMGetParamParent(LLVMValueRef V) {
@@ -1938,25 +1934,24 @@ LLVMValueRef LLVMGetLastParam(LLVMValueRef Fn) {
LLVMValueRef LLVMGetNextParam(LLVMValueRef Arg) {
Argument *A = unwrap<Argument>(Arg);
- Function::arg_iterator I(A);
- if (++I == A->getParent()->arg_end())
+ Function *Fn = A->getParent();
+ if (A->getArgNo() + 1 >= Fn->arg_size())
return nullptr;
- return wrap(&*I);
+ return wrap(&Fn->arg_begin()[A->getArgNo() + 1]);
}
LLVMValueRef LLVMGetPreviousParam(LLVMValueRef Arg) {
Argument *A = unwrap<Argument>(Arg);
- Function::arg_iterator I(A);
- if (I == A->getParent()->arg_begin())
+ if (A->getArgNo() == 0)
return nullptr;
- return wrap(&*--I);
+ return wrap(&A->getParent()->arg_begin()[A->getArgNo() - 1]);
}
void LLVMSetParamAlignment(LLVMValueRef Arg, unsigned align) {
Argument *A = unwrap<Argument>(Arg);
AttrBuilder B;
B.addAlignmentAttr(align);
- A->addAttr(AttributeSet::get(A->getContext(),A->getArgNo() + 1, B));
+ A->addAttr(AttributeList::get(A->getContext(), A->getArgNo() + 1, B));
}
/*--.. Operations on basic blocks ..........................................--*/
@@ -2165,10 +2160,9 @@ void LLVMSetInstrParamAlignment(LLVMValueRef Instr, unsigned index,
CallSite Call = CallSite(unwrap<Instruction>(Instr));
AttrBuilder B;
B.addAlignmentAttr(align);
- Call.setAttributes(Call.getAttributes()
- .addAttributes(Call->getContext(), index,
- AttributeSet::get(Call->getContext(),
- index, B)));
+ Call.setAttributes(Call.getAttributes().addAttributes(
+ Call->getContext(), index,
+ AttributeList::get(Call->getContext(), index, B)));
}
void LLVMAddCallSiteAttribute(LLVMValueRef C, LLVMAttributeIndex Idx,
@@ -2179,19 +2173,15 @@ void LLVMAddCallSiteAttribute(LLVMValueRef C, LLVMAttributeIndex Idx,
unsigned LLVMGetCallSiteAttributeCount(LLVMValueRef C,
LLVMAttributeIndex Idx) {
auto CS = CallSite(unwrap<Instruction>(C));
- auto *ASN = AttributeSetNode::get(CS.getAttributes(), Idx);
- if (!ASN)
- return 0;
- return ASN->getNumAttributes();
+ auto AS = CS.getAttributes().getAttributes(Idx);
+ return AS.getNumAttributes();
}
void LLVMGetCallSiteAttributes(LLVMValueRef C, LLVMAttributeIndex Idx,
LLVMAttributeRef *Attrs) {
auto CS = CallSite(unwrap<Instruction>(C));
- auto *ASN = AttributeSetNode::get(CS.getAttributes(), Idx);
- if (!ASN)
- return;
- for (auto A: make_range(ASN->begin(), ASN->end()))
+ auto AS = CS.getAttributes().getAttributes(Idx);
+ for (auto A : AS)
*Attrs++ = wrap(A);
}
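The Core.cpp changes above drop the nullable AttributeSetNode lookup: AttributeList::getAttributes(Idx) returns a possibly empty AttributeSet that can be counted and iterated directly. A minimal sketch of the resulting C API usage, assuming an existing LLVMValueRef Fn (the variable names are hypothetical):

    // Count, then fetch, the function-level attributes of Fn.
    unsigned Count = LLVMGetAttributeCountAtIndex(Fn, LLVMAttributeFunctionIndex);
    std::vector<LLVMAttributeRef> Attrs(Count);
    LLVMGetAttributesAtIndex(Fn, LLVMAttributeFunctionIndex, Attrs.data());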
diff --git a/lib/IR/DIBuilder.cpp b/lib/IR/DIBuilder.cpp
index d06161067f5f..9407c805b92a 100644
--- a/lib/IR/DIBuilder.cpp
+++ b/lib/IR/DIBuilder.cpp
@@ -126,7 +126,7 @@ DICompileUnit *DIBuilder::createCompileUnit(
unsigned Lang, DIFile *File, StringRef Producer, bool isOptimized,
StringRef Flags, unsigned RunTimeVer, StringRef SplitName,
DICompileUnit::DebugEmissionKind Kind, uint64_t DWOId,
- bool SplitDebugInlining) {
+ bool SplitDebugInlining, bool DebugInfoForProfiling) {
assert(((Lang <= dwarf::DW_LANG_Fortran08 && Lang >= dwarf::DW_LANG_C89) ||
(Lang <= dwarf::DW_LANG_hi_user && Lang >= dwarf::DW_LANG_lo_user)) &&
@@ -136,7 +136,7 @@ DICompileUnit *DIBuilder::createCompileUnit(
CUNode = DICompileUnit::getDistinct(
VMContext, Lang, File, Producer, isOptimized, Flags, RunTimeVer,
SplitName, Kind, nullptr, nullptr, nullptr, nullptr, nullptr, DWOId,
- SplitDebugInlining);
+ SplitDebugInlining, DebugInfoForProfiling);
// Create a named metadata so that it is easier to find cu in a module.
NamedMDNode *NMD = M.getOrInsertNamedMetadata("llvm.dbg.cu");
@@ -241,17 +241,20 @@ DIBasicType *DIBuilder::createBasicType(StringRef Name, uint64_t SizeInBits,
DIDerivedType *DIBuilder::createQualifiedType(unsigned Tag, DIType *FromTy) {
return DIDerivedType::get(VMContext, Tag, "", nullptr, 0, nullptr, FromTy, 0,
- 0, 0, DINode::FlagZero);
+ 0, 0, None, DINode::FlagZero);
}
-DIDerivedType *DIBuilder::createPointerType(DIType *PointeeTy,
- uint64_t SizeInBits,
- uint32_t AlignInBits,
- StringRef Name) {
+DIDerivedType *DIBuilder::createPointerType(
+ DIType *PointeeTy,
+ uint64_t SizeInBits,
+ uint32_t AlignInBits,
+ Optional<unsigned> DWARFAddressSpace,
+ StringRef Name) {
// FIXME: Why is there a name here?
return DIDerivedType::get(VMContext, dwarf::DW_TAG_pointer_type, Name,
nullptr, 0, nullptr, PointeeTy, SizeInBits,
- AlignInBits, 0, DINode::FlagZero);
+ AlignInBits, 0, DWARFAddressSpace,
+ DINode::FlagZero);
}
DIDerivedType *DIBuilder::createMemberPointerType(DIType *PointeeTy,
@@ -261,15 +264,18 @@ DIDerivedType *DIBuilder::createMemberPointerType(DIType *PointeeTy,
DINode::DIFlags Flags) {
return DIDerivedType::get(VMContext, dwarf::DW_TAG_ptr_to_member_type, "",
nullptr, 0, nullptr, PointeeTy, SizeInBits,
- AlignInBits, 0, Flags, Base);
+ AlignInBits, 0, None, Flags, Base);
}
-DIDerivedType *DIBuilder::createReferenceType(unsigned Tag, DIType *RTy,
- uint64_t SizeInBits,
- uint32_t AlignInBits) {
+DIDerivedType *DIBuilder::createReferenceType(
+ unsigned Tag, DIType *RTy,
+ uint64_t SizeInBits,
+ uint32_t AlignInBits,
+ Optional<unsigned> DWARFAddressSpace) {
assert(RTy && "Unable to create reference type");
return DIDerivedType::get(VMContext, Tag, "", nullptr, 0, nullptr, RTy,
- SizeInBits, AlignInBits, 0, DINode::FlagZero);
+ SizeInBits, AlignInBits, 0, DWARFAddressSpace,
+ DINode::FlagZero);
}
DIDerivedType *DIBuilder::createTypedef(DIType *Ty, StringRef Name,
@@ -277,14 +283,14 @@ DIDerivedType *DIBuilder::createTypedef(DIType *Ty, StringRef Name,
DIScope *Context) {
return DIDerivedType::get(VMContext, dwarf::DW_TAG_typedef, Name, File,
LineNo, getNonCompileUnitScope(Context), Ty, 0, 0,
- 0, DINode::FlagZero);
+ 0, None, DINode::FlagZero);
}
DIDerivedType *DIBuilder::createFriend(DIType *Ty, DIType *FriendTy) {
assert(Ty && "Invalid type!");
assert(FriendTy && "Invalid friend type!");
return DIDerivedType::get(VMContext, dwarf::DW_TAG_friend, "", nullptr, 0, Ty,
- FriendTy, 0, 0, 0, DINode::FlagZero);
+ FriendTy, 0, 0, 0, None, DINode::FlagZero);
}
DIDerivedType *DIBuilder::createInheritance(DIType *Ty, DIType *BaseTy,
@@ -292,7 +298,7 @@ DIDerivedType *DIBuilder::createInheritance(DIType *Ty, DIType *BaseTy,
DINode::DIFlags Flags) {
assert(Ty && "Unable to create inheritance");
return DIDerivedType::get(VMContext, dwarf::DW_TAG_inheritance, "", nullptr,
- 0, Ty, BaseTy, 0, 0, BaseOffset, Flags);
+ 0, Ty, BaseTy, 0, 0, BaseOffset, None, Flags);
}
DIDerivedType *DIBuilder::createMemberType(DIScope *Scope, StringRef Name,
@@ -303,7 +309,7 @@ DIDerivedType *DIBuilder::createMemberType(DIScope *Scope, StringRef Name,
DINode::DIFlags Flags, DIType *Ty) {
return DIDerivedType::get(VMContext, dwarf::DW_TAG_member, Name, File,
LineNumber, getNonCompileUnitScope(Scope), Ty,
- SizeInBits, AlignInBits, OffsetInBits, Flags);
+ SizeInBits, AlignInBits, OffsetInBits, None, Flags);
}
static ConstantAsMetadata *getConstantOrNull(Constant *C) {
@@ -320,7 +326,7 @@ DIDerivedType *DIBuilder::createBitFieldMemberType(
return DIDerivedType::get(
VMContext, dwarf::DW_TAG_member, Name, File, LineNumber,
getNonCompileUnitScope(Scope), Ty, SizeInBits, /* AlignInBits */ 0,
- OffsetInBits, Flags,
+ OffsetInBits, None, Flags,
ConstantAsMetadata::get(ConstantInt::get(IntegerType::get(VMContext, 64),
StorageOffsetInBits)));
}
@@ -333,7 +339,8 @@ DIBuilder::createStaticMemberType(DIScope *Scope, StringRef Name, DIFile *File,
Flags |= DINode::FlagStaticMember;
return DIDerivedType::get(VMContext, dwarf::DW_TAG_member, Name, File,
LineNumber, getNonCompileUnitScope(Scope), Ty, 0,
- AlignInBits, 0, Flags, getConstantOrNull(Val));
+ AlignInBits, 0, None, Flags,
+ getConstantOrNull(Val));
}
DIDerivedType *
@@ -343,7 +350,7 @@ DIBuilder::createObjCIVar(StringRef Name, DIFile *File, unsigned LineNumber,
DIType *Ty, MDNode *PropertyNode) {
return DIDerivedType::get(VMContext, dwarf::DW_TAG_member, Name, File,
LineNumber, getNonCompileUnitScope(File), Ty,
- SizeInBits, AlignInBits, OffsetInBits, Flags,
+ SizeInBits, AlignInBits, OffsetInBits, None, Flags,
PropertyNode);
}
@@ -442,14 +449,6 @@ DISubroutineType *DIBuilder::createSubroutineType(DITypeRefArray ParameterTypes,
return DISubroutineType::get(VMContext, Flags, CC, ParameterTypes);
}
-DICompositeType *DIBuilder::createExternalTypeRef(unsigned Tag, DIFile *File,
- StringRef UniqueIdentifier) {
- assert(!UniqueIdentifier.empty() && "external type ref without uid");
- return DICompositeType::get(VMContext, Tag, "", nullptr, 0, nullptr, nullptr,
- 0, 0, 0, DINode::FlagExternalTypeRef, nullptr, 0,
- nullptr, nullptr, UniqueIdentifier);
-}
-
DICompositeType *DIBuilder::createEnumerationType(
DIScope *Scope, StringRef Name, DIFile *File, unsigned LineNumber,
uint64_t SizeInBits, uint32_t AlignInBits, DINodeArray Elements,
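createPointerType and createReferenceType above gain an Optional<unsigned> DWARFAddressSpace parameter between the alignment and the name. A sketch of an updated call site, assuming a DIBuilder DIB and a DIType *IntTy already exist (both names hypothetical):

    // Pointer in the default address space: pass None for DWARFAddressSpace.
    DIDerivedType *PtrTy = DIB.createPointerType(IntTy, /*SizeInBits=*/64,
                                                 /*AlignInBits=*/0,
                                                 /*DWARFAddressSpace=*/None);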
diff --git a/lib/IR/DataLayout.cpp b/lib/IR/DataLayout.cpp
index d15a34c0b936..6f90ce598568 100644
--- a/lib/IR/DataLayout.cpp
+++ b/lib/IR/DataLayout.cpp
@@ -118,9 +118,6 @@ LayoutAlignElem::operator==(const LayoutAlignElem &rhs) const {
&& TypeBitWidth == rhs.TypeBitWidth);
}
-const LayoutAlignElem
-DataLayout::InvalidAlignmentElem = { INVALID_ALIGN, 0, 0, 0 };
-
//===----------------------------------------------------------------------===//
// PointerAlignElem, PointerAlign support
//===----------------------------------------------------------------------===//
@@ -145,9 +142,6 @@ PointerAlignElem::operator==(const PointerAlignElem &rhs) const {
&& TypeByteWidth == rhs.TypeByteWidth);
}
-const PointerAlignElem
-DataLayout::InvalidPointerElem = { 0U, 0U, 0U, ~0U };
-
//===----------------------------------------------------------------------===//
// DataLayout Class Implementation
//===----------------------------------------------------------------------===//
@@ -180,6 +174,7 @@ void DataLayout::reset(StringRef Desc) {
LayoutMap = nullptr;
BigEndian = false;
+ AllocaAddrSpace = 0;
StackNaturalAlign = 0;
ManglingMode = MM_None;
NonIntegralAddressSpaces.clear();
@@ -358,6 +353,12 @@ void DataLayout::parseSpecifier(StringRef Desc) {
StackNaturalAlign = inBytes(getInt(Tok));
break;
}
+ case 'A': { // Default stack/alloca address space.
+ AllocaAddrSpace = getInt(Tok);
+ if (!isUInt<24>(AllocaAddrSpace))
+ report_fatal_error("Invalid address space, must be a 24bit integer");
+ break;
+ }
case 'm':
if (!Tok.empty())
report_fatal_error("Unexpected trailing characters after mangling specifier in datalayout string");
@@ -400,6 +401,7 @@ void DataLayout::init(const Module *M) { *this = M->getDataLayout(); }
bool DataLayout::operator==(const DataLayout &Other) const {
bool Ret = BigEndian == Other.BigEndian &&
+ AllocaAddrSpace == Other.AllocaAddrSpace &&
StackNaturalAlign == Other.StackNaturalAlign &&
ManglingMode == Other.ManglingMode &&
LegalIntWidths == Other.LegalIntWidths &&
@@ -408,6 +410,18 @@ bool DataLayout::operator==(const DataLayout &Other) const {
return Ret;
}
+DataLayout::AlignmentsTy::iterator
+DataLayout::findAlignmentLowerBound(AlignTypeEnum AlignType,
+ uint32_t BitWidth) {
+ auto Pair = std::make_pair((unsigned)AlignType, BitWidth);
+ return std::lower_bound(Alignments.begin(), Alignments.end(), Pair,
+ [](const LayoutAlignElem &LHS,
+ const std::pair<unsigned, uint32_t> &RHS) {
+ return std::tie(LHS.AlignType, LHS.TypeBitWidth) <
+ std::tie(RHS.first, RHS.second);
+ });
+}
+
void
DataLayout::setAlignment(AlignTypeEnum align_type, unsigned abi_align,
unsigned pref_align, uint32_t bit_width) {
@@ -426,18 +440,17 @@ DataLayout::setAlignment(AlignTypeEnum align_type, unsigned abi_align,
report_fatal_error(
"Preferred alignment cannot be less than the ABI alignment");
- for (LayoutAlignElem &Elem : Alignments) {
- if (Elem.AlignType == (unsigned)align_type &&
- Elem.TypeBitWidth == bit_width) {
- // Update the abi, preferred alignments.
- Elem.ABIAlign = abi_align;
- Elem.PrefAlign = pref_align;
- return;
- }
+ AlignmentsTy::iterator I = findAlignmentLowerBound(align_type, bit_width);
+ if (I != Alignments.end() &&
+ I->AlignType == (unsigned)align_type && I->TypeBitWidth == bit_width) {
+    // Update the ABI and preferred alignments.
+ I->ABIAlign = abi_align;
+ I->PrefAlign = pref_align;
+ } else {
+ // Insert before I to keep the vector sorted.
+ Alignments.insert(I, LayoutAlignElem::get(align_type, abi_align,
+ pref_align, bit_width));
}
-
- Alignments.push_back(LayoutAlignElem::get(align_type, abi_align,
- pref_align, bit_width));
}
DataLayout::PointersTy::iterator
@@ -471,45 +484,29 @@ void DataLayout::setPointerAlignment(uint32_t AddrSpace, unsigned ABIAlign,
unsigned DataLayout::getAlignmentInfo(AlignTypeEnum AlignType,
uint32_t BitWidth, bool ABIInfo,
Type *Ty) const {
- // Check to see if we have an exact match and remember the best match we see.
- int BestMatchIdx = -1;
- int LargestInt = -1;
- for (unsigned i = 0, e = Alignments.size(); i != e; ++i) {
- if (Alignments[i].AlignType == (unsigned)AlignType &&
- Alignments[i].TypeBitWidth == BitWidth)
- return ABIInfo ? Alignments[i].ABIAlign : Alignments[i].PrefAlign;
-
- // The best match so far depends on what we're looking for.
- if (AlignType == INTEGER_ALIGN &&
- Alignments[i].AlignType == INTEGER_ALIGN) {
- // The "best match" for integers is the smallest size that is larger than
- // the BitWidth requested.
- if (Alignments[i].TypeBitWidth > BitWidth && (BestMatchIdx == -1 ||
- Alignments[i].TypeBitWidth < Alignments[BestMatchIdx].TypeBitWidth))
- BestMatchIdx = i;
- // However, if there isn't one that's larger, then we must use the
- // largest one we have (see below)
- if (LargestInt == -1 ||
- Alignments[i].TypeBitWidth > Alignments[LargestInt].TypeBitWidth)
- LargestInt = i;
+ AlignmentsTy::const_iterator I = findAlignmentLowerBound(AlignType, BitWidth);
+  // See if we found an exact match. Or, if we are looking for an integer type
+  // but don't have an exact match, take the next largest integer. This is
+  // where lower_bound will point when it fails to find an exact match.
+ if (I != Alignments.end() && I->AlignType == (unsigned)AlignType &&
+ (I->TypeBitWidth == BitWidth || AlignType == INTEGER_ALIGN))
+ return ABIInfo ? I->ABIAlign : I->PrefAlign;
+
+ if (AlignType == INTEGER_ALIGN) {
+    // If we didn't have a larger value, try the largest value we have.
+    if (I != Alignments.begin()) {
+      --I; // Go to the previous entry and see if it's an integer.
+ if (I->AlignType == INTEGER_ALIGN)
+ return ABIInfo ? I->ABIAlign : I->PrefAlign;
}
- }
-
- // Okay, we didn't find an exact solution. Fall back here depending on what
- // is being looked for.
- if (BestMatchIdx == -1) {
- // If we didn't find an integer alignment, fall back on most conservative.
- if (AlignType == INTEGER_ALIGN) {
- BestMatchIdx = LargestInt;
- } else if (AlignType == VECTOR_ALIGN) {
- // By default, use natural alignment for vector types. This is consistent
- // with what clang and llvm-gcc do.
- unsigned Align = getTypeAllocSize(cast<VectorType>(Ty)->getElementType());
- Align *= cast<VectorType>(Ty)->getNumElements();
- Align = PowerOf2Ceil(Align);
- return Align;
- }
- }
+ } else if (AlignType == VECTOR_ALIGN) {
+ // By default, use natural alignment for vector types. This is consistent
+ // with what clang and llvm-gcc do.
+ unsigned Align = getTypeAllocSize(cast<VectorType>(Ty)->getElementType());
+ Align *= cast<VectorType>(Ty)->getNumElements();
+ Align = PowerOf2Ceil(Align);
+ return Align;
+ }
// If we still couldn't find a reasonable default alignment, fall back
// to a simple heuristic that the alignment is the first power of two
@@ -517,15 +514,9 @@ unsigned DataLayout::getAlignmentInfo(AlignTypeEnum AlignType,
// approximation of reality, and if the user wanted something less
// conservative, they should have specified it explicitly in the data
// layout.
- if (BestMatchIdx == -1) {
- unsigned Align = getTypeStoreSize(Ty);
- Align = PowerOf2Ceil(Align);
- return Align;
- }
-
- // Since we got a "best match" index, just return it.
- return ABIInfo ? Alignments[BestMatchIdx].ABIAlign
- : Alignments[BestMatchIdx].PrefAlign;
+ unsigned Align = getTypeStoreSize(Ty);
+ Align = PowerOf2Ceil(Align);
+ return Align;
}
namespace {
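The getAlignmentInfo rewrite above depends on setAlignment keeping Alignments sorted by (AlignType, TypeBitWidth), so lower_bound lands either on the exact entry or, for integers, on the next-largest width. The lookup idiom in isolation, as a self-contained sketch (the struct and function names are made up):

    #include <algorithm>
    #include <cstdint>
    #include <tuple>
    #include <utility>
    #include <vector>

    struct Elem { unsigned AlignType; uint32_t TypeBitWidth; };

    // Returns the first element not less than (Type, Width); an exact match,
    // if present, sits at this position, and the next-largest integer width
    // follows directly when there is none.
    std::vector<Elem>::iterator findLowerBound(std::vector<Elem> &V,
                                               unsigned Type, uint32_t Width) {
      auto Key = std::make_pair(Type, Width);
      return std::lower_bound(V.begin(), V.end(), Key,
                              [](const Elem &L,
                                 const std::pair<unsigned, uint32_t> &R) {
                                return std::tie(L.AlignType, L.TypeBitWidth) <
                                       std::tie(R.first, R.second);
                              });
    }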
diff --git a/lib/IR/DebugInfo.cpp b/lib/IR/DebugInfo.cpp
index 6b9bc689a446..c5d39c544304 100644
--- a/lib/IR/DebugInfo.cpp
+++ b/lib/IR/DebugInfo.cpp
@@ -79,9 +79,19 @@ void DebugInfoFinder::processModule(const Module &M) {
processScope(M->getScope());
}
}
- for (auto &F : M.functions())
+ for (auto &F : M.functions()) {
if (auto *SP = cast_or_null<DISubprogram>(F.getSubprogram()))
processSubprogram(SP);
+ // There could be subprograms from inlined functions referenced from
+ // instructions only. Walk the function to find them.
+ for (const BasicBlock &BB : F) {
+ for (const Instruction &I : BB) {
+ if (!I.getDebugLoc())
+ continue;
+ processLocation(M, I.getDebugLoc().get());
+ }
+ }
+ }
}
void DebugInfoFinder::processLocation(const Module &M, const DILocation *Loc) {
@@ -239,6 +249,38 @@ bool DebugInfoFinder::addScope(DIScope *Scope) {
return true;
}
+static llvm::MDNode *stripDebugLocFromLoopID(llvm::MDNode *N) {
+ assert(N->op_begin() != N->op_end() && "Missing self reference?");
+
+  // If there is no debug location, we do not have to rewrite this MDNode.
+ if (std::none_of(N->op_begin() + 1, N->op_end(), [](const MDOperand &Op) {
+ return isa<DILocation>(Op.get());
+ }))
+ return N;
+
+ // If there is only the debug location without any actual loop metadata, we
+ // can remove the metadata.
+ if (std::none_of(N->op_begin() + 1, N->op_end(), [](const MDOperand &Op) {
+ return !isa<DILocation>(Op.get());
+ }))
+ return nullptr;
+
+ SmallVector<Metadata *, 4> Args;
+  // Reserve operand 0 for the loop ID self-reference.
+ auto TempNode = MDNode::getTemporary(N->getContext(), None);
+ Args.push_back(TempNode.get());
+ // Add all non-debug location operands back.
+ for (auto Op = N->op_begin() + 1; Op != N->op_end(); Op++) {
+ if (!isa<DILocation>(*Op))
+ Args.push_back(*Op);
+ }
+
+ // Set the first operand to itself.
+ MDNode *LoopID = MDNode::get(N->getContext(), Args);
+ LoopID->replaceOperandWith(0, LoopID);
+ return LoopID;
+}
+
bool llvm::stripDebugInfo(Function &F) {
bool Changed = false;
if (F.getSubprogram()) {
@@ -246,6 +288,7 @@ bool llvm::stripDebugInfo(Function &F) {
F.setSubprogram(nullptr);
}
+ llvm::DenseMap<llvm::MDNode*, llvm::MDNode*> LoopIDsMap;
for (BasicBlock &BB : F) {
for (auto II = BB.begin(), End = BB.end(); II != End;) {
Instruction &I = *II++; // We may delete the instruction, increment now.
@@ -259,6 +302,15 @@ bool llvm::stripDebugInfo(Function &F) {
I.setDebugLoc(DebugLoc());
}
}
+
+ auto *TermInst = BB.getTerminator();
+ if (auto *LoopID = TermInst->getMetadata(LLVMContext::MD_loop)) {
+ auto *NewLoopID = LoopIDsMap.lookup(LoopID);
+ if (!NewLoopID)
+ NewLoopID = LoopIDsMap[LoopID] = stripDebugLocFromLoopID(LoopID);
+ if (NewLoopID != LoopID)
+ TermInst->setMetadata(LLVMContext::MD_loop, NewLoopID);
+ }
}
return Changed;
}
@@ -410,7 +462,8 @@ private:
CU->isOptimized(), CU->getFlags(), CU->getRuntimeVersion(),
CU->getSplitDebugFilename(), DICompileUnit::LineTablesOnly, EnumTypes,
RetainedTypes, GlobalVariables, ImportedEntities, CU->getMacros(),
- CU->getDWOId(), CU->getSplitDebugInlining());
+ CU->getDWOId(), CU->getSplitDebugInlining(),
+ CU->getDebugInfoForProfiling());
}
DILocation *getReplacementMDLocation(DILocation *MLD) {
@@ -558,17 +611,26 @@ bool llvm::stripNonLineTableDebugInfo(Module &M) {
}
for (auto &BB : F) {
for (auto &I : BB) {
- if (I.getDebugLoc() == DebugLoc())
- continue;
-
- // Make a replacement.
- auto &DL = I.getDebugLoc();
- auto *Scope = DL.getScope();
- MDNode *InlinedAt = DL.getInlinedAt();
- Scope = remap(Scope);
- InlinedAt = remap(InlinedAt);
- I.setDebugLoc(
- DebugLoc::get(DL.getLine(), DL.getCol(), Scope, InlinedAt));
+ auto remapDebugLoc = [&](DebugLoc DL) -> DebugLoc {
+ auto *Scope = DL.getScope();
+ MDNode *InlinedAt = DL.getInlinedAt();
+ Scope = remap(Scope);
+ InlinedAt = remap(InlinedAt);
+ return DebugLoc::get(DL.getLine(), DL.getCol(), Scope, InlinedAt);
+ };
+
+ if (I.getDebugLoc() != DebugLoc())
+ I.setDebugLoc(remapDebugLoc(I.getDebugLoc()));
+
+ // Remap DILocations in untyped MDNodes (e.g., llvm.loop).
+ SmallVector<std::pair<unsigned, MDNode *>, 2> MDs;
+ I.getAllMetadata(MDs);
+ for (auto Attachment : MDs)
+ if (auto *T = dyn_cast_or_null<MDTuple>(Attachment.second))
+ for (unsigned N = 0; N < T->getNumOperands(); ++N)
+ if (auto *Loc = dyn_cast_or_null<DILocation>(T->getOperand(N)))
+ if (Loc != DebugLoc())
+ T->replaceOperandWith(N, remapDebugLoc(Loc));
}
}
}
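stripDebugLocFromLoopID above relies on the standard self-referential shape of llvm.loop metadata, where operand 0 of the loop ID is the node itself. Rebuilding such a node follows a two-step pattern, sketched here with Ctx and the surviving operands assumed to be in scope:

    // A temporary node keeps slot 0 reserved while the real node is built.
    auto Temp = MDNode::getTemporary(Ctx, None);
    SmallVector<Metadata *, 4> Ops;
    Ops.push_back(Temp.get());
    // ... append the surviving (non-DILocation) operands to Ops here ...
    MDNode *LoopID = MDNode::get(Ctx, Ops);
    LoopID->replaceOperandWith(0, LoopID); // close the self-reference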
diff --git a/lib/IR/DebugInfoMetadata.cpp b/lib/IR/DebugInfoMetadata.cpp
index 8e21a907e15e..d14c6018d409 100644
--- a/lib/IR/DebugInfoMetadata.cpp
+++ b/lib/IR/DebugInfoMetadata.cpp
@@ -245,16 +245,18 @@ DIBasicType *DIBasicType::getImpl(LLVMContext &Context, unsigned Tag,
DIDerivedType *DIDerivedType::getImpl(
LLVMContext &Context, unsigned Tag, MDString *Name, Metadata *File,
unsigned Line, Metadata *Scope, Metadata *BaseType, uint64_t SizeInBits,
- uint32_t AlignInBits, uint64_t OffsetInBits, DIFlags Flags,
- Metadata *ExtraData, StorageType Storage, bool ShouldCreate) {
+ uint32_t AlignInBits, uint64_t OffsetInBits,
+ Optional<unsigned> DWARFAddressSpace, DIFlags Flags, Metadata *ExtraData,
+ StorageType Storage, bool ShouldCreate) {
assert(isCanonical(Name) && "Expected canonical MDString");
DEFINE_GETIMPL_LOOKUP(DIDerivedType,
(Tag, Name, File, Line, Scope, BaseType, SizeInBits,
- AlignInBits, OffsetInBits, Flags, ExtraData));
+ AlignInBits, OffsetInBits, DWARFAddressSpace, Flags,
+ ExtraData));
Metadata *Ops[] = {File, Scope, Name, BaseType, ExtraData};
DEFINE_GETIMPL_STORE(
- DIDerivedType, (Tag, Line, SizeInBits, AlignInBits, OffsetInBits, Flags),
- Ops);
+ DIDerivedType, (Tag, Line, SizeInBits, AlignInBits, OffsetInBits,
+ DWARFAddressSpace, Flags), Ops);
}
DICompositeType *DICompositeType::getImpl(
@@ -383,8 +385,8 @@ DICompileUnit *DICompileUnit::getImpl(
unsigned RuntimeVersion, MDString *SplitDebugFilename,
unsigned EmissionKind, Metadata *EnumTypes, Metadata *RetainedTypes,
Metadata *GlobalVariables, Metadata *ImportedEntities, Metadata *Macros,
- uint64_t DWOId, bool SplitDebugInlining, StorageType Storage,
- bool ShouldCreate) {
+ uint64_t DWOId, bool SplitDebugInlining, bool DebugInfoForProfiling,
+ StorageType Storage, bool ShouldCreate) {
assert(Storage != Uniqued && "Cannot unique DICompileUnit");
assert(isCanonical(Producer) && "Expected canonical MDString");
assert(isCanonical(Flags) && "Expected canonical MDString");
@@ -397,7 +399,8 @@ DICompileUnit *DICompileUnit::getImpl(
return storeImpl(new (array_lengthof(Ops))
DICompileUnit(Context, Storage, SourceLanguage,
IsOptimized, RuntimeVersion, EmissionKind,
- DWOId, SplitDebugInlining, Ops),
+ DWOId, SplitDebugInlining,
+ DebugInfoForProfiling, Ops),
Storage);
}
@@ -611,10 +614,23 @@ bool DIExpression::isValid() const {
return false;
break;
}
+ case dwarf::DW_OP_swap: {
+      // There must be more than one implicit element on the stack.
+
+ // FIXME: A better way to implement this would be to add a local variable
+ // that keeps track of the stack depth and introduce something like a
+ // DW_LLVM_OP_implicit_location as a placeholder for the location this
+ // DIExpression is attached to, or else pass the number of implicit stack
+ // elements into isValid.
+ if (getNumElements() == 1)
+ return false;
+ break;
+ }
case dwarf::DW_OP_constu:
case dwarf::DW_OP_plus:
case dwarf::DW_OP_minus:
case dwarf::DW_OP_deref:
+ case dwarf::DW_OP_xderef:
break;
}
}
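With the DW_OP_swap case above, an expression consisting of the lone swap is rejected: the only value available is the single implicit element for the location the expression is attached to. A sketch, assuming an LLVMContext Ctx is in scope:

    // One element total, so DW_OP_swap has nothing to exchange with.
    DIExpression *Bad = DIExpression::get(Ctx, {dwarf::DW_OP_swap});
    assert(!Bad->isValid());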
diff --git a/lib/IR/DebugLoc.cpp b/lib/IR/DebugLoc.cpp
index ffa7a6b40e2a..f31074a7ad44 100644
--- a/lib/IR/DebugLoc.cpp
+++ b/lib/IR/DebugLoc.cpp
@@ -66,8 +66,8 @@ DebugLoc DebugLoc::get(unsigned Line, unsigned Col, const MDNode *Scope,
const_cast<MDNode *>(InlinedAt));
}
+#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
LLVM_DUMP_METHOD void DebugLoc::dump() const {
-#ifndef NDEBUG
if (!Loc)
return;
@@ -79,8 +79,8 @@ LLVM_DUMP_METHOD void DebugLoc::dump() const {
InlinedAtDL.dump();
} else
dbgs() << "\n";
-#endif
}
+#endif
void DebugLoc::print(raw_ostream &OS) const {
if (!Loc)
diff --git a/lib/IR/DiagnosticInfo.cpp b/lib/IR/DiagnosticInfo.cpp
index ea71fde26e0e..395b6158e0c8 100644
--- a/lib/IR/DiagnosticInfo.cpp
+++ b/lib/IR/DiagnosticInfo.cpp
@@ -148,21 +148,31 @@ void DiagnosticInfoPGOProfile::print(DiagnosticPrinter &DP) const {
DP << getMsg();
}
-bool DiagnosticInfoWithDebugLocBase::isLocationAvailable() const {
- return getDebugLoc();
+DiagnosticLocation::DiagnosticLocation(const DebugLoc &DL) {
+ if (!DL)
+ return;
+ Filename = DL->getFilename();
+ Line = DL->getLine();
+ Column = DL->getColumn();
}
-void DiagnosticInfoWithDebugLocBase::getLocation(StringRef *Filename,
+DiagnosticLocation::DiagnosticLocation(const DISubprogram *SP) {
+ if (!SP)
+ return;
+ Filename = SP->getFilename();
+ Line = SP->getScopeLine();
+ Column = 0;
+}
+
+void DiagnosticInfoWithLocationBase::getLocation(StringRef *Filename,
unsigned *Line,
unsigned *Column) const {
- DILocation *L = getDebugLoc();
- assert(L != nullptr && "debug location is invalid");
- *Filename = L->getFilename();
- *Line = L->getLine();
- *Column = L->getColumn();
+ *Filename = Loc.getFilename();
+ *Line = Loc.getLine();
+ *Column = Loc.getColumn();
}
-const std::string DiagnosticInfoWithDebugLocBase::getLocationStr() const {
+const std::string DiagnosticInfoWithLocationBase::getLocationStr() const {
StringRef Filename("<unknown>");
unsigned Line = 0;
unsigned Column = 0;
@@ -171,14 +181,14 @@ const std::string DiagnosticInfoWithDebugLocBase::getLocationStr() const {
return (Filename + ":" + Twine(Line) + ":" + Twine(Column)).str();
}
-DiagnosticInfoOptimizationBase::Argument::Argument(StringRef Key, Value *V)
+DiagnosticInfoOptimizationBase::Argument::Argument(StringRef Key, const Value *V)
: Key(Key) {
if (auto *F = dyn_cast<Function>(V)) {
if (DISubprogram *SP = F->getSubprogram())
- DLoc = DebugLoc::get(SP->getScopeLine(), 0, SP);
+ Loc = SP;
}
else if (auto *I = dyn_cast<Instruction>(V))
- DLoc = I->getDebugLoc();
+ Loc = I->getDebugLoc();
// Only include names that correspond to user variables. FIXME: we should use
// debug info if available to get the name of the user variable.
@@ -191,7 +201,7 @@ DiagnosticInfoOptimizationBase::Argument::Argument(StringRef Key, Value *V)
Val = I->getOpcodeName();
}
-DiagnosticInfoOptimizationBase::Argument::Argument(StringRef Key, Type *T)
+DiagnosticInfoOptimizationBase::Argument::Argument(StringRef Key, const Type *T)
: Key(Key) {
raw_string_ostream OS(Val);
OS << *T;
@@ -211,73 +221,83 @@ void DiagnosticInfoOptimizationBase::print(DiagnosticPrinter &DP) const {
OptimizationRemark::OptimizationRemark(const char *PassName,
StringRef RemarkName,
- const DebugLoc &DLoc, Value *CodeRegion)
- : DiagnosticInfoOptimizationBase(
+ const DiagnosticLocation &Loc,
+ const Value *CodeRegion)
+ : DiagnosticInfoIROptimization(
DK_OptimizationRemark, DS_Remark, PassName, RemarkName,
- *cast<BasicBlock>(CodeRegion)->getParent(), DLoc, CodeRegion) {}
+ *cast<BasicBlock>(CodeRegion)->getParent(), Loc, CodeRegion) {}
+
+OptimizationRemark::OptimizationRemark(const char *PassName,
+ StringRef RemarkName,
+ const Instruction *Inst)
+ : DiagnosticInfoIROptimization(DK_OptimizationRemark, DS_Remark, PassName,
+ RemarkName, *Inst->getParent()->getParent(),
+ Inst->getDebugLoc(), Inst->getParent()) {}
+
+// Helper to allow for an assert before attempting to return an invalid
+// reference.
+static const BasicBlock &getFirstFunctionBlock(const Function *Func) {
+ assert(!Func->empty() && "Function does not have a body");
+ return Func->front();
+}
OptimizationRemark::OptimizationRemark(const char *PassName,
- StringRef RemarkName, Instruction *Inst)
- : DiagnosticInfoOptimizationBase(DK_OptimizationRemark, DS_Remark, PassName,
- RemarkName,
- *Inst->getParent()->getParent(),
- Inst->getDebugLoc(), Inst->getParent()) {}
+ StringRef RemarkName,
+ const Function *Func)
+ : DiagnosticInfoIROptimization(DK_OptimizationRemark, DS_Remark, PassName,
+ RemarkName, *Func, Func->getSubprogram(),
+ &getFirstFunctionBlock(Func)) {}
-bool OptimizationRemark::isEnabled() const {
+bool OptimizationRemark::isEnabled(StringRef PassName) {
return PassRemarksOptLoc.Pattern &&
- PassRemarksOptLoc.Pattern->match(getPassName());
+ PassRemarksOptLoc.Pattern->match(PassName);
}
-OptimizationRemarkMissed::OptimizationRemarkMissed(const char *PassName,
- StringRef RemarkName,
- const DebugLoc &DLoc,
- Value *CodeRegion)
- : DiagnosticInfoOptimizationBase(
+OptimizationRemarkMissed::OptimizationRemarkMissed(
+ const char *PassName, StringRef RemarkName, const DiagnosticLocation &Loc,
+ const Value *CodeRegion)
+ : DiagnosticInfoIROptimization(
DK_OptimizationRemarkMissed, DS_Remark, PassName, RemarkName,
- *cast<BasicBlock>(CodeRegion)->getParent(), DLoc, CodeRegion) {}
+ *cast<BasicBlock>(CodeRegion)->getParent(), Loc, CodeRegion) {}
OptimizationRemarkMissed::OptimizationRemarkMissed(const char *PassName,
StringRef RemarkName,
- Instruction *Inst)
- : DiagnosticInfoOptimizationBase(DK_OptimizationRemarkMissed, DS_Remark,
- PassName, RemarkName,
- *Inst->getParent()->getParent(),
- Inst->getDebugLoc(), Inst->getParent()) {}
+ const Instruction *Inst)
+ : DiagnosticInfoIROptimization(DK_OptimizationRemarkMissed, DS_Remark,
+ PassName, RemarkName,
+ *Inst->getParent()->getParent(),
+ Inst->getDebugLoc(), Inst->getParent()) {}
-bool OptimizationRemarkMissed::isEnabled() const {
+bool OptimizationRemarkMissed::isEnabled(StringRef PassName) {
return PassRemarksMissedOptLoc.Pattern &&
- PassRemarksMissedOptLoc.Pattern->match(getPassName());
+ PassRemarksMissedOptLoc.Pattern->match(PassName);
}
-OptimizationRemarkAnalysis::OptimizationRemarkAnalysis(const char *PassName,
- StringRef RemarkName,
- const DebugLoc &DLoc,
- Value *CodeRegion)
- : DiagnosticInfoOptimizationBase(
+OptimizationRemarkAnalysis::OptimizationRemarkAnalysis(
+ const char *PassName, StringRef RemarkName, const DiagnosticLocation &Loc,
+ const Value *CodeRegion)
+ : DiagnosticInfoIROptimization(
DK_OptimizationRemarkAnalysis, DS_Remark, PassName, RemarkName,
- *cast<BasicBlock>(CodeRegion)->getParent(), DLoc, CodeRegion) {}
+ *cast<BasicBlock>(CodeRegion)->getParent(), Loc, CodeRegion) {}
OptimizationRemarkAnalysis::OptimizationRemarkAnalysis(const char *PassName,
StringRef RemarkName,
- Instruction *Inst)
- : DiagnosticInfoOptimizationBase(DK_OptimizationRemarkAnalysis, DS_Remark,
- PassName, RemarkName,
- *Inst->getParent()->getParent(),
- Inst->getDebugLoc(), Inst->getParent()) {}
-
-OptimizationRemarkAnalysis::OptimizationRemarkAnalysis(enum DiagnosticKind Kind,
- const char *PassName,
- StringRef RemarkName,
- const DebugLoc &DLoc,
- Value *CodeRegion)
- : DiagnosticInfoOptimizationBase(Kind, DS_Remark, PassName, RemarkName,
- *cast<BasicBlock>(CodeRegion)->getParent(),
- DLoc, CodeRegion) {}
+ const Instruction *Inst)
+ : DiagnosticInfoIROptimization(DK_OptimizationRemarkAnalysis, DS_Remark,
+ PassName, RemarkName,
+ *Inst->getParent()->getParent(),
+ Inst->getDebugLoc(), Inst->getParent()) {}
-bool OptimizationRemarkAnalysis::isEnabled() const {
- return shouldAlwaysPrint() ||
- (PassRemarksAnalysisOptLoc.Pattern &&
- PassRemarksAnalysisOptLoc.Pattern->match(getPassName()));
+OptimizationRemarkAnalysis::OptimizationRemarkAnalysis(
+ enum DiagnosticKind Kind, const char *PassName, StringRef RemarkName,
+ const DiagnosticLocation &Loc, const Value *CodeRegion)
+ : DiagnosticInfoIROptimization(Kind, DS_Remark, PassName, RemarkName,
+ *cast<BasicBlock>(CodeRegion)->getParent(),
+ Loc, CodeRegion) {}
+
+bool OptimizationRemarkAnalysis::isEnabled(StringRef PassName) {
+ return PassRemarksAnalysisOptLoc.Pattern &&
+ PassRemarksAnalysisOptLoc.Pattern->match(PassName);
}
void DiagnosticInfoMIRParser::print(DiagnosticPrinter &DP) const {
@@ -285,42 +305,48 @@ void DiagnosticInfoMIRParser::print(DiagnosticPrinter &DP) const {
}
void llvm::emitOptimizationRemark(LLVMContext &Ctx, const char *PassName,
- const Function &Fn, const DebugLoc &DLoc,
+ const Function &Fn,
+ const DiagnosticLocation &Loc,
const Twine &Msg) {
- Ctx.diagnose(OptimizationRemark(PassName, Fn, DLoc, Msg));
+ Ctx.diagnose(OptimizationRemark(PassName, Fn, Loc, Msg));
}
void llvm::emitOptimizationRemarkMissed(LLVMContext &Ctx, const char *PassName,
const Function &Fn,
- const DebugLoc &DLoc,
+ const DiagnosticLocation &Loc,
const Twine &Msg) {
- Ctx.diagnose(OptimizationRemarkMissed(PassName, Fn, DLoc, Msg));
+ Ctx.diagnose(OptimizationRemarkMissed(PassName, Fn, Loc, Msg));
}
void llvm::emitOptimizationRemarkAnalysis(LLVMContext &Ctx,
const char *PassName,
const Function &Fn,
- const DebugLoc &DLoc,
+ const DiagnosticLocation &Loc,
const Twine &Msg) {
- Ctx.diagnose(OptimizationRemarkAnalysis(PassName, Fn, DLoc, Msg));
+ Ctx.diagnose(OptimizationRemarkAnalysis(PassName, Fn, Loc, Msg));
}
-void llvm::emitOptimizationRemarkAnalysisFPCommute(LLVMContext &Ctx,
- const char *PassName,
- const Function &Fn,
- const DebugLoc &DLoc,
- const Twine &Msg) {
- Ctx.diagnose(OptimizationRemarkAnalysisFPCommute(PassName, Fn, DLoc, Msg));
+void llvm::emitOptimizationRemarkAnalysisFPCommute(
+ LLVMContext &Ctx, const char *PassName, const Function &Fn,
+ const DiagnosticLocation &Loc, const Twine &Msg) {
+ Ctx.diagnose(OptimizationRemarkAnalysisFPCommute(PassName, Fn, Loc, Msg));
}
void llvm::emitOptimizationRemarkAnalysisAliasing(LLVMContext &Ctx,
const char *PassName,
const Function &Fn,
- const DebugLoc &DLoc,
+ const DiagnosticLocation &Loc,
const Twine &Msg) {
- Ctx.diagnose(OptimizationRemarkAnalysisAliasing(PassName, Fn, DLoc, Msg));
+ Ctx.diagnose(OptimizationRemarkAnalysisAliasing(PassName, Fn, Loc, Msg));
}
+DiagnosticInfoOptimizationFailure::DiagnosticInfoOptimizationFailure(
+ const char *PassName, StringRef RemarkName, const DiagnosticLocation &Loc,
+ const Value *CodeRegion)
+ : DiagnosticInfoIROptimization(
+ DK_OptimizationFailure, DS_Warning, PassName, RemarkName,
+ *cast<BasicBlock>(CodeRegion)->getParent(), Loc, CodeRegion) {}
+
bool DiagnosticInfoOptimizationFailure::isEnabled() const {
// Only print warnings.
return getSeverity() == DS_Warning;
@@ -336,18 +362,6 @@ void DiagnosticInfoUnsupported::print(DiagnosticPrinter &DP) const {
DP << Str;
}
-void llvm::emitLoopVectorizeWarning(LLVMContext &Ctx, const Function &Fn,
- const DebugLoc &DLoc, const Twine &Msg) {
- Ctx.diagnose(DiagnosticInfoOptimizationFailure(
- Fn, DLoc, Twine("loop not vectorized: " + Msg)));
-}
-
-void llvm::emitLoopInterleaveWarning(LLVMContext &Ctx, const Function &Fn,
- const DebugLoc &DLoc, const Twine &Msg) {
- Ctx.diagnose(DiagnosticInfoOptimizationFailure(
- Fn, DLoc, Twine("loop not interleaved: " + Msg)));
-}
-
void DiagnosticInfoISelFallback::print(DiagnosticPrinter &DP) const {
DP << "Instruction selection used fallback path for " << getFunction();
}
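The Instruction-based constructors above pull both the location and the code region straight from the instruction, so call sites no longer thread a DebugLoc through by hand. A sketch of emitting one, where Ctx is the LLVMContext and I a const Instruction* (the pass and remark names are made up):

    OptimizationRemark R("my-pass", "Widened", I);
    R << "operation widened to 64 bits";
    Ctx.diagnose(R);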
diff --git a/lib/IR/Dominators.cpp b/lib/IR/Dominators.cpp
index 1880807da7eb..44948cc5831d 100644
--- a/lib/IR/Dominators.cpp
+++ b/lib/IR/Dominators.cpp
@@ -29,9 +29,9 @@ using namespace llvm;
// Always verify dominfo if expensive checking is enabled.
#ifdef EXPENSIVE_CHECKS
-static bool VerifyDomInfo = true;
+bool llvm::VerifyDomInfo = true;
#else
-static bool VerifyDomInfo = false;
+bool llvm::VerifyDomInfo = false;
#endif
static cl::opt<bool,true>
VerifyDomInfoX("verify-dom-info", cl::location(VerifyDomInfo),
@@ -73,6 +73,15 @@ template void llvm::Calculate<Function, Inverse<BasicBlock *>>(
GraphTraits<Inverse<BasicBlock *>>::NodeRef>::type> &DT,
Function &F);
+bool DominatorTree::invalidate(Function &F, const PreservedAnalyses &PA,
+ FunctionAnalysisManager::Invalidator &) {
+ // Check whether the analysis, all analyses on functions, or the function's
+ // CFG have been preserved.
+ auto PAC = PA.getChecker<DominatorTreeAnalysis>();
+ return !(PAC.preserved() || PAC.preservedSet<AllAnalysesOn<Function>>() ||
+ PAC.preservedSet<CFGAnalyses>());
+}
+
// dominates - Return true if Def dominates a use in User. This performs
// the special checks necessary if Def and User are in the same basic block.
// Note that Def doesn't dominate a use in Def itself!
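DominatorTree::invalidate above keeps the tree alive whenever the CFG analysis set is preserved. A pass that rewrites instructions but never touches block structure would report that as follows (a minimal new-pass-manager sketch; the pass name is hypothetical):

    PreservedAnalyses MyPass::run(Function &F, FunctionAnalysisManager &AM) {
      // ... transform instructions without changing the CFG ...
      PreservedAnalyses PA;
      PA.preserveSet<CFGAnalyses>(); // invalidate() above then returns false
      return PA;
    }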
diff --git a/lib/IR/Function.cpp b/lib/IR/Function.cpp
index 05419aa3d2bb..c4bb9e83acd7 100644
--- a/lib/IR/Function.cpp
+++ b/lib/IR/Function.cpp
@@ -30,7 +30,6 @@ using namespace llvm;
// Explicit instantiations of SymbolTableListTraits since some of the methods
// are not in the public header file...
-template class llvm::SymbolTableListTraits<Argument>;
template class llvm::SymbolTableListTraits<BasicBlock>;
//===----------------------------------------------------------------------===//
@@ -39,12 +38,8 @@ template class llvm::SymbolTableListTraits<BasicBlock>;
void Argument::anchor() { }
-Argument::Argument(Type *Ty, const Twine &Name, Function *Par)
- : Value(Ty, Value::ArgumentVal) {
- Parent = nullptr;
-
- if (Par)
- Par->getArgumentList().push_back(this);
+Argument::Argument(Type *Ty, const Twine &Name, Function *Par, unsigned ArgNo)
+ : Value(Ty, Value::ArgumentVal), Parent(Par), ArgNo(ArgNo) {
setName(Name);
}
@@ -52,27 +47,9 @@ void Argument::setParent(Function *parent) {
Parent = parent;
}
-/// getArgNo - Return the index of this formal argument in its containing
-/// function. For example in "void foo(int a, float b)" a is 0 and b is 1.
-unsigned Argument::getArgNo() const {
- const Function *F = getParent();
- assert(F && "Argument is not in a function");
-
- Function::const_arg_iterator AI = F->arg_begin();
- unsigned ArgIdx = 0;
- for (; &*AI != this; ++AI)
- ++ArgIdx;
-
- return ArgIdx;
-}
-
-/// hasNonNullAttr - Return true if this argument has the nonnull attribute on
-/// it in its containing function. Also returns true if at least one byte is
-/// known to be dereferenceable and the pointer is in addrspace(0).
bool Argument::hasNonNullAttr() const {
if (!getType()->isPointerTy()) return false;
- if (getParent()->getAttributes().
- hasAttribute(getArgNo()+1, Attribute::NonNull))
+ if (getParent()->hasParamAttribute(getArgNo(), Attribute::NonNull))
return true;
else if (getDereferenceableBytes() > 0 &&
getType()->getPointerAddressSpace() == 0)
@@ -80,25 +57,19 @@ bool Argument::hasNonNullAttr() const {
return false;
}
-/// hasByValAttr - Return true if this argument has the byval attribute on it
-/// in its containing function.
bool Argument::hasByValAttr() const {
if (!getType()->isPointerTy()) return false;
return hasAttribute(Attribute::ByVal);
}
bool Argument::hasSwiftSelfAttr() const {
- return getParent()->getAttributes().
- hasAttribute(getArgNo()+1, Attribute::SwiftSelf);
+ return getParent()->hasParamAttribute(getArgNo(), Attribute::SwiftSelf);
}
bool Argument::hasSwiftErrorAttr() const {
- return getParent()->getAttributes().
- hasAttribute(getArgNo()+1, Attribute::SwiftError);
+ return getParent()->hasParamAttribute(getArgNo(), Attribute::SwiftError);
}
-/// \brief Return true if this argument has the inalloca attribute on it in
-/// its containing function.
bool Argument::hasInAllocaAttr() const {
if (!getType()->isPointerTy()) return false;
return hasAttribute(Attribute::InAlloca);
@@ -106,9 +77,9 @@ bool Argument::hasInAllocaAttr() const {
bool Argument::hasByValOrInAllocaAttr() const {
if (!getType()->isPointerTy()) return false;
- AttributeSet Attrs = getParent()->getAttributes();
- return Attrs.hasAttribute(getArgNo() + 1, Attribute::ByVal) ||
- Attrs.hasAttribute(getArgNo() + 1, Attribute::InAlloca);
+ AttributeList Attrs = getParent()->getAttributes();
+ return Attrs.hasParamAttribute(getArgNo(), Attribute::ByVal) ||
+ Attrs.hasParamAttribute(getArgNo(), Attribute::InAlloca);
}
unsigned Argument::getParamAlignment() const {
@@ -129,116 +100,74 @@ uint64_t Argument::getDereferenceableOrNullBytes() const {
return getParent()->getDereferenceableOrNullBytes(getArgNo()+1);
}
-/// hasNestAttr - Return true if this argument has the nest attribute on
-/// it in its containing function.
bool Argument::hasNestAttr() const {
if (!getType()->isPointerTy()) return false;
return hasAttribute(Attribute::Nest);
}
-/// hasNoAliasAttr - Return true if this argument has the noalias attribute on
-/// it in its containing function.
bool Argument::hasNoAliasAttr() const {
if (!getType()->isPointerTy()) return false;
return hasAttribute(Attribute::NoAlias);
}
-/// hasNoCaptureAttr - Return true if this argument has the nocapture attribute
-/// on it in its containing function.
bool Argument::hasNoCaptureAttr() const {
if (!getType()->isPointerTy()) return false;
return hasAttribute(Attribute::NoCapture);
}
-/// hasSRetAttr - Return true if this argument has the sret attribute on
-/// it in its containing function.
bool Argument::hasStructRetAttr() const {
if (!getType()->isPointerTy()) return false;
return hasAttribute(Attribute::StructRet);
}
-/// hasReturnedAttr - Return true if this argument has the returned attribute on
-/// it in its containing function.
bool Argument::hasReturnedAttr() const {
return hasAttribute(Attribute::Returned);
}
-/// hasZExtAttr - Return true if this argument has the zext attribute on it in
-/// its containing function.
bool Argument::hasZExtAttr() const {
return hasAttribute(Attribute::ZExt);
}
-/// hasSExtAttr Return true if this argument has the sext attribute on it in its
-/// containing function.
bool Argument::hasSExtAttr() const {
return hasAttribute(Attribute::SExt);
}
-/// Return true if this argument has the readonly or readnone attribute on it
-/// in its containing function.
bool Argument::onlyReadsMemory() const {
- return getParent()->getAttributes().
- hasAttribute(getArgNo()+1, Attribute::ReadOnly) ||
- getParent()->getAttributes().
- hasAttribute(getArgNo()+1, Attribute::ReadNone);
+ AttributeList Attrs = getParent()->getAttributes();
+ return Attrs.hasParamAttribute(getArgNo(), Attribute::ReadOnly) ||
+ Attrs.hasParamAttribute(getArgNo(), Attribute::ReadNone);
}
-/// addAttr - Add attributes to an argument.
-void Argument::addAttr(AttributeSet AS) {
+void Argument::addAttr(AttributeList AS) {
assert(AS.getNumSlots() <= 1 &&
"Trying to add more than one attribute set to an argument!");
AttrBuilder B(AS, AS.getSlotIndex(0));
- getParent()->addAttributes(getArgNo() + 1,
- AttributeSet::get(Parent->getContext(),
- getArgNo() + 1, B));
+ getParent()->addAttributes(
+ getArgNo() + 1,
+ AttributeList::get(Parent->getContext(), getArgNo() + 1, B));
}
-/// removeAttr - Remove attributes from an argument.
-void Argument::removeAttr(AttributeSet AS) {
+void Argument::removeAttr(AttributeList AS) {
assert(AS.getNumSlots() <= 1 &&
"Trying to remove more than one attribute set from an argument!");
AttrBuilder B(AS, AS.getSlotIndex(0));
- getParent()->removeAttributes(getArgNo() + 1,
- AttributeSet::get(Parent->getContext(),
- getArgNo() + 1, B));
+ getParent()->removeAttributes(
+ getArgNo() + 1,
+ AttributeList::get(Parent->getContext(), getArgNo() + 1, B));
}
-/// hasAttribute - Checks if an argument has a given attribute.
bool Argument::hasAttribute(Attribute::AttrKind Kind) const {
- return getParent()->hasAttribute(getArgNo() + 1, Kind);
+ return getParent()->hasParamAttribute(getArgNo(), Kind);
}
//===----------------------------------------------------------------------===//
// Helper Methods in Function
//===----------------------------------------------------------------------===//
-bool Function::isMaterializable() const {
- return getGlobalObjectSubClassData() & (1 << IsMaterializableBit);
-}
-
-void Function::setIsMaterializable(bool V) {
- unsigned Mask = 1 << IsMaterializableBit;
- setGlobalObjectSubClassData((~Mask & getGlobalObjectSubClassData()) |
- (V ? Mask : 0u));
-}
-
LLVMContext &Function::getContext() const {
return getType()->getContext();
}
-FunctionType *Function::getFunctionType() const {
- return cast<FunctionType>(getValueType());
-}
-
-bool Function::isVarArg() const {
- return getFunctionType()->isVarArg();
-}
-
-Type *Function::getReturnType() const {
- return getFunctionType()->getReturnType();
-}
-
void Function::removeFromParent() {
getParent()->getFunctionList().remove(getIterator());
}
@@ -254,7 +183,8 @@ void Function::eraseFromParent() {
Function::Function(FunctionType *Ty, LinkageTypes Linkage, const Twine &name,
Module *ParentModule)
: GlobalObject(Ty, Value::FunctionVal,
- OperandTraits<Function>::op_begin(this), 0, Linkage, name) {
+ OperandTraits<Function>::op_begin(this), 0, Linkage, name),
+ Arguments(nullptr), NumArgs(Ty->getNumParams()) {
assert(FunctionType::isValidReturnType(getReturnType()) &&
"invalid return type");
setGlobalObjectSubClassData(0);
@@ -282,7 +212,8 @@ Function::~Function() {
dropAllReferences(); // After this it is safe to delete instructions.
// Delete all of the method arguments and unlink from symbol table...
- ArgumentList.clear();
+ if (Arguments)
+ clearArguments();
// Remove the function from the on-the-side GC table.
clearGC();
@@ -290,16 +221,33 @@ Function::~Function() {
void Function::BuildLazyArguments() const {
// Create the arguments vector, all arguments start out unnamed.
- FunctionType *FT = getFunctionType();
- for (unsigned i = 0, e = FT->getNumParams(); i != e; ++i) {
- assert(!FT->getParamType(i)->isVoidTy() &&
- "Cannot have void typed arguments!");
- ArgumentList.push_back(new Argument(FT->getParamType(i)));
+ auto *FT = getFunctionType();
+ if (NumArgs > 0) {
+ Arguments = std::allocator<Argument>().allocate(NumArgs);
+ for (unsigned i = 0, e = NumArgs; i != e; ++i) {
+ Type *ArgTy = FT->getParamType(i);
+ assert(!ArgTy->isVoidTy() && "Cannot have void typed arguments!");
+ new (Arguments + i) Argument(ArgTy, "", const_cast<Function *>(this), i);
+ }
}
// Clear the lazy arguments bit.
unsigned SDC = getSubclassDataFromValue();
const_cast<Function*>(this)->setValueSubclassData(SDC &= ~(1<<0));
+ assert(!hasLazyArguments());
+}
+
+static MutableArrayRef<Argument> makeArgArray(Argument *Args, size_t Count) {
+ return MutableArrayRef<Argument>(Args, Count);
+}
+
+void Function::clearArguments() {
+ for (Argument &A : makeArgArray(Arguments, NumArgs)) {
+ A.setName("");
+ A.~Argument();
+ }
+ std::allocator<Argument>().deallocate(Arguments, NumArgs);
+ Arguments = nullptr;
}
void Function::stealArgumentListFrom(Function &Src) {
@@ -307,10 +255,10 @@ void Function::stealArgumentListFrom(Function &Src) {
// Drop the current arguments, if any, and set the lazy argument bit.
if (!hasLazyArguments()) {
- assert(llvm::all_of(ArgumentList,
+ assert(llvm::all_of(makeArgArray(Arguments, NumArgs),
[](const Argument &A) { return A.use_empty(); }) &&
"Expected arguments to be unused in declaration");
- ArgumentList.clear();
+ clearArguments();
setValueSubclassData(getSubclassDataFromValue() | (1 << 0));
}
@@ -319,18 +267,26 @@ void Function::stealArgumentListFrom(Function &Src) {
return;
// Steal arguments from Src, and fix the lazy argument bits.
- ArgumentList.splice(ArgumentList.end(), Src.ArgumentList);
+ assert(arg_size() == Src.arg_size());
+ Arguments = Src.Arguments;
+ Src.Arguments = nullptr;
+ for (Argument &A : makeArgArray(Arguments, NumArgs)) {
+ // FIXME: This does the work of transferNodesFromList inefficiently.
+ SmallString<128> Name;
+ if (A.hasName())
+ Name = A.getName();
+ if (!Name.empty())
+ A.setName("");
+ A.setParent(this);
+ if (!Name.empty())
+ A.setName(Name);
+ }
+
setValueSubclassData(getSubclassDataFromValue() & ~(1 << 0));
+ assert(!hasLazyArguments());
Src.setValueSubclassData(Src.getSubclassDataFromValue() | (1 << 0));
}
-size_t Function::arg_size() const {
- return getFunctionType()->getNumParams();
-}
-bool Function::arg_empty() const {
- return getFunctionType()->getNumParams() == 0;
-}
-
// dropAllReferences() - This function causes all the subinstructions to "let
// go" of all references that they are maintaining. This allows one to
// 'delete' a whole class at a time, even though there may be circular
@@ -362,49 +318,49 @@ void Function::dropAllReferences() {
}
void Function::addAttribute(unsigned i, Attribute::AttrKind Kind) {
- AttributeSet PAL = getAttributes();
+ AttributeList PAL = getAttributes();
PAL = PAL.addAttribute(getContext(), i, Kind);
setAttributes(PAL);
}
void Function::addAttribute(unsigned i, Attribute Attr) {
- AttributeSet PAL = getAttributes();
+ AttributeList PAL = getAttributes();
PAL = PAL.addAttribute(getContext(), i, Attr);
setAttributes(PAL);
}
-void Function::addAttributes(unsigned i, AttributeSet Attrs) {
- AttributeSet PAL = getAttributes();
+void Function::addAttributes(unsigned i, AttributeList Attrs) {
+ AttributeList PAL = getAttributes();
PAL = PAL.addAttributes(getContext(), i, Attrs);
setAttributes(PAL);
}
void Function::removeAttribute(unsigned i, Attribute::AttrKind Kind) {
- AttributeSet PAL = getAttributes();
+ AttributeList PAL = getAttributes();
PAL = PAL.removeAttribute(getContext(), i, Kind);
setAttributes(PAL);
}
void Function::removeAttribute(unsigned i, StringRef Kind) {
- AttributeSet PAL = getAttributes();
+ AttributeList PAL = getAttributes();
PAL = PAL.removeAttribute(getContext(), i, Kind);
setAttributes(PAL);
}
-void Function::removeAttributes(unsigned i, AttributeSet Attrs) {
- AttributeSet PAL = getAttributes();
+void Function::removeAttributes(unsigned i, AttributeList Attrs) {
+ AttributeList PAL = getAttributes();
PAL = PAL.removeAttributes(getContext(), i, Attrs);
setAttributes(PAL);
}
void Function::addDereferenceableAttr(unsigned i, uint64_t Bytes) {
- AttributeSet PAL = getAttributes();
+ AttributeList PAL = getAttributes();
PAL = PAL.addDereferenceableAttr(getContext(), i, Bytes);
setAttributes(PAL);
}
void Function::addDereferenceableOrNullAttr(unsigned i, uint64_t Bytes) {
- AttributeSet PAL = getAttributes();
+ AttributeList PAL = getAttributes();
PAL = PAL.addDereferenceableOrNullAttr(getContext(), i, Bytes);
setAttributes(PAL);
}
@@ -533,10 +489,18 @@ static std::string getMangledTypeStr(Type* Ty) {
} else if (ArrayType* ATyp = dyn_cast<ArrayType>(Ty)) {
Result += "a" + llvm::utostr(ATyp->getNumElements()) +
getMangledTypeStr(ATyp->getElementType());
- } else if (StructType* STyp = dyn_cast<StructType>(Ty)) {
- assert(!STyp->isLiteral() && "TODO: implement literal types");
- Result += STyp->getName();
- } else if (FunctionType* FT = dyn_cast<FunctionType>(Ty)) {
+ } else if (StructType *STyp = dyn_cast<StructType>(Ty)) {
+ if (!STyp->isLiteral()) {
+ Result += "s_";
+ Result += STyp->getName();
+ } else {
+ Result += "sl_";
+ for (auto Elem : STyp->elements())
+ Result += getMangledTypeStr(Elem);
+ }
+ // Ensure nested structs are distinguishable.
+ Result += "s";
+ } else if (FunctionType *FT = dyn_cast<FunctionType>(Ty)) {
Result += "f_" + getMangledTypeStr(FT->getReturnType());
for (size_t i = 0; i < FT->getNumParams(); i++)
Result += getMangledTypeStr(FT->getParamType(i));
@@ -1279,9 +1243,10 @@ void Function::setValueSubclassDataBit(unsigned Bit, bool On) {
setValueSubclassData(getSubclassDataFromValue() & ~(1 << Bit));
}
-void Function::setEntryCount(uint64_t Count) {
+void Function::setEntryCount(uint64_t Count,
+ const DenseSet<GlobalValue::GUID> *S) {
MDBuilder MDB(getContext());
- setMetadata(LLVMContext::MD_prof, MDB.createFunctionEntryCount(Count));
+ setMetadata(LLVMContext::MD_prof, MDB.createFunctionEntryCount(Count, S));
}
Optional<uint64_t> Function::getEntryCount() const {
@@ -1298,6 +1263,18 @@ Optional<uint64_t> Function::getEntryCount() const {
return None;
}
+DenseSet<GlobalValue::GUID> Function::getImportGUIDs() const {
+ DenseSet<GlobalValue::GUID> R;
+ if (MDNode *MD = getMetadata(LLVMContext::MD_prof))
+ if (MDString *MDS = dyn_cast<MDString>(MD->getOperand(0)))
+ if (MDS->getString().equals("function_entry_count"))
+ for (unsigned i = 2; i < MD->getNumOperands(); i++)
+ R.insert(mdconst::extract<ConstantInt>(MD->getOperand(i))
+ ->getValue()
+ .getZExtValue());
+ return R;
+}
+
void Function::setSectionPrefix(StringRef Prefix) {
MDBuilder MDB(getContext());
setMetadata(LLVMContext::MD_section_prefix,
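setEntryCount and getImportGUIDs above extend the function_entry_count metadata with an optional set of GUIDs (operands 2 and up). A round-trip sketch, assuming a Function *F; the callee name is made up:

    DenseSet<GlobalValue::GUID> Imports;
    Imports.insert(GlobalValue::getGUID("hot_callee")); // hypothetical callee
    F->setEntryCount(1000, &Imports);
    DenseSet<GlobalValue::GUID> RoundTrip = F->getImportGUIDs();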
diff --git a/lib/IR/GCOV.cpp b/lib/IR/GCOV.cpp
index 3bbcf781e5dd..ba92a91cc917 100644
--- a/lib/IR/GCOV.cpp
+++ b/lib/IR/GCOV.cpp
@@ -103,11 +103,17 @@ bool GCOVFile::readGCDA(GCOVBuffer &Buffer) {
return true;
}
+void GCOVFile::print(raw_ostream &OS) const {
+ for (const auto &FPtr : Functions)
+ FPtr->print(OS);
+}
+
+#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
/// dump - Dump GCOVFile content to dbgs() for debugging purposes.
LLVM_DUMP_METHOD void GCOVFile::dump() const {
- for (const auto &FPtr : Functions)
- FPtr->dump();
+ print(dbgs());
}
+#endif
/// collectLineCounts - Collect line counts. This must be used after
/// reading .gcno and .gcda files.
@@ -343,13 +349,19 @@ uint64_t GCOVFunction::getExitCount() const {
return Blocks.back()->getCount();
}
+void GCOVFunction::print(raw_ostream &OS) const {
+ OS << "===== " << Name << " (" << Ident << ") @ " << Filename << ":"
+ << LineNumber << "\n";
+ for (const auto &Block : Blocks)
+ Block->print(OS);
+}
+
+#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
/// dump - Dump GCOVFunction content to dbgs() for debugging purposes.
LLVM_DUMP_METHOD void GCOVFunction::dump() const {
- dbgs() << "===== " << Name << " (" << Ident << ") @ " << Filename << ":"
- << LineNumber << "\n";
- for (const auto &Block : Blocks)
- Block->dump();
+ print(dbgs());
}
+#endif
/// collectLineCounts - Collect line counts. This must be used after
/// reading .gcno and .gcda files.
@@ -400,29 +412,35 @@ void GCOVBlock::collectLineCounts(FileInfo &FI) {
FI.addBlockLine(Parent.getFilename(), N, this);
}
-/// dump - Dump GCOVBlock content to dbgs() for debugging purposes.
-LLVM_DUMP_METHOD void GCOVBlock::dump() const {
- dbgs() << "Block : " << Number << " Counter : " << Counter << "\n";
+void GCOVBlock::print(raw_ostream &OS) const {
+ OS << "Block : " << Number << " Counter : " << Counter << "\n";
if (!SrcEdges.empty()) {
- dbgs() << "\tSource Edges : ";
+ OS << "\tSource Edges : ";
for (const GCOVEdge *Edge : SrcEdges)
- dbgs() << Edge->Src.Number << " (" << Edge->Count << "), ";
- dbgs() << "\n";
+ OS << Edge->Src.Number << " (" << Edge->Count << "), ";
+ OS << "\n";
}
if (!DstEdges.empty()) {
- dbgs() << "\tDestination Edges : ";
+ OS << "\tDestination Edges : ";
for (const GCOVEdge *Edge : DstEdges)
- dbgs() << Edge->Dst.Number << " (" << Edge->Count << "), ";
- dbgs() << "\n";
+ OS << Edge->Dst.Number << " (" << Edge->Count << "), ";
+ OS << "\n";
}
if (!Lines.empty()) {
- dbgs() << "\tLines : ";
+ OS << "\tLines : ";
for (uint32_t N : Lines)
- dbgs() << (N) << ",";
- dbgs() << "\n";
+ OS << (N) << ",";
+ OS << "\n";
}
}
+#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
+/// dump - Dump GCOVBlock content to dbgs() for debugging purposes.
+LLVM_DUMP_METHOD void GCOVBlock::dump() const {
+ print(dbgs());
+}
+#endif
+
//===----------------------------------------------------------------------===//
// FileInfo implementation.
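The GCOV changes repeat an idiom used throughout this patch: textual output moves into print(raw_ostream &), and dump() becomes a thin wrapper that is only compiled when it can actually be invoked. The general shape, for a hypothetical class Thing:

    void Thing::print(raw_ostream &OS) const {
      OS << "Thing(" << Name << ")\n";
    }

    #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
    LLVM_DUMP_METHOD void Thing::dump() const { print(dbgs()); }
    #endif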
diff --git a/lib/IR/Globals.cpp b/lib/IR/Globals.cpp
index 6f7356524d38..5f338f58d940 100644
--- a/lib/IR/Globals.cpp
+++ b/lib/IR/Globals.cpp
@@ -93,18 +93,6 @@ void GlobalObject::setAlignment(unsigned Align) {
assert(getAlignment() == Align && "Alignment representation error!");
}
-unsigned GlobalObject::getGlobalObjectSubClassData() const {
- unsigned ValueData = getGlobalValueSubClassData();
- return ValueData >> GlobalObjectBits;
-}
-
-void GlobalObject::setGlobalObjectSubClassData(unsigned Val) {
- unsigned OldData = getGlobalValueSubClassData();
- setGlobalValueSubClassData((OldData & GlobalObjectMask) |
- (Val << GlobalObjectBits));
- assert(getGlobalObjectSubClassData() == Val && "representation error");
-}
-
void GlobalObject::copyAttributesFrom(const GlobalValue *Src) {
GlobalValue::copyAttributesFrom(Src);
if (const auto *GV = dyn_cast<GlobalObject>(Src)) {
@@ -152,7 +140,7 @@ StringRef GlobalValue::getSection() const {
return cast<GlobalObject>(this)->getSection();
}
-Comdat *GlobalValue::getComdat() {
+const Comdat *GlobalValue::getComdat() const {
if (auto *GA = dyn_cast<GlobalAlias>(this)) {
// In general we cannot compute this at the IR level, but we try.
if (const GlobalObject *GO = GA->getBaseObject())
@@ -177,7 +165,9 @@ void GlobalObject::setSection(StringRef S) {
// Get or create a stable section name string and put it in the table in the
// context.
- S = getContext().pImpl->SectionStrings.insert(S).first->first();
+ if (!S.empty()) {
+ S = getContext().pImpl->SectionStrings.insert(S).first->first();
+ }
getContext().pImpl->GlobalObjectSections[this] = S;
// Update the HasSectionHashEntryBit. Setting the section to the empty string
@@ -240,7 +230,7 @@ bool GlobalValue::canIncreaseAlignment() const {
return true;
}
-GlobalObject *GlobalValue::getBaseObject() {
+const GlobalObject *GlobalValue::getBaseObject() const {
if (auto *GO = dyn_cast<GlobalObject>(this))
return GO;
if (auto *GA = dyn_cast<GlobalAlias>(this))
diff --git a/lib/IR/IRBuilder.cpp b/lib/IR/IRBuilder.cpp
index d3e410d6d033..fd5ae71a2f3c 100644
--- a/lib/IR/IRBuilder.cpp
+++ b/lib/IR/IRBuilder.cpp
@@ -172,7 +172,8 @@ CallInst *IRBuilderBase::CreateLifetimeStart(Value *Ptr, ConstantInt *Size) {
"lifetime.start requires the size to be an i64");
Value *Ops[] = { Size, Ptr };
Module *M = BB->getParent()->getParent();
- Value *TheFn = Intrinsic::getDeclaration(M, Intrinsic::lifetime_start);
+ Value *TheFn = Intrinsic::getDeclaration(M, Intrinsic::lifetime_start,
+ { Ptr->getType() });
return createCallHelper(TheFn, Ops, this);
}
@@ -187,7 +188,8 @@ CallInst *IRBuilderBase::CreateLifetimeEnd(Value *Ptr, ConstantInt *Size) {
"lifetime.end requires the size to be an i64");
Value *Ops[] = { Size, Ptr };
Module *M = BB->getParent()->getParent();
- Value *TheFn = Intrinsic::getDeclaration(M, Intrinsic::lifetime_end);
+ Value *TheFn = Intrinsic::getDeclaration(M, Intrinsic::lifetime_end,
+ { Ptr->getType() });
return createCallHelper(TheFn, Ops, this);
}
@@ -482,3 +484,11 @@ CallInst *IRBuilderBase::CreateGCRelocate(Instruction *Statepoint,
getInt32(DerivedOffset)};
return createCallHelper(FnGCRelocate, Args, this, Name);
}
+
+CallInst *IRBuilderBase::CreateBinaryIntrinsic(Intrinsic::ID ID,
+ Value *LHS, Value *RHS,
+ const Twine &Name) {
+ Module *M = BB->getParent()->getParent();
+ Function *Fn = Intrinsic::getDeclaration(M, ID, { LHS->getType() });
+ return createCallHelper(Fn, { LHS, RHS }, this, Name);
+}
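CreateBinaryIntrinsic above resolves the intrinsic declaration from the type of its left operand, so overloaded two-operand intrinsics shrink to a single call. A sketch, assuming an IRBuilder<> Builder and two float Values A and B:

    // Emits a call to llvm.maxnum.f32, mangled on A's type.
    Value *Max = Builder.CreateBinaryIntrinsic(Intrinsic::maxnum, A, B, "max");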
diff --git a/lib/IR/IRPrintingPasses.cpp b/lib/IR/IRPrintingPasses.cpp
index 05e206cfd6cb..955fdc749b2b 100644
--- a/lib/IR/IRPrintingPasses.cpp
+++ b/lib/IR/IRPrintingPasses.cpp
@@ -70,6 +70,8 @@ public:
void getAnalysisUsage(AnalysisUsage &AU) const override {
AU.setPreservesAll();
}
+
+ StringRef getPassName() const override { return "Print Module IR"; }
};
class PrintFunctionPassWrapper : public FunctionPass {
@@ -91,6 +93,8 @@ public:
void getAnalysisUsage(AnalysisUsage &AU) const override {
AU.setPreservesAll();
}
+
+ StringRef getPassName() const override { return "Print Function IR"; }
};
class PrintBasicBlockPass : public BasicBlockPass {
@@ -111,6 +115,8 @@ public:
void getAnalysisUsage(AnalysisUsage &AU) const override {
AU.setPreservesAll();
}
+
+ StringRef getPassName() const override { return "Print BasicBlock IR"; }
};
}
diff --git a/lib/IR/InlineAsm.cpp b/lib/IR/InlineAsm.cpp
index 5a9118571040..8feeeb65d445 100644
--- a/lib/IR/InlineAsm.cpp
+++ b/lib/IR/InlineAsm.cpp
@@ -1,4 +1,4 @@
-//===-- InlineAsm.cpp - Implement the InlineAsm class ---------------------===//
+//===- InlineAsm.cpp - Implement the InlineAsm class ----------------------===//
//
// The LLVM Compiler Infrastructure
//
@@ -11,27 +11,22 @@
//
//===----------------------------------------------------------------------===//
-#include "llvm/IR/InlineAsm.h"
#include "ConstantsContext.h"
#include "LLVMContextImpl.h"
+#include "llvm/ADT/StringRef.h"
#include "llvm/IR/DerivedTypes.h"
+#include "llvm/IR/InlineAsm.h"
+#include "llvm/IR/LLVMContext.h"
+#include "llvm/IR/Value.h"
+#include "llvm/Support/Casting.h"
+#include "llvm/Support/Compiler.h"
#include <algorithm>
+#include <cassert>
#include <cctype>
-using namespace llvm;
-
-// Implement the first virtual method in this class in this file so the
-// InlineAsm vtable is emitted here.
-InlineAsm::~InlineAsm() {
-}
+#include <cstddef>
+#include <cstdlib>
-InlineAsm *InlineAsm::get(FunctionType *FTy, StringRef AsmString,
- StringRef Constraints, bool hasSideEffects,
- bool isAlignStack, AsmDialect asmDialect) {
- InlineAsmKeyType Key(AsmString, Constraints, FTy, hasSideEffects,
- isAlignStack, asmDialect);
- LLVMContextImpl *pImpl = FTy->getContext().pImpl;
- return pImpl->InlineAsms.getOrCreate(PointerType::getUnqual(FTy), Key);
-}
+using namespace llvm;
InlineAsm::InlineAsm(FunctionType *FTy, const std::string &asmString,
const std::string &constraints, bool hasSideEffects,
@@ -40,12 +35,24 @@ InlineAsm::InlineAsm(FunctionType *FTy, const std::string &asmString,
AsmString(asmString), Constraints(constraints), FTy(FTy),
HasSideEffects(hasSideEffects), IsAlignStack(isAlignStack),
Dialect(asmDialect) {
-
// Do various checks on the constraint string and type.
assert(Verify(getFunctionType(), constraints) &&
"Function type not legal for constraints!");
}
+// Implement the first virtual method in this class in this file so the
+// InlineAsm vtable is emitted here.
+InlineAsm::~InlineAsm() = default;
+
+InlineAsm *InlineAsm::get(FunctionType *FTy, StringRef AsmString,
+ StringRef Constraints, bool hasSideEffects,
+ bool isAlignStack, AsmDialect asmDialect) {
+ InlineAsmKeyType Key(AsmString, Constraints, FTy, hasSideEffects,
+ isAlignStack, asmDialect);
+ LLVMContextImpl *pImpl = FTy->getContext().pImpl;
+ return pImpl->InlineAsms.getOrCreate(PointerType::getUnqual(FTy), Key);
+}
+
void InlineAsm::destroyConstant() {
getType()->getContext().pImpl->InlineAsms.remove(this);
delete this;
@@ -55,14 +62,6 @@ FunctionType *InlineAsm::getFunctionType() const {
return FTy;
}
-///Default constructor.
-InlineAsm::ConstraintInfo::ConstraintInfo() :
- Type(isInput), isEarlyClobber(false),
- MatchingInput(-1), isCommutative(false),
- isIndirect(false), isMultipleAlternative(false),
- currentAlternativeIndex(0) {
-}
-
/// Parse - Analyze the specified string (e.g. "==&{eax}") and fill in the
/// fields in this structure. If the constraint string is not understood,
/// return true, otherwise return false.
diff --git a/lib/IR/Instruction.cpp b/lib/IR/Instruction.cpp
index 2fa03489081d..c26699eab4e2 100644
--- a/lib/IR/Instruction.cpp
+++ b/lib/IR/Instruction.cpp
@@ -17,6 +17,7 @@
#include "llvm/IR/Constants.h"
#include "llvm/IR/Instructions.h"
#include "llvm/IR/Module.h"
+#include "llvm/IR/MDBuilder.h"
#include "llvm/IR/Operator.h"
#include "llvm/IR/Type.h"
using namespace llvm;
@@ -59,12 +60,6 @@ const Module *Instruction::getModule() const {
return getParent()->getModule();
}
-Module *Instruction::getModule() {
- return getParent()->getModule();
-}
-
-Function *Instruction::getFunction() { return getParent()->getParent(); }
-
const Function *Instruction::getFunction() const {
return getParent()->getParent();
}
@@ -122,6 +117,29 @@ bool Instruction::hasNoSignedWrap() const {
return cast<OverflowingBinaryOperator>(this)->hasNoSignedWrap();
}
+void Instruction::dropPoisonGeneratingFlags() {
+ switch (getOpcode()) {
+ case Instruction::Add:
+ case Instruction::Sub:
+ case Instruction::Mul:
+ case Instruction::Shl:
+ cast<OverflowingBinaryOperator>(this)->setHasNoUnsignedWrap(false);
+ cast<OverflowingBinaryOperator>(this)->setHasNoSignedWrap(false);
+ break;
+
+ case Instruction::UDiv:
+ case Instruction::SDiv:
+ case Instruction::AShr:
+ case Instruction::LShr:
+ cast<PossiblyExactOperator>(this)->setIsExact(false);
+ break;
+
+ case Instruction::GetElementPtr:
+ cast<GetElementPtrInst>(this)->setIsInBounds(false);
+ break;
+ }
+}
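dropPoisonGeneratingFlags strips nuw/nsw, exact, and inbounds, the flags whose violation yields poison, so an instruction can be moved to a point where those guarantees may no longer hold. A sketch of the intended use (I and InsertPt are illustrative names for the instruction being hoisted and its destination):

  // Hoisting I above a guarding branch: drop flags first so a value that
  // was only conditionally poison does not become unconditionally poison.
  I->dropPoisonGeneratingFlags();
  I->moveBefore(InsertPt);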
+
bool Instruction::isExact() const {
return cast<PossiblyExactOperator>(this)->isExact();
}
@@ -186,6 +204,11 @@ bool Instruction::hasAllowReciprocal() const {
return cast<FPMathOperator>(this)->hasAllowReciprocal();
}
+bool Instruction::hasAllowContract() const {
+ assert(isa<FPMathOperator>(this) && "getting fast-math flag on invalid op");
+ return cast<FPMathOperator>(this)->hasAllowContract();
+}
+
FastMathFlags Instruction::getFastMathFlags() const {
assert(isa<FPMathOperator>(this) && "getting fast-math flag on invalid op");
return cast<FPMathOperator>(this)->getFastMathFlags();
@@ -521,17 +544,6 @@ bool Instruction::mayThrow() const {
return isa<ResumeInst>(this);
}
-/// Return true if the instruction is associative:
-///
-/// Associative operators satisfy: x op (y op z) === (x op y) op z
-///
-/// In LLVM, the Add, Mul, And, Or, and Xor operators are associative.
-///
-bool Instruction::isAssociative(unsigned Opcode) {
- return Opcode == And || Opcode == Or || Opcode == Xor ||
- Opcode == Add || Opcode == Mul;
-}
-
bool Instruction::isAssociative() const {
unsigned Opcode = getOpcode();
if (isAssociative(Opcode))
@@ -546,51 +558,6 @@ bool Instruction::isAssociative() const {
}
}
-/// Return true if the instruction is commutative:
-///
-/// Commutative operators satisfy: (x op y) === (y op x)
-///
-/// In LLVM, these are the associative operators, plus SetEQ and SetNE, when
-/// applied to any type.
-///
-bool Instruction::isCommutative(unsigned op) {
- switch (op) {
- case Add:
- case FAdd:
- case Mul:
- case FMul:
- case And:
- case Or:
- case Xor:
- return true;
- default:
- return false;
- }
-}
-
-/// Return true if the instruction is idempotent:
-///
-/// Idempotent operators satisfy: x op x === x
-///
-/// In LLVM, the And and Or operators are idempotent.
-///
-bool Instruction::isIdempotent(unsigned Opcode) {
- return Opcode == And || Opcode == Or;
-}
-
-/// Return true if the instruction is nilpotent:
-///
-/// Nilpotent operators satisfy: x op x === Id,
-///
-/// where Id is the identity for the operator, i.e. a constant such that
-/// x op Id === x and Id op x === x for all x.
-///
-/// In LLVM, the Xor operator is nilpotent.
-///
-bool Instruction::isNilpotent(unsigned Opcode) {
- return Opcode == Xor;
-}
-
Instruction *Instruction::cloneImpl() const {
llvm_unreachable("Subclass of Instruction failed to implement cloneImpl");
}
@@ -651,3 +618,34 @@ Instruction *Instruction::clone() const {
New->copyMetadata(*this);
return New;
}
+
+void Instruction::updateProfWeight(uint64_t S, uint64_t T) {
+ auto *ProfileData = getMetadata(LLVMContext::MD_prof);
+ if (ProfileData == nullptr)
+ return;
+
+ auto *ProfDataName = dyn_cast<MDString>(ProfileData->getOperand(0));
+ if (!ProfDataName || !ProfDataName->getString().equals("branch_weights"))
+ return;
+
+ SmallVector<uint32_t, 4> Weights;
+ for (unsigned i = 1; i < ProfileData->getNumOperands(); i++) {
+ // Using APInt::div may be expensive, but most cases should fit in 64 bits.
+ APInt Val(128, mdconst::dyn_extract<ConstantInt>(ProfileData->getOperand(i))
+ ->getValue()
+ .getZExtValue());
+ Val *= APInt(128, S);
+ Weights.push_back(Val.udiv(APInt(128, T)).getLimitedValue());
+ }
+ MDBuilder MDB(getContext());
+ setMetadata(LLVMContext::MD_prof, MDB.createBranchWeights(Weights));
+}
+
+void Instruction::setProfWeight(uint64_t W) {
+ assert((isa<CallInst>(this) || isa<InvokeInst>(this)) &&
+ "Can only set weights for call and invoke instrucitons");
+ SmallVector<uint32_t, 1> Weights;
+ Weights.push_back(W);
+ MDBuilder MDB(getContext());
+ setMetadata(LLVMContext::MD_prof, MDB.createBranchWeights(Weights));
+}
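updateProfWeight rescales each branch weight by S/T, widening to 128 bits so the multiply cannot overflow. As a worked example: an existing weight of 3000000000 scaled with S = 1 and T = 3 becomes 3000000000 * 1 / 3 = 1000000000, whereas with 64-bit intermediates a large S could wrap the product before the division.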
diff --git a/lib/IR/Instructions.cpp b/lib/IR/Instructions.cpp
index b67926943429..c10c144122e2 100644
--- a/lib/IR/Instructions.cpp
+++ b/lib/IR/Instructions.cpp
@@ -307,7 +307,7 @@ CallInst::CallInst(const CallInst &CI)
: Instruction(CI.getType(), Instruction::Call,
OperandTraits<CallInst>::op_end(this) - CI.getNumOperands(),
CI.getNumOperands()),
- AttributeList(CI.AttributeList), FTy(CI.FTy) {
+ Attrs(CI.Attrs), FTy(CI.FTy) {
setTailCallKind(CI.getTailCallKind());
setCallingConv(CI.getCallingConv());
@@ -334,7 +334,7 @@ CallInst *CallInst::Create(CallInst *CI, ArrayRef<OperandBundleDef> OpB,
Value *CallInst::getReturnedArgOperand() const {
unsigned Index;
- if (AttributeList.hasAttrSomewhere(Attribute::Returned, &Index) && Index)
+ if (Attrs.hasAttrSomewhere(Attribute::Returned, &Index) && Index)
return getArgOperand(Index-1);
if (const Function *F = getCalledFunction())
if (F->getAttributes().hasAttrSomewhere(Attribute::Returned, &Index) &&
@@ -345,48 +345,58 @@ Value *CallInst::getReturnedArgOperand() const {
}
void CallInst::addAttribute(unsigned i, Attribute::AttrKind Kind) {
- AttributeSet PAL = getAttributes();
+ AttributeList PAL = getAttributes();
PAL = PAL.addAttribute(getContext(), i, Kind);
setAttributes(PAL);
}
void CallInst::addAttribute(unsigned i, Attribute Attr) {
- AttributeSet PAL = getAttributes();
+ AttributeList PAL = getAttributes();
PAL = PAL.addAttribute(getContext(), i, Attr);
setAttributes(PAL);
}
void CallInst::removeAttribute(unsigned i, Attribute::AttrKind Kind) {
- AttributeSet PAL = getAttributes();
+ AttributeList PAL = getAttributes();
PAL = PAL.removeAttribute(getContext(), i, Kind);
setAttributes(PAL);
}
void CallInst::removeAttribute(unsigned i, StringRef Kind) {
- AttributeSet PAL = getAttributes();
+ AttributeList PAL = getAttributes();
PAL = PAL.removeAttribute(getContext(), i, Kind);
setAttributes(PAL);
}
void CallInst::addDereferenceableAttr(unsigned i, uint64_t Bytes) {
- AttributeSet PAL = getAttributes();
+ AttributeList PAL = getAttributes();
PAL = PAL.addDereferenceableAttr(getContext(), i, Bytes);
setAttributes(PAL);
}
void CallInst::addDereferenceableOrNullAttr(unsigned i, uint64_t Bytes) {
- AttributeSet PAL = getAttributes();
+ AttributeList PAL = getAttributes();
PAL = PAL.addDereferenceableOrNullAttr(getContext(), i, Bytes);
setAttributes(PAL);
}
+bool CallInst::hasRetAttr(Attribute::AttrKind Kind) const {
+ if (Attrs.hasAttribute(AttributeList::ReturnIndex, Kind))
+ return true;
+
+ // Look at the callee, if available.
+ if (const Function *F = getCalledFunction())
+ return F->getAttributes().hasAttribute(AttributeList::ReturnIndex, Kind);
+ return false;
+}
+
bool CallInst::paramHasAttr(unsigned i, Attribute::AttrKind Kind) const {
- assert(i < (getNumArgOperands() + 1) && "Param index out of bounds!");
+ assert(i < getNumArgOperands() && "Param index out of bounds!");
- if (AttributeList.hasAttribute(i, Kind))
+ if (Attrs.hasParamAttribute(i, Kind))
return true;
if (const Function *F = getCalledFunction())
- return F->getAttributes().hasAttribute(i, Kind);
+ return F->getAttributes().hasParamAttribute(i, Kind);
return false;
}
@@ -400,8 +410,10 @@ bool CallInst::dataOperandHasImpliedAttr(unsigned i,
// question is a call argument; or be indirectly implied by the kind of its
// containing operand bundle, if the operand is a bundle operand.
+ // FIXME: Avoid these i - 1 calculations and update the API to use zero-based
+ // indices.
if (i < (getNumArgOperands() + 1))
- return paramHasAttr(i, Kind);
+ return paramHasAttr(i - 1, Kind);
assert(hasOperandBundles() && i >= (getBundleOperandsStartIndex() + 1) &&
"Must be either a call argument or an operand bundle!");
@@ -466,7 +478,7 @@ static Instruction *createMalloc(Instruction *InsertBefore,
Value *MallocFunc = MallocF;
if (!MallocFunc)
// prototype malloc as "void *malloc(size_t)"
- MallocFunc = M->getOrInsertFunction("malloc", BPTy, IntPtrTy, nullptr);
+ MallocFunc = M->getOrInsertFunction("malloc", BPTy, IntPtrTy);
PointerType *AllocPtrType = PointerType::getUnqual(AllocTy);
CallInst *MCall = nullptr;
Instruction *Result = nullptr;
@@ -560,7 +572,7 @@ static Instruction *createFree(Value *Source,
Type *VoidTy = Type::getVoidTy(M->getContext());
Type *IntPtrTy = Type::getInt8PtrTy(M->getContext());
// prototype free as "void free(void*)"
- Value *FreeFunc = M->getOrInsertFunction("free", VoidTy, IntPtrTy, nullptr);
+ Value *FreeFunc = M->getOrInsertFunction("free", VoidTy, IntPtrTy);
CallInst *Result = nullptr;
Value *PtrCast = Source;
if (InsertBefore) {
@@ -646,7 +658,7 @@ InvokeInst::InvokeInst(const InvokeInst &II)
OperandTraits<InvokeInst>::op_end(this) -
II.getNumOperands(),
II.getNumOperands()),
- AttributeList(II.AttributeList), FTy(II.FTy) {
+ Attrs(II.Attrs), FTy(II.FTy) {
setCallingConv(II.getCallingConv());
std::copy(II.op_begin(), II.op_end(), op_begin());
std::copy(II.bundle_op_info_begin(), II.bundle_op_info_end(),
@@ -681,7 +693,7 @@ void InvokeInst::setSuccessorV(unsigned idx, BasicBlock *B) {
Value *InvokeInst::getReturnedArgOperand() const {
unsigned Index;
- if (AttributeList.hasAttrSomewhere(Attribute::Returned, &Index) && Index)
+ if (Attrs.hasAttrSomewhere(Attribute::Returned, &Index) && Index)
return getArgOperand(Index-1);
if (const Function *F = getCalledFunction())
if (F->getAttributes().hasAttrSomewhere(Attribute::Returned, &Index) &&
@@ -691,13 +703,23 @@ Value *InvokeInst::getReturnedArgOperand() const {
return nullptr;
}
+bool InvokeInst::hasRetAttr(Attribute::AttrKind Kind) const {
+ if (Attrs.hasAttribute(AttributeList::ReturnIndex, Kind))
+ return true;
+
+ // Look at the callee, if available.
+ if (const Function *F = getCalledFunction())
+ return F->getAttributes().hasAttribute(AttributeList::ReturnIndex, Kind);
+ return false;
+}
+
bool InvokeInst::paramHasAttr(unsigned i, Attribute::AttrKind Kind) const {
- assert(i < (getNumArgOperands() + 1) && "Param index out of bounds!");
+ assert(i < getNumArgOperands() && "Param index out of bounds!");
- if (AttributeList.hasAttribute(i, Kind))
+ if (Attrs.hasParamAttribute(i, Kind))
return true;
if (const Function *F = getCalledFunction())
- return F->getAttributes().hasAttribute(i, Kind);
+ return F->getAttributes().hasParamAttribute(i, Kind);
return false;
}
@@ -711,8 +733,10 @@ bool InvokeInst::dataOperandHasImpliedAttr(unsigned i,
// question is an invoke argument; or be indirectly implied by the kind of its
// containing operand bundle, if the operand is a bundle operand.
+ // FIXME: Avoid these i - 1 calculations and update the API to use zero-based
+ // indices.
if (i < (getNumArgOperands() + 1))
- return paramHasAttr(i, Kind);
+ return paramHasAttr(i - 1, Kind);
assert(hasOperandBundles() && i >= (getBundleOperandsStartIndex() + 1) &&
"Must be either an invoke argument or an operand bundle!");
@@ -720,37 +744,37 @@ bool InvokeInst::dataOperandHasImpliedAttr(unsigned i,
}
void InvokeInst::addAttribute(unsigned i, Attribute::AttrKind Kind) {
- AttributeSet PAL = getAttributes();
+ AttributeList PAL = getAttributes();
PAL = PAL.addAttribute(getContext(), i, Kind);
setAttributes(PAL);
}
void InvokeInst::addAttribute(unsigned i, Attribute Attr) {
- AttributeSet PAL = getAttributes();
+ AttributeList PAL = getAttributes();
PAL = PAL.addAttribute(getContext(), i, Attr);
setAttributes(PAL);
}
void InvokeInst::removeAttribute(unsigned i, Attribute::AttrKind Kind) {
- AttributeSet PAL = getAttributes();
+ AttributeList PAL = getAttributes();
PAL = PAL.removeAttribute(getContext(), i, Kind);
setAttributes(PAL);
}
void InvokeInst::removeAttribute(unsigned i, StringRef Kind) {
- AttributeSet PAL = getAttributes();
+ AttributeList PAL = getAttributes();
PAL = PAL.removeAttribute(getContext(), i, Kind);
setAttributes(PAL);
}
void InvokeInst::addDereferenceableAttr(unsigned i, uint64_t Bytes) {
- AttributeSet PAL = getAttributes();
+ AttributeList PAL = getAttributes();
PAL = PAL.addDereferenceableAttr(getContext(), i, Bytes);
setAttributes(PAL);
}
void InvokeInst::addDereferenceableOrNullAttr(unsigned i, uint64_t Bytes) {
- AttributeSet PAL = getAttributes();
+ AttributeList PAL = getAttributes();
PAL = PAL.addDereferenceableOrNullAttr(getContext(), i, Bytes);
setAttributes(PAL);
}
@@ -1199,34 +1223,38 @@ static Value *getAISize(LLVMContext &Context, Value *Amt) {
return Amt;
}
-AllocaInst::AllocaInst(Type *Ty, const Twine &Name, Instruction *InsertBefore)
- : AllocaInst(Ty, /*ArraySize=*/nullptr, Name, InsertBefore) {}
-
-AllocaInst::AllocaInst(Type *Ty, const Twine &Name, BasicBlock *InsertAtEnd)
- : AllocaInst(Ty, /*ArraySize=*/nullptr, Name, InsertAtEnd) {}
-
-AllocaInst::AllocaInst(Type *Ty, Value *ArraySize, const Twine &Name,
+AllocaInst::AllocaInst(Type *Ty, unsigned AddrSpace, const Twine &Name,
Instruction *InsertBefore)
- : AllocaInst(Ty, ArraySize, /*Align=*/0, Name, InsertBefore) {}
+ : AllocaInst(Ty, AddrSpace, /*ArraySize=*/nullptr, Name, InsertBefore) {}
-AllocaInst::AllocaInst(Type *Ty, Value *ArraySize, const Twine &Name,
+AllocaInst::AllocaInst(Type *Ty, unsigned AddrSpace, const Twine &Name,
BasicBlock *InsertAtEnd)
- : AllocaInst(Ty, ArraySize, /*Align=*/0, Name, InsertAtEnd) {}
+ : AllocaInst(Ty, AddrSpace, /*ArraySize=*/nullptr, Name, InsertAtEnd) {}
-AllocaInst::AllocaInst(Type *Ty, Value *ArraySize, unsigned Align,
+AllocaInst::AllocaInst(Type *Ty, unsigned AddrSpace, Value *ArraySize,
const Twine &Name, Instruction *InsertBefore)
- : UnaryInstruction(PointerType::getUnqual(Ty), Alloca,
- getAISize(Ty->getContext(), ArraySize), InsertBefore),
- AllocatedType(Ty) {
+ : AllocaInst(Ty, AddrSpace, ArraySize, /*Align=*/0, Name, InsertBefore) {}
+
+AllocaInst::AllocaInst(Type *Ty, unsigned AddrSpace, Value *ArraySize,
+ const Twine &Name, BasicBlock *InsertAtEnd)
+ : AllocaInst(Ty, AddrSpace, ArraySize, /*Align=*/0, Name, InsertAtEnd) {}
+
+AllocaInst::AllocaInst(Type *Ty, unsigned AddrSpace, Value *ArraySize,
+ unsigned Align, const Twine &Name,
+ Instruction *InsertBefore)
+ : UnaryInstruction(PointerType::get(Ty, AddrSpace), Alloca,
+ getAISize(Ty->getContext(), ArraySize), InsertBefore),
+ AllocatedType(Ty) {
setAlignment(Align);
assert(!Ty->isVoidTy() && "Cannot allocate void!");
setName(Name);
}
-AllocaInst::AllocaInst(Type *Ty, Value *ArraySize, unsigned Align,
- const Twine &Name, BasicBlock *InsertAtEnd)
- : UnaryInstruction(PointerType::getUnqual(Ty), Alloca,
- getAISize(Ty->getContext(), ArraySize), InsertAtEnd),
+AllocaInst::AllocaInst(Type *Ty, unsigned AddrSpace, Value *ArraySize,
+ unsigned Align, const Twine &Name,
+ BasicBlock *InsertAtEnd)
+ : UnaryInstruction(PointerType::get(Ty, AddrSpace), Alloca,
+ getAISize(Ty->getContext(), ArraySize), InsertAtEnd),
AllocatedType(Ty) {
setAlignment(Align);
assert(!Ty->isVoidTy() && "Cannot allocate void!");
@@ -3655,16 +3683,16 @@ void SwitchInst::addCase(ConstantInt *OnVal, BasicBlock *Dest) {
// Initialize some new operands.
assert(OpNo+1 < ReservedSpace && "Growing didn't work!");
setNumHungOffUseOperands(OpNo+2);
- CaseIt Case(this, NewCaseIdx);
+ CaseHandle Case(this, NewCaseIdx);
Case.setValue(OnVal);
Case.setSuccessor(Dest);
}
/// removeCase - This method removes the specified case and its successor
/// from the switch instruction.
-void SwitchInst::removeCase(CaseIt i) {
- unsigned idx = i.getCaseIndex();
-
+SwitchInst::CaseIt SwitchInst::removeCase(CaseIt I) {
+ unsigned idx = I->getCaseIndex();
+
assert(2 + idx*2 < getNumOperands() && "Case index out of range!!!");
unsigned NumOps = getNumOperands();
@@ -3680,6 +3708,8 @@ void SwitchInst::removeCase(CaseIt i) {
OL[NumOps-2].set(nullptr);
OL[NumOps-2+1].set(nullptr);
setNumHungOffUseOperands(NumOps-2);
+
+ return CaseIt(this, idx);
}
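removeCase now returns the iterator following the erased case, matching the usual container-erase idiom. An erase-while-iterating sketch (SI and DeadBB are illustrative: a SwitchInst and a successor block being deleted):

  for (auto CI = SI->case_begin(); CI != SI->case_end();) {
    if (CI->getCaseSuccessor() == DeadBB)
      CI = SI->removeCase(CI); // advances past the removed case
    else
      ++CI;
  }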
/// growOperands - grow operands - This grows the operand list in response
@@ -3826,6 +3856,7 @@ InsertValueInst *InsertValueInst::cloneImpl() const {
AllocaInst *AllocaInst::cloneImpl() const {
AllocaInst *Result = new AllocaInst(getAllocatedType(),
+ getType()->getAddressSpace(),
(Value *)getOperand(0), getAlignment());
Result->setUsedWithInAlloca(isUsedWithInAlloca());
Result->setSwiftError(isSwiftError());
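Every AllocaInst constructor now takes an explicit address space, and the result type becomes a pointer in that space. A construction sketch (Ctx and InsertBefore are assumed surrounding names for the LLVMContext and insertion point):

  // An i32 alloca in address space 5; getType() yields i32 addrspace(5)*.
  AllocaInst *AI =
      new AllocaInst(Type::getInt32Ty(Ctx), /*AddrSpace=*/5,
                     /*ArraySize=*/nullptr, "tmp", InsertBefore);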
diff --git a/lib/IR/IntrinsicInst.cpp b/lib/IR/IntrinsicInst.cpp
index 240250662aec..c9814a96bea6 100644
--- a/lib/IR/IntrinsicInst.cpp
+++ b/lib/IR/IntrinsicInst.cpp
@@ -21,6 +21,7 @@
//
//===----------------------------------------------------------------------===//
+#include "llvm/ADT/StringSwitch.h"
#include "llvm/IR/IntrinsicInst.h"
#include "llvm/IR/Constants.h"
#include "llvm/IR/GlobalVariable.h"
@@ -93,3 +94,34 @@ Value *InstrProfIncrementInst::getStep() const {
LLVMContext &Context = M->getContext();
return ConstantInt::get(Type::getInt64Ty(Context), 1);
}
+
+ConstrainedFPIntrinsic::RoundingMode
+ConstrainedFPIntrinsic::getRoundingMode() const {
+ Metadata *MD = dyn_cast<MetadataAsValue>(getOperand(2))->getMetadata();
+ if (!MD || !isa<MDString>(MD))
+ return rmInvalid;
+ StringRef RoundingArg = cast<MDString>(MD)->getString();
+
+ // For dynamic rounding mode, we use round to nearest but we will set the
+ // 'exact' SDNodeFlag so that the value will not be rounded.
+ return StringSwitch<RoundingMode>(RoundingArg)
+ .Case("round.dynamic", rmDynamic)
+ .Case("round.tonearest", rmToNearest)
+ .Case("round.downward", rmDownward)
+ .Case("round.upward", rmUpward)
+ .Case("round.towardzero", rmTowardZero)
+ .Default(rmInvalid);
+}
+
+ConstrainedFPIntrinsic::ExceptionBehavior
+ConstrainedFPIntrinsic::getExceptionBehavior() const {
+ Metadata *MD = dyn_cast<MetadataAsValue>(getOperand(3))->getMetadata();
+ if (!MD || !isa<MDString>(MD))
+ return ebInvalid;
+ StringRef ExceptionArg = cast<MDString>(MD)->getString();
+ return StringSwitch<ExceptionBehavior>(ExceptionArg)
+ .Case("fpexcept.ignore", ebIgnore)
+ .Case("fpexcept.maytrap", ebMayTrap)
+ .Case("fpexcept.strict", ebStrict)
+ .Default(ebInvalid);
+}
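The new accessors decode the constrained-FP metadata arguments into enums. A consumer sketch (CI is assumed to be a CallInst already known to be one of the llvm.experimental.constrained.* intrinsics):

  auto *FPI = cast<ConstrainedFPIntrinsic>(CI);
  if (FPI->getRoundingMode() == ConstrainedFPIntrinsic::rmDynamic &&
      FPI->getExceptionBehavior() == ConstrainedFPIntrinsic::ebStrict) {
    // Rounding comes from the runtime environment and FP exceptions must
    // be treated as observable; avoid speculating this call.
  }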
diff --git a/lib/IR/LLVMContext.cpp b/lib/IR/LLVMContext.cpp
index dd66f144f04f..6c6383c22255 100644
--- a/lib/IR/LLVMContext.cpp
+++ b/lib/IR/LLVMContext.cpp
@@ -58,6 +58,7 @@ LLVMContext::LLVMContext() : pImpl(new LLVMContextImpl(*this)) {
{MD_type, "type"},
{MD_section_prefix, "section_prefix"},
{MD_absolute_symbol, "absolute_symbol"},
+ {MD_associated, "associated"},
};
for (auto &MDKind : MDKinds) {
diff --git a/lib/IR/LLVMContextImpl.cpp b/lib/IR/LLVMContextImpl.cpp
index c43356c53826..343722463e5f 100644
--- a/lib/IR/LLVMContextImpl.cpp
+++ b/lib/IR/LLVMContextImpl.cpp
@@ -114,9 +114,10 @@ LLVMContextImpl::~LLVMContextImpl() {
}
// Destroy attribute lists.
- for (FoldingSetIterator<AttributeSetImpl> I = AttrsLists.begin(),
- E = AttrsLists.end(); I != E; ) {
- FoldingSetIterator<AttributeSetImpl> Elem = I++;
+ for (FoldingSetIterator<AttributeListImpl> I = AttrsLists.begin(),
+ E = AttrsLists.end();
+ I != E;) {
+ FoldingSetIterator<AttributeListImpl> Elem = I++;
delete &*Elem;
}
diff --git a/lib/IR/LLVMContextImpl.h b/lib/IR/LLVMContextImpl.h
index 850c81cfabb2..0ee0b9c0da25 100644
--- a/lib/IR/LLVMContextImpl.h
+++ b/lib/IR/LLVMContextImpl.h
@@ -352,22 +352,26 @@ template <> struct MDNodeKeyImpl<DIDerivedType> {
uint64_t SizeInBits;
uint64_t OffsetInBits;
uint32_t AlignInBits;
+ Optional<unsigned> DWARFAddressSpace;
unsigned Flags;
Metadata *ExtraData;
MDNodeKeyImpl(unsigned Tag, MDString *Name, Metadata *File, unsigned Line,
Metadata *Scope, Metadata *BaseType, uint64_t SizeInBits,
- uint32_t AlignInBits, uint64_t OffsetInBits, unsigned Flags,
+ uint32_t AlignInBits, uint64_t OffsetInBits,
+ Optional<unsigned> DWARFAddressSpace, unsigned Flags,
Metadata *ExtraData)
: Tag(Tag), Name(Name), File(File), Line(Line), Scope(Scope),
BaseType(BaseType), SizeInBits(SizeInBits), OffsetInBits(OffsetInBits),
- AlignInBits(AlignInBits), Flags(Flags), ExtraData(ExtraData) {}
+ AlignInBits(AlignInBits), DWARFAddressSpace(DWARFAddressSpace),
+ Flags(Flags), ExtraData(ExtraData) {}
MDNodeKeyImpl(const DIDerivedType *N)
: Tag(N->getTag()), Name(N->getRawName()), File(N->getRawFile()),
Line(N->getLine()), Scope(N->getRawScope()),
BaseType(N->getRawBaseType()), SizeInBits(N->getSizeInBits()),
OffsetInBits(N->getOffsetInBits()), AlignInBits(N->getAlignInBits()),
- Flags(N->getFlags()), ExtraData(N->getRawExtraData()) {}
+ DWARFAddressSpace(N->getDWARFAddressSpace()), Flags(N->getFlags()),
+ ExtraData(N->getRawExtraData()) {}
bool isKeyOf(const DIDerivedType *RHS) const {
return Tag == RHS->getTag() && Name == RHS->getRawName() &&
@@ -375,7 +379,9 @@ template <> struct MDNodeKeyImpl<DIDerivedType> {
Scope == RHS->getRawScope() && BaseType == RHS->getRawBaseType() &&
SizeInBits == RHS->getSizeInBits() &&
AlignInBits == RHS->getAlignInBits() &&
- OffsetInBits == RHS->getOffsetInBits() && Flags == RHS->getFlags() &&
+ OffsetInBits == RHS->getOffsetInBits() &&
+ DWARFAddressSpace == RHS->getDWARFAddressSpace() &&
+ Flags == RHS->getFlags() &&
ExtraData == RHS->getRawExtraData();
}
unsigned getHashValue() const {
@@ -612,17 +618,19 @@ template <> struct MDNodeSubsetEqualImpl<DISubprogram> {
typedef MDNodeKeyImpl<DISubprogram> KeyTy;
static bool isSubsetEqual(const KeyTy &LHS, const DISubprogram *RHS) {
return isDeclarationOfODRMember(LHS.IsDefinition, LHS.Scope,
- LHS.LinkageName, RHS);
+ LHS.LinkageName, LHS.TemplateParams, RHS);
}
static bool isSubsetEqual(const DISubprogram *LHS, const DISubprogram *RHS) {
return isDeclarationOfODRMember(LHS->isDefinition(), LHS->getRawScope(),
- LHS->getRawLinkageName(), RHS);
+ LHS->getRawLinkageName(),
+ LHS->getRawTemplateParams(), RHS);
}
/// Subprograms compare equal if they declare the same function in an ODR
/// type.
static bool isDeclarationOfODRMember(bool IsDefinition, const Metadata *Scope,
const MDString *LinkageName,
+ const Metadata *TemplateParams,
const DISubprogram *RHS) {
// Check whether the LHS is eligible.
if (IsDefinition || !Scope || !LinkageName)
@@ -633,8 +641,14 @@ template <> struct MDNodeSubsetEqualImpl<DISubprogram> {
return false;
// Compare to the RHS.
+ // FIXME: We need to compare template parameters here to avoid incorrect
+ // collisions in mapMetadata when RF_MoveDistinctMDs is set and an
+ // ODR-DISubprogram has a non-ODR template parameter (i.e., a
+ // DICompositeType that does not have an identifier). Eventually we
+ // should decouple ODR logic from
+ // uniquing logic.
return IsDefinition == RHS->isDefinition() && Scope == RHS->getRawScope() &&
- LinkageName == RHS->getRawLinkageName();
+ LinkageName == RHS->getRawLinkageName() &&
+ TemplateParams == RHS->getRawTemplateParams();
}
};
@@ -1105,7 +1119,7 @@ public:
FPMapTy FPConstants;
FoldingSet<AttributeImpl> AttrsSet;
- FoldingSet<AttributeSetImpl> AttrsLists;
+ FoldingSet<AttributeListImpl> AttrsLists;
FoldingSet<AttributeSetNode> AttrsSetNodes;
StringMap<MDString, BumpPtrAllocator> MDStringCache;
diff --git a/lib/IR/MDBuilder.cpp b/lib/IR/MDBuilder.cpp
index f4bfd5992151..b9c4f482adf5 100644
--- a/lib/IR/MDBuilder.cpp
+++ b/lib/IR/MDBuilder.cpp
@@ -56,11 +56,16 @@ MDNode *MDBuilder::createUnpredictable() {
return MDNode::get(Context, None);
}
-MDNode *MDBuilder::createFunctionEntryCount(uint64_t Count) {
+MDNode *MDBuilder::createFunctionEntryCount(
+ uint64_t Count, const DenseSet<GlobalValue::GUID> *Imports) {
Type *Int64Ty = Type::getInt64Ty(Context);
- return MDNode::get(Context,
- {createString("function_entry_count"),
- createConstant(ConstantInt::get(Int64Ty, Count))});
+ SmallVector<Metadata *, 8> Ops;
+ Ops.push_back(createString("function_entry_count"));
+ Ops.push_back(createConstant(ConstantInt::get(Int64Ty, Count)));
+ if (Imports)
+ for (auto ID : *Imports)
+ Ops.push_back(createConstant(ConstantInt::get(Int64Ty, ID)));
+ return MDNode::get(Context, Ops);
}
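createFunctionEntryCount can now record the GUIDs of imported functions after the count operand. A caller sketch (MDB, F, and the GUID values are all illustrative):

  DenseSet<GlobalValue::GUID> Imports = {0x1234, 0x5678}; // hypothetical GUIDs
  F->setMetadata(LLVMContext::MD_prof,
                 MDB.createFunctionEntryCount(/*Count=*/1000, &Imports));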
MDNode *MDBuilder::createFunctionSectionPrefix(StringRef Prefix) {
diff --git a/lib/IR/Mangler.cpp b/lib/IR/Mangler.cpp
index 41e11b3945e4..03723bfd2ddb 100644
--- a/lib/IR/Mangler.cpp
+++ b/lib/IR/Mangler.cpp
@@ -13,6 +13,7 @@
#include "llvm/IR/Mangler.h"
#include "llvm/ADT/SmallString.h"
+#include "llvm/ADT/Triple.h"
#include "llvm/ADT/Twine.h"
#include "llvm/IR/DataLayout.h"
#include "llvm/IR/DerivedTypes.h"
@@ -172,3 +173,34 @@ void Mangler::getNameWithPrefix(SmallVectorImpl<char> &OutName,
raw_svector_ostream OS(OutName);
getNameWithPrefix(OS, GV, CannotUsePrivateLabel);
}
+
+void llvm::emitLinkerFlagsForGlobalCOFF(raw_ostream &OS, const GlobalValue *GV,
+ const Triple &TT, Mangler &Mangler) {
+ if (!GV->hasDLLExportStorageClass() || GV->isDeclaration())
+ return;
+
+ if (TT.isKnownWindowsMSVCEnvironment())
+ OS << " /EXPORT:";
+ else
+ OS << " -export:";
+
+ if (TT.isWindowsGNUEnvironment() || TT.isWindowsCygwinEnvironment()) {
+ std::string Flag;
+ raw_string_ostream FlagOS(Flag);
+ Mangler.getNameWithPrefix(FlagOS, GV, false);
+ FlagOS.flush();
+ if (Flag[0] == GV->getParent()->getDataLayout().getGlobalPrefix())
+ OS << Flag.substr(1);
+ else
+ OS << Flag;
+ } else {
+ Mangler.getNameWithPrefix(OS, GV, false);
+ }
+
+ if (!GV->getValueType()->isFunctionTy()) {
+ if (TT.isKnownWindowsMSVCEnvironment())
+ OS << ",DATA";
+ else
+ OS << ",data";
+ }
+}
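As a worked example of the output: for a dllexport'ed global variable bar, emitLinkerFlagsForGlobalCOFF writes " /EXPORT:bar,DATA" on an MSVC target and " -export:bar,data" on a MinGW or Cygwin target, where a leading data-layout global prefix character is additionally stripped from the mangled name; functions omit the ,DATA suffix.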
diff --git a/lib/IR/Metadata.cpp b/lib/IR/Metadata.cpp
index 1d1930459239..7228de3d2370 100644
--- a/lib/IR/Metadata.cpp
+++ b/lib/IR/Metadata.cpp
@@ -11,20 +11,50 @@
//
//===----------------------------------------------------------------------===//
-#include "llvm/IR/Metadata.h"
#include "LLVMContextImpl.h"
#include "MetadataImpl.h"
#include "SymbolTableListTraitsImpl.h"
-#include "llvm/ADT/STLExtras.h"
+#include "llvm/ADT/APFloat.h"
+#include "llvm/ADT/APInt.h"
+#include "llvm/ADT/ArrayRef.h"
+#include "llvm/ADT/DenseSet.h"
+#include "llvm/ADT/None.h"
#include "llvm/ADT/SetVector.h"
+#include "llvm/ADT/SmallPtrSet.h"
#include "llvm/ADT/SmallSet.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/ADT/STLExtras.h"
#include "llvm/ADT/StringMap.h"
+#include "llvm/ADT/StringRef.h"
+#include "llvm/IR/Argument.h"
+#include "llvm/IR/BasicBlock.h"
+#include "llvm/IR/Constant.h"
#include "llvm/IR/ConstantRange.h"
+#include "llvm/IR/Constants.h"
#include "llvm/IR/DebugInfoMetadata.h"
+#include "llvm/IR/DebugLoc.h"
+#include "llvm/IR/Function.h"
+#include "llvm/IR/GlobalObject.h"
+#include "llvm/IR/GlobalVariable.h"
#include "llvm/IR/Instruction.h"
#include "llvm/IR/LLVMContext.h"
+#include "llvm/IR/Metadata.h"
#include "llvm/IR/Module.h"
+#include "llvm/IR/TrackingMDRef.h"
+#include "llvm/IR/Type.h"
+#include "llvm/IR/Value.h"
#include "llvm/IR/ValueHandle.h"
+#include "llvm/Support/Casting.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/MathExtras.h"
+#include <algorithm>
+#include <cassert>
+#include <cstddef>
+#include <cstdint>
+#include <iterator>
+#include <tuple>
+#include <utility>
+#include <vector>
using namespace llvm;
@@ -1027,8 +1057,7 @@ static SmallVector<TrackingMDRef, 4> &getNMDOps(void *Operands) {
}
NamedMDNode::NamedMDNode(const Twine &N)
- : Name(N.str()), Parent(nullptr),
- Operands(new SmallVector<TrackingMDRef, 4>()) {}
+ : Name(N.str()), Operands(new SmallVector<TrackingMDRef, 4>()) {}
NamedMDNode::~NamedMDNode() {
dropAllReferences();
@@ -1308,17 +1337,26 @@ bool Instruction::extractProfTotalWeight(uint64_t &TotalVal) const {
return false;
auto *ProfDataName = dyn_cast<MDString>(ProfileData->getOperand(0));
- if (!ProfDataName || !ProfDataName->getString().equals("branch_weights"))
+ if (!ProfDataName)
return false;
- TotalVal = 0;
- for (unsigned i = 1; i < ProfileData->getNumOperands(); i++) {
- auto *V = mdconst::dyn_extract<ConstantInt>(ProfileData->getOperand(i));
- if (!V)
- return false;
- TotalVal += V->getValue().getZExtValue();
+ if (ProfDataName->getString().equals("branch_weights")) {
+ TotalVal = 0;
+ for (unsigned i = 1; i < ProfileData->getNumOperands(); i++) {
+ auto *V = mdconst::dyn_extract<ConstantInt>(ProfileData->getOperand(i));
+ if (!V)
+ return false;
+ TotalVal += V->getValue().getZExtValue();
+ }
+ return true;
+ } else if (ProfDataName->getString().equals("VP") &&
+ ProfileData->getNumOperands() > 3) {
+ TotalVal = mdconst::dyn_extract<ConstantInt>(ProfileData->getOperand(2))
+ ->getValue()
+ .getZExtValue();
+ return true;
}
- return true;
+ return false;
}
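With this change, extractProfTotalWeight also understands value-profiling metadata, e.g. !prof !{!"VP", i32 0, i64 1600, i64 &lt;target-hash&gt;, i64 &lt;count&gt;} (operand shape illustrative): it reports 1600, the total count stored in operand 2, instead of returning false.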
void Instruction::clearMetadataHashEntries() {
@@ -1446,7 +1484,7 @@ void GlobalObject::addTypeMetadata(unsigned Offset, Metadata *TypeID) {
addMetadata(
LLVMContext::MD_type,
*MDTuple::get(getContext(),
- {llvm::ConstantAsMetadata::get(llvm::ConstantInt::get(
+ {ConstantAsMetadata::get(llvm::ConstantInt::get(
Type::getInt64Ty(getContext()), Offset)),
TypeID}));
}
@@ -1459,6 +1497,15 @@ DISubprogram *Function::getSubprogram() const {
return cast_or_null<DISubprogram>(getMetadata(LLVMContext::MD_dbg));
}
+bool Function::isDebugInfoForProfiling() const {
+ if (DISubprogram *SP = getSubprogram()) {
+ if (DICompileUnit *CU = SP->getUnit()) {
+ return CU->getDebugInfoForProfiling();
+ }
+ }
+ return false;
+}
+
void GlobalVariable::addDebugInfo(DIGlobalVariableExpression *GV) {
addMetadata(LLVMContext::MD_dbg, *GV);
}
diff --git a/lib/IR/Module.cpp b/lib/IR/Module.cpp
index 1911f84340c6..fec9df193685 100644
--- a/lib/IR/Module.cpp
+++ b/lib/IR/Module.cpp
@@ -120,9 +120,8 @@ void Module::getOperandBundleTags(SmallVectorImpl<StringRef> &Result) const {
// it. This is nice because it allows most passes to get away with not handling
// the symbol table directly for this common task.
//
-Constant *Module::getOrInsertFunction(StringRef Name,
- FunctionType *Ty,
- AttributeSet AttributeList) {
+Constant *Module::getOrInsertFunction(StringRef Name, FunctionType *Ty,
+ AttributeList AttributeList) {
// See if we have a definition for the specified function already.
GlobalValue *F = getNamedValue(Name);
if (!F) {
@@ -145,49 +144,7 @@ Constant *Module::getOrInsertFunction(StringRef Name,
Constant *Module::getOrInsertFunction(StringRef Name,
FunctionType *Ty) {
- return getOrInsertFunction(Name, Ty, AttributeSet());
-}
-
-// getOrInsertFunction - Look up the specified function in the module symbol
-// table. If it does not exist, add a prototype for the function and return it.
-// This version of the method takes a null terminated list of function
-// arguments, which makes it easier for clients to use.
-//
-Constant *Module::getOrInsertFunction(StringRef Name,
- AttributeSet AttributeList,
- Type *RetTy, ...) {
- va_list Args;
- va_start(Args, RetTy);
-
- // Build the list of argument types...
- std::vector<Type*> ArgTys;
- while (Type *ArgTy = va_arg(Args, Type*))
- ArgTys.push_back(ArgTy);
-
- va_end(Args);
-
- // Build the function type and chain to the other getOrInsertFunction...
- return getOrInsertFunction(Name,
- FunctionType::get(RetTy, ArgTys, false),
- AttributeList);
-}
-
-Constant *Module::getOrInsertFunction(StringRef Name,
- Type *RetTy, ...) {
- va_list Args;
- va_start(Args, RetTy);
-
- // Build the list of argument types...
- std::vector<Type*> ArgTys;
- while (Type *ArgTy = va_arg(Args, Type*))
- ArgTys.push_back(ArgTy);
-
- va_end(Args);
-
- // Build the function type and chain to the other getOrInsertFunction...
- return getOrInsertFunction(Name,
- FunctionType::get(RetTy, ArgTys, false),
- AttributeSet());
+ return getOrInsertFunction(Name, Ty, AttributeList());
}
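With the C-style variadic overloads removed, the nullptr terminator goes away. A migration sketch (M, VoidTy, and Int8PtrTy are assumed to be in scope):

  // Before: M->getOrInsertFunction("free", VoidTy, Int8PtrTy, nullptr);
  // After: either spell the FunctionType explicitly ...
  Constant *FreeFn = M->getOrInsertFunction(
      "free", FunctionType::get(VoidTy, {Int8PtrTy}, /*isVarArg=*/false));
  // ... or pass the argument types directly, as the createFree change
  // above does: M->getOrInsertFunction("free", VoidTy, Int8PtrTy);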
// getFunction - Look up the specified function in the module symbol table.
@@ -208,7 +165,8 @@ Function *Module::getFunction(StringRef Name) const {
/// If AllowLocal is set to true, this function will return globals that
/// have local linkage. By default, these globals are not returned.
///
-GlobalVariable *Module::getGlobalVariable(StringRef Name, bool AllowLocal) {
+GlobalVariable *Module::getGlobalVariable(StringRef Name,
+ bool AllowLocal) const {
if (GlobalVariable *Result =
dyn_cast_or_null<GlobalVariable>(getNamedValue(Name)))
if (AllowLocal || !Result->hasLocalLinkage())
@@ -465,6 +423,14 @@ void Module::dropAllReferences() {
GIF.dropAllReferences();
}
+unsigned Module::getNumberRegisterParameters() const {
+ auto *Val =
+ cast_or_null<ConstantAsMetadata>(getModuleFlag("NumRegisterParameters"));
+ if (!Val)
+ return 0;
+ return cast<ConstantInt>(Val->getValue())->getZExtValue();
+}
+
unsigned Module::getDwarfVersion() const {
auto *Val = cast_or_null<ConstantAsMetadata>(getModuleFlag("Dwarf Version"));
if (!Val)
diff --git a/lib/IR/Operator.cpp b/lib/IR/Operator.cpp
index 2fba24d99b30..7d819f3aae8d 100644
--- a/lib/IR/Operator.cpp
+++ b/lib/IR/Operator.cpp
@@ -1,4 +1,18 @@
+//===-- Operator.cpp - Implement the LLVM operators -----------------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements the non-inline methods for the LLVM Operator classes.
+//
+//===----------------------------------------------------------------------===//
+
#include "llvm/IR/Operator.h"
+#include "llvm/IR/DataLayout.h"
#include "llvm/IR/GetElementPtrTypeIterator.h"
#include "llvm/IR/Instructions.h"
#include "llvm/IR/Type.h"
diff --git a/lib/IR/OptBisect.cpp b/lib/IR/OptBisect.cpp
index e9574ca81261..b670c817569a 100644
--- a/lib/IR/OptBisect.cpp
+++ b/lib/IR/OptBisect.cpp
@@ -39,14 +39,6 @@ static void printPassMessage(const StringRef &Name, int PassNum,
<< "(" << PassNum << ") " << Name << " on " << TargetDesc << "\n";
}
-static void printCaseMessage(int CaseNum, StringRef Msg, bool Running) {
- if (Running)
- errs() << "BISECT: running case (";
- else
- errs() << "BISECT: NOT running case (";
- errs() << CaseNum << "): " << Msg << "\n";
-}
-
static std::string getDescription(const Module &M) {
return "module (" + M.getName().str() + ")";
}
@@ -108,13 +100,3 @@ bool OptBisect::checkPass(const StringRef PassName,
printPassMessage(PassName, CurBisectNum, TargetDesc, ShouldRun);
return ShouldRun;
}
-
-bool OptBisect::shouldRunCase(const Twine &Msg) {
- if (!BisectEnabled)
- return true;
- int CurFuelNum = ++LastBisectNum;
- bool ShouldRun = (OptBisectLimit == -1 || CurFuelNum <= OptBisectLimit);
- printCaseMessage(CurFuelNum, Msg.str(), ShouldRun);
- return ShouldRun;
-}
-
diff --git a/lib/IR/Pass.cpp b/lib/IR/Pass.cpp
index a42945ef3fff..f1b5f2f108dc 100644
--- a/lib/IR/Pass.cpp
+++ b/lib/IR/Pass.cpp
@@ -118,10 +118,12 @@ void Pass::print(raw_ostream &O,const Module*) const {
O << "Pass::print not implemented for pass: '" << getPassName() << "'!\n";
}
+#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
// dump - call print(cerr);
LLVM_DUMP_METHOD void Pass::dump() const {
print(dbgs(), nullptr);
}
+#endif
//===----------------------------------------------------------------------===//
// ImmutablePass Implementation
diff --git a/lib/IR/PassManager.cpp b/lib/IR/PassManager.cpp
index 8f68bb1daecf..47fdfedfdde8 100644
--- a/lib/IR/PassManager.cpp
+++ b/lib/IR/PassManager.cpp
@@ -91,4 +91,6 @@ bool FunctionAnalysisManagerModuleProxy::Result::invalidate(
}
}
+AnalysisSetKey CFGAnalyses::SetKey;
+
AnalysisSetKey PreservedAnalyses::AllAnalysesKey;
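CFGAnalyses is a new analysis set key: passes that mutate instructions without touching the control-flow graph can use it to keep CFG-based analyses alive. A sketch of the reporting side, inside a new-PM pass's run method:

  PreservedAnalyses PA;
  PA.preserveSet<CFGAnalyses>(); // dominators, loop info, etc. survive
  return PA;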
diff --git a/lib/IR/Statepoint.cpp b/lib/IR/Statepoint.cpp
index 63be1e780d81..8c3f0f208cc6 100644
--- a/lib/IR/Statepoint.cpp
+++ b/lib/IR/Statepoint.cpp
@@ -53,18 +53,19 @@ bool llvm::isStatepointDirectiveAttr(Attribute Attr) {
Attr.hasAttribute("statepoint-num-patch-bytes");
}
-StatepointDirectives llvm::parseStatepointDirectivesFromAttrs(AttributeSet AS) {
+StatepointDirectives
+llvm::parseStatepointDirectivesFromAttrs(AttributeList AS) {
StatepointDirectives Result;
Attribute AttrID =
- AS.getAttribute(AttributeSet::FunctionIndex, "statepoint-id");
+ AS.getAttribute(AttributeList::FunctionIndex, "statepoint-id");
uint64_t StatepointID;
if (AttrID.isStringAttribute())
if (!AttrID.getValueAsString().getAsInteger(10, StatepointID))
Result.StatepointID = StatepointID;
uint32_t NumPatchBytes;
- Attribute AttrNumPatchBytes = AS.getAttribute(AttributeSet::FunctionIndex,
+ Attribute AttrNumPatchBytes = AS.getAttribute(AttributeList::FunctionIndex,
"statepoint-num-patch-bytes");
if (AttrNumPatchBytes.isStringAttribute())
if (!AttrNumPatchBytes.getValueAsString().getAsInteger(10, NumPatchBytes))
diff --git a/lib/IR/Type.cpp b/lib/IR/Type.cpp
index ca866738f882..b67b0a307861 100644
--- a/lib/IR/Type.cpp
+++ b/lib/IR/Type.cpp
@@ -41,12 +41,6 @@ Type *Type::getPrimitiveType(LLVMContext &C, TypeID IDNumber) {
}
}
-Type *Type::getScalarType() const {
- if (auto *VTy = dyn_cast<VectorType>(this))
- return VTy->getElementType();
- return const_cast<Type*>(this);
-}
-
bool Type::isIntegerTy(unsigned Bitwidth) const {
return isIntegerTy() && cast<IntegerType>(this)->getBitWidth() == Bitwidth;
}
diff --git a/lib/IR/TypeFinder.cpp b/lib/IR/TypeFinder.cpp
index dc4c1cffb20c..a178b9ec0f09 100644
--- a/lib/IR/TypeFinder.cpp
+++ b/lib/IR/TypeFinder.cpp
@@ -1,4 +1,4 @@
-//===-- TypeFinder.cpp - Implement the TypeFinder class -------------------===//
+//===- TypeFinder.cpp - Implement the TypeFinder class --------------------===//
//
// The LLVM Compiler Infrastructure
//
@@ -11,13 +11,22 @@
//
//===----------------------------------------------------------------------===//
-#include "llvm/IR/TypeFinder.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/IR/BasicBlock.h"
+#include "llvm/IR/Constant.h"
#include "llvm/IR/DerivedTypes.h"
#include "llvm/IR/Function.h"
+#include "llvm/IR/Instruction.h"
#include "llvm/IR/Metadata.h"
#include "llvm/IR/Module.h"
+#include "llvm/IR/Type.h"
+#include "llvm/IR/TypeFinder.h"
+#include "llvm/IR/Use.h"
+#include "llvm/IR/User.h"
+#include "llvm/IR/Value.h"
+#include "llvm/Support/Casting.h"
+#include <utility>
+
using namespace llvm;
void TypeFinder::run(const Module &M, bool onlyNamed) {
diff --git a/lib/IR/Value.cpp b/lib/IR/Value.cpp
index 91a999b58004..b07c57685a26 100644
--- a/lib/IR/Value.cpp
+++ b/lib/IR/Value.cpp
@@ -320,7 +320,7 @@ void Value::takeName(Value *V) {
ST->reinsertValue(this);
}
-void Value::assertModuleIsMaterialized() const {
+void Value::assertModuleIsMaterializedImpl() const {
#ifndef NDEBUG
const GlobalValue *GV = dyn_cast<GlobalValue>(this);
if (!GV)
@@ -437,17 +437,17 @@ enum PointerStripKind {
};
template <PointerStripKind StripKind>
-static Value *stripPointerCastsAndOffsets(Value *V) {
+static const Value *stripPointerCastsAndOffsets(const Value *V) {
if (!V->getType()->isPointerTy())
return V;
// Even though we don't look through PHI nodes, we could be called on an
// instruction in an unreachable block, which may be on a cycle.
- SmallPtrSet<Value *, 4> Visited;
+ SmallPtrSet<const Value *, 4> Visited;
Visited.insert(V);
do {
- if (GEPOperator *GEP = dyn_cast<GEPOperator>(V)) {
+ if (auto *GEP = dyn_cast<GEPOperator>(V)) {
switch (StripKind) {
case PSK_ZeroIndicesAndAliases:
case PSK_ZeroIndices:
@@ -467,13 +467,13 @@ static Value *stripPointerCastsAndOffsets(Value *V) {
} else if (Operator::getOpcode(V) == Instruction::BitCast ||
Operator::getOpcode(V) == Instruction::AddrSpaceCast) {
V = cast<Operator>(V)->getOperand(0);
- } else if (GlobalAlias *GA = dyn_cast<GlobalAlias>(V)) {
+ } else if (auto *GA = dyn_cast<GlobalAlias>(V)) {
if (StripKind == PSK_ZeroIndices || GA->isInterposable())
return V;
V = GA->getAliasee();
} else {
- if (auto CS = CallSite(V))
- if (Value *RV = CS.getReturnedArgOperand()) {
+ if (auto CS = ImmutableCallSite(V))
+ if (const Value *RV = CS.getReturnedArgOperand()) {
V = RV;
continue;
}
@@ -487,20 +487,21 @@ static Value *stripPointerCastsAndOffsets(Value *V) {
}
} // end anonymous namespace
-Value *Value::stripPointerCasts() {
+const Value *Value::stripPointerCasts() const {
return stripPointerCastsAndOffsets<PSK_ZeroIndicesAndAliases>(this);
}
-Value *Value::stripPointerCastsNoFollowAliases() {
+const Value *Value::stripPointerCastsNoFollowAliases() const {
return stripPointerCastsAndOffsets<PSK_ZeroIndices>(this);
}
-Value *Value::stripInBoundsConstantOffsets() {
+const Value *Value::stripInBoundsConstantOffsets() const {
return stripPointerCastsAndOffsets<PSK_InBoundsConstantIndices>(this);
}
-Value *Value::stripAndAccumulateInBoundsConstantOffsets(const DataLayout &DL,
- APInt &Offset) {
+const Value *
+Value::stripAndAccumulateInBoundsConstantOffsets(const DataLayout &DL,
+ APInt &Offset) const {
if (!getType()->isPointerTy())
return this;
@@ -510,11 +511,11 @@ Value *Value::stripAndAccumulateInBoundsConstantOffsets(const DataLayout &DL,
// Even though we don't look through PHI nodes, we could be called on an
// instruction in an unreachable block, which may be on a cycle.
- SmallPtrSet<Value *, 4> Visited;
+ SmallPtrSet<const Value *, 4> Visited;
Visited.insert(this);
- Value *V = this;
+ const Value *V = this;
do {
- if (GEPOperator *GEP = dyn_cast<GEPOperator>(V)) {
+ if (auto *GEP = dyn_cast<GEPOperator>(V)) {
if (!GEP->isInBounds())
return V;
APInt GEPOffset(Offset);
@@ -524,11 +525,11 @@ Value *Value::stripAndAccumulateInBoundsConstantOffsets(const DataLayout &DL,
V = GEP->getPointerOperand();
} else if (Operator::getOpcode(V) == Instruction::BitCast) {
V = cast<Operator>(V)->getOperand(0);
- } else if (GlobalAlias *GA = dyn_cast<GlobalAlias>(V)) {
+ } else if (auto *GA = dyn_cast<GlobalAlias>(V)) {
V = GA->getAliasee();
} else {
- if (auto CS = CallSite(V))
- if (Value *RV = CS.getReturnedArgOperand()) {
+ if (auto CS = ImmutableCallSite(V))
+ if (const Value *RV = CS.getReturnedArgOperand()) {
V = RV;
continue;
}
@@ -541,7 +542,7 @@ Value *Value::stripAndAccumulateInBoundsConstantOffsets(const DataLayout &DL,
return V;
}
-Value *Value::stripInBoundsOffsets() {
+const Value *Value::stripInBoundsOffsets() const {
return stripPointerCastsAndOffsets<PSK_InBounds>(this);
}
@@ -633,7 +634,7 @@ unsigned Value::getPointerAlignment(const DataLayout &DL) const {
Align = DL.getPrefTypeAlignment(AllocatedType);
}
} else if (auto CS = ImmutableCallSite(this))
- Align = CS.getAttributes().getParamAlignment(AttributeSet::ReturnIndex);
+ Align = CS.getAttributes().getParamAlignment(AttributeList::ReturnIndex);
else if (const LoadInst *LI = dyn_cast<LoadInst>(this))
if (MDNode *MD = LI->getMetadata(LLVMContext::MD_align)) {
ConstantInt *CI = mdconst::extract<ConstantInt>(MD->getOperand(0));
@@ -643,9 +644,9 @@ unsigned Value::getPointerAlignment(const DataLayout &DL) const {
return Align;
}
-Value *Value::DoPHITranslation(const BasicBlock *CurBB,
- const BasicBlock *PredBB) {
- PHINode *PN = dyn_cast<PHINode>(this);
+const Value *Value::DoPHITranslation(const BasicBlock *CurBB,
+ const BasicBlock *PredBB) const {
+ auto *PN = dyn_cast<PHINode>(this);
if (PN && PN->getParent() == CurBB)
return PN->getIncomingValueForBlock(PredBB);
return this;
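The stripping and PHI-translation helpers are now implemented once against const Value. The usual companion, assumed to live in the header rather than shown here, is a thin non-const forwarder so existing callers keep compiling:

  // Hypothetical inline overload in Value.h, forwarding to the const version.
  Value *stripPointerCasts() {
    return const_cast<Value *>(
        static_cast<const Value *>(this)->stripPointerCasts());
  }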
diff --git a/lib/IR/ValueSymbolTable.cpp b/lib/IR/ValueSymbolTable.cpp
index 8a6a320fc2d1..0c3946c8661e 100644
--- a/lib/IR/ValueSymbolTable.cpp
+++ b/lib/IR/ValueSymbolTable.cpp
@@ -1,4 +1,4 @@
-//===-- ValueSymbolTable.cpp - Implement the ValueSymbolTable class -------===//
+//===- ValueSymbolTable.cpp - Implement the ValueSymbolTable class --------===//
//
// The LLVM Compiler Infrastructure
//
@@ -11,10 +11,11 @@
//
//===----------------------------------------------------------------------===//
-#include "llvm/IR/ValueSymbolTable.h"
#include "llvm/ADT/SmallString.h"
#include "llvm/IR/GlobalValue.h"
#include "llvm/IR/Type.h"
+#include "llvm/IR/Value.h"
+#include "llvm/IR/ValueSymbolTable.h"
#include "llvm/Support/Casting.h"
#include "llvm/Support/Compiler.h"
#include "llvm/Support/Debug.h"
@@ -99,13 +100,15 @@ ValueName *ValueSymbolTable::createValueName(StringRef Name, Value *V) {
return makeUniqueName(V, UniqueName);
}
+#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
// dump - print out the symbol table
//
LLVM_DUMP_METHOD void ValueSymbolTable::dump() const {
- //DEBUG(dbgs() << "ValueSymbolTable:\n");
+ //dbgs() << "ValueSymbolTable:\n";
for (const auto &I : *this) {
- //DEBUG(dbgs() << " '" << I->getKeyData() << "' = ");
+ //dbgs() << " '" << I->getKeyData() << "' = ";
I.getValue()->dump();
- //DEBUG(dbgs() << "\n");
+ //dbgs() << "\n";
}
}
+#endif
diff --git a/lib/IR/Verifier.cpp b/lib/IR/Verifier.cpp
index 5855059a189c..4e04020f206e 100644
--- a/lib/IR/Verifier.cpp
+++ b/lib/IR/Verifier.cpp
@@ -277,6 +277,9 @@ class Verifier : public InstVisitor<Verifier>, VerifierSupport {
/// already.
bool SawFrameEscape;
+ /// Whether the current function has a DISubprogram attached to it.
+ bool HasDebugInfo = false;
+
/// Stores the count of how many objects were passed to llvm.localescape for a
/// given function and the largest index passed to llvm.localrecover.
DenseMap<Function *, std::pair<unsigned, unsigned>> FrameEscapeInfo;
@@ -297,6 +300,9 @@ class Verifier : public InstVisitor<Verifier>, VerifierSupport {
// constant expressions, we can arrive at a particular user many times.
SmallPtrSet<const Value *, 32> GlobalValueVisited;
+ // Keeps track of duplicate function argument debug info.
+ SmallVector<const DILocalVariable *, 16> DebugFnArgs;
+
TBAAVerifier TBAAVerifyHelper;
void checkAtomicMemAccessSize(Type *Ty, const Instruction *I);
@@ -342,6 +348,7 @@ public:
visit(const_cast<Function &>(F));
verifySiblingFuncletUnwinds();
InstsInThisBlock.clear();
+ DebugFnArgs.clear();
LandingPadResultTy = nullptr;
SawFrameEscape = false;
SiblingFuncletInfo.clear();
@@ -457,6 +464,7 @@ private:
void visitUserOp1(Instruction &I);
void visitUserOp2(Instruction &I) { visitUserOp1(I); }
void visitIntrinsicCallSite(Intrinsic::ID ID, CallSite CS);
+ void visitConstrainedFPIntrinsic(ConstrainedFPIntrinsic &FPI);
template <class DbgIntrinsicTy>
void visitDbgIntrinsic(StringRef Kind, DbgIntrinsicTy &DII);
void visitAtomicCmpXchgInst(AtomicCmpXchgInst &CXI);
@@ -481,12 +489,11 @@ private:
void verifyMustTailCall(CallInst &CI);
bool performTypeCheck(Intrinsic::ID ID, Function *F, Type *Ty, int VT,
unsigned ArgNo, std::string &Suffix);
- bool verifyAttributeCount(AttributeSet Attrs, unsigned Params);
- void verifyAttributeTypes(AttributeSet Attrs, unsigned Idx, bool isFunction,
+ bool verifyAttributeCount(AttributeList Attrs, unsigned Params);
+ void verifyAttributeTypes(AttributeSet Attrs, bool IsFunction,
const Value *V);
- void verifyParameterAttrs(AttributeSet Attrs, unsigned Idx, Type *Ty,
- bool isReturnValue, const Value *V);
- void verifyFunctionAttrs(FunctionType *FT, AttributeSet Attrs,
+ void verifyParameterAttrs(AttributeSet Attrs, Type *Ty, const Value *V);
+ void verifyFunctionAttrs(FunctionType *FT, AttributeList Attrs,
const Value *V);
void verifyFunctionMetadata(ArrayRef<std::pair<unsigned, MDNode *>> MDs);
@@ -497,6 +504,7 @@ private:
void verifySiblingFuncletUnwinds();
void verifyFragmentExpression(const DbgInfoIntrinsic &I);
+ void verifyFnArgs(const DbgInfoIntrinsic &I);
/// Module-level debug info verification...
void verifyCompileUnits();
@@ -652,7 +660,8 @@ void Verifier::visitGlobalVariable(const GlobalVariable &GV) {
if (auto *GVE = dyn_cast<DIGlobalVariableExpression>(MD))
visitDIGlobalVariableExpression(*GVE);
else
- AssertDI(false, "!dbg attachment of global variable must be a DIGlobalVariableExpression");
+ AssertDI(false, "!dbg attachment of global variable must be a "
+ "DIGlobalVariableExpression");
}
if (!GV.hasInitializer()) {
@@ -822,28 +831,6 @@ static bool isType(const Metadata *MD) { return !MD || isa<DIType>(MD); }
static bool isScope(const Metadata *MD) { return !MD || isa<DIScope>(MD); }
static bool isDINode(const Metadata *MD) { return !MD || isa<DINode>(MD); }
-template <class Ty>
-static bool isValidMetadataArrayImpl(const MDTuple &N, bool AllowNull) {
- for (Metadata *MD : N.operands()) {
- if (MD) {
- if (!isa<Ty>(MD))
- return false;
- } else {
- if (!AllowNull)
- return false;
- }
- }
- return true;
-}
-
-template <class Ty> static bool isValidMetadataArray(const MDTuple &N) {
- return isValidMetadataArrayImpl<Ty>(N, /* AllowNull */ false);
-}
-
-template <class Ty> static bool isValidMetadataNullArray(const MDTuple &N) {
- return isValidMetadataArrayImpl<Ty>(N, /* AllowNull */ true);
-}
-
void Verifier::visitDILocation(const DILocation &N) {
AssertDI(N.getRawScope() && isa<DILocalScope>(N.getRawScope()),
"location requires a valid scope", &N, N.getRawScope());
@@ -900,6 +887,13 @@ void Verifier::visitDIDerivedType(const DIDerivedType &N) {
AssertDI(isScope(N.getRawScope()), "invalid scope", &N, N.getRawScope());
AssertDI(isType(N.getRawBaseType()), "invalid base type", &N,
N.getRawBaseType());
+
+ if (N.getDWARFAddressSpace()) {
+ AssertDI(N.getTag() == dwarf::DW_TAG_pointer_type ||
+ N.getTag() == dwarf::DW_TAG_reference_type,
+ "DWARF address space only applies to pointer or reference types",
+ &N);
+ }
}
static bool hasConflictingReferenceFlags(unsigned Flags) {
@@ -1024,6 +1018,8 @@ void Verifier::visitDISubprogram(const DISubprogram &N) {
AssertDI(isScope(N.getRawScope()), "invalid scope", &N, N.getRawScope());
if (auto *F = N.getRawFile())
AssertDI(isa<DIFile>(F), "invalid file", &N, F);
+ else
+ AssertDI(N.getLine() == 0, "line specified with no file", &N, N.getLine());
if (auto *T = N.getRawType())
AssertDI(isa<DISubroutineType>(T), "invalid subroutine type", &N, T);
AssertDI(isType(N.getRawContainingType()), "invalid containing type", &N,
@@ -1312,71 +1308,73 @@ Verifier::visitModuleFlag(const MDNode *Op,
}
}
-void Verifier::verifyAttributeTypes(AttributeSet Attrs, unsigned Idx,
- bool isFunction, const Value *V) {
- unsigned Slot = ~0U;
- for (unsigned I = 0, E = Attrs.getNumSlots(); I != E; ++I)
- if (Attrs.getSlotIndex(I) == Idx) {
- Slot = I;
- break;
- }
+/// Return true if this attribute kind only applies to functions.
+static bool isFuncOnlyAttr(Attribute::AttrKind Kind) {
+ switch (Kind) {
+ case Attribute::NoReturn:
+ case Attribute::NoUnwind:
+ case Attribute::NoInline:
+ case Attribute::AlwaysInline:
+ case Attribute::OptimizeForSize:
+ case Attribute::StackProtect:
+ case Attribute::StackProtectReq:
+ case Attribute::StackProtectStrong:
+ case Attribute::SafeStack:
+ case Attribute::NoRedZone:
+ case Attribute::NoImplicitFloat:
+ case Attribute::Naked:
+ case Attribute::InlineHint:
+ case Attribute::StackAlignment:
+ case Attribute::UWTable:
+ case Attribute::NonLazyBind:
+ case Attribute::ReturnsTwice:
+ case Attribute::SanitizeAddress:
+ case Attribute::SanitizeThread:
+ case Attribute::SanitizeMemory:
+ case Attribute::MinSize:
+ case Attribute::NoDuplicate:
+ case Attribute::Builtin:
+ case Attribute::NoBuiltin:
+ case Attribute::Cold:
+ case Attribute::OptimizeNone:
+ case Attribute::JumpTable:
+ case Attribute::Convergent:
+ case Attribute::ArgMemOnly:
+ case Attribute::NoRecurse:
+ case Attribute::InaccessibleMemOnly:
+ case Attribute::InaccessibleMemOrArgMemOnly:
+ case Attribute::AllocSize:
+ return true;
+ default:
+ break;
+ }
+ return false;
+}
- assert(Slot != ~0U && "Attribute set inconsistency!");
+/// Return true if this is a function attribute that can also appear on
+/// arguments.
+static bool isFuncOrArgAttr(Attribute::AttrKind Kind) {
+ return Kind == Attribute::ReadOnly || Kind == Attribute::WriteOnly ||
+ Kind == Attribute::ReadNone;
+}
- for (AttributeSet::iterator I = Attrs.begin(Slot), E = Attrs.end(Slot);
- I != E; ++I) {
- if (I->isStringAttribute())
+void Verifier::verifyAttributeTypes(AttributeSet Attrs, bool IsFunction,
+ const Value *V) {
+ for (Attribute A : Attrs) {
+ if (A.isStringAttribute())
continue;
- if (I->getKindAsEnum() == Attribute::NoReturn ||
- I->getKindAsEnum() == Attribute::NoUnwind ||
- I->getKindAsEnum() == Attribute::NoInline ||
- I->getKindAsEnum() == Attribute::AlwaysInline ||
- I->getKindAsEnum() == Attribute::OptimizeForSize ||
- I->getKindAsEnum() == Attribute::StackProtect ||
- I->getKindAsEnum() == Attribute::StackProtectReq ||
- I->getKindAsEnum() == Attribute::StackProtectStrong ||
- I->getKindAsEnum() == Attribute::SafeStack ||
- I->getKindAsEnum() == Attribute::NoRedZone ||
- I->getKindAsEnum() == Attribute::NoImplicitFloat ||
- I->getKindAsEnum() == Attribute::Naked ||
- I->getKindAsEnum() == Attribute::InlineHint ||
- I->getKindAsEnum() == Attribute::StackAlignment ||
- I->getKindAsEnum() == Attribute::UWTable ||
- I->getKindAsEnum() == Attribute::NonLazyBind ||
- I->getKindAsEnum() == Attribute::ReturnsTwice ||
- I->getKindAsEnum() == Attribute::SanitizeAddress ||
- I->getKindAsEnum() == Attribute::SanitizeThread ||
- I->getKindAsEnum() == Attribute::SanitizeMemory ||
- I->getKindAsEnum() == Attribute::MinSize ||
- I->getKindAsEnum() == Attribute::NoDuplicate ||
- I->getKindAsEnum() == Attribute::Builtin ||
- I->getKindAsEnum() == Attribute::NoBuiltin ||
- I->getKindAsEnum() == Attribute::Cold ||
- I->getKindAsEnum() == Attribute::OptimizeNone ||
- I->getKindAsEnum() == Attribute::JumpTable ||
- I->getKindAsEnum() == Attribute::Convergent ||
- I->getKindAsEnum() == Attribute::ArgMemOnly ||
- I->getKindAsEnum() == Attribute::NoRecurse ||
- I->getKindAsEnum() == Attribute::InaccessibleMemOnly ||
- I->getKindAsEnum() == Attribute::InaccessibleMemOrArgMemOnly ||
- I->getKindAsEnum() == Attribute::AllocSize) {
- if (!isFunction) {
- CheckFailed("Attribute '" + I->getAsString() +
- "' only applies to functions!", V);
- return;
- }
- } else if (I->getKindAsEnum() == Attribute::ReadOnly ||
- I->getKindAsEnum() == Attribute::WriteOnly ||
- I->getKindAsEnum() == Attribute::ReadNone) {
- if (Idx == 0) {
- CheckFailed("Attribute '" + I->getAsString() +
- "' does not apply to function returns");
+ if (isFuncOnlyAttr(A.getKindAsEnum())) {
+ if (!IsFunction) {
+ CheckFailed("Attribute '" + A.getAsString() +
+ "' only applies to functions!",
+ V);
return;
}
- } else if (isFunction) {
- CheckFailed("Attribute '" + I->getAsString() +
- "' does not apply to functions!", V);
+ } else if (IsFunction && !isFuncOrArgAttr(A.getKindAsEnum())) {
+ CheckFailed("Attribute '" + A.getAsString() +
+ "' does not apply to functions!",
+ V);
return;
}
}
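
A minimal self-contained sketch of the pattern this hunk adopts, with hypothetical names rather than the LLVM API: one switch-based predicate classifies attribute kinds, and the validation loop consults it instead of repeating a long chain of comparisons.

    #include <cassert>
    #include <initializer_list>

    enum class AttrKind { NoReturn, ReadOnly, ByVal }; // hypothetical stand-ins

    // The switch is the single source of truth for function-only kinds.
    static bool isFuncOnly(AttrKind K) {
      switch (K) {
      case AttrKind::NoReturn:
        return true;
      default:
        return false;
      }
    }

    static bool verify(std::initializer_list<AttrKind> Attrs, bool IsFunction) {
      for (AttrKind K : Attrs)
        if (isFuncOnly(K) && !IsFunction)
          return false; // would report "only applies to functions"
      return true;
    }

    int main() {
      assert(verify({AttrKind::NoReturn}, /*IsFunction=*/true));
      assert(!verify({AttrKind::NoReturn}, /*IsFunction=*/false));
    }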
@@ -1384,106 +1382,91 @@ void Verifier::verifyAttributeTypes(AttributeSet Attrs, unsigned Idx,
// VerifyParameterAttrs - Check the given attributes for an argument or return
// value of the specified type. The value V is printed in error messages.
-void Verifier::verifyParameterAttrs(AttributeSet Attrs, unsigned Idx, Type *Ty,
- bool isReturnValue, const Value *V) {
- if (!Attrs.hasAttributes(Idx))
+void Verifier::verifyParameterAttrs(AttributeSet Attrs, Type *Ty,
+ const Value *V) {
+ if (!Attrs.hasAttributes())
return;
- verifyAttributeTypes(Attrs, Idx, false, V);
-
- if (isReturnValue)
- Assert(!Attrs.hasAttribute(Idx, Attribute::ByVal) &&
- !Attrs.hasAttribute(Idx, Attribute::Nest) &&
- !Attrs.hasAttribute(Idx, Attribute::StructRet) &&
- !Attrs.hasAttribute(Idx, Attribute::NoCapture) &&
- !Attrs.hasAttribute(Idx, Attribute::Returned) &&
- !Attrs.hasAttribute(Idx, Attribute::InAlloca) &&
- !Attrs.hasAttribute(Idx, Attribute::SwiftSelf) &&
- !Attrs.hasAttribute(Idx, Attribute::SwiftError),
- "Attributes 'byval', 'inalloca', 'nest', 'sret', 'nocapture', "
- "'returned', 'swiftself', and 'swifterror' do not apply to return "
- "values!",
- V);
+ verifyAttributeTypes(Attrs, /*IsFunction=*/false, V);
// Check for mutually incompatible attributes. Only inreg is compatible with
// sret.
unsigned AttrCount = 0;
- AttrCount += Attrs.hasAttribute(Idx, Attribute::ByVal);
- AttrCount += Attrs.hasAttribute(Idx, Attribute::InAlloca);
- AttrCount += Attrs.hasAttribute(Idx, Attribute::StructRet) ||
- Attrs.hasAttribute(Idx, Attribute::InReg);
- AttrCount += Attrs.hasAttribute(Idx, Attribute::Nest);
+ AttrCount += Attrs.hasAttribute(Attribute::ByVal);
+ AttrCount += Attrs.hasAttribute(Attribute::InAlloca);
+ AttrCount += Attrs.hasAttribute(Attribute::StructRet) ||
+ Attrs.hasAttribute(Attribute::InReg);
+ AttrCount += Attrs.hasAttribute(Attribute::Nest);
Assert(AttrCount <= 1, "Attributes 'byval', 'inalloca', 'inreg', 'nest', "
"and 'sret' are incompatible!",
V);
- Assert(!(Attrs.hasAttribute(Idx, Attribute::InAlloca) &&
- Attrs.hasAttribute(Idx, Attribute::ReadOnly)),
+ Assert(!(Attrs.hasAttribute(Attribute::InAlloca) &&
+ Attrs.hasAttribute(Attribute::ReadOnly)),
"Attributes "
"'inalloca and readonly' are incompatible!",
V);
- Assert(!(Attrs.hasAttribute(Idx, Attribute::StructRet) &&
- Attrs.hasAttribute(Idx, Attribute::Returned)),
+ Assert(!(Attrs.hasAttribute(Attribute::StructRet) &&
+ Attrs.hasAttribute(Attribute::Returned)),
"Attributes "
"'sret and returned' are incompatible!",
V);
- Assert(!(Attrs.hasAttribute(Idx, Attribute::ZExt) &&
- Attrs.hasAttribute(Idx, Attribute::SExt)),
+ Assert(!(Attrs.hasAttribute(Attribute::ZExt) &&
+ Attrs.hasAttribute(Attribute::SExt)),
"Attributes "
"'zeroext and signext' are incompatible!",
V);
- Assert(!(Attrs.hasAttribute(Idx, Attribute::ReadNone) &&
- Attrs.hasAttribute(Idx, Attribute::ReadOnly)),
+ Assert(!(Attrs.hasAttribute(Attribute::ReadNone) &&
+ Attrs.hasAttribute(Attribute::ReadOnly)),
"Attributes "
"'readnone and readonly' are incompatible!",
V);
- Assert(!(Attrs.hasAttribute(Idx, Attribute::ReadNone) &&
- Attrs.hasAttribute(Idx, Attribute::WriteOnly)),
+ Assert(!(Attrs.hasAttribute(Attribute::ReadNone) &&
+ Attrs.hasAttribute(Attribute::WriteOnly)),
"Attributes "
"'readnone and writeonly' are incompatible!",
V);
- Assert(!(Attrs.hasAttribute(Idx, Attribute::ReadOnly) &&
- Attrs.hasAttribute(Idx, Attribute::WriteOnly)),
+ Assert(!(Attrs.hasAttribute(Attribute::ReadOnly) &&
+ Attrs.hasAttribute(Attribute::WriteOnly)),
"Attributes "
"'readonly and writeonly' are incompatible!",
V);
- Assert(!(Attrs.hasAttribute(Idx, Attribute::NoInline) &&
- Attrs.hasAttribute(Idx, Attribute::AlwaysInline)),
+ Assert(!(Attrs.hasAttribute(Attribute::NoInline) &&
+ Attrs.hasAttribute(Attribute::AlwaysInline)),
"Attributes "
"'noinline and alwaysinline' are incompatible!",
V);
- Assert(
- !AttrBuilder(Attrs, Idx).overlaps(AttributeFuncs::typeIncompatible(Ty)),
- "Wrong types for attribute: " +
- AttributeSet::get(Context, Idx, AttributeFuncs::typeIncompatible(Ty))
- .getAsString(Idx),
- V);
+ AttrBuilder IncompatibleAttrs = AttributeFuncs::typeIncompatible(Ty);
+ Assert(!AttrBuilder(Attrs).overlaps(IncompatibleAttrs),
+ "Wrong types for attribute: " +
+ AttributeSet::get(Context, IncompatibleAttrs).getAsString(),
+ V);
if (PointerType *PTy = dyn_cast<PointerType>(Ty)) {
SmallPtrSet<Type*, 4> Visited;
if (!PTy->getElementType()->isSized(&Visited)) {
- Assert(!Attrs.hasAttribute(Idx, Attribute::ByVal) &&
- !Attrs.hasAttribute(Idx, Attribute::InAlloca),
+ Assert(!Attrs.hasAttribute(Attribute::ByVal) &&
+ !Attrs.hasAttribute(Attribute::InAlloca),
"Attributes 'byval' and 'inalloca' do not support unsized types!",
V);
}
if (!isa<PointerType>(PTy->getElementType()))
- Assert(!Attrs.hasAttribute(Idx, Attribute::SwiftError),
+ Assert(!Attrs.hasAttribute(Attribute::SwiftError),
"Attribute 'swifterror' only applies to parameters "
"with pointer to pointer type!",
V);
} else {
- Assert(!Attrs.hasAttribute(Idx, Attribute::ByVal),
+ Assert(!Attrs.hasAttribute(Attribute::ByVal),
"Attribute 'byval' only applies to parameters with pointer type!",
V);
- Assert(!Attrs.hasAttribute(Idx, Attribute::SwiftError),
+ Assert(!Attrs.hasAttribute(Attribute::SwiftError),
"Attribute 'swifterror' only applies to parameters "
"with pointer type!",
V);
@@ -1492,7 +1475,7 @@ void Verifier::verifyParameterAttrs(AttributeSet Attrs, unsigned Idx, Type *Ty,
// Check parameter attributes against a function type.
// The value V is printed in error messages.
-void Verifier::verifyFunctionAttrs(FunctionType *FT, AttributeSet Attrs,
+void Verifier::verifyFunctionAttrs(FunctionType *FT, AttributeList Attrs,
const Value *V) {
if (Attrs.isEmpty())
return;
@@ -1503,122 +1486,124 @@ void Verifier::verifyFunctionAttrs(FunctionType *FT, AttributeSet Attrs,
bool SawSwiftSelf = false;
bool SawSwiftError = false;
- for (unsigned i = 0, e = Attrs.getNumSlots(); i != e; ++i) {
- unsigned Idx = Attrs.getSlotIndex(i);
-
- Type *Ty;
- if (Idx == 0)
- Ty = FT->getReturnType();
- else if (Idx-1 < FT->getNumParams())
- Ty = FT->getParamType(Idx-1);
- else
- break; // VarArgs attributes, verified elsewhere.
+ // Verify return value attributes.
+ AttributeSet RetAttrs = Attrs.getRetAttributes();
+ Assert((!RetAttrs.hasAttribute(Attribute::ByVal) &&
+ !RetAttrs.hasAttribute(Attribute::Nest) &&
+ !RetAttrs.hasAttribute(Attribute::StructRet) &&
+ !RetAttrs.hasAttribute(Attribute::NoCapture) &&
+ !RetAttrs.hasAttribute(Attribute::Returned) &&
+ !RetAttrs.hasAttribute(Attribute::InAlloca) &&
+ !RetAttrs.hasAttribute(Attribute::SwiftSelf) &&
+ !RetAttrs.hasAttribute(Attribute::SwiftError)),
+ "Attributes 'byval', 'inalloca', 'nest', 'sret', 'nocapture', "
+ "'returned', 'swiftself', and 'swifterror' do not apply to return "
+ "values!",
+ V);
+ Assert((!RetAttrs.hasAttribute(Attribute::ReadOnly) &&
+ !RetAttrs.hasAttribute(Attribute::WriteOnly) &&
+ !RetAttrs.hasAttribute(Attribute::ReadNone)),
+ "Attribute '" + RetAttrs.getAsString() +
+ "' does not apply to function returns",
+ V);
+ verifyParameterAttrs(RetAttrs, FT->getReturnType(), V);
- verifyParameterAttrs(Attrs, Idx, Ty, Idx == 0, V);
+ // Verify parameter attributes.
+ for (unsigned i = 0, e = FT->getNumParams(); i != e; ++i) {
+ Type *Ty = FT->getParamType(i);
+ AttributeSet ArgAttrs = Attrs.getParamAttributes(i);
- if (Idx == 0)
- continue;
+ verifyParameterAttrs(ArgAttrs, Ty, V);
- if (Attrs.hasAttribute(Idx, Attribute::Nest)) {
+ if (ArgAttrs.hasAttribute(Attribute::Nest)) {
Assert(!SawNest, "More than one parameter has attribute nest!", V);
SawNest = true;
}
- if (Attrs.hasAttribute(Idx, Attribute::Returned)) {
+ if (ArgAttrs.hasAttribute(Attribute::Returned)) {
Assert(!SawReturned, "More than one parameter has attribute returned!",
V);
Assert(Ty->canLosslesslyBitCastTo(FT->getReturnType()),
- "Incompatible "
- "argument and return types for 'returned' attribute",
+ "Incompatible argument and return types for 'returned' attribute",
V);
SawReturned = true;
}
- if (Attrs.hasAttribute(Idx, Attribute::StructRet)) {
+ if (ArgAttrs.hasAttribute(Attribute::StructRet)) {
Assert(!SawSRet, "Cannot have multiple 'sret' parameters!", V);
- Assert(Idx == 1 || Idx == 2,
+ Assert(i == 0 || i == 1,
"Attribute 'sret' is not on first or second parameter!", V);
SawSRet = true;
}
- if (Attrs.hasAttribute(Idx, Attribute::SwiftSelf)) {
+ if (ArgAttrs.hasAttribute(Attribute::SwiftSelf)) {
Assert(!SawSwiftSelf, "Cannot have multiple 'swiftself' parameters!", V);
SawSwiftSelf = true;
}
- if (Attrs.hasAttribute(Idx, Attribute::SwiftError)) {
+ if (ArgAttrs.hasAttribute(Attribute::SwiftError)) {
Assert(!SawSwiftError, "Cannot have multiple 'swifterror' parameters!",
V);
SawSwiftError = true;
}
- if (Attrs.hasAttribute(Idx, Attribute::InAlloca)) {
- Assert(Idx == FT->getNumParams(), "inalloca isn't on the last parameter!",
- V);
+ if (ArgAttrs.hasAttribute(Attribute::InAlloca)) {
+ Assert(i == FT->getNumParams() - 1,
+ "inalloca isn't on the last parameter!", V);
}
}
- if (!Attrs.hasAttributes(AttributeSet::FunctionIndex))
+ if (!Attrs.hasAttributes(AttributeList::FunctionIndex))
return;
- verifyAttributeTypes(Attrs, AttributeSet::FunctionIndex, true, V);
+ verifyAttributeTypes(Attrs.getFnAttributes(), /*IsFunction=*/true, V);
- Assert(
- !(Attrs.hasAttribute(AttributeSet::FunctionIndex, Attribute::ReadNone) &&
- Attrs.hasAttribute(AttributeSet::FunctionIndex, Attribute::ReadOnly)),
- "Attributes 'readnone and readonly' are incompatible!", V);
+ Assert(!(Attrs.hasFnAttribute(Attribute::ReadNone) &&
+ Attrs.hasFnAttribute(Attribute::ReadOnly)),
+ "Attributes 'readnone and readonly' are incompatible!", V);
- Assert(
- !(Attrs.hasAttribute(AttributeSet::FunctionIndex, Attribute::ReadNone) &&
- Attrs.hasAttribute(AttributeSet::FunctionIndex, Attribute::WriteOnly)),
- "Attributes 'readnone and writeonly' are incompatible!", V);
+ Assert(!(Attrs.hasFnAttribute(Attribute::ReadNone) &&
+ Attrs.hasFnAttribute(Attribute::WriteOnly)),
+ "Attributes 'readnone and writeonly' are incompatible!", V);
- Assert(
- !(Attrs.hasAttribute(AttributeSet::FunctionIndex, Attribute::ReadOnly) &&
- Attrs.hasAttribute(AttributeSet::FunctionIndex, Attribute::WriteOnly)),
- "Attributes 'readonly and writeonly' are incompatible!", V);
+ Assert(!(Attrs.hasFnAttribute(Attribute::ReadOnly) &&
+ Attrs.hasFnAttribute(Attribute::WriteOnly)),
+ "Attributes 'readonly and writeonly' are incompatible!", V);
- Assert(
- !(Attrs.hasAttribute(AttributeSet::FunctionIndex, Attribute::ReadNone) &&
- Attrs.hasAttribute(AttributeSet::FunctionIndex,
- Attribute::InaccessibleMemOrArgMemOnly)),
- "Attributes 'readnone and inaccessiblemem_or_argmemonly' are incompatible!", V);
+ Assert(!(Attrs.hasFnAttribute(Attribute::ReadNone) &&
+ Attrs.hasFnAttribute(Attribute::InaccessibleMemOrArgMemOnly)),
+ "Attributes 'readnone and inaccessiblemem_or_argmemonly' are "
+ "incompatible!",
+ V);
- Assert(
- !(Attrs.hasAttribute(AttributeSet::FunctionIndex, Attribute::ReadNone) &&
- Attrs.hasAttribute(AttributeSet::FunctionIndex,
- Attribute::InaccessibleMemOnly)),
- "Attributes 'readnone and inaccessiblememonly' are incompatible!", V);
+ Assert(!(Attrs.hasFnAttribute(Attribute::ReadNone) &&
+ Attrs.hasFnAttribute(Attribute::InaccessibleMemOnly)),
+ "Attributes 'readnone and inaccessiblememonly' are incompatible!", V);
- Assert(
- !(Attrs.hasAttribute(AttributeSet::FunctionIndex, Attribute::NoInline) &&
- Attrs.hasAttribute(AttributeSet::FunctionIndex,
- Attribute::AlwaysInline)),
- "Attributes 'noinline and alwaysinline' are incompatible!", V);
-
- if (Attrs.hasAttribute(AttributeSet::FunctionIndex,
- Attribute::OptimizeNone)) {
- Assert(Attrs.hasAttribute(AttributeSet::FunctionIndex, Attribute::NoInline),
+ Assert(!(Attrs.hasFnAttribute(Attribute::NoInline) &&
+ Attrs.hasFnAttribute(Attribute::AlwaysInline)),
+ "Attributes 'noinline and alwaysinline' are incompatible!", V);
+
+ if (Attrs.hasFnAttribute(Attribute::OptimizeNone)) {
+ Assert(Attrs.hasFnAttribute(Attribute::NoInline),
"Attribute 'optnone' requires 'noinline'!", V);
- Assert(!Attrs.hasAttribute(AttributeSet::FunctionIndex,
- Attribute::OptimizeForSize),
+ Assert(!Attrs.hasFnAttribute(Attribute::OptimizeForSize),
"Attributes 'optsize and optnone' are incompatible!", V);
- Assert(!Attrs.hasAttribute(AttributeSet::FunctionIndex, Attribute::MinSize),
+ Assert(!Attrs.hasFnAttribute(Attribute::MinSize),
"Attributes 'minsize and optnone' are incompatible!", V);
}
- if (Attrs.hasAttribute(AttributeSet::FunctionIndex,
- Attribute::JumpTable)) {
+ if (Attrs.hasFnAttribute(Attribute::JumpTable)) {
const GlobalValue *GV = cast<GlobalValue>(V);
Assert(GV->hasGlobalUnnamedAddr(),
"Attribute 'jumptable' requires 'unnamed_addr'", V);
}
- if (Attrs.hasAttribute(AttributeSet::FunctionIndex, Attribute::AllocSize)) {
+ if (Attrs.hasFnAttribute(Attribute::AllocSize)) {
std::pair<unsigned, Optional<unsigned>> Args =
- Attrs.getAllocSizeArgs(AttributeSet::FunctionIndex);
+ Attrs.getAllocSizeArgs(AttributeList::FunctionIndex);
auto CheckParam = [&](StringRef Name, unsigned ParamNo) {
if (ParamNo >= FT->getNumParams()) {
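
For readers tracking the index shift in this hunk, a standalone sketch (hypothetical structs, not AttributeList/AttributeSet) of the old flat index space, where 0 was the return value and parameters started at 1, versus the new zero-based per-parameter access:

    #include <cassert>
    #include <vector>

    // Old flat scheme: index 0 = return value, 1..N = parameters.
    struct FlatAttrs {
      std::vector<bool> Nest; // indexed by flat attribute index
      bool hasNest(unsigned Idx) const { return Idx < Nest.size() && Nest[Idx]; }
    };

    // New scheme: one attribute set per parameter, addressed zero-based.
    struct ParamAttrs { bool Nest; };
    struct AttrList { std::vector<ParamAttrs> Params; };

    int main() {
      FlatAttrs Old{{false, true}};     // parameter #0 has nest (flat index 1)
      AttrList New{{ParamAttrs{true}}}; // the same fact, zero-based
      assert(Old.hasNest(1) == New.Params[0].Nest);
    }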
@@ -1649,8 +1634,8 @@ void Verifier::verifyFunctionMetadata(
for (const auto &Pair : MDs) {
if (Pair.first == LLVMContext::MD_prof) {
MDNode *MD = Pair.second;
- Assert(MD->getNumOperands() == 2,
- "!prof annotations should have exactly 2 operands", MD);
+    Assert(MD->getNumOperands() >= 2,
+           "!prof annotations should have at least 2 operands", MD);
// Check first operand.
Assert(MD->getOperand(0) != nullptr, "first operand should not be null",
@@ -1725,15 +1710,15 @@ void Verifier::visitConstantExpr(const ConstantExpr *CE) {
}
}
-bool Verifier::verifyAttributeCount(AttributeSet Attrs, unsigned Params) {
+bool Verifier::verifyAttributeCount(AttributeList Attrs, unsigned Params) {
if (Attrs.getNumSlots() == 0)
return true;
unsigned LastSlot = Attrs.getNumSlots() - 1;
unsigned LastIndex = Attrs.getSlotIndex(LastSlot);
- if (LastIndex <= Params
- || (LastIndex == AttributeSet::FunctionIndex
- && (LastSlot == 0 || Attrs.getSlotIndex(LastSlot - 1) <= Params)))
+ if (LastIndex <= Params ||
+ (LastIndex == AttributeList::FunctionIndex &&
+ (LastSlot == 0 || Attrs.getSlotIndex(LastSlot - 1) <= Params)))
return true;
return false;
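
The invariant this helper relies on, sketched standalone (a hypothetical vector standing in for the slot API): slot indices are sorted ascending with the function index (~0U) last, so only the final one or two slots need checking.

    #include <cassert>
    #include <vector>

    static bool countOk(const std::vector<unsigned> &SlotIdx, unsigned Params) {
      const unsigned FunctionIndex = ~0U;
      if (SlotIdx.empty())
        return true;
      unsigned Last = SlotIdx.back();
      if (Last <= Params)
        return true;
      return Last == FunctionIndex &&
             (SlotIdx.size() == 1 || SlotIdx[SlotIdx.size() - 2] <= Params);
    }

    int main() {
      assert((countOk({1, 2, ~0U}, 2))); // params 1..2 plus function attributes
      assert((!countOk({3}, 2)));        // an attribute after the last parameter
    }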
@@ -1963,7 +1948,7 @@ void Verifier::visitFunction(const Function &F) {
Assert(!F.hasStructRetAttr() || F.getReturnType()->isVoidTy(),
"Invalid struct return type!", &F);
- AttributeSet Attrs = F.getAttributes();
+ AttributeList Attrs = F.getAttributes();
Assert(verifyAttributeCount(Attrs, FT->getNumParams()),
"Attribute after last parameter!", &F);
@@ -1974,7 +1959,7 @@ void Verifier::visitFunction(const Function &F) {
// On function declarations/definitions, we do not support the builtin
// attribute. We do not check this in VerifyFunctionAttrs since that is
// checking for Attributes that can/can not ever be on functions.
- Assert(!Attrs.hasAttribute(AttributeSet::FunctionIndex, Attribute::Builtin),
+ Assert(!Attrs.hasFnAttribute(Attribute::Builtin),
"Attribute 'builtin' can only be applied to a callsite.", &F);
// Check that this function meets the restrictions on this calling convention.
@@ -1984,6 +1969,18 @@ void Verifier::visitFunction(const Function &F) {
default:
case CallingConv::C:
break;
+ case CallingConv::AMDGPU_KERNEL:
+ case CallingConv::SPIR_KERNEL:
+ Assert(F.getReturnType()->isVoidTy(),
+ "Calling convention requires void return type", &F);
+ LLVM_FALLTHROUGH;
+ case CallingConv::AMDGPU_VS:
+ case CallingConv::AMDGPU_GS:
+ case CallingConv::AMDGPU_PS:
+ case CallingConv::AMDGPU_CS:
+ Assert(!F.hasStructRetAttr(),
+ "Calling convention does not allow sret", &F);
+ LLVM_FALLTHROUGH;
case CallingConv::Fast:
case CallingConv::Cold:
case CallingConv::Intel_OCL_BI:
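
The layered requirements in the new switch, restated as a runnable standalone sketch (hypothetical enum, not llvm::CallingConv): kernel conventions must pass both checks, the shader conventions only the sret check, via deliberate fallthrough.

    #include <cassert>

    enum class CC { C, AmdKernel, AmdPS };

    static bool isValid(CC Conv, bool ReturnsVoid, bool HasSRet) {
      switch (Conv) {
      case CC::AmdKernel:
        if (!ReturnsVoid)
          return false;
        [[fallthrough]]; // kernels also inherit the sret restriction
      case CC::AmdPS:
        if (HasSRet)
          return false;
        [[fallthrough]];
      default:
        return true;
      }
    }

    int main() {
      assert(isValid(CC::AmdKernel, true, false));
      assert(!isValid(CC::AmdKernel, false, false)); // non-void kernel
      assert(!isValid(CC::AmdPS, true, true));       // sret not allowed
      assert(isValid(CC::C, false, true));           // C allows both
    }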
@@ -2014,7 +2011,7 @@ void Verifier::visitFunction(const Function &F) {
}
// Check that swifterror argument is only used by loads and stores.
- if (Attrs.hasAttribute(i+1, Attribute::SwiftError)) {
+ if (Attrs.hasParamAttribute(i, Attribute::SwiftError)) {
verifySwiftErrorValue(&Arg);
}
++i;
@@ -2113,11 +2110,10 @@ void Verifier::visitFunction(const Function &F) {
"Function is marked as dllimport, but not external.", &F);
auto *N = F.getSubprogram();
- if (!N)
+ HasDebugInfo = (N != nullptr);
+ if (!HasDebugInfo)
return;
- visitDISubprogram(*N);
-
  // Check that all !dbg attachments lead back to N (or, at least, another
// subprogram that describes the same function).
//
@@ -2601,7 +2597,7 @@ void Verifier::verifyCallSite(CallSite CS) {
"Call parameter type does not match function signature!",
CS.getArgument(i), FTy->getParamType(i), I);
- AttributeSet Attrs = CS.getAttributes();
+ AttributeList Attrs = CS.getAttributes();
Assert(verifyAttributeCount(Attrs, CS.arg_size()),
"Attribute after last parameter!", I);
@@ -2623,7 +2619,7 @@ void Verifier::verifyCallSite(CallSite CS) {
// make sure the underlying alloca/parameter it comes from has a swifterror as
// well.
for (unsigned i = 0, e = FTy->getNumParams(); i != e; ++i)
- if (CS.paramHasAttr(i+1, Attribute::SwiftError)) {
+ if (CS.paramHasAttr(i, Attribute::SwiftError)) {
Value *SwiftErrorArg = CS.getArgument(i);
if (auto AI = dyn_cast<AllocaInst>(SwiftErrorArg->stripInBoundsOffsets())) {
Assert(AI->isSwiftError(),
@@ -2641,24 +2637,25 @@ void Verifier::verifyCallSite(CallSite CS) {
bool SawNest = false;
bool SawReturned = false;
- for (unsigned Idx = 1; Idx < 1 + FTy->getNumParams(); ++Idx) {
- if (Attrs.hasAttribute(Idx, Attribute::Nest))
+ for (unsigned Idx = 0; Idx < FTy->getNumParams(); ++Idx) {
+ if (Attrs.hasParamAttribute(Idx, Attribute::Nest))
SawNest = true;
- if (Attrs.hasAttribute(Idx, Attribute::Returned))
+ if (Attrs.hasParamAttribute(Idx, Attribute::Returned))
SawReturned = true;
}
// Check attributes on the varargs part.
- for (unsigned Idx = 1 + FTy->getNumParams(); Idx <= CS.arg_size(); ++Idx) {
- Type *Ty = CS.getArgument(Idx-1)->getType();
- verifyParameterAttrs(Attrs, Idx, Ty, false, I);
+ for (unsigned Idx = FTy->getNumParams(); Idx < CS.arg_size(); ++Idx) {
+ Type *Ty = CS.getArgument(Idx)->getType();
+ AttributeSet ArgAttrs = Attrs.getParamAttributes(Idx);
+ verifyParameterAttrs(ArgAttrs, Ty, I);
- if (Attrs.hasAttribute(Idx, Attribute::Nest)) {
+ if (ArgAttrs.hasAttribute(Attribute::Nest)) {
Assert(!SawNest, "More than one parameter has attribute nest!", I);
SawNest = true;
}
- if (Attrs.hasAttribute(Idx, Attribute::Returned)) {
+ if (ArgAttrs.hasAttribute(Attribute::Returned)) {
Assert(!SawReturned, "More than one parameter has attribute returned!",
I);
Assert(Ty->canLosslesslyBitCastTo(FTy->getReturnType()),
@@ -2668,11 +2665,12 @@ void Verifier::verifyCallSite(CallSite CS) {
SawReturned = true;
}
- Assert(!Attrs.hasAttribute(Idx, Attribute::StructRet),
+ Assert(!ArgAttrs.hasAttribute(Attribute::StructRet),
"Attribute 'sret' cannot be used for vararg call arguments!", I);
- if (Attrs.hasAttribute(Idx, Attribute::InAlloca))
- Assert(Idx == CS.arg_size(), "inalloca isn't on the last argument!", I);
+ if (ArgAttrs.hasAttribute(Attribute::InAlloca))
+ Assert(Idx == CS.arg_size() - 1, "inalloca isn't on the last argument!",
+ I);
}
}
@@ -2726,9 +2724,9 @@ void Verifier::verifyCallSite(CallSite CS) {
// do so causes assertion failures when the inliner sets up inline scope info.
if (I->getFunction()->getSubprogram() && CS.getCalledFunction() &&
CS.getCalledFunction()->getSubprogram())
- Assert(I->getDebugLoc(), "inlinable function call in a function with debug "
- "info must have a !dbg location",
- I);
+ AssertDI(I->getDebugLoc(), "inlinable function call in a function with "
+ "debug info must have a !dbg location",
+ I);
visitInstruction(*I);
}
@@ -2745,17 +2743,17 @@ static bool isTypeCongruent(Type *L, Type *R) {
return PL->getAddressSpace() == PR->getAddressSpace();
}
-static AttrBuilder getParameterABIAttributes(int I, AttributeSet Attrs) {
+static AttrBuilder getParameterABIAttributes(int I, AttributeList Attrs) {
static const Attribute::AttrKind ABIAttrs[] = {
Attribute::StructRet, Attribute::ByVal, Attribute::InAlloca,
Attribute::InReg, Attribute::Returned, Attribute::SwiftSelf,
Attribute::SwiftError};
AttrBuilder Copy;
for (auto AK : ABIAttrs) {
- if (Attrs.hasAttribute(I + 1, AK))
+ if (Attrs.hasParamAttribute(I, AK))
Copy.addAttribute(AK);
}
- if (Attrs.hasAttribute(I + 1, Attribute::Alignment))
+ if (Attrs.hasParamAttribute(I, Attribute::Alignment))
Copy.addAlignmentAttr(Attrs.getParamAlignment(I + 1));
return Copy;
}
@@ -2787,8 +2785,8 @@ void Verifier::verifyMustTailCall(CallInst &CI) {
// - All ABI-impacting function attributes, such as sret, byval, inreg,
// returned, and inalloca, must match.
- AttributeSet CallerAttrs = F->getAttributes();
- AttributeSet CalleeAttrs = CI.getAttributes();
+ AttributeList CallerAttrs = F->getAttributes();
+ AttributeList CalleeAttrs = CI.getAttributes();
for (int I = 0, E = CallerTy->getNumParams(); I != E; ++I) {
AttrBuilder CallerABIAttrs = getParameterABIAttributes(I, CallerAttrs);
AttrBuilder CalleeABIAttrs = getParameterABIAttributes(I, CalleeAttrs);
@@ -3116,7 +3114,7 @@ void Verifier::verifySwiftErrorCallSite(CallSite CS,
for (CallSite::arg_iterator I = CS.arg_begin(), E = CS.arg_end();
I != E; ++I, ++Idx) {
if (*I == SwiftErrorVal) {
- Assert(CS.paramHasAttr(Idx+1, Attribute::SwiftError),
+ Assert(CS.paramHasAttr(Idx, Attribute::SwiftError),
"swifterror value when used in a callsite should be marked "
"with swifterror attribute",
SwiftErrorVal, CS);
@@ -3148,8 +3146,9 @@ void Verifier::verifySwiftErrorValue(const Value *SwiftErrorVal) {
void Verifier::visitAllocaInst(AllocaInst &AI) {
SmallPtrSet<Type*, 4> Visited;
PointerType *PTy = AI.getType();
- Assert(PTy->getAddressSpace() == 0,
- "Allocation instruction pointer not in the generic address space!",
+ // TODO: Relax this restriction?
+ Assert(PTy->getAddressSpace() == DL.getAllocaAddrSpace(),
+ "Allocation instruction pointer not in the stack address space!",
&AI);
Assert(AI.getAllocatedType()->isSized(&Visited),
"Cannot allocate unsized type", &AI);
@@ -3929,6 +3928,14 @@ void Verifier::visitIntrinsicCallSite(Intrinsic::ID ID, CallSite CS) {
"constant int",
CS);
break;
+ case Intrinsic::experimental_constrained_fadd:
+ case Intrinsic::experimental_constrained_fsub:
+ case Intrinsic::experimental_constrained_fmul:
+ case Intrinsic::experimental_constrained_fdiv:
+ case Intrinsic::experimental_constrained_frem:
+ visitConstrainedFPIntrinsic(
+ cast<ConstrainedFPIntrinsic>(*CS.getInstruction()));
+ break;
case Intrinsic::dbg_declare: // llvm.dbg.declare
Assert(isa<MetadataAsValue>(CS.getArgOperand(0)),
"invalid llvm.dbg.declare intrinsic call 1", CS);
@@ -4294,6 +4301,15 @@ static DISubprogram *getSubprogram(Metadata *LocalScope) {
return nullptr;
}
+void Verifier::visitConstrainedFPIntrinsic(ConstrainedFPIntrinsic &FPI) {
+ Assert(isa<MetadataAsValue>(FPI.getOperand(2)),
+ "invalid rounding mode argument", &FPI);
+ Assert(FPI.getRoundingMode() != ConstrainedFPIntrinsic::rmInvalid,
+ "invalid rounding mode argument", &FPI);
+ Assert(FPI.getExceptionBehavior() != ConstrainedFPIntrinsic::ebInvalid,
+ "invalid exception behavior argument", &FPI);
+}
+
template <class DbgIntrinsicTy>
void Verifier::visitDbgIntrinsic(StringRef Kind, DbgIntrinsicTy &DII) {
auto *MD = cast<MetadataAsValue>(DII.getArgOperand(0))->getMetadata();
@@ -4330,6 +4346,8 @@ void Verifier::visitDbgIntrinsic(StringRef Kind, DbgIntrinsicTy &DII) {
" variable and !dbg attachment",
&DII, BB, F, Var, Var->getScope()->getSubprogram(), Loc,
Loc->getScope()->getSubprogram());
+
+ verifyFnArgs(DII);
}
static uint64_t getVariableSize(const DILocalVariable &V) {
@@ -4398,15 +4416,49 @@ void Verifier::verifyFragmentExpression(const DbgInfoIntrinsic &I) {
AssertDI(FragSize != VarSize, "fragment covers entire variable", &I, V, E);
}
+void Verifier::verifyFnArgs(const DbgInfoIntrinsic &I) {
+  // This function does not take the scope of non-inlined function arguments
+  // into account. Don't run it if the current function is nodebug, because it
+  // may contain inlined debug intrinsics.
+ if (!HasDebugInfo)
+ return;
+
+ DILocalVariable *Var;
+ if (auto *DV = dyn_cast<DbgValueInst>(&I)) {
+ // For performance reasons only check non-inlined ones.
+ if (DV->getDebugLoc()->getInlinedAt())
+ return;
+ Var = DV->getVariable();
+ } else {
+ auto *DD = cast<DbgDeclareInst>(&I);
+ if (DD->getDebugLoc()->getInlinedAt())
+ return;
+ Var = DD->getVariable();
+ }
+ AssertDI(Var, "dbg intrinsic without variable");
+
+ unsigned ArgNo = Var->getArg();
+ if (!ArgNo)
+ return;
+
+ // Verify there are no duplicate function argument debug info entries.
+ // These will cause hard-to-debug assertions in the DWARF backend.
+ if (DebugFnArgs.size() < ArgNo)
+ DebugFnArgs.resize(ArgNo, nullptr);
+
+ auto *Prev = DebugFnArgs[ArgNo - 1];
+ DebugFnArgs[ArgNo - 1] = Var;
+ AssertDI(!Prev || (Prev == Var), "conflicting debug info for argument", &I,
+ Prev, Var);
+}
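
The duplicate-detection core of verifyFnArgs, as a self-contained sketch with hypothetical types: grow the per-argument table on demand, then compare the previously recorded variable against the current one.

    #include <cassert>
    #include <string>
    #include <vector>

    // Record the variable seen for a 1-based argument number; report a conflict
    // when a different variable was already recorded for the same argument.
    static bool recordArg(std::vector<const std::string *> &Seen, unsigned ArgNo,
                          const std::string *Var) {
      if (Seen.size() < ArgNo)
        Seen.resize(ArgNo, nullptr);
      const std::string *Prev = Seen[ArgNo - 1];
      Seen[ArgNo - 1] = Var;
      return !Prev || Prev == Var; // false => conflicting debug info
    }

    int main() {
      std::string X = "x", Y = "y";
      std::vector<const std::string *> Seen;
      assert(recordArg(Seen, 1, &X));  // first sighting of argument 1
      assert(recordArg(Seen, 1, &X));  // the same variable again is fine
      assert(!recordArg(Seen, 1, &Y)); // a different variable conflicts
    }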
+
void Verifier::verifyCompileUnits() {
auto *CUs = M.getNamedMetadata("llvm.dbg.cu");
SmallPtrSet<const Metadata *, 2> Listed;
if (CUs)
Listed.insert(CUs->op_begin(), CUs->op_end());
- AssertDI(
- all_of(CUVisited,
- [&Listed](const Metadata *CU) { return Listed.count(CU); }),
- "All DICompileUnits must be listed in llvm.dbg.cu");
+ for (auto *CU : CUVisited)
+ AssertDI(Listed.count(CU), "DICompileUnit not listed in llvm.dbg.cu", CU);
CUVisited.clear();
}
diff --git a/lib/LTO/CMakeLists.txt b/lib/LTO/CMakeLists.txt
index c73143eb330b..73b5662d4bc8 100644
--- a/lib/LTO/CMakeLists.txt
+++ b/lib/LTO/CMakeLists.txt
@@ -1,52 +1,3 @@
-# Figure out if we can track VC revisions.
-function(find_first_existing_file out_var)
- foreach(file ${ARGN})
- if(EXISTS "${file}")
- set(${out_var} "${file}" PARENT_SCOPE)
- return()
- endif()
- endforeach()
-endfunction()
-
-macro(find_first_existing_vc_file out_var path)
- find_first_existing_file(${out_var}
- "${path}/.git/logs/HEAD" # Git
- "${path}/.svn/wc.db" # SVN 1.7
- "${path}/.svn/entries" # SVN 1.6
- )
-endmacro()
-
-find_first_existing_vc_file(llvm_vc "${LLVM_MAIN_SRC_DIR}")
-
-# The VC revision include that we want to generate.
-set(version_inc "${CMAKE_CURRENT_BINARY_DIR}/LLVMLTORevision.h")
-
-set(get_svn_script "${LLVM_CMAKE_PATH}/GenerateVersionFromCVS.cmake")
-
-if(DEFINED llvm_vc)
- # Create custom target to generate the VC revision include.
- add_custom_command(OUTPUT "${version_inc}"
- DEPENDS "${llvm_vc}" "${get_svn_script}"
- COMMAND
- ${CMAKE_COMMAND} "-DSOURCE_DIR=${LLVM_MAIN_SRC_DIR}"
- "-DNAME=LLVM_REVISION"
- "-DHEADER_FILE=${version_inc}"
- -P "${get_svn_script}")
-
- # Mark the generated header as being generated.
- set_source_files_properties("${version_inc}"
- PROPERTIES GENERATED TRUE
- HEADER_FILE_ONLY TRUE)
-
- # Tell Version.cpp that it needs to build with -DHAVE_SVN_VERSION_INC.
- set_source_files_properties(Version.cpp
- PROPERTIES COMPILE_DEFINITIONS "HAVE_SVN_VERSION_INC")
-else()
- # Not producing a VC revision include.
- set(version_inc)
-endif()
-
-
add_llvm_library(LLVMLTO
Caching.cpp
LTO.cpp
@@ -55,11 +6,11 @@ add_llvm_library(LLVMLTO
LTOCodeGenerator.cpp
UpdateCompilerUsed.cpp
ThinLTOCodeGenerator.cpp
- ${version_inc}
ADDITIONAL_HEADER_DIRS
${LLVM_MAIN_INCLUDE_DIR}/llvm/LTO
DEPENDS
intrinsics_gen
+ llvm_vcsrevision_h
)
diff --git a/lib/LTO/Caching.cpp b/lib/LTO/Caching.cpp
index fd5bdb0bc01a..e32e46c4c3c8 100644
--- a/lib/LTO/Caching.cpp
+++ b/lib/LTO/Caching.cpp
@@ -13,6 +13,7 @@
#include "llvm/LTO/Caching.h"
#include "llvm/ADT/StringExtras.h"
+#include "llvm/Support/Errc.h"
#include "llvm/Support/FileSystem.h"
#include "llvm/Support/MemoryBuffer.h"
#include "llvm/Support/Path.h"
@@ -21,70 +22,71 @@
using namespace llvm;
using namespace llvm::lto;
-static void commitEntry(StringRef TempFilename, StringRef EntryPath) {
- // Rename to final destination (hopefully race condition won't matter here)
- auto EC = sys::fs::rename(TempFilename, EntryPath);
- if (EC) {
- // Renaming failed, probably not the same filesystem, copy and delete.
- // FIXME: Avoid needing to do this by creating the temporary file in the
- // cache directory.
- {
- auto ReloadedBufferOrErr = MemoryBuffer::getFile(TempFilename);
- if (auto EC = ReloadedBufferOrErr.getError())
- report_fatal_error(Twine("Failed to open temp file '") + TempFilename +
- "': " + EC.message() + "\n");
+Expected<NativeObjectCache> lto::localCache(StringRef CacheDirectoryPath,
+ AddBufferFn AddBuffer) {
+ if (std::error_code EC = sys::fs::create_directories(CacheDirectoryPath))
+ return errorCodeToError(EC);
- raw_fd_ostream OS(EntryPath, EC, sys::fs::F_None);
- if (EC)
- report_fatal_error(Twine("Failed to open ") + EntryPath +
- " to save cached entry\n");
- // I'm not sure what are the guarantee if two processes are doing this
- // at the same time.
- OS << (*ReloadedBufferOrErr)->getBuffer();
- }
- sys::fs::remove(TempFilename);
- }
-}
-
-NativeObjectCache lto::localCache(std::string CacheDirectoryPath,
- AddFileFn AddFile) {
return [=](unsigned Task, StringRef Key) -> AddStreamFn {
- // First, see if we have a cache hit.
+ // This choice of file name allows the cache to be pruned (see pruneCache()
+ // in include/llvm/Support/CachePruning.h).
SmallString<64> EntryPath;
- sys::path::append(EntryPath, CacheDirectoryPath, Key);
- if (sys::fs::exists(EntryPath)) {
- AddFile(Task, EntryPath);
+ sys::path::append(EntryPath, CacheDirectoryPath, "llvmcache-" + Key);
+ // First, see if we have a cache hit.
+ ErrorOr<std::unique_ptr<MemoryBuffer>> MBOrErr =
+ MemoryBuffer::getFile(EntryPath);
+ if (MBOrErr) {
+ AddBuffer(Task, std::move(*MBOrErr));
return AddStreamFn();
}
+ if (MBOrErr.getError() != errc::no_such_file_or_directory)
+ report_fatal_error(Twine("Failed to open cache file ") + EntryPath +
+ ": " + MBOrErr.getError().message() + "\n");
+
    // This native object stream is responsible for committing the resulting
- // file to the cache and calling AddFile to add it to the link.
+ // file to the cache and calling AddBuffer to add it to the link.
struct CacheStream : NativeObjectStream {
- AddFileFn AddFile;
+ AddBufferFn AddBuffer;
std::string TempFilename;
std::string EntryPath;
unsigned Task;
- CacheStream(std::unique_ptr<raw_pwrite_stream> OS, AddFileFn AddFile,
+ CacheStream(std::unique_ptr<raw_pwrite_stream> OS, AddBufferFn AddBuffer,
std::string TempFilename, std::string EntryPath,
unsigned Task)
- : NativeObjectStream(std::move(OS)), AddFile(AddFile),
- TempFilename(TempFilename), EntryPath(EntryPath), Task(Task) {}
+ : NativeObjectStream(std::move(OS)), AddBuffer(std::move(AddBuffer)),
+ TempFilename(std::move(TempFilename)),
+ EntryPath(std::move(EntryPath)), Task(Task) {}
~CacheStream() {
+ // FIXME: This code could race with the cache pruner, but it is unlikely
+ // that the cache pruner will choose to remove a newly created file.
+
// Make sure the file is closed before committing it.
OS.reset();
- commitEntry(TempFilename, EntryPath);
- AddFile(Task, EntryPath);
+ // This is atomic on POSIX systems.
+ if (auto EC = sys::fs::rename(TempFilename, EntryPath))
+ report_fatal_error(Twine("Failed to rename temporary file ") +
+ TempFilename + ": " + EC.message() + "\n");
+
+ ErrorOr<std::unique_ptr<MemoryBuffer>> MBOrErr =
+ MemoryBuffer::getFile(EntryPath);
+ if (!MBOrErr)
+ report_fatal_error(Twine("Failed to open cache file ") + EntryPath +
+ ": " + MBOrErr.getError().message() + "\n");
+ AddBuffer(Task, std::move(*MBOrErr));
}
};
return [=](size_t Task) -> std::unique_ptr<NativeObjectStream> {
      // Write to a temporary to avoid a race condition
int TempFD;
- SmallString<64> TempFilename;
+ SmallString<64> TempFilenameModel, TempFilename;
+    sys::path::append(TempFilenameModel, CacheDirectoryPath,
+                      "Thin-%%%%%%.tmp.o");
std::error_code EC =
- sys::fs::createTemporaryFile("Thin", "tmp.o", TempFD, TempFilename);
+ sys::fs::createUniqueFile(TempFilenameModel, TempFD, TempFilename,
+ sys::fs::owner_read | sys::fs::owner_write);
if (EC) {
errs() << "Error: " << EC.message() << "\n";
report_fatal_error("ThinLTO: Can't get a temporary file");
@@ -93,7 +95,7 @@ NativeObjectCache lto::localCache(std::string CacheDirectoryPath,
// This CacheStream will move the temporary file into the cache when done.
return llvm::make_unique<CacheStream>(
llvm::make_unique<raw_fd_ostream>(TempFD, /* ShouldClose */ true),
- AddFile, TempFilename.str(), EntryPath.str(), Task);
+ AddBuffer, TempFilename.str(), EntryPath.str(), Task);
};
};
}
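
A standalone sketch of the commit discipline the new CacheStream destructor depends on, assuming C++17 <filesystem> rather than llvm::sys::fs: write to a temporary in the same directory, close it, then rename it into place so readers never see a partial entry.

    #include <cstdio>
    #include <filesystem>
    #include <fstream>
    #include <string>

    namespace fs = std::filesystem;

    // On POSIX the rename atomically replaces the target, so a concurrent
    // reader never observes a half-written cache entry.
    static void commitAtomically(const fs::path &Final, const std::string &Data) {
      fs::path Tmp = Final;
      Tmp += ".tmp";
      {
        std::ofstream OS(Tmp, std::ios::binary);
        OS << Data;
      } // stream closed before the rename, as in the destructor above
      fs::rename(Tmp, Final);
    }

    int main() {
      commitAtomically("llvmcache-demo", "object bytes");
      std::remove("llvmcache-demo"); // clean up the demo entry
    }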
diff --git a/lib/LTO/LTO.cpp b/lib/LTO/LTO.cpp
index e3e2f9f806c8..9782c898bf50 100644
--- a/lib/LTO/LTO.cpp
+++ b/lib/LTO/LTO.cpp
@@ -20,9 +20,13 @@
#include "llvm/IR/AutoUpgrade.h"
#include "llvm/IR/DiagnosticPrinter.h"
#include "llvm/IR/LegacyPassManager.h"
+#include "llvm/IR/Mangler.h"
+#include "llvm/IR/Metadata.h"
#include "llvm/LTO/LTOBackend.h"
#include "llvm/Linker/IRMover.h"
+#include "llvm/Object/IRObjectFile.h"
#include "llvm/Object/ModuleSummaryIndexObjectFile.h"
+#include "llvm/Support/Error.h"
#include "llvm/Support/ManagedStatic.h"
#include "llvm/Support/MemoryBuffer.h"
#include "llvm/Support/Path.h"
@@ -31,6 +35,7 @@
#include "llvm/Support/TargetRegistry.h"
#include "llvm/Support/ThreadPool.h"
#include "llvm/Support/Threading.h"
+#include "llvm/Support/VCSRevision.h"
#include "llvm/Support/raw_ostream.h"
#include "llvm/Target/TargetMachine.h"
#include "llvm/Target/TargetOptions.h"
@@ -46,6 +51,12 @@ using namespace object;
#define DEBUG_TYPE "lto"
+// The values are (type identifier, summary) pairs.
+typedef DenseMap<
+ GlobalValue::GUID,
+ TinyPtrVector<const std::pair<const std::string, TypeIdSummary> *>>
+ TypeIdSummariesByGuidTy;
+
// Returns a unique hash for the Module considering the current list of
// export/import and other global analysis results.
// The hash is produced in \p Key.
@@ -54,7 +65,8 @@ static void computeCacheKey(
StringRef ModuleID, const FunctionImporter::ImportMapTy &ImportList,
const FunctionImporter::ExportSetTy &ExportList,
const std::map<GlobalValue::GUID, GlobalValue::LinkageTypes> &ResolvedODR,
- const GVSummaryMapTy &DefinedGlobals) {
+ const GVSummaryMapTy &DefinedGlobals,
+ const TypeIdSummariesByGuidTy &TypeIdSummariesByGuid) {
// Compute the unique hash for this entry.
// This is based on the current compiler version, the module itself, the
// export list, the hash for every single module in the import list, the
@@ -63,7 +75,7 @@ static void computeCacheKey(
// Start with the compiler revision
Hasher.update(LLVM_VERSION_STRING);
-#ifdef HAVE_LLVM_REVISION
+#ifdef LLVM_REVISION
Hasher.update(LLVM_REVISION);
#endif
@@ -80,6 +92,18 @@ static void computeCacheKey(
Data[3] = I >> 24;
Hasher.update(ArrayRef<uint8_t>{Data, 4});
};
+ auto AddUint64 = [&](uint64_t I) {
+ uint8_t Data[8];
+ Data[0] = I;
+ Data[1] = I >> 8;
+ Data[2] = I >> 16;
+ Data[3] = I >> 24;
+ Data[4] = I >> 32;
+ Data[5] = I >> 40;
+ Data[6] = I >> 48;
+ Data[7] = I >> 56;
+ Hasher.update(ArrayRef<uint8_t>{Data, 8});
+ };
AddString(Conf.CPU);
// FIXME: Hash more of Options. For now all clients initialize Options from
// command-line flags (which is unsupported in production), but may set
@@ -94,6 +118,7 @@ static void computeCacheKey(
AddUnsigned(Conf.RelocModel);
AddUnsigned(Conf.CodeModel);
AddUnsigned(Conf.CGOptLevel);
+ AddUnsigned(Conf.CGFileType);
AddUnsigned(Conf.OptLevel);
AddString(Conf.OptPipeline);
AddString(Conf.AAPipeline);
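
AddUint64 above serializes bytes in a fixed order before hashing; a self-contained illustration with a toy FNV-1a hash (the patch feeds these bytes to its Hasher) of why that keeps cache keys endian-stable:

    #include <cassert>
    #include <cstdint>

    // Mix one byte into a toy FNV-1a state.
    static void addByte(uint64_t &H, uint8_t B) {
      H = (H ^ B) * 1099511628211ULL;
    }

    // Feed a uint64 low byte first, mirroring AddUint64 in the patch, so the
    // resulting key does not depend on the host's in-memory byte order.
    static void addUint64(uint64_t &H, uint64_t I) {
      for (int Shift = 0; Shift < 64; Shift += 8)
        addByte(H, static_cast<uint8_t>(I >> Shift));
    }

    int main() {
      uint64_t A = 14695981039346656037ULL; // FNV offset basis
      uint64_t B = A;
      addUint64(A, 0x0102030405060708ULL);
      for (uint8_t Byte : {0x08, 0x07, 0x06, 0x05, 0x04, 0x03, 0x02, 0x01})
        addByte(B, Byte);
      assert(A == B); // matches an explicit little-endian byte stream
    }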
@@ -107,10 +132,16 @@ static void computeCacheKey(
// The export list can impact the internalization, be conservative here
Hasher.update(ArrayRef<uint8_t>((uint8_t *)&F, sizeof(F)));
- // Include the hash for every module we import functions from
+ // Include the hash for every module we import functions from. The set of
+ // imported symbols for each module may affect code generation and is
+ // sensitive to link order, so include that as well.
for (auto &Entry : ImportList) {
auto ModHash = Index.getModuleHash(Entry.first());
Hasher.update(ArrayRef<uint8_t>((uint8_t *)&ModHash[0], sizeof(ModHash)));
+
+ AddUint64(Entry.second.size());
+ for (auto &Fn : Entry.second)
+ AddUint64(Fn.first);
}
// Include the hash for the resolved ODR.
@@ -121,12 +152,68 @@ static void computeCacheKey(
sizeof(GlobalValue::LinkageTypes)));
}
+ std::set<GlobalValue::GUID> UsedTypeIds;
+
+ auto AddUsedTypeIds = [&](GlobalValueSummary *GS) {
+ auto *FS = dyn_cast_or_null<FunctionSummary>(GS);
+ if (!FS)
+ return;
+ for (auto &TT : FS->type_tests())
+ UsedTypeIds.insert(TT);
+ for (auto &TT : FS->type_test_assume_vcalls())
+ UsedTypeIds.insert(TT.GUID);
+ for (auto &TT : FS->type_checked_load_vcalls())
+ UsedTypeIds.insert(TT.GUID);
+ for (auto &TT : FS->type_test_assume_const_vcalls())
+ UsedTypeIds.insert(TT.VFunc.GUID);
+ for (auto &TT : FS->type_checked_load_const_vcalls())
+ UsedTypeIds.insert(TT.VFunc.GUID);
+ };
+
// Include the hash for the linkage type to reflect internalization and weak
- // resolution.
+ // resolution, and collect any used type identifier resolutions.
for (auto &GS : DefinedGlobals) {
GlobalValue::LinkageTypes Linkage = GS.second->linkage();
Hasher.update(
ArrayRef<uint8_t>((const uint8_t *)&Linkage, sizeof(Linkage)));
+ AddUsedTypeIds(GS.second);
+ }
+
+ // Imported functions may introduce new uses of type identifier resolutions,
+ // so we need to collect their used resolutions as well.
+ for (auto &ImpM : ImportList)
+ for (auto &ImpF : ImpM.second)
+ AddUsedTypeIds(Index.findSummaryInModule(ImpF.first, ImpM.first()));
+
+ auto AddTypeIdSummary = [&](StringRef TId, const TypeIdSummary &S) {
+ AddString(TId);
+
+ AddUnsigned(S.TTRes.TheKind);
+ AddUnsigned(S.TTRes.SizeM1BitWidth);
+
+ AddUint64(S.WPDRes.size());
+ for (auto &WPD : S.WPDRes) {
+ AddUnsigned(WPD.first);
+ AddUnsigned(WPD.second.TheKind);
+ AddString(WPD.second.SingleImplName);
+
+ AddUint64(WPD.second.ResByArg.size());
+ for (auto &ByArg : WPD.second.ResByArg) {
+ AddUint64(ByArg.first.size());
+ for (uint64_t Arg : ByArg.first)
+ AddUint64(Arg);
+ AddUnsigned(ByArg.second.TheKind);
+ AddUint64(ByArg.second.Info);
+ }
+ }
+ };
+
+ // Include the hash for all type identifiers used by this module.
+ for (GlobalValue::GUID TId : UsedTypeIds) {
+ auto SummariesI = TypeIdSummariesByGuid.find(TId);
+ if (SummariesI != TypeIdSummariesByGuid.end())
+ for (auto *Summary : SummariesI->second)
+ AddTypeIdSummary(Summary->first, Summary->second);
}
if (!Conf.SampleProfile.empty()) {
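
A hypothetical standalone miniature of the GUID-to-summaries index used above (std::unordered_map standing in for DenseMap, std::hash for the GUID function): build the index once, then each cache-key computation is a lookup rather than a rehash.

    #include <cassert>
    #include <cstddef>
    #include <functional>
    #include <string>
    #include <unordered_map>
    #include <utility>
    #include <vector>

    using Entry = std::pair<const std::string, int>; // int stands in for a summary

    int main() {
      std::vector<Entry> TypeIds = {{"typeid.a", 1}, {"typeid.b", 2}};
      std::unordered_map<std::size_t, std::vector<const Entry *>> ByGuid;
      auto Guid = [](const std::string &S) { return std::hash<std::string>{}(S); };
      for (const Entry &E : TypeIds)
        ByGuid[Guid(E.first)].push_back(&E);
      assert(ByGuid[Guid("typeid.a")].front()->second == 1);
    }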
@@ -164,9 +251,7 @@ static void thinLTOResolveWeakForLinkerGUID(
}
// Alias and aliasee can't be turned into available_externally.
else if (!isa<AliasSummary>(S.get()) &&
- !GlobalInvolvedWithAlias.count(S.get()) &&
- (GlobalValue::isLinkOnceODRLinkage(OriginalLinkage) ||
- GlobalValue::isWeakODRLinkage(OriginalLinkage)))
+ !GlobalInvolvedWithAlias.count(S.get()))
S->setLinkage(GlobalValue::AvailableExternallyLinkage);
if (S->linkage() != OriginalLinkage)
recordNewLinkage(S->modulePath(), GUID, S->linkage());
@@ -220,14 +305,6 @@ void llvm::thinLTOInternalizeAndPromoteInIndex(
thinLTOInternalizeAndPromoteGUID(I.second, I.first, isExported);
}
-struct InputFile::InputModule {
- BitcodeModule BM;
- std::unique_ptr<Module> Mod;
-
- // The range of ModuleSymbolTable entries for this input module.
- size_t SymBegin, SymEnd;
-};
-
-// Requires a destructor for std::vector<InputModule>.
+// Requires a destructor for std::vector<BitcodeModule>.
InputFile::~InputFile() = default;
@@ -248,61 +325,52 @@ Expected<std::unique_ptr<InputFile>> InputFile::create(MemoryBufferRef Object) {
return make_error<StringError>("Bitcode file does not contain any modules",
inconvertibleErrorCode());
- // Create an InputModule for each module in the InputFile, and add it to the
- // ModuleSymbolTable.
+ File->Mods = *BMsOrErr;
+
+ LLVMContext Ctx;
+ std::vector<Module *> Mods;
+ std::vector<std::unique_ptr<Module>> OwnedMods;
for (auto BM : *BMsOrErr) {
Expected<std::unique_ptr<Module>> MOrErr =
- BM.getLazyModule(File->Ctx, /*ShouldLazyLoadMetadata*/ true,
+ BM.getLazyModule(Ctx, /*ShouldLazyLoadMetadata*/ true,
/*IsImporting*/ false);
if (!MOrErr)
return MOrErr.takeError();
- size_t SymBegin = File->SymTab.symbols().size();
- File->SymTab.addModule(MOrErr->get());
- size_t SymEnd = File->SymTab.symbols().size();
+ if ((*MOrErr)->getDataLayoutStr().empty())
+ return make_error<StringError>("input module has no datalayout",
+ inconvertibleErrorCode());
- for (const auto &C : (*MOrErr)->getComdatSymbolTable()) {
- auto P = File->ComdatMap.insert(
- std::make_pair(&C.second, File->Comdats.size()));
- assert(P.second);
- (void)P;
- File->Comdats.push_back(C.first());
- }
+ Mods.push_back(MOrErr->get());
+ OwnedMods.push_back(std::move(*MOrErr));
+ }
- File->Mods.push_back({BM, std::move(*MOrErr), SymBegin, SymEnd});
+ SmallVector<char, 0> Symtab;
+ if (Error E = irsymtab::build(Mods, Symtab, File->Strtab))
+ return std::move(E);
+
+ irsymtab::Reader R({Symtab.data(), Symtab.size()},
+ {File->Strtab.data(), File->Strtab.size()});
+ File->TargetTriple = R.getTargetTriple();
+ File->SourceFileName = R.getSourceFileName();
+ File->COFFLinkerOpts = R.getCOFFLinkerOpts();
+ File->ComdatTable = R.getComdatTable();
+
+ for (unsigned I = 0; I != Mods.size(); ++I) {
+ size_t Begin = File->Symbols.size();
+ for (const irsymtab::Reader::SymbolRef &Sym : R.module_symbols(I))
+ // Skip symbols that are irrelevant to LTO. Note that this condition needs
+ // to match the one in Skip() in LTO::addRegularLTO().
+ if (Sym.isGlobal() && !Sym.isFormatSpecific())
+ File->Symbols.push_back(Sym);
+ File->ModuleSymIndices.push_back({Begin, File->Symbols.size()});
}
return std::move(File);
}
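
The [Begin, End) bookkeeping at the end of InputFile::create, reduced to a self-contained sketch: filter each module's symbols into one flat vector and record the covered range, as ModuleSymIndices does above.

    #include <cassert>
    #include <cstddef>
    #include <utility>
    #include <vector>

    int main() {
      std::vector<std::vector<int>> Modules = {{1, -2, 3}, {-4, 5}};
      std::vector<int> Symbols;
      std::vector<std::pair<std::size_t, std::size_t>> Ranges;
      for (const auto &M : Modules) {
        std::size_t Begin = Symbols.size();
        for (int S : M)
          if (S > 0) // stand-in for "global and not format-specific"
            Symbols.push_back(S);
        Ranges.push_back({Begin, Symbols.size()});
      }
      assert((Ranges[0] == std::pair<std::size_t, std::size_t>{0, 2}));
      assert((Ranges[1] == std::pair<std::size_t, std::size_t>{2, 3}));
    }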
-Expected<int> InputFile::Symbol::getComdatIndex() const {
- if (!isGV())
- return -1;
- const GlobalObject *GO = getGV()->getBaseObject();
- if (!GO)
- return make_error<StringError>("Unable to determine comdat of alias!",
- inconvertibleErrorCode());
- if (const Comdat *C = GO->getComdat()) {
- auto I = File->ComdatMap.find(C);
- assert(I != File->ComdatMap.end());
- return I->second;
- }
- return -1;
-}
-
StringRef InputFile::getName() const {
- return Mods[0].BM.getModuleIdentifier();
-}
-
-StringRef InputFile::getSourceFileName() const {
- return Mods[0].Mod->getSourceFileName();
-}
-
-iterator_range<InputFile::symbol_iterator>
-InputFile::module_symbols(InputModule &IM) {
- return llvm::make_range(
- symbol_iterator(SymTab.symbols().data() + IM.SymBegin, SymTab, this),
- symbol_iterator(SymTab.symbols().data() + IM.SymEnd, SymTab, this));
+ return Mods[0].getModuleIdentifier();
}
LTO::RegularLTOState::RegularLTOState(unsigned ParallelCodeGenParallelismLevel,
@@ -326,21 +394,17 @@ LTO::LTO(Config Conf, ThinBackend Backend,
LTO::~LTO() = default;
// Add the given symbol to the GlobalResolutions map, and resolve its partition.
-void LTO::addSymbolToGlobalRes(SmallPtrSet<GlobalValue *, 8> &Used,
- const InputFile::Symbol &Sym,
+void LTO::addSymbolToGlobalRes(const InputFile::Symbol &Sym,
SymbolResolution Res, unsigned Partition) {
- GlobalValue *GV = Sym.isGV() ? Sym.getGV() : nullptr;
-
auto &GlobalRes = GlobalResolutions[Sym.getName()];
- if (GV) {
- GlobalRes.UnnamedAddr &= GV->hasGlobalUnnamedAddr();
- if (Res.Prevailing)
- GlobalRes.IRName = GV->getName();
- }
+ GlobalRes.UnnamedAddr &= Sym.isUnnamedAddr();
+ if (Res.Prevailing)
+ GlobalRes.IRName = Sym.getIRName();
+
// Set the partition to external if we know it is used elsewhere, e.g.
// it is visible to a regular object, is referenced from llvm.compiler_used,
// or was already recorded as being referenced from a different partition.
- if (Res.VisibleToRegularObj || (GV && Used.count(GV)) ||
+ if (Res.VisibleToRegularObj || Sym.isUsed() ||
(GlobalRes.Partition != GlobalResolution::Unknown &&
GlobalRes.Partition != Partition)) {
GlobalRes.Partition = GlobalResolution::External;
@@ -372,6 +436,7 @@ static void writeToResolutionFile(raw_ostream &OS, InputFile *Input,
OS << 'x';
OS << '\n';
}
+ OS.flush();
assert(ResI == Res.end());
}
@@ -383,41 +448,32 @@ Error LTO::add(std::unique_ptr<InputFile> Input,
writeToResolutionFile(*Conf.ResolutionFile, Input.get(), Res);
const SymbolResolution *ResI = Res.begin();
- for (InputFile::InputModule &IM : Input->Mods)
- if (Error Err = addModule(*Input, IM, ResI, Res.end()))
+ for (unsigned I = 0; I != Input->Mods.size(); ++I)
+ if (Error Err = addModule(*Input, I, ResI, Res.end()))
return Err;
assert(ResI == Res.end());
return Error::success();
}
-Error LTO::addModule(InputFile &Input, InputFile::InputModule &IM,
+Error LTO::addModule(InputFile &Input, unsigned ModI,
const SymbolResolution *&ResI,
const SymbolResolution *ResE) {
- // FIXME: move to backend
- Module &M = *IM.Mod;
-
- if (M.getDataLayoutStr().empty())
- return make_error<StringError>("input module has no datalayout",
- inconvertibleErrorCode());
-
- if (!Conf.OverrideTriple.empty())
- M.setTargetTriple(Conf.OverrideTriple);
- else if (M.getTargetTriple().empty())
- M.setTargetTriple(Conf.DefaultTriple);
-
- Expected<bool> HasThinLTOSummary = IM.BM.hasSummary();
+ Expected<bool> HasThinLTOSummary = Input.Mods[ModI].hasSummary();
if (!HasThinLTOSummary)
return HasThinLTOSummary.takeError();
+ auto ModSyms = Input.module_symbols(ModI);
if (*HasThinLTOSummary)
- return addThinLTO(IM.BM, M, Input.module_symbols(IM), ResI, ResE);
+ return addThinLTO(Input.Mods[ModI], ModSyms, ResI, ResE);
else
- return addRegularLTO(IM.BM, ResI, ResE);
+ return addRegularLTO(Input.Mods[ModI], ModSyms, ResI, ResE);
}
// Add a regular LTO object to the link.
-Error LTO::addRegularLTO(BitcodeModule BM, const SymbolResolution *&ResI,
+Error LTO::addRegularLTO(BitcodeModule BM,
+ ArrayRef<InputFile::Symbol> Syms,
+ const SymbolResolution *&ResI,
const SymbolResolution *ResE) {
if (!RegularLTO.CombinedModule) {
RegularLTO.CombinedModule =
@@ -438,47 +494,84 @@ Error LTO::addRegularLTO(BitcodeModule BM, const SymbolResolution *&ResI,
ModuleSymbolTable SymTab;
SymTab.addModule(&M);
- SmallPtrSet<GlobalValue *, 8> Used;
- collectUsedGlobalVariables(M, Used, /*CompilerUsed*/ false);
-
std::vector<GlobalValue *> Keep;
for (GlobalVariable &GV : M.globals())
if (GV.hasAppendingLinkage())
Keep.push_back(&GV);
- for (const InputFile::Symbol &Sym :
- make_range(InputFile::symbol_iterator(SymTab.symbols().begin(), SymTab,
- nullptr),
- InputFile::symbol_iterator(SymTab.symbols().end(), SymTab,
- nullptr))) {
+ DenseSet<GlobalObject *> AliasedGlobals;
+ for (auto &GA : M.aliases())
+ if (GlobalObject *GO = GA.getBaseObject())
+ AliasedGlobals.insert(GO);
+
+ // In this function we need IR GlobalValues matching the symbols in Syms
+ // (which is not backed by a module), so we need to enumerate them in the same
+ // order. The symbol enumeration order of a ModuleSymbolTable intentionally
+ // matches the order of an irsymtab, but when we read the irsymtab in
+ // InputFile::create we omit some symbols that are irrelevant to LTO. The
+ // Skip() function skips the same symbols from the module as InputFile does
+ // from the symbol table.
+ auto MsymI = SymTab.symbols().begin(), MsymE = SymTab.symbols().end();
+ auto Skip = [&]() {
+ while (MsymI != MsymE) {
+ auto Flags = SymTab.getSymbolFlags(*MsymI);
+ if ((Flags & object::BasicSymbolRef::SF_Global) &&
+ !(Flags & object::BasicSymbolRef::SF_FormatSpecific))
+ return;
+ ++MsymI;
+ }
+ };
+ Skip();
+
+ for (const InputFile::Symbol &Sym : Syms) {
assert(ResI != ResE);
SymbolResolution Res = *ResI++;
- addSymbolToGlobalRes(Used, Sym, Res, 0);
-
- if (Sym.getFlags() & object::BasicSymbolRef::SF_Undefined)
- continue;
- if (Res.Prevailing && Sym.isGV()) {
- GlobalValue *GV = Sym.getGV();
- Keep.push_back(GV);
- switch (GV->getLinkage()) {
- default:
- break;
- case GlobalValue::LinkOnceAnyLinkage:
- GV->setLinkage(GlobalValue::WeakAnyLinkage);
- break;
- case GlobalValue::LinkOnceODRLinkage:
- GV->setLinkage(GlobalValue::WeakODRLinkage);
- break;
+ addSymbolToGlobalRes(Sym, Res, 0);
+
+ assert(MsymI != MsymE);
+ ModuleSymbolTable::Symbol Msym = *MsymI++;
+ Skip();
+
+ if (GlobalValue *GV = Msym.dyn_cast<GlobalValue *>()) {
+ if (Res.Prevailing) {
+ if (Sym.isUndefined())
+ continue;
+ Keep.push_back(GV);
+ switch (GV->getLinkage()) {
+ default:
+ break;
+ case GlobalValue::LinkOnceAnyLinkage:
+ GV->setLinkage(GlobalValue::WeakAnyLinkage);
+ break;
+ case GlobalValue::LinkOnceODRLinkage:
+ GV->setLinkage(GlobalValue::WeakODRLinkage);
+ break;
+ }
+ } else if (isa<GlobalObject>(GV) &&
+ (GV->hasLinkOnceODRLinkage() || GV->hasWeakODRLinkage() ||
+ GV->hasAvailableExternallyLinkage()) &&
+ !AliasedGlobals.count(cast<GlobalObject>(GV))) {
+ // Either of the above three types of linkage indicates that the
+ // chosen prevailing symbol will have the same semantics as this copy of
+ // the symbol, so we can link it with available_externally linkage. We
+ // only need to do this if the symbol is undefined.
+ GlobalValue *CombinedGV =
+ RegularLTO.CombinedModule->getNamedValue(GV->getName());
+ if (!CombinedGV || CombinedGV->isDeclaration()) {
+ Keep.push_back(GV);
+ GV->setLinkage(GlobalValue::AvailableExternallyLinkage);
+ cast<GlobalObject>(GV)->setComdat(nullptr);
+ }
}
}
// Common resolution: collect the maximum size/alignment over all commons.
// We also record if we see an instance of a common as prevailing, so that
// if none is prevailing we can ignore it later.
- if (Sym.getFlags() & object::BasicSymbolRef::SF_Common) {
+ if (Sym.isCommon()) {
// FIXME: We should figure out what to do about commons defined by asm.
// For now they aren't reported correctly by ModuleSymbolTable.
- auto &CommonRes = RegularLTO.Commons[Sym.getGV()->getName()];
+ auto &CommonRes = RegularLTO.Commons[Sym.getIRName()];
CommonRes.Size = std::max(CommonRes.Size, Sym.getCommonSize());
CommonRes.Align = std::max(CommonRes.Align, Sym.getCommonAlignment());
CommonRes.Prevailing |= Res.Prevailing;
@@ -486,23 +579,18 @@ Error LTO::addRegularLTO(BitcodeModule BM, const SymbolResolution *&ResI,
// FIXME: use proposed local attribute for FinalDefinitionInLinkageUnit.
}
+ assert(MsymI == MsymE);
return RegularLTO.Mover->move(std::move(*MOrErr), Keep,
[](GlobalValue &, IRMover::ValueAdder) {},
- /* LinkModuleInlineAsm */ true,
/* IsPerformingImport */ false);
}
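
The Skip() technique above keeps two enumerations of the same symbols aligned; here is a standalone sketch of the idea with plain integers, negatives standing in for the format-specific symbols the irsymtab omitted:

    #include <cassert>
    #include <vector>

    int main() {
      std::vector<int> Full = {1, -7, 2, -9, 3}; // negatives were filtered out
      std::vector<int> Kept = {1, 2, 3};
      auto It = Full.begin();
      auto Skip = [&] { while (It != Full.end() && *It < 0) ++It; };
      Skip();
      for (int K : Kept) {
        assert(It != Full.end() && *It == K); // enumeration orders must match
        ++It;
        Skip();
      }
      assert(It == Full.end());
    }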
// Add a ThinLTO object to the link.
-// FIXME: This function should not need to take as many parameters once we have
-// a bitcode symbol table.
-Error LTO::addThinLTO(BitcodeModule BM, Module &M,
- iterator_range<InputFile::symbol_iterator> Syms,
+Error LTO::addThinLTO(BitcodeModule BM,
+ ArrayRef<InputFile::Symbol> Syms,
const SymbolResolution *&ResI,
const SymbolResolution *ResE) {
- SmallPtrSet<GlobalValue *, 8> Used;
- collectUsedGlobalVariables(M, Used, /*CompilerUsed*/ false);
-
Expected<std::unique_ptr<ModuleSummaryIndex>> SummaryOrErr = BM.getSummary();
if (!SummaryOrErr)
return SummaryOrErr.takeError();
@@ -512,11 +600,15 @@ Error LTO::addThinLTO(BitcodeModule BM, Module &M,
for (const InputFile::Symbol &Sym : Syms) {
assert(ResI != ResE);
SymbolResolution Res = *ResI++;
- addSymbolToGlobalRes(Used, Sym, Res, ThinLTO.ModuleMap.size() + 1);
+ addSymbolToGlobalRes(Sym, Res, ThinLTO.ModuleMap.size() + 1);
- if (Res.Prevailing && Sym.isGV())
- ThinLTO.PrevailingModuleForGUID[Sym.getGV()->getGUID()] =
- BM.getModuleIdentifier();
+ if (Res.Prevailing) {
+ if (!Sym.getIRName().empty()) {
+ auto GUID = GlobalValue::getGUID(GlobalValue::getGlobalIdentifier(
+ Sym.getIRName(), GlobalValue::ExternalLinkage, ""));
+ ThinLTO.PrevailingModuleForGUID[GUID] = BM.getModuleIdentifier();
+ }
+ }
}
if (!ThinLTO.ModuleMap.insert({BM.getModuleIdentifier(), BM}).second)
@@ -602,7 +694,7 @@ Error LTO::runRegularLTO(AddStreamFn AddStream) {
return Error::success();
}
return backend(Conf, AddStream, RegularLTO.ParallelCodeGenParallelismLevel,
- std::move(RegularLTO.CombinedModule));
+ std::move(RegularLTO.CombinedModule), ThinLTO.CombinedIndex);
}
/// This class defines the interface to the ThinLTO backend.
@@ -633,6 +725,7 @@ class InProcessThinBackend : public ThinBackendProc {
ThreadPool BackendThreadPool;
AddStreamFn AddStream;
NativeObjectCache Cache;
+ TypeIdSummariesByGuidTy TypeIdSummariesByGuid;
Optional<Error> Err;
std::mutex ErrMu;
@@ -645,7 +738,14 @@ public:
AddStreamFn AddStream, NativeObjectCache Cache)
: ThinBackendProc(Conf, CombinedIndex, ModuleToDefinedGVSummaries),
BackendThreadPool(ThinLTOParallelismLevel),
- AddStream(std::move(AddStream)), Cache(std::move(Cache)) {}
+ AddStream(std::move(AddStream)), Cache(std::move(Cache)) {
+ // Create a mapping from type identifier GUIDs to type identifier summaries.
+ // This allows backends to use the type identifier GUIDs stored in the
+ // function summaries to determine which type identifier summaries affect
+ // each function without needing to compute GUIDs in each backend.
+ for (auto &TId : CombinedIndex.typeIds())
+ TypeIdSummariesByGuid[GlobalValue::getGUID(TId.first)].push_back(&TId);
+ }
Error runThinLTOBackendThread(
AddStreamFn AddStream, NativeObjectCache Cache, unsigned Task,
@@ -654,7 +754,8 @@ public:
const FunctionImporter::ExportSetTy &ExportList,
const std::map<GlobalValue::GUID, GlobalValue::LinkageTypes> &ResolvedODR,
const GVSummaryMapTy &DefinedGlobals,
- MapVector<StringRef, BitcodeModule> &ModuleMap) {
+ MapVector<StringRef, BitcodeModule> &ModuleMap,
+ const TypeIdSummariesByGuidTy &TypeIdSummariesByGuid) {
auto RunThinBackend = [&](AddStreamFn AddStream) {
LTOLLVMContext BackendContext(Conf);
Expected<std::unique_ptr<Module>> MOrErr = BM.parseModule(BackendContext);
@@ -677,7 +778,7 @@ public:
SmallString<40> Key;
// The module may be cached, this helps handling it.
computeCacheKey(Key, Conf, CombinedIndex, ModuleID, ImportList, ExportList,
- ResolvedODR, DefinedGlobals);
+ ResolvedODR, DefinedGlobals, TypeIdSummariesByGuid);
if (AddStreamFn CacheAddStream = Cache(Task, Key))
return RunThinBackend(CacheAddStream);
@@ -701,10 +802,11 @@ public:
const std::map<GlobalValue::GUID, GlobalValue::LinkageTypes>
&ResolvedODR,
const GVSummaryMapTy &DefinedGlobals,
- MapVector<StringRef, BitcodeModule> &ModuleMap) {
+ MapVector<StringRef, BitcodeModule> &ModuleMap,
+ const TypeIdSummariesByGuidTy &TypeIdSummariesByGuid) {
Error E = runThinLTOBackendThread(
- AddStream, Cache, Task, BM, CombinedIndex, ImportList,
- ExportList, ResolvedODR, DefinedGlobals, ModuleMap);
+ AddStream, Cache, Task, BM, CombinedIndex, ImportList, ExportList,
+ ResolvedODR, DefinedGlobals, ModuleMap, TypeIdSummariesByGuid);
if (E) {
std::unique_lock<std::mutex> L(ErrMu);
if (Err)
@@ -713,9 +815,9 @@ public:
Err = std::move(E);
}
},
- BM, std::ref(CombinedIndex), std::ref(ImportList),
- std::ref(ExportList), std::ref(ResolvedODR), std::ref(DefinedGlobals),
- std::ref(ModuleMap));
+ BM, std::ref(CombinedIndex), std::ref(ImportList), std::ref(ExportList),
+ std::ref(ResolvedODR), std::ref(DefinedGlobals), std::ref(ModuleMap),
+ std::ref(TypeIdSummariesByGuid));
return Error::success();
}
@@ -857,19 +959,6 @@ Error LTO::runThinLTO(AddStreamFn AddStream, NativeObjectCache Cache,
if (!ModuleToDefinedGVSummaries.count(Mod.first))
ModuleToDefinedGVSummaries.try_emplace(Mod.first);
- // Compute "dead" symbols, we don't want to import/export these!
- DenseSet<GlobalValue::GUID> GUIDPreservedSymbols;
- for (auto &Res : GlobalResolutions) {
- if (Res.second.VisibleOutsideThinLTO &&
- // IRName will be defined if we have seen the prevailing copy of
- // this value. If not, no need to preserve any ThinLTO copies.
- !Res.second.IRName.empty())
- GUIDPreservedSymbols.insert(GlobalValue::getGUID(Res.second.IRName));
- }
-
- auto DeadSymbols =
- computeDeadSymbols(ThinLTO.CombinedIndex, GUIDPreservedSymbols);
-
StringMap<FunctionImporter::ImportMapTy> ImportLists(
ThinLTO.ModuleMap.size());
StringMap<FunctionImporter::ExportSetTy> ExportLists(
@@ -877,6 +966,20 @@ Error LTO::runThinLTO(AddStreamFn AddStream, NativeObjectCache Cache,
StringMap<std::map<GlobalValue::GUID, GlobalValue::LinkageTypes>> ResolvedODR;
if (Conf.OptLevel > 0) {
+    // Compute "dead" symbols; we don't want to import/export these!
+ DenseSet<GlobalValue::GUID> GUIDPreservedSymbols;
+ for (auto &Res : GlobalResolutions) {
+ if (Res.second.VisibleOutsideThinLTO &&
+ // IRName will be defined if we have seen the prevailing copy of
+ // this value. If not, no need to preserve any ThinLTO copies.
+ !Res.second.IRName.empty())
+ GUIDPreservedSymbols.insert(GlobalValue::getGUID(
+ GlobalValue::getRealLinkageName(Res.second.IRName)));
+ }
+
+ auto DeadSymbols =
+ computeDeadSymbols(ThinLTO.CombinedIndex, GUIDPreservedSymbols);
+
ComputeCrossModuleImport(ThinLTO.CombinedIndex, ModuleToDefinedGVSummaries,
ImportLists, ExportLists, &DeadSymbols);
@@ -890,10 +993,11 @@ Error LTO::runThinLTO(AddStreamFn AddStream, NativeObjectCache Cache,
// partition (and we can't get the GUID).
if (Res.second.IRName.empty())
continue;
- auto GUID = GlobalValue::getGUID(Res.second.IRName);
+ auto GUID = GlobalValue::getGUID(
+ GlobalValue::getRealLinkageName(Res.second.IRName));
// Mark exported unless index-based analysis determined it to be dead.
if (!DeadSymbols.count(GUID))
- ExportedGUIDs.insert(GlobalValue::getGUID(Res.second.IRName));
+ ExportedGUIDs.insert(GUID);
}
auto isPrevailing = [&](GlobalValue::GUID GUID,
@@ -937,3 +1041,27 @@ Error LTO::runThinLTO(AddStreamFn AddStream, NativeObjectCache Cache,
return BackendProc->wait();
}
+
+Expected<std::unique_ptr<tool_output_file>>
+lto::setupOptimizationRemarks(LLVMContext &Context,
+ StringRef LTORemarksFilename,
+ bool LTOPassRemarksWithHotness, int Count) {
+ if (LTORemarksFilename.empty())
+ return nullptr;
+
+ std::string Filename = LTORemarksFilename;
+ if (Count != -1)
+ Filename += ".thin." + llvm::utostr(Count) + ".yaml";
+
+ std::error_code EC;
+ auto DiagnosticFile =
+ llvm::make_unique<tool_output_file>(Filename, EC, sys::fs::F_None);
+ if (EC)
+ return errorCodeToError(EC);
+ Context.setDiagnosticsOutputFile(
+ llvm::make_unique<yaml::Output>(DiagnosticFile->os()));
+ if (LTOPassRemarksWithHotness)
+ Context.setDiagnosticHotnessRequested(true);
+ DiagnosticFile->keep();
+ return std::move(DiagnosticFile);
+}
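
A usage sketch for the new shared helper follows; the file name and early-exit handling are illustrative, and the Count parameter (used by the ThinLTO driver to get one remarks file per task) is presumably defaulted in the header:

    LLVMContext Ctx;
    auto FileOrErr =
        lto::setupOptimizationRemarks(Ctx, "remarks.yaml",
                                      /*LTOPassRemarksWithHotness=*/true);
    if (!FileOrErr)
      return FileOrErr.takeError(); // Expected<> carries the open error.
    std::unique_ptr<tool_output_file> Remarks = std::move(*FileOrErr);
    // ... run passes; remarks are streamed as YAML into Remarks->os() ...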
diff --git a/lib/LTO/LTOBackend.cpp b/lib/LTO/LTOBackend.cpp
index 809db80bc916..4bd251f727a4 100644
--- a/lib/LTO/LTOBackend.cpp
+++ b/lib/LTO/LTOBackend.cpp
@@ -27,6 +27,7 @@
#include "llvm/LTO/LTO.h"
#include "llvm/LTO/legacy/UpdateCompilerUsed.h"
#include "llvm/MC/SubtargetFeature.h"
+#include "llvm/Object/ModuleSymbolTable.h"
#include "llvm/Passes/PassBuilder.h"
#include "llvm/Support/Error.h"
#include "llvm/Support/FileSystem.h"
@@ -42,6 +43,11 @@
using namespace llvm;
using namespace lto;
+static cl::opt<bool>
+ LTOUseNewPM("lto-use-new-pm",
+ cl::desc("Run LTO passes using the new pass manager"),
+ cl::init(false), cl::Hidden);
+
LLVM_ATTRIBUTE_NORETURN static void reportOpenError(StringRef Path, Twine Msg) {
errs() << "failed to open " << Path << ": " << Msg << '\n';
errs().flush();
@@ -124,6 +130,56 @@ createTargetMachine(Config &Conf, StringRef TheTriple,
Conf.CodeModel, Conf.CGOptLevel));
}
+static void runNewPMPasses(Module &Mod, TargetMachine *TM, unsigned OptLevel) {
+ PassBuilder PB(TM);
+ AAManager AA;
+
+ // Parse the default AA pipeline. Keep the call outside assert() so it
+ // still runs in NDEBUG builds.
+ bool AAParsed = PB.parseAAPipeline(AA, "default");
+ (void)AAParsed; assert(AAParsed && "failed to parse AA pipeline");
+
+ LoopAnalysisManager LAM;
+ FunctionAnalysisManager FAM;
+ CGSCCAnalysisManager CGAM;
+ ModuleAnalysisManager MAM;
+
+ // Register the AA manager first so that our version is the one used.
+ FAM.registerPass([&] { return std::move(AA); });
+
+ // Register all the basic analyses with the managers.
+ PB.registerModuleAnalyses(MAM);
+ PB.registerCGSCCAnalyses(CGAM);
+ PB.registerFunctionAnalyses(FAM);
+ PB.registerLoopAnalyses(LAM);
+ PB.crossRegisterProxies(LAM, FAM, CGAM, MAM);
+
+ ModulePassManager MPM;
+ // FIXME (davide): verify the input.
+
+ PassBuilder::OptimizationLevel OL;
+
+ switch (OptLevel) {
+ default:
+ llvm_unreachable("Invalid optimization level");
+ case 0:
+ OL = PassBuilder::O0;
+ break;
+ case 1:
+ OL = PassBuilder::O1;
+ break;
+ case 2:
+ OL = PassBuilder::O2;
+ break;
+ case 3:
+ OL = PassBuilder::O3;
+ break;
+ }
+
+ MPM = PB.buildLTODefaultPipeline(OL, false /* DebugLogging */);
+ MPM.run(Mod, MAM);
+
+ // FIXME (davide): verify the output.
+}
+
static void runNewPMCustomPasses(Module &Mod, TargetMachine *TM,
std::string PipelineDesc,
std::string AAPipelineDesc,
@@ -168,13 +224,16 @@ static void runNewPMCustomPasses(Module &Mod, TargetMachine *TM,
}
static void runOldPMPasses(Config &Conf, Module &Mod, TargetMachine *TM,
- bool IsThinLTO) {
+ bool IsThinLTO, ModuleSummaryIndex *ExportSummary,
+ const ModuleSummaryIndex *ImportSummary) {
legacy::PassManager passes;
passes.add(createTargetTransformInfoWrapperPass(TM->getTargetIRAnalysis()));
PassManagerBuilder PMB;
PMB.LibraryInfo = new TargetLibraryInfoImpl(Triple(TM->getTargetTriple()));
PMB.Inliner = createFunctionInliningPass();
+ PMB.ExportSummary = ExportSummary;
+ PMB.ImportSummary = ImportSummary;
// Unconditionally verify input since it is not verified before this
// point and has unknown origin.
PMB.VerifyInput = true;
@@ -191,12 +250,21 @@ static void runOldPMPasses(Config &Conf, Module &Mod, TargetMachine *TM,
}
bool opt(Config &Conf, TargetMachine *TM, unsigned Task, Module &Mod,
- bool IsThinLTO) {
- if (Conf.OptPipeline.empty())
- runOldPMPasses(Conf, Mod, TM, IsThinLTO);
- else
+ bool IsThinLTO, ModuleSummaryIndex *ExportSummary,
+ const ModuleSummaryIndex *ImportSummary) {
+ // There is no ThinLTO pipeline hooked up in the new pass manager yet;
+ // once there is one, this check can simply be removed.
+ if (LTOUseNewPM && IsThinLTO)
+ report_fatal_error("ThinLTO not supported with the new PM yet!");
+
+ // FIXME: Plumb the combined index into the new pass manager.
+ if (!Conf.OptPipeline.empty())
runNewPMCustomPasses(Mod, TM, Conf.OptPipeline, Conf.AAPipeline,
Conf.DisableVerify);
+ else if (LTOUseNewPM)
+ runNewPMPasses(Mod, TM, Conf.OptLevel);
+ else
+ runOldPMPasses(Conf, Mod, TM, IsThinLTO, ExportSummary, ImportSummary);
return !Conf.PostOptModuleHook || Conf.PostOptModuleHook(Task, Mod);
}
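
The two call sites later in this file pass the combined index through opposite parameters, which is worth seeing side by side (this merely restates the hunks below):

    // Regular LTO: this module may export definitions to other modules.
    opt(C, TM, 0, Mod, /*IsThinLTO=*/false,
        /*ExportSummary=*/&CombinedIndex, /*ImportSummary=*/nullptr);
    // ThinLTO: the module is compiled against the index it imports from.
    opt(Conf, TM, Task, Mod, /*IsThinLTO=*/true,
        /*ExportSummary=*/nullptr, /*ImportSummary=*/&CombinedIndex);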
@@ -207,8 +275,7 @@ void codegen(Config &Conf, TargetMachine *TM, AddStreamFn AddStream,
auto Stream = AddStream(Task);
legacy::PassManager CodeGenPasses;
- if (TM->addPassesToEmitFile(CodeGenPasses, *Stream->OS,
- TargetMachine::CGFT_ObjectFile))
+ if (TM->addPassesToEmitFile(CodeGenPasses, *Stream->OS, Conf.CGFileType))
report_fatal_error("Failed to setup codegen");
CodeGenPasses.run(Mod);
}
@@ -276,12 +343,22 @@ Expected<const Target *> initAndLookupTarget(Config &C, Module &Mod) {
}
+static void
+finalizeOptimizationRemarks(std::unique_ptr<tool_output_file> DiagOutputFile) {
+ // Make sure we flush the diagnostic remarks file in case the linker doesn't
+ // call the global destructors before exiting.
+ if (!DiagOutputFile)
+ return;
+ DiagOutputFile->keep();
+ DiagOutputFile->os().flush();
+}
+
static void handleAsmUndefinedRefs(Module &Mod, TargetMachine &TM) {
// Collect the list of undefined symbols used in asm and update
// llvm.compiler.used to prevent optimization to drop these from the output.
StringSet<> AsmUndefinedRefs;
ModuleSymbolTable::CollectAsmSymbols(
- Triple(Mod.getTargetTriple()), Mod.getModuleInlineAsm(),
+ Mod,
[&AsmUndefinedRefs](StringRef Name, object::BasicSymbolRef::Flags Flags) {
if (Flags & object::BasicSymbolRef::SF_Undefined)
AsmUndefinedRefs.insert(Name);
@@ -291,7 +368,8 @@ static void handleAsmUndefinedRefs(Module &Mod, TargetMachine &TM) {
Error lto::backend(Config &C, AddStreamFn AddStream,
unsigned ParallelCodeGenParallelismLevel,
- std::unique_ptr<Module> Mod) {
+ std::unique_ptr<Module> Mod,
+ ModuleSummaryIndex &CombinedIndex) {
Expected<const Target *> TOrErr = initAndLookupTarget(C, *Mod);
if (!TOrErr)
return TOrErr.takeError();
@@ -301,9 +379,20 @@ Error lto::backend(Config &C, AddStreamFn AddStream,
handleAsmUndefinedRefs(*Mod, *TM);
- if (!C.CodeGenOnly)
- if (!opt(C, TM.get(), 0, *Mod, /*IsThinLTO=*/false))
+ // Setup optimization remarks.
+ auto DiagFileOrErr = lto::setupOptimizationRemarks(
+ Mod->getContext(), C.RemarksFilename, C.RemarksWithHotness);
+ if (!DiagFileOrErr)
+ return DiagFileOrErr.takeError();
+ auto DiagnosticOutputFile = std::move(*DiagFileOrErr);
+
+ if (!C.CodeGenOnly) {
+ if (!opt(C, TM.get(), 0, *Mod, /*IsThinLTO=*/false,
+ /*ExportSummary=*/&CombinedIndex, /*ImportSummary=*/nullptr)) {
+ finalizeOptimizationRemarks(std::move(DiagnosticOutputFile));
return Error::success();
+ }
+ }
if (ParallelCodeGenParallelismLevel == 1) {
codegen(C, TM.get(), AddStream, 0, *Mod);
@@ -311,11 +400,12 @@ Error lto::backend(Config &C, AddStreamFn AddStream,
splitCodeGen(C, TM.get(), AddStream, ParallelCodeGenParallelismLevel,
std::move(Mod));
}
+ finalizeOptimizationRemarks(std::move(DiagnosticOutputFile));
return Error::success();
}
Error lto::thinBackend(Config &Conf, unsigned Task, AddStreamFn AddStream,
- Module &Mod, ModuleSummaryIndex &CombinedIndex,
+ Module &Mod, const ModuleSummaryIndex &CombinedIndex,
const FunctionImporter::ImportMapTy &ImportList,
const GVSummaryMapTy &DefinedGlobals,
MapVector<StringRef, BitcodeModule> &ModuleMap) {
@@ -367,7 +457,8 @@ Error lto::thinBackend(Config &Conf, unsigned Task, AddStreamFn AddStream,
if (Conf.PostImportModuleHook && !Conf.PostImportModuleHook(Task, Mod))
return Error::success();
- if (!opt(Conf, TM.get(), Task, Mod, /*IsThinLTO=*/true))
+ if (!opt(Conf, TM.get(), Task, Mod, /*IsThinLTO=*/true,
+ /*ExportSummary=*/nullptr, /*ImportSummary=*/&CombinedIndex))
return Error::success();
codegen(Conf, TM.get(), AddStream, Task, Mod);
diff --git a/lib/LTO/LTOCodeGenerator.cpp b/lib/LTO/LTOCodeGenerator.cpp
index 6af31e61f946..86fba843e980 100644
--- a/lib/LTO/LTOCodeGenerator.cpp
+++ b/lib/LTO/LTOCodeGenerator.cpp
@@ -35,6 +35,7 @@
#include "llvm/IR/Module.h"
#include "llvm/IR/Verifier.h"
#include "llvm/InitializePasses.h"
+#include "llvm/LTO/LTO.h"
#include "llvm/LTO/legacy/LTOModule.h"
#include "llvm/LTO/legacy/UpdateCompilerUsed.h"
#include "llvm/Linker/Linker.h"
@@ -140,6 +141,7 @@ void LTOCodeGenerator::initializeLTOPasses() {
initializeMemCpyOptLegacyPassPass(R);
initializeDCELegacyPassPass(R);
initializeCFGSimplifyPassPass(R);
+ initializeLateCFGSimplifyPassPass(R);
}
void LTOCodeGenerator::setAsmUndefinedRefs(LTOModule *Mod) {
@@ -506,25 +508,6 @@ void LTOCodeGenerator::verifyMergedModuleOnce() {
report_fatal_error("Broken module found, compilation aborted!");
}
-bool LTOCodeGenerator::setupOptimizationRemarks() {
- if (LTORemarksFilename != "") {
- std::error_code EC;
- DiagnosticOutputFile = llvm::make_unique<tool_output_file>(
- LTORemarksFilename, EC, sys::fs::F_None);
- if (EC) {
- emitError(EC.message());
- return false;
- }
- Context.setDiagnosticsOutputFile(
- llvm::make_unique<yaml::Output>(DiagnosticOutputFile->os()));
- }
-
- if (LTOPassRemarksWithHotness)
- Context.setDiagnosticHotnessRequested(true);
-
- return true;
-}
-
void LTOCodeGenerator::finishOptimizationRemarks() {
if (DiagnosticOutputFile) {
DiagnosticOutputFile->keep();
@@ -540,8 +523,13 @@ bool LTOCodeGenerator::optimize(bool DisableVerify, bool DisableInline,
if (!this->determineTarget())
return false;
- if (!setupOptimizationRemarks())
- return false;
+ auto DiagFileOrErr = lto::setupOptimizationRemarks(
+ Context, LTORemarksFilename, LTOPassRemarksWithHotness);
+ if (!DiagFileOrErr) {
+ errs() << "Error: " << toString(DiagFileOrErr.takeError()) << "\n";
+ report_fatal_error("Can't get an output file for the remarks");
+ }
+ DiagnosticOutputFile = std::move(*DiagFileOrErr);
// We always run the verifier once on the merged module, the `DisableVerify`
// parameter only applies to subsequent verify.
@@ -567,6 +555,8 @@ bool LTOCodeGenerator::optimize(bool DisableVerify, bool DisableInline,
if (!DisableInline)
PMB.Inliner = createFunctionInliningPass();
PMB.LibraryInfo = new TargetLibraryInfoImpl(TargetTriple);
+ if (Freestanding)
+ PMB.LibraryInfo->disableAllFunctions();
PMB.OptLevel = OptLevel;
PMB.VerifyInput = !DisableVerify;
PMB.VerifyOutput = !DisableVerify;
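
disableAllFunctions() tells TargetLibraryInfo that no library routine may be assumed, which blocks libcall-based transforms (printf-to-puts and friends). A minimal illustration with a hypothetical triple; the enumerator spelling follows the LibFunc rename made elsewhere in this patch:

    TargetLibraryInfoImpl TLII(Triple("x86_64-unknown-linux-gnu"));
    TLII.disableAllFunctions();        // freestanding: assume no C library
    TargetLibraryInfo TLI(TLII);
    assert(!TLI.has(LibFunc_printf));  // printf is no longer a known libcall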
diff --git a/lib/LTO/LTOModule.cpp b/lib/LTO/LTOModule.cpp
index 89aeb8000038..11f0982c6a60 100644
--- a/lib/LTO/LTOModule.cpp
+++ b/lib/LTO/LTOModule.cpp
@@ -14,11 +14,12 @@
#include "llvm/LTO/legacy/LTOModule.h"
#include "llvm/ADT/Triple.h"
+#include "llvm/Analysis/ObjectUtils.h"
#include "llvm/Bitcode/BitcodeReader.h"
-#include "llvm/CodeGen/Analysis.h"
#include "llvm/IR/Constants.h"
#include "llvm/IR/DiagnosticPrinter.h"
#include "llvm/IR/LLVMContext.h"
+#include "llvm/IR/Mangler.h"
#include "llvm/IR/Metadata.h"
#include "llvm/IR/Module.h"
#include "llvm/MC/MCExpr.h"
@@ -647,11 +648,15 @@ void LTOModule::parseMetadata() {
}
}
- // Globals
+ // Globals - we only need to do this for COFF.
+ const Triple TT(_target->getTargetTriple());
+ if (!TT.isOSBinFormatCOFF())
+ return;
+ Mangler M;
for (const NameAndAttributes &Sym : _symbols) {
if (!Sym.symbol)
continue;
- _target->getObjFileLowering()->emitLinkerFlagsForGlobal(OS, Sym.symbol);
+ emitLinkerFlagsForGlobalCOFF(OS, Sym.symbol, TT, M);
}
// Add other interesting metadata here.
diff --git a/lib/LTO/ThinLTOCodeGenerator.cpp b/lib/LTO/ThinLTOCodeGenerator.cpp
index 40537e4fa784..0d845a26d0c2 100644
--- a/lib/LTO/ThinLTOCodeGenerator.cpp
+++ b/lib/LTO/ThinLTOCodeGenerator.cpp
@@ -14,10 +14,6 @@
#include "llvm/LTO/legacy/ThinLTOCodeGenerator.h"
-#ifdef HAVE_LLVM_REVISION
-#include "LLVMLTORevision.h"
-#endif
-
#include "llvm/ADT/Statistic.h"
#include "llvm/ADT/StringExtras.h"
#include "llvm/Analysis/ModuleSummaryAnalysis.h"
@@ -47,6 +43,7 @@
#include "llvm/Support/ThreadPool.h"
#include "llvm/Support/Threading.h"
#include "llvm/Support/ToolOutputFile.h"
+#include "llvm/Support/VCSRevision.h"
#include "llvm/Target/TargetMachine.h"
#include "llvm/Transforms/IPO.h"
#include "llvm/Transforms/IPO/FunctionImport.h"
@@ -73,27 +70,6 @@ namespace {
static cl::opt<int>
ThreadCount("threads", cl::init(llvm::heavyweight_hardware_concurrency()));
-Expected<std::unique_ptr<tool_output_file>>
-setupOptimizationRemarks(LLVMContext &Ctx, int Count) {
- if (LTOPassRemarksWithHotness)
- Ctx.setDiagnosticHotnessRequested(true);
-
- if (LTORemarksFilename.empty())
- return nullptr;
-
- std::string FileName =
- LTORemarksFilename + ".thin." + llvm::utostr(Count) + ".yaml";
- std::error_code EC;
- auto DiagnosticOutputFile =
- llvm::make_unique<tool_output_file>(FileName, EC, sys::fs::F_None);
- if (EC)
- return errorCodeToError(EC);
- Ctx.setDiagnosticsOutputFile(
- llvm::make_unique<yaml::Output>(DiagnosticOutputFile->os()));
- DiagnosticOutputFile->keep();
- return std::move(DiagnosticOutputFile);
-}
-
// Simple helper to save temporary files for debug.
static void saveTempBitcode(const Module &TheModule, StringRef TempDir,
unsigned count, StringRef Suffix) {
@@ -208,10 +184,12 @@ crossImportIntoModule(Module &TheModule, const ModuleSummaryIndex &Index,
}
static void optimizeModule(Module &TheModule, TargetMachine &TM,
- unsigned OptLevel) {
+ unsigned OptLevel, bool Freestanding) {
// Populate the PassManager
PassManagerBuilder PMB;
PMB.LibraryInfo = new TargetLibraryInfoImpl(TM.getTargetTriple());
+ if (Freestanding)
+ PMB.LibraryInfo->disableAllFunctions();
PMB.Inliner = createFunctionInliningPass();
// FIXME: should get it from the bitcode?
PMB.OptLevel = OptLevel;
@@ -285,7 +263,7 @@ public:
const std::map<GlobalValue::GUID, GlobalValue::LinkageTypes> &ResolvedODR,
const GVSummaryMapTy &DefinedFunctions,
const DenseSet<GlobalValue::GUID> &PreservedSymbols, unsigned OptLevel,
- const TargetMachineBuilder &TMBuilder) {
+ bool Freestanding, const TargetMachineBuilder &TMBuilder) {
if (CachePath.empty())
return;
@@ -323,7 +301,7 @@ public:
// Start with the compiler revision
Hasher.update(LLVM_VERSION_STRING);
-#ifdef HAVE_LLVM_REVISION
+#ifdef LLVM_REVISION
Hasher.update(LLVM_REVISION);
#endif
@@ -342,6 +320,7 @@ public:
AddUnsigned(*TMBuilder.RelocModel);
AddUnsigned(TMBuilder.CGOptLevel);
AddUnsigned(OptLevel);
+ AddUnsigned(Freestanding);
Hasher.update(ArrayRef<uint8_t>((uint8_t *)&ModHash[0], sizeof(ModHash)));
for (auto F : ExportList)
@@ -369,7 +348,10 @@ public:
ArrayRef<uint8_t>((const uint8_t *)&Entry, sizeof(GlobalValue::GUID)));
}
- sys::path::append(EntryPath, CachePath, toHex(Hasher.result()));
+ // This choice of file name allows the cache to be pruned (see pruneCache()
+ // in include/llvm/Support/CachePruning.h).
+ sys::path::append(EntryPath, CachePath,
+ "llvmcache-" + toHex(Hasher.result()));
}
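
Concretely, cache entries now land at paths like the one below, and pruneCache() only touches files carrying that prefix (hash shortened for illustration):

    // <CachePath>/llvmcache-16EB2F9A0C...  -- one entry per module + cache key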
// Access the path to this entry in the cache.
@@ -422,7 +404,7 @@ ProcessThinLTOModule(Module &TheModule, ModuleSummaryIndex &Index,
const GVSummaryMapTy &DefinedGlobals,
const ThinLTOCodeGenerator::CachingOptions &CacheOptions,
bool DisableCodeGen, StringRef SaveTempsDir,
- unsigned OptLevel, unsigned count) {
+ bool Freestanding, unsigned OptLevel, unsigned count) {
// "Benchmark"-like optimization: single-source case
bool SingleModule = (ModuleMap.size() == 1);
@@ -454,7 +436,7 @@ ProcessThinLTOModule(Module &TheModule, ModuleSummaryIndex &Index,
saveTempBitcode(TheModule, SaveTempsDir, count, ".3.imported.bc");
}
- optimizeModule(TheModule, TM, OptLevel);
+ optimizeModule(TheModule, TM, OptLevel, Freestanding);
saveTempBitcode(TheModule, SaveTempsDir, count, ".4.opt.bc");
@@ -780,7 +762,7 @@ void ThinLTOCodeGenerator::optimize(Module &TheModule) {
initTMBuilder(TMBuilder, Triple(TheModule.getTargetTriple()));
// Optimize now
- optimizeModule(TheModule, *TMBuilder.create(), OptLevel);
+ optimizeModule(TheModule, *TMBuilder.create(), OptLevel, Freestanding);
}
/**
@@ -966,7 +948,7 @@ void ThinLTOCodeGenerator::run() {
ImportLists[ModuleIdentifier], ExportList,
ResolvedODR[ModuleIdentifier],
DefinedFunctions, GUIDPreservedSymbols,
- OptLevel, TMBuilder);
+ OptLevel, Freestanding, TMBuilder);
auto CacheEntryPath = CacheEntry.getEntryPath();
{
@@ -990,7 +972,8 @@ void ThinLTOCodeGenerator::run() {
LLVMContext Context;
Context.setDiscardValueNames(LTODiscardValueNames);
Context.enableDebugTypeODRUniquing();
- auto DiagFileOrErr = setupOptimizationRemarks(Context, count);
+ auto DiagFileOrErr = lto::setupOptimizationRemarks(
+ Context, LTORemarksFilename, LTOPassRemarksWithHotness, count);
if (!DiagFileOrErr) {
errs() << "Error: " << toString(DiagFileOrErr.takeError()) << "\n";
report_fatal_error("ThinLTO: Can't get an output file for the "
@@ -1011,7 +994,7 @@ void ThinLTOCodeGenerator::run() {
*TheModule, *Index, ModuleMap, *TMBuilder.create(), ImportList,
ExportList, GUIDPreservedSymbols,
ModuleToDefinedGVSummaries[ModuleIdentifier], CacheOptions,
- DisableCodeGen, SaveTempsDir, OptLevel, count);
+ DisableCodeGen, SaveTempsDir, Freestanding, OptLevel, count);
// Commit to the cache (if enabled)
CacheEntry.write(*OutputBuffer);
@@ -1043,11 +1026,7 @@ void ThinLTOCodeGenerator::run() {
}
}
- CachePruning(CacheOptions.Path)
- .setPruningInterval(std::chrono::seconds(CacheOptions.PruningInterval))
- .setEntryExpiration(std::chrono::seconds(CacheOptions.Expiration))
- .setMaxSize(CacheOptions.MaxPercentageOfAvailableSpace)
- .prune();
+ pruneCache(CacheOptions.Path, CacheOptions.Policy);
// If statistics were requested, print them out now.
if (llvm::AreStatisticsEnabled())
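
The builder-style CachePruning calls are replaced by a policy struct. A hedged sketch of filling one, with field names per include/llvm/Support/CachePruning.h at this revision and illustrative values:

    CachePruningPolicy Policy;
    Policy.Interval = std::chrono::seconds(600);      // min time between scans
    Policy.Expiration = std::chrono::hours(7 * 24);   // drop week-old entries
    Policy.PercentageOfAvailableSpace = 75;           // size cap, % of free disk
    pruneCache(CachePath, Policy);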
diff --git a/lib/LTO/UpdateCompilerUsed.cpp b/lib/LTO/UpdateCompilerUsed.cpp
index b67d9ea5989d..5165cc965038 100644
--- a/lib/LTO/UpdateCompilerUsed.cpp
+++ b/lib/LTO/UpdateCompilerUsed.cpp
@@ -65,7 +65,7 @@ private:
// target.
for (unsigned I = 0, E = static_cast<unsigned>(LibFunc::NumLibFuncs);
I != E; ++I) {
- LibFunc::Func F = static_cast<LibFunc::Func>(I);
+ LibFunc F = static_cast<LibFunc>(I);
if (TLI.has(F))
Libcalls.insert(TLI.getName(F));
}
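
The same mechanical rename applies throughout the tree: the nested enum becomes a top-level one with prefixed enumerators, e.g.:

    LibFunc::Func F = LibFunc::printf;  // before this patch
    LibFunc F2 = LibFunc_printf;        // after this patch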
diff --git a/lib/LibDriver/LibDriver.cpp b/lib/LibDriver/LibDriver.cpp
index bcdec4f7a933..c50629d71501 100644
--- a/lib/LibDriver/LibDriver.cpp
+++ b/lib/LibDriver/LibDriver.cpp
@@ -121,7 +121,7 @@ int llvm::libDriverMain(llvm::ArrayRef<const char*> ArgsArr) {
for (auto *Arg : Args.filtered(OPT_UNKNOWN))
llvm::errs() << "ignoring unknown argument: " << Arg->getSpelling() << "\n";
- if (Args.filtered_begin(OPT_INPUT) == Args.filtered_end()) {
+ if (!Args.hasArgNoClaim(OPT_INPUT)) {
// No input files. To match lib.exe, silently do nothing.
return 0;
}
diff --git a/lib/Linker/IRMover.cpp b/lib/Linker/IRMover.cpp
index 9f3cfc0eace4..15a46a2d0420 100644
--- a/lib/Linker/IRMover.cpp
+++ b/lib/Linker/IRMover.cpp
@@ -395,11 +395,12 @@ class IRLinker {
Worklist.push_back(GV);
}
- /// Flag whether the ModuleInlineAsm string in Src should be linked with
- /// (concatenated into) the ModuleInlineAsm string for the destination
- /// module. It should be true for full LTO, but not when importing for
- /// ThinLTO, otherwise we can have duplicate symbols.
- bool LinkModuleInlineAsm;
+ /// Whether we are importing globals for ThinLTO, as opposed to linking the
+ /// source module. If this flag is set, it means that we can rely on some
+ /// other object file to define any non-GlobalValue entities defined by the
+ /// source module. This currently causes us to not link retained types in
+ /// debug info metadata and module inline asm.
+ bool IsPerformingImport;
/// Set to true when all global value body linking is complete (including
/// lazy linking). Used to prevent metadata linking from creating new
@@ -491,10 +492,10 @@ public:
IRMover::IdentifiedStructTypeSet &Set, std::unique_ptr<Module> SrcM,
ArrayRef<GlobalValue *> ValuesToLink,
std::function<void(GlobalValue &, IRMover::ValueAdder)> AddLazyFor,
- bool LinkModuleInlineAsm, bool IsPerformingImport)
+ bool IsPerformingImport)
: DstM(DstM), SrcM(std::move(SrcM)), AddLazyFor(std::move(AddLazyFor)),
TypeMap(Set), GValMaterializer(*this), LValMaterializer(*this),
- SharedMDs(SharedMDs), LinkModuleInlineAsm(LinkModuleInlineAsm),
+ SharedMDs(SharedMDs), IsPerformingImport(IsPerformingImport),
Mapper(ValueMap, RF_MoveDistinctMDs | RF_IgnoreMissingLocals, &TypeMap,
&GValMaterializer),
AliasMCID(Mapper.registerAlternateMappingContext(AliasValueMap,
@@ -870,9 +871,6 @@ bool IRLinker::shouldLink(GlobalValue *DGV, GlobalValue &SGV) {
if (DGV && !DGV->isDeclarationForLinker())
return false;
- if (SGV.hasAvailableExternallyLinkage())
- return true;
-
if (SGV.isDeclaration() || DoneLinkingBodies)
return false;
@@ -1297,7 +1295,7 @@ Error IRLinker::run() {
DstM.setTargetTriple(mergeTriples(SrcTriple, DstTriple));
// Append the module inline asm string.
- if (LinkModuleInlineAsm && !SrcM->getModuleInlineAsm().empty()) {
+ if (!IsPerformingImport && !SrcM->getModuleInlineAsm().empty()) {
if (DstM.getModuleInlineAsm().empty())
DstM.setModuleInlineAsm(SrcM->getModuleInlineAsm());
else
@@ -1436,10 +1434,10 @@ IRMover::IRMover(Module &M) : Composite(M) {
Error IRMover::move(
std::unique_ptr<Module> Src, ArrayRef<GlobalValue *> ValuesToLink,
std::function<void(GlobalValue &, ValueAdder Add)> AddLazyFor,
- bool LinkModuleInlineAsm, bool IsPerformingImport) {
+ bool IsPerformingImport) {
IRLinker TheIRLinker(Composite, SharedMDs, IdentifiedStructTypes,
std::move(Src), ValuesToLink, std::move(AddLazyFor),
- LinkModuleInlineAsm, IsPerformingImport);
+ IsPerformingImport);
Error E = TheIRLinker.run();
Composite.dropTriviallyDeadConstantArrays();
return E;
diff --git a/lib/Linker/LinkModules.cpp b/lib/Linker/LinkModules.cpp
index cf2c4ccf523e..c0ce4bf76b9f 100644
--- a/lib/Linker/LinkModules.cpp
+++ b/lib/Linker/LinkModules.cpp
@@ -14,12 +14,13 @@
#include "LinkDiagnosticInfo.h"
#include "llvm-c/Linker.h"
#include "llvm/ADT/SetVector.h"
-#include "llvm/ADT/StringSet.h"
+#include "llvm/IR/Comdat.h"
#include "llvm/IR/DiagnosticPrinter.h"
+#include "llvm/IR/GlobalValue.h"
#include "llvm/IR/LLVMContext.h"
+#include "llvm/IR/Module.h"
#include "llvm/Linker/Linker.h"
#include "llvm/Support/Error.h"
-#include "llvm/Transforms/Utils/FunctionImportUtils.h"
using namespace llvm;
namespace {
@@ -31,14 +32,17 @@ class ModuleLinker {
std::unique_ptr<Module> SrcM;
SetVector<GlobalValue *> ValuesToLink;
- StringSet<> Internalize;
/// For symbol clashes, prefer those from Src.
unsigned Flags;
- /// Functions to import from source module, all other functions are
- /// imported as declarations instead of definitions.
- DenseSet<const GlobalValue *> *GlobalsToImport;
+ /// List of global value names that should be internalized.
+ StringSet<> Internalize;
+
+ /// Function that will perform the actual internalization. The reason for a
+ /// callback is that the linker cannot call internalizeModule without
+ /// creating a circular dependency between IPO and the linker.
+ std::function<void(Module &, const StringSet<> &)> InternalizeCallback;
/// Used as the callback for lazy linking.
/// The mover has just hit GV and we have to decide if it, and other members
@@ -46,14 +50,8 @@ class ModuleLinker {
/// to Add.
void addLazyFor(GlobalValue &GV, const IRMover::ValueAdder &Add);
- bool shouldLinkReferencedLinkOnce() {
- return !(Flags & Linker::DontForceLinkLinkonceODR);
- }
bool shouldOverrideFromSrc() { return Flags & Linker::OverrideFromSrc; }
bool shouldLinkOnlyNeeded() { return Flags & Linker::LinkOnlyNeeded; }
- bool shouldInternalizeLinkedSymbols() {
- return Flags & Linker::InternalizeLinkedSymbols;
- }
bool shouldLinkFromSource(bool &LinkFromSrc, const GlobalValue &Dest,
const GlobalValue &Src);
@@ -108,31 +106,17 @@ class ModuleLinker {
bool linkIfNeeded(GlobalValue &GV);
- /// Helper method to check if we are importing from the current source
- /// module.
- bool isPerformingImport() const { return GlobalsToImport != nullptr; }
-
- /// If we are importing from the source module, checks if we should
- /// import SGV as a definition, otherwise import as a declaration.
- bool doImportAsDefinition(const GlobalValue *SGV);
-
public:
ModuleLinker(IRMover &Mover, std::unique_ptr<Module> SrcM, unsigned Flags,
- DenseSet<const GlobalValue *> *GlobalsToImport = nullptr)
+ std::function<void(Module &, const StringSet<> &)>
+ InternalizeCallback = {})
: Mover(Mover), SrcM(std::move(SrcM)), Flags(Flags),
- GlobalsToImport(GlobalsToImport) {}
+ InternalizeCallback(std::move(InternalizeCallback)) {}
bool run();
};
}
-bool ModuleLinker::doImportAsDefinition(const GlobalValue *SGV) {
- if (!isPerformingImport())
- return false;
- return FunctionImportGlobalProcessing::doImportAsDefinition(SGV,
- GlobalsToImport);
-}
-
static GlobalValue::VisibilityTypes
getMinVisibility(GlobalValue::VisibilityTypes A,
GlobalValue::VisibilityTypes B) {
@@ -266,18 +250,10 @@ bool ModuleLinker::shouldLinkFromSource(bool &LinkFromSrc,
// We always have to add Src if it has appending linkage.
if (Src.hasAppendingLinkage()) {
- // Should have prevented importing for appending linkage in linkIfNeeded.
- assert(!isPerformingImport());
LinkFromSrc = true;
return false;
}
- if (isPerformingImport()) {
- // LinkFromSrc iff this is a global requested for importing.
- LinkFromSrc = GlobalsToImport->count(&Src);
- return false;
- }
-
bool SrcIsDeclaration = Src.isDeclarationForLinker();
bool DestIsDeclaration = Dest.isDeclarationForLinker();
@@ -383,19 +359,9 @@ bool ModuleLinker::linkIfNeeded(GlobalValue &GV) {
GV.setUnnamedAddr(UnnamedAddr);
}
- // Don't want to append to global_ctors list, for example, when we
- // are importing for ThinLTO, otherwise the global ctors and dtors
- // get executed multiple times for local variables (the latter causing
- // double frees).
- if (GV.hasAppendingLinkage() && isPerformingImport())
- return false;
-
- if (isPerformingImport()) {
- if (!doImportAsDefinition(&GV))
- return false;
- } else if (!DGV && !shouldOverrideFromSrc() &&
- (GV.hasLocalLinkage() || GV.hasLinkOnceLinkage() ||
- GV.hasAvailableExternallyLinkage()))
+ if (!DGV && !shouldOverrideFromSrc() &&
+ (GV.hasLocalLinkage() || GV.hasLinkOnceLinkage() ||
+ GV.hasAvailableExternallyLinkage()))
return false;
if (GV.isDeclaration())
@@ -418,17 +384,12 @@ bool ModuleLinker::linkIfNeeded(GlobalValue &GV) {
}
void ModuleLinker::addLazyFor(GlobalValue &GV, const IRMover::ValueAdder &Add) {
- if (!shouldLinkReferencedLinkOnce())
- // For ThinLTO we don't import more than what was required.
- // The client has to guarantee that the linkonce will be availabe at link
- // time (by promoting it to weak for instance).
- return;
-
// Add these to the internalize list
- if (!GV.hasLinkOnceLinkage() && !shouldLinkOnlyNeeded())
+ if (!GV.hasLinkOnceLinkage() && !GV.hasAvailableExternallyLinkage() &&
+ !shouldLinkOnlyNeeded())
return;
- if (shouldInternalizeLinkedSymbols())
+ if (InternalizeCallback)
Internalize.insert(GV.getName());
Add(GV);
@@ -442,7 +403,7 @@ void ModuleLinker::addLazyFor(GlobalValue &GV, const IRMover::ValueAdder &Add) {
return;
if (!LinkFromSrc)
continue;
- if (shouldInternalizeLinkedSymbols())
+ if (InternalizeCallback)
Internalize.insert(GV2->getName());
Add(*GV2);
}
@@ -571,7 +532,7 @@ bool ModuleLinker::run() {
}
}
- if (shouldInternalizeLinkedSymbols()) {
+ if (InternalizeCallback) {
for (GlobalValue *GV : ValuesToLink)
Internalize.insert(GV->getName());
}
@@ -583,8 +544,7 @@ bool ModuleLinker::run() {
[this](GlobalValue &GV, IRMover::ValueAdder Add) {
addLazyFor(GV, Add);
},
- /* LinkModuleInlineAsm */ !isPerformingImport(),
- /* IsPerformingImport */ isPerformingImport())) {
+ /* IsPerformingImport */ false)) {
handleAllErrors(std::move(E), [&](ErrorInfoBase &EIB) {
DstM.getContext().diagnose(LinkDiagnosticInfo(DS_Error, EIB.message()));
HasErrors = true;
@@ -593,19 +553,19 @@ bool ModuleLinker::run() {
if (HasErrors)
return true;
- for (auto &P : Internalize) {
- GlobalValue *GV = DstM.getNamedValue(P.first());
- GV->setLinkage(GlobalValue::InternalLinkage);
- }
+ if (InternalizeCallback)
+ InternalizeCallback(DstM, Internalize);
return false;
}
Linker::Linker(Module &M) : Mover(M) {}
-bool Linker::linkInModule(std::unique_ptr<Module> Src, unsigned Flags,
- DenseSet<const GlobalValue *> *GlobalsToImport) {
- ModuleLinker ModLinker(Mover, std::move(Src), Flags, GlobalsToImport);
+bool Linker::linkInModule(
+ std::unique_ptr<Module> Src, unsigned Flags,
+ std::function<void(Module &, const StringSet<> &)> InternalizeCallback) {
+ ModuleLinker ModLinker(Mover, std::move(Src), Flags,
+ std::move(InternalizeCallback));
return ModLinker.run();
}
@@ -618,10 +578,11 @@ bool Linker::linkInModule(std::unique_ptr<Module> Src, unsigned Flags,
/// true is returned and ErrorMsg (if not null) is set to indicate the problem.
/// Upon failure, the Dest module could be in a modified state, and shouldn't be
/// relied on to be consistent.
-bool Linker::linkModules(Module &Dest, std::unique_ptr<Module> Src,
- unsigned Flags) {
+bool Linker::linkModules(
+ Module &Dest, std::unique_ptr<Module> Src, unsigned Flags,
+ std::function<void(Module &, const StringSet<> &)> InternalizeCallback) {
Linker L(Dest);
- return L.linkInModule(std::move(Src), Flags);
+ return L.linkInModule(std::move(Src), Flags, std::move(InternalizeCallback));
}
//===----------------------------------------------------------------------===//
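
A caller that wants the old InternalizeLinkedSymbols behavior now supplies the callback itself. The lambda below mirrors what a front end would plausibly pass; internalizeModule lives in Transforms/IPO, which is exactly the layering the comment above avoids pulling into the linker:

    Linker::linkModules(
        Dest, std::move(Src), Linker::LinkOnlyNeeded,
        [](Module &M, const StringSet<> &GVS) {
          internalizeModule(M, [&GVS](const GlobalValue &GV) {
            // Preserve anything not on the just-linked list.
            return !GV.hasName() || !GVS.count(GV.getName());
          });
        });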
diff --git a/lib/MC/CMakeLists.txt b/lib/MC/CMakeLists.txt
index 2f1b39e58e33..a86fd383003d 100644
--- a/lib/MC/CMakeLists.txt
+++ b/lib/MC/CMakeLists.txt
@@ -6,6 +6,7 @@ add_llvm_library(LLVMMC
MCAsmInfoCOFF.cpp
MCAsmInfoDarwin.cpp
MCAsmInfoELF.cpp
+ MCAsmInfoWasm.cpp
MCAsmStreamer.cpp
MCAssembler.cpp
MCCodeEmitter.cpp
@@ -34,17 +35,21 @@ add_llvm_library(LLVMMC
MCSectionCOFF.cpp
MCSectionELF.cpp
MCSectionMachO.cpp
+ MCSectionWasm.cpp
MCStreamer.cpp
MCSubtargetInfo.cpp
MCSymbol.cpp
MCSymbolELF.cpp
MCTargetOptions.cpp
MCValue.cpp
+ MCWasmObjectTargetWriter.cpp
+ MCWasmStreamer.cpp
MCWin64EH.cpp
MCWinEH.cpp
MachObjectWriter.cpp
StringTableBuilder.cpp
SubtargetFeature.cpp
+ WasmObjectWriter.cpp
WinCOFFObjectWriter.cpp
WinCOFFStreamer.cpp
diff --git a/lib/MC/ConstantPools.cpp b/lib/MC/ConstantPools.cpp
index 9608c2c656b7..8c94e2780998 100644
--- a/lib/MC/ConstantPools.cpp
+++ b/lib/MC/ConstantPools.cpp
@@ -1,4 +1,4 @@
-//===- ConstantPools.cpp - ConstantPool class --*- C++ -*---------===//
+//===- ConstantPools.cpp - ConstantPool class -----------------------------===//
//
// The LLVM Compiler Infrastructure
//
@@ -10,13 +10,16 @@
// This file implements the ConstantPool and AssemblerConstantPools classes.
//
//===----------------------------------------------------------------------===//
-#include "llvm/ADT/MapVector.h"
+
#include "llvm/MC/ConstantPools.h"
#include "llvm/MC/MCContext.h"
+#include "llvm/MC/MCDirectives.h"
#include "llvm/MC/MCExpr.h"
#include "llvm/MC/MCStreamer.h"
+#include "llvm/Support/Casting.h"
using namespace llvm;
+
//
// ConstantPool implementation
//
diff --git a/lib/MC/ELFObjectWriter.cpp b/lib/MC/ELFObjectWriter.cpp
index a8c88dda6936..ee9c25cda94f 100644
--- a/lib/MC/ELFObjectWriter.cpp
+++ b/lib/MC/ELFObjectWriter.cpp
@@ -11,29 +11,49 @@
//
//===----------------------------------------------------------------------===//
-#include "llvm/MC/MCELFObjectWriter.h"
-#include "llvm/ADT/STLExtras.h"
-#include "llvm/ADT/SmallPtrSet.h"
+#include "llvm/ADT/ArrayRef.h"
+#include "llvm/ADT/DenseMap.h"
#include "llvm/ADT/SmallString.h"
-#include "llvm/ADT/StringMap.h"
-#include "llvm/MC/MCAsmBackend.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/ADT/STLExtras.h"
+#include "llvm/ADT/StringRef.h"
+#include "llvm/ADT/Twine.h"
#include "llvm/MC/MCAsmInfo.h"
#include "llvm/MC/MCAsmLayout.h"
#include "llvm/MC/MCAssembler.h"
#include "llvm/MC/MCContext.h"
+#include "llvm/MC/MCELFObjectWriter.h"
#include "llvm/MC/MCExpr.h"
-#include "llvm/MC/MCFixupKindInfo.h"
+#include "llvm/MC/MCFixup.h"
+#include "llvm/MC/MCFragment.h"
#include "llvm/MC/MCObjectWriter.h"
+#include "llvm/MC/MCSection.h"
#include "llvm/MC/MCSectionELF.h"
+#include "llvm/MC/MCSymbol.h"
#include "llvm/MC/MCSymbolELF.h"
#include "llvm/MC/MCValue.h"
#include "llvm/MC/StringTableBuilder.h"
+#include "llvm/Support/Allocator.h"
+#include "llvm/Support/Casting.h"
#include "llvm/Support/Compression.h"
-#include "llvm/Support/Debug.h"
#include "llvm/Support/ELF.h"
#include "llvm/Support/Endian.h"
+#include "llvm/Support/Error.h"
#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/Host.h"
+#include "llvm/Support/MathExtras.h"
+#include "llvm/Support/SMLoc.h"
#include "llvm/Support/StringSaver.h"
+#include "llvm/Support/SwapByteOrder.h"
+#include "llvm/Support/raw_ostream.h"
+#include <algorithm>
+#include <cassert>
+#include <cstddef>
+#include <cstdint>
+#include <map>
+#include <memory>
+#include <string>
+#include <utility>
#include <vector>
using namespace llvm;
@@ -42,6 +62,7 @@ using namespace llvm;
#define DEBUG_TYPE "reloc-info"
namespace {
+
typedef DenseMap<const MCSectionELF *, uint32_t> SectionIndexMapTy;
class ELFObjectWriter;
@@ -99,8 +120,7 @@ class ELFObjectWriter : public MCObjectWriter {
DenseMap<const MCSymbolELF *, const MCSymbolELF *> Renames;
- llvm::DenseMap<const MCSectionELF *, std::vector<ELFRelocationEntry>>
- Relocations;
+ DenseMap<const MCSectionELF *, std::vector<ELFRelocationEntry>> Relocations;
/// @}
/// @name Symbol Table Data
@@ -144,6 +164,8 @@ public:
bool IsLittleEndian)
: MCObjectWriter(OS, IsLittleEndian), TargetObjectWriter(MOTW) {}
+ ~ELFObjectWriter() override = default;
+
void reset() override {
Renames.clear();
Relocations.clear();
@@ -152,8 +174,6 @@ public:
MCObjectWriter::reset();
}
- ~ELFObjectWriter() override;
-
void WriteWord(uint64_t W) {
if (is64Bit())
write64(W);
@@ -222,18 +242,18 @@ public:
void writeRelocations(const MCAssembler &Asm, const MCSectionELF &Sec);
+ using MCObjectWriter::isSymbolRefDifferenceFullyResolvedImpl;
bool isSymbolRefDifferenceFullyResolvedImpl(const MCAssembler &Asm,
const MCSymbol &SymA,
const MCFragment &FB, bool InSet,
bool IsPCRel) const override;
- bool isWeak(const MCSymbol &Sym) const override;
-
void writeObject(MCAssembler &Asm, const MCAsmLayout &Layout) override;
void writeSection(const SectionIndexMapTy &SectionIndexMap,
uint32_t GroupSymbolIndex, uint64_t Offset, uint64_t Size,
const MCSectionELF &Section);
};
+
} // end anonymous namespace
void ELFObjectWriter::align(unsigned Alignment) {
@@ -297,9 +317,6 @@ void SymbolTableWriter::writeSymbol(uint32_t name, uint8_t info, uint64_t value,
++NumWritten;
}
-ELFObjectWriter::~ELFObjectWriter()
-{}
-
// Emit the ELF header.
void ELFObjectWriter::writeHeader(const MCAssembler &Asm) {
// ELF Header
@@ -370,22 +387,6 @@ uint64_t ELFObjectWriter::SymbolValue(const MCSymbol &Sym,
void ELFObjectWriter::executePostLayoutBinding(MCAssembler &Asm,
const MCAsmLayout &Layout) {
- // Section symbols are used as definitions for undefined symbols with matching
- // names. If there are multiple sections with the same name, the first one is
- // used.
- for (const MCSection &Sec : Asm) {
- const MCSymbol *Begin = Sec.getBeginSymbol();
- if (!Begin)
- continue;
-
- const MCSymbol *Alias = Asm.getContext().lookupSymbol(Begin->getName());
- if (!Alias || !Alias->isUndefined())
- continue;
-
- Renames.insert(
- std::make_pair(cast<MCSymbolELF>(Alias), cast<MCSymbolELF>(Begin)));
- }
-
// The presence of symbol versions causes undefined symbols and
// versions declared with @@@ to be renamed.
for (const MCSymbol &A : Asm.symbols()) {
@@ -900,6 +901,8 @@ void ELFObjectWriter::computeSymbolTable(
StrTabBuilder.finalize();
+ // File symbols are emitted first and handled separately from normal symbols,
+ // i.e. a non-STT_FILE symbol with the same name may appear.
for (const std::string &Name : FileNames)
Writer.writeSymbol(StrTabBuilder.getOffset(Name),
ELF::STT_FILE | ELF::STB_LOCAL, 0, 0, ELF::STV_DEFAULT,
@@ -1037,10 +1040,10 @@ void ELFObjectWriter::writeSectionData(const MCAssembler &Asm, MCSection &Sec,
setStream(OldStream);
SmallVector<char, 128> CompressedContents;
- zlib::Status Success = zlib::compress(
- StringRef(UncompressedData.data(), UncompressedData.size()),
- CompressedContents);
- if (Success != zlib::StatusOK) {
+ if (Error E = zlib::compress(
+ StringRef(UncompressedData.data(), UncompressedData.size()),
+ CompressedContents)) {
+ consumeError(std::move(E));
getStream() << UncompressedData;
return;
}
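
zlib::compress now reports failure as llvm::Error rather than a status enum, so a caller either consumes the error to fall back (as above) or propagates it:

    SmallVector<char, 128> Compressed;
    StringRef Input("example payload");
    if (Error E = zlib::compress(Input, Compressed))
      return std::move(E); // or consumeError(std::move(E)) and fall back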
@@ -1151,8 +1154,8 @@ void ELFObjectWriter::writeSection(const SectionIndexMapTy &SectionIndexMap,
case ELF::SHT_RELA: {
sh_link = SymbolTableIndex;
assert(sh_link && ".symtab not found");
- const MCSectionELF *InfoSection = Section.getAssociatedSection();
- sh_info = SectionIndexMap.lookup(InfoSection);
+ const MCSection *InfoSection = Section.getAssociatedSection();
+ sh_info = SectionIndexMap.lookup(cast<MCSectionELF>(InfoSection));
break;
}
@@ -1172,9 +1175,11 @@ void ELFObjectWriter::writeSection(const SectionIndexMapTy &SectionIndexMap,
break;
}
- if (TargetObjectWriter->getEMachine() == ELF::EM_ARM &&
- Section.getType() == ELF::SHT_ARM_EXIDX)
- sh_link = SectionIndexMap.lookup(Section.getAssociatedSection());
+ if (Section.getFlags() & ELF::SHF_LINK_ORDER) {
+ const MCSymbol *Sym = Section.getAssociatedSymbol();
+ const MCSectionELF *Sec = cast<MCSectionELF>(&Sym->getSection());
+ sh_link = SectionIndexMap.lookup(Sec);
+ }
WriteSecHdrEntry(StrTabBuilder.getOffset(Section.getSectionName()),
Section.getType(), Section.getFlags(), 0, Offset, Size,
@@ -1298,7 +1303,8 @@ void ELFObjectWriter::writeObject(MCAssembler &Asm,
// Remember the offset into the file for this section.
uint64_t SecStart = getStream().tell();
- writeRelocations(Asm, *RelSection->getAssociatedSection());
+ writeRelocations(Asm,
+ cast<MCSectionELF>(*RelSection->getAssociatedSection()));
uint64_t SecEnd = getStream().tell();
SectionOffsets[RelSection] = std::make_pair(SecStart, SecEnd);
@@ -1351,34 +1357,13 @@ bool ELFObjectWriter::isSymbolRefDifferenceFullyResolvedImpl(
const auto &SymA = cast<MCSymbolELF>(SA);
if (IsPCRel) {
assert(!InSet);
- if (::isWeak(SymA))
+ if (isWeak(SymA))
return false;
}
return MCObjectWriter::isSymbolRefDifferenceFullyResolvedImpl(Asm, SymA, FB,
InSet, IsPCRel);
}
-bool ELFObjectWriter::isWeak(const MCSymbol &S) const {
- const auto &Sym = cast<MCSymbolELF>(S);
- if (::isWeak(Sym))
- return true;
-
- // It is invalid to replace a reference to a global in a comdat
- // with a reference to a local since out of comdat references
- // to a local are forbidden.
- // We could try to return false for more cases, like the reference
- // being in the same comdat or Sym being an alias to another global,
- // but it is not clear if it is worth the effort.
- if (Sym.getBinding() != ELF::STB_GLOBAL)
- return false;
-
- if (!Sym.isInSection())
- return false;
-
- const auto &Sec = cast<MCSectionELF>(Sym.getSection());
- return Sec.getGroup();
-}
-
MCObjectWriter *llvm::createELFObjectWriter(MCELFObjectTargetWriter *MOTW,
raw_pwrite_stream &OS,
bool IsLittleEndian) {
diff --git a/lib/MC/MCAsmBackend.cpp b/lib/MC/MCAsmBackend.cpp
index 570f764f6642..fc0aa788f6d3 100644
--- a/lib/MC/MCAsmBackend.cpp
+++ b/lib/MC/MCAsmBackend.cpp
@@ -1,4 +1,4 @@
-//===-- MCAsmBackend.cpp - Target MC Assembly Backend ----------------------==//
+//===- MCAsmBackend.cpp - Target MC Assembly Backend ----------------------===//
//
// The LLVM Compiler Infrastructure
//
@@ -7,14 +7,19 @@
//
//===----------------------------------------------------------------------===//
-#include "llvm/MC/MCAsmBackend.h"
+#include "llvm/ADT/None.h"
#include "llvm/ADT/STLExtras.h"
+#include "llvm/MC/MCAsmBackend.h"
#include "llvm/MC/MCFixupKindInfo.h"
+#include <cassert>
+#include <cstddef>
+#include <cstdint>
+
using namespace llvm;
-MCAsmBackend::MCAsmBackend() {}
+MCAsmBackend::MCAsmBackend() = default;
-MCAsmBackend::~MCAsmBackend() {}
+MCAsmBackend::~MCAsmBackend() = default;
Optional<MCFixupKind> MCAsmBackend::getFixupKind(StringRef Name) const {
return None;
diff --git a/lib/MC/MCAsmInfo.cpp b/lib/MC/MCAsmInfo.cpp
index 3eb8f50de5a8..b9be685cedc4 100644
--- a/lib/MC/MCAsmInfo.cpp
+++ b/lib/MC/MCAsmInfo.cpp
@@ -1,4 +1,4 @@
-//===-- MCAsmInfo.cpp - Asm Info -------------------------------------------==//
+//===- MCAsmInfo.cpp - Asm Info -------------------------------------------===//
//
// The LLVM Compiler Infrastructure
//
@@ -16,29 +16,14 @@
#include "llvm/MC/MCContext.h"
#include "llvm/MC/MCExpr.h"
#include "llvm/MC/MCStreamer.h"
-#include "llvm/Support/DataTypes.h"
#include "llvm/Support/Dwarf.h"
-#include <cctype>
-#include <cstring>
+
using namespace llvm;
MCAsmInfo::MCAsmInfo() {
- PointerSize = 4;
- CalleeSaveStackSlotSize = 4;
-
- IsLittleEndian = true;
- StackGrowsUp = false;
- HasSubsectionsViaSymbols = false;
- HasMachoZeroFillDirective = false;
- HasMachoTBSSDirective = false;
- MaxInstLength = 4;
- MinInstAlignment = 1;
- DollarIsPC = false;
SeparatorString = ";";
CommentString = "#";
LabelSuffix = ":";
- UseAssignmentForEHBegin = false;
- NeedsLocalForSize = false;
PrivateGlobalPrefix = "L";
PrivateLabelPrefix = PrivateGlobalPrefix;
LinkerPrivateGlobalPrefix = "";
@@ -47,10 +32,6 @@ MCAsmInfo::MCAsmInfo() {
Code16Directive = ".code16";
Code32Directive = ".code32";
Code64Directive = ".code64";
- AssemblerDialect = 0;
- AllowAtInName = false;
- SupportsQuotedNames = true;
- UseDataRegionDirectives = false;
ZeroDirective = "\t.zero\t";
AsciiDirective = "\t.ascii\t";
AscizDirective = "\t.asciz\t";
@@ -58,40 +39,8 @@ MCAsmInfo::MCAsmInfo() {
Data16bitsDirective = "\t.short\t";
Data32bitsDirective = "\t.long\t";
Data64bitsDirective = "\t.quad\t";
- SunStyleELFSectionSwitchSyntax = false;
- UsesELFSectionDirectiveForBSS = false;
- AlignmentIsInBytes = true;
- TextAlignFillValue = 0;
- GPRel64Directive = nullptr;
- GPRel32Directive = nullptr;
GlobalDirective = "\t.globl\t";
- SetDirectiveSuppressesReloc = false;
- HasAggressiveSymbolFolding = true;
- COMMDirectiveAlignmentIsInBytes = true;
- LCOMMDirectiveAlignmentType = LCOMM::NoAlignment;
- HasFunctionAlignment = true;
- HasDotTypeDotSizeDirective = true;
- HasSingleParameterDotFile = true;
- HasIdentDirective = false;
- HasNoDeadStrip = false;
- HasAltEntry = false;
WeakDirective = "\t.weak\t";
- WeakRefDirective = nullptr;
- HasWeakDefDirective = false;
- HasWeakDefCanBeHiddenDirective = false;
- HasLinkOnceDirective = false;
- HiddenVisibilityAttr = MCSA_Hidden;
- HiddenDeclarationVisibilityAttr = MCSA_Hidden;
- ProtectedVisibilityAttr = MCSA_Protected;
- SupportsDebugInformation = false;
- ExceptionsType = ExceptionHandling::None;
- WinEHEncodingType = WinEH::EncodingType::Invalid;
- DwarfUsesRelocationsAcrossSections = true;
- DwarfFDESymbolsUseAbsDiff = false;
- DwarfRegNumForCFI = false;
- NeedsDwarfSectionOffsetDirective = false;
- UseParensForSymbolVariant = false;
- UseLogicalShr = true;
// FIXME: Clang's logic should be synced with the logic used to initialize
// this member and the two implementations should be merged.
@@ -107,12 +56,9 @@ MCAsmInfo::MCAsmInfo() {
// - The target subclasses for AArch64, ARM, and X86 handle these cases
UseIntegratedAssembler = false;
PreserveAsmComments = true;
-
- CompressDebugSections = DebugCompressionType::DCT_None;
}
-MCAsmInfo::~MCAsmInfo() {
-}
+MCAsmInfo::~MCAsmInfo() = default;
bool MCAsmInfo::isSectionAtomizableBySymbols(const MCSection &Section) const {
return false;
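
The deleted assignments did not disappear; they became in-class default member initializers in MCAsmInfo.h, so the constructor only keeps values that differ from the declared defaults. The header-side pattern, using names from the removed lines:

    class MCAsmInfo {
      // ...
      unsigned PointerSize = 4;
      bool IsLittleEndian = true;
      ExceptionHandling ExceptionsType = ExceptionHandling::None;
      bool UseLogicalShr = true;
      // ...
    };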
diff --git a/lib/MC/MCAsmInfoCOFF.cpp b/lib/MC/MCAsmInfoCOFF.cpp
index 5b9dd2009f8b..85104484fd40 100644
--- a/lib/MC/MCAsmInfoCOFF.cpp
+++ b/lib/MC/MCAsmInfoCOFF.cpp
@@ -1,4 +1,4 @@
-//===-- MCAsmInfoCOFF.cpp - COFF asm properties -----------------*- C++ -*-===//
+//===- MCAsmInfoCOFF.cpp - COFF asm properties ----------------------------===//
//
// The LLVM Compiler Infrastructure
//
@@ -13,9 +13,11 @@
//===----------------------------------------------------------------------===//
#include "llvm/MC/MCAsmInfoCOFF.h"
+#include "llvm/MC/MCDirectives.h"
+
using namespace llvm;
-void MCAsmInfoCOFF::anchor() { }
+void MCAsmInfoCOFF::anchor() {}
MCAsmInfoCOFF::MCAsmInfoCOFF() {
// MingW 4.5 and later support .comm with log2 alignment, but .lcomm uses byte
@@ -41,13 +43,10 @@ MCAsmInfoCOFF::MCAsmInfoCOFF() {
UseLogicalShr = false;
}
-void MCAsmInfoMicrosoft::anchor() { }
-
-MCAsmInfoMicrosoft::MCAsmInfoMicrosoft() {
-}
+void MCAsmInfoMicrosoft::anchor() {}
-void MCAsmInfoGNUCOFF::anchor() { }
+MCAsmInfoMicrosoft::MCAsmInfoMicrosoft() = default;
-MCAsmInfoGNUCOFF::MCAsmInfoGNUCOFF() {
+void MCAsmInfoGNUCOFF::anchor() {}
-}
+MCAsmInfoGNUCOFF::MCAsmInfoGNUCOFF() = default;
diff --git a/lib/MC/MCAsmInfoDarwin.cpp b/lib/MC/MCAsmInfoDarwin.cpp
index e95cf488cd30..4b2001764e97 100644
--- a/lib/MC/MCAsmInfoDarwin.cpp
+++ b/lib/MC/MCAsmInfoDarwin.cpp
@@ -1,4 +1,4 @@
-//===-- MCAsmInfoDarwin.cpp - Darwin asm properties -------------*- C++ -*-===//
+//===- MCAsmInfoDarwin.cpp - Darwin asm properties ------------------------===//
//
// The LLVM Compiler Infrastructure
//
@@ -13,9 +13,10 @@
//===----------------------------------------------------------------------===//
#include "llvm/MC/MCAsmInfoDarwin.h"
-#include "llvm/MC/MCContext.h"
-#include "llvm/MC/MCExpr.h"
+#include "llvm/MC/MCDirectives.h"
#include "llvm/MC/MCSectionMachO.h"
+#include "llvm/Support/MachO.h"
+
using namespace llvm;
bool MCAsmInfoDarwin::isSectionAtomizableBySymbols(
diff --git a/lib/MC/MCAsmInfoELF.cpp b/lib/MC/MCAsmInfoELF.cpp
index 26e5608d8733..e44c08b50d76 100644
--- a/lib/MC/MCAsmInfoELF.cpp
+++ b/lib/MC/MCAsmInfoELF.cpp
@@ -1,4 +1,4 @@
-//===-- MCAsmInfoELF.cpp - ELF asm properties -------------------*- C++ -*-===//
+//===- MCAsmInfoELF.cpp - ELF asm properties ------------------------------===//
//
// The LLVM Compiler Infrastructure
//
@@ -16,9 +16,10 @@
#include "llvm/MC/MCContext.h"
#include "llvm/MC/MCSectionELF.h"
#include "llvm/Support/ELF.h"
+
using namespace llvm;
-void MCAsmInfoELF::anchor() { }
+void MCAsmInfoELF::anchor() {}
MCSection *MCAsmInfoELF::getNonexecutableStackSection(MCContext &Ctx) const {
if (!UsesNonexecutableStackSection)
@@ -31,5 +32,4 @@ MCAsmInfoELF::MCAsmInfoELF() {
WeakRefDirective = "\t.weak\t";
PrivateGlobalPrefix = ".L";
PrivateLabelPrefix = ".L";
- UsesNonexecutableStackSection = true;
}
diff --git a/lib/MC/MCAsmInfoWasm.cpp b/lib/MC/MCAsmInfoWasm.cpp
new file mode 100644
index 000000000000..aa26616dda36
--- /dev/null
+++ b/lib/MC/MCAsmInfoWasm.cpp
@@ -0,0 +1,27 @@
+//===-- MCAsmInfoWasm.cpp - Wasm asm properties -----------------*- C++ -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file defines target asm properties related what form asm statements
+// should take in general on Wasm-based targets
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/MC/MCAsmInfoWasm.h"
+#include "llvm/MC/MCContext.h"
+#include "llvm/MC/MCSectionWasm.h"
+using namespace llvm;
+
+void MCAsmInfoWasm::anchor() { }
+
+MCAsmInfoWasm::MCAsmInfoWasm() {
+ HasIdentDirective = true;
+ WeakRefDirective = "\t.weak\t";
+ PrivateGlobalPrefix = ".L";
+ PrivateLabelPrefix = ".L";
+}
diff --git a/lib/MC/MCAsmStreamer.cpp b/lib/MC/MCAsmStreamer.cpp
index 817009a65363..9e5553fa8d42 100644
--- a/lib/MC/MCAsmStreamer.cpp
+++ b/lib/MC/MCAsmStreamer.cpp
@@ -103,7 +103,10 @@ public:
void AddComment(const Twine &T, bool EOL = true) override;
/// AddEncodingComment - Add a comment showing the encoding of an instruction.
- void AddEncodingComment(const MCInst &Inst, const MCSubtargetInfo &);
+ /// If PrintSchedInfo is true, a "sched: [x:y]" comment is appended to the
+ /// output when the target supports it.
+ void AddEncodingComment(const MCInst &Inst, const MCSubtargetInfo &,
+ bool PrintSchedInfo);
/// GetCommentOS - Return a raw_ostream that comments can be written to.
/// Unlike AddComment, you are required to terminate comments with \n if you
@@ -130,7 +133,7 @@ public:
void ChangeSection(MCSection *Section, const MCExpr *Subsection) override;
void EmitLOHDirective(MCLOHType Kind, const MCLOHArgs &Args) override;
- void EmitLabel(MCSymbol *Symbol) override;
+ void EmitLabel(MCSymbol *Symbol, SMLoc Loc = SMLoc()) override;
void EmitAssemblerFlag(MCAssemblerFlag Flag) override;
void EmitLinkerOptions(ArrayRef<std::string> Options) override;
@@ -278,7 +281,8 @@ public:
void EmitWinEHHandler(const MCSymbol *Sym, bool Unwind, bool Except) override;
void EmitWinEHHandlerData() override;
- void EmitInstruction(const MCInst &Inst, const MCSubtargetInfo &STI) override;
+ void EmitInstruction(const MCInst &Inst, const MCSubtargetInfo &STI,
+ bool PrintSchedInfo) override;
void EmitBundleAlignMode(unsigned AlignPow2) override;
void EmitBundleLock(bool AlignToEnd) override;
@@ -392,12 +396,13 @@ void MCAsmStreamer::emitExplicitComments() {
void MCAsmStreamer::ChangeSection(MCSection *Section,
const MCExpr *Subsection) {
assert(Section && "Cannot switch to a null section!");
- Section->PrintSwitchToSection(*MAI, OS, Subsection);
+ Section->PrintSwitchToSection(
+ *MAI, getContext().getObjectFileInfo()->getTargetTriple(), OS,
+ Subsection);
}
-void MCAsmStreamer::EmitLabel(MCSymbol *Symbol) {
- assert(Symbol->isUndefined() && "Cannot define a symbol twice!");
- MCStreamer::EmitLabel(Symbol);
+void MCAsmStreamer::EmitLabel(MCSymbol *Symbol, SMLoc Loc) {
+ MCStreamer::EmitLabel(Symbol, Loc);
Symbol->print(OS, MAI);
OS << MAI->getLabelSuffix();
@@ -1503,7 +1508,8 @@ void MCAsmStreamer::EmitWinCFIEndProlog() {
}
void MCAsmStreamer::AddEncodingComment(const MCInst &Inst,
- const MCSubtargetInfo &STI) {
+ const MCSubtargetInfo &STI,
+ bool PrintSchedInfo) {
raw_ostream &OS = GetCommentOS();
SmallString<256> Code;
SmallVector<MCFixup, 4> Fixups;
@@ -1576,7 +1582,11 @@ void MCAsmStreamer::AddEncodingComment(const MCInst &Inst,
}
}
}
- OS << "]\n";
+ OS << "]";
+ // If we are not going to add fixup or sched comments after this point, we
+ // have to end the current comment line with "\n".
+ if (Fixups.size() || !PrintSchedInfo)
+ OS << "\n";
for (unsigned i = 0, e = Fixups.size(); i != e; ++i) {
MCFixup &F = Fixups[i];
@@ -1587,16 +1597,19 @@ void MCAsmStreamer::AddEncodingComment(const MCInst &Inst,
}
void MCAsmStreamer::EmitInstruction(const MCInst &Inst,
- const MCSubtargetInfo &STI) {
+ const MCSubtargetInfo &STI,
+ bool PrintSchedInfo) {
assert(getCurrentSectionOnly() &&
"Cannot emit contents before setting section!");
// Show the encoding in a comment if we have a code emitter.
if (Emitter)
- AddEncodingComment(Inst, STI);
+ AddEncodingComment(Inst, STI, PrintSchedInfo);
// Show the MCInst if enabled.
if (ShowInst) {
+ if (PrintSchedInfo)
+ GetCommentOS() << "\n";
Inst.dump_pretty(GetCommentOS(), InstPrinter.get(), "\n ");
GetCommentOS() << "\n";
}
@@ -1606,6 +1619,16 @@ void MCAsmStreamer::EmitInstruction(const MCInst &Inst,
else
InstPrinter->printInst(&Inst, OS, "", STI);
+ if (PrintSchedInfo) {
+ std::string SI = STI.getSchedInfoStr(Inst);
+ if (!SI.empty())
+ GetCommentOS() << SI;
+ }
+
+ StringRef Comments = CommentToEmit;
+ if (Comments.size() && Comments.back() != '\n')
+ GetCommentOS() << "\n";
+
EmitEOL();
}
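
With PrintSchedInfo enabled, each printed instruction gains a trailing scheduling comment. An illustrative line of output follows; the exact text comes from STI.getSchedInfoStr(), so the numbers here are invented:

    //   addl %esi, %edi   # sched: [1:0.25]   (latency : reciprocal throughput)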
diff --git a/lib/MC/MCAssembler.cpp b/lib/MC/MCAssembler.cpp
index 83fcec92e2b5..c2bb7b277181 100644
--- a/lib/MC/MCAssembler.cpp
+++ b/lib/MC/MCAssembler.cpp
@@ -7,36 +7,49 @@
//
//===----------------------------------------------------------------------===//
-#include "llvm/MC/MCAssembler.h"
+#include "llvm/ADT/ArrayRef.h"
+#include "llvm/ADT/SmallString.h"
+#include "llvm/ADT/SmallVector.h"
#include "llvm/ADT/Statistic.h"
-#include "llvm/ADT/StringExtras.h"
+#include "llvm/ADT/StringRef.h"
#include "llvm/ADT/Twine.h"
#include "llvm/MC/MCAsmBackend.h"
#include "llvm/MC/MCAsmInfo.h"
#include "llvm/MC/MCAsmLayout.h"
+#include "llvm/MC/MCAssembler.h"
#include "llvm/MC/MCCodeEmitter.h"
#include "llvm/MC/MCCodeView.h"
#include "llvm/MC/MCContext.h"
#include "llvm/MC/MCDwarf.h"
#include "llvm/MC/MCExpr.h"
+#include "llvm/MC/MCFixup.h"
#include "llvm/MC/MCFixupKindInfo.h"
+#include "llvm/MC/MCFragment.h"
+#include "llvm/MC/MCInst.h"
#include "llvm/MC/MCObjectWriter.h"
#include "llvm/MC/MCSection.h"
#include "llvm/MC/MCSectionELF.h"
#include "llvm/MC/MCSymbol.h"
#include "llvm/MC/MCValue.h"
+#include "llvm/Support/Casting.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/ErrorHandling.h"
#include "llvm/Support/LEB128.h"
-#include "llvm/Support/TargetRegistry.h"
+#include "llvm/Support/MathExtras.h"
#include "llvm/Support/raw_ostream.h"
+#include <cassert>
+#include <cstdint>
+#include <cstring>
#include <tuple>
+#include <utility>
+
using namespace llvm;
#define DEBUG_TYPE "assembler"
namespace {
namespace stats {
+
STATISTIC(EmittedFragments, "Number of emitted assembler fragments - total");
STATISTIC(EmittedRelaxableFragments,
"Number of emitted assembler fragments - relaxable");
@@ -55,8 +68,9 @@ STATISTIC(FragmentLayouts, "Number of fragment layouts");
STATISTIC(ObjectBytes, "Number of emitted object file bytes");
STATISTIC(RelaxationSteps, "Number of assembler layout and relaxation steps");
STATISTIC(RelaxedInstructions, "Number of relaxed instructions");
-}
-}
+
+} // end namespace stats
+} // end anonymous namespace
// FIXME FIXME FIXME: There are number of places in this file where we convert
// what is a 64-bit assembler value used for computation into a value in the
@@ -73,8 +87,7 @@ MCAssembler::MCAssembler(MCContext &Context, MCAsmBackend &Backend,
VersionMinInfo.Major = 0; // Major version == 0 for "none specified"
}
-MCAssembler::~MCAssembler() {
-}
+MCAssembler::~MCAssembler() = default;
void MCAssembler::reset() {
Sections.clear();
@@ -114,10 +127,16 @@ bool MCAssembler::isThumbFunc(const MCSymbol *Symbol) const {
if (!Symbol->isVariable())
return false;
- // FIXME: It looks like gas supports some cases of the form "foo + 2". It
- // is not clear if that is a bug or a feature.
const MCExpr *Expr = Symbol->getVariableValue();
- const MCSymbolRefExpr *Ref = dyn_cast<MCSymbolRefExpr>(Expr);
+
+ MCValue V;
+ if (!Expr->evaluateAsRelocatable(V, nullptr, nullptr))
+ return false;
+
+ if (V.getSymB() || V.getRefKind() != MCSymbolRefExpr::VK_None)
+ return false;
+
+ const MCSymbolRefExpr *Ref = V.getSymA();
if (!Ref)
return false;
@@ -219,7 +238,6 @@ bool MCAssembler::evaluateFixup(const MCAsmLayout &Layout,
Value -= Layout.getSymbolOffset(Sym);
}
-
bool ShouldAlignPC = Backend.getFixupKindInfo(Fixup.getKind()).Flags &
MCFixupKindInfo::FKF_IsAlignedDownTo32Bits;
assert((ShouldAlignPC ? IsPCRel : true) &&
@@ -641,7 +659,7 @@ std::pair<uint64_t, bool> MCAssembler::handleFixup(const MCAsmLayout &Layout,
void MCAssembler::layout(MCAsmLayout &Layout) {
DEBUG_WITH_TYPE("mc-dump", {
- llvm::errs() << "assembler backend - pre-layout\n--\n";
+ errs() << "assembler backend - pre-layout\n--\n";
dump(); });
// Create dummy fragments and assign section ordinals.
@@ -671,14 +689,14 @@ void MCAssembler::layout(MCAsmLayout &Layout) {
return;
DEBUG_WITH_TYPE("mc-dump", {
- llvm::errs() << "assembler backend - post-relaxation\n--\n";
+ errs() << "assembler backend - post-relaxation\n--\n";
dump(); });
// Finalize the layout, including fragment lowering.
finishLayout(Layout);
DEBUG_WITH_TYPE("mc-dump", {
- llvm::errs() << "assembler backend - final-layout\n--\n";
+ errs() << "assembler backend - final-layout\n--\n";
dump(); });
// Allow the object writer a chance to perform post-layout binding (for
@@ -714,8 +732,8 @@ void MCAssembler::layout(MCAsmLayout &Layout) {
uint64_t FixedValue;
bool IsPCRel;
std::tie(FixedValue, IsPCRel) = handleFixup(Layout, Frag, Fixup);
- getBackend().applyFixup(Fixup, Contents.data(),
- Contents.size(), FixedValue, IsPCRel);
+ getBackend().applyFixup(Fixup, Contents.data(), Contents.size(),
+ FixedValue, IsPCRel, getContext());
}
}
}
@@ -741,6 +759,10 @@ bool MCAssembler::fixupNeedsRelaxation(const MCFixup &Fixup,
MCValue Target;
uint64_t Value;
bool Resolved = evaluateFixup(Layout, Fixup, DF, Target, Value);
+ if (Target.getSymA() &&
+ Target.getSymA()->getKind() == MCSymbolRefExpr::VK_X86_ABS8 &&
+ Fixup.getKind() == FK_Data_1)
+ return false;
return getBackend().fixupNeedsRelaxationAdvanced(Fixup, Resolved, Value, DF,
Layout);
}
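
The new early return above exempts one case from relaxation: a one-byte data fixup whose target symbol carries the X86 ABS8 variant (spelled something like `sym@ABS8` in assembly, per the getVariantKindName/getVariantKindForName entries added below) is known to fit in 8 bits, so the backend is never asked to widen it. A hedged sketch of the guard in isolation, with stand-in enums:

    // Sketch only; mirrors the shape of the guard, not the real MC types.
    enum VariantKind { VK_None, VK_X86_ABS8 };
    enum FixupKind { FK_Data_1, FK_Data_4 };

    static bool mayNeedRelaxation(bool HasSymA, VariantKind SymAKind,
                                  FixupKind Kind) {
      if (HasSymA && SymAKind == VK_X86_ABS8 && Kind == FK_Data_1)
        return false;                   // @ABS8 + 1-byte fixup always fits
      return true;                      // otherwise defer to the backend
    }
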
diff --git a/lib/MC/MCCodeEmitter.cpp b/lib/MC/MCCodeEmitter.cpp
index c122763b2fe5..ca69478ed10d 100644
--- a/lib/MC/MCCodeEmitter.cpp
+++ b/lib/MC/MCCodeEmitter.cpp
@@ -1,4 +1,4 @@
-//===-- MCCodeEmitter.cpp - Instruction Encoding --------------------------===//
+//===- MCCodeEmitter.cpp - Instruction Encoding ---------------------------===//
//
// The LLVM Compiler Infrastructure
//
@@ -11,8 +11,6 @@
using namespace llvm;
-MCCodeEmitter::MCCodeEmitter() {
-}
+MCCodeEmitter::MCCodeEmitter() = default;
-MCCodeEmitter::~MCCodeEmitter() {
-}
+MCCodeEmitter::~MCCodeEmitter() = default;
diff --git a/lib/MC/MCContext.cpp b/lib/MC/MCContext.cpp
index 4798991ceed6..4628d0ab88f3 100644
--- a/lib/MC/MCContext.cpp
+++ b/lib/MC/MCContext.cpp
@@ -7,30 +7,43 @@
//
//===----------------------------------------------------------------------===//
-#include "llvm/MC/MCContext.h"
#include "llvm/ADT/SmallString.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/ADT/StringMap.h"
+#include "llvm/ADT/StringRef.h"
#include "llvm/ADT/Twine.h"
#include "llvm/MC/MCAsmInfo.h"
-#include "llvm/MC/MCAssembler.h"
#include "llvm/MC/MCCodeView.h"
+#include "llvm/MC/MCContext.h"
#include "llvm/MC/MCDwarf.h"
+#include "llvm/MC/MCExpr.h"
+#include "llvm/MC/MCFragment.h"
#include "llvm/MC/MCLabel.h"
#include "llvm/MC/MCObjectFileInfo.h"
-#include "llvm/MC/MCRegisterInfo.h"
#include "llvm/MC/MCSectionCOFF.h"
#include "llvm/MC/MCSectionELF.h"
#include "llvm/MC/MCSectionMachO.h"
+#include "llvm/MC/MCSectionWasm.h"
#include "llvm/MC/MCStreamer.h"
+#include "llvm/MC/MCSymbol.h"
#include "llvm/MC/MCSymbolCOFF.h"
#include "llvm/MC/MCSymbolELF.h"
#include "llvm/MC/MCSymbolMachO.h"
+#include "llvm/MC/MCSymbolWasm.h"
+#include "llvm/MC/SectionKind.h"
+#include "llvm/Support/Casting.h"
#include "llvm/Support/COFF.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/ELF.h"
#include "llvm/Support/ErrorHandling.h"
#include "llvm/Support/MemoryBuffer.h"
+#include "llvm/Support/raw_ostream.h"
#include "llvm/Support/Signals.h"
#include "llvm/Support/SourceMgr.h"
+#include <cassert>
+#include <cstdlib>
+#include <tuple>
+#include <utility>
using namespace llvm;
@@ -40,19 +53,14 @@ AsSecureLogFileName("as-secure-log-file-name",
"AS_SECURE_LOG_FILE env variable)"),
cl::init(getenv("AS_SECURE_LOG_FILE")), cl::Hidden);
-
MCContext::MCContext(const MCAsmInfo *mai, const MCRegisterInfo *mri,
const MCObjectFileInfo *mofi, const SourceMgr *mgr,
bool DoAutoReset)
- : SrcMgr(mgr), MAI(mai), MRI(mri), MOFI(mofi), Allocator(),
+ : SrcMgr(mgr), InlineSrcMgr(nullptr), MAI(mai), MRI(mri), MOFI(mofi),
Symbols(Allocator), UsedNames(Allocator),
- CurrentDwarfLoc(0, 0, 0, DWARF2_FLAG_IS_STMT, 0, 0), DwarfLocSeen(false),
- GenDwarfForAssembly(false), GenDwarfFileNumber(0), DwarfVersion(4),
- AllowTemporaryLabels(true), DwarfCompileUnitID(0),
- AutoReset(DoAutoReset), HadError(false) {
+ CurrentDwarfLoc(0, 0, 0, DWARF2_FLAG_IS_STMT, 0, 0),
+ AutoReset(DoAutoReset) {
SecureLogFile = AsSecureLogFileName;
- SecureLog = nullptr;
- SecureLogUsed = false;
if (SrcMgr && SrcMgr->getNumBuffers())
MainFileName =
@@ -80,7 +88,6 @@ void MCContext::reset() {
MCSubtargetAllocator.DestroyAll();
UsedNames.clear();
Symbols.clear();
- SectionSymbols.clear();
Allocator.Reset();
Instances.clear();
CompilationDir.clear();
@@ -124,18 +131,6 @@ MCSymbol *MCContext::getOrCreateSymbol(const Twine &Name) {
return Sym;
}
-MCSymbolELF *MCContext::getOrCreateSectionSymbol(const MCSectionELF &Section) {
- MCSymbol *&Sym = SectionSymbols[&Section];
- if (Sym)
- return cast<MCSymbolELF>(Sym);
-
- StringRef Name = Section.getSectionName();
- auto NameIter = UsedNames.insert(std::make_pair(Name, false)).first;
- Sym = new (&*NameIter, *this) MCSymbolELF(&*NameIter, /*isTemporary*/ false);
-
- return cast<MCSymbolELF>(Sym);
-}
-
MCSymbol *MCContext::getOrCreateFrameAllocSymbol(StringRef FuncName,
unsigned Idx) {
return getOrCreateSymbol(Twine(MAI->getPrivateGlobalPrefix()) + FuncName +
@@ -162,6 +157,8 @@ MCSymbol *MCContext::createSymbolImpl(const StringMapEntry<bool> *Name,
return new (Name, *this) MCSymbolELF(Name, IsTemporary);
case MCObjectFileInfo::IsMachO:
return new (Name, *this) MCSymbolMachO(Name, IsTemporary);
+ case MCObjectFileInfo::IsWasm:
+ return new (Name, *this) MCSymbolWasm(Name, IsTemporary);
}
}
return new (Name, *this) MCSymbol(MCSymbol::SymbolKindUnset, Name,
@@ -182,7 +179,7 @@ MCSymbol *MCContext::createSymbol(StringRef Name, bool AlwaysAddSuffix,
SmallString<128> NewName = Name;
bool AddSuffix = AlwaysAddSuffix;
unsigned &NextUniqueID = NextID[Name];
- for (;;) {
+ while (true) {
if (AddSuffix) {
NewName.resize(Name.size());
raw_svector_ostream(NewName) << NextUniqueID++;
@@ -275,7 +272,6 @@ MCSectionMachO *MCContext::getMachOSection(StringRef Segment, StringRef Section,
unsigned TypeAndAttributes,
unsigned Reserved2, SectionKind Kind,
const char *BeginSymName) {
-
// We unique sections by their segment/section pair. The returned section
// may not have the same flags as the requested section; if so, the client
// should diagnose this as an error.
@@ -316,18 +312,53 @@ void MCContext::renameELFSection(MCSectionELF *Section, StringRef Name) {
const_cast<MCSectionELF *>(Section)->setSectionName(CachedName);
}
+MCSectionELF *MCContext::createELFSectionImpl(StringRef Section, unsigned Type,
+ unsigned Flags, SectionKind K,
+ unsigned EntrySize,
+ const MCSymbolELF *Group,
+ unsigned UniqueID,
+ const MCSymbolELF *Associated) {
+ MCSymbolELF *R;
+ MCSymbol *&Sym = Symbols[Section];
+  // A section symbol cannot redefine regular symbols. There may be multiple
+ // sections with the same name, in which case the first such section wins.
+ if (Sym && Sym->isDefined() &&
+ (!Sym->isInSection() || Sym->getSection().getBeginSymbol() != Sym))
+ reportError(SMLoc(), "invalid symbol redefinition");
+ if (Sym && Sym->isUndefined()) {
+ R = cast<MCSymbolELF>(Sym);
+ } else {
+ auto NameIter = UsedNames.insert(std::make_pair(Section, false)).first;
+ R = new (&*NameIter, *this) MCSymbolELF(&*NameIter, /*isTemporary*/ false);
+ if (!Sym)
+ Sym = R;
+ }
+ R->setBinding(ELF::STB_LOCAL);
+ R->setType(ELF::STT_SECTION);
+
+ auto *Ret = new (ELFAllocator.Allocate()) MCSectionELF(
+ Section, Type, Flags, K, EntrySize, Group, UniqueID, R, Associated);
+
+ auto *F = new MCDataFragment();
+ Ret->getFragmentList().insert(Ret->begin(), F);
+ F->setParent(Ret);
+ R->setFragment(F);
+
+ return Ret;
+}
+
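
createELFSectionImpl encodes a small naming policy: an undefined symbol with the section's name is adopted as the section symbol, a defined regular symbol of that name is a redefinition error, and when several sections share a name the first one keeps the symbol-table slot. A self-contained sketch of that policy (std::map standing in for the Symbols table; not the real MCContext code):

    #include <map>
    #include <string>

    struct Sym { bool Defined = false; bool IsSectionSym = false; };

    static Sym *takeSectionSymbol(std::map<std::string, Sym *> &Symbols,
                                  const std::string &Name, bool &Error) {
      Sym *&S = Symbols[Name];
      if (S && S->Defined && !S->IsSectionSym) {
        Error = true;                   // a regular symbol already owns the name
        return nullptr;
      }
      if (S && !S->Defined)
        return S;                       // adopt the forward reference
      Sym *R = new Sym();               // fresh symbol
      R->IsSectionSym = true;
      if (!S)
        S = R;                          // first section with this name wins
      return R;
    }
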
MCSectionELF *MCContext::createELFRelSection(const Twine &Name, unsigned Type,
unsigned Flags, unsigned EntrySize,
const MCSymbolELF *Group,
- const MCSectionELF *Associated) {
+ const MCSectionELF *RelInfoSection) {
StringMap<bool>::iterator I;
bool Inserted;
std::tie(I, Inserted) =
- ELFRelSecNames.insert(std::make_pair(Name.str(), true));
+ RelSecNames.insert(std::make_pair(Name.str(), true));
- return new (ELFAllocator.Allocate())
- MCSectionELF(I->getKey(), Type, Flags, SectionKind::getReadOnly(),
- EntrySize, Group, true, nullptr, Associated);
+ return createELFSectionImpl(
+ I->getKey(), Type, Flags, SectionKind::getReadOnly(), EntrySize, Group,
+ true, cast<MCSymbolELF>(RelInfoSection->getBeginSymbol()));
}
MCSectionELF *MCContext::getELFNamedSection(const Twine &Prefix,
@@ -340,21 +371,20 @@ MCSectionELF *MCContext::getELFNamedSection(const Twine &Prefix,
MCSectionELF *MCContext::getELFSection(const Twine &Section, unsigned Type,
unsigned Flags, unsigned EntrySize,
const Twine &Group, unsigned UniqueID,
- const char *BeginSymName) {
+ const MCSymbolELF *Associated) {
MCSymbolELF *GroupSym = nullptr;
if (!Group.isTriviallyEmpty() && !Group.str().empty())
GroupSym = cast<MCSymbolELF>(getOrCreateSymbol(Group));
return getELFSection(Section, Type, Flags, EntrySize, GroupSym, UniqueID,
- BeginSymName, nullptr);
+ Associated);
}
MCSectionELF *MCContext::getELFSection(const Twine &Section, unsigned Type,
unsigned Flags, unsigned EntrySize,
const MCSymbolELF *GroupSym,
unsigned UniqueID,
- const char *BeginSymName,
- const MCSectionELF *Associated) {
+ const MCSymbolELF *Associated) {
StringRef Group = "";
if (GroupSym)
Group = GroupSym->getName();
@@ -375,22 +405,16 @@ MCSectionELF *MCContext::getELFSection(const Twine &Section, unsigned Type,
else
Kind = SectionKind::getReadOnly();
- MCSymbol *Begin = nullptr;
- if (BeginSymName)
- Begin = createTempSymbol(BeginSymName, false);
-
- MCSectionELF *Result = new (ELFAllocator.Allocate())
- MCSectionELF(CachedName, Type, Flags, Kind, EntrySize, GroupSym, UniqueID,
- Begin, Associated);
+ MCSectionELF *Result = createELFSectionImpl(
+ CachedName, Type, Flags, Kind, EntrySize, GroupSym, UniqueID, Associated);
Entry.second = Result;
return Result;
}
MCSectionELF *MCContext::createELFGroupSection(const MCSymbolELF *Group) {
- MCSectionELF *Result = new (ELFAllocator.Allocate())
- MCSectionELF(".group", ELF::SHT_GROUP, 0, SectionKind::getReadOnly(), 4,
- Group, ~0, nullptr, nullptr);
- return Result;
+ return createELFSectionImpl(".group", ELF::SHT_GROUP, 0,
+ SectionKind::getReadOnly(), 4, Group, ~0,
+ nullptr);
}
MCSectionCOFF *MCContext::getCOFFSection(StringRef Section,
@@ -462,6 +486,80 @@ MCSectionCOFF *MCContext::getAssociativeCOFFSection(MCSectionCOFF *Sec,
"", 0, UniqueID);
}
+void MCContext::renameWasmSection(MCSectionWasm *Section, StringRef Name) {
+ StringRef GroupName;
+ assert(!Section->getGroup() && "not yet implemented");
+
+ unsigned UniqueID = Section->getUniqueID();
+ WasmUniquingMap.erase(
+ WasmSectionKey{Section->getSectionName(), GroupName, UniqueID});
+ auto I = WasmUniquingMap.insert(std::make_pair(
+ WasmSectionKey{Name, GroupName, UniqueID},
+ Section))
+ .first;
+ StringRef CachedName = I->first.SectionName;
+ const_cast<MCSectionWasm *>(Section)->setSectionName(CachedName);
+}
+
+MCSectionWasm *MCContext::createWasmRelSection(const Twine &Name, unsigned Type,
+ unsigned Flags,
+ const MCSymbolWasm *Group) {
+ StringMap<bool>::iterator I;
+ bool Inserted;
+ std::tie(I, Inserted) =
+ RelSecNames.insert(std::make_pair(Name.str(), true));
+
+ return new (WasmAllocator.Allocate())
+ MCSectionWasm(I->getKey(), Type, Flags, SectionKind::getReadOnly(),
+ Group, ~0, nullptr);
+}
+
+MCSectionWasm *MCContext::getWasmNamedSection(const Twine &Prefix,
+ const Twine &Suffix, unsigned Type,
+ unsigned Flags) {
+ return getWasmSection(Prefix + "." + Suffix, Type, Flags, Suffix);
+}
+
+MCSectionWasm *MCContext::getWasmSection(const Twine &Section, unsigned Type,
+ unsigned Flags,
+ const Twine &Group, unsigned UniqueID,
+ const char *BeginSymName) {
+ MCSymbolWasm *GroupSym = nullptr;
+ if (!Group.isTriviallyEmpty() && !Group.str().empty())
+ GroupSym = cast<MCSymbolWasm>(getOrCreateSymbol(Group));
+
+ return getWasmSection(Section, Type, Flags, GroupSym, UniqueID, BeginSymName);
+}
+
+MCSectionWasm *MCContext::getWasmSection(const Twine &Section, unsigned Type,
+ unsigned Flags,
+ const MCSymbolWasm *GroupSym,
+ unsigned UniqueID,
+ const char *BeginSymName) {
+ StringRef Group = "";
+ if (GroupSym)
+ Group = GroupSym->getName();
+ // Do the lookup, if we have a hit, return it.
+ auto IterBool = WasmUniquingMap.insert(
+ std::make_pair(WasmSectionKey{Section.str(), Group, UniqueID}, nullptr));
+ auto &Entry = *IterBool.first;
+ if (!IterBool.second)
+ return Entry.second;
+
+ StringRef CachedName = Entry.first.SectionName;
+
+ SectionKind Kind = SectionKind::getText();
+
+ MCSymbol *Begin = nullptr;
+ if (BeginSymName)
+ Begin = createTempSymbol(BeginSymName, false);
+
+ MCSectionWasm *Result = new (WasmAllocator.Allocate())
+ MCSectionWasm(CachedName, Type, Flags, Kind, GroupSym, UniqueID, Begin);
+ Entry.second = Result;
+ return Result;
+}
+
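
getWasmSection uses the same "insert first, fill on miss" idiom as the ELF path: insert a null placeholder keyed by (name, group, unique-id), and only construct the section when the insertion actually happened, so a single map lookup serves both the hit and the miss. The idiom in isolation (std::map standing in for WasmUniquingMap):

    #include <map>
    #include <string>
    #include <utility>

    struct Section { std::string Name; };

    // Hedged sketch of the uniquing idiom used above.
    static Section *getOrCreate(std::map<std::string, Section *> &M,
                                const std::string &Key) {
      auto IterBool = M.insert(std::make_pair(Key, nullptr));
      if (!IterBool.second)
        return IterBool.first->second;  // hit: reuse the cached section
      auto *S = new Section{Key};       // miss: create and cache it
      IterBool.first->second = S;
      return S;
    }
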
MCSubtargetInfo &MCContext::getSubtargetCopy(const MCSubtargetInfo &STI) {
return *new (MCSubtargetAllocator.Allocate()) MCSubtargetInfo(STI);
}
@@ -510,13 +608,15 @@ CodeViewContext &MCContext::getCVContext() {
void MCContext::reportError(SMLoc Loc, const Twine &Msg) {
HadError = true;
- // If we have a source manager use it. Otherwise just use the generic
- // report_fatal_error().
- if (!SrcMgr)
+  // If we have a source manager, use it. Otherwise, try the inline source
+  // manager. If that also fails, use the generic report_fatal_error().
+ if (SrcMgr)
+ SrcMgr->PrintMessage(Loc, SourceMgr::DK_Error, Msg);
+ else if (InlineSrcMgr)
+ InlineSrcMgr->PrintMessage(Loc, SourceMgr::DK_Error, Msg);
+ else
report_fatal_error(Msg, false);
-
- // Use the source manager to print the message.
- SrcMgr->PrintMessage(Loc, SourceMgr::DK_Error, Msg);
}
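
reportError now routes diagnostics through a three-way fallback: the primary source manager, then the inline-asm source manager, then a fatal error. Sketched with stand-in types:

    #include <cstdio>
    #include <cstdlib>

    struct SrcMgrLike {
      void print(const char *M) { std::fprintf(stderr, "error: %s\n", M); }
    };

    // Hedged sketch of the fallback order above.
    static void report(SrcMgrLike *SrcMgr, SrcMgrLike *InlineSrcMgr,
                       const char *Msg) {
      if (SrcMgr)
        SrcMgr->print(Msg);             // preferred: primary source manager
      else if (InlineSrcMgr)
        InlineSrcMgr->print(Msg);       // fallback: inline-asm source manager
      else {
        std::fprintf(stderr, "fatal: %s\n", Msg);
        std::abort();                   // last resort, like report_fatal_error
      }
    }
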
void MCContext::reportFatalError(SMLoc Loc, const Twine &Msg) {
diff --git a/lib/MC/MCDisassembler/MCDisassembler.cpp b/lib/MC/MCDisassembler/MCDisassembler.cpp
index 3a4f7382bd3c..2f1275d00b86 100644
--- a/lib/MC/MCDisassembler/MCDisassembler.cpp
+++ b/lib/MC/MCDisassembler/MCDisassembler.cpp
@@ -1,4 +1,4 @@
-//===-- MCDisassembler.cpp - Disassembler interface -----------------------===//
+//===- MCDisassembler.cpp - Disassembler interface ------------------------===//
//
// The LLVM Compiler Infrastructure
//
@@ -8,13 +8,12 @@
//===----------------------------------------------------------------------===//
#include "llvm/MC/MCDisassembler/MCDisassembler.h"
-#include "llvm/MC/MCDisassembler/MCExternalSymbolizer.h"
#include "llvm/Support/raw_ostream.h"
+#include <algorithm>
using namespace llvm;
-MCDisassembler::~MCDisassembler() {
-}
+MCDisassembler::~MCDisassembler() = default;
bool MCDisassembler::tryAddingSymbolicOperand(MCInst &Inst, int64_t Value,
uint64_t Address, bool IsBranch,
diff --git a/lib/MC/MCDisassembler/MCRelocationInfo.cpp b/lib/MC/MCDisassembler/MCRelocationInfo.cpp
index 1612562497d9..5805fd7007d2 100644
--- a/lib/MC/MCDisassembler/MCRelocationInfo.cpp
+++ b/lib/MC/MCDisassembler/MCRelocationInfo.cpp
@@ -1,4 +1,4 @@
-//==-- MCRelocationInfo.cpp ------------------------------------------------==//
+//===-- MCRelocationInfo.cpp ----------------------------------------------===//
//
// The LLVM Compiler Infrastructure
//
@@ -8,17 +8,14 @@
//===----------------------------------------------------------------------===//
#include "llvm/MC/MCDisassembler/MCRelocationInfo.h"
-#include "llvm-c/Disassembler.h"
#include "llvm/Support/TargetRegistry.h"
+#include "llvm-c/Disassembler.h"
using namespace llvm;
-MCRelocationInfo::MCRelocationInfo(MCContext &Ctx)
- : Ctx(Ctx) {
-}
+MCRelocationInfo::MCRelocationInfo(MCContext &Ctx) : Ctx(Ctx) {}
-MCRelocationInfo::~MCRelocationInfo() {
-}
+MCRelocationInfo::~MCRelocationInfo() = default;
const MCExpr *
MCRelocationInfo::createExprForCAPIVariantKind(const MCExpr *SubExpr,
diff --git a/lib/MC/MCDisassembler/MCSymbolizer.cpp b/lib/MC/MCDisassembler/MCSymbolizer.cpp
index c0f707d356c1..78e611e3ddda 100644
--- a/lib/MC/MCDisassembler/MCSymbolizer.cpp
+++ b/lib/MC/MCDisassembler/MCSymbolizer.cpp
@@ -1,4 +1,4 @@
-//===-- llvm/MC/MCSymbolizer.cpp - MCSymbolizer class -----------*- C++ -*-===//
+//===-- llvm/MC/MCSymbolizer.cpp - MCSymbolizer class ---------------------===//
//
// The LLVM Compiler Infrastructure
//
@@ -11,5 +11,4 @@
using namespace llvm;
-MCSymbolizer::~MCSymbolizer() {
-}
+MCSymbolizer::~MCSymbolizer() = default;
diff --git a/lib/MC/MCDwarf.cpp b/lib/MC/MCDwarf.cpp
index a7551a3283a3..cc32e90ad36e 100644
--- a/lib/MC/MCDwarf.cpp
+++ b/lib/MC/MCDwarf.cpp
@@ -7,27 +7,41 @@
//
//===----------------------------------------------------------------------===//
-#include "llvm/MC/MCDwarf.h"
+#include "llvm/ADT/ArrayRef.h"
+#include "llvm/ADT/DenseMap.h"
#include "llvm/ADT/Hashing.h"
-#include "llvm/ADT/STLExtras.h"
+#include "llvm/ADT/None.h"
#include "llvm/ADT/SmallString.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/ADT/STLExtras.h"
+#include "llvm/ADT/StringRef.h"
#include "llvm/ADT/Twine.h"
#include "llvm/Config/config.h"
#include "llvm/MC/MCAsmInfo.h"
#include "llvm/MC/MCContext.h"
+#include "llvm/MC/MCDwarf.h"
#include "llvm/MC/MCExpr.h"
#include "llvm/MC/MCObjectFileInfo.h"
#include "llvm/MC/MCObjectStreamer.h"
#include "llvm/MC/MCRegisterInfo.h"
#include "llvm/MC/MCSection.h"
+#include "llvm/MC/MCStreamer.h"
#include "llvm/MC/MCSymbol.h"
-#include "llvm/Support/Debug.h"
+#include "llvm/Support/Casting.h"
+#include "llvm/Support/Dwarf.h"
+#include "llvm/Support/Endian.h"
#include "llvm/Support/EndianStream.h"
#include "llvm/Support/ErrorHandling.h"
#include "llvm/Support/LEB128.h"
+#include "llvm/Support/MathExtras.h"
#include "llvm/Support/Path.h"
#include "llvm/Support/SourceMgr.h"
#include "llvm/Support/raw_ostream.h"
+#include <cassert>
+#include <cstdint>
+#include <string>
+#include <utility>
+#include <vector>
using namespace llvm;
@@ -592,7 +606,6 @@ static void EmitGenDwarfAranges(MCStreamer *MCOS,
// And the pair of terminating zeros.
Length += 2 * AddrSize;
-
// Emit the header for this section.
// The 4 byte length not including the 4 byte value for the length.
MCOS->EmitIntValue(Length - 4, 4);
@@ -661,7 +674,14 @@ static void EmitGenDwarfInfo(MCStreamer *MCOS,
// The 2 byte DWARF version.
MCOS->EmitIntValue(context.getDwarfVersion(), 2);
+ // The DWARF v5 header has unit type, address size, abbrev offset.
+ // Earlier versions have abbrev offset, address size.
const MCAsmInfo &AsmInfo = *context.getAsmInfo();
+ int AddrSize = AsmInfo.getPointerSize();
+ if (context.getDwarfVersion() >= 5) {
+ MCOS->EmitIntValue(dwarf::DW_UT_compile, 1);
+ MCOS->EmitIntValue(AddrSize, 1);
+ }
// The 4 byte offset to the debug abbrevs from the start of the .debug_abbrev,
// it is at the start of that section so this is zero.
if (AbbrevSectionSymbol == nullptr)
@@ -669,11 +689,8 @@ static void EmitGenDwarfInfo(MCStreamer *MCOS,
else
MCOS->EmitSymbolValue(AbbrevSectionSymbol, 4,
AsmInfo.needsDwarfSectionOffsetDirective());
-
- const MCAsmInfo *asmInfo = context.getAsmInfo();
- int AddrSize = asmInfo->getPointerSize();
- // The 1 byte size of an address.
- MCOS->EmitIntValue(AddrSize, 1);
+ if (context.getDwarfVersion() <= 4)
+ MCOS->EmitIntValue(AddrSize, 1);
// Second part: the compile_unit DIE.
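
The reordering above matches the unit-header layouts in the standard: DWARF v2-v4 compile units put the abbrev offset before the address size, while v5 inserts a unit-type byte and moves the address size ahead of the offset. A hedged, self-contained byte-emitter showing the two shapes (length field omitted; DW_UT_compile is 0x01 in DWARF v5):

    #include <cstdint>
    #include <vector>

    // Sketch: emit a little-endian CU header body after the length field.
    static void emitCUHeader(std::vector<uint8_t> &Out, unsigned Version,
                             uint8_t AddrSize, uint32_t AbbrevOff) {
      auto Emit32 = [&Out](uint32_t V) {
        for (int I = 0; I < 4; ++I)
          Out.push_back(uint8_t(V >> (8 * I)));
      };
      Out.push_back(uint8_t(Version));  // 2-byte version
      Out.push_back(uint8_t(Version >> 8));
      if (Version >= 5) {
        Out.push_back(0x01);            // DW_UT_compile
        Out.push_back(AddrSize);        // address size comes first in v5
        Emit32(AbbrevOff);
      } else {
        Emit32(AbbrevOff);              // v2-v4: abbrev offset first,
        Out.push_back(AddrSize);        // then address size
      }
    }
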
@@ -885,7 +902,7 @@ void MCGenDwarfInfo::Emit(MCStreamer *MCOS) {
}
}
- assert((RangesSectionSymbol != NULL) || !UseRangesSection);
+ assert((RangesSectionSymbol != nullptr) || !UseRangesSection);
MCOS->SwitchSection(context.getObjectFileInfo()->getDwarfARangesSection());
@@ -1003,6 +1020,7 @@ static void EmitPersonality(MCStreamer &streamer, const MCSymbol &symbol,
}
namespace {
+
class FrameEmitterImpl {
int CFAOffset = 0;
int InitialCFAOffset = 0;
@@ -1050,10 +1068,10 @@ void FrameEmitterImpl::EmitCFIInstruction(const MCCFIInstruction &Instr) {
Streamer.EmitULEB128IntValue(Reg2);
return;
}
- case MCCFIInstruction::OpWindowSave: {
+ case MCCFIInstruction::OpWindowSave:
Streamer.EmitIntValue(dwarf::DW_CFA_GNU_window_save, 1);
return;
- }
+
case MCCFIInstruction::OpUndefined: {
unsigned Reg = Instr.getRegister();
Streamer.EmitIntValue(dwarf::DW_CFA_undefined, 1);
@@ -1087,7 +1105,6 @@ void FrameEmitterImpl::EmitCFIInstruction(const MCCFIInstruction &Instr) {
return;
}
-
case MCCFIInstruction::OpDefCfaRegister: {
unsigned Reg = Instr.getRegister();
if (!IsEH)
@@ -1097,7 +1114,6 @@ void FrameEmitterImpl::EmitCFIInstruction(const MCCFIInstruction &Instr) {
return;
}
-
case MCCFIInstruction::OpOffset:
case MCCFIInstruction::OpRelOffset: {
const bool IsRelative =
@@ -1145,11 +1161,11 @@ void FrameEmitterImpl::EmitCFIInstruction(const MCCFIInstruction &Instr) {
Streamer.EmitIntValue(dwarf::DW_CFA_restore | Reg, 1);
return;
}
- case MCCFIInstruction::OpGnuArgsSize: {
+ case MCCFIInstruction::OpGnuArgsSize:
Streamer.EmitIntValue(dwarf::DW_CFA_GNU_args_size, 1);
Streamer.EmitULEB128IntValue(Instr.getOffset());
return;
- }
+
case MCCFIInstruction::OpEscape:
Streamer.EmitBytes(Instr.getValues());
return;
@@ -1444,10 +1460,12 @@ void FrameEmitterImpl::EmitFDE(const MCSymbol &cieStart,
}
namespace {
+
struct CIEKey {
static const CIEKey getEmptyKey() {
return CIEKey(nullptr, 0, -1, false, false);
}
+
static const CIEKey getTombstoneKey() {
return CIEKey(nullptr, -1, 0, false, false);
}
@@ -1457,23 +1475,28 @@ struct CIEKey {
: Personality(Personality), PersonalityEncoding(PersonalityEncoding),
LsdaEncoding(LsdaEncoding), IsSignalFrame(IsSignalFrame),
IsSimple(IsSimple) {}
+
const MCSymbol *Personality;
unsigned PersonalityEncoding;
unsigned LsdaEncoding;
bool IsSignalFrame;
bool IsSimple;
};
-} // anonymous namespace
+
+} // end anonymous namespace
namespace llvm {
+
template <> struct DenseMapInfo<CIEKey> {
static CIEKey getEmptyKey() { return CIEKey::getEmptyKey(); }
static CIEKey getTombstoneKey() { return CIEKey::getTombstoneKey(); }
+
static unsigned getHashValue(const CIEKey &Key) {
return static_cast<unsigned>(
hash_combine(Key.Personality, Key.PersonalityEncoding, Key.LsdaEncoding,
Key.IsSignalFrame, Key.IsSimple));
}
+
static bool isEqual(const CIEKey &LHS, const CIEKey &RHS) {
return LHS.Personality == RHS.Personality &&
LHS.PersonalityEncoding == RHS.PersonalityEncoding &&
@@ -1482,7 +1505,8 @@ template <> struct DenseMapInfo<CIEKey> {
LHS.IsSimple == RHS.IsSimple;
}
};
-} // namespace llvm
+
+} // end namespace llvm
void MCDwarfFrameEmitter::Emit(MCObjectStreamer &Streamer, MCAsmBackend *MAB,
bool IsEH) {
diff --git a/lib/MC/MCELFObjectTargetWriter.cpp b/lib/MC/MCELFObjectTargetWriter.cpp
index de645cac7370..68fb5e7cbb3d 100644
--- a/lib/MC/MCELFObjectTargetWriter.cpp
+++ b/lib/MC/MCELFObjectTargetWriter.cpp
@@ -7,10 +7,7 @@
//
//===----------------------------------------------------------------------===//
-#include "llvm/ADT/STLExtras.h"
#include "llvm/MC/MCELFObjectWriter.h"
-#include "llvm/MC/MCExpr.h"
-#include "llvm/MC/MCValue.h"
using namespace llvm;
diff --git a/lib/MC/MCELFStreamer.cpp b/lib/MC/MCELFStreamer.cpp
index 0ef1b2a8bdca..c8e0223c0573 100644
--- a/lib/MC/MCELFStreamer.cpp
+++ b/lib/MC/MCELFStreamer.cpp
@@ -11,30 +11,31 @@
//
//===----------------------------------------------------------------------===//
-#include "llvm/MC/MCELFStreamer.h"
-#include "llvm/ADT/STLExtras.h"
-#include "llvm/ADT/SmallPtrSet.h"
+#include "llvm/ADT/SmallString.h"
+#include "llvm/ADT/SmallVector.h"
#include "llvm/MC/MCAsmBackend.h"
-#include "llvm/MC/MCAsmLayout.h"
#include "llvm/MC/MCAsmInfo.h"
#include "llvm/MC/MCAssembler.h"
#include "llvm/MC/MCCodeEmitter.h"
#include "llvm/MC/MCContext.h"
+#include "llvm/MC/MCELFStreamer.h"
#include "llvm/MC/MCExpr.h"
-#include "llvm/MC/MCInst.h"
+#include "llvm/MC/MCFixup.h"
+#include "llvm/MC/MCFragment.h"
#include "llvm/MC/MCObjectFileInfo.h"
-#include "llvm/MC/MCObjectStreamer.h"
#include "llvm/MC/MCObjectWriter.h"
#include "llvm/MC/MCSection.h"
#include "llvm/MC/MCSectionELF.h"
+#include "llvm/MC/MCStreamer.h"
#include "llvm/MC/MCSymbolELF.h"
#include "llvm/MC/MCSymbol.h"
-#include "llvm/MC/MCValue.h"
-#include "llvm/Support/Debug.h"
+#include "llvm/Support/Casting.h"
#include "llvm/Support/ELF.h"
#include "llvm/Support/ErrorHandling.h"
#include "llvm/Support/TargetRegistry.h"
#include "llvm/Support/raw_ostream.h"
+#include <cassert>
+#include <cstdint>
using namespace llvm;
@@ -42,9 +43,6 @@ bool MCELFStreamer::isBundleLocked() const {
return getCurrentSectionOnly()->isBundleLocked();
}
-MCELFStreamer::~MCELFStreamer() {
-}
-
void MCELFStreamer::mergeFragment(MCDataFragment *DF,
MCDataFragment *EF) {
MCAssembler &Assembler = getAssembler();
@@ -95,11 +93,19 @@ void MCELFStreamer::InitSections(bool NoExecStack) {
SwitchSection(Ctx.getAsmInfo()->getNonexecutableStackSection(Ctx));
}
-void MCELFStreamer::EmitLabel(MCSymbol *S) {
+void MCELFStreamer::EmitLabel(MCSymbol *S, SMLoc Loc) {
auto *Symbol = cast<MCSymbolELF>(S);
- assert(Symbol->isUndefined() && "Cannot define a symbol twice!");
+ MCObjectStreamer::EmitLabel(Symbol, Loc);
+
+ const MCSectionELF &Section =
+ static_cast<const MCSectionELF &>(*getCurrentSectionOnly());
+ if (Section.getFlags() & ELF::SHF_TLS)
+ Symbol->setType(ELF::STT_TLS);
+}
- MCObjectStreamer::EmitLabel(Symbol);
+void MCELFStreamer::EmitLabel(MCSymbol *S, SMLoc Loc, MCFragment *F) {
+ auto *Symbol = cast<MCSymbolELF>(S);
+ MCObjectStreamer::EmitLabel(Symbol, Loc, F);
const MCSectionELF &Section =
static_cast<const MCSectionELF &>(*getCurrentSectionOnly());
@@ -147,17 +153,8 @@ void MCELFStreamer::ChangeSection(MCSection *Section,
if (Grp)
Asm.registerSymbol(*Grp);
- this->MCObjectStreamer::ChangeSection(Section, Subsection);
- MCContext &Ctx = getContext();
- auto *Begin = cast_or_null<MCSymbolELF>(Section->getBeginSymbol());
- if (!Begin) {
- Begin = Ctx.getOrCreateSectionSymbol(*SectionELF);
- Section->setBeginSymbol(Begin);
- }
- if (Begin->isUndefined()) {
- Asm.registerSymbol(*Begin);
- Begin->setType(ELF::STT_SECTION);
- }
+ changeSectionImpl(Section, Subsection);
+ Asm.registerSymbol(*Section->getBeginSymbol());
}
void MCELFStreamer::EmitWeakReference(MCSymbol *Alias, const MCSymbol *Symbol) {
@@ -361,13 +358,6 @@ void MCELFStreamer::EmitValueToAlignment(unsigned ByteAlignment,
ValueSize, MaxBytesToEmit);
}
-// Add a symbol for the file name of this module. They start after the
-// null symbol and don't count as normal symbol, i.e. a non-STT_FILE symbol
-// with the same name may appear.
-void MCELFStreamer::EmitFileDirective(StringRef Filename) {
- getAssembler().addFileName(Filename);
-}
-
void MCELFStreamer::EmitIdent(StringRef IdentString) {
MCSection *Comment = getAssembler().getContext().getELFSection(
".comment", ELF::SHT_PROGBITS, ELF::SHF_MERGE | ELF::SHF_STRINGS, 1, "");
@@ -630,15 +620,6 @@ void MCELFStreamer::FinishImpl() {
this->MCObjectStreamer::FinishImpl();
}
-MCStreamer *llvm::createELFStreamer(MCContext &Context, MCAsmBackend &MAB,
- raw_pwrite_stream &OS, MCCodeEmitter *CE,
- bool RelaxAll) {
- MCELFStreamer *S = new MCELFStreamer(Context, MAB, OS, CE);
- if (RelaxAll)
- S->getAssembler().setRelaxAll(true);
- return S;
-}
-
void MCELFStreamer::EmitThumbFunc(MCSymbol *Func) {
llvm_unreachable("Generic ELF doesn't support this directive");
}
@@ -647,22 +628,6 @@ void MCELFStreamer::EmitSymbolDesc(MCSymbol *Symbol, unsigned DescValue) {
llvm_unreachable("ELF doesn't support this directive");
}
-void MCELFStreamer::BeginCOFFSymbolDef(const MCSymbol *Symbol) {
- llvm_unreachable("ELF doesn't support this directive");
-}
-
-void MCELFStreamer::EmitCOFFSymbolStorageClass(int StorageClass) {
- llvm_unreachable("ELF doesn't support this directive");
-}
-
-void MCELFStreamer::EmitCOFFSymbolType(int Type) {
- llvm_unreachable("ELF doesn't support this directive");
-}
-
-void MCELFStreamer::EndCOFFSymbolDef() {
- llvm_unreachable("ELF doesn't support this directive");
-}
-
void MCELFStreamer::EmitZerofill(MCSection *Section, MCSymbol *Symbol,
uint64_t Size, unsigned ByteAlignment) {
llvm_unreachable("ELF doesn't support this directive");
@@ -672,3 +637,12 @@ void MCELFStreamer::EmitTBSSSymbol(MCSection *Section, MCSymbol *Symbol,
uint64_t Size, unsigned ByteAlignment) {
llvm_unreachable("ELF doesn't support this directive");
}
+
+MCStreamer *llvm::createELFStreamer(MCContext &Context, MCAsmBackend &MAB,
+ raw_pwrite_stream &OS, MCCodeEmitter *CE,
+ bool RelaxAll) {
+ MCELFStreamer *S = new MCELFStreamer(Context, MAB, OS, CE);
+ if (RelaxAll)
+ S->getAssembler().setRelaxAll(true);
+ return S;
+}
diff --git a/lib/MC/MCExpr.cpp b/lib/MC/MCExpr.cpp
index bcc43a54d620..8149aa27327c 100644
--- a/lib/MC/MCExpr.cpp
+++ b/lib/MC/MCExpr.cpp
@@ -7,28 +7,35 @@
//
//===----------------------------------------------------------------------===//
-#include "llvm/MC/MCExpr.h"
#include "llvm/ADT/Statistic.h"
#include "llvm/ADT/StringSwitch.h"
#include "llvm/MC/MCAsmInfo.h"
#include "llvm/MC/MCAsmLayout.h"
#include "llvm/MC/MCAssembler.h"
#include "llvm/MC/MCContext.h"
+#include "llvm/MC/MCExpr.h"
#include "llvm/MC/MCObjectWriter.h"
#include "llvm/MC/MCSymbol.h"
#include "llvm/MC/MCValue.h"
+#include "llvm/Support/Casting.h"
+#include "llvm/Support/Compiler.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/ErrorHandling.h"
#include "llvm/Support/raw_ostream.h"
+#include <cassert>
+#include <cstdint>
+
using namespace llvm;
#define DEBUG_TYPE "mcexpr"
namespace {
namespace stats {
+
STATISTIC(MCExprEvaluate, "Number of MCExpr evaluations");
-}
-}
+
+} // end namespace stats
+} // end anonymous namespace
void MCExpr::print(raw_ostream &OS, const MCAsmInfo *MAI, bool InParens) const {
switch (getKind()) {
@@ -44,7 +51,7 @@ void MCExpr::print(raw_ostream &OS, const MCAsmInfo *MAI, bool InParens) const {
// Parenthesize names that start with $ so that they don't look like
// absolute names.
bool UseParens =
- !InParens && Sym.getName().size() && Sym.getName()[0] == '$';
+ !InParens && !Sym.getName().empty() && Sym.getName()[0] == '$';
if (UseParens) {
OS << '(';
Sym.print(OS, MAI);
@@ -129,21 +136,24 @@ void MCExpr::print(raw_ostream &OS, const MCAsmInfo *MAI, bool InParens) const {
llvm_unreachable("Invalid expression kind!");
}
+#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
LLVM_DUMP_METHOD void MCExpr::dump() const {
dbgs() << *this;
dbgs() << '\n';
}
+#endif
/* *** */
const MCBinaryExpr *MCBinaryExpr::create(Opcode Opc, const MCExpr *LHS,
- const MCExpr *RHS, MCContext &Ctx) {
- return new (Ctx) MCBinaryExpr(Opc, LHS, RHS);
+ const MCExpr *RHS, MCContext &Ctx,
+ SMLoc Loc) {
+ return new (Ctx) MCBinaryExpr(Opc, LHS, RHS, Loc);
}
const MCUnaryExpr *MCUnaryExpr::create(Opcode Opc, const MCExpr *Expr,
- MCContext &Ctx) {
- return new (Ctx) MCUnaryExpr(Opc, Expr);
+ MCContext &Ctx, SMLoc Loc) {
+ return new (Ctx) MCUnaryExpr(Opc, Expr, Loc);
}
const MCConstantExpr *MCConstantExpr::create(int64_t Value, MCContext &Ctx) {
@@ -153,8 +163,8 @@ const MCConstantExpr *MCConstantExpr::create(int64_t Value, MCContext &Ctx) {
/* *** */
MCSymbolRefExpr::MCSymbolRefExpr(const MCSymbol *Symbol, VariantKind Kind,
- const MCAsmInfo *MAI)
- : MCExpr(MCExpr::SymbolRef), Kind(Kind),
+ const MCAsmInfo *MAI, SMLoc Loc)
+ : MCExpr(MCExpr::SymbolRef, Loc), Kind(Kind),
UseParensForSymbolVariant(MAI->useParensForSymbolVariant()),
HasSubsectionsViaSymbols(MAI->hasSubsectionsViaSymbols()),
Symbol(Symbol) {
@@ -163,8 +173,8 @@ MCSymbolRefExpr::MCSymbolRefExpr(const MCSymbol *Symbol, VariantKind Kind,
const MCSymbolRefExpr *MCSymbolRefExpr::create(const MCSymbol *Sym,
VariantKind Kind,
- MCContext &Ctx) {
- return new (Ctx) MCSymbolRefExpr(Sym, Kind, Ctx.getAsmInfo());
+ MCContext &Ctx, SMLoc Loc) {
+ return new (Ctx) MCSymbolRefExpr(Sym, Kind, Ctx.getAsmInfo(), Loc);
}
const MCSymbolRefExpr *MCSymbolRefExpr::create(StringRef Name, VariantKind Kind,
@@ -205,6 +215,7 @@ StringRef MCSymbolRefExpr::getVariantKindName(VariantKind Kind) {
case VK_SECREL: return "SECREL32";
case VK_SIZE: return "SIZE";
case VK_WEAKREF: return "WEAKREF";
+ case VK_X86_ABS8: return "ABS8";
case VK_ARM_NONE: return "none";
case VK_ARM_GOT_PREL: return "GOT_PREL";
case VK_ARM_TARGET1: return "target1";
@@ -275,6 +286,7 @@ StringRef MCSymbolRefExpr::getVariantKindName(VariantKind Kind) {
case VK_Hexagon_IE: return "IE";
case VK_Hexagon_IE_GOT: return "IEGOT";
case VK_WebAssembly_FUNCTION: return "FUNCTION";
+ case VK_WebAssembly_TYPEINDEX: return "TYPEINDEX";
case VK_AMDGPU_GOTPCREL32_LO: return "gotpcrel32@lo";
case VK_AMDGPU_GOTPCREL32_HI: return "gotpcrel32@hi";
case VK_AMDGPU_REL32_LO: return "rel32@lo";
@@ -314,6 +326,7 @@ MCSymbolRefExpr::getVariantKindForName(StringRef Name) {
.Case("imgrel", VK_COFF_IMGREL32)
.Case("secrel32", VK_SECREL)
.Case("size", VK_SIZE)
+ .Case("abs8", VK_X86_ABS8)
.Case("l", VK_PPC_LO)
.Case("h", VK_PPC_HI)
.Case("ha", VK_PPC_HA)
diff --git a/lib/MC/MCFragment.cpp b/lib/MC/MCFragment.cpp
index 8ff8f8aba1c1..90b44177cf5e 100644
--- a/lib/MC/MCFragment.cpp
+++ b/lib/MC/MCFragment.cpp
@@ -7,30 +7,29 @@
//
//===----------------------------------------------------------------------===//
-#include "llvm/MC/MCFragment.h"
+#include "llvm/ADT/SmallVector.h"
#include "llvm/ADT/StringExtras.h"
#include "llvm/ADT/Twine.h"
#include "llvm/MC/MCAssembler.h"
-#include "llvm/MC/MCAsmBackend.h"
-#include "llvm/MC/MCAsmInfo.h"
#include "llvm/MC/MCAsmLayout.h"
#include "llvm/MC/MCContext.h"
-#include "llvm/MC/MCDwarf.h"
#include "llvm/MC/MCExpr.h"
-#include "llvm/MC/MCFixupKindInfo.h"
+#include "llvm/MC/MCFixup.h"
+#include "llvm/MC/MCFragment.h"
#include "llvm/MC/MCSection.h"
-#include "llvm/MC/MCSectionELF.h"
#include "llvm/MC/MCSymbol.h"
#include "llvm/MC/MCValue.h"
+#include "llvm/Support/Casting.h"
+#include "llvm/Support/Compiler.h"
#include "llvm/Support/ErrorHandling.h"
-#include "llvm/Support/LEB128.h"
-#include "llvm/Support/TargetRegistry.h"
#include "llvm/Support/raw_ostream.h"
+#include <cassert>
+#include <cstdint>
+#include <utility>
+
using namespace llvm;
-MCAsmLayout::MCAsmLayout(MCAssembler &Asm)
- : Assembler(Asm), LastValidFragment()
- {
+MCAsmLayout::MCAsmLayout(MCAssembler &Asm) : Assembler(Asm) {
// Compute the section layout order. Virtual sections must go last.
for (MCSection &Sec : Asm)
if (!Sec.isVirtualSection())
@@ -145,14 +144,14 @@ const MCSymbol *MCAsmLayout::getBaseSymbol(const MCSymbol &Symbol) const {
MCValue Value;
if (!Expr->evaluateAsValue(Value, *this)) {
Assembler.getContext().reportError(
- SMLoc(), "expression could not be evaluated");
+ Expr->getLoc(), "expression could not be evaluated");
return nullptr;
}
const MCSymbolRefExpr *RefB = Value.getSymB();
if (RefB) {
Assembler.getContext().reportError(
- SMLoc(), Twine("symbol '") + RefB->getSymbol().getName() +
+ Expr->getLoc(), Twine("symbol '") + RefB->getSymbol().getName() +
"' could not be evaluated in a subtraction expression");
return nullptr;
}
@@ -164,8 +163,7 @@ const MCSymbol *MCAsmLayout::getBaseSymbol(const MCSymbol &Symbol) const {
const MCSymbol &ASym = A->getSymbol();
const MCAssembler &Asm = getAssembler();
if (ASym.isCommon()) {
- // FIXME: we should probably add a SMLoc to MCExpr.
- Asm.getContext().reportError(SMLoc(),
+ Asm.getContext().reportError(Expr->getLoc(),
"Common symbol '" + ASym.getName() +
"' cannot be used in assignment expr");
return nullptr;
@@ -234,7 +232,7 @@ uint64_t llvm::computeBundlePadding(const MCAssembler &Assembler,
void ilist_alloc_traits<MCFragment>::deleteNode(MCFragment *V) { V->destroy(); }
-MCFragment::~MCFragment() { }
+MCFragment::~MCFragment() = default;
MCFragment::MCFragment(FragmentType Kind, bool HasInstructions,
uint8_t BundlePadding, MCSection *Parent)
@@ -295,8 +293,6 @@ void MCFragment::destroy() {
}
}
-/* *** */
-
// Debugging methods
namespace llvm {
@@ -308,10 +304,11 @@ raw_ostream &operator<<(raw_ostream &OS, const MCFixup &AF) {
return OS;
}
-}
+} // end namespace llvm
+#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
LLVM_DUMP_METHOD void MCFragment::dump() {
- raw_ostream &OS = llvm::errs();
+ raw_ostream &OS = errs();
OS << "<";
switch (getKind()) {
@@ -449,7 +446,7 @@ LLVM_DUMP_METHOD void MCFragment::dump() {
}
LLVM_DUMP_METHOD void MCAssembler::dump() {
- raw_ostream &OS = llvm::errs();
+ raw_ostream &OS = errs();
OS << "<MCAssembler\n";
OS << " Sections:[\n ";
@@ -469,3 +466,4 @@ LLVM_DUMP_METHOD void MCAssembler::dump() {
}
OS << "]>\n";
}
+#endif
diff --git a/lib/MC/MCInst.cpp b/lib/MC/MCInst.cpp
index 2da8ecc4ff6a..f6d1d3cffca0 100644
--- a/lib/MC/MCInst.cpp
+++ b/lib/MC/MCInst.cpp
@@ -10,6 +10,7 @@
#include "llvm/MC/MCInst.h"
#include "llvm/MC/MCExpr.h"
#include "llvm/MC/MCInstPrinter.h"
+#include "llvm/Support/Compiler.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/raw_ostream.h"
@@ -34,10 +35,12 @@ void MCOperand::print(raw_ostream &OS) const {
OS << ">";
}
+#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
LLVM_DUMP_METHOD void MCOperand::dump() const {
print(dbgs());
dbgs() << "\n";
}
+#endif
void MCInst::print(raw_ostream &OS) const {
OS << "<MCInst " << getOpcode();
@@ -63,7 +66,9 @@ void MCInst::dump_pretty(raw_ostream &OS, const MCInstPrinter *Printer,
OS << ">";
}
+#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
LLVM_DUMP_METHOD void MCInst::dump() const {
print(dbgs());
dbgs() << "\n";
}
+#endif
diff --git a/lib/MC/MCInstPrinter.cpp b/lib/MC/MCInstPrinter.cpp
index 23afe8054840..912179095974 100644
--- a/lib/MC/MCInstPrinter.cpp
+++ b/lib/MC/MCInstPrinter.cpp
@@ -1,4 +1,4 @@
-//===-- MCInstPrinter.cpp - Convert an MCInst to target assembly syntax ---===//
+//===- MCInstPrinter.cpp - Convert an MCInst to target assembly syntax ----===//
//
// The LLVM Compiler Infrastructure
//
@@ -7,13 +7,17 @@
//
//===----------------------------------------------------------------------===//
-#include "llvm/MC/MCInstPrinter.h"
+#include "llvm/ADT/ArrayRef.h"
#include "llvm/ADT/StringRef.h"
#include "llvm/MC/MCAsmInfo.h"
+#include "llvm/MC/MCInstPrinter.h"
#include "llvm/MC/MCInstrInfo.h"
#include "llvm/Support/ErrorHandling.h"
#include "llvm/Support/Format.h"
#include "llvm/Support/raw_ostream.h"
+#include <cinttypes>
+#include <cstdint>
+
using namespace llvm;
void llvm::dumpBytes(ArrayRef<uint8_t> bytes, raw_ostream &OS) {
@@ -25,8 +29,7 @@ void llvm::dumpBytes(ArrayRef<uint8_t> bytes, raw_ostream &OS) {
}
}
-MCInstPrinter::~MCInstPrinter() {
-}
+MCInstPrinter::~MCInstPrinter() = default;
/// getOpcodeName - Return the name of the specified opcode enum (e.g.
/// "MOV32ri") or empty if we can't resolve it.
@@ -68,7 +71,7 @@ StringRef MCInstPrinter::markup(StringRef a, StringRef b) const {
// For asm-style hex (e.g. 0ffh) the first digit always has to be a number.
static bool needsLeadingZero(uint64_t Value)
{
- while(Value)
+ while (Value)
{
uint64_t digit = (Value >> 60) & 0xf;
if (digit != 0)
diff --git a/lib/MC/MCInstrAnalysis.cpp b/lib/MC/MCInstrAnalysis.cpp
index 2d8336d77ac7..566944c53548 100644
--- a/lib/MC/MCInstrAnalysis.cpp
+++ b/lib/MC/MCInstrAnalysis.cpp
@@ -1,4 +1,4 @@
-//===-- MCInstrAnalysis.cpp - InstrDesc target hooks ------------*- C++ -*-===//
+//===- MCInstrAnalysis.cpp - InstrDesc target hooks -----------------------===//
//
// The LLVM Compiler Infrastructure
//
@@ -7,7 +7,12 @@
//
//===----------------------------------------------------------------------===//
+#include "llvm/MC/MCInst.h"
#include "llvm/MC/MCInstrAnalysis.h"
+#include "llvm/MC/MCInstrDesc.h"
+#include "llvm/MC/MCInstrInfo.h"
+#include <cstdint>
+
using namespace llvm;
bool MCInstrAnalysis::evaluateBranch(const MCInst &Inst, uint64_t Addr,
diff --git a/lib/MC/MCLabel.cpp b/lib/MC/MCLabel.cpp
index b443cbbbf43e..db25a46fce18 100644
--- a/lib/MC/MCLabel.cpp
+++ b/lib/MC/MCLabel.cpp
@@ -1,4 +1,4 @@
-//===- lib/MC/MCLabel.cpp - MCLabel implementation ----------------------===//
+//===- lib/MC/MCLabel.cpp - MCLabel implementation ------------------------===//
//
// The LLVM Compiler Infrastructure
//
@@ -8,14 +8,18 @@
//===----------------------------------------------------------------------===//
#include "llvm/MC/MCLabel.h"
+#include "llvm/Support/Compiler.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/raw_ostream.h"
+
using namespace llvm;
void MCLabel::print(raw_ostream &OS) const {
OS << '"' << getInstance() << '"';
}
+#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
LLVM_DUMP_METHOD void MCLabel::dump() const {
print(dbgs());
}
+#endif
diff --git a/lib/MC/MCLinkerOptimizationHint.cpp b/lib/MC/MCLinkerOptimizationHint.cpp
index f71fc7830129..97f95418e054 100644
--- a/lib/MC/MCLinkerOptimizationHint.cpp
+++ b/lib/MC/MCLinkerOptimizationHint.cpp
@@ -1,4 +1,4 @@
-//===-- llvm/MC/MCLinkerOptimizationHint.cpp ----- LOH handling -*- C++ -*-===//
+//===- llvm/MC/MCLinkerOptimizationHint.cpp ----- LOH handling ------------===//
//
// The LLVM Compiler Infrastructure
//
@@ -9,9 +9,11 @@
#include "llvm/MC/MCLinkerOptimizationHint.h"
#include "llvm/MC/MCAsmLayout.h"
-#include "llvm/MC/MCAssembler.h"
#include "llvm/MC/MCMachObjectWriter.h"
#include "llvm/Support/LEB128.h"
+#include "llvm/Support/raw_ostream.h"
+#include <cstddef>
+#include <cstdint>
using namespace llvm;
@@ -41,14 +43,14 @@ void MCLOHDirective::emit(MachObjectWriter &ObjWriter,
uint64_t MCLOHDirective::getEmitSize(const MachObjectWriter &ObjWriter,
const MCAsmLayout &Layout) const {
class raw_counting_ostream : public raw_ostream {
- uint64_t Count;
+ uint64_t Count = 0;
void write_impl(const char *, size_t size) override { Count += size; }
uint64_t current_pos() const override { return Count; }
public:
- raw_counting_ostream() : Count(0) {}
+ raw_counting_ostream() = default;
~raw_counting_ostream() override { flush(); }
};
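
getEmitSize's local raw_counting_ostream is a neat trick: run the emitter against a stream that only counts, and you get the size without buffering a single byte. The same pattern over std::streambuf, self-contained:

    #include <cstdint>
    #include <ostream>
    #include <streambuf>

    // Hedged sketch of the counting-stream trick with the standard library.
    class counting_buf : public std::streambuf {
      uint64_t Count = 0;

      std::streamsize xsputn(const char *, std::streamsize N) override {
        Count += N;                     // count instead of storing
        return N;
      }
      int overflow(int C) override {
        ++Count;
        return C;
      }

    public:
      uint64_t count() const { return Count; }
    };
    // Usage: counting_buf B; std::ostream OS(&B); run the emitter against
    // OS; then B.count() is the size that a real emission would produce.
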
diff --git a/lib/MC/MCMachOStreamer.cpp b/lib/MC/MCMachOStreamer.cpp
index bd425bb73093..1e9ef4163256 100644
--- a/lib/MC/MCMachOStreamer.cpp
+++ b/lib/MC/MCMachOStreamer.cpp
@@ -1,4 +1,4 @@
-//===-- MCMachOStreamer.cpp - MachO Streamer ------------------------------===//
+//===- MCMachOStreamer.cpp - MachO Streamer -------------------------------===//
//
// The LLVM Compiler Infrastructure
//
@@ -7,27 +7,35 @@
//
//===----------------------------------------------------------------------===//
-#include "llvm/MC/MCStreamer.h"
#include "llvm/ADT/DenseMap.h"
+#include "llvm/ADT/SmallString.h"
#include "llvm/ADT/SmallVector.h"
+#include "llvm/ADT/StringRef.h"
+#include "llvm/ADT/Triple.h"
#include "llvm/MC/MCAsmBackend.h"
#include "llvm/MC/MCAssembler.h"
#include "llvm/MC/MCCodeEmitter.h"
#include "llvm/MC/MCContext.h"
-#include "llvm/MC/MCDwarf.h"
+#include "llvm/MC/MCDirectives.h"
#include "llvm/MC/MCExpr.h"
+#include "llvm/MC/MCFixup.h"
+#include "llvm/MC/MCFragment.h"
#include "llvm/MC/MCInst.h"
#include "llvm/MC/MCLinkerOptimizationHint.h"
#include "llvm/MC/MCObjectFileInfo.h"
#include "llvm/MC/MCObjectStreamer.h"
#include "llvm/MC/MCSection.h"
#include "llvm/MC/MCSectionMachO.h"
+#include "llvm/MC/MCStreamer.h"
+#include "llvm/MC/MCSymbol.h"
#include "llvm/MC/MCSymbolMachO.h"
#include "llvm/MC/MCValue.h"
-#include "llvm/Support/Dwarf.h"
+#include "llvm/Support/Casting.h"
#include "llvm/Support/ErrorHandling.h"
-#include "llvm/Support/TargetRegistry.h"
#include "llvm/Support/raw_ostream.h"
+#include "llvm/Support/TargetRegistry.h"
+#include <cassert>
+#include <vector>
using namespace llvm;
@@ -70,7 +78,7 @@ public:
/// @{
void ChangeSection(MCSection *Sect, const MCExpr *Subsect) override;
- void EmitLabel(MCSymbol *Symbol) override;
+ void EmitLabel(MCSymbol *Symbol, SMLoc Loc = SMLoc()) override;
void EmitAssignment(MCSymbol *Symbol, const MCExpr *Value) override;
void EmitEHSymAttributes(const MCSymbol *Symbol, MCSymbol *EHSymbol) override;
void EmitAssemblerFlag(MCAssemblerFlag Flag) override;
@@ -83,18 +91,7 @@ public:
void EmitSymbolDesc(MCSymbol *Symbol, unsigned DescValue) override;
void EmitCommonSymbol(MCSymbol *Symbol, uint64_t Size,
unsigned ByteAlignment) override;
- void BeginCOFFSymbolDef(const MCSymbol *Symbol) override {
- llvm_unreachable("macho doesn't support this directive");
- }
- void EmitCOFFSymbolStorageClass(int StorageClass) override {
- llvm_unreachable("macho doesn't support this directive");
- }
- void EmitCOFFSymbolType(int Type) override {
- llvm_unreachable("macho doesn't support this directive");
- }
- void EndCOFFSymbolDef() override {
- llvm_unreachable("macho doesn't support this directive");
- }
+
void EmitLocalCommonSymbol(MCSymbol *Symbol, uint64_t Size,
unsigned ByteAlignment) override;
void EmitZerofill(MCSection *Section, MCSymbol *Symbol = nullptr,
@@ -102,13 +99,6 @@ public:
void EmitTBSSSymbol(MCSection *Section, MCSymbol *Symbol, uint64_t Size,
unsigned ByteAlignment = 0) override;
- void EmitFileDirective(StringRef Filename) override {
- // FIXME: Just ignore the .file; it isn't important enough to fail the
- // entire assembly.
-
- // report_fatal_error("unsupported directive: '.file'");
- }
-
void EmitIdent(StringRef IdentString) override {
llvm_unreachable("macho doesn't support this directive");
}
@@ -152,7 +142,7 @@ static bool canGoAfterDWARF(const MCSectionMachO &MSec) {
void MCMachOStreamer::ChangeSection(MCSection *Section,
const MCExpr *Subsection) {
// Change the section normally.
- bool Created = MCObjectStreamer::changeSectionImpl(Section, Subsection);
+ bool Created = changeSectionImpl(Section, Subsection);
const MCSectionMachO &MSec = *cast<MCSectionMachO>(Section);
StringRef SegName = MSec.getSegmentName();
if (SegName == "__DWARF")
@@ -181,15 +171,13 @@ void MCMachOStreamer::EmitEHSymAttributes(const MCSymbol *Symbol,
EmitSymbolAttribute(EHSymbol, MCSA_PrivateExtern);
}
-void MCMachOStreamer::EmitLabel(MCSymbol *Symbol) {
- assert(Symbol->isUndefined() && "Cannot define a symbol twice!");
-
+void MCMachOStreamer::EmitLabel(MCSymbol *Symbol, SMLoc Loc) {
// We have to create a new fragment if this is an atom-defining symbol;
// fragments cannot span atoms.
if (getAssembler().isSymbolLinkerVisible(*Symbol))
insert(new MCDataFragment());
- MCObjectStreamer::EmitLabel(Symbol);
+ MCObjectStreamer::EmitLabel(Symbol, Loc);
// This causes the reference type flag to be cleared. Darwin 'as' was "trying"
// to clear the weak reference and weak definition bits too, but the
diff --git a/lib/MC/MCMachObjectTargetWriter.cpp b/lib/MC/MCMachObjectTargetWriter.cpp
index 4ffd6a78a61f..8809a3c320f8 100644
--- a/lib/MC/MCMachObjectTargetWriter.cpp
+++ b/lib/MC/MCMachObjectTargetWriter.cpp
@@ -1,4 +1,4 @@
-//===-- MCMachObjectTargetWriter.cpp - Mach-O Target Writer Subclass ------===//
+//===- MCMachObjectTargetWriter.cpp - Mach-O Target Writer Subclass -------===//
//
// The LLVM Compiler Infrastructure
//
@@ -16,4 +16,4 @@ MCMachObjectTargetWriter::MCMachObjectTargetWriter(bool Is64Bit_,
uint32_t CPUSubtype_)
: Is64Bit(Is64Bit_), CPUType(CPUType_), CPUSubtype(CPUSubtype_) {}
-MCMachObjectTargetWriter::~MCMachObjectTargetWriter() {}
+MCMachObjectTargetWriter::~MCMachObjectTargetWriter() = default;
diff --git a/lib/MC/MCNullStreamer.cpp b/lib/MC/MCNullStreamer.cpp
index eb2d91254b34..d156f5d05a31 100644
--- a/lib/MC/MCNullStreamer.cpp
+++ b/lib/MC/MCNullStreamer.cpp
@@ -34,6 +34,10 @@ namespace {
void EmitZerofill(MCSection *Section, MCSymbol *Symbol = nullptr,
uint64_t Size = 0, unsigned ByteAlignment = 0) override {}
void EmitGPRel32Value(const MCExpr *Value) override {}
+ void BeginCOFFSymbolDef(const MCSymbol *Symbol) override {}
+ void EmitCOFFSymbolStorageClass(int StorageClass) override {}
+ void EmitCOFFSymbolType(int Type) override {}
+ void EndCOFFSymbolDef() override {}
};
}
diff --git a/lib/MC/MCObjectFileInfo.cpp b/lib/MC/MCObjectFileInfo.cpp
index 8fd71f62e4e5..9f94264684f9 100644
--- a/lib/MC/MCObjectFileInfo.cpp
+++ b/lib/MC/MCObjectFileInfo.cpp
@@ -16,7 +16,9 @@
#include "llvm/MC/MCSectionCOFF.h"
#include "llvm/MC/MCSectionELF.h"
#include "llvm/MC/MCSectionMachO.h"
+#include "llvm/MC/MCSectionWasm.h"
#include "llvm/Support/COFF.h"
+#include "llvm/Support/ELF.h"
using namespace llvm;
@@ -505,68 +507,75 @@ void MCObjectFileInfo::initELFMCObjectFileInfo(const Triple &T) {
COFFDebugSymbolsSection = nullptr;
COFFDebugTypesSection = nullptr;
+ unsigned DebugSecType = ELF::SHT_PROGBITS;
+
+  // MIPS .debug_* sections should have the SHT_MIPS_DWARF section type to
+  // distinguish sections containing DWARF from those containing the obsolete
+  // ECOFF debug format, which are marked SHT_PROGBITS.
+ if (T.getArch() == Triple::mips || T.getArch() == Triple::mipsel ||
+ T.getArch() == Triple::mips64 || T.getArch() == Triple::mips64el)
+ DebugSecType = ELF::SHT_MIPS_DWARF;
+
// Debug Info Sections.
- DwarfAbbrevSection = Ctx->getELFSection(".debug_abbrev", ELF::SHT_PROGBITS, 0,
- "section_abbrev");
- DwarfInfoSection =
- Ctx->getELFSection(".debug_info", ELF::SHT_PROGBITS, 0, "section_info");
- DwarfLineSection = Ctx->getELFSection(".debug_line", ELF::SHT_PROGBITS, 0);
- DwarfFrameSection = Ctx->getELFSection(".debug_frame", ELF::SHT_PROGBITS, 0);
+ DwarfAbbrevSection =
+ Ctx->getELFSection(".debug_abbrev", DebugSecType, 0);
+ DwarfInfoSection = Ctx->getELFSection(".debug_info", DebugSecType, 0);
+ DwarfLineSection = Ctx->getELFSection(".debug_line", DebugSecType, 0);
+ DwarfFrameSection = Ctx->getELFSection(".debug_frame", DebugSecType, 0);
DwarfPubNamesSection =
- Ctx->getELFSection(".debug_pubnames", ELF::SHT_PROGBITS, 0);
+ Ctx->getELFSection(".debug_pubnames", DebugSecType, 0);
DwarfPubTypesSection =
- Ctx->getELFSection(".debug_pubtypes", ELF::SHT_PROGBITS, 0);
+ Ctx->getELFSection(".debug_pubtypes", DebugSecType, 0);
DwarfGnuPubNamesSection =
- Ctx->getELFSection(".debug_gnu_pubnames", ELF::SHT_PROGBITS, 0);
+ Ctx->getELFSection(".debug_gnu_pubnames", DebugSecType, 0);
DwarfGnuPubTypesSection =
- Ctx->getELFSection(".debug_gnu_pubtypes", ELF::SHT_PROGBITS, 0);
+ Ctx->getELFSection(".debug_gnu_pubtypes", DebugSecType, 0);
DwarfStrSection =
- Ctx->getELFSection(".debug_str", ELF::SHT_PROGBITS,
+ Ctx->getELFSection(".debug_str", DebugSecType,
ELF::SHF_MERGE | ELF::SHF_STRINGS, 1, "");
- DwarfLocSection = Ctx->getELFSection(".debug_loc", ELF::SHT_PROGBITS, 0);
+ DwarfLocSection = Ctx->getELFSection(".debug_loc", DebugSecType, 0);
DwarfARangesSection =
- Ctx->getELFSection(".debug_aranges", ELF::SHT_PROGBITS, 0);
+ Ctx->getELFSection(".debug_aranges", DebugSecType, 0);
DwarfRangesSection =
- Ctx->getELFSection(".debug_ranges", ELF::SHT_PROGBITS, 0, "debug_range");
- DwarfMacinfoSection = Ctx->getELFSection(".debug_macinfo", ELF::SHT_PROGBITS,
- 0, "debug_macinfo");
+ Ctx->getELFSection(".debug_ranges", DebugSecType, 0);
+ DwarfMacinfoSection =
+ Ctx->getELFSection(".debug_macinfo", DebugSecType, 0);
// DWARF5 Experimental Debug Info
// Accelerator Tables
DwarfAccelNamesSection =
- Ctx->getELFSection(".apple_names", ELF::SHT_PROGBITS, 0, "names_begin");
+ Ctx->getELFSection(".apple_names", ELF::SHT_PROGBITS, 0);
DwarfAccelObjCSection =
- Ctx->getELFSection(".apple_objc", ELF::SHT_PROGBITS, 0, "objc_begin");
- DwarfAccelNamespaceSection = Ctx->getELFSection(
- ".apple_namespaces", ELF::SHT_PROGBITS, 0, "namespac_begin");
+ Ctx->getELFSection(".apple_objc", ELF::SHT_PROGBITS, 0);
+ DwarfAccelNamespaceSection =
+ Ctx->getELFSection(".apple_namespaces", ELF::SHT_PROGBITS, 0);
DwarfAccelTypesSection =
- Ctx->getELFSection(".apple_types", ELF::SHT_PROGBITS, 0, "types_begin");
+ Ctx->getELFSection(".apple_types", ELF::SHT_PROGBITS, 0);
// Fission Sections
DwarfInfoDWOSection =
- Ctx->getELFSection(".debug_info.dwo", ELF::SHT_PROGBITS, 0);
+ Ctx->getELFSection(".debug_info.dwo", DebugSecType, 0);
DwarfTypesDWOSection =
- Ctx->getELFSection(".debug_types.dwo", ELF::SHT_PROGBITS, 0);
+ Ctx->getELFSection(".debug_types.dwo", DebugSecType, 0);
DwarfAbbrevDWOSection =
- Ctx->getELFSection(".debug_abbrev.dwo", ELF::SHT_PROGBITS, 0);
+ Ctx->getELFSection(".debug_abbrev.dwo", DebugSecType, 0);
DwarfStrDWOSection =
- Ctx->getELFSection(".debug_str.dwo", ELF::SHT_PROGBITS,
+ Ctx->getELFSection(".debug_str.dwo", DebugSecType,
ELF::SHF_MERGE | ELF::SHF_STRINGS, 1, "");
DwarfLineDWOSection =
- Ctx->getELFSection(".debug_line.dwo", ELF::SHT_PROGBITS, 0);
+ Ctx->getELFSection(".debug_line.dwo", DebugSecType, 0);
DwarfLocDWOSection =
- Ctx->getELFSection(".debug_loc.dwo", ELF::SHT_PROGBITS, 0, "skel_loc");
+ Ctx->getELFSection(".debug_loc.dwo", DebugSecType, 0);
DwarfStrOffDWOSection =
- Ctx->getELFSection(".debug_str_offsets.dwo", ELF::SHT_PROGBITS, 0);
- DwarfAddrSection =
- Ctx->getELFSection(".debug_addr", ELF::SHT_PROGBITS, 0, "addr_sec");
+ Ctx->getELFSection(".debug_str_offsets.dwo", DebugSecType, 0);
+ DwarfAddrSection = Ctx->getELFSection(".debug_addr", DebugSecType, 0);
// DWP Sections
DwarfCUIndexSection =
- Ctx->getELFSection(".debug_cu_index", ELF::SHT_PROGBITS, 0);
+ Ctx->getELFSection(".debug_cu_index", DebugSecType, 0);
DwarfTUIndexSection =
- Ctx->getELFSection(".debug_tu_index", ELF::SHT_PROGBITS, 0);
+ Ctx->getELFSection(".debug_tu_index", DebugSecType, 0);
StackMapSection =
Ctx->getELFSection(".llvm_stackmaps", ELF::SHT_PROGBITS, ELF::SHF_ALLOC);
@@ -799,6 +808,30 @@ void MCObjectFileInfo::initCOFFMCObjectFileInfo(const Triple &T) {
SectionKind::getReadOnly());
}
+void MCObjectFileInfo::initWasmMCObjectFileInfo(const Triple &T) {
+ // TODO: Set the section types and flags.
+ TextSection = Ctx->getWasmSection(".text", 0, 0);
+ DataSection = Ctx->getWasmSection(".data", 0, 0);
+
+ // TODO: Set the section types and flags.
+ DwarfLineSection = Ctx->getWasmSection(".debug_line", 0, 0);
+ DwarfStrSection = Ctx->getWasmSection(".debug_str", 0, 0);
+ DwarfLocSection = Ctx->getWasmSection(".debug_loc", 0, 0);
+  DwarfAbbrevSection =
+      Ctx->getWasmSection(".debug_abbrev", 0, 0, "section_abbrev");
+  DwarfARangesSection = Ctx->getWasmSection(".debug_aranges", 0, 0);
+  DwarfRangesSection =
+      Ctx->getWasmSection(".debug_ranges", 0, 0, "debug_range");
+  DwarfMacinfoSection =
+      Ctx->getWasmSection(".debug_macinfo", 0, 0, "debug_macinfo");
+ DwarfAddrSection = Ctx->getWasmSection(".debug_addr", 0, 0);
+ DwarfCUIndexSection = Ctx->getWasmSection(".debug_cu_index", 0, 0);
+ DwarfTUIndexSection = Ctx->getWasmSection(".debug_tu_index", 0, 0);
+ DwarfInfoSection = Ctx->getWasmSection(".debug_info", 0, 0, "section_info");
+ DwarfFrameSection = Ctx->getWasmSection(".debug_frame", 0, 0);
+ DwarfPubNamesSection = Ctx->getWasmSection(".debug_pubnames", 0, 0);
+ DwarfPubTypesSection = Ctx->getWasmSection(".debug_pubtypes", 0, 0);
+
+ // TODO: Define more sections.
+}
+
void MCObjectFileInfo::InitMCObjectFileInfo(const Triple &TheTriple, bool PIC,
CodeModel::Model cm,
MCContext &ctx) {
@@ -843,6 +876,10 @@ void MCObjectFileInfo::InitMCObjectFileInfo(const Triple &TheTriple, bool PIC,
Env = IsELF;
initELFMCObjectFileInfo(TT);
break;
+ case Triple::Wasm:
+ Env = IsWasm;
+ initWasmMCObjectFileInfo(TT);
+ break;
case Triple::UnknownObjectFormat:
report_fatal_error("Cannot initialize MC for unknown object file format.");
break;
diff --git a/lib/MC/MCObjectStreamer.cpp b/lib/MC/MCObjectStreamer.cpp
index cae5c1f8d156..f7f2253256eb 100644
--- a/lib/MC/MCObjectStreamer.cpp
+++ b/lib/MC/MCObjectStreamer.cpp
@@ -153,8 +153,8 @@ void MCObjectStreamer::EmitCFIEndProcImpl(MCDwarfFrameInfo &Frame) {
EmitLabel(Frame.End);
}
-void MCObjectStreamer::EmitLabel(MCSymbol *Symbol) {
- MCStreamer::EmitLabel(Symbol);
+void MCObjectStreamer::EmitLabel(MCSymbol *Symbol, SMLoc Loc) {
+ MCStreamer::EmitLabel(Symbol, Loc);
getAssembler().registerSymbol(*Symbol);
@@ -171,6 +171,16 @@ void MCObjectStreamer::EmitLabel(MCSymbol *Symbol) {
}
}
+void MCObjectStreamer::EmitLabel(MCSymbol *Symbol, SMLoc Loc, MCFragment *F) {
+ MCStreamer::EmitLabel(Symbol, Loc);
+ getAssembler().registerSymbol(*Symbol);
+ auto *DF = dyn_cast_or_null<MCDataFragment>(F);
+ if (DF)
+ Symbol->setFragment(F);
+ else
+ PendingLabels.push_back(Symbol);
+}
+
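
The new overload above binds a label to a caller-supplied fragment when that fragment is a data fragment, and otherwise queues it as a pending label to be attached to the next data fragment. The placement policy in miniature (stand-in types):

    #include <vector>

    struct Fragment { bool IsData = false; };
    struct Symbol { Fragment *Frag = nullptr; };

    // Hedged sketch of the placement policy in the new EmitLabel overload.
    static void placeLabel(Symbol &S, Fragment *F,
                           std::vector<Symbol *> &PendingLabels) {
      if (F && F->IsData)
        S.Frag = F;                     // attach directly to the given fragment
      else
        PendingLabels.push_back(&S);    // resolve at the next data fragment
    }
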
void MCObjectStreamer::EmitULEB128Value(const MCExpr *Value) {
int64_t IntValue;
if (Value->evaluateAsAbsolute(IntValue, getAssembler())) {
@@ -203,6 +213,7 @@ bool MCObjectStreamer::changeSectionImpl(MCSection *Section,
const MCExpr *Subsection) {
assert(Section && "Cannot switch to a null section!");
flushPendingLabels(nullptr);
+ getContext().clearDwarfLocSeen();
bool Created = getAssembler().registerSection(*Section);
@@ -227,7 +238,7 @@ bool MCObjectStreamer::mayHaveInstructions(MCSection &Sec) const {
}
void MCObjectStreamer::EmitInstruction(const MCInst &Inst,
- const MCSubtargetInfo &STI) {
+ const MCSubtargetInfo &STI, bool) {
MCStreamer::EmitInstruction(Inst, STI);
MCSection *Sec = getCurrentSectionOnly();
@@ -490,8 +501,8 @@ void MCObjectStreamer::EmitGPRel32Value(const MCExpr *Value) {
MCDataFragment *DF = getOrCreateDataFragment();
flushPendingLabels(DF, DF->getContents().size());
- DF->getFixups().push_back(MCFixup::create(DF->getContents().size(),
- Value, FK_GPRel_4));
+ DF->getFixups().push_back(
+ MCFixup::create(DF->getContents().size(), Value, FK_GPRel_4));
DF->getContents().resize(DF->getContents().size() + 4, 0);
}
@@ -500,8 +511,8 @@ void MCObjectStreamer::EmitGPRel64Value(const MCExpr *Value) {
MCDataFragment *DF = getOrCreateDataFragment();
flushPendingLabels(DF, DF->getContents().size());
- DF->getFixups().push_back(MCFixup::create(DF->getContents().size(),
- Value, FK_GPRel_4));
+ DF->getFixups().push_back(
+ MCFixup::create(DF->getContents().size(), Value, FK_GPRel_4));
DF->getContents().resize(DF->getContents().size() + 8, 0);
}
@@ -572,6 +583,10 @@ void MCObjectStreamer::emitFill(const MCExpr &NumValues, int64_t Size,
MCStreamer::emitFill(IntNumValues, Size, Expr);
}
+void MCObjectStreamer::EmitFileDirective(StringRef Filename) {
+ getAssembler().addFileName(Filename);
+}
+
void MCObjectStreamer::FinishImpl() {
// If we are generating dwarf for assembly source files, dump out the sections.
if (getContext().getGenDwarfForAssembly())
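
The fragment-aware EmitLabel overload above is the interesting piece of this file's changes. Reduced to its core (illustrative only; emitLabelSketch and the plain vector stand in for the streamer's internals, and flushPendingLabels is the consumer named in the surrounding code):

#include "llvm/MC/MCFragment.h"
#include "llvm/MC/MCSymbol.h"
#include "llvm/Support/Casting.h"
#include <vector>

void emitLabelSketch(llvm::MCSymbol *Sym, llvm::MCFragment *F,
                     std::vector<llvm::MCSymbol *> &PendingLabels) {
  // If the caller already knows the data fragment the label lands in,
  // bind the symbol immediately; otherwise park it on PendingLabels so
  // flushPendingLabels() can attach it to the next fragment created.
  if (auto *DF = llvm::dyn_cast_or_null<llvm::MCDataFragment>(F))
    Sym->setFragment(DF);
  else
    PendingLabels.push_back(Sym);
}
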
diff --git a/lib/MC/MCObjectWriter.cpp b/lib/MC/MCObjectWriter.cpp
index e84f74ae81d6..478b4e84e74a 100644
--- a/lib/MC/MCObjectWriter.cpp
+++ b/lib/MC/MCObjectWriter.cpp
@@ -8,14 +8,14 @@
//===----------------------------------------------------------------------===//
#include "llvm/MC/MCAssembler.h"
+#include "llvm/MC/MCFragment.h"
#include "llvm/MC/MCExpr.h"
#include "llvm/MC/MCObjectWriter.h"
#include "llvm/MC/MCSymbol.h"
using namespace llvm;
-MCObjectWriter::~MCObjectWriter() {
-}
+MCObjectWriter::~MCObjectWriter() = default;
bool MCObjectWriter::isSymbolRefDifferenceFullyResolved(
const MCAssembler &Asm, const MCSymbolRefExpr *A, const MCSymbolRefExpr *B,
@@ -51,5 +51,3 @@ bool MCObjectWriter::isSymbolRefDifferenceFullyResolvedImpl(
// On ELF and COFF A - B is absolute if A and B are in the same section.
return &SecA == &SecB;
}
-
-bool MCObjectWriter::isWeak(const MCSymbol &) const { return false; }
diff --git a/lib/MC/MCParser/AsmLexer.cpp b/lib/MC/MCParser/AsmLexer.cpp
index 87ecf9e0227f..38dadfe62135 100644
--- a/lib/MC/MCParser/AsmLexer.cpp
+++ b/lib/MC/MCParser/AsmLexer.cpp
@@ -11,12 +11,12 @@
//
//===----------------------------------------------------------------------===//
-#include "llvm/MC/MCParser/AsmLexer.h"
#include "llvm/ADT/APInt.h"
#include "llvm/ADT/ArrayRef.h"
#include "llvm/ADT/StringRef.h"
#include "llvm/ADT/StringSwitch.h"
#include "llvm/MC/MCAsmInfo.h"
+#include "llvm/MC/MCParser/AsmLexer.h"
#include "llvm/MC/MCParser/MCAsmLexer.h"
#include "llvm/Support/SMLoc.h"
#include "llvm/Support/SaveAndRestore.h"
@@ -30,15 +30,11 @@
using namespace llvm;
-AsmLexer::AsmLexer(const MCAsmInfo &MAI)
- : MAI(MAI), CurPtr(nullptr), IsAtStartOfLine(true),
- IsAtStartOfStatement(true), IsParsingMSInlineAsm(false),
- IsPeeking(false) {
+AsmLexer::AsmLexer(const MCAsmInfo &MAI) : MAI(MAI) {
AllowAtInIdentifier = !StringRef(MAI.getCommentString()).startswith("@");
}
-AsmLexer::~AsmLexer() {
-}
+AsmLexer::~AsmLexer() = default;
void AsmLexer::setBuffer(StringRef Buf, const char *ptr) {
CurBuf = Buf;
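
The constructor shrinkage here follows the C++11 pattern used throughout this patch: non-static data member initializers carry the defaults, so the remaining constructor bodies only touch members that need real work. In isolation (a sketch; LexerBits is an invented name):

struct LexerBits {
  const char *CurPtr = nullptr;  // was CurPtr(nullptr) in the init list
  bool IsAtStartOfLine = true;   // was IsAtStartOfLine(true)
  bool IsAtStartOfStatement = true;
  bool IsPeeking = false;
};
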
diff --git a/lib/MC/MCParser/AsmParser.cpp b/lib/MC/MCParser/AsmParser.cpp
index da54155b3b9d..e65ce9f0b936 100644
--- a/lib/MC/MCParser/AsmParser.cpp
+++ b/lib/MC/MCParser/AsmParser.cpp
@@ -35,6 +35,7 @@
#include "llvm/MC/MCParser/AsmLexer.h"
#include "llvm/MC/MCParser/MCAsmLexer.h"
#include "llvm/MC/MCParser/MCAsmParser.h"
+#include "llvm/MC/MCParser/MCAsmParserExtension.h"
#include "llvm/MC/MCParser/MCAsmParserUtils.h"
#include "llvm/MC/MCParser/MCParsedAsmOperand.h"
#include "llvm/MC/MCParser/MCTargetAsmParser.h"
@@ -42,6 +43,7 @@
#include "llvm/MC/MCSection.h"
#include "llvm/MC/MCStreamer.h"
#include "llvm/MC/MCSymbol.h"
+#include "llvm/MC/MCTargetOptions.h"
#include "llvm/MC/MCValue.h"
#include "llvm/Support/Casting.h"
#include "llvm/Support/CommandLine.h"
@@ -55,6 +57,7 @@
#include <algorithm>
#include <cassert>
#include <cctype>
+#include <climits>
#include <cstddef>
#include <cstdint>
#include <deque>
@@ -67,7 +70,7 @@
using namespace llvm;
-MCAsmParserSemaCallback::~MCAsmParserSemaCallback() {}
+MCAsmParserSemaCallback::~MCAsmParserSemaCallback() = default;
static cl::opt<unsigned> AsmMacroMaxNestingDepth(
"asm-macro-max-nesting-depth", cl::init(20), cl::Hidden,
@@ -82,10 +85,10 @@ typedef std::vector<MCAsmMacroArgument> MCAsmMacroArguments;
struct MCAsmMacroParameter {
StringRef Name;
MCAsmMacroArgument Value;
- bool Required;
- bool Vararg;
+ bool Required = false;
+ bool Vararg = false;
- MCAsmMacroParameter() : Required(false), Vararg(false) {}
+ MCAsmMacroParameter() = default;
};
typedef std::vector<MCAsmMacroParameter> MCAsmMacroParameters;
@@ -124,23 +127,20 @@ struct ParseStatementInfo {
SmallVector<std::unique_ptr<MCParsedAsmOperand>, 8> ParsedOperands;
/// \brief The opcode from the last parsed instruction.
- unsigned Opcode;
+ unsigned Opcode = ~0U;
/// \brief Was there an error parsing the inline assembly?
- bool ParseError;
+ bool ParseError = false;
- SmallVectorImpl<AsmRewrite> *AsmRewrites;
+ SmallVectorImpl<AsmRewrite> *AsmRewrites = nullptr;
- ParseStatementInfo() : Opcode(~0U), ParseError(false), AsmRewrites(nullptr) {}
+ ParseStatementInfo() = default;
ParseStatementInfo(SmallVectorImpl<AsmRewrite> *rewrites)
- : Opcode(~0), ParseError(false), AsmRewrites(rewrites) {}
+ : AsmRewrites(rewrites) {}
};
/// \brief The concrete assembly parser instance.
class AsmParser : public MCAsmParser {
- AsmParser(const AsmParser &) = delete;
- void operator=(const AsmParser &) = delete;
-
private:
AsmLexer Lexer;
MCContext &Ctx;
@@ -199,17 +199,19 @@ private:
unsigned LastQueryLine;
/// AssemblerDialect. ~0U means unset value and use value provided by MAI.
- unsigned AssemblerDialect;
+ unsigned AssemblerDialect = ~0U;
/// \brief is Darwin compatibility enabled?
- bool IsDarwin;
+ bool IsDarwin = false;
/// \brief Are we parsing ms-style inline assembly?
- bool ParsingInlineAsm;
+ bool ParsingInlineAsm = false;
public:
AsmParser(SourceMgr &SM, MCContext &Ctx, MCStreamer &Out,
- const MCAsmInfo &MAI);
+ const MCAsmInfo &MAI, unsigned CB);
+ AsmParser(const AsmParser &) = delete;
+ AsmParser &operator=(const AsmParser &) = delete;
~AsmParser() override;
bool Run(bool NoInitialTextSection, bool NoFinalize = false) override;
@@ -223,7 +225,6 @@ public:
DirectiveKindMap[Directive] = DirectiveKindMap[Alias];
}
-public:
/// @name MCAsmParser Interface
/// {
@@ -258,7 +259,7 @@ public:
bool parseMSInlineAsm(void *AsmLoc, std::string &AsmString,
unsigned &NumOutputs, unsigned &NumInputs,
- SmallVectorImpl<std::pair<void *,bool> > &OpDecls,
+ SmallVectorImpl<std::pair<void *,bool>> &OpDecls,
SmallVectorImpl<std::string> &Constraints,
SmallVectorImpl<std::string> &Clobbers,
const MCInstrInfo *MII, const MCInstPrinter *IP,
@@ -572,11 +573,9 @@ extern MCAsmParserExtension *createCOFFAsmParser();
enum { DEFAULT_ADDRSPACE = 0 };
AsmParser::AsmParser(SourceMgr &SM, MCContext &Ctx, MCStreamer &Out,
- const MCAsmInfo &MAI)
+ const MCAsmInfo &MAI, unsigned CB = 0)
: Lexer(MAI), Ctx(Ctx), Out(Out), MAI(MAI), SrcMgr(SM),
- PlatformParser(nullptr), CurBuffer(SM.getMainFileID()),
- MacrosEnabledFlag(true), CppHashInfo(), AssemblerDialect(~0U),
- IsDarwin(false), ParsingInlineAsm(false) {
+ CurBuffer(CB ? CB : SM.getMainFileID()), MacrosEnabledFlag(true) {
HadError = false;
// Save the old handler.
SavedDiagHandler = SrcMgr.getDiagHandler();
@@ -597,6 +596,9 @@ AsmParser::AsmParser(SourceMgr &SM, MCContext &Ctx, MCStreamer &Out,
case MCObjectFileInfo::IsELF:
PlatformParser.reset(createELFAsmParser());
break;
+ case MCObjectFileInfo::IsWasm:
+ llvm_unreachable("Wasm parsing not supported yet");
+ break;
}
PlatformParser->Initialize(*this);
@@ -608,6 +610,10 @@ AsmParser::AsmParser(SourceMgr &SM, MCContext &Ctx, MCStreamer &Out,
AsmParser::~AsmParser() {
assert((HadError || ActiveMacros.empty()) &&
"Unexpected active macro instantiation!");
+
+ // Restore the saved diagnostics handler and context for use during
+ // finalization.
+ SrcMgr.setDiagHandler(SavedDiagHandler, SavedDiagContext);
}
void AsmParser::printMacroInstantiations() {
@@ -918,7 +924,7 @@ bool AsmParser::parsePrimaryExpr(const MCExpr *&Res, SMLoc &EndLoc) {
Lex(); // Eat the operator.
if (parsePrimaryExpr(Res, EndLoc))
return true;
- Res = MCUnaryExpr::createLNot(Res, getContext());
+ Res = MCUnaryExpr::createLNot(Res, getContext(), FirstTokenLoc);
return false;
case AsmToken::Dollar:
case AsmToken::At:
@@ -979,7 +985,7 @@ bool AsmParser::parsePrimaryExpr(const MCExpr *&Res, SMLoc &EndLoc) {
MCSymbolRefExpr::VariantKind Variant = MCSymbolRefExpr::VK_None;
// Lookup the symbol variant if used.
- if (Split.second.size()) {
+ if (!Split.second.empty()) {
Variant = MCSymbolRefExpr::getVariantKindForName(Split.second);
if (Variant != MCSymbolRefExpr::VK_Invalid) {
SymbolName = Split.first;
@@ -1005,7 +1011,7 @@ bool AsmParser::parsePrimaryExpr(const MCExpr *&Res, SMLoc &EndLoc) {
}
// Otherwise create a symbol ref.
- Res = MCSymbolRefExpr::create(Sym, Variant, getContext());
+ Res = MCSymbolRefExpr::create(Sym, Variant, getContext(), FirstTokenLoc);
return false;
}
case AsmToken::BigNum:
@@ -1071,19 +1077,19 @@ bool AsmParser::parsePrimaryExpr(const MCExpr *&Res, SMLoc &EndLoc) {
Lex(); // Eat the operator.
if (parsePrimaryExpr(Res, EndLoc))
return true;
- Res = MCUnaryExpr::createMinus(Res, getContext());
+ Res = MCUnaryExpr::createMinus(Res, getContext(), FirstTokenLoc);
return false;
case AsmToken::Plus:
Lex(); // Eat the operator.
if (parsePrimaryExpr(Res, EndLoc))
return true;
- Res = MCUnaryExpr::createPlus(Res, getContext());
+ Res = MCUnaryExpr::createPlus(Res, getContext(), FirstTokenLoc);
return false;
case AsmToken::Tilde:
Lex(); // Eat the operator.
if (parsePrimaryExpr(Res, EndLoc))
return true;
- Res = MCUnaryExpr::createNot(Res, getContext());
+ Res = MCUnaryExpr::createNot(Res, getContext(), FirstTokenLoc);
return false;
// MIPS unary expression operators. The lexer won't generate these tokens if
// MCAsmInfo::HasMipsExpressions is false for the target.
@@ -1436,6 +1442,7 @@ unsigned AsmParser::getBinOpPrecedence(AsmToken::TokenKind K,
/// Res contains the LHS of the expression on input.
bool AsmParser::parseBinOpRHS(unsigned Precedence, const MCExpr *&Res,
SMLoc &EndLoc) {
+ SMLoc StartLoc = Lexer.getLoc();
while (true) {
MCBinaryExpr::Opcode Kind = MCBinaryExpr::Add;
unsigned TokPrec = getBinOpPrecedence(Lexer.getKind(), Kind);
@@ -1460,7 +1467,7 @@ bool AsmParser::parseBinOpRHS(unsigned Precedence, const MCExpr *&Res,
return true;
// Merge LHS and RHS according to operator.
- Res = MCBinaryExpr::create(Kind, Res, RHS, getContext());
+ Res = MCBinaryExpr::create(Kind, Res, RHS, getContext(), StartLoc);
}
}
@@ -1617,7 +1624,7 @@ bool AsmParser::parseStatement(ParseStatementInfo &Info,
if (ParsingInlineAsm && SI) {
StringRef RewrittenLabel =
SI->LookupInlineAsmLabel(IDVal, getSourceManager(), IDLoc, true);
- assert(RewrittenLabel.size() &&
+ assert(!RewrittenLabel.empty() &&
"We should have an internal name here.");
Info.AsmRewrites->emplace_back(AOK_Label, IDLoc, IDVal.size(),
RewrittenLabel);
@@ -1626,12 +1633,6 @@ bool AsmParser::parseStatement(ParseStatementInfo &Info,
Sym = getContext().getOrCreateSymbol(IDVal);
} else
Sym = Ctx.createDirectionalLocalSymbol(LocalLabelVal);
-
- Sym->redefineIfPossible();
-
- if (!Sym->isUndefined() || Sym->isVariable())
- return Error(IDLoc, "invalid symbol redefinition");
-
// End of Labels should be treated as end of line for lexing
// purposes but that information is not available to the Lexer, which
// does not understand Labels. This may cause us to see a Hash
@@ -1650,7 +1651,7 @@ bool AsmParser::parseStatement(ParseStatementInfo &Info,
// Emit the label.
if (!ParsingInlineAsm)
- Out.EmitLabel(Sym);
+ Out.EmitLabel(Sym, IDLoc);
// If we are generating dwarf for assembly source files then gather the
// info to make a dwarf label entry for this label if needed.
@@ -1979,7 +1980,7 @@ bool AsmParser::parseStatement(ParseStatementInfo &Info,
if (ParsingInlineAsm && (IDVal == "align" || IDVal == "ALIGN"))
return parseDirectiveMSAlign(IDLoc, Info);
- if (ParsingInlineAsm && (IDVal == "even"))
+ if (ParsingInlineAsm && (IDVal == "even" || IDVal == "EVEN"))
Info.AsmRewrites->emplace_back(AOK_EVEN, IDLoc, 4);
if (checkForValidSection())
return true;
@@ -2025,7 +2026,7 @@ bool AsmParser::parseStatement(ParseStatementInfo &Info,
// If we previously parsed a cpp hash file line comment then make sure the
// current Dwarf File is for the CppHashFilename if not then emit the
// Dwarf File table for it and adjust the line number for the .loc.
- if (CppHashInfo.Filename.size()) {
+ if (!CppHashInfo.Filename.empty()) {
unsigned FileNumber = getStreamer().EmitDwarfFileDirective(
0, StringRef(), CppHashInfo.Filename);
getContext().setGenDwarfFileNumber(FileNumber);
@@ -3873,6 +3874,12 @@ bool AsmParser::parseDirectiveMacro(SMLoc DirectiveLoc) {
if (parseIdentifier(Parameter.Name))
return TokError("expected identifier in '.macro' directive");
+ // Emit an error if two (or more) named parameters share the same name.
+ for (const MCAsmMacroParameter &CurrParam : Parameters)
+ if (CurrParam.Name.equals(Parameter.Name))
+ return TokError("macro '" + Name + "' has multiple parameters"
+ " named '" + Parameter.Name + "'");
+
if (Lexer.is(AsmToken::Colon)) {
Lex(); // consume ':'
@@ -4191,7 +4198,6 @@ bool AsmParser::parseDirectiveBundleUnlock() {
/// parseDirectiveSpace
/// ::= (.skip | .space) expression [ , expression ]
bool AsmParser::parseDirectiveSpace(StringRef IDVal) {
-
SMLoc NumBytesLoc = Lexer.getLoc();
const MCExpr *NumBytes;
if (checkForValidSection() || parseExpression(NumBytes))
@@ -4287,7 +4293,6 @@ bool AsmParser::parseDirectiveRealDCB(StringRef IDVal, const fltSemantics &Seman
/// parseDirectiveDS
/// ::= .ds.{b, d, l, p, s, w, x} expression
bool AsmParser::parseDirectiveDS(StringRef IDVal, unsigned Size) {
-
SMLoc NumValuesLoc = Lexer.getLoc();
int64_t NumValues;
if (checkForValidSection() || parseAbsoluteExpression(NumValues))
@@ -4416,6 +4421,7 @@ bool AsmParser::parseDirectiveComm(bool IsLocal) {
return Error(Pow2AlignmentLoc, "invalid '.comm' or '.lcomm' directive "
"alignment, can't be less than zero");
+ Sym->redefineIfPossible();
if (!Sym->isUndefined())
return Error(IDLoc, "invalid symbol redefinition");
@@ -5208,7 +5214,7 @@ static int rewritesSort(const AsmRewrite *AsmRewriteA,
bool AsmParser::parseMSInlineAsm(
void *AsmLoc, std::string &AsmString, unsigned &NumOutputs,
- unsigned &NumInputs, SmallVectorImpl<std::pair<void *, bool> > &OpDecls,
+ unsigned &NumInputs, SmallVectorImpl<std::pair<void *, bool>> &OpDecls,
SmallVectorImpl<std::string> &Constraints,
SmallVectorImpl<std::string> &Clobbers, const MCInstrInfo *MII,
const MCInstPrinter *IP, MCAsmParserSemaCallback &SI) {
@@ -5518,6 +5524,7 @@ bool parseAssignmentExpression(StringRef Name, bool allow_redef,
/// \brief Create an MCAsmParser instance.
MCAsmParser *llvm::createMCAsmParser(SourceMgr &SM, MCContext &C,
- MCStreamer &Out, const MCAsmInfo &MAI) {
- return new AsmParser(SM, C, Out, MAI);
+ MCStreamer &Out, const MCAsmInfo &MAI,
+ unsigned CB) {
+ return new AsmParser(SM, C, Out, MAI, CB);
}
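
A hedged usage sketch of the widened factory (runParser and BufferID are invented for illustration): the new trailing unsigned selects which SourceMgr buffer parsing starts from, and 0 preserves the old behavior of starting at SM.getMainFileID(), as the CurBuffer initializer above shows.

#include "llvm/MC/MCParser/MCAsmParser.h"
#include "llvm/Support/SourceMgr.h"
#include <memory>

bool runParser(llvm::SourceMgr &SM, llvm::MCContext &Ctx,
               llvm::MCStreamer &Out, const llvm::MCAsmInfo &MAI,
               unsigned BufferID) {
  std::unique_ptr<llvm::MCAsmParser> Parser(
      llvm::createMCAsmParser(SM, Ctx, Out, MAI, BufferID));
  return Parser->Run(/*NoInitialTextSection=*/false);
}
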
diff --git a/lib/MC/MCParser/COFFAsmParser.cpp b/lib/MC/MCParser/COFFAsmParser.cpp
index f4114795a92d..bec62ccb2f7f 100644
--- a/lib/MC/MCParser/COFFAsmParser.cpp
+++ b/lib/MC/MCParser/COFFAsmParser.cpp
@@ -7,19 +7,27 @@
//
//===----------------------------------------------------------------------===//
-#include "llvm/MC/MCParser/MCAsmParserExtension.h"
#include "llvm/ADT/StringSwitch.h"
+#include "llvm/ADT/StringRef.h"
+#include "llvm/ADT/Triple.h"
#include "llvm/ADT/Twine.h"
-#include "llvm/MC/MCAsmInfo.h"
#include "llvm/MC/MCContext.h"
-#include "llvm/MC/MCExpr.h"
+#include "llvm/MC/MCDirectives.h"
#include "llvm/MC/MCObjectFileInfo.h"
#include "llvm/MC/MCParser/MCAsmLexer.h"
+#include "llvm/MC/MCParser/MCAsmParserExtension.h"
#include "llvm/MC/MCParser/MCTargetAsmParser.h"
#include "llvm/MC/MCRegisterInfo.h"
#include "llvm/MC/MCSectionCOFF.h"
#include "llvm/MC/MCStreamer.h"
+#include "llvm/MC/SectionKind.h"
#include "llvm/Support/COFF.h"
+#include "llvm/Support/SMLoc.h"
+#include <cassert>
+#include <cstdint>
+#include <limits>
+#include <utility>
+
using namespace llvm;
namespace {
@@ -98,12 +106,14 @@ class COFFAsmParser : public MCAsmParserExtension {
| COFF::IMAGE_SCN_MEM_READ,
SectionKind::getText());
}
+
bool ParseSectionDirectiveData(StringRef, SMLoc) {
return ParseSectionSwitch(".data", COFF::IMAGE_SCN_CNT_INITIALIZED_DATA |
COFF::IMAGE_SCN_MEM_READ |
COFF::IMAGE_SCN_MEM_WRITE,
SectionKind::getData());
}
+
bool ParseSectionDirectiveBSS(StringRef, SMLoc) {
return ParseSectionSwitch(".bss",
COFF::IMAGE_SCN_CNT_UNINITIALIZED_DATA
@@ -141,8 +151,9 @@ class COFFAsmParser : public MCAsmParserExtension {
bool ParseAtUnwindOrAtExcept(bool &unwind, bool &except);
bool ParseSEHRegisterNumber(unsigned &RegNo);
bool ParseDirectiveSymbolAttribute(StringRef Directive, SMLoc);
+
public:
- COFFAsmParser() {}
+ COFFAsmParser() = default;
};
} // end anonymous namespace
@@ -277,7 +288,7 @@ bool COFFAsmParser::ParseDirectiveSymbolAttribute(StringRef Directive, SMLoc) {
.Default(MCSA_Invalid);
assert(Attr != MCSA_Invalid && "unexpected symbol attribute directive!");
if (getLexer().isNot(AsmToken::EndOfStatement)) {
- for (;;) {
+ while (true) {
StringRef Name;
if (getParser().parseIdentifier(Name))
@@ -466,10 +477,11 @@ bool COFFAsmParser::ParseDirectiveSecRel32(StringRef, SMLoc) {
if (getLexer().isNot(AsmToken::EndOfStatement))
return TokError("unexpected token in directive");
- if (Offset < 0 || Offset > UINT32_MAX)
- return Error(OffsetLoc,
- "invalid '.secrel32' directive offset, can't be less "
- "than zero or greater than UINT32_MAX");
+ if (Offset < 0 || Offset > std::numeric_limits<uint32_t>::max())
+ return Error(
+ OffsetLoc,
+ "invalid '.secrel32' directive offset, can't be less "
+ "than zero or greater than std::numeric_limits<uint32_t>::max()");
MCSymbol *Symbol = getContext().getOrCreateSymbol(SymbolID);
@@ -817,4 +829,4 @@ MCAsmParserExtension *createCOFFAsmParser() {
return new COFFAsmParser;
}
-}
+} // end namespace llvm
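
The .secrel32 bound check above, restated standalone (illustrative; isValidSecRel32Offset is an invented name): offsets must fit an unsigned 32-bit relocation field, which is all the numeric_limits spelling changes relative to the old UINT32_MAX macro.

#include <cstdint>
#include <limits>

inline bool isValidSecRel32Offset(int64_t Offset) {
  // Same range test as ParseDirectiveSecRel32: [0, 2^32 - 1].
  return Offset >= 0 && Offset <= std::numeric_limits<uint32_t>::max();
}
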
diff --git a/lib/MC/MCParser/DarwinAsmParser.cpp b/lib/MC/MCParser/DarwinAsmParser.cpp
index 94aa70ef0326..73a7ad0500c3 100644
--- a/lib/MC/MCParser/DarwinAsmParser.cpp
+++ b/lib/MC/MCParser/DarwinAsmParser.cpp
@@ -7,22 +7,35 @@
//
//===----------------------------------------------------------------------===//
-#include "llvm/MC/MCParser/MCAsmParserExtension.h"
+#include "llvm/ADT/SmallVector.h"
#include "llvm/ADT/STLExtras.h"
#include "llvm/ADT/StringRef.h"
#include "llvm/ADT/StringSwitch.h"
#include "llvm/ADT/Triple.h"
#include "llvm/ADT/Twine.h"
#include "llvm/MC/MCContext.h"
+#include "llvm/MC/MCDirectives.h"
#include "llvm/MC/MCObjectFileInfo.h"
#include "llvm/MC/MCParser/MCAsmLexer.h"
#include "llvm/MC/MCParser/MCAsmParser.h"
+#include "llvm/MC/MCParser/MCAsmParserExtension.h"
#include "llvm/MC/MCSectionMachO.h"
#include "llvm/MC/MCStreamer.h"
#include "llvm/MC/MCSymbol.h"
+#include "llvm/MC/SectionKind.h"
#include "llvm/Support/FileSystem.h"
#include "llvm/Support/MemoryBuffer.h"
+#include "llvm/Support/MachO.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/Support/SMLoc.h"
#include "llvm/Support/SourceMgr.h"
+#include <algorithm>
+#include <cstddef>
+#include <cstdint>
+#include <string>
+#include <system_error>
+#include <utility>
+
using namespace llvm;
namespace {
@@ -44,7 +57,7 @@ class DarwinAsmParser : public MCAsmParserExtension {
SMLoc LastVersionMinDirective;
public:
- DarwinAsmParser() {}
+ DarwinAsmParser() = default;
void Initialize(MCAsmParser &Parser) override {
// Call the base implementation.
@@ -209,37 +222,47 @@ public:
bool parseSectionDirectiveConst(StringRef, SMLoc) {
return parseSectionSwitch("__TEXT", "__const");
}
+
bool parseSectionDirectiveStaticConst(StringRef, SMLoc) {
return parseSectionSwitch("__TEXT", "__static_const");
}
+
bool parseSectionDirectiveCString(StringRef, SMLoc) {
return parseSectionSwitch("__TEXT","__cstring",
MachO::S_CSTRING_LITERALS);
}
+
bool parseSectionDirectiveLiteral4(StringRef, SMLoc) {
return parseSectionSwitch("__TEXT", "__literal4",
MachO::S_4BYTE_LITERALS, 4);
}
+
bool parseSectionDirectiveLiteral8(StringRef, SMLoc) {
return parseSectionSwitch("__TEXT", "__literal8",
MachO::S_8BYTE_LITERALS, 8);
}
+
bool parseSectionDirectiveLiteral16(StringRef, SMLoc) {
return parseSectionSwitch("__TEXT","__literal16",
MachO::S_16BYTE_LITERALS, 16);
}
+
bool parseSectionDirectiveConstructor(StringRef, SMLoc) {
return parseSectionSwitch("__TEXT","__constructor");
}
+
bool parseSectionDirectiveDestructor(StringRef, SMLoc) {
return parseSectionSwitch("__TEXT","__destructor");
}
+
bool parseSectionDirectiveFVMLibInit0(StringRef, SMLoc) {
return parseSectionSwitch("__TEXT","__fvmlib_init0");
}
+
bool parseSectionDirectiveFVMLibInit1(StringRef, SMLoc) {
return parseSectionSwitch("__TEXT","__fvmlib_init1");
}
+
bool parseSectionDirectiveSymbolStub(StringRef, SMLoc) {
return parseSectionSwitch("__TEXT","__symbol_stub",
MachO::S_SYMBOL_STUBS |
@@ -247,144 +270,178 @@ public:
// FIXME: Different on PPC and ARM.
0, 16);
}
+
bool parseSectionDirectivePICSymbolStub(StringRef, SMLoc) {
return parseSectionSwitch("__TEXT","__picsymbol_stub",
MachO::S_SYMBOL_STUBS |
MachO::S_ATTR_PURE_INSTRUCTIONS, 0, 26);
}
+
bool parseSectionDirectiveData(StringRef, SMLoc) {
return parseSectionSwitch("__DATA", "__data");
}
+
bool parseSectionDirectiveStaticData(StringRef, SMLoc) {
return parseSectionSwitch("__DATA", "__static_data");
}
+
bool parseSectionDirectiveNonLazySymbolPointers(StringRef, SMLoc) {
return parseSectionSwitch("__DATA", "__nl_symbol_ptr",
MachO::S_NON_LAZY_SYMBOL_POINTERS, 4);
}
+
bool parseSectionDirectiveLazySymbolPointers(StringRef, SMLoc) {
return parseSectionSwitch("__DATA", "__la_symbol_ptr",
MachO::S_LAZY_SYMBOL_POINTERS, 4);
}
+
bool parseSectionDirectiveThreadLocalVariablePointers(StringRef, SMLoc) {
return parseSectionSwitch("__DATA", "__thread_ptr",
MachO::S_THREAD_LOCAL_VARIABLE_POINTERS, 4);
}
+
bool parseSectionDirectiveDyld(StringRef, SMLoc) {
return parseSectionSwitch("__DATA", "__dyld");
}
+
bool parseSectionDirectiveModInitFunc(StringRef, SMLoc) {
return parseSectionSwitch("__DATA", "__mod_init_func",
MachO::S_MOD_INIT_FUNC_POINTERS, 4);
}
+
bool parseSectionDirectiveModTermFunc(StringRef, SMLoc) {
return parseSectionSwitch("__DATA", "__mod_term_func",
MachO::S_MOD_TERM_FUNC_POINTERS, 4);
}
+
bool parseSectionDirectiveConstData(StringRef, SMLoc) {
return parseSectionSwitch("__DATA", "__const");
}
+
bool parseSectionDirectiveObjCClass(StringRef, SMLoc) {
return parseSectionSwitch("__OBJC", "__class",
MachO::S_ATTR_NO_DEAD_STRIP);
}
+
bool parseSectionDirectiveObjCMetaClass(StringRef, SMLoc) {
return parseSectionSwitch("__OBJC", "__meta_class",
MachO::S_ATTR_NO_DEAD_STRIP);
}
+
bool parseSectionDirectiveObjCCatClsMeth(StringRef, SMLoc) {
return parseSectionSwitch("__OBJC", "__cat_cls_meth",
MachO::S_ATTR_NO_DEAD_STRIP);
}
+
bool parseSectionDirectiveObjCCatInstMeth(StringRef, SMLoc) {
return parseSectionSwitch("__OBJC", "__cat_inst_meth",
MachO::S_ATTR_NO_DEAD_STRIP);
}
+
bool parseSectionDirectiveObjCProtocol(StringRef, SMLoc) {
return parseSectionSwitch("__OBJC", "__protocol",
MachO::S_ATTR_NO_DEAD_STRIP);
}
+
bool parseSectionDirectiveObjCStringObject(StringRef, SMLoc) {
return parseSectionSwitch("__OBJC", "__string_object",
MachO::S_ATTR_NO_DEAD_STRIP);
}
+
bool parseSectionDirectiveObjCClsMeth(StringRef, SMLoc) {
return parseSectionSwitch("__OBJC", "__cls_meth",
MachO::S_ATTR_NO_DEAD_STRIP);
}
+
bool parseSectionDirectiveObjCInstMeth(StringRef, SMLoc) {
return parseSectionSwitch("__OBJC", "__inst_meth",
MachO::S_ATTR_NO_DEAD_STRIP);
}
+
bool parseSectionDirectiveObjCClsRefs(StringRef, SMLoc) {
return parseSectionSwitch("__OBJC", "__cls_refs",
MachO::S_ATTR_NO_DEAD_STRIP |
MachO::S_LITERAL_POINTERS, 4);
}
+
bool parseSectionDirectiveObjCMessageRefs(StringRef, SMLoc) {
return parseSectionSwitch("__OBJC", "__message_refs",
MachO::S_ATTR_NO_DEAD_STRIP |
MachO::S_LITERAL_POINTERS, 4);
}
+
bool parseSectionDirectiveObjCSymbols(StringRef, SMLoc) {
return parseSectionSwitch("__OBJC", "__symbols",
MachO::S_ATTR_NO_DEAD_STRIP);
}
+
bool parseSectionDirectiveObjCCategory(StringRef, SMLoc) {
return parseSectionSwitch("__OBJC", "__category",
MachO::S_ATTR_NO_DEAD_STRIP);
}
+
bool parseSectionDirectiveObjCClassVars(StringRef, SMLoc) {
return parseSectionSwitch("__OBJC", "__class_vars",
MachO::S_ATTR_NO_DEAD_STRIP);
}
+
bool parseSectionDirectiveObjCInstanceVars(StringRef, SMLoc) {
return parseSectionSwitch("__OBJC", "__instance_vars",
MachO::S_ATTR_NO_DEAD_STRIP);
}
+
bool parseSectionDirectiveObjCModuleInfo(StringRef, SMLoc) {
return parseSectionSwitch("__OBJC", "__module_info",
MachO::S_ATTR_NO_DEAD_STRIP);
}
+
bool parseSectionDirectiveObjCClassNames(StringRef, SMLoc) {
return parseSectionSwitch("__TEXT", "__cstring",
MachO::S_CSTRING_LITERALS);
}
+
bool parseSectionDirectiveObjCMethVarTypes(StringRef, SMLoc) {
return parseSectionSwitch("__TEXT", "__cstring",
MachO::S_CSTRING_LITERALS);
}
+
bool parseSectionDirectiveObjCMethVarNames(StringRef, SMLoc) {
return parseSectionSwitch("__TEXT", "__cstring",
MachO::S_CSTRING_LITERALS);
}
+
bool parseSectionDirectiveObjCSelectorStrs(StringRef, SMLoc) {
return parseSectionSwitch("__OBJC", "__selector_strs",
MachO::S_CSTRING_LITERALS);
}
+
bool parseSectionDirectiveTData(StringRef, SMLoc) {
return parseSectionSwitch("__DATA", "__thread_data",
MachO::S_THREAD_LOCAL_REGULAR);
}
+
bool parseSectionDirectiveText(StringRef, SMLoc) {
return parseSectionSwitch("__TEXT", "__text",
MachO::S_ATTR_PURE_INSTRUCTIONS);
}
+
bool parseSectionDirectiveTLV(StringRef, SMLoc) {
return parseSectionSwitch("__DATA", "__thread_vars",
MachO::S_THREAD_LOCAL_VARIABLES);
}
+
bool parseSectionDirectiveIdent(StringRef, SMLoc) {
// Darwin silently ignores the .ident directive.
getParser().eatToEndOfStatement();
return false;
}
+
bool parseSectionDirectiveThreadInitFunc(StringRef, SMLoc) {
return parseSectionSwitch("__DATA", "__thread_init",
MachO::S_THREAD_LOCAL_INIT_FUNCTION_POINTERS);
}
- bool parseVersionMin(StringRef, SMLoc);
+ bool parseVersionMin(StringRef, SMLoc);
};
} // end anonymous namespace
@@ -526,7 +583,7 @@ bool DarwinAsmParser::parseDirectiveDumpOrLoad(StringRef Directive,
/// ::= .linker_option "string" ( , "string" )*
bool DarwinAsmParser::parseDirectiveLinkerOption(StringRef IDVal, SMLoc) {
SmallVector<std::string, 4> Args;
- for (;;) {
+ while (true) {
if (getLexer().isNot(AsmToken::String))
return TokError("expected string in '" + Twine(IDVal) + "' directive");
@@ -604,7 +661,6 @@ bool DarwinAsmParser::parseDirectiveSection(StringRef, SMLoc) {
return TokError("unexpected token in '.section' directive");
Lex();
-
StringRef Segment, Section;
unsigned StubSize;
unsigned TAA;
diff --git a/lib/MC/MCParser/ELFAsmParser.cpp b/lib/MC/MCParser/ELFAsmParser.cpp
index 8d7ba0d03362..401011a027f4 100644
--- a/lib/MC/MCParser/ELFAsmParser.cpp
+++ b/lib/MC/MCParser/ELFAsmParser.cpp
@@ -7,17 +7,29 @@
//
//===----------------------------------------------------------------------===//
-#include "llvm/MC/MCParser/MCAsmParserExtension.h"
#include "llvm/ADT/StringSwitch.h"
-#include "llvm/ADT/Twine.h"
+#include "llvm/ADT/StringRef.h"
#include "llvm/MC/MCAsmInfo.h"
#include "llvm/MC/MCContext.h"
+#include "llvm/MC/MCDirectives.h"
#include "llvm/MC/MCExpr.h"
#include "llvm/MC/MCParser/MCAsmLexer.h"
+#include "llvm/MC/MCParser/MCAsmParser.h"
+#include "llvm/MC/MCParser/MCAsmParserExtension.h"
+#include "llvm/MC/MCSection.h"
#include "llvm/MC/MCSectionELF.h"
#include "llvm/MC/MCStreamer.h"
+#include "llvm/MC/MCSymbol.h"
#include "llvm/MC/MCSymbolELF.h"
+#include "llvm/MC/SectionKind.h"
+#include "llvm/Support/Casting.h"
#include "llvm/Support/ELF.h"
+#include "llvm/Support/MathExtras.h"
+#include "llvm/Support/SMLoc.h"
+#include <cassert>
+#include <cstdint>
+#include <utility>
+
using namespace llvm;
namespace {
@@ -142,9 +154,14 @@ private:
bool ParseSectionName(StringRef &SectionName);
bool ParseSectionArguments(bool IsPush, SMLoc loc);
unsigned parseSunStyleSectionFlags();
+ bool maybeParseSectionType(StringRef &TypeName);
+ bool parseMergeSize(int64_t &Size);
+ bool parseGroup(StringRef &GroupName);
+ bool parseMetadataSym(MCSymbolELF *&Associated);
+ bool maybeParseUniqueID(int64_t &UniqueID);
};
-}
+} // end anonymous namespace
/// ParseDirectiveSymbolAttribute
/// ::= { ".local", ".weak", ... } [ identifier ( , identifier )* ]
@@ -158,7 +175,7 @@ bool ELFAsmParser::ParseDirectiveSymbolAttribute(StringRef Directive, SMLoc) {
.Default(MCSA_Invalid);
assert(Attr != MCSA_Invalid && "unexpected symbol attribute directive!");
if (getLexer().isNot(AsmToken::EndOfStatement)) {
- for (;;) {
+ while (true) {
StringRef Name;
if (getParser().parseIdentifier(Name))
@@ -230,8 +247,7 @@ bool ELFAsmParser::ParseSectionName(StringRef &SectionName) {
return false;
}
- for (;;) {
-
+ while (true) {
SMLoc PrevLoc = getLexer().getLoc();
if (getLexer().is(AsmToken::Comma) ||
getLexer().is(AsmToken::EndOfStatement))
@@ -282,6 +298,9 @@ static unsigned parseSectionFlags(StringRef flagsStr, bool *UseLastGroup) {
case 'w':
flags |= ELF::SHF_WRITE;
break;
+ case 'o':
+ flags |= ELF::SHF_LINK_ORDER;
+ break;
case 'M':
flags |= ELF::SHF_MERGE;
break;
@@ -366,6 +385,97 @@ bool ELFAsmParser::ParseDirectiveSection(StringRef, SMLoc loc) {
return ParseSectionArguments(/*IsPush=*/false, loc);
}
+bool ELFAsmParser::maybeParseSectionType(StringRef &TypeName) {
+ MCAsmLexer &L = getLexer();
+ if (L.isNot(AsmToken::Comma))
+ return false;
+ Lex();
+ if (L.isNot(AsmToken::At) && L.isNot(AsmToken::Percent) &&
+ L.isNot(AsmToken::String)) {
+ if (L.getAllowAtInIdentifier())
+ return TokError("expected '@<type>', '%<type>' or \"<type>\"");
+ else
+ return TokError("expected '%<type>' or \"<type>\"");
+ }
+ if (!L.is(AsmToken::String))
+ Lex();
+ if (L.is(AsmToken::Integer)) {
+ TypeName = getTok().getString();
+ Lex();
+ } else if (getParser().parseIdentifier(TypeName))
+ return TokError("expected identifier in directive");
+ return false;
+}
+
+bool ELFAsmParser::parseMergeSize(int64_t &Size) {
+ if (getLexer().isNot(AsmToken::Comma))
+ return TokError("expected the entry size");
+ Lex();
+ if (getParser().parseAbsoluteExpression(Size))
+ return true;
+ if (Size <= 0)
+ return TokError("entry size must be positive");
+ return false;
+}
+
+bool ELFAsmParser::parseGroup(StringRef &GroupName) {
+ MCAsmLexer &L = getLexer();
+ if (L.isNot(AsmToken::Comma))
+ return TokError("expected group name");
+ Lex();
+ if (getParser().parseIdentifier(GroupName))
+ return true;
+ if (L.is(AsmToken::Comma)) {
+ Lex();
+ StringRef Linkage;
+ if (getParser().parseIdentifier(Linkage))
+ return true;
+ if (Linkage != "comdat")
+ return TokError("Linkage must be 'comdat'");
+ }
+ return false;
+}
+
+bool ELFAsmParser::parseMetadataSym(MCSymbolELF *&Associated) {
+ MCAsmLexer &L = getLexer();
+ if (L.isNot(AsmToken::Comma))
+ return TokError("expected metadata symbol");
+ Lex();
+ StringRef Name;
+ if (getParser().parseIdentifier(Name))
+ return true;
+ Associated = dyn_cast_or_null<MCSymbolELF>(getContext().lookupSymbol(Name));
+ if (!Associated || !Associated->isInSection())
+ return TokError("symbol is not in a section: " + Name);
+ return false;
+}
+
+bool ELFAsmParser::maybeParseUniqueID(int64_t &UniqueID) {
+ MCAsmLexer &L = getLexer();
+ if (L.isNot(AsmToken::Comma))
+ return false;
+ Lex();
+ StringRef UniqueStr;
+ if (getParser().parseIdentifier(UniqueStr))
+ return TokError("expected identifier in directive");
+ if (UniqueStr != "unique")
+ return TokError("expected 'unique'");
+ if (L.isNot(AsmToken::Comma))
+ return TokError("expected commma");
+ Lex();
+ if (getParser().parseAbsoluteExpression(UniqueID))
+ return true;
+ if (UniqueID < 0)
+ return TokError("unique id must be positive");
+ if (!isUInt<32>(UniqueID) || UniqueID == ~0U)
+ return TokError("unique id is too large");
+ return false;
+}
+
+static bool hasPrefix(StringRef SectionName, StringRef Prefix) {
+ return SectionName.startswith(Prefix) || SectionName == Prefix.drop_back();
+}
+
bool ELFAsmParser::ParseSectionArguments(bool IsPush, SMLoc loc) {
StringRef SectionName;
@@ -379,14 +489,24 @@ bool ELFAsmParser::ParseSectionArguments(bool IsPush, SMLoc loc) {
const MCExpr *Subsection = nullptr;
bool UseLastGroup = false;
StringRef UniqueStr;
+ MCSymbolELF *Associated = nullptr;
int64_t UniqueID = ~0;
// Set the defaults first.
- if (SectionName == ".fini" || SectionName == ".init" ||
- SectionName == ".rodata")
+ if (hasPrefix(SectionName, ".rodata.") || SectionName == ".rodata1")
Flags |= ELF::SHF_ALLOC;
- if (SectionName == ".fini" || SectionName == ".init")
- Flags |= ELF::SHF_EXECINSTR;
+ if (SectionName == ".fini" || SectionName == ".init" ||
+ hasPrefix(SectionName, ".text."))
+ Flags |= ELF::SHF_ALLOC | ELF::SHF_EXECINSTR;
+ if (hasPrefix(SectionName, ".data.") || SectionName == ".data1" ||
+ hasPrefix(SectionName, ".bss.") ||
+ hasPrefix(SectionName, ".init_array.") ||
+ hasPrefix(SectionName, ".fini_array.") ||
+ hasPrefix(SectionName, ".preinit_array."))
+ Flags |= ELF::SHF_ALLOC | ELF::SHF_WRITE;
+ if (hasPrefix(SectionName, ".tdata.") ||
+ hasPrefix(SectionName, ".tbss."))
+ Flags |= ELF::SHF_ALLOC | ELF::SHF_WRITE | ELF::SHF_TLS;
if (getLexer().is(AsmToken::Comma)) {
Lex();
@@ -422,65 +542,30 @@ bool ELFAsmParser::ParseSectionArguments(bool IsPush, SMLoc loc) {
return TokError("Section cannot specifiy a group name while also acting "
"as a member of the last group");
- if (getLexer().isNot(AsmToken::Comma)) {
+ if (maybeParseSectionType(TypeName))
+ return true;
+
+ MCAsmLexer &L = getLexer();
+ if (TypeName.empty()) {
if (Mergeable)
return TokError("Mergeable section must specify the type");
if (Group)
return TokError("Group section must specify the type");
- } else {
- Lex();
- if (getLexer().is(AsmToken::At) || getLexer().is(AsmToken::Percent) ||
- getLexer().is(AsmToken::String)) {
- if (!getLexer().is(AsmToken::String))
- Lex();
- } else
- return TokError("expected '@<type>', '%<type>' or \"<type>\"");
-
- if (getParser().parseIdentifier(TypeName))
- return TokError("expected identifier in directive");
-
- if (Mergeable) {
- if (getLexer().isNot(AsmToken::Comma))
- return TokError("expected the entry size");
- Lex();
- if (getParser().parseAbsoluteExpression(Size))
- return true;
- if (Size <= 0)
- return TokError("entry size must be positive");
- }
-
- if (Group) {
- if (getLexer().isNot(AsmToken::Comma))
- return TokError("expected group name");
- Lex();
- if (getParser().parseIdentifier(GroupName))
- return true;
- if (getLexer().is(AsmToken::Comma)) {
- Lex();
- StringRef Linkage;
- if (getParser().parseIdentifier(Linkage))
- return true;
- if (Linkage != "comdat")
- return TokError("Linkage must be 'comdat'");
- }
- }
- if (getLexer().is(AsmToken::Comma)) {
- Lex();
- if (getParser().parseIdentifier(UniqueStr))
- return TokError("expected identifier in directive");
- if (UniqueStr != "unique")
- return TokError("expected 'unique'");
- if (getLexer().isNot(AsmToken::Comma))
- return TokError("expected commma");
- Lex();
- if (getParser().parseAbsoluteExpression(UniqueID))
- return true;
- if (UniqueID < 0)
- return TokError("unique id must be positive");
- if (!isUInt<32>(UniqueID) || UniqueID == ~0U)
- return TokError("unique id is too large");
- }
+ if (L.isNot(AsmToken::EndOfStatement))
+ return TokError("unexpected token in directive");
}
+
+ if (Mergeable)
+ if (parseMergeSize(Size))
+ return true;
+ if (Group)
+ if (parseGroup(GroupName))
+ return true;
+ if (Flags & ELF::SHF_LINK_ORDER)
+ if (parseMetadataSym(Associated))
+ return true;
+ if (maybeParseUniqueID(UniqueID))
+ return true;
}
EndStmt:
@@ -493,11 +578,15 @@ EndStmt:
if (TypeName.empty()) {
if (SectionName.startswith(".note"))
Type = ELF::SHT_NOTE;
- else if (SectionName == ".init_array")
+ else if (hasPrefix(SectionName, ".init_array."))
Type = ELF::SHT_INIT_ARRAY;
- else if (SectionName == ".fini_array")
+ else if (hasPrefix(SectionName, ".bss."))
+ Type = ELF::SHT_NOBITS;
+ else if (hasPrefix(SectionName, ".tbss."))
+ Type = ELF::SHT_NOBITS;
+ else if (hasPrefix(SectionName, ".fini_array."))
Type = ELF::SHT_FINI_ARRAY;
- else if (SectionName == ".preinit_array")
+ else if (hasPrefix(SectionName, ".preinit_array."))
Type = ELF::SHT_PREINIT_ARRAY;
} else {
if (TypeName == "init_array")
@@ -514,7 +603,7 @@ EndStmt:
Type = ELF::SHT_NOTE;
else if (TypeName == "unwind")
Type = ELF::SHT_X86_64_UNWIND;
- else
+ else if (TypeName.getAsInteger(0, Type))
return TokError("unknown section type");
}
@@ -528,8 +617,9 @@ EndStmt:
}
}
- MCSection *ELFSection = getContext().getELFSection(SectionName, Type, Flags,
- Size, GroupName, UniqueID);
+ MCSection *ELFSection =
+ getContext().getELFSection(SectionName, Type, Flags, Size, GroupName,
+ UniqueID, Associated);
getStreamer().SwitchSection(ELFSection, Subsection);
if (getContext().getGenDwarfForAssembly()) {
@@ -677,6 +767,7 @@ bool ELFAsmParser::ParseDirectiveSymver(StringRef, SMLoc) {
const MCExpr *Value = MCSymbolRefExpr::create(Sym, getContext());
getStreamer().EmitAssignment(Alias, Value);
+ getStreamer().emitELFSymverDirective(Alias, Sym);
return false;
}
@@ -752,4 +843,4 @@ MCAsmParserExtension *createELFAsmParser() {
return new ELFAsmParser;
}
-}
+} // end namespace llvm
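
To make the refactored argument grammar concrete, here are directives each new helper accepts (an editorial sketch, not test input from the patch; the section and symbol names are made up, and the 'o' example requires bar to already be defined in some section, per parseMetadataSym):

static const char *SectionDirectiveExamples[] = {
    ".section .rodata.str1.1,\"aMS\",@progbits,1",     // parseMergeSize
    ".section .text.hot,\"axG\",@progbits,grp,comdat", // parseGroup
    ".section .foo,\"ao\",@progbits,bar",              // parseMetadataSym
    ".section .bar,\"a\",@progbits,unique,1",          // maybeParseUniqueID
};
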
diff --git a/lib/MC/MCParser/MCAsmLexer.cpp b/lib/MC/MCParser/MCAsmLexer.cpp
index 63c0daba09a0..f8fe78aece0c 100644
--- a/lib/MC/MCParser/MCAsmLexer.cpp
+++ b/lib/MC/MCParser/MCAsmLexer.cpp
@@ -1,4 +1,4 @@
-//===-- MCAsmLexer.cpp - Abstract Asm Lexer Interface ---------------------===//
+//===- MCAsmLexer.cpp - Abstract Asm Lexer Interface ----------------------===//
//
// The LLVM Compiler Infrastructure
//
@@ -7,19 +7,17 @@
//
//===----------------------------------------------------------------------===//
+#include "llvm/ADT/StringRef.h"
#include "llvm/MC/MCParser/MCAsmLexer.h"
-#include "llvm/Support/SourceMgr.h"
+#include "llvm/Support/SMLoc.h"
using namespace llvm;
-MCAsmLexer::MCAsmLexer()
- : TokStart(nullptr), SkipSpace(true), IsAtStartOfStatement(true),
- CommentConsumer(nullptr) {
+MCAsmLexer::MCAsmLexer() {
CurTok.emplace_back(AsmToken::Space, StringRef());
}
-MCAsmLexer::~MCAsmLexer() {
-}
+MCAsmLexer::~MCAsmLexer() = default;
SMLoc MCAsmLexer::getLoc() const {
return SMLoc::getFromPointer(TokStart);
diff --git a/lib/MC/MCParser/MCAsmParser.cpp b/lib/MC/MCParser/MCAsmParser.cpp
index 98f4daf972d6..27b37f3e2dfb 100644
--- a/lib/MC/MCParser/MCAsmParser.cpp
+++ b/lib/MC/MCParser/MCAsmParser.cpp
@@ -7,22 +7,22 @@
//
//===----------------------------------------------------------------------===//
-#include "llvm/MC/MCParser/MCAsmParser.h"
+#include "llvm/ADT/StringRef.h"
#include "llvm/ADT/Twine.h"
#include "llvm/MC/MCParser/MCAsmLexer.h"
+#include "llvm/MC/MCParser/MCAsmParser.h"
#include "llvm/MC/MCParser/MCParsedAsmOperand.h"
#include "llvm/MC/MCParser/MCTargetAsmParser.h"
#include "llvm/Support/Debug.h"
-#include "llvm/Support/SourceMgr.h"
+#include "llvm/Support/SMLoc.h"
#include "llvm/Support/raw_ostream.h"
+#include <cassert>
+
using namespace llvm;
-MCAsmParser::MCAsmParser()
- : TargetParser(nullptr), ShowParsedOperands(0), HadError(false),
- PendingErrors() {}
+MCAsmParser::MCAsmParser() : ShowParsedOperands(0) {}
-MCAsmParser::~MCAsmParser() {
-}
+MCAsmParser::~MCAsmParser() = default;
void MCAsmParser::setTargetParser(MCTargetAsmParser &P) {
assert(!TargetParser && "Target parser is already initialized!");
@@ -118,10 +118,10 @@ bool MCAsmParser::addErrorSuffix(const Twine &Suffix) {
return true;
}
-bool MCAsmParser::parseMany(std::function<bool()> parseOne, bool hasComma) {
+bool MCAsmParser::parseMany(function_ref<bool()> parseOne, bool hasComma) {
if (parseOptionalToken(AsmToken::EndOfStatement))
return false;
- while (1) {
+ while (true) {
if (parseOne())
return true;
if (parseOptionalToken(AsmToken::EndOfStatement))
@@ -137,6 +137,9 @@ bool MCAsmParser::parseExpression(const MCExpr *&Res) {
return parseExpression(Res, L);
}
-LLVM_DUMP_METHOD void MCParsedAsmOperand::dump() const {
+void MCParsedAsmOperand::dump() const {
+ // Cannot completely remove virtual function even in release mode.
+#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
dbgs() << " " << *this;
+#endif
}
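
On the std::function-to-function_ref switch in parseMany: llvm::function_ref is a non-owning view of a callable, so passing a lambda costs no allocation; the only contract is that the callable must outlive the call, which holds for a parse-and-return API like this. A minimal sketch (forEachItem is invented):

#include "llvm/ADT/STLExtras.h"

static bool forEachItem(int N, llvm::function_ref<bool(int)> ParseOne) {
  for (int I = 0; I < N; ++I)
    if (ParseOne(I)) // true signals a parse error, as in parseMany
      return true;
  return false;
}

// Usage: forEachItem(4, [](int I) { return false; });
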
diff --git a/lib/MC/MCParser/MCAsmParserExtension.cpp b/lib/MC/MCParser/MCAsmParserExtension.cpp
index 3f25a14926b6..031f473dc5fe 100644
--- a/lib/MC/MCParser/MCAsmParserExtension.cpp
+++ b/lib/MC/MCParser/MCAsmParserExtension.cpp
@@ -1,4 +1,4 @@
-//===-- MCAsmParserExtension.cpp - Asm Parser Hooks -----------------------===//
+//===- MCAsmParserExtension.cpp - Asm Parser Hooks ------------------------===//
//
// The LLVM Compiler Infrastructure
//
@@ -8,14 +8,12 @@
//===----------------------------------------------------------------------===//
#include "llvm/MC/MCParser/MCAsmParserExtension.h"
+
using namespace llvm;
-MCAsmParserExtension::MCAsmParserExtension() :
- BracketExpressionsSupported(false) {
-}
+MCAsmParserExtension::MCAsmParserExtension() = default;
-MCAsmParserExtension::~MCAsmParserExtension() {
-}
+MCAsmParserExtension::~MCAsmParserExtension() = default;
void MCAsmParserExtension::Initialize(MCAsmParser &Parser) {
this->Parser = &Parser;
diff --git a/lib/MC/MCParser/MCTargetAsmParser.cpp b/lib/MC/MCParser/MCTargetAsmParser.cpp
index 14a22c6b8a2f..5f821443bb96 100644
--- a/lib/MC/MCParser/MCTargetAsmParser.cpp
+++ b/lib/MC/MCParser/MCTargetAsmParser.cpp
@@ -1,4 +1,4 @@
-//===-- MCTargetAsmParser.cpp - Target Assembly Parser ---------------------==//
+//===-- MCTargetAsmParser.cpp - Target Assembly Parser --------------------===//
//
// The LLVM Compiler Infrastructure
//
@@ -7,19 +7,16 @@
//
//===----------------------------------------------------------------------===//
-#include "llvm/MC/MCParser/MCTargetAsmParser.h"
#include "llvm/MC/MCContext.h"
+#include "llvm/MC/MCParser/MCTargetAsmParser.h"
+
using namespace llvm;
MCTargetAsmParser::MCTargetAsmParser(MCTargetOptions const &MCOptions,
const MCSubtargetInfo &STI)
- : AvailableFeatures(0), ParsingInlineAsm(false), MCOptions(MCOptions),
- STI(&STI)
-{
-}
+ : MCOptions(MCOptions), STI(&STI) {}
-MCTargetAsmParser::~MCTargetAsmParser() {
-}
+MCTargetAsmParser::~MCTargetAsmParser() = default;
MCSubtargetInfo &MCTargetAsmParser::copySTI() {
MCSubtargetInfo &STICopy = getContext().getSubtargetCopy(getSTI());
diff --git a/lib/MC/MCRegisterInfo.cpp b/lib/MC/MCRegisterInfo.cpp
index ea117f3caa85..a75100a4876b 100644
--- a/lib/MC/MCRegisterInfo.cpp
+++ b/lib/MC/MCRegisterInfo.cpp
@@ -1,4 +1,4 @@
-//=== MC/MCRegisterInfo.cpp - Target Register Description -------*- C++ -*-===//
+//===- MC/MCRegisterInfo.cpp - Target Register Description ----------------===//
//
// The LLVM Compiler Infrastructure
//
@@ -11,9 +11,12 @@
//
//===----------------------------------------------------------------------===//
+#include "llvm/ADT/DenseMap.h"
#include "llvm/MC/MCRegisterInfo.h"
-#include "llvm/Support/Format.h"
-#include "llvm/Support/raw_ostream.h"
+#include "llvm/Support/ErrorHandling.h"
+#include <algorithm>
+#include <cassert>
+#include <cstdint>
using namespace llvm;
diff --git a/lib/MC/MCSection.cpp b/lib/MC/MCSection.cpp
index 9064cdf2f319..7986c0122043 100644
--- a/lib/MC/MCSection.cpp
+++ b/lib/MC/MCSection.cpp
@@ -7,17 +7,18 @@
//
//===----------------------------------------------------------------------===//
-#include "llvm/MC/MCSection.h"
-#include "llvm/MC/MCAssembler.h"
-#include "llvm/MC/MCAsmInfo.h"
+#include "llvm/ADT/SmallVector.h"
#include "llvm/MC/MCContext.h"
+#include "llvm/MC/MCFragment.h"
+#include "llvm/MC/MCSection.h"
#include "llvm/MC/MCSymbol.h"
+#include "llvm/Support/Compiler.h"
+#include "llvm/Support/ErrorHandling.h"
#include "llvm/Support/raw_ostream.h"
-using namespace llvm;
+#include <algorithm>
+#include <utility>
-//===----------------------------------------------------------------------===//
-// MCSection
-//===----------------------------------------------------------------------===//
+using namespace llvm;
MCSection::MCSection(SectionVariant V, SectionKind K, MCSymbol *Begin)
: Begin(Begin), BundleGroupBeforeFirstInst(false), HasInstructions(false),
@@ -31,8 +32,7 @@ MCSymbol *MCSection::getEndSymbol(MCContext &Ctx) {
bool MCSection::hasEnded() const { return End && End->isInSection(); }
-MCSection::~MCSection() {
-}
+MCSection::~MCSection() = default;
void MCSection::setBundleLockState(BundleLockStateType NewState) {
if (NewState == NotBundleLocked) {
@@ -85,8 +85,9 @@ MCSection::getSubsectionInsertionPoint(unsigned Subsection) {
return IP;
}
+#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
LLVM_DUMP_METHOD void MCSection::dump() {
- raw_ostream &OS = llvm::errs();
+ raw_ostream &OS = errs();
OS << "<MCSection";
OS << " Fragments:[\n ";
@@ -97,3 +98,4 @@ LLVM_DUMP_METHOD void MCSection::dump() {
}
OS << "]>";
}
+#endif
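
The #if guard added around MCSection::dump() is the standard LLVM gating for debug-only printers: the body disappears from release builds unless LLVM_ENABLE_DUMP is defined. The same pattern in isolation (dumpAnswer is an invented example):

#include "llvm/Support/Compiler.h"
#include "llvm/Support/raw_ostream.h"

#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
LLVM_DUMP_METHOD static void dumpAnswer() {
  llvm::errs() << "state: 42\n"; // only compiled into debug/dump builds
}
#endif
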
diff --git a/lib/MC/MCSectionCOFF.cpp b/lib/MC/MCSectionCOFF.cpp
index f2dd47d81b7e..f0709cbc2515 100644
--- a/lib/MC/MCSectionCOFF.cpp
+++ b/lib/MC/MCSectionCOFF.cpp
@@ -8,14 +8,14 @@
//===----------------------------------------------------------------------===//
#include "llvm/MC/MCSectionCOFF.h"
-#include "llvm/MC/MCAsmInfo.h"
-#include "llvm/MC/MCContext.h"
#include "llvm/MC/MCSymbol.h"
#include "llvm/Support/COFF.h"
#include "llvm/Support/raw_ostream.h"
+#include <cassert>
+
using namespace llvm;
-MCSectionCOFF::~MCSectionCOFF() {} // anchor.
+MCSectionCOFF::~MCSectionCOFF() = default; // anchor.
// ShouldOmitSectionDirective - Decides whether a '.section' directive
// should be printed before the section name
@@ -37,10 +37,9 @@ void MCSectionCOFF::setSelection(int Selection) const {
Characteristics |= COFF::IMAGE_SCN_LNK_COMDAT;
}
-void MCSectionCOFF::PrintSwitchToSection(const MCAsmInfo &MAI,
+void MCSectionCOFF::PrintSwitchToSection(const MCAsmInfo &MAI, const Triple &T,
raw_ostream &OS,
const MCExpr *Subsection) const {
-
// standard sections don't require the '.section'
if (ShouldOmitSectionDirective(SectionName, MAI)) {
OS << '\t' << getSectionName() << '\n';
@@ -94,7 +93,7 @@ void MCSectionCOFF::PrintSwitchToSection(const MCAsmInfo &MAI,
OS << "newest,";
break;
default:
- assert (0 && "unsupported COFF selection type");
+ assert(false && "unsupported COFF selection type");
break;
}
assert(COMDATSymbol);
diff --git a/lib/MC/MCSectionELF.cpp b/lib/MC/MCSectionELF.cpp
index 587b28f71b7d..78fe01cca24a 100644
--- a/lib/MC/MCSectionELF.cpp
+++ b/lib/MC/MCSectionELF.cpp
@@ -7,23 +7,23 @@
//
//===----------------------------------------------------------------------===//
-#include "llvm/MC/MCSectionELF.h"
+#include "llvm/ADT/Triple.h"
#include "llvm/MC/MCAsmInfo.h"
-#include "llvm/MC/MCContext.h"
#include "llvm/MC/MCExpr.h"
-#include "llvm/MC/MCSymbol.h"
+#include "llvm/MC/MCSectionELF.h"
#include "llvm/Support/ELF.h"
+#include "llvm/Support/ErrorHandling.h"
#include "llvm/Support/raw_ostream.h"
+#include <cassert>
using namespace llvm;
-MCSectionELF::~MCSectionELF() {} // anchor.
+MCSectionELF::~MCSectionELF() = default; // anchor.
// Decides whether a '.section' directive
// should be printed before the section name.
bool MCSectionELF::ShouldOmitSectionDirective(StringRef Name,
const MCAsmInfo &MAI) const {
-
if (isUnique())
return false;
@@ -53,10 +53,9 @@ static void printName(raw_ostream &OS, StringRef Name) {
OS << '"';
}
-void MCSectionELF::PrintSwitchToSection(const MCAsmInfo &MAI,
+void MCSectionELF::PrintSwitchToSection(const MCAsmInfo &MAI, const Triple &T,
raw_ostream &OS,
const MCExpr *Subsection) const {
-
if (ShouldOmitSectionDirective(SectionName, MAI)) {
OS << '\t' << getSectionName();
if (Subsection) {
@@ -104,14 +103,21 @@ void MCSectionELF::PrintSwitchToSection(const MCAsmInfo &MAI,
OS << 'S';
if (Flags & ELF::SHF_TLS)
OS << 'T';
+ if (Flags & ELF::SHF_LINK_ORDER)
+ OS << 'o';
// If there are target-specific flags, print them.
- if (Flags & ELF::XCORE_SHF_CP_SECTION)
- OS << 'c';
- if (Flags & ELF::XCORE_SHF_DP_SECTION)
- OS << 'd';
- if (Flags & ELF::SHF_ARM_PURECODE)
- OS << 'y';
+ Triple::ArchType Arch = T.getArch();
+ if (Arch == Triple::xcore) {
+ if (Flags & ELF::XCORE_SHF_CP_SECTION)
+ OS << 'c';
+ if (Flags & ELF::XCORE_SHF_DP_SECTION)
+ OS << 'd';
+ } else if (Arch == Triple::arm || Arch == Triple::armeb ||
+ Arch == Triple::thumb || Arch == Triple::thumbeb) {
+ if (Flags & ELF::SHF_ARM_PURECODE)
+ OS << 'y';
+ }
OS << '"';
@@ -137,6 +143,13 @@ void MCSectionELF::PrintSwitchToSection(const MCAsmInfo &MAI,
OS << "progbits";
else if (Type == ELF::SHT_X86_64_UNWIND)
OS << "unwind";
+ else if (Type == ELF::SHT_MIPS_DWARF)
+ // Print the hex value of the section type while there is no
+ // standard symbolic representation for it.
+ OS << "0x7000001e";
+ else
+ report_fatal_error("unsupported type 0x" + Twine::utohexstr(Type) +
+ " for section " + getSectionName());
if (EntrySize) {
assert(Flags & ELF::SHF_MERGE);
@@ -149,6 +162,12 @@ void MCSectionELF::PrintSwitchToSection(const MCAsmInfo &MAI,
OS << ",comdat";
}
+ if (Flags & ELF::SHF_LINK_ORDER) {
+ assert(AssociatedSymbol);
+ OS << ",";
+ printName(OS, AssociatedSymbol->getName());
+ }
+
if (isUnique())
OS << ",unique," << UniqueID;
diff --git a/lib/MC/MCSectionMachO.cpp b/lib/MC/MCSectionMachO.cpp
index c2a772fdbdac..f40237231a2f 100644
--- a/lib/MC/MCSectionMachO.cpp
+++ b/lib/MC/MCSectionMachO.cpp
@@ -16,45 +16,57 @@ using namespace llvm;
/// SectionTypeDescriptors - These are strings that describe the various section
/// types. This *must* be kept in order with and stay synchronized with the
/// section type list.
-static const struct {
- StringRef AssemblerName, EnumName;
-} SectionTypeDescriptors[MachO::LAST_KNOWN_SECTION_TYPE+1] = {
- { "regular", "S_REGULAR" }, // 0x00
- { StringRef(), "S_ZEROFILL" }, // 0x01
- { "cstring_literals", "S_CSTRING_LITERALS" }, // 0x02
- { "4byte_literals", "S_4BYTE_LITERALS" }, // 0x03
- { "8byte_literals", "S_8BYTE_LITERALS" }, // 0x04
- { "literal_pointers", "S_LITERAL_POINTERS" }, // 0x05
- { "non_lazy_symbol_pointers", "S_NON_LAZY_SYMBOL_POINTERS" }, // 0x06
- { "lazy_symbol_pointers", "S_LAZY_SYMBOL_POINTERS" }, // 0x07
- { "symbol_stubs", "S_SYMBOL_STUBS" }, // 0x08
- { "mod_init_funcs", "S_MOD_INIT_FUNC_POINTERS" }, // 0x09
- { "mod_term_funcs", "S_MOD_TERM_FUNC_POINTERS" }, // 0x0A
- { "coalesced", "S_COALESCED" }, // 0x0B
- { StringRef(), /*FIXME??*/ "S_GB_ZEROFILL" }, // 0x0C
- { "interposing", "S_INTERPOSING" }, // 0x0D
- { "16byte_literals", "S_16BYTE_LITERALS" }, // 0x0E
- { StringRef(), /*FIXME??*/ "S_DTRACE_DOF" }, // 0x0F
- { StringRef(), /*FIXME??*/ "S_LAZY_DYLIB_SYMBOL_POINTERS" }, // 0x10
- { "thread_local_regular", "S_THREAD_LOCAL_REGULAR" }, // 0x11
- { "thread_local_zerofill", "S_THREAD_LOCAL_ZEROFILL" }, // 0x12
- { "thread_local_variables", "S_THREAD_LOCAL_VARIABLES" }, // 0x13
- { "thread_local_variable_pointers",
- "S_THREAD_LOCAL_VARIABLE_POINTERS" }, // 0x14
- { "thread_local_init_function_pointers",
- "S_THREAD_LOCAL_INIT_FUNCTION_POINTERS"}, // 0x15
+static constexpr struct {
+ StringLiteral AssemblerName, EnumName;
+} SectionTypeDescriptors[MachO::LAST_KNOWN_SECTION_TYPE + 1] = {
+ {StringLiteral("regular"), StringLiteral("S_REGULAR")}, // 0x00
+ {StringLiteral(""), StringLiteral("S_ZEROFILL")}, // 0x01
+ {StringLiteral("cstring_literals"),
+ StringLiteral("S_CSTRING_LITERALS")}, // 0x02
+ {StringLiteral("4byte_literals"),
+ StringLiteral("S_4BYTE_LITERALS")}, // 0x03
+ {StringLiteral("8byte_literals"),
+ StringLiteral("S_8BYTE_LITERALS")}, // 0x04
+ {StringLiteral("literal_pointers"),
+ StringLiteral("S_LITERAL_POINTERS")}, // 0x05
+ {StringLiteral("non_lazy_symbol_pointers"),
+ StringLiteral("S_NON_LAZY_SYMBOL_POINTERS")}, // 0x06
+ {StringLiteral("lazy_symbol_pointers"),
+ StringLiteral("S_LAZY_SYMBOL_POINTERS")}, // 0x07
+ {StringLiteral("symbol_stubs"), StringLiteral("S_SYMBOL_STUBS")}, // 0x08
+ {StringLiteral("mod_init_funcs"),
+ StringLiteral("S_MOD_INIT_FUNC_POINTERS")}, // 0x09
+ {StringLiteral("mod_term_funcs"),
+ StringLiteral("S_MOD_TERM_FUNC_POINTERS")}, // 0x0A
+ {StringLiteral("coalesced"), StringLiteral("S_COALESCED")}, // 0x0B
+ {StringLiteral("") /*FIXME??*/, StringLiteral("S_GB_ZEROFILL")}, // 0x0C
+ {StringLiteral("interposing"), StringLiteral("S_INTERPOSING")}, // 0x0D
+ {StringLiteral("16byte_literals"),
+ StringLiteral("S_16BYTE_LITERALS")}, // 0x0E
+ {StringLiteral("") /*FIXME??*/, StringLiteral("S_DTRACE_DOF")}, // 0x0F
+ {StringLiteral("") /*FIXME??*/,
+ StringLiteral("S_LAZY_DYLIB_SYMBOL_POINTERS")}, // 0x10
+ {StringLiteral("thread_local_regular"),
+ StringLiteral("S_THREAD_LOCAL_REGULAR")}, // 0x11
+ {StringLiteral("thread_local_zerofill"),
+ StringLiteral("S_THREAD_LOCAL_ZEROFILL")}, // 0x12
+ {StringLiteral("thread_local_variables"),
+ StringLiteral("S_THREAD_LOCAL_VARIABLES")}, // 0x13
+ {StringLiteral("thread_local_variable_pointers"),
+ StringLiteral("S_THREAD_LOCAL_VARIABLE_POINTERS")}, // 0x14
+ {StringLiteral("thread_local_init_function_pointers"),
+ StringLiteral("S_THREAD_LOCAL_INIT_FUNCTION_POINTERS")}, // 0x15
};
-
/// SectionAttrDescriptors - This is an array of descriptors for section
/// attributes. Unlike the SectionTypeDescriptors, this is not directly indexed
/// by attribute, instead it is searched.
-static const struct {
+static constexpr struct {
unsigned AttrFlag;
- StringRef AssemblerName, EnumName;
+ StringLiteral AssemblerName, EnumName;
} SectionAttrDescriptors[] = {
#define ENTRY(ASMNAME, ENUM) \
- { MachO::ENUM, ASMNAME, #ENUM },
+ { MachO::ENUM, StringLiteral(ASMNAME), StringLiteral(#ENUM) },
ENTRY("pure_instructions", S_ATTR_PURE_INSTRUCTIONS)
ENTRY("no_toc", S_ATTR_NO_TOC)
ENTRY("strip_static_syms", S_ATTR_STRIP_STATIC_SYMS)
@@ -62,11 +74,11 @@ ENTRY("no_dead_strip", S_ATTR_NO_DEAD_STRIP)
ENTRY("live_support", S_ATTR_LIVE_SUPPORT)
ENTRY("self_modifying_code", S_ATTR_SELF_MODIFYING_CODE)
ENTRY("debug", S_ATTR_DEBUG)
-ENTRY(StringRef() /*FIXME*/, S_ATTR_SOME_INSTRUCTIONS)
-ENTRY(StringRef() /*FIXME*/, S_ATTR_EXT_RELOC)
-ENTRY(StringRef() /*FIXME*/, S_ATTR_LOC_RELOC)
+ENTRY("" /*FIXME*/, S_ATTR_SOME_INSTRUCTIONS)
+ENTRY("" /*FIXME*/, S_ATTR_EXT_RELOC)
+ENTRY("" /*FIXME*/, S_ATTR_LOC_RELOC)
#undef ENTRY
- { 0, "none", StringRef() }, // used if section has no attributes but has a stub size
+ { 0, StringLiteral("none"), StringLiteral("") }, // used if section has no attributes but has a stub size
};
MCSectionMachO::MCSectionMachO(StringRef Segment, StringRef Section,
@@ -89,7 +101,7 @@ MCSectionMachO::MCSectionMachO(StringRef Segment, StringRef Section,
}
}
-void MCSectionMachO::PrintSwitchToSection(const MCAsmInfo &MAI,
+void MCSectionMachO::PrintSwitchToSection(const MCAsmInfo &MAI, const Triple &T,
raw_ostream &OS,
const MCExpr *Subsection) const {
OS << "\t.section\t" << getSegmentName() << ',' << getSectionName();
diff --git a/lib/MC/MCSectionWasm.cpp b/lib/MC/MCSectionWasm.cpp
new file mode 100644
index 000000000000..c61f28e129f5
--- /dev/null
+++ b/lib/MC/MCSectionWasm.cpp
@@ -0,0 +1,97 @@
+//===- lib/MC/MCSectionWasm.cpp - Wasm Code Section Representation --------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/MC/MCSectionWasm.h"
+#include "llvm/MC/MCAsmInfo.h"
+#include "llvm/MC/MCContext.h"
+#include "llvm/MC/MCExpr.h"
+#include "llvm/MC/MCSymbol.h"
+#include "llvm/Support/raw_ostream.h"
+
+using namespace llvm;
+
+MCSectionWasm::~MCSectionWasm() = default; // anchor.
+
+// Decides whether a '.section' directive
+// should be printed before the section name.
+bool MCSectionWasm::ShouldOmitSectionDirective(StringRef Name,
+ const MCAsmInfo &MAI) const {
+ return MAI.shouldOmitSectionDirective(Name);
+}
+
+static void printName(raw_ostream &OS, StringRef Name) {
+ if (Name.find_first_not_of("0123456789_."
+ "abcdefghijklmnopqrstuvwxyz"
+ "ABCDEFGHIJKLMNOPQRSTUVWXYZ") == Name.npos) {
+ OS << Name;
+ return;
+ }
+ OS << '"';
+ for (const char *B = Name.begin(), *E = Name.end(); B < E; ++B) {
+ if (*B == '"') // Unquoted "
+ OS << "\\\"";
+ else if (*B != '\\') // Neither " nor backslash
+ OS << *B;
+ else if (B + 1 == E) // Trailing backslash
+ OS << "\\\\";
+ else {
+ OS << B[0] << B[1]; // Quoted character
+ ++B;
+ }
+ }
+ OS << '"';
+}
+
+void MCSectionWasm::PrintSwitchToSection(const MCAsmInfo &MAI, const Triple &T,
+ raw_ostream &OS,
+ const MCExpr *Subsection) const {
+
+ if (ShouldOmitSectionDirective(SectionName, MAI)) {
+ OS << '\t' << getSectionName();
+ if (Subsection) {
+ OS << '\t';
+ Subsection->print(OS, &MAI);
+ }
+ OS << '\n';
+ return;
+ }
+
+ OS << "\t.section\t";
+ printName(OS, getSectionName());
+ OS << ",\"";
+
+ // TODO: Print section flags.
+
+ OS << '"';
+
+ OS << ',';
+
+ // If the comment string is '@' (as on ARM, for example), use '%' instead.
+ if (MAI.getCommentString()[0] == '@')
+ OS << '%';
+ else
+ OS << '@';
+
+ // TODO: Print section type.
+
+ if (isUnique())
+ OS << ",unique," << UniqueID;
+
+ OS << '\n';
+
+ if (Subsection) {
+ OS << "\t.subsection\t";
+ Subsection->print(OS, &MAI);
+ OS << '\n';
+ }
+}
+
+bool MCSectionWasm::UseCodeAlign() const { return false; }
+
+bool MCSectionWasm::isVirtualSection() const { return false; }
diff --git a/lib/MC/MCStreamer.cpp b/lib/MC/MCStreamer.cpp
index fb28f856f671..c9a6f12b6a58 100644
--- a/lib/MC/MCStreamer.cpp
+++ b/lib/MC/MCStreamer.cpp
@@ -7,36 +7,44 @@
//
//===----------------------------------------------------------------------===//
-#include "llvm/MC/MCStreamer.h"
#include "llvm/ADT/SmallString.h"
+#include "llvm/ADT/StringRef.h"
#include "llvm/ADT/Twine.h"
#include "llvm/MC/MCAsmBackend.h"
#include "llvm/MC/MCAsmInfo.h"
#include "llvm/MC/MCCodeView.h"
#include "llvm/MC/MCContext.h"
+#include "llvm/MC/MCDwarf.h"
#include "llvm/MC/MCExpr.h"
#include "llvm/MC/MCInst.h"
#include "llvm/MC/MCInstPrinter.h"
#include "llvm/MC/MCObjectFileInfo.h"
-#include "llvm/MC/MCObjectWriter.h"
#include "llvm/MC/MCSection.h"
#include "llvm/MC/MCSectionCOFF.h"
+#include "llvm/MC/MCStreamer.h"
#include "llvm/MC/MCSymbol.h"
#include "llvm/MC/MCWin64EH.h"
+#include "llvm/MC/MCWinEH.h"
+#include "llvm/Support/Casting.h"
#include "llvm/Support/COFF.h"
#include "llvm/Support/ErrorHandling.h"
#include "llvm/Support/LEB128.h"
+#include "llvm/Support/MathExtras.h"
#include "llvm/Support/raw_ostream.h"
#include <cstdlib>
-using namespace llvm;
+#include <cassert>
+#include <cstdint>
+#include <utility>
-// Pin the vtables to this file.
-MCTargetStreamer::~MCTargetStreamer() {}
+using namespace llvm;
MCTargetStreamer::MCTargetStreamer(MCStreamer &S) : Streamer(S) {
S.setTargetStreamer(this);
}
+// Pin the vtables to this file.
+MCTargetStreamer::~MCTargetStreamer() = default;
+
void MCTargetStreamer::emitLabel(MCSymbol *Symbol) {}
void MCTargetStreamer::finish() {}
@@ -290,10 +298,17 @@ void MCStreamer::AssignFragment(MCSymbol *Symbol, MCFragment *Fragment) {
SymbolOrdering[Symbol] = 1 + SymbolOrdering.size();
}
-void MCStreamer::EmitLabel(MCSymbol *Symbol) {
+void MCStreamer::EmitLabel(MCSymbol *Symbol, SMLoc Loc) {
+ Symbol->redefineIfPossible();
+
+ if (!Symbol->isUndefined() || Symbol->isVariable())
+ return getContext().reportError(Loc, "invalid symbol redefinition");
+
assert(!Symbol->isVariable() && "Cannot emit a variable symbol!");
assert(getCurrentSectionOnly() && "Cannot emit before setting section!");
assert(!Symbol->getFragment() && "Unexpected fragment on symbol data!");
+ assert(Symbol->isUndefined() && "Cannot define a symbol twice!");
+
Symbol->setFragment(&getCurrentSectionOnly()->getDummyFragment());
MCTargetStreamer *TS = getTargetStreamer();
@@ -666,7 +681,7 @@ void MCStreamer::EmitWinCFISaveXMM(unsigned Register, unsigned Offset) {
void MCStreamer::EmitWinCFIPushFrame(bool Code) {
EnsureValidWinFrameInfo();
- if (CurrentWinFrameInfo->Instructions.size() > 0)
+ if (!CurrentWinFrameInfo->Instructions.empty())
report_fatal_error("If present, PushMachFrame must be the first UOP");
MCSymbol *Label = EmitCFILabel();
@@ -762,8 +777,8 @@ void MCStreamer::visitUsedExpr(const MCExpr &Expr) {
}
}
-void MCStreamer::EmitInstruction(const MCInst &Inst,
- const MCSubtargetInfo &STI) {
+void MCStreamer::EmitInstruction(const MCInst &Inst, const MCSubtargetInfo &STI,
+ bool) {
// Scan for values.
for (unsigned i = Inst.getNumOperands(); i--;)
if (Inst.getOperand(i).isExpr())
@@ -792,12 +807,22 @@ void MCStreamer::emitAbsoluteSymbolDiff(const MCSymbol *Hi, const MCSymbol *Lo,
void MCStreamer::EmitAssemblerFlag(MCAssemblerFlag Flag) {}
void MCStreamer::EmitThumbFunc(MCSymbol *Func) {}
void MCStreamer::EmitSymbolDesc(MCSymbol *Symbol, unsigned DescValue) {}
-void MCStreamer::BeginCOFFSymbolDef(const MCSymbol *Symbol) {}
-void MCStreamer::EndCOFFSymbolDef() {}
+void MCStreamer::BeginCOFFSymbolDef(const MCSymbol *Symbol) {
+ llvm_unreachable("this directive only supported on COFF targets");
+}
+void MCStreamer::EndCOFFSymbolDef() {
+ llvm_unreachable("this directive only supported on COFF targets");
+}
void MCStreamer::EmitFileDirective(StringRef Filename) {}
-void MCStreamer::EmitCOFFSymbolStorageClass(int StorageClass) {}
-void MCStreamer::EmitCOFFSymbolType(int Type) {}
+void MCStreamer::EmitCOFFSymbolStorageClass(int StorageClass) {
+ llvm_unreachable("this directive only supported on COFF targets");
+}
+void MCStreamer::EmitCOFFSymbolType(int Type) {
+ llvm_unreachable("this directive only supported on COFF targets");
+}
void MCStreamer::emitELFSize(MCSymbol *Symbol, const MCExpr *Value) {}
+void MCStreamer::emitELFSymverDirective(MCSymbol *Alias,
+ const MCSymbol *Aliasee) {}
void MCStreamer::EmitLocalCommonSymbol(MCSymbol *Symbol, uint64_t Size,
unsigned ByteAlignment) {}
void MCStreamer::EmitTBSSSymbol(MCSection *Section, MCSymbol *Symbol,
diff --git a/lib/MC/MCSubtargetInfo.cpp b/lib/MC/MCSubtargetInfo.cpp
index 1b592504b1e4..777b4e3d6b67 100644
--- a/lib/MC/MCSubtargetInfo.cpp
+++ b/lib/MC/MCSubtargetInfo.cpp
@@ -1,4 +1,4 @@
-//===-- MCSubtargetInfo.cpp - Subtarget Information -----------------------===//
+//===- MCSubtargetInfo.cpp - Subtarget Information ------------------------===//
//
// The LLVM Compiler Infrastructure
//
@@ -7,13 +7,16 @@
//
//===----------------------------------------------------------------------===//
-#include "llvm/MC/MCSubtargetInfo.h"
+#include "llvm/ADT/ArrayRef.h"
#include "llvm/ADT/StringRef.h"
-#include "llvm/ADT/Triple.h"
#include "llvm/MC/MCInstrItineraries.h"
+#include "llvm/MC/MCSchedule.h"
+#include "llvm/MC/MCSubtargetInfo.h"
#include "llvm/MC/SubtargetFeature.h"
#include "llvm/Support/raw_ostream.h"
#include <algorithm>
+#include <cassert>
+#include <cstring>
using namespace llvm;
diff --git a/lib/MC/MCSymbol.cpp b/lib/MC/MCSymbol.cpp
index 20d985df7ea0..cb262542b89f 100644
--- a/lib/MC/MCSymbol.cpp
+++ b/lib/MC/MCSymbol.cpp
@@ -7,13 +7,19 @@
//
//===----------------------------------------------------------------------===//
-#include "llvm/MC/MCSymbol.h"
+#include "llvm/ADT/StringRef.h"
#include "llvm/MC/MCAsmInfo.h"
#include "llvm/MC/MCContext.h"
#include "llvm/MC/MCExpr.h"
+#include "llvm/MC/MCFragment.h"
+#include "llvm/MC/MCSymbol.h"
+#include "llvm/Support/Compiler.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/ErrorHandling.h"
#include "llvm/Support/raw_ostream.h"
+#include <cassert>
+#include <cstddef>
+
using namespace llvm;
// Only the address of this fragment is ever actually used.
@@ -75,4 +81,8 @@ void MCSymbol::print(raw_ostream &OS, const MCAsmInfo *MAI) const {
OS << '"';
}
-LLVM_DUMP_METHOD void MCSymbol::dump() const { dbgs() << *this; }
+#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
+LLVM_DUMP_METHOD void MCSymbol::dump() const {
+ dbgs() << *this;
+}
+#endif
diff --git a/lib/MC/MCSymbolELF.cpp b/lib/MC/MCSymbolELF.cpp
index ec7ef447ff89..ffa8260d4342 100644
--- a/lib/MC/MCSymbolELF.cpp
+++ b/lib/MC/MCSymbolELF.cpp
@@ -42,6 +42,8 @@ enum {
void MCSymbolELF::setBinding(unsigned Binding) const {
setIsBindingSet();
+ if (getType() == ELF::STT_SECTION && Binding != ELF::STB_LOCAL)
+ setType(ELF::STT_NOTYPE);
unsigned Val;
switch (Binding) {
default:
@@ -93,6 +95,8 @@ unsigned MCSymbolELF::getBinding() const {
void MCSymbolELF::setType(unsigned Type) const {
unsigned Val;
+ if (Type == ELF::STT_SECTION && getBinding() != ELF::STB_LOCAL)
+ return;
switch (Type) {
default:
llvm_unreachable("Unsupported Binding");
diff --git a/lib/MC/MCTargetOptions.cpp b/lib/MC/MCTargetOptions.cpp
index 419210537eea..5d666b67fddb 100644
--- a/lib/MC/MCTargetOptions.cpp
+++ b/lib/MC/MCTargetOptions.cpp
@@ -1,4 +1,4 @@
-//===- lib/MC/MCTargetOptions.cpp - MC Target Options --------------------===//
+//===- lib/MC/MCTargetOptions.cpp - MC Target Options ---------------------===//
//
// The LLVM Compiler Infrastructure
//
@@ -10,19 +10,16 @@
#include "llvm/ADT/StringRef.h"
#include "llvm/MC/MCTargetOptions.h"
-namespace llvm {
+using namespace llvm;
MCTargetOptions::MCTargetOptions()
: SanitizeAddress(false), MCRelaxAll(false), MCNoExecStack(false),
MCFatalWarnings(false), MCNoWarn(false), MCNoDeprecatedWarn(false),
- MCSaveTempLabels(false),
- MCUseDwarfDirectory(false), MCIncrementalLinkerCompatible(false),
- MCPIECopyRelocations(false), ShowMCEncoding(false),
- ShowMCInst(false), AsmVerbose(false),
- PreserveAsmComments(true), DwarfVersion(0), ABIName() {}
+ MCSaveTempLabels(false), MCUseDwarfDirectory(false),
+ MCIncrementalLinkerCompatible(false), MCPIECopyRelocations(false),
+ ShowMCEncoding(false), ShowMCInst(false), AsmVerbose(false),
+ PreserveAsmComments(true) {}
StringRef MCTargetOptions::getABIName() const {
return ABIName;
}
-
-} // end namespace llvm
diff --git a/lib/MC/MCValue.cpp b/lib/MC/MCValue.cpp
index c1336d6d1b49..32a6adbf224e 100644
--- a/lib/MC/MCValue.cpp
+++ b/lib/MC/MCValue.cpp
@@ -37,9 +37,11 @@ void MCValue::print(raw_ostream &OS) const {
OS << " + " << getConstant();
}
+#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
LLVM_DUMP_METHOD void MCValue::dump() const {
print(dbgs());
}
+#endif
MCSymbolRefExpr::VariantKind MCValue::getAccessVariant() const {
const MCSymbolRefExpr *B = getSymB();
diff --git a/lib/MC/MCWasmObjectTargetWriter.cpp b/lib/MC/MCWasmObjectTargetWriter.cpp
new file mode 100644
index 000000000000..a09a17d7a124
--- /dev/null
+++ b/lib/MC/MCWasmObjectTargetWriter.cpp
@@ -0,0 +1,27 @@
+//===-- MCWasmObjectTargetWriter.cpp - Wasm Target Writer Subclass --------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/ADT/STLExtras.h"
+#include "llvm/MC/MCExpr.h"
+#include "llvm/MC/MCValue.h"
+#include "llvm/MC/MCWasmObjectWriter.h"
+
+using namespace llvm;
+
+MCWasmObjectTargetWriter::MCWasmObjectTargetWriter(bool Is64Bit_)
+ : Is64Bit(Is64Bit_) {}
+
+bool MCWasmObjectTargetWriter::needsRelocateWithSymbol(const MCSymbol &Sym,
+ unsigned Type) const {
+ return false;
+}
+
+void MCWasmObjectTargetWriter::sortRelocs(
+ const MCAssembler &Asm, std::vector<WasmRelocationEntry> &Relocs) {
+}
diff --git a/lib/MC/MCWasmStreamer.cpp b/lib/MC/MCWasmStreamer.cpp
new file mode 100644
index 000000000000..59b62b8d37c3
--- /dev/null
+++ b/lib/MC/MCWasmStreamer.cpp
@@ -0,0 +1,216 @@
+//===- lib/MC/MCWasmStreamer.cpp - Wasm Object Output ---------------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file assembles .s files and emits Wasm .o object files.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/MC/MCWasmStreamer.h"
+#include "llvm/ADT/STLExtras.h"
+#include "llvm/ADT/SmallPtrSet.h"
+#include "llvm/MC/MCAsmBackend.h"
+#include "llvm/MC/MCAsmInfo.h"
+#include "llvm/MC/MCAsmLayout.h"
+#include "llvm/MC/MCAssembler.h"
+#include "llvm/MC/MCCodeEmitter.h"
+#include "llvm/MC/MCContext.h"
+#include "llvm/MC/MCExpr.h"
+#include "llvm/MC/MCInst.h"
+#include "llvm/MC/MCObjectFileInfo.h"
+#include "llvm/MC/MCObjectStreamer.h"
+#include "llvm/MC/MCObjectWriter.h"
+#include "llvm/MC/MCSection.h"
+#include "llvm/MC/MCSectionWasm.h"
+#include "llvm/MC/MCSymbol.h"
+#include "llvm/MC/MCSymbolWasm.h"
+#include "llvm/MC/MCValue.h"
+#include "llvm/Support/Casting.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/TargetRegistry.h"
+#include "llvm/Support/raw_ostream.h"
+
+using namespace llvm;
+
+MCWasmStreamer::~MCWasmStreamer() {}
+
+void MCWasmStreamer::mergeFragment(MCDataFragment *DF, MCDataFragment *EF) {
+ flushPendingLabels(DF, DF->getContents().size());
+
+ for (unsigned i = 0, e = EF->getFixups().size(); i != e; ++i) {
+ EF->getFixups()[i].setOffset(EF->getFixups()[i].getOffset() +
+ DF->getContents().size());
+ DF->getFixups().push_back(EF->getFixups()[i]);
+ }
+ DF->setHasInstructions(true);
+ DF->getContents().append(EF->getContents().begin(), EF->getContents().end());
+}
+
+void MCWasmStreamer::EmitAssemblerFlag(MCAssemblerFlag Flag) {
+ // Let the target do whatever target specific stuff it needs to do.
+ getAssembler().getBackend().handleAssemblerFlag(Flag);
+
+ // Do any generic stuff we need to do.
+ llvm_unreachable("invalid assembler flag!");
+}
+
+void MCWasmStreamer::ChangeSection(MCSection *Section,
+ const MCExpr *Subsection) {
+ MCAssembler &Asm = getAssembler();
+ auto *SectionWasm = static_cast<const MCSectionWasm *>(Section);
+ const MCSymbol *Grp = SectionWasm->getGroup();
+ if (Grp)
+ Asm.registerSymbol(*Grp);
+
+ this->MCObjectStreamer::ChangeSection(Section, Subsection);
+}
+
+void MCWasmStreamer::EmitWeakReference(MCSymbol *Alias,
+ const MCSymbol *Symbol) {
+ getAssembler().registerSymbol(*Symbol);
+ const MCExpr *Value = MCSymbolRefExpr::create(
+ Symbol, MCSymbolRefExpr::VK_WEAKREF, getContext());
+ Alias->setVariableValue(Value);
+}
+
+bool MCWasmStreamer::EmitSymbolAttribute(MCSymbol *S, MCSymbolAttr Attribute) {
+ assert(Attribute != MCSA_IndirectSymbol && "indirect symbols not supported");
+
+ auto *Symbol = cast<MCSymbolWasm>(S);
+
+  // Adding a symbol attribute always introduces the symbol; note that an
+  // important side effect of calling registerSymbol here is to register
+  // the symbol with the assembler.
+ getAssembler().registerSymbol(*Symbol);
+
+ switch (Attribute) {
+ case MCSA_LazyReference:
+ case MCSA_Reference:
+ case MCSA_SymbolResolver:
+ case MCSA_PrivateExtern:
+ case MCSA_WeakDefinition:
+ case MCSA_WeakDefAutoPrivate:
+ case MCSA_Invalid:
+ case MCSA_IndirectSymbol:
+ return false;
+ case MCSA_Global:
+ Symbol->setExternal(true);
+ break;
+ case MCSA_ELF_TypeFunction:
+ Symbol->setIsFunction(true);
+ break;
+ case MCSA_ELF_TypeObject:
+ Symbol->setIsFunction(false);
+ break;
+ default:
+ // unrecognized directive
+ return false;
+ }
+
+ return true;
+}
+
+void MCWasmStreamer::EmitCommonSymbol(MCSymbol *S, uint64_t Size,
+ unsigned ByteAlignment) {
+ llvm_unreachable("Common symbols are not yet implemented for Wasm");
+}
+
+void MCWasmStreamer::emitELFSize(MCSymbol *Symbol, const MCExpr *Value) {
+ cast<MCSymbolWasm>(Symbol)->setSize(Value);
+}
+
+void MCWasmStreamer::EmitLocalCommonSymbol(MCSymbol *S, uint64_t Size,
+ unsigned ByteAlignment) {
+ llvm_unreachable("Local common symbols are not yet implemented for Wasm");
+}
+
+void MCWasmStreamer::EmitValueImpl(const MCExpr *Value, unsigned Size,
+ SMLoc Loc) {
+ MCObjectStreamer::EmitValueImpl(Value, Size, Loc);
+}
+
+void MCWasmStreamer::EmitValueToAlignment(unsigned ByteAlignment, int64_t Value,
+ unsigned ValueSize,
+ unsigned MaxBytesToEmit) {
+ MCObjectStreamer::EmitValueToAlignment(ByteAlignment, Value, ValueSize,
+ MaxBytesToEmit);
+}
+
+void MCWasmStreamer::EmitIdent(StringRef IdentString) {
+ MCSection *Comment = getAssembler().getContext().getWasmSection(
+ ".comment", 0, 0);
+ PushSection();
+ SwitchSection(Comment);
+ if (!SeenIdent) {
+ EmitIntValue(0, 1);
+ SeenIdent = true;
+ }
+ EmitBytes(IdentString);
+ EmitIntValue(0, 1);
+ PopSection();
+}
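+// Sketch of the resulting .comment payload, assuming idents "a" then "b"
+// are emitted: one leading NUL, then each string NUL-terminated:
+//   00 61 00 62 00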
+
+void MCWasmStreamer::EmitInstToFragment(const MCInst &Inst,
+ const MCSubtargetInfo &STI) {
+ this->MCObjectStreamer::EmitInstToFragment(Inst, STI);
+}
+
+void MCWasmStreamer::EmitInstToData(const MCInst &Inst,
+ const MCSubtargetInfo &STI) {
+ MCAssembler &Assembler = getAssembler();
+ SmallVector<MCFixup, 4> Fixups;
+ SmallString<256> Code;
+ raw_svector_ostream VecOS(Code);
+ Assembler.getEmitter().encodeInstruction(Inst, VecOS, Fixups, STI);
+
+ // Append the encoded instruction to the current data fragment (or create a
+ // new such fragment if the current fragment is not a data fragment).
+ MCDataFragment *DF = getOrCreateDataFragment();
+
+ // Add the fixups and data.
+ for (unsigned i = 0, e = Fixups.size(); i != e; ++i) {
+ Fixups[i].setOffset(Fixups[i].getOffset() + DF->getContents().size());
+ DF->getFixups().push_back(Fixups[i]);
+ }
+ DF->setHasInstructions(true);
+ DF->getContents().append(Code.begin(), Code.end());
+}
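+// For example (hypothetical sizes): appending a 3-byte instruction whose
+// fixup is at offset 1 to a fragment already holding 10 bytes rebases the
+// fixup to offset 11, and the new bytes land at offsets 10..12.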
+
+void MCWasmStreamer::FinishImpl() {
+ EmitFrames(nullptr);
+
+ this->MCObjectStreamer::FinishImpl();
+}
+
+MCStreamer *llvm::createWasmStreamer(MCContext &Context, MCAsmBackend &MAB,
+ raw_pwrite_stream &OS, MCCodeEmitter *CE,
+ bool RelaxAll) {
+ MCWasmStreamer *S = new MCWasmStreamer(Context, MAB, OS, CE);
+ if (RelaxAll)
+ S->getAssembler().setRelaxAll(true);
+ return S;
+}
+
+void MCWasmStreamer::EmitThumbFunc(MCSymbol *Func) {
+ llvm_unreachable("Generic Wasm doesn't support this directive");
+}
+
+void MCWasmStreamer::EmitSymbolDesc(MCSymbol *Symbol, unsigned DescValue) {
+ llvm_unreachable("Wasm doesn't support this directive");
+}
+
+void MCWasmStreamer::EmitZerofill(MCSection *Section, MCSymbol *Symbol,
+ uint64_t Size, unsigned ByteAlignment) {
+ llvm_unreachable("Wasm doesn't support this directive");
+}
+
+void MCWasmStreamer::EmitTBSSSymbol(MCSection *Section, MCSymbol *Symbol,
+ uint64_t Size, unsigned ByteAlignment) {
+ llvm_unreachable("Wasm doesn't support this directive");
+}
diff --git a/lib/MC/MachObjectWriter.cpp b/lib/MC/MachObjectWriter.cpp
index c4b35f5db9b4..d9ccf0dd661f 100644
--- a/lib/MC/MachObjectWriter.cpp
+++ b/lib/MC/MachObjectWriter.cpp
@@ -7,23 +7,36 @@
//
//===----------------------------------------------------------------------===//
-#include "llvm/MC/MCMachObjectWriter.h"
-#include "llvm/ADT/StringMap.h"
+#include "llvm/ADT/DenseMap.h"
+#include "llvm/ADT/iterator_range.h"
#include "llvm/ADT/Twine.h"
#include "llvm/MC/MCAsmBackend.h"
#include "llvm/MC/MCAsmLayout.h"
#include "llvm/MC/MCAssembler.h"
+#include "llvm/MC/MCDirectives.h"
#include "llvm/MC/MCExpr.h"
#include "llvm/MC/MCFixupKindInfo.h"
+#include "llvm/MC/MCFragment.h"
+#include "llvm/MC/MCMachObjectWriter.h"
#include "llvm/MC/MCObjectWriter.h"
+#include "llvm/MC/MCSection.h"
#include "llvm/MC/MCSectionMachO.h"
+#include "llvm/MC/MCSymbol.h"
#include "llvm/MC/MCSymbolMachO.h"
#include "llvm/MC/MCValue.h"
+#include "llvm/Support/Casting.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/ErrorHandling.h"
#include "llvm/Support/MachO.h"
+#include "llvm/Support/MathExtras.h"
#include "llvm/Support/raw_ostream.h"
+#include <algorithm>
+#include <cassert>
+#include <cstdint>
+#include <string>
+#include <utility>
#include <vector>
+
using namespace llvm;
#define DEBUG_TYPE "mc"
diff --git a/lib/MC/StringTableBuilder.cpp b/lib/MC/StringTableBuilder.cpp
index 1a501bcafc12..fbd7ba60bc90 100644
--- a/lib/MC/StringTableBuilder.cpp
+++ b/lib/MC/StringTableBuilder.cpp
@@ -1,4 +1,4 @@
-//===-- StringTableBuilder.cpp - String table building utility ------------===//
+//===- StringTableBuilder.cpp - String table building utility -------------===//
//
// The LLVM Compiler Infrastructure
//
@@ -7,18 +7,24 @@
//
//===----------------------------------------------------------------------===//
-#include "llvm/MC/StringTableBuilder.h"
-#include "llvm/ADT/STLExtras.h"
+#include "llvm/ADT/CachedHashString.h"
#include "llvm/ADT/SmallString.h"
+#include "llvm/ADT/StringRef.h"
+#include "llvm/MC/StringTableBuilder.h"
#include "llvm/Support/COFF.h"
#include "llvm/Support/Endian.h"
+#include "llvm/Support/MathExtras.h"
#include "llvm/Support/raw_ostream.h"
-
+#include <cassert>
+#include <cstddef>
+#include <cstdint>
+#include <cstring>
+#include <utility>
#include <vector>
using namespace llvm;
-StringTableBuilder::~StringTableBuilder() {}
+StringTableBuilder::~StringTableBuilder() = default;
void StringTableBuilder::initSize() {
// Account for leading bytes in table so that offsets returned from add are
@@ -48,7 +54,7 @@ void StringTableBuilder::write(raw_ostream &OS) const {
assert(isFinalized());
SmallString<0> Data;
Data.resize(getSize());
- write((uint8_t *)&Data[0]);
+ write((uint8_t *)Data.data());
OS << Data;
}
diff --git a/lib/MC/SubtargetFeature.cpp b/lib/MC/SubtargetFeature.cpp
index 32f06f8a7d6a..51aaa4b0aa25 100644
--- a/lib/MC/SubtargetFeature.cpp
+++ b/lib/MC/SubtargetFeature.cpp
@@ -7,28 +7,31 @@
//
//===----------------------------------------------------------------------===//
//
-// This file implements the SubtargetFeature interface.
+/// \file Implements the SubtargetFeature interface.
//
//===----------------------------------------------------------------------===//
-#include "llvm/MC/SubtargetFeature.h"
#include "llvm/ADT/ArrayRef.h"
+#include "llvm/ADT/SmallVector.h"
#include "llvm/ADT/StringExtras.h"
+#include "llvm/ADT/StringRef.h"
+#include "llvm/ADT/Triple.h"
+#include "llvm/MC/SubtargetFeature.h"
+#include "llvm/Support/Compiler.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/Format.h"
#include "llvm/Support/raw_ostream.h"
#include <algorithm>
#include <cassert>
-#include <cctype>
-#include <cstdlib>
-using namespace llvm;
+#include <cstddef>
+#include <cstring>
+#include <iterator>
+#include <string>
+#include <vector>
-//===----------------------------------------------------------------------===//
-// Static Helper Functions
-//===----------------------------------------------------------------------===//
+using namespace llvm;
-/// hasFlag - Determine if a feature has a flag; '+' or '-'
-///
+/// Determine if a feature has a flag; '+' or '-'
static inline bool hasFlag(StringRef Feature) {
assert(!Feature.empty() && "Empty string");
// Get first character
@@ -37,14 +40,12 @@ static inline bool hasFlag(StringRef Feature) {
  return Ch == '+' || Ch == '-';
}
-/// StripFlag - Return string stripped of flag.
-///
+/// Return string stripped of flag.
static inline std::string StripFlag(StringRef Feature) {
return hasFlag(Feature) ? Feature.substr(1) : Feature;
}
-/// isEnabled - Return true if enable flag; '+'.
-///
+/// Return true if the flag is an enable flag, i.e. '+'.
static inline bool isEnabled(StringRef Feature) {
assert(!Feature.empty() && "Empty string");
// Get first character
@@ -53,15 +54,13 @@ static inline bool isEnabled(StringRef Feature) {
return Ch == '+';
}
-/// Split - Splits a string of comma separated items in to a vector of strings.
-///
+/// Splits a string of comma-separated items into a vector of strings.
static void Split(std::vector<std::string> &V, StringRef S) {
SmallVector<StringRef, 3> Tmp;
S.split(Tmp, ',', -1, false /* KeepEmpty */);
V.assign(Tmp.begin(), Tmp.end());
}
-/// Adding features.
void SubtargetFeatures::AddFeature(StringRef String, bool Enable) {
// Don't add empty features.
if (!String.empty())
@@ -81,8 +80,7 @@ static const SubtargetFeatureKV *Find(StringRef S,
return F;
}
-/// getLongestEntryLength - Return the length of the longest entry in the table.
-///
+/// Return the length of the longest entry in the table.
static size_t getLongestEntryLength(ArrayRef<SubtargetFeatureKV> Table) {
size_t MaxLen = 0;
for (auto &I : Table)
@@ -91,7 +89,6 @@ static size_t getLongestEntryLength(ArrayRef<SubtargetFeatureKV> Table) {
}
/// Display help for feature choices.
-///
static void Help(ArrayRef<SubtargetFeatureKV> CPUTable,
ArrayRef<SubtargetFeatureKV> FeatTable) {
// Determine the length of the longest CPU and Feature entries.
@@ -114,58 +111,47 @@ static void Help(ArrayRef<SubtargetFeatureKV> CPUTable,
"For example, llc -mcpu=mycpu -mattr=+feature1,-feature2\n";
}
-//===----------------------------------------------------------------------===//
-// SubtargetFeatures Implementation
-//===----------------------------------------------------------------------===//
-
SubtargetFeatures::SubtargetFeatures(StringRef Initial) {
// Break up string into separate features
Split(Features, Initial);
}
-
std::string SubtargetFeatures::getString() const {
return join(Features.begin(), Features.end(), ",");
}
-/// SetImpliedBits - For each feature that is (transitively) implied by this
-/// feature, set it.
-///
+/// For each feature that is (transitively) implied by this feature, set it.
static
-void SetImpliedBits(FeatureBitset &Bits, const SubtargetFeatureKV *FeatureEntry,
+void SetImpliedBits(FeatureBitset &Bits, const SubtargetFeatureKV &FeatureEntry,
ArrayRef<SubtargetFeatureKV> FeatureTable) {
- for (auto &FE : FeatureTable) {
- if (FeatureEntry->Value == FE.Value) continue;
+ for (const SubtargetFeatureKV &FE : FeatureTable) {
+ if (FeatureEntry.Value == FE.Value) continue;
- if ((FeatureEntry->Implies & FE.Value).any()) {
+ if ((FeatureEntry.Implies & FE.Value).any()) {
Bits |= FE.Value;
- SetImpliedBits(Bits, &FE, FeatureTable);
+ SetImpliedBits(Bits, FE, FeatureTable);
}
}
}
-/// ClearImpliedBits - For each feature that (transitively) implies this
-/// feature, clear it.
-///
+/// For each feature that (transitively) implies this feature, clear it.
static
-void ClearImpliedBits(FeatureBitset &Bits,
- const SubtargetFeatureKV *FeatureEntry,
+void ClearImpliedBits(FeatureBitset &Bits,
+ const SubtargetFeatureKV &FeatureEntry,
ArrayRef<SubtargetFeatureKV> FeatureTable) {
- for (auto &FE : FeatureTable) {
- if (FeatureEntry->Value == FE.Value) continue;
+ for (const SubtargetFeatureKV &FE : FeatureTable) {
+ if (FeatureEntry.Value == FE.Value) continue;
- if ((FE.Implies & FeatureEntry->Value).any()) {
+ if ((FE.Implies & FeatureEntry.Value).any()) {
Bits &= ~FE.Value;
- ClearImpliedBits(Bits, &FE, FeatureTable);
+ ClearImpliedBits(Bits, FE, FeatureTable);
}
}
}
-/// ToggleFeature - Toggle a feature and update the feature bits.
void
SubtargetFeatures::ToggleFeature(FeatureBitset &Bits, StringRef Feature,
ArrayRef<SubtargetFeatureKV> FeatureTable) {
-
// Find feature in table.
const SubtargetFeatureKV *FeatureEntry =
Find(StripFlag(Feature), FeatureTable);
@@ -174,23 +160,21 @@ SubtargetFeatures::ToggleFeature(FeatureBitset &Bits, StringRef Feature,
if ((Bits & FeatureEntry->Value) == FeatureEntry->Value) {
Bits &= ~FeatureEntry->Value;
// For each feature that implies this, clear it.
- ClearImpliedBits(Bits, FeatureEntry, FeatureTable);
+ ClearImpliedBits(Bits, *FeatureEntry, FeatureTable);
} else {
Bits |= FeatureEntry->Value;
// For each feature that this implies, set it.
- SetImpliedBits(Bits, FeatureEntry, FeatureTable);
+ SetImpliedBits(Bits, *FeatureEntry, FeatureTable);
}
} else {
- errs() << "'" << Feature
- << "' is not a recognized feature for this target"
+ errs() << "'" << Feature << "' is not a recognized feature for this target"
<< " (ignoring feature)\n";
}
}
void SubtargetFeatures::ApplyFeatureFlag(FeatureBitset &Bits, StringRef Feature,
ArrayRef<SubtargetFeatureKV> FeatureTable) {
-
assert(hasFlag(Feature));
// Find feature in table.
@@ -203,37 +187,30 @@ void SubtargetFeatures::ApplyFeatureFlag(FeatureBitset &Bits, StringRef Feature,
Bits |= FeatureEntry->Value;
// For each feature that this implies, set it.
- SetImpliedBits(Bits, FeatureEntry, FeatureTable);
+ SetImpliedBits(Bits, *FeatureEntry, FeatureTable);
} else {
Bits &= ~FeatureEntry->Value;
// For each feature that implies this, clear it.
- ClearImpliedBits(Bits, FeatureEntry, FeatureTable);
+ ClearImpliedBits(Bits, *FeatureEntry, FeatureTable);
}
} else {
- errs() << "'" << Feature
- << "' is not a recognized feature for this target"
+ errs() << "'" << Feature << "' is not a recognized feature for this target"
<< " (ignoring feature)\n";
}
}
-
-/// getFeatureBits - Get feature bits a CPU.
-///
FeatureBitset
SubtargetFeatures::getFeatureBits(StringRef CPU,
ArrayRef<SubtargetFeatureKV> CPUTable,
ArrayRef<SubtargetFeatureKV> FeatureTable) {
-
if (CPUTable.empty() || FeatureTable.empty())
return FeatureBitset();
-#ifndef NDEBUG
assert(std::is_sorted(std::begin(CPUTable), std::end(CPUTable)) &&
"CPU table is not sorted");
assert(std::is_sorted(std::begin(FeatureTable), std::end(FeatureTable)) &&
"CPU features table is not sorted");
-#endif
// Resulting bits
FeatureBitset Bits;
@@ -253,17 +230,16 @@ SubtargetFeatures::getFeatureBits(StringRef CPU,
// Set the feature implied by this CPU feature, if any.
for (auto &FE : FeatureTable) {
if ((CPUEntry->Value & FE.Value).any())
- SetImpliedBits(Bits, &FE, FeatureTable);
+ SetImpliedBits(Bits, FE, FeatureTable);
}
} else {
- errs() << "'" << CPU
- << "' is not a recognized processor for this target"
+ errs() << "'" << CPU << "' is not a recognized processor for this target"
<< " (ignoring processor)\n";
}
}
// Iterate through each feature
- for (auto &Feature : Features) {
+ for (const std::string &Feature : Features) {
// Check for help
if (Feature == "+help")
Help(CPUTable, FeatureTable);
@@ -274,27 +250,22 @@ SubtargetFeatures::getFeatureBits(StringRef CPU,
return Bits;
}
-/// print - Print feature string.
-///
void SubtargetFeatures::print(raw_ostream &OS) const {
for (auto &F : Features)
OS << F << " ";
OS << "\n";
}
-/// dump - Dump feature info.
-///
+#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
LLVM_DUMP_METHOD void SubtargetFeatures::dump() const {
print(dbgs());
}
+#endif
-/// Adds the default features for the specified target triple.
-///
-/// FIXME: This is an inelegant way of specifying the features of a
-/// subtarget. It would be better if we could encode this information
-/// into the IR. See <rdar://5972456>.
-///
void SubtargetFeatures::getDefaultSubtargetFeatures(const Triple& Triple) {
+ // FIXME: This is an inelegant way of specifying the features of a
+ // subtarget. It would be better if we could encode this information
+ // into the IR. See <rdar://5972456>.
if (Triple.getVendor() == Triple::Apple) {
if (Triple.getArch() == Triple::ppc) {
// powerpc-apple-*
diff --git a/lib/MC/WasmObjectWriter.cpp b/lib/MC/WasmObjectWriter.cpp
new file mode 100644
index 000000000000..159cc3b4def2
--- /dev/null
+++ b/lib/MC/WasmObjectWriter.cpp
@@ -0,0 +1,1149 @@
+//===- lib/MC/WasmObjectWriter.cpp - Wasm File Writer ---------------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements Wasm object file writer information.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/ADT/STLExtras.h"
+#include "llvm/ADT/SmallPtrSet.h"
+#include "llvm/MC/MCAsmBackend.h"
+#include "llvm/MC/MCAsmInfo.h"
+#include "llvm/MC/MCAsmLayout.h"
+#include "llvm/MC/MCAssembler.h"
+#include "llvm/MC/MCContext.h"
+#include "llvm/MC/MCExpr.h"
+#include "llvm/MC/MCFixupKindInfo.h"
+#include "llvm/MC/MCObjectFileInfo.h"
+#include "llvm/MC/MCObjectWriter.h"
+#include "llvm/MC/MCSectionWasm.h"
+#include "llvm/MC/MCSymbolWasm.h"
+#include "llvm/MC/MCValue.h"
+#include "llvm/MC/MCWasmObjectWriter.h"
+#include "llvm/Support/Casting.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/LEB128.h"
+#include "llvm/Support/StringSaver.h"
+#include "llvm/Support/Wasm.h"
+#include <vector>
+
+using namespace llvm;
+
+#undef DEBUG_TYPE
+#define DEBUG_TYPE "reloc-info"
+
+namespace {
+// For patching purposes, we need to remember where each section starts, both
+// for patching up the section size field, and for patching up references to
+// locations within the section.
+struct SectionBookkeeping {
+ // Where the size of the section is written.
+ uint64_t SizeOffset;
+ // Where the contents of the section starts (after the header).
+ uint64_t ContentsOffset;
+};
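+// Byte-layout sketch for one section (offsets are illustrative):
+//   [section id]  [payload_len: 5-byte padded ULEB128]  [contents ...]
+//                 ^ SizeOffset                          ^ ContentsOffset
+// payload_len is rewritten by endSection once the real size is known.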
+
+class WasmObjectWriter : public MCObjectWriter {
+ /// Helper struct for containing some precomputed information on symbols.
+ struct WasmSymbolData {
+ const MCSymbolWasm *Symbol;
+ StringRef Name;
+
+ // Support lexicographic sorting.
+ bool operator<(const WasmSymbolData &RHS) const { return Name < RHS.Name; }
+ };
+
+ /// The target specific Wasm writer instance.
+ std::unique_ptr<MCWasmObjectTargetWriter> TargetObjectWriter;
+
+ // Relocations for fixing up references in the code section.
+ std::vector<WasmRelocationEntry> CodeRelocations;
+
+ // Relocations for fixing up references in the data section.
+ std::vector<WasmRelocationEntry> DataRelocations;
+
+ // Fixups for call_indirect type indices.
+ std::vector<WasmRelocationEntry> TypeIndexFixups;
+
+ // Index values to use for fixing up call_indirect type indices.
+ std::vector<uint32_t> TypeIndexFixupTypes;
+
+ // TargetObjectWriter wrappers.
+ bool is64Bit() const { return TargetObjectWriter->is64Bit(); }
+ unsigned getRelocType(MCContext &Ctx, const MCValue &Target,
+ const MCFixup &Fixup, bool IsPCRel) const {
+ return TargetObjectWriter->getRelocType(Ctx, Target, Fixup, IsPCRel);
+ }
+
+ void startSection(SectionBookkeeping &Section, unsigned SectionId,
+ const char *Name = nullptr);
+ void endSection(SectionBookkeeping &Section);
+
+public:
+ WasmObjectWriter(MCWasmObjectTargetWriter *MOTW, raw_pwrite_stream &OS)
+ : MCObjectWriter(OS, /*IsLittleEndian=*/true), TargetObjectWriter(MOTW) {}
+
+private:
+ void reset() override {
+ MCObjectWriter::reset();
+ }
+
+ ~WasmObjectWriter() override;
+
+ void writeHeader(const MCAssembler &Asm);
+
+ void writeValueType(wasm::ValType Ty) {
+ encodeSLEB128(int32_t(Ty), getStream());
+ }
+
+ void recordRelocation(MCAssembler &Asm, const MCAsmLayout &Layout,
+ const MCFragment *Fragment, const MCFixup &Fixup,
+ MCValue Target, bool &IsPCRel,
+ uint64_t &FixedValue) override;
+
+ void executePostLayoutBinding(MCAssembler &Asm,
+ const MCAsmLayout &Layout) override;
+
+ void writeObject(MCAssembler &Asm, const MCAsmLayout &Layout) override;
+};
+} // end anonymous namespace
+
+WasmObjectWriter::~WasmObjectWriter() {}
+
+// Return the padding size to write a 32-bit value into a 5-byte ULEB128.
+static unsigned PaddingFor5ByteULEB128(uint32_t X) {
+ return X == 0 ? 4 : (4u - (31u - countLeadingZeros(X)) / 7u);
+}
+
+// Return the padding size to write a 32-bit value into a 5-byte SLEB128.
+static unsigned PaddingFor5ByteSLEB128(int32_t X) {
+ return 5 - getSLEB128Size(X);
+}
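+// Worked example: X = 300 encodes as AC 02 in two ULEB128 bytes, so
+// PaddingFor5ByteULEB128(300) == 3 and the padded 5-byte form is
+// AC 82 80 80 00 (continuation bits set on the filler bytes).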
+
+// Write out a section header and a patchable section size field.
+void WasmObjectWriter::startSection(SectionBookkeeping &Section,
+ unsigned SectionId,
+ const char *Name) {
+ assert((Name != nullptr) == (SectionId == wasm::WASM_SEC_CUSTOM) &&
+ "Only custom sections can have names");
+
+ encodeULEB128(SectionId, getStream());
+
+ Section.SizeOffset = getStream().tell();
+
+ // The section size. We don't know the size yet, so reserve enough space
+ // for any 32-bit value; we'll patch it later.
+ encodeULEB128(UINT32_MAX, getStream());
+
+ // The position where the section starts, for measuring its size.
+ Section.ContentsOffset = getStream().tell();
+
+ // Custom sections in wasm also have a string identifier.
+ if (SectionId == wasm::WASM_SEC_CUSTOM) {
+ encodeULEB128(strlen(Name), getStream());
+ writeBytes(Name);
+ }
+}
+
+// Now that the section is complete and we know how big it is, patch up the
+// section size field at the start of the section.
+void WasmObjectWriter::endSection(SectionBookkeeping &Section) {
+ uint64_t Size = getStream().tell() - Section.ContentsOffset;
+ if (uint32_t(Size) != Size)
+ report_fatal_error("section size does not fit in a uint32_t");
+
+ unsigned Padding = PaddingFor5ByteULEB128(Size);
+
+ // Write the final section size to the payload_len field, which follows
+ // the section id byte.
+ uint8_t Buffer[16];
+ unsigned SizeLen = encodeULEB128(Size, Buffer, Padding);
+ assert(SizeLen == 5);
+ getStream().pwrite((char *)Buffer, SizeLen, Section.SizeOffset);
+}
+
+// Emit the Wasm header.
+void WasmObjectWriter::writeHeader(const MCAssembler &Asm) {
+ writeBytes(StringRef(wasm::WasmMagic, sizeof(wasm::WasmMagic)));
+ writeLE32(wasm::WasmVersion);
+}
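+// The emitted header is 8 bytes; assuming wasm::WasmVersion == 1 it is
+//   00 61 73 6D 01 00 00 00
+// i.e. the "\0asm" magic followed by the version in little-endian order.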
+
+void WasmObjectWriter::executePostLayoutBinding(MCAssembler &Asm,
+ const MCAsmLayout &Layout) {
+}
+
+void WasmObjectWriter::recordRelocation(MCAssembler &Asm,
+ const MCAsmLayout &Layout,
+ const MCFragment *Fragment,
+ const MCFixup &Fixup, MCValue Target,
+ bool &IsPCRel, uint64_t &FixedValue) {
+ MCSectionWasm &FixupSection = cast<MCSectionWasm>(*Fragment->getParent());
+ uint64_t C = Target.getConstant();
+ uint64_t FixupOffset = Layout.getFragmentOffset(Fragment) + Fixup.getOffset();
+ MCContext &Ctx = Asm.getContext();
+
+ if (const MCSymbolRefExpr *RefB = Target.getSymB()) {
+ assert(RefB->getKind() == MCSymbolRefExpr::VK_None &&
+ "Should not have constructed this");
+
+    // Let A, B and C be the components of Target and R be the location of
+ // the fixup. If the fixup is not pcrel, we want to compute (A - B + C).
+ // If it is pcrel, we want to compute (A - B + C - R).
+
+ // In general, Wasm has no relocations for -B. It can only represent (A + C)
+ // or (A + C - R). If B = R + K and the relocation is not pcrel, we can
+ // replace B to implement it: (A - R - K + C)
+ if (IsPCRel) {
+ Ctx.reportError(
+ Fixup.getLoc(),
+ "No relocation available to represent this relative expression");
+ return;
+ }
+
+ const auto &SymB = cast<MCSymbolWasm>(RefB->getSymbol());
+
+ if (SymB.isUndefined()) {
+ Ctx.reportError(Fixup.getLoc(),
+ Twine("symbol '") + SymB.getName() +
+                      "' cannot be undefined in a subtraction expression");
+ return;
+ }
+
+ assert(!SymB.isAbsolute() && "Should have been folded");
+ const MCSection &SecB = SymB.getSection();
+ if (&SecB != &FixupSection) {
+ Ctx.reportError(Fixup.getLoc(),
+ "Cannot represent a difference across sections");
+ return;
+ }
+
+ uint64_t SymBOffset = Layout.getSymbolOffset(SymB);
+ uint64_t K = SymBOffset - FixupOffset;
+ IsPCRel = true;
+ C -= K;
+ }
+
+ // We either rejected the fixup or folded B into C at this point.
+ const MCSymbolRefExpr *RefA = Target.getSymA();
+ const auto *SymA = RefA ? cast<MCSymbolWasm>(&RefA->getSymbol()) : nullptr;
+
+ bool ViaWeakRef = false;
+ if (SymA && SymA->isVariable()) {
+ const MCExpr *Expr = SymA->getVariableValue();
+ if (const auto *Inner = dyn_cast<MCSymbolRefExpr>(Expr)) {
+ if (Inner->getKind() == MCSymbolRefExpr::VK_WEAKREF) {
+ SymA = cast<MCSymbolWasm>(&Inner->getSymbol());
+ ViaWeakRef = true;
+ }
+ }
+ }
+
+ // Put any constant offset in an addend. Offsets can be negative, and
+ // LLVM expects wrapping, in contrast to wasm's immediates which can't
+ // be negative and don't wrap.
+ FixedValue = 0;
+
+ if (SymA) {
+ if (ViaWeakRef)
+ llvm_unreachable("weakref used in reloc not yet implemented");
+ else
+ SymA->setUsedInReloc();
+ }
+
+ if (RefA) {
+ if (RefA->getKind() == MCSymbolRefExpr::VK_WebAssembly_TYPEINDEX) {
+ assert(C == 0);
+ WasmRelocationEntry Rec(FixupOffset, SymA, C,
+ wasm::R_WEBASSEMBLY_TYPE_INDEX_LEB,
+ &FixupSection);
+ TypeIndexFixups.push_back(Rec);
+ return;
+ }
+ }
+
+ unsigned Type = getRelocType(Ctx, Target, Fixup, IsPCRel);
+
+ WasmRelocationEntry Rec(FixupOffset, SymA, C, Type, &FixupSection);
+
+ if (FixupSection.hasInstructions())
+ CodeRelocations.push_back(Rec);
+ else
+ DataRelocations.push_back(Rec);
+}
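+// Worked example of the B-folding above (hypothetical offsets): a non-pcrel
+// fixup at section offset 40 computing A - B + 4, with B defined at offset
+// 16 of the same section, gives K = 16 - 40 = -24, so C becomes 28 and the
+// record is emitted in the pcrel form (A + 28 - R).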
+
+namespace {
+
+// The signature of a wasm function, in a struct capable of being used as a
+// DenseMap key.
+struct WasmFunctionType {
+ // Support empty and tombstone instances, needed by DenseMap.
+ enum { Plain, Empty, Tombstone } State;
+
+ // The return types of the function.
+ SmallVector<wasm::ValType, 1> Returns;
+
+ // The parameter types of the function.
+ SmallVector<wasm::ValType, 4> Params;
+
+ WasmFunctionType() : State(Plain) {}
+
+ bool operator==(const WasmFunctionType &Other) const {
+ return State == Other.State && Returns == Other.Returns &&
+ Params == Other.Params;
+ }
+};
+
+// Traits for using WasmFunctionType in a DenseMap.
+struct WasmFunctionTypeDenseMapInfo {
+ static WasmFunctionType getEmptyKey() {
+ WasmFunctionType FuncTy;
+ FuncTy.State = WasmFunctionType::Empty;
+ return FuncTy;
+ }
+ static WasmFunctionType getTombstoneKey() {
+ WasmFunctionType FuncTy;
+ FuncTy.State = WasmFunctionType::Tombstone;
+ return FuncTy;
+ }
+ static unsigned getHashValue(const WasmFunctionType &FuncTy) {
+ uintptr_t Value = FuncTy.State;
+ for (wasm::ValType Ret : FuncTy.Returns)
+ Value += DenseMapInfo<int32_t>::getHashValue(int32_t(Ret));
+ for (wasm::ValType Param : FuncTy.Params)
+ Value += DenseMapInfo<int32_t>::getHashValue(int32_t(Param));
+ return Value;
+ }
+ static bool isEqual(const WasmFunctionType &LHS,
+ const WasmFunctionType &RHS) {
+ return LHS == RHS;
+ }
+};
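+// Usage sketch: the traits above let signatures key a DenseMap directly,
+//   DenseMap<WasmFunctionType, int32_t, WasmFunctionTypeDenseMapInfo> Map;
+// so the first insertion of a given signature assigns it the next free type
+// index and later insertions of an equal signature reuse it.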
+
+// A wasm import to be written into the import section.
+struct WasmImport {
+ StringRef ModuleName;
+ StringRef FieldName;
+ unsigned Kind;
+ int32_t Type;
+};
+
+// A wasm function to be written into the function section.
+struct WasmFunction {
+ int32_t Type;
+ const MCSymbolWasm *Sym;
+};
+
+// A wasm export to be written into the export section.
+struct WasmExport {
+ StringRef FieldName;
+ unsigned Kind;
+ uint32_t Index;
+};
+
+// A wasm global to be written into the global section.
+struct WasmGlobal {
+ wasm::ValType Type;
+ bool IsMutable;
+ bool HasImport;
+ uint64_t InitialValue;
+ uint32_t ImportIndex;
+};
+
+} // end anonymous namespace
+
+// Write X as an (unsigned) LEB value at offset Offset in Stream, padded
+// to allow patching.
+static void
+WritePatchableLEB(raw_pwrite_stream &Stream, uint32_t X, uint64_t Offset) {
+ uint8_t Buffer[5];
+ unsigned Padding = PaddingFor5ByteULEB128(X);
+ unsigned SizeLen = encodeULEB128(X, Buffer, Padding);
+ assert(SizeLen == 5);
+ Stream.pwrite((char *)Buffer, SizeLen, Offset);
+}
+
+// Write X as a signed LEB value at offset Offset in Stream, padded
+// to allow patching.
+static void
+WritePatchableSLEB(raw_pwrite_stream &Stream, int32_t X, uint64_t Offset) {
+ uint8_t Buffer[5];
+ unsigned Padding = PaddingFor5ByteSLEB128(X);
+ unsigned SizeLen = encodeSLEB128(X, Buffer, Padding);
+ assert(SizeLen == 5);
+ Stream.pwrite((char *)Buffer, SizeLen, Offset);
+}
+
+// Write X as a plain integer value at offset Offset in Stream.
+static void WriteI32(raw_pwrite_stream &Stream, uint32_t X, uint64_t Offset) {
+ uint8_t Buffer[4];
+ support::endian::write32le(Buffer, X);
+ Stream.pwrite((char *)Buffer, sizeof(Buffer), Offset);
+}
+
+// Compute a value to write into the code at the location covered
+// by RelEntry. This value isn't used by the static linker, since
+// we have addends; it just serves to make the code more readable
+// and to make standalone wasm modules directly usable.
+static uint32_t ProvisionalValue(const WasmRelocationEntry &RelEntry) {
+ const MCSymbolWasm *Sym = RelEntry.Symbol;
+
+ // For undefined symbols, use a hopefully invalid value.
+ if (!Sym->isDefined(false))
+ return UINT32_MAX;
+
+ MCSectionWasm &Section =
+ cast<MCSectionWasm>(RelEntry.Symbol->getSection(false));
+ uint64_t Address = Section.getSectionOffset() + RelEntry.Addend;
+
+ // Ignore overflow. LLVM allows address arithmetic to silently wrap.
+ uint32_t Value = Address;
+
+ return Value;
+}
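+// For example (hypothetical layout): a symbol whose single-variable data
+// section sits at offset 0x10, with addend 8, yields 0x18; an undefined
+// symbol yields UINT32_MAX so stray uses are easy to spot.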
+
+// Apply the portions of the relocation records that we can handle ourselves
+// directly.
+static void ApplyRelocations(
+ ArrayRef<WasmRelocationEntry> Relocations,
+ raw_pwrite_stream &Stream,
+ DenseMap<const MCSymbolWasm *, uint32_t> &SymbolIndices,
+ uint64_t ContentsOffset)
+{
+ for (const WasmRelocationEntry &RelEntry : Relocations) {
+ uint64_t Offset = ContentsOffset +
+ RelEntry.FixupSection->getSectionOffset() +
+ RelEntry.Offset;
+ switch (RelEntry.Type) {
+ case wasm::R_WEBASSEMBLY_FUNCTION_INDEX_LEB: {
+ uint32_t Index = SymbolIndices[RelEntry.Symbol];
+ assert(RelEntry.Addend == 0);
+
+ WritePatchableLEB(Stream, Index, Offset);
+ break;
+ }
+ case wasm::R_WEBASSEMBLY_TABLE_INDEX_SLEB: {
+ uint32_t Index = SymbolIndices[RelEntry.Symbol];
+ assert(RelEntry.Addend == 0);
+
+ WritePatchableSLEB(Stream, Index, Offset);
+ break;
+ }
+ case wasm::R_WEBASSEMBLY_GLOBAL_ADDR_SLEB: {
+ uint32_t Value = ProvisionalValue(RelEntry);
+
+ WritePatchableSLEB(Stream, Value, Offset);
+ break;
+ }
+ case wasm::R_WEBASSEMBLY_GLOBAL_ADDR_LEB: {
+ uint32_t Value = ProvisionalValue(RelEntry);
+
+ WritePatchableLEB(Stream, Value, Offset);
+ break;
+ }
+ case wasm::R_WEBASSEMBLY_TABLE_INDEX_I32: {
+ uint32_t Index = SymbolIndices[RelEntry.Symbol];
+ assert(RelEntry.Addend == 0);
+
+ WriteI32(Stream, Index, Offset);
+ break;
+ }
+ case wasm::R_WEBASSEMBLY_GLOBAL_ADDR_I32: {
+ uint32_t Value = ProvisionalValue(RelEntry);
+
+ WriteI32(Stream, Value, Offset);
+ break;
+ }
+ default:
+ break;
+ }
+ }
+}
+
+// Write out the portions of the relocation records that the linker will
+// need to handle.
+static void WriteRelocations(
+ ArrayRef<WasmRelocationEntry> Relocations,
+ raw_pwrite_stream &Stream,
+ DenseMap<const MCSymbolWasm *, uint32_t> &SymbolIndices)
+{
+  for (const WasmRelocationEntry &RelEntry : Relocations) {
+ encodeULEB128(RelEntry.Type, Stream);
+
+ uint64_t Offset = RelEntry.Offset +
+ RelEntry.FixupSection->getSectionOffset();
+ uint32_t Index = SymbolIndices[RelEntry.Symbol];
+ int64_t Addend = RelEntry.Addend;
+
+ switch (RelEntry.Type) {
+ case wasm::R_WEBASSEMBLY_FUNCTION_INDEX_LEB:
+ case wasm::R_WEBASSEMBLY_TABLE_INDEX_SLEB:
+ case wasm::R_WEBASSEMBLY_TABLE_INDEX_I32:
+ encodeULEB128(Offset, Stream);
+ encodeULEB128(Index, Stream);
+ assert(Addend == 0 && "addends not supported for functions");
+ break;
+ case wasm::R_WEBASSEMBLY_GLOBAL_ADDR_LEB:
+ case wasm::R_WEBASSEMBLY_GLOBAL_ADDR_SLEB:
+ case wasm::R_WEBASSEMBLY_GLOBAL_ADDR_I32:
+ encodeULEB128(Offset, Stream);
+ encodeULEB128(Index, Stream);
+ encodeSLEB128(Addend, Stream);
+ break;
+ default:
+ llvm_unreachable("unsupported relocation type");
+ }
+ }
+}
+
+// Write out the type relocation records that the linker will
+// need to handle.
+static void WriteTypeRelocations(
+ ArrayRef<WasmRelocationEntry> TypeIndexFixups,
+ ArrayRef<uint32_t> TypeIndexFixupTypes,
+ raw_pwrite_stream &Stream)
+{
+ for (size_t i = 0, e = TypeIndexFixups.size(); i < e; ++i) {
+ const WasmRelocationEntry &Fixup = TypeIndexFixups[i];
+ uint32_t Type = TypeIndexFixupTypes[i];
+
+ assert(Fixup.Type == wasm::R_WEBASSEMBLY_TYPE_INDEX_LEB);
+ assert(Fixup.Addend == 0);
+
+ uint64_t Offset = Fixup.Offset +
+ Fixup.FixupSection->getSectionOffset();
+
+ encodeULEB128(Fixup.Type, Stream);
+ encodeULEB128(Offset, Stream);
+ encodeULEB128(Type, Stream);
+ }
+}
+
+void WasmObjectWriter::writeObject(MCAssembler &Asm,
+ const MCAsmLayout &Layout) {
+ MCContext &Ctx = Asm.getContext();
+ wasm::ValType PtrType = is64Bit() ? wasm::ValType::I64 : wasm::ValType::I32;
+
+ // Collect information from the available symbols.
+ DenseMap<WasmFunctionType, int32_t, WasmFunctionTypeDenseMapInfo>
+ FunctionTypeIndices;
+ SmallVector<WasmFunctionType, 4> FunctionTypes;
+ SmallVector<WasmFunction, 4> Functions;
+ SmallVector<uint32_t, 4> TableElems;
+ SmallVector<WasmGlobal, 4> Globals;
+ SmallVector<WasmImport, 4> Imports;
+ SmallVector<WasmExport, 4> Exports;
+ DenseMap<const MCSymbolWasm *, uint32_t> SymbolIndices;
+ SmallPtrSet<const MCSymbolWasm *, 4> IsAddressTaken;
+ unsigned NumFuncImports = 0;
+ unsigned NumGlobalImports = 0;
+ SmallVector<char, 0> DataBytes;
+ uint32_t StackPointerGlobal = 0;
+ bool HasStackPointer = false;
+
+ // Populate the IsAddressTaken set.
+  for (const WasmRelocationEntry &RelEntry : CodeRelocations) {
+ switch (RelEntry.Type) {
+ case wasm::R_WEBASSEMBLY_TABLE_INDEX_SLEB:
+ case wasm::R_WEBASSEMBLY_GLOBAL_ADDR_SLEB:
+ IsAddressTaken.insert(RelEntry.Symbol);
+ break;
+ default:
+ break;
+ }
+ }
+  for (const WasmRelocationEntry &RelEntry : DataRelocations) {
+ switch (RelEntry.Type) {
+ case wasm::R_WEBASSEMBLY_TABLE_INDEX_I32:
+ case wasm::R_WEBASSEMBLY_GLOBAL_ADDR_I32:
+ IsAddressTaken.insert(RelEntry.Symbol);
+ break;
+ default:
+ break;
+ }
+ }
+
+ // Populate the Imports set.
+ for (const MCSymbol &S : Asm.symbols()) {
+ const auto &WS = static_cast<const MCSymbolWasm &>(S);
+ int32_t Type;
+
+ if (WS.isFunction()) {
+ // Prepare the function's type, if we haven't seen it yet.
+ WasmFunctionType F;
+ F.Returns = WS.getReturns();
+ F.Params = WS.getParams();
+ auto Pair =
+ FunctionTypeIndices.insert(std::make_pair(F, FunctionTypes.size()));
+ if (Pair.second)
+ FunctionTypes.push_back(F);
+
+ Type = Pair.first->second;
+ } else {
+ Type = int32_t(PtrType);
+ }
+
+ // If the symbol is not defined in this translation unit, import it.
+ if (!WS.isTemporary() && !WS.isDefined(/*SetUsed=*/false)) {
+ WasmImport Import;
+ Import.ModuleName = WS.getModuleName();
+ Import.FieldName = WS.getName();
+
+ if (WS.isFunction()) {
+ Import.Kind = wasm::WASM_EXTERNAL_FUNCTION;
+ Import.Type = Type;
+ SymbolIndices[&WS] = NumFuncImports;
+ ++NumFuncImports;
+ } else {
+ Import.Kind = wasm::WASM_EXTERNAL_GLOBAL;
+ Import.Type = Type;
+ SymbolIndices[&WS] = NumGlobalImports;
+ ++NumGlobalImports;
+ }
+
+ Imports.push_back(Import);
+ }
+ }
+
+ // In the special .global_variables section, we've encoded global
+ // variables used by the function. Translate them into the Globals
+ // list.
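+  // Each record is one type byte, one mutability byte and one has-import
+  // byte, followed either by two NUL-terminated names (module, base) for
+  // imports or by an SLEB128 initial value. For example, assuming the type
+  // byte encodes wasm::ValType::I32, the record <i32> 01 00 2A decodes as
+  // a mutable i32 initialized to 42.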
+ MCSectionWasm *GlobalVars = Ctx.getWasmSection(".global_variables", 0, 0);
+ if (!GlobalVars->getFragmentList().empty()) {
+ if (GlobalVars->getFragmentList().size() != 1)
+ report_fatal_error("only one .global_variables fragment supported");
+ const MCFragment &Frag = *GlobalVars->begin();
+ if (Frag.hasInstructions() || Frag.getKind() != MCFragment::FT_Data)
+ report_fatal_error("only data supported in .global_variables");
+ const MCDataFragment &DataFrag = cast<MCDataFragment>(Frag);
+ if (!DataFrag.getFixups().empty())
+ report_fatal_error("fixups not supported in .global_variables");
+ const SmallVectorImpl<char> &Contents = DataFrag.getContents();
+ for (const uint8_t *p = (const uint8_t *)Contents.data(),
+ *end = (const uint8_t *)Contents.data() + Contents.size();
+ p != end; ) {
+ WasmGlobal G;
+ if (end - p < 3)
+ report_fatal_error("truncated global variable encoding");
+ G.Type = wasm::ValType(int8_t(*p++));
+ G.IsMutable = bool(*p++);
+ G.HasImport = bool(*p++);
+ if (G.HasImport) {
+ G.InitialValue = 0;
+
+ WasmImport Import;
+ Import.ModuleName = (const char *)p;
+ const uint8_t *nul = (const uint8_t *)memchr(p, '\0', end - p);
+ if (!nul)
+ report_fatal_error("global module name must be nul-terminated");
+ p = nul + 1;
+ nul = (const uint8_t *)memchr(p, '\0', end - p);
+ if (!nul)
+ report_fatal_error("global base name must be nul-terminated");
+ Import.FieldName = (const char *)p;
+ p = nul + 1;
+
+ Import.Kind = wasm::WASM_EXTERNAL_GLOBAL;
+ Import.Type = int32_t(G.Type);
+
+ G.ImportIndex = NumGlobalImports;
+ ++NumGlobalImports;
+
+ Imports.push_back(Import);
+ } else {
+ unsigned n;
+ G.InitialValue = decodeSLEB128(p, &n);
+ G.ImportIndex = 0;
+ if ((ptrdiff_t)n > end - p)
+ report_fatal_error("global initial value must be valid SLEB128");
+ p += n;
+ }
+ Globals.push_back(G);
+ }
+ }
+
+ // In the special .stack_pointer section, we've encoded the stack pointer
+ // index.
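+  // The encoding is a single int32 read in host byte order; for example
+  // (hypothetical contents, little-endian host): bytes 02 00 00 00 with
+  // three global imports already recorded give StackPointerGlobal = 3 + 2 = 5.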
+ MCSectionWasm *StackPtr = Ctx.getWasmSection(".stack_pointer", 0, 0);
+ if (!StackPtr->getFragmentList().empty()) {
+ if (StackPtr->getFragmentList().size() != 1)
+ report_fatal_error("only one .stack_pointer fragment supported");
+ const MCFragment &Frag = *StackPtr->begin();
+ if (Frag.hasInstructions() || Frag.getKind() != MCFragment::FT_Data)
+ report_fatal_error("only data supported in .stack_pointer");
+ const MCDataFragment &DataFrag = cast<MCDataFragment>(Frag);
+ if (!DataFrag.getFixups().empty())
+ report_fatal_error("fixups not supported in .stack_pointer");
+ const SmallVectorImpl<char> &Contents = DataFrag.getContents();
+ if (Contents.size() != 4)
+ report_fatal_error("only one entry supported in .stack_pointer");
+ HasStackPointer = true;
+ StackPointerGlobal = NumGlobalImports + *(const int32_t *)Contents.data();
+ }
+
+ // Handle defined symbols.
+ for (const MCSymbol &S : Asm.symbols()) {
+ // Ignore unnamed temporary symbols, which aren't ever exported, imported,
+ // or used in relocations.
+ if (S.isTemporary() && S.getName().empty())
+ continue;
+ const auto &WS = static_cast<const MCSymbolWasm &>(S);
+ unsigned Index;
+ if (WS.isFunction()) {
+ // Prepare the function's type, if we haven't seen it yet.
+ WasmFunctionType F;
+ F.Returns = WS.getReturns();
+ F.Params = WS.getParams();
+ auto Pair =
+ FunctionTypeIndices.insert(std::make_pair(F, FunctionTypes.size()));
+ if (Pair.second)
+ FunctionTypes.push_back(F);
+
+ int32_t Type = Pair.first->second;
+
+ if (WS.isDefined(/*SetUsed=*/false)) {
+ // A definition. Take the next available index.
+ Index = NumFuncImports + Functions.size();
+
+ // Prepare the function.
+ WasmFunction Func;
+ Func.Type = Type;
+ Func.Sym = &WS;
+ SymbolIndices[&WS] = Index;
+ Functions.push_back(Func);
+ } else {
+ // An import; the index was assigned above.
+ Index = SymbolIndices.find(&WS)->second;
+ }
+
+ // If needed, prepare the function to be called indirectly.
+ if (IsAddressTaken.count(&WS))
+ TableElems.push_back(Index);
+ } else {
+ // For now, ignore temporary non-function symbols.
+ if (S.isTemporary())
+ continue;
+
+ if (WS.getOffset() != 0)
+ report_fatal_error("data sections must contain one variable each");
+ if (!WS.getSize())
+ report_fatal_error("data symbols must have a size set with .size");
+
+ int64_t Size = 0;
+ if (!WS.getSize()->evaluateAsAbsolute(Size, Layout))
+ report_fatal_error(".size expression must be evaluatable");
+
+ if (WS.isDefined(false)) {
+ MCSectionWasm &DataSection =
+ static_cast<MCSectionWasm &>(WS.getSection());
+
+ if (uint64_t(Size) != Layout.getSectionFileSize(&DataSection))
+ report_fatal_error("data sections must contain at most one variable");
+
+ DataBytes.resize(alignTo(DataBytes.size(), DataSection.getAlignment()));
+
+ DataSection.setSectionOffset(DataBytes.size());
+
+ for (MCSection::iterator I = DataSection.begin(), E = DataSection.end();
+ I != E; ++I) {
+ const MCFragment &Frag = *I;
+ if (Frag.hasInstructions())
+ report_fatal_error("only data supported in data sections");
+
+ if (const MCAlignFragment *Align = dyn_cast<MCAlignFragment>(&Frag)) {
+ if (Align->getValueSize() != 1)
+ report_fatal_error("only byte values supported for alignment");
+ // If nops are requested, use zeros, as this is the data section.
+ uint8_t Value = Align->hasEmitNops() ? 0 : Align->getValue();
+ uint64_t Size = std::min<uint64_t>(alignTo(DataBytes.size(),
+ Align->getAlignment()),
+ DataBytes.size() +
+ Align->getMaxBytesToEmit());
+ DataBytes.resize(Size, Value);
+ } else if (const MCFillFragment *Fill =
+ dyn_cast<MCFillFragment>(&Frag)) {
+ DataBytes.insert(DataBytes.end(), Size, Fill->getValue());
+ } else {
+ const MCDataFragment &DataFrag = cast<MCDataFragment>(Frag);
+ const SmallVectorImpl<char> &Contents = DataFrag.getContents();
+
+ DataBytes.insert(DataBytes.end(), Contents.begin(), Contents.end());
+ }
+ }
+
+ // For each external global, prepare a corresponding wasm global
+ // holding its address.
+ if (WS.isExternal()) {
+ Index = NumGlobalImports + Globals.size();
+
+ WasmGlobal Global;
+ Global.Type = PtrType;
+ Global.IsMutable = false;
+ Global.HasImport = false;
+ Global.InitialValue = DataSection.getSectionOffset();
+ Global.ImportIndex = 0;
+ SymbolIndices[&WS] = Index;
+ Globals.push_back(Global);
+ }
+ }
+ }
+
+ // If the symbol is visible outside this translation unit, export it.
+ if (WS.isExternal()) {
+ assert(WS.isDefined(false));
+ WasmExport Export;
+ Export.FieldName = WS.getName();
+ Export.Index = Index;
+
+ if (WS.isFunction())
+ Export.Kind = wasm::WASM_EXTERNAL_FUNCTION;
+ else
+ Export.Kind = wasm::WASM_EXTERNAL_GLOBAL;
+
+ Exports.push_back(Export);
+ }
+ }
+
+ // Add types for indirect function calls.
+ for (const WasmRelocationEntry &Fixup : TypeIndexFixups) {
+ assert(Fixup.Addend == 0);
+ assert(Fixup.Type == wasm::R_WEBASSEMBLY_TYPE_INDEX_LEB);
+
+ WasmFunctionType F;
+ F.Returns = Fixup.Symbol->getReturns();
+ F.Params = Fixup.Symbol->getParams();
+ auto Pair =
+ FunctionTypeIndices.insert(std::make_pair(F, FunctionTypes.size()));
+ if (Pair.second)
+ FunctionTypes.push_back(F);
+
+ TypeIndexFixupTypes.push_back(Pair.first->second);
+ }
+
+ // Write out the Wasm header.
+ writeHeader(Asm);
+
+ SectionBookkeeping Section;
+
+ // === Type Section =========================================================
+ if (!FunctionTypes.empty()) {
+ startSection(Section, wasm::WASM_SEC_TYPE);
+
+ encodeULEB128(FunctionTypes.size(), getStream());
+
+ for (WasmFunctionType &FuncTy : FunctionTypes) {
+ encodeSLEB128(wasm::WASM_TYPE_FUNC, getStream());
+ encodeULEB128(FuncTy.Params.size(), getStream());
+ for (wasm::ValType Ty : FuncTy.Params)
+ writeValueType(Ty);
+ encodeULEB128(FuncTy.Returns.size(), getStream());
+ for (wasm::ValType Ty : FuncTy.Returns)
+ writeValueType(Ty);
+ }
+
+ endSection(Section);
+ }
+
+ // === Import Section ========================================================
+ if (!Imports.empty()) {
+ startSection(Section, wasm::WASM_SEC_IMPORT);
+
+ encodeULEB128(Imports.size(), getStream());
+ for (const WasmImport &Import : Imports) {
+ StringRef ModuleName = Import.ModuleName;
+ encodeULEB128(ModuleName.size(), getStream());
+ writeBytes(ModuleName);
+
+ StringRef FieldName = Import.FieldName;
+ encodeULEB128(FieldName.size(), getStream());
+ writeBytes(FieldName);
+
+ encodeULEB128(Import.Kind, getStream());
+
+ switch (Import.Kind) {
+ case wasm::WASM_EXTERNAL_FUNCTION:
+ encodeULEB128(Import.Type, getStream());
+ break;
+ case wasm::WASM_EXTERNAL_GLOBAL:
+ encodeSLEB128(int32_t(Import.Type), getStream());
+ encodeULEB128(0, getStream()); // mutability
+ break;
+ default:
+ llvm_unreachable("unsupported import kind");
+ }
+ }
+
+ endSection(Section);
+ }
+
+ // === Function Section ======================================================
+ if (!Functions.empty()) {
+ startSection(Section, wasm::WASM_SEC_FUNCTION);
+
+ encodeULEB128(Functions.size(), getStream());
+ for (const WasmFunction &Func : Functions)
+ encodeULEB128(Func.Type, getStream());
+
+ endSection(Section);
+ }
+
+ // === Table Section =========================================================
+ // For now, always emit the table section, since indirect calls are not
+ // valid without it. In the future, we could perhaps be more clever and omit
+ // it if there are no indirect calls.
+ startSection(Section, wasm::WASM_SEC_TABLE);
+
+ // The number of tables, fixed to 1 for now.
+ encodeULEB128(1, getStream());
+
+ encodeSLEB128(wasm::WASM_TYPE_ANYFUNC, getStream());
+
+ encodeULEB128(0, getStream()); // flags
+ encodeULEB128(TableElems.size(), getStream()); // initial
+
+ endSection(Section);
+
+ // === Memory Section ========================================================
+ // For now, always emit the memory section, since loads and stores are not
+ // valid without it. In the future, we could perhaps be more clever and omit
+ // it if there are no loads or stores.
+ startSection(Section, wasm::WASM_SEC_MEMORY);
+
+ encodeULEB128(1, getStream()); // number of memory spaces
+
+ encodeULEB128(0, getStream()); // flags
+ encodeULEB128(DataBytes.size(), getStream()); // initial
+
+ endSection(Section);
+
+ // === Global Section ========================================================
+ if (!Globals.empty()) {
+ startSection(Section, wasm::WASM_SEC_GLOBAL);
+
+ encodeULEB128(Globals.size(), getStream());
+ for (const WasmGlobal &Global : Globals) {
+ writeValueType(Global.Type);
+ write8(Global.IsMutable);
+
+ if (Global.HasImport) {
+ assert(Global.InitialValue == 0);
+ write8(wasm::WASM_OPCODE_GET_GLOBAL);
+ encodeULEB128(Global.ImportIndex, getStream());
+ } else {
+ assert(Global.ImportIndex == 0);
+ write8(wasm::WASM_OPCODE_I32_CONST);
+ encodeSLEB128(Global.InitialValue, getStream()); // offset
+ }
+ write8(wasm::WASM_OPCODE_END);
+ }
+
+ endSection(Section);
+ }
+
+ // === Export Section ========================================================
+ if (!Exports.empty()) {
+ startSection(Section, wasm::WASM_SEC_EXPORT);
+
+ encodeULEB128(Exports.size(), getStream());
+ for (const WasmExport &Export : Exports) {
+ encodeULEB128(Export.FieldName.size(), getStream());
+ writeBytes(Export.FieldName);
+
+ encodeSLEB128(Export.Kind, getStream());
+
+ encodeULEB128(Export.Index, getStream());
+ }
+
+ endSection(Section);
+ }
+
+#if 0 // TODO: Start Section
+ if (HaveStartFunction) {
+ // === Start Section =========================================================
+ startSection(Section, wasm::WASM_SEC_START);
+
+ encodeSLEB128(StartFunction, getStream());
+
+ endSection(Section);
+ }
+#endif
+
+ // === Elem Section ==========================================================
+ if (!TableElems.empty()) {
+ startSection(Section, wasm::WASM_SEC_ELEM);
+
+ encodeULEB128(1, getStream()); // number of "segments"
+ encodeULEB128(0, getStream()); // the table index
+
+ // init expr for starting offset
+ write8(wasm::WASM_OPCODE_I32_CONST);
+ encodeSLEB128(0, getStream());
+ write8(wasm::WASM_OPCODE_END);
+
+ encodeULEB128(TableElems.size(), getStream());
+ for (uint32_t Elem : TableElems)
+ encodeULEB128(Elem, getStream());
+
+ endSection(Section);
+ }
+
+ // === Code Section ==========================================================
+ if (!Functions.empty()) {
+ startSection(Section, wasm::WASM_SEC_CODE);
+
+ encodeULEB128(Functions.size(), getStream());
+
+ for (const WasmFunction &Func : Functions) {
+ MCSectionWasm &FuncSection =
+ static_cast<MCSectionWasm &>(Func.Sym->getSection());
+
+ if (Func.Sym->isVariable())
+ report_fatal_error("weak symbols not supported yet");
+
+ if (Func.Sym->getOffset() != 0)
+ report_fatal_error("function sections must contain one function each");
+
+ if (!Func.Sym->getSize())
+ report_fatal_error("function symbols must have a size set with .size");
+
+ int64_t Size = 0;
+ if (!Func.Sym->getSize()->evaluateAsAbsolute(Size, Layout))
+ report_fatal_error(".size expression must be evaluatable");
+
+ encodeULEB128(Size, getStream());
+
+ FuncSection.setSectionOffset(getStream().tell() -
+ Section.ContentsOffset);
+
+ Asm.writeSectionData(&FuncSection, Layout);
+ }
+
+    // Apply the type index fixups for call_indirect and similar instructions.
+ for (size_t i = 0, e = TypeIndexFixups.size(); i < e; ++i) {
+ uint32_t Type = TypeIndexFixupTypes[i];
+ unsigned Padding = PaddingFor5ByteULEB128(Type);
+
+ const WasmRelocationEntry &Fixup = TypeIndexFixups[i];
+ assert(Fixup.Addend == 0);
+ assert(Fixup.Type == wasm::R_WEBASSEMBLY_TYPE_INDEX_LEB);
+ uint64_t Offset = Fixup.Offset +
+ Fixup.FixupSection->getSectionOffset();
+
+ uint8_t Buffer[16];
+ unsigned SizeLen = encodeULEB128(Type, Buffer, Padding);
+ assert(SizeLen == 5);
+ getStream().pwrite((char *)Buffer, SizeLen,
+ Section.ContentsOffset + Offset);
+ }
+
+ // Apply fixups.
+ ApplyRelocations(CodeRelocations, getStream(), SymbolIndices,
+ Section.ContentsOffset);
+
+ endSection(Section);
+ }
+
+ // === Data Section ==========================================================
+ if (!DataBytes.empty()) {
+ startSection(Section, wasm::WASM_SEC_DATA);
+
+ encodeULEB128(1, getStream()); // count
+ encodeULEB128(0, getStream()); // memory index
+ write8(wasm::WASM_OPCODE_I32_CONST);
+ encodeSLEB128(0, getStream()); // offset
+ write8(wasm::WASM_OPCODE_END);
+ encodeULEB128(DataBytes.size(), getStream()); // size
+ writeBytes(DataBytes); // data
+
+ // Apply fixups.
+ ApplyRelocations(DataRelocations, getStream(), SymbolIndices,
+ Section.ContentsOffset);
+
+ endSection(Section);
+ }
+
+ // === Name Section ==========================================================
+ uint32_t TotalFunctions = NumFuncImports + Functions.size();
+ if (TotalFunctions != 0) {
+ startSection(Section, wasm::WASM_SEC_CUSTOM, "name");
+ SectionBookkeeping SubSection;
+ startSection(SubSection, wasm::WASM_NAMES_FUNCTION);
+
+ encodeULEB128(TotalFunctions, getStream());
+ uint32_t Index = 0;
+ for (const WasmImport &Import : Imports) {
+ if (Import.Kind == wasm::WASM_EXTERNAL_FUNCTION) {
+ encodeULEB128(Index, getStream());
+ encodeULEB128(Import.FieldName.size(), getStream());
+ writeBytes(Import.FieldName);
+ ++Index;
+ }
+ }
+ for (const WasmFunction &Func : Functions) {
+ encodeULEB128(Index, getStream());
+ encodeULEB128(Func.Sym->getName().size(), getStream());
+ writeBytes(Func.Sym->getName());
+ ++Index;
+ }
+
+ endSection(SubSection);
+ endSection(Section);
+ }
+
+ // See: https://github.com/WebAssembly/tool-conventions/blob/master/Linking.md
+ // for descriptions of the reloc sections.
+
+ // === Code Reloc Section ====================================================
+ if (!CodeRelocations.empty()) {
+ startSection(Section, wasm::WASM_SEC_CUSTOM, "reloc.CODE");
+
+ encodeULEB128(wasm::WASM_SEC_CODE, getStream());
+
+ encodeULEB128(CodeRelocations.size(), getStream());
+
+ WriteRelocations(CodeRelocations, getStream(), SymbolIndices);
+ WriteTypeRelocations(TypeIndexFixups, TypeIndexFixupTypes, getStream());
+
+ endSection(Section);
+ }
+
+ // === Data Reloc Section ====================================================
+ if (!DataRelocations.empty()) {
+ startSection(Section, wasm::WASM_SEC_CUSTOM, "reloc.DATA");
+
+ encodeULEB128(wasm::WASM_SEC_DATA, getStream());
+
+ encodeULEB128(DataRelocations.size(), getStream());
+
+ WriteRelocations(DataRelocations, getStream(), SymbolIndices);
+
+ endSection(Section);
+ }
+
+ // === Linking Metadata Section ==============================================
+ if (HasStackPointer) {
+ startSection(Section, wasm::WASM_SEC_CUSTOM, "linking");
+
+ encodeULEB128(1, getStream()); // count
+
+ encodeULEB128(wasm::WASM_STACK_POINTER, getStream()); // type
+ encodeULEB128(StackPointerGlobal, getStream()); // id
+
+ endSection(Section);
+ }
+
+ // TODO: Translate the .comment section to the output.
+
+ // TODO: Translate debug sections to the output.
+}
+
+MCObjectWriter *llvm::createWasmObjectWriter(MCWasmObjectTargetWriter *MOTW,
+ raw_pwrite_stream &OS) {
+ return new WasmObjectWriter(MOTW, OS);
+}
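
Aside on the mechanism above: the type-index fixups work only because each patchable immediate was emitted as a fixed five-byte ULEB128 (PaddingFor5ByteULEB128 computes the filler), so the later pwrite can splice in the real index without moving any bytes. A minimal freestanding sketch of such a padded encoder; it is not LLVM's encodeULEB128, though that helper takes an equivalent padding argument:

    // Sketch only: encode Value as ULEB128, emitting continuation bytes
    // until the total length reaches PadTo.
    #include <cassert>
    #include <cstdint>
    #include <cstdio>

    static unsigned encodeULEB128Padded(uint64_t Value, uint8_t *Buf,
                                        unsigned PadTo) {
      unsigned N = 0;
      do {
        uint8_t Byte = Value & 0x7f;
        Value >>= 7;
        if (Value != 0 || N + 1 < PadTo)
          Byte |= 0x80; // mark that more bytes follow
        Buf[N++] = Byte;
      } while (Value != 0 || N < PadTo);
      return N;
    }

    int main() {
      uint8_t Buf[5];
      // Even a tiny type index occupies all five bytes, so a later fixup
      // can overwrite the real value in place without shifting the code.
      unsigned Len = encodeULEB128Padded(3, Buf, 5);
      assert(Len == 5);
      for (unsigned I = 0; I < Len; ++I)
        std::printf("%02x ", Buf[I]); // 83 80 80 80 00
      std::printf("\n");
    }
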
diff --git a/lib/MC/WinCOFFObjectWriter.cpp b/lib/MC/WinCOFFObjectWriter.cpp
index afc5c6a14d11..da8fe73f823b 100644
--- a/lib/MC/WinCOFFObjectWriter.cpp
+++ b/lib/MC/WinCOFFObjectWriter.cpp
@@ -1,4 +1,4 @@
-//===-- llvm/MC/WinCOFFObjectWriter.cpp -------------------------*- C++ -*-===//
+//===- llvm/MC/WinCOFFObjectWriter.cpp ------------------------------------===//
//
// The LLVM Compiler Infrastructure
//
@@ -11,37 +11,49 @@
//
//===----------------------------------------------------------------------===//
-#include "llvm/MC/MCWinCOFFObjectWriter.h"
#include "llvm/ADT/DenseMap.h"
+#include "llvm/ADT/SmallString.h"
+#include "llvm/ADT/SmallVector.h"
#include "llvm/ADT/STLExtras.h"
-#include "llvm/ADT/StringMap.h"
#include "llvm/ADT/StringRef.h"
#include "llvm/ADT/Twine.h"
-#include "llvm/Config/config.h"
#include "llvm/MC/MCAsmLayout.h"
#include "llvm/MC/MCAssembler.h"
#include "llvm/MC/MCContext.h"
#include "llvm/MC/MCExpr.h"
-#include "llvm/MC/MCObjectFileInfo.h"
+#include "llvm/MC/MCFixup.h"
+#include "llvm/MC/MCFragment.h"
#include "llvm/MC/MCObjectWriter.h"
#include "llvm/MC/MCSection.h"
#include "llvm/MC/MCSectionCOFF.h"
+#include "llvm/MC/MCSymbol.h"
#include "llvm/MC/MCSymbolCOFF.h"
#include "llvm/MC/MCValue.h"
+#include "llvm/MC/MCWinCOFFObjectWriter.h"
#include "llvm/MC/StringTableBuilder.h"
+#include "llvm/Support/Casting.h"
#include "llvm/Support/COFF.h"
-#include "llvm/Support/Debug.h"
#include "llvm/Support/Endian.h"
#include "llvm/Support/ErrorHandling.h"
#include "llvm/Support/JamCRC.h"
-#include <cstdio>
+#include "llvm/Support/MathExtras.h"
+#include "llvm/Support/raw_ostream.h"
+#include <cassert>
+#include <cstddef>
+#include <cstdint>
+#include <cstring>
#include <ctime>
+#include <memory>
+#include <string>
+#include <vector>
using namespace llvm;
+using llvm::support::endian::write32le;
#define DEBUG_TYPE "WinCOFFObjectWriter"
namespace {
+
typedef SmallString<COFF::NameSize> name;
enum AuxiliaryType {
@@ -57,25 +69,24 @@ struct AuxSymbol {
COFF::Auxiliary Aux;
};
-class COFFSymbol;
class COFFSection;
class COFFSymbol {
public:
- COFF::symbol Data;
+ COFF::symbol Data = {};
typedef SmallVector<AuxSymbol, 1> AuxiliarySymbols;
name Name;
int Index;
AuxiliarySymbols Aux;
- COFFSymbol *Other;
- COFFSection *Section;
- int Relocations;
+ COFFSymbol *Other = nullptr;
+ COFFSection *Section = nullptr;
+ int Relocations = 0;
+ const MCSymbol *MC = nullptr;
- const MCSymbol *MC;
+ COFFSymbol(StringRef Name) : Name(Name) {}
- COFFSymbol(StringRef name);
void set_name_offset(uint32_t Offset);
int64_t getIndex() const { return Index; }
@@ -89,9 +100,10 @@ public:
// This class contains staging data for a COFF relocation entry.
struct COFFRelocation {
COFF::relocation Data;
- COFFSymbol *Symb;
+ COFFSymbol *Symb = nullptr;
+
+ COFFRelocation() = default;
- COFFRelocation() : Symb(nullptr) {}
static size_t size() { return COFF::RelocationSize; }
};
@@ -99,15 +111,15 @@ typedef std::vector<COFFRelocation> relocations;
class COFFSection {
public:
- COFF::section Header;
+ COFF::section Header = {};
std::string Name;
int Number;
- MCSectionCOFF const *MCSection;
- COFFSymbol *Symbol;
+ MCSectionCOFF const *MCSection = nullptr;
+ COFFSymbol *Symbol = nullptr;
relocations Relocations;
- COFFSection(StringRef name);
+ COFFSection(StringRef Name) : Name(Name) {}
};
class WinCOFFObjectWriter : public MCObjectWriter {
@@ -121,7 +133,7 @@ public:
std::unique_ptr<MCWinCOFFObjectTargetWriter> TargetObjectWriter;
// Root level file contents.
- COFF::header Header;
+ COFF::header Header = {};
sections Sections;
symbols Symbols;
StringTableBuilder Strings{StringTableBuilder::WinCOFF};
@@ -149,9 +161,6 @@ public:
COFFSymbol *GetOrCreateCOFFSymbol(const MCSymbol *Symbol);
COFFSection *createSection(StringRef Name);
- template <typename object_t, typename list_t>
- object_t *createCOFFEntity(StringRef Name, list_t &List);
-
void defineSection(MCSectionCOFF const &Sec);
COFFSymbol *getLinkedSymbol(const MCSymbol &Symbol);
@@ -168,8 +177,12 @@ public:
void WriteFileHeader(const COFF::header &Header);
void WriteSymbol(const COFFSymbol &S);
void WriteAuxiliarySymbols(const COFFSymbol::AuxiliarySymbols &S);
- void writeSectionHeader(const COFF::section &S);
+ void writeSectionHeaders();
void WriteRelocation(const COFF::relocation &R);
+ uint32_t writeSectionContents(MCAssembler &Asm, const MCAsmLayout &Layout,
+ const MCSection &MCSec);
+ void writeSection(MCAssembler &Asm, const MCAsmLayout &Layout,
+ const COFFSection &Sec, const MCSection &MCSec);
// MCObjectWriter interface implementation.
@@ -181,45 +194,29 @@ public:
const MCFragment &FB, bool InSet,
bool IsPCRel) const override;
- bool isWeak(const MCSymbol &Sym) const override;
-
void recordRelocation(MCAssembler &Asm, const MCAsmLayout &Layout,
const MCFragment *Fragment, const MCFixup &Fixup,
MCValue Target, bool &IsPCRel,
uint64_t &FixedValue) override;
+ void createFileSymbols(MCAssembler &Asm);
+ void assignSectionNumbers();
+ void assignFileOffsets(MCAssembler &Asm, const MCAsmLayout &Layout);
+
void writeObject(MCAssembler &Asm, const MCAsmLayout &Layout) override;
};
-}
-static inline void write_uint32_le(void *Data, uint32_t Value) {
- support::endian::write<uint32_t, support::little, support::unaligned>(Data,
- Value);
-}
+} // end anonymous namespace
//------------------------------------------------------------------------------
// Symbol class implementation
-COFFSymbol::COFFSymbol(StringRef name)
- : Name(name.begin(), name.end()), Other(nullptr), Section(nullptr),
- Relocations(0), MC(nullptr) {
- memset(&Data, 0, sizeof(Data));
-}
-
// In the case that the name does not fit within 8 bytes, the offset
// into the string table is stored in the last 4 bytes instead, leaving
// the first 4 bytes as 0.
void COFFSymbol::set_name_offset(uint32_t Offset) {
- write_uint32_le(Data.Name + 0, 0);
- write_uint32_le(Data.Name + 4, Offset);
-}
-
-//------------------------------------------------------------------------------
-// Section class implementation
-
-COFFSection::COFFSection(StringRef name)
- : Name(name), MCSection(nullptr), Symbol(nullptr) {
- memset(&Header, 0, sizeof(Header));
+ write32le(Data.Name + 0, 0);
+ write32le(Data.Name + 4, Offset);
}
//------------------------------------------------------------------------------
@@ -228,115 +225,92 @@ COFFSection::COFFSection(StringRef name)
WinCOFFObjectWriter::WinCOFFObjectWriter(MCWinCOFFObjectTargetWriter *MOTW,
raw_pwrite_stream &OS)
: MCObjectWriter(OS, true), TargetObjectWriter(MOTW) {
- memset(&Header, 0, sizeof(Header));
-
Header.Machine = TargetObjectWriter->getMachine();
}
COFFSymbol *WinCOFFObjectWriter::createSymbol(StringRef Name) {
- return createCOFFEntity<COFFSymbol>(Name, Symbols);
+ Symbols.push_back(make_unique<COFFSymbol>(Name));
+ return Symbols.back().get();
}
COFFSymbol *WinCOFFObjectWriter::GetOrCreateCOFFSymbol(const MCSymbol *Symbol) {
- symbol_map::iterator i = SymbolMap.find(Symbol);
- if (i != SymbolMap.end())
- return i->second;
- COFFSymbol *RetSymbol =
- createCOFFEntity<COFFSymbol>(Symbol->getName(), Symbols);
- SymbolMap[Symbol] = RetSymbol;
- return RetSymbol;
+ COFFSymbol *&Ret = SymbolMap[Symbol];
+ if (!Ret)
+ Ret = createSymbol(Symbol->getName());
+ return Ret;
}
COFFSection *WinCOFFObjectWriter::createSection(StringRef Name) {
- return createCOFFEntity<COFFSection>(Name, Sections);
+ Sections.emplace_back(make_unique<COFFSection>(Name));
+ return Sections.back().get();
}
-/// A template used to lookup or create a symbol/section, and initialize it if
-/// needed.
-template <typename object_t, typename list_t>
-object_t *WinCOFFObjectWriter::createCOFFEntity(StringRef Name, list_t &List) {
- List.push_back(make_unique<object_t>(Name));
-
- return List.back().get();
-}
-
-/// This function takes a section data object from the assembler
-/// and creates the associated COFF section staging object.
-void WinCOFFObjectWriter::defineSection(MCSectionCOFF const &Sec) {
- COFFSection *coff_section = createSection(Sec.getSectionName());
- COFFSymbol *coff_symbol = createSymbol(Sec.getSectionName());
- if (Sec.getSelection() != COFF::IMAGE_COMDAT_SELECT_ASSOCIATIVE) {
- if (const MCSymbol *S = Sec.getCOMDATSymbol()) {
- COFFSymbol *COMDATSymbol = GetOrCreateCOFFSymbol(S);
- if (COMDATSymbol->Section)
- report_fatal_error("two sections have the same comdat");
- COMDATSymbol->Section = coff_section;
- }
- }
-
- coff_section->Symbol = coff_symbol;
- coff_symbol->Section = coff_section;
- coff_symbol->Data.StorageClass = COFF::IMAGE_SYM_CLASS_STATIC;
-
- // In this case the auxiliary symbol is a Section Definition.
- coff_symbol->Aux.resize(1);
- memset(&coff_symbol->Aux[0], 0, sizeof(coff_symbol->Aux[0]));
- coff_symbol->Aux[0].AuxType = ATSectionDefinition;
- coff_symbol->Aux[0].Aux.SectionDefinition.Selection = Sec.getSelection();
-
- coff_section->Header.Characteristics = Sec.getCharacteristics();
-
- uint32_t &Characteristics = coff_section->Header.Characteristics;
+static uint32_t getAlignment(const MCSectionCOFF &Sec) {
switch (Sec.getAlignment()) {
case 1:
- Characteristics |= COFF::IMAGE_SCN_ALIGN_1BYTES;
- break;
+ return COFF::IMAGE_SCN_ALIGN_1BYTES;
case 2:
- Characteristics |= COFF::IMAGE_SCN_ALIGN_2BYTES;
- break;
+ return COFF::IMAGE_SCN_ALIGN_2BYTES;
case 4:
- Characteristics |= COFF::IMAGE_SCN_ALIGN_4BYTES;
- break;
+ return COFF::IMAGE_SCN_ALIGN_4BYTES;
case 8:
- Characteristics |= COFF::IMAGE_SCN_ALIGN_8BYTES;
- break;
+ return COFF::IMAGE_SCN_ALIGN_8BYTES;
case 16:
- Characteristics |= COFF::IMAGE_SCN_ALIGN_16BYTES;
- break;
+ return COFF::IMAGE_SCN_ALIGN_16BYTES;
case 32:
- Characteristics |= COFF::IMAGE_SCN_ALIGN_32BYTES;
- break;
+ return COFF::IMAGE_SCN_ALIGN_32BYTES;
case 64:
- Characteristics |= COFF::IMAGE_SCN_ALIGN_64BYTES;
- break;
+ return COFF::IMAGE_SCN_ALIGN_64BYTES;
case 128:
- Characteristics |= COFF::IMAGE_SCN_ALIGN_128BYTES;
- break;
+ return COFF::IMAGE_SCN_ALIGN_128BYTES;
case 256:
- Characteristics |= COFF::IMAGE_SCN_ALIGN_256BYTES;
- break;
+ return COFF::IMAGE_SCN_ALIGN_256BYTES;
case 512:
- Characteristics |= COFF::IMAGE_SCN_ALIGN_512BYTES;
- break;
+ return COFF::IMAGE_SCN_ALIGN_512BYTES;
case 1024:
- Characteristics |= COFF::IMAGE_SCN_ALIGN_1024BYTES;
- break;
+ return COFF::IMAGE_SCN_ALIGN_1024BYTES;
case 2048:
- Characteristics |= COFF::IMAGE_SCN_ALIGN_2048BYTES;
- break;
+ return COFF::IMAGE_SCN_ALIGN_2048BYTES;
case 4096:
- Characteristics |= COFF::IMAGE_SCN_ALIGN_4096BYTES;
- break;
+ return COFF::IMAGE_SCN_ALIGN_4096BYTES;
case 8192:
- Characteristics |= COFF::IMAGE_SCN_ALIGN_8192BYTES;
- break;
- default:
- llvm_unreachable("unsupported section alignment");
+ return COFF::IMAGE_SCN_ALIGN_8192BYTES;
+ }
+ llvm_unreachable("unsupported section alignment");
+}
+
+/// This function takes a section data object from the assembler
+/// and creates the associated COFF section staging object.
+void WinCOFFObjectWriter::defineSection(const MCSectionCOFF &MCSec) {
+ COFFSection *Section = createSection(MCSec.getSectionName());
+ COFFSymbol *Symbol = createSymbol(MCSec.getSectionName());
+ Section->Symbol = Symbol;
+ Symbol->Section = Section;
+ Symbol->Data.StorageClass = COFF::IMAGE_SYM_CLASS_STATIC;
+
+ // Create a COMDAT symbol if needed.
+ if (MCSec.getSelection() != COFF::IMAGE_COMDAT_SELECT_ASSOCIATIVE) {
+ if (const MCSymbol *S = MCSec.getCOMDATSymbol()) {
+ COFFSymbol *COMDATSymbol = GetOrCreateCOFFSymbol(S);
+ if (COMDATSymbol->Section)
+ report_fatal_error("two sections have the same comdat");
+ COMDATSymbol->Section = Section;
+ }
}
+ // In this case the auxiliary symbol is a Section Definition.
+ Symbol->Aux.resize(1);
+ Symbol->Aux[0] = {};
+ Symbol->Aux[0].AuxType = ATSectionDefinition;
+ Symbol->Aux[0].Aux.SectionDefinition.Selection = MCSec.getSelection();
+
+ // Set section alignment.
+ Section->Header.Characteristics = MCSec.getCharacteristics();
+ Section->Header.Characteristics |= getAlignment(MCSec);
+
// Bind internal COFF section to MC section.
- coff_section->MCSection = &Sec;
- SectionMap[&Sec] = coff_section;
+ Section->MCSection = &MCSec;
+ SectionMap[&MCSec] = Section;
}
static uint64_t getSymbolValue(const MCSymbol &Symbol,
@@ -368,25 +342,25 @@ COFFSymbol *WinCOFFObjectWriter::getLinkedSymbol(const MCSymbol &Symbol) {
/// This function takes a symbol data object from the assembler
/// and creates the associated COFF symbol staging object.
-void WinCOFFObjectWriter::DefineSymbol(const MCSymbol &Symbol,
+void WinCOFFObjectWriter::DefineSymbol(const MCSymbol &MCSym,
MCAssembler &Assembler,
const MCAsmLayout &Layout) {
- COFFSymbol *coff_symbol = GetOrCreateCOFFSymbol(&Symbol);
- const MCSymbol *Base = Layout.getBaseSymbol(Symbol);
+ COFFSymbol *Sym = GetOrCreateCOFFSymbol(&MCSym);
+ const MCSymbol *Base = Layout.getBaseSymbol(MCSym);
COFFSection *Sec = nullptr;
if (Base && Base->getFragment()) {
Sec = SectionMap[Base->getFragment()->getParent()];
- if (coff_symbol->Section && coff_symbol->Section != Sec)
+ if (Sym->Section && Sym->Section != Sec)
report_fatal_error("conflicting sections for symbol");
}
COFFSymbol *Local = nullptr;
- if (cast<MCSymbolCOFF>(Symbol).isWeakExternal()) {
- coff_symbol->Data.StorageClass = COFF::IMAGE_SYM_CLASS_WEAK_EXTERNAL;
+ if (cast<MCSymbolCOFF>(MCSym).isWeakExternal()) {
+ Sym->Data.StorageClass = COFF::IMAGE_SYM_CLASS_WEAK_EXTERNAL;
- COFFSymbol *WeakDefault = getLinkedSymbol(Symbol);
+ COFFSymbol *WeakDefault = getLinkedSymbol(MCSym);
if (!WeakDefault) {
- std::string WeakName = (".weak." + Symbol.getName() + ".default").str();
+ std::string WeakName = (".weak." + MCSym.getName() + ".default").str();
WeakDefault = createSymbol(WeakName);
if (!Sec)
WeakDefault->Data.SectionNumber = COFF::IMAGE_SYM_ABSOLUTE;
@@ -395,41 +369,41 @@ void WinCOFFObjectWriter::DefineSymbol(const MCSymbol &Symbol,
Local = WeakDefault;
}
- coff_symbol->Other = WeakDefault;
+ Sym->Other = WeakDefault;
// Setup the Weak External auxiliary symbol.
- coff_symbol->Aux.resize(1);
- memset(&coff_symbol->Aux[0], 0, sizeof(coff_symbol->Aux[0]));
- coff_symbol->Aux[0].AuxType = ATWeakExternal;
- coff_symbol->Aux[0].Aux.WeakExternal.TagIndex = 0;
- coff_symbol->Aux[0].Aux.WeakExternal.Characteristics =
+ Sym->Aux.resize(1);
+ memset(&Sym->Aux[0], 0, sizeof(Sym->Aux[0]));
+ Sym->Aux[0].AuxType = ATWeakExternal;
+ Sym->Aux[0].Aux.WeakExternal.TagIndex = 0;
+ Sym->Aux[0].Aux.WeakExternal.Characteristics =
COFF::IMAGE_WEAK_EXTERN_SEARCH_LIBRARY;
} else {
if (!Base)
- coff_symbol->Data.SectionNumber = COFF::IMAGE_SYM_ABSOLUTE;
+ Sym->Data.SectionNumber = COFF::IMAGE_SYM_ABSOLUTE;
else
- coff_symbol->Section = Sec;
- Local = coff_symbol;
+ Sym->Section = Sec;
+ Local = Sym;
}
if (Local) {
- Local->Data.Value = getSymbolValue(Symbol, Layout);
+ Local->Data.Value = getSymbolValue(MCSym, Layout);
- const MCSymbolCOFF &SymbolCOFF = cast<MCSymbolCOFF>(Symbol);
+ const MCSymbolCOFF &SymbolCOFF = cast<MCSymbolCOFF>(MCSym);
Local->Data.Type = SymbolCOFF.getType();
Local->Data.StorageClass = SymbolCOFF.getClass();
// If no storage class was specified in the streamer, define it here.
if (Local->Data.StorageClass == COFF::IMAGE_SYM_CLASS_NULL) {
- bool IsExternal = Symbol.isExternal() ||
- (!Symbol.getFragment() && !Symbol.isVariable());
+ bool IsExternal = MCSym.isExternal() ||
+ (!MCSym.getFragment() && !MCSym.isVariable());
Local->Data.StorageClass = IsExternal ? COFF::IMAGE_SYM_CLASS_EXTERNAL
: COFF::IMAGE_SYM_CLASS_STATIC;
}
}
- coff_symbol->MC = &Symbol;
+ Sym->MC = &MCSym;
}
// Maximum offsets for different string table entry encodings.
@@ -459,24 +433,25 @@ static void encodeBase64StringEntry(char *Buffer, uint64_t Value) {
}
void WinCOFFObjectWriter::SetSectionName(COFFSection &S) {
- if (S.Name.size() > COFF::NameSize) {
- uint64_t StringTableEntry = Strings.getOffset(S.Name);
-
- if (StringTableEntry <= Max7DecimalOffset) {
- SmallVector<char, COFF::NameSize> Buffer;
- Twine('/').concat(Twine(StringTableEntry)).toVector(Buffer);
- assert(Buffer.size() <= COFF::NameSize && Buffer.size() >= 2);
-
- std::memcpy(S.Header.Name, Buffer.data(), Buffer.size());
- } else if (StringTableEntry <= MaxBase64Offset) {
- // Starting with 10,000,000, offsets are encoded as base64.
- encodeBase64StringEntry(S.Header.Name, StringTableEntry);
- } else {
- report_fatal_error("COFF string table is greater than 64 GB.");
- }
- } else {
+ if (S.Name.size() <= COFF::NameSize) {
std::memcpy(S.Header.Name, S.Name.c_str(), S.Name.size());
+ return;
}
+
+ uint64_t StringTableEntry = Strings.getOffset(S.Name);
+ if (StringTableEntry <= Max7DecimalOffset) {
+ SmallVector<char, COFF::NameSize> Buffer;
+ Twine('/').concat(Twine(StringTableEntry)).toVector(Buffer);
+ assert(Buffer.size() <= COFF::NameSize && Buffer.size() >= 2);
+ std::memcpy(S.Header.Name, Buffer.data(), Buffer.size());
+ return;
+ }
+ if (StringTableEntry <= MaxBase64Offset) {
+ // Starting with 10,000,000, offsets are encoded as base64.
+ encodeBase64StringEntry(S.Header.Name, StringTableEntry);
+ return;
+ }
+ report_fatal_error("COFF string table is greater than 64 GB.");
}
void WinCOFFObjectWriter::SetSymbolName(COFFSymbol &S) {
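
The restructured SetSectionName above selects among three encodings of the 8-byte header name field: inline for short names, "/<decimal>" for string-table offsets up to Max7DecimalOffset, and "//<base64>" beyond that. A freestanding sketch of the same selection, assuming the usual A-Z/a-z/0-9/+// alphabet; encodeSectionName is a hypothetical name, not the writer's API:

    #include <cstdint>
    #include <cstdio>
    #include <cstring>
    #include <string>

    // Produce the 8 bytes stored in a COFF section header's Name field.
    static std::string encodeSectionName(const std::string &Name,
                                         uint64_t StrTabOffset) {
      char Field[8] = {};
      if (Name.size() <= 8) {
        std::memcpy(Field, Name.data(), Name.size()); // short name, inline
      } else if (StrTabOffset <= 9999999) {
        char Buf[16]; // "/<decimal offset>", at most 8 chars in this range
        int Len = std::snprintf(Buf, sizeof(Buf), "/%llu",
                                (unsigned long long)StrTabOffset);
        std::memcpy(Field, Buf, Len); // the field carries no terminator
      } else {
        // "//" plus six base64 digits, most significant first: reaches
        // 64^6 bytes, i.e. the 64 GB limit the writer reports as fatal.
        static const char Alphabet[] = "ABCDEFGHIJKLMNOPQRSTUVWXYZ"
                                       "abcdefghijklmnopqrstuvwxyz"
                                       "0123456789+/";
        Field[0] = Field[1] = '/';
        for (int I = 7; I >= 2; --I, StrTabOffset /= 64)
          Field[I] = Alphabet[StrTabOffset % 64];
      }
      return std::string(Field, 8);
    }

    int main() {
      std::printf("%s\n", encodeSectionName(".debug_info", 4).c_str());
      // prints "/4"
      std::printf("%s\n", encodeSectionName(".debug_info", 10000000).c_str());
      // prints "//AAmJaA"
    }
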
@@ -583,18 +558,37 @@ void WinCOFFObjectWriter::WriteAuxiliarySymbols(
}
}
-void WinCOFFObjectWriter::writeSectionHeader(const COFF::section &S) {
- writeBytes(StringRef(S.Name, COFF::NameSize));
-
- writeLE32(S.VirtualSize);
- writeLE32(S.VirtualAddress);
- writeLE32(S.SizeOfRawData);
- writeLE32(S.PointerToRawData);
- writeLE32(S.PointerToRelocations);
- writeLE32(S.PointerToLineNumbers);
- writeLE16(S.NumberOfRelocations);
- writeLE16(S.NumberOfLineNumbers);
- writeLE32(S.Characteristics);
+// Write the section headers.
+void WinCOFFObjectWriter::writeSectionHeaders() {
+ // Section numbers must be monotonically increasing in the section
+ // header, but our Sections array is not sorted by section number,
+ // so make a copy of Sections and sort it.
+ std::vector<COFFSection *> Arr;
+ for (auto &Section : Sections)
+ Arr.push_back(Section.get());
+ std::sort(Arr.begin(), Arr.end(),
+ [](const COFFSection *A, const COFFSection *B) {
+ return A->Number < B->Number;
+ });
+
+ for (auto &Section : Arr) {
+ if (Section->Number == -1)
+ continue;
+
+ COFF::section &S = Section->Header;
+ if (Section->Relocations.size() >= 0xffff)
+ S.Characteristics |= COFF::IMAGE_SCN_LNK_NRELOC_OVFL;
+ writeBytes(StringRef(S.Name, COFF::NameSize));
+ writeLE32(S.VirtualSize);
+ writeLE32(S.VirtualAddress);
+ writeLE32(S.SizeOfRawData);
+ writeLE32(S.PointerToRawData);
+ writeLE32(S.PointerToRelocations);
+ writeLE32(S.PointerToLineNumbers);
+ writeLE16(S.NumberOfRelocations);
+ writeLE16(S.NumberOfLineNumbers);
+ writeLE32(S.Characteristics);
+ }
}
void WinCOFFObjectWriter::WriteRelocation(const COFF::relocation &R) {
@@ -603,6 +597,87 @@ void WinCOFFObjectWriter::WriteRelocation(const COFF::relocation &R) {
writeLE16(R.Type);
}
+// Write MCSec's contents. This is essentially
+// "Asm.writeSectionData(&MCSec, Layout)", except that the bytes are first
+// staged in a buffer so that a CRC can be computed over them.
+uint32_t WinCOFFObjectWriter::writeSectionContents(MCAssembler &Asm,
+ const MCAsmLayout &Layout,
+ const MCSection &MCSec) {
+ // Save the contents of the section to a temporary buffer, we need this
+ // to CRC the data before we dump it into the object file.
+ SmallVector<char, 128> Buf;
+ raw_svector_ostream VecOS(Buf);
+ raw_pwrite_stream &OldStream = getStream();
+
+ // Redirect the output stream to our buffer and fill our buffer with
+ // the section data.
+ setStream(VecOS);
+ Asm.writeSectionData(&MCSec, Layout);
+
+ // Reset the stream back to what it was before.
+ setStream(OldStream);
+
+ // Write the section contents to the object file.
+ getStream() << Buf;
+
+  // Calculate our CRC with an initial value of '0'; this is not how
+  // JamCRC is specified, but it matches the expected output.
+ JamCRC JC(/*Init=*/0);
+ JC.update(Buf);
+ return JC.getCRC();
+}
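
For reference, the checksum in question is JAMCRC (reflected CRC-32, polynomial 0xEDB88320, no final xor), except seeded with 0 as the comment notes. A freestanding bit-at-a-time sketch; llvm::JamCRC is table-driven but computes the same value:

    #include <cstdint>
    #include <cstdio>
    #include <cstring>

    static uint32_t jamCRC(const char *Data, size_t Len, uint32_t CRC = 0) {
      for (size_t I = 0; I < Len; ++I) {
        CRC ^= (uint8_t)Data[I];
        for (int B = 0; B < 8; ++B) // reflected CRC-32 bit step
          CRC = (CRC >> 1) ^ (0xEDB88320u & (0u - (CRC & 1u)));
      }
      return CRC; // no final xor, unlike standard CRC-32
    }

    int main() {
      const char Buf[] = "hello";
      std::printf("%08x\n", jamCRC(Buf, std::strlen(Buf)));
    }
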
+
+void WinCOFFObjectWriter::writeSection(MCAssembler &Asm,
+ const MCAsmLayout &Layout,
+ const COFFSection &Sec,
+ const MCSection &MCSec) {
+ if (Sec.Number == -1)
+ return;
+
+ // Write the section contents.
+ if (Sec.Header.PointerToRawData != 0) {
+ assert(getStream().tell() <= Sec.Header.PointerToRawData &&
+ "Section::PointerToRawData is insane!");
+
+ unsigned PaddingSize = Sec.Header.PointerToRawData - getStream().tell();
+ assert(PaddingSize < 4 &&
+ "Should only need at most three bytes of padding!");
+ WriteZeros(PaddingSize);
+
+ uint32_t CRC = writeSectionContents(Asm, Layout, MCSec);
+
+ // Update the section definition auxiliary symbol to record the CRC.
+ COFFSection *Sec = SectionMap[&MCSec];
+ COFFSymbol::AuxiliarySymbols &AuxSyms = Sec->Symbol->Aux;
+ assert(AuxSyms.size() == 1 && AuxSyms[0].AuxType == ATSectionDefinition);
+ AuxSymbol &SecDef = AuxSyms[0];
+ SecDef.Aux.SectionDefinition.CheckSum = CRC;
+ }
+
+ // Write relocations for this section.
+ if (Sec.Relocations.empty()) {
+ assert(Sec.Header.PointerToRelocations == 0 &&
+ "Section::PointerToRelocations is insane!");
+ return;
+ }
+
+ assert(getStream().tell() == Sec.Header.PointerToRelocations &&
+ "Section::PointerToRelocations is insane!");
+
+ if (Sec.Relocations.size() >= 0xffff) {
+    // In case of overflow, write the actual relocation count as the first
+    // relocation; the count includes the synthetic reloc itself (hence + 1).
+ COFF::relocation R;
+ R.VirtualAddress = Sec.Relocations.size() + 1;
+ R.SymbolTableIndex = 0;
+ R.Type = 0;
+ WriteRelocation(R);
+ }
+
+ for (const auto &Relocation : Sec.Relocations)
+ WriteRelocation(Relocation.Data);
+}
+
////////////////////////////////////////////////////////////////////////////////
// MCObjectWriter interface implementations
@@ -632,23 +707,6 @@ bool WinCOFFObjectWriter::isSymbolRefDifferenceFullyResolvedImpl(
InSet, IsPCRel);
}
-bool WinCOFFObjectWriter::isWeak(const MCSymbol &Sym) const {
- if (!Sym.isExternal())
- return false;
-
- if (!Sym.isInSection())
- return false;
-
- const auto &Sec = cast<MCSectionCOFF>(Sym.getSection());
- if (!Sec.getCOMDATSymbol())
- return false;
-
- // It looks like for COFF it is invalid to replace a reference to a global
- // in a comdat with a reference to a local.
- // FIXME: Add a specification reference if available.
- return true;
-}
-
void WinCOFFObjectWriter::recordRelocation(
MCAssembler &Asm, const MCAsmLayout &Layout, const MCFragment *Fragment,
const MCFixup &Fixup, MCValue Target, bool &IsPCRel, uint64_t &FixedValue) {
@@ -668,13 +726,13 @@ void WinCOFFObjectWriter::recordRelocation(
return;
}
- MCSection *Section = Fragment->getParent();
+ MCSection *MCSec = Fragment->getParent();
// Mark this symbol as requiring an entry in the symbol table.
- assert(SectionMap.find(Section) != SectionMap.end() &&
+ assert(SectionMap.find(MCSec) != SectionMap.end() &&
"Section must already have been defined in executePostLayoutBinding!");
- COFFSection *coff_section = SectionMap[Section];
+ COFFSection *Sec = SectionMap[MCSec];
const MCSymbolRefExpr *SymB = Target.getSymB();
bool CrossSection = false;
@@ -796,46 +854,31 @@ void WinCOFFObjectWriter::recordRelocation(
FixedValue = 0;
if (TargetObjectWriter->recordRelocation(Fixup))
- coff_section->Relocations.push_back(Reloc);
+ Sec->Relocations.push_back(Reloc);
}
-void WinCOFFObjectWriter::writeObject(MCAssembler &Asm,
- const MCAsmLayout &Layout) {
- size_t SectionsSize = Sections.size();
- if (SectionsSize > static_cast<size_t>(INT32_MAX))
- report_fatal_error(
- "PE COFF object files can't have more than 2147483647 sections");
-
- // Assign symbol and section indexes and offsets.
- int32_t NumberOfSections = static_cast<int32_t>(SectionsSize);
-
- UseBigObj = NumberOfSections > COFF::MaxNumberOfSections16;
-
- // Assign section numbers.
- size_t Number = 1;
- for (const auto &Section : Sections) {
- Section->Number = Number;
- Section->Symbol->Data.SectionNumber = Number;
- Section->Symbol->Aux[0].Aux.SectionDefinition.Number = Number;
- ++Number;
- }
-
- Header.NumberOfSections = NumberOfSections;
- Header.NumberOfSymbols = 0;
+static std::time_t getTime() {
+ std::time_t Now = time(nullptr);
+ if (Now < 0 || !isUInt<32>(Now))
+ return UINT32_MAX;
+ return Now;
+}
+// Create .file symbols.
+void WinCOFFObjectWriter::createFileSymbols(MCAssembler &Asm) {
for (const std::string &Name : Asm.getFileNames()) {
// round up to calculate the number of auxiliary symbols required
unsigned SymbolSize = UseBigObj ? COFF::Symbol32Size : COFF::Symbol16Size;
unsigned Count = (Name.size() + SymbolSize - 1) / SymbolSize;
- COFFSymbol *file = createSymbol(".file");
- file->Data.SectionNumber = COFF::IMAGE_SYM_DEBUG;
- file->Data.StorageClass = COFF::IMAGE_SYM_CLASS_FILE;
- file->Aux.resize(Count);
+ COFFSymbol *File = createSymbol(".file");
+ File->Data.SectionNumber = COFF::IMAGE_SYM_DEBUG;
+ File->Data.StorageClass = COFF::IMAGE_SYM_CLASS_FILE;
+ File->Aux.resize(Count);
unsigned Offset = 0;
unsigned Length = Name.size();
- for (auto &Aux : file->Aux) {
+ for (auto &Aux : File->Aux) {
Aux.AuxType = ATFile;
if (Length > SymbolSize) {
@@ -850,6 +893,109 @@ void WinCOFFObjectWriter::writeObject(MCAssembler &Asm,
Offset += SymbolSize;
}
}
+}
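
The ceil division in createFileSymbols sizes the name spill across auxiliary records, one symbol-record-sized chunk each. A worked sketch of that arithmetic, assuming the classic 18-byte symbol record (COFF::Symbol16Size; big objects use the wider Symbol32Size):

    #include <cassert>
    #include <cstddef>

    // ceil(NameLen / SymbolSize): one aux record per record-sized chunk.
    static unsigned auxRecordsFor(std::size_t NameLen,
                                  unsigned SymbolSize = 18) {
      return (NameLen + SymbolSize - 1) / SymbolSize;
    }

    int main() {
      assert(auxRecordsFor(1) == 1);  // a short name fits in one record
      assert(auxRecordsFor(18) == 1); // exactly one record, no slack
      assert(auxRecordsFor(19) == 2); // one spilled byte costs a second one
    }
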
+
+static bool isAssociative(const COFFSection &Section) {
+ return Section.Symbol->Aux[0].Aux.SectionDefinition.Selection ==
+ COFF::IMAGE_COMDAT_SELECT_ASSOCIATIVE;
+}
+
+void WinCOFFObjectWriter::assignSectionNumbers() {
+ size_t I = 1;
+ auto Assign = [&](COFFSection &Section) {
+ Section.Number = I;
+ Section.Symbol->Data.SectionNumber = I;
+ Section.Symbol->Aux[0].Aux.SectionDefinition.Number = I;
+ ++I;
+ };
+
+  // Although it is not explicitly required by the Microsoft COFF spec,
+  // we should avoid emitting forward associative section references,
+  // because MSVC link.exe as of 2017 cannot handle them.
+ for (const std::unique_ptr<COFFSection> &Section : Sections)
+ if (!isAssociative(*Section))
+ Assign(*Section);
+ for (const std::unique_ptr<COFFSection> &Section : Sections)
+ if (isAssociative(*Section))
+ Assign(*Section);
+}
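
The two Assign loops amount to a stable partition: non-associative sections take the low numbers, associative sections follow, and relative order is otherwise preserved, so an associative section can only refer backwards. A freestanding sketch of the equivalent ordering (section names are made up):

    #include <algorithm>
    #include <cstdio>
    #include <vector>

    struct Sec { const char *Name; bool Associative; int Number; };

    int main() {
      std::vector<Sec> Secs = {{".text", false, -1},
                               {".xdata$foo", true, -1},
                               {".data", false, -1}};
      // Keep non-associative sections first, preserving relative order.
      std::stable_partition(Secs.begin(), Secs.end(),
                            [](const Sec &S) { return !S.Associative; });
      int I = 1;
      for (Sec &S : Secs)
        S.Number = I++;
      for (const Sec &S : Secs)
        std::printf("%-10s -> %d\n", S.Name, S.Number);
      // .text -> 1, .data -> 2, .xdata$foo -> 3: the associative section
      // is numbered last, so it can only refer backwards.
    }
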
+
+// Assign file offsets to COFF object file structures.
+void WinCOFFObjectWriter::assignFileOffsets(MCAssembler &Asm,
+ const MCAsmLayout &Layout) {
+ unsigned Offset = getInitialOffset();
+
+ Offset += UseBigObj ? COFF::Header32Size : COFF::Header16Size;
+ Offset += COFF::SectionSize * Header.NumberOfSections;
+
+ for (const auto &Section : Asm) {
+ COFFSection *Sec = SectionMap[&Section];
+
+ if (Sec->Number == -1)
+ continue;
+
+ Sec->Header.SizeOfRawData = Layout.getSectionAddressSize(&Section);
+
+ if (IsPhysicalSection(Sec)) {
+ // Align the section data to a four byte boundary.
+ Offset = alignTo(Offset, 4);
+ Sec->Header.PointerToRawData = Offset;
+
+ Offset += Sec->Header.SizeOfRawData;
+ }
+
+ if (!Sec->Relocations.empty()) {
+ bool RelocationsOverflow = Sec->Relocations.size() >= 0xffff;
+
+ if (RelocationsOverflow) {
+ // Signal overflow by setting NumberOfRelocations to max value. Actual
+ // size is found in reloc #0. Microsoft tools understand this.
+ Sec->Header.NumberOfRelocations = 0xffff;
+ } else {
+ Sec->Header.NumberOfRelocations = Sec->Relocations.size();
+ }
+ Sec->Header.PointerToRelocations = Offset;
+
+ if (RelocationsOverflow) {
+ // Reloc #0 will contain actual count, so make room for it.
+ Offset += COFF::RelocationSize;
+ }
+
+ Offset += COFF::RelocationSize * Sec->Relocations.size();
+
+ for (auto &Relocation : Sec->Relocations) {
+ assert(Relocation.Symb->getIndex() != -1);
+ Relocation.Data.SymbolTableIndex = Relocation.Symb->getIndex();
+ }
+ }
+
+ assert(Sec->Symbol->Aux.size() == 1 &&
+ "Section's symbol must have one aux!");
+ AuxSymbol &Aux = Sec->Symbol->Aux[0];
+ assert(Aux.AuxType == ATSectionDefinition &&
+ "Section's symbol's aux symbol must be a Section Definition!");
+ Aux.Aux.SectionDefinition.Length = Sec->Header.SizeOfRawData;
+ Aux.Aux.SectionDefinition.NumberOfRelocations =
+ Sec->Header.NumberOfRelocations;
+ Aux.Aux.SectionDefinition.NumberOfLinenumbers =
+ Sec->Header.NumberOfLineNumbers;
+ }
+
+ Header.PointerToSymbolTable = Offset;
+}
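
assignFileOffsets rounds each raw-data offset up to four bytes, which is exactly why writeSection may assert that at most three padding bytes are ever needed. A freestanding sketch of the rounding (llvm::alignTo generalizes this to any alignment):

    #include <cassert>
    #include <cstdint>

    // Equivalent of llvm::alignTo(Offset, 4) for this power-of-two case.
    static uint64_t alignTo4(uint64_t Offset) {
      return (Offset + 3) & ~uint64_t(3);
    }

    int main() {
      assert(alignTo4(20) == 20); // already aligned: zero padding
      assert(alignTo4(21) == 24); // hence "at most three bytes of padding"
      assert(alignTo4(23) == 24);
    }
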
+
+void WinCOFFObjectWriter::writeObject(MCAssembler &Asm,
+ const MCAsmLayout &Layout) {
+ if (Sections.size() > INT32_MAX)
+ report_fatal_error(
+ "PE COFF object files can't have more than 2147483647 sections");
+
+ UseBigObj = Sections.size() > COFF::MaxNumberOfSections16;
+ Header.NumberOfSections = Sections.size();
+ Header.NumberOfSymbols = 0;
+
+ assignSectionNumbers();
+ createFileSymbols(Asm);
for (auto &Symbol : Symbols) {
// Update section number & offset for symbols that have them.
@@ -912,78 +1058,12 @@ void WinCOFFObjectWriter::writeObject(MCAssembler &Asm,
Section->Symbol->Aux[0].Aux.SectionDefinition.Number = Assoc->Number;
}
- // Assign file offsets to COFF object file structures.
-
- unsigned offset = getInitialOffset();
-
- if (UseBigObj)
- offset += COFF::Header32Size;
- else
- offset += COFF::Header16Size;
- offset += COFF::SectionSize * Header.NumberOfSections;
-
- for (const auto &Section : Asm) {
- COFFSection *Sec = SectionMap[&Section];
-
- if (Sec->Number == -1)
- continue;
-
- Sec->Header.SizeOfRawData = Layout.getSectionAddressSize(&Section);
-
- if (IsPhysicalSection(Sec)) {
- // Align the section data to a four byte boundary.
- offset = alignTo(offset, 4);
- Sec->Header.PointerToRawData = offset;
-
- offset += Sec->Header.SizeOfRawData;
- }
-
- if (Sec->Relocations.size() > 0) {
- bool RelocationsOverflow = Sec->Relocations.size() >= 0xffff;
-
- if (RelocationsOverflow) {
- // Signal overflow by setting NumberOfRelocations to max value. Actual
- // size is found in reloc #0. Microsoft tools understand this.
- Sec->Header.NumberOfRelocations = 0xffff;
- } else {
- Sec->Header.NumberOfRelocations = Sec->Relocations.size();
- }
- Sec->Header.PointerToRelocations = offset;
-
- if (RelocationsOverflow) {
- // Reloc #0 will contain actual count, so make room for it.
- offset += COFF::RelocationSize;
- }
-
- offset += COFF::RelocationSize * Sec->Relocations.size();
-
- for (auto &Relocation : Sec->Relocations) {
- assert(Relocation.Symb->getIndex() != -1);
- Relocation.Data.SymbolTableIndex = Relocation.Symb->getIndex();
- }
- }
-
- assert(Sec->Symbol->Aux.size() == 1 &&
- "Section's symbol must have one aux!");
- AuxSymbol &Aux = Sec->Symbol->Aux[0];
- assert(Aux.AuxType == ATSectionDefinition &&
- "Section's symbol's aux symbol must be a Section Definition!");
- Aux.Aux.SectionDefinition.Length = Sec->Header.SizeOfRawData;
- Aux.Aux.SectionDefinition.NumberOfRelocations =
- Sec->Header.NumberOfRelocations;
- Aux.Aux.SectionDefinition.NumberOfLinenumbers =
- Sec->Header.NumberOfLineNumbers;
- }
-
- Header.PointerToSymbolTable = offset;
+ assignFileOffsets(Asm, Layout);
// MS LINK expects to be able to use this timestamp to implement their
// /INCREMENTAL feature.
if (Asm.isIncrementalLinkerCompatible()) {
- std::time_t Now = time(nullptr);
- if (Now < 0 || !isUInt<32>(Now))
- Now = UINT32_MAX;
- Header.TimeDateStamp = Now;
+ Header.TimeDateStamp = getTime();
} else {
// Have deterministic output if /INCREMENTAL isn't needed. Also matches GNU.
Header.TimeDateStamp = 0;
@@ -991,96 +1071,25 @@ void WinCOFFObjectWriter::writeObject(MCAssembler &Asm,
// Write it all to disk...
WriteFileHeader(Header);
+ writeSectionHeaders();
- {
- sections::iterator i, ie;
- MCAssembler::iterator j, je;
-
- for (auto &Section : Sections) {
- if (Section->Number != -1) {
- if (Section->Relocations.size() >= 0xffff)
- Section->Header.Characteristics |= COFF::IMAGE_SCN_LNK_NRELOC_OVFL;
- writeSectionHeader(Section->Header);
- }
- }
-
- SmallVector<char, 128> SectionContents;
- for (i = Sections.begin(), ie = Sections.end(), j = Asm.begin(),
- je = Asm.end();
- (i != ie) && (j != je); ++i, ++j) {
-
- if ((*i)->Number == -1)
- continue;
-
- if ((*i)->Header.PointerToRawData != 0) {
- assert(getStream().tell() <= (*i)->Header.PointerToRawData &&
- "Section::PointerToRawData is insane!");
-
- unsigned SectionDataPadding =
- (*i)->Header.PointerToRawData - getStream().tell();
- assert(SectionDataPadding < 4 &&
- "Should only need at most three bytes of padding!");
-
- WriteZeros(SectionDataPadding);
-
- // Save the contents of the section to a temporary buffer, we need this
- // to CRC the data before we dump it into the object file.
- SectionContents.clear();
- raw_svector_ostream VecOS(SectionContents);
- raw_pwrite_stream &OldStream = getStream();
- // Redirect the output stream to our buffer.
- setStream(VecOS);
- // Fill our buffer with the section data.
- Asm.writeSectionData(&*j, Layout);
- // Reset the stream back to what it was before.
- setStream(OldStream);
-
- // Calculate our CRC with an initial value of '0', this is not how
- // JamCRC is specified but it aligns with the expected output.
- JamCRC JC(/*Init=*/0x00000000U);
- JC.update(SectionContents);
-
- // Write the section contents to the object file.
- getStream() << SectionContents;
-
- // Update the section definition auxiliary symbol to record the CRC.
- COFFSection *Sec = SectionMap[&*j];
- COFFSymbol::AuxiliarySymbols &AuxSyms = Sec->Symbol->Aux;
- assert(AuxSyms.size() == 1 &&
- AuxSyms[0].AuxType == ATSectionDefinition);
- AuxSymbol &SecDef = AuxSyms[0];
- SecDef.Aux.SectionDefinition.CheckSum = JC.getCRC();
- }
-
- if ((*i)->Relocations.size() > 0) {
- assert(getStream().tell() == (*i)->Header.PointerToRelocations &&
- "Section::PointerToRelocations is insane!");
-
- if ((*i)->Relocations.size() >= 0xffff) {
- // In case of overflow, write actual relocation count as first
- // relocation. Including the synthetic reloc itself (+ 1).
- COFF::relocation r;
- r.VirtualAddress = (*i)->Relocations.size() + 1;
- r.SymbolTableIndex = 0;
- r.Type = 0;
- WriteRelocation(r);
- }
-
- for (const auto &Relocation : (*i)->Relocations)
- WriteRelocation(Relocation.Data);
- } else
- assert((*i)->Header.PointerToRelocations == 0 &&
- "Section::PointerToRelocations is insane!");
- }
- }
+ // Write section contents.
+ sections::iterator I = Sections.begin();
+ sections::iterator IE = Sections.end();
+ MCAssembler::iterator J = Asm.begin();
+ MCAssembler::iterator JE = Asm.end();
+ for (; I != IE && J != JE; ++I, ++J)
+ writeSection(Asm, Layout, **I, *J);
assert(getStream().tell() == Header.PointerToSymbolTable &&
"Header::PointerToSymbolTable is insane!");
+  // Write the symbol table.
for (auto &Symbol : Symbols)
if (Symbol->getIndex() != -1)
WriteSymbol(*Symbol);
+  // Write the string table, which completes the COFF file.
Strings.write(getStream());
}
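
Summarizing the overflow convention now split across assignFileOffsets and writeSection: NumberOfRelocations is a 16-bit header field, so a count of 0xffff or more saturates the field, sets IMAGE_SCN_LNK_NRELOC_OVFL, and stores the true count (including the synthetic entry itself) in relocation #0. A freestanding sketch with illustrative struct and helper names:

    #include <cassert>
    #include <cstdint>

    struct SecHdr {
      uint16_t NumberOfRelocations;
      bool OverflowFlag; // stands in for IMAGE_SCN_LNK_NRELOC_OVFL
    };

    // Returns how many relocation records are actually emitted.
    static uint32_t emittedRelocs(uint32_t Actual, SecHdr &H) {
      bool Overflow = Actual >= 0xffff;
      H.OverflowFlag = Overflow;
      H.NumberOfRelocations = Overflow ? 0xffff : Actual;
      return Overflow ? Actual + 1 : Actual; // +1 for synthetic reloc #0
    }

    int main() {
      SecHdr H;
      assert(emittedRelocs(100, H) == 100 && !H.OverflowFlag);
      assert(emittedRelocs(0xffff, H) == 0x10000 && H.OverflowFlag &&
             H.NumberOfRelocations == 0xffff);
    }
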
diff --git a/lib/MC/WinCOFFStreamer.cpp b/lib/MC/WinCOFFStreamer.cpp
index 6383d8794030..c26d87f36f83 100644
--- a/lib/MC/WinCOFFStreamer.cpp
+++ b/lib/MC/WinCOFFStreamer.cpp
@@ -1,4 +1,4 @@
-//===-- llvm/MC/WinCOFFStreamer.cpp -----------------------------*- C++ -*-===//
+//===- llvm/MC/WinCOFFStreamer.cpp ----------------------------------------===//
//
// The LLVM Compiler Infrastructure
//
@@ -11,32 +11,36 @@
//
//===----------------------------------------------------------------------===//
+#include "llvm/ADT/SmallString.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/ADT/Triple.h"
+#include "llvm/ADT/Twine.h"
#include "llvm/MC/MCAsmBackend.h"
-#include "llvm/MC/MCAsmLayout.h"
#include "llvm/MC/MCAssembler.h"
#include "llvm/MC/MCCodeEmitter.h"
#include "llvm/MC/MCContext.h"
#include "llvm/MC/MCExpr.h"
+#include "llvm/MC/MCFixup.h"
+#include "llvm/MC/MCFragment.h"
#include "llvm/MC/MCObjectFileInfo.h"
#include "llvm/MC/MCObjectStreamer.h"
#include "llvm/MC/MCSection.h"
-#include "llvm/MC/MCSectionCOFF.h"
-#include "llvm/MC/MCStreamer.h"
#include "llvm/MC/MCSymbolCOFF.h"
-#include "llvm/MC/MCValue.h"
#include "llvm/MC/MCWinCOFFStreamer.h"
+#include "llvm/Support/Casting.h"
#include "llvm/Support/COFF.h"
-#include "llvm/Support/Debug.h"
#include "llvm/Support/ErrorHandling.h"
#include "llvm/Support/MathExtras.h"
-#include "llvm/Support/TargetRegistry.h"
#include "llvm/Support/raw_ostream.h"
+#include "llvm/Support/SMLoc.h"
+#include <algorithm>
+#include <cassert>
+#include <cstdint>
using namespace llvm;
#define DEBUG_TYPE "WinCOFFStreamer"
-namespace llvm {
MCWinCOFFStreamer::MCWinCOFFStreamer(MCContext &Context, MCAsmBackend &MAB,
MCCodeEmitter &CE, raw_pwrite_stream &OS)
: MCObjectStreamer(Context, MAB, OS, &CE), CurSymbol(nullptr) {}
@@ -75,10 +79,9 @@ void MCWinCOFFStreamer::InitSections(bool NoExecStack) {
SwitchSection(getContext().getObjectFileInfo()->getTextSection());
}
-void MCWinCOFFStreamer::EmitLabel(MCSymbol *S) {
+void MCWinCOFFStreamer::EmitLabel(MCSymbol *S, SMLoc Loc) {
auto *Symbol = cast<MCSymbolCOFF>(S);
- assert(Symbol->isUndefined() && "Cannot define a symbol twice!");
- MCObjectStreamer::EmitLabel(Symbol);
+ MCObjectStreamer::EmitLabel(Symbol, Loc);
}
void MCWinCOFFStreamer::EmitAssemblerFlag(MCAssemblerFlag Flag) {
@@ -275,10 +278,6 @@ void MCWinCOFFStreamer::EmitTBSSSymbol(MCSection *Section, MCSymbol *Symbol,
llvm_unreachable("not implemented");
}
-void MCWinCOFFStreamer::EmitFileDirective(StringRef Filename) {
- getAssembler().addFileName(Filename);
-}
-
// TODO: Implement this if you want to emit .comment section in COFF obj files.
void MCWinCOFFStreamer::EmitIdent(StringRef IdentString) {
llvm_unreachable("not implemented");
@@ -295,5 +294,3 @@ void MCWinCOFFStreamer::FinishImpl() {
void MCWinCOFFStreamer::Error(const Twine &Msg) const {
getContext().reportError(SMLoc(), Msg);
}
-}
-
diff --git a/lib/Object/ArchiveWriter.cpp b/lib/Object/ArchiveWriter.cpp
index f8e3c5a0a03f..5b233aab2018 100644
--- a/lib/Object/ArchiveWriter.cpp
+++ b/lib/Object/ArchiveWriter.cpp
@@ -122,12 +122,27 @@ static void printWithSpacePadding(raw_fd_ostream &OS, T Data, unsigned Size,
}
}
+static bool isBSDLike(object::Archive::Kind Kind) {
+ switch (Kind) {
+ case object::Archive::K_GNU:
+ return false;
+ case object::Archive::K_BSD:
+ case object::Archive::K_DARWIN:
+ return true;
+ case object::Archive::K_MIPS64:
+ case object::Archive::K_DARWIN64:
+ case object::Archive::K_COFF:
+ break;
+ }
+ llvm_unreachable("not supported for writting");
+}
+
static void print32(raw_ostream &Out, object::Archive::Kind Kind,
uint32_t Val) {
- if (Kind == object::Archive::K_GNU)
- support::endian::Writer<support::big>(Out).write(Val);
- else
+ if (isBSDLike(Kind))
support::endian::Writer<support::little>(Out).write(Val);
+ else
+ support::endian::Writer<support::big>(Out).write(Val);
}
static void printRestOfMemberHeader(
@@ -178,7 +193,7 @@ printMemberHeader(raw_fd_ostream &Out, object::Archive::Kind Kind, bool Thin,
std::vector<unsigned>::iterator &StringMapIndexIter,
const sys::TimePoint<std::chrono::seconds> &ModTime,
unsigned UID, unsigned GID, unsigned Perms, unsigned Size) {
- if (Kind == object::Archive::K_BSD)
+ if (isBSDLike(Kind))
return printBSDMemberHeader(Out, Name, ModTime, UID, GID, Perms, Size);
if (!useStringTable(Thin, Name))
return printGNUSmallMemberHeader(Out, Name, ModTime, UID, GID, Perms, Size);
@@ -285,10 +300,10 @@ writeSymbolTable(raw_fd_ostream &Out, object::Archive::Kind Kind,
if (!HeaderStartOffset) {
HeaderStartOffset = Out.tell();
- if (Kind == object::Archive::K_GNU)
- printGNUSmallMemberHeader(Out, "", now(Deterministic), 0, 0, 0, 0);
- else
+ if (isBSDLike(Kind))
printBSDMemberHeader(Out, "__.SYMDEF", now(Deterministic), 0, 0, 0, 0);
+ else
+ printGNUSmallMemberHeader(Out, "", now(Deterministic), 0, 0, 0, 0);
BodyStartOffset = Out.tell();
print32(Out, Kind, 0); // number of entries or bytes
}
@@ -307,7 +322,7 @@ writeSymbolTable(raw_fd_ostream &Out, object::Archive::Kind Kind,
return EC;
NameOS << '\0';
MemberOffsetRefs.push_back(MemberNum);
- if (Kind == object::Archive::K_BSD)
+ if (isBSDLike(Kind))
print32(Out, Kind, NameOffset);
print32(Out, Kind, 0); // member offset
}
@@ -316,10 +331,21 @@ writeSymbolTable(raw_fd_ostream &Out, object::Archive::Kind Kind,
if (HeaderStartOffset == 0)
return 0;
+  // ld64 prefers the cctools-style archive, which pads its string table to a
+  // boundary of sizeof(int32_t).
+ if (isBSDLike(Kind))
+ for (unsigned P = OffsetToAlignment(NameOS.tell(), sizeof(int32_t)); P--;)
+ NameOS << '\0';
+
StringRef StringTable = NameOS.str();
- if (Kind == object::Archive::K_BSD)
+ if (isBSDLike(Kind))
print32(Out, Kind, StringTable.size()); // byte count of the string table
Out << StringTable;
+ // If there are no symbols, emit an empty symbol table, to satisfy Solaris
+ // tools, older versions of which expect a symbol table in a non-empty
+ // archive, regardless of whether there are any symbols in it.
+ if (StringTable.size() == 0)
+ print32(Out, Kind, 0);
// ld64 requires the next member header to start at an offset that is
// 4 bytes aligned.
@@ -336,10 +362,10 @@ writeSymbolTable(raw_fd_ostream &Out, object::Archive::Kind Kind,
// Patch up the number of symbols.
Out.seek(BodyStartOffset);
unsigned NumSyms = MemberOffsetRefs.size();
- if (Kind == object::Archive::K_GNU)
- print32(Out, Kind, NumSyms);
- else
+ if (isBSDLike(Kind))
print32(Out, Kind, NumSyms * 8);
+ else
+ print32(Out, Kind, NumSyms);
Out.seek(Pos);
return BodyStartOffset + 4;
@@ -351,8 +377,7 @@ llvm::writeArchive(StringRef ArcName,
bool WriteSymtab, object::Archive::Kind Kind,
bool Deterministic, bool Thin,
std::unique_ptr<MemoryBuffer> OldArchiveBuf) {
- assert((!Thin || Kind == object::Archive::K_GNU) &&
- "Only the gnu format has a thin mode");
+ assert((!Thin || !isBSDLike(Kind)) && "Only the gnu format has a thin mode");
SmallString<128> TmpArchive;
int TmpArchiveFD;
if (auto EC = sys::fs::createUniqueFile(ArcName + ".temp-archive-%%%%%%%.a",
@@ -368,10 +393,6 @@ llvm::writeArchive(StringRef ArcName,
std::vector<unsigned> MemberOffsetRefs;
- std::vector<std::unique_ptr<MemoryBuffer>> Buffers;
- std::vector<MemoryBufferRef> Members;
- std::vector<sys::fs::file_status> NewMemberStatus;
-
unsigned MemberReferenceOffset = 0;
if (WriteSymtab) {
ErrorOr<unsigned> MemberReferenceOffsetOrErr = writeSymbolTable(
@@ -382,25 +403,35 @@ llvm::writeArchive(StringRef ArcName,
}
std::vector<unsigned> StringMapIndexes;
- if (Kind != object::Archive::K_BSD)
+ if (!isBSDLike(Kind))
writeStringTable(Out, ArcName, NewMembers, StringMapIndexes, Thin);
std::vector<unsigned>::iterator StringMapIndexIter = StringMapIndexes.begin();
std::vector<unsigned> MemberOffset;
for (const NewArchiveMember &M : NewMembers) {
MemoryBufferRef File = M.Buf->getMemBufferRef();
+ unsigned Padding = 0;
unsigned Pos = Out.tell();
MemberOffset.push_back(Pos);
+ // ld64 expects the members to be 8-byte aligned for 64-bit content and at
+ // least 4-byte aligned for 32-bit content. Opt for the larger encoding
+    // uniformly. This matches the behaviour of cctools and ensures that ld64
+ // is happy with archives that we generate.
+ if (Kind == object::Archive::K_DARWIN)
+ Padding = OffsetToAlignment(M.Buf->getBufferSize(), 8);
+
printMemberHeader(Out, Kind, Thin,
sys::path::filename(M.Buf->getBufferIdentifier()),
StringMapIndexIter, M.ModTime, M.UID, M.GID, M.Perms,
- M.Buf->getBufferSize());
+ M.Buf->getBufferSize() + Padding);
if (!Thin)
Out << File.getBuffer();
+ while (Padding--)
+ Out << '\n';
if (Out.tell() % 2)
Out << '\n';
}
@@ -408,7 +439,7 @@ llvm::writeArchive(StringRef ArcName,
if (MemberReferenceOffset) {
Out.seek(MemberReferenceOffset);
for (unsigned MemberNum : MemberOffsetRefs) {
- if (Kind == object::Archive::K_BSD)
+ if (isBSDLike(Kind))
Out.seek(Out.tell() + 4); // skip over the string offset
print32(Out, Kind, MemberOffset[MemberNum]);
}
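
Both ld64 accommodations in this file reduce to the same arithmetic: OffsetToAlignment(Value, Align) yields the number of pad bytes needed to reach the next boundary, zero if already aligned. A freestanding sketch:

    #include <cassert>
    #include <cstdint>

    // Pad bytes needed to reach the next Align boundary (0 if already there).
    static uint64_t offsetToAlignment(uint64_t Value, uint64_t Align) {
      return (Align - Value % Align) % Align;
    }

    int main() {
      assert(offsetToAlignment(16, 8) == 0); // already 8-byte aligned
      assert(offsetToAlignment(17, 8) == 7); // seven '\n' pad bytes follow
      assert(offsetToAlignment(20, 4) == 0); // string table at int32 edge
    }
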
diff --git a/lib/Object/CMakeLists.txt b/lib/Object/CMakeLists.txt
index b895c3fcc050..2007f560c166 100644
--- a/lib/Object/CMakeLists.txt
+++ b/lib/Object/CMakeLists.txt
@@ -8,6 +8,7 @@ add_llvm_library(LLVMObject
ELFObjectFile.cpp
Error.cpp
IRObjectFile.cpp
+ IRSymtab.cpp
MachOObjectFile.cpp
MachOUniversal.cpp
ModuleSummaryIndexObjectFile.cpp
diff --git a/lib/Object/Decompressor.cpp b/lib/Object/Decompressor.cpp
index bca41fd9f487..0be602b1fc1a 100644
--- a/lib/Object/Decompressor.cpp
+++ b/lib/Object/Decompressor.cpp
@@ -95,8 +95,5 @@ Error Decompressor::decompress(SmallString<32> &Out) {
Error Decompressor::decompress(MutableArrayRef<char> Buffer) {
size_t Size = Buffer.size();
- zlib::Status Status = zlib::uncompress(SectionData, Buffer.data(), Size);
- if (Status != zlib::StatusOK)
- return createError("decompression failed");
- return Error::success();
+ return zlib::uncompress(SectionData, Buffer.data(), Size);
}
diff --git a/lib/Object/ELFObjectFile.cpp b/lib/Object/ELFObjectFile.cpp
index 4bd69e34e3c3..3f8c81c8e911 100644
--- a/lib/Object/ELFObjectFile.cpp
+++ b/lib/Object/ELFObjectFile.cpp
@@ -12,6 +12,8 @@
//===----------------------------------------------------------------------===//
#include "llvm/Object/ELFObjectFile.h"
+#include "llvm/Support/ARMBuildAttributes.h"
+#include "llvm/Support/ARMAttributeParser.h"
#include "llvm/Support/MathExtras.h"
namespace llvm {
@@ -55,71 +57,247 @@ ObjectFile::createELFObjectFile(MemoryBufferRef Obj) {
return std::move(R);
}
-SubtargetFeatures ELFObjectFileBase::getFeatures() const {
- switch (getEMachine()) {
- case ELF::EM_MIPS: {
- SubtargetFeatures Features;
- unsigned PlatformFlags;
- getPlatformFlags(PlatformFlags);
+SubtargetFeatures ELFObjectFileBase::getMIPSFeatures() const {
+ SubtargetFeatures Features;
+ unsigned PlatformFlags;
+ getPlatformFlags(PlatformFlags);
+
+ switch (PlatformFlags & ELF::EF_MIPS_ARCH) {
+ case ELF::EF_MIPS_ARCH_1:
+ break;
+ case ELF::EF_MIPS_ARCH_2:
+ Features.AddFeature("mips2");
+ break;
+ case ELF::EF_MIPS_ARCH_3:
+ Features.AddFeature("mips3");
+ break;
+ case ELF::EF_MIPS_ARCH_4:
+ Features.AddFeature("mips4");
+ break;
+ case ELF::EF_MIPS_ARCH_5:
+ Features.AddFeature("mips5");
+ break;
+ case ELF::EF_MIPS_ARCH_32:
+ Features.AddFeature("mips32");
+ break;
+ case ELF::EF_MIPS_ARCH_64:
+ Features.AddFeature("mips64");
+ break;
+ case ELF::EF_MIPS_ARCH_32R2:
+ Features.AddFeature("mips32r2");
+ break;
+ case ELF::EF_MIPS_ARCH_64R2:
+ Features.AddFeature("mips64r2");
+ break;
+ case ELF::EF_MIPS_ARCH_32R6:
+ Features.AddFeature("mips32r6");
+ break;
+ case ELF::EF_MIPS_ARCH_64R6:
+ Features.AddFeature("mips64r6");
+ break;
+ default:
+ llvm_unreachable("Unknown EF_MIPS_ARCH value");
+ }
+
+ switch (PlatformFlags & ELF::EF_MIPS_MACH) {
+ case ELF::EF_MIPS_MACH_NONE:
+ // No feature associated with this value.
+ break;
+ case ELF::EF_MIPS_MACH_OCTEON:
+ Features.AddFeature("cnmips");
+ break;
+ default:
+ llvm_unreachable("Unknown EF_MIPS_ARCH value");
+ }
- switch (PlatformFlags & ELF::EF_MIPS_ARCH) {
- case ELF::EF_MIPS_ARCH_1:
+ if (PlatformFlags & ELF::EF_MIPS_ARCH_ASE_M16)
+ Features.AddFeature("mips16");
+ if (PlatformFlags & ELF::EF_MIPS_MICROMIPS)
+ Features.AddFeature("micromips");
+
+ return Features;
+}
+
+SubtargetFeatures ELFObjectFileBase::getARMFeatures() const {
+ SubtargetFeatures Features;
+ ARMAttributeParser Attributes;
+ std::error_code EC = getBuildAttributes(Attributes);
+ if (EC)
+ return SubtargetFeatures();
+
+  // Both ARMv7-M and ARMv7-R have to support Thumb hardware divide.
+ bool isV7 = false;
+ if (Attributes.hasAttribute(ARMBuildAttrs::CPU_arch))
+ isV7 = Attributes.getAttributeValue(ARMBuildAttrs::CPU_arch)
+ == ARMBuildAttrs::v7;
+
+ if (Attributes.hasAttribute(ARMBuildAttrs::CPU_arch_profile)) {
+ switch(Attributes.getAttributeValue(ARMBuildAttrs::CPU_arch_profile)) {
+ case ARMBuildAttrs::ApplicationProfile:
+ Features.AddFeature("aclass");
break;
- case ELF::EF_MIPS_ARCH_2:
- Features.AddFeature("mips2");
+ case ARMBuildAttrs::RealTimeProfile:
+ Features.AddFeature("rclass");
+ if (isV7)
+ Features.AddFeature("hwdiv");
break;
- case ELF::EF_MIPS_ARCH_3:
- Features.AddFeature("mips3");
+ case ARMBuildAttrs::MicroControllerProfile:
+ Features.AddFeature("mclass");
+ if (isV7)
+ Features.AddFeature("hwdiv");
break;
- case ELF::EF_MIPS_ARCH_4:
- Features.AddFeature("mips4");
+ }
+ }
+
+ if (Attributes.hasAttribute(ARMBuildAttrs::THUMB_ISA_use)) {
+ switch(Attributes.getAttributeValue(ARMBuildAttrs::THUMB_ISA_use)) {
+ default:
break;
- case ELF::EF_MIPS_ARCH_5:
- Features.AddFeature("mips5");
+ case ARMBuildAttrs::Not_Allowed:
+ Features.AddFeature("thumb", false);
+ Features.AddFeature("thumb2", false);
break;
- case ELF::EF_MIPS_ARCH_32:
- Features.AddFeature("mips32");
+ case ARMBuildAttrs::AllowThumb32:
+ Features.AddFeature("thumb2");
break;
- case ELF::EF_MIPS_ARCH_64:
- Features.AddFeature("mips64");
+ }
+ }
+
+ if (Attributes.hasAttribute(ARMBuildAttrs::FP_arch)) {
+ switch(Attributes.getAttributeValue(ARMBuildAttrs::FP_arch)) {
+ default:
break;
- case ELF::EF_MIPS_ARCH_32R2:
- Features.AddFeature("mips32r2");
+ case ARMBuildAttrs::Not_Allowed:
+ Features.AddFeature("vfp2", false);
+ Features.AddFeature("vfp3", false);
+ Features.AddFeature("vfp4", false);
break;
- case ELF::EF_MIPS_ARCH_64R2:
- Features.AddFeature("mips64r2");
+ case ARMBuildAttrs::AllowFPv2:
+ Features.AddFeature("vfp2");
break;
- case ELF::EF_MIPS_ARCH_32R6:
- Features.AddFeature("mips32r6");
+ case ARMBuildAttrs::AllowFPv3A:
+ case ARMBuildAttrs::AllowFPv3B:
+ Features.AddFeature("vfp3");
break;
- case ELF::EF_MIPS_ARCH_64R6:
- Features.AddFeature("mips64r6");
+ case ARMBuildAttrs::AllowFPv4A:
+ case ARMBuildAttrs::AllowFPv4B:
+ Features.AddFeature("vfp4");
break;
- default:
- llvm_unreachable("Unknown EF_MIPS_ARCH value");
}
+ }
- switch (PlatformFlags & ELF::EF_MIPS_MACH) {
- case ELF::EF_MIPS_MACH_NONE:
- // No feature associated with this value.
+ if (Attributes.hasAttribute(ARMBuildAttrs::Advanced_SIMD_arch)) {
+ switch (Attributes.getAttributeValue(ARMBuildAttrs::Advanced_SIMD_arch)) {
+ default:
+ break;
+ case ARMBuildAttrs::Not_Allowed:
+ Features.AddFeature("neon", false);
+ Features.AddFeature("fp16", false);
+ break;
+ case ARMBuildAttrs::AllowNeon:
+ Features.AddFeature("neon");
break;
- case ELF::EF_MIPS_MACH_OCTEON:
- Features.AddFeature("cnmips");
+ case ARMBuildAttrs::AllowNeon2:
+ Features.AddFeature("neon");
+ Features.AddFeature("fp16");
break;
+ }
+ }
+
+ if (Attributes.hasAttribute(ARMBuildAttrs::DIV_use)) {
+ switch (Attributes.getAttributeValue(ARMBuildAttrs::DIV_use)) {
default:
- llvm_unreachable("Unknown EF_MIPS_ARCH value");
+ break;
+ case ARMBuildAttrs::DisallowDIV:
+ Features.AddFeature("hwdiv", false);
+ Features.AddFeature("hwdiv-arm", false);
+ break;
+ case ARMBuildAttrs::AllowDIVExt:
+ Features.AddFeature("hwdiv");
+ Features.AddFeature("hwdiv-arm");
+ break;
}
+ }
- if (PlatformFlags & ELF::EF_MIPS_ARCH_ASE_M16)
- Features.AddFeature("mips16");
- if (PlatformFlags & ELF::EF_MIPS_MICROMIPS)
- Features.AddFeature("micromips");
+ return Features;
+}
- return Features;
- }
+SubtargetFeatures ELFObjectFileBase::getFeatures() const {
+ switch (getEMachine()) {
+ case ELF::EM_MIPS:
+ return getMIPSFeatures();
+ case ELF::EM_ARM:
+ return getARMFeatures();
default:
return SubtargetFeatures();
}
}
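+
+// Hypothetical caller sketch (not part of this patch): the returned features
+// are typically forwarded to the MC layer, e.g.
+//   SubtargetFeatures F = Obj->getFeatures();
+//   STI = TheTarget->createMCSubtargetInfo(TripleName, MCPU, F.getString());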
+// FIXME: Encode this from a tablegen description or the target parser.
+void ELFObjectFileBase::setARMSubArch(Triple &TheTriple) const {
+ if (TheTriple.getSubArch() != Triple::NoSubArch)
+ return;
+
+ ARMAttributeParser Attributes;
+ std::error_code EC = getBuildAttributes(Attributes);
+ if (EC)
+ return;
+
+ std::string Triple;
+ // Default to an "arm" prefix, but use "thumb" if the triple's arch is
+ // already a Thumb variant.
+ if (TheTriple.getArch() == Triple::thumb ||
+ TheTriple.getArch() == Triple::thumbeb)
+ Triple = "thumb";
+ else
+ Triple = "arm";
+
+ if (Attributes.hasAttribute(ARMBuildAttrs::CPU_arch)) {
+ switch (Attributes.getAttributeValue(ARMBuildAttrs::CPU_arch)) {
+ case ARMBuildAttrs::v4:
+ Triple += "v4";
+ break;
+ case ARMBuildAttrs::v4T:
+ Triple += "v4t";
+ break;
+ case ARMBuildAttrs::v5T:
+ Triple += "v5t";
+ break;
+ case ARMBuildAttrs::v5TE:
+ Triple += "v5te";
+ break;
+ case ARMBuildAttrs::v5TEJ:
+ Triple += "v5tej";
+ break;
+ case ARMBuildAttrs::v6:
+ Triple += "v6";
+ break;
+ case ARMBuildAttrs::v6KZ:
+ Triple += "v6kz";
+ break;
+ case ARMBuildAttrs::v6T2:
+ Triple += "v6t2";
+ break;
+ case ARMBuildAttrs::v6K:
+ Triple += "v6k";
+ break;
+ case ARMBuildAttrs::v7:
+ Triple += "v7";
+ break;
+ case ARMBuildAttrs::v6_M:
+ Triple += "v6m";
+ break;
+ case ARMBuildAttrs::v6S_M:
+ Triple += "v6sm";
+ break;
+ case ARMBuildAttrs::v7E_M:
+ Triple += "v7em";
+ break;
+ }
+ }
+ if (!isLittleEndian())
+ Triple += "eb";
+
+ TheTriple.setArchName(Triple);
+}
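+
+// Hypothetical usage sketch (not in the source): given a bare triple,
+//   Triple T("arm-unknown-linux-gnueabi");
+//   Obj->setARMSubArch(T);  // T's arch may be refined to e.g. "armv7"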
+
} // end namespace llvm
diff --git a/lib/Object/IRSymtab.cpp b/lib/Object/IRSymtab.cpp
new file mode 100644
index 000000000000..da1ef9946b50
--- /dev/null
+++ b/lib/Object/IRSymtab.cpp
@@ -0,0 +1,231 @@
+//===- IRSymtab.cpp - implementation of IR symbol tables --------*- C++ -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/Object/IRSymtab.h"
+#include "llvm/Analysis/ObjectUtils.h"
+#include "llvm/IR/Mangler.h"
+#include "llvm/IR/Module.h"
+#include "llvm/MC/StringTableBuilder.h"
+#include "llvm/Object/ModuleSymbolTable.h"
+#include "llvm/Support/Allocator.h"
+#include "llvm/Support/StringSaver.h"
+
+using namespace llvm;
+using namespace irsymtab;
+
+namespace {
+
+/// Stores the temporary state that is required to build an IR symbol table.
+struct Builder {
+ SmallVector<char, 0> &Symtab;
+ SmallVector<char, 0> &Strtab;
+ Builder(SmallVector<char, 0> &Symtab, SmallVector<char, 0> &Strtab)
+ : Symtab(Symtab), Strtab(Strtab) {}
+
+ StringTableBuilder StrtabBuilder{StringTableBuilder::ELF};
+
+ BumpPtrAllocator Alloc;
+ StringSaver Saver{Alloc};
+
+ DenseMap<const Comdat *, unsigned> ComdatMap;
+ ModuleSymbolTable Msymtab;
+ SmallPtrSet<GlobalValue *, 8> Used;
+ Mangler Mang;
+ Triple TT;
+
+ std::vector<storage::Comdat> Comdats;
+ std::vector<storage::Module> Mods;
+ std::vector<storage::Symbol> Syms;
+ std::vector<storage::Uncommon> Uncommons;
+
+ std::string COFFLinkerOpts;
+ raw_string_ostream COFFLinkerOptsOS{COFFLinkerOpts};
+
+ void setStr(storage::Str &S, StringRef Value) {
+ S.Offset = StrtabBuilder.add(Value);
+ }
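+ // Copies Objs into the flat Symtab buffer and records where it landed as a
+ // Range (byte offset into Symtab plus element count) in the header field R.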
+ template <typename T>
+ void writeRange(storage::Range<T> &R, const std::vector<T> &Objs) {
+ R.Offset = Symtab.size();
+ R.Size = Objs.size();
+ Symtab.insert(Symtab.end(), reinterpret_cast<const char *>(Objs.data()),
+ reinterpret_cast<const char *>(Objs.data() + Objs.size()));
+ }
+
+ Error addModule(Module *M);
+ Error addSymbol(ModuleSymbolTable::Symbol Sym);
+
+ Error build(ArrayRef<Module *> Mods);
+};
+
+Error Builder::addModule(Module *M) {
+ collectUsedGlobalVariables(*M, Used, /*CompilerUsed*/ false);
+
+ storage::Module Mod;
+ Mod.Begin = Msymtab.symbols().size();
+ Msymtab.addModule(M);
+ Mod.End = Msymtab.symbols().size();
+ Mods.push_back(Mod);
+
+ if (TT.isOSBinFormatCOFF()) {
+ if (auto E = M->materializeMetadata())
+ return E;
+ if (Metadata *Val = M->getModuleFlag("Linker Options")) {
+ MDNode *LinkerOptions = cast<MDNode>(Val);
+ for (const MDOperand &MDOptions : LinkerOptions->operands())
+ for (const MDOperand &MDOption : cast<MDNode>(MDOptions)->operands())
+ COFFLinkerOptsOS << " " << cast<MDString>(MDOption)->getString();
+ }
+ }
+
+ return Error::success();
+}
+
+Error Builder::addSymbol(ModuleSymbolTable::Symbol Msym) {
+ Syms.emplace_back();
+ storage::Symbol &Sym = Syms.back();
+ Sym = {};
+
+ Sym.UncommonIndex = -1;
+ storage::Uncommon *Unc = nullptr;
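+ // Lazily allocate an Uncommon record the first time one of the rarely-used
+ // fields is needed; most symbols never require one.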
+ auto Uncommon = [&]() -> storage::Uncommon & {
+ if (Unc)
+ return *Unc;
+ Sym.UncommonIndex = Uncommons.size();
+ Uncommons.emplace_back();
+ Unc = &Uncommons.back();
+ *Unc = {};
+ setStr(Unc->COFFWeakExternFallbackName, "");
+ return *Unc;
+ };
+
+ SmallString<64> Name;
+ {
+ raw_svector_ostream OS(Name);
+ Msymtab.printSymbolName(OS, Msym);
+ }
+ setStr(Sym.Name, Saver.save(StringRef(Name)));
+
+ auto Flags = Msymtab.getSymbolFlags(Msym);
+ if (Flags & object::BasicSymbolRef::SF_Undefined)
+ Sym.Flags |= 1 << storage::Symbol::FB_undefined;
+ if (Flags & object::BasicSymbolRef::SF_Weak)
+ Sym.Flags |= 1 << storage::Symbol::FB_weak;
+ if (Flags & object::BasicSymbolRef::SF_Common)
+ Sym.Flags |= 1 << storage::Symbol::FB_common;
+ if (Flags & object::BasicSymbolRef::SF_Indirect)
+ Sym.Flags |= 1 << storage::Symbol::FB_indirect;
+ if (Flags & object::BasicSymbolRef::SF_Global)
+ Sym.Flags |= 1 << storage::Symbol::FB_global;
+ if (Flags & object::BasicSymbolRef::SF_FormatSpecific)
+ Sym.Flags |= 1 << storage::Symbol::FB_format_specific;
+ if (Flags & object::BasicSymbolRef::SF_Executable)
+ Sym.Flags |= 1 << storage::Symbol::FB_executable;
+
+ Sym.ComdatIndex = -1;
+ auto *GV = Msym.dyn_cast<GlobalValue *>();
+ if (!GV) {
+ setStr(Sym.IRName, "");
+ return Error::success();
+ }
+
+ setStr(Sym.IRName, GV->getName());
+
+ if (Used.count(GV))
+ Sym.Flags |= 1 << storage::Symbol::FB_used;
+ if (GV->isThreadLocal())
+ Sym.Flags |= 1 << storage::Symbol::FB_tls;
+ if (GV->hasGlobalUnnamedAddr())
+ Sym.Flags |= 1 << storage::Symbol::FB_unnamed_addr;
+ if (canBeOmittedFromSymbolTable(GV))
+ Sym.Flags |= 1 << storage::Symbol::FB_may_omit;
+ Sym.Flags |= unsigned(GV->getVisibility()) << storage::Symbol::FB_visibility;
+
+ if (Flags & object::BasicSymbolRef::SF_Common) {
+ Uncommon().CommonSize = GV->getParent()->getDataLayout().getTypeAllocSize(
+ GV->getType()->getElementType());
+ Uncommon().CommonAlign = GV->getAlignment();
+ }
+
+ const GlobalObject *Base = GV->getBaseObject();
+ if (!Base)
+ return make_error<StringError>("Unable to determine comdat of alias!",
+ inconvertibleErrorCode());
+ if (const Comdat *C = Base->getComdat()) {
+ auto P = ComdatMap.insert(std::make_pair(C, Comdats.size()));
+ Sym.ComdatIndex = P.first->second;
+
+ if (P.second) {
+ storage::Comdat Comdat;
+ setStr(Comdat.Name, C->getName());
+ Comdats.push_back(Comdat);
+ }
+ }
+
+ if (TT.isOSBinFormatCOFF()) {
+ emitLinkerFlagsForGlobalCOFF(COFFLinkerOptsOS, GV, TT, Mang);
+
+ if ((Flags & object::BasicSymbolRef::SF_Weak) &&
+ (Flags & object::BasicSymbolRef::SF_Indirect)) {
+ std::string FallbackName;
+ raw_string_ostream OS(FallbackName);
+ Msymtab.printSymbolName(
+ OS, cast<GlobalValue>(
+ cast<GlobalAlias>(GV)->getAliasee()->stripPointerCasts()));
+ OS.flush();
+ setStr(Uncommon().COFFWeakExternFallbackName, Saver.save(FallbackName));
+ }
+ }
+
+ return Error::success();
+}
+
+Error Builder::build(ArrayRef<Module *> IRMods) {
+ storage::Header Hdr;
+
+ assert(!IRMods.empty());
+ setStr(Hdr.TargetTriple, IRMods[0]->getTargetTriple());
+ setStr(Hdr.SourceFileName, IRMods[0]->getSourceFileName());
+ TT = Triple(IRMods[0]->getTargetTriple());
+
+ // This adds the symbols for each module to Msymtab.
+ for (auto *M : IRMods)
+ if (Error Err = addModule(M))
+ return Err;
+
+ for (ModuleSymbolTable::Symbol Msym : Msymtab.symbols())
+ if (Error Err = addSymbol(Msym))
+ return Err;
+
+ COFFLinkerOptsOS.flush();
+ setStr(Hdr.COFFLinkerOpts, COFFLinkerOpts);
+
+ // We are about to fill in the header's range fields, so reserve space for it
+ // and copy it in afterwards.
+ Symtab.resize(sizeof(storage::Header));
+ writeRange(Hdr.Modules, Mods);
+ writeRange(Hdr.Comdats, Comdats);
+ writeRange(Hdr.Symbols, Syms);
+ writeRange(Hdr.Uncommons, Uncommons);
+
+ *reinterpret_cast<storage::Header *>(Symtab.data()) = Hdr;
+
+ raw_svector_ostream OS(Strtab);
+ StrtabBuilder.finalizeInOrder();
+ StrtabBuilder.write(OS);
+
+ return Error::success();
+}
+
+} // anonymous namespace
+
+Error irsymtab::build(ArrayRef<Module *> Mods, SmallVector<char, 0> &Symtab,
+ SmallVector<char, 0> &Strtab) {
+ return Builder(Symtab, Strtab).build(Mods);
+}
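+
+// Hypothetical usage sketch (not part of the file): serializing one module:
+//   SmallVector<char, 0> Symtab, Strtab;
+//   if (Error E = irsymtab::build({&M}, Symtab, Strtab))
+//     return E;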
diff --git a/lib/Object/MachOObjectFile.cpp b/lib/Object/MachOObjectFile.cpp
index 5b018676eba3..1753d2baaedd 100644
--- a/lib/Object/MachOObjectFile.cpp
+++ b/lib/Object/MachOObjectFile.cpp
@@ -106,13 +106,6 @@ static StringRef parseSegmentOrSectionName(const char *P) {
return StringRef(P, 16);
}
-// Helper to advance a section or symbol iterator multiple increments at a time.
-template<class T>
-static void advance(T &it, size_t Val) {
- while (Val--)
- ++it;
-}
-
static unsigned getCPUType(const MachOObjectFile &O) {
return O.getHeader().cputype;
}
@@ -368,7 +361,7 @@ static Error parseSegmentLoadCommand(
CmdName + " extends past the end of the file");
if (S.vmsize != 0 && S.filesize > S.vmsize)
return malformedError("load command " + Twine(LoadCommandIndex) +
- " fileoff field in " + CmdName +
+ " filesize field in " + CmdName +
" greater than vmsize field");
IsPageZeroSegment |= StringRef("__PAGEZERO").equals(S.segname);
} else
@@ -784,6 +777,52 @@ static Error checkVersCommand(const MachOObjectFile &Obj,
return Error::success();
}
+static Error checkNoteCommand(const MachOObjectFile &Obj,
+ const MachOObjectFile::LoadCommandInfo &Load,
+ uint32_t LoadCommandIndex,
+ std::list<MachOElement> &Elements) {
+ if (Load.C.cmdsize != sizeof(MachO::note_command))
+ return malformedError("load command " + Twine(LoadCommandIndex) +
+ " LC_NOTE has incorrect cmdsize");
+ MachO::note_command Nt = getStruct<MachO::note_command>(Obj, Load.Ptr);
+ uint64_t FileSize = Obj.getData().size();
+ if (Nt.offset > FileSize)
+ return malformedError("offset field of LC_NOTE command " +
+ Twine(LoadCommandIndex) + " extends "
+ "past the end of the file");
+ uint64_t BigSize = Nt.offset;
+ BigSize += Nt.size;
+ if (BigSize > FileSize)
+ return malformedError("size field plus offset field of LC_NOTE command " +
+ Twine(LoadCommandIndex) + " extends past the end of "
+ "the file");
+ if (Error Err = checkOverlappingElement(Elements, Nt.offset, Nt.size,
+ "LC_NOTE data"))
+ return Err;
+ return Error::success();
+}
+
+static Error
+parseBuildVersionCommand(const MachOObjectFile &Obj,
+ const MachOObjectFile::LoadCommandInfo &Load,
+ SmallVectorImpl<const char*> &BuildTools,
+ uint32_t LoadCommandIndex) {
+ MachO::build_version_command BVC =
+ getStruct<MachO::build_version_command>(Obj, Load.Ptr);
+ if (Load.C.cmdsize !=
+ sizeof(MachO::build_version_command) +
+ BVC.ntools * sizeof(MachO::build_tool_version))
+ return malformedError("load command " + Twine(LoadCommandIndex) +
+ " LC_BUILD_VERSION_COMMAND has incorrect cmdsize");
+
+ auto Start = Load.Ptr + sizeof(MachO::build_version_command);
+ BuildTools.resize(BVC.ntools);
+ for (unsigned i = 0; i < BVC.ntools; ++i)
+ BuildTools[i] = Start + i * sizeof(MachO::build_tool_version);
+
+ return Error::success();
+}
+
static Error checkRpathCommand(const MachOObjectFile &Obj,
const MachOObjectFile::LoadCommandInfo &Load,
uint32_t LoadCommandIndex) {
@@ -931,7 +970,26 @@ static Error checkThreadCommand(const MachOObjectFile &Obj,
sys::swapByteOrder(count);
state += sizeof(uint32_t);
- if (cputype == MachO::CPU_TYPE_X86_64) {
+ if (cputype == MachO::CPU_TYPE_I386) {
+ if (flavor == MachO::x86_THREAD_STATE32) {
+ if (count != MachO::x86_THREAD_STATE32_COUNT)
+ return malformedError("load command " + Twine(LoadCommandIndex) +
+ " count not x86_THREAD_STATE32_COUNT for "
+ "flavor number " + Twine(nflavor) + " which is "
+ "a x86_THREAD_STATE32 flavor in " + CmdName +
+ " command");
+ if (state + sizeof(MachO::x86_thread_state32_t) > end)
+ return malformedError("load command " + Twine(LoadCommandIndex) +
+ " x86_THREAD_STATE32 extends past end of "
+ "command in " + CmdName + " command");
+ state += sizeof(MachO::x86_thread_state32_t);
+ } else {
+ return malformedError("load command " + Twine(LoadCommandIndex) +
+ " unknown flavor (" + Twine(flavor) + ") for "
+ "flavor number " + Twine(nflavor) + " in " +
+ CmdName + " command");
+ }
+ } else if (cputype == MachO::CPU_TYPE_X86_64) {
if (flavor == MachO::x86_THREAD_STATE64) {
if (count != MachO::x86_THREAD_STATE64_COUNT)
return malformedError("load command " + Twine(LoadCommandIndex) +
@@ -1280,6 +1338,12 @@ MachOObjectFile::MachOObjectFile(MemoryBufferRef Object, bool IsLittleEndian,
if ((Err = checkVersCommand(*this, Load, I, &VersLoadCmd,
"LC_VERSION_MIN_WATCHOS")))
return;
+ } else if (Load.C.cmd == MachO::LC_NOTE) {
+ if ((Err = checkNoteCommand(*this, Load, I, Elements)))
+ return;
+ } else if (Load.C.cmd == MachO::LC_BUILD_VERSION) {
+ if ((Err = parseBuildVersionCommand(*this, Load, BuildTools, I)))
+ return;
} else if (Load.C.cmd == MachO::LC_RPATH) {
if ((Err = checkRpathCommand(*this, Load, I)))
return;
@@ -2201,6 +2265,10 @@ std::error_code MachOObjectFile::getLibraryShortNameByIndex(unsigned Index,
return std::error_code();
}
+uint32_t MachOObjectFile::getLibraryCount() const {
+ return Libraries.size();
+}
+
section_iterator
MachOObjectFile::getRelocationRelocatedSection(relocation_iterator Rel) const {
DataRefImpl Sec;
@@ -2383,6 +2451,8 @@ Triple MachOObjectFile::getArchTriple(uint32_t CPUType, uint32_t CPUSubType,
*ArchFlag = "armv7em";
return Triple("thumbv7em-apple-darwin");
case MachO::CPU_SUBTYPE_ARM_V7K:
+ if (McpuDefault)
+ *McpuDefault = "cortex-a7";
if (ArchFlag)
*ArchFlag = "armv7k";
return Triple("armv7k-apple-darwin");
@@ -2393,6 +2463,8 @@ Triple MachOObjectFile::getArchTriple(uint32_t CPUType, uint32_t CPUSubType,
*ArchFlag = "armv7m";
return Triple("thumbv7m-apple-darwin");
case MachO::CPU_SUBTYPE_ARM_V7S:
+ if (McpuDefault)
+ *McpuDefault = "cortex-a7";
if (ArchFlag)
*ArchFlag = "armv7s";
return Triple("armv7s-apple-darwin");
@@ -2402,6 +2474,8 @@ Triple MachOObjectFile::getArchTriple(uint32_t CPUType, uint32_t CPUSubType,
case MachO::CPU_TYPE_ARM64:
switch (CPUSubType & ~MachO::CPU_SUBTYPE_MASK) {
case MachO::CPU_SUBTYPE_ARM64_ALL:
+ if (McpuDefault)
+ *McpuDefault = "cyclone";
if (ArchFlag)
*ArchFlag = "arm64";
return Triple("arm64-apple-darwin");
@@ -2674,10 +2748,11 @@ iterator_range<export_iterator> MachOObjectFile::exports() const {
return exports(getDyldInfoExportsTrie());
}
-MachORebaseEntry::MachORebaseEntry(ArrayRef<uint8_t> Bytes, bool is64Bit)
- : Opcodes(Bytes), Ptr(Bytes.begin()), SegmentOffset(0), SegmentIndex(0),
- RemainingLoopCount(0), AdvanceAmount(0), RebaseType(0),
- PointerSize(is64Bit ? 8 : 4), Malformed(false), Done(false) {}
+MachORebaseEntry::MachORebaseEntry(Error *E, const MachOObjectFile *O,
+ ArrayRef<uint8_t> Bytes, bool is64Bit)
+ : E(E), O(O), Opcodes(Bytes), Ptr(Bytes.begin()), SegmentOffset(0),
+ SegmentIndex(-1), RemainingLoopCount(0), AdvanceAmount(0), RebaseType(0),
+ PointerSize(is64Bit ? 8 : 4), Done(false) {}
void MachORebaseEntry::moveToFirst() {
Ptr = Opcodes.begin();
@@ -2691,22 +2766,29 @@ void MachORebaseEntry::moveToEnd() {
}
void MachORebaseEntry::moveNext() {
+ ErrorAsOutParameter ErrAsOutParam(E);
// If in the middle of some loop, move to next rebasing in loop.
SegmentOffset += AdvanceAmount;
if (RemainingLoopCount) {
--RemainingLoopCount;
return;
}
+ // REBASE_OPCODE_DONE is only used for padding if we are not aligned to the
+ // pointer size, so it is possible to reach the end without ever having seen
+ // REBASE_OPCODE_DONE.
if (Ptr == Opcodes.end()) {
Done = true;
return;
}
bool More = true;
- while (More && !Malformed) {
+ while (More) {
// Parse next opcode and set up next loop.
+ const uint8_t *OpcodeStart = Ptr;
uint8_t Byte = *Ptr++;
uint8_t ImmValue = Byte & MachO::REBASE_IMMEDIATE_MASK;
uint8_t Opcode = Byte & MachO::REBASE_OPCODE_MASK;
+ uint32_t Count, Skip;
+ const char *error = nullptr;
switch (Opcode) {
case MachO::REBASE_OPCODE_DONE:
More = false;
@@ -2716,6 +2798,13 @@ void MachORebaseEntry::moveNext() {
break;
case MachO::REBASE_OPCODE_SET_TYPE_IMM:
RebaseType = ImmValue;
+ if (RebaseType > MachO::REBASE_TYPE_TEXT_PCREL32) {
+ *E = malformedError("for REBASE_OPCODE_SET_TYPE_IMM bad bind type: " +
+ Twine((int)RebaseType) + " for opcode at: 0x" +
+ utohexstr(OpcodeStart - Opcodes.begin()));
+ moveToEnd();
+ return;
+ }
DEBUG_WITH_TYPE(
"mach-o-rebase",
llvm::dbgs() << "REBASE_OPCODE_SET_TYPE_IMM: "
@@ -2723,7 +2812,23 @@ void MachORebaseEntry::moveNext() {
break;
case MachO::REBASE_OPCODE_SET_SEGMENT_AND_OFFSET_ULEB:
SegmentIndex = ImmValue;
- SegmentOffset = readULEB128();
+ SegmentOffset = readULEB128(&error);
+ if (error) {
+ *E = malformedError("for REBASE_OPCODE_SET_SEGMENT_AND_OFFSET_ULEB " +
+ Twine(error) + " for opcode at: 0x" +
+ utohexstr(OpcodeStart - Opcodes.begin()));
+ moveToEnd();
+ return;
+ }
+ error = O->RebaseEntryCheckSegAndOffset(SegmentIndex, SegmentOffset,
+ true);
+ if (error) {
+ *E = malformedError("for REBASE_OPCODE_SET_SEGMENT_AND_OFFSET_ULEB " +
+ Twine(error) + " for opcode at: 0x" +
+ utohexstr(OpcodeStart - Opcodes.begin()));
+ moveToEnd();
+ return;
+ }
DEBUG_WITH_TYPE(
"mach-o-rebase",
llvm::dbgs() << "REBASE_OPCODE_SET_SEGMENT_AND_OFFSET_ULEB: "
@@ -2732,22 +2837,80 @@ void MachORebaseEntry::moveNext() {
<< "\n");
break;
case MachO::REBASE_OPCODE_ADD_ADDR_ULEB:
- SegmentOffset += readULEB128();
+ SegmentOffset += readULEB128(&error);
+ if (error) {
+ *E = malformedError("for REBASE_OPCODE_ADD_ADDR_ULEB " +
+ Twine(error) + " for opcode at: 0x" +
+ utohexstr(OpcodeStart - Opcodes.begin()));
+ moveToEnd();
+ return;
+ }
+ error = O->RebaseEntryCheckSegAndOffset(SegmentIndex, SegmentOffset,
+ true);
+ if (error) {
+ *E = malformedError("for REBASE_OPCODE_ADD_ADDR_ULEB " +
+ Twine(error) + " for opcode at: 0x" +
+ utohexstr(OpcodeStart - Opcodes.begin()));
+ moveToEnd();
+ return;
+ }
DEBUG_WITH_TYPE("mach-o-rebase",
llvm::dbgs() << "REBASE_OPCODE_ADD_ADDR_ULEB: "
<< format("SegmentOffset=0x%06X",
SegmentOffset) << "\n");
break;
case MachO::REBASE_OPCODE_ADD_ADDR_IMM_SCALED:
+ error = O->RebaseEntryCheckSegAndOffset(SegmentIndex, SegmentOffset,
+ true);
+ if (error) {
+ *E = malformedError("for REBASE_OPCODE_ADD_ADDR_IMM_SCALED " +
+ Twine(error) + " for opcode at: 0x" +
+ utohexstr(OpcodeStart - Opcodes.begin()));
+ moveToEnd();
+ return;
+ }
SegmentOffset += ImmValue * PointerSize;
+ error = O->RebaseEntryCheckSegAndOffset(SegmentIndex, SegmentOffset,
+ false);
+ if (error) {
+ *E = malformedError("for REBASE_OPCODE_ADD_ADDR_IMM_SCALED "
+ " (after adding immediate times the pointer size) " +
+ Twine(error) + " for opcode at: 0x" +
+ utohexstr(OpcodeStart - Opcodes.begin()));
+ moveToEnd();
+ return;
+ }
DEBUG_WITH_TYPE("mach-o-rebase",
llvm::dbgs() << "REBASE_OPCODE_ADD_ADDR_IMM_SCALED: "
<< format("SegmentOffset=0x%06X",
SegmentOffset) << "\n");
break;
case MachO::REBASE_OPCODE_DO_REBASE_IMM_TIMES:
+ error = O->RebaseEntryCheckSegAndOffset(SegmentIndex, SegmentOffset,
+ true);
+ if (error) {
+ *E = malformedError("for REBASE_OPCODE_DO_REBASE_IMM_TIMES " +
+ Twine(error) + " for opcode at: 0x" +
+ utohexstr(OpcodeStart - Opcodes.begin()));
+ moveToEnd();
+ return;
+ }
AdvanceAmount = PointerSize;
- RemainingLoopCount = ImmValue - 1;
+ Skip = 0;
+ Count = ImmValue;
+ if (ImmValue != 0)
+ RemainingLoopCount = ImmValue - 1;
+ else
+ RemainingLoopCount = 0;
+ error = O->RebaseEntryCheckCountAndSkip(Count, Skip, PointerSize,
+ SegmentIndex, SegmentOffset);
+ if (error) {
+ *E = malformedError("for REBASE_OPCODE_DO_REBASE_IMM_TIMES "
+ + Twine(error) + " for opcode at: 0x" +
+ utohexstr(OpcodeStart - Opcodes.begin()));
+ moveToEnd();
+ return;
+ }
DEBUG_WITH_TYPE(
"mach-o-rebase",
llvm::dbgs() << "REBASE_OPCODE_DO_REBASE_IMM_TIMES: "
@@ -2757,8 +2920,38 @@ void MachORebaseEntry::moveNext() {
<< "\n");
return;
case MachO::REBASE_OPCODE_DO_REBASE_ULEB_TIMES:
+ error = O->RebaseEntryCheckSegAndOffset(SegmentIndex, SegmentOffset,
+ true);
+ if (error) {
+ *E = malformedError("for REBASE_OPCODE_DO_REBASE_ULEB_TIMES " +
+ Twine(error) + " for opcode at: 0x" +
+ utohexstr(OpcodeStart - Opcodes.begin()));
+ moveToEnd();
+ return;
+ }
AdvanceAmount = PointerSize;
- RemainingLoopCount = readULEB128() - 1;
+ Skip = 0;
+ Count = readULEB128(&error);
+ if (error) {
+ *E = malformedError("for REBASE_OPCODE_DO_REBASE_ULEB_TIMES " +
+ Twine(error) + " for opcode at: 0x" +
+ utohexstr(OpcodeStart - Opcodes.begin()));
+ moveToEnd();
+ return;
+ }
+ if (Count != 0)
+ RemainingLoopCount = Count - 1;
+ else
+ RemainingLoopCount = 0;
+ error = O->RebaseEntryCheckCountAndSkip(Count, Skip, PointerSize,
+ SegmentIndex, SegmentOffset);
+ if (error) {
+ *E = malformedError("for REBASE_OPCODE_DO_REBASE_ULEB_TIMES "
+ + Twine(error) + " for opcode at: 0x" +
+ utohexstr(OpcodeStart - Opcodes.begin()));
+ moveToEnd();
+ return;
+ }
DEBUG_WITH_TYPE(
"mach-o-rebase",
llvm::dbgs() << "REBASE_OPCODE_DO_REBASE_ULEB_TIMES: "
@@ -2768,8 +2961,35 @@ void MachORebaseEntry::moveNext() {
<< "\n");
return;
case MachO::REBASE_OPCODE_DO_REBASE_ADD_ADDR_ULEB:
- AdvanceAmount = readULEB128() + PointerSize;
+ error = O->RebaseEntryCheckSegAndOffset(SegmentIndex, SegmentOffset,
+ true);
+ if (error) {
+ *E = malformedError("for REBASE_OPCODE_DO_REBASE_ADD_ADDR_ULEB " +
+ Twine(error) + " for opcode at: 0x" +
+ utohexstr(OpcodeStart - Opcodes.begin()));
+ moveToEnd();
+ return;
+ }
+ Skip = readULEB128(&error);
+ if (error) {
+ *E = malformedError("for REBASE_OPCODE_DO_REBASE_ADD_ADDR_ULEB " +
+ Twine(error) + " for opcode at: 0x" +
+ utohexstr(OpcodeStart - Opcodes.begin()));
+ moveToEnd();
+ return;
+ }
+ AdvanceAmount = Skip + PointerSize;
+ Count = 1;
RemainingLoopCount = 0;
+ error = O->RebaseEntryCheckCountAndSkip(Count, Skip, PointerSize,
+ SegmentIndex, SegmentOffset);
+ if (error) {
+ *E = malformedError("for REBASE_OPCODE_DO_REBASE_ADD_ADDR_ULEB "
+ + Twine(error) + " for opcode at: 0x" +
+ utohexstr(OpcodeStart - Opcodes.begin()));
+ moveToEnd();
+ return;
+ }
DEBUG_WITH_TYPE(
"mach-o-rebase",
llvm::dbgs() << "REBASE_OPCODE_DO_REBASE_ADD_ADDR_ULEB: "
@@ -2779,8 +2999,46 @@ void MachORebaseEntry::moveNext() {
<< "\n");
return;
case MachO::REBASE_OPCODE_DO_REBASE_ULEB_TIMES_SKIPPING_ULEB:
- RemainingLoopCount = readULEB128() - 1;
- AdvanceAmount = readULEB128() + PointerSize;
+ error = O->RebaseEntryCheckSegAndOffset(SegmentIndex, SegmentOffset,
+ true);
+ if (error) {
+ *E = malformedError("for REBASE_OPCODE_DO_REBASE_ULEB_TIMES_SKIPPING_"
+ "ULEB " + Twine(error) + " for opcode at: 0x" +
+ utohexstr(OpcodeStart - Opcodes.begin()));
+ moveToEnd();
+ return;
+ }
+ Count = readULEB128(&error);
+ if (error) {
+ *E = malformedError("for REBASE_OPCODE_DO_REBASE_ULEB_TIMES_SKIPPING_"
+ "ULEB " + Twine(error) + " for opcode at: 0x" +
+ utohexstr(OpcodeStart - Opcodes.begin()));
+ moveToEnd();
+ return;
+ }
+ if (Count != 0)
+ RemainingLoopCount = Count - 1;
+ else
+ RemainingLoopCount = 0;
+ Skip = readULEB128(&error);
+ if (error) {
+ *E = malformedError("for REBASE_OPCODE_DO_REBASE_ULEB_TIMES_SKIPPING_"
+ "ULEB " + Twine(error) + " for opcode at: 0x" +
+ utohexstr(OpcodeStart - Opcodes.begin()));
+ moveToEnd();
+ return;
+ }
+ AdvanceAmount = Skip + PointerSize;
+
+ error = O->RebaseEntryCheckCountAndSkip(Count, Skip, PointerSize,
+ SegmentIndex, SegmentOffset);
+ if (error) {
+ *E = malformedError("for REBASE_OPCODE_DO_REBASE_ULEB_TIMES_SKIPPING_"
+ "ULEB " + Twine(error) + " for opcode at: 0x" +
+ utohexstr(OpcodeStart - Opcodes.begin()));
+ moveToEnd();
+ return;
+ }
DEBUG_WITH_TYPE(
"mach-o-rebase",
llvm::dbgs() << "REBASE_OPCODE_DO_REBASE_ULEB_TIMES_SKIPPING_ULEB: "
@@ -2790,23 +3048,25 @@ void MachORebaseEntry::moveNext() {
<< "\n");
return;
default:
- Malformed = true;
+ *E = malformedError("bad rebase info (bad opcode value 0x" +
+ utohexstr(Opcode) + " for opcode at: 0x" +
+ utohexstr(OpcodeStart - Opcodes.begin()));
+ moveToEnd();
+ return;
}
}
}
-uint64_t MachORebaseEntry::readULEB128() {
+uint64_t MachORebaseEntry::readULEB128(const char **error) {
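+ // Let decodeULEB128 bounds-check against the end of the opcode stream and
+ // report truncated or malformed encodings through *error.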
unsigned Count;
- uint64_t Result = decodeULEB128(Ptr, &Count);
+ uint64_t Result = decodeULEB128(Ptr, &Count, Opcodes.end(), error);
Ptr += Count;
- if (Ptr > Opcodes.end()) {
+ if (Ptr > Opcodes.end())
Ptr = Opcodes.end();
- Malformed = true;
- }
return Result;
}
-uint32_t MachORebaseEntry::segmentIndex() const { return SegmentIndex; }
+int32_t MachORebaseEntry::segmentIndex() const { return SegmentIndex; }
uint64_t MachORebaseEntry::segmentOffset() const { return SegmentOffset; }
@@ -2822,6 +3082,24 @@ StringRef MachORebaseEntry::typeName() const {
return "unknown";
}
+// For use with the SegIndex of a checked Mach-O Rebase entry
+// to get the segment name.
+StringRef MachORebaseEntry::segmentName() const {
+ return O->BindRebaseSegmentName(SegmentIndex);
+}
+
+// For use with a SegIndex,SegOffset pair from a checked Mach-O Rebase entry
+// to get the section name.
+StringRef MachORebaseEntry::sectionName() const {
+ return O->BindRebaseSectionName(SegmentIndex, SegmentOffset);
+}
+
+// For use with a SegIndex,SegOffset pair from a checked Mach-O Rebase entry
+// to get the address.
+uint64_t MachORebaseEntry::address() const {
+ return O->BindRebaseAddress(SegmentIndex, SegmentOffset);
+}
+
bool MachORebaseEntry::operator==(const MachORebaseEntry &Other) const {
#ifdef EXPENSIVE_CHECKS
assert(Opcodes == Other.Opcodes && "compare iterators of different files");
@@ -2834,25 +3112,29 @@ bool MachORebaseEntry::operator==(const MachORebaseEntry &Other) const {
}
iterator_range<rebase_iterator>
-MachOObjectFile::rebaseTable(ArrayRef<uint8_t> Opcodes, bool is64) {
- MachORebaseEntry Start(Opcodes, is64);
+MachOObjectFile::rebaseTable(Error &Err, MachOObjectFile *O,
+ ArrayRef<uint8_t> Opcodes, bool is64) {
+ if (O->BindRebaseSectionTable == nullptr)
+ O->BindRebaseSectionTable = llvm::make_unique<BindRebaseSegInfo>(O);
+ MachORebaseEntry Start(&Err, O, Opcodes, is64);
Start.moveToFirst();
- MachORebaseEntry Finish(Opcodes, is64);
+ MachORebaseEntry Finish(&Err, O, Opcodes, is64);
Finish.moveToEnd();
return make_range(rebase_iterator(Start), rebase_iterator(Finish));
}
-iterator_range<rebase_iterator> MachOObjectFile::rebaseTable() const {
- return rebaseTable(getDyldInfoRebaseOpcodes(), is64Bit());
+iterator_range<rebase_iterator> MachOObjectFile::rebaseTable(Error &Err) {
+ return rebaseTable(Err, this, getDyldInfoRebaseOpcodes(), is64Bit());
}
-MachOBindEntry::MachOBindEntry(ArrayRef<uint8_t> Bytes, bool is64Bit, Kind BK)
- : Opcodes(Bytes), Ptr(Bytes.begin()), SegmentOffset(0), SegmentIndex(0),
- Ordinal(0), Flags(0), Addend(0), RemainingLoopCount(0), AdvanceAmount(0),
- BindType(0), PointerSize(is64Bit ? 8 : 4),
- TableKind(BK), Malformed(false), Done(false) {}
+MachOBindEntry::MachOBindEntry(Error *E, const MachOObjectFile *O,
+ ArrayRef<uint8_t> Bytes, bool is64Bit, Kind BK)
+ : E(E), O(O), Opcodes(Bytes), Ptr(Bytes.begin()), SegmentOffset(0),
+ SegmentIndex(-1), LibraryOrdinalSet(false), Ordinal(0), Flags(0),
+ Addend(0), RemainingLoopCount(0), AdvanceAmount(0), BindType(0),
+ PointerSize(is64Bit ? 8 : 4), TableKind(BK), Done(false) {}
void MachOBindEntry::moveToFirst() {
Ptr = Opcodes.begin();
@@ -2866,24 +3148,31 @@ void MachOBindEntry::moveToEnd() {
}
void MachOBindEntry::moveNext() {
+ ErrorAsOutParameter ErrAsOutParam(E);
// If in the middle of some loop, move to next binding in loop.
SegmentOffset += AdvanceAmount;
if (RemainingLoopCount) {
--RemainingLoopCount;
return;
}
+ // BIND_OPCODE_DONE is only used for padding if we are not aligned to
+ // pointer size. Therefore it is possible to reach the end without ever having
+ // seen BIND_OPCODE_DONE.
if (Ptr == Opcodes.end()) {
Done = true;
return;
}
bool More = true;
- while (More && !Malformed) {
+ while (More) {
// Parse next opcode and set up next loop.
+ const uint8_t *OpcodeStart = Ptr;
uint8_t Byte = *Ptr++;
uint8_t ImmValue = Byte & MachO::BIND_IMMEDIATE_MASK;
uint8_t Opcode = Byte & MachO::BIND_OPCODE_MASK;
int8_t SignExtended;
const uint8_t *SymStart;
+ uint32_t Count, Skip;
+ const char *error = nullptr;
switch (Opcode) {
case MachO::BIND_OPCODE_DONE:
if (TableKind == Kind::Lazy) {
@@ -2899,28 +3188,81 @@ void MachOBindEntry::moveNext() {
break;
}
More = false;
- Done = true;
moveToEnd();
DEBUG_WITH_TYPE("mach-o-bind", llvm::dbgs() << "BIND_OPCODE_DONE\n");
break;
case MachO::BIND_OPCODE_SET_DYLIB_ORDINAL_IMM:
+ if (TableKind == Kind::Weak) {
+ *E = malformedError("BIND_OPCODE_SET_DYLIB_ORDINAL_IMM not allowed in "
+ "weak bind table for opcode at: 0x" +
+ utohexstr(OpcodeStart - Opcodes.begin()));
+ moveToEnd();
+ return;
+ }
Ordinal = ImmValue;
+ LibraryOrdinalSet = true;
+ if (ImmValue > O->getLibraryCount()) {
+ *E = malformedError("for BIND_OPCODE_SET_DYLIB_ORDINAL_ULEB bad "
+ "library ordinal: " + Twine((int)ImmValue) + " (max " +
+ Twine((int)O->getLibraryCount()) + ") for opcode at: 0x" +
+ utohexstr(OpcodeStart - Opcodes.begin()));
+ moveToEnd();
+ return;
+ }
DEBUG_WITH_TYPE(
"mach-o-bind",
llvm::dbgs() << "BIND_OPCODE_SET_DYLIB_ORDINAL_IMM: "
<< "Ordinal=" << Ordinal << "\n");
break;
case MachO::BIND_OPCODE_SET_DYLIB_ORDINAL_ULEB:
- Ordinal = readULEB128();
+ if (TableKind == Kind::Weak) {
+ *E = malformedError("BIND_OPCODE_SET_DYLIB_ORDINAL_ULEB not allowed in "
+ "weak bind table for opcode at: 0x" +
+ utohexstr(OpcodeStart - Opcodes.begin()));
+ moveToEnd();
+ return;
+ }
+ Ordinal = readULEB128(&error);
+ LibraryOrdinalSet = true;
+ if (error) {
+ *E = malformedError("for BIND_OPCODE_SET_DYLIB_ORDINAL_ULEB " +
+ Twine(error) + " for opcode at: 0x" +
+ utohexstr(OpcodeStart - Opcodes.begin()));
+ moveToEnd();
+ return;
+ }
+ if (Ordinal > (int)O->getLibraryCount()) {
+ *E = malformedError("for BIND_OPCODE_SET_DYLIB_ORDINAL_ULEB bad "
+ "library ordinal: " + Twine((int)Ordinal) + " (max " +
+ Twine((int)O->getLibraryCount()) + ") for opcode at: 0x" +
+ utohexstr(OpcodeStart - Opcodes.begin()));
+ moveToEnd();
+ return;
+ }
DEBUG_WITH_TYPE(
"mach-o-bind",
llvm::dbgs() << "BIND_OPCODE_SET_DYLIB_ORDINAL_ULEB: "
<< "Ordinal=" << Ordinal << "\n");
break;
case MachO::BIND_OPCODE_SET_DYLIB_SPECIAL_IMM:
+ if (TableKind == Kind::Weak) {
+ *E = malformedError("BIND_OPCODE_SET_DYLIB_SPECIAL_IMM not allowed in "
+ "weak bind table for opcode at: 0x" +
+ utohexstr(OpcodeStart - Opcodes.begin()));
+ moveToEnd();
+ return;
+ }
if (ImmValue) {
SignExtended = MachO::BIND_OPCODE_MASK | ImmValue;
Ordinal = SignExtended;
+ LibraryOrdinalSet = true;
+ if (Ordinal < MachO::BIND_SPECIAL_DYLIB_FLAT_LOOKUP) {
+ *E = malformedError("for BIND_OPCODE_SET_DYLIB_SPECIAL_IMM unknown "
+ "special ordinal: " + Twine((int)Ordinal) + " for opcode at: "
+ "0x" + utohexstr(OpcodeStart - Opcodes.begin()));
+ moveToEnd();
+ return;
+ }
} else
Ordinal = 0;
DEBUG_WITH_TYPE(
@@ -2931,9 +3273,16 @@ void MachOBindEntry::moveNext() {
case MachO::BIND_OPCODE_SET_SYMBOL_TRAILING_FLAGS_IMM:
Flags = ImmValue;
SymStart = Ptr;
- while (*Ptr) {
+ while (*Ptr && (Ptr < Opcodes.end())) {
++Ptr;
}
+ if (Ptr == Opcodes.end()) {
+ *E = malformedError("for BIND_OPCODE_SET_SYMBOL_TRAILING_FLAGS_IMM "
+ "symbol name extends past opcodes for opcode at: 0x" +
+ utohexstr(OpcodeStart - Opcodes.begin()));
+ moveToEnd();
+ return;
+ }
SymbolName = StringRef(reinterpret_cast<const char*>(SymStart),
Ptr-SymStart);
++Ptr;
@@ -2948,15 +3297,27 @@ void MachOBindEntry::moveNext() {
break;
case MachO::BIND_OPCODE_SET_TYPE_IMM:
BindType = ImmValue;
+ if (ImmValue > MachO::BIND_TYPE_TEXT_PCREL32) {
+ *E = malformedError("for BIND_OPCODE_SET_TYPE_IMM bad bind type: " +
+ Twine((int)ImmValue) + " for opcode at: 0x" +
+ utohexstr(OpcodeStart - Opcodes.begin()));
+ moveToEnd();
+ return;
+ }
DEBUG_WITH_TYPE(
"mach-o-bind",
llvm::dbgs() << "BIND_OPCODE_SET_TYPE_IMM: "
<< "BindType=" << (int)BindType << "\n");
break;
case MachO::BIND_OPCODE_SET_ADDEND_SLEB:
- Addend = readSLEB128();
- if (TableKind == Kind::Lazy)
- Malformed = true;
+ Addend = readSLEB128(&error);
+ if (error) {
+ *E = malformedError("for BIND_OPCODE_SET_ADDEND_SLEB " +
+ Twine(error) + " for opcode at: 0x" +
+ utohexstr(OpcodeStart - Opcodes.begin()));
+ moveToEnd();
+ return;
+ }
DEBUG_WITH_TYPE(
"mach-o-bind",
llvm::dbgs() << "BIND_OPCODE_SET_ADDEND_SLEB: "
@@ -2964,7 +3325,22 @@ void MachOBindEntry::moveNext() {
break;
case MachO::BIND_OPCODE_SET_SEGMENT_AND_OFFSET_ULEB:
SegmentIndex = ImmValue;
- SegmentOffset = readULEB128();
+ SegmentOffset = readULEB128(&error);
+ if (error) {
+ *E = malformedError("for BIND_OPCODE_SET_SEGMENT_AND_OFFSET_ULEB " +
+ Twine(error) + " for opcode at: 0x" +
+ utohexstr(OpcodeStart - Opcodes.begin()));
+ moveToEnd();
+ return;
+ }
+ error = O->BindEntryCheckSegAndOffset(SegmentIndex, SegmentOffset, true);
+ if (error) {
+ *E = malformedError("for BIND_OPCODE_SET_SEGMENT_AND_OFFSET_ULEB " +
+ Twine(error) + " for opcode at: 0x" +
+ utohexstr(OpcodeStart - Opcodes.begin()));
+ moveToEnd();
+ return;
+ }
DEBUG_WITH_TYPE(
"mach-o-bind",
llvm::dbgs() << "BIND_OPCODE_SET_SEGMENT_AND_OFFSET_ULEB: "
@@ -2973,7 +3349,22 @@ void MachOBindEntry::moveNext() {
<< "\n");
break;
case MachO::BIND_OPCODE_ADD_ADDR_ULEB:
- SegmentOffset += readULEB128();
+ SegmentOffset += readULEB128(&error);
+ if (error) {
+ *E = malformedError("for BIND_OPCODE_ADD_ADDR_ULEB " +
+ Twine(error) + " for opcode at: 0x" +
+ utohexstr(OpcodeStart - Opcodes.begin()));
+ moveToEnd();
+ return;
+ }
+ error = O->BindEntryCheckSegAndOffset(SegmentIndex, SegmentOffset, true);
+ if (error) {
+ *E = malformedError("for BIND_OPCODE_ADD_ADDR_ULEB " +
+ Twine(error) + " for opcode at: 0x" +
+ utohexstr(OpcodeStart - Opcodes.begin()));
+ moveToEnd();
+ return;
+ }
DEBUG_WITH_TYPE("mach-o-bind",
llvm::dbgs() << "BIND_OPCODE_ADD_ADDR_ULEB: "
<< format("SegmentOffset=0x%06X",
@@ -2982,16 +3373,83 @@ void MachOBindEntry::moveNext() {
case MachO::BIND_OPCODE_DO_BIND:
AdvanceAmount = PointerSize;
RemainingLoopCount = 0;
+ error = O->BindEntryCheckSegAndOffset(SegmentIndex, SegmentOffset, true);
+ if (error) {
+ *E = malformedError("for BIND_OPCODE_DO_BIND " + Twine(error) +
+ " for opcode at: 0x" + utohexstr(OpcodeStart - Opcodes.begin()));
+ moveToEnd();
+ return;
+ }
+ if (SymbolName == StringRef()) {
+ *E = malformedError("for BIND_OPCODE_DO_BIND missing preceding "
+ "BIND_OPCODE_SET_SYMBOL_TRAILING_FLAGS_IMM for opcode at: 0x" +
+ utohexstr(OpcodeStart - Opcodes.begin()));
+ moveToEnd();
+ return;
+ }
+ if (!LibraryOrdinalSet && TableKind != Kind::Weak) {
+ *E = malformedError("for BIND_OPCODE_DO_BIND missing preceding "
+ "BIND_OPCODE_SET_DYLIB_ORDINAL_* for opcode at: 0x" +
+ utohexstr(OpcodeStart - Opcodes.begin()));
+ moveToEnd();
+ return;
+ }
DEBUG_WITH_TYPE("mach-o-bind",
llvm::dbgs() << "BIND_OPCODE_DO_BIND: "
<< format("SegmentOffset=0x%06X",
SegmentOffset) << "\n");
return;
case MachO::BIND_OPCODE_DO_BIND_ADD_ADDR_ULEB:
- AdvanceAmount = readULEB128() + PointerSize;
+ if (TableKind == Kind::Lazy) {
+ *E = malformedError("BIND_OPCODE_DO_BIND_ADD_ADDR_ULEB not allowed in "
+ "lazy bind table for opcode at: 0x" +
+ utohexstr(OpcodeStart - Opcodes.begin()));
+ moveToEnd();
+ return;
+ }
+ error = O->BindEntryCheckSegAndOffset(SegmentIndex, SegmentOffset, true);
+ if (error) {
+ *E = malformedError("for BIND_OPCODE_DO_BIND_ADD_ADDR_ULEB " +
+ Twine(error) + " for opcode at: 0x" +
+ utohexstr(OpcodeStart - Opcodes.begin()));
+ moveToEnd();
+ return;
+ }
+ if (SymbolName == StringRef()) {
+ *E = malformedError("for BIND_OPCODE_DO_BIND_ADD_ADDR_ULEB missing "
+ "preceding BIND_OPCODE_SET_SYMBOL_TRAILING_FLAGS_IMM for opcode "
+ "at: 0x" + utohexstr(OpcodeStart - Opcodes.begin()));
+ moveToEnd();
+ return;
+ }
+ if (!LibraryOrdinalSet && TableKind != Kind::Weak) {
+ *E = malformedError("for BIND_OPCODE_DO_BIND_ADD_ADDR_ULEB missing "
+ "preceding BIND_OPCODE_SET_DYLIB_ORDINAL_* for opcode at: 0x" +
+ utohexstr(OpcodeStart - Opcodes.begin()));
+ moveToEnd();
+ return;
+ }
+ AdvanceAmount = readULEB128(&error) + PointerSize;
+ if (error) {
+ *E = malformedError("for BIND_OPCODE_DO_BIND_ADD_ADDR_ULEB " +
+ Twine(error) + " for opcode at: 0x" +
+ utohexstr(OpcodeStart - Opcodes.begin()));
+ moveToEnd();
+ return;
+ }
+ // Note: this is not really an error until the next bind, but it makes no
+ // sense for a BIND_OPCODE_DO_BIND_ADD_ADDR_ULEB not to be followed by
+ // another bind operation.
+ error = O->BindEntryCheckSegAndOffset(SegmentIndex, SegmentOffset +
+ AdvanceAmount, false);
+ if (error) {
+ *E = malformedError("for BIND_OPCODE_ADD_ADDR_ULEB (after adding "
+ "ULEB) " + Twine(error) + " for opcode at: 0x" +
+ utohexstr(OpcodeStart - Opcodes.begin()));
+ moveToEnd();
+ return;
+ }
RemainingLoopCount = 0;
- if (TableKind == Kind::Lazy)
- Malformed = true;
DEBUG_WITH_TYPE(
"mach-o-bind",
llvm::dbgs() << "BIND_OPCODE_DO_BIND_ADD_ADDR_ULEB: "
@@ -3001,10 +3459,47 @@ void MachOBindEntry::moveNext() {
<< "\n");
return;
case MachO::BIND_OPCODE_DO_BIND_ADD_ADDR_IMM_SCALED:
+ if (TableKind == Kind::Lazy) {
+ *E = malformedError("BIND_OPCODE_DO_BIND_ADD_ADDR_IMM_SCALED not "
+ "allowed in lazy bind table for opcode at: 0x" +
+ utohexstr(OpcodeStart - Opcodes.begin()));
+ moveToEnd();
+ return;
+ }
+ error = O->BindEntryCheckSegAndOffset(SegmentIndex, SegmentOffset, true);
+ if (error) {
+ *E = malformedError("for BIND_OPCODE_DO_BIND_ADD_ADDR_IMM_SCALED " +
+ Twine(error) + " for opcode at: 0x" +
+ utohexstr(OpcodeStart - Opcodes.begin()));
+ moveToEnd();
+ return;
+ }
+ if (SymbolName == StringRef()) {
+ *E = malformedError("for BIND_OPCODE_DO_BIND_ADD_ADDR_IMM_SCALED "
+ "missing preceding BIND_OPCODE_SET_SYMBOL_TRAILING_FLAGS_IMM for "
+ "opcode at: 0x" + utohexstr(OpcodeStart - Opcodes.begin()));
+ moveToEnd();
+ return;
+ }
+ if (!LibraryOrdinalSet && TableKind != Kind::Weak) {
+ *E = malformedError("for BIND_OPCODE_DO_BIND_ADD_ADDR_IMM_SCALED "
+ "missing preceding BIND_OPCODE_SET_DYLIB_ORDINAL_* for opcode "
+ "at: 0x" + utohexstr(OpcodeStart - Opcodes.begin()));
+ moveToEnd();
+ return;
+ }
AdvanceAmount = ImmValue * PointerSize + PointerSize;
RemainingLoopCount = 0;
- if (TableKind == Kind::Lazy)
- Malformed = true;
+ error = O->BindEntryCheckSegAndOffset(SegmentIndex, SegmentOffset +
+ AdvanceAmount, false);
+ if (error) {
+ *E = malformedError("for BIND_OPCODE_DO_BIND_ADD_ADDR_IMM_SCALED "
+ " (after adding immediate times the pointer size) " +
+ Twine(error) + " for opcode at: 0x" +
+ utohexstr(OpcodeStart - Opcodes.begin()));
+ moveToEnd();
+ return;
+ }
DEBUG_WITH_TYPE("mach-o-bind",
llvm::dbgs()
<< "BIND_OPCODE_DO_BIND_ADD_ADDR_IMM_SCALED: "
@@ -3012,10 +3507,65 @@ void MachOBindEntry::moveNext() {
SegmentOffset) << "\n");
return;
case MachO::BIND_OPCODE_DO_BIND_ULEB_TIMES_SKIPPING_ULEB:
- RemainingLoopCount = readULEB128() - 1;
- AdvanceAmount = readULEB128() + PointerSize;
- if (TableKind == Kind::Lazy)
- Malformed = true;
+ if (TableKind == Kind::Lazy) {
+ *E = malformedError("BIND_OPCODE_DO_BIND_ULEB_TIMES_SKIPPING_ULEB not "
+ "allowed in lazy bind table for opcode at: 0x" +
+ utohexstr(OpcodeStart - Opcodes.begin()));
+ moveToEnd();
+ return;
+ }
+ Count = readULEB128(&error);
+ if (Count != 0)
+ RemainingLoopCount = Count - 1;
+ else
+ RemainingLoopCount = 0;
+ if (error) {
+ *E = malformedError("for BIND_OPCODE_DO_BIND_ULEB_TIMES_SKIPPING_ULEB "
+ " (count value) " + Twine(error) + " for opcode at"
+ ": 0x" + utohexstr(OpcodeStart - Opcodes.begin()));
+ moveToEnd();
+ return;
+ }
+ Skip = readULEB128(&error);
+ AdvanceAmount = Skip + PointerSize;
+ if (error) {
+ *E = malformedError("for BIND_OPCODE_DO_BIND_ULEB_TIMES_SKIPPING_ULEB "
+ " (skip value) " + Twine(error) + " for opcode at"
+ ": 0x" + utohexstr(OpcodeStart - Opcodes.begin()));
+ moveToEnd();
+ return;
+ }
+ error = O->BindEntryCheckSegAndOffset(SegmentIndex, SegmentOffset, true);
+ if (error) {
+ *E = malformedError("for BIND_OPCODE_DO_BIND_ULEB_TIMES_SKIPPING_ULEB "
+ + Twine(error) + " for opcode at: 0x" +
+ utohexstr(OpcodeStart - Opcodes.begin()));
+ moveToEnd();
+ return;
+ }
+ if (SymbolName == StringRef()) {
+ *E = malformedError("for BIND_OPCODE_DO_BIND_ULEB_TIMES_SKIPPING_ULEB "
+ "missing preceding BIND_OPCODE_SET_SYMBOL_TRAILING_FLAGS_IMM for "
+ "opcode at: 0x" + utohexstr(OpcodeStart - Opcodes.begin()));
+ moveToEnd();
+ return;
+ }
+ if (!LibraryOrdinalSet && TableKind != Kind::Weak) {
+ *E = malformedError("for BIND_OPCODE_DO_BIND_ULEB_TIMES_SKIPPING_ULEB "
+ "missing preceding BIND_OPCODE_SET_DYLIB_ORDINAL_* for opcode "
+ "at: 0x" + utohexstr(OpcodeStart - Opcodes.begin()));
+ moveToEnd();
+ return;
+ }
+ error = O->BindEntryCheckCountAndSkip(Count, Skip, PointerSize,
+ SegmentIndex, SegmentOffset);
+ if (error) {
+ *E = malformedError("for BIND_OPCODE_DO_BIND_ULEB_TIMES_SKIPPING_ULEB "
+ + Twine(error) + " for opcode at: 0x" +
+ utohexstr(OpcodeStart - Opcodes.begin()));
+ moveToEnd();
+ return;
+ }
DEBUG_WITH_TYPE(
"mach-o-bind",
llvm::dbgs() << "BIND_OPCODE_DO_BIND_ULEB_TIMES_SKIPPING_ULEB: "
@@ -3025,34 +3575,34 @@ void MachOBindEntry::moveNext() {
<< "\n");
return;
default:
- Malformed = true;
+ *E = malformedError("bad bind info (bad opcode value 0x" +
+ utohexstr(Opcode) + " for opcode at: 0x" +
+ utohexstr(OpcodeStart - Opcodes.begin()));
+ moveToEnd();
+ return;
}
}
}
-uint64_t MachOBindEntry::readULEB128() {
+uint64_t MachOBindEntry::readULEB128(const char **error) {
unsigned Count;
- uint64_t Result = decodeULEB128(Ptr, &Count);
+ uint64_t Result = decodeULEB128(Ptr, &Count, Opcodes.end(), error);
Ptr += Count;
- if (Ptr > Opcodes.end()) {
+ if (Ptr > Opcodes.end())
Ptr = Opcodes.end();
- Malformed = true;
- }
return Result;
}
-int64_t MachOBindEntry::readSLEB128() {
+int64_t MachOBindEntry::readSLEB128(const char **error) {
unsigned Count;
- int64_t Result = decodeSLEB128(Ptr, &Count);
+ int64_t Result = decodeSLEB128(Ptr, &Count, Opcodes.end(), error);
Ptr += Count;
- if (Ptr > Opcodes.end()) {
+ if (Ptr > Opcodes.end())
Ptr = Opcodes.end();
- Malformed = true;
- }
return Result;
}
-uint32_t MachOBindEntry::segmentIndex() const { return SegmentIndex; }
+int32_t MachOBindEntry::segmentIndex() const { return SegmentIndex; }
uint64_t MachOBindEntry::segmentOffset() const { return SegmentOffset; }
@@ -3076,6 +3626,24 @@ uint32_t MachOBindEntry::flags() const { return Flags; }
int MachOBindEntry::ordinal() const { return Ordinal; }
+// For use with the SegIndex of a checked Mach-O Bind entry
+// to get the segment name.
+StringRef MachOBindEntry::segmentName() const {
+ return O->BindRebaseSegmentName(SegmentIndex);
+}
+
+// For use with a SegIndex,SegOffset pair from a checked Mach-O Bind entry
+// to get the section name.
+StringRef MachOBindEntry::sectionName() const {
+ return O->BindRebaseSectionName(SegmentIndex, SegmentOffset);
+}
+
+// For use with a SegIndex,SegOffset pair from a checked Mach-O Bind entry
+// to get the address.
+uint64_t MachOBindEntry::address() const {
+ return O->BindRebaseAddress(SegmentIndex, SegmentOffset);
+}
+
bool MachOBindEntry::operator==(const MachOBindEntry &Other) const {
#ifdef EXPENSIVE_CHECKS
assert(Opcodes == Other.Opcodes && "compare iterators of different files");
@@ -3087,30 +3655,149 @@ bool MachOBindEntry::operator==(const MachOBindEntry &Other) const {
(Done == Other.Done);
}
+// Build table of sections so SegIndex/SegOffset pairs can be translated.
+BindRebaseSegInfo::BindRebaseSegInfo(const object::MachOObjectFile *Obj) {
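+ // Sections are visited in segment order, so a change in the final segment
+ // name marks the start of a new segment. Starting CurSegIndex at 1 when a
+ // __PAGEZERO segment exists leaves segment index 0 for it, since it has no
+ // sections of its own.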
+ uint32_t CurSegIndex = Obj->hasPageZeroSegment() ? 1 : 0;
+ StringRef CurSegName;
+ uint64_t CurSegAddress;
+ for (const SectionRef &Section : Obj->sections()) {
+ SectionInfo Info;
+ Section.getName(Info.SectionName);
+ Info.Address = Section.getAddress();
+ Info.Size = Section.getSize();
+ Info.SegmentName =
+ Obj->getSectionFinalSegmentName(Section.getRawDataRefImpl());
+ if (!Info.SegmentName.equals(CurSegName)) {
+ ++CurSegIndex;
+ CurSegName = Info.SegmentName;
+ CurSegAddress = Info.Address;
+ }
+ Info.SegmentIndex = CurSegIndex - 1;
+ Info.OffsetInSegment = Info.Address - CurSegAddress;
+ Info.SegmentStartAddress = CurSegAddress;
+ Sections.push_back(Info);
+ }
+ MaxSegIndex = CurSegIndex;
+}
+
+// For use with a SegIndex,SegOffset pair in MachOBindEntry::moveNext() to
+// validate a MachOBindEntry or MachORebaseEntry.
+const char *BindRebaseSegInfo::checkSegAndOffset(int32_t SegIndex,
+ uint64_t SegOffset,
+ bool endInvalid) {
+ if (SegIndex == -1)
+ return "missing preceding *_OPCODE_SET_SEGMENT_AND_OFFSET_ULEB";
+ if (SegIndex >= MaxSegIndex)
+ return "bad segIndex (too large)";
+ for (const SectionInfo &SI : Sections) {
+ if (SI.SegmentIndex != SegIndex)
+ continue;
+ if (SI.OffsetInSegment > SegOffset)
+ continue;
+ if (SegOffset > (SI.OffsetInSegment + SI.Size))
+ continue;
+ if (endInvalid && SegOffset >= (SI.OffsetInSegment + SI.Size))
+ continue;
+ return nullptr;
+ }
+ return "bad segOffset, too large";
+}
+
+// For use in MachOBindEntry::moveNext() to validate a MachOBindEntry for
+// the BIND_OPCODE_DO_BIND_ULEB_TIMES_SKIPPING_ULEB opcode and for use in
+// MachORebaseEntry::moveNext() to validate a MachORebaseEntry for
+// REBASE_OPCODE_DO_*_TIMES* opcodes. The SegIndex and SegOffset must have
+// been already checked.
+const char *BindRebaseSegInfo::checkCountAndSkip(uint32_t Count, uint32_t Skip,
+ uint8_t PointerSize,
+ int32_t SegIndex,
+ uint64_t SegOffset) {
+ const SectionInfo &SI = findSection(SegIndex, SegOffset);
+ uint64_t addr = SI.SegmentStartAddress + SegOffset;
+ if (addr >= SI.Address + SI.Size)
+ return "bad segOffset, too large";
+ uint64_t i = 0;
+ if (Count > 1)
+ i = (Skip + PointerSize) * (Count - 1);
+ else if (Count == 1)
+ i = Skip + PointerSize;
+ if (addr + i >= SI.Address + SI.Size) {
+ // Rebase opcodes may legitimately step from one section into another.
+ uint64_t TrailingSegOffset = (addr + i) - SI.SegmentStartAddress;
+ const char *error = checkSegAndOffset(SegIndex, TrailingSegOffset, false);
+ if (error)
+ return "bad count and skip, too large";
+ }
+ return nullptr;
+}
+
+// For use with the SegIndex of a checked Mach-O Bind or Rebase entry
+// to get the segment name.
+StringRef BindRebaseSegInfo::segmentName(int32_t SegIndex) {
+ for (const SectionInfo &SI : Sections) {
+ if (SI.SegmentIndex == SegIndex)
+ return SI.SegmentName;
+ }
+ llvm_unreachable("invalid SegIndex");
+}
+
+// For use with a SegIndex,SegOffset pair from a checked Mach-O Bind or Rebase
+// to get the SectionInfo.
+const BindRebaseSegInfo::SectionInfo &BindRebaseSegInfo::findSection(
+ int32_t SegIndex, uint64_t SegOffset) {
+ for (const SectionInfo &SI : Sections) {
+ if (SI.SegmentIndex != SegIndex)
+ continue;
+ if (SI.OffsetInSegment > SegOffset)
+ continue;
+ if (SegOffset >= (SI.OffsetInSegment + SI.Size))
+ continue;
+ return SI;
+ }
+ llvm_unreachable("SegIndex and SegOffset not in any section");
+}
+
+// For use with a SegIndex,SegOffset pair from a checked Mach-O Bind or Rebase
+// entry to get the section name.
+StringRef BindRebaseSegInfo::sectionName(int32_t SegIndex,
+ uint64_t SegOffset) {
+ return findSection(SegIndex, SegOffset).SectionName;
+}
+
+// For use with a SegIndex,SegOffset pair from a checked Mach-O Bind or Rebase
+// entry to get the address.
+uint64_t BindRebaseSegInfo::address(uint32_t SegIndex, uint64_t OffsetInSeg) {
+ const SectionInfo &SI = findSection(SegIndex, OffsetInSeg);
+ return SI.SegmentStartAddress + OffsetInSeg;
+}
+
iterator_range<bind_iterator>
-MachOObjectFile::bindTable(ArrayRef<uint8_t> Opcodes, bool is64,
+MachOObjectFile::bindTable(Error &Err, MachOObjectFile *O,
+ ArrayRef<uint8_t> Opcodes, bool is64,
MachOBindEntry::Kind BKind) {
- MachOBindEntry Start(Opcodes, is64, BKind);
+ if (O->BindRebaseSectionTable == nullptr)
+ O->BindRebaseSectionTable = llvm::make_unique<BindRebaseSegInfo>(O);
+ MachOBindEntry Start(&Err, O, Opcodes, is64, BKind);
Start.moveToFirst();
- MachOBindEntry Finish(Opcodes, is64, BKind);
+ MachOBindEntry Finish(&Err, O, Opcodes, is64, BKind);
Finish.moveToEnd();
return make_range(bind_iterator(Start), bind_iterator(Finish));
}
-iterator_range<bind_iterator> MachOObjectFile::bindTable() const {
- return bindTable(getDyldInfoBindOpcodes(), is64Bit(),
+iterator_range<bind_iterator> MachOObjectFile::bindTable(Error &Err) {
+ return bindTable(Err, this, getDyldInfoBindOpcodes(), is64Bit(),
MachOBindEntry::Kind::Regular);
}
-iterator_range<bind_iterator> MachOObjectFile::lazyBindTable() const {
- return bindTable(getDyldInfoLazyBindOpcodes(), is64Bit(),
+iterator_range<bind_iterator> MachOObjectFile::lazyBindTable(Error &Err) {
+ return bindTable(Err, this, getDyldInfoLazyBindOpcodes(), is64Bit(),
MachOBindEntry::Kind::Lazy);
}
-iterator_range<bind_iterator> MachOObjectFile::weakBindTable() const {
- return bindTable(getDyldInfoWeakBindOpcodes(), is64Bit(),
+iterator_range<bind_iterator> MachOObjectFile::weakBindTable(Error &Err) {
+ return bindTable(Err, this, getDyldInfoWeakBindOpcodes(), is64Bit(),
MachOBindEntry::Kind::Weak);
}
@@ -3289,6 +3976,21 @@ MachOObjectFile::getVersionMinLoadCommand(const LoadCommandInfo &L) const {
return getStruct<MachO::version_min_command>(*this, L.Ptr);
}
+MachO::note_command
+MachOObjectFile::getNoteLoadCommand(const LoadCommandInfo &L) const {
+ return getStruct<MachO::note_command>(*this, L.Ptr);
+}
+
+MachO::build_version_command
+MachOObjectFile::getBuildVersionLoadCommand(const LoadCommandInfo &L) const {
+ return getStruct<MachO::build_version_command>(*this, L.Ptr);
+}
+
+MachO::build_tool_version
+MachOObjectFile::getBuildToolVersion(unsigned index) const {
+ return getStruct<MachO::build_tool_version>(*this, BuildTools[index]);
+}
+
MachO::dylib_command
MachOObjectFile::getDylibIDLoadCommand(const LoadCommandInfo &L) const {
return getStruct<MachO::dylib_command>(*this, L.Ptr);
diff --git a/lib/Object/ModuleSummaryIndexObjectFile.cpp b/lib/Object/ModuleSummaryIndexObjectFile.cpp
index 11ace84b9ceb..de1ddab88fd4 100644
--- a/lib/Object/ModuleSummaryIndexObjectFile.cpp
+++ b/lib/Object/ModuleSummaryIndexObjectFile.cpp
@@ -96,13 +96,18 @@ ModuleSummaryIndexObjectFile::create(MemoryBufferRef Object) {
// Parse the module summary index out of an IR file and return the summary
// index object if found, or nullptr if not.
Expected<std::unique_ptr<ModuleSummaryIndex>>
-llvm::getModuleSummaryIndexForFile(StringRef Path) {
+llvm::getModuleSummaryIndexForFile(StringRef Path, StringRef Identifier) {
ErrorOr<std::unique_ptr<MemoryBuffer>> FileOrErr =
MemoryBuffer::getFileOrSTDIN(Path);
std::error_code EC = FileOrErr.getError();
if (EC)
return errorCodeToError(EC);
- MemoryBufferRef BufferRef = (FileOrErr.get())->getMemBufferRef();
+ std::unique_ptr<MemoryBuffer> MemBuffer = std::move(FileOrErr.get());
+ // If Identifier is non-empty, use it as the buffer identifier, which
+ // will become the module path in the index.
+ if (Identifier.empty())
+ Identifier = MemBuffer->getBufferIdentifier();
+ MemoryBufferRef BufferRef(MemBuffer->getBuffer(), Identifier);
if (IgnoreEmptyThinLTOIndexFile && !BufferRef.getBufferSize())
return nullptr;
Expected<std::unique_ptr<object::ModuleSummaryIndexObjectFile>> ObjOrErr =
diff --git a/lib/Object/ModuleSymbolTable.cpp b/lib/Object/ModuleSymbolTable.cpp
index 90488007ff59..9a935d8e0869 100644
--- a/lib/Object/ModuleSymbolTable.cpp
+++ b/lib/Object/ModuleSymbolTable.cpp
@@ -43,27 +43,98 @@ void ModuleSymbolTable::addModule(Module *M) {
else
FirstMod = M;
- for (Function &F : *M)
- SymTab.push_back(&F);
- for (GlobalVariable &GV : M->globals())
+ for (GlobalValue &GV : M->global_values())
SymTab.push_back(&GV);
- for (GlobalAlias &GA : M->aliases())
- SymTab.push_back(&GA);
-
- CollectAsmSymbols(Triple(M->getTargetTriple()), M->getModuleInlineAsm(),
- [this](StringRef Name, BasicSymbolRef::Flags Flags) {
- SymTab.push_back(new (AsmSymbols.Allocate())
- AsmSymbol(Name, Flags));
- });
+
+ CollectAsmSymbols(*M, [this](StringRef Name, BasicSymbolRef::Flags Flags) {
+ SymTab.push_back(new (AsmSymbols.Allocate()) AsmSymbol(Name, Flags));
+ });
+}
+
+// Ensure ELF .symver aliases get the same binding as the defined symbol
+// they alias with.
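+//
+// For example, inline asm such as
+//   .symver foo_impl, foo@@VERS_1.0
+// records foo@@VERS_1.0 as an alias of foo_impl; the alias must end up with
+// the same global/weak/local binding as foo_impl, whether that binding was
+// declared in the asm or only in the IR.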
+static void handleSymverAliases(const Module &M, RecordStreamer &Streamer) {
+ if (Streamer.symverAliases().empty())
+ return;
+
+ // The name in the assembler will be mangled, but the name in the IR
+ // might not, so we first compute a mapping from mangled name to GV.
+ Mangler Mang;
+ SmallString<64> MangledName;
+ StringMap<const GlobalValue *> MangledNameMap;
+ auto GetMangledName = [&](const GlobalValue &GV) {
+ if (!GV.hasName())
+ return;
+
+ MangledName.clear();
+ MangledName.reserve(GV.getName().size() + 1);
+ Mang.getNameWithPrefix(MangledName, &GV, /*CannotUsePrivateLabel=*/false);
+ MangledNameMap[MangledName] = &GV;
+ };
+ for (const Function &F : M)
+ GetMangledName(F);
+ for (const GlobalVariable &GV : M.globals())
+ GetMangledName(GV);
+ for (const GlobalAlias &GA : M.aliases())
+ GetMangledName(GA);
+
+ // Walk all the recorded .symver aliases, and set up the binding
+ // for each alias.
+ for (auto &Symver : Streamer.symverAliases()) {
+ const MCSymbol *Aliasee = Symver.first;
+ MCSymbolAttr Attr = MCSA_Invalid;
+
+ // First check if the aliasee binding was recorded in the asm.
+ RecordStreamer::State state = Streamer.getSymbolState(Aliasee);
+ switch (state) {
+ case RecordStreamer::Global:
+ case RecordStreamer::DefinedGlobal:
+ Attr = MCSA_Global;
+ break;
+ case RecordStreamer::UndefinedWeak:
+ case RecordStreamer::DefinedWeak:
+ Attr = MCSA_Weak;
+ break;
+ default:
+ break;
+ }
+
+ // If we don't have a symbol attribute from assembly, then check if
+ // the aliasee was defined in the IR.
+ if (Attr == MCSA_Invalid) {
+ const auto *GV = M.getNamedValue(Aliasee->getName());
+ if (!GV) {
+ auto MI = MangledNameMap.find(Aliasee->getName());
+ if (MI != MangledNameMap.end())
+ GV = MI->second;
+ else
+ continue;
+ }
+ if (GV->hasExternalLinkage())
+ Attr = MCSA_Global;
+ else if (GV->hasLocalLinkage())
+ Attr = MCSA_Local;
+ else if (GV->isWeakForLinker())
+ Attr = MCSA_Weak;
+ }
+ if (Attr == MCSA_Invalid)
+ continue;
+
+ // Set the detected binding on each alias with this aliasee.
+ for (auto &Alias : Symver.second)
+ Streamer.EmitSymbolAttribute(Alias, Attr);
+ }
}
void ModuleSymbolTable::CollectAsmSymbols(
- const Triple &TT, StringRef InlineAsm,
+ const Module &M,
function_ref<void(StringRef, BasicSymbolRef::Flags)> AsmSymbol) {
+ StringRef InlineAsm = M.getModuleInlineAsm();
if (InlineAsm.empty())
return;
std::string Err;
+ const Triple TT(M.getTargetTriple());
const Target *T = TargetRegistry::lookupTarget(TT.str(), Err);
assert(T && T->hasMCAsmParser());
@@ -106,6 +177,8 @@ void ModuleSymbolTable::CollectAsmSymbols(
if (Parser->Run(false))
return;
+ handleSymverAliases(M, Streamer);
+
for (auto &KV : Streamer) {
StringRef Key = KV.first();
RecordStreamer::State Value = KV.second;
diff --git a/lib/Object/RecordStreamer.cpp b/lib/Object/RecordStreamer.cpp
index 572b960bc85f..c9c27451f809 100644
--- a/lib/Object/RecordStreamer.cpp
+++ b/lib/Object/RecordStreamer.cpp
@@ -1,4 +1,4 @@
-//===-- RecordStreamer.cpp - Record asm definde and used symbols ----------===//
+//===-- RecordStreamer.cpp - Record asm defined and used symbols ----------===//
//
// The LLVM Compiler Infrastructure
//
@@ -78,11 +78,11 @@ RecordStreamer::const_iterator RecordStreamer::end() { return Symbols.end(); }
RecordStreamer::RecordStreamer(MCContext &Context) : MCStreamer(Context) {}
void RecordStreamer::EmitInstruction(const MCInst &Inst,
- const MCSubtargetInfo &STI) {
+ const MCSubtargetInfo &STI, bool) {
MCStreamer::EmitInstruction(Inst, STI);
}
-void RecordStreamer::EmitLabel(MCSymbol *Symbol) {
+void RecordStreamer::EmitLabel(MCSymbol *Symbol, SMLoc Loc) {
MCStreamer::EmitLabel(Symbol);
markDefined(*Symbol);
}
@@ -110,3 +110,8 @@ void RecordStreamer::EmitCommonSymbol(MCSymbol *Symbol, uint64_t Size,
unsigned ByteAlignment) {
markDefined(*Symbol);
}
+
+void RecordStreamer::emitELFSymverDirective(MCSymbol *Alias,
+ const MCSymbol *Aliasee) {
+ SymverAliasMap[Aliasee].push_back(Alias);
+}
diff --git a/lib/Object/RecordStreamer.h b/lib/Object/RecordStreamer.h
index 617d8a43fbd2..a845ecd786a8 100644
--- a/lib/Object/RecordStreamer.h
+++ b/lib/Object/RecordStreamer.h
@@ -20,6 +20,10 @@ public:
private:
StringMap<State> Symbols;
+ // Map of aliases created by .symver directives, saved so we can update
+ // their symbol binding after parsing completes. This maps from each
+ // aliasee to its list of aliases.
+ DenseMap<const MCSymbol *, std::vector<MCSymbol *>> SymverAliasMap;
void markDefined(const MCSymbol &Symbol);
void markGlobal(const MCSymbol &Symbol, MCSymbolAttr Attribute);
void markUsed(const MCSymbol &Symbol);
@@ -30,14 +34,29 @@ public:
const_iterator begin();
const_iterator end();
RecordStreamer(MCContext &Context);
- void EmitInstruction(const MCInst &Inst, const MCSubtargetInfo &STI) override;
- void EmitLabel(MCSymbol *Symbol) override;
+ void EmitInstruction(const MCInst &Inst, const MCSubtargetInfo &STI,
+ bool) override;
+ void EmitLabel(MCSymbol *Symbol, SMLoc Loc = SMLoc()) override;
void EmitAssignment(MCSymbol *Symbol, const MCExpr *Value) override;
bool EmitSymbolAttribute(MCSymbol *Symbol, MCSymbolAttr Attribute) override;
void EmitZerofill(MCSection *Section, MCSymbol *Symbol, uint64_t Size,
unsigned ByteAlignment) override;
void EmitCommonSymbol(MCSymbol *Symbol, uint64_t Size,
unsigned ByteAlignment) override;
+ /// Record .symver aliases for later processing.
+ void emitELFSymverDirective(MCSymbol *Alias,
+ const MCSymbol *Aliasee) override;
+ /// Return the map of .symver aliasee to associated aliases.
+ DenseMap<const MCSymbol *, std::vector<MCSymbol *>> &symverAliases() {
+ return SymverAliasMap;
+ }
+ /// Get the state recorded for the given symbol.
+ State getSymbolState(const MCSymbol *Sym) {
+ auto SI = Symbols.find(Sym->getName());
+ if (SI == Symbols.end())
+ return NeverSeen;
+ return SI->second;
+ }
};
}
#endif
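
Taken together, the streamer records each aliasee's aliases while the asm is parsed, and the binding is propagated afterwards. A standalone sketch of that flow, with an illustrative Binding enum standing in for MCSymbolAttr (the names are not LLVM's API):

    #include <iostream>
    #include <map>
    #include <string>
    #include <vector>

    enum class Binding { Unknown, Global, Weak, Local };

    int main() {
      // What the streamer would record while parsing inline asm.
      std::map<std::string, std::vector<std::string>> SymverAliases = {
          {"foo", {"foo@VER1", "foo@@VER2"}}};
      std::map<std::string, Binding> SymbolState = {{"foo", Binding::Global}};

      for (const auto &KV : SymverAliases) {
        Binding B = Binding::Unknown;
        auto It = SymbolState.find(KV.first);
        if (It != SymbolState.end())
          B = It->second; // binding seen in the asm itself
        if (B == Binding::Unknown)
          continue; // the patch falls back to the IR definition here
        for (const auto &Alias : KV.second)
          std::cout << Alias << " gets the binding of " << KV.first << "\n";
      }
    }
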
diff --git a/lib/Object/WasmObjectFile.cpp b/lib/Object/WasmObjectFile.cpp
index 2b61a8a034f6..fc1dca35424e 100644
--- a/lib/Object/WasmObjectFile.cpp
+++ b/lib/Object/WasmObjectFile.cpp
@@ -1,4 +1,4 @@
-//===- WasmObjectFile.cpp - Wasm object file implementation -----*- C++ -*-===//
+//===- WasmObjectFile.cpp - Wasm object file implementation ---------------===//
//
// The LLVM Compiler Infrastructure
//
@@ -7,12 +7,26 @@
//
//===----------------------------------------------------------------------===//
+#include "llvm/ADT/ArrayRef.h"
+#include "llvm/ADT/STLExtras.h"
+#include "llvm/ADT/StringRef.h"
+#include "llvm/ADT/Triple.h"
+#include "llvm/Object/Binary.h"
+#include "llvm/Object/Error.h"
+#include "llvm/Object/ObjectFile.h"
+#include "llvm/Object/SymbolicFile.h"
#include "llvm/Object/Wasm.h"
#include "llvm/Support/Endian.h"
+#include "llvm/Support/Error.h"
+#include "llvm/Support/ErrorHandling.h"
#include "llvm/Support/LEB128.h"
+#include "llvm/Support/Wasm.h"
+#include <algorithm>
+#include <cstdint>
+#include <system_error>
-namespace llvm {
-namespace object {
+using namespace llvm;
+using namespace object;
Expected<std::unique_ptr<WasmObjectFile>>
ObjectFile::createWasmObjectFile(MemoryBufferRef Buffer) {
@@ -24,34 +38,139 @@ ObjectFile::createWasmObjectFile(MemoryBufferRef Buffer) {
return std::move(ObjectFile);
}
-namespace {
+#define VARINT7_MAX ((1<<7)-1)
+#define VARINT7_MIN (-(1<<7))
+#define VARUINT7_MAX (1<<7)
+#define VARUINT1_MAX (1)
-uint32_t readUint32(const uint8_t *&Ptr) {
+static uint8_t readUint8(const uint8_t *&Ptr) { return *Ptr++; }
+
+static uint32_t readUint32(const uint8_t *&Ptr) {
uint32_t Result = support::endian::read32le(Ptr);
Ptr += sizeof(Result);
return Result;
}
-uint64_t readULEB128(const uint8_t *&Ptr) {
+static int32_t readFloat32(const uint8_t *&Ptr) {
+ int32_t Result = 0;
+ memcpy(&Result, Ptr, sizeof(Result));
+ Ptr += sizeof(Result);
+ return Result;
+}
+
+static int64_t readFloat64(const uint8_t *&Ptr) {
+ int64_t Result = 0;
+ memcpy(&Result, Ptr, sizeof(Result));
+ Ptr += sizeof(Result);
+ return Result;
+}
+
+static uint64_t readULEB128(const uint8_t *&Ptr) {
unsigned Count;
uint64_t Result = decodeULEB128(Ptr, &Count);
Ptr += Count;
return Result;
}
-StringRef readString(const uint8_t *&Ptr) {
+static StringRef readString(const uint8_t *&Ptr) {
uint32_t StringLen = readULEB128(Ptr);
StringRef Return = StringRef(reinterpret_cast<const char *>(Ptr), StringLen);
Ptr += StringLen;
return Return;
}
-Error readSection(wasm::WasmSection &Section, const uint8_t *&Ptr,
- const uint8_t *Start) {
+static int64_t readLEB128(const uint8_t *&Ptr) {
+ unsigned Count;
+ uint64_t Result = decodeSLEB128(Ptr, &Count);
+ Ptr += Count;
+ return Result;
+}
+
+static uint8_t readVaruint1(const uint8_t *&Ptr) {
+ int64_t result = readLEB128(Ptr);
+ assert(result <= VARUINT1_MAX && result >= 0);
+ return result;
+}
+
+static int8_t readVarint7(const uint8_t *&Ptr) {
+ int64_t result = readLEB128(Ptr);
+ assert(result <= VARINT7_MAX && result >= VARINT7_MIN);
+ return result;
+}
+
+static uint8_t readVaruint7(const uint8_t *&Ptr) {
+ uint64_t result = readULEB128(Ptr);
+ assert(result <= VARUINT7_MAX);
+ return result;
+}
+
+static int32_t readVarint32(const uint8_t *&Ptr) {
+ int64_t result = readLEB128(Ptr);
+ assert(result <= INT32_MAX && result >= INT32_MIN);
+ return result;
+}
+
+static uint32_t readVaruint32(const uint8_t *&Ptr) {
+ uint64_t result = readULEB128(Ptr);
+ assert(result <= UINT32_MAX);
+ return result;
+}
+
+static int64_t readVarint64(const uint8_t *&Ptr) {
+ return readLEB128(Ptr);
+}
+
+static uint8_t readOpcode(const uint8_t *&Ptr) {
+ return readUint8(Ptr);
+}
+
+static Error readInitExpr(wasm::WasmInitExpr &Expr, const uint8_t *&Ptr) {
+ Expr.Opcode = readOpcode(Ptr);
+
+ switch (Expr.Opcode) {
+ case wasm::WASM_OPCODE_I32_CONST:
+ Expr.Value.Int32 = readVarint32(Ptr);
+ break;
+ case wasm::WASM_OPCODE_I64_CONST:
+ Expr.Value.Int64 = readVarint64(Ptr);
+ break;
+ case wasm::WASM_OPCODE_F32_CONST:
+ Expr.Value.Float32 = readFloat32(Ptr);
+ break;
+ case wasm::WASM_OPCODE_F64_CONST:
+ Expr.Value.Float64 = readFloat64(Ptr);
+ break;
+ case wasm::WASM_OPCODE_GET_GLOBAL:
+ Expr.Value.Global = readUint32(Ptr);
+ break;
+ default:
+ return make_error<GenericBinaryError>("Invalid opcode in init_expr",
+ object_error::parse_failed);
+ }
+
+ uint8_t EndOpcode = readOpcode(Ptr);
+ if (EndOpcode != wasm::WASM_OPCODE_END) {
+ return make_error<GenericBinaryError>("Invalid init_expr",
+ object_error::parse_failed);
+ }
+ return Error::success();
+}
+
+static wasm::WasmLimits readLimits(const uint8_t *&Ptr) {
+ wasm::WasmLimits Result;
+ Result.Flags = readVaruint1(Ptr);
+ Result.Initial = readVaruint32(Ptr);
+ if (Result.Flags & wasm::WASM_LIMITS_FLAG_HAS_MAX)
+ Result.Maximum = readVaruint32(Ptr);
+ return Result;
+}
+
+static Error readSection(WasmSection &Section, const uint8_t *&Ptr,
+ const uint8_t *Start) {
// TODO(sbc): Avoid reading past EOF in the case of malformed files.
Section.Offset = Ptr - Start;
- Section.Type = readULEB128(Ptr);
- uint32_t Size = readULEB128(Ptr);
+ Section.Type = readVaruint7(Ptr);
+ uint32_t Size = readVaruint32(Ptr);
if (Size == 0)
return make_error<StringError>("Zero length section",
object_error::parse_failed);
@@ -59,10 +178,9 @@ Error readSection(wasm::WasmSection &Section, const uint8_t *&Ptr,
Ptr += Size;
return Error::success();
}
-}
WasmObjectFile::WasmObjectFile(MemoryBufferRef Buffer, Error &Err)
- : ObjectFile(Binary::ID_Wasm, Buffer) {
+ : ObjectFile(Binary::ID_Wasm, Buffer), StartFunction(-1) {
ErrorAsOutParameter ErrAsOutParam(&Err);
Header.Magic = getData().substr(0, 4);
if (Header.Magic != StringRef("\0asm", 4)) {
@@ -79,21 +197,388 @@ WasmObjectFile::WasmObjectFile(MemoryBufferRef Buffer, Error &Err)
}
const uint8_t *Eof = getPtr(getData().size());
- wasm::WasmSection Sec;
+ WasmSection Sec;
while (Ptr < Eof) {
if ((Err = readSection(Sec, Ptr, getPtr(0))))
return;
- if (Sec.Type == wasm::WASM_SEC_USER) {
- if ((Err = parseUserSection(Sec, Sec.Content.data(), Sec.Content.size())))
- return;
- }
+ if ((Err = parseSection(Sec)))
+ return;
+
Sections.push_back(Sec);
}
}
-Error WasmObjectFile::parseUserSection(wasm::WasmSection &Sec,
- const uint8_t *Ptr, size_t Length) {
+Error WasmObjectFile::parseSection(WasmSection &Sec) {
+ const uint8_t* Start = Sec.Content.data();
+ const uint8_t* End = Start + Sec.Content.size();
+ switch (Sec.Type) {
+ case wasm::WASM_SEC_CUSTOM:
+ return parseCustomSection(Sec, Start, End);
+ case wasm::WASM_SEC_TYPE:
+ return parseTypeSection(Start, End);
+ case wasm::WASM_SEC_IMPORT:
+ return parseImportSection(Start, End);
+ case wasm::WASM_SEC_FUNCTION:
+ return parseFunctionSection(Start, End);
+ case wasm::WASM_SEC_TABLE:
+ return parseTableSection(Start, End);
+ case wasm::WASM_SEC_MEMORY:
+ return parseMemorySection(Start, End);
+ case wasm::WASM_SEC_GLOBAL:
+ return parseGlobalSection(Start, End);
+ case wasm::WASM_SEC_EXPORT:
+ return parseExportSection(Start, End);
+ case wasm::WASM_SEC_START:
+ return parseStartSection(Start, End);
+ case wasm::WASM_SEC_ELEM:
+ return parseElemSection(Start, End);
+ case wasm::WASM_SEC_CODE:
+ return parseCodeSection(Start, End);
+ case wasm::WASM_SEC_DATA:
+ return parseDataSection(Start, End);
+ default:
+ return make_error<GenericBinaryError>("Bad section type",
+ object_error::parse_failed);
+ }
+}
+
+Error WasmObjectFile::parseNameSection(const uint8_t *Ptr, const uint8_t *End) {
+ while (Ptr < End) {
+ uint8_t Type = readVarint7(Ptr);
+ uint32_t Size = readVaruint32(Ptr);
+ switch (Type) {
+ case wasm::WASM_NAMES_FUNCTION: {
+ uint32_t Count = readVaruint32(Ptr);
+ while (Count--) {
+ /*uint32_t Index =*/readVaruint32(Ptr);
+ StringRef Name = readString(Ptr);
+ if (Name.size())
+ Symbols.emplace_back(Name,
+ WasmSymbol::SymbolType::DEBUG_FUNCTION_NAME);
+ }
+ break;
+ }
+ // Ignore local names for now
+ case wasm::WASM_NAMES_LOCAL:
+ default:
+ Ptr += Size;
+ break;
+ }
+ }
+
+ if (Ptr != End)
+ return make_error<GenericBinaryError>("Name section ended prematurely",
+ object_error::parse_failed);
+ return Error::success();
+}
+
+WasmSection* WasmObjectFile::findCustomSectionByName(StringRef Name) {
+ for (WasmSection& Section : Sections) {
+ if (Section.Type == wasm::WASM_SEC_CUSTOM && Section.Name == Name)
+ return &Section;
+ }
+ return nullptr;
+}
+
+WasmSection* WasmObjectFile::findSectionByType(uint32_t Type) {
+ assert(Type != wasm::WASM_SEC_CUSTOM);
+ for (WasmSection& Section : Sections) {
+ if (Section.Type == Type)
+ return &Section;
+ }
+ return nullptr;
+}
+
+Error WasmObjectFile::parseRelocSection(StringRef Name, const uint8_t *Ptr,
+ const uint8_t *End) {
+ uint8_t SectionCode = readVarint7(Ptr);
+ WasmSection* Section = nullptr;
+ if (SectionCode == wasm::WASM_SEC_CUSTOM) {
+ StringRef Name = readString(Ptr);
+ Section = findCustomSectionByName(Name);
+ } else {
+ Section = findSectionByType(SectionCode);
+ }
+ if (!Section)
+ return make_error<GenericBinaryError>("Invalid section code",
+ object_error::parse_failed);
+ uint32_t RelocCount = readVaruint32(Ptr);
+ while (RelocCount--) {
+ wasm::WasmRelocation Reloc;
+ memset(&Reloc, 0, sizeof(Reloc));
+ Reloc.Type = readVaruint32(Ptr);
+ Reloc.Offset = readVaruint32(Ptr);
+ Reloc.Index = readVaruint32(Ptr);
+ switch (Reloc.Type) {
+ case wasm::R_WEBASSEMBLY_FUNCTION_INDEX_LEB:
+ case wasm::R_WEBASSEMBLY_TABLE_INDEX_SLEB:
+ case wasm::R_WEBASSEMBLY_TABLE_INDEX_I32:
+ break;
+ case wasm::R_WEBASSEMBLY_GLOBAL_ADDR_LEB:
+ case wasm::R_WEBASSEMBLY_GLOBAL_ADDR_SLEB:
+ case wasm::R_WEBASSEMBLY_GLOBAL_ADDR_I32:
+ Reloc.Addend = readVaruint32(Ptr);
+ break;
+ default:
+ return make_error<GenericBinaryError>("Bad relocation type",
+ object_error::parse_failed);
+ }
+ Section->Relocations.push_back(Reloc);
+ }
+ if (Ptr != End)
+ return make_error<GenericBinaryError>("Reloc section ended prematurely",
+ object_error::parse_failed);
+ return Error::success();
+}
+
+Error WasmObjectFile::parseCustomSection(WasmSection &Sec,
+ const uint8_t *Ptr, const uint8_t *End) {
Sec.Name = readString(Ptr);
+ if (Sec.Name == "name") {
+ if (Error Err = parseNameSection(Ptr, End))
+ return Err;
+ } else if (Sec.Name.startswith("reloc.")) {
+ if (Error Err = parseRelocSection(Sec.Name, Ptr, End))
+ return Err;
+ }
+ return Error::success();
+}
+
+Error WasmObjectFile::parseTypeSection(const uint8_t *Ptr, const uint8_t *End) {
+ uint32_t Count = readVaruint32(Ptr);
+ Signatures.reserve(Count);
+ while (Count--) {
+ wasm::WasmSignature Sig;
+ Sig.ReturnType = wasm::WASM_TYPE_NORESULT;
+ int8_t Form = readVarint7(Ptr);
+ if (Form != wasm::WASM_TYPE_FUNC) {
+ return make_error<GenericBinaryError>("Invalid signature type",
+ object_error::parse_failed);
+ }
+ uint32_t ParamCount = readVaruint32(Ptr);
+ Sig.ParamTypes.reserve(ParamCount);
+ while (ParamCount--) {
+ uint32_t ParamType = readVarint7(Ptr);
+ Sig.ParamTypes.push_back(ParamType);
+ }
+ uint32_t ReturnCount = readVaruint32(Ptr);
+ if (ReturnCount) {
+ if (ReturnCount != 1) {
+ return make_error<GenericBinaryError>(
+ "Multiple return types not supported", object_error::parse_failed);
+ }
+ Sig.ReturnType = readVarint7(Ptr);
+ }
+ Signatures.push_back(Sig);
+ }
+ if (Ptr != End)
+ return make_error<GenericBinaryError>("Type section ended prematurely",
+ object_error::parse_failed);
+ return Error::success();
+}
+
+Error WasmObjectFile::parseImportSection(const uint8_t *Ptr, const uint8_t *End) {
+ uint32_t Count = readVaruint32(Ptr);
+ Imports.reserve(Count);
+ while (Count--) {
+ wasm::WasmImport Im;
+ Im.Module = readString(Ptr);
+ Im.Field = readString(Ptr);
+ Im.Kind = readUint8(Ptr);
+ switch (Im.Kind) {
+ case wasm::WASM_EXTERNAL_FUNCTION:
+ Im.SigIndex = readVaruint32(Ptr);
+ Symbols.emplace_back(Im.Field, WasmSymbol::SymbolType::FUNCTION_IMPORT);
+ break;
+ case wasm::WASM_EXTERNAL_GLOBAL:
+ Im.GlobalType = readVarint7(Ptr);
+ Im.GlobalMutable = readVaruint1(Ptr);
+ Symbols.emplace_back(Im.Field, WasmSymbol::SymbolType::GLOBAL_IMPORT);
+ break;
+ default:
+ // TODO(sbc): Handle other kinds of imports
+ return make_error<GenericBinaryError>(
+ "Unexpected import kind", object_error::parse_failed);
+ }
+ Imports.push_back(Im);
+ }
+ if (Ptr != End)
+ return make_error<GenericBinaryError>("Import section ended prematurely",
+ object_error::parse_failed);
+ return Error::success();
+}
+
+Error WasmObjectFile::parseFunctionSection(const uint8_t *Ptr, const uint8_t *End) {
+ uint32_t Count = readVaruint32(Ptr);
+ FunctionTypes.reserve(Count);
+ while (Count--) {
+ FunctionTypes.push_back(readVaruint32(Ptr));
+ }
+ if (Ptr != End)
+ return make_error<GenericBinaryError>("Function section ended prematurely",
+ object_error::parse_failed);
+ return Error::success();
+}
+
+Error WasmObjectFile::parseTableSection(const uint8_t *Ptr, const uint8_t *End) {
+ uint32_t Count = readVaruint32(Ptr);
+ Tables.reserve(Count);
+ while (Count--) {
+ wasm::WasmTable Table;
+ Table.ElemType = readVarint7(Ptr);
+ if (Table.ElemType != wasm::WASM_TYPE_ANYFUNC) {
+ return make_error<GenericBinaryError>("Invalid table element type",
+ object_error::parse_failed);
+ }
+ Table.Limits = readLimits(Ptr);
+ Tables.push_back(Table);
+ }
+ if (Ptr != End)
+ return make_error<GenericBinaryError>("Table section ended prematurely",
+ object_error::parse_failed);
+ return Error::success();
+}
+
+Error WasmObjectFile::parseMemorySection(const uint8_t *Ptr, const uint8_t *End) {
+ uint32_t Count = readVaruint32(Ptr);
+ Memories.reserve(Count);
+ while (Count--) {
+ Memories.push_back(readLimits(Ptr));
+ }
+ if (Ptr != End)
+ return make_error<GenericBinaryError>("Memory section ended prematurely",
+ object_error::parse_failed);
+ return Error::success();
+}
+
+Error WasmObjectFile::parseGlobalSection(const uint8_t *Ptr, const uint8_t *End) {
+ uint32_t Count = readVaruint32(Ptr);
+ Globals.reserve(Count);
+ while (Count--) {
+ wasm::WasmGlobal Global;
+ Global.Type = readVarint7(Ptr);
+ Global.Mutable = readVaruint1(Ptr);
+ if (Error Err = readInitExpr(Global.InitExpr, Ptr))
+ return Err;
+ Globals.push_back(Global);
+ }
+ if (Ptr != End)
+ return make_error<GenericBinaryError>("Global section ended prematurely",
+ object_error::parse_failed);
+ return Error::success();
+}
+
+Error WasmObjectFile::parseExportSection(const uint8_t *Ptr, const uint8_t *End) {
+ uint32_t Count = readVaruint32(Ptr);
+ Exports.reserve(Count);
+ while (Count--) {
+ wasm::WasmExport Ex;
+ Ex.Name = readString(Ptr);
+ Ex.Kind = readUint8(Ptr);
+ Ex.Index = readVaruint32(Ptr);
+ Exports.push_back(Ex);
+ switch (Ex.Kind) {
+ case wasm::WASM_EXTERNAL_FUNCTION:
+ Symbols.emplace_back(Ex.Name, WasmSymbol::SymbolType::FUNCTION_EXPORT);
+ break;
+ case wasm::WASM_EXTERNAL_GLOBAL:
+ Symbols.emplace_back(Ex.Name, WasmSymbol::SymbolType::GLOBAL_EXPORT);
+ break;
+ default:
+ // TODO(sbc): Handle other kinds of exports
+ return make_error<GenericBinaryError>(
+ "Unexpected export kind", object_error::parse_failed);
+ }
+ }
+ if (Ptr != End)
+ return make_error<GenericBinaryError>("Export section ended prematurely",
+ object_error::parse_failed);
+ return Error::success();
+}
+
+Error WasmObjectFile::parseStartSection(const uint8_t *Ptr, const uint8_t *End) {
+ StartFunction = readVaruint32(Ptr);
+ if (StartFunction >= FunctionTypes.size())
+ return make_error<GenericBinaryError>("Invalid start function",
+ object_error::parse_failed);
+ return Error::success();
+}
+
+Error WasmObjectFile::parseCodeSection(const uint8_t *Ptr, const uint8_t *End) {
+ uint32_t FunctionCount = readVaruint32(Ptr);
+ if (FunctionCount != FunctionTypes.size()) {
+ return make_error<GenericBinaryError>("Invalid function count",
+ object_error::parse_failed);
+ }
+
+ CodeSection = ArrayRef<uint8_t>(Ptr, End - Ptr);
+
+ while (FunctionCount--) {
+ wasm::WasmFunction Function;
+ uint32_t FunctionSize = readVaruint32(Ptr);
+ const uint8_t *FunctionEnd = Ptr + FunctionSize;
+
+ uint32_t NumLocalDecls = readVaruint32(Ptr);
+ Function.Locals.reserve(NumLocalDecls);
+ while (NumLocalDecls--) {
+ wasm::WasmLocalDecl Decl;
+ Decl.Count = readVaruint32(Ptr);
+ Decl.Type = readVarint7(Ptr);
+ Function.Locals.push_back(Decl);
+ }
+
+ uint32_t BodySize = FunctionEnd - Ptr;
+ Function.Body = ArrayRef<uint8_t>(Ptr, BodySize);
+ Ptr += BodySize;
+ assert(Ptr == FunctionEnd);
+ Functions.push_back(Function);
+ }
+ if (Ptr != End)
+ return make_error<GenericBinaryError>("Code section ended prematurely",
+ object_error::parse_failed);
+ return Error::success();
+}
+
+Error WasmObjectFile::parseElemSection(const uint8_t *Ptr, const uint8_t *End) {
+ uint32_t Count = readVaruint32(Ptr);
+ ElemSegments.reserve(Count);
+ while (Count--) {
+ wasm::WasmElemSegment Segment;
+ Segment.TableIndex = readVaruint32(Ptr);
+ if (Segment.TableIndex != 0) {
+ return make_error<GenericBinaryError>("Invalid TableIndex",
+ object_error::parse_failed);
+ }
+ if (Error Err = readInitExpr(Segment.Offset, Ptr))
+ return Err;
+ uint32_t NumElems = readVaruint32(Ptr);
+ while (NumElems--) {
+ Segment.Functions.push_back(readVaruint32(Ptr));
+ }
+ ElemSegments.push_back(Segment);
+ }
+ if (Ptr != End)
+ return make_error<GenericBinaryError>("Elem section ended prematurely",
+ object_error::parse_failed);
+ return Error::success();
+}
+
+Error WasmObjectFile::parseDataSection(const uint8_t *Ptr, const uint8_t *End) {
+ uint32_t Count = readVaruint32(Ptr);
+ DataSegments.reserve(Count);
+ while (Count--) {
+ wasm::WasmDataSegment Segment;
+ Segment.Index = readVaruint32(Ptr);
+ if (Error Err = readInitExpr(Segment.Offset, Ptr))
+ return Err;
+ uint32_t Size = readVaruint32(Ptr);
+ Segment.Content = ArrayRef<uint8_t>(Ptr, Size);
+ Ptr += Size;
+ DataSegments.push_back(Segment);
+ }
+ if (Ptr != End)
+ return make_error<GenericBinaryError>("Data section ended prematurely",
+ object_error::parse_failed);
return Error::success();
}
@@ -105,37 +590,48 @@ const wasm::WasmObjectHeader &WasmObjectFile::getHeader() const {
return Header;
}
-void WasmObjectFile::moveSymbolNext(DataRefImpl &Symb) const {
- llvm_unreachable("not yet implemented");
-}
-
-std::error_code WasmObjectFile::printSymbolName(raw_ostream &OS,
- DataRefImpl Symb) const {
- llvm_unreachable("not yet implemented");
- return object_error::invalid_symbol_index;
-}
+void WasmObjectFile::moveSymbolNext(DataRefImpl &Symb) const { Symb.d.a++; }
uint32_t WasmObjectFile::getSymbolFlags(DataRefImpl Symb) const {
- llvm_unreachable("not yet implemented");
- return 0;
+ const WasmSymbol &Sym = getWasmSymbol(Symb);
+ switch (Sym.Type) {
+ case WasmSymbol::SymbolType::FUNCTION_IMPORT:
+ return object::SymbolRef::SF_Undefined | SymbolRef::SF_Executable;
+ case WasmSymbol::SymbolType::FUNCTION_EXPORT:
+ return object::SymbolRef::SF_Global | SymbolRef::SF_Executable;
+ case WasmSymbol::SymbolType::DEBUG_FUNCTION_NAME:
+ return object::SymbolRef::SF_Executable;
+ case WasmSymbol::SymbolType::GLOBAL_IMPORT:
+ return object::SymbolRef::SF_Undefined;
+ case WasmSymbol::SymbolType::GLOBAL_EXPORT:
+ return object::SymbolRef::SF_Global;
+ }
+ llvm_unreachable("Unknown WasmSymbol::SymbolType");
}
basic_symbol_iterator WasmObjectFile::symbol_begin() const {
- return BasicSymbolRef(DataRefImpl(), this);
+ DataRefImpl Ref;
+ Ref.d.a = 0;
+ return BasicSymbolRef(Ref, this);
}
basic_symbol_iterator WasmObjectFile::symbol_end() const {
- return BasicSymbolRef(DataRefImpl(), this);
+ DataRefImpl Ref;
+ Ref.d.a = Symbols.size();
+ return BasicSymbolRef(Ref, this);
+}
+
+const WasmSymbol &WasmObjectFile::getWasmSymbol(DataRefImpl Symb) const {
+ return Symbols[Symb.d.a];
}
Expected<StringRef> WasmObjectFile::getSymbolName(DataRefImpl Symb) const {
- llvm_unreachable("not yet implemented");
- return errorCodeToError(object_error::invalid_symbol_index);
+ const WasmSymbol &Sym = getWasmSymbol(Symb);
+ return Sym.Name;
}
Expected<uint64_t> WasmObjectFile::getSymbolAddress(DataRefImpl Symb) const {
- llvm_unreachable("not yet implemented");
- return errorCodeToError(object_error::invalid_symbol_index);
+ return (uint64_t)Symb.d.a;
}
uint64_t WasmObjectFile::getSymbolValueImpl(DataRefImpl Symb) const {
@@ -169,7 +665,7 @@ void WasmObjectFile::moveSectionNext(DataRefImpl &Sec) const { Sec.d.a++; }
std::error_code WasmObjectFile::getSectionName(DataRefImpl Sec,
StringRef &Res) const {
- const wasm::WasmSection &S = Sections[Sec.d.a];
+ const WasmSection &S = Sections[Sec.d.a];
#define ECase(X) \
case wasm::WASM_SEC_##X: \
Res = #X; \
@@ -186,7 +682,7 @@ std::error_code WasmObjectFile::getSectionName(DataRefImpl Sec,
ECase(ELEM);
ECase(CODE);
ECase(DATA);
- case wasm::WASM_SEC_USER:
+ case wasm::WASM_SEC_CUSTOM:
Res = S.Name;
break;
default:
@@ -199,13 +695,13 @@ std::error_code WasmObjectFile::getSectionName(DataRefImpl Sec,
uint64_t WasmObjectFile::getSectionAddress(DataRefImpl Sec) const { return 0; }
uint64_t WasmObjectFile::getSectionSize(DataRefImpl Sec) const {
- const wasm::WasmSection &S = Sections[Sec.d.a];
+ const WasmSection &S = Sections[Sec.d.a];
return S.Content.size();
}
std::error_code WasmObjectFile::getSectionContents(DataRefImpl Sec,
StringRef &Res) const {
- const wasm::WasmSection &S = Sections[Sec.d.a];
+ const WasmSection &S = Sections[Sec.d.a];
// This will never fail since wasm sections can never be empty (user-sections
// must have a name and non-user sections each have a defined structure).
Res = StringRef(reinterpret_cast<const char *>(S.Content.data()),
@@ -222,13 +718,11 @@ bool WasmObjectFile::isSectionCompressed(DataRefImpl Sec) const {
}
bool WasmObjectFile::isSectionText(DataRefImpl Sec) const {
- const wasm::WasmSection &S = Sections[Sec.d.a];
- return S.Type == wasm::WASM_SEC_CODE;
+ return getWasmSection(Sec).Type == wasm::WASM_SEC_CODE;
}
bool WasmObjectFile::isSectionData(DataRefImpl Sec) const {
- const wasm::WasmSection &S = Sections[Sec.d.a];
- return S.Type == wasm::WASM_SEC_DATA;
+ return getWasmSection(Sec).Type == wasm::WASM_SEC_DATA;
}
bool WasmObjectFile::isSectionBSS(DataRefImpl Sec) const { return false; }
@@ -237,31 +731,28 @@ bool WasmObjectFile::isSectionVirtual(DataRefImpl Sec) const { return false; }
bool WasmObjectFile::isSectionBitcode(DataRefImpl Sec) const { return false; }
-relocation_iterator WasmObjectFile::section_rel_begin(DataRefImpl Sec) const {
- llvm_unreachable("not yet implemented");
- RelocationRef Rel;
- return relocation_iterator(Rel);
+relocation_iterator WasmObjectFile::section_rel_begin(DataRefImpl Ref) const {
+ DataRefImpl RelocRef;
+ RelocRef.d.a = Ref.d.a;
+ RelocRef.d.b = 0;
+ return relocation_iterator(RelocationRef(RelocRef, this));
}
-relocation_iterator WasmObjectFile::section_rel_end(DataRefImpl Sec) const {
- llvm_unreachable("not yet implemented");
- RelocationRef Rel;
- return relocation_iterator(Rel);
-}
-
-section_iterator WasmObjectFile::getRelocatedSection(DataRefImpl Sec) const {
- llvm_unreachable("not yet implemented");
- SectionRef Ref;
- return section_iterator(Ref);
+relocation_iterator WasmObjectFile::section_rel_end(DataRefImpl Ref) const {
+ const WasmSection &Sec = getWasmSection(Ref);
+ DataRefImpl RelocRef;
+ RelocRef.d.a = Ref.d.a;
+ RelocRef.d.b = Sec.Relocations.size();
+ return relocation_iterator(RelocationRef(RelocRef, this));
}
void WasmObjectFile::moveRelocationNext(DataRefImpl &Rel) const {
- llvm_unreachable("not yet implemented");
+ Rel.d.b++;
}
-uint64_t WasmObjectFile::getRelocationOffset(DataRefImpl Rel) const {
- llvm_unreachable("not yet implemented");
- return 0;
+uint64_t WasmObjectFile::getRelocationOffset(DataRefImpl Ref) const {
+ const wasm::WasmRelocation &Rel = getWasmRelocation(Ref);
+ return Rel.Offset;
}
symbol_iterator WasmObjectFile::getRelocationSymbol(DataRefImpl Rel) const {
@@ -270,14 +761,28 @@ symbol_iterator WasmObjectFile::getRelocationSymbol(DataRefImpl Rel) const {
return symbol_iterator(Ref);
}
-uint64_t WasmObjectFile::getRelocationType(DataRefImpl Rel) const {
- llvm_unreachable("not yet implemented");
- return 0;
+uint64_t WasmObjectFile::getRelocationType(DataRefImpl Ref) const {
+ const wasm::WasmRelocation &Rel = getWasmRelocation(Ref);
+ return Rel.Type;
}
void WasmObjectFile::getRelocationTypeName(
- DataRefImpl Rel, SmallVectorImpl<char> &Result) const {
- llvm_unreachable("not yet implemented");
+ DataRefImpl Ref, SmallVectorImpl<char> &Result) const {
+ const wasm::WasmRelocation& Rel = getWasmRelocation(Ref);
+ StringRef Res = "Unknown";
+
+#define WASM_RELOC(name, value) \
+ case wasm::name: \
+ Res = #name; \
+ break;
+
+ switch (Rel.Type) {
+#include "llvm/Support/WasmRelocs/WebAssembly.def"
+ }
+
+#undef WASM_RELOC
+
+ Result.append(Res.begin(), Res.end());
}
section_iterator WasmObjectFile::section_begin() const {
@@ -304,10 +809,25 @@ SubtargetFeatures WasmObjectFile::getFeatures() const {
bool WasmObjectFile::isRelocatableObject() const { return false; }
-const wasm::WasmSection *
+const WasmSection &WasmObjectFile::getWasmSection(DataRefImpl Ref) const {
+ assert(Ref.d.a < Sections.size());
+ return Sections[Ref.d.a];
+}
+
+const WasmSection &
WasmObjectFile::getWasmSection(const SectionRef &Section) const {
- return &Sections[Section.getRawDataRefImpl().d.a];
+ return getWasmSection(Section.getRawDataRefImpl());
}
-} // end namespace object
-} // end namespace llvm
+const wasm::WasmRelocation &
+WasmObjectFile::getWasmRelocation(const RelocationRef &Ref) const {
+ return getWasmRelocation(Ref.getRawDataRefImpl());
+}
+
+const wasm::WasmRelocation &
+WasmObjectFile::getWasmRelocation(DataRefImpl Ref) const {
+ assert(Ref.d.a < Sections.size());
+ const WasmSection& Sec = Sections[Ref.d.a];
+ assert(Ref.d.b < Sec.Relocations.size());
+ return Sec.Relocations[Ref.d.b];
+}
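
All of the readVar* helpers above bottom out in LEB128 decoding. A standalone sketch of the unsigned variant (LLVM's decodeULEB128 in Support/LEB128.h implements the same algorithm; this is just the idea, not the library code):

    #include <cassert>
    #include <cstdint>
    #include <iostream>

    static uint64_t decodeULEB128(const uint8_t *&Ptr) {
      uint64_t Result = 0;
      unsigned Shift = 0;
      uint8_t Byte;
      do {
        Byte = *Ptr++;
        Result |= uint64_t(Byte & 0x7f) << Shift; // low 7 bits are payload
        Shift += 7;
      } while (Byte & 0x80); // high bit set means more bytes follow
      return Result;
    }

    int main() {
      const uint8_t Buf[] = {0xe5, 0x8e, 0x26}; // 624485 in ULEB128
      const uint8_t *P = Buf;
      uint64_t V = decodeULEB128(P);
      assert(V == 624485 && P == Buf + 3);
      std::cout << V << "\n";
    }
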
diff --git a/lib/ObjectYAML/CMakeLists.txt b/lib/ObjectYAML/CMakeLists.txt
index 2eee95b318db..37f8fd7bce1a 100644
--- a/lib/ObjectYAML/CMakeLists.txt
+++ b/lib/ObjectYAML/CMakeLists.txt
@@ -1,8 +1,11 @@
add_llvm_library(LLVMObjectYAML
- YAML.cpp
COFFYAML.cpp
+ DWARFEmitter.cpp
+ DWARFVisitor.cpp
+ DWARFYAML.cpp
ELFYAML.cpp
MachOYAML.cpp
ObjectYAML.cpp
- DWARFYAML.cpp
+ WasmYAML.cpp
+ YAML.cpp
)
diff --git a/lib/ObjectYAML/DWARFEmitter.cpp b/lib/ObjectYAML/DWARFEmitter.cpp
new file mode 100644
index 000000000000..1aa1519b708b
--- /dev/null
+++ b/lib/ObjectYAML/DWARFEmitter.cpp
@@ -0,0 +1,321 @@
+//===- DWARFEmitter - Convert YAML to DWARF binary data -------------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+///
+/// \file
+/// \brief The DWARF component of yaml2obj. Provided as library code for tests.
+///
+//===----------------------------------------------------------------------===//
+
+#include "llvm/ObjectYAML/DWARFEmitter.h"
+#include "llvm/ObjectYAML/DWARFYAML.h"
+#include "llvm/Support/Error.h"
+#include "llvm/Support/LEB128.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/Support/SwapByteOrder.h"
+
+#include "DWARFVisitor.h"
+
+#include <algorithm>
+
+using namespace llvm;
+
+template <typename T>
+static void writeInteger(T Integer, raw_ostream &OS, bool IsLittleEndian) {
+ if (IsLittleEndian != sys::IsLittleEndianHost)
+ sys::swapByteOrder(Integer);
+ OS.write(reinterpret_cast<char *>(&Integer), sizeof(T));
+}
+
+static void writeVariableSizedInteger(uint64_t Integer, size_t Size,
+ raw_ostream &OS, bool IsLittleEndian) {
+ if (8 == Size)
+ writeInteger((uint64_t)Integer, OS, IsLittleEndian);
+ else if (4 == Size)
+ writeInteger((uint32_t)Integer, OS, IsLittleEndian);
+ else if (2 == Size)
+ writeInteger((uint16_t)Integer, OS, IsLittleEndian);
+ else if (1 == Size)
+ writeInteger((uint8_t)Integer, OS, IsLittleEndian);
+ else
+ assert(false && "Invalid integer write size.");
+}
+
+static void ZeroFillBytes(raw_ostream &OS, size_t Size) {
+ std::vector<uint8_t> FillData;
+ FillData.insert(FillData.begin(), Size, 0);
+ OS.write(reinterpret_cast<char *>(FillData.data()), Size);
+}
+
+static void writeInitialLength(const DWARFYAML::InitialLength &Length,
+ raw_ostream &OS, bool IsLittleEndian) {
+ writeInteger((uint32_t)Length.TotalLength, OS, IsLittleEndian);
+ if (Length.isDWARF64())
+ writeInteger((uint64_t)Length.TotalLength64, OS, IsLittleEndian);
+}
+
+void DWARFYAML::EmitDebugStr(raw_ostream &OS, const DWARFYAML::Data &DI) {
+ for (auto Str : DI.DebugStrings) {
+ OS.write(Str.data(), Str.size());
+ OS.write('\0');
+ }
+}
+
+void DWARFYAML::EmitDebugAbbrev(raw_ostream &OS, const DWARFYAML::Data &DI) {
+ for (auto AbbrevDecl : DI.AbbrevDecls) {
+ encodeULEB128(AbbrevDecl.Code, OS);
+ encodeULEB128(AbbrevDecl.Tag, OS);
+ OS.write(AbbrevDecl.Children);
+ for (auto Attr : AbbrevDecl.Attributes) {
+ encodeULEB128(Attr.Attribute, OS);
+ encodeULEB128(Attr.Form, OS);
+ if (Attr.Form == dwarf::DW_FORM_implicit_const)
+ encodeSLEB128(Attr.Value, OS);
+ }
+ encodeULEB128(0, OS);
+ encodeULEB128(0, OS);
+ }
+}
+
+void DWARFYAML::EmitDebugAranges(raw_ostream &OS, const DWARFYAML::Data &DI) {
+ for (auto Range : DI.ARanges) {
+ auto HeaderStart = OS.tell();
+ writeInitialLength(Range.Length, OS, DI.IsLittleEndian);
+ writeInteger((uint16_t)Range.Version, OS, DI.IsLittleEndian);
+ writeInteger((uint32_t)Range.CuOffset, OS, DI.IsLittleEndian);
+ writeInteger((uint8_t)Range.AddrSize, OS, DI.IsLittleEndian);
+ writeInteger((uint8_t)Range.SegSize, OS, DI.IsLittleEndian);
+
+ auto HeaderSize = OS.tell() - HeaderStart;
+ auto FirstDescriptor = alignTo(HeaderSize, Range.AddrSize * 2);
+ ZeroFillBytes(OS, FirstDescriptor - HeaderSize);
+
+ for (auto Descriptor : Range.Descriptors) {
+ writeVariableSizedInteger(Descriptor.Address, Range.AddrSize, OS,
+ DI.IsLittleEndian);
+ writeVariableSizedInteger(Descriptor.Length, Range.AddrSize, OS,
+ DI.IsLittleEndian);
+ }
+ ZeroFillBytes(OS, Range.AddrSize * 2);
+ }
+}
+
+void DWARFYAML::EmitPubSection(raw_ostream &OS,
+ const DWARFYAML::PubSection &Sect,
+ bool IsLittleEndian) {
+ writeInitialLength(Sect.Length, OS, IsLittleEndian);
+ writeInteger((uint16_t)Sect.Version, OS, IsLittleEndian);
+ writeInteger((uint32_t)Sect.UnitOffset, OS, IsLittleEndian);
+ writeInteger((uint32_t)Sect.UnitSize, OS, IsLittleEndian);
+ for (auto Entry : Sect.Entries) {
+ writeInteger((uint32_t)Entry.DieOffset, OS, IsLittleEndian);
+ if (Sect.IsGNUStyle)
+ writeInteger((uint32_t)Entry.Descriptor, OS, IsLittleEndian);
+ OS.write(Entry.Name.data(), Entry.Name.size());
+ OS.write('\0');
+ }
+}
+
+/// \brief An extension of the DWARFYAML::ConstVisitor which writes compile
+/// units and DIEs to a stream.
+class DumpVisitor : public DWARFYAML::ConstVisitor {
+ raw_ostream &OS;
+
+protected:
+ virtual void onStartCompileUnit(const DWARFYAML::Unit &CU) {
+ writeInitialLength(CU.Length, OS, DebugInfo.IsLittleEndian);
+ writeInteger((uint16_t)CU.Version, OS, DebugInfo.IsLittleEndian);
+ if (CU.Version >= 5) {
+ writeInteger((uint8_t)CU.Type, OS, DebugInfo.IsLittleEndian);
+ writeInteger((uint8_t)CU.AddrSize, OS, DebugInfo.IsLittleEndian);
+ writeInteger((uint32_t)CU.AbbrOffset, OS, DebugInfo.IsLittleEndian);
+ } else {
+ writeInteger((uint32_t)CU.AbbrOffset, OS, DebugInfo.IsLittleEndian);
+ writeInteger((uint8_t)CU.AddrSize, OS, DebugInfo.IsLittleEndian);
+ }
+ }
+
+ virtual void onStartDIE(const DWARFYAML::Unit &CU,
+ const DWARFYAML::Entry &DIE) {
+ encodeULEB128(DIE.AbbrCode, OS);
+ }
+
+ virtual void onValue(const uint8_t U) {
+ writeInteger(U, OS, DebugInfo.IsLittleEndian);
+ }
+
+ virtual void onValue(const uint16_t U) {
+ writeInteger(U, OS, DebugInfo.IsLittleEndian);
+ }
+ virtual void onValue(const uint32_t U) {
+ writeInteger(U, OS, DebugInfo.IsLittleEndian);
+ }
+ virtual void onValue(const uint64_t U, const bool LEB = false) {
+ if (LEB)
+ encodeULEB128(U, OS);
+ else
+ writeInteger(U, OS, DebugInfo.IsLittleEndian);
+ }
+
+ virtual void onValue(const int64_t S, const bool LEB = false) {
+ if (LEB)
+ encodeSLEB128(S, OS);
+ else
+ writeInteger(S, OS, DebugInfo.IsLittleEndian);
+ }
+
+ virtual void onValue(const StringRef String) {
+ OS.write(String.data(), String.size());
+ OS.write('\0');
+ }
+
+ virtual void onValue(const MemoryBufferRef MBR) {
+ OS.write(MBR.getBufferStart(), MBR.getBufferSize());
+ }
+
+public:
+ DumpVisitor(const DWARFYAML::Data &DI, raw_ostream &Out)
+ : DWARFYAML::ConstVisitor(DI), OS(Out) {}
+};
+
+void DWARFYAML::EmitDebugInfo(raw_ostream &OS, const DWARFYAML::Data &DI) {
+ DumpVisitor Visitor(DI, OS);
+ Visitor.traverseDebugInfo();
+}
+
+static void EmitFileEntry(raw_ostream &OS, const DWARFYAML::File &File) {
+ OS.write(File.Name.data(), File.Name.size());
+ OS.write('\0');
+ encodeULEB128(File.DirIdx, OS);
+ encodeULEB128(File.ModTime, OS);
+ encodeULEB128(File.Length, OS);
+}
+
+void DWARFYAML::EmitDebugLine(raw_ostream &OS, const DWARFYAML::Data &DI) {
+ for (const auto &LineTable : DI.DebugLines) {
+ writeInitialLength(LineTable.Length, OS, DI.IsLittleEndian);
+ uint64_t SizeOfPrologueLength = LineTable.Length.isDWARF64() ? 8 : 4;
+ writeInteger((uint16_t)LineTable.Version, OS, DI.IsLittleEndian);
+ writeVariableSizedInteger(LineTable.PrologueLength, SizeOfPrologueLength,
+ OS, DI.IsLittleEndian);
+ writeInteger((uint8_t)LineTable.MinInstLength, OS, DI.IsLittleEndian);
+ if (LineTable.Version >= 4)
+ writeInteger((uint8_t)LineTable.MaxOpsPerInst, OS, DI.IsLittleEndian);
+ writeInteger((uint8_t)LineTable.DefaultIsStmt, OS, DI.IsLittleEndian);
+ writeInteger((uint8_t)LineTable.LineBase, OS, DI.IsLittleEndian);
+ writeInteger((uint8_t)LineTable.LineRange, OS, DI.IsLittleEndian);
+ writeInteger((uint8_t)LineTable.OpcodeBase, OS, DI.IsLittleEndian);
+
+ for (auto OpcodeLength : LineTable.StandardOpcodeLengths)
+ writeInteger((uint8_t)OpcodeLength, OS, DI.IsLittleEndian);
+
+ for (auto IncludeDir : LineTable.IncludeDirs) {
+ OS.write(IncludeDir.data(), IncludeDir.size());
+ OS.write('\0');
+ }
+ OS.write('\0');
+
+ for (auto File : LineTable.Files)
+ EmitFileEntry(OS, File);
+ OS.write('\0');
+
+ for (auto Op : LineTable.Opcodes) {
+ writeInteger((uint8_t)Op.Opcode, OS, DI.IsLittleEndian);
+ if (Op.Opcode == 0) {
+ encodeULEB128(Op.ExtLen, OS);
+ writeInteger((uint8_t)Op.SubOpcode, OS, DI.IsLittleEndian);
+ switch (Op.SubOpcode) {
+ case dwarf::DW_LNE_set_address:
+ case dwarf::DW_LNE_set_discriminator:
+ writeVariableSizedInteger(Op.Data, DI.CompileUnits[0].AddrSize, OS,
+ DI.IsLittleEndian);
+ break;
+ case dwarf::DW_LNE_define_file:
+ EmitFileEntry(OS, Op.FileEntry);
+ break;
+ case dwarf::DW_LNE_end_sequence:
+ break;
+ default:
+ for (auto OpByte : Op.UnknownOpcodeData)
+ writeInteger((uint8_t)OpByte, OS, DI.IsLittleEndian);
+ }
+ } else if (Op.Opcode < LineTable.OpcodeBase) {
+ switch (Op.Opcode) {
+ case dwarf::DW_LNS_copy:
+ case dwarf::DW_LNS_negate_stmt:
+ case dwarf::DW_LNS_set_basic_block:
+ case dwarf::DW_LNS_const_add_pc:
+ case dwarf::DW_LNS_set_prologue_end:
+ case dwarf::DW_LNS_set_epilogue_begin:
+ break;
+
+ case dwarf::DW_LNS_advance_pc:
+ case dwarf::DW_LNS_set_file:
+ case dwarf::DW_LNS_set_column:
+ case dwarf::DW_LNS_set_isa:
+ encodeULEB128(Op.Data, OS);
+ break;
+
+ case dwarf::DW_LNS_advance_line:
+ encodeSLEB128(Op.SData, OS);
+ break;
+
+ case dwarf::DW_LNS_fixed_advance_pc:
+ writeInteger((uint16_t)Op.Data, OS, DI.IsLittleEndian);
+ break;
+
+ default:
+ for (auto OpData : Op.StandardOpcodeData) {
+ encodeULEB128(OpData, OS);
+ }
+ }
+ }
+ }
+ }
+}
+
+typedef void (*EmitFuncType)(raw_ostream &, const DWARFYAML::Data &);
+
+static void
+EmitDebugSectionImpl(const DWARFYAML::Data &DI, EmitFuncType EmitFunc,
+ StringRef Sec,
+ StringMap<std::unique_ptr<MemoryBuffer>> &OutputBuffers) {
+ std::string Data;
+ raw_string_ostream DebugInfoStream(Data);
+ EmitFunc(DebugInfoStream, DI);
+ DebugInfoStream.flush();
+ if (!Data.empty())
+ OutputBuffers[Sec] = MemoryBuffer::getMemBufferCopy(Data);
+}
+
+Expected<StringMap<std::unique_ptr<MemoryBuffer>>>
+DWARFYAML::EmitDebugSections(StringRef YAMLString,
+ bool IsLittleEndian) {
+ StringMap<std::unique_ptr<MemoryBuffer>> DebugSections;
+
+ yaml::Input YIn(YAMLString);
+
+ DWARFYAML::Data DI;
+ DI.IsLittleEndian = IsLittleEndian;
+ YIn >> DI;
+ if (YIn.error())
+ return errorCodeToError(YIn.error());
+
+ EmitDebugSectionImpl(DI, &DWARFYAML::EmitDebugInfo, "debug_info",
+ DebugSections);
+ EmitDebugSectionImpl(DI, &DWARFYAML::EmitDebugLine, "debug_line",
+ DebugSections);
+ EmitDebugSectionImpl(DI, &DWARFYAML::EmitDebugStr, "debug_str",
+ DebugSections);
+ EmitDebugSectionImpl(DI, &DWARFYAML::EmitDebugAbbrev, "debug_abbrev",
+ DebugSections);
+ EmitDebugSectionImpl(DI, &DWARFYAML::EmitDebugAranges, "debug_aranges",
+ DebugSections);
+ return std::move(DebugSections);
+}
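
The initial-length handling above follows DWARF's length convention: a 32-bit length field, where the reserved value 0xffffffff announces that a real 64-bit length follows (DWARF64). A standalone sketch of how such a length would be emitted when computed from scratch; note the patch's YAML form instead stores TotalLength and TotalLength64 explicitly, and a little-endian host is assumed here:

    #include <cstdint>
    #include <cstring>
    #include <iostream>
    #include <vector>

    static void writeInitialLength(uint64_t Length, std::vector<uint8_t> &Out) {
      auto Append = [&](const void *P, size_t N) {
        const uint8_t *B = static_cast<const uint8_t *>(P);
        Out.insert(Out.end(), B, B + N);
      };
      if (Length < 0xfffffff0u) { // fits in the 32-bit (DWARF32) form
        uint32_t L32 = uint32_t(Length);
        Append(&L32, sizeof(L32));
      } else { // escape value, then the full 64-bit length
        uint32_t Escape = 0xffffffffu;
        Append(&Escape, sizeof(Escape));
        Append(&Length, sizeof(Length));
      }
    }

    int main() {
      std::vector<uint8_t> Out;
      writeInitialLength(42, Out);             // 4 bytes
      writeInitialLength(0x123456789ull, Out); // 4 + 8 bytes
      std::cout << Out.size() << "\n";         // 16
    }
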
diff --git a/lib/ObjectYAML/DWARFVisitor.cpp b/lib/ObjectYAML/DWARFVisitor.cpp
new file mode 100644
index 000000000000..36a9f7638bd4
--- /dev/null
+++ b/lib/ObjectYAML/DWARFVisitor.cpp
@@ -0,0 +1,178 @@
+//===--- DWARFVisitor.cpp ---------------------------------------*- C++ -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+//===----------------------------------------------------------------------===//
+
+#include "DWARFVisitor.h"
+#include "llvm/ObjectYAML/DWARFYAML.h"
+
+using namespace llvm;
+
+template <typename T>
+void DWARFYAML::VisitorImpl<T>::onVariableSizeValue(uint64_t U, unsigned Size) {
+ switch (Size) {
+ case 8:
+ onValue((uint64_t)U);
+ break;
+ case 4:
+ onValue((uint32_t)U);
+ break;
+ case 2:
+ onValue((uint16_t)U);
+ break;
+ case 1:
+ onValue((uint8_t)U);
+ break;
+ default:
+ llvm_unreachable("Invalid integer write size.");
+ }
+}
+
+static unsigned getOffsetSize(const DWARFYAML::Unit &Unit) {
+ return Unit.Length.isDWARF64() ? 8 : 4;
+}
+
+static unsigned getRefSize(const DWARFYAML::Unit &Unit) {
+ if (Unit.Version == 2)
+ return Unit.AddrSize;
+ return getOffsetSize(Unit);
+}
+
+template <typename T> void DWARFYAML::VisitorImpl<T>::traverseDebugInfo() {
+ for (auto &Unit : DebugInfo.CompileUnits) {
+ onStartCompileUnit(Unit);
+ auto FirstAbbrevCode = Unit.Entries[0].AbbrCode;
+
+ for (auto &Entry : Unit.Entries) {
+ onStartDIE(Unit, Entry);
+ if (Entry.AbbrCode == 0u)
+ continue;
+ auto &Abbrev = DebugInfo.AbbrevDecls[Entry.AbbrCode - FirstAbbrevCode];
+ auto FormVal = Entry.Values.begin();
+ auto AbbrForm = Abbrev.Attributes.begin();
+ for (;
+ FormVal != Entry.Values.end() && AbbrForm != Abbrev.Attributes.end();
+ ++FormVal, ++AbbrForm) {
+ onForm(*AbbrForm, *FormVal);
+ dwarf::Form Form = AbbrForm->Form;
+ bool Indirect;
+ do {
+ Indirect = false;
+ switch (Form) {
+ case dwarf::DW_FORM_addr:
+ onVariableSizeValue(FormVal->Value, Unit.AddrSize);
+ break;
+ case dwarf::DW_FORM_ref_addr:
+ onVariableSizeValue(FormVal->Value, getRefSize(Unit));
+ break;
+ case dwarf::DW_FORM_exprloc:
+ case dwarf::DW_FORM_block:
+ onValue((uint64_t)FormVal->BlockData.size(), true);
+ onValue(
+ MemoryBufferRef(StringRef((const char *)&FormVal->BlockData[0],
+ FormVal->BlockData.size()),
+ ""));
+ break;
+ case dwarf::DW_FORM_block1: {
+ auto writeSize = FormVal->BlockData.size();
+ onValue((uint8_t)writeSize);
+ onValue(
+ MemoryBufferRef(StringRef((const char *)&FormVal->BlockData[0],
+ FormVal->BlockData.size()),
+ ""));
+ break;
+ }
+ case dwarf::DW_FORM_block2: {
+ auto writeSize = FormVal->BlockData.size();
+ onValue((uint16_t)writeSize);
+ onValue(
+ MemoryBufferRef(StringRef((const char *)&FormVal->BlockData[0],
+ FormVal->BlockData.size()),
+ ""));
+ break;
+ }
+ case dwarf::DW_FORM_block4: {
+ auto writeSize = FormVal->BlockData.size();
+ onValue((uint32_t)writeSize);
+ onValue(
+ MemoryBufferRef(StringRef((const char *)&FormVal->BlockData[0],
+ FormVal->BlockData.size()),
+ ""));
+ break;
+ }
+ case dwarf::DW_FORM_data1:
+ case dwarf::DW_FORM_ref1:
+ case dwarf::DW_FORM_flag:
+ case dwarf::DW_FORM_strx1:
+ case dwarf::DW_FORM_addrx1:
+ onValue((uint8_t)FormVal->Value);
+ break;
+ case dwarf::DW_FORM_data2:
+ case dwarf::DW_FORM_ref2:
+ case dwarf::DW_FORM_strx2:
+ case dwarf::DW_FORM_addrx2:
+ onValue((uint16_t)FormVal->Value);
+ break;
+ case dwarf::DW_FORM_data4:
+ case dwarf::DW_FORM_ref4:
+ case dwarf::DW_FORM_ref_sup4:
+ case dwarf::DW_FORM_strx4:
+ case dwarf::DW_FORM_addrx4:
+ onValue((uint32_t)FormVal->Value);
+ break;
+ case dwarf::DW_FORM_data8:
+ case dwarf::DW_FORM_ref8:
+ case dwarf::DW_FORM_ref_sup8:
+ onValue((uint64_t)FormVal->Value);
+ break;
+ case dwarf::DW_FORM_sdata:
+ onValue((int64_t)FormVal->Value, true);
+ break;
+ case dwarf::DW_FORM_udata:
+ case dwarf::DW_FORM_ref_udata:
+ onValue((uint64_t)FormVal->Value, true);
+ break;
+ case dwarf::DW_FORM_string:
+ onValue(FormVal->CStr);
+ break;
+ case dwarf::DW_FORM_indirect:
+ onValue((uint64_t)FormVal->Value, true);
+ Indirect = true;
+ Form = static_cast<dwarf::Form>((uint64_t)FormVal->Value);
+ ++FormVal;
+ break;
+ case dwarf::DW_FORM_strp:
+ case dwarf::DW_FORM_sec_offset:
+ case dwarf::DW_FORM_GNU_ref_alt:
+ case dwarf::DW_FORM_GNU_strp_alt:
+ case dwarf::DW_FORM_line_strp:
+ case dwarf::DW_FORM_strp_sup:
+ onVariableSizeValue(FormVal->Value, getOffsetSize(Unit));
+ break;
+ case dwarf::DW_FORM_ref_sig8:
+ onValue((uint64_t)FormVal->Value);
+ break;
+ case dwarf::DW_FORM_GNU_addr_index:
+ case dwarf::DW_FORM_GNU_str_index:
+ onValue((uint64_t)FormVal->Value, true);
+ break;
+ default:
+ break;
+ }
+ } while (Indirect);
+ }
+ onEndDIE(Unit, Entry);
+ }
+ onEndCompileUnit(Unit);
+ }
+}
+
+// Explicitly instantiate the two template expansions.
+template class DWARFYAML::VisitorImpl<DWARFYAML::Data>;
+template class DWARFYAML::VisitorImpl<const DWARFYAML::Data>;
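
The onVariableSizeValue dispatch above reduces every unit-dependent width (address size, DWARF32 vs DWARF64 offsets) to a truncation before the value is emitted. A tiny standalone sketch of the same reduction:

    #include <cstdint>
    #include <iostream>

    static uint64_t truncateToSize(uint64_t U, unsigned Size) {
      switch (Size) {
      case 8: return U;
      case 4: return uint32_t(U);
      case 2: return uint16_t(U);
      case 1: return uint8_t(U);
      default: return 0; // the patch hits llvm_unreachable here instead
      }
    }

    int main() {
      std::cout << std::hex << truncateToSize(0x1122334455667788ull, 4) << "\n";
      // prints 55667788: only the low four bytes survive a 32-bit emission
    }
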
diff --git a/lib/ObjectYAML/DWARFVisitor.h b/lib/ObjectYAML/DWARFVisitor.h
new file mode 100644
index 000000000000..263e36220a05
--- /dev/null
+++ b/lib/ObjectYAML/DWARFVisitor.h
@@ -0,0 +1,97 @@
+//===--- DWARFVisitor.h -----------------------------------------*- C++ -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_OBJECTYAML_DWARFVISITOR_H
+#define LLVM_OBJECTYAML_DWARFVISITOR_H
+
+#include "llvm/ADT/StringRef.h"
+#include "llvm/Support/Dwarf.h"
+#include "llvm/Support/MemoryBuffer.h"
+
+namespace llvm {
+
+namespace DWARFYAML {
+
+struct Data;
+struct Unit;
+struct Entry;
+struct FormValue;
+struct AttributeAbbrev;
+
+/// \brief A class that visits DWARFYAML Compile Units and DIEs in preorder.
+///
+/// Extensions of this class can maintain either const or non-const references
+/// to the DWARFYAML::Data object.
+template <typename T> class VisitorImpl {
+protected:
+ T &DebugInfo;
+
+ /// Visitor Functions
+ /// @{
+ virtual void onStartCompileUnit(Unit &CU) {}
+ virtual void onEndCompileUnit(Unit &CU) {}
+ virtual void onStartDIE(Unit &CU, Entry &DIE) {}
+ virtual void onEndDIE(Unit &CU, Entry &DIE) {}
+ virtual void onForm(AttributeAbbrev &AttAbbrev, FormValue &Value) {}
+ /// @}
+
+ /// Const Visitor Functions
+ /// @{
+ virtual void onStartCompileUnit(const Unit &CU) {}
+ virtual void onEndCompileUnit(const Unit &CU) {}
+ virtual void onStartDIE(const Unit &CU, const Entry &DIE) {}
+ virtual void onEndDIE(const Unit &CU, const Entry &DIE) {}
+ virtual void onForm(const AttributeAbbrev &AttAbbrev,
+ const FormValue &Value) {}
+ /// @}
+
+ /// Value visitors
+ /// @{
+ virtual void onValue(const uint8_t U) {}
+ virtual void onValue(const uint16_t U) {}
+ virtual void onValue(const uint32_t U) {}
+ virtual void onValue(const uint64_t U, const bool LEB = false) {}
+ virtual void onValue(const int64_t S, const bool LEB = false) {}
+ virtual void onValue(const StringRef String) {}
+ virtual void onValue(const MemoryBufferRef MBR) {}
+ /// @}
+
+public:
+ VisitorImpl(T &DI) : DebugInfo(DI) {}
+
+ virtual ~VisitorImpl() {}
+
+ void traverseDebugInfo();
+
+private:
+ void onVariableSizeValue(uint64_t U, unsigned Size);
+};
+
+// Make the visitor instantiations extern here and explicit in the cpp file.
+// This prevents them from being instantiated in every compilation unit that
+// uses the visitors.
+extern template class VisitorImpl<DWARFYAML::Data>;
+extern template class VisitorImpl<const DWARFYAML::Data>;
+
+class Visitor : public VisitorImpl<Data> {
+public:
+ Visitor(Data &DI) : VisitorImpl<Data>(DI) {}
+};
+
+class ConstVisitor : public VisitorImpl<const Data> {
+public:
+ ConstVisitor(const Data &DI) : VisitorImpl<const Data>(DI) {}
+};
+
+} // namespace DWARFYAML
+} // namespace llvm
+
+#endif
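
A standalone sketch of the extern-template pattern this header relies on, with illustrative types rather than the DWARFYAML classes: the template body is instantiated explicitly in one translation unit, and other units only see the extern declaration, so they never re-instantiate it:

    #include <iostream>

    template <typename T> class VisitorImpl {
    protected:
      T &Data;
      virtual void onValue(int) {} // hook for subclasses
    public:
      explicit VisitorImpl(T &D) : Data(D) {}
      virtual ~VisitorImpl() = default;
      void traverse() { onValue(Data); } // defined once, instantiated twice
    };

    // In a header this would read: extern template class VisitorImpl<int>;
    // In exactly one .cpp, the explicit instantiations:
    template class VisitorImpl<int>;
    template class VisitorImpl<const int>;

    struct Printer : VisitorImpl<const int> {
      using VisitorImpl<const int>::VisitorImpl;
      void onValue(int V) override { std::cout << V << "\n"; }
    };

    int main() {
      const int X = 7;
      Printer P(X);
      P.traverse(); // prints 7
    }
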
diff --git a/lib/ObjectYAML/DWARFYAML.cpp b/lib/ObjectYAML/DWARFYAML.cpp
index 014e63fe7d34..edb9545f14b1 100644
--- a/lib/ObjectYAML/DWARFYAML.cpp
+++ b/lib/ObjectYAML/DWARFYAML.cpp
@@ -54,6 +54,8 @@ void MappingTraits<DWARFYAML::AttributeAbbrev>::mapping(
IO &IO, DWARFYAML::AttributeAbbrev &AttAbbrev) {
IO.mapRequired("Attribute", AttAbbrev.Attribute);
IO.mapRequired("Form", AttAbbrev.Form);
+ if (AttAbbrev.Form == dwarf::DW_FORM_implicit_const)
+ IO.mapRequired("Value", AttAbbrev.Value);
}
void MappingTraits<DWARFYAML::ARangeDescriptor>::mapping(
@@ -97,6 +99,8 @@ void MappingTraits<DWARFYAML::PubSection>::mapping(
void MappingTraits<DWARFYAML::Unit>::mapping(IO &IO, DWARFYAML::Unit &Unit) {
IO.mapRequired("Length", Unit.Length);
IO.mapRequired("Version", Unit.Version);
+ if (Unit.Version >= 5)
+ IO.mapRequired("UnitType", Unit.Type);
IO.mapRequired("AbbrOffset", Unit.AbbrOffset);
IO.mapRequired("AddrSize", Unit.AddrSize);
IO.mapOptional("Entries", Unit.Entries);
@@ -144,9 +148,7 @@ void MappingTraits<DWARFYAML::LineTableOpcode>::mapping(
void MappingTraits<DWARFYAML::LineTable>::mapping(
IO &IO, DWARFYAML::LineTable &LineTable) {
- IO.mapRequired("TotalLength", LineTable.TotalLength);
- if (LineTable.TotalLength == UINT32_MAX)
- IO.mapRequired("TotalLength64", LineTable.TotalLength64);
+ IO.mapRequired("Length", LineTable.Length);
IO.mapRequired("Version", LineTable.Version);
IO.mapRequired("PrologueLength", LineTable.PrologueLength);
IO.mapRequired("MinInstLength", LineTable.MinInstLength);
@@ -162,6 +164,13 @@ void MappingTraits<DWARFYAML::LineTable>::mapping(
IO.mapRequired("Opcodes", LineTable.Opcodes);
}
+void MappingTraits<DWARFYAML::InitialLength>::mapping(
+ IO &IO, DWARFYAML::InitialLength &InitialLength) {
+ IO.mapRequired("TotalLength", InitialLength.TotalLength);
+ if (InitialLength.isDWARF64())
+ IO.mapRequired("TotalLength64", InitialLength.TotalLength64);
+}
+
} // namespace llvm::yaml
} // namespace llvm
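
The conditional mapRequired calls above make a field's presence depend on another field's value, e.g. TotalLength64 is only mapped when TotalLength holds the DWARF64 escape. A standalone sketch of that idea with a stand-in IO type (PrintIO here is illustrative, not llvm::yaml::IO):

    #include <cstdint>
    #include <iostream>

    struct InitialLength {
      uint32_t TotalLength = 0;
      uint64_t TotalLength64 = 0;
      bool isDWARF64() const { return TotalLength == 0xffffffffu; }
    };

    template <typename IOLike>
    void mapping(IOLike &IO, InitialLength &L) {
      IO.map("TotalLength", L.TotalLength);
      if (L.isDWARF64()) // only present in the DWARF64 form
        IO.map("TotalLength64", L.TotalLength64);
    }

    struct PrintIO { // trivial "output" IO: just dumps key/value pairs
      template <typename T> void map(const char *Key, T &V) {
        std::cout << Key << ": " << +V << "\n";
      }
    };

    int main() {
      InitialLength L;
      L.TotalLength = 0xffffffffu; // the escape value
      L.TotalLength64 = 123456789;
      PrintIO IO;
      mapping(IO, L); // prints both fields, since the escape is set
    }
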
diff --git a/lib/ObjectYAML/ELFYAML.cpp b/lib/ObjectYAML/ELFYAML.cpp
index fe9af9f3ac76..3052901da45c 100644
--- a/lib/ObjectYAML/ELFYAML.cpp
+++ b/lib/ObjectYAML/ELFYAML.cpp
@@ -21,231 +21,229 @@ ELFYAML::Section::~Section() {}
namespace yaml {
-void
-ScalarEnumerationTraits<ELFYAML::ELF_ET>::enumeration(IO &IO,
- ELFYAML::ELF_ET &Value) {
-#define ECase(X) IO.enumCase(Value, #X, ELF::X);
- ECase(ET_NONE)
- ECase(ET_REL)
- ECase(ET_EXEC)
- ECase(ET_DYN)
- ECase(ET_CORE)
+void ScalarEnumerationTraits<ELFYAML::ELF_ET>::enumeration(
+ IO &IO, ELFYAML::ELF_ET &Value) {
+#define ECase(X) IO.enumCase(Value, #X, ELF::X)
+ ECase(ET_NONE);
+ ECase(ET_REL);
+ ECase(ET_EXEC);
+ ECase(ET_DYN);
+ ECase(ET_CORE);
#undef ECase
IO.enumFallback<Hex16>(Value);
}
-void
-ScalarEnumerationTraits<ELFYAML::ELF_EM>::enumeration(IO &IO,
- ELFYAML::ELF_EM &Value) {
-#define ECase(X) IO.enumCase(Value, #X, ELF::X);
- ECase(EM_NONE)
- ECase(EM_M32)
- ECase(EM_SPARC)
- ECase(EM_386)
- ECase(EM_68K)
- ECase(EM_88K)
- ECase(EM_IAMCU)
- ECase(EM_860)
- ECase(EM_MIPS)
- ECase(EM_S370)
- ECase(EM_MIPS_RS3_LE)
- ECase(EM_PARISC)
- ECase(EM_VPP500)
- ECase(EM_SPARC32PLUS)
- ECase(EM_960)
- ECase(EM_PPC)
- ECase(EM_PPC64)
- ECase(EM_S390)
- ECase(EM_SPU)
- ECase(EM_V800)
- ECase(EM_FR20)
- ECase(EM_RH32)
- ECase(EM_RCE)
- ECase(EM_ARM)
- ECase(EM_ALPHA)
- ECase(EM_SH)
- ECase(EM_SPARCV9)
- ECase(EM_TRICORE)
- ECase(EM_ARC)
- ECase(EM_H8_300)
- ECase(EM_H8_300H)
- ECase(EM_H8S)
- ECase(EM_H8_500)
- ECase(EM_IA_64)
- ECase(EM_MIPS_X)
- ECase(EM_COLDFIRE)
- ECase(EM_68HC12)
- ECase(EM_MMA)
- ECase(EM_PCP)
- ECase(EM_NCPU)
- ECase(EM_NDR1)
- ECase(EM_STARCORE)
- ECase(EM_ME16)
- ECase(EM_ST100)
- ECase(EM_TINYJ)
- ECase(EM_X86_64)
- ECase(EM_PDSP)
- ECase(EM_PDP10)
- ECase(EM_PDP11)
- ECase(EM_FX66)
- ECase(EM_ST9PLUS)
- ECase(EM_ST7)
- ECase(EM_68HC16)
- ECase(EM_68HC11)
- ECase(EM_68HC08)
- ECase(EM_68HC05)
- ECase(EM_SVX)
- ECase(EM_ST19)
- ECase(EM_VAX)
- ECase(EM_CRIS)
- ECase(EM_JAVELIN)
- ECase(EM_FIREPATH)
- ECase(EM_ZSP)
- ECase(EM_MMIX)
- ECase(EM_HUANY)
- ECase(EM_PRISM)
- ECase(EM_AVR)
- ECase(EM_FR30)
- ECase(EM_D10V)
- ECase(EM_D30V)
- ECase(EM_V850)
- ECase(EM_M32R)
- ECase(EM_MN10300)
- ECase(EM_MN10200)
- ECase(EM_PJ)
- ECase(EM_OPENRISC)
- ECase(EM_ARC_COMPACT)
- ECase(EM_XTENSA)
- ECase(EM_VIDEOCORE)
- ECase(EM_TMM_GPP)
- ECase(EM_NS32K)
- ECase(EM_TPC)
- ECase(EM_SNP1K)
- ECase(EM_ST200)
- ECase(EM_IP2K)
- ECase(EM_MAX)
- ECase(EM_CR)
- ECase(EM_F2MC16)
- ECase(EM_MSP430)
- ECase(EM_BLACKFIN)
- ECase(EM_SE_C33)
- ECase(EM_SEP)
- ECase(EM_ARCA)
- ECase(EM_UNICORE)
- ECase(EM_EXCESS)
- ECase(EM_DXP)
- ECase(EM_ALTERA_NIOS2)
- ECase(EM_CRX)
- ECase(EM_XGATE)
- ECase(EM_C166)
- ECase(EM_M16C)
- ECase(EM_DSPIC30F)
- ECase(EM_CE)
- ECase(EM_M32C)
- ECase(EM_TSK3000)
- ECase(EM_RS08)
- ECase(EM_SHARC)
- ECase(EM_ECOG2)
- ECase(EM_SCORE7)
- ECase(EM_DSP24)
- ECase(EM_VIDEOCORE3)
- ECase(EM_LATTICEMICO32)
- ECase(EM_SE_C17)
- ECase(EM_TI_C6000)
- ECase(EM_TI_C2000)
- ECase(EM_TI_C5500)
- ECase(EM_MMDSP_PLUS)
- ECase(EM_CYPRESS_M8C)
- ECase(EM_R32C)
- ECase(EM_TRIMEDIA)
- ECase(EM_HEXAGON)
- ECase(EM_8051)
- ECase(EM_STXP7X)
- ECase(EM_NDS32)
- ECase(EM_ECOG1)
- ECase(EM_ECOG1X)
- ECase(EM_MAXQ30)
- ECase(EM_XIMO16)
- ECase(EM_MANIK)
- ECase(EM_CRAYNV2)
- ECase(EM_RX)
- ECase(EM_METAG)
- ECase(EM_MCST_ELBRUS)
- ECase(EM_ECOG16)
- ECase(EM_CR16)
- ECase(EM_ETPU)
- ECase(EM_SLE9X)
- ECase(EM_L10M)
- ECase(EM_K10M)
- ECase(EM_AARCH64)
- ECase(EM_AVR32)
- ECase(EM_STM8)
- ECase(EM_TILE64)
- ECase(EM_TILEPRO)
- ECase(EM_CUDA)
- ECase(EM_TILEGX)
- ECase(EM_CLOUDSHIELD)
- ECase(EM_COREA_1ST)
- ECase(EM_COREA_2ND)
- ECase(EM_ARC_COMPACT2)
- ECase(EM_OPEN8)
- ECase(EM_RL78)
- ECase(EM_VIDEOCORE5)
- ECase(EM_78KOR)
- ECase(EM_56800EX)
- ECase(EM_AMDGPU)
- ECase(EM_RISCV)
- ECase(EM_LANAI)
- ECase(EM_BPF)
+void ScalarEnumerationTraits<ELFYAML::ELF_EM>::enumeration(
+ IO &IO, ELFYAML::ELF_EM &Value) {
+#define ECase(X) IO.enumCase(Value, #X, ELF::X)
+ ECase(EM_NONE);
+ ECase(EM_M32);
+ ECase(EM_SPARC);
+ ECase(EM_386);
+ ECase(EM_68K);
+ ECase(EM_88K);
+ ECase(EM_IAMCU);
+ ECase(EM_860);
+ ECase(EM_MIPS);
+ ECase(EM_S370);
+ ECase(EM_MIPS_RS3_LE);
+ ECase(EM_PARISC);
+ ECase(EM_VPP500);
+ ECase(EM_SPARC32PLUS);
+ ECase(EM_960);
+ ECase(EM_PPC);
+ ECase(EM_PPC64);
+ ECase(EM_S390);
+ ECase(EM_SPU);
+ ECase(EM_V800);
+ ECase(EM_FR20);
+ ECase(EM_RH32);
+ ECase(EM_RCE);
+ ECase(EM_ARM);
+ ECase(EM_ALPHA);
+ ECase(EM_SH);
+ ECase(EM_SPARCV9);
+ ECase(EM_TRICORE);
+ ECase(EM_ARC);
+ ECase(EM_H8_300);
+ ECase(EM_H8_300H);
+ ECase(EM_H8S);
+ ECase(EM_H8_500);
+ ECase(EM_IA_64);
+ ECase(EM_MIPS_X);
+ ECase(EM_COLDFIRE);
+ ECase(EM_68HC12);
+ ECase(EM_MMA);
+ ECase(EM_PCP);
+ ECase(EM_NCPU);
+ ECase(EM_NDR1);
+ ECase(EM_STARCORE);
+ ECase(EM_ME16);
+ ECase(EM_ST100);
+ ECase(EM_TINYJ);
+ ECase(EM_X86_64);
+ ECase(EM_PDSP);
+ ECase(EM_PDP10);
+ ECase(EM_PDP11);
+ ECase(EM_FX66);
+ ECase(EM_ST9PLUS);
+ ECase(EM_ST7);
+ ECase(EM_68HC16);
+ ECase(EM_68HC11);
+ ECase(EM_68HC08);
+ ECase(EM_68HC05);
+ ECase(EM_SVX);
+ ECase(EM_ST19);
+ ECase(EM_VAX);
+ ECase(EM_CRIS);
+ ECase(EM_JAVELIN);
+ ECase(EM_FIREPATH);
+ ECase(EM_ZSP);
+ ECase(EM_MMIX);
+ ECase(EM_HUANY);
+ ECase(EM_PRISM);
+ ECase(EM_AVR);
+ ECase(EM_FR30);
+ ECase(EM_D10V);
+ ECase(EM_D30V);
+ ECase(EM_V850);
+ ECase(EM_M32R);
+ ECase(EM_MN10300);
+ ECase(EM_MN10200);
+ ECase(EM_PJ);
+ ECase(EM_OPENRISC);
+ ECase(EM_ARC_COMPACT);
+ ECase(EM_XTENSA);
+ ECase(EM_VIDEOCORE);
+ ECase(EM_TMM_GPP);
+ ECase(EM_NS32K);
+ ECase(EM_TPC);
+ ECase(EM_SNP1K);
+ ECase(EM_ST200);
+ ECase(EM_IP2K);
+ ECase(EM_MAX);
+ ECase(EM_CR);
+ ECase(EM_F2MC16);
+ ECase(EM_MSP430);
+ ECase(EM_BLACKFIN);
+ ECase(EM_SE_C33);
+ ECase(EM_SEP);
+ ECase(EM_ARCA);
+ ECase(EM_UNICORE);
+ ECase(EM_EXCESS);
+ ECase(EM_DXP);
+ ECase(EM_ALTERA_NIOS2);
+ ECase(EM_CRX);
+ ECase(EM_XGATE);
+ ECase(EM_C166);
+ ECase(EM_M16C);
+ ECase(EM_DSPIC30F);
+ ECase(EM_CE);
+ ECase(EM_M32C);
+ ECase(EM_TSK3000);
+ ECase(EM_RS08);
+ ECase(EM_SHARC);
+ ECase(EM_ECOG2);
+ ECase(EM_SCORE7);
+ ECase(EM_DSP24);
+ ECase(EM_VIDEOCORE3);
+ ECase(EM_LATTICEMICO32);
+ ECase(EM_SE_C17);
+ ECase(EM_TI_C6000);
+ ECase(EM_TI_C2000);
+ ECase(EM_TI_C5500);
+ ECase(EM_MMDSP_PLUS);
+ ECase(EM_CYPRESS_M8C);
+ ECase(EM_R32C);
+ ECase(EM_TRIMEDIA);
+ ECase(EM_HEXAGON);
+ ECase(EM_8051);
+ ECase(EM_STXP7X);
+ ECase(EM_NDS32);
+ ECase(EM_ECOG1);
+ ECase(EM_ECOG1X);
+ ECase(EM_MAXQ30);
+ ECase(EM_XIMO16);
+ ECase(EM_MANIK);
+ ECase(EM_CRAYNV2);
+ ECase(EM_RX);
+ ECase(EM_METAG);
+ ECase(EM_MCST_ELBRUS);
+ ECase(EM_ECOG16);
+ ECase(EM_CR16);
+ ECase(EM_ETPU);
+ ECase(EM_SLE9X);
+ ECase(EM_L10M);
+ ECase(EM_K10M);
+ ECase(EM_AARCH64);
+ ECase(EM_AVR32);
+ ECase(EM_STM8);
+ ECase(EM_TILE64);
+ ECase(EM_TILEPRO);
+ ECase(EM_CUDA);
+ ECase(EM_TILEGX);
+ ECase(EM_CLOUDSHIELD);
+ ECase(EM_COREA_1ST);
+ ECase(EM_COREA_2ND);
+ ECase(EM_ARC_COMPACT2);
+ ECase(EM_OPEN8);
+ ECase(EM_RL78);
+ ECase(EM_VIDEOCORE5);
+ ECase(EM_78KOR);
+ ECase(EM_56800EX);
+ ECase(EM_AMDGPU);
+ ECase(EM_RISCV);
+ ECase(EM_LANAI);
+ ECase(EM_BPF);
#undef ECase
}
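The ECase idiom above is YAMLIO's standard way to expose a scalar enum: specialize ScalarEnumerationTraits and register one enumCase per accepted spelling. A minimal self-contained sketch of the same pattern, using a hypothetical enum that is not part of this patch:

#include "llvm/Support/YAMLTraits.h"

// Hypothetical enum, for illustration only.
enum class Fruit { Apple, Banana };

namespace llvm {
namespace yaml {
template <> struct ScalarEnumerationTraits<Fruit> {
  static void enumeration(IO &IO, Fruit &Value) {
    // One enumCase per legal YAML spelling, as in the ECase expansions above.
    IO.enumCase(Value, "Apple", Fruit::Apple);
    IO.enumCase(Value, "Banana", Fruit::Banana);
  }
};
} // end namespace yaml
} // end namespace llvm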
void ScalarEnumerationTraits<ELFYAML::ELF_ELFCLASS>::enumeration(
IO &IO, ELFYAML::ELF_ELFCLASS &Value) {
-#define ECase(X) IO.enumCase(Value, #X, ELF::X);
+#define ECase(X) IO.enumCase(Value, #X, ELF::X)
// Since the semantics of ELFCLASSNONE is "invalid", just don't accept it
// here.
- ECase(ELFCLASS32)
- ECase(ELFCLASS64)
+ ECase(ELFCLASS32);
+ ECase(ELFCLASS64);
#undef ECase
}
void ScalarEnumerationTraits<ELFYAML::ELF_ELFDATA>::enumeration(
IO &IO, ELFYAML::ELF_ELFDATA &Value) {
-#define ECase(X) IO.enumCase(Value, #X, ELF::X);
+#define ECase(X) IO.enumCase(Value, #X, ELF::X)
// Since the semantics of ELFDATANONE is "invalid", just don't accept it
// here.
- ECase(ELFDATA2LSB)
- ECase(ELFDATA2MSB)
+ ECase(ELFDATA2LSB);
+ ECase(ELFDATA2MSB);
#undef ECase
}
void ScalarEnumerationTraits<ELFYAML::ELF_ELFOSABI>::enumeration(
IO &IO, ELFYAML::ELF_ELFOSABI &Value) {
-#define ECase(X) IO.enumCase(Value, #X, ELF::X);
- ECase(ELFOSABI_NONE)
- ECase(ELFOSABI_HPUX)
- ECase(ELFOSABI_NETBSD)
- ECase(ELFOSABI_GNU)
- ECase(ELFOSABI_GNU)
- ECase(ELFOSABI_HURD)
- ECase(ELFOSABI_SOLARIS)
- ECase(ELFOSABI_AIX)
- ECase(ELFOSABI_IRIX)
- ECase(ELFOSABI_FREEBSD)
- ECase(ELFOSABI_TRU64)
- ECase(ELFOSABI_MODESTO)
- ECase(ELFOSABI_OPENBSD)
- ECase(ELFOSABI_OPENVMS)
- ECase(ELFOSABI_NSK)
- ECase(ELFOSABI_AROS)
- ECase(ELFOSABI_FENIXOS)
- ECase(ELFOSABI_CLOUDABI)
- ECase(ELFOSABI_C6000_ELFABI)
- ECase(ELFOSABI_AMDGPU_HSA)
- ECase(ELFOSABI_C6000_LINUX)
- ECase(ELFOSABI_ARM)
- ECase(ELFOSABI_STANDALONE)
+#define ECase(X) IO.enumCase(Value, #X, ELF::X)
+ ECase(ELFOSABI_NONE);
+ ECase(ELFOSABI_HPUX);
+ ECase(ELFOSABI_NETBSD);
+ ECase(ELFOSABI_GNU);
+ ECase(ELFOSABI_GNU);
+ ECase(ELFOSABI_HURD);
+ ECase(ELFOSABI_SOLARIS);
+ ECase(ELFOSABI_AIX);
+ ECase(ELFOSABI_IRIX);
+ ECase(ELFOSABI_FREEBSD);
+ ECase(ELFOSABI_TRU64);
+ ECase(ELFOSABI_MODESTO);
+ ECase(ELFOSABI_OPENBSD);
+ ECase(ELFOSABI_OPENVMS);
+ ECase(ELFOSABI_NSK);
+ ECase(ELFOSABI_AROS);
+ ECase(ELFOSABI_FENIXOS);
+ ECase(ELFOSABI_CLOUDABI);
+ ECase(ELFOSABI_C6000_ELFABI);
+ ECase(ELFOSABI_AMDGPU_HSA);
+ ECase(ELFOSABI_C6000_LINUX);
+ ECase(ELFOSABI_ARM);
+ ECase(ELFOSABI_STANDALONE);
#undef ECase
}
@@ -253,92 +251,92 @@ void ScalarBitSetTraits<ELFYAML::ELF_EF>::bitset(IO &IO,
ELFYAML::ELF_EF &Value) {
const auto *Object = static_cast<ELFYAML::Object *>(IO.getContext());
assert(Object && "The IO context is not initialized");
-#define BCase(X) IO.bitSetCase(Value, #X, ELF::X);
-#define BCaseMask(X, M) IO.maskedBitSetCase(Value, #X, ELF::X, ELF::M);
+#define BCase(X) IO.bitSetCase(Value, #X, ELF::X)
+#define BCaseMask(X, M) IO.maskedBitSetCase(Value, #X, ELF::X, ELF::M)
switch (Object->Header.Machine) {
case ELF::EM_ARM:
- BCase(EF_ARM_SOFT_FLOAT)
- BCase(EF_ARM_VFP_FLOAT)
- BCaseMask(EF_ARM_EABI_UNKNOWN, EF_ARM_EABIMASK)
- BCaseMask(EF_ARM_EABI_VER1, EF_ARM_EABIMASK)
- BCaseMask(EF_ARM_EABI_VER2, EF_ARM_EABIMASK)
- BCaseMask(EF_ARM_EABI_VER3, EF_ARM_EABIMASK)
- BCaseMask(EF_ARM_EABI_VER4, EF_ARM_EABIMASK)
- BCaseMask(EF_ARM_EABI_VER5, EF_ARM_EABIMASK)
+ BCase(EF_ARM_SOFT_FLOAT);
+ BCase(EF_ARM_VFP_FLOAT);
+ BCaseMask(EF_ARM_EABI_UNKNOWN, EF_ARM_EABIMASK);
+ BCaseMask(EF_ARM_EABI_VER1, EF_ARM_EABIMASK);
+ BCaseMask(EF_ARM_EABI_VER2, EF_ARM_EABIMASK);
+ BCaseMask(EF_ARM_EABI_VER3, EF_ARM_EABIMASK);
+ BCaseMask(EF_ARM_EABI_VER4, EF_ARM_EABIMASK);
+ BCaseMask(EF_ARM_EABI_VER5, EF_ARM_EABIMASK);
break;
case ELF::EM_MIPS:
- BCase(EF_MIPS_NOREORDER)
- BCase(EF_MIPS_PIC)
- BCase(EF_MIPS_CPIC)
- BCase(EF_MIPS_ABI2)
- BCase(EF_MIPS_32BITMODE)
- BCase(EF_MIPS_FP64)
- BCase(EF_MIPS_NAN2008)
- BCase(EF_MIPS_MICROMIPS)
- BCase(EF_MIPS_ARCH_ASE_M16)
- BCase(EF_MIPS_ARCH_ASE_MDMX)
- BCaseMask(EF_MIPS_ABI_O32, EF_MIPS_ABI)
- BCaseMask(EF_MIPS_ABI_O64, EF_MIPS_ABI)
- BCaseMask(EF_MIPS_ABI_EABI32, EF_MIPS_ABI)
- BCaseMask(EF_MIPS_ABI_EABI64, EF_MIPS_ABI)
- BCaseMask(EF_MIPS_MACH_3900, EF_MIPS_MACH)
- BCaseMask(EF_MIPS_MACH_4010, EF_MIPS_MACH)
- BCaseMask(EF_MIPS_MACH_4100, EF_MIPS_MACH)
- BCaseMask(EF_MIPS_MACH_4650, EF_MIPS_MACH)
- BCaseMask(EF_MIPS_MACH_4120, EF_MIPS_MACH)
- BCaseMask(EF_MIPS_MACH_4111, EF_MIPS_MACH)
- BCaseMask(EF_MIPS_MACH_SB1, EF_MIPS_MACH)
- BCaseMask(EF_MIPS_MACH_OCTEON, EF_MIPS_MACH)
- BCaseMask(EF_MIPS_MACH_XLR, EF_MIPS_MACH)
- BCaseMask(EF_MIPS_MACH_OCTEON2, EF_MIPS_MACH)
- BCaseMask(EF_MIPS_MACH_OCTEON3, EF_MIPS_MACH)
- BCaseMask(EF_MIPS_MACH_5400, EF_MIPS_MACH)
- BCaseMask(EF_MIPS_MACH_5900, EF_MIPS_MACH)
- BCaseMask(EF_MIPS_MACH_5500, EF_MIPS_MACH)
- BCaseMask(EF_MIPS_MACH_9000, EF_MIPS_MACH)
- BCaseMask(EF_MIPS_MACH_LS2E, EF_MIPS_MACH)
- BCaseMask(EF_MIPS_MACH_LS2F, EF_MIPS_MACH)
- BCaseMask(EF_MIPS_MACH_LS3A, EF_MIPS_MACH)
- BCaseMask(EF_MIPS_ARCH_1, EF_MIPS_ARCH)
- BCaseMask(EF_MIPS_ARCH_2, EF_MIPS_ARCH)
- BCaseMask(EF_MIPS_ARCH_3, EF_MIPS_ARCH)
- BCaseMask(EF_MIPS_ARCH_4, EF_MIPS_ARCH)
- BCaseMask(EF_MIPS_ARCH_5, EF_MIPS_ARCH)
- BCaseMask(EF_MIPS_ARCH_32, EF_MIPS_ARCH)
- BCaseMask(EF_MIPS_ARCH_64, EF_MIPS_ARCH)
- BCaseMask(EF_MIPS_ARCH_32R2, EF_MIPS_ARCH)
- BCaseMask(EF_MIPS_ARCH_64R2, EF_MIPS_ARCH)
- BCaseMask(EF_MIPS_ARCH_32R6, EF_MIPS_ARCH)
- BCaseMask(EF_MIPS_ARCH_64R6, EF_MIPS_ARCH)
+ BCase(EF_MIPS_NOREORDER);
+ BCase(EF_MIPS_PIC);
+ BCase(EF_MIPS_CPIC);
+ BCase(EF_MIPS_ABI2);
+ BCase(EF_MIPS_32BITMODE);
+ BCase(EF_MIPS_FP64);
+ BCase(EF_MIPS_NAN2008);
+ BCase(EF_MIPS_MICROMIPS);
+ BCase(EF_MIPS_ARCH_ASE_M16);
+ BCase(EF_MIPS_ARCH_ASE_MDMX);
+ BCaseMask(EF_MIPS_ABI_O32, EF_MIPS_ABI);
+ BCaseMask(EF_MIPS_ABI_O64, EF_MIPS_ABI);
+ BCaseMask(EF_MIPS_ABI_EABI32, EF_MIPS_ABI);
+ BCaseMask(EF_MIPS_ABI_EABI64, EF_MIPS_ABI);
+ BCaseMask(EF_MIPS_MACH_3900, EF_MIPS_MACH);
+ BCaseMask(EF_MIPS_MACH_4010, EF_MIPS_MACH);
+ BCaseMask(EF_MIPS_MACH_4100, EF_MIPS_MACH);
+ BCaseMask(EF_MIPS_MACH_4650, EF_MIPS_MACH);
+ BCaseMask(EF_MIPS_MACH_4120, EF_MIPS_MACH);
+ BCaseMask(EF_MIPS_MACH_4111, EF_MIPS_MACH);
+ BCaseMask(EF_MIPS_MACH_SB1, EF_MIPS_MACH);
+ BCaseMask(EF_MIPS_MACH_OCTEON, EF_MIPS_MACH);
+ BCaseMask(EF_MIPS_MACH_XLR, EF_MIPS_MACH);
+ BCaseMask(EF_MIPS_MACH_OCTEON2, EF_MIPS_MACH);
+ BCaseMask(EF_MIPS_MACH_OCTEON3, EF_MIPS_MACH);
+ BCaseMask(EF_MIPS_MACH_5400, EF_MIPS_MACH);
+ BCaseMask(EF_MIPS_MACH_5900, EF_MIPS_MACH);
+ BCaseMask(EF_MIPS_MACH_5500, EF_MIPS_MACH);
+ BCaseMask(EF_MIPS_MACH_9000, EF_MIPS_MACH);
+ BCaseMask(EF_MIPS_MACH_LS2E, EF_MIPS_MACH);
+ BCaseMask(EF_MIPS_MACH_LS2F, EF_MIPS_MACH);
+ BCaseMask(EF_MIPS_MACH_LS3A, EF_MIPS_MACH);
+ BCaseMask(EF_MIPS_ARCH_1, EF_MIPS_ARCH);
+ BCaseMask(EF_MIPS_ARCH_2, EF_MIPS_ARCH);
+ BCaseMask(EF_MIPS_ARCH_3, EF_MIPS_ARCH);
+ BCaseMask(EF_MIPS_ARCH_4, EF_MIPS_ARCH);
+ BCaseMask(EF_MIPS_ARCH_5, EF_MIPS_ARCH);
+ BCaseMask(EF_MIPS_ARCH_32, EF_MIPS_ARCH);
+ BCaseMask(EF_MIPS_ARCH_64, EF_MIPS_ARCH);
+ BCaseMask(EF_MIPS_ARCH_32R2, EF_MIPS_ARCH);
+ BCaseMask(EF_MIPS_ARCH_64R2, EF_MIPS_ARCH);
+ BCaseMask(EF_MIPS_ARCH_32R6, EF_MIPS_ARCH);
+ BCaseMask(EF_MIPS_ARCH_64R6, EF_MIPS_ARCH);
break;
case ELF::EM_HEXAGON:
- BCase(EF_HEXAGON_MACH_V2)
- BCase(EF_HEXAGON_MACH_V3)
- BCase(EF_HEXAGON_MACH_V4)
- BCase(EF_HEXAGON_MACH_V5)
- BCase(EF_HEXAGON_ISA_V2)
- BCase(EF_HEXAGON_ISA_V3)
- BCase(EF_HEXAGON_ISA_V4)
- BCase(EF_HEXAGON_ISA_V5)
+ BCase(EF_HEXAGON_MACH_V2);
+ BCase(EF_HEXAGON_MACH_V3);
+ BCase(EF_HEXAGON_MACH_V4);
+ BCase(EF_HEXAGON_MACH_V5);
+ BCase(EF_HEXAGON_ISA_V2);
+ BCase(EF_HEXAGON_ISA_V3);
+ BCase(EF_HEXAGON_ISA_V4);
+ BCase(EF_HEXAGON_ISA_V5);
break;
case ELF::EM_AVR:
- BCase(EF_AVR_ARCH_AVR1)
- BCase(EF_AVR_ARCH_AVR2)
- BCase(EF_AVR_ARCH_AVR25)
- BCase(EF_AVR_ARCH_AVR3)
- BCase(EF_AVR_ARCH_AVR31)
- BCase(EF_AVR_ARCH_AVR35)
- BCase(EF_AVR_ARCH_AVR4)
- BCase(EF_AVR_ARCH_AVR51)
- BCase(EF_AVR_ARCH_AVR6)
- BCase(EF_AVR_ARCH_AVRTINY)
- BCase(EF_AVR_ARCH_XMEGA1)
- BCase(EF_AVR_ARCH_XMEGA2)
- BCase(EF_AVR_ARCH_XMEGA3)
- BCase(EF_AVR_ARCH_XMEGA4)
- BCase(EF_AVR_ARCH_XMEGA5)
- BCase(EF_AVR_ARCH_XMEGA6)
- BCase(EF_AVR_ARCH_XMEGA7)
+ BCase(EF_AVR_ARCH_AVR1);
+ BCase(EF_AVR_ARCH_AVR2);
+ BCase(EF_AVR_ARCH_AVR25);
+ BCase(EF_AVR_ARCH_AVR3);
+ BCase(EF_AVR_ARCH_AVR31);
+ BCase(EF_AVR_ARCH_AVR35);
+ BCase(EF_AVR_ARCH_AVR4);
+ BCase(EF_AVR_ARCH_AVR51);
+ BCase(EF_AVR_ARCH_AVR6);
+ BCase(EF_AVR_ARCH_AVRTINY);
+ BCase(EF_AVR_ARCH_XMEGA1);
+ BCase(EF_AVR_ARCH_XMEGA2);
+ BCase(EF_AVR_ARCH_XMEGA3);
+ BCase(EF_AVR_ARCH_XMEGA4);
+ BCase(EF_AVR_ARCH_XMEGA5);
+ BCase(EF_AVR_ARCH_XMEGA6);
+ BCase(EF_AVR_ARCH_XMEGA7);
break;
case ELF::EM_AMDGPU:
case ELF::EM_X86_64:
@@ -354,51 +352,51 @@ void ScalarEnumerationTraits<ELFYAML::ELF_SHT>::enumeration(
IO &IO, ELFYAML::ELF_SHT &Value) {
const auto *Object = static_cast<ELFYAML::Object *>(IO.getContext());
assert(Object && "The IO context is not initialized");
-#define ECase(X) IO.enumCase(Value, #X, ELF::X);
- ECase(SHT_NULL)
- ECase(SHT_PROGBITS)
+#define ECase(X) IO.enumCase(Value, #X, ELF::X)
+ ECase(SHT_NULL);
+ ECase(SHT_PROGBITS);
// No SHT_SYMTAB. Use the top-level `Symbols` key instead.
// FIXME: Issue a diagnostic with this information.
- ECase(SHT_STRTAB)
- ECase(SHT_RELA)
- ECase(SHT_HASH)
- ECase(SHT_DYNAMIC)
- ECase(SHT_NOTE)
- ECase(SHT_NOBITS)
- ECase(SHT_REL)
- ECase(SHT_SHLIB)
- ECase(SHT_DYNSYM)
- ECase(SHT_INIT_ARRAY)
- ECase(SHT_FINI_ARRAY)
- ECase(SHT_PREINIT_ARRAY)
- ECase(SHT_GROUP)
- ECase(SHT_SYMTAB_SHNDX)
- ECase(SHT_LOOS)
- ECase(SHT_GNU_ATTRIBUTES)
- ECase(SHT_GNU_HASH)
- ECase(SHT_GNU_verdef)
- ECase(SHT_GNU_verneed)
- ECase(SHT_GNU_versym)
- ECase(SHT_HIOS)
- ECase(SHT_LOPROC)
+ ECase(SHT_STRTAB);
+ ECase(SHT_RELA);
+ ECase(SHT_HASH);
+ ECase(SHT_DYNAMIC);
+ ECase(SHT_NOTE);
+ ECase(SHT_NOBITS);
+ ECase(SHT_REL);
+ ECase(SHT_SHLIB);
+ ECase(SHT_DYNSYM);
+ ECase(SHT_INIT_ARRAY);
+ ECase(SHT_FINI_ARRAY);
+ ECase(SHT_PREINIT_ARRAY);
+ ECase(SHT_GROUP);
+ ECase(SHT_SYMTAB_SHNDX);
+ ECase(SHT_LOOS);
+ ECase(SHT_GNU_ATTRIBUTES);
+ ECase(SHT_GNU_HASH);
+ ECase(SHT_GNU_verdef);
+ ECase(SHT_GNU_verneed);
+ ECase(SHT_GNU_versym);
+ ECase(SHT_HIOS);
+ ECase(SHT_LOPROC);
switch (Object->Header.Machine) {
case ELF::EM_ARM:
- ECase(SHT_ARM_EXIDX)
- ECase(SHT_ARM_PREEMPTMAP)
- ECase(SHT_ARM_ATTRIBUTES)
- ECase(SHT_ARM_DEBUGOVERLAY)
- ECase(SHT_ARM_OVERLAYSECTION)
+ ECase(SHT_ARM_EXIDX);
+ ECase(SHT_ARM_PREEMPTMAP);
+ ECase(SHT_ARM_ATTRIBUTES);
+ ECase(SHT_ARM_DEBUGOVERLAY);
+ ECase(SHT_ARM_OVERLAYSECTION);
break;
case ELF::EM_HEXAGON:
- ECase(SHT_HEX_ORDERED)
+ ECase(SHT_HEX_ORDERED);
break;
case ELF::EM_X86_64:
- ECase(SHT_X86_64_UNWIND)
+ ECase(SHT_X86_64_UNWIND);
break;
case ELF::EM_MIPS:
- ECase(SHT_MIPS_REGINFO)
- ECase(SHT_MIPS_OPTIONS)
- ECase(SHT_MIPS_ABIFLAGS)
+ ECase(SHT_MIPS_REGINFO);
+ ECase(SHT_MIPS_OPTIONS);
+ ECase(SHT_MIPS_ABIFLAGS);
break;
default:
// Nothing to do.
@@ -410,43 +408,43 @@ void ScalarEnumerationTraits<ELFYAML::ELF_SHT>::enumeration(
void ScalarBitSetTraits<ELFYAML::ELF_SHF>::bitset(IO &IO,
ELFYAML::ELF_SHF &Value) {
const auto *Object = static_cast<ELFYAML::Object *>(IO.getContext());
-#define BCase(X) IO.bitSetCase(Value, #X, ELF::X);
- BCase(SHF_WRITE)
- BCase(SHF_ALLOC)
- BCase(SHF_EXCLUDE)
- BCase(SHF_EXECINSTR)
- BCase(SHF_MERGE)
- BCase(SHF_STRINGS)
- BCase(SHF_INFO_LINK)
- BCase(SHF_LINK_ORDER)
- BCase(SHF_OS_NONCONFORMING)
- BCase(SHF_GROUP)
- BCase(SHF_TLS)
- switch(Object->Header.Machine) {
+#define BCase(X) IO.bitSetCase(Value, #X, ELF::X)
+ BCase(SHF_WRITE);
+ BCase(SHF_ALLOC);
+ BCase(SHF_EXCLUDE);
+ BCase(SHF_EXECINSTR);
+ BCase(SHF_MERGE);
+ BCase(SHF_STRINGS);
+ BCase(SHF_INFO_LINK);
+ BCase(SHF_LINK_ORDER);
+ BCase(SHF_OS_NONCONFORMING);
+ BCase(SHF_GROUP);
+ BCase(SHF_TLS);
+ switch (Object->Header.Machine) {
case ELF::EM_ARM:
- BCase(SHF_ARM_PURECODE)
+ BCase(SHF_ARM_PURECODE);
break;
case ELF::EM_AMDGPU:
- BCase(SHF_AMDGPU_HSA_GLOBAL)
- BCase(SHF_AMDGPU_HSA_READONLY)
- BCase(SHF_AMDGPU_HSA_CODE)
- BCase(SHF_AMDGPU_HSA_AGENT)
+ BCase(SHF_AMDGPU_HSA_GLOBAL);
+ BCase(SHF_AMDGPU_HSA_READONLY);
+ BCase(SHF_AMDGPU_HSA_CODE);
+ BCase(SHF_AMDGPU_HSA_AGENT);
break;
case ELF::EM_HEXAGON:
- BCase(SHF_HEX_GPREL)
+ BCase(SHF_HEX_GPREL);
break;
case ELF::EM_MIPS:
- BCase(SHF_MIPS_NODUPES)
- BCase(SHF_MIPS_NAMES)
- BCase(SHF_MIPS_LOCAL)
- BCase(SHF_MIPS_NOSTRIP)
- BCase(SHF_MIPS_GPREL)
- BCase(SHF_MIPS_MERGE)
- BCase(SHF_MIPS_ADDR)
- BCase(SHF_MIPS_STRING)
+ BCase(SHF_MIPS_NODUPES);
+ BCase(SHF_MIPS_NAMES);
+ BCase(SHF_MIPS_LOCAL);
+ BCase(SHF_MIPS_NOSTRIP);
+ BCase(SHF_MIPS_GPREL);
+ BCase(SHF_MIPS_MERGE);
+ BCase(SHF_MIPS_ADDR);
+ BCase(SHF_MIPS_STRING);
break;
case ELF::EM_X86_64:
- BCase(SHF_X86_64_LARGE)
+ BCase(SHF_X86_64_LARGE);
break;
default:
// Nothing to do.
@@ -457,25 +455,25 @@ void ScalarBitSetTraits<ELFYAML::ELF_SHF>::bitset(IO &IO,
void ScalarEnumerationTraits<ELFYAML::ELF_STT>::enumeration(
IO &IO, ELFYAML::ELF_STT &Value) {
-#define ECase(X) IO.enumCase(Value, #X, ELF::X);
- ECase(STT_NOTYPE)
- ECase(STT_OBJECT)
- ECase(STT_FUNC)
- ECase(STT_SECTION)
- ECase(STT_FILE)
- ECase(STT_COMMON)
- ECase(STT_TLS)
- ECase(STT_GNU_IFUNC)
+#define ECase(X) IO.enumCase(Value, #X, ELF::X)
+ ECase(STT_NOTYPE);
+ ECase(STT_OBJECT);
+ ECase(STT_FUNC);
+ ECase(STT_SECTION);
+ ECase(STT_FILE);
+ ECase(STT_COMMON);
+ ECase(STT_TLS);
+ ECase(STT_GNU_IFUNC);
#undef ECase
}
void ScalarEnumerationTraits<ELFYAML::ELF_STV>::enumeration(
IO &IO, ELFYAML::ELF_STV &Value) {
-#define ECase(X) IO.enumCase(Value, #X, ELF::X);
- ECase(STV_DEFAULT)
- ECase(STV_INTERNAL)
- ECase(STV_HIDDEN)
- ECase(STV_PROTECTED)
+#define ECase(X) IO.enumCase(Value, #X, ELF::X)
+ ECase(STV_DEFAULT);
+ ECase(STV_INTERNAL);
+ ECase(STV_HIDDEN);
+ ECase(STV_PROTECTED);
#undef ECase
}
@@ -483,13 +481,13 @@ void ScalarBitSetTraits<ELFYAML::ELF_STO>::bitset(IO &IO,
ELFYAML::ELF_STO &Value) {
const auto *Object = static_cast<ELFYAML::Object *>(IO.getContext());
assert(Object && "The IO context is not initialized");
-#define BCase(X) IO.bitSetCase(Value, #X, ELF::X);
+#define BCase(X) IO.bitSetCase(Value, #X, ELF::X)
switch (Object->Header.Machine) {
case ELF::EM_MIPS:
- BCase(STO_MIPS_OPTIONAL)
- BCase(STO_MIPS_PLT)
- BCase(STO_MIPS_PIC)
- BCase(STO_MIPS_MICROMIPS)
+ BCase(STO_MIPS_OPTIONAL);
+ BCase(STO_MIPS_PLT);
+ BCase(STO_MIPS_PIC);
+ BCase(STO_MIPS_MICROMIPS);
break;
default:
break; // Nothing to do
@@ -500,11 +498,11 @@ void ScalarBitSetTraits<ELFYAML::ELF_STO>::bitset(IO &IO,
void ScalarEnumerationTraits<ELFYAML::ELF_RSS>::enumeration(
IO &IO, ELFYAML::ELF_RSS &Value) {
-#define ECase(X) IO.enumCase(Value, #X, ELF::X);
- ECase(RSS_UNDEF)
- ECase(RSS_GP)
- ECase(RSS_GP0)
- ECase(RSS_LOC)
+#define ECase(X) IO.enumCase(Value, #X, ELF::X)
+ ECase(RSS_UNDEF);
+ ECase(RSS_GP);
+ ECase(RSS_GP0);
+ ECase(RSS_LOC);
#undef ECase
}
@@ -553,51 +551,51 @@ void ScalarEnumerationTraits<ELFYAML::ELF_REL>::enumeration(
void ScalarEnumerationTraits<ELFYAML::MIPS_AFL_REG>::enumeration(
IO &IO, ELFYAML::MIPS_AFL_REG &Value) {
-#define ECase(X) IO.enumCase(Value, #X, Mips::AFL_##X);
- ECase(REG_NONE)
- ECase(REG_32)
- ECase(REG_64)
- ECase(REG_128)
+#define ECase(X) IO.enumCase(Value, #X, Mips::AFL_##X)
+ ECase(REG_NONE);
+ ECase(REG_32);
+ ECase(REG_64);
+ ECase(REG_128);
#undef ECase
}
void ScalarEnumerationTraits<ELFYAML::MIPS_ABI_FP>::enumeration(
IO &IO, ELFYAML::MIPS_ABI_FP &Value) {
-#define ECase(X) IO.enumCase(Value, #X, Mips::Val_GNU_MIPS_ABI_##X);
- ECase(FP_ANY)
- ECase(FP_DOUBLE)
- ECase(FP_SINGLE)
- ECase(FP_SOFT)
- ECase(FP_OLD_64)
- ECase(FP_XX)
- ECase(FP_64)
- ECase(FP_64A)
+#define ECase(X) IO.enumCase(Value, #X, Mips::Val_GNU_MIPS_ABI_##X)
+ ECase(FP_ANY);
+ ECase(FP_DOUBLE);
+ ECase(FP_SINGLE);
+ ECase(FP_SOFT);
+ ECase(FP_OLD_64);
+ ECase(FP_XX);
+ ECase(FP_64);
+ ECase(FP_64A);
#undef ECase
}
void ScalarEnumerationTraits<ELFYAML::MIPS_AFL_EXT>::enumeration(
IO &IO, ELFYAML::MIPS_AFL_EXT &Value) {
-#define ECase(X) IO.enumCase(Value, #X, Mips::AFL_##X);
- ECase(EXT_NONE)
- ECase(EXT_XLR)
- ECase(EXT_OCTEON2)
- ECase(EXT_OCTEONP)
- ECase(EXT_LOONGSON_3A)
- ECase(EXT_OCTEON)
- ECase(EXT_5900)
- ECase(EXT_4650)
- ECase(EXT_4010)
- ECase(EXT_4100)
- ECase(EXT_3900)
- ECase(EXT_10000)
- ECase(EXT_SB1)
- ECase(EXT_4111)
- ECase(EXT_4120)
- ECase(EXT_5400)
- ECase(EXT_5500)
- ECase(EXT_LOONGSON_2E)
- ECase(EXT_LOONGSON_2F)
- ECase(EXT_OCTEON3)
+#define ECase(X) IO.enumCase(Value, #X, Mips::AFL_##X)
+ ECase(EXT_NONE);
+ ECase(EXT_XLR);
+ ECase(EXT_OCTEON2);
+ ECase(EXT_OCTEONP);
+ ECase(EXT_LOONGSON_3A);
+ ECase(EXT_OCTEON);
+ ECase(EXT_5900);
+ ECase(EXT_4650);
+ ECase(EXT_4010);
+ ECase(EXT_4100);
+ ECase(EXT_3900);
+ ECase(EXT_10000);
+ ECase(EXT_SB1);
+ ECase(EXT_4111);
+ ECase(EXT_4120);
+ ECase(EXT_5400);
+ ECase(EXT_5500);
+ ECase(EXT_LOONGSON_2E);
+ ECase(EXT_LOONGSON_2F);
+ ECase(EXT_OCTEON3);
#undef ECase
}
@@ -614,27 +612,27 @@ void ScalarEnumerationTraits<ELFYAML::MIPS_ISA>::enumeration(
void ScalarBitSetTraits<ELFYAML::MIPS_AFL_ASE>::bitset(
IO &IO, ELFYAML::MIPS_AFL_ASE &Value) {
-#define BCase(X) IO.bitSetCase(Value, #X, Mips::AFL_ASE_##X);
- BCase(DSP)
- BCase(DSPR2)
- BCase(EVA)
- BCase(MCU)
- BCase(MDMX)
- BCase(MIPS3D)
- BCase(MT)
- BCase(SMARTMIPS)
- BCase(VIRT)
- BCase(MSA)
- BCase(MIPS16)
- BCase(MICROMIPS)
- BCase(XPA)
+#define BCase(X) IO.bitSetCase(Value, #X, Mips::AFL_ASE_##X)
+ BCase(DSP);
+ BCase(DSPR2);
+ BCase(EVA);
+ BCase(MCU);
+ BCase(MDMX);
+ BCase(MIPS3D);
+ BCase(MT);
+ BCase(SMARTMIPS);
+ BCase(VIRT);
+ BCase(MSA);
+ BCase(MIPS16);
+ BCase(MICROMIPS);
+ BCase(XPA);
#undef BCase
}
void ScalarBitSetTraits<ELFYAML::MIPS_AFL_FLAGS1>::bitset(
IO &IO, ELFYAML::MIPS_AFL_FLAGS1 &Value) {
-#define BCase(X) IO.bitSetCase(Value, #X, Mips::AFL_FLAGS1_##X);
- BCase(ODDSPREG)
+#define BCase(X) IO.bitSetCase(Value, #X, Mips::AFL_FLAGS1_##X)
+ BCase(ODDSPREG);
#undef BCase
}
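The BCaseMask lines above use YAMLIO's maskedBitSetCase, which models an enumerated field embedded in a flags word: a named case applies only when the bits selected by the mask compare equal to the case's constant, which is how mutually exclusive values such as the EF_ARM_EABI_VER* family share the EF_ARM_EABIMASK bits. A sketch of the underlying test, with illustrative names:

// On output, maskedBitSetCase prints the name when the masked field matches;
// on input, it ORs the constant into the value when the name is present.
static bool matchesMaskedCase(unsigned Value, unsigned Constant, unsigned Mask) {
  return (Value & Mask) == Constant; // e.g. the EABI version field of e_flags
}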
diff --git a/lib/ObjectYAML/MachOYAML.cpp b/lib/ObjectYAML/MachOYAML.cpp
index a033a79189bd..6b0e4e3762d0 100644
--- a/lib/ObjectYAML/MachOYAML.cpp
+++ b/lib/ObjectYAML/MachOYAML.cpp
@@ -230,6 +230,12 @@ void mapLoadCommandData<MachO::dylinker_command>(
IO.mapOptional("PayloadString", LoadCommand.PayloadString);
}
+template <>
+void mapLoadCommandData<MachO::build_version_command>(
+ IO &IO, MachOYAML::LoadCommand &LoadCommand) {
+ IO.mapOptional("Tools", LoadCommand.Tools);
+}
+
void MappingTraits<MachOYAML::LoadCommand>::mapping(
IO &IO, MachOYAML::LoadCommand &LoadCommand) {
MachO::LoadCommandType TempCmd = static_cast<MachO::LoadCommandType>(
@@ -282,6 +288,12 @@ void MappingTraits<MachOYAML::Section>::mapping(IO &IO,
IO.mapOptional("reserved3", Section.reserved3);
}
+void MappingTraits<MachO::build_tool_version>::mapping(
+ IO &IO, MachO::build_tool_version &tool) {
+ IO.mapRequired("tool", tool.tool);
+ IO.mapRequired("version", tool.version);
+}
+
void MappingTraits<MachO::dylib>::mapping(IO &IO, MachO::dylib &DylibStruct) {
IO.mapRequired("name", DylibStruct.name);
IO.mapRequired("timestamp", DylibStruct.timestamp);
@@ -558,6 +570,23 @@ void MappingTraits<MachO::version_min_command>::mapping(
IO.mapRequired("sdk", LoadCommand.sdk);
}
+void MappingTraits<MachO::note_command>::mapping(
+ IO &IO, MachO::note_command &LoadCommand) {
+
+ IO.mapRequired("data_owner", LoadCommand.data_owner);
+ IO.mapRequired("offset", LoadCommand.offset);
+ IO.mapRequired("size", LoadCommand.size);
+}
+
+void MappingTraits<MachO::build_version_command>::mapping(
+ IO &IO, MachO::build_version_command &LoadCommand) {
+
+ IO.mapRequired("platform", LoadCommand.platform);
+ IO.mapRequired("minos", LoadCommand.minos);
+ IO.mapRequired("sdk", LoadCommand.sdk);
+ IO.mapRequired("ntools", LoadCommand.ntools);
+}
+
} // namespace llvm::yaml
} // namespace llvm
diff --git a/lib/ObjectYAML/ObjectYAML.cpp b/lib/ObjectYAML/ObjectYAML.cpp
index cbbaac6062a7..74581c1ecaac 100644
--- a/lib/ObjectYAML/ObjectYAML.cpp
+++ b/lib/ObjectYAML/ObjectYAML.cpp
@@ -43,6 +43,9 @@ void MappingTraits<YamlObjectFile>::mapping(IO &IO,
ObjectFile.FatMachO.reset(new MachOYAML::UniversalBinary());
MappingTraits<MachOYAML::UniversalBinary>::mapping(IO,
*ObjectFile.FatMachO);
+ } else if (IO.mapTag("!WASM")) {
+ ObjectFile.Wasm.reset(new WasmYAML::Object());
+ MappingTraits<WasmYAML::Object>::mapping(IO, *ObjectFile.Wasm);
} else {
Input &In = (Input &)IO;
std::string Tag = In.getCurrentNode()->getRawTag();
diff --git a/lib/ObjectYAML/WasmYAML.cpp b/lib/ObjectYAML/WasmYAML.cpp
new file mode 100644
index 000000000000..3e1bed19d61f
--- /dev/null
+++ b/lib/ObjectYAML/WasmYAML.cpp
@@ -0,0 +1,357 @@
+//===- WasmYAML.cpp - Wasm YAMLIO implementation --------------------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file defines classes for handling the YAML representation of wasm.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/ObjectYAML/WasmYAML.h"
+#include "llvm/Object/Wasm.h"
+#include "llvm/Support/Casting.h"
+#include "llvm/Support/MipsABIFlags.h"
+
+namespace llvm {
+
+namespace WasmYAML {
+
+// Declared here rather than in the header to comply with:
+// http://llvm.org/docs/CodingStandards.html#provide-a-virtual-method-anchor-for-classes-in-headers
+Section::~Section() {}
+
+} // end namespace WasmYAML
+
+namespace yaml {
+
+void MappingTraits<WasmYAML::FileHeader>::mapping(
+ IO &IO, WasmYAML::FileHeader &FileHdr) {
+ IO.mapRequired("Version", FileHdr.Version);
+}
+
+void MappingTraits<WasmYAML::Object>::mapping(IO &IO,
+ WasmYAML::Object &Object) {
+ IO.setContext(&Object);
+ IO.mapTag("!WASM", true);
+ IO.mapRequired("FileHeader", Object.Header);
+ IO.mapOptional("Sections", Object.Sections);
+ IO.setContext(nullptr);
+}
+
+static void commonSectionMapping(IO &IO, WasmYAML::Section &Section) {
+ IO.mapRequired("Type", Section.Type);
+ IO.mapOptional("Relocations", Section.Relocations);
+}
+
+static void sectionMapping(IO &IO, WasmYAML::CustomSection &Section) {
+ commonSectionMapping(IO, Section);
+ IO.mapRequired("Name", Section.Name);
+ IO.mapRequired("Payload", Section.Payload);
+}
+
+static void sectionMapping(IO &IO, WasmYAML::TypeSection &Section) {
+ commonSectionMapping(IO, Section);
+ IO.mapOptional("Signatures", Section.Signatures);
+}
+
+static void sectionMapping(IO &IO, WasmYAML::ImportSection &Section) {
+ commonSectionMapping(IO, Section);
+ IO.mapOptional("Imports", Section.Imports);
+}
+
+static void sectionMapping(IO &IO, WasmYAML::FunctionSection &Section) {
+ commonSectionMapping(IO, Section);
+ IO.mapOptional("FunctionTypes", Section.FunctionTypes);
+}
+
+static void sectionMapping(IO &IO, WasmYAML::TableSection &Section) {
+ commonSectionMapping(IO, Section);
+ IO.mapOptional("Tables", Section.Tables);
+}
+
+static void sectionMapping(IO &IO, WasmYAML::MemorySection &Section) {
+ commonSectionMapping(IO, Section);
+ IO.mapOptional("Memories", Section.Memories);
+}
+
+static void sectionMapping(IO &IO, WasmYAML::GlobalSection &Section) {
+ commonSectionMapping(IO, Section);
+ IO.mapOptional("Globals", Section.Globals);
+}
+
+static void sectionMapping(IO &IO, WasmYAML::ExportSection &Section) {
+ commonSectionMapping(IO, Section);
+ IO.mapOptional("Exports", Section.Exports);
+}
+
+static void sectionMapping(IO &IO, WasmYAML::StartSection &Section) {
+ commonSectionMapping(IO, Section);
+ IO.mapOptional("StartFunction", Section.StartFunction);
+}
+
+static void sectionMapping(IO &IO, WasmYAML::ElemSection &Section) {
+ commonSectionMapping(IO, Section);
+ IO.mapOptional("Segments", Section.Segments);
+}
+
+static void sectionMapping(IO &IO, WasmYAML::CodeSection &Section) {
+ commonSectionMapping(IO, Section);
+ IO.mapRequired("Functions", Section.Functions);
+}
+
+static void sectionMapping(IO &IO, WasmYAML::DataSection &Section) {
+ commonSectionMapping(IO, Section);
+ IO.mapRequired("Segments", Section.Segments);
+}
+
+void MappingTraits<std::unique_ptr<WasmYAML::Section>>::mapping(
+ IO &IO, std::unique_ptr<WasmYAML::Section> &Section) {
+ WasmYAML::SectionType SectionType;
+ if (IO.outputting())
+ SectionType = Section->Type;
+ else
+ IO.mapRequired("Type", SectionType);
+
+ switch (SectionType) {
+ case wasm::WASM_SEC_CUSTOM:
+ if (!IO.outputting())
+ Section.reset(new WasmYAML::CustomSection());
+ sectionMapping(IO, *cast<WasmYAML::CustomSection>(Section.get()));
+ break;
+ case wasm::WASM_SEC_TYPE:
+ if (!IO.outputting())
+ Section.reset(new WasmYAML::TypeSection());
+ sectionMapping(IO, *cast<WasmYAML::TypeSection>(Section.get()));
+ break;
+ case wasm::WASM_SEC_IMPORT:
+ if (!IO.outputting())
+ Section.reset(new WasmYAML::ImportSection());
+ sectionMapping(IO, *cast<WasmYAML::ImportSection>(Section.get()));
+ break;
+ case wasm::WASM_SEC_FUNCTION:
+ if (!IO.outputting())
+ Section.reset(new WasmYAML::FunctionSection());
+ sectionMapping(IO, *cast<WasmYAML::FunctionSection>(Section.get()));
+ break;
+ case wasm::WASM_SEC_TABLE:
+ if (!IO.outputting())
+ Section.reset(new WasmYAML::TableSection());
+ sectionMapping(IO, *cast<WasmYAML::TableSection>(Section.get()));
+ break;
+ case wasm::WASM_SEC_MEMORY:
+ if (!IO.outputting())
+ Section.reset(new WasmYAML::MemorySection());
+ sectionMapping(IO, *cast<WasmYAML::MemorySection>(Section.get()));
+ break;
+ case wasm::WASM_SEC_GLOBAL:
+ if (!IO.outputting())
+ Section.reset(new WasmYAML::GlobalSection());
+ sectionMapping(IO, *cast<WasmYAML::GlobalSection>(Section.get()));
+ break;
+ case wasm::WASM_SEC_EXPORT:
+ if (!IO.outputting())
+ Section.reset(new WasmYAML::ExportSection());
+ sectionMapping(IO, *cast<WasmYAML::ExportSection>(Section.get()));
+ break;
+ case wasm::WASM_SEC_START:
+ if (!IO.outputting())
+ Section.reset(new WasmYAML::StartSection());
+ sectionMapping(IO, *cast<WasmYAML::StartSection>(Section.get()));
+ break;
+ case wasm::WASM_SEC_ELEM:
+ if (!IO.outputting())
+ Section.reset(new WasmYAML::ElemSection());
+ sectionMapping(IO, *cast<WasmYAML::ElemSection>(Section.get()));
+ break;
+ case wasm::WASM_SEC_CODE:
+ if (!IO.outputting())
+ Section.reset(new WasmYAML::CodeSection());
+ sectionMapping(IO, *cast<WasmYAML::CodeSection>(Section.get()));
+ break;
+ case wasm::WASM_SEC_DATA:
+ if (!IO.outputting())
+ Section.reset(new WasmYAML::DataSection());
+ sectionMapping(IO, *cast<WasmYAML::DataSection>(Section.get()));
+ break;
+ default:
+ llvm_unreachable("Unknown section type");
+ }
+}
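+This is the standard YAMLIO pattern for a polymorphic sequence: the discriminator key ("Type") is mapped first, the matching concrete Section subclass is allocated on input, and the remaining keys are handled by the per-type sectionMapping overload. A hedged sketch of the consuming side, built on the yaml::Input reader from llvm/Support/YAMLTraits.h (function name and buffer are illustrative):
+
+#include "llvm/ADT/StringRef.h"
+#include "llvm/ObjectYAML/WasmYAML.h"
+#include "llvm/Support/YAMLTraits.h"
+
+static bool parseWasmYAML(llvm::StringRef Buffer, llvm::WasmYAML::Object &Obj) {
+  llvm::yaml::Input YIn(Buffer);
+  YIn >> Obj; // drives the MappingTraits above, allocating sections by Type
+  return !YIn.error();
+}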
+
+void ScalarEnumerationTraits<WasmYAML::SectionType>::enumeration(
+ IO &IO, WasmYAML::SectionType &Type) {
+#define ECase(X) IO.enumCase(Type, #X, wasm::WASM_SEC_##X);
+ ECase(CUSTOM);
+ ECase(TYPE);
+ ECase(IMPORT);
+ ECase(FUNCTION);
+ ECase(TABLE);
+ ECase(MEMORY);
+ ECase(GLOBAL);
+ ECase(EXPORT);
+ ECase(START);
+ ECase(ELEM);
+ ECase(CODE);
+ ECase(DATA);
+#undef ECase
+}
+
+void MappingTraits<WasmYAML::Signature>::mapping(
+ IO &IO, WasmYAML::Signature &Signature) {
+ IO.mapOptional("Index", Signature.Index);
+ IO.mapRequired("ReturnType", Signature.ReturnType);
+ IO.mapRequired("ParamTypes", Signature.ParamTypes);
+}
+
+void MappingTraits<WasmYAML::Table>::mapping(IO &IO, WasmYAML::Table &Table) {
+ IO.mapRequired("ElemType", Table.ElemType);
+ IO.mapRequired("Limits", Table.TableLimits);
+}
+
+void MappingTraits<WasmYAML::Function>::mapping(IO &IO,
+ WasmYAML::Function &Function) {
+ IO.mapRequired("Locals", Function.Locals);
+ IO.mapRequired("Body", Function.Body);
+}
+
+void MappingTraits<WasmYAML::Relocation>::mapping(
+ IO &IO, WasmYAML::Relocation &Relocation) {
+ IO.mapRequired("Type", Relocation.Type);
+ IO.mapRequired("Index", Relocation.Index);
+ IO.mapRequired("Offset", Relocation.Offset);
+ IO.mapRequired("Addend", Relocation.Addend);
+}
+
+void MappingTraits<WasmYAML::LocalDecl>::mapping(
+ IO &IO, WasmYAML::LocalDecl &LocalDecl) {
+ IO.mapRequired("Type", LocalDecl.Type);
+ IO.mapRequired("Count", LocalDecl.Count);
+}
+
+void MappingTraits<WasmYAML::Limits>::mapping(IO &IO,
+ WasmYAML::Limits &Limits) {
+ if (!IO.outputting() || Limits.Flags)
+ IO.mapOptional("Flags", Limits.Flags);
+ IO.mapRequired("Initial", Limits.Initial);
+ if (!IO.outputting() || Limits.Flags & wasm::WASM_LIMITS_FLAG_HAS_MAX)
+ IO.mapOptional("Maximum", Limits.Maximum);
+}
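+The Limits mapping just above illustrates the round-trip idiom for optional keys: always accept the key on input, but only emit it on output when it carries information (a non-zero Flags word, or a Maximum when WASM_LIMITS_FLAG_HAS_MAX is set). The same guard works for any optional scalar; a fragment with hypothetical names (S and Extra are illustrative):
+
+// Emit "Extra" only when it is meaningful; accept it unconditionally on input.
+if (!IO.outputting() || S.Extra != 0)
+  IO.mapOptional("Extra", S.Extra);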
+
+void MappingTraits<WasmYAML::ElemSegment>::mapping(
+ IO &IO, WasmYAML::ElemSegment &Segment) {
+ IO.mapRequired("Offset", Segment.Offset);
+ IO.mapRequired("Functions", Segment.Functions);
+}
+
+void MappingTraits<WasmYAML::Import>::mapping(IO &IO,
+ WasmYAML::Import &Import) {
+ IO.mapRequired("Module", Import.Module);
+ IO.mapRequired("Field", Import.Field);
+ IO.mapRequired("Kind", Import.Kind);
+ if (Import.Kind == wasm::WASM_EXTERNAL_FUNCTION) {
+ IO.mapRequired("SigIndex", Import.SigIndex);
+ } else if (Import.Kind == wasm::WASM_EXTERNAL_GLOBAL) {
+ IO.mapRequired("GlobalType", Import.GlobalType);
+ IO.mapRequired("GlobalMutable", Import.GlobalMutable);
+ } else {
+ llvm_unreachable("unhandled import type");
+ }
+}
+
+void MappingTraits<WasmYAML::Export>::mapping(IO &IO,
+ WasmYAML::Export &Export) {
+ IO.mapRequired("Name", Export.Name);
+ IO.mapRequired("Kind", Export.Kind);
+ IO.mapRequired("Index", Export.Index);
+}
+
+void MappingTraits<WasmYAML::Global>::mapping(IO &IO,
+ WasmYAML::Global &Global) {
+ IO.mapRequired("Type", Global.Type);
+ IO.mapRequired("Mutable", Global.Mutable);
+ IO.mapRequired("InitExpr", Global.InitExpr);
+}
+
+void MappingTraits<wasm::WasmInitExpr>::mapping(IO &IO,
+ wasm::WasmInitExpr &Expr) {
+ WasmYAML::Opcode Op = Expr.Opcode;
+ IO.mapRequired("Opcode", Op);
+ Expr.Opcode = Op;
+ switch (Expr.Opcode) {
+ case wasm::WASM_OPCODE_I32_CONST:
+ IO.mapRequired("Value", Expr.Value.Int32);
+ break;
+ case wasm::WASM_OPCODE_I64_CONST:
+ IO.mapRequired("Value", Expr.Value.Int64);
+ break;
+ case wasm::WASM_OPCODE_F32_CONST:
+ IO.mapRequired("Value", Expr.Value.Float32);
+ break;
+ case wasm::WASM_OPCODE_F64_CONST:
+ IO.mapRequired("Value", Expr.Value.Float64);
+ break;
+ }
+}
+
+void MappingTraits<WasmYAML::DataSegment>::mapping(
+ IO &IO, WasmYAML::DataSegment &Segment) {
+ IO.mapRequired("Index", Segment.Index);
+ IO.mapRequired("Offset", Segment.Offset);
+ IO.mapRequired("Content", Segment.Content);
+}
+
+void ScalarEnumerationTraits<WasmYAML::ValueType>::enumeration(
+ IO &IO, WasmYAML::ValueType &Type) {
+#define ECase(X) IO.enumCase(Type, #X, wasm::WASM_TYPE_##X);
+ ECase(I32);
+ ECase(I64);
+ ECase(F32);
+ ECase(F64);
+ ECase(ANYFUNC);
+ ECase(FUNC);
+ ECase(NORESULT);
+#undef ECase
+}
+
+void ScalarEnumerationTraits<WasmYAML::ExportKind>::enumeration(
+ IO &IO, WasmYAML::ExportKind &Kind) {
+#define ECase(X) IO.enumCase(Kind, #X, wasm::WASM_EXTERNAL_##X);
+ ECase(FUNCTION);
+ ECase(TABLE);
+ ECase(MEMORY);
+ ECase(GLOBAL);
+#undef ECase
+}
+
+void ScalarEnumerationTraits<WasmYAML::Opcode>::enumeration(
+ IO &IO, WasmYAML::Opcode &Code) {
+#define ECase(X) IO.enumCase(Code, #X, wasm::WASM_OPCODE_##X);
+ ECase(END);
+ ECase(I32_CONST);
+ ECase(I64_CONST);
+ ECase(F64_CONST);
+ ECase(F32_CONST);
+ ECase(GET_GLOBAL);
+#undef ECase
+}
+
+void ScalarEnumerationTraits<WasmYAML::TableType>::enumeration(
+ IO &IO, WasmYAML::TableType &Type) {
+#define ECase(X) IO.enumCase(Type, #X, wasm::WASM_TYPE_##X);
+ ECase(ANYFUNC);
+#undef ECase
+}
+
+void ScalarEnumerationTraits<WasmYAML::RelocType>::enumeration(
+ IO &IO, WasmYAML::RelocType &Type) {
+#define WASM_RELOC(name, value) IO.enumCase(Type, #name, wasm::name);
+#include "llvm/Support/WasmRelocs/WebAssembly.def"
+#undef WASM_RELOC
+}
+
+} // end namespace yaml
+} // end namespace llvm
diff --git a/lib/Option/Arg.cpp b/lib/Option/Arg.cpp
index c3de2d1a4965..3e8a1d802314 100644
--- a/lib/Option/Arg.cpp
+++ b/lib/Option/Arg.cpp
@@ -61,7 +61,9 @@ void Arg::print(raw_ostream& O) const {
O << "]>\n";
}
+#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
LLVM_DUMP_METHOD void Arg::dump() const { print(dbgs()); }
+#endif
std::string Arg::getAsString(const ArgList &Args) const {
SmallString<256> Res;
diff --git a/lib/Option/ArgList.cpp b/lib/Option/ArgList.cpp
index f94de866ef34..39dbce87f9ae 100644
--- a/lib/Option/ArgList.cpp
+++ b/lib/Option/ArgList.cpp
@@ -19,203 +19,44 @@
using namespace llvm;
using namespace llvm::opt;
-void arg_iterator::SkipToNextArg() {
- for (; Current != Args.end(); ++Current) {
- // Done if there are no filters.
- if (!Id0.isValid())
- break;
-
- // Otherwise require a match.
- const Option &O = (*Current)->getOption();
- if (O.matches(Id0) ||
- (Id1.isValid() && O.matches(Id1)) ||
- (Id2.isValid() && O.matches(Id2)))
- break;
- }
-}
-
void ArgList::append(Arg *A) {
Args.push_back(A);
-}
-
-void ArgList::eraseArg(OptSpecifier Id) {
- Args.erase(
- remove_if(*this, [=](Arg *A) { return A->getOption().matches(Id); }),
- end());
-}
-
-Arg *ArgList::getLastArgNoClaim(OptSpecifier Id) const {
- // FIXME: Make search efficient?
- for (const_reverse_iterator it = rbegin(), ie = rend(); it != ie; ++it)
- if ((*it)->getOption().matches(Id))
- return *it;
- return nullptr;
-}
-
-Arg *ArgList::getLastArgNoClaim(OptSpecifier Id0, OptSpecifier Id1) const {
- // FIXME: Make search efficient?
- for (const_reverse_iterator it = rbegin(), ie = rend(); it != ie; ++it)
- if ((*it)->getOption().matches(Id0) ||
- (*it)->getOption().matches(Id1))
- return *it;
- return nullptr;
-}
-
-Arg *ArgList::getLastArgNoClaim(OptSpecifier Id0, OptSpecifier Id1,
- OptSpecifier Id2) const {
- // FIXME: Make search efficient?
- for (const_reverse_iterator it = rbegin(), ie = rend(); it != ie; ++it)
- if ((*it)->getOption().matches(Id0) || (*it)->getOption().matches(Id1) ||
- (*it)->getOption().matches(Id2))
- return *it;
- return nullptr;
-}
-
-Arg *ArgList::getLastArgNoClaim(OptSpecifier Id0, OptSpecifier Id1,
- OptSpecifier Id2, OptSpecifier Id3) const {
- // FIXME: Make search efficient?
- for (const_reverse_iterator it = rbegin(), ie = rend(); it != ie; ++it)
- if ((*it)->getOption().matches(Id0) || (*it)->getOption().matches(Id1) ||
- (*it)->getOption().matches(Id2) || (*it)->getOption().matches(Id3))
- return *it;
- return nullptr;
-}
-
-Arg *ArgList::getLastArg(OptSpecifier Id) const {
- Arg *Res = nullptr;
- for (const_iterator it = begin(), ie = end(); it != ie; ++it) {
- if ((*it)->getOption().matches(Id)) {
- Res = *it;
- Res->claim();
- }
- }
- return Res;
-}
-
-Arg *ArgList::getLastArg(OptSpecifier Id0, OptSpecifier Id1) const {
- Arg *Res = nullptr;
- for (const_iterator it = begin(), ie = end(); it != ie; ++it) {
- if ((*it)->getOption().matches(Id0) ||
- (*it)->getOption().matches(Id1)) {
- Res = *it;
- Res->claim();
-
- }
+ // Update ranges for the option and all of its groups.
+ for (Option O = A->getOption().getUnaliasedOption(); O.isValid();
+ O = O.getGroup()) {
+ auto &R =
+ OptRanges.insert(std::make_pair(O.getID(), emptyRange())).first->second;
+ R.first = std::min<unsigned>(R.first, Args.size() - 1);
+ R.second = Args.size();
}
-
- return Res;
}
-Arg *ArgList::getLastArg(OptSpecifier Id0, OptSpecifier Id1,
- OptSpecifier Id2) const {
- Arg *Res = nullptr;
- for (const_iterator it = begin(), ie = end(); it != ie; ++it) {
- if ((*it)->getOption().matches(Id0) ||
- (*it)->getOption().matches(Id1) ||
- (*it)->getOption().matches(Id2)) {
- Res = *it;
- Res->claim();
- }
- }
-
- return Res;
-}
-
-Arg *ArgList::getLastArg(OptSpecifier Id0, OptSpecifier Id1,
- OptSpecifier Id2, OptSpecifier Id3) const {
- Arg *Res = nullptr;
- for (const_iterator it = begin(), ie = end(); it != ie; ++it) {
- if ((*it)->getOption().matches(Id0) ||
- (*it)->getOption().matches(Id1) ||
- (*it)->getOption().matches(Id2) ||
- (*it)->getOption().matches(Id3)) {
- Res = *it;
- Res->claim();
- }
- }
-
- return Res;
-}
-
-Arg *ArgList::getLastArg(OptSpecifier Id0, OptSpecifier Id1,
- OptSpecifier Id2, OptSpecifier Id3,
- OptSpecifier Id4) const {
- Arg *Res = nullptr;
- for (const_iterator it = begin(), ie = end(); it != ie; ++it) {
- if ((*it)->getOption().matches(Id0) ||
- (*it)->getOption().matches(Id1) ||
- (*it)->getOption().matches(Id2) ||
- (*it)->getOption().matches(Id3) ||
- (*it)->getOption().matches(Id4)) {
- Res = *it;
- Res->claim();
- }
- }
-
- return Res;
-}
-
-Arg *ArgList::getLastArg(OptSpecifier Id0, OptSpecifier Id1,
- OptSpecifier Id2, OptSpecifier Id3,
- OptSpecifier Id4, OptSpecifier Id5) const {
- Arg *Res = nullptr;
- for (const_iterator it = begin(), ie = end(); it != ie; ++it) {
- if ((*it)->getOption().matches(Id0) ||
- (*it)->getOption().matches(Id1) ||
- (*it)->getOption().matches(Id2) ||
- (*it)->getOption().matches(Id3) ||
- (*it)->getOption().matches(Id4) ||
- (*it)->getOption().matches(Id5)) {
- Res = *it;
- Res->claim();
- }
- }
-
- return Res;
-}
-
-Arg *ArgList::getLastArg(OptSpecifier Id0, OptSpecifier Id1,
- OptSpecifier Id2, OptSpecifier Id3,
- OptSpecifier Id4, OptSpecifier Id5,
- OptSpecifier Id6) const {
- Arg *Res = nullptr;
- for (const_iterator it = begin(), ie = end(); it != ie; ++it) {
- if ((*it)->getOption().matches(Id0) ||
- (*it)->getOption().matches(Id1) ||
- (*it)->getOption().matches(Id2) ||
- (*it)->getOption().matches(Id3) ||
- (*it)->getOption().matches(Id4) ||
- (*it)->getOption().matches(Id5) ||
- (*it)->getOption().matches(Id6)) {
- Res = *it;
- Res->claim();
- }
+void ArgList::eraseArg(OptSpecifier Id) {
+ // Zero out the removed entries but keep them around so that we don't
+ // need to invalidate OptRanges.
+ for (Arg *const &A : filtered(Id)) {
+ // Avoid the need for a non-const filtered iterator variant.
+ Arg **ArgsBegin = Args.data();
+ ArgsBegin[&A - ArgsBegin] = nullptr;
}
-
- return Res;
+ OptRanges.erase(Id.getID());
}
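Because OptRanges caches indices into Args, eraseArg can no longer compact the vector without invalidating every cached range; removed arguments become null tombstones instead, and any iteration over the list must skip them. A sketch of that skip step with illustrative names (the real filtered iterator is declared in ArgList.h by this patch):

// Illustrative only: advance past erased (null) slots.
static void skipTombstones(llvm::opt::Arg *const *&Current,
                           llvm::opt::Arg *const *End) {
  while (Current != End && *Current == nullptr)
    ++Current;
}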
-Arg *ArgList::getLastArg(OptSpecifier Id0, OptSpecifier Id1,
- OptSpecifier Id2, OptSpecifier Id3,
- OptSpecifier Id4, OptSpecifier Id5,
- OptSpecifier Id6, OptSpecifier Id7) const {
- Arg *Res = nullptr;
- for (const_iterator it = begin(), ie = end(); it != ie; ++it) {
- if ((*it)->getOption().matches(Id0) ||
- (*it)->getOption().matches(Id1) ||
- (*it)->getOption().matches(Id2) ||
- (*it)->getOption().matches(Id3) ||
- (*it)->getOption().matches(Id4) ||
- (*it)->getOption().matches(Id5) ||
- (*it)->getOption().matches(Id6) ||
- (*it)->getOption().matches(Id7)) {
- Res = *it;
- Res->claim();
+ArgList::OptRange
+ArgList::getRange(std::initializer_list<OptSpecifier> Ids) const {
+ OptRange R = emptyRange();
+ for (auto Id : Ids) {
+ auto I = OptRanges.find(Id.getID());
+ if (I != OptRanges.end()) {
+ R.first = std::min(R.first, I->second.first);
+ R.second = std::max(R.second, I->second.second);
}
}
-
- return Res;
+ // Map an empty {-1, 0} range to {0, 0} so it can be used to form iterators.
+ if (R.first == -1u)
+ R.first = 0;
+ return R;
}
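getRange merges the cached [first, last) windows of the requested options, so filtered() only has to walk the slice of Args where a match can occur rather than the whole command line. Call sites are unchanged by the new indexing; a hedged sketch (OPT_include is a hypothetical option ID):

// Collect the values of every -include-style option, in command-line order.
std::vector<std::string> Paths;
for (const llvm::opt::Arg *A : Args.filtered(OPT_include))
  Paths.push_back(A->getValue());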
bool ArgList::hasFlag(OptSpecifier Pos, OptSpecifier Neg, bool Default) const {
@@ -231,8 +72,7 @@ bool ArgList::hasFlag(OptSpecifier Pos, OptSpecifier PosAlias, OptSpecifier Neg,
return Default;
}
-StringRef ArgList::getLastArgValue(OptSpecifier Id,
- StringRef Default) const {
+StringRef ArgList::getLastArgValue(OptSpecifier Id, StringRef Default) const {
if (Arg *A = getLastArg(Id))
return A->getValue();
return Default;
@@ -262,7 +102,7 @@ void ArgList::AddLastArg(ArgStringList &Output, OptSpecifier Id0,
void ArgList::AddAllArgsExcept(ArgStringList &Output,
ArrayRef<OptSpecifier> Ids,
ArrayRef<OptSpecifier> ExcludeIds) const {
- for (const Arg *Arg : Args) {
+ for (const Arg *Arg : *this) {
bool Excluded = false;
for (OptSpecifier Id : ExcludeIds) {
if (Arg->getOption().matches(Id)) {
@@ -325,14 +165,14 @@ void ArgList::AddAllArgsTranslated(ArgStringList &Output, OptSpecifier Id0,
}
void ArgList::ClaimAllArgs(OptSpecifier Id0) const {
- for (auto Arg : filtered(Id0))
+ for (auto *Arg : filtered(Id0))
Arg->claim();
}
void ArgList::ClaimAllArgs() const {
- for (const_iterator it = begin(), ie = end(); it != ie; ++it)
- if (!(*it)->isClaimed())
- (*it)->claim();
+ for (auto *Arg : *this)
+ if (!Arg->isClaimed())
+ Arg->claim();
}
const char *ArgList::GetOrMakeJoinedArgString(unsigned Index,
@@ -353,7 +193,9 @@ void ArgList::print(raw_ostream &O) const {
}
}
+#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
LLVM_DUMP_METHOD void ArgList::dump() const { print(dbgs()); }
+#endif
//
diff --git a/lib/Option/Option.cpp b/lib/Option/Option.cpp
index 5eb179fbd257..736b939fe80b 100644
--- a/lib/Option/Option.cpp
+++ b/lib/Option/Option.cpp
@@ -83,7 +83,9 @@ void Option::print(raw_ostream &O) const {
O << ">\n";
}
+#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
LLVM_DUMP_METHOD void Option::dump() const { print(dbgs()); }
+#endif
bool Option::matches(OptSpecifier Opt) const {
// Aliases are never considered in matching, look through them.
diff --git a/lib/Passes/PassBuilder.cpp b/lib/Passes/PassBuilder.cpp
index 2994a07b1ccf..0421946a32a6 100644
--- a/lib/Passes/PassBuilder.cpp
+++ b/lib/Passes/PassBuilder.cpp
@@ -39,6 +39,7 @@
#include "llvm/Analysis/LoopAccessAnalysis.h"
#include "llvm/Analysis/LoopInfo.h"
#include "llvm/Analysis/MemoryDependenceAnalysis.h"
+#include "llvm/Analysis/MemorySSA.h"
#include "llvm/Analysis/ModuleSummaryAnalysis.h"
#include "llvm/Analysis/OptimizationDiagnosticInfo.h"
#include "llvm/Analysis/PostDominators.h"
@@ -61,6 +62,7 @@
#include "llvm/Target/TargetMachine.h"
#include "llvm/Transforms/GCOVProfiler.h"
#include "llvm/Transforms/IPO/AlwaysInliner.h"
+#include "llvm/Transforms/IPO/ArgumentPromotion.h"
#include "llvm/Transforms/IPO/ConstantMerge.h"
#include "llvm/Transforms/IPO/CrossDSOCFI.h"
#include "llvm/Transforms/IPO/DeadArgumentElimination.h"
@@ -104,9 +106,12 @@
#include "llvm/Transforms/Scalar/LoopDistribute.h"
#include "llvm/Transforms/Scalar/LoopIdiomRecognize.h"
#include "llvm/Transforms/Scalar/LoopInstSimplify.h"
+#include "llvm/Transforms/Scalar/LoopLoadElimination.h"
#include "llvm/Transforms/Scalar/LoopPassManager.h"
+#include "llvm/Transforms/Scalar/LoopPredication.h"
#include "llvm/Transforms/Scalar/LoopRotation.h"
#include "llvm/Transforms/Scalar/LoopSimplifyCFG.h"
+#include "llvm/Transforms/Scalar/LoopSink.h"
#include "llvm/Transforms/Scalar/LoopStrengthReduce.h"
#include "llvm/Transforms/Scalar/LoopUnrollPass.h"
#include "llvm/Transforms/Scalar/LowerAtomic.h"
@@ -131,8 +136,8 @@
#include "llvm/Transforms/Utils/LoopSimplify.h"
#include "llvm/Transforms/Utils/LowerInvoke.h"
#include "llvm/Transforms/Utils/Mem2Reg.h"
-#include "llvm/Transforms/Utils/MemorySSA.h"
#include "llvm/Transforms/Utils/NameAnonGlobals.h"
+#include "llvm/Transforms/Utils/PredicateInfo.h"
#include "llvm/Transforms/Utils/SimplifyInstructions.h"
#include "llvm/Transforms/Utils/SymbolRewriter.h"
#include "llvm/Transforms/Vectorize/LoopVectorize.h"
@@ -142,6 +147,9 @@
using namespace llvm;
+static cl::opt<unsigned> MaxDevirtIterations("pm-max-devirt-iterations",
+ cl::ReallyHidden, cl::init(4));
+
static Regex DefaultAliasRegex("^(default|lto-pre-link|lto)<(O[0123sz])>$");
static bool isOptimizingForSize(PassBuilder::OptimizationLevel Level) {
@@ -316,19 +324,21 @@ PassBuilder::buildFunctionSimplificationPipeline(OptimizationLevel Level,
// the other we have is `LoopInstSimplify`.
LoopPassManager LPM1(DebugLogging), LPM2(DebugLogging);
- // FIXME: Enable these when the loop pass manager can support enforcing loop
- // simplified and LCSSA form as well as updating the loop nest after
- // transformations and we finsih porting the loop passes.
-#if 0
// Rotate Loop - disable header duplication at -Oz
LPM1.addPass(LoopRotatePass(Level != Oz));
LPM1.addPass(LICMPass());
+#if 0
+ // The LoopUnswitch pass isn't yet ported to the new pass manager.
LPM1.addPass(LoopUnswitchPass(/* OptimizeForSize */ Level != O3));
+#endif
LPM2.addPass(IndVarSimplifyPass());
- LPM2.addPass(LoopIdiomPass());
+ LPM2.addPass(LoopIdiomRecognizePass());
LPM2.addPass(LoopDeletionPass());
- LPM2.addPass(SimpleLoopUnrollPass());
-#endif
+ LPM2.addPass(LoopUnrollPass::createFull(Level));
+
+ // We provide the opt remark emitter pass for LICM to use. We only need to do
+ // this once as it is immutable.
+ FPM.addPass(
+ RequireAnalysisPass<OptimizationRemarkEmitterAnalysis, Function>());
FPM.addPass(createFunctionToLoopPassAdaptor(std::move(LPM1)));
FPM.addPass(SimplifyCFGPass());
FPM.addPass(InstCombinePass());
@@ -363,12 +373,7 @@ PassBuilder::buildFunctionSimplificationPipeline(OptimizationLevel Level,
FPM.addPass(JumpThreadingPass());
FPM.addPass(CorrelatedValuePropagationPass());
FPM.addPass(DSEPass());
- // FIXME: Enable this when the loop pass manager can support enforcing loop
- // simplified and LCSSA form as well as updating the loop nest after
- // transformations and we finsih porting the loop passes.
-#if 0
FPM.addPass(createFunctionToLoopPassAdaptor(LICMPass()));
-#endif
// Finally, do an expensive DCE pass to catch all the dead code exposed by
// the simplifications and basic cleanup after all the simplifications.
@@ -379,6 +384,56 @@ PassBuilder::buildFunctionSimplificationPipeline(OptimizationLevel Level,
return FPM;
}
+static void addPGOInstrPasses(ModulePassManager &MPM, bool DebugLogging,
+ PassBuilder::OptimizationLevel Level,
+ bool RunProfileGen, std::string ProfileGenFile,
+ std::string ProfileUseFile) {
+ // Generally running simplification passes and the inliner with a high
+ // threshold results in smaller executables, but there may be cases where
+ // the size grows, so let's be conservative here and skip this simplification
+ // at -Os/Oz.
+ if (!isOptimizingForSize(Level)) {
+ InlineParams IP;
+
+ // In the old pass manager, this is a cl::opt. Should this still be one?
+ IP.DefaultThreshold = 75;
+
+ // FIXME: The hint threshold has the same value used by the regular inliner.
+ // This should probably be lowered after performance testing.
+ // FIXME: This comment is cargo-culted from the old pass manager; revisit.
+ IP.HintThreshold = 325;
+
+ CGSCCPassManager CGPipeline(DebugLogging);
+
+ CGPipeline.addPass(InlinerPass(IP));
+
+ FunctionPassManager FPM;
+ FPM.addPass(SROA());
+ FPM.addPass(EarlyCSEPass()); // Catch trivial redundancies.
+ FPM.addPass(SimplifyCFGPass()); // Merge & remove basic blocks.
+ FPM.addPass(InstCombinePass()); // Combine silly sequences.
+
+ // FIXME: Here the old pass manager inserts peephole extensions.
+ // Add them when they're supported.
+ CGPipeline.addPass(createCGSCCToFunctionPassAdaptor(std::move(FPM)));
+
+ MPM.addPass(createModuleToPostOrderCGSCCPassAdaptor(std::move(CGPipeline)));
+ }
+
+ if (RunProfileGen) {
+ MPM.addPass(PGOInstrumentationGen());
+
+ // Add the profile lowering pass.
+ InstrProfOptions Options;
+ if (!ProfileGenFile.empty())
+ Options.InstrProfileOutput = ProfileGenFile;
+ MPM.addPass(InstrProfiling(Options));
+ }
+
+ if (!ProfileUseFile.empty())
+ MPM.addPass(PGOInstrumentationUse(ProfileUseFile));
+}
+
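+addPGOInstrPasses is driven entirely by the PGOOptions the PassBuilder was constructed with; the code above reads only RunProfileGen, ProfileGenFile, ProfileUseFile, and SamplePGO. A hedged sketch of a client requesting instrumentation generation (the exact PGOOptions shape lives in PassBuilder.h; field names follow the uses above):
+
+PGOOptions Opts;
+Opts.RunProfileGen = true;               // instrument rather than apply a profile
+Opts.ProfileGenFile = "default.profraw"; // illustrative output path
+PassBuilder PB(/*TM=*/nullptr, Opts);    // pipelines built from PB now add PGO passes
+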
ModulePassManager
PassBuilder::buildPerModuleDefaultPipeline(OptimizationLevel Level,
bool DebugLogging) {
@@ -429,10 +484,20 @@ PassBuilder::buildPerModuleDefaultPipeline(OptimizationLevel Level,
GlobalCleanupPM.addPass(SimplifyCFGPass());
MPM.addPass(createModuleToFunctionPassAdaptor(std::move(GlobalCleanupPM)));
- // FIXME: Enable this when cross-IR-unit analysis invalidation is working.
-#if 0
- MPM.addPass(RequireAnalysisPass<GlobalsAA>());
-#endif
+ // Add all the requested passes for PGO Instrumentation, if requested.
+ if (PGOOpt) {
+ assert(PGOOpt->RunProfileGen || PGOOpt->SamplePGO ||
+ !PGOOpt->ProfileUseFile.empty());
+ addPGOInstrPasses(MPM, DebugLogging, Level, PGOOpt->RunProfileGen,
+ PGOOpt->ProfileGenFile, PGOOpt->ProfileUseFile);
+ }
+
+ // Indirect call promotion that promotes intra-module targets only.
+ MPM.addPass(PGOIndirectCallPromotion(false, PGOOpt && PGOOpt->SamplePGO));
+
+ // Require the GlobalsAA analysis for the module so we can query it within
+ // the CGSCC pipeline.
+ MPM.addPass(RequireAnalysisPass<GlobalsAA, Module>());
// Now begin the main postorder CGSCC pipeline.
// FIXME: The current CGSCC pipeline has its origins in the legacy pass
@@ -454,13 +519,24 @@ PassBuilder::buildPerModuleDefaultPipeline(OptimizationLevel Level,
// Now deduce any function attributes based in the current code.
MainCGPipeline.addPass(PostOrderFunctionAttrsPass());
+ // When at O3 add argument promotion to the pass pipeline.
+ // FIXME: It isn't at all clear why this should be limited to O3.
+ if (Level == O3)
+ MainCGPipeline.addPass(ArgumentPromotionPass());
+
// Lastly, add the core function simplification pipeline nested inside the
// CGSCC walk.
MainCGPipeline.addPass(createCGSCCToFunctionPassAdaptor(
buildFunctionSimplificationPipeline(Level, DebugLogging)));
+ // We wrap the CGSCC pipeline in a devirtualization repeater. This will try
+ // to detect when we devirtualize indirect calls and iterate the SCC passes
+ // in that case to try and catch knock-on inlining or function attrs
+ // opportunities. Then we add it to the module pipeline by walking the SCCs
+ // in postorder (or bottom-up).
MPM.addPass(
- createModuleToPostOrderCGSCCPassAdaptor(std::move(MainCGPipeline)));
+ createModuleToPostOrderCGSCCPassAdaptor(createDevirtSCCRepeatedPass(
+ std::move(MainCGPipeline), MaxDevirtIterations, DebugLogging)));
// This ends the canonicalization and simplification phase of the pipeline.
// At this point, we expect to have canonical and simple IR which we begin
@@ -475,17 +551,14 @@ PassBuilder::buildPerModuleDefaultPipeline(OptimizationLevel Level,
// FIXME: Is this really an optimization rather than a canonicalization?
MPM.addPass(ReversePostOrderFunctionAttrsPass());
- // Recompute GloblasAA here prior to function passes. This is particularly
+ // Re-require GlobalsAA here prior to function passes. This is particularly
// useful as the above will have inlined, DCE'ed, and function-attr
// propagated everything. We should at this point have a reasonably minimal
// and richly annotated call graph. By computing aliasing and mod/ref
// information for all local globals here, the late loop passes and notably
// the vectorizer will be able to use them to help recognize vectorizable
// memory operations.
- // FIXME: Enable this once analysis invalidation is fully supported.
-#if 0
- MPM.addPass(Require<GlobalsAA>());
-#endif
+ MPM.addPass(RequireAnalysisPass<GlobalsAA, Module>());
FunctionPassManager OptimizePM(DebugLogging);
OptimizePM.addPass(Float2IntPass());
@@ -495,36 +568,63 @@ PassBuilder::buildPerModuleDefaultPipeline(OptimizationLevel Level,
// Optimize the loop execution. These passes operate on entire loop nests
// rather than on each loop in an inside-out manner, and so they are actually
// function passes.
+
+ // First rotate loops that may have been un-rotated by prior passes.
+ OptimizePM.addPass(createFunctionToLoopPassAdaptor(LoopRotatePass()));
+
+ // Distribute loops to allow partial vectorization, i.e. isolate dependences
+ // that inhibit vectorization into a separate loop. This is
+ // currently only performed for loops marked with the metadata
+ // llvm.loop.distribute=true or when -enable-loop-distribute is specified.
OptimizePM.addPass(LoopDistributePass());
-#if 0
- // FIXME: LoopVectorize relies on "requiring" LCSSA which isn't supported in
- // the new PM.
+
+ // Now run the core loop vectorizer.
OptimizePM.addPass(LoopVectorizePass());
-#endif
- // FIXME: Need to port Loop Load Elimination and add it here.
+
+ // Eliminate loads by forwarding stores from the previous iteration to loads
+ // of the current iteration.
+ OptimizePM.addPass(LoopLoadEliminationPass());
+
+ // Cleanup after the loop optimization passes.
OptimizePM.addPass(InstCombinePass());
+
+ // Now that we've formed fast-to-execute loop structures, we do further
+ // optimizations. These are run afterward, as they might otherwise block
+ // complex analyses and transforms such as those needed for loop vectorization.
+
// Optimize parallel scalar instruction chains into SIMD instructions.
OptimizePM.addPass(SLPVectorizerPass());
- // Cleanup after vectorizers.
+ // Cleanup after all of the vectorizers.
OptimizePM.addPass(SimplifyCFGPass());
OptimizePM.addPass(InstCombinePass());
// Unroll small loops to hide loop backedge latency and saturate any parallel
- // execution resources of an out-of-order processor.
- // FIXME: Need to add once loop pass pipeline is available.
-
- // FIXME: Add the loop sink pass when ported.
-
- // FIXME: Add cleanup from the loop pass manager when we're forming LCSSA
- // here.
+ // execution resources of an out-of-order processor. We also then need to
+ // clean up redundancies and loop invariant code.
+ // FIXME: It would be really good to use a loop-integrated instruction
+ // combiner for cleanup here so that the unrolling and LICM can be pipelined
+ // across the loop nests.
+ OptimizePM.addPass(
+ createFunctionToLoopPassAdaptor(LoopUnrollPass::create(Level)));
+ OptimizePM.addPass(InstCombinePass());
+ OptimizePM.addPass(
+ RequireAnalysisPass<OptimizationRemarkEmitterAnalysis, Function>());
+ OptimizePM.addPass(createFunctionToLoopPassAdaptor(LICMPass()));
// Now that we've vectorized and unrolled loops, we may have more refined
// alignment information, try to re-derive it here.
OptimizePM.addPass(AlignmentFromAssumptionsPass());
- // ADd the core optimizing pipeline.
+ // The LoopSink pass sinks instructions hoisted by LICM, which serves as a
+ // canonicalization pass that enables other optimizations. As a result, the
+ // LoopSink pass needs to run very late in the IR pipeline to avoid undoing
+ // LICM's results too early.
+ OptimizePM.addPass(LoopSinkPass());
+
+ // And finally clean up LCSSA form before generating code.
+ OptimizePM.addPass(InstSimplifierPass());
+
+ // Add the core optimizing pipeline.
MPM.addPass(createModuleToFunctionPassAdaptor(std::move(OptimizePM)));
// Now we need to do some global optimization transforms.
@@ -550,13 +650,167 @@ ModulePassManager PassBuilder::buildLTODefaultPipeline(OptimizationLevel Level,
assert(Level != O0 && "Must request optimizations for the default pipeline!");
ModulePassManager MPM(DebugLogging);
- // FIXME: Finish fleshing this out to match the legacy LTO pipelines.
- FunctionPassManager LateFPM(DebugLogging);
- LateFPM.addPass(InstCombinePass());
- LateFPM.addPass(SimplifyCFGPass());
+ // Remove unused virtual tables to improve the quality of code generated by
+ // whole-program devirtualization and bitset lowering.
+ MPM.addPass(GlobalDCEPass());
+
+ // Force any function attributes we want the rest of the pipeline to observe.
+ MPM.addPass(ForceFunctionAttrsPass());
+
+ // Do basic inference of function attributes from known properties of system
+ // libraries and other oracles.
+ MPM.addPass(InferFunctionAttrsPass());
+
+ if (Level > 1) {
+ // Indirect call promotion. This should promote all the targets that are
+ // left by the earlier promotion pass that promotes intra-module targets.
+ // This two-step promotion is to save compile time. For LTO, it should
+ // produce the same result as if we only did promotion here.
+ MPM.addPass(PGOIndirectCallPromotion(true /* InLTO */,
+ PGOOpt && PGOOpt->SamplePGO));
+
+ // Propagate constants at call sites into the functions they call. This
+ // opens opportunities for globalopt (and inlining) by substituting function
+ // pointers passed as arguments to direct uses of functions.
+ MPM.addPass(IPSCCPPass());
+ }
- MPM.addPass(createModuleToFunctionPassAdaptor(std::move(LateFPM)));
+ // Now deduce any function attributes based in the current code.
+ MPM.addPass(createModuleToPostOrderCGSCCPassAdaptor(
+ PostOrderFunctionAttrsPass()));
+ // Do RPO function attribute inference across the module to forward-propagate
+ // attributes where applicable.
+ // FIXME: Is this really an optimization rather than a canonicalization?
+ MPM.addPass(ReversePostOrderFunctionAttrsPass());
+
+ // Use inrange annotations on GEP indices to split globals where beneficial.
+ MPM.addPass(GlobalSplitPass());
+
+ // Run whole program optimization of virtual call when the list of callees
+ // is fixed.
+ MPM.addPass(WholeProgramDevirtPass());
+
+ // Stop here at -O1.
+ if (Level == 1)
+ return MPM;
+
+ // Optimize globals to try and fold them into constants.
+ MPM.addPass(GlobalOptPass());
+
+ // Promote any localized globals to SSA registers.
+ MPM.addPass(createModuleToFunctionPassAdaptor(PromotePass()));
+
+ // Linking modules together can lead to duplicate global constants; only
+ // keep one copy of each constant.
+ MPM.addPass(ConstantMergePass());
+
+ // Remove unused arguments from functions.
+ MPM.addPass(DeadArgumentEliminationPass());
+
+ // Reduce the code after globalopt and ipsccp. Both can open up significant
+ // simplification opportunities, and both can propagate functions through
+ // function pointers. When this happens, we often have to resolve varargs
+ // calls, etc, so let instcombine do this.
+ // FIXME: add peephole extensions here as the legacy PM does.
+ MPM.addPass(createModuleToFunctionPassAdaptor(InstCombinePass()));
+
+ // Note: historically, the PruneEH pass was run first to deduce nounwind and
+ // generally clean up exception handling overhead. It isn't clear this is
+ // valuable as the inliner doesn't currently care whether it is inlining an
+ // invoke or a call.
+ // Run the inliner now.
+ MPM.addPass(createModuleToPostOrderCGSCCPassAdaptor(InlinerPass()));
+
+ // Optimize globals again after running the inliner.
+ MPM.addPass(GlobalOptPass());
+
+ // Garbage collect dead functions.
+ // FIXME: Add the ArgumentPromotion pass once it's ported.
+ MPM.addPass(GlobalDCEPass());
+
+ FunctionPassManager FPM(DebugLogging);
+
+ // The IPO passes may leave cruft around. Clean up after them.
+ // FIXME: add peephole extensions here as the legacy PM does.
+ FPM.addPass(InstCombinePass());
+ FPM.addPass(JumpThreadingPass());
+
+ // Break up allocas
+ FPM.addPass(SROA());
+
+ // Run a few AA-driven optimizations here and now to clean up the code.
+ MPM.addPass(createModuleToFunctionPassAdaptor(std::move(FPM)));
+
+ MPM.addPass(createModuleToPostOrderCGSCCPassAdaptor(
+ PostOrderFunctionAttrsPass()));
+ // FIXME: here we run IP alias analysis in the legacy PM.
+
+ FunctionPassManager MainFPM;
+
+ // FIXME: once we fix LoopPass Manager, add LICM here.
+ // FIXME: once we provide support for enabling MLSM, add it here.
+ // FIXME: once we provide support for enabling NewGVN, add it here.
+ MainFPM.addPass(GVN());
+
+ // Remove dead memcpy() calls.
+ MainFPM.addPass(MemCpyOptPass());
+
+ // Nuke dead stores.
+ MainFPM.addPass(DSEPass());
+
+ // FIXME: at this point, we run a bunch of loop passes:
+ // indVarSimplify, loopDeletion, loopInterchange, loopUnroll,
+ // loopVectorize. Enable them once the remaining issues with the LPM
+ // are sorted out.
+
+ MainFPM.addPass(InstCombinePass());
+ MainFPM.addPass(SimplifyCFGPass());
+ MainFPM.addPass(SCCPPass());
+ MainFPM.addPass(InstCombinePass());
+ MainFPM.addPass(BDCEPass());
+
+ // FIXME: We may want to run SLPVectorizer here.
+ // After vectorization, assume intrinsics may tell us more
+ // about pointer alignments.
+#if 0
+ MainFPM.addPass(AlignmentFromAssumptionsPass());
+#endif
+
+ // FIXME: Conditionally run LoadCombine here, after it's ported
+ // (in case we still have this pass, given its questionable usefulness).
+
+ // FIXME: add peephole extensions to the PM here.
+ MainFPM.addPass(InstCombinePass());
+ MainFPM.addPass(JumpThreadingPass());
+ MPM.addPass(createModuleToFunctionPassAdaptor(std::move(MainFPM)));
+
+ // Create a function that performs CFI checks for cross-DSO calls with
+ // targets in the current module.
+ MPM.addPass(CrossDSOCFIPass());
+
+ // Lower type metadata and the type.test intrinsic. This pass supports
+ // clang's control flow integrity mechanisms (-fsanitize=cfi*) and needs
+ // to be run at link time if CFI is enabled. This pass does nothing if
+ // CFI is disabled.
+ // Enable once we add support for the summary in the new PM.
+#if 0
+ MPM.addPass(LowerTypeTestsPass(Summary ? PassSummaryAction::Export :
+ PassSummaryAction::None,
+ Summary));
+#endif
+
+ // Add late LTO optimization passes.
+ // Delete basic blocks, which optimization passes may have killed.
+ MPM.addPass(createModuleToFunctionPassAdaptor(SimplifyCFGPass()));
+
+ // Drop bodies of available-externally objects to improve GlobalDCE.
+ MPM.addPass(EliminateAvailableExternallyPass());
+
+ // Now that we have optimized the program, discard unreachable functions.
+ MPM.addPass(GlobalDCEPass());
+
+ // FIXME: Enable MergeFuncs, conditionally, once it's ported, maybe.
return MPM;
}
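For orientation, a minimal sketch of how a client would build and run this LTO pipeline with the new pass manager. The analysis-manager boilerplate is the standard new-PM setup; the buildLTODefaultPipeline arguments beyond the optimization level are assumed from this revision's signature:

  PassBuilder PB;
  LoopAnalysisManager LAM;
  FunctionAnalysisManager FAM;
  CGSCCAnalysisManager CGAM;
  ModuleAnalysisManager MAM;
  // Register all analyses and cross-register the proxies between managers.
  PB.registerModuleAnalyses(MAM);
  PB.registerCGSCCAnalyses(CGAM);
  PB.registerFunctionAnalyses(FAM);
  PB.registerLoopAnalyses(LAM);
  PB.crossRegisterProxies(LAM, FAM, CGAM, MAM);
  // Build the pipeline assembled above and run it over a parsed module M.
  ModulePassManager MPM =
      PB.buildLTODefaultPipeline(PassBuilder::O2, /*DebugLogging=*/false);
  MPM.run(M, MAM);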
@@ -579,12 +833,8 @@ AAManager PassBuilder::buildDefaultAAPipeline() {
// Add support for querying global aliasing information when available.
// Because the `AAManager` is a function analysis and `GlobalsAA` is a module
// analysis, all that the `AAManager` can do is query for any *cached*
- // results from `GlobalsAA` through a readonly proxy..
-#if 0
- // FIXME: Enable once the invalidation logic supports this. Currently, the
- // `AAManager` will hold stale references to the module analyses.
+ // results from `GlobalsAA` through a readonly proxy.
AA.registerModuleAnalysis<GlobalsAA>();
-#endif
return AA;
}
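A sketch of how the resulting AAManager is typically installed, assuming FAM is a FunctionAnalysisManager set up as above:

  AAManager AA = PB.buildDefaultAAPipeline();
  // Function-level AAs answer queries directly; module-level analyses such
  // as GlobalsAA are consulted only through their cached results.
  FAM.registerPass([&] { return std::move(AA); });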
diff --git a/lib/Passes/PassRegistry.def b/lib/Passes/PassRegistry.def
index a9939fddb98c..efd4c097a675 100644
--- a/lib/Passes/PassRegistry.def
+++ b/lib/Passes/PassRegistry.def
@@ -85,6 +85,7 @@ CGSCC_ANALYSIS("fam-proxy", FunctionAnalysisManagerCGSCCProxy())
#ifndef CGSCC_PASS
#define CGSCC_PASS(NAME, CREATE_PASS)
#endif
+CGSCC_PASS("argpromotion", ArgumentPromotionPass())
CGSCC_PASS("invalidate<all>", InvalidateAllAnalysesPass())
CGSCC_PASS("function-attrs", PostOrderFunctionAttrsPass())
CGSCC_PASS("inline", InlinerPass())
@@ -159,6 +160,7 @@ FUNCTION_PASS("lower-guard-intrinsic", LowerGuardIntrinsicPass())
FUNCTION_PASS("guard-widening", GuardWideningPass())
FUNCTION_PASS("gvn", GVN())
FUNCTION_PASS("loop-simplify", LoopSimplifyPass())
+FUNCTION_PASS("loop-sink", LoopSinkPass())
FUNCTION_PASS("lowerinvoke", LowerInvokePass())
FUNCTION_PASS("mem2reg", PromotePass())
FUNCTION_PASS("memcpyopt", MemCpyOptPass())
@@ -169,8 +171,10 @@ FUNCTION_PASS("jump-threading", JumpThreadingPass())
FUNCTION_PASS("partially-inline-libcalls", PartiallyInlineLibCallsPass())
FUNCTION_PASS("lcssa", LCSSAPass())
FUNCTION_PASS("loop-data-prefetch", LoopDataPrefetchPass())
+FUNCTION_PASS("loop-load-elim", LoopLoadEliminationPass())
FUNCTION_PASS("loop-distribute", LoopDistributePass())
FUNCTION_PASS("loop-vectorize", LoopVectorizePass())
+FUNCTION_PASS("pgo-memop-opt", PGOMemOPSizeOpt())
FUNCTION_PASS("print", PrintFunctionPass(dbgs()))
FUNCTION_PASS("print<assumptions>", AssumptionPrinterPass(dbgs()))
FUNCTION_PASS("print<block-freq>", BlockFrequencyPrinterPass(dbgs()))
@@ -223,7 +227,9 @@ LOOP_PASS("loop-deletion", LoopDeletionPass())
LOOP_PASS("simplify-cfg", LoopSimplifyCFGPass())
LOOP_PASS("strength-reduce", LoopStrengthReducePass())
LOOP_PASS("indvars", IndVarSimplifyPass())
-LOOP_PASS("unroll", LoopUnrollPass())
+LOOP_PASS("unroll", LoopUnrollPass::create())
+LOOP_PASS("unroll-full", LoopUnrollPass::createFull())
LOOP_PASS("print-access-info", LoopAccessInfoPrinterPass(dbgs()))
LOOP_PASS("print<ivusers>", IVUsersPrinterPass(dbgs()))
+LOOP_PASS("loop-predication", LoopPredicationPass())
#undef LOOP_PASS
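For context, PassBuilder consumes these tables with the usual X-macro expansion when parsing a textual pipeline such as "cgscc(argpromotion)" or "loop(unroll-full)". A simplified sketch of the parsing side (not the literal code):

  static bool parseCGSCCPassName(CGSCCPassManager &CGPM, StringRef Name) {
  #define CGSCC_PASS(NAME, CREATE_PASS)                                        \
    if (Name == NAME) {                                                        \
      CGPM.addPass(CREATE_PASS);                                               \
      return true;                                                             \
    }
  #include "PassRegistry.def"
    return false;
  }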
diff --git a/lib/ProfileData/Coverage/CoverageMapping.cpp b/lib/ProfileData/Coverage/CoverageMapping.cpp
index 6d907c7098e0..23999a5312c7 100644
--- a/lib/ProfileData/Coverage/CoverageMapping.cpp
+++ b/lib/ProfileData/Coverage/CoverageMapping.cpp
@@ -1,4 +1,4 @@
-//=-- CoverageMapping.cpp - Code coverage mapping support ---------*- C++ -*-=//
+//===- CoverageMapping.cpp - Code coverage mapping support ------*- C++ -*-===//
//
// The LLVM Compiler Infrastructure
//
@@ -12,18 +12,32 @@
//
//===----------------------------------------------------------------------===//
-#include "llvm/ProfileData/Coverage/CoverageMapping.h"
+#include "llvm/ADT/ArrayRef.h"
#include "llvm/ADT/DenseMap.h"
+#include "llvm/ADT/None.h"
#include "llvm/ADT/Optional.h"
#include "llvm/ADT/SmallBitVector.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/ADT/StringRef.h"
+#include "llvm/ProfileData/Coverage/CoverageMapping.h"
#include "llvm/ProfileData/Coverage/CoverageMappingReader.h"
#include "llvm/ProfileData/InstrProfReader.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/Errc.h"
+#include "llvm/Support/Error.h"
#include "llvm/Support/ErrorHandling.h"
#include "llvm/Support/ManagedStatic.h"
-#include "llvm/Support/Path.h"
+#include "llvm/Support/MemoryBuffer.h"
#include "llvm/Support/raw_ostream.h"
+#include <algorithm>
+#include <cassert>
+#include <cstdint>
+#include <iterator>
+#include <memory>
+#include <string>
+#include <system_error>
+#include <utility>
+#include <vector>
using namespace llvm;
using namespace coverage;
@@ -59,7 +73,7 @@ void CounterExpressionBuilder::extractTerms(
Counter CounterExpressionBuilder::simplify(Counter ExpressionTree) {
// Gather constant terms.
- llvm::SmallVector<std::pair<unsigned, int>, 32> Terms;
+ SmallVector<std::pair<unsigned, int>, 32> Terms;
extractTerms(ExpressionTree, +1, Terms);
// If there are no terms, this is just a zero. The algorithm below assumes at
@@ -120,8 +134,7 @@ Counter CounterExpressionBuilder::subtract(Counter LHS, Counter RHS) {
get(CounterExpression(CounterExpression::Subtract, LHS, RHS)));
}
-void CounterMappingContext::dump(const Counter &C,
- llvm::raw_ostream &OS) const {
+void CounterMappingContext::dump(const Counter &C, raw_ostream &OS) const {
switch (C.getKind()) {
case Counter::Zero:
OS << '0';
@@ -145,7 +158,7 @@ void CounterMappingContext::dump(const Counter &C,
return;
Expected<int64_t> Value = evaluate(C);
if (auto E = Value.takeError()) {
- llvm::consumeError(std::move(E));
+ consumeError(std::move(E));
return;
}
OS << '[' << *Value << ']';
@@ -217,7 +230,7 @@ Error CoverageMapping::loadFunctionRecord(
for (const auto &Region : Record.MappingRegions) {
Expected<int64_t> ExecutionCount = Ctx.evaluate(Region.Count);
if (auto E = ExecutionCount.takeError()) {
- llvm::consumeError(std::move(E));
+ consumeError(std::move(E));
return Error::success();
}
Function.pushRegion(Region, *ExecutionCount);
@@ -281,6 +294,7 @@ CoverageMapping::load(ArrayRef<StringRef> ObjectFilenames,
}
namespace {
+
/// \brief Distributes functions into instantiation sets.
///
/// An instantiation set is a collection of functions that have the same source
@@ -326,7 +340,7 @@ class SegmentBuilder {
Segments.pop_back();
DEBUG(dbgs() << "Segment at " << Line << ":" << Col);
// Set this region's count.
- if (Region.Kind != coverage::CounterMappingRegion::SkippedRegion) {
+ if (Region.Kind != CounterMappingRegion::SkippedRegion) {
DEBUG(dbgs() << " with count " << Region.ExecutionCount);
Segments.emplace_back(Line, Col, Region.ExecutionCount, IsRegionEntry);
} else
@@ -380,10 +394,10 @@ class SegmentBuilder {
// in combineRegions(). Because we accumulate counter values only from
// regions of the same kind as the first region of the area, prefer
// CodeRegion to ExpansionRegion and ExpansionRegion to SkippedRegion.
- static_assert(coverage::CounterMappingRegion::CodeRegion <
- coverage::CounterMappingRegion::ExpansionRegion &&
- coverage::CounterMappingRegion::ExpansionRegion <
- coverage::CounterMappingRegion::SkippedRegion,
+ static_assert(CounterMappingRegion::CodeRegion <
+ CounterMappingRegion::ExpansionRegion &&
+ CounterMappingRegion::ExpansionRegion <
+ CounterMappingRegion::SkippedRegion,
"Unexpected order of region kind values");
return LHS.Kind < RHS.Kind;
});
@@ -437,7 +451,8 @@ public:
return Segments;
}
};
-}
+
+} // end anonymous namespace
std::vector<StringRef> CoverageMapping::getUniqueSourceFiles() const {
std::vector<StringRef> Filenames;
@@ -487,7 +502,7 @@ static bool isExpansion(const CountedRegion &R, unsigned FileID) {
CoverageData CoverageMapping::getCoverageForFile(StringRef Filename) const {
CoverageData FileCoverage(Filename);
- std::vector<coverage::CountedRegion> Regions;
+ std::vector<CountedRegion> Regions;
for (const auto &Function : Functions) {
auto MainFileID = findMainViewFileID(Filename, Function);
@@ -533,7 +548,7 @@ CoverageMapping::getCoverageForFunction(const FunctionRecord &Function) const {
return CoverageData();
CoverageData FunctionCoverage(Function.Filenames[*MainFileID]);
- std::vector<coverage::CountedRegion> Regions;
+ std::vector<CountedRegion> Regions;
for (const auto &CR : Function.CountedRegions)
if (CR.FileID == *MainFileID) {
Regions.push_back(CR);
@@ -551,7 +566,7 @@ CoverageData CoverageMapping::getCoverageForExpansion(
const ExpansionRecord &Expansion) const {
CoverageData ExpansionCoverage(
Expansion.Function.Filenames[Expansion.FileID]);
- std::vector<coverage::CountedRegion> Regions;
+ std::vector<CountedRegion> Regions;
for (const auto &CR : Expansion.Function.CountedRegions)
if (CR.FileID == Expansion.FileID) {
Regions.push_back(CR);
@@ -566,8 +581,7 @@ CoverageData CoverageMapping::getCoverageForExpansion(
return ExpansionCoverage;
}
-namespace {
-std::string getCoverageMapErrString(coveragemap_error Err) {
+static std::string getCoverageMapErrString(coveragemap_error Err) {
switch (Err) {
case coveragemap_error::success:
return "Success";
@@ -585,6 +599,8 @@ std::string getCoverageMapErrString(coveragemap_error Err) {
llvm_unreachable("A value of coveragemap_error has no message.");
}
+namespace {
+
// FIXME: This class is only here to support the transition to llvm::Error. It
// will be removed once this transition is complete. Clients should prefer to
// deal with the Error value directly, rather than converting to error_code.
@@ -594,6 +610,7 @@ class CoverageMappingErrorCategoryType : public std::error_category {
return getCoverageMapErrString(static_cast<coveragemap_error>(IE));
}
};
+
} // end anonymous namespace
std::string CoverageMapError::message() const {
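A sketch of the transition pattern this category supports, assuming the usual coveragemap_category() accessor over the ManagedStatic instance: callers see a plain std::error_code whose message() routes through getCoverageMapErrString():

  std::error_code EC(static_cast<int>(coveragemap_error::malformed),
                     coveragemap_category());
  // EC.message() returns the string produced by getCoverageMapErrString().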
diff --git a/lib/ProfileData/Coverage/CoverageMappingReader.cpp b/lib/ProfileData/Coverage/CoverageMappingReader.cpp
index a6c7031ccd3d..a34f359cd542 100644
--- a/lib/ProfileData/Coverage/CoverageMappingReader.cpp
+++ b/lib/ProfileData/Coverage/CoverageMappingReader.cpp
@@ -1,4 +1,4 @@
-//=-- CoverageMappingReader.cpp - Code coverage mapping reader ----*- C++ -*-=//
+//===- CoverageMappingReader.cpp - Code coverage mapping reader -*- C++ -*-===//
//
// The LLVM Compiler Infrastructure
//
@@ -13,14 +13,34 @@
//===----------------------------------------------------------------------===//
#include "llvm/ProfileData/Coverage/CoverageMappingReader.h"
+#include "llvm/ADT/ArrayRef.h"
#include "llvm/ADT/DenseMap.h"
+#include "llvm/ADT/STLExtras.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/ADT/StringRef.h"
+#include "llvm/ADT/Triple.h"
+#include "llvm/Object/Binary.h"
+#include "llvm/Object/COFF.h"
+#include "llvm/Object/Error.h"
#include "llvm/Object/MachOUniversal.h"
#include "llvm/Object/ObjectFile.h"
+#include "llvm/ProfileData/InstrProf.h"
+#include "llvm/Support/Casting.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/Endian.h"
+#include "llvm/Support/Error.h"
+#include "llvm/Support/ErrorHandling.h"
#include "llvm/Support/LEB128.h"
#include "llvm/Support/MathExtras.h"
#include "llvm/Support/raw_ostream.h"
+#include <algorithm>
+#include <cassert>
+#include <cstddef>
+#include <cstdint>
+#include <limits>
+#include <memory>
+#include <utility>
+#include <vector>
using namespace llvm;
using namespace coverage;
@@ -226,9 +246,8 @@ Error RawCoverageMappingReader::readMappingRegionsSubArray(
}
Error RawCoverageMappingReader::read() {
-
// Read the virtual file mapping.
- llvm::SmallVector<unsigned, 8> VirtualFileMapping;
+ SmallVector<unsigned, 8> VirtualFileMapping;
uint64_t NumFileMappings;
if (auto Err = readSize(NumFileMappings))
return Err;
@@ -349,7 +368,10 @@ static Expected<bool> isCoverageMappingDummy(uint64_t Hash, StringRef Mapping) {
}
namespace {
+
struct CovMapFuncRecordReader {
+ virtual ~CovMapFuncRecordReader() = default;
+
// The interface to read coverage mapping function records for a module.
//
// \p Buf points to the buffer containing the \c CovHeader of the coverage
@@ -359,26 +381,24 @@ struct CovMapFuncRecordReader {
// greater than \p End if not.
virtual Expected<const char *> readFunctionRecords(const char *Buf,
const char *End) = 0;
- virtual ~CovMapFuncRecordReader() {}
+
template <class IntPtrT, support::endianness Endian>
static Expected<std::unique_ptr<CovMapFuncRecordReader>>
- get(coverage::CovMapVersion Version, InstrProfSymtab &P,
+ get(CovMapVersion Version, InstrProfSymtab &P,
std::vector<BinaryCoverageReader::ProfileMappingRecord> &R,
std::vector<StringRef> &F);
};
// A class for reading coverage mapping function records for a module.
-template <coverage::CovMapVersion Version, class IntPtrT,
- support::endianness Endian>
+template <CovMapVersion Version, class IntPtrT, support::endianness Endian>
class VersionedCovMapFuncRecordReader : public CovMapFuncRecordReader {
- typedef typename coverage::CovMapTraits<
+ typedef typename CovMapTraits<
Version, IntPtrT>::CovMapFuncRecordType FuncRecordType;
- typedef typename coverage::CovMapTraits<Version, IntPtrT>::NameRefType
- NameRefType;
+ typedef typename CovMapTraits<Version, IntPtrT>::NameRefType NameRefType;
// Maps function's name references to the indexes of their records
// in \c Records.
- llvm::DenseMap<NameRefType, size_t> FunctionRecords;
+ DenseMap<NameRefType, size_t> FunctionRecords;
InstrProfSymtab &ProfileNames;
std::vector<StringRef> &Filenames;
std::vector<BinaryCoverageReader::ProfileMappingRecord> &Records;
@@ -432,14 +452,16 @@ public:
std::vector<BinaryCoverageReader::ProfileMappingRecord> &R,
std::vector<StringRef> &F)
: ProfileNames(P), Filenames(F), Records(R) {}
- ~VersionedCovMapFuncRecordReader() override {}
+
+ ~VersionedCovMapFuncRecordReader() override = default;
Expected<const char *> readFunctionRecords(const char *Buf,
const char *End) override {
using namespace support;
+
if (Buf + sizeof(CovMapHeader) > End)
return make_error<CoverageMapError>(coveragemap_error::malformed);
- auto CovHeader = reinterpret_cast<const coverage::CovMapHeader *>(Buf);
+ auto CovHeader = reinterpret_cast<const CovMapHeader *>(Buf);
uint32_t NRecords = CovHeader->getNRecords<Endian>();
uint32_t FilenamesSize = CovHeader->getFilenamesSize<Endian>();
uint32_t CoverageSize = CovHeader->getCoverageSize<Endian>();
@@ -490,14 +512,16 @@ public:
return Buf;
}
};
+
} // end anonymous namespace
template <class IntPtrT, support::endianness Endian>
Expected<std::unique_ptr<CovMapFuncRecordReader>> CovMapFuncRecordReader::get(
- coverage::CovMapVersion Version, InstrProfSymtab &P,
+ CovMapVersion Version, InstrProfSymtab &P,
std::vector<BinaryCoverageReader::ProfileMappingRecord> &R,
std::vector<StringRef> &F) {
using namespace coverage;
+
switch (Version) {
case CovMapVersion::Version1:
return llvm::make_unique<VersionedCovMapFuncRecordReader<
@@ -518,11 +542,12 @@ static Error readCoverageMappingData(
std::vector<BinaryCoverageReader::ProfileMappingRecord> &Records,
std::vector<StringRef> &Filenames) {
using namespace coverage;
+
// Read the records in the coverage data section.
auto CovHeader =
- reinterpret_cast<const coverage::CovMapHeader *>(Data.data());
+ reinterpret_cast<const CovMapHeader *>(Data.data());
CovMapVersion Version = (CovMapVersion)CovHeader->getVersion<Endian>();
- if (Version > coverage::CovMapVersion::CurrentVersion)
+ if (Version > CovMapVersion::CurrentVersion)
return make_error<CoverageMapError>(coveragemap_error::unsupported_version);
Expected<std::unique_ptr<CovMapFuncRecordReader>> ReaderExpected =
CovMapFuncRecordReader::get<T, Endian>(Version, ProfileNames, Records,
@@ -538,6 +563,7 @@ static Error readCoverageMappingData(
}
return Error::success();
}
+
static const char *TestingFormatMagic = "llvmcovmtestdata";
static Error loadTestingFormat(StringRef Data, InstrProfSymtab &ProfileNames,
@@ -595,21 +621,21 @@ static Error loadBinaryFormat(MemoryBufferRef ObjectBuffer,
StringRef &CoverageMapping,
uint8_t &BytesInAddress,
support::endianness &Endian, StringRef Arch) {
- auto BinOrErr = object::createBinary(ObjectBuffer);
+ auto BinOrErr = createBinary(ObjectBuffer);
if (!BinOrErr)
return BinOrErr.takeError();
auto Bin = std::move(BinOrErr.get());
std::unique_ptr<ObjectFile> OF;
- if (auto *Universal = dyn_cast<object::MachOUniversalBinary>(Bin.get())) {
+ if (auto *Universal = dyn_cast<MachOUniversalBinary>(Bin.get())) {
// If we have a universal binary, try to look up the object for the
// appropriate architecture.
auto ObjectFileOrErr = Universal->getObjectForArch(Arch);
if (!ObjectFileOrErr)
return ObjectFileOrErr.takeError();
OF = std::move(ObjectFileOrErr.get());
- } else if (isa<object::ObjectFile>(Bin.get())) {
+ } else if (isa<ObjectFile>(Bin.get())) {
// For any other object file, upcast and take ownership.
- OF.reset(cast<object::ObjectFile>(Bin.release()));
+ OF.reset(cast<ObjectFile>(Bin.release()));
// If we've asked for a particular arch, make sure they match.
if (!Arch.empty() && OF->getArch() != Triple(Arch).getArch())
return errorCodeToError(object_error::arch_not_found);
@@ -623,11 +649,15 @@ static Error loadBinaryFormat(MemoryBufferRef ObjectBuffer,
: support::endianness::big;
// Look for the sections that we are interested in.
- auto NamesSection = lookupSection(*OF, getInstrProfNameSectionName(false));
+ auto ObjFormat = OF->getTripleObjectFormat();
+ auto NamesSection =
+ lookupSection(*OF, getInstrProfSectionName(IPSK_name, ObjFormat,
+ /*AddSegmentInfo=*/false));
if (auto E = NamesSection.takeError())
return E;
auto CoverageSection =
- lookupSection(*OF, getInstrProfCoverageSectionName(false));
+ lookupSection(*OF, getInstrProfSectionName(IPSK_covmap, ObjFormat,
+ /*AddSegmentInfo=*/false));
if (auto E = CoverageSection.takeError())
return E;
diff --git a/lib/ProfileData/Coverage/CoverageMappingWriter.cpp b/lib/ProfileData/Coverage/CoverageMappingWriter.cpp
index 82356333b937..f131be2cba49 100644
--- a/lib/ProfileData/Coverage/CoverageMappingWriter.cpp
+++ b/lib/ProfileData/Coverage/CoverageMappingWriter.cpp
@@ -1,4 +1,4 @@
-//=-- CoverageMappingWriter.cpp - Code coverage mapping writer -------------=//
+//===- CoverageMappingWriter.cpp - Code coverage mapping writer -----------===//
//
// The LLVM Compiler Infrastructure
//
@@ -12,8 +12,15 @@
//
//===----------------------------------------------------------------------===//
+#include "llvm/ADT/ArrayRef.h"
+#include "llvm/ADT/SmallVector.h"
#include "llvm/ProfileData/Coverage/CoverageMappingWriter.h"
#include "llvm/Support/LEB128.h"
+#include "llvm/Support/raw_ostream.h"
+#include <algorithm>
+#include <cassert>
+#include <limits>
+#include <vector>
using namespace llvm;
using namespace coverage;
@@ -27,14 +34,25 @@ void CoverageFilenamesSectionWriter::write(raw_ostream &OS) {
}
namespace {
+
/// \brief Gather only the expressions that are used by the mapping
/// regions in this function.
class CounterExpressionsMinimizer {
ArrayRef<CounterExpression> Expressions;
- llvm::SmallVector<CounterExpression, 16> UsedExpressions;
+ SmallVector<CounterExpression, 16> UsedExpressions;
std::vector<unsigned> AdjustedExpressionIDs;
public:
+ CounterExpressionsMinimizer(ArrayRef<CounterExpression> Expressions,
+ ArrayRef<CounterMappingRegion> MappingRegions)
+ : Expressions(Expressions) {
+ AdjustedExpressionIDs.resize(Expressions.size(), 0);
+ for (const auto &I : MappingRegions)
+ mark(I.Count);
+ for (const auto &I : MappingRegions)
+ gatherUsed(I.Count);
+ }
+
void mark(Counter C) {
if (!C.isExpression())
return;
@@ -54,16 +72,6 @@ public:
gatherUsed(E.RHS);
}
- CounterExpressionsMinimizer(ArrayRef<CounterExpression> Expressions,
- ArrayRef<CounterMappingRegion> MappingRegions)
- : Expressions(Expressions) {
- AdjustedExpressionIDs.resize(Expressions.size(), 0);
- for (const auto &I : MappingRegions)
- mark(I.Count);
- for (const auto &I : MappingRegions)
- gatherUsed(I.Count);
- }
-
ArrayRef<CounterExpression> getExpressions() const { return UsedExpressions; }
/// \brief Adjust the given counter to correctly transition from the old
@@ -74,7 +82,8 @@ public:
return C;
}
};
-}
+
+} // end anonymous namespace
/// \brief Encode the counter.
///
diff --git a/lib/ProfileData/InstrProf.cpp b/lib/ProfileData/InstrProf.cpp
index 74acd9e5e207..64a65ccc11a1 100644
--- a/lib/ProfileData/InstrProf.cpp
+++ b/lib/ProfileData/InstrProf.cpp
@@ -1,4 +1,4 @@
-//=-- InstrProf.cpp - Instrumented profiling format support -----------------=//
+//===- InstrProf.cpp - Instrumented profiling format support --------------===//
//
// The LLVM Compiler Infrastructure
//
@@ -12,29 +12,68 @@
//
//===----------------------------------------------------------------------===//
-#include "llvm/ProfileData/InstrProf.h"
+#include "llvm/ADT/ArrayRef.h"
+#include "llvm/ADT/SmallString.h"
+#include "llvm/ADT/SmallVector.h"
#include "llvm/ADT/StringExtras.h"
+#include "llvm/ADT/StringRef.h"
#include "llvm/ADT/Triple.h"
+#include "llvm/IR/Constant.h"
#include "llvm/IR/Constants.h"
#include "llvm/IR/Function.h"
+#include "llvm/IR/GlobalValue.h"
#include "llvm/IR/GlobalVariable.h"
+#include "llvm/IR/Instruction.h"
+#include "llvm/IR/LLVMContext.h"
#include "llvm/IR/MDBuilder.h"
+#include "llvm/IR/Metadata.h"
#include "llvm/IR/Module.h"
+#include "llvm/IR/Type.h"
+#include "llvm/ProfileData/InstrProf.h"
+#include "llvm/Support/Casting.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/Compiler.h"
#include "llvm/Support/Compression.h"
+#include "llvm/Support/Endian.h"
+#include "llvm/Support/Error.h"
#include "llvm/Support/ErrorHandling.h"
#include "llvm/Support/LEB128.h"
#include "llvm/Support/ManagedStatic.h"
+#include "llvm/Support/MathExtras.h"
#include "llvm/Support/Path.h"
+#include "llvm/Support/SwapByteOrder.h"
+#include <algorithm>
+#include <cassert>
+#include <cstddef>
+#include <cstring>
+#include <cstdint>
+#include <memory>
+#include <string>
+#include <system_error>
+#include <utility>
+#include <vector>
using namespace llvm;
static cl::opt<bool> StaticFuncFullModulePrefix(
- "static-func-full-module-prefix", cl::init(false),
+ "static-func-full-module-prefix", cl::init(true),
cl::desc("Use full module build paths in the profile counter names for "
"static functions."));
-namespace {
-std::string getInstrProfErrString(instrprof_error Err) {
+// This option is tailored to users that have different top-level directories
+// in profile-gen and profile-use compilation. Users need to specify the number
+// of levels to strip. A value larger than the number of directories in the
+// source file path will strip all directory names and leave only the basename.
+//
+// Note that current ThinLTO module importing for indirect calls assumes the
+// source directory name is not stripped. A non-zero option value here can
+// potentially prevent some inter-module indirect call promotions.
+static cl::opt<unsigned> StaticFuncStripDirNamePrefix(
+ "static-func-strip-dirname-prefix", cl::init(0),
+ cl::desc("Strip specified level of directory name from source path in "
+ "the profile counter name for static functions."));
+
+static std::string getInstrProfErrString(instrprof_error Err) {
switch (Err) {
case instrprof_error::success:
return "Success";
@@ -76,15 +115,19 @@ std::string getInstrProfErrString(instrprof_error Err) {
llvm_unreachable("A value of instrprof_error has no message.");
}
+namespace {
+
// FIXME: This class is only here to support the transition to llvm::Error. It
// will be removed once this transition is complete. Clients should prefer to
// deal with the Error value directly, rather than converting to error_code.
class InstrProfErrorCategoryType : public std::error_category {
const char *name() const noexcept override { return "llvm.instrprof"; }
+
std::string message(int IE) const override {
return getInstrProfErrString(static_cast<instrprof_error>(IE));
}
};
+
} // end anonymous namespace
static ManagedStatic<InstrProfErrorCategoryType> ErrorCategory;
@@ -93,8 +136,49 @@ const std::error_category &llvm::instrprof_category() {
return *ErrorCategory;
}
+namespace {
+
+const char *InstrProfSectNameCommon[] = {
+#define INSTR_PROF_SECT_ENTRY(Kind, SectNameCommon, SectNameCoff, Prefix) \
+ SectNameCommon,
+#include "llvm/ProfileData/InstrProfData.inc"
+};
+
+const char *InstrProfSectNameCoff[] = {
+#define INSTR_PROF_SECT_ENTRY(Kind, SectNameCommon, SectNameCoff, Prefix) \
+ SectNameCoff,
+#include "llvm/ProfileData/InstrProfData.inc"
+};
+
+const char *InstrProfSectNamePrefix[] = {
+#define INSTR_PROF_SECT_ENTRY(Kind, SectNameCommon, SectNameCoff, Prefix) \
+ Prefix,
+#include "llvm/ProfileData/InstrProfData.inc"
+};
+
+} // namespace
+
namespace llvm {
+std::string getInstrProfSectionName(InstrProfSectKind IPSK,
+ Triple::ObjectFormatType OF,
+ bool AddSegmentInfo) {
+ std::string SectName;
+
+ if (OF == Triple::MachO && AddSegmentInfo)
+ SectName = InstrProfSectNamePrefix[IPSK];
+
+ if (OF == Triple::COFF)
+ SectName += InstrProfSectNameCoff[IPSK];
+ else
+ SectName += InstrProfSectNameCommon[IPSK];
+
+ if (OF == Triple::MachO && IPSK == IPSK_data && AddSegmentInfo)
+ SectName += ",regular,live_support";
+
+ return SectName;
+}
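An illustrative call. The concrete strings come from the INSTR_PROF_SECT_ENTRY tables in InstrProfData.inc, so the MachO value shown is an assumption based on those tables:

  Triple TT("x86_64-apple-darwin");
  std::string S = getInstrProfSectionName(IPSK_data, TT.getObjectFormat(),
                                          /*AddSegmentInfo=*/true);
  // Expected to be "__DATA,__llvm_prf_data,regular,live_support" on MachO.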
+
void SoftInstrProfErrors::addError(instrprof_error IE) {
if (IE == instrprof_error::success)
return;
@@ -133,6 +217,24 @@ std::string getPGOFuncName(StringRef RawFuncName,
return GlobalValue::getGlobalIdentifier(RawFuncName, Linkage, FileName);
}
+// Strip NumPrefix levels of directory names from PathNameStr. If the number
+// of directory separators is less than NumPrefix, strip all directories and
+// leave only the base file name.
+static StringRef stripDirPrefix(StringRef PathNameStr, uint32_t NumPrefix) {
+ uint32_t Count = NumPrefix;
+ uint32_t Pos = 0, LastPos = 0;
+ for (auto &CI : PathNameStr) {
+ ++Pos;
+ if (llvm::sys::path::is_separator(CI)) {
+ LastPos = Pos;
+ --Count;
+ }
+ if (Count == 0)
+ break;
+ }
+ return PathNameStr.substr(LastPos);
+}
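Illustrative behavior, with hypothetical paths, following the loop above:

  assert(stripDirPrefix("a/b/c/file.cpp", 2) == "c/file.cpp");
  // Fewer separators than requested: everything but the basename goes.
  assert(stripDirPrefix("a/b/file.cpp", 5) == "file.cpp");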
+
// Return the PGOFuncName. This function has some special handling when called
// in LTO optimization. The following only applies when calling in LTO passes
// (when \c InLTO is true): LTO's internalization privatizes many global linkage
@@ -151,6 +253,8 @@ std::string getPGOFuncName(const Function &F, bool InLTO, uint64_t Version) {
StringRef FileName = (StaticFuncFullModulePrefix
? F.getParent()->getName()
: sys::path::filename(F.getParent()->getName()));
+ if (StaticFuncFullModulePrefix && StaticFuncStripDirNamePrefix != 0)
+ FileName = stripDirPrefix(FileName, StaticFuncStripDirNamePrefix);
return getPGOFuncName(F.getName(), F.getLinkage(), FileName, Version);
}
@@ -198,7 +302,6 @@ std::string getPGOFuncNameVarName(StringRef FuncName,
GlobalVariable *createPGOFuncNameVar(Module &M,
GlobalValue::LinkageTypes Linkage,
StringRef PGOFuncName) {
-
// We generally want to match the function's linkage, but available_externally
// and extern_weak both have the wrong semantics, and anything that doesn't
// need to link across compilation units doesn't need to be visible at all.
@@ -236,6 +339,17 @@ void InstrProfSymtab::create(Module &M, bool InLTO) {
const std::string &PGOFuncName = getPGOFuncName(F, InLTO);
addFuncName(PGOFuncName);
MD5FuncMap.emplace_back(Function::getGUID(PGOFuncName), &F);
+ // In ThinLTO, a local function may have been promoted to a global with a
+ // suffix added to its name. Add the stripped function name to the symbol
+ // table so that we can find a match in the profile.
+ if (InLTO) {
+ auto pos = PGOFuncName.find('.');
+ if (pos != std::string::npos) {
+ const std::string &OtherFuncName = PGOFuncName.substr(0, pos);
+ addFuncName(OtherFuncName);
+ MD5FuncMap.emplace_back(Function::getGUID(OtherFuncName), &F);
+ }
+ }
}
finalizeSymtab();
@@ -243,7 +357,7 @@ void InstrProfSymtab::create(Module &M, bool InLTO) {
Error collectPGOFuncNameStrings(const std::vector<std::string> &NameStrs,
bool doCompression, std::string &Result) {
- assert(NameStrs.size() && "No name data to emit");
+ assert(!NameStrs.empty() && "No name data to emit");
uint8_t Header[16], *P = Header;
std::string UncompressedNameStrings =
@@ -271,12 +385,12 @@ Error collectPGOFuncNameStrings(const std::vector<std::string> &NameStrs,
}
SmallString<128> CompressedNameStrings;
- zlib::Status Success =
- zlib::compress(StringRef(UncompressedNameStrings), CompressedNameStrings,
- zlib::BestSizeCompression);
-
- if (Success != zlib::StatusOK)
+ Error E = zlib::compress(StringRef(UncompressedNameStrings),
+ CompressedNameStrings, zlib::BestSizeCompression);
+ if (E) {
+ consumeError(std::move(E));
return make_error<InstrProfError>(instrprof_error::compress_failed);
+ }
return WriteStringToResult(CompressedNameStrings.size(),
CompressedNameStrings);
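The same Error-based contract applies to the uncompress path below; in isolation, a sketch of the new zlib API as exercised by this file (buffer names and sizes are illustrative):

  SmallString<128> Compressed, Uncompressed;
  if (Error E = zlib::compress("hello hello hello", Compressed,
                               zlib::BestSizeCompression))
    consumeError(std::move(E)); // e.g. LLVM built without zlib
  if (Error E = zlib::uncompress(Compressed, Uncompressed,
                                 /*UncompressedSize=*/17))
    consumeError(std::move(E));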
@@ -315,9 +429,12 @@ Error readPGOFuncNameStrings(StringRef NameStrings, InstrProfSymtab &Symtab) {
if (isCompressed) {
StringRef CompressedNameStrings(reinterpret_cast<const char *>(P),
CompressedSize);
- if (zlib::uncompress(CompressedNameStrings, UncompressedNameStrings,
- UncompressedSize) != zlib::StatusOK)
+ if (Error E =
+ zlib::uncompress(CompressedNameStrings, UncompressedNameStrings,
+ UncompressedSize)) {
+ consumeError(std::move(E));
return make_error<InstrProfError>(instrprof_error::uncompress_failed);
+ }
P += CompressedSize;
NameStrings = StringRef(UncompressedNameStrings.data(),
UncompressedNameStrings.size());
@@ -553,6 +670,7 @@ void ValueProfRecord::deserializeTo(InstrProfRecord &Record,
void ValueProfRecord::swapBytes(support::endianness Old,
support::endianness New) {
using namespace support;
+
if (Old == New)
return;
@@ -589,6 +707,7 @@ void ValueProfData::deserializeTo(InstrProfRecord &Record,
template <class T>
static T swapToHostOrder(const unsigned char *&D, support::endianness Orig) {
using namespace support;
+
if (Orig == little)
return endian::readNext<T, little, unaligned>(D);
else
@@ -623,6 +742,7 @@ ValueProfData::getValueProfData(const unsigned char *D,
const unsigned char *const BufferEnd,
support::endianness Endianness) {
using namespace support;
+
if (D + sizeof(ValueProfData) > BufferEnd)
return make_error<InstrProfError>(instrprof_error::truncated);
@@ -645,6 +765,7 @@ ValueProfData::getValueProfData(const unsigned char *D,
void ValueProfData::swapBytesToHost(support::endianness Endianness) {
using namespace support;
+
if (Endianness == getHostEndianness())
return;
@@ -660,6 +781,7 @@ void ValueProfData::swapBytesToHost(support::endianness Endianness) {
void ValueProfData::swapBytesFromHost(support::endianness Endianness) {
using namespace support;
+
if (Endianness == getHostEndianness())
return;
@@ -791,7 +913,7 @@ bool needsComdatForCounter(const Function &F, const Module &M) {
return true;
Triple TT(M.getTargetTriple());
- if (!TT.isOSBinFormatELF())
+ if (!TT.isOSBinFormatELF() && !TT.isOSBinFormatWasm())
return false;
// See createPGOFuncNameVar for more details. To avoid link errors, profile
@@ -854,4 +976,26 @@ bool canRenameComdatFunc(const Function &F, bool CheckAddressTaken) {
}
return true;
}
+
+// Parse the value profile options.
+void getMemOPSizeRangeFromOption(std::string MemOPSizeRange,
+ int64_t &RangeStart, int64_t &RangeLast) {
+ static const int64_t DefaultMemOPSizeRangeStart = 0;
+ static const int64_t DefaultMemOPSizeRangeLast = 8;
+ RangeStart = DefaultMemOPSizeRangeStart;
+ RangeLast = DefaultMemOPSizeRangeLast;
+
+ if (!MemOPSizeRange.empty()) {
+ auto Pos = MemOPSizeRange.find(":");
+ if (Pos != std::string::npos) {
+ if (Pos > 0)
+ RangeStart = atoi(MemOPSizeRange.substr(0, Pos).c_str());
+ if (Pos < MemOPSizeRange.size() - 1)
+ RangeLast = atoi(MemOPSizeRange.substr(Pos + 1).c_str());
+ } else
+ RangeLast = atoi(MemOPSizeRange.c_str());
+ }
+ assert(RangeLast >= RangeStart);
+}
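Illustrative results of the parsing above (the "start:last" option syntax):

  int64_t Start, Last;
  getMemOPSizeRangeFromOption("2:16", Start, Last); // Start == 2, Last == 16
  getMemOPSizeRangeFromOption(":16", Start, Last);  // Start == 0 (default), Last == 16
  getMemOPSizeRangeFromOption("24", Start, Last);   // Start == 0, Last == 24
  getMemOPSizeRangeFromOption("", Start, Last);     // defaults: 0 and 8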
+
} // end namespace llvm
diff --git a/lib/ProfileData/InstrProfReader.cpp b/lib/ProfileData/InstrProfReader.cpp
index ad407f07957f..856f793363f7 100644
--- a/lib/ProfileData/InstrProfReader.cpp
+++ b/lib/ProfileData/InstrProfReader.cpp
@@ -1,4 +1,4 @@
-//=-- InstrProfReader.cpp - Instrumented profiling reader -------------------=//
+//===- InstrProfReader.cpp - Instrumented profiling reader ----------------===//
//
// The LLVM Compiler Infrastructure
//
@@ -12,9 +12,27 @@
//
//===----------------------------------------------------------------------===//
-#include "llvm/ProfileData/InstrProfReader.h"
+#include "llvm/ADT/ArrayRef.h"
#include "llvm/ADT/STLExtras.h"
-#include <cassert>
+#include "llvm/ADT/StringRef.h"
+#include "llvm/IR/ProfileSummary.h"
+#include "llvm/ProfileData/InstrProf.h"
+#include "llvm/ProfileData/InstrProfReader.h"
+#include "llvm/ProfileData/ProfileCommon.h"
+#include "llvm/Support/Endian.h"
+#include "llvm/Support/Error.h"
+#include "llvm/Support/ErrorOr.h"
+#include "llvm/Support/MemoryBuffer.h"
+#include "llvm/Support/SwapByteOrder.h"
+#include <algorithm>
+#include <cctype>
+#include <cstddef>
+#include <cstdint>
+#include <limits>
+#include <memory>
+#include <system_error>
+#include <utility>
+#include <vector>
using namespace llvm;
@@ -78,7 +96,6 @@ IndexedInstrProfReader::create(const Twine &Path) {
return IndexedInstrProfReader::create(std::move(BufferOrError.get()));
}
-
Expected<std::unique_ptr<IndexedInstrProfReader>>
IndexedInstrProfReader::create(std::unique_ptr<MemoryBuffer> Buffer) {
// Sanity check the buffer.
@@ -182,7 +199,7 @@ TextInstrProfReader::readValueProfileData(InstrProfRecord &Record) {
CHECK_LINE_END(Line);
std::pair<StringRef, StringRef> VD = Line->rsplit(':');
uint64_t TakenCount, Value;
- if (VK == IPVK_IndirectCallTarget) {
+ if (ValueKind == IPVK_IndirectCallTarget) {
Symtab->addFuncName(VD.first);
Value = IndexedInstrProf::ComputeHash(VD.first);
} else {
@@ -192,7 +209,8 @@ TextInstrProfReader::readValueProfileData(InstrProfRecord &Record) {
CurrentValues.push_back({Value, TakenCount});
Line++;
}
- Record.addValueData(VK, S, CurrentValues.data(), NumValueData, nullptr);
+ Record.addValueData(ValueKind, S, CurrentValues.data(), NumValueData,
+ nullptr);
}
}
return success();
@@ -232,7 +250,7 @@ Error TextInstrProfReader::readNextRecord(InstrProfRecord &Record) {
return error(instrprof_error::malformed);
// Read each counter and fill our internal storage with the values.
- Record.Counts.clear();
+ Record.Clear();
Record.Counts.reserve(NumCounters);
for (uint64_t I = 0; I < NumCounters; ++I) {
if (Line.is_at_end())
@@ -398,7 +416,6 @@ Error RawInstrProfReader<IntPtrT>::readRawCounts(
template <class IntPtrT>
Error RawInstrProfReader<IntPtrT>::readValueProfilingData(
InstrProfRecord &Record) {
-
Record.clearValueData();
CurValueDataSize = 0;
// Need to match the logic in value profile dumper code in compiler-rt:
@@ -454,9 +471,11 @@ Error RawInstrProfReader<IntPtrT>::readNextRecord(InstrProfRecord &Record) {
}
namespace llvm {
+
template class RawInstrProfReader<uint32_t>;
template class RawInstrProfReader<uint64_t>;
-}
+
+} // end namespace llvm
InstrProfLookupTrait::hash_value_type
InstrProfLookupTrait::ComputeHash(StringRef K) {
@@ -482,6 +501,8 @@ bool InstrProfLookupTrait::readValueProfilingData(
data_type InstrProfLookupTrait::ReadData(StringRef K, const unsigned char *D,
offset_type N) {
+ using namespace support;
+
// Check if the data is corrupt. If so, don't try to read it.
if (N % sizeof(uint64_t))
return data_type();
@@ -489,7 +510,6 @@ data_type InstrProfLookupTrait::ReadData(StringRef K, const unsigned char *D,
DataBuffer.clear();
std::vector<uint64_t> CounterBuffer;
- using namespace support;
const unsigned char *End = D + N;
while (D < End) {
// Read hash.
@@ -567,9 +587,10 @@ InstrProfReaderIndex<HashTableImpl>::InstrProfReaderIndex(
}
bool IndexedInstrProfReader::hasFormat(const MemoryBuffer &DataBuffer) {
+ using namespace support;
+
if (DataBuffer.getBufferSize() < 8)
return false;
- using namespace support;
uint64_t Magic =
endian::read<uint64_t, little, aligned>(DataBuffer.getBufferStart());
// Verify that it's magical.
@@ -581,6 +602,7 @@ IndexedInstrProfReader::readSummary(IndexedInstrProf::ProfVersion Version,
const unsigned char *Cur) {
using namespace IndexedInstrProf;
using namespace support;
+
if (Version >= IndexedInstrProf::Version4) {
const IndexedInstrProf::Summary *SummaryInLE =
reinterpret_cast<const IndexedInstrProf::Summary *>(Cur);
@@ -617,6 +639,7 @@ IndexedInstrProfReader::readSummary(IndexedInstrProf::ProfVersion Version,
} else {
// For older version of profile data, we need to compute on the fly:
using namespace IndexedInstrProf;
+
InstrProfSummaryBuilder Builder(ProfileSummaryBuilder::DefaultCutoffs);
// FIXME: This only computes an empty summary. Need to call addRecord for
// all InstrProfRecords to get the correct summary.
@@ -626,14 +649,14 @@ IndexedInstrProfReader::readSummary(IndexedInstrProf::ProfVersion Version,
}
Error IndexedInstrProfReader::readHeader() {
+ using namespace support;
+
const unsigned char *Start =
(const unsigned char *)DataBuffer->getBufferStart();
const unsigned char *Cur = Start;
if ((const unsigned char *)DataBuffer->getBufferEnd() - Cur < 24)
return error(instrprof_error::truncated);
- using namespace support;
-
auto *Header = reinterpret_cast<const IndexedInstrProf::Header *>(Cur);
Cur += sizeof(IndexedInstrProf::Header);
diff --git a/lib/ProfileData/InstrProfWriter.cpp b/lib/ProfileData/InstrProfWriter.cpp
index 029d75660a73..6b7bd3b2fc0a 100644
--- a/lib/ProfileData/InstrProfWriter.cpp
+++ b/lib/ProfileData/InstrProfWriter.cpp
@@ -1,4 +1,4 @@
-//=-- InstrProfWriter.cpp - Instrumented profiling writer -------------------=//
+//===- InstrProfWriter.cpp - Instrumented profiling writer ----------------===//
//
// The LLVM Compiler Infrastructure
//
@@ -12,15 +12,21 @@
//
//===----------------------------------------------------------------------===//
-#include "llvm/ProfileData/InstrProfWriter.h"
+#include "llvm/ADT/STLExtras.h"
#include "llvm/ADT/StringRef.h"
#include "llvm/IR/ProfileSummary.h"
+#include "llvm/ProfileData/InstrProf.h"
+#include "llvm/ProfileData/InstrProfWriter.h"
#include "llvm/ProfileData/ProfileCommon.h"
+#include "llvm/Support/Endian.h"
#include "llvm/Support/EndianStream.h"
+#include "llvm/Support/Error.h"
#include "llvm/Support/MemoryBuffer.h"
#include "llvm/Support/OnDiskHashTable.h"
#include "llvm/Support/raw_ostream.h"
#include <algorithm>
+#include <cstdint>
+#include <memory>
#include <string>
#include <tuple>
#include <utility>
@@ -41,10 +47,9 @@ namespace llvm {
// A wrapper class that abstracts the writer stream, with support for
// back-patching bytes.
class ProfOStream {
-
public:
- ProfOStream(llvm::raw_fd_ostream &FD) : IsFDOStream(true), OS(FD), LE(FD) {}
- ProfOStream(llvm::raw_string_ostream &STR)
+ ProfOStream(raw_fd_ostream &FD) : IsFDOStream(true), OS(FD), LE(FD) {}
+ ProfOStream(raw_string_ostream &STR)
: IsFDOStream(false), OS(STR), LE(STR) {}
uint64_t tell() { return OS.tell(); }
@@ -55,15 +60,16 @@ public:
// directly and it won't be reflected in the stream's internal buffer.
void patch(PatchItem *P, int NItems) {
using namespace support;
+
if (IsFDOStream) {
- llvm::raw_fd_ostream &FDOStream = static_cast<llvm::raw_fd_ostream &>(OS);
+ raw_fd_ostream &FDOStream = static_cast<raw_fd_ostream &>(OS);
for (int K = 0; K < NItems; K++) {
FDOStream.seek(P[K].Pos);
for (int I = 0; I < P[K].N; I++)
write(P[K].D[I]);
}
} else {
- llvm::raw_string_ostream &SOStream =
+ raw_string_ostream &SOStream =
static_cast<llvm::raw_string_ostream &>(OS);
std::string &Data = SOStream.str(); // with flush
for (int K = 0; K < NItems; K++) {
@@ -94,17 +100,19 @@ public:
typedef uint64_t hash_value_type;
typedef uint64_t offset_type;
- support::endianness ValueProfDataEndianness;
+ support::endianness ValueProfDataEndianness = support::little;
InstrProfSummaryBuilder *SummaryBuilder;
- InstrProfRecordWriterTrait() : ValueProfDataEndianness(support::little) {}
+ InstrProfRecordWriterTrait() = default;
+
static hash_value_type ComputeHash(key_type_ref K) {
return IndexedInstrProf::ComputeHash(K);
}
static std::pair<offset_type, offset_type>
EmitKeyDataLength(raw_ostream &Out, key_type_ref K, data_type_ref V) {
- using namespace llvm::support;
+ using namespace support;
+
endian::Writer<little> LE(Out);
offset_type N = K.size();
@@ -130,7 +138,8 @@ public:
}
void EmitData(raw_ostream &Out, key_type_ref, data_type_ref V, offset_type) {
- using namespace llvm::support;
+ using namespace support;
+
endian::Writer<little> LE(Out);
for (const auto &ProfileData : *V) {
const InstrProfRecord &ProfRecord = ProfileData.second;
@@ -154,8 +163,7 @@ public:
} // end namespace llvm
InstrProfWriter::InstrProfWriter(bool Sparse)
- : Sparse(Sparse), FunctionData(), ProfileKind(PF_Unknown),
- InfoObj(new InstrProfRecordWriterTrait()) {}
+ : Sparse(Sparse), InfoObj(new InstrProfRecordWriterTrait()) {}
InstrProfWriter::~InstrProfWriter() { delete InfoObj; }
@@ -208,7 +216,7 @@ bool InstrProfWriter::shouldEncodeData(const ProfilingData &PD) {
return true;
for (const auto &Func : PD) {
const InstrProfRecord &IPR = Func.second;
- if (any_of(IPR.Counts, [](uint64_t Count) { return Count > 0; }))
+ if (llvm::any_of(IPR.Counts, [](uint64_t Count) { return Count > 0; }))
return true;
}
return false;
@@ -217,6 +225,7 @@ bool InstrProfWriter::shouldEncodeData(const ProfilingData &PD) {
static void setSummary(IndexedInstrProf::Summary *TheSummary,
ProfileSummary &PS) {
using namespace IndexedInstrProf;
+
std::vector<ProfileSummaryEntry> &Res = PS.getDetailedSummary();
TheSummary->NumSummaryFields = Summary::NumKinds;
TheSummary->NumCutoffEntries = Res.size();
@@ -231,9 +240,10 @@ static void setSummary(IndexedInstrProf::Summary *TheSummary,
}
void InstrProfWriter::writeImpl(ProfOStream &OS) {
+ using namespace IndexedInstrProf;
+
OnDiskChainedHashTableGenerator<InstrProfRecordWriterTrait> Generator;
- using namespace IndexedInstrProf;
InstrProfSummaryBuilder ISB(ProfileSummaryBuilder::DefaultCutoffs);
InfoObj->SummaryBuilder = &ISB;
@@ -301,7 +311,7 @@ void InstrProfWriter::write(raw_fd_ostream &OS) {
std::unique_ptr<MemoryBuffer> InstrProfWriter::writeBuffer() {
std::string Data;
- llvm::raw_string_ostream OS(Data);
+ raw_string_ostream OS(Data);
ProfOStream POS(OS);
// Write the hash table.
writeImpl(POS);
diff --git a/lib/ProfileData/SampleProf.cpp b/lib/ProfileData/SampleProf.cpp
index 5bcfff0801d5..eafdd2154b7b 100644
--- a/lib/ProfileData/SampleProf.cpp
+++ b/lib/ProfileData/SampleProf.cpp
@@ -13,18 +13,25 @@
//===----------------------------------------------------------------------===//
#include "llvm/ProfileData/SampleProf.h"
+#include "llvm/Support/Compiler.h"
+#include "llvm/Support/Debug.h"
#include "llvm/Support/ErrorHandling.h"
#include "llvm/Support/ManagedStatic.h"
+#include "llvm/Support/raw_ostream.h"
+#include <string>
+#include <system_error>
-using namespace llvm::sampleprof;
using namespace llvm;
+using namespace sampleprof;
namespace {
+
// FIXME: This class is only here to support the transition to llvm::Error. It
// will be removed once this transition is complete. Clients should prefer to
// deal with the Error value directly, rather than converting to error_code.
class SampleProfErrorCategoryType : public std::error_category {
const char *name() const noexcept override { return "llvm.sampleprof"; }
+
std::string message(int IE) const override {
sampleprof_error E = static_cast<sampleprof_error>(IE);
switch (E) {
@@ -54,7 +61,8 @@ class SampleProfErrorCategoryType : public std::error_category {
llvm_unreachable("A value of sampleprof_error has no message.");
}
};
-}
+
+} // end anonymous namespace
static ManagedStatic<SampleProfErrorCategoryType> ErrorCategory;
@@ -74,7 +82,9 @@ raw_ostream &llvm::sampleprof::operator<<(raw_ostream &OS,
return OS;
}
+#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
LLVM_DUMP_METHOD void LineLocation::dump() const { print(dbgs()); }
+#endif
/// \brief Print the sample record to the stream \p OS indented by \p Indent.
void SampleRecord::print(raw_ostream &OS, unsigned Indent) const {
@@ -87,7 +97,9 @@ void SampleRecord::print(raw_ostream &OS, unsigned Indent) const {
OS << "\n";
}
+#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
LLVM_DUMP_METHOD void SampleRecord::dump() const { print(dbgs(), 0); }
+#endif
raw_ostream &llvm::sampleprof::operator<<(raw_ostream &OS,
const SampleRecord &Sample) {
@@ -101,7 +113,7 @@ void FunctionSamples::print(raw_ostream &OS, unsigned Indent) const {
<< " sampled lines\n";
OS.indent(Indent);
- if (BodySamples.size() > 0) {
+ if (!BodySamples.empty()) {
OS << "Samples collected in the function's body {\n";
SampleSorter<LineLocation, SampleRecord> SortedBodySamples(BodySamples);
for (const auto &SI : SortedBodySamples.get()) {
@@ -115,14 +127,16 @@ void FunctionSamples::print(raw_ostream &OS, unsigned Indent) const {
}
OS.indent(Indent);
- if (CallsiteSamples.size() > 0) {
+ if (!CallsiteSamples.empty()) {
OS << "Samples collected in inlined callsites {\n";
- SampleSorter<LineLocation, FunctionSamples> SortedCallsiteSamples(
+ SampleSorter<LineLocation, FunctionSamplesMap> SortedCallsiteSamples(
CallsiteSamples);
for (const auto &CS : SortedCallsiteSamples.get()) {
- OS.indent(Indent + 2);
- OS << CS->first << ": inlined callee: " << CS->second.getName() << ": ";
- CS->second.print(OS, Indent + 4);
+ for (const auto &FS : CS->second) {
+ OS.indent(Indent + 2);
+ OS << CS->first << ": inlined callee: " << FS.second.getName() << ": ";
+ FS.second.print(OS, Indent + 4);
+ }
}
OS << "}\n";
} else {
@@ -136,4 +150,6 @@ raw_ostream &llvm::sampleprof::operator<<(raw_ostream &OS,
return OS;
}
-void FunctionSamples::dump(void) const { print(dbgs(), 0); }
+#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
+LLVM_DUMP_METHOD void FunctionSamples::dump() const { print(dbgs(), 0); }
+#endif
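The guard idiom applied throughout this file, in isolation (MyType is hypothetical): dump() bodies are compiled only in asserts builds or when LLVM_ENABLE_DUMP is defined, keeping them out of release binaries:

  #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
  LLVM_DUMP_METHOD void MyType::dump() const { print(dbgs(), 0); }
  #endif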
diff --git a/lib/ProfileData/SampleProfReader.cpp b/lib/ProfileData/SampleProfReader.cpp
index af80b036a5bb..234fe02ac8a8 100644
--- a/lib/ProfileData/SampleProfReader.cpp
+++ b/lib/ProfileData/SampleProfReader.cpp
@@ -23,14 +23,25 @@
#include "llvm/ProfileData/SampleProfReader.h"
#include "llvm/ADT/DenseMap.h"
#include "llvm/ADT/STLExtras.h"
-#include "llvm/Support/Debug.h"
+#include "llvm/ADT/StringRef.h"
+#include "llvm/IR/ProfileSummary.h"
+#include "llvm/ProfileData/ProfileCommon.h"
+#include "llvm/ProfileData/SampleProf.h"
#include "llvm/Support/ErrorOr.h"
#include "llvm/Support/LEB128.h"
#include "llvm/Support/LineIterator.h"
#include "llvm/Support/MemoryBuffer.h"
+#include "llvm/Support/raw_ostream.h"
+#include <algorithm>
+#include <cstddef>
+#include <cstdint>
+#include <limits>
+#include <memory>
+#include <system_error>
+#include <vector>
-using namespace llvm::sampleprof;
using namespace llvm;
+using namespace sampleprof;
/// \brief Dump the function profile for \p FName.
///
@@ -200,7 +211,7 @@ std::error_code SampleProfileReaderText::read() {
InlineStack.pop_back();
}
FunctionSamples &FSamples = InlineStack.back()->functionSamplesAt(
- LineLocation(LineOffset, Discriminator));
+ LineLocation(LineOffset, Discriminator))[FName];
FSamples.setName(FName);
MergeResult(Result, FSamples.addTotalSamples(NumSamples));
InlineStack.push_back(&FSamples);
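The shape of the new nested lookup, as a sketch; ParentSamples, LineOffset, Discriminator, and CalleeName stand in for the reader's locals:

  // functionSamplesAt(Loc) now yields a FunctionSamplesMap keyed by callee
  // name, so distinct callees inlined at one location no longer collide.
  FunctionSamples &CalleeFS = ParentSamples.functionSamplesAt(
      LineLocation(LineOffset, Discriminator))[CalleeName];
  CalleeFS.setName(CalleeName);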
@@ -352,8 +363,8 @@ SampleProfileReaderBinary::readProfile(FunctionSamples &FProfile) {
if (std::error_code EC = FName.getError())
return EC;
- FunctionSamples &CalleeProfile =
- FProfile.functionSamplesAt(LineLocation(*LineOffset, *Discriminator));
+ FunctionSamples &CalleeProfile = FProfile.functionSamplesAt(
+ LineLocation(*LineOffset, *Discriminator))[*FName];
CalleeProfile.setName(*FName);
if (std::error_code EC = readProfile(CalleeProfile))
return EC;
@@ -625,7 +636,7 @@ std::error_code SampleProfileReaderGCC::readOneFunctionProfile(
uint32_t LineOffset = Offset >> 16;
uint32_t Discriminator = Offset & 0xffff;
FProfile = &CallerProfile->functionSamplesAt(
- LineLocation(LineOffset, Discriminator));
+ LineLocation(LineOffset, Discriminator))[Name];
}
FProfile->setName(Name);
@@ -681,11 +692,9 @@ std::error_code SampleProfileReaderGCC::readOneFunctionProfile(
if (!GcovBuffer.readInt64(TargetCount))
return sampleprof_error::truncated;
- if (Update) {
- FunctionSamples &TargetProfile = Profiles[TargetName];
- TargetProfile.addCalledTargetSamples(LineOffset, Discriminator,
- TargetName, TargetCount);
- }
+ if (Update)
+ FProfile->addCalledTargetSamples(LineOffset, Discriminator,
+ TargetName, TargetCount);
}
}
diff --git a/lib/ProfileData/SampleProfWriter.cpp b/lib/ProfileData/SampleProfWriter.cpp
index 4fa71288f8d9..b91b6fb7c7ad 100644
--- a/lib/ProfileData/SampleProfWriter.cpp
+++ b/lib/ProfileData/SampleProfWriter.cpp
@@ -18,16 +18,23 @@
//
//===----------------------------------------------------------------------===//
+#include "llvm/ADT/StringRef.h"
+#include "llvm/ProfileData/ProfileCommon.h"
+#include "llvm/ProfileData/SampleProf.h"
#include "llvm/ProfileData/SampleProfWriter.h"
-#include "llvm/Support/Debug.h"
#include "llvm/Support/ErrorOr.h"
+#include "llvm/Support/FileSystem.h"
#include "llvm/Support/LEB128.h"
-#include "llvm/Support/LineIterator.h"
-#include "llvm/Support/MemoryBuffer.h"
-#include "llvm/Support/Regex.h"
+#include "llvm/Support/raw_ostream.h"
+#include <algorithm>
+#include <cstdint>
+#include <memory>
+#include <system_error>
+#include <utility>
+#include <vector>
-using namespace llvm::sampleprof;
using namespace llvm;
+using namespace sampleprof;
/// \brief Write samples to a text file.
///
@@ -61,20 +68,21 @@ std::error_code SampleProfileWriterText::write(const FunctionSamples &S) {
OS << "\n";
}
- SampleSorter<LineLocation, FunctionSamples> SortedCallsiteSamples(
+ SampleSorter<LineLocation, FunctionSamplesMap> SortedCallsiteSamples(
S.getCallsiteSamples());
Indent += 1;
- for (const auto &I : SortedCallsiteSamples.get()) {
- LineLocation Loc = I->first;
- const FunctionSamples &CalleeSamples = I->second;
- OS.indent(Indent);
- if (Loc.Discriminator == 0)
- OS << Loc.LineOffset << ": ";
- else
- OS << Loc.LineOffset << "." << Loc.Discriminator << ": ";
- if (std::error_code EC = write(CalleeSamples))
- return EC;
- }
+ for (const auto &I : SortedCallsiteSamples.get())
+ for (const auto &FS : I->second) {
+ LineLocation Loc = I->first;
+ const FunctionSamples &CalleeSamples = FS.second;
+ OS.indent(Indent);
+ if (Loc.Discriminator == 0)
+ OS << Loc.LineOffset << ": ";
+ else
+ OS << Loc.LineOffset << "." << Loc.Discriminator << ": ";
+ if (std::error_code EC = write(CalleeSamples))
+ return EC;
+ }
Indent -= 1;
return sampleprof_error::success;
@@ -102,11 +110,12 @@ void SampleProfileWriterBinary::addNames(const FunctionSamples &S) {
}
// Recursively add all the names for inlined callsites.
- for (const auto &J : S.getCallsiteSamples()) {
- const FunctionSamples &CalleeSamples = J.second;
- addName(CalleeSamples.getName());
- addNames(CalleeSamples);
- }
+ for (const auto &J : S.getCallsiteSamples())
+ for (const auto &FS : J.second) {
+ const FunctionSamples &CalleeSamples = FS.second;
+ addName(CalleeSamples.getName());
+ addNames(CalleeSamples);
+ }
}
std::error_code SampleProfileWriterBinary::writeHeader(
@@ -180,14 +189,15 @@ std::error_code SampleProfileWriterBinary::writeBody(const FunctionSamples &S) {
// Recursively emit all the callsite samples.
encodeULEB128(S.getCallsiteSamples().size(), OS);
- for (const auto &J : S.getCallsiteSamples()) {
- LineLocation Loc = J.first;
- const FunctionSamples &CalleeSamples = J.second;
- encodeULEB128(Loc.LineOffset, OS);
- encodeULEB128(Loc.Discriminator, OS);
- if (std::error_code EC = writeBody(CalleeSamples))
- return EC;
- }
+ for (const auto &J : S.getCallsiteSamples())
+ for (const auto &FS : J.second) {
+ LineLocation Loc = J.first;
+ const FunctionSamples &CalleeSamples = FS.second;
+ encodeULEB128(Loc.LineOffset, OS);
+ encodeULEB128(Loc.Discriminator, OS);
+ if (std::error_code EC = writeBody(CalleeSamples))
+ return EC;
+ }
return sampleprof_error::success;
}
diff --git a/lib/Support/APFloat.cpp b/lib/Support/APFloat.cpp
index 4cfbbf8645e0..9778628911cd 100644
--- a/lib/Support/APFloat.cpp
+++ b/lib/Support/APFloat.cpp
@@ -26,8 +26,21 @@
#include <cstring>
#include <limits.h>
+#define APFLOAT_DISPATCH_ON_SEMANTICS(METHOD_CALL) \
+ do { \
+ if (usesLayout<IEEEFloat>(getSemantics())) \
+ return U.IEEE.METHOD_CALL; \
+ if (usesLayout<DoubleAPFloat>(getSemantics())) \
+ return U.Double.METHOD_CALL; \
+ llvm_unreachable("Unexpected semantics"); \
+ } while (false)
+
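A sketch of how the macro is used by APFloat methods declared in APFloat.h; isDenormal is shown as a representative, and the real accessors follow this shape:

  bool APFloat::isDenormal() const {
    // Forward to whichever union member matches the active semantics.
    APFLOAT_DISPATCH_ON_SEMANTICS(isDenormal());
  }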
using namespace llvm;
+// TODO: Remove these and use APInt qualified types directly.
+typedef APInt::WordType integerPart;
+const unsigned int integerPartWidth = APInt::APINT_BITS_PER_WORD;
+
/// A macro used to combine two fcCategory enums into one key which can be used
/// in a switch statement to classify how the interaction of two APFloat's
/// categories affects an operation.
@@ -66,33 +79,43 @@ namespace llvm {
static const fltSemantics semX87DoubleExtended = {16383, -16382, 64, 80};
static const fltSemantics semBogus = {0, 0, 0, 0};
- /* The PowerPC format consists of two doubles. It does not map cleanly
- onto the usual format above. It is approximated using twice the
- mantissa bits. Note that for exponents near the double minimum,
- we no longer can represent the full 106 mantissa bits, so those
- will be treated as denormal numbers.
-
- FIXME: While this approximation is equivalent to what GCC uses for
- compile-time arithmetic on PPC double-double numbers, it is not able
- to represent all possible values held by a PPC double-double number,
- for example: (long double) 1.0 + (long double) 0x1p-106
- Should this be replaced by a full emulation of PPC double-double?
+ /* The IBM double-double semantics. Such a number consists of a pair of IEEE
+ 64-bit doubles (Hi, Lo), where |Hi| > |Lo|, and if normal,
+ (double)(Hi + Lo) == Hi. The numeric value it's modeling is Hi + Lo.
+ Therefore it has two 53-bit mantissa parts that aren't necessarily adjacent
+ to each other, and two 11-bit exponents.
Note: we need to make the value different from semBogus as otherwise
an unsafe optimization may collapse both values to a single address,
and we heavily rely on them having distinct addresses. */
static const fltSemantics semPPCDoubleDouble = {-1, 0, 0, 0};
- /* There are temporary semantics for the real PPCDoubleDouble implementation.
- Currently, APFloat of PPCDoubleDouble holds one PPCDoubleDoubleImpl as the
- high part of double double, and one IEEEdouble as the low part, so that
- the old operations operate on PPCDoubleDoubleImpl, while the newly added
- operations also populate the IEEEdouble.
+ /* These are legacy semantics for the fallback, inaccurate implementation of
+ IBM double-double, if the accurate semPPCDoubleDouble doesn't handle the
+ operation. It's equivalent to having an IEEE number with 106 consecutive
+ bits of mantissa and 11 bits of exponent.
+
+ It's not equivalent to IBM double-double. For example, a valid IBM
+ double-double, 1 + epsilon:
+
+ 1 + epsilon = 1 + (1 >> 1076)
- TODO: Once all functions support DoubleAPFloat mode, we'll change all
- PPCDoubleDoubleImpl to IEEEdouble and remove PPCDoubleDoubleImpl. */
- static const fltSemantics semPPCDoubleDoubleImpl = {1023, -1022 + 53, 53 + 53,
- 128};
+ is not representable with 106 consecutive bits of mantissa.
+
+ Currently, these semantics are used in the following way:
+
+ semPPCDoubleDouble -> (IEEEdouble, IEEEdouble) ->
+ (64-bit APInt, 64-bit APInt) -> (128-bit APInt) ->
+ semPPCDoubleDoubleLegacy -> IEEE operations
+
+ We use bitcastToAPInt() to get the bit representation (in APInt) of the
+ underlying IEEEdouble, then use the APInt constructor to construct the
+ legacy IEEE float.
+
+ TODO: Implement all operations in semPPCDoubleDouble, and delete these
+ semantics. */
+ static const fltSemantics semPPCDoubleDoubleLegacy = {1023, -1022 + 53,
+ 53 + 53, 128};
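/* Concretely: with Hi = 1.0 and Lo equal to a tiny subnormal such as 2^-1074,
   the pair is a valid double-double, but expressing Hi + Lo as one binary
   significand would need bits from position 0 down to position -1074, vastly
   more than the 106 consecutive bits these legacy semantics provide. */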
const fltSemantics &APFloatBase::IEEEhalf() {
return semIEEEhalf;
@@ -742,7 +765,7 @@ IEEEFloat &IEEEFloat::operator=(IEEEFloat &&rhs) {
bool IEEEFloat::isDenormal() const {
return isFiniteNonZero() && (exponent == semantics->minExponent) &&
- (APInt::tcExtractBit(significandParts(),
+ (APInt::tcExtractBit(significandParts(),
semantics->precision - 1) == 0);
}
@@ -862,11 +885,6 @@ IEEEFloat::IEEEFloat(IEEEFloat &&rhs) : semantics(&semBogus) {
IEEEFloat::~IEEEFloat() { freeSignificand(); }
-// Profile - This method 'profiles' an APFloat for use with FoldingSet.
-void IEEEFloat::Profile(FoldingSetNodeID &ID) const {
- ID.Add(bitcastToAPInt());
-}
-
unsigned int IEEEFloat::partCount() const {
return partCountForBits(semantics->precision + 1);
}
@@ -966,14 +984,14 @@ lostFraction IEEEFloat::multiplySignificand(const IEEEFloat &rhs,
// rhs = b23 . b22 ... b0 * 2^e2
// the result of multiplication is:
// *this = c48 c47 c46 . c45 ... c0 * 2^(e1+e2)
- // Note that there are three significant bits at the left-hand side of the
+ // Note that there are three significant bits at the left-hand side of the
// radix point: two for the multiplication, and an overflow bit for the
// addition (that will always be zero at this point). Move the radix point
// toward left by two bits, and adjust exponent accordingly.
exponent += 2;
if (addend && addend->isNonZero()) {
- // The intermediate result of the multiplication has "2 * precision"
+ // The intermediate result of the multiplication has "2 * precision"
// significant bits; adjust the addend to be consistent with the mul result.
//
Significand savedSignificand = significand;
@@ -1025,7 +1043,7 @@ lostFraction IEEEFloat::multiplySignificand(const IEEEFloat &rhs,
}
// Convert the result having "2 * precision" significant-bits back to the one
- // having "precision" significant-bits. First, move the radix point from
+ // having "precision" significant-bits. First, move the radix point from
// position "2*precision - 1" to "precision - 1". The exponent needs to be
// adjusted by "2*precision - 1" - "precision - 1" = "precision".
exponent -= precision + 1;
@@ -1611,16 +1629,6 @@ void IEEEFloat::changeSign() {
sign = !sign;
}
-void IEEEFloat::clearSign() {
- /* So is this one. */
- sign = 0;
-}
-
-void IEEEFloat::copySign(const IEEEFloat &rhs) {
- /* And this one. */
- sign = rhs.sign;
-}
-
/* Normalized addition or subtraction. */
IEEEFloat::opStatus IEEEFloat::addOrSubtract(const IEEEFloat &rhs,
roundingMode rounding_mode,
@@ -1712,9 +1720,10 @@ IEEEFloat::opStatus IEEEFloat::remainder(const IEEEFloat &rhs) {
int parts = partCount();
integerPart *x = new integerPart[parts];
bool ignored;
- fs = V.convertToInteger(x, parts * integerPartWidth, true,
- rmNearestTiesToEven, &ignored);
- if (fs==opInvalidOp) {
+ fs = V.convertToInteger(makeMutableArrayRef(x, parts),
+ parts * integerPartWidth, true, rmNearestTiesToEven,
+ &ignored);
+ if (fs == opInvalidOp) {
delete[] x;
return fs;
}
@@ -1735,43 +1744,20 @@ IEEEFloat::opStatus IEEEFloat::remainder(const IEEEFloat &rhs) {
return fs;
}
-/* Normalized llvm frem (C fmod).
- This is not currently correct in all cases. */
+/* Normalized llvm frem (C fmod). */
IEEEFloat::opStatus IEEEFloat::mod(const IEEEFloat &rhs) {
opStatus fs;
fs = modSpecials(rhs);
- if (isFiniteNonZero() && rhs.isFiniteNonZero()) {
- IEEEFloat V = *this;
- unsigned int origSign = sign;
-
- fs = V.divide(rhs, rmNearestTiesToEven);
- if (fs == opDivByZero)
- return fs;
-
- int parts = partCount();
- integerPart *x = new integerPart[parts];
- bool ignored;
- fs = V.convertToInteger(x, parts * integerPartWidth, true,
- rmTowardZero, &ignored);
- if (fs==opInvalidOp) {
- delete[] x;
- return fs;
- }
-
- fs = V.convertFromZeroExtendedInteger(x, parts * integerPartWidth, true,
- rmNearestTiesToEven);
- assert(fs==opOK); // should always work
-
- fs = V.multiply(rhs, rmNearestTiesToEven);
- assert(fs==opOK || fs==opInexact); // should not overflow or underflow
-
+ while (isFiniteNonZero() && rhs.isFiniteNonZero() &&
+ compareAbsoluteValue(rhs) != cmpLessThan) {
+ IEEEFloat V = scalbn(rhs, ilogb(*this) - ilogb(rhs), rmNearestTiesToEven);
+ if (compareAbsoluteValue(V) == cmpLessThan)
+ V = scalbn(V, -1, rmNearestTiesToEven);
+ V.sign = sign;
+
fs = subtract(V, rmNearestTiesToEven);
- assert(fs==opOK || fs==opInexact); // likewise
-
- if (isZero())
- sign = origSign; // IEEE754 requires this
- delete[] x;
+ assert(fs==opOK);
}
return fs;
}
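// The new loop is shift-and-subtract long division that keeps only the
// remainder. The same idea on plain doubles, as a hedged sketch (finite,
// nonzero operands assumed; not the APFloat API):
//
//   #include <cmath>
//   double fmodSketch(double x, double y) {
//     double ax = std::fabs(x), ay = std::fabs(y);
//     while (ax >= ay) {
//       // Scale y up to x's binade; back off one binade on overshoot.
//       double v = std::scalbn(ay, std::ilogb(ax) - std::ilogb(ay));
//       if (v > ax)
//         v = std::scalbn(v, -1);
//       ax -= v;  // exact, since v <= ax < 2v, hence the assert(fs==opOK)
//     }
//     return std::copysign(ax, x);  // C fmod takes the sign of x
//   }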
@@ -1840,7 +1826,7 @@ IEEEFloat::opStatus IEEEFloat::roundToIntegral(roundingMode rounding_mode) {
IEEEFloat MagicConstant(*semantics);
fs = MagicConstant.convertFromAPInt(IntegerConstant, false,
rmNearestTiesToEven);
- MagicConstant.copySign(*this);
+ MagicConstant.sign = sign;
if (fs != opOK)
return fs;
@@ -2047,7 +2033,7 @@ IEEEFloat::opStatus IEEEFloat::convert(const fltSemantics &toSemantics,
Note that for conversions to integer type the C standard requires
round-to-zero to always be used. */
IEEEFloat::opStatus IEEEFloat::convertToSignExtendedInteger(
- integerPart *parts, unsigned int width, bool isSigned,
+ MutableArrayRef<integerPart> parts, unsigned int width, bool isSigned,
roundingMode rounding_mode, bool *isExact) const {
lostFraction lost_fraction;
const integerPart *src;
@@ -2060,9 +2046,10 @@ IEEEFloat::opStatus IEEEFloat::convertToSignExtendedInteger(
return opInvalidOp;
dstPartsCount = partCountForBits(width);
+ assert(dstPartsCount <= parts.size() && "Integer too big");
if (category == fcZero) {
- APInt::tcSet(parts, 0, dstPartsCount);
+ APInt::tcSet(parts.data(), 0, dstPartsCount);
// Negative zero can't be represented as an int.
*isExact = !sign;
return opOK;
@@ -2074,7 +2061,7 @@ IEEEFloat::opStatus IEEEFloat::convertToSignExtendedInteger(
the destination. */
if (exponent < 0) {
/* Our absolute value is less than one; truncate everything. */
- APInt::tcSet(parts, 0, dstPartsCount);
+ APInt::tcSet(parts.data(), 0, dstPartsCount);
/* For exponent -1 the integer bit represents .5, look at that.
For smaller exponents leftmost truncated bit is 0. */
truncatedBits = semantics->precision -1U - exponent;
@@ -2090,11 +2077,13 @@ IEEEFloat::opStatus IEEEFloat::convertToSignExtendedInteger(
if (bits < semantics->precision) {
/* We truncate (semantics->precision - bits) bits. */
truncatedBits = semantics->precision - bits;
- APInt::tcExtract(parts, dstPartsCount, src, bits, truncatedBits);
+ APInt::tcExtract(parts.data(), dstPartsCount, src, bits, truncatedBits);
} else {
/* We want at least as many bits as are available. */
- APInt::tcExtract(parts, dstPartsCount, src, semantics->precision, 0);
- APInt::tcShiftLeft(parts, dstPartsCount, bits - semantics->precision);
+ APInt::tcExtract(parts.data(), dstPartsCount, src, semantics->precision,
+ 0);
+ APInt::tcShiftLeft(parts.data(), dstPartsCount,
+ bits - semantics->precision);
truncatedBits = 0;
}
}
@@ -2106,7 +2095,7 @@ IEEEFloat::opStatus IEEEFloat::convertToSignExtendedInteger(
truncatedBits);
if (lost_fraction != lfExactlyZero &&
roundAwayFromZero(rounding_mode, lost_fraction, truncatedBits)) {
- if (APInt::tcIncrement(parts, dstPartsCount))
+ if (APInt::tcIncrement(parts.data(), dstPartsCount))
return opInvalidOp; /* Overflow. */
}
} else {
@@ -2114,7 +2103,7 @@ IEEEFloat::opStatus IEEEFloat::convertToSignExtendedInteger(
}
/* Step 3: check if we fit in the destination. */
- unsigned int omsb = APInt::tcMSB(parts, dstPartsCount) + 1;
+ unsigned int omsb = APInt::tcMSB(parts.data(), dstPartsCount) + 1;
if (sign) {
if (!isSigned) {
@@ -2125,7 +2114,8 @@ IEEEFloat::opStatus IEEEFloat::convertToSignExtendedInteger(
/* It takes omsb bits to represent the unsigned integer value.
We lose a bit for the sign, but care is needed as the
maximally negative integer is a special case. */
- if (omsb == width && APInt::tcLSB(parts, dstPartsCount) + 1 != omsb)
+ if (omsb == width &&
+ APInt::tcLSB(parts.data(), dstPartsCount) + 1 != omsb)
return opInvalidOp;
/* This case can happen because of rounding. */
@@ -2133,7 +2123,7 @@ IEEEFloat::opStatus IEEEFloat::convertToSignExtendedInteger(
return opInvalidOp;
}
- APInt::tcNegate (parts, dstPartsCount);
+ APInt::tcNegate (parts.data(), dstPartsCount);
} else {
if (omsb >= width + !isSigned)
return opInvalidOp;
@@ -2155,11 +2145,10 @@ IEEEFloat::opStatus IEEEFloat::convertToSignExtendedInteger(
the original value. This is almost equivalent to result==opOK,
except for negative zeroes.
*/
-IEEEFloat::opStatus IEEEFloat::convertToInteger(integerPart *parts,
- unsigned int width,
- bool isSigned,
- roundingMode rounding_mode,
- bool *isExact) const {
+IEEEFloat::opStatus
+IEEEFloat::convertToInteger(MutableArrayRef<integerPart> parts,
+ unsigned int width, bool isSigned,
+ roundingMode rounding_mode, bool *isExact) const {
opStatus fs;
fs = convertToSignExtendedInteger(parts, width, isSigned, rounding_mode,
@@ -2169,6 +2158,7 @@ IEEEFloat::opStatus IEEEFloat::convertToInteger(integerPart *parts,
unsigned int bits, dstPartsCount;
dstPartsCount = partCountForBits(width);
+ assert(dstPartsCount <= parts.size() && "Integer too big");
if (category == fcNaN)
bits = 0;
@@ -2177,30 +2167,14 @@ IEEEFloat::opStatus IEEEFloat::convertToInteger(integerPart *parts,
else
bits = width - isSigned;
- APInt::tcSetLeastSignificantBits(parts, dstPartsCount, bits);
+ APInt::tcSetLeastSignificantBits(parts.data(), dstPartsCount, bits);
if (sign && isSigned)
- APInt::tcShiftLeft(parts, dstPartsCount, width - 1);
+ APInt::tcShiftLeft(parts.data(), dstPartsCount, width - 1);
}
return fs;
}
-/* Same as convertToInteger(integerPart*, ...), except the result is returned in
- an APSInt, whose initial bit-width and signed-ness are used to determine the
- precision of the conversion.
- */
-IEEEFloat::opStatus IEEEFloat::convertToInteger(APSInt &result,
- roundingMode rounding_mode,
- bool *isExact) const {
- unsigned bitWidth = result.getBitWidth();
- SmallVector<uint64_t, 4> parts(result.getNumWords());
- opStatus status = convertToInteger(
- parts.data(), bitWidth, result.isSigned(), rounding_mode, isExact);
- // Keeps the original signed-ness.
- result = APInt(bitWidth, parts);
- return status;
-}
-
/* Convert an unsigned integer SRC to a floating point number,
rounding according to ROUNDING_MODE. The sign of the floating
point number is not modified. */
@@ -2484,7 +2458,7 @@ IEEEFloat::convertFromDecimalString(StringRef str, roundingMode rounding_mode) {
// Test if we have a zero number allowing for strings with no null terminators
// and zero decimals with non-zero exponents.
- //
+ //
// We computed firstSigDigit by ignoring all zeros and dots. Thus if
// D->firstSigDigit equals str.end(), every digit must be a zero and there can
// be at most one dot. On the other hand, if we have a zero with a non-zero
@@ -2852,7 +2826,7 @@ APInt IEEEFloat::convertF80LongDoubleAPFloatToAPInt() const {
}
APInt IEEEFloat::convertPPCDoubleDoubleAPFloatToAPInt() const {
- assert(semantics == (const llvm::fltSemantics *)&semPPCDoubleDoubleImpl);
+ assert(semantics == (const llvm::fltSemantics *)&semPPCDoubleDoubleLegacy);
assert(partCount()==2);
uint64_t words[2];
@@ -3033,7 +3007,7 @@ APInt IEEEFloat::bitcastToAPInt() const {
if (semantics == (const llvm::fltSemantics*)&semIEEEquad)
return convertQuadrupleAPFloatToAPInt();
- if (semantics == (const llvm::fltSemantics *)&semPPCDoubleDoubleImpl)
+ if (semantics == (const llvm::fltSemantics *)&semPPCDoubleDoubleLegacy)
return convertPPCDoubleDoubleAPFloatToAPInt();
assert(semantics == (const llvm::fltSemantics*)&semX87DoubleExtended &&
@@ -3103,14 +3077,14 @@ void IEEEFloat::initFromPPCDoubleDoubleAPInt(const APInt &api) {
// Get the first double and convert to our format.
initFromDoubleAPInt(APInt(64, i1));
- fs = convert(semPPCDoubleDoubleImpl, rmNearestTiesToEven, &losesInfo);
+ fs = convert(semPPCDoubleDoubleLegacy, rmNearestTiesToEven, &losesInfo);
assert(fs == opOK && !losesInfo);
(void)fs;
// Unless we have a special case, add in second double.
if (isFiniteNonZero()) {
IEEEFloat v(semIEEEdouble, APInt(64, i2));
- fs = v.convert(semPPCDoubleDoubleImpl, rmNearestTiesToEven, &losesInfo);
+ fs = v.convert(semPPCDoubleDoubleLegacy, rmNearestTiesToEven, &losesInfo);
assert(fs == opOK && !losesInfo);
(void)fs;
@@ -3264,7 +3238,7 @@ void IEEEFloat::initFromAPInt(const fltSemantics *Sem, const APInt &api) {
return initFromF80LongDoubleAPInt(api);
if (Sem == &semIEEEquad)
return initFromQuadrupleAPInt(api);
- if (Sem == &semPPCDoubleDoubleImpl)
+ if (Sem == &semPPCDoubleDoubleLegacy)
return initFromPPCDoubleDoubleAPInt(api);
llvm_unreachable(nullptr);
@@ -3620,7 +3594,7 @@ void IEEEFloat::toString(SmallVectorImpl<char> &Str, unsigned FormatPrecision,
Str.push_back(buffer[NDigits-I-1]);
}
-bool IEEEFloat::getExactInverse(IEEEFloat *inv) const {
+bool IEEEFloat::getExactInverse(APFloat *inv) const {
// Special floats and denormals have no exact inverse.
if (!isFiniteNonZero())
return false;
@@ -3644,7 +3618,7 @@ bool IEEEFloat::getExactInverse(IEEEFloat *inv) const {
reciprocal.significandLSB() == reciprocal.semantics->precision - 1);
if (inv)
- *inv = reciprocal;
+ *inv = APFloat(reciprocal, *semantics);
return true;
}
@@ -3856,28 +3830,29 @@ IEEEFloat frexp(const IEEEFloat &Val, int &Exp, IEEEFloat::roundingMode RM) {
}
DoubleAPFloat::DoubleAPFloat(const fltSemantics &S)
- : Semantics(&S), Floats(new APFloat[2]{APFloat(semPPCDoubleDoubleImpl),
- APFloat(semIEEEdouble)}) {
+ : Semantics(&S),
+ Floats(new APFloat[2]{APFloat(semIEEEdouble), APFloat(semIEEEdouble)}) {
assert(Semantics == &semPPCDoubleDouble);
}
DoubleAPFloat::DoubleAPFloat(const fltSemantics &S, uninitializedTag)
: Semantics(&S),
- Floats(new APFloat[2]{APFloat(semPPCDoubleDoubleImpl, uninitialized),
+ Floats(new APFloat[2]{APFloat(semIEEEdouble, uninitialized),
APFloat(semIEEEdouble, uninitialized)}) {
assert(Semantics == &semPPCDoubleDouble);
}
DoubleAPFloat::DoubleAPFloat(const fltSemantics &S, integerPart I)
- : Semantics(&S), Floats(new APFloat[2]{APFloat(semPPCDoubleDoubleImpl, I),
+ : Semantics(&S), Floats(new APFloat[2]{APFloat(semIEEEdouble, I),
APFloat(semIEEEdouble)}) {
assert(Semantics == &semPPCDoubleDouble);
}
DoubleAPFloat::DoubleAPFloat(const fltSemantics &S, const APInt &I)
- : Semantics(&S), Floats(new APFloat[2]{
- APFloat(semPPCDoubleDoubleImpl, I),
- APFloat(semIEEEdouble, APInt(64, I.getRawData()[1]))}) {
+ : Semantics(&S),
+ Floats(new APFloat[2]{
+ APFloat(semIEEEdouble, APInt(64, I.getRawData()[0])),
+ APFloat(semIEEEdouble, APInt(64, I.getRawData()[1]))}) {
assert(Semantics == &semPPCDoubleDouble);
}
@@ -3886,9 +3861,7 @@ DoubleAPFloat::DoubleAPFloat(const fltSemantics &S, APFloat &&First,
: Semantics(&S),
Floats(new APFloat[2]{std::move(First), std::move(Second)}) {
assert(Semantics == &semPPCDoubleDouble);
- // TODO Check for First == &IEEEdouble once the transition is done.
- assert(&Floats[0].getSemantics() == &semPPCDoubleDoubleImpl ||
- &Floats[0].getSemantics() == &semIEEEdouble);
+ assert(&Floats[0].getSemantics() == &semIEEEdouble);
assert(&Floats[1].getSemantics() == &semIEEEdouble);
}
@@ -3917,6 +3890,7 @@ DoubleAPFloat &DoubleAPFloat::operator=(const DoubleAPFloat &RHS) {
return *this;
}
+// Implement addition, subtraction, multiplication and division based on:
// "Software for Doubled-Precision Floating-Point Computations",
// by Seppo Linnainmaa, ACM TOMS vol 7 no 3, September 1981, pages 272-283.
APFloat::opStatus DoubleAPFloat::addImpl(const APFloat &a, const APFloat &aa,
@@ -3928,7 +3902,7 @@ APFloat::opStatus DoubleAPFloat::addImpl(const APFloat &a, const APFloat &aa,
if (!z.isFinite()) {
if (!z.isInfinity()) {
Floats[0] = std::move(z);
- Floats[1].makeZero(false);
+ Floats[1].makeZero(/* Neg = */ false);
return (opStatus)Status;
}
Status = opOK;
@@ -3946,7 +3920,7 @@ APFloat::opStatus DoubleAPFloat::addImpl(const APFloat &a, const APFloat &aa,
}
if (!z.isFinite()) {
Floats[0] = std::move(z);
- Floats[1].makeZero(false);
+ Floats[1].makeZero(/* Neg = */ false);
return (opStatus)Status;
}
Floats[0] = z;
@@ -3982,13 +3956,13 @@ APFloat::opStatus DoubleAPFloat::addImpl(const APFloat &a, const APFloat &aa,
Status |= zz.add(cc, RM);
if (zz.isZero() && !zz.isNegative()) {
Floats[0] = std::move(z);
- Floats[1].makeZero(false);
+ Floats[1].makeZero(/* Neg = */ false);
return opOK;
}
Floats[0] = z;
Status |= Floats[0].add(zz, RM);
if (!Floats[0].isFinite()) {
- Floats[1].makeZero(false);
+ Floats[1].makeZero(/* Neg = */ false);
return (opStatus)Status;
}
Floats[1] = std::move(z);
@@ -4033,25 +4007,15 @@ APFloat::opStatus DoubleAPFloat::addWithSpecial(const DoubleAPFloat &LHS,
}
assert(LHS.getCategory() == fcNormal && RHS.getCategory() == fcNormal);
- // These conversions will go away once PPCDoubleDoubleImpl goes away.
- // (PPCDoubleDoubleImpl, IEEEDouble) -> (IEEEDouble, IEEEDouble)
- APFloat A(semIEEEdouble,
- APInt(64, LHS.Floats[0].bitcastToAPInt().getRawData()[0])),
- AA(LHS.Floats[1]),
- C(semIEEEdouble, APInt(64, RHS.Floats[0].bitcastToAPInt().getRawData()[0])),
+ APFloat A(LHS.Floats[0]), AA(LHS.Floats[1]), C(RHS.Floats[0]),
CC(RHS.Floats[1]);
+ assert(&A.getSemantics() == &semIEEEdouble);
assert(&AA.getSemantics() == &semIEEEdouble);
+ assert(&C.getSemantics() == &semIEEEdouble);
assert(&CC.getSemantics() == &semIEEEdouble);
- Out.Floats[0] = APFloat(semIEEEdouble);
+ assert(&Out.Floats[0].getSemantics() == &semIEEEdouble);
assert(&Out.Floats[1].getSemantics() == &semIEEEdouble);
-
- auto Ret = Out.addImpl(A, AA, C, CC, RM);
-
- // (IEEEDouble, IEEEDouble) -> (PPCDoubleDoubleImpl, IEEEDouble)
- uint64_t Buffer[] = {Out.Floats[0].bitcastToAPInt().getRawData()[0],
- Out.Floats[1].bitcastToAPInt().getRawData()[0]};
- Out.Floats[0] = APFloat(semPPCDoubleDoubleImpl, APInt(128, 2, Buffer));
- return Ret;
+ return Out.addImpl(A, AA, C, CC, RM);
}
APFloat::opStatus DoubleAPFloat::add(const DoubleAPFloat &RHS,
@@ -4067,6 +4031,140 @@ APFloat::opStatus DoubleAPFloat::subtract(const DoubleAPFloat &RHS,
return Ret;
}
+APFloat::opStatus DoubleAPFloat::multiply(const DoubleAPFloat &RHS,
+ APFloat::roundingMode RM) {
+ const auto &LHS = *this;
+ auto &Out = *this;
+ /* Interesting observation: For special categories, finding the lowest
+ common ancestor of the following layered graph gives the correct
+ return category:
+
+ NaN
+ / \
+ Zero Inf
+ \ /
+ Normal
+
+ e.g. NaN * NaN = NaN
+ Zero * Inf = NaN
+ Normal * Zero = Zero
+ Normal * Inf = Inf
+ */
+ if (LHS.getCategory() == fcNaN) {
+ Out = LHS;
+ return opOK;
+ }
+ if (RHS.getCategory() == fcNaN) {
+ Out = RHS;
+ return opOK;
+ }
+ if ((LHS.getCategory() == fcZero && RHS.getCategory() == fcInfinity) ||
+ (LHS.getCategory() == fcInfinity && RHS.getCategory() == fcZero)) {
+ Out.makeNaN(false, false, nullptr);
+ return opOK;
+ }
+ if (LHS.getCategory() == fcZero || LHS.getCategory() == fcInfinity) {
+ Out = LHS;
+ return opOK;
+ }
+ if (RHS.getCategory() == fcZero || RHS.getCategory() == fcInfinity) {
+ Out = RHS;
+ return opOK;
+ }
+ assert(LHS.getCategory() == fcNormal && RHS.getCategory() == fcNormal &&
+ "Special cases not handled exhaustively");
+
+ int Status = opOK;
+ APFloat A = Floats[0], B = Floats[1], C = RHS.Floats[0], D = RHS.Floats[1];
+ // t = a * c
+ APFloat T = A;
+ Status |= T.multiply(C, RM);
+ if (!T.isFiniteNonZero()) {
+ Floats[0] = T;
+ Floats[1].makeZero(/* Neg = */ false);
+ return (opStatus)Status;
+ }
+
+ // tau = fmsub(a, c, t), that is -fmadd(-a, c, t).
+ APFloat Tau = A;
+ T.changeSign();
+ Status |= Tau.fusedMultiplyAdd(C, T, RM);
+ T.changeSign();
+ {
+ // v = a * d
+ APFloat V = A;
+ Status |= V.multiply(D, RM);
+ // w = b * c
+ APFloat W = B;
+ Status |= W.multiply(C, RM);
+ Status |= V.add(W, RM);
+ // tau += v + w
+ Status |= Tau.add(V, RM);
+ }
+ // u = t + tau
+ APFloat U = T;
+ Status |= U.add(Tau, RM);
+
+ Floats[0] = U;
+ if (!U.isFinite()) {
+ Floats[1].makeZero(/* Neg = */ false);
+ } else {
+ // Floats[1] = (t - u) + tau
+ Status |= T.subtract(U, RM);
+ Status |= T.add(Tau, RM);
+ Floats[1] = T;
+ }
+ return (opStatus)Status;
+}
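// The t/tau pair above is the classic error-free product: t is the rounded
// product and tau recovers its exact residual with one fused multiply-add.
// A sketch on plain doubles (illustrative only):
//
//   #include <cmath>
//   void twoProd(double a, double c, double &t, double &tau) {
//     t = a * c;
//     tau = std::fma(a, c, -t);  // t + tau == a * c, exactly
//   }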
+
+APFloat::opStatus DoubleAPFloat::divide(const DoubleAPFloat &RHS,
+ APFloat::roundingMode RM) {
+ assert(Semantics == &semPPCDoubleDouble && "Unexpected Semantics");
+ APFloat Tmp(semPPCDoubleDoubleLegacy, bitcastToAPInt());
+ auto Ret =
+ Tmp.divide(APFloat(semPPCDoubleDoubleLegacy, RHS.bitcastToAPInt()), RM);
+ *this = DoubleAPFloat(semPPCDoubleDouble, Tmp.bitcastToAPInt());
+ return Ret;
+}
+
+APFloat::opStatus DoubleAPFloat::remainder(const DoubleAPFloat &RHS) {
+ assert(Semantics == &semPPCDoubleDouble && "Unexpected Semantics");
+ APFloat Tmp(semPPCDoubleDoubleLegacy, bitcastToAPInt());
+ auto Ret =
+ Tmp.remainder(APFloat(semPPCDoubleDoubleLegacy, RHS.bitcastToAPInt()));
+ *this = DoubleAPFloat(semPPCDoubleDouble, Tmp.bitcastToAPInt());
+ return Ret;
+}
+
+APFloat::opStatus DoubleAPFloat::mod(const DoubleAPFloat &RHS) {
+ assert(Semantics == &semPPCDoubleDouble && "Unexpected Semantics");
+ APFloat Tmp(semPPCDoubleDoubleLegacy, bitcastToAPInt());
+ auto Ret = Tmp.mod(APFloat(semPPCDoubleDoubleLegacy, RHS.bitcastToAPInt()));
+ *this = DoubleAPFloat(semPPCDoubleDouble, Tmp.bitcastToAPInt());
+ return Ret;
+}
+
+APFloat::opStatus
+DoubleAPFloat::fusedMultiplyAdd(const DoubleAPFloat &Multiplicand,
+ const DoubleAPFloat &Addend,
+ APFloat::roundingMode RM) {
+ assert(Semantics == &semPPCDoubleDouble && "Unexpected Semantics");
+ APFloat Tmp(semPPCDoubleDoubleLegacy, bitcastToAPInt());
+ auto Ret = Tmp.fusedMultiplyAdd(
+ APFloat(semPPCDoubleDoubleLegacy, Multiplicand.bitcastToAPInt()),
+ APFloat(semPPCDoubleDoubleLegacy, Addend.bitcastToAPInt()), RM);
+ *this = DoubleAPFloat(semPPCDoubleDouble, Tmp.bitcastToAPInt());
+ return Ret;
+}
+
+APFloat::opStatus DoubleAPFloat::roundToIntegral(APFloat::roundingMode RM) {
+ assert(Semantics == &semPPCDoubleDouble && "Unexpected Semantics");
+ APFloat Tmp(semPPCDoubleDoubleLegacy, bitcastToAPInt());
+ auto Ret = Tmp.roundToIntegral(RM);
+ *this = DoubleAPFloat(semPPCDoubleDouble, Tmp.bitcastToAPInt());
+ return Ret;
+}
+
void DoubleAPFloat::changeSign() {
Floats[0].changeSign();
Floats[1].changeSign();
@@ -4101,12 +4199,200 @@ bool DoubleAPFloat::isNegative() const { return Floats[0].isNegative(); }
void DoubleAPFloat::makeInf(bool Neg) {
Floats[0].makeInf(Neg);
- Floats[1].makeZero(false);
+ Floats[1].makeZero(/* Neg = */ false);
+}
+
+void DoubleAPFloat::makeZero(bool Neg) {
+ Floats[0].makeZero(Neg);
+ Floats[1].makeZero(/* Neg = */ false);
+}
+
+void DoubleAPFloat::makeLargest(bool Neg) {
+ assert(Semantics == &semPPCDoubleDouble && "Unexpected Semantics");
+ Floats[0] = APFloat(semIEEEdouble, APInt(64, 0x7fefffffffffffffull));
+ Floats[1] = APFloat(semIEEEdouble, APInt(64, 0x7c8ffffffffffffeull));
+ if (Neg)
+ changeSign();
+}
+
+void DoubleAPFloat::makeSmallest(bool Neg) {
+ assert(Semantics == &semPPCDoubleDouble && "Unexpected Semantics");
+ Floats[0].makeSmallest(Neg);
+ Floats[1].makeZero(/* Neg = */ false);
+}
+
+void DoubleAPFloat::makeSmallestNormalized(bool Neg) {
+ assert(Semantics == &semPPCDoubleDouble && "Unexpected Semantics");
+ Floats[0] = APFloat(semIEEEdouble, APInt(64, 0x0360000000000000ull));
+ if (Neg)
+ Floats[0].changeSign();
+ Floats[1].makeZero(/* Neg = */ false);
}
void DoubleAPFloat::makeNaN(bool SNaN, bool Neg, const APInt *fill) {
Floats[0].makeNaN(SNaN, Neg, fill);
- Floats[1].makeZero(false);
+ Floats[1].makeZero(/* Neg = */ false);
+}
+
+APFloat::cmpResult DoubleAPFloat::compare(const DoubleAPFloat &RHS) const {
+ auto Result = Floats[0].compare(RHS.Floats[0]);
+ // |Floats[0]| > |Floats[1]|
+ if (Result == APFloat::cmpEqual)
+ return Floats[1].compare(RHS.Floats[1]);
+ return Result;
+}
+
+bool DoubleAPFloat::bitwiseIsEqual(const DoubleAPFloat &RHS) const {
+ return Floats[0].bitwiseIsEqual(RHS.Floats[0]) &&
+ Floats[1].bitwiseIsEqual(RHS.Floats[1]);
+}
+
+hash_code hash_value(const DoubleAPFloat &Arg) {
+ if (Arg.Floats)
+ return hash_combine(hash_value(Arg.Floats[0]), hash_value(Arg.Floats[1]));
+ return hash_combine(Arg.Semantics);
+}
+
+APInt DoubleAPFloat::bitcastToAPInt() const {
+ assert(Semantics == &semPPCDoubleDouble && "Unexpected Semantics");
+ uint64_t Data[] = {
+ Floats[0].bitcastToAPInt().getRawData()[0],
+ Floats[1].bitcastToAPInt().getRawData()[0],
+ };
+ return APInt(128, 2, Data);
+}
+
+APFloat::opStatus DoubleAPFloat::convertFromString(StringRef S,
+ roundingMode RM) {
+ assert(Semantics == &semPPCDoubleDouble && "Unexpected Semantics");
+ APFloat Tmp(semPPCDoubleDoubleLegacy);
+ auto Ret = Tmp.convertFromString(S, RM);
+ *this = DoubleAPFloat(semPPCDoubleDouble, Tmp.bitcastToAPInt());
+ return Ret;
+}
+
+APFloat::opStatus DoubleAPFloat::next(bool nextDown) {
+ assert(Semantics == &semPPCDoubleDouble && "Unexpected Semantics");
+ APFloat Tmp(semPPCDoubleDoubleLegacy, bitcastToAPInt());
+ auto Ret = Tmp.next(nextDown);
+ *this = DoubleAPFloat(semPPCDoubleDouble, Tmp.bitcastToAPInt());
+ return Ret;
+}
+
+APFloat::opStatus
+DoubleAPFloat::convertToInteger(MutableArrayRef<integerPart> Input,
+ unsigned int Width, bool IsSigned,
+ roundingMode RM, bool *IsExact) const {
+ assert(Semantics == &semPPCDoubleDouble && "Unexpected Semantics");
+ return APFloat(semPPCDoubleDoubleLegacy, bitcastToAPInt())
+ .convertToInteger(Input, Width, IsSigned, RM, IsExact);
+}
+
+APFloat::opStatus DoubleAPFloat::convertFromAPInt(const APInt &Input,
+ bool IsSigned,
+ roundingMode RM) {
+ assert(Semantics == &semPPCDoubleDouble && "Unexpected Semantics");
+ APFloat Tmp(semPPCDoubleDoubleLegacy);
+ auto Ret = Tmp.convertFromAPInt(Input, IsSigned, RM);
+ *this = DoubleAPFloat(semPPCDoubleDouble, Tmp.bitcastToAPInt());
+ return Ret;
+}
+
+APFloat::opStatus
+DoubleAPFloat::convertFromSignExtendedInteger(const integerPart *Input,
+ unsigned int InputSize,
+ bool IsSigned, roundingMode RM) {
+ assert(Semantics == &semPPCDoubleDouble && "Unexpected Semantics");
+ APFloat Tmp(semPPCDoubleDoubleLegacy);
+ auto Ret = Tmp.convertFromSignExtendedInteger(Input, InputSize, IsSigned, RM);
+ *this = DoubleAPFloat(semPPCDoubleDouble, Tmp.bitcastToAPInt());
+ return Ret;
+}
+
+APFloat::opStatus
+DoubleAPFloat::convertFromZeroExtendedInteger(const integerPart *Input,
+ unsigned int InputSize,
+ bool IsSigned, roundingMode RM) {
+ assert(Semantics == &semPPCDoubleDouble && "Unexpected Semantics");
+ APFloat Tmp(semPPCDoubleDoubleLegacy);
+ auto Ret = Tmp.convertFromZeroExtendedInteger(Input, InputSize, IsSigned, RM);
+ *this = DoubleAPFloat(semPPCDoubleDouble, Tmp.bitcastToAPInt());
+ return Ret;
+}
+
+unsigned int DoubleAPFloat::convertToHexString(char *DST,
+ unsigned int HexDigits,
+ bool UpperCase,
+ roundingMode RM) const {
+ assert(Semantics == &semPPCDoubleDouble && "Unexpected Semantics");
+ return APFloat(semPPCDoubleDoubleLegacy, bitcastToAPInt())
+ .convertToHexString(DST, HexDigits, UpperCase, RM);
+}
+
+bool DoubleAPFloat::isDenormal() const {
+ return getCategory() == fcNormal &&
+ (Floats[0].isDenormal() || Floats[1].isDenormal() ||
+ // (double)(Hi + Lo) == Hi defines a normal number.
+ Floats[0].compare(Floats[0] + Floats[1]) != cmpEqual);
+}
+
+bool DoubleAPFloat::isSmallest() const {
+ if (getCategory() != fcNormal)
+ return false;
+ DoubleAPFloat Tmp(*this);
+ Tmp.makeSmallest(this->isNegative());
+ return Tmp.compare(*this) == cmpEqual;
+}
+
+bool DoubleAPFloat::isLargest() const {
+ if (getCategory() != fcNormal)
+ return false;
+ DoubleAPFloat Tmp(*this);
+ Tmp.makeLargest(this->isNegative());
+ return Tmp.compare(*this) == cmpEqual;
+}
+
+bool DoubleAPFloat::isInteger() const {
+ assert(Semantics == &semPPCDoubleDouble && "Unexpected Semantics");
+ APFloat Tmp(semPPCDoubleDoubleLegacy);
+ (void)Tmp.add(Floats[0], rmNearestTiesToEven);
+ (void)Tmp.add(Floats[1], rmNearestTiesToEven);
+ return Tmp.isInteger();
+}
+
+void DoubleAPFloat::toString(SmallVectorImpl<char> &Str,
+ unsigned FormatPrecision,
+ unsigned FormatMaxPadding) const {
+ assert(Semantics == &semPPCDoubleDouble && "Unexpected Semantics");
+ APFloat(semPPCDoubleDoubleLegacy, bitcastToAPInt())
+ .toString(Str, FormatPrecision, FormatMaxPadding);
+}
+
+bool DoubleAPFloat::getExactInverse(APFloat *inv) const {
+ assert(Semantics == &semPPCDoubleDouble && "Unexpected Semantics");
+ APFloat Tmp(semPPCDoubleDoubleLegacy, bitcastToAPInt());
+ if (!inv)
+ return Tmp.getExactInverse(nullptr);
+ APFloat Inv(semPPCDoubleDoubleLegacy);
+ auto Ret = Tmp.getExactInverse(&Inv);
+ *inv = APFloat(semPPCDoubleDouble, Inv.bitcastToAPInt());
+ return Ret;
+}
+
+DoubleAPFloat scalbn(DoubleAPFloat Arg, int Exp, APFloat::roundingMode RM) {
+ assert(Arg.Semantics == &semPPCDoubleDouble && "Unexpected Semantics");
+ return DoubleAPFloat(semPPCDoubleDouble, scalbn(Arg.Floats[0], Exp, RM),
+ scalbn(Arg.Floats[1], Exp, RM));
+}
+
+DoubleAPFloat frexp(const DoubleAPFloat &Arg, int &Exp,
+ APFloat::roundingMode RM) {
+ assert(Arg.Semantics == &semPPCDoubleDouble && "Unexpected Semantics");
+ APFloat First = frexp(Arg.Floats[0], Exp, RM);
+ APFloat Second = Arg.Floats[1];
+ if (Arg.getCategory() == APFloat::fcNormal)
+ Second = scalbn(Second, -Exp, RM);
+ return DoubleAPFloat(semPPCDoubleDouble, std::move(First), std::move(Second));
}
} // End detail namespace
@@ -4126,10 +4412,16 @@ APFloat::Storage::Storage(IEEEFloat F, const fltSemantics &Semantics) {
}
APFloat::opStatus APFloat::convertFromString(StringRef Str, roundingMode RM) {
- return getIEEE().convertFromString(Str, RM);
+ APFLOAT_DISPATCH_ON_SEMANTICS(convertFromString(Str, RM));
}
-hash_code hash_value(const APFloat &Arg) { return hash_value(Arg.getIEEE()); }
+hash_code hash_value(const APFloat &Arg) {
+ if (APFloat::usesLayout<detail::IEEEFloat>(Arg.getSemantics()))
+ return hash_value(Arg.U.IEEE);
+ if (APFloat::usesLayout<detail::DoubleAPFloat>(Arg.getSemantics()))
+ return hash_value(Arg.U.Double);
+ llvm_unreachable("Unexpected semantics");
+}
APFloat::APFloat(const fltSemantics &Semantics, StringRef S)
: APFloat(Semantics) {
@@ -4146,10 +4438,8 @@ APFloat::opStatus APFloat::convert(const fltSemantics &ToSemantics,
if (usesLayout<IEEEFloat>(getSemantics()) &&
usesLayout<DoubleAPFloat>(ToSemantics)) {
assert(&ToSemantics == &semPPCDoubleDouble);
- auto Ret = U.IEEE.convert(semPPCDoubleDoubleImpl, RM, losesInfo);
- *this = APFloat(DoubleAPFloat(semPPCDoubleDouble, std::move(*this),
- APFloat(semIEEEdouble)),
- ToSemantics);
+ auto Ret = U.IEEE.convert(semPPCDoubleDoubleLegacy, RM, losesInfo);
+ *this = APFloat(ToSemantics, U.IEEE.bitcastToAPInt());
return Ret;
}
if (usesLayout<DoubleAPFloat>(getSemantics()) &&
@@ -4189,6 +4479,30 @@ void APFloat::print(raw_ostream &OS) const {
OS << Buffer << "\n";
}
-void APFloat::dump() const { print(dbgs()); }
+#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
+LLVM_DUMP_METHOD void APFloat::dump() const { print(dbgs()); }
+#endif
+
+void APFloat::Profile(FoldingSetNodeID &NID) const {
+ NID.Add(bitcastToAPInt());
+}
+
+/* Same as convertToInteger(integerPart*, ...), except the result is returned in
+ an APSInt, whose initial bit-width and signedness are used to determine the
+ precision of the conversion.
+ */
+APFloat::opStatus APFloat::convertToInteger(APSInt &result,
+ roundingMode rounding_mode,
+ bool *isExact) const {
+ unsigned bitWidth = result.getBitWidth();
+ SmallVector<uint64_t, 4> parts(result.getNumWords());
+ opStatus status = convertToInteger(parts, bitWidth, result.isSigned(),
+ rounding_mode, isExact);
+ // Keep the original signedness.
+ result = APInt(bitWidth, parts);
+ return status;
+}
} // End llvm namespace
+
+#undef APFLOAT_DISPATCH_ON_SEMANTICS
diff --git a/lib/Support/APInt.cpp b/lib/Support/APInt.cpp
index fb8b45166a41..0c7da1dad0d2 100644
--- a/lib/Support/APInt.cpp
+++ b/lib/Support/APInt.cpp
@@ -63,7 +63,7 @@ inline static unsigned getDigit(char cdigit, uint8_t radix) {
r = cdigit - 'a';
if (r <= radix - 11U)
return r + 10;
-
+
radix = 10;
}
@@ -76,14 +76,17 @@ inline static unsigned getDigit(char cdigit, uint8_t radix) {
void APInt::initSlowCase(uint64_t val, bool isSigned) {
+ VAL = 0;
pVal = getClearedMemory(getNumWords());
pVal[0] = val;
if (isSigned && int64_t(val) < 0)
for (unsigned i = 1; i < getNumWords(); ++i)
pVal[i] = -1ULL;
+ clearUnusedBits();
}
void APInt::initSlowCase(const APInt& that) {
+ VAL = 0;
pVal = getMemory(getNumWords());
memcpy(pVal, that.pVal, getNumWords() * APINT_WORD_SIZE);
}
@@ -95,6 +98,7 @@ void APInt::initFromArray(ArrayRef<uint64_t> bigVal) {
VAL = bigVal[0];
else {
// Get memory, cleared to 0
+ VAL = 0;
pVal = getClearedMemory(getNumWords());
// Calculate the number of words to copy
unsigned words = std::min<unsigned>(bigVal.size(), getNumWords());
@@ -106,17 +110,17 @@ void APInt::initFromArray(ArrayRef<uint64_t> bigVal) {
}
APInt::APInt(unsigned numBits, ArrayRef<uint64_t> bigVal)
- : BitWidth(numBits), VAL(0) {
+ : BitWidth(numBits) {
initFromArray(bigVal);
}
APInt::APInt(unsigned numBits, unsigned numWords, const uint64_t bigVal[])
- : BitWidth(numBits), VAL(0) {
+ : BitWidth(numBits) {
initFromArray(makeArrayRef(bigVal, numWords));
}
APInt::APInt(unsigned numbits, StringRef Str, uint8_t radix)
- : BitWidth(numbits), VAL(0) {
+ : VAL(0), BitWidth(numbits) {
assert(BitWidth && "Bitwidth too small");
fromString(numbits, Str, radix);
}
@@ -153,16 +157,6 @@ APInt& APInt::AssignSlowCase(const APInt& RHS) {
return clearUnusedBits();
}
-APInt& APInt::operator=(uint64_t RHS) {
- if (isSingleWord())
- VAL = RHS;
- else {
- pVal[0] = RHS;
- memset(pVal+1, 0, (getNumWords() - 1) * APINT_WORD_SIZE);
- }
- return clearUnusedBits();
-}
-
/// This method 'profiles' an APInt for use with FoldingSet.
void APInt::Profile(FoldingSetNodeID& ID) const {
ID.AddInteger(BitWidth);
@@ -177,76 +171,24 @@ void APInt::Profile(FoldingSetNodeID& ID) const {
ID.AddInteger(pVal[i]);
}
-/// This function adds a single "digit" integer, y, to the multiple
-/// "digit" integer array, x[]. x[] is modified to reflect the addition and
-/// 1 is returned if there is a carry out, otherwise 0 is returned.
-/// @returns the carry of the addition.
-static bool add_1(uint64_t dest[], uint64_t x[], unsigned len, uint64_t y) {
- for (unsigned i = 0; i < len; ++i) {
- dest[i] = y + x[i];
- if (dest[i] < y)
- y = 1; // Carry one to next digit.
- else {
- y = 0; // No need to carry so exit early
- break;
- }
- }
- return y;
-}
-
/// @brief Prefix increment operator. Increments the APInt by one.
APInt& APInt::operator++() {
if (isSingleWord())
++VAL;
else
- add_1(pVal, pVal, getNumWords(), 1);
+ tcIncrement(pVal, getNumWords());
return clearUnusedBits();
}
-/// This function subtracts a single "digit" (64-bit word), y, from
-/// the multi-digit integer array, x[], propagating the borrowed 1 value until
-/// no further borrowing is needed or it runs out of "digits" in x. The result
-/// is 1 if "borrowing" exhausted the digits in x, or 0 if x was not exhausted.
-/// In other words, if y > x then this function returns 1, otherwise 0.
-/// @returns the borrow out of the subtraction
-static bool sub_1(uint64_t x[], unsigned len, uint64_t y) {
- for (unsigned i = 0; i < len; ++i) {
- uint64_t X = x[i];
- x[i] -= y;
- if (y > X)
- y = 1; // We have to "borrow 1" from next "digit"
- else {
- y = 0; // No need to borrow
- break; // Remaining digits are unchanged so exit early
- }
- }
- return bool(y);
-}
-
/// @brief Prefix decrement operator. Decrements the APInt by one.
APInt& APInt::operator--() {
if (isSingleWord())
--VAL;
else
- sub_1(pVal, getNumWords(), 1);
+ tcDecrement(pVal, getNumWords());
return clearUnusedBits();
}
-/// This function adds the integer array x to the integer array Y and
-/// places the result in dest.
-/// @returns the carry out from the addition
-/// @brief General addition of 64-bit integer arrays
-static bool add(uint64_t *dest, const uint64_t *x, const uint64_t *y,
- unsigned len) {
- bool carry = false;
- for (unsigned i = 0; i< len; ++i) {
- uint64_t limit = std::min(x[i],y[i]); // must come first in case dest == x
- dest[i] = x[i] + y[i] + carry;
- carry = dest[i] < limit || (carry && dest[i] == limit);
- }
- return carry;
-}
-
/// Adds the RHS APint to this APInt.
/// @returns this, after addition of RHS.
/// @brief Addition assignment operator.
@@ -254,9 +196,8 @@ APInt& APInt::operator+=(const APInt& RHS) {
assert(BitWidth == RHS.BitWidth && "Bit widths must be the same");
if (isSingleWord())
VAL += RHS.VAL;
- else {
- add(pVal, pVal, RHS.pVal, getNumWords());
- }
+ else
+ tcAdd(pVal, RHS.pVal, 0, getNumWords());
return clearUnusedBits();
}
@@ -264,24 +205,10 @@ APInt& APInt::operator+=(uint64_t RHS) {
if (isSingleWord())
VAL += RHS;
else
- add_1(pVal, pVal, getNumWords(), RHS);
+ tcAddPart(pVal, RHS, getNumWords());
return clearUnusedBits();
}
-/// Subtracts the integer array y from the integer array x
-/// @returns returns the borrow out.
-/// @brief Generalized subtraction of 64-bit integer arrays.
-static bool sub(uint64_t *dest, const uint64_t *x, const uint64_t *y,
- unsigned len) {
- bool borrow = false;
- for (unsigned i = 0; i < len; ++i) {
- uint64_t x_tmp = borrow ? x[i] - 1 : x[i];
- borrow = y[i] > x_tmp || (borrow && x[i] == 0);
- dest[i] = x_tmp - y[i];
- }
- return borrow;
-}
-
/// Subtracts the RHS APInt from this APInt
/// @returns this, after subtraction
/// @brief Subtraction assignment operator.
@@ -290,7 +217,7 @@ APInt& APInt::operator-=(const APInt& RHS) {
if (isSingleWord())
VAL -= RHS.VAL;
else
- sub(pVal, pVal, RHS.pVal, getNumWords());
+ tcSubtract(pVal, RHS.pVal, 0, getNumWords());
return clearUnusedBits();
}
@@ -298,7 +225,7 @@ APInt& APInt::operator-=(uint64_t RHS) {
if (isSingleWord())
VAL -= RHS;
else
- sub_1(pVal, getNumWords(), RHS);
+ tcSubtractPart(pVal, RHS, getNumWords());
return clearUnusedBits();
}
@@ -339,7 +266,7 @@ static uint64_t mul_1(uint64_t dest[], uint64_t x[], unsigned len, uint64_t y) {
/// Multiplies integer array x by integer array y and stores the result into
/// the integer array dest. Note that dest's size must be >= xlen + ylen.
-/// @brief Generalized multiplicate of integer arrays.
+/// @brief Generalized multiplication of integer arrays.
static void mul(uint64_t dest[], uint64_t x[], unsigned xlen, uint64_t y[],
unsigned ylen) {
dest[xlen] = mul_1(dest, x, xlen, y[0]);
@@ -412,69 +339,19 @@ APInt& APInt::operator*=(const APInt& RHS) {
return *this;
}
-APInt& APInt::operator&=(const APInt& RHS) {
- assert(BitWidth == RHS.BitWidth && "Bit widths must be the same");
- if (isSingleWord()) {
- VAL &= RHS.VAL;
- return *this;
- }
- unsigned numWords = getNumWords();
- for (unsigned i = 0; i < numWords; ++i)
- pVal[i] &= RHS.pVal[i];
+APInt& APInt::AndAssignSlowCase(const APInt& RHS) {
+ tcAnd(pVal, RHS.pVal, getNumWords());
return *this;
}
-APInt& APInt::operator|=(const APInt& RHS) {
- assert(BitWidth == RHS.BitWidth && "Bit widths must be the same");
- if (isSingleWord()) {
- VAL |= RHS.VAL;
- return *this;
- }
- unsigned numWords = getNumWords();
- for (unsigned i = 0; i < numWords; ++i)
- pVal[i] |= RHS.pVal[i];
+APInt& APInt::OrAssignSlowCase(const APInt& RHS) {
+ tcOr(pVal, RHS.pVal, getNumWords());
return *this;
}
-APInt& APInt::operator^=(const APInt& RHS) {
- assert(BitWidth == RHS.BitWidth && "Bit widths must be the same");
- if (isSingleWord()) {
- VAL ^= RHS.VAL;
- this->clearUnusedBits();
- return *this;
- }
- unsigned numWords = getNumWords();
- for (unsigned i = 0; i < numWords; ++i)
- pVal[i] ^= RHS.pVal[i];
- return clearUnusedBits();
-}
-
-APInt APInt::AndSlowCase(const APInt& RHS) const {
- unsigned numWords = getNumWords();
- uint64_t* val = getMemory(numWords);
- for (unsigned i = 0; i < numWords; ++i)
- val[i] = pVal[i] & RHS.pVal[i];
- return APInt(val, getBitWidth());
-}
-
-APInt APInt::OrSlowCase(const APInt& RHS) const {
- unsigned numWords = getNumWords();
- uint64_t *val = getMemory(numWords);
- for (unsigned i = 0; i < numWords; ++i)
- val[i] = pVal[i] | RHS.pVal[i];
- return APInt(val, getBitWidth());
-}
-
-APInt APInt::XorSlowCase(const APInt& RHS) const {
- unsigned numWords = getNumWords();
- uint64_t *val = getMemory(numWords);
- for (unsigned i = 0; i < numWords; ++i)
- val[i] = pVal[i] ^ RHS.pVal[i];
-
- APInt Result(val, getBitWidth());
- // 0^0==1 so clear the high bits in case they got set.
- Result.clearUnusedBits();
- return Result;
+APInt& APInt::XorAssignSlowCase(const APInt& RHS) {
+ tcXor(pVal, RHS.pVal, getNumWords());
+ return *this;
}
APInt APInt::operator*(const APInt& RHS) const {
@@ -511,11 +388,11 @@ bool APInt::ult(const APInt& RHS) const {
if (n1 < n2)
return true;
- // If magnitude of RHS is greather than LHS, return false.
+ // If magnitude of RHS is greater than LHS, return false.
if (n2 < n1)
return false;
- // If they bot fit in a word, just compare the low order word
+ // If they both fit in a word, just compare the low order word
if (n1 <= APINT_BITS_PER_WORD && n2 <= APINT_BITS_PER_WORD)
return pVal[0] < RHS.pVal[0];
@@ -545,7 +422,7 @@ bool APInt::slt(const APInt& RHS) const {
if (lhsNeg != rhsNeg)
return lhsNeg;
- // Otherwise we can just use an unsigned comparision, because even negative
+ // Otherwise we can just use an unsigned comparison, because even negative
// numbers compare correctly this way if both have the same signed-ness.
return ult(RHS);
}
@@ -557,6 +434,33 @@ void APInt::setBit(unsigned bitPosition) {
pVal[whichWord(bitPosition)] |= maskBit(bitPosition);
}
+void APInt::setBitsSlowCase(unsigned loBit, unsigned hiBit) {
+ unsigned loWord = whichWord(loBit);
+ unsigned hiWord = whichWord(hiBit);
+
+ // Create an initial mask for the low word with zeros below loBit.
+ uint64_t loMask = UINT64_MAX << whichBit(loBit);
+
+ // If hiBit is not aligned, we need a high mask.
+ unsigned hiShiftAmt = whichBit(hiBit);
+ if (hiShiftAmt != 0) {
+ // Create a high mask with zeros above hiBit.
+ uint64_t hiMask = UINT64_MAX >> (APINT_BITS_PER_WORD - hiShiftAmt);
+ // If loWord and hiWord are equal, then we combine the masks. Otherwise,
+ // set the bits in hiWord.
+ if (hiWord == loWord)
+ loMask &= hiMask;
+ else
+ pVal[hiWord] |= hiMask;
+ }
+ // Apply the mask to the low word.
+ pVal[loWord] |= loMask;
+
+ // Fill any words between loWord and hiWord with all ones.
+ for (unsigned word = loWord + 1; word < hiWord; ++word)
+ pVal[word] = UINT64_MAX;
+}
+
/// Set the given bit to 0 whose position is given as "bitPosition".
/// @brief Set a given bit to 0.
void APInt::clearBit(unsigned bitPosition) {
@@ -567,6 +471,10 @@ void APInt::clearBit(unsigned bitPosition) {
}
/// @brief Toggle every bit to its opposite value.
+void APInt::flipAllBitsSlowCase() {
+ tcComplement(pVal, getNumWords());
+ clearUnusedBits();
+}
/// Toggle a given bit to its opposite value whose position is given
/// as "bitPosition".
@@ -577,9 +485,104 @@ void APInt::flipBit(unsigned bitPosition) {
else setBit(bitPosition);
}
+void APInt::insertBits(const APInt &subBits, unsigned bitPosition) {
+ unsigned subBitWidth = subBits.getBitWidth();
+ assert(0 < subBitWidth && (subBitWidth + bitPosition) <= BitWidth &&
+ "Illegal bit insertion");
+
+ // Insertion is a direct copy.
+ if (subBitWidth == BitWidth) {
+ *this = subBits;
+ return;
+ }
+
+ // Single word result can be done as a direct bitmask.
+ if (isSingleWord()) {
+ uint64_t mask = UINT64_MAX >> (APINT_BITS_PER_WORD - subBitWidth);
+ VAL &= ~(mask << bitPosition);
+ VAL |= (subBits.VAL << bitPosition);
+ return;
+ }
+
+ unsigned loBit = whichBit(bitPosition);
+ unsigned loWord = whichWord(bitPosition);
+ unsigned hi1Word = whichWord(bitPosition + subBitWidth - 1);
+
+ // Insertion within a single word can be done as a direct bitmask.
+ if (loWord == hi1Word) {
+ uint64_t mask = UINT64_MAX >> (APINT_BITS_PER_WORD - subBitWidth);
+ pVal[loWord] &= ~(mask << loBit);
+ pVal[loWord] |= (subBits.VAL << loBit);
+ return;
+ }
+
+ // Insert on word boundaries.
+ if (loBit == 0) {
+ // Direct copy whole words.
+ unsigned numWholeSubWords = subBitWidth / APINT_BITS_PER_WORD;
+ memcpy(pVal + loWord, subBits.getRawData(),
+ numWholeSubWords * APINT_WORD_SIZE);
+
+ // Mask+insert remaining bits.
+ unsigned remainingBits = subBitWidth % APINT_BITS_PER_WORD;
+ if (remainingBits != 0) {
+ uint64_t mask = UINT64_MAX >> (APINT_BITS_PER_WORD - remainingBits);
+ pVal[hi1Word] &= ~mask;
+ pVal[hi1Word] |= subBits.getWord(subBitWidth - 1);
+ }
+ return;
+ }
+
+ // General case - set/clear individual bits in dst based on src.
+ // TODO - there is scope for optimization here, but at the moment this code
+ // path is barely used so prefer readability over performance.
+ for (unsigned i = 0; i != subBitWidth; ++i) {
+ if (subBits[i])
+ setBit(bitPosition + i);
+ else
+ clearBit(bitPosition + i);
+ }
+}
+
+APInt APInt::extractBits(unsigned numBits, unsigned bitPosition) const {
+ assert(numBits > 0 && "Can't extract zero bits");
+ assert(bitPosition < BitWidth && (numBits + bitPosition) <= BitWidth &&
+ "Illegal bit extraction");
+
+ if (isSingleWord())
+ return APInt(numBits, VAL >> bitPosition);
+
+ unsigned loBit = whichBit(bitPosition);
+ unsigned loWord = whichWord(bitPosition);
+ unsigned hiWord = whichWord(bitPosition + numBits - 1);
+
+ // Single word result extracting bits from a single word source.
+ if (loWord == hiWord)
+ return APInt(numBits, pVal[loWord] >> loBit);
+
+ // Extracting bits that start on a source word boundary can be done
+ // as a fast memory copy.
+ if (loBit == 0)
+ return APInt(numBits, makeArrayRef(pVal + loWord, 1 + hiWord - loWord));
+
+ // General case - shift + copy source words directly into place.
+ APInt Result(numBits, 0);
+ unsigned NumSrcWords = getNumWords();
+ unsigned NumDstWords = Result.getNumWords();
+
+ for (unsigned word = 0; word < NumDstWords; ++word) {
+ uint64_t w0 = pVal[loWord + word];
+ uint64_t w1 =
+ (loWord + word + 1) < NumSrcWords ? pVal[loWord + word + 1] : 0;
+ Result.pVal[word] = (w0 >> loBit) | (w1 << (APINT_BITS_PER_WORD - loBit));
+ }
+
+ return Result.clearUnusedBits();
+}
+
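// Usage sketch for the two routines above, with values small enough to check
// by hand (assumes only the standard APInt constructors):
//
//   APInt V(16, 0xABCD);
//   APInt Sub = V.extractBits(8, 4);  // bits [4,12) of V, i.e. 0xBC
//   V.insertBits(APInt(8, 0xFF), 4);  // V becomes 0xAFFD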
unsigned APInt::getBitsNeeded(StringRef str, uint8_t radix) {
assert(!str.empty() && "Invalid string length");
- assert((radix == 10 || radix == 8 || radix == 16 || radix == 2 ||
+ assert((radix == 10 || radix == 8 || radix == 16 || radix == 2 ||
radix == 36) &&
"Radix should be 2, 8, 10, 16, or 36!");
@@ -604,7 +607,7 @@ unsigned APInt::getBitsNeeded(StringRef str, uint8_t radix) {
return slen * 4 + isNegative;
// FIXME: base 36
-
+
// This is grossly inefficient but accurate. We could probably do something
// with a computation of roughly slen*64/20 and then adjust by the value of
// the first few digits. But, I'm not sure how accurate that could be.
@@ -613,7 +616,7 @@ unsigned APInt::getBitsNeeded(StringRef str, uint8_t radix) {
// be too large. This avoids the assertion in the constructor. This
// calculation doesn't work appropriately for the numbers 0-9, so just use 4
// bits in that case.
- unsigned sufficient
+ unsigned sufficient
= radix == 10? (slen == 1 ? 4 : slen * 64/18)
: (slen == 1 ? 7 : slen * 16/3);
@@ -647,19 +650,20 @@ bool APInt::isSplat(unsigned SplatSizeInBits) const {
/// This function returns the high "numBits" bits of this APInt.
APInt APInt::getHiBits(unsigned numBits) const {
- return APIntOps::lshr(*this, BitWidth - numBits);
+ return this->lshr(BitWidth - numBits);
}
/// This function returns the low "numBits" bits of this APInt.
APInt APInt::getLoBits(unsigned numBits) const {
- return APIntOps::lshr(APIntOps::shl(*this, BitWidth - numBits),
- BitWidth - numBits);
+ APInt Result(getLowBitsSet(BitWidth, numBits));
+ Result &= *this;
+ return Result;
}
unsigned APInt::countLeadingZerosSlowCase() const {
unsigned Count = 0;
for (int i = getNumWords()-1; i >= 0; --i) {
- integerPart V = pVal[i];
+ uint64_t V = pVal[i];
if (V == 0)
Count += APINT_BITS_PER_WORD;
else {
@@ -729,18 +733,6 @@ unsigned APInt::countPopulationSlowCase() const {
return Count;
}
-/// Perform a logical right-shift from Src to Dst, which must be equal or
-/// non-overlapping, of Words words, by Shift, which must be less than 64.
-static void lshrNear(uint64_t *Dst, uint64_t *Src, unsigned Words,
- unsigned Shift) {
- uint64_t Carry = 0;
- for (int I = Words - 1; I >= 0; --I) {
- uint64_t Tmp = Src[I];
- Dst[I] = (Tmp >> Shift) | Carry;
- Carry = Tmp << (64 - Shift);
- }
-}
-
APInt APInt::byteSwap() const {
assert(BitWidth >= 16 && BitWidth % 16 == 0 && "Cannot byteswap!");
if (BitWidth == 16)
@@ -761,8 +753,7 @@ APInt APInt::byteSwap() const {
for (unsigned I = 0, N = getNumWords(); I != N; ++I)
Result.pVal[I] = ByteSwap_64(pVal[N - I - 1]);
if (Result.BitWidth != BitWidth) {
- lshrNear(Result.pVal, Result.pVal, getNumWords(),
- Result.BitWidth - BitWidth);
+ Result.lshrInPlace(Result.BitWidth - BitWidth);
Result.BitWidth = BitWidth;
}
return Result;
@@ -798,14 +789,46 @@ APInt APInt::reverseBits() const {
return Reversed;
}
-APInt llvm::APIntOps::GreatestCommonDivisor(const APInt& API1,
- const APInt& API2) {
- APInt A = API1, B = API2;
- while (!!B) {
- APInt T = B;
- B = APIntOps::urem(A, B);
- A = T;
+APInt llvm::APIntOps::GreatestCommonDivisor(APInt A, APInt B) {
+ // Fast-path a common case.
+ if (A == B) return A;
+
+ // Corner cases: if either operand is zero, the other is the gcd.
+ if (!A) return B;
+ if (!B) return A;
+
+ // Count common powers of 2 and remove all other powers of 2.
+ unsigned Pow2;
+ {
+ unsigned Pow2_A = A.countTrailingZeros();
+ unsigned Pow2_B = B.countTrailingZeros();
+ if (Pow2_A > Pow2_B) {
+ A.lshrInPlace(Pow2_A - Pow2_B);
+ Pow2 = Pow2_B;
+ } else if (Pow2_B > Pow2_A) {
+ B.lshrInPlace(Pow2_B - Pow2_A);
+ Pow2 = Pow2_A;
+ } else {
+ Pow2 = Pow2_A;
+ }
+ }
+
+ // Both operands are odd multiples of 2^Pow2:
+ //
+ // gcd(a, b) = gcd(|a - b| / 2^i, min(a, b))
+ //
+ // This is a modified version of Stein's algorithm, taking advantage of
+ // efficient countTrailingZeros().
+ while (A != B) {
+ if (A.ugt(B)) {
+ A -= B;
+ A.lshrInPlace(A.countTrailingZeros() - Pow2);
+ } else {
+ B -= A;
+ B.lshrInPlace(B.countTrailingZeros() - Pow2);
+ }
}
+
return A;
}
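// Usage sketch with a hand-checkable pair (standard APInt API assumed):
//
//   APInt A(32, 48), B(32, 18);  // 48 = 2^4 * 3, 18 = 2 * 3^2
//   APInt G = APIntOps::GreatestCommonDivisor(A, B);  // G == 6
//
// Here one shared power of two is factored out up front (Pow2 == 1), and the
// loop runs Stein's subtract-and-shift steps on the remaining factors.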
@@ -1117,68 +1140,59 @@ APInt APInt::lshr(const APInt &shiftAmt) const {
return lshr((unsigned)shiftAmt.getLimitedValue(BitWidth));
}
+/// Perform a logical right-shift from Src to Dst of Words words, by Shift,
+/// which must be less than 64. If the source and destination ranges overlap,
+/// we require that Src >= Dst (put another way, we require that the overall
+/// operation is a right shift on the combined range).
+static void lshrWords(APInt::WordType *Dst, APInt::WordType *Src,
+ unsigned Words, unsigned Shift) {
+ assert(Shift < APInt::APINT_BITS_PER_WORD);
+
+ if (!Words)
+ return;
+
+ if (Shift == 0) {
+ std::memmove(Dst, Src, Words * APInt::APINT_WORD_SIZE);
+ return;
+ }
+
+ uint64_t Low = Src[0];
+ for (unsigned I = 1; I != Words; ++I) {
+ uint64_t High = Src[I];
+ Dst[I - 1] =
+ (Low >> Shift) | (High << (APInt::APINT_BITS_PER_WORD - Shift));
+ Low = High;
+ }
+ Dst[Words - 1] = Low >> Shift;
+}
+
/// Logical right-shift this APInt by shiftAmt.
/// @brief Logical right-shift function.
-APInt APInt::lshr(unsigned shiftAmt) const {
+void APInt::lshrInPlace(unsigned shiftAmt) {
if (isSingleWord()) {
if (shiftAmt >= BitWidth)
- return APInt(BitWidth, 0);
+ VAL = 0;
else
- return APInt(BitWidth, this->VAL >> shiftAmt);
- }
-
- // If all the bits were shifted out, the result is 0. This avoids issues
- // with shifting by the size of the integer type, which produces undefined
- // results. We define these "undefined results" to always be 0.
- if (shiftAmt >= BitWidth)
- return APInt(BitWidth, 0);
-
- // If none of the bits are shifted out, the result is *this. This avoids
- // issues with shifting by the size of the integer type, which produces
- // undefined results in the code below. This is also an optimization.
- if (shiftAmt == 0)
- return *this;
-
- // Create some space for the result.
- uint64_t * val = new uint64_t[getNumWords()];
-
- // If we are shifting less than a word, compute the shift with a simple carry
- if (shiftAmt < APINT_BITS_PER_WORD) {
- lshrNear(val, pVal, getNumWords(), shiftAmt);
- APInt Result(val, BitWidth);
- Result.clearUnusedBits();
- return Result;
+ VAL >>= shiftAmt;
+ return;
}
- // Compute some values needed by the remaining shift algorithms
- unsigned wordShift = shiftAmt % APINT_BITS_PER_WORD;
- unsigned offset = shiftAmt / APINT_BITS_PER_WORD;
+ // Don't bother performing a no-op shift.
+ if (!shiftAmt)
+ return;
- // If we are shifting whole words, just move whole words
- if (wordShift == 0) {
- for (unsigned i = 0; i < getNumWords() - offset; ++i)
- val[i] = pVal[i+offset];
- for (unsigned i = getNumWords()-offset; i < getNumWords(); i++)
- val[i] = 0;
- APInt Result(val, BitWidth);
- Result.clearUnusedBits();
- return Result;
- }
+ // Find number of complete words being shifted out and zeroed.
+ const unsigned Words = getNumWords();
+ const unsigned ShiftFullWords =
+ std::min(shiftAmt / APINT_BITS_PER_WORD, Words);
- // Shift the low order words
- unsigned breakWord = getNumWords() - offset -1;
- for (unsigned i = 0; i < breakWord; ++i)
- val[i] = (pVal[i+offset] >> wordShift) |
- (pVal[i+offset+1] << (APINT_BITS_PER_WORD - wordShift));
- // Shift the break word.
- val[breakWord] = pVal[breakWord+offset] >> wordShift;
+ // Fill in first Words - ShiftFullWords by shifting.
+ lshrWords(pVal, pVal + ShiftFullWords, Words - ShiftFullWords,
+ shiftAmt % APINT_BITS_PER_WORD);
- // Remaining words are 0
- for (unsigned i = breakWord+1; i < getNumWords(); ++i)
- val[i] = 0;
- APInt Result(val, BitWidth);
- Result.clearUnusedBits();
- return Result;
+ // The remaining high words are all zero.
+ for (unsigned I = Words - ShiftFullWords; I != Words; ++I)
+ pVal[I] = 0;
}
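// In-place usage sketch (assumes the existing getHighBitsSet helper):
//
//   APInt X = APInt::getHighBitsSet(128, 8);  // bits [120,128) set
//   X.lshrInPlace(64);                        // now bits [56,64) set
//   // X.getRawData()[0] == 0xFF00000000000000; no temporary is allocated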
/// Left-shift this APInt by shiftAmt.
@@ -1244,8 +1258,21 @@ APInt APInt::shlSlowCase(unsigned shiftAmt) const {
return Result;
}
+// Calculate the rotate amount modulo the bit width.
+static unsigned rotateModulo(unsigned BitWidth, const APInt &rotateAmt) {
+ unsigned rotBitWidth = rotateAmt.getBitWidth();
+ APInt rot = rotateAmt;
+ if (rotBitWidth < BitWidth) {
+ // Extend the rotate APInt, so that the urem doesn't divide by 0.
+ // e.g. APInt(1, 32) would give APInt(1, 0).
+ rot = rotateAmt.zext(BitWidth);
+ }
+ rot = rot.urem(APInt(rot.getBitWidth(), BitWidth));
+ return rot.getLimitedValue(BitWidth);
+}
+
APInt APInt::rotl(const APInt &rotateAmt) const {
- return rotl((unsigned)rotateAmt.getLimitedValue(BitWidth));
+ return rotl(rotateModulo(BitWidth, rotateAmt));
}
APInt APInt::rotl(unsigned rotateAmt) const {
@@ -1256,7 +1283,7 @@ APInt APInt::rotl(unsigned rotateAmt) const {
}
APInt APInt::rotr(const APInt &rotateAmt) const {
- return rotr((unsigned)rotateAmt.getLimitedValue(BitWidth));
+ return rotr(rotateModulo(BitWidth, rotateAmt));
}
APInt APInt::rotr(unsigned rotateAmt) const {
@@ -1618,7 +1645,7 @@ static void KnuthDiv(unsigned *u, unsigned *v, unsigned *q, unsigned* r,
if (r) {
// The value d is expressed by the "shift" value above since we avoided
// multiplication by d by using a shift left. So, all we have to do is
- // shift right here. In order to mak
+ // shift right here.
if (shift) {
unsigned carry = 0;
DEBUG(dbgs() << "KnuthDiv: remainder:");
@@ -2014,7 +2041,7 @@ APInt APInt::sdiv_ov(const APInt &RHS, bool &Overflow) const {
APInt APInt::smul_ov(const APInt &RHS, bool &Overflow) const {
APInt Res = *this * RHS;
-
+
if (*this != 0 && RHS != 0)
Overflow = Res.sdiv(RHS) != *this || Res.sdiv(*this) != RHS;
else
@@ -2041,7 +2068,7 @@ APInt APInt::sshl_ov(const APInt &ShAmt, bool &Overflow) const {
Overflow = ShAmt.uge(countLeadingZeros());
else
Overflow = ShAmt.uge(countLeadingOnes());
-
+
return *this << ShAmt;
}
@@ -2061,7 +2088,7 @@ APInt APInt::ushl_ov(const APInt &ShAmt, bool &Overflow) const {
void APInt::fromString(unsigned numbits, StringRef str, uint8_t radix) {
// Check our assumptions here
assert(!str.empty() && "Invalid string length");
- assert((radix == 10 || radix == 8 || radix == 16 || radix == 2 ||
+ assert((radix == 10 || radix == 8 || radix == 16 || radix == 2 ||
radix == 36) &&
"Radix should be 2, 8, 10, 16, or 36!");
@@ -2086,9 +2113,8 @@ void APInt::fromString(unsigned numbits, StringRef str, uint8_t radix) {
// Figure out if we can shift instead of multiply
unsigned shift = (radix == 16 ? 4 : radix == 8 ? 3 : radix == 2 ? 1 : 0);
- // Set up an APInt for the digit to add outside the loop so we don't
+ // Set up an APInt for the radix multiplier outside the loop so we don't
// constantly construct/destruct it.
- APInt apdigit(getBitWidth(), 0);
APInt apradix(getBitWidth(), radix);
// Enter digit traversal loop
@@ -2105,11 +2131,7 @@ void APInt::fromString(unsigned numbits, StringRef str, uint8_t radix) {
}
// Add in the digit we just interpreted
- if (apdigit.isSingleWord())
- apdigit.VAL = digit;
- else
- apdigit.pVal[0] = digit;
- *this += apdigit;
+ *this += digit;
}
// If its negative, put it in two's complement form
if (isNeg) {
@@ -2120,7 +2142,7 @@ void APInt::fromString(unsigned numbits, StringRef str, uint8_t radix) {
void APInt::toString(SmallVectorImpl<char> &Str, unsigned Radix,
bool Signed, bool formatAsCLiteral) const {
- assert((Radix == 10 || Radix == 8 || Radix == 16 || Radix == 2 ||
+ assert((Radix == 10 || Radix == 8 || Radix == 16 || Radix == 2 ||
Radix == 36) &&
"Radix should be 2, 8, 10, 16, or 36!");
@@ -2208,7 +2230,7 @@ void APInt::toString(SmallVectorImpl<char> &Str, unsigned Radix,
// For the 2, 8 and 16 bit cases, we can just shift instead of divide
// because the number of bits per digit (1, 3 and 4 respectively) divides
- // equaly. We just shift until the value is zero.
+ // equally. We just shift until the value is zero.
if (Radix == 2 || Radix == 8 || Radix == 16) {
// Just shift tmp right for each digit width until it becomes zero
unsigned ShiftAmt = (Radix == 16 ? 4 : (Radix == 8 ? 3 : 1));
@@ -2245,14 +2267,15 @@ std::string APInt::toString(unsigned Radix = 10, bool Signed = true) const {
return S.str();
}
-
+#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
LLVM_DUMP_METHOD void APInt::dump() const {
SmallString<40> S, U;
this->toStringUnsigned(U);
this->toStringSigned(S);
dbgs() << "APInt(" << BitWidth << "b, "
- << U << "u " << S << "s)";
+ << U << "u " << S << "s)\n";
}
+#endif
void APInt::print(raw_ostream &OS, bool isSigned) const {
SmallString<40> S;
@@ -2265,83 +2288,60 @@ void APInt::print(raw_ostream &OS, bool isSigned) const {
 // Assumed by lowHalf, highHalf, partMSB and partLSB. A fairly safe
 // and unrestrictive assumption.
-static_assert(integerPartWidth % 2 == 0, "Part width must be divisible by 2!");
+static_assert(APInt::APINT_BITS_PER_WORD % 2 == 0,
+ "Part width must be divisible by 2!");
/* Some handy functions local to this file. */
-namespace {
- /* Returns the integer part with the least significant BITS set.
- BITS cannot be zero. */
- static inline integerPart
- lowBitMask(unsigned int bits)
- {
- assert(bits != 0 && bits <= integerPartWidth);
+/* Returns the integer part with the least significant BITS set.
+ BITS cannot be zero. */
+static inline APInt::WordType lowBitMask(unsigned bits) {
+ assert(bits != 0 && bits <= APInt::APINT_BITS_PER_WORD);
- return ~(integerPart) 0 >> (integerPartWidth - bits);
- }
+ return ~(APInt::WordType) 0 >> (APInt::APINT_BITS_PER_WORD - bits);
+}
- /* Returns the value of the lower half of PART. */
- static inline integerPart
- lowHalf(integerPart part)
- {
- return part & lowBitMask(integerPartWidth / 2);
- }
+/* Returns the value of the lower half of PART. */
+static inline APInt::WordType lowHalf(APInt::WordType part) {
+ return part & lowBitMask(APInt::APINT_BITS_PER_WORD / 2);
+}
- /* Returns the value of the upper half of PART. */
- static inline integerPart
- highHalf(integerPart part)
- {
- return part >> (integerPartWidth / 2);
- }
+/* Returns the value of the upper half of PART. */
+static inline APInt::WordType highHalf(APInt::WordType part) {
+ return part >> (APInt::APINT_BITS_PER_WORD / 2);
+}
- /* Returns the bit number of the most significant set bit of a part.
- If the input number has no bits set -1U is returned. */
- static unsigned int
- partMSB(integerPart value)
- {
- return findLastSet(value, ZB_Max);
- }
+/* Returns the bit number of the most significant set bit of a part.
+ If the input number has no bits set -1U is returned. */
+static unsigned partMSB(APInt::WordType value) {
+ return findLastSet(value, ZB_Max);
+}
- /* Returns the bit number of the least significant set bit of a
- part. If the input number has no bits set -1U is returned. */
- static unsigned int
- partLSB(integerPart value)
- {
- return findFirstSet(value, ZB_Max);
- }
+/* Returns the bit number of the least significant set bit of a
+ part. If the input number has no bits set -1U is returned. */
+static unsigned partLSB(APInt::WordType value) {
+ return findFirstSet(value, ZB_Max);
}
/* Sets the least significant part of a bignum to the input value, and
zeroes out higher parts. */
-void
-APInt::tcSet(integerPart *dst, integerPart part, unsigned int parts)
-{
- unsigned int i;
-
+void APInt::tcSet(WordType *dst, WordType part, unsigned parts) {
assert(parts > 0);
dst[0] = part;
- for (i = 1; i < parts; i++)
+ for (unsigned i = 1; i < parts; i++)
dst[i] = 0;
}
/* Assign one bignum to another. */
-void
-APInt::tcAssign(integerPart *dst, const integerPart *src, unsigned int parts)
-{
- unsigned int i;
-
- for (i = 0; i < parts; i++)
+void APInt::tcAssign(WordType *dst, const WordType *src, unsigned parts) {
+ for (unsigned i = 0; i < parts; i++)
dst[i] = src[i];
}
/* Returns true if a bignum is zero, false otherwise. */
-bool
-APInt::tcIsZero(const integerPart *src, unsigned int parts)
-{
- unsigned int i;
-
- for (i = 0; i < parts; i++)
+bool APInt::tcIsZero(const WordType *src, unsigned parts) {
+ for (unsigned i = 0; i < parts; i++)
if (src[i])
return false;
@@ -2349,41 +2349,29 @@ APInt::tcIsZero(const integerPart *src, unsigned int parts)
}
/* Extract the given bit of a bignum; returns 0 or 1. */
-int
-APInt::tcExtractBit(const integerPart *parts, unsigned int bit)
-{
- return (parts[bit / integerPartWidth] &
- ((integerPart) 1 << bit % integerPartWidth)) != 0;
+int APInt::tcExtractBit(const WordType *parts, unsigned bit) {
+ return (parts[whichWord(bit)] & maskBit(bit)) != 0;
}
/* Set the given bit of a bignum. */
-void
-APInt::tcSetBit(integerPart *parts, unsigned int bit)
-{
- parts[bit / integerPartWidth] |= (integerPart) 1 << (bit % integerPartWidth);
+void APInt::tcSetBit(WordType *parts, unsigned bit) {
+ parts[whichWord(bit)] |= maskBit(bit);
}
/* Clears the given bit of a bignum. */
-void
-APInt::tcClearBit(integerPart *parts, unsigned int bit)
-{
- parts[bit / integerPartWidth] &=
- ~((integerPart) 1 << (bit % integerPartWidth));
+void APInt::tcClearBit(WordType *parts, unsigned bit) {
+ parts[whichWord(bit)] &= ~maskBit(bit);
}
/* Returns the bit number of the least significant set bit of a
number. If the input number has no bits set -1U is returned. */
-unsigned int
-APInt::tcLSB(const integerPart *parts, unsigned int n)
-{
- unsigned int i, lsb;
-
- for (i = 0; i < n; i++) {
- if (parts[i] != 0) {
- lsb = partLSB(parts[i]);
+unsigned APInt::tcLSB(const WordType *parts, unsigned n) {
+ for (unsigned i = 0; i < n; i++) {
+ if (parts[i] != 0) {
+ unsigned lsb = partLSB(parts[i]);
- return lsb + i * integerPartWidth;
- }
+ return lsb + i * APINT_BITS_PER_WORD;
+ }
}
return -1U;
@@ -2391,18 +2379,14 @@ APInt::tcLSB(const integerPart *parts, unsigned int n)
/* Returns the bit number of the most significant set bit of a number.
If the input number has no bits set -1U is returned. */
-unsigned int
-APInt::tcMSB(const integerPart *parts, unsigned int n)
-{
- unsigned int msb;
-
+unsigned APInt::tcMSB(const WordType *parts, unsigned n) {
do {
--n;
if (parts[n] != 0) {
- msb = partMSB(parts[n]);
+ unsigned msb = partMSB(parts[n]);
- return msb + n * integerPartWidth;
+ return msb + n * APINT_BITS_PER_WORD;
}
} while (n);
@@ -2414,31 +2398,28 @@ APInt::tcMSB(const integerPart *parts, unsigned int n)
the least significant bit of DST. All high bits above srcBITS in
DST are zero-filled. */
void
-APInt::tcExtract(integerPart *dst, unsigned int dstCount,const integerPart *src,
- unsigned int srcBits, unsigned int srcLSB)
-{
- unsigned int firstSrcPart, dstParts, shift, n;
-
- dstParts = (srcBits + integerPartWidth - 1) / integerPartWidth;
+APInt::tcExtract(WordType *dst, unsigned dstCount, const WordType *src,
+ unsigned srcBits, unsigned srcLSB) {
+ unsigned dstParts = (srcBits + APINT_BITS_PER_WORD - 1) / APINT_BITS_PER_WORD;
assert(dstParts <= dstCount);
- firstSrcPart = srcLSB / integerPartWidth;
+ unsigned firstSrcPart = srcLSB / APINT_BITS_PER_WORD;
tcAssign (dst, src + firstSrcPart, dstParts);
- shift = srcLSB % integerPartWidth;
+ unsigned shift = srcLSB % APINT_BITS_PER_WORD;
tcShiftRight (dst, dstParts, shift);
- /* We now have (dstParts * integerPartWidth - shift) bits from SRC
+ /* We now have (dstParts * APINT_BITS_PER_WORD - shift) bits from SRC
    in DST. If this is less than srcBits, append the rest, else
clear the high bits. */
- n = dstParts * integerPartWidth - shift;
+ unsigned n = dstParts * APINT_BITS_PER_WORD - shift;
if (n < srcBits) {
- integerPart mask = lowBitMask (srcBits - n);
+ WordType mask = lowBitMask (srcBits - n);
dst[dstParts - 1] |= ((src[firstSrcPart + dstParts] & mask)
- << n % integerPartWidth);
+ << n % APINT_BITS_PER_WORD);
} else if (n > srcBits) {
- if (srcBits % integerPartWidth)
- dst[dstParts - 1] &= lowBitMask (srcBits % integerPartWidth);
+ if (srcBits % APINT_BITS_PER_WORD)
+ dst[dstParts - 1] &= lowBitMask (srcBits % APINT_BITS_PER_WORD);
}
/* Clear high parts. */
@@ -2447,18 +2428,12 @@ APInt::tcExtract(integerPart *dst, unsigned int dstCount,const integerPart *src,
}
/* DST += RHS + C where C is zero or one. Returns the carry flag. */
-integerPart
-APInt::tcAdd(integerPart *dst, const integerPart *rhs,
- integerPart c, unsigned int parts)
-{
- unsigned int i;
-
+APInt::WordType APInt::tcAdd(WordType *dst, const WordType *rhs,
+ WordType c, unsigned parts) {
assert(c <= 1);
- for (i = 0; i < parts; i++) {
- integerPart l;
-
- l = dst[i];
+ for (unsigned i = 0; i < parts; i++) {
+ WordType l = dst[i];
if (c) {
dst[i] += rhs[i] + 1;
c = (dst[i] <= l);
@@ -2471,19 +2446,29 @@ APInt::tcAdd(integerPart *dst, const integerPart *rhs,
return c;
}
-/* DST -= RHS + C where C is zero or one. Returns the carry flag. */
-integerPart
-APInt::tcSubtract(integerPart *dst, const integerPart *rhs,
- integerPart c, unsigned int parts)
-{
- unsigned int i;
+/// This function adds a single "word" integer, src, to the multiple
+/// "word" integer array, dst[]. dst[] is modified to reflect the addition and
+/// 1 is returned if there is a carry out, otherwise 0 is returned.
+/// @returns the carry of the addition.
+APInt::WordType APInt::tcAddPart(WordType *dst, WordType src,
+ unsigned parts) {
+ for (unsigned i = 0; i < parts; ++i) {
+ dst[i] += src;
+ if (dst[i] >= src)
+ return 0; // No need to carry so exit early.
+ src = 1; // Carry one to next digit.
+ }
- assert(c <= 1);
+ return 1;
+}
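
// A hand-worked sketch (not from the patch) exercising tcAddPart on a
// two-word bignum: the early exit when a word absorbs the carry, and the
// carry-out when every word wraps.
#include "llvm/ADT/APInt.h"

void tcAddPartExamples() {
  llvm::APInt::WordType A[2] = {~0ULL, 0}; // value 2^64 - 1
  llvm::APInt::WordType C = llvm::APInt::tcAddPart(A, 1, 2);
  // A is now {0, 1} (value 2^64); the carry out C is 0.
  llvm::APInt::WordType B[2] = {~0ULL, ~0ULL}; // value 2^128 - 1
  C = llvm::APInt::tcAddPart(B, 1, 2);
  // Every word wrapped to zero, so the carry out C is 1.
  (void)C;
}
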
- for (i = 0; i < parts; i++) {
- integerPart l;
+/* DST -= RHS + C where C is zero or one. Returns the carry flag. */
+APInt::WordType APInt::tcSubtract(WordType *dst, const WordType *rhs,
+ WordType c, unsigned parts) {
+ assert(c <= 1);
- l = dst[i];
+ for (unsigned i = 0; i < parts; i++) {
+ WordType l = dst[i];
if (c) {
dst[i] -= rhs[i] + 1;
c = (dst[i] >= l);
@@ -2496,10 +2481,28 @@ APInt::tcSubtract(integerPart *dst, const integerPart *rhs,
return c;
}
+/// This function subtracts a single "word" (64-bit word), src, from
+/// the multi-word integer array, dst[], propagating the borrowed 1 value until
+/// no further borrowing is needed or it runs out of "words" in dst. The result
+/// is 1 if "borrowing" exhausted the digits in dst, or 0 if dst was not
+/// exhausted. In other words, if src > dst then this function returns 1,
+/// otherwise 0.
+/// @returns the borrow out of the subtraction
+APInt::WordType APInt::tcSubtractPart(WordType *dst, WordType src,
+ unsigned parts) {
+ for (unsigned i = 0; i < parts; ++i) {
+ WordType Dst = dst[i];
+ dst[i] -= src;
+ if (src <= Dst)
+ return 0; // No need to borrow so exit early.
+ src = 1; // We have to "borrow 1" from next "word"
+ }
+
+ return 1;
+}
+
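
// The mirror-image sketch (not from the patch) for tcSubtractPart: the
// borrow propagates until some word can absorb it, and the return value
// is the borrow out.
#include "llvm/ADT/APInt.h"

void tcSubtractPartExamples() {
  llvm::APInt::WordType A[2] = {0, 1}; // value 2^64
  llvm::APInt::WordType B = llvm::APInt::tcSubtractPart(A, 1, 2);
  // A is now {2^64 - 1, 0}; the borrow out B is 0.
  llvm::APInt::WordType Z[2] = {0, 0};
  B = llvm::APInt::tcSubtractPart(Z, 1, 2);
  // src > dst, so every word wrapped and the borrow out B is 1.
  (void)B;
}
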
/* Negate a bignum in-place. */
-void
-APInt::tcNegate(integerPart *dst, unsigned int parts)
-{
+void APInt::tcNegate(WordType *dst, unsigned parts) {
tcComplement(dst, parts);
tcIncrement(dst, parts);
}
@@ -2515,23 +2518,20 @@ APInt::tcNegate(integerPart *dst, unsigned int parts)
DSTPARTS parts of the result, and if all of the omitted higher
parts were zero return zero, otherwise overflow occurred and
return one. */
-int
-APInt::tcMultiplyPart(integerPart *dst, const integerPart *src,
- integerPart multiplier, integerPart carry,
- unsigned int srcParts, unsigned int dstParts,
- bool add)
-{
- unsigned int i, n;
-
+int APInt::tcMultiplyPart(WordType *dst, const WordType *src,
+ WordType multiplier, WordType carry,
+ unsigned srcParts, unsigned dstParts,
+ bool add) {
/* Otherwise our writes of DST kill our later reads of SRC. */
assert(dst <= src || dst >= src + srcParts);
assert(dstParts <= srcParts + 1);
/* N loops; minimum of dstParts and srcParts. */
- n = dstParts < srcParts ? dstParts: srcParts;
+ unsigned n = dstParts < srcParts ? dstParts: srcParts;
+ unsigned i;
for (i = 0; i < n; i++) {
- integerPart low, mid, high, srcPart;
+ WordType low, mid, high, srcPart;
/* [ LOW, HIGH ] = MULTIPLIER * SRC[i] + DST[i] + CARRY.
@@ -2543,7 +2543,7 @@ APInt::tcMultiplyPart(integerPart *dst, const integerPart *src,
srcPart = src[i];
- if (multiplier == 0 || srcPart == 0) {
+ if (multiplier == 0 || srcPart == 0) {
low = carry;
high = 0;
} else {
@@ -2552,14 +2552,14 @@ APInt::tcMultiplyPart(integerPart *dst, const integerPart *src,
mid = lowHalf(srcPart) * highHalf(multiplier);
high += highHalf(mid);
- mid <<= integerPartWidth / 2;
+ mid <<= APINT_BITS_PER_WORD / 2;
if (low + mid < low)
high++;
low += mid;
mid = highHalf(srcPart) * lowHalf(multiplier);
high += highHalf(mid);
- mid <<= integerPartWidth / 2;
+ mid <<= APINT_BITS_PER_WORD / 2;
if (low + mid < low)
high++;
low += mid;
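
// A standalone sketch (not from the patch) of the half-word technique used
// by tcMultiplyPart: a 64x64 -> 128 bit product assembled from four 32-bit
// partial products, with the same carry checks as the code above. The name
// mul64x64 is local to this illustration.
#include <cstdint>

static void mul64x64(uint64_t A, uint64_t B, uint64_t &Hi, uint64_t &Lo) {
  const uint64_t LoMask = 0xFFFFFFFFULL;
  uint64_t AL = A & LoMask, AH = A >> 32;
  uint64_t BL = B & LoMask, BH = B >> 32;
  Lo = AL * BL; // low partial product
  Hi = AH * BH; // high partial product
  for (uint64_t Mid : {AL * BH, AH * BL}) {
    Hi += Mid >> 32;   // upper half of the cross term
    Mid <<= 32;        // lower half, aligned into the low word
    if (Lo + Mid < Lo) // detect carry out of the low word
      ++Hi;
    Lo += Mid;
  }
}
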
@@ -2608,19 +2608,14 @@ APInt::tcMultiplyPart(integerPart *dst, const integerPart *src,
is filled with the least significant parts of the result. Returns
one if overflow occurred, otherwise zero. DST must be disjoint
from both operands. */
-int
-APInt::tcMultiply(integerPart *dst, const integerPart *lhs,
- const integerPart *rhs, unsigned int parts)
-{
- unsigned int i;
- int overflow;
-
+int APInt::tcMultiply(WordType *dst, const WordType *lhs,
+ const WordType *rhs, unsigned parts) {
assert(dst != lhs && dst != rhs);
- overflow = 0;
+ int overflow = 0;
tcSet(dst, 0, parts);
- for (i = 0; i < parts; i++)
+ for (unsigned i = 0; i < parts; i++)
overflow |= tcMultiplyPart(&dst[i], lhs, rhs[i], 0, parts,
parts - i, true);
@@ -2631,25 +2626,21 @@ APInt::tcMultiply(integerPart *dst, const integerPart *lhs,
operands. No overflow occurs. DST must be disjoint from both
operands. Returns the number of parts required to hold the
result. */
-unsigned int
-APInt::tcFullMultiply(integerPart *dst, const integerPart *lhs,
- const integerPart *rhs, unsigned int lhsParts,
- unsigned int rhsParts)
-{
+unsigned APInt::tcFullMultiply(WordType *dst, const WordType *lhs,
+ const WordType *rhs, unsigned lhsParts,
+ unsigned rhsParts) {
/* Put the narrower number on the LHS for less loops below. */
if (lhsParts > rhsParts) {
return tcFullMultiply (dst, rhs, lhs, rhsParts, lhsParts);
} else {
- unsigned int n;
-
assert(dst != lhs && dst != rhs);
tcSet(dst, 0, rhsParts);
- for (n = 0; n < lhsParts; n++)
- tcMultiplyPart(&dst[n], rhs, lhs[n], 0, rhsParts, rhsParts + 1, true);
+ for (unsigned i = 0; i < lhsParts; i++)
+ tcMultiplyPart(&dst[i], rhs, lhs[i], 0, rhsParts, rhsParts + 1, true);
- n = lhsParts + rhsParts;
+ unsigned n = lhsParts + rhsParts;
return n - (dst[n - 1] == 0);
}
@@ -2665,23 +2656,18 @@ APInt::tcFullMultiply(integerPart *dst, const integerPart *lhs,
use by the routine; its contents need not be initialized and are
destroyed. LHS, REMAINDER and SCRATCH must be distinct.
*/
-int
-APInt::tcDivide(integerPart *lhs, const integerPart *rhs,
- integerPart *remainder, integerPart *srhs,
- unsigned int parts)
-{
- unsigned int n, shiftCount;
- integerPart mask;
-
+int APInt::tcDivide(WordType *lhs, const WordType *rhs,
+ WordType *remainder, WordType *srhs,
+ unsigned parts) {
assert(lhs != remainder && lhs != srhs && remainder != srhs);
- shiftCount = tcMSB(rhs, parts) + 1;
+ unsigned shiftCount = tcMSB(rhs, parts) + 1;
if (shiftCount == 0)
return true;
- shiftCount = parts * integerPartWidth - shiftCount;
- n = shiftCount / integerPartWidth;
- mask = (integerPart) 1 << (shiftCount % integerPartWidth);
+ shiftCount = parts * APINT_BITS_PER_WORD - shiftCount;
+ unsigned n = shiftCount / APINT_BITS_PER_WORD;
+ WordType mask = (WordType) 1 << (shiftCount % APINT_BITS_PER_WORD);
tcAssign(srhs, rhs, parts);
tcShiftLeft(srhs, parts, shiftCount);
@@ -2704,7 +2690,7 @@ APInt::tcDivide(integerPart *lhs, const integerPart *rhs,
shiftCount--;
tcShiftRight(srhs, parts, 1);
if ((mask >>= 1) == 0) {
- mask = (integerPart) 1 << (integerPartWidth - 1);
+ mask = (WordType) 1 << (APINT_BITS_PER_WORD - 1);
n--;
}
}
@@ -2714,18 +2700,14 @@ APInt::tcDivide(integerPart *lhs, const integerPart *rhs,
/* Shift a bignum left COUNT bits in-place. Shifted in bits are zero.
There are no restrictions on COUNT. */
-void
-APInt::tcShiftLeft(integerPart *dst, unsigned int parts, unsigned int count)
-{
+void APInt::tcShiftLeft(WordType *dst, unsigned parts, unsigned count) {
if (count) {
- unsigned int jump, shift;
-
     /* Jump is the inter-part jump; shift is the intra-part shift. */
- jump = count / integerPartWidth;
- shift = count % integerPartWidth;
+ unsigned jump = count / APINT_BITS_PER_WORD;
+ unsigned shift = count % APINT_BITS_PER_WORD;
while (parts > jump) {
- integerPart part;
+ WordType part;
parts--;
@@ -2735,7 +2717,7 @@ APInt::tcShiftLeft(integerPart *dst, unsigned int parts, unsigned int count)
if (shift) {
part <<= shift;
if (parts >= jump + 1)
- part |= dst[parts - jump - 1] >> (integerPartWidth - shift);
+ part |= dst[parts - jump - 1] >> (APINT_BITS_PER_WORD - shift);
}
dst[parts] = part;
@@ -2748,20 +2730,16 @@ APInt::tcShiftLeft(integerPart *dst, unsigned int parts, unsigned int count)
/* Shift a bignum right COUNT bits in-place. Shifted in bits are
zero. There are no restrictions on COUNT. */
-void
-APInt::tcShiftRight(integerPart *dst, unsigned int parts, unsigned int count)
-{
+void APInt::tcShiftRight(WordType *dst, unsigned parts, unsigned count) {
if (count) {
- unsigned int i, jump, shift;
-
     /* Jump is the inter-part jump; shift is the intra-part shift. */
- jump = count / integerPartWidth;
- shift = count % integerPartWidth;
+ unsigned jump = count / APINT_BITS_PER_WORD;
+ unsigned shift = count % APINT_BITS_PER_WORD;
/* Perform the shift. This leaves the most significant COUNT bits
of the result at zero. */
- for (i = 0; i < parts; i++) {
- integerPart part;
+ for (unsigned i = 0; i < parts; i++) {
+ WordType part;
if (i + jump >= parts) {
part = 0;
@@ -2770,7 +2748,7 @@ APInt::tcShiftRight(integerPart *dst, unsigned int parts, unsigned int count)
if (shift) {
part >>= shift;
if (i + jump + 1 < parts)
- part |= dst[i + jump + 1] << (integerPartWidth - shift);
+ part |= dst[i + jump + 1] << (APINT_BITS_PER_WORD - shift);
}
}
@@ -2780,107 +2758,55 @@ APInt::tcShiftRight(integerPart *dst, unsigned int parts, unsigned int count)
}
/* Bitwise and of two bignums. */
-void
-APInt::tcAnd(integerPart *dst, const integerPart *rhs, unsigned int parts)
-{
- unsigned int i;
-
- for (i = 0; i < parts; i++)
+void APInt::tcAnd(WordType *dst, const WordType *rhs, unsigned parts) {
+ for (unsigned i = 0; i < parts; i++)
dst[i] &= rhs[i];
}
/* Bitwise inclusive or of two bignums. */
-void
-APInt::tcOr(integerPart *dst, const integerPart *rhs, unsigned int parts)
-{
- unsigned int i;
-
- for (i = 0; i < parts; i++)
+void APInt::tcOr(WordType *dst, const WordType *rhs, unsigned parts) {
+ for (unsigned i = 0; i < parts; i++)
dst[i] |= rhs[i];
}
/* Bitwise exclusive or of two bignums. */
-void
-APInt::tcXor(integerPart *dst, const integerPart *rhs, unsigned int parts)
-{
- unsigned int i;
-
- for (i = 0; i < parts; i++)
+void APInt::tcXor(WordType *dst, const WordType *rhs, unsigned parts) {
+ for (unsigned i = 0; i < parts; i++)
dst[i] ^= rhs[i];
}
/* Complement a bignum in-place. */
-void
-APInt::tcComplement(integerPart *dst, unsigned int parts)
-{
- unsigned int i;
-
- for (i = 0; i < parts; i++)
+void APInt::tcComplement(WordType *dst, unsigned parts) {
+ for (unsigned i = 0; i < parts; i++)
dst[i] = ~dst[i];
}
/* Comparison (unsigned) of two bignums. */
-int
-APInt::tcCompare(const integerPart *lhs, const integerPart *rhs,
- unsigned int parts)
-{
+int APInt::tcCompare(const WordType *lhs, const WordType *rhs,
+ unsigned parts) {
while (parts) {
- parts--;
- if (lhs[parts] == rhs[parts])
- continue;
+ parts--;
+ if (lhs[parts] == rhs[parts])
+ continue;
- if (lhs[parts] > rhs[parts])
- return 1;
- else
- return -1;
- }
+ return (lhs[parts] > rhs[parts]) ? 1 : -1;
+ }
return 0;
}
-/* Increment a bignum in-place, return the carry flag. */
-integerPart
-APInt::tcIncrement(integerPart *dst, unsigned int parts)
-{
- unsigned int i;
-
- for (i = 0; i < parts; i++)
- if (++dst[i] != 0)
- break;
-
- return i == parts;
-}
-
-/* Decrement a bignum in-place, return the borrow flag. */
-integerPart
-APInt::tcDecrement(integerPart *dst, unsigned int parts) {
- for (unsigned int i = 0; i < parts; i++) {
- // If the current word is non-zero, then the decrement has no effect on the
- // higher-order words of the integer and no borrow can occur. Exit early.
- if (dst[i]--)
- return 0;
- }
- // If every word was zero, then there is a borrow.
- return 1;
-}
-
-
/* Set the least significant BITS bits of a bignum, clear the
rest. */
-void
-APInt::tcSetLeastSignificantBits(integerPart *dst, unsigned int parts,
- unsigned int bits)
-{
- unsigned int i;
-
- i = 0;
- while (bits > integerPartWidth) {
- dst[i++] = ~(integerPart) 0;
- bits -= integerPartWidth;
+void APInt::tcSetLeastSignificantBits(WordType *dst, unsigned parts,
+ unsigned bits) {
+ unsigned i = 0;
+ while (bits > APINT_BITS_PER_WORD) {
+ dst[i++] = ~(WordType) 0;
+ bits -= APINT_BITS_PER_WORD;
}
if (bits)
- dst[i++] = ~(integerPart) 0 >> (integerPartWidth - bits);
+ dst[i++] = ~(WordType) 0 >> (APINT_BITS_PER_WORD - bits);
while (i < parts)
dst[i++] = 0;
diff --git a/lib/Support/ARMAttributeParser.cpp b/lib/Support/ARMAttributeParser.cpp
new file mode 100644
index 000000000000..63e800a5b78b
--- /dev/null
+++ b/lib/Support/ARMAttributeParser.cpp
@@ -0,0 +1,708 @@
+//===--- ARMAttributeParser.cpp - ARM Attribute Information Printer -------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/ADT/STLExtras.h"
+#include "llvm/ADT/StringExtras.h"
+#include "llvm/Support/ARMAttributeParser.h"
+#include "llvm/Support/LEB128.h"
+#include "llvm/Support/ScopedPrinter.h"
+
+using namespace llvm;
+using namespace llvm::ARMBuildAttrs;
+
+static const EnumEntry<unsigned> TagNames[] = {
+ { "Tag_File", ARMBuildAttrs::File },
+ { "Tag_Section", ARMBuildAttrs::Section },
+ { "Tag_Symbol", ARMBuildAttrs::Symbol },
+};
+
+namespace llvm {
+#define ATTRIBUTE_HANDLER(Attr_) \
+ { ARMBuildAttrs::Attr_, &ARMAttributeParser::Attr_ }
+
+const ARMAttributeParser::DisplayHandler
+ARMAttributeParser::DisplayRoutines[] = {
+ { ARMBuildAttrs::CPU_raw_name, &ARMAttributeParser::StringAttribute, },
+ { ARMBuildAttrs::CPU_name, &ARMAttributeParser::StringAttribute },
+ ATTRIBUTE_HANDLER(CPU_arch),
+ ATTRIBUTE_HANDLER(CPU_arch_profile),
+ ATTRIBUTE_HANDLER(ARM_ISA_use),
+ ATTRIBUTE_HANDLER(THUMB_ISA_use),
+ ATTRIBUTE_HANDLER(FP_arch),
+ ATTRIBUTE_HANDLER(WMMX_arch),
+ ATTRIBUTE_HANDLER(Advanced_SIMD_arch),
+ ATTRIBUTE_HANDLER(PCS_config),
+ ATTRIBUTE_HANDLER(ABI_PCS_R9_use),
+ ATTRIBUTE_HANDLER(ABI_PCS_RW_data),
+ ATTRIBUTE_HANDLER(ABI_PCS_RO_data),
+ ATTRIBUTE_HANDLER(ABI_PCS_GOT_use),
+ ATTRIBUTE_HANDLER(ABI_PCS_wchar_t),
+ ATTRIBUTE_HANDLER(ABI_FP_rounding),
+ ATTRIBUTE_HANDLER(ABI_FP_denormal),
+ ATTRIBUTE_HANDLER(ABI_FP_exceptions),
+ ATTRIBUTE_HANDLER(ABI_FP_user_exceptions),
+ ATTRIBUTE_HANDLER(ABI_FP_number_model),
+ ATTRIBUTE_HANDLER(ABI_align_needed),
+ ATTRIBUTE_HANDLER(ABI_align_preserved),
+ ATTRIBUTE_HANDLER(ABI_enum_size),
+ ATTRIBUTE_HANDLER(ABI_HardFP_use),
+ ATTRIBUTE_HANDLER(ABI_VFP_args),
+ ATTRIBUTE_HANDLER(ABI_WMMX_args),
+ ATTRIBUTE_HANDLER(ABI_optimization_goals),
+ ATTRIBUTE_HANDLER(ABI_FP_optimization_goals),
+ ATTRIBUTE_HANDLER(compatibility),
+ ATTRIBUTE_HANDLER(CPU_unaligned_access),
+ ATTRIBUTE_HANDLER(FP_HP_extension),
+ ATTRIBUTE_HANDLER(ABI_FP_16bit_format),
+ ATTRIBUTE_HANDLER(MPextension_use),
+ ATTRIBUTE_HANDLER(DIV_use),
+ ATTRIBUTE_HANDLER(DSP_extension),
+ ATTRIBUTE_HANDLER(T2EE_use),
+ ATTRIBUTE_HANDLER(Virtualization_use),
+ ATTRIBUTE_HANDLER(nodefaults)
+};
+
+#undef ATTRIBUTE_HANDLER
+
+uint64_t ARMAttributeParser::ParseInteger(const uint8_t *Data,
+ uint32_t &Offset) {
+ unsigned Length;
+ uint64_t Value = decodeULEB128(Data + Offset, &Length);
+ Offset = Offset + Length;
+ return Value;
+}
+
+StringRef ARMAttributeParser::ParseString(const uint8_t *Data,
+ uint32_t &Offset) {
+ const char *String = reinterpret_cast<const char*>(Data + Offset);
+ size_t Length = std::strlen(String);
+ Offset = Offset + Length + 1;
+ return StringRef(String, Length);
+}
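
// A worked example (not from the patch) of the ULEB128 framing that
// ParseInteger relies on: each byte carries 7 payload bits and the high bit
// marks continuation. 0xE5 0x8E 0x26 is the classic DWARF example, decoding
// to 0x65 | (0x0E << 7) | (0x26 << 14) == 624485.
#include "llvm/Support/LEB128.h"
#include <cassert>
#include <cstdint>

void ulebExample() {
  const uint8_t Buf[] = {0xE5, 0x8E, 0x26};
  unsigned Len;
  uint64_t Val = llvm::decodeULEB128(Buf, &Len);
  assert(Val == 624485 && Len == 3);
  (void)Val; (void)Len;
}
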
+
+void ARMAttributeParser::IntegerAttribute(AttrType Tag, const uint8_t *Data,
+ uint32_t &Offset) {
+ uint64_t Value = ParseInteger(Data, Offset);
+ Attributes.insert(std::make_pair(Tag, Value));
+
+ if (SW)
+ SW->printNumber(ARMBuildAttrs::AttrTypeAsString(Tag), Value);
+}
+
+void ARMAttributeParser::StringAttribute(AttrType Tag, const uint8_t *Data,
+ uint32_t &Offset) {
+ StringRef TagName = ARMBuildAttrs::AttrTypeAsString(Tag, /*TagPrefix*/false);
+ StringRef ValueDesc = ParseString(Data, Offset);
+
+ if (SW) {
+ DictScope AS(*SW, "Attribute");
+ SW->printNumber("Tag", Tag);
+ if (!TagName.empty())
+ SW->printString("TagName", TagName);
+ SW->printString("Value", ValueDesc);
+ }
+}
+
+void ARMAttributeParser::PrintAttribute(unsigned Tag, unsigned Value,
+ StringRef ValueDesc) {
+ Attributes.insert(std::make_pair(Tag, Value));
+
+ if (SW) {
+ StringRef TagName = ARMBuildAttrs::AttrTypeAsString(Tag,
+ /*TagPrefix*/false);
+ DictScope AS(*SW, "Attribute");
+ SW->printNumber("Tag", Tag);
+ SW->printNumber("Value", Value);
+ if (!TagName.empty())
+ SW->printString("TagName", TagName);
+ if (!ValueDesc.empty())
+ SW->printString("Description", ValueDesc);
+ }
+}
+
+void ARMAttributeParser::CPU_arch(AttrType Tag, const uint8_t *Data,
+ uint32_t &Offset) {
+ static const char *const Strings[] = {
+ "Pre-v4", "ARM v4", "ARM v4T", "ARM v5T", "ARM v5TE", "ARM v5TEJ", "ARM v6",
+ "ARM v6KZ", "ARM v6T2", "ARM v6K", "ARM v7", "ARM v6-M", "ARM v6S-M",
+ "ARM v7E-M", "ARM v8"
+ };
+
+ uint64_t Value = ParseInteger(Data, Offset);
+ StringRef ValueDesc =
+ (Value < array_lengthof(Strings)) ? Strings[Value] : nullptr;
+ PrintAttribute(Tag, Value, ValueDesc);
+}
+
+void ARMAttributeParser::CPU_arch_profile(AttrType Tag, const uint8_t *Data,
+ uint32_t &Offset) {
+ uint64_t Encoded = ParseInteger(Data, Offset);
+
+ StringRef Profile;
+ switch (Encoded) {
+ default: Profile = "Unknown"; break;
+ case 'A': Profile = "Application"; break;
+ case 'R': Profile = "Real-time"; break;
+ case 'M': Profile = "Microcontroller"; break;
+ case 'S': Profile = "Classic"; break;
+ case 0: Profile = "None"; break;
+ }
+
+ PrintAttribute(Tag, Encoded, Profile);
+}
+
+void ARMAttributeParser::ARM_ISA_use(AttrType Tag, const uint8_t *Data,
+ uint32_t &Offset) {
+ static const char *const Strings[] = { "Not Permitted", "Permitted" };
+
+ uint64_t Value = ParseInteger(Data, Offset);
+ StringRef ValueDesc =
+ (Value < array_lengthof(Strings)) ? Strings[Value] : nullptr;
+ PrintAttribute(Tag, Value, ValueDesc);
+}
+
+void ARMAttributeParser::THUMB_ISA_use(AttrType Tag, const uint8_t *Data,
+ uint32_t &Offset) {
+ static const char *const Strings[] = { "Not Permitted", "Thumb-1", "Thumb-2" };
+
+ uint64_t Value = ParseInteger(Data, Offset);
+ StringRef ValueDesc =
+ (Value < array_lengthof(Strings)) ? Strings[Value] : nullptr;
+ PrintAttribute(Tag, Value, ValueDesc);
+}
+
+void ARMAttributeParser::FP_arch(AttrType Tag, const uint8_t *Data,
+ uint32_t &Offset) {
+ static const char *const Strings[] = {
+ "Not Permitted", "VFPv1", "VFPv2", "VFPv3", "VFPv3-D16", "VFPv4",
+ "VFPv4-D16", "ARMv8-a FP", "ARMv8-a FP-D16"
+ };
+
+ uint64_t Value = ParseInteger(Data, Offset);
+ StringRef ValueDesc =
+ (Value < array_lengthof(Strings)) ? Strings[Value] : nullptr;
+ PrintAttribute(Tag, Value, ValueDesc);
+}
+
+void ARMAttributeParser::WMMX_arch(AttrType Tag, const uint8_t *Data,
+ uint32_t &Offset) {
+ static const char *const Strings[] = { "Not Permitted", "WMMXv1", "WMMXv2" };
+
+ uint64_t Value = ParseInteger(Data, Offset);
+ StringRef ValueDesc =
+ (Value < array_lengthof(Strings)) ? Strings[Value] : nullptr;
+ PrintAttribute(Tag, Value, ValueDesc);
+}
+
+void ARMAttributeParser::Advanced_SIMD_arch(AttrType Tag, const uint8_t *Data,
+ uint32_t &Offset) {
+ static const char *const Strings[] = {
+ "Not Permitted", "NEONv1", "NEONv2+FMA", "ARMv8-a NEON", "ARMv8.1-a NEON"
+ };
+
+ uint64_t Value = ParseInteger(Data, Offset);
+ StringRef ValueDesc =
+ (Value < array_lengthof(Strings)) ? Strings[Value] : nullptr;
+ PrintAttribute(Tag, Value, ValueDesc);
+}
+
+void ARMAttributeParser::PCS_config(AttrType Tag, const uint8_t *Data,
+ uint32_t &Offset) {
+ static const char *const Strings[] = {
+ "None", "Bare Platform", "Linux Application", "Linux DSO", "Palm OS 2004",
+ "Reserved (Palm OS)", "Symbian OS 2004", "Reserved (Symbian OS)"
+ };
+
+ uint64_t Value = ParseInteger(Data, Offset);
+ StringRef ValueDesc =
+ (Value < array_lengthof(Strings)) ? Strings[Value] : nullptr;
+ PrintAttribute(Tag, Value, ValueDesc);
+}
+
+void ARMAttributeParser::ABI_PCS_R9_use(AttrType Tag, const uint8_t *Data,
+ uint32_t &Offset) {
+ static const char *const Strings[] = { "v6", "Static Base", "TLS", "Unused" };
+
+ uint64_t Value = ParseInteger(Data, Offset);
+ StringRef ValueDesc =
+ (Value < array_lengthof(Strings)) ? Strings[Value] : nullptr;
+ PrintAttribute(Tag, Value, ValueDesc);
+}
+
+void ARMAttributeParser::ABI_PCS_RW_data(AttrType Tag, const uint8_t *Data,
+ uint32_t &Offset) {
+ static const char *const Strings[] = {
+ "Absolute", "PC-relative", "SB-relative", "Not Permitted"
+ };
+
+ uint64_t Value = ParseInteger(Data, Offset);
+ StringRef ValueDesc =
+ (Value < array_lengthof(Strings)) ? Strings[Value] : nullptr;
+ PrintAttribute(Tag, Value, ValueDesc);
+}
+
+void ARMAttributeParser::ABI_PCS_RO_data(AttrType Tag, const uint8_t *Data,
+ uint32_t &Offset) {
+ static const char *const Strings[] = {
+ "Absolute", "PC-relative", "Not Permitted"
+ };
+
+ uint64_t Value = ParseInteger(Data, Offset);
+ StringRef ValueDesc =
+ (Value < array_lengthof(Strings)) ? Strings[Value] : nullptr;
+ PrintAttribute(Tag, Value, ValueDesc);
+}
+
+void ARMAttributeParser::ABI_PCS_GOT_use(AttrType Tag, const uint8_t *Data,
+ uint32_t &Offset) {
+ static const char *const Strings[] = {
+ "Not Permitted", "Direct", "GOT-Indirect"
+ };
+
+ uint64_t Value = ParseInteger(Data, Offset);
+ StringRef ValueDesc =
+ (Value < array_lengthof(Strings)) ? Strings[Value] : nullptr;
+ PrintAttribute(Tag, Value, ValueDesc);
+}
+
+void ARMAttributeParser::ABI_PCS_wchar_t(AttrType Tag, const uint8_t *Data,
+ uint32_t &Offset) {
+ static const char *const Strings[] = {
+ "Not Permitted", "Unknown", "2-byte", "Unknown", "4-byte"
+ };
+
+ uint64_t Value = ParseInteger(Data, Offset);
+ StringRef ValueDesc =
+ (Value < array_lengthof(Strings)) ? Strings[Value] : nullptr;
+ PrintAttribute(Tag, Value, ValueDesc);
+}
+
+void ARMAttributeParser::ABI_FP_rounding(AttrType Tag, const uint8_t *Data,
+ uint32_t &Offset) {
+ static const char *const Strings[] = { "IEEE-754", "Runtime" };
+
+ uint64_t Value = ParseInteger(Data, Offset);
+ StringRef ValueDesc =
+ (Value < array_lengthof(Strings)) ? Strings[Value] : nullptr;
+ PrintAttribute(Tag, Value, ValueDesc);
+}
+
+void ARMAttributeParser::ABI_FP_denormal(AttrType Tag, const uint8_t *Data,
+ uint32_t &Offset) {
+ static const char *const Strings[] = {
+ "Unsupported", "IEEE-754", "Sign Only"
+ };
+
+ uint64_t Value = ParseInteger(Data, Offset);
+ StringRef ValueDesc =
+ (Value < array_lengthof(Strings)) ? Strings[Value] : nullptr;
+ PrintAttribute(Tag, Value, ValueDesc);
+}
+
+void ARMAttributeParser::ABI_FP_exceptions(AttrType Tag, const uint8_t *Data,
+ uint32_t &Offset) {
+ static const char *const Strings[] = { "Not Permitted", "IEEE-754" };
+
+ uint64_t Value = ParseInteger(Data, Offset);
+ StringRef ValueDesc =
+ (Value < array_lengthof(Strings)) ? Strings[Value] : nullptr;
+ PrintAttribute(Tag, Value, ValueDesc);
+}
+
+void ARMAttributeParser::ABI_FP_user_exceptions(AttrType Tag,
+ const uint8_t *Data,
+ uint32_t &Offset) {
+ static const char *const Strings[] = { "Not Permitted", "IEEE-754" };
+
+ uint64_t Value = ParseInteger(Data, Offset);
+ StringRef ValueDesc =
+ (Value < array_lengthof(Strings)) ? Strings[Value] : nullptr;
+ PrintAttribute(Tag, Value, ValueDesc);
+}
+
+void ARMAttributeParser::ABI_FP_number_model(AttrType Tag, const uint8_t *Data,
+ uint32_t &Offset) {
+ static const char *const Strings[] = {
+ "Not Permitted", "Finite Only", "RTABI", "IEEE-754"
+ };
+
+ uint64_t Value = ParseInteger(Data, Offset);
+ StringRef ValueDesc =
+ (Value < array_lengthof(Strings)) ? Strings[Value] : nullptr;
+ PrintAttribute(Tag, Value, ValueDesc);
+}
+
+void ARMAttributeParser::ABI_align_needed(AttrType Tag, const uint8_t *Data,
+ uint32_t &Offset) {
+ static const char *const Strings[] = {
+ "Not Permitted", "8-byte alignment", "4-byte alignment", "Reserved"
+ };
+
+ uint64_t Value = ParseInteger(Data, Offset);
+
+ std::string Description;
+ if (Value < array_lengthof(Strings))
+ Description = std::string(Strings[Value]);
+ else if (Value <= 12)
+ Description = std::string("8-byte alignment, ") + utostr(1ULL << Value)
+ + std::string("-byte extended alignment");
+ else
+ Description = "Invalid";
+
+ PrintAttribute(Tag, Value, Description);
+}
+
+void ARMAttributeParser::ABI_align_preserved(AttrType Tag, const uint8_t *Data,
+ uint32_t &Offset) {
+ static const char *const Strings[] = {
+ "Not Required", "8-byte data alignment", "8-byte data and code alignment",
+ "Reserved"
+ };
+
+ uint64_t Value = ParseInteger(Data, Offset);
+
+ std::string Description;
+ if (Value < array_lengthof(Strings))
+ Description = std::string(Strings[Value]);
+ else if (Value <= 12)
+ Description = std::string("8-byte stack alignment, ") +
+ utostr(1ULL << Value) + std::string("-byte data alignment");
+ else
+ Description = "Invalid";
+
+ PrintAttribute(Tag, Value, Description);
+}
+
+void ARMAttributeParser::ABI_enum_size(AttrType Tag, const uint8_t *Data,
+ uint32_t &Offset) {
+ static const char *const Strings[] = {
+ "Not Permitted", "Packed", "Int32", "External Int32"
+ };
+
+ uint64_t Value = ParseInteger(Data, Offset);
+ StringRef ValueDesc =
+ (Value < array_lengthof(Strings)) ? Strings[Value] : nullptr;
+ PrintAttribute(Tag, Value, ValueDesc);
+}
+
+void ARMAttributeParser::ABI_HardFP_use(AttrType Tag, const uint8_t *Data,
+ uint32_t &Offset) {
+ static const char *const Strings[] = {
+ "Tag_FP_arch", "Single-Precision", "Reserved", "Tag_FP_arch (deprecated)"
+ };
+
+ uint64_t Value = ParseInteger(Data, Offset);
+ StringRef ValueDesc =
+ (Value < array_lengthof(Strings)) ? Strings[Value] : nullptr;
+ PrintAttribute(Tag, Value, ValueDesc);
+}
+
+void ARMAttributeParser::ABI_VFP_args(AttrType Tag, const uint8_t *Data,
+ uint32_t &Offset) {
+ static const char *const Strings[] = {
+ "AAPCS", "AAPCS VFP", "Custom", "Not Permitted"
+ };
+
+ uint64_t Value = ParseInteger(Data, Offset);
+ StringRef ValueDesc =
+ (Value < array_lengthof(Strings)) ? Strings[Value] : nullptr;
+ PrintAttribute(Tag, Value, ValueDesc);
+}
+
+void ARMAttributeParser::ABI_WMMX_args(AttrType Tag, const uint8_t *Data,
+ uint32_t &Offset) {
+ static const char *const Strings[] = { "AAPCS", "iWMMX", "Custom" };
+
+ uint64_t Value = ParseInteger(Data, Offset);
+ StringRef ValueDesc =
+ (Value < array_lengthof(Strings)) ? Strings[Value] : nullptr;
+ PrintAttribute(Tag, Value, ValueDesc);
+}
+
+void ARMAttributeParser::ABI_optimization_goals(AttrType Tag,
+ const uint8_t *Data,
+ uint32_t &Offset) {
+ static const char *const Strings[] = {
+ "None", "Speed", "Aggressive Speed", "Size", "Aggressive Size", "Debugging",
+ "Best Debugging"
+ };
+
+ uint64_t Value = ParseInteger(Data, Offset);
+ StringRef ValueDesc =
+ (Value < array_lengthof(Strings)) ? Strings[Value] : nullptr;
+ PrintAttribute(Tag, Value, ValueDesc);
+}
+
+void ARMAttributeParser::ABI_FP_optimization_goals(AttrType Tag,
+ const uint8_t *Data,
+ uint32_t &Offset) {
+ static const char *const Strings[] = {
+ "None", "Speed", "Aggressive Speed", "Size", "Aggressive Size", "Accuracy",
+ "Best Accuracy"
+ };
+
+ uint64_t Value = ParseInteger(Data, Offset);
+ StringRef ValueDesc =
+ (Value < array_lengthof(Strings)) ? Strings[Value] : nullptr;
+ PrintAttribute(Tag, Value, ValueDesc);
+}
+
+void ARMAttributeParser::compatibility(AttrType Tag, const uint8_t *Data,
+ uint32_t &Offset) {
+ uint64_t Integer = ParseInteger(Data, Offset);
+ StringRef String = ParseString(Data, Offset);
+
+ if (SW) {
+ DictScope AS(*SW, "Attribute");
+ SW->printNumber("Tag", Tag);
+ SW->startLine() << "Value: " << Integer << ", " << String << '\n';
+ SW->printString("TagName", AttrTypeAsString(Tag, /*TagPrefix*/false));
+ switch (Integer) {
+ case 0:
+ SW->printString("Description", StringRef("No Specific Requirements"));
+ break;
+ case 1:
+ SW->printString("Description", StringRef("AEABI Conformant"));
+ break;
+ default:
+ SW->printString("Description", StringRef("AEABI Non-Conformant"));
+ break;
+ }
+ }
+}
+
+void ARMAttributeParser::CPU_unaligned_access(AttrType Tag, const uint8_t *Data,
+ uint32_t &Offset) {
+ static const char *const Strings[] = { "Not Permitted", "v6-style" };
+
+ uint64_t Value = ParseInteger(Data, Offset);
+ StringRef ValueDesc =
+ (Value < array_lengthof(Strings)) ? Strings[Value] : nullptr;
+ PrintAttribute(Tag, Value, ValueDesc);
+}
+
+void ARMAttributeParser::FP_HP_extension(AttrType Tag, const uint8_t *Data,
+ uint32_t &Offset) {
+ static const char *const Strings[] = { "If Available", "Permitted" };
+
+ uint64_t Value = ParseInteger(Data, Offset);
+ StringRef ValueDesc =
+ (Value < array_lengthof(Strings)) ? Strings[Value] : nullptr;
+ PrintAttribute(Tag, Value, ValueDesc);
+}
+
+void ARMAttributeParser::ABI_FP_16bit_format(AttrType Tag, const uint8_t *Data,
+ uint32_t &Offset) {
+ static const char *const Strings[] = { "Not Permitted", "IEEE-754", "VFPv3" };
+
+ uint64_t Value = ParseInteger(Data, Offset);
+ StringRef ValueDesc =
+ (Value < array_lengthof(Strings)) ? Strings[Value] : nullptr;
+ PrintAttribute(Tag, Value, ValueDesc);
+}
+
+void ARMAttributeParser::MPextension_use(AttrType Tag, const uint8_t *Data,
+ uint32_t &Offset) {
+ static const char *const Strings[] = { "Not Permitted", "Permitted" };
+
+ uint64_t Value = ParseInteger(Data, Offset);
+ StringRef ValueDesc =
+ (Value < array_lengthof(Strings)) ? Strings[Value] : nullptr;
+ PrintAttribute(Tag, Value, ValueDesc);
+}
+
+void ARMAttributeParser::DIV_use(AttrType Tag, const uint8_t *Data,
+ uint32_t &Offset) {
+ static const char *const Strings[] = {
+ "If Available", "Not Permitted", "Permitted"
+ };
+
+ uint64_t Value = ParseInteger(Data, Offset);
+ StringRef ValueDesc =
+ (Value < array_lengthof(Strings)) ? Strings[Value] : nullptr;
+ PrintAttribute(Tag, Value, ValueDesc);
+}
+
+void ARMAttributeParser::DSP_extension(AttrType Tag, const uint8_t *Data,
+ uint32_t &Offset) {
+ static const char *const Strings[] = { "Not Permitted", "Permitted" };
+
+ uint64_t Value = ParseInteger(Data, Offset);
+ StringRef ValueDesc =
+ (Value < array_lengthof(Strings)) ? Strings[Value] : nullptr;
+ PrintAttribute(Tag, Value, ValueDesc);
+}
+
+void ARMAttributeParser::T2EE_use(AttrType Tag, const uint8_t *Data,
+ uint32_t &Offset) {
+ static const char *const Strings[] = { "Not Permitted", "Permitted" };
+
+ uint64_t Value = ParseInteger(Data, Offset);
+ StringRef ValueDesc =
+ (Value < array_lengthof(Strings)) ? Strings[Value] : nullptr;
+ PrintAttribute(Tag, Value, ValueDesc);
+}
+
+void ARMAttributeParser::Virtualization_use(AttrType Tag, const uint8_t *Data,
+ uint32_t &Offset) {
+ static const char *const Strings[] = {
+ "Not Permitted", "TrustZone", "Virtualization Extensions",
+ "TrustZone + Virtualization Extensions"
+ };
+
+ uint64_t Value = ParseInteger(Data, Offset);
+ StringRef ValueDesc =
+ (Value < array_lengthof(Strings)) ? Strings[Value] : nullptr;
+ PrintAttribute(Tag, Value, ValueDesc);
+}
+
+void ARMAttributeParser::nodefaults(AttrType Tag, const uint8_t *Data,
+ uint32_t &Offset) {
+ uint64_t Value = ParseInteger(Data, Offset);
+ PrintAttribute(Tag, Value, "Unspecified Tags UNDEFINED");
+}
+
+void ARMAttributeParser::ParseIndexList(const uint8_t *Data, uint32_t &Offset,
+ SmallVectorImpl<uint8_t> &IndexList) {
+ for (;;) {
+ unsigned Length;
+ uint64_t Value = decodeULEB128(Data + Offset, &Length);
+ Offset = Offset + Length;
+ if (Value == 0)
+ break;
+ IndexList.push_back(Value);
+ }
+}
+
+void ARMAttributeParser::ParseAttributeList(const uint8_t *Data,
+ uint32_t &Offset, uint32_t Length) {
+ while (Offset < Length) {
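+    // This inner Length is the ULEB128 byte count; it shadows the parameter.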
+ unsigned Length;
+ uint64_t Tag = decodeULEB128(Data + Offset, &Length);
+ Offset += Length;
+
+ bool Handled = false;
+ for (unsigned AHI = 0, AHE = array_lengthof(DisplayRoutines);
+ AHI != AHE && !Handled; ++AHI) {
+ if (DisplayRoutines[AHI].Attribute == Tag) {
+ (this->*DisplayRoutines[AHI].Routine)(ARMBuildAttrs::AttrType(Tag),
+ Data, Offset);
+ Handled = true;
+ break;
+ }
+ }
+ if (!Handled) {
+ if (Tag < 32) {
+ errs() << "unhandled AEABI Tag " << Tag
+ << " (" << ARMBuildAttrs::AttrTypeAsString(Tag) << ")\n";
+ continue;
+ }
+
+ if (Tag % 2 == 0)
+ IntegerAttribute(ARMBuildAttrs::AttrType(Tag), Data, Offset);
+ else
+ StringAttribute(ARMBuildAttrs::AttrType(Tag), Data, Offset);
+ }
+ }
+}
+
+void ARMAttributeParser::ParseSubsection(const uint8_t *Data, uint32_t Length) {
+ uint32_t Offset = sizeof(uint32_t); /* SectionLength */
+
+ const char *VendorName = reinterpret_cast<const char*>(Data + Offset);
+ size_t VendorNameLength = std::strlen(VendorName);
+ Offset = Offset + VendorNameLength + 1;
+
+ if (SW) {
+ SW->printNumber("SectionLength", Length);
+ SW->printString("Vendor", StringRef(VendorName, VendorNameLength));
+ }
+
+ if (StringRef(VendorName, VendorNameLength).lower() != "aeabi") {
+ return;
+ }
+
+ while (Offset < Length) {
+ /// Tag_File | Tag_Section | Tag_Symbol uleb128:byte-size
+ uint8_t Tag = Data[Offset];
+ Offset = Offset + sizeof(Tag);
+
+ uint32_t Size =
+ *reinterpret_cast<const support::ulittle32_t*>(Data + Offset);
+ Offset = Offset + sizeof(Size);
+
+ if (SW) {
+ SW->printEnum("Tag", Tag, makeArrayRef(TagNames));
+ SW->printNumber("Size", Size);
+ }
+
+ if (Size > Length) {
+ errs() << "subsection length greater than section length\n";
+ return;
+ }
+
+ StringRef ScopeName, IndexName;
+    SmallVector<uint8_t, 8> Indices;
+ switch (Tag) {
+ case ARMBuildAttrs::File:
+ ScopeName = "FileAttributes";
+ break;
+ case ARMBuildAttrs::Section:
+ ScopeName = "SectionAttributes";
+ IndexName = "Sections";
+      ParseIndexList(Data, Offset, Indices);
+ break;
+ case ARMBuildAttrs::Symbol:
+ ScopeName = "SymbolAttributes";
+ IndexName = "Symbols";
+      ParseIndexList(Data, Offset, Indices);
+ break;
+ default:
+ errs() << "unrecognised tag: 0x" << utohexstr(Tag) << '\n';
+ return;
+ }
+
+ if (SW) {
+ DictScope ASS(*SW, ScopeName);
+      if (!Indices.empty())
+        SW->printList(IndexName, Indices);
+ ParseAttributeList(Data, Offset, Length);
+ } else {
+ ParseAttributeList(Data, Offset, Length);
+ }
+ }
+}
+
+void ARMAttributeParser::Parse(ArrayRef<uint8_t> Section, bool isLittle) {
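+  // Skip the one-byte format version ('A') at the start of the section.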
+ size_t Offset = 1;
+ unsigned SectionNumber = 0;
+
+ while (Offset < Section.size()) {
+ uint32_t SectionLength = isLittle ?
+ support::endian::read32le(Section.data() + Offset) :
+ support::endian::read32be(Section.data() + Offset);
+
+ if (SW) {
+ SW->startLine() << "Section " << ++SectionNumber << " {\n";
+ SW->indent();
+ }
+
+ ParseSubsection(Section.data() + Offset, SectionLength);
+ Offset = Offset + SectionLength;
+
+ if (SW) {
+ SW->unindent();
+ SW->startLine() << "}\n";
+ }
+ }
+}
+}
+
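// A minimal, hand-assembled .ARM.attributes blob for Parse (not from the
// patch), assuming the default (null-ScopedPrinter) constructor from the
// accompanying header. Layout after the 'A' version byte: one subsection of
// <u32 length> "aeabi\0" <tag> <u32 size> <attribute bytes...>.
#include "llvm/Support/ARMAttributeParser.h"

void parseExample() {
  static const uint8_t Section[] = {
      'A',                        // format-version byte, skipped by Parse
      0x11, 0x00, 0x00, 0x00,     // subsection length = 17 (excludes 'A')
      'a', 'e', 'a', 'b', 'i', 0, // vendor name
      0x01,                       // Tag_File
      0x07, 0x00, 0x00, 0x00,     // file-attribute block size
      0x06, 0x0A,                 // Tag_CPU_arch = 10 -> "ARM v7"
  };
  llvm::ARMAttributeParser Parser;
  Parser.Parse(Section, /*isLittle=*/true);
  // The CPU_arch handler above records tag 6 with value 10.
}
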
diff --git a/lib/Support/BinaryStreamError.cpp b/lib/Support/BinaryStreamError.cpp
new file mode 100644
index 000000000000..60f5e21f041a
--- /dev/null
+++ b/lib/Support/BinaryStreamError.cpp
@@ -0,0 +1,56 @@
+//===- BinaryStreamError.cpp - Error extensions for streams -----*- C++ -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/Support/BinaryStreamError.h"
+#include "llvm/Support/ErrorHandling.h"
+
+using namespace llvm;
+
+char BinaryStreamError::ID = 0;
+
+BinaryStreamError::BinaryStreamError(stream_error_code C)
+ : BinaryStreamError(C, "") {}
+
+BinaryStreamError::BinaryStreamError(StringRef Context)
+ : BinaryStreamError(stream_error_code::unspecified, Context) {}
+
+BinaryStreamError::BinaryStreamError(stream_error_code C, StringRef Context)
+ : Code(C) {
+ ErrMsg = "Stream Error: ";
+ switch (C) {
+ case stream_error_code::unspecified:
+ ErrMsg += "An unspecified error has occurred.";
+ break;
+ case stream_error_code::stream_too_short:
+ ErrMsg += "The stream is too short to perform the requested operation.";
+ break;
+ case stream_error_code::invalid_array_size:
+ ErrMsg += "The buffer size is not a multiple of the array element size.";
+ break;
+ case stream_error_code::invalid_offset:
+ ErrMsg += "The specified offset is invalid for the current stream.";
+ break;
+ case stream_error_code::filesystem_error:
+ ErrMsg += "An I/O error occurred on the file system.";
+ break;
+ }
+
+ if (!Context.empty()) {
+ ErrMsg += " ";
+ ErrMsg += Context;
+ }
+}
+
+void BinaryStreamError::log(raw_ostream &OS) const { OS << ErrMsg << "\n"; }
+
+StringRef BinaryStreamError::getErrorMessage() const { return ErrMsg; }
+
+std::error_code BinaryStreamError::convertToErrorCode() const {
+ return inconvertibleErrorCode();
+}
diff --git a/lib/Support/BinaryStreamReader.cpp b/lib/Support/BinaryStreamReader.cpp
new file mode 100644
index 000000000000..c7a2e0ddb179
--- /dev/null
+++ b/lib/Support/BinaryStreamReader.cpp
@@ -0,0 +1,95 @@
+//===- BinaryStreamReader.cpp - Reads objects from a binary stream --------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/Support/BinaryStreamReader.h"
+
+#include "llvm/Support/BinaryStreamError.h"
+#include "llvm/Support/BinaryStreamRef.h"
+
+using namespace llvm;
+
+BinaryStreamReader::BinaryStreamReader(BinaryStreamRef S)
+ : Stream(S), Offset(0) {}
+
+Error BinaryStreamReader::readLongestContiguousChunk(
+ ArrayRef<uint8_t> &Buffer) {
+ if (auto EC = Stream.readLongestContiguousChunk(Offset, Buffer))
+ return EC;
+ Offset += Buffer.size();
+ return Error::success();
+}
+
+Error BinaryStreamReader::readBytes(ArrayRef<uint8_t> &Buffer, uint32_t Size) {
+ if (auto EC = Stream.readBytes(Offset, Size, Buffer))
+ return EC;
+ Offset += Size;
+ return Error::success();
+}
+
+Error BinaryStreamReader::readCString(StringRef &Dest) {
+ // TODO: This could be made more efficient by using readLongestContiguousChunk
+ // and searching for null terminators in the resulting buffer.
+
+ uint32_t Length = 0;
+ // First compute the length of the string by reading 1 byte at a time.
+ uint32_t OriginalOffset = getOffset();
+ const char *C;
+ while (true) {
+ if (auto EC = readObject(C))
+ return EC;
+ if (*C == '\0')
+ break;
+ ++Length;
+ }
+ // Now go back and request a reference for that many bytes.
+ uint32_t NewOffset = getOffset();
+ setOffset(OriginalOffset);
+
+ if (auto EC = readFixedString(Dest, Length))
+ return EC;
+
+ // Now set the offset back to where it was after we calculated the length.
+ setOffset(NewOffset);
+ return Error::success();
+}
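
// A minimal usage sketch for the two-pass readCString above. It assumes
// BinaryByteStream from the companion BinaryByteStream.h (a BinaryStream
// over one contiguous in-memory buffer), which is not part of this file.
#include "llvm/Support/BinaryByteStream.h"
#include "llvm/Support/BinaryStreamReader.h"

llvm::Error readTwoStrings(llvm::ArrayRef<uint8_t> Data) {
  llvm::BinaryByteStream Stream(Data, llvm::support::little);
  llvm::BinaryStreamReader Reader(Stream);
  llvm::StringRef First, Second;
  if (auto EC = Reader.readCString(First))
    return EC;
  // The reader is now positioned one byte past the first NUL terminator.
  return Reader.readCString(Second);
}
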
+
+Error BinaryStreamReader::readFixedString(StringRef &Dest, uint32_t Length) {
+ ArrayRef<uint8_t> Bytes;
+ if (auto EC = readBytes(Bytes, Length))
+ return EC;
+ Dest = StringRef(reinterpret_cast<const char *>(Bytes.begin()), Bytes.size());
+ return Error::success();
+}
+
+Error BinaryStreamReader::readStreamRef(BinaryStreamRef &Ref) {
+ return readStreamRef(Ref, bytesRemaining());
+}
+
+Error BinaryStreamReader::readStreamRef(BinaryStreamRef &Ref, uint32_t Length) {
+ if (bytesRemaining() < Length)
+ return make_error<BinaryStreamError>(stream_error_code::stream_too_short);
+ Ref = Stream.slice(Offset, Length);
+ Offset += Length;
+ return Error::success();
+}
+
+Error BinaryStreamReader::skip(uint32_t Amount) {
+ if (Amount > bytesRemaining())
+ return make_error<BinaryStreamError>(stream_error_code::stream_too_short);
+ Offset += Amount;
+ return Error::success();
+}
+
+uint8_t BinaryStreamReader::peek() const {
+ ArrayRef<uint8_t> Buffer;
+ auto EC = Stream.readBytes(Offset, 1, Buffer);
+ assert(!EC && "Cannot peek an empty buffer!");
+ llvm::consumeError(std::move(EC));
+ return Buffer[0];
+}
diff --git a/lib/Support/BinaryStreamWriter.cpp b/lib/Support/BinaryStreamWriter.cpp
new file mode 100644
index 000000000000..d60b75642d0f
--- /dev/null
+++ b/lib/Support/BinaryStreamWriter.cpp
@@ -0,0 +1,68 @@
+//===- BinaryStreamWriter.cpp - Writes objects to a BinaryStream ----------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/Support/BinaryStreamWriter.h"
+
+#include "llvm/Support/BinaryStreamError.h"
+#include "llvm/Support/BinaryStreamReader.h"
+#include "llvm/Support/BinaryStreamRef.h"
+
+using namespace llvm;
+
+BinaryStreamWriter::BinaryStreamWriter(WritableBinaryStreamRef S)
+ : Stream(S), Offset(0) {}
+
+Error BinaryStreamWriter::writeBytes(ArrayRef<uint8_t> Buffer) {
+ if (auto EC = Stream.writeBytes(Offset, Buffer))
+ return EC;
+ Offset += Buffer.size();
+ return Error::success();
+}
+
+Error BinaryStreamWriter::writeCString(StringRef Str) {
+ if (auto EC = writeFixedString(Str))
+ return EC;
+ if (auto EC = writeObject('\0'))
+ return EC;
+
+ return Error::success();
+}
+
+Error BinaryStreamWriter::writeFixedString(StringRef Str) {
+ return writeBytes(ArrayRef<uint8_t>(Str.bytes_begin(), Str.bytes_end()));
+}
+
+Error BinaryStreamWriter::writeStreamRef(BinaryStreamRef Ref) {
+ return writeStreamRef(Ref, Ref.getLength());
+}
+
+Error BinaryStreamWriter::writeStreamRef(BinaryStreamRef Ref, uint32_t Length) {
+ BinaryStreamReader SrcReader(Ref.slice(0, Length));
+ // This is a bit tricky. If we just call readBytes, we are requiring that it
+ // return us the entire stream as a contiguous buffer. There is no guarantee
+ // this can be satisfied by returning a reference straight from the buffer, as
+ // an implementation may not store all data in a single contiguous buffer. So
+ // we iterate over each contiguous chunk, writing each one in succession.
+ while (SrcReader.bytesRemaining() > 0) {
+ ArrayRef<uint8_t> Chunk;
+ if (auto EC = SrcReader.readLongestContiguousChunk(Chunk))
+ return EC;
+ if (auto EC = writeBytes(Chunk))
+ return EC;
+ }
+ return Error::success();
+}
+
+Error BinaryStreamWriter::padToAlignment(uint32_t Align) {
+ uint32_t NewOffset = alignTo(Offset, Align);
+ if (NewOffset > getLength())
+ return make_error<BinaryStreamError>(stream_error_code::stream_too_short);
+ Offset = NewOffset;
+ return Error::success();
+}
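
// A usage sketch tying the writer together (not from the patch):
// MutableBinaryByteStream is assumed from the companion BinaryByteStream.h,
// and writeInteger from the writer's own header.
#include "llvm/Support/BinaryByteStream.h"
#include "llvm/Support/BinaryStreamWriter.h"

llvm::Error writeRecord(llvm::MutableArrayRef<uint8_t> Buffer) {
  llvm::MutableBinaryByteStream Stream(Buffer, llvm::support::little);
  llvm::BinaryStreamWriter Writer(Stream);
  if (auto EC = Writer.writeCString("hdr")) // 4 bytes including the NUL
    return EC;
  if (auto EC = Writer.padToAlignment(4))   // a no-op here, already aligned
    return EC;
  return Writer.writeInteger<uint32_t>(42);
}
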
diff --git a/lib/Support/BranchProbability.cpp b/lib/Support/BranchProbability.cpp
index 1c41659cf8df..44ad110d456a 100644
--- a/lib/Support/BranchProbability.cpp
+++ b/lib/Support/BranchProbability.cpp
@@ -32,7 +32,9 @@ raw_ostream &BranchProbability::print(raw_ostream &OS) const {
Percent);
}
+#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
LLVM_DUMP_METHOD void BranchProbability::dump() const { print(dbgs()) << '\n'; }
+#endif
BranchProbability::BranchProbability(uint32_t Numerator, uint32_t Denominator) {
assert(Denominator > 0 && "Denominator cannot be 0!");
diff --git a/lib/Support/CMakeLists.txt b/lib/Support/CMakeLists.txt
index 15418ad2fd06..491614b4bf63 100644
--- a/lib/Support/CMakeLists.txt
+++ b/lib/Support/CMakeLists.txt
@@ -9,6 +9,9 @@ elseif( CMAKE_HOST_UNIX )
if( HAVE_LIBDL )
set(system_libs ${system_libs} ${CMAKE_DL_LIBS})
endif()
+ if( HAVE_BACKTRACE )
+ set(system_libs ${system_libs} ${Backtrace_LIBRARIES})
+ endif()
if(LLVM_ENABLE_TERMINFO)
if(HAVE_TERMINFO)
set(system_libs ${system_libs} ${TERMINFO_LIBS})
@@ -17,7 +20,7 @@ elseif( CMAKE_HOST_UNIX )
if( LLVM_ENABLE_THREADS AND HAVE_LIBATOMIC )
set(system_libs ${system_libs} atomic)
endif()
- set(system_libs ${system_libs} ${PTHREAD_LIB})
+ set(system_libs ${system_libs} ${LLVM_PTHREAD_LIB})
if ( LLVM_ENABLE_ZLIB AND HAVE_LIBZ )
set(system_libs ${system_libs} z)
endif()
@@ -31,8 +34,12 @@ add_llvm_library(LLVMSupport
APInt.cpp
APSInt.cpp
ARMBuildAttrs.cpp
+ ARMAttributeParser.cpp
ARMWinEH.cpp
Allocator.cpp
+ BinaryStreamError.cpp
+ BinaryStreamReader.cpp
+ BinaryStreamWriter.cpp
BlockFrequency.cpp
BranchProbability.cpp
CachePruning.cpp
@@ -46,6 +53,7 @@ add_llvm_library(LLVMSupport
CrashRecoveryContext.cpp
DataExtractor.cpp
Debug.cpp
+ DebugCounter.cpp
DeltaAlgorithm.cpp
DAGDeltaAlgorithm.cpp
Dwarf.cpp
@@ -66,6 +74,7 @@ add_llvm_library(LLVMSupport
LineIterator.cpp
Locale.cpp
LockFileManager.cpp
+ LowLevelType.cpp
ManagedStatic.cpp
MathExtras.cpp
MemoryBuffer.cpp
@@ -134,7 +143,7 @@ add_llvm_library(LLVMSupport
Windows
${LLVM_MAIN_INCLUDE_DIR}/llvm/ADT
${LLVM_MAIN_INCLUDE_DIR}/llvm/Support
-
+ ${Backtrace_INCLUDE_DIRS}
LINK_LIBS ${system_libs}
)
diff --git a/lib/Support/CachePruning.cpp b/lib/Support/CachePruning.cpp
index 3831625962ca..aca123639565 100644
--- a/lib/Support/CachePruning.cpp
+++ b/lib/Support/CachePruning.cpp
@@ -15,6 +15,7 @@
#include "llvm/Support/Debug.h"
#include "llvm/Support/Errc.h"
+#include "llvm/Support/Error.h"
#include "llvm/Support/FileSystem.h"
#include "llvm/Support/Path.h"
#include "llvm/Support/raw_ostream.h"
@@ -33,8 +34,75 @@ static void writeTimestampFile(StringRef TimestampFile) {
raw_fd_ostream Out(TimestampFile.str(), EC, sys::fs::F_None);
}
+static Expected<std::chrono::seconds> parseDuration(StringRef Duration) {
+ if (Duration.empty())
+ return make_error<StringError>("Duration must not be empty",
+ inconvertibleErrorCode());
+
+ StringRef NumStr = Duration.slice(0, Duration.size()-1);
+ uint64_t Num;
+ if (NumStr.getAsInteger(0, Num))
+ return make_error<StringError>("'" + NumStr + "' not an integer",
+ inconvertibleErrorCode());
+
+ switch (Duration.back()) {
+ case 's':
+ return std::chrono::seconds(Num);
+ case 'm':
+ return std::chrono::minutes(Num);
+ case 'h':
+ return std::chrono::hours(Num);
+ default:
+ return make_error<StringError>("'" + Duration +
+ "' must end with one of 's', 'm' or 'h'",
+ inconvertibleErrorCode());
+ }
+}
+
+Expected<CachePruningPolicy>
+llvm::parseCachePruningPolicy(StringRef PolicyStr) {
+ CachePruningPolicy Policy;
+ std::pair<StringRef, StringRef> P = {"", PolicyStr};
+ while (!P.second.empty()) {
+ P = P.second.split(':');
+
+ StringRef Key, Value;
+ std::tie(Key, Value) = P.first.split('=');
+ if (Key == "prune_interval") {
+ auto DurationOrErr = parseDuration(Value);
+ if (!DurationOrErr)
+ return DurationOrErr.takeError();
+ Policy.Interval = *DurationOrErr;
+ } else if (Key == "prune_after") {
+ auto DurationOrErr = parseDuration(Value);
+ if (!DurationOrErr)
+ return DurationOrErr.takeError();
+ Policy.Expiration = *DurationOrErr;
+ } else if (Key == "cache_size") {
+ if (Value.back() != '%')
+ return make_error<StringError>("'" + Value + "' must be a percentage",
+ inconvertibleErrorCode());
+ StringRef SizeStr = Value.slice(0, Value.size() - 1);
+ uint64_t Size;
+ if (SizeStr.getAsInteger(0, Size))
+ return make_error<StringError>("'" + SizeStr + "' not an integer",
+ inconvertibleErrorCode());
+ if (Size > 100)
+ return make_error<StringError>("'" + SizeStr +
+ "' must be between 0 and 100",
+ inconvertibleErrorCode());
+ Policy.PercentageOfAvailableSpace = Size;
+ } else {
+ return make_error<StringError>("Unknown key: '" + Key + "'",
+ inconvertibleErrorCode());
+ }
+ }
+
+ return Policy;
+}
+
/// Prune the cache of files that haven't been accessed in a long time.
-bool CachePruning::prune() {
+bool llvm::pruneCache(StringRef Path, CachePruningPolicy Policy) {
using namespace std::chrono;
if (Path.empty())
@@ -47,7 +115,11 @@ bool CachePruning::prune() {
if (!isPathDir)
return false;
- if (Expiration == seconds(0) && PercentageOfAvailableSpace == 0) {
+ Policy.PercentageOfAvailableSpace =
+ std::min(Policy.PercentageOfAvailableSpace, 100u);
+
+ if (Policy.Expiration == seconds(0) &&
+ Policy.PercentageOfAvailableSpace == 0) {
DEBUG(dbgs() << "No pruning settings set, exit early\n");
// Nothing will be pruned, early exit
return false;
@@ -67,12 +139,12 @@ bool CachePruning::prune() {
return false;
}
} else {
- if (Interval == seconds(0)) {
+ if (Policy.Interval == seconds(0)) {
// Check whether the time stamp is older than our pruning interval.
// If not, do nothing.
const auto TimeStampModTime = FileStatus.getLastModificationTime();
auto TimeStampAge = CurrentTime - TimeStampModTime;
- if (TimeStampAge <= Interval) {
+ if (TimeStampAge <= Policy.Interval) {
DEBUG(dbgs() << "Timestamp file too recent ("
<< duration_cast<seconds>(TimeStampAge).count()
<< "s old), do not prune.\n");
@@ -85,7 +157,7 @@ bool CachePruning::prune() {
writeTimestampFile(TimestampFile);
}
- bool ShouldComputeSize = (PercentageOfAvailableSpace > 0);
+ bool ShouldComputeSize = (Policy.PercentageOfAvailableSpace > 0);
// Keep track of space
std::set<std::pair<uint64_t, std::string>> FileSizes;
@@ -108,8 +180,11 @@ bool CachePruning::prune() {
// Walk all of the files within this directory.
for (sys::fs::directory_iterator File(CachePathNative, EC), FileEnd;
File != FileEnd && !EC; File.increment(EC)) {
- // Do not touch the timestamp.
- if (File->path() == TimestampFile)
+ // Ignore any files not beginning with the string "llvmcache-". This
+ // includes the timestamp file as well as any files created by the user.
+ // This acts as a safeguard against data loss if the user specifies the
+ // wrong directory as their cache directory.
+ if (!sys::path::filename(File->path()).startswith("llvmcache-"))
continue;
// Look at this file. If we can't stat it, there's nothing interesting
@@ -122,7 +197,7 @@ bool CachePruning::prune() {
// If the file hasn't been used recently enough, delete it
const auto FileAccessTime = FileStatus.getLastAccessedTime();
auto FileAge = CurrentTime - FileAccessTime;
- if (FileAge > Expiration) {
+ if (FileAge > Policy.Expiration) {
DEBUG(dbgs() << "Remove " << File->path() << " ("
<< duration_cast<seconds>(FileAge).count() << "s old)\n");
sys::fs::remove(File->path());
@@ -143,9 +218,11 @@ bool CachePruning::prune() {
auto AvailableSpace = TotalSize + SpaceInfo.free;
auto FileAndSize = FileSizes.rbegin();
DEBUG(dbgs() << "Occupancy: " << ((100 * TotalSize) / AvailableSpace)
- << "% target is: " << PercentageOfAvailableSpace << "\n");
+ << "% target is: " << Policy.PercentageOfAvailableSpace
+ << "\n");
// Remove the oldest accessed files first, till we get below the threshold
- while (((100 * TotalSize) / AvailableSpace) > PercentageOfAvailableSpace &&
+ while (((100 * TotalSize) / AvailableSpace) >
+ Policy.PercentageOfAvailableSpace &&
FileAndSize != FileSizes.rend()) {
// Remove the file.
sys::fs::remove(FileAndSize->second);
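
The change above splits the old CachePruning class into a policy-string parser and a free pruning function. A minimal caller sketch, assuming the declarations this diff adds to llvm/Support/CachePruning.h (the helper name pruneWithPolicy is illustrative):

    #include "llvm/Support/CachePruning.h"
    #include "llvm/Support/Error.h"
    #include "llvm/Support/raw_ostream.h"
    using namespace llvm;

    // Parse a policy string such as "prune_interval=1h:cache_size=50%" and
    // prune an LTO cache directory with it.
    static bool pruneWithPolicy(StringRef CacheDir, StringRef PolicyStr) {
      Expected<CachePruningPolicy> Policy = parseCachePruningPolicy(PolicyStr);
      if (!Policy) {
        logAllUnhandledErrors(Policy.takeError(), errs(), "cache policy: ");
        return false;
      }
      return pruneCache(CacheDir, *Policy);
    }
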
diff --git a/lib/Support/Chrono.cpp b/lib/Support/Chrono.cpp
index cdadbd879979..daccaf1fc103 100644
--- a/lib/Support/Chrono.cpp
+++ b/lib/Support/Chrono.cpp
@@ -16,6 +16,13 @@ namespace llvm {
using namespace sys;
+const char llvm::detail::unit<std::ratio<3600>>::value[] = "h";
+const char llvm::detail::unit<std::ratio<60>>::value[] = "m";
+const char llvm::detail::unit<std::ratio<1>>::value[] = "s";
+const char llvm::detail::unit<std::milli>::value[] = "ms";
+const char llvm::detail::unit<std::micro>::value[] = "us";
+const char llvm::detail::unit<std::nano>::value[] = "ns";
+
static inline struct tm getStructTM(TimePoint<> TP) {
struct tm Storage;
std::time_t OurTime = toTimeT(TP);
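
These unit strings back the formatv provider for std::chrono durations declared in llvm/Support/Chrono.h. A quick sketch (the exact spacing of the rendered value is the provider's choice):

    #include "llvm/Support/Chrono.h"
    #include "llvm/Support/FormatVariadic.h"
    #include "llvm/Support/raw_ostream.h"

    void printDurations() {
      // The unit suffixes ("ms", "h", ...) come from the detail::unit
      // specializations defined above.
      llvm::outs() << llvm::formatv("{0}", std::chrono::milliseconds(250)) << "\n";
      llvm::outs() << llvm::formatv("{0}", std::chrono::hours(2)) << "\n";
    }
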
diff --git a/lib/Support/CommandLine.cpp b/lib/Support/CommandLine.cpp
index 3889902eea54..f4a9108b8544 100644
--- a/lib/Support/CommandLine.cpp
+++ b/lib/Support/CommandLine.cpp
@@ -123,7 +123,7 @@ public:
void ResetAllOptionOccurrences();
bool ParseCommandLineOptions(int argc, const char *const *argv,
- StringRef Overview, bool IgnoreErrors);
+ StringRef Overview, raw_ostream *Errs = nullptr);
void addLiteralOption(Option &Opt, SubCommand *SC, StringRef Name) {
if (Opt.hasArgStr())
@@ -1013,9 +1013,9 @@ void cl::ParseEnvironmentOptions(const char *progName, const char *envVar,
}
bool cl::ParseCommandLineOptions(int argc, const char *const *argv,
- StringRef Overview, bool IgnoreErrors) {
+ StringRef Overview, raw_ostream *Errs) {
return GlobalParser->ParseCommandLineOptions(argc, argv, Overview,
- IgnoreErrors);
+ Errs);
}
void CommandLineParser::ResetAllOptionOccurrences() {
@@ -1030,7 +1030,7 @@ void CommandLineParser::ResetAllOptionOccurrences() {
bool CommandLineParser::ParseCommandLineOptions(int argc,
const char *const *argv,
StringRef Overview,
- bool IgnoreErrors) {
+ raw_ostream *Errs) {
assert(hasOptions() && "No options specified!");
// Expand response files.
@@ -1045,6 +1045,9 @@ bool CommandLineParser::ParseCommandLineOptions(int argc,
ProgramName = sys::path::filename(StringRef(argv[0]));
ProgramOverview = Overview;
+ bool IgnoreErrors = Errs;
+ if (!Errs)
+ Errs = &errs();
bool ErrorParsing = false;
// Check out the positional arguments to collect information about them.
@@ -1097,15 +1100,14 @@ bool CommandLineParser::ParseCommandLineOptions(int argc,
// not specified after an option that eats all extra arguments, or this
// one will never get any!
//
- if (!IgnoreErrors) {
+ if (!IgnoreErrors)
Opt->error("error - option can never match, because "
"another positional argument will match an "
"unbounded number of values, and this option"
" does not require a value!");
- errs() << ProgramName << ": CommandLine Error: Option '"
- << Opt->ArgStr << "' is all messed up!\n";
- errs() << PositionalOpts.size();
- }
+ *Errs << ProgramName << ": CommandLine Error: Option '" << Opt->ArgStr
+ << "' is all messed up!\n";
+ *Errs << PositionalOpts.size();
ErrorParsing = true;
}
UnboundedFound |= EatsUnboundedNumberOfValues(Opt);
@@ -1200,15 +1202,13 @@ bool CommandLineParser::ParseCommandLineOptions(int argc,
if (!Handler) {
if (SinkOpts.empty()) {
- if (!IgnoreErrors) {
- errs() << ProgramName << ": Unknown command line argument '"
- << argv[i] << "'. Try: '" << argv[0] << " -help'\n";
-
- if (NearestHandler) {
- // If we know a near match, report it as well.
- errs() << ProgramName << ": Did you mean '-" << NearestHandlerString
- << "'?\n";
- }
+ *Errs << ProgramName << ": Unknown command line argument '" << argv[i]
+ << "'. Try: '" << argv[0] << " -help'\n";
+
+ if (NearestHandler) {
+ // If we know a near match, report it as well.
+ *Errs << ProgramName << ": Did you mean '-" << NearestHandlerString
+ << "'?\n";
}
ErrorParsing = true;
@@ -1231,22 +1231,18 @@ bool CommandLineParser::ParseCommandLineOptions(int argc,
// Check and handle positional arguments now...
if (NumPositionalRequired > PositionalVals.size()) {
- if (!IgnoreErrors) {
- errs() << ProgramName
+ *Errs << ProgramName
<< ": Not enough positional command line arguments specified!\n"
<< "Must specify at least " << NumPositionalRequired
<< " positional argument" << (NumPositionalRequired > 1 ? "s" : "")
<< ": See: " << argv[0] << " - help\n";
- }
ErrorParsing = true;
} else if (!HasUnlimitedPositionals &&
PositionalVals.size() > PositionalOpts.size()) {
- if (!IgnoreErrors) {
- errs() << ProgramName << ": Too many positional arguments specified!\n"
- << "Can specify at most " << PositionalOpts.size()
- << " positional arguments: See: " << argv[0] << " -help\n";
- }
+ *Errs << ProgramName << ": Too many positional arguments specified!\n"
+ << "Can specify at most " << PositionalOpts.size()
+ << " positional arguments: See: " << argv[0] << " -help\n";
ErrorParsing = true;
} else if (!ConsumeAfterOpt) {
@@ -1404,8 +1400,8 @@ static StringRef getValueStr(const Option &O, StringRef DefaultMsg) {
// Return the width of the option tag for printing...
size_t alias::getOptionWidth() const { return ArgStr.size() + 6; }
-static void printHelpStr(StringRef HelpStr, size_t Indent,
- size_t FirstLineIndentedBy) {
+void Option::printHelpStr(StringRef HelpStr, size_t Indent,
+ size_t FirstLineIndentedBy) {
std::pair<StringRef, StringRef> Split = HelpStr.split('\n');
outs().indent(Indent - FirstLineIndentedBy) << " - " << Split.first << "\n";
while (!Split.second.empty()) {
@@ -1448,7 +1444,7 @@ void basic_parser_impl::printOptionInfo(const Option &O,
if (!ValName.empty())
outs() << "=<" << getValueStr(O, ValName) << '>';
- printHelpStr(O.HelpStr, GlobalWidth, getOptionWidth(O));
+ Option::printHelpStr(O.HelpStr, GlobalWidth, getOptionWidth(O));
}
void basic_parser_impl::printOptionName(const Option &O,
@@ -1587,7 +1583,7 @@ void generic_parser_base::printOptionInfo(const Option &O,
size_t GlobalWidth) const {
if (O.hasArgStr()) {
outs() << " -" << O.ArgStr;
- printHelpStr(O.HelpStr, GlobalWidth, O.ArgStr.size() + 6);
+ Option::printHelpStr(O.HelpStr, GlobalWidth, O.ArgStr.size() + 6);
for (unsigned i = 0, e = getNumOptions(); i != e; ++i) {
size_t NumSpaces = GlobalWidth - getOption(i).size() - 8;
@@ -1600,7 +1596,7 @@ void generic_parser_base::printOptionInfo(const Option &O,
for (unsigned i = 0, e = getNumOptions(); i != e; ++i) {
auto Option = getOption(i);
outs() << " -" << Option;
- printHelpStr(getDescription(i), GlobalWidth, Option.size() + 8);
+ Option::printHelpStr(getDescription(i), GlobalWidth, Option.size() + 8);
}
}
}
@@ -1856,10 +1852,11 @@ public:
// Helper function for printOptions().
// It shall return a negative value if A's name should be lexicographically
- // ordered before B's name. It returns a value greater equal zero otherwise.
+ // ordered before B's name. It returns a value greater than zero if B's name
+ // should be ordered before A's name, and it returns 0 otherwise.
static int OptionCategoryCompare(OptionCategory *const *A,
OptionCategory *const *B) {
- return (*A)->getName() == (*B)->getName();
+ return (*A)->getName().compare((*B)->getName());
}
// Make sure we inherit our base class's operator=()
@@ -2182,5 +2179,6 @@ void cl::ResetAllOptionOccurrences() {
void LLVMParseCommandLineOptions(int argc, const char *const *argv,
const char *Overview) {
- llvm::cl::ParseCommandLineOptions(argc, argv, StringRef(Overview), true);
+ llvm::cl::ParseCommandLineOptions(argc, argv, StringRef(Overview),
+ &llvm::nulls());
}
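
With the bool parameter replaced by a stream, callers that used to pass IgnoreErrors=true now pass a sink; passing any non-null stream suppresses the hard Opt->error() path and routes the diagnostics to that stream instead. A sketch:

    #include "llvm/Support/CommandLine.h"
    #include "llvm/Support/raw_ostream.h"
    #include <string>

    // Returns false on a parse failure; diagnostics land in Diags rather than
    // on stderr. Passing nullptr (the default) keeps the old errs() behavior.
    bool parseQuietly(int argc, const char *const *argv, std::string &Diags) {
      llvm::raw_string_ostream OS(Diags);
      return llvm::cl::ParseCommandLineOptions(argc, argv, "tool", &OS);
    }
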
diff --git a/lib/Support/Compression.cpp b/lib/Support/Compression.cpp
index 5d556462e89c..c279d10f6c61 100644
--- a/lib/Support/Compression.cpp
+++ b/lib/Support/Compression.cpp
@@ -16,6 +16,7 @@
#include "llvm/ADT/StringRef.h"
#include "llvm/Config/config.h"
#include "llvm/Support/Compiler.h"
+#include "llvm/Support/Error.h"
#include "llvm/Support/ErrorHandling.h"
#if LLVM_ENABLE_ZLIB == 1 && HAVE_ZLIB_H
#include <zlib.h>
@@ -24,6 +25,10 @@
using namespace llvm;
#if LLVM_ENABLE_ZLIB == 1 && HAVE_LIBZ
+static Error createError(StringRef Err) {
+ return make_error<StringError>(Err, inconvertibleErrorCode());
+}
+
static int encodeZlibCompressionLevel(zlib::CompressionLevel Level) {
switch (Level) {
case zlib::NoCompression: return 0;
@@ -34,53 +39,59 @@ static int encodeZlibCompressionLevel(zlib::CompressionLevel Level) {
llvm_unreachable("Invalid zlib::CompressionLevel!");
}
-static zlib::Status encodeZlibReturnValue(int ReturnValue) {
- switch (ReturnValue) {
- case Z_OK: return zlib::StatusOK;
- case Z_MEM_ERROR: return zlib::StatusOutOfMemory;
- case Z_BUF_ERROR: return zlib::StatusBufferTooShort;
- case Z_STREAM_ERROR: return zlib::StatusInvalidArg;
- case Z_DATA_ERROR: return zlib::StatusInvalidData;
- default: llvm_unreachable("unknown zlib return status!");
+static StringRef convertZlibCodeToString(int Code) {
+ switch (Code) {
+ case Z_MEM_ERROR:
+ return "zlib error: Z_MEM_ERROR";
+ case Z_BUF_ERROR:
+ return "zlib error: Z_BUF_ERROR";
+ case Z_STREAM_ERROR:
+ return "zlib error: Z_STREAM_ERROR";
+ case Z_DATA_ERROR:
+ return "zlib error: Z_DATA_ERROR";
+ case Z_OK:
+ default:
+ llvm_unreachable("unknown or unexpected zlib status code");
}
}
bool zlib::isAvailable() { return true; }
-zlib::Status zlib::compress(StringRef InputBuffer,
- SmallVectorImpl<char> &CompressedBuffer,
- CompressionLevel Level) {
+
+Error zlib::compress(StringRef InputBuffer,
+ SmallVectorImpl<char> &CompressedBuffer,
+ CompressionLevel Level) {
unsigned long CompressedSize = ::compressBound(InputBuffer.size());
CompressedBuffer.resize(CompressedSize);
int CLevel = encodeZlibCompressionLevel(Level);
- Status Res = encodeZlibReturnValue(::compress2(
- (Bytef *)CompressedBuffer.data(), &CompressedSize,
- (const Bytef *)InputBuffer.data(), InputBuffer.size(), CLevel));
+ int Res = ::compress2((Bytef *)CompressedBuffer.data(), &CompressedSize,
+ (const Bytef *)InputBuffer.data(), InputBuffer.size(),
+ CLevel);
// Tell MemorySanitizer that zlib output buffer is fully initialized.
// This avoids a false report when running LLVM with uninstrumented ZLib.
__msan_unpoison(CompressedBuffer.data(), CompressedSize);
CompressedBuffer.resize(CompressedSize);
- return Res;
+ return Res ? createError(convertZlibCodeToString(Res)) : Error::success();
}
-zlib::Status zlib::uncompress(StringRef InputBuffer, char *UncompressedBuffer,
- size_t &UncompressedSize) {
- Status Res = encodeZlibReturnValue(
+Error zlib::uncompress(StringRef InputBuffer, char *UncompressedBuffer,
+ size_t &UncompressedSize) {
+ int Res =
::uncompress((Bytef *)UncompressedBuffer, (uLongf *)&UncompressedSize,
- (const Bytef *)InputBuffer.data(), InputBuffer.size()));
+ (const Bytef *)InputBuffer.data(), InputBuffer.size());
// Tell MemorySanitizer that zlib output buffer is fully initialized.
// This avoids a false report when running LLVM with uninstrumented ZLib.
__msan_unpoison(UncompressedBuffer, UncompressedSize);
- return Res;
+ return Res ? createError(convertZlibCodeToString(Res)) : Error::success();
}
-zlib::Status zlib::uncompress(StringRef InputBuffer,
- SmallVectorImpl<char> &UncompressedBuffer,
- size_t UncompressedSize) {
+Error zlib::uncompress(StringRef InputBuffer,
+ SmallVectorImpl<char> &UncompressedBuffer,
+ size_t UncompressedSize) {
UncompressedBuffer.resize(UncompressedSize);
- Status Res =
+ Error E =
uncompress(InputBuffer, UncompressedBuffer.data(), UncompressedSize);
UncompressedBuffer.resize(UncompressedSize);
- return Res;
+ return E;
}
uint32_t zlib::crc32(StringRef Buffer) {
@@ -89,19 +100,19 @@ uint32_t zlib::crc32(StringRef Buffer) {
#else
bool zlib::isAvailable() { return false; }
-zlib::Status zlib::compress(StringRef InputBuffer,
- SmallVectorImpl<char> &CompressedBuffer,
- CompressionLevel Level) {
- return zlib::StatusUnsupported;
+Error zlib::compress(StringRef InputBuffer,
+ SmallVectorImpl<char> &CompressedBuffer,
+ CompressionLevel Level) {
+ llvm_unreachable("zlib::compress is unavailable");
}
-zlib::Status zlib::uncompress(StringRef InputBuffer, char *UncompressedBuffer,
- size_t &UncompressedSize) {
- return zlib::StatusUnsupported;
+Error zlib::uncompress(StringRef InputBuffer, char *UncompressedBuffer,
+ size_t &UncompressedSize) {
+ llvm_unreachable("zlib::uncompress is unavailable");
}
-zlib::Status zlib::uncompress(StringRef InputBuffer,
- SmallVectorImpl<char> &UncompressedBuffer,
- size_t UncompressedSize) {
- return zlib::StatusUnsupported;
+Error zlib::uncompress(StringRef InputBuffer,
+ SmallVectorImpl<char> &UncompressedBuffer,
+ size_t UncompressedSize) {
+ llvm_unreachable("zlib::uncompress is unavailable");
}
uint32_t zlib::crc32(StringRef Buffer) {
llvm_unreachable("zlib::crc32 is unavailable");
diff --git a/lib/Support/DebugCounter.cpp b/lib/Support/DebugCounter.cpp
new file mode 100644
index 000000000000..29dae8a20f00
--- /dev/null
+++ b/lib/Support/DebugCounter.cpp
@@ -0,0 +1,108 @@
+#include "llvm/Support/DebugCounter.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/Format.h"
+#include "llvm/Support/ManagedStatic.h"
+#include "llvm/Support/Options.h"
+
+using namespace llvm;
+
+// This class overrides the default list implementation of printing so we
+// can pretty print the list of debug counter options. This type of
+// dynamic option is pretty rare (basically this and pass lists).
+class DebugCounterList : public cl::list<std::string, DebugCounter> {
+private:
+ using Base = cl::list<std::string, DebugCounter>;
+
+public:
+ template <class... Mods>
+ explicit DebugCounterList(Mods &&... Ms) : Base(std::forward<Mods>(Ms)...) {}
+
+private:
+ void printOptionInfo(size_t GlobalWidth) const override {
+    // This is a variant of generic_parser_base::printOptionInfo. Sadly,
+ // it's not easy to make it more usable. We could get it to print these as
+ // options if we were a cl::opt and registered them, but lists don't have
+ // options, nor does the parser for std::string. The other mechanisms for
+ // options are global and would pollute the global namespace with our
+ // counters. Rather than go that route, we have just overridden the
+ // printing, which only a few things call anyway.
+ outs() << " -" << ArgStr;
+ // All of the other options in CommandLine.cpp use ArgStr.size() + 6 for
+ // width, so we do the same.
+ Option::printHelpStr(HelpStr, GlobalWidth, ArgStr.size() + 6);
+ const auto &CounterInstance = DebugCounter::instance();
+ for (auto Name : CounterInstance) {
+ const auto Info =
+ CounterInstance.getCounterInfo(CounterInstance.getCounterId(Name));
+ size_t NumSpaces = GlobalWidth - Info.first.size() - 8;
+ outs() << " =" << Info.first;
+ outs().indent(NumSpaces) << " - " << Info.second << '\n';
+ }
+ }
+};
+
+// Create our command line option.
+static DebugCounterList DebugCounterOption(
+ "debug-counter",
+ cl::desc("Comma separated list of debug counter skip and count"),
+ cl::CommaSeparated, cl::ZeroOrMore, cl::location(DebugCounter::instance()));
+
+static ManagedStatic<DebugCounter> DC;
+
+DebugCounter &DebugCounter::instance() { return *DC; }
+
+// This is called by the command line parser when it sees a value for the
+// debug-counter option defined above.
+void DebugCounter::push_back(const std::string &Val) {
+ if (Val.empty())
+ return;
+ // The strings should come in as counter=value
+ auto CounterPair = StringRef(Val).split('=');
+ if (CounterPair.second.empty()) {
+ errs() << "DebugCounter Error: " << Val << " does not have an = in it\n";
+ return;
+ }
+ // Now we have counter=value.
+ // First, process value.
+ long CounterVal;
+ if (CounterPair.second.getAsInteger(0, CounterVal)) {
+ errs() << "DebugCounter Error: " << CounterPair.second
+ << " is not a number\n";
+ return;
+ }
+ // Now we need to see if this is the skip or the count, remove the suffix, and
+ // add it to the counter values.
+ if (CounterPair.first.endswith("-skip")) {
+ auto CounterName = CounterPair.first.drop_back(5);
+ unsigned CounterID = RegisteredCounters.idFor(CounterName);
+ if (!CounterID) {
+ errs() << "DebugCounter Error: " << CounterName
+ << " is not a registered counter\n";
+ return;
+ }
+
+ auto Res = Counters.insert({CounterID, {0, -1}});
+ Res.first->second.first = CounterVal;
+ } else if (CounterPair.first.endswith("-count")) {
+ auto CounterName = CounterPair.first.drop_back(6);
+ unsigned CounterID = RegisteredCounters.idFor(CounterName);
+ if (!CounterID) {
+ errs() << "DebugCounter Error: " << CounterName
+ << " is not a registered counter\n";
+ return;
+ }
+
+ auto Res = Counters.insert({CounterID, {0, -1}});
+ Res.first->second.second = CounterVal;
+ } else {
+ errs() << "DebugCounter Error: " << CounterPair.first
+ << " does not end with -skip or -count\n";
+ }
+}
+
+void DebugCounter::print(raw_ostream &OS) {
+ OS << "Counters and values:\n";
+ for (const auto &KV : Counters)
+ OS << left_justify(RegisteredCounters[KV.first], 32) << ": {"
+ << KV.second.first << "," << KV.second.second << "}\n";
+}
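
On the consumer side, a pass declares a counter with the DEBUG_COUNTER macro from llvm/Support/DebugCounter.h (added alongside this file) and guards individual transformations on it. A sketch with a hypothetical pass-local counter:

    #include "llvm/Support/DebugCounter.h"
    using namespace llvm;

    DEBUG_COUNTER(MyPassXform, "my-pass-transform",
                  "Controls which transformations my-pass performs");

    bool maybeTransform() {
      // With -debug-counter=my-pass-transform-skip=2,my-pass-transform-count=1
      // only the third candidate is transformed.
      if (!DebugCounter::shouldExecute(MyPassXform))
        return false;
      // ... perform the transformation ...
      return true;
    }
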
diff --git a/lib/Support/Dwarf.cpp b/lib/Support/Dwarf.cpp
index 8950e8c919a4..f13da62e4a87 100644
--- a/lib/Support/Dwarf.cpp
+++ b/lib/Support/Dwarf.cpp
@@ -304,6 +304,17 @@ StringRef llvm::dwarf::ApplePropertyString(unsigned Prop) {
}
}
+StringRef llvm::dwarf::UnitTypeString(unsigned UT) {
+ switch (UT) {
+ default:
+ return StringRef();
+#define HANDLE_DW_UT(ID, NAME) \
+ case DW_UT_##NAME: \
+ return "DW_UT_" #NAME;
+#include "llvm/Support/Dwarf.def"
+ }
+}
+
StringRef llvm::dwarf::AtomTypeString(unsigned AT) {
switch (AT) {
case dwarf::DW_ATOM_null:
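
The new helper follows the pattern of the other *String functions in this file; unknown codes yield an empty StringRef. A minimal sketch:

    #include "llvm/Support/Dwarf.h"
    #include "llvm/Support/raw_ostream.h"

    void printUnitType() {
      // Prints "DW_UT_compile"; an unhandled value would print nothing.
      llvm::outs() << llvm::dwarf::UnitTypeString(llvm::dwarf::DW_UT_compile)
                   << "\n";
    }
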
diff --git a/lib/Support/DynamicLibrary.cpp b/lib/Support/DynamicLibrary.cpp
index ced21e46afe8..92ce6185306a 100644
--- a/lib/Support/DynamicLibrary.cpp
+++ b/lib/Support/DynamicLibrary.cpp
@@ -9,8 +9,6 @@
//
// This file implements the operating system DynamicLibrary concept.
//
-// FIXME: This file leaks ExplicitSymbols and OpenedHandles!
-//
//===----------------------------------------------------------------------===//
#include "llvm/Support/DynamicLibrary.h"
@@ -51,7 +49,7 @@ using namespace llvm::sys;
//=== independent code.
//===----------------------------------------------------------------------===//
-static DenseSet<void *> *OpenedHandles = nullptr;
+static llvm::ManagedStatic<DenseSet<void *> > OpenedHandles;
DynamicLibrary DynamicLibrary::getPermanentLibrary(const char *filename,
std::string *errMsg) {
@@ -70,9 +68,6 @@ DynamicLibrary DynamicLibrary::getPermanentLibrary(const char *filename,
handle = RTLD_DEFAULT;
#endif
- if (!OpenedHandles)
- OpenedHandles = new DenseSet<void *>();
-
// If we've already loaded this library, dlclose() the handle in order to
// keep the internal refcount at +1.
if (!OpenedHandles->insert(handle).second)
@@ -81,6 +76,18 @@ DynamicLibrary DynamicLibrary::getPermanentLibrary(const char *filename,
return DynamicLibrary(handle);
}
+DynamicLibrary DynamicLibrary::addPermanentLibrary(void *handle,
+ std::string *errMsg) {
+ SmartScopedLock<true> lock(*SymbolsMutex);
+ // If we've already loaded this library, tell the caller.
+ if (!OpenedHandles->insert(handle).second) {
+ if (errMsg) *errMsg = "Library already loaded";
+ return DynamicLibrary();
+ }
+
+ return DynamicLibrary(handle);
+}
+
void *DynamicLibrary::getAddressOfSymbol(const char *symbolName) {
if (!isValid())
return nullptr;
@@ -121,7 +128,7 @@ void* DynamicLibrary::SearchForAddressOfSymbol(const char *symbolName) {
#if defined(HAVE_DLFCN_H) && defined(HAVE_DLOPEN)
// Now search the libraries.
- if (OpenedHandles) {
+ if (OpenedHandles.isConstructed()) {
for (DenseSet<void *>::iterator I = OpenedHandles->begin(),
E = OpenedHandles->end(); I != E; ++I) {
//lt_ptr ptr = lt_dlsym(*I, symbolName);
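
The new addPermanentLibrary entry point lets an embedder register a handle it already opened itself. A POSIX-flavored sketch (the dlopen flags are illustrative):

    #include "llvm/Support/DynamicLibrary.h"
    #include <dlfcn.h>
    #include <string>

    llvm::sys::DynamicLibrary adoptHandle(const char *Path, std::string &Err) {
      void *Handle = ::dlopen(Path, RTLD_LAZY | RTLD_GLOBAL);
      if (!Handle) {
        Err = ::dlerror();
        return llvm::sys::DynamicLibrary();
      }
      // Fails with "Library already loaded" if the handle is already known.
      return llvm::sys::DynamicLibrary::addPermanentLibrary(Handle, &Err);
    }
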
diff --git a/lib/Support/FileOutputBuffer.cpp b/lib/Support/FileOutputBuffer.cpp
index 57e5a8d7871c..731740d012d9 100644
--- a/lib/Support/FileOutputBuffer.cpp
+++ b/lib/Support/FileOutputBuffer.cpp
@@ -57,6 +57,8 @@ FileOutputBuffer::create(StringRef FilePath, size_t Size, unsigned Flags) {
// FIXME: In posix, you use the access() call to check this.
}
break;
+ case sys::fs::file_type::directory_file:
+ return errc::is_a_directory;
default:
if (EC)
return EC;
diff --git a/lib/Support/Host.cpp b/lib/Support/Host.cpp
index d1b40412a6fc..970ecfd7df90 100644
--- a/lib/Support/Host.cpp
+++ b/lib/Support/Host.cpp
@@ -52,25 +52,218 @@
using namespace llvm;
-#if defined(__linux__)
-static ssize_t LLVM_ATTRIBUTE_UNUSED readCpuInfo(void *Buf, size_t Size) {
- // Note: We cannot mmap /proc/cpuinfo here and then process the resulting
- // memory buffer because the 'file' has 0 size (it can be read from only
- // as a stream).
-
- int FD;
- std::error_code EC = sys::fs::openFileForRead("/proc/cpuinfo", FD);
- if (EC) {
- DEBUG(dbgs() << "Unable to open /proc/cpuinfo: " << EC.message() << "\n");
- return -1;
+static std::unique_ptr<llvm::MemoryBuffer>
+ LLVM_ATTRIBUTE_UNUSED getProcCpuinfoContent() {
+ llvm::ErrorOr<std::unique_ptr<llvm::MemoryBuffer>> Text =
+ llvm::MemoryBuffer::getFileAsStream("/proc/cpuinfo");
+ if (std::error_code EC = Text.getError()) {
+ llvm::errs() << "Can't read "
+ << "/proc/cpuinfo: " << EC.message() << "\n";
+ return nullptr;
}
- int Ret = read(FD, Buf, Size);
- int CloseStatus = close(FD);
- if (CloseStatus)
- return -1;
- return Ret;
+ return std::move(*Text);
+}
+
+StringRef sys::detail::getHostCPUNameForPowerPC(
+ const StringRef &ProcCpuinfoContent) {
+ // Access to the Processor Version Register (PVR) on PowerPC is privileged,
+ // and so we must use an operating-system interface to determine the current
+ // processor type. On Linux, this is exposed through the /proc/cpuinfo file.
+ const char *generic = "generic";
+
+  // The cpu line is second (after the 'processor: 0' line), so we scan the
+  // content from the beginning until we find it.
+ StringRef::const_iterator CPUInfoStart = ProcCpuinfoContent.begin();
+ StringRef::const_iterator CPUInfoEnd = ProcCpuinfoContent.end();
+
+ StringRef::const_iterator CIP = CPUInfoStart;
+
+ StringRef::const_iterator CPUStart = 0;
+ size_t CPULen = 0;
+
+ // We need to find the first line which starts with cpu, spaces, and a colon.
+ // After the colon, there may be some additional spaces and then the cpu type.
+ while (CIP < CPUInfoEnd && CPUStart == 0) {
+ if (CIP < CPUInfoEnd && *CIP == '\n')
+ ++CIP;
+
+ if (CIP < CPUInfoEnd && *CIP == 'c') {
+ ++CIP;
+ if (CIP < CPUInfoEnd && *CIP == 'p') {
+ ++CIP;
+ if (CIP < CPUInfoEnd && *CIP == 'u') {
+ ++CIP;
+ while (CIP < CPUInfoEnd && (*CIP == ' ' || *CIP == '\t'))
+ ++CIP;
+
+ if (CIP < CPUInfoEnd && *CIP == ':') {
+ ++CIP;
+ while (CIP < CPUInfoEnd && (*CIP == ' ' || *CIP == '\t'))
+ ++CIP;
+
+ if (CIP < CPUInfoEnd) {
+ CPUStart = CIP;
+ while (CIP < CPUInfoEnd && (*CIP != ' ' && *CIP != '\t' &&
+ *CIP != ',' && *CIP != '\n'))
+ ++CIP;
+ CPULen = CIP - CPUStart;
+ }
+ }
+ }
+ }
+ }
+
+ if (CPUStart == 0)
+ while (CIP < CPUInfoEnd && *CIP != '\n')
+ ++CIP;
+ }
+
+ if (CPUStart == 0)
+ return generic;
+
+ return StringSwitch<const char *>(StringRef(CPUStart, CPULen))
+ .Case("604e", "604e")
+ .Case("604", "604")
+ .Case("7400", "7400")
+ .Case("7410", "7400")
+ .Case("7447", "7400")
+ .Case("7455", "7450")
+ .Case("G4", "g4")
+ .Case("POWER4", "970")
+ .Case("PPC970FX", "970")
+ .Case("PPC970MP", "970")
+ .Case("G5", "g5")
+ .Case("POWER5", "g5")
+ .Case("A2", "a2")
+ .Case("POWER6", "pwr6")
+ .Case("POWER7", "pwr7")
+ .Case("POWER8", "pwr8")
+ .Case("POWER8E", "pwr8")
+ .Case("POWER8NVL", "pwr8")
+ .Case("POWER9", "pwr9")
+ .Default(generic);
+}
+
+StringRef sys::detail::getHostCPUNameForARM(
+ const StringRef &ProcCpuinfoContent) {
+ // The cpuid register on arm is not accessible from user space. On Linux,
+ // it is exposed through the /proc/cpuinfo file.
+
+  // Split the cpuinfo text into lines; the CPU part line should appear within
+  // the first 32 lines in all cases, so reserve that much inline storage.
+ SmallVector<StringRef, 32> Lines;
+ ProcCpuinfoContent.split(Lines, "\n");
+
+ // Look for the CPU implementer line.
+ StringRef Implementer;
+ StringRef Hardware;
+ for (unsigned I = 0, E = Lines.size(); I != E; ++I) {
+ if (Lines[I].startswith("CPU implementer"))
+ Implementer = Lines[I].substr(15).ltrim("\t :");
+ if (Lines[I].startswith("Hardware"))
+ Hardware = Lines[I].substr(8).ltrim("\t :");
+ }
+
+ if (Implementer == "0x41") { // ARM Ltd.
+    // MSM8992/8994 may give the cpu part for whichever core the kernel is
+    // running on, which is nondeterministic and wrong. Always return
+    // cortex-a53 for these SoCs.
+    if (Hardware.endswith("MSM8994") || Hardware.endswith("MSM8996"))
+      return "cortex-a53";
+
+ // Look for the CPU part line.
+ for (unsigned I = 0, E = Lines.size(); I != E; ++I)
+ if (Lines[I].startswith("CPU part"))
+ // The CPU part is a 3 digit hexadecimal number with a 0x prefix. The
+ // values correspond to the "Part number" in the CP15/c0 register. The
+ // contents are specified in the various processor manuals.
+ return StringSwitch<const char *>(Lines[I].substr(8).ltrim("\t :"))
+ .Case("0x926", "arm926ej-s")
+ .Case("0xb02", "mpcore")
+ .Case("0xb36", "arm1136j-s")
+ .Case("0xb56", "arm1156t2-s")
+ .Case("0xb76", "arm1176jz-s")
+ .Case("0xc08", "cortex-a8")
+ .Case("0xc09", "cortex-a9")
+ .Case("0xc0f", "cortex-a15")
+ .Case("0xc20", "cortex-m0")
+ .Case("0xc23", "cortex-m3")
+ .Case("0xc24", "cortex-m4")
+ .Case("0xd04", "cortex-a35")
+ .Case("0xd03", "cortex-a53")
+ .Case("0xd07", "cortex-a57")
+ .Case("0xd08", "cortex-a72")
+ .Case("0xd09", "cortex-a73")
+ .Default("generic");
+ }
+
+ if (Implementer == "0x51") // Qualcomm Technologies, Inc.
+ // Look for the CPU part line.
+ for (unsigned I = 0, E = Lines.size(); I != E; ++I)
+ if (Lines[I].startswith("CPU part"))
+ // The CPU part is a 3 digit hexadecimal number with a 0x prefix. The
+ // values correspond to the "Part number" in the CP15/c0 register. The
+ // contents are specified in the various processor manuals.
+ return StringSwitch<const char *>(Lines[I].substr(8).ltrim("\t :"))
+ .Case("0x06f", "krait") // APQ8064
+ .Case("0x201", "kryo")
+ .Case("0x205", "kryo")
+ .Default("generic");
+
+ return "generic";
+}
+
+StringRef sys::detail::getHostCPUNameForS390x(
+ const StringRef &ProcCpuinfoContent) {
+ // STIDP is a privileged operation, so use /proc/cpuinfo instead.
+
+ // The "processor 0:" line comes after a fair amount of other information,
+ // including a cache breakdown, but this should be plenty.
+ SmallVector<StringRef, 32> Lines;
+ ProcCpuinfoContent.split(Lines, "\n");
+
+ // Look for the CPU features.
+ SmallVector<StringRef, 32> CPUFeatures;
+ for (unsigned I = 0, E = Lines.size(); I != E; ++I)
+ if (Lines[I].startswith("features")) {
+ size_t Pos = Lines[I].find(":");
+ if (Pos != StringRef::npos) {
+ Lines[I].drop_front(Pos + 1).split(CPUFeatures, ' ');
+ break;
+ }
+ }
+
+ // We need to check for the presence of vector support independently of
+ // the machine type, since we may only use the vector register set when
+ // supported by the kernel (and hypervisor).
+ bool HaveVectorSupport = false;
+ for (unsigned I = 0, E = CPUFeatures.size(); I != E; ++I) {
+ if (CPUFeatures[I] == "vx")
+ HaveVectorSupport = true;
+ }
+
+ // Now check the processor machine type.
+ for (unsigned I = 0, E = Lines.size(); I != E; ++I) {
+ if (Lines[I].startswith("processor ")) {
+ size_t Pos = Lines[I].find("machine = ");
+ if (Pos != StringRef::npos) {
+ Pos += sizeof("machine = ") - 1;
+ unsigned int Id;
+ if (!Lines[I].drop_front(Pos).getAsInteger(10, Id)) {
+ if (Id >= 2964 && HaveVectorSupport)
+ return "z13";
+ if (Id >= 2827)
+ return "zEC12";
+ if (Id >= 2817)
+ return "z196";
+ }
+ }
+ break;
+ }
+ }
+
+ return "generic";
}
-#endif
#if defined(__i386__) || defined(_M_IX86) || \
defined(__x86_64__) || defined(_M_X64)
@@ -1020,201 +1213,21 @@ StringRef sys::getHostCPUName() {
}
#elif defined(__linux__) && (defined(__ppc__) || defined(__powerpc__))
StringRef sys::getHostCPUName() {
- // Access to the Processor Version Register (PVR) on PowerPC is privileged,
- // and so we must use an operating-system interface to determine the current
- // processor type. On Linux, this is exposed through the /proc/cpuinfo file.
- const char *generic = "generic";
-
- // The cpu line is second (after the 'processor: 0' line), so if this
- // buffer is too small then something has changed (or is wrong).
- char buffer[1024];
- ssize_t CPUInfoSize = readCpuInfo(buffer, sizeof(buffer));
- if (CPUInfoSize == -1)
- return generic;
-
- const char *CPUInfoStart = buffer;
- const char *CPUInfoEnd = buffer + CPUInfoSize;
-
- const char *CIP = CPUInfoStart;
-
- const char *CPUStart = 0;
- size_t CPULen = 0;
-
- // We need to find the first line which starts with cpu, spaces, and a colon.
- // After the colon, there may be some additional spaces and then the cpu type.
- while (CIP < CPUInfoEnd && CPUStart == 0) {
- if (CIP < CPUInfoEnd && *CIP == '\n')
- ++CIP;
-
- if (CIP < CPUInfoEnd && *CIP == 'c') {
- ++CIP;
- if (CIP < CPUInfoEnd && *CIP == 'p') {
- ++CIP;
- if (CIP < CPUInfoEnd && *CIP == 'u') {
- ++CIP;
- while (CIP < CPUInfoEnd && (*CIP == ' ' || *CIP == '\t'))
- ++CIP;
-
- if (CIP < CPUInfoEnd && *CIP == ':') {
- ++CIP;
- while (CIP < CPUInfoEnd && (*CIP == ' ' || *CIP == '\t'))
- ++CIP;
-
- if (CIP < CPUInfoEnd) {
- CPUStart = CIP;
- while (CIP < CPUInfoEnd && (*CIP != ' ' && *CIP != '\t' &&
- *CIP != ',' && *CIP != '\n'))
- ++CIP;
- CPULen = CIP - CPUStart;
- }
- }
- }
- }
- }
-
- if (CPUStart == 0)
- while (CIP < CPUInfoEnd && *CIP != '\n')
- ++CIP;
- }
-
- if (CPUStart == 0)
- return generic;
-
- return StringSwitch<const char *>(StringRef(CPUStart, CPULen))
- .Case("604e", "604e")
- .Case("604", "604")
- .Case("7400", "7400")
- .Case("7410", "7400")
- .Case("7447", "7400")
- .Case("7455", "7450")
- .Case("G4", "g4")
- .Case("POWER4", "970")
- .Case("PPC970FX", "970")
- .Case("PPC970MP", "970")
- .Case("G5", "g5")
- .Case("POWER5", "g5")
- .Case("A2", "a2")
- .Case("POWER6", "pwr6")
- .Case("POWER7", "pwr7")
- .Case("POWER8", "pwr8")
- .Case("POWER8E", "pwr8")
- .Case("POWER8NVL", "pwr8")
- .Case("POWER9", "pwr9")
- .Default(generic);
+ std::unique_ptr<llvm::MemoryBuffer> P = getProcCpuinfoContent();
+ const StringRef& Content = P ? P->getBuffer() : "";
+ return detail::getHostCPUNameForPowerPC(Content);
}
-#elif defined(__linux__) && defined(__arm__)
+#elif defined(__linux__) && (defined(__arm__) || defined(__aarch64__))
StringRef sys::getHostCPUName() {
- // The cpuid register on arm is not accessible from user space. On Linux,
- // it is exposed through the /proc/cpuinfo file.
-
- // Read 1024 bytes from /proc/cpuinfo, which should contain the CPU part line
- // in all cases.
- char buffer[1024];
- ssize_t CPUInfoSize = readCpuInfo(buffer, sizeof(buffer));
- if (CPUInfoSize == -1)
- return "generic";
-
- StringRef Str(buffer, CPUInfoSize);
-
- SmallVector<StringRef, 32> Lines;
- Str.split(Lines, "\n");
-
- // Look for the CPU implementer line.
- StringRef Implementer;
- for (unsigned I = 0, E = Lines.size(); I != E; ++I)
- if (Lines[I].startswith("CPU implementer"))
- Implementer = Lines[I].substr(15).ltrim("\t :");
-
- if (Implementer == "0x41") // ARM Ltd.
- // Look for the CPU part line.
- for (unsigned I = 0, E = Lines.size(); I != E; ++I)
- if (Lines[I].startswith("CPU part"))
- // The CPU part is a 3 digit hexadecimal number with a 0x prefix. The
- // values correspond to the "Part number" in the CP15/c0 register. The
- // contents are specified in the various processor manuals.
- return StringSwitch<const char *>(Lines[I].substr(8).ltrim("\t :"))
- .Case("0x926", "arm926ej-s")
- .Case("0xb02", "mpcore")
- .Case("0xb36", "arm1136j-s")
- .Case("0xb56", "arm1156t2-s")
- .Case("0xb76", "arm1176jz-s")
- .Case("0xc08", "cortex-a8")
- .Case("0xc09", "cortex-a9")
- .Case("0xc0f", "cortex-a15")
- .Case("0xc20", "cortex-m0")
- .Case("0xc23", "cortex-m3")
- .Case("0xc24", "cortex-m4")
- .Default("generic");
-
- if (Implementer == "0x51") // Qualcomm Technologies, Inc.
- // Look for the CPU part line.
- for (unsigned I = 0, E = Lines.size(); I != E; ++I)
- if (Lines[I].startswith("CPU part"))
- // The CPU part is a 3 digit hexadecimal number with a 0x prefix. The
- // values correspond to the "Part number" in the CP15/c0 register. The
- // contents are specified in the various processor manuals.
- return StringSwitch<const char *>(Lines[I].substr(8).ltrim("\t :"))
- .Case("0x06f", "krait") // APQ8064
- .Default("generic");
-
- return "generic";
+ std::unique_ptr<llvm::MemoryBuffer> P = getProcCpuinfoContent();
+ const StringRef& Content = P ? P->getBuffer() : "";
+ return detail::getHostCPUNameForARM(Content);
}
#elif defined(__linux__) && defined(__s390x__)
StringRef sys::getHostCPUName() {
- // STIDP is a privileged operation, so use /proc/cpuinfo instead.
-
- // The "processor 0:" line comes after a fair amount of other information,
- // including a cache breakdown, but this should be plenty.
- char buffer[2048];
- ssize_t CPUInfoSize = readCpuInfo(buffer, sizeof(buffer));
- if (CPUInfoSize == -1)
- return "generic";
-
- StringRef Str(buffer, CPUInfoSize);
- SmallVector<StringRef, 32> Lines;
- Str.split(Lines, "\n");
-
- // Look for the CPU features.
- SmallVector<StringRef, 32> CPUFeatures;
- for (unsigned I = 0, E = Lines.size(); I != E; ++I)
- if (Lines[I].startswith("features")) {
- size_t Pos = Lines[I].find(":");
- if (Pos != StringRef::npos) {
- Lines[I].drop_front(Pos + 1).split(CPUFeatures, ' ');
- break;
- }
- }
-
- // We need to check for the presence of vector support independently of
- // the machine type, since we may only use the vector register set when
- // supported by the kernel (and hypervisor).
- bool HaveVectorSupport = false;
- for (unsigned I = 0, E = CPUFeatures.size(); I != E; ++I) {
- if (CPUFeatures[I] == "vx")
- HaveVectorSupport = true;
- }
-
- // Now check the processor machine type.
- for (unsigned I = 0, E = Lines.size(); I != E; ++I) {
- if (Lines[I].startswith("processor ")) {
- size_t Pos = Lines[I].find("machine = ");
- if (Pos != StringRef::npos) {
- Pos += sizeof("machine = ") - 1;
- unsigned int Id;
- if (!Lines[I].drop_front(Pos).getAsInteger(10, Id)) {
- if (Id >= 2964 && HaveVectorSupport)
- return "z13";
- if (Id >= 2827)
- return "zEC12";
- if (Id >= 2817)
- return "z196";
- }
- }
- break;
- }
- }
-
- return "generic";
+ std::unique_ptr<llvm::MemoryBuffer> P = getProcCpuinfoContent();
+ const StringRef& Content = P ? P->getBuffer() : "";
+ return detail::getHostCPUNameForS390x(Content);
}
#else
StringRef sys::getHostCPUName() { return "generic"; }
@@ -1232,6 +1245,7 @@ static int computeHostNumPhysicalCores() {
if (std::error_code EC = Text.getError()) {
llvm::errs() << "Can't read "
<< "/proc/cpuinfo: " << EC.message() << "\n";
+ return -1;
}
SmallVector<StringRef, 8> strs;
(*Text)->getBuffer().split(strs, "\n", /*MaxSplit=*/-1,
@@ -1353,6 +1367,10 @@ bool sys::getHostCPUFeatures(StringMap<bool> &Features) {
Features["tbm"] = HasExtLeaf1 && ((ECX >> 21) & 1);
Features["mwaitx"] = HasExtLeaf1 && ((ECX >> 29) & 1);
+ bool HasExtLeaf8 = MaxExtLevel >= 0x80000008 &&
+ !getX86CpuIDAndInfoEx(0x80000008,0x0, &EAX, &EBX, &ECX, &EDX);
+ Features["clzero"] = HasExtLeaf8 && ((EBX >> 0) & 1);
+
bool HasLeaf7 =
MaxLevel >= 7 && !getX86CpuIDAndInfoEx(0x7, 0x0, &EAX, &EBX, &ECX, &EDX);
@@ -1362,14 +1380,10 @@ bool sys::getHostCPUFeatures(StringMap<bool> &Features) {
Features["fsgsbase"] = HasLeaf7 && ((EBX >> 0) & 1);
Features["sgx"] = HasLeaf7 && ((EBX >> 2) & 1);
Features["bmi"] = HasLeaf7 && ((EBX >> 3) & 1);
- Features["hle"] = HasLeaf7 && ((EBX >> 4) & 1);
Features["bmi2"] = HasLeaf7 && ((EBX >> 8) & 1);
- Features["invpcid"] = HasLeaf7 && ((EBX >> 10) & 1);
Features["rtm"] = HasLeaf7 && ((EBX >> 11) & 1);
Features["rdseed"] = HasLeaf7 && ((EBX >> 18) & 1);
Features["adx"] = HasLeaf7 && ((EBX >> 19) & 1);
- Features["smap"] = HasLeaf7 && ((EBX >> 20) & 1);
- Features["pcommit"] = HasLeaf7 && ((EBX >> 22) & 1);
Features["clflushopt"] = HasLeaf7 && ((EBX >> 23) & 1);
Features["clwb"] = HasLeaf7 && ((EBX >> 24) & 1);
Features["sha"] = HasLeaf7 && ((EBX >> 29) & 1);
@@ -1401,17 +1415,12 @@ bool sys::getHostCPUFeatures(StringMap<bool> &Features) {
}
#elif defined(__linux__) && (defined(__arm__) || defined(__aarch64__))
bool sys::getHostCPUFeatures(StringMap<bool> &Features) {
- // Read 1024 bytes from /proc/cpuinfo, which should contain the Features line
- // in all cases.
- char buffer[1024];
- ssize_t CPUInfoSize = readCpuInfo(buffer, sizeof(buffer));
- if (CPUInfoSize == -1)
+ std::unique_ptr<llvm::MemoryBuffer> P = getProcCpuinfoContent();
+ if (!P)
return false;
- StringRef Str(buffer, CPUInfoSize);
-
SmallVector<StringRef, 32> Lines;
- Str.split(Lines, "\n");
+ P->getBuffer().split(Lines, "\n");
SmallVector<StringRef, 32> CPUFeatures;
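
Factoring the parsers out into sys::detail functions that take the cpuinfo text as a StringRef makes them unit-testable without a real /proc. A sketch (the sample cpuinfo text is made up):

    #include "llvm/ADT/StringRef.h"
    #include "llvm/Support/Host.h"

    void checkPowerPCParsing() {
      llvm::StringRef Cpuinfo = "processor : 0\n"
                                "cpu       : POWER8E (raw), altivec supported\n";
      // Expected to map POWER8E to "pwr8" via the StringSwitch above.
      llvm::StringRef Name = llvm::sys::detail::getHostCPUNameForPowerPC(Cpuinfo);
      (void)Name;
    }
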
diff --git a/lib/Support/LockFileManager.cpp b/lib/Support/LockFileManager.cpp
index 444aaa37c8c8..8be9879fbc24 100644
--- a/lib/Support/LockFileManager.cpp
+++ b/lib/Support/LockFileManager.cpp
@@ -304,9 +304,9 @@ LockFileManager::WaitForUnlockResult LockFileManager::waitForUnlock() {
Interval.tv_sec = 0;
Interval.tv_nsec = 1000000;
#endif
- // Don't wait more than five minutes per iteration. Total timeout for the file
- // to appear is ~8.5 mins.
- const unsigned MaxSeconds = 5*60;
+ // Don't wait more than 40s per iteration. Total timeout for the file
+ // to appear is ~1.5 minutes.
+ const unsigned MaxSeconds = 40;
do {
// Sleep for the designated interval, to allow the owning process time to
// finish up and remove the lock file.
diff --git a/lib/Support/LowLevelType.cpp b/lib/Support/LowLevelType.cpp
new file mode 100644
index 000000000000..4290d69cd197
--- /dev/null
+++ b/lib/Support/LowLevelType.cpp
@@ -0,0 +1,47 @@
+//===-- llvm/Support/LowLevelType.cpp -------------------------------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+/// \file This file implements the more header-heavy bits of the LLT class to
+/// avoid polluting users' namespaces.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/Support/LowLevelTypeImpl.h"
+#include "llvm/Support/raw_ostream.h"
+using namespace llvm;
+
+LLT::LLT(MVT VT) {
+ if (VT.isVector()) {
+ SizeInBits = VT.getVectorElementType().getSizeInBits();
+ ElementsOrAddrSpace = VT.getVectorNumElements();
+ Kind = ElementsOrAddrSpace == 1 ? Scalar : Vector;
+ } else if (VT.isValid()) {
+ // Aggregates are no different from real scalars as far as GlobalISel is
+ // concerned.
+ Kind = Scalar;
+ SizeInBits = VT.getSizeInBits();
+ ElementsOrAddrSpace = 1;
+ assert(SizeInBits != 0 && "invalid zero-sized type");
+ } else {
+ Kind = Invalid;
+ SizeInBits = ElementsOrAddrSpace = 0;
+ }
+}
+
+void LLT::print(raw_ostream &OS) const {
+ if (isVector())
+ OS << "<" << ElementsOrAddrSpace << " x s" << SizeInBits << ">";
+ else if (isPointer())
+ OS << "p" << getAddressSpace();
+ else if (isValid()) {
+ assert(isScalar() && "unexpected type");
+ OS << "s" << getScalarSizeInBits();
+ } else
+ llvm_unreachable("trying to print an invalid type");
+}
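
A small sketch of the MVT bridge and the printer (the MachineValueType.h include path is assumed for this point in the tree):

    #include "llvm/CodeGen/MachineValueType.h"
    #include "llvm/Support/LowLevelTypeImpl.h"
    #include "llvm/Support/raw_ostream.h"

    void printTypes() {
      llvm::LLT Vec(llvm::MVT::v4i32);
      Vec.print(llvm::errs()); // "<4 x s32>"
      llvm::errs() << "\n";
      llvm::LLT Scalar(llvm::MVT::i64);
      Scalar.print(llvm::errs()); // "s64"
    }
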
diff --git a/lib/Support/MD5.cpp b/lib/Support/MD5.cpp
index 942571eab0f3..bdbf1d677938 100644
--- a/lib/Support/MD5.cpp
+++ b/lib/Support/MD5.cpp
@@ -38,9 +38,13 @@
*/
#include "llvm/ADT/ArrayRef.h"
+#include "llvm/ADT/StringRef.h"
+#include "llvm/Support/Endian.h"
#include "llvm/Support/Format.h"
#include "llvm/Support/MD5.h"
#include "llvm/Support/raw_ostream.h"
+#include <array>
+#include <cstdint>
#include <cstring>
// The basic MD5 functions.
@@ -68,7 +72,7 @@
((MD5_u32plus) ptr[(n) * 4 + 3] << 24))
#define GET(n) (block[(n)])
-namespace llvm {
+using namespace llvm;
/// \brief This processes one or more 64-byte data blocks, but does NOT update
/// the bit counters. There are no alignment requirements.
@@ -179,9 +183,7 @@ const uint8_t *MD5::body(ArrayRef<uint8_t> Data) {
return ptr;
}
-MD5::MD5()
- : a(0x67452301), b(0xefcdab89), c(0x98badcfe), d(0x10325476), hi(0), lo(0) {
-}
+MD5::MD5() = default;
/// Incrementally add the bytes in \p Data to the hash.
void MD5::update(ArrayRef<uint8_t> Data) {
@@ -259,10 +261,16 @@ void MD5::final(MD5Result &Result) {
support::endian::write32le(&Result[12], d);
}
-void MD5::stringifyResult(MD5Result &Result, SmallString<32> &Str) {
+SmallString<32> MD5::MD5Result::digest() const {
+ SmallString<32> Str;
raw_svector_ostream Res(Str);
for (int i = 0; i < 16; ++i)
- Res << format("%.2x", Result[i]);
+ Res << format("%.2x", Bytes[i]);
+ return Str;
+}
+
+void MD5::stringifyResult(MD5Result &Result, SmallString<32> &Str) {
+ Str = Result.digest();
}
std::array<uint8_t, 16> MD5::hash(ArrayRef<uint8_t> Data) {
@@ -271,8 +279,5 @@ std::array<uint8_t, 16> MD5::hash(ArrayRef<uint8_t> Data) {
MD5::MD5Result Res;
Hash.final(Res);
- std::array<uint8_t, 16> Arr;
- memcpy(Arr.data(), Res, sizeof(Res));
- return Arr;
-}
+ return Res;
}
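
With the digest moved onto MD5Result, hashing a buffer to a hex string is now a one-liner at the end; a sketch:

    #include "llvm/ADT/SmallString.h"
    #include "llvm/ADT/StringRef.h"
    #include "llvm/Support/MD5.h"

    llvm::SmallString<32> md5Hex(llvm::StringRef Data) {
      llvm::MD5 Hash;
      Hash.update(Data);
      llvm::MD5::MD5Result Result;
      Hash.final(Result);
      return Result.digest(); // 32 lowercase hex characters
    }
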
diff --git a/lib/Support/ManagedStatic.cpp b/lib/Support/ManagedStatic.cpp
index 7dd31315f90d..fb7cd070c42d 100644
--- a/lib/Support/ManagedStatic.cpp
+++ b/lib/Support/ManagedStatic.cpp
@@ -21,7 +21,7 @@ using namespace llvm;
static const ManagedStaticBase *StaticList = nullptr;
static sys::Mutex *ManagedStaticMutex = nullptr;
-LLVM_DEFINE_ONCE_FLAG(mutex_init_flag);
+static llvm::once_flag mutex_init_flag;
static void initializeMutex() {
ManagedStaticMutex = new sys::Mutex();
diff --git a/lib/Support/MemoryBuffer.cpp b/lib/Support/MemoryBuffer.cpp
index a3a18c9283ce..227e792d83dc 100644
--- a/lib/Support/MemoryBuffer.cpp
+++ b/lib/Support/MemoryBuffer.cpp
@@ -103,7 +103,7 @@ public:
static ErrorOr<std::unique_ptr<MemoryBuffer>>
getFileAux(const Twine &Filename, int64_t FileSize, uint64_t MapSize,
- uint64_t Offset, bool RequiresNullTerminator, bool IsVolatileSize);
+ uint64_t Offset, bool RequiresNullTerminator, bool IsVolatile);
std::unique_ptr<MemoryBuffer>
MemoryBuffer::getMemBuffer(StringRef InputData, StringRef BufferName,
@@ -178,8 +178,8 @@ MemoryBuffer::getFileOrSTDIN(const Twine &Filename, int64_t FileSize,
ErrorOr<std::unique_ptr<MemoryBuffer>>
MemoryBuffer::getFileSlice(const Twine &FilePath, uint64_t MapSize,
- uint64_t Offset) {
- return getFileAux(FilePath, -1, MapSize, Offset, false, false);
+ uint64_t Offset, bool IsVolatile) {
+ return getFileAux(FilePath, -1, MapSize, Offset, false, IsVolatile);
}
@@ -254,19 +254,19 @@ getMemoryBufferForStream(int FD, const Twine &BufferName) {
ErrorOr<std::unique_ptr<MemoryBuffer>>
MemoryBuffer::getFile(const Twine &Filename, int64_t FileSize,
- bool RequiresNullTerminator, bool IsVolatileSize) {
+ bool RequiresNullTerminator, bool IsVolatile) {
return getFileAux(Filename, FileSize, FileSize, 0,
- RequiresNullTerminator, IsVolatileSize);
+ RequiresNullTerminator, IsVolatile);
}
static ErrorOr<std::unique_ptr<MemoryBuffer>>
getOpenFileImpl(int FD, const Twine &Filename, uint64_t FileSize,
uint64_t MapSize, int64_t Offset, bool RequiresNullTerminator,
- bool IsVolatileSize);
+ bool IsVolatile);
static ErrorOr<std::unique_ptr<MemoryBuffer>>
getFileAux(const Twine &Filename, int64_t FileSize, uint64_t MapSize,
- uint64_t Offset, bool RequiresNullTerminator, bool IsVolatileSize) {
+ uint64_t Offset, bool RequiresNullTerminator, bool IsVolatile) {
int FD;
std::error_code EC = sys::fs::openFileForRead(Filename, FD);
if (EC)
@@ -274,7 +274,7 @@ getFileAux(const Twine &Filename, int64_t FileSize, uint64_t MapSize,
ErrorOr<std::unique_ptr<MemoryBuffer>> Ret =
getOpenFileImpl(FD, Filename, FileSize, MapSize, Offset,
- RequiresNullTerminator, IsVolatileSize);
+ RequiresNullTerminator, IsVolatile);
close(FD);
return Ret;
}
@@ -285,11 +285,11 @@ static bool shouldUseMmap(int FD,
off_t Offset,
bool RequiresNullTerminator,
int PageSize,
- bool IsVolatileSize) {
+ bool IsVolatile) {
// mmap may leave the buffer without null terminator if the file size changed
// by the time the last page is mapped in, so avoid it if the file size is
// likely to change.
- if (IsVolatileSize)
+ if (IsVolatile)
return false;
// We don't use mmap for small files because this can severely fragment our
@@ -300,7 +300,6 @@ static bool shouldUseMmap(int FD,
if (!RequiresNullTerminator)
return true;
-
// If we don't know the file size, use fstat to find out. fstat on an open
// file descriptor is cheaper than stat on a random path.
// FIXME: this chunk of code is duplicated, but it avoids a fstat when
@@ -338,7 +337,7 @@ static bool shouldUseMmap(int FD,
static ErrorOr<std::unique_ptr<MemoryBuffer>>
getOpenFileImpl(int FD, const Twine &Filename, uint64_t FileSize,
uint64_t MapSize, int64_t Offset, bool RequiresNullTerminator,
- bool IsVolatileSize) {
+ bool IsVolatile) {
static int PageSize = sys::Process::getPageSize();
// Default is to map the full file.
@@ -365,7 +364,7 @@ getOpenFileImpl(int FD, const Twine &Filename, uint64_t FileSize,
}
if (shouldUseMmap(FD, FileSize, MapSize, Offset, RequiresNullTerminator,
- PageSize, IsVolatileSize)) {
+ PageSize, IsVolatile)) {
std::error_code EC;
std::unique_ptr<MemoryBuffer> Result(
new (NamedBufferAlloc(Filename))
@@ -415,17 +414,16 @@ getOpenFileImpl(int FD, const Twine &Filename, uint64_t FileSize,
ErrorOr<std::unique_ptr<MemoryBuffer>>
MemoryBuffer::getOpenFile(int FD, const Twine &Filename, uint64_t FileSize,
- bool RequiresNullTerminator, bool IsVolatileSize) {
+ bool RequiresNullTerminator, bool IsVolatile) {
return getOpenFileImpl(FD, Filename, FileSize, FileSize, 0,
- RequiresNullTerminator, IsVolatileSize);
+ RequiresNullTerminator, IsVolatile);
}
ErrorOr<std::unique_ptr<MemoryBuffer>>
MemoryBuffer::getOpenFileSlice(int FD, const Twine &Filename, uint64_t MapSize,
- int64_t Offset) {
+ int64_t Offset, bool IsVolatile) {
assert(MapSize != uint64_t(-1));
- return getOpenFileImpl(FD, Filename, -1, MapSize, Offset, false,
- /*IsVolatileSize*/ false);
+ return getOpenFileImpl(FD, Filename, -1, MapSize, Offset, false, IsVolatile);
}
ErrorOr<std::unique_ptr<MemoryBuffer>> MemoryBuffer::getSTDIN() {
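
The rename from IsVolatileSize to IsVolatile reflects what the flag actually guards (whether mmap is safe at all), and the slice APIs now take it too. A sketch:

    #include "llvm/ADT/Twine.h"
    #include "llvm/Support/MemoryBuffer.h"
    using namespace llvm;

    // Read 4 KiB at a given offset from a file another process may be
    // rewriting; IsVolatile=true forces a read() copy instead of mmap.
    ErrorOr<std::unique_ptr<MemoryBuffer>> readLiveSlice(const Twine &Path,
                                                         uint64_t Offset) {
      return MemoryBuffer::getFileSlice(Path, /*MapSize=*/4096, Offset,
                                        /*IsVolatile=*/true);
    }
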
diff --git a/lib/Support/Path.cpp b/lib/Support/Path.cpp
index 4bb035eeccca..9fd6652ce4b8 100644
--- a/lib/Support/Path.cpp
+++ b/lib/Support/Path.cpp
@@ -11,13 +11,14 @@
//
//===----------------------------------------------------------------------===//
+#include "llvm/Support/Path.h"
+#include "llvm/ADT/ArrayRef.h"
#include "llvm/Support/COFF.h"
-#include "llvm/Support/MachO.h"
#include "llvm/Support/Endian.h"
#include "llvm/Support/Errc.h"
#include "llvm/Support/ErrorHandling.h"
#include "llvm/Support/FileSystem.h"
-#include "llvm/Support/Path.h"
+#include "llvm/Support/MachO.h"
#include "llvm/Support/Process.h"
#include <cctype>
#include <cstring>
@@ -34,16 +35,29 @@ using namespace llvm::support::endian;
namespace {
using llvm::StringRef;
using llvm::sys::path::is_separator;
+ using llvm::sys::path::Style;
+ inline Style real_style(Style style) {
#ifdef LLVM_ON_WIN32
- const char *separators = "\\/";
- const char preferred_separator = '\\';
+ return (style == Style::posix) ? Style::posix : Style::windows;
#else
- const char separators = '/';
- const char preferred_separator = '/';
+ return (style == Style::windows) ? Style::windows : Style::posix;
#endif
+ }
- StringRef find_first_component(StringRef path) {
+ inline const char *separators(Style style) {
+ if (real_style(style) == Style::windows)
+ return "\\/";
+ return "/";
+ }
+
+ inline char preferred_separator(Style style) {
+ if (real_style(style) == Style::windows)
+ return '\\';
+ return '/';
+ }
+
+ StringRef find_first_component(StringRef path, Style style) {
// Look for this first component in the following order.
// * empty (in this case we return an empty string)
// * either C: or {//,\\}net.
@@ -53,96 +67,85 @@ namespace {
if (path.empty())
return path;
-#ifdef LLVM_ON_WIN32
- // C:
- if (path.size() >= 2 && std::isalpha(static_cast<unsigned char>(path[0])) &&
- path[1] == ':')
- return path.substr(0, 2);
-#endif
+ if (real_style(style) == Style::windows) {
+ // C:
+ if (path.size() >= 2 &&
+ std::isalpha(static_cast<unsigned char>(path[0])) && path[1] == ':')
+ return path.substr(0, 2);
+ }
// //net
- if ((path.size() > 2) &&
- is_separator(path[0]) &&
- path[0] == path[1] &&
- !is_separator(path[2])) {
+ if ((path.size() > 2) && is_separator(path[0], style) &&
+ path[0] == path[1] && !is_separator(path[2], style)) {
// Find the next directory separator.
- size_t end = path.find_first_of(separators, 2);
+ size_t end = path.find_first_of(separators(style), 2);
return path.substr(0, end);
}
// {/,\}
- if (is_separator(path[0]))
+ if (is_separator(path[0], style))
return path.substr(0, 1);
// * {file,directory}name
- size_t end = path.find_first_of(separators);
+ size_t end = path.find_first_of(separators(style));
return path.substr(0, end);
}
- size_t filename_pos(StringRef str) {
- if (str.size() == 2 &&
- is_separator(str[0]) &&
- str[0] == str[1])
+ size_t filename_pos(StringRef str, Style style) {
+ if (str.size() == 2 && is_separator(str[0], style) && str[0] == str[1])
return 0;
- if (str.size() > 0 && is_separator(str[str.size() - 1]))
+ if (str.size() > 0 && is_separator(str[str.size() - 1], style))
return str.size() - 1;
- size_t pos = str.find_last_of(separators, str.size() - 1);
+ size_t pos = str.find_last_of(separators(style), str.size() - 1);
-#ifdef LLVM_ON_WIN32
- if (pos == StringRef::npos)
- pos = str.find_last_of(':', str.size() - 2);
-#endif
+ if (real_style(style) == Style::windows) {
+ if (pos == StringRef::npos)
+ pos = str.find_last_of(':', str.size() - 2);
+ }
- if (pos == StringRef::npos ||
- (pos == 1 && is_separator(str[0])))
+ if (pos == StringRef::npos || (pos == 1 && is_separator(str[0], style)))
return 0;
return pos + 1;
}
- size_t root_dir_start(StringRef str) {
+ size_t root_dir_start(StringRef str, Style style) {
// case "c:/"
-#ifdef LLVM_ON_WIN32
- if (str.size() > 2 &&
- str[1] == ':' &&
- is_separator(str[2]))
- return 2;
-#endif
+ if (real_style(style) == Style::windows) {
+ if (str.size() > 2 && str[1] == ':' && is_separator(str[2], style))
+ return 2;
+ }
// case "//"
- if (str.size() == 2 &&
- is_separator(str[0]) &&
- str[0] == str[1])
+ if (str.size() == 2 && is_separator(str[0], style) && str[0] == str[1])
return StringRef::npos;
// case "//net"
- if (str.size() > 3 &&
- is_separator(str[0]) &&
- str[0] == str[1] &&
- !is_separator(str[2])) {
- return str.find_first_of(separators, 2);
+ if (str.size() > 3 && is_separator(str[0], style) && str[0] == str[1] &&
+ !is_separator(str[2], style)) {
+ return str.find_first_of(separators(style), 2);
}
// case "/"
- if (str.size() > 0 && is_separator(str[0]))
+ if (str.size() > 0 && is_separator(str[0], style))
return 0;
return StringRef::npos;
}
- size_t parent_path_end(StringRef path) {
- size_t end_pos = filename_pos(path);
+ size_t parent_path_end(StringRef path, Style style) {
+ size_t end_pos = filename_pos(path, style);
- bool filename_was_sep = path.size() > 0 && is_separator(path[end_pos]);
+ bool filename_was_sep =
+ path.size() > 0 && is_separator(path[end_pos], style);
// Skip separators except for root dir.
- size_t root_dir_pos = root_dir_start(path.substr(0, end_pos));
+ size_t root_dir_pos = root_dir_start(path.substr(0, end_pos), style);
- while(end_pos > 0 &&
- (end_pos - 1) != root_dir_pos &&
- is_separator(path[end_pos - 1]))
+ while (end_pos > 0 && (end_pos - 1) != root_dir_pos &&
+ is_separator(path[end_pos - 1], style))
--end_pos;
if (end_pos == 1 && root_dir_pos == 0 && filename_was_sep)
@@ -230,11 +233,12 @@ namespace llvm {
namespace sys {
namespace path {
-const_iterator begin(StringRef path) {
+const_iterator begin(StringRef path, Style style) {
const_iterator i;
i.Path = path;
- i.Component = find_first_component(path);
+ i.Component = find_first_component(path, style);
i.Position = 0;
+ i.S = style;
return i;
}
@@ -259,27 +263,21 @@ const_iterator &const_iterator::operator++() {
// Both POSIX and Windows treat paths that begin with exactly two separators
// specially.
- bool was_net = Component.size() > 2 &&
- is_separator(Component[0]) &&
- Component[1] == Component[0] &&
- !is_separator(Component[2]);
+ bool was_net = Component.size() > 2 && is_separator(Component[0], S) &&
+ Component[1] == Component[0] && !is_separator(Component[2], S);
// Handle separators.
- if (is_separator(Path[Position])) {
+ if (is_separator(Path[Position], S)) {
// Root dir.
- if (was_net
-#ifdef LLVM_ON_WIN32
+ if (was_net ||
// c:/
- || Component.endswith(":")
-#endif
- ) {
+ (real_style(S) == Style::windows && Component.endswith(":"))) {
Component = Path.substr(Position, 1);
return *this;
}
// Skip extra separators.
- while (Position != Path.size() &&
- is_separator(Path[Position])) {
+ while (Position != Path.size() && is_separator(Path[Position], S)) {
++Position;
}
@@ -292,7 +290,7 @@ const_iterator &const_iterator::operator++() {
}
// Find next component.
- size_t end_pos = Path.find_first_of(separators, Position);
+ size_t end_pos = Path.find_first_of(separators(S), Position);
Component = Path.slice(Position, end_pos);
return *this;
@@ -306,10 +304,11 @@ ptrdiff_t const_iterator::operator-(const const_iterator &RHS) const {
return Position - RHS.Position;
}
-reverse_iterator rbegin(StringRef Path) {
+reverse_iterator rbegin(StringRef Path, Style style) {
reverse_iterator I;
I.Path = Path;
I.Position = Path.size();
+ I.S = style;
return ++I;
}
@@ -324,10 +323,9 @@ reverse_iterator rend(StringRef Path) {
reverse_iterator &reverse_iterator::operator++() {
// If we're at the end and the previous char was a '/', return '.' unless
// we are the root path.
- size_t root_dir_pos = root_dir_start(Path);
- if (Position == Path.size() &&
- Path.size() > root_dir_pos + 1 &&
- is_separator(Path[Position - 1])) {
+ size_t root_dir_pos = root_dir_start(Path, S);
+ if (Position == Path.size() && Path.size() > root_dir_pos + 1 &&
+ is_separator(Path[Position - 1], S)) {
--Position;
Component = ".";
return *this;
@@ -336,13 +334,12 @@ reverse_iterator &reverse_iterator::operator++() {
// Skip separators unless it's the root directory.
size_t end_pos = Position;
- while(end_pos > 0 &&
- (end_pos - 1) != root_dir_pos &&
- is_separator(Path[end_pos - 1]))
+ while (end_pos > 0 && (end_pos - 1) != root_dir_pos &&
+ is_separator(Path[end_pos - 1], S))
--end_pos;
// Find next separator.
- size_t start_pos = filename_pos(Path.substr(0, end_pos));
+ size_t start_pos = filename_pos(Path.substr(0, end_pos), S);
Component = Path.slice(start_pos, end_pos);
Position = start_pos;
return *this;
@@ -357,21 +354,15 @@ ptrdiff_t reverse_iterator::operator-(const reverse_iterator &RHS) const {
return Position - RHS.Position;
}
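
A sketch of the style-aware iterators above in use: with an explicit Style::windows, "C:\a\b" decomposes the same way on any host. The function name is illustrative; begin(StringRef, Style) is the overload added in this change:

#include "llvm/Support/Path.h"
#include "llvm/Support/raw_ostream.h"

void printComponents() {
  llvm::StringRef P = "C:\\a\\b";
  for (auto I = llvm::sys::path::begin(P, llvm::sys::path::Style::windows),
            E = llvm::sys::path::end(P);
       I != E; ++I)
    llvm::outs() << *I << '\n'; // prints "C:", "\", "a", "b"
}
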
-StringRef root_path(StringRef path) {
- const_iterator b = begin(path),
- pos = b,
- e = end(path);
+StringRef root_path(StringRef path, Style style) {
+ const_iterator b = begin(path, style), pos = b, e = end(path);
if (b != e) {
- bool has_net = b->size() > 2 && is_separator((*b)[0]) && (*b)[1] == (*b)[0];
- bool has_drive =
-#ifdef LLVM_ON_WIN32
- b->endswith(":");
-#else
- false;
-#endif
+ bool has_net =
+ b->size() > 2 && is_separator((*b)[0], style) && (*b)[1] == (*b)[0];
+ bool has_drive = (real_style(style) == Style::windows) && b->endswith(":");
if (has_net || has_drive) {
- if ((++pos != e) && is_separator((*pos)[0])) {
+ if ((++pos != e) && is_separator((*pos)[0], style)) {
// {C:/,//net/}, so get the first two components.
return path.substr(0, b->size() + pos->size());
} else {
@@ -381,7 +372,7 @@ StringRef root_path(StringRef path) {
}
// POSIX style root directory.
- if (is_separator((*b)[0])) {
+ if (is_separator((*b)[0], style)) {
return *b;
}
}
@@ -389,17 +380,12 @@ StringRef root_path(StringRef path) {
return StringRef();
}
-StringRef root_name(StringRef path) {
- const_iterator b = begin(path),
- e = end(path);
+StringRef root_name(StringRef path, Style style) {
+ const_iterator b = begin(path, style), e = end(path);
if (b != e) {
- bool has_net = b->size() > 2 && is_separator((*b)[0]) && (*b)[1] == (*b)[0];
- bool has_drive =
-#ifdef LLVM_ON_WIN32
- b->endswith(":");
-#else
- false;
-#endif
+ bool has_net =
+ b->size() > 2 && is_separator((*b)[0], style) && (*b)[1] == (*b)[0];
+ bool has_drive = (real_style(style) == Style::windows) && b->endswith(":");
if (has_net || has_drive) {
// just {C:,//net}, return the first component.
@@ -411,27 +397,21 @@ StringRef root_name(StringRef path) {
return StringRef();
}
-StringRef root_directory(StringRef path) {
- const_iterator b = begin(path),
- pos = b,
- e = end(path);
+StringRef root_directory(StringRef path, Style style) {
+ const_iterator b = begin(path, style), pos = b, e = end(path);
if (b != e) {
- bool has_net = b->size() > 2 && is_separator((*b)[0]) && (*b)[1] == (*b)[0];
- bool has_drive =
-#ifdef LLVM_ON_WIN32
- b->endswith(":");
-#else
- false;
-#endif
+ bool has_net =
+ b->size() > 2 && is_separator((*b)[0], style) && (*b)[1] == (*b)[0];
+ bool has_drive = (real_style(style) == Style::windows) && b->endswith(":");
if ((has_net || has_drive) &&
// {C:,//net}, skip to the next component.
- (++pos != e) && is_separator((*pos)[0])) {
+ (++pos != e) && is_separator((*pos)[0], style)) {
return *pos;
}
// POSIX style root directory.
- if (!has_net && is_separator((*b)[0])) {
+ if (!has_net && is_separator((*b)[0], style)) {
return *b;
}
}
@@ -440,15 +420,13 @@ StringRef root_directory(StringRef path) {
return StringRef();
}
-StringRef relative_path(StringRef path) {
- StringRef root = root_path(path);
+StringRef relative_path(StringRef path, Style style) {
+ StringRef root = root_path(path, style);
return path.substr(root.size());
}
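
Putting the decomposition functions together, a minimal sketch with illustrative values, assuming the Style-taking overloads above:

#include "llvm/Support/Path.h"

void demoDecompose() {
  using namespace llvm::sys::path;
  llvm::StringRef P = "c:\\src\\file.cpp";
  llvm::StringRef RN = root_name(P, Style::windows);      // "c:"
  llvm::StringRef RD = root_directory(P, Style::windows); // "\"
  llvm::StringRef RP = root_path(P, Style::windows);      // "c:\"
  llvm::StringRef RL = relative_path(P, Style::windows);  // "src\file.cpp"
  (void)RN; (void)RD; (void)RP; (void)RL;
}
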
-void append(SmallVectorImpl<char> &path, const Twine &a,
- const Twine &b,
- const Twine &c,
- const Twine &d) {
+void append(SmallVectorImpl<char> &path, Style style, const Twine &a,
+ const Twine &b, const Twine &c, const Twine &d) {
SmallString<32> a_storage;
SmallString<32> b_storage;
SmallString<32> c_storage;
@@ -461,13 +439,15 @@ void append(SmallVectorImpl<char> &path, const Twine &a,
if (!d.isTriviallyEmpty()) components.push_back(d.toStringRef(d_storage));
for (auto &component : components) {
- bool path_has_sep = !path.empty() && is_separator(path[path.size() - 1]);
- bool component_has_sep = !component.empty() && is_separator(component[0]);
- bool is_root_name = has_root_name(component);
+ bool path_has_sep =
+ !path.empty() && is_separator(path[path.size() - 1], style);
+ bool component_has_sep =
+ !component.empty() && is_separator(component[0], style);
+ bool is_root_name = has_root_name(component, style);
if (path_has_sep) {
// Strip separators from beginning of component.
- size_t loc = component.find_first_not_of(separators);
+ size_t loc = component.find_first_not_of(separators(style));
StringRef c = component.substr(loc);
// Append it.
@@ -477,41 +457,47 @@ void append(SmallVectorImpl<char> &path, const Twine &a,
if (!component_has_sep && !(path.empty() || is_root_name)) {
// Add a separator.
- path.push_back(preferred_separator);
+ path.push_back(preferred_separator(style));
}
path.append(component.begin(), component.end());
}
}
-void append(SmallVectorImpl<char> &path,
- const_iterator begin, const_iterator end) {
+void append(SmallVectorImpl<char> &path, const Twine &a, const Twine &b,
+ const Twine &c, const Twine &d) {
+ append(path, Style::native, a, b, c, d);
+}
+
+void append(SmallVectorImpl<char> &path, const_iterator begin,
+ const_iterator end, Style style) {
for (; begin != end; ++begin)
- path::append(path, *begin);
+ path::append(path, style, *begin);
}
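
The new Style-taking append overload lets callers build a foreign-style path on any host. A short sketch, assuming the trailing Twine parameters default to empty as in the existing overload:

#include "llvm/ADT/SmallString.h"
#include "llvm/Support/Path.h"

void demoAppend() {
  llvm::SmallString<64> P("c:");
  llvm::sys::path::append(P, llvm::sys::path::Style::windows, "tools", "clang");
  // P is now "c:\tools\clang" regardless of the host platform.
}
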
-StringRef parent_path(StringRef path) {
- size_t end_pos = parent_path_end(path);
+StringRef parent_path(StringRef path, Style style) {
+ size_t end_pos = parent_path_end(path, style);
if (end_pos == StringRef::npos)
return StringRef();
else
return path.substr(0, end_pos);
}
-void remove_filename(SmallVectorImpl<char> &path) {
- size_t end_pos = parent_path_end(StringRef(path.begin(), path.size()));
+void remove_filename(SmallVectorImpl<char> &path, Style style) {
+ size_t end_pos = parent_path_end(StringRef(path.begin(), path.size()), style);
if (end_pos != StringRef::npos)
path.set_size(end_pos);
}
-void replace_extension(SmallVectorImpl<char> &path, const Twine &extension) {
+void replace_extension(SmallVectorImpl<char> &path, const Twine &extension,
+ Style style) {
StringRef p(path.begin(), path.size());
SmallString<32> ext_storage;
StringRef ext = extension.toStringRef(ext_storage);
// Erase existing extension.
size_t pos = p.find_last_of('.');
- if (pos != StringRef::npos && pos >= filename_pos(p))
+ if (pos != StringRef::npos && pos >= filename_pos(p, style))
path.set_size(pos);
// Append '.' if needed.
@@ -523,8 +509,8 @@ void replace_extension(SmallVectorImpl<char> &path, const Twine &extension) {
}
void replace_path_prefix(SmallVectorImpl<char> &Path,
- const StringRef &OldPrefix,
- const StringRef &NewPrefix) {
+ const StringRef &OldPrefix, const StringRef &NewPrefix,
+ Style style) {
if (OldPrefix.empty() && NewPrefix.empty())
return;
@@ -540,53 +526,58 @@ void replace_path_prefix(SmallVectorImpl<char> &Path,
StringRef RelPath = OrigPath.substr(OldPrefix.size());
SmallString<256> NewPath;
- path::append(NewPath, NewPrefix);
- path::append(NewPath, RelPath);
+ path::append(NewPath, style, NewPrefix);
+ path::append(NewPath, style, RelPath);
Path.swap(NewPath);
}
-void native(const Twine &path, SmallVectorImpl<char> &result) {
+void native(const Twine &path, SmallVectorImpl<char> &result, Style style) {
assert((!path.isSingleStringRef() ||
path.getSingleStringRef().data() != result.data()) &&
"path and result are not allowed to overlap!");
// Clear result.
result.clear();
path.toVector(result);
- native(result);
+ native(result, style);
}
-void native(SmallVectorImpl<char> &Path) {
-#ifdef LLVM_ON_WIN32
- std::replace(Path.begin(), Path.end(), '/', '\\');
-#else
- for (auto PI = Path.begin(), PE = Path.end(); PI < PE; ++PI) {
- if (*PI == '\\') {
- auto PN = PI + 1;
- if (PN < PE && *PN == '\\')
- ++PI; // increment once, the for loop will move over the escaped slash
- else
- *PI = '/';
+void native(SmallVectorImpl<char> &Path, Style style) {
+ if (Path.empty())
+ return;
+ if (real_style(style) == Style::windows) {
+ std::replace(Path.begin(), Path.end(), '/', '\\');
+ if (Path[0] == '~' && (Path.size() == 1 || is_separator(Path[1], style))) {
+ SmallString<128> PathHome;
+ home_directory(PathHome);
+ PathHome.append(Path.begin() + 1, Path.end());
+ Path = PathHome;
+ }
+ } else {
+ for (auto PI = Path.begin(), PE = Path.end(); PI < PE; ++PI) {
+ if (*PI == '\\') {
+ auto PN = PI + 1;
+ if (PN < PE && *PN == '\\')
+ ++PI; // increment once; the for loop will move over the escaped slash
+ else
+ *PI = '/';
+ }
}
}
-#endif
}
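
A sketch of the rewritten native() with an explicit style; note that the '~' expansion branch applies only under the windows style:

#include "llvm/ADT/SmallString.h"
#include "llvm/Support/Path.h"

void demoNative() {
  llvm::SmallString<32> P("a/b\\c");
  llvm::sys::path::native(P, llvm::sys::path::Style::windows);
  // P is now "a\b\c"; under Style::posix the lone '\' would become '/'.
}
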
-std::string convert_to_slash(StringRef path) {
-#ifdef LLVM_ON_WIN32
+std::string convert_to_slash(StringRef path, Style style) {
+ if (real_style(style) != Style::windows)
+ return path;
+
std::string s = path.str();
std::replace(s.begin(), s.end(), '\\', '/');
return s;
-#else
- return path;
-#endif
}
-StringRef filename(StringRef path) {
- return *rbegin(path);
-}
+StringRef filename(StringRef path, Style style) { return *rbegin(path, style); }
-StringRef stem(StringRef path) {
- StringRef fname = filename(path);
+StringRef stem(StringRef path, Style style) {
+ StringRef fname = filename(path, style);
size_t pos = fname.find_last_of('.');
if (pos == StringRef::npos)
return fname;
@@ -598,8 +589,8 @@ StringRef stem(StringRef path) {
return fname.substr(0, pos);
}
-StringRef extension(StringRef path) {
- StringRef fname = filename(path);
+StringRef extension(StringRef path, Style style) {
+ StringRef fname = filename(path, style);
size_t pos = fname.find_last_of('.');
if (pos == StringRef::npos)
return StringRef();
@@ -611,110 +602,109 @@ StringRef extension(StringRef path) {
return fname.substr(pos);
}
-bool is_separator(char value) {
- switch(value) {
-#ifdef LLVM_ON_WIN32
- case '\\': // fall through
-#endif
- case '/': return true;
- default: return false;
- }
+bool is_separator(char value, Style style) {
+ if (value == '/')
+ return true;
+ if (real_style(style) == Style::windows)
+ return value == '\\';
+ return false;
}
-static const char preferred_separator_string[] = { preferred_separator, '\0' };
-
-StringRef get_separator() {
- return preferred_separator_string;
+StringRef get_separator(Style style) {
+ if (real_style(style) == Style::windows)
+ return "\\";
+ return "/";
}
-bool has_root_name(const Twine &path) {
+bool has_root_name(const Twine &path, Style style) {
SmallString<128> path_storage;
StringRef p = path.toStringRef(path_storage);
- return !root_name(p).empty();
+ return !root_name(p, style).empty();
}
-bool has_root_directory(const Twine &path) {
+bool has_root_directory(const Twine &path, Style style) {
SmallString<128> path_storage;
StringRef p = path.toStringRef(path_storage);
- return !root_directory(p).empty();
+ return !root_directory(p, style).empty();
}
-bool has_root_path(const Twine &path) {
+bool has_root_path(const Twine &path, Style style) {
SmallString<128> path_storage;
StringRef p = path.toStringRef(path_storage);
- return !root_path(p).empty();
+ return !root_path(p, style).empty();
}
-bool has_relative_path(const Twine &path) {
+bool has_relative_path(const Twine &path, Style style) {
SmallString<128> path_storage;
StringRef p = path.toStringRef(path_storage);
- return !relative_path(p).empty();
+ return !relative_path(p, style).empty();
}
-bool has_filename(const Twine &path) {
+bool has_filename(const Twine &path, Style style) {
SmallString<128> path_storage;
StringRef p = path.toStringRef(path_storage);
- return !filename(p).empty();
+ return !filename(p, style).empty();
}
-bool has_parent_path(const Twine &path) {
+bool has_parent_path(const Twine &path, Style style) {
SmallString<128> path_storage;
StringRef p = path.toStringRef(path_storage);
- return !parent_path(p).empty();
+ return !parent_path(p, style).empty();
}
-bool has_stem(const Twine &path) {
+bool has_stem(const Twine &path, Style style) {
SmallString<128> path_storage;
StringRef p = path.toStringRef(path_storage);
- return !stem(p).empty();
+ return !stem(p, style).empty();
}
-bool has_extension(const Twine &path) {
+bool has_extension(const Twine &path, Style style) {
SmallString<128> path_storage;
StringRef p = path.toStringRef(path_storage);
- return !extension(p).empty();
+ return !extension(p, style).empty();
}
-bool is_absolute(const Twine &path) {
+bool is_absolute(const Twine &path, Style style) {
SmallString<128> path_storage;
StringRef p = path.toStringRef(path_storage);
- bool rootDir = has_root_directory(p),
-#ifdef LLVM_ON_WIN32
- rootName = has_root_name(p);
-#else
- rootName = true;
-#endif
+ bool rootDir = has_root_directory(p, style);
+ bool rootName =
+ (real_style(style) != Style::windows) || has_root_name(p, style);
return rootDir && rootName;
}
-bool is_relative(const Twine &path) { return !is_absolute(path); }
+bool is_relative(const Twine &path, Style style) {
+ return !is_absolute(path, style);
+}
-StringRef remove_leading_dotslash(StringRef Path) {
+StringRef remove_leading_dotslash(StringRef Path, Style style) {
// Remove leading "./" (or ".//" or "././" etc.)
- while (Path.size() > 2 && Path[0] == '.' && is_separator(Path[1])) {
+ while (Path.size() > 2 && Path[0] == '.' && is_separator(Path[1], style)) {
Path = Path.substr(2);
- while (Path.size() > 0 && is_separator(Path[0]))
+ while (Path.size() > 0 && is_separator(Path[0], style))
Path = Path.substr(1);
}
return Path;
}
-static SmallString<256> remove_dots(StringRef path, bool remove_dot_dot) {
+static SmallString<256> remove_dots(StringRef path, bool remove_dot_dot,
+ Style style) {
SmallVector<StringRef, 16> components;
// Skip the root path, then look for traversal in the components.
- StringRef rel = path::relative_path(path);
- for (StringRef C : llvm::make_range(path::begin(rel), path::end(rel))) {
+ StringRef rel = path::relative_path(path, style);
+ for (StringRef C :
+ llvm::make_range(path::begin(rel, style), path::end(rel))) {
if (C == ".")
continue;
// Leading ".." will remain in the path unless it's at the root.
@@ -723,22 +713,23 @@ static SmallString<256> remove_dots(StringRef path, bool remove_dot_dot) {
components.pop_back();
continue;
}
- if (path::is_absolute(path))
+ if (path::is_absolute(path, style))
continue;
}
components.push_back(C);
}
- SmallString<256> buffer = path::root_path(path);
+ SmallString<256> buffer = path::root_path(path, style);
for (StringRef C : components)
- path::append(buffer, C);
+ path::append(buffer, style, C);
return buffer;
}
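
How the remove_dots pair (the static helper above and the public wrapper below) behaves, in a minimal sketch with illustrative values:

#include "llvm/ADT/SmallString.h"
#include "llvm/Support/Path.h"

void demoRemoveDots() {
  llvm::SmallString<64> P("/a/./b/../c");
  llvm::sys::path::remove_dots(P, /*remove_dot_dot=*/true,
                               llvm::sys::path::Style::posix);
  // P is now "/a/c": "." is dropped and ".." pops the "b" component.
}
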
-bool remove_dots(SmallVectorImpl<char> &path, bool remove_dot_dot) {
+bool remove_dots(SmallVectorImpl<char> &path, bool remove_dot_dot,
+ Style style) {
StringRef p(path.data(), path.size());
- SmallString<256> result = remove_dots(p, remove_dot_dot);
+ SmallString<256> result = remove_dots(p, remove_dot_dot, style);
if (result == path)
return false;
@@ -776,7 +767,7 @@ createTemporaryFile(const Twine &Model, int &ResultFD,
llvm::SmallVectorImpl<char> &ResultPath, FSEntity Type) {
SmallString<128> Storage;
StringRef P = Model.toNullTerminatedStringRef(Storage);
- assert(P.find_first_of(separators) == StringRef::npos &&
+ assert(P.find_first_of(separators(Style::native)) == StringRef::npos &&
"Model must be a simple filename.");
// Use P.begin() so that createUniqueEntity doesn't need to recreate Storage.
return createUniqueEntity(P.begin(), ResultFD, ResultPath,
@@ -818,12 +809,9 @@ static std::error_code make_absolute(const Twine &current_directory,
bool use_current_directory) {
StringRef p(path.data(), path.size());
- bool rootDirectory = path::has_root_directory(p),
-#ifdef LLVM_ON_WIN32
- rootName = path::has_root_name(p);
-#else
- rootName = true;
-#endif
+ bool rootDirectory = path::has_root_directory(p);
+ bool rootName =
+ (real_style(Style::native) != Style::windows) || path::has_root_name(p);
// Already absolute.
if (rootName && rootDirectory)
@@ -937,6 +925,36 @@ std::error_code copy_file(const Twine &From, const Twine &To) {
return std::error_code();
}
+ErrorOr<MD5::MD5Result> md5_contents(int FD) {
+ MD5 Hash;
+
+ constexpr size_t BufSize = 4096;
+ std::vector<uint8_t> Buf(BufSize);
+ int BytesRead = 0;
+ for (;;) {
+ BytesRead = read(FD, Buf.data(), BufSize);
+ if (BytesRead <= 0)
+ break;
+ Hash.update(makeArrayRef(Buf.data(), BytesRead));
+ }
+
+ if (BytesRead < 0)
+ return std::error_code(errno, std::generic_category());
+ MD5::MD5Result Result;
+ Hash.final(Result);
+ return Result;
+}
+
+ErrorOr<MD5::MD5Result> md5_contents(const Twine &Path) {
+ int FD;
+ if (auto EC = openFileForRead(Path, FD))
+ return EC;
+
+ auto Result = md5_contents(FD);
+ close(FD);
+ return Result;
+}
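+
A usage sketch for the new md5_contents API; MD5::stringifyResult is assumed to be the existing helper in llvm/Support/MD5.h for rendering the digest:

#include "llvm/ADT/SmallString.h"
#include "llvm/Support/FileSystem.h"
#include "llvm/Support/MD5.h"
#include "llvm/Support/raw_ostream.h"

void printMD5(const llvm::Twine &Path) {
  // ErrorOr converts to false when it holds an error.
  auto Res = llvm::sys::fs::md5_contents(Path);
  if (!Res)
    return;
  llvm::SmallString<32> Hex;
  llvm::MD5::stringifyResult(*Res, Hex);
  llvm::outs() << Hex.str() << '\n';
}
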
+
bool exists(file_status status) {
return status_known(status) && status.type() != file_type::file_not_found;
}
@@ -945,6 +963,13 @@ bool status_known(file_status s) {
return s.type() != file_type::status_error;
}
+file_type get_file_type(const Twine &Path, bool Follow) {
+ file_status st;
+ if (status(Path, st, Follow))
+ return file_type::status_error;
+ return st.type();
+}
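+
A sketch combining get_file_type with the new Follow parameter: with Follow=false the underlying status call uses lstat, so a symlink reports itself rather than its target (the helper name is illustrative):

#include "llvm/Support/FileSystem.h"

bool isSymlink(const llvm::Twine &P) {
  using namespace llvm::sys::fs;
  return get_file_type(P, /*Follow=*/false) == file_type::symlink_file;
}
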
+
bool is_directory(file_status status) {
return status.type() == file_type::directory_file;
}
@@ -969,6 +994,18 @@ std::error_code is_regular_file(const Twine &path, bool &result) {
return std::error_code();
}
+bool is_symlink_file(file_status status) {
+ return status.type() == file_type::symlink_file;
+}
+
+std::error_code is_symlink_file(const Twine &path, bool &result) {
+ file_status st;
+ if (std::error_code ec = status(path, st, false))
+ return ec;
+ result = is_symlink_file(st);
+ return std::error_code();
+}
+
bool is_other(file_status status) {
return exists(status) &&
!is_regular_file(status) &&
@@ -1162,7 +1199,15 @@ std::error_code identify_magic(const Twine &Path, file_magic &Result) {
}
std::error_code directory_entry::status(file_status &result) const {
- return fs::status(Path, result);
+ return fs::status(Path, result, FollowSymlinks);
+}
+
+ErrorOr<perms> getPermissions(const Twine &Path) {
+ file_status Status;
+ if (std::error_code EC = status(Path, Status))
+ return EC;
+
+ return Status.permissions();
}
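
A sketch pairing the new getPermissions accessor with setPermissions (whose Unix implementation appears later in this diff); the helper name is illustrative:

#include "llvm/Support/FileSystem.h"

std::error_code makeReadOnly(const llvm::Twine &P) {
  using namespace llvm::sys::fs;
  llvm::ErrorOr<perms> Cur = getPermissions(P);
  if (!Cur)
    return Cur.getError();
  // Clear every write bit while preserving the rest.
  return setPermissions(P, *Cur & ~(owner_write | group_write | others_write));
}
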
} // end namespace fs
diff --git a/lib/Support/RWMutex.cpp b/lib/Support/RWMutex.cpp
index 3b6309cef21a..6c9781c4e2d6 100644
--- a/lib/Support/RWMutex.cpp
+++ b/lib/Support/RWMutex.cpp
@@ -13,7 +13,6 @@
#include "llvm/Config/config.h"
#include "llvm/Support/RWMutex.h"
-#include <cstring>
//===----------------------------------------------------------------------===//
//=== WARNING: Implementation here must contain only TRULY operating system
@@ -22,29 +21,31 @@
#if !defined(LLVM_ENABLE_THREADS) || LLVM_ENABLE_THREADS == 0
// Define all methods as no-ops if threading is explicitly disabled
-namespace llvm {
+
+using namespace llvm;
using namespace sys;
-RWMutexImpl::RWMutexImpl() { }
-RWMutexImpl::~RWMutexImpl() { }
+
+RWMutexImpl::RWMutexImpl() = default;
+RWMutexImpl::~RWMutexImpl() = default;
+
bool RWMutexImpl::reader_acquire() { return true; }
bool RWMutexImpl::reader_release() { return true; }
bool RWMutexImpl::writer_acquire() { return true; }
bool RWMutexImpl::writer_release() { return true; }
-}
+
#else
#if defined(HAVE_PTHREAD_H) && defined(HAVE_PTHREAD_RWLOCK_INIT)
#include <cassert>
+#include <cstdlib>
#include <pthread.h>
-#include <stdlib.h>
-namespace llvm {
+using namespace llvm;
using namespace sys;
// Construct a RWMutex using pthread calls
RWMutexImpl::RWMutexImpl()
- : data_(nullptr)
{
// Declare the pthread_rwlock data structures
pthread_rwlock_t* rwlock =
@@ -113,8 +114,6 @@ RWMutexImpl::writer_release()
return errorcode == 0;
}
-}
-
#elif defined(LLVM_ON_UNIX)
#include "Unix/RWMutex.inc"
#elif defined( LLVM_ON_WIN32)
diff --git a/lib/Support/Signals.cpp b/lib/Support/Signals.cpp
index e5e38f59c040..57f36bf175b3 100644
--- a/lib/Support/Signals.cpp
+++ b/lib/Support/Signals.cpp
@@ -29,7 +29,6 @@
#include <vector>
namespace llvm {
-using namespace sys;
//===----------------------------------------------------------------------===//
//=== WARNING: Implementation here must contain only TRULY operating system
diff --git a/lib/Support/SourceMgr.cpp b/lib/Support/SourceMgr.cpp
index 4cb9b2ff2cda..ca2391c10ff1 100644
--- a/lib/Support/SourceMgr.cpp
+++ b/lib/Support/SourceMgr.cpp
@@ -13,30 +13,43 @@
//
//===----------------------------------------------------------------------===//
-#include "llvm/Support/SourceMgr.h"
+#include "llvm/ADT/ArrayRef.h"
+#include "llvm/ADT/SmallVector.h"
#include "llvm/ADT/STLExtras.h"
+#include "llvm/ADT/StringRef.h"
#include "llvm/ADT/Twine.h"
+#include "llvm/Support/ErrorOr.h"
#include "llvm/Support/Locale.h"
#include "llvm/Support/MemoryBuffer.h"
#include "llvm/Support/Path.h"
#include "llvm/Support/raw_ostream.h"
+#include "llvm/Support/SMLoc.h"
+#include "llvm/Support/SourceMgr.h"
+#include <algorithm>
+#include <cassert>
+#include <cstddef>
+#include <memory>
+#include <string>
+#include <utility>
+
using namespace llvm;
static const size_t TabStop = 8;
namespace {
+
struct LineNoCacheTy {
const char *LastQuery;
unsigned LastQueryBufferID;
unsigned LineNoOfQuery;
};
-}
+
+} // end anonymous namespace
static LineNoCacheTy *getCache(void *Ptr) {
return (LineNoCacheTy*)Ptr;
}
-
SourceMgr::~SourceMgr() {
// Delete the line # cache if allocated.
if (LineNoCacheTy *Cache = getCache(LineNoCache))
@@ -132,12 +145,10 @@ void SourceMgr::PrintIncludeStack(SMLoc IncludeLoc, raw_ostream &OS) const {
<< ":" << FindLineNumber(IncludeLoc, CurBuf) << ":\n";
}
-
SMDiagnostic SourceMgr::GetMessage(SMLoc Loc, SourceMgr::DiagKind Kind,
const Twine &Msg,
ArrayRef<SMRange> Ranges,
ArrayRef<SMFixIt> FixIts) const {
-
// First thing to do: find the current buffer containing the specified
// location to pull out the source line.
SmallVector<std::pair<unsigned, unsigned>, 4> ColRanges;
@@ -223,7 +234,7 @@ void SourceMgr::PrintMessage(raw_ostream &OS, SMLoc Loc,
void SourceMgr::PrintMessage(SMLoc Loc, SourceMgr::DiagKind Kind,
const Twine &Msg, ArrayRef<SMRange> Ranges,
ArrayRef<SMFixIt> FixIts, bool ShowColors) const {
- PrintMessage(llvm::errs(), Loc, Kind, Msg, Ranges, FixIts, ShowColors);
+ PrintMessage(errs(), Loc, Kind, Msg, Ranges, FixIts, ShowColors);
}
//===----------------------------------------------------------------------===//
@@ -233,7 +244,7 @@ void SourceMgr::PrintMessage(SMLoc Loc, SourceMgr::DiagKind Kind,
SMDiagnostic::SMDiagnostic(const SourceMgr &sm, SMLoc L, StringRef FN,
int Line, int Col, SourceMgr::DiagKind Kind,
StringRef Msg, StringRef LineStr,
- ArrayRef<std::pair<unsigned,unsigned> > Ranges,
+ ArrayRef<std::pair<unsigned,unsigned>> Ranges,
ArrayRef<SMFixIt> Hints)
: SM(&sm), Loc(L), Filename(FN), LineNo(Line), ColumnNo(Col), Kind(Kind),
Message(Msg), LineContents(LineStr), Ranges(Ranges.vec()),
@@ -286,7 +297,7 @@ static void buildFixItLine(std::string &CaretLine, std::string &FixItLine,
// FIXME: This assertion is intended to catch unintended use of multibyte
// characters in fixits. If we decide to do this, we'll have to track
// separate byte widths for the source and fixit lines.
- assert((size_t)llvm::sys::locale::columnWidth(I->getText()) ==
+ assert((size_t)sys::locale::columnWidth(I->getText()) ==
I->getText().size());
// This relies on one byte per column in our fixit hints.
diff --git a/lib/Support/StringRef.cpp b/lib/Support/StringRef.cpp
index d81250e48dde..9b7cc1c1d182 100644
--- a/lib/Support/StringRef.cpp
+++ b/lib/Support/StringRef.cpp
@@ -8,6 +8,7 @@
//===----------------------------------------------------------------------===//
#include "llvm/ADT/StringRef.h"
+#include "llvm/ADT/APFloat.h"
#include "llvm/ADT/APInt.h"
#include "llvm/ADT/Hashing.h"
#include "llvm/ADT/edit_distance.h"
@@ -595,6 +596,18 @@ bool StringRef::getAsInteger(unsigned Radix, APInt &Result) const {
return false;
}
+bool StringRef::getAsDouble(double &Result, bool AllowInexact) const {
+ APFloat F(0.0);
+ APFloat::opStatus Status =
+ F.convertFromString(*this, APFloat::rmNearestTiesToEven);
+ if (Status != APFloat::opOK) {
+ if (!AllowInexact || Status != APFloat::opInexact)
+ return true;
+ }
+
+ Result = F.convertToDouble();
+ return false;
+}
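+
The new StringRef::getAsDouble follows the getAsInteger convention of returning true on failure; a minimal sketch:

#include "llvm/ADT/StringRef.h"

bool parseHalf(double &Out) {
  llvm::StringRef S = "0.5";
  return S.getAsDouble(Out); // returns false (success); Out == 0.5
}
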
// Implementation of StringRef hashing.
hash_code llvm::hash_value(StringRef S) {
diff --git a/lib/Support/TargetParser.cpp b/lib/Support/TargetParser.cpp
index 42fab671a251..639d2ece263a 100644
--- a/lib/Support/TargetParser.cpp
+++ b/lib/Support/TargetParser.cpp
@@ -448,6 +448,8 @@ bool llvm::AArch64::getExtensionFeatures(unsigned Extensions,
Features.push_back("+spe");
if (Extensions & AArch64::AEK_RAS)
Features.push_back("+ras");
+ if (Extensions & AArch64::AEK_LSE)
+ Features.push_back("+lse");
return true;
}
@@ -725,6 +727,7 @@ unsigned llvm::ARM::parseArchProfile(StringRef Arch) {
case ARM::AK_ARMV8R:
return ARM::PK_R;
case ARM::AK_ARMV7A:
+ case ARM::AK_ARMV7VE:
case ARM::AK_ARMV7K:
case ARM::AK_ARMV8A:
case ARM::AK_ARMV8_1A:
@@ -761,6 +764,7 @@ unsigned llvm::ARM::parseArchVersion(StringRef Arch) {
case ARM::AK_ARMV6M:
return 6;
case ARM::AK_ARMV7A:
+ case ARM::AK_ARMV7VE:
case ARM::AK_ARMV7R:
case ARM::AK_ARMV7M:
case ARM::AK_ARMV7S:
diff --git a/lib/Support/Threading.cpp b/lib/Support/Threading.cpp
index 760f9e2c388b..6a10b988d464 100644
--- a/lib/Support/Threading.cpp
+++ b/lib/Support/Threading.cpp
@@ -14,14 +14,20 @@
#include "llvm/Support/Threading.h"
#include "llvm/Config/config.h"
-#include "llvm/Support/Atomic.h"
#include "llvm/Support/Host.h"
-#include "llvm/Support/Mutex.h"
-#include "llvm/Support/thread.h"
+
#include <cassert>
+#include <errno.h>
+#include <stdlib.h>
+#include <string.h>
using namespace llvm;
+//===----------------------------------------------------------------------===//
+//=== WARNING: Implementation here must contain only TRULY operating system
+//=== independent code.
+//===----------------------------------------------------------------------===//
+
bool llvm::llvm_is_multithreaded() {
#if LLVM_ENABLE_THREADS != 0
return true;
@@ -30,100 +36,47 @@ bool llvm::llvm_is_multithreaded() {
#endif
}
-#if LLVM_ENABLE_THREADS != 0 && defined(HAVE_PTHREAD_H)
-#include <pthread.h>
-
-struct ThreadInfo {
- void (*UserFn)(void *);
- void *UserData;
-};
-static void *ExecuteOnThread_Dispatch(void *Arg) {
- ThreadInfo *TI = reinterpret_cast<ThreadInfo*>(Arg);
- TI->UserFn(TI->UserData);
- return nullptr;
-}
-
-void llvm::llvm_execute_on_thread(void (*Fn)(void*), void *UserData,
+#if LLVM_ENABLE_THREADS == 0 || \
+ (!defined(LLVM_ON_WIN32) && !defined(HAVE_PTHREAD_H))
+// Support for non-Win32, non-pthread implementation.
+void llvm::llvm_execute_on_thread(void (*Fn)(void *), void *UserData,
unsigned RequestedStackSize) {
- ThreadInfo Info = { Fn, UserData };
- pthread_attr_t Attr;
- pthread_t Thread;
-
- // Construct the attributes object.
- if (::pthread_attr_init(&Attr) != 0)
- return;
-
- // Set the requested stack size, if given.
- if (RequestedStackSize != 0) {
- if (::pthread_attr_setstacksize(&Attr, RequestedStackSize) != 0)
- goto error;
- }
-
- // Construct and execute the thread.
- if (::pthread_create(&Thread, &Attr, ExecuteOnThread_Dispatch, &Info) != 0)
- goto error;
-
- // Wait for the thread and clean up.
- ::pthread_join(Thread, nullptr);
-
- error:
- ::pthread_attr_destroy(&Attr);
+ (void)RequestedStackSize;
+ Fn(UserData);
}
-#elif LLVM_ENABLE_THREADS!=0 && defined(LLVM_ON_WIN32)
-#include "Windows/WindowsSupport.h"
-#include <process.h>
-// Windows will at times define MemoryFence.
-#ifdef MemoryFence
-#undef MemoryFence
-#endif
+unsigned llvm::heavyweight_hardware_concurrency() { return 1; }
-struct ThreadInfo {
- void (*func)(void*);
- void *param;
-};
+uint64_t llvm::get_threadid() { return 0; }
-static unsigned __stdcall ThreadCallback(void *param) {
- struct ThreadInfo *info = reinterpret_cast<struct ThreadInfo *>(param);
- info->func(info->param);
+uint32_t llvm::get_max_thread_name_length() { return 0; }
- return 0;
-}
+void llvm::set_thread_name(const Twine &Name) {}
-void llvm::llvm_execute_on_thread(void (*Fn)(void*), void *UserData,
- unsigned RequestedStackSize) {
- struct ThreadInfo param = { Fn, UserData };
-
- HANDLE hThread = (HANDLE)::_beginthreadex(NULL,
- RequestedStackSize, ThreadCallback,
- &param, 0, NULL);
+void llvm::get_thread_name(SmallVectorImpl<char> &Name) { Name.clear(); }
- if (hThread) {
- // We actually don't care whether the wait succeeds or fails, in
- // the same way we don't care whether the pthread_join call succeeds
- // or fails. There's not much we could do if this were to fail. But
- // on success, this call will wait until the thread finishes executing
- // before returning.
- (void)::WaitForSingleObject(hThread, INFINITE);
- ::CloseHandle(hThread);
- }
-}
#else
-// Support for non-Win32, non-pthread implementation.
-void llvm::llvm_execute_on_thread(void (*Fn)(void*), void *UserData,
- unsigned RequestedStackSize) {
- (void) RequestedStackSize;
- Fn(UserData);
-}
-
-#endif
+#include <thread>
unsigned llvm::heavyweight_hardware_concurrency() {
-#if !LLVM_ENABLE_THREADS
- return 1;
-#endif
+ // Since we can't get here unless LLVM_ENABLE_THREADS == 1, it is safe to use
+ // `std::thread` directly instead of `llvm::thread` (and indeed, doing so
+ // allows us to not define `thread` in the llvm namespace, which conflicts
+ // with some platforms such as FreeBSD, whose headers also define a struct
+ // called `thread` in the global namespace that can cause ambiguity due to
+ // ADL).
int NumPhysical = sys::getHostNumPhysicalCores();
if (NumPhysical == -1)
- return thread::hardware_concurrency();
+ return std::thread::hardware_concurrency();
return NumPhysical;
}
+
+// Include the platform-specific parts of this class.
+#ifdef LLVM_ON_UNIX
+#include "Unix/Threading.inc"
+#endif
+#ifdef LLVM_ON_WIN32
+#include "Windows/Threading.inc"
+#endif
+
+#endif
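
With the reorganization above, llvm_execute_on_thread keeps one contract on every configuration: run Fn on a fresh thread where threading is available, inline otherwise, and return only after Fn completes. A small usage sketch (names illustrative):

#include "llvm/Support/Threading.h"

static void work(void *Data) {
  // Data round-trips as a void*, as in the ThreadInfo dispatch pattern.
  int *Payload = static_cast<int *>(Data);
  *Payload += 1;
}

void demoExecute() {
  int Payload = 41;
  llvm::llvm_execute_on_thread(work, &Payload, /*RequestedStackSize=*/0);
  // Payload == 42 here, since the call joins before returning.
}
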
diff --git a/lib/Support/Timer.cpp b/lib/Support/Timer.cpp
index fbd73d0b6b3b..8d68c6ae9682 100644
--- a/lib/Support/Timer.cpp
+++ b/lib/Support/Timer.cpp
@@ -72,22 +72,9 @@ std::unique_ptr<raw_fd_ostream> llvm::CreateInfoOutputFile() {
return llvm::make_unique<raw_fd_ostream>(2, false); // stderr.
}
-
-static TimerGroup *DefaultTimerGroup = nullptr;
static TimerGroup *getDefaultTimerGroup() {
- TimerGroup *tmp = DefaultTimerGroup;
- sys::MemoryFence();
- if (tmp) return tmp;
-
- sys::SmartScopedLock<true> Lock(*TimerLock);
- tmp = DefaultTimerGroup;
- if (!tmp) {
- tmp = new TimerGroup("misc", "Miscellaneous Ungrouped Timers");
- sys::MemoryFence();
- DefaultTimerGroup = tmp;
- }
-
- return tmp;
+ static TimerGroup DefaultTimerGroup("misc", "Miscellaneous Ungrouped Timers");
+ return &DefaultTimerGroup;
}
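
The pattern behind this simplification, in miniature: C++11 guarantees that a function-local static is initialized exactly once, even under concurrent calls, so the hand-rolled double-checked locking and memory fences become unnecessary (the type here is a hypothetical stand-in):

struct Registry {
  int Value = 0;
};

static Registry &getRegistry() {
  // Construction happens on the first call; the runtime provides the locking.
  static Registry R;
  return R;
}
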
//===----------------------------------------------------------------------===//
@@ -309,7 +296,7 @@ void TimerGroup::PrintQueuedTimers(raw_ostream &OS) {
// If this is not a collection of ungrouped timers, print the total time.
// Ungrouped timers don't really make sense to add up. We still print the
// TOTAL line to make the percentages make sense.
- if (this != DefaultTimerGroup)
+ if (this != getDefaultTimerGroup())
OS << format(" Total Execution Time: %5.4f seconds (%5.4f wall clock)\n",
Total.getProcessTime(), Total.getWallTime());
OS << '\n';
diff --git a/lib/Support/Triple.cpp b/lib/Support/Triple.cpp
index 6783b40a125d..64d5977e2ebd 100644
--- a/lib/Support/Triple.cpp
+++ b/lib/Support/Triple.cpp
@@ -510,6 +510,7 @@ static Triple::ObjectFormatType parseFormat(StringRef EnvironmentName) {
.EndsWith("coff", Triple::COFF)
.EndsWith("elf", Triple::ELF)
.EndsWith("macho", Triple::MachO)
+ .EndsWith("wasm", Triple::Wasm)
.Default(Triple::UnknownObjectFormat);
}
@@ -550,6 +551,8 @@ static Triple::SubArchType parseSubArch(StringRef SubArchName) {
case ARM::AK_ARMV7A:
case ARM::AK_ARMV7R:
return Triple::ARMSubArch_v7;
+ case ARM::AK_ARMV7VE:
+ return Triple::ARMSubArch_v7ve;
case ARM::AK_ARMV7K:
return Triple::ARMSubArch_v7k;
case ARM::AK_ARMV7M:
@@ -581,6 +584,7 @@ static StringRef getObjectFormatTypeName(Triple::ObjectFormatType Kind) {
case Triple::COFF: return "coff";
case Triple::ELF: return "elf";
case Triple::MachO: return "macho";
+ case Triple::Wasm: return "wasm";
}
llvm_unreachable("unknown object format type");
}
@@ -1511,6 +1515,7 @@ StringRef Triple::getARMCPUForArch(StringRef MArch) const {
return "strongarm";
}
case llvm::Triple::NaCl:
+ case llvm::Triple::OpenBSD:
return "cortex-a8";
default:
switch (getEnvironment()) {
diff --git a/lib/Support/Twine.cpp b/lib/Support/Twine.cpp
index 465c6e6b8c4c..d17cd4e66439 100644
--- a/lib/Support/Twine.cpp
+++ b/lib/Support/Twine.cpp
@@ -173,10 +173,12 @@ void Twine::printRepr(raw_ostream &OS) const {
OS << ")";
}
+#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
LLVM_DUMP_METHOD void Twine::dump() const {
print(dbgs());
}
-void Twine::dumpRepr() const {
+LLVM_DUMP_METHOD void Twine::dumpRepr() const {
printRepr(dbgs());
}
+#endif
diff --git a/lib/Support/Unix/Path.inc b/lib/Support/Unix/Path.inc
index e0b11aaff007..93f8982196b3 100644
--- a/lib/Support/Unix/Path.inc
+++ b/lib/Support/Unix/Path.inc
@@ -48,6 +48,8 @@
# endif
#endif
+#include <pwd.h>
+
#ifdef __APPLE__
#include <mach-o/dyld.h>
#include <sys/attr.h>
@@ -65,23 +67,41 @@
#endif
#include <sys/types.h>
-#if !defined(__APPLE__) && !defined(__OpenBSD__) && !defined(__ANDROID__)
+#if !defined(__APPLE__) && !defined(__OpenBSD__) && !defined(__FreeBSD__) && \
+ !defined(__linux__)
#include <sys/statvfs.h>
#define STATVFS statvfs
+#define FSTATVFS fstatvfs
#define STATVFS_F_FRSIZE(vfs) vfs.f_frsize
#else
-#ifdef __OpenBSD__
+#if defined(__OpenBSD__) || defined(__FreeBSD__)
#include <sys/param.h>
#include <sys/mount.h>
-#elif defined(__ANDROID__)
+#elif defined(__linux__)
+#if defined(HAVE_LINUX_MAGIC_H)
+#include <linux/magic.h>
+#else
+#if defined(HAVE_LINUX_NFS_FS_H)
+#include <linux/nfs_fs.h>
+#endif
+#if defined(HAVE_LINUX_SMB_H)
+#include <linux/smb.h>
+#endif
+#endif
#include <sys/vfs.h>
#else
#include <sys/mount.h>
#endif
#define STATVFS statfs
+#define FSTATVFS fstatfs
#define STATVFS_F_FRSIZE(vfs) static_cast<uint64_t>(vfs.f_bsize)
#endif
+#if defined(__NetBSD__)
+#define STATVFS_F_FLAG(vfs) (vfs).f_flag
+#else
+#define STATVFS_F_FLAG(vfs) (vfs).f_flags
+#endif
using namespace llvm;
@@ -180,7 +200,7 @@ std::string getMainExecutable(const char *argv0, void *MainAddr) {
if (getprogpath(exe_path, argv0))
return exe_path;
}
-#elif defined(HAVE_DLFCN_H)
+#elif defined(HAVE_DLFCN_H) && defined(HAVE_DLADDR)
// Use dladdr to get executable path if available.
Dl_info DLInfo;
int err = dladdr(MainAddr, &DLInfo);
@@ -210,6 +230,10 @@ UniqueID file_status::getUniqueID() const {
return UniqueID(fs_st_dev, fs_st_ino);
}
+uint32_t file_status::getLinkCount() const {
+ return fs_st_nlinks;
+}
+
ErrorOr<space_info> disk_space(const Twine &Path) {
struct STATVFS Vfs;
if (::STATVFS(Path.str().c_str(), &Vfs))
@@ -257,6 +281,16 @@ std::error_code current_path(SmallVectorImpl<char> &result) {
return std::error_code();
}
+std::error_code set_current_path(const Twine &path) {
+ SmallString<128> path_storage;
+ StringRef p = path.toNullTerminatedStringRef(path_storage);
+
+ if (::chdir(p.begin()) == -1)
+ return std::error_code(errno, std::generic_category());
+
+ return std::error_code();
+}
+
std::error_code create_directory(const Twine &path, bool IgnoreExisting,
perms Perms) {
SmallString<128> path_storage;
@@ -325,6 +359,51 @@ std::error_code remove(const Twine &path, bool IgnoreNonExisting) {
return std::error_code();
}
+static bool is_local_impl(struct STATVFS &Vfs) {
+#if defined(__linux__)
+#ifndef NFS_SUPER_MAGIC
+#define NFS_SUPER_MAGIC 0x6969
+#endif
+#ifndef SMB_SUPER_MAGIC
+#define SMB_SUPER_MAGIC 0x517B
+#endif
+#ifndef CIFS_MAGIC_NUMBER
+#define CIFS_MAGIC_NUMBER 0xFF534D42
+#endif
+ switch ((uint32_t)Vfs.f_type) {
+ case NFS_SUPER_MAGIC:
+ case SMB_SUPER_MAGIC:
+ case CIFS_MAGIC_NUMBER:
+ return false;
+ default:
+ return true;
+ }
+#elif defined(__CYGWIN__)
+ // Cygwin doesn't expose this information; would need to use Win32 API.
+ return false;
+#else
+ return !!(STATVFS_F_FLAG(Vfs) & MNT_LOCAL);
+#endif
+}
+
+std::error_code is_local(const Twine &Path, bool &Result) {
+ struct STATVFS Vfs;
+ if (::STATVFS(Path.str().c_str(), &Vfs))
+ return std::error_code(errno, std::generic_category());
+
+ Result = is_local_impl(Vfs);
+ return std::error_code();
+}
+
+std::error_code is_local(int FD, bool &Result) {
+ struct STATVFS Vfs;
+ if (::FSTATVFS(FD, &Vfs))
+ return std::error_code(errno, std::generic_category());
+
+ Result = is_local_impl(Vfs);
+ return std::error_code();
+}
+
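A usage sketch for the new is_local queries (the helper name is illustrative; errors are treated conservatively):

#include "llvm/Support/FileSystem.h"

bool isOnLocalDisk(const llvm::Twine &P) {
  bool Local = false;
  if (llvm::sys::fs::is_local(P, Local))
    return false; // on error, don't claim the file is local
  return Local;
}
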
std::error_code rename(const Twine &from, const Twine &to) {
// Get arguments.
SmallString<128> from_storage;
@@ -405,6 +484,46 @@ std::error_code equivalent(const Twine &A, const Twine &B, bool &result) {
return std::error_code();
}
+static void expandTildeExpr(SmallVectorImpl<char> &Path) {
+ StringRef PathStr(Path.begin(), Path.size());
+ if (PathStr.empty() || !PathStr.startswith("~"))
+ return;
+
+ PathStr = PathStr.drop_front();
+ StringRef Expr =
+ PathStr.take_until([](char c) { return path::is_separator(c); });
+ StringRef Remainder = PathStr.substr(Expr.size() + 1);
+ SmallString<128> Storage;
+ if (Expr.empty()) {
+ // This is just ~/..., so resolve it to the current user's home directory.
+ if (!path::home_directory(Storage)) {
+ // For some reason we couldn't get the home directory. Just exit.
+ return;
+ }
+
+ // Overwrite the first character and insert the rest.
+ Path[0] = Storage[0];
+ Path.insert(Path.begin() + 1, Storage.begin() + 1, Storage.end());
+ return;
+ }
+
+ // This is a string of the form ~username/, look up this user's entry in the
+ // password database.
+ struct passwd *Entry = nullptr;
+ std::string User = Expr.str();
+ Entry = ::getpwnam(User.c_str());
+
+ if (!Entry) {
+ // Unable to look up the entry; just return the original path unchanged.
+ return;
+ }
+
+ Storage = Remainder;
+ Path.clear();
+ Path.append(Entry->pw_dir, Entry->pw_dir + strlen(Entry->pw_dir));
+ llvm::sys::path::append(Path, Storage);
+}
+
static std::error_code fillStatus(int StatRet, const struct stat &Status,
file_status &Result) {
if (StatRet != 0) {
@@ -430,22 +549,23 @@ static std::error_code fillStatus(int StatRet, const struct stat &Status,
Type = file_type::fifo_file;
else if (S_ISSOCK(Status.st_mode))
Type = file_type::socket_file;
+ else if (S_ISLNK(Status.st_mode))
+ Type = file_type::symlink_file;
- perms Perms = static_cast<perms>(Status.st_mode);
- Result =
- file_status(Type, Perms, Status.st_dev, Status.st_ino, Status.st_atime,
- Status.st_mtime, Status.st_uid, Status.st_gid,
- Status.st_size);
+ perms Perms = static_cast<perms>(Status.st_mode) & all_perms;
+ Result = file_status(Type, Perms, Status.st_dev, Status.st_nlink,
+ Status.st_ino, Status.st_atime, Status.st_mtime,
+ Status.st_uid, Status.st_gid, Status.st_size);
return std::error_code();
}
-std::error_code status(const Twine &Path, file_status &Result) {
+std::error_code status(const Twine &Path, file_status &Result, bool Follow) {
SmallString<128> PathStorage;
StringRef P = Path.toNullTerminatedStringRef(PathStorage);
struct stat Status;
- int StatRet = ::stat(P.begin(), &Status);
+ int StatRet = (Follow ? ::stat : ::lstat)(P.begin(), &Status);
return fillStatus(StatRet, Status, Result);
}
@@ -455,6 +575,15 @@ std::error_code status(int FD, file_status &Result) {
return fillStatus(StatRet, Status, Result);
}
+std::error_code setPermissions(const Twine &Path, perms Permissions) {
+ SmallString<128> PathStorage;
+ StringRef P = Path.toNullTerminatedStringRef(PathStorage);
+
+ if (::chmod(P.begin(), Permissions))
+ return std::error_code(errno, std::generic_category());
+ return std::error_code();
+}
+
std::error_code setLastModificationAndAccessTime(int FD, TimePoint<> Time) {
#if defined(HAVE_FUTIMENS)
timespec Times[2];
@@ -481,6 +610,26 @@ std::error_code mapped_file_region::init(int FD, uint64_t Offset,
int flags = (Mode == readwrite) ? MAP_SHARED : MAP_PRIVATE;
int prot = (Mode == readonly) ? PROT_READ : (PROT_READ | PROT_WRITE);
+#if defined(__APPLE__)
+ //----------------------------------------------------------------------
+ // Newer versions of macOS provide the MAP_RESILIENT_CODESIGN flag, which
+ // lets us read from binaries whose code signature is invalid without
+ // crashing. Similarly, MAP_RESILIENT_MEDIA makes a mapping of a file on
+ // removable media return zeroes for any pages we try to read after the
+ // media becomes unavailable, instead of crashing. These flags are only
+ // usable when mapping with PROT_READ, so take care not to specify them
+ // otherwise.
+ //----------------------------------------------------------------------
+ if (Mode == readonly) {
+#if defined(MAP_RESILIENT_CODESIGN)
+ flags |= MAP_RESILIENT_CODESIGN;
+#endif
+#if defined(MAP_RESILIENT_MEDIA)
+ flags |= MAP_RESILIENT_MEDIA;
+#endif
+ }
+#endif // #if defined (__APPLE__)
+
Mapping = ::mmap(nullptr, Size, prot, flags, FD, Offset);
if (Mapping == MAP_FAILED)
return std::error_code(errno, std::generic_category());
@@ -526,7 +675,8 @@ int mapped_file_region::alignment() {
}
std::error_code detail::directory_iterator_construct(detail::DirIterState &it,
- StringRef path){
+ StringRef path,
+ bool follow_symlinks) {
SmallString<128> path_null(path);
DIR *directory = ::opendir(path_null.c_str());
if (!directory)
@@ -535,7 +685,7 @@ std::error_code detail::directory_iterator_construct(detail::DirIterState &it,
it.IterationHandle = reinterpret_cast<intptr_t>(directory);
// Add something for replace_filename to replace.
path::append(path_null, ".");
- it.CurrentEntry = directory_entry(path_null.str());
+ it.CurrentEntry = directory_entry(path_null.str(), follow_symlinks);
return directory_iterator_increment(it);
}
@@ -577,10 +727,19 @@ std::error_code openFileForRead(const Twine &Name, int &ResultFD,
SmallVectorImpl<char> *RealPath) {
SmallString<128> Storage;
StringRef P = Name.toNullTerminatedStringRef(Storage);
- while ((ResultFD = open(P.begin(), O_RDONLY)) < 0) {
+ int OpenFlags = O_RDONLY;
+#ifdef O_CLOEXEC
+ OpenFlags |= O_CLOEXEC;
+#endif
+ while ((ResultFD = open(P.begin(), OpenFlags)) < 0) {
if (errno != EINTR)
return std::error_code(errno, std::generic_category());
}
+#ifndef O_CLOEXEC
+ int r = fcntl(ResultFD, F_SETFD, FD_CLOEXEC);
+ (void)r;
+ assert(r == 0 && "fcntl(F_SETFD, FD_CLOEXEC) failed");
+#endif
// Attempt to get the real name of the file, if the user asked
if(!RealPath)
return std::error_code();
@@ -616,6 +775,10 @@ std::error_code openFileForWrite(const Twine &Name, int &ResultFD,
int OpenFlags = O_CREAT;
+#ifdef O_CLOEXEC
+ OpenFlags |= O_CLOEXEC;
+#endif
+
if (Flags & F_RW)
OpenFlags |= O_RDWR;
else
@@ -635,6 +798,11 @@ std::error_code openFileForWrite(const Twine &Name, int &ResultFD,
if (errno != EINTR)
return std::error_code(errno, std::generic_category());
}
+#ifndef O_CLOEXEC
+ int r = fcntl(ResultFD, F_SETFD, FD_CLOEXEC);
+ (void)r;
+ assert(r == 0 && "fcntl(F_SETFD, FD_CLOEXEC) failed");
+#endif
return std::error_code();
}
@@ -685,18 +853,85 @@ std::error_code getPathFromOpenFD(int FD, SmallVectorImpl<char> &ResultPath) {
return std::error_code();
}
+template <typename T>
+static std::error_code remove_directories_impl(const T &Entry,
+ bool IgnoreErrors) {
+ std::error_code EC;
+ directory_iterator Begin(Entry, EC, false);
+ directory_iterator End;
+ while (Begin != End) {
+ auto &Item = *Begin;
+ file_status st;
+ EC = Item.status(st);
+ if (EC && !IgnoreErrors)
+ return EC;
+
+ if (is_directory(st)) {
+ EC = remove_directories_impl(Item, IgnoreErrors);
+ if (EC && !IgnoreErrors)
+ return EC;
+ }
+
+ EC = fs::remove(Item.path(), true);
+ if (EC && !IgnoreErrors)
+ return EC;
+
+ Begin.increment(EC);
+ if (EC && !IgnoreErrors)
+ return EC;
+ }
+ return std::error_code();
+}
+
+std::error_code remove_directories(const Twine &path, bool IgnoreErrors) {
+ auto EC = remove_directories_impl(path, IgnoreErrors);
+ if (EC && !IgnoreErrors)
+ return EC;
+ EC = fs::remove(path, true);
+ if (EC && !IgnoreErrors)
+ return EC;
+ return std::error_code();
+}
+
+std::error_code real_path(const Twine &path, SmallVectorImpl<char> &dest,
+ bool expand_tilde) {
+ dest.clear();
+ if (path.isTriviallyEmpty())
+ return std::error_code();
+
+ if (expand_tilde) {
+ SmallString<128> Storage;
+ path.toVector(Storage);
+ expandTildeExpr(Storage);
+ return real_path(Storage, dest, false);
+ }
+
+ int fd;
+ std::error_code EC = openFileForRead(path, fd, &dest);
+
+ if (EC)
+ return EC;
+ ::close(fd);
+ return std::error_code();
+}
+
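A sketch of the new real_path entry point with tilde expansion enabled; on Unix the expansion is performed by expandTildeExpr above before the path is resolved through an opened file descriptor (the helper name and "/home/<user>" are illustrative):

#include "llvm/ADT/SmallString.h"
#include "llvm/Support/FileSystem.h"

std::error_code resolveHomeRelative(llvm::SmallVectorImpl<char> &Out) {
  // "~/src" first becomes "/home/<user>/src", then is fully resolved.
  return llvm::sys::fs::real_path("~/src", Out, /*expand_tilde=*/true);
}
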
} // end namespace fs
namespace path {
bool home_directory(SmallVectorImpl<char> &result) {
- if (char *RequestedDir = getenv("HOME")) {
- result.clear();
- result.append(RequestedDir, RequestedDir + strlen(RequestedDir));
- return true;
+ char *RequestedDir = getenv("HOME");
+ if (!RequestedDir) {
+ struct passwd *pw = getpwuid(getuid());
+ if (pw && pw->pw_dir)
+ RequestedDir = pw->pw_dir;
}
+ if (!RequestedDir)
+ return false;
- return false;
+ result.clear();
+ result.append(RequestedDir, RequestedDir + strlen(RequestedDir));
+ return true;
}
static bool getDarwinConfDir(bool TempDir, SmallVectorImpl<char> &Result) {
diff --git a/lib/Support/Unix/Signals.inc b/lib/Support/Unix/Signals.inc
index 9752b70644c6..88ad21e9806e 100644
--- a/lib/Support/Unix/Signals.inc
+++ b/lib/Support/Unix/Signals.inc
@@ -25,8 +25,8 @@
#include "llvm/Support/raw_ostream.h"
#include <algorithm>
#include <string>
-#if HAVE_EXECINFO_H
-# include <execinfo.h> // For backtrace().
+#ifdef HAVE_BACKTRACE
+# include BACKTRACE_HEADER // For backtrace().
#endif
#if HAVE_SIGNAL_H
#include <signal.h>
@@ -59,7 +59,7 @@ using namespace llvm;
static RETSIGTYPE SignalHandler(int Sig); // defined below.
-static ManagedStatic<SmartMutex<true> > SignalsMutex;
+static ManagedStatic<sys::SmartMutex<true> > SignalsMutex;
/// InterruptFunction - The function to call if ctrl-c is pressed.
static void (*InterruptFunction)() = nullptr;
@@ -149,11 +149,7 @@ static void CreateSigAltStack() {}
#endif
static void RegisterHandlers() {
- // We need to dereference the signals mutex during handler registration so
- // that we force its construction. This is to prevent the first use being
- // during handling an actual signal because you can't safely call new in a
- // signal handler.
- *SignalsMutex;
+ sys::SmartScopedLock<true> Guard(*SignalsMutex);
// If the handlers are already registered, we're done.
if (NumRegisteredSignals != 0) return;
@@ -223,7 +219,7 @@ static RETSIGTYPE SignalHandler(int Sig) {
sigprocmask(SIG_UNBLOCK, &SigMask, nullptr);
{
- unique_lock<SmartMutex<true>> Guard(*SignalsMutex);
+ unique_lock<sys::SmartMutex<true>> Guard(*SignalsMutex);
RemoveFilesToRemove();
if (std::find(std::begin(IntSigs), std::end(IntSigs), Sig)
@@ -412,7 +408,7 @@ void llvm::sys::PrintStackTrace(raw_ostream &OS) {
if (printSymbolizedStackTrace(Argv0, StackTrace, depth, OS))
return;
-#if HAVE_DLFCN_H && __GNUG__ && !defined(__CYGWIN__)
+#if HAVE_DLFCN_H && HAVE_DLADDR
int width = 0;
for (int i = 0; i < depth; ++i) {
Dl_info dlinfo;
@@ -462,7 +458,7 @@ void llvm::sys::PrintStackTrace(raw_ostream &OS) {
}
static void PrintStackTraceSignalHandler(void *) {
- PrintStackTrace(llvm::errs());
+ sys::PrintStackTrace(llvm::errs());
}
void llvm::sys::DisableSystemDialogsOnCrash() {}
diff --git a/lib/Support/Unix/Threading.inc b/lib/Support/Unix/Threading.inc
new file mode 100644
index 000000000000..407b194e1b6a
--- /dev/null
+++ b/lib/Support/Unix/Threading.inc
@@ -0,0 +1,215 @@
+//===- Unix/Threading.inc - Unix Threading Implementation ----- -*- C++ -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file provides the Unix specific implementation of Threading functions.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/ADT/SmallString.h"
+#include "llvm/ADT/Twine.h"
+
+#if defined(__APPLE__)
+#include <mach/mach_init.h>
+#include <mach/mach_port.h>
+#endif
+
+#include <pthread.h>
+
+#if defined(__FreeBSD__)
+#include <pthread_np.h> // For pthread_getthreadid_np()
+#endif
+
+#if defined(__FreeBSD__) || defined(__FreeBSD_kernel__)
+#include <sys/sysctl.h>
+#include <sys/user.h>
+#include <errno.h>
+#include <unistd.h>
+#endif
+
+#if defined(__NetBSD__)
+#include <lwp.h> // For _lwp_self()
+#endif
+
+#if defined(__linux__)
+#include <unistd.h> // For syscall()
+#include <sys/syscall.h> // For syscall codes
+#endif
+
+namespace {
+ struct ThreadInfo {
+ void(*UserFn)(void *);
+ void *UserData;
+ };
+}
+
+static void *ExecuteOnThread_Dispatch(void *Arg) {
+ ThreadInfo *TI = reinterpret_cast<ThreadInfo*>(Arg);
+ TI->UserFn(TI->UserData);
+ return nullptr;
+}
+
+void llvm::llvm_execute_on_thread(void(*Fn)(void*), void *UserData,
+ unsigned RequestedStackSize) {
+ ThreadInfo Info = { Fn, UserData };
+ pthread_attr_t Attr;
+ pthread_t Thread;
+
+ // Construct the attributes object.
+ if (::pthread_attr_init(&Attr) != 0)
+ return;
+
+ // Set the requested stack size, if given.
+ if (RequestedStackSize != 0) {
+ if (::pthread_attr_setstacksize(&Attr, RequestedStackSize) != 0)
+ goto error;
+ }
+
+ // Construct and execute the thread.
+ if (::pthread_create(&Thread, &Attr, ExecuteOnThread_Dispatch, &Info) != 0)
+ goto error;
+
+ // Wait for the thread and clean up.
+ ::pthread_join(Thread, nullptr);
+
+error:
+ ::pthread_attr_destroy(&Attr);
+}
+
+
+uint64_t llvm::get_threadid() {
+#if defined(__APPLE__)
+ // Calling "mach_thread_self()" bumps the reference count on the thread
+ // port, so we need to deallocate it. mach_task_self() doesn't bump the ref
+ // count.
+ thread_port_t Self = mach_thread_self();
+ mach_port_deallocate(mach_task_self(), Self);
+ return Self;
+#elif defined(__FreeBSD__)
+ return uint64_t(pthread_getthreadid_np());
+#elif defined(__NetBSD__)
+ return uint64_t(_lwp_self());
+#elif defined(__ANDROID__)
+ return uint64_t(gettid());
+#elif defined(__linux__)
+ return uint64_t(syscall(SYS_gettid));
+#elif defined(LLVM_ON_WIN32)
+ return uint64_t(::GetCurrentThreadId());
+#else
+ return uint64_t(pthread_self());
+#endif
+}
+
+
+static constexpr uint32_t get_max_thread_name_length_impl() {
+#if defined(__NetBSD__)
+ return PTHREAD_MAX_NAMELEN_NP;
+#elif defined(__APPLE__)
+ return 64;
+#elif defined(__linux__)
+#if HAVE_PTHREAD_SETNAME_NP
+ return 16;
+#else
+ return 0;
+#endif
+#elif defined(__FreeBSD__) || defined(__FreeBSD_kernel__)
+ return 16;
+#else
+ return 0;
+#endif
+}
+
+uint32_t llvm::get_max_thread_name_length() {
+ return get_max_thread_name_length_impl();
+}
+
+void llvm::set_thread_name(const Twine &Name) {
+ // Make sure the input is null terminated.
+ SmallString<64> Storage;
+ StringRef NameStr = Name.toNullTerminatedStringRef(Storage);
+
+ // Truncate from the beginning, not the end, if the specified name is too
+ // long. For one, this ensures that the resulting string is still null
+ // terminated, but additionally the end of a long thread name will usually
+ // be more unique than the beginning, since a common pattern is for similar
+ // threads to share a common prefix.
+ if (get_max_thread_name_length() > 0)
+ NameStr = NameStr.take_back(get_max_thread_name_length());
+ (void)NameStr;
+#if defined(__linux__)
+#if (defined(__GLIBC__) && defined(_GNU_SOURCE)) || defined(__ANDROID__)
+#if HAVE_PTHREAD_SETNAME_NP
+ ::pthread_setname_np(::pthread_self(), NameStr.data());
+#endif
+#endif
+#elif defined(__FreeBSD__)
+ ::pthread_set_name_np(::pthread_self(), NameStr.data());
+#elif defined(__NetBSD__)
+ ::pthread_setname_np(::pthread_self(), "%s",
+ const_cast<char *>(NameStr.data()));
+#elif defined(__APPLE__)
+ ::pthread_setname_np(NameStr.data());
+#endif
+}
+
+void llvm::get_thread_name(SmallVectorImpl<char> &Name) {
+ Name.clear();
+
+#if defined(__FreeBSD__) || defined(__FreeBSD_kernel__)
+ int pid = ::getpid();
+ uint64_t tid = get_threadid();
+
+ struct kinfo_proc *kp = nullptr, *nkp;
+ size_t len = 0;
+ int error;
+ int ctl[4] = { CTL_KERN, KERN_PROC, KERN_PROC_PID | KERN_PROC_INC_THREAD,
+ (int)pid };
+
+ while (1) {
+ error = sysctl(ctl, 4, kp, &len, nullptr, 0);
+ if (kp == nullptr || (error != 0 && errno == ENOMEM)) {
+ // Add extra space in case threads are added before next call.
+ len += sizeof(*kp) + len / 10;
+ nkp = (struct kinfo_proc *)realloc(kp, len);
+ if (nkp == nullptr) {
+ free(kp);
+ return;
+ }
+ kp = nkp;
+ continue;
+ }
+ if (error != 0)
+ len = 0;
+ break;
+ }
+
+ for (size_t i = 0; i < len / sizeof(*kp); i++) {
+ if (kp[i].ki_tid == (lwpid_t)tid) {
+ Name.append(kp[i].ki_tdname, kp[i].ki_tdname + strlen(kp[i].ki_tdname));
+ break;
+ }
+ }
+ free(kp);
+ return;
+#elif defined(__NetBSD__)
+ constexpr uint32_t len = get_max_thread_name_length_impl();
+ char buf[len];
+ ::pthread_getname_np(::pthread_self(), buf, len);
+
+ Name.append(buf, buf + strlen(buf));
+#elif defined(__linux__)
+#if (defined(__GLIBC__) && defined(_GNU_SOURCE)) || defined(__ANDROID__)
+#if HAVE_PTHREAD_GETNAME_NP
+ constexpr uint32_t len = get_max_thread_name_length_impl();
+ char Buffer[len];
+ if (0 == ::pthread_getname_np(::pthread_self(), Buffer, len))
+ Name.append(Buffer, Buffer + strlen(Buffer));
+#endif
+#endif
+#endif
+}
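
A usage sketch for the thread-name helpers defined above (names illustrative); note the deliberate take_back truncation, which keeps the more distinctive tail of a long name:

#include "llvm/ADT/SmallString.h"
#include "llvm/ADT/Twine.h"
#include "llvm/Support/Threading.h"

void nameThisThread(unsigned Index) {
  llvm::set_thread_name("mypool.worker." + llvm::Twine(Index));
  llvm::SmallString<16> Current;
  llvm::get_thread_name(Current);
  // Current may be truncated, or empty on platforms without getname support.
}
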
diff --git a/lib/Support/Windows/DynamicLibrary.inc b/lib/Support/Windows/DynamicLibrary.inc
index 050689483deb..709499deeafa 100644
--- a/lib/Support/Windows/DynamicLibrary.inc
+++ b/lib/Support/Windows/DynamicLibrary.inc
@@ -24,7 +24,6 @@
#endif
namespace llvm {
-using namespace sys;
//===----------------------------------------------------------------------===//
//=== WARNING: Implementation here must contain only Win32 specific code
@@ -33,7 +32,7 @@ using namespace sys;
typedef BOOL (WINAPI *fpEnumerateLoadedModules)(HANDLE,PENUMLOADED_MODULES_CALLBACK64,PVOID);
static fpEnumerateLoadedModules fEnumerateLoadedModules;
-static DenseSet<HMODULE> *OpenedHandles;
+static llvm::ManagedStatic<DenseSet<HMODULE> > OpenedHandles;
static bool loadDebugHelp(void) {
HMODULE hLib = ::LoadLibraryW(L"Dbghelp.dll");
@@ -51,15 +50,13 @@ ELM_Callback(PCSTR ModuleName, DWORD64 ModuleBase,
return TRUE;
}
-DynamicLibrary DynamicLibrary::getPermanentLibrary(const char *filename,
- std::string *errMsg) {
+sys::DynamicLibrary
+sys::DynamicLibrary::getPermanentLibrary(const char *filename,
+ std::string *errMsg) {
SmartScopedLock<true> lock(*SymbolsMutex);
if (!filename) {
// When no file is specified, enumerate all DLLs and EXEs in the process.
- if (OpenedHandles == 0)
- OpenedHandles = new DenseSet<HMODULE>();
-
if (!fEnumerateLoadedModules) {
if (!loadDebugHelp()) {
assert(false && "These APIs should always be available");
@@ -79,7 +76,7 @@ DynamicLibrary DynamicLibrary::getPermanentLibrary(const char *filename,
MakeErrMsg(errMsg, std::string(filename) + ": Can't convert to UTF-16");
return DynamicLibrary();
}
-
+
HMODULE a_handle = LoadLibraryW(filenameUnicode.data());
if (a_handle == 0) {
@@ -87,9 +84,6 @@ DynamicLibrary DynamicLibrary::getPermanentLibrary(const char *filename,
return DynamicLibrary();
}
- if (OpenedHandles == 0)
- OpenedHandles = new DenseSet<HMODULE>();
-
// If we've already loaded this library, FreeLibrary() the handle in order to
// keep the internal refcount at +1.
if (!OpenedHandles->insert(a_handle).second)
@@ -98,6 +92,18 @@ DynamicLibrary DynamicLibrary::getPermanentLibrary(const char *filename,
return DynamicLibrary(a_handle);
}
+sys::DynamicLibrary
+sys::DynamicLibrary::addPermanentLibrary(void *handle, std::string *errMsg) {
+ SmartScopedLock<true> lock(*SymbolsMutex);
+ // If we've already loaded this library, tell the caller.
+ if (!OpenedHandles->insert((HMODULE)handle).second) {
+ MakeErrMsg(errMsg, "Library already loaded");
+ return DynamicLibrary();
+ }
+
+ return DynamicLibrary(handle);
+}
+
// Stack probing routines are in the support library (e.g. libgcc), but we don't
// have dynamic linking on windows. Provide a hook.
#define EXPLICIT_SYMBOL(SYM) \
@@ -123,7 +129,7 @@ DynamicLibrary DynamicLibrary::getPermanentLibrary(const char *filename,
#undef INLINE_DEF_SYMBOL1
#undef INLINE_DEF_SYMBOL2
-void* DynamicLibrary::SearchForAddressOfSymbol(const char* symbolName) {
+void *sys::DynamicLibrary::SearchForAddressOfSymbol(const char *symbolName) {
SmartScopedLock<true> Lock(*SymbolsMutex);
// First check symbols added via AddSymbol().
@@ -135,7 +141,7 @@ void* DynamicLibrary::SearchForAddressOfSymbol(const char* symbolName) {
}
// Now search the libraries.
- if (OpenedHandles) {
+ if (OpenedHandles.isConstructed()) {
for (DenseSet<HMODULE>::iterator I = OpenedHandles->begin(),
E = OpenedHandles->end(); I != E; ++I) {
FARPROC ptr = GetProcAddress((HMODULE)*I, symbolName);
@@ -171,7 +177,7 @@ void* DynamicLibrary::SearchForAddressOfSymbol(const char* symbolName) {
return 0;
}
-void *DynamicLibrary::getAddressOfSymbol(const char *symbolName) {
+void *sys::DynamicLibrary::getAddressOfSymbol(const char *symbolName) {
if (!isValid())
return NULL;
if (Data == &OpenedHandles)
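
For context, the ManagedStatic swap above is what lets the manual "if (OpenedHandles == 0) OpenedHandles = new DenseSet<HMODULE>();" guards be deleted. A simplified model of the behaviour being relied on; a sketch only, since the real llvm::ManagedStatic additionally constructs atomically and registers the object for llvm_shutdown() cleanup:

    template <typename T> class ManagedStaticModel {
      T *Obj = nullptr;

    public:
      // Construct lazily on first dereference, so a global instance is
      // cheap until someone actually uses it.
      T &operator*() {
        if (!Obj)
          Obj = new T();
        return *Obj;
      }
      T *operator->() { return &**this; }
      // Lets readers like SearchForAddressOfSymbol() skip the container
      // without forcing its construction.
      bool isConstructed() const { return Obj != nullptr; }
    };
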
diff --git a/lib/Support/Windows/Mutex.inc b/lib/Support/Windows/Mutex.inc
index ab79d079122f..0af145ec9a4e 100644
--- a/lib/Support/Windows/Mutex.inc
+++ b/lib/Support/Windows/Mutex.inc
@@ -20,15 +20,14 @@
#include "llvm/Support/Mutex.h"
namespace llvm {
-using namespace sys;
-MutexImpl::MutexImpl(bool /*recursive*/)
+sys::MutexImpl::MutexImpl(bool /*recursive*/)
{
data_ = new CRITICAL_SECTION;
InitializeCriticalSection((LPCRITICAL_SECTION)data_);
}
-MutexImpl::~MutexImpl()
+sys::MutexImpl::~MutexImpl()
{
DeleteCriticalSection((LPCRITICAL_SECTION)data_);
delete (LPCRITICAL_SECTION)data_;
@@ -36,21 +35,21 @@ MutexImpl::~MutexImpl()
}
bool
-MutexImpl::acquire()
+sys::MutexImpl::acquire()
{
EnterCriticalSection((LPCRITICAL_SECTION)data_);
return true;
}
bool
-MutexImpl::release()
+sys::MutexImpl::release()
{
LeaveCriticalSection((LPCRITICAL_SECTION)data_);
return true;
}
bool
-MutexImpl::tryacquire()
+sys::MutexImpl::tryacquire()
{
return TryEnterCriticalSection((LPCRITICAL_SECTION)data_);
}
diff --git a/lib/Support/Windows/Path.inc b/lib/Support/Windows/Path.inc
index 27b250b428a5..b00d3905f658 100644
--- a/lib/Support/Windows/Path.inc
+++ b/lib/Support/Windows/Path.inc
@@ -26,6 +26,7 @@
// These two headers must be included last, and make sure shlobj is required
// after Windows.h to make sure it picks up our definition of _WIN32_WINNT
#include "WindowsSupport.h"
+#include <shellapi.h>
#include <shlobj.h>
#undef max
@@ -178,6 +179,10 @@ TimePoint<> file_status::getLastModificationTime() const {
return toTimePoint(Time);
}
+uint32_t file_status::getLinkCount() const {
+ return NumLinks;
+}
+
std::error_code current_path(SmallVectorImpl<char> &result) {
SmallVector<wchar_t, MAX_PATH> cur_path;
DWORD len = MAX_PATH;
@@ -200,6 +205,18 @@ std::error_code current_path(SmallVectorImpl<char> &result) {
return UTF16ToUTF8(cur_path.begin(), cur_path.size(), result);
}
+std::error_code set_current_path(const Twine &path) {
+ // Convert to utf-16.
+ SmallVector<wchar_t, 128> wide_path;
+ if (std::error_code ec = widenPath(path, wide_path))
+ return ec;
+
+ if (!::SetCurrentDirectoryW(wide_path.begin()))
+ return mapWindowsError(::GetLastError());
+
+ return std::error_code();
+}
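
A usage sketch pairing the new set_current_path() with the existing current_path() to switch directories temporarily:

    #include "llvm/ADT/SmallString.h"
    #include "llvm/ADT/Twine.h"
    #include "llvm/Support/FileSystem.h"

    static std::error_code withDirectory(const llvm::Twine &Dir) {
      llvm::SmallString<128> Saved;
      if (std::error_code EC = llvm::sys::fs::current_path(Saved))
        return EC;
      if (std::error_code EC = llvm::sys::fs::set_current_path(Dir))
        return EC;
      // ... do work relative to Dir ...
      return llvm::sys::fs::set_current_path(Saved);
    }
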
+
std::error_code create_directory(const Twine &path, bool IgnoreExisting,
perms Perms) {
SmallVector<wchar_t, 128> path_utf16;
@@ -265,6 +282,80 @@ std::error_code remove(const Twine &path, bool IgnoreNonExisting) {
return std::error_code();
}
+static std::error_code is_local_internal(SmallVectorImpl<wchar_t> &Path,
+ bool &Result) {
+ SmallVector<wchar_t, 128> VolumePath;
+ size_t Len = 128;
+ while (true) {
+ VolumePath.resize(Len);
+ BOOL Success =
+ ::GetVolumePathNameW(Path.data(), VolumePath.data(), VolumePath.size());
+
+ if (Success)
+ break;
+
+ DWORD Err = ::GetLastError();
+ if (Err != ERROR_INSUFFICIENT_BUFFER)
+ return mapWindowsError(Err);
+
+ Len *= 2;
+ }
+ // If the output buffer has exactly enough space for the path name, but not
+ // the null terminator, it will leave the output unterminated. Push a null
+ // terminator onto the end to ensure that this never happens.
+ VolumePath.push_back(L'\0');
+ VolumePath.set_size(wcslen(VolumePath.data()));
+ const wchar_t *P = VolumePath.data();
+
+ UINT Type = ::GetDriveTypeW(P);
+ switch (Type) {
+ case DRIVE_FIXED:
+ Result = true;
+ return std::error_code();
+ case DRIVE_REMOTE:
+ case DRIVE_CDROM:
+ case DRIVE_RAMDISK:
+ case DRIVE_REMOVABLE:
+ Result = false;
+ return std::error_code();
+ default:
+ return make_error_code(errc::no_such_file_or_directory);
+ }
+ llvm_unreachable("Unreachable!");
+}
+
+std::error_code is_local(const Twine &path, bool &result) {
+ if (!llvm::sys::fs::exists(path) || !llvm::sys::path::has_root_path(path))
+ return make_error_code(errc::no_such_file_or_directory);
+
+ SmallString<128> Storage;
+ StringRef P = path.toStringRef(Storage);
+
+ // Convert to utf-16.
+ SmallVector<wchar_t, 128> WidePath;
+ if (std::error_code ec = widenPath(P, WidePath))
+ return ec;
+ return is_local_internal(WidePath, result);
+}
+
+std::error_code is_local(int FD, bool &Result) {
+ SmallVector<wchar_t, 128> FinalPath;
+ HANDLE Handle = reinterpret_cast<HANDLE>(_get_osfhandle(FD));
+
+ size_t Len = 128;
+ do {
+ FinalPath.reserve(Len);
+ Len = ::GetFinalPathNameByHandleW(Handle, FinalPath.data(),
+ FinalPath.capacity() - 1, VOLUME_NAME_NT);
+ if (Len == 0)
+ return mapWindowsError(::GetLastError());
+ } while (Len > FinalPath.capacity());
+
+ FinalPath.set_size(Len);
+
+ return is_local_internal(FinalPath, Result);
+}
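
A usage sketch for the new is_local() overloads; note that nonexistent or rootless paths report an error rather than setting the flag:

    #include "llvm/ADT/Twine.h"
    #include "llvm/Support/FileSystem.h"

    // Returns true only for paths on a fixed (DRIVE_FIXED) volume; network,
    // removable, CD-ROM and RAM-disk media all report false, and errors are
    // treated conservatively as "not local".
    static bool isOnLocalDrive(const llvm::Twine &Path) {
      bool Local = false;
      if (llvm::sys::fs::is_local(Path, Local))
        return false;
      return Local;
    }
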
+
std::error_code rename(const Twine &from, const Twine &to) {
// Convert to utf-16.
SmallVector<wchar_t, 128> wide_from;
@@ -443,13 +534,16 @@ static std::error_code getStatus(HANDLE FileHandle, file_status &Result) {
file_type Type = (Info.dwFileAttributes & FILE_ATTRIBUTE_DIRECTORY)
? file_type::directory_file
: file_type::regular_file;
- Result =
- file_status(Type, Info.ftLastAccessTime.dwHighDateTime,
- Info.ftLastAccessTime.dwLowDateTime,
- Info.ftLastWriteTime.dwHighDateTime,
- Info.ftLastWriteTime.dwLowDateTime,
- Info.dwVolumeSerialNumber, Info.nFileSizeHigh,
- Info.nFileSizeLow, Info.nFileIndexHigh, Info.nFileIndexLow);
+ perms Permissions = (Info.dwFileAttributes & FILE_ATTRIBUTE_READONLY)
+ ? (all_read | all_exe)
+ : all_all;
+ Result = file_status(
+ Type, Permissions, Info.nNumberOfLinks,
+ Info.ftLastAccessTime.dwHighDateTime,
+ Info.ftLastAccessTime.dwLowDateTime,
+ Info.ftLastWriteTime.dwHighDateTime, Info.ftLastWriteTime.dwLowDateTime,
+ Info.dwVolumeSerialNumber, Info.nFileSizeHigh, Info.nFileSizeLow,
+ Info.nFileIndexHigh, Info.nFileIndexLow);
return std::error_code();
}
@@ -465,7 +559,7 @@ handle_status_error:
return mapWindowsError(LastError);
}
-std::error_code status(const Twine &path, file_status &result) {
+std::error_code status(const Twine &path, file_status &result, bool Follow) {
SmallString<128> path_storage;
SmallVector<wchar_t, 128> path_utf16;
@@ -482,28 +576,19 @@ std::error_code status(const Twine &path, file_status &result) {
if (attr == INVALID_FILE_ATTRIBUTES)
return getStatus(INVALID_HANDLE_VALUE, result);
+ DWORD Flags = FILE_FLAG_BACKUP_SEMANTICS;
// Handle reparse points.
- if (attr & FILE_ATTRIBUTE_REPARSE_POINT) {
- ScopedFileHandle h(
- ::CreateFileW(path_utf16.begin(),
- 0, // Attributes only.
- FILE_SHARE_DELETE | FILE_SHARE_READ | FILE_SHARE_WRITE,
- NULL,
- OPEN_EXISTING,
- FILE_FLAG_BACKUP_SEMANTICS,
- 0));
- if (!h)
- return getStatus(INVALID_HANDLE_VALUE, result);
- }
+ if (!Follow && (attr & FILE_ATTRIBUTE_REPARSE_POINT))
+ Flags |= FILE_FLAG_OPEN_REPARSE_POINT;
ScopedFileHandle h(
::CreateFileW(path_utf16.begin(), 0, // Attributes only.
FILE_SHARE_DELETE | FILE_SHARE_READ | FILE_SHARE_WRITE,
- NULL, OPEN_EXISTING, FILE_FLAG_BACKUP_SEMANTICS, 0));
- if (!h)
- return getStatus(INVALID_HANDLE_VALUE, result);
+ NULL, OPEN_EXISTING, Flags, 0));
+ if (!h)
+ return getStatus(INVALID_HANDLE_VALUE, result);
- return getStatus(h, result);
+ return getStatus(h, result);
}
std::error_code status(int FD, file_status &Result) {
@@ -511,6 +596,37 @@ std::error_code status(int FD, file_status &Result) {
return getStatus(FileHandle, Result);
}
+std::error_code setPermissions(const Twine &Path, perms Permissions) {
+ SmallVector<wchar_t, 128> PathUTF16;
+ if (std::error_code EC = widenPath(Path, PathUTF16))
+ return EC;
+
+ DWORD Attributes = ::GetFileAttributesW(PathUTF16.begin());
+ if (Attributes == INVALID_FILE_ATTRIBUTES)
+ return mapWindowsError(GetLastError());
+
+ // Many Windows file attributes have nothing to do with the file
+ // permissions (e.g. FILE_ATTRIBUTE_HIDDEN), so we must be careful to
+ // preserve them.
+ if (Permissions & all_write) {
+ Attributes &= ~FILE_ATTRIBUTE_READONLY;
+ if (Attributes == 0)
+ // FILE_ATTRIBUTE_NORMAL indicates no other attributes are set.
+ Attributes |= FILE_ATTRIBUTE_NORMAL;
+ } else {
+ Attributes |= FILE_ATTRIBUTE_READONLY;
+ // FILE_ATTRIBUTE_NORMAL is not compatible with any other attributes, so
+ // remove it, if it is present.
+ Attributes &= ~FILE_ATTRIBUTE_NORMAL;
+ }
+
+ if (!::SetFileAttributesW(PathUTF16.begin(), Attributes))
+ return mapWindowsError(GetLastError());
+
+ return std::error_code();
+}
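
A behavioural sketch of the mapping above: Windows models only a read-only bit, so the POSIX-style perms collapse to two observable states.

    // Any perms value containing a write bit clears FILE_ATTRIBUTE_READONLY;
    // every other value sets it. getStatus() above then reports all_all or
    // (all_read | all_exe) respectively.
    static void demoPermissions() {
      using namespace llvm::sys::fs;
      (void)setPermissions("out.txt", all_read | all_exe); // sets READONLY
      (void)setPermissions("out.txt", all_all);            // clears READONLY
    }
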
+
std::error_code setLastModificationAndAccessTime(int FD, TimePoint<> Time) {
FILETIME FT = toFILETIME(Time);
HANDLE FileHandle = reinterpret_cast<HANDLE>(_get_osfhandle(FD));
@@ -616,7 +732,8 @@ int mapped_file_region::alignment() {
}
std::error_code detail::directory_iterator_construct(detail::DirIterState &it,
- StringRef path){
+ StringRef path,
+ bool follow_symlinks) {
SmallVector<wchar_t, 128> path_utf16;
if (std::error_code ec = widenPath(path, path_utf16))
@@ -661,7 +778,7 @@ std::error_code detail::directory_iterator_construct(detail::DirIterState &it,
it.IterationHandle = intptr_t(FindHandle.take());
SmallString<128> directory_entry_path(path);
path::append(directory_entry_path, directory_entry_name_utf8);
- it.CurrentEntry = directory_entry(directory_entry_path);
+ it.CurrentEntry = directory_entry(directory_entry_path, follow_symlinks);
return std::error_code();
}
@@ -701,6 +818,52 @@ std::error_code detail::directory_iterator_increment(detail::DirIterState &it) {
return std::error_code();
}
+static std::error_code realPathFromHandle(HANDLE H,
+ SmallVectorImpl<char> &RealPath) {
+ RealPath.clear();
+ llvm::SmallVector<wchar_t, MAX_PATH> Buffer;
+ DWORD CountChars = ::GetFinalPathNameByHandleW(
+ H, Buffer.begin(), Buffer.capacity() - 1, FILE_NAME_NORMALIZED);
+ if (CountChars > Buffer.capacity()) {
+ // The buffer wasn't big enough; try again. In this case the return value
+ // *does* include the null terminator in its count.
+ Buffer.reserve(CountChars);
+ CountChars = ::GetFinalPathNameByHandleW(
+ H, Buffer.data(), Buffer.capacity() - 1, FILE_NAME_NORMALIZED);
+ }
+ if (CountChars == 0)
+ return mapWindowsError(GetLastError());
+
+ const wchar_t *Data = Buffer.data();
+ if (CountChars >= 4) {
+ if (0 == ::memcmp(Data, L"\\\\?\\", 8)) {
+ CountChars -= 4;
+ Data += 4;
+ }
+ }
+
+ // Convert the result from UTF-16 to UTF-8.
+ return UTF16ToUTF8(Data, CountChars, RealPath);
+}
+
+static std::error_code directoryRealPath(const Twine &Name,
+ SmallVectorImpl<char> &RealPath) {
+ SmallVector<wchar_t, 128> PathUTF16;
+
+ if (std::error_code EC = widenPath(Name, PathUTF16))
+ return EC;
+
+ HANDLE H =
+ ::CreateFileW(PathUTF16.begin(), GENERIC_READ,
+ FILE_SHARE_READ | FILE_SHARE_WRITE | FILE_SHARE_DELETE,
+ NULL, OPEN_EXISTING, FILE_FLAG_BACKUP_SEMANTICS, NULL);
+ if (H == INVALID_HANDLE_VALUE)
+ return mapWindowsError(GetLastError());
+ std::error_code EC = realPathFromHandle(H, RealPath);
+ ::CloseHandle(H);
+ return EC;
+}
+
std::error_code openFileForRead(const Twine &Name, int &ResultFD,
SmallVectorImpl<char> *RealPath) {
SmallVector<wchar_t, 128> PathUTF16;
@@ -732,20 +895,8 @@ std::error_code openFileForRead(const Twine &Name, int &ResultFD,
}
// Fetch the real name of the file, if the user asked
- if (RealPath) {
- RealPath->clear();
- wchar_t RealPathUTF16[MAX_PATH];
- DWORD CountChars =
- ::GetFinalPathNameByHandleW(H, RealPathUTF16, MAX_PATH,
- FILE_NAME_NORMALIZED);
- if (CountChars > 0 && CountChars < MAX_PATH) {
- // Convert the result from UTF-16 to UTF-8.
- SmallString<MAX_PATH> RealPathUTF8;
- if (!UTF16ToUTF8(RealPathUTF16, CountChars, RealPathUTF8))
- RealPath->append(RealPathUTF8.data(),
- RealPathUTF8.data() + strlen(RealPathUTF8.data()));
- }
- }
+ if (RealPath)
+ realPathFromHandle(H, *RealPath);
ResultFD = FD;
return std::error_code();
@@ -843,6 +994,81 @@ std::error_code getPathFromOpenFD(int FD, SmallVectorImpl<char> &ResultPath) {
return windows::UTF16ToUTF8(TempPath.data(), CharCount, ResultPath);
}
+
+std::error_code remove_directories(const Twine &path, bool IgnoreErrors) {
+ // Convert to utf-16.
+ SmallVector<wchar_t, 128> Path16;
+ std::error_code EC = widenPath(path, Path16);
+ if (EC && !IgnoreErrors)
+ return EC;
+
+ // SHFileOperation() accepts a list of paths, so the buffer must be double
+ // null-terminated to mark the end of the list. The buffer is already null
+ // terminated, but since that null character is not counted in the vector's
+ // size, the first push_back() simply overwrites it. So we need to push two
+ // null terminators.
+ Path16.push_back(0);
+ Path16.push_back(0);
+
+ SHFILEOPSTRUCTW shfos = {};
+ shfos.wFunc = FO_DELETE;
+ shfos.pFrom = Path16.data();
+ shfos.fFlags = FOF_NO_UI;
+
+ int result = ::SHFileOperationW(&shfos);
+ if (result != 0 && !IgnoreErrors)
+ return mapWindowsError(result);
+ return std::error_code();
+}
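
For reference, SHFileOperationW's pFrom argument is a packed list of null-terminated paths closed by one extra null; the code above builds the single-entry form (path, '\0', '\0'). A sketch of the general multi-entry layout, with hypothetical example paths:

    #include "llvm/ADT/SmallVector.h"
    #include <cwchar>

    // For two entries the buffer ends up as L"C:\\tmp\\a\0C:\\tmp\\b\0\0".
    static void buildDoubleNullList(llvm::SmallVectorImpl<wchar_t> &From) {
      for (const wchar_t *P : {L"C:\\tmp\\a", L"C:\\tmp\\b"}) {
        From.append(P, P + std::wcslen(P));
        From.push_back(0); // terminate this entry
      }
      From.push_back(0); // terminate the list itself
    }
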
+
+static void expandTildeExpr(SmallVectorImpl<char> &Path) {
+ // Path does not begin with a tilde expression.
+ if (Path.empty() || Path[0] != '~')
+ return;
+
+ StringRef PathStr(Path.begin(), Path.size());
+ PathStr = PathStr.drop_front();
+ StringRef Expr = PathStr.take_until([](char c) { return path::is_separator(c); });
+
+ if (!Expr.empty()) {
+ // This is probably a ~username/ expression, which we don't support on
+ // Windows.
+ return;
+ }
+
+ SmallString<128> HomeDir;
+ if (!path::home_directory(HomeDir)) {
+ // For some reason we couldn't get the home directory; leave the path
+ // unchanged.
+ return;
+ }
+
+ // Overwrite the first character and insert the rest.
+ Path[0] = HomeDir[0];
+ Path.insert(Path.begin() + 1, HomeDir.begin() + 1, HomeDir.end());
+}
+
+std::error_code real_path(const Twine &path, SmallVectorImpl<char> &dest,
+ bool expand_tilde) {
+ dest.clear();
+ if (path.isTriviallyEmpty())
+ return std::error_code();
+
+ if (expand_tilde) {
+ SmallString<128> Storage;
+ path.toVector(Storage);
+ expandTildeExpr(Storage);
+ return real_path(Storage, dest, false);
+ }
+
+ if (is_directory(path))
+ return directoryRealPath(path, dest);
+
+ int fd;
+ if (std::error_code EC = llvm::sys::fs::openFileForRead(path, fd, &dest))
+ return EC;
+ ::close(fd);
+ return std::error_code();
+}
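
A usage sketch of the new real_path(): a leading "~/" is expanded against the home directory first, then the path is canonicalized (directories through the directory-handle route above, files through openFileForRead):

    #include "llvm/ADT/SmallString.h"
    #include "llvm/Support/FileSystem.h"

    static std::error_code resolveInput(llvm::SmallVectorImpl<char> &Out) {
      return llvm::sys::fs::real_path("~/project/input.ll", Out,
                                      /*expand_tilde=*/true);
    }
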
+
} // end namespace fs
namespace path {
diff --git a/lib/Support/Windows/Process.inc b/lib/Support/Windows/Process.inc
index 8d646b3217a0..18aef610d54a 100644
--- a/lib/Support/Windows/Process.inc
+++ b/lib/Support/Windows/Process.inc
@@ -47,7 +47,6 @@
#endif
using namespace llvm;
-using namespace sys;
// This function retrieves the page size using GetNativeSystemInfo() and is
// present solely so it can be called once to initialize the self_process member
diff --git a/lib/Support/Windows/Program.inc b/lib/Support/Windows/Program.inc
index 78fc538bd9bf..721167da5b15 100644
--- a/lib/Support/Windows/Program.inc
+++ b/lib/Support/Windows/Program.inc
@@ -29,7 +29,6 @@
//===----------------------------------------------------------------------===//
namespace llvm {
-using namespace sys;
ProcessInfo::ProcessInfo() : ProcessHandle(0), Pid(0), ReturnCode(0) {}
diff --git a/lib/Support/Windows/RWMutex.inc b/lib/Support/Windows/RWMutex.inc
index 2d1d25f67b8a..ac60c2fc05be 100644
--- a/lib/Support/Windows/RWMutex.inc
+++ b/lib/Support/Windows/RWMutex.inc
@@ -19,7 +19,6 @@
#include "WindowsSupport.h"
namespace llvm {
-using namespace sys;
// Windows has slim read-writer lock support on Vista and higher, so we
// will attempt to load the APIs. If they exist, we will use them, and
@@ -73,7 +72,7 @@ static bool loadSRW() {
return sHasSRW;
}
-RWMutexImpl::RWMutexImpl() {
+sys::RWMutexImpl::RWMutexImpl() {
if (loadSRW()) {
data_ = calloc(1, sizeof(SRWLOCK));
fpInitializeSRWLock(static_cast<PSRWLOCK>(data_));
@@ -83,14 +82,14 @@ RWMutexImpl::RWMutexImpl() {
}
}
-RWMutexImpl::~RWMutexImpl() {
+sys::RWMutexImpl::~RWMutexImpl() {
if (!sHasSRW)
DeleteCriticalSection(static_cast<LPCRITICAL_SECTION>(data_));
// Nothing to do in the case of slim reader/writers except free the memory.
free(data_);
}
-bool RWMutexImpl::reader_acquire() {
+bool sys::RWMutexImpl::reader_acquire() {
if (sHasSRW) {
fpAcquireSRWLockShared(static_cast<PSRWLOCK>(data_));
} else {
@@ -99,7 +98,7 @@ bool RWMutexImpl::reader_acquire() {
return true;
}
-bool RWMutexImpl::reader_release() {
+bool sys::RWMutexImpl::reader_release() {
if (sHasSRW) {
fpReleaseSRWLockShared(static_cast<PSRWLOCK>(data_));
} else {
@@ -108,7 +107,7 @@ bool RWMutexImpl::reader_release() {
return true;
}
-bool RWMutexImpl::writer_acquire() {
+bool sys::RWMutexImpl::writer_acquire() {
if (sHasSRW) {
fpAcquireSRWLockExclusive(static_cast<PSRWLOCK>(data_));
} else {
@@ -117,7 +116,7 @@ bool RWMutexImpl::writer_acquire() {
return true;
}
-bool RWMutexImpl::writer_release() {
+bool sys::RWMutexImpl::writer_release() {
if (sHasSRW) {
fpReleaseSRWLockExclusive(static_cast<PSRWLOCK>(data_));
} else {
diff --git a/lib/Support/Windows/Signals.inc b/lib/Support/Windows/Signals.inc
index f739421eece4..1ef51888baf3 100644
--- a/lib/Support/Windows/Signals.inc
+++ b/lib/Support/Windows/Signals.inc
@@ -776,7 +776,7 @@ static LONG WINAPI LLVMUnhandledExceptionFilter(LPEXCEPTION_POINTERS ep) {
// the nasty sorts of crashes that aren't 100% reproducible from a set of
// inputs (or in the event that the user is unable or unwilling to provide a
// reproducible case).
- if (!llvm::Process::AreCoreFilesPrevented()) {
+ if (!llvm::sys::Process::AreCoreFilesPrevented()) {
MINIDUMP_EXCEPTION_INFORMATION ExceptionInfo;
ExceptionInfo.ThreadId = ::GetCurrentThreadId();
ExceptionInfo.ExceptionPointers = ep;
diff --git a/lib/Support/Windows/ThreadLocal.inc b/lib/Support/Windows/ThreadLocal.inc
index b9cb8ff9836e..8be1c3ecfbb9 100644
--- a/lib/Support/Windows/ThreadLocal.inc
+++ b/lib/Support/Windows/ThreadLocal.inc
@@ -20,33 +20,32 @@
#include "llvm/Support/ThreadLocal.h"
namespace llvm {
-using namespace sys;
-ThreadLocalImpl::ThreadLocalImpl() : data() {
+sys::ThreadLocalImpl::ThreadLocalImpl() : data() {
static_assert(sizeof(DWORD) <= sizeof(data), "size too big");
DWORD* tls = reinterpret_cast<DWORD*>(&data);
*tls = TlsAlloc();
assert(*tls != TLS_OUT_OF_INDEXES);
}
-ThreadLocalImpl::~ThreadLocalImpl() {
+sys::ThreadLocalImpl::~ThreadLocalImpl() {
DWORD* tls = reinterpret_cast<DWORD*>(&data);
TlsFree(*tls);
}
-void *ThreadLocalImpl::getInstance() {
+void *sys::ThreadLocalImpl::getInstance() {
DWORD* tls = reinterpret_cast<DWORD*>(&data);
return TlsGetValue(*tls);
}
-void ThreadLocalImpl::setInstance(const void* d){
+void sys::ThreadLocalImpl::setInstance(const void* d){
DWORD* tls = reinterpret_cast<DWORD*>(&data);
int errorcode = TlsSetValue(*tls, const_cast<void*>(d));
assert(errorcode != 0);
(void)errorcode;
}
-void ThreadLocalImpl::removeInstance() {
+void sys::ThreadLocalImpl::removeInstance() {
setInstance(0);
}
diff --git a/lib/Support/Windows/Threading.inc b/lib/Support/Windows/Threading.inc
new file mode 100644
index 000000000000..decb48887af2
--- /dev/null
+++ b/lib/Support/Windows/Threading.inc
@@ -0,0 +1,109 @@
+//===- Windows/Threading.inc - Win32 Threading Implementation - -*- C++ -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file provides the Win32 specific implementation of Threading functions.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/ADT/SmallString.h"
+#include "llvm/ADT/Twine.h"
+
+#include "Windows/WindowsSupport.h"
+#include <process.h>
+
+// Windows will at times define MemoryFence.
+#ifdef MemoryFence
+#undef MemoryFence
+#endif
+
+namespace {
+struct ThreadInfo {
+ void (*func)(void *);
+ void *param;
+};
+} // end anonymous namespace
+
+static unsigned __stdcall ThreadCallback(void *param) {
+ struct ThreadInfo *info = reinterpret_cast<struct ThreadInfo *>(param);
+ info->func(info->param);
+
+ return 0;
+}
+
+void llvm::llvm_execute_on_thread(void(*Fn)(void*), void *UserData,
+ unsigned RequestedStackSize) {
+ struct ThreadInfo param = { Fn, UserData };
+
+ HANDLE hThread = (HANDLE)::_beginthreadex(NULL,
+ RequestedStackSize, ThreadCallback,
+ &param, 0, NULL);
+
+ if (hThread) {
+ // We actually don't care whether the wait succeeds or fails, in
+ // the same way we don't care whether the pthread_join call succeeds
+ // or fails. There's not much we could do if this were to fail. But
+ // on success, this call will wait until the thread finishes executing
+ // before returning.
+ (void)::WaitForSingleObject(hThread, INFINITE);
+ ::CloseHandle(hThread);
+ }
+}
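
A usage sketch; the important property (shared with the pthread implementation) is that the call is synchronous, so stack-allocated state is safe to hand to the callee:

    #include "llvm/Support/Threading.h"

    static void work(void *Ctx) { ++*static_cast<int *>(Ctx); }

    static void runOnFreshThread() {
      int Counter = 0;
      // Spawns a thread, runs work(&Counter), and joins before returning.
      llvm::llvm_execute_on_thread(work, &Counter, /*RequestedStackSize=*/0);
      // Counter == 1 here; the callee has already finished.
    }
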
+
+uint64_t llvm::get_threadid() {
+ return uint64_t(::GetCurrentThreadId());
+}
+
+uint32_t llvm::get_max_thread_name_length() { return 0; }
+
+#if defined(_MSC_VER)
+static void SetThreadName(DWORD Id, LPCSTR Name) {
+ constexpr DWORD MS_VC_EXCEPTION = 0x406D1388;
+
+#pragma pack(push, 8)
+ struct THREADNAME_INFO {
+ DWORD dwType; // Must be 0x1000.
+ LPCSTR szName; // Pointer to thread name
+ DWORD dwThreadId; // Thread ID (-1 == current thread)
+ DWORD dwFlags; // Reserved. Do not use.
+ };
+#pragma pack(pop)
+
+ THREADNAME_INFO info;
+ info.dwType = 0x1000;
+ info.szName = Name;
+ info.dwThreadId = Id;
+ info.dwFlags = 0;
+
+ __try {
+ ::RaiseException(MS_VC_EXCEPTION, 0, sizeof(info) / sizeof(ULONG_PTR),
+ (ULONG_PTR *)&info);
+ }
+ __except (EXCEPTION_EXECUTE_HANDLER) {
+ }
+}
+#endif
+
+void llvm::set_thread_name(const Twine &Name) {
+#if defined(_MSC_VER)
+ // Make sure the input is null terminated.
+ SmallString<64> Storage;
+ StringRef NameStr = Name.toNullTerminatedStringRef(Storage);
+ SetThreadName(::GetCurrentThreadId(), NameStr.data());
+#endif
+}
+
+void llvm::get_thread_name(SmallVectorImpl<char> &Name) {
+ // "Name" is not an inherent property of a thread on Windows. In fact, when
+ // you "set" the name, you are only firing a one-time message to a debugger
+ // which it interprets as a program setting its threads' name. We may be
+ // able to get fancy by creating a TLS entry when someone calls
+ // set_thread_name so that subsequent calls to get_thread_name return this
+ // value.
+ Name.clear();
+}
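
A hypothetical sketch of the TLS idea the comment above floats; nothing like this is part of the patch, and the names are invented:

    #include "llvm/ADT/SmallVector.h"
    #include "llvm/ADT/Twine.h"
    #include "llvm/Support/Threading.h"
    #include <string>

    static thread_local std::string SavedThreadName; // hypothetical cache

    void setThreadNameCached(const llvm::Twine &Name) {
      SavedThreadName = Name.str();
      llvm::set_thread_name(Name); // still fire the debugger notification
    }

    void getThreadNameCached(llvm::SmallVectorImpl<char> &Name) {
      Name.clear();
      Name.append(SavedThreadName.begin(), SavedThreadName.end());
    }
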
diff --git a/lib/Support/YAMLTraits.cpp b/lib/Support/YAMLTraits.cpp
index 9849b3aa1ce9..c410b1d56086 100644
--- a/lib/Support/YAMLTraits.cpp
+++ b/lib/Support/YAMLTraits.cpp
@@ -398,17 +398,10 @@ bool Input::canElideEmptySequence() {
//===----------------------------------------------------------------------===//
Output::Output(raw_ostream &yout, void *context, int WrapColumn)
- : IO(context),
- Out(yout),
- WrapColumn(WrapColumn),
- Column(0),
- ColumnAtFlowStart(0),
- ColumnAtMapFlowStart(0),
- NeedBitValueComma(false),
- NeedFlowSequenceComma(false),
- EnumerationMatchFound(false),
- NeedsNewLine(false) {
-}
+ : IO(context), Out(yout), WrapColumn(WrapColumn), Column(0),
+ ColumnAtFlowStart(0), ColumnAtMapFlowStart(0), NeedBitValueComma(false),
+ NeedFlowSequenceComma(false), EnumerationMatchFound(false),
+ NeedsNewLine(false), WriteDefaultValues(false) {}
Output::~Output() {
}
@@ -462,7 +455,7 @@ std::vector<StringRef> Output::keys() {
bool Output::preflightKey(const char *Key, bool Required, bool SameAsDefault,
bool &UseDefault, void *&) {
UseDefault = false;
- if (Required || !SameAsDefault) {
+ if (Required || !SameAsDefault || WriteDefaultValues) {
auto State = StateStack.back();
if (State == inFlowMapFirstKey || State == inFlowMapOtherKey) {
flowKey(Key);
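
A sketch of the effect of the new WriteDefaultValues flag: keys whose value equals the mapOptional() default are emitted instead of skipped. The accessor name used here is an assumption; only the member and its use in preflightKey() appear in this diff.

    #include "llvm/Support/YAMLTraits.h"
    #include "llvm/Support/raw_ostream.h"

    struct Options { int Verbosity = 0; };

    namespace llvm {
    namespace yaml {
    template <> struct MappingTraits<Options> {
      static void mapping(IO &Io, Options &O) {
        Io.mapOptional("verbosity", O.Verbosity, 0); // default-valued key
      }
    };
    } // end namespace yaml
    } // end namespace llvm

    void emitWithDefaults() {
      Options O; // Verbosity still equals its default
      llvm::yaml::Output Out(llvm::outs());
      Out.setWriteDefaultValues(true); // hypothetical accessor for the flag
      Out << O; // now prints "verbosity: 0" instead of omitting the key
    }
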
diff --git a/lib/Support/raw_ostream.cpp b/lib/Support/raw_ostream.cpp
index d073802db932..1abc8ed8683d 100644
--- a/lib/Support/raw_ostream.cpp
+++ b/lib/Support/raw_ostream.cpp
@@ -465,8 +465,7 @@ void format_object_base::home() {
static int getFD(StringRef Filename, std::error_code &EC,
sys::fs::OpenFlags Flags) {
// Handle "-" as stdout. Note that when we do this, we consider ourself
- // the owner of stdout. This means that we can do things like close the
- // file descriptor when we're done and set the "binary" flag globally.
+ // the owner of stdout and may set the "binary" flag globally based on Flags.
if (Filename == "-") {
EC = std::error_code();
// If user requested binary then put stdout into binary mode if
@@ -497,6 +496,13 @@ raw_fd_ostream::raw_fd_ostream(int fd, bool shouldClose, bool unbuffered)
ShouldClose = false;
return;
}
+ // We do not want to close STDOUT, since several streams may wrap it, e.g.
+ // llc %s -o=- -pass-remarks-output=- -filetype=asm, which would otherwise
+ // close STDOUT_FILENO multiple times and/or use it after it was closed.
+ // Using dup() in getFD doesn't work as we end up with original STDOUT_FILENO
+ // open anyhow.
+ if (FD <= STDERR_FILENO)
+ ShouldClose = false;
// Get the starting position.
off_t loc = ::lseek(FD, 0, SEEK_CUR);
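
A sketch of the failure mode the new guard avoids: two streams wrapping the same standard descriptor, each believing it owns it:

    #include "llvm/Support/raw_ostream.h"

    void demoSharedStdout() {
      // e.g. "llc %s -o=- -pass-remarks-output=-": both outputs are fd 1.
      llvm::raw_fd_ostream Asm(/*fd=*/1, /*shouldClose=*/true);
      llvm::raw_fd_ostream Remarks(/*fd=*/1, /*shouldClose=*/true);
      Asm << "...asm...\n";
      Remarks << "...remarks...\n";
      // Without the FD <= STDERR_FILENO guard above, the first destructor
      // would close fd 1 and the second would write to a dead descriptor.
    }
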
diff --git a/lib/TableGen/Record.cpp b/lib/TableGen/Record.cpp
index ea9c9a19904e..33d3de5daf33 100644
--- a/lib/TableGen/Record.cpp
+++ b/lib/TableGen/Record.cpp
@@ -40,7 +40,9 @@ IntRecTy IntRecTy::Shared;
StringRecTy StringRecTy::Shared;
DagRecTy DagRecTy::Shared;
+#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
LLVM_DUMP_METHOD void RecTy::dump() const { print(errs()); }
+#endif
ListRecTy *RecTy::getListTy() {
if (!ListTy)
@@ -161,7 +163,9 @@ RecTy *llvm::resolveTypes(RecTy *T1, RecTy *T2) {
//===----------------------------------------------------------------------===//
void Init::anchor() { }
+#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
LLVM_DUMP_METHOD void Init::dump() const { return print(errs()); }
+#endif
UnsetInit *UnsetInit::get() {
static UnsetInit TheInit;
@@ -1591,7 +1595,9 @@ StringRef RecordVal::getName() const {
return cast<StringInit>(getNameInit())->getValue();
}
+#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
LLVM_DUMP_METHOD void RecordVal::dump() const { errs() << *this; }
+#endif
void RecordVal::print(raw_ostream &OS, bool PrintSem) const {
if (getPrefix()) OS << "field ";
@@ -1673,7 +1679,9 @@ void Record::resolveReferencesTo(const RecordVal *RV) {
}
}
+#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
LLVM_DUMP_METHOD void Record::dump() const { errs() << *this; }
+#endif
raw_ostream &llvm::operator<<(raw_ostream &OS, const Record &R) {
OS << R.getNameInitAsString();
@@ -1865,6 +1873,7 @@ DagInit *Record::getValueAsDag(StringRef FieldName) const {
FieldName + "' does not have a dag initializer!");
}
+#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
LLVM_DUMP_METHOD void MultiClass::dump() const {
errs() << "Record:\n";
Rec.dump();
@@ -1875,6 +1884,7 @@ LLVM_DUMP_METHOD void MultiClass::dump() const {
}
LLVM_DUMP_METHOD void RecordKeeper::dump() const { errs() << *this; }
+#endif
raw_ostream &llvm::operator<<(raw_ostream &OS, const RecordKeeper &RK) {
OS << "------------- Classes -----------------\n";
diff --git a/lib/TableGen/TGParser.cpp b/lib/TableGen/TGParser.cpp
index 1a91b37b742b..96015b06d798 100644
--- a/lib/TableGen/TGParser.cpp
+++ b/lib/TableGen/TGParser.cpp
@@ -54,6 +54,7 @@ struct SubMultiClassReference {
void dump() const;
};
+#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
LLVM_DUMP_METHOD void SubMultiClassReference::dump() const {
errs() << "Multiclass:\n";
@@ -63,6 +64,7 @@ LLVM_DUMP_METHOD void SubMultiClassReference::dump() const {
for (Init *TA : TemplateArgs)
TA->dump();
}
+#endif
} // end namespace llvm
@@ -945,7 +947,7 @@ Init *TGParser::ParseOperation(Record *CurRec, RecTy *ItemType) {
else if (ListInit *Arg0 = dyn_cast<ListInit>(InitList[0]))
Type = Arg0->getType();
else {
- InitList[0]->dump();
+ InitList[0]->print(errs());
Error(OpLoc, "expected a list");
return nullptr;
}
diff --git a/lib/Target/AArch64/AArch64.h b/lib/Target/AArch64/AArch64.h
index fd106a8d9b0b..b44b13e36e15 100644
--- a/lib/Target/AArch64/AArch64.h
+++ b/lib/Target/AArch64/AArch64.h
@@ -22,8 +22,11 @@
namespace llvm {
+class AArch64RegisterBankInfo;
+class AArch64Subtarget;
class AArch64TargetMachine;
class FunctionPass;
+class InstructionSelector;
class MachineFunctionPass;
FunctionPass *createAArch64DeadRegisterDefinitions();
@@ -45,6 +48,9 @@ FunctionPass *createAArch64A53Fix835769();
FunctionPass *createAArch64CleanupLocalDynamicTLSPass();
FunctionPass *createAArch64CollectLOHPass();
+InstructionSelector *
+createAArch64InstructionSelector(const AArch64TargetMachine &,
+ AArch64Subtarget &, AArch64RegisterBankInfo &);
void initializeAArch64A53Fix835769Pass(PassRegistry&);
void initializeAArch64A57FPLoadBalancingPass(PassRegistry&);
diff --git a/lib/Target/AArch64/AArch64.td b/lib/Target/AArch64/AArch64.td
index 91c335fac32d..519ca2894683 100644
--- a/lib/Target/AArch64/AArch64.td
+++ b/lib/Target/AArch64/AArch64.td
@@ -27,7 +27,7 @@ def FeatureNEON : SubtargetFeature<"neon", "HasNEON", "true",
"Enable Advanced SIMD instructions", [FeatureFPARMv8]>;
def FeatureCrypto : SubtargetFeature<"crypto", "HasCrypto", "true",
- "Enable cryptographic instructions">;
+ "Enable cryptographic instructions", [FeatureNEON]>;
def FeatureCRC : SubtargetFeature<"crc", "HasCRC", "true",
"Enable ARMv8 CRC-32 checksum instructions">;
@@ -38,6 +38,9 @@ def FeatureRAS : SubtargetFeature<"ras", "HasRAS", "true",
def FeatureLSE : SubtargetFeature<"lse", "HasLSE", "true",
"Enable ARMv8.1 Large System Extension (LSE) atomic instructions">;
+def FeatureRDM : SubtargetFeature<"rdm", "HasRDM", "true",
+ "Enable ARMv8.1 Rounding Double Multiply Add/Subtract instructions">;
+
def FeaturePerfMon : SubtargetFeature<"perfmon", "HasPerfMon", "true",
"Enable ARMv8 PMUv3 Performance Monitors extension">;
@@ -100,6 +103,14 @@ def FeatureArithmeticCbzFusion : SubtargetFeature<
"arith-cbz-fusion", "HasArithmeticCbzFusion", "true",
"CPU fuses arithmetic + cbz/cbnz operations">;
+def FeatureFuseAES : SubtargetFeature<
+ "fuse-aes", "HasFuseAES", "true",
+ "CPU fuses AES crypto operations">;
+
+def FeatureFuseLiterals : SubtargetFeature<
+ "fuse-literals", "HasFuseLiterals", "true",
+ "CPU fuses literal generation operations">;
+
def FeatureDisableLatencySchedHeuristic : SubtargetFeature<
"disable-latency-sched-heuristic", "DisableLatencySchedHeuristic", "true",
"Disable latency scheduling heuristic">;
@@ -108,12 +119,22 @@ def FeatureUseRSqrt : SubtargetFeature<
"use-reciprocal-square-root", "UseRSqrt", "true",
"Use the reciprocal square root approximation">;
+def FeatureNoNegativeImmediates : SubtargetFeature<"no-neg-immediates",
+ "NegativeImmediates", "false",
+ "Convert immediates and instructions "
+ "to their negated or complemented "
+ "equivalent when the immediate does "
+ "not fit in the encoding.">;
+
+def FeatureLSLFast : SubtargetFeature<
+ "lsl-fast", "HasLSLFast", "true",
+ "CPU has a fastpath logical shift of up to 3 places">;
//===----------------------------------------------------------------------===//
// Architectures.
//
def HasV8_1aOps : SubtargetFeature<"v8.1a", "HasV8_1aOps", "true",
- "Support ARM v8.1a instructions", [FeatureCRC, FeatureLSE]>;
+ "Support ARM v8.1a instructions", [FeatureCRC, FeatureLSE, FeatureRDM]>;
def HasV8_2aOps : SubtargetFeature<"v8.2a", "HasV8_2aOps", "true",
"Support ARM v8.2a instructions", [HasV8_1aOps, FeatureRAS]>;
@@ -123,6 +144,7 @@ def HasV8_2aOps : SubtargetFeature<"v8.2a", "HasV8_2aOps", "true",
//===----------------------------------------------------------------------===//
include "AArch64RegisterInfo.td"
+include "AArch64RegisterBanks.td"
include "AArch64CallingConvention.td"
//===----------------------------------------------------------------------===//
@@ -149,7 +171,8 @@ include "AArch64SchedCyclone.td"
include "AArch64SchedFalkor.td"
include "AArch64SchedKryo.td"
include "AArch64SchedM1.td"
-include "AArch64SchedVulcan.td"
+include "AArch64SchedThunderX.td"
+include "AArch64SchedThunderX2T99.td"
def ProcA35 : SubtargetFeature<"a35", "ARMProcFamily", "CortexA35",
"Cortex-A35 ARM processors", [
@@ -180,6 +203,8 @@ def ProcA57 : SubtargetFeature<"a57", "ARMProcFamily", "CortexA57",
FeatureCrypto,
FeatureCustomCheapAsMoveHandling,
FeatureFPARMv8,
+ FeatureFuseAES,
+ FeatureFuseLiterals,
FeatureNEON,
FeaturePerfMon,
FeaturePostRAScheduler,
@@ -226,6 +251,7 @@ def ProcExynosM1 : SubtargetFeature<"exynosm1", "ARMProcFamily", "ExynosM1",
FeatureCrypto,
FeatureCustomCheapAsMoveHandling,
FeatureFPARMv8,
+ FeatureFuseAES,
FeatureNEON,
FeaturePerfMon,
FeaturePostRAScheduler,
@@ -256,7 +282,8 @@ def ProcKryo : SubtargetFeature<"kryo", "ARMProcFamily", "Kryo",
FeaturePerfMon,
FeaturePostRAScheduler,
FeaturePredictableSelectIsExpensive,
- FeatureZCZeroing
+ FeatureZCZeroing,
+ FeatureLSLFast
]>;
def ProcFalkor : SubtargetFeature<"falkor", "ARMProcFamily", "Falkor",
@@ -269,19 +296,66 @@ def ProcFalkor : SubtargetFeature<"falkor", "ARMProcFamily", "Falkor",
FeaturePerfMon,
FeaturePostRAScheduler,
FeaturePredictableSelectIsExpensive,
- FeatureZCZeroing
+ FeatureRDM,
+ FeatureZCZeroing,
+ FeatureLSLFast
]>;
-def ProcVulcan : SubtargetFeature<"vulcan", "ARMProcFamily", "Vulcan",
- "Broadcom Vulcan processors", [
- FeatureCRC,
- FeatureCrypto,
- FeatureFPARMv8,
- FeatureArithmeticBccFusion,
- FeatureNEON,
- FeaturePostRAScheduler,
- FeaturePredictableSelectIsExpensive,
- HasV8_1aOps]>;
+def ProcThunderX2T99 : SubtargetFeature<"thunderx2t99", "ARMProcFamily",
+ "ThunderX2T99",
+ "Cavium ThunderX2 processors", [
+ FeatureCRC,
+ FeatureCrypto,
+ FeatureFPARMv8,
+ FeatureArithmeticBccFusion,
+ FeatureNEON,
+ FeaturePostRAScheduler,
+ FeaturePredictableSelectIsExpensive,
+ FeatureLSE,
+ HasV8_1aOps]>;
+
+def ProcThunderX : SubtargetFeature<"thunderx", "ARMProcFamily", "ThunderX",
+ "Cavium ThunderX processors", [
+ FeatureCRC,
+ FeatureCrypto,
+ FeatureFPARMv8,
+ FeaturePerfMon,
+ FeaturePostRAScheduler,
+ FeaturePredictableSelectIsExpensive,
+ FeatureNEON]>;
+
+def ProcThunderXT88 : SubtargetFeature<"thunderxt88", "ARMProcFamily",
+ "ThunderXT88",
+ "Cavium ThunderX processors", [
+ FeatureCRC,
+ FeatureCrypto,
+ FeatureFPARMv8,
+ FeaturePerfMon,
+ FeaturePostRAScheduler,
+ FeaturePredictableSelectIsExpensive,
+ FeatureNEON]>;
+
+def ProcThunderXT81 : SubtargetFeature<"thunderxt81", "ARMProcFamily",
+ "ThunderXT81",
+ "Cavium ThunderX processors", [
+ FeatureCRC,
+ FeatureCrypto,
+ FeatureFPARMv8,
+ FeaturePerfMon,
+ FeaturePostRAScheduler,
+ FeaturePredictableSelectIsExpensive,
+ FeatureNEON]>;
+
+def ProcThunderXT83 : SubtargetFeature<"thunderxt83", "ARMProcFamily",
+ "ThunderXT83",
+ "Cavium ThunderX processors", [
+ FeatureCRC,
+ FeatureCrypto,
+ FeatureFPARMv8,
+ FeaturePerfMon,
+ FeaturePostRAScheduler,
+ FeaturePredictableSelectIsExpensive,
+ FeatureNEON]>;
def : ProcessorModel<"generic", NoSchedModel, [
FeatureCRC,
@@ -291,11 +365,11 @@ def : ProcessorModel<"generic", NoSchedModel, [
FeaturePostRAScheduler
]>;
-// FIXME: Cortex-A35 is currently modelled as a Cortex-A53
+// FIXME: Cortex-A35 is currently modeled as a Cortex-A53.
def : ProcessorModel<"cortex-a35", CortexA53Model, [ProcA35]>;
def : ProcessorModel<"cortex-a53", CortexA53Model, [ProcA53]>;
def : ProcessorModel<"cortex-a57", CortexA57Model, [ProcA57]>;
-// FIXME: Cortex-A72 and Cortex-A73 are currently modelled as an Cortex-A57.
+// FIXME: Cortex-A72 and Cortex-A73 are currently modeled as a Cortex-A57.
def : ProcessorModel<"cortex-a72", CortexA57Model, [ProcA72]>;
def : ProcessorModel<"cortex-a73", CortexA57Model, [ProcA73]>;
def : ProcessorModel<"cyclone", CycloneModel, [ProcCyclone]>;
@@ -304,7 +378,13 @@ def : ProcessorModel<"exynos-m2", ExynosM1Model, [ProcExynosM2]>;
def : ProcessorModel<"exynos-m3", ExynosM1Model, [ProcExynosM2]>;
def : ProcessorModel<"falkor", FalkorModel, [ProcFalkor]>;
def : ProcessorModel<"kryo", KryoModel, [ProcKryo]>;
-def : ProcessorModel<"vulcan", VulcanModel, [ProcVulcan]>;
+// Cavium ThunderX/ThunderX T8X Processors
+def : ProcessorModel<"thunderx", ThunderXT8XModel, [ProcThunderX]>;
+def : ProcessorModel<"thunderxt88", ThunderXT8XModel, [ProcThunderXT88]>;
+def : ProcessorModel<"thunderxt81", ThunderXT8XModel, [ProcThunderXT81]>;
+def : ProcessorModel<"thunderxt83", ThunderXT8XModel, [ProcThunderXT83]>;
+// Cavium ThunderX2T9X Processors. Formerly Broadcom Vulcan.
+def : ProcessorModel<"thunderx2t99", ThunderX2T99Model, [ProcThunderX2T99]>;
//===----------------------------------------------------------------------===//
// Assembly parser
diff --git a/lib/Target/AArch64/AArch64A57FPLoadBalancing.cpp b/lib/Target/AArch64/AArch64A57FPLoadBalancing.cpp
index 0aa597bcdc56..4a7e0b2b803e 100644
--- a/lib/Target/AArch64/AArch64A57FPLoadBalancing.cpp
+++ b/lib/Target/AArch64/AArch64A57FPLoadBalancing.cpp
@@ -493,43 +493,30 @@ bool AArch64A57FPLoadBalancing::colorChainSet(std::vector<Chain*> GV,
int AArch64A57FPLoadBalancing::scavengeRegister(Chain *G, Color C,
MachineBasicBlock &MBB) {
- RegScavenger RS;
- RS.enterBasicBlock(MBB);
- RS.forward(MachineBasicBlock::iterator(G->getStart()));
-
// Can we find an appropriate register that is available throughout the life
- // of the chain?
- unsigned RegClassID = G->getStart()->getDesc().OpInfo[0].RegClass;
- BitVector AvailableRegs = RS.getRegsAvailable(TRI->getRegClass(RegClassID));
- for (MachineBasicBlock::iterator I = G->begin(), E = G->end(); I != E; ++I) {
- RS.forward(I);
- AvailableRegs &= RS.getRegsAvailable(TRI->getRegClass(RegClassID));
-
- // Remove any registers clobbered by a regmask or any def register that is
- // immediately dead.
- for (auto J : I->operands()) {
- if (J.isRegMask())
- AvailableRegs.clearBitsNotInMask(J.getRegMask());
-
- if (J.isReg() && J.isDef()) {
- MCRegAliasIterator AI(J.getReg(), TRI, /*IncludeSelf=*/true);
- if (J.isDead())
- for (; AI.isValid(); ++AI)
- AvailableRegs.reset(*AI);
-#ifndef NDEBUG
- else
- for (; AI.isValid(); ++AI)
- assert(!AvailableRegs[*AI] &&
- "Non-dead def should have been removed by now!");
-#endif
- }
- }
+ // of the chain? Simulate liveness backwards until the end of the chain.
+ LiveRegUnits Units(*TRI);
+ Units.addLiveOuts(MBB);
+ MachineBasicBlock::iterator I = MBB.end();
+ MachineBasicBlock::iterator ChainEnd = G->end();
+ while (I != ChainEnd) {
+ --I;
+ Units.stepBackward(*I);
}
+ // Check which register units are alive throughout the chain.
+ MachineBasicBlock::iterator ChainBegin = G->begin();
+ assert(ChainBegin != ChainEnd && "Chain should contain instructions");
+ do {
+ --I;
+ Units.accumulateBackward(*I);
+ } while (I != ChainBegin);
+
// Make sure we allocate in-order, to get the cheapest registers first.
+ unsigned RegClassID = ChainBegin->getDesc().OpInfo[0].RegClass;
auto Ord = RCI.getOrder(TRI->getRegClass(RegClassID));
for (auto Reg : Ord) {
- if (!AvailableRegs[Reg])
+ if (!Units.available(Reg))
continue;
if (C == getColor(Reg))
return Reg;
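
The two loops above form a two-phase scan: stepBackward() maintains exact liveness while walking from the block end up to the chain's end, and from there accumulateBackward() only ever adds units, computing the union of everything live at any point inside the chain. A hedged sketch of that shape:

    #include "llvm/CodeGen/LiveRegUnits.h"
    #include "llvm/CodeGen/MachineBasicBlock.h"
    #include "llvm/Target/TargetRegisterInfo.h"

    // A register is safe to scavenge for the chain iff it is available in
    // Units after both phases. Assumes ChainBegin != ChainEnd.
    static bool freeAcrossChain(const llvm::TargetRegisterInfo &TRI,
                                llvm::MachineBasicBlock &MBB,
                                llvm::MachineBasicBlock::iterator ChainBegin,
                                llvm::MachineBasicBlock::iterator ChainEnd,
                                unsigned Reg) {
      llvm::LiveRegUnits Units(TRI);
      Units.addLiveOuts(MBB);
      auto I = MBB.end();
      while (I != ChainEnd)
        Units.stepBackward(*--I); // exact liveness at the chain's end
      do {
        Units.accumulateBackward(*--I); // union over the chain's interior
      } while (I != ChainBegin);
      return Units.available(Reg);
    }
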
diff --git a/lib/Target/AArch64/AArch64AddressTypePromotion.cpp b/lib/Target/AArch64/AArch64AddressTypePromotion.cpp
index 0cbb2db1134a..e1b8ee6d03c3 100644
--- a/lib/Target/AArch64/AArch64AddressTypePromotion.cpp
+++ b/lib/Target/AArch64/AArch64AddressTypePromotion.cpp
@@ -31,16 +31,23 @@
#include "llvm/ADT/DenseMap.h"
#include "llvm/ADT/SmallPtrSet.h"
#include "llvm/ADT/SmallVector.h"
+#include "llvm/ADT/StringRef.h"
#include "llvm/IR/Constants.h"
#include "llvm/IR/Dominators.h"
#include "llvm/IR/Function.h"
+#include "llvm/IR/InstrTypes.h"
+#include "llvm/IR/Instruction.h"
#include "llvm/IR/Instructions.h"
-#include "llvm/IR/Module.h"
#include "llvm/IR/Operator.h"
+#include "llvm/IR/Type.h"
+#include "llvm/IR/Use.h"
+#include "llvm/IR/User.h"
#include "llvm/Pass.h"
+#include "llvm/Support/Casting.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/raw_ostream.h"
+#include <cassert>
using namespace llvm;
@@ -59,12 +66,12 @@ EnableMerge("aarch64-type-promotion-merge", cl::Hidden,
//===----------------------------------------------------------------------===//
namespace {
-class AArch64AddressTypePromotion : public FunctionPass {
+class AArch64AddressTypePromotion : public FunctionPass {
public:
static char ID;
- AArch64AddressTypePromotion()
- : FunctionPass(ID), Func(nullptr), ConsideredSExtType(nullptr) {
+
+ AArch64AddressTypePromotion() : FunctionPass(ID) {
initializeAArch64AddressTypePromotionPass(*PassRegistry::getPassRegistry());
}
@@ -76,10 +83,11 @@ public:
private:
/// The current function.
- Function *Func;
+ Function *Func = nullptr;
+
/// Filter out all sexts that do not have this type.
/// Currently initialized with Int64Ty.
- Type *ConsideredSExtType;
+ Type *ConsideredSExtType = nullptr;
// This transformation requires dominator info.
void getAnalysisUsage(AnalysisUsage &AU) const override {
@@ -129,7 +137,8 @@ private:
void mergeSExts(ValueToInsts &ValToSExtendedUses,
SetOfInstructions &ToRemove);
};
-} // end anonymous namespace.
+
+} // end anonymous namespace
char AArch64AddressTypePromotion::ID = 0;
diff --git a/lib/Target/AArch64/AArch64CallLowering.cpp b/lib/Target/AArch64/AArch64CallLowering.cpp
index a4950af32097..b2f55a7e1e09 100644
--- a/lib/Target/AArch64/AArch64CallLowering.cpp
+++ b/lib/Target/AArch64/AArch64CallLowering.cpp
@@ -1,4 +1,4 @@
-//===-- llvm/lib/Target/AArch64/AArch64CallLowering.cpp - Call lowering ---===//
+//===--- AArch64CallLowering.cpp - Call lowering --------------------------===//
//
// The LLVM Compiler Infrastructure
//
@@ -15,15 +15,36 @@
#include "AArch64CallLowering.h"
#include "AArch64ISelLowering.h"
-
+#include "AArch64MachineFunctionInfo.h"
+#include "AArch64Subtarget.h"
+#include "llvm/ADT/ArrayRef.h"
+#include "llvm/ADT/SmallVector.h"
#include "llvm/CodeGen/Analysis.h"
+#include "llvm/CodeGen/CallingConvLower.h"
#include "llvm/CodeGen/GlobalISel/MachineIRBuilder.h"
-#include "llvm/CodeGen/GlobalISel/RegisterBankInfo.h"
#include "llvm/CodeGen/GlobalISel/Utils.h"
+#include "llvm/CodeGen/LowLevelType.h"
+#include "llvm/CodeGen/MachineBasicBlock.h"
+#include "llvm/CodeGen/MachineFrameInfo.h"
+#include "llvm/CodeGen/MachineFunction.h"
#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/MachineMemOperand.h"
+#include "llvm/CodeGen/MachineOperand.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/CodeGen/MachineValueType.h"
+#include "llvm/CodeGen/ValueTypes.h"
+#include "llvm/IR/Argument.h"
+#include "llvm/IR/Attributes.h"
+#include "llvm/IR/Function.h"
+#include "llvm/IR/Type.h"
+#include "llvm/IR/Value.h"
#include "llvm/Target/TargetRegisterInfo.h"
#include "llvm/Target/TargetSubtargetInfo.h"
+#include <algorithm>
+#include <cassert>
+#include <cstdint>
+#include <iterator>
+
using namespace llvm;
#ifndef LLVM_BUILD_GLOBAL_ISEL
@@ -31,12 +52,12 @@ using namespace llvm;
#endif
AArch64CallLowering::AArch64CallLowering(const AArch64TargetLowering &TLI)
- : CallLowering(&TLI) {
-}
+ : CallLowering(&TLI) {}
struct IncomingArgHandler : public CallLowering::ValueHandler {
- IncomingArgHandler(MachineIRBuilder &MIRBuilder, MachineRegisterInfo &MRI)
- : ValueHandler(MIRBuilder, MRI) {}
+ IncomingArgHandler(MachineIRBuilder &MIRBuilder, MachineRegisterInfo &MRI,
+ CCAssignFn *AssignFn)
+ : ValueHandler(MIRBuilder, MRI, AssignFn), StackUsed(0) {}
unsigned getStackAddress(uint64_t Size, int64_t Offset,
MachinePointerInfo &MPO) override {
@@ -45,6 +66,7 @@ struct IncomingArgHandler : public CallLowering::ValueHandler {
MPO = MachinePointerInfo::getFixedStack(MIRBuilder.getMF(), FI);
unsigned AddrReg = MRI.createGenericVirtualRegister(LLT::pointer(0, 64));
MIRBuilder.buildFrameIndex(AddrReg, FI);
+ StackUsed = std::max(StackUsed, Size + Offset);
return AddrReg;
}
@@ -67,11 +89,14 @@ struct IncomingArgHandler : public CallLowering::ValueHandler {
/// parameters (it's a basic-block live-in), and a call instruction
/// (it's an implicit-def of the BL).
virtual void markPhysRegUsed(unsigned PhysReg) = 0;
+
+ uint64_t StackUsed;
};
struct FormalArgHandler : public IncomingArgHandler {
- FormalArgHandler(MachineIRBuilder &MIRBuilder, MachineRegisterInfo &MRI)
- : IncomingArgHandler(MIRBuilder, MRI) {}
+ FormalArgHandler(MachineIRBuilder &MIRBuilder, MachineRegisterInfo &MRI,
+ CCAssignFn *AssignFn)
+ : IncomingArgHandler(MIRBuilder, MRI, AssignFn) {}
void markPhysRegUsed(unsigned PhysReg) override {
MIRBuilder.getMBB().addLiveIn(PhysReg);
@@ -80,8 +105,8 @@ struct FormalArgHandler : public IncomingArgHandler {
struct CallReturnHandler : public IncomingArgHandler {
CallReturnHandler(MachineIRBuilder &MIRBuilder, MachineRegisterInfo &MRI,
- MachineInstrBuilder MIB)
- : IncomingArgHandler(MIRBuilder, MRI), MIB(MIB) {}
+ MachineInstrBuilder MIB, CCAssignFn *AssignFn)
+ : IncomingArgHandler(MIRBuilder, MRI, AssignFn), MIB(MIB) {}
void markPhysRegUsed(unsigned PhysReg) override {
MIB.addDef(PhysReg, RegState::Implicit);
@@ -92,8 +117,10 @@ struct CallReturnHandler : public IncomingArgHandler {
struct OutgoingArgHandler : public CallLowering::ValueHandler {
OutgoingArgHandler(MachineIRBuilder &MIRBuilder, MachineRegisterInfo &MRI,
- MachineInstrBuilder MIB)
- : ValueHandler(MIRBuilder, MRI), MIB(MIB) {}
+ MachineInstrBuilder MIB, CCAssignFn *AssignFn,
+ CCAssignFn *AssignFnVarArg)
+ : ValueHandler(MIRBuilder, MRI, AssignFn), MIB(MIB),
+ AssignFnVarArg(AssignFnVarArg), StackSize(0) {}
unsigned getStackAddress(uint64_t Size, int64_t Offset,
MachinePointerInfo &MPO) override {
@@ -126,14 +153,29 @@ struct OutgoingArgHandler : public CallLowering::ValueHandler {
MIRBuilder.buildStore(ValVReg, Addr, *MMO);
}
+ bool assignArg(unsigned ValNo, MVT ValVT, MVT LocVT,
+ CCValAssign::LocInfo LocInfo,
+ const CallLowering::ArgInfo &Info,
+ CCState &State) override {
+ bool Res;
+ if (Info.IsFixed)
+ Res = AssignFn(ValNo, ValVT, LocVT, LocInfo, Info.Flags, State);
+ else
+ Res = AssignFnVarArg(ValNo, ValVT, LocVT, LocInfo, Info.Flags, State);
+
+ StackSize = State.getNextStackOffset();
+ return Res;
+ }
+
MachineInstrBuilder MIB;
+ CCAssignFn *AssignFnVarArg;
+ uint64_t StackSize;
};
-void AArch64CallLowering::splitToValueTypes(const ArgInfo &OrigArg,
- SmallVectorImpl<ArgInfo> &SplitArgs,
- const DataLayout &DL,
- MachineRegisterInfo &MRI,
- SplitArgTy PerformArgSplit) const {
+void AArch64CallLowering::splitToValueTypes(
+ const ArgInfo &OrigArg, SmallVectorImpl<ArgInfo> &SplitArgs,
+ const DataLayout &DL, MachineRegisterInfo &MRI,
+ const SplitArgTy &PerformArgSplit) const {
const AArch64TargetLowering &TLI = *getTLI<AArch64TargetLowering>();
LLVMContext &Ctx = OrigArg.Ty->getContext();
@@ -145,7 +187,7 @@ void AArch64CallLowering::splitToValueTypes(const ArgInfo &OrigArg,
// No splitting to do, but we want to replace the original type (e.g. [1 x
// double] -> double).
SplitArgs.emplace_back(OrigArg.Reg, SplitVTs[0].getTypeForEVT(Ctx),
- OrigArg.Flags);
+ OrigArg.Flags, OrigArg.IsFixed);
return;
}
@@ -154,19 +196,12 @@ void AArch64CallLowering::splitToValueTypes(const ArgInfo &OrigArg,
// FIXME: set split flags if they're actually used (e.g. i128 on AAPCS).
Type *SplitTy = SplitVT.getTypeForEVT(Ctx);
SplitArgs.push_back(
- ArgInfo{MRI.createGenericVirtualRegister(LLT{*SplitTy, DL}), SplitTy,
- OrigArg.Flags});
+ ArgInfo{MRI.createGenericVirtualRegister(getLLTForType(*SplitTy, DL)),
+ SplitTy, OrigArg.Flags, OrigArg.IsFixed});
}
- SmallVector<uint64_t, 4> BitOffsets;
- for (auto Offset : Offsets)
- BitOffsets.push_back(Offset * 8);
-
- SmallVector<unsigned, 8> SplitRegs;
- for (auto I = &SplitArgs[FirstRegIdx]; I != SplitArgs.end(); ++I)
- SplitRegs.push_back(I->Reg);
-
- PerformArgSplit(SplitRegs, BitOffsets);
+ for (unsigned i = 0; i < Offsets.size(); ++i)
+ PerformArgSplit(SplitArgs[FirstRegIdx + i].Reg, Offsets[i] * 8);
}
bool AArch64CallLowering::lowerReturn(MachineIRBuilder &MIRBuilder,
@@ -184,16 +219,16 @@ bool AArch64CallLowering::lowerReturn(MachineIRBuilder &MIRBuilder,
auto &DL = F.getParent()->getDataLayout();
ArgInfo OrigArg{VReg, Val->getType()};
- setArgFlags(OrigArg, AttributeSet::ReturnIndex, DL, F);
+ setArgFlags(OrigArg, AttributeList::ReturnIndex, DL, F);
SmallVector<ArgInfo, 8> SplitArgs;
splitToValueTypes(OrigArg, SplitArgs, DL, MRI,
- [&](ArrayRef<unsigned> Regs, ArrayRef<uint64_t> Offsets) {
- MIRBuilder.buildExtract(Regs, Offsets, VReg);
+ [&](unsigned Reg, uint64_t Offset) {
+ MIRBuilder.buildExtract(Reg, VReg, Offset);
});
- OutgoingArgHandler Handler(MIRBuilder, MRI, MIB);
- Success = handleAssignments(MIRBuilder, AssignFn, SplitArgs, Handler);
+ OutgoingArgHandler Handler(MIRBuilder, MRI, MIB, AssignFn, AssignFn);
+ Success = handleAssignments(MIRBuilder, SplitArgs, Handler);
}
MIRBuilder.insertInstr(MIB);
@@ -203,7 +238,6 @@ bool AArch64CallLowering::lowerReturn(MachineIRBuilder &MIRBuilder,
bool AArch64CallLowering::lowerFormalArguments(MachineIRBuilder &MIRBuilder,
const Function &F,
ArrayRef<unsigned> VRegs) const {
- auto &Args = F.getArgumentList();
MachineFunction &MF = MIRBuilder.getMF();
MachineBasicBlock &MBB = MIRBuilder.getMBB();
MachineRegisterInfo &MRI = MF.getRegInfo();
@@ -211,13 +245,27 @@ bool AArch64CallLowering::lowerFormalArguments(MachineIRBuilder &MIRBuilder,
SmallVector<ArgInfo, 8> SplitArgs;
unsigned i = 0;
- for (auto &Arg : Args) {
+ for (auto &Arg : F.args()) {
ArgInfo OrigArg{VRegs[i], Arg.getType()};
setArgFlags(OrigArg, i + 1, DL, F);
+ bool Split = false;
+ LLT Ty = MRI.getType(VRegs[i]);
+ unsigned Dst = VRegs[i];
+
splitToValueTypes(OrigArg, SplitArgs, DL, MRI,
- [&](ArrayRef<unsigned> Regs, ArrayRef<uint64_t> Offsets) {
- MIRBuilder.buildSequence(VRegs[i], Regs, Offsets);
+ [&](unsigned Reg, uint64_t Offset) {
+ if (!Split) {
+ Split = true;
+ Dst = MRI.createGenericVirtualRegister(Ty);
+ MIRBuilder.buildUndef(Dst);
+ }
+ unsigned Tmp = MRI.createGenericVirtualRegister(Ty);
+ MIRBuilder.buildInsert(Tmp, Dst, Reg, Offset);
+ Dst = Tmp;
});
+
+ if (Dst != VRegs[i])
+ MIRBuilder.buildCopy(VRegs[i], Dst);
++i;
}
@@ -228,10 +276,25 @@ bool AArch64CallLowering::lowerFormalArguments(MachineIRBuilder &MIRBuilder,
CCAssignFn *AssignFn =
TLI.CCAssignFnForCall(F.getCallingConv(), /*IsVarArg=*/false);
- FormalArgHandler Handler(MIRBuilder, MRI);
- if (!handleAssignments(MIRBuilder, AssignFn, SplitArgs, Handler))
+ FormalArgHandler Handler(MIRBuilder, MRI, AssignFn);
+ if (!handleAssignments(MIRBuilder, SplitArgs, Handler))
return false;
+ if (F.isVarArg()) {
+ if (!MF.getSubtarget<AArch64Subtarget>().isTargetDarwin()) {
+ // FIXME: we need to reimplement saveVarArgsRegisters from
+ // AArch64ISelLowering.
+ return false;
+ }
+
+ // We currently pass all varargs at 8-byte alignment.
+ uint64_t StackOffset = alignTo(Handler.StackUsed, 8);
+
+ auto &MFI = MIRBuilder.getMF().getFrameInfo();
+ AArch64FunctionInfo *FuncInfo = MF.getInfo<AArch64FunctionInfo>();
+ FuncInfo->setVarArgsStackIndex(MFI.CreateFixedObject(4, StackOffset, true));
+ }
+
// Move back to the end of the basic block.
MIRBuilder.setMBB(MBB);
@@ -239,6 +302,7 @@ bool AArch64CallLowering::lowerFormalArguments(MachineIRBuilder &MIRBuilder,
}
bool AArch64CallLowering::lowerCall(MachineIRBuilder &MIRBuilder,
+ CallingConv::ID CallConv,
const MachineOperand &Callee,
const ArgInfo &OrigRet,
ArrayRef<ArgInfo> OrigArgs) const {
@@ -250,21 +314,25 @@ bool AArch64CallLowering::lowerCall(MachineIRBuilder &MIRBuilder,
SmallVector<ArgInfo, 8> SplitArgs;
for (auto &OrigArg : OrigArgs) {
splitToValueTypes(OrigArg, SplitArgs, DL, MRI,
- [&](ArrayRef<unsigned> Regs, ArrayRef<uint64_t> Offsets) {
- MIRBuilder.buildExtract(Regs, Offsets, OrigArg.Reg);
+ [&](unsigned Reg, uint64_t Offset) {
+ MIRBuilder.buildExtract(Reg, OrigArg.Reg, Offset);
});
}
// Find out which ABI gets to decide where things go.
const AArch64TargetLowering &TLI = *getTLI<AArch64TargetLowering>();
- CCAssignFn *CallAssignFn =
- TLI.CCAssignFnForCall(F.getCallingConv(), /*IsVarArg=*/false);
+ CCAssignFn *AssignFnFixed =
+ TLI.CCAssignFnForCall(CallConv, /*IsVarArg=*/false);
+ CCAssignFn *AssignFnVarArg =
+ TLI.CCAssignFnForCall(CallConv, /*IsVarArg=*/true);
+
+ auto CallSeqStart = MIRBuilder.buildInstr(AArch64::ADJCALLSTACKDOWN);
// Create a temporarily-floating call instruction so we can add the implicit
// uses of arg registers.
auto MIB = MIRBuilder.buildInstrNoInsert(Callee.isReg() ? AArch64::BLR
: AArch64::BL);
- MIB.addOperand(Callee);
+ MIB.add(Callee);
// Tell the call which registers are clobbered.
auto TRI = MF.getSubtarget().getRegisterInfo();
@@ -272,8 +340,9 @@ bool AArch64CallLowering::lowerCall(MachineIRBuilder &MIRBuilder,
// Do the actual argument marshalling.
SmallVector<unsigned, 8> PhysRegs;
- OutgoingArgHandler Handler(MIRBuilder, MRI, MIB);
- if (!handleAssignments(MIRBuilder, CallAssignFn, SplitArgs, Handler))
+ OutgoingArgHandler Handler(MIRBuilder, MRI, MIB, AssignFnFixed,
+ AssignFnVarArg);
+ if (!handleAssignments(MIRBuilder, SplitArgs, Handler))
return false;
// Now we can add the actual call instruction to the correct basic block.
@@ -298,20 +367,23 @@ bool AArch64CallLowering::lowerCall(MachineIRBuilder &MIRBuilder,
SmallVector<uint64_t, 8> RegOffsets;
SmallVector<unsigned, 8> SplitRegs;
splitToValueTypes(OrigRet, SplitArgs, DL, MRI,
- [&](ArrayRef<unsigned> Regs, ArrayRef<uint64_t> Offsets) {
- std::copy(Offsets.begin(), Offsets.end(),
- std::back_inserter(RegOffsets));
- std::copy(Regs.begin(), Regs.end(),
- std::back_inserter(SplitRegs));
+ [&](unsigned Reg, uint64_t Offset) {
+ RegOffsets.push_back(Offset);
+ SplitRegs.push_back(Reg);
});
- CallReturnHandler Handler(MIRBuilder, MRI, MIB);
- if (!handleAssignments(MIRBuilder, RetAssignFn, SplitArgs, Handler))
+ CallReturnHandler Handler(MIRBuilder, MRI, MIB, RetAssignFn);
+ if (!handleAssignments(MIRBuilder, SplitArgs, Handler))
return false;
if (!RegOffsets.empty())
MIRBuilder.buildSequence(OrigRet.Reg, SplitRegs, RegOffsets);
}
+ CallSeqStart.addImm(Handler.StackSize);
+ MIRBuilder.buildInstr(AArch64::ADJCALLSTACKUP)
+ .addImm(Handler.StackSize)
+ .addImm(0);
+
return true;
}
diff --git a/lib/Target/AArch64/AArch64CallLowering.h b/lib/Target/AArch64/AArch64CallLowering.h
index ce6676249df6..d96ce95c4de0 100644
--- a/lib/Target/AArch64/AArch64CallLowering.h
+++ b/lib/Target/AArch64/AArch64CallLowering.h
@@ -1,4 +1,4 @@
-//===-- llvm/lib/Target/AArch64/AArch64CallLowering.h - Call lowering -----===//
+//===--- AArch64CallLowering.h - Call lowering ------------------*- C++ -*-===//
//
// The LLVM Compiler Infrastructure
//
@@ -12,18 +12,20 @@
///
//===----------------------------------------------------------------------===//
-#ifndef LLVM_LIB_TARGET_AARCH64_AARCH64CALLLOWERING
-#define LLVM_LIB_TARGET_AARCH64_AARCH64CALLLOWERING
+#ifndef LLVM_LIB_TARGET_AARCH64_AARCH64CALLLOWERING_H
+#define LLVM_LIB_TARGET_AARCH64_AARCH64CALLLOWERING_H
+#include "llvm/ADT/ArrayRef.h"
#include "llvm/CodeGen/GlobalISel/CallLowering.h"
-#include "llvm/CodeGen/ValueTypes.h"
+#include <cstdint>
+#include <functional>
namespace llvm {
class AArch64TargetLowering;
class AArch64CallLowering: public CallLowering {
- public:
+public:
AArch64CallLowering(const AArch64TargetLowering &TLI);
bool lowerReturn(MachineIRBuilder &MIRBuiler, const Value *Val,
@@ -32,8 +34,8 @@ class AArch64CallLowering: public CallLowering {
bool lowerFormalArguments(MachineIRBuilder &MIRBuilder, const Function &F,
ArrayRef<unsigned> VRegs) const override;
- bool lowerCall(MachineIRBuilder &MIRBuilder, const MachineOperand &Callee,
- const ArgInfo &OrigRet,
+ bool lowerCall(MachineIRBuilder &MIRBuilder, CallingConv::ID CallConv,
+ const MachineOperand &Callee, const ArgInfo &OrigRet,
ArrayRef<ArgInfo> OrigArgs) const override;
private:
@@ -44,13 +46,14 @@ private:
typedef std::function<void(MachineIRBuilder &, int, CCValAssign &)>
MemHandler;
- typedef std::function<void(ArrayRef<unsigned>, ArrayRef<uint64_t>)>
- SplitArgTy;
+ typedef std::function<void(unsigned, uint64_t)> SplitArgTy;
void splitToValueTypes(const ArgInfo &OrigArgInfo,
SmallVectorImpl<ArgInfo> &SplitArgs,
const DataLayout &DL, MachineRegisterInfo &MRI,
- SplitArgTy SplitArg) const;
+ const SplitArgTy &SplitArg) const;
};
-} // End of namespace llvm;
-#endif
+
+} // end namespace llvm
+
+#endif // LLVM_LIB_TARGET_AARCH64_AARCH64CALLLOWERING_H
diff --git a/lib/Target/AArch64/AArch64ConditionOptimizer.cpp b/lib/Target/AArch64/AArch64ConditionOptimizer.cpp
index 8b186328d125..2dfcd2d1c393 100644
--- a/lib/Target/AArch64/AArch64ConditionOptimizer.cpp
+++ b/lib/Target/AArch64/AArch64ConditionOptimizer.cpp
@@ -265,10 +265,10 @@ void AArch64ConditionOptimizer::modifyCmp(MachineInstr *CmpMI,
// Change immediate in comparison instruction (ADDS or SUBS).
BuildMI(*MBB, CmpMI, CmpMI->getDebugLoc(), TII->get(Opc))
- .addOperand(CmpMI->getOperand(0))
- .addOperand(CmpMI->getOperand(1))
+ .add(CmpMI->getOperand(0))
+ .add(CmpMI->getOperand(1))
.addImm(Imm)
- .addOperand(CmpMI->getOperand(3));
+ .add(CmpMI->getOperand(3));
CmpMI->eraseFromParent();
// The fact that this comparison was picked ensures that it's related to the
@@ -278,7 +278,7 @@ void AArch64ConditionOptimizer::modifyCmp(MachineInstr *CmpMI,
// Change condition in branch instruction.
BuildMI(*MBB, BrMI, BrMI.getDebugLoc(), TII->get(AArch64::Bcc))
.addImm(Cmp)
- .addOperand(BrMI.getOperand(1));
+ .add(BrMI.getOperand(1));
BrMI.eraseFromParent();
MBB->updateTerminator();
diff --git a/lib/Target/AArch64/AArch64ConditionalCompares.cpp b/lib/Target/AArch64/AArch64ConditionalCompares.cpp
index da09b36cac9c..00a0111f2bd2 100644
--- a/lib/Target/AArch64/AArch64ConditionalCompares.cpp
+++ b/lib/Target/AArch64/AArch64ConditionalCompares.cpp
@@ -594,7 +594,7 @@ void SSACCmpConv::convert(SmallVectorImpl<MachineBasicBlock *> &RemovedBlocks) {
// Insert a SUBS Rn, #0 instruction instead of the cbz / cbnz.
BuildMI(*Head, Head->end(), TermDL, MCID)
.addReg(DestReg, RegState::Define | RegState::Dead)
- .addOperand(HeadCond[2])
+ .add(HeadCond[2])
.addImm(0)
.addImm(0);
// SUBS uses the GPR*sp register classes.
@@ -650,13 +650,12 @@ void SSACCmpConv::convert(SmallVectorImpl<MachineBasicBlock *> &RemovedBlocks) {
if (CmpMI->getOperand(FirstOp + 1).isReg())
MRI->constrainRegClass(CmpMI->getOperand(FirstOp + 1).getReg(),
TII->getRegClass(MCID, 1, TRI, *MF));
- MachineInstrBuilder MIB =
- BuildMI(*Head, CmpMI, CmpMI->getDebugLoc(), MCID)
- .addOperand(CmpMI->getOperand(FirstOp)); // Register Rn
+ MachineInstrBuilder MIB = BuildMI(*Head, CmpMI, CmpMI->getDebugLoc(), MCID)
+ .add(CmpMI->getOperand(FirstOp)); // Register Rn
if (isZBranch)
MIB.addImm(0); // cbz/cbnz Rn -> ccmp Rn, #0
else
- MIB.addOperand(CmpMI->getOperand(FirstOp + 1)); // Register Rm / Immediate
+ MIB.add(CmpMI->getOperand(FirstOp + 1)); // Register Rm / Immediate
MIB.addImm(NZCV).addImm(HeadCmpBBCC);
// If CmpMI was a terminator, we need a new conditional branch to replace it.
@@ -666,7 +665,7 @@ void SSACCmpConv::convert(SmallVectorImpl<MachineBasicBlock *> &RemovedBlocks) {
CmpMI->getOpcode() == AArch64::CBNZX;
BuildMI(*Head, CmpMI, CmpMI->getDebugLoc(), TII->get(AArch64::Bcc))
.addImm(isNZ ? AArch64CC::NE : AArch64CC::EQ)
- .addOperand(CmpMI->getOperand(1)); // Branch target.
+ .add(CmpMI->getOperand(1)); // Branch target.
}
CmpMI->eraseFromParent();
Head->updateTerminator();
diff --git a/lib/Target/AArch64/AArch64ExpandPseudoInsts.cpp b/lib/Target/AArch64/AArch64ExpandPseudoInsts.cpp
index fe1c0beee0eb..d0c0956b87ca 100644
--- a/lib/Target/AArch64/AArch64ExpandPseudoInsts.cpp
+++ b/lib/Target/AArch64/AArch64ExpandPseudoInsts.cpp
@@ -17,6 +17,7 @@
#include "MCTargetDesc/AArch64AddressingModes.h"
#include "AArch64InstrInfo.h"
#include "AArch64Subtarget.h"
+#include "Utils/AArch64BaseInfo.h"
#include "llvm/CodeGen/LivePhysRegs.h"
#include "llvm/CodeGen/MachineFunctionPass.h"
#include "llvm/CodeGen/MachineInstrBuilder.h"
@@ -70,9 +71,9 @@ static void transferImpOps(MachineInstr &OldMI, MachineInstrBuilder &UseMI,
const MachineOperand &MO = OldMI.getOperand(i);
assert(MO.isReg() && MO.getReg());
if (MO.isUse())
- UseMI.addOperand(MO);
+ UseMI.add(MO);
else
- DefMI.addOperand(MO);
+ DefMI.add(MO);
}
}
@@ -112,7 +113,7 @@ static bool tryOrrMovk(uint64_t UImm, uint64_t OrrImm, MachineInstr &MI,
// Create the ORR-immediate instruction.
MachineInstrBuilder MIB =
BuildMI(MBB, MBBI, MI.getDebugLoc(), TII->get(AArch64::ORRXri))
- .addOperand(MI.getOperand(0))
+ .add(MI.getOperand(0))
.addReg(AArch64::XZR)
.addImm(Encoding);
@@ -179,7 +180,7 @@ static bool tryToreplicateChunks(uint64_t UImm, MachineInstr &MI,
// Create the ORR-immediate instruction.
MachineInstrBuilder MIB =
BuildMI(MBB, MBBI, MI.getDebugLoc(), TII->get(AArch64::ORRXri))
- .addOperand(MI.getOperand(0))
+ .add(MI.getOperand(0))
.addReg(AArch64::XZR)
.addImm(Encoding);
@@ -362,7 +363,7 @@ static bool trySequenceOfOnes(uint64_t UImm, MachineInstr &MI,
AArch64_AM::processLogicalImmediate(OrrImm, 64, Encoding);
MachineInstrBuilder MIB =
BuildMI(MBB, MBBI, MI.getDebugLoc(), TII->get(AArch64::ORRXri))
- .addOperand(MI.getOperand(0))
+ .add(MI.getOperand(0))
.addReg(AArch64::XZR)
.addImm(Encoding);
@@ -425,7 +426,7 @@ bool AArch64ExpandPseudo::expandMOVImm(MachineBasicBlock &MBB,
unsigned Opc = (BitSize == 32 ? AArch64::ORRWri : AArch64::ORRXri);
MachineInstrBuilder MIB =
BuildMI(MBB, MBBI, MI.getDebugLoc(), TII->get(Opc))
- .addOperand(MI.getOperand(0))
+ .add(MI.getOperand(0))
.addReg(BitSize == 32 ? AArch64::WZR : AArch64::XZR)
.addImm(Encoding);
transferImpOps(MI, MIB, MIB);
@@ -539,15 +540,15 @@ bool AArch64ExpandPseudo::expandMOVImm(MachineBasicBlock &MBB,
if (Imm != 0) {
unsigned LZ = countLeadingZeros(Imm);
unsigned TZ = countTrailingZeros(Imm);
- Shift = ((63 - LZ) / 16) * 16;
- LastShift = (TZ / 16) * 16;
+ Shift = (TZ / 16) * 16;
+ LastShift = ((63 - LZ) / 16) * 16;
}
unsigned Imm16 = (Imm >> Shift) & Mask;
bool DstIsDead = MI.getOperand(0).isDead();
MachineInstrBuilder MIB1 =
BuildMI(MBB, MBBI, MI.getDebugLoc(), TII->get(FirstOpc))
.addReg(DstReg, RegState::Define |
- getDeadRegState(DstIsDead && Shift == LastShift))
+ getDeadRegState(DstIsDead && Shift == LastShift))
.addImm(Imm16)
.addImm(AArch64_AM::getShifterImm(AArch64_AM::LSL, Shift));
@@ -564,15 +565,15 @@ bool AArch64ExpandPseudo::expandMOVImm(MachineBasicBlock &MBB,
MachineInstrBuilder MIB2;
unsigned Opc = (BitSize == 32 ? AArch64::MOVKWi : AArch64::MOVKXi);
- while (Shift != LastShift) {
- Shift -= 16;
+ while (Shift < LastShift) {
+ Shift += 16;
Imm16 = (Imm >> Shift) & Mask;
if (Imm16 == (isNeg ? Mask : 0))
continue; // This 16-bit portion is already set correctly.
MIB2 = BuildMI(MBB, MBBI, MI.getDebugLoc(), TII->get(Opc))
.addReg(DstReg,
RegState::Define |
- getDeadRegState(DstIsDead && Shift == LastShift))
+ getDeadRegState(DstIsDead && Shift == LastShift))
.addReg(DstReg)
.addImm(Imm16)
.addImm(AArch64_AM::getShifterImm(AArch64_AM::LSL, Shift));
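The swapped Shift/LastShift initialization above reverses the direction of the chunk walk: the immediate is now materialized from the lowest 16-bit chunk (derived from countTrailingZeros) up to the highest (from countLeadingZeros). A minimal standalone sketch of the positive-immediate path (the value and register are made up; the real code also handles the negated case, where MOVN seeds the value and all-ones chunks are skipped):

#include <cstdint>
#include <cstdio>

int main() {
  uint64_t Imm = 0x0012000034560000ULL;       // hypothetical immediate
  unsigned LZ = __builtin_clzll(Imm);         // stands in for countLeadingZeros
  unsigned TZ = __builtin_ctzll(Imm);         // stands in for countTrailingZeros
  unsigned Shift = (TZ / 16) * 16;            // lowest chunk to materialize
  unsigned LastShift = ((63 - LZ) / 16) * 16; // highest chunk to materialize
  printf("movz x0, #0x%llx, lsl #%u\n",
         (unsigned long long)((Imm >> Shift) & 0xFFFF), Shift);
  while (Shift < LastShift) {
    Shift += 16;
    uint64_t Imm16 = (Imm >> Shift) & 0xFFFF;
    if (Imm16 == 0)
      continue; // already zero thanks to the zero-filling movz
    printf("movk x0, #0x%llx, lsl #%u\n", (unsigned long long)Imm16, Shift);
  }
  // Prints: movz x0, #0x3456, lsl #16
  //         movk x0, #0x12, lsl #48
}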
@@ -627,7 +628,7 @@ bool AArch64ExpandPseudo::expandCMP_SWAP(
.addReg(Addr.getReg());
BuildMI(LoadCmpBB, DL, TII->get(CmpOp), ZeroReg)
.addReg(Dest.getReg(), getKillRegState(Dest.isDead()))
- .addOperand(Desired)
+ .add(Desired)
.addImm(ExtendImm);
BuildMI(LoadCmpBB, DL, TII->get(AArch64::Bcc))
.addImm(AArch64CC::NE)
@@ -643,9 +644,7 @@ bool AArch64ExpandPseudo::expandCMP_SWAP(
StoreBB->addLiveIn(New.getReg());
addPostLoopLiveIns(StoreBB, LiveRegs);
- BuildMI(StoreBB, DL, TII->get(StlrOp), StatusReg)
- .addOperand(New)
- .addOperand(Addr);
+ BuildMI(StoreBB, DL, TII->get(StlrOp), StatusReg).add(New).add(Addr);
BuildMI(StoreBB, DL, TII->get(AArch64::CBNZW))
.addReg(StatusReg, RegState::Kill)
.addMBB(LoadCmpBB);
@@ -710,7 +709,7 @@ bool AArch64ExpandPseudo::expandCMP_SWAP_128(
.addReg(Addr.getReg());
BuildMI(LoadCmpBB, DL, TII->get(AArch64::SUBSXrs), AArch64::XZR)
.addReg(DestLo.getReg(), getKillRegState(DestLo.isDead()))
- .addOperand(DesiredLo)
+ .add(DesiredLo)
.addImm(0);
BuildMI(LoadCmpBB, DL, TII->get(AArch64::CSINCWr), StatusReg)
.addUse(AArch64::WZR)
@@ -718,7 +717,7 @@ bool AArch64ExpandPseudo::expandCMP_SWAP_128(
.addImm(AArch64CC::EQ);
BuildMI(LoadCmpBB, DL, TII->get(AArch64::SUBSXrs), AArch64::XZR)
.addReg(DestHi.getReg(), getKillRegState(DestHi.isDead()))
- .addOperand(DesiredHi)
+ .add(DesiredHi)
.addImm(0);
BuildMI(LoadCmpBB, DL, TII->get(AArch64::CSINCWr), StatusReg)
.addUse(StatusReg, RegState::Kill)
@@ -738,9 +737,9 @@ bool AArch64ExpandPseudo::expandCMP_SWAP_128(
StoreBB->addLiveIn(NewHi.getReg());
addPostLoopLiveIns(StoreBB, LiveRegs);
BuildMI(StoreBB, DL, TII->get(AArch64::STLXPX), StatusReg)
- .addOperand(NewLo)
- .addOperand(NewHi)
- .addOperand(Addr);
+ .add(NewLo)
+ .add(NewHi)
+ .add(Addr);
BuildMI(StoreBB, DL, TII->get(AArch64::CBNZW))
.addReg(StatusReg, RegState::Kill)
.addMBB(LoadCmpBB);
@@ -825,8 +824,8 @@ bool AArch64ExpandPseudo::expandMI(MachineBasicBlock &MBB,
MachineInstrBuilder MIB1 =
BuildMI(MBB, MBBI, MI.getDebugLoc(), TII->get(Opcode),
MI.getOperand(0).getReg())
- .addOperand(MI.getOperand(1))
- .addOperand(MI.getOperand(2))
+ .add(MI.getOperand(1))
+ .add(MI.getOperand(2))
.addImm(AArch64_AM::getShifterImm(AArch64_AM::LSL, 0));
transferImpOps(MI, MIB1, MIB1);
MI.eraseFromParent();
@@ -842,7 +841,7 @@ bool AArch64ExpandPseudo::expandMI(MachineBasicBlock &MBB,
BuildMI(MBB, MBBI, MI.getDebugLoc(), TII->get(AArch64::ADRP), DstReg);
MachineInstrBuilder MIB2 =
BuildMI(MBB, MBBI, MI.getDebugLoc(), TII->get(AArch64::LDRXui))
- .addOperand(MI.getOperand(0))
+ .add(MI.getOperand(0))
.addReg(DstReg);
if (MO1.isGlobal()) {
@@ -878,19 +877,31 @@ bool AArch64ExpandPseudo::expandMI(MachineBasicBlock &MBB,
unsigned DstReg = MI.getOperand(0).getReg();
MachineInstrBuilder MIB1 =
BuildMI(MBB, MBBI, MI.getDebugLoc(), TII->get(AArch64::ADRP), DstReg)
- .addOperand(MI.getOperand(1));
+ .add(MI.getOperand(1));
MachineInstrBuilder MIB2 =
BuildMI(MBB, MBBI, MI.getDebugLoc(), TII->get(AArch64::ADDXri))
- .addOperand(MI.getOperand(0))
+ .add(MI.getOperand(0))
.addReg(DstReg)
- .addOperand(MI.getOperand(2))
+ .add(MI.getOperand(2))
.addImm(0);
transferImpOps(MI, MIB1, MIB2);
MI.eraseFromParent();
return true;
}
+ case AArch64::MOVbaseTLS: {
+ unsigned DstReg = MI.getOperand(0).getReg();
+ auto SysReg = AArch64SysReg::TPIDR_EL0;
+ MachineFunction *MF = MBB.getParent();
+ if (MF->getTarget().getTargetTriple().isOSFuchsia() &&
+ MF->getTarget().getCodeModel() == CodeModel::Kernel)
+ SysReg = AArch64SysReg::TPIDR_EL1;
+ BuildMI(MBB, MBBI, MI.getDebugLoc(), TII->get(AArch64::MRS), DstReg)
+ .addImm(SysReg);
+ MI.eraseFromParent();
+ return true;
+ }
case AArch64::MOVi32imm:
return expandMOVImm(MBB, MBBI, 32);
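For reference, the MOVbaseTLS case added above expands to a single system-register read, roughly (register choice illustrative):

    mrs x0, TPIDR_EL0    ; TPIDR_EL1 instead under the Fuchsia kernel code model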
diff --git a/lib/Target/AArch64/AArch64FastISel.cpp b/lib/Target/AArch64/AArch64FastISel.cpp
index fe2c2d4550a7..4e5e3e43a468 100644
--- a/lib/Target/AArch64/AArch64FastISel.cpp
+++ b/lib/Target/AArch64/AArch64FastISel.cpp
@@ -15,28 +15,62 @@
#include "AArch64.h"
#include "AArch64CallingConvention.h"
+#include "AArch64RegisterInfo.h"
#include "AArch64Subtarget.h"
-#include "AArch64TargetMachine.h"
#include "MCTargetDesc/AArch64AddressingModes.h"
+#include "Utils/AArch64BaseInfo.h"
+#include "llvm/ADT/APFloat.h"
+#include "llvm/ADT/APInt.h"
+#include "llvm/ADT/DenseMap.h"
+#include "llvm/ADT/SmallVector.h"
#include "llvm/Analysis/BranchProbabilityInfo.h"
#include "llvm/CodeGen/CallingConvLower.h"
#include "llvm/CodeGen/FastISel.h"
#include "llvm/CodeGen/FunctionLoweringInfo.h"
+#include "llvm/CodeGen/ISDOpcodes.h"
+#include "llvm/CodeGen/MachineBasicBlock.h"
#include "llvm/CodeGen/MachineConstantPool.h"
#include "llvm/CodeGen/MachineFrameInfo.h"
+#include "llvm/CodeGen/MachineInstr.h"
#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/MachineMemOperand.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/CodeGen/MachineValueType.h"
+#include "llvm/CodeGen/RuntimeLibcalls.h"
+#include "llvm/CodeGen/ValueTypes.h"
+#include "llvm/IR/Argument.h"
+#include "llvm/IR/Attributes.h"
+#include "llvm/IR/BasicBlock.h"
#include "llvm/IR/CallingConv.h"
+#include "llvm/IR/Constant.h"
+#include "llvm/IR/Constants.h"
#include "llvm/IR/DataLayout.h"
#include "llvm/IR/DerivedTypes.h"
#include "llvm/IR/Function.h"
#include "llvm/IR/GetElementPtrTypeIterator.h"
-#include "llvm/IR/GlobalAlias.h"
-#include "llvm/IR/GlobalVariable.h"
+#include "llvm/IR/GlobalValue.h"
+#include "llvm/IR/InstrTypes.h"
+#include "llvm/IR/Instruction.h"
#include "llvm/IR/Instructions.h"
#include "llvm/IR/IntrinsicInst.h"
#include "llvm/IR/Operator.h"
+#include "llvm/IR/Type.h"
+#include "llvm/IR/User.h"
+#include "llvm/IR/Value.h"
+#include "llvm/MC/MCInstrDesc.h"
+#include "llvm/MC/MCRegisterInfo.h"
#include "llvm/MC/MCSymbol.h"
+#include "llvm/Support/AtomicOrdering.h"
+#include "llvm/Support/Casting.h"
+#include "llvm/Support/CodeGen.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/MathExtras.h"
+#include <algorithm>
+#include <cassert>
+#include <cstdint>
+#include <iterator>
+#include <utility>
+
using namespace llvm;
namespace {
@@ -50,48 +84,55 @@ class AArch64FastISel final : public FastISel {
} BaseKind;
private:
- BaseKind Kind;
- AArch64_AM::ShiftExtendType ExtType;
+ BaseKind Kind = RegBase;
+ AArch64_AM::ShiftExtendType ExtType = AArch64_AM::InvalidShiftExtend;
union {
unsigned Reg;
int FI;
} Base;
- unsigned OffsetReg;
- unsigned Shift;
- int64_t Offset;
- const GlobalValue *GV;
+ unsigned OffsetReg = 0;
+ unsigned Shift = 0;
+ int64_t Offset = 0;
+ const GlobalValue *GV = nullptr;
public:
- Address() : Kind(RegBase), ExtType(AArch64_AM::InvalidShiftExtend),
- OffsetReg(0), Shift(0), Offset(0), GV(nullptr) { Base.Reg = 0; }
+ Address() { Base.Reg = 0; }
+
void setKind(BaseKind K) { Kind = K; }
BaseKind getKind() const { return Kind; }
void setExtendType(AArch64_AM::ShiftExtendType E) { ExtType = E; }
AArch64_AM::ShiftExtendType getExtendType() const { return ExtType; }
bool isRegBase() const { return Kind == RegBase; }
bool isFIBase() const { return Kind == FrameIndexBase; }
+
void setReg(unsigned Reg) {
assert(isRegBase() && "Invalid base register access!");
Base.Reg = Reg;
}
+
unsigned getReg() const {
assert(isRegBase() && "Invalid base register access!");
return Base.Reg;
}
+
void setOffsetReg(unsigned Reg) {
OffsetReg = Reg;
}
+
unsigned getOffsetReg() const {
return OffsetReg;
}
+
void setFI(unsigned FI) {
assert(isFIBase() && "Invalid base frame index access!");
Base.FI = FI;
}
+
unsigned getFI() const {
assert(isFIBase() && "Invalid base frame index access!");
return Base.FI;
}
+
void setOffset(int64_t O) { Offset = O; }
int64_t getOffset() { return Offset; }
void setShift(unsigned S) { Shift = S; }
@@ -417,7 +458,7 @@ unsigned AArch64FastISel::materializeGV(const GlobalValue *GV) {
// MachO still uses GOT for large code-model accesses, but ELF requires
// movz/movk sequences, which FastISel doesn't handle yet.
- if (TM.getCodeModel() != CodeModel::Small && !Subtarget->isTargetMachO())
+ if (!Subtarget->useSmallAddressing() && !Subtarget->isTargetMachO())
return 0;
unsigned char OpFlags = Subtarget->ClassifyGlobalReference(GV, TM);
@@ -531,23 +572,23 @@ bool AArch64FastISel::computeAddress(const Value *Obj, Address &Addr, Type *Ty)
switch (Opcode) {
default:
break;
- case Instruction::BitCast: {
+ case Instruction::BitCast:
// Look through bitcasts.
return computeAddress(U->getOperand(0), Addr, Ty);
- }
- case Instruction::IntToPtr: {
+
+ case Instruction::IntToPtr:
// Look past no-op inttoptrs.
if (TLI.getValueType(DL, U->getOperand(0)->getType()) ==
TLI.getPointerTy(DL))
return computeAddress(U->getOperand(0), Addr, Ty);
break;
- }
- case Instruction::PtrToInt: {
+
+ case Instruction::PtrToInt:
// Look past no-op ptrtoints.
if (TLI.getValueType(DL, U->getType()) == TLI.getPointerTy(DL))
return computeAddress(U->getOperand(0), Addr, Ty);
break;
- }
+
case Instruction::GetElementPtr: {
Address SavedAddr = Addr;
uint64_t TmpOffset = Addr.getOffset();
@@ -563,7 +604,7 @@ bool AArch64FastISel::computeAddress(const Value *Obj, Address &Addr, Type *Ty)
TmpOffset += SL->getElementOffset(Idx);
} else {
uint64_t S = DL.getTypeAllocSize(GTI.getIndexedType());
- for (;;) {
+ while (true) {
if (const ConstantInt *CI = dyn_cast<ConstantInt>(Op)) {
// Constant-offset addressing.
TmpOffset += CI->getSExtValue() * S;
@@ -2813,8 +2854,8 @@ bool AArch64FastISel::selectIntToFP(const Instruction *I, bool Signed) {
MVT DestVT;
if (!isTypeLegal(I->getType(), DestVT) || DestVT.isVector())
return false;
- assert ((DestVT == MVT::f32 || DestVT == MVT::f64) &&
- "Unexpected value type.");
+ assert((DestVT == MVT::f32 || DestVT == MVT::f64) &&
+ "Unexpected value type.");
unsigned SrcReg = getRegForValue(I->getOperand(0));
if (!SrcReg)
@@ -3106,8 +3147,8 @@ bool AArch64FastISel::fastLowerCall(CallLoweringInfo &CLI) {
return false;
CodeModel::Model CM = TM.getCodeModel();
- // Only support the small and large code model.
- if (CM != CodeModel::Small && CM != CodeModel::Large)
+ // Only support the small-addressing and large code models.
+ if (CM != CodeModel::Large && !Subtarget->useSmallAddressing())
return false;
// FIXME: Add large code model support for ELF.
@@ -3158,7 +3199,7 @@ bool AArch64FastISel::fastLowerCall(CallLoweringInfo &CLI) {
// Issue the call.
MachineInstrBuilder MIB;
- if (CM == CodeModel::Small) {
+ if (Subtarget->useSmallAddressing()) {
const MCInstrDesc &II = TII.get(Addr.getReg() ? AArch64::BLR : AArch64::BL);
MIB = BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, II);
if (Symbol)
@@ -3369,8 +3410,7 @@ bool AArch64FastISel::fastLowerIntrinsicCall(const IntrinsicInst *II) {
MachineFrameInfo &MFI = FuncInfo.MF->getFrameInfo();
MFI.setFrameAddressIsTaken(true);
- const AArch64RegisterInfo *RegInfo =
- static_cast<const AArch64RegisterInfo *>(Subtarget->getRegisterInfo());
+ const AArch64RegisterInfo *RegInfo = Subtarget->getRegisterInfo();
unsigned FramePtr = RegInfo->getFrameRegister(*(FuncInfo.MF));
unsigned SrcReg = MRI.createVirtualRegister(&AArch64::GPR64RegClass);
BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
@@ -3521,11 +3561,11 @@ bool AArch64FastISel::fastLowerIntrinsicCall(const IntrinsicInst *II) {
updateValueMap(II, ResultReg);
return true;
}
- case Intrinsic::trap: {
+ case Intrinsic::trap:
BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(AArch64::BRK))
.addImm(1);
return true;
- }
+
case Intrinsic::sqrt: {
Type *RetTy = II->getCalledFunction()->getReturnType();
@@ -5092,8 +5132,10 @@ bool AArch64FastISel::fastSelectInstruction(const Instruction *I) {
}
namespace llvm {
-llvm::FastISel *AArch64::createFastISel(FunctionLoweringInfo &FuncInfo,
+
+FastISel *AArch64::createFastISel(FunctionLoweringInfo &FuncInfo,
const TargetLibraryInfo *LibInfo) {
return new AArch64FastISel(FuncInfo, LibInfo);
}
-}
+
+} // end namespace llvm
diff --git a/lib/Target/AArch64/AArch64FrameLowering.cpp b/lib/Target/AArch64/AArch64FrameLowering.cpp
index f5b8c35375f8..550174b22a89 100644
--- a/lib/Target/AArch64/AArch64FrameLowering.cpp
+++ b/lib/Target/AArch64/AArch64FrameLowering.cpp
@@ -90,21 +90,42 @@
#include "AArch64FrameLowering.h"
#include "AArch64InstrInfo.h"
#include "AArch64MachineFunctionInfo.h"
+#include "AArch64RegisterInfo.h"
#include "AArch64Subtarget.h"
#include "AArch64TargetMachine.h"
+#include "llvm/ADT/SmallVector.h"
#include "llvm/ADT/Statistic.h"
#include "llvm/CodeGen/LivePhysRegs.h"
+#include "llvm/CodeGen/MachineBasicBlock.h"
#include "llvm/CodeGen/MachineFrameInfo.h"
#include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/CodeGen/MachineInstr.h"
#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/MachineMemOperand.h"
#include "llvm/CodeGen/MachineModuleInfo.h"
+#include "llvm/CodeGen/MachineOperand.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
#include "llvm/CodeGen/RegisterScavenging.h"
+#include "llvm/IR/Attributes.h"
+#include "llvm/IR/CallingConv.h"
#include "llvm/IR/DataLayout.h"
+#include "llvm/IR/DebugLoc.h"
#include "llvm/IR/Function.h"
+#include "llvm/MC/MCDwarf.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/Debug.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/MathExtras.h"
#include "llvm/Support/raw_ostream.h"
+#include "llvm/Target/TargetInstrInfo.h"
+#include "llvm/Target/TargetMachine.h"
+#include "llvm/Target/TargetOptions.h"
+#include "llvm/Target/TargetRegisterInfo.h"
+#include "llvm/Target/TargetSubtargetInfo.h"
+#include <cassert>
+#include <cstdint>
+#include <iterator>
+#include <vector>
using namespace llvm;
@@ -245,14 +266,13 @@ static unsigned findScratchNonCalleeSaveRegister(MachineBasicBlock *MBB) {
if (&MF->front() == MBB)
return AArch64::X9;
- const TargetRegisterInfo &TRI = *MF->getSubtarget().getRegisterInfo();
- LivePhysRegs LiveRegs(&TRI);
+ const AArch64Subtarget &Subtarget = MF->getSubtarget<AArch64Subtarget>();
+ const AArch64RegisterInfo *TRI = Subtarget.getRegisterInfo();
+ LivePhysRegs LiveRegs(TRI);
LiveRegs.addLiveIns(*MBB);
// Mark callee saved registers as used so we will not choose them.
- const AArch64Subtarget &Subtarget = MF->getSubtarget<AArch64Subtarget>();
- const AArch64RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
- const MCPhysReg *CSRegs = RegInfo->getCalleeSavedRegs(MF);
+ const MCPhysReg *CSRegs = TRI->getCalleeSavedRegs(MF);
for (unsigned i = 0; CSRegs[i]; ++i)
LiveRegs.addReg(CSRegs[i]);
@@ -319,7 +339,6 @@ bool AArch64FrameLowering::shouldCombineCSRLocalStackBump(
static MachineBasicBlock::iterator convertCalleeSaveRestoreToSPPrePostIncDec(
MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI,
const DebugLoc &DL, const TargetInstrInfo *TII, int CSStackSizeInc) {
-
unsigned NewOpc;
bool NewIsUnscaled = false;
switch (MBBI->getOpcode()) {
@@ -362,7 +381,7 @@ static MachineBasicBlock::iterator convertCalleeSaveRestoreToSPPrePostIncDec(
unsigned OpndIdx = 0;
for (unsigned OpndEnd = MBBI->getNumOperands() - 1; OpndIdx < OpndEnd;
++OpndIdx)
- MIB.addOperand(MBBI->getOperand(OpndIdx));
+ MIB.add(MBBI->getOperand(OpndIdx));
assert(MBBI->getOperand(OpndIdx).getImm() == 0 &&
"Unexpected immediate offset in first/last callee-save save/restore "
@@ -863,22 +882,26 @@ static unsigned getPrologueDeath(MachineFunction &MF, unsigned Reg) {
static bool produceCompactUnwindFrame(MachineFunction &MF) {
const AArch64Subtarget &Subtarget = MF.getSubtarget<AArch64Subtarget>();
- AttributeSet Attrs = MF.getFunction()->getAttributes();
+ AttributeList Attrs = MF.getFunction()->getAttributes();
return Subtarget.isTargetMachO() &&
!(Subtarget.getTargetLowering()->supportSwiftError() &&
Attrs.hasAttrSomewhere(Attribute::SwiftError));
}
namespace {
+
struct RegPairInfo {
- RegPairInfo() : Reg1(AArch64::NoRegister), Reg2(AArch64::NoRegister) {}
- unsigned Reg1;
- unsigned Reg2;
+ unsigned Reg1 = AArch64::NoRegister;
+ unsigned Reg2 = AArch64::NoRegister;
int FrameIdx;
int Offset;
bool IsGPR;
+
+ RegPairInfo() = default;
+
bool isPaired() const { return Reg2 != AArch64::NoRegister; }
};
+
} // end anonymous namespace
static void computeCalleeSaveRegisterPairs(
diff --git a/lib/Target/AArch64/AArch64GenRegisterBankInfo.def b/lib/Target/AArch64/AArch64GenRegisterBankInfo.def
index d472a54d9543..8b1c9740d2ad 100644
--- a/lib/Target/AArch64/AArch64GenRegisterBankInfo.def
+++ b/lib/Target/AArch64/AArch64GenRegisterBankInfo.def
@@ -16,281 +16,198 @@
#endif
namespace llvm {
-namespace AArch64 {
-
-const uint32_t GPRCoverageData[] = {
- // Classes 0-31
- (1u << AArch64::GPR32allRegClassID) | (1u << AArch64::GPR32RegClassID) |
- (1u << AArch64::GPR32spRegClassID) |
- (1u << AArch64::GPR32commonRegClassID) |
- (1u << AArch64::GPR32sponlyRegClassID) |
- (1u << AArch64::GPR64allRegClassID) | (1u << AArch64::GPR64RegClassID) |
- (1u << AArch64::GPR64spRegClassID) |
- (1u << AArch64::GPR64commonRegClassID) |
- (1u << AArch64::tcGPR64RegClassID) |
- (1u << AArch64::GPR64sponlyRegClassID),
- // Classes 32-63
- 0,
- // FIXME: The entries below this point can be safely removed once this is
- // tablegenerated. It's only needed because of the hardcoded register class
- // limit.
- // Classes 64-96
- 0,
- // Classes 97-128
- 0,
- // Classes 129-160
- 0,
- // Classes 161-192
- 0,
- // Classes 193-224
- 0,
-};
-
-const uint32_t FPRCoverageData[] = {
- // Classes 0-31
- (1u << AArch64::FPR8RegClassID) | (1u << AArch64::FPR16RegClassID) |
- (1u << AArch64::FPR32RegClassID) | (1u << AArch64::FPR64RegClassID) |
- (1u << AArch64::DDRegClassID) | (1u << AArch64::FPR128RegClassID) |
- (1u << AArch64::FPR128_loRegClassID) | (1u << AArch64::DDDRegClassID) |
- (1u << AArch64::DDDDRegClassID),
- // Classes 32-63
- (1u << (AArch64::QQRegClassID - 32)) |
- (1u << (AArch64::QQ_with_qsub0_in_FPR128_loRegClassID - 32)) |
- (1u << (AArch64::QQ_with_qsub1_in_FPR128_loRegClassID - 32)) |
- (1u
- << (AArch64::
- QQQ_with_qsub1_in_FPR128_lo_and_QQQ_with_qsub2_in_FPR128_loRegClassID -
- 32)) |
- (1u
- << (AArch64::
- QQQ_with_qsub0_in_FPR128_lo_and_QQQ_with_qsub2_in_FPR128_loRegClassID -
- 32)) |
- (1u << (AArch64::QQQQRegClassID - 32)) |
- (1u << (AArch64::QQQQ_with_qsub0_in_FPR128_loRegClassID - 32)) |
- (1u << (AArch64::QQQQ_with_qsub1_in_FPR128_loRegClassID - 32)) |
- (1u << (AArch64::QQQQ_with_qsub2_in_FPR128_loRegClassID - 32)) |
- (1u << (AArch64::QQQQ_with_qsub3_in_FPR128_loRegClassID - 32)) |
- (1u
- << (AArch64::
- QQQQ_with_qsub0_in_FPR128_lo_and_QQQQ_with_qsub1_in_FPR128_loRegClassID -
- 32)) |
- (1u
- << (AArch64::
- QQQQ_with_qsub1_in_FPR128_lo_and_QQQQ_with_qsub2_in_FPR128_loRegClassID -
- 32)) |
- (1u
- << (AArch64::
- QQQQ_with_qsub2_in_FPR128_lo_and_QQQQ_with_qsub3_in_FPR128_loRegClassID -
- 32)) |
- (1u
- << (AArch64::
- QQQQ_with_qsub0_in_FPR128_lo_and_QQQQ_with_qsub2_in_FPR128_loRegClassID -
- 32)) |
- (1u
- << (AArch64::
- QQQQ_with_qsub1_in_FPR128_lo_and_QQQQ_with_qsub3_in_FPR128_loRegClassID -
- 32)) |
- (1u
- << (AArch64::
- QQQQ_with_qsub0_in_FPR128_lo_and_QQQQ_with_qsub3_in_FPR128_loRegClassID -
- 32)) |
- (1u
- << (AArch64::
- QQ_with_qsub0_in_FPR128_lo_and_QQ_with_qsub1_in_FPR128_loRegClassID -
- 32)) |
- (1u << (AArch64::QQQRegClassID - 32)) |
- (1u << (AArch64::QQQ_with_qsub0_in_FPR128_loRegClassID - 32)) |
- (1u << (AArch64::QQQ_with_qsub1_in_FPR128_loRegClassID - 32)) |
- (1u << (AArch64::QQQ_with_qsub2_in_FPR128_loRegClassID - 32)) |
- (1u
- << (AArch64::
- QQQ_with_qsub0_in_FPR128_lo_and_QQQ_with_qsub1_in_FPR128_loRegClassID -
- 32)),
- // FIXME: The entries below this point can be safely removed once this
- // is tablegenerated. It's only needed because of the hardcoded register
- // class limit.
- // Classes 64-96
- 0,
- // Classes 97-128
- 0,
- // Classes 129-160
- 0,
- // Classes 161-192
- 0,
- // Classes 193-224
- 0,
-};
-
-const uint32_t CCRCoverageData[] = {
- // Classes 0-31
- 1u << AArch64::CCRRegClassID,
- // Classes 32-63
- 0,
- // FIXME: The entries below this point can be safely removed once this
- // is tablegenerated. It's only needed because of the hardcoded register
- // class limit.
- // Classes 64-96
- 0,
- // Classes 97-128
- 0,
- // Classes 129-160
- 0,
- // Classes 161-192
- 0,
- // Classes 193-224
- 0,
-};
-
-RegisterBank GPRRegBank(AArch64::GPRRegBankID, "GPR", 64, GPRCoverageData);
-RegisterBank FPRRegBank(AArch64::FPRRegBankID, "FPR", 512, FPRCoverageData);
-RegisterBank CCRRegBank(AArch64::CCRRegBankID, "CCR", 32, CCRCoverageData);
-
-RegisterBank *RegBanks[] = {&GPRRegBank, &FPRRegBank, &CCRRegBank};
-
-// PartialMappings.
-enum PartialMappingIdx {
- PMI_None = -1,
- PMI_GPR32 = 1,
- PMI_GPR64,
- PMI_FPR32,
- PMI_FPR64,
- PMI_FPR128,
- PMI_FPR256,
- PMI_FPR512,
- PMI_FirstGPR = PMI_GPR32,
- PMI_LastGPR = PMI_GPR64,
- PMI_FirstFPR = PMI_FPR32,
- PMI_LastFPR = PMI_FPR512,
- PMI_Min = PMI_FirstGPR,
-};
-
-static unsigned getRegBankBaseIdxOffset(unsigned Size) {
- assert(Size && "0-sized type!!");
- // Make anything smaller than 32 gets 32
- Size = ((Size + 31) / 32) * 32;
- // 32 is 0, 64 is 1, 128 is 2, and so on.
- return Log2_32(Size) - /*Log2_32(32)=*/ 5;
-}
-
-RegisterBankInfo::PartialMapping PartMappings[] {
- /* StartIdx, Length, RegBank */
- // 0: GPR 32-bit value.
- {0, 32, GPRRegBank},
- // 1: GPR 64-bit value.
- {0, 64, GPRRegBank},
- // 2: FPR 32-bit value.
- {0, 32, FPRRegBank},
- // 3: FPR 64-bit value.
- {0, 64, FPRRegBank},
- // 4: FPR 128-bit value.
- {0, 128, FPRRegBank},
- // 5: FPR 256-bit value.
- {0, 256, FPRRegBank},
- // 6: FPR 512-bit value.
- {0, 512, FPRRegBank}
-};
-
-enum ValueMappingIdx {
- First3OpsIdx = 0,
- Last3OpsIdx = 18,
- DistanceBetweenRegBanks = 3,
- FirstCrossRegCpyIdx = 21,
- LastCrossRegCpyIdx = 27,
- DistanceBetweenCrossRegCpy = 2
+RegisterBankInfo::PartialMapping AArch64GenRegisterBankInfo::PartMappings[]{
+ /* StartIdx, Length, RegBank */
+ // 0: FPR 32-bit value.
+ {0, 32, AArch64::FPRRegBank},
+ // 1: FPR 64-bit value.
+ {0, 64, AArch64::FPRRegBank},
+ // 2: FPR 128-bit value.
+ {0, 128, AArch64::FPRRegBank},
+ // 3: FPR 256-bit value.
+ {0, 256, AArch64::FPRRegBank},
+ // 4: FPR 512-bit value.
+ {0, 512, AArch64::FPRRegBank},
+ // 5: GPR 32-bit value.
+ {0, 32, AArch64::GPRRegBank},
+ // 6: GPR 64-bit value.
+ {0, 64, AArch64::GPRRegBank},
};
// ValueMappings.
-RegisterBankInfo::ValueMapping ValMappings[]{
+RegisterBankInfo::ValueMapping AArch64GenRegisterBankInfo::ValMappings[]{
/* BreakDown, NumBreakDowns */
+ // 0: invalid
+ {nullptr, 0},
// 3-operand instructions (all binary operations should end up with one of
// those mappings).
- // 0: GPR 32-bit value. <-- This must match First3OpsIdx.
- {&PartMappings[PMI_GPR32 - PMI_Min], 1},
- {&PartMappings[PMI_GPR32 - PMI_Min], 1},
- {&PartMappings[PMI_GPR32 - PMI_Min], 1},
- // 3: GPR 64-bit value.
- {&PartMappings[PMI_GPR64 - PMI_Min], 1},
- {&PartMappings[PMI_GPR64 - PMI_Min], 1},
- {&PartMappings[PMI_GPR64 - PMI_Min], 1},
- // 6: FPR 32-bit value.
- {&PartMappings[PMI_FPR32 - PMI_Min], 1},
- {&PartMappings[PMI_FPR32 - PMI_Min], 1},
- {&PartMappings[PMI_FPR32 - PMI_Min], 1},
- // 9: FPR 64-bit value.
- {&PartMappings[PMI_FPR64 - PMI_Min], 1},
- {&PartMappings[PMI_FPR64 - PMI_Min], 1},
- {&PartMappings[PMI_FPR64 - PMI_Min], 1},
- // 12: FPR 128-bit value.
- {&PartMappings[PMI_FPR128 - PMI_Min], 1},
- {&PartMappings[PMI_FPR128 - PMI_Min], 1},
- {&PartMappings[PMI_FPR128 - PMI_Min], 1},
- // 15: FPR 256-bit value.
- {&PartMappings[PMI_FPR256 - PMI_Min], 1},
- {&PartMappings[PMI_FPR256 - PMI_Min], 1},
- {&PartMappings[PMI_FPR256 - PMI_Min], 1},
- // 18: FPR 512-bit value. <-- This must match Last3OpsIdx.
- {&PartMappings[PMI_FPR512 - PMI_Min], 1},
- {&PartMappings[PMI_FPR512 - PMI_Min], 1},
- {&PartMappings[PMI_FPR512 - PMI_Min], 1},
+ // 1: FPR 32-bit value. <-- This must match First3OpsIdx.
+ {&AArch64GenRegisterBankInfo::PartMappings[PMI_FPR32 - PMI_Min], 1},
+ {&AArch64GenRegisterBankInfo::PartMappings[PMI_FPR32 - PMI_Min], 1},
+ {&AArch64GenRegisterBankInfo::PartMappings[PMI_FPR32 - PMI_Min], 1},
+ // 4: FPR 64-bit value.
+ {&AArch64GenRegisterBankInfo::PartMappings[PMI_FPR64 - PMI_Min], 1},
+ {&AArch64GenRegisterBankInfo::PartMappings[PMI_FPR64 - PMI_Min], 1},
+ {&AArch64GenRegisterBankInfo::PartMappings[PMI_FPR64 - PMI_Min], 1},
+ // 7: FPR 128-bit value.
+ {&AArch64GenRegisterBankInfo::PartMappings[PMI_FPR128 - PMI_Min], 1},
+ {&AArch64GenRegisterBankInfo::PartMappings[PMI_FPR128 - PMI_Min], 1},
+ {&AArch64GenRegisterBankInfo::PartMappings[PMI_FPR128 - PMI_Min], 1},
+ // 10: FPR 256-bit value.
+ {&AArch64GenRegisterBankInfo::PartMappings[PMI_FPR256 - PMI_Min], 1},
+ {&AArch64GenRegisterBankInfo::PartMappings[PMI_FPR256 - PMI_Min], 1},
+ {&AArch64GenRegisterBankInfo::PartMappings[PMI_FPR256 - PMI_Min], 1},
+ // 13: FPR 512-bit value.
+ {&AArch64GenRegisterBankInfo::PartMappings[PMI_FPR512 - PMI_Min], 1},
+ {&AArch64GenRegisterBankInfo::PartMappings[PMI_FPR512 - PMI_Min], 1},
+ {&AArch64GenRegisterBankInfo::PartMappings[PMI_FPR512 - PMI_Min], 1},
+ // 16: GPR 32-bit value.
+ {&AArch64GenRegisterBankInfo::PartMappings[PMI_GPR32 - PMI_Min], 1},
+ {&AArch64GenRegisterBankInfo::PartMappings[PMI_GPR32 - PMI_Min], 1},
+ {&AArch64GenRegisterBankInfo::PartMappings[PMI_GPR32 - PMI_Min], 1},
+ // 19: GPR 64-bit value. <-- This must match Last3OpsIdx.
+ {&AArch64GenRegisterBankInfo::PartMappings[PMI_GPR64 - PMI_Min], 1},
+ {&AArch64GenRegisterBankInfo::PartMappings[PMI_GPR64 - PMI_Min], 1},
+ {&AArch64GenRegisterBankInfo::PartMappings[PMI_GPR64 - PMI_Min], 1},
// Cross register bank copies.
- // 21: GPR 32-bit value to FPR 32-bit value. <-- This must match
+ // 22: FPR 32-bit value to GPR 32-bit value. <-- This must match
// FirstCrossRegCpyIdx.
- {&PartMappings[PMI_GPR32 - PMI_Min], 1},
- {&PartMappings[PMI_FPR32 - PMI_Min], 1},
- // 23: GPR 64-bit value to FPR 64-bit value.
- {&PartMappings[PMI_GPR64 - PMI_Min], 1},
- {&PartMappings[PMI_FPR64 - PMI_Min], 1},
- // 25: FPR 32-bit value to GPR 32-bit value.
- {&PartMappings[PMI_FPR32 - PMI_Min], 1},
- {&PartMappings[PMI_GPR32 - PMI_Min], 1},
- // 27: FPR 64-bit value to GPR 64-bit value. <-- This must match
+ {&AArch64GenRegisterBankInfo::PartMappings[PMI_FPR32 - PMI_Min], 1},
+ {&AArch64GenRegisterBankInfo::PartMappings[PMI_GPR32 - PMI_Min], 1},
+ // 24: FPR 64-bit value to GPR 64-bit value.
+ {&AArch64GenRegisterBankInfo::PartMappings[PMI_FPR64 - PMI_Min], 1},
+ {&AArch64GenRegisterBankInfo::PartMappings[PMI_GPR64 - PMI_Min], 1},
+ // 26: FPR 128-bit value to GPR 128-bit value (invalid)
+ {nullptr, 1},
+ {nullptr, 1},
+ // 28: FPR 256-bit value to GPR 256-bit value (invalid)
+ {nullptr, 1},
+ {nullptr, 1},
+ // 30: FPR 512-bit value to GPR 512-bit value (invalid)
+ {nullptr, 1},
+ {nullptr, 1},
+ // 32: GPR 32-bit value to FPR 32-bit value.
+ {&AArch64GenRegisterBankInfo::PartMappings[PMI_GPR32 - PMI_Min], 1},
+ {&AArch64GenRegisterBankInfo::PartMappings[PMI_FPR32 - PMI_Min], 1},
+ // 34: GPR 64-bit value to FPR 64-bit value. <-- This must match
// LastCrossRegCpyIdx.
- {&PartMappings[PMI_FPR64 - PMI_Min], 1},
- {&PartMappings[PMI_GPR64 - PMI_Min], 1}
+ {&AArch64GenRegisterBankInfo::PartMappings[PMI_GPR64 - PMI_Min], 1},
+ {&AArch64GenRegisterBankInfo::PartMappings[PMI_FPR64 - PMI_Min], 1},
};
-/// Get the pointer to the ValueMapping representing the RegisterBank
-/// at \p RBIdx with a size of \p Size.
-///
-/// The returned mapping works for instructions with the same kind of
-/// operands for up to 3 operands.
-///
-/// \pre \p RBIdx != PartialMappingIdx::None
+bool AArch64GenRegisterBankInfo::checkPartialMap(unsigned Idx,
+ unsigned ValStartIdx,
+ unsigned ValLength,
+ const RegisterBank &RB) {
+ const PartialMapping &Map = PartMappings[Idx - PartialMappingIdx::PMI_Min];
+ return Map.StartIdx == ValStartIdx && Map.Length == ValLength &&
+ Map.RegBank == &RB;
+}
+
+bool AArch64GenRegisterBankInfo::checkValueMapImpl(unsigned Idx,
+ unsigned FirstInBank,
+ unsigned Size,
+ unsigned Offset) {
+ unsigned PartialMapBaseIdx = Idx - PartialMappingIdx::PMI_Min;
+ const ValueMapping &Map =
+ AArch64GenRegisterBankInfo::getValueMapping(
+ (PartialMappingIdx)FirstInBank, Size)[Offset];
+ return Map.BreakDown == &PartMappings[PartialMapBaseIdx] &&
+ Map.NumBreakDowns == 1;
+}
+
+bool AArch64GenRegisterBankInfo::checkPartialMappingIdx(
+ PartialMappingIdx FirstAlias, PartialMappingIdx LastAlias,
+ ArrayRef<PartialMappingIdx> Order) {
+ if (Order.front() != FirstAlias)
+ return false;
+ if (Order.back() != LastAlias)
+ return false;
+ if (Order.front() > Order.back())
+ return false;
+
+ PartialMappingIdx Previous = Order.front();
+ bool First = true;
+ for (const auto &Current : Order) {
+ if (First) {
+ First = false;
+ continue;
+ }
+ if (Previous + 1 != Current)
+ return false;
+ Previous = Current;
+ }
+ return true;
+}
+
+unsigned AArch64GenRegisterBankInfo::getRegBankBaseIdxOffset(unsigned RBIdx,
+ unsigned Size) {
+ if (RBIdx == PMI_FirstGPR) {
+ if (Size <= 32)
+ return 0;
+ if (Size <= 64)
+ return 1;
+ return -1;
+ }
+ if (RBIdx == PMI_FirstFPR) {
+ if (Size <= 32)
+ return 0;
+ if (Size <= 64)
+ return 1;
+ if (Size <= 128)
+ return 2;
+ if (Size <= 256)
+ return 3;
+ if (Size <= 512)
+ return 4;
+ return -1;
+ }
+ return -1;
+}
+
const RegisterBankInfo::ValueMapping *
-getValueMapping(PartialMappingIdx RBIdx, unsigned Size) {
+AArch64GenRegisterBankInfo::getValueMapping(PartialMappingIdx RBIdx,
+ unsigned Size) {
assert(RBIdx != PartialMappingIdx::PMI_None && "No mapping needed for that");
- unsigned ValMappingIdx = First3OpsIdx +
- (RBIdx - AArch64::PartialMappingIdx::PMI_Min +
- getRegBankBaseIdxOffset(Size)) *
- ValueMappingIdx::DistanceBetweenRegBanks;
- assert(ValMappingIdx >= AArch64::First3OpsIdx &&
- ValMappingIdx <= AArch64::Last3OpsIdx && "Mapping out of bound");
+ unsigned BaseIdxOffset = getRegBankBaseIdxOffset(RBIdx, Size);
+ if (BaseIdxOffset == -1u)
+ return &ValMappings[InvalidIdx];
+
+ unsigned ValMappingIdx =
+ First3OpsIdx + (RBIdx - PartialMappingIdx::PMI_Min + BaseIdxOffset) *
+ ValueMappingIdx::DistanceBetweenRegBanks;
+ assert(ValMappingIdx >= First3OpsIdx && ValMappingIdx <= Last3OpsIdx &&
+ "Mapping out of bound");
return &ValMappings[ValMappingIdx];
}
-/// Get the pointer to the ValueMapping of the operands of a copy
-/// instruction from a GPR or FPR register to a GPR or FPR register
-/// with a size of \p Size.
-///
-/// If \p DstIsGPR is true, the destination of the copy is on GPR,
-/// otherwise it is on FPR. Same thing for \p SrcIsGPR.
+AArch64GenRegisterBankInfo::PartialMappingIdx
+ AArch64GenRegisterBankInfo::BankIDToCopyMapIdx[]{
+ PMI_None, // CCR
+ PMI_FirstFPR, // FPR
+ PMI_FirstGPR, // GPR
+ };
+
const RegisterBankInfo::ValueMapping *
-getCopyMapping(bool DstIsGPR, bool SrcIsGPR, unsigned Size) {
- PartialMappingIdx DstRBIdx = DstIsGPR ? PMI_FirstGPR : PMI_FirstFPR;
- PartialMappingIdx SrcRBIdx = SrcIsGPR ? PMI_FirstGPR : PMI_FirstFPR;
+AArch64GenRegisterBankInfo::getCopyMapping(unsigned DstBankID,
+ unsigned SrcBankID, unsigned Size) {
+ assert(DstBankID < AArch64::NumRegisterBanks && "Invalid bank ID");
+ assert(SrcBankID < AArch64::NumRegisterBanks && "Invalid bank ID");
+ PartialMappingIdx DstRBIdx = BankIDToCopyMapIdx[DstBankID];
+ PartialMappingIdx SrcRBIdx = BankIDToCopyMapIdx[SrcBankID];
+ assert(DstRBIdx != PMI_None && "No such mapping");
+ assert(SrcRBIdx != PMI_None && "No such mapping");
+
if (DstRBIdx == SrcRBIdx)
return getValueMapping(DstRBIdx, Size);
+
assert(Size <= 64 && "GPR cannot handle that size");
unsigned ValMappingIdx =
FirstCrossRegCpyIdx +
- (DstRBIdx - PMI_Min + getRegBankBaseIdxOffset(Size)) *
+ (DstRBIdx - PMI_Min + getRegBankBaseIdxOffset(DstRBIdx, Size)) *
ValueMappingIdx::DistanceBetweenCrossRegCpy;
- assert(ValMappingIdx >= AArch64::FirstCrossRegCpyIdx &&
- ValMappingIdx <= AArch64::LastCrossRegCpyIdx &&
- "Mapping out of bound");
+ assert(ValMappingIdx >= FirstCrossRegCpyIdx &&
+ ValMappingIdx <= LastCrossRegCpyIdx && "Mapping out of bound");
return &ValMappings[ValMappingIdx];
}
-
-} // End AArch64 namespace.
} // End llvm namespace.
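A worked check of the new indexing, assuming the enum layout the table comments imply (First3OpsIdx = 1, DistanceBetweenRegBanks = 3, FirstCrossRegCpyIdx = 22, DistanceBetweenCrossRegCpy = 2, FPR partial mappings first with PMI_FirstGPR - PMI_Min == 5):

    // getValueMapping(PMI_FirstGPR, 64):
    //   BaseIdxOffset = 1, so ValMappingIdx = 1 + (5 + 1) * 3 = 19,
    //   the "GPR 64-bit value" triple in ValMappings.
    // getCopyMapping(GPR, FPR, 32):
    //   DstRBIdx = PMI_FirstGPR, so ValMappingIdx = 22 + (5 + 0) * 2 = 32,
    //   the {GPR32, FPR32} copy pair (operand 0 being the destination).

Both land on the entries the index comments in ValMappings call out.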
diff --git a/lib/Target/AArch64/AArch64ISelDAGToDAG.cpp b/lib/Target/AArch64/AArch64ISelDAGToDAG.cpp
index 3099383e5b32..ae01ea477bb9 100644
--- a/lib/Target/AArch64/AArch64ISelDAGToDAG.cpp
+++ b/lib/Target/AArch64/AArch64ISelDAGToDAG.cpp
@@ -328,11 +328,52 @@ static AArch64_AM::ShiftExtendType getShiftTypeForNode(SDValue N) {
}
}
+/// \brief Determine whether it is worth it to fold SHL into the addressing
+/// mode.
+static bool isWorthFoldingSHL(SDValue V) {
+ assert(V.getOpcode() == ISD::SHL && "invalid opcode");
+ // It is worth folding a logical shift of up to three places.
+ auto *CSD = dyn_cast<ConstantSDNode>(V.getOperand(1));
+ if (!CSD)
+ return false;
+ unsigned ShiftVal = CSD->getZExtValue();
+ if (ShiftVal > 3)
+ return false;
+
+ // Check if this particular node is reused in any non-memory related
+ // operation. If yes, do not try to fold this node into the address
+ // computation, since the computation will be kept.
+ const SDNode *Node = V.getNode();
+ for (SDNode *UI : Node->uses())
+ if (!isa<MemSDNode>(*UI))
+ for (SDNode *UII : UI->uses())
+ if (!isa<MemSDNode>(*UII))
+ return false;
+ return true;
+}
+
/// \brief Determine whether it is worth it to fold V into an extended register.
bool AArch64DAGToDAGISel::isWorthFolding(SDValue V) const {
- // it hurts if the value is used at least twice, unless we are optimizing
- // for code size.
- return ForCodeSize || V.hasOneUse();
+ // Trivial if we are optimizing for code size or if there is only
+ // one use of the value.
+ if (ForCodeSize || V.hasOneUse())
+ return true;
+ // If a subtarget has a fast-path LSL, we can fold a logical shift into
+ // the addressing mode and save a cycle.
+ if (Subtarget->hasLSLFast() && V.getOpcode() == ISD::SHL &&
+ isWorthFoldingSHL(V))
+ return true;
+ if (Subtarget->hasLSLFast() && V.getOpcode() == ISD::ADD) {
+ const SDValue LHS = V.getOperand(0);
+ const SDValue RHS = V.getOperand(1);
+ if (LHS.getOpcode() == ISD::SHL && isWorthFoldingSHL(LHS))
+ return true;
+ if (RHS.getOpcode() == ISD::SHL && isWorthFoldingSHL(RHS))
+ return true;
+ }
+
+ // It hurts otherwise, since the value will be reused.
+ return false;
}
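An illustrative case for the new heuristic (registers made up; assumes a subtarget with the LSL-fast feature): a shifted index reused by two loads fails the hasOneUse() test and previously kept the shift as a separate instruction,

    lsl x8, x1, #3
    ldr x9, [x0, x8]
    ldr x10, [x2, x8]

whereas folding it into both addressing modes is now considered worthwhile:

    ldr x9, [x0, x1, lsl #3]
    ldr x10, [x2, x1, lsl #3]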
/// SelectShiftedRegister - Select a "shifted register" operand. If the value
diff --git a/lib/Target/AArch64/AArch64ISelLowering.cpp b/lib/Target/AArch64/AArch64ISelLowering.cpp
index 849058bdfbdb..0d3289ac84c3 100644
--- a/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ b/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -29,6 +29,7 @@
#include "llvm/ADT/StringSwitch.h"
#include "llvm/ADT/Triple.h"
#include "llvm/ADT/Twine.h"
+#include "llvm/Analysis/VectorUtils.h"
#include "llvm/CodeGen/CallingConvLower.h"
#include "llvm/CodeGen/MachineBasicBlock.h"
#include "llvm/CodeGen/MachineFrameInfo.h"
@@ -554,8 +555,6 @@ AArch64TargetLowering::AArch64TargetLowering(const TargetMachine &TM,
setSchedulingPreference(Sched::Hybrid);
- // Enable TBZ/TBNZ
- MaskAndBranchFoldingIsLegal = true;
EnableExtLdPromotion = true;
// Set required alignment.
@@ -793,7 +792,7 @@ EVT AArch64TargetLowering::getSetCCResultType(const DataLayout &, LLVMContext &,
/// KnownZero/KnownOne bitsets.
void AArch64TargetLowering::computeKnownBitsForTargetNode(
const SDValue Op, APInt &KnownZero, APInt &KnownOne,
- const SelectionDAG &DAG, unsigned Depth) const {
+ const APInt &DemandedElts, const SelectionDAG &DAG, unsigned Depth) const {
switch (Op.getOpcode()) {
default:
break;
@@ -2113,8 +2112,8 @@ SDValue AArch64TargetLowering::LowerFSINCOS(SDValue Op,
Entry.Node = Arg;
Entry.Ty = ArgTy;
- Entry.isSExt = false;
- Entry.isZExt = false;
+ Entry.IsSExt = false;
+ Entry.IsZExt = false;
Args.push_back(Entry);
const char *LibcallName =
@@ -2124,8 +2123,9 @@ SDValue AArch64TargetLowering::LowerFSINCOS(SDValue Op,
StructType *RetTy = StructType::get(ArgTy, ArgTy, nullptr);
TargetLowering::CallLoweringInfo CLI(DAG);
- CLI.setDebugLoc(dl).setChain(DAG.getEntryNode())
- .setCallee(CallingConv::Fast, RetTy, Callee, std::move(Args));
+ CLI.setDebugLoc(dl)
+ .setChain(DAG.getEntryNode())
+ .setLibCallee(CallingConv::Fast, RetTy, Callee, std::move(Args));
std::pair<SDValue, SDValue> CallResult = LowerCallTo(CLI);
return CallResult.first;
@@ -2231,19 +2231,13 @@ static SDValue skipExtensionForVectorMULL(SDNode *N, SelectionDAG &DAG) {
}
static bool isSignExtended(SDNode *N, SelectionDAG &DAG) {
- if (N->getOpcode() == ISD::SIGN_EXTEND)
- return true;
- if (isExtendedBUILD_VECTOR(N, DAG, true))
- return true;
- return false;
+ return N->getOpcode() == ISD::SIGN_EXTEND ||
+ isExtendedBUILD_VECTOR(N, DAG, true);
}
static bool isZeroExtended(SDNode *N, SelectionDAG &DAG) {
- if (N->getOpcode() == ISD::ZERO_EXTEND)
- return true;
- if (isExtendedBUILD_VECTOR(N, DAG, false))
- return true;
- return false;
+ return N->getOpcode() == ISD::ZERO_EXTEND ||
+ isExtendedBUILD_VECTOR(N, DAG, false);
}
static bool isAddSubSExt(SDNode *N, SelectionDAG &DAG) {
@@ -3578,7 +3572,7 @@ SDValue
AArch64TargetLowering::LowerELFGlobalTLSAddress(SDValue Op,
SelectionDAG &DAG) const {
assert(Subtarget->isTargetELF() && "This function expects an ELF target");
- assert(getTargetMachine().getCodeModel() == CodeModel::Small &&
+ assert(Subtarget->useSmallAddressing() &&
"ELF TLS only supported in small memory model");
// Different choices can be made for the maximum size of the TLS area for a
// module. For the small address model, the default TLS size is 16MiB and the
@@ -3679,7 +3673,7 @@ SDValue AArch64TargetLowering::LowerGlobalTLSAddress(SDValue Op,
SelectionDAG &DAG) const {
if (Subtarget->isTargetDarwin())
return LowerDarwinGlobalTLSAddress(Op, DAG);
- else if (Subtarget->isTargetELF())
+ if (Subtarget->isTargetELF())
return LowerELFGlobalTLSAddress(Op, DAG);
llvm_unreachable("Unexpected platform trying to use TLS");
@@ -4516,7 +4510,12 @@ unsigned AArch64TargetLowering::getRegisterByName(const char* RegName, EVT VT,
SelectionDAG &DAG) const {
unsigned Reg = StringSwitch<unsigned>(RegName)
.Case("sp", AArch64::SP)
+ .Case("x18", AArch64::X18)
+ .Case("w18", AArch64::W18)
.Default(0);
+ if ((Reg == AArch64::X18 || Reg == AArch64::W18) &&
+ !Subtarget->isX18Reserved())
+ Reg = 0;
if (Reg)
return Reg;
report_fatal_error(Twine("Invalid register name \""
@@ -6591,21 +6590,20 @@ FailedModImm:
if (!isConstant && !usesOnlyOneValue) {
SDValue Vec = DAG.getUNDEF(VT);
SDValue Op0 = Op.getOperand(0);
- unsigned ElemSize = VT.getScalarSizeInBits();
unsigned i = 0;
- // For 32 and 64 bit types, use INSERT_SUBREG for lane zero to
+
+ // Use SCALAR_TO_VECTOR for lane zero to
// a) Avoid a RMW dependency on the full vector register, and
// b) Allow the register coalescer to fold away the copy if the
- // value is already in an S or D register.
- // Do not do this for UNDEF/LOAD nodes because we have better patterns
- // for those avoiding the SCALAR_TO_VECTOR/BUILD_VECTOR.
- if (!Op0.isUndef() && Op0.getOpcode() != ISD::LOAD &&
- (ElemSize == 32 || ElemSize == 64)) {
- unsigned SubIdx = ElemSize == 32 ? AArch64::ssub : AArch64::dsub;
- MachineSDNode *N =
- DAG.getMachineNode(TargetOpcode::INSERT_SUBREG, dl, VT, Vec, Op0,
- DAG.getTargetConstant(SubIdx, dl, MVT::i32));
- Vec = SDValue(N, 0);
+ // value is already in an S or D register, and we're forced to emit an
+ // INSERT_SUBREG that we can't fold anywhere.
+ //
+ // We also allow types like i8 and i16 which are illegal scalar but legal
+ // vector element types. After type-legalization the inserted value is
+ // extended (i32) and it is safe to cast them to the vector type by ignoring
+ // the upper bits of the lowest lane (e.g. v8i8, v4i16).
+ if (!Op0.isUndef()) {
+ Vec = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Op0);
++i;
}
for (; i < NumElts; ++i) {
@@ -7249,6 +7247,33 @@ bool AArch64TargetLowering::hasPairedLoad(EVT LoadedType,
return NumBits == 32 || NumBits == 64;
}
+/// A helper function for determining the number of interleaved accesses we
+/// will generate when lowering accesses of the given type.
+unsigned
+AArch64TargetLowering::getNumInterleavedAccesses(VectorType *VecTy,
+ const DataLayout &DL) const {
+ return (DL.getTypeSizeInBits(VecTy) + 127) / 128;
+}
+
+bool AArch64TargetLowering::isLegalInterleavedAccessType(
+ VectorType *VecTy, const DataLayout &DL) const {
+
+ unsigned VecSize = DL.getTypeSizeInBits(VecTy);
+ unsigned ElSize = DL.getTypeSizeInBits(VecTy->getElementType());
+
+ // Ensure the number of vector elements is greater than 1.
+ if (VecTy->getNumElements() < 2)
+ return false;
+
+ // Ensure the element type is legal.
+ if (ElSize != 8 && ElSize != 16 && ElSize != 32 && ElSize != 64)
+ return false;
+
+ // Ensure the total vector size is 64 bits or a multiple of 128 bits. Types
+ // larger than 128 bits will be split into multiple interleaved accesses.
+ return VecSize == 64 || VecSize % 128 == 0;
+}
+
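Restating the two helpers above as plain arithmetic (a self-contained sketch; the free-function names are made up):

#include <cstdio>

// ceil(VecBits / 128): how many 128-bit ldN/stN groups a vector type needs.
static unsigned numInterleavedAccesses(unsigned VecBits) {
  return (VecBits + 127) / 128;
}

static bool isLegalInterleavedType(unsigned NumElts, unsigned ElBits) {
  unsigned VecBits = NumElts * ElBits;
  if (NumElts < 2)
    return false; // need more than one element
  if (ElBits != 8 && ElBits != 16 && ElBits != 32 && ElBits != 64)
    return false; // element type must be legal
  return VecBits == 64 || VecBits % 128 == 0;
}

int main() {
  // v16i32, 512 bits: legal, split into 4 interleaved accesses.
  printf("%d %u\n", isLegalInterleavedType(16, 32), numInterleavedAccesses(512));
  // v8i8, 64 bits: legal as a single access.
  printf("%d %u\n", isLegalInterleavedType(8, 8), numInterleavedAccesses(64));
  // v3i32, 96 bits: rejected (neither 64 nor a multiple of 128).
  printf("%d\n", isLegalInterleavedType(3, 32));
}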
/// \brief Lower an interleaved load into a ldN intrinsic.
///
/// E.g. Lower an interleaved load (Factor = 2):
@@ -7272,12 +7297,15 @@ bool AArch64TargetLowering::lowerInterleavedLoad(
const DataLayout &DL = LI->getModule()->getDataLayout();
VectorType *VecTy = Shuffles[0]->getType();
- unsigned VecSize = DL.getTypeSizeInBits(VecTy);
- // Skip if we do not have NEON and skip illegal vector types.
- if (!Subtarget->hasNEON() || (VecSize != 64 && VecSize != 128))
+ // Skip if we do not have NEON, and skip illegal vector types. We can
+ // "legalize" wide vector types into multiple interleaved accesses as long
+ // as the vector size is a multiple of 128 bits.
+ if (!Subtarget->hasNEON() || !isLegalInterleavedAccessType(VecTy, DL))
return false;
+ unsigned NumLoads = getNumInterleavedAccesses(VecTy, DL);
+
// A pointer vector can not be the return type of the ldN intrinsics. Need to
// load integer vectors first and then convert to pointer vectors.
Type *EltTy = VecTy->getVectorElementType();
@@ -7285,6 +7313,25 @@ bool AArch64TargetLowering::lowerInterleavedLoad(
VecTy =
VectorType::get(DL.getIntPtrType(EltTy), VecTy->getVectorNumElements());
+ IRBuilder<> Builder(LI);
+
+ // The base address of the load.
+ Value *BaseAddr = LI->getPointerOperand();
+
+ if (NumLoads > 1) {
+ // If we're going to generate more than one load, reset the sub-vector type
+ // to something legal.
+ VecTy = VectorType::get(VecTy->getVectorElementType(),
+ VecTy->getVectorNumElements() / NumLoads);
+
+ // We will compute the pointer operand of each load from the original base
+ // address using GEPs. Cast the base address to a pointer to the scalar
+ // element type.
+ BaseAddr = Builder.CreateBitCast(
+ BaseAddr, VecTy->getVectorElementType()->getPointerTo(
+ LI->getPointerAddressSpace()));
+ }
+
Type *PtrTy = VecTy->getPointerTo(LI->getPointerAddressSpace());
Type *Tys[2] = {VecTy, PtrTy};
static const Intrinsic::ID LoadInts[3] = {Intrinsic::aarch64_neon_ld2,
@@ -7293,39 +7340,49 @@ bool AArch64TargetLowering::lowerInterleavedLoad(
Function *LdNFunc =
Intrinsic::getDeclaration(LI->getModule(), LoadInts[Factor - 2], Tys);
- IRBuilder<> Builder(LI);
- Value *Ptr = Builder.CreateBitCast(LI->getPointerOperand(), PtrTy);
+ // Holds sub-vectors extracted from the load intrinsic return values. The
+ // sub-vectors are associated with the shufflevector instructions they will
+ // replace.
+ DenseMap<ShuffleVectorInst *, SmallVector<Value *, 4>> SubVecs;
- CallInst *LdN = Builder.CreateCall(LdNFunc, Ptr, "ldN");
+ for (unsigned LoadCount = 0; LoadCount < NumLoads; ++LoadCount) {
- // Replace uses of each shufflevector with the corresponding vector loaded
- // by ldN.
- for (unsigned i = 0; i < Shuffles.size(); i++) {
- ShuffleVectorInst *SVI = Shuffles[i];
- unsigned Index = Indices[i];
+ // If we're generating more than one load, compute the base address of
+ // subsequent loads as an offset from the previous.
+ if (LoadCount > 0)
+ BaseAddr = Builder.CreateConstGEP1_32(
+ BaseAddr, VecTy->getVectorNumElements() * Factor);
- Value *SubVec = Builder.CreateExtractValue(LdN, Index);
+ CallInst *LdN = Builder.CreateCall(
+ LdNFunc, Builder.CreateBitCast(BaseAddr, PtrTy), "ldN");
- // Convert the integer vector to pointer vector if the element is pointer.
- if (EltTy->isPointerTy())
- SubVec = Builder.CreateIntToPtr(SubVec, SVI->getType());
+ // Extract and store the sub-vectors returned by the load intrinsic.
+ for (unsigned i = 0; i < Shuffles.size(); i++) {
+ ShuffleVectorInst *SVI = Shuffles[i];
+ unsigned Index = Indices[i];
- SVI->replaceAllUsesWith(SubVec);
- }
+ Value *SubVec = Builder.CreateExtractValue(LdN, Index);
- return true;
-}
+ // Convert the integer vector to pointer vector if the element is pointer.
+ if (EltTy->isPointerTy())
+ SubVec = Builder.CreateIntToPtr(SubVec, SVI->getType());
-/// \brief Get a mask consisting of sequential integers starting from \p Start.
-///
-/// I.e. <Start, Start + 1, ..., Start + NumElts - 1>
-static Constant *getSequentialMask(IRBuilder<> &Builder, unsigned Start,
- unsigned NumElts) {
- SmallVector<Constant *, 16> Mask;
- for (unsigned i = 0; i < NumElts; i++)
- Mask.push_back(Builder.getInt32(Start + i));
+ SubVecs[SVI].push_back(SubVec);
+ }
+ }
+
+ // Replace uses of the shufflevector instructions with the sub-vectors
+ // returned by the load intrinsic. If a shufflevector instruction is
+ // associated with more than one sub-vector, those sub-vectors will be
+ // concatenated into a single wide vector.
+ for (ShuffleVectorInst *SVI : Shuffles) {
+ auto &SubVec = SubVecs[SVI];
+ auto *WideVec =
+ SubVec.size() > 1 ? concatenateVectors(Builder, SubVec) : SubVec[0];
+ SVI->replaceAllUsesWith(WideVec);
+ }
- return ConstantVector::get(Mask);
+ return true;
}
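A hedged example of the splitting this enables (types and value names made up, intrinsic mangling elided): for a factor-2 interleave whose shuffles produce <8 x i32>, NumLoads is 2, the sub-vector type is reset to <4 x i32>, and two ld2 calls are emitted, the second based 4 * Factor = 8 elements past the first:

    %ldN.0 = call { <4 x i32>, <4 x i32> } @llvm.aarch64.neon.ld2(...)(%base)
    %gep   = getelementptr i32, i32* %base, i32 8
    %ldN.1 = call { <4 x i32>, <4 x i32> } @llvm.aarch64.neon.ld2(...)(%gep)

Each shuffle then gets its two sub-vectors concatenated back into the original <8 x i32> by concatenateVectors.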
/// \brief Lower an interleaved store into a stN intrinsic.
@@ -7369,12 +7426,15 @@ bool AArch64TargetLowering::lowerInterleavedStore(StoreInst *SI,
VectorType *SubVecTy = VectorType::get(EltTy, LaneLen);
const DataLayout &DL = SI->getModule()->getDataLayout();
- unsigned SubVecSize = DL.getTypeSizeInBits(SubVecTy);
- // Skip if we do not have NEON and skip illegal vector types.
- if (!Subtarget->hasNEON() || (SubVecSize != 64 && SubVecSize != 128))
+ // Skip if we do not have NEON, and skip illegal vector types. We can
+ // "legalize" wide vector types into multiple interleaved accesses as long
+ // as the vector size is a multiple of 128 bits.
+ if (!Subtarget->hasNEON() || !isLegalInterleavedAccessType(SubVecTy, DL))
return false;
+ unsigned NumStores = getNumInterleavedAccesses(SubVecTy, DL);
+
Value *Op0 = SVI->getOperand(0);
Value *Op1 = SVI->getOperand(1);
IRBuilder<> Builder(SI);
@@ -7394,6 +7454,25 @@ bool AArch64TargetLowering::lowerInterleavedStore(StoreInst *SI,
SubVecTy = VectorType::get(IntTy, LaneLen);
}
+ // The base address of the store.
+ Value *BaseAddr = SI->getPointerOperand();
+
+ if (NumStores > 1) {
+ // If we're going to generate more than one store, reset the lane length
+ // and sub-vector type to something legal.
+ LaneLen /= NumStores;
+ SubVecTy = VectorType::get(SubVecTy->getVectorElementType(), LaneLen);
+
+ // We will compute the pointer operand of each store from the original base
+ // address using GEPs. Cast the base address to a pointer to the scalar
+ // element type.
+ BaseAddr = Builder.CreateBitCast(
+ BaseAddr, SubVecTy->getVectorElementType()->getPointerTo(
+ SI->getPointerAddressSpace()));
+ }
+
+ auto Mask = SVI->getShuffleMask();
+
Type *PtrTy = SubVecTy->getPointerTo(SI->getPointerAddressSpace());
Type *Tys[2] = {SubVecTy, PtrTy};
static const Intrinsic::ID StoreInts[3] = {Intrinsic::aarch64_neon_st2,
@@ -7402,34 +7481,43 @@ bool AArch64TargetLowering::lowerInterleavedStore(StoreInst *SI,
Function *StNFunc =
Intrinsic::getDeclaration(SI->getModule(), StoreInts[Factor - 2], Tys);
- SmallVector<Value *, 5> Ops;
+ for (unsigned StoreCount = 0; StoreCount < NumStores; ++StoreCount) {
- // Split the shufflevector operands into sub vectors for the new stN call.
- auto Mask = SVI->getShuffleMask();
- for (unsigned i = 0; i < Factor; i++) {
- if (Mask[i] >= 0) {
- Ops.push_back(Builder.CreateShuffleVector(
- Op0, Op1, getSequentialMask(Builder, Mask[i], LaneLen)));
- } else {
- unsigned StartMask = 0;
- for (unsigned j = 1; j < LaneLen; j++) {
- if (Mask[j*Factor + i] >= 0) {
- StartMask = Mask[j*Factor + i] - j;
- break;
+ SmallVector<Value *, 5> Ops;
+
+ // Split the shufflevector operands into sub vectors for the new stN call.
+ for (unsigned i = 0; i < Factor; i++) {
+ unsigned IdxI = StoreCount * LaneLen * Factor + i;
+ if (Mask[IdxI] >= 0) {
+ Ops.push_back(Builder.CreateShuffleVector(
+ Op0, Op1, createSequentialMask(Builder, Mask[IdxI], LaneLen, 0)));
+ } else {
+ unsigned StartMask = 0;
+ for (unsigned j = 1; j < LaneLen; j++) {
+ unsigned IdxJ = StoreCount * LaneLen * Factor + j;
+ if (Mask[IdxJ * Factor + IdxI] >= 0) {
+ StartMask = Mask[IdxJ * Factor + IdxI] - IdxJ;
+ break;
+ }
}
+ // Note: Filling undef gaps with random elements is ok, since
+ // those elements were being written anyway (with undefs).
+ // If all elements are undef, we default to using elements from 0.
+ // Note: StartMask cannot be negative; that is checked in
+ // isReInterleaveMask.
+ Ops.push_back(Builder.CreateShuffleVector(
+ Op0, Op1, createSequentialMask(Builder, StartMask, LaneLen, 0)));
}
- // Note: If all elements in a chunk are undefs, StartMask=0!
- // Note: Filling undef gaps with random elements is ok, since
- // those elements were being written anyway (with undefs).
- // In the case of all undefs we're defaulting to using elems from 0
- // Note: StartMask cannot be negative, it's checked in isReInterleaveMask
- Ops.push_back(Builder.CreateShuffleVector(
- Op0, Op1, getSequentialMask(Builder, StartMask, LaneLen)));
}
- }
- Ops.push_back(Builder.CreateBitCast(SI->getPointerOperand(), PtrTy));
- Builder.CreateCall(StNFunc, Ops);
+ // If we're generating more than one store, compute the base address of
+ // subsequent stores as an offset from the previous.
+ if (StoreCount > 0)
+ BaseAddr = Builder.CreateConstGEP1_32(BaseAddr, LaneLen * Factor);
+
+ Ops.push_back(Builder.CreateBitCast(BaseAddr, PtrTy));
+ Builder.CreateCall(StNFunc, Ops);
+ }
return true;
}
@@ -7690,7 +7778,7 @@ SDValue
AArch64TargetLowering::BuildSDIVPow2(SDNode *N, const APInt &Divisor,
SelectionDAG &DAG,
std::vector<SDNode *> *Created) const {
- AttributeSet Attr = DAG.getMachineFunction().getFunction()->getAttributes();
+ AttributeList Attr = DAG.getMachineFunction().getFunction()->getAttributes();
if (isIntDivCheap(N->getValueType(0), Attr))
return SDValue(N,0); // Lower SDIV as SDIV
@@ -9267,7 +9355,7 @@ static SDValue performSTORECombine(SDNode *N,
return SDValue();
}
- /// This function handles the log2-shuffle pattern produced by the
+/// This function handles the log2-shuffle pattern produced by the
/// LoopVectorizer for across-vector reductions. It consists of
/// log2(NumVectorElements) steps and, in each step, 2^(s) elements
/// are reduced, where s is an induction variable from 0 to
@@ -10483,9 +10571,9 @@ void AArch64TargetLowering::ReplaceNodeResults(
}
bool AArch64TargetLowering::useLoadStackGuardNode() const {
- if (!Subtarget->isTargetAndroid())
- return true;
- return TargetLowering::useLoadStackGuardNode();
+ if (Subtarget->isTargetAndroid() || Subtarget->isTargetFuchsia())
+ return TargetLowering::useLoadStackGuardNode();
+ return true;
}
unsigned AArch64TargetLowering::combineRepeatedFPDivisors() const {
@@ -10623,36 +10711,56 @@ bool AArch64TargetLowering::shouldNormalizeToSelectSequence(LLVMContext &,
return false;
}
-Value *AArch64TargetLowering::getIRStackGuard(IRBuilder<> &IRB) const {
- if (!Subtarget->isTargetAndroid())
- return TargetLowering::getIRStackGuard(IRB);
-
- // Android provides a fixed TLS slot for the stack cookie. See the definition
- // of TLS_SLOT_STACK_GUARD in
- // https://android.googlesource.com/platform/bionic/+/master/libc/private/bionic_tls.h
- const unsigned TlsOffset = 0x28;
+static Value *UseTlsOffset(IRBuilder<> &IRB, unsigned Offset) {
Module *M = IRB.GetInsertBlock()->getParent()->getParent();
Function *ThreadPointerFunc =
Intrinsic::getDeclaration(M, Intrinsic::thread_pointer);
return IRB.CreatePointerCast(
- IRB.CreateConstGEP1_32(IRB.CreateCall(ThreadPointerFunc), TlsOffset),
+ IRB.CreateConstGEP1_32(IRB.CreateCall(ThreadPointerFunc), Offset),
Type::getInt8PtrTy(IRB.getContext())->getPointerTo(0));
}
-Value *AArch64TargetLowering::getSafeStackPointerLocation(IRBuilder<> &IRB) const {
- if (!Subtarget->isTargetAndroid())
- return TargetLowering::getSafeStackPointerLocation(IRB);
+Value *AArch64TargetLowering::getIRStackGuard(IRBuilder<> &IRB) const {
+ // Android provides a fixed TLS slot for the stack cookie. See the definition
+ // of TLS_SLOT_STACK_GUARD in
+ // https://android.googlesource.com/platform/bionic/+/master/libc/private/bionic_tls.h
+ if (Subtarget->isTargetAndroid())
+ return UseTlsOffset(IRB, 0x28);
+ // Fuchsia is similar.
+ // <magenta/tls.h> defines MX_TLS_STACK_GUARD_OFFSET with this value.
+ if (Subtarget->isTargetFuchsia())
+ return UseTlsOffset(IRB, -0x10);
+
+ return TargetLowering::getIRStackGuard(IRB);
+}
+
+Value *AArch64TargetLowering::getSafeStackPointerLocation(IRBuilder<> &IRB) const {
// Android provides a fixed TLS slot for the SafeStack pointer. See the
// definition of TLS_SLOT_SAFESTACK in
// https://android.googlesource.com/platform/bionic/+/master/libc/private/bionic_tls.h
- const unsigned TlsOffset = 0x48;
- Module *M = IRB.GetInsertBlock()->getParent()->getParent();
- Function *ThreadPointerFunc =
- Intrinsic::getDeclaration(M, Intrinsic::thread_pointer);
- return IRB.CreatePointerCast(
- IRB.CreateConstGEP1_32(IRB.CreateCall(ThreadPointerFunc), TlsOffset),
- Type::getInt8PtrTy(IRB.getContext())->getPointerTo(0));
+ if (Subtarget->isTargetAndroid())
+ return UseTlsOffset(IRB, 0x48);
+
+ // Fuchsia is similar.
+ // <magenta/tls.h> defines MX_TLS_UNSAFE_SP_OFFSET with this value.
+ if (Subtarget->isTargetFuchsia())
+ return UseTlsOffset(IRB, -0x8);
+
+ return TargetLowering::getSafeStackPointerLocation(IRB);
+}
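Both hooks funnel through UseTlsOffset. As a sketch, the IR it produces for the Android stack-guard slot (offset 0x28) looks roughly like the following; the value names are illustrative, not taken from the patch:

    // Roughly what UseTlsOffset(IRB, 0x28) emits:
    //   %tp   = call i8* @llvm.thread.pointer()
    //   %slot = getelementptr i8, i8* %tp, i32 40   ; 0x28 bytes past TPIDR_EL0
    //   %gp   = bitcast i8* %slot to i8**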
+
+bool AArch64TargetLowering::isMaskAndCmp0FoldingBeneficial(
+ const Instruction &AndI) const {
+ // Only sink an 'and' mask to the cmp use block if it is masking a single
+ // bit, since the and/cmp/br is then likely to fold into a single tbz. It
+ // may be beneficial to sink in other cases, but we would have to check that
+ // the cmp would not get folded into the br to form a cbz for these to be
+ // beneficial.
+ ConstantInt* Mask = dyn_cast<ConstantInt>(AndI.getOperand(1));
+ if (!Mask)
+ return false;
+ return Mask->getUniqueInteger().isPowerOf2();
}
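For intuition, an assumed source-level example of the single-bit case:

    // For 'if (x & 8)', the mask is one bit, so sinking the 'and' into the
    // compare's block lets ISel fold and/cmp/br into a single instruction:
    //   tbnz w0, #3, .Ltarget
    // A multi-bit mask such as 0x6 would still need and + cbnz, so we
    // return false for it.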
void AArch64TargetLowering::initializeSplitCSR(MachineBasicBlock *Entry) const {
@@ -10702,7 +10810,7 @@ void AArch64TargetLowering::insertCopiesSplitCSR(
}
}
-bool AArch64TargetLowering::isIntDivCheap(EVT VT, AttributeSet Attr) const {
+bool AArch64TargetLowering::isIntDivCheap(EVT VT, AttributeList Attr) const {
// Integer division on AArch64 is expensive. However, when aggressively
// optimizing for code size, we prefer to use a div instruction, as it is
// usually smaller than the alternative sequence.
@@ -10711,6 +10819,14 @@ bool AArch64TargetLowering::isIntDivCheap(EVT VT, AttributeSet Attr) const {
// size, because it will have to be scalarized, while the alternative code
// sequence can be performed in vector form.
bool OptSize =
- Attr.hasAttribute(AttributeSet::FunctionIndex, Attribute::MinSize);
+ Attr.hasAttribute(AttributeList::FunctionIndex, Attribute::MinSize);
return OptSize && !VT.isVector();
}
+
+unsigned
+AArch64TargetLowering::getVaListSizeInBits(const DataLayout &DL) const {
+ if (Subtarget->isTargetDarwin())
+ return getPointerTy(DL).getSizeInBits();
+
+ return 3 * getPointerTy(DL).getSizeInBits() + 2 * 32;
+}
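The 3-pointers-plus-2-ints figure is the standard AAPCS64 va_list layout; on Darwin, va_list is just a char*, hence a single pointer's width. A sketch of the AAPCS64 structure (the field names follow the ABI convention, not this patch):

    // 3 * 64 + 2 * 32 = 256 bits = 32 bytes on LP64 targets.
    struct va_list_aapcs64 {
      void *__stack;   // next stacked-argument address
      void *__gr_top;  // end of the general-register save area
      void *__vr_top;  // end of the FP/SIMD-register save area
      int __gr_offs;   // offset from __gr_top to the next GP register arg
      int __vr_offs;   // offset from __vr_top to the next FP/SIMD register arg
    };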
diff --git a/lib/Target/AArch64/AArch64ISelLowering.h b/lib/Target/AArch64/AArch64ISelLowering.h
index 054ccc31674f..2ad6c8b23df8 100644
--- a/lib/Target/AArch64/AArch64ISelLowering.h
+++ b/lib/Target/AArch64/AArch64ISelLowering.h
@@ -251,7 +251,8 @@ public:
/// Determine which of the bits specified in Mask are known to be either zero
/// or one and return them in the KnownZero/KnownOne bitsets.
void computeKnownBitsForTargetNode(const SDValue Op, APInt &KnownZero,
- APInt &KnownOne, const SelectionDAG &DAG,
+ APInt &KnownOne, const APInt &DemandedElts,
+ const SelectionDAG &DAG,
unsigned Depth = 0) const override;
MVT getScalarShiftAmountTy(const DataLayout &DL, EVT) const override;
@@ -402,7 +403,7 @@ public:
return AArch64::X1;
}
- bool isIntDivCheap(EVT VT, AttributeSet Attr) const override;
+ bool isIntDivCheap(EVT VT, AttributeList Attr) const override;
bool isCheapToSpeculateCttz() const override {
return true;
@@ -412,6 +413,8 @@ public:
return true;
}
+ bool isMaskAndCmp0FoldingBeneficial(const Instruction &AndI) const override;
+
bool hasAndNotCompare(SDValue) const override {
// 'bics'
return true;
@@ -435,6 +438,20 @@ public:
return true;
}
+ /// Returns the size of the platform's va_list object.
+ unsigned getVaListSizeInBits(const DataLayout &DL) const override;
+
+ /// Returns true if \p VecTy is a legal interleaved access type. This
+ /// function checks the vector element type and the overall width of the
+ /// vector.
+ bool isLegalInterleavedAccessType(VectorType *VecTy,
+ const DataLayout &DL) const;
+
+ /// Returns the number of interleaved accesses that will be generated when
+ /// lowering accesses of the given type.
+ unsigned getNumInterleavedAccesses(VectorType *VecTy,
+ const DataLayout &DL) const;
+
private:
bool isExtFreeImpl(const Instruction *Ext) const override;
diff --git a/lib/Target/AArch64/AArch64InstrFormats.td b/lib/Target/AArch64/AArch64InstrFormats.td
index cefdf51b50d2..16be4432b160 100644
--- a/lib/Target/AArch64/AArch64InstrFormats.td
+++ b/lib/Target/AArch64/AArch64InstrFormats.td
@@ -39,6 +39,9 @@ class AArch64Inst<Format f, string cstr> : Instruction {
let Constraints = cstr;
}
+class InstSubst<string Asm, dag Result, bit EmitPriority = 0>
+ : InstAlias<Asm, Result, EmitPriority>, Requires<[UseNegativeImmediates]>;
+
// Pseudo instructions (don't have encoding information)
class Pseudo<dag oops, dag iops, list<dag> pattern, string cstr = "">
: AArch64Inst<PseudoFrm, cstr> {
@@ -257,6 +260,7 @@ def am_indexed7s128 : ComplexPattern<i64, 2, "SelectAddrModeIndexed7S128", []>;
class AsmImmRange<int Low, int High> : AsmOperandClass {
let Name = "Imm" # Low # "_" # High;
let DiagnosticType = "InvalidImm" # Low # "_" # High;
+ let PredicateMethod = "isImmInRange<" # Low # "," # High # ">";
}
def Imm1_8Operand : AsmImmRange<1, 8>;
@@ -264,6 +268,20 @@ def Imm1_16Operand : AsmImmRange<1, 16>;
def Imm1_32Operand : AsmImmRange<1, 32>;
def Imm1_64Operand : AsmImmRange<1, 64>;
+class BranchTarget<int N> : AsmOperandClass {
+ let Name = "BranchTarget" # N;
+ let DiagnosticType = "InvalidLabel";
+ let PredicateMethod = "isBranchTarget<" # N # ">";
+}
+
+class PCRelLabel<int N> : BranchTarget<N> {
+ let Name = "PCRelLabel" # N;
+}
+
+def BranchTarget14Operand : BranchTarget<14>;
+def BranchTarget26Operand : BranchTarget<26>;
+def PCRelLabel19Operand : PCRelLabel<19>;
+
def MovZSymbolG3AsmOperand : AsmOperandClass {
let Name = "MovZSymbolG3";
let RenderMethod = "addImmOperands";
@@ -500,7 +518,8 @@ def imm0_65535 : Operand<i32>, ImmLeaf<i32, [{
}
// imm0_255 predicate - True if the immediate is in the range [0,255].
-def Imm0_255Operand : AsmOperandClass { let Name = "Imm0_255"; }
+def Imm0_255Operand : AsmImmRange<0,255>;
+
def imm0_255 : Operand<i32>, ImmLeaf<i32, [{
return ((uint32_t)Imm) < 256;
}]> {
@@ -673,6 +692,14 @@ def addsub_shifted_imm64 : addsub_shifted_imm<i64>;
def addsub_shifted_imm32_neg : addsub_shifted_imm_neg<i32>;
def addsub_shifted_imm64_neg : addsub_shifted_imm_neg<i64>;
+def gi_addsub_shifted_imm32 :
+ GIComplexOperandMatcher<s32, (ops i32imm, i32imm), "selectArithImmed">,
+ GIComplexPatternEquiv<addsub_shifted_imm32>;
+
+def gi_addsub_shifted_imm64 :
+ GIComplexOperandMatcher<s64, (ops i32imm, i32imm), "selectArithImmed">,
+ GIComplexPatternEquiv<addsub_shifted_imm64>;
+
class neg_addsub_shifted_imm<ValueType Ty>
: Operand<Ty>, ComplexPattern<Ty, 2, "SelectNegArithImmed", [imm]> {
let PrintMethod = "printAddSubImm";
@@ -1094,10 +1121,6 @@ def inv_ccode : Operand<i32> {
// Conditional branch target. 19-bit immediate. The low two bits of the target
// offset are implied zero and so are not part of the immediate.
-def PCRelLabel19Operand : AsmOperandClass {
- let Name = "PCRelLabel19";
- let DiagnosticType = "InvalidLabel";
-}
def am_brcond : Operand<OtherVT> {
let EncoderMethod = "getCondBranchTargetOpValue";
let DecoderMethod = "DecodePCRelLabel19";
@@ -1154,9 +1177,6 @@ multiclass CmpBranch<bit op, string asm, SDNode node> {
//---
// Test-and-branch target. 14-bit sign-extended immediate. The low two bits of
// the target offset are implied zero and so are not part of the immediate.
-def BranchTarget14Operand : AsmOperandClass {
- let Name = "BranchTarget14";
-}
def am_tbrcond : Operand<OtherVT> {
let EncoderMethod = "getTestBranchTargetOpValue";
let PrintMethod = "printAlignedLabel";
@@ -1166,11 +1186,12 @@ def am_tbrcond : Operand<OtherVT> {
// AsmOperand classes to emit (or not) special diagnostics
def TBZImm0_31Operand : AsmOperandClass {
let Name = "TBZImm0_31";
- let PredicateMethod = "isImm0_31";
+ let PredicateMethod = "isImmInRange<0,31>";
let RenderMethod = "addImm0_31Operands";
}
def TBZImm32_63Operand : AsmOperandClass {
let Name = "Imm32_63";
+ let PredicateMethod = "isImmInRange<32,63>";
let DiagnosticType = "InvalidImm0_63";
}
@@ -1232,10 +1253,6 @@ multiclass TestBranch<bit op, string asm, SDNode node> {
//---
// Unconditional branch (immediate) instructions.
//---
-def BranchTarget26Operand : AsmOperandClass {
- let Name = "BranchTarget26";
- let DiagnosticType = "InvalidLabel";
-}
def am_b_target : Operand<OtherVT> {
let EncoderMethod = "getBranchTargetOpValue";
let PrintMethod = "printAlignedLabel";
@@ -1784,10 +1801,10 @@ multiclass AddSub<bit isSub, string mnemonic, string alias,
}
// add Rd, Rb, -imm -> sub Rd, Rn, imm
- def : InstAlias<alias#"\t$Rd, $Rn, $imm",
+ def : InstSubst<alias#"\t$Rd, $Rn, $imm",
(!cast<Instruction>(NAME # "Wri") GPR32sp:$Rd, GPR32sp:$Rn,
addsub_shifted_imm32_neg:$imm), 0>;
- def : InstAlias<alias#"\t$Rd, $Rn, $imm",
+ def : InstSubst<alias#"\t$Rd, $Rn, $imm",
(!cast<Instruction>(NAME # "Xri") GPR64sp:$Rd, GPR64sp:$Rn,
addsub_shifted_imm64_neg:$imm), 0>;
@@ -1859,10 +1876,10 @@ multiclass AddSubS<bit isSub, string mnemonic, SDNode OpNode, string cmp,
} // Defs = [NZCV]
// Support negative immediates, e.g. adds Rd, Rn, -imm -> subs Rd, Rn, imm
- def : InstAlias<alias#"\t$Rd, $Rn, $imm",
+ def : InstSubst<alias#"\t$Rd, $Rn, $imm",
(!cast<Instruction>(NAME # "Wri") GPR32:$Rd, GPR32sp:$Rn,
addsub_shifted_imm32_neg:$imm), 0>;
- def : InstAlias<alias#"\t$Rd, $Rn, $imm",
+ def : InstSubst<alias#"\t$Rd, $Rn, $imm",
(!cast<Instruction>(NAME # "Xri") GPR64:$Rd, GPR64sp:$Rn,
addsub_shifted_imm64_neg:$imm), 0>;
@@ -1883,9 +1900,9 @@ multiclass AddSubS<bit isSub, string mnemonic, SDNode OpNode, string cmp,
XZR, GPR64:$src1, GPR64:$src2, arith_shift64:$sh), 4>;
// Support negative immediates, e.g. cmp Rn, -imm -> cmn Rn, imm
- def : InstAlias<cmpAlias#"\t$src, $imm", (!cast<Instruction>(NAME#"Wri")
+ def : InstSubst<cmpAlias#"\t$src, $imm", (!cast<Instruction>(NAME#"Wri")
WZR, GPR32sp:$src, addsub_shifted_imm32_neg:$imm), 0>;
- def : InstAlias<cmpAlias#"\t$src, $imm", (!cast<Instruction>(NAME#"Xri")
+ def : InstSubst<cmpAlias#"\t$src, $imm", (!cast<Instruction>(NAME#"Xri")
XZR, GPR64sp:$src, addsub_shifted_imm64_neg:$imm), 0>;
// Compare shorthands
@@ -2100,10 +2117,10 @@ multiclass LogicalImm<bits<2> opc, string mnemonic, SDNode OpNode,
let Inst{31} = 1;
}
- def : InstAlias<Alias # "\t$Rd, $Rn, $imm",
+ def : InstSubst<Alias # "\t$Rd, $Rn, $imm",
(!cast<Instruction>(NAME # "Wri") GPR32sp:$Rd, GPR32:$Rn,
logical_imm32_not:$imm), 0>;
- def : InstAlias<Alias # "\t$Rd, $Rn, $imm",
+ def : InstSubst<Alias # "\t$Rd, $Rn, $imm",
(!cast<Instruction>(NAME # "Xri") GPR64sp:$Rd, GPR64:$Rn,
logical_imm64_not:$imm), 0>;
}
@@ -2122,10 +2139,10 @@ multiclass LogicalImmS<bits<2> opc, string mnemonic, SDNode OpNode,
}
} // end Defs = [NZCV]
- def : InstAlias<Alias # "\t$Rd, $Rn, $imm",
+ def : InstSubst<Alias # "\t$Rd, $Rn, $imm",
(!cast<Instruction>(NAME # "Wri") GPR32:$Rd, GPR32:$Rn,
logical_imm32_not:$imm), 0>;
- def : InstAlias<Alias # "\t$Rd, $Rn, $imm",
+ def : InstSubst<Alias # "\t$Rd, $Rn, $imm",
(!cast<Instruction>(NAME # "Xri") GPR64:$Rd, GPR64:$Rn,
logical_imm64_not:$imm), 0>;
}
@@ -2454,7 +2471,7 @@ class PrefetchUI<bits<2> sz, bit V, bits<2> opc, string asm, list<dag> pat>
// Load literal address: 19-bit immediate. The low two bits of the target
// offset are implied zero and so are not part of the immediate.
-def am_ldrlit : Operand<OtherVT> {
+def am_ldrlit : Operand<iPTR> {
let EncoderMethod = "getLoadLiteralOpValue";
let DecoderMethod = "DecodePCRelLabel19";
let PrintMethod = "printAlignedLabel";
@@ -9060,7 +9077,7 @@ multiclass SIMDLdSt4SingleAliases<string asm> {
// AdvSIMD v8.1 Rounding Double Multiply Add/Subtract
//----------------------------------------------------------------------------
-let Predicates = [HasNEON, HasV8_1a] in {
+let Predicates = [HasNEON, HasRDM] in {
class BaseSIMDThreeSameVectorTiedR0<bit Q, bit U, bits<2> size, bits<5> opcode,
RegisterOperand regtype, string asm,
@@ -9221,7 +9238,7 @@ multiclass SIMDIndexedSQRDMLxHSDTied<bit U, bits<4> opc, string asm,
let Inst{21} = idx{0};
}
}
-} // let Predicates = [HasNeon, HasV8_1a]
+} // let Predicates = [HasNeon, HasRDM]
//----------------------------------------------------------------------------
// Crypto extensions
diff --git a/lib/Target/AArch64/AArch64InstrInfo.cpp b/lib/Target/AArch64/AArch64InstrInfo.cpp
index 4c789926e3e4..41fc8eceab5c 100644
--- a/lib/Target/AArch64/AArch64InstrInfo.cpp
+++ b/lib/Target/AArch64/AArch64InstrInfo.cpp
@@ -12,6 +12,7 @@
//===----------------------------------------------------------------------===//
#include "AArch64InstrInfo.h"
+#include "AArch64MachineFunctionInfo.h"
#include "AArch64Subtarget.h"
#include "MCTargetDesc/AArch64AddressingModes.h"
#include "Utils/AArch64BaseInfo.h"
@@ -369,7 +370,7 @@ void AArch64InstrInfo::instantiateCondBranch(
// Folded compare-and-branch
// Note that we use addOperand instead of addReg to keep the flags.
const MachineInstrBuilder MIB =
- BuildMI(&MBB, DL, get(Cond[1].getImm())).addOperand(Cond[2]);
+ BuildMI(&MBB, DL, get(Cond[1].getImm())).add(Cond[2]);
if (Cond.size() > 3)
MIB.addImm(Cond[3].getImm());
MIB.addMBB(TBB);
@@ -762,6 +763,17 @@ bool AArch64InstrInfo::isAsCheapAsAMove(const MachineInstr &MI) const {
llvm_unreachable("Unknown opcode to check as cheap as a move!");
}
+bool AArch64InstrInfo::isFalkorLSLFast(const MachineInstr &MI) const {
+ if (MI.getNumOperands() < 4)
+ return false;
+ unsigned ShOpVal = MI.getOperand(3).getImm();
+ unsigned ShImm = AArch64_AM::getShiftValue(ShOpVal);
+ if (AArch64_AM::getShiftType(ShOpVal) == AArch64_AM::LSL &&
+ ShImm < 4)
+ return true;
+ return false;
+}
+
bool AArch64InstrInfo::isCoalescableExtInstr(const MachineInstr &MI,
unsigned &SrcReg, unsigned &DstReg,
unsigned &SubIdx) const {
@@ -1299,16 +1311,16 @@ bool AArch64InstrInfo::expandPostRAPseudo(MachineInstr &MI) const {
.addMemOperand(*MI.memoperands_begin());
} else if (TM.getCodeModel() == CodeModel::Large) {
BuildMI(MBB, MI, DL, get(AArch64::MOVZXi), Reg)
- .addGlobalAddress(GV, 0, AArch64II::MO_G3).addImm(48);
+ .addGlobalAddress(GV, 0, AArch64II::MO_G0 | MO_NC).addImm(0);
BuildMI(MBB, MI, DL, get(AArch64::MOVKXi), Reg)
.addReg(Reg, RegState::Kill)
- .addGlobalAddress(GV, 0, AArch64II::MO_G2 | MO_NC).addImm(32);
+ .addGlobalAddress(GV, 0, AArch64II::MO_G1 | MO_NC).addImm(16);
BuildMI(MBB, MI, DL, get(AArch64::MOVKXi), Reg)
.addReg(Reg, RegState::Kill)
- .addGlobalAddress(GV, 0, AArch64II::MO_G1 | MO_NC).addImm(16);
+ .addGlobalAddress(GV, 0, AArch64II::MO_G2 | MO_NC).addImm(32);
BuildMI(MBB, MI, DL, get(AArch64::MOVKXi), Reg)
.addReg(Reg, RegState::Kill)
- .addGlobalAddress(GV, 0, AArch64II::MO_G0 | MO_NC).addImm(0);
+ .addGlobalAddress(GV, 0, AArch64II::MO_G3).addImm(48);
BuildMI(MBB, MI, DL, get(AArch64::LDRXui), Reg)
.addReg(Reg, RegState::Kill)
.addImm(0)
@@ -1345,14 +1357,6 @@ bool AArch64InstrInfo::hasShiftedReg(const MachineInstr &MI) const {
case AArch64::BICSXrs:
case AArch64::BICWrs:
case AArch64::BICXrs:
- case AArch64::CRC32Brr:
- case AArch64::CRC32CBrr:
- case AArch64::CRC32CHrr:
- case AArch64::CRC32CWrr:
- case AArch64::CRC32CXrr:
- case AArch64::CRC32Hrr:
- case AArch64::CRC32Wrr:
- case AArch64::CRC32Xrr:
case AArch64::EONWrs:
case AArch64::EONXrs:
case AArch64::EORWrs:
@@ -1691,16 +1695,59 @@ bool AArch64InstrInfo::getMemOpBaseRegImmOfsWidth(
} else
return false;
- // Offset is calculated as the immediate operand multiplied by the scaling factor.
- // Unscaled instructions have scaling factor set to 1.
+ // Get the scaling factor and the access width for this instruction.
unsigned Scale = 0;
- switch (LdSt.getOpcode()) {
+ int64_t Dummy1, Dummy2;
+
+ // If this returns false, then it's an instruction we don't want to handle.
+ if (!getMemOpInfo(LdSt.getOpcode(), Scale, Width, Dummy1, Dummy2))
+ return false;
+
+ // Compute the offset. Offset is calculated as the immediate operand
+ // multiplied by the scaling factor. Unscaled instructions have scaling factor
+ // set to 1.
+ if (LdSt.getNumExplicitOperands() == 3) {
+ BaseReg = LdSt.getOperand(1).getReg();
+ Offset = LdSt.getOperand(2).getImm() * Scale;
+ } else {
+ assert(LdSt.getNumExplicitOperands() == 4 && "invalid number of operands");
+ BaseReg = LdSt.getOperand(2).getReg();
+ Offset = LdSt.getOperand(3).getImm() * Scale;
+ }
+ return true;
+}
+
+MachineOperand&
+AArch64InstrInfo::getMemOpBaseRegImmOfsOffsetOperand(MachineInstr &LdSt) const {
+ assert(LdSt.mayLoadOrStore() && "Expected a memory operation.");
+ MachineOperand &OfsOp = LdSt.getOperand(LdSt.getNumExplicitOperands()-1);
+ assert(OfsOp.isImm() && "Offset operand wasn't immediate.");
+ return OfsOp;
+}
+
+bool AArch64InstrInfo::getMemOpInfo(unsigned Opcode, unsigned &Scale,
+ unsigned &Width, int64_t &MinOffset,
+ int64_t &MaxOffset) const {
+ switch (Opcode) {
+ // Not a memory operation or something we want to handle.
default:
+ Scale = Width = 0;
+ MinOffset = MaxOffset = 0;
return false;
+ case AArch64::STRWpost:
+ case AArch64::LDRWpost:
+ Width = 32;
+ Scale = 4;
+ MinOffset = -256;
+ MaxOffset = 255;
+ break;
case AArch64::LDURQi:
case AArch64::STURQi:
Width = 16;
Scale = 1;
+ MinOffset = -256;
+ MaxOffset = 255;
break;
case AArch64::LDURXi:
case AArch64::LDURDi:
@@ -1708,6 +1755,8 @@ bool AArch64InstrInfo::getMemOpBaseRegImmOfsWidth(
case AArch64::STURDi:
Width = 8;
Scale = 1;
+ MinOffset = -256;
+ MaxOffset = 255;
break;
case AArch64::LDURWi:
case AArch64::LDURSi:
@@ -1716,6 +1765,8 @@ bool AArch64InstrInfo::getMemOpBaseRegImmOfsWidth(
case AArch64::STURSi:
Width = 4;
Scale = 1;
+ MinOffset = -256;
+ MaxOffset = 255;
break;
case AArch64::LDURHi:
case AArch64::LDURHHi:
@@ -1725,6 +1776,8 @@ bool AArch64InstrInfo::getMemOpBaseRegImmOfsWidth(
case AArch64::STURHHi:
Width = 2;
Scale = 1;
+ MinOffset = -256;
+ MaxOffset = 255;
break;
case AArch64::LDURBi:
case AArch64::LDURBBi:
@@ -1734,6 +1787,8 @@ bool AArch64InstrInfo::getMemOpBaseRegImmOfsWidth(
case AArch64::STURBBi:
Width = 1;
Scale = 1;
+ MinOffset = -256;
+ MaxOffset = 255;
break;
case AArch64::LDPQi:
case AArch64::LDNPQi:
@@ -1741,10 +1796,14 @@ bool AArch64InstrInfo::getMemOpBaseRegImmOfsWidth(
case AArch64::STNPQi:
Scale = 16;
Width = 32;
+ MinOffset = -64;
+ MaxOffset = 63;
break;
case AArch64::LDRQui:
case AArch64::STRQui:
Scale = Width = 16;
+ MinOffset = 0;
+ MaxOffset = 4095;
break;
case AArch64::LDPXi:
case AArch64::LDPDi:
@@ -1756,12 +1815,16 @@ bool AArch64InstrInfo::getMemOpBaseRegImmOfsWidth(
case AArch64::STNPDi:
Scale = 8;
Width = 16;
+ MinOffset = -64;
+ MaxOffset = 63;
break;
case AArch64::LDRXui:
case AArch64::LDRDui:
case AArch64::STRXui:
case AArch64::STRDui:
Scale = Width = 8;
+ MinOffset = 0;
+ MaxOffset = 4095;
break;
case AArch64::LDPWi:
case AArch64::LDPSi:
@@ -1773,6 +1836,8 @@ bool AArch64InstrInfo::getMemOpBaseRegImmOfsWidth(
case AArch64::STNPSi:
Scale = 4;
Width = 8;
+ MinOffset = -64;
+ MaxOffset = 63;
break;
case AArch64::LDRWui:
case AArch64::LDRSui:
@@ -1780,29 +1845,27 @@ bool AArch64InstrInfo::getMemOpBaseRegImmOfsWidth(
case AArch64::STRWui:
case AArch64::STRSui:
Scale = Width = 4;
+ MinOffset = 0;
+ MaxOffset = 4095;
break;
case AArch64::LDRHui:
case AArch64::LDRHHui:
case AArch64::STRHui:
case AArch64::STRHHui:
Scale = Width = 2;
+ MinOffset = 0;
+ MaxOffset = 4095;
break;
case AArch64::LDRBui:
case AArch64::LDRBBui:
case AArch64::STRBui:
case AArch64::STRBBui:
Scale = Width = 1;
+ MinOffset = 0;
+ MaxOffset = 4095;
break;
}
- if (LdSt.getNumExplicitOperands() == 3) {
- BaseReg = LdSt.getOperand(1).getReg();
- Offset = LdSt.getOperand(2).getImm() * Scale;
- } else {
- assert(LdSt.getNumExplicitOperands() == 4 && "invalid number of operands");
- BaseReg = LdSt.getOperand(2).getReg();
- Offset = LdSt.getOperand(3).getImm() * Scale;
- }
return true;
}
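A minimal usage sketch, assuming TII points at an AArch64InstrInfo; the values follow the table above:

    unsigned Scale, Width;
    int64_t MinOff, MaxOff;
    if (TII->getMemOpInfo(AArch64::LDRXui, Scale, Width, MinOff, MaxOff)) {
      // Scale = 8, Width = 8, MinOff = 0, MaxOff = 4095: an LDRXui with
      // immediate operand 3 addresses [base + 24], and the reachable byte
      // range is [0, 4095 * 8].
    }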
@@ -1903,88 +1966,6 @@ bool AArch64InstrInfo::shouldClusterMemOps(MachineInstr &FirstLdSt,
return Offset1 + 1 == Offset2;
}
-bool AArch64InstrInfo::shouldScheduleAdjacent(
- const MachineInstr &First, const MachineInstr &Second) const {
- if (Subtarget.hasArithmeticBccFusion()) {
- // Fuse CMN, CMP, TST followed by Bcc.
- unsigned SecondOpcode = Second.getOpcode();
- if (SecondOpcode == AArch64::Bcc) {
- switch (First.getOpcode()) {
- default:
- return false;
- case AArch64::ADDSWri:
- case AArch64::ADDSWrr:
- case AArch64::ADDSXri:
- case AArch64::ADDSXrr:
- case AArch64::ANDSWri:
- case AArch64::ANDSWrr:
- case AArch64::ANDSXri:
- case AArch64::ANDSXrr:
- case AArch64::SUBSWri:
- case AArch64::SUBSWrr:
- case AArch64::SUBSXri:
- case AArch64::SUBSXrr:
- case AArch64::BICSWrr:
- case AArch64::BICSXrr:
- return true;
- case AArch64::ADDSWrs:
- case AArch64::ADDSXrs:
- case AArch64::ANDSWrs:
- case AArch64::ANDSXrs:
- case AArch64::SUBSWrs:
- case AArch64::SUBSXrs:
- case AArch64::BICSWrs:
- case AArch64::BICSXrs:
- // Shift value can be 0 making these behave like the "rr" variant...
- return !hasShiftedReg(Second);
- }
- }
- }
- if (Subtarget.hasArithmeticCbzFusion()) {
- // Fuse ALU operations followed by CBZ/CBNZ.
- unsigned SecondOpcode = Second.getOpcode();
- if (SecondOpcode == AArch64::CBNZW || SecondOpcode == AArch64::CBNZX ||
- SecondOpcode == AArch64::CBZW || SecondOpcode == AArch64::CBZX) {
- switch (First.getOpcode()) {
- default:
- return false;
- case AArch64::ADDWri:
- case AArch64::ADDWrr:
- case AArch64::ADDXri:
- case AArch64::ADDXrr:
- case AArch64::ANDWri:
- case AArch64::ANDWrr:
- case AArch64::ANDXri:
- case AArch64::ANDXrr:
- case AArch64::EORWri:
- case AArch64::EORWrr:
- case AArch64::EORXri:
- case AArch64::EORXrr:
- case AArch64::ORRWri:
- case AArch64::ORRWrr:
- case AArch64::ORRXri:
- case AArch64::ORRXrr:
- case AArch64::SUBWri:
- case AArch64::SUBWrr:
- case AArch64::SUBXri:
- case AArch64::SUBXrr:
- return true;
- case AArch64::ADDWrs:
- case AArch64::ADDXrs:
- case AArch64::ANDWrs:
- case AArch64::ANDXrs:
- case AArch64::SUBWrs:
- case AArch64::SUBXrs:
- case AArch64::BICWrs:
- case AArch64::BICXrs:
- // Shift value can be 0 making these behave like the "rr" variant...
- return !hasShiftedReg(Second);
- }
- }
- }
- return false;
-}
-
MachineInstr *AArch64InstrInfo::emitFrameIndexDebugValue(
MachineFunction &MF, int FrameIx, uint64_t Offset, const MDNode *Var,
const MDNode *Expr, const DebugLoc &DL) const {
@@ -3793,7 +3774,7 @@ void AArch64InstrInfo::genAlternativeCodeSequence(
MachineInstrBuilder MIB1 =
BuildMI(MF, Root.getDebugLoc(), TII->get(SubOpc), NewVR)
.addReg(ZeroReg)
- .addOperand(Root.getOperand(2));
+ .add(Root.getOperand(2));
InsInstrs.push_back(MIB1);
InstrIdxForVirtReg.insert(std::make_pair(NewVR, 0));
MUL = genMaddR(MF, MRI, TII, Root, InsInstrs, 1, Opc, NewVR, RC);
@@ -4286,3 +4267,199 @@ AArch64InstrInfo::getSerializableBitmaskMachineOperandTargetFlags() const {
{MO_TLS, "aarch64-tls"}};
return makeArrayRef(TargetFlags);
}
+
+unsigned AArch64InstrInfo::getOutliningBenefit(size_t SequenceSize,
+ size_t Occurrences,
+ bool CanBeTailCall) const {
+ unsigned NotOutlinedSize = SequenceSize * Occurrences;
+ unsigned OutlinedSize;
+
+ // Is this candidate something we can outline as a tail call?
+ if (CanBeTailCall) {
+ // If yes, then we just outline the sequence and replace each of its
+ // occurrences with a branch instruction.
+ OutlinedSize = SequenceSize + Occurrences;
+ } else {
+ // If no, then we outline the sequence (SequenceSize), add a return (+1),
+ // and replace each occurrence with a save/restore to LR and a call
+ // (3 * Occurrences)
+ OutlinedSize = (SequenceSize + 1) + (3 * Occurrences);
+ }
+
+ // Return the number of instructions saved by outlining this sequence.
+ return NotOutlinedSize > OutlinedSize ? NotOutlinedSize - OutlinedSize : 0;
+}
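A quick check of the cost model with assumed numbers, mirroring the logic above:

    // A 10-instruction sequence occurring 3 times costs 30 instructions inline.
    unsigned SequenceSize = 10, Occurrences = 3;
    unsigned NotOutlined = SequenceSize * Occurrences;            // 30
    unsigned AsTailCall  = SequenceSize + Occurrences;            // 13 -> saves 17
    unsigned WithCallRet = (SequenceSize + 1) + 3 * Occurrences;  // 20 -> saves 10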
+
+bool AArch64InstrInfo::isFunctionSafeToOutlineFrom(MachineFunction &MF) const {
+ return MF.getFunction()->hasFnAttribute(Attribute::NoRedZone);
+}
+
+AArch64GenInstrInfo::MachineOutlinerInstrType
+AArch64InstrInfo::getOutliningType(MachineInstr &MI) const {
+
+ MachineFunction *MF = MI.getParent()->getParent();
+ AArch64FunctionInfo *FuncInfo = MF->getInfo<AArch64FunctionInfo>();
+
+ // Don't outline LOHs.
+ if (FuncInfo->getLOHRelated().count(&MI))
+ return MachineOutlinerInstrType::Illegal;
+
+ // Don't allow debug values to impact outlining type.
+ if (MI.isDebugValue() || MI.isIndirectDebugValue())
+ return MachineOutlinerInstrType::Invisible;
+
+ // Is this a terminator for a basic block?
+ if (MI.isTerminator()) {
+
+ // Is this the end of a function?
+ if (MI.getParent()->succ_empty())
+ return MachineOutlinerInstrType::Legal;
+
+ // It's not, so don't outline it.
+ return MachineOutlinerInstrType::Illegal;
+ }
+
+ // Don't outline positions.
+ if (MI.isPosition())
+ return MachineOutlinerInstrType::Illegal;
+
+ // Make sure none of the operands are un-outlinable.
+ for (const MachineOperand &MOP : MI.operands())
+ if (MOP.isCPI() || MOP.isJTI() || MOP.isCFIIndex() || MOP.isFI() ||
+ MOP.isTargetIndex())
+ return MachineOutlinerInstrType::Illegal;
+
+ // Don't outline anything that uses the link register.
+ if (MI.modifiesRegister(AArch64::LR, &RI) ||
+ MI.readsRegister(AArch64::LR, &RI))
+ return MachineOutlinerInstrType::Illegal;
+
+ // Does this use the stack?
+ if (MI.modifiesRegister(AArch64::SP, &RI) ||
+ MI.readsRegister(AArch64::SP, &RI)) {
+
+ // Is it a memory operation?
+ if (MI.mayLoadOrStore()) {
+ unsigned Base; // Filled with the base register of MI.
+ int64_t Offset; // Filled with the offset of MI.
+ unsigned DummyWidth;
+
+ // Does it allow us to offset the base register and is the base SP?
+ if (!getMemOpBaseRegImmOfsWidth(MI, Base, Offset, DummyWidth, &RI) ||
+ Base != AArch64::SP)
+ return MachineOutlinerInstrType::Illegal;
+
+ // Find the minimum/maximum offset for this instruction and check if
+ // fixing it up would be in range.
+ int64_t MinOffset, MaxOffset;
+ unsigned DummyScale;
+ getMemOpInfo(MI.getOpcode(), DummyScale, DummyWidth, MinOffset,
+ MaxOffset);
+
+ // TODO: We should really test what happens if an instruction overflows.
+ // This is tricky to test with IR tests, but when the outliner is moved
+ // to a MIR test, it really ought to be checked.
+ if (Offset + 16 < MinOffset || Offset + 16 > MaxOffset)
+ return MachineOutlinerInstrType::Illegal;
+
+ // It's in range, so we can outline it.
+ return MachineOutlinerInstrType::Legal;
+ }
+
+ // We can't fix it up, so don't outline it.
+ return MachineOutlinerInstrType::Illegal;
+ }
+
+ return MachineOutlinerInstrType::Legal;
+}
+
+void AArch64InstrInfo::fixupPostOutline(MachineBasicBlock &MBB) const {
+ for (MachineInstr &MI : MBB) {
+ unsigned Base, Width;
+ int64_t Offset;
+
+ // Is this a load or store with an immediate offset with SP as the base?
+ if (!MI.mayLoadOrStore() ||
+ !getMemOpBaseRegImmOfsWidth(MI, Base, Offset, Width, &RI) ||
+ Base != AArch64::SP)
+ continue;
+
+ // It is, so we have to fix it up.
+ unsigned Scale;
+ int64_t Dummy1, Dummy2;
+
+ MachineOperand &StackOffsetOperand = getMemOpBaseRegImmOfsOffsetOperand(MI);
+ assert(StackOffsetOperand.isImm() && "Stack offset wasn't immediate!");
+ getMemOpInfo(MI.getOpcode(), Scale, Width, Dummy1, Dummy2);
+ assert(Scale != 0 && "Unexpected opcode!");
+
+ // We've pushed the return address to the stack, so add 16 to the offset.
+ // This is safe, since we already checked if it would overflow when we
+ // checked if this instruction was legal to outline.
+ int64_t NewImm = (Offset + 16)/Scale;
+ StackOffsetOperand.setImm(NewImm);
+ }
+}
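A worked example of the fix-up with assumed values:

    // An outlined 'ldr x0, [sp, #8]' (LDRXui, immediate operand 1):
    int64_t Offset = 8;                      // byte offset from the instruction
    unsigned Scale = 8;                      // LDRXui scale from getMemOpInfo
    int64_t NewImm = (Offset + 16) / Scale;  // = 3, i.e. 'ldr x0, [sp, #24]'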
+
+void AArch64InstrInfo::insertOutlinerEpilogue(MachineBasicBlock &MBB,
+ MachineFunction &MF,
+ bool IsTailCall) const {
+
+ // If this is a tail call outlined function, then there's already a return.
+ if (IsTailCall)
+ return;
+
+ // It's not a tail call, so we have to insert the return ourselves.
+ MachineInstr *ret = BuildMI(MF, DebugLoc(), get(AArch64::RET))
+ .addReg(AArch64::LR, RegState::Undef);
+ MBB.insert(MBB.end(), ret);
+
+ // Walk over the basic block and fix up all the stack accesses.
+ fixupPostOutline(MBB);
+}
+
+void AArch64InstrInfo::insertOutlinerPrologue(MachineBasicBlock &MBB,
+ MachineFunction &MF,
+ bool IsTailCall) const {}
+
+MachineBasicBlock::iterator AArch64InstrInfo::insertOutlinedCall(
+ Module &M, MachineBasicBlock &MBB, MachineBasicBlock::iterator &It,
+ MachineFunction &MF, bool IsTailCall) const {
+
+ // Are we tail calling?
+ if (IsTailCall) {
+ // If yes, then we can just branch to the label.
+ It = MBB.insert(It,
+ BuildMI(MF, DebugLoc(), get(AArch64::B))
+ .addGlobalAddress(M.getNamedValue(MF.getName())));
+ return It;
+ }
+
+ // We're not tail calling, so we have to save LR before the call and restore
+ // it after.
+ MachineInstr *STRXpre = BuildMI(MF, DebugLoc(), get(AArch64::STRXpre))
+ .addReg(AArch64::SP, RegState::Define)
+ .addReg(AArch64::LR)
+ .addReg(AArch64::SP)
+ .addImm(-16);
+ It = MBB.insert(It, STRXpre);
+ It++;
+
+ // Insert the call.
+ It = MBB.insert(It,
+ BuildMI(MF, DebugLoc(), get(AArch64::BL))
+ .addGlobalAddress(M.getNamedValue(MF.getName())));
+
+ It++;
+
+ // Restore the link register.
+ MachineInstr *LDRXpost = BuildMI(MF, DebugLoc(), get(AArch64::LDRXpost))
+ .addReg(AArch64::SP, RegState::Define)
+ .addReg(AArch64::LR)
+ .addReg(AArch64::SP)
+ .addImm(16);
+ It = MBB.insert(It, LDRXpost);
+
+ return It;
+}
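Spelled as assembly, the non-tail-call sequence built above should come out roughly as follows (mnemonic spellings assumed, label illustrative):

    //   str x30, [sp, #-16]!     // STRXpre: spill LR, pre-decrementing SP
    //   bl  OUTLINED_FUNCTION_N  // BL to the outlined body
    //   ldr x30, [sp], #16       // LDRXpost: reload LR, post-incrementing SP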
+
diff --git a/lib/Target/AArch64/AArch64InstrInfo.h b/lib/Target/AArch64/AArch64InstrInfo.h
index 5037866925d3..bacce441f6c5 100644
--- a/lib/Target/AArch64/AArch64InstrInfo.h
+++ b/lib/Target/AArch64/AArch64InstrInfo.h
@@ -133,12 +133,19 @@ public:
int64_t &Offset, unsigned &Width,
const TargetRegisterInfo *TRI) const;
+ /// Return the immediate offset operand of the load/store \p LdSt.
+ MachineOperand &getMemOpBaseRegImmOfsOffsetOperand(MachineInstr &LdSt) const;
+
+ /// \brief Returns true if opcode \p Opcode is a memory operation. If it is, set
+ /// \p Scale, \p Width, \p MinOffset, and \p MaxOffset accordingly.
+ ///
+ /// For unscaled instructions, \p Scale is set to 1.
+ bool getMemOpInfo(unsigned Opcode, unsigned &Scale, unsigned &Width,
+ int64_t &MinOffset, int64_t &MaxOffset) const;
+
bool shouldClusterMemOps(MachineInstr &FirstLdSt, MachineInstr &SecondLdSt,
unsigned NumLoads) const override;
- bool shouldScheduleAdjacent(const MachineInstr &First,
- const MachineInstr &Second) const override;
-
MachineInstr *emitFrameIndexDebugValue(MachineFunction &MF, int FrameIx,
uint64_t Offset, const MDNode *Var,
const MDNode *Expr,
@@ -245,7 +252,33 @@ public:
ArrayRef<std::pair<unsigned, const char *>>
getSerializableBitmaskMachineOperandTargetFlags() const override;
+ bool isFunctionSafeToOutlineFrom(MachineFunction &MF) const override;
+ unsigned getOutliningBenefit(size_t SequenceSize, size_t Occurrences,
+ bool CanBeTailCall) const override;
+ AArch64GenInstrInfo::MachineOutlinerInstrType
+ getOutliningType(MachineInstr &MI) const override;
+ void insertOutlinerEpilogue(MachineBasicBlock &MBB,
+ MachineFunction &MF,
+ bool IsTailCall) const override;
+ void insertOutlinerPrologue(MachineBasicBlock &MBB,
+ MachineFunction &MF,
+ bool isTailCall) const override;
+ MachineBasicBlock::iterator
+ insertOutlinedCall(Module &M, MachineBasicBlock &MBB,
+ MachineBasicBlock::iterator &It,
+ MachineFunction &MF,
+ bool IsTailCall) const override;
+ /// Returns true if the instruction has a shift-by-immediate (LSL 0-3) that
+ /// can be executed in one cycle less.
+ bool isFalkorLSLFast(const MachineInstr &MI) const;
private:
+
+ /// \brief Sets the offsets on outlined instructions in \p MBB which use SP
+ /// so that they will be valid post-outlining.
+ ///
+ /// \param MBB A \p MachineBasicBlock in an outlined function.
+ void fixupPostOutline(MachineBasicBlock &MBB) const;
+
void instantiateCondBranch(MachineBasicBlock &MBB, const DebugLoc &DL,
MachineBasicBlock *TBB,
ArrayRef<MachineOperand> Cond) const;
diff --git a/lib/Target/AArch64/AArch64InstrInfo.td b/lib/Target/AArch64/AArch64InstrInfo.td
index 2244baacca17..4449412532f3 100644
--- a/lib/Target/AArch64/AArch64InstrInfo.td
+++ b/lib/Target/AArch64/AArch64InstrInfo.td
@@ -30,6 +30,8 @@ def HasLSE : Predicate<"Subtarget->hasLSE()">,
AssemblerPredicate<"FeatureLSE", "lse">;
def HasRAS : Predicate<"Subtarget->hasRAS()">,
AssemblerPredicate<"FeatureRAS", "ras">;
+def HasRDM : Predicate<"Subtarget->hasRDM()">,
+ AssemblerPredicate<"FeatureRDM", "rdm">;
def HasPerfMon : Predicate<"Subtarget->hasPerfMon()">;
def HasFullFP16 : Predicate<"Subtarget->hasFullFP16()">,
AssemblerPredicate<"FeatureFullFP16", "fullfp16">;
@@ -41,6 +43,11 @@ def IsBE : Predicate<"!Subtarget->isLittleEndian()">;
def UseAlternateSExtLoadCVTF32
: Predicate<"Subtarget->useAlternateSExtLoadCVTF32Pattern()">;
+def UseNegativeImmediates
+ : Predicate<"false">, AssemblerPredicate<"!FeatureNoNegativeImmediates",
+ "NegativeImmediates">;
+
+
//===----------------------------------------------------------------------===//
// AArch64-specific DAG Nodes.
//
@@ -424,8 +431,10 @@ def MSRpstateImm1 : MSRpstateImm0_1;
def MSRpstateImm4 : MSRpstateImm0_15;
// The thread pointer (on Linux, at least, where this has been implemented) is
-// TPIDR_EL0.
-def : Pat<(AArch64threadpointer), (MRS 0xde82)>;
+// TPIDR_EL0. Add pseudo op so we can mark it as not having any side effects.
+let hasSideEffects = 0 in
+def MOVbaseTLS : Pseudo<(outs GPR64:$dst), (ins),
+ [(set GPR64:$dst, AArch64threadpointer)]>, Sched<[]>;
// The cycle counter PMC register is PMCCNTR_EL0.
let Predicates = [HasPerfMon] in
@@ -574,31 +583,31 @@ def : Pat<(f64 fpimm:$in),
// sequences.
def : Pat<(AArch64WrapperLarge tglobaladdr:$g3, tglobaladdr:$g2,
tglobaladdr:$g1, tglobaladdr:$g0),
- (MOVKXi (MOVKXi (MOVKXi (MOVZXi tglobaladdr:$g3, 48),
- tglobaladdr:$g2, 32),
- tglobaladdr:$g1, 16),
- tglobaladdr:$g0, 0)>;
+ (MOVKXi (MOVKXi (MOVKXi (MOVZXi tglobaladdr:$g0, 0),
+ tglobaladdr:$g1, 16),
+ tglobaladdr:$g2, 32),
+ tglobaladdr:$g3, 48)>;
def : Pat<(AArch64WrapperLarge tblockaddress:$g3, tblockaddress:$g2,
tblockaddress:$g1, tblockaddress:$g0),
- (MOVKXi (MOVKXi (MOVKXi (MOVZXi tblockaddress:$g3, 48),
- tblockaddress:$g2, 32),
- tblockaddress:$g1, 16),
- tblockaddress:$g0, 0)>;
+ (MOVKXi (MOVKXi (MOVKXi (MOVZXi tblockaddress:$g0, 0),
+ tblockaddress:$g1, 16),
+ tblockaddress:$g2, 32),
+ tblockaddress:$g3, 48)>;
def : Pat<(AArch64WrapperLarge tconstpool:$g3, tconstpool:$g2,
tconstpool:$g1, tconstpool:$g0),
- (MOVKXi (MOVKXi (MOVKXi (MOVZXi tconstpool:$g3, 48),
- tconstpool:$g2, 32),
- tconstpool:$g1, 16),
- tconstpool:$g0, 0)>;
+ (MOVKXi (MOVKXi (MOVKXi (MOVZXi tconstpool:$g0, 0),
+ tconstpool:$g1, 16),
+ tconstpool:$g2, 32),
+ tconstpool:$g3, 48)>;
def : Pat<(AArch64WrapperLarge tjumptable:$g3, tjumptable:$g2,
tjumptable:$g1, tjumptable:$g0),
- (MOVKXi (MOVKXi (MOVKXi (MOVZXi tjumptable:$g3, 48),
- tjumptable:$g2, 32),
- tjumptable:$g1, 16),
- tjumptable:$g0, 0)>;
+ (MOVKXi (MOVKXi (MOVKXi (MOVZXi tjumptable:$g0, 0),
+ tjumptable:$g1, 16),
+ tjumptable:$g2, 32),
+ tjumptable:$g3, 48)>;
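With the reordered patterns, a large-code-model address is now materialized bottom-up; roughly (relocation spellings assumed):

    //   movz x0, #:abs_g0_nc:sym            // bits [15:0]
    //   movk x0, #:abs_g1_nc:sym, lsl #16   // bits [31:16]
    //   movk x0, #:abs_g2_nc:sym, lsl #32   // bits [47:32]
    //   movk x0, #:abs_g3:sym, lsl #48      // bits [63:48]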
//===----------------------------------------------------------------------===//
@@ -3284,7 +3293,7 @@ defm UQSHL : SIMDThreeScalarBHSD<1, 0b01001, "uqshl", int_aarch64_neon_uqshl>
defm UQSUB : SIMDThreeScalarBHSD<1, 0b00101, "uqsub", int_aarch64_neon_uqsub>;
defm URSHL : SIMDThreeScalarD< 1, 0b01010, "urshl", int_aarch64_neon_urshl>;
defm USHL : SIMDThreeScalarD< 1, 0b01000, "ushl", int_aarch64_neon_ushl>;
-let Predicates = [HasV8_1a] in {
+let Predicates = [HasRDM] in {
defm SQRDMLAH : SIMDThreeScalarHSTied<1, 0, 0b10000, "sqrdmlah">;
defm SQRDMLSH : SIMDThreeScalarHSTied<1, 0, 0b10001, "sqrdmlsh">;
def : Pat<(i32 (int_aarch64_neon_sqadd
@@ -5029,7 +5038,7 @@ class SExtLoadi16CVTf64Pat<dag addrmode, dag INST>
0),
dsub)))>,
Requires<[NotForCodeSize, UseAlternateSExtLoadCVTF32]>;
-
+
def : SExtLoadi16CVTf64Pat<(ro16.Wpat GPR64sp:$Rn, GPR32:$Rm, ro16.Wext:$ext),
(LDRHroW GPR64sp:$Rn, GPR32:$Rm, ro16.Wext:$ext)>;
def : SExtLoadi16CVTf64Pat<(ro16.Xpat GPR64sp:$Rn, GPR64:$Rm, ro16.Xext:$ext),
diff --git a/lib/Target/AArch64/AArch64InstructionSelector.cpp b/lib/Target/AArch64/AArch64InstructionSelector.cpp
index b51473524c72..878dac6bff1e 100644
--- a/lib/Target/AArch64/AArch64InstructionSelector.cpp
+++ b/lib/Target/AArch64/AArch64InstructionSelector.cpp
@@ -12,17 +12,19 @@
/// \todo This should be generated by TableGen.
//===----------------------------------------------------------------------===//
-#include "AArch64InstructionSelector.h"
#include "AArch64InstrInfo.h"
+#include "AArch64MachineFunctionInfo.h"
#include "AArch64RegisterBankInfo.h"
#include "AArch64RegisterInfo.h"
#include "AArch64Subtarget.h"
#include "AArch64TargetMachine.h"
#include "MCTargetDesc/AArch64AddressingModes.h"
+#include "llvm/CodeGen/GlobalISel/InstructionSelector.h"
#include "llvm/CodeGen/MachineBasicBlock.h"
#include "llvm/CodeGen/MachineFunction.h"
#include "llvm/CodeGen/MachineInstr.h"
#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/MachineOperand.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
#include "llvm/IR/Type.h"
#include "llvm/Support/Debug.h"
@@ -36,13 +38,61 @@ using namespace llvm;
#error "You shouldn't build this"
#endif
+namespace {
+
+class AArch64InstructionSelector : public InstructionSelector {
+public:
+ AArch64InstructionSelector(const AArch64TargetMachine &TM,
+ const AArch64Subtarget &STI,
+ const AArch64RegisterBankInfo &RBI);
+
+ bool select(MachineInstr &I) const override;
+
+private:
+ /// tblgen-erated 'select' implementation, used as the initial selector for
+ /// the patterns that don't require complex C++.
+ bool selectImpl(MachineInstr &I) const;
+
+ bool selectVaStartAAPCS(MachineInstr &I, MachineFunction &MF,
+ MachineRegisterInfo &MRI) const;
+ bool selectVaStartDarwin(MachineInstr &I, MachineFunction &MF,
+ MachineRegisterInfo &MRI) const;
+
+ bool selectCompareBranch(MachineInstr &I, MachineFunction &MF,
+ MachineRegisterInfo &MRI) const;
+
+ bool selectArithImmed(MachineOperand &Root, MachineOperand &Result1,
+ MachineOperand &Result2) const;
+
+ const AArch64TargetMachine &TM;
+ const AArch64Subtarget &STI;
+ const AArch64InstrInfo &TII;
+ const AArch64RegisterInfo &TRI;
+ const AArch64RegisterBankInfo &RBI;
+
+// We declare the temporaries used by selectImpl() in the class to minimize the
+// cost of constructing placeholder values.
+#define GET_GLOBALISEL_TEMPORARIES_DECL
+#include "AArch64GenGlobalISel.inc"
+#undef GET_GLOBALISEL_TEMPORARIES_DECL
+};
+
+} // end anonymous namespace
+
+#define GET_GLOBALISEL_IMPL
#include "AArch64GenGlobalISel.inc"
+#undef GET_GLOBALISEL_IMPL
AArch64InstructionSelector::AArch64InstructionSelector(
const AArch64TargetMachine &TM, const AArch64Subtarget &STI,
const AArch64RegisterBankInfo &RBI)
- : InstructionSelector(), TM(TM), STI(STI), TII(*STI.getInstrInfo()),
- TRI(*STI.getRegisterInfo()), RBI(RBI) {}
+ : InstructionSelector(), TM(TM), STI(STI), TII(*STI.getInstrInfo()),
+ TRI(*STI.getRegisterInfo()), RBI(RBI)
+#define GET_GLOBALISEL_TEMPORARIES_INIT
+#include "AArch64GenGlobalISel.inc"
+#undef GET_GLOBALISEL_TEMPORARIES_INIT
+{
+}
// FIXME: This should be target-independent, inferred from the types declared
// for each class in the bank.
@@ -119,67 +169,34 @@ static bool unsupportedBinOp(const MachineInstr &I,
}
/// Select the AArch64 opcode for the basic binary operation \p GenericOpc
-/// (such as G_OR or G_ADD), appropriate for the register bank \p RegBankID
+/// (such as G_OR or G_SDIV), appropriate for the register bank \p RegBankID
/// and of size \p OpSize.
/// \returns \p GenericOpc if the combination is unsupported.
static unsigned selectBinaryOp(unsigned GenericOpc, unsigned RegBankID,
unsigned OpSize) {
switch (RegBankID) {
case AArch64::GPRRegBankID:
- if (OpSize <= 32) {
- assert((OpSize == 32 || (GenericOpc != TargetOpcode::G_SDIV &&
- GenericOpc != TargetOpcode::G_UDIV &&
- GenericOpc != TargetOpcode::G_LSHR &&
- GenericOpc != TargetOpcode::G_ASHR)) &&
- "operation should have been legalized before now");
-
+ if (OpSize == 32) {
switch (GenericOpc) {
- case TargetOpcode::G_OR:
- return AArch64::ORRWrr;
- case TargetOpcode::G_XOR:
- return AArch64::EORWrr;
- case TargetOpcode::G_AND:
- return AArch64::ANDWrr;
- case TargetOpcode::G_ADD:
- assert(OpSize != 32 && "s32 G_ADD should have been selected");
- return AArch64::ADDWrr;
- case TargetOpcode::G_SUB:
- return AArch64::SUBWrr;
case TargetOpcode::G_SHL:
return AArch64::LSLVWr;
case TargetOpcode::G_LSHR:
return AArch64::LSRVWr;
case TargetOpcode::G_ASHR:
return AArch64::ASRVWr;
- case TargetOpcode::G_SDIV:
- return AArch64::SDIVWr;
- case TargetOpcode::G_UDIV:
- return AArch64::UDIVWr;
default:
return GenericOpc;
}
} else if (OpSize == 64) {
switch (GenericOpc) {
- case TargetOpcode::G_OR:
- return AArch64::ORRXrr;
- case TargetOpcode::G_XOR:
- return AArch64::EORXrr;
- case TargetOpcode::G_AND:
- return AArch64::ANDXrr;
case TargetOpcode::G_GEP:
return AArch64::ADDXrr;
- case TargetOpcode::G_SUB:
- return AArch64::SUBXrr;
case TargetOpcode::G_SHL:
return AArch64::LSLVXr;
case TargetOpcode::G_LSHR:
return AArch64::LSRVXr;
case TargetOpcode::G_ASHR:
return AArch64::ASRVXr;
- case TargetOpcode::G_SDIV:
- return AArch64::SDIVXr;
- case TargetOpcode::G_UDIV:
- return AArch64::UDIVXr;
default:
return GenericOpc;
}
@@ -473,6 +490,82 @@ static void changeFCMPPredToAArch64CC(CmpInst::Predicate P,
}
}
+bool AArch64InstructionSelector::selectCompareBranch(
+ MachineInstr &I, MachineFunction &MF, MachineRegisterInfo &MRI) const {
+
+ const unsigned CondReg = I.getOperand(0).getReg();
+ MachineBasicBlock *DestMBB = I.getOperand(1).getMBB();
+ MachineInstr *CCMI = MRI.getVRegDef(CondReg);
+ if (CCMI->getOpcode() != TargetOpcode::G_ICMP)
+ return false;
+
+ unsigned LHS = CCMI->getOperand(2).getReg();
+ unsigned RHS = CCMI->getOperand(3).getReg();
+ if (!getConstantVRegVal(RHS, MRI))
+ std::swap(RHS, LHS);
+
+ const auto RHSImm = getConstantVRegVal(RHS, MRI);
+ if (!RHSImm || *RHSImm != 0)
+ return false;
+
+ const RegisterBank &RB = *RBI.getRegBank(LHS, MRI, TRI);
+ if (RB.getID() != AArch64::GPRRegBankID)
+ return false;
+
+ const auto Pred = (CmpInst::Predicate)CCMI->getOperand(1).getPredicate();
+ if (Pred != CmpInst::ICMP_NE && Pred != CmpInst::ICMP_EQ)
+ return false;
+
+ const unsigned CmpWidth = MRI.getType(LHS).getSizeInBits();
+ unsigned CBOpc = 0;
+ if (CmpWidth <= 32)
+ CBOpc = (Pred == CmpInst::ICMP_EQ ? AArch64::CBZW : AArch64::CBNZW);
+ else if (CmpWidth == 64)
+ CBOpc = (Pred == CmpInst::ICMP_EQ ? AArch64::CBZX : AArch64::CBNZX);
+ else
+ return false;
+
+ auto MIB = BuildMI(*I.getParent(), I, I.getDebugLoc(), TII.get(CBOpc))
+ .addUse(LHS)
+ .addMBB(DestMBB);
+
+ constrainSelectedInstRegOperands(*MIB.getInstr(), TII, TRI, RBI);
+ I.eraseFromParent();
+ return true;
+}
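A sketch of the gMIR this matches, with assumed value names:

    //   %zero:gpr(s64) = G_CONSTANT i64 0
    //   %c:gpr(s1)     = G_ICMP intpred(eq), %x(s64), %zero
    //   G_BRCOND %c(s1), %bb.2
    // selects to a single compare-and-branch:
    //   CBZX %x, %bb.2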
+
+bool AArch64InstructionSelector::selectVaStartAAPCS(
+ MachineInstr &I, MachineFunction &MF, MachineRegisterInfo &MRI) const {
+ return false;
+}
+
+bool AArch64InstructionSelector::selectVaStartDarwin(
+ MachineInstr &I, MachineFunction &MF, MachineRegisterInfo &MRI) const {
+ AArch64FunctionInfo *FuncInfo = MF.getInfo<AArch64FunctionInfo>();
+ unsigned ListReg = I.getOperand(0).getReg();
+
+ unsigned ArgsAddrReg = MRI.createVirtualRegister(&AArch64::GPR64RegClass);
+
+ auto MIB =
+ BuildMI(*I.getParent(), I, I.getDebugLoc(), TII.get(AArch64::ADDXri))
+ .addDef(ArgsAddrReg)
+ .addFrameIndex(FuncInfo->getVarArgsStackIndex())
+ .addImm(0)
+ .addImm(0);
+
+ constrainSelectedInstRegOperands(*MIB, TII, TRI, RBI);
+
+ MIB = BuildMI(*I.getParent(), I, I.getDebugLoc(), TII.get(AArch64::STRXui))
+ .addUse(ArgsAddrReg)
+ .addUse(ListReg)
+ .addImm(0)
+ .addMemOperand(*I.memoperands_begin());
+
+ constrainSelectedInstRegOperands(*MIB, TII, TRI, RBI);
+ I.eraseFromParent();
+ return true;
+}
+
bool AArch64InstructionSelector::select(MachineInstr &I) const {
assert(I.getParent() && "Instruction should be in a basic block!");
assert(I.getParent()->getParent() && "Instruction should be in a function!");
@@ -549,6 +642,9 @@ bool AArch64InstructionSelector::select(MachineInstr &I) const {
const unsigned CondReg = I.getOperand(0).getReg();
MachineBasicBlock *DestMBB = I.getOperand(1).getMBB();
+ if (selectCompareBranch(I, MF, MRI))
+ return true;
+
auto MIB = BuildMI(MBB, I, I.getDebugLoc(), TII.get(AArch64::TBNZW))
.addUse(CondReg)
.addImm(/*bit offset=*/0)
@@ -558,6 +654,11 @@ bool AArch64InstructionSelector::select(MachineInstr &I) const {
return constrainSelectedInstRegOperands(*MIB.getInstr(), TII, TRI, RBI);
}
+ case TargetOpcode::G_BRINDIRECT: {
+ I.setDesc(TII.get(AArch64::BR));
+ return constrainSelectedInstRegOperands(I, TII, TRI, RBI);
+ }
+
case TargetOpcode::G_FCONSTANT:
case TargetOpcode::G_CONSTANT: {
const bool isFP = Opcode == TargetOpcode::G_FCONSTANT;
@@ -629,9 +730,12 @@ bool AArch64InstructionSelector::select(MachineInstr &I) const {
// FIXME: Is going through int64_t always correct?
ImmOp.ChangeToImmediate(
ImmOp.getFPImm()->getValueAPF().bitcastToAPInt().getZExtValue());
- } else {
+ } else if (I.getOperand(1).isCImm()) {
uint64_t Val = I.getOperand(1).getCImm()->getZExtValue();
I.getOperand(1).ChangeToImmediate(Val);
+ } else if (I.getOperand(1).isImm()) {
+ uint64_t Val = I.getOperand(1).getImm();
+ I.getOperand(1).ChangeToImmediate(Val);
}
constrainSelectedInstRegOperands(I, TII, TRI, RBI);
@@ -686,10 +790,16 @@ bool AArch64InstructionSelector::select(MachineInstr &I) const {
return false;
}
-#ifndef NDEBUG
- // Sanity-check the pointer register.
+ auto &MemOp = **I.memoperands_begin();
+ if (MemOp.getOrdering() != AtomicOrdering::NotAtomic) {
+ DEBUG(dbgs() << "Atomic load/store not supported yet\n");
+ return false;
+ }
+
const unsigned PtrReg = I.getOperand(1).getReg();
+#ifndef NDEBUG
const RegisterBank &PtrRB = *RBI.getRegBank(PtrReg, MRI, TRI);
+ // Sanity-check the pointer register.
assert(PtrRB.getID() == AArch64::GPRRegBankID &&
"Load/Store pointer operand isn't a GPR");
assert(MRI.getType(PtrReg).isPointer() &&
@@ -706,11 +816,46 @@ bool AArch64InstructionSelector::select(MachineInstr &I) const {
I.setDesc(TII.get(NewOpc));
- I.addOperand(MachineOperand::CreateImm(0));
+ uint64_t Offset = 0;
+ auto *PtrMI = MRI.getVRegDef(PtrReg);
+
+ // Try to fold a GEP into our unsigned immediate addressing mode.
+ if (PtrMI->getOpcode() == TargetOpcode::G_GEP) {
+ if (auto COff = getConstantVRegVal(PtrMI->getOperand(2).getReg(), MRI)) {
+ int64_t Imm = *COff;
+ const unsigned Size = MemTy.getSizeInBits() / 8;
+ const unsigned Scale = Log2_32(Size);
+ if ((Imm & (Size - 1)) == 0 && Imm >= 0 && Imm < (0x1000 << Scale)) {
+ unsigned Ptr2Reg = PtrMI->getOperand(1).getReg();
+ I.getOperand(1).setReg(Ptr2Reg);
+ PtrMI = MRI.getVRegDef(Ptr2Reg);
+ Offset = Imm / Size;
+ }
+ }
+ }
+
+ // If we haven't folded anything into our addressing mode yet, try to fold
+ // a frame index into the base+offset.
+ if (!Offset && PtrMI->getOpcode() == TargetOpcode::G_FRAME_INDEX)
+ I.getOperand(1).ChangeToFrameIndex(PtrMI->getOperand(1).getIndex());
+
+ I.addOperand(MachineOperand::CreateImm(Offset));
+
+ // If we're storing a 0, use WZR/XZR.
+ if (auto CVal = getConstantVRegVal(ValReg, MRI)) {
+ if (*CVal == 0 && Opcode == TargetOpcode::G_STORE) {
+ if (I.getOpcode() == AArch64::STRWui)
+ I.getOperand(0).setReg(AArch64::WZR);
+ else if (I.getOpcode() == AArch64::STRXui)
+ I.getOperand(0).setReg(AArch64::XZR);
+ }
+ }
+
return constrainSelectedInstRegOperands(I, TII, TRI, RBI);
}
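For example, with assumed numbers:

    // An 8-byte G_LOAD through '%p = G_GEP %base, (G_CONSTANT i64 24)':
    // Size = 8, Scale = 3; 24 is a multiple of 8 and 24 < (0x1000 << 3),
    // so the GEP is folded away and the LDRXui immediate becomes 24 / 8 = 3.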
- case TargetOpcode::G_MUL: {
+ case TargetOpcode::G_SMULH:
+ case TargetOpcode::G_UMULH: {
// Reject the various things we don't support yet.
if (unsupportedBinOp(I, RBI, MRI, TRI))
return false;
@@ -719,48 +864,33 @@ bool AArch64InstructionSelector::select(MachineInstr &I) const {
const RegisterBank &RB = *RBI.getRegBank(DefReg, MRI, TRI);
if (RB.getID() != AArch64::GPRRegBankID) {
- DEBUG(dbgs() << "G_MUL on bank: " << RB << ", expected: GPR\n");
+ DEBUG(dbgs() << "G_[SU]MULH on bank: " << RB << ", expected: GPR\n");
return false;
}
- unsigned ZeroReg;
- unsigned NewOpc;
- if (Ty.isScalar() && Ty.getSizeInBits() <= 32) {
- NewOpc = AArch64::MADDWrrr;
- ZeroReg = AArch64::WZR;
- } else if (Ty == LLT::scalar(64)) {
- NewOpc = AArch64::MADDXrrr;
- ZeroReg = AArch64::XZR;
- } else {
- DEBUG(dbgs() << "G_MUL has type: " << Ty << ", expected: "
- << LLT::scalar(32) << " or " << LLT::scalar(64) << '\n');
+ if (Ty != LLT::scalar(64)) {
+ DEBUG(dbgs() << "G_[SU]MULH has type: " << Ty
+ << ", expected: " << LLT::scalar(64) << '\n');
return false;
}
+ unsigned NewOpc = I.getOpcode() == TargetOpcode::G_SMULH ? AArch64::SMULHrr
+ : AArch64::UMULHrr;
I.setDesc(TII.get(NewOpc));
- I.addOperand(MachineOperand::CreateReg(ZeroReg, /*isDef=*/false));
-
// Now that we selected an opcode, we need to constrain the register
// operands to use appropriate classes.
return constrainSelectedInstRegOperands(I, TII, TRI, RBI);
}
-
case TargetOpcode::G_FADD:
case TargetOpcode::G_FSUB:
case TargetOpcode::G_FMUL:
case TargetOpcode::G_FDIV:
case TargetOpcode::G_OR:
- case TargetOpcode::G_XOR:
- case TargetOpcode::G_AND:
case TargetOpcode::G_SHL:
case TargetOpcode::G_LSHR:
case TargetOpcode::G_ASHR:
- case TargetOpcode::G_SDIV:
- case TargetOpcode::G_UDIV:
- case TargetOpcode::G_ADD:
- case TargetOpcode::G_SUB:
case TargetOpcode::G_GEP: {
// Reject the various things we don't support yet.
if (unsupportedBinOp(I, RBI, MRI, TRI))
@@ -783,6 +913,17 @@ bool AArch64InstructionSelector::select(MachineInstr &I) const {
return constrainSelectedInstRegOperands(I, TII, TRI, RBI);
}
+ case TargetOpcode::G_PTR_MASK: {
+ uint64_t Align = I.getOperand(2).getImm();
+ if (Align >= 64 || Align == 0)
+ return false;
+
+ uint64_t Mask = ~((1ULL << Align) - 1);
+ I.setDesc(TII.get(AArch64::ANDXri));
+ I.getOperand(2).setImm(AArch64_AM::encodeLogicalImmediate(Mask, 64));
+
+ return constrainSelectedInstRegOperands(I, TII, TRI, RBI);
+ }
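A worked example of the mask computation:

    // 'G_PTR_MASK %p, 4' aligns %p down to 16 bytes:
    uint64_t Align = 4;
    uint64_t Mask = ~((1ULL << Align) - 1);  // 0xFFFFFFFFFFFFFFF0
    // Mask is then encoded via encodeLogicalImmediate(Mask, 64) for ANDXri.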
case TargetOpcode::G_PTRTOINT:
case TargetOpcode::G_TRUNC: {
const LLT DstTy = MRI.getType(I.getOperand(0).getReg());
@@ -1026,7 +1167,7 @@ bool AArch64InstructionSelector::select(MachineInstr &I) const {
if (Ty == LLT::scalar(32)) {
CSelOpc = AArch64::CSELWr;
- } else if (Ty == LLT::scalar(64)) {
+ } else if (Ty == LLT::scalar(64) || Ty == LLT::pointer(0, 64)) {
CSelOpc = AArch64::CSELXr;
} else {
return false;
@@ -1134,7 +1275,7 @@ bool AArch64InstructionSelector::select(MachineInstr &I) const {
.addDef(Def1Reg)
.addUse(AArch64::WZR)
.addUse(AArch64::WZR)
- .addImm(CC1);
+ .addImm(getInvertedCondCode(CC1));
if (CC2 != AArch64CC::AL) {
unsigned Def2Reg = MRI.createVirtualRegister(&AArch64::GPR32RegClass);
@@ -1143,7 +1284,7 @@ bool AArch64InstructionSelector::select(MachineInstr &I) const {
.addDef(Def2Reg)
.addUse(AArch64::WZR)
.addUse(AArch64::WZR)
- .addImm(CC2);
+ .addImm(getInvertedCondCode(CC2));
MachineInstr &OrMI =
*BuildMI(MBB, I, I.getDebugLoc(), TII.get(AArch64::ORRWrr))
.addDef(DefReg)
@@ -1159,7 +1300,69 @@ bool AArch64InstructionSelector::select(MachineInstr &I) const {
I.eraseFromParent();
return true;
}
+ case TargetOpcode::G_VASTART:
+ return STI.isTargetDarwin() ? selectVaStartDarwin(I, MF, MRI)
+ : selectVaStartAAPCS(I, MF, MRI);
}
return false;
}
+
+/// SelectArithImmed - Select an immediate value that can be represented as
+/// a 12-bit value shifted left by either 0 or 12. If so, return true with
+/// Val set to the 12-bit value and Shift set to the shifter operand.
+bool AArch64InstructionSelector::selectArithImmed(
+ MachineOperand &Root, MachineOperand &Result1,
+ MachineOperand &Result2) const {
+ MachineInstr &MI = *Root.getParent();
+ MachineBasicBlock &MBB = *MI.getParent();
+ MachineFunction &MF = *MBB.getParent();
+ MachineRegisterInfo &MRI = MF.getRegInfo();
+
+ // This function is called from the addsub_shifted_imm ComplexPattern,
+ // which lists [imm] as the list of opcodes it's interested in; however,
+ // we still need to check whether the operand is actually an immediate
+ // here because the ComplexPattern opcode list is only used in
+ // root-level opcode matching.
+ uint64_t Immed;
+ if (Root.isImm())
+ Immed = Root.getImm();
+ else if (Root.isCImm())
+ Immed = Root.getCImm()->getZExtValue();
+ else if (Root.isReg()) {
+ MachineInstr *Def = MRI.getVRegDef(Root.getReg());
+ if (Def->getOpcode() != TargetOpcode::G_CONSTANT)
+ return false;
+ MachineOperand &Op1 = Def->getOperand(1);
+ if (!Op1.isCImm() || Op1.getCImm()->getBitWidth() > 64)
+ return false;
+ Immed = Op1.getCImm()->getZExtValue();
+ } else
+ return false;
+
+ unsigned ShiftAmt;
+
+ if (Immed >> 12 == 0) {
+ ShiftAmt = 0;
+ } else if ((Immed & 0xfff) == 0 && Immed >> 24 == 0) {
+ ShiftAmt = 12;
+ Immed = Immed >> 12;
+ } else
+ return false;
+
+ unsigned ShVal = AArch64_AM::getShifterImm(AArch64_AM::LSL, ShiftAmt);
+ Result1.ChangeToImmediate(Immed);
+ Result1.clearParent();
+ Result2.ChangeToImmediate(ShVal);
+ Result2.clearParent();
+ return true;
+}
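Two assumed inputs to illustrate the checks:

    // Immed = 0x5000: the low 12 bits are zero and 0x5000 >> 24 == 0, so
    // ShiftAmt = 12 and Immed becomes 5 -- encoded as "#5, lsl #12".
    // Immed = 0x1001000: 0x1001000 >> 24 != 0, so both tests fail and the
    // pattern is rejected.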
+
+namespace llvm {
+InstructionSelector *
+createAArch64InstructionSelector(const AArch64TargetMachine &TM,
+ AArch64Subtarget &Subtarget,
+ AArch64RegisterBankInfo &RBI) {
+ return new AArch64InstructionSelector(TM, Subtarget, RBI);
+}
+}
diff --git a/lib/Target/AArch64/AArch64InstructionSelector.h b/lib/Target/AArch64/AArch64InstructionSelector.h
deleted file mode 100644
index 2c6e5a912fb7..000000000000
--- a/lib/Target/AArch64/AArch64InstructionSelector.h
+++ /dev/null
@@ -1,49 +0,0 @@
-//===- AArch64InstructionSelector --------------------------------*- C++ -*-==//
-//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
-//
-//===----------------------------------------------------------------------===//
-/// \file
-/// This file declares the targeting of the InstructionSelector class for
-/// AArch64.
-//===----------------------------------------------------------------------===//
-
-#ifndef LLVM_LIB_TARGET_AARCH64_AARCH64INSTRUCTIONSELECTOR_H
-#define LLVM_LIB_TARGET_AARCH64_AARCH64INSTRUCTIONSELECTOR_H
-
-#include "llvm/CodeGen/GlobalISel/InstructionSelector.h"
-
-namespace llvm {
-
-class AArch64InstrInfo;
-class AArch64RegisterBankInfo;
-class AArch64RegisterInfo;
-class AArch64Subtarget;
-class AArch64TargetMachine;
-
-class AArch64InstructionSelector : public InstructionSelector {
-public:
- AArch64InstructionSelector(const AArch64TargetMachine &TM,
- const AArch64Subtarget &STI,
- const AArch64RegisterBankInfo &RBI);
-
- bool select(MachineInstr &I) const override;
-
-private:
- /// tblgen-erated 'select' implementation, used as the initial selector for
- /// the patterns that don't require complex C++.
- bool selectImpl(MachineInstr &I) const;
-
- const AArch64TargetMachine &TM;
- const AArch64Subtarget &STI;
- const AArch64InstrInfo &TII;
- const AArch64RegisterInfo &TRI;
- const AArch64RegisterBankInfo &RBI;
-};
-
-} // end namespace llvm
-
-#endif // LLVM_LIB_TARGET_AARCH64_AARCH64INSTRUCTIONSELECTOR_H
diff --git a/lib/Target/AArch64/AArch64LegalizerInfo.cpp b/lib/Target/AArch64/AArch64LegalizerInfo.cpp
index 83f276a8161b..6e6daf812295 100644
--- a/lib/Target/AArch64/AArch64LegalizerInfo.cpp
+++ b/lib/Target/AArch64/AArch64LegalizerInfo.cpp
@@ -13,7 +13,10 @@
//===----------------------------------------------------------------------===//
#include "AArch64LegalizerInfo.h"
+#include "llvm/CodeGen/MachineInstr.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
#include "llvm/CodeGen/ValueTypes.h"
+#include "llvm/CodeGen/GlobalISel/MachineIRBuilder.h"
#include "llvm/IR/Type.h"
#include "llvm/IR/DerivedTypes.h"
#include "llvm/Target/TargetOpcodes.h"
@@ -36,11 +39,14 @@ AArch64LegalizerInfo::AArch64LegalizerInfo() {
const LLT v4s32 = LLT::vector(4, 32);
const LLT v2s64 = LLT::vector(2, 64);
- for (auto BinOp : {G_ADD, G_SUB, G_MUL, G_AND, G_OR, G_XOR, G_SHL}) {
+ for (unsigned BinOp : {G_ADD, G_SUB, G_MUL, G_AND, G_OR, G_XOR, G_SHL}) {
// These operations naturally get the right answer when used on
// GPR32, even if the actual type is narrower.
- for (auto Ty : {s1, s8, s16, s32, s64, v2s32, v4s32, v2s64})
+ for (auto Ty : {s32, s64, v2s32, v4s32, v2s64})
setAction({BinOp, Ty}, Legal);
+
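+ // Narrower scalars are widened to 32 bits; e.g. an s8 G_ADD becomes an
+ // s32 G_ADD whose extra high bits are simply ignored.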
+ for (auto Ty : {s1, s8, s16})
+ setAction({BinOp, Ty}, WidenScalar);
}
setAction({G_GEP, p0}, Legal);
@@ -49,7 +55,9 @@ AArch64LegalizerInfo::AArch64LegalizerInfo() {
for (auto Ty : {s1, s8, s16, s32})
setAction({G_GEP, 1, Ty}, WidenScalar);
- for (auto BinOp : {G_LSHR, G_ASHR, G_SDIV, G_UDIV}) {
+ setAction({G_PTR_MASK, p0}, Legal);
+
+ for (unsigned BinOp : {G_LSHR, G_ASHR, G_SDIV, G_UDIV}) {
for (auto Ty : {s32, s64})
setAction({BinOp, Ty}, Legal);
@@ -57,25 +65,41 @@ AArch64LegalizerInfo::AArch64LegalizerInfo() {
setAction({BinOp, Ty}, WidenScalar);
}
- for (auto BinOp : { G_SREM, G_UREM })
+ for (unsigned BinOp : {G_SREM, G_UREM})
for (auto Ty : { s1, s8, s16, s32, s64 })
setAction({BinOp, Ty}, Lower);
- for (auto Op : { G_UADDE, G_USUBE, G_SADDO, G_SSUBO, G_SMULO, G_UMULO }) {
+ for (unsigned Op : {G_SMULO, G_UMULO})
+ setAction({Op, s64}, Lower);
+
+ for (unsigned Op : {G_UADDE, G_USUBE, G_SADDO, G_SSUBO, G_SMULH, G_UMULH}) {
for (auto Ty : { s32, s64 })
setAction({Op, Ty}, Legal);
setAction({Op, 1, s1}, Legal);
}
- for (auto BinOp : {G_FADD, G_FSUB, G_FMUL, G_FDIV})
+ for (unsigned BinOp : {G_FADD, G_FSUB, G_FMUL, G_FDIV})
for (auto Ty : {s32, s64})
setAction({BinOp, Ty}, Legal);
- setAction({G_FREM, s32}, Libcall);
- setAction({G_FREM, s64}, Libcall);
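+ // Neither frem nor fpow has a native AArch64 instruction, so both become
+ // libcalls (fmod/fmodf and pow/powf).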
+ for (unsigned BinOp : {G_FREM, G_FPOW}) {
+ setAction({BinOp, s32}, Libcall);
+ setAction({BinOp, s64}, Libcall);
+ }
- for (auto MemOp : {G_LOAD, G_STORE}) {
+ for (auto Ty : {s32, s64, p0}) {
+ setAction({G_INSERT, Ty}, Legal);
+ setAction({G_INSERT, 1, Ty}, Legal);
+ }
+ for (auto Ty : {s1, s8, s16}) {
+ setAction({G_INSERT, Ty}, WidenScalar);
+ setAction({G_INSERT, 1, Ty}, Legal);
+ // FIXME: Can't widen the sources because that violates the constraints on
+ // G_INSERT (it seems entirely reasonable that the inputs shouldn't overlap).
+ }
+
+ for (unsigned MemOp : {G_LOAD, G_STORE}) {
for (auto Ty : {s8, s16, s32, s64, p0, v2s32})
setAction({MemOp, Ty}, Legal);
@@ -141,12 +165,18 @@ AArch64LegalizerInfo::AArch64LegalizerInfo() {
setAction({G_TRUNC, 1, Ty}, Legal);
// Conversions
- for (auto Ty : { s1, s8, s16, s32, s64 }) {
+ for (auto Ty : { s32, s64 }) {
setAction({G_FPTOSI, 0, Ty}, Legal);
setAction({G_FPTOUI, 0, Ty}, Legal);
setAction({G_SITOFP, 1, Ty}, Legal);
setAction({G_UITOFP, 1, Ty}, Legal);
}
+ for (auto Ty : { s1, s8, s16 }) {
+ setAction({G_FPTOSI, 0, Ty}, WidenScalar);
+ setAction({G_FPTOUI, 0, Ty}, WidenScalar);
+ setAction({G_SITOFP, 1, Ty}, WidenScalar);
+ setAction({G_UITOFP, 1, Ty}, WidenScalar);
+ }
for (auto Ty : { s32, s64 }) {
setAction({G_FPTOSI, 1, Ty}, Legal);
@@ -158,9 +188,13 @@ AArch64LegalizerInfo::AArch64LegalizerInfo() {
// Control-flow
for (auto Ty : {s1, s8, s16, s32})
setAction({G_BRCOND, Ty}, Legal);
+ setAction({G_BRINDIRECT, p0}, Legal);
// Select
- for (auto Ty : {s1, s8, s16, s32, s64})
+ for (auto Ty : {s1, s8, s16})
+ setAction({G_SELECT, Ty}, WidenScalar);
+
+ for (auto Ty : {s32, s64, p0})
setAction({G_SELECT, Ty}, Legal);
setAction({G_SELECT, 1, s1}, Legal);
@@ -200,5 +234,82 @@ AArch64LegalizerInfo::AArch64LegalizerInfo() {
setAction({G_BITCAST, 1, LLT::vector(32/EltSize, EltSize)}, Legal);
}
+ setAction({G_VASTART, p0}, Legal);
+
+ // va_list must be a pointer, but most sized types are pretty easy to handle
+ // as the destination.
+ setAction({G_VAARG, 1, p0}, Legal);
+
+ for (auto Ty : {s8, s16, s32, s64, p0})
+ setAction({G_VAARG, Ty}, Custom);
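+ // (G_VAARG is expanded by legalizeVaArg below.)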
+
computeTables();
}
+
+bool AArch64LegalizerInfo::legalizeCustom(MachineInstr &MI,
+ MachineRegisterInfo &MRI,
+ MachineIRBuilder &MIRBuilder) const {
+ switch (MI.getOpcode()) {
+ default:
+ // No idea what to do.
+ return false;
+ case TargetOpcode::G_VAARG:
+ return legalizeVaArg(MI, MRI, MIRBuilder);
+ }
+
+ llvm_unreachable("expected switch to return");
+}
+
+bool AArch64LegalizerInfo::legalizeVaArg(MachineInstr &MI,
+ MachineRegisterInfo &MRI,
+ MachineIRBuilder &MIRBuilder) const {
+ MIRBuilder.setInstr(MI);
+ MachineFunction &MF = MIRBuilder.getMF();
+ unsigned Align = MI.getOperand(2).getImm();
+ unsigned Dst = MI.getOperand(0).getReg();
+ unsigned ListPtr = MI.getOperand(1).getReg();
+
+ LLT PtrTy = MRI.getType(ListPtr);
+ LLT IntPtrTy = LLT::scalar(PtrTy.getSizeInBits());
+
+ const unsigned PtrSize = PtrTy.getSizeInBits() / 8;
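+ // Load the address of the next argument out of the va_list.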
+ unsigned List = MRI.createGenericVirtualRegister(PtrTy);
+ MIRBuilder.buildLoad(
+ List, ListPtr,
+ *MF.getMachineMemOperand(MachinePointerInfo(), MachineMemOperand::MOLoad,
+ PtrSize, /* Align = */ PtrSize));
+
+ unsigned DstPtr;
+ if (Align > PtrSize) {
+ // Realign the list to the actual required alignment.
+ unsigned AlignMinus1 = MRI.createGenericVirtualRegister(IntPtrTy);
+ MIRBuilder.buildConstant(AlignMinus1, Align - 1);
+
+ unsigned ListTmp = MRI.createGenericVirtualRegister(PtrTy);
+ MIRBuilder.buildGEP(ListTmp, List, AlignMinus1);
+
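+ // i.e. DstPtr = (List + Align - 1) & ~(Align - 1).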
+ DstPtr = MRI.createGenericVirtualRegister(PtrTy);
+ MIRBuilder.buildPtrMask(DstPtr, ListTmp, Log2_64(Align));
+ } else
+ DstPtr = List;
+
+ uint64_t ValSize = MRI.getType(Dst).getSizeInBits() / 8;
+ MIRBuilder.buildLoad(
+ Dst, DstPtr,
+ *MF.getMachineMemOperand(MachinePointerInfo(), MachineMemOperand::MOLoad,
+ ValSize, std::max(Align, PtrSize)));
+
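+ // Bump the va_list pointer past this argument, rounded up to a multiple
+ // of the pointer size.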
+ unsigned SizeReg = MRI.createGenericVirtualRegister(IntPtrTy);
+ MIRBuilder.buildConstant(SizeReg, alignTo(ValSize, PtrSize));
+
+ unsigned NewList = MRI.createGenericVirtualRegister(PtrTy);
+ MIRBuilder.buildGEP(NewList, DstPtr, SizeReg);
+
+ MIRBuilder.buildStore(
+ NewList, ListPtr,
+ *MF.getMachineMemOperand(MachinePointerInfo(), MachineMemOperand::MOStore,
+ PtrSize, /* Align = */ PtrSize));
+
+ MI.eraseFromParent();
+ return true;
+}
diff --git a/lib/Target/AArch64/AArch64LegalizerInfo.h b/lib/Target/AArch64/AArch64LegalizerInfo.h
index feacbef9f147..42d4ac130c5c 100644
--- a/lib/Target/AArch64/AArch64LegalizerInfo.h
+++ b/lib/Target/AArch64/AArch64LegalizerInfo.h
@@ -25,6 +25,13 @@ class LLVMContext;
class AArch64LegalizerInfo : public LegalizerInfo {
public:
AArch64LegalizerInfo();
+
+ bool legalizeCustom(MachineInstr &MI, MachineRegisterInfo &MRI,
+ MachineIRBuilder &MIRBuilder) const override;
+
+private:
+ bool legalizeVaArg(MachineInstr &MI, MachineRegisterInfo &MRI,
+ MachineIRBuilder &MIRBuilder) const;
};
} // End llvm namespace.
#endif
diff --git a/lib/Target/AArch64/AArch64LoadStoreOptimizer.cpp b/lib/Target/AArch64/AArch64LoadStoreOptimizer.cpp
index 8e312dcf276f..976498aa70d6 100644
--- a/lib/Target/AArch64/AArch64LoadStoreOptimizer.cpp
+++ b/lib/Target/AArch64/AArch64LoadStoreOptimizer.cpp
@@ -16,19 +16,29 @@
#include "AArch64Subtarget.h"
#include "MCTargetDesc/AArch64AddressingModes.h"
#include "llvm/ADT/BitVector.h"
+#include "llvm/ADT/iterator_range.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/ADT/Statistic.h"
+#include "llvm/ADT/StringRef.h"
#include "llvm/CodeGen/MachineBasicBlock.h"
+#include "llvm/CodeGen/MachineFunction.h"
#include "llvm/CodeGen/MachineFunctionPass.h"
#include "llvm/CodeGen/MachineInstr.h"
#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/MachineOperand.h"
+#include "llvm/IR/DebugLoc.h"
+#include "llvm/MC/MCRegisterInfo.h"
+#include "llvm/Pass.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/ErrorHandling.h"
#include "llvm/Support/raw_ostream.h"
-#include "llvm/Target/TargetInstrInfo.h"
-#include "llvm/Target/TargetMachine.h"
#include "llvm/Target/TargetRegisterInfo.h"
+#include <cassert>
+#include <cstdint>
+#include <iterator>
+#include <limits>
+
using namespace llvm;
#define DEBUG_TYPE "aarch64-ldst-opt"
@@ -58,15 +68,15 @@ typedef struct LdStPairFlags {
// If a matching instruction is found, MergeForward is set to true if the
// merge is to remove the first instruction and replace the second with
// a pair-wise insn, and false if the reverse is true.
- bool MergeForward;
+ bool MergeForward = false;
// SExtIdx gives the index of the result of the load pair that must be
// extended. The value of SExtIdx assumes that the paired load produces the
// value in this order: (I, returned iterator), i.e., -1 means no value has
// to be extended, 0 means I, and 1 means the returned iterator.
- int SExtIdx;
+ int SExtIdx = -1;
- LdStPairFlags() : MergeForward(false), SExtIdx(-1) {}
+ LdStPairFlags() = default;
void setMergeForward(bool V = true) { MergeForward = V; }
bool getMergeForward() const { return MergeForward; }
@@ -78,10 +88,12 @@ typedef struct LdStPairFlags {
struct AArch64LoadStoreOpt : public MachineFunctionPass {
static char ID;
+
AArch64LoadStoreOpt() : MachineFunctionPass(ID) {
initializeAArch64LoadStoreOptPass(*PassRegistry::getPassRegistry());
}
+ AliasAnalysis *AA;
const AArch64InstrInfo *TII;
const TargetRegisterInfo *TRI;
const AArch64Subtarget *Subtarget;
@@ -89,6 +101,11 @@ struct AArch64LoadStoreOpt : public MachineFunctionPass {
// Track which registers have been modified and used.
BitVector ModifiedRegs, UsedRegs;
+ void getAnalysisUsage(AnalysisUsage &AU) const override {
+ AU.addRequired<AAResultsWrapperPass>();
+ MachineFunctionPass::getAnalysisUsage(AU);
+ }
+
// Scan the instructions looking for a load/store that can be combined
// with the current instruction into a load/store pair.
// Return the matching instruction if one is found, else MBB->end().
@@ -162,8 +179,10 @@ struct AArch64LoadStoreOpt : public MachineFunctionPass {
StringRef getPassName() const override { return AARCH64_LOAD_STORE_OPT_NAME; }
};
+
char AArch64LoadStoreOpt::ID = 0;
-} // namespace
+
+} // end anonymous namespace
INITIALIZE_PASS(AArch64LoadStoreOpt, "aarch64-ldst-opt",
AARCH64_LOAD_STORE_OPT_NAME, false, false)
@@ -246,7 +265,7 @@ static unsigned getMatchingNonSExtOpcode(unsigned Opc,
default:
if (IsValidLdStrOpc)
*IsValidLdStrOpc = false;
- return UINT_MAX;
+ return std::numeric_limits<unsigned>::max();
case AArch64::STRDui:
case AArch64::STURDi:
case AArch64::STRQui:
@@ -595,7 +614,7 @@ AArch64LoadStoreOpt::mergeNarrowZeroStores(MachineBasicBlock::iterator I,
MachineInstrBuilder MIB;
MIB = BuildMI(*MBB, InsertionPoint, DL, TII->get(getMatchingWideOpcode(Opc)))
.addReg(isNarrowStore(Opc) ? AArch64::WZR : AArch64::XZR)
- .addOperand(BaseRegOp)
+ .add(BaseRegOp)
.addImm(OffsetImm)
.setMemRefs(I->mergeMemRefsWith(*MergeMI));
(void)MIB;
@@ -709,9 +728,9 @@ AArch64LoadStoreOpt::mergePairedInsns(MachineBasicBlock::iterator I,
}
}
MIB = BuildMI(*MBB, InsertionPoint, DL, TII->get(getMatchingPairOpcode(Opc)))
- .addOperand(RegOp0)
- .addOperand(RegOp1)
- .addOperand(BaseRegOp)
+ .add(RegOp0)
+ .add(RegOp1)
+ .add(BaseRegOp)
.addImm(OffsetImm)
.setMemRefs(I->mergeMemRefsWith(*Paired));
@@ -923,7 +942,7 @@ static int alignTo(int Num, int PowOf2) {
}
static bool mayAlias(MachineInstr &MIa, MachineInstr &MIb,
- const AArch64InstrInfo *TII) {
+ AliasAnalysis *AA) {
// One of the instructions must modify memory.
if (!MIa.mayStore() && !MIb.mayStore())
return false;
@@ -932,14 +951,14 @@ static bool mayAlias(MachineInstr &MIa, MachineInstr &MIb,
if (!MIa.mayLoadOrStore() && !MIb.mayLoadOrStore())
return false;
- return !TII->areMemAccessesTriviallyDisjoint(MIa, MIb);
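+ // Query AliasAnalysis, which can disambiguate accesses that the target's
+ // trivially-disjoint check cannot.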
+ return MIa.mayAlias(AA, MIb, /*UseTBAA*/false);
}
static bool mayAlias(MachineInstr &MIa,
SmallVectorImpl<MachineInstr *> &MemInsns,
- const AArch64InstrInfo *TII) {
+ AliasAnalysis *AA) {
for (MachineInstr *MIb : MemInsns)
- if (mayAlias(MIa, *MIb, TII))
+ if (mayAlias(MIa, *MIb, AA))
return true;
return false;
@@ -997,7 +1016,7 @@ bool AArch64LoadStoreOpt::findMatchingStore(
return false;
// If we encounter a store aliased with the load, return early.
- if (MI.mayStore() && mayAlias(LoadMI, MI, TII))
+ if (MI.mayStore() && mayAlias(LoadMI, MI, AA))
return false;
} while (MBBI != B && Count < Limit);
return false;
@@ -1167,7 +1186,7 @@ AArch64LoadStoreOpt::findMatchingInsn(MachineBasicBlock::iterator I,
// first.
if (!ModifiedRegs[getLdStRegOp(MI).getReg()] &&
!(MI.mayLoad() && UsedRegs[getLdStRegOp(MI).getReg()]) &&
- !mayAlias(MI, MemInsns, TII)) {
+ !mayAlias(MI, MemInsns, AA)) {
Flags.setMergeForward(false);
return MBBI;
}
@@ -1178,7 +1197,7 @@ AArch64LoadStoreOpt::findMatchingInsn(MachineBasicBlock::iterator I,
// into the second.
if (!ModifiedRegs[getLdStRegOp(FirstMI).getReg()] &&
!(MayLoad && UsedRegs[getLdStRegOp(FirstMI).getReg()]) &&
- !mayAlias(FirstMI, MemInsns, TII)) {
+ !mayAlias(FirstMI, MemInsns, AA)) {
Flags.setMergeForward(true);
return MBBI;
}
@@ -1233,19 +1252,19 @@ AArch64LoadStoreOpt::mergeUpdateInsn(MachineBasicBlock::iterator I,
if (!isPairedLdSt(*I)) {
// Non-paired instruction.
MIB = BuildMI(*I->getParent(), I, I->getDebugLoc(), TII->get(NewOpc))
- .addOperand(getLdStRegOp(*Update))
- .addOperand(getLdStRegOp(*I))
- .addOperand(getLdStBaseOp(*I))
+ .add(getLdStRegOp(*Update))
+ .add(getLdStRegOp(*I))
+ .add(getLdStBaseOp(*I))
.addImm(Value)
.setMemRefs(I->memoperands_begin(), I->memoperands_end());
} else {
// Paired instruction.
int Scale = getMemScale(*I);
MIB = BuildMI(*I->getParent(), I, I->getDebugLoc(), TII->get(NewOpc))
- .addOperand(getLdStRegOp(*Update))
- .addOperand(getLdStRegOp(*I, 0))
- .addOperand(getLdStRegOp(*I, 1))
- .addOperand(getLdStBaseOp(*I))
+ .add(getLdStRegOp(*Update))
+ .add(getLdStRegOp(*I, 0))
+ .add(getLdStRegOp(*I, 1))
+ .add(getLdStBaseOp(*I))
.addImm(Value / Scale)
.setMemRefs(I->memoperands_begin(), I->memoperands_end());
}
@@ -1545,7 +1564,7 @@ bool AArch64LoadStoreOpt::optimizeBlock(MachineBasicBlock &MBB,
case AArch64::LDURBBi:
case AArch64::LDURHHi:
case AArch64::LDURWi:
- case AArch64::LDURXi: {
+ case AArch64::LDURXi:
if (tryToPromoteLoadFromStore(MBBI)) {
Modified = true;
break;
@@ -1553,7 +1572,6 @@ bool AArch64LoadStoreOpt::optimizeBlock(MachineBasicBlock &MBB,
++MBBI;
break;
}
- }
}
// 2) Merge adjacent zero stores into a wider store.
// e.g.,
@@ -1722,6 +1740,7 @@ bool AArch64LoadStoreOpt::runOnMachineFunction(MachineFunction &Fn) {
Subtarget = &static_cast<const AArch64Subtarget &>(Fn.getSubtarget());
TII = static_cast<const AArch64InstrInfo *>(Subtarget->getInstrInfo());
TRI = Subtarget->getRegisterInfo();
+ AA = &getAnalysis<AAResultsWrapperPass>().getAAResults();
// Resize the modified and used register bitfield trackers. We do this once
// per function and then clear the bitfield each time we optimize a load or
diff --git a/lib/Target/AArch64/AArch64MacroFusion.cpp b/lib/Target/AArch64/AArch64MacroFusion.cpp
new file mode 100644
index 000000000000..a6926a6700e1
--- /dev/null
+++ b/lib/Target/AArch64/AArch64MacroFusion.cpp
@@ -0,0 +1,272 @@
+//===- AArch64MacroFusion.cpp - AArch64 Macro Fusion ----------------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// \file This file contains the AArch64 implementation of the DAG scheduling
+// mutation to pair instructions back to back.
+//
+//===----------------------------------------------------------------------===//
+
+#include "AArch64MacroFusion.h"
+#include "AArch64Subtarget.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Target/TargetInstrInfo.h"
+
+#define DEBUG_TYPE "misched"
+
+STATISTIC(NumFused, "Number of instr pairs fused");
+
+using namespace llvm;
+
+static cl::opt<bool> EnableMacroFusion("aarch64-misched-fusion", cl::Hidden,
+ cl::desc("Enable scheduling for macro fusion."), cl::init(true));
+
+namespace {
+
+/// \brief Verify that the instr pair, FirstMI and SecondMI, should be fused
+/// together. Given an anchor instr, if the other instr is unspecified, check
+/// whether the anchor instr may be part of any fused pair at all.
+static bool shouldScheduleAdjacent(const TargetInstrInfo &TII,
+ const TargetSubtargetInfo &TSI,
+ const MachineInstr *FirstMI,
+ const MachineInstr *SecondMI) {
+ assert((FirstMI || SecondMI) && "At least one instr must be specified");
+
+ const AArch64InstrInfo &II = static_cast<const AArch64InstrInfo&>(TII);
+ const AArch64Subtarget &ST = static_cast<const AArch64Subtarget&>(TSI);
+
+ // Assume wildcards for unspecified instrs.
+ unsigned FirstOpcode =
+ FirstMI ? FirstMI->getOpcode()
+ : static_cast<unsigned>(AArch64::INSTRUCTION_LIST_END);
+ unsigned SecondOpcode =
+ SecondMI ? SecondMI->getOpcode()
+ : static_cast<unsigned>(AArch64::INSTRUCTION_LIST_END);
+
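+ // e.g. the query (ADRP, INSTRUCTION_LIST_END) asks whether an ADRP may
+ // begin some fused pair without committing to a second instruction.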
+ if (ST.hasArithmeticBccFusion())
+ // Fuse CMN, CMP, TST followed by Bcc.
+ if (SecondOpcode == AArch64::Bcc)
+ switch (FirstOpcode) {
+ default:
+ return false;
+ case AArch64::ADDSWri:
+ case AArch64::ADDSWrr:
+ case AArch64::ADDSXri:
+ case AArch64::ADDSXrr:
+ case AArch64::ANDSWri:
+ case AArch64::ANDSWrr:
+ case AArch64::ANDSXri:
+ case AArch64::ANDSXrr:
+ case AArch64::SUBSWri:
+ case AArch64::SUBSWrr:
+ case AArch64::SUBSXri:
+ case AArch64::SUBSXrr:
+ case AArch64::BICSWrr:
+ case AArch64::BICSXrr:
+ return true;
+ case AArch64::ADDSWrs:
+ case AArch64::ADDSXrs:
+ case AArch64::ANDSWrs:
+ case AArch64::ANDSXrs:
+ case AArch64::SUBSWrs:
+ case AArch64::SUBSXrs:
+ case AArch64::BICSWrs:
+ case AArch64::BICSXrs:
+ // Shift value can be 0, making these behave like the "rr" variant...
+ return !II.hasShiftedReg(*FirstMI);
+ case AArch64::INSTRUCTION_LIST_END:
+ return true;
+ }
+
+ if (ST.hasArithmeticCbzFusion())
+ // Fuse ALU operations followed by CBZ/CBNZ.
+ if (SecondOpcode == AArch64::CBNZW || SecondOpcode == AArch64::CBNZX ||
+ SecondOpcode == AArch64::CBZW || SecondOpcode == AArch64::CBZX)
+ switch (FirstOpcode) {
+ default:
+ return false;
+ case AArch64::ADDWri:
+ case AArch64::ADDWrr:
+ case AArch64::ADDXri:
+ case AArch64::ADDXrr:
+ case AArch64::ANDWri:
+ case AArch64::ANDWrr:
+ case AArch64::ANDXri:
+ case AArch64::ANDXrr:
+ case AArch64::EORWri:
+ case AArch64::EORWrr:
+ case AArch64::EORXri:
+ case AArch64::EORXrr:
+ case AArch64::ORRWri:
+ case AArch64::ORRWrr:
+ case AArch64::ORRXri:
+ case AArch64::ORRXrr:
+ case AArch64::SUBWri:
+ case AArch64::SUBWrr:
+ case AArch64::SUBXri:
+ case AArch64::SUBXrr:
+ return true;
+ case AArch64::ADDWrs:
+ case AArch64::ADDXrs:
+ case AArch64::ANDWrs:
+ case AArch64::ANDXrs:
+ case AArch64::SUBWrs:
+ case AArch64::SUBXrs:
+ case AArch64::BICWrs:
+ case AArch64::BICXrs:
+ // Shift value can be 0, making these behave like the "rr" variant...
+ return !II.hasShiftedReg(*FirstMI);
+ case AArch64::INSTRUCTION_LIST_END:
+ return true;
+ }
+
+ if (ST.hasFuseAES())
+ // Fuse AES crypto operations.
+ switch (FirstOpcode) {
+ // AES encode.
+ case AArch64::AESErr:
+ return SecondOpcode == AArch64::AESMCrr ||
+ SecondOpcode == AArch64::INSTRUCTION_LIST_END;
+ // AES decode.
+ case AArch64::AESDrr:
+ return SecondOpcode == AArch64::AESIMCrr ||
+ SecondOpcode == AArch64::INSTRUCTION_LIST_END;
+ }
+
+ if (ST.hasFuseLiterals())
+ // Fuse literal generation operations.
+ switch (FirstOpcode) {
+ // PC relative address.
+ case AArch64::ADRP:
+ return SecondOpcode == AArch64::ADDXri ||
+ SecondOpcode == AArch64::INSTRUCTION_LIST_END;
+ // 32 bit immediate.
+ case AArch64::MOVZWi:
+ return (SecondOpcode == AArch64::MOVKWi &&
+ SecondMI->getOperand(3).getImm() == 16) ||
+ SecondOpcode == AArch64::INSTRUCTION_LIST_END;
+ // Lower half of 64 bit immediate.
+ case AArch64::MOVZXi:
+ return (SecondOpcode == AArch64::MOVKXi &&
+ SecondMI->getOperand(3).getImm() == 16) ||
+ SecondOpcode == AArch64::INSTRUCTION_LIST_END;
+ // Upper half of 64 bit immediate.
+ case AArch64::MOVKXi:
+ return FirstMI->getOperand(3).getImm() == 32 &&
+ ((SecondOpcode == AArch64::MOVKXi &&
+ SecondMI->getOperand(3).getImm() == 48) ||
+ SecondOpcode == AArch64::INSTRUCTION_LIST_END);
+ }
+
+ return false;
+}
+
+/// \brief Implement the fusion of instr pairs in the scheduling DAG,
+/// anchored at the instr in AnchorSU.
+static bool scheduleAdjacentImpl(ScheduleDAGMI *DAG, SUnit &AnchorSU) {
+ const MachineInstr *AnchorMI = AnchorSU.getInstr();
+ if (!AnchorMI || AnchorMI->isPseudo() || AnchorMI->isTransient())
+ return false;
+
+ // If the anchor instr is the ExitSU, then consider its predecessors;
+ // otherwise, its successors.
+ bool Preds = (&AnchorSU == &DAG->ExitSU);
+ SmallVectorImpl<SDep> &AnchorDeps = Preds ? AnchorSU.Preds : AnchorSU.Succs;
+
+ const MachineInstr *FirstMI = Preds ? nullptr : AnchorMI;
+ const MachineInstr *SecondMI = Preds ? AnchorMI : nullptr;
+
+ // Check if the anchor instr may be fused.
+ if (!shouldScheduleAdjacent(*DAG->TII, DAG->MF.getSubtarget(),
+ FirstMI, SecondMI))
+ return false;
+
+ // Explore fusion candidates among the dependencies of the anchor instr.
+ for (SDep &Dep : AnchorDeps) {
+ // Ignore dependencies that don't enforce ordering.
+ if (Dep.isWeak())
+ continue;
+
+ SUnit &DepSU = *Dep.getSUnit();
+ // Ignore the ExitSU if the dependents are successors.
+ if (!Preds && &DepSU == &DAG->ExitSU)
+ continue;
+
+ const MachineInstr *DepMI = DepSU.getInstr();
+ if (!DepMI || DepMI->isPseudo() || DepMI->isTransient())
+ continue;
+
+ FirstMI = Preds ? DepMI : AnchorMI;
+ SecondMI = Preds ? AnchorMI : DepMI;
+ if (!shouldScheduleAdjacent(*DAG->TII, DAG->MF.getSubtarget(),
+ FirstMI, SecondMI))
+ continue;
+
+ // Create a single weak edge between the adjacent instrs. The only effect is
+ // to cause bottom-up scheduling to heavily prioritize the clustered instrs.
+ SUnit &FirstSU = Preds ? DepSU : AnchorSU;
+ SUnit &SecondSU = Preds ? AnchorSU : DepSU;
+ DAG->addEdge(&SecondSU, SDep(&FirstSU, SDep::Cluster));
+
+ // Adjust the latency between the anchor instr and its
+ // predecessors/successors.
+ for (SDep &IDep : AnchorDeps)
+ if (IDep.getSUnit() == &DepSU)
+ IDep.setLatency(0);
+
+ // Adjust the latency between the dependent instr and its
+ // successors/predecessors.
+ for (SDep &IDep : Preds ? DepSU.Succs : DepSU.Preds)
+ if (IDep.getSUnit() == &AnchorSU)
+ IDep.setLatency(0);
+
+ DEBUG(dbgs() << DAG->MF.getName() << "(): Macro fuse ";
+ FirstSU.print(dbgs(), DAG); dbgs() << " - ";
+ SecondSU.print(dbgs(), DAG); dbgs() << " / ";
+ dbgs() << DAG->TII->getName(FirstMI->getOpcode()) << " - " <<
+ DAG->TII->getName(SecondMI->getOpcode()) << '\n'; );
+
+ ++NumFused;
+ return true;
+ }
+
+ return false;
+}
+
+/// \brief Post-process the DAG to create cluster edges between instrs that may
+/// be fused by the processor into a single operation.
+class AArch64MacroFusion : public ScheduleDAGMutation {
+public:
+ AArch64MacroFusion() = default;
+
+ void apply(ScheduleDAGInstrs *DAGInstrs) override;
+};
+
+void AArch64MacroFusion::apply(ScheduleDAGInstrs *DAGInstrs) {
+ ScheduleDAGMI *DAG = static_cast<ScheduleDAGMI*>(DAGInstrs);
+
+ // For each of the SUnits in the scheduling block, try to fuse the instr in it
+ // with one in its successors.
+ for (SUnit &ISU : DAG->SUnits)
+ scheduleAdjacentImpl(DAG, ISU);
+
+ // Try to fuse the instr in the ExitSU with one in its predecessors.
+ scheduleAdjacentImpl(DAG, DAG->ExitSU);
+}
+
+} // end anonymous namespace
+
+namespace llvm {
+
+std::unique_ptr<ScheduleDAGMutation> createAArch64MacroFusionDAGMutation() {
+ return EnableMacroFusion ? make_unique<AArch64MacroFusion>() : nullptr;
+}
+
+} // end namespace llvm
diff --git a/lib/Target/AArch64/AArch64MacroFusion.h b/lib/Target/AArch64/AArch64MacroFusion.h
new file mode 100644
index 000000000000..e5efedd9fbfd
--- /dev/null
+++ b/lib/Target/AArch64/AArch64MacroFusion.h
@@ -0,0 +1,29 @@
+//===- AArch64MacroFusion.h - AArch64 Macro Fusion ------------------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains the AArch64 definition of the DAG scheduling mutation
+// to pair instructions back to back.
+//
+//===----------------------------------------------------------------------===//
+
+#include "AArch64InstrInfo.h"
+#include "llvm/CodeGen/MachineScheduler.h"
+
+//===----------------------------------------------------------------------===//
+// AArch64MacroFusion - DAG post-processing to encourage fusion of macro ops.
+//===----------------------------------------------------------------------===//
+
+namespace llvm {
+
+/// Note that you have to add:
+/// DAG.addMutation(createAArch64MacroFusionDAGMutation());
+/// to AArch64PassConfig::createMachineScheduler() to have an effect.
+std::unique_ptr<ScheduleDAGMutation> createAArch64MacroFusionDAGMutation();
+
+} // end namespace llvm
+
+#endif // LLVM_LIB_TARGET_AARCH64_AARCH64MACROFUSION_H
diff --git a/lib/Target/AArch64/AArch64RedundantCopyElimination.cpp b/lib/Target/AArch64/AArch64RedundantCopyElimination.cpp
index 8f45e6a80a36..f3c8e7e9bdc2 100644
--- a/lib/Target/AArch64/AArch64RedundantCopyElimination.cpp
+++ b/lib/Target/AArch64/AArch64RedundantCopyElimination.cpp
@@ -12,13 +12,14 @@
// CBZW %W0, <BB#2>
// BB#2:
// %W0 = COPY %WZR
-// This pass should be run after register allocation.
+// Similarly, this pass also handles non-zero copies.
+// BB#0:
+// cmp x0, #1
+// b.eq .LBB0_1
+// .LBB0_1:
+// orr x0, xzr, #0x1
//
-// FIXME: This should be extended to handle any constant other than zero. E.g.,
-// cmp w0, #1
-// b.eq .BB1
-// BB1:
-// mov w0, #1
+// This pass should be run after register allocation.
//
// FIXME: This could also be extended to check the whole dominance subtree below
// the comparison if the compile time regression is acceptable.
@@ -26,6 +27,7 @@
//===----------------------------------------------------------------------===//
#include "AArch64.h"
+#include "llvm/ADT/Optional.h"
#include "llvm/ADT/SetVector.h"
#include "llvm/ADT/Statistic.h"
#include "llvm/ADT/iterator_range.h"
@@ -43,6 +45,7 @@ namespace {
class AArch64RedundantCopyElimination : public MachineFunctionPass {
const MachineRegisterInfo *MRI;
const TargetRegisterInfo *TRI;
+ BitVector ClobberedRegs;
public:
static char ID;
@@ -50,6 +53,16 @@ public:
initializeAArch64RedundantCopyEliminationPass(
*PassRegistry::getPassRegistry());
}
+
+ struct RegImm {
+ MCPhysReg Reg;
+ int32_t Imm;
+ RegImm(MCPhysReg Reg, int32_t Imm) : Reg(Reg), Imm(Imm) {}
+ };
+
+ Optional<RegImm> knownRegValInBlock(MachineInstr &CondBr,
+ MachineBasicBlock *MBB,
+ MachineBasicBlock::iterator &FirstUse);
bool optimizeCopy(MachineBasicBlock *MBB);
bool runOnMachineFunction(MachineFunction &MF) override;
MachineFunctionProperties getRequiredProperties() const override {
@@ -66,18 +79,120 @@ char AArch64RedundantCopyElimination::ID = 0;
INITIALIZE_PASS(AArch64RedundantCopyElimination, "aarch64-copyelim",
"AArch64 redundant copy elimination pass", false, false)
-static bool guaranteesZeroRegInBlock(MachineInstr &MI, MachineBasicBlock *MBB) {
- unsigned Opc = MI.getOpcode();
+/// Remember what registers the specified instruction modifies.
+static void trackRegDefs(const MachineInstr &MI, BitVector &ClobberedRegs,
+ const TargetRegisterInfo *TRI) {
+ for (const MachineOperand &MO : MI.operands()) {
+ if (MO.isRegMask()) {
+ ClobberedRegs.setBitsNotInMask(MO.getRegMask());
+ continue;
+ }
+
+ if (!MO.isReg())
+ continue;
+ unsigned Reg = MO.getReg();
+ if (!Reg)
+ continue;
+ if (!MO.isDef())
+ continue;
+
+ for (MCRegAliasIterator AI(Reg, TRI, true); AI.isValid(); ++AI)
+ ClobberedRegs.set(*AI);
+ }
+}
+
+/// It's possible to determine the value of a register based on a dominating
+/// condition. To do so, this function checks whether the basic block \p MBB
+/// is the target of a conditional branch \p CondBr whose equality comparison
+/// is against a constant. If so, return the known physical register and
+/// constant value pair. Otherwise, return None.
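+///
+/// For example, if the predecessor block ends in 'CMP w8, #7; B.EQ <MBB>',
+/// then on entry to \p MBB we know w8 == 7 and return RegImm(W8, 7).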
+Optional<AArch64RedundantCopyElimination::RegImm>
+AArch64RedundantCopyElimination::knownRegValInBlock(
+ MachineInstr &CondBr, MachineBasicBlock *MBB,
+ MachineBasicBlock::iterator &FirstUse) {
+ unsigned Opc = CondBr.getOpcode();
+
// Check if the current basic block is the target block to which the
// CBZ/CBNZ instruction jumps when its Wt/Xt is zero.
- if ((Opc == AArch64::CBZW || Opc == AArch64::CBZX) &&
- MBB == MI.getOperand(1).getMBB())
- return true;
- else if ((Opc == AArch64::CBNZW || Opc == AArch64::CBNZX) &&
- MBB != MI.getOperand(1).getMBB())
- return true;
-
- return false;
+ if (((Opc == AArch64::CBZW || Opc == AArch64::CBZX) &&
+ MBB == CondBr.getOperand(1).getMBB()) ||
+ ((Opc == AArch64::CBNZW || Opc == AArch64::CBNZX) &&
+ MBB != CondBr.getOperand(1).getMBB())) {
+ FirstUse = CondBr;
+ return RegImm(CondBr.getOperand(0).getReg(), 0);
+ }
+
+ // Otherwise, must be a conditional branch.
+ if (Opc != AArch64::Bcc)
+ return None;
+
+ // Must be an equality check (i.e., == or !=).
+ AArch64CC::CondCode CC = (AArch64CC::CondCode)CondBr.getOperand(0).getImm();
+ if (CC != AArch64CC::EQ && CC != AArch64CC::NE)
+ return None;
+
+ MachineBasicBlock *BrTarget = CondBr.getOperand(1).getMBB();
+ if ((CC == AArch64CC::EQ && BrTarget != MBB) ||
+ (CC == AArch64CC::NE && BrTarget == MBB))
+ return None;
+
+ // Stop if we get to the beginning of PredMBB.
+ MachineBasicBlock *PredMBB = *MBB->pred_begin();
+ assert(PredMBB == CondBr.getParent() &&
+ "Conditional branch not in predecessor block!");
+ if (CondBr == PredMBB->begin())
+ return None;
+
+ // Registers clobbered in PredMBB between CondBr instruction and current
+ // instruction being checked in loop.
+ ClobberedRegs.reset();
+
+ // Find compare instruction that sets NZCV used by CondBr.
+ MachineBasicBlock::reverse_iterator RIt = CondBr.getReverseIterator();
+ for (MachineInstr &PredI : make_range(std::next(RIt), PredMBB->rend())) {
+
+ // Track clobbered registers.
+ trackRegDefs(PredI, ClobberedRegs, TRI);
+
+ bool IsCMN = false;
+ switch (PredI.getOpcode()) {
+ default:
+ break;
+
+ // CMN is an alias for ADDS with a dead destination register.
+ case AArch64::ADDSWri:
+ case AArch64::ADDSXri:
+ IsCMN = true;
+ LLVM_FALLTHROUGH;
+ // CMP is an alias for SUBS with a dead destination register.
+ case AArch64::SUBSWri:
+ case AArch64::SUBSXri: {
+ MCPhysReg SrcReg = PredI.getOperand(1).getReg();
+
+ // Must not be a symbolic immediate.
+ if (!PredI.getOperand(2).isImm())
+ return None;
+
+ // The src register must not be modified between the cmp and conditional
+ // branch. This includes a self-clobbering compare.
+ if (ClobberedRegs[SrcReg])
+ return None;
+
+ // We've found the Cmp that sets NZCV.
+ int32_t KnownImm = PredI.getOperand(2).getImm();
+ int32_t Shift = PredI.getOperand(3).getImm();
+ KnownImm <<= Shift;
+ if (IsCMN)
+ KnownImm = -KnownImm;
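+ // e.g. 'CMN w2, #5; B.EQ <MBB>' proves w2 == -5 on entry to MBB.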
+ FirstUse = PredI;
+ return RegImm(SrcReg, KnownImm);
+ }
+ }
+
+ // Bail if we see an instruction that defines NZCV that we don't handle.
+ if (PredI.definesRegister(AArch64::NZCV))
+ return None;
+ }
+ return None;
}
bool AArch64RedundantCopyElimination::optimizeCopy(MachineBasicBlock *MBB) {
@@ -85,79 +200,187 @@ bool AArch64RedundantCopyElimination::optimizeCopy(MachineBasicBlock *MBB) {
if (MBB->pred_size() != 1)
return false;
+ // Check if the predecessor has two successors, implying the block ends in a
+ // conditional branch.
MachineBasicBlock *PredMBB = *MBB->pred_begin();
- MachineBasicBlock::iterator CompBr = PredMBB->getLastNonDebugInstr();
- if (CompBr == PredMBB->end() || PredMBB->succ_size() != 2)
+ if (PredMBB->succ_size() != 2)
+ return false;
+
+ MachineBasicBlock::iterator CondBr = PredMBB->getLastNonDebugInstr();
+ if (CondBr == PredMBB->end())
return false;
- ++CompBr;
+ // Keep track of the earliest point in the PredMBB block where kill markers
+ // need to be removed if a COPY is removed.
+ MachineBasicBlock::iterator FirstUse;
+ // After calling knownRegValInBlock, FirstUse will either point to a CBZ/CBNZ
+ // or a compare (i.e., SUBS). In the latter case, we must take care when
+ // updating FirstUse when scanning for COPY instructions. In particular, if
+ // there's a COPY in between the compare and branch the COPY should not
+ // update FirstUse.
+ bool SeenFirstUse = false;
+ // Registers that contain a known value at the start of MBB.
+ SmallVector<RegImm, 4> KnownRegs;
+
+ MachineBasicBlock::iterator Itr = std::next(CondBr);
do {
- --CompBr;
- if (guaranteesZeroRegInBlock(*CompBr, MBB))
- break;
- } while (CompBr != PredMBB->begin() && CompBr->isTerminator());
+ --Itr;
- // We've not found a CBZ/CBNZ, time to bail out.
- if (!guaranteesZeroRegInBlock(*CompBr, MBB))
- return false;
+ Optional<RegImm> KnownRegImm = knownRegValInBlock(*Itr, MBB, FirstUse);
+ if (KnownRegImm == None)
+ continue;
- unsigned TargetReg = CompBr->getOperand(0).getReg();
- if (!TargetReg)
- return false;
- assert(TargetRegisterInfo::isPhysicalRegister(TargetReg) &&
- "Expect physical register");
+ KnownRegs.push_back(*KnownRegImm);
+
+ // Reset the clobber list, which is used by knownRegValInBlock.
+ ClobberedRegs.reset();
+
+ // Look backward in PredMBB for COPYs from the known reg to find other
+ // registers that are known to be a constant value.
+ for (auto PredI = Itr;; --PredI) {
+ if (FirstUse == PredI)
+ SeenFirstUse = true;
+
+ if (PredI->isCopy()) {
+ MCPhysReg CopyDstReg = PredI->getOperand(0).getReg();
+ MCPhysReg CopySrcReg = PredI->getOperand(1).getReg();
+ for (auto &KnownReg : KnownRegs) {
+ if (ClobberedRegs[KnownReg.Reg])
+ continue;
+ // If we have X = COPY Y, and Y is known to hold a constant, then X now
+ // holds the same constant.
+ if (CopySrcReg == KnownReg.Reg && !ClobberedRegs[CopyDstReg]) {
+ KnownRegs.push_back(RegImm(CopyDstReg, KnownReg.Imm));
+ if (SeenFirstUse)
+ FirstUse = PredI;
+ break;
+ }
+ // If we have X = COPY Y, and X is known to hold a constant, then Y now
+ // holds the same constant.
+ if (CopyDstReg == KnownReg.Reg && !ClobberedRegs[CopySrcReg]) {
+ KnownRegs.push_back(RegImm(CopySrcReg, KnownReg.Imm));
+ if (SeenFirstUse)
+ FirstUse = PredI;
+ break;
+ }
+ }
+ }
+
+ // Stop if we get to the beginning of PredMBB.
+ if (PredI == PredMBB->begin())
+ break;
+
+ trackRegDefs(*PredI, ClobberedRegs, TRI);
+ // Stop if all of the known regs have been clobbered.
+ if (all_of(KnownRegs, [&](RegImm KnownReg) {
+ return ClobberedRegs[KnownReg.Reg];
+ }))
+ break;
+ }
+ break;
+
+ } while (Itr != PredMBB->begin() && Itr->isTerminator());
- // Remember all registers aliasing with TargetReg.
- SmallSetVector<unsigned, 8> TargetRegs;
- for (MCRegAliasIterator AI(TargetReg, TRI, true); AI.isValid(); ++AI)
- TargetRegs.insert(*AI);
+ // We've not found a register with a known value; time to bail out.
+ if (KnownRegs.empty())
+ return false;
bool Changed = false;
+ // UsedKnownRegs is the set of KnownRegs that have had uses added to MBB.
+ SmallSetVector<unsigned, 4> UsedKnownRegs;
MachineBasicBlock::iterator LastChange = MBB->begin();
- unsigned SmallestDef = TargetReg;
- // Remove redundant Copy instructions unless TargetReg is modified.
+ // Remove redundant Copy instructions unless KnownReg is modified.
for (MachineBasicBlock::iterator I = MBB->begin(), E = MBB->end(); I != E;) {
MachineInstr *MI = &*I;
++I;
- if (MI->isCopy() && MI->getOperand(0).isReg() &&
- MI->getOperand(1).isReg()) {
-
- unsigned DefReg = MI->getOperand(0).getReg();
- unsigned SrcReg = MI->getOperand(1).getReg();
-
- if ((SrcReg == AArch64::XZR || SrcReg == AArch64::WZR) &&
- !MRI->isReserved(DefReg) &&
- (TargetReg == DefReg || TRI->isSuperRegister(DefReg, TargetReg))) {
- DEBUG(dbgs() << "Remove redundant Copy : ");
- DEBUG((MI)->print(dbgs()));
-
- MI->eraseFromParent();
- Changed = true;
- LastChange = I;
- NumCopiesRemoved++;
- SmallestDef =
- TRI->isSubRegister(SmallestDef, DefReg) ? DefReg : SmallestDef;
- continue;
+ bool RemovedMI = false;
+ bool IsCopy = MI->isCopy();
+ bool IsMoveImm = MI->isMoveImmediate();
+ if (IsCopy || IsMoveImm) {
+ MCPhysReg DefReg = MI->getOperand(0).getReg();
+ MCPhysReg SrcReg = IsCopy ? MI->getOperand(1).getReg() : 0;
+ int64_t SrcImm = IsMoveImm ? MI->getOperand(1).getImm() : 0;
+ if (!MRI->isReserved(DefReg) &&
+ ((IsCopy && (SrcReg == AArch64::XZR || SrcReg == AArch64::WZR)) ||
+ IsMoveImm)) {
+ for (RegImm &KnownReg : KnownRegs) {
+ if (KnownReg.Reg != DefReg &&
+ !TRI->isSuperRegister(DefReg, KnownReg.Reg))
+ continue;
+
+ // For a copy, the known value must be a zero.
+ if (IsCopy && KnownReg.Imm != 0)
+ continue;
+
+ if (IsMoveImm) {
+ // For a move immediate, the known immediate must match the source
+ // immediate.
+ if (KnownReg.Imm != SrcImm)
+ continue;
+
+ // Don't remove a move immediate that implicitly defines the upper
+ // bits when only the lower 32 bits are known.
+ MCPhysReg CmpReg = KnownReg.Reg;
+ if (any_of(MI->implicit_operands(), [CmpReg](MachineOperand &O) {
+ return !O.isDead() && O.isReg() && O.isDef() &&
+ O.getReg() != CmpReg;
+ }))
+ continue;
+ }
+
+ if (IsCopy)
+ DEBUG(dbgs() << "Remove redundant Copy : " << *MI);
+ else
+ DEBUG(dbgs() << "Remove redundant Move : " << *MI);
+
+ MI->eraseFromParent();
+ Changed = true;
+ LastChange = I;
+ NumCopiesRemoved++;
+ UsedKnownRegs.insert(KnownReg.Reg);
+ RemovedMI = true;
+ break;
+ }
}
}
- if (MI->modifiesRegister(TargetReg, TRI))
+ // Skip to the next instruction if we removed the COPY/MovImm.
+ if (RemovedMI)
+ continue;
+
+ // Remove any regs the MI clobbers from the KnownRegs set.
+ for (unsigned RI = 0; RI < KnownRegs.size();)
+ if (MI->modifiesRegister(KnownRegs[RI].Reg, TRI)) {
+ std::swap(KnownRegs[RI], KnownRegs[KnownRegs.size() - 1]);
+ KnownRegs.pop_back();
+ // Don't increment RI since we need to now check the swapped-in
+ // KnownRegs[RI].
+ } else {
+ ++RI;
+ }
+
+ // Continue until the KnownRegs set is empty.
+ if (KnownRegs.empty())
break;
}
if (!Changed)
return false;
- // Otherwise, we have to fixup the use-def chain, starting with the
- // CBZ/CBNZ. Conservatively mark as much as we can live.
- CompBr->clearRegisterKills(SmallestDef, TRI);
+ // Add newly used regs to the block's live-in list if they aren't there
+ // already.
+ for (MCPhysReg KnownReg : UsedKnownRegs)
+ if (!MBB->isLiveIn(KnownReg))
+ MBB->addLiveIn(KnownReg);
- if (none_of(TargetRegs, [&](unsigned Reg) { return MBB->isLiveIn(Reg); }))
- MBB->addLiveIn(TargetReg);
-
- // Clear any kills of TargetReg between CompBr and the last removed COPY.
+ // Clear kills in the range where changes were made. This is conservative,
+ // but should be okay since kill markers are being phased out.
+ DEBUG(dbgs() << "Clearing kill flags.\n\tFirstUse: " << *FirstUse
+ << "\tLastChange: " << *LastChange);
+ for (MachineInstr &MMI : make_range(FirstUse, PredMBB->end()))
+ MMI.clearKillInfo();
for (MachineInstr &MMI : make_range(MBB->begin(), LastChange))
- MMI.clearRegisterKills(SmallestDef, TRI);
+ MMI.clearKillInfo();
return true;
}
@@ -168,6 +391,11 @@ bool AArch64RedundantCopyElimination::runOnMachineFunction(
return false;
TRI = MF.getSubtarget().getRegisterInfo();
MRI = &MF.getRegInfo();
+
+ // Resize the clobber register bitfield tracker. We do this once per
+ // function and then clear the bitfield each time we optimize a copy.
+ ClobberedRegs.resize(TRI->getNumRegs());
+
bool Changed = false;
for (MachineBasicBlock &MBB : MF)
Changed |= optimizeCopy(&MBB);
diff --git a/lib/Target/AArch64/AArch64RegisterBankInfo.cpp b/lib/Target/AArch64/AArch64RegisterBankInfo.cpp
index b292c9c87dcd..20a5979f9b4b 100644
--- a/lib/Target/AArch64/AArch64RegisterBankInfo.cpp
+++ b/lib/Target/AArch64/AArch64RegisterBankInfo.cpp
@@ -1,4 +1,4 @@
-//===- AArch64RegisterBankInfo.cpp -------------------------------*- C++ -*-==//
+//===- AArch64RegisterBankInfo.cpp ----------------------------------------===//
//
// The LLVM Compiler Infrastructure
//
@@ -13,13 +13,24 @@
//===----------------------------------------------------------------------===//
#include "AArch64RegisterBankInfo.h"
-#include "AArch64InstrInfo.h" // For XXXRegClassID.
+#include "AArch64InstrInfo.h"
+#include "llvm/ADT/SmallVector.h"
#include "llvm/CodeGen/LowLevelType.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
#include "llvm/CodeGen/GlobalISel/RegisterBank.h"
#include "llvm/CodeGen/GlobalISel/RegisterBankInfo.h"
+#include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/CodeGen/MachineInstr.h"
+#include "llvm/CodeGen/MachineOperand.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Target/TargetOpcodes.h"
#include "llvm/Target/TargetRegisterInfo.h"
#include "llvm/Target/TargetSubtargetInfo.h"
+#include <algorithm>
+#include <cassert>
+
+#define GET_TARGET_REGBANK_IMPL
+#include "AArch64GenRegisterBank.inc"
// This file will be TableGen'ed at some point.
#include "AArch64GenRegisterBankInfo.def"
@@ -31,7 +42,7 @@ using namespace llvm;
#endif
AArch64RegisterBankInfo::AArch64RegisterBankInfo(const TargetRegisterInfo &TRI)
- : RegisterBankInfo(AArch64::RegBanks, AArch64::NumRegisterBanks) {
+ : AArch64GenRegisterBankInfo() {
static bool AlreadyInit = false;
// We have only one set of register banks, whatever the subtarget
// is. Therefore, the initialization of the RegBanks table should be
@@ -78,44 +89,21 @@ AArch64RegisterBankInfo::AArch64RegisterBankInfo(const TargetRegisterInfo &TRI)
 // Check that the TableGen-like file is in sync with our expectations.
// First, the Idx.
- assert(AArch64::PartialMappingIdx::PMI_GPR32 ==
- AArch64::PartialMappingIdx::PMI_FirstGPR &&
- "GPR32 index not first in the GPR list");
- assert(AArch64::PartialMappingIdx::PMI_GPR64 ==
- AArch64::PartialMappingIdx::PMI_LastGPR &&
- "GPR64 index not last in the GPR list");
- assert(AArch64::PartialMappingIdx::PMI_FirstGPR <=
- AArch64::PartialMappingIdx::PMI_LastGPR &&
- "GPR list is backward");
- assert(AArch64::PartialMappingIdx::PMI_FPR32 ==
- AArch64::PartialMappingIdx::PMI_FirstFPR &&
- "FPR32 index not first in the FPR list");
- assert(AArch64::PartialMappingIdx::PMI_FPR512 ==
- AArch64::PartialMappingIdx::PMI_LastFPR &&
- "FPR512 index not last in the FPR list");
- assert(AArch64::PartialMappingIdx::PMI_FirstFPR <=
- AArch64::PartialMappingIdx::PMI_LastFPR &&
- "FPR list is backward");
- assert(AArch64::PartialMappingIdx::PMI_FPR32 + 1 ==
- AArch64::PartialMappingIdx::PMI_FPR64 &&
- AArch64::PartialMappingIdx::PMI_FPR64 + 1 ==
- AArch64::PartialMappingIdx::PMI_FPR128 &&
- AArch64::PartialMappingIdx::PMI_FPR128 + 1 ==
- AArch64::PartialMappingIdx::PMI_FPR256 &&
- AArch64::PartialMappingIdx::PMI_FPR256 + 1 ==
- AArch64::PartialMappingIdx::PMI_FPR512 &&
- "FPR indices not properly ordered");
+ assert(checkPartialMappingIdx(PMI_FirstGPR, PMI_LastGPR,
+ {PMI_GPR32, PMI_GPR64}) &&
+ "PartialMappingIdx's are incorrectly ordered");
+ assert(checkPartialMappingIdx(
+ PMI_FirstFPR, PMI_LastFPR,
+ {PMI_FPR32, PMI_FPR64, PMI_FPR128, PMI_FPR256, PMI_FPR512}) &&
+ "PartialMappingIdx's are incorrectly ordered");
// Now, the content.
// Check partial mapping.
#define CHECK_PARTIALMAP(Idx, ValStartIdx, ValLength, RB) \
do { \
- const PartialMapping &Map = \
- AArch64::PartMappings[AArch64::PartialMappingIdx::Idx - \
- AArch64::PartialMappingIdx::PMI_Min]; \
- (void)Map; \
- assert(Map.StartIdx == ValStartIdx && Map.Length == ValLength && \
- Map.RegBank == &RB && #Idx " is incorrectly initialized"); \
- } while (0)
+ assert( \
+ checkPartialMap(PartialMappingIdx::Idx, ValStartIdx, ValLength, RB) && \
+ #Idx " is incorrectly initialized"); \
+ } while (false)
CHECK_PARTIALMAP(PMI_GPR32, 0, 32, RBGPR);
CHECK_PARTIALMAP(PMI_GPR64, 0, 64, RBGPR);
@@ -128,17 +116,11 @@ AArch64RegisterBankInfo::AArch64RegisterBankInfo(const TargetRegisterInfo &TRI)
// Check value mapping.
#define CHECK_VALUEMAP_IMPL(RBName, Size, Offset) \
do { \
- unsigned PartialMapBaseIdx = \
- AArch64::PartialMappingIdx::PMI_##RBName##Size - \
- AArch64::PartialMappingIdx::PMI_Min; \
- (void)PartialMapBaseIdx; \
- const ValueMapping &Map = AArch64::getValueMapping( \
- AArch64::PartialMappingIdx::PMI_First##RBName, Size)[Offset]; \
- (void)Map; \
- assert(Map.BreakDown == &AArch64::PartMappings[PartialMapBaseIdx] && \
- Map.NumBreakDowns == 1 && #RBName #Size \
- " " #Offset " is incorrectly initialized"); \
- } while (0)
+ assert(checkValueMapImpl(PartialMappingIdx::PMI_##RBName##Size, \
+ PartialMappingIdx::PMI_First##RBName, Size, \
+ Offset) && \
+ #RBName #Size " " #Offset " is incorrectly initialized"); \
+ } while (false)
#define CHECK_VALUEMAP(RBName, Size) CHECK_VALUEMAP_IMPL(RBName, Size, 0)
@@ -157,7 +139,7 @@ AArch64RegisterBankInfo::AArch64RegisterBankInfo(const TargetRegisterInfo &TRI)
CHECK_VALUEMAP_IMPL(RBName, Size, 0); \
CHECK_VALUEMAP_IMPL(RBName, Size, 1); \
CHECK_VALUEMAP_IMPL(RBName, Size, 2); \
- } while (0)
+ } while (false)
CHECK_VALUEMAP_3OPS(GPR, 32);
CHECK_VALUEMAP_3OPS(GPR, 64);
@@ -169,24 +151,23 @@ AArch64RegisterBankInfo::AArch64RegisterBankInfo(const TargetRegisterInfo &TRI)
#define CHECK_VALUEMAP_CROSSREGCPY(RBNameDst, RBNameSrc, Size) \
do { \
- unsigned PartialMapDstIdx = \
- AArch64::PMI_##RBNameDst##Size - AArch64::PMI_Min; \
- unsigned PartialMapSrcIdx = \
- AArch64::PMI_##RBNameSrc##Size - AArch64::PMI_Min; \
- (void) PartialMapDstIdx; \
- (void) PartialMapSrcIdx; \
- const ValueMapping *Map = AArch64::getCopyMapping( \
- AArch64::PMI_First##RBNameDst == AArch64::PMI_FirstGPR, \
- AArch64::PMI_First##RBNameSrc == AArch64::PMI_FirstGPR, Size); \
- (void) Map; \
- assert(Map[0].BreakDown == &AArch64::PartMappings[PartialMapDstIdx] && \
+ unsigned PartialMapDstIdx = PMI_##RBNameDst##Size - PMI_Min; \
+ unsigned PartialMapSrcIdx = PMI_##RBNameSrc##Size - PMI_Min; \
+ (void)PartialMapDstIdx; \
+ (void)PartialMapSrcIdx; \
+ const ValueMapping *Map = getCopyMapping( \
+ AArch64::RBNameDst##RegBankID, AArch64::RBNameSrc##RegBankID, Size); \
+ (void)Map; \
+ assert(Map[0].BreakDown == \
+ &AArch64GenRegisterBankInfo::PartMappings[PartialMapDstIdx] && \
Map[0].NumBreakDowns == 1 && #RBNameDst #Size \
" Dst is incorrectly initialized"); \
- assert(Map[1].BreakDown == &AArch64::PartMappings[PartialMapSrcIdx] && \
+ assert(Map[1].BreakDown == \
+ &AArch64GenRegisterBankInfo::PartMappings[PartialMapSrcIdx] && \
Map[1].NumBreakDowns == 1 && #RBNameSrc #Size \
" Src is incorrectly initialized"); \
\
- } while (0)
+ } while (false)
CHECK_VALUEMAP_CROSSREGCPY(GPR, GPR, 32);
CHECK_VALUEMAP_CROSSREGCPY(GPR, FPR, 32);
@@ -280,12 +261,10 @@ AArch64RegisterBankInfo::getInstrAlternativeMappings(
break;
InstructionMappings AltMappings;
InstructionMapping GPRMapping(
- /*ID*/ 1, /*Cost*/ 1,
- AArch64::getValueMapping(AArch64::PMI_FirstGPR, Size),
+ /*ID*/ 1, /*Cost*/ 1, getValueMapping(PMI_FirstGPR, Size),
/*NumOperands*/ 3);
InstructionMapping FPRMapping(
- /*ID*/ 2, /*Cost*/ 1,
- AArch64::getValueMapping(AArch64::PMI_FirstFPR, Size),
+ /*ID*/ 2, /*Cost*/ 1, getValueMapping(PMI_FirstFPR, Size),
/*NumOperands*/ 3);
AltMappings.emplace_back(std::move(GPRMapping));
@@ -305,21 +284,21 @@ AArch64RegisterBankInfo::getInstrAlternativeMappings(
InstructionMappings AltMappings;
InstructionMapping GPRMapping(
/*ID*/ 1, /*Cost*/ 1,
- AArch64::getCopyMapping(/*DstIsGPR*/ true, /*SrcIsGPR*/ true, Size),
+ getCopyMapping(AArch64::GPRRegBankID, AArch64::GPRRegBankID, Size),
/*NumOperands*/ 2);
InstructionMapping FPRMapping(
/*ID*/ 2, /*Cost*/ 1,
- AArch64::getCopyMapping(/*DstIsGPR*/ false, /*SrcIsGPR*/ false, Size),
+ getCopyMapping(AArch64::FPRRegBankID, AArch64::FPRRegBankID, Size),
/*NumOperands*/ 2);
InstructionMapping GPRToFPRMapping(
/*ID*/ 3,
/*Cost*/ copyCost(AArch64::GPRRegBank, AArch64::FPRRegBank, Size),
- AArch64::getCopyMapping(/*DstIsGPR*/ false, /*SrcIsGPR*/ true, Size),
+ getCopyMapping(AArch64::FPRRegBankID, AArch64::GPRRegBankID, Size),
/*NumOperands*/ 2);
InstructionMapping FPRToGPRMapping(
/*ID*/ 3,
/*Cost*/ copyCost(AArch64::GPRRegBank, AArch64::FPRRegBank, Size),
- AArch64::getCopyMapping(/*DstIsGPR*/ true, /*SrcIsGPR*/ false, Size),
+ getCopyMapping(AArch64::GPRRegBankID, AArch64::FPRRegBankID, Size),
/*NumOperands*/ 2);
AltMappings.emplace_back(std::move(GPRMapping));
@@ -341,17 +320,15 @@ AArch64RegisterBankInfo::getInstrAlternativeMappings(
InstructionMappings AltMappings;
InstructionMapping GPRMapping(
/*ID*/ 1, /*Cost*/ 1,
- getOperandsMapping(
- {AArch64::getValueMapping(AArch64::PMI_FirstGPR, Size),
- // Addresses are GPR 64-bit.
- AArch64::getValueMapping(AArch64::PMI_FirstGPR, 64)}),
+ getOperandsMapping({getValueMapping(PMI_FirstGPR, Size),
+ // Addresses are GPR 64-bit.
+ getValueMapping(PMI_FirstGPR, 64)}),
/*NumOperands*/ 2);
InstructionMapping FPRMapping(
/*ID*/ 2, /*Cost*/ 1,
- getOperandsMapping(
- {AArch64::getValueMapping(AArch64::PMI_FirstFPR, Size),
- // Addresses are GPR 64-bit.
- AArch64::getValueMapping(AArch64::PMI_FirstGPR, 64)}),
+ getOperandsMapping({getValueMapping(PMI_FirstFPR, Size),
+ // Addresses are GPR 64-bit.
+ getValueMapping(PMI_FirstGPR, 64)}),
/*NumOperands*/ 2);
AltMappings.emplace_back(std::move(GPRMapping));
@@ -369,13 +346,12 @@ void AArch64RegisterBankInfo::applyMappingImpl(
switch (OpdMapper.getMI().getOpcode()) {
case TargetOpcode::G_OR:
case TargetOpcode::G_BITCAST:
- case TargetOpcode::G_LOAD: {
+ case TargetOpcode::G_LOAD:
// Those ID must match getInstrAlternativeMappings.
assert((OpdMapper.getInstrMapping().getID() >= 1 &&
OpdMapper.getInstrMapping().getID() <= 4) &&
"Don't know how to handle that ID");
return applyDefaultMapping(OpdMapper);
- }
default:
llvm_unreachable("Don't know how to handle that operation");
}
@@ -411,6 +387,8 @@ AArch64RegisterBankInfo::getSameKindOfOperandsMapping(const MachineInstr &MI) {
unsigned Size = Ty.getSizeInBits();
bool IsFPR = Ty.isVector() || isPreISelGenericFloatingPointOpcode(Opc);
+ PartialMappingIdx RBIdx = IsFPR ? PMI_FirstFPR : PMI_FirstGPR;
+
#ifndef NDEBUG
// Make sure all the operands are using similar size and type.
// Should probably be checked by the machine verifier.
@@ -422,20 +400,19 @@ AArch64RegisterBankInfo::getSameKindOfOperandsMapping(const MachineInstr &MI) {
// for each types.
for (unsigned Idx = 1; Idx != NumOperands; ++Idx) {
LLT OpTy = MRI.getType(MI.getOperand(Idx).getReg());
- assert(AArch64::getRegBankBaseIdxOffset(OpTy.getSizeInBits()) ==
- AArch64::getRegBankBaseIdxOffset(Size) &&
- "Operand has incompatible size");
+ assert(
+ AArch64GenRegisterBankInfo::getRegBankBaseIdxOffset(
+ RBIdx, OpTy.getSizeInBits()) ==
+ AArch64GenRegisterBankInfo::getRegBankBaseIdxOffset(RBIdx, Size) &&
+ "Operand has incompatible size");
bool OpIsFPR = OpTy.isVector() || isPreISelGenericFloatingPointOpcode(Opc);
(void)OpIsFPR;
assert(IsFPR == OpIsFPR && "Operand has incompatible type");
}
#endif // End NDEBUG.
- AArch64::PartialMappingIdx RBIdx =
- IsFPR ? AArch64::PMI_FirstFPR : AArch64::PMI_FirstGPR;
-
- return InstructionMapping{DefaultMappingID, 1,
- AArch64::getValueMapping(RBIdx, Size), NumOperands};
+ return InstructionMapping{DefaultMappingID, 1, getValueMapping(RBIdx, Size),
+ NumOperands};
}
RegisterBankInfo::InstructionMapping
@@ -485,9 +462,10 @@ AArch64RegisterBankInfo::getInstrMapping(const MachineInstr &MI) const {
DstIsGPR ? AArch64::GPRRegBank : AArch64::FPRRegBank;
const RegisterBank &SrcRB =
SrcIsGPR ? AArch64::GPRRegBank : AArch64::FPRRegBank;
- return InstructionMapping{DefaultMappingID, copyCost(DstRB, SrcRB, Size),
- AArch64::getCopyMapping(DstIsGPR, SrcIsGPR, Size),
- /*NumOperands*/ 2};
+ return InstructionMapping{
+ DefaultMappingID, copyCost(DstRB, SrcRB, Size),
+ getCopyMapping(DstRB.getID(), SrcRB.getID(), Size),
+ /*NumOperands*/ 2};
}
case TargetOpcode::G_SEQUENCE:
// FIXME: support this, but the generic code is really not going to do
@@ -501,7 +479,7 @@ AArch64RegisterBankInfo::getInstrMapping(const MachineInstr &MI) const {
// Track the size and bank of each register. We don't do partial mappings.
SmallVector<unsigned, 4> OpSize(NumOperands);
- SmallVector<AArch64::PartialMappingIdx, 4> OpRegBankIdx(NumOperands);
+ SmallVector<PartialMappingIdx, 4> OpRegBankIdx(NumOperands);
for (unsigned Idx = 0; Idx < NumOperands; ++Idx) {
auto &MO = MI.getOperand(Idx);
if (!MO.isReg())
@@ -513,9 +491,9 @@ AArch64RegisterBankInfo::getInstrMapping(const MachineInstr &MI) const {
// As a top-level guess, vectors go in FPRs, scalars and pointers in GPRs.
// For floating-point instructions, scalars go in FPRs.
if (Ty.isVector() || isPreISelGenericFloatingPointOpcode(Opc))
- OpRegBankIdx[Idx] = AArch64::PMI_FirstFPR;
+ OpRegBankIdx[Idx] = PMI_FirstFPR;
else
- OpRegBankIdx[Idx] = AArch64::PMI_FirstGPR;
+ OpRegBankIdx[Idx] = PMI_FirstGPR;
}
unsigned Cost = 1;
@@ -523,49 +501,50 @@ AArch64RegisterBankInfo::getInstrMapping(const MachineInstr &MI) const {
// fine-tune the computed mapping.
switch (Opc) {
case TargetOpcode::G_SITOFP:
- case TargetOpcode::G_UITOFP: {
- OpRegBankIdx = {AArch64::PMI_FirstFPR, AArch64::PMI_FirstGPR};
+ case TargetOpcode::G_UITOFP:
+ OpRegBankIdx = {PMI_FirstFPR, PMI_FirstGPR};
break;
- }
case TargetOpcode::G_FPTOSI:
- case TargetOpcode::G_FPTOUI: {
- OpRegBankIdx = {AArch64::PMI_FirstGPR, AArch64::PMI_FirstFPR};
+ case TargetOpcode::G_FPTOUI:
+ OpRegBankIdx = {PMI_FirstGPR, PMI_FirstFPR};
break;
- }
- case TargetOpcode::G_FCMP: {
- OpRegBankIdx = {AArch64::PMI_FirstGPR,
- /* Predicate */ AArch64::PMI_None, AArch64::PMI_FirstFPR,
- AArch64::PMI_FirstFPR};
+ case TargetOpcode::G_FCMP:
+ OpRegBankIdx = {PMI_FirstGPR,
+ /* Predicate */ PMI_None, PMI_FirstFPR, PMI_FirstFPR};
break;
- }
- case TargetOpcode::G_BITCAST: {
+ case TargetOpcode::G_BITCAST:
// This is going to be a cross register bank copy and this is expensive.
if (OpRegBankIdx[0] != OpRegBankIdx[1])
- Cost =
- copyCost(*AArch64::PartMappings[OpRegBankIdx[0]].RegBank,
- *AArch64::PartMappings[OpRegBankIdx[1]].RegBank, OpSize[0]);
+ Cost = copyCost(
+ *AArch64GenRegisterBankInfo::PartMappings[OpRegBankIdx[0]].RegBank,
+ *AArch64GenRegisterBankInfo::PartMappings[OpRegBankIdx[1]].RegBank,
+ OpSize[0]);
break;
- }
- case TargetOpcode::G_LOAD: {
+ case TargetOpcode::G_LOAD:
// Loading in vector unit is slightly more expensive.
// This is actually only true for the LD1R and co instructions,
// but anyway for the fast mode this number does not matter and
// for the greedy mode the cost of the cross bank copy will
// offset this number.
// FIXME: Should be derived from the scheduling model.
- if (OpRegBankIdx[0] >= AArch64::PMI_FirstFPR)
+ if (OpRegBankIdx[0] >= PMI_FirstFPR)
Cost = 2;
- }
+ break;
}
// Finally construct the computed mapping.
RegisterBankInfo::InstructionMapping Mapping =
InstructionMapping{DefaultMappingID, Cost, nullptr, NumOperands};
SmallVector<const ValueMapping *, 8> OpdsMapping(NumOperands);
- for (unsigned Idx = 0; Idx < NumOperands; ++Idx)
- if (MI.getOperand(Idx).isReg())
- OpdsMapping[Idx] =
- AArch64::getValueMapping(OpRegBankIdx[Idx], OpSize[Idx]);
+ for (unsigned Idx = 0; Idx < NumOperands; ++Idx) {
+ if (MI.getOperand(Idx).isReg()) {
+ auto Mapping = getValueMapping(OpRegBankIdx[Idx], OpSize[Idx]);
+ if (!Mapping->isValid())
+ return InstructionMapping();
+
+ OpdsMapping[Idx] = Mapping;
+ }
+ }
Mapping.setOperandsMapping(getOperandsMapping(OpdsMapping));
return Mapping;
diff --git a/lib/Target/AArch64/AArch64RegisterBankInfo.h b/lib/Target/AArch64/AArch64RegisterBankInfo.h
index f763235049d4..0a795a42c0b1 100644
--- a/lib/Target/AArch64/AArch64RegisterBankInfo.h
+++ b/lib/Target/AArch64/AArch64RegisterBankInfo.h
@@ -16,25 +16,78 @@
#include "llvm/CodeGen/GlobalISel/RegisterBankInfo.h"
+#define GET_REGBANK_DECLARATIONS
+#include "AArch64GenRegisterBank.inc"
+
namespace llvm {
class TargetRegisterInfo;
-namespace AArch64 {
-enum {
- GPRRegBankID = 0, /// General Purpose Registers: W, X.
- FPRRegBankID = 1, /// Floating Point/Vector Registers: B, H, S, D, Q.
- CCRRegBankID = 2, /// Conditional register: NZCV.
- NumRegisterBanks
-};
+class AArch64GenRegisterBankInfo : public RegisterBankInfo {
+protected:
+
+ enum PartialMappingIdx {
+ PMI_None = -1,
+ PMI_FPR32 = 1,
+ PMI_FPR64,
+ PMI_FPR128,
+ PMI_FPR256,
+ PMI_FPR512,
+ PMI_GPR32,
+ PMI_GPR64,
+ PMI_FirstGPR = PMI_GPR32,
+ PMI_LastGPR = PMI_GPR64,
+ PMI_FirstFPR = PMI_FPR32,
+ PMI_LastFPR = PMI_FPR512,
+ PMI_Min = PMI_FirstFPR,
+ };
+
+ static RegisterBankInfo::PartialMapping PartMappings[];
+ static RegisterBankInfo::ValueMapping ValMappings[];
+ static PartialMappingIdx BankIDToCopyMapIdx[];
+
+ enum ValueMappingIdx {
+ InvalidIdx = 0,
+ First3OpsIdx = 1,
+ Last3OpsIdx = 19,
+ DistanceBetweenRegBanks = 3,
+ FirstCrossRegCpyIdx = 22,
+ LastCrossRegCpyIdx = 34,
+ DistanceBetweenCrossRegCpy = 2
+ };
+
+ static bool checkPartialMap(unsigned Idx, unsigned ValStartIdx,
+ unsigned ValLength, const RegisterBank &RB);
+ static bool checkValueMapImpl(unsigned Idx, unsigned FirstInBank,
+ unsigned Size, unsigned Offset);
+ static bool checkPartialMappingIdx(PartialMappingIdx FirstAlias,
+ PartialMappingIdx LastAlias,
+ ArrayRef<PartialMappingIdx> Order);
-extern RegisterBank GPRRegBank;
-extern RegisterBank FPRRegBank;
-extern RegisterBank CCRRegBank;
-} // End AArch64 namespace.
+ static unsigned getRegBankBaseIdxOffset(unsigned RBIdx, unsigned Size);
+
+ /// Get the pointer to the ValueMapping representing the RegisterBank
+ /// at \p RBIdx with a size of \p Size.
+ ///
+ /// The returned mapping works for instructions with the same kind of
+ /// operands for up to 3 operands.
+ ///
+ /// \pre \p RBIdx != PartialMappingIdx::None
+ static const RegisterBankInfo::ValueMapping *
+ getValueMapping(PartialMappingIdx RBIdx, unsigned Size);
+
+ /// Get the pointer to the ValueMapping of the operands of a copy
+ /// instruction from the \p SrcBankID register bank to the \p DstBankID
+ /// register bank with a size of \p Size.
+ static const RegisterBankInfo::ValueMapping *
+ getCopyMapping(unsigned DstBankID, unsigned SrcBankID, unsigned Size);
+
+#define GET_TARGET_REGBANK_CLASS
+#include "AArch64GenRegisterBank.inc"
+};
/// This class provides the information for the target register banks.
-class AArch64RegisterBankInfo final : public RegisterBankInfo {
+class AArch64RegisterBankInfo final : public AArch64GenRegisterBankInfo {
/// See RegisterBankInfo::applyMapping.
void applyMappingImpl(const OperandsMapper &OpdMapper) const override;
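For concreteness, a minimal sketch of how the new helper is meant to be used from within AArch64RegisterBankInfo, assembled from the doc comment above and the getInstrMapping changes earlier in this patch (all identifiers come from the patch; this is an illustrative sketch, not the committed code):

    // Fetch the shared mapping for a 32-bit value in the GPR bank. Per the
    // doc comment, one ValueMapping covers up to 3 operands of the same kind.
    const RegisterBankInfo::ValueMapping *VM =
        getValueMapping(PMI_FirstGPR, /*Size=*/32);
    if (VM->isValid()) {
      OpdsMapping[0] = VM; // reuse the same mapping for each register operand
      OpdsMapping[1] = VM;
    }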
diff --git a/lib/Target/AArch64/AArch64RegisterBanks.td b/lib/Target/AArch64/AArch64RegisterBanks.td
new file mode 100644
index 000000000000..c2b6c0b04e9b
--- /dev/null
+++ b/lib/Target/AArch64/AArch64RegisterBanks.td
@@ -0,0 +1,20 @@
+//=- AArch64RegisterBanks.td - Describe the AArch64 Banks ----*- tablegen -*-=//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file describes the register banks available on AArch64.
+//
+//===----------------------------------------------------------------------===//
+
+/// General Purpose Registers: W, X.
+def GPRRegBank : RegisterBank<"GPR", [GPR64all]>;
+
+/// Floating Point/Vector Registers: B, H, S, D, Q.
+def FPRRegBank : RegisterBank<"FPR", [QQQQ]>;
+
+/// Conditional register: NZCV.
+def CCRRegBank : RegisterBank<"CCR", [CCR]>;
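For readers new to the RegisterBank TableGen class: each def gives the bank a name and a list of root register classes, and the bank covers those classes together with (roughly) all of their subclasses, which is why the single large classes GPR64all, QQQQ and CCR suffice above. A hypothetical additional bank would follow the same shape:

    /// Hypothetical example only -- not part of this patch.
    def ExampleRegBank : RegisterBank<"EXAMPLE", [GPR32, GPR64]>;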
diff --git a/lib/Target/AArch64/AArch64RegisterInfo.cpp b/lib/Target/AArch64/AArch64RegisterInfo.cpp
index 98fad71aa18a..baf15ac540cf 100644
--- a/lib/Target/AArch64/AArch64RegisterInfo.cpp
+++ b/lib/Target/AArch64/AArch64RegisterInfo.cpp
@@ -118,25 +118,17 @@ AArch64RegisterInfo::getReservedRegs(const MachineFunction &MF) const {
// FIXME: avoid re-calculating this every time.
BitVector Reserved(getNumRegs());
- markSuperRegs(Reserved, AArch64::SP);
- markSuperRegs(Reserved, AArch64::XZR);
markSuperRegs(Reserved, AArch64::WSP);
markSuperRegs(Reserved, AArch64::WZR);
- if (TFI->hasFP(MF) || TT.isOSDarwin()) {
- markSuperRegs(Reserved, AArch64::FP);
+ if (TFI->hasFP(MF) || TT.isOSDarwin())
markSuperRegs(Reserved, AArch64::W29);
- }
- if (MF.getSubtarget<AArch64Subtarget>().isX18Reserved()) {
- markSuperRegs(Reserved, AArch64::X18); // Platform register
- markSuperRegs(Reserved, AArch64::W18);
- }
+ if (MF.getSubtarget<AArch64Subtarget>().isX18Reserved())
+ markSuperRegs(Reserved, AArch64::W18); // Platform register
- if (hasBasePointer(MF)) {
- markSuperRegs(Reserved, AArch64::X19);
+ if (hasBasePointer(MF))
markSuperRegs(Reserved, AArch64::W19);
- }
assert(checkAllSuperRegsMarked(Reserved));
return Reserved;
diff --git a/lib/Target/AArch64/AArch64SchedA53.td b/lib/Target/AArch64/AArch64SchedA53.td
index 93ca079275c8..18d000ace94c 100644
--- a/lib/Target/AArch64/AArch64SchedA53.td
+++ b/lib/Target/AArch64/AArch64SchedA53.td
@@ -13,7 +13,7 @@
// ===---------------------------------------------------------------------===//
// The following definitions describe the simpler per-operand machine model.
-// This works with MachineScheduler. See MCSchedModel.h for details.
+// This works with MachineScheduler. See MCSchedule.h for details.
// Cortex-A53 machine model for scheduling and other instruction cost heuristics.
def CortexA53Model : SchedMachineModel {
diff --git a/lib/Target/AArch64/AArch64SchedA57.td b/lib/Target/AArch64/AArch64SchedA57.td
index 99c48d0146e4..303398ea0b7f 100644
--- a/lib/Target/AArch64/AArch64SchedA57.td
+++ b/lib/Target/AArch64/AArch64SchedA57.td
@@ -162,7 +162,9 @@ def : InstRW<[A57Write_2cyc_1M], (instregex "BFM")>;
// Cryptography Extensions
// -----------------------------------------------------------------------------
-def : InstRW<[A57Write_3cyc_1W], (instregex "^AES")>;
+def A57ReadAES : SchedReadAdvance<3, [A57Write_3cyc_1W]>;
+def : InstRW<[A57Write_3cyc_1W], (instregex "^AES[DE]")>;
+def : InstRW<[A57Write_3cyc_1W, A57ReadAES], (instregex "^AESI?MC")>;
def : InstRW<[A57Write_6cyc_2V], (instregex "^SHA1SU0")>;
def : InstRW<[A57Write_3cyc_1W], (instregex "^SHA1(H|SU1)")>;
def : InstRW<[A57Write_6cyc_2W], (instregex "^SHA1[CMP]")>;
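A worked reading of this A57 hunk: A57ReadAES advances the read of any operand produced by an A57Write_3cyc_1W write by 3 cycles, so an AESMC/AESIMC consuming the result of a preceding AESD/AESE sees an effective latency of 3 - 3 = 0 cycles, modeling the back-to-back issue of the fused AES pair.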
diff --git a/lib/Target/AArch64/AArch64SchedFalkor.td b/lib/Target/AArch64/AArch64SchedFalkor.td
index 19a6d6f2a1ad..eec089087fe0 100644
--- a/lib/Target/AArch64/AArch64SchedFalkor.td
+++ b/lib/Target/AArch64/AArch64SchedFalkor.td
@@ -17,10 +17,112 @@
// instruction cost model.
def FalkorModel : SchedMachineModel {
- let IssueWidth = 4; // 4-wide issue for expanded uops.
+ let IssueWidth = 8; // 8 uops are dispatched per cycle.
let MicroOpBufferSize = 128; // Out-of-order with temporary unified issue buffer.
let LoopMicroOpBufferSize = 16;
let LoadLatency = 3; // Optimistic load latency.
let MispredictPenalty = 11; // Minimum branch misprediction penalty.
- let CompleteModel = 0;
+ let CompleteModel = 1;
+}
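Note the effect of flipping CompleteModel on: the scheduling-model verifier now requires every AArch64 instruction to have scheduling information under this model, coverage that the per-instruction InstRW overrides in the Falkor details file included at the bottom of this file are there to provide.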
+
+//===----------------------------------------------------------------------===//
+// Define each kind of processor resource and number available on Falkor.
+
+let SchedModel = FalkorModel in {
+
+ def FalkorUnitB : ProcResource<1>; // Branch
+ def FalkorUnitLD : ProcResource<1>; // Load pipe
+ def FalkorUnitSD : ProcResource<1>; // Store data
+ def FalkorUnitST : ProcResource<1>; // Store pipe
+ def FalkorUnitX : ProcResource<1>; // Complex arithmetic
+ def FalkorUnitY : ProcResource<1>; // Simple arithmetic
+ def FalkorUnitZ : ProcResource<1>; // Simple arithmetic
+
+ def FalkorUnitVSD : ProcResource<1>; // Vector store data
+ def FalkorUnitVX : ProcResource<1>; // Vector X-pipe
+ def FalkorUnitVY : ProcResource<1>; // Vector Y-pipe
+
+ def FalkorUnitGTOV : ProcResource<1>; // Scalar to Vector
+ def FalkorUnitVTOG : ProcResource<1>; // Vector to Scalar
+
+ // Define the resource groups.
+ def FalkorUnitXY : ProcResGroup<[FalkorUnitX, FalkorUnitY]>;
+ def FalkorUnitXYZ : ProcResGroup<[FalkorUnitX, FalkorUnitY, FalkorUnitZ]>;
+ def FalkorUnitXYZB : ProcResGroup<[FalkorUnitX, FalkorUnitY, FalkorUnitZ,
+ FalkorUnitB]>;
+ def FalkorUnitZB : ProcResGroup<[FalkorUnitZ, FalkorUnitB]>;
+ def FalkorUnitVXVY : ProcResGroup<[FalkorUnitVX, FalkorUnitVY]>;
+
+}
+
+//===----------------------------------------------------------------------===//
+// Map the target-defined scheduler read/write resources and latency for
+// Falkor.
+
+let SchedModel = FalkorModel in {
+
+def : WriteRes<WriteImm, [FalkorUnitXYZ]> { let Latency = 1; }
+def : WriteRes<WriteI, [FalkorUnitXYZ]> { let Latency = 1; }
+def : WriteRes<WriteISReg, [FalkorUnitVXVY, FalkorUnitVXVY]>
+ { let Latency = 1; let NumMicroOps = 2; }
+def : WriteRes<WriteIEReg, [FalkorUnitXYZ, FalkorUnitXYZ]>
+ { let Latency = 2; let NumMicroOps = 2; }
+def : WriteRes<WriteExtr, [FalkorUnitXYZ, FalkorUnitXYZ]>
+ { let Latency = 2; let NumMicroOps = 2; }
+def : WriteRes<WriteIS, [FalkorUnitXYZ]> { let Latency = 1; }
+def : WriteRes<WriteID32, [FalkorUnitX, FalkorUnitZ]>
+ { let Latency = 8; let NumMicroOps = 2; }
+def : WriteRes<WriteID64, [FalkorUnitX, FalkorUnitZ]>
+ { let Latency = 16; let NumMicroOps = 2; }
+def : WriteRes<WriteIM32, [FalkorUnitX]> { let Latency = 4; }
+def : WriteRes<WriteIM64, [FalkorUnitX]> { let Latency = 5; }
+def : WriteRes<WriteBr, [FalkorUnitB]> { let Latency = 1; }
+def : WriteRes<WriteBrReg, [FalkorUnitB]> { let Latency = 1; }
+def : WriteRes<WriteLD, [FalkorUnitLD]> { let Latency = 3; }
+def : WriteRes<WriteST, [FalkorUnitLD, FalkorUnitST, FalkorUnitSD]>
+ { let Latency = 3; let NumMicroOps = 3; }
+def : WriteRes<WriteSTP, [FalkorUnitST, FalkorUnitSD]>
+ { let Latency = 0; let NumMicroOps = 2; }
+def : WriteRes<WriteAdr, [FalkorUnitXYZ]> { let Latency = 5; }
+def : WriteRes<WriteLDIdx, [FalkorUnitLD]> { let Latency = 5; }
+def : WriteRes<WriteSTIdx, [FalkorUnitLD, FalkorUnitST, FalkorUnitSD]>
+ { let Latency = 4; let NumMicroOps = 3; }
+def : WriteRes<WriteF, [FalkorUnitVXVY, FalkorUnitVXVY]>
+ { let Latency = 3; let NumMicroOps = 2; }
+def : WriteRes<WriteFCmp, [FalkorUnitVXVY]> { let Latency = 2; }
+def : WriteRes<WriteFCvt, [FalkorUnitVXVY]> { let Latency = 4; }
+def : WriteRes<WriteFCopy, [FalkorUnitVXVY]> { let Latency = 4; }
+def : WriteRes<WriteFImm, [FalkorUnitVXVY]> { let Latency = 4; }
+def : WriteRes<WriteFMul, [FalkorUnitVXVY, FalkorUnitVXVY]>
+ { let Latency = 6; let NumMicroOps = 2; }
+def : WriteRes<WriteFDiv, [FalkorUnitVXVY, FalkorUnitVXVY]>
+  { let Latency = 12; let NumMicroOps = 2; } // Fragment -1 / NoRSV +1
+def : WriteRes<WriteV, [FalkorUnitVXVY]> { let Latency = 6; }
+def : WriteRes<WriteVLD, [FalkorUnitLD]> { let Latency = 3; }
+def : WriteRes<WriteVST, [FalkorUnitST, FalkorUnitVSD]>
+ { let Latency = 0; let NumMicroOps = 2; }
+
+def : WriteRes<WriteSys, []> { let Latency = 1; }
+def : WriteRes<WriteBarrier, []> { let Latency = 1; }
+def : WriteRes<WriteHint, []> { let Latency = 1; }
+
+def : WriteRes<WriteLDHi, []> { let Latency = 3; }
+
+def : WriteRes<WriteAtomic, []> { let Unsupported = 1; }
+
+// No forwarding logic is modeled yet.
+def : ReadAdvance<ReadI, 0>;
+def : ReadAdvance<ReadISReg, 0>;
+def : ReadAdvance<ReadIEReg, 0>;
+def : ReadAdvance<ReadIM, 0>;
+def : ReadAdvance<ReadIMA, 0>;
+def : ReadAdvance<ReadID, 0>;
+def : ReadAdvance<ReadExtrHi, 0>;
+def : ReadAdvance<ReadAdrBase, 0>;
+def : ReadAdvance<ReadVLD, 0>;
+
+// Detailed Refinements
+// -----------------------------------------------------------------------------
+include "AArch64SchedFalkorDetails.td"
+
}
diff --git a/lib/Target/AArch64/AArch64SchedFalkorDetails.td b/lib/Target/AArch64/AArch64SchedFalkorDetails.td
new file mode 100644
index 000000000000..6bce4ef6b652
--- /dev/null
+++ b/lib/Target/AArch64/AArch64SchedFalkorDetails.td
@@ -0,0 +1,523 @@
+//==- AArch64SchedFalkorDetails.td - Falkor Scheduling Defs -*- tablegen -*-==//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file defines the uop and latency details for the machine model for the
+// Qualcomm Falkor subtarget.
+//
+//===----------------------------------------------------------------------===//
+
+include "AArch64SchedFalkorWriteRes.td"
+
+//===----------------------------------------------------------------------===//
+// Specialize the coarse model by associating instruction groups with the
+// subtarget-defined types. As the model is refined, this will override most
+// of the earlier mappings.
+
+// Miscellaneous
+// -----------------------------------------------------------------------------
+
+def : InstRW<[WriteI], (instrs COPY)>;
+
+// SIMD Floating-point Instructions
+// -----------------------------------------------------------------------------
+def : InstRW<[FalkorWr_1VXVY_1cyc], (instregex "^(FABS|FNEG)(v2f32|v4f16)$")>;
+
+def : InstRW<[FalkorWr_1VXVY_2cyc], (instregex "^(F(MAX|MIN)(NM)?P?|FAC(GE|GT))(v2f32|v4f16|v2i16p|v2i32p)$")>;
+def : InstRW<[FalkorWr_1VXVY_2cyc], (instregex "^FAC(GE|GT)(16|32|64)$")>;
+def : InstRW<[FalkorWr_1VXVY_2cyc], (instregex "^FCM(EQ|GE|GT)(16|32|64|v2f32|v4f16|v2i32|v4i16)$")>;
+def : InstRW<[FalkorWr_1VXVY_2cyc], (instregex "^FCM(EQ|LE|GE|GT|LT)(v1i16|v1i32|v1i64|v2i32|v4i16)rz$")>;
+def : InstRW<[FalkorWr_1VXVY_2cyc], (instregex "^FRINT(A|I|M|N|P|X|Z)(v2f32|v4f16)$")>;
+
+def : InstRW<[FalkorWr_1VXVY_3cyc], (instregex "^F(MAX|MIN)(NM)?V(v4i16|v4i32|v8i16)v$")>;
+def : InstRW<[FalkorWr_1VXVY_3cyc], (instregex "^(FABD|FADD|FSUB)(v2f32|v4f16)$")>;
+def : InstRW<[FalkorWr_1VXVY_3cyc], (instregex "^FADDP(v2i16p|v2i32p|v2i64p|v2f32|v4f16)$")>;
+
+def : InstRW<[FalkorWr_1VXVY_4cyc], (instregex "^FCVT(N|M|P|Z|A)(S|U)(v1i32|v1i64|v1f16|v2f32|v4f16)$")>;
+def : InstRW<[FalkorWr_1VXVY_4cyc], (instrs FCVTXNv1i64)>;
+def : InstRW<[FalkorWr_1VXVY_4cyc], (instregex "^FCVTZ(S|U)(v2i32|v4i16)(_shift)?$")>;
+
+def : InstRW<[FalkorWr_1VXVY_5cyc], (instregex "^(FMUL|FMULX)(v2f32|v4f16|(v1i16_indexed|v4i16_indexed|v1i32_indexed|v2i32_indexed))$")>;
+def : InstRW<[FalkorWr_1VXVY_5cyc], (instrs FMULX16, FMULX32)>;
+
+def : InstRW<[FalkorWr_1VXVY_6cyc], (instregex "^(FMUL|FMULX)v1i64_indexed$")>;
+def : InstRW<[FalkorWr_1VXVY_6cyc], (instrs FMULX64)>;
+
+def : InstRW<[FalkorWr_2VXVY_1cyc], (instregex "^(FABS|FNEG)(v2f64|v4f32|v8f16)$")>;
+
+def : InstRW<[FalkorWr_2VXVY_2cyc], (instregex "^(F(MAX|MIN)(NM)?P?|FAC(GE|GT)|FCM(EQ|GE|GT))(v2f64|v4f32|v8f16|v2i64p)$")>;
+def : InstRW<[FalkorWr_2VXVY_2cyc], (instregex "^FCM(EQ|LE|GE|GT|LT)(v2i64|v4i32|v8i16)rz$")>;
+def : InstRW<[FalkorWr_2VXVY_2cyc], (instregex "^FRINT(A|I|M|N|P|X|Z)(v2f64|v4f32|v8f16)$")>;
+
+def : InstRW<[FalkorWr_1VX_1VY_10cyc],(instregex "^(FDIV|FSQRT)(v2f32|v4f16)$")>;
+
+def : InstRW<[FalkorWr_2VXVY_3cyc], (instregex "^(FABD|FADD(P)?|FSUB)(v2f64|v4f32|v8f16)$")>;
+
+def : InstRW<[FalkorWr_2VXVY_4cyc], (instregex "^FCVT(N|M|P|Z|A)(S|U)(v2f64|v4f32|v8f16)$")>;
+def : InstRW<[FalkorWr_2VXVY_4cyc], (instregex "^(FCVTL|FCVTL2)(v2i32|v4i16|v4i32|v8i16)$")>;
+def : InstRW<[FalkorWr_2VXVY_4cyc], (instregex "^FCVTZ(S|U)(v2i64|v4i32|v8i16)(_shift)?$")>;
+
+def : InstRW<[FalkorWr_2VXVY_5cyc], (instregex "^(FMUL|FMULX)(v2f64|v4f32|v8f16|v8i16_indexed|v4i32_indexed)$")>;
+
+def : InstRW<[FalkorWr_2VXVY_6cyc], (instregex "^(FMUL|FMULX)v2i64_indexed$")>;
+
+def : InstRW<[FalkorWr_3VXVY_4cyc], (instregex "^(FCVTX?N|FCVTX?N2)(v1i32|v1i64|v1f16|v2f32|v4f16)$")>;
+
+def : InstRW<[FalkorWr_3VXVY_5cyc], (instregex "^(FCVTX?N|FCVTX?N2)(v2i32|v4i16|v4i32|v8i16|v4f32)$")>;
+
+def : InstRW<[FalkorWr_2VX_2VY_2cyc], (instregex "^(FDIV|FSQRT)(v2f64|v4f32|v8f16)$")>;
+
+def : InstRW<[FalkorWr_1VXVY_4cyc, FalkorReadVMA],(instregex "^ML(A|S)(v8i8|v4i16|v2i32)(_indexed)?$")>;
+def : InstRW<[FalkorWr_2VXVY_4cyc, FalkorReadVMA],(instregex "^ML(A|S)(v16i8|v8i16|v4i32|v2i64)(_indexed)?$")>;
+
+def : InstRW<[FalkorWr_1VXVY_5cyc, FalkorReadFMA],(instregex "^FML(A|S)(v2f32|v4f16|(v1i16_indexed|v4i16_indexed|v1i32_indexed|v2i32_indexed))$")>;
+def : InstRW<[FalkorWr_1VXVY_6cyc, FalkorReadFMA],(instregex "^FML(A|S)v1i64_indexed$")>;
+def : InstRW<[FalkorWr_2VXVY_5cyc, FalkorReadFMA],(instregex "^FML(A|S)(v2f64|v4f32|v8f16|v8i16_indexed|v4i32_indexed)$")>;
+def : InstRW<[FalkorWr_2VXVY_6cyc, FalkorReadFMA],(instregex "^FML(A|S)v2i64_indexed$")>;
+// SIMD Integer Instructions
+// -----------------------------------------------------------------------------
+def : InstRW<[FalkorWr_1VXVY_1cyc], (instregex "^ADD(v1i64|v2i32|v4i16|v8i8)$")>;
+def : InstRW<[FalkorWr_1VXVY_1cyc], (instrs ADDPv2i64p)>;
+def : InstRW<[FalkorWr_1VXVY_1cyc], (instregex "^(AND|ORR|ORN|BIC|EOR)v8i8$")>;
+def : InstRW<[FalkorWr_1VXVY_1cyc], (instregex "^(BIC|ORR)(v2i32|v4i16)$")>;
+def : InstRW<[FalkorWr_1VXVY_1cyc], (instregex "^NEG(v1i64|v2i32|v4i16|v8i8)$")>;
+def : InstRW<[FalkorWr_1VXVY_1cyc], (instregex "^SUB(v1i64|v2i32|v4i16|v8i8)$")>;
+
+def : InstRW<[FalkorWr_1VXVY_2cyc], (instregex "^(S|U)(ADDLP|HADD|HSUB|SHL)(v2i32|v4i16|v8i8)(_v.*)?$")>;
+def : InstRW<[FalkorWr_1VXVY_2cyc], (instregex "^(S|U)SHLv1i64$")>;
+def : InstRW<[FalkorWr_1VXVY_2cyc], (instregex "^(S|U)SHR(v2i32|v4i16|v8i8)_shift$")>;
+def : InstRW<[FalkorWr_1VXVY_2cyc], (instregex "^((S|U)?(MAX|MIN)P?|ABS|ADDP|CM(EQ|GE|HS|GT|HI))(v1i64|v2i32|v4i16|v8i8)$")>;
+def : InstRW<[FalkorWr_1VXVY_2cyc], (instregex "^CM(EQ|GE|HS|GT|HI)(v1i64|v2i32|v4i16|v8i8)$")>;
+def : InstRW<[FalkorWr_1VXVY_2cyc], (instregex "^CM(EQ|LE|GE|GT|LT)(v1i64|v2i32|v4i16|v8i8)rz$")>;
+def : InstRW<[FalkorWr_1VXVY_2cyc], (instregex "^CMTST(v1i64|v2i32|v4i16|v8i8)$")>;
+def : InstRW<[FalkorWr_1VXVY_2cyc], (instrs PMULv8i8)>;
+def : InstRW<[FalkorWr_1VXVY_2cyc], (instregex "^SHL(v2i32|v4i16|v8i8)_shift$")>;
+
+def : InstRW<[FalkorWr_1VXVY_3cyc], (instregex "^SQNEG(v2i32|v4i16|v8i8)$")>;
+def : InstRW<[FalkorWr_1VXVY_3cyc], (instregex "^(S|U)R?SRA(d|(v2i32|v4i16|v8i8)_shift)$")>;
+def : InstRW<[FalkorWr_1VXVY_3cyc], (instregex "^(S|U)(ABD|ADALP)(v8i8|v4i16|v2i32)(_v.*)?$")>;
+def : InstRW<[FalkorWr_1VXVY_3cyc], (instregex "^(S|U)ADDLVv4i16v$")>;
+def : InstRW<[FalkorWr_1VXVY_3cyc], (instregex "^(S|U)QADD(v1i8|v1i16|v2i16|v1i32|v1i64|v2i32|v4i16|v8i8)$")>;
+def : InstRW<[FalkorWr_1VXVY_3cyc], (instregex "^(S|U)QSHLU?(d|s|h|b|(v8i8|v4i16|v2i32)_shift)$")>;
+def : InstRW<[FalkorWr_1VXVY_3cyc], (instregex "^(S|U)(QSHL|RSHL|QRSHL)(v1i8|v1i16|v1i32|v1i64|v2i32|v4i16|v8i8)$")>;
+def : InstRW<[FalkorWr_1VXVY_3cyc], (instregex "^(SQR?SHRN|UQR?SHRN|SQR?SHRUN)(s|h|b)$")>;
+def : InstRW<[FalkorWr_1VXVY_3cyc], (instregex "^(S|U)QSUB(v1i8|v1i16|v2i16|v1i32|v1i64|v2i32|v4i16|v8i8)$")>;
+def : InstRW<[FalkorWr_1VXVY_3cyc], (instregex "^(S|U)RHADD(v2i32|v4i16|v8i8)$")>;
+def : InstRW<[FalkorWr_1VXVY_3cyc], (instregex "^(S|U)RSHR(v2i32|v4i16|v8i8)_shift$")>;
+def : InstRW<[FalkorWr_1VXVY_3cyc], (instregex "^(SU|US)QADD(v1i8|v1i16|v2i16|v1i32|v1i64|v2i32|v4i16|v8i8)$")>;
+def : InstRW<[FalkorWr_1VXVY_3cyc], (instregex "^(S|U)?(MAX|MIN)V(v4i16v|v4i32v)$")>;
+def : InstRW<[FalkorWr_1VXVY_3cyc], (instrs ADDVv4i16v)>;
+def : InstRW<[FalkorWr_1VXVY_3cyc], (instregex "^S(L|R)I(d|(v8i8|v4i16|v2i32)_shift)$")>;
+def : InstRW<[FalkorWr_1VXVY_3cyc], (instregex "^SQABS(v1i8|v1i16|v1i32|v1i64|v2i32|v4i16|v8i8)$")>;
+def : InstRW<[FalkorWr_1VXVY_3cyc], (instregex "^SQNEG(v1i8|v1i16|v1i32|v1i64)$")>;
+
+def : InstRW<[FalkorWr_1VXVY_4cyc], (instregex "^(S|U)ADDLVv8i8v$")>;
+def : InstRW<[FalkorWr_1VXVY_4cyc], (instregex "^(S|U)?(MAX|MIN)V(v8i8v|v8i16v)$")>;
+def : InstRW<[FalkorWr_1VXVY_4cyc], (instrs ADDVv8i8v)>;
+def : InstRW<[FalkorWr_1VXVY_4cyc], (instregex "^MUL(v2i32|v4i16|v8i8)(_indexed)?$")>;
+def : InstRW<[FalkorWr_1VXVY_4cyc], (instregex "^SQR?DMULH(v8i8|v4i16|v1i32|v2i32|v1i16)(_indexed)?$")>;
+def : InstRW<[FalkorWr_1VXVY_4cyc], (instregex "^SQDMULL(i16|i32)$")>;
+def : InstRW<[FalkorWr_1VXVY_4cyc], (instregex "^SQRDML(A|S)?H(v8i8|v4i16|v1i32|v2i32|v1i16)(_indexed)?$")>;
+
+def : InstRW<[FalkorWr_1VXVY_5cyc], (instregex "^(S|U)?(MAX|MIN)Vv16i8v$")>;
+
+def : InstRW<[FalkorWr_2VXVY_3cyc], (instrs ADDVv4i32v)>;
+
+def : InstRW<[FalkorWr_2VXVY_4cyc], (instrs ADDVv8i16v)>;
+def : InstRW<[FalkorWr_2VXVY_4cyc], (instregex "^(ADD|SUB)HNv.*$")>;
+def : InstRW<[FalkorWr_2VXVY_4cyc], (instregex "^(S|U)ABA(v2i32|v4i16|v8i8)$")>;
+
+def : InstRW<[FalkorWr_2VXVY_5cyc], (instrs ADDVv16i8v)>;
+
+def : InstRW<[FalkorWr_2VXVY_6cyc], (instregex "^(SQR?SHRN|UQR?SHRN|SQR?SHRUN)(v8i8|v16i8|v4i16|v8i16|v2i32|v4i32)_shift?$")>;
+def : InstRW<[FalkorWr_2VXVY_6cyc], (instregex "^R(ADD|SUB)HNv.*$")>;
+
+def : InstRW<[FalkorWr_2VXVY_1cyc], (instregex "^ADD(v16i8|v8i16|v4i32|v2i64)$")>;
+def : InstRW<[FalkorWr_2VXVY_1cyc], (instrs ADDPv2i64)>; // sz==11
+def : InstRW<[FalkorWr_2VXVY_1cyc], (instregex "^(AND|ORR|ORN|BIC|EOR)v16i8$")>;
+def : InstRW<[FalkorWr_2VXVY_1cyc], (instregex "^(BIC|ORR)(v8i16|v4i32)$")>;
+def : InstRW<[FalkorWr_2VXVY_1cyc], (instregex "^(NEG|SUB)(v16i8|v8i16|v4i32|v2i64)$")>;
+
+def : InstRW<[FalkorWr_2VXVY_2cyc], (instregex "^(S|U)ADDLv.*$")>;
+def : InstRW<[FalkorWr_2VXVY_2cyc], (instregex "^(S|U)(ADDLP|HADD|HSUB|SHL)(v16i8|v2i64|v4i32|v8i16)(_v.*)?$")>;
+def : InstRW<[FalkorWr_2VXVY_2cyc], (instregex "^(S|U)SHLL(v16i8|v8i16|v4i32|v8i8|v4i16|v2i32)(_shift)?$")>;
+def : InstRW<[FalkorWr_2VXVY_2cyc], (instregex "^(S|U)SHR(v16i8|v8i16|v4i32|v2i64)_shift$")>;
+def : InstRW<[FalkorWr_2VXVY_2cyc], (instregex "^(S|U)SUBLv.*$")>;
+def : InstRW<[FalkorWr_2VXVY_2cyc], (instregex "^((S|U)?(MAX|MIN)P?|ABS)(v16i8|v2i64|v4i32|v8i16)$")>;
+def : InstRW<[FalkorWr_2VXVY_2cyc], (instregex "^ADDP(v4i32|v8i16|v16i8)$")>; // sz!=11
+def : InstRW<[FalkorWr_2VXVY_2cyc], (instregex "^CM(EQ|GE|HS|GT|HI)(v16i8|v2i64|v4i32|v8i16)$")>;
+def : InstRW<[FalkorWr_2VXVY_2cyc], (instregex "^CM(EQ|LE|GE|GT|LT)(v16i8|v2i64|v4i32|v8i16)rz$")>;
+def : InstRW<[FalkorWr_2VXVY_2cyc], (instregex "^(CMTST|PMUL)(v16i8|v2i64|v4i32|v8i16)$")>;
+def : InstRW<[FalkorWr_2VXVY_2cyc], (instregex "^PMULL2?(v8i8|v16i8)$")>;
+def : InstRW<[FalkorWr_2VXVY_2cyc], (instregex "^SHL(v16i8|v8i16|v4i32|v2i64)_shift$")>;
+def : InstRW<[FalkorWr_2VXVY_2cyc], (instregex "^SHLL(v16i8|v8i16|v4i32|v8i8|v4i16|v2i32)(_shift)?$")>;
+
+def : InstRW<[FalkorWr_2VXVY_3cyc], (instregex "^(S|U)R?SRA(v2i64|v4i32|v8i16|v16i8)_shift$")>;
+def : InstRW<[FalkorWr_2VXVY_3cyc], (instregex "^(S|U)ABD(v16i8|v8i16|v4i32|v2i64)$")>;
+def : InstRW<[FalkorWr_2VXVY_3cyc], (instregex "^(S|U)ABDLv.*$")>;
+def : InstRW<[FalkorWr_2VXVY_3cyc], (instregex "^(S|U)(ADALP|QADD)(v16i8|v8i16|v4i32|v2i64)(_v.*)?$")>;
+def : InstRW<[FalkorWr_2VXVY_3cyc], (instregex "^(S|U)QSHLU?(v2i64|v4i32|v8i16|v16i8)_shift$")>;
+def : InstRW<[FalkorWr_2VXVY_3cyc], (instregex "^(S|U)(QSHL|RSHL|QRSHL|QSUB|RHADD)(v16i8|v8i16|v4i32|v2i64)$")>;
+def : InstRW<[FalkorWr_2VXVY_3cyc], (instregex "^(S|U)RSHR(v2i64|v4i32|v8i16|v16i8)_shift$")>;
+def : InstRW<[FalkorWr_2VXVY_3cyc], (instregex "^(SU|US)QADD(v16i8|v8i16|v4i32|v2i64)$")>;
+def : InstRW<[FalkorWr_2VXVY_3cyc], (instregex "^PMULL2?(v1i64|v2i64)$")>;
+def : InstRW<[FalkorWr_2VXVY_3cyc], (instregex "^S(L|R)I(v16i8|v8i16|v4i32|v2i64)_shift$")>;
+def : InstRW<[FalkorWr_2VXVY_3cyc], (instregex "^SQ(ABS|NEG)(v16i8|v8i16|v4i32|v2i64)$")>;
+
+def : InstRW<[FalkorWr_2VXVY_4cyc], (instregex "^(MUL|SQR?DMULH)(v16i8|v8i16|v4i32)(_indexed)?$")>;
+def : InstRW<[FalkorWr_2VXVY_4cyc], (instregex "^SQDMULLv.*$")>;
+def : InstRW<[FalkorWr_2VXVY_4cyc], (instregex "^SQRDML(A|S)H(v16i8|v8i16|v4i32)(_indexed)?$")>;
+
+def : InstRW<[FalkorWr_3VXVY_3cyc], (instregex "^(S|U)ADDLVv4i32v$")>;
+
+def : InstRW<[FalkorWr_3VXVY_5cyc], (instregex "^(S|U)ADDLVv8i16v$")>;
+
+def : InstRW<[FalkorWr_3VXVY_6cyc], (instregex "^(S|U)ADDLVv16i8v$")>;
+
+def : InstRW<[FalkorWr_4VXVY_2cyc], (instregex "^(S|U)(ADD|SUB)Wv.*$")>;
+
+def : InstRW<[FalkorWr_4VXVY_3cyc], (instregex "^(S|U)ABALv.*$")>;
+
+def : InstRW<[FalkorWr_4VXVY_4cyc], (instregex "^(S|U)ABA(v16i8|v8i16|v4i32)$")>;
+
+def : InstRW<[FalkorWr_1VXVY_4cyc, FalkorReadVMA],(instregex "^SQD(MLAL|MLSL)(i16|i32)$")>;
+def : InstRW<[FalkorWr_2VXVY_4cyc, FalkorReadVMA],(instregex "^SQD(MLAL|MLSL)v.*$")>;
+// SIMD Load Instructions
+// -----------------------------------------------------------------------------
+def : InstRW<[WriteVLD], (instregex "^LD1(i64|Onev(8b|4h|2s|1d|16b|8h|4s|2d))$")>;
+def : InstRW<[WriteVLD], (instregex "LD1Rv(8b|4h|2s|1d|16b|8h|4s|2d)$")>;
+def : InstRW<[WriteVLD], (instrs LD2i64)>;
+def : InstRW<[WriteVLD, WriteAdr], (instregex "^LD1(i64|Onev(8b|4h|2s|1d|16b|8h|4s|2d))_POST$")>;
+def : InstRW<[WriteVLD, WriteAdr], (instregex "LD1Rv(8b|4h|2s|1d|16b|8h|4s|2d)_POST$")>;
+def : InstRW<[WriteVLD, WriteAdr], (instrs LD2i64_POST)>;
+
+def : InstRW<[FalkorWr_1LD_1VXVY_4cyc], (instregex "LD1i(8|16|32)$")>;
+def : InstRW<[FalkorWr_1LD_1VXVY_4cyc, WriteAdr], (instregex "LD1i(8|16|32)_POST$")>;
+
+def : InstRW<[FalkorWr_1LD_1none_3cyc], (instregex "^LD1Twov(8b|4h|2s|1d)$")>;
+def : InstRW<[FalkorWr_1LD_1none_3cyc], (instregex "^LD2Twov(8b|4h|2s|1d)$")>;
+def : InstRW<[FalkorWr_1LD_1none_3cyc], (instregex "^LD2Rv(8b|4h|2s|1d)$")>;
+def : InstRW<[FalkorWr_1LD_1none_3cyc, WriteAdr], (instregex "^LD1Twov(8b|4h|2s|1d)_POST$")>;
+def : InstRW<[FalkorWr_1LD_1none_3cyc, WriteAdr], (instregex "^LD2Twov(8b|4h|2s|1d)_POST$")>;
+def : InstRW<[FalkorWr_1LD_1none_3cyc, WriteAdr], (instregex "^LD2Rv(8b|4h|2s|1d)_POST$")>;
+
+def : InstRW<[FalkorWr_2LD_3cyc], (instregex "^LD1Twov(16b|8h|4s|2d)$")>;
+def : InstRW<[FalkorWr_2LD_3cyc], (instregex "^LD2Twov(16b|8h|4s|2d)$")>;
+def : InstRW<[FalkorWr_2LD_3cyc], (instregex "^LD2Rv(16b|8h|4s|2d)$")>;
+def : InstRW<[FalkorWr_2LD_3cyc], (instrs LD3i64)>;
+def : InstRW<[FalkorWr_2LD_3cyc], (instrs LD4i64)>;
+def : InstRW<[FalkorWr_2LD_3cyc, WriteAdr], (instregex "^LD1Twov(16b|8h|4s|2d)_POST$")>;
+def : InstRW<[FalkorWr_2LD_3cyc, WriteAdr], (instregex "^LD2Twov(16b|8h|4s|2d)_POST$")>;
+def : InstRW<[FalkorWr_2LD_3cyc, WriteAdr], (instregex "^LD2Rv(16b|8h|4s|2d)_POST$")>;
+def : InstRW<[FalkorWr_2LD_3cyc, WriteAdr], (instrs LD3i64_POST)>;
+def : InstRW<[FalkorWr_2LD_3cyc, WriteAdr], (instrs LD4i64_POST)>;
+
+def : InstRW<[FalkorWr_1LD_2VXVY_4cyc], (instregex "^LD2i(8|16|32)$")>;
+def : InstRW<[FalkorWr_1LD_2VXVY_4cyc, WriteAdr], (instregex "^LD2i(8|16|32)_POST$")>;
+
+def : InstRW<[FalkorWr_2LD_1none_3cyc], (instregex "^LD1Threev(8b|4h|2s|1d)$")>;
+def : InstRW<[FalkorWr_2LD_1none_3cyc], (instregex "^LD3Rv(8b|4h|2s|1d)$")>;
+def : InstRW<[FalkorWr_2LD_1none_3cyc, WriteAdr], (instregex "^LD1Threev(8b|4h|2s|1d)_POST$")>;
+def : InstRW<[FalkorWr_2LD_1none_3cyc, WriteAdr], (instregex "^LD3Rv(8b|4h|2s|1d)_POST$")>;
+
+def : InstRW<[FalkorWr_3LD_3cyc], (instregex "^LD1Threev(16b|8h|4s|2d)$")>;
+def : InstRW<[FalkorWr_3LD_3cyc], (instrs LD3Threev2d)>;
+def : InstRW<[FalkorWr_3LD_3cyc], (instregex "^LD3Rv(16b|8h|4s|2d)$")>;
+def : InstRW<[FalkorWr_3LD_3cyc, WriteAdr], (instregex "^LD1Threev(16b|8h|4s|2d)_POST$")>;
+def : InstRW<[FalkorWr_3LD_3cyc, WriteAdr], (instrs LD3Threev2d_POST)>;
+def : InstRW<[FalkorWr_3LD_3cyc, WriteAdr], (instregex "^LD3Rv(16b|8h|4s|2d)_POST$")>;
+
+def : InstRW<[FalkorWr_1LD_3VXVY_4cyc], (instregex "LD3i(8|16|32)$")>;
+def : InstRW<[FalkorWr_1LD_3VXVY_4cyc, WriteAdr], (instregex "LD3i(8|16|32)_POST$")>;
+
+def : InstRW<[FalkorWr_2LD_2none_3cyc], (instregex "^LD1Fourv(8b|4h|2s|1d)$")>;
+def : InstRW<[FalkorWr_2LD_2none_3cyc], (instregex "^LD4Rv(8b|4h|2s|1d)$")>;
+def : InstRW<[FalkorWr_2LD_2none_3cyc, WriteAdr], (instregex "^LD1Fourv(8b|4h|2s|1d)_POST$")>;
+def : InstRW<[FalkorWr_2LD_2none_3cyc, WriteAdr], (instregex "^LD4Rv(8b|4h|2s|1d)_POST$")>;
+
+def : InstRW<[FalkorWr_4LD_3cyc], (instregex "^LD1Fourv(16b|8h|4s|2d)$")>;
+def : InstRW<[FalkorWr_4LD_3cyc], (instrs LD4Fourv2d)>;
+def : InstRW<[FalkorWr_4LD_3cyc], (instregex "^LD4Rv(16b|8h|4s|2d)$")>;
+def : InstRW<[FalkorWr_4LD_3cyc, WriteAdr], (instregex "^LD1Fourv(16b|8h|4s|2d)_POST$")>;
+def : InstRW<[FalkorWr_4LD_3cyc, WriteAdr], (instrs LD4Fourv2d_POST)>;
+def : InstRW<[FalkorWr_4LD_3cyc, WriteAdr], (instregex "^LD4Rv(16b|8h|4s|2d)_POST$")>;
+
+def : InstRW<[FalkorWr_1LD_4VXVY_4cyc], (instregex "^LD4i(8|16|32)$")>;
+def : InstRW<[FalkorWr_1LD_4VXVY_4cyc, WriteAdr], (instregex "^LD4i(8|16|32)_POST$")>;
+
+def : InstRW<[FalkorWr_2LD_2VXVY_1none_4cyc], (instregex "LD3Threev(8b|4h|2s|1d)$")>;
+def : InstRW<[FalkorWr_2LD_2VXVY_1none_4cyc, WriteAdr],(instregex "LD3Threev(8b|4h|2s|1d)_POST$")>;
+
+def : InstRW<[FalkorWr_2LD_2VXVY_2none_4cyc], (instregex "^LD4Fourv(8b|4h|2s|1d)$")>;
+def : InstRW<[FalkorWr_2LD_2VXVY_2none_4cyc, WriteAdr],(instregex "^LD4Fourv(8b|4h|2s|1d)_POST$")>;
+
+def : InstRW<[FalkorWr_2LD_2VXVY_2LD_2VXVY_4cyc], (instregex "LD3Threev(16b|8h|4s)$")>;
+def : InstRW<[FalkorWr_2LD_2VXVY_2LD_2VXVY_4cyc], (instregex "^LD4Fourv(16b|8h|4s)$")>;
+
+def : InstRW<[FalkorWr_2LD_2VXVY_1XYZ_2LD_2VXVY_4cyc, WriteAdr],(instregex "LD3Threev(16b|8h|4s)_POST$")>;
+def : InstRW<[FalkorWr_2LD_2VXVY_2LD_1XYZ_2VXVY_4cyc, WriteAdr],(instregex "^LD4Fourv(16b|8h|4s)_POST$")>;
+
+// Arithmetic and Logical Instructions
+// -----------------------------------------------------------------------------
+def : InstRW<[FalkorWr_ADD], (instregex "^ADD(S)?(W|X)r(s|x)$")>;
+def : InstRW<[FalkorWr_2XYZ_2cyc], (instregex "^SUB(S)?(W|X)r(s|x)$")>;
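Read together with the FalkorWr_ADD variant defined in AArch64SchedFalkorWriteRes.td: a shifted/extended ADD whose shift qualifies as LSLFast, or whose source is the zero register, is modeled as a single 1-cycle micro-op on one of the X/Y/Z pipes; any other form expands to two micro-ops with 2-cycle latency, matching the unconditional SUB mapping above.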
+
+// SIMD Miscellaneous Instructions
+// -----------------------------------------------------------------------------
+def : InstRW<[FalkorWr_1GTOV_1cyc], (instregex "^DUP(v8i8|v4i16|v2i32)(gpr|lane)$")>;
+def : InstRW<[FalkorWr_1VXVY_1cyc], (instregex "^DUP(v16i8|v8i16)(gpr|lane)$")>;
+def : InstRW<[FalkorWr_1GTOV_1cyc], (instregex "^INSv(i8|i16)(gpr|lane)$")>;
+def : InstRW<[FalkorWr_1VTOG_1cyc], (instregex "^(S|U)MOVv.*$")>;
+def : InstRW<[FalkorWr_1VXVY_1cyc], (instregex "^(BIF|BIT|BSL)v8i8$")>;
+def : InstRW<[FalkorWr_1VXVY_1cyc], (instrs EXTv8i8)>;
+def : InstRW<[FalkorWr_1VXVY_1cyc], (instregex "(MOVI|MVNI)(D|v8b_ns|v2i32|v4i16|v2s_msl)$")>;
+def : InstRW<[FalkorWr_1VXVY_1cyc], (instrs TBLv8i8One)>;
+def : InstRW<[FalkorWr_1VXVY_1cyc], (instrs NOTv8i8)>;
+def : InstRW<[FalkorWr_1VXVY_1cyc], (instregex "^REV(16|32|64)v.*$")>;
+def : InstRW<[FalkorWr_1VXVY_1cyc], (instregex "^(TRN1|TRN2|ZIP1|UZP1|UZP2|ZIP2|XTN|XTN2)(v2i32|v2i64|v4i16|v4i32|v8i8|v8i16|v16i8)$")>;
+
+def : InstRW<[FalkorWr_1VXVY_2cyc], (instregex "^(CLS|CLZ|CNT|RBIT)(v4i32|v8i16|v16i8)$")>;
+
+def : InstRW<[FalkorWr_1VXVY_3cyc], (instregex "(S|U)QXTU?Nv.*$")>;
+def : InstRW<[FalkorWr_1VXVY_3cyc], (instrs FRECPEv1i32, FRECPEv1i64, FRSQRTEv1i32, FRSQRTEv1i64, FRECPEv2f32, FRSQRTEv2f32)>;
+def : InstRW<[FalkorWr_1VXVY_3cyc], (instrs FRECPXv1i32, FRECPXv1i64)>;
+def : InstRW<[FalkorWr_1VXVY_3cyc], (instrs URECPEv2i32, URSQRTEv2i32)>;
+
+def : InstRW<[FalkorWr_1VXVY_5cyc], (instrs FRECPS32, FRSQRTS32, FRECPSv2f32, FRSQRTSv2f32)>;
+
+def : InstRW<[FalkorWr_1VXVY_6cyc], (instrs FRECPS64, FRSQRTS64)>;
+
+def : InstRW<[FalkorWr_1GTOV_1VXVY_2cyc],(instregex "^INSv(i32|i64)(gpr|lane)$")>;
+def : InstRW<[FalkorWr_2GTOV_1cyc], (instregex "^DUP(v4i32|v2i64)(gpr|lane)$")>;
+def : InstRW<[FalkorWr_2VXVY_1cyc], (instrs EXTv16i8)>;
+def : InstRW<[FalkorWr_2VXVY_1cyc], (instregex "(MOVI|MVNI)(v2d_ns|v16b_ns|v4i32|v8i16|v4s_msl)$")>;
+def : InstRW<[FalkorWr_2VXVY_1cyc], (instrs NOTv16i8)>;
+def : InstRW<[FalkorWr_2VXVY_1cyc], (instrs TBLv16i8One)>;
+
+def : InstRW<[FalkorWr_2VXVY_3cyc], (instrs FRECPEv2f64, FRECPEv4f32, FRSQRTEv2f64, FRSQRTEv4f32)>;
+def : InstRW<[FalkorWr_2VXVY_3cyc], (instrs URECPEv4i32, URSQRTEv4i32)>;
+
+def : InstRW<[FalkorWr_2VXVY_4cyc], (instrs TBLv8i8Two)>;
+def : InstRW<[FalkorWr_2VXVY_4cyc], (instregex "^TBX(v8|v16)i8One$")>;
+
+def : InstRW<[FalkorWr_2VXVY_5cyc], (instrs FRECPSv4f32, FRSQRTSv4f32)>;
+
+def : InstRW<[FalkorWr_2VXVY_6cyc], (instrs FRECPSv2f64, FRSQRTSv2f64)>;
+
+def : InstRW<[FalkorWr_3VXVY_5cyc], (instregex "^TBL(v8i8Three|v16i8Two)$")>;
+def : InstRW<[FalkorWr_3VXVY_5cyc], (instregex "^TBX(v8i8Two|v16i8Two)$")>;
+
+def : InstRW<[FalkorWr_4VXVY_6cyc], (instregex "^TBL(v8i8Four|v16i8Three)$")>;
+def : InstRW<[FalkorWr_4VXVY_6cyc], (instregex "^TBX(v8i8Three|v16i8Three)$")>;
+
+def : InstRW<[FalkorWr_5VXVY_7cyc], (instrs TBLv16i8Four)>;
+def : InstRW<[FalkorWr_5VXVY_7cyc], (instregex "^TBX(v8i8Four|v16i8Four)$")>;
+
+// SIMD Store Instructions
+// -----------------------------------------------------------------------------
+def : InstRW<[WriteVST], (instregex "^ST1(One(v8b|v4h|v2s|v1d)(_POST)?|(i8|i16|i32|i64)(_POST)?|One(v16b|v8h|v4s|v2d)|Two(v8b|v4h|v2s|v1d))$")>;
+def : InstRW<[WriteVST], (instregex "^ST2(Two(v8b|v4h|v2s|v1d)|(i8|i16|i32|i64))$")>;
+def : InstRW<[WriteVST, WriteAdr], (instregex "^ST1(One(v16b|v8h|v4s|v2d)|Two(v8b|v4h|v2s|v1d))_POST$")>;
+def : InstRW<[WriteVST, WriteAdr], (instregex "^ST2(Two(v8b|v4h|v2s|v1d)|(i8|i16|i32|i64))_POST$")>;
+
+def : InstRW<[WriteVST, WriteVST], (instregex "^ST1(Two(v16b|v8h|v4s|v2d)|(Three|Four)(v8b|v4h|v2s|v1d))$")>;
+def : InstRW<[WriteVST, WriteVST], (instregex "^ST2Two(v16b|v8h|v4s|v2d)$")>;
+def : InstRW<[WriteVST, WriteVST], (instregex "^ST3(i8|i16|i32|i64)$")>;
+def : InstRW<[WriteVST, WriteVST], (instregex "^ST4(i8|i16|i32|i64)$")>;
+def : InstRW<[WriteVST, WriteVST, WriteAdr], (instregex "^ST1(Two(v16b|v8h|v4s|v2d)|(Three|Four)(v8b|v4h|v2s|v1d))_POST$")>;
+def : InstRW<[WriteVST, WriteVST, WriteAdr], (instregex "^ST2Two(v16b|v8h|v4s|v2d)_POST$")>;
+def : InstRW<[WriteVST, WriteVST, WriteAdr], (instregex "^ST3(i8|i16|i32|i64)_POST$")>;
+def : InstRW<[WriteVST, WriteVST, WriteAdr], (instregex "^ST4(i8|i16|i32|i64)_POST$")>;
+
+def : InstRW<[WriteV, WriteVST, WriteVST], (instregex "^ST3Three(v8b|v4h|v2s|v1d)$")>;
+def : InstRW<[WriteV, WriteVST, WriteVST, WriteAdr], (instregex "^ST3Three(v8b|v4h|v2s|v1d)_POST$")>;
+
+def : InstRW<[WriteVST, WriteVST, WriteVST], (instregex "^ST1Three(v16b|v8h|v4s|v2d)$")>;
+def : InstRW<[WriteVST, WriteVST, WriteVST], (instrs ST3Threev2d)>;
+def : InstRW<[WriteVST, WriteVST, WriteVST, WriteAdr], (instregex "^ST1Three(v16b|v8h|v4s|v2d)_POST$")>;
+def : InstRW<[WriteVST, WriteVST, WriteVST, WriteAdr], (instrs ST3Threev2d_POST)>;
+
+def : InstRW<[WriteV, WriteV, WriteVST, WriteVST], (instregex "^ST4Four(v8b|v4h|v2s|v1d)$")>;
+def : InstRW<[WriteV, WriteV, WriteVST, WriteVST, WriteAdr], (instregex "^ST4Four(v8b|v4h|v2s|v1d)_POST$")>;
+
+def : InstRW<[WriteVST, WriteVST, WriteVST, WriteVST], (instregex "^ST1Four(v16b|v8h|v4s|v2d)$")>;
+def : InstRW<[WriteVST, WriteVST, WriteVST, WriteVST], (instrs ST4Fourv2d)>;
+def : InstRW<[WriteVST, WriteVST, WriteVST, WriteVST, WriteAdr], (instregex "^ST1Four(v16b|v8h|v4s|v2d)_POST$")>;
+def : InstRW<[WriteVST, WriteVST, WriteVST, WriteVST, WriteAdr], (instrs ST4Fourv2d_POST)>;
+
+def : InstRW<[WriteV, WriteV, WriteVST, WriteVST, WriteVST, WriteVST], (instregex "^ST3Three(v16b|v8h|v4s)$")>;
+def : InstRW<[WriteV, WriteV, WriteVST, WriteVST, WriteVST, WriteVST, WriteAdr],(instregex "^ST3Three(v16b|v8h|v4s)_POST$")>;
+
+def : InstRW<[WriteV, WriteV, WriteV, WriteV, WriteVST, WriteVST, WriteVST, WriteVST], (instregex "^ST4Four(v16b|v8h|v4s)$")>;
+def : InstRW<[WriteV, WriteV, WriteV, WriteV, WriteVST, WriteVST, WriteVST, WriteVST, WriteAdr],(instregex "^ST4Four(v16b|v8h|v4s)_POST$")>;
+
+// Branch Instructions
+// -----------------------------------------------------------------------------
+def : InstRW<[FalkorWr_1none_0cyc], (instrs B)>;
+def : InstRW<[FalkorWr_1Z_0cyc], (instregex "^(BR|RET|(CBZ|CBNZ|TBZ|TBNZ)(W|X))$")>;
+def : InstRW<[FalkorWr_1ZB_0cyc], (instrs Bcc)>;
+def : InstRW<[FalkorWr_1XYZB_0cyc], (instrs BL)>;
+def : InstRW<[FalkorWr_1Z_1XY_0cyc], (instrs BLR)>;
+
+// Cryptography Extensions
+// -----------------------------------------------------------------------------
+def : InstRW<[FalkorWr_1VXVY_1cyc], (instrs SHA1Hrr)>;
+def : InstRW<[FalkorWr_1VXVY_2cyc], (instrs AESIMCrr, AESMCrr)>;
+def : InstRW<[FalkorWr_2VXVY_3cyc], (instrs AESDrr, AESErr)>;
+def : InstRW<[FalkorWr_2VXVY_2cyc], (instrs SHA1SU0rrr, SHA1SU1rr, SHA256SU0rr)>;
+def : InstRW<[FalkorWr_1VX_1VY_4cyc], (instregex "^SHA1(C|M|P)rrr$")>;
+def : InstRW<[FalkorWr_1VX_1VY_5cyc], (instrs SHA256H2rrr, SHA256Hrrr)>;
+def : InstRW<[FalkorWr_4VXVY_3cyc], (instrs SHA256SU1rrr)>;
+
+// FP Load Instructions
+// -----------------------------------------------------------------------------
+def : InstRW<[WriteLD], (instregex "^LDR((Q|D|S|H|B)ui|(Q|D|S)l)$")>;
+def : InstRW<[WriteLD, WriteAdr], (instregex "^LDR(Q|D|S|H|B)(post|pre)$")>;
+def : InstRW<[WriteLD], (instregex "^LDUR(Q|D|S|H|B)i$")>;
+def : InstRW<[FalkorWr_LDR], (instregex "^LDR(Q|D|H|S|B)ro(W|X)$")>;
+def : InstRW<[FalkorWr_2LD_3cyc, WriteLDHi],(instrs LDNPQi)>;
+def : InstRW<[FalkorWr_2LD_3cyc, WriteLDHi],(instrs LDPQi)>;
+def : InstRW<[FalkorWr_1LD_1none_3cyc, WriteLDHi],(instregex "LDNP(D|S)i$")>;
+def : InstRW<[FalkorWr_1LD_1none_3cyc, WriteLDHi],(instregex "LDP(D|S)i$")>;
+def : InstRW<[FalkorWr_1LD_1none_3cyc, WriteLDHi, WriteAdr],(instregex "LDP(D|S)(pre|post)$")>;
+def : InstRW<[FalkorWr_2LD_3cyc, WriteLDHi, WriteAdr],(instregex "^LDPQ(pre|post)$")>;
+
+// FP Data Processing Instructions
+// -----------------------------------------------------------------------------
+def : InstRW<[FalkorWr_1VXVY_1cyc], (instregex "^FCCMP(E)?(H|S|D)rr$")>;
+def : InstRW<[FalkorWr_1VXVY_1cyc], (instregex "^FCMP(E)?(H|S|D)r(r|i)$")>;
+def : InstRW<[FalkorWr_1VTOG_1cyc], (instregex "^FCVT(A|M|N|P)(S|U)U(W|X)(H|S|D)r$")>;
+def : InstRW<[FalkorWr_1VXVY_1cyc], (instregex "^(FABS|FNEG)(H|S|D)r$")>;
+def : InstRW<[FalkorWr_1VXVY_1cyc], (instregex "^FCSEL(H|S|D)rrr$")>;
+
+def : InstRW<[FalkorWr_1VXVY_2cyc], (instregex "^F(MAX|MIN)(NM)?(H|S|D)rr$")>;
+def : InstRW<[FalkorWr_1VXVY_2cyc], (instregex "^F(MAX|MIN)(NM)?Pv2i(16|32|64)p$")>;
+def : InstRW<[FalkorWr_1VXVY_2cyc], (instrs FCVTHSr, FCVTHDr)>;
+def : InstRW<[FalkorWr_1VXVY_2cyc], (instregex "^FRINT(A|I|M|N|P|X|Z)(H|S|D)r$")>;
+
+def : InstRW<[FalkorWr_1VXVY_3cyc], (instregex "^FABD(16|32|64)$")>;
+def : InstRW<[FalkorWr_1VXVY_3cyc], (instregex "^(FADD|FSUB)(H|S|D)rr$")>;
+def : InstRW<[FalkorWr_1VXVY_3cyc], (instrs FCVTSHr, FCVTDHr)>;
+
+def : InstRW<[FalkorWr_1VXVY_4cyc], (instrs FCVTSDr, FCVTDSr)>;
+
+def : InstRW<[FalkorWr_1VXVY_5cyc], (instregex "^F(N)?MUL(H|S)rr$")>;
+
+def : InstRW<[FalkorWr_1VXVY_6cyc], (instregex "^F(N)?MULDrr$")>;
+
+def : InstRW<[FalkorWr_1VX_1VY_10cyc],(instregex "^FDIV(H|S|D)rr$")>;
+def : InstRW<[FalkorWr_1VX_1VY_2cyc], (instregex "^FSQRT(H|S|D)r$")>;
+
+def : InstRW<[FalkorWr_1VXVY_5cyc, FalkorReadFMA],(instregex "^F(N)?M(ADD|SUB)(H|S)rrr$")>;
+def : InstRW<[FalkorWr_1VXVY_6cyc, FalkorReadFMA],(instregex "^F(N)?M(ADD|SUB)Drrr$")>;
+// FP Miscellaneous Instructions
+// -----------------------------------------------------------------------------
+def : InstRW<[FalkorWr_FMOV], (instregex "^FMOV(HW|HX|SW|DX|DXHigh)r$")>;
+def : InstRW<[FalkorWr_1VTOG_1cyc], (instregex "^FCVTZ(S|U)(S|U)(W|X)(D|S)ri?$")>;
+def : InstRW<[FalkorWr_1VTOG_1cyc], (instregex "^FMOV(WH|WS|XH|XD|XDHigh)r$")>;
+def : InstRW<[FalkorWr_1VXVY_1cyc], (instregex "^FMOV(Hi|Hr|S0|Si|Sr|D0|Di|Dr|v.*_ns)$")>;
+
+def : InstRW<[FalkorWr_1GTOV_4cyc], (instregex "^(S|U)CVTF(S|U)(W|X)(D|S)ri$")>;
+def : InstRW<[FalkorWr_1VXVY_4cyc], (instregex "^(S|U)CVTF(v1i16|v1i32|v2i32|v1i64|v4i16|v2f32|v4f16|d|s)(_shift)?")>;
+
+def : InstRW<[FalkorWr_2VXVY_4cyc], (instregex "^(S|U)CVTF(v2i64|v4i32|v8i16|v2f64|v4f32|v8f16)(_shift)?")>;
+
+// Load Instructions
+// -----------------------------------------------------------------------------
+def : InstRW<[FalkorWr_1ST_0cyc], (instrs PRFMui, PRFMl)>;
+def : InstRW<[FalkorWr_1ST_0cyc], (instrs PRFUMi)>;
+
+def : InstRW<[WriteLD, WriteLDHi], (instregex "^LDNP(W|X)i$")>;
+def : InstRW<[WriteLD, WriteLDHi], (instregex "^LDP(W|X)i$")>;
+def : InstRW<[FalkorWr_1LD_3cyc], (instregex "^LDR(B|H|W|X)ui$")>;
+def : InstRW<[WriteLD, WriteAdr], (instregex "^LDR(B|H|W|X)(post|pre)$")>;
+def : InstRW<[FalkorWr_1LD_3cyc], (instregex "^LDR(W|X)l$")>;
+def : InstRW<[FalkorWr_1LD_3cyc], (instregex "^LDTR(B|H|W|X)i$")>;
+def : InstRW<[FalkorWr_1LD_3cyc], (instregex "^LDUR(B|H|W|X)i$")>;
+
+def : InstRW<[FalkorWr_1LD_4cyc], (instregex "^LDRS(BW|BX|HW|HX|W)ui$")>;
+def : InstRW<[FalkorWr_1LD_4cyc], (instrs LDRSWl)>;
+def : InstRW<[FalkorWr_1LD_4cyc], (instregex "^LDTRS(BW|BX|HW|HX|W)i$")>;
+def : InstRW<[FalkorWr_1LD_4cyc], (instregex "^LDURS(BW|BX|HW|HX|W)i$")>;
+
+def : InstRW<[FalkorWr_PRFM], (instregex "^PRFMro(W|X)$")>;
+def : InstRW<[FalkorWr_LDR], (instregex "^LDR(B|H|W|X)ro(W|X)$")>;
+
+def : InstRW<[FalkorWr_LDRS], (instregex "^LDRS(BW|BX|HW|HX|W)ro(W|X)$")>;
+
+def : InstRW<[FalkorWr_1LD_4cyc, WriteAdr],(instregex "^LDRS(BW|BX|HW|HX|W)(post|pre)$")>;
+def : InstRW<[WriteLD, WriteLDHi, WriteAdr],(instregex "^LDP(W|X)(post|pre)$")>;
+def : InstRW<[FalkorWr_1LD_4cyc, WriteLDHi],(instrs LDPSWi)>;
+def : InstRW<[FalkorWr_1LD_4cyc, WriteLDHi, WriteAdr],(instregex "^LDPSW(post|pre)$")>;
+// Miscellaneous Data-Processing Instructions
+// -----------------------------------------------------------------------------
+def : InstRW<[FalkorWr_1XYZ_1cyc], (instregex "^(S|U)?BFM(W|X)ri$")>;
+def : InstRW<[FalkorWr_1X_2cyc], (instregex "^CRC32.*$")>;
+def : InstRW<[FalkorWr_1XYZ_2cyc], (instregex "^(CLS|CLZ|RBIT|REV|REV16|REV32)(W|X)r$")>;
+def : InstRW<[FalkorWr_2XYZ_2cyc], (instregex "^EXTR(W|X)rri$")>;
+
+// Divide and Multiply Instructions
+// -----------------------------------------------------------------------------
+def : InstRW<[FalkorWr_1X_4cyc], (instregex "^(S|U)M(ADD|SUB)Lrrr$")>;
+def : InstRW<[FalkorWr_1X_4cyc], (instregex "^M(ADD|SUB)Wrrr$")>;
+
+def : InstRW<[FalkorWr_1X_5cyc], (instregex "^(S|U)MULHrr$")>;
+def : InstRW<[FalkorWr_1X_5cyc], (instregex "^M(ADD|SUB)Xrrr$")>;
+
+def : InstRW<[FalkorWr_1X_1Z_8cyc], (instregex "^(S|U)DIVWr$")>;
+def : InstRW<[FalkorWr_1X_1Z_16cyc], (instregex "^(S|U)DIVXr$")>;
+
+def : InstRW<[FalkorWr_2VXVY_4cyc], (instregex "^(S|U)(MLAL|MLSL|MULL)v.*$")>;
+
+// Move and Shift Instructions
+// -----------------------------------------------------------------------------
+def : InstRW<[FalkorWr_1XYZ_1cyc], (instregex "^(LSLV|LSRV|ASRV|RORV|MOVK)(W|X).*")>;
+def : InstRW<[FalkorWr_1XYZB_1cyc], (instregex "^ADRP?$")>;
+def : InstRW<[FalkorWr_1XYZB_1cyc], (instregex "^MOVN(W|X)i$")>;
+def : InstRW<[FalkorWr_MOVZ], (instregex "^MOVZ(W|X)i$")>;
+
+// Other Instructions
+// -----------------------------------------------------------------------------
+def : InstRW<[FalkorWr_1LD_0cyc], (instrs CLREX, DMB, DSB)>;
+def : InstRW<[FalkorWr_1none_0cyc], (instrs BRK, DCPS1, DCPS2, DCPS3, HINT, HLT, HVC, ISB, SMC, SVC)>;
+def : InstRW<[FalkorWr_1ST_0cyc], (instrs SYSxt, SYSLxt)>;
+def : InstRW<[FalkorWr_1Z_0cyc], (instrs MSRpstateImm1, MSRpstateImm4)>;
+
+def : InstRW<[FalkorWr_1LD_3cyc], (instregex "^(LDAR(B|H|W|X)|LDAXP(W|X)|LDAXR(B|H|W|X)|LDXP(W|X)|LDXR(B|H|W|X))$")>;
+def : InstRW<[FalkorWr_1LD_3cyc], (instrs MRS)>;
+
+def : InstRW<[FalkorWr_1LD_1Z_3cyc], (instrs DRPS)>;
+
+def : InstRW<[FalkorWr_1SD_1ST_0cyc], (instrs MSR)>;
+def : InstRW<[WriteVST], (instrs STNPDi, STNPSi)>;
+def : InstRW<[WriteSTP], (instrs STNPWi, STNPXi)>;
+def : InstRW<[FalkorWr_2LD_1Z_3cyc], (instrs ERET)>;
+
+def : InstRW<[WriteST], (instregex "^LDC.*$")>;
+def : InstRW<[WriteST], (instregex "^STLR(B|H|W|X)$")>;
+def : InstRW<[WriteST], (instregex "^STXP(W|X)$")>;
+def : InstRW<[WriteST], (instregex "^STXR(B|H|W|X)$")>;
+
+def : InstRW<[WriteSTX], (instregex "^STLXP(W|X)$")>;
+def : InstRW<[WriteSTX], (instregex "^STLXR(B|H|W|X)$")>;
+def : InstRW<[WriteVST, WriteVST], (instrs STNPQi)>;
+
+// Store Instructions
+// -----------------------------------------------------------------------------
+def : InstRW<[WriteVST], (instregex "^STP(D|S)(i|post|pre)$")>;
+def : InstRW<[WriteST], (instregex "^STP(W|X)(i|post|pre)$")>;
+def : InstRW<[WriteST], (instregex "^STR(Q|D|S|BB|HH)ui$")>;
+def : InstRW<[WriteST], (instregex "^STUR(Q|D|S|BB|HH)i$")>;
+def : InstRW<[WriteST], (instregex "^STR(B|H|W|X)(post|pre|ui)$")>;
+def : InstRW<[WriteST], (instregex "^STTR(B|H|W|X)i$")>;
+def : InstRW<[WriteST], (instregex "^STUR(B|H|W|X)i$")>;
+
+def : InstRW<[WriteST, WriteAdr], (instregex "^STR(B|H|W|X)ro(W|X)$")>;
+
+def : InstRW<[WriteVST, WriteVST], (instregex "^STPQ(i|post|pre)$")>;
diff --git a/lib/Target/AArch64/AArch64SchedFalkorWriteRes.td b/lib/Target/AArch64/AArch64SchedFalkorWriteRes.td
new file mode 100644
index 000000000000..9cdb4be4246b
--- /dev/null
+++ b/lib/Target/AArch64/AArch64SchedFalkorWriteRes.td
@@ -0,0 +1,361 @@
+//==- AArch64SchedFalkorWriteRes.td - Falkor Write Res ------*- tablegen -*-==//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// Contains all of the Falkor-specific SchedWriteRes types. The approach
+// below is to define a generic SchedWriteRes for every combination of
+// latency and micro-op count. The naming convention is a prefix, one or more
+// micro-op count/type designators, and one field for the latency.
+// Prefix: FalkorWr
+// MicroOp Count/Types: #(B|X|Y|Z|LD|ST|SD|VX|VY|VSD)
+// Latency: #cyc
+//
+// e.g. FalkorWr_1Z_6SD_4VX_6cyc means there are 11 micro-ops to be issued
+// down one Z pipe, six SD pipes, four VX pipes and the total latency is
+// six cycles.
+//
+// Contains all of the Falkor-specific ReadAdvance types for forwarding logic.
+//
+// Contains all of the Falkor-specific WriteVariant types for immediate zero
+// and LSLFast.
+//===----------------------------------------------------------------------===//
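As a concrete illustration of the scheme, a def for one LD micro-op plus two VXVY micro-ops with the result ready after five cycles would read as follows (hypothetical example; this particular def is not used below):

    def FalkorWr_1LD_2VXVY_5cyc : SchedWriteRes<[FalkorUnitLD, FalkorUnitVXVY,
                                                 FalkorUnitVXVY]> {
      let Latency = 5;
      let NumMicroOps = 3;
    }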
+
+//===----------------------------------------------------------------------===//
+// Define 1 micro-op types
+
+def FalkorWr_1X_2cyc : SchedWriteRes<[FalkorUnitX]> { let Latency = 2; }
+def FalkorWr_1X_4cyc : SchedWriteRes<[FalkorUnitX]> { let Latency = 4; }
+def FalkorWr_1X_5cyc : SchedWriteRes<[FalkorUnitX]> { let Latency = 5; }
+def FalkorWr_1Z_0cyc : SchedWriteRes<[FalkorUnitZ]> { let Latency = 0; }
+def FalkorWr_1ZB_0cyc : SchedWriteRes<[FalkorUnitZB]> { let Latency = 0; }
+def FalkorWr_1LD_3cyc : SchedWriteRes<[FalkorUnitLD]> { let Latency = 3; }
+def FalkorWr_1LD_4cyc : SchedWriteRes<[FalkorUnitLD]> { let Latency = 4; }
+def FalkorWr_1XYZ_1cyc : SchedWriteRes<[FalkorUnitXYZ]> { let Latency = 1; }
+def FalkorWr_1XYZ_2cyc : SchedWriteRes<[FalkorUnitXYZ]> { let Latency = 2; }
+def FalkorWr_1XYZB_0cyc : SchedWriteRes<[FalkorUnitXYZB]>{ let Latency = 0; }
+def FalkorWr_1XYZB_1cyc : SchedWriteRes<[FalkorUnitXYZB]>{ let Latency = 1; }
+def FalkorWr_1none_0cyc : SchedWriteRes<[]> { let Latency = 0; }
+
+def FalkorWr_1VXVY_1cyc : SchedWriteRes<[FalkorUnitVXVY]>{ let Latency = 1; }
+def FalkorWr_1VXVY_2cyc : SchedWriteRes<[FalkorUnitVXVY]>{ let Latency = 2; }
+def FalkorWr_1VXVY_3cyc : SchedWriteRes<[FalkorUnitVXVY]>{ let Latency = 3; }
+def FalkorWr_1VXVY_4cyc : SchedWriteRes<[FalkorUnitVXVY]>{ let Latency = 4; }
+def FalkorWr_1VXVY_5cyc : SchedWriteRes<[FalkorUnitVXVY]>{ let Latency = 5; }
+def FalkorWr_1VXVY_6cyc : SchedWriteRes<[FalkorUnitVXVY]>{ let Latency = 6; }
+
+def FalkorWr_1LD_0cyc : SchedWriteRes<[FalkorUnitLD]> { let Latency = 0; }
+def FalkorWr_1ST_0cyc : SchedWriteRes<[FalkorUnitST]> { let Latency = 0; }
+def FalkorWr_1ST_3cyc : SchedWriteRes<[FalkorUnitST]> { let Latency = 3; }
+
+def FalkorWr_1GTOV_1cyc : SchedWriteRes<[FalkorUnitGTOV]>{ let Latency = 1; }
+def FalkorWr_1GTOV_4cyc : SchedWriteRes<[FalkorUnitGTOV]>{ let Latency = 4; }
+def FalkorWr_1VTOG_1cyc : SchedWriteRes<[FalkorUnitVTOG]>{ let Latency = 1; }
+
+//===----------------------------------------------------------------------===//
+// Define 2 micro-op types
+
+def FalkorWr_2VXVY_1cyc : SchedWriteRes<[FalkorUnitVXVY, FalkorUnitVXVY]> {
+ let Latency = 1;
+ let NumMicroOps = 2;
+}
+def FalkorWr_2VXVY_2cyc : SchedWriteRes<[FalkorUnitVXVY, FalkorUnitVXVY]> {
+ let Latency = 2;
+ let NumMicroOps = 2;
+}
+def FalkorWr_2VXVY_3cyc : SchedWriteRes<[FalkorUnitVXVY, FalkorUnitVXVY]> {
+ let Latency = 3;
+ let NumMicroOps = 2;
+}
+def FalkorWr_2VXVY_4cyc : SchedWriteRes<[FalkorUnitVXVY, FalkorUnitVXVY]> {
+ let Latency = 4;
+ let NumMicroOps = 2;
+}
+def FalkorWr_2VXVY_5cyc : SchedWriteRes<[FalkorUnitVXVY, FalkorUnitVXVY]> {
+ let Latency = 5;
+ let NumMicroOps = 2;
+}
+def FalkorWr_2VXVY_6cyc : SchedWriteRes<[FalkorUnitVXVY, FalkorUnitVXVY]> {
+ let Latency = 6;
+ let NumMicroOps = 2;
+}
+
+def FalkorWr_1LD_1VXVY_4cyc : SchedWriteRes<[FalkorUnitLD, FalkorUnitVXVY]> {
+ let Latency = 4;
+ let NumMicroOps = 2;
+}
+def FalkorWr_1XYZ_1LD_4cyc : SchedWriteRes<[FalkorUnitXYZ, FalkorUnitLD]> {
+ let Latency = 4;
+ let NumMicroOps = 2;
+}
+def FalkorWr_2LD_3cyc : SchedWriteRes<[FalkorUnitLD, FalkorUnitLD]> {
+ let Latency = 3;
+ let NumMicroOps = 2;
+}
+
+def FalkorWr_1VX_1VY_5cyc : SchedWriteRes<[FalkorUnitVX, FalkorUnitVY]> {
+ let Latency = 5;
+ let NumMicroOps = 2;
+}
+
+def FalkorWr_1VX_1VY_2cyc : SchedWriteRes<[FalkorUnitVX, FalkorUnitVY]> {
+ let Latency = 2;
+ let NumMicroOps = 2;
+}
+
+def FalkorWr_1VX_1VY_4cyc : SchedWriteRes<[FalkorUnitVX, FalkorUnitVY]> {
+ let Latency = 4;
+ let NumMicroOps = 2;
+}
+
+def FalkorWr_1VX_1VY_10cyc : SchedWriteRes<[FalkorUnitVX, FalkorUnitVY]> {
+ let Latency = 10;
+ let NumMicroOps = 2;
+}
+
+def FalkorWr_1GTOV_1VXVY_2cyc : SchedWriteRes<[FalkorUnitGTOV, FalkorUnitVXVY]> {
+ let Latency = 2;
+ let NumMicroOps = 2;
+}
+
+def FalkorWr_2GTOV_1cyc : SchedWriteRes<[FalkorUnitGTOV, FalkorUnitGTOV]> {
+ let Latency = 1;
+ let NumMicroOps = 2;
+}
+
+def FalkorWr_1XYZ_1ST_4cyc: SchedWriteRes<[FalkorUnitXYZ, FalkorUnitST]> {
+ let Latency = 4;
+ let NumMicroOps = 2;
+}
+def FalkorWr_1XYZ_1LD_5cyc: SchedWriteRes<[FalkorUnitXYZ, FalkorUnitLD]> {
+ let Latency = 5;
+ let NumMicroOps = 2;
+}
+
+def FalkorWr_2XYZ_2cyc : SchedWriteRes<[FalkorUnitXYZ, FalkorUnitXYZ]> {
+ let Latency = 2;
+ let NumMicroOps = 2;
+}
+
+def FalkorWr_1Z_1XY_0cyc : SchedWriteRes<[FalkorUnitZ, FalkorUnitXY]> {
+ let Latency = 0;
+ let NumMicroOps = 2;
+}
+
+def FalkorWr_1X_1Z_8cyc : SchedWriteRes<[FalkorUnitX, FalkorUnitZ]> {
+ let Latency = 8;
+ let ResourceCycles = [2, 8];
+}
+
+def FalkorWr_1X_1Z_16cyc : SchedWriteRes<[FalkorUnitX, FalkorUnitZ]> {
+ let Latency = 16;
+ let ResourceCycles = [2, 16];
+}
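A note on the two divide writes above: they are the only defs in this file that set ResourceCycles, so in addition to the result latency they keep the X pipe occupied for 2 cycles and the Z pipe for 8 (respectively 16) cycles per operation, modeling an unpipelined divider that stalls a following divide.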
+
+def FalkorWr_1LD_1Z_3cyc : SchedWriteRes<[FalkorUnitLD, FalkorUnitZ]> {
+ let Latency = 3;
+ let NumMicroOps = 2;
+}
+
+def FalkorWr_1LD_1none_3cyc : SchedWriteRes<[FalkorUnitLD]> {
+ let Latency = 3;
+ let NumMicroOps = 2;
+}
+
+def FalkorWr_1SD_1ST_0cyc: SchedWriteRes<[FalkorUnitSD, FalkorUnitST]> {
+ let Latency = 0;
+ let NumMicroOps = 2;
+}
+
+//===----------------------------------------------------------------------===//
+// Define 3 micro-op types
+
+def FalkorWr_3VXVY_3cyc : SchedWriteRes<[FalkorUnitVXVY, FalkorUnitVXVY]> {
+ let Latency = 3;
+ let NumMicroOps = 3;
+}
+def FalkorWr_3VXVY_4cyc : SchedWriteRes<[FalkorUnitVXVY, FalkorUnitVXVY]> {
+ let Latency = 4;
+ let NumMicroOps = 3;
+}
+def FalkorWr_3VXVY_5cyc : SchedWriteRes<[FalkorUnitVXVY, FalkorUnitVXVY]> {
+ let Latency = 5;
+ let NumMicroOps = 3;
+}
+def FalkorWr_3VXVY_6cyc : SchedWriteRes<[FalkorUnitVXVY, FalkorUnitVXVY]> {
+ let Latency = 6;
+ let NumMicroOps = 3;
+}
+
+def FalkorWr_1LD_2VXVY_4cyc : SchedWriteRes<[FalkorUnitLD, FalkorUnitVXVY]> {
+ let Latency = 4;
+ let NumMicroOps = 3;
+}
+def FalkorWr_2LD_1none_3cyc : SchedWriteRes<[FalkorUnitLD, FalkorUnitLD]> {
+ let Latency = 3;
+ let NumMicroOps = 3;
+}
+def FalkorWr_3LD_3cyc : SchedWriteRes<[FalkorUnitLD, FalkorUnitLD,
+ FalkorUnitLD]> {
+ let Latency = 3;
+ let NumMicroOps = 3;
+}
+
+def FalkorWr_2LD_1Z_3cyc : SchedWriteRes<[FalkorUnitLD, FalkorUnitLD,
+ FalkorUnitZ]> {
+ let Latency = 3;
+ let NumMicroOps = 3;
+}
+
+//===----------------------------------------------------------------------===//
+// Define 4 micro-op types
+
+def FalkorWr_2VX_2VY_2cyc : SchedWriteRes<[FalkorUnitVX, FalkorUnitVY,
+ FalkorUnitVX, FalkorUnitVY]> {
+ let Latency = 2;
+ let NumMicroOps = 4;
+}
+
+def FalkorWr_4VXVY_2cyc : SchedWriteRes<[FalkorUnitVXVY, FalkorUnitVXVY,
+ FalkorUnitVXVY, FalkorUnitVXVY]> {
+ let Latency = 2;
+ let NumMicroOps = 4;
+}
+def FalkorWr_4VXVY_3cyc : SchedWriteRes<[FalkorUnitVXVY, FalkorUnitVXVY,
+ FalkorUnitVXVY, FalkorUnitVXVY]> {
+ let Latency = 3;
+ let NumMicroOps = 4;
+}
+def FalkorWr_4VXVY_4cyc : SchedWriteRes<[FalkorUnitVXVY, FalkorUnitVXVY,
+ FalkorUnitVXVY, FalkorUnitVXVY]> {
+ let Latency = 4;
+ let NumMicroOps = 4;
+}
+def FalkorWr_4VXVY_6cyc : SchedWriteRes<[FalkorUnitVXVY, FalkorUnitVXVY,
+ FalkorUnitVXVY, FalkorUnitVXVY]> {
+ let Latency = 6;
+ let NumMicroOps = 4;
+}
+
+def FalkorWr_4LD_3cyc : SchedWriteRes<[FalkorUnitLD, FalkorUnitLD,
+ FalkorUnitLD, FalkorUnitLD]> {
+ let Latency = 3;
+ let NumMicroOps = 4;
+}
+
+def FalkorWr_1LD_3VXVY_4cyc: SchedWriteRes<[FalkorUnitLD, FalkorUnitVXVY,
+ FalkorUnitVXVY, FalkorUnitVXVY]> {
+ let Latency = 4;
+ let NumMicroOps = 4;
+}
+
+def FalkorWr_2LD_2none_3cyc: SchedWriteRes<[FalkorUnitLD, FalkorUnitLD]> {
+ let Latency = 3;
+ let NumMicroOps = 4;
+}
+
+//===----------------------------------------------------------------------===//
+// Define 5 micro-op types
+
+def FalkorWr_1LD_4VXVY_4cyc: SchedWriteRes<[FalkorUnitLD, FalkorUnitVXVY,
+ FalkorUnitVXVY, FalkorUnitVXVY,
+ FalkorUnitVXVY]> {
+ let Latency = 4;
+ let NumMicroOps = 5;
+}
+def FalkorWr_2LD_2VXVY_1none_4cyc: SchedWriteRes<[FalkorUnitLD, FalkorUnitLD,
+ FalkorUnitVXVY, FalkorUnitVXVY]> {
+ let Latency = 4;
+ let NumMicroOps = 5;
+}
+def FalkorWr_5VXVY_7cyc : SchedWriteRes<[FalkorUnitVXVY, FalkorUnitVXVY,
+ FalkorUnitVXVY, FalkorUnitVXVY,
+ FalkorUnitVXVY]> {
+ let Latency = 7;
+ let NumMicroOps = 5;
+}
+
+//===----------------------------------------------------------------------===//
+// Define 6 micro-op types
+
+def FalkorWr_2LD_2VXVY_2none_4cyc: SchedWriteRes<[FalkorUnitLD, FalkorUnitLD,
+ FalkorUnitVXVY, FalkorUnitVXVY]> {
+ let Latency = 4;
+ let NumMicroOps = 6;
+}
+
+//===----------------------------------------------------------------------===//
+// Define 8 micro-op types
+
+def FalkorWr_2LD_2VXVY_2LD_2VXVY_4cyc:SchedWriteRes<[FalkorUnitLD, FalkorUnitLD,
+ FalkorUnitVXVY, FalkorUnitVXVY,
+ FalkorUnitLD, FalkorUnitLD,
+ FalkorUnitVXVY, FalkorUnitVXVY]> {
+ let Latency = 4;
+ let NumMicroOps = 8;
+}
+
+//===----------------------------------------------------------------------===//
+// Define 9 micro-op types
+
+def FalkorWr_2LD_2VXVY_2LD_1XYZ_2VXVY_4cyc:SchedWriteRes<[FalkorUnitLD,
+ FalkorUnitLD, FalkorUnitVXVY,
+ FalkorUnitVXVY, FalkorUnitLD,
+ FalkorUnitLD, FalkorUnitXYZ,
+ FalkorUnitVXVY, FalkorUnitVXVY]> {
+ let Latency = 4;
+ let NumMicroOps = 9;
+}
+
+def FalkorWr_2LD_2VXVY_1XYZ_2LD_2VXVY_4cyc:SchedWriteRes<[FalkorUnitLD,
+ FalkorUnitLD, FalkorUnitVXVY,
+ FalkorUnitVXVY, FalkorUnitXYZ,
+ FalkorUnitLD, FalkorUnitLD,
+ FalkorUnitVXVY, FalkorUnitVXVY]> {
+ let Latency = 4;
+ let NumMicroOps = 9;
+}
+
+// Forwarding logic is modeled for vector multiply and accumulate
+// -----------------------------------------------------------------------------
+def FalkorReadVMA : SchedReadAdvance<2, [FalkorWr_1VXVY_4cyc,
+ FalkorWr_2VXVY_4cyc]>;
+def FalkorReadFMA : SchedReadAdvance<3, [FalkorWr_1VXVY_5cyc,
+ FalkorWr_1VXVY_6cyc,
+ FalkorWr_2VXVY_5cyc,
+ FalkorWr_2VXVY_6cyc]>;
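A worked reading of these forwarding entries: in a chain of FMLAs each mapped to FalkorWr_1VXVY_5cyc, FalkorReadFMA lets the accumulator operand be read 3 cycles early, so back-to-back dependent FMAs issue 5 - 3 = 2 cycles apart; FalkorReadVMA likewise turns the 4-cycle integer multiply-accumulate writes into an effective 4 - 2 = 2 cycle chain.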
+
+// SchedPredicates and WriteVariants for Immediate Zero and LSLFast
+// -----------------------------------------------------------------------------
+def FalkorImmZPred : SchedPredicate<[{TII->isGPRZero(*MI)}]>;
+def FalkorLSLFastPred : SchedPredicate<[{TII->isFalkorLSLFast(*MI)}]>;
+
+def FalkorWr_FMOV : SchedWriteVariant<[
+ SchedVar<FalkorImmZPred, [FalkorWr_1none_0cyc]>,
+ SchedVar<NoSchedPred, [FalkorWr_1GTOV_1cyc]>]>;
+
+def FalkorWr_MOVZ : SchedWriteVariant<[
+ SchedVar<FalkorImmZPred, [FalkorWr_1none_0cyc]>,
+ SchedVar<NoSchedPred, [FalkorWr_1XYZB_1cyc]>]>;
+
+def FalkorWr_LDR : SchedWriteVariant<[
+ SchedVar<FalkorLSLFastPred, [FalkorWr_1LD_3cyc]>,
+ SchedVar<NoSchedPred, [FalkorWr_1XYZ_1LD_4cyc]>]>;
+
+def FalkorWr_ADD : SchedWriteVariant<[
+ SchedVar<FalkorLSLFastPred, [FalkorWr_1XYZ_1cyc]>,
+ SchedVar<FalkorImmZPred, [FalkorWr_1XYZ_1cyc]>,
+ SchedVar<NoSchedPred, [FalkorWr_2XYZ_2cyc]>]>;
+
+def FalkorWr_PRFM : SchedWriteVariant<[
+ SchedVar<FalkorLSLFastPred, [FalkorWr_1ST_3cyc]>,
+ SchedVar<NoSchedPred, [FalkorWr_1XYZ_1ST_4cyc]>]>;
+
+def FalkorWr_LDRS : SchedWriteVariant<[
+ SchedVar<FalkorLSLFastPred, [FalkorWr_1LD_4cyc]>,
+ SchedVar<NoSchedPred, [FalkorWr_1XYZ_1LD_5cyc]>]>;
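The two-arm pattern above extends naturally to other instruction groups; for instance, a hypothetical variant (not part of this patch) that treats a move of the zero register as free would be:

    def FalkorWr_ExampleMOV : SchedWriteVariant<[
        SchedVar<FalkorImmZPred, [FalkorWr_1none_0cyc]>,
        SchedVar<NoSchedPred,    [FalkorWr_1XYZ_1cyc]>]>;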
diff --git a/lib/Target/AArch64/AArch64SchedKryoDetails.td b/lib/Target/AArch64/AArch64SchedKryoDetails.td
index 426ae6103e4b..02cccccd3078 100644
--- a/lib/Target/AArch64/AArch64SchedKryoDetails.td
+++ b/lib/Target/AArch64/AArch64SchedKryoDetails.td
@@ -776,23 +776,29 @@ def KryoWrite_4cyc_X_X_115ln :
}
def : InstRW<[KryoWrite_4cyc_X_X_115ln],
(instregex "FCVTZ(S|U)(v2f64|v4f32|(v2i64|v4i32)(_shift)?)$")>;
-def KryoWrite_1cyc_XA_Y_noRSV_43ln :
+def KryoWrite_10cyc_XA_Y_noRSV_43ln :
SchedWriteRes<[KryoUnitXA, KryoUnitY]> {
- let Latency = 1; let NumMicroOps = 3;
+ let Latency = 10; let NumMicroOps = 3;
}
-def : InstRW<[KryoWrite_1cyc_XA_Y_noRSV_43ln],
- (instrs FDIVDrr, FDIVSrr)>;
-def KryoWrite_1cyc_XA_Y_noRSV_121ln :
+def : InstRW<[KryoWrite_10cyc_XA_Y_noRSV_43ln],
+ (instrs FDIVSrr)>;
+def KryoWrite_14cyc_XA_Y_noRSV_43ln :
SchedWriteRes<[KryoUnitXA, KryoUnitY]> {
- let Latency = 1; let NumMicroOps = 3;
+ let Latency = 14; let NumMicroOps = 3;
}
-def : InstRW<[KryoWrite_1cyc_XA_Y_noRSV_121ln],
+def : InstRW<[KryoWrite_14cyc_XA_Y_noRSV_43ln],
+ (instrs FDIVDrr)>;
+def KryoWrite_10cyc_XA_Y_noRSV_121ln :
+ SchedWriteRes<[KryoUnitXA, KryoUnitY]> {
+ let Latency = 10; let NumMicroOps = 3;
+}
+def : InstRW<[KryoWrite_10cyc_XA_Y_noRSV_121ln],
(instrs FDIVv2f32)>;
-def KryoWrite_1cyc_XA_Y_XA_Y_123ln :
+def KryoWrite_14cyc_XA_Y_XA_Y_123ln :
SchedWriteRes<[KryoUnitXA, KryoUnitY, KryoUnitXA, KryoUnitY]> {
- let Latency = 1; let NumMicroOps = 4;
+ let Latency = 14; let NumMicroOps = 4;
}
-def : InstRW<[KryoWrite_1cyc_XA_Y_XA_Y_123ln],
+def : InstRW<[KryoWrite_14cyc_XA_Y_XA_Y_123ln],
(instrs FDIVv2f64, FDIVv4f32)>;
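+// Note: both Q-form divides issue as two XA/Y micro-op pairs (4 micro-ops)
+// and are modeled at the 14-cycle double-precision latency.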
def KryoWrite_5cyc_X_noRSV_55ln :
SchedWriteRes<[KryoUnitX]> {
@@ -968,24 +974,36 @@ def KryoWrite_2cyc_XY_XY_109ln :
}
def : InstRW<[KryoWrite_2cyc_XY_XY_109ln],
(instregex "FRINT(A|I|M|N|P|X|Z)(v2f64|v4f32)")>;
-def KryoWrite_1cyc_XA_Y_noRSV_42ln :
+def KryoWrite_12cyc_XA_Y_noRSV_42ln :
SchedWriteRes<[KryoUnitXA, KryoUnitY]> {
- let Latency = 1; let NumMicroOps = 3;
+ let Latency = 12; let NumMicroOps = 3;
}
-def : InstRW<[KryoWrite_1cyc_XA_Y_noRSV_42ln],
- (instregex "FSQRT(S|D)r")>;
-def KryoWrite_1cyc_XA_Y_noRSV_120ln :
+def : InstRW<[KryoWrite_12cyc_XA_Y_noRSV_42ln],
+ (instrs FSQRTSr)>;
+def KryoWrite_21cyc_XA_Y_noRSV_42ln :
SchedWriteRes<[KryoUnitXA, KryoUnitY]> {
- let Latency = 1; let NumMicroOps = 3;
+ let Latency = 21; let NumMicroOps = 3;
+}
+def : InstRW<[KryoWrite_21cyc_XA_Y_noRSV_42ln],
+ (instrs FSQRTDr)>;
+def KryoWrite_12cyc_XA_Y_noRSV_120ln :
+ SchedWriteRes<[KryoUnitXA, KryoUnitY]> {
+ let Latency = 12; let NumMicroOps = 3;
+}
+def : InstRW<[KryoWrite_12cyc_XA_Y_noRSV_120ln],
+ (instrs FSQRTv2f32)>;
+def KryoWrite_21cyc_XA_Y_XA_Y_122ln :
+ SchedWriteRes<[KryoUnitXA, KryoUnitY, KryoUnitXA, KryoUnitY]> {
+ let Latency = 21; let NumMicroOps = 4;
}
-def : InstRW<[KryoWrite_1cyc_XA_Y_noRSV_120ln],
- (instregex "FSQRTv2f32")>;
-def KryoWrite_1cyc_XA_Y_XA_Y_122ln :
+def : InstRW<[KryoWrite_21cyc_XA_Y_XA_Y_122ln],
+ (instrs FSQRTv4f32)>;
+def KryoWrite_36cyc_XA_Y_XA_Y_122ln :
SchedWriteRes<[KryoUnitXA, KryoUnitY, KryoUnitXA, KryoUnitY]> {
- let Latency = 1; let NumMicroOps = 4;
+ let Latency = 36; let NumMicroOps = 4;
}
-def : InstRW<[KryoWrite_1cyc_XA_Y_XA_Y_122ln],
- (instregex "FSQRT(v2f64|v4f32)")>;
+def : InstRW<[KryoWrite_36cyc_XA_Y_XA_Y_122ln],
+ (instrs FSQRTv2f64)>;
def KryoWrite_1cyc_X_201ln :
SchedWriteRes<[KryoUnitX]> {
let Latency = 1; let NumMicroOps = 1;
diff --git a/lib/Target/AArch64/AArch64SchedM1.td b/lib/Target/AArch64/AArch64SchedM1.td
index 14d6891253fa..3fbbc0be682d 100644
--- a/lib/Target/AArch64/AArch64SchedM1.td
+++ b/lib/Target/AArch64/AArch64SchedM1.td
@@ -366,7 +366,8 @@ def : InstRW<[M1WriteNALU1], (instregex "^ZIP[12]v")>;
// Cryptography instructions.
def M1WriteAES : SchedWriteRes<[M1UnitNCRYPT]> { let Latency = 1; }
def M1ReadAES : SchedReadAdvance<1, [M1WriteAES]>;
-def : InstRW<[M1WriteAES, M1ReadAES], (instregex "^AES")>;
+def : InstRW<[M1WriteAES], (instregex "^AES[DE]")>;
+def : InstRW<[M1WriteAES, M1ReadAES], (instregex "^AESI?MC")>;
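+// Only the mix-columns ops consume a forwarded AES round result, so the
+// M1ReadAES advance is attached to AESMC/AESIMC alone; AESE/AESD keep the
+// default operand reads.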
def : InstRW<[M1WriteNCRYPT1], (instregex "^PMUL")>;
def : InstRW<[M1WriteNCRYPT1], (instregex "^SHA1(H|SU)")>;
diff --git a/lib/Target/AArch64/AArch64SchedThunderX.td b/lib/Target/AArch64/AArch64SchedThunderX.td
new file mode 100644
index 000000000000..9a0cb702518d
--- /dev/null
+++ b/lib/Target/AArch64/AArch64SchedThunderX.td
@@ -0,0 +1,352 @@
+//==- AArch64SchedThunderX.td - Cavium ThunderX T8X Scheduling Definitions -*- tablegen -*-=//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file defines the machine model for the Cavium ThunderX T8X
+// (T88, T81, T83) processors.
+// Loosely based on the Cortex-A53 model, which is microarchitecturally
+// similar.
+//
+//===----------------------------------------------------------------------===//
+
+//===---------------------------------------------------------------------===//
+// The following definitions describe the simpler per-operand machine model.
+// This works with MachineScheduler. See llvm/MC/MCSchedule.h for details.
+
+// Cavium ThunderX T8X scheduling machine model.
+def ThunderXT8XModel : SchedMachineModel {
+ let IssueWidth = 2; // 2 micro-ops dispatched per cycle.
+ let MicroOpBufferSize = 0; // ThunderX T88/T81/T83 are in-order.
+ let LoadLatency = 3; // Optimistic load latency.
+ let MispredictPenalty = 8; // Branch mispredict penalty.
+ let PostRAScheduler = 1; // Use PostRA scheduler.
+ let CompleteModel = 1;
+}
+
+// Modeling each pipeline with BufferSize == 0 since T8X is in-order.
+def THXT8XUnitALU : ProcResource<2> { let BufferSize = 0; } // Int ALU
+def THXT8XUnitMAC : ProcResource<1> { let BufferSize = 0; } // Int MAC
+def THXT8XUnitDiv : ProcResource<1> { let BufferSize = 0; } // Int Division
+def THXT8XUnitLdSt : ProcResource<1> { let BufferSize = 0; } // Load/Store
+def THXT8XUnitBr : ProcResource<1> { let BufferSize = 0; } // Branch
+def THXT8XUnitFPALU : ProcResource<1> { let BufferSize = 0; } // FP ALU
+def THXT8XUnitFPMDS : ProcResource<1> { let BufferSize = 0; } // FP Mul/Div/Sqrt
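+
+// With BufferSize = 0, a micro-op must begin consuming its unit in the cycle
+// it issues or the in-order pipeline stalls; there is no reservation queue.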
+
+//===----------------------------------------------------------------------===//
+// Subtarget-specific SchedWrite types mapping the ProcResources and
+// latencies.
+
+let SchedModel = ThunderXT8XModel in {
+
+// ALU
+def : WriteRes<WriteImm, [THXT8XUnitALU]> { let Latency = 1; }
+def : WriteRes<WriteI, [THXT8XUnitALU]> { let Latency = 1; }
+def : WriteRes<WriteISReg, [THXT8XUnitALU]> { let Latency = 2; }
+def : WriteRes<WriteIEReg, [THXT8XUnitALU]> { let Latency = 2; }
+def : WriteRes<WriteIS, [THXT8XUnitALU]> { let Latency = 2; }
+def : WriteRes<WriteExtr, [THXT8XUnitALU]> { let Latency = 2; }
+
+// MAC
+def : WriteRes<WriteIM32, [THXT8XUnitMAC]> {
+ let Latency = 4;
+ let ResourceCycles = [1];
+}
+
+def : WriteRes<WriteIM64, [THXT8XUnitMAC]> {
+ let Latency = 4;
+ let ResourceCycles = [1];
+}
+
+// Div
+def : WriteRes<WriteID32, [THXT8XUnitDiv]> {
+ let Latency = 12;
+ let ResourceCycles = [6];
+}
+
+def : WriteRes<WriteID64, [THXT8XUnitDiv]> {
+ let Latency = 14;
+ let ResourceCycles = [8];
+}
+
+// Load
+def : WriteRes<WriteLD, [THXT8XUnitLdSt]> { let Latency = 3; }
+def : WriteRes<WriteLDIdx, [THXT8XUnitLdSt]> { let Latency = 3; }
+def : WriteRes<WriteLDHi, [THXT8XUnitLdSt]> { let Latency = 3; }
+
+// Vector Load
+def : WriteRes<WriteVLD, [THXT8XUnitLdSt]> {
+ let Latency = 8;
+ let ResourceCycles = [3];
+}
+
+def THXT8XWriteVLD1 : SchedWriteRes<[THXT8XUnitLdSt]> {
+ let Latency = 6;
+ let ResourceCycles = [1];
+}
+
+def THXT8XWriteVLD2 : SchedWriteRes<[THXT8XUnitLdSt]> {
+ let Latency = 11;
+ let ResourceCycles = [7];
+}
+
+def THXT8XWriteVLD3 : SchedWriteRes<[THXT8XUnitLdSt]> {
+ let Latency = 12;
+ let ResourceCycles = [8];
+}
+
+def THXT8XWriteVLD4 : SchedWriteRes<[THXT8XUnitLdSt]> {
+ let Latency = 13;
+ let ResourceCycles = [9];
+}
+
+def THXT8XWriteVLD5 : SchedWriteRes<[THXT8XUnitLdSt]> {
+ let Latency = 13;
+ let ResourceCycles = [9];
+}
+
+// Pre/Post Indexing
+def : WriteRes<WriteAdr, []> { let Latency = 0; }
+
+// Store
+def : WriteRes<WriteST, [THXT8XUnitLdSt]> { let Latency = 1; }
+def : WriteRes<WriteSTP, [THXT8XUnitLdSt]> { let Latency = 1; }
+def : WriteRes<WriteSTIdx, [THXT8XUnitLdSt]> { let Latency = 1; }
+def : WriteRes<WriteSTX, [THXT8XUnitLdSt]> { let Latency = 1; }
+
+// Vector Store
+def : WriteRes<WriteVST, [THXT8XUnitLdSt]>;
+def THXT8XWriteVST1 : SchedWriteRes<[THXT8XUnitLdSt]>;
+
+def THXT8XWriteVST2 : SchedWriteRes<[THXT8XUnitLdSt]> {
+ let Latency = 10;
+ let ResourceCycles = [9];
+}
+
+def THXT8XWriteVST3 : SchedWriteRes<[THXT8XUnitLdSt]> {
+ let Latency = 11;
+ let ResourceCycles = [10];
+}
+
+def : WriteRes<WriteAtomic, []> { let Unsupported = 1; }
+
+// Branch
+def : WriteRes<WriteBr, [THXT8XUnitBr]>;
+def THXT8XWriteBR : SchedWriteRes<[THXT8XUnitBr]>;
+def : WriteRes<WriteBrReg, [THXT8XUnitBr]>;
+def THXT8XWriteBRR : SchedWriteRes<[THXT8XUnitBr]>;
+def THXT8XWriteRET : SchedWriteRes<[THXT8XUnitALU]>;
+def : WriteRes<WriteSys, [THXT8XUnitBr]>;
+def : WriteRes<WriteBarrier, [THXT8XUnitBr]>;
+def : WriteRes<WriteHint, [THXT8XUnitBr]>;
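+
+// SchedWriteRes defaults to Latency = 1, so the branch and return writes
+// above are all modeled as single-cycle.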
+
+// FP ALU
+def : WriteRes<WriteF, [THXT8XUnitFPALU]> { let Latency = 6; }
+def : WriteRes<WriteFCmp, [THXT8XUnitFPALU]> { let Latency = 6; }
+def : WriteRes<WriteFCvt, [THXT8XUnitFPALU]> { let Latency = 6; }
+def : WriteRes<WriteFCopy, [THXT8XUnitFPALU]> { let Latency = 6; }
+def : WriteRes<WriteFImm, [THXT8XUnitFPALU]> { let Latency = 6; }
+def : WriteRes<WriteV, [THXT8XUnitFPALU]> { let Latency = 6; }
+
+// FP Mul, Div, Sqrt
+def : WriteRes<WriteFMul, [THXT8XUnitFPMDS]> { let Latency = 6; }
+def : WriteRes<WriteFDiv, [THXT8XUnitFPMDS]> {
+ let Latency = 22;
+ let ResourceCycles = [19];
+}
+
+def THXT8XWriteFMAC : SchedWriteRes<[THXT8XUnitFPMDS]> { let Latency = 10; }
+
+def THXT8XWriteFDivSP : SchedWriteRes<[THXT8XUnitFPMDS]> {
+ let Latency = 12;
+ let ResourceCycles = [9];
+}
+
+def THXT8XWriteFDivDP : SchedWriteRes<[THXT8XUnitFPMDS]> {
+ let Latency = 22;
+ let ResourceCycles = [19];
+}
+
+def THXT8XWriteFSqrtSP : SchedWriteRes<[THXT8XUnitFPMDS]> {
+ let Latency = 17;
+ let ResourceCycles = [14];
+}
+
+def THXT8XWriteFSqrtDP : SchedWriteRes<[THXT8XUnitFPMDS]> {
+ let Latency = 31;
+ let ResourceCycles = [28];
+}
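+
+// ResourceCycles close to Latency marks these as (nearly) unpipelined: e.g.
+// a second double-precision FSQRT can start only 28 cycles after the first,
+// 3 cycles before the first result is available.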
+
+//===----------------------------------------------------------------------===//
+// Subtarget-specific SchedRead types.
+
+// Forwarding is modeled for these reads: the operand is not needed until one
+// or two cycles after issue.
+def : ReadAdvance<ReadExtrHi, 1>;
+def : ReadAdvance<ReadAdrBase, 2>;
+def : ReadAdvance<ReadVLD, 2>;
+
+// FIXME: This needs more targeted benchmarking.
+// ALU - Most operands in the ALU pipes are not needed for two cycles. Shiftable
+// operands are needed one cycle later if and only if they are to be
+// shifted. Otherwise, they too are needed two cycles later. This same
+// ReadAdvance applies to Extended registers as well, even though there is
+// a separate SchedPredicate for them.
+def : ReadAdvance<ReadI, 2, [WriteImm, WriteI,
+ WriteISReg, WriteIEReg, WriteIS,
+ WriteID32, WriteID64,
+ WriteIM32, WriteIM64]>;
+def THXT8XReadShifted : SchedReadAdvance<1, [WriteImm, WriteI,
+ WriteISReg, WriteIEReg, WriteIS,
+ WriteID32, WriteID64,
+ WriteIM32, WriteIM64]>;
+def THXT8XReadNotShifted : SchedReadAdvance<2, [WriteImm, WriteI,
+ WriteISReg, WriteIEReg, WriteIS,
+ WriteID32, WriteID64,
+ WriteIM32, WriteIM64]>;
+def THXT8XReadISReg : SchedReadVariant<[
+ SchedVar<RegShiftedPred, [THXT8XReadShifted]>,
+ SchedVar<NoSchedPred, [THXT8XReadNotShifted]>]>;
+def : SchedAlias<ReadISReg, THXT8XReadISReg>;
+
+def THXT8XReadIEReg : SchedReadVariant<[
+ SchedVar<RegExtendedPred, [THXT8XReadShifted]>,
+ SchedVar<NoSchedPred, [THXT8XReadNotShifted]>]>;
+def : SchedAlias<ReadIEReg, THXT8XReadIEReg>;
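+
+// Illustrative reading of the variants above: in "add x0, x1, x2, lsl #2"
+// the shifted x2 operand takes the 1-cycle THXT8XReadShifted path (the
+// instruction's shift amount is non-zero), while an unshifted register gets
+// the 2-cycle THXT8XReadNotShifted advance, matching the FIXME note above.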
+
+// MAC - Operands are generally needed one cycle later in the MAC pipe.
+// Accumulator operands are needed two cycles later.
+def : ReadAdvance<ReadIM, 1, [WriteImm,WriteI,
+ WriteISReg, WriteIEReg, WriteIS,
+ WriteID32, WriteID64,
+ WriteIM32, WriteIM64]>;
+def : ReadAdvance<ReadIMA, 2, [WriteImm, WriteI,
+ WriteISReg, WriteIEReg, WriteIS,
+ WriteID32, WriteID64,
+ WriteIM32, WriteIM64]>;
+
+// Div
+def : ReadAdvance<ReadID, 1, [WriteImm, WriteI,
+ WriteISReg, WriteIEReg, WriteIS,
+ WriteID32, WriteID64,
+ WriteIM32, WriteIM64]>;
+
+//===----------------------------------------------------------------------===//
+// Subtarget-specific InstRW.
+
+//---
+// Branch
+//---
+def : InstRW<[THXT8XWriteBR], (instregex "^B$")>;
+def : InstRW<[THXT8XWriteBR], (instregex "^BL$")>;
+def : InstRW<[THXT8XWriteBR], (instregex "^Bcc$")>;
+def : InstRW<[THXT8XWriteBR], (instregex "^CBNZ")>;
+def : InstRW<[THXT8XWriteBR], (instregex "^CBZ")>;
+def : InstRW<[THXT8XWriteBR], (instregex "^TBNZ")>;
+def : InstRW<[THXT8XWriteBR], (instregex "^TBZ")>;
+def : InstRW<[THXT8XWriteBRR], (instregex "^BR$")>;
+def : InstRW<[THXT8XWriteBRR], (instregex "^BLR$")>;
+
+//---
+// Ret
+//---
+def : InstRW<[THXT8XWriteRET], (instregex "^RET")>;
+
+//---
+// Miscellaneous
+//---
+def : InstRW<[WriteI], (instrs COPY)>;
+
+//---
+// Vector Loads
+//---
+def : InstRW<[THXT8XWriteVLD1], (instregex "LD1i(8|16|32|64)$")>;
+def : InstRW<[THXT8XWriteVLD1], (instregex "LD1Rv(8b|4h|2s|1d|16b|8h|4s|2d)$")>;
+def : InstRW<[THXT8XWriteVLD1], (instregex "LD1Onev(8b|4h|2s|1d|16b|8h|4s|2d)$")>;
+def : InstRW<[THXT8XWriteVLD2], (instregex "LD1Twov(8b|4h|2s|1d|16b|8h|4s|2d)$")>;
+def : InstRW<[THXT8XWriteVLD3], (instregex "LD1Threev(8b|4h|2s|1d|16b|8h|4s|2d)$")>;
+def : InstRW<[THXT8XWriteVLD4], (instregex "LD1Fourv(8b|4h|2s|1d|16b|8h|4s|2d)$")>;
+def : InstRW<[THXT8XWriteVLD1, WriteAdr], (instregex "LD1i(8|16|32|64)_POST$")>;
+def : InstRW<[THXT8XWriteVLD1, WriteAdr], (instregex "LD1Rv(8b|4h|2s|1d|16b|8h|4s|2d)_POST$")>;
+def : InstRW<[THXT8XWriteVLD1, WriteAdr], (instregex "LD1Onev(8b|4h|2s|1d|16b|8h|4s|2d)_POST$")>;
+def : InstRW<[THXT8XWriteVLD2, WriteAdr], (instregex "LD1Twov(8b|4h|2s|1d|16b|8h|4s|2d)_POST$")>;
+def : InstRW<[THXT8XWriteVLD3, WriteAdr], (instregex "LD1Threev(8b|4h|2s|1d|16b|8h|4s|2d)_POST$")>;
+def : InstRW<[THXT8XWriteVLD4, WriteAdr], (instregex "LD1Fourv(8b|4h|2s|1d|16b|8h|4s|2d)_POST$")>;
+
+def : InstRW<[THXT8XWriteVLD1], (instregex "LD2i(8|16|32|64)$")>;
+def : InstRW<[THXT8XWriteVLD1], (instregex "LD2Rv(8b|4h|2s|1d|16b|8h|4s|2d)$")>;
+def : InstRW<[THXT8XWriteVLD2], (instregex "LD2Twov(8b|4h|2s)$")>;
+def : InstRW<[THXT8XWriteVLD4], (instregex "LD2Twov(16b|8h|4s|2d)$")>;
+def : InstRW<[THXT8XWriteVLD1, WriteAdr], (instregex "LD2i(8|16|32|64)(_POST)?$")>;
+def : InstRW<[THXT8XWriteVLD1, WriteAdr], (instregex "LD2Rv(8b|4h|2s|1d|16b|8h|4s|2d)(_POST)?$")>;
+def : InstRW<[THXT8XWriteVLD2, WriteAdr], (instregex "LD2Twov(8b|4h|2s)(_POST)?$")>;
+def : InstRW<[THXT8XWriteVLD4, WriteAdr], (instregex "LD2Twov(16b|8h|4s|2d)(_POST)?$")>;
+
+def : InstRW<[THXT8XWriteVLD2], (instregex "LD3i(8|16|32|64)$")>;
+def : InstRW<[THXT8XWriteVLD2], (instregex "LD3Rv(8b|4h|2s|1d|16b|8h|4s|2d)$")>;
+def : InstRW<[THXT8XWriteVLD4], (instregex "LD3Threev(8b|4h|2s|1d|16b|8h|4s)$")>;
+def : InstRW<[THXT8XWriteVLD3], (instregex "LD3Threev(2d)$")>;
+def : InstRW<[THXT8XWriteVLD2, WriteAdr], (instregex "LD3i(8|16|32|64)_POST$")>;
+def : InstRW<[THXT8XWriteVLD2, WriteAdr], (instregex "LD3Rv(8b|4h|2s|1d|16b|8h|4s|2d)_POST$")>;
+def : InstRW<[THXT8XWriteVLD4, WriteAdr], (instregex "LD3Threev(8b|4h|2s|1d|16b|8h|4s)_POST$")>;
+def : InstRW<[THXT8XWriteVLD3, WriteAdr], (instregex "LD3Threev(2d)_POST$")>;
+
+def : InstRW<[THXT8XWriteVLD2], (instregex "LD4i(8|16|32|64)$")>;
+def : InstRW<[THXT8XWriteVLD2], (instregex "LD4Rv(8b|4h|2s|1d|16b|8h|4s|2d)$")>;
+def : InstRW<[THXT8XWriteVLD5], (instregex "LD4Fourv(8b|4h|2s|1d|16b|8h|4s)$")>;
+def : InstRW<[THXT8XWriteVLD4], (instregex "LD4Fourv(2d)$")>;
+def : InstRW<[THXT8XWriteVLD2, WriteAdr], (instregex "LD4i(8|16|32|64)_POST$")>;
+def : InstRW<[THXT8XWriteVLD2, WriteAdr], (instregex "LD4Rv(8b|4h|2s|1d|16b|8h|4s|2d)_POST$")>;
+def : InstRW<[THXT8XWriteVLD5, WriteAdr], (instregex "LD4Fourv(8b|4h|2s|1d|16b|8h|4s)_POST$")>;
+def : InstRW<[THXT8XWriteVLD4, WriteAdr], (instregex "LD4Fourv(2d)_POST$")>;
+
+//---
+// Vector Stores
+//---
+def : InstRW<[THXT8XWriteVST1], (instregex "ST1i(8|16|32|64)$")>;
+def : InstRW<[THXT8XWriteVST1], (instregex "ST1Onev(8b|4h|2s|1d|16b|8h|4s|2d)$")>;
+def : InstRW<[THXT8XWriteVST1], (instregex "ST1Twov(8b|4h|2s|1d|16b|8h|4s|2d)$")>;
+def : InstRW<[THXT8XWriteVST2], (instregex "ST1Threev(8b|4h|2s|1d|16b|8h|4s|2d)$")>;
+def : InstRW<[THXT8XWriteVST2], (instregex "ST1Fourv(8b|4h|2s|1d|16b|8h|4s|2d)$")>;
+def : InstRW<[THXT8XWriteVST1, WriteAdr], (instregex "ST1i(8|16|32|64)_POST$")>;
+def : InstRW<[THXT8XWriteVST1, WriteAdr], (instregex "ST1Onev(8b|4h|2s|1d|16b|8h|4s|2d)_POST$")>;
+def : InstRW<[THXT8XWriteVST1, WriteAdr], (instregex "ST1Twov(8b|4h|2s|1d|16b|8h|4s|2d)_POST$")>;
+def : InstRW<[THXT8XWriteVST2, WriteAdr], (instregex "ST1Threev(8b|4h|2s|1d|16b|8h|4s|2d)_POST$")>;
+def : InstRW<[THXT8XWriteVST2, WriteAdr], (instregex "ST1Fourv(8b|4h|2s|1d|16b|8h|4s|2d)_POST$")>;
+
+def : InstRW<[THXT8XWriteVST1], (instregex "ST2i(8|16|32|64)$")>;
+def : InstRW<[THXT8XWriteVST1], (instregex "ST2Twov(8b|4h|2s)$")>;
+def : InstRW<[THXT8XWriteVST2], (instregex "ST2Twov(16b|8h|4s|2d)$")>;
+def : InstRW<[THXT8XWriteVST1, WriteAdr], (instregex "ST2i(8|16|32|64)_POST$")>;
+def : InstRW<[THXT8XWriteVST1, WriteAdr], (instregex "ST2Twov(8b|4h|2s)_POST$")>;
+def : InstRW<[THXT8XWriteVST2, WriteAdr], (instregex "ST2Twov(16b|8h|4s|2d)_POST$")>;
+
+def : InstRW<[THXT8XWriteVST2], (instregex "ST3i(8|16|32|64)$")>;
+def : InstRW<[THXT8XWriteVST3], (instregex "ST3Threev(8b|4h|2s|1d|16b|8h|4s)$")>;
+def : InstRW<[THXT8XWriteVST2], (instregex "ST3Threev(2d)$")>;
+def : InstRW<[THXT8XWriteVST2, WriteAdr], (instregex "ST3i(8|16|32|64)_POST$")>;
+def : InstRW<[THXT8XWriteVST3, WriteAdr], (instregex "ST3Threev(8b|4h|2s|1d|16b|8h|4s)_POST$")>;
+def : InstRW<[THXT8XWriteVST2, WriteAdr], (instregex "ST3Threev(2d)_POST$")>;
+
+def : InstRW<[THXT8XWriteVST2], (instregex "ST4i(8|16|32|64)$")>;
+def : InstRW<[THXT8XWriteVST3], (instregex "ST4Fourv(8b|4h|2s|1d|16b|8h|4s)$")>;
+def : InstRW<[THXT8XWriteVST2], (instregex "ST4Fourv(2d)$")>;
+def : InstRW<[THXT8XWriteVST2, WriteAdr], (instregex "ST4i(8|16|32|64)_POST$")>;
+def : InstRW<[THXT8XWriteVST3, WriteAdr], (instregex "ST4Fourv(8b|4h|2s|1d|16b|8h|4s)_POST$")>;
+def : InstRW<[THXT8XWriteVST2, WriteAdr], (instregex "ST4Fourv(2d)_POST$")>;
+
+//---
+// Floating Point MAC, DIV, SQRT
+//---
+def : InstRW<[THXT8XWriteFMAC], (instregex "^FN?M(ADD|SUB).*")>;
+def : InstRW<[THXT8XWriteFMAC], (instregex "^FML(A|S).*")>;
+def : InstRW<[THXT8XWriteFDivSP], (instrs FDIVSrr)>;
+def : InstRW<[THXT8XWriteFDivDP], (instrs FDIVDrr)>;
+def : InstRW<[THXT8XWriteFDivSP], (instregex "^FDIVv.*32$")>;
+def : InstRW<[THXT8XWriteFDivDP], (instregex "^FDIVv.*64$")>;
+def : InstRW<[THXT8XWriteFSqrtSP], (instregex "^.*SQRT.*32$")>;
+def : InstRW<[THXT8XWriteFSqrtDP], (instregex "^.*SQRT.*64$")>;
+
+}
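+
+// The model is hooked up to the T88/T81/T83 processor definitions in
+// AArch64.td (not part of this hunk); illustratively:
+//   def : ProcessorModel<"thunderxt88", ThunderXT8XModel, [...]>;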
diff --git a/lib/Target/AArch64/AArch64SchedVulcan.td b/lib/Target/AArch64/AArch64SchedThunderX2T99.td
index 35a40c314bf4..3654eeca530a 100644
--- a/lib/Target/AArch64/AArch64SchedVulcan.td
+++ b/lib/Target/AArch64/AArch64SchedThunderX2T99.td
@@ -1,4 +1,4 @@
-//=- AArch64SchedVulcan.td - Vulcan Scheduling Defs ----------*- tablegen -*-=//
+//=- AArch64SchedThunderX2T99.td - Cavium ThunderX2 T99 Scheduling -*- tablegen -*-=//
//
// The LLVM Compiler Infrastructure
//
@@ -6,23 +6,23 @@
// License. See LICENSE.TXT for details.
//
//===----------------------------------------------------------------------===//
-// 1. Introduction
//
-// This file defines the machine model for Broadcom Vulcan to support
-// instruction scheduling and other instruction cost heuristics.
+// This file defines the scheduling model for Cavium ThunderX2T99
+// processors.
+// Based on Broadcom Vulcan.
//
//===----------------------------------------------------------------------===//
//===----------------------------------------------------------------------===//
// 2. Pipeline Description.
-def VulcanModel : SchedMachineModel {
+def ThunderX2T99Model : SchedMachineModel {
let IssueWidth = 4; // 4 micro-ops dispatched at a time.
let MicroOpBufferSize = 180; // 180 entries in micro-op re-order buffer.
let LoadLatency = 4; // Optimistic load latency.
let MispredictPenalty = 12; // Extra cycles for mispredicted branch.
// Determined via a mix of micro-arch details and experimentation.
- let LoopMicroOpBufferSize = 32;
+ let LoopMicroOpBufferSize = 32;
let PostRAScheduler = 1; // Using PostRA sched.
let CompleteModel = 1;
}
@@ -30,155 +30,155 @@ def VulcanModel : SchedMachineModel {
// Define the issue ports.
// Port 0: ALU, FP/SIMD.
-def VulcanP0 : ProcResource<1>;
+def THX2T99P0 : ProcResource<1>;
// Port 1: ALU, FP/SIMD, integer mul/div.
-def VulcanP1 : ProcResource<1>;
+def THX2T99P1 : ProcResource<1>;
// Port 2: ALU, Branch.
-def VulcanP2 : ProcResource<1>;
+def THX2T99P2 : ProcResource<1>;
// Port 3: Store data.
-def VulcanP3 : ProcResource<1>;
+def THX2T99P3 : ProcResource<1>;
// Port 4: Load/store.
-def VulcanP4 : ProcResource<1>;
+def THX2T99P4 : ProcResource<1>;
// Port 5: Load/store.
-def VulcanP5 : ProcResource<1>;
+def THX2T99P5 : ProcResource<1>;
-let SchedModel = VulcanModel in {
+let SchedModel = ThunderX2T99Model in {
// Define groups for the functional units on each issue port. Each group
// created will be used by a WriteRes later on.
//
// NOTE: Some groups only contain one member. This is a way to create names for
// the various functional units that share a single issue port. For example,
-// "VulcanI1" for ALU ops on port 1 and "VulcanF1" for FP ops on port 1.
+// "THX2T99I1" for ALU ops on port 1 and "THX2T99F1" for FP ops on port 1.
// Integer divide and multiply micro-ops only on port 1.
-def VulcanI1 : ProcResGroup<[VulcanP1]>;
+def THX2T99I1 : ProcResGroup<[THX2T99P1]>;
// Branch micro-ops only on port 2.
-def VulcanI2 : ProcResGroup<[VulcanP2]>;
+def THX2T99I2 : ProcResGroup<[THX2T99P2]>;
// ALU micro-ops on ports 0, 1, and 2.
-def VulcanI012 : ProcResGroup<[VulcanP0, VulcanP1, VulcanP2]>;
+def THX2T99I012 : ProcResGroup<[THX2T99P0, THX2T99P1, THX2T99P2]>;
// Crypto FP/SIMD micro-ops only on port 1.
-def VulcanF1 : ProcResGroup<[VulcanP1]>;
+def THX2T99F1 : ProcResGroup<[THX2T99P1]>;
// FP/SIMD micro-ops on ports 0 and 1.
-def VulcanF01 : ProcResGroup<[VulcanP0, VulcanP1]>;
+def THX2T99F01 : ProcResGroup<[THX2T99P0, THX2T99P1]>;
// Store data micro-ops only on port 3.
-def VulcanSD : ProcResGroup<[VulcanP3]>;
+def THX2T99SD : ProcResGroup<[THX2T99P3]>;
// Load/store micro-ops on ports 4 and 5.
-def VulcanLS01 : ProcResGroup<[VulcanP4, VulcanP5]>;
+def THX2T99LS01 : ProcResGroup<[THX2T99P4, THX2T99P5]>;
// 60 entry unified scheduler.
-def VulcanAny : ProcResGroup<[VulcanP0, VulcanP1, VulcanP2,
- VulcanP3, VulcanP4, VulcanP5]> {
+def THX2T99Any : ProcResGroup<[THX2T99P0, THX2T99P1, THX2T99P2,
+ THX2T99P3, THX2T99P4, THX2T99P5]> {
let BufferSize=60;
}
// Define commonly used write types for InstRW specializations.
-// All definitions follow the format: VulcanWrite_<NumCycles>Cyc_<Resources>.
+// All definitions follow the format: THX2T99Write_<NumCycles>Cyc_<Resources>.
// 3 cycles on I1.
-def VulcanWrite_3Cyc_I1 : SchedWriteRes<[VulcanI1]> { let Latency = 3; }
+def THX2T99Write_3Cyc_I1 : SchedWriteRes<[THX2T99I1]> { let Latency = 3; }
// 4 cycles on I1.
-def VulcanWrite_4Cyc_I1 : SchedWriteRes<[VulcanI1]> { let Latency = 4; }
+def THX2T99Write_4Cyc_I1 : SchedWriteRes<[THX2T99I1]> { let Latency = 4; }
// 1 cycle on I0, I1, or I2.
-def VulcanWrite_1Cyc_I012 : SchedWriteRes<[VulcanI012]> { let Latency = 1; }
+def THX2T99Write_1Cyc_I012 : SchedWriteRes<[THX2T99I012]> { let Latency = 1; }
// 5 cycles on F1.
-def VulcanWrite_5Cyc_F1 : SchedWriteRes<[VulcanF1]> { let Latency = 5; }
+def THX2T99Write_5Cyc_F1 : SchedWriteRes<[THX2T99F1]> { let Latency = 5; }
// 7 cycles on F1.
-def VulcanWrite_7Cyc_F1 : SchedWriteRes<[VulcanF1]> { let Latency = 7; }
+def THX2T99Write_7Cyc_F1 : SchedWriteRes<[THX2T99F1]> { let Latency = 7; }
// 4 cycles on F0 or F1.
-def VulcanWrite_4Cyc_F01 : SchedWriteRes<[VulcanF01]> { let Latency = 4; }
+def THX2T99Write_4Cyc_F01 : SchedWriteRes<[THX2T99F01]> { let Latency = 4; }
// 5 cycles on F0 or F1.
-def VulcanWrite_5Cyc_F01 : SchedWriteRes<[VulcanF01]> { let Latency = 5; }
+def THX2T99Write_5Cyc_F01 : SchedWriteRes<[THX2T99F01]> { let Latency = 5; }
// 6 cycles on F0 or F1.
-def VulcanWrite_6Cyc_F01 : SchedWriteRes<[VulcanF01]> { let Latency = 6; }
+def THX2T99Write_6Cyc_F01 : SchedWriteRes<[THX2T99F01]> { let Latency = 6; }
// 7 cycles on F0 or F1.
-def VulcanWrite_7Cyc_F01 : SchedWriteRes<[VulcanF01]> { let Latency = 7; }
+def THX2T99Write_7Cyc_F01 : SchedWriteRes<[THX2T99F01]> { let Latency = 7; }
// 8 cycles on F0 or F1.
-def VulcanWrite_8Cyc_F01 : SchedWriteRes<[VulcanF01]> { let Latency = 8; }
+def THX2T99Write_8Cyc_F01 : SchedWriteRes<[THX2T99F01]> { let Latency = 8; }
// 16 cycles on F0 or F1.
-def VulcanWrite_16Cyc_F01 : SchedWriteRes<[VulcanF01]> {
+def THX2T99Write_16Cyc_F01 : SchedWriteRes<[THX2T99F01]> {
let Latency = 16;
let ResourceCycles = [8];
}
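+// ResourceCycles below Latency models a partially pipelined divider: the
+// F0/F1 slot frees after 8 cycles even though the result takes 16.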
// 23 cycles on F0 or F1.
-def VulcanWrite_23Cyc_F01 : SchedWriteRes<[VulcanF01]> {
+def THX2T99Write_23Cyc_F01 : SchedWriteRes<[THX2T99F01]> {
let Latency = 23;
let ResourceCycles = [11];
}
 // 1 cycle on LS0 or LS1.
-def VulcanWrite_1Cyc_LS01 : SchedWriteRes<[VulcanLS01]> { let Latency = 1; }
+def THX2T99Write_1Cyc_LS01 : SchedWriteRes<[THX2T99LS01]> { let Latency = 1; }
// 4 cycles on LS0 or LS1.
-def VulcanWrite_4Cyc_LS01 : SchedWriteRes<[VulcanLS01]> { let Latency = 4; }
+def THX2T99Write_4Cyc_LS01 : SchedWriteRes<[THX2T99LS01]> { let Latency = 4; }
// 5 cycles on LS0 or LS1.
-def VulcanWrite_5Cyc_LS01 : SchedWriteRes<[VulcanLS01]> { let Latency = 5; }
+def THX2T99Write_5Cyc_LS01 : SchedWriteRes<[THX2T99LS01]> { let Latency = 5; }
// 6 cycles on LS0 or LS1.
-def VulcanWrite_6Cyc_LS01 : SchedWriteRes<[VulcanLS01]> { let Latency = 6; }
+def THX2T99Write_6Cyc_LS01 : SchedWriteRes<[THX2T99LS01]> { let Latency = 6; }
// 5 cycles on LS0 or LS1 and I0, I1, or I2.
-def VulcanWrite_5Cyc_LS01_I012 : SchedWriteRes<[VulcanLS01, VulcanI012]> {
+def THX2T99Write_5Cyc_LS01_I012 : SchedWriteRes<[THX2T99LS01, THX2T99I012]> {
let Latency = 5;
let NumMicroOps = 2;
}
 // 6 cycles on LS0 or LS1 and 2 of I0, I1, or I2.
-def VulcanWrite_6Cyc_LS01_I012_I012 :
- SchedWriteRes<[VulcanLS01, VulcanI012, VulcanI012]> {
+def THX2T99Write_6Cyc_LS01_I012_I012 :
+ SchedWriteRes<[THX2T99LS01, THX2T99I012, THX2T99I012]> {
let Latency = 6;
let NumMicroOps = 3;
}
 // 1 cycle on LS0 or LS1 and F0 or F1.
-def VulcanWrite_1Cyc_LS01_F01 : SchedWriteRes<[VulcanLS01, VulcanF01]> {
+def THX2T99Write_1Cyc_LS01_F01 : SchedWriteRes<[THX2T99LS01, THX2T99F01]> {
let Latency = 1;
let NumMicroOps = 2;
}
// 5 cycles on LS0 or LS1 and F0 or F1.
-def VulcanWrite_5Cyc_LS01_F01 : SchedWriteRes<[VulcanLS01, VulcanF01]> {
+def THX2T99Write_5Cyc_LS01_F01 : SchedWriteRes<[THX2T99LS01, THX2T99F01]> {
let Latency = 5;
let NumMicroOps = 2;
}
// 6 cycles on LS0 or LS1 and F0 or F1.
-def VulcanWrite_6Cyc_LS01_F01 : SchedWriteRes<[VulcanLS01, VulcanF01]> {
+def THX2T99Write_6Cyc_LS01_F01 : SchedWriteRes<[THX2T99LS01, THX2T99F01]> {
let Latency = 6;
let NumMicroOps = 2;
}
// 7 cycles on LS0 or LS1 and F0 or F1.
-def VulcanWrite_7Cyc_LS01_F01 : SchedWriteRes<[VulcanLS01, VulcanF01]> {
+def THX2T99Write_7Cyc_LS01_F01 : SchedWriteRes<[THX2T99LS01, THX2T99F01]> {
let Latency = 7;
let NumMicroOps = 2;
}
// 8 cycles on LS0 or LS1 and F0 or F1.
-def VulcanWrite_8Cyc_LS01_F01 : SchedWriteRes<[VulcanLS01, VulcanF01]> {
+def THX2T99Write_8Cyc_LS01_F01 : SchedWriteRes<[THX2T99LS01, THX2T99F01]> {
let Latency = 8;
let NumMicroOps = 2;
}
@@ -202,7 +202,7 @@ def : ReadAdvance<ReadVLD, 0>;
//===----------------------------------------------------------------------===//
// 3. Instruction Tables.
-let SchedModel = VulcanModel in {
+let SchedModel = ThunderX2T99Model in {
//---
// 3.1 Branch Instructions
@@ -211,7 +211,7 @@ let SchedModel = VulcanModel in {
// Branch, immed
// Branch and link, immed
// Compare and branch
-def : WriteRes<WriteBr, [VulcanI2]> { let Latency = 1; }
+def : WriteRes<WriteBr, [THX2T99I2]> { let Latency = 1; }
def : WriteRes<WriteSys, []> { let Latency = 1; }
def : WriteRes<WriteBarrier, []> { let Latency = 1; }
@@ -222,7 +222,7 @@ def : WriteRes<WriteAtomic, []> { let Unsupported = 1; }
// Branch, register
// Branch and link, register != LR
// Branch and link, register = LR
-def : WriteRes<WriteBrReg, [VulcanI2]> { let Latency = 1; }
+def : WriteRes<WriteBrReg, [THX2T99I2]> { let Latency = 1; }
//---
// 3.2 Arithmetic and Logical Instructions
@@ -233,25 +233,25 @@ def : WriteRes<WriteBrReg, [VulcanI2]> { let Latency = 1; }
// Conditional compare
// Conditional select
// Address generation
-def : WriteRes<WriteI, [VulcanI012]> { let Latency = 1; }
+def : WriteRes<WriteI, [THX2T99I012]> { let Latency = 1; }
def : InstRW<[WriteI], (instrs COPY)>;
// ALU, extend and/or shift
-def : WriteRes<WriteISReg, [VulcanI012]> {
+def : WriteRes<WriteISReg, [THX2T99I012]> {
let Latency = 2;
let ResourceCycles = [2];
}
-def : WriteRes<WriteIEReg, [VulcanI012]> {
+def : WriteRes<WriteIEReg, [THX2T99I012]> {
let Latency = 2;
let ResourceCycles = [2];
}
// Move immed
-def : WriteRes<WriteImm, [VulcanI012]> { let Latency = 1; }
+def : WriteRes<WriteImm, [THX2T99I012]> { let Latency = 1; }
// Variable shift
-def : WriteRes<WriteIS, [VulcanI012]> { let Latency = 1; }
+def : WriteRes<WriteIS, [THX2T99I012]> { let Latency = 1; }
//---
// 3.4 Divide and Multiply Instructions
@@ -259,33 +259,33 @@ def : WriteRes<WriteIS, [VulcanI012]> { let Latency = 1; }
// Divide, W-form
// Latency range of 13-23. Take the average.
-def : WriteRes<WriteID32, [VulcanI1]> {
+def : WriteRes<WriteID32, [THX2T99I1]> {
let Latency = 18;
let ResourceCycles = [18];
}
// Divide, X-form
// Latency range of 13-39. Take the average.
-def : WriteRes<WriteID64, [VulcanI1]> {
+def : WriteRes<WriteID64, [THX2T99I1]> {
let Latency = 26;
let ResourceCycles = [26];
}
// Multiply accumulate, W-form
-def : WriteRes<WriteIM32, [VulcanI012]> { let Latency = 5; }
+def : WriteRes<WriteIM32, [THX2T99I012]> { let Latency = 5; }
// Multiply accumulate, X-form
-def : WriteRes<WriteIM64, [VulcanI012]> { let Latency = 5; }
+def : WriteRes<WriteIM64, [THX2T99I012]> { let Latency = 5; }
// Bitfield extract, two reg
-def : WriteRes<WriteExtr, [VulcanI012]> { let Latency = 1; }
+def : WriteRes<WriteExtr, [THX2T99I012]> { let Latency = 1; }
// Bitfield move, basic
// Bitfield move, insert
// NOTE: Handled by WriteIS.
// Count leading
-def : InstRW<[VulcanWrite_3Cyc_I1], (instregex "^CLS(W|X)r$",
+def : InstRW<[THX2T99Write_3Cyc_I1], (instregex "^CLS(W|X)r$",
"^CLZ(W|X)r$")>;
// Reverse bits/bytes
@@ -300,13 +300,13 @@ def : InstRW<[VulcanWrite_3Cyc_I1], (instregex "^CLS(W|X)r$",
// Load register, unscaled immed
// Load register, immed unprivileged
// Load register, unsigned immed
-def : WriteRes<WriteLD, [VulcanLS01]> { let Latency = 4; }
+def : WriteRes<WriteLD, [THX2T99LS01]> { let Latency = 4; }
// Load register, immed post-index
// NOTE: Handled by WriteLD, WriteI.
// Load register, immed pre-index
// NOTE: Handled by WriteLD, WriteAdr.
-def : WriteRes<WriteAdr, [VulcanI012]> { let Latency = 1; }
+def : WriteRes<WriteAdr, [THX2T99I012]> { let Latency = 1; }
// Load register offset, basic
// Load register, register offset, scale by 4/8
@@ -314,15 +314,15 @@ def : WriteRes<WriteAdr, [VulcanI012]> { let Latency = 1; }
// Load register offset, extend
// Load register, register offset, extend, scale by 4/8
// Load register, register offset, extend, scale by 2
-def VulcanWriteLDIdx : SchedWriteVariant<[
- SchedVar<ScaledIdxPred, [VulcanWrite_6Cyc_LS01_I012_I012]>,
- SchedVar<NoSchedPred, [VulcanWrite_5Cyc_LS01_I012]>]>;
-def : SchedAlias<WriteLDIdx, VulcanWriteLDIdx>;
+def THX2T99WriteLDIdx : SchedWriteVariant<[
+ SchedVar<ScaledIdxPred, [THX2T99Write_6Cyc_LS01_I012_I012]>,
+ SchedVar<NoSchedPred, [THX2T99Write_5Cyc_LS01_I012]>]>;
+def : SchedAlias<WriteLDIdx, THX2T99WriteLDIdx>;
-def VulcanReadAdrBase : SchedReadVariant<[
+def THX2T99ReadAdrBase : SchedReadVariant<[
SchedVar<ScaledIdxPred, [ReadDefault]>,
SchedVar<NoSchedPred, [ReadDefault]>]>;
-def : SchedAlias<ReadAdrBase, VulcanReadAdrBase>;
+def : SchedAlias<ReadAdrBase, THX2T99ReadAdrBase>;
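+// Both arms of THX2T99ReadAdrBase currently resolve to ReadDefault, so the
+// variant is a no-op today; presumably kept so a scaled-index distinction
+// can be modeled later.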
// Load pair, immed offset, normal
// Load pair, immed offset, signed words, base != SP
@@ -347,7 +347,7 @@ def : WriteRes<WriteLDHi, []> {
// Store register, unscaled immed
// Store register, immed unprivileged
// Store register, unsigned immed
-def : WriteRes<WriteST, [VulcanLS01, VulcanSD]> {
+def : WriteRes<WriteST, [THX2T99LS01, THX2T99SD]> {
let Latency = 1;
let NumMicroOps = 2;
}
@@ -364,14 +364,14 @@ def : WriteRes<WriteST, [VulcanLS01, VulcanSD]> {
// Store register, register offset, extend
// Store register, register offset, extend, scale by 4/8
// Store register, register offset, extend, scale by 1
-def : WriteRes<WriteSTIdx, [VulcanLS01, VulcanSD, VulcanI012]> {
+def : WriteRes<WriteSTIdx, [THX2T99LS01, THX2T99SD, THX2T99I012]> {
let Latency = 1;
let NumMicroOps = 3;
}
// Store pair, immed offset, W-form
// Store pair, immed offset, X-form
-def : WriteRes<WriteSTP, [VulcanLS01, VulcanSD]> {
+def : WriteRes<WriteSTP, [THX2T99LS01, THX2T99SD]> {
let Latency = 1;
let NumMicroOps = 2;
}
@@ -389,35 +389,35 @@ def : WriteRes<WriteSTP, [VulcanLS01, VulcanSD]> {
// FP absolute value
// FP min/max
// FP negate
-def : WriteRes<WriteF, [VulcanF01]> { let Latency = 5; }
+def : WriteRes<WriteF, [THX2T99F01]> { let Latency = 5; }
// FP arithmetic
-def : InstRW<[VulcanWrite_6Cyc_F01], (instregex "^FADD", "^FSUB")>;
+def : InstRW<[THX2T99Write_6Cyc_F01], (instregex "^FADD", "^FSUB")>;
// FP compare
-def : WriteRes<WriteFCmp, [VulcanF01]> { let Latency = 5; }
+def : WriteRes<WriteFCmp, [THX2T99F01]> { let Latency = 5; }
// FP divide, S-form
// FP square root, S-form
-def : WriteRes<WriteFDiv, [VulcanF01]> {
+def : WriteRes<WriteFDiv, [THX2T99F01]> {
let Latency = 16;
let ResourceCycles = [8];
}
// FP divide, D-form
// FP square root, D-form
-def : InstRW<[VulcanWrite_23Cyc_F01], (instrs FDIVDrr, FSQRTDr)>;
+def : InstRW<[THX2T99Write_23Cyc_F01], (instrs FDIVDrr, FSQRTDr)>;
// FP multiply
// FP multiply accumulate
-def : WriteRes<WriteFMul, [VulcanF01]> { let Latency = 6; }
+def : WriteRes<WriteFMul, [THX2T99F01]> { let Latency = 6; }
// FP round to integral
-def : InstRW<[VulcanWrite_7Cyc_F01],
+def : InstRW<[THX2T99Write_7Cyc_F01],
(instregex "^FRINT(A|I|M|N|P|X|Z)(Sr|Dr)")>;
// FP select
-def : InstRW<[VulcanWrite_4Cyc_F01], (instregex "^FCSEL")>;
+def : InstRW<[THX2T99Write_4Cyc_F01], (instregex "^FCSEL")>;
//---
// 3.9 FP Miscellaneous Instructions
@@ -426,16 +426,16 @@ def : InstRW<[VulcanWrite_4Cyc_F01], (instregex "^FCSEL")>;
// FP convert, from vec to vec reg
// FP convert, from gen to vec reg
// FP convert, from vec to gen reg
-def : WriteRes<WriteFCvt, [VulcanF01]> { let Latency = 7; }
+def : WriteRes<WriteFCvt, [THX2T99F01]> { let Latency = 7; }
// FP move, immed
// FP move, register
-def : WriteRes<WriteFImm, [VulcanF01]> { let Latency = 4; }
+def : WriteRes<WriteFImm, [THX2T99F01]> { let Latency = 4; }
// FP transfer, from gen to vec reg
// FP transfer, from vec to gen reg
-def : WriteRes<WriteFCopy, [VulcanF01]> { let Latency = 4; }
-def : InstRW<[VulcanWrite_5Cyc_F01], (instrs FMOVXDHighr, FMOVDXHighr)>;
+def : WriteRes<WriteFCopy, [THX2T99F01]> { let Latency = 4; }
+def : InstRW<[THX2T99Write_5Cyc_F01], (instrs FMOVXDHighr, FMOVDXHighr)>;
//---
// 3.12 ASIMD Integer Instructions
@@ -470,39 +470,39 @@ def : InstRW<[VulcanWrite_5Cyc_F01], (instrs FMOVXDHighr, FMOVDXHighr)>;
// ASIMD shift by register, basic, Q-form
// ASIMD shift by register, complex, D-form
// ASIMD shift by register, complex, Q-form
-def : WriteRes<WriteV, [VulcanF01]> { let Latency = 7; }
+def : WriteRes<WriteV, [THX2T99F01]> { let Latency = 7; }
// ASIMD arith, reduce, 4H/4S
// ASIMD arith, reduce, 8B/8H
// ASIMD arith, reduce, 16B
-def : InstRW<[VulcanWrite_5Cyc_F01],
+def : InstRW<[THX2T99Write_5Cyc_F01],
(instregex "^ADDVv", "^SADDLVv", "^UADDLVv")>;
// ASIMD logical (MOV, MVN, ORN, ORR)
-def : InstRW<[VulcanWrite_5Cyc_F01], (instregex "^ORRv", "^ORNv", "^NOTv")>;
+def : InstRW<[THX2T99Write_5Cyc_F01], (instregex "^ORRv", "^ORNv", "^NOTv")>;
// ASIMD polynomial (8x8) multiply long
-def : InstRW<[VulcanWrite_5Cyc_F01], (instrs PMULLv8i8, PMULLv16i8)>;
+def : InstRW<[THX2T99Write_5Cyc_F01], (instrs PMULLv8i8, PMULLv16i8)>;
//---
// 3.13 ASIMD Floating-point Instructions
//---
// ASIMD FP absolute value
-def : InstRW<[VulcanWrite_5Cyc_F01], (instregex "^FABSv")>;
+def : InstRW<[THX2T99Write_5Cyc_F01], (instregex "^FABSv")>;
// ASIMD FP arith, normal, D-form
// ASIMD FP arith, normal, Q-form
-def : InstRW<[VulcanWrite_6Cyc_F01], (instregex "^FABDv", "^FADDv", "^FSUBv")>;
+def : InstRW<[THX2T99Write_6Cyc_F01], (instregex "^FABDv", "^FADDv", "^FSUBv")>;
// ASIMD FP arith,pairwise, D-form
// ASIMD FP arith, pairwise, Q-form
-def : InstRW<[VulcanWrite_6Cyc_F01], (instregex "^FADDPv")>;
+def : InstRW<[THX2T99Write_6Cyc_F01], (instregex "^FADDPv")>;
// ASIMD FP compare, D-form
// ASIMD FP compare, Q-form
-def : InstRW<[VulcanWrite_5Cyc_F01], (instregex "^FACGEv", "^FACGTv")>;
-def : InstRW<[VulcanWrite_5Cyc_F01], (instregex "^FCMEQv", "^FCMGEv",
+def : InstRW<[THX2T99Write_5Cyc_F01], (instregex "^FACGEv", "^FACGTv")>;
+def : InstRW<[THX2T99Write_5Cyc_F01], (instregex "^FCMEQv", "^FCMGEv",
"^FCMGTv", "^FCMLEv",
"^FCMLTv")>;
@@ -513,42 +513,42 @@ def : InstRW<[VulcanWrite_5Cyc_F01], (instregex "^FCMEQv", "^FCMGEv",
// NOTE: Handled by WriteV.
// ASIMD FP divide, D-form, F32
-def : InstRW<[VulcanWrite_16Cyc_F01], (instrs FDIVv2f32)>;
+def : InstRW<[THX2T99Write_16Cyc_F01], (instrs FDIVv2f32)>;
// ASIMD FP divide, Q-form, F32
-def : InstRW<[VulcanWrite_16Cyc_F01], (instrs FDIVv4f32)>;
+def : InstRW<[THX2T99Write_16Cyc_F01], (instrs FDIVv4f32)>;
// ASIMD FP divide, Q-form, F64
-def : InstRW<[VulcanWrite_23Cyc_F01], (instrs FDIVv2f64)>;
+def : InstRW<[THX2T99Write_23Cyc_F01], (instrs FDIVv2f64)>;
// ASIMD FP max/min, normal, D-form
// ASIMD FP max/min, normal, Q-form
-def : InstRW<[VulcanWrite_5Cyc_F01], (instregex "^FMAXv", "^FMAXNMv",
+def : InstRW<[THX2T99Write_5Cyc_F01], (instregex "^FMAXv", "^FMAXNMv",
"^FMINv", "^FMINNMv")>;
// ASIMD FP max/min, pairwise, D-form
// ASIMD FP max/min, pairwise, Q-form
-def : InstRW<[VulcanWrite_5Cyc_F01], (instregex "^FMAXPv", "^FMAXNMPv",
+def : InstRW<[THX2T99Write_5Cyc_F01], (instregex "^FMAXPv", "^FMAXNMPv",
"^FMINPv", "^FMINNMPv")>;
// ASIMD FP max/min, reduce
-def : InstRW<[VulcanWrite_5Cyc_F01], (instregex "^FMAXVv", "^FMAXNMVv",
+def : InstRW<[THX2T99Write_5Cyc_F01], (instregex "^FMAXVv", "^FMAXNMVv",
"^FMINVv", "^FMINNMVv")>;
// ASIMD FP multiply, D-form, FZ
// ASIMD FP multiply, D-form, no FZ
// ASIMD FP multiply, Q-form, FZ
// ASIMD FP multiply, Q-form, no FZ
-def : InstRW<[VulcanWrite_6Cyc_F01], (instregex "^FMULv", "^FMULXv")>;
+def : InstRW<[THX2T99Write_6Cyc_F01], (instregex "^FMULv", "^FMULXv")>;
// ASIMD FP multiply accumulate, Dform, FZ
// ASIMD FP multiply accumulate, Dform, no FZ
// ASIMD FP multiply accumulate, Qform, FZ
// ASIMD FP multiply accumulate, Qform, no FZ
-def : InstRW<[VulcanWrite_6Cyc_F01], (instregex "^FMLAv", "^FMLSv")>;
+def : InstRW<[THX2T99Write_6Cyc_F01], (instregex "^FMLAv", "^FMLSv")>;
// ASIMD FP negate
-def : InstRW<[VulcanWrite_5Cyc_F01], (instregex "^FNEGv")>;
+def : InstRW<[THX2T99Write_5Cyc_F01], (instregex "^FNEGv")>;
// ASIMD FP round, D-form
// ASIMD FP round, Q-form
@@ -559,39 +559,39 @@ def : InstRW<[VulcanWrite_5Cyc_F01], (instregex "^FNEGv")>;
//--
// ASIMD bit reverse
-def : InstRW<[VulcanWrite_5Cyc_F01], (instregex "^RBITv")>;
+def : InstRW<[THX2T99Write_5Cyc_F01], (instregex "^RBITv")>;
// ASIMD bitwise insert, D-form
// ASIMD bitwise insert, Q-form
-def : InstRW<[VulcanWrite_5Cyc_F01], (instregex "^BIFv", "^BITv", "^BSLv")>;
+def : InstRW<[THX2T99Write_5Cyc_F01], (instregex "^BIFv", "^BITv", "^BSLv")>;
// ASIMD count, D-form
// ASIMD count, Q-form
-def : InstRW<[VulcanWrite_5Cyc_F01], (instregex "^CLSv", "^CLZv", "^CNTv")>;
+def : InstRW<[THX2T99Write_5Cyc_F01], (instregex "^CLSv", "^CLZv", "^CNTv")>;
// ASIMD duplicate, gen reg
// ASIMD duplicate, element
-def : InstRW<[VulcanWrite_5Cyc_F01], (instregex "^DUPv")>;
+def : InstRW<[THX2T99Write_5Cyc_F01], (instregex "^DUPv")>;
// ASIMD extract
-def : InstRW<[VulcanWrite_5Cyc_F01], (instregex "^EXTv")>;
+def : InstRW<[THX2T99Write_5Cyc_F01], (instregex "^EXTv")>;
// ASIMD extract narrow
// ASIMD extract narrow, saturating
// NOTE: Handled by WriteV.
// ASIMD insert, element to element
-def : InstRW<[VulcanWrite_5Cyc_F01], (instregex "^INSv")>;
+def : InstRW<[THX2T99Write_5Cyc_F01], (instregex "^INSv")>;
// ASIMD move, integer immed
-def : InstRW<[VulcanWrite_5Cyc_F01], (instregex "^MOVIv", "^MOVIDv")>;
+def : InstRW<[THX2T99Write_5Cyc_F01], (instregex "^MOVIv", "^MOVIDv")>;
// ASIMD move, FP immed
-def : InstRW<[VulcanWrite_5Cyc_F01], (instregex "^FMOVv")>;
+def : InstRW<[THX2T99Write_5Cyc_F01], (instregex "^FMOVv")>;
// ASIMD reciprocal estimate, D-form
// ASIMD reciprocal estimate, Q-form
-def : InstRW<[VulcanWrite_5Cyc_F01],
+def : InstRW<[THX2T99Write_5Cyc_F01],
(instregex "^FRECPEv", "^FRECPXv", "^URECPEv",
"^FRSQRTEv", "^URSQRTEv")>;
@@ -599,31 +599,31 @@ def : InstRW<[VulcanWrite_5Cyc_F01],
// ASIMD reciprocal step, D-form, no FZ
// ASIMD reciprocal step, Q-form, FZ
// ASIMD reciprocal step, Q-form, no FZ
-def : InstRW<[VulcanWrite_6Cyc_F01], (instregex "^FRECPSv", "^FRSQRTSv")>;
+def : InstRW<[THX2T99Write_6Cyc_F01], (instregex "^FRECPSv", "^FRSQRTSv")>;
// ASIMD reverse
-def : InstRW<[VulcanWrite_5Cyc_F01],
+def : InstRW<[THX2T99Write_5Cyc_F01],
(instregex "^REV16v", "^REV32v", "^REV64v")>;
// ASIMD table lookup, D-form
// ASIMD table lookup, Q-form
-def : InstRW<[VulcanWrite_8Cyc_F01], (instregex "^TBLv", "^TBXv")>;
+def : InstRW<[THX2T99Write_8Cyc_F01], (instregex "^TBLv", "^TBXv")>;
 // ASIMD transfer, element to word or dword
-def : InstRW<[VulcanWrite_5Cyc_F01], (instregex "^UMOVv")>;
+def : InstRW<[THX2T99Write_5Cyc_F01], (instregex "^UMOVv")>;
// ASIMD transfer, element to gen reg
-def : InstRW<[VulcanWrite_6Cyc_F01], (instregex "^SMOVv", "^UMOVv")>;
+def : InstRW<[THX2T99Write_6Cyc_F01], (instregex "^SMOVv", "^UMOVv")>;
// ASIMD transfer gen reg to element
-def : InstRW<[VulcanWrite_5Cyc_F01], (instregex "^INSv")>;
+def : InstRW<[THX2T99Write_5Cyc_F01], (instregex "^INSv")>;
// ASIMD transpose
-def : InstRW<[VulcanWrite_5Cyc_F01], (instregex "^TRN1v", "^TRN2v",
+def : InstRW<[THX2T99Write_5Cyc_F01], (instregex "^TRN1v", "^TRN2v",
"^UZP1v", "^UZP2v")>;
// ASIMD unzip/zip
-def : InstRW<[VulcanWrite_5Cyc_F01], (instregex "^ZIP1v", "^ZIP2v")>;
+def : InstRW<[THX2T99Write_5Cyc_F01], (instregex "^ZIP1v", "^ZIP2v")>;
//--
// 3.15 ASIMD Load Instructions
@@ -631,114 +631,114 @@ def : InstRW<[VulcanWrite_5Cyc_F01], (instregex "^ZIP1v", "^ZIP2v")>;
// ASIMD load, 1 element, multiple, 1 reg, D-form
// ASIMD load, 1 element, multiple, 1 reg, Q-form
-def : InstRW<[VulcanWrite_4Cyc_LS01],
+def : InstRW<[THX2T99Write_4Cyc_LS01],
(instregex "^LD1Onev(8b|4h|2s|1d|16b|8h|4s|2d)$")>;
-def : InstRW<[VulcanWrite_4Cyc_LS01, WriteAdr],
+def : InstRW<[THX2T99Write_4Cyc_LS01, WriteAdr],
(instregex "^LD1Onev(8b|4h|2s|1d|16b|8h|4s|2d)_POST$")>;
// ASIMD load, 1 element, multiple, 2 reg, D-form
// ASIMD load, 1 element, multiple, 2 reg, Q-form
-def : InstRW<[VulcanWrite_4Cyc_LS01],
+def : InstRW<[THX2T99Write_4Cyc_LS01],
(instregex "^LD1Twov(8b|4h|2s|1d|16b|8h|4s|2d)$")>;
-def : InstRW<[VulcanWrite_4Cyc_LS01, WriteAdr],
+def : InstRW<[THX2T99Write_4Cyc_LS01, WriteAdr],
(instregex "^LD1Twov(8b|4h|2s|1d|16b|8h|4s|2d)_POST$")>;
// ASIMD load, 1 element, multiple, 3 reg, D-form
// ASIMD load, 1 element, multiple, 3 reg, Q-form
-def : InstRW<[VulcanWrite_5Cyc_LS01],
+def : InstRW<[THX2T99Write_5Cyc_LS01],
(instregex "^LD1Threev(8b|4h|2s|1d|16b|8h|4s|2d)$")>;
-def : InstRW<[VulcanWrite_5Cyc_LS01, WriteAdr],
+def : InstRW<[THX2T99Write_5Cyc_LS01, WriteAdr],
(instregex "^LD1Threev(8b|4h|2s|1d|16b|8h|4s|2d)_POST$")>;
// ASIMD load, 1 element, multiple, 4 reg, D-form
// ASIMD load, 1 element, multiple, 4 reg, Q-form
-def : InstRW<[VulcanWrite_6Cyc_LS01],
+def : InstRW<[THX2T99Write_6Cyc_LS01],
(instregex "^LD1Fourv(8b|4h|2s|1d|16b|8h|4s|2d)$")>;
-def : InstRW<[VulcanWrite_6Cyc_LS01, WriteAdr],
+def : InstRW<[THX2T99Write_6Cyc_LS01, WriteAdr],
(instregex "^LD1Fourv(8b|4h|2s|1d|16b|8h|4s|2d)_POST$")>;
// ASIMD load, 1 element, one lane, B/H/S
// ASIMD load, 1 element, one lane, D
-def : InstRW<[VulcanWrite_5Cyc_LS01_F01], (instregex "^LD1i(8|16|32|64)$")>;
-def : InstRW<[VulcanWrite_5Cyc_LS01_F01, WriteAdr],
+def : InstRW<[THX2T99Write_5Cyc_LS01_F01], (instregex "^LD1i(8|16|32|64)$")>;
+def : InstRW<[THX2T99Write_5Cyc_LS01_F01, WriteAdr],
(instregex "^LD1i(8|16|32|64)_POST$")>;
// ASIMD load, 1 element, all lanes, D-form, B/H/S
// ASIMD load, 1 element, all lanes, D-form, D
// ASIMD load, 1 element, all lanes, Q-form
-def : InstRW<[VulcanWrite_5Cyc_LS01_F01],
+def : InstRW<[THX2T99Write_5Cyc_LS01_F01],
(instregex "^LD1Rv(8b|4h|2s|1d|16b|8h|4s|2d)$")>;
-def : InstRW<[VulcanWrite_5Cyc_LS01_F01, WriteAdr],
+def : InstRW<[THX2T99Write_5Cyc_LS01_F01, WriteAdr],
(instregex "^LD1Rv(8b|4h|2s|1d|16b|8h|4s|2d)_POST$")>;
// ASIMD load, 2 element, multiple, D-form, B/H/S
// ASIMD load, 2 element, multiple, Q-form, D
-def : InstRW<[VulcanWrite_5Cyc_LS01_F01],
+def : InstRW<[THX2T99Write_5Cyc_LS01_F01],
(instregex "^LD2Twov(8b|4h|2s|16b|8h|4s|2d)$")>;
-def : InstRW<[VulcanWrite_5Cyc_LS01_F01, WriteAdr],
+def : InstRW<[THX2T99Write_5Cyc_LS01_F01, WriteAdr],
(instregex "^LD2Twov(8b|4h|2s|16b|8h|4s|2d)_POST$")>;
// ASIMD load, 2 element, one lane, B/H
// ASIMD load, 2 element, one lane, S
// ASIMD load, 2 element, one lane, D
-def : InstRW<[VulcanWrite_5Cyc_LS01_F01], (instregex "^LD2i(8|16|32|64)$")>;
-def : InstRW<[VulcanWrite_5Cyc_LS01_F01, WriteAdr],
+def : InstRW<[THX2T99Write_5Cyc_LS01_F01], (instregex "^LD2i(8|16|32|64)$")>;
+def : InstRW<[THX2T99Write_5Cyc_LS01_F01, WriteAdr],
(instregex "^LD2i(8|16|32|64)_POST$")>;
// ASIMD load, 2 element, all lanes, D-form, B/H/S
// ASIMD load, 2 element, all lanes, D-form, D
// ASIMD load, 2 element, all lanes, Q-form
-def : InstRW<[VulcanWrite_5Cyc_LS01_F01],
+def : InstRW<[THX2T99Write_5Cyc_LS01_F01],
(instregex "^LD2Rv(8b|4h|2s|1d|16b|8h|4s|2d)$")>;
-def : InstRW<[VulcanWrite_5Cyc_LS01_F01, WriteAdr],
+def : InstRW<[THX2T99Write_5Cyc_LS01_F01, WriteAdr],
(instregex "^LD2Rv(8b|4h|2s|1d|16b|8h|4s|2d)_POST$")>;
// ASIMD load, 3 element, multiple, D-form, B/H/S
// ASIMD load, 3 element, multiple, Q-form, B/H/S
// ASIMD load, 3 element, multiple, Q-form, D
-def : InstRW<[VulcanWrite_8Cyc_LS01_F01],
+def : InstRW<[THX2T99Write_8Cyc_LS01_F01],
(instregex "^LD3Threev(8b|4h|2s|16b|8h|4s|2d)$")>;
-def : InstRW<[VulcanWrite_8Cyc_LS01_F01, WriteAdr],
+def : InstRW<[THX2T99Write_8Cyc_LS01_F01, WriteAdr],
(instregex "^LD3Threev(8b|4h|2s|16b|8h|4s|2d)_POST$")>;
 // ASIMD load, 3 element, one lane, B/H
// ASIMD load, 3 element, one lane, S
// ASIMD load, 3 element, one lane, D
-def : InstRW<[VulcanWrite_7Cyc_LS01_F01], (instregex "^LD3i(8|16|32|64)$")>;
-def : InstRW<[VulcanWrite_7Cyc_LS01_F01, WriteAdr],
+def : InstRW<[THX2T99Write_7Cyc_LS01_F01], (instregex "^LD3i(8|16|32|64)$")>;
+def : InstRW<[THX2T99Write_7Cyc_LS01_F01, WriteAdr],
(instregex "^LD3i(8|16|32|64)_POST$")>;
// ASIMD load, 3 element, all lanes, D-form, B/H/S
// ASIMD load, 3 element, all lanes, D-form, D
// ASIMD load, 3 element, all lanes, Q-form, B/H/S
// ASIMD load, 3 element, all lanes, Q-form, D
-def : InstRW<[VulcanWrite_7Cyc_LS01_F01],
+def : InstRW<[THX2T99Write_7Cyc_LS01_F01],
(instregex "^LD3Rv(8b|4h|2s|1d|16b|8h|4s|2d)$")>;
-def : InstRW<[VulcanWrite_7Cyc_LS01_F01, WriteAdr],
+def : InstRW<[THX2T99Write_7Cyc_LS01_F01, WriteAdr],
(instregex "^LD3Rv(8b|4h|2s|1d|16b|8h|4s|2d)_POST$")>;
// ASIMD load, 4 element, multiple, D-form, B/H/S
// ASIMD load, 4 element, multiple, Q-form, B/H/S
// ASIMD load, 4 element, multiple, Q-form, D
-def : InstRW<[VulcanWrite_8Cyc_LS01_F01],
+def : InstRW<[THX2T99Write_8Cyc_LS01_F01],
(instregex "^LD4Fourv(8b|4h|2s|16b|8h|4s|2d)$")>;
-def : InstRW<[VulcanWrite_8Cyc_LS01_F01, WriteAdr],
+def : InstRW<[THX2T99Write_8Cyc_LS01_F01, WriteAdr],
(instregex "^LD4Fourv(8b|4h|2s|16b|8h|4s|2d)_POST$")>;
// ASIMD load, 4 element, one lane, B/H
// ASIMD load, 4 element, one lane, S
// ASIMD load, 4 element, one lane, D
-def : InstRW<[VulcanWrite_6Cyc_LS01_F01], (instregex "^LD4i(8|16|32|64)$")>;
-def : InstRW<[VulcanWrite_6Cyc_LS01_F01, WriteAdr],
+def : InstRW<[THX2T99Write_6Cyc_LS01_F01], (instregex "^LD4i(8|16|32|64)$")>;
+def : InstRW<[THX2T99Write_6Cyc_LS01_F01, WriteAdr],
(instregex "^LD4i(8|16|32|64)_POST$")>;
// ASIMD load, 4 element, all lanes, D-form, B/H/S
// ASIMD load, 4 element, all lanes, D-form, D
// ASIMD load, 4 element, all lanes, Q-form, B/H/S
// ASIMD load, 4 element, all lanes, Q-form, D
-def : InstRW<[VulcanWrite_6Cyc_LS01_F01],
+def : InstRW<[THX2T99Write_6Cyc_LS01_F01],
(instregex "^LD4Rv(8b|4h|2s|1d|16b|8h|4s|2d)$")>;
-def : InstRW<[VulcanWrite_6Cyc_LS01_F01, WriteAdr],
+def : InstRW<[THX2T99Write_6Cyc_LS01_F01, WriteAdr],
(instregex "^LD4Rv(8b|4h|2s|1d|16b|8h|4s|2d)_POST$")>;
//--
@@ -747,82 +747,82 @@ def : InstRW<[VulcanWrite_6Cyc_LS01_F01, WriteAdr],
// ASIMD store, 1 element, multiple, 1 reg, D-form
// ASIMD store, 1 element, multiple, 1 reg, Q-form
-def : InstRW<[VulcanWrite_1Cyc_LS01],
+def : InstRW<[THX2T99Write_1Cyc_LS01],
(instregex "^ST1Onev(8b|4h|2s|1d|16b|8h|4s|2d)$")>;
-def : InstRW<[VulcanWrite_1Cyc_LS01, WriteAdr],
+def : InstRW<[THX2T99Write_1Cyc_LS01, WriteAdr],
(instregex "^ST1Onev(8b|4h|2s|1d|16b|8h|4s|2d)_POST$")>;
// ASIMD store, 1 element, multiple, 2 reg, D-form
// ASIMD store, 1 element, multiple, 2 reg, Q-form
-def : InstRW<[VulcanWrite_1Cyc_LS01],
+def : InstRW<[THX2T99Write_1Cyc_LS01],
(instregex "^ST1Twov(8b|4h|2s|1d|16b|8h|4s|2d)$")>;
-def : InstRW<[VulcanWrite_1Cyc_LS01, WriteAdr],
+def : InstRW<[THX2T99Write_1Cyc_LS01, WriteAdr],
(instregex "^ST1Twov(8b|4h|2s|1d|16b|8h|4s|2d)_POST$")>;
// ASIMD store, 1 element, multiple, 3 reg, D-form
// ASIMD store, 1 element, multiple, 3 reg, Q-form
-def : InstRW<[VulcanWrite_1Cyc_LS01],
+def : InstRW<[THX2T99Write_1Cyc_LS01],
(instregex "^ST1Threev(8b|4h|2s|1d|16b|8h|4s|2d)$")>;
-def : InstRW<[VulcanWrite_1Cyc_LS01, WriteAdr],
+def : InstRW<[THX2T99Write_1Cyc_LS01, WriteAdr],
(instregex "^ST1Threev(8b|4h|2s|1d|16b|8h|4s|2d)_POST$")>;
// ASIMD store, 1 element, multiple, 4 reg, D-form
// ASIMD store, 1 element, multiple, 4 reg, Q-form
-def : InstRW<[VulcanWrite_1Cyc_LS01],
+def : InstRW<[THX2T99Write_1Cyc_LS01],
(instregex "^ST1Fourv(8b|4h|2s|1d|16b|8h|4s|2d)$")>;
-def : InstRW<[VulcanWrite_1Cyc_LS01, WriteAdr],
+def : InstRW<[THX2T99Write_1Cyc_LS01, WriteAdr],
(instregex "^ST1Fourv(8b|4h|2s|1d|16b|8h|4s|2d)_POST$")>;
// ASIMD store, 1 element, one lane, B/H/S
// ASIMD store, 1 element, one lane, D
-def : InstRW<[VulcanWrite_1Cyc_LS01_F01],
+def : InstRW<[THX2T99Write_1Cyc_LS01_F01],
(instregex "^ST1i(8|16|32|64)$")>;
-def : InstRW<[VulcanWrite_1Cyc_LS01_F01, WriteAdr],
+def : InstRW<[THX2T99Write_1Cyc_LS01_F01, WriteAdr],
(instregex "^ST1i(8|16|32|64)_POST$")>;
// ASIMD store, 2 element, multiple, D-form, B/H/S
// ASIMD store, 2 element, multiple, Q-form, B/H/S
// ASIMD store, 2 element, multiple, Q-form, D
-def : InstRW<[VulcanWrite_1Cyc_LS01_F01],
+def : InstRW<[THX2T99Write_1Cyc_LS01_F01],
(instregex "^ST2Twov(8b|4h|2s|16b|8h|4s|2d)$")>;
-def : InstRW<[VulcanWrite_1Cyc_LS01_F01, WriteAdr],
+def : InstRW<[THX2T99Write_1Cyc_LS01_F01, WriteAdr],
(instregex "^ST2Twov(8b|4h|2s|16b|8h|4s|2d)_POST$")>;
// ASIMD store, 2 element, one lane, B/H/S
// ASIMD store, 2 element, one lane, D
-def : InstRW<[VulcanWrite_1Cyc_LS01_F01],
+def : InstRW<[THX2T99Write_1Cyc_LS01_F01],
(instregex "^ST2i(8|16|32|64)$")>;
-def : InstRW<[VulcanWrite_1Cyc_LS01_F01, WriteAdr],
+def : InstRW<[THX2T99Write_1Cyc_LS01_F01, WriteAdr],
(instregex "^ST2i(8|16|32|64)_POST$")>;
// ASIMD store, 3 element, multiple, D-form, B/H/S
// ASIMD store, 3 element, multiple, Q-form, B/H/S
// ASIMD store, 3 element, multiple, Q-form, D
-def : InstRW<[VulcanWrite_1Cyc_LS01_F01],
+def : InstRW<[THX2T99Write_1Cyc_LS01_F01],
(instregex "^ST3Threev(8b|4h|2s|16b|8h|4s|2d)$")>;
-def : InstRW<[VulcanWrite_1Cyc_LS01_F01, WriteAdr],
+def : InstRW<[THX2T99Write_1Cyc_LS01_F01, WriteAdr],
(instregex "^ST3Threev(8b|4h|2s|16b|8h|4s|2d)_POST$")>;
// ASIMD store, 3 element, one lane, B/H
// ASIMD store, 3 element, one lane, S
// ASIMD store, 3 element, one lane, D
-def : InstRW<[VulcanWrite_1Cyc_LS01_F01], (instregex "^ST3i(8|16|32|64)$")>;
-def : InstRW<[VulcanWrite_1Cyc_LS01_F01, WriteAdr],
+def : InstRW<[THX2T99Write_1Cyc_LS01_F01], (instregex "^ST3i(8|16|32|64)$")>;
+def : InstRW<[THX2T99Write_1Cyc_LS01_F01, WriteAdr],
(instregex "^ST3i(8|16|32|64)_POST$")>;
// ASIMD store, 4 element, multiple, D-form, B/H/S
// ASIMD store, 4 element, multiple, Q-form, B/H/S
// ASIMD store, 4 element, multiple, Q-form, D
-def : InstRW<[VulcanWrite_1Cyc_LS01_F01],
+def : InstRW<[THX2T99Write_1Cyc_LS01_F01],
(instregex "^ST4Fourv(8b|4h|2s|16b|8h|4s|2d)$")>;
-def : InstRW<[VulcanWrite_1Cyc_LS01_F01, WriteAdr],
+def : InstRW<[THX2T99Write_1Cyc_LS01_F01, WriteAdr],
(instregex "^ST4Fourv(8b|4h|2s|16b|8h|4s|2d)_POST$")>;
// ASIMD store, 4 element, one lane, B/H
// ASIMD store, 4 element, one lane, S
// ASIMD store, 4 element, one lane, D
-def : InstRW<[VulcanWrite_1Cyc_LS01_F01], (instregex "^ST4i(8|16|32|64)$")>;
-def : InstRW<[VulcanWrite_1Cyc_LS01_F01, WriteAdr],
+def : InstRW<[THX2T99Write_1Cyc_LS01_F01], (instregex "^ST4i(8|16|32|64)$")>;
+def : InstRW<[THX2T99Write_1Cyc_LS01_F01, WriteAdr],
(instregex "^ST4i(8|16|32|64)_POST$")>;
//--
@@ -830,23 +830,23 @@ def : InstRW<[VulcanWrite_1Cyc_LS01_F01, WriteAdr],
//--
// Crypto AES ops
-def : InstRW<[VulcanWrite_5Cyc_F1], (instregex "^AES")>;
+def : InstRW<[THX2T99Write_5Cyc_F1], (instregex "^AES")>;
// Crypto polynomial (64x64) multiply long
-def : InstRW<[VulcanWrite_5Cyc_F1], (instrs PMULLv1i64, PMULLv2i64)>;
+def : InstRW<[THX2T99Write_5Cyc_F1], (instrs PMULLv1i64, PMULLv2i64)>;
// Crypto SHA1 xor ops
// Crypto SHA1 schedule acceleration ops
// Crypto SHA256 schedule acceleration op (1 u-op)
// Crypto SHA256 schedule acceleration op (2 u-ops)
// Crypto SHA256 hash acceleration ops
-def : InstRW<[VulcanWrite_7Cyc_F1], (instregex "^SHA")>;
+def : InstRW<[THX2T99Write_7Cyc_F1], (instregex "^SHA")>;
//--
// 3.18 CRC
//--
// CRC checksum ops
-def : InstRW<[VulcanWrite_4Cyc_I1], (instregex "^CRC32")>;
+def : InstRW<[THX2T99Write_4Cyc_I1], (instregex "^CRC32")>;
-} // SchedModel = VulcanModel
+} // SchedModel = ThunderX2T99Model
diff --git a/lib/Target/AArch64/AArch64SelectionDAGInfo.cpp b/lib/Target/AArch64/AArch64SelectionDAGInfo.cpp
index 66a8f332513a..7f5507371fa0 100644
--- a/lib/Target/AArch64/AArch64SelectionDAGInfo.cpp
+++ b/lib/Target/AArch64/AArch64SelectionDAGInfo.cpp
@@ -42,10 +42,12 @@ SDValue AArch64SelectionDAGInfo::EmitTargetCodeForMemset(
Entry.Node = Size;
Args.push_back(Entry);
TargetLowering::CallLoweringInfo CLI(DAG);
- CLI.setDebugLoc(dl).setChain(Chain)
- .setCallee(CallingConv::C, Type::getVoidTy(*DAG.getContext()),
- DAG.getExternalSymbol(bzeroEntry, IntPtr), std::move(Args))
- .setDiscardResult();
+ CLI.setDebugLoc(dl)
+ .setChain(Chain)
+ .setLibCallee(CallingConv::C, Type::getVoidTy(*DAG.getContext()),
+ DAG.getExternalSymbol(bzeroEntry, IntPtr),
+ std::move(Args))
+ .setDiscardResult();
std::pair<SDValue, SDValue> CallResult = TLI.LowerCallTo(CLI);
return CallResult.second;
}
@@ -53,7 +55,5 @@ SDValue AArch64SelectionDAGInfo::EmitTargetCodeForMemset(
}
bool AArch64SelectionDAGInfo::generateFMAsInMachineCombiner(
CodeGenOpt::Level OptLevel) const {
- if (OptLevel >= CodeGenOpt::Aggressive)
- return true;
- return false;
+ return OptLevel >= CodeGenOpt::Aggressive;
}
diff --git a/lib/Target/AArch64/AArch64Subtarget.cpp b/lib/Target/AArch64/AArch64Subtarget.cpp
index 03e01329e036..b3aba4781db8 100644
--- a/lib/Target/AArch64/AArch64Subtarget.cpp
+++ b/lib/Target/AArch64/AArch64Subtarget.cpp
@@ -81,8 +81,22 @@ void AArch64Subtarget::initializeProperties() {
MinPrefetchStride = 1024;
MaxPrefetchIterationsAhead = 11;
break;
- case Vulcan:
+ case ThunderX2T99:
+ CacheLineSize = 64;
+ PrefFunctionAlignment = 3;
+ PrefLoopAlignment = 2;
MaxInterleaveFactor = 4;
+ PrefetchDistance = 128;
+ MinPrefetchStride = 1024;
+ MaxPrefetchIterationsAhead = 4;
+ break;
+ case ThunderX:
+ case ThunderXT88:
+ case ThunderXT81:
+ case ThunderXT83:
+ CacheLineSize = 128;
+ PrefFunctionAlignment = 3;
+ PrefLoopAlignment = 2;
break;
case CortexA35: break;
case CortexA53: break;
@@ -133,9 +147,9 @@ AArch64Subtarget::ClassifyGlobalReference(const GlobalValue *GV,
if (!TM.shouldAssumeDSOLocal(*GV->getParent(), GV))
return AArch64II::MO_GOT;
- // The small code mode's direct accesses use ADRP, which cannot necessarily
- // produce the value 0 (if the code is above 4GB).
- if (TM.getCodeModel() == CodeModel::Small && GV->hasExternalWeakLinkage())
+ // The small code model's direct accesses use ADRP, which cannot
+ // necessarily produce the value 0 (if the code is above 4GB).
+ if (useSmallAddressing() && GV->hasExternalWeakLinkage())
return AArch64II::MO_GOT;
return AArch64II::MO_NO_FLAG;
diff --git a/lib/Target/AArch64/AArch64Subtarget.h b/lib/Target/AArch64/AArch64Subtarget.h
index a99340225082..40ad9185012c 100644
--- a/lib/Target/AArch64/AArch64Subtarget.h
+++ b/lib/Target/AArch64/AArch64Subtarget.h
@@ -45,7 +45,11 @@ public:
ExynosM1,
Falkor,
Kryo,
- Vulcan
+ ThunderX2T99,
+ ThunderX,
+ ThunderXT81,
+ ThunderXT83,
+ ThunderXT88
};
protected:
@@ -61,9 +65,11 @@ protected:
bool HasCRC = false;
bool HasLSE = false;
bool HasRAS = false;
+ bool HasRDM = false;
bool HasPerfMon = false;
bool HasFullFP16 = false;
bool HasSPE = false;
+ bool HasLSLFast = false;
// HasZeroCycleRegMove - Has zero-cycle register mov instructions.
bool HasZeroCycleRegMove = false;
@@ -73,6 +79,10 @@ protected:
// StrictAlign - Disallow unaligned memory accesses.
bool StrictAlign = false;
+
+  // NegativeImmediates - Transform instructions with negative immediates
+  // into equivalent forms using positive immediates.
+ bool NegativeImmediates = true;
+
bool UseAA = false;
bool PredictableSelectIsExpensive = false;
bool BalanceFPOps = false;
@@ -83,6 +93,8 @@ protected:
bool UseAlternateSExtLoadCVTF32Pattern = false;
bool HasArithmeticBccFusion = false;
bool HasArithmeticCbzFusion = false;
+ bool HasFuseAES = false;
+ bool HasFuseLiterals = false;
bool DisableLatencySchedHeuristic = false;
bool UseRSqrt = false;
uint8_t MaxInterleaveFactor = 2;
@@ -183,6 +195,7 @@ public:
bool hasCRC() const { return HasCRC; }
bool hasLSE() const { return HasLSE; }
bool hasRAS() const { return HasRAS; }
+ bool hasRDM() const { return HasRDM; }
bool balanceFPOps() const { return BalanceFPOps; }
bool predictableSelectIsExpensive() const {
return PredictableSelectIsExpensive;
@@ -195,6 +208,8 @@ public:
}
bool hasArithmeticBccFusion() const { return HasArithmeticBccFusion; }
bool hasArithmeticCbzFusion() const { return HasArithmeticCbzFusion; }
+ bool hasFuseAES() const { return HasFuseAES; }
+ bool hasFuseLiterals() const { return HasFuseLiterals; }
bool useRSqrt() const { return UseRSqrt; }
unsigned getMaxInterleaveFactor() const { return MaxInterleaveFactor; }
unsigned getVectorInsertExtractBaseCost() const {
@@ -218,6 +233,7 @@ public:
bool hasPerfMon() const { return HasPerfMon; }
bool hasFullFP16() const { return HasFullFP16; }
bool hasSPE() const { return HasSPE; }
+ bool hasLSLFast() const { return HasLSLFast; }
bool isLittleEndian() const { return IsLittle; }
@@ -226,6 +242,7 @@ public:
bool isTargetLinux() const { return TargetTriple.isOSLinux(); }
bool isTargetWindows() const { return TargetTriple.isOSWindows(); }
bool isTargetAndroid() const { return TargetTriple.isAndroid(); }
+ bool isTargetFuchsia() const { return TargetTriple.isOSFuchsia(); }
bool isTargetCOFF() const { return TargetTriple.isOSBinFormatCOFF(); }
bool isTargetELF() const { return TargetTriple.isOSBinFormatELF(); }
@@ -233,9 +250,17 @@ public:
bool useAA() const override { return UseAA; }
- /// getMaxInlineSizeThreshold - Returns the maximum memset / memcpy size
- /// that still makes it profitable to inline the call.
- unsigned getMaxInlineSizeThreshold() const { return 64; }
+ bool useSmallAddressing() const {
+ switch (TLInfo.getTargetMachine().getCodeModel()) {
+ case CodeModel::Kernel:
+ // Kernel is currently allowed only for Fuchsia targets,
+ // where it is the same as Small for almost all purposes.
+ case CodeModel::Small:
+ return true;
+ default:
+ return false;
+ }
+ }
/// ParseSubtargetFeatures - Parses features string setting specified
/// subtarget options. Definition of function is auto generated by tblgen.
diff --git a/lib/Target/AArch64/AArch64SystemOperands.td b/lib/Target/AArch64/AArch64SystemOperands.td
index a3736c0868fb..7c5dcb0853eb 100644
--- a/lib/Target/AArch64/AArch64SystemOperands.td
+++ b/lib/Target/AArch64/AArch64SystemOperands.td
@@ -18,35 +18,37 @@ include "llvm/TableGen/SearchableTable.td"
// AT (address translate) instruction options.
//===----------------------------------------------------------------------===//
-class AT<string name, bits<2> op0, bits<3> op1, bits<4> crn, bits<4> crm,
+class AT<string name, bits<3> op1, bits<4> crn, bits<4> crm,
bits<3> op2> : SearchableTable {
let SearchableFields = ["Name", "Encoding"];
let EnumValueField = "Encoding";
string Name = name;
- bits<16> Encoding;
- let Encoding{15-14} = op0;
+ bits<14> Encoding;
let Encoding{13-11} = op1;
let Encoding{10-7} = crn;
let Encoding{6-3} = crm;
let Encoding{2-0} = op2;
+ code Requires = [{ {} }];
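+  // Subtarget features required for this operation; empty by default and
+  // overridden for architecture-gated entries (e.g. the v8.2a ones below).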
}
-def : AT<"S1E1R", 0b01, 0b000, 0b0111, 0b1000, 0b000>;
-def : AT<"S1E2R", 0b01, 0b100, 0b0111, 0b1000, 0b000>;
-def : AT<"S1E3R", 0b01, 0b110, 0b0111, 0b1000, 0b000>;
-def : AT<"S1E1W", 0b01, 0b000, 0b0111, 0b1000, 0b001>;
-def : AT<"S1E2W", 0b01, 0b100, 0b0111, 0b1000, 0b001>;
-def : AT<"S1E3W", 0b01, 0b110, 0b0111, 0b1000, 0b001>;
-def : AT<"S1E0R", 0b01, 0b000, 0b0111, 0b1000, 0b010>;
-def : AT<"S1E0W", 0b01, 0b000, 0b0111, 0b1000, 0b011>;
-def : AT<"S12E1R", 0b01, 0b100, 0b0111, 0b1000, 0b100>;
-def : AT<"S12E1W", 0b01, 0b100, 0b0111, 0b1000, 0b101>;
-def : AT<"S12E0R", 0b01, 0b100, 0b0111, 0b1000, 0b110>;
-def : AT<"S12E0W", 0b01, 0b100, 0b0111, 0b1000, 0b111>;
-def : AT<"S1E1RP", 0b01, 0b000, 0b0111, 0b1001, 0b000>;
-def : AT<"S1E1WP", 0b01, 0b000, 0b0111, 0b1001, 0b001>;
-
+def : AT<"S1E1R", 0b000, 0b0111, 0b1000, 0b000>;
+def : AT<"S1E2R", 0b100, 0b0111, 0b1000, 0b000>;
+def : AT<"S1E3R", 0b110, 0b0111, 0b1000, 0b000>;
+def : AT<"S1E1W", 0b000, 0b0111, 0b1000, 0b001>;
+def : AT<"S1E2W", 0b100, 0b0111, 0b1000, 0b001>;
+def : AT<"S1E3W", 0b110, 0b0111, 0b1000, 0b001>;
+def : AT<"S1E0R", 0b000, 0b0111, 0b1000, 0b010>;
+def : AT<"S1E0W", 0b000, 0b0111, 0b1000, 0b011>;
+def : AT<"S12E1R", 0b100, 0b0111, 0b1000, 0b100>;
+def : AT<"S12E1W", 0b100, 0b0111, 0b1000, 0b101>;
+def : AT<"S12E0R", 0b100, 0b0111, 0b1000, 0b110>;
+def : AT<"S12E0W", 0b100, 0b0111, 0b1000, 0b111>;
+
+let Requires = [{ {AArch64::HasV8_2aOps} }] in {
+def : AT<"S1E1RP", 0b000, 0b0111, 0b1001, 0b000>;
+def : AT<"S1E1WP", 0b000, 0b0111, 0b1001, 0b001>;
+}
//===----------------------------------------------------------------------===//
// DMB/DSB (data barrier) instruction options.
@@ -77,28 +79,31 @@ def : DB<"sy", 0xf>;
// DC (data cache maintenance) instruction options.
//===----------------------------------------------------------------------===//
-class DC<string name, bits<2> op0, bits<3> op1, bits<4> crn, bits<4> crm,
+class DC<string name, bits<3> op1, bits<4> crn, bits<4> crm,
bits<3> op2> : SearchableTable {
let SearchableFields = ["Name", "Encoding"];
let EnumValueField = "Encoding";
string Name = name;
- bits<16> Encoding;
- let Encoding{15-14} = op0;
+ bits<14> Encoding;
let Encoding{13-11} = op1;
let Encoding{10-7} = crn;
let Encoding{6-3} = crm;
let Encoding{2-0} = op2;
+ code Requires = [{ {} }];
}
-def : DC<"ZVA", 0b01, 0b011, 0b0111, 0b0100, 0b001>;
-def : DC<"IVAC", 0b01, 0b000, 0b0111, 0b0110, 0b001>;
-def : DC<"ISW", 0b01, 0b000, 0b0111, 0b0110, 0b010>;
-def : DC<"CVAC", 0b01, 0b011, 0b0111, 0b1010, 0b001>;
-def : DC<"CSW", 0b01, 0b000, 0b0111, 0b1010, 0b010>;
-def : DC<"CVAU", 0b01, 0b011, 0b0111, 0b1011, 0b001>;
-def : DC<"CIVAC", 0b01, 0b011, 0b0111, 0b1110, 0b001>;
-def : DC<"CISW", 0b01, 0b000, 0b0111, 0b1110, 0b010>;
+def : DC<"ZVA", 0b011, 0b0111, 0b0100, 0b001>;
+def : DC<"IVAC", 0b000, 0b0111, 0b0110, 0b001>;
+def : DC<"ISW", 0b000, 0b0111, 0b0110, 0b010>;
+def : DC<"CVAC", 0b011, 0b0111, 0b1010, 0b001>;
+def : DC<"CSW", 0b000, 0b0111, 0b1010, 0b010>;
+def : DC<"CVAU", 0b011, 0b0111, 0b1011, 0b001>;
+def : DC<"CIVAC", 0b011, 0b0111, 0b1110, 0b001>;
+def : DC<"CISW", 0b000, 0b0111, 0b1110, 0b010>;
+
+let Requires = [{ {AArch64::HasV8_2aOps} }] in
+def : DC<"CVAP", 0b011, 0b0111, 0b1100, 0b001>;
//===----------------------------------------------------------------------===//
// IC (instruction cache maintenance) instruction options.
@@ -120,7 +125,7 @@ class IC<string name, bits<3> op1, bits<4> crn, bits<4> crm, bits<3> op2,
def : IC<"IALLUIS", 0b000, 0b0111, 0b0001, 0b000, 0>;
def : IC<"IALLU", 0b000, 0b0111, 0b0101, 0b000, 0>;
-def : IC<"IVAU", 0b000, 0b0111, 0b0001, 0b000, 1>;
+def : IC<"IVAU", 0b011, 0b0111, 0b0101, 0b001, 1>;
//===----------------------------------------------------------------------===//
// ISB (instruction-fetch barrier) instruction options.
@@ -213,14 +218,13 @@ def : PSB<"csync", 0x11>;
// TLBI (translation lookaside buffer invalidate) instruction options.
//===----------------------------------------------------------------------===//
-class TLBI<string name, bits<2> op0, bits<3> op1, bits<4> crn, bits<4> crm,
+class TLBI<string name, bits<3> op1, bits<4> crn, bits<4> crm,
bits<3> op2, bit needsreg = 1> : SearchableTable {
let SearchableFields = ["Name", "Encoding"];
let EnumValueField = "Encoding";
string Name = name;
- bits<16> Encoding;
- let Encoding{15-14} = op0;
+ bits<14> Encoding;
let Encoding{13-11} = op1;
let Encoding{10-7} = crn;
let Encoding{6-3} = crm;
@@ -228,38 +232,38 @@ class TLBI<string name, bits<2> op0, bits<3> op1, bits<4> crn, bits<4> crm,
bit NeedsReg = needsreg;
}
-def : TLBI<"IPAS2E1IS", 0b01, 0b100, 0b1000, 0b0000, 0b001>;
-def : TLBI<"IPAS2LE1IS", 0b01, 0b100, 0b1000, 0b0000, 0b101>;
-def : TLBI<"VMALLE1IS", 0b01, 0b000, 0b1000, 0b0011, 0b000, 0>;
-def : TLBI<"ALLE2IS", 0b01, 0b100, 0b1000, 0b0011, 0b000, 0>;
-def : TLBI<"ALLE3IS", 0b01, 0b110, 0b1000, 0b0011, 0b000, 0>;
-def : TLBI<"VAE1IS", 0b01, 0b000, 0b1000, 0b0011, 0b001>;
-def : TLBI<"VAE2IS", 0b01, 0b100, 0b1000, 0b0011, 0b001>;
-def : TLBI<"VAE3IS", 0b01, 0b110, 0b1000, 0b0011, 0b001>;
-def : TLBI<"ASIDE1IS", 0b01, 0b000, 0b1000, 0b0011, 0b010>;
-def : TLBI<"VAAE1IS", 0b01, 0b000, 0b1000, 0b0011, 0b011>;
-def : TLBI<"ALLE1IS", 0b01, 0b100, 0b1000, 0b0011, 0b100, 0>;
-def : TLBI<"VALE1IS", 0b01, 0b000, 0b1000, 0b0011, 0b101>;
-def : TLBI<"VALE2IS", 0b01, 0b100, 0b1000, 0b0011, 0b101>;
-def : TLBI<"VALE3IS", 0b01, 0b110, 0b1000, 0b0011, 0b101>;
-def : TLBI<"VMALLS12E1IS", 0b01, 0b100, 0b1000, 0b0011, 0b110, 0>;
-def : TLBI<"VAALE1IS", 0b01, 0b000, 0b1000, 0b0011, 0b111>;
-def : TLBI<"IPAS2E1", 0b01, 0b100, 0b1000, 0b0100, 0b001>;
-def : TLBI<"IPAS2LE1", 0b01, 0b100, 0b1000, 0b0100, 0b101>;
-def : TLBI<"VMALLE1", 0b01, 0b000, 0b1000, 0b0111, 0b000, 0>;
-def : TLBI<"ALLE2", 0b01, 0b100, 0b1000, 0b0111, 0b000, 0>;
-def : TLBI<"ALLE3", 0b01, 0b110, 0b1000, 0b0111, 0b000, 0>;
-def : TLBI<"VAE1", 0b01, 0b000, 0b1000, 0b0111, 0b001>;
-def : TLBI<"VAE2", 0b01, 0b100, 0b1000, 0b0111, 0b001>;
-def : TLBI<"VAE3", 0b01, 0b110, 0b1000, 0b0111, 0b001>;
-def : TLBI<"ASIDE1", 0b01, 0b000, 0b1000, 0b0111, 0b010>;
-def : TLBI<"VAAE1", 0b01, 0b000, 0b1000, 0b0111, 0b011>;
-def : TLBI<"ALLE1", 0b01, 0b100, 0b1000, 0b0111, 0b100, 0>;
-def : TLBI<"VALE1", 0b01, 0b000, 0b1000, 0b0111, 0b101>;
-def : TLBI<"VALE2", 0b01, 0b100, 0b1000, 0b0111, 0b101>;
-def : TLBI<"VALE3", 0b01, 0b110, 0b1000, 0b0111, 0b101>;
-def : TLBI<"VMALLS12E1", 0b01, 0b100, 0b1000, 0b0111, 0b110, 0>;
-def : TLBI<"VAALE1", 0b01, 0b000, 0b1000, 0b0111, 0b111>;
+def : TLBI<"IPAS2E1IS", 0b100, 0b1000, 0b0000, 0b001>;
+def : TLBI<"IPAS2LE1IS", 0b100, 0b1000, 0b0000, 0b101>;
+def : TLBI<"VMALLE1IS", 0b000, 0b1000, 0b0011, 0b000, 0>;
+def : TLBI<"ALLE2IS", 0b100, 0b1000, 0b0011, 0b000, 0>;
+def : TLBI<"ALLE3IS", 0b110, 0b1000, 0b0011, 0b000, 0>;
+def : TLBI<"VAE1IS", 0b000, 0b1000, 0b0011, 0b001>;
+def : TLBI<"VAE2IS", 0b100, 0b1000, 0b0011, 0b001>;
+def : TLBI<"VAE3IS", 0b110, 0b1000, 0b0011, 0b001>;
+def : TLBI<"ASIDE1IS", 0b000, 0b1000, 0b0011, 0b010>;
+def : TLBI<"VAAE1IS", 0b000, 0b1000, 0b0011, 0b011>;
+def : TLBI<"ALLE1IS", 0b100, 0b1000, 0b0011, 0b100, 0>;
+def : TLBI<"VALE1IS", 0b000, 0b1000, 0b0011, 0b101>;
+def : TLBI<"VALE2IS", 0b100, 0b1000, 0b0011, 0b101>;
+def : TLBI<"VALE3IS", 0b110, 0b1000, 0b0011, 0b101>;
+def : TLBI<"VMALLS12E1IS", 0b100, 0b1000, 0b0011, 0b110, 0>;
+def : TLBI<"VAALE1IS", 0b000, 0b1000, 0b0011, 0b111>;
+def : TLBI<"IPAS2E1", 0b100, 0b1000, 0b0100, 0b001>;
+def : TLBI<"IPAS2LE1", 0b100, 0b1000, 0b0100, 0b101>;
+def : TLBI<"VMALLE1", 0b000, 0b1000, 0b0111, 0b000, 0>;
+def : TLBI<"ALLE2", 0b100, 0b1000, 0b0111, 0b000, 0>;
+def : TLBI<"ALLE3", 0b110, 0b1000, 0b0111, 0b000, 0>;
+def : TLBI<"VAE1", 0b000, 0b1000, 0b0111, 0b001>;
+def : TLBI<"VAE2", 0b100, 0b1000, 0b0111, 0b001>;
+def : TLBI<"VAE3", 0b110, 0b1000, 0b0111, 0b001>;
+def : TLBI<"ASIDE1", 0b000, 0b1000, 0b0111, 0b010>;
+def : TLBI<"VAAE1", 0b000, 0b1000, 0b0111, 0b011>;
+def : TLBI<"ALLE1", 0b100, 0b1000, 0b0111, 0b100, 0>;
+def : TLBI<"VALE1", 0b000, 0b1000, 0b0111, 0b101>;
+def : TLBI<"VALE2", 0b100, 0b1000, 0b0111, 0b101>;
+def : TLBI<"VALE3", 0b110, 0b1000, 0b0111, 0b101>;
+def : TLBI<"VMALLS12E1", 0b100, 0b1000, 0b0111, 0b110, 0>;
+def : TLBI<"VAALE1", 0b000, 0b1000, 0b0111, 0b111>;
//===----------------------------------------------------------------------===//
diff --git a/lib/Target/AArch64/AArch64TargetMachine.cpp b/lib/Target/AArch64/AArch64TargetMachine.cpp
index d2883941e2c4..dcc51bf02329 100644
--- a/lib/Target/AArch64/AArch64TargetMachine.cpp
+++ b/lib/Target/AArch64/AArch64TargetMachine.cpp
@@ -12,9 +12,11 @@
#include "AArch64.h"
#include "AArch64CallLowering.h"
-#include "AArch64InstructionSelector.h"
#include "AArch64LegalizerInfo.h"
+#include "AArch64MacroFusion.h"
+#ifdef LLVM_BUILD_GLOBAL_ISEL
#include "AArch64RegisterBankInfo.h"
+#endif
#include "AArch64Subtarget.h"
#include "AArch64TargetMachine.h"
#include "AArch64TargetObjectFile.h"
@@ -115,7 +117,7 @@ EnableA53Fix835769("aarch64-fix-cortex-a53-835769", cl::Hidden,
static cl::opt<bool>
EnableAddressTypePromotion("aarch64-enable-type-promotion", cl::Hidden,
cl::desc("Enable the type promotion pass"),
- cl::init(true));
+ cl::init(false));
static cl::opt<bool>
EnableGEPOpt("aarch64-enable-gep-opt", cl::Hidden,
@@ -136,6 +138,11 @@ static cl::opt<bool>
cl::desc("Enable the loop data prefetch pass"),
cl::init(true));
+static cl::opt<int> EnableGlobalISelAtO(
+ "aarch64-enable-global-isel-at-O", cl::Hidden,
+ cl::desc("Enable GlobalISel at or below an opt level (-1 to disable)"),
+ cl::init(-1));
+
extern "C" void LLVMInitializeAArch64Target() {
// Register the target.
RegisterTargetMachine<AArch64leTargetMachine> X(getTheAArch64leTarget());
@@ -278,7 +285,8 @@ AArch64TargetMachine::getSubtargetImpl(const Function &F) const {
// FIXME: At this point, we can't rely on Subtarget having RBI.
// It's awkward to mix passing RBI and the Subtarget; should we pass
// TII/TRI as well?
- GISel->InstSelector.reset(new AArch64InstructionSelector(*this, *I, *RBI));
+ GISel->InstSelector.reset(
+ createAArch64InstructionSelector(*this, *I, *RBI));
GISel->RegBankInfo.reset(RBI);
#endif
@@ -323,10 +331,24 @@ public:
ScheduleDAGMILive *DAG = createGenericSchedLive(C);
DAG->addMutation(createLoadClusterDAGMutation(DAG->TII, DAG->TRI));
DAG->addMutation(createStoreClusterDAGMutation(DAG->TII, DAG->TRI));
- DAG->addMutation(createMacroFusionDAGMutation(DAG->TII));
+ DAG->addMutation(createAArch64MacroFusionDAGMutation());
return DAG;
}
+ ScheduleDAGInstrs *
+ createPostMachineScheduler(MachineSchedContext *C) const override {
+ const AArch64Subtarget &ST = C->MF->getSubtarget<AArch64Subtarget>();
+ if (ST.hasFuseLiterals()) {
+      // Run macro fusion again after RA, since literals are only expanded
+      // from pseudos at that point (see addPreSched2()).
+ ScheduleDAGMI *DAG = createGenericSchedPostRA(C);
+ DAG->addMutation(createAArch64MacroFusionDAGMutation());
+ return DAG;
+ }
+
+ return nullptr;
+ }
+
void addIRPasses() override;
bool addPreISel() override;
bool addInstSelector() override;
@@ -341,6 +363,8 @@ public:
void addPostRegAlloc() override;
void addPreSched2() override;
void addPreEmitPass() override;
+
+ bool isGlobalISelEnabled() const override;
};
} // end anonymous namespace
@@ -450,6 +474,10 @@ bool AArch64PassConfig::addGlobalInstructionSelect() {
}
#endif
+bool AArch64PassConfig::isGlobalISelEnabled() const {
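+  // With the default of -1, no optimization level satisfies this, so
+  // GlobalISel stays disabled unless the flag is passed explicitly.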
+ return TM->getOptLevel() <= EnableGlobalISelAtO;
+}
+
bool AArch64PassConfig::addILPOpts() {
if (EnableCondOpt)
addPass(createAArch64ConditionOptimizerPass());
diff --git a/lib/Target/AArch64/AArch64TargetMachine.h b/lib/Target/AArch64/AArch64TargetMachine.h
index 6fa5e83957e1..2c75a3258c1c 100644
--- a/lib/Target/AArch64/AArch64TargetMachine.h
+++ b/lib/Target/AArch64/AArch64TargetMachine.h
@@ -21,6 +21,8 @@
namespace llvm {
+class AArch64RegisterBankInfo;
+
class AArch64TargetMachine : public LLVMTargetMachine {
protected:
std::unique_ptr<TargetLoweringObjectFile> TLOF;
diff --git a/lib/Target/AArch64/AArch64TargetTransformInfo.cpp b/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
index b8833e5a5552..4d59da0c646d 100644
--- a/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
+++ b/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
@@ -176,7 +176,8 @@ AArch64TTIImpl::getPopcntSupport(unsigned TyWidth) {
return TTI::PSK_Software;
}
-int AArch64TTIImpl::getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src) {
+int AArch64TTIImpl::getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src,
+ const Instruction *I) {
int ISD = TLI->InstructionOpcodeToISD(Opcode);
assert(ISD && "Invalid opcode");
@@ -436,7 +437,7 @@ int AArch64TTIImpl::getAddressComputationCost(Type *Ty, ScalarEvolution *SE,
}
int AArch64TTIImpl::getCmpSelInstrCost(unsigned Opcode, Type *ValTy,
- Type *CondTy) {
+ Type *CondTy, const Instruction *I) {
int ISD = TLI->InstructionOpcodeToISD(Opcode);
// We don't lower some vector selects well that are wider than the register
@@ -463,11 +464,12 @@ int AArch64TTIImpl::getCmpSelInstrCost(unsigned Opcode, Type *ValTy,
return Entry->Cost;
}
}
- return BaseT::getCmpSelInstrCost(Opcode, ValTy, CondTy);
+ return BaseT::getCmpSelInstrCost(Opcode, ValTy, CondTy, I);
}
int AArch64TTIImpl::getMemoryOpCost(unsigned Opcode, Type *Ty,
- unsigned Alignment, unsigned AddressSpace) {
+ unsigned Alignment, unsigned AddressSpace,
+ const Instruction *I) {
auto LT = TLI->getTypeLegalizationCost(DL, Ty);
if (ST->isMisaligned128StoreSlow() && Opcode == Instruction::Store &&
@@ -505,12 +507,14 @@ int AArch64TTIImpl::getInterleavedMemoryOpCost(unsigned Opcode, Type *VecTy,
if (Factor <= TLI->getMaxSupportedInterleaveFactor()) {
unsigned NumElts = VecTy->getVectorNumElements();
- Type *SubVecTy = VectorType::get(VecTy->getScalarType(), NumElts / Factor);
- unsigned SubVecSize = DL.getTypeSizeInBits(SubVecTy);
+ auto *SubVecTy = VectorType::get(VecTy->getScalarType(), NumElts / Factor);
// ldN/stN only support legal vector types of size 64 or 128 in bits.
- if (NumElts % Factor == 0 && (SubVecSize == 64 || SubVecSize == 128))
- return Factor;
+ // Accesses having vector types that are a multiple of 128 bits can be
+ // matched to more than one ldN/stN instruction.
+ if (NumElts % Factor == 0 &&
+ TLI->isLegalInterleavedAccessType(SubVecTy, DL))
+ return Factor * TLI->getNumInterleavedAccesses(SubVecTy, DL);
}
return BaseT::getInterleavedMemoryOpCost(Opcode, VecTy, Factor, Indices,
@@ -594,8 +598,6 @@ bool AArch64TTIImpl::getTgtMemIntrinsic(IntrinsicInst *Inst,
case Intrinsic::aarch64_neon_ld4:
Info.ReadMem = true;
Info.WriteMem = false;
- Info.IsSimple = true;
- Info.NumMemRefs = 1;
Info.PtrVal = Inst->getArgOperand(0);
break;
case Intrinsic::aarch64_neon_st2:
@@ -603,8 +605,6 @@ bool AArch64TTIImpl::getTgtMemIntrinsic(IntrinsicInst *Inst,
case Intrinsic::aarch64_neon_st4:
Info.ReadMem = false;
Info.WriteMem = true;
- Info.IsSimple = true;
- Info.NumMemRefs = 1;
Info.PtrVal = Inst->getArgOperand(Inst->getNumArgOperands() - 1);
break;
}
@@ -628,6 +628,38 @@ bool AArch64TTIImpl::getTgtMemIntrinsic(IntrinsicInst *Inst,
return true;
}
+/// See if \p I should be considered for address type promotion. We check if
+/// \p I is a sext with the right type that is used in memory accesses. If it
+/// is used in a "complex" getelementptr, we allow it to be promoted without
+/// finding other sext instructions that sign-extended the same initial value.
+/// A getelementptr is considered "complex" if it has more than 2 operands.
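+/// Note: this hook is consulted by CodeGenPrepare when it decides whether to
+/// promote sign extensions that feed address computations.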
+bool AArch64TTIImpl::shouldConsiderAddressTypePromotion(
+ const Instruction &I, bool &AllowPromotionWithoutCommonHeader) {
+ bool Considerable = false;
+ AllowPromotionWithoutCommonHeader = false;
+ if (!isa<SExtInst>(&I))
+ return false;
+ Type *ConsideredSExtType =
+ Type::getInt64Ty(I.getParent()->getParent()->getContext());
+ if (I.getType() != ConsideredSExtType)
+ return false;
+  // The type was checked above; now see if the sext is used in at least one
+  // GetElementPtrInst.
+ for (const User *U : I.users()) {
+ if (const GetElementPtrInst *GEPInst = dyn_cast<GetElementPtrInst>(U)) {
+ Considerable = true;
+      // A getelementptr is considered "complex" if it has more than 2
+      // operands. We will promote a SExt used in such a complex GEP, as we
+      // expect some computations to be merged if they are done on 64 bits.
+ if (GEPInst->getNumOperands() > 2) {
+ AllowPromotionWithoutCommonHeader = true;
+ break;
+ }
+ }
+ }
+ return Considerable;
+}
+
unsigned AArch64TTIImpl::getCacheLineSize() {
return ST->getCacheLineSize();
}
diff --git a/lib/Target/AArch64/AArch64TargetTransformInfo.h b/lib/Target/AArch64/AArch64TargetTransformInfo.h
index 18287ed6653f..e37c003e064c 100644
--- a/lib/Target/AArch64/AArch64TargetTransformInfo.h
+++ b/lib/Target/AArch64/AArch64TargetTransformInfo.h
@@ -34,10 +34,6 @@ class AArch64TTIImpl : public BasicTTIImplBase<AArch64TTIImpl> {
const AArch64Subtarget *ST;
const AArch64TargetLowering *TLI;
- /// Estimate the overhead of scalarizing an instruction. Insert and Extract
- /// are set if the result needs to be inserted and/or extracted from vectors.
- unsigned getScalarizationOverhead(Type *Ty, bool Insert, bool Extract);
-
const AArch64Subtarget *getST() const { return ST; }
const AArch64TargetLowering *getTLI() const { return TLI; }
@@ -90,7 +86,8 @@ public:
unsigned getMaxInterleaveFactor(unsigned VF);
- int getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src);
+ int getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src,
+ const Instruction *I = nullptr);
int getExtractWithExtendCost(unsigned Opcode, Type *Dst, VectorType *VecTy,
unsigned Index);
@@ -107,10 +104,11 @@ public:
int getAddressComputationCost(Type *Ty, ScalarEvolution *SE, const SCEV *Ptr);
- int getCmpSelInstrCost(unsigned Opcode, Type *ValTy, Type *CondTy);
+ int getCmpSelInstrCost(unsigned Opcode, Type *ValTy, Type *CondTy,
+ const Instruction *I = nullptr);
int getMemoryOpCost(unsigned Opcode, Type *Src, unsigned Alignment,
- unsigned AddressSpace);
+ unsigned AddressSpace, const Instruction *I = nullptr);
int getCostOfKeepingLiveOverCall(ArrayRef<Type *> Tys);
@@ -125,6 +123,10 @@ public:
ArrayRef<unsigned> Indices, unsigned Alignment,
unsigned AddressSpace);
+ bool
+ shouldConsiderAddressTypePromotion(const Instruction &I,
+ bool &AllowPromotionWithoutCommonHeader);
+
unsigned getCacheLineSize();
unsigned getPrefetchDistance();
diff --git a/lib/Target/AArch64/AArch64VectorByElementOpt.cpp b/lib/Target/AArch64/AArch64VectorByElementOpt.cpp
index e3b1d7cea48d..f53af2315ec9 100644
--- a/lib/Target/AArch64/AArch64VectorByElementOpt.cpp
+++ b/lib/Target/AArch64/AArch64VectorByElementOpt.cpp
@@ -19,13 +19,27 @@
// is rewritten into
// dup v3.4s, v2.s[1]
// fmla v0.4s, v1.4s, v3.4s
+//
//===----------------------------------------------------------------------===//
#include "AArch64InstrInfo.h"
+#include "llvm/ADT/SmallVector.h"
#include "llvm/ADT/Statistic.h"
+#include "llvm/ADT/StringRef.h"
+#include "llvm/CodeGen/MachineBasicBlock.h"
+#include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/CodeGen/MachineInstr.h"
#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/MachineOperand.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
#include "llvm/CodeGen/TargetSchedule.h"
+#include "llvm/MC/MCInstrDesc.h"
+#include "llvm/MC/MCSchedule.h"
+#include "llvm/Pass.h"
+#include "llvm/Target/TargetInstrInfo.h"
+#include "llvm/Target/TargetSubtargetInfo.h"
+#include <map>
using namespace llvm;
@@ -41,14 +55,15 @@ namespace {
struct AArch64VectorByElementOpt : public MachineFunctionPass {
static char ID;
- AArch64VectorByElementOpt() : MachineFunctionPass(ID) {
- initializeAArch64VectorByElementOptPass(*PassRegistry::getPassRegistry());
- }
const TargetInstrInfo *TII;
MachineRegisterInfo *MRI;
TargetSchedModel SchedModel;
+ AArch64VectorByElementOpt() : MachineFunctionPass(ID) {
+ initializeAArch64VectorByElementOptPass(*PassRegistry::getPassRegistry());
+ }
+
/// Based only on latency of instructions, determine if it is cost efficient
/// to replace the instruction InstDesc by the two instructions InstDescRep1
/// and InstDescRep2.
@@ -90,8 +105,10 @@ struct AArch64VectorByElementOpt : public MachineFunctionPass {
return AARCH64_VECTOR_BY_ELEMENT_OPT_NAME;
}
};
+
char AArch64VectorByElementOpt::ID = 0;
-} // namespace
+
+} // end anonymous namespace
INITIALIZE_PASS(AArch64VectorByElementOpt, "aarch64-vectorbyelement-opt",
AARCH64_VECTOR_BY_ELEMENT_OPT_NAME, false, false)
diff --git a/lib/Target/AArch64/AsmParser/AArch64AsmParser.cpp b/lib/Target/AArch64/AsmParser/AArch64AsmParser.cpp
index b86a283b40d4..cbab68979c56 100644
--- a/lib/Target/AArch64/AsmParser/AArch64AsmParser.cpp
+++ b/lib/Target/AArch64/AsmParser/AArch64AsmParser.cpp
@@ -74,6 +74,7 @@ private:
SMLoc getLoc() const { return getParser().getTok().getLoc(); }
bool parseSysAlias(StringRef Name, SMLoc NameLoc, OperandVector &Operands);
+ void createSysAlias(uint16_t Encoding, OperandVector &Operands, SMLoc S);
AArch64CC::CondCode parseCondCodeString(StringRef Cond);
bool parseCondCode(OperandVector &Operands, bool invertCondCode);
unsigned matchRegisterNameAlias(StringRef Name, bool isVector);
@@ -537,154 +538,15 @@ public:
return (Val % Scale) == 0 && Val >= 0 && (Val / Scale) < 0x1000;
}
- bool isImm0_1() const {
+ template <int N, int M>
+ bool isImmInRange() const {
if (!isImm())
return false;
const MCConstantExpr *MCE = dyn_cast<MCConstantExpr>(getImm());
if (!MCE)
return false;
int64_t Val = MCE->getValue();
- return (Val >= 0 && Val < 2);
- }
-
- bool isImm0_7() const {
- if (!isImm())
- return false;
- const MCConstantExpr *MCE = dyn_cast<MCConstantExpr>(getImm());
- if (!MCE)
- return false;
- int64_t Val = MCE->getValue();
- return (Val >= 0 && Val < 8);
- }
-
- bool isImm1_8() const {
- if (!isImm())
- return false;
- const MCConstantExpr *MCE = dyn_cast<MCConstantExpr>(getImm());
- if (!MCE)
- return false;
- int64_t Val = MCE->getValue();
- return (Val > 0 && Val < 9);
- }
-
- bool isImm0_15() const {
- if (!isImm())
- return false;
- const MCConstantExpr *MCE = dyn_cast<MCConstantExpr>(getImm());
- if (!MCE)
- return false;
- int64_t Val = MCE->getValue();
- return (Val >= 0 && Val < 16);
- }
-
- bool isImm1_16() const {
- if (!isImm())
- return false;
- const MCConstantExpr *MCE = dyn_cast<MCConstantExpr>(getImm());
- if (!MCE)
- return false;
- int64_t Val = MCE->getValue();
- return (Val > 0 && Val < 17);
- }
-
- bool isImm0_31() const {
- if (!isImm())
- return false;
- const MCConstantExpr *MCE = dyn_cast<MCConstantExpr>(getImm());
- if (!MCE)
- return false;
- int64_t Val = MCE->getValue();
- return (Val >= 0 && Val < 32);
- }
-
- bool isImm1_31() const {
- if (!isImm())
- return false;
- const MCConstantExpr *MCE = dyn_cast<MCConstantExpr>(getImm());
- if (!MCE)
- return false;
- int64_t Val = MCE->getValue();
- return (Val >= 1 && Val < 32);
- }
-
- bool isImm1_32() const {
- if (!isImm())
- return false;
- const MCConstantExpr *MCE = dyn_cast<MCConstantExpr>(getImm());
- if (!MCE)
- return false;
- int64_t Val = MCE->getValue();
- return (Val >= 1 && Val < 33);
- }
-
- bool isImm0_63() const {
- if (!isImm())
- return false;
- const MCConstantExpr *MCE = dyn_cast<MCConstantExpr>(getImm());
- if (!MCE)
- return false;
- int64_t Val = MCE->getValue();
- return (Val >= 0 && Val < 64);
- }
-
- bool isImm1_63() const {
- if (!isImm())
- return false;
- const MCConstantExpr *MCE = dyn_cast<MCConstantExpr>(getImm());
- if (!MCE)
- return false;
- int64_t Val = MCE->getValue();
- return (Val >= 1 && Val < 64);
- }
-
- bool isImm1_64() const {
- if (!isImm())
- return false;
- const MCConstantExpr *MCE = dyn_cast<MCConstantExpr>(getImm());
- if (!MCE)
- return false;
- int64_t Val = MCE->getValue();
- return (Val >= 1 && Val < 65);
- }
-
- bool isImm0_127() const {
- if (!isImm())
- return false;
- const MCConstantExpr *MCE = dyn_cast<MCConstantExpr>(getImm());
- if (!MCE)
- return false;
- int64_t Val = MCE->getValue();
- return (Val >= 0 && Val < 128);
- }
-
- bool isImm0_255() const {
- if (!isImm())
- return false;
- const MCConstantExpr *MCE = dyn_cast<MCConstantExpr>(getImm());
- if (!MCE)
- return false;
- int64_t Val = MCE->getValue();
- return (Val >= 0 && Val < 256);
- }
-
- bool isImm0_65535() const {
- if (!isImm())
- return false;
- const MCConstantExpr *MCE = dyn_cast<MCConstantExpr>(getImm());
- if (!MCE)
- return false;
- int64_t Val = MCE->getValue();
- return (Val >= 0 && Val < 65536);
- }
-
- bool isImm32_63() const {
- if (!isImm())
- return false;
- const MCConstantExpr *MCE = dyn_cast<MCConstantExpr>(getImm());
- if (!MCE)
- return false;
- int64_t Val = MCE->getValue();
- return (Val >= 32 && Val < 64);
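+    // The range check is inclusive on both ends; e.g. isImmInRange<1, 8>()
+    // replaces the old isImm1_8() check (Val > 0 && Val < 9).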
+ return (Val >= N && Val <= M);
}
bool isLogicalImm32() const {
@@ -804,31 +666,8 @@ public:
return AArch64_AM::isAdvSIMDModImmType10(MCE->getValue());
}
- bool isBranchTarget26() const {
- if (!isImm())
- return false;
- const MCConstantExpr *MCE = dyn_cast<MCConstantExpr>(getImm());
- if (!MCE)
- return true;
- int64_t Val = MCE->getValue();
- if (Val & 0x3)
- return false;
- return (Val >= -(0x2000000 << 2) && Val <= (0x1ffffff << 2));
- }
-
- bool isPCRelLabel19() const {
- if (!isImm())
- return false;
- const MCConstantExpr *MCE = dyn_cast<MCConstantExpr>(getImm());
- if (!MCE)
- return true;
- int64_t Val = MCE->getValue();
- if (Val & 0x3)
- return false;
- return (Val >= -(0x40000 << 2) && Val <= (0x3ffff << 2));
- }
-
- bool isBranchTarget14() const {
+ template<int N>
+ bool isBranchTarget() const {
if (!isImm())
return false;
const MCConstantExpr *MCE = dyn_cast<MCConstantExpr>(getImm());
@@ -837,7 +676,8 @@ public:
int64_t Val = MCE->getValue();
if (Val & 0x3)
return false;
- return (Val >= -(0x2000 << 2) && Val <= (0x1fff << 2));
+ assert(N > 0 && "Branch target immediate cannot be 0 bits!");
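+      // e.g. with N = 26 this accepts word-aligned offsets in
+      // [-(1 << 25) << 2, ((1 << 25) - 1) << 2], matching the old
+      // isBranchTarget26().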
+ return (Val >= -((1<<(N-1)) << 2) && Val <= (((1<<(N-1))-1) << 2));
}
bool
@@ -2494,6 +2334,35 @@ AArch64AsmParser::tryParseOptionalShiftExtend(OperandVector &Operands) {
return MatchOperand_Success;
}
+static void setRequiredFeatureString(FeatureBitset FBS, std::string &Str) {
+ if (FBS[AArch64::HasV8_1aOps])
+ Str += "ARMv8.1a";
+ else if (FBS[AArch64::HasV8_2aOps])
+ Str += "ARMv8.2a";
+ else
+ Str += "(unknown)";
+}
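+// Used below to build diagnostics such as "AT S1E1RP requires ARMv8.2a",
+// matching the messages the removed hand-written checks produced.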
+
+void AArch64AsmParser::createSysAlias(uint16_t Encoding, OperandVector &Operands,
+ SMLoc S) {
+ const uint16_t Op2 = Encoding & 7;
+ const uint16_t Cm = (Encoding & 0x78) >> 3;
+ const uint16_t Cn = (Encoding & 0x780) >> 7;
+ const uint16_t Op1 = (Encoding & 0x3800) >> 11;
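+  // The extraction above mirrors the 14-bit searchable-table encoding:
+  // op2 in bits 2-0, CRm in 6-3, CRn in 10-7, op1 in 13-11.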
+
+ const MCExpr *Expr = MCConstantExpr::create(Op1, getContext());
+
+ Operands.push_back(
+ AArch64Operand::CreateImm(Expr, S, getLoc(), getContext()));
+ Operands.push_back(
+ AArch64Operand::CreateSysCR(Cn, S, getLoc(), getContext()));
+ Operands.push_back(
+ AArch64Operand::CreateSysCR(Cm, S, getLoc(), getContext()));
+ Expr = MCConstantExpr::create(Op2, getContext());
+ Operands.push_back(
+ AArch64Operand::CreateImm(Expr, S, getLoc(), getContext()));
+}
+
/// parseSysAlias - The IC, DC, AT, and TLBI instructions are simple aliases for
/// the SYS instruction. Parse them specially so that we create a SYS MCInst.
bool AArch64AsmParser::parseSysAlias(StringRef Name, SMLoc NameLoc,
@@ -2510,228 +2379,48 @@ bool AArch64AsmParser::parseSysAlias(StringRef Name, SMLoc NameLoc,
StringRef Op = Tok.getString();
SMLoc S = Tok.getLoc();
- const MCExpr *Expr = nullptr;
-
-#define SYS_ALIAS(op1, Cn, Cm, op2) \
- do { \
- Expr = MCConstantExpr::create(op1, getContext()); \
- Operands.push_back( \
- AArch64Operand::CreateImm(Expr, S, getLoc(), getContext())); \
- Operands.push_back( \
- AArch64Operand::CreateSysCR(Cn, S, getLoc(), getContext())); \
- Operands.push_back( \
- AArch64Operand::CreateSysCR(Cm, S, getLoc(), getContext())); \
- Expr = MCConstantExpr::create(op2, getContext()); \
- Operands.push_back( \
- AArch64Operand::CreateImm(Expr, S, getLoc(), getContext())); \
- } while (false)
-
if (Mnemonic == "ic") {
- if (!Op.compare_lower("ialluis")) {
- // SYS #0, C7, C1, #0
- SYS_ALIAS(0, 7, 1, 0);
- } else if (!Op.compare_lower("iallu")) {
- // SYS #0, C7, C5, #0
- SYS_ALIAS(0, 7, 5, 0);
- } else if (!Op.compare_lower("ivau")) {
- // SYS #3, C7, C5, #1
- SYS_ALIAS(3, 7, 5, 1);
- } else {
+ const AArch64IC::IC *IC = AArch64IC::lookupICByName(Op);
+ if (!IC)
return TokError("invalid operand for IC instruction");
+ else if (!IC->haveFeatures(getSTI().getFeatureBits())) {
+ std::string Str("IC " + std::string(IC->Name) + " requires ");
+ setRequiredFeatureString(IC->getRequiredFeatures(), Str);
+ return TokError(Str.c_str());
}
+ createSysAlias(IC->Encoding, Operands, S);
} else if (Mnemonic == "dc") {
- if (!Op.compare_lower("zva")) {
- // SYS #3, C7, C4, #1
- SYS_ALIAS(3, 7, 4, 1);
- } else if (!Op.compare_lower("ivac")) {
- // SYS #3, C7, C6, #1
- SYS_ALIAS(0, 7, 6, 1);
- } else if (!Op.compare_lower("isw")) {
- // SYS #0, C7, C6, #2
- SYS_ALIAS(0, 7, 6, 2);
- } else if (!Op.compare_lower("cvac")) {
- // SYS #3, C7, C10, #1
- SYS_ALIAS(3, 7, 10, 1);
- } else if (!Op.compare_lower("csw")) {
- // SYS #0, C7, C10, #2
- SYS_ALIAS(0, 7, 10, 2);
- } else if (!Op.compare_lower("cvau")) {
- // SYS #3, C7, C11, #1
- SYS_ALIAS(3, 7, 11, 1);
- } else if (!Op.compare_lower("civac")) {
- // SYS #3, C7, C14, #1
- SYS_ALIAS(3, 7, 14, 1);
- } else if (!Op.compare_lower("cisw")) {
- // SYS #0, C7, C14, #2
- SYS_ALIAS(0, 7, 14, 2);
- } else if (!Op.compare_lower("cvap")) {
- if (getSTI().getFeatureBits()[AArch64::HasV8_2aOps]) {
- // SYS #3, C7, C12, #1
- SYS_ALIAS(3, 7, 12, 1);
- } else {
- return TokError("DC CVAP requires ARMv8.2a");
- }
- } else {
+ const AArch64DC::DC *DC = AArch64DC::lookupDCByName(Op);
+ if (!DC)
return TokError("invalid operand for DC instruction");
+ else if (!DC->haveFeatures(getSTI().getFeatureBits())) {
+ std::string Str("DC " + std::string(DC->Name) + " requires ");
+ setRequiredFeatureString(DC->getRequiredFeatures(), Str);
+ return TokError(Str.c_str());
}
+ createSysAlias(DC->Encoding, Operands, S);
} else if (Mnemonic == "at") {
- if (!Op.compare_lower("s1e1r")) {
- // SYS #0, C7, C8, #0
- SYS_ALIAS(0, 7, 8, 0);
- } else if (!Op.compare_lower("s1e2r")) {
- // SYS #4, C7, C8, #0
- SYS_ALIAS(4, 7, 8, 0);
- } else if (!Op.compare_lower("s1e3r")) {
- // SYS #6, C7, C8, #0
- SYS_ALIAS(6, 7, 8, 0);
- } else if (!Op.compare_lower("s1e1w")) {
- // SYS #0, C7, C8, #1
- SYS_ALIAS(0, 7, 8, 1);
- } else if (!Op.compare_lower("s1e2w")) {
- // SYS #4, C7, C8, #1
- SYS_ALIAS(4, 7, 8, 1);
- } else if (!Op.compare_lower("s1e3w")) {
- // SYS #6, C7, C8, #1
- SYS_ALIAS(6, 7, 8, 1);
- } else if (!Op.compare_lower("s1e0r")) {
- // SYS #0, C7, C8, #3
- SYS_ALIAS(0, 7, 8, 2);
- } else if (!Op.compare_lower("s1e0w")) {
- // SYS #0, C7, C8, #3
- SYS_ALIAS(0, 7, 8, 3);
- } else if (!Op.compare_lower("s12e1r")) {
- // SYS #4, C7, C8, #4
- SYS_ALIAS(4, 7, 8, 4);
- } else if (!Op.compare_lower("s12e1w")) {
- // SYS #4, C7, C8, #5
- SYS_ALIAS(4, 7, 8, 5);
- } else if (!Op.compare_lower("s12e0r")) {
- // SYS #4, C7, C8, #6
- SYS_ALIAS(4, 7, 8, 6);
- } else if (!Op.compare_lower("s12e0w")) {
- // SYS #4, C7, C8, #7
- SYS_ALIAS(4, 7, 8, 7);
- } else if (!Op.compare_lower("s1e1rp")) {
- if (getSTI().getFeatureBits()[AArch64::HasV8_2aOps]) {
- // SYS #0, C7, C9, #0
- SYS_ALIAS(0, 7, 9, 0);
- } else {
- return TokError("AT S1E1RP requires ARMv8.2a");
- }
- } else if (!Op.compare_lower("s1e1wp")) {
- if (getSTI().getFeatureBits()[AArch64::HasV8_2aOps]) {
- // SYS #0, C7, C9, #1
- SYS_ALIAS(0, 7, 9, 1);
- } else {
- return TokError("AT S1E1WP requires ARMv8.2a");
- }
- } else {
+ const AArch64AT::AT *AT = AArch64AT::lookupATByName(Op);
+ if (!AT)
return TokError("invalid operand for AT instruction");
+ else if (!AT->haveFeatures(getSTI().getFeatureBits())) {
+ std::string Str("AT " + std::string(AT->Name) + " requires ");
+ setRequiredFeatureString(AT->getRequiredFeatures(), Str);
+ return TokError(Str.c_str());
}
+ createSysAlias(AT->Encoding, Operands, S);
} else if (Mnemonic == "tlbi") {
- if (!Op.compare_lower("vmalle1is")) {
- // SYS #0, C8, C3, #0
- SYS_ALIAS(0, 8, 3, 0);
- } else if (!Op.compare_lower("alle2is")) {
- // SYS #4, C8, C3, #0
- SYS_ALIAS(4, 8, 3, 0);
- } else if (!Op.compare_lower("alle3is")) {
- // SYS #6, C8, C3, #0
- SYS_ALIAS(6, 8, 3, 0);
- } else if (!Op.compare_lower("vae1is")) {
- // SYS #0, C8, C3, #1
- SYS_ALIAS(0, 8, 3, 1);
- } else if (!Op.compare_lower("vae2is")) {
- // SYS #4, C8, C3, #1
- SYS_ALIAS(4, 8, 3, 1);
- } else if (!Op.compare_lower("vae3is")) {
- // SYS #6, C8, C3, #1
- SYS_ALIAS(6, 8, 3, 1);
- } else if (!Op.compare_lower("aside1is")) {
- // SYS #0, C8, C3, #2
- SYS_ALIAS(0, 8, 3, 2);
- } else if (!Op.compare_lower("vaae1is")) {
- // SYS #0, C8, C3, #3
- SYS_ALIAS(0, 8, 3, 3);
- } else if (!Op.compare_lower("alle1is")) {
- // SYS #4, C8, C3, #4
- SYS_ALIAS(4, 8, 3, 4);
- } else if (!Op.compare_lower("vale1is")) {
- // SYS #0, C8, C3, #5
- SYS_ALIAS(0, 8, 3, 5);
- } else if (!Op.compare_lower("vaale1is")) {
- // SYS #0, C8, C3, #7
- SYS_ALIAS(0, 8, 3, 7);
- } else if (!Op.compare_lower("vmalle1")) {
- // SYS #0, C8, C7, #0
- SYS_ALIAS(0, 8, 7, 0);
- } else if (!Op.compare_lower("alle2")) {
- // SYS #4, C8, C7, #0
- SYS_ALIAS(4, 8, 7, 0);
- } else if (!Op.compare_lower("vale2is")) {
- // SYS #4, C8, C3, #5
- SYS_ALIAS(4, 8, 3, 5);
- } else if (!Op.compare_lower("vale3is")) {
- // SYS #6, C8, C3, #5
- SYS_ALIAS(6, 8, 3, 5);
- } else if (!Op.compare_lower("alle3")) {
- // SYS #6, C8, C7, #0
- SYS_ALIAS(6, 8, 7, 0);
- } else if (!Op.compare_lower("vae1")) {
- // SYS #0, C8, C7, #1
- SYS_ALIAS(0, 8, 7, 1);
- } else if (!Op.compare_lower("vae2")) {
- // SYS #4, C8, C7, #1
- SYS_ALIAS(4, 8, 7, 1);
- } else if (!Op.compare_lower("vae3")) {
- // SYS #6, C8, C7, #1
- SYS_ALIAS(6, 8, 7, 1);
- } else if (!Op.compare_lower("aside1")) {
- // SYS #0, C8, C7, #2
- SYS_ALIAS(0, 8, 7, 2);
- } else if (!Op.compare_lower("vaae1")) {
- // SYS #0, C8, C7, #3
- SYS_ALIAS(0, 8, 7, 3);
- } else if (!Op.compare_lower("alle1")) {
- // SYS #4, C8, C7, #4
- SYS_ALIAS(4, 8, 7, 4);
- } else if (!Op.compare_lower("vale1")) {
- // SYS #0, C8, C7, #5
- SYS_ALIAS(0, 8, 7, 5);
- } else if (!Op.compare_lower("vale2")) {
- // SYS #4, C8, C7, #5
- SYS_ALIAS(4, 8, 7, 5);
- } else if (!Op.compare_lower("vale3")) {
- // SYS #6, C8, C7, #5
- SYS_ALIAS(6, 8, 7, 5);
- } else if (!Op.compare_lower("vaale1")) {
- // SYS #0, C8, C7, #7
- SYS_ALIAS(0, 8, 7, 7);
- } else if (!Op.compare_lower("ipas2e1")) {
- // SYS #4, C8, C4, #1
- SYS_ALIAS(4, 8, 4, 1);
- } else if (!Op.compare_lower("ipas2le1")) {
- // SYS #4, C8, C4, #5
- SYS_ALIAS(4, 8, 4, 5);
- } else if (!Op.compare_lower("ipas2e1is")) {
- // SYS #4, C8, C4, #1
- SYS_ALIAS(4, 8, 0, 1);
- } else if (!Op.compare_lower("ipas2le1is")) {
- // SYS #4, C8, C4, #5
- SYS_ALIAS(4, 8, 0, 5);
- } else if (!Op.compare_lower("vmalls12e1")) {
- // SYS #4, C8, C7, #6
- SYS_ALIAS(4, 8, 7, 6);
- } else if (!Op.compare_lower("vmalls12e1is")) {
- // SYS #4, C8, C3, #6
- SYS_ALIAS(4, 8, 3, 6);
- } else {
+ const AArch64TLBI::TLBI *TLBI = AArch64TLBI::lookupTLBIByName(Op);
+ if (!TLBI)
return TokError("invalid operand for TLBI instruction");
+ else if (!TLBI->haveFeatures(getSTI().getFeatureBits())) {
+ std::string Str("TLBI " + std::string(TLBI->Name) + " requires ");
+ setRequiredFeatureString(TLBI->getRequiredFeatures(), Str);
+ return TokError(Str.c_str());
}
+ createSysAlias(TLBI->Encoding, Operands, S);
}
-#undef SYS_ALIAS
-
Parser.Lex(); // Eat operand.
bool ExpectRegister = (Op.lower().find("all") == StringRef::npos);
@@ -2744,12 +2433,10 @@ bool AArch64AsmParser::parseSysAlias(StringRef Name, SMLoc NameLoc,
HasRegister = true;
}
- if (ExpectRegister && !HasRegister) {
+ if (ExpectRegister && !HasRegister)
return TokError("specified " + Mnemonic + " op requires a register");
- }
- else if (!ExpectRegister && HasRegister) {
+ else if (!ExpectRegister && HasRegister)
return TokError("specified " + Mnemonic + " op does not use a register");
- }
if (parseToken(AsmToken::EndOfStatement, "unexpected token in argument list"))
return true;
@@ -2884,7 +2571,6 @@ bool AArch64AsmParser::tryParseVectorRegister(OperandVector &Operands) {
/// parseRegister - Parse a non-vector register operand.
bool AArch64AsmParser::parseRegister(OperandVector &Operands) {
- MCAsmParser &Parser = getParser();
SMLoc S = getLoc();
// Try for a vector register.
if (!tryParseVectorRegister(Operands))
@@ -2897,30 +2583,6 @@ bool AArch64AsmParser::parseRegister(OperandVector &Operands) {
Operands.push_back(
AArch64Operand::CreateReg(Reg, false, S, getLoc(), getContext()));
- // A small number of instructions (FMOVXDhighr, for example) have "[1]"
- // as a string token in the instruction itself.
- SMLoc LBracS = getLoc();
- const AsmToken &Tok = Parser.getTok();
- if (parseOptionalToken(AsmToken::LBrac)) {
- if (Tok.is(AsmToken::Integer)) {
- SMLoc IntS = getLoc();
- int64_t Val = Tok.getIntVal();
- if (Val == 1) {
- Parser.Lex();
- SMLoc RBracS = getLoc();
- if (parseOptionalToken(AsmToken::RBrac)) {
- Operands.push_back(
- AArch64Operand::CreateToken("[", false, LBracS, getContext()));
- Operands.push_back(
- AArch64Operand::CreateToken("1", false, IntS, getContext()));
- Operands.push_back(
- AArch64Operand::CreateToken("]", false, RBracS, getContext()));
- return false;
- }
- }
- }
- }
-
return false;
}
@@ -3696,6 +3358,8 @@ bool AArch64AsmParser::showMatchError(SMLoc Loc, unsigned ErrCode) {
return Error(Loc, "immediate must be an integer in range [0, 63].");
case Match_InvalidImm0_127:
return Error(Loc, "immediate must be an integer in range [0, 127].");
+ case Match_InvalidImm0_255:
+ return Error(Loc, "immediate must be an integer in range [0, 255].");
case Match_InvalidImm0_65535:
return Error(Loc, "immediate must be an integer in range [0, 65535].");
case Match_InvalidImm1_8:
@@ -4120,6 +3784,7 @@ bool AArch64AsmParser::MatchAndEmitInstruction(SMLoc IDLoc, unsigned &Opcode,
case Match_InvalidImm0_31:
case Match_InvalidImm0_63:
case Match_InvalidImm0_127:
+ case Match_InvalidImm0_255:
case Match_InvalidImm0_65535:
case Match_InvalidImm1_8:
case Match_InvalidImm1_16:
diff --git a/lib/Target/AArch64/CMakeLists.txt b/lib/Target/AArch64/CMakeLists.txt
index 6bcf67fb3fef..6d0930c358f1 100644
--- a/lib/Target/AArch64/CMakeLists.txt
+++ b/lib/Target/AArch64/CMakeLists.txt
@@ -14,6 +14,7 @@ tablegen(LLVM AArch64GenSubtargetInfo.inc -gen-subtarget)
tablegen(LLVM AArch64GenDisassemblerTables.inc -gen-disassembler)
tablegen(LLVM AArch64GenSystemOperands.inc -gen-searchable-tables)
if(LLVM_BUILD_GLOBAL_ISEL)
+ tablegen(LLVM AArch64GenRegisterBank.inc -gen-register-bank)
tablegen(LLVM AArch64GenGlobalISel.inc -gen-global-isel)
endif()
@@ -55,6 +56,7 @@ add_llvm_target(AArch64CodeGen
AArch64ISelLowering.cpp
AArch64InstrInfo.cpp
AArch64LoadStoreOptimizer.cpp
+ AArch64MacroFusion.cpp
AArch64MCInstLower.cpp
AArch64PromoteConstant.cpp
AArch64PBQPRegAlloc.cpp
diff --git a/lib/Target/AArch64/InstPrinter/AArch64InstPrinter.cpp b/lib/Target/AArch64/InstPrinter/AArch64InstPrinter.cpp
index b4f85204714f..41ae70f85e58 100644
--- a/lib/Target/AArch64/InstPrinter/AArch64InstPrinter.cpp
+++ b/lib/Target/AArch64/InstPrinter/AArch64InstPrinter.cpp
@@ -16,12 +16,20 @@
#include "Utils/AArch64BaseInfo.h"
#include "llvm/ADT/STLExtras.h"
#include "llvm/ADT/StringExtras.h"
+#include "llvm/ADT/StringRef.h"
#include "llvm/MC/MCExpr.h"
#include "llvm/MC/MCInst.h"
#include "llvm/MC/MCRegisterInfo.h"
#include "llvm/MC/MCSubtargetInfo.h"
+#include "llvm/Support/Casting.h"
+#include "llvm/Support/ErrorHandling.h"
#include "llvm/Support/Format.h"
+#include "llvm/Support/MathExtras.h"
#include "llvm/Support/raw_ostream.h"
+#include <cassert>
+#include <cstdint>
+#include <string>
+
using namespace llvm;
#define DEBUG_TYPE "asm-printer"
@@ -451,8 +459,8 @@ static const LdStNInstrDesc LdStNInstInfo[] = {
{ AArch64::LD3i64, "ld3", ".d", 1, true, 0 },
{ AArch64::LD3i8_POST, "ld3", ".b", 2, true, 3 },
{ AArch64::LD3i16_POST, "ld3", ".h", 2, true, 6 },
- { AArch64::LD3i32_POST, "ld3", ".s", 2, true, 12 },
- { AArch64::LD3i64_POST, "ld3", ".d", 2, true, 24 },
+ { AArch64::LD3i32_POST, "ld3", ".s", 2, true, 12 },
+ { AArch64::LD3i64_POST, "ld3", ".d", 2, true, 24 },
{ AArch64::LD3Rv16b, "ld3r", ".16b", 0, false, 0 },
{ AArch64::LD3Rv8h, "ld3r", ".8h", 0, false, 0 },
{ AArch64::LD3Rv4s, "ld3r", ".4s", 0, false, 0 },
@@ -731,7 +739,6 @@ bool AArch64InstPrinter::printSysAlias(const MCInst *MI,
assert(Opcode == AArch64::SYSxt && "Invalid opcode for SYS alias!");
#endif
- const char *Asm = nullptr;
const MCOperand &Op1 = MI->getOperand(0);
const MCOperand &Cn = MI->getOperand(1);
const MCOperand &Cm = MI->getOperand(2);
@@ -742,230 +749,74 @@ bool AArch64InstPrinter::printSysAlias(const MCInst *MI,
unsigned CmVal = Cm.getImm();
unsigned Op2Val = Op2.getImm();
+ uint16_t Encoding = Op2Val;
+ Encoding |= CmVal << 3;
+ Encoding |= CnVal << 7;
+ Encoding |= Op1Val << 11;
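+  // Reassemble the 14-bit encoding used by the searchable tables so the
+  // operands can be matched back to a named IC/DC/AT/TLBI operation.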
+
+ bool NeedsReg;
+ std::string Ins;
+ std::string Name;
+
if (CnVal == 7) {
switch (CmVal) {
- default:
- break;
-
+ default: return false;
// IC aliases
- case 1:
- if (Op1Val == 0 && Op2Val == 0)
- Asm = "ic\tialluis";
- break;
- case 5:
- if (Op1Val == 0 && Op2Val == 0)
- Asm = "ic\tiallu";
- else if (Op1Val == 3 && Op2Val == 1)
- Asm = "ic\tivau";
- break;
-
+ case 1: case 5: {
+ const AArch64IC::IC *IC = AArch64IC::lookupICByEncoding(Encoding);
+ if (!IC || !IC->haveFeatures(STI.getFeatureBits()))
+ return false;
+
+ NeedsReg = IC->NeedsReg;
+ Ins = "ic\t";
+ Name = std::string(IC->Name);
+ }
+ break;
// DC aliases
- case 4:
- if (Op1Val == 3 && Op2Val == 1)
- Asm = "dc\tzva";
- break;
- case 6:
- if (Op1Val == 0 && Op2Val == 1)
- Asm = "dc\tivac";
- if (Op1Val == 0 && Op2Val == 2)
- Asm = "dc\tisw";
- break;
- case 10:
- if (Op1Val == 3 && Op2Val == 1)
- Asm = "dc\tcvac";
- else if (Op1Val == 0 && Op2Val == 2)
- Asm = "dc\tcsw";
- break;
- case 11:
- if (Op1Val == 3 && Op2Val == 1)
- Asm = "dc\tcvau";
- break;
- case 12:
- if (Op1Val == 3 && Op2Val == 1 &&
- (STI.getFeatureBits()[AArch64::HasV8_2aOps]))
- Asm = "dc\tcvap";
- break;
- case 14:
- if (Op1Val == 3 && Op2Val == 1)
- Asm = "dc\tcivac";
- else if (Op1Val == 0 && Op2Val == 2)
- Asm = "dc\tcisw";
- break;
-
+    case 4: case 6: case 10: case 11: case 12: case 14: {
+ const AArch64DC::DC *DC = AArch64DC::lookupDCByEncoding(Encoding);
+ if (!DC || !DC->haveFeatures(STI.getFeatureBits()))
+ return false;
+
+ NeedsReg = true;
+ Ins = "dc\t";
+ Name = std::string(DC->Name);
+ }
+ break;
// AT aliases
- case 8:
- switch (Op1Val) {
- default:
- break;
- case 0:
- switch (Op2Val) {
- default:
- break;
- case 0: Asm = "at\ts1e1r"; break;
- case 1: Asm = "at\ts1e1w"; break;
- case 2: Asm = "at\ts1e0r"; break;
- case 3: Asm = "at\ts1e0w"; break;
- }
- break;
- case 4:
- switch (Op2Val) {
- default:
- break;
- case 0: Asm = "at\ts1e2r"; break;
- case 1: Asm = "at\ts1e2w"; break;
- case 4: Asm = "at\ts12e1r"; break;
- case 5: Asm = "at\ts12e1w"; break;
- case 6: Asm = "at\ts12e0r"; break;
- case 7: Asm = "at\ts12e0w"; break;
- }
- break;
- case 6:
- switch (Op2Val) {
- default:
- break;
- case 0: Asm = "at\ts1e3r"; break;
- case 1: Asm = "at\ts1e3w"; break;
- }
- break;
- }
- break;
- case 9:
- switch (Op1Val) {
- default:
- break;
- case 0:
- if (STI.getFeatureBits()[AArch64::HasV8_2aOps]) {
- switch (Op2Val) {
- default:
- break;
- case 0: Asm = "at\ts1e1rp"; break;
- case 1: Asm = "at\ts1e1wp"; break;
- }
- }
- break;
- }
+ case 8: case 9: {
+ const AArch64AT::AT *AT = AArch64AT::lookupATByEncoding(Encoding);
+ if (!AT || !AT->haveFeatures(STI.getFeatureBits()))
+ return false;
+
+ NeedsReg = true;
+ Ins = "at\t";
+ Name = std::string(AT->Name);
+ }
+ break;
}
} else if (CnVal == 8) {
// TLBI aliases
- switch (CmVal) {
- default:
- break;
- case 3:
- switch (Op1Val) {
- default:
- break;
- case 0:
- switch (Op2Val) {
- default:
- break;
- case 0: Asm = "tlbi\tvmalle1is"; break;
- case 1: Asm = "tlbi\tvae1is"; break;
- case 2: Asm = "tlbi\taside1is"; break;
- case 3: Asm = "tlbi\tvaae1is"; break;
- case 5: Asm = "tlbi\tvale1is"; break;
- case 7: Asm = "tlbi\tvaale1is"; break;
- }
- break;
- case 4:
- switch (Op2Val) {
- default:
- break;
- case 0: Asm = "tlbi\talle2is"; break;
- case 1: Asm = "tlbi\tvae2is"; break;
- case 4: Asm = "tlbi\talle1is"; break;
- case 5: Asm = "tlbi\tvale2is"; break;
- case 6: Asm = "tlbi\tvmalls12e1is"; break;
- }
- break;
- case 6:
- switch (Op2Val) {
- default:
- break;
- case 0: Asm = "tlbi\talle3is"; break;
- case 1: Asm = "tlbi\tvae3is"; break;
- case 5: Asm = "tlbi\tvale3is"; break;
- }
- break;
- }
- break;
- case 0:
- switch (Op1Val) {
- default:
- break;
- case 4:
- switch (Op2Val) {
- default:
- break;
- case 1: Asm = "tlbi\tipas2e1is"; break;
- case 5: Asm = "tlbi\tipas2le1is"; break;
- }
- break;
- }
- break;
- case 4:
- switch (Op1Val) {
- default:
- break;
- case 4:
- switch (Op2Val) {
- default:
- break;
- case 1: Asm = "tlbi\tipas2e1"; break;
- case 5: Asm = "tlbi\tipas2le1"; break;
- }
- break;
- }
- break;
- case 7:
- switch (Op1Val) {
- default:
- break;
- case 0:
- switch (Op2Val) {
- default:
- break;
- case 0: Asm = "tlbi\tvmalle1"; break;
- case 1: Asm = "tlbi\tvae1"; break;
- case 2: Asm = "tlbi\taside1"; break;
- case 3: Asm = "tlbi\tvaae1"; break;
- case 5: Asm = "tlbi\tvale1"; break;
- case 7: Asm = "tlbi\tvaale1"; break;
- }
- break;
- case 4:
- switch (Op2Val) {
- default:
- break;
- case 0: Asm = "tlbi\talle2"; break;
- case 1: Asm = "tlbi\tvae2"; break;
- case 4: Asm = "tlbi\talle1"; break;
- case 5: Asm = "tlbi\tvale2"; break;
- case 6: Asm = "tlbi\tvmalls12e1"; break;
- }
- break;
- case 6:
- switch (Op2Val) {
- default:
- break;
- case 0: Asm = "tlbi\talle3"; break;
- case 1: Asm = "tlbi\tvae3"; break;
- case 5: Asm = "tlbi\tvale3"; break;
- }
- break;
- }
- break;
- }
+ const AArch64TLBI::TLBI *TLBI = AArch64TLBI::lookupTLBIByEncoding(Encoding);
+ if (!TLBI || !TLBI->haveFeatures(STI.getFeatureBits()))
+ return false;
+
+ NeedsReg = TLBI->NeedsReg;
+ Ins = "tlbi\t";
+ Name = std::string(TLBI->Name);
}
+ else
+ return false;
- if (Asm) {
- unsigned Reg = MI->getOperand(4).getReg();
+ std::string Str = Ins + Name;
+ std::transform(Str.begin(), Str.end(), Str.begin(), ::tolower);
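+  // Table names are stored upper-case (e.g. "VMALLE1IS"); printed assembly
+  // uses lower-case, as the old hand-written strings did.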
- O << '\t' << Asm;
- if (StringRef(Asm).lower().find("all") == StringRef::npos)
- O << ", " << getRegisterName(Reg);
- }
+ O << '\t' << Str;
+ if (NeedsReg)
+ O << ", " << getRegisterName(MI->getOperand(4).getReg());
- return Asm != nullptr;
+ return true;
}
void AArch64InstPrinter::printOperand(const MCInst *MI, unsigned OpNo,
diff --git a/lib/Target/AArch64/InstPrinter/AArch64InstPrinter.h b/lib/Target/AArch64/InstPrinter/AArch64InstPrinter.h
index 65dca99ed04e..a45258cb97b7 100644
--- a/lib/Target/AArch64/InstPrinter/AArch64InstPrinter.h
+++ b/lib/Target/AArch64/InstPrinter/AArch64InstPrinter.h
@@ -15,6 +15,7 @@
#define LLVM_LIB_TARGET_AARCH64_INSTPRINTER_AARCH64INSTPRINTER_H
#include "MCTargetDesc/AArch64MCTargetDesc.h"
+#include "llvm/ADT/StringRef.h"
#include "llvm/MC/MCInstPrinter.h"
namespace llvm {
@@ -37,9 +38,11 @@ public:
unsigned PrintMethodIdx,
const MCSubtargetInfo &STI,
raw_ostream &O);
+
virtual StringRef getRegName(unsigned RegNo) const {
return getRegisterName(RegNo);
}
+
static const char *getRegisterName(unsigned RegNo,
unsigned AltIdx = AArch64::NoRegAltName);
@@ -177,12 +180,15 @@ public:
unsigned PrintMethodIdx,
const MCSubtargetInfo &STI,
raw_ostream &O) override;
+
StringRef getRegName(unsigned RegNo) const override {
return getRegisterName(RegNo);
}
+
static const char *getRegisterName(unsigned RegNo,
unsigned AltIdx = AArch64::NoRegAltName);
};
-}
-#endif
+} // end namespace llvm
+
+#endif // LLVM_LIB_TARGET_AARCH64_INSTPRINTER_AARCH64INSTPRINTER_H
diff --git a/lib/Target/AArch64/MCTargetDesc/AArch64AsmBackend.cpp b/lib/Target/AArch64/MCTargetDesc/AArch64AsmBackend.cpp
index 14c0327f5fa8..ebf05ae303dd 100644
--- a/lib/Target/AArch64/MCTargetDesc/AArch64AsmBackend.cpp
+++ b/lib/Target/AArch64/MCTargetDesc/AArch64AsmBackend.cpp
@@ -73,7 +73,7 @@ public:
}
void applyFixup(const MCFixup &Fixup, char *Data, unsigned DataSize,
- uint64_t Value, bool IsPCRel) const override;
+ uint64_t Value, bool IsPCRel, MCContext &Ctx) const override;
bool mayNeedRelaxation(const MCInst &Inst) const override;
bool fixupNeedsRelaxation(const MCFixup &Fixup, uint64_t Value,
@@ -138,15 +138,15 @@ static unsigned AdrImmBits(unsigned Value) {
}
static uint64_t adjustFixupValue(const MCFixup &Fixup, uint64_t Value,
- MCContext *Ctx) {
+ MCContext &Ctx) {
unsigned Kind = Fixup.getKind();
int64_t SignedValue = static_cast<int64_t>(Value);
switch (Kind) {
default:
llvm_unreachable("Unknown fixup kind!");
case AArch64::fixup_aarch64_pcrel_adr_imm21:
- if (Ctx && (SignedValue > 2097151 || SignedValue < -2097152))
- Ctx->reportError(Fixup.getLoc(), "fixup value out of range");
+ if (SignedValue > 2097151 || SignedValue < -2097152)
+ Ctx.reportError(Fixup.getLoc(), "fixup value out of range");
return AdrImmBits(Value & 0x1fffffULL);
case AArch64::fixup_aarch64_pcrel_adrp_imm21:
return AdrImmBits((Value & 0x1fffff000ULL) >> 12);
@@ -154,66 +154,65 @@ static uint64_t adjustFixupValue(const MCFixup &Fixup, uint64_t Value,
case AArch64::fixup_aarch64_pcrel_branch19:
// Signed 21-bit immediate
if (SignedValue > 2097151 || SignedValue < -2097152)
- if (Ctx) Ctx->reportError(Fixup.getLoc(), "fixup value out of range");
- if (Ctx && (Value & 0x3))
- Ctx->reportError(Fixup.getLoc(), "fixup not sufficiently aligned");
+ Ctx.reportError(Fixup.getLoc(), "fixup value out of range");
+ if (Value & 0x3)
+ Ctx.reportError(Fixup.getLoc(), "fixup not sufficiently aligned");
// Low two bits are not encoded.
return (Value >> 2) & 0x7ffff;
case AArch64::fixup_aarch64_add_imm12:
case AArch64::fixup_aarch64_ldst_imm12_scale1:
// Unsigned 12-bit immediate
- if (Ctx && Value >= 0x1000)
- Ctx->reportError(Fixup.getLoc(), "fixup value out of range");
+ if (Value >= 0x1000)
+ Ctx.reportError(Fixup.getLoc(), "fixup value out of range");
return Value;
case AArch64::fixup_aarch64_ldst_imm12_scale2:
// Unsigned 12-bit immediate which gets multiplied by 2
- if (Ctx && (Value >= 0x2000))
- Ctx->reportError(Fixup.getLoc(), "fixup value out of range");
- if (Ctx && (Value & 0x1))
- Ctx->reportError(Fixup.getLoc(), "fixup must be 2-byte aligned");
+ if (Value >= 0x2000)
+ Ctx.reportError(Fixup.getLoc(), "fixup value out of range");
+ if (Value & 0x1)
+ Ctx.reportError(Fixup.getLoc(), "fixup must be 2-byte aligned");
return Value >> 1;
case AArch64::fixup_aarch64_ldst_imm12_scale4:
// Unsigned 12-bit immediate which gets multiplied by 4
- if (Ctx && (Value >= 0x4000))
- Ctx->reportError(Fixup.getLoc(), "fixup value out of range");
- if (Ctx && (Value & 0x3))
- Ctx->reportError(Fixup.getLoc(), "fixup must be 4-byte aligned");
+ if (Value >= 0x4000)
+ Ctx.reportError(Fixup.getLoc(), "fixup value out of range");
+ if (Value & 0x3)
+ Ctx.reportError(Fixup.getLoc(), "fixup must be 4-byte aligned");
return Value >> 2;
case AArch64::fixup_aarch64_ldst_imm12_scale8:
// Unsigned 12-bit immediate which gets multiplied by 8
- if (Ctx && (Value >= 0x8000))
- Ctx->reportError(Fixup.getLoc(), "fixup value out of range");
- if (Ctx && (Value & 0x7))
- Ctx->reportError(Fixup.getLoc(), "fixup must be 8-byte aligned");
+ if (Value >= 0x8000)
+ Ctx.reportError(Fixup.getLoc(), "fixup value out of range");
+ if (Value & 0x7)
+ Ctx.reportError(Fixup.getLoc(), "fixup must be 8-byte aligned");
return Value >> 3;
case AArch64::fixup_aarch64_ldst_imm12_scale16:
// Unsigned 12-bit immediate which gets multiplied by 16
- if (Ctx && (Value >= 0x10000))
- Ctx->reportError(Fixup.getLoc(), "fixup value out of range");
- if (Ctx && (Value & 0xf))
- Ctx->reportError(Fixup.getLoc(), "fixup must be 16-byte aligned");
+ if (Value >= 0x10000)
+ Ctx.reportError(Fixup.getLoc(), "fixup value out of range");
+ if (Value & 0xf)
+ Ctx.reportError(Fixup.getLoc(), "fixup must be 16-byte aligned");
return Value >> 4;
case AArch64::fixup_aarch64_movw:
- if (Ctx)
- Ctx->reportError(Fixup.getLoc(),
- "no resolvable MOVZ/MOVK fixups supported yet");
+ Ctx.reportError(Fixup.getLoc(),
+ "no resolvable MOVZ/MOVK fixups supported yet");
return Value;
case AArch64::fixup_aarch64_pcrel_branch14:
// Signed 16-bit immediate
- if (Ctx && (SignedValue > 32767 || SignedValue < -32768))
- Ctx->reportError(Fixup.getLoc(), "fixup value out of range");
+ if (SignedValue > 32767 || SignedValue < -32768)
+ Ctx.reportError(Fixup.getLoc(), "fixup value out of range");
// Low two bits are not encoded (4-byte alignment assumed).
- if (Ctx && (Value & 0x3))
- Ctx->reportError(Fixup.getLoc(), "fixup not sufficiently aligned");
+ if (Value & 0x3)
+ Ctx.reportError(Fixup.getLoc(), "fixup not sufficiently aligned");
return (Value >> 2) & 0x3fff;
case AArch64::fixup_aarch64_pcrel_branch26:
case AArch64::fixup_aarch64_pcrel_call26:
// Signed 28-bit immediate
- if (Ctx && (SignedValue > 134217727 || SignedValue < -134217728))
- Ctx->reportError(Fixup.getLoc(), "fixup value out of range");
+ if (SignedValue > 134217727 || SignedValue < -134217728)
+ Ctx.reportError(Fixup.getLoc(), "fixup value out of range");
// Low two bits are not encoded (4-byte alignment assumed).
- if (Ctx && (Value & 0x3))
- Ctx->reportError(Fixup.getLoc(), "fixup not sufficiently aligned");
+ if (Value & 0x3)
+ Ctx.reportError(Fixup.getLoc(), "fixup not sufficiently aligned");
return (Value >> 2) & 0x3ffffff;
case FK_Data_1:
case FK_Data_2:
@@ -264,13 +263,13 @@ unsigned AArch64AsmBackend::getFixupKindContainereSizeInBytes(unsigned Kind) con
void AArch64AsmBackend::applyFixup(const MCFixup &Fixup, char *Data,
unsigned DataSize, uint64_t Value,
- bool IsPCRel) const {
+ bool IsPCRel, MCContext &Ctx) const {
unsigned NumBytes = getFixupKindNumBytes(Fixup.getKind());
if (!Value)
return; // Doesn't change encoding.
MCFixupKindInfo Info = getFixupKindInfo(Fixup.getKind());
// Apply any target-specific value adjustments.
- Value = adjustFixupValue(Fixup, Value, nullptr);
+ Value = adjustFixupValue(Fixup, Value, Ctx);
// Shift the value into position.
Value <<= Info.TargetOffset;
@@ -521,17 +520,6 @@ public:
return CompactUnwindEncoding;
}
-
- void processFixupValue(const MCAssembler &Asm, const MCAsmLayout &Layout,
- const MCFixup &Fixup, const MCFragment *DF,
- const MCValue &Target, uint64_t &Value,
- bool &IsResolved) override {
- // Try to get the encoded value for the fixup as-if we're mapping it into
- // the instruction. This allows adjustFixupValue() to issue a diagnostic
- // if the value is invalid.
- if (IsResolved)
- (void)adjustFixupValue(Fixup, Value, &Asm.getContext());
- }
};
} // end anonymous namespace
@@ -575,12 +563,6 @@ void ELFAArch64AsmBackend::processFixupValue(
// to the linker -- a relocation!
if ((uint32_t)Fixup.getKind() == AArch64::fixup_aarch64_pcrel_adrp_imm21)
IsResolved = false;
-
- // Try to get the encoded value for the fixup as-if we're mapping it into
- // the instruction. This allows adjustFixupValue() to issue a diagnostic
- // if the value is invalid.
- if (IsResolved)
- (void)adjustFixupValue(Fixup, Value, &Asm.getContext());
}
}
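The adjustFixupValue hunks above all follow one pattern for the scaled load/store immediates: reject values that exceed the unsigned 12-bit field after scaling, reject values not aligned to the access size, then drop the implied low bits. A minimal standalone restatement of that check (hypothetical helper, not part of this patch):

    #include <cstdint>
    #include <cstdio>

    // Hypothetical standalone version of the ldst_imm12_scaleN checks above;
    // Scale is the access size in bytes: 1, 2, 4, 8 or 16.
    static bool encodeScaledImm12(uint64_t Value, unsigned Scale,
                                  uint64_t &Encoded) {
      if (Value >= 0x1000ULL * Scale)
        return false; // out of range for the unsigned 12-bit field
      if (Value & (Scale - 1))
        return false; // not aligned to the access size
      Encoded = Value / Scale; // the low log2(Scale) bits are implicit
      return true;
    }

    int main() {
      uint64_t Enc;
      // 0x48 is 8-byte aligned and in range for the 8-byte-scaled case.
      if (encodeScaledImm12(0x48, 8, Enc))
        printf("encoded imm12 = 0x%llx\n", (unsigned long long)Enc);
      return 0;
    }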
diff --git a/lib/Target/AArch64/MCTargetDesc/AArch64ELFStreamer.cpp b/lib/Target/AArch64/MCTargetDesc/AArch64ELFStreamer.cpp
index 685907a2178e..271263507ae1 100644
--- a/lib/Target/AArch64/MCTargetDesc/AArch64ELFStreamer.cpp
+++ b/lib/Target/AArch64/MCTargetDesc/AArch64ELFStreamer.cpp
@@ -14,27 +14,23 @@
//===----------------------------------------------------------------------===//
#include "AArch64TargetStreamer.h"
-#include "llvm/MC/MCELFStreamer.h"
-#include "llvm/ADT/SmallPtrSet.h"
-#include "llvm/ADT/StringExtras.h"
+#include "llvm/ADT/DenseMap.h"
+#include "llvm/ADT/StringRef.h"
+#include "llvm/ADT/Triple.h"
#include "llvm/ADT/Twine.h"
#include "llvm/MC/MCAsmBackend.h"
-#include "llvm/MC/MCAsmInfo.h"
#include "llvm/MC/MCAssembler.h"
#include "llvm/MC/MCCodeEmitter.h"
#include "llvm/MC/MCContext.h"
#include "llvm/MC/MCELFStreamer.h"
#include "llvm/MC/MCExpr.h"
#include "llvm/MC/MCInst.h"
-#include "llvm/MC/MCObjectStreamer.h"
#include "llvm/MC/MCSection.h"
-#include "llvm/MC/MCSectionELF.h"
#include "llvm/MC/MCStreamer.h"
+#include "llvm/MC/MCSubtargetInfo.h"
#include "llvm/MC/MCSymbolELF.h"
-#include "llvm/MC/MCValue.h"
-#include "llvm/Support/Debug.h"
+#include "llvm/Support/Casting.h"
#include "llvm/Support/ELF.h"
-#include "llvm/Support/ErrorHandling.h"
#include "llvm/Support/FormattedStream.h"
#include "llvm/Support/raw_ostream.h"
@@ -106,8 +102,8 @@ public:
/// This function is the one used to emit instruction data into the ELF
/// streamer. We override it to add the appropriate mapping symbol if
/// necessary.
- void EmitInstruction(const MCInst &Inst,
- const MCSubtargetInfo &STI) override {
+ void EmitInstruction(const MCInst &Inst, const MCSubtargetInfo &STI,
+ bool) override {
EmitA64MappingSymbol();
MCELFStreamer::EmitInstruction(Inst, STI);
}
@@ -180,6 +176,7 @@ private:
DenseMap<const MCSection *, ElfMappingSymbol> LastMappingSymbols;
ElfMappingSymbol LastEMS;
};
+
} // end anonymous namespace
AArch64ELFStreamer &AArch64TargetELFStreamer::getStreamer() {
@@ -191,6 +188,7 @@ void AArch64TargetELFStreamer::emitInst(uint32_t Inst) {
}
namespace llvm {
+
MCTargetStreamer *createAArch64AsmTargetStreamer(MCStreamer &S,
formatted_raw_ostream &OS,
MCInstPrinter *InstPrint,
@@ -214,4 +212,5 @@ createAArch64ObjectTargetStreamer(MCStreamer &S, const MCSubtargetInfo &STI) {
return new AArch64TargetELFStreamer(S);
return nullptr;
}
-}
+
+} // end namespace llvm
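The EmitInstruction override above inserts an A64 mapping symbol before the instruction bytes; the LastEMS fields visible in the class suggest the usual ELF mapping-symbol state machine, where "$x" is only emitted on a transition into a code run. A hedged sketch of that bookkeeping (names hypothetical):

    // Hypothetical sketch of per-section mapping-symbol state: emit "$x"
    // only on a transition into code, so back-to-back instructions do not
    // produce redundant mapping symbols.
    enum ElfMappingSymbol { EMS_None, EMS_A64, EMS_Data };

    static bool shouldEmitA64MappingSymbol(ElfMappingSymbol &LastEMS) {
      if (LastEMS == EMS_A64)
        return false; // already inside a code run
      LastEMS = EMS_A64;
      return true; // caller emits the "$x" mapping symbol here
    }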
diff --git a/lib/Target/AArch64/MCTargetDesc/AArch64MCTargetDesc.cpp b/lib/Target/AArch64/MCTargetDesc/AArch64MCTargetDesc.cpp
index e9d38d3dcf10..f710065d9bc7 100644
--- a/lib/Target/AArch64/MCTargetDesc/AArch64MCTargetDesc.cpp
+++ b/lib/Target/AArch64/MCTargetDesc/AArch64MCTargetDesc.cpp
@@ -84,9 +84,14 @@ static void adjustCodeGenOpts(const Triple &TT, Reloc::Model RM,
// no matter how far away they are.
else if (CM == CodeModel::JITDefault)
CM = CodeModel::Large;
- else if (CM != CodeModel::Small && CM != CodeModel::Large)
- report_fatal_error(
- "Only small and large code models are allowed on AArch64");
+ else if (CM != CodeModel::Small && CM != CodeModel::Large) {
+ if (!TT.isOSFuchsia())
+ report_fatal_error(
+ "Only small and large code models are allowed on AArch64");
+ else if (CM != CodeModel::Kernel)
+ report_fatal_error(
+ "Only small, kernel, and large code models are allowed on AArch64");
+ }
}
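Restated outside the patch, the Fuchsia change above means an explicitly requested CodeModel::Kernel is now accepted, but only for Fuchsia triples; everywhere else the Small/Large restriction stands. A sketch of the equivalent predicate (hypothetical helper, assuming LLVM's Triple and CodeGen headers):

    #include "llvm/ADT/Triple.h"
    #include "llvm/Support/CodeGen.h"

    using namespace llvm;

    // Hypothetical restatement of the check above: which explicitly
    // requested code models are accepted for an AArch64 target.
    static bool isAllowedAArch64CodeModel(const Triple &TT,
                                          CodeModel::Model CM) {
      if (CM == CodeModel::Small || CM == CodeModel::Large)
        return true;
      // Fuchsia additionally accepts the kernel code model.
      return CM == CodeModel::Kernel && TT.isOSFuchsia();
    }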
static MCInstPrinter *createAArch64MCInstPrinter(const Triple &T,
diff --git a/lib/Target/AArch64/MCTargetDesc/AArch64MachObjectWriter.cpp b/lib/Target/AArch64/MCTargetDesc/AArch64MachObjectWriter.cpp
index 53a68527ee8e..3d296ba4806b 100644
--- a/lib/Target/AArch64/MCTargetDesc/AArch64MachObjectWriter.cpp
+++ b/lib/Target/AArch64/MCTargetDesc/AArch64MachObjectWriter.cpp
@@ -16,14 +16,22 @@
#include "llvm/MC/MCContext.h"
#include "llvm/MC/MCExpr.h"
#include "llvm/MC/MCFixup.h"
+#include "llvm/MC/MCFragment.h"
#include "llvm/MC/MCMachObjectWriter.h"
+#include "llvm/MC/MCSection.h"
#include "llvm/MC/MCSectionMachO.h"
+#include "llvm/MC/MCSymbol.h"
#include "llvm/MC/MCValue.h"
-#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/Casting.h"
#include "llvm/Support/MachO.h"
+#include "llvm/Support/MathExtras.h"
+#include <cassert>
+#include <cstdint>
+
using namespace llvm;
namespace {
+
class AArch64MachObjectWriter : public MCMachObjectTargetWriter {
bool getAArch64FixupKindMachOInfo(const MCFixup &Fixup, unsigned &RelocType,
const MCSymbolRefExpr *Sym,
@@ -38,7 +46,8 @@ public:
const MCFixup &Fixup, MCValue Target,
uint64_t &FixedValue) override;
};
-}
+
+} // end anonymous namespace
bool AArch64MachObjectWriter::getAArch64FixupKindMachOInfo(
const MCFixup &Fixup, unsigned &RelocType, const MCSymbolRefExpr *Sym,
@@ -51,18 +60,18 @@ bool AArch64MachObjectWriter::getAArch64FixupKindMachOInfo(
return false;
case FK_Data_1:
- Log2Size = llvm::Log2_32(1);
+ Log2Size = Log2_32(1);
return true;
case FK_Data_2:
- Log2Size = llvm::Log2_32(2);
+ Log2Size = Log2_32(2);
return true;
case FK_Data_4:
- Log2Size = llvm::Log2_32(4);
+ Log2Size = Log2_32(4);
if (Sym->getKind() == MCSymbolRefExpr::VK_GOT)
RelocType = unsigned(MachO::ARM64_RELOC_POINTER_TO_GOT);
return true;
case FK_Data_8:
- Log2Size = llvm::Log2_32(8);
+ Log2Size = Log2_32(8);
if (Sym->getKind() == MCSymbolRefExpr::VK_GOT)
RelocType = unsigned(MachO::ARM64_RELOC_POINTER_TO_GOT);
return true;
@@ -72,7 +81,7 @@ bool AArch64MachObjectWriter::getAArch64FixupKindMachOInfo(
case AArch64::fixup_aarch64_ldst_imm12_scale4:
case AArch64::fixup_aarch64_ldst_imm12_scale8:
case AArch64::fixup_aarch64_ldst_imm12_scale16:
- Log2Size = llvm::Log2_32(4);
+ Log2Size = Log2_32(4);
switch (Sym->getKind()) {
default:
return false;
@@ -87,14 +96,13 @@ bool AArch64MachObjectWriter::getAArch64FixupKindMachOInfo(
return true;
}
case AArch64::fixup_aarch64_pcrel_adrp_imm21:
- Log2Size = llvm::Log2_32(4);
+ Log2Size = Log2_32(4);
// This encompasses the relocation for the whole 21-bit value.
switch (Sym->getKind()) {
- default: {
+ default:
Asm.getContext().reportError(Fixup.getLoc(),
"ADR/ADRP relocations must be GOT relative");
return false;
- }
case MCSymbolRefExpr::VK_PAGE:
RelocType = unsigned(MachO::ARM64_RELOC_PAGE21);
return true;
@@ -108,7 +116,7 @@ bool AArch64MachObjectWriter::getAArch64FixupKindMachOInfo(
return true;
case AArch64::fixup_aarch64_pcrel_branch26:
case AArch64::fixup_aarch64_pcrel_call26:
- Log2Size = llvm::Log2_32(4);
+ Log2Size = Log2_32(4);
RelocType = unsigned(MachO::ARM64_RELOC_BRANCH26);
return true;
}
diff --git a/lib/Target/AArch64/Utils/AArch64BaseInfo.h b/lib/Target/AArch64/Utils/AArch64BaseInfo.h
index dcc39176031c..5d76681cd97b 100644
--- a/lib/Target/AArch64/Utils/AArch64BaseInfo.h
+++ b/lib/Target/AArch64/Utils/AArch64BaseInfo.h
@@ -266,82 +266,86 @@ inline static unsigned getNZCVToSatisfyCondCode(CondCode Code) {
}
} // end namespace AArch64CC
+struct SysAlias {
+ const char *Name;
+ uint16_t Encoding;
+ FeatureBitset FeaturesRequired;
+
+  SysAlias(const char *N, uint16_t E) : Name(N), Encoding(E) {}
+  SysAlias(const char *N, uint16_t E, FeatureBitset F) :
+    Name(N), Encoding(E), FeaturesRequired(F) {}
+
+ bool haveFeatures(FeatureBitset ActiveFeatures) const {
+ return (FeaturesRequired & ActiveFeatures) == FeaturesRequired;
+ }
+
+ FeatureBitset getRequiredFeatures() const { return FeaturesRequired; }
+};
+
+struct SysAliasReg : SysAlias {
+ bool NeedsReg;
+  SysAliasReg(const char *N, uint16_t E, bool R)
+    : SysAlias(N, E), NeedsReg(R) {}
+};
+
namespace AArch64AT{
- struct AT {
- const char *Name;
- uint16_t Encoding;
+ struct AT : SysAlias {
+ using SysAlias::SysAlias;
};
-
#define GET_AT_DECL
#include "AArch64GenSystemOperands.inc"
-
}
+
namespace AArch64DB {
- struct DB {
- const char *Name;
- uint16_t Encoding;
+ struct DB : SysAlias {
+ using SysAlias::SysAlias;
};
-
#define GET_DB_DECL
#include "AArch64GenSystemOperands.inc"
}
namespace AArch64DC {
- struct DC {
- const char *Name;
- uint16_t Encoding;
+ struct DC : SysAlias {
+ using SysAlias::SysAlias;
};
-
#define GET_DC_DECL
#include "AArch64GenSystemOperands.inc"
}
namespace AArch64IC {
- struct IC {
- const char *Name;
- uint16_t Encoding;
- bool NeedsReg;
+ struct IC : SysAliasReg {
+ using SysAliasReg::SysAliasReg;
};
#define GET_IC_DECL
#include "AArch64GenSystemOperands.inc"
}
namespace AArch64ISB {
- struct ISB {
- const char *Name;
- uint16_t Encoding;
+ struct ISB : SysAlias {
+ using SysAlias::SysAlias;
};
#define GET_ISB_DECL
#include "AArch64GenSystemOperands.inc"
}
namespace AArch64PRFM {
- struct PRFM {
- const char *Name;
- uint16_t Encoding;
+ struct PRFM : SysAlias {
+ using SysAlias::SysAlias;
};
#define GET_PRFM_DECL
#include "AArch64GenSystemOperands.inc"
}
namespace AArch64PState {
- struct PState {
- const char *Name;
- uint16_t Encoding;
- FeatureBitset FeaturesRequired;
-
- bool haveFeatures(FeatureBitset ActiveFeatures) const {
- return (FeaturesRequired & ActiveFeatures) == FeaturesRequired;
- }
+  struct PState : SysAlias {
+ using SysAlias::SysAlias;
};
#define GET_PSTATE_DECL
#include "AArch64GenSystemOperands.inc"
}
namespace AArch64PSBHint {
- struct PSB {
- const char *Name;
- uint16_t Encoding;
+ struct PSB : SysAlias {
+ using SysAlias::SysAlias;
};
#define GET_PSB_DECL
#include "AArch64GenSystemOperands.inc"
@@ -451,10 +455,8 @@ namespace AArch64SysReg {
}
namespace AArch64TLBI {
- struct TLBI {
- const char *Name;
- uint16_t Encoding;
- bool NeedsReg;
+ struct TLBI : SysAliasReg {
+ using SysAliasReg::SysAliasReg;
};
#define GET_TLBI_DECL
#include "AArch64GenSystemOperands.inc"
diff --git a/lib/Target/AMDGPU/AMDGPU.h b/lib/Target/AMDGPU/AMDGPU.h
index 7b0a7f4b6058..8f6e1e7d8846 100644
--- a/lib/Target/AMDGPU/AMDGPU.h
+++ b/lib/Target/AMDGPU/AMDGPU.h
@@ -11,6 +11,7 @@
#ifndef LLVM_LIB_TARGET_AMDGPU_AMDGPU_H
#define LLVM_LIB_TARGET_AMDGPU_AMDGPU_H
+#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
#include "llvm/Target/TargetMachine.h"
namespace llvm {
@@ -23,6 +24,7 @@ class Pass;
class Target;
class TargetMachine;
class PassRegistry;
+class Module;
// R600 Passes
FunctionPass *createR600VectorRegMerger(TargetMachine &tm);
@@ -37,6 +39,7 @@ FunctionPass *createAMDGPUCFGStructurizerPass();
FunctionPass *createSITypeRewriter();
FunctionPass *createSIAnnotateControlFlowPass();
FunctionPass *createSIFoldOperandsPass();
+FunctionPass *createSIPeepholeSDWAPass();
FunctionPass *createSILowerI1CopiesPass();
FunctionPass *createSIShrinkInstructionsPass();
FunctionPass *createSILoadStoreOptimizerPass(TargetMachine &tm);
@@ -45,21 +48,32 @@ FunctionPass *createSIFixControlFlowLiveIntervalsPass();
FunctionPass *createSIFixSGPRCopiesPass();
FunctionPass *createSIDebuggerInsertNopsPass();
FunctionPass *createSIInsertWaitsPass();
+FunctionPass *createSIInsertWaitcntsPass();
FunctionPass *createAMDGPUCodeGenPreparePass(const GCNTargetMachine *TM = nullptr);
-ModulePass *createAMDGPUAnnotateKernelFeaturesPass();
+ModulePass *createAMDGPUAnnotateKernelFeaturesPass(const TargetMachine *TM = nullptr);
void initializeAMDGPUAnnotateKernelFeaturesPass(PassRegistry &);
extern char &AMDGPUAnnotateKernelFeaturesID;
+ModulePass *createAMDGPULowerIntrinsicsPass(const TargetMachine *TM = nullptr);
+void initializeAMDGPULowerIntrinsicsPass(PassRegistry &);
+extern char &AMDGPULowerIntrinsicsID;
+
void initializeSIFoldOperandsPass(PassRegistry &);
extern char &SIFoldOperandsID;
+void initializeSIPeepholeSDWAPass(PassRegistry &);
+extern char &SIPeepholeSDWAID;
+
void initializeSIShrinkInstructionsPass(PassRegistry&);
extern char &SIShrinkInstructionsID;
void initializeSIFixSGPRCopiesPass(PassRegistry &);
extern char &SIFixSGPRCopiesID;
+void initializeSIFixVGPRCopiesPass(PassRegistry &);
+extern char &SIFixVGPRCopiesID;
+
void initializeSILowerI1CopiesPass(PassRegistry &);
extern char &SILowerI1CopiesID;
@@ -86,11 +100,11 @@ extern char &AMDGPUPromoteAllocaID;
Pass *createAMDGPUStructurizeCFGPass();
FunctionPass *createAMDGPUISelDag(TargetMachine &TM,
CodeGenOpt::Level OptLevel);
-ModulePass *createAMDGPUAlwaysInlinePass();
+ModulePass *createAMDGPUAlwaysInlinePass(bool GlobalOpt = true);
ModulePass *createAMDGPUOpenCLImageTypeLoweringPass();
FunctionPass *createAMDGPUAnnotateUniformValues();
-FunctionPass* createAMDGPUUnifyMetadataPass();
+ModulePass* createAMDGPUUnifyMetadataPass();
void initializeAMDGPUUnifyMetadataPass(PassRegistry&);
extern char &AMDGPUUnifyMetadataID;
@@ -112,6 +126,15 @@ extern char &SIDebuggerInsertNopsID;
void initializeSIInsertWaitsPass(PassRegistry&);
extern char &SIInsertWaitsID;
+void initializeSIInsertWaitcntsPass(PassRegistry&);
+extern char &SIInsertWaitcntsID;
+
+void initializeAMDGPUUnifyDivergentExitNodesPass(PassRegistry&);
+extern char &AMDGPUUnifyDivergentExitNodesID;
+
+ImmutablePass *createAMDGPUAAWrapperPass();
+void initializeAMDGPUAAWrapperPassPass(PassRegistry&);
+
Target &getTheAMDGPUTarget();
Target &getTheGCNTarget();
@@ -133,43 +156,53 @@ enum TargetIndex {
/// however on the GPU, each address space points to
/// a separate piece of memory that is unique from other
/// memory locations.
-namespace AMDGPUAS {
-enum AddressSpaces : unsigned {
- PRIVATE_ADDRESS = 0, ///< Address space for private memory.
- GLOBAL_ADDRESS = 1, ///< Address space for global memory (RAT0, VTX0).
- CONSTANT_ADDRESS = 2, ///< Address space for constant memory (VTX2)
- LOCAL_ADDRESS = 3, ///< Address space for local memory.
- FLAT_ADDRESS = 4, ///< Address space for flat memory.
- REGION_ADDRESS = 5, ///< Address space for region memory.
- PARAM_D_ADDRESS = 6, ///< Address space for direct addressible parameter memory (CONST0)
- PARAM_I_ADDRESS = 7, ///< Address space for indirect addressible parameter memory (VTX1)
+struct AMDGPUAS {
+ // The following address space values depend on the triple environment.
+ unsigned PRIVATE_ADDRESS; ///< Address space for private memory.
+ unsigned FLAT_ADDRESS; ///< Address space for flat memory.
+ unsigned REGION_ADDRESS; ///< Address space for region memory.
+
+ // The maximum value for flat, generic, local, private, constant and region.
+ const static unsigned MAX_COMMON_ADDRESS = 5;
+
+ const static unsigned GLOBAL_ADDRESS = 1; ///< Address space for global memory (RAT0, VTX0).
+ const static unsigned CONSTANT_ADDRESS = 2; ///< Address space for constant memory (VTX2)
+ const static unsigned LOCAL_ADDRESS = 3; ///< Address space for local memory.
+  const static unsigned PARAM_D_ADDRESS = 6; ///< Address space for direct addressable parameter memory (CONST0)
+  const static unsigned PARAM_I_ADDRESS = 7; ///< Address space for indirect addressable parameter memory (VTX1)
// Do not re-order the CONSTANT_BUFFER_* enums. Several places depend on this
// order to be able to dynamically index a constant buffer, for example:
//
// ConstantBufferAS = CONSTANT_BUFFER_0 + CBIdx
- CONSTANT_BUFFER_0 = 8,
- CONSTANT_BUFFER_1 = 9,
- CONSTANT_BUFFER_2 = 10,
- CONSTANT_BUFFER_3 = 11,
- CONSTANT_BUFFER_4 = 12,
- CONSTANT_BUFFER_5 = 13,
- CONSTANT_BUFFER_6 = 14,
- CONSTANT_BUFFER_7 = 15,
- CONSTANT_BUFFER_8 = 16,
- CONSTANT_BUFFER_9 = 17,
- CONSTANT_BUFFER_10 = 18,
- CONSTANT_BUFFER_11 = 19,
- CONSTANT_BUFFER_12 = 20,
- CONSTANT_BUFFER_13 = 21,
- CONSTANT_BUFFER_14 = 22,
- CONSTANT_BUFFER_15 = 23,
+ const static unsigned CONSTANT_BUFFER_0 = 8;
+ const static unsigned CONSTANT_BUFFER_1 = 9;
+ const static unsigned CONSTANT_BUFFER_2 = 10;
+ const static unsigned CONSTANT_BUFFER_3 = 11;
+ const static unsigned CONSTANT_BUFFER_4 = 12;
+ const static unsigned CONSTANT_BUFFER_5 = 13;
+ const static unsigned CONSTANT_BUFFER_6 = 14;
+ const static unsigned CONSTANT_BUFFER_7 = 15;
+ const static unsigned CONSTANT_BUFFER_8 = 16;
+ const static unsigned CONSTANT_BUFFER_9 = 17;
+ const static unsigned CONSTANT_BUFFER_10 = 18;
+ const static unsigned CONSTANT_BUFFER_11 = 19;
+ const static unsigned CONSTANT_BUFFER_12 = 20;
+ const static unsigned CONSTANT_BUFFER_13 = 21;
+ const static unsigned CONSTANT_BUFFER_14 = 22;
+ const static unsigned CONSTANT_BUFFER_15 = 23;
// Some places use this if the address space can't be determined.
- UNKNOWN_ADDRESS_SPACE = ~0u
+ const static unsigned UNKNOWN_ADDRESS_SPACE = ~0u;
};
-} // namespace AMDGPUAS
+namespace llvm {
+namespace AMDGPU {
+AMDGPUAS getAMDGPUAS(const Module &M);
+AMDGPUAS getAMDGPUAS(const TargetMachine &TM);
+AMDGPUAS getAMDGPUAS(Triple T);
+} // namespace AMDGPU
+} // namespace llvm
#endif
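With AMDGPUAS turned into a struct, the triple-dependent numbers (private, flat, region) become runtime fields filled in by the getAMDGPUAS overloads declared above, while the stable ones remain static constants. A hedged usage sketch, assuming those declarations:

    // Sketch only: assumes the AMDGPUAS declarations added above.
    #include "AMDGPU.h"
    #include "llvm/IR/Module.h"

    using namespace llvm;

    static bool isFlatOrPrivate(const Module &M, unsigned AddrSpace) {
      AMDGPUAS AS = AMDGPU::getAMDGPUAS(M); // fills triple-dependent fields
      return AddrSpace == AS.FLAT_ADDRESS || AddrSpace == AS.PRIVATE_ADDRESS;
    }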
diff --git a/lib/Target/AMDGPU/AMDGPU.td b/lib/Target/AMDGPU/AMDGPU.td
index 13022009af16..2c7a2d8962d0 100644
--- a/lib/Target/AMDGPU/AMDGPU.td
+++ b/lib/Target/AMDGPU/AMDGPU.td
@@ -67,12 +67,24 @@ def FeatureUnalignedBufferAccess : SubtargetFeature<"unaligned-buffer-access",
"Support unaligned global loads and stores"
>;
+def FeatureTrapHandler : SubtargetFeature<"trap-handler",
+ "TrapHandler",
+ "true",
+ "Trap handler support"
+>;
+
def FeatureUnalignedScratchAccess : SubtargetFeature<"unaligned-scratch-access",
"UnalignedScratchAccess",
"true",
"Support unaligned scratch loads and stores"
>;
+def FeatureApertureRegs : SubtargetFeature<"aperture-regs",
+ "HasApertureRegs",
+ "true",
+ "Has Memory Aperture Base and Size Registers"
+>;
+
// XNACK is disabled if SH_MEM_CONFIG.ADDRESS_MODE = GPUVM on chips that support
// XNACK. The current default kernel driver setting is:
// - graphics ring: XNACK disabled
@@ -154,6 +166,12 @@ def FeatureCIInsts : SubtargetFeature<"ci-insts",
"Additional intstructions for CI+"
>;
+def FeatureGFX9Insts : SubtargetFeature<"gfx9-insts",
+ "GFX9Insts",
+ "true",
+ "Additional intstructions for GFX9+"
+>;
+
def FeatureSMemRealTime : SubtargetFeature<"s-memrealtime",
"HasSMemRealTime",
"true",
@@ -172,6 +190,12 @@ def Feature16BitInsts : SubtargetFeature<"16-bit-insts",
"Has i16/f16 instructions"
>;
+def FeatureVOP3P : SubtargetFeature<"vop3p",
+ "HasVOP3PInsts",
+ "true",
+ "Has VOP3P packed instructions"
+>;
+
def FeatureMovrel : SubtargetFeature<"movrel",
"HasMovrel",
"true",
@@ -190,16 +214,22 @@ def FeatureScalarStores : SubtargetFeature<"scalar-stores",
"Has store scalar memory instructions"
>;
-//===------------------------------------------------------------===//
-// Subtarget Features (options and debugging)
-//===------------------------------------------------------------===//
+def FeatureSDWA : SubtargetFeature<"sdwa",
+ "HasSDWA",
+ "true",
+ "Support SDWA (Sub-DWORD Addressing) extension"
+>;
-def FeatureFP16Denormals : SubtargetFeature<"fp16-denormals",
- "FP16Denormals",
+def FeatureDPP : SubtargetFeature<"dpp",
+ "HasDPP",
"true",
- "Enable half precision denormal handling"
+ "Support DPP (Data Parallel Primitives) extension"
>;
+//===------------------------------------------------------------===//
+// Subtarget Features (options and debugging)
+//===------------------------------------------------------------===//
+
// Some instructions do not support denormals despite this flag. Using
// fp32 denormals also causes instructions to run at the double
// precision rate for the device.
@@ -209,13 +239,36 @@ def FeatureFP32Denormals : SubtargetFeature<"fp32-denormals",
"Enable single precision denormal handling"
>;
-def FeatureFP64Denormals : SubtargetFeature<"fp64-denormals",
- "FP64Denormals",
+// Denormal handling for fp64 and fp16 is controlled by the same
+// config register when fp16 is supported.
+// TODO: Do we need a separate f16 setting when not legal?
+def FeatureFP64FP16Denormals : SubtargetFeature<"fp64-fp16-denormals",
+ "FP64FP16Denormals",
"true",
- "Enable double precision denormal handling",
+ "Enable double and half precision denormal handling",
[FeatureFP64]
>;
+def FeatureFP64Denormals : SubtargetFeature<"fp64-denormals",
+ "FP64FP16Denormals",
+ "true",
+ "Enable double and half precision denormal handling",
+ [FeatureFP64, FeatureFP64FP16Denormals]
+>;
+
+def FeatureFP16Denormals : SubtargetFeature<"fp16-denormals",
+ "FP64FP16Denormals",
+ "true",
+ "Enable half precision denormal handling",
+ [FeatureFP64FP16Denormals]
+>;
+
+def FeatureDX10Clamp : SubtargetFeature<"dx10-clamp",
+ "DX10Clamp",
+ "true",
+ "clamp modifier clamps NaNs to 0.0"
+>;
+
def FeatureFPExceptions : SubtargetFeature<"fp-exceptions",
"FPExceptions",
"true",
@@ -343,7 +396,17 @@ def FeatureVolcanicIslands : SubtargetFeatureGeneration<"VOLCANIC_ISLANDS",
FeatureWavefrontSize64, FeatureFlatAddressSpace, FeatureGCN,
FeatureGCN3Encoding, FeatureCIInsts, Feature16BitInsts,
FeatureSMemRealTime, FeatureVGPRIndexMode, FeatureMovrel,
- FeatureScalarStores, FeatureInv2PiInlineImm
+ FeatureScalarStores, FeatureInv2PiInlineImm, FeatureSDWA,
+ FeatureDPP
+ ]
+>;
+
+def FeatureGFX9 : SubtargetFeatureGeneration<"GFX9",
+ [FeatureFP64, FeatureLocalMemorySize65536,
+ FeatureWavefrontSize64, FeatureFlatAddressSpace, FeatureGCN,
+ FeatureGCN3Encoding, FeatureCIInsts, Feature16BitInsts,
+ FeatureSMemRealTime, FeatureScalarStores, FeatureInv2PiInlineImm,
+ FeatureApertureRegs, FeatureGFX9Insts, FeatureVOP3P, FeatureVGPRIndexMode
]
>;
@@ -399,6 +462,9 @@ def FeatureISAVersion8_1_0 : SubtargetFeatureISAVersion <8,1,0,
FeatureLDSBankCount16,
FeatureXNACK]>;
+def FeatureISAVersion9_0_0 : SubtargetFeatureISAVersion <9,0,0,[]>;
+def FeatureISAVersion9_0_1 : SubtargetFeatureISAVersion <9,0,1,[]>;
+
//===----------------------------------------------------------------------===//
// Debugger related subtarget features.
//===----------------------------------------------------------------------===//
@@ -504,14 +570,27 @@ def isVI : Predicate <
"Subtarget->getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS">,
AssemblerPredicate<"FeatureGCN3Encoding">;
+def isGFX9 : Predicate <
+ "Subtarget->getGeneration() >= AMDGPUSubtarget::GFX9">,
+ AssemblerPredicate<"FeatureGFX9Insts">;
+
+// TODO: Either change the name or simply use IsCI!
def isCIVI : Predicate <
- "Subtarget->getGeneration() == AMDGPUSubtarget::SEA_ISLANDS || "
- "Subtarget->getGeneration() == AMDGPUSubtarget::VOLCANIC_ISLANDS"
->, AssemblerPredicate<"FeatureCIInsts">;
+ "Subtarget->getGeneration() >= AMDGPUSubtarget::SEA_ISLANDS">,
+ AssemblerPredicate<"FeatureCIInsts">;
def HasFlatAddressSpace : Predicate<"Subtarget->hasFlatAddressSpace()">;
-def Has16BitInsts : Predicate<"Subtarget->has16BitInsts()">;
+def Has16BitInsts : Predicate<"Subtarget->has16BitInsts()">,
+ AssemblerPredicate<"Feature16BitInsts">;
+def HasVOP3PInsts : Predicate<"Subtarget->hasVOP3PInsts()">,
+ AssemblerPredicate<"FeatureVOP3P">;
+
+def HasSDWA : Predicate<"Subtarget->hasSDWA()">,
+ AssemblerPredicate<"FeatureSDWA">;
+
+def HasDPP : Predicate<"Subtarget->hasDPP()">,
+ AssemblerPredicate<"FeatureDPP">;
class PredicateControl {
Predicate SubtargetPredicate;
@@ -532,5 +611,6 @@ include "Processors.td"
include "AMDGPUInstrInfo.td"
include "AMDGPUIntrinsics.td"
include "AMDGPURegisterInfo.td"
+include "AMDGPURegisterBanks.td"
include "AMDGPUInstructions.td"
include "AMDGPUCallingConv.td"
diff --git a/lib/Target/AMDGPU/AMDGPUAliasAnalysis.cpp b/lib/Target/AMDGPU/AMDGPUAliasAnalysis.cpp
new file mode 100644
index 000000000000..3c99f48e818a
--- /dev/null
+++ b/lib/Target/AMDGPU/AMDGPUAliasAnalysis.cpp
@@ -0,0 +1,147 @@
+//===- AMDGPUAliasAnalysis ---------------------------------------*- C++ -*-==//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+/// \file
+/// This is the AMDGPU address space based alias analysis pass.
+//===----------------------------------------------------------------------===//
+
+#include "AMDGPU.h"
+#include "AMDGPUAliasAnalysis.h"
+#include "llvm/Analysis/AliasAnalysis.h"
+#include "llvm/Analysis/ValueTracking.h"
+#include "llvm/Analysis/Passes.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/IR/Function.h"
+#include "llvm/IR/Module.h"
+#include "llvm/Pass.h"
+
+using namespace llvm;
+
+#define DEBUG_TYPE "amdgpu-aa"
+
+// Register this pass...
+char AMDGPUAAWrapperPass::ID = 0;
+INITIALIZE_PASS(AMDGPUAAWrapperPass, "amdgpu-aa",
+ "AMDGPU Address space based Alias Analysis", false, true)
+
+ImmutablePass *llvm::createAMDGPUAAWrapperPass() {
+ return new AMDGPUAAWrapperPass();
+}
+
+void AMDGPUAAWrapperPass::getAnalysisUsage(AnalysisUsage &AU) const {
+ AU.setPreservesAll();
+}
+
+// Must match the table in getAliasResult.
+AMDGPUAAResult::ASAliasRulesTy::ASAliasRulesTy(AMDGPUAS AS_, Triple::ArchType Arch_)
+ : Arch(Arch_), AS(AS_) {
+  // These arrays are indexed by address space values 0 to 5.
+ static const AliasResult ASAliasRulesPrivIsZero[6][6] = {
+ /* Private Global Constant Group Flat Region*/
+ /* Private */ {MayAlias, NoAlias , NoAlias , NoAlias , MayAlias, NoAlias},
+ /* Global */ {NoAlias , MayAlias, NoAlias , NoAlias , MayAlias, NoAlias},
+ /* Constant */ {NoAlias , NoAlias , MayAlias, NoAlias , MayAlias, NoAlias},
+ /* Group */ {NoAlias , NoAlias , NoAlias , MayAlias, MayAlias, NoAlias},
+ /* Flat */ {MayAlias, MayAlias, MayAlias, MayAlias, MayAlias, MayAlias},
+ /* Region */ {NoAlias , NoAlias , NoAlias , NoAlias , MayAlias, MayAlias}
+ };
+ static const AliasResult ASAliasRulesGenIsZero[6][6] = {
+ /* Flat Global Constant Group Region Private */
+ /* Flat */ {MayAlias, MayAlias, MayAlias, MayAlias, MayAlias, MayAlias},
+ /* Global */ {MayAlias, MayAlias, NoAlias , NoAlias , NoAlias , NoAlias},
+  /* Constant */ {MayAlias, NoAlias , MayAlias, NoAlias , NoAlias , NoAlias},
+  /* Group    */ {MayAlias, NoAlias , NoAlias , MayAlias, NoAlias , NoAlias},
+  /* Region   */ {MayAlias, NoAlias , NoAlias , NoAlias , MayAlias, NoAlias},
+ /* Private */ {MayAlias, NoAlias , NoAlias , NoAlias , NoAlias , MayAlias}
+ };
+ assert(AS.MAX_COMMON_ADDRESS <= 5);
+ if (AS.FLAT_ADDRESS == 0) {
+ assert(AS.GLOBAL_ADDRESS == 1 &&
+ AS.REGION_ADDRESS == 4 &&
+ AS.LOCAL_ADDRESS == 3 &&
+ AS.CONSTANT_ADDRESS == 2 &&
+ AS.PRIVATE_ADDRESS == 5);
+ ASAliasRules = &ASAliasRulesGenIsZero;
+ } else {
+ assert(AS.PRIVATE_ADDRESS == 0 &&
+ AS.GLOBAL_ADDRESS == 1 &&
+ AS.CONSTANT_ADDRESS == 2 &&
+ AS.LOCAL_ADDRESS == 3 &&
+ AS.FLAT_ADDRESS == 4 &&
+ AS.REGION_ADDRESS == 5);
+ ASAliasRules = &ASAliasRulesPrivIsZero;
+ }
+}
+
+AliasResult AMDGPUAAResult::ASAliasRulesTy::getAliasResult(unsigned AS1,
+ unsigned AS2) const {
+ if (AS1 > AS.MAX_COMMON_ADDRESS || AS2 > AS.MAX_COMMON_ADDRESS) {
+ if (Arch == Triple::amdgcn)
+ report_fatal_error("Pointer address space out of range");
+ return AS1 == AS2 ? MayAlias : NoAlias;
+ }
+
+ return (*ASAliasRules)[AS1][AS2];
+}
+
+AliasResult AMDGPUAAResult::alias(const MemoryLocation &LocA,
+ const MemoryLocation &LocB) {
+ unsigned asA = LocA.Ptr->getType()->getPointerAddressSpace();
+ unsigned asB = LocB.Ptr->getType()->getPointerAddressSpace();
+
+ AliasResult Result = ASAliasRules.getAliasResult(asA, asB);
+ if (Result == NoAlias) return Result;
+
+ // Forward the query to the next alias analysis.
+ return AAResultBase::alias(LocA, LocB);
+}
+
+bool AMDGPUAAResult::pointsToConstantMemory(const MemoryLocation &Loc,
+ bool OrLocal) {
+ const Value *Base = GetUnderlyingObject(Loc.Ptr, DL);
+
+ if (Base->getType()->getPointerAddressSpace() == AS.CONSTANT_ADDRESS) {
+ return true;
+ }
+
+ if (const GlobalVariable *GV = dyn_cast<GlobalVariable>(Base)) {
+ if (GV->isConstant())
+ return true;
+ } else if (const Argument *Arg = dyn_cast<Argument>(Base)) {
+ const Function *F = Arg->getParent();
+
+ // Only assume constant memory for arguments on kernels.
+ switch (F->getCallingConv()) {
+ default:
+ return AAResultBase::pointsToConstantMemory(Loc, OrLocal);
+ case CallingConv::AMDGPU_VS:
+ case CallingConv::AMDGPU_GS:
+ case CallingConv::AMDGPU_PS:
+ case CallingConv::AMDGPU_CS:
+ case CallingConv::AMDGPU_KERNEL:
+ case CallingConv::SPIR_KERNEL:
+ break;
+ }
+
+ unsigned ArgNo = Arg->getArgNo();
+    /* On an argument, the ReadOnly attribute indicates that the function
+       does not write through this pointer argument, even though it may
+       write to the memory that the pointer points to.
+       On an argument, the ReadNone attribute indicates that the function
+       does not dereference that pointer argument, even though it may read
+       or write the memory that the pointer points to if accessed through
+       other pointers. */
+ if (F->hasParamAttribute(ArgNo, Attribute::NoAlias) &&
+ (F->hasParamAttribute(ArgNo, Attribute::ReadNone) ||
+ F->hasParamAttribute(ArgNo, Attribute::ReadOnly))) {
+ return true;
+ }
+ }
+ return AAResultBase::pointsToConstantMemory(Loc, OrLocal);
+}
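Since alias(A, B) and alias(B, A) must agree, the two 6x6 tables above have to be symmetric (they are), and getAliasResult is a plain table lookup after the range check. A small standalone sanity check of that invariant:

    #include <cassert>

    enum AliasResult { NoAlias, MayAlias };

    // The alias tables above must be symmetric: alias(A, B) == alias(B, A).
    static void checkSymmetric(const AliasResult (&Rules)[6][6]) {
      for (unsigned I = 0; I != 6; ++I)
        for (unsigned J = 0; J != 6; ++J)
          assert(Rules[I][J] == Rules[J][I] && "alias table must be symmetric");
    }

    int main() {
      // A trivially symmetric example table (identity-like).
      static const AliasResult T[6][6] = {
        {MayAlias, NoAlias,  NoAlias,  NoAlias,  NoAlias,  NoAlias},
        {NoAlias,  MayAlias, NoAlias,  NoAlias,  NoAlias,  NoAlias},
        {NoAlias,  NoAlias,  MayAlias, NoAlias,  NoAlias,  NoAlias},
        {NoAlias,  NoAlias,  NoAlias,  MayAlias, NoAlias,  NoAlias},
        {NoAlias,  NoAlias,  NoAlias,  NoAlias,  MayAlias, NoAlias},
        {NoAlias,  NoAlias,  NoAlias,  NoAlias,  NoAlias,  MayAlias},
      };
      checkSymmetric(T);
      return 0;
    }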
diff --git a/lib/Target/AMDGPU/AMDGPUAliasAnalysis.h b/lib/Target/AMDGPU/AMDGPUAliasAnalysis.h
new file mode 100644
index 000000000000..5f8ed9b1f9a3
--- /dev/null
+++ b/lib/Target/AMDGPU/AMDGPUAliasAnalysis.h
@@ -0,0 +1,102 @@
+//===- AMDGPUAliasAnalysis ---------------------------------------*- C++ -*-==//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+/// \file
+/// This is the AMDGPU address space based alias analysis pass.
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_ANALYSIS_AMDGPUALIASANALYSIS_H
+#define LLVM_ANALYSIS_AMDGPUALIASANALYSIS_H
+
+#include "AMDGPU.h"
+#include "llvm/Analysis/AliasAnalysis.h"
+#include "llvm/IR/Function.h"
+#include "llvm/IR/Module.h"
+#include "llvm/Pass.h"
+
+namespace llvm {
+
+/// A simple AA result that answers queries based on AMDGPU address spaces.
+class AMDGPUAAResult : public AAResultBase<AMDGPUAAResult> {
+ friend AAResultBase<AMDGPUAAResult>;
+
+ const DataLayout &DL;
+ AMDGPUAS AS;
+
+public:
+ explicit AMDGPUAAResult(const DataLayout &DL, Triple T) : AAResultBase(),
+ DL(DL), AS(AMDGPU::getAMDGPUAS(T)), ASAliasRules(AS, T.getArch()) {}
+ AMDGPUAAResult(AMDGPUAAResult &&Arg)
+ : AAResultBase(std::move(Arg)), DL(Arg.DL), AS(Arg.AS),
+      ASAliasRules(Arg.ASAliasRules) {}
+
+ /// Handle invalidation events from the new pass manager.
+ ///
+ /// By definition, this result is stateless and so remains valid.
+ bool invalidate(Function &, const PreservedAnalyses &) { return false; }
+
+ AliasResult alias(const MemoryLocation &LocA, const MemoryLocation &LocB);
+ bool pointsToConstantMemory(const MemoryLocation &Loc, bool OrLocal);
+
+private:
+ bool Aliases(const MDNode *A, const MDNode *B) const;
+ bool PathAliases(const MDNode *A, const MDNode *B) const;
+
+ class ASAliasRulesTy {
+ public:
+ ASAliasRulesTy(AMDGPUAS AS_, Triple::ArchType Arch_);
+ AliasResult getAliasResult(unsigned AS1, unsigned AS2) const;
+ private:
+ Triple::ArchType Arch;
+ AMDGPUAS AS;
+ const AliasResult (*ASAliasRules)[6][6];
+ } ASAliasRules;
+};
+
+/// Analysis pass providing a never-invalidated alias analysis result.
+class AMDGPUAA : public AnalysisInfoMixin<AMDGPUAA> {
+ friend AnalysisInfoMixin<AMDGPUAA>;
+ static char PassID;
+
+public:
+ typedef AMDGPUAAResult Result;
+
+ AMDGPUAAResult run(Function &F, AnalysisManager<Function> &AM) {
+ return AMDGPUAAResult(F.getParent()->getDataLayout(),
+ Triple(F.getParent()->getTargetTriple()));
+ }
+};
+
+/// Legacy wrapper pass to provide the AMDGPUAAResult object.
+class AMDGPUAAWrapperPass : public ImmutablePass {
+ std::unique_ptr<AMDGPUAAResult> Result;
+
+public:
+ static char ID;
+
+ AMDGPUAAWrapperPass() : ImmutablePass(ID) {
+ initializeAMDGPUAAWrapperPassPass(*PassRegistry::getPassRegistry());
+ }
+
+ AMDGPUAAResult &getResult() { return *Result; }
+ const AMDGPUAAResult &getResult() const { return *Result; }
+
+ bool doInitialization(Module &M) override {
+ Result.reset(new AMDGPUAAResult(M.getDataLayout(),
+ Triple(M.getTargetTriple())));
+ return false;
+ }
+ bool doFinalization(Module &M) override {
+ Result.reset();
+ return false;
+ }
+ void getAnalysisUsage(AnalysisUsage &AU) const override;
+};
+
+} // end namespace llvm
+#endif // LLVM_ANALYSIS_AMDGPUALIASANALYSIS_H
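Under the legacy pass manager, an analysis like this is typically wired in by adding the wrapper pass plus an ExternalAAWrapperPass callback that splices the result into each AAResults chain. A hedged sketch of that wiring (assuming the header above and LLVM's legacy PM API):

    // Sketch only: assumes AMDGPUAliasAnalysis.h above.
    #include "AMDGPUAliasAnalysis.h"
    #include "llvm/Analysis/AliasAnalysis.h"
    #include "llvm/IR/LegacyPassManager.h"

    using namespace llvm;

    static void addAMDGPUAA(legacy::PassManagerBase &PM) {
      PM.add(createAMDGPUAAWrapperPass());
      // Hook the result into every AAResults built later in the pipeline.
      PM.add(createExternalAAWrapperPass(
          [](Pass &P, Function &, AAResults &AAR) {
            if (auto *WP = P.getAnalysisIfAvailable<AMDGPUAAWrapperPass>())
              AAR.addAAResult(WP->getResult());
          }));
    }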
diff --git a/lib/Target/AMDGPU/AMDGPUAlwaysInlinePass.cpp b/lib/Target/AMDGPU/AMDGPUAlwaysInlinePass.cpp
index 067a16a2af7f..1d03714874e2 100644
--- a/lib/Target/AMDGPU/AMDGPUAlwaysInlinePass.cpp
+++ b/lib/Target/AMDGPU/AMDGPUAlwaysInlinePass.cpp
@@ -24,8 +24,10 @@ namespace {
class AMDGPUAlwaysInline : public ModulePass {
static char ID;
+ bool GlobalOpt;
+
public:
- AMDGPUAlwaysInline() : ModulePass(ID) { }
+ AMDGPUAlwaysInline(bool GlobalOpt) : ModulePass(ID), GlobalOpt(GlobalOpt) { }
bool runOnModule(Module &M) override;
StringRef getPassName() const override { return "AMDGPU Always Inline Pass"; }
};
@@ -45,8 +47,10 @@ bool AMDGPUAlwaysInline::runOnModule(Module &M) {
}
}
- for (GlobalAlias* A : AliasesToRemove) {
- A->eraseFromParent();
+ if (GlobalOpt) {
+ for (GlobalAlias* A : AliasesToRemove) {
+ A->eraseFromParent();
+ }
}
for (Function &F : M) {
@@ -70,6 +74,6 @@ bool AMDGPUAlwaysInline::runOnModule(Module &M) {
return false;
}
-ModulePass *llvm::createAMDGPUAlwaysInlinePass() {
- return new AMDGPUAlwaysInline();
+ModulePass *llvm::createAMDGPUAlwaysInlinePass(bool GlobalOpt) {
+ return new AMDGPUAlwaysInline(GlobalOpt);
}
diff --git a/lib/Target/AMDGPU/AMDGPUAnnotateKernelFeatures.cpp b/lib/Target/AMDGPU/AMDGPUAnnotateKernelFeatures.cpp
index c98d25e20185..3d8db7cd8af5 100644
--- a/lib/Target/AMDGPU/AMDGPUAnnotateKernelFeatures.cpp
+++ b/lib/Target/AMDGPU/AMDGPUAnnotateKernelFeatures.cpp
@@ -13,6 +13,7 @@
//===----------------------------------------------------------------------===//
#include "AMDGPU.h"
+#include "AMDGPUSubtarget.h"
#include "llvm/ADT/Triple.h"
#include "llvm/IR/Constants.h"
#include "llvm/IR/Instructions.h"
@@ -26,7 +27,9 @@ namespace {
class AMDGPUAnnotateKernelFeatures : public ModulePass {
private:
- static bool hasAddrSpaceCast(const Function &F);
+ const TargetMachine *TM;
+ AMDGPUAS AS;
+ static bool hasAddrSpaceCast(const Function &F, AMDGPUAS AS);
void addAttrToCallers(Function *Intrin, StringRef AttrName);
bool addAttrsForIntrinsics(Module &M, ArrayRef<StringRef[2]>);
@@ -34,7 +37,8 @@ private:
public:
static char ID;
- AMDGPUAnnotateKernelFeatures() : ModulePass(ID) { }
+ AMDGPUAnnotateKernelFeatures(const TargetMachine *TM_ = nullptr) :
+ ModulePass(ID), TM(TM_) {}
bool runOnModule(Module &M) override;
StringRef getPassName() const override {
return "AMDGPU Annotate Kernel Features";
@@ -45,10 +49,11 @@ public:
ModulePass::getAnalysisUsage(AU);
}
- static bool visitConstantExpr(const ConstantExpr *CE);
+ static bool visitConstantExpr(const ConstantExpr *CE, AMDGPUAS AS);
static bool visitConstantExprsRecursively(
const Constant *EntryC,
- SmallPtrSet<const Constant *, 8> &ConstantExprVisited);
+ SmallPtrSet<const Constant *, 8> &ConstantExprVisited,
+ AMDGPUAS AS);
};
}
@@ -62,18 +67,20 @@ INITIALIZE_PASS(AMDGPUAnnotateKernelFeatures, DEBUG_TYPE,
// The queue ptr is only needed when casting to flat, not from it.
-static bool castRequiresQueuePtr(unsigned SrcAS) {
- return SrcAS == AMDGPUAS::LOCAL_ADDRESS || SrcAS == AMDGPUAS::PRIVATE_ADDRESS;
+static bool castRequiresQueuePtr(unsigned SrcAS, const AMDGPUAS &AS) {
+ return SrcAS == AS.LOCAL_ADDRESS || SrcAS == AS.PRIVATE_ADDRESS;
}
-static bool castRequiresQueuePtr(const AddrSpaceCastInst *ASC) {
- return castRequiresQueuePtr(ASC->getSrcAddressSpace());
+static bool castRequiresQueuePtr(const AddrSpaceCastInst *ASC,
+ const AMDGPUAS &AS) {
+ return castRequiresQueuePtr(ASC->getSrcAddressSpace(), AS);
}
-bool AMDGPUAnnotateKernelFeatures::visitConstantExpr(const ConstantExpr *CE) {
+bool AMDGPUAnnotateKernelFeatures::visitConstantExpr(const ConstantExpr *CE,
+ AMDGPUAS AS) {
if (CE->getOpcode() == Instruction::AddrSpaceCast) {
unsigned SrcAS = CE->getOperand(0)->getType()->getPointerAddressSpace();
- return castRequiresQueuePtr(SrcAS);
+ return castRequiresQueuePtr(SrcAS, AS);
}
return false;
@@ -81,7 +88,8 @@ bool AMDGPUAnnotateKernelFeatures::visitConstantExpr(const ConstantExpr *CE) {
bool AMDGPUAnnotateKernelFeatures::visitConstantExprsRecursively(
const Constant *EntryC,
- SmallPtrSet<const Constant *, 8> &ConstantExprVisited) {
+ SmallPtrSet<const Constant *, 8> &ConstantExprVisited,
+ AMDGPUAS AS) {
if (!ConstantExprVisited.insert(EntryC).second)
return false;
@@ -94,7 +102,7 @@ bool AMDGPUAnnotateKernelFeatures::visitConstantExprsRecursively(
// Check this constant expression.
if (const auto *CE = dyn_cast<ConstantExpr>(C)) {
- if (visitConstantExpr(CE))
+ if (visitConstantExpr(CE, AS))
return true;
}
@@ -115,13 +123,14 @@ bool AMDGPUAnnotateKernelFeatures::visitConstantExprsRecursively(
}
// Return true if an addrspacecast is used that requires the queue ptr.
-bool AMDGPUAnnotateKernelFeatures::hasAddrSpaceCast(const Function &F) {
+bool AMDGPUAnnotateKernelFeatures::hasAddrSpaceCast(const Function &F,
+ AMDGPUAS AS) {
SmallPtrSet<const Constant *, 8> ConstantExprVisited;
for (const BasicBlock &BB : F) {
for (const Instruction &I : BB) {
if (const AddrSpaceCastInst *ASC = dyn_cast<AddrSpaceCastInst>(&I)) {
- if (castRequiresQueuePtr(ASC))
+ if (castRequiresQueuePtr(ASC, AS))
return true;
}
@@ -130,7 +139,7 @@ bool AMDGPUAnnotateKernelFeatures::hasAddrSpaceCast(const Function &F) {
if (!OpC)
continue;
- if (visitConstantExprsRecursively(OpC, ConstantExprVisited))
+ if (visitConstantExprsRecursively(OpC, ConstantExprVisited, AS))
return true;
}
}
@@ -170,6 +179,7 @@ bool AMDGPUAnnotateKernelFeatures::addAttrsForIntrinsics(
bool AMDGPUAnnotateKernelFeatures::runOnModule(Module &M) {
Triple TT(M.getTargetTriple());
+ AS = AMDGPU::getAMDGPUAS(M);
static const StringRef IntrinsicToAttr[][2] = {
// .x omitted
@@ -190,7 +200,9 @@ bool AMDGPUAnnotateKernelFeatures::runOnModule(Module &M) {
static const StringRef HSAIntrinsicToAttr[][2] = {
{ "llvm.amdgcn.dispatch.ptr", "amdgpu-dispatch-ptr" },
{ "llvm.amdgcn.queue.ptr", "amdgpu-queue-ptr" },
- { "llvm.amdgcn.dispatch.id", "amdgpu-dispatch-id" }
+ { "llvm.amdgcn.dispatch.id", "amdgpu-dispatch-id" },
+ { "llvm.trap", "amdgpu-queue-ptr" },
+ { "llvm.debugtrap", "amdgpu-queue-ptr" }
};
// TODO: We should not add the attributes if the known compile time workgroup
@@ -209,7 +221,9 @@ bool AMDGPUAnnotateKernelFeatures::runOnModule(Module &M) {
if (F.hasFnAttribute("amdgpu-queue-ptr"))
continue;
- if (hasAddrSpaceCast(F))
+ bool HasApertureRegs =
+ TM && TM->getSubtarget<AMDGPUSubtarget>(F).hasApertureRegs();
+ if (!HasApertureRegs && hasAddrSpaceCast(F, AS))
F.addFnAttr("amdgpu-queue-ptr");
}
}
@@ -217,6 +231,6 @@ bool AMDGPUAnnotateKernelFeatures::runOnModule(Module &M) {
return Changed;
}
-ModulePass *llvm::createAMDGPUAnnotateKernelFeaturesPass() {
- return new AMDGPUAnnotateKernelFeatures();
+ModulePass *llvm::createAMDGPUAnnotateKernelFeaturesPass(const TargetMachine *TM) {
+ return new AMDGPUAnnotateKernelFeatures(TM);
}
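The net effect of threading the TargetMachine through this pass: a cast into the flat address space from local or private needs the aperture base addresses, which come either from dedicated aperture registers (the new FeatureApertureRegs) or, failing that, from the queue pointer, so amdgpu-queue-ptr is only added when aperture registers are absent. Restated as a hypothetical predicate:

    // Hypothetical restatement of the decision made in runOnModule above.
    static bool needsQueuePtrForCast(unsigned SrcAS, const AMDGPUAS &AS,
                                     bool HasApertureRegs) {
      // Only casts from local/private to flat need the aperture bases.
      bool RequiresApertures =
          SrcAS == AS.LOCAL_ADDRESS || SrcAS == AS.PRIVATE_ADDRESS;
      // Hardware aperture registers make the queue pointer unnecessary.
      return RequiresApertures && !HasApertureRegs;
    }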
diff --git a/lib/Target/AMDGPU/AMDGPUAnnotateUniformValues.cpp b/lib/Target/AMDGPU/AMDGPUAnnotateUniformValues.cpp
index c011be6fa169..91b3649f5c39 100644
--- a/lib/Target/AMDGPU/AMDGPUAnnotateUniformValues.cpp
+++ b/lib/Target/AMDGPU/AMDGPUAnnotateUniformValues.cpp
@@ -37,6 +37,7 @@ class AMDGPUAnnotateUniformValues : public FunctionPass,
LoopInfo *LI;
DenseMap<Value*, GetElementPtrInst*> noClobberClones;
bool isKernelFunc;
+ AMDGPUAS AMDGPUASI;
public:
static char ID;
@@ -130,8 +131,8 @@ void AMDGPUAnnotateUniformValues::visitLoadInst(LoadInst &I) {
Value *Ptr = I.getPointerOperand();
if (!DA->isUniform(Ptr))
return;
- auto isGlobalLoad = [](LoadInst &Load)->bool {
- return Load.getPointerAddressSpace() == AMDGPUAS::GLOBAL_ADDRESS;
+ auto isGlobalLoad = [&](LoadInst &Load)->bool {
+ return Load.getPointerAddressSpace() == AMDGPUASI.GLOBAL_ADDRESS;
};
// We're tracking up to the Function boundaries
// We cannot go beyond because of FunctionPass restrictions
@@ -166,6 +167,7 @@ void AMDGPUAnnotateUniformValues::visitLoadInst(LoadInst &I) {
}
bool AMDGPUAnnotateUniformValues::doInitialization(Module &M) {
+ AMDGPUASI = AMDGPU::getAMDGPUAS(M);
return false;
}
diff --git a/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp b/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp
index 974e79fff3d7..0446655830d1 100644
--- a/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp
+++ b/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp
@@ -17,11 +17,11 @@
//
#include "AMDGPUAsmPrinter.h"
+#include "AMDGPUTargetMachine.h"
#include "MCTargetDesc/AMDGPUTargetStreamer.h"
#include "InstPrinter/AMDGPUInstPrinter.h"
#include "Utils/AMDGPUBaseInfo.h"
#include "AMDGPU.h"
-#include "AMDKernelCodeT.h"
#include "AMDGPUSubtarget.h"
#include "R600Defines.h"
#include "R600MachineFunctionInfo.h"
@@ -93,33 +93,40 @@ extern "C" void LLVMInitializeAMDGPUAsmPrinter() {
AMDGPUAsmPrinter::AMDGPUAsmPrinter(TargetMachine &TM,
std::unique_ptr<MCStreamer> Streamer)
- : AsmPrinter(TM, std::move(Streamer)) {}
+ : AsmPrinter(TM, std::move(Streamer)) {
+ AMDGPUASI = static_cast<AMDGPUTargetMachine*>(&TM)->getAMDGPUAS();
+ }
StringRef AMDGPUAsmPrinter::getPassName() const {
return "AMDGPU Assembly Printer";
}
+const MCSubtargetInfo* AMDGPUAsmPrinter::getSTI() const {
+ return TM.getMCSubtargetInfo();
+}
+
+AMDGPUTargetStreamer& AMDGPUAsmPrinter::getTargetStreamer() const {
+ return static_cast<AMDGPUTargetStreamer&>(*OutStreamer->getTargetStreamer());
+}
+
void AMDGPUAsmPrinter::EmitStartOfAsmFile(Module &M) {
if (TM.getTargetTriple().getOS() != Triple::AMDHSA)
return;
- // Need to construct an MCSubtargetInfo here in case we have no functions
- // in the module.
- std::unique_ptr<MCSubtargetInfo> STI(TM.getTarget().createMCSubtargetInfo(
- TM.getTargetTriple().str(), TM.getTargetCPU(),
- TM.getTargetFeatureString()));
-
- AMDGPUTargetStreamer *TS =
- static_cast<AMDGPUTargetStreamer *>(OutStreamer->getTargetStreamer());
+ AMDGPU::IsaInfo::IsaVersion ISA =
+ AMDGPU::IsaInfo::getIsaVersion(getSTI()->getFeatureBits());
- TS->EmitDirectiveHSACodeObjectVersion(2, 1);
+ getTargetStreamer().EmitDirectiveHSACodeObjectVersion(2, 1);
+ getTargetStreamer().EmitDirectiveHSACodeObjectISA(
+ ISA.Major, ISA.Minor, ISA.Stepping, "AMD", "AMDGPU");
+ getTargetStreamer().EmitStartOfCodeObjectMetadata(M);
+}
- AMDGPU::IsaVersion ISA = AMDGPU::getIsaVersion(STI->getFeatureBits());
- TS->EmitDirectiveHSACodeObjectISA(ISA.Major, ISA.Minor, ISA.Stepping,
- "AMD", "AMDGPU");
+void AMDGPUAsmPrinter::EmitEndOfAsmFile(Module &M) {
+ if (TM.getTargetTriple().getOS() != Triple::AMDHSA)
+ return;
- // Emit runtime metadata.
- TS->EmitRuntimeMetadata(M);
+ getTargetStreamer().EmitEndOfCodeObjectMetadata();
}
bool AMDGPUAsmPrinter::isBlockOnlyReachableByFallthrough(
@@ -136,25 +143,32 @@ bool AMDGPUAsmPrinter::isBlockOnlyReachableByFallthrough(
return (MBB->back().getOpcode() != AMDGPU::S_SETPC_B64);
}
-
void AMDGPUAsmPrinter::EmitFunctionBodyStart() {
const AMDGPUSubtarget &STM = MF->getSubtarget<AMDGPUSubtarget>();
SIProgramInfo KernelInfo;
+ amd_kernel_code_t KernelCode;
if (STM.isAmdCodeObjectV2(*MF)) {
getSIProgramInfo(KernelInfo, *MF);
- EmitAmdKernelCodeT(*MF, KernelInfo);
+ getAmdKernelCode(KernelCode, KernelInfo, *MF);
+
+ OutStreamer->SwitchSection(getObjFileLowering().getTextSection());
+ getTargetStreamer().EmitAMDKernelCodeT(KernelCode);
}
+
+ if (TM.getTargetTriple().getOS() != Triple::AMDHSA)
+ return;
+ getTargetStreamer().EmitKernelCodeObjectMetadata(*MF->getFunction(),
+ KernelCode);
}
void AMDGPUAsmPrinter::EmitFunctionEntryLabel() {
const SIMachineFunctionInfo *MFI = MF->getInfo<SIMachineFunctionInfo>();
const AMDGPUSubtarget &STM = MF->getSubtarget<AMDGPUSubtarget>();
- if (MFI->isKernel() && STM.isAmdCodeObjectV2(*MF)) {
- AMDGPUTargetStreamer *TS =
- static_cast<AMDGPUTargetStreamer *>(OutStreamer->getTargetStreamer());
+ if (MFI->isEntryFunction() && STM.isAmdCodeObjectV2(*MF)) {
SmallString<128> SymbolName;
getNameWithPrefix(SymbolName, MF->getFunction()),
- TS->EmitAMDGPUSymbolType(SymbolName, ELF::STT_AMDGPU_HSA_KERNEL);
+ getTargetStreamer().EmitAMDGPUSymbolType(
+ SymbolName, ELF::STT_AMDGPU_HSA_KERNEL);
}
AsmPrinter::EmitFunctionEntryLabel();
@@ -163,7 +177,7 @@ void AMDGPUAsmPrinter::EmitFunctionEntryLabel() {
void AMDGPUAsmPrinter::EmitGlobalVariable(const GlobalVariable *GV) {
// Group segment variables aren't emitted in HSA.
- if (AMDGPU::isGroupSegment(GV))
+ if (AMDGPU::isGroupSegment(GV, AMDGPUASI))
return;
AsmPrinter::EmitGlobalVariable(GV);
@@ -247,6 +261,9 @@ bool AMDGPUAsmPrinter::runOnMachineFunction(MachineFunction &MF) {
OutStreamer->emitRawComment(" COMPUTE_PGM_RSRC2:USER_SGPR: " +
Twine(G_00B84C_USER_SGPR(KernelInfo.ComputePGMRSrc2)),
false);
+ OutStreamer->emitRawComment(" COMPUTE_PGM_RSRC2:TRAP_HANDLER: " +
+ Twine(G_00B84C_TRAP_HANDLER(KernelInfo.ComputePGMRSrc2)),
+ false);
OutStreamer->emitRawComment(" COMPUTE_PGM_RSRC2:TGID_X_EN: " +
Twine(G_00B84C_TGID_X_EN(KernelInfo.ComputePGMRSrc2)),
false);
@@ -382,6 +399,10 @@ void AMDGPUAsmPrinter::getSIProgramInfo(SIProgramInfo &ProgInfo,
case AMDGPU::EXEC_HI:
case AMDGPU::SCC:
case AMDGPU::M0:
+ case AMDGPU::SRC_SHARED_BASE:
+ case AMDGPU::SRC_SHARED_LIMIT:
+ case AMDGPU::SRC_PRIVATE_BASE:
+ case AMDGPU::SRC_PRIVATE_LIMIT:
continue;
case AMDGPU::VCC:
@@ -478,33 +499,20 @@ void AMDGPUAsmPrinter::getSIProgramInfo(SIProgramInfo &ProgInfo,
ExtraSGPRs = 6;
}
- // Record first reserved register and reserved register count fields, and
- // update max register counts if "amdgpu-debugger-reserve-regs" attribute was
- // requested.
- ProgInfo.ReservedVGPRFirst = STM.debuggerReserveRegs() ? MaxVGPR + 1 : 0;
- ProgInfo.ReservedVGPRCount = RI->getNumDebuggerReservedVGPRs(STM);
-
- // Update DebuggerWavefrontPrivateSegmentOffsetSGPR and
- // DebuggerPrivateSegmentBufferSGPR fields if "amdgpu-debugger-emit-prologue"
- // attribute was requested.
- if (STM.debuggerEmitPrologue()) {
- ProgInfo.DebuggerWavefrontPrivateSegmentOffsetSGPR =
- RI->getHWRegIndex(MFI->getScratchWaveOffsetReg());
- ProgInfo.DebuggerPrivateSegmentBufferSGPR =
- RI->getHWRegIndex(MFI->getScratchRSrcReg());
- }
+ unsigned ExtraVGPRs = STM.getReservedNumVGPRs(MF);
// Check the addressable register limit before we add ExtraSGPRs.
if (STM.getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS &&
!STM.hasSGPRInitBug()) {
- unsigned MaxAddressableNumSGPRs = STM.getMaxNumSGPRs();
+ unsigned MaxAddressableNumSGPRs = STM.getAddressableNumSGPRs();
if (MaxSGPR + 1 > MaxAddressableNumSGPRs) {
// This can happen due to a compiler bug or when using inline asm.
LLVMContext &Ctx = MF.getFunction()->getContext();
DiagnosticInfoResourceLimit Diag(*MF.getFunction(),
"addressable scalar registers",
MaxSGPR + 1, DS_Error,
- DK_ResourceLimit, MaxAddressableNumSGPRs);
+ DK_ResourceLimit,
+ MaxAddressableNumSGPRs);
Ctx.diagnose(Diag);
MaxSGPR = MaxAddressableNumSGPRs - 1;
}
@@ -512,41 +520,43 @@ void AMDGPUAsmPrinter::getSIProgramInfo(SIProgramInfo &ProgInfo,
// Account for extra SGPRs and VGPRs reserved for debugger use.
MaxSGPR += ExtraSGPRs;
- MaxVGPR += RI->getNumDebuggerReservedVGPRs(STM);
+ MaxVGPR += ExtraVGPRs;
// We found the maximum register index. They start at 0, so add one to get the
// number of registers.
- ProgInfo.NumVGPR = MaxVGPR + 1;
ProgInfo.NumSGPR = MaxSGPR + 1;
+ ProgInfo.NumVGPR = MaxVGPR + 1;
// Adjust number of registers used to meet default/requested minimum/maximum
// number of waves per execution unit request.
ProgInfo.NumSGPRsForWavesPerEU = std::max(
- ProgInfo.NumSGPR, RI->getMinNumSGPRs(STM, MFI->getMaxWavesPerEU()));
+ ProgInfo.NumSGPR, STM.getMinNumSGPRs(MFI->getMaxWavesPerEU()));
ProgInfo.NumVGPRsForWavesPerEU = std::max(
- ProgInfo.NumVGPR, RI->getMinNumVGPRs(MFI->getMaxWavesPerEU()));
+ ProgInfo.NumVGPR, STM.getMinNumVGPRs(MFI->getMaxWavesPerEU()));
if (STM.getGeneration() <= AMDGPUSubtarget::SEA_ISLANDS ||
STM.hasSGPRInitBug()) {
- unsigned MaxNumSGPRs = STM.getMaxNumSGPRs();
- if (ProgInfo.NumSGPR > MaxNumSGPRs) {
- // This can happen due to a compiler bug or when using inline asm to use the
- // registers which are usually reserved for vcc etc.
-
+ unsigned MaxAddressableNumSGPRs = STM.getAddressableNumSGPRs();
+ if (ProgInfo.NumSGPR > MaxAddressableNumSGPRs) {
+ // This can happen due to a compiler bug or when using inline asm to use
+ // the registers which are usually reserved for vcc etc.
LLVMContext &Ctx = MF.getFunction()->getContext();
DiagnosticInfoResourceLimit Diag(*MF.getFunction(),
"scalar registers",
ProgInfo.NumSGPR, DS_Error,
- DK_ResourceLimit, MaxNumSGPRs);
+ DK_ResourceLimit,
+ MaxAddressableNumSGPRs);
Ctx.diagnose(Diag);
- ProgInfo.NumSGPR = MaxNumSGPRs;
- ProgInfo.NumSGPRsForWavesPerEU = MaxNumSGPRs;
+ ProgInfo.NumSGPR = MaxAddressableNumSGPRs;
+ ProgInfo.NumSGPRsForWavesPerEU = MaxAddressableNumSGPRs;
}
}
if (STM.hasSGPRInitBug()) {
- ProgInfo.NumSGPR = SISubtarget::FIXED_SGPR_COUNT_FOR_INIT_BUG;
- ProgInfo.NumSGPRsForWavesPerEU = SISubtarget::FIXED_SGPR_COUNT_FOR_INIT_BUG;
+ ProgInfo.NumSGPR =
+ AMDGPU::IsaInfo::FIXED_NUM_SGPRS_FOR_INIT_BUG;
+ ProgInfo.NumSGPRsForWavesPerEU =
+ AMDGPU::IsaInfo::FIXED_NUM_SGPRS_FOR_INIT_BUG;
}
if (MFI->NumUserSGPRs > STM.getMaxNumUserSGPRs()) {
@@ -565,13 +575,27 @@ void AMDGPUAsmPrinter::getSIProgramInfo(SIProgramInfo &ProgInfo,
// SGPRBlocks is actual number of SGPR blocks minus 1.
ProgInfo.SGPRBlocks = alignTo(ProgInfo.NumSGPRsForWavesPerEU,
- RI->getSGPRAllocGranule());
- ProgInfo.SGPRBlocks = ProgInfo.SGPRBlocks / RI->getSGPRAllocGranule() - 1;
+ STM.getSGPREncodingGranule());
+ ProgInfo.SGPRBlocks = ProgInfo.SGPRBlocks / STM.getSGPREncodingGranule() - 1;
// VGPRBlocks is actual number of VGPR blocks minus 1.
ProgInfo.VGPRBlocks = alignTo(ProgInfo.NumVGPRsForWavesPerEU,
- RI->getVGPRAllocGranule());
- ProgInfo.VGPRBlocks = ProgInfo.VGPRBlocks / RI->getVGPRAllocGranule() - 1;
+ STM.getVGPREncodingGranule());
+ ProgInfo.VGPRBlocks = ProgInfo.VGPRBlocks / STM.getVGPREncodingGranule() - 1;
+
+ // Record first reserved VGPR and number of reserved VGPRs.
+ ProgInfo.ReservedVGPRFirst = STM.debuggerReserveRegs() ? MaxVGPR + 1 : 0;
+ ProgInfo.ReservedVGPRCount = STM.getReservedNumVGPRs(MF);
+
+ // Update DebuggerWavefrontPrivateSegmentOffsetSGPR and
+ // DebuggerPrivateSegmentBufferSGPR fields if "amdgpu-debugger-emit-prologue"
+ // attribute was requested.
+ if (STM.debuggerEmitPrologue()) {
+ ProgInfo.DebuggerWavefrontPrivateSegmentOffsetSGPR =
+ RI->getHWRegIndex(MFI->getScratchWaveOffsetReg());
+ ProgInfo.DebuggerPrivateSegmentBufferSGPR =
+ RI->getHWRegIndex(MFI->getScratchRSrcReg());
+ }
// Set the value to initialize FP_ROUND and FP_DENORM parts of the mode
// register.
@@ -580,7 +604,7 @@ void AMDGPUAsmPrinter::getSIProgramInfo(SIProgramInfo &ProgInfo,
ProgInfo.IEEEMode = STM.enableIEEEBit(MF);
// Make the clamp modifier return 0 on NaN input.
- ProgInfo.DX10Clamp = 1;
+ ProgInfo.DX10Clamp = STM.enableDX10Clamp();
const MachineFrameInfo &FrameInfo = MF.getFrameInfo();
ProgInfo.ScratchSize = FrameInfo.getStackSize();
@@ -635,6 +659,7 @@ void AMDGPUAsmPrinter::getSIProgramInfo(SIProgramInfo &ProgInfo,
ProgInfo.ComputePGMRSrc2 =
S_00B84C_SCRATCH_EN(ProgInfo.ScratchBlocks > 0) |
S_00B84C_USER_SGPR(MFI->getNumUserSGPRs()) |
+ S_00B84C_TRAP_HANDLER(STM.isTrapHandlerEnabled()) |
S_00B84C_TGID_X_EN(MFI->hasWorkGroupIDX()) |
S_00B84C_TGID_Y_EN(MFI->hasWorkGroupIDY()) |
S_00B84C_TGID_Z_EN(MFI->hasWorkGroupIDZ()) |
@@ -688,7 +713,7 @@ void AMDGPUAsmPrinter::EmitProgramInfoSI(const MachineFunction &MF,
OutStreamer->EmitIntValue(R_00B02C_SPI_SHADER_PGM_RSRC2_PS, 4);
OutStreamer->EmitIntValue(S_00B02C_EXTRA_LDS_SIZE(KernelInfo.LDSBlocks), 4);
OutStreamer->EmitIntValue(R_0286CC_SPI_PS_INPUT_ENA, 4);
- OutStreamer->EmitIntValue(MFI->PSInputEna, 4);
+ OutStreamer->EmitIntValue(MFI->getPSInputEnable(), 4);
OutStreamer->EmitIntValue(R_0286D0_SPI_PS_INPUT_ADDR, 4);
OutStreamer->EmitIntValue(MFI->getPSInputAddr(), 4);
}
@@ -713,97 +738,88 @@ static amd_element_byte_size_t getElementByteSizeValue(unsigned Size) {
}
}
-void AMDGPUAsmPrinter::EmitAmdKernelCodeT(const MachineFunction &MF,
- const SIProgramInfo &KernelInfo) const {
+void AMDGPUAsmPrinter::getAmdKernelCode(amd_kernel_code_t &Out,
+ const SIProgramInfo &KernelInfo,
+ const MachineFunction &MF) const {
const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
const SISubtarget &STM = MF.getSubtarget<SISubtarget>();
- amd_kernel_code_t header;
- AMDGPU::initDefaultAMDKernelCodeT(header, STM.getFeatureBits());
+ AMDGPU::initDefaultAMDKernelCodeT(Out, STM.getFeatureBits());
- header.compute_pgm_resource_registers =
+ Out.compute_pgm_resource_registers =
KernelInfo.ComputePGMRSrc1 |
(KernelInfo.ComputePGMRSrc2 << 32);
- header.code_properties = AMD_CODE_PROPERTY_IS_PTR64;
-
+ Out.code_properties = AMD_CODE_PROPERTY_IS_PTR64;
- AMD_HSA_BITS_SET(header.code_properties,
+ AMD_HSA_BITS_SET(Out.code_properties,
AMD_CODE_PROPERTY_PRIVATE_ELEMENT_SIZE,
getElementByteSizeValue(STM.getMaxPrivateElementSize()));
if (MFI->hasPrivateSegmentBuffer()) {
- header.code_properties |=
+ Out.code_properties |=
AMD_CODE_PROPERTY_ENABLE_SGPR_PRIVATE_SEGMENT_BUFFER;
}
if (MFI->hasDispatchPtr())
- header.code_properties |= AMD_CODE_PROPERTY_ENABLE_SGPR_DISPATCH_PTR;
+ Out.code_properties |= AMD_CODE_PROPERTY_ENABLE_SGPR_DISPATCH_PTR;
if (MFI->hasQueuePtr())
- header.code_properties |= AMD_CODE_PROPERTY_ENABLE_SGPR_QUEUE_PTR;
+ Out.code_properties |= AMD_CODE_PROPERTY_ENABLE_SGPR_QUEUE_PTR;
if (MFI->hasKernargSegmentPtr())
- header.code_properties |= AMD_CODE_PROPERTY_ENABLE_SGPR_KERNARG_SEGMENT_PTR;
+ Out.code_properties |= AMD_CODE_PROPERTY_ENABLE_SGPR_KERNARG_SEGMENT_PTR;
if (MFI->hasDispatchID())
- header.code_properties |= AMD_CODE_PROPERTY_ENABLE_SGPR_DISPATCH_ID;
+ Out.code_properties |= AMD_CODE_PROPERTY_ENABLE_SGPR_DISPATCH_ID;
if (MFI->hasFlatScratchInit())
- header.code_properties |= AMD_CODE_PROPERTY_ENABLE_SGPR_FLAT_SCRATCH_INIT;
-
- // TODO: Private segment size
+ Out.code_properties |= AMD_CODE_PROPERTY_ENABLE_SGPR_FLAT_SCRATCH_INIT;
if (MFI->hasGridWorkgroupCountX()) {
- header.code_properties |=
+ Out.code_properties |=
AMD_CODE_PROPERTY_ENABLE_SGPR_GRID_WORKGROUP_COUNT_X;
}
if (MFI->hasGridWorkgroupCountY()) {
- header.code_properties |=
+ Out.code_properties |=
AMD_CODE_PROPERTY_ENABLE_SGPR_GRID_WORKGROUP_COUNT_Y;
}
if (MFI->hasGridWorkgroupCountZ()) {
- header.code_properties |=
+ Out.code_properties |=
AMD_CODE_PROPERTY_ENABLE_SGPR_GRID_WORKGROUP_COUNT_Z;
}
if (MFI->hasDispatchPtr())
- header.code_properties |= AMD_CODE_PROPERTY_ENABLE_SGPR_DISPATCH_PTR;
+ Out.code_properties |= AMD_CODE_PROPERTY_ENABLE_SGPR_DISPATCH_PTR;
if (STM.debuggerSupported())
- header.code_properties |= AMD_CODE_PROPERTY_IS_DEBUG_SUPPORTED;
+ Out.code_properties |= AMD_CODE_PROPERTY_IS_DEBUG_SUPPORTED;
if (STM.isXNACKEnabled())
- header.code_properties |= AMD_CODE_PROPERTY_IS_XNACK_SUPPORTED;
+ Out.code_properties |= AMD_CODE_PROPERTY_IS_XNACK_SUPPORTED;
// FIXME: Should use getKernArgSize
- header.kernarg_segment_byte_size =
+ Out.kernarg_segment_byte_size =
STM.getKernArgSegmentSize(MF, MFI->getABIArgOffset());
- header.wavefront_sgpr_count = KernelInfo.NumSGPR;
- header.workitem_vgpr_count = KernelInfo.NumVGPR;
- header.workitem_private_segment_byte_size = KernelInfo.ScratchSize;
- header.workgroup_group_segment_byte_size = KernelInfo.LDSSize;
- header.reserved_vgpr_first = KernelInfo.ReservedVGPRFirst;
- header.reserved_vgpr_count = KernelInfo.ReservedVGPRCount;
+ Out.wavefront_sgpr_count = KernelInfo.NumSGPR;
+ Out.workitem_vgpr_count = KernelInfo.NumVGPR;
+ Out.workitem_private_segment_byte_size = KernelInfo.ScratchSize;
+ Out.workgroup_group_segment_byte_size = KernelInfo.LDSSize;
+ Out.reserved_vgpr_first = KernelInfo.ReservedVGPRFirst;
+ Out.reserved_vgpr_count = KernelInfo.ReservedVGPRCount;
// These alignment values are specified in powers of two, so alignment =
// 2^n. The minimum alignment is 2^4 = 16.
- header.kernarg_segment_alignment = std::max((size_t)4,
+ Out.kernarg_segment_alignment = std::max((size_t)4,
countTrailingZeros(MFI->getMaxKernArgAlign()));
if (STM.debuggerEmitPrologue()) {
- header.debug_wavefront_private_segment_offset_sgpr =
+ Out.debug_wavefront_private_segment_offset_sgpr =
KernelInfo.DebuggerWavefrontPrivateSegmentOffsetSGPR;
- header.debug_private_segment_buffer_sgpr =
+ Out.debug_private_segment_buffer_sgpr =
KernelInfo.DebuggerPrivateSegmentBufferSGPR;
}
-
- AMDGPUTargetStreamer *TS =
- static_cast<AMDGPUTargetStreamer *>(OutStreamer->getTargetStreamer());
-
- OutStreamer->SwitchSection(getObjFileLowering().getTextSection());
- TS->EmitAMDKernelCodeT(header);
}
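
// A minimal standalone sketch (illustrative, not the LLVM API) of the
// kernarg_segment_alignment encoding computed above: the field stores log2 of
// the alignment, clamped to a minimum of 4 (2^4 = 16 bytes), and
// countTrailingZeros of a power-of-two alignment is exactly its log2.
#include <algorithm>
#include <cassert>
#include <cstdint>

static uint32_t encodeKernargAlignment(uint32_t MaxAlign) {
  assert(MaxAlign != 0 && (MaxAlign & (MaxAlign - 1)) == 0 &&
         "alignment must be a power of two");
  uint32_t Log2 = 0;
  while ((MaxAlign & 1) == 0) { // countTrailingZeros of a power of two
    MaxAlign >>= 1;
    ++Log2;
  }
  return std::max<uint32_t>(4, Log2); // minimum alignment is 2^4 = 16
}
// encodeKernargAlignment(8) == 4 (clamped); encodeKernargAlignment(64) == 6.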
bool AMDGPUAsmPrinter::PrintAsmOperand(const MachineInstr *MI, unsigned OpNo,
diff --git a/lib/Target/AMDGPU/AMDGPUAsmPrinter.h b/lib/Target/AMDGPU/AMDGPUAsmPrinter.h
index 9a4bafef3a25..13425c8b2a0f 100644
--- a/lib/Target/AMDGPU/AMDGPUAsmPrinter.h
+++ b/lib/Target/AMDGPU/AMDGPUAsmPrinter.h
@@ -15,95 +15,84 @@
#ifndef LLVM_LIB_TARGET_AMDGPU_AMDGPUASMPRINTER_H
#define LLVM_LIB_TARGET_AMDGPU_AMDGPUASMPRINTER_H
-#include "AMDGPUMCInstLower.h"
-
+#include "AMDKernelCodeT.h"
+#include "AMDGPU.h"
+#include "llvm/ADT/StringRef.h"
#include "llvm/CodeGen/AsmPrinter.h"
+#include <cstddef>
+#include <cstdint>
+#include <limits>
+#include <memory>
+#include <string>
#include <vector>
namespace llvm {
+
+class AMDGPUTargetStreamer;
class MCOperand;
class AMDGPUAsmPrinter final : public AsmPrinter {
private:
struct SIProgramInfo {
- SIProgramInfo() :
- VGPRBlocks(0),
- SGPRBlocks(0),
- Priority(0),
- FloatMode(0),
- Priv(0),
- DX10Clamp(0),
- DebugMode(0),
- IEEEMode(0),
- ScratchSize(0),
- ComputePGMRSrc1(0),
- LDSBlocks(0),
- ScratchBlocks(0),
- ComputePGMRSrc2(0),
- NumVGPR(0),
- NumSGPR(0),
- FlatUsed(false),
- NumSGPRsForWavesPerEU(0),
- NumVGPRsForWavesPerEU(0),
- ReservedVGPRFirst(0),
- ReservedVGPRCount(0),
- DebuggerWavefrontPrivateSegmentOffsetSGPR((uint16_t)-1),
- DebuggerPrivateSegmentBufferSGPR((uint16_t)-1),
- VCCUsed(false),
- CodeLen(0) {}
-
// Fields set in PGM_RSRC1 pm4 packet.
- uint32_t VGPRBlocks;
- uint32_t SGPRBlocks;
- uint32_t Priority;
- uint32_t FloatMode;
- uint32_t Priv;
- uint32_t DX10Clamp;
- uint32_t DebugMode;
- uint32_t IEEEMode;
- uint32_t ScratchSize;
-
- uint64_t ComputePGMRSrc1;
+ uint32_t VGPRBlocks = 0;
+ uint32_t SGPRBlocks = 0;
+ uint32_t Priority = 0;
+ uint32_t FloatMode = 0;
+ uint32_t Priv = 0;
+ uint32_t DX10Clamp = 0;
+ uint32_t DebugMode = 0;
+ uint32_t IEEEMode = 0;
+ uint32_t ScratchSize = 0;
+
+ uint64_t ComputePGMRSrc1 = 0;
// Fields set in PGM_RSRC2 pm4 packet.
- uint32_t LDSBlocks;
- uint32_t ScratchBlocks;
+ uint32_t LDSBlocks = 0;
+ uint32_t ScratchBlocks = 0;
- uint64_t ComputePGMRSrc2;
+ uint64_t ComputePGMRSrc2 = 0;
- uint32_t NumVGPR;
- uint32_t NumSGPR;
+ uint32_t NumVGPR = 0;
+ uint32_t NumSGPR = 0;
uint32_t LDSSize;
- bool FlatUsed;
+ bool FlatUsed = false;
// Number of SGPRs that meets number of waves per execution unit request.
- uint32_t NumSGPRsForWavesPerEU;
+ uint32_t NumSGPRsForWavesPerEU = 0;
// Number of VGPRs that meets number of waves per execution unit request.
- uint32_t NumVGPRsForWavesPerEU;
+ uint32_t NumVGPRsForWavesPerEU = 0;
// If ReservedVGPRCount is 0 then this value must also be 0. Otherwise, this
// is the first fixed VGPR number reserved.
- uint16_t ReservedVGPRFirst;
+ uint16_t ReservedVGPRFirst = 0;
// The number of consecutive VGPRs reserved.
- uint16_t ReservedVGPRCount;
+ uint16_t ReservedVGPRCount = 0;
// Fixed SGPR number used to hold wave scratch offset for entire kernel
- // execution, or uint16_t(-1) if the register is not used or not known.
- uint16_t DebuggerWavefrontPrivateSegmentOffsetSGPR;
+ // execution, or std::numeric_limits<uint16_t>::max() if the register is not
+ // used or not known.
+ uint16_t DebuggerWavefrontPrivateSegmentOffsetSGPR =
+ std::numeric_limits<uint16_t>::max();
// Fixed SGPR number of the first 4 SGPRs used to hold scratch V# for entire
- // kernel execution, or uint16_t(-1) if the register is not used or not
- // known.
- uint16_t DebuggerPrivateSegmentBufferSGPR;
+ // kernel execution, or std::numeric_limits<uint16_t>::max() if the register
+ // is not used or not known.
+ uint16_t DebuggerPrivateSegmentBufferSGPR =
+ std::numeric_limits<uint16_t>::max();
// Bonus information for debugging.
- bool VCCUsed;
- uint64_t CodeLen;
+ bool VCCUsed = false;
+ uint64_t CodeLen = 0;
+
+ SIProgramInfo() = default;
};
void getSIProgramInfo(SIProgramInfo &Out, const MachineFunction &MF) const;
+ void getAmdKernelCode(amd_kernel_code_t &Out, const SIProgramInfo &KernelInfo,
+ const MachineFunction &MF) const;
void findNumUsedRegistersSI(const MachineFunction &MF,
unsigned &NumSGPR,
unsigned &NumVGPR) const;
@@ -112,21 +101,28 @@ private:
/// can correctly setup the GPU state.
void EmitProgramInfoR600(const MachineFunction &MF);
void EmitProgramInfoSI(const MachineFunction &MF, const SIProgramInfo &KernelInfo);
- void EmitAmdKernelCodeT(const MachineFunction &MF,
- const SIProgramInfo &KernelInfo) const;
public:
explicit AMDGPUAsmPrinter(TargetMachine &TM,
std::unique_ptr<MCStreamer> Streamer);
- bool runOnMachineFunction(MachineFunction &MF) override;
-
StringRef getPassName() const override;
+ const MCSubtargetInfo* getSTI() const;
+
+ AMDGPUTargetStreamer& getTargetStreamer() const;
+
+ bool runOnMachineFunction(MachineFunction &MF) override;
+
/// \brief Wrapper for MCInstLowering.lowerOperand() for the tblgen'erated
/// pseudo lowering.
bool lowerOperand(const MachineOperand &MO, MCOperand &MCOp) const;
+ /// \brief Lower the specified LLVM Constant to an MCExpr.
+ /// The default AsmPrinter::lowerConstant does not know how to lower an
+ /// addrspacecast, so such constants are lowered by this function instead.
+ const MCExpr *lowerConstant(const Constant *CV) override;
+
/// \brief tblgen'erated driver function for lowering simple MI->MC pseudo
/// instructions.
bool emitPseudoExpansionLowering(MCStreamer &OutStreamer,
@@ -143,6 +139,8 @@ public:
void EmitStartOfAsmFile(Module &M) override;
+ void EmitEndOfAsmFile(Module &M) override;
+
bool isBlockOnlyReachableByFallthrough(
const MachineBasicBlock *MBB) const override;
@@ -153,8 +151,9 @@ public:
protected:
std::vector<std::string> DisasmLines, HexLines;
size_t DisasmLineMaxLen;
+ AMDGPUAS AMDGPUASI;
};
-} // End anonymous llvm
+} // end namespace llvm
-#endif
+#endif // LLVM_LIB_TARGET_AMDGPU_AMDGPUASMPRINTER_H
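
// Aside: a minimal standalone sketch of the C++11 pattern applied to
// SIProgramInfo in this hunk: in-class default member initializers plus a
// defaulted constructor replace the long constructor init-list. Field names
// below are illustrative only, not the real struct.
#include <cstdint>
#include <limits>

struct ProgramInfoSketch {
  uint32_t NumVGPR = 0;
  uint32_t NumSGPR = 0;
  // Sentinel meaning "register not used or not known".
  uint16_t DebugSGPR = std::numeric_limits<uint16_t>::max();
  bool VCCUsed = false;

  ProgramInfoSketch() = default; // every field already has an initializer
};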
diff --git a/lib/Target/AMDGPU/AMDGPUCallLowering.cpp b/lib/Target/AMDGPU/AMDGPUCallLowering.cpp
index d53cc153dc9a..e67ae092fdda 100644
--- a/lib/Target/AMDGPU/AMDGPUCallLowering.cpp
+++ b/lib/Target/AMDGPU/AMDGPUCallLowering.cpp
@@ -14,8 +14,13 @@
//===----------------------------------------------------------------------===//
#include "AMDGPUCallLowering.h"
+#include "AMDGPU.h"
#include "AMDGPUISelLowering.h"
-
+#include "AMDGPUSubtarget.h"
+#include "SIISelLowering.h"
+#include "SIRegisterInfo.h"
+#include "SIMachineFunctionInfo.h"
+#include "llvm/CodeGen/CallingConvLower.h"
#include "llvm/CodeGen/GlobalISel/MachineIRBuilder.h"
#include "llvm/CodeGen/MachineInstrBuilder.h"
@@ -26,17 +31,138 @@ using namespace llvm;
#endif
AMDGPUCallLowering::AMDGPUCallLowering(const AMDGPUTargetLowering &TLI)
- : CallLowering(&TLI) {
+ : CallLowering(&TLI), AMDGPUASI(TLI.getAMDGPUAS()) {
}
bool AMDGPUCallLowering::lowerReturn(MachineIRBuilder &MIRBuilder,
- const Value *Val, unsigned VReg) const {
+ const Value *Val, unsigned VReg) const {
+ MIRBuilder.buildInstr(AMDGPU::S_ENDPGM);
return true;
}
+unsigned AMDGPUCallLowering::lowerParameterPtr(MachineIRBuilder &MIRBuilder,
+ Type *ParamTy,
+ unsigned Offset) const {
+
+ MachineFunction &MF = MIRBuilder.getMF();
+ const SIRegisterInfo *TRI = MF.getSubtarget<SISubtarget>().getRegisterInfo();
+ MachineRegisterInfo &MRI = MF.getRegInfo();
+ const Function &F = *MF.getFunction();
+ const DataLayout &DL = F.getParent()->getDataLayout();
+ PointerType *PtrTy = PointerType::get(ParamTy, AMDGPUASI.CONSTANT_ADDRESS);
+ LLT PtrType = getLLTForType(*PtrTy, DL);
+ unsigned DstReg = MRI.createGenericVirtualRegister(PtrType);
+ unsigned KernArgSegmentPtr =
+ TRI->getPreloadedValue(MF, SIRegisterInfo::KERNARG_SEGMENT_PTR);
+ unsigned KernArgSegmentVReg = MRI.getLiveInVirtReg(KernArgSegmentPtr);
+
+ unsigned OffsetReg = MRI.createGenericVirtualRegister(LLT::scalar(64));
+ MIRBuilder.buildConstant(OffsetReg, Offset);
+
+ MIRBuilder.buildGEP(DstReg, KernArgSegmentVReg, OffsetReg);
+
+ return DstReg;
+}
+
+void AMDGPUCallLowering::lowerParameter(MachineIRBuilder &MIRBuilder,
+ Type *ParamTy, unsigned Offset,
+ unsigned DstReg) const {
+ MachineFunction &MF = MIRBuilder.getMF();
+ const Function &F = *MF.getFunction();
+ const DataLayout &DL = F.getParent()->getDataLayout();
+ PointerType *PtrTy = PointerType::get(ParamTy, AMDGPUASI.CONSTANT_ADDRESS);
+ MachinePointerInfo PtrInfo(UndefValue::get(PtrTy));
+ unsigned TypeSize = DL.getTypeStoreSize(ParamTy);
+ unsigned Align = DL.getABITypeAlignment(ParamTy);
+ unsigned PtrReg = lowerParameterPtr(MIRBuilder, ParamTy, Offset);
+
+ MachineMemOperand *MMO =
+ MF.getMachineMemOperand(PtrInfo, MachineMemOperand::MOLoad |
+ MachineMemOperand::MONonTemporal |
+ MachineMemOperand::MOInvariant,
+ TypeSize, Align);
+
+ MIRBuilder.buildLoad(DstReg, PtrReg, *MMO);
+}
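+
// A rough standalone model (an assumed simplification, not the GlobalISel
// API) of what lowerParameterPtr/lowerParameter express in MIR above: each
// kernel argument lives at kernarg_base + offset in the constant address
// space and is read with an invariant load of the argument's store size at
// its ABI alignment.
#include <cstdint>
#include <cstring>

static uint32_t loadKernArgU32(const uint8_t *KernArgBase, unsigned Offset) {
  uint32_t V;
  std::memcpy(&V, KernArgBase + Offset, sizeof(V)); // the GEP + load pair
  return V;
}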
+
bool AMDGPUCallLowering::lowerFormalArguments(MachineIRBuilder &MIRBuilder,
const Function &F,
ArrayRef<unsigned> VRegs) const {
- // TODO: Implement once there are generic loads/stores.
+
+ MachineFunction &MF = MIRBuilder.getMF();
+ const SISubtarget *Subtarget = static_cast<const SISubtarget *>(&MF.getSubtarget());
+ MachineRegisterInfo &MRI = MF.getRegInfo();
+ SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
+ const SIRegisterInfo *TRI = MF.getSubtarget<SISubtarget>().getRegisterInfo();
+ const DataLayout &DL = F.getParent()->getDataLayout();
+
+ SmallVector<CCValAssign, 16> ArgLocs;
+ CCState CCInfo(F.getCallingConv(), F.isVarArg(), MF, ArgLocs, F.getContext());
+
+ // FIXME: How should these inputs interact with inreg / custom SGPR inputs?
+ if (Info->hasPrivateSegmentBuffer()) {
+ unsigned PrivateSegmentBufferReg = Info->addPrivateSegmentBuffer(*TRI);
+ MF.addLiveIn(PrivateSegmentBufferReg, &AMDGPU::SReg_128RegClass);
+ CCInfo.AllocateReg(PrivateSegmentBufferReg);
+ }
+
+ if (Info->hasDispatchPtr()) {
+ unsigned DispatchPtrReg = Info->addDispatchPtr(*TRI);
+ // FIXME: Need to add reg as live-in
+ CCInfo.AllocateReg(DispatchPtrReg);
+ }
+
+ if (Info->hasQueuePtr()) {
+ unsigned QueuePtrReg = Info->addQueuePtr(*TRI);
+ // FIXME: Need to add reg as live-in
+ CCInfo.AllocateReg(QueuePtrReg);
+ }
+
+ if (Info->hasKernargSegmentPtr()) {
+ unsigned InputPtrReg = Info->addKernargSegmentPtr(*TRI);
+ const LLT P2 = LLT::pointer(2, 64);
+ unsigned VReg = MRI.createGenericVirtualRegister(P2);
+ MRI.addLiveIn(InputPtrReg, VReg);
+ MIRBuilder.getMBB().addLiveIn(InputPtrReg);
+ MIRBuilder.buildCopy(VReg, InputPtrReg);
+ CCInfo.AllocateReg(InputPtrReg);
+ }
+
+ if (Info->hasDispatchID()) {
+ unsigned DispatchIDReg = Info->addDispatchID(*TRI);
+ // FIXME: Need to add reg as live-in
+ CCInfo.AllocateReg(DispatchIDReg);
+ }
+
+ if (Info->hasFlatScratchInit()) {
+ unsigned FlatScratchInitReg = Info->addFlatScratchInit(*TRI);
+ // FIXME: Need to add reg as live-in
+ CCInfo.AllocateReg(FlatScratchInitReg);
+ }
+
+ unsigned NumArgs = F.arg_size();
+ Function::const_arg_iterator CurOrigArg = F.arg_begin();
+ const AMDGPUTargetLowering &TLI = *getTLI<AMDGPUTargetLowering>();
+ for (unsigned i = 0; i != NumArgs; ++i, ++CurOrigArg) {
+ MVT ValVT = TLI.getValueType(DL, CurOrigArg->getType()).getSimpleVT();
+ ISD::ArgFlagsTy Flags;
+ Flags.setOrigAlign(DL.getABITypeAlignment(CurOrigArg->getType()));
+ CCAssignFn *AssignFn = CCAssignFnForCall(F.getCallingConv(),
+ /*IsVarArg=*/false);
+ bool Res =
+ AssignFn(i, ValVT, ValVT, CCValAssign::Full, Flags, CCInfo);
+ assert(!Res && "Call operand has unhandled type");
+ (void)Res;
+ }
+
+ Function::const_arg_iterator Arg = F.arg_begin();
+ for (unsigned i = 0; i != NumArgs; ++i, ++Arg) {
+ // FIXME: We should be getting DebugInfo from the arguments somehow.
+ CCValAssign &VA = ArgLocs[i];
+ lowerParameter(MIRBuilder, Arg->getType(),
+ VA.getLocMemOffset() +
+ Subtarget->getExplicitKernelArgOffset(MF), VRegs[i]);
+ }
+
return true;
}
diff --git a/lib/Target/AMDGPU/AMDGPUCallLowering.h b/lib/Target/AMDGPU/AMDGPUCallLowering.h
index 9ae87c9397ab..09bdf8ffcde7 100644
--- a/lib/Target/AMDGPU/AMDGPUCallLowering.h
+++ b/lib/Target/AMDGPU/AMDGPUCallLowering.h
@@ -15,6 +15,7 @@
#ifndef LLVM_LIB_TARGET_AMDGPU_AMDGPUCALLLOWERING_H
#define LLVM_LIB_TARGET_AMDGPU_AMDGPUCALLLOWERING_H
+#include "AMDGPU.h"
#include "llvm/CodeGen/GlobalISel/CallLowering.h"
namespace llvm {
@@ -22,6 +23,14 @@ namespace llvm {
class AMDGPUTargetLowering;
class AMDGPUCallLowering: public CallLowering {
+ AMDGPUAS AMDGPUASI;
+
+ unsigned lowerParameterPtr(MachineIRBuilder &MIRBuilder, Type *ParamTy,
+ unsigned Offset) const;
+
+ void lowerParameter(MachineIRBuilder &MIRBuilder, Type *ParamTy,
+ unsigned Offset, unsigned DstReg) const;
+
public:
AMDGPUCallLowering(const AMDGPUTargetLowering &TLI);
@@ -29,6 +38,7 @@ class AMDGPUCallLowering: public CallLowering {
unsigned VReg) const override;
bool lowerFormalArguments(MachineIRBuilder &MIRBuilder, const Function &F,
ArrayRef<unsigned> VRegs) const override;
+ CCAssignFn *CCAssignFnForCall(CallingConv::ID CC, bool IsVarArg) const;
};
} // end namespace llvm
#endif
diff --git a/lib/Target/AMDGPU/AMDGPUCallingConv.td b/lib/Target/AMDGPU/AMDGPUCallingConv.td
index 47dfa4992068..d308f718aae1 100644
--- a/lib/Target/AMDGPU/AMDGPUCallingConv.td
+++ b/lib/Target/AMDGPU/AMDGPUCallingConv.td
@@ -17,7 +17,7 @@ class CCIfNotInReg<CCAction A> : CCIf<"!ArgFlags.isInReg()", A> {}
// Calling convention for SI
def CC_SI : CallingConv<[
- CCIfInReg<CCIfType<[f32, i32] , CCAssignToReg<[
+ CCIfInReg<CCIfType<[f32, i32, f16] , CCAssignToReg<[
SGPR0, SGPR1, SGPR2, SGPR3, SGPR4, SGPR5, SGPR6, SGPR7,
SGPR8, SGPR9, SGPR10, SGPR11, SGPR12, SGPR13, SGPR14, SGPR15,
SGPR16, SGPR17, SGPR18, SGPR19, SGPR20, SGPR21, SGPR22, SGPR23,
@@ -25,17 +25,13 @@ def CC_SI : CallingConv<[
SGPR32, SGPR33, SGPR34, SGPR35, SGPR36, SGPR37, SGPR38, SGPR39
]>>>,
- CCIfInReg<CCIfType<[i64] , CCAssignToRegWithShadow<
- [ SGPR0, SGPR2, SGPR4, SGPR6, SGPR8, SGPR10, SGPR12, SGPR14,
- SGPR16, SGPR18, SGPR20, SGPR22, SGPR24, SGPR26, SGPR28, SGPR30,
- SGPR32, SGPR34, SGPR36, SGPR38 ],
- [ SGPR1, SGPR3, SGPR5, SGPR7, SGPR9, SGPR11, SGPR13, SGPR15,
- SGPR17, SGPR19, SGPR21, SGPR23, SGPR25, SGPR27, SGPR29, SGPR31,
- SGPR33, SGPR35, SGPR37, SGPR39 ]
- >>>,
+ // We have no way of referring to the generated register tuples
+ // here, so use a custom function.
+ CCIfInReg<CCIfType<[i64], CCCustom<"allocateSGPRTuple">>>,
+ CCIfByVal<CCIfType<[i64], CCCustom<"allocateSGPRTuple">>>,
// 32*4 + 4 is the minimum for a fetch shader consumer with 32 inputs.
- CCIfNotInReg<CCIfType<[f32, i32] , CCAssignToReg<[
+ CCIfNotInReg<CCIfType<[f32, i32, f16] , CCAssignToReg<[
VGPR0, VGPR1, VGPR2, VGPR3, VGPR4, VGPR5, VGPR6, VGPR7,
VGPR8, VGPR9, VGPR10, VGPR11, VGPR12, VGPR13, VGPR14, VGPR15,
VGPR16, VGPR17, VGPR18, VGPR19, VGPR20, VGPR21, VGPR22, VGPR23,
@@ -53,17 +49,7 @@ def CC_SI : CallingConv<[
VGPR112, VGPR113, VGPR114, VGPR115, VGPR116, VGPR117, VGPR118, VGPR119,
VGPR120, VGPR121, VGPR122, VGPR123, VGPR124, VGPR125, VGPR126, VGPR127,
VGPR128, VGPR129, VGPR130, VGPR131, VGPR132, VGPR133, VGPR134, VGPR135
- ]>>>,
-
- CCIfByVal<CCIfType<[i64] , CCAssignToRegWithShadow<
- [ SGPR0, SGPR2, SGPR4, SGPR6, SGPR8, SGPR10, SGPR12, SGPR14,
- SGPR16, SGPR18, SGPR20, SGPR22, SGPR24, SGPR26, SGPR28, SGPR30,
- SGPR32, SGPR34, SGPR36, SGPR38 ],
- [ SGPR1, SGPR3, SGPR5, SGPR7, SGPR9, SGPR11, SGPR13, SGPR15,
- SGPR17, SGPR19, SGPR21, SGPR23, SGPR25, SGPR27, SGPR29, SGPR31,
- SGPR33, SGPR35, SGPR37, SGPR39 ]
- >>>
-
+ ]>>>
]>;
def RetCC_SI : CallingConv<[
@@ -76,7 +62,7 @@ def RetCC_SI : CallingConv<[
]>>,
// 32*4 + 4 is the minimum for a fetch shader with 32 outputs.
- CCIfType<[f32] , CCAssignToReg<[
+ CCIfType<[f32, f16] , CCAssignToReg<[
VGPR0, VGPR1, VGPR2, VGPR3, VGPR4, VGPR5, VGPR6, VGPR7,
VGPR8, VGPR9, VGPR10, VGPR11, VGPR12, VGPR13, VGPR14, VGPR15,
VGPR16, VGPR17, VGPR18, VGPR19, VGPR20, VGPR21, VGPR22, VGPR23,
diff --git a/lib/Target/AMDGPU/AMDGPUCodeGenPrepare.cpp b/lib/Target/AMDGPU/AMDGPUCodeGenPrepare.cpp
index e6230547a9b3..e19314fe0a6c 100644
--- a/lib/Target/AMDGPU/AMDGPUCodeGenPrepare.cpp
+++ b/lib/Target/AMDGPU/AMDGPUCodeGenPrepare.cpp
@@ -14,16 +14,31 @@
//===----------------------------------------------------------------------===//
#include "AMDGPU.h"
-#include "AMDGPUIntrinsicInfo.h"
#include "AMDGPUSubtarget.h"
#include "AMDGPUTargetMachine.h"
-
+#include "llvm/ADT/StringRef.h"
#include "llvm/Analysis/DivergenceAnalysis.h"
#include "llvm/CodeGen/Passes.h"
+#include "llvm/IR/Attributes.h"
+#include "llvm/IR/BasicBlock.h"
+#include "llvm/IR/Constants.h"
+#include "llvm/IR/DerivedTypes.h"
+#include "llvm/IR/Function.h"
+#include "llvm/IR/InstrTypes.h"
+#include "llvm/IR/Instruction.h"
+#include "llvm/IR/Instructions.h"
#include "llvm/IR/InstVisitor.h"
+#include "llvm/IR/IntrinsicInst.h"
+#include "llvm/IR/Intrinsics.h"
#include "llvm/IR/IRBuilder.h"
-#include "llvm/Support/Debug.h"
-#include "llvm/Support/raw_ostream.h"
+#include "llvm/IR/LLVMContext.h"
+#include "llvm/IR/Operator.h"
+#include "llvm/IR/Type.h"
+#include "llvm/IR/Value.h"
+#include "llvm/Pass.h"
+#include "llvm/Support/Casting.h"
+#include <cassert>
+#include <iterator>
#define DEBUG_TYPE "amdgpu-codegenprepare"
@@ -34,17 +49,15 @@ namespace {
class AMDGPUCodeGenPrepare : public FunctionPass,
public InstVisitor<AMDGPUCodeGenPrepare, bool> {
const GCNTargetMachine *TM;
- const SISubtarget *ST;
- DivergenceAnalysis *DA;
- Module *Mod;
- bool HasUnsafeFPMath;
+ const SISubtarget *ST = nullptr;
+ DivergenceAnalysis *DA = nullptr;
+ Module *Mod = nullptr;
+ bool HasUnsafeFPMath = false;
/// \brief Copies exact/nsw/nuw flags (if any) from binary operation \p I to
/// binary operation \p V.
///
/// \returns Binary operation \p V.
- Value *copyFlags(const BinaryOperator &I, Value *V) const;
-
/// \returns \p T's base element bit width.
unsigned getBaseElementBitWidth(const Type *T) const;
@@ -113,13 +126,9 @@ class AMDGPUCodeGenPrepare : public FunctionPass,
public:
static char ID;
+
AMDGPUCodeGenPrepare(const TargetMachine *TM = nullptr) :
- FunctionPass(ID),
- TM(static_cast<const GCNTargetMachine *>(TM)),
- ST(nullptr),
- DA(nullptr),
- Mod(nullptr),
- HasUnsafeFPMath(false) { }
+ FunctionPass(ID), TM(static_cast<const GCNTargetMachine *>(TM)) {}
bool visitFDiv(BinaryOperator &I);
@@ -142,22 +151,7 @@ public:
}
};
-} // End anonymous namespace
-
-Value *AMDGPUCodeGenPrepare::copyFlags(
- const BinaryOperator &I, Value *V) const {
- BinaryOperator *BinOp = dyn_cast<BinaryOperator>(V);
- if (!BinOp) // Possibly constant expression.
- return V;
-
- if (isa<OverflowingBinaryOperator>(BinOp)) {
- BinOp->setHasNoSignedWrap(I.hasNoSignedWrap());
- BinOp->setHasNoUnsignedWrap(I.hasNoUnsignedWrap());
- } else if (isa<PossiblyExactOperator>(BinOp))
- BinOp->setIsExact(I.isExact());
-
- return V;
-}
+} // end anonymous namespace
unsigned AMDGPUCodeGenPrepare::getBaseElementBitWidth(const Type *T) const {
assert(needsPromotionToI32(T) && "T does not need promotion to i32");
@@ -186,12 +180,48 @@ bool AMDGPUCodeGenPrepare::isSigned(const SelectInst &I) const {
}
bool AMDGPUCodeGenPrepare::needsPromotionToI32(const Type *T) const {
- if (T->isIntegerTy() && T->getIntegerBitWidth() > 1 &&
- T->getIntegerBitWidth() <= 16)
+ const IntegerType *IntTy = dyn_cast<IntegerType>(T);
+ if (IntTy && IntTy->getBitWidth() > 1 && IntTy->getBitWidth() <= 16)
+ return true;
+
+ if (const VectorType *VT = dyn_cast<VectorType>(T)) {
+ // TODO: The set of packed operations is more limited, so may want to
+ // promote some anyway.
+ if (ST->hasVOP3PInsts())
+ return false;
+
+ return needsPromotionToI32(VT->getElementType());
+ }
+
+ return false;
+}
+
+// Return true if the op promoted to i32 should have nsw set.
+static bool promotedOpIsNSW(const Instruction &I) {
+ switch (I.getOpcode()) {
+ case Instruction::Shl:
+ case Instruction::Add:
+ case Instruction::Sub:
+ return true;
+ case Instruction::Mul:
+ return I.hasNoUnsignedWrap();
+ default:
+ return false;
+ }
+}
+
+// Return true if the op promoted to i32 should have nuw set.
+static bool promotedOpIsNUW(const Instruction &I) {
+ switch (I.getOpcode()) {
+ case Instruction::Shl:
+ case Instruction::Add:
+ case Instruction::Mul:
return true;
- if (!T->isVectorTy())
+ case Instruction::Sub:
+ return I.hasNoUnsignedWrap();
+ default:
return false;
- return needsPromotionToI32(cast<VectorType>(T)->getElementType());
+ }
}
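
// A standalone check (not LLVM code) of why the promoted i32 op may carry
// these flags: after zero-extending 16-bit operands, add stays below 2^17 and
// mul stays below 2^32, so neither can wrap unsigned in 32 bits, while sub
// can still wrap (0 - 1) and therefore only forwards the original nuw flag.
#include <cassert>
#include <cstdint>

int main() {
  uint32_t A = 0xFFFFu, B = 0xFFFFu;       // maximal zero-extended i16 values
  assert(A + B == 0x1FFFEu);               // < 2^17: no unsigned wrap
  assert(A * B == 0xFFFE0001u);            // < 2^32: no unsigned wrap
  assert(uint32_t(0) - 1u == 0xFFFFFFFFu); // sub does wrap unsigned
}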
bool AMDGPUCodeGenPrepare::promoteUniformOpToI32(BinaryOperator &I) const {
@@ -218,7 +248,19 @@ bool AMDGPUCodeGenPrepare::promoteUniformOpToI32(BinaryOperator &I) const {
ExtOp0 = Builder.CreateZExt(I.getOperand(0), I32Ty);
ExtOp1 = Builder.CreateZExt(I.getOperand(1), I32Ty);
}
- ExtRes = copyFlags(I, Builder.CreateBinOp(I.getOpcode(), ExtOp0, ExtOp1));
+
+ ExtRes = Builder.CreateBinOp(I.getOpcode(), ExtOp0, ExtOp1);
+ if (Instruction *Inst = dyn_cast<Instruction>(ExtRes)) {
+ if (promotedOpIsNSW(cast<Instruction>(I)))
+ Inst->setHasNoSignedWrap();
+
+ if (promotedOpIsNUW(cast<Instruction>(I)))
+ Inst->setHasNoUnsignedWrap();
+
+ if (const auto *ExactOp = dyn_cast<PossiblyExactOperator>(&I))
+ Inst->setIsExact(ExactOp->isExact());
+ }
+
TruncRes = Builder.CreateTrunc(ExtRes, I.getType());
I.replaceAllUsesWith(TruncRes);
@@ -346,9 +388,7 @@ bool AMDGPUCodeGenPrepare::visitFDiv(BinaryOperator &FDiv) {
Builder.setFastMathFlags(FMF);
Builder.SetCurrentDebugLocation(FDiv.getDebugLoc());
- const AMDGPUIntrinsicInfo *II = TM->getIntrinsicInfo();
- Function *Decl
- = II->getDeclaration(Mod, AMDGPUIntrinsic::amdgcn_fdiv_fast, {});
+ Function *Decl = Intrinsic::getDeclaration(Mod, Intrinsic::amdgcn_fdiv_fast);
Value *Num = FDiv.getOperand(0);
Value *Den = FDiv.getOperand(1);
diff --git a/lib/Target/AMDGPU/AMDGPUFrameLowering.cpp b/lib/Target/AMDGPU/AMDGPUFrameLowering.cpp
index 805fb7102a35..e32ca9653b3a 100644
--- a/lib/Target/AMDGPU/AMDGPUFrameLowering.cpp
+++ b/lib/Target/AMDGPU/AMDGPUFrameLowering.cpp
@@ -12,11 +12,6 @@
//===----------------------------------------------------------------------===//
#include "AMDGPUFrameLowering.h"
-#include "AMDGPURegisterInfo.h"
-#include "AMDGPUSubtarget.h"
-#include "llvm/CodeGen/MachineFunction.h"
-#include "llvm/CodeGen/MachineFrameInfo.h"
-#include "llvm/Support/MathExtras.h"
using namespace llvm;
AMDGPUFrameLowering::AMDGPUFrameLowering(StackDirection D, unsigned StackAl,
@@ -69,34 +64,3 @@ unsigned AMDGPUFrameLowering::getStackWidth(const MachineFunction &MF) const {
// T1.W = stack[1].w
return 1;
}
-
-/// \returns The number of registers allocated for \p FI.
-int AMDGPUFrameLowering::getFrameIndexReference(const MachineFunction &MF,
- int FI,
- unsigned &FrameReg) const {
- const MachineFrameInfo &MFI = MF.getFrameInfo();
- const AMDGPURegisterInfo *RI
- = MF.getSubtarget<AMDGPUSubtarget>().getRegisterInfo();
-
- // Fill in FrameReg output argument.
- FrameReg = RI->getFrameRegister(MF);
-
- // Start the offset at 2 so we don't overwrite work group information.
- // XXX: We should only do this when the shader actually uses this
- // information.
- unsigned OffsetBytes = 2 * (getStackWidth(MF) * 4);
- int UpperBound = FI == -1 ? MFI.getNumObjects() : FI;
-
- for (int i = MFI.getObjectIndexBegin(); i < UpperBound; ++i) {
- OffsetBytes = alignTo(OffsetBytes, MFI.getObjectAlignment(i));
- OffsetBytes += MFI.getObjectSize(i);
- // Each register holds 4 bytes, so we must always align the offset to at
- // least 4 bytes, so that 2 frame objects won't share the same register.
- OffsetBytes = alignTo(OffsetBytes, 4);
- }
-
- if (FI != -1)
- OffsetBytes = alignTo(OffsetBytes, MFI.getObjectAlignment(FI));
-
- return OffsetBytes / (getStackWidth(MF) * 4);
-}
diff --git a/lib/Target/AMDGPU/AMDGPUFrameLowering.h b/lib/Target/AMDGPU/AMDGPUFrameLowering.h
index 5d51351a00d2..8e187c7e56c1 100644
--- a/lib/Target/AMDGPU/AMDGPUFrameLowering.h
+++ b/lib/Target/AMDGPU/AMDGPUFrameLowering.h
@@ -34,9 +34,6 @@ public:
/// values to the stack.
unsigned getStackWidth(const MachineFunction &MF) const;
- int getFrameIndexReference(const MachineFunction &MF, int FI,
- unsigned &FrameReg) const override;
-
bool hasFP(const MachineFunction &MF) const override {
return false;
}
diff --git a/lib/Target/AMDGPU/AMDGPUGenRegisterBankInfo.def b/lib/Target/AMDGPU/AMDGPUGenRegisterBankInfo.def
new file mode 100644
index 000000000000..5cb9036f4823
--- /dev/null
+++ b/lib/Target/AMDGPU/AMDGPUGenRegisterBankInfo.def
@@ -0,0 +1,62 @@
+//===- AMDGPUGenRegisterBankInfo.def -----------------------------*- C++ -*-==//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+/// \file
+/// This file defines all the static objects used by AMDGPURegisterBankInfo.
+/// \todo This should be generated by TableGen.
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_BUILD_GLOBAL_ISEL
+#error "You shouldn't build this"
+#endif
+
+namespace llvm {
+namespace AMDGPU {
+
+enum PartialMappingIdx {
+ None = -1,
+ PM_SGPR32 = 0,
+ PM_SGPR64 = 1,
+ PM_VGPR32 = 2,
+ PM_VGPR64 = 3
+};
+
+const RegisterBankInfo::PartialMapping PartMappings[] {
+ // StartIdx, Length, RegBank
+ {0, 32, SGPRRegBank},
+ {0, 64, SGPRRegBank},
+ {0, 32, VGPRRegBank},
+ {0, 64, VGPRRegBank}
+};
+
+const RegisterBankInfo::ValueMapping ValMappings[] {
+ // SGPR 32-bit
+ {&PartMappings[0], 1},
+ // SGPR 64-bit
+ {&PartMappings[1], 1},
+ // VGPR 32-bit
+ {&PartMappings[2], 1},
+ // VGPR 64-bit
+ {&PartMappings[3], 1}
+};
+
+enum ValueMappingIdx {
+ SGPRStartIdx = 0,
+ VGPRStartIdx = 2
+};
+
+const RegisterBankInfo::ValueMapping *getValueMapping(unsigned BankID,
+ unsigned Size) {
+ assert(Size % 32 == 0);
+ unsigned Idx = BankID == AMDGPU::SGPRRegBankID ? SGPRStartIdx : VGPRStartIdx;
+ Idx += (Size / 32) - 1;
+ return &ValMappings[Idx];
+}
+
+} // end namespace AMDGPU
+} // end namespace llvm
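
// A standalone sketch of the table lookup in getValueMapping above: the
// mapping array is laid out [SGPR32, SGPR64, VGPR32, VGPR64], so the entry
// index is the bank's start index plus (Size / 32) - 1.
#include <cassert>

static unsigned valueMappingIndex(bool IsSGPR, unsigned Size) {
  const unsigned SGPRStart = 0, VGPRStart = 2;
  assert(Size % 32 == 0 && (Size == 32 || Size == 64));
  return (IsSGPR ? SGPRStart : VGPRStart) + (Size / 32) - 1;
}
// valueMappingIndex(true, 64) == 1; valueMappingIndex(false, 32) == 2.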
diff --git a/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp b/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp
index 5bf347e48650..318de7f2e3d2 100644
--- a/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp
+++ b/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp
@@ -67,10 +67,13 @@ class AMDGPUDAGToDAGISel : public SelectionDAGISel {
// Subtarget - Keep a pointer to the AMDGPU Subtarget around so that we can
// make the right decision when generating code for different targets.
const AMDGPUSubtarget *Subtarget;
+ AMDGPUAS AMDGPUASI;
public:
explicit AMDGPUDAGToDAGISel(TargetMachine &TM, CodeGenOpt::Level OptLevel)
- : SelectionDAGISel(TM, OptLevel) {}
+ : SelectionDAGISel(TM, OptLevel) {
+ AMDGPUASI = AMDGPU::getAMDGPUAS(TM);
+ }
~AMDGPUDAGToDAGISel() override = default;
bool runOnMachineFunction(MachineFunction &MF) override;
@@ -80,6 +83,7 @@ public:
private:
SDValue foldFrameIndex(SDValue N) const;
+ bool isNoNanSrc(SDValue N) const;
bool isInlineImmediate(const SDNode *N) const;
bool FoldOperand(SDValue &Src, SDValue &Sel, SDValue &Neg, SDValue &Abs,
const R600InstrInfo *TII);
@@ -143,6 +147,8 @@ private:
bool SelectSMRDBufferImm32(SDValue Addr, SDValue &Offset) const;
bool SelectSMRDBufferSgpr(SDValue Addr, SDValue &Offset) const;
bool SelectMOVRELOffset(SDValue Index, SDValue &Base, SDValue &Offset) const;
+
+ bool SelectVOP3Mods_NNaN(SDValue In, SDValue &Src, SDValue &SrcMods) const;
bool SelectVOP3Mods(SDValue In, SDValue &Src, SDValue &SrcMods) const;
bool SelectVOP3NoMods(SDValue In, SDValue &Src, SDValue &SrcMods) const;
bool SelectVOP3Mods0(SDValue In, SDValue &Src, SDValue &SrcMods,
@@ -156,7 +162,15 @@ private:
SDValue &Clamp,
SDValue &Omod) const;
+ bool SelectVOP3OMods(SDValue In, SDValue &Src,
+ SDValue &Clamp, SDValue &Omod) const;
+
+ bool SelectVOP3PMods(SDValue In, SDValue &Src, SDValue &SrcMods) const;
+ bool SelectVOP3PMods0(SDValue In, SDValue &Src, SDValue &SrcMods,
+ SDValue &Clamp) const;
+
void SelectADD_SUB_I64(SDNode *N);
+ void SelectUADDO_USUBO(SDNode *N);
void SelectDIV_SCALE(SDNode *N);
void SelectFMA_W_CHAIN(SDNode *N);
void SelectFMUL_W_CHAIN(SDNode *N);
@@ -187,6 +201,17 @@ bool AMDGPUDAGToDAGISel::runOnMachineFunction(MachineFunction &MF) {
return SelectionDAGISel::runOnMachineFunction(MF);
}
+bool AMDGPUDAGToDAGISel::isNoNanSrc(SDValue N) const {
+ if (TM.Options.NoNaNsFPMath)
+ return true;
+
+ // TODO: Move into isKnownNeverNaN
+ if (const auto *BO = dyn_cast<BinaryWithFlagsSDNode>(N))
+ return BO->Flags.hasNoNaNs();
+
+ return CurDAG->isKnownNeverNaN(N);
+}
+
bool AMDGPUDAGToDAGISel::isInlineImmediate(const SDNode *N) const {
const SIInstrInfo *TII
= static_cast<const SISubtarget *>(Subtarget)->getInstrInfo();
@@ -250,7 +275,7 @@ const TargetRegisterClass *AMDGPUDAGToDAGISel::getOperandRegClass(SDNode *N,
SDNode *AMDGPUDAGToDAGISel::glueCopyToM0(SDNode *N) const {
if (Subtarget->getGeneration() < AMDGPUSubtarget::SOUTHERN_ISLANDS ||
- cast<MemSDNode>(N)->getAddressSpace() != AMDGPUAS::LOCAL_ADDRESS)
+ cast<MemSDNode>(N)->getAddressSpace() != AMDGPUASI.LOCAL_ADDRESS)
return N;
const SITargetLowering& Lowering =
@@ -290,6 +315,20 @@ static unsigned selectSGPRVectorRegClassID(unsigned NumVectorElts) {
llvm_unreachable("invalid vector size");
}
+static bool getConstantValue(SDValue N, uint32_t &Out) {
+ if (const ConstantSDNode *C = dyn_cast<ConstantSDNode>(N)) {
+ Out = C->getAPIntValue().getZExtValue();
+ return true;
+ }
+
+ if (const ConstantFPSDNode *C = dyn_cast<ConstantFPSDNode>(N)) {
+ Out = C->getValueAPF().bitcastToAPInt().getZExtValue();
+ return true;
+ }
+
+ return false;
+}
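
// A standalone sketch of the v2i16/v2f16 BUILD_VECTOR fold in Select() below:
// two constant 16-bit elements (integers or raw half bits, extracted via
// getConstantValue above) are packed into a single 32-bit immediate for one
// S_MOV_B32, low element in bits [15:0] and high element in bits [31:16].
#include <cassert>
#include <cstdint>

static uint32_t packV2x16(uint16_t Lo, uint16_t Hi) {
  return uint32_t(Lo) | (uint32_t(Hi) << 16);
}

int main() {
  assert(packV2x16(0x3C00, 0xBC00) == 0xBC003C00u); // <half 1.0, half -1.0>
}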
+
void AMDGPUDAGToDAGISel::Select(SDNode *N) {
unsigned int Opc = N->getOpcode();
if (N->isMachineOpcode()) {
@@ -319,6 +358,11 @@ void AMDGPUDAGToDAGISel::Select(SDNode *N) {
SelectADD_SUB_I64(N);
return;
}
+ case ISD::UADDO:
+ case ISD::USUBO: {
+ SelectUADDO_USUBO(N);
+ return;
+ }
case AMDGPUISD::FMUL_W_CHAIN: {
SelectFMUL_W_CHAIN(N);
return;
@@ -336,7 +380,24 @@ void AMDGPUDAGToDAGISel::Select(SDNode *N) {
EVT VT = N->getValueType(0);
unsigned NumVectorElts = VT.getVectorNumElements();
EVT EltVT = VT.getVectorElementType();
+
+ if (VT == MVT::v2i16 || VT == MVT::v2f16) {
+ if (Opc == ISD::BUILD_VECTOR) {
+ uint32_t LHSVal, RHSVal;
+ if (getConstantValue(N->getOperand(0), LHSVal) &&
+ getConstantValue(N->getOperand(1), RHSVal)) {
+ uint32_t K = LHSVal | (RHSVal << 16);
+ CurDAG->SelectNodeTo(N, AMDGPU::S_MOV_B32, VT,
+ CurDAG->getTargetConstant(K, SDLoc(N), MVT::i32));
+ return;
+ }
+ }
+
+ break;
+ }
+
assert(EltVT.bitsEq(MVT::i32));
+
if (Subtarget->getGeneration() >= AMDGPUSubtarget::SOUTHERN_ISLANDS) {
RegClassID = selectSGPRVectorRegClassID(NumVectorElts);
} else {
@@ -502,7 +563,7 @@ void AMDGPUDAGToDAGISel::Select(SDNode *N) {
case ISD::CopyToReg: {
const SITargetLowering& Lowering =
*static_cast<const SITargetLowering*>(getTargetLowering());
- Lowering.legalizeTargetIndependentNode(N, *CurDAG);
+ N = Lowering.legalizeTargetIndependentNode(N, *CurDAG);
break;
}
case ISD::AND:
@@ -531,9 +592,9 @@ bool AMDGPUDAGToDAGISel::isConstantLoad(const MemSDNode *N, int CbId) const {
if (!N->readMem())
return false;
if (CbId == -1)
- return N->getAddressSpace() == AMDGPUAS::CONSTANT_ADDRESS;
+ return N->getAddressSpace() == AMDGPUASI.CONSTANT_ADDRESS;
- return N->getAddressSpace() == AMDGPUAS::CONSTANT_BUFFER_0 + CbId;
+ return N->getAddressSpace() == AMDGPUASI.CONSTANT_BUFFER_0 + CbId;
}
bool AMDGPUDAGToDAGISel::isUniformBr(const SDNode *N) const {
@@ -689,6 +750,17 @@ void AMDGPUDAGToDAGISel::SelectADD_SUB_I64(SDNode *N) {
CurDAG->RemoveDeadNode(N);
}
+void AMDGPUDAGToDAGISel::SelectUADDO_USUBO(SDNode *N) {
+ // The names of these opcodes are misleading: v_add_i32/v_sub_i32 produce an
+ // unsigned carry out despite the _i32 suffix. They were renamed to _U32 on VI.
+ // FIXME: We should probably rename the opcodes here.
+ unsigned Opc = N->getOpcode() == ISD::UADDO ?
+ AMDGPU::V_ADD_I32_e64 : AMDGPU::V_SUB_I32_e64;
+
+ CurDAG->SelectNodeTo(N, Opc, N->getVTList(),
+ { N->getOperand(0), N->getOperand(1) });
+}
+
void AMDGPUDAGToDAGISel::SelectFMA_W_CHAIN(SDNode *N) {
SDLoc SL(N);
// src0_modifiers, src0, src1_modifiers, src1, src2_modifiers, src2, clamp, omod
@@ -1176,16 +1248,6 @@ bool AMDGPUDAGToDAGISel::SelectFlat(SDValue Addr,
return true;
}
-///
-/// \param EncodedOffset This is the immediate value that will be encoded
-/// directly into the instruction. On SI/CI the \p EncodedOffset
-/// will be in units of dwords and on VI+ it will be units of bytes.
-static bool isLegalSMRDImmOffset(const AMDGPUSubtarget *ST,
- int64_t EncodedOffset) {
- return ST->getGeneration() < AMDGPUSubtarget::VOLCANIC_ISLANDS ?
- isUInt<8>(EncodedOffset) : isUInt<20>(EncodedOffset);
-}
-
bool AMDGPUDAGToDAGISel::SelectSMRDOffset(SDValue ByteOffsetNode,
SDValue &Offset, bool &Imm) const {
@@ -1197,10 +1259,9 @@ bool AMDGPUDAGToDAGISel::SelectSMRDOffset(SDValue ByteOffsetNode,
SDLoc SL(ByteOffsetNode);
AMDGPUSubtarget::Generation Gen = Subtarget->getGeneration();
int64_t ByteOffset = C->getSExtValue();
- int64_t EncodedOffset = Gen < AMDGPUSubtarget::VOLCANIC_ISLANDS ?
- ByteOffset >> 2 : ByteOffset;
+ int64_t EncodedOffset = AMDGPU::getSMRDEncodedOffset(*Subtarget, ByteOffset);
- if (isLegalSMRDImmOffset(Subtarget, EncodedOffset)) {
+ if (AMDGPU::isLegalSMRDImmOffset(*Subtarget, ByteOffset)) {
Offset = CurDAG->getTargetConstant(EncodedOffset, SL, MVT::i32);
Imm = true;
return true;
@@ -1481,7 +1542,7 @@ void AMDGPUDAGToDAGISel::SelectBRCOND(SDNode *N) {
void AMDGPUDAGToDAGISel::SelectATOMIC_CMP_SWAP(SDNode *N) {
MemSDNode *Mem = cast<MemSDNode>(N);
unsigned AS = Mem->getAddressSpace();
- if (AS == AMDGPUAS::FLAT_ADDRESS) {
+ if (AS == AMDGPUASI.FLAT_ADDRESS) {
SelectCode(N);
return;
}
@@ -1545,7 +1606,6 @@ void AMDGPUDAGToDAGISel::SelectATOMIC_CMP_SWAP(SDNode *N) {
bool AMDGPUDAGToDAGISel::SelectVOP3Mods(SDValue In, SDValue &Src,
SDValue &SrcMods) const {
unsigned Mods = 0;
-
Src = In;
if (Src.getOpcode() == ISD::FNEG) {
@@ -1559,10 +1619,15 @@ bool AMDGPUDAGToDAGISel::SelectVOP3Mods(SDValue In, SDValue &Src,
}
SrcMods = CurDAG->getTargetConstant(Mods, SDLoc(In), MVT::i32);
-
return true;
}
+bool AMDGPUDAGToDAGISel::SelectVOP3Mods_NNaN(SDValue In, SDValue &Src,
+ SDValue &SrcMods) const {
+ SelectVOP3Mods(In, Src, SrcMods);
+ return isNoNanSrc(Src);
+}
+
bool AMDGPUDAGToDAGISel::SelectVOP3NoMods(SDValue In, SDValue &Src,
SDValue &SrcMods) const {
bool Res = SelectVOP3Mods(In, Src, SrcMods);
@@ -1607,6 +1672,50 @@ bool AMDGPUDAGToDAGISel::SelectVOP3Mods0Clamp0OMod(SDValue In, SDValue &Src,
return SelectVOP3Mods(In, Src, SrcMods);
}
+bool AMDGPUDAGToDAGISel::SelectVOP3OMods(SDValue In, SDValue &Src,
+ SDValue &Clamp, SDValue &Omod) const {
+ Src = In;
+
+ SDLoc DL(In);
+ // FIXME: Handle Clamp and Omod
+ Clamp = CurDAG->getTargetConstant(0, DL, MVT::i32);
+ Omod = CurDAG->getTargetConstant(0, DL, MVT::i32);
+
+ return true;
+}
+
+bool AMDGPUDAGToDAGISel::SelectVOP3PMods(SDValue In, SDValue &Src,
+ SDValue &SrcMods) const {
+ unsigned Mods = 0;
+ Src = In;
+
+ // FIXME: Look for fneg on separate components
+ if (Src.getOpcode() == ISD::FNEG) {
+ Mods |= (SISrcMods::NEG | SISrcMods::NEG_HI);
+ Src = Src.getOperand(0);
+ }
+
+ // Packed instructions do not have abs modifiers.
+
+ // FIXME: Handle abs/neg of individual components.
+ // FIXME: Handle swizzling with op_sel
+ Mods |= SISrcMods::OP_SEL_1;
+
+ SrcMods = CurDAG->getTargetConstant(Mods, SDLoc(In), MVT::i32);
+ return true;
+}
+
+bool AMDGPUDAGToDAGISel::SelectVOP3PMods0(SDValue In, SDValue &Src,
+ SDValue &SrcMods,
+ SDValue &Clamp) const {
+ SDLoc SL(In);
+
+ // FIXME: Handle clamp and op_sel
+ Clamp = CurDAG->getTargetConstant(0, SL, MVT::i32);
+
+ return SelectVOP3PMods(In, Src, SrcMods);
+}
+
void AMDGPUDAGToDAGISel::PostprocessISelDAG() {
const AMDGPUTargetLowering& Lowering =
*static_cast<const AMDGPUTargetLowering*>(getTargetLowering());
diff --git a/lib/Target/AMDGPU/AMDGPUISelLowering.cpp b/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
index 54caa2c5dfad..c0f336e082bd 100644
--- a/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
+++ b/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
@@ -15,6 +15,7 @@
#include "AMDGPUISelLowering.h"
#include "AMDGPU.h"
+#include "AMDGPUCallLowering.h"
#include "AMDGPUFrameLowering.h"
#include "AMDGPUIntrinsicInfo.h"
#include "AMDGPURegisterInfo.h"
@@ -43,6 +44,37 @@ static bool allocateKernArg(unsigned ValNo, MVT ValVT, MVT LocVT,
return true;
}
+static bool allocateCCRegs(unsigned ValNo, MVT ValVT, MVT LocVT,
+ CCValAssign::LocInfo LocInfo,
+ ISD::ArgFlagsTy ArgFlags, CCState &State,
+ const TargetRegisterClass *RC,
+ unsigned NumRegs) {
+ ArrayRef<MCPhysReg> RegList = makeArrayRef(RC->begin(), NumRegs);
+ unsigned RegResult = State.AllocateReg(RegList);
+ if (RegResult == AMDGPU::NoRegister)
+ return false;
+
+ State.addLoc(CCValAssign::getReg(ValNo, ValVT, RegResult, LocVT, LocInfo));
+ return true;
+}
+
+static bool allocateSGPRTuple(unsigned ValNo, MVT ValVT, MVT LocVT,
+ CCValAssign::LocInfo LocInfo,
+ ISD::ArgFlagsTy ArgFlags, CCState &State) {
+ switch (LocVT.SimpleTy) {
+ case MVT::i64:
+ case MVT::f64:
+ case MVT::v2i32:
+ case MVT::v2f32: {
+ // Up to SGPR0-SGPR39
+ return allocateCCRegs(ValNo, ValVT, LocVT, LocInfo, ArgFlags, State,
+ &AMDGPU::SGPR_64RegClass, 20);
+ }
+ default:
+ return false;
+ }
+}
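
// A rough standalone model (not the CCState API) of allocateSGPRTuple above:
// a 64-bit value takes the first free pair from the first 20 SGPR_64
// registers (SGPR0_SGPR1 .. SGPR38_SGPR39); past that window allocation
// fails, mirroring the AMDGPU::NoRegister case.
#include <bitset>

static int allocateSGPRPair(std::bitset<20> &UsedPairs) {
  for (int I = 0; I < 20; ++I)
    if (!UsedPairs[I]) {
      UsedPairs.set(I);
      return I; // pair I covers SGPR(2*I) and SGPR(2*I + 1)
    }
  return -1; // no SGPR tuple left
}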
+
#include "AMDGPUGenCallingConv.inc"
// Find a larger type to do a load / store of a vector with.
@@ -58,6 +90,7 @@ EVT AMDGPUTargetLowering::getEquivalentMemType(LLVMContext &Ctx, EVT VT) {
AMDGPUTargetLowering::AMDGPUTargetLowering(const TargetMachine &TM,
const AMDGPUSubtarget &STI)
: TargetLowering(TM), Subtarget(&STI) {
+ AMDGPUASI = AMDGPU::getAMDGPUAS(TM);
// Lower floating point store/load to integer store/load to reduce the number
// of patterns in tablegen.
setOperationAction(ISD::LOAD, MVT::f32, Promote);
@@ -211,10 +244,6 @@ AMDGPUTargetLowering::AMDGPUTargetLowering(const TargetMachine &TM,
// This is totally unsupported, just custom lower to produce an error.
setOperationAction(ISD::DYNAMIC_STACKALLOC, MVT::i32, Custom);
- // We need to custom lower some of the intrinsics
- setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::Other, Custom);
- setOperationAction(ISD::INTRINSIC_VOID, MVT::Other, Custom);
-
// Library functions. These default to Expand, but we have instructions
// for them.
setOperationAction(ISD::FCEIL, MVT::f32, Legal);
@@ -270,6 +299,7 @@ AMDGPUTargetLowering::AMDGPUTargetLowering(const TargetMachine &TM,
setOperationAction(ISD::FP16_TO_FP, MVT::f64, Expand);
setOperationAction(ISD::FP_TO_FP16, MVT::f64, Custom);
+ setOperationAction(ISD::FP_TO_FP16, MVT::f32, Custom);
const MVT ScalarIntVTs[] = { MVT::i32, MVT::i64 };
for (MVT VT : ScalarIntVTs) {
@@ -460,10 +490,11 @@ AMDGPUTargetLowering::AMDGPUTargetLowering(const TargetMachine &TM,
// N > 4 stores on the same chain.
GatherAllAliasesMaxDepth = 16;
- // FIXME: Need to really handle these.
- MaxStoresPerMemcpy = 4096;
- MaxStoresPerMemmove = 4096;
- MaxStoresPerMemset = 4096;
+ // memcpy/memmove/memset are expanded in the IR, so we shouldn't need to worry
+ // about these during lowering.
+ MaxStoresPerMemcpy = 0xffffffff;
+ MaxStoresPerMemmove = 0xffffffff;
+ MaxStoresPerMemset = 0xffffffff;
setTargetDAGCombine(ISD::BITCAST);
setTargetDAGCombine(ISD::SHL);
@@ -478,12 +509,14 @@ AMDGPUTargetLowering::AMDGPUTargetLowering(const TargetMachine &TM,
setTargetDAGCombine(ISD::FADD);
setTargetDAGCombine(ISD::FSUB);
setTargetDAGCombine(ISD::FNEG);
+ setTargetDAGCombine(ISD::FABS);
}
//===----------------------------------------------------------------------===//
// Target Information
//===----------------------------------------------------------------------===//
+LLVM_READNONE
static bool fnegFoldsIntoOp(unsigned Opc) {
switch (Opc) {
case ISD::FADD:
@@ -491,17 +524,77 @@ static bool fnegFoldsIntoOp(unsigned Opc) {
case ISD::FMUL:
case ISD::FMA:
case ISD::FMAD:
+ case ISD::FMINNUM:
+ case ISD::FMAXNUM:
case ISD::FSIN:
+ case ISD::FTRUNC:
+ case ISD::FRINT:
+ case ISD::FNEARBYINT:
case AMDGPUISD::RCP:
case AMDGPUISD::RCP_LEGACY:
case AMDGPUISD::SIN_HW:
case AMDGPUISD::FMUL_LEGACY:
+ case AMDGPUISD::FMIN_LEGACY:
+ case AMDGPUISD::FMAX_LEGACY:
return true;
default:
return false;
}
}
+/// \returns true if the operation will definitely need to use a 64-bit
+/// encoding, and thus will use a VOP3 encoding regardless of the source
+/// modifiers.
+LLVM_READONLY
+static bool opMustUseVOP3Encoding(const SDNode *N, MVT VT) {
+ return N->getNumOperands() > 2 || VT == MVT::f64;
+}
+
+// Most FP instructions support source modifiers, but this could be refined
+// slightly.
+LLVM_READONLY
+static bool hasSourceMods(const SDNode *N) {
+ if (isa<MemSDNode>(N))
+ return false;
+
+ switch (N->getOpcode()) {
+ case ISD::CopyToReg:
+ case ISD::SELECT:
+ case ISD::FDIV:
+ case ISD::FREM:
+ case ISD::INLINEASM:
+ case AMDGPUISD::INTERP_P1:
+ case AMDGPUISD::INTERP_P2:
+ case AMDGPUISD::DIV_SCALE:
+ return false;
+ default:
+ return true;
+ }
+}
+
+static bool allUsesHaveSourceMods(const SDNode *N, unsigned CostThreshold = 4) {
+ // Some users (such as 3-operand FMA/MAD) must use a VOP3 encoding, and thus
+ // it is truly free to use a source modifier in all cases. If there are
+ // multiple users, and a source modifier would force each of them into a
+ // VOP3 encoding, there will be a code size increase. Try to avoid increasing
+ // code size unless we know it will save on the instruction count.
+ unsigned NumMayIncreaseSize = 0;
+ MVT VT = N->getValueType(0).getScalarType().getSimpleVT();
+
+ // XXX - Should this limit number of uses to check?
+ for (const SDNode *U : N->uses()) {
+ if (!hasSourceMods(U))
+ return false;
+
+ if (!opMustUseVOP3Encoding(U, VT)) {
+ if (++NumMayIncreaseSize > CostThreshold)
+ return false;
+ }
+ }
+
+ return true;
+}
+
MVT AMDGPUTargetLowering::getVectorIdxTy(const DataLayout &) const {
return MVT::i32;
}
@@ -580,12 +673,17 @@ bool AMDGPUTargetLowering::isCheapToSpeculateCtlz() const {
bool AMDGPUTargetLowering::isFAbsFree(EVT VT) const {
assert(VT.isFloatingPoint());
- return VT == MVT::f32 || VT == MVT::f64 || (Subtarget->has16BitInsts() &&
- VT == MVT::f16);
+
+ // Packed operations do not have a fabs modifier.
+ return VT == MVT::f32 || VT == MVT::f64 ||
+ (Subtarget->has16BitInsts() && VT == MVT::f16);
}
bool AMDGPUTargetLowering::isFNegFree(EVT VT) const {
- return isFAbsFree(VT);
+ assert(VT.isFloatingPoint());
+ return VT == MVT::f32 || VT == MVT::f64 ||
+ (Subtarget->has16BitInsts() && VT == MVT::f16) ||
+ (Subtarget->hasVOP3PInsts() && VT == MVT::v2f16);
}
bool AMDGPUTargetLowering:: storeOfVectorConstantIsCheap(EVT MemVT,
@@ -667,6 +765,11 @@ bool AMDGPUTargetLowering::isNarrowingProfitable(EVT SrcVT, EVT DestVT) const {
// TargetLowering Callbacks
//===---------------------------------------------------------------------===//
+CCAssignFn *AMDGPUCallLowering::CCAssignFnForCall(CallingConv::ID CC,
+ bool IsVarArg) const {
+ return CC_AMDGPU;
+}
+
/// The SelectionDAGBuilder will automatically promote function arguments
/// with illegal types. However, this does not work for the AMDGPU targets
/// since the function arguments are stored in memory as these illegal types.
@@ -764,11 +867,6 @@ void AMDGPUTargetLowering::analyzeFormalArgumentsCompute(CCState &State,
}
}
-void AMDGPUTargetLowering::AnalyzeFormalArguments(CCState &State,
- const SmallVectorImpl<ISD::InputArg> &Ins) const {
- State.AnalyzeFormalArguments(Ins, CC_AMDGPU);
-}
-
void AMDGPUTargetLowering::AnalyzeReturn(CCState &State,
const SmallVectorImpl<ISD::OutputArg> &Outs) const {
@@ -788,6 +886,24 @@ AMDGPUTargetLowering::LowerReturn(SDValue Chain, CallingConv::ID CallConv,
// Target specific lowering
//===---------------------------------------------------------------------===//
+/// Selects the correct CCAssignFn for a given CallingConvention value.
+CCAssignFn *AMDGPUTargetLowering::CCAssignFnForCall(CallingConv::ID CC,
+ bool IsVarArg) {
+ switch (CC) {
+ case CallingConv::C:
+ case CallingConv::AMDGPU_KERNEL:
+ case CallingConv::SPIR_KERNEL:
+ return CC_AMDGPU_Kernel;
+ case CallingConv::AMDGPU_VS:
+ case CallingConv::AMDGPU_GS:
+ case CallingConv::AMDGPU_PS:
+ case CallingConv::AMDGPU_CS:
+ return CC_AMDGPU;
+ default:
+ report_fatal_error("Unsupported calling convention.");
+ }
+}
+
SDValue AMDGPUTargetLowering::LowerCall(CallLoweringInfo &CLI,
SmallVectorImpl<SDValue> &InVals) const {
SDValue Callee = CLI.Callee;
@@ -829,14 +945,13 @@ SDValue AMDGPUTargetLowering::LowerOperation(SDValue Op,
SelectionDAG &DAG) const {
switch (Op.getOpcode()) {
default:
- Op->dump(&DAG);
+ Op->print(errs(), &DAG);
llvm_unreachable("Custom lowering code for this"
"instruction is not implemented yet!");
break;
case ISD::SIGN_EXTEND_INREG: return LowerSIGN_EXTEND_INREG(Op, DAG);
case ISD::CONCAT_VECTORS: return LowerCONCAT_VECTORS(Op, DAG);
case ISD::EXTRACT_SUBVECTOR: return LowerEXTRACT_SUBVECTOR(Op, DAG);
- case ISD::INTRINSIC_WO_CHAIN: return LowerINTRINSIC_WO_CHAIN(Op, DAG);
case ISD::UDIVREM: return LowerUDIVREM(Op, DAG);
case ISD::SDIVREM: return LowerSDIVREM(Op, DAG);
case ISD::FREM: return LowerFREM(Op, DAG);
@@ -892,19 +1007,16 @@ SDValue AMDGPUTargetLowering::LowerGlobalAddress(AMDGPUMachineFunction* MFI,
GlobalAddressSDNode *G = cast<GlobalAddressSDNode>(Op);
const GlobalValue *GV = G->getGlobal();
- switch (G->getAddressSpace()) {
- case AMDGPUAS::LOCAL_ADDRESS: {
+ if (G->getAddressSpace() == AMDGPUASI.LOCAL_ADDRESS) {
// XXX: What does the value of G->getOffset() mean?
assert(G->getOffset() == 0 &&
"Do not know what to do with an non-zero offset");
// TODO: We could emit code to handle the initialization somewhere.
- if (hasDefinedInitializer(GV))
- break;
-
- unsigned Offset = MFI->allocateLDSGlobal(DL, *GV);
- return DAG.getConstant(Offset, SDLoc(Op), Op.getValueType());
- }
+ if (!hasDefinedInitializer(GV)) {
+ unsigned Offset = MFI->allocateLDSGlobal(DL, *GV);
+ return DAG.getConstant(Offset, SDLoc(Op), Op.getValueType());
+ }
}
const Function &Fn = *DAG.getMachineFunction().getFunction();
@@ -936,41 +1048,12 @@ SDValue AMDGPUTargetLowering::LowerEXTRACT_SUBVECTOR(SDValue Op,
return DAG.getBuildVector(Op.getValueType(), SDLoc(Op), Args);
}
-SDValue AMDGPUTargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op,
- SelectionDAG &DAG) const {
- unsigned IntrinsicID = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue();
- SDLoc DL(Op);
- EVT VT = Op.getValueType();
-
- switch (IntrinsicID) {
- default: return Op;
- case AMDGPUIntrinsic::AMDGPU_clamp: // Legacy name.
- return DAG.getNode(AMDGPUISD::CLAMP, DL, VT,
- Op.getOperand(1), Op.getOperand(2), Op.getOperand(3));
-
- case AMDGPUIntrinsic::AMDGPU_bfe_i32:
- return DAG.getNode(AMDGPUISD::BFE_I32, DL, VT,
- Op.getOperand(1),
- Op.getOperand(2),
- Op.getOperand(3));
-
- case AMDGPUIntrinsic::AMDGPU_bfe_u32:
- return DAG.getNode(AMDGPUISD::BFE_U32, DL, VT,
- Op.getOperand(1),
- Op.getOperand(2),
- Op.getOperand(3));
- }
-}
-
/// \brief Generate Min/Max node
-SDValue AMDGPUTargetLowering::CombineFMinMaxLegacy(const SDLoc &DL, EVT VT,
+SDValue AMDGPUTargetLowering::combineFMinMaxLegacy(const SDLoc &DL, EVT VT,
SDValue LHS, SDValue RHS,
SDValue True, SDValue False,
SDValue CC,
DAGCombinerInfo &DCI) const {
- if (Subtarget->getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS)
- return SDValue();
-
if (!(LHS == True && RHS == False) && !(LHS == False && RHS == True))
return SDValue();
@@ -1228,7 +1311,10 @@ SDValue AMDGPUTargetLowering::LowerDIVREM24(SDValue Op, SelectionDAG &DAG,
SDValue fqneg = DAG.getNode(ISD::FNEG, DL, FltVT, fq);
// float fr = mad(fqneg, fb, fa);
- SDValue fr = DAG.getNode(ISD::FMAD, DL, FltVT, fqneg, fb, fa);
+ unsigned OpCode = Subtarget->hasFP32Denormals() ?
+ (unsigned)AMDGPUISD::FMAD_FTZ :
+ (unsigned)ISD::FMAD;
+ SDValue fr = DAG.getNode(OpCode, DL, FltVT, fqneg, fb, fa);
// int iq = (int)fq;
SDValue iq = DAG.getNode(ToInt, DL, IntVT, fq);
@@ -1662,32 +1748,37 @@ SDValue AMDGPUTargetLowering::LowerFNEARBYINT(SDValue Op, SelectionDAG &DAG) con
}
// XXX - May require not supporting f32 denormals?
-SDValue AMDGPUTargetLowering::LowerFROUND32(SDValue Op, SelectionDAG &DAG) const {
+
+// Don't handle v2f16. The extra instructions to scalarize and repack around the
+// compare and vselect end up producing worse code than scalarizing the whole
+// operation.
+SDValue AMDGPUTargetLowering::LowerFROUND32_16(SDValue Op, SelectionDAG &DAG) const {
SDLoc SL(Op);
SDValue X = Op.getOperand(0);
+ EVT VT = Op.getValueType();
- SDValue T = DAG.getNode(ISD::FTRUNC, SL, MVT::f32, X);
+ SDValue T = DAG.getNode(ISD::FTRUNC, SL, VT, X);
// TODO: Should this propagate fast-math-flags?
- SDValue Diff = DAG.getNode(ISD::FSUB, SL, MVT::f32, X, T);
+ SDValue Diff = DAG.getNode(ISD::FSUB, SL, VT, X, T);
- SDValue AbsDiff = DAG.getNode(ISD::FABS, SL, MVT::f32, Diff);
+ SDValue AbsDiff = DAG.getNode(ISD::FABS, SL, VT, Diff);
- const SDValue Zero = DAG.getConstantFP(0.0, SL, MVT::f32);
- const SDValue One = DAG.getConstantFP(1.0, SL, MVT::f32);
- const SDValue Half = DAG.getConstantFP(0.5, SL, MVT::f32);
+ const SDValue Zero = DAG.getConstantFP(0.0, SL, VT);
+ const SDValue One = DAG.getConstantFP(1.0, SL, VT);
+ const SDValue Half = DAG.getConstantFP(0.5, SL, VT);
- SDValue SignOne = DAG.getNode(ISD::FCOPYSIGN, SL, MVT::f32, One, X);
+ SDValue SignOne = DAG.getNode(ISD::FCOPYSIGN, SL, VT, One, X);
EVT SetCCVT =
- getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), MVT::f32);
+ getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), VT);
SDValue Cmp = DAG.getSetCC(SL, SetCCVT, AbsDiff, Half, ISD::SETOGE);
- SDValue Sel = DAG.getNode(ISD::SELECT, SL, MVT::f32, Cmp, SignOne, Zero);
+ SDValue Sel = DAG.getNode(ISD::SELECT, SL, VT, Cmp, SignOne, Zero);
- return DAG.getNode(ISD::FADD, SL, MVT::f32, T, Sel);
+ return DAG.getNode(ISD::FADD, SL, VT, T, Sel);
}
SDValue AMDGPUTargetLowering::LowerFROUND64(SDValue Op, SelectionDAG &DAG) const {
@@ -1750,8 +1841,8 @@ SDValue AMDGPUTargetLowering::LowerFROUND64(SDValue Op, SelectionDAG &DAG) const
SDValue AMDGPUTargetLowering::LowerFROUND(SDValue Op, SelectionDAG &DAG) const {
EVT VT = Op.getValueType();
- if (VT == MVT::f32)
- return LowerFROUND32(Op, DAG);
+ if (VT == MVT::f32 || VT == MVT::f16)
+ return LowerFROUND32_16(Op, DAG);
if (VT == MVT::f64)
return LowerFROUND64(Op, DAG);
@@ -2030,15 +2121,19 @@ SDValue AMDGPUTargetLowering::LowerFP64_TO_INT(SDValue Op, SelectionDAG &DAG,
}
SDValue AMDGPUTargetLowering::LowerFP_TO_FP16(SDValue Op, SelectionDAG &DAG) const {
+ SDLoc DL(Op);
+ SDValue N0 = Op.getOperand(0);
+
+ // Convert to target node to get known bits
+ if (N0.getValueType() == MVT::f32)
+ return DAG.getNode(AMDGPUISD::FP_TO_FP16, DL, Op.getValueType(), N0);
if (getTargetMachine().Options.UnsafeFPMath) {
// There is a generic expand for FP_TO_FP16 with unsafe fast math.
return SDValue();
}
- SDLoc DL(Op);
- SDValue N0 = Op.getOperand(0);
- assert (N0.getSimpleValueType() == MVT::f64);
+ assert(N0.getSimpleValueType() == MVT::f64);
// f64 -> f16 conversion using round-to-nearest-even rounding mode.
const unsigned ExpMask = 0x7ff;
@@ -2379,6 +2474,28 @@ SDValue AMDGPUTargetLowering::performStoreCombine(SDNode *N,
SN->getBasePtr(), SN->getMemOperand());
}
+SDValue AMDGPUTargetLowering::performClampCombine(SDNode *N,
+ DAGCombinerInfo &DCI) const {
+ ConstantFPSDNode *CSrc = dyn_cast<ConstantFPSDNode>(N->getOperand(0));
+ if (!CSrc)
+ return SDValue();
+
+ const APFloat &F = CSrc->getValueAPF();
+ APFloat Zero = APFloat::getZero(F.getSemantics());
+ APFloat::cmpResult Cmp0 = F.compare(Zero);
+ if (Cmp0 == APFloat::cmpLessThan ||
+ (Cmp0 == APFloat::cmpUnordered && Subtarget->enableDX10Clamp())) {
+ return DCI.DAG.getConstantFP(Zero, SDLoc(N), N->getValueType(0));
+ }
+
+ APFloat One(F.getSemantics(), "1.0");
+ APFloat::cmpResult Cmp1 = F.compare(One);
+ if (Cmp1 == APFloat::cmpGreaterThan)
+ return DCI.DAG.getConstantFP(One, SDLoc(N), N->getValueType(0));
+
+ return SDValue(CSrc, 0);
+}
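
// A standalone check (not LLVM code) of the constant fold in
// performClampCombine above: clamp(x) = median(0.0, x, 1.0), with NaN folding
// to 0.0 only when the DX10 clamp mode is enabled, and surviving otherwise.
#include <cassert>
#include <cmath>

static float foldClamp(float X, bool DX10Clamp) {
  if (X < 0.0f || (std::isnan(X) && DX10Clamp))
    return 0.0f;
  if (X > 1.0f)
    return 1.0f;
  return X; // NaN passes through when DX10 clamp is off
}

int main() {
  assert(foldClamp(-2.5f, false) == 0.0f);
  assert(foldClamp(3.0f, true) == 1.0f);
  assert(foldClamp(NAN, true) == 0.0f);
  assert(std::isnan(foldClamp(NAN, false)));
}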
+
/// Split the 64-bit value \p LHS into two 32-bit components, and perform the
/// binary operation \p Opc to it with the corresponding constant operands.
SDValue AMDGPUTargetLowering::splitBinaryBitConstantOpImpl(
@@ -2821,20 +2938,41 @@ SDValue AMDGPUTargetLowering::performSelectCombine(SDNode *N,
SDValue NewCond = DAG.getSetCC(SL, Cond.getValueType(), LHS, RHS, NewCC);
return DAG.getNode(ISD::SELECT, SL, VT, NewCond, False, True);
}
- }
- if (VT == MVT::f32 && Cond.hasOneUse()) {
- SDValue MinMax
- = CombineFMinMaxLegacy(SDLoc(N), VT, LHS, RHS, True, False, CC, DCI);
- // Revisit this node so we can catch min3/max3/med3 patterns.
- //DCI.AddToWorklist(MinMax.getNode());
- return MinMax;
+ if (VT == MVT::f32 && Subtarget->hasFminFmaxLegacy()) {
+ SDValue MinMax
+ = combineFMinMaxLegacy(SDLoc(N), VT, LHS, RHS, True, False, CC, DCI);
+ // Revisit this node so we can catch min3/max3/med3 patterns.
+ //DCI.AddToWorklist(MinMax.getNode());
+ return MinMax;
+ }
}
// There's no reason to not do this if the condition has other uses.
return performCtlzCombine(SDLoc(N), Cond, True, False, DCI);
}
+static bool isConstantFPZero(SDValue N) {
+ if (const ConstantFPSDNode *C = isConstOrConstSplatFP(N))
+ return C->isZero() && !C->isNegative();
+ return false;
+}
+
+static unsigned inverseMinMax(unsigned Opc) {
+ switch (Opc) {
+ case ISD::FMAXNUM:
+ return ISD::FMINNUM;
+ case ISD::FMINNUM:
+ return ISD::FMAXNUM;
+ case AMDGPUISD::FMAX_LEGACY:
+ return AMDGPUISD::FMIN_LEGACY;
+ case AMDGPUISD::FMIN_LEGACY:
+ return AMDGPUISD::FMAX_LEGACY;
+ default:
+ llvm_unreachable("invalid min/max opcode");
+ }
+}
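
// A standalone check of the identity behind the fneg(min/max) combine that
// uses inverseMinMax below: -max(x, y) == min(-x, -y) and
// -min(x, y) == max(-x, -y).
#include <algorithm>
#include <cassert>

int main() {
  float X = 1.5f, Y = -2.0f;
  assert(-std::max(X, Y) == std::min(-X, -Y));
  assert(-std::min(X, Y) == std::max(-X, -Y));
}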
+
SDValue AMDGPUTargetLowering::performFNegCombine(SDNode *N,
DAGCombinerInfo &DCI) const {
SelectionDAG &DAG = DCI.DAG;
@@ -2847,10 +2985,16 @@ SDValue AMDGPUTargetLowering::performFNegCombine(SDNode *N,
// the other uses cannot, give up. This both prevents unprofitable
// transformations and infinite loops: we won't repeatedly try to fold around
// a negate that has no 'good' form.
- //
- // TODO: Check users can fold
- if (fnegFoldsIntoOp(Opc) && !N0.hasOneUse())
- return SDValue();
+ if (N0.hasOneUse()) {
+ // This may be able to fold into the source, but at a code size cost. Don't
+ // fold if the fold into the user is free.
+ if (allUsesHaveSourceMods(N, 0))
+ return SDValue();
+ } else {
+ if (fnegFoldsIntoOp(Opc) &&
+ (allUsesHaveSourceMods(N) || !allUsesHaveSourceMods(N0.getNode())))
+ return SDValue();
+ }
SDLoc SL(N);
switch (Opc) {
@@ -2872,7 +3016,7 @@ SDValue AMDGPUTargetLowering::performFNegCombine(SDNode *N,
else
RHS = RHS.getOperand(0);
- SDValue Res = DAG.getNode(ISD::FADD, SL, VT, LHS, RHS);
+ SDValue Res = DAG.getNode(ISD::FADD, SL, VT, LHS, RHS, N0->getFlags());
if (!N0.hasOneUse())
DAG.ReplaceAllUsesWith(N0, DAG.getNode(ISD::FNEG, SL, VT, Res));
return Res;
@@ -2891,7 +3035,7 @@ SDValue AMDGPUTargetLowering::performFNegCombine(SDNode *N,
else
RHS = DAG.getNode(ISD::FNEG, SL, VT, RHS);
- SDValue Res = DAG.getNode(Opc, SL, VT, LHS, RHS);
+ SDValue Res = DAG.getNode(Opc, SL, VT, LHS, RHS, N0->getFlags());
if (!N0.hasOneUse())
DAG.ReplaceAllUsesWith(N0, DAG.getNode(ISD::FNEG, SL, VT, Res));
return Res;
@@ -2923,10 +3067,40 @@ SDValue AMDGPUTargetLowering::performFNegCombine(SDNode *N,
DAG.ReplaceAllUsesWith(N0, DAG.getNode(ISD::FNEG, SL, VT, Res));
return Res;
}
+ case ISD::FMAXNUM:
+ case ISD::FMINNUM:
+ case AMDGPUISD::FMAX_LEGACY:
+ case AMDGPUISD::FMIN_LEGACY: {
+ // fneg (fmaxnum x, y) -> fminnum (fneg x), (fneg y)
+ // fneg (fminnum x, y) -> fmaxnum (fneg x), (fneg y)
+ // fneg (fmax_legacy x, y) -> fmin_legacy (fneg x), (fneg y)
+ // fneg (fmin_legacy x, y) -> fmax_legacy (fneg x), (fneg y)
+
+ SDValue LHS = N0.getOperand(0);
+ SDValue RHS = N0.getOperand(1);
+
+ // 0 doesn't have a negated inline immediate.
+ // TODO: Shouldn't fold 1/2pi either, and should be generalized to other
+ // operations.
+ if (isConstantFPZero(RHS))
+ return SDValue();
+
+ SDValue NegLHS = DAG.getNode(ISD::FNEG, SL, VT, LHS);
+ SDValue NegRHS = DAG.getNode(ISD::FNEG, SL, VT, RHS);
+ unsigned Opposite = inverseMinMax(Opc);
+
+ SDValue Res = DAG.getNode(Opposite, SL, VT, NegLHS, NegRHS, N0->getFlags());
+ if (!N0.hasOneUse())
+ DAG.ReplaceAllUsesWith(N0, DAG.getNode(ISD::FNEG, SL, VT, Res));
+ return Res;
+ }
case ISD::FP_EXTEND:
+ case ISD::FTRUNC:
+ case ISD::FRINT:
+ case ISD::FNEARBYINT: // XXX - Should fround be handled?
+ case ISD::FSIN:
case AMDGPUISD::RCP:
case AMDGPUISD::RCP_LEGACY:
- case ISD::FSIN:
case AMDGPUISD::SIN_HW: {
SDValue CvtSrc = N0.getOperand(0);
if (CvtSrc.getOpcode() == ISD::FNEG) {
@@ -2941,7 +3115,7 @@ SDValue AMDGPUTargetLowering::performFNegCombine(SDNode *N,
// (fneg (fp_extend x)) -> (fp_extend (fneg x))
// (fneg (rcp x)) -> (rcp (fneg x))
SDValue Neg = DAG.getNode(ISD::FNEG, SL, CvtSrc.getValueType(), CvtSrc);
- return DAG.getNode(Opc, SL, VT, Neg);
+ return DAG.getNode(Opc, SL, VT, Neg, N0->getFlags());
}
case ISD::FP_ROUND: {
SDValue CvtSrc = N0.getOperand(0);
@@ -2959,6 +3133,45 @@ SDValue AMDGPUTargetLowering::performFNegCombine(SDNode *N,
SDValue Neg = DAG.getNode(ISD::FNEG, SL, CvtSrc.getValueType(), CvtSrc);
return DAG.getNode(ISD::FP_ROUND, SL, VT, Neg, N0.getOperand(1));
}
+ case ISD::FP16_TO_FP: {
+ // v_cvt_f32_f16 supports source modifiers on pre-VI targets without legal
+ // f16, but legalization of f16 fneg ends up pulling it out of the source.
+ // Put the fneg back as a legal source operation that can be matched later.
+ SDLoc SL(N);
+
+ SDValue Src = N0.getOperand(0);
+ EVT SrcVT = Src.getValueType();
+
+ // fneg (fp16_to_fp x) -> fp16_to_fp (xor x, 0x8000)
+ SDValue IntFNeg = DAG.getNode(ISD::XOR, SL, SrcVT, Src,
+ DAG.getConstant(0x8000, SL, SrcVT));
+ return DAG.getNode(ISD::FP16_TO_FP, SL, N->getValueType(0), IntFNeg);
+ }
+ default:
+ return SDValue();
+ }
+}
+
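(Editorial sketch.) The FP16_TO_FP case above works because an IEEE half stores its sign in bit 15, so fneg in integer form is a single XOR. A minimal standalone illustration, with a hypothetical helper name:

#include <cassert>
#include <cstdint>

// fneg on raw IEEE half bits: flip the sign bit.
static uint16_t fnegF16Bits(uint16_t Bits) { return Bits ^ 0x8000; }

int main() {
  assert(fnegF16Bits(0x3C00) == 0xBC00); // 1.0h -> -1.0h
  assert(fnegF16Bits(0x8000) == 0x0000); // -0.0h -> +0.0h
  return 0;
}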
+SDValue AMDGPUTargetLowering::performFAbsCombine(SDNode *N,
+ DAGCombinerInfo &DCI) const {
+ SelectionDAG &DAG = DCI.DAG;
+ SDValue N0 = N->getOperand(0);
+
+ if (!N0.hasOneUse())
+ return SDValue();
+
+ switch (N0.getOpcode()) {
+ case ISD::FP16_TO_FP: {
+ assert(!Subtarget->has16BitInsts() && "should only see if f16 is illegal");
+ SDLoc SL(N);
+ SDValue Src = N0.getOperand(0);
+ EVT SrcVT = Src.getValueType();
+
+ // fabs (fp16_to_fp x) -> fp16_to_fp (and x, 0x7fff)
+ SDValue IntFAbs = DAG.getNode(ISD::AND, SL, SrcVT, Src,
+ DAG.getConstant(0x7fff, SL, SrcVT));
+ return DAG.getNode(ISD::FP16_TO_FP, SL, N->getValueType(0), IntFAbs);
+ }
default:
return SDValue();
}
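(Editorial sketch.) The FABS case is the bitwise complement of the fneg trick above: clearing bit 15 of the raw half yields the absolute value. Same assumptions as the previous sketch:

#include <cassert>
#include <cstdint>

// fabs on raw IEEE half bits: clear the sign bit.
static uint16_t fabsF16Bits(uint16_t Bits) { return Bits & 0x7fff; }

int main() {
  assert(fabsF16Bits(0xBC00) == 0x3C00); // |-1.0h| == 1.0h
  return 0;
}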
@@ -3071,6 +3284,8 @@ SDValue AMDGPUTargetLowering::PerformDAGCombine(SDNode *N,
return performSelectCombine(N, DCI);
case ISD::FNEG:
return performFNegCombine(N, DCI);
+ case ISD::FABS:
+ return performFAbsCombine(N, DCI);
case AMDGPUISD::BFE_I32:
case AMDGPUISD::BFE_U32: {
assert(!N->getValueType(0).isVector() &&
@@ -3159,6 +3374,18 @@ SDValue AMDGPUTargetLowering::PerformDAGCombine(SDNode *N,
return performLoadCombine(N, DCI);
case ISD::STORE:
return performStoreCombine(N, DCI);
+ case AMDGPUISD::CLAMP:
+ return performClampCombine(N, DCI);
+ case AMDGPUISD::RCP: {
+ if (const auto *CFP = dyn_cast<ConstantFPSDNode>(N->getOperand(0))) {
+ // XXX - Should this flush denormals?
+ const APFloat &Val = CFP->getValueAPF();
+ APFloat One(Val.getSemantics(), "1.0");
+ return DAG.getConstantFP(One / Val, SDLoc(N), N->getValueType(0));
+ }
+
+ break;
+ }
}
return SDValue();
}
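(Editorial note.) In the AMDGPUISD::RCP case above, a constant operand lets the reciprocal be computed at compile time; the denormal-flushing question flagged by the XXX comment is ignored here. In miniature:

// rcp of a known constant folds to an ordinary division at compile time,
// e.g. foldRcpConstant(2.0f) == 0.5f.
static float foldRcpConstant(float C) { return 1.0f / C; }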
@@ -3201,13 +3428,17 @@ const char* AMDGPUTargetLowering::getTargetNodeName(unsigned Opcode) const {
switch ((AMDGPUISD::NodeType)Opcode) {
case AMDGPUISD::FIRST_NUMBER: break;
// AMDIL DAG nodes
- NODE_NAME_CASE(CALL);
NODE_NAME_CASE(UMUL);
NODE_NAME_CASE(BRANCH_COND);
// AMDGPU DAG nodes
+ NODE_NAME_CASE(IF)
+ NODE_NAME_CASE(ELSE)
+ NODE_NAME_CASE(LOOP)
+ NODE_NAME_CASE(CALL)
+ NODE_NAME_CASE(RET_FLAG)
+ NODE_NAME_CASE(RETURN_TO_EPILOG)
NODE_NAME_CASE(ENDPGM)
- NODE_NAME_CASE(RETURN)
NODE_NAME_CASE(DWORDADDR)
NODE_NAME_CASE(FRACT)
NODE_NAME_CASE(SETCC)
@@ -3232,6 +3463,7 @@ const char* AMDGPUTargetLowering::getTargetNodeName(unsigned Opcode) const {
NODE_NAME_CASE(DIV_SCALE)
NODE_NAME_CASE(DIV_FMAS)
NODE_NAME_CASE(DIV_FIXUP)
+ NODE_NAME_CASE(FMAD_FTZ)
NODE_NAME_CASE(TRIG_PREOP)
NODE_NAME_CASE(RCP)
NODE_NAME_CASE(RSQ)
@@ -3265,7 +3497,6 @@ const char* AMDGPUTargetLowering::getTargetNodeName(unsigned Opcode) const {
NODE_NAME_CASE(CONST_ADDRESS)
NODE_NAME_CASE(REGISTER_LOAD)
NODE_NAME_CASE(REGISTER_STORE)
- NODE_NAME_CASE(LOAD_INPUT)
NODE_NAME_CASE(SAMPLE)
NODE_NAME_CASE(SAMPLEB)
NODE_NAME_CASE(SAMPLED)
@@ -3274,6 +3505,9 @@ const char* AMDGPUTargetLowering::getTargetNodeName(unsigned Opcode) const {
NODE_NAME_CASE(CVT_F32_UBYTE1)
NODE_NAME_CASE(CVT_F32_UBYTE2)
NODE_NAME_CASE(CVT_F32_UBYTE3)
+ NODE_NAME_CASE(CVT_PKRTZ_F16_F32)
+ NODE_NAME_CASE(FP_TO_FP16)
+ NODE_NAME_CASE(FP16_ZEXT)
NODE_NAME_CASE(BUILD_VERTICAL_VECTOR)
NODE_NAME_CASE(CONST_DATA_PTR)
NODE_NAME_CASE(PC_ADD_REL_OFFSET)
@@ -3338,13 +3572,11 @@ SDValue AMDGPUTargetLowering::getRecipEstimate(SDValue Operand,
}
void AMDGPUTargetLowering::computeKnownBitsForTargetNode(
- const SDValue Op,
- APInt &KnownZero,
- APInt &KnownOne,
- const SelectionDAG &DAG,
- unsigned Depth) const {
+ const SDValue Op, APInt &KnownZero, APInt &KnownOne,
+ const APInt &DemandedElts, const SelectionDAG &DAG, unsigned Depth) const {
- KnownZero = KnownOne = APInt(KnownOne.getBitWidth(), 0); // Don't know anything.
+ unsigned BitWidth = KnownZero.getBitWidth();
+ KnownZero = KnownOne = APInt(BitWidth, 0); // Don't know anything.
APInt KnownZero2;
APInt KnownOne2;
@@ -3365,21 +3597,27 @@ void AMDGPUTargetLowering::computeKnownBitsForTargetNode(
if (!CWidth)
return;
- unsigned BitWidth = 32;
uint32_t Width = CWidth->getZExtValue() & 0x1f;
if (Opc == AMDGPUISD::BFE_U32)
- KnownZero = APInt::getHighBitsSet(BitWidth, BitWidth - Width);
+ KnownZero = APInt::getHighBitsSet(32, 32 - Width);
break;
}
+ case AMDGPUISD::FP_TO_FP16:
+ case AMDGPUISD::FP16_ZEXT: {
+ unsigned BitWidth = KnownZero.getBitWidth();
+
+ // High bits are zero.
+ KnownZero = APInt::getHighBitsSet(BitWidth, BitWidth - 16);
+ break;
+ }
}
}
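(Editorial sketch.) The known-bits claim in the FP_TO_FP16/FP16_ZEXT case amounts to the following, assuming a 32-bit result width for illustration:

#include <cstdint>

// The f16 payload occupies the low 16 bits, so the high bits of the integer
// result are always zero; 0xFFFF0000 is APInt::getHighBitsSet(32, 16).
static bool highHalfIsZero(uint32_t ResultBits) {
  return (ResultBits & 0xFFFF0000u) == 0;
}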
unsigned AMDGPUTargetLowering::ComputeNumSignBitsForTargetNode(
- SDValue Op,
- const SelectionDAG &DAG,
- unsigned Depth) const {
+ SDValue Op, const APInt &DemandedElts, const SelectionDAG &DAG,
+ unsigned Depth) const {
switch (Op.getOpcode()) {
case AMDGPUISD::BFE_I32: {
ConstantSDNode *Width = dyn_cast<ConstantSDNode>(Op.getOperand(2));
@@ -3403,7 +3641,9 @@ unsigned AMDGPUTargetLowering::ComputeNumSignBitsForTargetNode(
case AMDGPUISD::CARRY:
case AMDGPUISD::BORROW:
return 31;
-
+ case AMDGPUISD::FP_TO_FP16:
+ case AMDGPUISD::FP16_ZEXT:
+ return 16;
default:
return 1;
}
diff --git a/lib/Target/AMDGPU/AMDGPUISelLowering.h b/lib/Target/AMDGPU/AMDGPUISelLowering.h
index f6adceac6f11..d6aa0ba92bf7 100644
--- a/lib/Target/AMDGPU/AMDGPUISelLowering.h
+++ b/lib/Target/AMDGPU/AMDGPUISelLowering.h
@@ -16,6 +16,8 @@
#ifndef LLVM_LIB_TARGET_AMDGPU_AMDGPUISELLOWERING_H
#define LLVM_LIB_TARGET_AMDGPU_AMDGPUISELLOWERING_H
+#include "AMDGPU.h"
+#include "llvm/CodeGen/CallingConvLower.h"
#include "llvm/Target/TargetLowering.h"
namespace llvm {
@@ -34,10 +36,10 @@ private:
protected:
const AMDGPUSubtarget *Subtarget;
+ AMDGPUAS AMDGPUASI;
SDValue LowerEXTRACT_SUBVECTOR(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerCONCAT_VECTORS(SDValue Op, SelectionDAG &DAG) const;
- SDValue LowerINTRINSIC_WO_CHAIN(SDValue Op, SelectionDAG &DAG) const;
/// \brief Split a vector store into multiple scalar stores.
/// \returns The resulting chain.
@@ -47,7 +49,7 @@ protected:
SDValue LowerFRINT(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerFNEARBYINT(SDValue Op, SelectionDAG &DAG) const;
- SDValue LowerFROUND32(SDValue Op, SelectionDAG &DAG) const;
+ SDValue LowerFROUND32_16(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerFROUND64(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerFROUND(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerFFLOOR(SDValue Op, SelectionDAG &DAG) const;
@@ -70,6 +72,7 @@ protected:
bool shouldCombineMemoryType(EVT VT) const;
SDValue performLoadCombine(SDNode *N, DAGCombinerInfo &DCI) const;
SDValue performStoreCombine(SDNode *N, DAGCombinerInfo &DCI) const;
+ SDValue performClampCombine(SDNode *N, DAGCombinerInfo &DCI) const;
SDValue splitBinaryBitConstantOpImpl(DAGCombinerInfo &DCI, const SDLoc &SL,
unsigned Opc, SDValue LHS,
@@ -85,6 +88,7 @@ protected:
SDValue RHS, DAGCombinerInfo &DCI) const;
SDValue performSelectCombine(SDNode *N, DAGCombinerInfo &DCI) const;
SDValue performFNegCombine(SDNode *N, DAGCombinerInfo &DCI) const;
+ SDValue performFAbsCombine(SDNode *N, DAGCombinerInfo &DCI) const;
static EVT getEquivalentMemType(LLVMContext &Context, EVT VT);
@@ -111,8 +115,6 @@ protected:
SmallVectorImpl<SDValue> &Results) const;
void analyzeFormalArgumentsCompute(CCState &State,
const SmallVectorImpl<ISD::InputArg> &Ins) const;
- void AnalyzeFormalArguments(CCState &State,
- const SmallVectorImpl<ISD::InputArg> &Ins) const;
void AnalyzeReturn(CCState &State,
const SmallVectorImpl<ISD::OutputArg> &Outs) const;
@@ -120,7 +122,7 @@ public:
AMDGPUTargetLowering(const TargetMachine &TM, const AMDGPUSubtarget &STI);
bool mayIgnoreSignedZero(SDValue Op) const {
- if (getTargetMachine().Options.UnsafeFPMath) // FIXME: nsz only
+ if (getTargetMachine().Options.NoSignedZerosFPMath)
return true;
if (const auto *BO = dyn_cast<BinaryWithFlagsSDNode>(Op))
@@ -158,6 +160,7 @@ public:
bool isCheapToSpeculateCttz() const override;
bool isCheapToSpeculateCtlz() const override;
+ static CCAssignFn *CCAssignFnForCall(CallingConv::ID CC, bool IsVarArg);
SDValue LowerReturn(SDValue Chain, CallingConv::ID CallConv, bool isVarArg,
const SmallVectorImpl<ISD::OutputArg> &Outs,
const SmallVectorImpl<SDValue> &OutVals, const SDLoc &DL,
@@ -174,7 +177,7 @@ public:
SmallVectorImpl<SDValue> &Results,
SelectionDAG &DAG) const override;
- SDValue CombineFMinMaxLegacy(const SDLoc &DL, EVT VT, SDValue LHS,
+ SDValue combineFMinMaxLegacy(const SDLoc &DL, EVT VT, SDValue LHS,
SDValue RHS, SDValue True, SDValue False,
SDValue CC, DAGCombinerInfo &DCI) const;
@@ -198,10 +201,12 @@ public:
void computeKnownBitsForTargetNode(const SDValue Op,
APInt &KnownZero,
APInt &KnownOne,
+ const APInt &DemandedElts,
const SelectionDAG &DAG,
unsigned Depth = 0) const override;
- unsigned ComputeNumSignBitsForTargetNode(SDValue Op, const SelectionDAG &DAG,
+ unsigned ComputeNumSignBitsForTargetNode(SDValue Op, const APInt &DemandedElts,
+ const SelectionDAG &DAG,
unsigned Depth = 0) const override;
/// \brief Helper function that adds Reg to the LiveIn list of the DAG's
@@ -222,6 +227,10 @@ public:
/// type of implicit parameter.
uint32_t getImplicitParameterOffset(const AMDGPUMachineFunction *MFI,
const ImplicitParameter Param) const;
+
+ AMDGPUAS getAMDGPUAS() const {
+ return AMDGPUASI;
+ }
};
namespace AMDGPUISD {
@@ -229,15 +238,34 @@ namespace AMDGPUISD {
enum NodeType : unsigned {
// AMDIL ISD Opcodes
FIRST_NUMBER = ISD::BUILTIN_OP_END,
- CALL, // Function call based on a single integer
UMUL, // 32bit unsigned multiplication
BRANCH_COND,
// End AMDIL ISD Opcodes
+
+ // Function call.
+ CALL,
+
+ // Masked control flow nodes.
+ IF,
+ ELSE,
+ LOOP,
+
+ // A uniform kernel return that terminates the wavefront.
ENDPGM,
- RETURN,
+
+ // Return to a shader part's epilog code.
+ RETURN_TO_EPILOG,
+
+ // Return with values from a non-entry function.
+ RET_FLAG,
+
DWORDADDR,
FRACT,
+
+ /// CLAMP a value between 0.0 and 1.0. NaN is clamped to 0, following the
+ /// clamp output modifier behavior with dx10_enable.
CLAMP,
+
// This is SETCC with the full mask result which is used for a compare with a
// result bit per item in the wavefront.
SETCC,
@@ -265,6 +293,9 @@ enum NodeType : unsigned {
DIV_SCALE,
DIV_FMAS,
DIV_FIXUP,
+ // Emitted instead of ISD::FMAD when f32 denormals are enabled, because
+ // mac/mad is treated as an illegal operation in that mode.
+ FMAD_FTZ,
TRIG_PREOP, // 1 ULP max error for f64
// RCP, RSQ - For f32, 1 ULP max error, no denormal handling.
@@ -301,7 +332,6 @@ enum NodeType : unsigned {
CONST_ADDRESS,
REGISTER_LOAD,
REGISTER_STORE,
- LOAD_INPUT,
SAMPLE,
SAMPLEB,
SAMPLED,
@@ -312,6 +342,18 @@ enum NodeType : unsigned {
CVT_F32_UBYTE1,
CVT_F32_UBYTE2,
CVT_F32_UBYTE3,
+
+ // Convert two f32 values into a single register holding two packed f16
+ // values, with round to zero.
+ CVT_PKRTZ_F16_F32,
+
+ // Same as the standard node, except the high bits of the resulting integer
+ // are known 0.
+ FP_TO_FP16,
+
+ // Wrapper around fp16 results that are known to zero the high bits.
+ FP16_ZEXT,
+
/// This node is for VLIW targets and it is used to represent a vector
/// that is stored in consecutive registers with the same channel.
/// For example:
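(Editorial sketch.) For CVT_PKRTZ_F16_F32 above, the packed layout reduces to plain bit manipulation; the f32-to-f16 round-to-zero conversion itself is omitted, and src0 is assumed to land in the low half:

#include <cstdint>

// Pack two already-converted f16 bit patterns into one 32-bit register.
static uint32_t packF16Pair(uint16_t Src0, uint16_t Src1) {
  return uint32_t(Src0) | (uint32_t(Src1) << 16);
}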
diff --git a/lib/Target/AMDGPU/AMDGPUInstrInfo.cpp b/lib/Target/AMDGPU/AMDGPUInstrInfo.cpp
index e4dc6599e156..a01f5d37c7c1 100644
--- a/lib/Target/AMDGPU/AMDGPUInstrInfo.cpp
+++ b/lib/Target/AMDGPU/AMDGPUInstrInfo.cpp
@@ -30,7 +30,7 @@ using namespace llvm;
void AMDGPUInstrInfo::anchor() {}
AMDGPUInstrInfo::AMDGPUInstrInfo(const AMDGPUSubtarget &ST)
- : AMDGPUGenInstrInfo(-1, -1), ST(ST) {}
+ : AMDGPUGenInstrInfo(-1, -1), ST(ST), AMDGPUASI(ST.getAMDGPUAS()) {}
// FIXME: This behaves strangely. If, for example, you have 32 load + stores,
// the first 16 loads will be interleaved with the stores, and the next 16 will
@@ -86,6 +86,7 @@ static SIEncodingFamily subtargetEncodingFamily(const AMDGPUSubtarget &ST) {
case AMDGPUSubtarget::SEA_ISLANDS:
return SIEncodingFamily::SI;
case AMDGPUSubtarget::VOLCANIC_ISLANDS:
+ case AMDGPUSubtarget::GFX9:
return SIEncodingFamily::VI;
// FIXME: This should never be called for r600 GPUs.
diff --git a/lib/Target/AMDGPU/AMDGPUInstrInfo.h b/lib/Target/AMDGPU/AMDGPUInstrInfo.h
index bd8e389639f5..12caa5118342 100644
--- a/lib/Target/AMDGPU/AMDGPUInstrInfo.h
+++ b/lib/Target/AMDGPU/AMDGPUInstrInfo.h
@@ -16,11 +16,11 @@
#ifndef LLVM_LIB_TARGET_AMDGPU_AMDGPUINSTRINFO_H
#define LLVM_LIB_TARGET_AMDGPU_AMDGPUINSTRINFO_H
+#include "AMDGPU.h"
#include "llvm/Target/TargetInstrInfo.h"
#include "Utils/AMDGPUBaseInfo.h"
#define GET_INSTRINFO_HEADER
-#define GET_INSTRINFO_ENUM
#include "AMDGPUGenInstrInfo.inc"
namespace llvm {
@@ -35,6 +35,8 @@ private:
const AMDGPUSubtarget &ST;
virtual void anchor();
+protected:
+ AMDGPUAS AMDGPUASI;
public:
explicit AMDGPUInstrInfo(const AMDGPUSubtarget &st);
diff --git a/lib/Target/AMDGPU/AMDGPUInstrInfo.td b/lib/Target/AMDGPU/AMDGPUInstrInfo.td
index d7fa28bdc001..56f060984f08 100644
--- a/lib/Target/AMDGPU/AMDGPUInstrInfo.td
+++ b/lib/Target/AMDGPU/AMDGPUInstrInfo.td
@@ -31,6 +31,10 @@ def AMDGPUFPClassOp : SDTypeProfile<1, 2,
[SDTCisInt<0>, SDTCisFP<1>, SDTCisInt<2>]
>;
+def AMDGPUFPPackOp : SDTypeProfile<1, 2,
+ [SDTCisFP<1>, SDTCisSameAs<1, 2>]
+>;
+
def AMDGPUDivScaleOp : SDTypeProfile<2, 3,
[SDTCisFP<0>, SDTCisInt<1>, SDTCisSameAs<0, 2>, SDTCisSameAs<0, 3>, SDTCisSameAs<0, 4>]
>;
@@ -42,10 +46,38 @@ def AMDGPUFmasOp : SDTypeProfile<1, 4,
def AMDGPUKillSDT : SDTypeProfile<0, 1, [SDTCisInt<0>]>;
+def AMDGPUIfOp : SDTypeProfile<1, 2,
+ [SDTCisVT<0, i64>, SDTCisVT<1, i1>, SDTCisVT<2, OtherVT>]
+>;
+
+def AMDGPUElseOp : SDTypeProfile<1, 2,
+ [SDTCisVT<0, i64>, SDTCisVT<1, i64>, SDTCisVT<2, OtherVT>]
+>;
+
+def AMDGPULoopOp : SDTypeProfile<0, 2,
+ [SDTCisVT<0, i64>, SDTCisVT<1, OtherVT>]
+>;
+
+def AMDGPUBreakOp : SDTypeProfile<1, 1,
+ [SDTCisVT<0, i64>, SDTCisVT<1, i64>]
+>;
+
+def AMDGPUIfBreakOp : SDTypeProfile<1, 2,
+ [SDTCisVT<0, i64>, SDTCisVT<1, i1>, SDTCisVT<2, i64>]
+>;
+
+def AMDGPUElseBreakOp : SDTypeProfile<1, 2,
+ [SDTCisVT<0, i64>, SDTCisVT<1, i64>, SDTCisVT<2, i64>]
+>;
+
//===----------------------------------------------------------------------===//
// AMDGPU DAG Nodes
//
+def AMDGPUif : SDNode<"AMDGPUISD::IF", AMDGPUIfOp, [SDNPHasChain]>;
+def AMDGPUelse : SDNode<"AMDGPUISD::ELSE", AMDGPUElseOp, [SDNPHasChain]>;
+def AMDGPUloop : SDNode<"AMDGPUISD::LOOP", AMDGPULoopOp, [SDNPHasChain]>;
+
def AMDGPUconstdata_ptr : SDNode<
"AMDGPUISD::CONST_DATA_PTR", SDTypeProfile <1, 1, [SDTCisVT<0, iPTR>,
SDTCisVT<0, iPTR>]>
@@ -78,6 +110,11 @@ def AMDGPUrsq_clamp : SDNode<"AMDGPUISD::RSQ_CLAMP", SDTFPUnaryOp>;
def AMDGPUldexp : SDNode<"AMDGPUISD::LDEXP", AMDGPULdExpOp>;
+def AMDGPUpkrtz_f16_f32 : SDNode<"AMDGPUISD::CVT_PKRTZ_F16_F32", AMDGPUFPPackOp>;
+def AMDGPUfp_to_f16 : SDNode<"AMDGPUISD::FP_TO_FP16", SDTFPToIntOp>;
+def AMDGPUfp16_zext : SDNode<"AMDGPUISD::FP16_ZEXT", SDTFPToIntOp>;
+
def AMDGPUfp_class : SDNode<"AMDGPUISD::FP_CLASS", AMDGPUFPClassOp>;
// out = max(a, b) a and b are floats, where a nan comparison fails.
@@ -92,17 +129,7 @@ def AMDGPUfmul_legacy : SDNode<"AMDGPUISD::FMUL_LEGACY", SDTFPBinOp,
[SDNPCommutative, SDNPAssociative]
>;
-def AMDGPUclamp : SDNode<"AMDGPUISD::CLAMP", SDTFPTernaryOp, []>;
-
-// out = max(a, b) a and b are signed ints
-def AMDGPUsmax : SDNode<"AMDGPUISD::SMAX", SDTIntBinOp,
- [SDNPCommutative, SDNPAssociative]
->;
-
-// out = max(a, b) a and b are unsigned ints
-def AMDGPUumax : SDNode<"AMDGPUISD::UMAX", SDTIntBinOp,
- [SDNPCommutative, SDNPAssociative]
->;
+def AMDGPUclamp : SDNode<"AMDGPUISD::CLAMP", SDTFPUnaryOp>;
// out = min(a, b) a and b are floats, where a nan comparison fails.
def AMDGPUfmin_legacy : SDNode<"AMDGPUISD::FMIN_LEGACY", SDTFPBinOp,
@@ -194,6 +221,8 @@ def AMDGPUdiv_fmas : SDNode<"AMDGPUISD::DIV_FMAS", AMDGPUFmasOp>;
// Denominator, src2 = Numerator).
def AMDGPUdiv_fixup : SDNode<"AMDGPUISD::DIV_FIXUP", SDTFPTernaryOp>;
+def AMDGPUfmad_ftz : SDNode<"AMDGPUISD::FMAD_FTZ", SDTFPTernaryOp>;
+
// Look Up 2.0 / pi src0 with segment select src1[4:0]
def AMDGPUtrig_preop : SDNode<"AMDGPUISD::TRIG_PREOP", AMDGPUTrigPreOp>;
@@ -291,15 +320,16 @@ def AMDGPUkill : SDNode<"AMDGPUISD::KILL", AMDGPUKillSDT,
// SI+ export
def AMDGPUExportOp : SDTypeProfile<0, 8, [
- SDTCisInt<0>, // i8 en
- SDTCisInt<1>, // i1 vm
+ SDTCisInt<0>, // i8 tgt
+ SDTCisInt<1>, // i8 en
+ // i32 or f32 src0
+ SDTCisSameAs<3, 2>, // f32 src1
+ SDTCisSameAs<4, 2>, // f32 src2
+ SDTCisSameAs<5, 2>, // f32 src3
+ SDTCisInt<6>, // i1 compr
// skip done
- SDTCisInt<2>, // i8 tgt
- SDTCisSameAs<3, 1>, // i1 compr
- SDTCisFP<4>, // f32 src0
- SDTCisSameAs<5, 4>, // f32 src1
- SDTCisSameAs<6, 4>, // f32 src2
- SDTCisSameAs<7, 4> // f32 src3
+ SDTCisInt<1> // i1 vm
]>;
def AMDGPUexport: SDNode<"AMDGPUISD::EXPORT", AMDGPUExportOp,
@@ -333,5 +363,9 @@ def IL_brcond : SDNode<"AMDGPUISD::BRANCH_COND", SDTIL_BRCond, [SDNPHasChai
def AMDGPUendpgm : SDNode<"AMDGPUISD::ENDPGM", SDTNone,
[SDNPHasChain, SDNPOptInGlue]>;
-def AMDGPUreturn : SDNode<"AMDGPUISD::RETURN", SDTNone,
+def AMDGPUreturn_to_epilog : SDNode<"AMDGPUISD::RETURN_TO_EPILOG", SDTNone,
[SDNPHasChain, SDNPOptInGlue, SDNPVariadic]>;
+
+def AMDGPUret_flag : SDNode<"AMDGPUISD::RET_FLAG", SDTNone,
+ [SDNPHasChain, SDNPOptInGlue, SDNPVariadic]
+>;
diff --git a/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp b/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp
new file mode 100644
index 000000000000..8867ed689a31
--- /dev/null
+++ b/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp
@@ -0,0 +1,424 @@
+//===- AMDGPUInstructionSelector.cpp ----------------------------*- C++ -*-==//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+/// \file
+/// This file implements the targeting of the InstructionSelector class for
+/// AMDGPU.
+/// \todo This should be generated by TableGen.
+//===----------------------------------------------------------------------===//
+
+#include "AMDGPUInstructionSelector.h"
+#include "AMDGPUInstrInfo.h"
+#include "AMDGPURegisterBankInfo.h"
+#include "AMDGPURegisterInfo.h"
+#include "AMDGPUSubtarget.h"
+#include "llvm/CodeGen/MachineBasicBlock.h"
+#include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/CodeGen/MachineInstr.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/IR/Type.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/raw_ostream.h"
+
+#define DEBUG_TYPE "amdgpu-isel"
+
+using namespace llvm;
+
+AMDGPUInstructionSelector::AMDGPUInstructionSelector(
+ const SISubtarget &STI, const AMDGPURegisterBankInfo &RBI)
+ : InstructionSelector(), TII(*STI.getInstrInfo()),
+ TRI(*STI.getRegisterInfo()), RBI(RBI), AMDGPUASI(STI.getAMDGPUAS()) {}
+
+MachineOperand
+AMDGPUInstructionSelector::getSubOperand64(MachineOperand &MO,
+ unsigned SubIdx) const {
+
+ MachineInstr *MI = MO.getParent();
+ MachineBasicBlock *BB = MO.getParent()->getParent();
+ MachineFunction *MF = BB->getParent();
+ MachineRegisterInfo &MRI = MF->getRegInfo();
+ unsigned DstReg = MRI.createVirtualRegister(&AMDGPU::SGPR_32RegClass);
+
+ if (MO.isReg()) {
+ unsigned ComposedSubIdx = TRI.composeSubRegIndices(MO.getSubReg(), SubIdx);
+ unsigned Reg = MO.getReg();
+ BuildMI(*BB, MI, MI->getDebugLoc(), TII.get(AMDGPU::COPY), DstReg)
+ .addReg(Reg, 0, ComposedSubIdx);
+
+ return MachineOperand::CreateReg(DstReg, MO.isDef(), MO.isImplicit(),
+ MO.isKill(), MO.isDead(), MO.isUndef(),
+ MO.isEarlyClobber(), 0, MO.isDebug(),
+ MO.isInternalRead());
+ }
+
+ assert(MO.isImm());
+
+ APInt Imm(64, MO.getImm());
+
+ switch (SubIdx) {
+ default:
+ llvm_unreachable("do not know to split immediate with this sub index.");
+ case AMDGPU::sub0:
+ return MachineOperand::CreateImm(Imm.getLoBits(32).getSExtValue());
+ case AMDGPU::sub1:
+ return MachineOperand::CreateImm(Imm.getHiBits(32).getSExtValue());
+ }
+}
+
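(Editorial sketch.) The immediate path of getSubOperand64 in miniature: sub0 selects the low 32 bits of the 64-bit immediate and sub1 the high 32 bits.

#include <cstdint>

static uint32_t subOperandLo(uint64_t Imm) { return uint32_t(Imm); }       // sub0
static uint32_t subOperandHi(uint64_t Imm) { return uint32_t(Imm >> 32); } // sub1
// e.g. Imm = 0x1122334455667788: sub0 == 0x55667788, sub1 == 0x11223344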
+bool AMDGPUInstructionSelector::selectG_ADD(MachineInstr &I) const {
+ MachineBasicBlock *BB = I.getParent();
+ MachineFunction *MF = BB->getParent();
+ MachineRegisterInfo &MRI = MF->getRegInfo();
+ unsigned Size = RBI.getSizeInBits(I.getOperand(0).getReg(), MRI, TRI);
+ unsigned DstLo = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
+ unsigned DstHi = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
+
+ if (Size != 64)
+ return false;
+
+ DebugLoc DL = I.getDebugLoc();
+
+ MachineOperand Lo1(getSubOperand64(I.getOperand(1), AMDGPU::sub0));
+ MachineOperand Lo2(getSubOperand64(I.getOperand(2), AMDGPU::sub0));
+
+ BuildMI(*BB, &I, DL, TII.get(AMDGPU::S_ADD_U32), DstLo)
+ .add(Lo1)
+ .add(Lo2);
+
+ MachineOperand Hi1(getSubOperand64(I.getOperand(1), AMDGPU::sub1));
+ MachineOperand Hi2(getSubOperand64(I.getOperand(2), AMDGPU::sub1));
+
+ BuildMI(*BB, &I, DL, TII.get(AMDGPU::S_ADDC_U32), DstHi)
+ .add(Hi1)
+ .add(Hi2);
+
+ BuildMI(*BB, &I, DL, TII.get(AMDGPU::REG_SEQUENCE), I.getOperand(0).getReg())
+ .addReg(DstLo)
+ .addImm(AMDGPU::sub0)
+ .addReg(DstHi)
+ .addImm(AMDGPU::sub1);
+
+ for (MachineOperand &MO : I.explicit_operands()) {
+ if (!MO.isReg() || TargetRegisterInfo::isPhysicalRegister(MO.getReg()))
+ continue;
+ RBI.constrainGenericRegister(MO.getReg(), AMDGPU::SReg_64RegClass, MRI);
+ }
+
+ I.eraseFromParent();
+ return true;
+}
+
+bool AMDGPUInstructionSelector::selectG_GEP(MachineInstr &I) const {
+ return selectG_ADD(I);
+}
+
+bool AMDGPUInstructionSelector::selectG_STORE(MachineInstr &I) const {
+ MachineBasicBlock *BB = I.getParent();
+ DebugLoc DL = I.getDebugLoc();
+
+ // FIXME: Select store instruction based on address space
+ MachineInstr *Flat = BuildMI(*BB, &I, DL, TII.get(AMDGPU::FLAT_STORE_DWORD))
+ .add(I.getOperand(1))
+ .add(I.getOperand(0))
+ .addImm(0)
+ .addImm(0)
+ .addImm(0);
+
+ // Now that we selected an opcode, we need to constrain the register
+ // operands to use appropriate classes.
+ bool Ret = constrainSelectedInstRegOperands(*Flat, TII, TRI, RBI);
+
+ I.eraseFromParent();
+ return Ret;
+}
+
+bool AMDGPUInstructionSelector::selectG_CONSTANT(MachineInstr &I) const {
+ MachineBasicBlock *BB = I.getParent();
+ MachineFunction *MF = BB->getParent();
+ MachineRegisterInfo &MRI = MF->getRegInfo();
+ unsigned DstReg = I.getOperand(0).getReg();
+ unsigned Size = RBI.getSizeInBits(DstReg, MRI, TRI);
+
+ if (Size == 32) {
+ I.setDesc(TII.get(AMDGPU::S_MOV_B32));
+ return constrainSelectedInstRegOperands(I, TII, TRI, RBI);
+ }
+
+ assert(Size == 64);
+
+ DebugLoc DL = I.getDebugLoc();
+ unsigned LoReg = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
+ unsigned HiReg = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
+ const APInt &Imm = I.getOperand(1).getCImm()->getValue();
+
+ BuildMI(*BB, &I, DL, TII.get(AMDGPU::S_MOV_B32), LoReg)
+ .addImm(Imm.trunc(32).getZExtValue());
+
+ BuildMI(*BB, &I, DL, TII.get(AMDGPU::S_MOV_B32), HiReg)
+ .addImm(Imm.ashr(32).getZExtValue());
+
+ BuildMI(*BB, &I, DL, TII.get(AMDGPU::REG_SEQUENCE), DstReg)
+ .addReg(LoReg)
+ .addImm(AMDGPU::sub0)
+ .addReg(HiReg)
+ .addImm(AMDGPU::sub1);
+ // We can't call constrainSelectedInstRegOperands here, because it doesn't
+ // work for target-independent opcodes.
+ I.eraseFromParent();
+ return RBI.constrainGenericRegister(DstReg, AMDGPU::SReg_64RegClass, MRI);
+}
+
+static bool isConstant(const MachineInstr &MI) {
+ return MI.getOpcode() == TargetOpcode::G_CONSTANT;
+}
+
+void AMDGPUInstructionSelector::getAddrModeInfo(const MachineInstr &Load,
+ const MachineRegisterInfo &MRI, SmallVectorImpl<GEPInfo> &AddrInfo) const {
+
+ const MachineInstr *PtrMI = MRI.getUniqueVRegDef(Load.getOperand(1).getReg());
+
+ assert(PtrMI);
+
+ if (PtrMI->getOpcode() != TargetOpcode::G_GEP)
+ return;
+
+ GEPInfo GEPInfo(*PtrMI);
+
+ for (unsigned i = 1, e = 3; i < e; ++i) {
+ const MachineOperand &GEPOp = PtrMI->getOperand(i);
+ const MachineInstr *OpDef = MRI.getUniqueVRegDef(GEPOp.getReg());
+ assert(OpDef);
+ if (isConstant(*OpDef)) {
+ // FIXME: Is it possible to have multiple Imm parts? Maybe if we
+ // are lacking other optimizations.
+ assert(GEPInfo.Imm == 0);
+ GEPInfo.Imm = OpDef->getOperand(1).getCImm()->getSExtValue();
+ continue;
+ }
+ const RegisterBank *OpBank = RBI.getRegBank(GEPOp.getReg(), MRI, TRI);
+ if (OpBank->getID() == AMDGPU::SGPRRegBankID)
+ GEPInfo.SgprParts.push_back(GEPOp.getReg());
+ else
+ GEPInfo.VgprParts.push_back(GEPOp.getReg());
+ }
+
+ AddrInfo.push_back(GEPInfo);
+ getAddrModeInfo(*PtrMI, MRI, AddrInfo);
+}
+
+static bool isInstrUniform(const MachineInstr &MI) {
+ if (!MI.hasOneMemOperand())
+ return false;
+
+ const MachineMemOperand *MMO = *MI.memoperands_begin();
+ const Value *Ptr = MMO->getValue();
+
+ // UndefValue means this is a load of a kernel input. These are uniform.
+ // Sometimes LDS instructions have constant pointers.
+ // If Ptr is null, then that means this mem operand contains a
+ // PseudoSourceValue like GOT.
+ if (!Ptr || isa<UndefValue>(Ptr) || isa<Argument>(Ptr) ||
+ isa<Constant>(Ptr) || isa<GlobalValue>(Ptr))
+ return true;
+
+ const Instruction *I = dyn_cast<Instruction>(Ptr);
+ return I && I->getMetadata("amdgpu.uniform");
+}
+
+static unsigned getSmrdOpcode(unsigned BaseOpcode, unsigned LoadSize) {
+
+ if (LoadSize == 32)
+ return BaseOpcode;
+
+ switch (BaseOpcode) {
+ case AMDGPU::S_LOAD_DWORD_IMM:
+ switch (LoadSize) {
+ case 64:
+ return AMDGPU::S_LOAD_DWORDX2_IMM;
+ case 128:
+ return AMDGPU::S_LOAD_DWORDX4_IMM;
+ case 256:
+ return AMDGPU::S_LOAD_DWORDX8_IMM;
+ case 512:
+ return AMDGPU::S_LOAD_DWORDX16_IMM;
+ }
+ break;
+ case AMDGPU::S_LOAD_DWORD_IMM_ci:
+ switch (LoadSize) {
+ case 64:
+ return AMDGPU::S_LOAD_DWORDX2_IMM_ci;
+ case 128:
+ return AMDGPU::S_LOAD_DWORDX4_IMM_ci;
+ case 256:
+ return AMDGPU::S_LOAD_DWORDX8_IMM_ci;
+ case 512:
+ return AMDGPU::S_LOAD_DWORDX16_IMM_ci;
+ }
+ break;
+ case AMDGPU::S_LOAD_DWORD_SGPR:
+ switch (LoadSize) {
+ case 64:
+ return AMDGPU::S_LOAD_DWORDX2_SGPR;
+ case 128:
+ return AMDGPU::S_LOAD_DWORDX4_SGPR;
+ case 256:
+ return AMDGPU::S_LOAD_DWORDX8_SGPR;
+ case 512:
+ return AMDGPU::S_LOAD_DWORDX16_SGPR;
+ }
+ break;
+ }
+ llvm_unreachable("Invalid base smrd opcode or size");
+}
+
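A couple of usage examples for getSmrdOpcode, read straight off the switch above:

// getSmrdOpcode(AMDGPU::S_LOAD_DWORD_IMM, 128) -> AMDGPU::S_LOAD_DWORDX4_IMM
// getSmrdOpcode(AMDGPU::S_LOAD_DWORD_SGPR, 64) -> AMDGPU::S_LOAD_DWORDX2_SGPR
// A LoadSize of 32 returns the base opcode unchanged; any other unhandled
// combination hits the llvm_unreachable.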
+bool AMDGPUInstructionSelector::hasVgprParts(ArrayRef<GEPInfo> AddrInfo) const {
+ for (const GEPInfo &GEPInfo : AddrInfo) {
+ if (!GEPInfo.VgprParts.empty())
+ return true;
+ }
+ return false;
+}
+
+bool AMDGPUInstructionSelector::selectSMRD(MachineInstr &I,
+ ArrayRef<GEPInfo> AddrInfo) const {
+
+ if (!I.hasOneMemOperand())
+ return false;
+
+ if ((*I.memoperands_begin())->getAddrSpace() != AMDGPUASI.CONSTANT_ADDRESS)
+ return false;
+
+ if (!isInstrUniform(I))
+ return false;
+
+ if (hasVgprParts(AddrInfo))
+ return false;
+
+ MachineBasicBlock *BB = I.getParent();
+ MachineFunction *MF = BB->getParent();
+ const SISubtarget &Subtarget = MF->getSubtarget<SISubtarget>();
+ MachineRegisterInfo &MRI = MF->getRegInfo();
+ unsigned DstReg = I.getOperand(0).getReg();
+ const DebugLoc &DL = I.getDebugLoc();
+ unsigned Opcode;
+ unsigned LoadSize = RBI.getSizeInBits(DstReg, MRI, TRI);
+
+ if (!AddrInfo.empty() && AddrInfo[0].SgprParts.size() == 1) {
+
+ const GEPInfo &GEPInfo = AddrInfo[0];
+
+ unsigned PtrReg = GEPInfo.SgprParts[0];
+ int64_t EncodedImm = AMDGPU::getSMRDEncodedOffset(Subtarget, GEPInfo.Imm);
+ if (AMDGPU::isLegalSMRDImmOffset(Subtarget, GEPInfo.Imm)) {
+ Opcode = getSmrdOpcode(AMDGPU::S_LOAD_DWORD_IMM, LoadSize);
+
+ MachineInstr *SMRD = BuildMI(*BB, &I, DL, TII.get(Opcode), DstReg)
+ .addReg(PtrReg)
+ .addImm(EncodedImm)
+ .addImm(0); // glc
+ return constrainSelectedInstRegOperands(*SMRD, TII, TRI, RBI);
+ }
+
+ if (Subtarget.getGeneration() == AMDGPUSubtarget::SEA_ISLANDS &&
+ isUInt<32>(EncodedImm)) {
+ Opcode = getSmrdOpcode(AMDGPU::S_LOAD_DWORD_IMM_ci, LoadSize);
+ MachineInstr *SMRD = BuildMI(*BB, &I, DL, TII.get(Opcode), DstReg)
+ .addReg(PtrReg)
+ .addImm(EncodedImm)
+ .addImm(0); // glc
+ return constrainSelectedInstRegOperands(*SMRD, TII, TRI, RBI);
+ }
+
+ if (isUInt<32>(GEPInfo.Imm)) {
+ Opcode = getSmrdOpcode(AMDGPU::S_LOAD_DWORD_SGPR, LoadSize);
+ unsigned OffsetReg = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
+ BuildMI(*BB, &I, DL, TII.get(AMDGPU::S_MOV_B32), OffsetReg)
+ .addImm(GEPInfo.Imm);
+
+ MachineInstr *SMRD = BuildMI(*BB, &I, DL, TII.get(Opcode), DstReg)
+ .addReg(PtrReg)
+ .addReg(OffsetReg)
+ .addImm(0); // glc
+ return constrainSelectedInstRegOperands(*SMRD, TII, TRI, RBI);
+ }
+ }
+
+ unsigned PtrReg = I.getOperand(1).getReg();
+ Opcode = getSmrdOpcode(AMDGPU::S_LOAD_DWORD_IMM, LoadSize);
+ MachineInstr *SMRD = BuildMI(*BB, &I, DL, TII.get(Opcode), DstReg)
+ .addReg(PtrReg)
+ .addImm(0)
+ .addImm(0); // glc
+ return constrainSelectedInstRegOperands(*SMRD, TII, TRI, RBI);
+}
+
+bool AMDGPUInstructionSelector::selectG_LOAD(MachineInstr &I) const {
+ MachineBasicBlock *BB = I.getParent();
+ MachineFunction *MF = BB->getParent();
+ MachineRegisterInfo &MRI = MF->getRegInfo();
+ DebugLoc DL = I.getDebugLoc();
+ unsigned DstReg = I.getOperand(0).getReg();
+ unsigned PtrReg = I.getOperand(1).getReg();
+ unsigned LoadSize = RBI.getSizeInBits(DstReg, MRI, TRI);
+ unsigned Opcode;
+
+ SmallVector<GEPInfo, 4> AddrInfo;
+
+ getAddrModeInfo(I, MRI, AddrInfo);
+
+ if (selectSMRD(I, AddrInfo)) {
+ I.eraseFromParent();
+ return true;
+ }
+
+ switch (LoadSize) {
+ default:
+ llvm_unreachable("Load size not supported\n");
+ case 32:
+ Opcode = AMDGPU::FLAT_LOAD_DWORD;
+ break;
+ case 64:
+ Opcode = AMDGPU::FLAT_LOAD_DWORDX2;
+ break;
+ }
+
+ MachineInstr *Flat = BuildMI(*BB, &I, DL, TII.get(Opcode))
+ .add(I.getOperand(0))
+ .addReg(PtrReg)
+ .addImm(0)
+ .addImm(0)
+ .addImm(0);
+
+ bool Ret = constrainSelectedInstRegOperands(*Flat, TII, TRI, RBI);
+ I.eraseFromParent();
+ return Ret;
+}
+
+bool AMDGPUInstructionSelector::select(MachineInstr &I) const {
+
+ if (!isPreISelGenericOpcode(I.getOpcode()))
+ return true;
+
+ switch (I.getOpcode()) {
+ default:
+ break;
+ case TargetOpcode::G_ADD:
+ return selectG_ADD(I);
+ case TargetOpcode::G_CONSTANT:
+ return selectG_CONSTANT(I);
+ case TargetOpcode::G_GEP:
+ return selectG_GEP(I);
+ case TargetOpcode::G_LOAD:
+ return selectG_LOAD(I);
+ case TargetOpcode::G_STORE:
+ return selectG_STORE(I);
+ }
+ return false;
+}
diff --git a/lib/Target/AMDGPU/AMDGPUInstructionSelector.h b/lib/Target/AMDGPU/AMDGPUInstructionSelector.h
new file mode 100644
index 000000000000..c87102e55dfb
--- /dev/null
+++ b/lib/Target/AMDGPU/AMDGPUInstructionSelector.h
@@ -0,0 +1,67 @@
+//===- AMDGPUInstructionSelector --------------------------------*- C++ -*-==//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+/// \file
+/// This file declares the targeting of the InstructionSelector class for
+/// AMDGPU.
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_TARGET_AMDGPU_AMDGPUINSTRUCTIONSELECTOR_H
+#define LLVM_LIB_TARGET_AMDGPU_AMDGPUINSTRUCTIONSELECTOR_H
+
+#include "AMDGPU.h"
+#include "llvm/CodeGen/GlobalISel/InstructionSelector.h"
+#include "llvm/ADT/ArrayRef.h"
+#include "llvm/ADT/SmallVector.h"
+
+namespace llvm {
+
+class AMDGPUInstrInfo;
+class AMDGPURegisterBankInfo;
+class MachineInstr;
+class MachineOperand;
+class MachineRegisterInfo;
+class SIInstrInfo;
+class SIRegisterInfo;
+class SISubtarget;
+
+class AMDGPUInstructionSelector : public InstructionSelector {
+public:
+ AMDGPUInstructionSelector(const SISubtarget &STI,
+ const AMDGPURegisterBankInfo &RBI);
+
+ bool select(MachineInstr &I) const override;
+private:
+ struct GEPInfo {
+ const MachineInstr &GEP;
+ SmallVector<unsigned, 2> SgprParts;
+ SmallVector<unsigned, 2> VgprParts;
+ int64_t Imm;
+ GEPInfo(const MachineInstr &GEP) : GEP(GEP), Imm(0) { }
+ };
+
+ MachineOperand getSubOperand64(MachineOperand &MO, unsigned SubIdx) const;
+ bool selectG_CONSTANT(MachineInstr &I) const;
+ bool selectG_ADD(MachineInstr &I) const;
+ bool selectG_GEP(MachineInstr &I) const;
+ bool hasVgprParts(ArrayRef<GEPInfo> AddrInfo) const;
+ void getAddrModeInfo(const MachineInstr &Load, const MachineRegisterInfo &MRI,
+ SmallVectorImpl<GEPInfo> &AddrInfo) const;
+ bool selectSMRD(MachineInstr &I, ArrayRef<GEPInfo> AddrInfo) const;
+ bool selectG_LOAD(MachineInstr &I) const;
+ bool selectG_STORE(MachineInstr &I) const;
+
+ const SIInstrInfo &TII;
+ const SIRegisterInfo &TRI;
+ const AMDGPURegisterBankInfo &RBI;
+protected:
+ AMDGPUAS AMDGPUASI;
+};
+
+} // End llvm namespace.
+#endif
diff --git a/lib/Target/AMDGPU/AMDGPUInstructions.td b/lib/Target/AMDGPU/AMDGPUInstructions.td
index 59cba636c586..b8d681298dee 100644
--- a/lib/Target/AMDGPU/AMDGPUInstructions.td
+++ b/lib/Target/AMDGPU/AMDGPUInstructions.td
@@ -72,6 +72,49 @@ def u8imm : Operand<i8> {
def brtarget : Operand<OtherVT>;
//===----------------------------------------------------------------------===//
+// Misc. PatFrags
+//===----------------------------------------------------------------------===//
+
+class HasOneUseUnaryOp<SDPatternOperator op> : PatFrag<
+ (ops node:$src0),
+ (op $src0),
+ [{ return N->hasOneUse(); }]
+>;
+
+class HasOneUseBinOp<SDPatternOperator op> : PatFrag<
+ (ops node:$src0, node:$src1),
+ (op $src0, $src1),
+ [{ return N->hasOneUse(); }]
+>;
+
+class HasOneUseTernaryOp<SDPatternOperator op> : PatFrag<
+ (ops node:$src0, node:$src1, node:$src2),
+ (op $src0, $src1, $src2),
+ [{ return N->hasOneUse(); }]
+>;
+
+def trunc_oneuse : HasOneUseUnaryOp<trunc>;
+
+let Properties = [SDNPCommutative, SDNPAssociative] in {
+def smax_oneuse : HasOneUseBinOp<smax>;
+def smin_oneuse : HasOneUseBinOp<smin>;
+def umax_oneuse : HasOneUseBinOp<umax>;
+def umin_oneuse : HasOneUseBinOp<umin>;
+def fminnum_oneuse : HasOneUseBinOp<fminnum>;
+def fmaxnum_oneuse : HasOneUseBinOp<fmaxnum>;
+def and_oneuse : HasOneUseBinOp<and>;
+def or_oneuse : HasOneUseBinOp<or>;
+def xor_oneuse : HasOneUseBinOp<xor>;
+} // Properties = [SDNPCommutative, SDNPAssociative]
+
+def sub_oneuse : HasOneUseBinOp<sub>;
+
+def srl_oneuse : HasOneUseBinOp<srl>;
+def shl_oneuse : HasOneUseBinOp<shl>;
+
+def select_oneuse : HasOneUseTernaryOp<select>;
+
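(Editorial note.) Each *_oneuse fragment above matches its node only when that node has a single use, which keeps multi-use intermediates from being duplicated into combined instructions. For example:

// (smax (smin_oneuse x, y), z) matches only if the smin result feeds nothing
// but this smax; otherwise the intermediate value would still need computing,
// defeating the point of fusing it (see the med3 and BFE patterns below).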
+//===----------------------------------------------------------------------===//
// PatLeafs for floating-point comparisons
//===----------------------------------------------------------------------===//
@@ -157,27 +200,11 @@ def COND_NULL : PatLeaf <
//===----------------------------------------------------------------------===//
-// Misc. PatFrags
-//===----------------------------------------------------------------------===//
-
-class HasOneUseBinOp<SDPatternOperator op> : PatFrag<
- (ops node:$src0, node:$src1),
- (op $src0, $src1),
- [{ return N->hasOneUse(); }]
->;
-
-class HasOneUseTernaryOp<SDPatternOperator op> : PatFrag<
- (ops node:$src0, node:$src1, node:$src2),
- (op $src0, $src1, $src2),
- [{ return N->hasOneUse(); }]
->;
-
-//===----------------------------------------------------------------------===//
// Load/Store Pattern Fragments
//===----------------------------------------------------------------------===//
class PrivateMemOp <dag ops, dag frag> : PatFrag <ops, frag, [{
- return cast<MemSDNode>(N)->getAddressSpace() == AMDGPUAS::PRIVATE_ADDRESS;
+ return cast<MemSDNode>(N)->getAddressSpace() == AMDGPUASI.PRIVATE_ADDRESS;
}]>;
class PrivateLoad <SDPatternOperator op> : PrivateMemOp <
@@ -195,7 +222,7 @@ def truncstorei16_private : PrivateStore <truncstorei16>;
def store_private : PrivateStore <store>;
class GlobalMemOp <dag ops, dag frag> : PatFrag <ops, frag, [{
- return cast<MemSDNode>(N)->getAddressSpace() == AMDGPUAS::GLOBAL_ADDRESS;
+ return cast<MemSDNode>(N)->getAddressSpace() == AMDGPUASI.GLOBAL_ADDRESS;
}]>;
// Global address space loads
@@ -215,7 +242,7 @@ def global_store_atomic : GlobalStore<atomic_store>;
class ConstantMemOp <dag ops, dag frag> : PatFrag <ops, frag, [{
- return cast<MemSDNode>(N)->getAddressSpace() == AMDGPUAS::CONSTANT_ADDRESS;
+ return cast<MemSDNode>(N)->getAddressSpace() == AMDGPUASI.CONSTANT_ADDRESS;
}]>;
// Constant address space loads
@@ -226,7 +253,7 @@ class ConstantLoad <SDPatternOperator op> : ConstantMemOp <
def constant_load : ConstantLoad<load>;
class LocalMemOp <dag ops, dag frag> : PatFrag <ops, frag, [{
- return cast<MemSDNode>(N)->getAddressSpace() == AMDGPUAS::LOCAL_ADDRESS;
+ return cast<MemSDNode>(N)->getAddressSpace() == AMDGPUASI.LOCAL_ADDRESS;
}]>;
// Local address space loads
@@ -239,7 +266,7 @@ class LocalStore <SDPatternOperator op> : LocalMemOp <
>;
class FlatMemOp <dag ops, dag frag> : PatFrag <ops, frag, [{
- return cast<MemSDNode>(N)->getAddressSPace() == AMDGPUAS::FLAT_ADDRESS;
+ return cast<MemSDNode>(N)->getAddressSPace() == AMDGPUASI.FLAT_ADDRESS;
}]>;
class FlatLoad <SDPatternOperator op> : FlatMemOp <
@@ -321,7 +348,7 @@ def local_store_aligned8bytes : Aligned8Bytes <
class local_binary_atomic_op<SDNode atomic_op> :
PatFrag<(ops node:$ptr, node:$value),
(atomic_op node:$ptr, node:$value), [{
- return cast<MemSDNode>(N)->getAddressSpace() == AMDGPUAS::LOCAL_ADDRESS;
+ return cast<MemSDNode>(N)->getAddressSpace() == AMDGPUASI.LOCAL_ADDRESS;
}]>;
@@ -339,7 +366,7 @@ def atomic_load_umax_local : local_binary_atomic_op<atomic_load_umax>;
def mskor_global : PatFrag<(ops node:$val, node:$ptr),
(AMDGPUstore_mskor node:$val, node:$ptr), [{
- return cast<MemSDNode>(N)->getAddressSpace() == AMDGPUAS::GLOBAL_ADDRESS;
+ return cast<MemSDNode>(N)->getAddressSpace() == AMDGPUASI.GLOBAL_ADDRESS;
}]>;
multiclass AtomicCmpSwapLocal <SDNode cmp_swap_node> {
@@ -349,7 +376,7 @@ multiclass AtomicCmpSwapLocal <SDNode cmp_swap_node> {
(cmp_swap_node node:$ptr, node:$cmp, node:$swap), [{
AtomicSDNode *AN = cast<AtomicSDNode>(N);
return AN->getMemoryVT() == MVT::i32 &&
- AN->getAddressSpace() == AMDGPUAS::LOCAL_ADDRESS;
+ AN->getAddressSpace() == AMDGPUASI.LOCAL_ADDRESS;
}]>;
def _64_local : PatFrag<
@@ -357,7 +384,7 @@ multiclass AtomicCmpSwapLocal <SDNode cmp_swap_node> {
(cmp_swap_node node:$ptr, node:$cmp, node:$swap), [{
AtomicSDNode *AN = cast<AtomicSDNode>(N);
return AN->getMemoryVT() == MVT::i64 &&
- AN->getAddressSpace() == AMDGPUAS::LOCAL_ADDRESS;
+ AN->getAddressSpace() == AMDGPUASI.LOCAL_ADDRESS;
}]>;
}
@@ -367,17 +394,17 @@ multiclass global_binary_atomic_op<SDNode atomic_op> {
def "" : PatFrag<
(ops node:$ptr, node:$value),
(atomic_op node:$ptr, node:$value),
- [{return cast<MemSDNode>(N)->getAddressSpace() == AMDGPUAS::GLOBAL_ADDRESS;}]>;
+ [{return cast<MemSDNode>(N)->getAddressSpace() == AMDGPUASI.GLOBAL_ADDRESS;}]>;
def _noret : PatFrag<
(ops node:$ptr, node:$value),
(atomic_op node:$ptr, node:$value),
- [{return cast<MemSDNode>(N)->getAddressSpace() == AMDGPUAS::GLOBAL_ADDRESS && (SDValue(N, 0).use_empty());}]>;
+ [{return cast<MemSDNode>(N)->getAddressSpace() == AMDGPUASI.GLOBAL_ADDRESS && (SDValue(N, 0).use_empty());}]>;
def _ret : PatFrag<
(ops node:$ptr, node:$value),
(atomic_op node:$ptr, node:$value),
- [{return cast<MemSDNode>(N)->getAddressSpace() == AMDGPUAS::GLOBAL_ADDRESS && (!SDValue(N, 0).use_empty());}]>;
+ [{return cast<MemSDNode>(N)->getAddressSpace() == AMDGPUASI.GLOBAL_ADDRESS && (!SDValue(N, 0).use_empty());}]>;
}
defm atomic_swap_global : global_binary_atomic_op<atomic_swap>;
@@ -395,22 +422,22 @@ defm atomic_xor_global : global_binary_atomic_op<atomic_load_xor>;
def AMDGPUatomic_cmp_swap_global : PatFrag<
(ops node:$ptr, node:$value),
(AMDGPUatomic_cmp_swap node:$ptr, node:$value),
- [{return cast<MemSDNode>(N)->getAddressSpace() == AMDGPUAS::GLOBAL_ADDRESS;}]>;
+ [{return cast<MemSDNode>(N)->getAddressSpace() == AMDGPUASI.GLOBAL_ADDRESS;}]>;
def atomic_cmp_swap_global : PatFrag<
(ops node:$ptr, node:$cmp, node:$value),
(atomic_cmp_swap node:$ptr, node:$cmp, node:$value),
- [{return cast<MemSDNode>(N)->getAddressSpace() == AMDGPUAS::GLOBAL_ADDRESS;}]>;
+ [{return cast<MemSDNode>(N)->getAddressSpace() == AMDGPUASI.GLOBAL_ADDRESS;}]>;
def atomic_cmp_swap_global_noret : PatFrag<
(ops node:$ptr, node:$cmp, node:$value),
(atomic_cmp_swap node:$ptr, node:$cmp, node:$value),
- [{return cast<MemSDNode>(N)->getAddressSpace() == AMDGPUAS::GLOBAL_ADDRESS && (SDValue(N, 0).use_empty());}]>;
+ [{return cast<MemSDNode>(N)->getAddressSpace() == AMDGPUASI.GLOBAL_ADDRESS && (SDValue(N, 0).use_empty());}]>;
def atomic_cmp_swap_global_ret : PatFrag<
(ops node:$ptr, node:$cmp, node:$value),
(atomic_cmp_swap node:$ptr, node:$cmp, node:$value),
- [{return cast<MemSDNode>(N)->getAddressSpace() == AMDGPUAS::GLOBAL_ADDRESS && (!SDValue(N, 0).use_empty());}]>;
+ [{return cast<MemSDNode>(N)->getAddressSpace() == AMDGPUASI.GLOBAL_ADDRESS && (!SDValue(N, 0).use_empty());}]>;
//===----------------------------------------------------------------------===//
// Misc Pattern Fragments
@@ -422,6 +449,7 @@ int PI = 0x40490fdb;
int TWO_PI_INV = 0x3e22f983;
int FP_UINT_MAX_PLUS_1 = 0x4f800000; // 1 << 32 in floating point encoding
int FP16_ONE = 0x3C00;
+int V2FP16_ONE = 0x3C003C00;
int FP32_ONE = 0x3f800000;
int FP32_NEG_ONE = 0xbf800000;
int FP64_ONE = 0x3ff0000000000000;
@@ -452,7 +480,7 @@ class CLAMP <RegisterClass rc> : AMDGPUShaderInst <
(outs rc:$dst),
(ins rc:$src0),
"CLAMP $dst, $src0",
- [(set f32:$dst, (AMDGPUclamp f32:$src0, (f32 FP_ZERO), (f32 FP_ONE)))]
+ [(set f32:$dst, (AMDGPUclamp f32:$src0))]
>;
class FABS <RegisterClass rc> : AMDGPUShaderInst <
@@ -565,6 +593,12 @@ multiclass BFIPatterns <Instruction BFI_INT,
>;
def : Pat <
+ (f32 (fcopysign f32:$src0, f64:$src1)),
+ (BFI_INT (LoadImm32 (i32 0x7fffffff)), $src0,
+ (i32 (EXTRACT_SUBREG $src1, sub1)))
+ >;
+
+ def : Pat <
(f64 (fcopysign f64:$src0, f64:$src1)),
(REG_SEQUENCE RC64,
(i32 (EXTRACT_SUBREG $src0, sub0)), sub0,
@@ -602,10 +636,22 @@ def IMMPopCount : SDNodeXForm<imm, [{
MVT::i32);
}]>;
-class BFEPattern <Instruction BFE, Instruction MOV> : Pat <
- (i32 (and (i32 (srl i32:$src, i32:$rshift)), IMMZeroBasedBitfieldMask:$mask)),
- (BFE $src, $rshift, (MOV (i32 (IMMPopCount $mask))))
->;
+multiclass BFEPattern <Instruction UBFE, Instruction SBFE, Instruction MOV> {
+ def : Pat <
+ (i32 (and (i32 (srl i32:$src, i32:$rshift)), IMMZeroBasedBitfieldMask:$mask)),
+ (UBFE $src, $rshift, (MOV (i32 (IMMPopCount $mask))))
+ >;
+
+ def : Pat <
+ (srl (shl_oneuse i32:$src, (sub 32, i32:$width)), (sub 32, i32:$width)),
+ (UBFE $src, (i32 0), $width)
+ >;
+
+ def : Pat <
+ (sra (shl_oneuse i32:$src, (sub 32, i32:$width)), (sub 32, i32:$width)),
+ (SBFE $src, (i32 0), $width)
+ >;
+}
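(Editorial sketch.) Why the shift-pair patterns above are bitfield extracts, as a standalone check, valid for widths 0 < W <= 32:

#include <cassert>
#include <cstdint>

// (srl (shl x, 32-w), 32-w) keeps bits [w-1:0] and zeroes the rest: an
// unsigned bitfield extract at offset 0. The sra form sign-extends from
// bit w-1 instead.
static uint32_t ubfeLow(uint32_t X, unsigned W) {
  return (X << (32 - W)) >> (32 - W);
}

int main() {
  assert(ubfeLow(0xDEADBEEFu, 8) == 0xEFu);
  return 0;
}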
// rotr pattern
class ROTRPattern <Instruction BIT_ALIGN> : Pat <
@@ -618,23 +664,13 @@ class ROTRPattern <Instruction BIT_ALIGN> : Pat <
class IntMed3Pat<Instruction med3Inst,
SDPatternOperator max,
SDPatternOperator max_oneuse,
- SDPatternOperator min_oneuse> : Pat<
- (max (min_oneuse i32:$src0, i32:$src1),
- (min_oneuse (max_oneuse i32:$src0, i32:$src1), i32:$src2)),
+ SDPatternOperator min_oneuse,
+ ValueType vt = i32> : Pat<
+ (max (min_oneuse vt:$src0, vt:$src1),
+ (min_oneuse (max_oneuse vt:$src0, vt:$src1), vt:$src2)),
(med3Inst $src0, $src1, $src2)
>;
-let Properties = [SDNPCommutative, SDNPAssociative] in {
-def smax_oneuse : HasOneUseBinOp<smax>;
-def smin_oneuse : HasOneUseBinOp<smin>;
-def umax_oneuse : HasOneUseBinOp<umax>;
-def umin_oneuse : HasOneUseBinOp<umin>;
-} // Properties = [SDNPCommutative, SDNPAssociative]
-
-def sub_oneuse : HasOneUseBinOp<sub>;
-
-def select_oneuse : HasOneUseTernaryOp<select>;
-
// Special conversion patterns
def cvt_rpi_i32_f32 : PatFrag <
diff --git a/lib/Target/AMDGPU/AMDGPUIntrinsicInfo.cpp b/lib/Target/AMDGPU/AMDGPUIntrinsicInfo.cpp
index 8e3471bd2083..86dc9bd9ea74 100644
--- a/lib/Target/AMDGPU/AMDGPUIntrinsicInfo.cpp
+++ b/lib/Target/AMDGPU/AMDGPUIntrinsicInfo.cpp
@@ -54,14 +54,7 @@ std::string AMDGPUIntrinsicInfo::getName(unsigned IntrID, Type **Tys,
FunctionType *AMDGPUIntrinsicInfo::getType(LLVMContext &Context, unsigned ID,
ArrayRef<Type*> Tys) const {
// FIXME: Re-use Intrinsic::getType machinery
- switch (ID) {
- case AMDGPUIntrinsic::amdgcn_fdiv_fast: {
- Type *F32Ty = Type::getFloatTy(Context);
- return FunctionType::get(F32Ty, { F32Ty, F32Ty }, false);
- }
- default:
- llvm_unreachable("unhandled intrinsic");
- }
+ llvm_unreachable("unhandled intrinsic");
}
unsigned AMDGPUIntrinsicInfo::lookupName(const char *NameData,
@@ -97,8 +90,8 @@ Function *AMDGPUIntrinsicInfo::getDeclaration(Module *M, unsigned IntrID,
Function *F
= cast<Function>(M->getOrInsertFunction(getName(IntrID, Tys), FTy));
- AttributeSet AS = getAttributes(M->getContext(),
- static_cast<AMDGPUIntrinsic::ID>(IntrID));
+ AttributeList AS =
+ getAttributes(M->getContext(), static_cast<AMDGPUIntrinsic::ID>(IntrID));
F->setAttributes(AS);
return F;
}
diff --git a/lib/Target/AMDGPU/AMDGPUIntrinsics.td b/lib/Target/AMDGPU/AMDGPUIntrinsics.td
index ceae0b575395..18c9bd933af2 100644
--- a/lib/Target/AMDGPU/AMDGPUIntrinsics.td
+++ b/lib/Target/AMDGPU/AMDGPUIntrinsics.td
@@ -12,25 +12,8 @@
//===----------------------------------------------------------------------===//
let TargetPrefix = "AMDGPU", isTarget = 1 in {
- def int_AMDGPU_clamp : Intrinsic<[llvm_anyfloat_ty], [LLVMMatchType<0>, LLVMMatchType<0>, LLVMMatchType<0>], [IntrNoMem]>;
-
def int_AMDGPU_kill : Intrinsic<[], [llvm_float_ty], []>;
def int_AMDGPU_kilp : Intrinsic<[], [], []>;
-
- // Deprecated in favor of llvm.amdgcn.sffbh
- def int_AMDGPU_flbit_i32 : Intrinsic<[llvm_i32_ty], [llvm_i32_ty], [IntrNoMem]>;
-
- // Deprecated in favor of separate int_amdgcn_cube* intrinsics.
- def int_AMDGPU_cube : Intrinsic<[llvm_v4f32_ty], [llvm_v4f32_ty], [IntrNoMem]>;
-
- // Deprecated in favor of expanded bit operations
- def int_AMDGPU_bfe_i32 : Intrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty, llvm_i32_ty], [IntrNoMem]>;
- def int_AMDGPU_bfe_u32 : Intrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty, llvm_i32_ty], [IntrNoMem]>;
-
- // Deprecated in favor of llvm.amdgcn.rsq
- def int_AMDGPU_rsq : Intrinsic<
- [llvm_anyfloat_ty], [LLVMMatchType<0>], [IntrNoMem]
- >;
}
include "SIIntrinsics.td"
diff --git a/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp b/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp
new file mode 100644
index 000000000000..a2567a549028
--- /dev/null
+++ b/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp
@@ -0,0 +1,62 @@
+//===- AMDGPULegalizerInfo.cpp -----------------------------------*- C++ -*-==//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+/// \file
+/// This file implements the targeting of the MachineLegalizer class for
+/// AMDGPU.
+/// \todo This should be generated by TableGen.
+//===----------------------------------------------------------------------===//
+
+#include "AMDGPULegalizerInfo.h"
+#include "llvm/CodeGen/ValueTypes.h"
+#include "llvm/IR/Type.h"
+#include "llvm/IR/DerivedTypes.h"
+#include "llvm/Target/TargetOpcodes.h"
+#include "llvm/Support/Debug.h"
+
+using namespace llvm;
+
+#ifndef LLVM_BUILD_GLOBAL_ISEL
+#error "You shouldn't build this"
+#endif
+
+AMDGPULegalizerInfo::AMDGPULegalizerInfo() {
+ using namespace TargetOpcode;
+
+ const LLT S32 = LLT::scalar(32);
+ const LLT S64 = LLT::scalar(64);
+ const LLT P1 = LLT::pointer(1, 64);
+ const LLT P2 = LLT::pointer(2, 64);
+
+ setAction({G_CONSTANT, S64}, Legal);
+
+ setAction({G_GEP, P1}, Legal);
+ setAction({G_GEP, P2}, Legal);
+ setAction({G_GEP, 1, S64}, Legal);
+
+ setAction({G_LOAD, P1}, Legal);
+ setAction({G_LOAD, P2}, Legal);
+ setAction({G_LOAD, S32}, Legal);
+ setAction({G_LOAD, 1, P1}, Legal);
+ setAction({G_LOAD, 1, P2}, Legal);
+
+ setAction({G_STORE, S32}, Legal);
+ setAction({G_STORE, 1, P1}, Legal);
+
+ // FIXME: When RegBankSelect inserts copies, it will only create new
+ // registers with scalar types. This means we can end up with
+ // G_LOAD/G_STORE/G_GEP instructions with scalar types for their pointer
+ // operands. In assert builds, the instruction selector will assert
+ // if it sees a generic instruction which isn't legal, so we need to
+ // tell it that scalar types are legal for pointer operands.
+ setAction({G_GEP, S64}, Legal);
+ setAction({G_LOAD, 1, S64}, Legal);
+ setAction({G_STORE, 1, S64}, Legal);
+
+ computeTables();
+}
diff --git a/lib/Target/AMDGPU/AMDGPULegalizerInfo.h b/lib/Target/AMDGPU/AMDGPULegalizerInfo.h
new file mode 100644
index 000000000000..291e3361f163
--- /dev/null
+++ b/lib/Target/AMDGPU/AMDGPULegalizerInfo.h
@@ -0,0 +1,30 @@
+//===- AMDGPULegalizerInfo ---------------------------------------*- C++ -*-==//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+/// \file
+/// This file declares the targeting of the MachineLegalizer class for
+/// AMDGPU.
+/// \todo This should be generated by TableGen.
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_TARGET_AMDGPU_AMDGPUMACHINELEGALIZER_H
+#define LLVM_LIB_TARGET_AMDGPU_AMDGPUMACHINELEGALIZER_H
+
+#include "llvm/CodeGen/GlobalISel/LegalizerInfo.h"
+
+namespace llvm {
+
+class LLVMContext;
+
+/// This class provides the information for the target register banks.
+class AMDGPULegalizerInfo : public LegalizerInfo {
+public:
+ AMDGPULegalizerInfo();
+};
+} // End llvm namespace.
+#endif
diff --git a/lib/Target/AMDGPU/AMDGPULowerIntrinsics.cpp b/lib/Target/AMDGPU/AMDGPULowerIntrinsics.cpp
new file mode 100644
index 000000000000..dcb6670621ee
--- /dev/null
+++ b/lib/Target/AMDGPU/AMDGPULowerIntrinsics.cpp
@@ -0,0 +1,160 @@
+//===-- AMDGPULowerIntrinsics.cpp -----------------------------------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#include "AMDGPU.h"
+#include "AMDGPUSubtarget.h"
+#include "llvm/IR/Constants.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/IR/IntrinsicInst.h"
+#include "llvm/IR/Module.h"
+#include "llvm/Transforms/Utils/LowerMemIntrinsics.h"
+
+#define DEBUG_TYPE "amdgpu-lower-intrinsics"
+
+using namespace llvm;
+
+namespace {
+
+const unsigned MaxStaticSize = 1024;
+
+class AMDGPULowerIntrinsics : public ModulePass {
+private:
+ const TargetMachine *TM;
+
+ bool makeLIDRangeMetadata(Function &F) const;
+
+public:
+ static char ID;
+
+ AMDGPULowerIntrinsics(const TargetMachine *TM = nullptr)
+ : ModulePass(ID), TM(TM) { }
+ bool runOnModule(Module &M) override;
+ StringRef getPassName() const override {
+ return "AMDGPU Lower Intrinsics";
+ }
+};
+
+}
+
+char AMDGPULowerIntrinsics::ID = 0;
+
+char &llvm::AMDGPULowerIntrinsicsID = AMDGPULowerIntrinsics::ID;
+
+INITIALIZE_TM_PASS(AMDGPULowerIntrinsics, DEBUG_TYPE,
+ "Lower intrinsics", false, false)
+
+// TODO: Should refine based on estimated number of accesses (e.g. does it
+// require splitting based on alignment)
+static bool shouldExpandOperationWithSize(Value *Size) {
+ ConstantInt *CI = dyn_cast<ConstantInt>(Size);
+ return !CI || (CI->getZExtValue() > MaxStaticSize);
+}
+
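Behavior of shouldExpandOperationWithSize at a glance, per the code above with MaxStaticSize = 1024:

// Size not a ConstantInt (unknown at compile time) -> true (expand to a loop)
// Constant size 4096 (> 1024)                      -> true
// Constant size 1024 or smaller                    -> false (keep the intrinsic)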
+static bool expandMemIntrinsicUses(Function &F) {
+ Intrinsic::ID ID = F.getIntrinsicID();
+ bool Changed = false;
+
+ for (auto I = F.user_begin(), E = F.user_end(); I != E;) {
+ Instruction *Inst = cast<Instruction>(*I);
+ ++I;
+
+ switch (ID) {
+ case Intrinsic::memcpy: {
+ auto *Memcpy = cast<MemCpyInst>(Inst);
+ if (shouldExpandOperationWithSize(Memcpy->getLength())) {
+ expandMemCpyAsLoop(Memcpy);
+ Changed = true;
+ Memcpy->eraseFromParent();
+ }
+
+ break;
+ }
+ case Intrinsic::memmove: {
+ auto *Memmove = cast<MemMoveInst>(Inst);
+ if (shouldExpandOperationWithSize(Memmove->getLength())) {
+ expandMemMoveAsLoop(Memmove);
+ Changed = true;
+ Memmove->eraseFromParent();
+ }
+
+ break;
+ }
+ case Intrinsic::memset: {
+ auto *Memset = cast<MemSetInst>(Inst);
+ if (shouldExpandOperationWithSize(Memset->getLength())) {
+ expandMemSetAsLoop(Memset);
+ Changed = true;
+ Memset->eraseFromParent();
+ }
+
+ break;
+ }
+ default:
+ break;
+ }
+ }
+
+ return Changed;
+}
+
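(Editorial note.) In the loop above the iterator is advanced before anything that might erase the instruction, since erasing a user invalidates an iterator pointing at it. The idiom in isolation:

#include <list>

// Erase-while-iterating: bump the iterator first so it stays valid even if
// the element just visited is removed.
static void eraseAll(std::list<int> &Users) {
  for (auto I = Users.begin(), E = Users.end(); I != E;) {
    auto Cur = I++;   // advance before erasing
    Users.erase(Cur); // safe: I no longer points at *Cur
  }
}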
+bool AMDGPULowerIntrinsics::makeLIDRangeMetadata(Function &F) const {
+ if (!TM)
+ return false;
+
+ bool Changed = false;
+ const AMDGPUSubtarget &ST = TM->getSubtarget<AMDGPUSubtarget>(F);
+
+ for (auto *U : F.users()) {
+ auto *CI = dyn_cast<CallInst>(U);
+ if (!CI)
+ continue;
+
+ Changed |= ST.makeLIDRangeMetadata(CI);
+ }
+ return Changed;
+}
+
+bool AMDGPULowerIntrinsics::runOnModule(Module &M) {
+ bool Changed = false;
+
+ for (Function &F : M) {
+ if (!F.isDeclaration())
+ continue;
+
+ switch (F.getIntrinsicID()) {
+ case Intrinsic::memcpy:
+ case Intrinsic::memmove:
+ case Intrinsic::memset:
+ if (expandMemIntrinsicUses(F))
+ Changed = true;
+ break;
+
+ case Intrinsic::amdgcn_workitem_id_x:
+ case Intrinsic::r600_read_tidig_x:
+ case Intrinsic::amdgcn_workitem_id_y:
+ case Intrinsic::r600_read_tidig_y:
+ case Intrinsic::amdgcn_workitem_id_z:
+ case Intrinsic::r600_read_tidig_z:
+ case Intrinsic::r600_read_local_size_x:
+ case Intrinsic::r600_read_local_size_y:
+ case Intrinsic::r600_read_local_size_z:
+ Changed |= makeLIDRangeMetadata(F);
+ break;
+
+ default:
+ break;
+ }
+ }
+
+ return Changed;
+}
+
+ModulePass *llvm::createAMDGPULowerIntrinsicsPass(const TargetMachine *TM) {
+ return new AMDGPULowerIntrinsics(TM);
+}
diff --git a/lib/Target/AMDGPU/AMDGPUMCInstLower.cpp b/lib/Target/AMDGPU/AMDGPUMCInstLower.cpp
index 7d56355074b1..14ee1c81f8fa 100644
--- a/lib/Target/AMDGPU/AMDGPUMCInstLower.cpp
+++ b/lib/Target/AMDGPU/AMDGPUMCInstLower.cpp
@@ -151,6 +151,28 @@ bool AMDGPUAsmPrinter::lowerOperand(const MachineOperand &MO,
return MCInstLowering.lowerOperand(MO, MCOp);
}
+const MCExpr *AMDGPUAsmPrinter::lowerConstant(const Constant *CV) {
+ // TargetMachine does not support LLVM-style casts, so use a C++-style
+ // static_cast. This is safe because TM is always an AMDGPUTargetMachine
+ // or a class derived from it.
+ auto *AT = static_cast<AMDGPUTargetMachine*>(&TM);
+ auto *CE = dyn_cast<ConstantExpr>(CV);
+
+ // Lower null pointers in the private and local address spaces: Clang
+ // generates an addrspacecast to them for null pointers, and the cast
+ // needs to be lowered to the target's null value here.
+ if (CE && CE->getOpcode() == Instruction::AddrSpaceCast) {
+ auto Op = CE->getOperand(0);
+ auto SrcAddr = Op->getType()->getPointerAddressSpace();
+ if (Op->isNullValue() && AT->getNullPointerValue(SrcAddr) == 0) {
+ auto DstAddr = CE->getType()->getPointerAddressSpace();
+ return MCConstantExpr::create(AT->getNullPointerValue(DstAddr),
+ OutContext);
+ }
+ }
+ return AsmPrinter::lowerConstant(CV);
+}
+
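(Editorial note.) The interesting path of lowerConstant above, spelled out; concrete null-pointer values are target-defined and not assumed here:

// Given: an addrspacecast of a null pointer into address space AS
// When:  the source address space's null value is 0
// Emit:  an MCConstantExpr of getNullPointerValue(AS), which may be nonzero
// Else:  defer to AsmPrinter::lowerConstant as before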
void AMDGPUAsmPrinter::EmitInstruction(const MachineInstr *MI) {
if (emitPseudoExpansionLowering(*OutStreamer, MI))
return;
@@ -162,7 +184,7 @@ void AMDGPUAsmPrinter::EmitInstruction(const MachineInstr *MI) {
if (!STI.getInstrInfo()->verifyInstruction(*MI, Err)) {
LLVMContext &C = MI->getParent()->getParent()->getFunction()->getContext();
C.emitError("Illegal instruction detected: " + Err);
- MI->dump();
+ MI->print(errs());
}
if (MI->isBundle()) {
@@ -173,8 +195,9 @@ void AMDGPUAsmPrinter::EmitInstruction(const MachineInstr *MI) {
++I;
}
} else {
- // We don't want SI_MASK_BRANCH/SI_RETURN encoded. They are placeholder
- // terminator instructions and should only be printed as comments.
+ // We don't want SI_MASK_BRANCH/SI_RETURN_TO_EPILOG encoded. They are
+ // placeholder terminator instructions and should only be printed as
+ // comments.
if (MI->getOpcode() == AMDGPU::SI_MASK_BRANCH) {
if (isVerbose()) {
SmallVector<char, 16> BBStr;
@@ -190,9 +213,9 @@ void AMDGPUAsmPrinter::EmitInstruction(const MachineInstr *MI) {
return;
}
- if (MI->getOpcode() == AMDGPU::SI_RETURN) {
+ if (MI->getOpcode() == AMDGPU::SI_RETURN_TO_EPILOG) {
if (isVerbose())
- OutStreamer->emitRawComment(" return");
+ OutStreamer->emitRawComment(" return to shader part epilog");
return;
}
diff --git a/lib/Target/AMDGPU/AMDGPUMachineFunction.cpp b/lib/Target/AMDGPU/AMDGPUMachineFunction.cpp
index 40c3327a98db..27fe639e3d4b 100644
--- a/lib/Target/AMDGPU/AMDGPUMachineFunction.cpp
+++ b/lib/Target/AMDGPU/AMDGPUMachineFunction.cpp
@@ -12,6 +12,20 @@
using namespace llvm;
+static bool isEntryFunctionCC(CallingConv::ID CC) {
+ switch (CC) {
+ case CallingConv::AMDGPU_KERNEL:
+ case CallingConv::SPIR_KERNEL:
+ case CallingConv::AMDGPU_VS:
+ case CallingConv::AMDGPU_GS:
+ case CallingConv::AMDGPU_PS:
+ case CallingConv::AMDGPU_CS:
+ return true;
+ default:
+ return false;
+ }
+}
+
AMDGPUMachineFunction::AMDGPUMachineFunction(const MachineFunction &MF) :
MachineFunctionInfo(),
LocalMemoryObjects(),
@@ -19,8 +33,8 @@ AMDGPUMachineFunction::AMDGPUMachineFunction(const MachineFunction &MF) :
MaxKernArgAlign(0),
LDSSize(0),
ABIArgOffset(0),
- IsKernel(MF.getFunction()->getCallingConv() == CallingConv::AMDGPU_KERNEL ||
- MF.getFunction()->getCallingConv() == CallingConv::SPIR_KERNEL) {
+ IsEntryFunction(isEntryFunctionCC(MF.getFunction()->getCallingConv())),
+ NoSignedZerosFPMath(MF.getTarget().Options.NoSignedZerosFPMath) {
// FIXME: Should initialize KernArgSize based on ExplicitKernelArgOffset,
// except reserved size is not correctly aligned.
}
diff --git a/lib/Target/AMDGPU/AMDGPUMachineFunction.h b/lib/Target/AMDGPU/AMDGPUMachineFunction.h
index 5d0640b816f3..8bfeb67ad4ec 100644
--- a/lib/Target/AMDGPU/AMDGPUMachineFunction.h
+++ b/lib/Target/AMDGPU/AMDGPUMachineFunction.h
@@ -30,7 +30,11 @@ class AMDGPUMachineFunction : public MachineFunctionInfo {
/// Start of implicit kernel args
unsigned ABIArgOffset;
- bool IsKernel;
+  // Kernels + shaders, i.e. functions called by the driver and not called
+  // by other functions.
+ bool IsEntryFunction;
+
+ bool NoSignedZerosFPMath;
public:
AMDGPUMachineFunction(const MachineFunction &MF);
@@ -66,8 +70,12 @@ public:
return LDSSize;
}
- bool isKernel() const {
- return IsKernel;
+ bool isEntryFunction() const {
+ return IsEntryFunction;
+ }
+
+ bool hasNoSignedZerosFPMath() const {
+ return NoSignedZerosFPMath;
}
unsigned allocateLDSGlobal(const DataLayout &DL, const GlobalValue &GV);
diff --git a/lib/Target/AMDGPU/AMDGPUPTNote.h b/lib/Target/AMDGPU/AMDGPUPTNote.h
index 947d45b66969..71b9ab699b96 100644
--- a/lib/Target/AMDGPU/AMDGPUPTNote.h
+++ b/lib/Target/AMDGPU/AMDGPUPTNote.h
@@ -19,12 +19,13 @@
namespace AMDGPU {
-namespace PT_NOTE {
+namespace ElfNote {
const char SectionName[] = ".note";
const char NoteName[] = "AMD";
+// TODO: Move this enum to include/llvm/Support so it can be used in tools?
enum NoteType{
NT_AMDGPU_HSA_CODE_OBJECT_VERSION = 1,
NT_AMDGPU_HSA_HSAIL = 2,
@@ -32,7 +33,7 @@ enum NoteType{
NT_AMDGPU_HSA_PRODUCER = 4,
NT_AMDGPU_HSA_PRODUCER_OPTIONS = 5,
NT_AMDGPU_HSA_EXTENSION = 6,
- NT_AMDGPU_HSA_RUNTIME_METADATA = 7,
+ NT_AMDGPU_HSA_CODE_OBJECT_METADATA = 10,
NT_AMDGPU_HSA_HLDEBUG_DEBUG = 101,
NT_AMDGPU_HSA_HLDEBUG_TARGET = 102
};
diff --git a/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp b/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp
index baa28de7a770..4fb262c6277c 100644
--- a/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp
+++ b/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp
@@ -14,12 +14,49 @@
#include "AMDGPU.h"
#include "AMDGPUSubtarget.h"
+#include "Utils/AMDGPUBaseInfo.h"
+#include "llvm/ADT/APInt.h"
+#include "llvm/ADT/None.h"
+#include "llvm/ADT/STLExtras.h"
+#include "llvm/ADT/StringRef.h"
+#include "llvm/ADT/Triple.h"
+#include "llvm/ADT/Twine.h"
+#include "llvm/Analysis/CaptureTracking.h"
#include "llvm/Analysis/ValueTracking.h"
-#include "llvm/IR/IRBuilder.h"
+#include "llvm/IR/Attributes.h"
+#include "llvm/IR/BasicBlock.h"
+#include "llvm/IR/Constant.h"
+#include "llvm/IR/Constants.h"
+#include "llvm/IR/DataLayout.h"
+#include "llvm/IR/DerivedTypes.h"
+#include "llvm/IR/Function.h"
+#include "llvm/IR/GlobalValue.h"
+#include "llvm/IR/GlobalVariable.h"
+#include "llvm/IR/Instruction.h"
+#include "llvm/IR/Instructions.h"
#include "llvm/IR/IntrinsicInst.h"
-#include "llvm/IR/MDBuilder.h"
+#include "llvm/IR/Intrinsics.h"
+#include "llvm/IR/IRBuilder.h"
+#include "llvm/IR/LLVMContext.h"
+#include "llvm/IR/Metadata.h"
+#include "llvm/IR/Module.h"
+#include "llvm/IR/Type.h"
+#include "llvm/IR/User.h"
+#include "llvm/IR/Value.h"
+#include "llvm/Pass.h"
+#include "llvm/Support/Casting.h"
#include "llvm/Support/Debug.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/MathExtras.h"
#include "llvm/Support/raw_ostream.h"
+#include "llvm/Target/TargetMachine.h"
+#include <algorithm>
+#include <cassert>
+#include <cstdint>
+#include <map>
+#include <tuple>
+#include <utility>
+#include <vector>
#define DEBUG_TYPE "amdgpu-promote-alloca"
@@ -31,16 +68,16 @@ namespace {
class AMDGPUPromoteAlloca : public FunctionPass {
private:
const TargetMachine *TM;
- Module *Mod;
- const DataLayout *DL;
- MDNode *MaxWorkGroupSizeRange;
+ Module *Mod = nullptr;
+ const DataLayout *DL = nullptr;
+ AMDGPUAS AS;
// FIXME: This should be per-kernel.
- uint32_t LocalMemLimit;
- uint32_t CurrentLocalMemUsage;
+ uint32_t LocalMemLimit = 0;
+ uint32_t CurrentLocalMemUsage = 0;
- bool IsAMDGCN;
- bool IsAMDHSA;
+ bool IsAMDGCN = false;
+ bool IsAMDHSA = false;
std::pair<Value *, Value *> getLocalSizeYZ(IRBuilder<> &Builder);
Value *getWorkitemID(IRBuilder<> &Builder, unsigned N);
@@ -63,15 +100,7 @@ public:
static char ID;
AMDGPUPromoteAlloca(const TargetMachine *TM_ = nullptr) :
- FunctionPass(ID),
- TM(TM_),
- Mod(nullptr),
- DL(nullptr),
- MaxWorkGroupSizeRange(nullptr),
- LocalMemLimit(0),
- CurrentLocalMemUsage(0),
- IsAMDGCN(false),
- IsAMDHSA(false) { }
+ FunctionPass(ID), TM(TM_) {}
bool doInitialization(Module &M) override;
bool runOnFunction(Function &F) override;
@@ -86,7 +115,7 @@ public:
}
};
-} // End anonymous namespace
+} // end anonymous namespace
char AMDGPUPromoteAlloca::ID = 0;
@@ -95,7 +124,6 @@ INITIALIZE_TM_PASS(AMDGPUPromoteAlloca, DEBUG_TYPE,
char &llvm::AMDGPUPromoteAllocaID = AMDGPUPromoteAlloca::ID;
-
bool AMDGPUPromoteAlloca::doInitialization(Module &M) {
if (!TM)
return false;
@@ -103,13 +131,6 @@ bool AMDGPUPromoteAlloca::doInitialization(Module &M) {
Mod = &M;
DL = &Mod->getDataLayout();
- // The maximum workitem id.
- //
- // FIXME: Should get as subtarget property. Usually runtime enforced max is
- // 256.
- MDBuilder MDB(Mod->getContext());
- MaxWorkGroupSizeRange = MDB.createRange(APInt(32, 0), APInt(32, 2048));
-
const Triple &TT = TM->getTargetTriple();
IsAMDGCN = TT.getArch() == Triple::amdgcn;
@@ -125,6 +146,7 @@ bool AMDGPUPromoteAlloca::runOnFunction(Function &F) {
const AMDGPUSubtarget &ST = TM->getSubtarget<AMDGPUSubtarget>(F);
if (!ST.isPromoteAllocaEnabled())
return false;
+ AS = AMDGPU::getAMDGPUAS(*F.getParent());
FunctionType *FTy = F.getFunctionType();
@@ -133,7 +155,7 @@ bool AMDGPUPromoteAlloca::runOnFunction(Function &F) {
// we cannot use local memory in the pass.
for (Type *ParamTy : FTy->params()) {
PointerType *PtrTy = dyn_cast<PointerType>(ParamTy);
- if (PtrTy && PtrTy->getAddressSpace() == AMDGPUAS::LOCAL_ADDRESS) {
+ if (PtrTy && PtrTy->getAddressSpace() == AS.LOCAL_ADDRESS) {
LocalMemLimit = 0;
DEBUG(dbgs() << "Function has local memory argument. Promoting to "
"local memory disabled.\n");
@@ -150,7 +172,7 @@ bool AMDGPUPromoteAlloca::runOnFunction(Function &F) {
// Check how much local memory is being used by global objects
CurrentLocalMemUsage = 0;
for (GlobalVariable &GV : Mod->globals()) {
- if (GV.getType()->getAddressSpace() != AMDGPUAS::LOCAL_ADDRESS)
+ if (GV.getType()->getAddressSpace() != AS.LOCAL_ADDRESS)
continue;
for (const User *U : GV.users()) {
@@ -175,7 +197,8 @@ bool AMDGPUPromoteAlloca::runOnFunction(Function &F) {
}
}
- unsigned MaxOccupancy = ST.getOccupancyWithLocalMemSize(CurrentLocalMemUsage);
+ unsigned MaxOccupancy = ST.getOccupancyWithLocalMemSize(CurrentLocalMemUsage,
+ F);
// Restrict local memory usage so that we don't drastically reduce occupancy,
// unless it is already significantly reduced.
@@ -196,7 +219,7 @@ bool AMDGPUPromoteAlloca::runOnFunction(Function &F) {
// Round up to the next tier of usage.
unsigned MaxSizeWithWaveCount
- = ST.getMaxLocalMemSizeWithWaveCount(MaxOccupancy);
+ = ST.getMaxLocalMemSizeWithWaveCount(MaxOccupancy, F);
// Program is possibly broken by using more local mem than available.
if (CurrentLocalMemUsage > MaxSizeWithWaveCount)
@@ -226,6 +249,9 @@ bool AMDGPUPromoteAlloca::runOnFunction(Function &F) {
std::pair<Value *, Value *>
AMDGPUPromoteAlloca::getLocalSizeYZ(IRBuilder<> &Builder) {
+ const AMDGPUSubtarget &ST = TM->getSubtarget<AMDGPUSubtarget>(
+ *Builder.GetInsertBlock()->getParent());
+
if (!IsAMDHSA) {
Function *LocalSizeYFn
= Intrinsic::getDeclaration(Mod, Intrinsic::r600_read_local_size_y);
@@ -235,8 +261,8 @@ AMDGPUPromoteAlloca::getLocalSizeYZ(IRBuilder<> &Builder) {
CallInst *LocalSizeY = Builder.CreateCall(LocalSizeYFn, {});
CallInst *LocalSizeZ = Builder.CreateCall(LocalSizeZFn, {});
- LocalSizeY->setMetadata(LLVMContext::MD_range, MaxWorkGroupSizeRange);
- LocalSizeZ->setMetadata(LLVMContext::MD_range, MaxWorkGroupSizeRange);
+ ST.makeLIDRangeMetadata(LocalSizeY);
+ ST.makeLIDRangeMetadata(LocalSizeZ);
return std::make_pair(LocalSizeY, LocalSizeZ);
}
@@ -279,15 +305,15 @@ AMDGPUPromoteAlloca::getLocalSizeYZ(IRBuilder<> &Builder) {
= Intrinsic::getDeclaration(Mod, Intrinsic::amdgcn_dispatch_ptr);
CallInst *DispatchPtr = Builder.CreateCall(DispatchPtrFn, {});
- DispatchPtr->addAttribute(AttributeSet::ReturnIndex, Attribute::NoAlias);
- DispatchPtr->addAttribute(AttributeSet::ReturnIndex, Attribute::NonNull);
+ DispatchPtr->addAttribute(AttributeList::ReturnIndex, Attribute::NoAlias);
+ DispatchPtr->addAttribute(AttributeList::ReturnIndex, Attribute::NonNull);
// Size of the dispatch packet struct.
- DispatchPtr->addDereferenceableAttr(AttributeSet::ReturnIndex, 64);
+ DispatchPtr->addDereferenceableAttr(AttributeList::ReturnIndex, 64);
Type *I32Ty = Type::getInt32Ty(Mod->getContext());
Value *CastDispatchPtr = Builder.CreateBitCast(
- DispatchPtr, PointerType::get(I32Ty, AMDGPUAS::CONSTANT_ADDRESS));
+ DispatchPtr, PointerType::get(I32Ty, AS.CONSTANT_ADDRESS));
// We could do a single 64-bit load here, but it's likely that the basic
// 32-bit and extract sequence is already present, and it is probably easier
@@ -298,10 +324,10 @@ AMDGPUPromoteAlloca::getLocalSizeYZ(IRBuilder<> &Builder) {
Value *GEPZU = Builder.CreateConstInBoundsGEP1_64(CastDispatchPtr, 2);
LoadInst *LoadZU = Builder.CreateAlignedLoad(GEPZU, 4);
- MDNode *MD = llvm::MDNode::get(Mod->getContext(), None);
+ MDNode *MD = MDNode::get(Mod->getContext(), None);
LoadXY->setMetadata(LLVMContext::MD_invariant_load, MD);
LoadZU->setMetadata(LLVMContext::MD_invariant_load, MD);
- LoadZU->setMetadata(LLVMContext::MD_range, MaxWorkGroupSizeRange);
+ ST.makeLIDRangeMetadata(LoadZU);
// Extract y component. Upper half of LoadZU should be zero already.
Value *Y = Builder.CreateLShr(LoadXY, 16);
@@ -310,6 +336,8 @@ AMDGPUPromoteAlloca::getLocalSizeYZ(IRBuilder<> &Builder) {
}
Value *AMDGPUPromoteAlloca::getWorkitemID(IRBuilder<> &Builder, unsigned N) {
+ const AMDGPUSubtarget &ST = TM->getSubtarget<AMDGPUSubtarget>(
+ *Builder.GetInsertBlock()->getParent());
Intrinsic::ID IntrID = Intrinsic::ID::not_intrinsic;
switch (N) {
@@ -332,7 +360,7 @@ Value *AMDGPUPromoteAlloca::getWorkitemID(IRBuilder<> &Builder, unsigned N) {
Function *WorkitemIdFn = Intrinsic::getDeclaration(Mod, IntrID);
CallInst *CI = Builder.CreateCall(WorkitemIdFn);
- CI->setMetadata(LLVMContext::MD_range, MaxWorkGroupSizeRange);
+ ST.makeLIDRangeMetadata(CI);
return CI;
}
@@ -383,7 +411,7 @@ static bool canVectorizeInst(Instruction *Inst, User *User) {
}
}
-static bool tryPromoteAllocaToVector(AllocaInst *Alloca) {
+static bool tryPromoteAllocaToVector(AllocaInst *Alloca, AMDGPUAS AS) {
ArrayType *AllocaTy = dyn_cast<ArrayType>(Alloca->getAllocatedType());
DEBUG(dbgs() << "Alloca candidate for vectorization\n");
@@ -438,7 +466,7 @@ static bool tryPromoteAllocaToVector(AllocaInst *Alloca) {
IRBuilder<> Builder(Inst);
switch (Inst->getOpcode()) {
case Instruction::Load: {
- Type *VecPtrTy = VectorTy->getPointerTo(AMDGPUAS::PRIVATE_ADDRESS);
+ Type *VecPtrTy = VectorTy->getPointerTo(AS.PRIVATE_ADDRESS);
Value *Ptr = Inst->getOperand(0);
Value *Index = calculateVectorIndex(Ptr, GEPVectorIdx);
@@ -450,7 +478,7 @@ static bool tryPromoteAllocaToVector(AllocaInst *Alloca) {
break;
}
case Instruction::Store: {
- Type *VecPtrTy = VectorTy->getPointerTo(AMDGPUAS::PRIVATE_ADDRESS);
+ Type *VecPtrTy = VectorTy->getPointerTo(AS.PRIVATE_ADDRESS);
Value *Ptr = Inst->getOperand(1);
Value *Index = calculateVectorIndex(Ptr, GEPVectorIdx);
@@ -580,6 +608,9 @@ bool AMDGPUPromoteAlloca::collectUsesWithPtrTypes(
}
if (UseInst->getOpcode() == Instruction::AddrSpaceCast) {
+ // Give up if the pointer may be captured.
+ if (PointerMayBeCaptured(UseInst, true, true))
+ return false;
// Don't collect the users of this.
WorkList.push_back(User);
continue;
@@ -640,7 +671,7 @@ void AMDGPUPromoteAlloca::handleAlloca(AllocaInst &I) {
DEBUG(dbgs() << "Trying to promote " << I << '\n');
- if (tryPromoteAllocaToVector(&I)) {
+ if (tryPromoteAllocaToVector(&I, AS)) {
DEBUG(dbgs() << " alloca is not a candidate for vectorization.\n");
return;
}
@@ -655,8 +686,6 @@ void AMDGPUPromoteAlloca::handleAlloca(AllocaInst &I) {
const AMDGPUSubtarget &ST =
TM->getSubtarget<AMDGPUSubtarget>(ContainingFunction);
- // FIXME: We should also try to get this value from the reqd_work_group_size
- // function attribute if it is available.
unsigned WorkGroupSize = ST.getFlatWorkGroupSizes(ContainingFunction).second;
const DataLayout &DL = Mod->getDataLayout();
@@ -701,7 +730,7 @@ void AMDGPUPromoteAlloca::handleAlloca(AllocaInst &I) {
Twine(F->getName()) + Twine('.') + I.getName(),
nullptr,
GlobalVariable::NotThreadLocal,
- AMDGPUAS::LOCAL_ADDRESS);
+ AS.LOCAL_ADDRESS);
GV->setUnnamedAddr(GlobalValue::UnnamedAddr::Global);
GV->setAlignment(I.getAlignment());
@@ -734,7 +763,7 @@ void AMDGPUPromoteAlloca::handleAlloca(AllocaInst &I) {
if (ICmpInst *CI = dyn_cast<ICmpInst>(V)) {
Value *Src0 = CI->getOperand(0);
Type *EltTy = Src0->getType()->getPointerElementType();
- PointerType *NewTy = PointerType::get(EltTy, AMDGPUAS::LOCAL_ADDRESS);
+ PointerType *NewTy = PointerType::get(EltTy, AS.LOCAL_ADDRESS);
if (isa<ConstantPointerNull>(CI->getOperand(0)))
CI->setOperand(0, ConstantPointerNull::get(NewTy));
@@ -751,7 +780,7 @@ void AMDGPUPromoteAlloca::handleAlloca(AllocaInst &I) {
continue;
Type *EltTy = V->getType()->getPointerElementType();
- PointerType *NewTy = PointerType::get(EltTy, AMDGPUAS::LOCAL_ADDRESS);
+ PointerType *NewTy = PointerType::get(EltTy, AS.LOCAL_ADDRESS);
// FIXME: It doesn't really make sense to try to do this for all
// instructions.
@@ -819,17 +848,17 @@ void AMDGPUPromoteAlloca::handleAlloca(AllocaInst &I) {
Type *SrcTy = Src->getType()->getPointerElementType();
Function *ObjectSize = Intrinsic::getDeclaration(Mod,
Intrinsic::objectsize,
- { Intr->getType(), PointerType::get(SrcTy, AMDGPUAS::LOCAL_ADDRESS) }
+ { Intr->getType(), PointerType::get(SrcTy, AS.LOCAL_ADDRESS) }
);
- CallInst *NewCall
- = Builder.CreateCall(ObjectSize, { Src, Intr->getOperand(1) });
+ CallInst *NewCall = Builder.CreateCall(
+ ObjectSize, {Src, Intr->getOperand(1), Intr->getOperand(2)});
Intr->replaceAllUsesWith(NewCall);
Intr->eraseFromParent();
continue;
}
default:
- Intr->dump();
+ Intr->print(errs());
llvm_unreachable("Don't know how to promote alloca intrinsic use.");
}
}
diff --git a/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp b/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp
new file mode 100644
index 000000000000..a5edc0c3b937
--- /dev/null
+++ b/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp
@@ -0,0 +1,230 @@
+//===- AMDGPURegisterBankInfo.cpp -------------------------------*- C++ -*-==//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+/// \file
+/// This file implements the targeting of the RegisterBankInfo class for
+/// AMDGPU.
+/// \todo This should be generated by TableGen.
+//===----------------------------------------------------------------------===//
+
+#include "AMDGPURegisterBankInfo.h"
+#include "AMDGPUInstrInfo.h"
+#include "SIRegisterInfo.h"
+#include "llvm/CodeGen/GlobalISel/RegisterBank.h"
+#include "llvm/CodeGen/GlobalISel/RegisterBankInfo.h"
+#include "llvm/IR/Constants.h"
+#include "llvm/Target/TargetRegisterInfo.h"
+#include "llvm/Target/TargetSubtargetInfo.h"
+
+#define GET_TARGET_REGBANK_IMPL
+#include "AMDGPUGenRegisterBank.inc"
+
+// This file will be TableGen'ed at some point.
+#include "AMDGPUGenRegisterBankInfo.def"
+
+using namespace llvm;
+
+#ifndef LLVM_BUILD_GLOBAL_ISEL
+#error "You shouldn't build this"
+#endif
+
+AMDGPURegisterBankInfo::AMDGPURegisterBankInfo(const TargetRegisterInfo &TRI)
+ : AMDGPUGenRegisterBankInfo(),
+ TRI(static_cast<const SIRegisterInfo*>(&TRI)) {
+
+ // HACK: Until this is fully tablegen'd
+ static bool AlreadyInit = false;
+ if (AlreadyInit)
+ return;
+
+ AlreadyInit = true;
+
+ const RegisterBank &RBSGPR = getRegBank(AMDGPU::SGPRRegBankID);
+ (void)RBSGPR;
+ assert(&RBSGPR == &AMDGPU::SGPRRegBank);
+
+ const RegisterBank &RBVGPR = getRegBank(AMDGPU::VGPRRegBankID);
+ (void)RBVGPR;
+ assert(&RBVGPR == &AMDGPU::VGPRRegBank);
+
+}
+
+unsigned AMDGPURegisterBankInfo::copyCost(const RegisterBank &A,
+ const RegisterBank &B,
+ unsigned Size) const {
+ return RegisterBankInfo::copyCost(A, B, Size);
+}
+
+const RegisterBank &AMDGPURegisterBankInfo::getRegBankFromRegClass(
+ const TargetRegisterClass &RC) const {
+
+ if (TRI->isSGPRClass(&RC))
+ return getRegBank(AMDGPU::SGPRRegBankID);
+
+ return getRegBank(AMDGPU::VGPRRegBankID);
+}
+
+RegisterBankInfo::InstructionMappings
+AMDGPURegisterBankInfo::getInstrAlternativeMappings(
+ const MachineInstr &MI) const {
+
+ const MachineFunction &MF = *MI.getParent()->getParent();
+ const MachineRegisterInfo &MRI = MF.getRegInfo();
+
+ unsigned Size = getSizeInBits(MI.getOperand(0).getReg(), MRI, *TRI);
+
+ InstructionMappings AltMappings;
+ switch (MI.getOpcode()) {
+ case TargetOpcode::G_LOAD: {
+ // FIXME: Should we be hard coding the size for these mappings?
+ InstructionMapping SSMapping(1, 1,
+ getOperandsMapping({AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size),
+ AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, 64)}),
+ 2); // Num Operands
+ AltMappings.emplace_back(std::move(SSMapping));
+
+ InstructionMapping VVMapping(2, 1,
+ getOperandsMapping({AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size),
+ AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, 64)}),
+ 2); // Num Operands
+ AltMappings.emplace_back(std::move(VVMapping));
+
+    // FIXME: Should this be the pointer size (64 bits) or the size of the
+    // register that will hold the buffer resource (128 bits)?
+ InstructionMapping VSMapping(3, 1,
+ getOperandsMapping({AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size),
+ AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, 64)}),
+ 2); // Num Operands
+ AltMappings.emplace_back(std::move(VSMapping));
+
+ return AltMappings;
+
+ }
+ default:
+ break;
+ }
+ return RegisterBankInfo::getInstrAlternativeMappings(MI);
+}
+
+void AMDGPURegisterBankInfo::applyMappingImpl(
+ const OperandsMapper &OpdMapper) const {
+ return applyDefaultMapping(OpdMapper);
+}
+
+static bool isInstrUniform(const MachineInstr &MI) {
+ if (!MI.hasOneMemOperand())
+ return false;
+
+ const MachineMemOperand *MMO = *MI.memoperands_begin();
+ return AMDGPU::isUniformMMO(MMO);
+}
+
+RegisterBankInfo::InstructionMapping
+AMDGPURegisterBankInfo::getInstrMappingForLoad(const MachineInstr &MI) const {
+
+ const MachineFunction &MF = *MI.getParent()->getParent();
+ const MachineRegisterInfo &MRI = MF.getRegInfo();
+ RegisterBankInfo::InstructionMapping Mapping =
+ InstructionMapping{1, 1, nullptr, MI.getNumOperands()};
+ SmallVector<const ValueMapping*, 8> OpdsMapping(MI.getNumOperands());
+ unsigned Size = getSizeInBits(MI.getOperand(0).getReg(), MRI, *TRI);
+ unsigned PtrSize = getSizeInBits(MI.getOperand(1).getReg(), MRI, *TRI);
+
+ const ValueMapping *ValMapping;
+ const ValueMapping *PtrMapping;
+
+ if (isInstrUniform(MI)) {
+ // We have a uniform instruction so we want to use an SMRD load
+ ValMapping = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size);
+ PtrMapping = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, PtrSize);
+ } else {
+ ValMapping = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size);
+ // FIXME: What would happen if we used SGPRRegBankID here?
+ PtrMapping = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, PtrSize);
+ }
+
+ OpdsMapping[0] = ValMapping;
+ OpdsMapping[1] = PtrMapping;
+ Mapping.setOperandsMapping(getOperandsMapping(OpdsMapping));
+ return Mapping;
+
+ // FIXME: Do we want to add a mapping for FLAT load, or should we just
+ // handle that during instruction selection?
+}
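+
+// Illustrative sketch (hypothetical MIR): for
+//   %val(s32) = G_LOAD %ptr(p0)
+// whose single memory operand is uniform (e.g. a kernel-argument load), both
+// operands map to the SGPR bank so the load can later select to an SMRD
+// instruction; otherwise both map to the VGPR bank.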
+
+RegisterBankInfo::InstructionMapping
+AMDGPURegisterBankInfo::getInstrMapping(const MachineInstr &MI) const {
+ RegisterBankInfo::InstructionMapping Mapping = getInstrMappingImpl(MI);
+
+ if (Mapping.isValid())
+ return Mapping;
+
+ const MachineFunction &MF = *MI.getParent()->getParent();
+ const MachineRegisterInfo &MRI = MF.getRegInfo();
+ Mapping = InstructionMapping{1, 1, nullptr, MI.getNumOperands()};
+ SmallVector<const ValueMapping*, 8> OpdsMapping(MI.getNumOperands());
+
+ switch (MI.getOpcode()) {
+ default: break;
+ case AMDGPU::G_CONSTANT: {
+ unsigned Size = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
+ OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size);
+ Mapping.setOperandsMapping(getOperandsMapping(OpdsMapping));
+ return Mapping;
+ }
+ case AMDGPU::G_GEP: {
+ for (unsigned i = 0, e = MI.getNumOperands(); i != e; ++i) {
+ if (!MI.getOperand(i).isReg())
+ continue;
+
+ unsigned Size = MRI.getType(MI.getOperand(i).getReg()).getSizeInBits();
+ OpdsMapping[i] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size);
+ }
+ Mapping.setOperandsMapping(getOperandsMapping(OpdsMapping));
+ return Mapping;
+ }
+ case AMDGPU::G_STORE: {
+ assert(MI.getOperand(0).isReg());
+ unsigned Size = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
+ // FIXME: We need to specify a different reg bank once scalar stores
+ // are supported.
+ const ValueMapping *ValMapping =
+ AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size);
+ // FIXME: Depending on the type of store, the pointer could be in
+ // the SGPR Reg bank.
+ // FIXME: Pointer size should be based on the address space.
+ const ValueMapping *PtrMapping =
+ AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, 64);
+
+ OpdsMapping[0] = ValMapping;
+ OpdsMapping[1] = PtrMapping;
+ Mapping.setOperandsMapping(getOperandsMapping(OpdsMapping));
+ return Mapping;
+ }
+
+ case AMDGPU::G_LOAD:
+ return getInstrMappingForLoad(MI);
+ }
+
+ unsigned BankID = AMDGPU::SGPRRegBankID;
+
+ Mapping = InstructionMapping{1, 1, nullptr, MI.getNumOperands()};
+ unsigned Size = 0;
+ for (unsigned Idx = 0; Idx < MI.getNumOperands(); ++Idx) {
+    // If the operand is not a register, default to the size of the previous
+    // operand.
+    // FIXME: Can't we pull the types from the MachineInstr rather than the
+    // operands?
+ if (MI.getOperand(Idx).isReg())
+ Size = getSizeInBits(MI.getOperand(Idx).getReg(), MRI, *TRI);
+    // OpdsMapping was already sized to the operand count above, so assign in
+    // place rather than push_back(), which would append past the preallocated
+    // null entries.
+    OpdsMapping[Idx] = AMDGPU::getValueMapping(BankID, Size);
+ }
+ Mapping.setOperandsMapping(getOperandsMapping(OpdsMapping));
+
+ return Mapping;
+}
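+
+// Illustrative sketch of the default path above (hypothetical MIR): for an
+// unhandled operation such as
+//   %2(s32) = G_ADD %0(s32), %1(s32)
+// each register operand receives an SGPR-bank value mapping of its own size,
+// giving the mapping { SGPR 32, SGPR 32, SGPR 32 }.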
diff --git a/lib/Target/AMDGPU/AMDGPURegisterBankInfo.h b/lib/Target/AMDGPU/AMDGPURegisterBankInfo.h
new file mode 100644
index 000000000000..f13bde87ef2d
--- /dev/null
+++ b/lib/Target/AMDGPU/AMDGPURegisterBankInfo.h
@@ -0,0 +1,65 @@
+//===- AMDGPURegisterBankInfo -----------------------------------*- C++ -*-==//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+/// \file
+/// This file declares the targeting of the RegisterBankInfo class for AMDGPU.
+/// \todo This should be generated by TableGen.
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_TARGET_AMDGPU_AMDGPUREGISTERBANKINFO_H
+#define LLVM_LIB_TARGET_AMDGPU_AMDGPUREGISTERBANKINFO_H
+
+#include "llvm/CodeGen/GlobalISel/RegisterBankInfo.h"
+
+namespace llvm {
+
+class SIRegisterInfo;
+class TargetRegisterInfo;
+
+namespace AMDGPU {
+enum {
+ SGPRRegBankID = 0,
+ VGPRRegBankID = 1,
+ NumRegisterBanks
+};
+} // End AMDGPU namespace.
+
+/// This class provides the information for the target register banks.
+class AMDGPUGenRegisterBankInfo : public RegisterBankInfo {
+
+protected:
+
+#define GET_TARGET_REGBANK_CLASS
+#include "AMDGPUGenRegisterBank.inc"
+
+};
+class AMDGPURegisterBankInfo : public AMDGPUGenRegisterBankInfo {
+ const SIRegisterInfo *TRI;
+
+ /// See RegisterBankInfo::applyMapping.
+ void applyMappingImpl(const OperandsMapper &OpdMapper) const override;
+
+ RegisterBankInfo::InstructionMapping
+ getInstrMappingForLoad(const MachineInstr &MI) const;
+
+public:
+ AMDGPURegisterBankInfo(const TargetRegisterInfo &TRI);
+
+ unsigned copyCost(const RegisterBank &A, const RegisterBank &B,
+ unsigned Size) const override;
+
+ const RegisterBank &
+ getRegBankFromRegClass(const TargetRegisterClass &RC) const override;
+
+ InstructionMappings
+ getInstrAlternativeMappings(const MachineInstr &MI) const override;
+
+ InstructionMapping getInstrMapping(const MachineInstr &MI) const override;
+};
+} // End llvm namespace.
+#endif // LLVM_LIB_TARGET_AMDGPU_AMDGPUREGISTERBANKINFO_H
diff --git a/lib/Target/AMDGPU/AMDGPURegisterBanks.td b/lib/Target/AMDGPU/AMDGPURegisterBanks.td
new file mode 100644
index 000000000000..f4428e56035f
--- /dev/null
+++ b/lib/Target/AMDGPU/AMDGPURegisterBanks.td
@@ -0,0 +1,16 @@
+//=- AMDGPURegisterBank.td - Describe the AMDGPU Banks -------*- tablegen -*-=//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+def SGPRRegBank : RegisterBank<"SGPR",
+ [SReg_32, SReg_64, SReg_128, SReg_256, SReg_512]
+>;
+
+def VGPRRegBank : RegisterBank<"VGPR",
+ [VGPR_32, VReg_64, VReg_96, VReg_128, VReg_256, VReg_512]
+>;
diff --git a/lib/Target/AMDGPU/AMDGPURegisterInfo.h b/lib/Target/AMDGPU/AMDGPURegisterInfo.h
index ef51aad95dce..22b1663821d9 100644
--- a/lib/Target/AMDGPU/AMDGPURegisterInfo.h
+++ b/lib/Target/AMDGPU/AMDGPURegisterInfo.h
@@ -16,10 +16,7 @@
#ifndef LLVM_LIB_TARGET_AMDGPU_AMDGPUREGISTERINFO_H
#define LLVM_LIB_TARGET_AMDGPU_AMDGPUREGISTERINFO_H
-#include "llvm/Target/TargetRegisterInfo.h"
-
#define GET_REGINFO_HEADER
-#define GET_REGINFO_ENUM
#include "AMDGPUGenRegisterInfo.inc"
namespace llvm {
diff --git a/lib/Target/AMDGPU/AMDGPURuntimeMetadata.h b/lib/Target/AMDGPU/AMDGPURuntimeMetadata.h
deleted file mode 100644
index ecd2ac72bf1b..000000000000
--- a/lib/Target/AMDGPU/AMDGPURuntimeMetadata.h
+++ /dev/null
@@ -1,193 +0,0 @@
-//===-- AMDGPURuntimeMetadata.h - AMDGPU Runtime Metadata -------*- C++ -*-===//
-//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
-//
-//===----------------------------------------------------------------------===//
-//
-/// \file
-///
-/// Enums and structure types used by runtime metadata.
-///
-/// Runtime requests certain information (metadata) about kernels to be able
-/// to execute the kernels and answer the queries about the kernels.
-/// The metadata is represented as a note element in the .note ELF section of a
-/// binary (code object). The desc field of the note element is a YAML string
-/// consisting of key-value pairs. Each key is a string. Each value can be
-/// an integer, a string, or an YAML sequence. There are 3 levels of YAML maps.
-/// At the beginning of the YAML string is the module level YAML map. A
-/// kernel-level YAML map is in the amd.Kernels sequence. A
-/// kernel-argument-level map is in the amd.Args sequence.
-///
-/// The format should be kept backward compatible. New enum values and bit
-/// fields should be appended at the end. It is suggested to bump up the
-/// revision number whenever the format changes and document the change
-/// in the revision in this header.
-///
-//
-//===----------------------------------------------------------------------===//
-//
-#ifndef LLVM_LIB_TARGET_AMDGPU_AMDGPURUNTIMEMETADATA_H
-#define LLVM_LIB_TARGET_AMDGPU_AMDGPURUNTIMEMETADATA_H
-
-#include <cstdint>
-#include <vector>
-#include <string>
-
-namespace AMDGPU {
-
-namespace RuntimeMD {
-
- // Version and revision of runtime metadata
- const unsigned char MDVersion = 2;
- const unsigned char MDRevision = 0;
-
- // Name of keys for runtime metadata.
- namespace KeyName {
- const char MDVersion[] = "amd.MDVersion"; // Runtime metadata version
- const char Language[] = "amd.Language"; // Language
- const char LanguageVersion[] = "amd.LanguageVersion"; // Language version
- const char Kernels[] = "amd.Kernels"; // Kernels
- const char KernelName[] = "amd.KernelName"; // Kernel name
- const char Args[] = "amd.Args"; // Kernel arguments
- const char ArgSize[] = "amd.ArgSize"; // Kernel arg size
- const char ArgAlign[] = "amd.ArgAlign"; // Kernel arg alignment
- const char ArgTypeName[] = "amd.ArgTypeName"; // Kernel type name
- const char ArgName[] = "amd.ArgName"; // Kernel name
- const char ArgKind[] = "amd.ArgKind"; // Kernel argument kind
- const char ArgValueType[] = "amd.ArgValueType"; // Kernel argument value type
- const char ArgAddrQual[] = "amd.ArgAddrQual"; // Kernel argument address qualifier
- const char ArgAccQual[] = "amd.ArgAccQual"; // Kernel argument access qualifier
- const char ArgIsConst[] = "amd.ArgIsConst"; // Kernel argument is const qualified
- const char ArgIsRestrict[] = "amd.ArgIsRestrict"; // Kernel argument is restrict qualified
- const char ArgIsVolatile[] = "amd.ArgIsVolatile"; // Kernel argument is volatile qualified
- const char ArgIsPipe[] = "amd.ArgIsPipe"; // Kernel argument is pipe qualified
- const char ReqdWorkGroupSize[] = "amd.ReqdWorkGroupSize"; // Required work group size
- const char WorkGroupSizeHint[] = "amd.WorkGroupSizeHint"; // Work group size hint
- const char VecTypeHint[] = "amd.VecTypeHint"; // Vector type hint
- const char KernelIndex[] = "amd.KernelIndex"; // Kernel index for device enqueue
- const char NoPartialWorkGroups[] = "amd.NoPartialWorkGroups"; // No partial work groups
- const char PrintfInfo[] = "amd.PrintfInfo"; // Prinf function call information
- const char ArgActualAcc[] = "amd.ArgActualAcc"; // The actual kernel argument access qualifier
- const char ArgPointeeAlign[] = "amd.ArgPointeeAlign"; // Alignment of pointee type
- }
-
- namespace KernelArg {
- enum Kind : uint8_t {
- ByValue = 0,
- GlobalBuffer = 1,
- DynamicSharedPointer = 2,
- Sampler = 3,
- Image = 4,
- Pipe = 5,
- Queue = 6,
- HiddenGlobalOffsetX = 7,
- HiddenGlobalOffsetY = 8,
- HiddenGlobalOffsetZ = 9,
- HiddenNone = 10,
- HiddenPrintfBuffer = 11,
- HiddenDefaultQueue = 12,
- HiddenCompletionAction = 13,
- };
-
- enum ValueType : uint16_t {
- Struct = 0,
- I8 = 1,
- U8 = 2,
- I16 = 3,
- U16 = 4,
- F16 = 5,
- I32 = 6,
- U32 = 7,
- F32 = 8,
- I64 = 9,
- U64 = 10,
- F64 = 11,
- };
-
- // Avoid using 'None' since it conflicts with a macro in X11 header file.
- enum AccessQualifer : uint8_t {
- AccNone = 0,
- ReadOnly = 1,
- WriteOnly = 2,
- ReadWrite = 3,
- };
-
- enum AddressSpaceQualifer : uint8_t {
- Private = 0,
- Global = 1,
- Constant = 2,
- Local = 3,
- Generic = 4,
- Region = 5,
- };
- } // namespace KernelArg
-
- // Invalid values are used to indicate an optional key should not be emitted.
- const uint8_t INVALID_ADDR_QUAL = 0xff;
- const uint8_t INVALID_ACC_QUAL = 0xff;
- const uint32_t INVALID_KERNEL_INDEX = ~0U;
-
- namespace KernelArg {
- // In-memory representation of kernel argument information.
- struct Metadata {
- uint32_t Size;
- uint32_t Align;
- uint32_t PointeeAlign;
- uint8_t Kind;
- uint16_t ValueType;
- std::string TypeName;
- std::string Name;
- uint8_t AddrQual;
- uint8_t AccQual;
- uint8_t IsVolatile;
- uint8_t IsConst;
- uint8_t IsRestrict;
- uint8_t IsPipe;
- Metadata() : Size(0), Align(0), PointeeAlign(0), Kind(0), ValueType(0),
- AddrQual(INVALID_ADDR_QUAL), AccQual(INVALID_ACC_QUAL), IsVolatile(0),
- IsConst(0), IsRestrict(0), IsPipe(0) {}
- };
- }
-
- namespace Kernel {
- // In-memory representation of kernel information.
- struct Metadata {
- std::string Name;
- std::string Language;
- std::vector<uint8_t> LanguageVersion;
- std::vector<uint32_t> ReqdWorkGroupSize;
- std::vector<uint32_t> WorkGroupSizeHint;
- std::string VecTypeHint;
- uint32_t KernelIndex;
- uint8_t NoPartialWorkGroups;
- std::vector<KernelArg::Metadata> Args;
- Metadata() : KernelIndex(INVALID_KERNEL_INDEX), NoPartialWorkGroups(0) {}
- };
- }
-
- namespace Program {
- // In-memory representation of program information.
- struct Metadata {
- std::vector<uint8_t> MDVersionSeq;
- std::vector<std::string> PrintfInfo;
- std::vector<Kernel::Metadata> Kernels;
-
- explicit Metadata(){}
-
- // Construct from an YAML string.
- explicit Metadata(const std::string &YAML);
-
- // Convert to YAML string.
- std::string toYAML();
-
- // Convert from YAML string.
- static Metadata fromYAML(const std::string &S);
- };
- }
-} // namespace RuntimeMD
-} // namespace AMDGPU
-
-#endif // LLVM_LIB_TARGET_AMDGPU_AMDGPURUNTIMEMETADATA_H
diff --git a/lib/Target/AMDGPU/AMDGPUSubtarget.cpp b/lib/Target/AMDGPU/AMDGPUSubtarget.cpp
index c35a67de1d7f..972c28579f7a 100644
--- a/lib/Target/AMDGPU/AMDGPUSubtarget.cpp
+++ b/lib/Target/AMDGPU/AMDGPUSubtarget.cpp
@@ -13,8 +13,10 @@
//===----------------------------------------------------------------------===//
#include "AMDGPUSubtarget.h"
+#include "SIMachineFunctionInfo.h"
#include "llvm/ADT/SmallString.h"
#include "llvm/CodeGen/MachineScheduler.h"
+#include "llvm/IR/MDBuilder.h"
#include "llvm/Target/TargetFrameLowering.h"
#include <algorithm>
@@ -22,7 +24,6 @@ using namespace llvm;
#define DEBUG_TYPE "amdgpu-subtarget"
-#define GET_SUBTARGETINFO_ENUM
#define GET_SUBTARGETINFO_TARGET_DESC
#define GET_SUBTARGETINFO_CTOR
#include "AMDGPUGenSubtargetInfo.inc"
@@ -41,9 +42,10 @@ AMDGPUSubtarget::initializeSubtargetDependencies(const Triple &TT,
// for SI has the unhelpful behavior that it unsets everything else if you
// disable it.
- SmallString<256> FullFS("+promote-alloca,+fp64-denormals,+load-store-opt,");
+ SmallString<256> FullFS("+promote-alloca,+fp64-fp16-denormals,+dx10-clamp,+load-store-opt,");
if (isAmdHsaOS()) // Turn on FlatForGlobal for HSA.
- FullFS += "+flat-for-global,+unaligned-buffer-access,";
+ FullFS += "+flat-for-global,+unaligned-buffer-access,+trap-handler,";
+
FullFS += FS;
ParseSubtargetFeatures(GPU, FullFS);
@@ -59,9 +61,8 @@ AMDGPUSubtarget::initializeSubtargetDependencies(const Triple &TT,
// denormals, but should be checked. Should we issue a warning somewhere
// if someone tries to enable these?
if (getGeneration() <= AMDGPUSubtarget::NORTHERN_ISLANDS) {
- FP16Denormals = false;
+ FP64FP16Denormals = false;
FP32Denormals = false;
- FP64Denormals = false;
}
// Set defaults if needed.
@@ -85,15 +86,17 @@ AMDGPUSubtarget::AMDGPUSubtarget(const Triple &TT, StringRef GPU, StringRef FS,
FastFMAF32(false),
HalfRate64Ops(false),
- FP16Denormals(false),
FP32Denormals(false),
- FP64Denormals(false),
+ FP64FP16Denormals(false),
FPExceptions(false),
+ DX10Clamp(false),
FlatForGlobal(false),
UnalignedScratchAccess(false),
UnalignedBufferAccess(false),
+ HasApertureRegs(false),
EnableXNACK(false),
+ TrapHandler(false),
DebuggerInsertNops(false),
DebuggerReserveRegs(false),
DebuggerEmitPrologue(false),
@@ -110,13 +113,17 @@ AMDGPUSubtarget::AMDGPUSubtarget(const Triple &TT, StringRef GPU, StringRef FS,
GCN1Encoding(false),
GCN3Encoding(false),
CIInsts(false),
+ GFX9Insts(false),
SGPRInitBug(false),
HasSMemRealTime(false),
Has16BitInsts(false),
+ HasVOP3PInsts(false),
HasMovrel(false),
HasVGPRIndexMode(false),
HasScalarStores(false),
HasInv2PiInlineImm(false),
+ HasSDWA(false),
+ HasDPP(false),
FlatAddressSpace(false),
R600ALUInst(false),
@@ -128,65 +135,30 @@ AMDGPUSubtarget::AMDGPUSubtarget(const Triple &TT, StringRef GPU, StringRef FS,
FeatureDisable(false),
InstrItins(getInstrItineraryForCPU(GPU)) {
+ AS = AMDGPU::getAMDGPUAS(TT);
initializeSubtargetDependencies(TT, GPU, FS);
}
-// FIXME: These limits are for SI. Did they change with the larger maximum LDS
-// size?
-unsigned AMDGPUSubtarget::getMaxLocalMemSizeWithWaveCount(unsigned NWaves) const {
- switch (NWaves) {
- case 10:
- return 1638;
- case 9:
- return 1820;
- case 8:
- return 2048;
- case 7:
- return 2340;
- case 6:
- return 2730;
- case 5:
- return 3276;
- case 4:
- return 4096;
- case 3:
- return 5461;
- case 2:
- return 8192;
- default:
+unsigned AMDGPUSubtarget::getMaxLocalMemSizeWithWaveCount(unsigned NWaves,
+ const Function &F) const {
+ if (NWaves == 1)
return getLocalMemorySize();
- }
+ unsigned WorkGroupSize = getFlatWorkGroupSizes(F).second;
+ unsigned WorkGroupsPerCu = getMaxWorkGroupsPerCU(WorkGroupSize);
+ unsigned MaxWaves = getMaxWavesPerEU();
+ return getLocalMemorySize() * MaxWaves / WorkGroupsPerCu / NWaves;
}
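+
+// Worked example (hypothetical numbers): with 64 KiB of LDS, 10 waves per EU,
+// a flat work group size giving 16 work groups per CU, and NWaves = 2, this
+// returns 65536 * 10 / 16 / 2 = 20480 bytes.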
-unsigned AMDGPUSubtarget::getOccupancyWithLocalMemSize(uint32_t Bytes) const {
- if (Bytes <= 1638)
- return 10;
-
- if (Bytes <= 1820)
- return 9;
-
- if (Bytes <= 2048)
- return 8;
-
- if (Bytes <= 2340)
- return 7;
-
- if (Bytes <= 2730)
- return 6;
-
- if (Bytes <= 3276)
- return 5;
-
- if (Bytes <= 4096)
- return 4;
-
- if (Bytes <= 5461)
- return 3;
-
- if (Bytes <= 8192)
- return 2;
-
- return 1;
+unsigned AMDGPUSubtarget::getOccupancyWithLocalMemSize(uint32_t Bytes,
+ const Function &F) const {
+ unsigned WorkGroupSize = getFlatWorkGroupSizes(F).second;
+ unsigned WorkGroupsPerCu = getMaxWorkGroupsPerCU(WorkGroupSize);
+ unsigned MaxWaves = getMaxWavesPerEU();
+ unsigned Limit = getLocalMemorySize() * MaxWaves / WorkGroupsPerCu;
+ unsigned NumWaves = Limit / (Bytes ? Bytes : 1u);
+ NumWaves = std::min(NumWaves, MaxWaves);
+ NumWaves = std::max(NumWaves, 1u);
+ return NumWaves;
}
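+
+// Worked example (same hypothetical numbers as above): Limit =
+// 65536 * 10 / 16 = 40960, so Bytes = 20480 gives NumWaves = 40960 / 20480 = 2,
+// the inverse of getMaxLocalMemSizeWithWaveCount() for NWaves = 2.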
std::pair<unsigned, unsigned> AMDGPUSubtarget::getFlatWorkGroupSizes(
@@ -224,7 +196,7 @@ std::pair<unsigned, unsigned> AMDGPUSubtarget::getFlatWorkGroupSizes(
std::pair<unsigned, unsigned> AMDGPUSubtarget::getWavesPerEU(
const Function &F) const {
// Default minimum/maximum number of waves per execution unit.
- std::pair<unsigned, unsigned> Default(1, 0);
+ std::pair<unsigned, unsigned> Default(1, getMaxWavesPerEU());
// Default/requested minimum/maximum flat work group sizes.
std::pair<unsigned, unsigned> FlatWorkGroupSizes = getFlatWorkGroupSizes(F);
@@ -269,6 +241,65 @@ std::pair<unsigned, unsigned> AMDGPUSubtarget::getWavesPerEU(
return Requested;
}
+bool AMDGPUSubtarget::makeLIDRangeMetadata(Instruction *I) const {
+ Function *Kernel = I->getParent()->getParent();
+ unsigned MinSize = 0;
+ unsigned MaxSize = getFlatWorkGroupSizes(*Kernel).second;
+ bool IdQuery = false;
+
+ // If reqd_work_group_size is present it narrows value down.
+ if (auto *CI = dyn_cast<CallInst>(I)) {
+ const Function *F = CI->getCalledFunction();
+ if (F) {
+ unsigned Dim = UINT_MAX;
+ switch (F->getIntrinsicID()) {
+ case Intrinsic::amdgcn_workitem_id_x:
+ case Intrinsic::r600_read_tidig_x:
+      IdQuery = true;
+      // Fall through to the matching local size dimension.
+ case Intrinsic::r600_read_local_size_x:
+ Dim = 0;
+ break;
+ case Intrinsic::amdgcn_workitem_id_y:
+ case Intrinsic::r600_read_tidig_y:
+      IdQuery = true;
+      // Fall through to the matching local size dimension.
+ case Intrinsic::r600_read_local_size_y:
+ Dim = 1;
+ break;
+ case Intrinsic::amdgcn_workitem_id_z:
+ case Intrinsic::r600_read_tidig_z:
+      IdQuery = true;
+      // Fall through to the matching local size dimension.
+ case Intrinsic::r600_read_local_size_z:
+ Dim = 2;
+ break;
+ default:
+ break;
+ }
+ if (Dim <= 3) {
+ if (auto Node = Kernel->getMetadata("reqd_work_group_size"))
+ if (Node->getNumOperands() == 3)
+ MinSize = MaxSize = mdconst::extract<ConstantInt>(
+ Node->getOperand(Dim))->getZExtValue();
+ }
+ }
+ }
+
+ if (!MaxSize)
+ return false;
+
+  // Range metadata is [Lo, Hi). An ID query result is in [0, MaxSize), so
+  // pass MaxSize as Hi. A size query result is in [MinSize, MaxSize], so pass
+  // MaxSize + 1 as Hi.
+ if (IdQuery)
+ MinSize = 0;
+ else
+ ++MaxSize;
+
+ MDBuilder MDB(I->getContext());
+ MDNode *MaxWorkGroupSizeRange = MDB.createRange(APInt(32, MinSize),
+ APInt(32, MaxSize));
+ I->setMetadata(LLVMContext::MD_range, MaxWorkGroupSizeRange);
+ return true;
+}
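+
+// Illustrative sketch (hypothetical kernel): for
+//   %id = call i32 @llvm.amdgcn.workitem.id.x()
+// in a kernel carrying !reqd_work_group_size metadata of {64, 1, 1}, this
+// attaches !range !{i32 0, i32 64}, since an ID query lies in [0, size). The
+// corresponding r600.read.local.size.x call would instead get
+// !range !{i32 64, i32 65}.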
+
R600Subtarget::R600Subtarget(const Triple &TT, StringRef GPU, StringRef FS,
const TargetMachine &TM) :
AMDGPUSubtarget(TT, GPU, FS, TM),
@@ -305,7 +336,7 @@ bool SISubtarget::isVGPRSpillingEnabled(const Function& F) const {
}
unsigned SISubtarget::getKernArgSegmentSize(const MachineFunction &MF,
- unsigned ExplicitArgBytes) const {
+ unsigned ExplicitArgBytes) const {
unsigned ImplicitBytes = getImplicitArgNumBytes(MF);
if (ImplicitBytes == 0)
return ExplicitArgBytes;
@@ -359,12 +390,100 @@ unsigned SISubtarget::getOccupancyWithNumVGPRs(unsigned VGPRs) const {
return 1;
}
-unsigned SISubtarget::getMaxNumSGPRs() const {
+unsigned SISubtarget::getReservedNumSGPRs(const MachineFunction &MF) const {
+ const SIMachineFunctionInfo &MFI = *MF.getInfo<SIMachineFunctionInfo>();
+ if (MFI.hasFlatScratchInit()) {
+ if (getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS)
+ return 6; // FLAT_SCRATCH, XNACK, VCC (in that order).
+ if (getGeneration() == AMDGPUSubtarget::SEA_ISLANDS)
+ return 4; // FLAT_SCRATCH, VCC (in that order).
+ }
+
+ if (isXNACKEnabled())
+ return 4; // XNACK, VCC (in that order).
+ return 2; // VCC.
+}
+
+unsigned SISubtarget::getMaxNumSGPRs(const MachineFunction &MF) const {
+ const Function &F = *MF.getFunction();
+ const SIMachineFunctionInfo &MFI = *MF.getInfo<SIMachineFunctionInfo>();
+
+ // Compute maximum number of SGPRs function can use using default/requested
+ // minimum number of waves per execution unit.
+ std::pair<unsigned, unsigned> WavesPerEU = MFI.getWavesPerEU();
+ unsigned MaxNumSGPRs = getMaxNumSGPRs(WavesPerEU.first, false);
+ unsigned MaxAddressableNumSGPRs = getMaxNumSGPRs(WavesPerEU.first, true);
+
+ // Check if maximum number of SGPRs was explicitly requested using
+ // "amdgpu-num-sgpr" attribute.
+ if (F.hasFnAttribute("amdgpu-num-sgpr")) {
+ unsigned Requested = AMDGPU::getIntegerAttribute(
+ F, "amdgpu-num-sgpr", MaxNumSGPRs);
+
+ // Make sure requested value does not violate subtarget's specifications.
+ if (Requested && (Requested <= getReservedNumSGPRs(MF)))
+ Requested = 0;
+
+ // If more SGPRs are required to support the input user/system SGPRs,
+ // increase to accommodate them.
+ //
+ // FIXME: This really ends up using the requested number of SGPRs + number
+ // of reserved special registers in total. Theoretically you could re-use
+ // the last input registers for these special registers, but this would
+ // require a lot of complexity to deal with the weird aliasing.
+ unsigned InputNumSGPRs = MFI.getNumPreloadedSGPRs();
+ if (Requested && Requested < InputNumSGPRs)
+ Requested = InputNumSGPRs;
+
+ // Make sure requested value is compatible with values implied by
+ // default/requested minimum/maximum number of waves per execution unit.
+ if (Requested && Requested > getMaxNumSGPRs(WavesPerEU.first, false))
+ Requested = 0;
+ if (WavesPerEU.second &&
+ Requested && Requested < getMinNumSGPRs(WavesPerEU.second))
+ Requested = 0;
+
+ if (Requested)
+ MaxNumSGPRs = Requested;
+ }
+
if (hasSGPRInitBug())
- return SISubtarget::FIXED_SGPR_COUNT_FOR_INIT_BUG;
+ MaxNumSGPRs = AMDGPU::IsaInfo::FIXED_NUM_SGPRS_FOR_INIT_BUG;
+
+ return std::min(MaxNumSGPRs - getReservedNumSGPRs(MF),
+ MaxAddressableNumSGPRs);
+}
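+
+// Illustrative sketch (hypothetical attribute): a function annotated with
+//   "amdgpu-num-sgpr"="48"
+// requests a cap of 48 SGPRs. The request is raised to cover any preloaded
+// input SGPRs, and dropped entirely if it does not exceed the reserved count
+// or conflicts with the waves-per-EU bounds; the reserved SGPRs are then
+// subtracted from the final result.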
- if (getGeneration() >= VOLCANIC_ISLANDS)
- return 102;
+unsigned SISubtarget::getMaxNumVGPRs(const MachineFunction &MF) const {
+ const Function &F = *MF.getFunction();
+ const SIMachineFunctionInfo &MFI = *MF.getInfo<SIMachineFunctionInfo>();
+
+ // Compute maximum number of VGPRs function can use using default/requested
+ // minimum number of waves per execution unit.
+ std::pair<unsigned, unsigned> WavesPerEU = MFI.getWavesPerEU();
+ unsigned MaxNumVGPRs = getMaxNumVGPRs(WavesPerEU.first);
+
+ // Check if maximum number of VGPRs was explicitly requested using
+ // "amdgpu-num-vgpr" attribute.
+ if (F.hasFnAttribute("amdgpu-num-vgpr")) {
+ unsigned Requested = AMDGPU::getIntegerAttribute(
+ F, "amdgpu-num-vgpr", MaxNumVGPRs);
+
+ // Make sure requested value does not violate subtarget's specifications.
+ if (Requested && Requested <= getReservedNumVGPRs(MF))
+ Requested = 0;
+
+ // Make sure requested value is compatible with values implied by
+ // default/requested minimum/maximum number of waves per execution unit.
+ if (Requested && Requested > getMaxNumVGPRs(WavesPerEU.first))
+ Requested = 0;
+ if (WavesPerEU.second &&
+ Requested && Requested < getMinNumVGPRs(WavesPerEU.second))
+ Requested = 0;
+
+ if (Requested)
+ MaxNumVGPRs = Requested;
+ }
- return 104;
+ return MaxNumVGPRs - getReservedNumVGPRs(MF);
}
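+
+// Illustrative sketch: the VGPR path mirrors the SGPR one, e.g. a hypothetical
+// "amdgpu-num-vgpr"="32" is honored only if it exceeds the reserved VGPR count
+// and is consistent with the waves-per-EU bounds, with the reserved VGPRs
+// subtracted from the result.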
diff --git a/lib/Target/AMDGPU/AMDGPUSubtarget.h b/lib/Target/AMDGPU/AMDGPUSubtarget.h
index 0e3cb7dc1f87..36bc2498781f 100644
--- a/lib/Target/AMDGPU/AMDGPUSubtarget.h
+++ b/lib/Target/AMDGPU/AMDGPUSubtarget.h
@@ -22,6 +22,7 @@
#include "SIInstrInfo.h"
#include "SIISelLowering.h"
#include "SIFrameLowering.h"
+#include "SIMachineFunctionInfo.h"
#include "Utils/AMDGPUBaseInfo.h"
#include "llvm/ADT/Triple.h"
#include "llvm/CodeGen/GlobalISel/GISelAccessor.h"
@@ -51,6 +52,7 @@ public:
SOUTHERN_ISLANDS,
SEA_ISLANDS,
VOLCANIC_ISLANDS,
+ GFX9,
};
enum {
@@ -64,6 +66,28 @@ public:
ISAVersion8_0_3,
ISAVersion8_0_4,
ISAVersion8_1_0,
+ ISAVersion9_0_0,
+ ISAVersion9_0_1
+ };
+
+ enum TrapHandlerAbi {
+ TrapHandlerAbiNone = 0,
+ TrapHandlerAbiHsa = 1
+ };
+
+ enum TrapID {
+ TrapIDHardwareReserved = 0,
+ TrapIDHSADebugTrap = 1,
+ TrapIDLLVMTrap = 2,
+ TrapIDLLVMDebugTrap = 3,
+ TrapIDDebugBreakpoint = 7,
+ TrapIDDebugReserved8 = 8,
+ TrapIDDebugReservedFE = 0xfe,
+ TrapIDDebugReservedFF = 0xff
+ };
+
+ enum TrapRegValues {
+ LLVMTrapHandlerRegValue = 1
};
protected:
@@ -81,14 +105,16 @@ protected:
bool HalfRate64Ops;
// Dynamically set bits that enable features.
- bool FP16Denormals;
bool FP32Denormals;
- bool FP64Denormals;
+ bool FP64FP16Denormals;
bool FPExceptions;
+ bool DX10Clamp;
bool FlatForGlobal;
bool UnalignedScratchAccess;
bool UnalignedBufferAccess;
+ bool HasApertureRegs;
bool EnableXNACK;
+ bool TrapHandler;
bool DebuggerInsertNops;
bool DebuggerReserveRegs;
bool DebuggerEmitPrologue;
@@ -107,13 +133,17 @@ protected:
bool GCN1Encoding;
bool GCN3Encoding;
bool CIInsts;
+ bool GFX9Insts;
bool SGPRInitBug;
bool HasSMemRealTime;
bool Has16BitInsts;
+ bool HasVOP3PInsts;
bool HasMovrel;
bool HasVGPRIndexMode;
bool HasScalarStores;
bool HasInv2PiInlineImm;
+ bool HasSDWA;
+ bool HasDPP;
bool FlatAddressSpace;
bool R600ALUInst;
bool CaymanISA;
@@ -127,6 +157,7 @@ protected:
InstrItineraryData InstrItins;
SelectionDAGTargetInfo TSInfo;
+ AMDGPUAS AS;
public:
AMDGPUSubtarget(const Triple &TT, StringRef GPU, StringRef FS,
@@ -184,10 +215,18 @@ public:
return MaxPrivateElementSize;
}
+ AMDGPUAS getAMDGPUAS() const {
+ return AS;
+ }
+
bool has16BitInsts() const {
return Has16BitInsts;
}
+ bool hasVOP3PInsts() const {
+ return HasVOP3PInsts;
+ }
+
bool hasHWFP64() const {
return FP64;
}
@@ -243,6 +282,10 @@ public:
return (getGeneration() >= EVERGREEN);
}
+ bool hasMed3_16() const {
+ return getGeneration() >= GFX9;
+ }
+
bool hasCARRY() const {
return (getGeneration() >= EVERGREEN);
}
@@ -255,6 +298,10 @@ public:
return CaymanISA;
}
+ TrapHandlerAbi getTrapHandlerAbi() const {
+ return isAmdHsaOS() ? TrapHandlerAbiHsa : TrapHandlerAbiNone;
+ }
+
bool isPromoteAllocaEnabled() const {
return EnablePromoteAlloca;
}
@@ -267,20 +314,22 @@ public:
return DumpCode;
}
- bool enableIEEEBit(const MachineFunction &MF) const {
- return AMDGPU::isCompute(MF.getFunction()->getCallingConv());
- }
-
/// Return the amount of LDS that can be used that will not restrict the
/// occupancy lower than WaveCount.
- unsigned getMaxLocalMemSizeWithWaveCount(unsigned WaveCount) const;
+ unsigned getMaxLocalMemSizeWithWaveCount(unsigned WaveCount,
+ const Function &) const;
/// Inverse of getMaxLocalMemWithWaveCount. Return the maximum wavecount if
/// the given LDS memory size is the only constraint.
- unsigned getOccupancyWithLocalMemSize(uint32_t Bytes) const;
+ unsigned getOccupancyWithLocalMemSize(uint32_t Bytes, const Function &) const;
+
+ unsigned getOccupancyWithLocalMemSize(const MachineFunction &MF) const {
+ const auto *MFI = MF.getInfo<SIMachineFunctionInfo>();
+ return getOccupancyWithLocalMemSize(MFI->getLDSSize(), *MF.getFunction());
+ }
bool hasFP16Denormals() const {
- return FP16Denormals;
+ return FP64FP16Denormals;
}
bool hasFP32Denormals() const {
@@ -288,13 +337,21 @@ public:
}
bool hasFP64Denormals() const {
- return FP64Denormals;
+ return FP64FP16Denormals;
}
bool hasFPExceptions() const {
return FPExceptions;
}
+ bool enableDX10Clamp() const {
+ return DX10Clamp;
+ }
+
+ bool enableIEEEBit(const MachineFunction &MF) const {
+ return AMDGPU::isCompute(MF.getFunction()->getCallingConv());
+ }
+
bool useFlatForGlobal() const {
return FlatForGlobal;
}
@@ -307,10 +364,22 @@ public:
return UnalignedScratchAccess;
}
+ bool hasApertureRegs() const {
+ return HasApertureRegs;
+ }
+
+ bool isTrapHandlerEnabled() const {
+ return TrapHandler;
+ }
+
bool isXNACKEnabled() const {
return EnableXNACK;
}
+ bool hasFlatAddressSpace() const {
+ return FlatAddressSpace;
+ }
+
bool isMesaKernel(const MachineFunction &MF) const {
return isMesa3DOS() && !AMDGPU::isShader(MF.getFunction()->getCallingConv());
}
@@ -324,6 +393,10 @@ public:
return isAmdHsaOS() || isMesaKernel(MF);
}
+ bool hasFminFmaxLegacy() const {
+ return getGeneration() < AMDGPUSubtarget::VOLCANIC_ISLANDS;
+ }
+
/// \brief Returns the offset in bytes from the start of the input buffer
/// of the first explicit kernel argument.
unsigned getExplicitKernelArgOffset(const MachineFunction &MF) const {
@@ -355,72 +428,71 @@ public:
return true;
}
+ void setScalarizeGlobalBehavior(bool b) { ScalarizeGlobal = b;}
+ bool getScalarizeGlobalBehavior() const { return ScalarizeGlobal;}
+
/// \returns Number of execution units per compute unit supported by the
/// subtarget.
unsigned getEUsPerCU() const {
- return 4;
+ return AMDGPU::IsaInfo::getEUsPerCU(getFeatureBits());
}
/// \returns Maximum number of work groups per compute unit supported by the
- /// subtarget and limited by given flat work group size.
+ /// subtarget and limited by given \p FlatWorkGroupSize.
unsigned getMaxWorkGroupsPerCU(unsigned FlatWorkGroupSize) const {
- if (getGeneration() < AMDGPUSubtarget::SOUTHERN_ISLANDS)
- return 8;
- return getWavesPerWorkGroup(FlatWorkGroupSize) == 1 ? 40 : 16;
+ return AMDGPU::IsaInfo::getMaxWorkGroupsPerCU(getFeatureBits(),
+ FlatWorkGroupSize);
}
/// \returns Maximum number of waves per compute unit supported by the
/// subtarget without any kind of limitation.
unsigned getMaxWavesPerCU() const {
- return getMaxWavesPerEU() * getEUsPerCU();
+ return AMDGPU::IsaInfo::getMaxWavesPerCU(getFeatureBits());
}
/// \returns Maximum number of waves per compute unit supported by the
- /// subtarget and limited by given flat work group size.
+ /// subtarget and limited by given \p FlatWorkGroupSize.
unsigned getMaxWavesPerCU(unsigned FlatWorkGroupSize) const {
- return getWavesPerWorkGroup(FlatWorkGroupSize);
+ return AMDGPU::IsaInfo::getMaxWavesPerCU(getFeatureBits(),
+ FlatWorkGroupSize);
}
/// \returns Minimum number of waves per execution unit supported by the
/// subtarget.
unsigned getMinWavesPerEU() const {
- return 1;
+ return AMDGPU::IsaInfo::getMinWavesPerEU(getFeatureBits());
}
/// \returns Maximum number of waves per execution unit supported by the
/// subtarget without any kind of limitation.
unsigned getMaxWavesPerEU() const {
- if (getGeneration() < AMDGPUSubtarget::SOUTHERN_ISLANDS)
- return 8;
- // FIXME: Need to take scratch memory into account.
- return 10;
+ return AMDGPU::IsaInfo::getMaxWavesPerEU(getFeatureBits());
}
/// \returns Maximum number of waves per execution unit supported by the
- /// subtarget and limited by given flat work group size.
+ /// subtarget and limited by given \p FlatWorkGroupSize.
unsigned getMaxWavesPerEU(unsigned FlatWorkGroupSize) const {
- return alignTo(getMaxWavesPerCU(FlatWorkGroupSize), getEUsPerCU()) /
- getEUsPerCU();
+ return AMDGPU::IsaInfo::getMaxWavesPerEU(getFeatureBits(),
+ FlatWorkGroupSize);
}
/// \returns Minimum flat work group size supported by the subtarget.
unsigned getMinFlatWorkGroupSize() const {
- return 1;
+ return AMDGPU::IsaInfo::getMinFlatWorkGroupSize(getFeatureBits());
}
/// \returns Maximum flat work group size supported by the subtarget.
unsigned getMaxFlatWorkGroupSize() const {
- return 2048;
+ return AMDGPU::IsaInfo::getMaxFlatWorkGroupSize(getFeatureBits());
}
- /// \returns Number of waves per work group given the flat work group size.
+ /// \returns Number of waves per work group supported by the subtarget and
+ /// limited by given \p FlatWorkGroupSize.
unsigned getWavesPerWorkGroup(unsigned FlatWorkGroupSize) const {
- return alignTo(FlatWorkGroupSize, getWavefrontSize()) / getWavefrontSize();
+ return AMDGPU::IsaInfo::getWavesPerWorkGroup(getFeatureBits(),
+ FlatWorkGroupSize);
}
- void setScalarizeGlobalBehavior(bool b) { ScalarizeGlobal = b;}
- bool getScalarizeGlobalBehavior() const { return ScalarizeGlobal;}
-
/// \returns Subtarget's default pair of minimum/maximum flat work group sizes
/// for function \p F, or minimum/maximum flat work group sizes explicitly
/// requested using "amdgpu-flat-work-group-size" attribute attached to
@@ -440,6 +512,9 @@ public:
/// compatible with minimum/maximum number of waves limited by flat work group
/// size, register usage, and/or lds usage.
std::pair<unsigned, unsigned> getWavesPerEU(const Function &F) const;
+
+  /// Creates value range metadata on a workitemid.* intrinsic call or load.
+ bool makeLIDRangeMetadata(Instruction *I) const;
};
class R600Subtarget final : public AMDGPUSubtarget {
@@ -482,13 +557,6 @@ public:
};
class SISubtarget final : public AMDGPUSubtarget {
-public:
- enum {
- // The closed Vulkan driver sets 96, which limits the wave count to 8 but
- // doesn't spill SGPRs as much as when 80 is set.
- FIXED_SGPR_COUNT_FOR_INIT_BUG = 96
- };
-
private:
SIInstrInfo InstrInfo;
SIFrameLowering FrameLowering;
@@ -516,6 +584,21 @@ public:
return GISel->getCallLowering();
}
+ const InstructionSelector *getInstructionSelector() const override {
+ assert(GISel && "Access to GlobalISel APIs not set");
+ return GISel->getInstructionSelector();
+ }
+
+ const LegalizerInfo *getLegalizerInfo() const override {
+ assert(GISel && "Access to GlobalISel APIs not set");
+ return GISel->getLegalizerInfo();
+ }
+
+ const RegisterBankInfo *getRegBankInfo() const override {
+ assert(GISel && "Access to GlobalISel APIs not set");
+ return GISel->getRegBankInfo();
+ }
+
const SIRegisterInfo *getRegisterInfo() const override {
return &InstrInfo.getRegisterInfo();
}
@@ -524,6 +607,11 @@ public:
this->GISel.reset(&GISel);
}
+ // XXX - Why is this here if it isn't in the default pass set?
+ bool enableEarlyIfConversion() const override {
+ return true;
+ }
+
void overrideSchedPolicy(MachineSchedPolicy &Policy,
unsigned NumRegionInstrs) const override;
@@ -533,10 +621,6 @@ public:
return 16;
}
- bool hasFlatAddressSpace() const {
- return FlatAddressSpace;
- }
-
bool hasSMemRealTime() const {
return HasSMemRealTime;
}
@@ -549,6 +633,10 @@ public:
return HasVGPRIndexMode;
}
+ bool useVGPRIndexMode(bool UserEnable) const {
+ return !hasMovrel() || (UserEnable && hasVGPRIndexMode());
+ }
+
bool hasScalarCompareEq64() const {
return getGeneration() >= VOLCANIC_ISLANDS;
}
@@ -561,6 +649,14 @@ public:
return HasInv2PiInlineImm;
}
+ bool hasSDWA() const {
+ return HasSDWA;
+ }
+
+ bool hasDPP() const {
+ return HasDPP;
+ }
+
bool enableSIScheduler() const {
return EnableSIScheduler;
}
@@ -594,6 +690,14 @@ public:
return getGeneration() != AMDGPUSubtarget::SOUTHERN_ISLANDS;
}
+ bool hasSMovFedHazard() const {
+ return getGeneration() >= AMDGPUSubtarget::GFX9;
+ }
+
+ bool hasReadM0Hazard() const {
+ return getGeneration() >= AMDGPUSubtarget::GFX9;
+ }
+
unsigned getKernArgSegmentSize(const MachineFunction &MF, unsigned ExplicitArgBytes) const;
/// Return the maximum number of waves per SIMD for kernels using \p SGPRs SGPRs
@@ -605,10 +709,107 @@ public:
/// \returns True if waitcnt instruction is needed before barrier instruction,
/// false otherwise.
bool needWaitcntBeforeBarrier() const {
- return true;
+ return getGeneration() < GFX9;
+ }
+
+ /// \returns true if the flat_scratch register should be initialized with the
+ /// pointer to the wave's scratch memory rather than a size and offset.
+ bool flatScratchIsPointer() const {
+ return getGeneration() >= GFX9;
+ }
+
+ /// \returns SGPR allocation granularity supported by the subtarget.
+ unsigned getSGPRAllocGranule() const {
+ return AMDGPU::IsaInfo::getSGPRAllocGranule(getFeatureBits());
+ }
+
+ /// \returns SGPR encoding granularity supported by the subtarget.
+ unsigned getSGPREncodingGranule() const {
+ return AMDGPU::IsaInfo::getSGPREncodingGranule(getFeatureBits());
}
- unsigned getMaxNumSGPRs() const;
+ /// \returns Total number of SGPRs supported by the subtarget.
+ unsigned getTotalNumSGPRs() const {
+ return AMDGPU::IsaInfo::getTotalNumSGPRs(getFeatureBits());
+ }
+
+ /// \returns Addressable number of SGPRs supported by the subtarget.
+ unsigned getAddressableNumSGPRs() const {
+ return AMDGPU::IsaInfo::getAddressableNumSGPRs(getFeatureBits());
+ }
+
+ /// \returns Minimum number of SGPRs that meets the given number of waves per
+ /// execution unit requirement supported by the subtarget.
+ unsigned getMinNumSGPRs(unsigned WavesPerEU) const {
+ return AMDGPU::IsaInfo::getMinNumSGPRs(getFeatureBits(), WavesPerEU);
+ }
+
+ /// \returns Maximum number of SGPRs that meets the given number of waves per
+ /// execution unit requirement supported by the subtarget.
+ unsigned getMaxNumSGPRs(unsigned WavesPerEU, bool Addressable) const {
+ return AMDGPU::IsaInfo::getMaxNumSGPRs(getFeatureBits(), WavesPerEU,
+ Addressable);
+ }
+
+ /// \returns Reserved number of SGPRs for given function \p MF.
+ unsigned getReservedNumSGPRs(const MachineFunction &MF) const;
+
+ /// \returns Maximum number of SGPRs that meets number of waves per execution
+ /// unit requirement for function \p MF, or number of SGPRs explicitly
+ /// requested using "amdgpu-num-sgpr" attribute attached to function \p MF.
+ ///
+ /// \returns A value that meets the number of waves per execution unit
+ /// requirement if the explicitly requested value cannot be converted to an
+ /// integer, violates the subtarget's specifications, or does not itself meet
+ /// the requirement.
+ unsigned getMaxNumSGPRs(const MachineFunction &MF) const;
+
+ /// \returns VGPR allocation granularity supported by the subtarget.
+ unsigned getVGPRAllocGranule() const {
+ return AMDGPU::IsaInfo::getVGPRAllocGranule(getFeatureBits());
+ }
+
+ /// \returns VGPR encoding granularity supported by the subtarget.
+ unsigned getVGPREncodingGranule() const {
+ return AMDGPU::IsaInfo::getVGPREncodingGranule(getFeatureBits());
+ }
+
+ /// \returns Total number of VGPRs supported by the subtarget.
+ unsigned getTotalNumVGPRs() const {
+ return AMDGPU::IsaInfo::getTotalNumVGPRs(getFeatureBits());
+ }
+
+ /// \returns Addressable number of VGPRs supported by the subtarget.
+ unsigned getAddressableNumVGPRs() const {
+ return AMDGPU::IsaInfo::getAddressableNumVGPRs(getFeatureBits());
+ }
+
+ /// \returns Minimum number of VGPRs that meets given number of waves per
+ /// execution unit requirement supported by the subtarget.
+ unsigned getMinNumVGPRs(unsigned WavesPerEU) const {
+ return AMDGPU::IsaInfo::getMinNumVGPRs(getFeatureBits(), WavesPerEU);
+ }
+
+ /// \returns Maximum number of VGPRs that meets given number of waves per
+ /// execution unit requirement supported by the subtarget.
+ unsigned getMaxNumVGPRs(unsigned WavesPerEU) const {
+ return AMDGPU::IsaInfo::getMaxNumVGPRs(getFeatureBits(), WavesPerEU);
+ }
+
+ /// \returns Reserved number of VGPRs for given function \p MF.
+ unsigned getReservedNumVGPRs(const MachineFunction &MF) const {
+ return debuggerReserveRegs() ? 4 : 0;
+ }
+
+ /// \returns Maximum number of VGPRs that meets number of waves per execution
+ /// unit requirement for function \p MF, or number of VGPRs explicitly
+ /// requested using "amdgpu-num-vgpr" attribute attached to function \p MF.
+ ///
+ /// \returns A value that meets the number of waves per execution unit
+ /// requirement if the explicitly requested value cannot be converted to an
+ /// integer, violates the subtarget's specifications, or does not itself meet
+ /// the requirement.
+ unsigned getMaxNumVGPRs(const MachineFunction &MF) const;
};
} // end namespace llvm
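The SGPR/VGPR accessors above all delegate to AMDGPU::IsaInfo so that the feature-bit-dependent limits live in a single place. A minimal sketch of how a client might combine them to bound occupancy by register pressure; the function name computeOccupancyBound and its locals are illustrative, not part of this patch:

#include "AMDGPUSubtarget.h"

// Sketch only: walk down from the subtarget's maximum wave count until the
// given register usage fits the per-wave budget. Assumes ST comes from
// MachineFunction::getSubtarget<SISubtarget>().
static unsigned computeOccupancyBound(const SISubtarget &ST,
                                      unsigned NumSGPRs, unsigned NumVGPRs) {
  unsigned Waves = ST.getMaxWavesPerEU();
  while (Waves > 1 &&
         (NumSGPRs > ST.getMaxNumSGPRs(Waves, /*Addressable=*/true) ||
          NumVGPRs > ST.getMaxNumVGPRs(Waves)))
    --Waves;
  return Waves;
}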
diff --git a/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp b/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
index d8a0c716279c..0202220b8011 100644
--- a/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
+++ b/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
@@ -15,24 +15,29 @@
#include "AMDGPUTargetMachine.h"
#include "AMDGPU.h"
+#include "AMDGPUAliasAnalysis.h"
#include "AMDGPUCallLowering.h"
+#include "AMDGPUInstructionSelector.h"
+#include "AMDGPULegalizerInfo.h"
+#ifdef LLVM_BUILD_GLOBAL_ISEL
+#include "AMDGPURegisterBankInfo.h"
+#endif
#include "AMDGPUTargetObjectFile.h"
#include "AMDGPUTargetTransformInfo.h"
+#include "GCNIterativeScheduler.h"
#include "GCNSchedStrategy.h"
#include "R600MachineScheduler.h"
#include "SIMachineScheduler.h"
-#include "llvm/ADT/SmallString.h"
-#include "llvm/ADT/STLExtras.h"
-#include "llvm/ADT/StringRef.h"
-#include "llvm/ADT/Triple.h"
-#include "llvm/CodeGen/GlobalISel/GISelAccessor.h"
+#include "llvm/CodeGen/GlobalISel/InstructionSelect.h"
#include "llvm/CodeGen/GlobalISel/IRTranslator.h"
-#include "llvm/CodeGen/MachineScheduler.h"
+#include "llvm/CodeGen/GlobalISel/Legalizer.h"
+#include "llvm/CodeGen/GlobalISel/RegBankSelect.h"
#include "llvm/CodeGen/Passes.h"
#include "llvm/CodeGen/TargetPassConfig.h"
#include "llvm/Support/TargetRegistry.h"
#include "llvm/Transforms/IPO.h"
#include "llvm/Transforms/IPO/AlwaysInliner.h"
+#include "llvm/Transforms/IPO/PassManagerBuilder.h"
#include "llvm/Transforms/Scalar.h"
#include "llvm/Transforms/Scalar/GVN.h"
#include "llvm/Transforms/Vectorize.h"
@@ -58,6 +63,11 @@ static cl::opt<bool> EnableSROA(
cl::ReallyHidden,
cl::init(true));
+static cl::opt<bool>
+EnableEarlyIfConversion("amdgpu-early-ifcvt", cl::Hidden,
+ cl::desc("Run early if-conversion"),
+ cl::init(false));
+
static cl::opt<bool> EnableR600IfConvert(
"r600-if-convert",
cl::desc("Use if conversion pass"),
@@ -78,6 +88,36 @@ static cl::opt<bool> ScalarizeGlobal(
cl::init(false),
cl::Hidden);
+// Option to run internalize pass.
+static cl::opt<bool> InternalizeSymbols(
+ "amdgpu-internalize-symbols",
+ cl::desc("Enable elimination of non-kernel functions and unused globals"),
+ cl::init(false),
+ cl::Hidden);
+
+// Option to inline all early.
+static cl::opt<bool> EarlyInlineAll(
+ "amdgpu-early-inline-all",
+ cl::desc("Inline all functions early"),
+ cl::init(false),
+ cl::Hidden);
+
+static cl::opt<bool> EnableSDWAPeephole(
+ "amdgpu-sdwa-peephole",
+ cl::desc("Enable SDWA peepholer"),
+ cl::init(true));
+
+// Enable address space based alias analysis
+static cl::opt<bool> EnableAMDGPUAliasAnalysis("enable-amdgpu-aa", cl::Hidden,
+ cl::desc("Enable AMDGPU Alias Analysis"),
+ cl::init(true));
+
+// Option to enable new waitcnt insertion pass.
+static cl::opt<bool> EnableSIInsertWaitcntsPass(
+ "enable-si-insert-waitcnts",
+ cl::desc("Use new waitcnt insertion pass"),
+ cl::init(false));
+
extern "C" void LLVMInitializeAMDGPUTarget() {
// Register the target
RegisterTargetMachine<R600TargetMachine> X(getTheAMDGPUTarget());
@@ -86,22 +126,28 @@ extern "C" void LLVMInitializeAMDGPUTarget() {
PassRegistry *PR = PassRegistry::getPassRegistry();
initializeSILowerI1CopiesPass(*PR);
initializeSIFixSGPRCopiesPass(*PR);
+ initializeSIFixVGPRCopiesPass(*PR);
initializeSIFoldOperandsPass(*PR);
+ initializeSIPeepholeSDWAPass(*PR);
initializeSIShrinkInstructionsPass(*PR);
initializeSIFixControlFlowLiveIntervalsPass(*PR);
initializeSILoadStoreOptimizerPass(*PR);
initializeAMDGPUAnnotateKernelFeaturesPass(*PR);
initializeAMDGPUAnnotateUniformValuesPass(*PR);
+ initializeAMDGPULowerIntrinsicsPass(*PR);
initializeAMDGPUPromoteAllocaPass(*PR);
initializeAMDGPUCodeGenPreparePass(*PR);
initializeAMDGPUUnifyMetadataPass(*PR);
initializeSIAnnotateControlFlowPass(*PR);
initializeSIInsertWaitsPass(*PR);
+ initializeSIInsertWaitcntsPass(*PR);
initializeSIWholeQuadModePass(*PR);
initializeSILowerControlFlowPass(*PR);
initializeSIInsertSkipsPass(*PR);
initializeSIDebuggerInsertNopsPass(*PR);
initializeSIOptimizeExecMaskingPass(*PR);
+ initializeAMDGPUUnifyDivergentExitNodesPass(*PR);
+ initializeAMDGPUAAWrapperPassPass(*PR);
}
static std::unique_ptr<TargetLoweringObjectFile> createTLOF(const Triple &TT) {
@@ -119,13 +165,26 @@ static ScheduleDAGInstrs *createSIMachineScheduler(MachineSchedContext *C) {
static ScheduleDAGInstrs *
createGCNMaxOccupancyMachineScheduler(MachineSchedContext *C) {
ScheduleDAGMILive *DAG =
- new ScheduleDAGMILive(C,
- llvm::make_unique<GCNMaxOccupancySchedStrategy>(C));
+ new GCNScheduleDAGMILive(C, make_unique<GCNMaxOccupancySchedStrategy>(C));
DAG->addMutation(createLoadClusterDAGMutation(DAG->TII, DAG->TRI));
DAG->addMutation(createStoreClusterDAGMutation(DAG->TII, DAG->TRI));
return DAG;
}
+static ScheduleDAGInstrs *
+createIterativeGCNMaxOccupancyMachineScheduler(MachineSchedContext *C) {
+ auto DAG = new GCNIterativeScheduler(C,
+ GCNIterativeScheduler::SCHEDULE_LEGACYMAXOCCUPANCY);
+ DAG->addMutation(createLoadClusterDAGMutation(DAG->TII, DAG->TRI));
+ DAG->addMutation(createStoreClusterDAGMutation(DAG->TII, DAG->TRI));
+ return DAG;
+}
+
+static ScheduleDAGInstrs *createMinRegScheduler(MachineSchedContext *C) {
+ return new GCNIterativeScheduler(C,
+ GCNIterativeScheduler::SCHEDULE_MINREGFORCED);
+}
+
static MachineSchedRegistry
R600SchedRegistry("r600", "Run R600's custom scheduler",
createR600MachineScheduler);
@@ -139,6 +198,16 @@ GCNMaxOccupancySchedRegistry("gcn-max-occupancy",
"Run GCN scheduler to maximize occupancy",
createGCNMaxOccupancyMachineScheduler);
+static MachineSchedRegistry
+IterativeGCNMaxOccupancySchedRegistry("gcn-max-occupancy-experimental",
+ "Run GCN scheduler to maximize occupancy (experimental)",
+ createIterativeGCNMaxOccupancyMachineScheduler);
+
+static MachineSchedRegistry
+GCNMinRegSchedRegistry("gcn-minreg",
+ "Run GCN iterative scheduler for minimal register usage (experimental)",
+ createMinRegScheduler);
+
static StringRef computeDataLayout(const Triple &TT) {
if (TT.getArch() == Triple::r600) {
// 32-bit pointers.
@@ -148,9 +217,14 @@ static StringRef computeDataLayout(const Triple &TT) {
// 32-bit private, local, and region pointers. 64-bit global, constant and
// flat.
- return "e-p:32:32-p1:64:64-p2:64:64-p3:32:32-p4:64:64-p5:32:32"
+ if (TT.getEnvironmentName() == "amdgiz" ||
+ TT.getEnvironmentName() == "amdgizcl")
+ return "e-p:64:64-p1:64:64-p2:64:64-p3:32:32-p4:32:32-p5:32:32"
"-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128"
- "-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64";
+ "-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-A5";
+ return "e-p:32:32-p1:64:64-p2:64:64-p3:32:32-p4:64:64-p5:32:32"
+ "-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128"
+ "-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64";
}
LLVM_READNONE
@@ -180,6 +254,7 @@ AMDGPUTargetMachine::AMDGPUTargetMachine(const Target &T, const Triple &TT,
: LLVMTargetMachine(T, computeDataLayout(TT), TT, getGPUOrDefault(TT, CPU),
FS, Options, getEffectiveRelocModel(RM), CM, OptLevel),
TLOF(createTLOF(getTargetTriple())) {
+ AS = AMDGPU::getAMDGPUAS(TT);
initAsmInfo();
}
@@ -199,8 +274,65 @@ StringRef AMDGPUTargetMachine::getFeatureString(const Function &F) const {
FSAttr.getValueAsString();
}
-void AMDGPUTargetMachine::addEarlyAsPossiblePasses(PassManagerBase &PM) {
- PM.add(createAMDGPUUnifyMetadataPass());
+static ImmutablePass *createAMDGPUExternalAAWrapperPass() {
+ return createExternalAAWrapperPass([](Pass &P, Function &, AAResults &AAR) {
+ if (auto *WrapperPass = P.getAnalysisIfAvailable<AMDGPUAAWrapperPass>())
+ AAR.addAAResult(WrapperPass->getResult());
+ });
+}
+
+void AMDGPUTargetMachine::adjustPassManager(PassManagerBuilder &Builder) {
+ Builder.DivergentTarget = true;
+
+ bool Internalize = InternalizeSymbols &&
+ (getOptLevel() > CodeGenOpt::None) &&
+ (getTargetTriple().getArch() == Triple::amdgcn);
+ bool EarlyInline = EarlyInlineAll &&
+ (getOptLevel() > CodeGenOpt::None);
+ bool AMDGPUAA = EnableAMDGPUAliasAnalysis && getOptLevel() > CodeGenOpt::None;
+
+ Builder.addExtension(
+ PassManagerBuilder::EP_ModuleOptimizerEarly,
+ [Internalize, EarlyInline, AMDGPUAA](const PassManagerBuilder &,
+ legacy::PassManagerBase &PM) {
+ if (AMDGPUAA) {
+ PM.add(createAMDGPUAAWrapperPass());
+ PM.add(createAMDGPUExternalAAWrapperPass());
+ }
+ PM.add(createAMDGPUUnifyMetadataPass());
+ if (Internalize) {
+ PM.add(createInternalizePass([=](const GlobalValue &GV) -> bool {
+ if (const Function *F = dyn_cast<Function>(&GV)) {
+ if (F->isDeclaration())
+ return true;
+ switch (F->getCallingConv()) {
+ default:
+ return false;
+ case CallingConv::AMDGPU_VS:
+ case CallingConv::AMDGPU_GS:
+ case CallingConv::AMDGPU_PS:
+ case CallingConv::AMDGPU_CS:
+ case CallingConv::AMDGPU_KERNEL:
+ case CallingConv::SPIR_KERNEL:
+ return true;
+ }
+ }
+ return !GV.use_empty();
+ }));
+ PM.add(createGlobalDCEPass());
+ }
+ if (EarlyInline)
+ PM.add(createAMDGPUAlwaysInlinePass(false));
+ });
+
+ Builder.addExtension(
+ PassManagerBuilder::EP_EarlyAsPossible,
+ [AMDGPUAA](const PassManagerBuilder &, legacy::PassManagerBase &PM) {
+ if (AMDGPUAA) {
+ PM.add(createAMDGPUAAWrapperPass());
+ PM.add(createAMDGPUExternalAAWrapperPass());
+ }
+ });
}
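For context, adjustPassManager is invoked by frontends that build their optimization pipeline with PassManagerBuilder. A minimal sketch of the call sequence, assuming TM is an already-constructed AMDGPUTargetMachine and M the module being compiled:

// Sketch only: let the target install its extensions (AMDGPU AA, metadata
// unification, optional internalization) before populating the pipeline.
legacy::PassManager PM;
PassManagerBuilder Builder;
Builder.OptLevel = 2;
TM->adjustPassManager(Builder);      // installs the extensions added above
Builder.populateModulePassManager(PM);
PM.run(M);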
//===----------------------------------------------------------------------===//
@@ -245,9 +377,21 @@ namespace {
struct SIGISelActualAccessor : public GISelAccessor {
std::unique_ptr<AMDGPUCallLowering> CallLoweringInfo;
+ std::unique_ptr<InstructionSelector> InstSelector;
+ std::unique_ptr<LegalizerInfo> Legalizer;
+ std::unique_ptr<RegisterBankInfo> RegBankInfo;
const AMDGPUCallLowering *getCallLowering() const override {
return CallLoweringInfo.get();
}
+ const InstructionSelector *getInstructionSelector() const override {
+ return InstSelector.get();
+ }
+ const LegalizerInfo *getLegalizerInfo() const override {
+ return Legalizer.get();
+ }
+ const RegisterBankInfo *getRegBankInfo() const override {
+ return RegBankInfo.get();
+ }
};
} // end anonymous namespace
@@ -281,6 +425,11 @@ const SISubtarget *GCNTargetMachine::getSubtargetImpl(const Function &F) const {
SIGISelActualAccessor *GISel = new SIGISelActualAccessor();
GISel->CallLoweringInfo.reset(
new AMDGPUCallLowering(*I->getTargetLowering()));
+ GISel->Legalizer.reset(new AMDGPULegalizerInfo());
+
+ GISel->RegBankInfo.reset(new AMDGPURegisterBankInfo(*I->getRegisterInfo()));
+ GISel->InstSelector.reset(new AMDGPUInstructionSelector(*I,
+ *static_cast<AMDGPURegisterBankInfo*>(GISel->RegBankInfo.get())));
#endif
I->setGISelAccessor(*GISel);
@@ -356,9 +505,9 @@ public:
ScheduleDAGInstrs *
createMachineScheduler(MachineSchedContext *C) const override;
- void addIRPasses() override;
bool addPreISel() override;
void addMachineSSAOptimization() override;
+ bool addILPOpts() override;
bool addInstSelector() override;
#ifdef LLVM_BUILD_GLOBAL_ISEL
bool addIRTranslator() override;
@@ -406,11 +555,15 @@ void AMDGPUPassConfig::addStraightLineScalarOptimizationPasses() {
}
void AMDGPUPassConfig::addIRPasses() {
+ const AMDGPUTargetMachine &TM = getAMDGPUTargetMachine();
+
// There is no reason to run these.
disablePass(&StackMapLivenessID);
disablePass(&FuncletLayoutID);
disablePass(&PatchableFunctionID);
+ addPass(createAMDGPULowerIntrinsicsPass(&TM));
+
// Function calls are not supported, so make sure we inline everything.
addPass(createAMDGPUAlwaysInlinePass());
addPass(createAlwaysInlinerLegacyPass());
@@ -421,17 +574,33 @@ void AMDGPUPassConfig::addIRPasses() {
// without ever running any passes on the second.
addPass(createBarrierNoopPass());
+ if (TM.getTargetTriple().getArch() == Triple::amdgcn) {
+ // TODO: May want to move later or split into an early and late one.
+
+ addPass(createAMDGPUCodeGenPreparePass(
+ static_cast<const GCNTargetMachine *>(&TM)));
+ }
+
// Handle uses of OpenCL image2d_t, image3d_t and sampler_t arguments.
addPass(createAMDGPUOpenCLImageTypeLoweringPass());
- const AMDGPUTargetMachine &TM = getAMDGPUTargetMachine();
if (TM.getOptLevel() > CodeGenOpt::None) {
+ addPass(createInferAddressSpacesPass());
addPass(createAMDGPUPromoteAlloca(&TM));
if (EnableSROA)
addPass(createSROAPass());
addStraightLineScalarOptimizationPasses();
+
+ if (EnableAMDGPUAliasAnalysis) {
+ addPass(createAMDGPUAAWrapperPass());
+ addPass(createExternalAAWrapperPass([](Pass &P, Function &,
+ AAResults &AAR) {
+ if (auto *WrapperPass = P.getAnalysisIfAvailable<AMDGPUAAWrapperPass>())
+ AAR.addAAResult(WrapperPass->getResult());
+ }));
+ }
}
TargetPassConfig::addIRPasses();
@@ -526,7 +695,12 @@ bool GCNPassConfig::addPreISel() {
// FIXME: We need to run a pass to propagate the attributes when calls are
// supported.
- addPass(&AMDGPUAnnotateKernelFeaturesID);
+ const AMDGPUTargetMachine &TM = getAMDGPUTargetMachine();
+ addPass(createAMDGPUAnnotateKernelFeaturesPass(&TM));
+
+ // Merge divergent exit nodes. StructurizeCFG won't recognize the multi-exit
+ // regions formed by them.
+ addPass(&AMDGPUUnifyDivergentExitNodesID);
addPass(createStructurizeCFGPass(true)); // true -> SkipUniformRegions
addPass(createSinkingPass());
addPass(createSITypeRewriter());
@@ -549,13 +723,19 @@ void GCNPassConfig::addMachineSSAOptimization() {
addPass(&SIFoldOperandsID);
addPass(&DeadMachineInstructionElimID);
addPass(&SILoadStoreOptimizerID);
+ addPass(createSIShrinkInstructionsPass());
+ if (EnableSDWAPeephole) {
+ addPass(&SIPeepholeSDWAID);
+ addPass(&DeadMachineInstructionElimID);
+ }
}
-void GCNPassConfig::addIRPasses() {
- // TODO: May want to move later or split into an early and late one.
- addPass(createAMDGPUCodeGenPreparePass(&getGCNTargetMachine()));
+bool GCNPassConfig::addILPOpts() {
+ if (EnableEarlyIfConversion)
+ addPass(&EarlyIfConverterID);
- AMDGPUPassConfig::addIRPasses();
+ TargetPassConfig::addILPOpts();
+ return false;
}
bool GCNPassConfig::addInstSelector() {
@@ -572,20 +752,23 @@ bool GCNPassConfig::addIRTranslator() {
}
bool GCNPassConfig::addLegalizeMachineIR() {
+ addPass(new Legalizer());
return false;
}
bool GCNPassConfig::addRegBankSelect() {
+ addPass(new RegBankSelect());
return false;
}
bool GCNPassConfig::addGlobalInstructionSelect() {
+ addPass(new InstructionSelect());
return false;
}
+
#endif
void GCNPassConfig::addPreRegAlloc() {
- addPass(createSIShrinkInstructionsPass());
addPass(createSIWholeQuadModePass());
}
@@ -615,6 +798,7 @@ void GCNPassConfig::addOptimizedRegAlloc(FunctionPass *RegAllocPass) {
}
void GCNPassConfig::addPostRegAlloc() {
+ addPass(&SIFixVGPRCopiesID);
addPass(&SIOptimizeExecMaskingID);
TargetPassConfig::addPostRegAlloc();
}
@@ -633,7 +817,10 @@ void GCNPassConfig::addPreEmitPass() {
// cases.
addPass(&PostRAHazardRecognizerID);
- addPass(createSIInsertWaitsPass());
+ if (EnableSIInsertWaitcntsPass)
+ addPass(createSIInsertWaitcntsPass());
+ else
+ addPass(createSIInsertWaitsPass());
addPass(createSIShrinkInstructionsPass());
addPass(&SIInsertSkipsPassID);
addPass(createSIDebuggerInsertNopsPass());
@@ -643,3 +830,4 @@ void GCNPassConfig::addPreEmitPass() {
TargetPassConfig *GCNTargetMachine::createPassConfig(PassManagerBase &PM) {
return new GCNPassConfig(this, PM);
}
+
diff --git a/lib/Target/AMDGPU/AMDGPUTargetMachine.h b/lib/Target/AMDGPU/AMDGPUTargetMachine.h
index 9496773a073f..934bf7f31bab 100644
--- a/lib/Target/AMDGPU/AMDGPUTargetMachine.h
+++ b/lib/Target/AMDGPU/AMDGPUTargetMachine.h
@@ -35,6 +35,7 @@ class AMDGPUTargetMachine : public LLVMTargetMachine {
protected:
std::unique_ptr<TargetLoweringObjectFile> TLOF;
AMDGPUIntrinsicInfo IntrinsicInfo;
+ AMDGPUAS AS;
StringRef getGPUName(const Function &F) const;
StringRef getFeatureString(const Function &F) const;
@@ -57,7 +58,18 @@ public:
TargetLoweringObjectFile *getObjFileLowering() const override {
return TLOF.get();
}
- void addEarlyAsPossiblePasses(PassManagerBase &PM) override;
+ AMDGPUAS getAMDGPUAS() const {
+ return AS;
+ }
+
+ void adjustPassManager(PassManagerBuilder &) override;
+ /// Get the integer value of a null pointer in the given address space.
+ uint64_t getNullPointerValue(unsigned AddrSpace) const {
+ if (AddrSpace == AS.LOCAL_ADDRESS || AddrSpace == AS.REGION_ADDRESS)
+ return -1;
+ return 0;
+ }
+
};
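The local/region special case exists because the zero bit pattern is a valid address in LDS. A quick illustration, assuming the conventional numbering where LOCAL_ADDRESS is 3 and GLOBAL_ADDRESS is 1:

// Illustration only: local/region null is the all-ones pattern; every other
// address space keeps the usual zero null.
uint64_t LocalNull  = TM.getNullPointerValue(/*AddrSpace=*/3); // 0xFFFFFFFFFFFFFFFF
uint64_t GlobalNull = TM.getNullPointerValue(/*AddrSpace=*/1); // 0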
//===----------------------------------------------------------------------===//
diff --git a/lib/Target/AMDGPU/AMDGPUTargetObjectFile.cpp b/lib/Target/AMDGPU/AMDGPUTargetObjectFile.cpp
index 1fddc88a705a..c96761c0b04e 100644
--- a/lib/Target/AMDGPU/AMDGPUTargetObjectFile.cpp
+++ b/lib/Target/AMDGPU/AMDGPUTargetObjectFile.cpp
@@ -7,6 +7,7 @@
//
//===----------------------------------------------------------------------===//
+#include "AMDGPUTargetMachine.h"
#include "AMDGPUTargetObjectFile.h"
#include "AMDGPU.h"
#include "llvm/MC/MCContext.h"
@@ -22,7 +23,8 @@ using namespace llvm;
MCSection *AMDGPUTargetObjectFile::SelectSectionForGlobal(
const GlobalObject *GO, SectionKind Kind, const TargetMachine &TM) const {
- if (Kind.isReadOnly() && AMDGPU::isReadOnlySegment(GO) &&
+ auto AS = static_cast<const AMDGPUTargetMachine*>(&TM)->getAMDGPUAS();
+ if (Kind.isReadOnly() && AMDGPU::isReadOnlySegment(GO, AS) &&
AMDGPU::shouldEmitConstantsToTextSection(TM.getTargetTriple()))
return TextSection;
diff --git a/lib/Target/AMDGPU/AMDGPUTargetObjectFile.h b/lib/Target/AMDGPU/AMDGPUTargetObjectFile.h
index de327786dff6..ca6210f69298 100644
--- a/lib/Target/AMDGPU/AMDGPUTargetObjectFile.h
+++ b/lib/Target/AMDGPU/AMDGPUTargetObjectFile.h
@@ -16,6 +16,7 @@
#ifndef LLVM_LIB_TARGET_AMDGPU_AMDGPUTARGETOBJECTFILE_H
#define LLVM_LIB_TARGET_AMDGPU_AMDGPUTARGETOBJECTFILE_H
+#include "AMDGPU.h"
#include "llvm/CodeGen/TargetLoweringObjectFileImpl.h"
#include "llvm/Target/TargetMachine.h"
diff --git a/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp b/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp
index e90487065992..01ac9968181a 100644
--- a/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp
+++ b/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp
@@ -29,6 +29,39 @@ using namespace llvm;
#define DEBUG_TYPE "AMDGPUtti"
+static cl::opt<unsigned> UnrollThresholdPrivate(
+ "amdgpu-unroll-threshold-private",
+ cl::desc("Unroll threshold for AMDGPU if private memory used in a loop"),
+ cl::init(2500), cl::Hidden);
+
+static cl::opt<unsigned> UnrollThresholdLocal(
+ "amdgpu-unroll-threshold-local",
+ cl::desc("Unroll threshold for AMDGPU if local memory used in a loop"),
+ cl::init(1000), cl::Hidden);
+
+static cl::opt<unsigned> UnrollThresholdIf(
+ "amdgpu-unroll-threshold-if",
+ cl::desc("Unroll threshold increment for AMDGPU for each if statement inside loop"),
+ cl::init(150), cl::Hidden);
+
+static bool dependsOnLocalPhi(const Loop *L, const Value *Cond,
+ unsigned Depth = 0) {
+ const Instruction *I = dyn_cast<Instruction>(Cond);
+ if (!I)
+ return false;
+
+ for (const Value *V : I->operand_values()) {
+ if (!L->contains(I))
+ continue;
+ if (const PHINode *PHI = dyn_cast<PHINode>(V)) {
+ if (none_of(L->getSubLoops(), [PHI](const Loop* SubLoop) {
+ return SubLoop->contains(PHI); }))
+ return true;
+ } else if (Depth < 10 && dependsOnLocalPhi(L, V, Depth+1))
+ return true;
+ }
+ return false;
+}
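The shape of loop dependsOnLocalPhi is looking for is easiest to see in source form; an illustrative kernel fragment, not taken from the patch:

// Illustration only: a branch whose condition is computed from a PHI that
// lives in this very loop. Unrolling (even by 2) makes `phase` a constant
// in each iteration copy, so the branch and the PHI both fold away.
float f(const float *in, int n) {
  float acc = 0.f;
  int phase = 0;                 // becomes a loop PHI
  for (int i = 0; i < n; ++i) {
    if (phase)                   // condition depends on the PHI
      acc += in[i];
    else
      acc -= in[i];
    phase ^= 1;
  }
  return acc;
}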
void AMDGPUTTIImpl::getUnrollingPreferences(Loop *L,
TTI::UnrollingPreferences &UP) {
@@ -38,29 +71,115 @@ void AMDGPUTTIImpl::getUnrollingPreferences(Loop *L,
// TODO: Do we want runtime unrolling?
+ // Maximum alloca size that can fit in registers. Reserve 16 registers.
+ const unsigned MaxAlloca = (256 - 16) * 4;
+ unsigned ThresholdPrivate = UnrollThresholdPrivate;
+ unsigned ThresholdLocal = UnrollThresholdLocal;
+ unsigned MaxBoost = std::max(ThresholdPrivate, ThresholdLocal);
+ AMDGPUAS ASST = ST->getAMDGPUAS();
for (const BasicBlock *BB : L->getBlocks()) {
const DataLayout &DL = BB->getModule()->getDataLayout();
+ unsigned LocalGEPsSeen = 0;
+
+ if (any_of(L->getSubLoops(), [BB](const Loop* SubLoop) {
+ return SubLoop->contains(BB); }))
+ continue; // Block belongs to an inner loop.
+
for (const Instruction &I : *BB) {
+
+ // Unroll a loop which contains an "if" statement whose condition is
+ // defined by a PHI belonging to the loop. This may help to eliminate the
+ // if region and potentially even the PHI itself, saving on both divergence
+ // and registers used for the PHI.
+ // Add a small bonus for each such "if" statement.
+ if (const BranchInst *Br = dyn_cast<BranchInst>(&I)) {
+ if (UP.Threshold < MaxBoost && Br->isConditional()) {
+ if (L->isLoopExiting(Br->getSuccessor(0)) ||
+ L->isLoopExiting(Br->getSuccessor(1)))
+ continue;
+ if (dependsOnLocalPhi(L, Br->getCondition())) {
+ UP.Threshold += UnrollThresholdIf;
+ DEBUG(dbgs() << "Set unroll threshold " << UP.Threshold
+ << " for loop:\n" << *L << " due to " << *Br << '\n');
+ if (UP.Threshold >= MaxBoost)
+ return;
+ }
+ }
+ continue;
+ }
+
const GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(&I);
- if (!GEP || GEP->getAddressSpace() != AMDGPUAS::PRIVATE_ADDRESS)
+ if (!GEP)
+ continue;
+
+ unsigned AS = GEP->getAddressSpace();
+ unsigned Threshold = 0;
+ if (AS == ASST.PRIVATE_ADDRESS)
+ Threshold = ThresholdPrivate;
+ else if (AS == ASST.LOCAL_ADDRESS)
+ Threshold = ThresholdLocal;
+ else
+ continue;
+
+ if (UP.Threshold >= Threshold)
continue;
- const Value *Ptr = GEP->getPointerOperand();
- const AllocaInst *Alloca =
- dyn_cast<AllocaInst>(GetUnderlyingObject(Ptr, DL));
- if (Alloca) {
- // We want to do whatever we can to limit the number of alloca
- // instructions that make it through to the code generator. allocas
- // require us to use indirect addressing, which is slow and prone to
- // compiler bugs. If this loop does an address calculation on an
- // alloca ptr, then we want to use a higher than normal loop unroll
- // threshold. This will give SROA a better chance to eliminate these
- // allocas.
- //
- // Don't use the maximum allowed value here as it will make some
- // programs way too big.
- UP.Threshold = 800;
+ if (AS == ASST.PRIVATE_ADDRESS) {
+ const Value *Ptr = GEP->getPointerOperand();
+ const AllocaInst *Alloca =
+ dyn_cast<AllocaInst>(GetUnderlyingObject(Ptr, DL));
+ if (!Alloca || !Alloca->isStaticAlloca())
+ continue;
+ Type *Ty = Alloca->getAllocatedType();
+ unsigned AllocaSize = Ty->isSized() ? DL.getTypeAllocSize(Ty) : 0;
+ if (AllocaSize > MaxAlloca)
+ continue;
+ } else if (AS == ASST.LOCAL_ADDRESS) {
+ LocalGEPsSeen++;
+ // Inhibit unrolling for local memory if we have seen addressing of
+ // anything other than a variable (a global or an argument); most likely
+ // we will be unable to combine such accesses.
+ // Also, do not unroll inner loops nested too deeply for local memory,
+ // to leave a chance to unroll an outer loop for a more important reason.
+ if (LocalGEPsSeen > 1 || L->getLoopDepth() > 2 ||
+ (!isa<GlobalVariable>(GEP->getPointerOperand()) &&
+ !isa<Argument>(GEP->getPointerOperand())))
+ continue;
}
+
+ // Check if GEP depends on a value defined by this loop itself.
+ bool HasLoopDef = false;
+ for (const Value *Op : GEP->operands()) {
+ const Instruction *Inst = dyn_cast<Instruction>(Op);
+ if (!Inst || L->isLoopInvariant(Op))
+ continue;
+
+ if (any_of(L->getSubLoops(), [Inst](const Loop* SubLoop) {
+ return SubLoop->contains(Inst); }))
+ continue;
+ HasLoopDef = true;
+ break;
+ }
+ if (!HasLoopDef)
+ continue;
+
+ // We want to do whatever we can to limit the number of alloca
+ // instructions that make it through to the code generator. allocas
+ // require us to use indirect addressing, which is slow and prone to
+ // compiler bugs. If this loop does an address calculation on an
+ // alloca ptr, then we want to use a higher than normal loop unroll
+ // threshold. This will give SROA a better chance to eliminate these
+ // allocas.
+ //
+ // We also want to have more unrolling for local memory to let ds
+ // instructions with different offsets combine.
+ //
+ // Don't use the maximum allowed value here as it will make some
+ // programs way too big.
+ UP.Threshold = Threshold;
+ DEBUG(dbgs() << "Set unroll threshold " << Threshold << " for loop:\n"
+ << *L << " due to " << *GEP << '\n');
+ if (UP.Threshold >= MaxBoost)
+ return;
}
}
}
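The private-memory threshold is aimed at loops that index into a small alloca; an illustrative source pattern, not taken from the patch:

// Illustration only: after full unrolling every index into tmp is a
// constant, so SROA can replace the private-memory alloca with registers.
void kernel(float *out, const float *in) {
  float tmp[16];                     // lowered to a private (scratch) alloca
  for (int i = 0; i < 16; ++i)       // loop-defined GEP index into the alloca
    tmp[i] = in[i] * 2.0f;
  for (int i = 0; i < 16; ++i)
    out[i] += tmp[i];
}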
@@ -81,28 +200,56 @@ unsigned AMDGPUTTIImpl::getRegisterBitWidth(bool Vector) {
}
unsigned AMDGPUTTIImpl::getLoadStoreVecRegBitWidth(unsigned AddrSpace) const {
- switch (AddrSpace) {
- case AMDGPUAS::GLOBAL_ADDRESS:
- case AMDGPUAS::CONSTANT_ADDRESS:
- case AMDGPUAS::FLAT_ADDRESS:
+ AMDGPUAS AS = ST->getAMDGPUAS();
+ if (AddrSpace == AS.GLOBAL_ADDRESS ||
+ AddrSpace == AS.CONSTANT_ADDRESS ||
+ AddrSpace == AS.FLAT_ADDRESS)
return 128;
- case AMDGPUAS::LOCAL_ADDRESS:
- case AMDGPUAS::REGION_ADDRESS:
+ if (AddrSpace == AS.LOCAL_ADDRESS ||
+ AddrSpace == AS.REGION_ADDRESS)
return 64;
- case AMDGPUAS::PRIVATE_ADDRESS:
+ if (AddrSpace == AS.PRIVATE_ADDRESS)
return 8 * ST->getMaxPrivateElementSize();
- default:
- if (ST->getGeneration() <= AMDGPUSubtarget::NORTHERN_ISLANDS &&
- (AddrSpace == AMDGPUAS::PARAM_D_ADDRESS ||
- AddrSpace == AMDGPUAS::PARAM_I_ADDRESS ||
- (AddrSpace >= AMDGPUAS::CONSTANT_BUFFER_0 &&
- AddrSpace <= AMDGPUAS::CONSTANT_BUFFER_15)))
- return 128;
- llvm_unreachable("unhandled address space");
+
+ if (ST->getGeneration() <= AMDGPUSubtarget::NORTHERN_ISLANDS &&
+ (AddrSpace == AS.PARAM_D_ADDRESS ||
+ AddrSpace == AS.PARAM_I_ADDRESS ||
+ (AddrSpace >= AS.CONSTANT_BUFFER_0 &&
+ AddrSpace <= AS.CONSTANT_BUFFER_15)))
+ return 128;
+ llvm_unreachable("unhandled address space");
+}
+
+bool AMDGPUTTIImpl::isLegalToVectorizeMemChain(unsigned ChainSizeInBytes,
+ unsigned Alignment,
+ unsigned AddrSpace) const {
+ // We allow vectorization of flat stores, even though we may need to decompose
+ // them later if they may access private memory. We don't have enough context
+ // here, and legalization can handle it.
+ if (AddrSpace == ST->getAMDGPUAS().PRIVATE_ADDRESS) {
+ return (Alignment >= 4 || ST->hasUnalignedScratchAccess()) &&
+ ChainSizeInBytes <= ST->getMaxPrivateElementSize();
}
+ return true;
+}
+
+bool AMDGPUTTIImpl::isLegalToVectorizeLoadChain(unsigned ChainSizeInBytes,
+ unsigned Alignment,
+ unsigned AddrSpace) const {
+ return isLegalToVectorizeMemChain(ChainSizeInBytes, Alignment, AddrSpace);
+}
+
+bool AMDGPUTTIImpl::isLegalToVectorizeStoreChain(unsigned ChainSizeInBytes,
+ unsigned Alignment,
+ unsigned AddrSpace) const {
+ return isLegalToVectorizeMemChain(ChainSizeInBytes, Alignment, AddrSpace);
}
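What the private-address-space rule amounts to in numbers, assuming a subtarget whose getMaxPrivateElementSize() is 4 and which lacks unaligned scratch access; a sketch, not a test from the patch:

#include <cassert>

// Sketch only: chains wider than the max private element size, or
// underaligned ones, are rejected for private (scratch) memory.
void checkPrivateVectorizeRules(const AMDGPUTTIImpl &TTI, unsigned PrivateAS) {
  assert(TTI.isLegalToVectorizeLoadChain(4, 4, PrivateAS));    // fits, aligned
  assert(!TTI.isLegalToVectorizeLoadChain(16, 4, PrivateAS));  // chain too wide
  assert(!TTI.isLegalToVectorizeLoadChain(4, 2, PrivateAS));   // underaligned
}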
unsigned AMDGPUTTIImpl::getMaxInterleaveFactor(unsigned VF) {
+ // Disable unrolling if the loop is not vectorized.
+ if (VF == 1)
+ return 1;
+
// Semi-arbitrary large amount.
return 64;
}
@@ -228,16 +375,8 @@ int AMDGPUTTIImpl::getVectorInstrCost(unsigned Opcode, Type *ValTy,
}
}
-static bool isIntrinsicSourceOfDivergence(const TargetIntrinsicInfo *TII,
- const IntrinsicInst *I) {
+static bool isIntrinsicSourceOfDivergence(const IntrinsicInst *I) {
switch (I->getIntrinsicID()) {
- default:
- return false;
- case Intrinsic::not_intrinsic:
- // This means we have an intrinsic that isn't defined in
- // IntrinsicsAMDGPU.td
- break;
-
case Intrinsic::amdgcn_workitem_id_x:
case Intrinsic::amdgcn_workitem_id_y:
case Intrinsic::amdgcn_workitem_id_z:
@@ -249,6 +388,8 @@ static bool isIntrinsicSourceOfDivergence(const TargetIntrinsicInfo *TII,
case Intrinsic::r600_read_tidig_x:
case Intrinsic::r600_read_tidig_y:
case Intrinsic::r600_read_tidig_z:
+ case Intrinsic::amdgcn_atomic_inc:
+ case Intrinsic::amdgcn_atomic_dec:
case Intrinsic::amdgcn_image_atomic_swap:
case Intrinsic::amdgcn_image_atomic_add:
case Intrinsic::amdgcn_image_atomic_sub:
@@ -274,16 +415,10 @@ static bool isIntrinsicSourceOfDivergence(const TargetIntrinsicInfo *TII,
case Intrinsic::amdgcn_buffer_atomic_xor:
case Intrinsic::amdgcn_buffer_atomic_cmpswap:
case Intrinsic::amdgcn_ps_live:
+ case Intrinsic::amdgcn_ds_swizzle:
return true;
- }
-
- StringRef Name = I->getCalledFunction()->getName();
- switch (TII->lookupName((const char *)Name.bytes_begin(), Name.size())) {
default:
return false;
- case AMDGPUIntrinsic::SI_fs_interp:
- case AMDGPUIntrinsic::SI_fs_constant:
- return true;
}
}
@@ -295,8 +430,8 @@ static bool isArgPassedInSGPR(const Argument *A) {
return true;
// For non-compute shaders, SGPR inputs are marked with either inreg or byval.
- if (F->getAttributes().hasAttribute(A->getArgNo() + 1, Attribute::InReg) ||
- F->getAttributes().hasAttribute(A->getArgNo() + 1, Attribute::ByVal))
+ if (F->getAttributes().hasParamAttribute(A->getArgNo(), Attribute::InReg) ||
+ F->getAttributes().hasParamAttribute(A->getArgNo(), Attribute::ByVal))
return true;
// Everything else is in VGPRs.
@@ -318,7 +453,7 @@ bool AMDGPUTTIImpl::isSourceOfDivergence(const Value *V) const {
// All other loads are not divergent, because if threads issue loads with the
// same arguments, they will always get the same result.
if (const LoadInst *Load = dyn_cast<LoadInst>(V))
- return Load->getPointerAddressSpace() == AMDGPUAS::PRIVATE_ADDRESS;
+ return Load->getPointerAddressSpace() == ST->getAMDGPUAS().PRIVATE_ADDRESS;
// Atomics are divergent because they are executed sequentially: when an
// atomic operation refers to the same address in each thread, then each
@@ -327,10 +462,8 @@ bool AMDGPUTTIImpl::isSourceOfDivergence(const Value *V) const {
if (isa<AtomicRMWInst>(V) || isa<AtomicCmpXchgInst>(V))
return true;
- if (const IntrinsicInst *Intrinsic = dyn_cast<IntrinsicInst>(V)) {
- const TargetMachine &TM = getTLI()->getTargetMachine();
- return isIntrinsicSourceOfDivergence(TM.getIntrinsicInfo(), Intrinsic);
- }
+ if (const IntrinsicInst *Intrinsic = dyn_cast<IntrinsicInst>(V))
+ return isIntrinsicSourceOfDivergence(Intrinsic);
// Assume all function calls are a source of divergence.
if (isa<CallInst>(V) || isa<InvokeInst>(V))
diff --git a/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.h b/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.h
index 0d83b2a585bf..71d6306bc1a5 100644
--- a/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.h
+++ b/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.h
@@ -32,6 +32,7 @@ class AMDGPUTTIImpl final : public BasicTTIImplBase<AMDGPUTTIImpl> {
const AMDGPUSubtarget *ST;
const AMDGPUTargetLowering *TLI;
+ bool IsGraphicsShader;
const AMDGPUSubtarget *getST() const { return ST; }
const AMDGPUTargetLowering *getTLI() const { return TLI; }
@@ -62,7 +63,8 @@ public:
explicit AMDGPUTTIImpl(const AMDGPUTargetMachine *TM, const Function &F)
: BaseT(TM, F.getParent()->getDataLayout()),
ST(TM->getSubtargetImpl(F)),
- TLI(ST->getTargetLowering()) {}
+ TLI(ST->getTargetLowering()),
+ IsGraphicsShader(AMDGPU::isShader(F.getCallingConv())) {}
bool hasBranchDivergence() { return true; }
@@ -76,6 +78,17 @@ public:
unsigned getNumberOfRegisters(bool Vector);
unsigned getRegisterBitWidth(bool Vector);
unsigned getLoadStoreVecRegBitWidth(unsigned AddrSpace) const;
+
+ bool isLegalToVectorizeMemChain(unsigned ChainSizeInBytes,
+ unsigned Alignment,
+ unsigned AddrSpace) const;
+ bool isLegalToVectorizeLoadChain(unsigned ChainSizeInBytes,
+ unsigned Alignment,
+ unsigned AddrSpace) const;
+ bool isLegalToVectorizeStoreChain(unsigned ChainSizeInBytes,
+ unsigned Alignment,
+ unsigned AddrSpace) const;
+
unsigned getMaxInterleaveFactor(unsigned VF);
int getArithmeticInstrCost(
@@ -91,6 +104,15 @@ public:
int getVectorInstrCost(unsigned Opcode, Type *ValTy, unsigned Index);
bool isSourceOfDivergence(const Value *V) const;
+ unsigned getFlatAddressSpace() const {
+ // Don't bother running InferAddressSpaces pass on graphics shaders which
+ // don't use flat addressing.
+ if (IsGraphicsShader)
+ return -1;
+ return ST->hasFlatAddressSpace() ?
+ ST->getAMDGPUAS().FLAT_ADDRESS : ST->getAMDGPUAS().UNKNOWN_ADDRESS_SPACE;
+ }
+
unsigned getVectorSplitCost() { return 0; }
};
diff --git a/lib/Target/AMDGPU/AMDGPUUnifyDivergentExitNodes.cpp b/lib/Target/AMDGPU/AMDGPUUnifyDivergentExitNodes.cpp
new file mode 100644
index 000000000000..309913f87fb6
--- /dev/null
+++ b/lib/Target/AMDGPU/AMDGPUUnifyDivergentExitNodes.cpp
@@ -0,0 +1,225 @@
+//===- AMDGPUUnifyDivergentExitNodes.cpp ----------------------------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This is a variant of the UnifyDivergentExitNodes pass. Rather than ensuring
+// there is at most one ret and one unreachable instruction, it ensures there is
+// at most one divergent exiting block.
+//
+// StructurizeCFG can't deal with multi-exit regions formed by branches to
+// multiple return nodes. It is not desirable to structurize regions with
+// uniform branches, so unifying those to the same return block as divergent
+// branches inhibits use of scalar branching. It still can't deal with the case
+// where one branch goes to a return and another to an unreachable, so
+// unreachable is replaced with a return in this case.
+//
+//===----------------------------------------------------------------------===//
+
+#include "AMDGPU.h"
+#include "llvm/ADT/DepthFirstIterator.h"
+#include "llvm/ADT/StringExtras.h"
+#include "llvm/Analysis/DivergenceAnalysis.h"
+#include "llvm/Analysis/PostDominators.h"
+#include "llvm/Analysis/TargetTransformInfo.h"
+#include "llvm/IR/BasicBlock.h"
+#include "llvm/IR/CFG.h"
+#include "llvm/IR/Function.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/IR/Type.h"
+#include "llvm/Transforms/Scalar.h"
+#include "llvm/Transforms/Utils/Local.h"
+using namespace llvm;
+
+#define DEBUG_TYPE "amdgpu-unify-divergent-exit-nodes"
+
+namespace {
+
+class AMDGPUUnifyDivergentExitNodes : public FunctionPass {
+public:
+ static char ID; // Pass identification, replacement for typeid
+ AMDGPUUnifyDivergentExitNodes() : FunctionPass(ID) {
+ initializeAMDGPUUnifyDivergentExitNodesPass(*PassRegistry::getPassRegistry());
+ }
+
+ // We can preserve non-critical-edgeness when we unify function exit nodes
+ void getAnalysisUsage(AnalysisUsage &AU) const override;
+ bool runOnFunction(Function &F) override;
+};
+
+}
+
+char AMDGPUUnifyDivergentExitNodes::ID = 0;
+INITIALIZE_PASS_BEGIN(AMDGPUUnifyDivergentExitNodes, DEBUG_TYPE,
+ "Unify divergent function exit nodes", false, false)
+INITIALIZE_PASS_DEPENDENCY(PostDominatorTreeWrapperPass)
+INITIALIZE_PASS_DEPENDENCY(DivergenceAnalysis)
+INITIALIZE_PASS_END(AMDGPUUnifyDivergentExitNodes, DEBUG_TYPE,
+ "Unify divergent function exit nodes", false, false)
+
+char &llvm::AMDGPUUnifyDivergentExitNodesID = AMDGPUUnifyDivergentExitNodes::ID;
+
+void AMDGPUUnifyDivergentExitNodes::getAnalysisUsage(AnalysisUsage &AU) const{
+ // TODO: Preserve dominator tree.
+ AU.addRequired<PostDominatorTreeWrapperPass>();
+
+ AU.addRequired<DivergenceAnalysis>();
+
+ // No divergent values are changed, only blocks and branch edges.
+ AU.addPreserved<DivergenceAnalysis>();
+
+ // We preserve the non-critical-edgeness property
+ AU.addPreservedID(BreakCriticalEdgesID);
+
+ // This is a cluster of orthogonal Transforms
+ AU.addPreservedID(LowerSwitchID);
+ FunctionPass::getAnalysisUsage(AU);
+
+ AU.addRequired<TargetTransformInfoWrapperPass>();
+}
+
+/// \returns true if \p BB is reachable through only uniform branches.
+/// XXX - Is there a more efficient way to find this?
+static bool isUniformlyReached(const DivergenceAnalysis &DA,
+ BasicBlock &BB) {
+ SmallVector<BasicBlock *, 8> Stack;
+ SmallPtrSet<BasicBlock *, 8> Visited;
+
+ for (BasicBlock *Pred : predecessors(&BB))
+ Stack.push_back(Pred);
+
+ while (!Stack.empty()) {
+ BasicBlock *Top = Stack.pop_back_val();
+ if (!DA.isUniform(Top->getTerminator()))
+ return false;
+
+ for (BasicBlock *Pred : predecessors(Top)) {
+ if (Visited.insert(Pred).second)
+ Stack.push_back(Pred);
+ }
+ }
+
+ return true;
+}
+
+static BasicBlock *unifyReturnBlockSet(Function &F,
+ ArrayRef<BasicBlock *> ReturningBlocks,
+ const TargetTransformInfo &TTI,
+ StringRef Name) {
+ // We need to insert a new basic block into the function, add PHI nodes
+ // (if the function returns values), and convert all of the return
+ // instructions into unconditional branches.
+ //
+ BasicBlock *NewRetBlock = BasicBlock::Create(F.getContext(), Name, &F);
+
+ PHINode *PN = nullptr;
+ if (F.getReturnType()->isVoidTy()) {
+ ReturnInst::Create(F.getContext(), nullptr, NewRetBlock);
+ } else {
+ // If the function doesn't return void... add a PHI node to the block...
+ PN = PHINode::Create(F.getReturnType(), ReturningBlocks.size(),
+ "UnifiedRetVal");
+ NewRetBlock->getInstList().push_back(PN);
+ ReturnInst::Create(F.getContext(), PN, NewRetBlock);
+ }
+
+ // Loop over all of the blocks, replacing the return instruction with an
+ // unconditional branch.
+ //
+ for (BasicBlock *BB : ReturningBlocks) {
+ // Add an incoming element to the PHI node for every return instruction that
+ // is merging into this new block...
+ if (PN)
+ PN->addIncoming(BB->getTerminator()->getOperand(0), BB);
+
+ BB->getInstList().pop_back(); // Remove the return insn
+ BranchInst::Create(NewRetBlock, BB);
+ }
+
+ for (BasicBlock *BB : ReturningBlocks) {
+ // Cleanup possible branch to unconditional branch to the return.
+ SimplifyCFG(BB, TTI, 2);
+ }
+
+ return NewRetBlock;
+}
+
+bool AMDGPUUnifyDivergentExitNodes::runOnFunction(Function &F) {
+ auto &PDT = getAnalysis<PostDominatorTreeWrapperPass>().getPostDomTree();
+ if (PDT.getRoots().size() <= 1)
+ return false;
+
+ DivergenceAnalysis &DA = getAnalysis<DivergenceAnalysis>();
+
+ // Loop over all of the blocks in a function, tracking all of the blocks that
+ // return.
+ //
+ SmallVector<BasicBlock *, 4> ReturningBlocks;
+ SmallVector<BasicBlock *, 4> UnreachableBlocks;
+
+ for (BasicBlock *BB : PDT.getRoots()) {
+ if (isa<ReturnInst>(BB->getTerminator())) {
+ if (!isUniformlyReached(DA, *BB))
+ ReturningBlocks.push_back(BB);
+ } else if (isa<UnreachableInst>(BB->getTerminator())) {
+ if (!isUniformlyReached(DA, *BB))
+ UnreachableBlocks.push_back(BB);
+ }
+ }
+
+ if (!UnreachableBlocks.empty()) {
+ BasicBlock *UnreachableBlock = nullptr;
+
+ if (UnreachableBlocks.size() == 1) {
+ UnreachableBlock = UnreachableBlocks.front();
+ } else {
+ UnreachableBlock = BasicBlock::Create(F.getContext(),
+ "UnifiedUnreachableBlock", &F);
+ new UnreachableInst(F.getContext(), UnreachableBlock);
+
+ for (BasicBlock *BB : UnreachableBlocks) {
+ BB->getInstList().pop_back(); // Remove the unreachable inst.
+ BranchInst::Create(UnreachableBlock, BB);
+ }
+ }
+
+ if (!ReturningBlocks.empty()) {
+ // Don't create a new unreachable inst if we have a return. The
+ // structurizer/annotator can't handle the multiple exits.
+
+ Type *RetTy = F.getReturnType();
+ Value *RetVal = RetTy->isVoidTy() ? nullptr : UndefValue::get(RetTy);
+ UnreachableBlock->getInstList().pop_back(); // Remove the unreachable inst.
+
+ Function *UnreachableIntrin =
+ Intrinsic::getDeclaration(F.getParent(), Intrinsic::amdgcn_unreachable);
+
+ // Insert a call to an intrinsic tracking that this is an unreachable
+ // point, in case we want to kill the active lanes or something later.
+ CallInst::Create(UnreachableIntrin, {}, "", UnreachableBlock);
+
+ // Don't create a scalar trap. We would only want to trap if this code was
+ // really reached, but a scalar trap would happen even if no lanes
+ // actually reached here.
+ ReturnInst::Create(F.getContext(), RetVal, UnreachableBlock);
+ ReturningBlocks.push_back(UnreachableBlock);
+ }
+ }
+
+ // Now handle return blocks.
+ if (ReturningBlocks.empty())
+ return false; // No blocks return
+
+ if (ReturningBlocks.size() == 1)
+ return false; // Already has a single return block
+
+ const TargetTransformInfo &TTI
+ = getAnalysis<TargetTransformInfoWrapperPass>().getTTI(F);
+
+ unifyReturnBlockSet(F, ReturningBlocks, TTI, "UnifiedReturnBlock");
+ return true;
+}
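A minimal sketch of driving the new pass standalone over a module. The creator function createAMDGPUUnifyDivergentExitNodesPass is hypothetical here (the patch only exposes the pass ID), as is the input file name:

#include "llvm/IR/LegacyPassManager.h"
#include "llvm/IRReader/IRReader.h"
#include "llvm/Support/SourceMgr.h"

// Sketch only: parse a module and run the unify-divergent-exit-nodes pass.
// Error handling elided; the pass creator below is hypothetical.
void runUnify(llvm::LLVMContext &Ctx) {
  llvm::SMDiagnostic Err;
  std::unique_ptr<llvm::Module> M = llvm::parseIRFile("kernel.ll", Err, Ctx);
  llvm::legacy::PassManager PM;
  PM.add(llvm::createAMDGPUUnifyDivergentExitNodesPass()); // hypothetical creator
  PM.run(*M);
}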
diff --git a/lib/Target/AMDGPU/AMDGPUUnifyMetadata.cpp b/lib/Target/AMDGPU/AMDGPUUnifyMetadata.cpp
index bf501a1e8405..3a0c3ede08f4 100644
--- a/lib/Target/AMDGPU/AMDGPUUnifyMetadata.cpp
+++ b/lib/Target/AMDGPU/AMDGPUUnifyMetadata.cpp
@@ -13,38 +13,39 @@
//===----------------------------------------------------------------------===//
#include "AMDGPU.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/ADT/StringRef.h"
#include "llvm/IR/Constants.h"
+#include "llvm/IR/Function.h"
#include "llvm/IR/Module.h"
#include "llvm/Pass.h"
+#include <algorithm>
+#include <cassert>
using namespace llvm;
namespace {
+
namespace kOCLMD {
+
const char SpirVer[] = "opencl.spir.version";
const char OCLVer[] = "opencl.ocl.version";
const char UsedExt[] = "opencl.used.extensions";
const char UsedOptCoreFeat[] = "opencl.used.optional.core.features";
const char CompilerOptions[] = "opencl.compiler.options";
const char LLVMIdent[] = "llvm.ident";
- }
+
+ } // end namespace kOCLMD
/// \brief Unify multiple OpenCL metadata due to linking.
- class AMDGPUUnifyMetadata : public FunctionPass {
+ class AMDGPUUnifyMetadata : public ModulePass {
public:
static char ID;
- explicit AMDGPUUnifyMetadata() : FunctionPass(ID) {};
+ explicit AMDGPUUnifyMetadata() : ModulePass(ID) {}
private:
- // This should really be a module pass but we have to run it as early
- // as possible, so given function passes are executed first and
- // TargetMachine::addEarlyAsPossiblePasses() expects only function passes
- // it has to be a function pass.
virtual bool runOnModule(Module &M);
- // \todo: Convert to a module pass.
- virtual bool runOnFunction(Function &F);
-
/// \brief Unify version metadata.
/// \return true if changes are made.
/// Assume the named metadata has operands each of which is a pair of
@@ -117,7 +118,7 @@ INITIALIZE_PASS(AMDGPUUnifyMetadata, "amdgpu-unify-metadata",
"Unify multiple OpenCL metadata due to linking",
false, false)
-FunctionPass* llvm::createAMDGPUUnifyMetadataPass() {
+ModulePass* llvm::createAMDGPUUnifyMetadataPass() {
return new AMDGPUUnifyMetadata();
}
@@ -143,7 +144,3 @@ bool AMDGPUUnifyMetadata::runOnModule(Module &M) {
return Changed;
}
-
-bool AMDGPUUnifyMetadata::runOnFunction(Function &F) {
- return runOnModule(*F.getParent());
-}
diff --git a/lib/Target/AMDGPU/AMDILCFGStructurizer.cpp b/lib/Target/AMDGPU/AMDILCFGStructurizer.cpp
index 7faeccdc5df3..1a393845a822 100644
--- a/lib/Target/AMDGPU/AMDILCFGStructurizer.cpp
+++ b/lib/Target/AMDGPU/AMDILCFGStructurizer.cpp
@@ -9,27 +9,40 @@
//==-----------------------------------------------------------------------===//
#include "AMDGPU.h"
-#include "AMDGPUInstrInfo.h"
#include "AMDGPUSubtarget.h"
#include "R600InstrInfo.h"
+#include "R600RegisterInfo.h"
#include "llvm/ADT/DepthFirstIterator.h"
#include "llvm/ADT/SCCIterator.h"
+#include "llvm/ADT/SmallPtrSet.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/ADT/Statistic.h"
+#include "llvm/ADT/StringRef.h"
+#include "llvm/CodeGen/MachineBasicBlock.h"
#include "llvm/CodeGen/MachineDominators.h"
#include "llvm/CodeGen/MachineFunction.h"
#include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/CodeGen/MachineInstr.h"
#include "llvm/CodeGen/MachineInstrBuilder.h"
#include "llvm/CodeGen/MachineJumpTableInfo.h"
#include "llvm/CodeGen/MachineLoopInfo.h"
+#include "llvm/CodeGen/MachineOperand.h"
#include "llvm/CodeGen/MachinePostDominators.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
-#include "llvm/IR/Dominators.h"
+#include "llvm/CodeGen/MachineValueType.h"
+#include "llvm/IR/DebugLoc.h"
+#include "llvm/IR/LLVMContext.h"
+#include "llvm/Pass.h"
#include "llvm/Support/Debug.h"
+#include "llvm/Support/ErrorHandling.h"
#include "llvm/Support/raw_ostream.h"
-#include "llvm/Target/TargetInstrInfo.h"
-#include "llvm/Target/TargetMachine.h"
+#include <cassert>
+#include <cstddef>
#include <deque>
+#include <iterator>
+#include <map>
+#include <utility>
+#include <vector>
using namespace llvm;
@@ -53,15 +66,19 @@ STATISTIC(numClonedBlock, "CFGStructurizer cloned blocks");
STATISTIC(numClonedInstr, "CFGStructurizer cloned instructions");
namespace llvm {
+
void initializeAMDGPUCFGStructurizerPass(PassRegistry&);
-}
+
+} // end namespace llvm
+
+namespace {
//===----------------------------------------------------------------------===//
//
// Miscellaneous utility for CFGStructurizer.
//
//===----------------------------------------------------------------------===//
-namespace {
+
#define SHOWNEWINSTR(i) \
DEBUG(dbgs() << "New instr: " << *i << "\n");
@@ -82,35 +99,19 @@ DEBUG( \
#define INVALIDSCCNUM -1
-template<class NodeT>
-void ReverseVector(SmallVectorImpl<NodeT *> &Src) {
- size_t sz = Src.size();
- for (size_t i = 0; i < sz/2; ++i) {
- NodeT *t = Src[i];
- Src[i] = Src[sz - i - 1];
- Src[sz - i - 1] = t;
- }
-}
-
-} // end anonymous namespace
-
//===----------------------------------------------------------------------===//
//
// supporting data structure for CFGStructurizer
//
//===----------------------------------------------------------------------===//
-
-namespace {
-
class BlockInformation {
public:
- bool IsRetired;
- int SccNum;
- BlockInformation() : IsRetired(false), SccNum(INVALIDSCCNUM) {}
-};
+ bool IsRetired = false;
+ int SccNum = INVALIDSCCNUM;
-} // end anonymous namespace
+ BlockInformation() = default;
+};
//===----------------------------------------------------------------------===//
//
@@ -118,7 +119,6 @@ public:
//
//===----------------------------------------------------------------------===//
-namespace {
class AMDGPUCFGStructurizer : public MachineFunctionPass {
public:
typedef SmallVector<MachineBasicBlock *, 32> MBBVector;
@@ -133,8 +133,7 @@ public:
static char ID;
- AMDGPUCFGStructurizer() :
- MachineFunctionPass(ID), TII(nullptr), TRI(nullptr) {
+ AMDGPUCFGStructurizer() : MachineFunctionPass(ID) {
initializeAMDGPUCFGStructurizerPass(*PassRegistry::getPassRegistry());
}
@@ -167,7 +166,7 @@ public:
MLI = &getAnalysis<MachineLoopInfo>();
DEBUG(dbgs() << "LoopInfo:\n"; PrintLoopinfo(*MLI););
MDT = &getAnalysis<MachineDominatorTree>();
- DEBUG(MDT->print(dbgs(), (const llvm::Module*)nullptr););
+ DEBUG(MDT->print(dbgs(), (const Module*)nullptr););
PDT = &getAnalysis<MachinePostDominatorTree>();
DEBUG(PDT->print(dbgs()););
prepare();
@@ -180,8 +179,8 @@ protected:
MachineDominatorTree *MDT;
MachinePostDominatorTree *PDT;
MachineLoopInfo *MLI;
- const R600InstrInfo *TII;
- const R600RegisterInfo *TRI;
+ const R600InstrInfo *TII = nullptr;
+ const R600RegisterInfo *TRI = nullptr;
// PRINT FUNCTIONS
/// Print the ordered Blocks.
@@ -198,6 +197,7 @@ protected:
}
}
}
+
static void PrintLoopinfo(const MachineLoopInfo &LoopInfo) {
for (MachineLoop::iterator iter = LoopInfo.begin(),
iterEnd = LoopInfo.end(); iter != iterEnd; ++iter) {
@@ -263,7 +263,6 @@ protected:
MachineBasicBlock *OldMBB, MachineBasicBlock *NewBlk);
static void wrapup(MachineBasicBlock *MBB);
-
int patternMatch(MachineBasicBlock *MBB);
int patternMatchGroup(MachineBasicBlock *MBB);
int serialPatternMatch(MachineBasicBlock *MBB);
@@ -328,7 +327,6 @@ protected:
void recordSccnum(MachineBasicBlock *MBB, int SCCNum);
void retireBlock(MachineBasicBlock *MBB);
-
private:
MBBInfoMap BlockInfoMap;
LoopLandInfoMap LLInfoMap;
@@ -337,6 +335,10 @@ private:
SmallVector<MachineBasicBlock *, DEFAULT_VEC_SLOTS> OrderedBlks;
};
+char AMDGPUCFGStructurizer::ID = 0;
+
+} // end anonymous namespace
+
int AMDGPUCFGStructurizer::getSCCNum(MachineBasicBlock *MBB) const {
MBBInfoMap::const_iterator It = BlockInfoMap.find(MBB);
if (It == BlockInfoMap.end())
@@ -379,6 +381,7 @@ bool AMDGPUCFGStructurizer::isActiveLoophead(MachineBasicBlock *MBB) const {
}
return false;
}
+
AMDGPUCFGStructurizer::PathToKind AMDGPUCFGStructurizer::singlePathTo(
MachineBasicBlock *SrcMBB, MachineBasicBlock *DstMBB,
bool AllowSideEntry) const {
@@ -697,10 +700,8 @@ void AMDGPUCFGStructurizer::wrapup(MachineBasicBlock *MBB) {
// (jumpTableInfo->isEmpty() == false) { need to clean the jump table, but
// there isn't such an interface yet. alternatively, replace all the other
// blocks in the jump table with the entryBlk //}
-
}
-
bool AMDGPUCFGStructurizer::prepare() {
bool Changed = false;
@@ -748,7 +749,6 @@ bool AMDGPUCFGStructurizer::prepare() {
}
bool AMDGPUCFGStructurizer::run() {
-
//Assume reducible CFG...
DEBUG(dbgs() << "AMDGPUCFGStructurizer::run\n");
@@ -886,8 +886,6 @@ bool AMDGPUCFGStructurizer::run() {
return true;
}
-
-
void AMDGPUCFGStructurizer::orderBlocks(MachineFunction *MF) {
int SccNum = 0;
MachineBasicBlock *MBB;
@@ -903,11 +901,8 @@ void AMDGPUCFGStructurizer::orderBlocks(MachineFunction *MF) {
}
}
- //walk through all the block in func to check for unreachable
- typedef GraphTraits<MachineFunction *> GTM;
- auto It = GTM::nodes_begin(MF), E = GTM::nodes_end(MF);
- for (; It != E; ++It) {
- MachineBasicBlock *MBB = *It;
+ // walk through all the block in func to check for unreachable
+ for (auto *MBB : nodes(MF)) {
SccNum = getSCCNum(MBB);
if (SccNum == INVALIDSCCNUM)
dbgs() << "unreachable block BB" << MBB->getNumber() << "\n";
@@ -941,7 +936,6 @@ int AMDGPUCFGStructurizer::patternMatchGroup(MachineBasicBlock *MBB) {
return NumMatch;
}
-
int AMDGPUCFGStructurizer::serialPatternMatch(MachineBasicBlock *MBB) {
if (MBB->succ_size() != 1)
return 0;
@@ -1039,7 +1033,7 @@ int AMDGPUCFGStructurizer::loopendPatternMatch() {
for (MachineLoop *ML : depth_first(It))
NestedLoops.push_front(ML);
- if (NestedLoops.size() == 0)
+ if (NestedLoops.empty())
return 0;
// Process nested loop outside->inside (we did push_front),
@@ -1074,13 +1068,9 @@ int AMDGPUCFGStructurizer::mergeLoop(MachineLoop *LoopRep) {
MachineBasicBlock *ExitBlk = *ExitBlks.begin();
assert(ExitBlk && "Loop has several exit block");
MBBVector LatchBlks;
- typedef GraphTraits<Inverse<MachineBasicBlock*> > InvMBBTraits;
- InvMBBTraits::ChildIteratorType PI = InvMBBTraits::child_begin(LoopHeader),
- PE = InvMBBTraits::child_end(LoopHeader);
- for (; PI != PE; PI++) {
- if (LoopRep->contains(*PI))
- LatchBlks.push_back(*PI);
- }
+ for (auto *LB : inverse_children<MachineBasicBlock*>(LoopHeader))
+ if (LoopRep->contains(LB))
+ LatchBlks.push_back(LB);
for (unsigned i = 0, e = ExitingMBBs.size(); i < e; ++i)
mergeLoopbreakBlock(ExitingMBBs[i], ExitBlk);
@@ -1217,7 +1207,7 @@ void AMDGPUCFGStructurizer::showImproveSimpleJumpintoIf(
}
}
- dbgs() << "\n";
+ dbgs() << "\n";
}
int AMDGPUCFGStructurizer::improveSimpleJumpintoIf(MachineBasicBlock *HeadMBB,
@@ -1478,7 +1468,6 @@ void AMDGPUCFGStructurizer::mergeIfthenelseBlock(MachineInstr *BranchMI,
if (LandMBB && TrueMBB && FalseMBB)
MBB->addSuccessor(LandMBB);
-
}
void AMDGPUCFGStructurizer::mergeLooplandBlock(MachineBasicBlock *DstBlk,
@@ -1491,7 +1480,6 @@ void AMDGPUCFGStructurizer::mergeLooplandBlock(MachineBasicBlock *DstBlk,
DstBlk->replaceSuccessor(DstBlk, LandMBB);
}
-
void AMDGPUCFGStructurizer::mergeLoopbreakBlock(MachineBasicBlock *ExitingMBB,
MachineBasicBlock *LandMBB) {
DEBUG(dbgs() << "loopbreakPattern exiting = BB" << ExitingMBB->getNumber()
@@ -1727,11 +1715,6 @@ void AMDGPUCFGStructurizer::retireBlock(MachineBasicBlock *MBB) {
&& "can't retire block yet");
}
-char AMDGPUCFGStructurizer::ID = 0;
-
-} // end anonymous namespace
-
-
INITIALIZE_PASS_BEGIN(AMDGPUCFGStructurizer, "amdgpustructurizer",
"AMDGPU CFG Structurizer", false, false)
INITIALIZE_PASS_DEPENDENCY(MachineDominatorTree)
diff --git a/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp b/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp
index 3cf9a1d92469..961f7186f373 100644
--- a/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp
+++ b/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp
@@ -16,6 +16,7 @@
#include "Utils/AMDGPUAsmUtils.h"
#include "llvm/ADT/APFloat.h"
#include "llvm/ADT/APInt.h"
+#include "llvm/ADT/ArrayRef.h"
#include "llvm/ADT/SmallBitVector.h"
#include "llvm/ADT/SmallString.h"
#include "llvm/ADT/STLExtras.h"
@@ -39,15 +40,12 @@
#include "llvm/MC/MCSubtargetInfo.h"
#include "llvm/MC/MCSymbol.h"
#include "llvm/Support/Casting.h"
-#include "llvm/Support/Debug.h"
#include "llvm/Support/ELF.h"
#include "llvm/Support/ErrorHandling.h"
#include "llvm/Support/MathExtras.h"
#include "llvm/Support/raw_ostream.h"
#include "llvm/Support/SMLoc.h"
#include "llvm/Support/TargetRegistry.h"
-#include "llvm/Support/raw_ostream.h"
-#include "llvm/Support/MathExtras.h"
#include <algorithm>
#include <cassert>
#include <cstdint>
@@ -56,7 +54,6 @@
#include <map>
#include <memory>
#include <string>
-#include <vector>
using namespace llvm;
using namespace llvm::AMDGPU;
@@ -83,7 +80,7 @@ class AMDGPUOperand : public MCParsedAsmOperand {
const AMDGPUAsmParser *AsmParser;
public:
- AMDGPUOperand(enum KindTy Kind_, const AMDGPUAsmParser *AsmParser_)
+ AMDGPUOperand(KindTy Kind_, const AMDGPUAsmParser *AsmParser_)
: MCParsedAsmOperand(), Kind(Kind_), AsmParser(AsmParser_) {}
typedef std::unique_ptr<AMDGPUOperand> Ptr;
@@ -160,7 +157,11 @@ public:
ImmTySendMsg,
ImmTyInterpSlot,
ImmTyInterpAttr,
- ImmTyAttrChan
+ ImmTyAttrChan,
+ ImmTyOpSel,
+ ImmTyOpSelHi,
+ ImmTyNegLo,
+ ImmTyNegHi
};
struct TokOp {
@@ -297,6 +298,10 @@ public:
bool isInterpSlot() const { return isImmTy(ImmTyInterpSlot); }
bool isInterpAttr() const { return isImmTy(ImmTyInterpAttr); }
bool isAttrChan() const { return isImmTy(ImmTyAttrChan); }
+ bool isOpSel() const { return isImmTy(ImmTyOpSel); }
+ bool isOpSelHi() const { return isImmTy(ImmTyOpSelHi); }
+ bool isNegLo() const { return isImmTy(ImmTyNegLo); }
+ bool isNegHi() const { return isImmTy(ImmTyNegHi); }
bool isMod() const {
return isClampSI() || isOModSI();
@@ -316,6 +321,10 @@ public:
return isRegOrInlineNoMods(AMDGPU::SReg_32RegClassID, MVT::i16);
}
+ bool isSCSrcV2B16() const {
+ return isSCSrcB16();
+ }
+
bool isSCSrcB32() const {
return isRegOrInlineNoMods(AMDGPU::SReg_32RegClassID, MVT::i32);
}
@@ -328,6 +337,10 @@ public:
return isRegOrInlineNoMods(AMDGPU::SReg_32RegClassID, MVT::f16);
}
+ bool isSCSrcV2F16() const {
+ return isSCSrcF16();
+ }
+
bool isSCSrcF32() const {
return isRegOrInlineNoMods(AMDGPU::SReg_32RegClassID, MVT::f32);
}
@@ -344,6 +357,11 @@ public:
return isSCSrcB16() || isLiteralImm(MVT::i16);
}
+ bool isSSrcV2B16() const {
+ llvm_unreachable("cannot happen");
+ return isSSrcB16();
+ }
+
bool isSSrcB64() const {
// TODO: Find out how SALU supports extension of 32-bit literals to 64 bits.
// See isVSrc64().
@@ -362,6 +380,11 @@ public:
return isSCSrcB16() || isLiteralImm(MVT::f16);
}
+ bool isSSrcV2F16() const {
+ llvm_unreachable("cannot happen");
+ return isSSrcF16();
+ }
+
bool isVCSrcB32() const {
return isRegOrInlineNoMods(AMDGPU::VS_32RegClassID, MVT::i32);
}
@@ -374,6 +397,10 @@ public:
return isRegOrInlineNoMods(AMDGPU::VS_32RegClassID, MVT::i16);
}
+ bool isVCSrcV2B16() const {
+ return isVCSrcB16();
+ }
+
bool isVCSrcF32() const {
return isRegOrInlineNoMods(AMDGPU::VS_32RegClassID, MVT::f32);
}
@@ -386,6 +413,10 @@ public:
return isRegOrInlineNoMods(AMDGPU::VS_32RegClassID, MVT::f16);
}
+ bool isVCSrcV2F16() const {
+ return isVCSrcF16();
+ }
+
bool isVSrcB32() const {
return isVCSrcF32() || isLiteralImm(MVT::i32);
}
@@ -398,6 +429,11 @@ public:
return isVCSrcF16() || isLiteralImm(MVT::i16);
}
+ bool isVSrcV2B16() const {
+ llvm_unreachable("cannot happen");
+ return isVSrcB16();
+ }
+
bool isVSrcF32() const {
return isVCSrcF32() || isLiteralImm(MVT::f32);
}
@@ -410,6 +446,11 @@ public:
return isVCSrcF16() || isLiteralImm(MVT::f16);
}
+ bool isVSrcV2F16() const {
+ llvm_unreachable("cannot happen");
+ return isVSrcF16();
+ }
+
bool isKImmFP32() const {
return isLiteralImm(MVT::f32);
}
@@ -459,7 +500,7 @@ public:
return Imm.Val;
}
- enum ImmTy getImmTy() const {
+ ImmTy getImmTy() const {
assert(isImm());
return Imm.Type;
}
@@ -501,9 +542,11 @@ public:
return getModifiers().hasIntModifiers();
}
+ uint64_t applyInputFPModifiers(uint64_t Val, unsigned Size) const;
+
void addImmOperands(MCInst &Inst, unsigned N, bool ApplyModifiers = true) const;
- void addLiteralImmOperand(MCInst &Inst, int64_t Val) const;
+ void addLiteralImmOperand(MCInst &Inst, int64_t Val, bool ApplyModifiers) const;
template <unsigned Bitwidth>
void addKImmFPOperands(MCInst &Inst, unsigned N) const;
@@ -610,6 +653,10 @@ public:
case ImmTyInterpSlot: OS << "InterpSlot"; break;
case ImmTyInterpAttr: OS << "InterpAttr"; break;
case ImmTyAttrChan: OS << "AttrChan"; break;
+ case ImmTyOpSel: OS << "OpSel"; break;
+ case ImmTyOpSelHi: OS << "OpSelHi"; break;
+ case ImmTyNegLo: OS << "NegLo"; break;
+ case ImmTyNegHi: OS << "NegHi"; break;
}
}
@@ -636,7 +683,7 @@ public:
static AMDGPUOperand::Ptr CreateImm(const AMDGPUAsmParser *AsmParser,
int64_t Val, SMLoc Loc,
- enum ImmTy Type = ImmTyNone,
+ ImmTy Type = ImmTyNone,
bool IsFPImm = false) {
auto Op = llvm::make_unique<AMDGPUOperand>(Immediate, AsmParser);
Op->Imm.Val = Val;
@@ -695,9 +742,9 @@ raw_ostream &operator <<(raw_ostream &OS, AMDGPUOperand::Modifiers Mods) {
// Kernel scope begins at .amdgpu_hsa_kernel directive, ends at next
// .amdgpu_hsa_kernel or at EOF.
class KernelScopeInfo {
- int SgprIndexUnusedMin;
- int VgprIndexUnusedMin;
- MCContext *Ctx;
+ int SgprIndexUnusedMin = -1;
+ int VgprIndexUnusedMin = -1;
+ MCContext *Ctx = nullptr;
void usesSgprAt(int i) {
if (i >= SgprIndexUnusedMin) {
@@ -708,6 +755,7 @@ class KernelScopeInfo {
}
}
}
+
void usesVgprAt(int i) {
if (i >= VgprIndexUnusedMin) {
VgprIndexUnusedMin = ++i;
@@ -717,14 +765,16 @@ class KernelScopeInfo {
}
}
}
+
public:
- KernelScopeInfo() : SgprIndexUnusedMin(-1), VgprIndexUnusedMin(-1), Ctx(nullptr)
- {}
+ KernelScopeInfo() = default;
+
void initialize(MCContext &Context) {
Ctx = &Context;
usesSgprAt(SgprIndexUnusedMin = -1);
usesVgprAt(VgprIndexUnusedMin = -1);
}
+
void usesRegister(RegisterKind RegKind, unsigned DwordRegIndex, unsigned RegWidth) {
switch (RegKind) {
case IS_SGPR: usesSgprAt(DwordRegIndex + RegWidth - 1); break;
@@ -738,9 +788,9 @@ class AMDGPUAsmParser : public MCTargetAsmParser {
const MCInstrInfo &MII;
MCAsmParser &Parser;
- unsigned ForcedEncodingSize;
- bool ForcedDPP;
- bool ForcedSDWA;
+ unsigned ForcedEncodingSize = 0;
+ bool ForcedDPP = false;
+ bool ForcedSDWA = false;
KernelScopeInfo KernelScope;
/// @name Auto-generated Match Functions
@@ -756,7 +806,7 @@ private:
bool ParseDirectiveMajorMinor(uint32_t &Major, uint32_t &Minor);
bool ParseDirectiveHSACodeObjectVersion();
bool ParseDirectiveHSACodeObjectISA();
- bool ParseDirectiveRuntimeMetadata();
+ bool ParseDirectiveCodeObjectMetadata();
bool ParseAMDKernelCodeTValue(StringRef ID, amd_kernel_code_t &Header);
bool ParseDirectiveAMDKernelCodeT();
bool ParseSectionDirectiveHSAText();
@@ -767,44 +817,52 @@ private:
bool ParseSectionDirectiveHSADataGlobalAgent();
bool ParseSectionDirectiveHSADataGlobalProgram();
bool ParseSectionDirectiveHSARodataReadonlyAgent();
- bool AddNextRegisterToList(unsigned& Reg, unsigned& RegWidth, RegisterKind RegKind, unsigned Reg1, unsigned RegNum);
- bool ParseAMDGPURegister(RegisterKind& RegKind, unsigned& Reg, unsigned& RegNum, unsigned& RegWidth, unsigned *DwordRegIndex);
- void cvtMubufImpl(MCInst &Inst, const OperandVector &Operands, bool IsAtomic, bool IsAtomicReturn);
+ bool AddNextRegisterToList(unsigned& Reg, unsigned& RegWidth,
+ RegisterKind RegKind, unsigned Reg1,
+ unsigned RegNum);
+ bool ParseAMDGPURegister(RegisterKind& RegKind, unsigned& Reg,
+ unsigned& RegNum, unsigned& RegWidth,
+ unsigned *DwordRegIndex);
+ void cvtMubufImpl(MCInst &Inst, const OperandVector &Operands,
+ bool IsAtomic, bool IsAtomicReturn);
+ void cvtDSImpl(MCInst &Inst, const OperandVector &Operands,
+ bool IsGdsHardcoded);
public:
enum AMDGPUMatchResultTy {
Match_PreferE32 = FIRST_TARGET_MATCH_RESULT_TY
};
+ typedef std::map<AMDGPUOperand::ImmTy, unsigned> OptionalImmIndexMap;
+
AMDGPUAsmParser(const MCSubtargetInfo &STI, MCAsmParser &_Parser,
const MCInstrInfo &MII,
const MCTargetOptions &Options)
- : MCTargetAsmParser(Options, STI), MII(MII), Parser(_Parser),
- ForcedEncodingSize(0),
- ForcedDPP(false),
- ForcedSDWA(false) {
+ : MCTargetAsmParser(Options, STI), MII(MII), Parser(_Parser) {
MCAsmParserExtension::Initialize(Parser);
- if (getSTI().getFeatureBits().none()) {
+ if (getFeatureBits().none()) {
// Set default features.
copySTI().ToggleFeature("SOUTHERN_ISLANDS");
}
- setAvailableFeatures(ComputeAvailableFeatures(getSTI().getFeatureBits()));
+ setAvailableFeatures(ComputeAvailableFeatures(getFeatureBits()));
{
// TODO: make those pre-defined variables read-only.
// Currently there is no suitable machinery in core llvm-mc for this.
// MCSymbol::isRedefinable is intended for another purpose, and
// AsmParser::parseDirectiveSet() cannot be specialized for specific target.
- AMDGPU::IsaVersion Isa = AMDGPU::getIsaVersion(getSTI().getFeatureBits());
+ AMDGPU::IsaInfo::IsaVersion ISA =
+ AMDGPU::IsaInfo::getIsaVersion(getFeatureBits());
MCContext &Ctx = getContext();
- MCSymbol *Sym = Ctx.getOrCreateSymbol(Twine(".option.machine_version_major"));
- Sym->setVariableValue(MCConstantExpr::create(Isa.Major, Ctx));
+ MCSymbol *Sym =
+ Ctx.getOrCreateSymbol(Twine(".option.machine_version_major"));
+ Sym->setVariableValue(MCConstantExpr::create(ISA.Major, Ctx));
Sym = Ctx.getOrCreateSymbol(Twine(".option.machine_version_minor"));
- Sym->setVariableValue(MCConstantExpr::create(Isa.Minor, Ctx));
+ Sym->setVariableValue(MCConstantExpr::create(ISA.Minor, Ctx));
Sym = Ctx.getOrCreateSymbol(Twine(".option.machine_version_stepping"));
- Sym->setVariableValue(MCConstantExpr::create(Isa.Stepping, Ctx));
+ Sym->setVariableValue(MCConstantExpr::create(ISA.Stepping, Ctx));
}
KernelScope.initialize(getContext());
}
@@ -822,7 +880,7 @@ public:
}
bool hasInv2PiInlineImm() const {
- return getSTI().getFeatureBits()[AMDGPU::FeatureInv2PiInlineImm];
+ return getFeatureBits()[AMDGPU::FeatureInv2PiInlineImm];
}
bool hasSGPR102_SGPR103() const {
@@ -844,6 +902,10 @@ public:
return &MII;
}
+ const FeatureBitset &getFeatureBits() const {
+ return getSTI().getFeatureBits();
+ }
+
void setForcedEncodingSize(unsigned Size) { ForcedEncodingSize = Size; }
void setForcedDPP(bool ForceDPP_) { ForcedDPP = ForceDPP_; }
void setForcedSDWA(bool ForceSDWA_) { ForcedSDWA = ForceSDWA_; }
@@ -871,19 +933,28 @@ public:
//bool ProcessInstruction(MCInst &Inst);
OperandMatchResultTy parseIntWithPrefix(const char *Prefix, int64_t &Int);
+
OperandMatchResultTy
parseIntWithPrefix(const char *Prefix, OperandVector &Operands,
- enum AMDGPUOperand::ImmTy ImmTy = AMDGPUOperand::ImmTyNone,
+ AMDGPUOperand::ImmTy ImmTy = AMDGPUOperand::ImmTyNone,
bool (*ConvertResult)(int64_t &) = nullptr);
+
+ OperandMatchResultTy parseOperandArrayWithPrefix(
+ const char *Prefix,
+ OperandVector &Operands,
+ AMDGPUOperand::ImmTy ImmTy = AMDGPUOperand::ImmTyNone,
+ bool (*ConvertResult)(int64_t&) = nullptr);
+
OperandMatchResultTy
parseNamedBit(const char *Name, OperandVector &Operands,
- enum AMDGPUOperand::ImmTy ImmTy = AMDGPUOperand::ImmTyNone);
+ AMDGPUOperand::ImmTy ImmTy = AMDGPUOperand::ImmTyNone);
OperandMatchResultTy parseStringWithPrefix(StringRef Prefix,
StringRef &Value);
- OperandMatchResultTy parseImm(OperandVector &Operands);
+ bool parseAbsoluteExpr(int64_t &Val, bool AbsMod = false);
+ OperandMatchResultTy parseImm(OperandVector &Operands, bool AbsMod = false);
OperandMatchResultTy parseReg(OperandVector &Operands);
- OperandMatchResultTy parseRegOrImm(OperandVector &Operands);
+ OperandMatchResultTy parseRegOrImm(OperandVector &Operands, bool AbsMod = false);
OperandMatchResultTy parseRegOrImmWithFPInputMods(OperandVector &Operands, bool AllowImm = true);
OperandMatchResultTy parseRegOrImmWithIntInputMods(OperandVector &Operands, bool AllowImm = true);
OperandMatchResultTy parseRegWithFPInputMods(OperandVector &Operands);
@@ -891,7 +962,8 @@ public:
OperandMatchResultTy parseVReg32OrOff(OperandVector &Operands);
void cvtDSOffset01(MCInst &Inst, const OperandVector &Operands);
- void cvtDS(MCInst &Inst, const OperandVector &Operands);
+ void cvtDS(MCInst &Inst, const OperandVector &Operands) { cvtDSImpl(Inst, Operands, false); }
+ void cvtDSGds(MCInst &Inst, const OperandVector &Operands) { cvtDSImpl(Inst, Operands, true); }
void cvtExp(MCInst &Inst, const OperandVector &Operands);
bool parseCnt(int64_t &IntVal);
@@ -911,6 +983,12 @@ private:
void errorExpTgt();
OperandMatchResultTy parseExpTgtImpl(StringRef Str, uint8_t &Val);
+ bool validateOperandLimitations(const MCInst &Inst);
+ bool usesConstantBus(const MCInst &Inst, unsigned OpIdx);
+ bool isInlineConstant(const MCInst &Inst, unsigned OpIdx) const;
+ unsigned findImplicitSGPRReadInVOP(const MCInst &Inst) const;
+ bool isSGPR(unsigned Reg);
+
public:
OperandMatchResultTy parseOptionalOperand(OperandVector &Operands);
@@ -940,7 +1018,13 @@ public:
void cvtId(MCInst &Inst, const OperandVector &Operands);
void cvtVOP3_2_mod(MCInst &Inst, const OperandVector &Operands);
+
+ void cvtVOP3Impl(MCInst &Inst,
+ const OperandVector &Operands,
+ OptionalImmIndexMap &OptionalIdx);
void cvtVOP3(MCInst &Inst, const OperandVector &Operands);
+ void cvtVOP3OMod(MCInst &Inst, const OperandVector &Operands);
+ void cvtVOP3P(MCInst &Inst, const OperandVector &Operands);
void cvtMIMG(MCInst &Inst, const OperandVector &Operands);
void cvtMIMGAtomic(MCInst &Inst, const OperandVector &Operands);
@@ -988,6 +1072,30 @@ static const fltSemantics *getFltSemantics(MVT VT) {
return getFltSemantics(VT.getSizeInBits() / 8);
}
+static const fltSemantics *getOpFltSemantics(uint8_t OperandType) {
+ switch (OperandType) {
+ case AMDGPU::OPERAND_REG_IMM_INT32:
+ case AMDGPU::OPERAND_REG_IMM_FP32:
+ case AMDGPU::OPERAND_REG_INLINE_C_INT32:
+ case AMDGPU::OPERAND_REG_INLINE_C_FP32:
+ return &APFloat::IEEEsingle();
+ case AMDGPU::OPERAND_REG_IMM_INT64:
+ case AMDGPU::OPERAND_REG_IMM_FP64:
+ case AMDGPU::OPERAND_REG_INLINE_C_INT64:
+ case AMDGPU::OPERAND_REG_INLINE_C_FP64:
+ return &APFloat::IEEEdouble();
+ case AMDGPU::OPERAND_REG_IMM_INT16:
+ case AMDGPU::OPERAND_REG_IMM_FP16:
+ case AMDGPU::OPERAND_REG_INLINE_C_INT16:
+ case AMDGPU::OPERAND_REG_INLINE_C_FP16:
+ case AMDGPU::OPERAND_REG_INLINE_C_V2INT16:
+ case AMDGPU::OPERAND_REG_INLINE_C_V2FP16:
+ return &APFloat::IEEEhalf();
+ default:
+ llvm_unreachable("unsupported fp type");
+ }
+}
+
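(Aside: a minimal standalone sketch, assuming LLVM's APFloat and MathExtras APIs, of the narrowing that getOpFltSemantics() feeds below; precision loss is tolerated but is reported through the losesInfo out-parameter:)

    #include "llvm/ADT/APFloat.h"
    #include "llvm/ADT/APInt.h"
    #include "llvm/Support/MathExtras.h"
    #include <cstdio>
    using namespace llvm;

    int main() {
      // 0.1 is not exactly representable; narrowing to half loses bits.
      APFloat FPLiteral(APFloat::IEEEdouble(), APInt(64, DoubleToBits(0.1)));
      bool LosesInfo = false;
      FPLiteral.convert(APFloat::IEEEhalf(), APFloat::rmNearestTiesToEven,
                        &LosesInfo);
      std::printf("half bits = 0x%llx, lossy = %d\n",
                  (unsigned long long)FPLiteral.bitcastToAPInt().getZExtValue(),
                  (int)LosesInfo);
      return 0;
    }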
//===----------------------------------------------------------------------===//
// Operand
//===----------------------------------------------------------------------===//
@@ -1031,13 +1139,18 @@ bool AMDGPUOperand::isInlinableImm(MVT type) const {
if (!canLosslesslyConvertToFPType(FPLiteral, type))
return false;
+ if (type.getScalarSizeInBits() == 16) {
+ return AMDGPU::isInlinableLiteral16(
+ static_cast<int16_t>(FPLiteral.bitcastToAPInt().getZExtValue()),
+ AsmParser->hasInv2PiInlineImm());
+ }
+
// Check if single precision literal is inlinable
return AMDGPU::isInlinableLiteral32(
static_cast<int32_t>(FPLiteral.bitcastToAPInt().getZExtValue()),
AsmParser->hasInv2PiInlineImm());
}
-
// We got int literal token.
if (type == MVT::f64 || type == MVT::i64) { // Expected 64-bit operand
return AMDGPU::isInlinableLiteral64(Imm.Val,
@@ -1064,6 +1177,13 @@ bool AMDGPUOperand::isLiteralImm(MVT type) const {
if (!Imm.IsFPImm) {
// We got int literal token.
+ if (type == MVT::f64 && hasFPModifiers()) {
+ // Cannot apply fp modifiers to int literals while preserving the same
+ // semantics for VOP1/2/C and VOP3, because of integer truncation.
+ // To avoid ambiguity, these cases are disabled.
+ return false;
+ }
+
unsigned Size = type.getSizeInBits();
if (Size == 64)
Size = 32;
@@ -1093,40 +1213,57 @@ bool AMDGPUOperand::isRegClass(unsigned RCID) const {
return isRegKind() && AsmParser->getMRI()->getRegClass(RCID).contains(getReg());
}
-void AMDGPUOperand::addImmOperands(MCInst &Inst, unsigned N, bool ApplyModifiers) const {
- int64_t Val = Imm.Val;
- if (isImmTy(ImmTyNone) && ApplyModifiers && Imm.Mods.hasFPModifiers() && Imm.Mods.Neg) {
- // Apply modifiers to immediate value. Only negate can get here
- if (Imm.IsFPImm) {
- APFloat F(BitsToDouble(Val));
- F.changeSign();
- Val = F.bitcastToAPInt().getZExtValue();
- } else {
- Val = -Val;
- }
+uint64_t AMDGPUOperand::applyInputFPModifiers(uint64_t Val, unsigned Size) const
+{
+ assert(isImmTy(ImmTyNone) && Imm.Mods.hasFPModifiers());
+ assert(Size == 2 || Size == 4 || Size == 8);
+
+ const uint64_t FpSignMask = (1ULL << (Size * 8 - 1));
+
+ if (Imm.Mods.Abs) {
+ Val &= ~FpSignMask;
}
+ if (Imm.Mods.Neg) {
+ Val ^= FpSignMask;
+ }
+
+ return Val;
+}
+
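(Aside: the sign-bit arithmetic above in a standalone, hypothetical form: abs clears the IEEE sign bit of the Size-byte encoding, neg flips it, so abs-then-neg forces a negative value:)

    #include <cassert>
    #include <cstdint>

    // Hypothetical standalone helper mirroring the masking logic above.
    static uint64_t applyFPMods(uint64_t Val, unsigned Size, bool Abs, bool Neg) {
      const uint64_t FpSignMask = 1ULL << (Size * 8 - 1);
      if (Abs)
        Val &= ~FpSignMask; // |x|: clear the sign bit
      if (Neg)
        Val ^= FpSignMask;  // -x: flip the sign bit
      return Val;
    }

    int main() {
      // 0xBF800000 encodes -1.0f: abs gives +1.0f, abs-then-neg gives -1.0f.
      assert(applyFPMods(0xBF800000u, 4, true, false) == 0x3F800000u);
      assert(applyFPMods(0xBF800000u, 4, true, true) == 0xBF800000u);
      return 0;
    }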
+void AMDGPUOperand::addImmOperands(MCInst &Inst, unsigned N, bool ApplyModifiers) const {
if (AMDGPU::isSISrcOperand(AsmParser->getMII()->get(Inst.getOpcode()),
Inst.getNumOperands())) {
- addLiteralImmOperand(Inst, Val);
+ addLiteralImmOperand(Inst, Imm.Val,
+ ApplyModifiers &&
+ isImmTy(ImmTyNone) && Imm.Mods.hasFPModifiers());
} else {
- Inst.addOperand(MCOperand::createImm(Val));
+ assert(!isImmTy(ImmTyNone) || !hasModifiers());
+ Inst.addOperand(MCOperand::createImm(Imm.Val));
}
}
-void AMDGPUOperand::addLiteralImmOperand(MCInst &Inst, int64_t Val) const {
+void AMDGPUOperand::addLiteralImmOperand(MCInst &Inst, int64_t Val, bool ApplyModifiers) const {
const auto& InstDesc = AsmParser->getMII()->get(Inst.getOpcode());
auto OpNum = Inst.getNumOperands();
// Check that this operand accepts literals
assert(AMDGPU::isSISrcOperand(InstDesc, OpNum));
- auto OpSize = AMDGPU::getOperandSize(InstDesc, OpNum); // expected operand size
+ if (ApplyModifiers) {
+ assert(AMDGPU::isSISrcFPOperand(InstDesc, OpNum));
+ const unsigned Size = Imm.IsFPImm ? sizeof(double) : getOperandSize(InstDesc, OpNum);
+ Val = applyInputFPModifiers(Val, Size);
+ }
+
+ APInt Literal(64, Val);
+ uint8_t OpTy = InstDesc.OpInfo[OpNum].OperandType;
if (Imm.IsFPImm) { // We got fp literal token
- APInt Literal(64, Val);
-
- switch (OpSize) {
- case 8: {
+ switch (OpTy) {
+ case AMDGPU::OPERAND_REG_IMM_INT64:
+ case AMDGPU::OPERAND_REG_IMM_FP64:
+ case AMDGPU::OPERAND_REG_INLINE_C_INT64:
+ case AMDGPU::OPERAND_REG_INLINE_C_FP64: {
if (AMDGPU::isInlinableLiteral64(Literal.getZExtValue(),
AsmParser->hasInv2PiInlineImm())) {
Inst.addOperand(MCOperand::createImm(Literal.getZExtValue()));
@@ -1151,16 +1288,31 @@ void AMDGPUOperand::addLiteralImmOperand(MCInst &Inst, int64_t Val) const {
// in predicate methods (isLiteralImm())
llvm_unreachable("fp literal in 64-bit integer instruction.");
}
- case 4:
- case 2: {
+ case AMDGPU::OPERAND_REG_IMM_INT32:
+ case AMDGPU::OPERAND_REG_IMM_FP32:
+ case AMDGPU::OPERAND_REG_INLINE_C_INT32:
+ case AMDGPU::OPERAND_REG_INLINE_C_FP32:
+ case AMDGPU::OPERAND_REG_IMM_INT16:
+ case AMDGPU::OPERAND_REG_IMM_FP16:
+ case AMDGPU::OPERAND_REG_INLINE_C_INT16:
+ case AMDGPU::OPERAND_REG_INLINE_C_FP16:
+ case AMDGPU::OPERAND_REG_INLINE_C_V2INT16:
+ case AMDGPU::OPERAND_REG_INLINE_C_V2FP16: {
bool lost;
APFloat FPLiteral(APFloat::IEEEdouble(), Literal);
// Convert literal to single precision
- FPLiteral.convert(*getFltSemantics(OpSize),
+ FPLiteral.convert(*getOpFltSemantics(OpTy),
APFloat::rmNearestTiesToEven, &lost);
// We allow precision lost but not overflow or underflow. This should be
// checked earlier in isLiteralImm()
- Inst.addOperand(MCOperand::createImm(FPLiteral.bitcastToAPInt().getZExtValue()));
+
+ uint64_t ImmVal = FPLiteral.bitcastToAPInt().getZExtValue();
+ if (OpTy == AMDGPU::OPERAND_REG_INLINE_C_V2INT16 ||
+ OpTy == AMDGPU::OPERAND_REG_INLINE_C_V2FP16) {
+ ImmVal |= (ImmVal << 16);
+ }
+
+ Inst.addOperand(MCOperand::createImm(ImmVal));
return;
}
default:
@@ -1173,8 +1325,11 @@ void AMDGPUOperand::addLiteralImmOperand(MCInst &Inst, int64_t Val) const {
// We got int literal token.
// Only sign extend inline immediates.
// FIXME: No errors on truncation
- switch (OpSize) {
- case 4: {
+ switch (OpTy) {
+ case AMDGPU::OPERAND_REG_IMM_INT32:
+ case AMDGPU::OPERAND_REG_IMM_FP32:
+ case AMDGPU::OPERAND_REG_INLINE_C_INT32:
+ case AMDGPU::OPERAND_REG_INLINE_C_FP32: {
if (isInt<32>(Val) &&
AMDGPU::isInlinableLiteral32(static_cast<int32_t>(Val),
AsmParser->hasInv2PiInlineImm())) {
@@ -1185,9 +1340,11 @@ void AMDGPUOperand::addLiteralImmOperand(MCInst &Inst, int64_t Val) const {
Inst.addOperand(MCOperand::createImm(Val & 0xffffffff));
return;
}
- case 8: {
- if (AMDGPU::isInlinableLiteral64(Val,
- AsmParser->hasInv2PiInlineImm())) {
+ case AMDGPU::OPERAND_REG_IMM_INT64:
+ case AMDGPU::OPERAND_REG_IMM_FP64:
+ case AMDGPU::OPERAND_REG_INLINE_C_INT64:
+ case AMDGPU::OPERAND_REG_INLINE_C_FP64: {
+ if (AMDGPU::isInlinableLiteral64(Val, AsmParser->hasInv2PiInlineImm())) {
Inst.addOperand(MCOperand::createImm(Val));
return;
}
@@ -1195,7 +1352,10 @@ void AMDGPUOperand::addLiteralImmOperand(MCInst &Inst, int64_t Val) const {
Inst.addOperand(MCOperand::createImm(Lo_32(Val)));
return;
}
- case 2: {
+ case AMDGPU::OPERAND_REG_IMM_INT16:
+ case AMDGPU::OPERAND_REG_IMM_FP16:
+ case AMDGPU::OPERAND_REG_INLINE_C_INT16:
+ case AMDGPU::OPERAND_REG_INLINE_C_FP16: {
if (isInt<16>(Val) &&
AMDGPU::isInlinableLiteral16(static_cast<int16_t>(Val),
AsmParser->hasInv2PiInlineImm())) {
@@ -1206,6 +1366,17 @@ void AMDGPUOperand::addLiteralImmOperand(MCInst &Inst, int64_t Val) const {
Inst.addOperand(MCOperand::createImm(Val & 0xffff));
return;
}
+ case AMDGPU::OPERAND_REG_INLINE_C_V2INT16:
+ case AMDGPU::OPERAND_REG_INLINE_C_V2FP16: {
+ auto LiteralVal = static_cast<uint16_t>(Literal.getLoBits(16).getZExtValue());
+ assert(AMDGPU::isInlinableLiteral16(LiteralVal,
+ AsmParser->hasInv2PiInlineImm()));
+
+ uint32_t ImmVal = static_cast<uint32_t>(LiteralVal) << 16 |
+ static_cast<uint32_t>(LiteralVal);
+ Inst.addOperand(MCOperand::createImm(ImmVal));
+ return;
+ }
default:
llvm_unreachable("invalid operand size");
}
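(Aside: a standalone illustration of the V2INT16/V2FP16 replication performed in both branches above; the 16-bit literal is duplicated into each half of the 32-bit packed immediate:)

    #include <cassert>
    #include <cstdint>

    // Replicates a 16-bit literal into both lanes of a packed 32-bit operand.
    static uint32_t replicate16(uint16_t Lo16) {
      return (static_cast<uint32_t>(Lo16) << 16) | Lo16;
    }

    int main() {
      // 0x3C00 is 1.0 in IEEE half, so the packed immediate encodes <1.0, 1.0>.
      assert(replicate16(0x3C00) == 0x3C003C00u);
      return 0;
    }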
@@ -1289,7 +1460,8 @@ static unsigned getSpecialRegForName(StringRef RegName) {
.Default(0);
}
-bool AMDGPUAsmParser::ParseRegister(unsigned &RegNo, SMLoc &StartLoc, SMLoc &EndLoc) {
+bool AMDGPUAsmParser::ParseRegister(unsigned &RegNo, SMLoc &StartLoc,
+ SMLoc &EndLoc) {
auto R = parseRegister();
if (!R) return true;
assert(R->isReg());
@@ -1299,20 +1471,43 @@ bool AMDGPUAsmParser::ParseRegister(unsigned &RegNo, SMLoc &StartLoc, SMLoc &End
return false;
}
-bool AMDGPUAsmParser::AddNextRegisterToList(unsigned& Reg, unsigned& RegWidth, RegisterKind RegKind, unsigned Reg1, unsigned RegNum)
-{
+bool AMDGPUAsmParser::AddNextRegisterToList(unsigned &Reg, unsigned &RegWidth,
+ RegisterKind RegKind, unsigned Reg1,
+ unsigned RegNum) {
switch (RegKind) {
case IS_SPECIAL:
- if (Reg == AMDGPU::EXEC_LO && Reg1 == AMDGPU::EXEC_HI) { Reg = AMDGPU::EXEC; RegWidth = 2; return true; }
- if (Reg == AMDGPU::FLAT_SCR_LO && Reg1 == AMDGPU::FLAT_SCR_HI) { Reg = AMDGPU::FLAT_SCR; RegWidth = 2; return true; }
- if (Reg == AMDGPU::VCC_LO && Reg1 == AMDGPU::VCC_HI) { Reg = AMDGPU::VCC; RegWidth = 2; return true; }
- if (Reg == AMDGPU::TBA_LO && Reg1 == AMDGPU::TBA_HI) { Reg = AMDGPU::TBA; RegWidth = 2; return true; }
- if (Reg == AMDGPU::TMA_LO && Reg1 == AMDGPU::TMA_HI) { Reg = AMDGPU::TMA; RegWidth = 2; return true; }
+ if (Reg == AMDGPU::EXEC_LO && Reg1 == AMDGPU::EXEC_HI) {
+ Reg = AMDGPU::EXEC;
+ RegWidth = 2;
+ return true;
+ }
+ if (Reg == AMDGPU::FLAT_SCR_LO && Reg1 == AMDGPU::FLAT_SCR_HI) {
+ Reg = AMDGPU::FLAT_SCR;
+ RegWidth = 2;
+ return true;
+ }
+ if (Reg == AMDGPU::VCC_LO && Reg1 == AMDGPU::VCC_HI) {
+ Reg = AMDGPU::VCC;
+ RegWidth = 2;
+ return true;
+ }
+ if (Reg == AMDGPU::TBA_LO && Reg1 == AMDGPU::TBA_HI) {
+ Reg = AMDGPU::TBA;
+ RegWidth = 2;
+ return true;
+ }
+ if (Reg == AMDGPU::TMA_LO && Reg1 == AMDGPU::TMA_HI) {
+ Reg = AMDGPU::TMA;
+ RegWidth = 2;
+ return true;
+ }
return false;
case IS_VGPR:
case IS_SGPR:
case IS_TTMP:
- if (Reg1 != Reg + RegWidth) { return false; }
+ if (Reg1 != Reg + RegWidth) {
+ return false;
+ }
RegWidth++;
return true;
default:
@@ -1320,8 +1515,9 @@ bool AMDGPUAsmParser::AddNextRegisterToList(unsigned& Reg, unsigned& RegWidth, R
}
}
-bool AMDGPUAsmParser::ParseAMDGPURegister(RegisterKind& RegKind, unsigned& Reg, unsigned& RegNum, unsigned& RegWidth, unsigned *DwordRegIndex)
-{
+bool AMDGPUAsmParser::ParseAMDGPURegister(RegisterKind &RegKind, unsigned &Reg,
+ unsigned &RegNum, unsigned &RegWidth,
+ unsigned *DwordRegIndex) {
if (DwordRegIndex) { *DwordRegIndex = 0; }
const MCRegisterInfo *TRI = getContext().getRegisterInfo();
if (getLexer().is(AsmToken::Identifier)) {
@@ -1462,8 +1658,33 @@ std::unique_ptr<AMDGPUOperand> AMDGPUAsmParser::parseRegister() {
return AMDGPUOperand::CreateReg(this, Reg, StartLoc, EndLoc, false);
}
+bool
+AMDGPUAsmParser::parseAbsoluteExpr(int64_t &Val, bool AbsMod) {
+ if (AbsMod && getLexer().peekTok().is(AsmToken::Pipe) &&
+ (getLexer().getKind() == AsmToken::Integer ||
+ getLexer().getKind() == AsmToken::Real)) {
+
+ // This is a workaround for handling operands like these:
+ // |1.0|
+ // |-1|
+ // This syntax is not compatible with the syntax of standard
+ // MC expressions (due to the trailing '|').
+
+ SMLoc EndLoc;
+ const MCExpr *Expr;
+
+ if (getParser().parsePrimaryExpr(Expr, EndLoc)) {
+ return true;
+ }
+
+ return !Expr->evaluateAsAbsolute(Val);
+ }
+
+ return getParser().parseAbsoluteExpression(Val);
+}
+
OperandMatchResultTy
-AMDGPUAsmParser::parseImm(OperandVector &Operands) {
+AMDGPUAsmParser::parseImm(OperandVector &Operands, bool AbsMod) {
// TODO: add syntactic sugar for 1/(2*PI)
bool Minus = false;
if (getLexer().getKind() == AsmToken::Minus) {
@@ -1475,7 +1696,7 @@ AMDGPUAsmParser::parseImm(OperandVector &Operands) {
switch(getLexer().getKind()) {
case AsmToken::Integer: {
int64_t IntVal;
- if (getParser().parseAbsoluteExpression(IntVal))
+ if (parseAbsoluteExpr(IntVal, AbsMod))
return MatchOperand_ParseFail;
if (Minus)
IntVal *= -1;
@@ -1484,7 +1705,7 @@ AMDGPUAsmParser::parseImm(OperandVector &Operands) {
}
case AsmToken::Real: {
int64_t IntVal;
- if (getParser().parseAbsoluteExpression(IntVal))
+ if (parseAbsoluteExpr(IntVal, AbsMod))
return MatchOperand_ParseFail;
APFloat F(BitsToDouble(IntVal));
@@ -1512,8 +1733,8 @@ AMDGPUAsmParser::parseReg(OperandVector &Operands) {
}
OperandMatchResultTy
-AMDGPUAsmParser::parseRegOrImm(OperandVector &Operands) {
- auto res = parseImm(Operands);
+AMDGPUAsmParser::parseRegOrImm(OperandVector &Operands, bool AbsMod) {
+ auto res = parseImm(Operands, AbsMod);
if (res != MatchOperand_NoMatch) {
return res;
}
@@ -1522,18 +1743,50 @@ AMDGPUAsmParser::parseRegOrImm(OperandVector &Operands) {
}
OperandMatchResultTy
-AMDGPUAsmParser::parseRegOrImmWithFPInputMods(OperandVector &Operands, bool AllowImm) {
- // XXX: During parsing we can't determine if minus sign means
- // negate-modifier or negative immediate value.
- // By default we suppose it is modifier.
- bool Negate = false, Abs = false, Abs2 = false;
+AMDGPUAsmParser::parseRegOrImmWithFPInputMods(OperandVector &Operands,
+ bool AllowImm) {
+ bool Negate = false, Negate2 = false, Abs = false, Abs2 = false;
if (getLexer().getKind()== AsmToken::Minus) {
+ const AsmToken NextToken = getLexer().peekTok();
+
+ // Disable ambiguous constructs like '--1' etc. Should use neg(-1) instead.
+ if (NextToken.is(AsmToken::Minus)) {
+ Error(Parser.getTok().getLoc(), "invalid syntax, expected 'neg' modifier");
+ return MatchOperand_ParseFail;
+ }
+
+ // '-' followed by an integer literal N should be interpreted as integer
+ // negation rather than a floating-point NEG modifier applied to N.
+ // Besides being counter-intuitive, such use of the floating-point NEG
+ // modifier results in different meanings of integer literals used with
+ // VOP1/2/C and VOP3, for example:
+ // v_exp_f32_e32 v5, -1 // VOP1: src0 = 0xFFFFFFFF
+ // v_exp_f32_e64 v5, -1 // VOP3: src0 = 0x80000001
+ // Negative fp literals should be handled likewise for uniformity.
+ if (!NextToken.is(AsmToken::Integer) && !NextToken.is(AsmToken::Real)) {
+ Parser.Lex();
+ Negate = true;
+ }
+ }
+
+ if (getLexer().getKind() == AsmToken::Identifier &&
+ Parser.getTok().getString() == "neg") {
+ if (Negate) {
+ Error(Parser.getTok().getLoc(), "expected register or immediate");
+ return MatchOperand_ParseFail;
+ }
+ Parser.Lex();
+ Negate2 = true;
+ if (getLexer().isNot(AsmToken::LParen)) {
+ Error(Parser.getTok().getLoc(), "expected left paren after neg");
+ return MatchOperand_ParseFail;
+ }
Parser.Lex();
- Negate = true;
}
- if (getLexer().getKind() == AsmToken::Identifier && Parser.getTok().getString() == "abs") {
+ if (getLexer().getKind() == AsmToken::Identifier &&
+ Parser.getTok().getString() == "abs") {
Parser.Lex();
Abs2 = true;
if (getLexer().isNot(AsmToken::LParen)) {
@@ -1554,7 +1807,7 @@ AMDGPUAsmParser::parseRegOrImmWithFPInputMods(OperandVector &Operands, bool Allo
OperandMatchResultTy Res;
if (AllowImm) {
- Res = parseRegOrImm(Operands);
+ Res = parseRegOrImm(Operands, Abs);
} else {
Res = parseReg(Operands);
}
@@ -1563,9 +1816,6 @@ AMDGPUAsmParser::parseRegOrImmWithFPInputMods(OperandVector &Operands, bool Allo
}
AMDGPUOperand::Modifiers Mods;
- if (Negate) {
- Mods.Neg = true;
- }
if (Abs) {
if (getLexer().getKind() != AsmToken::Pipe) {
Error(Parser.getTok().getLoc(), "expected vertical bar");
@@ -1583,6 +1833,17 @@ AMDGPUAsmParser::parseRegOrImmWithFPInputMods(OperandVector &Operands, bool Allo
Mods.Abs = true;
}
+ if (Negate) {
+ Mods.Neg = true;
+ } else if (Negate2) {
+ if (getLexer().isNot(AsmToken::RParen)) {
+ Error(Parser.getTok().getLoc(), "expected closing parentheses");
+ return MatchOperand_ParseFail;
+ }
+ Parser.Lex();
+ Mods.Neg = true;
+ }
+
if (Mods.hasFPModifiers()) {
AMDGPUOperand &Op = static_cast<AMDGPUOperand &>(*Operands.back());
Op.setModifiers(Mods);
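(Aside: a toy model of the disambiguation rule described in the comments above; the token kinds are simplified assumptions of the sketch, not the real AsmToken set:)

    #include <cassert>

    enum Tok { Integer, Real, Register, Identifier };

    // '-v0' or '-abs(v0)' -> NEG modifier; '-1' or '-1.0' -> negative literal.
    static bool minusIsNegModifier(Tok Next) {
      return Next != Integer && Next != Real;
    }

    int main() {
      assert(minusIsNegModifier(Register));
      assert(!minusIsNegModifier(Integer));
      assert(!minusIsNegModifier(Real));
      return 0;
    }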
@@ -1591,10 +1852,12 @@ AMDGPUAsmParser::parseRegOrImmWithFPInputMods(OperandVector &Operands, bool Allo
}
OperandMatchResultTy
-AMDGPUAsmParser::parseRegOrImmWithIntInputMods(OperandVector &Operands, bool AllowImm) {
+AMDGPUAsmParser::parseRegOrImmWithIntInputMods(OperandVector &Operands,
+ bool AllowImm) {
bool Sext = false;
- if (getLexer().getKind() == AsmToken::Identifier && Parser.getTok().getString() == "sext") {
+ if (getLexer().getKind() == AsmToken::Identifier &&
+ Parser.getTok().getString() == "sext") {
Parser.Lex();
Sext = true;
if (getLexer().isNot(AsmToken::LParen)) {
@@ -1661,7 +1924,6 @@ OperandMatchResultTy AMDGPUAsmParser::parseVReg32OrOff(OperandVector &Operands)
}
unsigned AMDGPUAsmParser::checkTargetMatchPredicate(MCInst &Inst) {
-
uint64_t TSFlags = MII.get(Inst.getOpcode()).TSFlags;
if ((getForcedEncodingSize() == 32 && (TSFlags & SIInstrFlags::VOP3)) ||
@@ -1719,6 +1981,128 @@ ArrayRef<unsigned> AMDGPUAsmParser::getMatchedVariants() const {
return makeArrayRef(Variants);
}
+unsigned AMDGPUAsmParser::findImplicitSGPRReadInVOP(const MCInst &Inst) const {
+ const MCInstrDesc &Desc = MII.get(Inst.getOpcode());
+ const unsigned Num = Desc.getNumImplicitUses();
+ for (unsigned i = 0; i < Num; ++i) {
+ unsigned Reg = Desc.ImplicitUses[i];
+ switch (Reg) {
+ case AMDGPU::FLAT_SCR:
+ case AMDGPU::VCC:
+ case AMDGPU::M0:
+ return Reg;
+ default:
+ break;
+ }
+ }
+ return AMDGPU::NoRegister;
+}
+
+bool AMDGPUAsmParser::isSGPR(unsigned Reg) {
+ const MCRegisterInfo *TRI = getContext().getRegisterInfo();
+ const MCRegisterClass SGPRClass = TRI->getRegClass(AMDGPU::SReg_32RegClassID);
+ const unsigned FirstSubReg = TRI->getSubReg(Reg, 1);
+ return SGPRClass.contains(FirstSubReg != 0 ? FirstSubReg : Reg) ||
+ Reg == AMDGPU::SCC;
+}
+
+// NB: This code is correct only when used to check constant
+// bus limitations because GFX7 supports no f16 inline constants.
+// Note that there are no cases when a GFX7 opcode violates
+// constant bus limitations due to the use of an f16 constant.
+bool AMDGPUAsmParser::isInlineConstant(const MCInst &Inst,
+ unsigned OpIdx) const {
+ const MCInstrDesc &Desc = MII.get(Inst.getOpcode());
+
+ if (!AMDGPU::isSISrcOperand(Desc, OpIdx)) {
+ return false;
+ }
+
+ const MCOperand &MO = Inst.getOperand(OpIdx);
+
+ int64_t Val = MO.getImm();
+ auto OpSize = AMDGPU::getOperandSize(Desc, OpIdx);
+
+ switch (OpSize) { // expected operand size
+ case 8:
+ return AMDGPU::isInlinableLiteral64(Val, hasInv2PiInlineImm());
+ case 4:
+ return AMDGPU::isInlinableLiteral32(Val, hasInv2PiInlineImm());
+ case 2: {
+ const unsigned OperandType = Desc.OpInfo[OpIdx].OperandType;
+ if (OperandType == AMDGPU::OPERAND_REG_INLINE_C_V2INT16 ||
+ OperandType == AMDGPU::OPERAND_REG_INLINE_C_V2FP16) {
+ return AMDGPU::isInlinableLiteralV216(Val, hasInv2PiInlineImm());
+ } else {
+ return AMDGPU::isInlinableLiteral16(Val, hasInv2PiInlineImm());
+ }
+ }
+ default:
+ llvm_unreachable("invalid operand size");
+ }
+}
+
+bool AMDGPUAsmParser::usesConstantBus(const MCInst &Inst, unsigned OpIdx) {
+ const MCOperand &MO = Inst.getOperand(OpIdx);
+ if (MO.isImm()) {
+ return !isInlineConstant(Inst, OpIdx);
+ }
+ return !MO.isReg() || isSGPR(mc2PseudoReg(MO.getReg()));
+}
+
+bool AMDGPUAsmParser::validateOperandLimitations(const MCInst &Inst) {
+ const unsigned Opcode = Inst.getOpcode();
+ const MCInstrDesc &Desc = MII.get(Opcode);
+ unsigned ConstantBusUseCount = 0;
+
+ if (Desc.TSFlags &
+ (SIInstrFlags::VOPC |
+ SIInstrFlags::VOP1 | SIInstrFlags::VOP2 |
+ SIInstrFlags::VOP3 | SIInstrFlags::VOP3P)) {
+
+ // Check special imm operands (used by madmk, etc)
+ if (AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::imm) != -1) {
+ ++ConstantBusUseCount;
+ }
+
+ unsigned SGPRUsed = findImplicitSGPRReadInVOP(Inst);
+ if (SGPRUsed != AMDGPU::NoRegister) {
+ ++ConstantBusUseCount;
+ }
+
+ const int Src0Idx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::src0);
+ const int Src1Idx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::src1);
+ const int Src2Idx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::src2);
+
+ const int OpIndices[] = { Src0Idx, Src1Idx, Src2Idx };
+
+ for (int OpIdx : OpIndices) {
+ if (OpIdx == -1) break;
+
+ const MCOperand &MO = Inst.getOperand(OpIdx);
+ if (usesConstantBus(Inst, OpIdx)) {
+ if (MO.isReg()) {
+ const unsigned Reg = mc2PseudoReg(MO.getReg());
+ // Pairs of registers with a partial intersection like these
+ // s0, s[0:1]
+ // flat_scratch_lo, flat_scratch
+ // flat_scratch_lo, flat_scratch_hi
+ // are theoretically valid but they are disabled anyway.
+ // Note that this code mimics SIInstrInfo::verifyInstruction
+ if (Reg != SGPRUsed) {
+ ++ConstantBusUseCount;
+ }
+ SGPRUsed = Reg;
+ } else { // Expression or a literal
+ ++ConstantBusUseCount;
+ }
+ }
+ }
+ }
+
+ return ConstantBusUseCount <= 1;
+}
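(Aside: a toy model of the rule validateOperandLimitations() enforces; the operand representation is a simplified assumption, not MCOperand. At most one scalar value may be read over the constant bus, and repeated reads of the same SGPR count once:)

    #include <cassert>

    struct Src {
      bool IsLiteral;  // literal constant or expression
      bool IsSGPR;     // scalar register
      unsigned Reg;    // register id, meaningful when IsSGPR
    };

    static bool obeysConstantBus(const Src *Srcs, int N) {
      unsigned Count = 0, SGPRUsed = ~0u;
      for (int I = 0; I < N; ++I) {
        if (Srcs[I].IsLiteral) {
          ++Count;                 // every literal is one bus read
        } else if (Srcs[I].IsSGPR && Srcs[I].Reg != SGPRUsed) {
          ++Count;                 // a newly seen SGPR is one bus read
          SGPRUsed = Srcs[I].Reg;
        }
      }
      return Count <= 1;
    }

    int main() {
      Src Ok[]  = {{false, true, 0}, {false, true, 0}}; // s0, s0: one read
      Src Bad[] = {{false, true, 0}, {false, true, 1}}; // s0, s1: two reads
      assert(obeysConstantBus(Ok, 2));
      assert(!obeysConstantBus(Bad, 2));
      return 0;
    }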
+
bool AMDGPUAsmParser::MatchAndEmitInstruction(SMLoc IDLoc, unsigned &Opcode,
OperandVector &Operands,
MCStreamer &Out,
@@ -1751,6 +2135,10 @@ bool AMDGPUAsmParser::MatchAndEmitInstruction(SMLoc IDLoc, unsigned &Opcode,
switch (Result) {
default: break;
case Match_Success:
+ if (!validateOperandLimitations(Inst)) {
+ return Error(IDLoc,
+ "invalid operand (violates constant bus restrictions)");
+ }
Inst.setLoc(IDLoc);
Out.EmitInstruction(Inst, getSTI());
return false;
@@ -1793,7 +2181,6 @@ bool AMDGPUAsmParser::ParseAsAbsoluteExpression(uint32_t &Ret) {
return false;
}
-
bool AMDGPUAsmParser::ParseDirectiveMajorMinor(uint32_t &Major,
uint32_t &Minor) {
if (ParseAsAbsoluteExpression(Major))
@@ -1810,7 +2197,6 @@ bool AMDGPUAsmParser::ParseDirectiveMajorMinor(uint32_t &Major,
}
bool AMDGPUAsmParser::ParseDirectiveHSACodeObjectVersion() {
-
uint32_t Major;
uint32_t Minor;
@@ -1831,9 +2217,10 @@ bool AMDGPUAsmParser::ParseDirectiveHSACodeObjectISA() {
// If this directive has no arguments, then use the ISA version for the
// targeted GPU.
if (getLexer().is(AsmToken::EndOfStatement)) {
- AMDGPU::IsaVersion Isa = AMDGPU::getIsaVersion(getSTI().getFeatureBits());
- getTargetStreamer().EmitDirectiveHSACodeObjectISA(Isa.Major, Isa.Minor,
- Isa.Stepping,
+ AMDGPU::IsaInfo::IsaVersion ISA =
+ AMDGPU::IsaInfo::getIsaVersion(getFeatureBits());
+ getTargetStreamer().EmitDirectiveHSACodeObjectISA(ISA.Major, ISA.Minor,
+ ISA.Stepping,
"AMD", "AMDGPU");
return false;
}
@@ -1873,42 +2260,45 @@ bool AMDGPUAsmParser::ParseDirectiveHSACodeObjectISA() {
return false;
}
-bool AMDGPUAsmParser::ParseDirectiveRuntimeMetadata() {
- std::string Metadata;
- raw_string_ostream MS(Metadata);
+bool AMDGPUAsmParser::ParseDirectiveCodeObjectMetadata() {
+ std::string YamlString;
+ raw_string_ostream YamlStream(YamlString);
getLexer().setSkipSpace(false);
bool FoundEnd = false;
while (!getLexer().is(AsmToken::Eof)) {
while (getLexer().is(AsmToken::Space)) {
- MS << ' ';
+ YamlStream << getLexer().getTok().getString();
Lex();
}
if (getLexer().is(AsmToken::Identifier)) {
StringRef ID = getLexer().getTok().getIdentifier();
- if (ID == ".end_amdgpu_runtime_metadata") {
+ if (ID == AMDGPU::CodeObject::MetadataAssemblerDirectiveEnd) {
Lex();
FoundEnd = true;
break;
}
}
- MS << Parser.parseStringToEndOfStatement()
- << getContext().getAsmInfo()->getSeparatorString();
+ YamlStream << Parser.parseStringToEndOfStatement()
+ << getContext().getAsmInfo()->getSeparatorString();
Parser.eatToEndOfStatement();
}
getLexer().setSkipSpace(true);
- if (getLexer().is(AsmToken::Eof) && !FoundEnd)
- return TokError("expected directive .end_amdgpu_runtime_metadata not found");
+ if (getLexer().is(AsmToken::Eof) && !FoundEnd) {
+ return TokError(
+ "expected directive .end_amdgpu_code_object_metadata not found");
+ }
- MS.flush();
+ YamlStream.flush();
- getTargetStreamer().EmitRuntimeMetadata(Metadata);
+ if (!getTargetStreamer().EmitCodeObjectMetadata(YamlString))
+ return Error(getParser().getTok().getLoc(), "invalid code object metadata");
return false;
}
@@ -1926,7 +2316,7 @@ bool AMDGPUAsmParser::ParseAMDKernelCodeTValue(StringRef ID,
bool AMDGPUAsmParser::ParseDirectiveAMDKernelCodeT() {
amd_kernel_code_t Header;
- AMDGPU::initDefaultAMDKernelCodeT(Header, getSTI().getFeatureBits());
+ AMDGPU::initDefaultAMDKernelCodeT(Header, getFeatureBits());
while (true) {
// Lex EndOfStatement. This is in a while loop, because lexing a comment
@@ -2020,8 +2410,8 @@ bool AMDGPUAsmParser::ParseDirective(AsmToken DirectiveID) {
if (IDVal == ".hsa_code_object_isa")
return ParseDirectiveHSACodeObjectISA();
- if (IDVal == ".amdgpu_runtime_metadata")
- return ParseDirectiveRuntimeMetadata();
+ if (IDVal == AMDGPU::CodeObject::MetadataAssemblerDirectiveBegin)
+ return ParseDirectiveCodeObjectMetadata();
if (IDVal == ".amd_kernel_code_t")
return ParseDirectiveAMDKernelCodeT();
@@ -2080,7 +2470,6 @@ bool AMDGPUAsmParser::subtargetHasRegister(const MCRegisterInfo &MRI,
OperandMatchResultTy
AMDGPUAsmParser::parseOperand(OperandVector &Operands, StringRef Mnemonic) {
-
// Try to parse with a custom parser
OperandMatchResultTy ResTy = MatchOperandParserImpl(Operands, Mnemonic);
@@ -2208,7 +2597,7 @@ AMDGPUAsmParser::parseIntWithPrefix(const char *Prefix, int64_t &Int) {
OperandMatchResultTy
AMDGPUAsmParser::parseIntWithPrefix(const char *Prefix, OperandVector &Operands,
- enum AMDGPUOperand::ImmTy ImmTy,
+ AMDGPUOperand::ImmTy ImmTy,
bool (*ConvertResult)(int64_t&)) {
SMLoc S = Parser.getTok().getLoc();
int64_t Value = 0;
@@ -2225,9 +2614,59 @@ AMDGPUAsmParser::parseIntWithPrefix(const char *Prefix, OperandVector &Operands,
return MatchOperand_Success;
}
+OperandMatchResultTy AMDGPUAsmParser::parseOperandArrayWithPrefix(
+ const char *Prefix,
+ OperandVector &Operands,
+ AMDGPUOperand::ImmTy ImmTy,
+ bool (*ConvertResult)(int64_t&)) {
+ StringRef Name = Parser.getTok().getString();
+ if (!Name.equals(Prefix))
+ return MatchOperand_NoMatch;
+
+ Parser.Lex();
+ if (getLexer().isNot(AsmToken::Colon))
+ return MatchOperand_ParseFail;
+
+ Parser.Lex();
+ if (getLexer().isNot(AsmToken::LBrac))
+ return MatchOperand_ParseFail;
+ Parser.Lex();
+
+ unsigned Val = 0;
+ SMLoc S = Parser.getTok().getLoc();
+
+ // FIXME: How to verify the number of elements matches the number of src
+ // operands?
+ for (int I = 0; I < 3; ++I) {
+ if (I != 0) {
+ if (getLexer().is(AsmToken::RBrac))
+ break;
+
+ if (getLexer().isNot(AsmToken::Comma))
+ return MatchOperand_ParseFail;
+ Parser.Lex();
+ }
+
+ if (getLexer().isNot(AsmToken::Integer))
+ return MatchOperand_ParseFail;
+
+ int64_t Op;
+ if (getParser().parseAbsoluteExpression(Op))
+ return MatchOperand_ParseFail;
+
+ if (Op != 0 && Op != 1)
+ return MatchOperand_ParseFail;
+ Val |= (Op << I);
+ }
+
+ Parser.Lex();
+ Operands.push_back(AMDGPUOperand::CreateImm(this, Val, S, ImmTy));
+ return MatchOperand_Success;
+}
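(Aside: the bit packing performed by the loop above, as a standalone sketch; element I of the 0/1 array lands in bit I of the immediate, so op_sel:[1,0,1] encodes as 0b101:)

    #include <cassert>

    // Packs up to three 0/1 array elements into consecutive bits.
    static unsigned packOperandArray(const int *Ops, int N) {
      unsigned Val = 0;
      for (int I = 0; I < N; ++I)
        Val |= (Ops[I] & 1) << I;
      return Val;
    }

    int main() {
      const int OpSel[] = {1, 0, 1}; // written as op_sel:[1,0,1]
      assert(packOperandArray(OpSel, 3) == 5u); // 0b101
      return 0;
    }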
+
OperandMatchResultTy
AMDGPUAsmParser::parseNamedBit(const char *Name, OperandVector &Operands,
- enum AMDGPUOperand::ImmTy ImmTy) {
+ AMDGPUOperand::ImmTy ImmTy) {
int64_t Bit = 0;
SMLoc S = Parser.getTok().getLoc();
@@ -2257,11 +2696,11 @@ AMDGPUAsmParser::parseNamedBit(const char *Name, OperandVector &Operands,
return MatchOperand_Success;
}
-typedef std::map<enum AMDGPUOperand::ImmTy, unsigned> OptionalImmIndexMap;
-
-void addOptionalImmOperand(MCInst& Inst, const OperandVector& Operands,
- OptionalImmIndexMap& OptionalIdx,
- enum AMDGPUOperand::ImmTy ImmT, int64_t Default = 0) {
+static void addOptionalImmOperand(
+ MCInst& Inst, const OperandVector& Operands,
+ AMDGPUAsmParser::OptionalImmIndexMap& OptionalIdx,
+ AMDGPUOperand::ImmTy ImmT,
+ int64_t Default = 0) {
auto i = OptionalIdx.find(ImmT);
if (i != OptionalIdx.end()) {
unsigned Idx = i->second;
@@ -2323,9 +2762,9 @@ void AMDGPUAsmParser::cvtDSOffset01(MCInst &Inst,
Inst.addOperand(MCOperand::createReg(AMDGPU::M0)); // m0
}
-void AMDGPUAsmParser::cvtDS(MCInst &Inst, const OperandVector &Operands) {
- std::map<enum AMDGPUOperand::ImmTy, unsigned> OptionalIdx;
- bool GDSOnly = false;
+void AMDGPUAsmParser::cvtDSImpl(MCInst &Inst, const OperandVector &Operands,
+ bool IsGdsHardcoded) {
+ OptionalImmIndexMap OptionalIdx;
for (unsigned i = 1, e = Operands.size(); i != e; ++i) {
AMDGPUOperand &Op = ((AMDGPUOperand &)*Operands[i]);
@@ -2337,7 +2776,7 @@ void AMDGPUAsmParser::cvtDS(MCInst &Inst, const OperandVector &Operands) {
}
if (Op.isToken() && Op.getToken() == "gds") {
- GDSOnly = true;
+ IsGdsHardcoded = true;
continue;
}
@@ -2346,9 +2785,7 @@ void AMDGPUAsmParser::cvtDS(MCInst &Inst, const OperandVector &Operands) {
}
addOptionalImmOperand(Inst, Operands, OptionalIdx, AMDGPUOperand::ImmTyOffset);
- addOptionalImmOperand(Inst, Operands, OptionalIdx, AMDGPUOperand::ImmTyGDS);
-
- if (!GDSOnly) {
+ if (!IsGdsHardcoded) {
addOptionalImmOperand(Inst, Operands, OptionalIdx, AMDGPUOperand::ImmTyGDS);
}
Inst.addOperand(MCOperand::createReg(AMDGPU::M0)); // m0
@@ -2421,13 +2858,14 @@ bool AMDGPUAsmParser::parseCnt(int64_t &IntVal) {
if (getLexer().is(AsmToken::Amp) || getLexer().is(AsmToken::Comma))
Parser.Lex();
- IsaVersion IV = getIsaVersion(getSTI().getFeatureBits());
+ AMDGPU::IsaInfo::IsaVersion ISA =
+ AMDGPU::IsaInfo::getIsaVersion(getFeatureBits());
if (CntName == "vmcnt")
- IntVal = encodeVmcnt(IV, IntVal, CntVal);
+ IntVal = encodeVmcnt(ISA, IntVal, CntVal);
else if (CntName == "expcnt")
- IntVal = encodeExpcnt(IV, IntVal, CntVal);
+ IntVal = encodeExpcnt(ISA, IntVal, CntVal);
else if (CntName == "lgkmcnt")
- IntVal = encodeLgkmcnt(IV, IntVal, CntVal);
+ IntVal = encodeLgkmcnt(ISA, IntVal, CntVal);
else
return true;
@@ -2436,8 +2874,9 @@ bool AMDGPUAsmParser::parseCnt(int64_t &IntVal) {
OperandMatchResultTy
AMDGPUAsmParser::parseSWaitCntOps(OperandVector &Operands) {
- IsaVersion IV = getIsaVersion(getSTI().getFeatureBits());
- int64_t Waitcnt = getWaitcntBitMask(IV);
+ AMDGPU::IsaInfo::IsaVersion ISA =
+ AMDGPU::IsaInfo::getIsaVersion(getFeatureBits());
+ int64_t Waitcnt = getWaitcntBitMask(ISA);
SMLoc S = Parser.getTok().getLoc();
switch(getLexer().getKind()) {
@@ -2459,7 +2898,8 @@ AMDGPUAsmParser::parseSWaitCntOps(OperandVector &Operands) {
return MatchOperand_Success;
}
-bool AMDGPUAsmParser::parseHwregConstruct(OperandInfoTy &HwReg, int64_t &Offset, int64_t &Width) {
+bool AMDGPUAsmParser::parseHwregConstruct(OperandInfoTy &HwReg, int64_t &Offset,
+ int64_t &Width) {
using namespace llvm::AMDGPU::Hwreg;
if (Parser.getTok().getString() != "hwreg")
@@ -2520,8 +2960,7 @@ bool AMDGPUAsmParser::parseHwregConstruct(OperandInfoTy &HwReg, int64_t &Offset,
return false;
}
-OperandMatchResultTy
-AMDGPUAsmParser::parseHwreg(OperandVector &Operands) {
+OperandMatchResultTy AMDGPUAsmParser::parseHwreg(OperandVector &Operands) {
using namespace llvm::AMDGPU::Hwreg;
int64_t Imm16Val = 0;
@@ -3170,6 +3609,10 @@ static const OptionalOperand AMDGPUOptionalOperandTable[] = {
{"src1_sel", AMDGPUOperand::ImmTySdwaSrc1Sel, false, nullptr},
{"dst_unused", AMDGPUOperand::ImmTySdwaDstUnused, false, nullptr},
{"vm", AMDGPUOperand::ImmTyExpVM, true, nullptr},
+ {"op_sel", AMDGPUOperand::ImmTyOpSel, false, nullptr},
+ {"op_sel_hi", AMDGPUOperand::ImmTyOpSelHi, false, nullptr},
+ {"neg_lo", AMDGPUOperand::ImmTyNegLo, false, nullptr},
+ {"neg_hi", AMDGPUOperand::ImmTyNegHi, false, nullptr}
};
OperandMatchResultTy AMDGPUAsmParser::parseOptionalOperand(OperandVector &Operands) {
@@ -3186,6 +3629,12 @@ OperandMatchResultTy AMDGPUAsmParser::parseOptionalOperand(OperandVector &Operan
res = parseSDWASel(Operands, Op.Name, Op.Type);
} else if (Op.Type == AMDGPUOperand::ImmTySdwaDstUnused) {
res = parseSDWADstUnused(Operands);
+ } else if (Op.Type == AMDGPUOperand::ImmTyOpSel ||
+ Op.Type == AMDGPUOperand::ImmTyOpSelHi ||
+ Op.Type == AMDGPUOperand::ImmTyNegLo ||
+ Op.Type == AMDGPUOperand::ImmTyNegHi) {
+ res = parseOperandArrayWithPrefix(Op.Name, Operands, Op.Type,
+ Op.ConvertResult);
} else {
res = parseIntWithPrefix(Op.Name, Operands, Op.Type, Op.ConvertResult);
}
@@ -3241,8 +3690,8 @@ static bool isRegOrImmWithInputMods(const MCInstrDesc &Desc, unsigned OpNum) {
&& Desc.getOperandConstraint(OpNum + 1, MCOI::OperandConstraint::TIED_TO) == -1;
}
-void AMDGPUAsmParser::cvtVOP3(MCInst &Inst, const OperandVector &Operands) {
- OptionalImmIndexMap OptionalIdx;
+void AMDGPUAsmParser::cvtVOP3Impl(MCInst &Inst, const OperandVector &Operands,
+ OptionalImmIndexMap &OptionalIdx) {
unsigned I = 1;
const MCInstrDesc &Desc = MII.get(Inst.getOpcode());
for (unsigned J = 0; J < Desc.getNumDefs(); ++J) {
@@ -3253,12 +3702,20 @@ void AMDGPUAsmParser::cvtVOP3(MCInst &Inst, const OperandVector &Operands) {
AMDGPUOperand &Op = ((AMDGPUOperand &)*Operands[I]);
if (isRegOrImmWithInputMods(Desc, Inst.getNumOperands())) {
Op.addRegOrImmWithFPInputModsOperands(Inst, 2);
- } else if (Op.isImm()) {
+ } else if (Op.isImmModifier()) {
OptionalIdx[Op.getImmTy()] = I;
+ } else if (Op.isRegOrImm()) {
+ Op.addRegOrImmOperands(Inst, 1);
} else {
llvm_unreachable("unhandled operand type");
}
}
+}
+
+void AMDGPUAsmParser::cvtVOP3(MCInst &Inst, const OperandVector &Operands) {
+ OptionalImmIndexMap OptionalIdx;
+
+ cvtVOP3Impl(Inst, Operands, OptionalIdx);
addOptionalImmOperand(Inst, Operands, OptionalIdx, AMDGPUOperand::ImmTyClampSI);
addOptionalImmOperand(Inst, Operands, OptionalIdx, AMDGPUOperand::ImmTyOModSI);
@@ -3283,6 +3740,96 @@ void AMDGPUAsmParser::cvtVOP3(MCInst &Inst, const OperandVector &Operands) {
}
}
+void AMDGPUAsmParser::cvtVOP3OMod(MCInst &Inst, const OperandVector &Operands) {
+ OptionalImmIndexMap OptionalIdx;
+
+ unsigned I = 1;
+ const MCInstrDesc &Desc = MII.get(Inst.getOpcode());
+ for (unsigned J = 0; J < Desc.getNumDefs(); ++J) {
+ ((AMDGPUOperand &)*Operands[I++]).addRegOperands(Inst, 1);
+ }
+
+ for (unsigned E = Operands.size(); I != E; ++I) {
+ AMDGPUOperand &Op = ((AMDGPUOperand &)*Operands[I]);
+ if (Op.isMod()) {
+ OptionalIdx[Op.getImmTy()] = I;
+ } else {
+ Op.addRegOrImmOperands(Inst, 1);
+ }
+ }
+
+ addOptionalImmOperand(Inst, Operands, OptionalIdx, AMDGPUOperand::ImmTyClampSI);
+ addOptionalImmOperand(Inst, Operands, OptionalIdx, AMDGPUOperand::ImmTyOModSI);
+}
+
+void AMDGPUAsmParser::cvtVOP3P(MCInst &Inst, const OperandVector &Operands) {
+ OptionalImmIndexMap OptIdx;
+
+ cvtVOP3Impl(Inst, Operands, OptIdx);
+
+ // FIXME: This is messy. Parse the modifiers as if it were a normal VOP3
+ // instruction, and then figure out where to actually put the modifiers
+ int Opc = Inst.getOpcode();
+
+ if (AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::clamp) != -1) {
+ addOptionalImmOperand(Inst, Operands, OptIdx, AMDGPUOperand::ImmTyClampSI);
+ }
+
+ addOptionalImmOperand(Inst, Operands, OptIdx, AMDGPUOperand::ImmTyOpSel);
+ addOptionalImmOperand(Inst, Operands, OptIdx, AMDGPUOperand::ImmTyOpSelHi, -1);
+
+ int NegLoIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::neg_lo);
+ if (NegLoIdx != -1) {
+ addOptionalImmOperand(Inst, Operands, OptIdx, AMDGPUOperand::ImmTyNegLo);
+ addOptionalImmOperand(Inst, Operands, OptIdx, AMDGPUOperand::ImmTyNegHi);
+ }
+
+ const int Ops[] = { AMDGPU::OpName::src0,
+ AMDGPU::OpName::src1,
+ AMDGPU::OpName::src2 };
+ const int ModOps[] = { AMDGPU::OpName::src0_modifiers,
+ AMDGPU::OpName::src1_modifiers,
+ AMDGPU::OpName::src2_modifiers };
+
+ int OpSelIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::op_sel);
+ int OpSelHiIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::op_sel_hi);
+
+ unsigned OpSel = Inst.getOperand(OpSelIdx).getImm();
+ unsigned OpSelHi = Inst.getOperand(OpSelHiIdx).getImm();
+ unsigned NegLo = 0;
+ unsigned NegHi = 0;
+
+ if (NegLoIdx != -1) {
+ int NegHiIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::neg_hi);
+ NegLo = Inst.getOperand(NegLoIdx).getImm();
+ NegHi = Inst.getOperand(NegHiIdx).getImm();
+ }
+
+ for (int J = 0; J < 3; ++J) {
+ int OpIdx = AMDGPU::getNamedOperandIdx(Opc, Ops[J]);
+ if (OpIdx == -1)
+ break;
+
+ uint32_t ModVal = 0;
+
+ if ((OpSel & (1 << J)) != 0)
+ ModVal |= SISrcMods::OP_SEL_0;
+
+ if ((OpSelHi & (1 << J)) != 0)
+ ModVal |= SISrcMods::OP_SEL_1;
+
+ if ((NegLo & (1 << J)) != 0)
+ ModVal |= SISrcMods::NEG;
+
+ if ((NegHi & (1 << J)) != 0)
+ ModVal |= SISrcMods::NEG_HI;
+
+ int ModIdx = AMDGPU::getNamedOperandIdx(Opc, ModOps[J]);
+
+ Inst.getOperand(ModIdx).setImm(ModVal);
+ }
+}
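(Aside: a standalone sketch of the scatter loop in cvtVOP3P() above; the flag values are illustrative assumptions, not the real SISrcMods encoding. Bit J of each packed immediate selects the matching flag in source J's modifier word:)

    #include <cassert>

    // Illustrative flag values (assumed for the sketch, not SIDefines.h).
    enum : unsigned { NEG = 1u << 0, NEG_HI = 1u << 1,
                      OP_SEL_0 = 1u << 2, OP_SEL_1 = 1u << 3 };

    static unsigned srcMods(unsigned J, unsigned OpSel, unsigned OpSelHi,
                            unsigned NegLo, unsigned NegHi) {
      unsigned ModVal = 0;
      if (OpSel & (1u << J))   ModVal |= OP_SEL_0;
      if (OpSelHi & (1u << J)) ModVal |= OP_SEL_1;
      if (NegLo & (1u << J))   ModVal |= NEG;
      if (NegHi & (1u << J))   ModVal |= NEG_HI;
      return ModVal;
    }

    int main() {
      // op_sel:[1,0,0] op_sel_hi:[1,1,1] neg_lo:[0,1,0] neg_hi:[0,0,0],
      // looking at src1 (J == 1):
      assert(srcMods(1, 0b001, 0b111, 0b010, 0b000) == (OP_SEL_1 | NEG));
      return 0;
    }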
+
//===----------------------------------------------------------------------===//
// dpp
//===----------------------------------------------------------------------===//
@@ -3436,7 +3983,7 @@ void AMDGPUAsmParser::cvtDPP(MCInst &Inst, const OperandVector &Operands) {
AMDGPUOperand &Op = ((AMDGPUOperand &)*Operands[I]);
// Add the register arguments
if (Op.isReg() && Op.Reg.RegNo == AMDGPU::VCC) {
- // VOP2b (v_add_u32, v_sub_u32 ...) sdwa use "vcc" token.
+ // VOP2b (v_add_u32, v_sub_u32 ...) dpp uses the "vcc" token.
// Skip it.
continue;
} if (isRegOrImmWithInputMods(Desc, Inst.getNumOperands())) {
@@ -3547,6 +4094,7 @@ void AMDGPUAsmParser::cvtSdwaVOPC(MCInst &Inst, const OperandVector &Operands) {
void AMDGPUAsmParser::cvtSDWA(MCInst &Inst, const OperandVector &Operands,
uint64_t BasicInstType) {
+ using namespace llvm::AMDGPU::SDWA;
OptionalImmIndexMap OptionalIdx;
unsigned I = 1;
@@ -3581,21 +4129,21 @@ void AMDGPUAsmParser::cvtSDWA(MCInst &Inst, const OperandVector &Operands,
// V_NOP_sdwa_vi has no optional sdwa arguments
switch (BasicInstType) {
case SIInstrFlags::VOP1:
- addOptionalImmOperand(Inst, Operands, OptionalIdx, AMDGPUOperand::ImmTySdwaDstSel, 6);
- addOptionalImmOperand(Inst, Operands, OptionalIdx, AMDGPUOperand::ImmTySdwaDstUnused, 2);
- addOptionalImmOperand(Inst, Operands, OptionalIdx, AMDGPUOperand::ImmTySdwaSrc0Sel, 6);
+ addOptionalImmOperand(Inst, Operands, OptionalIdx, AMDGPUOperand::ImmTySdwaDstSel, SdwaSel::DWORD);
+ addOptionalImmOperand(Inst, Operands, OptionalIdx, AMDGPUOperand::ImmTySdwaDstUnused, DstUnused::UNUSED_PRESERVE);
+ addOptionalImmOperand(Inst, Operands, OptionalIdx, AMDGPUOperand::ImmTySdwaSrc0Sel, SdwaSel::DWORD);
break;
case SIInstrFlags::VOP2:
- addOptionalImmOperand(Inst, Operands, OptionalIdx, AMDGPUOperand::ImmTySdwaDstSel, 6);
- addOptionalImmOperand(Inst, Operands, OptionalIdx, AMDGPUOperand::ImmTySdwaDstUnused, 2);
- addOptionalImmOperand(Inst, Operands, OptionalIdx, AMDGPUOperand::ImmTySdwaSrc0Sel, 6);
- addOptionalImmOperand(Inst, Operands, OptionalIdx, AMDGPUOperand::ImmTySdwaSrc1Sel, 6);
+ addOptionalImmOperand(Inst, Operands, OptionalIdx, AMDGPUOperand::ImmTySdwaDstSel, SdwaSel::DWORD);
+ addOptionalImmOperand(Inst, Operands, OptionalIdx, AMDGPUOperand::ImmTySdwaDstUnused, DstUnused::UNUSED_PRESERVE);
+ addOptionalImmOperand(Inst, Operands, OptionalIdx, AMDGPUOperand::ImmTySdwaSrc0Sel, SdwaSel::DWORD);
+ addOptionalImmOperand(Inst, Operands, OptionalIdx, AMDGPUOperand::ImmTySdwaSrc1Sel, SdwaSel::DWORD);
break;
case SIInstrFlags::VOPC:
- addOptionalImmOperand(Inst, Operands, OptionalIdx, AMDGPUOperand::ImmTySdwaSrc0Sel, 6);
- addOptionalImmOperand(Inst, Operands, OptionalIdx, AMDGPUOperand::ImmTySdwaSrc1Sel, 6);
+ addOptionalImmOperand(Inst, Operands, OptionalIdx, AMDGPUOperand::ImmTySdwaSrc0Sel, SdwaSel::DWORD);
+ addOptionalImmOperand(Inst, Operands, OptionalIdx, AMDGPUOperand::ImmTySdwaSrc1Sel, SdwaSel::DWORD);
break;
default:
diff --git a/lib/Target/AMDGPU/BUFInstructions.td b/lib/Target/AMDGPU/BUFInstructions.td
index 45a7fe6d3439..a6609f0725ab 100644
--- a/lib/Target/AMDGPU/BUFInstructions.td
+++ b/lib/Target/AMDGPU/BUFInstructions.td
@@ -21,8 +21,8 @@ def MUBUFIntrinsicVOffset : ComplexPattern<i32, 3, "SelectMUBUFIntrinsicVOffset"
class MubufLoad <SDPatternOperator op> : PatFrag <
(ops node:$ptr), (op node:$ptr), [{
auto const AS = cast<MemSDNode>(N)->getAddressSpace();
- return AS == AMDGPUAS::GLOBAL_ADDRESS ||
- AS == AMDGPUAS::CONSTANT_ADDRESS;
+ return AS == AMDGPUASI.GLOBAL_ADDRESS ||
+ AS == AMDGPUASI.CONSTANT_ADDRESS;
}]>;
def mubuf_load : MubufLoad <load>;
@@ -705,12 +705,6 @@ def BUFFER_WBINVL1_VOL : MUBUF_Invalidate <"buffer_wbinvl1_vol",
let Predicates = [isGCN] in {
-// int_SI_vs_load_input
-def : Pat<
- (SIload_input v4i32:$tlst, imm:$attr_offset, i32:$buf_idx_vgpr),
- (BUFFER_LOAD_FORMAT_XYZW_IDXEN $buf_idx_vgpr, $tlst, (i32 0), imm:$attr_offset, 0, 0, 0)
->;
-
// Offset in an 32-bit VGPR
def : Pat <
(SIload_constant v4i32:$sbase, i32:$voff),
diff --git a/lib/Target/AMDGPU/CMakeLists.txt b/lib/Target/AMDGPU/CMakeLists.txt
index 02d441756c85..7c0ef4aeac3c 100644
--- a/lib/Target/AMDGPU/CMakeLists.txt
+++ b/lib/Target/AMDGPU/CMakeLists.txt
@@ -12,11 +12,17 @@ tablegen(LLVM AMDGPUGenAsmWriter.inc -gen-asm-writer)
tablegen(LLVM AMDGPUGenAsmMatcher.inc -gen-asm-matcher)
tablegen(LLVM AMDGPUGenDisassemblerTables.inc -gen-disassembler)
tablegen(LLVM AMDGPUGenMCPseudoLowering.inc -gen-pseudo-lowering)
+if(LLVM_BUILD_GLOBAL_ISEL)
+ tablegen(LLVM AMDGPUGenRegisterBank.inc -gen-register-bank)
+endif()
add_public_tablegen_target(AMDGPUCommonTableGen)
# List of all GlobalISel files.
set(GLOBAL_ISEL_FILES
AMDGPUCallLowering.cpp
+ AMDGPUInstructionSelector.cpp
+ AMDGPULegalizerInfo.cpp
+ AMDGPURegisterBankInfo.cpp
)
# Add GlobalISel files to the dependencies if the user wants to build it.
@@ -30,6 +36,7 @@ endif()
add_llvm_target(AMDGPUCodeGen
AMDILCFGStructurizer.cpp
+ AMDGPUAliasAnalysis.cpp
AMDGPUAlwaysInlinePass.cpp
AMDGPUAnnotateKernelFeatures.cpp
AMDGPUAnnotateUniformValues.cpp
@@ -39,6 +46,7 @@ add_llvm_target(AMDGPUCodeGen
AMDGPUTargetObjectFile.cpp
AMDGPUIntrinsicInfo.cpp
AMDGPUISelDAGToDAG.cpp
+ AMDGPULowerIntrinsics.cpp
AMDGPUMCInstLower.cpp
AMDGPUMachineFunction.cpp
AMDGPUUnifyMetadata.cpp
@@ -50,6 +58,7 @@ add_llvm_target(AMDGPUCodeGen
AMDGPUInstrInfo.cpp
AMDGPUPromoteAlloca.cpp
AMDGPURegisterInfo.cpp
+ AMDGPUUnifyDivergentExitNodes.cpp
GCNHazardRecognizer.cpp
GCNSchedStrategy.cpp
R600ClauseMergePass.cpp
@@ -68,10 +77,12 @@ add_llvm_target(AMDGPUCodeGen
SIDebuggerInsertNops.cpp
SIFixControlFlowLiveIntervals.cpp
SIFixSGPRCopies.cpp
+ SIFixVGPRCopies.cpp
SIFoldOperands.cpp
SIFrameLowering.cpp
SIInsertSkips.cpp
SIInsertWaits.cpp
+ SIInsertWaitcnts.cpp
SIInstrInfo.cpp
SIISelLowering.cpp
SILoadStoreOptimizer.cpp
@@ -80,10 +91,14 @@ add_llvm_target(AMDGPUCodeGen
SIMachineFunctionInfo.cpp
SIMachineScheduler.cpp
SIOptimizeExecMasking.cpp
+ SIPeepholeSDWA.cpp
SIRegisterInfo.cpp
SIShrinkInstructions.cpp
SITypeRewriter.cpp
SIWholeQuadMode.cpp
+ GCNIterativeScheduler.cpp
+ GCNMinRegStrategy.cpp
+ GCNRegPressure.cpp
${GLOBAL_ISEL_BUILD_FILES}
)
diff --git a/lib/Target/AMDGPU/DSInstructions.td b/lib/Target/AMDGPU/DSInstructions.td
index a077001df6bd..a9f64589fa5e 100644
--- a/lib/Target/AMDGPU/DSInstructions.td
+++ b/lib/Target/AMDGPU/DSInstructions.td
@@ -88,18 +88,6 @@ class DS_1A1D_NORET<string opName, RegisterClass rc = VGPR_32>
let has_vdst = 0;
}
-class DS_1A_Off8_NORET<string opName> : DS_Pseudo<opName,
- (outs),
- (ins VGPR_32:$addr, offset0:$offset0, offset1:$offset1, gds:$gds),
- "$addr $offset0$offset1$gds"> {
-
- let has_data0 = 0;
- let has_data1 = 0;
- let has_vdst = 0;
- let has_offset = 0;
- let AsmMatchConverter = "cvtDSOffset01";
-}
-
class DS_1A2D_NORET<string opName, RegisterClass rc = VGPR_32>
: DS_Pseudo<opName,
(outs),
@@ -143,6 +131,20 @@ class DS_1A2D_RET<string opName,
let hasPostISelHook = 1;
}
+class DS_1A2D_Off8_RET<string opName,
+ RegisterClass rc = VGPR_32,
+ RegisterClass src = rc>
+: DS_Pseudo<opName,
+ (outs rc:$vdst),
+ (ins VGPR_32:$addr, src:$data0, src:$data1, offset0:$offset0, offset1:$offset1, gds:$gds),
+ "$vdst, $addr, $data0, $data1$offset0$offset1$gds"> {
+
+ let has_offset = 0;
+ let AsmMatchConverter = "cvtDSOffset01";
+
+ let hasPostISelHook = 1;
+}
+
class DS_1A_RET<string opName, RegisterClass rc = VGPR_32>
: DS_Pseudo<opName,
(outs rc:$vdst),
@@ -174,6 +176,7 @@ class DS_1A_RET_GDS <string opName> : DS_Pseudo<opName,
let has_data1 = 0;
let has_gds = 0;
let gdsValue = 1;
+ let AsmMatchConverter = "cvtDSGds";
}
class DS_0A_RET <string opName> : DS_Pseudo<opName,
@@ -202,20 +205,46 @@ class DS_1A <string opName> : DS_Pseudo<opName,
let has_data1 = 0;
}
-class DS_1A_GDS <string opName> : DS_Pseudo<opName,
- (outs),
- (ins VGPR_32:$addr),
- "$addr gds"> {
+class DS_GWS <string opName, dag ins, string asmOps>
+: DS_Pseudo<opName, (outs), ins, asmOps> {
+
+ let has_vdst = 0;
+ let has_addr = 0;
+ let has_data0 = 0;
+ let has_data1 = 0;
+
+ let has_gds = 0;
+ let gdsValue = 1;
+ let AsmMatchConverter = "cvtDSGds";
+}
+
+class DS_GWS_0D <string opName>
+: DS_GWS<opName,
+ (ins offset:$offset, gds:$gds), "$offset gds">;
- let has_vdst = 0;
- let has_data0 = 0;
- let has_data1 = 0;
- let has_offset = 0;
+class DS_GWS_1D <string opName>
+: DS_GWS<opName,
+ (ins VGPR_32:$data0, offset:$offset, gds:$gds), "$data0$offset gds"> {
+
+ let has_data0 = 1;
+}
+
+class DS_VOID <string opName> : DS_Pseudo<opName,
+ (outs), (ins), ""> {
+ let mayLoad = 0;
+ let mayStore = 0;
+ let hasSideEffects = 1;
+ let UseNamedOperandTable = 0;
+ let AsmMatchConverter = "";
+
+ let has_vdst = 0;
+ let has_addr = 0;
+ let has_data0 = 0;
+ let has_data1 = 0;
+ let has_offset = 0;
let has_offset0 = 0;
let has_offset1 = 0;
-
- let has_gds = 0;
- let gdsValue = 1;
+ let has_gds = 0;
}
class DS_1A1D_PERMUTE <string opName, SDPatternOperator node = null_frag>
@@ -226,6 +255,8 @@ class DS_1A1D_PERMUTE <string opName, SDPatternOperator node = null_frag>
[(set i32:$vdst,
(node (DS1Addr1Offset i32:$addr, i16:$offset), i32:$data0))] > {
+ let LGKM_CNT = 0;
+
let mayLoad = 0;
let mayStore = 0;
let isConvergent = 1;
@@ -324,9 +355,9 @@ def DS_MAX_RTN_F32 : DS_1A1D_RET <"ds_max_rtn_f32">,
def DS_WRXCHG_RTN_B32 : DS_1A1D_RET<"ds_wrxchg_rtn_b32">,
AtomicNoRet<"", 1>;
-def DS_WRXCHG2_RTN_B32 : DS_1A2D_RET<"ds_wrxchg2_rtn_b32", VReg_64, VGPR_32>,
+def DS_WRXCHG2_RTN_B32 : DS_1A2D_Off8_RET<"ds_wrxchg2_rtn_b32", VReg_64, VGPR_32>,
AtomicNoRet<"", 1>;
-def DS_WRXCHG2ST64_RTN_B32 : DS_1A2D_RET<"ds_wrxchg2st64_rtn_b32", VReg_64, VGPR_32>,
+def DS_WRXCHG2ST64_RTN_B32 : DS_1A2D_Off8_RET<"ds_wrxchg2st64_rtn_b32", VReg_64, VGPR_32>,
AtomicNoRet<"", 1>;
def DS_ADD_RTN_U64 : DS_1A1D_RET<"ds_add_rtn_u64", VReg_64>,
@@ -365,17 +396,17 @@ def DS_MAX_RTN_F64 : DS_1A1D_RET<"ds_max_rtn_f64", VReg_64>,
AtomicNoRet<"ds_max_f64", 1>;
def DS_WRXCHG_RTN_B64 : DS_1A1D_RET<"ds_wrxchg_rtn_b64", VReg_64>,
- AtomicNoRet<"ds_wrxchg_b64", 1>;
-def DS_WRXCHG2_RTN_B64 : DS_1A2D_RET<"ds_wrxchg2_rtn_b64", VReg_128, VReg_64>,
- AtomicNoRet<"ds_wrxchg2_b64", 1>;
-def DS_WRXCHG2ST64_RTN_B64 : DS_1A2D_RET<"ds_wrxchg2st64_rtn_b64", VReg_128, VReg_64>,
- AtomicNoRet<"ds_wrxchg2st64_b64", 1>;
-
-def DS_GWS_INIT : DS_1A_GDS<"ds_gws_init">;
-def DS_GWS_SEMA_V : DS_1A_GDS<"ds_gws_sema_v">;
-def DS_GWS_SEMA_BR : DS_1A_GDS<"ds_gws_sema_br">;
-def DS_GWS_SEMA_P : DS_1A_GDS<"ds_gws_sema_p">;
-def DS_GWS_BARRIER : DS_1A_GDS<"ds_gws_barrier">;
+ AtomicNoRet<"", 1>;
+def DS_WRXCHG2_RTN_B64 : DS_1A2D_Off8_RET<"ds_wrxchg2_rtn_b64", VReg_128, VReg_64>,
+ AtomicNoRet<"", 1>;
+def DS_WRXCHG2ST64_RTN_B64 : DS_1A2D_Off8_RET<"ds_wrxchg2st64_rtn_b64", VReg_128, VReg_64>,
+ AtomicNoRet<"", 1>;
+
+def DS_GWS_INIT : DS_GWS_1D<"ds_gws_init">;
+def DS_GWS_SEMA_V : DS_GWS_0D<"ds_gws_sema_v">;
+def DS_GWS_SEMA_BR : DS_GWS_1D<"ds_gws_sema_br">;
+def DS_GWS_SEMA_P : DS_GWS_0D<"ds_gws_sema_p">;
+def DS_GWS_BARRIER : DS_GWS_1D<"ds_gws_barrier">;
def DS_ADD_SRC2_U32 : DS_1A<"ds_add_src2_u32">;
def DS_SUB_SRC2_U32 : DS_1A<"ds_sub_src2_u32">;
@@ -386,7 +417,7 @@ def DS_MIN_SRC2_I32 : DS_1A<"ds_min_src2_i32">;
def DS_MAX_SRC2_I32 : DS_1A<"ds_max_src2_i32">;
def DS_MIN_SRC2_U32 : DS_1A<"ds_min_src2_u32">;
def DS_MAX_SRC2_U32 : DS_1A<"ds_max_src2_u32">;
-def DS_AND_SRC2_B32 : DS_1A<"ds_and_src_b32">;
+def DS_AND_SRC2_B32 : DS_1A<"ds_and_src2_b32">;
def DS_OR_SRC2_B32 : DS_1A<"ds_or_src2_b32">;
def DS_XOR_SRC2_B32 : DS_1A<"ds_xor_src2_b32">;
def DS_MIN_SRC2_F32 : DS_1A<"ds_min_src2_f32">;
@@ -407,8 +438,8 @@ def DS_XOR_SRC2_B64 : DS_1A<"ds_xor_src2_b64">;
def DS_MIN_SRC2_F64 : DS_1A<"ds_min_src2_f64">;
def DS_MAX_SRC2_F64 : DS_1A<"ds_max_src2_f64">;
-def DS_WRITE_SRC2_B32 : DS_1A_Off8_NORET<"ds_write_src2_b32">;
-def DS_WRITE_SRC2_B64 : DS_1A_Off8_NORET<"ds_write_src2_b64">;
+def DS_WRITE_SRC2_B32 : DS_1A<"ds_write_src2_b32">;
+def DS_WRITE_SRC2_B64 : DS_1A<"ds_write_src2_b64">;
let Uses = [EXEC], mayLoad = 0, mayStore = 0, isConvergent = 1 in {
def DS_SWIZZLE_B32 : DS_1A_RET <"ds_swizzle_b32">;
@@ -429,30 +460,34 @@ def DS_READ2_B64 : DS_1A_Off8_RET<"ds_read2_b64", VReg_128>;
def DS_READ2ST64_B64 : DS_1A_Off8_RET<"ds_read2st64_b64", VReg_128>;
}
-let SubtargetPredicate = isSICI in {
def DS_CONSUME : DS_0A_RET<"ds_consume">;
def DS_APPEND : DS_0A_RET<"ds_append">;
def DS_ORDERED_COUNT : DS_1A_RET_GDS<"ds_ordered_count">;
-}
//===----------------------------------------------------------------------===//
// Instruction definitions for CI and newer.
//===----------------------------------------------------------------------===//
-// Remaining instructions:
-// DS_NOP
-// DS_GWS_SEMA_RELEASE_ALL
-// DS_WRAP_RTN_B32
-// DS_CNDXCHG32_RTN_B64
-// DS_WRITE_B96
-// DS_WRITE_B128
-// DS_CONDXCHG32_RTN_B128
-// DS_READ_B96
-// DS_READ_B128
let SubtargetPredicate = isCIVI in {
-def DS_WRAP_RTN_F32 : DS_1A1D_RET <"ds_wrap_rtn_f32">,
- AtomicNoRet<"ds_wrap_f32", 1>;
+def DS_WRAP_RTN_B32 : DS_1A2D_RET<"ds_wrap_rtn_b32">, AtomicNoRet<"", 1>;
+
+def DS_CONDXCHG32_RTN_B64 : DS_1A1D_RET<"ds_condxchg32_rtn_b64", VReg_64>,
+ AtomicNoRet<"", 1>;
+
+def DS_GWS_SEMA_RELEASE_ALL : DS_GWS_0D<"ds_gws_sema_release_all">;
+
+let mayStore = 0 in {
+def DS_READ_B96 : DS_1A_RET<"ds_read_b96", VReg_96>;
+def DS_READ_B128: DS_1A_RET<"ds_read_b128", VReg_128>;
+} // End mayStore = 0
+
+let mayLoad = 0 in {
+def DS_WRITE_B96 : DS_1A1D_NORET<"ds_write_b96", VReg_96>;
+def DS_WRITE_B128 : DS_1A1D_NORET<"ds_write_b128", VReg_128>;
+} // End mayLoad = 0
+
+def DS_NOP : DS_VOID<"ds_nop">;
} // let SubtargetPredicate = isCIVI
@@ -623,6 +658,7 @@ def DS_CMPST_B32_si : DS_Real_si<0x10, DS_CMPST_B32>;
def DS_CMPST_F32_si : DS_Real_si<0x11, DS_CMPST_F32>;
def DS_MIN_F32_si : DS_Real_si<0x12, DS_MIN_F32>;
def DS_MAX_F32_si : DS_Real_si<0x13, DS_MAX_F32>;
+def DS_NOP_si : DS_Real_si<0x14, DS_NOP>;
def DS_GWS_INIT_si : DS_Real_si<0x19, DS_GWS_INIT>;
def DS_GWS_SEMA_V_si : DS_Real_si<0x1a, DS_GWS_SEMA_V>;
def DS_GWS_SEMA_BR_si : DS_Real_si<0x1b, DS_GWS_SEMA_BR>;
@@ -651,8 +687,10 @@ def DS_CMPST_RTN_F32_si : DS_Real_si<0x31, DS_CMPST_RTN_F32>;
def DS_MIN_RTN_F32_si : DS_Real_si<0x32, DS_MIN_RTN_F32>;
def DS_MAX_RTN_F32_si : DS_Real_si<0x33, DS_MAX_RTN_F32>;
-// FIXME: this instruction is actually CI/VI
-def DS_WRAP_RTN_F32_si : DS_Real_si<0x34, DS_WRAP_RTN_F32>;
+// These instructions are CI/VI only
+def DS_WRAP_RTN_B32_si : DS_Real_si<0x34, DS_WRAP_RTN_B32>;
+def DS_CONDXCHG32_RTN_B64_si : DS_Real_si<0x7e, DS_CONDXCHG32_RTN_B64>;
+def DS_GWS_SEMA_RELEASE_ALL_si : DS_Real_si<0x18, DS_GWS_SEMA_RELEASE_ALL>;
def DS_SWIZZLE_B32_si : DS_Real_si<0x35, DS_SWIZZLE_B32>;
def DS_READ_B32_si : DS_Real_si<0x36, DS_READ_B32>;
@@ -744,6 +782,10 @@ def DS_WRITE_SRC2_B64_si : DS_Real_si<0xcd, DS_WRITE_SRC2_B64>;
def DS_MIN_SRC2_F64_si : DS_Real_si<0xd2, DS_MIN_SRC2_F64>;
def DS_MAX_SRC2_F64_si : DS_Real_si<0xd3, DS_MAX_SRC2_F64>;
+def DS_WRITE_B96_si : DS_Real_si<0xde, DS_WRITE_B96>;
+def DS_WRITE_B128_si : DS_Real_si<0xdf, DS_WRITE_B128>;
+def DS_READ_B96_si : DS_Real_si<0xfe, DS_READ_B96>;
+def DS_READ_B128_si : DS_Real_si<0xff, DS_READ_B128>;
//===----------------------------------------------------------------------===//
// VIInstructions.td
@@ -787,12 +829,13 @@ def DS_CMPST_B32_vi : DS_Real_vi<0x10, DS_CMPST_B32>;
def DS_CMPST_F32_vi : DS_Real_vi<0x11, DS_CMPST_F32>;
def DS_MIN_F32_vi : DS_Real_vi<0x12, DS_MIN_F32>;
def DS_MAX_F32_vi : DS_Real_vi<0x13, DS_MAX_F32>;
+def DS_NOP_vi : DS_Real_vi<0x14, DS_NOP>;
def DS_ADD_F32_vi : DS_Real_vi<0x15, DS_ADD_F32>;
-def DS_GWS_INIT_vi : DS_Real_vi<0x19, DS_GWS_INIT>;
-def DS_GWS_SEMA_V_vi : DS_Real_vi<0x1a, DS_GWS_SEMA_V>;
-def DS_GWS_SEMA_BR_vi : DS_Real_vi<0x1b, DS_GWS_SEMA_BR>;
-def DS_GWS_SEMA_P_vi : DS_Real_vi<0x1c, DS_GWS_SEMA_P>;
-def DS_GWS_BARRIER_vi : DS_Real_vi<0x1d, DS_GWS_BARRIER>;
+def DS_GWS_INIT_vi : DS_Real_vi<0x99, DS_GWS_INIT>;
+def DS_GWS_SEMA_V_vi : DS_Real_vi<0x9a, DS_GWS_SEMA_V>;
+def DS_GWS_SEMA_BR_vi : DS_Real_vi<0x9b, DS_GWS_SEMA_BR>;
+def DS_GWS_SEMA_P_vi : DS_Real_vi<0x9c, DS_GWS_SEMA_P>;
+def DS_GWS_BARRIER_vi : DS_Real_vi<0x9d, DS_GWS_BARRIER>;
def DS_WRITE_B8_vi : DS_Real_vi<0x1e, DS_WRITE_B8>;
def DS_WRITE_B16_vi : DS_Real_vi<0x1f, DS_WRITE_B16>;
def DS_ADD_RTN_U32_vi : DS_Real_vi<0x20, DS_ADD_RTN_U32>;
@@ -815,7 +858,7 @@ def DS_CMPST_RTN_B32_vi : DS_Real_vi<0x30, DS_CMPST_RTN_B32>;
def DS_CMPST_RTN_F32_vi : DS_Real_vi<0x31, DS_CMPST_RTN_F32>;
def DS_MIN_RTN_F32_vi : DS_Real_vi<0x32, DS_MIN_RTN_F32>;
def DS_MAX_RTN_F32_vi : DS_Real_vi<0x33, DS_MAX_RTN_F32>;
-def DS_WRAP_RTN_F32_vi : DS_Real_vi<0x34, DS_WRAP_RTN_F32>;
+def DS_WRAP_RTN_B32_vi : DS_Real_vi<0x34, DS_WRAP_RTN_B32>;
def DS_ADD_RTN_F32_vi : DS_Real_vi<0x35, DS_ADD_RTN_F32>;
def DS_READ_B32_vi : DS_Real_vi<0x36, DS_READ_B32>;
def DS_READ2_B32_vi : DS_Real_vi<0x37, DS_READ2_B32>;
@@ -824,6 +867,9 @@ def DS_READ_I8_vi : DS_Real_vi<0x39, DS_READ_I8>;
def DS_READ_U8_vi : DS_Real_vi<0x3a, DS_READ_U8>;
def DS_READ_I16_vi : DS_Real_vi<0x3b, DS_READ_I16>;
def DS_READ_U16_vi : DS_Real_vi<0x3c, DS_READ_U16>;
+def DS_CONSUME_vi : DS_Real_vi<0xbd, DS_CONSUME>;
+def DS_APPEND_vi : DS_Real_vi<0xbe, DS_APPEND>;
+def DS_ORDERED_COUNT_vi : DS_Real_vi<0xbf, DS_ORDERED_COUNT>;
def DS_SWIZZLE_B32_vi : DS_Real_vi<0x3d, DS_SWIZZLE_B32>;
def DS_PERMUTE_B32_vi : DS_Real_vi<0x3e, DS_PERMUTE_B32>;
def DS_BPERMUTE_B32_vi : DS_Real_vi<0x3f, DS_BPERMUTE_B32>;
@@ -865,6 +911,8 @@ def DS_MSKOR_RTN_B64_vi : DS_Real_vi<0x6c, DS_MSKOR_RTN_B64>;
def DS_WRXCHG_RTN_B64_vi : DS_Real_vi<0x6d, DS_WRXCHG_RTN_B64>;
def DS_WRXCHG2_RTN_B64_vi : DS_Real_vi<0x6e, DS_WRXCHG2_RTN_B64>;
def DS_WRXCHG2ST64_RTN_B64_vi : DS_Real_vi<0x6f, DS_WRXCHG2ST64_RTN_B64>;
+def DS_CONDXCHG32_RTN_B64_vi : DS_Real_vi<0x7e, DS_CONDXCHG32_RTN_B64>;
+def DS_GWS_SEMA_RELEASE_ALL_vi : DS_Real_vi<0x98, DS_GWS_SEMA_RELEASE_ALL>;
def DS_CMPST_RTN_B64_vi : DS_Real_vi<0x70, DS_CMPST_RTN_B64>;
def DS_CMPST_RTN_F64_vi : DS_Real_vi<0x71, DS_CMPST_RTN_F64>;
def DS_MIN_RTN_F64_vi : DS_Real_vi<0x72, DS_MIN_RTN_F64>;
@@ -904,3 +952,7 @@ def DS_XOR_SRC2_B64_vi : DS_Real_vi<0xcb, DS_XOR_SRC2_B64>;
def DS_WRITE_SRC2_B64_vi : DS_Real_vi<0xcd, DS_WRITE_SRC2_B64>;
def DS_MIN_SRC2_F64_vi : DS_Real_vi<0xd2, DS_MIN_SRC2_F64>;
def DS_MAX_SRC2_F64_vi : DS_Real_vi<0xd3, DS_MAX_SRC2_F64>;
+def DS_WRITE_B96_vi : DS_Real_vi<0xde, DS_WRITE_B96>;
+def DS_WRITE_B128_vi : DS_Real_vi<0xdf, DS_WRITE_B128>;
+def DS_READ_B96_vi : DS_Real_vi<0xfe, DS_READ_B96>;
+def DS_READ_B128_vi : DS_Real_vi<0xff, DS_READ_B128>;
diff --git a/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.cpp b/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.cpp
index 2247cad7bb51..4fb03b62bba9 100644
--- a/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.cpp
+++ b/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.cpp
@@ -22,6 +22,7 @@
#include "AMDGPURegisterInfo.h"
#include "SIDefines.h"
#include "Utils/AMDGPUBaseInfo.h"
+#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
#include "llvm/MC/MCContext.h"
#include "llvm/MC/MCFixedLenDisassembler.h"
@@ -97,9 +98,13 @@ static DecodeStatus decodeOperand_VSrc16(MCInst &Inst,
return addOperand(Inst, DAsm->decodeOperand_VSrc16(Imm));
}
-#define GET_SUBTARGETINFO_ENUM
-#include "AMDGPUGenSubtargetInfo.inc"
-#undef GET_SUBTARGETINFO_ENUM
+static DecodeStatus decodeOperand_VSrcV216(MCInst &Inst,
+ unsigned Imm,
+ uint64_t Addr,
+ const void *Decoder) {
+ auto DAsm = static_cast<const AMDGPUDisassembler*>(Decoder);
+ return addOperand(Inst, DAsm->decodeOperand_VSrcV216(Imm));
+}
#include "AMDGPUGenDisassemblerTables.inc"
@@ -138,7 +143,8 @@ DecodeStatus AMDGPUDisassembler::getInstruction(MCInst &MI, uint64_t &Size,
CommentStream = &CS;
// ToDo: AMDGPUDisassembler supports only VI ISA.
- assert(AMDGPU::isVI(STI) && "Can disassemble only VI ISA.");
+ if (!STI.getFeatureBits()[AMDGPU::FeatureGCN3Encoding])
+ report_fatal_error("Disassembly not yet supported for subtarget");
const unsigned MaxInstBytesNum = (std::min)((size_t)8, Bytes_.size());
Bytes = Bytes_.slice(0, MaxInstBytesNum);
@@ -179,6 +185,17 @@ DecodeStatus AMDGPUDisassembler::getInstruction(MCInst &MI, uint64_t &Size,
Res = tryDecodeInst(DecoderTableAMDGPU64, MI, QW, Address);
} while (false);
+ if (Res && (MI.getOpcode() == AMDGPU::V_MAC_F32_e64_vi ||
+ MI.getOpcode() == AMDGPU::V_MAC_F32_e64_si ||
+ MI.getOpcode() == AMDGPU::V_MAC_F16_e64_vi)) {
+ // Insert dummy unused src2_modifiers.
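+      // The src2_modifiers operand is not encoded for these opcodes, but the
+      // MCInst operand list expects it at its named operand index.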
+ int Src2ModIdx = AMDGPU::getNamedOperandIdx(MI.getOpcode(),
+ AMDGPU::OpName::src2_modifiers);
+ auto I = MI.begin();
+ std::advance(I, Src2ModIdx);
+ MI.insert(I, MCOperand::createImm(0));
+ }
+
Size = Res ? (MaxInstBytesNum - Bytes.size()) : 0;
return Res;
}
@@ -263,6 +280,10 @@ MCOperand AMDGPUDisassembler::decodeOperand_VSrc16(unsigned Val) const {
return decodeSrcOp(OPW16, Val);
}
+MCOperand AMDGPUDisassembler::decodeOperand_VSrcV216(unsigned Val) const {
+ return decodeSrcOp(OPWV216, Val);
+}
+
MCOperand AMDGPUDisassembler::decodeOperand_VGPR_32(unsigned Val) const {
// Some instructions have operand restrictions beyond what the encoding
// allows. Some ordinarily VSrc_32 operands are VGPR_32, so clear the extra
@@ -423,6 +444,7 @@ MCOperand AMDGPUDisassembler::decodeFPImmed(OpWidthTy Width, unsigned Imm) {
case OPW64:
return MCOperand::createImm(getInlineImmVal64(Imm));
case OPW16:
+ case OPWV216:
return MCOperand::createImm(getInlineImmVal16(Imm));
default:
llvm_unreachable("implement me");
@@ -436,6 +458,7 @@ unsigned AMDGPUDisassembler::getVgprClassId(const OpWidthTy Width) const {
default: // fall
case OPW32:
case OPW16:
+ case OPWV216:
return VGPR_32RegClassID;
case OPW64: return VReg_64RegClassID;
case OPW128: return VReg_128RegClassID;
@@ -449,6 +472,7 @@ unsigned AMDGPUDisassembler::getSgprClassId(const OpWidthTy Width) const {
default: // fall
case OPW32:
case OPW16:
+ case OPWV216:
return SGPR_32RegClassID;
case OPW64: return SGPR_64RegClassID;
case OPW128: return SGPR_128RegClassID;
@@ -462,6 +486,7 @@ unsigned AMDGPUDisassembler::getTtmpClassId(const OpWidthTy Width) const {
default: // fall
case OPW32:
case OPW16:
+ case OPWV216:
return TTMP_32RegClassID;
case OPW64: return TTMP_64RegClassID;
case OPW128: return TTMP_128RegClassID;
@@ -497,6 +522,7 @@ MCOperand AMDGPUDisassembler::decodeSrcOp(const OpWidthTy Width, unsigned Val) c
switch (Width) {
case OPW32:
case OPW16:
+ case OPWV216:
return decodeSpecialReg32(Val);
case OPW64:
return decodeSpecialReg64(Val);
@@ -522,6 +548,11 @@ MCOperand AMDGPUDisassembler::decodeSpecialReg32(unsigned Val) const {
case 124: return createRegOperand(M0);
case 126: return createRegOperand(EXEC_LO);
case 127: return createRegOperand(EXEC_HI);
+ case 235: return createRegOperand(SRC_SHARED_BASE);
+ case 236: return createRegOperand(SRC_SHARED_LIMIT);
+ case 237: return createRegOperand(SRC_PRIVATE_BASE);
+ case 238: return createRegOperand(SRC_PRIVATE_LIMIT);
+ // TODO: SRC_POPS_EXITING_WAVE_ID
// ToDo: no support for vccz register
case 251: break;
// ToDo: no support for execz register
diff --git a/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.h b/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.h
index ee5883a984e0..d50665187e10 100644
--- a/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.h
+++ b/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.h
@@ -67,6 +67,7 @@ public:
MCOperand decodeOperand_VS_32(unsigned Val) const;
MCOperand decodeOperand_VS_64(unsigned Val) const;
MCOperand decodeOperand_VSrc16(unsigned Val) const;
+ MCOperand decodeOperand_VSrcV216(unsigned Val) const;
MCOperand decodeOperand_VReg_64(unsigned Val) const;
MCOperand decodeOperand_VReg_96(unsigned Val) const;
@@ -85,6 +86,7 @@ public:
OPW64,
OPW128,
OPW16,
+ OPWV216,
OPW_LAST_,
OPW_FIRST_ = OPW32
};
diff --git a/lib/Target/AMDGPU/EvergreenInstructions.td b/lib/Target/AMDGPU/EvergreenInstructions.td
index 48c6592ca5b2..5480110d8315 100644
--- a/lib/Target/AMDGPU/EvergreenInstructions.td
+++ b/lib/Target/AMDGPU/EvergreenInstructions.td
@@ -35,28 +35,59 @@ class CF_MEM_RAT_CACHELESS <bits<6> rat_inst, bits<4> rat_id, bits<4> mask, dag
: EG_CF_RAT <0x57, rat_inst, rat_id, mask, (outs), ins,
"MEM_RAT_CACHELESS "#name, pattern>;
-class CF_MEM_RAT <bits<6> rat_inst, bits<4> rat_id, dag ins, string name,
- list<dag> pattern>
- : EG_CF_RAT <0x56, rat_inst, rat_id, 0xf /* mask */, (outs), ins,
+class CF_MEM_RAT <bits<6> rat_inst, bits<4> rat_id, bits<4> mask, dag ins,
+ dag outs, string name, list<dag> pattern>
+ : EG_CF_RAT <0x56, rat_inst, rat_id, mask, outs, ins,
"MEM_RAT "#name, pattern>;
class CF_MEM_RAT_STORE_TYPED<bits<1> has_eop>
- : CF_MEM_RAT <0x1, ?, (ins R600_Reg128:$rw_gpr, R600_Reg128:$index_gpr,
- i32imm:$rat_id, InstFlag:$eop),
+ : CF_MEM_RAT <0x1, ?, 0xf, (ins R600_Reg128:$rw_gpr, R600_Reg128:$index_gpr,
+ i32imm:$rat_id, InstFlag:$eop), (outs),
"STORE_TYPED RAT($rat_id) $rw_gpr, $index_gpr"
#!if(has_eop, ", $eop", ""),
[(int_r600_rat_store_typed R600_Reg128:$rw_gpr,
R600_Reg128:$index_gpr,
(i32 imm:$rat_id))]>;
-def RAT_MSKOR : CF_MEM_RAT <0x11, 0,
- (ins R600_Reg128:$rw_gpr, R600_TReg32_X:$index_gpr),
+def RAT_MSKOR : CF_MEM_RAT <0x11, 0, 0xf,
+ (ins R600_Reg128:$rw_gpr, R600_TReg32_X:$index_gpr), (outs),
"MSKOR $rw_gpr.XW, $index_gpr",
[(mskor_global v4i32:$rw_gpr, i32:$index_gpr)]
> {
let eop = 0;
}
+
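+// Each RAT atomic comes in a returning flavor (_RTN, old value written to
+// $out_gpr) and a discarding one (_NORET); both tie $rw_gpr to $out_gpr.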
+multiclass RAT_ATOMIC<bits<6> op_ret, bits<6> op_noret, string name> {
+ let Constraints = "$rw_gpr = $out_gpr", eop = 0, mayStore = 1 in {
+ def _RTN: CF_MEM_RAT <op_ret, 0, 0xf,
+ (ins R600_Reg128:$rw_gpr, R600_TReg32_X:$index_gpr),
+ (outs R600_Reg128:$out_gpr),
+ name ## "_RTN" ## " $rw_gpr, $index_gpr", [] >;
+ def _NORET: CF_MEM_RAT <op_noret, 0, 0xf,
+ (ins R600_Reg128:$rw_gpr, R600_TReg32_X:$index_gpr),
+ (outs R600_Reg128:$out_gpr),
+ name ## " $rw_gpr, $index_gpr", [] >;
+ }
+}
+
+// Swap no-ret is just a store. A raw store to a cached target
+// can only store a dword, which exactly matches swap_no_ret.
+defm RAT_ATOMIC_XCHG_INT : RAT_ATOMIC<1, 34, "ATOMIC_XCHG_INT">;
+defm RAT_ATOMIC_CMPXCHG_INT : RAT_ATOMIC<4, 36, "ATOMIC_CMPXCHG_INT">;
+defm RAT_ATOMIC_ADD : RAT_ATOMIC<7, 39, "ATOMIC_ADD">;
+defm RAT_ATOMIC_SUB : RAT_ATOMIC<8, 40, "ATOMIC_SUB">;
+defm RAT_ATOMIC_RSUB : RAT_ATOMIC<9, 41, "ATOMIC_RSUB">;
+defm RAT_ATOMIC_MIN_INT : RAT_ATOMIC<10, 42, "ATOMIC_MIN_INT">;
+defm RAT_ATOMIC_MIN_UINT : RAT_ATOMIC<11, 43, "ATOMIC_MIN_UINT">;
+defm RAT_ATOMIC_MAX_INT : RAT_ATOMIC<12, 44, "ATOMIC_MAX_INT">;
+defm RAT_ATOMIC_MAX_UINT : RAT_ATOMIC<13, 45, "ATOMIC_MAX_UINT">;
+defm RAT_ATOMIC_AND : RAT_ATOMIC<14, 46, "ATOMIC_AND">;
+defm RAT_ATOMIC_OR : RAT_ATOMIC<15, 47, "ATOMIC_OR">;
+defm RAT_ATOMIC_XOR : RAT_ATOMIC<16, 48, "ATOMIC_XOR">;
+defm RAT_ATOMIC_INC_UINT : RAT_ATOMIC<18, 50, "ATOMIC_INC_UINT">;
+defm RAT_ATOMIC_DEC_UINT : RAT_ATOMIC<19, 51, "ATOMIC_DEC_UINT">;
+
} // End let Predicates = [isEGorCayman]
//===----------------------------------------------------------------------===//
@@ -257,6 +288,76 @@ def : Pat<(v4i32:$dst_gpr (vtx_id1_load ADDRVTX_READ:$src_gpr)),
let Predicates = [isEGorCayman] in {
+multiclass AtomicPat<Instruction inst_ret, Instruction inst_noret,
+ SDPatternOperator node_ret, SDPatternOperator node_noret> {
+  // FIXME: Add _RTN version. We need a per-WI scratch location to store the
+  // old value.
+  // EXTRACT_SUBREG here is a dummy; we know the node has no uses.
+ def : Pat<(i32 (node_noret i32:$ptr, i32:$data)),
+ (EXTRACT_SUBREG (inst_noret
+ (INSERT_SUBREG (v4i32 (IMPLICIT_DEF)), $data, sub0), $ptr), sub1)>;
+}
+multiclass AtomicIncDecPat<Instruction inst_ret, Instruction inst_noret,
+ SDPatternOperator node_ret, SDPatternOperator node_noret, int C> {
+  // FIXME: Add _RTN version. We need a per-WI scratch location to store the
+  // old value.
+  // EXTRACT_SUBREG here is a dummy; we know the node has no uses.
+ def : Pat<(i32 (node_noret i32:$ptr, C)),
+ (EXTRACT_SUBREG (inst_noret
+ (INSERT_SUBREG (v4i32 (IMPLICIT_DEF)), (MOV_IMM_I32 -1), sub0), $ptr), sub1)>;
+}
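+// Note: assuming the usual RAT inc/dec wrap-at-data semantics, the all-ones
+// (-1) data operand makes INC/DEC behave as a plain unsigned +1/-1, which is
+// why an add/sub of the constant C can be lowered to them.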
+
+// The CMPSWAP pattern is special.
+// EXTRACT_SUBREG here is a dummy; we know the node has no uses.
+// FIXME: Add _RTN version. We need a per-WI scratch location to store the
+// old value.
+def : Pat<(i32 (atomic_cmp_swap_global_noret i32:$ptr, i32:$cmp, i32:$data)),
+ (EXTRACT_SUBREG (RAT_ATOMIC_CMPXCHG_INT_NORET
+ (INSERT_SUBREG
+ (INSERT_SUBREG (v4i32 (IMPLICIT_DEF)), $cmp, sub3),
+ $data, sub0),
+ $ptr), sub1)>;
+
+defm AtomicSwapPat : AtomicPat <RAT_ATOMIC_XCHG_INT_RTN,
+ RAT_ATOMIC_XCHG_INT_NORET,
+ atomic_swap_global_ret,
+ atomic_swap_global_noret>;
+defm AtomicAddPat : AtomicPat <RAT_ATOMIC_ADD_RTN, RAT_ATOMIC_ADD_NORET,
+ atomic_add_global_ret, atomic_add_global_noret>;
+defm AtomicSubPat : AtomicPat <RAT_ATOMIC_SUB_RTN, RAT_ATOMIC_SUB_NORET,
+ atomic_sub_global_ret, atomic_sub_global_noret>;
+defm AtomicMinPat : AtomicPat <RAT_ATOMIC_MIN_INT_RTN,
+ RAT_ATOMIC_MIN_INT_NORET,
+ atomic_min_global_ret, atomic_min_global_noret>;
+defm AtomicUMinPat : AtomicPat <RAT_ATOMIC_MIN_UINT_RTN,
+ RAT_ATOMIC_MIN_UINT_NORET,
+ atomic_umin_global_ret, atomic_umin_global_noret>;
+defm AtomicMaxPat : AtomicPat <RAT_ATOMIC_MAX_INT_RTN,
+ RAT_ATOMIC_MAX_INT_NORET,
+ atomic_max_global_ret, atomic_max_global_noret>;
+defm AtomicUMaxPat : AtomicPat <RAT_ATOMIC_MAX_UINT_RTN,
+ RAT_ATOMIC_MAX_UINT_NORET,
+ atomic_umax_global_ret, atomic_umax_global_noret>;
+defm AtomicAndPat : AtomicPat <RAT_ATOMIC_AND_RTN, RAT_ATOMIC_AND_NORET,
+ atomic_and_global_ret, atomic_and_global_noret>;
+defm AtomicOrPat : AtomicPat <RAT_ATOMIC_OR_RTN, RAT_ATOMIC_OR_NORET,
+ atomic_or_global_ret, atomic_or_global_noret>;
+defm AtomicXorPat : AtomicPat <RAT_ATOMIC_XOR_RTN, RAT_ATOMIC_XOR_NORET,
+ atomic_xor_global_ret, atomic_xor_global_noret>;
+defm AtomicIncAddPat : AtomicIncDecPat <RAT_ATOMIC_INC_UINT_RTN,
+ RAT_ATOMIC_INC_UINT_NORET,
+ atomic_add_global_ret,
+ atomic_add_global_noret, 1>;
+defm AtomicIncSubPat : AtomicIncDecPat <RAT_ATOMIC_INC_UINT_RTN,
+ RAT_ATOMIC_INC_UINT_NORET,
+ atomic_sub_global_ret,
+ atomic_sub_global_noret, -1>;
+defm AtomicDecAddPat : AtomicIncDecPat <RAT_ATOMIC_DEC_UINT_RTN,
+ RAT_ATOMIC_DEC_UINT_NORET,
+ atomic_add_global_ret,
+ atomic_add_global_noret, -1>;
+defm AtomicDecSubPat : AtomicIncDecPat <RAT_ATOMIC_DEC_UINT_RTN,
+ RAT_ATOMIC_DEC_UINT_NORET,
+ atomic_sub_global_ret,
+ atomic_sub_global_noret, 1>;
+
// Should be predicated on FeatureFP64
// def FMA_64 : R600_3OP <
// 0xA, "FMA_64",
@@ -287,7 +388,7 @@ def BFE_INT_eg : R600_3OP <0x5, "BFE_INT",
VecALU
>;
-def : BFEPattern <BFE_UINT_eg, MOV_IMM_I32>;
+defm : BFEPattern <BFE_UINT_eg, BFE_INT_eg, MOV_IMM_I32>;
def BFI_INT_eg : R600_3OP <0x06, "BFI_INT",
[(set i32:$dst, (AMDGPUbfi i32:$src0, i32:$src1, i32:$src2))],
@@ -337,7 +438,7 @@ defm CUBE_eg : CUBE_Common<0xC0>;
def ADDC_UINT : R600_2OP_Helper <0x52, "ADDC_UINT", AMDGPUcarry>;
def SUBB_UINT : R600_2OP_Helper <0x53, "SUBB_UINT", AMDGPUborrow>;
-def FLT32_TO_FLT16 : R600_1OP_Helper <0xA2, "FLT32_TO_FLT16", fp_to_f16, VecALU>;
+def FLT32_TO_FLT16 : R600_1OP_Helper <0xA2, "FLT32_TO_FLT16", AMDGPUfp_to_f16, VecALU>;
def FLT16_TO_FLT32 : R600_1OP_Helper <0xA3, "FLT16_TO_FLT32", f16_to_fp, VecALU>;
def BCNT_INT : R600_1OP_Helper <0xAA, "BCNT_INT", ctpop, VecALU>;
def FFBH_UINT : R600_1OP_Helper <0xAB, "FFBH_UINT", AMDGPUffbh_u32, VecALU>;
diff --git a/lib/Target/AMDGPU/FLATInstructions.td b/lib/Target/AMDGPU/FLATInstructions.td
index 849fb8ad50f5..b0ac0e689a0b 100644
--- a/lib/Target/AMDGPU/FLATInstructions.td
+++ b/lib/Target/AMDGPU/FLATInstructions.td
@@ -136,7 +136,7 @@ multiclass FLAT_Atomic_Pseudo<
class flat_binary_atomic_op<SDNode atomic_op> : PatFrag<
(ops node:$ptr, node:$value),
(atomic_op node:$ptr, node:$value),
- [{return cast<MemSDNode>(N)->getAddressSpace() == AMDGPUAS::FLAT_ADDRESS;}]
+ [{return cast<MemSDNode>(N)->getAddressSpace() == AMDGPUASI.FLAT_ADDRESS;}]
>;
def atomic_cmp_swap_flat : flat_binary_atomic_op<AMDGPUatomic_cmp_swap>;
@@ -284,16 +284,16 @@ defm FLAT_ATOMIC_FMAX_X2 : FLAT_Atomic_Pseudo <"flat_atomic_fmax_x2",
class flat_ld <SDPatternOperator ld> : PatFrag<(ops node:$ptr),
(ld node:$ptr), [{
auto const AS = cast<MemSDNode>(N)->getAddressSpace();
- return AS == AMDGPUAS::FLAT_ADDRESS ||
- AS == AMDGPUAS::GLOBAL_ADDRESS ||
- AS == AMDGPUAS::CONSTANT_ADDRESS;
+ return AS == AMDGPUASI.FLAT_ADDRESS ||
+ AS == AMDGPUASI.GLOBAL_ADDRESS ||
+ AS == AMDGPUASI.CONSTANT_ADDRESS;
}]>;
class flat_st <SDPatternOperator st> : PatFrag<(ops node:$val, node:$ptr),
(st node:$val, node:$ptr), [{
auto const AS = cast<MemSDNode>(N)->getAddressSpace();
- return AS == AMDGPUAS::FLAT_ADDRESS ||
- AS == AMDGPUAS::GLOBAL_ADDRESS;
+ return AS == AMDGPUASI.FLAT_ADDRESS ||
+ AS == AMDGPUASI.GLOBAL_ADDRESS;
}]>;
def atomic_flat_load : flat_ld <atomic_load>;
diff --git a/lib/Target/AMDGPU/GCNHazardRecognizer.cpp b/lib/Target/AMDGPU/GCNHazardRecognizer.cpp
index dd3b46f13921..80fc4ac9d2a3 100644
--- a/lib/Target/AMDGPU/GCNHazardRecognizer.cpp
+++ b/lib/Target/AMDGPU/GCNHazardRecognizer.cpp
@@ -11,11 +11,24 @@
//
//===----------------------------------------------------------------------===//
-#include "GCNHazardRecognizer.h"
#include "AMDGPUSubtarget.h"
+#include "GCNHazardRecognizer.h"
+#include "SIDefines.h"
#include "SIInstrInfo.h"
+#include "SIRegisterInfo.h"
+#include "Utils/AMDGPUBaseInfo.h"
+#include "llvm/ADT/iterator_range.h"
+#include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/CodeGen/MachineInstr.h"
+#include "llvm/CodeGen/MachineOperand.h"
#include "llvm/CodeGen/ScheduleDAG.h"
-#include "llvm/Support/Debug.h"
+#include "llvm/MC/MCInstrDesc.h"
+#include "llvm/Support/ErrorHandling.h"
+#include <algorithm>
+#include <cassert>
+#include <limits>
+#include <set>
+#include <vector>
using namespace llvm;
@@ -26,7 +39,8 @@ using namespace llvm;
GCNHazardRecognizer::GCNHazardRecognizer(const MachineFunction &MF) :
CurrCycleInstr(nullptr),
MF(MF),
- ST(MF.getSubtarget<SISubtarget>()) {
+ ST(MF.getSubtarget<SISubtarget>()),
+ TII(*ST.getInstrInfo()) {
MaxLookAhead = 5;
}
@@ -58,8 +72,19 @@ static bool isRFE(unsigned Opcode) {
return Opcode == AMDGPU::S_RFE_B64;
}
-static unsigned getHWReg(const SIInstrInfo *TII, const MachineInstr &RegInstr) {
+static bool isSMovRel(unsigned Opcode) {
+ switch (Opcode) {
+ case AMDGPU::S_MOVRELS_B32:
+ case AMDGPU::S_MOVRELS_B64:
+ case AMDGPU::S_MOVRELD_B32:
+ case AMDGPU::S_MOVRELD_B64:
+ return true;
+ default:
+ return false;
+ }
+}
+
+static unsigned getHWReg(const SIInstrInfo *TII, const MachineInstr &RegInstr) {
const MachineOperand *RegOp = TII->getNamedOperand(RegInstr,
AMDGPU::OpName::simm16);
return RegOp->getImm() & AMDGPU::Hwreg::ID_MASK_;
@@ -96,6 +121,13 @@ GCNHazardRecognizer::getHazardType(SUnit *SU, int Stalls) {
if (isRFE(MI->getOpcode()) && checkRFEHazards(MI) > 0)
return NoopHazard;
+ if ((TII.isVINTRP(*MI) || isSMovRel(MI->getOpcode())) &&
+ checkReadM0Hazards(MI) > 0)
+ return NoopHazard;
+
+ if (checkAnyInstHazards(MI) > 0)
+ return NoopHazard;
+
return NoHazard;
}
@@ -104,11 +136,13 @@ unsigned GCNHazardRecognizer::PreEmitNoops(SUnit *SU) {
}
unsigned GCNHazardRecognizer::PreEmitNoops(MachineInstr *MI) {
+ int WaitStates = std::max(0, checkAnyInstHazards(MI));
+
if (SIInstrInfo::isSMRD(*MI))
- return std::max(0, checkSMRDHazards(MI));
+ return std::max(WaitStates, checkSMRDHazards(MI));
if (SIInstrInfo::isVALU(*MI)) {
- int WaitStates = std::max(0, checkVALUHazards(MI));
+ WaitStates = std::max(WaitStates, checkVALUHazards(MI));
if (SIInstrInfo::isVMEM(*MI))
WaitStates = std::max(WaitStates, checkVMEMHazards(MI));
@@ -122,19 +156,25 @@ unsigned GCNHazardRecognizer::PreEmitNoops(MachineInstr *MI) {
if (isRWLane(MI->getOpcode()))
WaitStates = std::max(WaitStates, checkRWLaneHazards(MI));
+ if (TII.isVINTRP(*MI))
+ WaitStates = std::max(WaitStates, checkReadM0Hazards(MI));
+
return WaitStates;
}
if (isSGetReg(MI->getOpcode()))
- return std::max(0, checkGetRegHazards(MI));
+ return std::max(WaitStates, checkGetRegHazards(MI));
if (isSSetReg(MI->getOpcode()))
- return std::max(0, checkSetRegHazards(MI));
+ return std::max(WaitStates, checkSetRegHazards(MI));
if (isRFE(MI->getOpcode()))
- return std::max(0, checkRFEHazards(MI));
+ return std::max(WaitStates, checkRFEHazards(MI));
- return 0;
+ if (TII.isVINTRP(*MI) || isSMovRel(MI->getOpcode()))
+ return std::max(WaitStates, checkReadM0Hazards(MI));
+
+ return WaitStates;
}
void GCNHazardRecognizer::EmitNoop() {
@@ -142,14 +182,12 @@ void GCNHazardRecognizer::EmitNoop() {
}
void GCNHazardRecognizer::AdvanceCycle() {
-
// When the scheduler detects a stall, it will call AdvanceCycle() without
// emitting any instructions.
if (!CurrCycleInstr)
return;
- const SIInstrInfo *TII = ST.getInstrInfo();
- unsigned NumWaitStates = TII->getNumWaitStates(*CurrCycleInstr);
+ unsigned NumWaitStates = TII.getNumWaitStates(*CurrCycleInstr);
// Keep track of emitted instructions
EmittedInstrs.push_front(CurrCycleInstr);
@@ -180,7 +218,6 @@ void GCNHazardRecognizer::RecedeCycle() {
int GCNHazardRecognizer::getWaitStatesSince(
function_ref<bool(MachineInstr *)> IsHazard) {
-
int WaitStates = -1;
for (MachineInstr *MI : EmittedInstrs) {
++WaitStates;
@@ -204,7 +241,6 @@ int GCNHazardRecognizer::getWaitStatesSinceDef(
int GCNHazardRecognizer::getWaitStatesSinceSetReg(
function_ref<bool(MachineInstr *)> IsHazard) {
-
auto IsHazardFn = [IsHazard] (MachineInstr *MI) {
return isSSetReg(MI->getOpcode()) && IsHazard(MI);
};
@@ -281,7 +317,6 @@ int GCNHazardRecognizer::checkSMEMSoftClauseHazards(MachineInstr *SMEM) {
int GCNHazardRecognizer::checkSMRDHazards(MachineInstr *SMRD) {
const SISubtarget &ST = MF.getSubtarget<SISubtarget>();
- const SIInstrInfo *TII = ST.getInstrInfo();
int WaitStatesNeeded = 0;
WaitStatesNeeded = checkSMEMSoftClauseHazards(SMRD);
@@ -293,7 +328,7 @@ int GCNHazardRecognizer::checkSMRDHazards(MachineInstr *SMRD) {
// A read of an SGPR by SMRD instruction requires 4 wait states when the
// SGPR was written by a VALU instruction.
int SmrdSgprWaitStates = 4;
- auto IsHazardDefFn = [TII] (MachineInstr *MI) { return TII->isVALU(*MI); };
+ auto IsHazardDefFn = [this] (MachineInstr *MI) { return TII.isVALU(*MI); };
for (const MachineOperand &Use : SMRD->uses()) {
if (!Use.isReg())
@@ -486,7 +521,6 @@ int GCNHazardRecognizer::checkRWLaneHazards(MachineInstr *RWLane) {
}
int GCNHazardRecognizer::checkRFEHazards(MachineInstr *RFE) {
-
if (ST.getGeneration() < AMDGPUSubtarget::VOLCANIC_ISLANDS)
return 0;
@@ -500,3 +534,42 @@ int GCNHazardRecognizer::checkRFEHazards(MachineInstr *RFE) {
int WaitStatesNeeded = getWaitStatesSinceSetReg(IsHazardFn);
return RFEWaitStates - WaitStatesNeeded;
}
+
+int GCNHazardRecognizer::checkAnyInstHazards(MachineInstr *MI) {
+ if (MI->isDebugValue())
+ return 0;
+
+ const SIRegisterInfo *TRI = ST.getRegisterInfo();
+ if (!ST.hasSMovFedHazard())
+ return 0;
+
+ // Check for any instruction reading an SGPR after a write from
+ // s_mov_fed_b32.
+ int MovFedWaitStates = 1;
+ int WaitStatesNeeded = 0;
+
+ for (const MachineOperand &Use : MI->uses()) {
+ if (!Use.isReg() || TRI->isVGPR(MF.getRegInfo(), Use.getReg()))
+ continue;
+ auto IsHazardFn = [] (MachineInstr *MI) {
+ return MI->getOpcode() == AMDGPU::S_MOV_FED_B32;
+ };
+ int WaitStatesNeededForUse =
+ MovFedWaitStates - getWaitStatesSinceDef(Use.getReg(), IsHazardFn);
+ WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
+ }
+
+ return WaitStatesNeeded;
+}
+
+int GCNHazardRecognizer::checkReadM0Hazards(MachineInstr *MI) {
+ if (!ST.hasReadM0Hazard())
+ return 0;
+
+ const SIInstrInfo *TII = ST.getInstrInfo();
+ int SMovRelWaitStates = 1;
+ auto IsHazardFn = [TII] (MachineInstr *MI) {
+ return TII->isSALU(*MI);
+ };
+ return SMovRelWaitStates - getWaitStatesSinceDef(AMDGPU::M0, IsHazardFn);
+}
diff --git a/lib/Target/AMDGPU/GCNHazardRecognizer.h b/lib/Target/AMDGPU/GCNHazardRecognizer.h
index 0ab82ff4635b..5680c3de6a1a 100644
--- a/lib/Target/AMDGPU/GCNHazardRecognizer.h
+++ b/lib/Target/AMDGPU/GCNHazardRecognizer.h
@@ -34,6 +34,7 @@ class GCNHazardRecognizer final : public ScheduleHazardRecognizer {
std::list<MachineInstr*> EmittedInstrs;
const MachineFunction &MF;
const SISubtarget &ST;
+ const SIInstrInfo &TII;
int getWaitStatesSince(function_ref<bool(MachineInstr *)> IsHazard);
int getWaitStatesSinceDef(unsigned Reg,
@@ -52,6 +53,8 @@ class GCNHazardRecognizer final : public ScheduleHazardRecognizer {
int checkVALUHazards(MachineInstr *VALU);
int checkRWLaneHazards(MachineInstr *RWLane);
int checkRFEHazards(MachineInstr *RFE);
+ int checkAnyInstHazards(MachineInstr *MI);
+ int checkReadM0Hazards(MachineInstr *SMovRel);
public:
GCNHazardRecognizer(const MachineFunction &MF);
// We can only issue one instruction per cycle.
diff --git a/lib/Target/AMDGPU/GCNIterativeScheduler.cpp b/lib/Target/AMDGPU/GCNIterativeScheduler.cpp
new file mode 100644
index 000000000000..3bb5c9bc22b7
--- /dev/null
+++ b/lib/Target/AMDGPU/GCNIterativeScheduler.cpp
@@ -0,0 +1,528 @@
+//===--------------------- GCNIterativeScheduler.cpp - --------------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+/// \file
+//
+//===----------------------------------------------------------------------===//
+
+#include "GCNIterativeScheduler.h"
+#include "GCNSchedStrategy.h"
+#include "SIMachineFunctionInfo.h"
+
+using namespace llvm;
+
+#define DEBUG_TYPE "misched"
+
+namespace llvm {
+ std::vector<const SUnit*> makeMinRegSchedule(ArrayRef<const SUnit*> TopRoots,
+ const ScheduleDAG &DAG);
+}
+
+// shim accessors for different order containers
+static inline MachineInstr *getMachineInstr(MachineInstr *MI) {
+ return MI;
+}
+static inline MachineInstr *getMachineInstr(const SUnit *SU) {
+ return SU->getInstr();
+}
+static inline MachineInstr *getMachineInstr(const SUnit &SU) {
+ return SU.getInstr();
+}
+
+#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
+LLVM_DUMP_METHOD
+static void printRegion(raw_ostream &OS,
+ MachineBasicBlock::iterator Begin,
+ MachineBasicBlock::iterator End,
+ const LiveIntervals *LIS,
+ unsigned MaxInstNum =
+ std::numeric_limits<unsigned>::max()) {
+ auto BB = Begin->getParent();
+ OS << BB->getParent()->getName() << ":BB#" << BB->getNumber()
+ << ' ' << BB->getName() << ":\n";
+ auto I = Begin;
+ MaxInstNum = std::max(MaxInstNum, 1u);
+ for (; I != End && MaxInstNum; ++I, --MaxInstNum) {
+ if (!I->isDebugValue() && LIS)
+ OS << LIS->getInstructionIndex(*I);
+ OS << '\t' << *I;
+ }
+ if (I != End) {
+ OS << "\t...\n";
+ I = std::prev(End);
+ if (!I->isDebugValue() && LIS)
+ OS << LIS->getInstructionIndex(*I);
+ OS << '\t' << *I;
+ }
+ if (End != BB->end()) { // print boundary inst if present
+ OS << "----\n";
+ if (LIS) OS << LIS->getInstructionIndex(*End) << '\t';
+ OS << *End;
+ }
+}
+
+LLVM_DUMP_METHOD
+static void printLivenessInfo(raw_ostream &OS,
+ MachineBasicBlock::iterator Begin,
+ MachineBasicBlock::iterator End,
+ const LiveIntervals *LIS) {
+ const auto BB = Begin->getParent();
+ const auto &MRI = BB->getParent()->getRegInfo();
+
+ const auto LiveIns = getLiveRegsBefore(*Begin, *LIS);
+ OS << "LIn RP: ";
+ getRegPressure(MRI, LiveIns).print(OS);
+
+ const auto BottomMI = End == BB->end() ? std::prev(End) : End;
+ const auto LiveOuts = getLiveRegsAfter(*BottomMI, *LIS);
+ OS << "LOt RP: ";
+ getRegPressure(MRI, LiveOuts).print(OS);
+}
+
+LLVM_DUMP_METHOD
+void GCNIterativeScheduler::printRegions(raw_ostream &OS) const {
+ const auto &ST = MF.getSubtarget<SISubtarget>();
+ for (const auto R : Regions) {
+ OS << "Region to schedule ";
+ printRegion(OS, R->Begin, R->End, LIS, 1);
+ printLivenessInfo(OS, R->Begin, R->End, LIS);
+ OS << "Max RP: ";
+ R->MaxPressure.print(OS, &ST);
+ }
+}
+
+LLVM_DUMP_METHOD
+void GCNIterativeScheduler::printSchedResult(raw_ostream &OS,
+ const Region *R,
+ const GCNRegPressure &RP) const {
+ OS << "\nAfter scheduling ";
+ printRegion(OS, R->Begin, R->End, LIS);
+ printSchedRP(OS, R->MaxPressure, RP);
+ OS << '\n';
+}
+
+LLVM_DUMP_METHOD
+void GCNIterativeScheduler::printSchedRP(raw_ostream &OS,
+ const GCNRegPressure &Before,
+ const GCNRegPressure &After) const {
+ const auto &ST = MF.getSubtarget<SISubtarget>();
+ OS << "RP before: ";
+ Before.print(OS, &ST);
+ OS << "RP after: ";
+ After.print(OS, &ST);
+}
+
+#endif
+
+// RAII DAG builder helper: enters the region and builds the scheduling DAG
+// on construction, exits the region on destruction.
+class GCNIterativeScheduler::BuildDAG {
+ GCNIterativeScheduler &Sch;
+ SmallVector<SUnit*, 8> TopRoots;
+public:
+ BuildDAG(const Region &R, GCNIterativeScheduler &_Sch)
+ : Sch(_Sch) {
+ auto BB = R.Begin->getParent();
+ Sch.BaseClass::startBlock(BB);
+ Sch.BaseClass::enterRegion(BB, R.Begin, R.End, R.NumRegionInstrs);
+
+ Sch.buildSchedGraph(Sch.AA, nullptr, nullptr, nullptr,
+ /*TrackLaneMask*/true);
+ Sch.Topo.InitDAGTopologicalSorting();
+
+ SmallVector<SUnit*, 8> BotRoots;
+ Sch.findRootsAndBiasEdges(TopRoots, BotRoots);
+ }
+ ~BuildDAG() {
+ Sch.BaseClass::exitRegion();
+ Sch.BaseClass::finishBlock();
+ }
+ ArrayRef<const SUnit*> getTopRoots() const {
+ return TopRoots;
+ }
+};
+
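+// RAII helper that temporarily installs the given strategy in the scheduler,
+// entering the region on construction and restoring the saved strategy (and
+// exiting the region) on destruction.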
+class GCNIterativeScheduler::OverrideLegacyStrategy {
+ GCNIterativeScheduler &Sch;
+ Region &Rgn;
+ std::unique_ptr<MachineSchedStrategy> SaveSchedImpl;
+ GCNRegPressure SaveMaxRP;
+public:
+ OverrideLegacyStrategy(Region &R,
+ MachineSchedStrategy &OverrideStrategy,
+ GCNIterativeScheduler &_Sch)
+ : Sch(_Sch)
+ , Rgn(R)
+ , SaveSchedImpl(std::move(_Sch.SchedImpl))
+ , SaveMaxRP(R.MaxPressure) {
+ Sch.SchedImpl.reset(&OverrideStrategy);
+ auto BB = R.Begin->getParent();
+ Sch.BaseClass::startBlock(BB);
+ Sch.BaseClass::enterRegion(BB, R.Begin, R.End, R.NumRegionInstrs);
+ }
+ ~OverrideLegacyStrategy() {
+ Sch.BaseClass::exitRegion();
+ Sch.BaseClass::finishBlock();
+ Sch.SchedImpl.release();
+ Sch.SchedImpl = std::move(SaveSchedImpl);
+ }
+ void schedule() {
+ assert(Sch.RegionBegin == Rgn.Begin && Sch.RegionEnd == Rgn.End);
+ DEBUG(dbgs() << "\nScheduling ";
+ printRegion(dbgs(), Rgn.Begin, Rgn.End, Sch.LIS, 2));
+ Sch.BaseClass::schedule();
+
+    // Unfortunately placeDebugValues incorrectly modifies RegionEnd; restore it.
+ Sch.RegionEnd = Rgn.End;
+ //assert(Rgn.End == Sch.RegionEnd);
+ Rgn.Begin = Sch.RegionBegin;
+ Rgn.MaxPressure.clear();
+ }
+ void restoreOrder() {
+ assert(Sch.RegionBegin == Rgn.Begin && Sch.RegionEnd == Rgn.End);
+    // DAG SUnits are stored in the original region's order,
+    // so just use SUnits as the restoring schedule.
+ Sch.scheduleRegion(Rgn, Sch.SUnits, SaveMaxRP);
+ }
+};
+
+// Just a stub to make the base class happy.
+class SchedStrategyStub : public MachineSchedStrategy {
+public:
+ bool shouldTrackPressure() const override { return false; }
+ bool shouldTrackLaneMasks() const override { return false; }
+ void initialize(ScheduleDAGMI *DAG) override {}
+ SUnit *pickNode(bool &IsTopNode) override { return nullptr; }
+ void schedNode(SUnit *SU, bool IsTopNode) override {}
+ void releaseTopNode(SUnit *SU) override {}
+ void releaseBottomNode(SUnit *SU) override {}
+};
+
+GCNIterativeScheduler::GCNIterativeScheduler(MachineSchedContext *C,
+ StrategyKind S)
+ : BaseClass(C, llvm::make_unique<SchedStrategyStub>())
+ , Context(C)
+ , Strategy(S)
+ , UPTracker(*LIS) {
+}
+
+// returns max pressure for a region
+GCNRegPressure
+GCNIterativeScheduler::getRegionPressure(MachineBasicBlock::iterator Begin,
+ MachineBasicBlock::iterator End)
+ const {
+  // For the purpose of pressure tracking, the bottom instruction of the
+  // region should also be processed. End is either the BB end, the BB
+  // terminator instruction, or a scheduling boundary instruction.
+ auto const BBEnd = Begin->getParent()->end();
+ auto const BottomMI = End == BBEnd ? std::prev(End) : End;
+
+  // scheduleRegions walks bottom to top, so it's likely we just get the
+  // next instruction to track.
+ auto AfterBottomMI = std::next(BottomMI);
+ if (AfterBottomMI == BBEnd ||
+ &*AfterBottomMI != UPTracker.getLastTrackedMI()) {
+ UPTracker.reset(*BottomMI);
+ } else {
+ assert(UPTracker.isValid());
+ }
+
+ for (auto I = BottomMI; I != Begin; --I)
+ UPTracker.recede(*I);
+
+ UPTracker.recede(*Begin);
+
+ assert(UPTracker.isValid() ||
+ (dbgs() << "Tracked region ",
+ printRegion(dbgs(), Begin, End, LIS), false));
+ return UPTracker.moveMaxPressure();
+}
+
+// returns max pressure for a tentative schedule
+template <typename Range> GCNRegPressure
+GCNIterativeScheduler::getSchedulePressure(const Region &R,
+ Range &&Schedule) const {
+ auto const BBEnd = R.Begin->getParent()->end();
+ GCNUpwardRPTracker RPTracker(*LIS);
+ if (R.End != BBEnd) {
+ // R.End points to the boundary instruction but the
+ // schedule doesn't include it
+ RPTracker.reset(*R.End);
+ RPTracker.recede(*R.End);
+ } else {
+    // R.End is the basic block end; there is no boundary instruction,
+    // so start tracking from the last instruction of the block.
+ RPTracker.reset(*std::prev(BBEnd));
+ }
+ for (auto I = Schedule.end(), B = Schedule.begin(); I != B;) {
+ RPTracker.recede(*getMachineInstr(*--I));
+ }
+ return RPTracker.moveMaxPressure();
+}
+
+void GCNIterativeScheduler::enterRegion(MachineBasicBlock *BB, // overridden
+ MachineBasicBlock::iterator Begin,
+ MachineBasicBlock::iterator End,
+ unsigned NumRegionInstrs) {
+ BaseClass::enterRegion(BB, Begin, End, NumRegionInstrs);
+ if (NumRegionInstrs > 2) {
+ Regions.push_back(
+ new (Alloc.Allocate())
+ Region { Begin, End, NumRegionInstrs,
+ getRegionPressure(Begin, End), nullptr });
+ }
+}
+
+void GCNIterativeScheduler::schedule() { // overridden
+  // Do nothing here: regions are recorded in enterRegion() and are
+  // actually scheduled in finalizeSchedule().
+ DEBUG(
+ printLivenessInfo(dbgs(), RegionBegin, RegionEnd, LIS);
+ if (!Regions.empty() && Regions.back()->Begin == RegionBegin) {
+ dbgs() << "Max RP: ";
+ Regions.back()->MaxPressure.print(dbgs(), &MF.getSubtarget<SISubtarget>());
+ }
+ dbgs() << '\n';
+ );
+}
+
+void GCNIterativeScheduler::finalizeSchedule() { // overridden
+ if (Regions.empty())
+ return;
+ switch (Strategy) {
+ case SCHEDULE_MINREGONLY: scheduleMinReg(); break;
+ case SCHEDULE_MINREGFORCED: scheduleMinReg(true); break;
+ case SCHEDULE_LEGACYMAXOCCUPANCY: scheduleLegacyMaxOccupancy(); break;
+ }
+}
+
+// Detach schedule from SUnits and interleave it with debug values.
+// Returned schedule becomes independent of DAG state.
+std::vector<MachineInstr*>
+GCNIterativeScheduler::detachSchedule(ScheduleRef Schedule) const {
+ std::vector<MachineInstr*> Res;
+ Res.reserve(Schedule.size() * 2);
+
+ if (FirstDbgValue)
+ Res.push_back(FirstDbgValue);
+
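+  // After each instruction, re-attach the DBG_VALUE paired with it, if any.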
+ const auto DbgB = DbgValues.begin(), DbgE = DbgValues.end();
+ for (auto SU : Schedule) {
+ Res.push_back(SU->getInstr());
+ const auto &D = std::find_if(DbgB, DbgE, [SU](decltype(*DbgB) &P) {
+ return P.second == SU->getInstr();
+ });
+ if (D != DbgE)
+ Res.push_back(D->first);
+ }
+ return Res;
+}
+
+void GCNIterativeScheduler::setBestSchedule(Region &R,
+ ScheduleRef Schedule,
+ const GCNRegPressure &MaxRP) {
+ R.BestSchedule.reset(
+ new TentativeSchedule{ detachSchedule(Schedule), MaxRP });
+}
+
+void GCNIterativeScheduler::scheduleBest(Region &R) {
+ assert(R.BestSchedule.get() && "No schedule specified");
+ scheduleRegion(R, R.BestSchedule->Schedule, R.BestSchedule->MaxPressure);
+ R.BestSchedule.reset();
+}
+
+// Minimal required region scheduler; works for ranges of SUnit*,
+// SUnit, or MachineInstr*.
+template <typename Range>
+void GCNIterativeScheduler::scheduleRegion(Region &R, Range &&Schedule,
+ const GCNRegPressure &MaxRP) {
+ assert(RegionBegin == R.Begin && RegionEnd == R.End);
+ assert(LIS != nullptr);
+#ifndef NDEBUG
+ const auto SchedMaxRP = getSchedulePressure(R, Schedule);
+#endif
+ auto BB = R.Begin->getParent();
+ auto Top = R.Begin;
+ for (const auto &I : Schedule) {
+ auto MI = getMachineInstr(I);
+ if (MI != &*Top) {
+ BB->remove(MI);
+ BB->insert(Top, MI);
+ if (!MI->isDebugValue())
+ LIS->handleMove(*MI, true);
+ }
+ if (!MI->isDebugValue()) {
+ // Reset read - undef flags and update them later.
+ for (auto &Op : MI->operands())
+ if (Op.isReg() && Op.isDef())
+ Op.setIsUndef(false);
+
+ RegisterOperands RegOpers;
+ RegOpers.collect(*MI, *TRI, MRI, /*ShouldTrackLaneMasks*/true,
+ /*IgnoreDead*/false);
+ // Adjust liveness and add missing dead+read-undef flags.
+ auto SlotIdx = LIS->getInstructionIndex(*MI).getRegSlot();
+ RegOpers.adjustLaneLiveness(*LIS, MRI, SlotIdx, MI);
+ }
+ Top = std::next(MI->getIterator());
+ }
+ RegionBegin = getMachineInstr(Schedule.front());
+
+ // Schedule consisting of MachineInstr* is considered 'detached'
+ // and already interleaved with debug values
+  // Note: *Schedule.begin() yields a reference type, so strip the reference
+  // before comparing against the plain pointer type.
+  if (!std::is_same<typename std::remove_reference<
+        decltype(*Schedule.begin())>::type, MachineInstr*>::value) {
+ placeDebugValues();
+    // Unfortunately placeDebugValues incorrectly modifies RegionEnd; restore it.
+ //assert(R.End == RegionEnd);
+ RegionEnd = R.End;
+ }
+
+ R.Begin = RegionBegin;
+ R.MaxPressure = MaxRP;
+
+#ifndef NDEBUG
+ const auto RegionMaxRP = getRegionPressure(R);
+ const auto &ST = MF.getSubtarget<SISubtarget>();
+#endif
+ assert((SchedMaxRP == RegionMaxRP && (MaxRP.empty() || SchedMaxRP == MaxRP))
+ || (dbgs() << "Max RP mismatch!!!\n"
+ "RP for schedule (calculated): ",
+ SchedMaxRP.print(dbgs(), &ST),
+ dbgs() << "RP for schedule (reported): ",
+ MaxRP.print(dbgs(), &ST),
+ dbgs() << "RP after scheduling: ",
+ RegionMaxRP.print(dbgs(), &ST),
+ false));
+}
+
+// Sort recorded regions by pressure - highest at the front
+void GCNIterativeScheduler::sortRegionsByPressure(unsigned TargetOcc) {
+ const auto &ST = MF.getSubtarget<SISubtarget>();
+ std::sort(Regions.begin(), Regions.end(),
+ [&ST, TargetOcc](const Region *R1, const Region *R2) {
+ return R2->MaxPressure.less(ST, R1->MaxPressure, TargetOcc);
+ });
+}
+
+///////////////////////////////////////////////////////////////////////////////
+// Legacy MaxOccupancy Strategy
+
+// Tries to increase occupancy by applying the minreg scheduler to a sequence
+// of the most demanding regions. Obtained schedules are saved as the
+// BestSchedule of a region.
+// TargetOcc is the best achievable occupancy for the kernel.
+// Returns the better occupancy on success, or the current occupancy on
+// failure. BestSchedules aren't deleted on failure.
+unsigned GCNIterativeScheduler::tryMaximizeOccupancy(unsigned TargetOcc) {
+ // TODO: assert Regions are sorted descending by pressure
+ const auto &ST = MF.getSubtarget<SISubtarget>();
+ const auto Occ = Regions.front()->MaxPressure.getOccupancy(ST);
+  DEBUG(dbgs() << "Trying to improve occupancy, target = " << TargetOcc
+ << ", current = " << Occ << '\n');
+
+ auto NewOcc = TargetOcc;
+ for (auto R : Regions) {
+ if (R->MaxPressure.getOccupancy(ST) >= NewOcc)
+ break;
+
+ DEBUG(printRegion(dbgs(), R->Begin, R->End, LIS, 3);
+ printLivenessInfo(dbgs(), R->Begin, R->End, LIS));
+
+ BuildDAG DAG(*R, *this);
+ const auto MinSchedule = makeMinRegSchedule(DAG.getTopRoots(), *this);
+ const auto MaxRP = getSchedulePressure(*R, MinSchedule);
+ DEBUG(dbgs() << "Occupancy improvement attempt:\n";
+ printSchedRP(dbgs(), R->MaxPressure, MaxRP));
+
+ NewOcc = std::min(NewOcc, MaxRP.getOccupancy(ST));
+ if (NewOcc <= Occ)
+ break;
+
+ setBestSchedule(*R, MinSchedule, MaxRP);
+ }
+ DEBUG(dbgs() << "New occupancy = " << NewOcc
+ << ", prev occupancy = " << Occ << '\n');
+ return std::max(NewOcc, Occ);
+}
+
+void GCNIterativeScheduler::scheduleLegacyMaxOccupancy(
+ bool TryMaximizeOccupancy) {
+ const auto &ST = MF.getSubtarget<SISubtarget>();
+ auto TgtOcc = ST.getOccupancyWithLocalMemSize(MF);
+
+ sortRegionsByPressure(TgtOcc);
+ auto Occ = Regions.front()->MaxPressure.getOccupancy(ST);
+
+ if (TryMaximizeOccupancy && Occ < TgtOcc)
+ Occ = tryMaximizeOccupancy(TgtOcc);
+
+  // This is really weird, but for some magic reason scheduling regions
+  // twice gives a performance improvement.
+ const int NumPasses = Occ < TgtOcc ? 2 : 1;
+
+ TgtOcc = std::min(Occ, TgtOcc);
+ DEBUG(dbgs() << "Scheduling using default scheduler, "
+ "target occupancy = " << TgtOcc << '\n');
+ GCNMaxOccupancySchedStrategy LStrgy(Context);
+
+ for (int I = 0; I < NumPasses; ++I) {
+    // Running the first pass with TargetOccupancy = 0 mimics the previous
+    // scheduling approach and is performance magic.
+ LStrgy.setTargetOccupancy(I == 0 ? 0 : TgtOcc);
+ for (auto R : Regions) {
+ OverrideLegacyStrategy Ovr(*R, LStrgy, *this);
+
+ Ovr.schedule();
+ const auto RP = getRegionPressure(*R);
+ DEBUG(printSchedRP(dbgs(), R->MaxPressure, RP));
+
+ if (RP.getOccupancy(ST) < TgtOcc) {
+ DEBUG(dbgs() << "Didn't fit into target occupancy O" << TgtOcc);
+ if (R->BestSchedule.get() &&
+ R->BestSchedule->MaxPressure.getOccupancy(ST) >= TgtOcc) {
+ DEBUG(dbgs() << ", scheduling minimal register\n");
+ scheduleBest(*R);
+ } else {
+ DEBUG(dbgs() << ", restoring\n");
+ Ovr.restoreOrder();
+ assert(R->MaxPressure.getOccupancy(ST) >= TgtOcc);
+ }
+ }
+ }
+ }
+}
+
+///////////////////////////////////////////////////////////////////////////////
+// Minimal Register Strategy
+
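+// Reschedule regions in decreasing pressure order using the min-reg schedule.
+// Unless forced, stop once a region's pressure is already below the last
+// accepted maximum, or once the min-reg result would be worse than it.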
+void GCNIterativeScheduler::scheduleMinReg(bool force) {
+ const auto &ST = MF.getSubtarget<SISubtarget>();
+ const auto TgtOcc = ST.getOccupancyWithLocalMemSize(MF);
+ sortRegionsByPressure(TgtOcc);
+
+ auto MaxPressure = Regions.front()->MaxPressure;
+ for (auto R : Regions) {
+ if (!force && R->MaxPressure.less(ST, MaxPressure, TgtOcc))
+ break;
+
+ BuildDAG DAG(*R, *this);
+ const auto MinSchedule = makeMinRegSchedule(DAG.getTopRoots(), *this);
+
+ const auto RP = getSchedulePressure(*R, MinSchedule);
+ DEBUG(if (R->MaxPressure.less(ST, RP, TgtOcc)) {
+ dbgs() << "\nWarning: Pressure becomes worse after minreg!";
+ printSchedRP(dbgs(), R->MaxPressure, RP);
+ });
+
+ if (!force && MaxPressure.less(ST, RP, TgtOcc))
+ break;
+
+ scheduleRegion(*R, MinSchedule, RP);
+ DEBUG(printSchedResult(dbgs(), R, RP));
+
+ MaxPressure = RP;
+ }
+}
diff --git a/lib/Target/AMDGPU/GCNIterativeScheduler.h b/lib/Target/AMDGPU/GCNIterativeScheduler.h
new file mode 100644
index 000000000000..df3afce21ebc
--- /dev/null
+++ b/lib/Target/AMDGPU/GCNIterativeScheduler.h
@@ -0,0 +1,118 @@
+//===--------- GCNIterativeScheduler.h - GCN Scheduler -*- C++ -*----------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+/// \file
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_TARGET_AMDGPU_GCNITERATIVESCHEDULER_H
+#define LLVM_LIB_TARGET_AMDGPU_GCNITERATIVESCHEDULER_H
+
+#include "GCNRegPressure.h"
+
+#include "llvm/CodeGen/MachineScheduler.h"
+
+namespace llvm {
+
+class GCNIterativeScheduler : public ScheduleDAGMILive {
+ typedef ScheduleDAGMILive BaseClass;
+public:
+ enum StrategyKind {
+ SCHEDULE_MINREGONLY,
+ SCHEDULE_MINREGFORCED,
+ SCHEDULE_LEGACYMAXOCCUPANCY
+ };
+
+ GCNIterativeScheduler(MachineSchedContext *C,
+ StrategyKind S);
+
+ void schedule() override;
+
+ void enterRegion(MachineBasicBlock *BB,
+ MachineBasicBlock::iterator Begin,
+ MachineBasicBlock::iterator End,
+ unsigned RegionInstrs) override;
+
+ void finalizeSchedule() override;
+
+protected:
+
+ typedef ArrayRef<const SUnit*> ScheduleRef;
+
+ struct TentativeSchedule {
+ std::vector<MachineInstr*> Schedule;
+ GCNRegPressure MaxPressure;
+ };
+
+ struct Region {
+    // Fields except for BestSchedule are supposed to reflect current IR state.
+    // `const` fields are to emphasize they shouldn't change for any schedule.
+ MachineBasicBlock::iterator Begin;
+    // End is either a boundary instruction or the end of the basic block.
+ const MachineBasicBlock::iterator End;
+ const unsigned NumRegionInstrs;
+ GCNRegPressure MaxPressure;
+
+ // best schedule for the region so far (not scheduled yet)
+ std::unique_ptr<TentativeSchedule> BestSchedule;
+ };
+
+ SpecificBumpPtrAllocator<Region> Alloc;
+ std::vector<Region*> Regions;
+
+ MachineSchedContext *Context;
+ const StrategyKind Strategy;
+ mutable GCNUpwardRPTracker UPTracker;
+
+ class BuildDAG;
+ class OverrideLegacyStrategy;
+
+ template <typename Range>
+ GCNRegPressure getSchedulePressure(const Region &R,
+ Range &&Schedule) const;
+
+ GCNRegPressure getRegionPressure(MachineBasicBlock::iterator Begin,
+ MachineBasicBlock::iterator End) const;
+
+ GCNRegPressure getRegionPressure(const Region &R) const {
+ return getRegionPressure(R.Begin, R.End);
+ }
+
+ void setBestSchedule(Region &R,
+ ScheduleRef Schedule,
+ const GCNRegPressure &MaxRP = GCNRegPressure());
+
+ void scheduleBest(Region &R);
+
+ std::vector<MachineInstr*> detachSchedule(ScheduleRef Schedule) const;
+
+ void sortRegionsByPressure(unsigned TargetOcc);
+
+ template <typename Range>
+ void scheduleRegion(Region &R, Range &&Schedule,
+ const GCNRegPressure &MaxRP = GCNRegPressure());
+
+ unsigned tryMaximizeOccupancy(unsigned TargetOcc =
+ std::numeric_limits<unsigned>::max());
+
+ void scheduleLegacyMaxOccupancy(bool TryMaximizeOccupancy = true);
+ void scheduleMinReg(bool force = false);
+
+ void printRegions(raw_ostream &OS) const;
+ void printSchedResult(raw_ostream &OS,
+ const Region *R,
+ const GCNRegPressure &RP) const;
+ void printSchedRP(raw_ostream &OS,
+ const GCNRegPressure &Before,
+ const GCNRegPressure &After) const;
+};
+
+} // End namespace llvm
+
+#endif // LLVM_LIB_TARGET_AMDGPU_GCNITERATIVESCHEDULER_H
diff --git a/lib/Target/AMDGPU/GCNMinRegStrategy.cpp b/lib/Target/AMDGPU/GCNMinRegStrategy.cpp
new file mode 100644
index 000000000000..c6d0f2179950
--- /dev/null
+++ b/lib/Target/AMDGPU/GCNMinRegStrategy.cpp
@@ -0,0 +1,266 @@
+//===----------------------- GCNMinRegStrategy.cpp - ----------------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+/// \file
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/CodeGen/ScheduleDAG.h"
+
+using namespace llvm;
+
+#define DEBUG_TYPE "misched"
+
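+// Greedy scheduler aiming at minimal register usage: among ready candidates
+// it prefers the highest priority, then the one leaving the fewest non-ready
+// successors, then the one making the most successors ready, falling back to
+// original program order.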
+class GCNMinRegScheduler {
+ struct Candidate : ilist_node<Candidate> {
+ const SUnit *SU;
+ int Priority;
+
+ Candidate(const SUnit *SU_, int Priority_ = 0)
+ : SU(SU_), Priority(Priority_) {}
+ };
+
+ SpecificBumpPtrAllocator<Candidate> Alloc;
+ typedef simple_ilist<Candidate> Queue;
+ Queue RQ; // Ready queue
+
+ std::vector<unsigned> NumPreds;
+
+ bool isScheduled(const SUnit *SU) const {
+ assert(!SU->isBoundaryNode());
+ return NumPreds[SU->NodeNum] == std::numeric_limits<unsigned>::max();
+ }
+
+ void setIsScheduled(const SUnit *SU) {
+ assert(!SU->isBoundaryNode());
+ NumPreds[SU->NodeNum] = std::numeric_limits<unsigned>::max();
+ }
+
+ unsigned getNumPreds(const SUnit *SU) const {
+ assert(!SU->isBoundaryNode());
+ assert(NumPreds[SU->NodeNum] != std::numeric_limits<unsigned>::max());
+ return NumPreds[SU->NodeNum];
+ }
+
+ unsigned decNumPreds(const SUnit *SU) {
+ assert(!SU->isBoundaryNode());
+ assert(NumPreds[SU->NodeNum] != std::numeric_limits<unsigned>::max());
+ return --NumPreds[SU->NodeNum];
+ }
+
+ void initNumPreds(const decltype(ScheduleDAG::SUnits) &SUnits);
+
+ int getReadySuccessors(const SUnit *SU) const;
+ int getNotReadySuccessors(const SUnit *SU) const;
+
+ template <typename Calc>
+ unsigned findMax(unsigned Num, Calc C);
+
+ Candidate* pickCandidate();
+
+ void bumpPredsPriority(const SUnit *SchedSU, int Priority);
+ void releaseSuccessors(const SUnit* SU, int Priority);
+
+public:
+ std::vector<const SUnit*> schedule(ArrayRef<const SUnit*> TopRoots,
+ const ScheduleDAG &DAG);
+};
+
+void GCNMinRegScheduler::initNumPreds(const decltype(ScheduleDAG::SUnits) &SUnits) {
+ NumPreds.resize(SUnits.size());
+ for (unsigned I = 0; I < SUnits.size(); ++I)
+ NumPreds[I] = SUnits[I].NumPredsLeft;
+}
+
+int GCNMinRegScheduler::getReadySuccessors(const SUnit *SU) const {
+ unsigned NumSchedSuccs = 0;
+ for (auto SDep : SU->Succs) {
+ bool wouldBeScheduled = true;
+ for (auto PDep : SDep.getSUnit()->Preds) {
+ auto PSU = PDep.getSUnit();
+ assert(!PSU->isBoundaryNode());
+ if (PSU != SU && !isScheduled(PSU)) {
+ wouldBeScheduled = false;
+ break;
+ }
+ }
+ NumSchedSuccs += wouldBeScheduled ? 1 : 0;
+ }
+ return NumSchedSuccs;
+}
+
+int GCNMinRegScheduler::getNotReadySuccessors(const SUnit *SU) const {
+ return SU->Succs.size() - getReadySuccessors(SU);
+}
+
+template <typename Calc>
+unsigned GCNMinRegScheduler::findMax(unsigned Num, Calc C) {
+ assert(!RQ.empty() && Num <= RQ.size());
+ typedef decltype(C(*RQ.begin())) T;
+ T Max = std::numeric_limits<T>::min();
+ unsigned NumMax = 0;
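+  // Candidates matching the running maximum are rotated to the front of the
+  // queue, so when the scan finishes the first NumMax entries hold the best
+  // candidates for the next, finer-grained filter.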
+ for (auto I = RQ.begin(); Num; --Num) {
+ T Cur = C(*I);
+ if (Cur >= Max) {
+ if (Cur > Max) {
+ Max = Cur;
+ NumMax = 1;
+ } else
+ ++NumMax;
+ auto &Cand = *I++;
+ RQ.remove(Cand);
+ RQ.push_front(Cand);
+ continue;
+ }
+ ++I;
+ }
+ return NumMax;
+}
+
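+// Successively filters the ready queue: highest priority first, then the
+// candidate that would leave the fewest non-ready successors, then the one
+// making the most successors ready, and finally the lowest NodeNum, i.e.
+// program order. After each findMax() pass the winners sit at the queue head.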
+GCNMinRegScheduler::Candidate* GCNMinRegScheduler::pickCandidate() {
+ do {
+ unsigned Num = RQ.size();
+ if (Num == 1) break;
+
+ DEBUG(dbgs() << "\nSelecting max priority candidates among " << Num << '\n');
+ Num = findMax(Num, [=](const Candidate &C) { return C.Priority; });
+ if (Num == 1) break;
+
+ DEBUG(dbgs() << "\nSelecting min non-ready producing candidate among "
+ << Num << '\n');
+ Num = findMax(Num, [=](const Candidate &C) {
+ auto SU = C.SU;
+ int Res = getNotReadySuccessors(SU);
+ DEBUG(dbgs() << "SU(" << SU->NodeNum << ") would left non-ready "
+ << Res << " successors, metric = " << -Res << '\n');
+ return -Res;
+ });
+ if (Num == 1) break;
+
+ DEBUG(dbgs() << "\nSelecting most producing candidate among "
+ << Num << '\n');
+ Num = findMax(Num, [=](const Candidate &C) {
+ auto SU = C.SU;
+ auto Res = getReadySuccessors(SU);
+ DEBUG(dbgs() << "SU(" << SU->NodeNum << ") would make ready "
+ << Res << " successors, metric = " << Res << '\n');
+ return Res;
+ });
+ if (Num == 1) break;
+
+ Num = Num ? Num : RQ.size();
+ DEBUG(dbgs() << "\nCan't find best candidate, selecting in program order among "
+ << Num << '\n');
+ Num = findMax(Num, [=](const Candidate &C) { return -(int64_t)C.SU->NodeNum; });
+ assert(Num == 1);
+ } while (false);
+
+ return &RQ.front();
+}
+
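+// Called when scheduling SchedSU made no successor ready: raises the queue
+// priority of every unscheduled transitive predecessor of SchedSU's
+// non-ready data successors, so the remaining inputs of those successors
+// get produced soon and their partial results do not stay live for long.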
+void GCNMinRegScheduler::bumpPredsPriority(const SUnit *SchedSU, int Priority) {
+ SmallPtrSet<const SUnit*, 32> Set;
+ for (const auto &S : SchedSU->Succs) {
+ if (S.getSUnit()->isBoundaryNode() || isScheduled(S.getSUnit()) ||
+ S.getKind() != SDep::Data)
+ continue;
+ for (const auto &P : S.getSUnit()->Preds) {
+ auto PSU = P.getSUnit();
+ assert(!PSU->isBoundaryNode());
+ if (PSU != SchedSU && !isScheduled(PSU)) {
+ Set.insert(PSU);
+ }
+ }
+ }
+ SmallVector<const SUnit*, 32> Worklist(Set.begin(), Set.end());
+ while (!Worklist.empty()) {
+ auto SU = Worklist.pop_back_val();
+ assert(!SU->isBoundaryNode());
+ for (const auto &P : SU->Preds) {
+ if (!P.getSUnit()->isBoundaryNode() && !isScheduled(P.getSUnit()) &&
+ Set.insert(P.getSUnit()).second)
+ Worklist.push_back(P.getSUnit());
+ }
+ }
+ DEBUG(dbgs() << "Make the predecessors of SU(" << SchedSU->NodeNum
+ << ")'s non-ready successors of " << Priority
+ << " priority in ready queue: ");
+ const auto SetEnd = Set.end();
+ for (auto &C : RQ) {
+ if (Set.find(C.SU) != SetEnd) {
+ C.Priority = Priority;
+ DEBUG(dbgs() << " SU(" << C.SU->NodeNum << ')');
+ }
+ }
+ DEBUG(dbgs() << '\n');
+}
+
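+// Decrements the pending-predecessor count of each strong successor of SU;
+// a successor whose count drops to zero becomes ready and is pushed onto the
+// ready queue with the given priority.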
+void GCNMinRegScheduler::releaseSuccessors(const SUnit* SU, int Priority) {
+ for (const auto &S : SU->Succs) {
+ auto SuccSU = S.getSUnit();
+ if (S.isWeak())
+ continue;
+ assert(SuccSU->isBoundaryNode() || getNumPreds(SuccSU) > 0);
+ if (!SuccSU->isBoundaryNode() && decNumPreds(SuccSU) == 0)
+ RQ.push_front(*new (Alloc.Allocate()) Candidate(SuccSU, Priority));
+ }
+}
+
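+// Seeds the ready queue with the top roots, then repeatedly picks a
+// candidate, releases its successors, and bumps predecessor priorities
+// whenever a pick makes no new successor ready.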
+std::vector<const SUnit*>
+GCNMinRegScheduler::schedule(ArrayRef<const SUnit*> TopRoots,
+ const ScheduleDAG &DAG) {
+ const auto &SUnits = DAG.SUnits;
+ std::vector<const SUnit*> Schedule;
+ Schedule.reserve(SUnits.size());
+
+ initNumPreds(SUnits);
+
+ int StepNo = 0;
+
+ for (auto SU : TopRoots) {
+ RQ.push_back(*new (Alloc.Allocate()) Candidate(SU, StepNo));
+ }
+ releaseSuccessors(&DAG.EntrySU, StepNo);
+
+ while (!RQ.empty()) {
+ DEBUG(
+ dbgs() << "\n=== Picking candidate, Step = " << StepNo << "\n"
+ "Ready queue:";
+ for (auto &C : RQ)
+ dbgs() << ' ' << C.SU->NodeNum << "(P" << C.Priority << ')';
+ dbgs() << '\n';
+ );
+
+ auto C = pickCandidate();
+ assert(C);
+ RQ.remove(*C);
+ auto SU = C->SU;
+ DEBUG(dbgs() << "Selected "; SU->dump(&DAG));
+
+ releaseSuccessors(SU, StepNo);
+ Schedule.push_back(SU);
+ setIsScheduled(SU);
+
+ if (getReadySuccessors(SU) == 0)
+ bumpPredsPriority(SU, StepNo);
+
+ ++StepNo;
+ }
+ assert(SUnits.size() == Schedule.size());
+
+ return Schedule;
+}
+
+namespace llvm {
+std::vector<const SUnit*> makeMinRegSchedule(ArrayRef<const SUnit*> TopRoots,
+ const ScheduleDAG &DAG) {
+ GCNMinRegScheduler S;
+ return S.schedule(TopRoots, DAG);
+}
+}
diff --git a/lib/Target/AMDGPU/GCNRegPressure.cpp b/lib/Target/AMDGPU/GCNRegPressure.cpp
new file mode 100644
index 000000000000..4ecfa118fb27
--- /dev/null
+++ b/lib/Target/AMDGPU/GCNRegPressure.cpp
@@ -0,0 +1,355 @@
+//===------------------------- GCNRegPressure.cpp - -----------------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+/// \file
+/// \brief GCN register pressure bookkeeping: GCNRegPressure arithmetic and
+/// the upward register pressure tracker declared in GCNRegPressure.h.
+//
+//===----------------------------------------------------------------------===//
+
+#include "GCNRegPressure.h"
+
+using namespace llvm;
+
+#define DEBUG_TYPE "misched"
+
+#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
+LLVM_DUMP_METHOD
+void llvm::printLivesAt(SlotIndex SI,
+ const LiveIntervals &LIS,
+ const MachineRegisterInfo &MRI) {
+ dbgs() << "Live regs at " << SI << ": "
+ << *LIS.getInstructionFromIndex(SI);
+ unsigned Num = 0;
+ for (unsigned I = 0, E = MRI.getNumVirtRegs(); I != E; ++I) {
+ const unsigned Reg = TargetRegisterInfo::index2VirtReg(I);
+ if (MRI.reg_nodbg_empty(Reg))
+ continue;
+ const auto &LI = LIS.getInterval(Reg);
+ if (LI.hasSubRanges()) {
+ bool firstTime = true;
+ for (const auto &S : LI.subranges()) {
+ if (!S.liveAt(SI)) continue;
+ if (firstTime) {
+ dbgs() << " " << PrintReg(Reg, MRI.getTargetRegisterInfo())
+ << '\n';
+ firstTime = false;
+ }
+ dbgs() << " " << S << '\n';
+ ++Num;
+ }
+ } else if (LI.liveAt(SI)) {
+ dbgs() << " " << LI << '\n';
+ ++Num;
+ }
+ }
+ if (!Num) dbgs() << " <none>\n";
+}
+
+static bool isEqual(const GCNRPTracker::LiveRegSet &S1,
+ const GCNRPTracker::LiveRegSet &S2) {
+ if (S1.size() != S2.size())
+ return false;
+
+ for (const auto &P : S1) {
+ auto I = S2.find(P.first);
+ if (I == S2.end() || I->second != P.second)
+ return false;
+ }
+ return true;
+}
+
+static GCNRPTracker::LiveRegSet
+stripEmpty(const GCNRPTracker::LiveRegSet &LR) {
+ GCNRPTracker::LiveRegSet Res;
+ for (const auto &P : LR) {
+ if (P.second.any())
+ Res.insert(P);
+ }
+ return Res;
+}
+#endif
+
+///////////////////////////////////////////////////////////////////////////////
+// GCNRegPressure
+
+unsigned GCNRegPressure::getRegKind(unsigned Reg,
+ const MachineRegisterInfo &MRI) {
+ assert(TargetRegisterInfo::isVirtualRegister(Reg));
+ const auto RC = MRI.getRegClass(Reg);
+ auto STI = static_cast<const SIRegisterInfo*>(MRI.getTargetRegisterInfo());
+ return STI->isSGPRClass(RC) ?
+ (RC->getSize() == 4 ? SGPR32 : SGPR_TUPLE) :
+ (RC->getSize() == 4 ? VGPR32 : VGPR_TUPLE);
+}
+
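+// Updates the pressure values for a lane mask transition of Reg. If the mask
+// shrank, the masks are swapped and the sign negated, so only the growing
+// case needs to be handled below.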
+void GCNRegPressure::inc(unsigned Reg,
+ LaneBitmask PrevMask,
+ LaneBitmask NewMask,
+ const MachineRegisterInfo &MRI) {
+ if (NewMask == PrevMask)
+ return;
+
+ int Sign = 1;
+ if (NewMask < PrevMask) {
+ std::swap(NewMask, PrevMask);
+ Sign = -1;
+ }
+#ifndef NDEBUG
+ const auto MaxMask = MRI.getMaxLaneMaskForVReg(Reg);
+#endif
+ switch (auto Kind = getRegKind(Reg, MRI)) {
+ case SGPR32:
+ case VGPR32:
+ assert(PrevMask.none() && NewMask == MaxMask);
+ Value[Kind] += Sign;
+ break;
+
+ case SGPR_TUPLE:
+ case VGPR_TUPLE:
+ assert(NewMask < MaxMask || NewMask == MaxMask);
+ assert(PrevMask < NewMask);
+
+ Value[Kind == SGPR_TUPLE ? SGPR32 : VGPR32] +=
+ Sign * countPopulation((~PrevMask & NewMask).getAsInteger());
+
+ if (PrevMask.none()) {
+ assert(NewMask.any());
+ Value[Kind] += Sign * MRI.getPressureSets(Reg).getWeight();
+ }
+ break;
+
+ default: llvm_unreachable("Unknown register kind");
+ }
+}
+
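+// Returns true if this pressure is considered lower than O's: higher
+// achievable occupancy wins first; ties are broken by the smaller register
+// tuple weight and finally by the raw count of the more occupancy-critical
+// register file.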
+bool GCNRegPressure::less(const SISubtarget &ST,
+ const GCNRegPressure& O,
+ unsigned MaxOccupancy) const {
+ const auto SGPROcc = std::min(MaxOccupancy,
+ ST.getOccupancyWithNumSGPRs(getSGPRNum()));
+ const auto VGPROcc = std::min(MaxOccupancy,
+ ST.getOccupancyWithNumVGPRs(getVGPRNum()));
+ const auto OtherSGPROcc = std::min(MaxOccupancy,
+ ST.getOccupancyWithNumSGPRs(O.getSGPRNum()));
+ const auto OtherVGPROcc = std::min(MaxOccupancy,
+ ST.getOccupancyWithNumVGPRs(O.getVGPRNum()));
+
+ const auto Occ = std::min(SGPROcc, VGPROcc);
+ const auto OtherOcc = std::min(OtherSGPROcc, OtherVGPROcc);
+ if (Occ != OtherOcc)
+ return Occ > OtherOcc;
+
+ bool SGPRImportant = SGPROcc < VGPROcc;
+ const bool OtherSGPRImportant = OtherSGPROcc < OtherVGPROcc;
+
+ // If the two pressures disagree on what is more important, compare VGPRs.
+ if (SGPRImportant != OtherSGPRImportant) {
+ SGPRImportant = false;
+ }
+
+ // Compare pressure of the large register tuples.
+ bool SGPRFirst = SGPRImportant;
+ for (int I = 2; I > 0; --I, SGPRFirst = !SGPRFirst) {
+ if (SGPRFirst) {
+ auto SW = getSGPRTuplesWeight();
+ auto OtherSW = O.getSGPRTuplesWeight();
+ if (SW != OtherSW)
+ return SW < OtherSW;
+ } else {
+ auto VW = getVGPRTuplesWeight();
+ auto OtherVW = O.getVGPRTuplesWeight();
+ if (VW != OtherVW)
+ return VW < OtherVW;
+ }
+ }
+ return SGPRImportant ? (getSGPRNum() < O.getSGPRNum()) :
+ (getVGPRNum() < O.getVGPRNum());
+}
+
+#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
+LLVM_DUMP_METHOD
+void GCNRegPressure::print(raw_ostream &OS, const SISubtarget *ST) const {
+ OS << "VGPRs: " << getVGRPNum();
+ if (ST) OS << "(O" << ST->getOccupancyWithNumVGPRs(getVGRPNum()) << ')';
+ OS << ", SGPRs: " << getSGRPNum();
+ if (ST) OS << "(O" << ST->getOccupancyWithNumSGPRs(getSGRPNum()) << ')';
+ OS << ", LVGPR WT: " << getVGPRTuplesWeight()
+ << ", LSGPR WT: " << getSGPRTuplesWeight();
+ if (ST) OS << " -> Occ: " << getOccupancy(*ST);
+ OS << '\n';
+}
+#endif
+
+///////////////////////////////////////////////////////////////////////////////
+// GCNRPTracker
+
+LaneBitmask llvm::getLiveLaneMask(unsigned Reg,
+ SlotIndex SI,
+ const LiveIntervals &LIS,
+ const MachineRegisterInfo &MRI) {
+ assert(!MRI.reg_nodbg_empty(Reg));
+ LaneBitmask LiveMask;
+ const auto &LI = LIS.getInterval(Reg);
+ if (LI.hasSubRanges()) {
+ for (const auto &S : LI.subranges())
+ if (S.liveAt(SI)) {
+ LiveMask |= S.LaneMask;
+ assert(LiveMask < MRI.getMaxLaneMaskForVReg(Reg) ||
+ LiveMask == MRI.getMaxLaneMaskForVReg(Reg));
+ }
+ } else if (LI.liveAt(SI)) {
+ LiveMask = MRI.getMaxLaneMaskForVReg(Reg);
+ }
+ return LiveMask;
+}
+
+GCNRPTracker::LiveRegSet llvm::getLiveRegs(SlotIndex SI,
+ const LiveIntervals &LIS,
+ const MachineRegisterInfo &MRI) {
+ GCNRPTracker::LiveRegSet LiveRegs;
+ for (unsigned I = 0, E = MRI.getNumVirtRegs(); I != E; ++I) {
+ auto Reg = TargetRegisterInfo::index2VirtReg(I);
+ if (MRI.reg_nodbg_empty(Reg))
+ continue;
+ auto LiveMask = getLiveLaneMask(Reg, SI, LIS, MRI);
+ if (LiveMask.any())
+ LiveRegs[Reg] = LiveMask;
+ }
+ return LiveRegs;
+}
+
+void GCNUpwardRPTracker::reset(const MachineInstr &MI) {
+ MRI = &MI.getParent()->getParent()->getRegInfo();
+ LiveRegs = getLiveRegsAfter(MI, LIS);
+ MaxPressure = CurPressure = getRegPressure(*MRI, LiveRegs);
+}
+
+LaneBitmask GCNUpwardRPTracker::getDefRegMask(const MachineOperand &MO) const {
+ assert(MO.isDef() && MO.isReg() &&
+ TargetRegisterInfo::isVirtualRegister(MO.getReg()));
+
+ // We don't rely on the read-undef flag because it isn't set correctly yet
+ // when tracking a tentative schedule. This still works because the use
+ // masks have already been tracked using LIS before this point.
+ return MO.getSubReg() == 0 ?
+ MRI->getMaxLaneMaskForVReg(MO.getReg()) :
+ MRI->getTargetRegisterInfo()->getSubRegIndexLaneMask(MO.getSubReg());
+}
+
+LaneBitmask GCNUpwardRPTracker::getUsedRegMask(const MachineOperand &MO) const {
+ assert(MO.isUse() && MO.isReg() &&
+ TargetRegisterInfo::isVirtualRegister(MO.getReg()));
+
+ if (auto SubReg = MO.getSubReg())
+ return MRI->getTargetRegisterInfo()->getSubRegIndexLaneMask(SubReg);
+
+ auto MaxMask = MRI->getMaxLaneMaskForVReg(MO.getReg());
+ if (MaxMask.getAsInteger() == 1) // cannot have subregs
+ return MaxMask;
+
+ // For a tentative schedule LIS isn't updated yet, but the live mask must
+ // remain the same for any schedule: subreg defs can be reordered, yet they
+ // must all dominate the uses anyway.
+ auto SI = LIS.getInstructionIndex(*MO.getParent()).getBaseIndex();
+ return getLiveLaneMask(MO.getReg(), SI, LIS, *MRI);
+}
+
+void GCNUpwardRPTracker::recede(const MachineInstr &MI) {
+ assert(MRI && "call reset first");
+
+ LastTrackedMI = &MI;
+
+ if (MI.isDebugValue())
+ return;
+
+ // Process all defs first to ensure early clobbers are handled correctly,
+ // iterating over operands() to also catch implicit defs.
+ for (const auto &MO : MI.operands()) {
+ if (!MO.isReg() || !MO.isDef() ||
+ !TargetRegisterInfo::isVirtualRegister(MO.getReg()))
+ continue;
+
+ auto Reg = MO.getReg();
+ auto &LiveMask = LiveRegs[Reg];
+ auto PrevMask = LiveMask;
+ LiveMask &= ~getDefRegMask(MO);
+ CurPressure.inc(Reg, PrevMask, LiveMask, *MRI);
+ }
+
+ // Then process all uses.
+ for (const auto &MO : MI.uses()) {
+ if (!MO.isReg() || !MO.readsReg() ||
+ !TargetRegisterInfo::isVirtualRegister(MO.getReg()))
+ continue;
+
+ auto Reg = MO.getReg();
+ auto &LiveMask = LiveRegs[Reg];
+ auto PrevMask = LiveMask;
+ LiveMask |= getUsedRegMask(MO);
+ CurPressure.inc(Reg, PrevMask, LiveMask, *MRI);
+ }
+
+ MaxPressure = max(MaxPressure, CurPressure);
+}
+
+#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
+LLVM_DUMP_METHOD
+static void reportMismatch(const GCNRPTracker::LiveRegSet &LISLR,
+ const GCNRPTracker::LiveRegSet &TrackedLR,
+ const TargetRegisterInfo *TRI) {
+ for (auto const &P : TrackedLR) {
+ auto I = LISLR.find(P.first);
+ if (I == LISLR.end()) {
+ dbgs() << " " << PrintReg(P.first, TRI)
+ << ":L" << PrintLaneMask(P.second)
+ << " isn't found in LIS reported set\n";
+ } else if (I->second != P.second) {
+ dbgs() << " " << PrintReg(P.first, TRI)
+ << " masks don't match: LIS reported "
+ << PrintLaneMask(I->second)
+ << ", tracked "
+ << PrintLaneMask(P.second)
+ << '\n';
+ }
+ }
+ for (auto const &P : LISLR) {
+ auto I = TrackedLR.find(P.first);
+ if (I == TrackedLR.end()) {
+ dbgs() << " " << PrintReg(P.first, TRI)
+ << ":L" << PrintLaneMask(P.second)
+ << " isn't found in tracked set\n";
+ }
+ }
+}
+
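+// Recomputes the live register set and the pressure from LIS at the last
+// tracked instruction and verifies they match the incrementally tracked
+// state.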
+bool GCNUpwardRPTracker::isValid() const {
+ const auto &SI = LIS.getInstructionIndex(*LastTrackedMI).getBaseIndex();
+ const auto LISLR = llvm::getLiveRegs(SI, LIS, *MRI);
+ const auto TrackedLR = stripEmpty(LiveRegs);
+
+ if (!isEqual(LISLR, TrackedLR)) {
+ dbgs() << "\nGCNUpwardRPTracker error: Tracked and"
+ " LIS reported livesets mismatch:\n";
+ printLivesAt(SI, LIS, *MRI);
+ reportMismatch(LISLR, TrackedLR, MRI->getTargetRegisterInfo());
+ return false;
+ }
+
+ auto LISPressure = getRegPressure(*MRI, LISLR);
+ if (LISPressure != CurPressure) {
+ dbgs() << "GCNUpwardRPTracker error: Pressure sets different\nTracked: ";
+ CurPressure.print(dbgs());
+ dbgs() << "LIS rpt: ";
+ LISPressure.print(dbgs());
+ return false;
+ }
+ return true;
+}
+
+#endif
diff --git a/lib/Target/AMDGPU/GCNRegPressure.h b/lib/Target/AMDGPU/GCNRegPressure.h
new file mode 100644
index 000000000000..82e76a7bfddc
--- /dev/null
+++ b/lib/Target/AMDGPU/GCNRegPressure.h
@@ -0,0 +1,170 @@
+//===---------------------- GCNRegPressure.h -*- C++ -*--------------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+/// \file
+/// \brief Data structures for tracking SGPR/VGPR pressure and live register
+/// sets for GCN scheduling.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_TARGET_AMDGPU_GCNREGPRESSURE_H
+#define LLVM_LIB_TARGET_AMDGPU_GCNREGPRESSURE_H
+
+#include "AMDGPUSubtarget.h"
+
+#include <limits>
+
+namespace llvm {
+
+struct GCNRegPressure {
+ enum RegKind {
+ SGPR32,
+ SGPR_TUPLE,
+ VGPR32,
+ VGPR_TUPLE,
+ TOTAL_KINDS
+ };
+
+ GCNRegPressure() {
+ clear();
+ }
+
+ bool empty() const { return getSGPRNum() == 0 && getVGPRNum() == 0; }
+
+ void clear() { std::fill(&Value[0], &Value[TOTAL_KINDS], 0); }
+
+ unsigned getSGPRNum() const { return Value[SGPR32]; }
+ unsigned getVGPRNum() const { return Value[VGPR32]; }
+
+ unsigned getVGPRTuplesWeight() const { return Value[VGPR_TUPLE]; }
+ unsigned getSGPRTuplesWeight() const { return Value[SGPR_TUPLE]; }
+
+ unsigned getOccupancy(const SISubtarget &ST) const {
+ return std::min(ST.getOccupancyWithNumSGPRs(getSGPRNum()),
+ ST.getOccupancyWithNumVGPRs(getVGPRNum()));
+ }
+
+ void inc(unsigned Reg,
+ LaneBitmask PrevMask,
+ LaneBitmask NewMask,
+ const MachineRegisterInfo &MRI);
+
+ bool higherOccupancy(const SISubtarget &ST, const GCNRegPressure& O) const {
+ return getOccupancy(ST) > O.getOccupancy(ST);
+ }
+
+ bool less(const SISubtarget &ST, const GCNRegPressure& O,
+ unsigned MaxOccupancy = std::numeric_limits<unsigned>::max()) const;
+
+ bool operator==(const GCNRegPressure &O) const {
+ return std::equal(&Value[0], &Value[TOTAL_KINDS], O.Value);
+ }
+
+ bool operator!=(const GCNRegPressure &O) const {
+ return !(*this == O);
+ }
+
+ void print(raw_ostream &OS, const SISubtarget *ST=nullptr) const;
+ void dump() const { print(dbgs()); }
+
+private:
+ unsigned Value[TOTAL_KINDS];
+
+ static unsigned getRegKind(unsigned Reg, const MachineRegisterInfo &MRI);
+
+ friend GCNRegPressure max(const GCNRegPressure &P1,
+ const GCNRegPressure &P2);
+};
+
+inline GCNRegPressure max(const GCNRegPressure &P1, const GCNRegPressure &P2) {
+ GCNRegPressure Res;
+ for (unsigned I = 0; I < GCNRegPressure::TOTAL_KINDS; ++I)
+ Res.Value[I] = std::max(P1.Value[I], P2.Value[I]);
+ return Res;
+}
+
+class GCNRPTracker {
+public:
+ typedef DenseMap<unsigned, LaneBitmask> LiveRegSet;
+
+protected:
+ LiveRegSet LiveRegs;
+ GCNRegPressure CurPressure, MaxPressure;
+ const MachineInstr *LastTrackedMI = nullptr;
+ mutable const MachineRegisterInfo *MRI = nullptr;
+ GCNRPTracker() {}
+public:
+ // Live registers for the current state.
+ const decltype(LiveRegs) &getLiveRegs() const { return LiveRegs; }
+ const MachineInstr *getLastTrackedMI() const { return LastTrackedMI; }
+
+ // Returns MaxPressure, resetting it.
+ decltype(MaxPressure) moveMaxPressure() {
+ auto Res = MaxPressure;
+ MaxPressure.clear();
+ return Res;
+ }
+ decltype(LiveRegs) moveLiveRegs() {
+ return std::move(LiveRegs);
+ }
+};
+
+class GCNUpwardRPTracker : public GCNRPTracker {
+ const LiveIntervals &LIS;
+ LaneBitmask getDefRegMask(const MachineOperand &MO) const;
+ LaneBitmask getUsedRegMask(const MachineOperand &MO) const;
+public:
+ GCNUpwardRPTracker(const LiveIntervals &LIS_) : LIS(LIS_) {}
+ // Resets the tracker to the point just below MI,
+ // filling the live regs at this point using LIS.
+ void reset(const MachineInstr &MI);
+
+ // Moves to the state just above MI.
+ void recede(const MachineInstr &MI);
+
+ // Checks whether the tracker's state after receding MI corresponds
+ // to the one reported by LIS.
+ bool isValid() const;
+};
+
+LaneBitmask getLiveLaneMask(unsigned Reg,
+ SlotIndex SI,
+ const LiveIntervals &LIS,
+ const MachineRegisterInfo &MRI);
+
+GCNRPTracker::LiveRegSet getLiveRegs(SlotIndex SI,
+ const LiveIntervals &LIS,
+ const MachineRegisterInfo &MRI);
+
+inline GCNRPTracker::LiveRegSet getLiveRegsAfter(const MachineInstr &MI,
+ const LiveIntervals &LIS) {
+ return getLiveRegs(LIS.getInstructionIndex(MI).getDeadSlot(), LIS,
+ MI.getParent()->getParent()->getRegInfo());
+}
+
+inline GCNRPTracker::LiveRegSet getLiveRegsBefore(const MachineInstr &MI,
+ const LiveIntervals &LIS) {
+ return getLiveRegs(LIS.getInstructionIndex(MI).getBaseIndex(), LIS,
+ MI.getParent()->getParent()->getRegInfo());
+}
+
+template <typename Range>
+GCNRegPressure getRegPressure(const MachineRegisterInfo &MRI,
+ Range &&LiveRegs) {
+ GCNRegPressure Res;
+ for (const auto &RM : LiveRegs)
+ Res.inc(RM.first, LaneBitmask::getNone(), RM.second, MRI);
+ return Res;
+}
+
+void printLivesAt(SlotIndex SI,
+ const LiveIntervals &LIS,
+ const MachineRegisterInfo &MRI);
+
+} // End namespace llvm
+
+#endif // LLVM_LIB_TARGET_AMDGPU_GCNREGPRESSURE_H
diff --git a/lib/Target/AMDGPU/GCNSchedStrategy.cpp b/lib/Target/AMDGPU/GCNSchedStrategy.cpp
index 2f88033c807f..ea305a92fc60 100644
--- a/lib/Target/AMDGPU/GCNSchedStrategy.cpp
+++ b/lib/Target/AMDGPU/GCNSchedStrategy.cpp
@@ -18,6 +18,7 @@
#include "SIMachineFunctionInfo.h"
#include "SIRegisterInfo.h"
#include "llvm/CodeGen/RegisterClassInfo.h"
+#include "llvm/Support/MathExtras.h"
#define DEBUG_TYPE "misched"
@@ -25,7 +26,7 @@ using namespace llvm;
GCNMaxOccupancySchedStrategy::GCNMaxOccupancySchedStrategy(
const MachineSchedContext *C) :
- GenericScheduler(C) { }
+ GenericScheduler(C), TargetOccupancy(0), MF(nullptr) { }
static unsigned getMaxWaves(unsigned SGPRs, unsigned VGPRs,
const MachineFunction &MF) {
@@ -35,18 +36,46 @@ static unsigned getMaxWaves(unsigned SGPRs, unsigned VGPRs,
unsigned MinRegOccupancy = std::min(ST.getOccupancyWithNumSGPRs(SGPRs),
ST.getOccupancyWithNumVGPRs(VGPRs));
return std::min(MinRegOccupancy,
- ST.getOccupancyWithLocalMemSize(MFI->getLDSSize()));
+ ST.getOccupancyWithLocalMemSize(MFI->getLDSSize(),
+ *MF.getFunction()));
+}
+
+void GCNMaxOccupancySchedStrategy::initialize(ScheduleDAGMI *DAG) {
+ GenericScheduler::initialize(DAG);
+
+ const SIRegisterInfo *SRI = static_cast<const SIRegisterInfo*>(TRI);
+
+ MF = &DAG->MF;
+
+ const SISubtarget &ST = MF->getSubtarget<SISubtarget>();
+
+ // FIXME: This is also necessary, because some passes that run after
+ // scheduling and before regalloc increase register pressure.
+ const int ErrorMargin = 3;
+
+ SGPRExcessLimit = Context->RegClassInfo
+ ->getNumAllocatableRegs(&AMDGPU::SGPR_32RegClass) - ErrorMargin;
+ VGPRExcessLimit = Context->RegClassInfo
+ ->getNumAllocatableRegs(&AMDGPU::VGPR_32RegClass) - ErrorMargin;
+ if (TargetOccupancy) {
+ SGPRCriticalLimit = ST.getMaxNumSGPRs(TargetOccupancy, true);
+ VGPRCriticalLimit = ST.getMaxNumVGPRs(TargetOccupancy);
+ } else {
+ SGPRCriticalLimit = SRI->getRegPressureSetLimit(DAG->MF,
+ SRI->getSGPRPressureSet());
+ VGPRCriticalLimit = SRI->getRegPressureSetLimit(DAG->MF,
+ SRI->getVGPRPressureSet());
+ }
+
+ SGPRCriticalLimit -= ErrorMargin;
+ VGPRCriticalLimit -= ErrorMargin;
}
void GCNMaxOccupancySchedStrategy::initCandidate(SchedCandidate &Cand, SUnit *SU,
bool AtTop, const RegPressureTracker &RPTracker,
const SIRegisterInfo *SRI,
- int SGPRPressure,
- int VGPRPressure,
- int SGPRExcessLimit,
- int VGPRExcessLimit,
- int SGPRCriticalLimit,
- int VGPRCriticalLimit) {
+ unsigned SGPRPressure,
+ unsigned VGPRPressure) {
Cand.SU = SU;
Cand.AtTop = AtTop;
@@ -66,8 +95,8 @@ void GCNMaxOccupancySchedStrategy::initCandidate(SchedCandidate &Cand, SUnit *SU
TempTracker.getUpwardPressure(SU->getInstr(), Pressure, MaxPressure);
}
- int NewSGPRPressure = Pressure[SRI->getSGPRPressureSet()];
- int NewVGPRPressure = Pressure[SRI->getVGPRPressureSet()];
+ unsigned NewSGPRPressure = Pressure[SRI->getSGPRPressureSet()];
+ unsigned NewVGPRPressure = Pressure[SRI->getVGPRPressureSet()];
// If two instructions increase the pressure of different register sets
// by the same amount, the generic scheduler will prefer to schedule the
@@ -77,7 +106,7 @@ void GCNMaxOccupancySchedStrategy::initCandidate(SchedCandidate &Cand, SUnit *SU
// only for VGPRs or only for SGPRs.
// FIXME: Better heuristics to determine whether to prefer SGPRs or VGPRs.
- const int MaxVGPRPressureInc = 16;
+ const unsigned MaxVGPRPressureInc = 16;
bool ShouldTrackVGPRs = VGPRPressure + MaxVGPRPressureInc >= VGPRExcessLimit;
bool ShouldTrackSGPRs = !ShouldTrackVGPRs && SGPRPressure >= SGPRExcessLimit;
@@ -86,11 +115,6 @@ void GCNMaxOccupancySchedStrategy::initCandidate(SchedCandidate &Cand, SUnit *SU
// to increase the likelihood we don't go over the limits. We should improve
// the analysis to look through dependencies to find the path with the least
// register pressure.
- // FIXME: This is also necessary, because some passes that run after
- // scheduling and before regalloc increase register pressure.
- const int ErrorMargin = 3;
- VGPRExcessLimit -= ErrorMargin;
- SGPRExcessLimit -= ErrorMargin;
// We only need to update the RPDelata for instructions that increase
// register pressure. Instructions that decrease or keep reg pressure
@@ -103,7 +127,7 @@ void GCNMaxOccupancySchedStrategy::initCandidate(SchedCandidate &Cand, SUnit *SU
if (ShouldTrackSGPRs && NewSGPRPressure >= SGPRExcessLimit) {
Cand.RPDelta.Excess = PressureChange(SRI->getSGPRPressureSet());
- Cand.RPDelta.Excess.setUnitInc(NewSGPRPressure = SGPRExcessLimit);
+ Cand.RPDelta.Excess.setUnitInc(NewSGPRPressure - SGPRExcessLimit);
}
// Register pressure is considered 'CRITICAL' if it is approaching a value
@@ -111,9 +135,6 @@ void GCNMaxOccupancySchedStrategy::initCandidate(SchedCandidate &Cand, SUnit *SU
// register pressure is 'CRITICAL', increasing SGPR and VGPR pressure both
// has the same cost, so we don't need to prefer one over the other.
- VGPRCriticalLimit -= ErrorMargin;
- SGPRCriticalLimit -= ErrorMargin;
-
int SGPRDelta = NewSGPRPressure - SGPRCriticalLimit;
int VGPRDelta = NewVGPRPressure - VGPRCriticalLimit;
@@ -134,27 +155,16 @@ void GCNMaxOccupancySchedStrategy::pickNodeFromQueue(SchedBoundary &Zone,
const CandPolicy &ZonePolicy,
const RegPressureTracker &RPTracker,
SchedCandidate &Cand) {
- const SISubtarget &ST = DAG->MF.getSubtarget<SISubtarget>();
const SIRegisterInfo *SRI = static_cast<const SIRegisterInfo*>(TRI);
ArrayRef<unsigned> Pressure = RPTracker.getRegSetPressureAtPos();
unsigned SGPRPressure = Pressure[SRI->getSGPRPressureSet()];
unsigned VGPRPressure = Pressure[SRI->getVGPRPressureSet()];
- unsigned SGPRExcessLimit =
- Context->RegClassInfo->getNumAllocatableRegs(&AMDGPU::SGPR_32RegClass);
- unsigned VGPRExcessLimit =
- Context->RegClassInfo->getNumAllocatableRegs(&AMDGPU::VGPR_32RegClass);
- unsigned MaxWaves = getMaxWaves(SGPRPressure, VGPRPressure, DAG->MF);
- unsigned SGPRCriticalLimit = SRI->getMaxNumSGPRs(ST, MaxWaves, true);
- unsigned VGPRCriticalLimit = SRI->getMaxNumVGPRs(MaxWaves);
-
ReadyQueue &Q = Zone.Available;
for (SUnit *SU : Q) {
SchedCandidate TryCand(ZonePolicy);
initCandidate(TryCand, SU, Zone.isTop(), RPTracker, SRI,
- SGPRPressure, VGPRPressure,
- SGPRExcessLimit, VGPRExcessLimit,
- SGPRCriticalLimit, VGPRCriticalLimit);
+ SGPRPressure, VGPRPressure);
// Pass SchedBoundary only when comparing nodes from the same boundary.
SchedBoundary *ZoneArg = Cand.AtTop == TryCand.AtTop ? &Zone : nullptr;
GenericScheduler::tryCandidate(Cand, TryCand, ZoneArg);
@@ -167,16 +177,6 @@ void GCNMaxOccupancySchedStrategy::pickNodeFromQueue(SchedBoundary &Zone,
}
}
-static int getBidirectionalReasonRank(GenericSchedulerBase::CandReason Reason) {
- switch (Reason) {
- default:
- return Reason;
- case GenericSchedulerBase::RegCritical:
- case GenericSchedulerBase::RegExcess:
- return -Reason;
- }
-}
-
// This function is mostly cut and pasted from
// GenericScheduler::pickNodeBidirectional()
SUnit *GCNMaxOccupancySchedStrategy::pickNodeBidirectional(bool &IsTopNode) {
@@ -224,9 +224,9 @@ SUnit *GCNMaxOccupancySchedStrategy::pickNodeBidirectional(bool &IsTopNode) {
// Pick best from BotCand and TopCand.
DEBUG(
dbgs() << "Top Cand: ";
- traceCandidate(BotCand);
- dbgs() << "Bot Cand: ";
traceCandidate(TopCand);
+ dbgs() << "Bot Cand: ";
+ traceCandidate(BotCand);
);
SchedCandidate Cand;
if (TopCand.Reason == BotCand.Reason) {
@@ -249,9 +249,7 @@ SUnit *GCNMaxOccupancySchedStrategy::pickNodeBidirectional(bool &IsTopNode) {
} else if (BotCand.Reason == RegCritical && BotCand.RPDelta.CriticalMax.getUnitInc() <= 0) {
Cand = BotCand;
} else {
- int TopRank = getBidirectionalReasonRank(TopCand.Reason);
- int BotRank = getBidirectionalReasonRank(BotCand.Reason);
- if (TopRank > BotRank) {
+ if (BotCand.Reason > TopCand.Reason) {
Cand = TopCand;
} else {
Cand = BotCand;
@@ -310,3 +308,255 @@ SUnit *GCNMaxOccupancySchedStrategy::pickNode(bool &IsTopNode) {
DEBUG(dbgs() << "Scheduling SU(" << SU->NodeNum << ") " << *SU->getInstr());
return SU;
}
+
+GCNScheduleDAGMILive::GCNScheduleDAGMILive(MachineSchedContext *C,
+ std::unique_ptr<MachineSchedStrategy> S) :
+ ScheduleDAGMILive(C, std::move(S)),
+ ST(MF.getSubtarget<SISubtarget>()),
+ MFI(*MF.getInfo<SIMachineFunctionInfo>()),
+ StartingOccupancy(ST.getOccupancyWithLocalMemSize(MFI.getLDSSize(),
+ *MF.getFunction())),
+ MinOccupancy(StartingOccupancy), Stage(0) {
+
+ DEBUG(dbgs() << "Starting occupancy is " << StartingOccupancy << ".\n");
+}
+
+void GCNScheduleDAGMILive::schedule() {
+ std::vector<MachineInstr*> Unsched;
+ Unsched.reserve(NumRegionInstrs);
+ for (auto &I : *this)
+ Unsched.push_back(&I);
+
+ std::pair<unsigned, unsigned> PressureBefore;
+ if (LIS) {
+ DEBUG(dbgs() << "Pressure before scheduling:\n");
+ discoverLiveIns();
+ PressureBefore = getRealRegPressure();
+ }
+
+ ScheduleDAGMILive::schedule();
+ if (Stage == 0)
+ Regions.push_back(std::make_pair(RegionBegin, RegionEnd));
+
+ if (!LIS)
+ return;
+
+ // Check the results of scheduling.
+ GCNMaxOccupancySchedStrategy &S = (GCNMaxOccupancySchedStrategy&)*SchedImpl;
+ DEBUG(dbgs() << "Pressure after scheduling:\n");
+ auto PressureAfter = getRealRegPressure();
+ LiveIns.clear();
+
+ if (PressureAfter.first <= S.SGPRCriticalLimit &&
+ PressureAfter.second <= S.VGPRCriticalLimit) {
+ DEBUG(dbgs() << "Pressure in desired limits, done.\n");
+ return;
+ }
+ unsigned WavesAfter = getMaxWaves(PressureAfter.first,
+ PressureAfter.second, MF);
+ unsigned WavesBefore = getMaxWaves(PressureBefore.first,
+ PressureBefore.second, MF);
+ DEBUG(dbgs() << "Occupancy before scheduling: " << WavesBefore <<
+ ", after " << WavesAfter << ".\n");
+
+ // We could not keep the current target occupancy because of the just
+ // scheduled region. Record the new occupancy for the next scheduling cycle.
+ unsigned NewOccupancy = std::max(WavesAfter, WavesBefore);
+ if (NewOccupancy < MinOccupancy) {
+ MinOccupancy = NewOccupancy;
+ DEBUG(dbgs() << "Occupancy lowered for the function to "
+ << MinOccupancy << ".\n");
+ }
+
+ if (WavesAfter >= WavesBefore)
+ return;
+
+ DEBUG(dbgs() << "Attempting to revert scheduling.\n");
+ RegionEnd = RegionBegin;
+ for (MachineInstr *MI : Unsched) {
+ if (MI->getIterator() != RegionEnd) {
+ BB->remove(MI);
+ BB->insert(RegionEnd, MI);
+ LIS->handleMove(*MI, true);
+ }
+ // Reset read-undef flags and update them later.
+ for (auto &Op : MI->operands())
+ if (Op.isReg() && Op.isDef())
+ Op.setIsUndef(false);
+ RegisterOperands RegOpers;
+ RegOpers.collect(*MI, *TRI, MRI, ShouldTrackLaneMasks, false);
+ if (ShouldTrackLaneMasks) {
+ // Adjust liveness and add missing dead+read-undef flags.
+ SlotIndex SlotIdx = LIS->getInstructionIndex(*MI).getRegSlot();
+ RegOpers.adjustLaneLiveness(*LIS, MRI, SlotIdx, MI);
+ } else {
+ // Adjust for missing dead-def flags.
+ RegOpers.detectDeadDefs(*MI, *LIS);
+ }
+ RegionEnd = MI->getIterator();
+ ++RegionEnd;
+ DEBUG(dbgs() << "Scheduling " << *MI);
+ }
+ RegionBegin = Unsched.front()->getIterator();
+ if (Stage == 0)
+ Regions.back() = std::make_pair(RegionBegin, RegionEnd);
+
+ placeDebugValues();
+}
+
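+// Records the lane mask transition of Reg in PrevMask and adjusts the
+// running SGPR/VGPR counts by the number of 32-bit lanes added or removed.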
+static inline void setMask(const MachineRegisterInfo &MRI,
+ const SIRegisterInfo *SRI, unsigned Reg,
+ LaneBitmask &PrevMask, LaneBitmask NewMask,
+ unsigned &SGPRs, unsigned &VGPRs) {
+ int NewRegs = countPopulation(NewMask.getAsInteger()) -
+ countPopulation(PrevMask.getAsInteger());
+ if (SRI->isSGPRReg(MRI, Reg))
+ SGPRs += NewRegs;
+ if (SRI->isVGPR(MRI, Reg))
+ VGPRs += NewRegs;
+ assert((int)SGPRs >= 0 && (int)VGPRs >= 0);
+ PrevMask = NewMask;
+}
+
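+// Walks all virtual registers and records those live at the first
+// instruction of the region, together with their live lane masks, in
+// LiveIns; the resulting SGPR/VGPR counts form the region live-in pressure.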
+void GCNScheduleDAGMILive::discoverLiveIns() {
+ unsigned SGPRs = 0;
+ unsigned VGPRs = 0;
+
+ const SIRegisterInfo *SRI = static_cast<const SIRegisterInfo*>(TRI);
+ SlotIndex SI = LIS->getInstructionIndex(*begin()).getBaseIndex();
+ assert(SI.isValid());
+
+ DEBUG(dbgs() << "Region live-ins:");
+ for (unsigned I = 0, E = MRI.getNumVirtRegs(); I != E; ++I) {
+ unsigned Reg = TargetRegisterInfo::index2VirtReg(I);
+ if (MRI.reg_nodbg_empty(Reg))
+ continue;
+ const LiveInterval &LI = LIS->getInterval(Reg);
+ LaneBitmask LaneMask = LaneBitmask::getNone();
+ if (LI.hasSubRanges()) {
+ for (const auto &S : LI.subranges())
+ if (S.liveAt(SI))
+ LaneMask |= S.LaneMask;
+ } else if (LI.liveAt(SI)) {
+ LaneMask = MRI.getMaxLaneMaskForVReg(Reg);
+ }
+
+ if (LaneMask.any()) {
+ setMask(MRI, SRI, Reg, LiveIns[Reg], LaneMask, SGPRs, VGPRs);
+
+ DEBUG(dbgs() << ' ' << PrintVRegOrUnit(Reg, SRI) << ':'
+ << PrintLaneMask(LiveIns[Reg]));
+ }
+ }
+
+ LiveInPressure = std::make_pair(SGPRs, VGPRs);
+
+ DEBUG(dbgs() << "\nLive-in pressure:\nSGPR = " << SGPRs
+ << "\nVGPR = " << VGPRs << '\n');
+}
+
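+// Computes the maximum SGPR/VGPR pressure over the region by replaying
+// liveness from the recorded live-ins: masks are pruned where intervals die
+// and extended at defs, taking the running maximum after each instruction.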
+std::pair<unsigned, unsigned>
+GCNScheduleDAGMILive::getRealRegPressure() const {
+ unsigned SGPRs, MaxSGPRs, VGPRs, MaxVGPRs;
+ SGPRs = MaxSGPRs = LiveInPressure.first;
+ VGPRs = MaxVGPRs = LiveInPressure.second;
+
+ const SIRegisterInfo *SRI = static_cast<const SIRegisterInfo*>(TRI);
+ DenseMap<unsigned, LaneBitmask> LiveRegs(LiveIns);
+
+ for (const MachineInstr &MI : *this) {
+ if (MI.isDebugValue())
+ continue;
+ SlotIndex SI = LIS->getInstructionIndex(MI).getBaseIndex();
+ assert(SI.isValid());
+
+ // Remove dead registers or mask bits.
+ for (auto &It : LiveRegs) {
+ if (It.second.none())
+ continue;
+ const LiveInterval &LI = LIS->getInterval(It.first);
+ if (LI.hasSubRanges()) {
+ for (const auto &S : LI.subranges())
+ if (!S.liveAt(SI))
+ setMask(MRI, SRI, It.first, It.second, It.second & ~S.LaneMask,
+ SGPRs, VGPRs);
+ } else if (!LI.liveAt(SI)) {
+ setMask(MRI, SRI, It.first, It.second, LaneBitmask::getNone(),
+ SGPRs, VGPRs);
+ }
+ }
+
+ // Add new registers or mask bits.
+ for (const auto &MO : MI.defs()) {
+ if (!MO.isReg())
+ continue;
+ unsigned Reg = MO.getReg();
+ if (!TargetRegisterInfo::isVirtualRegister(Reg))
+ continue;
+ unsigned SubRegIdx = MO.getSubReg();
+ LaneBitmask LaneMask = SubRegIdx != 0
+ ? TRI->getSubRegIndexLaneMask(SubRegIdx)
+ : MRI.getMaxLaneMaskForVReg(Reg);
+ LaneBitmask &LM = LiveRegs[Reg];
+ setMask(MRI, SRI, Reg, LM, LM | LaneMask, SGPRs, VGPRs);
+ }
+ MaxSGPRs = std::max(MaxSGPRs, SGPRs);
+ MaxVGPRs = std::max(MaxVGPRs, VGPRs);
+ }
+
+ DEBUG(dbgs() << "Real region's register pressure:\nSGPR = " << MaxSGPRs
+ << "\nVGPR = " << MaxVGPRs << '\n');
+
+ return std::make_pair(MaxSGPRs, MaxVGPRs);
+}
+
+void GCNScheduleDAGMILive::finalizeSchedule() {
+ // Retry function scheduling if the resulting occupancy turned out to be
+ // lower than the one used for the first scheduling pass. This gives more
+ // freedom to schedule low register pressure blocks.
+ // Code is partially copied from MachineSchedulerBase::scheduleRegions().
+
+ if (!LIS || StartingOccupancy <= MinOccupancy)
+ return;
+
+ DEBUG(dbgs() << "Retrying function scheduling with lowest recorded occupancy "
+ << MinOccupancy << ".\n");
+
+ Stage++;
+ GCNMaxOccupancySchedStrategy &S = (GCNMaxOccupancySchedStrategy&)*SchedImpl;
+ S.setTargetOccupancy(MinOccupancy);
+
+ MachineBasicBlock *MBB = nullptr;
+ for (auto Region : Regions) {
+ RegionBegin = Region.first;
+ RegionEnd = Region.second;
+
+ if (RegionBegin->getParent() != MBB) {
+ if (MBB) finishBlock();
+ MBB = RegionBegin->getParent();
+ startBlock(MBB);
+ }
+
+ unsigned NumRegionInstrs = std::distance(begin(), end());
+ enterRegion(MBB, begin(), end(), NumRegionInstrs);
+
+ // Skip empty scheduling regions (0 or 1 schedulable instructions).
+ if (begin() == end() || begin() == std::prev(end())) {
+ exitRegion();
+ continue;
+ }
+ DEBUG(dbgs() << "********** MI Scheduling **********\n");
+ DEBUG(dbgs() << MF.getName()
+ << ":BB#" << MBB->getNumber() << " " << MBB->getName()
+ << "\n From: " << *begin() << " To: ";
+ if (RegionEnd != MBB->end()) dbgs() << *RegionEnd;
+ else dbgs() << "End";
+ dbgs() << " RegionInstrs: " << NumRegionInstrs << '\n');
+
+ schedule();
+
+ exitRegion();
+ }
+ finishBlock();
+ LiveIns.shrink_and_clear();
+}
diff --git a/lib/Target/AMDGPU/GCNSchedStrategy.h b/lib/Target/AMDGPU/GCNSchedStrategy.h
index 4cfc0cea81fb..15af232704ff 100644
--- a/lib/Target/AMDGPU/GCNSchedStrategy.h
+++ b/lib/Target/AMDGPU/GCNSchedStrategy.h
@@ -18,13 +18,16 @@
namespace llvm {
+class SIMachineFunctionInfo;
class SIRegisterInfo;
+class SISubtarget;
/// This is a minimal scheduler strategy. The main difference between this
/// and the GenericScheduler is that GCNSchedStrategy uses different
/// heuristics to determine excess/critical pressure sets. Its goal is to
/// maximize kernel occupancy (i.e. maximum number of waves per simd).
class GCNMaxOccupancySchedStrategy : public GenericScheduler {
+ friend class GCNScheduleDAGMILive;
SUnit *pickNodeBidirectional(bool &IsTopNode);
@@ -35,18 +38,65 @@ class GCNMaxOccupancySchedStrategy : public GenericScheduler {
void initCandidate(SchedCandidate &Cand, SUnit *SU,
bool AtTop, const RegPressureTracker &RPTracker,
const SIRegisterInfo *SRI,
- int SGPRPressure, int VGPRPressure,
- int SGPRExcessLimit, int VGPRExcessLimit,
- int SGPRCriticalLimit, int VGPRCriticalLimit);
+ unsigned SGPRPressure, unsigned VGPRPressure);
- void tryCandidate(SchedCandidate &Cand, SchedCandidate &TryCand,
- SchedBoundary *Zone, const SIRegisterInfo *SRI,
- unsigned SGPRPressure, unsigned VGPRPressure);
+ unsigned SGPRExcessLimit;
+ unsigned VGPRExcessLimit;
+ unsigned SGPRCriticalLimit;
+ unsigned VGPRCriticalLimit;
+
+ unsigned TargetOccupancy;
+
+ MachineFunction *MF;
public:
GCNMaxOccupancySchedStrategy(const MachineSchedContext *C);
SUnit *pickNode(bool &IsTopNode) override;
+
+ void initialize(ScheduleDAGMI *DAG) override;
+
+ void setTargetOccupancy(unsigned Occ) { TargetOccupancy = Occ; }
+};
+
+class GCNScheduleDAGMILive : public ScheduleDAGMILive {
+
+ const SISubtarget &ST;
+
+ const SIMachineFunctionInfo &MFI;
+
+ // Occupancy target at the beginning of the function scheduling cycle.
+ unsigned StartingOccupancy;
+
+ // Minimal real occupancy recorded for the function.
+ unsigned MinOccupancy;
+
+ // Scheduling stage number.
+ unsigned Stage;
+
+ // Vector of regions recorded for later rescheduling.
+ SmallVector<std::pair<MachineBasicBlock::iterator,
+ MachineBasicBlock::iterator>, 32> Regions;
+
+ // Region live-ins.
+ DenseMap<unsigned, LaneBitmask> LiveIns;
+
+ // Number of live-ins to the current region, first SGPR then VGPR.
+ std::pair<unsigned, unsigned> LiveInPressure;
+
+ // Collect current region live-ins.
+ void discoverLiveIns();
+
+ // Return current region pressure. First value is SGPR number, second is VGPR.
+ std::pair<unsigned, unsigned> getRealRegPressure() const;
+
+public:
+ GCNScheduleDAGMILive(MachineSchedContext *C,
+ std::unique_ptr<MachineSchedStrategy> S);
+
+ void schedule() override;
+
+ void finalizeSchedule() override;
};
} // End namespace llvm
diff --git a/lib/Target/AMDGPU/InstPrinter/AMDGPUInstPrinter.cpp b/lib/Target/AMDGPU/InstPrinter/AMDGPUInstPrinter.cpp
index 7172a0aa7167..a817ff3cbaf0 100644
--- a/lib/Target/AMDGPU/InstPrinter/AMDGPUInstPrinter.cpp
+++ b/lib/Target/AMDGPU/InstPrinter/AMDGPUInstPrinter.cpp
@@ -113,7 +113,7 @@ void AMDGPUInstPrinter::printOffset(const MCInst *MI, unsigned OpNo,
raw_ostream &O) {
uint16_t Imm = MI->getOperand(OpNo).getImm();
if (Imm != 0) {
- O << " offset:";
+ O << ((OpNo == 0)? "offset:" : " offset:");
printU16ImmDecOperand(MI, OpNo, O);
}
}
@@ -375,6 +375,14 @@ void AMDGPUInstPrinter::printImmediate16(uint32_t Imm,
O << formatHex(static_cast<uint64_t>(Imm));
}
+void AMDGPUInstPrinter::printImmediateV216(uint32_t Imm,
+ const MCSubtargetInfo &STI,
+ raw_ostream &O) {
+ uint16_t Lo16 = static_cast<uint16_t>(Imm);
+ assert(Lo16 == static_cast<uint16_t>(Imm >> 16));
+ printImmediate16(Lo16, STI, O);
+}
+
void AMDGPUInstPrinter::printImmediate32(uint32_t Imm,
const MCSubtargetInfo &STI,
raw_ostream &O) {
@@ -489,6 +497,10 @@ void AMDGPUInstPrinter::printOperand(const MCInst *MI, unsigned OpNo,
case AMDGPU::OPERAND_REG_IMM_FP16:
printImmediate16(Op.getImm(), STI, O);
break;
+ case AMDGPU::OPERAND_REG_INLINE_C_V2FP16:
+ case AMDGPU::OPERAND_REG_INLINE_C_V2INT16:
+ printImmediateV216(Op.getImm(), STI, O);
+ break;
case MCOI::OPERAND_UNKNOWN:
case MCOI::OPERAND_PCREL:
O << formatDec(Op.getImm());
@@ -531,13 +543,34 @@ void AMDGPUInstPrinter::printOperandAndFPInputMods(const MCInst *MI,
const MCSubtargetInfo &STI,
raw_ostream &O) {
unsigned InputModifiers = MI->getOperand(OpNo).getImm();
- if (InputModifiers & SISrcMods::NEG)
- O << '-';
+
+ // Use 'neg(...)' instead of '-' to avoid ambiguity.
+ // This is important for integer literals because
+ // -1 is not the same value as neg(1).
+ bool NegMnemo = false;
+
+ if (InputModifiers & SISrcMods::NEG) {
+ if (OpNo + 1 < MI->getNumOperands() &&
+ (InputModifiers & SISrcMods::ABS) == 0) {
+ const MCOperand &Op = MI->getOperand(OpNo + 1);
+ NegMnemo = Op.isImm() || Op.isFPImm();
+ }
+ if (NegMnemo) {
+ O << "neg(";
+ } else {
+ O << '-';
+ }
+ }
+
if (InputModifiers & SISrcMods::ABS)
O << '|';
printOperand(MI, OpNo + 1, STI, O);
if (InputModifiers & SISrcMods::ABS)
O << '|';
+
+ if (NegMnemo) {
+ O << ')';
+ }
}
void AMDGPUInstPrinter::printOperandAndIntInputMods(const MCInst *MI,
@@ -672,11 +705,19 @@ template <unsigned N>
void AMDGPUInstPrinter::printExpSrcN(const MCInst *MI, unsigned OpNo,
const MCSubtargetInfo &STI,
raw_ostream &O) {
- int EnIdx = AMDGPU::getNamedOperandIdx(MI->getOpcode(), AMDGPU::OpName::en);
+ unsigned Opc = MI->getOpcode();
+ int EnIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::en);
unsigned En = MI->getOperand(EnIdx).getImm();
- // FIXME: What do we do with compr? The meaning of en changes depending on if
- // compr is set.
+ int ComprIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::compr);
+
+ // If compr is set, print as src0, src0, src1, src1
+ if (MI->getOperand(ComprIdx).getImm()) {
+ if (N == 1 || N == 2)
+ --OpNo;
+ else if (N == 3)
+ OpNo -= 2;
+ }
if (En & (1 << N))
printRegOperand(MI->getOperand(OpNo).getReg(), O, MRI);
@@ -730,6 +771,71 @@ void AMDGPUInstPrinter::printExpTgt(const MCInst *MI, unsigned OpNo,
}
}
+static bool allOpsDefaultValue(const int* Ops, int NumOps, int Mod) {
+ int DefaultValue = (Mod == SISrcMods::OP_SEL_1);
+
+ for (int I = 0; I < NumOps; ++I) {
+ if (!!(Ops[I] & Mod) != DefaultValue)
+ return false;
+ }
+
+ return true;
+}
+
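+// Prints a packed-math modifier list such as " op_sel:[0,1,0]". Nothing is
+// printed when every present source operand carries the default value for
+// Mod.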
+static void printPackedModifier(const MCInst *MI, StringRef Name, unsigned Mod,
+ raw_ostream &O) {
+ unsigned Opc = MI->getOpcode();
+ int NumOps = 0;
+ int Ops[3];
+
+ for (int OpName : { AMDGPU::OpName::src0_modifiers,
+ AMDGPU::OpName::src1_modifiers,
+ AMDGPU::OpName::src2_modifiers }) {
+ int Idx = AMDGPU::getNamedOperandIdx(Opc, OpName);
+ if (Idx == -1)
+ break;
+
+ Ops[NumOps++] = MI->getOperand(Idx).getImm();
+ }
+
+ if (allOpsDefaultValue(Ops, NumOps, Mod))
+ return;
+
+ O << Name;
+ for (int I = 0; I < NumOps; ++I) {
+ if (I != 0)
+ O << ',';
+
+ O << !!(Ops[I] & Mod);
+ }
+
+ O << ']';
+}
+
+void AMDGPUInstPrinter::printOpSel(const MCInst *MI, unsigned,
+ const MCSubtargetInfo &STI,
+ raw_ostream &O) {
+ printPackedModifier(MI, " op_sel:[", SISrcMods::OP_SEL_0, O);
+}
+
+void AMDGPUInstPrinter::printOpSelHi(const MCInst *MI, unsigned OpNo,
+ const MCSubtargetInfo &STI,
+ raw_ostream &O) {
+ printPackedModifier(MI, " op_sel_hi:[", SISrcMods::OP_SEL_1, O);
+}
+
+void AMDGPUInstPrinter::printNegLo(const MCInst *MI, unsigned OpNo,
+ const MCSubtargetInfo &STI,
+ raw_ostream &O) {
+ printPackedModifier(MI, " neg_lo:[", SISrcMods::NEG, O);
+}
+
+void AMDGPUInstPrinter::printNegHi(const MCInst *MI, unsigned OpNo,
+ const MCSubtargetInfo &STI,
+ raw_ostream &O) {
+ printPackedModifier(MI, " neg_hi:[", SISrcMods::NEG_HI, O);
+}
+
void AMDGPUInstPrinter::printInterpSlot(const MCInst *MI, unsigned OpNum,
const MCSubtargetInfo &STI,
raw_ostream &O) {
@@ -1057,27 +1163,28 @@ void AMDGPUInstPrinter::printSendMsg(const MCInst *MI, unsigned OpNo,
void AMDGPUInstPrinter::printWaitFlag(const MCInst *MI, unsigned OpNo,
const MCSubtargetInfo &STI,
raw_ostream &O) {
- IsaVersion IV = getIsaVersion(STI.getFeatureBits());
+ AMDGPU::IsaInfo::IsaVersion ISA =
+ AMDGPU::IsaInfo::getIsaVersion(STI.getFeatureBits());
unsigned SImm16 = MI->getOperand(OpNo).getImm();
unsigned Vmcnt, Expcnt, Lgkmcnt;
- decodeWaitcnt(IV, SImm16, Vmcnt, Expcnt, Lgkmcnt);
+ decodeWaitcnt(ISA, SImm16, Vmcnt, Expcnt, Lgkmcnt);
bool NeedSpace = false;
- if (Vmcnt != getVmcntBitMask(IV)) {
+ if (Vmcnt != getVmcntBitMask(ISA)) {
O << "vmcnt(" << Vmcnt << ')';
NeedSpace = true;
}
- if (Expcnt != getExpcntBitMask(IV)) {
+ if (Expcnt != getExpcntBitMask(ISA)) {
if (NeedSpace)
O << ' ';
O << "expcnt(" << Expcnt << ')';
NeedSpace = true;
}
- if (Lgkmcnt != getLgkmcntBitMask(IV)) {
+ if (Lgkmcnt != getLgkmcntBitMask(ISA)) {
if (NeedSpace)
O << ' ';
O << "lgkmcnt(" << Lgkmcnt << ')';
diff --git a/lib/Target/AMDGPU/InstPrinter/AMDGPUInstPrinter.h b/lib/Target/AMDGPU/InstPrinter/AMDGPUInstPrinter.h
index a6d348ff0f12..c0b8e5c51089 100644
--- a/lib/Target/AMDGPU/InstPrinter/AMDGPUInstPrinter.h
+++ b/lib/Target/AMDGPU/InstPrinter/AMDGPUInstPrinter.h
@@ -90,6 +90,8 @@ private:
raw_ostream &O);
void printImmediate16(uint32_t Imm, const MCSubtargetInfo &STI,
raw_ostream &O);
+ void printImmediateV216(uint32_t Imm, const MCSubtargetInfo &STI,
+ raw_ostream &O);
void printImmediate32(uint32_t Imm, const MCSubtargetInfo &STI,
raw_ostream &O);
void printImmediate64(uint64_t Imm, const MCSubtargetInfo &STI,
@@ -117,6 +119,14 @@ private:
const MCSubtargetInfo &STI, raw_ostream &O);
void printSDWADstUnused(const MCInst *MI, unsigned OpNo,
const MCSubtargetInfo &STI, raw_ostream &O);
+ void printOpSel(const MCInst *MI, unsigned OpNo,
+ const MCSubtargetInfo &STI, raw_ostream &O);
+ void printOpSelHi(const MCInst *MI, unsigned OpNo,
+ const MCSubtargetInfo &STI, raw_ostream &O);
+ void printNegLo(const MCInst *MI, unsigned OpNo,
+ const MCSubtargetInfo &STI, raw_ostream &O);
+ void printNegHi(const MCInst *MI, unsigned OpNo,
+ const MCSubtargetInfo &STI, raw_ostream &O);
void printInterpSlot(const MCInst *MI, unsigned OpNo,
const MCSubtargetInfo &STI, raw_ostream &O);
void printInterpAttr(const MCInst *MI, unsigned OpNo,
diff --git a/lib/Target/AMDGPU/LLVMBuild.txt b/lib/Target/AMDGPU/LLVMBuild.txt
index bbdd17737cf0..c54a13c4b4d8 100644
--- a/lib/Target/AMDGPU/LLVMBuild.txt
+++ b/lib/Target/AMDGPU/LLVMBuild.txt
@@ -30,5 +30,5 @@ has_disassembler = 1
type = Library
name = AMDGPUCodeGen
parent = AMDGPU
-required_libraries = Analysis AsmPrinter CodeGen Core IPO MC AMDGPUAsmPrinter AMDGPUDesc AMDGPUInfo AMDGPUUtils Scalar SelectionDAG Support Target TransformUtils Vectorize
+required_libraries = Analysis AsmPrinter CodeGen Core IPO MC AMDGPUAsmPrinter AMDGPUDesc AMDGPUInfo AMDGPUUtils Scalar SelectionDAG Support Target TransformUtils Vectorize GlobalISel
add_to_library_groups = AMDGPU
diff --git a/lib/Target/AMDGPU/MCTargetDesc/AMDGPUAsmBackend.cpp b/lib/Target/AMDGPU/MCTargetDesc/AMDGPUAsmBackend.cpp
index ffb92aae599e..f3266fe82955 100644
--- a/lib/Target/AMDGPU/MCTargetDesc/AMDGPUAsmBackend.cpp
+++ b/lib/Target/AMDGPU/MCTargetDesc/AMDGPUAsmBackend.cpp
@@ -37,7 +37,7 @@ public:
bool &IsResolved) override;
void applyFixup(const MCFixup &Fixup, char *Data, unsigned DataSize,
- uint64_t Value, bool IsPCRel) const override;
+ uint64_t Value, bool IsPCRel, MCContext &Ctx) const override;
bool fixupNeedsRelaxation(const MCFixup &Fixup, uint64_t Value,
const MCRelaxableFragment *DF,
const MCAsmLayout &Layout) const override {
@@ -131,7 +131,7 @@ void AMDGPUAsmBackend::processFixupValue(const MCAssembler &Asm,
void AMDGPUAsmBackend::applyFixup(const MCFixup &Fixup, char *Data,
unsigned DataSize, uint64_t Value,
- bool IsPCRel) const {
+ bool IsPCRel, MCContext &Ctx) const {
if (!Value)
return; // Doesn't change encoding.
@@ -164,7 +164,20 @@ const MCFixupKindInfo &AMDGPUAsmBackend::getFixupKindInfo(
}
bool AMDGPUAsmBackend::writeNopData(uint64_t Count, MCObjectWriter *OW) const {
- OW->WriteZeros(Count);
+ // If the count is not 4-byte aligned, we must be writing data into the text
+ // section (otherwise we have unaligned instructions, and thus have far
+ // bigger problems), so just write zeros instead.
+ OW->WriteZeros(Count % 4);
+
+ // We are properly aligned, so write NOPs as requested.
+ Count /= 4;
+
+ // FIXME: R600 support.
+ // s_nop 0
+ const uint32_t Encoded_S_NOP_0 = 0xbf800000;
+
+ for (uint64_t I = 0; I != Count; ++I)
+ OW->write32(Encoded_S_NOP_0);
return true;
}
diff --git a/lib/Target/AMDGPU/MCTargetDesc/AMDGPUCodeObjectMetadata.h b/lib/Target/AMDGPU/MCTargetDesc/AMDGPUCodeObjectMetadata.h
new file mode 100644
index 000000000000..816e8c744b27
--- /dev/null
+++ b/lib/Target/AMDGPU/MCTargetDesc/AMDGPUCodeObjectMetadata.h
@@ -0,0 +1,422 @@
+//===--- AMDGPUCodeObjectMetadata.h -----------------------------*- C++ -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+/// \file
+/// \brief AMDGPU Code Object Metadata definitions and in-memory
+/// representations.
+///
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_TARGET_AMDGPU_MCTARGETDESC_AMDGPUCODEOBJECTMETADATA_H
+#define LLVM_LIB_TARGET_AMDGPU_MCTARGETDESC_AMDGPUCODEOBJECTMETADATA_H
+
+#include <cstdint>
+#include <string>
+#include <system_error>
+#include <vector>
+
+namespace llvm {
+namespace AMDGPU {
+
+//===----------------------------------------------------------------------===//
+// Code Object Metadata.
+//===----------------------------------------------------------------------===//
+namespace CodeObject {
+
+/// \brief Code object metadata major version.
+constexpr uint32_t MetadataVersionMajor = 1;
+/// \brief Code object metadata minor version.
+constexpr uint32_t MetadataVersionMinor = 0;
+
+/// \brief Code object metadata beginning assembler directive.
+constexpr char MetadataAssemblerDirectiveBegin[] =
+ ".amdgpu_code_object_metadata";
+/// \brief Code object metadata ending assembler directive.
+constexpr char MetadataAssemblerDirectiveEnd[] =
+ ".end_amdgpu_code_object_metadata";
+
+/// \brief Access qualifiers.
+enum class AccessQualifier : uint8_t {
+ Default = 0,
+ ReadOnly = 1,
+ WriteOnly = 2,
+ ReadWrite = 3,
+ Unknown = 0xff
+};
+
+/// \brief Address space qualifiers.
+enum class AddressSpaceQualifier : uint8_t {
+ Private = 0,
+ Global = 1,
+ Constant = 2,
+ Local = 3,
+ Generic = 4,
+ Region = 5,
+ Unknown = 0xff
+};
+
+/// \brief Value kinds.
+enum class ValueKind : uint8_t {
+ ByValue = 0,
+ GlobalBuffer = 1,
+ DynamicSharedPointer = 2,
+ Sampler = 3,
+ Image = 4,
+ Pipe = 5,
+ Queue = 6,
+ HiddenGlobalOffsetX = 7,
+ HiddenGlobalOffsetY = 8,
+ HiddenGlobalOffsetZ = 9,
+ HiddenNone = 10,
+ HiddenPrintfBuffer = 11,
+ HiddenDefaultQueue = 12,
+ HiddenCompletionAction = 13,
+ Unknown = 0xff
+};
+
+/// \brief Value types.
+enum class ValueType : uint8_t {
+ Struct = 0,
+ I8 = 1,
+ U8 = 2,
+ I16 = 3,
+ U16 = 4,
+ F16 = 5,
+ I32 = 6,
+ U32 = 7,
+ F32 = 8,
+ I64 = 9,
+ U64 = 10,
+ F64 = 11,
+ Unknown = 0xff
+};
+
+//===----------------------------------------------------------------------===//
+// Kernel Metadata.
+//===----------------------------------------------------------------------===//
+namespace Kernel {
+
+//===----------------------------------------------------------------------===//
+// Kernel Attributes Metadata.
+//===----------------------------------------------------------------------===//
+namespace Attrs {
+
+namespace Key {
+/// \brief Key for Kernel::Attr::Metadata::mReqdWorkGroupSize.
+constexpr char ReqdWorkGroupSize[] = "ReqdWorkGroupSize";
+/// \brief Key for Kernel::Attr::Metadata::mWorkGroupSizeHint.
+constexpr char WorkGroupSizeHint[] = "WorkGroupSizeHint";
+/// \brief Key for Kernel::Attr::Metadata::mVecTypeHint.
+constexpr char VecTypeHint[] = "VecTypeHint";
+} // end namespace Key
+
+/// \brief In-memory representation of kernel attributes metadata.
+struct Metadata final {
+ /// \brief 'reqd_work_group_size' attribute. Optional.
+ std::vector<uint32_t> mReqdWorkGroupSize = std::vector<uint32_t>();
+ /// \brief 'work_group_size_hint' attribute. Optional.
+ std::vector<uint32_t> mWorkGroupSizeHint = std::vector<uint32_t>();
+ /// \brief 'vec_type_hint' attribute. Optional.
+ std::string mVecTypeHint = std::string();
+
+ /// \brief Default constructor.
+ Metadata() = default;
+
+ /// \returns True if kernel attributes metadata is empty, false otherwise.
+ bool empty() const {
+ return mReqdWorkGroupSize.empty() &&
+ mWorkGroupSizeHint.empty() &&
+ mVecTypeHint.empty();
+ }
+
+ /// \returns True if kernel attributes metadata is not empty, false otherwise.
+ bool notEmpty() const {
+ return !empty();
+ }
+};
+
+} // end namespace Attrs
+
+//===----------------------------------------------------------------------===//
+// Kernel Argument Metadata.
+//===----------------------------------------------------------------------===//
+namespace Arg {
+
+namespace Key {
+/// \brief Key for Kernel::Arg::Metadata::mSize.
+constexpr char Size[] = "Size";
+/// \brief Key for Kernel::Arg::Metadata::mAlign.
+constexpr char Align[] = "Align";
+/// \brief Key for Kernel::Arg::Metadata::mValueKind.
+constexpr char ValueKind[] = "ValueKind";
+/// \brief Key for Kernel::Arg::Metadata::mValueType.
+constexpr char ValueType[] = "ValueType";
+/// \brief Key for Kernel::Arg::Metadata::mPointeeAlign.
+constexpr char PointeeAlign[] = "PointeeAlign";
+/// \brief Key for Kernel::Arg::Metadata::mAccQual.
+constexpr char AccQual[] = "AccQual";
+/// \brief Key for Kernel::Arg::Metadata::mAddrSpaceQual.
+constexpr char AddrSpaceQual[] = "AddrSpaceQual";
+/// \brief Key for Kernel::Arg::Metadata::mIsConst.
+constexpr char IsConst[] = "IsConst";
+/// \brief Key for Kernel::Arg::Metadata::mIsPipe.
+constexpr char IsPipe[] = "IsPipe";
+/// \brief Key for Kernel::Arg::Metadata::mIsRestrict.
+constexpr char IsRestrict[] = "IsRestrict";
+/// \brief Key for Kernel::Arg::Metadata::mIsVolatile.
+constexpr char IsVolatile[] = "IsVolatile";
+/// \brief Key for Kernel::Arg::Metadata::mName.
+constexpr char Name[] = "Name";
+/// \brief Key for Kernel::Arg::Metadata::mTypeName.
+constexpr char TypeName[] = "TypeName";
+} // end namespace Key
+
+/// \brief In-memory representation of kernel argument metadata.
+struct Metadata final {
+ /// \brief Size in bytes. Required.
+ uint32_t mSize = 0;
+ /// \brief Alignment in bytes. Required.
+ uint32_t mAlign = 0;
+ /// \brief Value kind. Required.
+ ValueKind mValueKind = ValueKind::Unknown;
+ /// \brief Value type. Required.
+ ValueType mValueType = ValueType::Unknown;
+ /// \brief Pointee alignment in bytes. Optional.
+ uint32_t mPointeeAlign = 0;
+ /// \brief Access qualifier. Optional.
+ AccessQualifier mAccQual = AccessQualifier::Unknown;
+ /// \brief Address space qualifier. Optional.
+ AddressSpaceQualifier mAddrSpaceQual = AddressSpaceQualifier::Unknown;
+ /// \brief True if 'const' qualifier is specified. Optional.
+ bool mIsConst = false;
+ /// \brief True if 'pipe' qualifier is specified. Optional.
+ bool mIsPipe = false;
+ /// \brief True if 'restrict' qualifier is specified. Optional.
+ bool mIsRestrict = false;
+ /// \brief True if 'volatile' qualifier is specified. Optional.
+ bool mIsVolatile = false;
+ /// \brief Name. Optional.
+ std::string mName = std::string();
+ /// \brief Type name. Optional.
+ std::string mTypeName = std::string();
+
+ /// \brief Default constructor.
+ Metadata() = default;
+};
+
+} // end namespace Arg
+
+//===----------------------------------------------------------------------===//
+// Kernel Code Properties Metadata.
+//===----------------------------------------------------------------------===//
+namespace CodeProps {
+
+namespace Key {
+/// \brief Key for Kernel::CodeProps::Metadata::mKernargSegmentSize.
+constexpr char KernargSegmentSize[] = "KernargSegmentSize";
+/// \brief Key for Kernel::CodeProps::Metadata::mWorkgroupGroupSegmentSize.
+constexpr char WorkgroupGroupSegmentSize[] = "WorkgroupGroupSegmentSize";
+/// \brief Key for Kernel::CodeProps::Metadata::mWorkitemPrivateSegmentSize.
+constexpr char WorkitemPrivateSegmentSize[] = "WorkitemPrivateSegmentSize";
+/// \brief Key for Kernel::CodeProps::Metadata::mWavefrontNumSGPRs.
+constexpr char WavefrontNumSGPRs[] = "WavefrontNumSGPRs";
+/// \brief Key for Kernel::CodeProps::Metadata::mWorkitemNumVGPRs.
+constexpr char WorkitemNumVGPRs[] = "WorkitemNumVGPRs";
+/// \brief Key for Kernel::CodeProps::Metadata::mKernargSegmentAlign.
+constexpr char KernargSegmentAlign[] = "KernargSegmentAlign";
+/// \brief Key for Kernel::CodeProps::Metadata::mGroupSegmentAlign.
+constexpr char GroupSegmentAlign[] = "GroupSegmentAlign";
+/// \brief Key for Kernel::CodeProps::Metadata::mPrivateSegmentAlign.
+constexpr char PrivateSegmentAlign[] = "PrivateSegmentAlign";
+/// \brief Key for Kernel::CodeProps::Metadata::mWavefrontSize.
+constexpr char WavefrontSize[] = "WavefrontSize";
+} // end namespace Key
+
+/// \brief In-memory representation of kernel code properties metadata.
+struct Metadata final {
+ /// \brief Size in bytes of the kernarg segment memory. Kernarg segment memory
+ /// holds the values of the arguments to the kernel. Optional.
+ uint64_t mKernargSegmentSize = 0;
+ /// \brief Size in bytes of the group segment memory required by a workgroup.
+ /// This value does not include any dynamically allocated group segment memory
+ /// that may be added when the kernel is dispatched. Optional.
+ uint32_t mWorkgroupGroupSegmentSize = 0;
+ /// \brief Size in bytes of the private segment memory required by a workitem.
+ /// Private segment memory includes arg, spill and private segments. Optional.
+ uint32_t mWorkitemPrivateSegmentSize = 0;
+ /// \brief Total number of SGPRs used by a wavefront. Optional.
+ uint16_t mWavefrontNumSGPRs = 0;
+ /// \brief Total number of VGPRs used by a workitem. Optional.
+ uint16_t mWorkitemNumVGPRs = 0;
+ /// \brief Maximum byte alignment of variables used by the kernel in the
+ /// kernarg memory segment. Expressed as a power of two. Optional.
+ uint8_t mKernargSegmentAlign = 0;
+ /// \brief Maximum byte alignment of variables used by the kernel in the
+ /// group memory segment. Expressed as a power of two. Optional.
+ uint8_t mGroupSegmentAlign = 0;
+ /// \brief Maximum byte alignment of variables used by the kernel in the
+ /// private memory segment. Expressed as a power of two. Optional.
+ uint8_t mPrivateSegmentAlign = 0;
+ /// \brief Wavefront size. Expressed as a power of two. Optional.
+ uint8_t mWavefrontSize = 0;
+
+ /// \brief Default constructor.
+ Metadata() = default;
+
+ /// \returns True if kernel code properties metadata is empty, false
+ /// otherwise.
+ bool empty() const {
+ return !notEmpty();
+ }
+
+ /// \returns True if kernel code properties metadata is not empty, false
+ /// otherwise.
+ bool notEmpty() const {
+ return mKernargSegmentSize || mWorkgroupGroupSegmentSize ||
+ mWorkitemPrivateSegmentSize || mWavefrontNumSGPRs ||
+ mWorkitemNumVGPRs || mKernargSegmentAlign || mGroupSegmentAlign ||
+ mPrivateSegmentAlign || mWavefrontSize;
+ }
+};
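+
+// Note: assuming these fields mirror the amd_kernel_code_t fields they are
+// copied from (see MetadataStreamer::emitKernelCodeProps), "expressed as a
+// power of two" means the stored value is the exponent: e.g.
+// mKernargSegmentAlign == 4 denotes 16-byte alignment.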
+
+} // end namespace CodeProps
+
+//===----------------------------------------------------------------------===//
+// Kernel Debug Properties Metadata.
+//===----------------------------------------------------------------------===//
+namespace DebugProps {
+
+namespace Key {
+/// \brief Key for Kernel::DebugProps::Metadata::mDebuggerABIVersion.
+constexpr char DebuggerABIVersion[] = "DebuggerABIVersion";
+/// \brief Key for Kernel::DebugProps::Metadata::mReservedNumVGPRs.
+constexpr char ReservedNumVGPRs[] = "ReservedNumVGPRs";
+/// \brief Key for Kernel::DebugProps::Metadata::mReservedFirstVGPR.
+constexpr char ReservedFirstVGPR[] = "ReservedFirstVGPR";
+/// \brief Key for Kernel::DebugProps::Metadata::mPrivateSegmentBufferSGPR.
+constexpr char PrivateSegmentBufferSGPR[] = "PrivateSegmentBufferSGPR";
+/// \brief Key for
+/// Kernel::DebugProps::Metadata::mWavefrontPrivateSegmentOffsetSGPR.
+constexpr char WavefrontPrivateSegmentOffsetSGPR[] =
+ "WavefrontPrivateSegmentOffsetSGPR";
+} // end namespace Key
+
+/// \brief In-memory representation of kernel debug properties metadata.
+struct Metadata final {
+ /// \brief Debugger ABI version. Optional.
+ std::vector<uint32_t> mDebuggerABIVersion = std::vector<uint32_t>();
+  /// \brief Number of consecutive VGPRs reserved for debugger use. Must be 0
+  /// if mDebuggerABIVersion is not set. Optional.
+ uint16_t mReservedNumVGPRs = 0;
+  /// \brief First fixed VGPR reserved for debugger use. Must be uint16_t(-1)
+  /// if mDebuggerABIVersion is not set or mReservedNumVGPRs is 0. Optional.
+ uint16_t mReservedFirstVGPR = uint16_t(-1);
+  /// \brief Fixed SGPR of the first of 4 SGPRs used to hold the scratch V#
+  /// used for the entire kernel execution. Must be uint16_t(-1) if
+  /// mDebuggerABIVersion is not set or the SGPR is not used or not known.
+  /// Optional.
+ uint16_t mPrivateSegmentBufferSGPR = uint16_t(-1);
+ /// \brief Fixed SGPR used to hold the wave scratch offset for the entire
+ /// kernel execution. Must be uint16_t(-1) if mDebuggerABIVersion is not set
+  /// or the SGPR is not used or not known. Optional.
+ uint16_t mWavefrontPrivateSegmentOffsetSGPR = uint16_t(-1);
+
+ /// \brief Default constructor.
+ Metadata() = default;
+
+ /// \returns True if kernel debug properties metadata is empty, false
+ /// otherwise.
+ bool empty() const {
+ return !notEmpty();
+ }
+
+ /// \returns True if kernel debug properties metadata is not empty, false
+ /// otherwise.
+ bool notEmpty() const {
+ return !mDebuggerABIVersion.empty();
+ }
+};
+
+} // end namespace DebugProps
+
+namespace Key {
+/// \brief Key for Kernel::Metadata::mName.
+constexpr char Name[] = "Name";
+/// \brief Key for Kernel::Metadata::mLanguage.
+constexpr char Language[] = "Language";
+/// \brief Key for Kernel::Metadata::mLanguageVersion.
+constexpr char LanguageVersion[] = "LanguageVersion";
+/// \brief Key for Kernel::Metadata::mAttrs.
+constexpr char Attrs[] = "Attrs";
+/// \brief Key for Kernel::Metadata::mArgs.
+constexpr char Args[] = "Args";
+/// \brief Key for Kernel::Metadata::mCodeProps.
+constexpr char CodeProps[] = "CodeProps";
+/// \brief Key for Kernel::Metadata::mDebugProps.
+constexpr char DebugProps[] = "DebugProps";
+} // end namespace Key
+
+/// \brief In-memory representation of kernel metadata.
+struct Metadata final {
+ /// \brief Name. Required.
+ std::string mName = std::string();
+ /// \brief Language. Optional.
+ std::string mLanguage = std::string();
+ /// \brief Language version. Optional.
+ std::vector<uint32_t> mLanguageVersion = std::vector<uint32_t>();
+ /// \brief Attributes metadata. Optional.
+ Attrs::Metadata mAttrs = Attrs::Metadata();
+ /// \brief Arguments metadata. Optional.
+ std::vector<Arg::Metadata> mArgs = std::vector<Arg::Metadata>();
+ /// \brief Code properties metadata. Optional.
+ CodeProps::Metadata mCodeProps = CodeProps::Metadata();
+ /// \brief Debug properties metadata. Optional.
+ DebugProps::Metadata mDebugProps = DebugProps::Metadata();
+
+ /// \brief Default constructor.
+ Metadata() = default;
+};
+
+} // end namespace Kernel
+
+namespace Key {
+/// \brief Key for CodeObject::Metadata::mVersion.
+constexpr char Version[] = "Version";
+/// \brief Key for CodeObject::Metadata::mPrintf.
+constexpr char Printf[] = "Printf";
+/// \brief Key for CodeObject::Metadata::mKernels.
+constexpr char Kernels[] = "Kernels";
+} // end namespace Key
+
+/// \brief In-memory representation of code object metadata.
+struct Metadata final {
+ /// \brief Code object metadata version. Required.
+ std::vector<uint32_t> mVersion = std::vector<uint32_t>();
+ /// \brief Printf metadata. Optional.
+ std::vector<std::string> mPrintf = std::vector<std::string>();
+ /// \brief Kernels metadata. Optional.
+ std::vector<Kernel::Metadata> mKernels = std::vector<Kernel::Metadata>();
+
+ /// \brief Default constructor.
+ Metadata() = default;
+
+ /// \brief Converts \p YamlString to \p CodeObjectMetadata.
+ static std::error_code fromYamlString(std::string YamlString,
+ Metadata &CodeObjectMetadata);
+
+ /// \brief Converts \p CodeObjectMetadata to \p YamlString.
+ static std::error_code toYamlString(Metadata CodeObjectMetadata,
+ std::string &YamlString);
+};
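+
+// Minimal round-trip sketch (illustrative only; Yaml is a std::string assumed
+// to hold output of the MetadataStreamer):
+//
+//   CodeObject::Metadata MD;
+//   if (auto EC = CodeObject::Metadata::fromYamlString(Yaml, MD))
+//     return EC;                          // malformed metadata
+//   std::string RoundTripped;
+//   if (auto EC = CodeObject::Metadata::toYamlString(MD, RoundTripped))
+//     return EC;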
+
+} // end namespace CodeObject
+} // end namespace AMDGPU
+} // end namespace llvm
+
+#endif // LLVM_LIB_TARGET_AMDGPU_MCTARGETDESC_AMDGPUCODEOBJECTMETADATA_H
diff --git a/lib/Target/AMDGPU/MCTargetDesc/AMDGPUCodeObjectMetadataStreamer.cpp b/lib/Target/AMDGPU/MCTargetDesc/AMDGPUCodeObjectMetadataStreamer.cpp
new file mode 100644
index 000000000000..29a6ab9fbe93
--- /dev/null
+++ b/lib/Target/AMDGPU/MCTargetDesc/AMDGPUCodeObjectMetadataStreamer.cpp
@@ -0,0 +1,625 @@
+//===--- AMDGPUCodeObjectMetadataStreamer.cpp -------------------*- C++ -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+/// \file
+/// \brief AMDGPU Code Object Metadata Streamer.
+///
+//
+//===----------------------------------------------------------------------===//
+
+#include "AMDGPU.h"
+#include "AMDGPUCodeObjectMetadataStreamer.h"
+#include "llvm/ADT/StringSwitch.h"
+#include "llvm/IR/Constants.h"
+#include "llvm/IR/Module.h"
+#include "llvm/Support/YAMLTraits.h"
+
+using namespace llvm::AMDGPU;
+using namespace llvm::AMDGPU::CodeObject;
+
+LLVM_YAML_IS_FLOW_SEQUENCE_VECTOR(uint32_t)
+LLVM_YAML_IS_FLOW_SEQUENCE_VECTOR(std::string)
+LLVM_YAML_IS_SEQUENCE_VECTOR(Kernel::Arg::Metadata)
+LLVM_YAML_IS_SEQUENCE_VECTOR(Kernel::Metadata)
+
+namespace llvm {
+
+static cl::opt<bool> DumpCodeObjectMetadata(
+ "amdgpu-dump-comd",
+ cl::desc("Dump AMDGPU Code Object Metadata"));
+static cl::opt<bool> VerifyCodeObjectMetadata(
+ "amdgpu-verify-comd",
+ cl::desc("Verify AMDGPU Code Object Metadata"));
+
+namespace yaml {
+
+template <>
+struct ScalarEnumerationTraits<AccessQualifier> {
+ static void enumeration(IO &YIO, AccessQualifier &EN) {
+ YIO.enumCase(EN, "Default", AccessQualifier::Default);
+ YIO.enumCase(EN, "ReadOnly", AccessQualifier::ReadOnly);
+ YIO.enumCase(EN, "WriteOnly", AccessQualifier::WriteOnly);
+ YIO.enumCase(EN, "ReadWrite", AccessQualifier::ReadWrite);
+ }
+};
+
+template <>
+struct ScalarEnumerationTraits<AddressSpaceQualifier> {
+ static void enumeration(IO &YIO, AddressSpaceQualifier &EN) {
+ YIO.enumCase(EN, "Private", AddressSpaceQualifier::Private);
+ YIO.enumCase(EN, "Global", AddressSpaceQualifier::Global);
+ YIO.enumCase(EN, "Constant", AddressSpaceQualifier::Constant);
+ YIO.enumCase(EN, "Local", AddressSpaceQualifier::Local);
+ YIO.enumCase(EN, "Generic", AddressSpaceQualifier::Generic);
+ YIO.enumCase(EN, "Region", AddressSpaceQualifier::Region);
+ }
+};
+
+template <>
+struct ScalarEnumerationTraits<ValueKind> {
+ static void enumeration(IO &YIO, ValueKind &EN) {
+ YIO.enumCase(EN, "ByValue", ValueKind::ByValue);
+ YIO.enumCase(EN, "GlobalBuffer", ValueKind::GlobalBuffer);
+ YIO.enumCase(EN, "DynamicSharedPointer", ValueKind::DynamicSharedPointer);
+ YIO.enumCase(EN, "Sampler", ValueKind::Sampler);
+ YIO.enumCase(EN, "Image", ValueKind::Image);
+ YIO.enumCase(EN, "Pipe", ValueKind::Pipe);
+ YIO.enumCase(EN, "Queue", ValueKind::Queue);
+ YIO.enumCase(EN, "HiddenGlobalOffsetX", ValueKind::HiddenGlobalOffsetX);
+ YIO.enumCase(EN, "HiddenGlobalOffsetY", ValueKind::HiddenGlobalOffsetY);
+ YIO.enumCase(EN, "HiddenGlobalOffsetZ", ValueKind::HiddenGlobalOffsetZ);
+ YIO.enumCase(EN, "HiddenNone", ValueKind::HiddenNone);
+ YIO.enumCase(EN, "HiddenPrintfBuffer", ValueKind::HiddenPrintfBuffer);
+ YIO.enumCase(EN, "HiddenDefaultQueue", ValueKind::HiddenDefaultQueue);
+ YIO.enumCase(EN, "HiddenCompletionAction",
+ ValueKind::HiddenCompletionAction);
+ }
+};
+
+template <>
+struct ScalarEnumerationTraits<ValueType> {
+ static void enumeration(IO &YIO, ValueType &EN) {
+ YIO.enumCase(EN, "Struct", ValueType::Struct);
+ YIO.enumCase(EN, "I8", ValueType::I8);
+ YIO.enumCase(EN, "U8", ValueType::U8);
+ YIO.enumCase(EN, "I16", ValueType::I16);
+ YIO.enumCase(EN, "U16", ValueType::U16);
+ YIO.enumCase(EN, "F16", ValueType::F16);
+ YIO.enumCase(EN, "I32", ValueType::I32);
+ YIO.enumCase(EN, "U32", ValueType::U32);
+ YIO.enumCase(EN, "F32", ValueType::F32);
+ YIO.enumCase(EN, "I64", ValueType::I64);
+ YIO.enumCase(EN, "U64", ValueType::U64);
+ YIO.enumCase(EN, "F64", ValueType::F64);
+ }
+};
+
+template <>
+struct MappingTraits<Kernel::Attrs::Metadata> {
+ static void mapping(IO &YIO, Kernel::Attrs::Metadata &MD) {
+ YIO.mapOptional(Kernel::Attrs::Key::ReqdWorkGroupSize,
+ MD.mReqdWorkGroupSize, std::vector<uint32_t>());
+ YIO.mapOptional(Kernel::Attrs::Key::WorkGroupSizeHint,
+ MD.mWorkGroupSizeHint, std::vector<uint32_t>());
+ YIO.mapOptional(Kernel::Attrs::Key::VecTypeHint,
+ MD.mVecTypeHint, std::string());
+ }
+};
+
+template <>
+struct MappingTraits<Kernel::Arg::Metadata> {
+ static void mapping(IO &YIO, Kernel::Arg::Metadata &MD) {
+ YIO.mapRequired(Kernel::Arg::Key::Size, MD.mSize);
+ YIO.mapRequired(Kernel::Arg::Key::Align, MD.mAlign);
+ YIO.mapRequired(Kernel::Arg::Key::ValueKind, MD.mValueKind);
+ YIO.mapRequired(Kernel::Arg::Key::ValueType, MD.mValueType);
+ YIO.mapOptional(Kernel::Arg::Key::PointeeAlign, MD.mPointeeAlign,
+ uint32_t(0));
+ YIO.mapOptional(Kernel::Arg::Key::AccQual, MD.mAccQual,
+ AccessQualifier::Unknown);
+ YIO.mapOptional(Kernel::Arg::Key::AddrSpaceQual, MD.mAddrSpaceQual,
+ AddressSpaceQualifier::Unknown);
+ YIO.mapOptional(Kernel::Arg::Key::IsConst, MD.mIsConst, false);
+ YIO.mapOptional(Kernel::Arg::Key::IsPipe, MD.mIsPipe, false);
+ YIO.mapOptional(Kernel::Arg::Key::IsRestrict, MD.mIsRestrict, false);
+ YIO.mapOptional(Kernel::Arg::Key::IsVolatile, MD.mIsVolatile, false);
+ YIO.mapOptional(Kernel::Arg::Key::Name, MD.mName, std::string());
+ YIO.mapOptional(Kernel::Arg::Key::TypeName, MD.mTypeName, std::string());
+ }
+};
+
+template <>
+struct MappingTraits<Kernel::CodeProps::Metadata> {
+ static void mapping(IO &YIO, Kernel::CodeProps::Metadata &MD) {
+ YIO.mapOptional(Kernel::CodeProps::Key::KernargSegmentSize,
+ MD.mKernargSegmentSize, uint64_t(0));
+ YIO.mapOptional(Kernel::CodeProps::Key::WorkgroupGroupSegmentSize,
+ MD.mWorkgroupGroupSegmentSize, uint32_t(0));
+ YIO.mapOptional(Kernel::CodeProps::Key::WorkitemPrivateSegmentSize,
+ MD.mWorkitemPrivateSegmentSize, uint32_t(0));
+ YIO.mapOptional(Kernel::CodeProps::Key::WavefrontNumSGPRs,
+ MD.mWavefrontNumSGPRs, uint16_t(0));
+ YIO.mapOptional(Kernel::CodeProps::Key::WorkitemNumVGPRs,
+ MD.mWorkitemNumVGPRs, uint16_t(0));
+ YIO.mapOptional(Kernel::CodeProps::Key::KernargSegmentAlign,
+ MD.mKernargSegmentAlign, uint8_t(0));
+ YIO.mapOptional(Kernel::CodeProps::Key::GroupSegmentAlign,
+ MD.mGroupSegmentAlign, uint8_t(0));
+ YIO.mapOptional(Kernel::CodeProps::Key::PrivateSegmentAlign,
+ MD.mPrivateSegmentAlign, uint8_t(0));
+ YIO.mapOptional(Kernel::CodeProps::Key::WavefrontSize,
+ MD.mWavefrontSize, uint8_t(0));
+ }
+};
+
+template <>
+struct MappingTraits<Kernel::DebugProps::Metadata> {
+ static void mapping(IO &YIO, Kernel::DebugProps::Metadata &MD) {
+ YIO.mapOptional(Kernel::DebugProps::Key::DebuggerABIVersion,
+ MD.mDebuggerABIVersion, std::vector<uint32_t>());
+ YIO.mapOptional(Kernel::DebugProps::Key::ReservedNumVGPRs,
+ MD.mReservedNumVGPRs, uint16_t(0));
+ YIO.mapOptional(Kernel::DebugProps::Key::ReservedFirstVGPR,
+ MD.mReservedFirstVGPR, uint16_t(-1));
+ YIO.mapOptional(Kernel::DebugProps::Key::PrivateSegmentBufferSGPR,
+ MD.mPrivateSegmentBufferSGPR, uint16_t(-1));
+ YIO.mapOptional(Kernel::DebugProps::Key::WavefrontPrivateSegmentOffsetSGPR,
+ MD.mWavefrontPrivateSegmentOffsetSGPR, uint16_t(-1));
+ }
+};
+
+template <>
+struct MappingTraits<Kernel::Metadata> {
+ static void mapping(IO &YIO, Kernel::Metadata &MD) {
+ YIO.mapRequired(Kernel::Key::Name, MD.mName);
+ YIO.mapOptional(Kernel::Key::Language, MD.mLanguage, std::string());
+ YIO.mapOptional(Kernel::Key::LanguageVersion, MD.mLanguageVersion,
+ std::vector<uint32_t>());
+ if (!MD.mAttrs.empty() || !YIO.outputting())
+ YIO.mapOptional(Kernel::Key::Attrs, MD.mAttrs);
+ if (!MD.mArgs.empty() || !YIO.outputting())
+ YIO.mapOptional(Kernel::Key::Args, MD.mArgs);
+ if (!MD.mCodeProps.empty() || !YIO.outputting())
+ YIO.mapOptional(Kernel::Key::CodeProps, MD.mCodeProps);
+ if (!MD.mDebugProps.empty() || !YIO.outputting())
+ YIO.mapOptional(Kernel::Key::DebugProps, MD.mDebugProps);
+ }
+};
+
+template <>
+struct MappingTraits<CodeObject::Metadata> {
+ static void mapping(IO &YIO, CodeObject::Metadata &MD) {
+ YIO.mapRequired(Key::Version, MD.mVersion);
+ YIO.mapOptional(Key::Printf, MD.mPrintf, std::vector<std::string>());
+ if (!MD.mKernels.empty() || !YIO.outputting())
+ YIO.mapOptional(Key::Kernels, MD.mKernels);
+ }
+};
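+
+// With the traits above, the serialized document takes this general shape
+// (all values illustrative):
+//
+//   ---
+//   Version:         [ 1, 0 ]
+//   Kernels:
+//     - Name:            example_kernel
+//       Language:        OpenCL C
+//       LanguageVersion: [ 2, 0 ]
+//       Args:
+//         - Size:          4
+//           Align:         4
+//           ValueKind:     ByValue
+//           ValueType:     I32
+//   ...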
+
+} // end namespace yaml
+
+namespace AMDGPU {
+
+/* static */
+std::error_code CodeObject::Metadata::fromYamlString(
+ std::string YamlString, CodeObject::Metadata &CodeObjectMetadata) {
+ yaml::Input YamlInput(YamlString);
+ YamlInput >> CodeObjectMetadata;
+ return YamlInput.error();
+}
+
+/* static */
+std::error_code CodeObject::Metadata::toYamlString(
+ CodeObject::Metadata CodeObjectMetadata, std::string &YamlString) {
+ raw_string_ostream YamlStream(YamlString);
+ yaml::Output YamlOutput(YamlStream, nullptr, std::numeric_limits<int>::max());
+ YamlOutput << CodeObjectMetadata;
+ return std::error_code();
+}
+
+namespace CodeObject {
+
+void MetadataStreamer::dump(StringRef YamlString) const {
+ errs() << "AMDGPU Code Object Metadata:\n" << YamlString << '\n';
+}
+
+void MetadataStreamer::verify(StringRef YamlString) const {
+ errs() << "AMDGPU Code Object Metadata Parser Test: ";
+
+ CodeObject::Metadata FromYamlString;
+ if (Metadata::fromYamlString(YamlString, FromYamlString)) {
+ errs() << "FAIL\n";
+ return;
+ }
+
+ std::string ToYamlString;
+ if (Metadata::toYamlString(FromYamlString, ToYamlString)) {
+ errs() << "FAIL\n";
+ return;
+ }
+
+ errs() << (YamlString == ToYamlString ? "PASS" : "FAIL") << '\n';
+ if (YamlString != ToYamlString) {
+ errs() << "Original input: " << YamlString << '\n'
+ << "Produced output: " << ToYamlString << '\n';
+ }
+}
+
+AccessQualifier MetadataStreamer::getAccessQualifier(StringRef AccQual) const {
+ if (AccQual.empty())
+ return AccessQualifier::Unknown;
+
+ return StringSwitch<AccessQualifier>(AccQual)
+ .Case("read_only", AccessQualifier::ReadOnly)
+ .Case("write_only", AccessQualifier::WriteOnly)
+ .Case("read_write", AccessQualifier::ReadWrite)
+ .Default(AccessQualifier::Default);
+}
+
+AddressSpaceQualifier MetadataStreamer::getAddressSpaceQualifier(
+ unsigned AddressSpace) const {
+ if (AddressSpace == AMDGPUASI.PRIVATE_ADDRESS)
+ return AddressSpaceQualifier::Private;
+ if (AddressSpace == AMDGPUASI.GLOBAL_ADDRESS)
+ return AddressSpaceQualifier::Global;
+ if (AddressSpace == AMDGPUASI.CONSTANT_ADDRESS)
+ return AddressSpaceQualifier::Constant;
+ if (AddressSpace == AMDGPUASI.LOCAL_ADDRESS)
+ return AddressSpaceQualifier::Local;
+ if (AddressSpace == AMDGPUASI.FLAT_ADDRESS)
+ return AddressSpaceQualifier::Generic;
+ if (AddressSpace == AMDGPUASI.REGION_ADDRESS)
+ return AddressSpaceQualifier::Region;
+
+ llvm_unreachable("Unknown address space qualifier");
+}
+
+ValueKind MetadataStreamer::getValueKind(Type *Ty, StringRef TypeQual,
+ StringRef BaseTypeName) const {
+ if (TypeQual.find("pipe") != StringRef::npos)
+ return ValueKind::Pipe;
+
+ return StringSwitch<ValueKind>(BaseTypeName)
+ .Case("sampler_t", ValueKind::Sampler)
+ .Case("queue_t", ValueKind::Queue)
+ .Cases("image1d_t",
+ "image1d_array_t",
+ "image1d_buffer_t",
+ "image2d_t" ,
+ "image2d_array_t",
+ "image2d_array_depth_t",
+ "image2d_array_msaa_t"
+ "image2d_array_msaa_depth_t"
+ "image2d_depth_t",
+ "image2d_msaa_t",
+ "image2d_msaa_depth_t",
+ "image3d_t", ValueKind::Image)
+ .Default(isa<PointerType>(Ty) ?
+ (Ty->getPointerAddressSpace() ==
+ AMDGPUASI.LOCAL_ADDRESS ?
+ ValueKind::DynamicSharedPointer :
+ ValueKind::GlobalBuffer) :
+ ValueKind::ByValue);
+}
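+
+// For example: an "image2d_t" base type maps to ValueKind::Image, a pointer
+// into the local address space to ValueKind::DynamicSharedPointer, any other
+// pointer to ValueKind::GlobalBuffer, and a plain "int" to ValueKind::ByValue.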
+
+ValueType MetadataStreamer::getValueType(Type *Ty, StringRef TypeName) const {
+ switch (Ty->getTypeID()) {
+ case Type::IntegerTyID: {
+ auto Signed = !TypeName.startswith("u");
+ switch (Ty->getIntegerBitWidth()) {
+ case 8:
+ return Signed ? ValueType::I8 : ValueType::U8;
+ case 16:
+ return Signed ? ValueType::I16 : ValueType::U16;
+ case 32:
+ return Signed ? ValueType::I32 : ValueType::U32;
+ case 64:
+ return Signed ? ValueType::I64 : ValueType::U64;
+ default:
+ return ValueType::Struct;
+ }
+ }
+ case Type::HalfTyID:
+ return ValueType::F16;
+ case Type::FloatTyID:
+ return ValueType::F32;
+ case Type::DoubleTyID:
+ return ValueType::F64;
+ case Type::PointerTyID:
+ return getValueType(Ty->getPointerElementType(), TypeName);
+ case Type::VectorTyID:
+ return getValueType(Ty->getVectorElementType(), TypeName);
+ default:
+ return ValueType::Struct;
+ }
+}
+
+std::string MetadataStreamer::getTypeName(Type *Ty, bool Signed) const {
+ switch (Ty->getTypeID()) {
+ case Type::IntegerTyID: {
+ if (!Signed)
+ return (Twine('u') + getTypeName(Ty, true)).str();
+
+ auto BitWidth = Ty->getIntegerBitWidth();
+ switch (BitWidth) {
+ case 8:
+ return "char";
+ case 16:
+ return "short";
+ case 32:
+ return "int";
+ case 64:
+ return "long";
+ default:
+ return (Twine('i') + Twine(BitWidth)).str();
+ }
+ }
+ case Type::HalfTyID:
+ return "half";
+ case Type::FloatTyID:
+ return "float";
+ case Type::DoubleTyID:
+ return "double";
+ case Type::VectorTyID: {
+ auto VecTy = cast<VectorType>(Ty);
+ auto ElTy = VecTy->getElementType();
+ auto NumElements = VecTy->getVectorNumElements();
+ return (Twine(getTypeName(ElTy, Signed)) + Twine(NumElements)).str();
+ }
+ default:
+ return "unknown";
+ }
+}
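+
+// For example: a <4 x i32> vector yields "int4" when Signed is true and
+// "uint4" otherwise, while a non-standard width such as i48 falls back to
+// "i48".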
+
+std::vector<uint32_t> MetadataStreamer::getWorkGroupDimensions(
+ MDNode *Node) const {
+ std::vector<uint32_t> Dims;
+ if (Node->getNumOperands() != 3)
+ return Dims;
+
+ for (auto &Op : Node->operands())
+ Dims.push_back(mdconst::extract<ConstantInt>(Op)->getZExtValue());
+ return Dims;
+}
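+
+// The node consumed here is the frontend-emitted form, e.g. (illustrative):
+//   !{i32 256, i32 1, i32 1}
+// for __attribute__((reqd_work_group_size(256, 1, 1))); nodes without exactly
+// three operands yield an empty vector.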
+
+void MetadataStreamer::emitVersion() {
+ auto &Version = CodeObjectMetadata.mVersion;
+
+ Version.push_back(MetadataVersionMajor);
+ Version.push_back(MetadataVersionMinor);
+}
+
+void MetadataStreamer::emitPrintf(const Module &Mod) {
+ auto &Printf = CodeObjectMetadata.mPrintf;
+
+ auto Node = Mod.getNamedMetadata("llvm.printf.fmts");
+ if (!Node)
+ return;
+
+ for (auto Op : Node->operands())
+ if (Op->getNumOperands())
+ Printf.push_back(cast<MDString>(Op->getOperand(0))->getString());
+}
+
+void MetadataStreamer::emitKernelLanguage(const Function &Func) {
+ auto &Kernel = CodeObjectMetadata.mKernels.back();
+
+ // TODO: What about other languages?
+ auto Node = Func.getParent()->getNamedMetadata("opencl.ocl.version");
+ if (!Node || !Node->getNumOperands())
+ return;
+ auto Op0 = Node->getOperand(0);
+ if (Op0->getNumOperands() <= 1)
+ return;
+
+ Kernel.mLanguage = "OpenCL C";
+ Kernel.mLanguageVersion.push_back(
+ mdconst::extract<ConstantInt>(Op0->getOperand(0))->getZExtValue());
+ Kernel.mLanguageVersion.push_back(
+ mdconst::extract<ConstantInt>(Op0->getOperand(1))->getZExtValue());
+}
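+
+// The module-level metadata consumed above looks like (illustrative):
+//   !opencl.ocl.version = !{!0}
+//   !0 = !{i32 2, i32 0}  ; yields mLanguageVersion == [2, 0]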
+
+void MetadataStreamer::emitKernelAttrs(const Function &Func) {
+ auto &Attrs = CodeObjectMetadata.mKernels.back().mAttrs;
+
+ if (auto Node = Func.getMetadata("reqd_work_group_size"))
+ Attrs.mReqdWorkGroupSize = getWorkGroupDimensions(Node);
+ if (auto Node = Func.getMetadata("work_group_size_hint"))
+ Attrs.mWorkGroupSizeHint = getWorkGroupDimensions(Node);
+ if (auto Node = Func.getMetadata("vec_type_hint")) {
+ Attrs.mVecTypeHint = getTypeName(
+ cast<ValueAsMetadata>(Node->getOperand(0))->getType(),
+ mdconst::extract<ConstantInt>(Node->getOperand(1))->getZExtValue());
+ }
+}
+
+void MetadataStreamer::emitKernelArgs(const Function &Func) {
+ for (auto &Arg : Func.args())
+ emitKernelArg(Arg);
+
+ // TODO: What about other languages?
+ if (!Func.getParent()->getNamedMetadata("opencl.ocl.version"))
+ return;
+
+ auto &DL = Func.getParent()->getDataLayout();
+ auto Int64Ty = Type::getInt64Ty(Func.getContext());
+
+ emitKernelArg(DL, Int64Ty, ValueKind::HiddenGlobalOffsetX);
+ emitKernelArg(DL, Int64Ty, ValueKind::HiddenGlobalOffsetY);
+ emitKernelArg(DL, Int64Ty, ValueKind::HiddenGlobalOffsetZ);
+
+ if (!Func.getParent()->getNamedMetadata("llvm.printf.fmts"))
+ return;
+
+ auto Int8PtrTy = Type::getInt8PtrTy(Func.getContext(),
+ AMDGPUASI.GLOBAL_ADDRESS);
+ emitKernelArg(DL, Int8PtrTy, ValueKind::HiddenPrintfBuffer);
+}
+
+void MetadataStreamer::emitKernelArg(const Argument &Arg) {
+ auto Func = Arg.getParent();
+ auto ArgNo = Arg.getArgNo();
+ const MDNode *Node;
+
+ StringRef TypeQual;
+ Node = Func->getMetadata("kernel_arg_type_qual");
+ if (Node && ArgNo < Node->getNumOperands())
+ TypeQual = cast<MDString>(Node->getOperand(ArgNo))->getString();
+
+ StringRef BaseTypeName;
+ Node = Func->getMetadata("kernel_arg_base_type");
+ if (Node && ArgNo < Node->getNumOperands())
+ BaseTypeName = cast<MDString>(Node->getOperand(ArgNo))->getString();
+
+ StringRef AccQual;
+ if (Arg.getType()->isPointerTy() && Arg.onlyReadsMemory() &&
+ Arg.hasNoAliasAttr()) {
+ AccQual = "read_only";
+ } else {
+ Node = Func->getMetadata("kernel_arg_access_qual");
+ if (Node && ArgNo < Node->getNumOperands())
+ AccQual = cast<MDString>(Node->getOperand(ArgNo))->getString();
+ }
+
+ StringRef Name;
+ Node = Func->getMetadata("kernel_arg_name");
+ if (Node && ArgNo < Node->getNumOperands())
+ Name = cast<MDString>(Node->getOperand(ArgNo))->getString();
+
+ StringRef TypeName;
+ Node = Func->getMetadata("kernel_arg_type");
+ if (Node && ArgNo < Node->getNumOperands())
+ TypeName = cast<MDString>(Node->getOperand(ArgNo))->getString();
+
+ emitKernelArg(Func->getParent()->getDataLayout(), Arg.getType(),
+ getValueKind(Arg.getType(), TypeQual, BaseTypeName), TypeQual,
+ BaseTypeName, AccQual, Name, TypeName);
+}
+
+void MetadataStreamer::emitKernelArg(const DataLayout &DL, Type *Ty,
+ ValueKind ValueKind, StringRef TypeQual,
+ StringRef BaseTypeName, StringRef AccQual,
+ StringRef Name, StringRef TypeName) {
+ CodeObjectMetadata.mKernels.back().mArgs.push_back(Kernel::Arg::Metadata());
+ auto &Arg = CodeObjectMetadata.mKernels.back().mArgs.back();
+
+ Arg.mSize = DL.getTypeAllocSize(Ty);
+ Arg.mAlign = DL.getABITypeAlignment(Ty);
+ Arg.mValueKind = ValueKind;
+ Arg.mValueType = getValueType(Ty, BaseTypeName);
+
+ if (auto PtrTy = dyn_cast<PointerType>(Ty)) {
+ auto ElTy = PtrTy->getElementType();
+ if (PtrTy->getAddressSpace() == AMDGPUASI.LOCAL_ADDRESS && ElTy->isSized())
+ Arg.mPointeeAlign = DL.getABITypeAlignment(ElTy);
+ }
+
+ Arg.mAccQual = getAccessQualifier(AccQual);
+
+ if (auto PtrTy = dyn_cast<PointerType>(Ty))
+    Arg.mAddrSpaceQual = getAddressSpaceQualifier(PtrTy->getAddressSpace());
+
+ SmallVector<StringRef, 1> SplitTypeQuals;
+ TypeQual.split(SplitTypeQuals, " ", -1, false);
+ for (StringRef Key : SplitTypeQuals) {
+ auto P = StringSwitch<bool*>(Key)
+ .Case("const", &Arg.mIsConst)
+ .Case("pipe", &Arg.mIsPipe)
+ .Case("restrict", &Arg.mIsRestrict)
+ .Case("volatile", &Arg.mIsVolatile)
+ .Default(nullptr);
+ if (P)
+ *P = true;
+ }
+
+ Arg.mName = Name;
+ Arg.mTypeName = TypeName;
+}
+
+void MetadataStreamer::emitKernelCodeProps(
+ const amd_kernel_code_t &KernelCode) {
+ auto &CodeProps = CodeObjectMetadata.mKernels.back().mCodeProps;
+
+ CodeProps.mKernargSegmentSize = KernelCode.kernarg_segment_byte_size;
+ CodeProps.mWorkgroupGroupSegmentSize =
+ KernelCode.workgroup_group_segment_byte_size;
+ CodeProps.mWorkitemPrivateSegmentSize =
+ KernelCode.workitem_private_segment_byte_size;
+ CodeProps.mWavefrontNumSGPRs = KernelCode.wavefront_sgpr_count;
+ CodeProps.mWorkitemNumVGPRs = KernelCode.workitem_vgpr_count;
+ CodeProps.mKernargSegmentAlign = KernelCode.kernarg_segment_alignment;
+ CodeProps.mGroupSegmentAlign = KernelCode.group_segment_alignment;
+ CodeProps.mPrivateSegmentAlign = KernelCode.private_segment_alignment;
+ CodeProps.mWavefrontSize = KernelCode.wavefront_size;
+}
+
+void MetadataStreamer::emitKernelDebugProps(
+ const amd_kernel_code_t &KernelCode) {
+ if (!(KernelCode.code_properties & AMD_CODE_PROPERTY_IS_DEBUG_SUPPORTED))
+ return;
+
+ auto &DebugProps = CodeObjectMetadata.mKernels.back().mDebugProps;
+
+ // FIXME: Need to pass down debugger ABI version through features. This is ok
+ // for now because we only have one version.
+ DebugProps.mDebuggerABIVersion.push_back(1);
+ DebugProps.mDebuggerABIVersion.push_back(0);
+ DebugProps.mReservedNumVGPRs = KernelCode.reserved_vgpr_count;
+ DebugProps.mReservedFirstVGPR = KernelCode.reserved_vgpr_first;
+ DebugProps.mPrivateSegmentBufferSGPR =
+ KernelCode.debug_private_segment_buffer_sgpr;
+ DebugProps.mWavefrontPrivateSegmentOffsetSGPR =
+ KernelCode.debug_wavefront_private_segment_offset_sgpr;
+}
+
+void MetadataStreamer::begin(const Module &Mod) {
+ AMDGPUASI = getAMDGPUAS(Mod);
+ emitVersion();
+ emitPrintf(Mod);
+}
+
+void MetadataStreamer::emitKernel(const Function &Func,
+ const amd_kernel_code_t &KernelCode) {
+ if (Func.getCallingConv() != CallingConv::AMDGPU_KERNEL)
+ return;
+
+ CodeObjectMetadata.mKernels.push_back(Kernel::Metadata());
+ auto &Kernel = CodeObjectMetadata.mKernels.back();
+
+ Kernel.mName = Func.getName();
+ emitKernelLanguage(Func);
+ emitKernelAttrs(Func);
+ emitKernelArgs(Func);
+ emitKernelCodeProps(KernelCode);
+ emitKernelDebugProps(KernelCode);
+}
+
+ErrorOr<std::string> MetadataStreamer::toYamlString() {
+ std::string YamlString;
+ if (auto Error = Metadata::toYamlString(CodeObjectMetadata, YamlString))
+ return Error;
+
+ if (DumpCodeObjectMetadata)
+ dump(YamlString);
+ if (VerifyCodeObjectMetadata)
+ verify(YamlString);
+
+ return YamlString;
+}
+
+ErrorOr<std::string> MetadataStreamer::toYamlString(StringRef YamlString) {
+ if (auto Error = Metadata::fromYamlString(YamlString, CodeObjectMetadata))
+ return Error;
+
+ return toYamlString();
+}
+
+} // end namespace CodeObject
+} // end namespace AMDGPU
+} // end namespace llvm
diff --git a/lib/Target/AMDGPU/MCTargetDesc/AMDGPUCodeObjectMetadataStreamer.h b/lib/Target/AMDGPU/MCTargetDesc/AMDGPUCodeObjectMetadataStreamer.h
new file mode 100644
index 000000000000..8d4c51763f63
--- /dev/null
+++ b/lib/Target/AMDGPU/MCTargetDesc/AMDGPUCodeObjectMetadataStreamer.h
@@ -0,0 +1,99 @@
+//===--- AMDGPUCodeObjectMetadataStreamer.h ---------------------*- C++ -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+/// \file
+/// \brief AMDGPU Code Object Metadata Streamer.
+///
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_TARGET_AMDGPU_MCTARGETDESC_AMDGPUCODEOBJECTMETADATASTREAMER_H
+#define LLVM_LIB_TARGET_AMDGPU_MCTARGETDESC_AMDGPUCODEOBJECTMETADATASTREAMER_H
+
+#include "AMDGPU.h"
+#include "AMDGPUCodeObjectMetadata.h"
+#include "AMDKernelCodeT.h"
+#include "llvm/ADT/StringRef.h"
+#include "llvm/Support/ErrorOr.h"
+
+namespace llvm {
+
+class Argument;
+class DataLayout;
+class Function;
+class MDNode;
+class Module;
+class Type;
+
+namespace AMDGPU {
+namespace CodeObject {
+
+class MetadataStreamer final {
+private:
+ Metadata CodeObjectMetadata;
+ AMDGPUAS AMDGPUASI;
+
+ void dump(StringRef YamlString) const;
+
+ void verify(StringRef YamlString) const;
+
+ AccessQualifier getAccessQualifier(StringRef AccQual) const;
+
+  AddressSpaceQualifier getAddressSpaceQualifier(unsigned AddressSpace) const;
+
+ ValueKind getValueKind(Type *Ty, StringRef TypeQual,
+ StringRef BaseTypeName) const;
+
+ ValueType getValueType(Type *Ty, StringRef TypeName) const;
+
+ std::string getTypeName(Type *Ty, bool Signed) const;
+
+ std::vector<uint32_t> getWorkGroupDimensions(MDNode *Node) const;
+
+ void emitVersion();
+
+ void emitPrintf(const Module &Mod);
+
+ void emitKernelLanguage(const Function &Func);
+
+ void emitKernelAttrs(const Function &Func);
+
+ void emitKernelArgs(const Function &Func);
+
+ void emitKernelArg(const Argument &Arg);
+
+ void emitKernelArg(const DataLayout &DL, Type *Ty, ValueKind ValueKind,
+ StringRef TypeQual = "", StringRef BaseTypeName = "",
+ StringRef AccQual = "", StringRef Name = "",
+ StringRef TypeName = "");
+
+ void emitKernelCodeProps(const amd_kernel_code_t &KernelCode);
+
+ void emitKernelDebugProps(const amd_kernel_code_t &KernelCode);
+
+public:
+ MetadataStreamer() = default;
+ ~MetadataStreamer() = default;
+
+ void begin(const Module &Mod);
+
+ void end() {}
+
+ void emitKernel(const Function &Func, const amd_kernel_code_t &KernelCode);
+
+ ErrorOr<std::string> toYamlString();
+
+ ErrorOr<std::string> toYamlString(StringRef YamlString);
+};
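+
+// Typical driving sequence, as used by AMDGPUTargetStreamer (sketch only):
+//
+//   MetadataStreamer MS;
+//   MS.begin(Mod);                    // emits version and printf metadata
+//   MS.emitKernel(Func, KernelCode);  // once per kernel function
+//   MS.end();
+//   ErrorOr<std::string> Yaml = MS.toYamlString();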
+
+} // end namespace CodeObject
+} // end namespace AMDGPU
+} // end namespace llvm
+
+#endif // LLVM_LIB_TARGET_AMDGPU_MCTARGETDESC_AMDGPUCODEOBJECTMETADATASTREAMER_H
diff --git a/lib/Target/AMDGPU/MCTargetDesc/AMDGPUELFObjectWriter.cpp b/lib/Target/AMDGPU/MCTargetDesc/AMDGPUELFObjectWriter.cpp
index 1847d7a67328..073d19422e86 100644
--- a/lib/Target/AMDGPU/MCTargetDesc/AMDGPUELFObjectWriter.cpp
+++ b/lib/Target/AMDGPU/MCTargetDesc/AMDGPUELFObjectWriter.cpp
@@ -1,16 +1,20 @@
-//===-- AMDGPUELFObjectWriter.cpp - AMDGPU ELF Writer ----------------------==//
+//===- AMDGPUELFObjectWriter.cpp - AMDGPU ELF Writer ----------------------===//
//
// The LLVM Compiler Infrastructure
//
// This file is distributed under the University of Illinois Open Source
// License. See LICENSE.TXT for details.
//
-/// \file
//===----------------------------------------------------------------------===//
#include "AMDGPUMCTargetDesc.h"
#include "llvm/MC/MCELFObjectWriter.h"
+#include "llvm/MC/MCExpr.h"
#include "llvm/MC/MCFixup.h"
+#include "llvm/MC/MCSymbol.h"
+#include "llvm/MC/MCValue.h"
+#include "llvm/Support/ELF.h"
+#include "llvm/Support/ErrorHandling.h"
using namespace llvm;
@@ -19,20 +23,21 @@ namespace {
class AMDGPUELFObjectWriter : public MCELFObjectTargetWriter {
public:
AMDGPUELFObjectWriter(bool Is64Bit, bool HasRelocationAddend);
+
protected:
unsigned getRelocType(MCContext &Ctx, const MCValue &Target,
const MCFixup &Fixup, bool IsPCRel) const override;
};
-} // End anonymous namespace
+} // end anonymous namespace
AMDGPUELFObjectWriter::AMDGPUELFObjectWriter(bool Is64Bit,
bool HasRelocationAddend)
: MCELFObjectTargetWriter(Is64Bit,
ELF::ELFOSABI_AMDGPU_HSA,
ELF::EM_AMDGPU,
- HasRelocationAddend) { }
+ HasRelocationAddend) {}
unsigned AMDGPUELFObjectWriter::getRelocType(MCContext &Ctx,
const MCValue &Target,
@@ -77,7 +82,6 @@ unsigned AMDGPUELFObjectWriter::getRelocType(MCContext &Ctx,
llvm_unreachable("unhandled relocation type");
}
-
MCObjectWriter *llvm::createAMDGPUELFObjectWriter(bool Is64Bit,
bool HasRelocationAddend,
raw_pwrite_stream &OS) {
diff --git a/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCTargetDesc.h b/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCTargetDesc.h
index 548bad56e174..f80b5f3a6dba 100644
--- a/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCTargetDesc.h
+++ b/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCTargetDesc.h
@@ -54,11 +54,17 @@ MCObjectWriter *createAMDGPUELFObjectWriter(bool Is64Bit,
#define GET_REGINFO_ENUM
#include "AMDGPUGenRegisterInfo.inc"
+#undef GET_REGINFO_ENUM
#define GET_INSTRINFO_ENUM
+#define GET_INSTRINFO_OPERAND_ENUM
#include "AMDGPUGenInstrInfo.inc"
+#undef GET_INSTRINFO_OPERAND_ENUM
+#undef GET_INSTRINFO_ENUM
+
#define GET_SUBTARGETINFO_ENUM
#include "AMDGPUGenSubtargetInfo.inc"
+#undef GET_SUBTARGETINFO_ENUM
#endif
diff --git a/lib/Target/AMDGPU/MCTargetDesc/AMDGPURuntimeMD.cpp b/lib/Target/AMDGPU/MCTargetDesc/AMDGPURuntimeMD.cpp
deleted file mode 100644
index 95387ad1627c..000000000000
--- a/lib/Target/AMDGPU/MCTargetDesc/AMDGPURuntimeMD.cpp
+++ /dev/null
@@ -1,408 +0,0 @@
-//===-- AMDGPURuntimeMD.cpp - Generates runtime metadata ------------------===//
-//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
-//
-//===----------------------------------------------------------------------===//
-//
-/// \file
-///
-/// Generates AMDGPU runtime metadata for YAML mapping.
-//
-//===----------------------------------------------------------------------===//
-//
-
-#include "AMDGPU.h"
-#include "AMDGPURuntimeMetadata.h"
-#include "llvm/ADT/StringRef.h"
-#include "llvm/ADT/StringSwitch.h"
-#include "llvm/IR/Constants.h"
-#include "llvm/IR/DataLayout.h"
-#include "llvm/IR/Module.h"
-#include "llvm/Support/CommandLine.h"
-#include "llvm/Support/raw_ostream.h"
-#include "llvm/Support/YAMLTraits.h"
-#include <vector>
-#include "AMDGPURuntimeMD.h"
-
-using namespace llvm;
-using namespace ::AMDGPU::RuntimeMD;
-
-static cl::opt<bool>
-DumpRuntimeMD("amdgpu-dump-rtmd",
- cl::desc("Dump AMDGPU runtime metadata"));
-
-static cl::opt<bool>
-CheckRuntimeMDParser("amdgpu-check-rtmd-parser", cl::Hidden,
- cl::desc("Check AMDGPU runtime metadata YAML parser"));
-
-LLVM_YAML_IS_FLOW_SEQUENCE_VECTOR(uint8_t)
-LLVM_YAML_IS_FLOW_SEQUENCE_VECTOR(uint32_t)
-LLVM_YAML_IS_FLOW_SEQUENCE_VECTOR(std::string)
-LLVM_YAML_IS_SEQUENCE_VECTOR(Kernel::Metadata)
-LLVM_YAML_IS_SEQUENCE_VECTOR(KernelArg::Metadata)
-
-namespace llvm {
-namespace yaml {
-
-template <> struct MappingTraits<KernelArg::Metadata> {
- static void mapping(IO &YamlIO, KernelArg::Metadata &A) {
- YamlIO.mapRequired(KeyName::ArgSize, A.Size);
- YamlIO.mapRequired(KeyName::ArgAlign, A.Align);
- YamlIO.mapOptional(KeyName::ArgPointeeAlign, A.PointeeAlign, 0U);
- YamlIO.mapRequired(KeyName::ArgKind, A.Kind);
- YamlIO.mapRequired(KeyName::ArgValueType, A.ValueType);
- YamlIO.mapOptional(KeyName::ArgTypeName, A.TypeName, std::string());
- YamlIO.mapOptional(KeyName::ArgName, A.Name, std::string());
- YamlIO.mapOptional(KeyName::ArgAddrQual, A.AddrQual, INVALID_ADDR_QUAL);
- YamlIO.mapOptional(KeyName::ArgAccQual, A.AccQual, INVALID_ACC_QUAL);
- YamlIO.mapOptional(KeyName::ArgIsVolatile, A.IsVolatile, uint8_t(0));
- YamlIO.mapOptional(KeyName::ArgIsConst, A.IsConst, uint8_t(0));
- YamlIO.mapOptional(KeyName::ArgIsRestrict, A.IsRestrict, uint8_t(0));
- YamlIO.mapOptional(KeyName::ArgIsPipe, A.IsPipe, uint8_t(0));
- }
- static const bool flow = true;
-};
-
-template <> struct MappingTraits<Kernel::Metadata> {
- static void mapping(IO &YamlIO, Kernel::Metadata &K) {
- YamlIO.mapRequired(KeyName::KernelName, K.Name);
- YamlIO.mapOptional(KeyName::Language, K.Language, std::string());
- YamlIO.mapOptional(KeyName::LanguageVersion, K.LanguageVersion);
- YamlIO.mapOptional(KeyName::ReqdWorkGroupSize, K.ReqdWorkGroupSize);
- YamlIO.mapOptional(KeyName::WorkGroupSizeHint, K.WorkGroupSizeHint);
- YamlIO.mapOptional(KeyName::VecTypeHint, K.VecTypeHint, std::string());
- YamlIO.mapOptional(KeyName::KernelIndex, K.KernelIndex,
- INVALID_KERNEL_INDEX);
- YamlIO.mapOptional(KeyName::NoPartialWorkGroups, K.NoPartialWorkGroups,
- uint8_t(0));
- YamlIO.mapRequired(KeyName::Args, K.Args);
- }
- static const bool flow = true;
-};
-
-template <> struct MappingTraits<Program::Metadata> {
- static void mapping(IO &YamlIO, Program::Metadata &Prog) {
- YamlIO.mapRequired(KeyName::MDVersion, Prog.MDVersionSeq);
- YamlIO.mapOptional(KeyName::PrintfInfo, Prog.PrintfInfo);
- YamlIO.mapOptional(KeyName::Kernels, Prog.Kernels);
- }
- static const bool flow = true;
-};
-
-} // end namespace yaml
-} // end namespace llvm
-
-// Get a vector of three integer values from MDNode \p Node;
-static std::vector<uint32_t> getThreeInt32(MDNode *Node) {
- assert(Node->getNumOperands() == 3);
- std::vector<uint32_t> V;
- for (const MDOperand &Op : Node->operands()) {
- const ConstantInt *CI = mdconst::extract<ConstantInt>(Op);
- V.push_back(CI->getZExtValue());
- }
- return V;
-}
-
-static std::string getOCLTypeName(Type *Ty, bool Signed) {
- switch (Ty->getTypeID()) {
- case Type::HalfTyID:
- return "half";
- case Type::FloatTyID:
- return "float";
- case Type::DoubleTyID:
- return "double";
- case Type::IntegerTyID: {
- if (!Signed)
- return (Twine('u') + getOCLTypeName(Ty, true)).str();
- unsigned BW = Ty->getIntegerBitWidth();
- switch (BW) {
- case 8:
- return "char";
- case 16:
- return "short";
- case 32:
- return "int";
- case 64:
- return "long";
- default:
- return (Twine('i') + Twine(BW)).str();
- }
- }
- case Type::VectorTyID: {
- VectorType *VecTy = cast<VectorType>(Ty);
- Type *EleTy = VecTy->getElementType();
- unsigned Size = VecTy->getVectorNumElements();
- return (Twine(getOCLTypeName(EleTy, Signed)) + Twine(Size)).str();
- }
- default:
- return "unknown";
- }
-}
-
-static KernelArg::ValueType getRuntimeMDValueType(
- Type *Ty, StringRef TypeName) {
- switch (Ty->getTypeID()) {
- case Type::HalfTyID:
- return KernelArg::F16;
- case Type::FloatTyID:
- return KernelArg::F32;
- case Type::DoubleTyID:
- return KernelArg::F64;
- case Type::IntegerTyID: {
- bool Signed = !TypeName.startswith("u");
- switch (Ty->getIntegerBitWidth()) {
- case 8:
- return Signed ? KernelArg::I8 : KernelArg::U8;
- case 16:
- return Signed ? KernelArg::I16 : KernelArg::U16;
- case 32:
- return Signed ? KernelArg::I32 : KernelArg::U32;
- case 64:
- return Signed ? KernelArg::I64 : KernelArg::U64;
- default:
- // Runtime does not recognize other integer types. Report as struct type.
- return KernelArg::Struct;
- }
- }
- case Type::VectorTyID:
- return getRuntimeMDValueType(Ty->getVectorElementType(), TypeName);
- case Type::PointerTyID:
- return getRuntimeMDValueType(Ty->getPointerElementType(), TypeName);
- default:
- return KernelArg::Struct;
- }
-}
-
-static KernelArg::AddressSpaceQualifer getRuntimeAddrSpace(
- AMDGPUAS::AddressSpaces A) {
- switch (A) {
- case AMDGPUAS::GLOBAL_ADDRESS:
- return KernelArg::Global;
- case AMDGPUAS::CONSTANT_ADDRESS:
- return KernelArg::Constant;
- case AMDGPUAS::LOCAL_ADDRESS:
- return KernelArg::Local;
- case AMDGPUAS::FLAT_ADDRESS:
- return KernelArg::Generic;
- case AMDGPUAS::REGION_ADDRESS:
- return KernelArg::Region;
- default:
- return KernelArg::Private;
- }
-}
-
-static KernelArg::Metadata getRuntimeMDForKernelArg(const DataLayout &DL,
- Type *T, KernelArg::Kind Kind, StringRef BaseTypeName = "",
- StringRef TypeName = "", StringRef ArgName = "", StringRef TypeQual = "",
- StringRef AccQual = "") {
-
- KernelArg::Metadata Arg;
-
- // Set ArgSize and ArgAlign.
- Arg.Size = DL.getTypeAllocSize(T);
- Arg.Align = DL.getABITypeAlignment(T);
- if (auto PT = dyn_cast<PointerType>(T)) {
- auto ET = PT->getElementType();
- if (PT->getAddressSpace() == AMDGPUAS::LOCAL_ADDRESS && ET->isSized())
- Arg.PointeeAlign = DL.getABITypeAlignment(ET);
- }
-
- // Set ArgTypeName.
- Arg.TypeName = TypeName;
-
- // Set ArgName.
- Arg.Name = ArgName;
-
- // Set ArgIsVolatile, ArgIsRestrict, ArgIsConst and ArgIsPipe.
- SmallVector<StringRef, 1> SplitQ;
- TypeQual.split(SplitQ, " ", -1, false /* Drop empty entry */);
-
- for (StringRef KeyName : SplitQ) {
- auto *P = StringSwitch<uint8_t *>(KeyName)
- .Case("volatile", &Arg.IsVolatile)
- .Case("restrict", &Arg.IsRestrict)
- .Case("const", &Arg.IsConst)
- .Case("pipe", &Arg.IsPipe)
- .Default(nullptr);
- if (P)
- *P = 1;
- }
-
- // Set ArgKind.
- Arg.Kind = Kind;
-
- // Set ArgValueType.
- Arg.ValueType = getRuntimeMDValueType(T, BaseTypeName);
-
- // Set ArgAccQual.
- if (!AccQual.empty()) {
- Arg.AccQual = StringSwitch<KernelArg::AccessQualifer>(AccQual)
- .Case("read_only", KernelArg::ReadOnly)
- .Case("write_only", KernelArg::WriteOnly)
- .Case("read_write", KernelArg::ReadWrite)
- .Default(KernelArg::AccNone);
- }
-
- // Set ArgAddrQual.
- if (auto *PT = dyn_cast<PointerType>(T)) {
- Arg.AddrQual = getRuntimeAddrSpace(static_cast<AMDGPUAS::AddressSpaces>(
- PT->getAddressSpace()));
- }
-
- return Arg;
-}
-
-static Kernel::Metadata getRuntimeMDForKernel(const Function &F) {
- Kernel::Metadata Kernel;
- Kernel.Name = F.getName();
- auto &M = *F.getParent();
-
- // Set Language and LanguageVersion.
- if (auto MD = M.getNamedMetadata("opencl.ocl.version")) {
- if (MD->getNumOperands() != 0) {
- auto Node = MD->getOperand(0);
- if (Node->getNumOperands() > 1) {
- Kernel.Language = "OpenCL C";
- uint16_t Major = mdconst::extract<ConstantInt>(Node->getOperand(0))
- ->getZExtValue();
- uint16_t Minor = mdconst::extract<ConstantInt>(Node->getOperand(1))
- ->getZExtValue();
- Kernel.LanguageVersion.push_back(Major);
- Kernel.LanguageVersion.push_back(Minor);
- }
- }
- }
-
- const DataLayout &DL = F.getParent()->getDataLayout();
- for (auto &Arg : F.args()) {
- unsigned I = Arg.getArgNo();
- Type *T = Arg.getType();
- auto TypeName = dyn_cast<MDString>(F.getMetadata(
- "kernel_arg_type")->getOperand(I))->getString();
- auto BaseTypeName = cast<MDString>(F.getMetadata(
- "kernel_arg_base_type")->getOperand(I))->getString();
- StringRef ArgName;
- if (auto ArgNameMD = F.getMetadata("kernel_arg_name"))
- ArgName = cast<MDString>(ArgNameMD->getOperand(I))->getString();
- auto TypeQual = cast<MDString>(F.getMetadata(
- "kernel_arg_type_qual")->getOperand(I))->getString();
- auto AccQual = cast<MDString>(F.getMetadata(
- "kernel_arg_access_qual")->getOperand(I))->getString();
- KernelArg::Kind Kind;
- if (TypeQual.find("pipe") != StringRef::npos)
- Kind = KernelArg::Pipe;
- else Kind = StringSwitch<KernelArg::Kind>(BaseTypeName)
- .Case("sampler_t", KernelArg::Sampler)
- .Case("queue_t", KernelArg::Queue)
- .Cases("image1d_t", "image1d_array_t", "image1d_buffer_t",
- "image2d_t" , "image2d_array_t", KernelArg::Image)
- .Cases("image2d_depth_t", "image2d_array_depth_t",
- "image2d_msaa_t", "image2d_array_msaa_t",
- "image2d_msaa_depth_t", KernelArg::Image)
- .Cases("image2d_array_msaa_depth_t", "image3d_t",
- KernelArg::Image)
- .Default(isa<PointerType>(T) ?
- (T->getPointerAddressSpace() == AMDGPUAS::LOCAL_ADDRESS ?
- KernelArg::DynamicSharedPointer :
- KernelArg::GlobalBuffer) :
- KernelArg::ByValue);
- Kernel.Args.emplace_back(getRuntimeMDForKernelArg(DL, T, Kind,
- BaseTypeName, TypeName, ArgName, TypeQual, AccQual));
- }
-
- // Emit hidden kernel arguments for OpenCL kernels.
- if (F.getParent()->getNamedMetadata("opencl.ocl.version")) {
- auto Int64T = Type::getInt64Ty(F.getContext());
- Kernel.Args.emplace_back(getRuntimeMDForKernelArg(DL, Int64T,
- KernelArg::HiddenGlobalOffsetX));
- Kernel.Args.emplace_back(getRuntimeMDForKernelArg(DL, Int64T,
- KernelArg::HiddenGlobalOffsetY));
- Kernel.Args.emplace_back(getRuntimeMDForKernelArg(DL, Int64T,
- KernelArg::HiddenGlobalOffsetZ));
- if (F.getParent()->getNamedMetadata("llvm.printf.fmts")) {
- auto Int8PtrT = Type::getInt8PtrTy(F.getContext(),
- KernelArg::Global);
- Kernel.Args.emplace_back(getRuntimeMDForKernelArg(DL, Int8PtrT,
- KernelArg::HiddenPrintfBuffer));
- }
- }
-
- // Set ReqdWorkGroupSize, WorkGroupSizeHint, and VecTypeHint.
- if (auto RWGS = F.getMetadata("reqd_work_group_size"))
- Kernel.ReqdWorkGroupSize = getThreeInt32(RWGS);
-
- if (auto WGSH = F.getMetadata("work_group_size_hint"))
- Kernel.WorkGroupSizeHint = getThreeInt32(WGSH);
-
- if (auto VTH = F.getMetadata("vec_type_hint"))
- Kernel.VecTypeHint = getOCLTypeName(cast<ValueAsMetadata>(
- VTH->getOperand(0))->getType(), mdconst::extract<ConstantInt>(
- VTH->getOperand(1))->getZExtValue());
-
- return Kernel;
-}
-
-Program::Metadata::Metadata(const std::string &YAML) {
- yaml::Input Input(YAML);
- Input >> *this;
-}
-
-std::string Program::Metadata::toYAML(void) {
- std::string Text;
- raw_string_ostream Stream(Text);
- yaml::Output Output(Stream, nullptr, INT_MAX /* do not wrap line */);
- Output << *this;
- return Stream.str();
-}
-
-Program::Metadata Program::Metadata::fromYAML(const std::string &S) {
- return Program::Metadata(S);
-}
-
-// Check if the YAML string can be parsed.
-static void checkRuntimeMDYAMLString(const std::string &YAML) {
- auto P = Program::Metadata::fromYAML(YAML);
- auto S = P.toYAML();
- llvm::errs() << "AMDGPU runtime metadata parser test "
- << (YAML == S ? "passes" : "fails") << ".\n";
- if (YAML != S) {
- llvm::errs() << "First output: " << YAML << '\n'
- << "Second output: " << S << '\n';
- }
-}
-
-std::string llvm::getRuntimeMDYAMLString(Module &M) {
- Program::Metadata Prog;
- Prog.MDVersionSeq.push_back(MDVersion);
- Prog.MDVersionSeq.push_back(MDRevision);
-
- // Set PrintfInfo.
- if (auto MD = M.getNamedMetadata("llvm.printf.fmts")) {
- for (unsigned I = 0; I < MD->getNumOperands(); ++I) {
- auto Node = MD->getOperand(I);
- if (Node->getNumOperands() > 0)
- Prog.PrintfInfo.push_back(cast<MDString>(Node->getOperand(0))
- ->getString());
- }
- }
-
- // Set Kernels.
- for (auto &F: M.functions()) {
- if (!F.getMetadata("kernel_arg_type"))
- continue;
- Prog.Kernels.emplace_back(getRuntimeMDForKernel(F));
- }
-
- auto YAML = Prog.toYAML();
-
- if (DumpRuntimeMD)
- llvm::errs() << "AMDGPU runtime metadata:\n" << YAML << '\n';
-
- if (CheckRuntimeMDParser)
- checkRuntimeMDYAMLString(YAML);
-
- return YAML;
-}
diff --git a/lib/Target/AMDGPU/MCTargetDesc/AMDGPURuntimeMD.h b/lib/Target/AMDGPU/MCTargetDesc/AMDGPURuntimeMD.h
deleted file mode 100644
index a92fdd4bebc2..000000000000
--- a/lib/Target/AMDGPU/MCTargetDesc/AMDGPURuntimeMD.h
+++ /dev/null
@@ -1,26 +0,0 @@
-//===- AMDGPURuntimeMD.h - Generate runtime metadata ---------------*- C++ -*-//
-//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
-//
-//===----------------------------------------------------------------------===//
-//
-// This file declares functions for generating runtime metadata.
-//
-//===----------------------------------------------------------------------===//
-
-#ifndef LLVM_LIB_TARGET_AMDGPU_MCTARGETDESC_AMDGPURUNTIMEMD_H
-#define LLVM_LIB_TARGET_AMDGPU_MCTARGETDESC_AMDGPURUNTIMEMD_H
-
-#include <string>
-
-namespace llvm {
-class Module;
-
-// Get runtime metadata as YAML string.
-std::string getRuntimeMDYAMLString(Module &M);
-
-}
-#endif
diff --git a/lib/Target/AMDGPU/MCTargetDesc/AMDGPUTargetStreamer.cpp b/lib/Target/AMDGPU/MCTargetDesc/AMDGPUTargetStreamer.cpp
index 3392183d33c3..8dc863f723e2 100644
--- a/lib/Target/AMDGPU/MCTargetDesc/AMDGPUTargetStreamer.cpp
+++ b/lib/Target/AMDGPU/MCTargetDesc/AMDGPUTargetStreamer.cpp
@@ -27,7 +27,6 @@
#include "llvm/MC/MCSectionELF.h"
#include "llvm/Support/ELF.h"
#include "llvm/Support/FormattedStream.h"
-#include "AMDGPURuntimeMD.h"
namespace llvm {
#include "AMDGPUPTNote.h"
@@ -36,9 +35,27 @@ namespace llvm {
using namespace llvm;
using namespace llvm::AMDGPU;
+//===----------------------------------------------------------------------===//
+// AMDGPUTargetStreamer
+//===----------------------------------------------------------------------===//
+
AMDGPUTargetStreamer::AMDGPUTargetStreamer(MCStreamer &S)
: MCTargetStreamer(S) {}
+void AMDGPUTargetStreamer::EmitStartOfCodeObjectMetadata(const Module &Mod) {
+ CodeObjectMetadataStreamer.begin(Mod);
+}
+
+void AMDGPUTargetStreamer::EmitKernelCodeObjectMetadata(
+ const Function &Func, const amd_kernel_code_t &KernelCode) {
+ CodeObjectMetadataStreamer.emitKernel(Func, KernelCode);
+}
+
+void AMDGPUTargetStreamer::EmitEndOfCodeObjectMetadata() {
+ CodeObjectMetadataStreamer.end();
+ EmitCodeObjectMetadata(CodeObjectMetadataStreamer.toYamlString().get());
+}
+
//===----------------------------------------------------------------------===//
// AMDGPUTargetAsmStreamer
//===----------------------------------------------------------------------===//
@@ -93,16 +110,16 @@ void AMDGPUTargetAsmStreamer::EmitAMDGPUHsaProgramScopeGlobal(
OS << "\t.amdgpu_hsa_program_global " << GlobalName << '\n';
}
-void AMDGPUTargetAsmStreamer::EmitRuntimeMetadata(Module &M) {
- OS << "\t.amdgpu_runtime_metadata\n";
- OS << getRuntimeMDYAMLString(M);
- OS << "\n\t.end_amdgpu_runtime_metadata\n";
-}
+bool AMDGPUTargetAsmStreamer::EmitCodeObjectMetadata(StringRef YamlString) {
+ auto VerifiedYamlString = CodeObjectMetadataStreamer.toYamlString(YamlString);
+ if (!VerifiedYamlString)
+ return false;
-void AMDGPUTargetAsmStreamer::EmitRuntimeMetadata(StringRef Metadata) {
- OS << "\t.amdgpu_runtime_metadata";
- OS << Metadata;
- OS << "\t.end_amdgpu_runtime_metadata\n";
+ OS << '\t' << AMDGPU::CodeObject::MetadataAssemblerDirectiveBegin << '\n';
+ OS << VerifiedYamlString.get();
+ OS << '\t' << AMDGPU::CodeObject::MetadataAssemblerDirectiveEnd << '\n';
+
+ return true;
}
//===----------------------------------------------------------------------===//
@@ -116,22 +133,21 @@ MCELFStreamer &AMDGPUTargetELFStreamer::getStreamer() {
return static_cast<MCELFStreamer &>(Streamer);
}
-void
-AMDGPUTargetELFStreamer::EmitAMDGPUNote(const MCExpr* DescSZ,
- PT_NOTE::NoteType Type,
- std::function<void(MCELFStreamer &)> EmitDesc) {
+void AMDGPUTargetELFStreamer::EmitAMDGPUNote(
+ const MCExpr *DescSZ, ElfNote::NoteType Type,
+ function_ref<void(MCELFStreamer &)> EmitDesc) {
auto &S = getStreamer();
auto &Context = S.getContext();
- auto NameSZ = sizeof(PT_NOTE::NoteName);
+ auto NameSZ = sizeof(ElfNote::NoteName);
S.PushSection();
S.SwitchSection(Context.getELFSection(
- PT_NOTE::SectionName, ELF::SHT_NOTE, ELF::SHF_ALLOC));
+ ElfNote::SectionName, ELF::SHT_NOTE, ELF::SHF_ALLOC));
S.EmitIntValue(NameSZ, 4); // namesz
S.EmitValue(DescSZ, 4); // descz
- S.EmitIntValue(Type, 4); // type
- S.EmitBytes(StringRef(PT_NOTE::NoteName, NameSZ)); // name
+ S.EmitIntValue(Type, 4); // type
+ S.EmitBytes(StringRef(ElfNote::NoteName, NameSZ)); // name
S.EmitValueToAlignment(4, 0, 1, 0); // padding 0
EmitDesc(S); // desc
S.EmitValueToAlignment(4, 0, 1, 0); // padding 0
@@ -144,7 +160,7 @@ AMDGPUTargetELFStreamer::EmitDirectiveHSACodeObjectVersion(uint32_t Major,
EmitAMDGPUNote(
MCConstantExpr::create(8, getContext()),
- PT_NOTE::NT_AMDGPU_HSA_CODE_OBJECT_VERSION,
+ ElfNote::NT_AMDGPU_HSA_CODE_OBJECT_VERSION,
[&](MCELFStreamer &OS){
OS.EmitIntValue(Major, 4);
OS.EmitIntValue(Minor, 4);
@@ -160,14 +176,14 @@ AMDGPUTargetELFStreamer::EmitDirectiveHSACodeObjectISA(uint32_t Major,
StringRef ArchName) {
uint16_t VendorNameSize = VendorName.size() + 1;
uint16_t ArchNameSize = ArchName.size() + 1;
-
+
unsigned DescSZ = sizeof(VendorNameSize) + sizeof(ArchNameSize) +
sizeof(Major) + sizeof(Minor) + sizeof(Stepping) +
VendorNameSize + ArchNameSize;
EmitAMDGPUNote(
MCConstantExpr::create(DescSZ, getContext()),
- PT_NOTE::NT_AMDGPU_HSA_ISA,
+ ElfNote::NT_AMDGPU_HSA_ISA,
[&](MCELFStreamer &OS) {
OS.EmitIntValue(VendorNameSize, 2);
OS.EmitIntValue(ArchNameSize, 2);
@@ -216,7 +232,11 @@ void AMDGPUTargetELFStreamer::EmitAMDGPUHsaProgramScopeGlobal(
Symbol->setBinding(ELF::STB_GLOBAL);
}
-void AMDGPUTargetELFStreamer::EmitRuntimeMetadata(StringRef Metadata) {
+bool AMDGPUTargetELFStreamer::EmitCodeObjectMetadata(StringRef YamlString) {
+ auto VerifiedYamlString = CodeObjectMetadataStreamer.toYamlString(YamlString);
+ if (!VerifiedYamlString)
+ return false;
+
// Create two labels to mark the beginning and end of the desc field
// and a MCExpr to calculate the size of the desc field.
auto &Context = getContext();
@@ -228,15 +248,13 @@ void AMDGPUTargetELFStreamer::EmitRuntimeMetadata(StringRef Metadata) {
EmitAMDGPUNote(
DescSZ,
- PT_NOTE::NT_AMDGPU_HSA_RUNTIME_METADATA,
+ ElfNote::NT_AMDGPU_HSA_CODE_OBJECT_METADATA,
[&](MCELFStreamer &OS) {
OS.EmitLabel(DescBegin);
- OS.EmitBytes(Metadata);
+ OS.EmitBytes(VerifiedYamlString.get());
OS.EmitLabel(DescEnd);
}
);
-}
-void AMDGPUTargetELFStreamer::EmitRuntimeMetadata(Module &M) {
- EmitRuntimeMetadata(getRuntimeMDYAMLString(M));
+ return true;
}
diff --git a/lib/Target/AMDGPU/MCTargetDesc/AMDGPUTargetStreamer.h b/lib/Target/AMDGPU/MCTargetDesc/AMDGPUTargetStreamer.h
index e2f20586903d..5c588bbded9c 100644
--- a/lib/Target/AMDGPU/MCTargetDesc/AMDGPUTargetStreamer.h
+++ b/lib/Target/AMDGPU/MCTargetDesc/AMDGPUTargetStreamer.h
@@ -10,6 +10,7 @@
#ifndef LLVM_LIB_TARGET_AMDGPU_MCTARGETDESC_AMDGPUTARGETSTREAMER_H
#define LLVM_LIB_TARGET_AMDGPU_MCTARGETDESC_AMDGPUTARGETSTREAMER_H
+#include "AMDGPUCodeObjectMetadataStreamer.h"
#include "AMDKernelCodeT.h"
#include "llvm/MC/MCStreamer.h"
@@ -26,6 +27,7 @@ class Type;
class AMDGPUTargetStreamer : public MCTargetStreamer {
protected:
+ AMDGPU::CodeObject::MetadataStreamer CodeObjectMetadataStreamer;
MCContext &getContext() const { return Streamer.getContext(); }
public:
@@ -46,12 +48,18 @@ public:
virtual void EmitAMDGPUHsaProgramScopeGlobal(StringRef GlobalName) = 0;
- virtual void EmitRuntimeMetadata(Module &M) = 0;
+ virtual void EmitStartOfCodeObjectMetadata(const Module &Mod);
- virtual void EmitRuntimeMetadata(StringRef Metadata) = 0;
+ virtual void EmitKernelCodeObjectMetadata(
+ const Function &Func, const amd_kernel_code_t &KernelCode);
+
+ virtual void EmitEndOfCodeObjectMetadata();
+
+ /// \returns True on success, false on failure.
+ virtual bool EmitCodeObjectMetadata(StringRef YamlString) = 0;
};
-class AMDGPUTargetAsmStreamer : public AMDGPUTargetStreamer {
+class AMDGPUTargetAsmStreamer final : public AMDGPUTargetStreamer {
formatted_raw_ostream &OS;
public:
AMDGPUTargetAsmStreamer(MCStreamer &S, formatted_raw_ostream &OS);
@@ -70,17 +78,16 @@ public:
void EmitAMDGPUHsaProgramScopeGlobal(StringRef GlobalName) override;
- void EmitRuntimeMetadata(Module &M) override;
-
- void EmitRuntimeMetadata(StringRef Metadata) override;
+ /// \returns True on success, false on failure.
+ bool EmitCodeObjectMetadata(StringRef YamlString) override;
};
-class AMDGPUTargetELFStreamer : public AMDGPUTargetStreamer {
+class AMDGPUTargetELFStreamer final : public AMDGPUTargetStreamer {
MCStreamer &Streamer;
- void EmitAMDGPUNote(const MCExpr* DescSize,
- AMDGPU::PT_NOTE::NoteType Type,
- std::function<void(MCELFStreamer &)> EmitDesc);
+ void EmitAMDGPUNote(const MCExpr *DescSize,
+ AMDGPU::ElfNote::NoteType Type,
+ function_ref<void(MCELFStreamer &)> EmitDesc);
public:
AMDGPUTargetELFStreamer(MCStreamer &S);
@@ -102,9 +109,8 @@ public:
void EmitAMDGPUHsaProgramScopeGlobal(StringRef GlobalName) override;
- void EmitRuntimeMetadata(Module &M) override;
-
- void EmitRuntimeMetadata(StringRef Metadata) override;
+ /// \returns True on success, false on failure.
+ bool EmitCodeObjectMetadata(StringRef YamlString) override;
};
}
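The header changes replace the two EmitRuntimeMetadata overloads with a three-phase protocol: open module-level metadata, append one record per kernel alongside its amd_kernel_code_t, then serialize the whole thing. A sketch of the call order this interface suggests; the Kernels container and the driver function are illustrative, not from the patch:

#include "AMDGPUTargetStreamer.h"
#include "AMDKernelCodeT.h"
#include "llvm/ADT/ArrayRef.h"
#include <utility>
using namespace llvm;

// Assumed driver-side ordering for the new streamer interface.
static void emitAllCodeObjectMetadata(
    AMDGPUTargetStreamer &TS, const Module &M,
    ArrayRef<std::pair<const Function *, amd_kernel_code_t>> Kernels) {
  TS.EmitStartOfCodeObjectMetadata(M);          // module-level fields
  for (const auto &K : Kernels)                 // one entry per kernel
    TS.EmitKernelCodeObjectMetadata(*K.first, K.second);
  TS.EmitEndOfCodeObjectMetadata();             // serialize and emit note
}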
diff --git a/lib/Target/AMDGPU/MCTargetDesc/CMakeLists.txt b/lib/Target/AMDGPU/MCTargetDesc/CMakeLists.txt
index 8a6d00ce69ed..09e3efad10af 100644
--- a/lib/Target/AMDGPU/MCTargetDesc/CMakeLists.txt
+++ b/lib/Target/AMDGPU/MCTargetDesc/CMakeLists.txt
@@ -1,13 +1,12 @@
-
add_llvm_library(LLVMAMDGPUDesc
AMDGPUAsmBackend.cpp
+ AMDGPUCodeObjectMetadataStreamer.cpp
AMDGPUELFObjectWriter.cpp
AMDGPUELFStreamer.cpp
+ AMDGPUMCAsmInfo.cpp
AMDGPUMCCodeEmitter.cpp
AMDGPUMCTargetDesc.cpp
- AMDGPUMCAsmInfo.cpp
- AMDGPURuntimeMD.cpp
AMDGPUTargetStreamer.cpp
R600MCCodeEmitter.cpp
SIMCCodeEmitter.cpp
- )
+)
diff --git a/lib/Target/AMDGPU/MCTargetDesc/SIMCCodeEmitter.cpp b/lib/Target/AMDGPU/MCTargetDesc/SIMCCodeEmitter.cpp
index 0c5bb0648a16..bda0928036fd 100644
--- a/lib/Target/AMDGPU/MCTargetDesc/SIMCCodeEmitter.cpp
+++ b/lib/Target/AMDGPU/MCTargetDesc/SIMCCodeEmitter.cpp
@@ -220,13 +220,35 @@ uint32_t SIMCCodeEmitter::getLitEncoding(const MCOperand &MO,
Imm = MO.getImm();
}
- switch (AMDGPU::getOperandSize(OpInfo)) {
- case 4:
+ switch (OpInfo.OperandType) {
+ case AMDGPU::OPERAND_REG_IMM_INT32:
+ case AMDGPU::OPERAND_REG_IMM_FP32:
+ case AMDGPU::OPERAND_REG_INLINE_C_INT32:
+ case AMDGPU::OPERAND_REG_INLINE_C_FP32:
return getLit32Encoding(static_cast<uint32_t>(Imm), STI);
- case 8:
+
+ case AMDGPU::OPERAND_REG_IMM_INT64:
+ case AMDGPU::OPERAND_REG_IMM_FP64:
+ case AMDGPU::OPERAND_REG_INLINE_C_INT64:
+ case AMDGPU::OPERAND_REG_INLINE_C_FP64:
return getLit64Encoding(static_cast<uint64_t>(Imm), STI);
- case 2:
+
+ case AMDGPU::OPERAND_REG_IMM_INT16:
+ case AMDGPU::OPERAND_REG_IMM_FP16:
+ case AMDGPU::OPERAND_REG_INLINE_C_INT16:
+ case AMDGPU::OPERAND_REG_INLINE_C_FP16:
+    // FIXME: Is this correct? What do inline immediates do for an f16 src
+    // on SI, which has no f16 support?
return getLit16Encoding(static_cast<uint16_t>(Imm), STI);
+
+ case AMDGPU::OPERAND_REG_INLINE_C_V2INT16:
+ case AMDGPU::OPERAND_REG_INLINE_C_V2FP16: {
+ uint16_t Lo16 = static_cast<uint16_t>(Imm);
+ assert(Lo16 == static_cast<uint16_t>(Imm >> 16));
+ uint32_t Encoding = getLit16Encoding(Lo16, STI);
+ assert(Encoding != 255 && "packed constants can only be inline immediates");
+ return Encoding;
+ }
default:
llvm_unreachable("invalid operand size");
}
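The new packed-operand cases rely on an invariant rather than computing anything: both 16-bit halves of the immediate must match and must themselves be inline-encodable, since packed operands have no literal slot (encoding 255 is the literal escape). A standalone restatement of that check, with getLit16Encoding passed in as a stand-in for the member function:

#include <cstdint>

// True iff Imm is usable as a packed v2i16/v2f16 inline immediate:
// identical halves, and the half encodes as an inline constant
// (255 would mean "use a literal", which packed operands lack).
static bool isPackedInlineImm(uint32_t Imm,
                              uint32_t (*getLit16Encoding)(uint16_t)) {
  uint16_t Lo16 = static_cast<uint16_t>(Imm);
  uint16_t Hi16 = static_cast<uint16_t>(Imm >> 16);
  return Lo16 == Hi16 && getLit16Encoding(Lo16) != 255;
}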
diff --git a/lib/Target/AMDGPU/MIMGInstructions.td b/lib/Target/AMDGPU/MIMGInstructions.td
index 46803e555711..a515eecc222a 100644
--- a/lib/Target/AMDGPU/MIMGInstructions.td
+++ b/lib/Target/AMDGPU/MIMGInstructions.td
@@ -475,106 +475,6 @@ class ImageAtomicCmpSwapPattern<MIMG opcode, ValueType vt> : Pat <
sub0)
>;
-// ======= SI Image Intrinsics ================
-
-// Image load
-defm : ImagePatterns<int_SI_image_load, "IMAGE_LOAD">;
-defm : ImagePatterns<int_SI_image_load_mip, "IMAGE_LOAD_MIP">;
-def : ImagePattern<int_SI_getresinfo, IMAGE_GET_RESINFO_V4_V1, i32>;
-
-// Basic sample
-defm : SampleRawPatterns<int_SI_image_sample, "IMAGE_SAMPLE">;
-defm : SampleRawPatterns<int_SI_image_sample_cl, "IMAGE_SAMPLE_CL">;
-defm : SampleRawPatterns<int_SI_image_sample_d, "IMAGE_SAMPLE_D">;
-defm : SampleRawPatterns<int_SI_image_sample_d_cl, "IMAGE_SAMPLE_D_CL">;
-defm : SampleRawPatterns<int_SI_image_sample_l, "IMAGE_SAMPLE_L">;
-defm : SampleRawPatterns<int_SI_image_sample_b, "IMAGE_SAMPLE_B">;
-defm : SampleRawPatterns<int_SI_image_sample_b_cl, "IMAGE_SAMPLE_B_CL">;
-defm : SampleRawPatterns<int_SI_image_sample_lz, "IMAGE_SAMPLE_LZ">;
-defm : SampleRawPatterns<int_SI_image_sample_cd, "IMAGE_SAMPLE_CD">;
-defm : SampleRawPatterns<int_SI_image_sample_cd_cl, "IMAGE_SAMPLE_CD_CL">;
-
-// Sample with comparison
-defm : SampleRawPatterns<int_SI_image_sample_c, "IMAGE_SAMPLE_C">;
-defm : SampleRawPatterns<int_SI_image_sample_c_cl, "IMAGE_SAMPLE_C_CL">;
-defm : SampleRawPatterns<int_SI_image_sample_c_d, "IMAGE_SAMPLE_C_D">;
-defm : SampleRawPatterns<int_SI_image_sample_c_d_cl, "IMAGE_SAMPLE_C_D_CL">;
-defm : SampleRawPatterns<int_SI_image_sample_c_l, "IMAGE_SAMPLE_C_L">;
-defm : SampleRawPatterns<int_SI_image_sample_c_b, "IMAGE_SAMPLE_C_B">;
-defm : SampleRawPatterns<int_SI_image_sample_c_b_cl, "IMAGE_SAMPLE_C_B_CL">;
-defm : SampleRawPatterns<int_SI_image_sample_c_lz, "IMAGE_SAMPLE_C_LZ">;
-defm : SampleRawPatterns<int_SI_image_sample_c_cd, "IMAGE_SAMPLE_C_CD">;
-defm : SampleRawPatterns<int_SI_image_sample_c_cd_cl, "IMAGE_SAMPLE_C_CD_CL">;
-
-// Sample with offsets
-defm : SampleRawPatterns<int_SI_image_sample_o, "IMAGE_SAMPLE_O">;
-defm : SampleRawPatterns<int_SI_image_sample_cl_o, "IMAGE_SAMPLE_CL_O">;
-defm : SampleRawPatterns<int_SI_image_sample_d_o, "IMAGE_SAMPLE_D_O">;
-defm : SampleRawPatterns<int_SI_image_sample_d_cl_o, "IMAGE_SAMPLE_D_CL_O">;
-defm : SampleRawPatterns<int_SI_image_sample_l_o, "IMAGE_SAMPLE_L_O">;
-defm : SampleRawPatterns<int_SI_image_sample_b_o, "IMAGE_SAMPLE_B_O">;
-defm : SampleRawPatterns<int_SI_image_sample_b_cl_o, "IMAGE_SAMPLE_B_CL_O">;
-defm : SampleRawPatterns<int_SI_image_sample_lz_o, "IMAGE_SAMPLE_LZ_O">;
-defm : SampleRawPatterns<int_SI_image_sample_cd_o, "IMAGE_SAMPLE_CD_O">;
-defm : SampleRawPatterns<int_SI_image_sample_cd_cl_o, "IMAGE_SAMPLE_CD_CL_O">;
-
-// Sample with comparison and offsets
-defm : SampleRawPatterns<int_SI_image_sample_c_o, "IMAGE_SAMPLE_C_O">;
-defm : SampleRawPatterns<int_SI_image_sample_c_cl_o, "IMAGE_SAMPLE_C_CL_O">;
-defm : SampleRawPatterns<int_SI_image_sample_c_d_o, "IMAGE_SAMPLE_C_D_O">;
-defm : SampleRawPatterns<int_SI_image_sample_c_d_cl_o, "IMAGE_SAMPLE_C_D_CL_O">;
-defm : SampleRawPatterns<int_SI_image_sample_c_l_o, "IMAGE_SAMPLE_C_L_O">;
-defm : SampleRawPatterns<int_SI_image_sample_c_b_o, "IMAGE_SAMPLE_C_B_O">;
-defm : SampleRawPatterns<int_SI_image_sample_c_b_cl_o, "IMAGE_SAMPLE_C_B_CL_O">;
-defm : SampleRawPatterns<int_SI_image_sample_c_lz_o, "IMAGE_SAMPLE_C_LZ_O">;
-defm : SampleRawPatterns<int_SI_image_sample_c_cd_o, "IMAGE_SAMPLE_C_CD_O">;
-defm : SampleRawPatterns<int_SI_image_sample_c_cd_cl_o, "IMAGE_SAMPLE_C_CD_CL_O">;
-
-// Gather opcodes
-// Only the variants which make sense are defined.
-def : SampleRawPattern<int_SI_gather4, IMAGE_GATHER4_V4_V2, v2i32>;
-def : SampleRawPattern<int_SI_gather4, IMAGE_GATHER4_V4_V4, v4i32>;
-def : SampleRawPattern<int_SI_gather4_cl, IMAGE_GATHER4_CL_V4_V4, v4i32>;
-def : SampleRawPattern<int_SI_gather4_l, IMAGE_GATHER4_L_V4_V4, v4i32>;
-def : SampleRawPattern<int_SI_gather4_b, IMAGE_GATHER4_B_V4_V4, v4i32>;
-def : SampleRawPattern<int_SI_gather4_b_cl, IMAGE_GATHER4_B_CL_V4_V4, v4i32>;
-def : SampleRawPattern<int_SI_gather4_b_cl, IMAGE_GATHER4_B_CL_V4_V8, v8i32>;
-def : SampleRawPattern<int_SI_gather4_lz, IMAGE_GATHER4_LZ_V4_V2, v2i32>;
-def : SampleRawPattern<int_SI_gather4_lz, IMAGE_GATHER4_LZ_V4_V4, v4i32>;
-
-def : SampleRawPattern<int_SI_gather4_c, IMAGE_GATHER4_C_V4_V4, v4i32>;
-def : SampleRawPattern<int_SI_gather4_c_cl, IMAGE_GATHER4_C_CL_V4_V4, v4i32>;
-def : SampleRawPattern<int_SI_gather4_c_cl, IMAGE_GATHER4_C_CL_V4_V8, v8i32>;
-def : SampleRawPattern<int_SI_gather4_c_l, IMAGE_GATHER4_C_L_V4_V4, v4i32>;
-def : SampleRawPattern<int_SI_gather4_c_l, IMAGE_GATHER4_C_L_V4_V8, v8i32>;
-def : SampleRawPattern<int_SI_gather4_c_b, IMAGE_GATHER4_C_B_V4_V4, v4i32>;
-def : SampleRawPattern<int_SI_gather4_c_b, IMAGE_GATHER4_C_B_V4_V8, v8i32>;
-def : SampleRawPattern<int_SI_gather4_c_b_cl, IMAGE_GATHER4_C_B_CL_V4_V8, v8i32>;
-def : SampleRawPattern<int_SI_gather4_c_lz, IMAGE_GATHER4_C_LZ_V4_V4, v4i32>;
-
-def : SampleRawPattern<int_SI_gather4_o, IMAGE_GATHER4_O_V4_V4, v4i32>;
-def : SampleRawPattern<int_SI_gather4_cl_o, IMAGE_GATHER4_CL_O_V4_V4, v4i32>;
-def : SampleRawPattern<int_SI_gather4_cl_o, IMAGE_GATHER4_CL_O_V4_V8, v8i32>;
-def : SampleRawPattern<int_SI_gather4_l_o, IMAGE_GATHER4_L_O_V4_V4, v4i32>;
-def : SampleRawPattern<int_SI_gather4_l_o, IMAGE_GATHER4_L_O_V4_V8, v8i32>;
-def : SampleRawPattern<int_SI_gather4_b_o, IMAGE_GATHER4_B_O_V4_V4, v4i32>;
-def : SampleRawPattern<int_SI_gather4_b_o, IMAGE_GATHER4_B_O_V4_V8, v8i32>;
-def : SampleRawPattern<int_SI_gather4_b_cl_o, IMAGE_GATHER4_B_CL_O_V4_V8, v8i32>;
-def : SampleRawPattern<int_SI_gather4_lz_o, IMAGE_GATHER4_LZ_O_V4_V4, v4i32>;
-
-def : SampleRawPattern<int_SI_gather4_c_o, IMAGE_GATHER4_C_O_V4_V4, v4i32>;
-def : SampleRawPattern<int_SI_gather4_c_o, IMAGE_GATHER4_C_O_V4_V8, v8i32>;
-def : SampleRawPattern<int_SI_gather4_c_cl_o, IMAGE_GATHER4_C_CL_O_V4_V8, v8i32>;
-def : SampleRawPattern<int_SI_gather4_c_l_o, IMAGE_GATHER4_C_L_O_V4_V8, v8i32>;
-def : SampleRawPattern<int_SI_gather4_c_b_o, IMAGE_GATHER4_C_B_O_V4_V8, v8i32>;
-def : SampleRawPattern<int_SI_gather4_c_b_cl_o, IMAGE_GATHER4_C_B_CL_O_V4_V8, v8i32>;
-def : SampleRawPattern<int_SI_gather4_c_lz_o, IMAGE_GATHER4_C_LZ_O_V4_V4, v4i32>;
-def : SampleRawPattern<int_SI_gather4_c_lz_o, IMAGE_GATHER4_C_LZ_O_V4_V8, v8i32>;
-
-def : SampleRawPattern<int_SI_getlod, IMAGE_GET_LOD_V4_V1, i32>;
-def : SampleRawPattern<int_SI_getlod, IMAGE_GET_LOD_V4_V2, v2i32>;
-def : SampleRawPattern<int_SI_getlod, IMAGE_GET_LOD_V4_V4, v4i32>;
-
// ======= amdgcn Image Intrinsics ==============
// Image load
diff --git a/lib/Target/AMDGPU/Processors.td b/lib/Target/AMDGPU/Processors.td
index 3c07cc76b9a1..0e4eda982139 100644
--- a/lib/Target/AMDGPU/Processors.td
+++ b/lib/Target/AMDGPU/Processors.td
@@ -187,3 +187,10 @@ def : ProcessorModel<"gfx810", SIQuarterSpeedModel,
[FeatureISAVersion8_1_0]
>;
+def : ProcessorModel<"gfx900", SIQuarterSpeedModel,
+ [FeatureGFX9, FeatureISAVersion9_0_0, FeatureLDSBankCount32]
+>;
+
+def : ProcessorModel<"gfx901", SIQuarterSpeedModel,
+ [FeatureGFX9, FeatureXNACK, FeatureISAVersion9_0_1, FeatureLDSBankCount32]
+>;
diff --git a/lib/Target/AMDGPU/R600ControlFlowFinalizer.cpp b/lib/Target/AMDGPU/R600ControlFlowFinalizer.cpp
index 45b36d3d3ebb..811b905588b4 100644
--- a/lib/Target/AMDGPU/R600ControlFlowFinalizer.cpp
+++ b/lib/Target/AMDGPU/R600ControlFlowFinalizer.cpp
@@ -19,10 +19,26 @@
#include "R600InstrInfo.h"
#include "R600MachineFunctionInfo.h"
#include "R600RegisterInfo.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/ADT/STLExtras.h"
+#include "llvm/ADT/StringRef.h"
+#include "llvm/CodeGen/MachineBasicBlock.h"
+#include "llvm/CodeGen/MachineFunction.h"
#include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/CodeGen/MachineInstr.h"
#include "llvm/CodeGen/MachineInstrBuilder.h"
-#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/CodeGen/MachineOperand.h"
+#include "llvm/IR/CallingConv.h"
+#include "llvm/IR/DebugLoc.h"
+#include "llvm/Support/MathExtras.h"
#include "llvm/Support/raw_ostream.h"
+#include <algorithm>
+#include <cassert>
+#include <cstdint>
+#include <new>
+#include <set>
+#include <utility>
+#include <vector>
using namespace llvm;
@@ -43,13 +59,12 @@ struct CFStack {
std::vector<StackItem> BranchStack;
std::vector<StackItem> LoopStack;
unsigned MaxStackSize;
- unsigned CurrentEntries;
- unsigned CurrentSubEntries;
+ unsigned CurrentEntries = 0;
+ unsigned CurrentSubEntries = 0;
CFStack(const R600Subtarget *st, CallingConv::ID cc) : ST(st),
// We need to reserve a stack entry for CALL_FS in vertex shaders.
- MaxStackSize(cc == CallingConv::AMDGPU_VS ? 1 : 0),
- CurrentEntries(0), CurrentSubEntries(0) { }
+ MaxStackSize(cc == CallingConv::AMDGPU_VS ? 1 : 0) {}
unsigned getLoopDepth();
bool branchStackContains(CFStack::StackItem);
@@ -198,9 +213,8 @@ void CFStack::popLoop() {
}
class R600ControlFlowFinalizer : public MachineFunctionPass {
-
private:
- typedef std::pair<MachineInstr *, std::vector<MachineInstr *> > ClauseFile;
+ typedef std::pair<MachineInstr *, std::vector<MachineInstr *>> ClauseFile;
enum ControlFlowInstruction {
CF_TC,
@@ -217,10 +231,10 @@ private:
};
static char ID;
- const R600InstrInfo *TII;
- const R600RegisterInfo *TRI;
+ const R600InstrInfo *TII = nullptr;
+ const R600RegisterInfo *TRI = nullptr;
unsigned MaxFetchInst;
- const R600Subtarget *ST;
+ const R600Subtarget *ST = nullptr;
bool IsTrivialInst(MachineInstr &MI) const {
switch (MI.getOpcode()) {
@@ -355,7 +369,7 @@ private:
continue;
int64_t Imm = Src.second;
std::vector<MachineOperand *>::iterator It =
- find_if(Lits, [&](MachineOperand *val) {
+ llvm::find_if(Lits, [&](MachineOperand *val) {
return val->isImm() && (val->getImm() == Imm);
});
@@ -485,8 +499,7 @@ private:
}
public:
- R600ControlFlowFinalizer(TargetMachine &tm)
- : MachineFunctionPass(ID), TII(nullptr), TRI(nullptr), ST(nullptr) {}
+ R600ControlFlowFinalizer(TargetMachine &tm) : MachineFunctionPass(ID) {}
bool runOnMachineFunction(MachineFunction &MF) override {
ST = &MF.getSubtarget<R600Subtarget>();
@@ -501,7 +514,7 @@ public:
++MB) {
MachineBasicBlock &MBB = *MB;
unsigned CfCount = 0;
- std::vector<std::pair<unsigned, std::set<MachineInstr *> > > LoopStack;
+ std::vector<std::pair<unsigned, std::set<MachineInstr *>>> LoopStack;
std::vector<MachineInstr * > IfThenElseStack;
if (MF.getFunction()->getCallingConv() == CallingConv::AMDGPU_VS) {
BuildMI(MBB, MBB.begin(), MBB.findDebugLoc(MBB.begin()),
@@ -554,7 +567,7 @@ public:
MachineInstr *MIb = BuildMI(MBB, MI, MBB.findDebugLoc(MI),
getHWInstrDesc(CF_WHILE_LOOP))
.addImm(1);
- std::pair<unsigned, std::set<MachineInstr *> > Pair(CfCount,
+ std::pair<unsigned, std::set<MachineInstr *>> Pair(CfCount,
std::set<MachineInstr *>());
Pair.second.insert(MIb);
LoopStack.push_back(std::move(Pair));
@@ -564,7 +577,7 @@ public:
}
case AMDGPU::ENDLOOP: {
CFStack.popLoop();
- std::pair<unsigned, std::set<MachineInstr *> > Pair =
+ std::pair<unsigned, std::set<MachineInstr *>> Pair =
std::move(LoopStack.back());
LoopStack.pop_back();
CounterPropagateAddr(Pair.second, CfCount);
@@ -693,7 +706,6 @@ char R600ControlFlowFinalizer::ID = 0;
} // end anonymous namespace
-
-llvm::FunctionPass *llvm::createR600ControlFlowFinalizer(TargetMachine &TM) {
+FunctionPass *llvm::createR600ControlFlowFinalizer(TargetMachine &TM) {
return new R600ControlFlowFinalizer(TM);
}
diff --git a/lib/Target/AMDGPU/R600EmitClauseMarkers.cpp b/lib/Target/AMDGPU/R600EmitClauseMarkers.cpp
index 9a5db6ccc672..03fc1aff5ec1 100644
--- a/lib/Target/AMDGPU/R600EmitClauseMarkers.cpp
+++ b/lib/Target/AMDGPU/R600EmitClauseMarkers.cpp
@@ -17,26 +17,37 @@
#include "AMDGPU.h"
#include "R600Defines.h"
#include "R600InstrInfo.h"
-#include "R600MachineFunctionInfo.h"
#include "R600RegisterInfo.h"
#include "AMDGPUSubtarget.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/ADT/StringRef.h"
+#include "llvm/CodeGen/MachineBasicBlock.h"
+#include "llvm/CodeGen/MachineFunction.h"
#include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/CodeGen/MachineInstr.h"
#include "llvm/CodeGen/MachineInstrBuilder.h"
-#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/CodeGen/MachineOperand.h"
+#include "llvm/Pass.h"
+#include "llvm/Support/ErrorHandling.h"
+#include <cassert>
+#include <cstdint>
+#include <utility>
+#include <vector>
using namespace llvm;
namespace llvm {
+
void initializeR600EmitClauseMarkersPass(PassRegistry&);
-}
+
+} // end namespace llvm
namespace {
class R600EmitClauseMarkers : public MachineFunctionPass {
-
private:
- const R600InstrInfo *TII;
- int Address;
+ const R600InstrInfo *TII = nullptr;
+ int Address = 0;
unsigned OccupiedDwords(MachineInstr &MI) const {
switch (MI.getOpcode()) {
@@ -118,7 +129,7 @@ private:
SubstituteKCacheBank(MachineInstr &MI,
std::vector<std::pair<unsigned, unsigned>> &CachedConsts,
bool UpdateInstr = true) const {
- std::vector<std::pair<unsigned, unsigned> > UsedKCache;
+ std::vector<std::pair<unsigned, unsigned>> UsedKCache;
if (!TII->isALUInstr(MI.getOpcode()) && MI.getOpcode() != AMDGPU::DOT_4)
return true;
@@ -181,10 +192,11 @@ private:
bool canClauseLocalKillFitInClause(
unsigned AluInstCount,
- std::vector<std::pair<unsigned, unsigned> > KCacheBanks,
+ std::vector<std::pair<unsigned, unsigned>> KCacheBanks,
MachineBasicBlock::iterator Def,
MachineBasicBlock::iterator BBEnd) {
const R600RegisterInfo &TRI = TII->getRegisterInfo();
+    // TODO: Change this to defs?
for (MachineInstr::const_mop_iterator
MOI = Def->operands_begin(),
MOE = Def->operands_end(); MOI != MOE; ++MOI) {
@@ -207,15 +219,17 @@ private:
if (AluInstCount >= TII->getMaxAlusPerClause())
return false;
+ // TODO: Is this true? kill flag appears to work OK below
// Register kill flags have been cleared by the time we get to this
// pass, but it is safe to assume that all uses of this register
// occur in the same basic block as its definition, because
// it is illegal for the scheduler to schedule them in
// different blocks.
- if (UseI->findRegisterUseOperandIdx(MOI->getReg()))
+ if (UseI->readsRegister(MOI->getReg()))
LastUseCount = AluInstCount;
- if (UseI != Def && UseI->findRegisterDefOperandIdx(MOI->getReg()) != -1)
+ // Exit early if the current use kills the register
+ if (UseI != Def && UseI->killsRegister(MOI->getReg()))
break;
}
if (LastUseCount)
@@ -228,7 +242,7 @@ private:
MachineBasicBlock::iterator
MakeALUClause(MachineBasicBlock &MBB, MachineBasicBlock::iterator I) {
MachineBasicBlock::iterator ClauseHead = I;
- std::vector<std::pair<unsigned, unsigned> > KCacheBanks;
+ std::vector<std::pair<unsigned, unsigned>> KCacheBanks;
bool PushBeforeModifier = false;
unsigned AluInstCount = 0;
for (MachineBasicBlock::iterator E = MBB.end(); I != E; ++I) {
@@ -294,8 +308,8 @@ private:
public:
static char ID;
- R600EmitClauseMarkers() : MachineFunctionPass(ID), TII(nullptr), Address(0) {
+ R600EmitClauseMarkers() : MachineFunctionPass(ID) {
initializeR600EmitClauseMarkersPass(*PassRegistry::getPassRegistry());
}
@@ -310,9 +324,11 @@ public:
if (I != MBB.end() && I->getOpcode() == AMDGPU::CF_ALU)
continue; // BB was already parsed
for (MachineBasicBlock::iterator E = MBB.end(); I != E;) {
- if (isALU(*I))
- I = MakeALUClause(MBB, I);
- else
+ if (isALU(*I)) {
+ auto next = MakeALUClause(MBB, I);
+ assert(next != I);
+ I = next;
+ } else
++I;
}
}
@@ -333,7 +349,6 @@ INITIALIZE_PASS_BEGIN(R600EmitClauseMarkers, "emitclausemarkers",
INITIALIZE_PASS_END(R600EmitClauseMarkers, "emitclausemarkers",
"R600 Emit Clause Markters", false, false)
-llvm::FunctionPass *llvm::createR600EmitClauseMarkers() {
+FunctionPass *llvm::createR600EmitClauseMarkers() {
return new R600EmitClauseMarkers();
}
-
diff --git a/lib/Target/AMDGPU/R600FrameLowering.cpp b/lib/Target/AMDGPU/R600FrameLowering.cpp
index 5813786abe01..1f01ad732e00 100644
--- a/lib/Target/AMDGPU/R600FrameLowering.cpp
+++ b/lib/Target/AMDGPU/R600FrameLowering.cpp
@@ -8,7 +8,43 @@
//==-----------------------------------------------------------------------===//
#include "R600FrameLowering.h"
+#include "AMDGPUSubtarget.h"
+#include "R600RegisterInfo.h"
+#include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/CodeGen/MachineFrameInfo.h"
+#include "llvm/Support/MathExtras.h"
using namespace llvm;
R600FrameLowering::~R600FrameLowering() = default;
+
+/// \returns The register index at which frame object \p FI begins.
+int R600FrameLowering::getFrameIndexReference(const MachineFunction &MF,
+ int FI,
+ unsigned &FrameReg) const {
+ const MachineFrameInfo &MFI = MF.getFrameInfo();
+ const R600RegisterInfo *RI
+ = MF.getSubtarget<R600Subtarget>().getRegisterInfo();
+
+ // Fill in FrameReg output argument.
+ FrameReg = RI->getFrameRegister(MF);
+
+  // Start the offset two stack slots in (2 * stack width * 4 bytes) so we
+  // don't overwrite work group information.
+ // FIXME: We should only do this when the shader actually uses this
+ // information.
+ unsigned OffsetBytes = 2 * (getStackWidth(MF) * 4);
+ int UpperBound = FI == -1 ? MFI.getNumObjects() : FI;
+
+ for (int i = MFI.getObjectIndexBegin(); i < UpperBound; ++i) {
+ OffsetBytes = alignTo(OffsetBytes, MFI.getObjectAlignment(i));
+ OffsetBytes += MFI.getObjectSize(i);
+ // Each register holds 4 bytes, so we must always align the offset to at
+ // least 4 bytes, so that 2 frame objects won't share the same register.
+ OffsetBytes = alignTo(OffsetBytes, 4);
+ }
+
+ if (FI != -1)
+ OffsetBytes = alignTo(OffsetBytes, MFI.getObjectAlignment(FI));
+
+ return OffsetBytes / (getStackWidth(MF) * 4);
+}
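As a concrete check of the arithmetic above, assume a stack width of 1 (one register row is 4 bytes): the offset starts at 8 bytes to skip the work-group information, a single 8-byte object then occupies register indices 2 and 3, and a query for the next frame index yields index 4. A self-contained sketch reproducing the computation under those assumptions; the 4-byte object alignment is a hypothetical input, the real pass reads sizes and alignments from MachineFrameInfo:

#include <cassert>
#include <cstdint>

// Mirrors the loop in getFrameIndexReference for StackWidth == 1 and
// 4-byte-aligned frame objects.
static uint64_t frameIndexToRegIndex(const uint64_t *Sizes, int FI) {
  const uint64_t StackWidthBytes = 1 * 4;
  uint64_t OffsetBytes = 2 * StackWidthBytes;         // skip work-group info
  for (int I = 0; I < FI; ++I) {
    OffsetBytes += Sizes[I];
    OffsetBytes = (OffsetBytes + 3) & ~uint64_t(3);   // alignTo(x, 4)
  }
  return OffsetBytes / StackWidthBytes;
}

int main() {
  const uint64_t Sizes[] = {8};
  assert(frameIndexToRegIndex(Sizes, 1) == 4);  // object 0 sits in regs 2-3
  return 0;
}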
diff --git a/lib/Target/AMDGPU/R600FrameLowering.h b/lib/Target/AMDGPU/R600FrameLowering.h
index 874435f35ce4..142f70967eda 100644
--- a/lib/Target/AMDGPU/R600FrameLowering.h
+++ b/lib/Target/AMDGPU/R600FrameLowering.h
@@ -25,6 +25,8 @@ public:
MachineBasicBlock &MBB) const override {}
void emitEpilogue(MachineFunction &MF,
MachineBasicBlock &MBB) const override {}
+ int getFrameIndexReference(const MachineFunction &MF, int FI,
+ unsigned &FrameReg) const override;
};
} // end namespace llvm
diff --git a/lib/Target/AMDGPU/R600ISelLowering.cpp b/lib/Target/AMDGPU/R600ISelLowering.cpp
index 77fee4356b65..3590a9b05e1d 100644
--- a/lib/Target/AMDGPU/R600ISelLowering.cpp
+++ b/lib/Target/AMDGPU/R600ISelLowering.cpp
@@ -221,6 +221,15 @@ R600TargetLowering::R600TargetLowering(const TargetMachine &TM,
setOperationAction(ISD::SUBE, VT, Expand);
}
+ // LLVM will expand these to atomic_cmp_swap(0)
+ // and atomic_swap, respectively.
+ setOperationAction(ISD::ATOMIC_LOAD, MVT::i32, Expand);
+ setOperationAction(ISD::ATOMIC_STORE, MVT::i32, Expand);
+
+ // We need to custom lower some of the intrinsics
+ setOperationAction(ISD::INTRINSIC_VOID, MVT::Other, Custom);
+ setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::Other, Custom);
+
setSchedulingPreference(Sched::Source);
setTargetDAGCombine(ISD::FP_ROUND);
@@ -266,7 +275,7 @@ R600TargetLowering::EmitInstrWithCustomInserter(MachineInstr &MI,
NewMI = BuildMI(*BB, I, BB->findDebugLoc(I),
TII->get(AMDGPU::getLDSNoRetOp(MI.getOpcode())));
for (unsigned i = 1, e = MI.getNumOperands(); i < e; ++i) {
- NewMI.addOperand(MI.getOperand(i));
+ NewMI.add(MI.getOperand(i));
}
} else {
return AMDGPUTargetLowering::EmitInstrWithCustomInserter(MI, BB);
@@ -339,34 +348,34 @@ R600TargetLowering::EmitInstrWithCustomInserter(MachineInstr &MI,
case AMDGPU::RAT_WRITE_CACHELESS_64_eg:
case AMDGPU::RAT_WRITE_CACHELESS_128_eg:
BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(MI.getOpcode()))
- .addOperand(MI.getOperand(0))
- .addOperand(MI.getOperand(1))
+ .add(MI.getOperand(0))
+ .add(MI.getOperand(1))
.addImm(isEOP(I)); // Set End of program bit
break;
case AMDGPU::RAT_STORE_TYPED_eg:
BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(MI.getOpcode()))
- .addOperand(MI.getOperand(0))
- .addOperand(MI.getOperand(1))
- .addOperand(MI.getOperand(2))
+ .add(MI.getOperand(0))
+ .add(MI.getOperand(1))
+ .add(MI.getOperand(2))
.addImm(isEOP(I)); // Set End of program bit
break;
case AMDGPU::BRANCH:
BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(AMDGPU::JUMP))
- .addOperand(MI.getOperand(0));
+ .add(MI.getOperand(0));
break;
case AMDGPU::BRANCH_COND_f32: {
MachineInstr *NewMI =
BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(AMDGPU::PRED_X),
AMDGPU::PREDICATE_BIT)
- .addOperand(MI.getOperand(1))
+ .add(MI.getOperand(1))
.addImm(AMDGPU::PRED_SETNE)
.addImm(0); // Flags
TII->addFlag(*NewMI, 0, MO_FLAG_PUSH);
BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(AMDGPU::JUMP_COND))
- .addOperand(MI.getOperand(0))
+ .add(MI.getOperand(0))
.addReg(AMDGPU::PREDICATE_BIT, RegState::Kill);
break;
}
@@ -375,12 +384,12 @@ R600TargetLowering::EmitInstrWithCustomInserter(MachineInstr &MI,
MachineInstr *NewMI =
BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(AMDGPU::PRED_X),
AMDGPU::PREDICATE_BIT)
- .addOperand(MI.getOperand(1))
+ .add(MI.getOperand(1))
.addImm(AMDGPU::PRED_SETNE_INT)
.addImm(0); // Flags
TII->addFlag(*NewMI, 0, MO_FLAG_PUSH);
BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(AMDGPU::JUMP_COND))
- .addOperand(MI.getOperand(0))
+ .add(MI.getOperand(0))
.addReg(AMDGPU::PREDICATE_BIT, RegState::Kill);
break;
}
@@ -408,13 +417,13 @@ R600TargetLowering::EmitInstrWithCustomInserter(MachineInstr &MI,
return BB;
unsigned CfInst = (MI.getOpcode() == AMDGPU::EG_ExportSwz) ? 84 : 40;
BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(MI.getOpcode()))
- .addOperand(MI.getOperand(0))
- .addOperand(MI.getOperand(1))
- .addOperand(MI.getOperand(2))
- .addOperand(MI.getOperand(3))
- .addOperand(MI.getOperand(4))
- .addOperand(MI.getOperand(5))
- .addOperand(MI.getOperand(6))
+ .add(MI.getOperand(0))
+ .add(MI.getOperand(1))
+ .add(MI.getOperand(2))
+ .add(MI.getOperand(3))
+ .add(MI.getOperand(4))
+ .add(MI.getOperand(5))
+ .add(MI.getOperand(6))
.addImm(CfInst)
.addImm(EOP);
break;
@@ -490,8 +499,7 @@ SDValue R600TargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const
cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue();
EVT VT = Op.getValueType();
SDLoc DL(Op);
- switch(IntrinsicID) {
- default: return AMDGPUTargetLowering::LowerOperation(Op, DAG);
+ switch (IntrinsicID) {
case AMDGPUIntrinsic::r600_tex:
case AMDGPUIntrinsic::r600_texc: {
unsigned TextureOp;
@@ -552,7 +560,7 @@ SDValue R600TargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const
}
case Intrinsic::r600_implicitarg_ptr: {
- MVT PtrVT = getPointerTy(DAG.getDataLayout(), AMDGPUAS::PARAM_I_ADDRESS);
+ MVT PtrVT = getPointerTy(DAG.getDataLayout(), AMDGPUASI.PARAM_I_ADDRESS);
uint32_t ByteOffset = getImplicitParameterOffset(MFI, FIRST_IMPLICIT);
return DAG.getConstant(ByteOffset, DL, PtrVT);
}
@@ -599,6 +607,8 @@ SDValue R600TargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const
case Intrinsic::r600_recipsqrt_clamped:
return DAG.getNode(AMDGPUISD::RSQ_CLAMP, DL, VT, Op.getOperand(1));
+ default:
+ return Op;
}
// break out of case ISD::INTRINSIC_WO_CHAIN in switch(Op.getOpcode())
@@ -702,12 +712,12 @@ SDValue R600TargetLowering::LowerGlobalAddress(AMDGPUMachineFunction *MFI,
SDValue Op,
SelectionDAG &DAG) const {
GlobalAddressSDNode *GSD = cast<GlobalAddressSDNode>(Op);
- if (GSD->getAddressSpace() != AMDGPUAS::CONSTANT_ADDRESS)
+ if (GSD->getAddressSpace() != AMDGPUASI.CONSTANT_ADDRESS)
return AMDGPUTargetLowering::LowerGlobalAddress(MFI, Op, DAG);
const DataLayout &DL = DAG.getDataLayout();
const GlobalValue *GV = GSD->getGlobal();
- MVT ConstPtrVT = getPointerTy(DL, AMDGPUAS::CONSTANT_ADDRESS);
+ MVT ConstPtrVT = getPointerTy(DL, AMDGPUASI.CONSTANT_ADDRESS);
SDValue GA = DAG.getTargetGlobalAddress(GV, SDLoc(GSD), ConstPtrVT);
return DAG.getNode(AMDGPUISD::CONST_DATA_PTR, SDLoc(GSD), ConstPtrVT, GA);
@@ -864,7 +874,7 @@ SDValue R600TargetLowering::LowerImplicitParameter(SelectionDAG &DAG, EVT VT,
unsigned DwordOffset) const {
unsigned ByteOffset = DwordOffset * 4;
PointerType * PtrType = PointerType::get(VT.getTypeForEVT(*DAG.getContext()),
- AMDGPUAS::CONSTANT_BUFFER_0);
+ AMDGPUASI.CONSTANT_BUFFER_0);
// We shouldn't be using an offset wider than 16-bits for implicit parameters.
assert(isInt<16>(ByteOffset));
@@ -911,7 +921,7 @@ SDValue R600TargetLowering::LowerSELECT_CC(SDValue Op, SelectionDAG &DAG) const
if (VT == MVT::f32) {
DAGCombinerInfo DCI(DAG, AfterLegalizeVectorOps, true, nullptr);
- SDValue MinMax = CombineFMinMaxLegacy(DL, VT, LHS, RHS, True, False, CC, DCI);
+ SDValue MinMax = combineFMinMaxLegacy(DL, VT, LHS, RHS, True, False, CC, DCI);
if (MinMax)
return MinMax;
}
@@ -1102,7 +1112,7 @@ SDValue R600TargetLowering::lowerPrivateTruncStore(StoreSDNode *Store,
//TODO: Who creates the i8 stores?
assert(Store->isTruncatingStore()
|| Store->getValue().getValueType() == MVT::i8);
- assert(Store->getAddressSpace() == AMDGPUAS::PRIVATE_ADDRESS);
+ assert(Store->getAddressSpace() == AMDGPUASI.PRIVATE_ADDRESS);
SDValue Mask;
if (Store->getMemoryVT() == MVT::i8) {
@@ -1200,9 +1210,10 @@ SDValue R600TargetLowering::LowerSTORE(SDValue Op, SelectionDAG &DAG) const {
SDLoc DL(Op);
// Neither LOCAL nor PRIVATE can do vectors at the moment
- if ((AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::PRIVATE_ADDRESS) &&
+ if ((AS == AMDGPUASI.LOCAL_ADDRESS || AS == AMDGPUASI.PRIVATE_ADDRESS) &&
VT.isVector()) {
- if ((AS == AMDGPUAS::PRIVATE_ADDRESS) && StoreNode->isTruncatingStore()) {
+ if ((AS == AMDGPUASI.PRIVATE_ADDRESS) &&
+ StoreNode->isTruncatingStore()) {
// Add an extra level of chain to isolate this vector
SDValue NewChain = DAG.getNode(AMDGPUISD::DUMMY_CHAIN, DL, MVT::Other, Chain);
// TODO: can the chain be replaced without creating a new store?
@@ -1225,7 +1236,7 @@ SDValue R600TargetLowering::LowerSTORE(SDValue Op, SelectionDAG &DAG) const {
SDValue DWordAddr = DAG.getNode(ISD::SRL, DL, PtrVT, Ptr,
DAG.getConstant(2, DL, PtrVT));
- if (AS == AMDGPUAS::GLOBAL_ADDRESS) {
+ if (AS == AMDGPUASI.GLOBAL_ADDRESS) {
// It is beneficial to create MSKOR here instead of combiner to avoid
// artificial dependencies introduced by RMW
if (StoreNode->isTruncatingStore()) {
@@ -1278,7 +1289,7 @@ SDValue R600TargetLowering::LowerSTORE(SDValue Op, SelectionDAG &DAG) const {
}
// GLOBAL_ADDRESS has been handled above, LOCAL_ADDRESS allows all sizes
- if (AS != AMDGPUAS::PRIVATE_ADDRESS)
+ if (AS != AMDGPUASI.PRIVATE_ADDRESS)
return SDValue();
if (MemVT.bitsLT(MVT::i32))
@@ -1297,39 +1308,39 @@ SDValue R600TargetLowering::LowerSTORE(SDValue Op, SelectionDAG &DAG) const {
// return (512 + (kc_bank << 12))
static int
-ConstantAddressBlock(unsigned AddressSpace) {
+ConstantAddressBlock(unsigned AddressSpace, AMDGPUAS AMDGPUASI) {
switch (AddressSpace) {
- case AMDGPUAS::CONSTANT_BUFFER_0:
+ case AMDGPUASI.CONSTANT_BUFFER_0:
return 512;
- case AMDGPUAS::CONSTANT_BUFFER_1:
+ case AMDGPUASI.CONSTANT_BUFFER_1:
return 512 + 4096;
- case AMDGPUAS::CONSTANT_BUFFER_2:
+ case AMDGPUASI.CONSTANT_BUFFER_2:
return 512 + 4096 * 2;
- case AMDGPUAS::CONSTANT_BUFFER_3:
+ case AMDGPUASI.CONSTANT_BUFFER_3:
return 512 + 4096 * 3;
- case AMDGPUAS::CONSTANT_BUFFER_4:
+ case AMDGPUASI.CONSTANT_BUFFER_4:
return 512 + 4096 * 4;
- case AMDGPUAS::CONSTANT_BUFFER_5:
+ case AMDGPUASI.CONSTANT_BUFFER_5:
return 512 + 4096 * 5;
- case AMDGPUAS::CONSTANT_BUFFER_6:
+ case AMDGPUASI.CONSTANT_BUFFER_6:
return 512 + 4096 * 6;
- case AMDGPUAS::CONSTANT_BUFFER_7:
+ case AMDGPUASI.CONSTANT_BUFFER_7:
return 512 + 4096 * 7;
- case AMDGPUAS::CONSTANT_BUFFER_8:
+ case AMDGPUASI.CONSTANT_BUFFER_8:
return 512 + 4096 * 8;
- case AMDGPUAS::CONSTANT_BUFFER_9:
+ case AMDGPUASI.CONSTANT_BUFFER_9:
return 512 + 4096 * 9;
- case AMDGPUAS::CONSTANT_BUFFER_10:
+ case AMDGPUASI.CONSTANT_BUFFER_10:
return 512 + 4096 * 10;
- case AMDGPUAS::CONSTANT_BUFFER_11:
+ case AMDGPUASI.CONSTANT_BUFFER_11:
return 512 + 4096 * 11;
- case AMDGPUAS::CONSTANT_BUFFER_12:
+ case AMDGPUASI.CONSTANT_BUFFER_12:
return 512 + 4096 * 12;
- case AMDGPUAS::CONSTANT_BUFFER_13:
+ case AMDGPUASI.CONSTANT_BUFFER_13:
return 512 + 4096 * 13;
- case AMDGPUAS::CONSTANT_BUFFER_14:
+ case AMDGPUASI.CONSTANT_BUFFER_14:
return 512 + 4096 * 14;
- case AMDGPUAS::CONSTANT_BUFFER_15:
+ case AMDGPUASI.CONSTANT_BUFFER_15:
return 512 + 4096 * 15;
default:
return -1;
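The switch above encodes an affine map: kcache bank N begins at 512 + 4096*N, i.e. the (512 + (kc_bank << 12)) formula in the comment. A closed-form equivalent is only safe if the sixteen CONSTANT_BUFFER_* members of AMDGPUAS are numbered contiguously, which the struct does not promise, hence the explicit switch in the real code; the sketch below makes that assumption loudly:

// Hypothetical closed form of ConstantAddressBlock(); valid only when
// CONSTANT_BUFFER_0..CONSTANT_BUFFER_15 are contiguous in AMDGPUASI.
static int constantAddressBlock(unsigned AS, unsigned ConstBuf0) {
  if (AS < ConstBuf0 || AS > ConstBuf0 + 15)
    return -1;                              // not a kcache constant buffer
  return 512 + ((AS - ConstBuf0) << 12);    // 4096 bytes per bank
}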
@@ -1397,7 +1408,7 @@ SDValue R600TargetLowering::LowerLOAD(SDValue Op, SelectionDAG &DAG) const {
EVT MemVT = LoadNode->getMemoryVT();
ISD::LoadExtType ExtType = LoadNode->getExtensionType();
- if (AS == AMDGPUAS::PRIVATE_ADDRESS &&
+ if (AS == AMDGPUASI.PRIVATE_ADDRESS &&
ExtType != ISD::NON_EXTLOAD && MemVT.bitsLT(MVT::i32)) {
return lowerPrivateExtLoad(Op, DAG);
}
@@ -1407,13 +1418,14 @@ SDValue R600TargetLowering::LowerLOAD(SDValue Op, SelectionDAG &DAG) const {
SDValue Chain = LoadNode->getChain();
SDValue Ptr = LoadNode->getBasePtr();
- if ((LoadNode->getAddressSpace() == AMDGPUAS::LOCAL_ADDRESS ||
- LoadNode->getAddressSpace() == AMDGPUAS::PRIVATE_ADDRESS) &&
+ if ((LoadNode->getAddressSpace() == AMDGPUASI.LOCAL_ADDRESS ||
+ LoadNode->getAddressSpace() == AMDGPUASI.PRIVATE_ADDRESS) &&
VT.isVector()) {
return scalarizeVectorLoad(LoadNode, DAG);
}
- int ConstantBlock = ConstantAddressBlock(LoadNode->getAddressSpace());
+ int ConstantBlock = ConstantAddressBlock(LoadNode->getAddressSpace(),
+ AMDGPUASI);
if (ConstantBlock > -1 &&
((LoadNode->getExtensionType() == ISD::NON_EXTLOAD) ||
(LoadNode->getExtensionType() == ISD::ZEXTLOAD))) {
@@ -1445,7 +1457,7 @@ SDValue R600TargetLowering::LowerLOAD(SDValue Op, SelectionDAG &DAG) const {
DAG.getNode(ISD::SRL, DL, MVT::i32, Ptr,
DAG.getConstant(4, DL, MVT::i32)),
DAG.getConstant(LoadNode->getAddressSpace() -
- AMDGPUAS::CONSTANT_BUFFER_0, DL, MVT::i32)
+ AMDGPUASI.CONSTANT_BUFFER_0, DL, MVT::i32)
);
}
@@ -1481,7 +1493,7 @@ SDValue R600TargetLowering::LowerLOAD(SDValue Op, SelectionDAG &DAG) const {
return DAG.getMergeValues(MergedValues, DL);
}
- if (LoadNode->getAddressSpace() != AMDGPUAS::PRIVATE_ADDRESS) {
+ if (LoadNode->getAddressSpace() != AMDGPUASI.PRIVATE_ADDRESS) {
return SDValue();
}
@@ -1535,7 +1547,7 @@ SDValue R600TargetLowering::LowerFormalArguments(
SmallVector<ISD::InputArg, 8> LocalIns;
if (AMDGPU::isShader(CallConv)) {
- AnalyzeFormalArguments(CCInfo, Ins);
+ CCInfo.AnalyzeFormalArguments(Ins, CCAssignFnForCall(CallConv, isVarArg));
} else {
analyzeFormalArgumentsCompute(CCInfo, Ins);
}
@@ -1558,7 +1570,7 @@ SDValue R600TargetLowering::LowerFormalArguments(
}
PointerType *PtrTy = PointerType::get(VT.getTypeForEVT(*DAG.getContext()),
- AMDGPUAS::CONSTANT_BUFFER_0);
+ AMDGPUASI.CONSTANT_BUFFER_0);
// i64 isn't a legal type, so the register type used ends up as i32, which
// isn't expected here. It attempts to create this sextload, but it ends up
diff --git a/lib/Target/AMDGPU/R600InstrInfo.cpp b/lib/Target/AMDGPU/R600InstrInfo.cpp
index e88bd076718e..2422d57269eb 100644
--- a/lib/Target/AMDGPU/R600InstrInfo.cpp
+++ b/lib/Target/AMDGPU/R600InstrInfo.cpp
@@ -12,16 +12,34 @@
//
//===----------------------------------------------------------------------===//
-#include "R600InstrInfo.h"
#include "AMDGPU.h"
+#include "AMDGPUInstrInfo.h"
#include "AMDGPUSubtarget.h"
-#include "AMDGPUTargetMachine.h"
#include "R600Defines.h"
-#include "R600MachineFunctionInfo.h"
+#include "R600FrameLowering.h"
+#include "R600InstrInfo.h"
#include "R600RegisterInfo.h"
+#include "Utils/AMDGPUBaseInfo.h"
+#include "llvm/ADT/BitVector.h"
+#include "llvm/ADT/SmallSet.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/CodeGen/MachineBasicBlock.h"
#include "llvm/CodeGen/MachineFrameInfo.h"
+#include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/CodeGen/MachineInstr.h"
#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/MachineOperand.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Target/TargetRegisterInfo.h"
+#include "llvm/Target/TargetSubtargetInfo.h"
+#include <algorithm>
+#include <cassert>
+#include <cstdint>
+#include <cstring>
+#include <iterator>
+#include <utility>
+#include <vector>
using namespace llvm;
@@ -191,7 +209,7 @@ bool R600InstrInfo::usesTextureCache(const MachineInstr &MI) const {
const MachineFunction *MF = MI.getParent()->getParent();
return (AMDGPU::isCompute(MF->getFunction()->getCallingConv()) &&
usesVertexCache(MI.getOpcode())) ||
- usesTextureCache(MI.getOpcode());
+ usesTextureCache(MI.getOpcode());
}
bool R600InstrInfo::mustBeLastInClause(unsigned Opcode) const {
@@ -321,7 +339,7 @@ R600InstrInfo::ExtractSrcs(MachineInstr &MI,
unsigned &ConstCount) const {
ConstCount = 0;
const std::pair<int, unsigned> DummyPair(-1, 0);
- std::vector<std::pair<int, unsigned> > Result;
+ std::vector<std::pair<int, unsigned>> Result;
unsigned i = 0;
for (const auto &Src : getSrcs(MI)) {
++i;
@@ -348,8 +366,8 @@ R600InstrInfo::ExtractSrcs(MachineInstr &MI,
return Result;
}
-static std::vector<std::pair<int, unsigned> >
-Swizzle(std::vector<std::pair<int, unsigned> > Src,
+static std::vector<std::pair<int, unsigned>>
+Swizzle(std::vector<std::pair<int, unsigned>> Src,
R600InstrInfo::BankSwizzle Swz) {
if (Src[0] == Src[1])
Src[1].first = -1;
@@ -404,14 +422,14 @@ static unsigned getTransSwizzle(R600InstrInfo::BankSwizzle Swz, unsigned Op) {
/// in the same Instruction Group while meeting read port limitations given a
/// Swz swizzle sequence.
unsigned R600InstrInfo::isLegalUpTo(
- const std::vector<std::vector<std::pair<int, unsigned> > > &IGSrcs,
+ const std::vector<std::vector<std::pair<int, unsigned>>> &IGSrcs,
const std::vector<R600InstrInfo::BankSwizzle> &Swz,
- const std::vector<std::pair<int, unsigned> > &TransSrcs,
+ const std::vector<std::pair<int, unsigned>> &TransSrcs,
R600InstrInfo::BankSwizzle TransSwz) const {
int Vector[4][3];
memset(Vector, -1, sizeof(Vector));
for (unsigned i = 0, e = IGSrcs.size(); i < e; i++) {
- const std::vector<std::pair<int, unsigned> > &Srcs =
+ const std::vector<std::pair<int, unsigned>> &Srcs =
Swizzle(IGSrcs[i], Swz[i]);
for (unsigned j = 0; j < 3; j++) {
const std::pair<int, unsigned> &Src = Srcs[j];
@@ -473,9 +491,9 @@ NextPossibleSolution(
/// Enumerate all possible Swizzle sequence to find one that can meet all
/// read port requirements.
bool R600InstrInfo::FindSwizzleForVectorSlot(
- const std::vector<std::vector<std::pair<int, unsigned> > > &IGSrcs,
+ const std::vector<std::vector<std::pair<int, unsigned>>> &IGSrcs,
std::vector<R600InstrInfo::BankSwizzle> &SwzCandidate,
- const std::vector<std::pair<int, unsigned> > &TransSrcs,
+ const std::vector<std::pair<int, unsigned>> &TransSrcs,
R600InstrInfo::BankSwizzle TransSwz) const {
unsigned ValidUpTo = 0;
do {
@@ -490,7 +508,7 @@ bool R600InstrInfo::FindSwizzleForVectorSlot(
/// a const, and can't read a gpr at cycle 1 if they read 2 const.
static bool
isConstCompatible(R600InstrInfo::BankSwizzle TransSwz,
- const std::vector<std::pair<int, unsigned> > &TransOps,
+ const std::vector<std::pair<int, unsigned>> &TransOps,
unsigned ConstCount) {
// TransALU can't read 3 constants
if (ConstCount > 2)
@@ -516,7 +534,7 @@ R600InstrInfo::fitsReadPortLimitations(const std::vector<MachineInstr *> &IG,
const {
// TODO: Support shared src0 - src1 operand
- std::vector<std::vector<std::pair<int, unsigned> > > IGSrcs;
+ std::vector<std::vector<std::pair<int, unsigned>>> IGSrcs;
ValidSwizzle.clear();
unsigned ConstCount;
BankSwizzle TransBS = ALU_VEC_012_SCL_210;
@@ -527,7 +545,7 @@ R600InstrInfo::fitsReadPortLimitations(const std::vector<MachineInstr *> &IG,
ValidSwizzle.push_back( (R600InstrInfo::BankSwizzle)
IG[i]->getOperand(Op).getImm());
}
- std::vector<std::pair<int, unsigned> > TransOps;
+ std::vector<std::pair<int, unsigned>> TransOps;
if (!isLastAluTrans)
return FindSwizzleForVectorSlot(IGSrcs, ValidSwizzle, TransOps, TransBS);
@@ -556,7 +574,6 @@ R600InstrInfo::fitsReadPortLimitations(const std::vector<MachineInstr *> &IG,
return false;
}
-
bool
R600InstrInfo::fitsConstReadLimitations(const std::vector<unsigned> &Consts)
const {
@@ -780,7 +797,7 @@ unsigned R600InstrInfo::insertBranch(MachineBasicBlock &MBB,
unsigned R600InstrInfo::removeBranch(MachineBasicBlock &MBB,
int *BytesRemoved) const {
- assert(!BytesRemoved && "code size not handled");
+ assert(!BytesRemoved && "code size not handled");
// Note : we leave PRED* instructions there.
// They may be needed when predicating instructions.
@@ -852,7 +869,7 @@ bool R600InstrInfo::isPredicated(const MachineInstr &MI) const {
}
}
-bool R600InstrInfo::isPredicable(MachineInstr &MI) const {
+bool R600InstrInfo::isPredicable(const MachineInstr &MI) const {
// XXX: KILL* instructions can be predicated, but they must be the last
// instruction in a clause, so this means any instructions after them cannot
// be predicated. Until we have proper support for instruction clauses in the
@@ -863,7 +880,7 @@ bool R600InstrInfo::isPredicable(MachineInstr &MI) const {
} else if (MI.getOpcode() == AMDGPU::CF_ALU) {
// If the clause start in the middle of MBB then the MBB has more
// than a single clause, unable to predicate several clauses.
- if (MI.getParent()->begin() != MachineBasicBlock::iterator(MI))
+ if (MI.getParent()->begin() != MachineBasicBlock::const_iterator(MI))
return false;
// TODO: We don't support KC merging atm
return MI.getOperand(3).getImm() == 0 && MI.getOperand(4).getImm() == 0;
@@ -874,10 +891,9 @@ bool R600InstrInfo::isPredicable(MachineInstr &MI) const {
}
}
-
bool
R600InstrInfo::isProfitableToIfCvt(MachineBasicBlock &MBB,
- unsigned NumCyles,
+ unsigned NumCycles,
unsigned ExtraPredCycles,
BranchProbability Probability) const{
return true;
@@ -896,7 +912,7 @@ R600InstrInfo::isProfitableToIfCvt(MachineBasicBlock &TMBB,
bool
R600InstrInfo::isProfitableToDupForIfCvt(MachineBasicBlock &MBB,
- unsigned NumCyles,
+ unsigned NumCycles,
BranchProbability Probability)
const {
return true;
@@ -908,7 +924,6 @@ R600InstrInfo::isProfitableToUnpredicate(MachineBasicBlock &TMBB,
return false;
}
-
bool
R600InstrInfo::reverseBranchCondition(SmallVectorImpl<MachineOperand> &Cond) const {
MachineOperand &MO = Cond[1];
@@ -948,7 +963,6 @@ bool R600InstrInfo::DefinesPredicate(MachineInstr &MI,
return isPredicateSetter(MI.getOpcode());
}
-
bool R600InstrInfo::PredicateInstruction(MachineInstr &MI,
ArrayRef<MachineOperand> Pred) const {
int PIdx = MI.findFirstPredOperandIdx();
@@ -1067,7 +1081,7 @@ bool R600InstrInfo::expandPostRAPseudo(MachineInstr &MI) const {
return true;
}
-void R600InstrInfo::reserveIndirectRegisters(BitVector &Reserved,
+void R600InstrInfo::reserveIndirectRegisters(BitVector &Reserved,
const MachineFunction &MF) const {
const R600Subtarget &ST = MF.getSubtarget<R600Subtarget>();
const R600FrameLowering *TFL = ST.getFrameLowering();
diff --git a/lib/Target/AMDGPU/R600InstrInfo.h b/lib/Target/AMDGPU/R600InstrInfo.h
index a280052dbd4a..3b828006807e 100644
--- a/lib/Target/AMDGPU/R600InstrInfo.h
+++ b/lib/Target/AMDGPU/R600InstrInfo.h
@@ -177,12 +177,12 @@ public:
bool isPredicated(const MachineInstr &MI) const override;
- bool isPredicable(MachineInstr &MI) const override;
+ bool isPredicable(const MachineInstr &MI) const override;
- bool isProfitableToDupForIfCvt(MachineBasicBlock &MBB, unsigned NumCyles,
+ bool isProfitableToDupForIfCvt(MachineBasicBlock &MBB, unsigned NumCycles,
BranchProbability Probability) const override;
- bool isProfitableToIfCvt(MachineBasicBlock &MBB, unsigned NumCyles,
+ bool isProfitableToIfCvt(MachineBasicBlock &MBB, unsigned NumCycles,
unsigned ExtraPredCycles,
BranchProbability Probability) const override;
diff --git a/lib/Target/AMDGPU/R600Instructions.td b/lib/Target/AMDGPU/R600Instructions.td
index 9210e66b0fe7..bac557ba989e 100644
--- a/lib/Target/AMDGPU/R600Instructions.td
+++ b/lib/Target/AMDGPU/R600Instructions.td
@@ -316,7 +316,7 @@ class VTX_READ <string name, dag outs, list<dag> pattern>
class LoadParamFrag <PatFrag load_type> : PatFrag <
(ops node:$ptr), (load_type node:$ptr),
[{ return isConstantLoad(cast<LoadSDNode>(N), 0) ||
- (cast<LoadSDNode>(N)->getAddressSpace() == AMDGPUAS::PARAM_I_ADDRESS); }]
+ (cast<LoadSDNode>(N)->getAddressSpace() == AMDGPUASI.PARAM_I_ADDRESS); }]
>;
def vtx_id3_az_extloadi8 : LoadParamFrag<az_extloadi8>;
@@ -326,8 +326,8 @@ def vtx_id3_load : LoadParamFrag<load>;
class LoadVtxId1 <PatFrag load> : PatFrag <
(ops node:$ptr), (load node:$ptr), [{
const MemSDNode *LD = cast<MemSDNode>(N);
- return LD->getAddressSpace() == AMDGPUAS::GLOBAL_ADDRESS ||
- (LD->getAddressSpace() == AMDGPUAS::CONSTANT_ADDRESS &&
+ return LD->getAddressSpace() == AMDGPUASI.GLOBAL_ADDRESS ||
+ (LD->getAddressSpace() == AMDGPUASI.CONSTANT_ADDRESS &&
!isa<GlobalValue>(GetUnderlyingObject(
LD->getMemOperand()->getValue(), CurDAG->getDataLayout())));
}]>;
@@ -339,7 +339,7 @@ def vtx_id1_load : LoadVtxId1 <load>;
class LoadVtxId2 <PatFrag load> : PatFrag <
(ops node:$ptr), (load node:$ptr), [{
const MemSDNode *LD = cast<MemSDNode>(N);
- return LD->getAddressSpace() == AMDGPUAS::CONSTANT_ADDRESS &&
+ return LD->getAddressSpace() == AMDGPUASI.CONSTANT_ADDRESS &&
isa<GlobalValue>(GetUnderlyingObject(
LD->getMemOperand()->getValue(), CurDAG->getDataLayout()));
}]>;
@@ -1013,7 +1013,7 @@ multiclass CUBE_Common <bits<11> inst> {
(outs R600_Reg128:$dst),
(ins R600_Reg128:$src0),
"CUBE $dst $src0",
- [(set v4f32:$dst, (int_AMDGPU_cube v4f32:$src0))],
+ [(set v4f32:$dst, (int_r600_cube v4f32:$src0))],
VecALU
> {
let isPseudo = 1;
diff --git a/lib/Target/AMDGPU/SIAnnotateControlFlow.cpp b/lib/Target/AMDGPU/SIAnnotateControlFlow.cpp
index d70f52e0f295..b7e62075244b 100644
--- a/lib/Target/AMDGPU/SIAnnotateControlFlow.cpp
+++ b/lib/Target/AMDGPU/SIAnnotateControlFlow.cpp
@@ -22,6 +22,7 @@
#include "llvm/IR/Module.h"
#include "llvm/Pass.h"
#include "llvm/Transforms/Utils/BasicBlockUtils.h"
+#include "llvm/Transforms/Utils/Local.h"
#include "llvm/Transforms/Utils/SSAUpdater.h"
using namespace llvm;
@@ -34,15 +35,6 @@ namespace {
typedef std::pair<BasicBlock *, Value *> StackEntry;
typedef SmallVector<StackEntry, 16> StackVector;
-// Intrinsic names the control flow is annotated with
-static const char *const IfIntrinsic = "llvm.amdgcn.if";
-static const char *const ElseIntrinsic = "llvm.amdgcn.else";
-static const char *const BreakIntrinsic = "llvm.amdgcn.break";
-static const char *const IfBreakIntrinsic = "llvm.amdgcn.if.break";
-static const char *const ElseBreakIntrinsic = "llvm.amdgcn.else.break";
-static const char *const LoopIntrinsic = "llvm.amdgcn.loop";
-static const char *const EndCfIntrinsic = "llvm.amdgcn.end.cf";
-
class SIAnnotateControlFlow : public FunctionPass {
DivergenceAnalysis *DA;
@@ -56,13 +48,13 @@ class SIAnnotateControlFlow : public FunctionPass {
UndefValue *BoolUndef;
Constant *Int64Zero;
- Constant *If;
- Constant *Else;
- Constant *Break;
- Constant *IfBreak;
- Constant *ElseBreak;
- Constant *Loop;
- Constant *EndCf;
+ Function *If;
+ Function *Else;
+ Function *Break;
+ Function *IfBreak;
+ Function *ElseBreak;
+ Function *Loop;
+ Function *EndCf;
DominatorTree *DT;
StackVector Stack;
@@ -86,7 +78,8 @@ class SIAnnotateControlFlow : public FunctionPass {
void insertElse(BranchInst *Term);
Value *handleLoopCondition(Value *Cond, PHINode *Broken,
- llvm::Loop *L, BranchInst *Term);
+ llvm::Loop *L, BranchInst *Term,
+ SmallVectorImpl<WeakVH> &LoopPhiConditions);
void handleLoop(BranchInst *Term);
@@ -118,6 +111,7 @@ public:
INITIALIZE_PASS_BEGIN(SIAnnotateControlFlow, DEBUG_TYPE,
"Annotate SI Control Flow", false, false)
+INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass)
INITIALIZE_PASS_DEPENDENCY(DivergenceAnalysis)
INITIALIZE_PASS_END(SIAnnotateControlFlow, DEBUG_TYPE,
"Annotate SI Control Flow", false, false)
@@ -138,30 +132,13 @@ bool SIAnnotateControlFlow::doInitialization(Module &M) {
BoolUndef = UndefValue::get(Boolean);
Int64Zero = ConstantInt::get(Int64, 0);
- If = M.getOrInsertFunction(
- IfIntrinsic, ReturnStruct, Boolean, (Type *)nullptr);
-
- Else = M.getOrInsertFunction(
- ElseIntrinsic, ReturnStruct, Int64, (Type *)nullptr);
-
- Break = M.getOrInsertFunction(
- BreakIntrinsic, Int64, Int64, (Type *)nullptr);
- cast<Function>(Break)->setDoesNotAccessMemory();
-
- IfBreak = M.getOrInsertFunction(
- IfBreakIntrinsic, Int64, Boolean, Int64, (Type *)nullptr);
- cast<Function>(IfBreak)->setDoesNotAccessMemory();;
-
- ElseBreak = M.getOrInsertFunction(
- ElseBreakIntrinsic, Int64, Int64, Int64, (Type *)nullptr);
- cast<Function>(ElseBreak)->setDoesNotAccessMemory();
-
- Loop = M.getOrInsertFunction(
- LoopIntrinsic, Boolean, Int64, (Type *)nullptr);
-
- EndCf = M.getOrInsertFunction(
- EndCfIntrinsic, Void, Int64, (Type *)nullptr);
-
+ If = Intrinsic::getDeclaration(&M, Intrinsic::amdgcn_if);
+ Else = Intrinsic::getDeclaration(&M, Intrinsic::amdgcn_else);
+ Break = Intrinsic::getDeclaration(&M, Intrinsic::amdgcn_break);
+ IfBreak = Intrinsic::getDeclaration(&M, Intrinsic::amdgcn_if_break);
+ ElseBreak = Intrinsic::getDeclaration(&M, Intrinsic::amdgcn_else_break);
+ Loop = Intrinsic::getDeclaration(&M, Intrinsic::amdgcn_loop);
+ EndCf = Intrinsic::getDeclaration(&M, Intrinsic::amdgcn_end_cf);
return false;
}
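Replacing the name-based getOrInsertFunction calls with Intrinsic::getDeclaration means the prototypes and attributes now come from the intrinsic definitions themselves, which is why the hand-written setDoesNotAccessMemory calls disappear. A minimal sketch of the lookup pattern; the wrapper function is invented for illustration:

#include "llvm/IR/Intrinsics.h"
#include "llvm/IR/Module.h"
using namespace llvm;

// Fetch (or materialize) a declaration for a target intrinsic; the
// signature is taken from the intrinsic table, not spelled out by hand.
static Function *getAmdgcnIfDecl(Module &M) {
  return Intrinsic::getDeclaration(&M, Intrinsic::amdgcn_if);
}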
@@ -208,15 +185,16 @@ bool SIAnnotateControlFlow::isElse(PHINode *Phi) {
// \brief Erase "Phi" if it is not used any more
void SIAnnotateControlFlow::eraseIfUnused(PHINode *Phi) {
- if (!Phi->hasNUsesOrMore(1))
- Phi->eraseFromParent();
+ if (llvm::RecursivelyDeleteDeadPHINode(Phi)) {
+ DEBUG(dbgs() << "Erased unused condition phi\n");
+ }
}
/// \brief Open a new "If" block
void SIAnnotateControlFlow::openIf(BranchInst *Term) {
- if (isUniform(Term)) {
+ if (isUniform(Term))
return;
- }
+
Value *Ret = CallInst::Create(If, Term->getCondition(), "", Term);
Term->setCondition(ExtractValueInst::Create(Ret, 0, "", Term));
push(Term->getSuccessor(1), ExtractValueInst::Create(Ret, 1, "", Term));
@@ -233,8 +211,10 @@ void SIAnnotateControlFlow::insertElse(BranchInst *Term) {
}
/// \brief Recursively handle the condition leading to a loop
-Value *SIAnnotateControlFlow::handleLoopCondition(Value *Cond, PHINode *Broken,
- llvm::Loop *L, BranchInst *Term) {
+Value *SIAnnotateControlFlow::handleLoopCondition(
+ Value *Cond, PHINode *Broken,
+ llvm::Loop *L, BranchInst *Term,
+ SmallVectorImpl<WeakVH> &LoopPhiConditions) {
// Only search through PHI nodes which are inside the loop. If we try this
// with PHI nodes that are outside of the loop, we end up inserting new PHI
@@ -245,7 +225,7 @@ Value *SIAnnotateControlFlow::handleLoopCondition(Value *Cond, PHINode *Broken,
if ((Phi = dyn_cast<PHINode>(Cond)) && L->contains(Phi)) {
BasicBlock *Parent = Phi->getParent();
- PHINode *NewPhi = PHINode::Create(Int64, 0, "", &Parent->front());
+ PHINode *NewPhi = PHINode::Create(Int64, 0, "loop.phi", &Parent->front());
Value *Ret = NewPhi;
// Handle all non-constant incoming values first
@@ -258,14 +238,14 @@ Value *SIAnnotateControlFlow::handleLoopCondition(Value *Cond, PHINode *Broken,
}
Phi->setIncomingValue(i, BoolFalse);
- Value *PhiArg = handleLoopCondition(Incoming, Broken, L, Term);
+ Value *PhiArg = handleLoopCondition(Incoming, Broken, L,
+ Term, LoopPhiConditions);
NewPhi->addIncoming(PhiArg, From);
}
BasicBlock *IDom = DT->getNode(Parent)->getIDom()->getBlock();
for (unsigned i = 0, e = Phi->getNumIncomingValues(); i != e; ++i) {
-
Value *Incoming = Phi->getIncomingValue(i);
if (Incoming != BoolTrue)
continue;
@@ -295,14 +275,17 @@ Value *SIAnnotateControlFlow::handleLoopCondition(Value *Cond, PHINode *Broken,
continue;
}
}
+
TerminatorInst *Insert = From->getTerminator();
Value *PhiArg = CallInst::Create(Break, Broken, "", Insert);
NewPhi->setIncomingValue(i, PhiArg);
}
- eraseIfUnused(Phi);
+
+ LoopPhiConditions.push_back(WeakVH(Phi));
return Ret;
+ }
- } else if (Instruction *Inst = dyn_cast<Instruction>(Cond)) {
+ if (Instruction *Inst = dyn_cast<Instruction>(Cond)) {
BasicBlock *Parent = Inst->getParent();
Instruction *Insert;
if (L->contains(Inst)) {
@@ -310,46 +293,55 @@ Value *SIAnnotateControlFlow::handleLoopCondition(Value *Cond, PHINode *Broken,
} else {
Insert = L->getHeader()->getFirstNonPHIOrDbgOrLifetime();
}
+
Value *Args[] = { Cond, Broken };
return CallInst::Create(IfBreak, Args, "", Insert);
+ }
- // Insert IfBreak before TERM for constant COND.
- } else if (isa<ConstantInt>(Cond)) {
- Value *Args[] = { Cond, Broken };
- return CallInst::Create(IfBreak, Args, "", Term);
+  // For a constant COND other than true, insert IfBreak at the loop
+  // header's terminator; for true, insert it right before TERM.
+ if (isa<Constant>(Cond)) {
+ Instruction *Insert = Cond == BoolTrue ?
+ Term : L->getHeader()->getTerminator();
- } else {
- llvm_unreachable("Unhandled loop condition!");
+ Value *Args[] = { Cond, Broken };
+ return CallInst::Create(IfBreak, Args, "", Insert);
}
- return nullptr;
+
+ llvm_unreachable("Unhandled loop condition!");
}
/// \brief Handle a back edge (loop)
void SIAnnotateControlFlow::handleLoop(BranchInst *Term) {
- if (isUniform(Term)) {
+ if (isUniform(Term))
return;
- }
BasicBlock *BB = Term->getParent();
llvm::Loop *L = LI->getLoopFor(BB);
if (!L)
return;
+
BasicBlock *Target = Term->getSuccessor(1);
- PHINode *Broken = PHINode::Create(Int64, 0, "", &Target->front());
+ PHINode *Broken = PHINode::Create(Int64, 0, "phi.broken", &Target->front());
+ SmallVector<WeakVH, 8> LoopPhiConditions;
Value *Cond = Term->getCondition();
Term->setCondition(BoolTrue);
- Value *Arg = handleLoopCondition(Cond, Broken, L, Term);
+ Value *Arg = handleLoopCondition(Cond, Broken, L, Term, LoopPhiConditions);
- for (pred_iterator PI = pred_begin(Target), PE = pred_end(Target);
- PI != PE; ++PI) {
+ for (BasicBlock *Pred : predecessors(Target))
+ Broken->addIncoming(Pred == BB ? Arg : Int64Zero, Pred);
+
+ Term->setCondition(CallInst::Create(Loop, Arg, "", Term));
- Broken->addIncoming(*PI == BB ? Arg : Int64Zero, *PI);
+ for (WeakVH Val : reverse(LoopPhiConditions)) {
+ if (PHINode *Cond = cast_or_null<PHINode>(Val))
+ eraseIfUnused(Cond);
}
- Term->setCondition(CallInst::Create(Loop, Arg, "", Term));
push(Term->getSuccessor(0), Arg);
-}/// \brief Close the last opened control flow
+}
+
+/// \brief Close the last opened control flow
void SIAnnotateControlFlow::closeControlFlow(BasicBlock *BB) {
llvm::Loop *L = LI->getLoopFor(BB);
@@ -359,59 +351,62 @@ void SIAnnotateControlFlow::closeControlFlow(BasicBlock *BB) {
// We can't insert an EndCF call into a loop header, because it will
// get executed on every iteration of the loop, when it should be
// executed only once before the loop.
- SmallVector <BasicBlock*, 8> Latches;
+ SmallVector <BasicBlock *, 8> Latches;
L->getLoopLatches(Latches);
- std::vector<BasicBlock*> Preds;
- for (pred_iterator PI = pred_begin(BB), PE = pred_end(BB); PI != PE; ++PI) {
- if (!is_contained(Latches, *PI))
- Preds.push_back(*PI);
+ SmallVector<BasicBlock *, 2> Preds;
+ for (BasicBlock *Pred : predecessors(BB)) {
+ if (!is_contained(Latches, Pred))
+ Preds.push_back(Pred);
}
+
BB = llvm::SplitBlockPredecessors(BB, Preds, "endcf.split", DT, LI, false);
}
Value *Exec = popSaved();
- if (!isa<UndefValue>(Exec))
- CallInst::Create(EndCf, Exec, "", &*BB->getFirstInsertionPt());
+ Instruction *FirstInsertionPt = &*BB->getFirstInsertionPt();
+ if (!isa<UndefValue>(Exec) && !isa<UnreachableInst>(FirstInsertionPt))
+ CallInst::Create(EndCf, Exec, "", FirstInsertionPt);
}
/// \brief Annotate the control flow with intrinsics so the backend can
/// recognize if/then/else and loops.
bool SIAnnotateControlFlow::runOnFunction(Function &F) {
-
DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree();
LI = &getAnalysis<LoopInfoWrapperPass>().getLoopInfo();
DA = &getAnalysis<DivergenceAnalysis>();
for (df_iterator<BasicBlock *> I = df_begin(&F.getEntryBlock()),
E = df_end(&F.getEntryBlock()); I != E; ++I) {
-
- BranchInst *Term = dyn_cast<BranchInst>((*I)->getTerminator());
+ BasicBlock *BB = *I;
+ BranchInst *Term = dyn_cast<BranchInst>(BB->getTerminator());
if (!Term || Term->isUnconditional()) {
- if (isTopOfStack(*I))
- closeControlFlow(*I);
+ if (isTopOfStack(BB))
+ closeControlFlow(BB);
continue;
}
if (I.nodeVisited(Term->getSuccessor(1))) {
- if (isTopOfStack(*I))
- closeControlFlow(*I);
+ if (isTopOfStack(BB))
+ closeControlFlow(BB);
handleLoop(Term);
continue;
}
- if (isTopOfStack(*I)) {
+ if (isTopOfStack(BB)) {
PHINode *Phi = dyn_cast<PHINode>(Term->getCondition());
- if (Phi && Phi->getParent() == *I && isElse(Phi)) {
+ if (Phi && Phi->getParent() == BB && isElse(Phi)) {
insertElse(Term);
eraseIfUnused(Phi);
continue;
}
- closeControlFlow(*I);
+
+ closeControlFlow(BB);
}
+
openIf(Term);
}
diff --git a/lib/Target/AMDGPU/SIDefines.h b/lib/Target/AMDGPU/SIDefines.h
index ff4e32147184..3dd372b32866 100644
--- a/lib/Target/AMDGPU/SIDefines.h
+++ b/lib/Target/AMDGPU/SIDefines.h
@@ -36,6 +36,7 @@ enum : uint64_t {
// TODO: Should this be split into VOP3 a and b?
VOP3 = 1 << 10,
+ VOP3P = 1 << 12,
VINTRP = 1 << 13,
SDWA = 1 << 14,
@@ -65,8 +66,8 @@ enum : uint64_t {
SOPK_ZEXT = UINT64_C(1) << 38,
SCALAR_STORE = UINT64_C(1) << 39,
FIXED_SIZE = UINT64_C(1) << 40,
- VOPAsmPrefer32Bit = UINT64_C(1) << 41
-
+ VOPAsmPrefer32Bit = UINT64_C(1) << 41,
+ HasFPClamp = UINT64_C(1) << 42
};
// v_cmp_class_* etc. use a 10-bit mask for what operation is checked.
@@ -102,12 +103,14 @@ namespace AMDGPU {
OPERAND_REG_INLINE_C_FP16,
OPERAND_REG_INLINE_C_FP32,
OPERAND_REG_INLINE_C_FP64,
+ OPERAND_REG_INLINE_C_V2FP16,
+ OPERAND_REG_INLINE_C_V2INT16,
OPERAND_REG_IMM_FIRST = OPERAND_REG_IMM_INT32,
OPERAND_REG_IMM_LAST = OPERAND_REG_IMM_FP16,
OPERAND_REG_INLINE_C_FIRST = OPERAND_REG_INLINE_C_INT16,
- OPERAND_REG_INLINE_C_LAST = OPERAND_REG_INLINE_C_FP64,
+ OPERAND_REG_INLINE_C_LAST = OPERAND_REG_INLINE_C_V2INT16,
OPERAND_SRC_FIRST = OPERAND_REG_IMM_INT32,
OPERAND_SRC_LAST = OPERAND_REG_INLINE_C_LAST,
@@ -125,9 +128,12 @@ namespace AMDGPU {
// NEG and SEXT share same bit-mask because they can't be set simultaneously.
namespace SISrcMods {
enum {
- NEG = 1 << 0, // Floating-point negate modifier
- ABS = 1 << 1, // Floating-point absolute modifier
- SEXT = 1 << 0 // Integer sign-extend modifier
+ NEG = 1 << 0, // Floating-point negate modifier
+ ABS = 1 << 1, // Floating-point absolute modifier
+ SEXT = 1 << 0, // Integer sign-extend modifier
+ NEG_HI = ABS, // Floating-point negate high packed component modifier.
+ OP_SEL_0 = 1 << 2,
+ OP_SEL_1 = 1 << 3
};
}
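Since bit 0 doubles as NEG (floating point) and SEXT (integer), and NEG_HI reuses the ABS bit, a modifier value can only be decoded relative to the operand's type; the new OP_SEL bits extend the same field for packed operands. A small decoding sketch under those stated assumptions; the SrcKind classifier is invented for illustration:

#include <cstdio>

namespace SISrcMods {
enum {
  NEG = 1 << 0, ABS = 1 << 1, SEXT = 1 << 0,
  NEG_HI = ABS, OP_SEL_0 = 1 << 2, OP_SEL_1 = 1 << 3
};
} // namespace SISrcMods

enum class SrcKind { FP, Int, PackedFP };   // hypothetical classifier

static void describeMods(unsigned Mods, SrcKind Kind) {
  if (Kind == SrcKind::Int) {               // bit 0 means sign-extend
    std::printf("sext=%d\n", !!(Mods & SISrcMods::SEXT));
    return;
  }
  if (Kind == SrcKind::FP) {                // bit 0 negate, bit 1 abs
    std::printf("neg=%d abs=%d\n", !!(Mods & SISrcMods::NEG),
                !!(Mods & SISrcMods::ABS));
    return;
  }
  // Packed: bit 1 becomes the high-half negate, bits 2-3 select halves.
  std::printf("neg_lo=%d neg_hi=%d op_sel=[%d,%d]\n",
              !!(Mods & SISrcMods::NEG), !!(Mods & SISrcMods::NEG_HI),
              !!(Mods & SISrcMods::OP_SEL_0), !!(Mods & SISrcMods::OP_SEL_1));
}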
@@ -242,6 +248,7 @@ enum Id { // HwRegCode, (6) [5:0]
ID_LDS_ALLOC = 6,
ID_IB_STS = 7,
ID_SYMBOLIC_LAST_ = 8,
+ ID_MEM_BASES = 15,
ID_SHIFT_ = 0,
ID_WIDTH_ = 6,
ID_MASK_ = (((1 << ID_WIDTH_) - 1) << ID_SHIFT_)
@@ -251,14 +258,20 @@ enum Offset { // Offset, (5) [10:6]
OFFSET_DEFAULT_ = 0,
OFFSET_SHIFT_ = 6,
OFFSET_WIDTH_ = 5,
- OFFSET_MASK_ = (((1 << OFFSET_WIDTH_) - 1) << OFFSET_SHIFT_)
+ OFFSET_MASK_ = (((1 << OFFSET_WIDTH_) - 1) << OFFSET_SHIFT_),
+
+ OFFSET_SRC_SHARED_BASE = 16,
+ OFFSET_SRC_PRIVATE_BASE = 0
};
enum WidthMinusOne { // WidthMinusOne, (5) [15:11]
WIDTH_M1_DEFAULT_ = 31,
WIDTH_M1_SHIFT_ = 11,
WIDTH_M1_WIDTH_ = 5,
- WIDTH_M1_MASK_ = (((1 << WIDTH_M1_WIDTH_) - 1) << WIDTH_M1_SHIFT_)
+ WIDTH_M1_MASK_ = (((1 << WIDTH_M1_WIDTH_) - 1) << WIDTH_M1_SHIFT_),
+
+ WIDTH_M1_SRC_SHARED_BASE = 15,
+ WIDTH_M1_SRC_PRIVATE_BASE = 15
};
} // namespace Hwreg
@@ -300,6 +313,9 @@ enum DstUnused {
#define S_00B84C_USER_SGPR(x) (((x) & 0x1F) << 1)
#define G_00B84C_USER_SGPR(x) (((x) >> 1) & 0x1F)
#define C_00B84C_USER_SGPR 0xFFFFFFC1
+#define S_00B84C_TRAP_HANDLER(x) (((x) & 0x1) << 6)
+#define G_00B84C_TRAP_HANDLER(x) (((x) >> 6) & 0x1)
+#define C_00B84C_TRAP_HANDLER 0xFFFFFFBF
#define S_00B84C_TGID_X_EN(x) (((x) & 0x1) << 7)
#define G_00B84C_TGID_X_EN(x) (((x) >> 7) & 0x1)
#define C_00B84C_TGID_X_EN 0xFFFFFF7F
@@ -387,7 +403,6 @@ enum DstUnused {
#define R_SPILLED_SGPRS 0x4
#define R_SPILLED_VGPRS 0x8
-
} // End namespace llvm
#endif
diff --git a/lib/Target/AMDGPU/SIFixSGPRCopies.cpp b/lib/Target/AMDGPU/SIFixSGPRCopies.cpp
index 6a422e70fe1f..f9d258f44a62 100644
--- a/lib/Target/AMDGPU/SIFixSGPRCopies.cpp
+++ b/lib/Target/AMDGPU/SIFixSGPRCopies.cpp
@@ -65,6 +65,7 @@
/// ultimately led to the creation of an illegal COPY.
//===----------------------------------------------------------------------===//
+#include "llvm/ADT/DenseSet.h"
#include "AMDGPU.h"
#include "AMDGPUSubtarget.h"
#include "SIInstrInfo.h"
@@ -198,6 +199,10 @@ static bool foldVGPRCopyIntoRegSequence(MachineInstr &MI,
if (!CopyUse.isCopy())
return false;
+ // It is illegal to have vreg inputs to a physreg-defining reg_sequence.
+ if (TargetRegisterInfo::isPhysicalRegister(CopyUse.getOperand(0).getReg()))
+ return false;
+
const TargetRegisterClass *SrcRC, *DstRC;
std::tie(SrcRC, DstRC) = getCopyRegClasses(CopyUse, *TRI, MRI);
@@ -234,8 +239,9 @@ static bool foldVGPRCopyIntoRegSequence(MachineInstr &MI,
unsigned TmpReg = MRI.createVirtualRegister(NewSrcRC);
- BuildMI(*MI.getParent(), &MI, MI.getDebugLoc(), TII->get(AMDGPU::COPY), TmpReg)
- .addOperand(MI.getOperand(I));
+ BuildMI(*MI.getParent(), &MI, MI.getDebugLoc(), TII->get(AMDGPU::COPY),
+ TmpReg)
+ .add(MI.getOperand(I));
MI.getOperand(I).setReg(TmpReg);
}
@@ -326,6 +332,27 @@ static bool isSafeToFoldImmIntoCopy(const MachineInstr *Copy,
return true;
}
+static bool predsHasDivergentTerminator(MachineBasicBlock *MBB,
+ const TargetRegisterInfo *TRI) {
+ DenseSet<MachineBasicBlock*> Visited;
+ SmallVector<MachineBasicBlock*, 4> Worklist(MBB->pred_begin(),
+ MBB->pred_end());
+
+ while (!Worklist.empty()) {
+ MachineBasicBlock *mbb = Worklist.back();
+ Worklist.pop_back();
+
+ if (!Visited.insert(mbb).second)
+ continue;
+ if (hasTerminatorThatModifiesExec(*mbb, *TRI))
+ return true;
+
+ Worklist.insert(Worklist.end(), mbb->pred_begin(), mbb->pred_end());
+ }
+
+ return false;
+}
+
bool SIFixSGPRCopies::runOnMachineFunction(MachineFunction &MF) {
const SISubtarget &ST = MF.getSubtarget<SISubtarget>();
MachineRegisterInfo &MRI = MF.getRegInfo();
@@ -382,8 +409,8 @@ bool SIFixSGPRCopies::runOnMachineFunction(MachineFunction &MF) {
MachineBasicBlock *MBB0 = MI.getOperand(2).getMBB();
MachineBasicBlock *MBB1 = MI.getOperand(4).getMBB();
- MachineBasicBlock *NCD = MDT->findNearestCommonDominator(MBB0, MBB1);
- if (NCD && !hasTerminatorThatModifiesExec(*NCD, *TRI)) {
+ if (!predsHasDivergentTerminator(MBB0, TRI) &&
+ !predsHasDivergentTerminator(MBB1, TRI)) {
DEBUG(dbgs() << "Not fixing PHI for uniform branch: " << MI << '\n');
break;
}
diff --git a/lib/Target/AMDGPU/SIFixVGPRCopies.cpp b/lib/Target/AMDGPU/SIFixVGPRCopies.cpp
new file mode 100644
index 000000000000..3d3121788b5e
--- /dev/null
+++ b/lib/Target/AMDGPU/SIFixVGPRCopies.cpp
@@ -0,0 +1,72 @@
+//===-- SIFixVGPRCopies.cpp - Fix VGPR Copies after regalloc --------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+/// \file
+/// \brief Add implicit use of exec to vector register copies.
+///
+//===----------------------------------------------------------------------===//
+
+#include "AMDGPU.h"
+#include "AMDGPUSubtarget.h"
+#include "SIInstrInfo.h"
+#include "llvm/CodeGen/MachineFunctionPass.h"
+
+using namespace llvm;
+
+#define DEBUG_TYPE "si-fix-vgpr-copies"
+
+namespace {
+
+class SIFixVGPRCopies : public MachineFunctionPass {
+public:
+ static char ID;
+
+public:
+ SIFixVGPRCopies() : MachineFunctionPass(ID) {
+ initializeSIFixVGPRCopiesPass(*PassRegistry::getPassRegistry());
+ }
+
+ bool runOnMachineFunction(MachineFunction &MF) override;
+
+ StringRef getPassName() const override { return "SI Fix VGPR copies"; }
+};
+
+} // End anonymous namespace.
+
+INITIALIZE_PASS(SIFixVGPRCopies, DEBUG_TYPE, "SI Fix VGPR copies", false, false)
+
+char SIFixVGPRCopies::ID = 0;
+
+char &llvm::SIFixVGPRCopiesID = SIFixVGPRCopies::ID;
+
+bool SIFixVGPRCopies::runOnMachineFunction(MachineFunction &MF) {
+ const SISubtarget &ST = MF.getSubtarget<SISubtarget>();
+ const SIRegisterInfo *TRI = ST.getRegisterInfo();
+ const SIInstrInfo *TII = ST.getInstrInfo();
+ bool Changed = false;
+
+ for (MachineBasicBlock &MBB : MF) {
+ for (MachineInstr &MI : MBB) {
+ switch (MI.getOpcode()) {
+ case AMDGPU::COPY:
+ if (TII->isVGPRCopy(MI) && !MI.readsRegister(AMDGPU::EXEC, TRI)) {
+ MI.addOperand(MF,
+ MachineOperand::CreateReg(AMDGPU::EXEC, false, true));
+ DEBUG(dbgs() << "Add exec use to " << MI);
+ Changed = true;
+ }
+ break;
+ default:
+ break;
+ }
+ }
+ }
+
+ return Changed;
+}
diff --git a/lib/Target/AMDGPU/SIFoldOperands.cpp b/lib/Target/AMDGPU/SIFoldOperands.cpp
index a5c0d4923d6b..d63414735b95 100644
--- a/lib/Target/AMDGPU/SIFoldOperands.cpp
+++ b/lib/Target/AMDGPU/SIFoldOperands.cpp
@@ -12,6 +12,7 @@
#include "AMDGPU.h"
#include "AMDGPUSubtarget.h"
#include "SIInstrInfo.h"
+#include "SIMachineFunctionInfo.h"
#include "llvm/CodeGen/LiveIntervalAnalysis.h"
#include "llvm/CodeGen/MachineFunctionPass.h"
#include "llvm/CodeGen/MachineInstrBuilder.h"
@@ -66,6 +67,7 @@ public:
MachineRegisterInfo *MRI;
const SIInstrInfo *TII;
const SIRegisterInfo *TRI;
+ const SISubtarget *ST;
void foldOperand(MachineOperand &OpToFold,
MachineInstr *UseMI,
@@ -75,6 +77,12 @@ public:
void foldInstOperand(MachineInstr &MI, MachineOperand &OpToFold) const;
+ const MachineOperand *isClamp(const MachineInstr &MI) const;
+ bool tryFoldClamp(MachineInstr &MI);
+
+ std::pair<const MachineOperand *, int> isOMod(const MachineInstr &MI) const;
+ bool tryFoldOMod(MachineInstr &MI);
+
public:
SIFoldOperands() : MachineFunctionPass(ID) {
initializeSIFoldOperandsPass(*PassRegistry::getPassRegistry());
@@ -131,27 +139,6 @@ FunctionPass *llvm::createSIFoldOperandsPass() {
return new SIFoldOperands();
}
-static bool isSafeToFold(const MachineInstr &MI) {
- switch (MI.getOpcode()) {
- case AMDGPU::V_MOV_B32_e32:
- case AMDGPU::V_MOV_B32_e64:
- case AMDGPU::V_MOV_B64_PSEUDO: {
- // If there are additional implicit register operands, this may be used for
- // register indexing so the source register operand isn't simply copied.
- unsigned NumOps = MI.getDesc().getNumOperands() +
- MI.getDesc().getNumImplicitUses();
-
- return MI.getNumOperands() == NumOps;
- }
- case AMDGPU::S_MOV_B32:
- case AMDGPU::S_MOV_B64:
- case AMDGPU::COPY:
- return true;
- default:
- return false;
- }
-}
-
static bool updateOperand(FoldCandidate &Fold,
const TargetRegisterInfo &TRI) {
MachineInstr *MI = Fold.UseMI;
@@ -359,8 +346,6 @@ void SIFoldOperands::foldOperand(
const TargetRegisterClass *FoldRC =
TRI->getRegClass(FoldDesc.OpInfo[0].RegClass);
- APInt Imm(TII->operandBitWidth(FoldDesc.OpInfo[1].OperandType),
- OpToFold.getImm());
// Split 64-bit constants into 32 bits for folding.
if (UseOp.getSubReg() && AMDGPU::getRegBitWidth(FoldRC->getID()) == 64) {
@@ -370,21 +355,25 @@ void SIFoldOperands::foldOperand(
MRI->getRegClass(UseReg) :
TRI->getPhysRegClass(UseReg);
- assert(Imm.getBitWidth() == 64);
-
if (AMDGPU::getRegBitWidth(UseRC->getID()) != 64)
return;
+ APInt Imm(64, OpToFold.getImm());
if (UseOp.getSubReg() == AMDGPU::sub0) {
Imm = Imm.getLoBits(32);
} else {
assert(UseOp.getSubReg() == AMDGPU::sub1);
Imm = Imm.getHiBits(32);
}
+
+ MachineOperand ImmOp = MachineOperand::CreateImm(Imm.getSExtValue());
+ tryAddToFoldList(FoldList, UseMI, UseOpIdx, &ImmOp, TII);
+ return;
}
- MachineOperand ImmOp = MachineOperand::CreateImm(Imm.getSExtValue());
- tryAddToFoldList(FoldList, UseMI, UseOpIdx, &ImmOp, TII);
+
+ tryAddToFoldList(FoldList, UseMI, UseOpIdx, &OpToFold, TII);
}
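As a worked example of the sub0/sub1 split above, consider the following standalone sketch (not part of the patch, with a hypothetical immediate value): a 64-bit immediate folds into a 32-bit use through either half, matching what Imm.getLoBits(32) and Imm.getHiBits(32) extract.

#include <cstdint>
#include <cstdio>

int main() {
  // Hypothetical immediate; the real value comes from OpToFold.getImm().
  uint64_t Imm = 0x123456789ABCDEF0ULL;
  uint32_t Lo = static_cast<uint32_t>(Imm);       // sub0 use: getLoBits(32)
  uint32_t Hi = static_cast<uint32_t>(Imm >> 32); // sub1 use: getHiBits(32)
  std::printf("sub0 = 0x%08x, sub1 = 0x%08x\n", Lo, Hi);
  return 0;
}

This prints sub0 = 0x9abcdef0 and sub1 = 0x12345678, i.e. the two 32-bit halves that get offered to the fold list.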
static bool evalBinaryInstruction(unsigned Opcode, int32_t &Result,
@@ -581,6 +570,32 @@ static bool tryConstantFoldOp(MachineRegisterInfo &MRI,
return false;
}
+// Try to fold an instruction into a simpler one
+static bool tryFoldInst(const SIInstrInfo *TII,
+ MachineInstr *MI) {
+ unsigned Opc = MI->getOpcode();
+
+ if (Opc == AMDGPU::V_CNDMASK_B32_e32 ||
+ Opc == AMDGPU::V_CNDMASK_B32_e64 ||
+ Opc == AMDGPU::V_CNDMASK_B64_PSEUDO) {
+ const MachineOperand *Src0 = TII->getNamedOperand(*MI, AMDGPU::OpName::src0);
+ const MachineOperand *Src1 = TII->getNamedOperand(*MI, AMDGPU::OpName::src1);
+ if (Src1->isIdenticalTo(*Src0)) {
+ DEBUG(dbgs() << "Folded " << *MI << " into ");
+ int Src2Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src2);
+ if (Src2Idx != -1)
+ MI->RemoveOperand(Src2Idx);
+ MI->RemoveOperand(AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src1));
+ mutateCopyOp(*MI, TII->get(Src0->isReg() ? (unsigned)AMDGPU::COPY
+ : getMovOpc(false)));
+ DEBUG(dbgs() << *MI << '\n');
+ return true;
+ }
+ }
+
+ return false;
+}
+
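The V_CNDMASK fold above relies on the identity select(cond, x, x) == x for any condition value. A minimal scalar model of the per-lane select (a sketch for illustration only; the hardware instruction operates per lane under a mask):

#include <cassert>

// Scalar model of V_CNDMASK: the condition picks between src0 and src1.
static int cndmask(bool Cond, int Src0, int Src1) {
  return Cond ? Src1 : Src0;
}

int main() {
  int X = 42;
  // With identical sources the result is independent of the condition,
  // so the instruction can be rewritten as a plain copy/move of src0.
  assert(cndmask(false, X, X) == X && cndmask(true, X, X) == X);
  return 0;
}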
void SIFoldOperands::foldInstOperand(MachineInstr &MI,
MachineOperand &OpToFold) const {
// We need to mutate the operands of new mov instructions to add implicit
@@ -682,20 +697,213 @@ void SIFoldOperands::foldInstOperand(MachineInstr &MI,
}
DEBUG(dbgs() << "Folded source from " << MI << " into OpNo " <<
static_cast<int>(Fold.UseOpNo) << " of " << *Fold.UseMI << '\n');
+ tryFoldInst(TII, Fold.UseMI);
}
}
}
+const MachineOperand *SIFoldOperands::isClamp(const MachineInstr &MI) const {
+ unsigned Op = MI.getOpcode();
+ switch (Op) {
+ case AMDGPU::V_MAX_F32_e64:
+ case AMDGPU::V_MAX_F16_e64:
+ case AMDGPU::V_MAX_F64: {
+ if (!TII->getNamedOperand(MI, AMDGPU::OpName::clamp)->getImm())
+ return nullptr;
+
+ // Make sure sources are identical.
+ const MachineOperand *Src0 = TII->getNamedOperand(MI, AMDGPU::OpName::src0);
+ const MachineOperand *Src1 = TII->getNamedOperand(MI, AMDGPU::OpName::src1);
+ if (!Src0->isReg() || Src0->getSubReg() != Src1->getSubReg() ||
+ Src0->getSubReg() != AMDGPU::NoSubRegister)
+ return nullptr;
+
+ // Can't fold up if we have modifiers.
+ if (TII->hasModifiersSet(MI, AMDGPU::OpName::src0_modifiers) ||
+ TII->hasModifiersSet(MI, AMDGPU::OpName::src1_modifiers) ||
+ TII->hasModifiersSet(MI, AMDGPU::OpName::omod))
+ return nullptr;
+ return Src0;
+ }
+ default:
+ return nullptr;
+ }
+}
+
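A sketch of the semantics behind isClamp, assuming the usual AMDGPU behavior where the clamp modifier saturates a float result to [0.0, 1.0]: since max(x, x) == x, a v_max with identical sources and the clamp bit set computes clamp(x), so the clamp bit can migrate onto the defining instruction instead.

#include <algorithm>
#include <cstdio>

// Model of the clamp output modifier: saturate the result to [0.0, 1.0].
static float clampMod(float X) {
  return std::min(std::max(X, 0.0f), 1.0f);
}

int main() {
  for (float X : {-0.5f, 0.25f, 2.0f}) {
    // v_max_f32 x, x clamp  ==  clamp(max(x, x))  ==  clamp(x)
    std::printf("clamp(max(%g, %g)) = %g\n", X, X, clampMod(std::max(X, X)));
  }
  return 0;
}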
+// We obviously have multiple uses in a clamp since the register is used twice
+// in the same instruction.
+static bool hasOneNonDBGUseInst(const MachineRegisterInfo &MRI, unsigned Reg) {
+ int Count = 0;
+ for (auto I = MRI.use_instr_nodbg_begin(Reg), E = MRI.use_instr_nodbg_end();
+ I != E; ++I) {
+ if (++Count > 1)
+ return false;
+ }
+
+ return true;
+}
+
+bool SIFoldOperands::tryFoldClamp(MachineInstr &MI) {
+ const MachineOperand *ClampSrc = isClamp(MI);
+ if (!ClampSrc || !hasOneNonDBGUseInst(*MRI, ClampSrc->getReg()))
+ return false;
+
+ MachineInstr *Def = MRI->getVRegDef(ClampSrc->getReg());
+ if (!TII->hasFPClamp(*Def))
+ return false;
+ MachineOperand *DefClamp = TII->getNamedOperand(*Def, AMDGPU::OpName::clamp);
+ if (!DefClamp)
+ return false;
+
+ DEBUG(dbgs() << "Folding clamp " << *DefClamp << " into " << *Def << '\n');
+
+ // Clamp is applied after omod, so it is OK if omod is set.
+ DefClamp->setImm(1);
+ MRI->replaceRegWith(MI.getOperand(0).getReg(), Def->getOperand(0).getReg());
+ MI.eraseFromParent();
+ return true;
+}
+
+static int getOModValue(unsigned Opc, int64_t Val) {
+ switch (Opc) {
+ case AMDGPU::V_MUL_F32_e64: {
+ switch (static_cast<uint32_t>(Val)) {
+ case 0x3f000000: // 0.5
+ return SIOutMods::DIV2;
+ case 0x40000000: // 2.0
+ return SIOutMods::MUL2;
+ case 0x40800000: // 4.0
+ return SIOutMods::MUL4;
+ default:
+ return SIOutMods::NONE;
+ }
+ }
+ case AMDGPU::V_MUL_F16_e64: {
+ switch (static_cast<uint16_t>(Val)) {
+ case 0x3800: // 0.5
+ return SIOutMods::DIV2;
+ case 0x4000: // 2.0
+ return SIOutMods::MUL2;
+ case 0x4400: // 4.0
+ return SIOutMods::MUL4;
+ default:
+ return SIOutMods::NONE;
+ }
+ }
+ default:
+ llvm_unreachable("invalid mul opcode");
+ }
+}
+
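For reference, the magic numbers in getOModValue are just the IEEE-754 encodings of 0.5, 2.0, and 4.0, the three multipliers omod can express (the f16 cases 0x3800/0x4000/0x4400 are the half-precision equivalents). A standalone sketch, not part of the patch, that reproduces the f32 bit patterns:

#include <cstdint>
#include <cstdio>
#include <cstring>

// Bit-cast a float to its IEEE-754 representation (pre-C++20 idiom).
static uint32_t f32Bits(float F) {
  uint32_t Bits;
  std::memcpy(&Bits, &F, sizeof(Bits));
  return Bits;
}

int main() {
  std::printf("0.5 -> 0x%08x\n", f32Bits(0.5f)); // 0x3f000000 -> DIV2
  std::printf("2.0 -> 0x%08x\n", f32Bits(2.0f)); // 0x40000000 -> MUL2
  std::printf("4.0 -> 0x%08x\n", f32Bits(4.0f)); // 0x40800000 -> MUL4
  return 0;
}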
+// FIXME: Does this really not support denormals with f16?
+// FIXME: Does this need to check the IEEE mode bit? SNaNs are generally not
+// handled, so will anything other than that break?
+std::pair<const MachineOperand *, int>
+SIFoldOperands::isOMod(const MachineInstr &MI) const {
+ unsigned Op = MI.getOpcode();
+ switch (Op) {
+ case AMDGPU::V_MUL_F32_e64:
+ case AMDGPU::V_MUL_F16_e64: {
+ // If output denormals are enabled, omod is ignored.
+ if ((Op == AMDGPU::V_MUL_F32_e64 && ST->hasFP32Denormals()) ||
+ (Op == AMDGPU::V_MUL_F16_e64 && ST->hasFP16Denormals()))
+ return std::make_pair(nullptr, SIOutMods::NONE);
+
+ const MachineOperand *RegOp = nullptr;
+ const MachineOperand *ImmOp = nullptr;
+ const MachineOperand *Src0 = TII->getNamedOperand(MI, AMDGPU::OpName::src0);
+ const MachineOperand *Src1 = TII->getNamedOperand(MI, AMDGPU::OpName::src1);
+ if (Src0->isImm()) {
+ ImmOp = Src0;
+ RegOp = Src1;
+ } else if (Src1->isImm()) {
+ ImmOp = Src1;
+ RegOp = Src0;
+ } else
+ return std::make_pair(nullptr, SIOutMods::NONE);
+
+ int OMod = getOModValue(Op, ImmOp->getImm());
+ if (OMod == SIOutMods::NONE ||
+ TII->hasModifiersSet(MI, AMDGPU::OpName::src0_modifiers) ||
+ TII->hasModifiersSet(MI, AMDGPU::OpName::src1_modifiers) ||
+ TII->hasModifiersSet(MI, AMDGPU::OpName::omod) ||
+ TII->hasModifiersSet(MI, AMDGPU::OpName::clamp))
+ return std::make_pair(nullptr, SIOutMods::NONE);
+
+ return std::make_pair(RegOp, OMod);
+ }
+ case AMDGPU::V_ADD_F32_e64:
+ case AMDGPU::V_ADD_F16_e64: {
+ // If output denormals are enabled, omod is ignored.
+ if ((Op == AMDGPU::V_ADD_F32_e64 && ST->hasFP32Denormals()) ||
+ (Op == AMDGPU::V_ADD_F16_e64 && ST->hasFP16Denormals()))
+ return std::make_pair(nullptr, SIOutMods::NONE);
+
+ // Look through the DAGCombiner canonicalization fmul x, 2 -> fadd x, x
+ const MachineOperand *Src0 = TII->getNamedOperand(MI, AMDGPU::OpName::src0);
+ const MachineOperand *Src1 = TII->getNamedOperand(MI, AMDGPU::OpName::src1);
+
+ if (Src0->isReg() && Src1->isReg() && Src0->getReg() == Src1->getReg() &&
+ Src0->getSubReg() == Src1->getSubReg() &&
+ !TII->hasModifiersSet(MI, AMDGPU::OpName::src0_modifiers) &&
+ !TII->hasModifiersSet(MI, AMDGPU::OpName::src1_modifiers) &&
+ !TII->hasModifiersSet(MI, AMDGPU::OpName::clamp) &&
+ !TII->hasModifiersSet(MI, AMDGPU::OpName::omod))
+ return std::make_pair(Src0, SIOutMods::MUL2);
+
+ return std::make_pair(nullptr, SIOutMods::NONE);
+ }
+ default:
+ return std::make_pair(nullptr, SIOutMods::NONE);
+ }
+}
+
+// FIXME: Does this need to check the IEEE bit on the function?
+bool SIFoldOperands::tryFoldOMod(MachineInstr &MI) {
+ const MachineOperand *RegOp;
+ int OMod;
+ std::tie(RegOp, OMod) = isOMod(MI);
+ if (OMod == SIOutMods::NONE || !RegOp->isReg() ||
+ RegOp->getSubReg() != AMDGPU::NoSubRegister ||
+ !hasOneNonDBGUseInst(*MRI, RegOp->getReg()))
+ return false;
+
+ MachineInstr *Def = MRI->getVRegDef(RegOp->getReg());
+ MachineOperand *DefOMod = TII->getNamedOperand(*Def, AMDGPU::OpName::omod);
+ if (!DefOMod || DefOMod->getImm() != SIOutMods::NONE)
+ return false;
+
+ // Clamp is applied after omod. If the source already has clamp set, don't
+ // fold it.
+ if (TII->hasModifiersSet(*Def, AMDGPU::OpName::clamp))
+ return false;
+
+ DEBUG(dbgs() << "Folding omod " << MI << " into " << *Def << '\n');
+
+ DefOMod->setImm(OMod);
+ MRI->replaceRegWith(MI.getOperand(0).getReg(), Def->getOperand(0).getReg());
+ MI.eraseFromParent();
+ return true;
+}
+
bool SIFoldOperands::runOnMachineFunction(MachineFunction &MF) {
if (skipFunction(*MF.getFunction()))
return false;
- const SISubtarget &ST = MF.getSubtarget<SISubtarget>();
-
MRI = &MF.getRegInfo();
- TII = ST.getInstrInfo();
+ ST = &MF.getSubtarget<SISubtarget>();
+ TII = ST->getInstrInfo();
TRI = &TII->getRegisterInfo();
+ const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
+
+ // omod is ignored by the hardware if the IEEE bit is enabled. omod also
+ // does not correctly handle signed zeros.
+ //
+ // TODO: Check nsz on instructions when fast math flags are preserved to MI
+ // level.
+ bool IsIEEEMode = ST->enableIEEEBit(MF) || !MFI->hasNoSignedZerosFPMath();
+
for (MachineFunction::iterator BI = MF.begin(), BE = MF.end();
BI != BE; ++BI) {
@@ -705,8 +913,13 @@ bool SIFoldOperands::runOnMachineFunction(MachineFunction &MF) {
Next = std::next(I);
MachineInstr &MI = *I;
- if (!isSafeToFold(MI))
+ tryFoldInst(TII, &MI);
+
+ if (!TII->isFoldableCopy(MI)) {
+ if (IsIEEEMode || !tryFoldOMod(MI))
+ tryFoldClamp(MI);
continue;
+ }
MachineOperand &OpToFold = MI.getOperand(1);
bool FoldingImm = OpToFold.isImm() || OpToFold.isFI();
diff --git a/lib/Target/AMDGPU/SIFrameLowering.cpp b/lib/Target/AMDGPU/SIFrameLowering.cpp
index 0b5715515880..abe6af9a6d3f 100644
--- a/lib/Target/AMDGPU/SIFrameLowering.cpp
+++ b/lib/Target/AMDGPU/SIFrameLowering.cpp
@@ -21,22 +21,24 @@
using namespace llvm;
-static ArrayRef<MCPhysReg> getAllSGPR128(const MachineFunction &MF,
- const SIRegisterInfo *TRI) {
+static ArrayRef<MCPhysReg> getAllSGPR128(const SISubtarget &ST,
+ const MachineFunction &MF) {
return makeArrayRef(AMDGPU::SGPR_128RegClass.begin(),
- TRI->getMaxNumSGPRs(MF) / 4);
+ ST.getMaxNumSGPRs(MF) / 4);
}
-static ArrayRef<MCPhysReg> getAllSGPRs(const MachineFunction &MF,
- const SIRegisterInfo *TRI) {
+static ArrayRef<MCPhysReg> getAllSGPRs(const SISubtarget &ST,
+ const MachineFunction &MF) {
return makeArrayRef(AMDGPU::SGPR_32RegClass.begin(),
- TRI->getMaxNumSGPRs(MF));
+ ST.getMaxNumSGPRs(MF));
}
-void SIFrameLowering::emitFlatScratchInit(const SIInstrInfo *TII,
- const SIRegisterInfo* TRI,
+void SIFrameLowering::emitFlatScratchInit(const SISubtarget &ST,
MachineFunction &MF,
MachineBasicBlock &MBB) const {
+ const SIInstrInfo *TII = ST.getInstrInfo();
+ const SIRegisterInfo* TRI = &TII->getRegisterInfo();
+
// We don't need this if we only have spills since there is no user facing
// scratch.
@@ -59,16 +61,28 @@ void SIFrameLowering::emitFlatScratchInit(const SIInstrInfo *TII,
MRI.addLiveIn(FlatScratchInitReg);
MBB.addLiveIn(FlatScratchInitReg);
- // Copy the size in bytes.
- unsigned FlatScrInitHi = TRI->getSubReg(FlatScratchInitReg, AMDGPU::sub1);
- BuildMI(MBB, I, DL, TII->get(AMDGPU::COPY), AMDGPU::FLAT_SCR_LO)
- .addReg(FlatScrInitHi, RegState::Kill);
-
unsigned FlatScrInitLo = TRI->getSubReg(FlatScratchInitReg, AMDGPU::sub0);
+ unsigned FlatScrInitHi = TRI->getSubReg(FlatScratchInitReg, AMDGPU::sub1);
const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
unsigned ScratchWaveOffsetReg = MFI->getScratchWaveOffsetReg();
+ // Do a 64-bit pointer add.
+ if (ST.flatScratchIsPointer()) {
+ BuildMI(MBB, I, DL, TII->get(AMDGPU::S_ADD_U32), AMDGPU::FLAT_SCR_LO)
+ .addReg(FlatScrInitLo)
+ .addReg(ScratchWaveOffsetReg);
+ BuildMI(MBB, I, DL, TII->get(AMDGPU::S_ADDC_U32), AMDGPU::FLAT_SCR_HI)
+ .addReg(FlatScrInitHi)
+ .addImm(0);
+
+ return;
+ }
+
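The block above performs a 64-bit pointer add as S_ADD_U32 followed by S_ADDC_U32: the first adds the low halves and sets SCC on carry-out, the second folds that carry into the high half. A standalone model with hypothetical values (the real operands are SGPR pairs):

#include <cstdint>
#include <cstdio>

int main() {
  uint64_t FlatScrInit = 0x00000001FFFFFFF0ULL; // hypothetical {hi, lo} input
  uint32_t WaveOffset = 0x40;                   // hypothetical wave offset
  uint32_t Lo = static_cast<uint32_t>(FlatScrInit);
  uint32_t Hi = static_cast<uint32_t>(FlatScrInit >> 32);
  uint32_t SumLo = Lo + WaveOffset; // S_ADD_U32: low add, carry-out -> SCC
  uint32_t Carry = SumLo < Lo;      // SCC models the carry bit
  uint32_t SumHi = Hi + 0 + Carry;  // S_ADDC_U32: high add with carry-in
  std::printf("FLAT_SCR = 0x%08x%08x\n", SumHi, SumLo);
  return 0;
}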
+ // Copy the size in bytes.
+ BuildMI(MBB, I, DL, TII->get(AMDGPU::COPY), AMDGPU::FLAT_SCR_LO)
+ .addReg(FlatScrInitHi, RegState::Kill);
+
// Add wave offset in bytes to private base offset.
// See comment in AMDKernelCodeT.h for enable_sgpr_flat_scratch_init.
BuildMI(MBB, I, DL, TII->get(AMDGPU::S_ADD_U32), FlatScrInitLo)
@@ -111,16 +125,15 @@ unsigned SIFrameLowering::getReservedPrivateSegmentBufferReg(
MachineRegisterInfo &MRI = MF.getRegInfo();
unsigned NumPreloaded = (MFI->getNumPreloadedSGPRs() + 3) / 4;
- ArrayRef<MCPhysReg> AllSGPR128s = getAllSGPR128(MF, TRI);
+ ArrayRef<MCPhysReg> AllSGPR128s = getAllSGPR128(ST, MF);
AllSGPR128s = AllSGPR128s.slice(std::min(static_cast<unsigned>(AllSGPR128s.size()), NumPreloaded));
- // Skip the last 2 elements because the last one is reserved for VCC, and
- // this is the 2nd to last element already.
+ // Skip the last N reserved elements because they should have already been
+ // reserved for VCC etc.
for (MCPhysReg Reg : AllSGPR128s) {
// Pick the first unallocated one. Make sure we don't clobber the other
// reserved input we needed.
if (!MRI.isPhysRegUsed(Reg) && MRI.isAllocatable(Reg)) {
- //assert(MRI.isAllocatable(Reg));
MRI.replaceRegWith(ScratchRsrcReg, Reg);
MFI->setScratchRSrcReg(Reg);
return Reg;
@@ -143,10 +156,9 @@ unsigned SIFrameLowering::getReservedPrivateSegmentWaveByteOffsetReg(
unsigned ScratchRsrcReg = MFI->getScratchRSrcReg();
MachineRegisterInfo &MRI = MF.getRegInfo();
-
unsigned NumPreloaded = MFI->getNumPreloadedSGPRs();
- ArrayRef<MCPhysReg> AllSGPRs = getAllSGPRs(MF, TRI);
+ ArrayRef<MCPhysReg> AllSGPRs = getAllSGPRs(ST, MF);
if (NumPreloaded > AllSGPRs.size())
return ScratchWaveOffsetReg;
@@ -190,6 +202,7 @@ void SIFrameLowering::emitPrologue(MachineFunction &MF,
// Emit debugger prologue if "amdgpu-debugger-emit-prologue" attribute was
// specified.
const SISubtarget &ST = MF.getSubtarget<SISubtarget>();
+ auto AMDGPUASI = ST.getAMDGPUAS();
if (ST.debuggerEmitPrologue())
emitDebuggerPrologue(MF, MBB);
@@ -229,7 +242,7 @@ void SIFrameLowering::emitPrologue(MachineFunction &MF,
// emitted after frame indices are eliminated.
if (MF.getFrameInfo().hasStackObjects() && MFI->hasFlatScratchInit())
- emitFlatScratchInit(TII, TRI, MF, MBB);
+ emitFlatScratchInit(ST, MF, MBB);
// We need to insert initialization of the scratch resource descriptor.
unsigned PreloadedScratchWaveOffsetReg = TRI->getPreloadedValue(
@@ -328,7 +341,7 @@ void SIFrameLowering::emitPrologue(MachineFunction &MF,
PointerType *PtrTy =
PointerType::get(Type::getInt64Ty(MF.getFunction()->getContext()),
- AMDGPUAS::CONSTANT_ADDRESS);
+ AMDGPUASI.CONSTANT_ADDRESS);
MachinePointerInfo PtrInfo(UndefValue::get(PtrTy));
auto MMO = MF.getMachineMemOperand(PtrInfo,
MachineMemOperand::MOLoad |
@@ -371,6 +384,24 @@ void SIFrameLowering::emitEpilogue(MachineFunction &MF,
}
+static bool allStackObjectsAreDead(const MachineFrameInfo &MFI) {
+ for (int I = MFI.getObjectIndexBegin(), E = MFI.getObjectIndexEnd();
+ I != E; ++I) {
+ if (!MFI.isDeadObjectIndex(I))
+ return false;
+ }
+
+ return true;
+}
+
+int SIFrameLowering::getFrameIndexReference(const MachineFunction &MF, int FI,
+ unsigned &FrameReg) const {
+ const SIRegisterInfo *RI = MF.getSubtarget<SISubtarget>().getRegisterInfo();
+
+ FrameReg = RI->getFrameRegister(MF);
+ return MF.getFrameInfo().getObjectOffset(FI);
+}
+
void SIFrameLowering::processFunctionBeforeFrameFinalized(
MachineFunction &MF,
RegScavenger *RS) const {
@@ -379,15 +410,66 @@ void SIFrameLowering::processFunctionBeforeFrameFinalized(
if (!MFI.hasStackObjects())
return;
- bool MayNeedScavengingEmergencySlot = MFI.hasStackObjects();
+ const SISubtarget &ST = MF.getSubtarget<SISubtarget>();
+ const SIInstrInfo *TII = ST.getInstrInfo();
+ const SIRegisterInfo &TRI = TII->getRegisterInfo();
+ SIMachineFunctionInfo *FuncInfo = MF.getInfo<SIMachineFunctionInfo>();
+ bool AllSGPRSpilledToVGPRs = false;
+
+ if (TRI.spillSGPRToVGPR() && FuncInfo->hasSpilledSGPRs()) {
+ AllSGPRSpilledToVGPRs = true;
+
+ // Process all SGPR spills before frame offsets are finalized. Ideally SGPRs
+ // are spilled to VGPRs, in which case we can eliminate the stack usage.
+ //
+ // XXX - This operates under the assumption that only other SGPR spills are
+ // users of the frame index. I'm not 100% sure this is correct. The
+ // StackColoring pass has a comment saying a future improvement would be to
+ // merging of allocas with spill slots, but for now according to
+ // MachineFrameInfo isSpillSlot can't alias any other object.
+ for (MachineBasicBlock &MBB : MF) {
+ MachineBasicBlock::iterator Next;
+ for (auto I = MBB.begin(), E = MBB.end(); I != E; I = Next) {
+ MachineInstr &MI = *I;
+ Next = std::next(I);
+
+ if (TII->isSGPRSpill(MI)) {
+ int FI = TII->getNamedOperand(MI, AMDGPU::OpName::addr)->getIndex();
+ if (FuncInfo->allocateSGPRSpillToVGPR(MF, FI)) {
+ bool Spilled = TRI.eliminateSGPRToVGPRSpillFrameIndex(MI, FI, RS);
+ (void)Spilled;
+ assert(Spilled && "failed to spill SGPR to VGPR when allocated");
+ } else
+ AllSGPRSpilledToVGPRs = false;
+ }
+ }
+ }
- assert((RS || !MayNeedScavengingEmergencySlot) &&
- "RegScavenger required if spilling");
+ FuncInfo->removeSGPRToVGPRFrameIndices(MFI);
+ }
- if (MayNeedScavengingEmergencySlot) {
- int ScavengeFI = MFI.CreateStackObject(
- AMDGPU::SGPR_32RegClass.getSize(),
- AMDGPU::SGPR_32RegClass.getAlignment(), false);
+ // FIXME: The other checks should be redundant with allStackObjectsAreDead,
+ // but currently hasNonSpillStackObjects is set only from source
+ // allocas. Stack temps produced from legalization are not counted currently.
+ if (FuncInfo->hasNonSpillStackObjects() || FuncInfo->hasSpilledVGPRs() ||
+ !AllSGPRSpilledToVGPRs || !allStackObjectsAreDead(MFI)) {
+ assert(RS && "RegScavenger required if spilling");
+
+ // We force this to be at offset 0 so no user object ever has 0 as an
+ // address, so we may use 0 as an invalid pointer value. This is because
+ // LLVM assumes 0 is an invalid pointer in address space 0. Because alloca
+ // is required to be in address space 0, we are forced to accept this for
+ // now. Ideally we could have the stack in another address space with 0 as a
+ // valid pointer, and -1 as the null value.
+ //
+ // This will also waste additional space when user stack objects require
+ // more than 4-byte alignment.
+ //
+ // The main cost here is losing the offset for addressing modes. However,
+ // this also ensures we shouldn't need a register for the offset during
+ // emergency scavenging.
+ int ScavengeFI = MFI.CreateFixedObject(
+ AMDGPU::SGPR_32RegClass.getSize(), 0, false);
RS->addScavengingFrameIndex(ScavengeFI);
}
}
diff --git a/lib/Target/AMDGPU/SIFrameLowering.h b/lib/Target/AMDGPU/SIFrameLowering.h
index 7657b4e03864..1bfc08093da2 100644
--- a/lib/Target/AMDGPU/SIFrameLowering.h
+++ b/lib/Target/AMDGPU/SIFrameLowering.h
@@ -30,14 +30,15 @@ public:
MachineBasicBlock &MBB) const override;
void emitEpilogue(MachineFunction &MF,
MachineBasicBlock &MBB) const override;
+ int getFrameIndexReference(const MachineFunction &MF, int FI,
+ unsigned &FrameReg) const override;
void processFunctionBeforeFrameFinalized(
MachineFunction &MF,
RegScavenger *RS = nullptr) const override;
private:
- void emitFlatScratchInit(const SIInstrInfo *TII,
- const SIRegisterInfo* TRI,
+ void emitFlatScratchInit(const SISubtarget &ST,
MachineFunction &MF,
MachineBasicBlock &MBB) const;
diff --git a/lib/Target/AMDGPU/SIISelLowering.cpp b/lib/Target/AMDGPU/SIISelLowering.cpp
index b98f9f400ee7..7268131396dc 100644
--- a/lib/Target/AMDGPU/SIISelLowering.cpp
+++ b/lib/Target/AMDGPU/SIISelLowering.cpp
@@ -15,26 +15,70 @@
#ifdef _MSC_VER
// Provide M_PI.
#define _USE_MATH_DEFINES
-#include <cmath>
#endif
#include "AMDGPU.h"
#include "AMDGPUIntrinsicInfo.h"
+#include "AMDGPUTargetMachine.h"
#include "AMDGPUSubtarget.h"
#include "SIDefines.h"
#include "SIISelLowering.h"
#include "SIInstrInfo.h"
#include "SIMachineFunctionInfo.h"
#include "SIRegisterInfo.h"
+#include "Utils/AMDGPUBaseInfo.h"
+#include "llvm/ADT/APFloat.h"
+#include "llvm/ADT/APInt.h"
+#include "llvm/ADT/ArrayRef.h"
#include "llvm/ADT/BitVector.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/ADT/StringRef.h"
#include "llvm/ADT/StringSwitch.h"
+#include "llvm/ADT/Twine.h"
+#include "llvm/CodeGen/Analysis.h"
#include "llvm/CodeGen/CallingConvLower.h"
+#include "llvm/CodeGen/DAGCombine.h"
+#include "llvm/CodeGen/ISDOpcodes.h"
+#include "llvm/CodeGen/MachineBasicBlock.h"
+#include "llvm/CodeGen/MachineFrameInfo.h"
+#include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/CodeGen/MachineInstr.h"
#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/MachineMemOperand.h"
+#include "llvm/CodeGen/MachineOperand.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/CodeGen/MachineValueType.h"
#include "llvm/CodeGen/SelectionDAG.h"
-#include "llvm/CodeGen/Analysis.h"
+#include "llvm/CodeGen/SelectionDAGNodes.h"
+#include "llvm/CodeGen/ValueTypes.h"
+#include "llvm/IR/Constants.h"
+#include "llvm/IR/DataLayout.h"
+#include "llvm/IR/DebugLoc.h"
+#include "llvm/IR/DerivedTypes.h"
#include "llvm/IR/DiagnosticInfo.h"
#include "llvm/IR/Function.h"
+#include "llvm/IR/GlobalValue.h"
+#include "llvm/IR/InstrTypes.h"
+#include "llvm/IR/Instruction.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/IR/IntrinsicInst.h"
+#include "llvm/IR/Type.h"
+#include "llvm/Support/Casting.h"
+#include "llvm/Support/CodeGen.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/Compiler.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/MathExtras.h"
+#include "llvm/Target/TargetCallingConv.h"
+#include "llvm/Target/TargetOptions.h"
+#include "llvm/Target/TargetRegisterInfo.h"
+#include <cassert>
+#include <cmath>
+#include <cstdint>
+#include <iterator>
+#include <tuple>
+#include <utility>
+#include <vector>
using namespace llvm;
@@ -43,7 +87,6 @@ static cl::opt<bool> EnableVGPRIndexMode(
cl::desc("Use GPR indexing mode instead of movrel for vector indexing"),
cl::init(false));
-
static unsigned findFirstFreeSGPR(CCState &CCInfo) {
unsigned NumSGPRs = AMDGPU::SGPR_32RegClass.getNumRegs();
for (unsigned Reg = 0; Reg < NumSGPRs; ++Reg) {
@@ -84,6 +127,11 @@ SITargetLowering::SITargetLowering(const TargetMachine &TM,
addRegisterClass(MVT::f16, &AMDGPU::SReg_32_XM0RegClass);
}
+ if (Subtarget->hasVOP3PInsts()) {
+ addRegisterClass(MVT::v2i16, &AMDGPU::SReg_32_XM0RegClass);
+ addRegisterClass(MVT::v2f16, &AMDGPU::SReg_32_XM0RegClass);
+ }
+
computeRegisterProperties(STI.getRegisterInfo());
// We need to custom lower vector stores from local memory
@@ -110,7 +158,6 @@ SITargetLowering::SITargetLowering(const TargetMachine &TM,
setTruncStoreAction(MVT::v16i32, MVT::v16i8, Expand);
setTruncStoreAction(MVT::v32i32, MVT::v32i8, Expand);
-
setOperationAction(ISD::GlobalAddress, MVT::i32, Custom);
setOperationAction(ISD::GlobalAddress, MVT::i64, Custom);
setOperationAction(ISD::ConstantPool, MVT::v2i64, Expand);
@@ -142,10 +189,17 @@ SITargetLowering::SITargetLowering(const TargetMachine &TM,
setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::v4i16, Custom);
setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::Other, Custom);
+ setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::Other, Custom);
setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::f32, Custom);
setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::v4f32, Custom);
+ setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::v2f16, Custom);
+
setOperationAction(ISD::INTRINSIC_W_CHAIN, MVT::Other, Custom);
+ setOperationAction(ISD::INTRINSIC_VOID, MVT::Other, Custom);
+ setOperationAction(ISD::INTRINSIC_VOID, MVT::v2i16, Custom);
+ setOperationAction(ISD::INTRINSIC_VOID, MVT::v2f16, Custom);
+
setOperationAction(ISD::BRCOND, MVT::Other, Custom);
setOperationAction(ISD::BR_CC, MVT::i1, Expand);
setOperationAction(ISD::BR_CC, MVT::i32, Expand);
@@ -153,9 +207,13 @@ SITargetLowering::SITargetLowering(const TargetMachine &TM,
setOperationAction(ISD::BR_CC, MVT::f32, Expand);
setOperationAction(ISD::BR_CC, MVT::f64, Expand);
+ setOperationAction(ISD::UADDO, MVT::i32, Legal);
+ setOperationAction(ISD::USUBO, MVT::i32, Legal);
+
// We only support LOAD/STORE and vector manipulation ops for vectors
// with > 4 elements.
- for (MVT VT : {MVT::v8i32, MVT::v8f32, MVT::v16i32, MVT::v16f32, MVT::v2i64, MVT::v2f64}) {
+ for (MVT VT : {MVT::v8i32, MVT::v8f32, MVT::v16i32, MVT::v16f32,
+ MVT::v2i64, MVT::v2f64}) {
for (unsigned Op = 0; Op < ISD::BUILTIN_OP_END; ++Op) {
switch (Op) {
case ISD::LOAD:
@@ -202,6 +260,13 @@ SITargetLowering::SITargetLowering(const TargetMachine &TM,
setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v16i32, Expand);
setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v16f32, Expand);
+ // Avoid stack access for these.
+ // TODO: Generalize to more vector types.
+ setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v2i16, Custom);
+ setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v2f16, Custom);
+ setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v2i16, Custom);
+ setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v2f16, Custom);
+
// BUFFER/FLAT_ATOMIC_CMP_SWAP on GCN GPUs needs input marshalling,
// and output demarshalling
setOperationAction(ISD::ATOMIC_CMP_SWAP, MVT::i32, Custom);
@@ -222,7 +287,8 @@ SITargetLowering::SITargetLowering(const TargetMachine &TM,
// On SI this is s_memtime; on VI it is s_memrealtime.
setOperationAction(ISD::READCYCLECOUNTER, MVT::i64, Legal);
- setOperationAction(ISD::TRAP, MVT::Other, Custom);
+ setOperationAction(ISD::TRAP, MVT::Other, Legal);
+ setOperationAction(ISD::DEBUGTRAP, MVT::Other, Legal);
setOperationAction(ISD::FMINNUM, MVT::f64, Legal);
setOperationAction(ISD::FMAXNUM, MVT::f64, Legal);
@@ -303,6 +369,7 @@ SITargetLowering::SITargetLowering(const TargetMachine &TM,
setOperationAction(ISD::FP_TO_UINT, MVT::f16, Promote);
setOperationAction(ISD::SINT_TO_FP, MVT::f16, Promote);
setOperationAction(ISD::UINT_TO_FP, MVT::f16, Promote);
+ setOperationAction(ISD::FROUND, MVT::f16, Custom);
// F16 - VOP2 Actions.
setOperationAction(ISD::BR_CC, MVT::f16, Expand);
@@ -317,6 +384,85 @@ SITargetLowering::SITargetLowering(const TargetMachine &TM,
setOperationAction(ISD::FMAD, MVT::f16, Legal);
}
+ if (Subtarget->hasVOP3PInsts()) {
+ for (MVT VT : {MVT::v2i16, MVT::v2f16}) {
+ for (unsigned Op = 0; Op < ISD::BUILTIN_OP_END; ++Op) {
+ switch (Op) {
+ case ISD::LOAD:
+ case ISD::STORE:
+ case ISD::BUILD_VECTOR:
+ case ISD::BITCAST:
+ case ISD::EXTRACT_VECTOR_ELT:
+ case ISD::INSERT_VECTOR_ELT:
+ case ISD::INSERT_SUBVECTOR:
+ case ISD::EXTRACT_SUBVECTOR:
+ case ISD::SCALAR_TO_VECTOR:
+ break;
+ case ISD::CONCAT_VECTORS:
+ setOperationAction(Op, VT, Custom);
+ break;
+ default:
+ setOperationAction(Op, VT, Expand);
+ break;
+ }
+ }
+ }
+
+ // XXX - Do these do anything? Vector constants turn into build_vector.
+ setOperationAction(ISD::Constant, MVT::v2i16, Legal);
+ setOperationAction(ISD::ConstantFP, MVT::v2f16, Legal);
+
+ setOperationAction(ISD::STORE, MVT::v2i16, Promote);
+ AddPromotedToType(ISD::STORE, MVT::v2i16, MVT::i32);
+ setOperationAction(ISD::STORE, MVT::v2f16, Promote);
+ AddPromotedToType(ISD::STORE, MVT::v2f16, MVT::i32);
+
+ setOperationAction(ISD::LOAD, MVT::v2i16, Promote);
+ AddPromotedToType(ISD::LOAD, MVT::v2i16, MVT::i32);
+ setOperationAction(ISD::LOAD, MVT::v2f16, Promote);
+ AddPromotedToType(ISD::LOAD, MVT::v2f16, MVT::i32);
+
+ setOperationAction(ISD::AND, MVT::v2i16, Promote);
+ AddPromotedToType(ISD::AND, MVT::v2i16, MVT::i32);
+ setOperationAction(ISD::OR, MVT::v2i16, Promote);
+ AddPromotedToType(ISD::OR, MVT::v2i16, MVT::i32);
+ setOperationAction(ISD::XOR, MVT::v2i16, Promote);
+ AddPromotedToType(ISD::XOR, MVT::v2i16, MVT::i32);
+ setOperationAction(ISD::SELECT, MVT::v2i16, Promote);
+ AddPromotedToType(ISD::SELECT, MVT::v2i16, MVT::i32);
+ setOperationAction(ISD::SELECT, MVT::v2f16, Promote);
+ AddPromotedToType(ISD::SELECT, MVT::v2f16, MVT::i32);
+
+ setOperationAction(ISD::ADD, MVT::v2i16, Legal);
+ setOperationAction(ISD::SUB, MVT::v2i16, Legal);
+ setOperationAction(ISD::MUL, MVT::v2i16, Legal);
+ setOperationAction(ISD::SHL, MVT::v2i16, Legal);
+ setOperationAction(ISD::SRL, MVT::v2i16, Legal);
+ setOperationAction(ISD::SRA, MVT::v2i16, Legal);
+ setOperationAction(ISD::SMIN, MVT::v2i16, Legal);
+ setOperationAction(ISD::UMIN, MVT::v2i16, Legal);
+ setOperationAction(ISD::SMAX, MVT::v2i16, Legal);
+ setOperationAction(ISD::UMAX, MVT::v2i16, Legal);
+
+ setOperationAction(ISD::FADD, MVT::v2f16, Legal);
+ setOperationAction(ISD::FNEG, MVT::v2f16, Legal);
+ setOperationAction(ISD::FMUL, MVT::v2f16, Legal);
+ setOperationAction(ISD::FMA, MVT::v2f16, Legal);
+ setOperationAction(ISD::FMINNUM, MVT::v2f16, Legal);
+ setOperationAction(ISD::FMAXNUM, MVT::v2f16, Legal);
+
+ // This isn't really legal, but this avoids the legalizer unrolling it (and
+ // allows matching fneg (fabs x) patterns)
+ setOperationAction(ISD::FABS, MVT::v2f16, Legal);
+
+ setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v2i16, Custom);
+ setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v2f16, Custom);
+
+ setOperationAction(ISD::ZERO_EXTEND, MVT::v2i32, Expand);
+ setOperationAction(ISD::SIGN_EXTEND, MVT::v2i32, Expand);
+ setOperationAction(ISD::FP_EXTEND, MVT::v2f32, Expand);
+ }
+
setTargetDAGCombine(ISD::FADD);
setTargetDAGCombine(ISD::FSUB);
setTargetDAGCombine(ISD::FMINNUM);
@@ -332,6 +478,8 @@ SITargetLowering::SITargetLowering(const TargetMachine &TM,
setTargetDAGCombine(ISD::SINT_TO_FP);
setTargetDAGCombine(ISD::UINT_TO_FP);
setTargetDAGCombine(ISD::FCANONICALIZE);
+ setTargetDAGCombine(ISD::SCALAR_TO_VECTOR);
+ setTargetDAGCombine(ISD::ZERO_EXTEND);
// All memory operations. Some folding on the pointer operand is done to help
// match the constant offsets in the addressing modes.
@@ -364,30 +512,49 @@ const SISubtarget *SITargetLowering::getSubtarget() const {
// TargetLowering queries
//===----------------------------------------------------------------------===//
+bool SITargetLowering::isShuffleMaskLegal(const SmallVectorImpl<int> &,
+ EVT) const {
+ // SI has some legal vector types, but no legal vector operations. Say no
+ // shuffles are legal in order to prefer scalarizing some vector operations.
+ return false;
+}
+
bool SITargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info,
const CallInst &CI,
unsigned IntrID) const {
switch (IntrID) {
case Intrinsic::amdgcn_atomic_inc:
- case Intrinsic::amdgcn_atomic_dec:
+ case Intrinsic::amdgcn_atomic_dec: {
Info.opc = ISD::INTRINSIC_W_CHAIN;
Info.memVT = MVT::getVT(CI.getType());
Info.ptrVal = CI.getOperand(0);
Info.align = 0;
- Info.vol = false;
+
+ const ConstantInt *Vol = dyn_cast<ConstantInt>(CI.getOperand(4));
+ Info.vol = !Vol || !Vol->isNullValue();
Info.readMem = true;
Info.writeMem = true;
return true;
+ }
default:
return false;
}
}
-bool SITargetLowering::isShuffleMaskLegal(const SmallVectorImpl<int> &,
- EVT) const {
- // SI has some legal vector types, but no legal vector operations. Say no
- // shuffles are legal in order to prefer scalarizing some vector operations.
- return false;
+bool SITargetLowering::getAddrModeArguments(IntrinsicInst *II,
+ SmallVectorImpl<Value*> &Ops,
+ Type *&AccessTy) const {
+ switch (II->getIntrinsicID()) {
+ case Intrinsic::amdgcn_atomic_inc:
+ case Intrinsic::amdgcn_atomic_dec: {
+ Value *Ptr = II->getArgOperand(0);
+ AccessTy = II->getType();
+ Ops.push_back(Ptr);
+ return true;
+ }
+ default:
+ return false;
+ }
}
bool SITargetLowering::isLegalFlatAddressingMode(const AddrMode &AM) const {
@@ -438,8 +605,7 @@ bool SITargetLowering::isLegalAddressingMode(const DataLayout &DL,
if (AM.BaseGV)
return false;
- switch (AS) {
- case AMDGPUAS::GLOBAL_ADDRESS: {
+ if (AS == AMDGPUASI.GLOBAL_ADDRESS) {
if (Subtarget->getGeneration() >= SISubtarget::VOLCANIC_ISLANDS) {
// Assume that we will use FLAT for all global memory accesses
// on VI.
@@ -454,8 +620,7 @@ bool SITargetLowering::isLegalAddressingMode(const DataLayout &DL,
}
return isLegalMUBUFAddressingMode(AM);
- }
- case AMDGPUAS::CONSTANT_ADDRESS: {
+ } else if (AS == AMDGPUASI.CONSTANT_ADDRESS) {
// If the offset isn't a multiple of 4, it probably isn't going to be
// correctly aligned.
// FIXME: Can we get the real alignment here?
@@ -478,7 +643,7 @@ bool SITargetLowering::isLegalAddressingMode(const DataLayout &DL,
// in 8-bits, it can use a smaller encoding.
if (!isUInt<32>(AM.BaseOffs / 4))
return false;
- } else if (Subtarget->getGeneration() == SISubtarget::VOLCANIC_ISLANDS) {
+ } else if (Subtarget->getGeneration() >= SISubtarget::VOLCANIC_ISLANDS) {
// On VI, these use the SMEM format and the offset is 20-bit in bytes.
if (!isUInt<20>(AM.BaseOffs))
return false;
@@ -492,13 +657,11 @@ bool SITargetLowering::isLegalAddressingMode(const DataLayout &DL,
return true;
return false;
- }
- case AMDGPUAS::PRIVATE_ADDRESS:
+ } else if (AS == AMDGPUASI.PRIVATE_ADDRESS) {
return isLegalMUBUFAddressingMode(AM);
-
- case AMDGPUAS::LOCAL_ADDRESS:
- case AMDGPUAS::REGION_ADDRESS: {
+ } else if (AS == AMDGPUASI.LOCAL_ADDRESS ||
+ AS == AMDGPUASI.REGION_ADDRESS) {
// Basic, single offset DS instructions allow a 16-bit unsigned immediate
// field.
// XXX - If doing a 4-byte aligned 8-byte type access, we effectively have
@@ -513,17 +676,15 @@ bool SITargetLowering::isLegalAddressingMode(const DataLayout &DL,
return true;
return false;
- }
- case AMDGPUAS::FLAT_ADDRESS:
- case AMDGPUAS::UNKNOWN_ADDRESS_SPACE:
+ } else if (AS == AMDGPUASI.FLAT_ADDRESS ||
+ AS == AMDGPUASI.UNKNOWN_ADDRESS_SPACE) {
// For an unknown address space, this usually means that this is for some
// reason being used for pure arithmetic, and not based on some addressing
// computation. We don't have instructions that compute pointers with any
// addressing modes, so treat them as having no offset like flat
// instructions.
return isLegalFlatAddressingMode(AM);
-
- default:
+ } else {
llvm_unreachable("unhandled address space");
}
}
@@ -544,8 +705,8 @@ bool SITargetLowering::allowsMisalignedMemoryAccesses(EVT VT,
return false;
}
- if (AddrSpace == AMDGPUAS::LOCAL_ADDRESS ||
- AddrSpace == AMDGPUAS::REGION_ADDRESS) {
+ if (AddrSpace == AMDGPUASI.LOCAL_ADDRESS ||
+ AddrSpace == AMDGPUASI.REGION_ADDRESS) {
// ds_read/write_b64 require 8-byte alignment, but we can do a 4-byte
// aligned, 8-byte access in a single operation using ds_read2/write2_b32
// with adjacent offsets.
@@ -560,8 +721,8 @@ bool SITargetLowering::allowsMisalignedMemoryAccesses(EVT VT,
// will access scratch. If we had access to the IR function, then we
// could determine if any private memory was used in the function.
if (!Subtarget->hasUnalignedScratchAccess() &&
- (AddrSpace == AMDGPUAS::PRIVATE_ADDRESS ||
- AddrSpace == AMDGPUAS::FLAT_ADDRESS)) {
+ (AddrSpace == AMDGPUASI.PRIVATE_ADDRESS ||
+ AddrSpace == AMDGPUASI.FLAT_ADDRESS)) {
return false;
}
@@ -569,7 +730,7 @@ bool SITargetLowering::allowsMisalignedMemoryAccesses(EVT VT,
// If we have a uniform constant load, it still requires using a slow
// buffer instruction if unaligned.
if (IsFast) {
- *IsFast = (AddrSpace == AMDGPUAS::CONSTANT_ADDRESS) ?
+ *IsFast = (AddrSpace == AMDGPUASI.CONSTANT_ADDRESS) ?
(Align % 4 == 0) : true;
}
@@ -609,15 +770,16 @@ EVT SITargetLowering::getOptimalMemOpType(uint64_t Size, unsigned DstAlign,
return MVT::Other;
}
-static bool isFlatGlobalAddrSpace(unsigned AS) {
- return AS == AMDGPUAS::GLOBAL_ADDRESS ||
- AS == AMDGPUAS::FLAT_ADDRESS ||
- AS == AMDGPUAS::CONSTANT_ADDRESS;
+static bool isFlatGlobalAddrSpace(unsigned AS, AMDGPUAS AMDGPUASI) {
+ return AS == AMDGPUASI.GLOBAL_ADDRESS ||
+ AS == AMDGPUASI.FLAT_ADDRESS ||
+ AS == AMDGPUASI.CONSTANT_ADDRESS;
}
bool SITargetLowering::isNoopAddrSpaceCast(unsigned SrcAS,
unsigned DestAS) const {
- return isFlatGlobalAddrSpace(SrcAS) && isFlatGlobalAddrSpace(DestAS);
+ return isFlatGlobalAddrSpace(SrcAS, AMDGPUASI) &&
+ isFlatGlobalAddrSpace(DestAS, AMDGPUASI);
}
bool SITargetLowering::isMemOpHasNoClobberedMemOperand(const SDNode *N) const {
@@ -631,7 +793,7 @@ bool SITargetLowering::isCheapAddrSpaceCast(unsigned SrcAS,
unsigned DestAS) const {
// Flat -> private/local is a simple truncate.
// Flat -> global is no-op
- if (SrcAS == AMDGPUAS::FLAT_ADDRESS)
+ if (SrcAS == AMDGPUASI.FLAT_ADDRESS)
return true;
return isNoopAddrSpaceCast(SrcAS, DestAS);
@@ -639,18 +801,8 @@ bool SITargetLowering::isCheapAddrSpaceCast(unsigned SrcAS,
bool SITargetLowering::isMemOpUniform(const SDNode *N) const {
const MemSDNode *MemNode = cast<MemSDNode>(N);
- const Value *Ptr = MemNode->getMemOperand()->getValue();
- // UndefValue means this is a load of a kernel input. These are uniform.
- // Sometimes LDS instructions have constant pointers.
- // If Ptr is null, then that means this mem operand contains a
- // PseudoSourceValue like GOT.
- if (!Ptr || isa<UndefValue>(Ptr) || isa<Argument>(Ptr) ||
- isa<Constant>(Ptr) || isa<GlobalValue>(Ptr))
- return true;
-
- const Instruction *I = dyn_cast<Instruction>(Ptr);
- return I && I->getMetadata("amdgpu.uniform");
+ return AMDGPU::isUniformMMO(MemNode->getMemOperand());
}
TargetLoweringBase::LegalizeTypeAction
@@ -693,40 +845,28 @@ bool SITargetLowering::isTypeDesirableForOp(unsigned Op, EVT VT) const {
return TargetLowering::isTypeDesirableForOp(Op, VT);
}
-SDValue SITargetLowering::LowerParameterPtr(SelectionDAG &DAG,
- const SDLoc &SL, SDValue Chain,
- unsigned Offset) const {
+SDValue SITargetLowering::lowerKernArgParameterPtr(SelectionDAG &DAG,
+ const SDLoc &SL,
+ SDValue Chain,
+ uint64_t Offset) const {
const DataLayout &DL = DAG.getDataLayout();
MachineFunction &MF = DAG.getMachineFunction();
const SIRegisterInfo *TRI = getSubtarget()->getRegisterInfo();
- unsigned InputPtrReg = TRI->getPreloadedValue(MF, SIRegisterInfo::KERNARG_SEGMENT_PTR);
+ unsigned InputPtrReg = TRI->getPreloadedValue(MF,
+ SIRegisterInfo::KERNARG_SEGMENT_PTR);
MachineRegisterInfo &MRI = DAG.getMachineFunction().getRegInfo();
- MVT PtrVT = getPointerTy(DL, AMDGPUAS::CONSTANT_ADDRESS);
+ MVT PtrVT = getPointerTy(DL, AMDGPUASI.CONSTANT_ADDRESS);
SDValue BasePtr = DAG.getCopyFromReg(Chain, SL,
MRI.getLiveInVirtReg(InputPtrReg), PtrVT);
return DAG.getNode(ISD::ADD, SL, PtrVT, BasePtr,
DAG.getConstant(Offset, SL, PtrVT));
}
-SDValue SITargetLowering::LowerParameter(SelectionDAG &DAG, EVT VT, EVT MemVT,
- const SDLoc &SL, SDValue Chain,
- unsigned Offset, bool Signed,
+SDValue SITargetLowering::convertArgType(SelectionDAG &DAG, EVT VT, EVT MemVT,
+ const SDLoc &SL, SDValue Val,
+ bool Signed,
const ISD::InputArg *Arg) const {
- const DataLayout &DL = DAG.getDataLayout();
- Type *Ty = MemVT.getTypeForEVT(*DAG.getContext());
- PointerType *PtrTy = PointerType::get(Ty, AMDGPUAS::CONSTANT_ADDRESS);
- MachinePointerInfo PtrInfo(UndefValue::get(PtrTy));
-
- unsigned Align = DL.getABITypeAlignment(Ty);
-
- SDValue Ptr = LowerParameterPtr(DAG, SL, Chain, Offset);
- SDValue Load = DAG.getLoad(MemVT, SL, Chain, Ptr, PtrInfo, Align,
- MachineMemOperand::MONonTemporal |
- MachineMemOperand::MODereferenceable |
- MachineMemOperand::MOInvariant);
-
- SDValue Val = Load;
if (Arg && (Arg->Flags.isSExt() || Arg->Flags.isZExt()) &&
VT.bitsLT(MemVT)) {
unsigned Opc = Arg->Flags.isZExt() ? ISD::AssertZext : ISD::AssertSext;
@@ -740,373 +880,434 @@ SDValue SITargetLowering::LowerParameter(SelectionDAG &DAG, EVT VT, EVT MemVT,
else
Val = DAG.getZExtOrTrunc(Val, SL, VT);
- return DAG.getMergeValues({ Val, Load.getValue(1) }, SL);
+ return Val;
}
-SDValue SITargetLowering::LowerFormalArguments(
- SDValue Chain, CallingConv::ID CallConv, bool isVarArg,
- const SmallVectorImpl<ISD::InputArg> &Ins, const SDLoc &DL,
- SelectionDAG &DAG, SmallVectorImpl<SDValue> &InVals) const {
- const SIRegisterInfo *TRI = getSubtarget()->getRegisterInfo();
-
- MachineFunction &MF = DAG.getMachineFunction();
- FunctionType *FType = MF.getFunction()->getFunctionType();
- SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
- const SISubtarget &ST = MF.getSubtarget<SISubtarget>();
+SDValue SITargetLowering::lowerKernargMemParameter(
+ SelectionDAG &DAG, EVT VT, EVT MemVT,
+ const SDLoc &SL, SDValue Chain,
+ uint64_t Offset, bool Signed,
+ const ISD::InputArg *Arg) const {
+ const DataLayout &DL = DAG.getDataLayout();
+ Type *Ty = MemVT.getTypeForEVT(*DAG.getContext());
+ PointerType *PtrTy = PointerType::get(Ty, AMDGPUASI.CONSTANT_ADDRESS);
+ MachinePointerInfo PtrInfo(UndefValue::get(PtrTy));
- if (Subtarget->isAmdHsaOS() && AMDGPU::isShader(CallConv)) {
- const Function *Fn = MF.getFunction();
- DiagnosticInfoUnsupported NoGraphicsHSA(
- *Fn, "unsupported non-compute shaders with HSA", DL.getDebugLoc());
- DAG.getContext()->diagnose(NoGraphicsHSA);
- return DAG.getEntryNode();
- }
+ unsigned Align = DL.getABITypeAlignment(Ty);
- // Create stack objects that are used for emitting debugger prologue if
- // "amdgpu-debugger-emit-prologue" attribute was specified.
- if (ST.debuggerEmitPrologue())
- createDebuggerPrologueStackObjects(MF);
+ SDValue Ptr = lowerKernArgParameterPtr(DAG, SL, Chain, Offset);
+ SDValue Load = DAG.getLoad(MemVT, SL, Chain, Ptr, PtrInfo, Align,
+ MachineMemOperand::MONonTemporal |
+ MachineMemOperand::MODereferenceable |
+ MachineMemOperand::MOInvariant);
- SmallVector<ISD::InputArg, 16> Splits;
- BitVector Skipped(Ins.size());
+ SDValue Val = convertArgType(DAG, VT, MemVT, SL, Load, Signed, Arg);
+ return DAG.getMergeValues({ Val, Load.getValue(1) }, SL);
+}
- for (unsigned i = 0, e = Ins.size(), PSInputNum = 0; i != e; ++i) {
- const ISD::InputArg &Arg = Ins[i];
+static void processShaderInputArgs(SmallVectorImpl<ISD::InputArg> &Splits,
+ CallingConv::ID CallConv,
+ ArrayRef<ISD::InputArg> Ins,
+ BitVector &Skipped,
+ FunctionType *FType,
+ SIMachineFunctionInfo *Info) {
+ for (unsigned I = 0, E = Ins.size(), PSInputNum = 0; I != E; ++I) {
+ const ISD::InputArg &Arg = Ins[I];
- // First check if it's a PS input addr
+ // First check if it's a PS input addr.
if (CallConv == CallingConv::AMDGPU_PS && !Arg.Flags.isInReg() &&
!Arg.Flags.isByVal() && PSInputNum <= 15) {
if (!Arg.Used && !Info->isPSInputAllocated(PSInputNum)) {
- // We can safely skip PS inputs
- Skipped.set(i);
+ // We can safely skip PS inputs.
+ Skipped.set(I);
++PSInputNum;
continue;
}
Info->markPSInputAllocated(PSInputNum);
if (Arg.Used)
- Info->PSInputEna |= 1 << PSInputNum;
+ Info->markPSInputEnabled(PSInputNum);
++PSInputNum;
}
- if (AMDGPU::isShader(CallConv)) {
- // Second split vertices into their elements
- if (Arg.VT.isVector()) {
- ISD::InputArg NewArg = Arg;
- NewArg.Flags.setSplit();
- NewArg.VT = Arg.VT.getVectorElementType();
-
- // We REALLY want the ORIGINAL number of vertex elements here, e.g. a
- // three or five element vertex only needs three or five registers,
- // NOT four or eight.
- Type *ParamType = FType->getParamType(Arg.getOrigArgIndex());
- unsigned NumElements = ParamType->getVectorNumElements();
-
- for (unsigned j = 0; j != NumElements; ++j) {
- Splits.push_back(NewArg);
- NewArg.PartOffset += NewArg.VT.getStoreSize();
- }
- } else {
- Splits.push_back(Arg);
+ // Second split vertices into their elements.
+ if (Arg.VT.isVector()) {
+ ISD::InputArg NewArg = Arg;
+ NewArg.Flags.setSplit();
+ NewArg.VT = Arg.VT.getVectorElementType();
+
+ // We REALLY want the ORIGINAL number of vertex elements here, e.g. a
+ // three or five element vertex only needs three or five registers,
+ // NOT four or eight.
+ Type *ParamType = FType->getParamType(Arg.getOrigArgIndex());
+ unsigned NumElements = ParamType->getVectorNumElements();
+
+ for (unsigned J = 0; J != NumElements; ++J) {
+ Splits.push_back(NewArg);
+ NewArg.PartOffset += NewArg.VT.getStoreSize();
}
+ } else {
+ Splits.push_back(Arg);
}
}
+}
- SmallVector<CCValAssign, 16> ArgLocs;
- CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(), ArgLocs,
- *DAG.getContext());
+// Allocate special inputs passed in VGPRs.
+static void allocateSpecialInputVGPRs(CCState &CCInfo,
+ MachineFunction &MF,
+ const SIRegisterInfo &TRI,
+ SIMachineFunctionInfo &Info) {
+ if (Info.hasWorkItemIDX()) {
+ unsigned Reg = TRI.getPreloadedValue(MF, SIRegisterInfo::WORKITEM_ID_X);
+ MF.addLiveIn(Reg, &AMDGPU::VGPR_32RegClass);
+ CCInfo.AllocateReg(Reg);
+ }
- // At least one interpolation mode must be enabled or else the GPU will hang.
- //
- // Check PSInputAddr instead of PSInputEna. The idea is that if the user set
- // PSInputAddr, the user wants to enable some bits after the compilation
- // based on run-time states. Since we can't know what the final PSInputEna
- // will look like, so we shouldn't do anything here and the user should take
- // responsibility for the correct programming.
- //
- // Otherwise, the following restrictions apply:
- // - At least one of PERSP_* (0xF) or LINEAR_* (0x70) must be enabled.
- // - If POS_W_FLOAT (11) is enabled, at least one of PERSP_* must be
- // enabled too.
- if (CallConv == CallingConv::AMDGPU_PS &&
- ((Info->getPSInputAddr() & 0x7F) == 0 ||
- ((Info->getPSInputAddr() & 0xF) == 0 && Info->isPSInputAllocated(11)))) {
- CCInfo.AllocateReg(AMDGPU::VGPR0);
- CCInfo.AllocateReg(AMDGPU::VGPR1);
- Info->markPSInputAllocated(0);
- Info->PSInputEna |= 1;
- }
-
- if (!AMDGPU::isShader(CallConv)) {
- assert(Info->hasWorkGroupIDX() && Info->hasWorkItemIDX());
- } else {
- assert(!Info->hasDispatchPtr() &&
- !Info->hasKernargSegmentPtr() && !Info->hasFlatScratchInit() &&
- !Info->hasWorkGroupIDX() && !Info->hasWorkGroupIDY() &&
- !Info->hasWorkGroupIDZ() && !Info->hasWorkGroupInfo() &&
- !Info->hasWorkItemIDX() && !Info->hasWorkItemIDY() &&
- !Info->hasWorkItemIDZ());
+ if (Info.hasWorkItemIDY()) {
+ unsigned Reg = TRI.getPreloadedValue(MF, SIRegisterInfo::WORKITEM_ID_Y);
+ MF.addLiveIn(Reg, &AMDGPU::VGPR_32RegClass);
+ CCInfo.AllocateReg(Reg);
}
- if (Info->hasPrivateMemoryInputPtr()) {
- unsigned PrivateMemoryPtrReg = Info->addPrivateMemoryPtr(*TRI);
- MF.addLiveIn(PrivateMemoryPtrReg, &AMDGPU::SReg_64RegClass);
+ if (Info.hasWorkItemIDZ()) {
+ unsigned Reg = TRI.getPreloadedValue(MF, SIRegisterInfo::WORKITEM_ID_Z);
+ MF.addLiveIn(Reg, &AMDGPU::VGPR_32RegClass);
+ CCInfo.AllocateReg(Reg);
+ }
+}
+
+// Allocate special inputs passed in user SGPRs.
+static void allocateHSAUserSGPRs(CCState &CCInfo,
+ MachineFunction &MF,
+ const SIRegisterInfo &TRI,
+ SIMachineFunctionInfo &Info) {
+ if (Info.hasPrivateMemoryInputPtr()) {
+ unsigned PrivateMemoryPtrReg = Info.addPrivateMemoryPtr(TRI);
+ MF.addLiveIn(PrivateMemoryPtrReg, &AMDGPU::SGPR_64RegClass);
CCInfo.AllocateReg(PrivateMemoryPtrReg);
}
// FIXME: How should these inputs interact with inreg / custom SGPR inputs?
- if (Info->hasPrivateSegmentBuffer()) {
- unsigned PrivateSegmentBufferReg = Info->addPrivateSegmentBuffer(*TRI);
- MF.addLiveIn(PrivateSegmentBufferReg, &AMDGPU::SReg_128RegClass);
+ if (Info.hasPrivateSegmentBuffer()) {
+ unsigned PrivateSegmentBufferReg = Info.addPrivateSegmentBuffer(TRI);
+ MF.addLiveIn(PrivateSegmentBufferReg, &AMDGPU::SGPR_128RegClass);
CCInfo.AllocateReg(PrivateSegmentBufferReg);
}
- if (Info->hasDispatchPtr()) {
- unsigned DispatchPtrReg = Info->addDispatchPtr(*TRI);
+ if (Info.hasDispatchPtr()) {
+ unsigned DispatchPtrReg = Info.addDispatchPtr(TRI);
MF.addLiveIn(DispatchPtrReg, &AMDGPU::SGPR_64RegClass);
CCInfo.AllocateReg(DispatchPtrReg);
}
- if (Info->hasQueuePtr()) {
- unsigned QueuePtrReg = Info->addQueuePtr(*TRI);
+ if (Info.hasQueuePtr()) {
+ unsigned QueuePtrReg = Info.addQueuePtr(TRI);
MF.addLiveIn(QueuePtrReg, &AMDGPU::SGPR_64RegClass);
CCInfo.AllocateReg(QueuePtrReg);
}
- if (Info->hasKernargSegmentPtr()) {
- unsigned InputPtrReg = Info->addKernargSegmentPtr(*TRI);
+ if (Info.hasKernargSegmentPtr()) {
+ unsigned InputPtrReg = Info.addKernargSegmentPtr(TRI);
MF.addLiveIn(InputPtrReg, &AMDGPU::SGPR_64RegClass);
CCInfo.AllocateReg(InputPtrReg);
}
- if (Info->hasDispatchID()) {
- unsigned DispatchIDReg = Info->addDispatchID(*TRI);
+ if (Info.hasDispatchID()) {
+ unsigned DispatchIDReg = Info.addDispatchID(TRI);
MF.addLiveIn(DispatchIDReg, &AMDGPU::SGPR_64RegClass);
CCInfo.AllocateReg(DispatchIDReg);
}
- if (Info->hasFlatScratchInit()) {
- unsigned FlatScratchInitReg = Info->addFlatScratchInit(*TRI);
+ if (Info.hasFlatScratchInit()) {
+ unsigned FlatScratchInitReg = Info.addFlatScratchInit(TRI);
MF.addLiveIn(FlatScratchInitReg, &AMDGPU::SGPR_64RegClass);
CCInfo.AllocateReg(FlatScratchInitReg);
}
- if (!AMDGPU::isShader(CallConv))
- analyzeFormalArgumentsCompute(CCInfo, Ins);
- else
- AnalyzeFormalArguments(CCInfo, Splits);
-
- SmallVector<SDValue, 16> Chains;
-
- for (unsigned i = 0, e = Ins.size(), ArgIdx = 0; i != e; ++i) {
-
- const ISD::InputArg &Arg = Ins[i];
- if (Skipped[i]) {
- InVals.push_back(DAG.getUNDEF(Arg.VT));
- continue;
- }
-
- CCValAssign &VA = ArgLocs[ArgIdx++];
- MVT VT = VA.getLocVT();
-
- if (VA.isMemLoc()) {
- VT = Ins[i].VT;
- EVT MemVT = VA.getLocVT();
- const unsigned Offset = Subtarget->getExplicitKernelArgOffset(MF) +
- VA.getLocMemOffset();
- // The first 36 bytes of the input buffer contains information about
- // thread group and global sizes.
- SDValue Arg = LowerParameter(DAG, VT, MemVT, DL, Chain,
- Offset, Ins[i].Flags.isSExt(),
- &Ins[i]);
- Chains.push_back(Arg.getValue(1));
-
- auto *ParamTy =
- dyn_cast<PointerType>(FType->getParamType(Ins[i].getOrigArgIndex()));
- if (Subtarget->getGeneration() == SISubtarget::SOUTHERN_ISLANDS &&
- ParamTy && ParamTy->getAddressSpace() == AMDGPUAS::LOCAL_ADDRESS) {
- // On SI local pointers are just offsets into LDS, so they are always
- // less than 16-bits. On CI and newer they could potentially be
- // real pointers, so we can't guarantee their size.
- Arg = DAG.getNode(ISD::AssertZext, DL, Arg.getValueType(), Arg,
- DAG.getValueType(MVT::i16));
- }
-
- InVals.push_back(Arg);
- Info->setABIArgOffset(Offset + MemVT.getStoreSize());
- continue;
- }
- assert(VA.isRegLoc() && "Parameter must be in a register!");
-
- unsigned Reg = VA.getLocReg();
-
- if (VT == MVT::i64) {
- // For now assume it is a pointer
- Reg = TRI->getMatchingSuperReg(Reg, AMDGPU::sub0,
- &AMDGPU::SGPR_64RegClass);
- Reg = MF.addLiveIn(Reg, &AMDGPU::SGPR_64RegClass);
- SDValue Copy = DAG.getCopyFromReg(Chain, DL, Reg, VT);
- InVals.push_back(Copy);
- continue;
- }
-
- const TargetRegisterClass *RC = TRI->getMinimalPhysRegClass(Reg, VT);
-
- Reg = MF.addLiveIn(Reg, RC);
- SDValue Val = DAG.getCopyFromReg(Chain, DL, Reg, VT);
-
- if (Arg.VT.isVector()) {
-
- // Build a vector from the registers
- Type *ParamType = FType->getParamType(Arg.getOrigArgIndex());
- unsigned NumElements = ParamType->getVectorNumElements();
-
- SmallVector<SDValue, 4> Regs;
- Regs.push_back(Val);
- for (unsigned j = 1; j != NumElements; ++j) {
- Reg = ArgLocs[ArgIdx++].getLocReg();
- Reg = MF.addLiveIn(Reg, RC);
-
- SDValue Copy = DAG.getCopyFromReg(Chain, DL, Reg, VT);
- Regs.push_back(Copy);
- }
-
- // Fill up the missing vector elements
- NumElements = Arg.VT.getVectorNumElements() - NumElements;
- Regs.append(NumElements, DAG.getUNDEF(VT));
-
- InVals.push_back(DAG.getBuildVector(Arg.VT, DL, Regs));
- continue;
- }
-
- InVals.push_back(Val);
- }
-
// TODO: Add GridWorkGroupCount user SGPRs when used. For now, with HSA, we
// read these from the dispatch pointer.
+}
- // Start adding system SGPRs.
- if (Info->hasWorkGroupIDX()) {
- unsigned Reg = Info->addWorkGroupIDX();
+// Allocate special input registers that are initialized per-wave.
+static void allocateSystemSGPRs(CCState &CCInfo,
+ MachineFunction &MF,
+ SIMachineFunctionInfo &Info,
+ bool IsShader) {
+ if (Info.hasWorkGroupIDX()) {
+ unsigned Reg = Info.addWorkGroupIDX();
MF.addLiveIn(Reg, &AMDGPU::SReg_32_XM0RegClass);
CCInfo.AllocateReg(Reg);
}
- if (Info->hasWorkGroupIDY()) {
- unsigned Reg = Info->addWorkGroupIDY();
+ if (Info.hasWorkGroupIDY()) {
+ unsigned Reg = Info.addWorkGroupIDY();
MF.addLiveIn(Reg, &AMDGPU::SReg_32_XM0RegClass);
CCInfo.AllocateReg(Reg);
}
- if (Info->hasWorkGroupIDZ()) {
- unsigned Reg = Info->addWorkGroupIDZ();
+ if (Info.hasWorkGroupIDZ()) {
+ unsigned Reg = Info.addWorkGroupIDZ();
MF.addLiveIn(Reg, &AMDGPU::SReg_32_XM0RegClass);
CCInfo.AllocateReg(Reg);
}
- if (Info->hasWorkGroupInfo()) {
- unsigned Reg = Info->addWorkGroupInfo();
+ if (Info.hasWorkGroupInfo()) {
+ unsigned Reg = Info.addWorkGroupInfo();
MF.addLiveIn(Reg, &AMDGPU::SReg_32_XM0RegClass);
CCInfo.AllocateReg(Reg);
}
- if (Info->hasPrivateSegmentWaveByteOffset()) {
+ if (Info.hasPrivateSegmentWaveByteOffset()) {
// Scratch wave offset passed in system SGPR.
unsigned PrivateSegmentWaveByteOffsetReg;
- if (AMDGPU::isShader(CallConv)) {
+ if (IsShader) {
PrivateSegmentWaveByteOffsetReg = findFirstFreeSGPR(CCInfo);
- Info->setPrivateSegmentWaveByteOffset(PrivateSegmentWaveByteOffsetReg);
+ Info.setPrivateSegmentWaveByteOffset(PrivateSegmentWaveByteOffsetReg);
} else
- PrivateSegmentWaveByteOffsetReg = Info->addPrivateSegmentWaveByteOffset();
+ PrivateSegmentWaveByteOffsetReg = Info.addPrivateSegmentWaveByteOffset();
MF.addLiveIn(PrivateSegmentWaveByteOffsetReg, &AMDGPU::SGPR_32RegClass);
CCInfo.AllocateReg(PrivateSegmentWaveByteOffsetReg);
}
+}
+static void reservePrivateMemoryRegs(const TargetMachine &TM,
+ MachineFunction &MF,
+ const SIRegisterInfo &TRI,
+ SIMachineFunctionInfo &Info) {
// Now that we've figured out where the scratch register inputs are, see if we
// should reserve the arguments and use them directly.
bool HasStackObjects = MF.getFrameInfo().hasStackObjects();
+
// Record that we know we have non-spill stack objects so we don't need to
// check all stack objects later.
if (HasStackObjects)
- Info->setHasNonSpillStackObjects(true);
+ Info.setHasNonSpillStackObjects(true);
// Everything live out of a block is spilled with fast regalloc, so it's
// almost certain that spilling will be required.
- if (getTargetMachine().getOptLevel() == CodeGenOpt::None)
+ if (TM.getOptLevel() == CodeGenOpt::None)
HasStackObjects = true;
+ const SISubtarget &ST = MF.getSubtarget<SISubtarget>();
if (ST.isAmdCodeObjectV2(MF)) {
if (HasStackObjects) {
// If we have stack objects, we unquestionably need the private buffer
// resource. For the Code Object V2 ABI, this will be the first 4 user
// SGPR inputs. We can reserve those and use them directly.
- unsigned PrivateSegmentBufferReg = TRI->getPreloadedValue(
+ unsigned PrivateSegmentBufferReg = TRI.getPreloadedValue(
MF, SIRegisterInfo::PRIVATE_SEGMENT_BUFFER);
- Info->setScratchRSrcReg(PrivateSegmentBufferReg);
+ Info.setScratchRSrcReg(PrivateSegmentBufferReg);
- unsigned PrivateSegmentWaveByteOffsetReg = TRI->getPreloadedValue(
+ unsigned PrivateSegmentWaveByteOffsetReg = TRI.getPreloadedValue(
MF, SIRegisterInfo::PRIVATE_SEGMENT_WAVE_BYTE_OFFSET);
- Info->setScratchWaveOffsetReg(PrivateSegmentWaveByteOffsetReg);
+ Info.setScratchWaveOffsetReg(PrivateSegmentWaveByteOffsetReg);
} else {
unsigned ReservedBufferReg
- = TRI->reservedPrivateSegmentBufferReg(MF);
+ = TRI.reservedPrivateSegmentBufferReg(MF);
unsigned ReservedOffsetReg
- = TRI->reservedPrivateSegmentWaveByteOffsetReg(MF);
+ = TRI.reservedPrivateSegmentWaveByteOffsetReg(MF);
// We tentatively reserve the last registers (skipping the last two
// which may contain VCC). After register allocation, we'll replace
// these with the ones immediately after those which were really
// allocated. In the prologue, copies will be inserted from the argument
// to these reserved registers.
- Info->setScratchRSrcReg(ReservedBufferReg);
- Info->setScratchWaveOffsetReg(ReservedOffsetReg);
+ Info.setScratchRSrcReg(ReservedBufferReg);
+ Info.setScratchWaveOffsetReg(ReservedOffsetReg);
}
} else {
- unsigned ReservedBufferReg = TRI->reservedPrivateSegmentBufferReg(MF);
+ unsigned ReservedBufferReg = TRI.reservedPrivateSegmentBufferReg(MF);
// Without HSA, relocations are used for the scratch pointer, and the
// buffer resource setup is always inserted in the prologue. Scratch wave
// offset is still in an input SGPR.
- Info->setScratchRSrcReg(ReservedBufferReg);
+ Info.setScratchRSrcReg(ReservedBufferReg);
if (HasStackObjects) {
- unsigned ScratchWaveOffsetReg = TRI->getPreloadedValue(
+ unsigned ScratchWaveOffsetReg = TRI.getPreloadedValue(
MF, SIRegisterInfo::PRIVATE_SEGMENT_WAVE_BYTE_OFFSET);
- Info->setScratchWaveOffsetReg(ScratchWaveOffsetReg);
+ Info.setScratchWaveOffsetReg(ScratchWaveOffsetReg);
} else {
unsigned ReservedOffsetReg
- = TRI->reservedPrivateSegmentWaveByteOffsetReg(MF);
- Info->setScratchWaveOffsetReg(ReservedOffsetReg);
+ = TRI.reservedPrivateSegmentWaveByteOffsetReg(MF);
+ Info.setScratchWaveOffsetReg(ReservedOffsetReg);
}
}
+}
- if (Info->hasWorkItemIDX()) {
- unsigned Reg = TRI->getPreloadedValue(MF, SIRegisterInfo::WORKITEM_ID_X);
- MF.addLiveIn(Reg, &AMDGPU::VGPR_32RegClass);
- CCInfo.AllocateReg(Reg);
+SDValue SITargetLowering::LowerFormalArguments(
+ SDValue Chain, CallingConv::ID CallConv, bool isVarArg,
+ const SmallVectorImpl<ISD::InputArg> &Ins, const SDLoc &DL,
+ SelectionDAG &DAG, SmallVectorImpl<SDValue> &InVals) const {
+ const SIRegisterInfo *TRI = getSubtarget()->getRegisterInfo();
+
+ MachineFunction &MF = DAG.getMachineFunction();
+ FunctionType *FType = MF.getFunction()->getFunctionType();
+ SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
+ const SISubtarget &ST = MF.getSubtarget<SISubtarget>();
+
+ if (Subtarget->isAmdHsaOS() && AMDGPU::isShader(CallConv)) {
+ const Function *Fn = MF.getFunction();
+ DiagnosticInfoUnsupported NoGraphicsHSA(
+ *Fn, "unsupported non-compute shaders with HSA", DL.getDebugLoc());
+ DAG.getContext()->diagnose(NoGraphicsHSA);
+ return DAG.getEntryNode();
}
- if (Info->hasWorkItemIDY()) {
- unsigned Reg = TRI->getPreloadedValue(MF, SIRegisterInfo::WORKITEM_ID_Y);
- MF.addLiveIn(Reg, &AMDGPU::VGPR_32RegClass);
- CCInfo.AllocateReg(Reg);
+ // Create stack objects that are used for emitting the debugger prologue
+ // if the "amdgpu-debugger-emit-prologue" attribute was specified.
+ if (ST.debuggerEmitPrologue())
+ createDebuggerPrologueStackObjects(MF);
+
+ SmallVector<ISD::InputArg, 16> Splits;
+ SmallVector<CCValAssign, 16> ArgLocs;
+ BitVector Skipped(Ins.size());
+ CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(), ArgLocs,
+ *DAG.getContext());
+
+ bool IsShader = AMDGPU::isShader(CallConv);
+ bool IsKernel = AMDGPU::isKernel(CallConv);
+ bool IsEntryFunc = AMDGPU::isEntryFunctionCC(CallConv);
+
+ if (IsShader) {
+ processShaderInputArgs(Splits, CallConv, Ins, Skipped, FType, Info);
+
+ // At least one interpolation mode must be enabled or else the GPU will
+ // hang.
+ //
+ // Check PSInputAddr instead of PSInputEnable. The idea is that if the user
+ // set PSInputAddr, the user wants to enable some bits after compilation
+ // based on run-time states. Since we can't know what the final PSInputEna
+ // will look like, we shouldn't do anything here; the user should take
+ // responsibility for the correct programming.
+ //
+ // Otherwise, the following restrictions apply:
+ // - At least one of PERSP_* (0xF) or LINEAR_* (0x70) must be enabled.
+ // - If POS_W_FLOAT (11) is enabled, at least one of PERSP_* must be
+ // enabled too.
+ if (CallConv == CallingConv::AMDGPU_PS &&
+ ((Info->getPSInputAddr() & 0x7F) == 0 ||
+ ((Info->getPSInputAddr() & 0xF) == 0 &&
+ Info->isPSInputAllocated(11)))) {
+ CCInfo.AllocateReg(AMDGPU::VGPR0);
+ CCInfo.AllocateReg(AMDGPU::VGPR1);
+ Info->markPSInputAllocated(0);
+ Info->markPSInputEnabled(0);
+ }
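
For clarity, here is a minimal standalone restatement of the PSInputAddr check above; the helper name and C++ framing are illustrative, not part of the patch.

#include <cstdint>

// Hypothetical helper mirroring the condition above: a dummy interpolant
// (VGPR0/VGPR1) must be allocated when no PERSP_* (bits 0-3) or LINEAR_*
// (bits 4-6) mode is enabled, or when POS_W_FLOAT (bit 11) is allocated
// without any PERSP_* mode.
static bool needsDummyInterpolant(uint32_t PSInputAddr, bool PosWAllocated) {
  bool NoPerspOrLinear = (PSInputAddr & 0x7F) == 0;
  bool PosWWithoutPersp = (PSInputAddr & 0xF) == 0 && PosWAllocated;
  return NoPerspOrLinear || PosWWithoutPersp;
}
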
+
+ assert(!Info->hasDispatchPtr() &&
+ !Info->hasKernargSegmentPtr() && !Info->hasFlatScratchInit() &&
+ !Info->hasWorkGroupIDX() && !Info->hasWorkGroupIDY() &&
+ !Info->hasWorkGroupIDZ() && !Info->hasWorkGroupInfo() &&
+ !Info->hasWorkItemIDX() && !Info->hasWorkItemIDY() &&
+ !Info->hasWorkItemIDZ());
+ } else {
+ assert(!IsKernel || (Info->hasWorkGroupIDX() && Info->hasWorkItemIDX()));
}
- if (Info->hasWorkItemIDZ()) {
- unsigned Reg = TRI->getPreloadedValue(MF, SIRegisterInfo::WORKITEM_ID_Z);
- MF.addLiveIn(Reg, &AMDGPU::VGPR_32RegClass);
- CCInfo.AllocateReg(Reg);
+ if (IsEntryFunc) {
+ allocateSpecialInputVGPRs(CCInfo, MF, *TRI, *Info);
+ allocateHSAUserSGPRs(CCInfo, MF, *TRI, *Info);
+ }
+
+ if (IsKernel) {
+ analyzeFormalArgumentsCompute(CCInfo, Ins);
+ } else {
+ CCAssignFn *AssignFn = CCAssignFnForCall(CallConv, isVarArg);
+ CCInfo.AnalyzeFormalArguments(Splits, AssignFn);
+ }
+
+ SmallVector<SDValue, 16> Chains;
+
+ for (unsigned i = 0, e = Ins.size(), ArgIdx = 0; i != e; ++i) {
+ const ISD::InputArg &Arg = Ins[i];
+ if (Skipped[i]) {
+ InVals.push_back(DAG.getUNDEF(Arg.VT));
+ continue;
+ }
+
+ CCValAssign &VA = ArgLocs[ArgIdx++];
+ MVT VT = VA.getLocVT();
+
+ if (IsEntryFunc && VA.isMemLoc()) {
+ VT = Ins[i].VT;
+ EVT MemVT = VA.getLocVT();
+
+ const uint64_t Offset = Subtarget->getExplicitKernelArgOffset(MF) +
+ VA.getLocMemOffset();
+ Info->setABIArgOffset(Offset + MemVT.getStoreSize());
+
+ // The first 36 bytes of the input buffer contain information about
+ // thread group and global sizes.
+ SDValue Arg = lowerKernargMemParameter(
+ DAG, VT, MemVT, DL, Chain, Offset, Ins[i].Flags.isSExt(), &Ins[i]);
+ Chains.push_back(Arg.getValue(1));
+
+ auto *ParamTy =
+ dyn_cast<PointerType>(FType->getParamType(Ins[i].getOrigArgIndex()));
+ if (Subtarget->getGeneration() == SISubtarget::SOUTHERN_ISLANDS &&
+ ParamTy && ParamTy->getAddressSpace() == AMDGPUAS::LOCAL_ADDRESS) {
+ // On SI local pointers are just offsets into LDS, so they are always
+ // less than 16 bits. On CI and newer they could potentially be
+ // real pointers, so we can't guarantee their size.
+ Arg = DAG.getNode(ISD::AssertZext, DL, Arg.getValueType(), Arg,
+ DAG.getValueType(MVT::i16));
+ }
+
+ InVals.push_back(Arg);
+ continue;
+ }
+
+ if (VA.isMemLoc())
+ report_fatal_error("memloc not supported with calling convention");
+
+ assert(VA.isRegLoc() && "Parameter must be in a register!");
+
+ unsigned Reg = VA.getLocReg();
+ const TargetRegisterClass *RC = TRI->getMinimalPhysRegClass(Reg, VT);
+
+ Reg = MF.addLiveIn(Reg, RC);
+ SDValue Val = DAG.getCopyFromReg(Chain, DL, Reg, VT);
+
+ if (Arg.VT.isVector()) {
+ // Build a vector from the registers
+ Type *ParamType = FType->getParamType(Arg.getOrigArgIndex());
+ unsigned NumElements = ParamType->getVectorNumElements();
+
+ SmallVector<SDValue, 4> Regs;
+ Regs.push_back(Val);
+ for (unsigned j = 1; j != NumElements; ++j) {
+ Reg = ArgLocs[ArgIdx++].getLocReg();
+ Reg = MF.addLiveIn(Reg, RC);
+
+ SDValue Copy = DAG.getCopyFromReg(Chain, DL, Reg, VT);
+ Regs.push_back(Copy);
+ }
+
+ // Fill up the missing vector elements
+ NumElements = Arg.VT.getVectorNumElements() - NumElements;
+ Regs.append(NumElements, DAG.getUNDEF(VT));
+
+ InVals.push_back(DAG.getBuildVector(Arg.VT, DL, Regs));
+ continue;
+ }
+
+ InVals.push_back(Val);
}
- if (Chains.empty())
- return Chain;
+ // Start adding system SGPRs.
+ if (IsEntryFunc)
+ allocateSystemSGPRs(CCInfo, MF, *Info, IsShader);
+
+ reservePrivateMemoryRegs(getTargetMachine(), MF, *TRI, *Info);
- return DAG.getNode(ISD::TokenFactor, DL, MVT::Other, Chains);
+ return Chains.empty() ? Chain :
+ DAG.getNode(ISD::TokenFactor, DL, MVT::Other, Chains);
}
SDValue
@@ -1197,7 +1398,7 @@ SITargetLowering::LowerReturn(SDValue Chain, CallingConv::ID CallConv,
if (Flag.getNode())
RetOps.push_back(Flag);
- unsigned Opc = Info->returnsVoid() ? AMDGPUISD::ENDPGM : AMDGPUISD::RETURN;
+ unsigned Opc = Info->returnsVoid() ? AMDGPUISD::ENDPGM : AMDGPUISD::RETURN_TO_EPILOG;
return DAG.getNode(Opc, DL, MVT::Other, RetOps);
}
@@ -1470,16 +1671,16 @@ static bool setM0ToIndexFromSGPR(const SIInstrInfo *TII,
VGPRIndexMode::SRC0_ENABLE : VGPRIndexMode::DST_ENABLE;
if (Offset == 0) {
MachineInstr *SetOn =
- BuildMI(*MBB, I, DL, TII->get(AMDGPU::S_SET_GPR_IDX_ON))
- .addOperand(*Idx)
- .addImm(IdxMode);
+ BuildMI(*MBB, I, DL, TII->get(AMDGPU::S_SET_GPR_IDX_ON))
+ .add(*Idx)
+ .addImm(IdxMode);
SetOn->getOperand(3).setIsUndef();
} else {
unsigned Tmp = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
BuildMI(*MBB, I, DL, TII->get(AMDGPU::S_ADD_I32), Tmp)
- .addOperand(*Idx)
- .addImm(Offset);
+ .add(*Idx)
+ .addImm(Offset);
MachineInstr *SetOn =
BuildMI(*MBB, I, DL, TII->get(AMDGPU::S_SET_GPR_IDX_ON))
.addReg(Tmp, RegState::Kill)
@@ -1493,10 +1694,10 @@ static bool setM0ToIndexFromSGPR(const SIInstrInfo *TII,
if (Offset == 0) {
BuildMI(*MBB, I, DL, TII->get(AMDGPU::S_MOV_B32), AMDGPU::M0)
- .addOperand(*Idx);
+ .add(*Idx);
} else {
BuildMI(*MBB, I, DL, TII->get(AMDGPU::S_ADD_I32), AMDGPU::M0)
- .addOperand(*Idx)
+ .add(*Idx)
.addImm(Offset);
}
@@ -1522,7 +1723,7 @@ static MachineBasicBlock *emitIndirectSrc(MachineInstr &MI,
std::tie(SubReg, Offset)
= computeIndirectRegAndOffset(TRI, VecRC, SrcReg, Offset);
- bool UseGPRIdxMode = ST.hasVGPRIndexMode() && EnableVGPRIndexMode;
+ bool UseGPRIdxMode = ST.useVGPRIndexMode(EnableVGPRIndexMode);
if (setM0ToIndexFromSGPR(TII, MRI, MI, Offset, UseGPRIdxMode, true)) {
MachineBasicBlock::iterator I(&MI);
@@ -1548,7 +1749,6 @@ static MachineBasicBlock *emitIndirectSrc(MachineInstr &MI,
return &MBB;
}
-
const DebugLoc &DL = MI.getDebugLoc();
MachineBasicBlock::iterator I(&MI);
@@ -1625,7 +1825,7 @@ static MachineBasicBlock *emitIndirectDst(MachineInstr &MI,
std::tie(SubReg, Offset) = computeIndirectRegAndOffset(TRI, VecRC,
SrcVec->getReg(),
Offset);
- bool UseGPRIdxMode = ST.hasVGPRIndexMode() && EnableVGPRIndexMode;
+ bool UseGPRIdxMode = ST.useVGPRIndexMode(EnableVGPRIndexMode);
if (Idx->getReg() == AMDGPU::NoRegister) {
MachineBasicBlock::iterator I(&MI);
@@ -1634,9 +1834,9 @@ static MachineBasicBlock *emitIndirectDst(MachineInstr &MI,
assert(Offset == 0);
BuildMI(MBB, I, DL, TII->get(TargetOpcode::INSERT_SUBREG), Dst)
- .addOperand(*SrcVec)
- .addOperand(*Val)
- .addImm(SubReg);
+ .add(*SrcVec)
+ .add(*Val)
+ .addImm(SubReg);
MI.eraseFromParent();
return &MBB;
@@ -1648,11 +1848,11 @@ static MachineBasicBlock *emitIndirectDst(MachineInstr &MI,
if (UseGPRIdxMode) {
BuildMI(MBB, I, DL, TII->get(AMDGPU::V_MOV_B32_indirect))
- .addReg(SrcVec->getReg(), RegState::Undef, SubReg) // vdst
- .addOperand(*Val)
- .addReg(Dst, RegState::ImplicitDefine)
- .addReg(SrcVec->getReg(), RegState::Implicit)
- .addReg(AMDGPU::M0, RegState::Implicit);
+ .addReg(SrcVec->getReg(), RegState::Undef, SubReg) // vdst
+ .add(*Val)
+ .addReg(Dst, RegState::ImplicitDefine)
+ .addReg(SrcVec->getReg(), RegState::Implicit)
+ .addReg(AMDGPU::M0, RegState::Implicit);
BuildMI(MBB, I, DL, TII->get(AMDGPU::S_SET_GPR_IDX_OFF));
} else {
@@ -1661,7 +1861,7 @@ static MachineBasicBlock *emitIndirectDst(MachineInstr &MI,
BuildMI(MBB, I, DL, MovRelDesc)
.addReg(Dst, RegState::Define)
.addReg(SrcVec->getReg())
- .addOperand(*Val)
+ .add(*Val)
.addImm(SubReg - AMDGPU::sub0);
}
@@ -1694,18 +1894,18 @@ static MachineBasicBlock *emitIndirectDst(MachineInstr &MI,
if (UseGPRIdxMode) {
BuildMI(*LoopBB, InsPt, DL, TII->get(AMDGPU::V_MOV_B32_indirect))
- .addReg(PhiReg, RegState::Undef, SubReg) // vdst
- .addOperand(*Val) // src0
- .addReg(Dst, RegState::ImplicitDefine)
- .addReg(PhiReg, RegState::Implicit)
- .addReg(AMDGPU::M0, RegState::Implicit);
+ .addReg(PhiReg, RegState::Undef, SubReg) // vdst
+ .add(*Val) // src0
+ .addReg(Dst, RegState::ImplicitDefine)
+ .addReg(PhiReg, RegState::Implicit)
+ .addReg(AMDGPU::M0, RegState::Implicit);
} else {
const MCInstrDesc &MovRelDesc = TII->get(getMOVRELDPseudo(VecRC));
BuildMI(*LoopBB, InsPt, DL, MovRelDesc)
.addReg(Dst, RegState::Define)
.addReg(PhiReg)
- .addOperand(*Val)
+ .add(*Val)
.addImm(SubReg - AMDGPU::sub0);
}
@@ -1741,18 +1941,62 @@ MachineBasicBlock *SITargetLowering::EmitInstrWithCustomInserter(
}
switch (MI.getOpcode()) {
- case AMDGPU::SI_INIT_M0: {
+ case AMDGPU::S_TRAP_PSEUDO: {
+ const DebugLoc &DL = MI.getDebugLoc();
+ const int TrapType = MI.getOperand(0).getImm();
+
+ if (Subtarget->getTrapHandlerAbi() == SISubtarget::TrapHandlerAbiHsa &&
+ Subtarget->isTrapHandlerEnabled()) {
+
+ MachineFunction *MF = BB->getParent();
+ SIMachineFunctionInfo *Info = MF->getInfo<SIMachineFunctionInfo>();
+ unsigned UserSGPR = Info->getQueuePtrUserSGPR();
+ assert(UserSGPR != AMDGPU::NoRegister);
+
+ if (!BB->isLiveIn(UserSGPR))
+ BB->addLiveIn(UserSGPR);
+
+ BuildMI(*BB, MI, DL, TII->get(AMDGPU::COPY), AMDGPU::SGPR0_SGPR1)
+ .addReg(UserSGPR);
+ BuildMI(*BB, MI, DL, TII->get(AMDGPU::S_TRAP))
+ .addImm(TrapType)
+ .addReg(AMDGPU::SGPR0_SGPR1, RegState::Implicit);
+ } else {
+ switch (TrapType) {
+ case SISubtarget::TrapIDLLVMTrap:
+ BuildMI(*BB, MI, DL, TII->get(AMDGPU::S_ENDPGM));
+ break;
+ case SISubtarget::TrapIDLLVMDebugTrap: {
+ DiagnosticInfoUnsupported NoTrap(*MF->getFunction(),
+ "debugtrap handler not supported",
+ DL,
+ DS_Warning);
+ LLVMContext &C = MF->getFunction()->getContext();
+ C.diagnose(NoTrap);
+ BuildMI(*BB, MI, DL, TII->get(AMDGPU::S_NOP))
+ .addImm(0);
+ break;
+ }
+ default:
+ llvm_unreachable("unsupported trap handler type!");
+ }
+ }
+
+ MI.eraseFromParent();
+ return BB;
+ }
+ case AMDGPU::SI_INIT_M0:
BuildMI(*BB, MI.getIterator(), MI.getDebugLoc(),
TII->get(AMDGPU::S_MOV_B32), AMDGPU::M0)
- .addOperand(MI.getOperand(0));
+ .add(MI.getOperand(0));
MI.eraseFromParent();
return BB;
- }
+
case AMDGPU::GET_GROUPSTATICSIZE: {
DebugLoc DL = MI.getDebugLoc();
BuildMI(*BB, MI, DL, TII->get(AMDGPU::S_MOV_B32))
- .addOperand(MI.getOperand(0))
- .addImm(MFI->getLDSSize());
+ .add(MI.getOperand(0))
+ .addImm(MFI->getLDSSize());
MI.eraseFromParent();
return BB;
}
@@ -1803,7 +2047,7 @@ MachineBasicBlock *SITargetLowering::EmitInstrWithCustomInserter(
const SIInstrInfo *TII = getSubtarget()->getInstrInfo();
const DebugLoc &DL = MI.getDebugLoc();
MachineInstr *Br = BuildMI(*BB, MI, DL, TII->get(AMDGPU::S_CBRANCH_SCC1))
- .addOperand(MI.getOperand(0));
+ .add(MI.getOperand(0));
Br->getOperand(1).setIsUndef(true); // read undef SCC
MI.eraseFromParent();
return BB;
@@ -1856,9 +2100,6 @@ MVT SITargetLowering::getScalarShiftAmountTy(const DataLayout &, EVT VT) const {
bool SITargetLowering::isFMAFasterThanFMulAndFAdd(EVT VT) const {
VT = VT.getScalarType();
- if (!VT.isSimple())
- return false;
-
switch (VT.getSimpleVT().SimpleTy) {
case MVT::f32:
// This is as fast on some subtargets. However, we always have full rate f32
@@ -1909,13 +2150,52 @@ SDValue SITargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const {
case ISD::INTRINSIC_W_CHAIN: return LowerINTRINSIC_W_CHAIN(Op, DAG);
case ISD::INTRINSIC_VOID: return LowerINTRINSIC_VOID(Op, DAG);
case ISD::ADDRSPACECAST: return lowerADDRSPACECAST(Op, DAG);
- case ISD::TRAP: return lowerTRAP(Op, DAG);
+ case ISD::INSERT_VECTOR_ELT:
+ return lowerINSERT_VECTOR_ELT(Op, DAG);
+ case ISD::EXTRACT_VECTOR_ELT:
+ return lowerEXTRACT_VECTOR_ELT(Op, DAG);
case ISD::FP_ROUND:
return lowerFP_ROUND(Op, DAG);
}
return SDValue();
}
+void SITargetLowering::ReplaceNodeResults(SDNode *N,
+ SmallVectorImpl<SDValue> &Results,
+ SelectionDAG &DAG) const {
+ switch (N->getOpcode()) {
+ case ISD::INSERT_VECTOR_ELT: {
+ if (SDValue Res = lowerINSERT_VECTOR_ELT(SDValue(N, 0), DAG))
+ Results.push_back(Res);
+ return;
+ }
+ case ISD::EXTRACT_VECTOR_ELT: {
+ if (SDValue Res = lowerEXTRACT_VECTOR_ELT(SDValue(N, 0), DAG))
+ Results.push_back(Res);
+ return;
+ }
+ case ISD::INTRINSIC_WO_CHAIN: {
+ unsigned IID = cast<ConstantSDNode>(N->getOperand(0))->getZExtValue();
+ switch (IID) {
+ case Intrinsic::amdgcn_cvt_pkrtz: {
+ SDValue Src0 = N->getOperand(1);
+ SDValue Src1 = N->getOperand(2);
+ SDLoc SL(N);
+ SDValue Cvt = DAG.getNode(AMDGPUISD::CVT_PKRTZ_F16_F32, SL, MVT::i32,
+ Src0, Src1);
+
+ Results.push_back(DAG.getNode(ISD::BITCAST, SL, MVT::v2f16, Cvt));
+ return;
+ }
+ default:
+ break;
+ }
+ }
+ default:
+ break;
+ }
+}
+
/// \brief Helper function for LowerBRCOND
static SDNode *findUser(SDValue Value, unsigned Opcode) {
@@ -1932,31 +2212,25 @@ static SDNode *findUser(SDValue Value, unsigned Opcode) {
return nullptr;
}
-bool SITargetLowering::isCFIntrinsic(const SDNode *Intr) const {
+unsigned SITargetLowering::isCFIntrinsic(const SDNode *Intr) const {
if (Intr->getOpcode() == ISD::INTRINSIC_W_CHAIN) {
switch (cast<ConstantSDNode>(Intr->getOperand(1))->getZExtValue()) {
- case AMDGPUIntrinsic::amdgcn_if:
- case AMDGPUIntrinsic::amdgcn_else:
- case AMDGPUIntrinsic::amdgcn_end_cf:
- case AMDGPUIntrinsic::amdgcn_loop:
- return true;
+ case Intrinsic::amdgcn_if:
+ return AMDGPUISD::IF;
+ case Intrinsic::amdgcn_else:
+ return AMDGPUISD::ELSE;
+ case Intrinsic::amdgcn_loop:
+ return AMDGPUISD::LOOP;
+ case Intrinsic::amdgcn_end_cf:
+ llvm_unreachable("should not occur");
default:
- return false;
+ return 0;
}
}
- if (Intr->getOpcode() == ISD::INTRINSIC_WO_CHAIN) {
- switch (cast<ConstantSDNode>(Intr->getOperand(0))->getZExtValue()) {
- case AMDGPUIntrinsic::amdgcn_break:
- case AMDGPUIntrinsic::amdgcn_if_break:
- case AMDGPUIntrinsic::amdgcn_else_break:
- return true;
- default:
- return false;
- }
- }
-
- return false;
+ // break, if_break, else_break are all only used as inputs to loop, not
+ // directly as branch conditions.
+ return 0;
}
void SITargetLowering::createDebuggerPrologueStackObjects(
@@ -1987,13 +2261,13 @@ void SITargetLowering::createDebuggerPrologueStackObjects(
bool SITargetLowering::shouldEmitFixup(const GlobalValue *GV) const {
const Triple &TT = getTargetMachine().getTargetTriple();
- return GV->getType()->getAddressSpace() == AMDGPUAS::CONSTANT_ADDRESS &&
+ return GV->getType()->getAddressSpace() == AMDGPUASI.CONSTANT_ADDRESS &&
AMDGPU::shouldEmitConstantsToTextSection(TT);
}
bool SITargetLowering::shouldEmitGOTReloc(const GlobalValue *GV) const {
- return (GV->getType()->getAddressSpace() == AMDGPUAS::GLOBAL_ADDRESS ||
- GV->getType()->getAddressSpace() == AMDGPUAS::CONSTANT_ADDRESS) &&
+ return (GV->getType()->getAddressSpace() == AMDGPUASI.GLOBAL_ADDRESS ||
+ GV->getType()->getAddressSpace() == AMDGPUASI.CONSTANT_ADDRESS) &&
!shouldEmitFixup(GV) &&
!getTargetMachine().shouldAssumeDSOLocal(*GV->getParent(), GV);
}
@@ -2006,7 +2280,6 @@ bool SITargetLowering::shouldEmitPCReloc(const GlobalValue *GV) const {
/// last parameter, also switches branch target with BR if the need arises
SDValue SITargetLowering::LowerBRCOND(SDValue BRCOND,
SelectionDAG &DAG) const {
-
SDLoc DL(BRCOND);
SDNode *Intr = BRCOND.getOperand(1).getNode();
@@ -2032,7 +2305,8 @@ SDValue SITargetLowering::LowerBRCOND(SDValue BRCOND,
// eg: i1,ch = llvm.amdgcn.loop t0, TargetConstant:i32<6271>, t3
// => t9: ch = llvm.amdgcn.loop t0, TargetConstant:i32<6271>, t3, BasicBlock:ch<bb1 0x7fee5286d088>
- if (!isCFIntrinsic(Intr)) {
+ unsigned CFNode = isCFIntrinsic(Intr);
+ if (CFNode == 0) {
// This is a uniform branch so we don't need to legalize.
return BRCOND;
}
@@ -2050,15 +2324,13 @@ SDValue SITargetLowering::LowerBRCOND(SDValue BRCOND,
if (HaveChain)
Ops.push_back(BRCOND.getOperand(0));
- Ops.append(Intr->op_begin() + (HaveChain ? 1 : 0), Intr->op_end());
+ Ops.append(Intr->op_begin() + (HaveChain ? 2 : 1), Intr->op_end());
Ops.push_back(Target);
ArrayRef<EVT> Res(Intr->value_begin() + 1, Intr->value_end());
// build the new intrinsic call
- SDNode *Result = DAG.getNode(
- Res.size() > 1 ? ISD::INTRINSIC_W_CHAIN : ISD::INTRINSIC_VOID, DL,
- DAG.getVTList(Res), Ops).getNode();
+ SDNode *Result = DAG.getNode(CFNode, DL, DAG.getVTList(Res), Ops).getNode();
if (!HaveChain) {
SDValue Ops[] = {
@@ -2130,9 +2402,28 @@ SDValue SITargetLowering::lowerFP_ROUND(SDValue Op, SelectionDAG &DAG) const {
return DAG.getNode(ISD::BITCAST, DL, MVT::f16, Trunc);
}
-SDValue SITargetLowering::getSegmentAperture(unsigned AS,
+SDValue SITargetLowering::getSegmentAperture(unsigned AS, const SDLoc &DL,
SelectionDAG &DAG) const {
- SDLoc SL;
+ // FIXME: Use inline constants (src_{shared, private}_base) instead.
+ if (Subtarget->hasApertureRegs()) {
+ unsigned Offset = AS == AMDGPUASI.LOCAL_ADDRESS ?
+ AMDGPU::Hwreg::OFFSET_SRC_SHARED_BASE :
+ AMDGPU::Hwreg::OFFSET_SRC_PRIVATE_BASE;
+ unsigned WidthM1 = AS == AMDGPUASI.LOCAL_ADDRESS ?
+ AMDGPU::Hwreg::WIDTH_M1_SRC_SHARED_BASE :
+ AMDGPU::Hwreg::WIDTH_M1_SRC_PRIVATE_BASE;
+ unsigned Encoding =
+ AMDGPU::Hwreg::ID_MEM_BASES << AMDGPU::Hwreg::ID_SHIFT_ |
+ Offset << AMDGPU::Hwreg::OFFSET_SHIFT_ |
+ WidthM1 << AMDGPU::Hwreg::WIDTH_M1_SHIFT_;
+
+ SDValue EncodingImm = DAG.getTargetConstant(Encoding, DL, MVT::i16);
+ SDValue ApertureReg = SDValue(
+ DAG.getMachineNode(AMDGPU::S_GETREG_B32, DL, MVT::i32, EncodingImm), 0);
+ SDValue ShiftAmount = DAG.getTargetConstant(WidthM1 + 1, DL, MVT::i32);
+ return DAG.getNode(ISD::SHL, DL, MVT::i32, ApertureReg, ShiftAmount);
+ }
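
As a sketch of how the s_getreg_b32 immediate above is assembled; the shift amounts 0/6/11 are assumptions for illustration, and the authoritative constants live in AMDGPU::Hwreg.

#include <cstdint>

// Pack hwreg(id, offset, width-1) into the 16-bit s_getreg_b32 immediate.
constexpr uint32_t packHwreg(uint32_t Id, uint32_t Offset, uint32_t WidthM1) {
  return (Id << 0) | (Offset << 6) | (WidthM1 << 11); // assumed field layout
}
// With a 16-bit field (WidthM1 == 15), the code above then shifts the value
// read left by WidthM1 + 1 == 16, placing the aperture base in the high half.
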
+
MachineFunction &MF = DAG.getMachineFunction();
SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
unsigned UserSGPR = Info->getQueuePtrUserSGPR();
@@ -2143,19 +2434,19 @@ SDValue SITargetLowering::getSegmentAperture(unsigned AS,
// Offset into amd_queue_t for group_segment_aperture_base_hi /
// private_segment_aperture_base_hi.
- uint32_t StructOffset = (AS == AMDGPUAS::LOCAL_ADDRESS) ? 0x40 : 0x44;
+ uint32_t StructOffset = (AS == AMDGPUASI.LOCAL_ADDRESS) ? 0x40 : 0x44;
- SDValue Ptr = DAG.getNode(ISD::ADD, SL, MVT::i64, QueuePtr,
- DAG.getConstant(StructOffset, SL, MVT::i64));
+ SDValue Ptr = DAG.getNode(ISD::ADD, DL, MVT::i64, QueuePtr,
+ DAG.getConstant(StructOffset, DL, MVT::i64));
// TODO: Use custom target PseudoSourceValue.
// TODO: We should use the value from the IR intrinsic call, but it might not
// be available, and how would we get it?
Value *V = UndefValue::get(PointerType::get(Type::getInt8Ty(*DAG.getContext()),
- AMDGPUAS::CONSTANT_ADDRESS));
+ AMDGPUASI.CONSTANT_ADDRESS));
MachinePointerInfo PtrInfo(V, StructOffset);
- return DAG.getLoad(MVT::i32, SL, QueuePtr.getValue(1), Ptr, PtrInfo,
+ return DAG.getLoad(MVT::i32, DL, QueuePtr.getValue(1), Ptr, PtrInfo,
MinAlign(64, StructOffset),
MachineMemOperand::MODereferenceable |
MachineMemOperand::MOInvariant);
@@ -2167,15 +2458,19 @@ SDValue SITargetLowering::lowerADDRSPACECAST(SDValue Op,
const AddrSpaceCastSDNode *ASC = cast<AddrSpaceCastSDNode>(Op);
SDValue Src = ASC->getOperand(0);
-
- // FIXME: Really support non-0 null pointers.
- SDValue SegmentNullPtr = DAG.getConstant(-1, SL, MVT::i32);
SDValue FlatNullPtr = DAG.getConstant(0, SL, MVT::i64);
+ const AMDGPUTargetMachine &TM =
+ static_cast<const AMDGPUTargetMachine &>(getTargetMachine());
+
// flat -> local/private
- if (ASC->getSrcAddressSpace() == AMDGPUAS::FLAT_ADDRESS) {
- if (ASC->getDestAddressSpace() == AMDGPUAS::LOCAL_ADDRESS ||
- ASC->getDestAddressSpace() == AMDGPUAS::PRIVATE_ADDRESS) {
+ if (ASC->getSrcAddressSpace() == AMDGPUASI.FLAT_ADDRESS) {
+ unsigned DestAS = ASC->getDestAddressSpace();
+
+ if (DestAS == AMDGPUASI.LOCAL_ADDRESS ||
+ DestAS == AMDGPUASI.PRIVATE_ADDRESS) {
+ unsigned NullVal = TM.getNullPointerValue(DestAS);
+ SDValue SegmentNullPtr = DAG.getConstant(NullVal, SL, MVT::i32);
SDValue NonNull = DAG.getSetCC(SL, MVT::i1, Src, FlatNullPtr, ISD::SETNE);
SDValue Ptr = DAG.getNode(ISD::TRUNCATE, SL, MVT::i32, Src);
@@ -2185,13 +2480,18 @@ SDValue SITargetLowering::lowerADDRSPACECAST(SDValue Op,
}
// local/private -> flat
- if (ASC->getDestAddressSpace() == AMDGPUAS::FLAT_ADDRESS) {
- if (ASC->getSrcAddressSpace() == AMDGPUAS::LOCAL_ADDRESS ||
- ASC->getSrcAddressSpace() == AMDGPUAS::PRIVATE_ADDRESS) {
+ if (ASC->getDestAddressSpace() == AMDGPUASI.FLAT_ADDRESS) {
+ unsigned SrcAS = ASC->getSrcAddressSpace();
+
+ if (SrcAS == AMDGPUASI.LOCAL_ADDRESS ||
+ SrcAS == AMDGPUASI.PRIVATE_ADDRESS) {
+ unsigned NullVal = TM.getNullPointerValue(SrcAS);
+ SDValue SegmentNullPtr = DAG.getConstant(NullVal, SL, MVT::i32);
+
SDValue NonNull
= DAG.getSetCC(SL, MVT::i1, Src, SegmentNullPtr, ISD::SETNE);
- SDValue Aperture = getSegmentAperture(ASC->getSrcAddressSpace(), DAG);
+ SDValue Aperture = getSegmentAperture(ASC->getSrcAddressSpace(), SL, DAG);
SDValue CvtPtr
= DAG.getNode(ISD::BUILD_VECTOR, SL, MVT::v2i32, Src, Aperture);
@@ -2211,17 +2511,88 @@ SDValue SITargetLowering::lowerADDRSPACECAST(SDValue Op,
return DAG.getUNDEF(ASC->getValueType(0));
}
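
A scalar model of the flat -> segment branch above, assuming 64-bit flat and 32-bit segment pointers; the helper is illustrative, and the per-address-space null comes from TM.getNullPointerValue in the patch.

#include <cstdint>

static uint32_t castFlatToSegment(uint64_t Flat, uint32_t SegmentNull) {
  // Non-null flat pointers truncate to the segment offset; flat null (0)
  // maps to the segment's own null value.
  return Flat != 0 ? uint32_t(Flat) : SegmentNull;
}
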
+SDValue SITargetLowering::lowerINSERT_VECTOR_ELT(SDValue Op,
+ SelectionDAG &DAG) const {
+ SDValue Idx = Op.getOperand(2);
+ if (isa<ConstantSDNode>(Idx))
+ return SDValue();
+
+ // Avoid stack access for dynamic indexing.
+ SDLoc SL(Op);
+ SDValue Vec = Op.getOperand(0);
+ SDValue Val = DAG.getNode(ISD::BITCAST, SL, MVT::i16, Op.getOperand(1));
+
+ // v_bfi_b32 (v_bfm_b32 16, (shl idx, 16)), val, vec
+ SDValue ExtVal = DAG.getNode(ISD::ZERO_EXTEND, SL, MVT::i32, Val);
+
+ // Convert vector index to bit-index.
+ SDValue ScaledIdx = DAG.getNode(ISD::SHL, SL, MVT::i32, Idx,
+ DAG.getConstant(16, SL, MVT::i32));
+
+ SDValue BCVec = DAG.getNode(ISD::BITCAST, SL, MVT::i32, Vec);
+
+ SDValue BFM = DAG.getNode(ISD::SHL, SL, MVT::i32,
+ DAG.getConstant(0xffff, SL, MVT::i32),
+ ScaledIdx);
+
+ SDValue LHS = DAG.getNode(ISD::AND, SL, MVT::i32, BFM, ExtVal);
+ SDValue RHS = DAG.getNode(ISD::AND, SL, MVT::i32,
+ DAG.getNOT(SL, BFM, MVT::i32), BCVec);
+
+ SDValue BFI = DAG.getNode(ISD::OR, SL, MVT::i32, LHS, RHS);
+ return DAG.getNode(ISD::BITCAST, SL, Op.getValueType(), BFI);
+}
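
A scalar model of the v_bfm/v_bfi selection the DAG nodes above are intended to produce (illustrative only; Idx is 0 or 1 for a packed v2i16/v2f16):

#include <cstdint>

static uint32_t insertHalf(uint32_t Vec, uint16_t Val, uint32_t Idx) {
  uint32_t BitIdx = Idx * 16;             // vector index -> bit index
  uint32_t Mask = 0xffffu << BitIdx;      // the v_bfm_b32 result
  uint32_t Ext = uint32_t(Val) << BitIdx; // element moved to its slot
  return (Mask & Ext) | (~Mask & Vec);    // the v_bfi_b32 blend
}
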
+
+SDValue SITargetLowering::lowerEXTRACT_VECTOR_ELT(SDValue Op,
+ SelectionDAG &DAG) const {
+ SDLoc SL(Op);
+
+ EVT ResultVT = Op.getValueType();
+ SDValue Vec = Op.getOperand(0);
+ SDValue Idx = Op.getOperand(1);
+
+ if (const ConstantSDNode *CIdx = dyn_cast<ConstantSDNode>(Idx)) {
+ SDValue Result = DAG.getNode(ISD::BITCAST, SL, MVT::i32, Vec);
+
+ if (CIdx->getZExtValue() == 1) {
+ Result = DAG.getNode(ISD::SRL, SL, MVT::i32, Result,
+ DAG.getConstant(16, SL, MVT::i32));
+ } else {
+ assert(CIdx->getZExtValue() == 0);
+ }
+
+ if (ResultVT.bitsLT(MVT::i32))
+ Result = DAG.getNode(ISD::TRUNCATE, SL, MVT::i16, Result);
+ return DAG.getNode(ISD::BITCAST, SL, ResultVT, Result);
+ }
+
+ SDValue Sixteen = DAG.getConstant(16, SL, MVT::i32);
+
+ // Convert vector index to bit-index.
+ SDValue ScaledIdx = DAG.getNode(ISD::SHL, SL, MVT::i32, Idx, Sixteen);
+
+ SDValue BC = DAG.getNode(ISD::BITCAST, SL, MVT::i32, Vec);
+ SDValue Elt = DAG.getNode(ISD::SRL, SL, MVT::i32, BC, ScaledIdx);
+
+ SDValue Result = Elt;
+ if (ResultVT.bitsLT(MVT::i32))
+ Result = DAG.getNode(ISD::TRUNCATE, SL, MVT::i16, Result);
+
+ return DAG.getNode(ISD::BITCAST, SL, ResultVT, Result);
+}
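
And the matching scalar model for the dynamic extract above:

#include <cstdint>

static uint16_t extractHalf(uint32_t Vec, uint32_t Idx) {
  return uint16_t(Vec >> (Idx * 16)); // shift the element down, then truncate
}
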
+
bool
SITargetLowering::isOffsetFoldingLegal(const GlobalAddressSDNode *GA) const {
// We can fold offsets for anything that doesn't require a GOT relocation.
- return (GA->getAddressSpace() == AMDGPUAS::GLOBAL_ADDRESS ||
- GA->getAddressSpace() == AMDGPUAS::CONSTANT_ADDRESS) &&
+ return (GA->getAddressSpace() == AMDGPUASI.GLOBAL_ADDRESS ||
+ GA->getAddressSpace() == AMDGPUASI.CONSTANT_ADDRESS) &&
!shouldEmitGOTReloc(GA->getGlobal());
}
-static SDValue buildPCRelGlobalAddress(SelectionDAG &DAG, const GlobalValue *GV,
- SDLoc DL, unsigned Offset, EVT PtrVT,
- unsigned GAFlags = SIInstrInfo::MO_NONE) {
+static SDValue
+buildPCRelGlobalAddress(SelectionDAG &DAG, const GlobalValue *GV,
+ const SDLoc &DL, unsigned Offset, EVT PtrVT,
+ unsigned GAFlags = SIInstrInfo::MO_NONE) {
// In order to support pc-relative addressing, the PC_ADD_REL_OFFSET SDNode is
// lowered to the following code sequence:
//
@@ -2265,8 +2636,8 @@ SDValue SITargetLowering::LowerGlobalAddress(AMDGPUMachineFunction *MFI,
SelectionDAG &DAG) const {
GlobalAddressSDNode *GSD = cast<GlobalAddressSDNode>(Op);
- if (GSD->getAddressSpace() != AMDGPUAS::CONSTANT_ADDRESS &&
- GSD->getAddressSpace() != AMDGPUAS::GLOBAL_ADDRESS)
+ if (GSD->getAddressSpace() != AMDGPUASI.CONSTANT_ADDRESS &&
+ GSD->getAddressSpace() != AMDGPUASI.GLOBAL_ADDRESS)
return AMDGPUTargetLowering::LowerGlobalAddress(MFI, Op, DAG);
SDLoc DL(GSD);
@@ -2283,7 +2654,7 @@ SDValue SITargetLowering::LowerGlobalAddress(AMDGPUMachineFunction *MFI,
SIInstrInfo::MO_GOTPCREL32);
Type *Ty = PtrVT.getTypeForEVT(*DAG.getContext());
- PointerType *PtrTy = PointerType::get(Ty, AMDGPUAS::CONSTANT_ADDRESS);
+ PointerType *PtrTy = PointerType::get(Ty, AMDGPUASI.CONSTANT_ADDRESS);
const DataLayout &DataLayout = DAG.getDataLayout();
unsigned Align = DataLayout.getABITypeAlignment(PtrTy);
// FIXME: Use a PseudoSourceValue once those can be assigned an address space.
@@ -2294,23 +2665,6 @@ SDValue SITargetLowering::LowerGlobalAddress(AMDGPUMachineFunction *MFI,
MachineMemOperand::MOInvariant);
}
-SDValue SITargetLowering::lowerTRAP(SDValue Op,
- SelectionDAG &DAG) const {
- const MachineFunction &MF = DAG.getMachineFunction();
- DiagnosticInfoUnsupported NoTrap(*MF.getFunction(),
- "trap handler not supported",
- Op.getDebugLoc(),
- DS_Warning);
- DAG.getContext()->diagnose(NoTrap);
-
- // Emit s_endpgm.
-
- // FIXME: This should really be selected to s_trap, but that requires
- // setting up the trap handler for it o do anything.
- return DAG.getNode(AMDGPUISD::ENDPGM, SDLoc(Op), MVT::Other,
- Op.getOperand(0));
-}
-
SDValue SITargetLowering::copyToM0(SelectionDAG &DAG, SDValue Chain,
const SDLoc &DL, SDValue V) const {
// We can't use S_MOV_B32 directly, because there is no way to specify m0 as
@@ -2332,14 +2686,15 @@ SDValue SITargetLowering::lowerImplicitZextParam(SelectionDAG &DAG,
MVT VT,
unsigned Offset) const {
SDLoc SL(Op);
- SDValue Param = LowerParameter(DAG, MVT::i32, MVT::i32, SL,
- DAG.getEntryNode(), Offset, false);
+ SDValue Param = lowerKernargMemParameter(DAG, MVT::i32, MVT::i32, SL,
+ DAG.getEntryNode(), Offset, false);
// The local size values will have the hi 16-bits as zero.
return DAG.getNode(ISD::AssertZext, SL, MVT::i32, Param,
DAG.getValueType(VT));
}
-static SDValue emitNonHSAIntrinsicError(SelectionDAG& DAG, SDLoc DL, EVT VT) {
+static SDValue emitNonHSAIntrinsicError(SelectionDAG &DAG, const SDLoc &DL,
+ EVT VT) {
DiagnosticInfoUnsupported BadIntrin(*DAG.getMachineFunction().getFunction(),
"non-hsa intrinsic with hsa target",
DL.getDebugLoc());
@@ -2347,7 +2702,8 @@ static SDValue emitNonHSAIntrinsicError(SelectionDAG& DAG, SDLoc DL, EVT VT) {
return DAG.getUNDEF(VT);
}
-static SDValue emitRemovedIntrinsicError(SelectionDAG& DAG, SDLoc DL, EVT VT) {
+static SDValue emitRemovedIntrinsicError(SelectionDAG &DAG, const SDLoc &DL,
+ EVT VT) {
DiagnosticInfoUnsupported BadIntrin(*DAG.getMachineFunction().getFunction(),
"intrinsic not supported on subtarget",
DL.getDebugLoc());
@@ -2389,7 +2745,7 @@ SDValue SITargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op,
}
case Intrinsic::amdgcn_implicitarg_ptr: {
unsigned offset = getImplicitParameterOffset(MFI, FIRST_IMPLICIT);
- return LowerParameterPtr(DAG, DL, DAG.getEntryNode(), offset);
+ return lowerKernArgParameterPtr(DAG, DL, DAG.getEntryNode(), offset);
}
case Intrinsic::amdgcn_kernarg_segment_ptr: {
unsigned Reg
@@ -2403,19 +2759,16 @@ SDValue SITargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op,
case Intrinsic::amdgcn_rcp:
return DAG.getNode(AMDGPUISD::RCP, DL, VT, Op.getOperand(1));
case Intrinsic::amdgcn_rsq:
- case AMDGPUIntrinsic::AMDGPU_rsq: // Legacy name
return DAG.getNode(AMDGPUISD::RSQ, DL, VT, Op.getOperand(1));
- case Intrinsic::amdgcn_rsq_legacy: {
+ case Intrinsic::amdgcn_rsq_legacy:
if (Subtarget->getGeneration() >= SISubtarget::VOLCANIC_ISLANDS)
return emitRemovedIntrinsicError(DAG, DL, VT);
return DAG.getNode(AMDGPUISD::RSQ_LEGACY, DL, VT, Op.getOperand(1));
- }
- case Intrinsic::amdgcn_rcp_legacy: {
+ case Intrinsic::amdgcn_rcp_legacy:
if (Subtarget->getGeneration() >= SISubtarget::VOLCANIC_ISLANDS)
return emitRemovedIntrinsicError(DAG, DL, VT);
return DAG.getNode(AMDGPUISD::RCP_LEGACY, DL, VT, Op.getOperand(1));
- }
case Intrinsic::amdgcn_rsq_clamp: {
if (Subtarget->getGeneration() < SISubtarget::VOLCANIC_ISLANDS)
return DAG.getNode(AMDGPUISD::RSQ_CLAMP, DL, VT, Op.getOperand(1));
@@ -2434,38 +2787,38 @@ SDValue SITargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op,
if (Subtarget->isAmdHsaOS())
return emitNonHSAIntrinsicError(DAG, DL, VT);
- return LowerParameter(DAG, VT, VT, DL, DAG.getEntryNode(),
- SI::KernelInputOffsets::NGROUPS_X, false);
+ return lowerKernargMemParameter(DAG, VT, VT, DL, DAG.getEntryNode(),
+ SI::KernelInputOffsets::NGROUPS_X, false);
case Intrinsic::r600_read_ngroups_y:
if (Subtarget->isAmdHsaOS())
return emitNonHSAIntrinsicError(DAG, DL, VT);
- return LowerParameter(DAG, VT, VT, DL, DAG.getEntryNode(),
- SI::KernelInputOffsets::NGROUPS_Y, false);
+ return lowerKernargMemParameter(DAG, VT, VT, DL, DAG.getEntryNode(),
+ SI::KernelInputOffsets::NGROUPS_Y, false);
case Intrinsic::r600_read_ngroups_z:
if (Subtarget->isAmdHsaOS())
return emitNonHSAIntrinsicError(DAG, DL, VT);
- return LowerParameter(DAG, VT, VT, DL, DAG.getEntryNode(),
- SI::KernelInputOffsets::NGROUPS_Z, false);
+ return lowerKernargMemParameter(DAG, VT, VT, DL, DAG.getEntryNode(),
+ SI::KernelInputOffsets::NGROUPS_Z, false);
case Intrinsic::r600_read_global_size_x:
if (Subtarget->isAmdHsaOS())
return emitNonHSAIntrinsicError(DAG, DL, VT);
- return LowerParameter(DAG, VT, VT, DL, DAG.getEntryNode(),
- SI::KernelInputOffsets::GLOBAL_SIZE_X, false);
+ return lowerKernargMemParameter(DAG, VT, VT, DL, DAG.getEntryNode(),
+ SI::KernelInputOffsets::GLOBAL_SIZE_X, false);
case Intrinsic::r600_read_global_size_y:
if (Subtarget->isAmdHsaOS())
return emitNonHSAIntrinsicError(DAG, DL, VT);
- return LowerParameter(DAG, VT, VT, DL, DAG.getEntryNode(),
- SI::KernelInputOffsets::GLOBAL_SIZE_Y, false);
+ return lowerKernargMemParameter(DAG, VT, VT, DL, DAG.getEntryNode(),
+ SI::KernelInputOffsets::GLOBAL_SIZE_Y, false);
case Intrinsic::r600_read_global_size_z:
if (Subtarget->isAmdHsaOS())
return emitNonHSAIntrinsicError(DAG, DL, VT);
- return LowerParameter(DAG, VT, VT, DL, DAG.getEntryNode(),
- SI::KernelInputOffsets::GLOBAL_SIZE_Z, false);
+ return lowerKernargMemParameter(DAG, VT, VT, DL, DAG.getEntryNode(),
+ SI::KernelInputOffsets::GLOBAL_SIZE_Z, false);
case Intrinsic::r600_read_local_size_x:
if (Subtarget->isAmdHsaOS())
return emitNonHSAIntrinsicError(DAG, DL, VT);
@@ -2522,43 +2875,8 @@ SDValue SITargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op,
return DAG.getMemIntrinsicNode(AMDGPUISD::LOAD_CONSTANT, DL,
Op->getVTList(), Ops, VT, MMO);
}
- case AMDGPUIntrinsic::amdgcn_fdiv_fast: {
+ case Intrinsic::amdgcn_fdiv_fast:
return lowerFDIV_FAST(Op, DAG);
- }
- case AMDGPUIntrinsic::SI_vs_load_input:
- return DAG.getNode(AMDGPUISD::LOAD_INPUT, DL, VT,
- Op.getOperand(1),
- Op.getOperand(2),
- Op.getOperand(3));
-
- case AMDGPUIntrinsic::SI_fs_constant: {
- SDValue M0 = copyToM0(DAG, DAG.getEntryNode(), DL, Op.getOperand(3));
- SDValue Glue = M0.getValue(1);
- return DAG.getNode(AMDGPUISD::INTERP_MOV, DL, MVT::f32,
- DAG.getConstant(2, DL, MVT::i32), // P0
- Op.getOperand(1), Op.getOperand(2), Glue);
- }
- case AMDGPUIntrinsic::SI_packf16:
- if (Op.getOperand(1).isUndef() && Op.getOperand(2).isUndef())
- return DAG.getUNDEF(MVT::i32);
- return Op;
- case AMDGPUIntrinsic::SI_fs_interp: {
- SDValue IJ = Op.getOperand(4);
- SDValue I = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i32, IJ,
- DAG.getConstant(0, DL, MVT::i32));
- SDValue J = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i32, IJ,
- DAG.getConstant(1, DL, MVT::i32));
- I = DAG.getNode(ISD::BITCAST, DL, MVT::f32, I);
- J = DAG.getNode(ISD::BITCAST, DL, MVT::f32, J);
- SDValue M0 = copyToM0(DAG, DAG.getEntryNode(), DL, Op.getOperand(3));
- SDValue Glue = M0.getValue(1);
- SDValue P1 = DAG.getNode(AMDGPUISD::INTERP_P1, DL,
- DAG.getVTList(MVT::f32, MVT::Glue),
- I, Op.getOperand(1), Op.getOperand(2), Glue);
- Glue = SDValue(P1.getNode(), 1);
- return DAG.getNode(AMDGPUISD::INTERP_P2, DL, MVT::f32, P1, J,
- Op.getOperand(1), Op.getOperand(2), Glue);
- }
case Intrinsic::amdgcn_interp_mov: {
SDValue M0 = copyToM0(DAG, DAG.getEntryNode(), DL, Op.getOperand(4));
SDValue Glue = M0.getValue(1);
@@ -2639,10 +2957,12 @@ SDValue SITargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op,
}
case Intrinsic::amdgcn_icmp: {
const auto *CD = dyn_cast<ConstantSDNode>(Op.getOperand(3));
- int CondCode = CD->getSExtValue();
+ if (!CD)
+ return DAG.getUNDEF(VT);
+ int CondCode = CD->getSExtValue();
if (CondCode < ICmpInst::Predicate::FIRST_ICMP_PREDICATE ||
- CondCode >= ICmpInst::Predicate::BAD_ICMP_PREDICATE)
+ CondCode > ICmpInst::Predicate::LAST_ICMP_PREDICATE)
return DAG.getUNDEF(VT);
ICmpInst::Predicate IcInput = static_cast<ICmpInst::Predicate>(CondCode);
@@ -2652,10 +2972,12 @@ SDValue SITargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op,
}
case Intrinsic::amdgcn_fcmp: {
const auto *CD = dyn_cast<ConstantSDNode>(Op.getOperand(3));
- int CondCode = CD->getSExtValue();
+ if (!CD)
+ return DAG.getUNDEF(VT);
- if (CondCode <= FCmpInst::Predicate::FCMP_FALSE ||
- CondCode >= FCmpInst::Predicate::FCMP_TRUE)
+ int CondCode = CD->getSExtValue();
+ if (CondCode < FCmpInst::Predicate::FIRST_FCMP_PREDICATE ||
+ CondCode > FCmpInst::Predicate::LAST_FCMP_PREDICATE)
return DAG.getUNDEF(VT);
FCmpInst::Predicate IcInput = static_cast<FCmpInst::Predicate>(CondCode);
@@ -2663,14 +2985,29 @@ SDValue SITargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op,
return DAG.getNode(AMDGPUISD::SETCC, DL, VT, Op.getOperand(1),
Op.getOperand(2), DAG.getCondCode(CCOpcode));
}
+ case Intrinsic::amdgcn_fmed3:
+ return DAG.getNode(AMDGPUISD::FMED3, DL, VT,
+ Op.getOperand(1), Op.getOperand(2), Op.getOperand(3));
case Intrinsic::amdgcn_fmul_legacy:
return DAG.getNode(AMDGPUISD::FMUL_LEGACY, DL, VT,
Op.getOperand(1), Op.getOperand(2));
case Intrinsic::amdgcn_sffbh:
- case AMDGPUIntrinsic::AMDGPU_flbit_i32: // Legacy name.
return DAG.getNode(AMDGPUISD::FFBH_I32, DL, VT, Op.getOperand(1));
+ case Intrinsic::amdgcn_sbfe:
+ return DAG.getNode(AMDGPUISD::BFE_I32, DL, VT,
+ Op.getOperand(1), Op.getOperand(2), Op.getOperand(3));
+ case Intrinsic::amdgcn_ubfe:
+ return DAG.getNode(AMDGPUISD::BFE_U32, DL, VT,
+ Op.getOperand(1), Op.getOperand(2), Op.getOperand(3));
+ case Intrinsic::amdgcn_cvt_pkrtz: {
+ // FIXME: Stop adding cast if v2f16 legal.
+ EVT VT = Op.getValueType();
+ SDValue Node = DAG.getNode(AMDGPUISD::CVT_PKRTZ_F16_F32, DL, MVT::i32,
+ Op.getOperand(1), Op.getOperand(2));
+ return DAG.getNode(ISD::BITCAST, DL, VT, Node);
+ }
default:
- return AMDGPUTargetLowering::LowerOperation(Op, DAG);
+ return Op;
}
}
@@ -2718,6 +3055,64 @@ SDValue SITargetLowering::LowerINTRINSIC_W_CHAIN(SDValue Op,
return DAG.getMemIntrinsicNode(Opc, DL, Op->getVTList(), Ops, IntVT, MMO);
}
+ // Basic sample.
+ case Intrinsic::amdgcn_image_sample:
+ case Intrinsic::amdgcn_image_sample_cl:
+ case Intrinsic::amdgcn_image_sample_d:
+ case Intrinsic::amdgcn_image_sample_d_cl:
+ case Intrinsic::amdgcn_image_sample_l:
+ case Intrinsic::amdgcn_image_sample_b:
+ case Intrinsic::amdgcn_image_sample_b_cl:
+ case Intrinsic::amdgcn_image_sample_lz:
+ case Intrinsic::amdgcn_image_sample_cd:
+ case Intrinsic::amdgcn_image_sample_cd_cl:
+
+ // Sample with comparison.
+ case Intrinsic::amdgcn_image_sample_c:
+ case Intrinsic::amdgcn_image_sample_c_cl:
+ case Intrinsic::amdgcn_image_sample_c_d:
+ case Intrinsic::amdgcn_image_sample_c_d_cl:
+ case Intrinsic::amdgcn_image_sample_c_l:
+ case Intrinsic::amdgcn_image_sample_c_b:
+ case Intrinsic::amdgcn_image_sample_c_b_cl:
+ case Intrinsic::amdgcn_image_sample_c_lz:
+ case Intrinsic::amdgcn_image_sample_c_cd:
+ case Intrinsic::amdgcn_image_sample_c_cd_cl:
+
+ // Sample with offsets.
+ case Intrinsic::amdgcn_image_sample_o:
+ case Intrinsic::amdgcn_image_sample_cl_o:
+ case Intrinsic::amdgcn_image_sample_d_o:
+ case Intrinsic::amdgcn_image_sample_d_cl_o:
+ case Intrinsic::amdgcn_image_sample_l_o:
+ case Intrinsic::amdgcn_image_sample_b_o:
+ case Intrinsic::amdgcn_image_sample_b_cl_o:
+ case Intrinsic::amdgcn_image_sample_lz_o:
+ case Intrinsic::amdgcn_image_sample_cd_o:
+ case Intrinsic::amdgcn_image_sample_cd_cl_o:
+
+ // Sample with comparison and offsets.
+ case Intrinsic::amdgcn_image_sample_c_o:
+ case Intrinsic::amdgcn_image_sample_c_cl_o:
+ case Intrinsic::amdgcn_image_sample_c_d_o:
+ case Intrinsic::amdgcn_image_sample_c_d_cl_o:
+ case Intrinsic::amdgcn_image_sample_c_l_o:
+ case Intrinsic::amdgcn_image_sample_c_b_o:
+ case Intrinsic::amdgcn_image_sample_c_b_cl_o:
+ case Intrinsic::amdgcn_image_sample_c_lz_o:
+ case Intrinsic::amdgcn_image_sample_c_cd_o:
+ case Intrinsic::amdgcn_image_sample_c_cd_cl_o:
+
+ case Intrinsic::amdgcn_image_getlod: {
+ // Replace a dmask that has everything disabled with undef.
+ const ConstantSDNode *DMask = dyn_cast<ConstantSDNode>(Op.getOperand(5));
+ if (!DMask || DMask->isNullValue()) {
+ SDValue Undef = DAG.getUNDEF(Op.getValueType());
+ return DAG.getMergeValues({ Undef, Op.getOperand(0) }, SDLoc(Op));
+ }
+
+ return SDValue();
+ }
default:
return SDValue();
}
@@ -2731,17 +3126,60 @@ SDValue SITargetLowering::LowerINTRINSIC_VOID(SDValue Op,
unsigned IntrinsicID = cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue();
switch (IntrinsicID) {
- case AMDGPUIntrinsic::SI_sendmsg:
- case Intrinsic::amdgcn_s_sendmsg: {
- Chain = copyToM0(DAG, Chain, DL, Op.getOperand(3));
- SDValue Glue = Chain.getValue(1);
- return DAG.getNode(AMDGPUISD::SENDMSG, DL, MVT::Other, Chain,
- Op.getOperand(2), Glue);
+ case Intrinsic::amdgcn_exp: {
+ const ConstantSDNode *Tgt = cast<ConstantSDNode>(Op.getOperand(2));
+ const ConstantSDNode *En = cast<ConstantSDNode>(Op.getOperand(3));
+ const ConstantSDNode *Done = cast<ConstantSDNode>(Op.getOperand(8));
+ const ConstantSDNode *VM = cast<ConstantSDNode>(Op.getOperand(9));
+
+ const SDValue Ops[] = {
+ Chain,
+ DAG.getTargetConstant(Tgt->getZExtValue(), DL, MVT::i8), // tgt
+ DAG.getTargetConstant(En->getZExtValue(), DL, MVT::i8), // en
+ Op.getOperand(4), // src0
+ Op.getOperand(5), // src1
+ Op.getOperand(6), // src2
+ Op.getOperand(7), // src3
+ DAG.getTargetConstant(0, DL, MVT::i1), // compr
+ DAG.getTargetConstant(VM->getZExtValue(), DL, MVT::i1)
+ };
+
+ unsigned Opc = Done->isNullValue() ?
+ AMDGPUISD::EXPORT : AMDGPUISD::EXPORT_DONE;
+ return DAG.getNode(Opc, DL, Op->getVTList(), Ops);
+ }
+ case Intrinsic::amdgcn_exp_compr: {
+ const ConstantSDNode *Tgt = cast<ConstantSDNode>(Op.getOperand(2));
+ const ConstantSDNode *En = cast<ConstantSDNode>(Op.getOperand(3));
+ SDValue Src0 = Op.getOperand(4);
+ SDValue Src1 = Op.getOperand(5);
+ const ConstantSDNode *Done = cast<ConstantSDNode>(Op.getOperand(6));
+ const ConstantSDNode *VM = cast<ConstantSDNode>(Op.getOperand(7));
+
+ SDValue Undef = DAG.getUNDEF(MVT::f32);
+ const SDValue Ops[] = {
+ Chain,
+ DAG.getTargetConstant(Tgt->getZExtValue(), DL, MVT::i8), // tgt
+ DAG.getTargetConstant(En->getZExtValue(), DL, MVT::i8), // en
+ DAG.getNode(ISD::BITCAST, DL, MVT::f32, Src0),
+ DAG.getNode(ISD::BITCAST, DL, MVT::f32, Src1),
+ Undef, // src2
+ Undef, // src3
+ DAG.getTargetConstant(1, DL, MVT::i1), // compr
+ DAG.getTargetConstant(VM->getZExtValue(), DL, MVT::i1)
+ };
+
+ unsigned Opc = Done->isNullValue() ?
+ AMDGPUISD::EXPORT : AMDGPUISD::EXPORT_DONE;
+ return DAG.getNode(Opc, DL, Op->getVTList(), Ops);
}
+ case Intrinsic::amdgcn_s_sendmsg:
case Intrinsic::amdgcn_s_sendmsghalt: {
+ unsigned NodeOp = (IntrinsicID == Intrinsic::amdgcn_s_sendmsg) ?
+ AMDGPUISD::SENDMSG : AMDGPUISD::SENDMSGHALT;
Chain = copyToM0(DAG, Chain, DL, Op.getOperand(3));
SDValue Glue = Chain.getValue(1);
- return DAG.getNode(AMDGPUISD::SENDMSGHALT, DL, MVT::Other, Chain,
+ return DAG.getNode(NodeOp, DL, MVT::Other, Chain,
Op.getOperand(2), Glue);
}
case AMDGPUIntrinsic::SI_tbuffer_store: {
@@ -2784,31 +3222,19 @@ SDValue SITargetLowering::LowerINTRINSIC_VOID(SDValue Op,
SDValue Cast = DAG.getNode(ISD::BITCAST, DL, MVT::i32, Src);
return DAG.getNode(AMDGPUISD::KILL, DL, MVT::Other, Chain, Cast);
}
- case AMDGPUIntrinsic::SI_export: {
- const ConstantSDNode *En = cast<ConstantSDNode>(Op.getOperand(2));
- const ConstantSDNode *VM = cast<ConstantSDNode>(Op.getOperand(3));
- const ConstantSDNode *Done = cast<ConstantSDNode>(Op.getOperand(4));
- const ConstantSDNode *Tgt = cast<ConstantSDNode>(Op.getOperand(5));
- const ConstantSDNode *Compr = cast<ConstantSDNode>(Op.getOperand(6));
-
- const SDValue Ops[] = {
- Chain,
- DAG.getTargetConstant(En->getZExtValue(), DL, MVT::i8),
- DAG.getTargetConstant(VM->getZExtValue(), DL, MVT::i1),
- DAG.getTargetConstant(Tgt->getZExtValue(), DL, MVT::i8),
- DAG.getTargetConstant(Compr->getZExtValue(), DL, MVT::i1),
- Op.getOperand(7), // src0
- Op.getOperand(8), // src1
- Op.getOperand(9), // src2
- Op.getOperand(10) // src3
- };
-
- unsigned Opc = Done->isNullValue() ?
- AMDGPUISD::EXPORT : AMDGPUISD::EXPORT_DONE;
- return DAG.getNode(Opc, DL, Op->getVTList(), Ops);
- }
- default:
+ case Intrinsic::amdgcn_s_barrier: {
+ if (getTargetMachine().getOptLevel() > CodeGenOpt::None) {
+ const MachineFunction &MF = DAG.getMachineFunction();
+ const SISubtarget &ST = MF.getSubtarget<SISubtarget>();
+ unsigned WGSize = ST.getFlatWorkGroupSizes(*MF.getFunction()).second;
+ if (WGSize <= ST.getWavefrontSize())
+ return SDValue(DAG.getMachineNode(AMDGPU::WAVE_BARRIER, DL, MVT::Other,
+ Op.getOperand(0)), 0);
+ }
return SDValue();
+ }
+ default:
+ return Op;
}
}
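
The s_barrier case above rests on a simple size argument; a hypothetical predicate stating it:

// If the whole workgroup fits in one wavefront, its lanes already execute in
// lockstep, so s_barrier can degrade to a scheduling-only WAVE_BARRIER.
static bool barrierNeededForCorrectness(unsigned WorkGroupSize,
                                        unsigned WavefrontSize) {
  return WorkGroupSize > WavefrontSize;
}
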
@@ -2857,21 +3283,20 @@ SDValue SITargetLowering::LowerLOAD(SDValue Op, SelectionDAG &DAG) const {
SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
// If there is a possibility that flat instructions access scratch memory,
// then we need to use the same legalization rules we use for private.
- if (AS == AMDGPUAS::FLAT_ADDRESS)
+ if (AS == AMDGPUASI.FLAT_ADDRESS)
AS = MFI->hasFlatScratchInit() ?
- AMDGPUAS::PRIVATE_ADDRESS : AMDGPUAS::GLOBAL_ADDRESS;
+ AMDGPUASI.PRIVATE_ADDRESS : AMDGPUASI.GLOBAL_ADDRESS;
unsigned NumElements = MemVT.getVectorNumElements();
- switch (AS) {
- case AMDGPUAS::CONSTANT_ADDRESS:
+ if (AS == AMDGPUASI.CONSTANT_ADDRESS) {
if (isMemOpUniform(Load))
return SDValue();
// Non-uniform loads will be selected to MUBUF instructions, so they
// have the same legalization requirements as global and private
// loads.
//
- LLVM_FALLTHROUGH;
- case AMDGPUAS::GLOBAL_ADDRESS: {
+ }
+ if (AS == AMDGPUASI.CONSTANT_ADDRESS || AS == AMDGPUASI.GLOBAL_ADDRESS) {
if (Subtarget->getScalarizeGlobalBehavior() && isMemOpUniform(Load) &&
isMemOpHasNoClobberedMemOperand(Load))
return SDValue();
@@ -2880,13 +3305,14 @@ SDValue SITargetLowering::LowerLOAD(SDValue Op, SelectionDAG &DAG) const {
// loads.
//
}
- LLVM_FALLTHROUGH;
- case AMDGPUAS::FLAT_ADDRESS:
+ if (AS == AMDGPUASI.CONSTANT_ADDRESS || AS == AMDGPUASI.GLOBAL_ADDRESS ||
+ AS == AMDGPUASI.FLAT_ADDRESS) {
if (NumElements > 4)
return SplitVectorLoad(Op, DAG);
// v4 loads are supported for private and global memory.
return SDValue();
- case AMDGPUAS::PRIVATE_ADDRESS: {
+ }
+ if (AS == AMDGPUASI.PRIVATE_ADDRESS) {
// Depending on the setting of the private_element_size field in the
// resource descriptor, we can only make private accesses up to a certain
// size.
@@ -2905,8 +3331,7 @@ SDValue SITargetLowering::LowerLOAD(SDValue Op, SelectionDAG &DAG) const {
default:
llvm_unreachable("unsupported private_element_size");
}
- }
- case AMDGPUAS::LOCAL_ADDRESS: {
+ } else if (AS == AMDGPUASI.LOCAL_ADDRESS) {
if (NumElements > 2)
return SplitVectorLoad(Op, DAG);
@@ -2916,9 +3341,7 @@ SDValue SITargetLowering::LowerLOAD(SDValue Op, SelectionDAG &DAG) const {
// If properly aligned, if we split we might be able to use ds_read_b64.
return SplitVectorLoad(Op, DAG);
}
- default:
- return SDValue();
- }
+ return SDValue();
}
SDValue SITargetLowering::LowerSELECT(SDValue Op, SelectionDAG &DAG) const {
@@ -3287,18 +3710,17 @@ SDValue SITargetLowering::LowerSTORE(SDValue Op, SelectionDAG &DAG) const {
SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
// If there is a possibility that flat instructions access scratch memory,
// then we need to use the same legalization rules we use for private.
- if (AS == AMDGPUAS::FLAT_ADDRESS)
+ if (AS == AMDGPUASI.FLAT_ADDRESS)
AS = MFI->hasFlatScratchInit() ?
- AMDGPUAS::PRIVATE_ADDRESS : AMDGPUAS::GLOBAL_ADDRESS;
+ AMDGPUASI.PRIVATE_ADDRESS : AMDGPUASI.GLOBAL_ADDRESS;
unsigned NumElements = VT.getVectorNumElements();
- switch (AS) {
- case AMDGPUAS::GLOBAL_ADDRESS:
- case AMDGPUAS::FLAT_ADDRESS:
+ if (AS == AMDGPUASI.GLOBAL_ADDRESS ||
+ AS == AMDGPUASI.FLAT_ADDRESS) {
if (NumElements > 4)
return SplitVectorStore(Op, DAG);
return SDValue();
- case AMDGPUAS::PRIVATE_ADDRESS: {
+ } else if (AS == AMDGPUASI.PRIVATE_ADDRESS) {
switch (Subtarget->getMaxPrivateElementSize()) {
case 4:
return scalarizeVectorStore(Store, DAG);
@@ -3313,8 +3735,7 @@ SDValue SITargetLowering::LowerSTORE(SDValue Op, SelectionDAG &DAG) const {
default:
llvm_unreachable("unsupported private_element_size");
}
- }
- case AMDGPUAS::LOCAL_ADDRESS: {
+ } else if (AS == AMDGPUASI.LOCAL_ADDRESS) {
if (NumElements > 2)
return SplitVectorStore(Op, DAG);
@@ -3323,8 +3744,7 @@ SDValue SITargetLowering::LowerSTORE(SDValue Op, SelectionDAG &DAG) const {
// If properly aligned, if we split we might be able to use ds_write_b64.
return SplitVectorStore(Op, DAG);
- }
- default:
+ } else {
llvm_unreachable("unhandled address space");
}
}
@@ -3355,7 +3775,7 @@ SDValue SITargetLowering::LowerATOMIC_CMP_SWAP(SDValue Op, SelectionDAG &DAG) co
unsigned AS = AtomicNode->getAddressSpace();
// No custom lowering required for local address space
- if (!isFlatGlobalAddrSpace(AS))
+ if (!isFlatGlobalAddrSpace(AS, AMDGPUASI))
return Op;
// Non-local address space requires custom lowering for atomic compare
@@ -3412,12 +3832,12 @@ SDValue SITargetLowering::performUCharToFloatCombine(SDNode *N,
/// the immediate offsets of a memory instruction for the given address space.
static bool canFoldOffset(unsigned OffsetSize, unsigned AS,
const SISubtarget &STI) {
- switch (AS) {
- case AMDGPUAS::GLOBAL_ADDRESS: {
+ auto AMDGPUASI = STI.getAMDGPUAS();
+ if (AS == AMDGPUASI.GLOBAL_ADDRESS) {
// MUBUF instructions have a 12-bit offset in bytes.
return isUInt<12>(OffsetSize);
}
- case AMDGPUAS::CONSTANT_ADDRESS: {
+ if (AS == AMDGPUASI.CONSTANT_ADDRESS) {
// SMRD instructions have an 8-bit offset in dwords on SI and
// a 20-bit offset in bytes on VI.
if (STI.getGeneration() >= SISubtarget::VOLCANIC_ISLANDS)
@@ -3425,16 +3845,13 @@ static bool canFoldOffset(unsigned OffsetSize, unsigned AS,
else
return (OffsetSize % 4 == 0) && isUInt<8>(OffsetSize / 4);
}
- case AMDGPUAS::LOCAL_ADDRESS:
- case AMDGPUAS::REGION_ADDRESS: {
+ if (AS == AMDGPUASI.LOCAL_ADDRESS ||
+ AS == AMDGPUASI.REGION_ADDRESS) {
// The single offset versions have a 16-bit offset in bytes.
return isUInt<16>(OffsetSize);
}
- case AMDGPUAS::PRIVATE_ADDRESS:
// Indirect register addressing does not use any offsets.
- default:
- return 0;
- }
+ return false;
}
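The three encodings checked by canFoldOffset are easy to eyeball in isolation. A standalone restatement (helper names are mine) with the boundary values spelled out:

#include <cassert>
#include <cstdint>

// True iff X fits in N unsigned bits, mirroring llvm::isUInt<N>.
template <unsigned N> bool fitsUnsigned(uint64_t X) { return X < (1ull << N); }

bool mubufFoldable(uint64_t Off) { return fitsUnsigned<12>(Off); }  // bytes
bool smrdSIFoldable(uint64_t Off) {                                 // dwords on SI
  return (Off % 4 == 0) && fitsUnsigned<8>(Off / 4);
}
bool smrdVIFoldable(uint64_t Off) { return fitsUnsigned<20>(Off); } // bytes on VI
bool dsFoldable(uint64_t Off) { return fitsUnsigned<16>(Off); }     // bytes

int main() {
  assert(mubufFoldable(4095) && !mubufFoldable(4096));
  assert(smrdSIFoldable(1020) && !smrdSIFoldable(1021)); // 1021: not dword-aligned
  assert(dsFoldable(65535) && !dsFoldable(65536));
}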
// (shl (add x, c1), c2) -> add (shl x, c2), (shl c1, c2)
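The rewrite stated in this comment is plain distributivity in modular arithmetic, so it preserves the value for any fixed-width integers; what the combine actually has to check is profitability and legality, e.g. whether the shifted constant still folds into the addressing mode (compare canFoldOffset above). A quick standalone check of the identity:

#include <cassert>
#include <cstdint>

int main() {
  uint32_t X = 0x1234, C1 = 40, C2 = 3;
  // (shl (add x, c1), c2) == add (shl x, c2), (shl c1, c2), mod 2^32.
  assert(((X + C1) << C2) == ((X << C2) + (C1 << C2)));
}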
@@ -3492,7 +3909,7 @@ SDValue SITargetLowering::performMemSDNodeCombine(MemSDNode *N,
// TODO: We could also do this for multiplies.
unsigned AS = N->getAddressSpace();
- if (Ptr.getOpcode() == ISD::SHL && AS != AMDGPUAS::PRIVATE_ADDRESS) {
+ if (Ptr.getOpcode() == ISD::SHL && AS != AMDGPUASI.PRIVATE_ADDRESS) {
SDValue NewPtr = performSHLPtrCombine(Ptr.getNode(), AS, DCI);
if (NewPtr) {
SmallVector<SDValue, 8> NewOps(N->op_begin(), N->op_end());
@@ -3692,6 +4109,88 @@ SDValue SITargetLowering::performXorCombine(SDNode *N,
return SDValue();
}
+// Instructions that will be lowered with a final instruction that zeros the
+// high result bits.
+// XXX - probably only need to list legal operations.
+static bool fp16SrcZerosHighBits(unsigned Opc) {
+ switch (Opc) {
+ case ISD::FADD:
+ case ISD::FSUB:
+ case ISD::FMUL:
+ case ISD::FDIV:
+ case ISD::FREM:
+ case ISD::FMA:
+ case ISD::FMAD:
+ case ISD::FCANONICALIZE:
+ case ISD::FP_ROUND:
+ case ISD::UINT_TO_FP:
+ case ISD::SINT_TO_FP:
+ case ISD::FABS:
+ // Fabs is lowered to a bit operation, but it's an and which will clear the
+ // high bits anyway.
+ case ISD::FSQRT:
+ case ISD::FSIN:
+ case ISD::FCOS:
+ case ISD::FPOWI:
+ case ISD::FPOW:
+ case ISD::FLOG:
+ case ISD::FLOG2:
+ case ISD::FLOG10:
+ case ISD::FEXP:
+ case ISD::FEXP2:
+ case ISD::FCEIL:
+ case ISD::FTRUNC:
+ case ISD::FRINT:
+ case ISD::FNEARBYINT:
+ case ISD::FROUND:
+ case ISD::FFLOOR:
+ case ISD::FMINNUM:
+ case ISD::FMAXNUM:
+ case AMDGPUISD::FRACT:
+ case AMDGPUISD::CLAMP:
+ case AMDGPUISD::COS_HW:
+ case AMDGPUISD::SIN_HW:
+ case AMDGPUISD::FMIN3:
+ case AMDGPUISD::FMAX3:
+ case AMDGPUISD::FMED3:
+ case AMDGPUISD::FMAD_FTZ:
+ case AMDGPUISD::RCP:
+ case AMDGPUISD::RSQ:
+ case AMDGPUISD::LDEXP:
+ return true;
+ default:
+ // fcopysign, select and others may be lowered to 32-bit bit operations
+ // which don't zero the high bits.
+ return false;
+ }
+}
+
+SDValue SITargetLowering::performZeroExtendCombine(SDNode *N,
+ DAGCombinerInfo &DCI) const {
+ if (!Subtarget->has16BitInsts() ||
+ DCI.getDAGCombineLevel() < AfterLegalizeDAG)
+ return SDValue();
+
+ EVT VT = N->getValueType(0);
+ if (VT != MVT::i32)
+ return SDValue();
+
+ SDValue Src = N->getOperand(0);
+ if (Src.getValueType() != MVT::i16)
+ return SDValue();
+
+ // (i32 zext (i16 (bitcast f16:$src))) -> fp16_zext $src
+ // FIXME: It is not universally true that the high bits are zeroed on gfx9.
+ if (Src.getOpcode() == ISD::BITCAST) {
+ SDValue BCSrc = Src.getOperand(0);
+ if (BCSrc.getValueType() == MVT::f16 &&
+ fp16SrcZerosHighBits(BCSrc.getOpcode()))
+ return DCI.DAG.getNode(AMDGPUISD::FP16_ZEXT, SDLoc(N), VT, BCSrc);
+ }
+
+ return SDValue();
+}
+
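At the bit level, the claim behind FP16_ZEXT is simply that widening a 16-bit value whose producer already cleared the upper half costs nothing. A standalone illustration (the constant is just a sample half-precision pattern):

#include <cassert>
#include <cstdint>

int main() {
  uint16_t HalfBits = 0x3C00;  // IEEE-754 half-precision 1.0
  uint32_t Widened = HalfBits; // implicit zero-extension
  // The high 16 bits are zero, which is what FP16_ZEXT models the
  // instructions listed in fp16SrcZerosHighBits as already guaranteeing.
  assert(Widened == 0x00003C00u);
}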
SDValue SITargetLowering::performClassCombine(SDNode *N,
DAGCombinerInfo &DCI) const {
SelectionDAG &DAG = DCI.DAG;
@@ -3713,7 +4212,7 @@ SDValue SITargetLowering::performClassCombine(SDNode *N,
SDValue SITargetLowering::performFCanonicalizeCombine(
SDNode *N,
DAGCombinerInfo &DCI) const {
- ConstantFPSDNode *CFP = dyn_cast<ConstantFPSDNode>(N->getOperand(0));
+ ConstantFPSDNode *CFP = isConstOrConstSplatFP(N->getOperand(0));
if (!CFP)
return SDValue();
@@ -3723,13 +4222,14 @@ SDValue SITargetLowering::performFCanonicalizeCombine(
// Flush denormals to 0 if not enabled.
if (C.isDenormal()) {
EVT VT = N->getValueType(0);
- if (VT == MVT::f32 && !Subtarget->hasFP32Denormals())
+ EVT SVT = VT.getScalarType();
+ if (SVT == MVT::f32 && !Subtarget->hasFP32Denormals())
return DAG.getConstantFP(0.0, SDLoc(N), VT);
- if (VT == MVT::f64 && !Subtarget->hasFP64Denormals())
+ if (SVT == MVT::f64 && !Subtarget->hasFP64Denormals())
return DAG.getConstantFP(0.0, SDLoc(N), VT);
- if (VT == MVT::f16 && !Subtarget->hasFP16Denormals())
+ if (SVT == MVT::f16 && !Subtarget->hasFP16Denormals())
return DAG.getConstantFP(0.0, SDLoc(N), VT);
}
@@ -3749,7 +4249,7 @@ SDValue SITargetLowering::performFCanonicalizeCombine(
return DAG.getConstantFP(CanonicalQNaN, SDLoc(N), VT);
}
- return SDValue(CFP, 0);
+ return N->getOperand(0);
}
static unsigned minMaxOpcToMin3Max3Opc(unsigned Opc) {
@@ -3771,8 +4271,9 @@ static unsigned minMaxOpcToMin3Max3Opc(unsigned Opc) {
}
}
-static SDValue performIntMed3ImmCombine(SelectionDAG &DAG, const SDLoc &SL,
- SDValue Op0, SDValue Op1, bool Signed) {
+SDValue SITargetLowering::performIntMed3ImmCombine(
+ SelectionDAG &DAG, const SDLoc &SL,
+ SDValue Op0, SDValue Op1, bool Signed) const {
ConstantSDNode *K1 = dyn_cast<ConstantSDNode>(Op1);
if (!K1)
return SDValue();
@@ -3790,23 +4291,22 @@ static SDValue performIntMed3ImmCombine(SelectionDAG &DAG, const SDLoc &SL,
}
EVT VT = K0->getValueType(0);
+ unsigned Med3Opc = Signed ? AMDGPUISD::SMED3 : AMDGPUISD::UMED3;
+ if (VT == MVT::i32 || (VT == MVT::i16 && Subtarget->hasMed3_16())) {
+ return DAG.getNode(Med3Opc, SL, VT,
+ Op0.getOperand(0), SDValue(K0, 0), SDValue(K1, 0));
+ }
+ // If there isn't a 16-bit med3 operation, convert to 32-bit.
MVT NVT = MVT::i32;
unsigned ExtOp = Signed ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND;
- SDValue Tmp1, Tmp2, Tmp3;
- Tmp1 = DAG.getNode(ExtOp, SL, NVT, Op0->getOperand(0));
- Tmp2 = DAG.getNode(ExtOp, SL, NVT, Op0->getOperand(1));
- Tmp3 = DAG.getNode(ExtOp, SL, NVT, Op1);
-
- if (VT == MVT::i16) {
- Tmp1 = DAG.getNode(Signed ? AMDGPUISD::SMED3 : AMDGPUISD::UMED3, SL, NVT,
- Tmp1, Tmp2, Tmp3);
+ SDValue Tmp1 = DAG.getNode(ExtOp, SL, NVT, Op0->getOperand(0));
+ SDValue Tmp2 = DAG.getNode(ExtOp, SL, NVT, Op0->getOperand(1));
+ SDValue Tmp3 = DAG.getNode(ExtOp, SL, NVT, Op1);
- return DAG.getNode(ISD::TRUNCATE, SL, VT, Tmp1);
- } else
- return DAG.getNode(Signed ? AMDGPUISD::SMED3 : AMDGPUISD::UMED3, SL, VT,
- Op0.getOperand(0), SDValue(K0, 0), SDValue(K1, 0));
+ SDValue Med3 = DAG.getNode(Med3Opc, SL, NVT, Tmp1, Tmp2, Tmp3);
+ return DAG.getNode(ISD::TRUNCATE, SL, VT, Med3);
}
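The fold relies on the median of three values computing a clamp when two of them are the ordered bounds. A standalone check, with med3 composed from min/max in the usual way (my formulation, not the hardware's):

#include <algorithm>
#include <cassert>

int med3(int A, int B, int C) { // median of three values
  return std::max(std::min(A, B), std::min(std::max(A, B), C));
}

int main() {
  const int K0 = 0, K1 = 15; // clamp bounds, with K0 <= K1
  for (int X = -5; X <= 20; ++X)
    assert(med3(X, K0, K1) == std::min(std::max(X, K0), K1));
}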
static bool isKnownNeverSNan(SelectionDAG &DAG, SDValue Op) {
@@ -3816,8 +4316,10 @@ static bool isKnownNeverSNan(SelectionDAG &DAG, SDValue Op) {
return DAG.isKnownNeverNaN(Op);
}
-static SDValue performFPMed3ImmCombine(SelectionDAG &DAG, const SDLoc &SL,
- SDValue Op0, SDValue Op1) {
+SDValue SITargetLowering::performFPMed3ImmCombine(SelectionDAG &DAG,
+ const SDLoc &SL,
+ SDValue Op0,
+ SDValue Op1) const {
ConstantFPSDNode *K1 = dyn_cast<ConstantFPSDNode>(Op1);
if (!K1)
return SDValue();
@@ -3831,6 +4333,20 @@ static SDValue performFPMed3ImmCombine(SelectionDAG &DAG, const SDLoc &SL,
if (Cmp == APFloat::cmpGreaterThan)
return SDValue();
+ // TODO: Check IEEE bit enabled?
+ EVT VT = K0->getValueType(0);
+ if (Subtarget->enableDX10Clamp()) {
+ // If dx10_clamp is enabled, NaNs clamp to 0.0. This is the same as the
+ // hardware fmed3 behavior converting to a min.
+ // FIXME: Should this be allowing -0.0?
+ if (K1->isExactlyValue(1.0) && K0->isExactlyValue(0.0))
+ return DAG.getNode(AMDGPUISD::CLAMP, SL, VT, Op0.getOperand(0));
+ }
+
+ // med3 for f16 is only available on gfx9+.
+ if (VT == MVT::f64 || (VT == MVT::f16 && !Subtarget->hasMed3_16()))
+ return SDValue();
+
// This isn't safe with signaling NaNs because in IEEE mode, min/max on a
// signaling NaN gives a quiet NaN. The quiet NaN input to the min would then
// give the other result, which is different from med3 with a NaN input.
@@ -3846,6 +4362,7 @@ SDValue SITargetLowering::performMinMaxCombine(SDNode *N,
DAGCombinerInfo &DCI) const {
SelectionDAG &DAG = DCI.DAG;
+ EVT VT = N->getValueType(0);
unsigned Opc = N->getOpcode();
SDValue Op0 = N->getOperand(0);
SDValue Op1 = N->getOperand(1);
@@ -3853,7 +4370,9 @@ SDValue SITargetLowering::performMinMaxCombine(SDNode *N,
// Only do this if the inner op has one use since this will just increase
// register pressure for no benefit.
- if (Opc != AMDGPUISD::FMIN_LEGACY && Opc != AMDGPUISD::FMAX_LEGACY) {
+
+ if (Opc != AMDGPUISD::FMIN_LEGACY && Opc != AMDGPUISD::FMAX_LEGACY &&
+ VT != MVT::f64) {
// max(max(a, b), c) -> max3(a, b, c)
// min(min(a, b), c) -> min3(a, b, c)
if (Op0.getOpcode() == Opc && Op0.hasOneUse()) {
@@ -3895,7 +4414,9 @@ SDValue SITargetLowering::performMinMaxCombine(SDNode *N,
if (((Opc == ISD::FMINNUM && Op0.getOpcode() == ISD::FMAXNUM) ||
(Opc == AMDGPUISD::FMIN_LEGACY &&
Op0.getOpcode() == AMDGPUISD::FMAX_LEGACY)) &&
- N->getValueType(0) == MVT::f32 && Op0.hasOneUse()) {
+ (VT == MVT::f32 || VT == MVT::f64 ||
+ (VT == MVT::f16 && Subtarget->has16BitInsts())) &&
+ Op0.hasOneUse()) {
if (SDValue Res = performFPMed3ImmCombine(DAG, SDLoc(N), Op0, Op1))
return Res;
}
@@ -3903,6 +4424,69 @@ SDValue SITargetLowering::performMinMaxCombine(SDNode *N,
return SDValue();
}
+static bool isClampZeroToOne(SDValue A, SDValue B) {
+ if (ConstantFPSDNode *CA = dyn_cast<ConstantFPSDNode>(A)) {
+ if (ConstantFPSDNode *CB = dyn_cast<ConstantFPSDNode>(B)) {
+ // FIXME: Should this be allowing -0.0?
+ return (CA->isExactlyValue(0.0) && CB->isExactlyValue(1.0)) ||
+ (CA->isExactlyValue(1.0) && CB->isExactlyValue(0.0));
+ }
+ }
+
+ return false;
+}
+
+// FIXME: Should only worry about snans for version with chain.
+SDValue SITargetLowering::performFMed3Combine(SDNode *N,
+ DAGCombinerInfo &DCI) const {
+ EVT VT = N->getValueType(0);
+ // v_med3_f32 and v_max_f32 behave identically wrt denorms, exceptions and
+ // NaNs. With a NaN input, the order of the operands may change the result.
+
+ SelectionDAG &DAG = DCI.DAG;
+ SDLoc SL(N);
+
+ SDValue Src0 = N->getOperand(0);
+ SDValue Src1 = N->getOperand(1);
+ SDValue Src2 = N->getOperand(2);
+
+ if (isClampZeroToOne(Src0, Src1)) {
+ // const_a, const_b, x -> clamp is safe in all cases including signaling
+ // NaNs.
+ // FIXME: Should this be allowing -0.0?
+ return DAG.getNode(AMDGPUISD::CLAMP, SL, VT, Src2);
+ }
+
+ // FIXME: dx10_clamp behavior assumed in instcombine. Should we really bother
+ // handling the case where dx10-clamp is disabled?
+ if (Subtarget->enableDX10Clamp()) {
+ // If NaNs are clamped to 0, we are free to reorder the inputs.
+
+ if (isa<ConstantFPSDNode>(Src0) && !isa<ConstantFPSDNode>(Src1))
+ std::swap(Src0, Src1);
+
+ if (isa<ConstantFPSDNode>(Src1) && !isa<ConstantFPSDNode>(Src2))
+ std::swap(Src1, Src2);
+
+ if (isa<ConstantFPSDNode>(Src0) && !isa<ConstantFPSDNode>(Src1))
+ std::swap(Src0, Src1);
+
+ if (isClampZeroToOne(Src1, Src2))
+ return DAG.getNode(AMDGPUISD::CLAMP, SL, VT, Src0);
+ }
+
+ return SDValue();
+}
+
+SDValue SITargetLowering::performCvtPkRTZCombine(SDNode *N,
+ DAGCombinerInfo &DCI) const {
+ SDValue Src0 = N->getOperand(0);
+ SDValue Src1 = N->getOperand(1);
+ if (Src0.isUndef() && Src1.isUndef())
+ return DCI.DAG.getUNDEF(N->getValueType(0));
+ return SDValue();
+}
+
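The three conditional swaps in performFMed3Combine above are a single bubble pass: they push constant operands toward the back so the one isClampZeroToOne(Src1, Src2) check fires no matter where the two constants started (legal only under dx10-clamp, since reordering changes NaN behavior otherwise). A standalone sketch of the same network:

#include <cassert>
#include <utility>

struct Opnd { bool IsConst; int Tag; };

int main() {
  // Two constants in any placement among three operands...
  Opnd Src0{true, 0}, Src1{false, 1}, Src2{true, 2};

  // ...end up in the last two slots after the same three swaps.
  if (Src0.IsConst && !Src1.IsConst) std::swap(Src0, Src1);
  if (Src1.IsConst && !Src2.IsConst) std::swap(Src1, Src2);
  if (Src0.IsConst && !Src1.IsConst) std::swap(Src0, Src1);

  assert(!Src0.IsConst && Src1.IsConst && Src2.IsConst);
}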
unsigned SITargetLowering::getFusedOpcode(const SelectionDAG &DAG,
const SDNode *N0,
const SDNode *N1) const {
@@ -3933,7 +4517,6 @@ SDValue SITargetLowering::performFAddCombine(SDNode *N,
SelectionDAG &DAG = DCI.DAG;
EVT VT = N->getValueType(0);
- assert(!VT.isVector());
SDLoc SL(N);
SDValue LHS = N->getOperand(0);
@@ -4112,7 +4695,6 @@ SDValue SITargetLowering::PerformDAGCombine(SDNode *N,
case AMDGPUISD::FMIN_LEGACY:
case AMDGPUISD::FMAX_LEGACY: {
if (DCI.getDAGCombineLevel() >= AfterLegalizeDAG &&
- N->getValueType(0) != MVT::f64 &&
getTargetMachine().getOptLevel() > CodeGenOpt::None)
return performMinMaxCombine(N, DCI);
break;
@@ -4135,17 +4717,18 @@ SDValue SITargetLowering::PerformDAGCombine(SDNode *N,
case ISD::ATOMIC_LOAD_UMIN:
case ISD::ATOMIC_LOAD_UMAX:
case AMDGPUISD::ATOMIC_INC:
- case AMDGPUISD::ATOMIC_DEC: { // TODO: Target mem intrinsics.
+ case AMDGPUISD::ATOMIC_DEC: // TODO: Target mem intrinsics.
if (DCI.isBeforeLegalize())
break;
return performMemSDNodeCombine(cast<MemSDNode>(N), DCI);
- }
case ISD::AND:
return performAndCombine(N, DCI);
case ISD::OR:
return performOrCombine(N, DCI);
case ISD::XOR:
return performXorCombine(N, DCI);
+ case ISD::ZERO_EXTEND:
+ return performZeroExtendCombine(N, DCI);
case AMDGPUISD::FP_CLASS:
return performClassCombine(N, DCI);
case ISD::FCANONICALIZE:
@@ -4170,6 +4753,28 @@ SDValue SITargetLowering::PerformDAGCombine(SDNode *N,
case AMDGPUISD::CVT_F32_UBYTE2:
case AMDGPUISD::CVT_F32_UBYTE3:
return performCvtF32UByteNCombine(N, DCI);
+ case AMDGPUISD::FMED3:
+ return performFMed3Combine(N, DCI);
+ case AMDGPUISD::CVT_PKRTZ_F16_F32:
+ return performCvtPkRTZCombine(N, DCI);
+ case ISD::SCALAR_TO_VECTOR: {
+ SelectionDAG &DAG = DCI.DAG;
+ EVT VT = N->getValueType(0);
+
+ // v2i16 (scalar_to_vector i16:x) -> v2i16 (bitcast (any_extend i16:x))
+ if (VT == MVT::v2i16 || VT == MVT::v2f16) {
+ SDLoc SL(N);
+ SDValue Src = N->getOperand(0);
+ EVT EltVT = Src.getValueType();
+ if (EltVT == MVT::f16)
+ Src = DAG.getNode(ISD::BITCAST, SL, MVT::i16, Src);
+
+ SDValue Ext = DAG.getNode(ISD::ANY_EXTEND, SL, MVT::i32, Src);
+ return DAG.getNode(ISD::BITCAST, SL, VT, Ext);
+ }
+
+ break;
+ }
}
return AMDGPUTargetLowering::PerformDAGCombine(N, DCI);
}
@@ -4198,6 +4803,10 @@ void SITargetLowering::adjustWritemask(MachineSDNode *&Node,
for (SDNode::use_iterator I = Node->use_begin(), E = Node->use_end();
I != E; ++I) {
+ // Don't look at users of the chain.
+ if (I.getUse().getResNo() != 0)
+ continue;
+
// Abort if we can't understand the usage
if (!I->isMachineOpcode() ||
I->getMachineOpcode() != TargetOpcode::EXTRACT_SUBREG)
@@ -4250,7 +4859,6 @@ void SITargetLowering::adjustWritemask(MachineSDNode *&Node,
// Update the users of the node with the new indices
for (unsigned i = 0, Idx = AMDGPU::sub0; i < 4; ++i) {
-
SDNode *User = Users[i];
if (!User)
continue;
@@ -4277,8 +4885,33 @@ static bool isFrameIndexOp(SDValue Op) {
/// \brief Legalize target independent instructions (e.g. INSERT_SUBREG)
/// with frame index operands.
/// LLVM assumes that inputs to these instructions are registers.
-void SITargetLowering::legalizeTargetIndependentNode(SDNode *Node,
- SelectionDAG &DAG) const {
+SDNode *SITargetLowering::legalizeTargetIndependentNode(SDNode *Node,
+ SelectionDAG &DAG) const {
+ if (Node->getOpcode() == ISD::CopyToReg) {
+ RegisterSDNode *DestReg = cast<RegisterSDNode>(Node->getOperand(1));
+ SDValue SrcVal = Node->getOperand(2);
+
+ // Insert a copy to a VReg_1 virtual register so LowerI1Copies doesn't have
+ // to try understanding copies to physical registers.
+ if (SrcVal.getValueType() == MVT::i1 &&
+ TargetRegisterInfo::isPhysicalRegister(DestReg->getReg())) {
+ SDLoc SL(Node);
+ MachineRegisterInfo &MRI = DAG.getMachineFunction().getRegInfo();
+ SDValue VReg = DAG.getRegister(
+ MRI.createVirtualRegister(&AMDGPU::VReg_1RegClass), MVT::i1);
+
+ SDNode *Glued = Node->getGluedNode();
+ SDValue ToVReg
+ = DAG.getCopyToReg(Node->getOperand(0), SL, VReg, SrcVal,
+ SDValue(Glued, Glued ? Glued->getNumValues() - 1 : 0));
+ SDValue ToResultReg
+ = DAG.getCopyToReg(ToVReg, SL, SDValue(DestReg, 0),
+ VReg, ToVReg.getValue(1));
+ DAG.ReplaceAllUsesWith(Node, ToResultReg.getNode());
+ DAG.RemoveDeadNode(Node);
+ return ToResultReg.getNode();
+ }
+ }
SmallVector<SDValue, 8> Ops;
for (unsigned i = 0; i < Node->getNumOperands(); ++i) {
@@ -4294,6 +4927,7 @@ void SITargetLowering::legalizeTargetIndependentNode(SDNode *Node,
}
DAG.UpdateNodeOperands(Node, Ops);
+ return Node;
}
/// \brief Fold the instructions after selecting them.
@@ -4496,6 +5130,8 @@ SITargetLowering::getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI,
return std::make_pair(0U, &AMDGPU::SReg_128RegClass);
case 256:
return std::make_pair(0U, &AMDGPU::SReg_256RegClass);
+ case 512:
+ return std::make_pair(0U, &AMDGPU::SReg_512RegClass);
}
case 'v':
diff --git a/lib/Target/AMDGPU/SIISelLowering.h b/lib/Target/AMDGPU/SIISelLowering.h
index 6c04e4f30977..d177777ad5ee 100644
--- a/lib/Target/AMDGPU/SIISelLowering.h
+++ b/lib/Target/AMDGPU/SIISelLowering.h
@@ -21,11 +21,13 @@
namespace llvm {
class SITargetLowering final : public AMDGPUTargetLowering {
- SDValue LowerParameterPtr(SelectionDAG &DAG, const SDLoc &SL, SDValue Chain,
- unsigned Offset) const;
- SDValue LowerParameter(SelectionDAG &DAG, EVT VT, EVT MemVT, const SDLoc &SL,
- SDValue Chain, unsigned Offset, bool Signed,
- const ISD::InputArg *Arg = nullptr) const;
+ SDValue lowerKernArgParameterPtr(SelectionDAG &DAG, const SDLoc &SL,
+ SDValue Chain, uint64_t Offset) const;
+ SDValue lowerKernargMemParameter(SelectionDAG &DAG, EVT VT, EVT MemVT,
+ const SDLoc &SL, SDValue Chain,
+ uint64_t Offset, bool Signed,
+ const ISD::InputArg *Arg = nullptr) const;
+
SDValue LowerGlobalAddress(AMDGPUMachineFunction *MFI, SDValue Op,
SelectionDAG &DAG) const override;
SDValue lowerImplicitZextParam(SelectionDAG &DAG, SDValue Op,
@@ -55,11 +57,19 @@ class SITargetLowering final : public AMDGPUTargetLowering {
const SDLoc &DL,
EVT VT) const;
+ SDValue convertArgType(
+ SelectionDAG &DAG, EVT VT, EVT MemVT, const SDLoc &SL, SDValue Val,
+ bool Signed, const ISD::InputArg *Arg = nullptr) const;
+
/// \brief Custom lowering for ISD::FP_ROUND for MVT::f16.
SDValue lowerFP_ROUND(SDValue Op, SelectionDAG &DAG) const;
- SDValue getSegmentAperture(unsigned AS, SelectionDAG &DAG) const;
+ SDValue getSegmentAperture(unsigned AS, const SDLoc &DL,
+ SelectionDAG &DAG) const;
+
SDValue lowerADDRSPACECAST(SDValue Op, SelectionDAG &DAG) const;
+ SDValue lowerINSERT_VECTOR_ELT(SDValue Op, SelectionDAG &DAG) const;
+ SDValue lowerEXTRACT_VECTOR_ELT(SDValue Op, SelectionDAG &DAG) const;
SDValue lowerTRAP(SDValue Op, SelectionDAG &DAG) const;
void adjustWritemask(MachineSDNode *&N, SelectionDAG &DAG) const;
@@ -79,10 +89,17 @@ class SITargetLowering final : public AMDGPUTargetLowering {
SDValue performAndCombine(SDNode *N, DAGCombinerInfo &DCI) const;
SDValue performOrCombine(SDNode *N, DAGCombinerInfo &DCI) const;
SDValue performXorCombine(SDNode *N, DAGCombinerInfo &DCI) const;
+ SDValue performZeroExtendCombine(SDNode *N, DAGCombinerInfo &DCI) const;
SDValue performClassCombine(SDNode *N, DAGCombinerInfo &DCI) const;
SDValue performFCanonicalizeCombine(SDNode *N, DAGCombinerInfo &DCI) const;
+ SDValue performFPMed3ImmCombine(SelectionDAG &DAG, const SDLoc &SL,
+ SDValue Op0, SDValue Op1) const;
+ SDValue performIntMed3ImmCombine(SelectionDAG &DAG, const SDLoc &SL,
+ SDValue Op0, SDValue Op1, bool Signed) const;
SDValue performMinMaxCombine(SDNode *N, DAGCombinerInfo &DCI) const;
+ SDValue performFMed3Combine(SDNode *N, DAGCombinerInfo &DCI) const;
+ SDValue performCvtPkRTZCombine(SDNode *N, DAGCombinerInfo &DCI) const;
unsigned getFusedOpcode(const SelectionDAG &DAG,
const SDNode *N0, const SDNode *N1) const;
@@ -94,7 +111,7 @@ class SITargetLowering final : public AMDGPUTargetLowering {
bool isLegalFlatAddressingMode(const AddrMode &AM) const;
bool isLegalMUBUFAddressingMode(const AddrMode &AM) const;
- bool isCFIntrinsic(const SDNode *Intr) const;
+ unsigned isCFIntrinsic(const SDNode *Intr) const;
void createDebuggerPrologueStackObjects(MachineFunction &MF) const;
@@ -115,11 +132,15 @@ public:
const SISubtarget *getSubtarget() const;
+ bool isShuffleMaskLegal(const SmallVectorImpl<int> &/*Mask*/,
+ EVT /*VT*/) const override;
+
bool getTgtMemIntrinsic(IntrinsicInfo &, const CallInst &,
unsigned IntrinsicID) const override;
- bool isShuffleMaskLegal(const SmallVectorImpl<int> &/*Mask*/,
- EVT /*VT*/) const override;
+ bool getAddrModeArguments(IntrinsicInst * /*I*/,
+ SmallVectorImpl<Value*> &/*Ops*/,
+ Type *&/*AccessTy*/) const override;
bool isLegalAddressingMode(const DataLayout &DL, const AddrMode &AM, Type *Ty,
unsigned AS) const override;
@@ -175,6 +196,9 @@ public:
MVT getScalarShiftAmountTy(const DataLayout &, EVT) const override;
bool isFMAFasterThanFMulAndFAdd(EVT VT) const override;
SDValue LowerOperation(SDValue Op, SelectionDAG &DAG) const override;
+ void ReplaceNodeResults(SDNode *N, SmallVectorImpl<SDValue> &Results,
+ SelectionDAG &DAG) const override;
+
SDValue PerformDAGCombine(SDNode *N, DAGCombinerInfo &DCI) const override;
SDNode *PostISelFolding(MachineSDNode *N, SelectionDAG &DAG) const override;
void AdjustInstrPostInstrSelection(MachineInstr &MI,
@@ -182,7 +206,7 @@ public:
SDValue CreateLiveInRegister(SelectionDAG &DAG, const TargetRegisterClass *RC,
unsigned Reg, EVT VT) const override;
- void legalizeTargetIndependentNode(SDNode *Node, SelectionDAG &DAG) const;
+ SDNode *legalizeTargetIndependentNode(SDNode *Node, SelectionDAG &DAG) const;
MachineSDNode *wrapAddr64Rsrc(SelectionDAG &DAG, const SDLoc &DL,
SDValue Ptr) const;
diff --git a/lib/Target/AMDGPU/SIInsertSkips.cpp b/lib/Target/AMDGPU/SIInsertSkips.cpp
index 91e4bf755c53..ba346d2fad02 100644
--- a/lib/Target/AMDGPU/SIInsertSkips.cpp
+++ b/lib/Target/AMDGPU/SIInsertSkips.cpp
@@ -1,4 +1,4 @@
-//===-- SIInsertSkips.cpp - Use predicates for control flow ----------===//
+//===-- SIInsertSkips.cpp - Use predicates for control flow ---------------===//
//
// The LLVM Compiler Infrastructure
//
@@ -12,33 +12,46 @@
/// branches when it's expected that jumping over the untaken control flow will
/// be cheaper than having every workitem no-op through it.
//
+//===----------------------------------------------------------------------===//
#include "AMDGPU.h"
#include "AMDGPUSubtarget.h"
#include "SIInstrInfo.h"
#include "SIMachineFunctionInfo.h"
-#include "llvm/CodeGen/MachineFrameInfo.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/ADT/StringRef.h"
+#include "llvm/CodeGen/MachineBasicBlock.h"
#include "llvm/CodeGen/MachineFunction.h"
#include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/CodeGen/MachineInstr.h"
#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/MachineOperand.h"
+#include "llvm/IR/CallingConv.h"
+#include "llvm/IR/DebugLoc.h"
#include "llvm/MC/MCAsmInfo.h"
+#include "llvm/Pass.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Target/TargetMachine.h"
+#include <cassert>
+#include <cstdint>
+#include <iterator>
using namespace llvm;
#define DEBUG_TYPE "si-insert-skips"
-namespace {
-
static cl::opt<unsigned> SkipThresholdFlag(
"amdgpu-skip-threshold",
cl::desc("Number of instructions before jumping over divergent control flow"),
cl::init(12), cl::Hidden);
+namespace {
+
class SIInsertSkips : public MachineFunctionPass {
private:
- const SIRegisterInfo *TRI;
- const SIInstrInfo *TII;
- unsigned SkipThreshold;
+ const SIRegisterInfo *TRI = nullptr;
+ const SIInstrInfo *TII = nullptr;
+ unsigned SkipThreshold = 0;
bool shouldSkip(const MachineBasicBlock &From,
const MachineBasicBlock &To) const;
@@ -55,8 +68,7 @@ private:
public:
static char ID;
- SIInsertSkips() :
- MachineFunctionPass(ID), TRI(nullptr), TII(nullptr), SkipThreshold(0) { }
+ SIInsertSkips() : MachineFunctionPass(ID) {}
bool runOnMachineFunction(MachineFunction &MF) override;
@@ -69,7 +81,7 @@ public:
}
};
-} // End anonymous namespace
+} // end anonymous namespace
char SIInsertSkips::ID = 0;
@@ -195,8 +207,8 @@ void SIInsertSkips::kill(MachineInstr &MI) {
}
} else {
BuildMI(MBB, &MI, DL, TII->get(AMDGPU::V_CMPX_LE_F32_e32))
- .addImm(0)
- .addOperand(Op);
+ .addImm(0)
+ .add(Op);
}
}
@@ -251,6 +263,7 @@ bool SIInsertSkips::runOnMachineFunction(MachineFunction &MF) {
BI != BE; BI = NextBB) {
NextBB = std::next(BI);
MachineBasicBlock &MBB = *BI;
+ bool HaveSkipBlock = false;
if (!ExecBranchStack.empty() && ExecBranchStack.back() == &MBB) {
// Reached convergence point for last divergent branch.
@@ -270,27 +283,33 @@ bool SIInsertSkips::runOnMachineFunction(MachineFunction &MF) {
MachineInstr &MI = *I;
switch (MI.getOpcode()) {
- case AMDGPU::SI_MASK_BRANCH: {
+ case AMDGPU::SI_MASK_BRANCH:
ExecBranchStack.push_back(MI.getOperand(0).getMBB());
MadeChange |= skipMaskBranch(MI, MBB);
break;
- }
- case AMDGPU::S_BRANCH: {
+
+ case AMDGPU::S_BRANCH:
// Optimize out branches to the next block.
// FIXME: Shouldn't this be handled by BranchFolding?
- if (MBB.isLayoutSuccessor(MI.getOperand(0).getMBB()))
+ if (MBB.isLayoutSuccessor(MI.getOperand(0).getMBB())) {
+ MI.eraseFromParent();
+ } else if (HaveSkipBlock) {
+ // Remove the given unconditional branch when a skip block has been
+ // inserted after the current one, and let it skip the two instructions
+ // performing the kill if the exec mask is non-zero.
MI.eraseFromParent();
+ }
break;
- }
- case AMDGPU::SI_KILL_TERMINATOR: {
+
+ case AMDGPU::SI_KILL_TERMINATOR:
MadeChange = true;
kill(MI);
if (ExecBranchStack.empty()) {
if (skipIfDead(MI, *NextBB)) {
+ HaveSkipBlock = true;
NextBB = std::next(BI);
BE = MF.end();
- Next = MBB.end();
}
} else {
HaveKill = true;
@@ -298,15 +317,15 @@ bool SIInsertSkips::runOnMachineFunction(MachineFunction &MF) {
MI.eraseFromParent();
break;
- }
- case AMDGPU::SI_RETURN: {
+
+ case AMDGPU::SI_RETURN_TO_EPILOG:
// FIXME: Should move somewhere else
assert(!MF.getInfo<SIMachineFunctionInfo>()->returnsVoid());
// Graphics shaders returning non-void shouldn't contain S_ENDPGM,
// because external bytecode will be appended at the end.
if (BI != --MF.end() || I != MBB.getFirstTerminator()) {
- // SI_RETURN is not the last instruction. Add an empty block at
+ // SI_RETURN_TO_EPILOG is not the last instruction. Add an empty block at
// the end and jump there.
if (!EmptyMBBAtEnd) {
EmptyMBBAtEnd = MF.CreateMachineBasicBlock();
@@ -318,7 +337,8 @@ bool SIInsertSkips::runOnMachineFunction(MachineFunction &MF) {
.addMBB(EmptyMBBAtEnd);
I->eraseFromParent();
}
- }
+ break;
+
default:
break;
}
diff --git a/lib/Target/AMDGPU/SIInsertWaitcnts.cpp b/lib/Target/AMDGPU/SIInsertWaitcnts.cpp
new file mode 100644
index 000000000000..c2a3e62aa827
--- /dev/null
+++ b/lib/Target/AMDGPU/SIInsertWaitcnts.cpp
@@ -0,0 +1,1863 @@
+//===-- SIInsertWaitcnts.cpp - Insert Wait Instructions -------------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+/// \file
+/// \brief Insert wait instructions for memory reads and writes.
+///
+/// Memory reads and writes are issued asynchronously, so we need to insert
+/// S_WAITCNT instructions when we want to access any of their results or
+/// overwrite any register that's used asynchronously.
+//
+//===----------------------------------------------------------------------===//
+
+#include "AMDGPU.h"
+#include "AMDGPUSubtarget.h"
+#include "SIDefines.h"
+#include "SIInstrInfo.h"
+#include "SIMachineFunctionInfo.h"
+#include "Utils/AMDGPUBaseInfo.h"
+#include "llvm/ADT/PostOrderIterator.h"
+#include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+
+#define DEBUG_TYPE "si-insert-waitcnts"
+
+using namespace llvm;
+
+namespace {
+
+// Class of object that encapsulates the latest instruction counter score
+// associated with the operand. Used for determining whether an
+// s_waitcnt instruction needs to be emitted.
+
+#define CNT_MASK(t) (1u << (t))
+
+enum InstCounterType { VM_CNT = 0, LGKM_CNT, EXP_CNT, NUM_INST_CNTS };
+
+typedef std::pair<signed, signed> RegInterval;
+
+struct {
+ int32_t VmcntMax;
+ int32_t ExpcntMax;
+ int32_t LgkmcntMax;
+ int32_t NumVGPRsMax;
+ int32_t NumSGPRsMax;
+} HardwareLimits;
+
+struct {
+ unsigned VGPR0;
+ unsigned VGPRL;
+ unsigned SGPR0;
+ unsigned SGPRL;
+} RegisterEncoding;
+
+enum WaitEventType {
+ VMEM_ACCESS, // vector-memory read & write
+ LDS_ACCESS, // lds read & write
+ GDS_ACCESS, // gds read & write
+ SQ_MESSAGE, // send message
+ SMEM_ACCESS, // scalar-memory read & write
+ EXP_GPR_LOCK, // export holding on its data src
+ GDS_GPR_LOCK, // GDS holding on its data and addr src
+ EXP_POS_ACCESS, // write to export position
+ EXP_PARAM_ACCESS, // write to export parameter
+ VMW_GPR_LOCK, // vector-memory write holding on its data src
+ NUM_WAIT_EVENTS,
+};
+
+// The mapping is:
+// 0 .. SQ_MAX_PGM_VGPRS-1 real VGPRs
+// SQ_MAX_PGM_VGPRS .. NUM_ALL_VGPRS-1 extra VGPR-like slots
+// NUM_ALL_VGPRS .. NUM_ALL_VGPRS+SQ_MAX_PGM_SGPRS-1 real SGPRs
+// We reserve a fixed number of VGPR slots in the scoring tables for
+// special tokens like SCMEM_LDS (needed for buffer load to LDS).
+enum RegisterMapping {
+ SQ_MAX_PGM_VGPRS = 256, // Maximum programmable VGPRs across all targets.
+ SQ_MAX_PGM_SGPRS = 256, // Maximum programmable SGPRs across all targets.
+ NUM_EXTRA_VGPRS = 1, // A reserved slot for DS.
+ EXTRA_VGPR_LDS = 0, // This is a placeholder the Shader algorithm uses.
+ NUM_ALL_VGPRS = SQ_MAX_PGM_VGPRS + NUM_EXTRA_VGPRS, // Where SGPR starts.
+};
+
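The RegisterMapping constants above define a single flat score table: VGPR n lives at index n, the extra LDS token sits right after the VGPRs, and SGPR n is offset by NUM_ALL_VGPRS. Restated standalone:

#include <cassert>

enum : int {
  SQ_MAX_PGM_VGPRS = 256,
  NUM_EXTRA_VGPRS = 1,
  NUM_ALL_VGPRS = SQ_MAX_PGM_VGPRS + NUM_EXTRA_VGPRS,
};

int vgprSlot(int N) { return N; }                 // 0 .. 255
int ldsSlot() { return SQ_MAX_PGM_VGPRS; }        // the one extra VGPR-like slot
int sgprSlot(int N) { return NUM_ALL_VGPRS + N; } // 257 onward

int main() {
  assert(ldsSlot() == 256);
  assert(sgprSlot(0) == 257); // first SGPR comes after all VGPR-like slots
}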
+#define ForAllWaitEventType(w) \
+ for (enum WaitEventType w = (enum WaitEventType)0; \
+ (w) < (enum WaitEventType)NUM_WAIT_EVENTS; \
+ (w) = (enum WaitEventType)((w) + 1))
+
+// This is a per-basic-block object that maintains current score brackets
+// of each wait-counter, and a per-register scoreboard for each wait-counter.
+// We also maintain the latest score for every event type that can change the
+// waitcnt in order to know if there are multiple types of events within
+// the brackets. When multiple types of events happen in the bracket,
+// the wait-count may get decreased out of order, therefore we need to put in
+// "s_waitcnt 0" before use.
+class BlockWaitcntBrackets {
+public:
+ static int32_t getWaitCountMax(InstCounterType T) {
+ switch (T) {
+ case VM_CNT:
+ return HardwareLimits.VmcntMax;
+ case LGKM_CNT:
+ return HardwareLimits.LgkmcntMax;
+ case EXP_CNT:
+ return HardwareLimits.ExpcntMax;
+ default:
+ break;
+ }
+ return 0;
+ };
+
+ void setScoreLB(InstCounterType T, int32_t Val) {
+ assert(T < NUM_INST_CNTS);
+ if (T >= NUM_INST_CNTS)
+ return;
+ ScoreLBs[T] = Val;
+ };
+
+ void setScoreUB(InstCounterType T, int32_t Val) {
+ assert(T < NUM_INST_CNTS);
+ if (T >= NUM_INST_CNTS)
+ return;
+ ScoreUBs[T] = Val;
+ if (T == EXP_CNT) {
+ int32_t UB = (int)(ScoreUBs[T] - getWaitCountMax(EXP_CNT));
+ if (ScoreLBs[T] < UB)
+ ScoreLBs[T] = UB;
+ }
+ };
+
+ int32_t getScoreLB(InstCounterType T) {
+ assert(T < NUM_INST_CNTS);
+ if (T >= NUM_INST_CNTS)
+ return 0;
+ return ScoreLBs[T];
+ };
+
+ int32_t getScoreUB(InstCounterType T) {
+ assert(T < NUM_INST_CNTS);
+ if (T >= NUM_INST_CNTS)
+ return 0;
+ return ScoreUBs[T];
+ };
+
+ // Mapping from event to counter.
+ InstCounterType eventCounter(WaitEventType E) {
+ switch (E) {
+ case VMEM_ACCESS:
+ return VM_CNT;
+ case LDS_ACCESS:
+ case GDS_ACCESS:
+ case SQ_MESSAGE:
+ case SMEM_ACCESS:
+ return LGKM_CNT;
+ case EXP_GPR_LOCK:
+ case GDS_GPR_LOCK:
+ case VMW_GPR_LOCK:
+ case EXP_POS_ACCESS:
+ case EXP_PARAM_ACCESS:
+ return EXP_CNT;
+ default:
+ llvm_unreachable("unhandled event type");
+ }
+ return NUM_INST_CNTS;
+ }
+
+ void setRegScore(int GprNo, InstCounterType T, int32_t Val) {
+ if (GprNo < NUM_ALL_VGPRS) {
+ if (GprNo > VgprUB) {
+ VgprUB = GprNo;
+ }
+ VgprScores[T][GprNo] = Val;
+ } else {
+ assert(T == LGKM_CNT);
+ if (GprNo - NUM_ALL_VGPRS > SgprUB) {
+ SgprUB = GprNo - NUM_ALL_VGPRS;
+ }
+ SgprScores[GprNo - NUM_ALL_VGPRS] = Val;
+ }
+ }
+
+ int32_t getRegScore(int GprNo, InstCounterType T) {
+ if (GprNo < NUM_ALL_VGPRS) {
+ return VgprScores[T][GprNo];
+ }
+ return SgprScores[GprNo - NUM_ALL_VGPRS];
+ }
+
+ void clear() {
+ memset(ScoreLBs, 0, sizeof(ScoreLBs));
+ memset(ScoreUBs, 0, sizeof(ScoreUBs));
+ memset(EventUBs, 0, sizeof(EventUBs));
+ for (enum InstCounterType T = VM_CNT; T < NUM_INST_CNTS;
+ T = (enum InstCounterType)(T + 1)) {
+ memset(VgprScores[T], 0, sizeof(VgprScores[T]));
+ }
+ memset(SgprScores, 0, sizeof(SgprScores));
+ }
+
+ RegInterval getRegInterval(const MachineInstr *MI, const SIInstrInfo *TII,
+ const MachineRegisterInfo *MRI,
+ const SIRegisterInfo *TRI, unsigned OpNo,
+ bool Def) const;
+
+ void setExpScore(const MachineInstr *MI, const SIInstrInfo *TII,
+ const SIRegisterInfo *TRI, const MachineRegisterInfo *MRI,
+ unsigned OpNo, int32_t Val);
+
+ void setWaitAtBeginning() { WaitAtBeginning = true; }
+ void clearWaitAtBeginning() { WaitAtBeginning = false; }
+ bool getWaitAtBeginning() const { return WaitAtBeginning; }
+ void setEventUB(enum WaitEventType W, int32_t Val) { EventUBs[W] = Val; }
+ int32_t getMaxVGPR() const { return VgprUB; }
+ int32_t getMaxSGPR() const { return SgprUB; }
+ int32_t getEventUB(enum WaitEventType W) const {
+ assert(W < NUM_WAIT_EVENTS);
+ return EventUBs[W];
+ }
+ bool counterOutOfOrder(InstCounterType T);
+ unsigned int updateByWait(InstCounterType T, int ScoreToWait);
+ void updateByEvent(const SIInstrInfo *TII, const SIRegisterInfo *TRI,
+ const MachineRegisterInfo *MRI, WaitEventType E,
+ MachineInstr &MI);
+
+ BlockWaitcntBrackets()
+ : WaitAtBeginning(false), ValidLoop(false), MixedExpTypes(false),
+ LoopRegion(NULL), PostOrder(0), Waitcnt(NULL), VgprUB(0), SgprUB(0) {
+ for (enum InstCounterType T = VM_CNT; T < NUM_INST_CNTS;
+ T = (enum InstCounterType)(T + 1)) {
+ memset(VgprScores[T], 0, sizeof(VgprScores[T]));
+ }
+ }
+ ~BlockWaitcntBrackets(){};
+
+ bool hasPendingSMEM() const {
+ return (EventUBs[SMEM_ACCESS] > ScoreLBs[LGKM_CNT] &&
+ EventUBs[SMEM_ACCESS] <= ScoreUBs[LGKM_CNT]);
+ }
+
+ bool hasPendingFlat() const {
+ return ((LastFlat[LGKM_CNT] > ScoreLBs[LGKM_CNT] &&
+ LastFlat[LGKM_CNT] <= ScoreUBs[LGKM_CNT]) ||
+ (LastFlat[VM_CNT] > ScoreLBs[VM_CNT] &&
+ LastFlat[VM_CNT] <= ScoreUBs[VM_CNT]));
+ }
+
+ void setPendingFlat() {
+ LastFlat[VM_CNT] = ScoreUBs[VM_CNT];
+ LastFlat[LGKM_CNT] = ScoreUBs[LGKM_CNT];
+ }
+
+ int pendingFlat(InstCounterType Ct) const { return LastFlat[Ct]; }
+
+ void setLastFlat(InstCounterType Ct, int Val) { LastFlat[Ct] = Val; }
+
+ bool getRevisitLoop() const { return RevisitLoop; }
+ void setRevisitLoop(bool RevisitLoopIn) { RevisitLoop = RevisitLoopIn; }
+
+ void setPostOrder(int32_t PostOrderIn) { PostOrder = PostOrderIn; }
+ int32_t getPostOrder() const { return PostOrder; }
+
+ void setWaitcnt(MachineInstr *WaitcntIn) { Waitcnt = WaitcntIn; }
+ void clearWaitcnt() { Waitcnt = NULL; }
+ MachineInstr *getWaitcnt() const { return Waitcnt; }
+
+ bool mixedExpTypes() const { return MixedExpTypes; }
+ void setMixedExpTypes(bool MixedExpTypesIn) {
+ MixedExpTypes = MixedExpTypesIn;
+ }
+
+ void print(raw_ostream &);
+ void dump() { print(dbgs()); }
+
+private:
+ bool WaitAtBeginning;
+ bool RevisitLoop;
+ bool ValidLoop;
+ bool MixedExpTypes;
+ MachineLoop *LoopRegion;
+ int32_t PostOrder;
+ MachineInstr *Waitcnt;
+ int32_t ScoreLBs[NUM_INST_CNTS] = {0};
+ int32_t ScoreUBs[NUM_INST_CNTS] = {0};
+ int32_t EventUBs[NUM_WAIT_EVENTS] = {0};
+ // Remember the last flat memory operation.
+ int32_t LastFlat[NUM_INST_CNTS] = {0};
+ // wait_cnt scores for every vgpr.
+ // Keep track of the VgprUB and SgprUB to make merge at join efficient.
+ int32_t VgprUB;
+ int32_t SgprUB;
+ int32_t VgprScores[NUM_INST_CNTS][NUM_ALL_VGPRS];
+ // Wait cnt scores for every sgpr, only lgkmcnt is relevant.
+ int32_t SgprScores[SQ_MAX_PGM_SGPRS] = {0};
+};
+
+// This is a per-loop-region object that records waitcnt status at the end of
+// loop footer from the previous iteration. We also maintain an iteration
+// count to track the number of times the loop has been visited. When it
+// doesn't converge naturally, we force convergence by inserting s_waitcnt 0
+// at the end of the loop footer.
+class LoopWaitcntData {
+public:
+ void incIterCnt() { IterCnt++; }
+ void resetIterCnt() { IterCnt = 0; }
+ int32_t getIterCnt() { return IterCnt; }
+
+ LoopWaitcntData() : LfWaitcnt(NULL), IterCnt(0) {}
+ ~LoopWaitcntData(){};
+
+ void setWaitcnt(MachineInstr *WaitcntIn) { LfWaitcnt = WaitcntIn; }
+ MachineInstr *getWaitcnt() const { return LfWaitcnt; }
+
+ void print() {
+ DEBUG(dbgs() << " iteration " << IterCnt << '\n';);
+ return;
+ }
+
+private:
+ // s_waitcnt added at the end of loop footer to stabilize wait scores
+ // at the end of the loop footer.
+ MachineInstr *LfWaitcnt;
+ // Number of times the loop has been visited, not including the initial
+ // walk over.
+ int32_t IterCnt;
+};
+
+class SIInsertWaitcnts : public MachineFunctionPass {
+
+private:
+ const SISubtarget *ST;
+ const SIInstrInfo *TII;
+ const SIRegisterInfo *TRI;
+ const MachineRegisterInfo *MRI;
+ const MachineLoopInfo *MLI;
+ AMDGPU::IsaInfo::IsaVersion IV;
+ AMDGPUAS AMDGPUASI;
+
+ DenseSet<MachineBasicBlock *> BlockVisitedSet;
+ DenseSet<MachineInstr *> CompilerGeneratedWaitcntSet;
+ DenseSet<MachineInstr *> VCCZBugHandledSet;
+
+ DenseMap<MachineBasicBlock *, std::unique_ptr<BlockWaitcntBrackets>>
+ BlockWaitcntBracketsMap;
+
+ DenseSet<MachineBasicBlock *> BlockWaitcntProcessedSet;
+
+ DenseMap<MachineLoop *, std::unique_ptr<LoopWaitcntData>> LoopWaitcntDataMap;
+
+ std::vector<std::unique_ptr<BlockWaitcntBrackets>> KillWaitBrackets;
+
+public:
+ static char ID;
+
+ SIInsertWaitcnts()
+ : MachineFunctionPass(ID), ST(nullptr), TII(nullptr), TRI(nullptr),
+ MRI(nullptr), MLI(nullptr) {}
+
+ bool runOnMachineFunction(MachineFunction &MF) override;
+
+ StringRef getPassName() const override {
+ return "SI insert wait instructions";
+ }
+
+ void getAnalysisUsage(AnalysisUsage &AU) const override {
+ AU.setPreservesCFG();
+ AU.addRequired<MachineLoopInfo>();
+ MachineFunctionPass::getAnalysisUsage(AU);
+ }
+
+ void addKillWaitBracket(BlockWaitcntBrackets *Bracket) {
+ // The waitcnt information is copied because it changes as the block is
+ // traversed.
+ KillWaitBrackets.push_back(make_unique<BlockWaitcntBrackets>(*Bracket));
+ }
+
+ MachineInstr *generateSWaitCntInstBefore(MachineInstr &MI,
+ BlockWaitcntBrackets *ScoreBrackets);
+ void updateEventWaitCntAfter(MachineInstr &Inst,
+ BlockWaitcntBrackets *ScoreBrackets);
+ void mergeInputScoreBrackets(MachineBasicBlock &Block);
+ MachineBasicBlock *loopBottom(const MachineLoop *Loop);
+ void insertWaitcntInBlock(MachineFunction &MF, MachineBasicBlock &Block);
+ void insertWaitcntBeforeCF(MachineBasicBlock &Block, MachineInstr *Inst);
+};
+
+} // End anonymous namespace.
+
+RegInterval BlockWaitcntBrackets::getRegInterval(const MachineInstr *MI,
+ const SIInstrInfo *TII,
+ const MachineRegisterInfo *MRI,
+ const SIRegisterInfo *TRI,
+ unsigned OpNo,
+ bool Def) const {
+ const MachineOperand &Op = MI->getOperand(OpNo);
+ if (!Op.isReg() || !TRI->isInAllocatableClass(Op.getReg()) ||
+ (Def && !Op.isDef()))
+ return {-1, -1};
+
+ // A use via a PW (partial write) operand does not need a waitcnt.
+ // A partial write is not a WAW.
+ assert(!Op.getSubReg() || !Op.isUndef());
+
+ RegInterval Result;
+ const MachineRegisterInfo &MRIA = *MRI;
+
+ unsigned Reg = TRI->getEncodingValue(Op.getReg());
+
+ if (TRI->isVGPR(MRIA, Op.getReg())) {
+ assert(Reg >= RegisterEncoding.VGPR0 && Reg <= RegisterEncoding.VGPRL);
+ Result.first = Reg - RegisterEncoding.VGPR0;
+ assert(Result.first >= 0 && Result.first < SQ_MAX_PGM_VGPRS);
+ } else if (TRI->isSGPRReg(MRIA, Op.getReg())) {
+ assert(Reg >= RegisterEncoding.SGPR0 && Reg < SQ_MAX_PGM_SGPRS);
+ Result.first = Reg - RegisterEncoding.SGPR0 + NUM_ALL_VGPRS;
+ assert(Result.first >= NUM_ALL_VGPRS &&
+ Result.first < SQ_MAX_PGM_SGPRS + NUM_ALL_VGPRS);
+ }
+ // TODO: Handle TTMP
+ // else if (TRI->isTTMP(MRIA, Reg.getReg())) ...
+ else
+ return {-1, -1};
+
+ const MachineInstr &MIA = *MI;
+ const TargetRegisterClass *RC = TII->getOpRegClass(MIA, OpNo);
+ unsigned Size = RC->getSize();
+ Result.second = Result.first + (Size / 4);
+
+ return Result;
+}
+
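getRegInterval hands back a half-open slot range: one 4-byte slot per 32-bit lane of the register, so wider register tuples cover proportionally more slots. A standalone restatement of that final arithmetic step:

#include <cassert>
#include <utility>

// Half-open [first, first + size/4), mirroring Result.second above.
std::pair<int, int> regInterval(int FirstSlot, unsigned SizeInBytes) {
  return {FirstSlot, FirstSlot + static_cast<int>(SizeInBytes / 4)};
}

int main() {
  assert(regInterval(8, 4).second == 9);   // 32-bit register: one slot
  assert(regInterval(8, 8).second == 10);  // 64-bit pair: two slots
  assert(regInterval(8, 16).second == 12); // 128-bit quad: four slots
}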
+void BlockWaitcntBrackets::setExpScore(const MachineInstr *MI,
+ const SIInstrInfo *TII,
+ const SIRegisterInfo *TRI,
+ const MachineRegisterInfo *MRI,
+ unsigned OpNo, int32_t Val) {
+ RegInterval Interval = getRegInterval(MI, TII, MRI, TRI, OpNo, false);
+ DEBUG({
+ const MachineOperand &Opnd = MI->getOperand(OpNo);
+ assert(TRI->isVGPR(*MRI, Opnd.getReg()));
+ });
+ for (signed RegNo = Interval.first; RegNo < Interval.second; ++RegNo) {
+ setRegScore(RegNo, EXP_CNT, Val);
+ }
+}
+
+void BlockWaitcntBrackets::updateByEvent(const SIInstrInfo *TII,
+ const SIRegisterInfo *TRI,
+ const MachineRegisterInfo *MRI,
+ WaitEventType E, MachineInstr &Inst) {
+ const MachineRegisterInfo &MRIA = *MRI;
+ InstCounterType T = eventCounter(E);
+ int32_t CurrScore = getScoreUB(T) + 1;
+ // EventUB and ScoreUB need to be updated regardless of whether this event
+ // changes the score of a register or not.
+ // Examples include vm_cnt for buffer-store and lgkm_cnt for send-message.
+ EventUBs[E] = CurrScore;
+ setScoreUB(T, CurrScore);
+
+ if (T == EXP_CNT) {
+ // Check for mixed export types. If they are mixed, then a waitcnt exp(0)
+ // is required.
+ if (!MixedExpTypes) {
+ MixedExpTypes = counterOutOfOrder(EXP_CNT);
+ }
+
+ // Put score on the source vgprs. If this is a store, just use those
+ // specific register(s).
+ if (TII->isDS(Inst) && (Inst.mayStore() || Inst.mayLoad())) {
+ // All GDS operations must protect their address register (same as
+ // export.)
+ if (Inst.getOpcode() != AMDGPU::DS_APPEND &&
+ Inst.getOpcode() != AMDGPU::DS_CONSUME) {
+ setExpScore(
+ &Inst, TII, TRI, MRI,
+ AMDGPU::getNamedOperandIdx(Inst.getOpcode(), AMDGPU::OpName::addr),
+ CurrScore);
+ }
+ if (Inst.mayStore()) {
+ setExpScore(
+ &Inst, TII, TRI, MRI,
+ AMDGPU::getNamedOperandIdx(Inst.getOpcode(), AMDGPU::OpName::data0),
+ CurrScore);
+ if (AMDGPU::getNamedOperandIdx(Inst.getOpcode(),
+ AMDGPU::OpName::data1) != -1) {
+ setExpScore(&Inst, TII, TRI, MRI,
+ AMDGPU::getNamedOperandIdx(Inst.getOpcode(),
+ AMDGPU::OpName::data1),
+ CurrScore);
+ }
+ } else if (AMDGPU::getAtomicNoRetOp(Inst.getOpcode()) != -1 &&
+ Inst.getOpcode() != AMDGPU::DS_GWS_INIT &&
+ Inst.getOpcode() != AMDGPU::DS_GWS_SEMA_V &&
+ Inst.getOpcode() != AMDGPU::DS_GWS_SEMA_BR &&
+ Inst.getOpcode() != AMDGPU::DS_GWS_SEMA_P &&
+ Inst.getOpcode() != AMDGPU::DS_GWS_BARRIER &&
+ Inst.getOpcode() != AMDGPU::DS_APPEND &&
+ Inst.getOpcode() != AMDGPU::DS_CONSUME &&
+ Inst.getOpcode() != AMDGPU::DS_ORDERED_COUNT) {
+ for (unsigned I = 0, E = Inst.getNumOperands(); I != E; ++I) {
+ const MachineOperand &Op = Inst.getOperand(I);
+ if (Op.isReg() && !Op.isDef() && TRI->isVGPR(MRIA, Op.getReg())) {
+ setExpScore(&Inst, TII, TRI, MRI, I, CurrScore);
+ }
+ }
+ }
+ } else if (TII->isFLAT(Inst)) {
+ if (Inst.mayStore()) {
+ setExpScore(
+ &Inst, TII, TRI, MRI,
+ AMDGPU::getNamedOperandIdx(Inst.getOpcode(), AMDGPU::OpName::data),
+ CurrScore);
+ } else if (AMDGPU::getAtomicNoRetOp(Inst.getOpcode()) != -1) {
+ setExpScore(
+ &Inst, TII, TRI, MRI,
+ AMDGPU::getNamedOperandIdx(Inst.getOpcode(), AMDGPU::OpName::data),
+ CurrScore);
+ }
+ } else if (TII->isMIMG(Inst)) {
+ if (Inst.mayStore()) {
+ setExpScore(&Inst, TII, TRI, MRI, 0, CurrScore);
+ } else if (AMDGPU::getAtomicNoRetOp(Inst.getOpcode()) != -1) {
+ setExpScore(
+ &Inst, TII, TRI, MRI,
+ AMDGPU::getNamedOperandIdx(Inst.getOpcode(), AMDGPU::OpName::data),
+ CurrScore);
+ }
+ } else if (TII->isMTBUF(Inst)) {
+ if (Inst.mayStore()) {
+ setExpScore(&Inst, TII, TRI, MRI, 0, CurrScore);
+ }
+ } else if (TII->isMUBUF(Inst)) {
+ if (Inst.mayStore()) {
+ setExpScore(&Inst, TII, TRI, MRI, 0, CurrScore);
+ } else if (AMDGPU::getAtomicNoRetOp(Inst.getOpcode()) != -1) {
+ setExpScore(
+ &Inst, TII, TRI, MRI,
+ AMDGPU::getNamedOperandIdx(Inst.getOpcode(), AMDGPU::OpName::data),
+ CurrScore);
+ }
+ } else {
+ if (TII->isEXP(Inst)) {
+ // For export the destination registers are really temps that
+ // can be used as the actual source after export patching, so
+ // we need to treat them like sources and set the EXP_CNT
+ // score.
+ for (unsigned I = 0, E = Inst.getNumOperands(); I != E; ++I) {
+ MachineOperand &DefMO = Inst.getOperand(I);
+ if (DefMO.isReg() && DefMO.isDef() &&
+ TRI->isVGPR(MRIA, DefMO.getReg())) {
+ setRegScore(TRI->getEncodingValue(DefMO.getReg()), EXP_CNT,
+ CurrScore);
+ }
+ }
+ }
+ for (unsigned I = 0, E = Inst.getNumOperands(); I != E; ++I) {
+ MachineOperand &MO = Inst.getOperand(I);
+ if (MO.isReg() && !MO.isDef() && TRI->isVGPR(MRIA, MO.getReg())) {
+ setExpScore(&Inst, TII, TRI, MRI, I, CurrScore);
+ }
+ }
+ }
+#if 0 // TODO: check if this is handled by MUBUF code above.
+ } else if (Inst.getOpcode() == AMDGPU::BUFFER_STORE_DWORD ||
+ Inst.getOpcode() == AMDGPU::BUFFER_STORE_DWORDX2 ||
+ Inst.getOpcode() == AMDGPU::BUFFER_STORE_DWORDX4) {
+ MachineOperand *MO = TII->getNamedOperand(Inst, AMDGPU::OpName::data);
+ unsigned OpNo;//TODO: find the OpNo for this operand;
+ RegInterval Interval = getRegInterval(&Inst, TII, MRI, TRI, OpNo, false);
+ for (signed RegNo = Interval.first; RegNo < Interval.second;
+ ++RegNo) {
+ setRegScore(RegNo + NUM_ALL_VGPRS, t, CurrScore);
+ }
+#endif
+ } else {
+ // Match the score to the destination registers.
+ for (unsigned I = 0, E = Inst.getNumOperands(); I != E; ++I) {
+ RegInterval Interval = getRegInterval(&Inst, TII, MRI, TRI, I, true);
+ if (T == VM_CNT && Interval.first >= NUM_ALL_VGPRS)
+ continue;
+ for (signed RegNo = Interval.first; RegNo < Interval.second; ++RegNo) {
+ setRegScore(RegNo, T, CurrScore);
+ }
+ }
+ if (TII->isDS(Inst) && Inst.mayStore()) {
+ setRegScore(SQ_MAX_PGM_VGPRS + EXTRA_VGPR_LDS, T, CurrScore);
+ }
+ }
+}
+
+void BlockWaitcntBrackets::print(raw_ostream &OS) {
+ OS << '\n';
+ for (enum InstCounterType T = VM_CNT; T < NUM_INST_CNTS;
+ T = (enum InstCounterType)(T + 1)) {
+ int LB = getScoreLB(T);
+ int UB = getScoreUB(T);
+
+ switch (T) {
+ case VM_CNT:
+ OS << " VM_CNT(" << UB - LB << "): ";
+ break;
+ case LGKM_CNT:
+ OS << " LGKM_CNT(" << UB - LB << "): ";
+ break;
+ case EXP_CNT:
+ OS << " EXP_CNT(" << UB - LB << "): ";
+ break;
+ default:
+ OS << " UNKNOWN(" << UB - LB << "): ";
+ break;
+ }
+
+ if (LB < UB) {
+ // Print vgpr scores.
+ for (int J = 0; J <= getMaxVGPR(); J++) {
+ int RegScore = getRegScore(J, T);
+ if (RegScore <= LB)
+ continue;
+ int RelScore = RegScore - LB - 1;
+ if (J < SQ_MAX_PGM_VGPRS + EXTRA_VGPR_LDS) {
+ OS << RelScore << ":v" << J << " ";
+ } else {
+ OS << RelScore << ":ds ";
+ }
+ }
+ // Also need to print sgpr scores for lgkm_cnt.
+ if (T == LGKM_CNT) {
+ for (int J = 0; J <= getMaxSGPR(); J++) {
+ int RegScore = getRegScore(J + NUM_ALL_VGPRS, LGKM_CNT);
+ if (RegScore <= LB)
+ continue;
+ int RelScore = RegScore - LB - 1;
+ OS << RelScore << ":s" << J << " ";
+ }
+ }
+ }
+ OS << '\n';
+ }
+ OS << '\n';
+ return;
+}
+
+unsigned int BlockWaitcntBrackets::updateByWait(InstCounterType T,
+ int ScoreToWait) {
+ unsigned int NeedWait = 0;
+ if (ScoreToWait == -1) {
+ // The score to wait is unknown. This implies that it was not encountered
+ // during the path of the CFG walk done during the current traversal but
+ // may be seen on a different path. Emit an s_waitcnt with a
+ // conservative value of 0 for the counter.
+ NeedWait = CNT_MASK(T);
+ setScoreLB(T, getScoreUB(T));
+ return NeedWait;
+ }
+
+ // If the score of src_operand falls within the bracket, we need an
+ // s_waitcnt instruction.
+ const int32_t LB = getScoreLB(T);
+ const int32_t UB = getScoreUB(T);
+ if ((UB >= ScoreToWait) && (ScoreToWait > LB)) {
+ if (T == VM_CNT && hasPendingFlat()) {
+ // If there is a pending FLAT operation, and this is a VM waitcnt,
+ // then we need to force a waitcnt 0 for VM.
+ NeedWait = CNT_MASK(T);
+ setScoreLB(T, getScoreUB(T));
+ } else if (counterOutOfOrder(T)) {
+ // The counter can get decremented out-of-order when there
+ // are multiple event types in the bracket. Also emit an s_waitcnt
+ // with a conservative value of 0 for the counter.
+ NeedWait = CNT_MASK(T);
+ setScoreLB(T, getScoreUB(T));
+ } else {
+ NeedWait = CNT_MASK(T);
+ setScoreLB(T, ScoreToWait);
+ }
+ }
+
+ return NeedWait;
+}
+
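Stripped of the flat and out-of-order special cases (which conservatively wait all the way to the upper bound), the core test in updateByWait is an inside-the-bracket check:

#include <cassert>

// A wait is needed iff the score lies inside the bracket (LB, UB].
bool needsWait(int LB, int UB, int ScoreToWait) {
  return UB >= ScoreToWait && ScoreToWait > LB;
}

int main() {
  assert(!needsWait(3, 7, 3)); // at or below LB: already waited for
  assert(needsWait(3, 7, 5));  // inside the bracket: emit an s_waitcnt
  assert(!needsWait(3, 7, 8)); // above UB: nothing pending at that score
}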
+// Where there are multiple types of events in the bracket of a counter,
+// the decrement may go out of order.
+bool BlockWaitcntBrackets::counterOutOfOrder(InstCounterType T) {
+ switch (T) {
+ case VM_CNT:
+ return false;
+ case LGKM_CNT: {
+ if (EventUBs[SMEM_ACCESS] > ScoreLBs[LGKM_CNT] &&
+ EventUBs[SMEM_ACCESS] <= ScoreUBs[LGKM_CNT]) {
+ // Scalar memory reads can always go out of order.
+ return true;
+ }
+ int NumEventTypes = 0;
+ if (EventUBs[LDS_ACCESS] > ScoreLBs[LGKM_CNT] &&
+ EventUBs[LDS_ACCESS] <= ScoreUBs[LGKM_CNT]) {
+ NumEventTypes++;
+ }
+ if (EventUBs[GDS_ACCESS] > ScoreLBs[LGKM_CNT] &&
+ EventUBs[GDS_ACCESS] <= ScoreUBs[LGKM_CNT]) {
+ NumEventTypes++;
+ }
+ if (EventUBs[SQ_MESSAGE] > ScoreLBs[LGKM_CNT] &&
+ EventUBs[SQ_MESSAGE] <= ScoreUBs[LGKM_CNT]) {
+ NumEventTypes++;
+ }
+ if (NumEventTypes <= 1) {
+ return false;
+ }
+ break;
+ }
+ case EXP_CNT: {
+ // If there has been a mixture of export types, then a waitcnt exp(0) is
+ // required.
+ if (MixedExpTypes)
+ return true;
+ int NumEventTypes = 0;
+ if (EventUBs[EXP_GPR_LOCK] > ScoreLBs[EXP_CNT] &&
+ EventUBs[EXP_GPR_LOCK] <= ScoreUBs[EXP_CNT]) {
+ NumEventTypes++;
+ }
+ if (EventUBs[GDS_GPR_LOCK] > ScoreLBs[EXP_CNT] &&
+ EventUBs[GDS_GPR_LOCK] <= ScoreUBs[EXP_CNT]) {
+ NumEventTypes++;
+ }
+ if (EventUBs[VMW_GPR_LOCK] > ScoreLBs[EXP_CNT] &&
+ EventUBs[VMW_GPR_LOCK] <= ScoreUBs[EXP_CNT]) {
+ NumEventTypes++;
+ }
+ if (EventUBs[EXP_PARAM_ACCESS] > ScoreLBs[EXP_CNT] &&
+ EventUBs[EXP_PARAM_ACCESS] <= ScoreUBs[EXP_CNT]) {
+ NumEventTypes++;
+ }
+
+ if (EventUBs[EXP_POS_ACCESS] > ScoreLBs[EXP_CNT] &&
+ EventUBs[EXP_POS_ACCESS] <= ScoreUBs[EXP_CNT]) {
+ NumEventTypes++;
+ }
+
+ if (NumEventTypes <= 1) {
+ return false;
+ }
+ break;
+ }
+ default:
+ break;
+ }
+ return true;
+}
+
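counterOutOfOrder's LGKM and EXP cases both reduce to counting how many event kinds are still in flight inside the bracket; more than one means the decrements can interleave. A hypothetical reduction of that counting idiom:

#include <cassert>

// An event kind is "in flight" when its upper bound sits inside (LB, UB].
bool inFlight(int EventUB, int LB, int UB) {
  return EventUB > LB && EventUB <= UB;
}

int main() {
  int LB = 0, UB = 10;
  int LdsUB = 4, GdsUB = 0, MsgUB = 9; // per-event upper bounds (sample values)
  int Kinds = inFlight(LdsUB, LB, UB) + inFlight(GdsUB, LB, UB) +
              inFlight(MsgUB, LB, UB);
  assert(Kinds == 2); // two kinds in flight -> decrements may interleave
}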
+INITIALIZE_PASS_BEGIN(SIInsertWaitcnts, DEBUG_TYPE, "SI Insert Waitcnts", false,
+ false)
+INITIALIZE_PASS_END(SIInsertWaitcnts, DEBUG_TYPE, "SI Insert Waitcnts", false,
+ false)
+
+char SIInsertWaitcnts::ID = 0;
+
+char &llvm::SIInsertWaitcntsID = SIInsertWaitcnts::ID;
+
+FunctionPass *llvm::createSIInsertWaitcntsPass() {
+ return new SIInsertWaitcnts();
+}
+
+static bool readsVCCZ(const MachineInstr &MI) {
+ unsigned Opc = MI.getOpcode();
+ return (Opc == AMDGPU::S_CBRANCH_VCCNZ || Opc == AMDGPU::S_CBRANCH_VCCZ) &&
+ !MI.getOperand(1).isUndef();
+}
+
+/// \brief Generate s_waitcnt instruction to be placed before cur_Inst.
+/// Instructions of a given type are returned in order,
+/// but instructions of different types can complete out of order.
+/// We rely on this in-order completion
+/// and simply assign a score to the memory access instructions.
+/// We keep track of the active "score bracket" to determine
+/// if an access of a memory read requires an s_waitcnt
+/// and if so what the value of each counter is.
+/// The "score bracket" is bound by the lower bound and upper bound
+/// scores (*_score_LB and *_score_ub respectively).
+MachineInstr *SIInsertWaitcnts::generateSWaitCntInstBefore(
+ MachineInstr &MI, BlockWaitcntBrackets *ScoreBrackets) {
+ // To emit, or not to emit - that's the question!
+ // Start with an assumption that there is no need to emit.
+ unsigned int EmitSwaitcnt = 0;
+ // s_waitcnt instruction to return; default is NULL.
+ MachineInstr *SWaitInst = nullptr;
+ // No need to wait before a phi. If a phi-move exists, then the wait should
+ // have been inserted before the move. If a phi-move does not exist, then
+ // the wait should be inserted before the real use. The same is true for
+ // sc-merge. It is not a coincidence that all these cases correspond to the
+ // instructions that are skipped in the assembling loop.
+ bool NeedLineMapping = false; // TODO: Check on this.
+ if (MI.isDebugValue() &&
+ // TODO: any other opcode?
+ !NeedLineMapping) {
+ return SWaitInst;
+ }
+
+ // See if an s_waitcnt is forced at block entry, or is needed at
+ // program end.
+ if (ScoreBrackets->getWaitAtBeginning()) {
+ // Note that we have already cleared the state, so we don't need to update
+ // it.
+ ScoreBrackets->clearWaitAtBeginning();
+ for (enum InstCounterType T = VM_CNT; T < NUM_INST_CNTS;
+ T = (enum InstCounterType)(T + 1)) {
+ EmitSwaitcnt |= CNT_MASK(T);
+ ScoreBrackets->setScoreLB(T, ScoreBrackets->getScoreUB(T));
+ }
+ }
+
+ // See if this instruction has a forced S_WAITCNT VM.
+ // TODO: Handle other cases of NeedsWaitcntVmBefore()
+ else if (MI.getOpcode() == AMDGPU::BUFFER_WBINVL1 ||
+ MI.getOpcode() == AMDGPU::BUFFER_WBINVL1_SC ||
+ MI.getOpcode() == AMDGPU::BUFFER_WBINVL1_VOL) {
+ EmitSwaitcnt |=
+ ScoreBrackets->updateByWait(VM_CNT, ScoreBrackets->getScoreUB(VM_CNT));
+ }
+
+ // All waits must be resolved at call return.
+ // NOTE: this could be improved with knowledge of all call sites or
+ // with knowledge of the called routines.
+ if (MI.getOpcode() == AMDGPU::RETURN ||
+ MI.getOpcode() == AMDGPU::SI_RETURN_TO_EPILOG) {
+ for (enum InstCounterType T = VM_CNT; T < NUM_INST_CNTS;
+ T = (enum InstCounterType)(T + 1)) {
+ if (ScoreBrackets->getScoreUB(T) > ScoreBrackets->getScoreLB(T)) {
+ ScoreBrackets->setScoreLB(T, ScoreBrackets->getScoreUB(T));
+ EmitSwaitcnt |= CNT_MASK(T);
+ }
+ }
+ }
+ // Resolve vm waits before gs-done.
+ else if ((MI.getOpcode() == AMDGPU::S_SENDMSG ||
+ MI.getOpcode() == AMDGPU::S_SENDMSGHALT) &&
+ ((MI.getOperand(0).getImm() & AMDGPU::SendMsg::ID_MASK_) ==
+ AMDGPU::SendMsg::ID_GS_DONE)) {
+ if (ScoreBrackets->getScoreUB(VM_CNT) > ScoreBrackets->getScoreLB(VM_CNT)) {
+ ScoreBrackets->setScoreLB(VM_CNT, ScoreBrackets->getScoreUB(VM_CNT));
+ EmitSwaitcnt |= CNT_MASK(VM_CNT);
+ }
+ }
+#if 0 // TODO: the following blocks of logic when we have fence.
+ else if (MI.getOpcode() == SC_FENCE) {
+ const unsigned int group_size =
+ context->shader_info->GetMaxThreadGroupSize();
+ // group_size == 0 means thread group size is unknown at compile time
+ const bool group_is_multi_wave =
+ (group_size == 0 || group_size > target_info->GetWaveFrontSize());
+ const bool fence_is_global = !((SCInstInternalMisc*)Inst)->IsGroupFence();
+
+ for (unsigned int i = 0; i < Inst->NumSrcOperands(); i++) {
+ SCRegType src_type = Inst->GetSrcType(i);
+ switch (src_type) {
+ case SCMEM_LDS:
+ if (group_is_multi_wave ||
+ context->OptFlagIsOn(OPT_R1100_LDSMEM_FENCE_CHICKEN_BIT)) {
+ EmitSwaitcnt |= ScoreBrackets->updateByWait(LGKM_CNT,
+ ScoreBrackets->getScoreUB(LGKM_CNT));
+ // LDS may have to wait for VM_CNT after buffer load to LDS
+ if (target_info->HasBufferLoadToLDS()) {
+ EmitSwaitcnt |= ScoreBrackets->updateByWait(VM_CNT,
+ ScoreBrackets->getScoreUB(VM_CNT));
+ }
+ }
+ break;
+
+ case SCMEM_GDS:
+ if (group_is_multi_wave || fence_is_global) {
+ EmitSwaitcnt |= ScoreBrackets->updateByWait(EXP_CNT,
+ ScoreBrackets->getScoreUB(EXP_CNT));
+ EmitSwaitcnt |= ScoreBrackets->updateByWait(LGKM_CNT,
+ ScoreBrackets->getScoreUB(LGKM_CNT));
+ }
+ break;
+
+ case SCMEM_UAV:
+ case SCMEM_TFBUF:
+ case SCMEM_RING:
+ case SCMEM_SCATTER:
+ if (group_is_multi_wave || fence_is_global) {
+ EmitSwaitcnt |= ScoreBrackets->updateByWait(EXP_CNT,
+ ScoreBrackets->getScoreUB(EXP_CNT));
+ EmitSwaitcnt |= ScoreBrackets->updateByWait(VM_CNT,
+ ScoreBrackets->getScoreUB(VM_CNT));
+ }
+ break;
+
+ case SCMEM_SCRATCH:
+ default:
+ break;
+ }
+ }
+ }
+#endif
+
+ // Export & GDS instructions do not read the EXEC mask until after the export
+ // is granted (which can occur well after the instruction is issued).
+ // The shader program must flush all EXP operations on the export-count
+ // before overwriting the EXEC mask.
+ else {
+ if (MI.modifiesRegister(AMDGPU::EXEC, TRI)) {
+ // Export and GDS are tracked individually, either may trigger a waitcnt
+ // for EXEC.
+ EmitSwaitcnt |= ScoreBrackets->updateByWait(
+ EXP_CNT, ScoreBrackets->getEventUB(EXP_GPR_LOCK));
+ EmitSwaitcnt |= ScoreBrackets->updateByWait(
+ EXP_CNT, ScoreBrackets->getEventUB(EXP_PARAM_ACCESS));
+ EmitSwaitcnt |= ScoreBrackets->updateByWait(
+ EXP_CNT, ScoreBrackets->getEventUB(EXP_POS_ACCESS));
+ EmitSwaitcnt |= ScoreBrackets->updateByWait(
+ EXP_CNT, ScoreBrackets->getEventUB(GDS_GPR_LOCK));
+ }
+
+#if 0 // TODO: the following code to handle CALL.
+ // The argument passing for CALLs should suffice for VM_CNT and LGKM_CNT.
+ // However, there is a problem with EXP_CNT, because the call cannot
+ // easily tell if a register is used in the function, and if it did, then
+ // the referring instruction would have to have an S_WAITCNT, which is
+ // dependent on all call sites. So Instead, force S_WAITCNT for EXP_CNTs
+ // before the call.
+ if (MI.getOpcode() == SC_CALL) {
+ if (ScoreBrackets->getScoreUB(EXP_CNT) >
+ ScoreBrackets->getScoreLB(EXP_CNT)) {
+ ScoreBrackets->setScoreLB(EXP_CNT, ScoreBrackets->getScoreUB(EXP_CNT));
+ EmitSwaitcnt |= CNT_MASK(EXP_CNT);
+ }
+ }
+#endif
+
+ // Look at the source operands of every instruction to see if
+ // any of them results from a previous memory operation that affects
+ // its current usage. If so, an s_waitcnt instruction needs to be
+ // emitted.
+ // If the source operand was defined by a load, add the s_waitcnt
+ // instruction.
+ for (const MachineMemOperand *Memop : MI.memoperands()) {
+ unsigned AS = Memop->getAddrSpace();
+ if (AS != AMDGPUASI.LOCAL_ADDRESS)
+ continue;
+ unsigned RegNo = SQ_MAX_PGM_VGPRS + EXTRA_VGPR_LDS;
+ // VM_CNT is only relevant to vgpr or LDS.
+ EmitSwaitcnt |= ScoreBrackets->updateByWait(
+ VM_CNT, ScoreBrackets->getRegScore(RegNo, VM_CNT));
+ }
+ for (unsigned I = 0, E = MI.getNumOperands(); I != E; ++I) {
+ const MachineOperand &Op = MI.getOperand(I);
+ const MachineRegisterInfo &MRIA = *MRI;
+ RegInterval Interval =
+ ScoreBrackets->getRegInterval(&MI, TII, MRI, TRI, I, false);
+ for (signed RegNo = Interval.first; RegNo < Interval.second; ++RegNo) {
+ if (TRI->isVGPR(MRIA, Op.getReg())) {
+ // VM_CNT is only relevant to vgpr or LDS.
+ EmitSwaitcnt |= ScoreBrackets->updateByWait(
+ VM_CNT, ScoreBrackets->getRegScore(RegNo, VM_CNT));
+ }
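+        // LGKM_CNT events may also write sgprs (e.g. smem loads), so check
+        // LGKM_CNT for every register in the interval, not just vgprs.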
+ EmitSwaitcnt |= ScoreBrackets->updateByWait(
+ LGKM_CNT, ScoreBrackets->getRegScore(RegNo, LGKM_CNT));
+ }
+ }
+ // End of for loop that looks at all source operands to decide vm_wait_cnt
+ // and lgk_wait_cnt.
+
+    // Two cases are handled for destination operands:
+    // 1) If the destination operand was defined by a load, add the s_waitcnt
+    // instruction to guarantee the right WAW order.
+    // 2) If a destination operand was used by a recent export/store
+    // instruction, add an s_waitcnt on exp_cnt to guarantee the WAR order.
+ if (MI.mayStore()) {
+ for (const MachineMemOperand *Memop : MI.memoperands()) {
+ unsigned AS = Memop->getAddrSpace();
+ if (AS != AMDGPUASI.LOCAL_ADDRESS)
+ continue;
+ unsigned RegNo = SQ_MAX_PGM_VGPRS + EXTRA_VGPR_LDS;
+ EmitSwaitcnt |= ScoreBrackets->updateByWait(
+ VM_CNT, ScoreBrackets->getRegScore(RegNo, VM_CNT));
+ EmitSwaitcnt |= ScoreBrackets->updateByWait(
+ EXP_CNT, ScoreBrackets->getRegScore(RegNo, EXP_CNT));
+ }
+ }
+ for (unsigned I = 0, E = MI.getNumOperands(); I != E; ++I) {
+ MachineOperand &Def = MI.getOperand(I);
+ const MachineRegisterInfo &MRIA = *MRI;
+ RegInterval Interval =
+ ScoreBrackets->getRegInterval(&MI, TII, MRI, TRI, I, true);
+ for (signed RegNo = Interval.first; RegNo < Interval.second; ++RegNo) {
+ if (TRI->isVGPR(MRIA, Def.getReg())) {
+ EmitSwaitcnt |= ScoreBrackets->updateByWait(
+ VM_CNT, ScoreBrackets->getRegScore(RegNo, VM_CNT));
+ EmitSwaitcnt |= ScoreBrackets->updateByWait(
+ EXP_CNT, ScoreBrackets->getRegScore(RegNo, EXP_CNT));
+ }
+ EmitSwaitcnt |= ScoreBrackets->updateByWait(
+ LGKM_CNT, ScoreBrackets->getRegScore(RegNo, LGKM_CNT));
+ }
+ } // End of for loop that looks at all dest operands.
+ }
+
+ // TODO: Tie force zero to a compiler triage option.
+ bool ForceZero = false;
+
+  // Check to see if this is an S_BARRIER on a subtarget that requires an
+  // implicit S_WAITCNT 0 before it. Handling that wait here prevents any
+  // additional S_WAITCNTs from being emitted for an instruction that was
+  // already marked as requiring a WAITCNT beforehand.
+ if (MI.getOpcode() == AMDGPU::S_BARRIER && ST->needWaitcntBeforeBarrier()) {
+ EmitSwaitcnt |=
+ ScoreBrackets->updateByWait(VM_CNT, ScoreBrackets->getScoreUB(VM_CNT));
+ EmitSwaitcnt |= ScoreBrackets->updateByWait(
+ EXP_CNT, ScoreBrackets->getScoreUB(EXP_CNT));
+ EmitSwaitcnt |= ScoreBrackets->updateByWait(
+ LGKM_CNT, ScoreBrackets->getScoreUB(LGKM_CNT));
+ }
+
+ // TODO: Remove this work-around, enable the assert for Bug 457939
+ // after fixing the scheduler. Also, the Shader Compiler code is
+ // independent of target.
+ if (readsVCCZ(MI) && ST->getGeneration() <= SISubtarget::SEA_ISLANDS) {
+ if (ScoreBrackets->getScoreLB(LGKM_CNT) <
+ ScoreBrackets->getScoreUB(LGKM_CNT) &&
+ ScoreBrackets->hasPendingSMEM()) {
+ // Wait on everything, not just LGKM. vccz reads usually come from
+ // terminators, and we always wait on everything at the end of the
+ // block, so if we only wait on LGKM here, we might end up with
+ // another s_waitcnt inserted right after this if there are non-LGKM
+ // instructions still outstanding.
+ ForceZero = true;
+ EmitSwaitcnt = true;
+ }
+ }
+
+ // Does this operand processing indicate s_wait counter update?
+ if (EmitSwaitcnt) {
+ int CntVal[NUM_INST_CNTS];
+
+ bool UseDefaultWaitcntStrategy = true;
+ if (ForceZero) {
+ // Force all waitcnts to 0.
+ for (enum InstCounterType T = VM_CNT; T < NUM_INST_CNTS;
+ T = (enum InstCounterType)(T + 1)) {
+ ScoreBrackets->setScoreLB(T, ScoreBrackets->getScoreUB(T));
+ }
+ CntVal[VM_CNT] = 0;
+ CntVal[EXP_CNT] = 0;
+ CntVal[LGKM_CNT] = 0;
+ UseDefaultWaitcntStrategy = false;
+ }
+
+ if (UseDefaultWaitcntStrategy) {
+ for (enum InstCounterType T = VM_CNT; T < NUM_INST_CNTS;
+ T = (enum InstCounterType)(T + 1)) {
+ if (EmitSwaitcnt & CNT_MASK(T)) {
+ int Delta =
+ ScoreBrackets->getScoreUB(T) - ScoreBrackets->getScoreLB(T);
+ int MaxDelta = ScoreBrackets->getWaitCountMax(T);
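+          // If the span is at least the hardware maximum, a wait for a
+          // count this large is trivially satisfied, so the counter is
+          // dropped from the wait (encoded as -1, "don't care").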
+ if (Delta >= MaxDelta) {
+ Delta = -1;
+ if (T != EXP_CNT) {
+ ScoreBrackets->setScoreLB(
+ T, ScoreBrackets->getScoreUB(T) - MaxDelta);
+ }
+ EmitSwaitcnt &= ~CNT_MASK(T);
+ }
+ CntVal[T] = Delta;
+ } else {
+          // If we are not waiting for a particular counter then encode it
+          // as -1, which means "don't care."
+ CntVal[T] = -1;
+ }
+ }
+ }
+
+    // If we are not waiting on any counter, we can skip the wait altogether.
+ if (EmitSwaitcnt != 0) {
+ MachineInstr *OldWaitcnt = ScoreBrackets->getWaitcnt();
+ int Imm = (!OldWaitcnt) ? 0 : OldWaitcnt->getOperand(0).getImm();
+ if (!OldWaitcnt || (AMDGPU::decodeVmcnt(IV, Imm) !=
+ (CntVal[VM_CNT] & AMDGPU::getVmcntBitMask(IV))) ||
+ (AMDGPU::decodeExpcnt(IV, Imm) !=
+ (CntVal[EXP_CNT] & AMDGPU::getExpcntBitMask(IV))) ||
+ (AMDGPU::decodeLgkmcnt(IV, Imm) !=
+ (CntVal[LGKM_CNT] & AMDGPU::getLgkmcntBitMask(IV)))) {
+ MachineLoop *ContainingLoop = MLI->getLoopFor(MI.getParent());
+ if (ContainingLoop) {
+ MachineBasicBlock *TBB = ContainingLoop->getTopBlock();
+ BlockWaitcntBrackets *ScoreBracket =
+ BlockWaitcntBracketsMap[TBB].get();
+ if (!ScoreBracket) {
+ assert(BlockVisitedSet.find(TBB) == BlockVisitedSet.end());
+ BlockWaitcntBracketsMap[TBB] = make_unique<BlockWaitcntBrackets>();
+ ScoreBracket = BlockWaitcntBracketsMap[TBB].get();
+ }
+ ScoreBracket->setRevisitLoop(true);
+ DEBUG(dbgs() << "set-revisit: block"
+ << ContainingLoop->getTopBlock()->getNumber() << '\n';);
+ }
+ }
+
+      // Update an existing waitcnt, or make a new one.
+ MachineFunction &MF = *MI.getParent()->getParent();
+ if (OldWaitcnt && OldWaitcnt->getOpcode() != AMDGPU::S_WAITCNT) {
+ SWaitInst = OldWaitcnt;
+ } else {
+ SWaitInst = MF.CreateMachineInstr(TII->get(AMDGPU::S_WAITCNT),
+ MI.getDebugLoc());
+ CompilerGeneratedWaitcntSet.insert(SWaitInst);
+ }
+
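+      // Any CntVal of -1 is masked down to the counter's all-ones pattern
+      // by encodeWaitcnt; the hardware treats the maximum encodable count
+      // as "no wait" for that counter.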
+ const MachineOperand &Op =
+ MachineOperand::CreateImm(AMDGPU::encodeWaitcnt(
+ IV, CntVal[VM_CNT], CntVal[EXP_CNT], CntVal[LGKM_CNT]));
+ SWaitInst->addOperand(MF, Op);
+
+ if (CntVal[EXP_CNT] == 0) {
+ ScoreBrackets->setMixedExpTypes(false);
+ }
+ }
+ }
+
+ return SWaitInst;
+}
+
+void SIInsertWaitcnts::insertWaitcntBeforeCF(MachineBasicBlock &MBB,
+ MachineInstr *Waitcnt) {
+ if (MBB.empty()) {
+ MBB.push_back(Waitcnt);
+ return;
+ }
+
+ MachineBasicBlock::iterator It = MBB.end();
+ MachineInstr *MI = &*(--It);
+ if (MI->isBranch()) {
+ MBB.insert(It, Waitcnt);
+ } else {
+ MBB.push_back(Waitcnt);
+ }
+
+ return;
+}
+
+void SIInsertWaitcnts::updateEventWaitCntAfter(
+ MachineInstr &Inst, BlockWaitcntBrackets *ScoreBrackets) {
+ // Now look at the instruction opcode. If it is a memory access
+ // instruction, update the upper-bound of the appropriate counter's
+ // bracket and the destination operand scores.
+ // TODO: Use the (TSFlags & SIInstrFlags::LGKM_CNT) property everywhere.
+ if (TII->isDS(Inst) && (Inst.mayLoad() || Inst.mayStore())) {
+ if (TII->getNamedOperand(Inst, AMDGPU::OpName::gds)->getImm() != 0) {
+ ScoreBrackets->updateByEvent(TII, TRI, MRI, GDS_ACCESS, Inst);
+ ScoreBrackets->updateByEvent(TII, TRI, MRI, GDS_GPR_LOCK, Inst);
+ } else {
+ ScoreBrackets->updateByEvent(TII, TRI, MRI, LDS_ACCESS, Inst);
+ }
+ } else if (TII->isFLAT(Inst)) {
+ assert(Inst.mayLoad() || Inst.mayStore());
+ ScoreBrackets->updateByEvent(TII, TRI, MRI, VMEM_ACCESS, Inst);
+ ScoreBrackets->updateByEvent(TII, TRI, MRI, LDS_ACCESS, Inst);
+
+    // This is a flat memory operation. Check to see if it has memory
+    // tokens for both LDS and Memory, and if so mark it as a flat access.
+ bool FoundLDSMem = false;
+ for (const MachineMemOperand *Memop : Inst.memoperands()) {
+ unsigned AS = Memop->getAddrSpace();
+ if (AS == AMDGPUASI.LOCAL_ADDRESS || AS == AMDGPUASI.FLAT_ADDRESS)
+ FoundLDSMem = true;
+ }
+
+    // This is a flat memory operation, so note it; it will require both
+    // the VM and LGKM counters to be flushed to zero if it is pending when
+    // a VM or LGKM dependency occurs.
+ if (FoundLDSMem) {
+ ScoreBrackets->setPendingFlat();
+ }
+ } else if (SIInstrInfo::isVMEM(Inst) &&
+ // TODO: get a better carve out.
+ Inst.getOpcode() != AMDGPU::BUFFER_WBINVL1 &&
+ Inst.getOpcode() != AMDGPU::BUFFER_WBINVL1_SC &&
+ Inst.getOpcode() != AMDGPU::BUFFER_WBINVL1_VOL) {
+ ScoreBrackets->updateByEvent(TII, TRI, MRI, VMEM_ACCESS, Inst);
+ if ( // TODO: assumed yes -- target_info->MemWriteNeedsExpWait() &&
+ (Inst.mayStore() || AMDGPU::getAtomicNoRetOp(Inst.getOpcode()))) {
+ ScoreBrackets->updateByEvent(TII, TRI, MRI, VMW_GPR_LOCK, Inst);
+ }
+ } else if (TII->isSMRD(Inst)) {
+ ScoreBrackets->updateByEvent(TII, TRI, MRI, SMEM_ACCESS, Inst);
+ } else {
+ switch (Inst.getOpcode()) {
+ case AMDGPU::S_SENDMSG:
+ case AMDGPU::S_SENDMSGHALT:
+ ScoreBrackets->updateByEvent(TII, TRI, MRI, SQ_MESSAGE, Inst);
+ break;
+ case AMDGPU::EXP:
+ case AMDGPU::EXP_DONE: {
+ int Imm = TII->getNamedOperand(Inst, AMDGPU::OpName::tgt)->getImm();
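+      // Export targets 32..63 are parameters and targets 12..15 are
+      // positions; any other target takes the generic export GPR lock.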
+ if (Imm >= 32 && Imm <= 63)
+ ScoreBrackets->updateByEvent(TII, TRI, MRI, EXP_PARAM_ACCESS, Inst);
+ else if (Imm >= 12 && Imm <= 15)
+ ScoreBrackets->updateByEvent(TII, TRI, MRI, EXP_POS_ACCESS, Inst);
+ else
+ ScoreBrackets->updateByEvent(TII, TRI, MRI, EXP_GPR_LOCK, Inst);
+ break;
+ }
+ case AMDGPU::S_MEMTIME:
+ case AMDGPU::S_MEMREALTIME:
+ ScoreBrackets->updateByEvent(TII, TRI, MRI, SMEM_ACCESS, Inst);
+ break;
+ default:
+ break;
+ }
+ }
+}
+
+void SIInsertWaitcnts::mergeInputScoreBrackets(MachineBasicBlock &Block) {
+ BlockWaitcntBrackets *ScoreBrackets = BlockWaitcntBracketsMap[&Block].get();
+ int32_t MaxPending[NUM_INST_CNTS] = {0};
+ int32_t MaxFlat[NUM_INST_CNTS] = {0};
+ bool MixedExpTypes = false;
+
+ // Clear the score bracket state.
+ ScoreBrackets->clear();
+
+ // Compute the number of pending elements on block entry.
+
+ // IMPORTANT NOTE: If iterative handling of loops is added, the code will
+ // need to handle single BBs with backedges to themselves. This means that
+ // they will need to retain and not clear their initial state.
+
+ // See if there are any uninitialized predecessors. If so, emit an
+ // s_waitcnt 0 at the beginning of the block.
+ for (MachineBasicBlock *pred : Block.predecessors()) {
+ BlockWaitcntBrackets *PredScoreBrackets =
+ BlockWaitcntBracketsMap[pred].get();
+ bool Visited = BlockVisitedSet.find(pred) != BlockVisitedSet.end();
+ if (!Visited || PredScoreBrackets->getWaitAtBeginning()) {
+ break;
+ }
+ for (enum InstCounterType T = VM_CNT; T < NUM_INST_CNTS;
+ T = (enum InstCounterType)(T + 1)) {
+ int span =
+ PredScoreBrackets->getScoreUB(T) - PredScoreBrackets->getScoreLB(T);
+ MaxPending[T] = std::max(MaxPending[T], span);
+ span =
+ PredScoreBrackets->pendingFlat(T) - PredScoreBrackets->getScoreLB(T);
+ MaxFlat[T] = std::max(MaxFlat[T], span);
+ }
+
+ MixedExpTypes |= PredScoreBrackets->mixedExpTypes();
+ }
+
+ // TODO: Is SC Block->IsMainExit() same as Block.succ_empty()?
+ // Also handle kills for exit block.
+ if (Block.succ_empty() && !KillWaitBrackets.empty()) {
+ for (unsigned int I = 0; I < KillWaitBrackets.size(); I++) {
+ for (enum InstCounterType T = VM_CNT; T < NUM_INST_CNTS;
+ T = (enum InstCounterType)(T + 1)) {
+ int Span = KillWaitBrackets[I]->getScoreUB(T) -
+ KillWaitBrackets[I]->getScoreLB(T);
+ MaxPending[T] = std::max(MaxPending[T], Span);
+ Span = KillWaitBrackets[I]->pendingFlat(T) -
+ KillWaitBrackets[I]->getScoreLB(T);
+ MaxFlat[T] = std::max(MaxFlat[T], Span);
+ }
+
+ MixedExpTypes |= KillWaitBrackets[I]->mixedExpTypes();
+ }
+ }
+
+ // Special handling for GDS_GPR_LOCK and EXP_GPR_LOCK.
+ for (MachineBasicBlock *Pred : Block.predecessors()) {
+ BlockWaitcntBrackets *PredScoreBrackets =
+ BlockWaitcntBracketsMap[Pred].get();
+ bool Visited = BlockVisitedSet.find(Pred) != BlockVisitedSet.end();
+ if (!Visited || PredScoreBrackets->getWaitAtBeginning()) {
+ break;
+ }
+
+ int GDSSpan = PredScoreBrackets->getEventUB(GDS_GPR_LOCK) -
+ PredScoreBrackets->getScoreLB(EXP_CNT);
+ MaxPending[EXP_CNT] = std::max(MaxPending[EXP_CNT], GDSSpan);
+ int EXPSpan = PredScoreBrackets->getEventUB(EXP_GPR_LOCK) -
+ PredScoreBrackets->getScoreLB(EXP_CNT);
+ MaxPending[EXP_CNT] = std::max(MaxPending[EXP_CNT], EXPSpan);
+ }
+
+ // TODO: Is SC Block->IsMainExit() same as Block.succ_empty()?
+ if (Block.succ_empty() && !KillWaitBrackets.empty()) {
+ for (unsigned int I = 0; I < KillWaitBrackets.size(); I++) {
+ int GDSSpan = KillWaitBrackets[I]->getEventUB(GDS_GPR_LOCK) -
+ KillWaitBrackets[I]->getScoreLB(EXP_CNT);
+ MaxPending[EXP_CNT] = std::max(MaxPending[EXP_CNT], GDSSpan);
+ int EXPSpan = KillWaitBrackets[I]->getEventUB(EXP_GPR_LOCK) -
+ KillWaitBrackets[I]->getScoreLB(EXP_CNT);
+ MaxPending[EXP_CNT] = std::max(MaxPending[EXP_CNT], EXPSpan);
+ }
+ }
+
+#if 0
+  // Unlike SC, LC does not add a waitcnt at the beginning of a block.
+  // Leaving this here as a marker.
+ // TODO: how does LC distinguish between function entry and main entry?
+ // If this is the entry to a function, force a wait.
+ MachineBasicBlock &Entry = Block.getParent()->front();
+ if (Entry.getNumber() == Block.getNumber()) {
+ ScoreBrackets->setWaitAtBeginning();
+ return;
+ }
+#endif
+
+ // Now set the current Block's brackets to the largest ending bracket.
+ for (enum InstCounterType T = VM_CNT; T < NUM_INST_CNTS;
+ T = (enum InstCounterType)(T + 1)) {
+ ScoreBrackets->setScoreUB(T, MaxPending[T]);
+ ScoreBrackets->setScoreLB(T, 0);
+ ScoreBrackets->setLastFlat(T, MaxFlat[T]);
+ }
+
+ ScoreBrackets->setMixedExpTypes(MixedExpTypes);
+
+ // Set the register scoreboard.
+ for (MachineBasicBlock *Pred : Block.predecessors()) {
+ if (BlockVisitedSet.find(Pred) == BlockVisitedSet.end()) {
+ break;
+ }
+
+ BlockWaitcntBrackets *PredScoreBrackets =
+ BlockWaitcntBracketsMap[Pred].get();
+
+ // Now merge the gpr_reg_score information
+ for (enum InstCounterType T = VM_CNT; T < NUM_INST_CNTS;
+ T = (enum InstCounterType)(T + 1)) {
+ int PredLB = PredScoreBrackets->getScoreLB(T);
+ int PredUB = PredScoreBrackets->getScoreUB(T);
+ if (PredLB < PredUB) {
+ int PredScale = MaxPending[T] - PredUB;
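+        // Rebase the predecessor's scores so that its upper bound lines up
+        // with the merged MaxPending value; e.g. a predecessor ending at
+        // UB = 5 merging into MaxPending[T] = 8 has every live score
+        // shifted up by 3.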
+ // Merge vgpr scores.
+ for (int J = 0; J <= PredScoreBrackets->getMaxVGPR(); J++) {
+ int PredRegScore = PredScoreBrackets->getRegScore(J, T);
+ if (PredRegScore <= PredLB)
+ continue;
+ int NewRegScore = PredScale + PredRegScore;
+ ScoreBrackets->setRegScore(
+ J, T, std::max(ScoreBrackets->getRegScore(J, T), NewRegScore));
+ }
+ // Also need to merge sgpr scores for lgkm_cnt.
+ if (T == LGKM_CNT) {
+ for (int J = 0; J <= PredScoreBrackets->getMaxSGPR(); J++) {
+ int PredRegScore =
+ PredScoreBrackets->getRegScore(J + NUM_ALL_VGPRS, LGKM_CNT);
+ if (PredRegScore <= PredLB)
+ continue;
+ int NewRegScore = PredScale + PredRegScore;
+ ScoreBrackets->setRegScore(
+ J + NUM_ALL_VGPRS, LGKM_CNT,
+ std::max(
+ ScoreBrackets->getRegScore(J + NUM_ALL_VGPRS, LGKM_CNT),
+ NewRegScore));
+ }
+ }
+ }
+ }
+
+ // Also merge the WaitEvent information.
+ ForAllWaitEventType(W) {
+ enum InstCounterType T = PredScoreBrackets->eventCounter(W);
+ int PredEventUB = PredScoreBrackets->getEventUB(W);
+ if (PredEventUB > PredScoreBrackets->getScoreLB(T)) {
+ int NewEventUB =
+ MaxPending[T] + PredEventUB - PredScoreBrackets->getScoreUB(T);
+ if (NewEventUB > 0) {
+ ScoreBrackets->setEventUB(
+ W, std::max(ScoreBrackets->getEventUB(W), NewEventUB));
+ }
+ }
+ }
+ }
+
+ // TODO: Is SC Block->IsMainExit() same as Block.succ_empty()?
+ // Set the register scoreboard.
+ if (Block.succ_empty() && !KillWaitBrackets.empty()) {
+ for (unsigned int I = 0; I < KillWaitBrackets.size(); I++) {
+ // Now merge the gpr_reg_score information.
+ for (enum InstCounterType T = VM_CNT; T < NUM_INST_CNTS;
+ T = (enum InstCounterType)(T + 1)) {
+ int PredLB = KillWaitBrackets[I]->getScoreLB(T);
+ int PredUB = KillWaitBrackets[I]->getScoreUB(T);
+ if (PredLB < PredUB) {
+ int PredScale = MaxPending[T] - PredUB;
+ // Merge vgpr scores.
+ for (int J = 0; J <= KillWaitBrackets[I]->getMaxVGPR(); J++) {
+ int PredRegScore = KillWaitBrackets[I]->getRegScore(J, T);
+ if (PredRegScore <= PredLB)
+ continue;
+ int NewRegScore = PredScale + PredRegScore;
+ ScoreBrackets->setRegScore(
+ J, T, std::max(ScoreBrackets->getRegScore(J, T), NewRegScore));
+ }
+ // Also need to merge sgpr scores for lgkm_cnt.
+ if (T == LGKM_CNT) {
+ for (int J = 0; J <= KillWaitBrackets[I]->getMaxSGPR(); J++) {
+ int PredRegScore =
+ KillWaitBrackets[I]->getRegScore(J + NUM_ALL_VGPRS, LGKM_CNT);
+ if (PredRegScore <= PredLB)
+ continue;
+ int NewRegScore = PredScale + PredRegScore;
+ ScoreBrackets->setRegScore(
+ J + NUM_ALL_VGPRS, LGKM_CNT,
+ std::max(
+ ScoreBrackets->getRegScore(J + NUM_ALL_VGPRS, LGKM_CNT),
+ NewRegScore));
+ }
+ }
+ }
+ }
+
+ // Also merge the WaitEvent information.
+ ForAllWaitEventType(W) {
+ enum InstCounterType T = KillWaitBrackets[I]->eventCounter(W);
+ int PredEventUB = KillWaitBrackets[I]->getEventUB(W);
+ if (PredEventUB > KillWaitBrackets[I]->getScoreLB(T)) {
+ int NewEventUB =
+ MaxPending[T] + PredEventUB - KillWaitBrackets[I]->getScoreUB(T);
+ if (NewEventUB > 0) {
+ ScoreBrackets->setEventUB(
+ W, std::max(ScoreBrackets->getEventUB(W), NewEventUB));
+ }
+ }
+ }
+ }
+ }
+
+ // Special case handling of GDS_GPR_LOCK and EXP_GPR_LOCK. Merge this for the
+ // sequencing predecessors, because changes to EXEC require waitcnts due to
+ // the delayed nature of these operations.
+ for (MachineBasicBlock *Pred : Block.predecessors()) {
+ if (BlockVisitedSet.find(Pred) == BlockVisitedSet.end()) {
+ break;
+ }
+
+ BlockWaitcntBrackets *PredScoreBrackets =
+ BlockWaitcntBracketsMap[Pred].get();
+
+ int pred_gds_ub = PredScoreBrackets->getEventUB(GDS_GPR_LOCK);
+ if (pred_gds_ub > PredScoreBrackets->getScoreLB(EXP_CNT)) {
+ int new_gds_ub = MaxPending[EXP_CNT] + pred_gds_ub -
+ PredScoreBrackets->getScoreUB(EXP_CNT);
+ if (new_gds_ub > 0) {
+ ScoreBrackets->setEventUB(
+ GDS_GPR_LOCK,
+ std::max(ScoreBrackets->getEventUB(GDS_GPR_LOCK), new_gds_ub));
+ }
+ }
+ int pred_exp_ub = PredScoreBrackets->getEventUB(EXP_GPR_LOCK);
+ if (pred_exp_ub > PredScoreBrackets->getScoreLB(EXP_CNT)) {
+ int new_exp_ub = MaxPending[EXP_CNT] + pred_exp_ub -
+ PredScoreBrackets->getScoreUB(EXP_CNT);
+ if (new_exp_ub > 0) {
+ ScoreBrackets->setEventUB(
+ EXP_GPR_LOCK,
+ std::max(ScoreBrackets->getEventUB(EXP_GPR_LOCK), new_exp_ub));
+ }
+ }
+ }
+}
+
+/// Return the "bottom" block of a loop. This differs from
+/// MachineLoop::getBottomBlock in that it works even if the loop is
+/// discontiguous.
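+/// (After layout a loop's blocks need not be contiguous, so the member
+/// block with the highest block number is taken as the bottom.)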
+MachineBasicBlock *SIInsertWaitcnts::loopBottom(const MachineLoop *Loop) {
+ MachineBasicBlock *Bottom = Loop->getHeader();
+ for (MachineBasicBlock *MBB : Loop->blocks())
+ if (MBB->getNumber() > Bottom->getNumber())
+ Bottom = MBB;
+ return Bottom;
+}
+
+// Generate s_waitcnt instructions where needed.
+void SIInsertWaitcnts::insertWaitcntInBlock(MachineFunction &MF,
+ MachineBasicBlock &Block) {
+ // Initialize the state information.
+ mergeInputScoreBrackets(Block);
+
+ BlockWaitcntBrackets *ScoreBrackets = BlockWaitcntBracketsMap[&Block].get();
+
+ DEBUG({
+ dbgs() << "Block" << Block.getNumber();
+ ScoreBrackets->dump();
+ });
+
+ bool InsertNOP = false;
+
+ // Walk over the instructions.
+ for (MachineBasicBlock::iterator Iter = Block.begin(), E = Block.end();
+ Iter != E;) {
+ MachineInstr &Inst = *Iter;
+ // Remove any previously existing waitcnts.
+ if (Inst.getOpcode() == AMDGPU::S_WAITCNT) {
+ // TODO: Register the old waitcnt and optimize the following waitcnts.
+ // Leaving the previously existing waitcnts is conservatively correct.
+ if (CompilerGeneratedWaitcntSet.find(&Inst) ==
+ CompilerGeneratedWaitcntSet.end())
+ ++Iter;
+ else {
+ ScoreBrackets->setWaitcnt(&Inst);
+ ++Iter;
+ Inst.removeFromParent();
+ }
+ continue;
+ }
+
+ // Kill instructions generate a conditional branch to the endmain block.
+ // Merge the current waitcnt state into the endmain block information.
+ // TODO: Are there other flavors of KILL instruction?
+ if (Inst.getOpcode() == AMDGPU::KILL) {
+ addKillWaitBracket(ScoreBrackets);
+ }
+
+ bool VCCZBugWorkAround = false;
+ if (readsVCCZ(Inst) &&
+ (VCCZBugHandledSet.find(&Inst) == VCCZBugHandledSet.end())) {
+ if (ScoreBrackets->getScoreLB(LGKM_CNT) <
+ ScoreBrackets->getScoreUB(LGKM_CNT) &&
+ ScoreBrackets->hasPendingSMEM()) {
+ if (ST->getGeneration() <= SISubtarget::SEA_ISLANDS)
+ VCCZBugWorkAround = true;
+ }
+ }
+
+    // Generate an s_waitcnt instruction to be placed before the current
+    // instruction, if needed.
+ MachineInstr *SWaitInst = generateSWaitCntInstBefore(Inst, ScoreBrackets);
+
+ if (SWaitInst) {
+ Block.insert(Inst, SWaitInst);
+ if (ScoreBrackets->getWaitcnt() != SWaitInst) {
+ DEBUG(dbgs() << "insertWaitcntInBlock\n"
+ << "Old Instr: " << Inst << '\n'
+ << "New Instr: " << *SWaitInst << '\n';);
+ }
+ }
+
+ updateEventWaitCntAfter(Inst, ScoreBrackets);
+
+#if 0 // TODO: implement resource type check controlled by options with ub = LB.
+ // If this instruction generates a S_SETVSKIP because it is an
+ // indexed resource, and we are on Tahiti, then it will also force
+ // an S_WAITCNT vmcnt(0)
+ if (RequireCheckResourceType(Inst, context)) {
+ // Force the score to as if an S_WAITCNT vmcnt(0) is emitted.
+ ScoreBrackets->setScoreLB(VM_CNT,
+ ScoreBrackets->getScoreUB(VM_CNT));
+ }
+#endif
+
+ ScoreBrackets->clearWaitcnt();
+
+ if (SWaitInst) {
+ DEBUG({ SWaitInst->print(dbgs() << '\n'); });
+ }
+ DEBUG({
+ Inst.print(dbgs());
+ ScoreBrackets->dump();
+ });
+
+ // Check to see if this is a GWS instruction. If so, and if this is CI or
+ // VI, then the generated code sequence will include an S_WAITCNT 0.
+ // TODO: Are these the only GWS instructions?
+ if (Inst.getOpcode() == AMDGPU::DS_GWS_INIT ||
+ Inst.getOpcode() == AMDGPU::DS_GWS_SEMA_V ||
+ Inst.getOpcode() == AMDGPU::DS_GWS_SEMA_BR ||
+ Inst.getOpcode() == AMDGPU::DS_GWS_SEMA_P ||
+ Inst.getOpcode() == AMDGPU::DS_GWS_BARRIER) {
+ // TODO: && context->target_info->GwsRequiresMemViolTest() ) {
+ ScoreBrackets->updateByWait(VM_CNT, ScoreBrackets->getScoreUB(VM_CNT));
+ ScoreBrackets->updateByWait(EXP_CNT, ScoreBrackets->getScoreUB(EXP_CNT));
+ ScoreBrackets->updateByWait(LGKM_CNT,
+ ScoreBrackets->getScoreUB(LGKM_CNT));
+ }
+
+ // TODO: Remove this work-around after fixing the scheduler and enable the
+ // assert above.
+ if (VCCZBugWorkAround) {
+      // Restore the vccz bit. Any time a value is written to vcc, the vccz
+      // bit is updated, so we can restore the bit by reading the value of
+ // vcc and then writing it back to the register.
+ BuildMI(Block, Inst, Inst.getDebugLoc(), TII->get(AMDGPU::S_MOV_B64),
+ AMDGPU::VCC)
+ .addReg(AMDGPU::VCC);
+ VCCZBugHandledSet.insert(&Inst);
+ }
+
+ if (ST->getGeneration() >= SISubtarget::VOLCANIC_ISLANDS) {
+
+      // This avoids emitting an s_nop when a waitcnt has just been inserted.
+ if (!SWaitInst && InsertNOP) {
+ BuildMI(Block, Inst, DebugLoc(), TII->get(AMDGPU::S_NOP)).addImm(0);
+ }
+ InsertNOP = false;
+
+ // Any occurrence of consecutive VMEM or SMEM instructions forms a VMEM
+ // or SMEM clause, respectively.
+ //
+ // The temporary workaround is to break the clauses with S_NOP.
+ //
+ // The proper solution would be to allocate registers such that all source
+ // and destination registers don't overlap, e.g. this is illegal:
+ // r0 = load r2
+ // r2 = load r0
+ bool IsSMEM = false;
+ bool IsVMEM = false;
+ if (TII->isSMRD(Inst))
+ IsSMEM = true;
+ else if (TII->usesVM_CNT(Inst))
+ IsVMEM = true;
+
+ ++Iter;
+ if (Iter == E)
+ break;
+
+ MachineInstr &Next = *Iter;
+
+ // TODO: How about consecutive SMEM instructions?
+      // The comment above says to break the clause, but the code does not.
+ // if ((TII->isSMRD(next) && isSMEM) ||
+ if (!IsSMEM && TII->usesVM_CNT(Next) && IsVMEM &&
+ // TODO: Enable this check when hasSoftClause is upstreamed.
+ // ST->hasSoftClauses() &&
+ ST->isXNACKEnabled()) {
+ // Insert a NOP to break the clause.
+ InsertNOP = true;
+ continue;
+ }
+
+ // There must be "S_NOP 0" between an instruction writing M0 and
+ // S_SENDMSG.
+ if ((Next.getOpcode() == AMDGPU::S_SENDMSG ||
+ Next.getOpcode() == AMDGPU::S_SENDMSGHALT) &&
+ Inst.definesRegister(AMDGPU::M0))
+ InsertNOP = true;
+
+ continue;
+ }
+
+ ++Iter;
+ }
+
+ // Check if we need to force convergence at loop footer.
+ MachineLoop *ContainingLoop = MLI->getLoopFor(&Block);
+ if (ContainingLoop && loopBottom(ContainingLoop) == &Block) {
+ LoopWaitcntData *WaitcntData = LoopWaitcntDataMap[ContainingLoop].get();
+ WaitcntData->print();
+ DEBUG(dbgs() << '\n';);
+
+    // The iterative waitcnt insertion algorithm aims for optimal waitcnt
+    // placement and doesn't always guarantee convergence for a loop. Each
+    // loop should take at most 2 iterations to converge naturally. When
+    // this maximum is reached and the result has not converged, we force
+    // convergence by inserting an s_waitcnt at the end of the loop footer.
+ if (WaitcntData->getIterCnt() > 2) {
+      // To ensure convergence, the wait events at the loop footer must be no
+      // more than those from the previous iteration.
+      // As a simplification, instead of tracking individual scores and
+      // generating the precise wait count, just wait on 0.
+ bool HasPending = false;
+ MachineInstr *SWaitInst = WaitcntData->getWaitcnt();
+ for (enum InstCounterType T = VM_CNT; T < NUM_INST_CNTS;
+ T = (enum InstCounterType)(T + 1)) {
+ if (ScoreBrackets->getScoreUB(T) > ScoreBrackets->getScoreLB(T)) {
+ ScoreBrackets->setScoreLB(T, ScoreBrackets->getScoreUB(T));
+ HasPending = true;
+ }
+ }
+
+ if (HasPending) {
+ if (!SWaitInst) {
+ SWaitInst = Block.getParent()->CreateMachineInstr(
+ TII->get(AMDGPU::S_WAITCNT), DebugLoc());
+ CompilerGeneratedWaitcntSet.insert(SWaitInst);
+ const MachineOperand &Op = MachineOperand::CreateImm(0);
+ SWaitInst->addOperand(MF, Op);
+#if 0 // TODO: Format the debug output
+ OutputTransformBanner("insertWaitcntInBlock",0,"Create:",context);
+ OutputTransformAdd(SWaitInst, context);
+#endif
+ }
+#if 0 // TODO: ??
+ _DEV( REPORTED_STATS->force_waitcnt_converge = 1; )
+#endif
+ }
+
+ if (SWaitInst) {
+ DEBUG({
+ SWaitInst->print(dbgs());
+ dbgs() << "\nAdjusted score board:";
+ ScoreBrackets->dump();
+ });
+
+        // Add this waitcnt to the block. It is either newly created or was
+        // created in a previous iteration and is added back, since block
+        // traversal always removes waitcnts.
+ insertWaitcntBeforeCF(Block, SWaitInst);
+ WaitcntData->setWaitcnt(SWaitInst);
+ }
+ }
+ }
+}
+
+bool SIInsertWaitcnts::runOnMachineFunction(MachineFunction &MF) {
+ ST = &MF.getSubtarget<SISubtarget>();
+ TII = ST->getInstrInfo();
+ TRI = &TII->getRegisterInfo();
+ MRI = &MF.getRegInfo();
+ MLI = &getAnalysis<MachineLoopInfo>();
+ IV = AMDGPU::IsaInfo::getIsaVersion(ST->getFeatureBits());
+ AMDGPUASI = ST->getAMDGPUAS();
+
+ HardwareLimits.VmcntMax = AMDGPU::getVmcntBitMask(IV);
+ HardwareLimits.ExpcntMax = AMDGPU::getExpcntBitMask(IV);
+ HardwareLimits.LgkmcntMax = AMDGPU::getLgkmcntBitMask(IV);
+
+ HardwareLimits.NumVGPRsMax = ST->getAddressableNumVGPRs();
+ HardwareLimits.NumSGPRsMax = ST->getAddressableNumSGPRs();
+ assert(HardwareLimits.NumVGPRsMax <= SQ_MAX_PGM_VGPRS);
+ assert(HardwareLimits.NumSGPRsMax <= SQ_MAX_PGM_SGPRS);
+
+ RegisterEncoding.VGPR0 = TRI->getEncodingValue(AMDGPU::VGPR0);
+ RegisterEncoding.VGPRL =
+ RegisterEncoding.VGPR0 + HardwareLimits.NumVGPRsMax - 1;
+ RegisterEncoding.SGPR0 = TRI->getEncodingValue(AMDGPU::SGPR0);
+ RegisterEncoding.SGPRL =
+ RegisterEncoding.SGPR0 + HardwareLimits.NumSGPRsMax - 1;
+
+  // Walk over the blocks in reverse post order, inserting s_waitcnt where
+  // needed.
+ ReversePostOrderTraversal<MachineFunction *> RPOT(&MF);
+ bool Modified = false;
+ for (ReversePostOrderTraversal<MachineFunction *>::rpo_iterator
+ I = RPOT.begin(),
+ E = RPOT.end(), J = RPOT.begin();
+ I != E;) {
+ MachineBasicBlock &MBB = **I;
+
+ BlockVisitedSet.insert(&MBB);
+
+ BlockWaitcntBrackets *ScoreBrackets = BlockWaitcntBracketsMap[&MBB].get();
+ if (!ScoreBrackets) {
+ BlockWaitcntBracketsMap[&MBB] = make_unique<BlockWaitcntBrackets>();
+ ScoreBrackets = BlockWaitcntBracketsMap[&MBB].get();
+ }
+ ScoreBrackets->setPostOrder(MBB.getNumber());
+ MachineLoop *ContainingLoop = MLI->getLoopFor(&MBB);
+ if (ContainingLoop && LoopWaitcntDataMap[ContainingLoop] == nullptr)
+ LoopWaitcntDataMap[ContainingLoop] = make_unique<LoopWaitcntData>();
+
+ // If we are walking into the block from before the loop, then guarantee
+ // at least 1 re-walk over the loop to propagate the information, even if
+ // no S_WAITCNT instructions were generated.
+ if (ContainingLoop && ContainingLoop->getTopBlock() == &MBB && J < I &&
+ (BlockWaitcntProcessedSet.find(&MBB) ==
+ BlockWaitcntProcessedSet.end())) {
+ BlockWaitcntBracketsMap[&MBB]->setRevisitLoop(true);
+ DEBUG(dbgs() << "set-revisit: block"
+ << ContainingLoop->getTopBlock()->getNumber() << '\n';);
+ }
+
+ // Walk over the instructions.
+ insertWaitcntInBlock(MF, MBB);
+
+ // Flag that waitcnts have been processed at least once.
+ BlockWaitcntProcessedSet.insert(&MBB);
+
+ // See if we want to revisit the loop.
+ if (ContainingLoop && loopBottom(ContainingLoop) == &MBB) {
+ MachineBasicBlock *EntryBB = ContainingLoop->getTopBlock();
+ BlockWaitcntBrackets *EntrySB = BlockWaitcntBracketsMap[EntryBB].get();
+ if (EntrySB && EntrySB->getRevisitLoop()) {
+ EntrySB->setRevisitLoop(false);
+ J = I;
+ int32_t PostOrder = EntrySB->getPostOrder();
+ // TODO: Avoid this loop. Find another way to set I.
+ for (ReversePostOrderTraversal<MachineFunction *>::rpo_iterator
+ X = RPOT.begin(),
+ Y = RPOT.end();
+ X != Y; ++X) {
+ MachineBasicBlock &MBBX = **X;
+ if (MBBX.getNumber() == PostOrder) {
+ I = X;
+ break;
+ }
+ }
+ LoopWaitcntData *WaitcntData = LoopWaitcntDataMap[ContainingLoop].get();
+ WaitcntData->incIterCnt();
+ DEBUG(dbgs() << "revisit: block" << EntryBB->getNumber() << '\n';);
+ continue;
+ } else {
+ LoopWaitcntData *WaitcntData = LoopWaitcntDataMap[ContainingLoop].get();
+        // Loop converged, reset iteration count. If this loop gets revisited,
+        // it must be from an outer loop; the counter will restart, which
+        // ensures we don't force convergence on such revisits.
+ WaitcntData->resetIterCnt();
+ }
+ }
+
+ J = I;
+ ++I;
+ }
+
+ SmallVector<MachineBasicBlock *, 4> EndPgmBlocks;
+
+ bool HaveScalarStores = false;
+
+ for (MachineFunction::iterator BI = MF.begin(), BE = MF.end(); BI != BE;
+ ++BI) {
+
+ MachineBasicBlock &MBB = *BI;
+
+ for (MachineBasicBlock::iterator I = MBB.begin(), E = MBB.end(); I != E;
+ ++I) {
+
+ if (!HaveScalarStores && TII->isScalarStore(*I))
+ HaveScalarStores = true;
+
+ if (I->getOpcode() == AMDGPU::S_ENDPGM ||
+ I->getOpcode() == AMDGPU::SI_RETURN_TO_EPILOG)
+ EndPgmBlocks.push_back(&MBB);
+ }
+ }
+
+ if (HaveScalarStores) {
+ // If scalar writes are used, the cache must be flushed or else the next
+ // wave to reuse the same scratch memory can be clobbered.
+ //
+ // Insert s_dcache_wb at wave termination points if there were any scalar
+ // stores, and only if the cache hasn't already been flushed. This could be
+ // improved by looking across blocks for flushes in postdominating blocks
+ // from the stores but an explicitly requested flush is probably very rare.
+ for (MachineBasicBlock *MBB : EndPgmBlocks) {
+ bool SeenDCacheWB = false;
+
+ for (MachineBasicBlock::iterator I = MBB->begin(), E = MBB->end(); I != E;
+ ++I) {
+
+ if (I->getOpcode() == AMDGPU::S_DCACHE_WB)
+ SeenDCacheWB = true;
+ else if (TII->isScalarStore(*I))
+ SeenDCacheWB = false;
+
+ // FIXME: It would be better to insert this before a waitcnt if any.
+ if ((I->getOpcode() == AMDGPU::S_ENDPGM ||
+ I->getOpcode() == AMDGPU::SI_RETURN_TO_EPILOG) &&
+ !SeenDCacheWB) {
+ Modified = true;
+ BuildMI(*MBB, I, I->getDebugLoc(), TII->get(AMDGPU::S_DCACHE_WB));
+ }
+ }
+ }
+ }
+
+ return Modified;
+}
diff --git a/lib/Target/AMDGPU/SIInsertWaits.cpp b/lib/Target/AMDGPU/SIInsertWaits.cpp
index fceabd7a8fdd..47257ce16ceb 100644
--- a/lib/Target/AMDGPU/SIInsertWaits.cpp
+++ b/lib/Target/AMDGPU/SIInsertWaits.cpp
@@ -21,16 +21,32 @@
#include "SIDefines.h"
#include "SIInstrInfo.h"
#include "SIMachineFunctionInfo.h"
+#include "SIRegisterInfo.h"
#include "Utils/AMDGPUBaseInfo.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/ADT/StringRef.h"
+#include "llvm/CodeGen/MachineBasicBlock.h"
#include "llvm/CodeGen/MachineFunction.h"
#include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/CodeGen/MachineInstr.h"
#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/MachineOperand.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/IR/DebugLoc.h"
+#include "llvm/Pass.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/Target/TargetRegisterInfo.h"
+#include <algorithm>
+#include <cassert>
+#include <cstdint>
+#include <cstring>
+#include <new>
+#include <utility>
#define DEBUG_TYPE "si-insert-waits"
using namespace llvm;
-using namespace llvm::AMDGPU;
namespace {
@@ -42,7 +58,6 @@ typedef union {
unsigned LGKM;
} Named;
unsigned Array[3];
-
} Counters;
typedef enum {
@@ -55,13 +70,12 @@ typedef Counters RegCounters[512];
typedef std::pair<unsigned, unsigned> RegInterval;
class SIInsertWaits : public MachineFunctionPass {
-
private:
- const SISubtarget *ST;
- const SIInstrInfo *TII;
- const SIRegisterInfo *TRI;
+ const SISubtarget *ST = nullptr;
+ const SIInstrInfo *TII = nullptr;
+ const SIRegisterInfo *TRI = nullptr;
const MachineRegisterInfo *MRI;
- IsaVersion IV;
+ AMDGPU::IsaInfo::IsaVersion ISA;
/// \brief Constant zero value
static const Counters ZeroCounts;
@@ -86,7 +100,7 @@ private:
RegCounters DefinedRegs;
/// \brief Different export instruction types seen since last wait.
- unsigned ExpInstrTypesSeen;
+ unsigned ExpInstrTypesSeen = 0;
/// \brief Type of the last opcode.
InstType LastOpcodeType;
@@ -100,7 +114,7 @@ private:
bool ReturnsVoid;
/// Whether the VCCZ bit is possibly corrupt
- bool VCCZCorrupt;
+ bool VCCZCorrupt = false;
/// \brief Get increment/decrement amount for this instruction.
Counters getHwCounts(MachineInstr &MI);
@@ -141,13 +155,7 @@ private:
public:
static char ID;
- SIInsertWaits() :
- MachineFunctionPass(ID),
- ST(nullptr),
- TII(nullptr),
- TRI(nullptr),
- ExpInstrTypesSeen(0),
- VCCZCorrupt(false) { }
+ SIInsertWaits() : MachineFunctionPass(ID) {}
bool runOnMachineFunction(MachineFunction &MF) override;
@@ -161,7 +169,7 @@ public:
}
};
-} // End anonymous namespace
+} // end anonymous namespace
INITIALIZE_PASS_BEGIN(SIInsertWaits, DEBUG_TYPE,
"SI Insert Waits", false, false)
@@ -294,7 +302,6 @@ RegInterval SIInsertWaits::getRegInterval(const TargetRegisterClass *RC,
void SIInsertWaits::pushInstruction(MachineBasicBlock &MBB,
MachineBasicBlock::iterator I,
const Counters &Increment) {
-
// Get the hardware counter increments and sum them up
Counters Limit = ZeroCounts;
unsigned Sum = 0;
@@ -366,7 +373,6 @@ void SIInsertWaits::pushInstruction(MachineBasicBlock &MBB,
bool SIInsertWaits::insertWait(MachineBasicBlock &MBB,
MachineBasicBlock::iterator I,
const Counters &Required) {
-
// End of program? No need to wait on anything
// A function not returning void needs to wait, because other bytecode will
// be appended after it and we don't know what it will be.
@@ -393,7 +399,6 @@ bool SIInsertWaits::insertWait(MachineBasicBlock &MBB,
bool NeedWait = false;
for (unsigned i = 0; i < 3; ++i) {
-
if (Required.Array[i] <= WaitedOn.Array[i])
continue;
@@ -421,10 +426,10 @@ bool SIInsertWaits::insertWait(MachineBasicBlock &MBB,
// Build the wait instruction
BuildMI(MBB, I, DebugLoc(), TII->get(AMDGPU::S_WAITCNT))
- .addImm(encodeWaitcnt(IV,
- Counts.Named.VM,
- Counts.Named.EXP,
- Counts.Named.LGKM));
+ .addImm(AMDGPU::encodeWaitcnt(ISA,
+ Counts.Named.VM,
+ Counts.Named.EXP,
+ Counts.Named.LGKM));
LastOpcodeType = OTHER;
LastInstWritesM0 = false;
@@ -434,7 +439,6 @@ bool SIInsertWaits::insertWait(MachineBasicBlock &MBB,
/// \brief helper function for handleOperands
static void increaseCounters(Counters &Dst, const Counters &Src) {
-
for (unsigned i = 0; i < 3; ++i)
Dst.Array[i] = std::max(Dst.Array[i], Src.Array[i]);
}
@@ -453,9 +457,9 @@ void SIInsertWaits::handleExistingWait(MachineBasicBlock::iterator I) {
unsigned Imm = I->getOperand(0).getImm();
Counters Counts, WaitOn;
- Counts.Named.VM = decodeVmcnt(IV, Imm);
- Counts.Named.EXP = decodeExpcnt(IV, Imm);
- Counts.Named.LGKM = decodeLgkmcnt(IV, Imm);
+ Counts.Named.VM = AMDGPU::decodeVmcnt(ISA, Imm);
+ Counts.Named.EXP = AMDGPU::decodeExpcnt(ISA, Imm);
+ Counts.Named.LGKM = AMDGPU::decodeLgkmcnt(ISA, Imm);
for (unsigned i = 0; i < 3; ++i) {
if (Counts.Array[i] <= LastIssued.Array[i])
@@ -468,7 +472,6 @@ void SIInsertWaits::handleExistingWait(MachineBasicBlock::iterator I) {
}
Counters SIInsertWaits::handleOperands(MachineInstr &MI) {
-
Counters Result = ZeroCounts;
// For each register affected by this instruction increase the result
@@ -484,7 +487,6 @@ Counters SIInsertWaits::handleOperands(MachineInstr &MI) {
const TargetRegisterClass *RC = TII->getOpRegClass(MI, i);
RegInterval Interval = getRegInterval(RC, Op);
for (unsigned j = Interval.first; j < Interval.second; ++j) {
-
if (Op.isDef()) {
increaseCounters(Result, UsedRegs[j]);
increaseCounters(Result, DefinedRegs[j]);
@@ -522,6 +524,16 @@ void SIInsertWaits::handleSendMsg(MachineBasicBlock &MBB,
}
}
+/// Return true if \p MBB has exactly one successor that immediately follows
+/// it in layout, and \p MBB is that successor's only predecessor.
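+/// Such a block's end-of-block wait can be deferred into the successor,
+/// since no other control flow path can reach the uses there.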
+static bool hasTrivialSuccessor(const MachineBasicBlock &MBB) {
+ if (MBB.succ_size() != 1)
+ return false;
+
+ const MachineBasicBlock *Succ = *MBB.succ_begin();
+ return (Succ->pred_size() == 1) && MBB.isLayoutSuccessor(Succ);
+}
+
// FIXME: Insert waits listed in Table 4.2 "Required User-Inserted Wait States"
// around other non-memory instructions.
bool SIInsertWaits::runOnMachineFunction(MachineFunction &MF) {
@@ -531,12 +543,12 @@ bool SIInsertWaits::runOnMachineFunction(MachineFunction &MF) {
TII = ST->getInstrInfo();
TRI = &TII->getRegisterInfo();
MRI = &MF.getRegInfo();
- IV = getIsaVersion(ST->getFeatureBits());
+ ISA = AMDGPU::IsaInfo::getIsaVersion(ST->getFeatureBits());
const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
- HardwareLimits.Named.VM = getVmcntBitMask(IV);
- HardwareLimits.Named.EXP = getExpcntBitMask(IV);
- HardwareLimits.Named.LGKM = getLgkmcntBitMask(IV);
+ HardwareLimits.Named.VM = AMDGPU::getVmcntBitMask(ISA);
+ HardwareLimits.Named.EXP = AMDGPU::getExpcntBitMask(ISA);
+ HardwareLimits.Named.LGKM = AMDGPU::getLgkmcntBitMask(ISA);
WaitedOn = ZeroCounts;
DelayedWaitOn = ZeroCounts;
@@ -636,12 +648,14 @@ bool SIInsertWaits::runOnMachineFunction(MachineFunction &MF) {
handleSendMsg(MBB, I);
if (I->getOpcode() == AMDGPU::S_ENDPGM ||
- I->getOpcode() == AMDGPU::SI_RETURN)
+ I->getOpcode() == AMDGPU::SI_RETURN_TO_EPILOG)
EndPgmBlocks.push_back(&MBB);
}
- // Wait for everything at the end of the MBB
- Changes |= insertWait(MBB, MBB.getFirstTerminator(), LastIssued);
+    // Wait for everything at the end of the MBB. If there is only one
+    // successor, we can defer the wait to the uses in that successor.
+ if (!hasTrivialSuccessor(MBB))
+ Changes |= insertWait(MBB, MBB.getFirstTerminator(), LastIssued);
}
if (HaveScalarStores) {
@@ -665,7 +679,7 @@ bool SIInsertWaits::runOnMachineFunction(MachineFunction &MF) {
// FIXME: It would be better to insert this before a waitcnt if any.
if ((I->getOpcode() == AMDGPU::S_ENDPGM ||
- I->getOpcode() == AMDGPU::SI_RETURN) && !SeenDCacheWB) {
+ I->getOpcode() == AMDGPU::SI_RETURN_TO_EPILOG) && !SeenDCacheWB) {
Changes = true;
BuildMI(*MBB, I, I->getDebugLoc(), TII->get(AMDGPU::S_DCACHE_WB));
}
@@ -676,5 +690,19 @@ bool SIInsertWaits::runOnMachineFunction(MachineFunction &MF) {
for (MachineInstr *I : RemoveMI)
I->eraseFromParent();
+ if (!MFI->isEntryFunction()) {
+    // Wait for any outstanding memory operations that the input registers may
+    // depend on. We can't track them, and it's better to do the wait after the
+    // costly call sequence.
+
+ // TODO: Could insert earlier and schedule more liberally with operations
+ // that only use caller preserved registers.
+ MachineBasicBlock &EntryBB = MF.front();
+ BuildMI(EntryBB, EntryBB.getFirstNonPHI(), DebugLoc(), TII->get(AMDGPU::S_WAITCNT))
+ .addImm(0);
+
+ Changes = true;
+ }
+
return Changes;
}
diff --git a/lib/Target/AMDGPU/SIInstrFormats.td b/lib/Target/AMDGPU/SIInstrFormats.td
index 5523ec142ba7..b83a1fe187eb 100644
--- a/lib/Target/AMDGPU/SIInstrFormats.td
+++ b/lib/Target/AMDGPU/SIInstrFormats.td
@@ -31,6 +31,7 @@ class InstSI <dag outs, dag ins, string asm = "",
field bit VOP2 = 0;
field bit VOPC = 0;
field bit VOP3 = 0;
+ field bit VOP3P = 0;
field bit VINTRP = 0;
field bit SDWA = 0;
field bit DPP = 0;
@@ -78,6 +79,10 @@ class InstSI <dag outs, dag ins, string asm = "",
// is unable to infer the encoding from the operands.
field bit VOPAsmPrefer32Bit = 0;
+  // This bit indicates that the instruction has a floating point result
+  // type, so the clamp modifier has floating point semantics.
+ field bit FPClamp = 0;
+
// These need to be kept in sync with the enum in SIInstrFlags.
let TSFlags{0} = SALU;
let TSFlags{1} = VALU;
@@ -92,6 +97,7 @@ class InstSI <dag outs, dag ins, string asm = "",
let TSFlags{8} = VOP2;
let TSFlags{9} = VOPC;
let TSFlags{10} = VOP3;
+ let TSFlags{12} = VOP3P;
let TSFlags{13} = VINTRP;
let TSFlags{14} = SDWA;
@@ -120,6 +126,7 @@ class InstSI <dag outs, dag ins, string asm = "",
let TSFlags{39} = ScalarStore;
let TSFlags{40} = FixedSize;
let TSFlags{41} = VOPAsmPrefer32Bit;
+ let TSFlags{42} = FPClamp;
let SchedRW = [Write32Bit];
@@ -131,19 +138,19 @@ class InstSI <dag outs, dag ins, string asm = "",
let AsmVariantName = AMDGPUAsmVariants.Default;
}
-class PseudoInstSI<dag outs, dag ins, list<dag> pattern = []>
- : InstSI<outs, ins, "", pattern> {
+class PseudoInstSI<dag outs, dag ins, list<dag> pattern = [], string asm = "">
+ : InstSI<outs, ins, asm, pattern> {
let isPseudo = 1;
let isCodeGenOnly = 1;
}
-class SPseudoInstSI<dag outs, dag ins, list<dag> pattern = []>
- : PseudoInstSI<outs, ins, pattern> {
+class SPseudoInstSI<dag outs, dag ins, list<dag> pattern = [], string asm = "">
+ : PseudoInstSI<outs, ins, pattern, asm> {
let SALU = 1;
}
-class VPseudoInstSI<dag outs, dag ins, list<dag> pattern = []>
- : PseudoInstSI<outs, ins, pattern> {
+class VPseudoInstSI<dag outs, dag ins, list<dag> pattern = [], string asm = "">
+ : PseudoInstSI<outs, ins, pattern, asm> {
let VALU = 1;
let Uses = [EXEC];
}
diff --git a/lib/Target/AMDGPU/SIInstrInfo.cpp b/lib/Target/AMDGPU/SIInstrInfo.cpp
index 26a8d22062a9..05ac67d26620 100644
--- a/lib/Target/AMDGPU/SIInstrInfo.cpp
+++ b/lib/Target/AMDGPU/SIInstrInfo.cpp
@@ -21,6 +21,7 @@
#include "llvm/CodeGen/MachineInstrBuilder.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
#include "llvm/CodeGen/ScheduleDAG.h"
+#include "llvm/IR/DiagnosticInfo.h"
#include "llvm/IR/Function.h"
#include "llvm/CodeGen/RegisterScavenging.h"
#include "llvm/MC/MCInstrDesc.h"
@@ -36,7 +37,7 @@ BranchOffsetBits("amdgpu-s-branch-bits", cl::ReallyHidden, cl::init(16),
cl::desc("Restrict range of branch instructions (DEBUG)"));
SIInstrInfo::SIInstrInfo(const SISubtarget &ST)
- : AMDGPUInstrInfo(ST), RI(), ST(ST) {}
+ : AMDGPUInstrInfo(ST), RI(ST), ST(ST) {}
//===----------------------------------------------------------------------===//
// TargetInstrInfo callbacks
@@ -315,7 +316,8 @@ bool SIInstrInfo::shouldClusterMemOps(MachineInstr &FirstLdSt,
const MachineOperand *SecondDst = nullptr;
if ((isMUBUF(FirstLdSt) && isMUBUF(SecondLdSt)) ||
- (isMTBUF(FirstLdSt) && isMTBUF(SecondLdSt))) {
+ (isMTBUF(FirstLdSt) && isMTBUF(SecondLdSt)) ||
+ (isFLAT(FirstLdSt) && isFLAT(SecondLdSt))) {
FirstDst = getNamedOperand(FirstLdSt, AMDGPU::OpName::vdata);
SecondDst = getNamedOperand(SecondLdSt, AMDGPU::OpName::vdata);
} else if (isSMRD(FirstLdSt) && isSMRD(SecondLdSt)) {
@@ -346,6 +348,21 @@ bool SIInstrInfo::shouldClusterMemOps(MachineInstr &FirstLdSt,
return (NumLoads * DstRC->getSize()) <= LoadClusterThreshold;
}
+static void reportIllegalCopy(const SIInstrInfo *TII, MachineBasicBlock &MBB,
+ MachineBasicBlock::iterator MI,
+ const DebugLoc &DL, unsigned DestReg,
+ unsigned SrcReg, bool KillSrc) {
+ MachineFunction *MF = MBB.getParent();
+ DiagnosticInfoUnsupported IllegalCopy(*MF->getFunction(),
+ "illegal SGPR to VGPR copy",
+ DL, DS_Error);
+ LLVMContext &C = MF->getFunction()->getContext();
+ C.diagnose(IllegalCopy);
+
+ BuildMI(MBB, MI, DL, TII->get(AMDGPU::SI_ILLEGAL_COPY), DestReg)
+ .addReg(SrcReg, getKillRegState(KillSrc));
+}
+
void SIInstrInfo::copyPhysReg(MachineBasicBlock &MBB,
MachineBasicBlock::iterator MI,
const DebugLoc &DL, unsigned DestReg,
@@ -369,7 +386,11 @@ void SIInstrInfo::copyPhysReg(MachineBasicBlock &MBB,
return;
}
- assert(AMDGPU::SReg_32RegClass.contains(SrcReg));
+ if (!AMDGPU::SReg_32RegClass.contains(SrcReg)) {
+ reportIllegalCopy(this, MBB, MI, DL, DestReg, SrcReg, KillSrc);
+ return;
+ }
+
BuildMI(MBB, MI, DL, get(AMDGPU::S_MOV_B32), DestReg)
.addReg(SrcReg, getKillRegState(KillSrc));
return;
@@ -391,7 +412,11 @@ void SIInstrInfo::copyPhysReg(MachineBasicBlock &MBB,
return;
}
- assert(AMDGPU::SReg_64RegClass.contains(SrcReg));
+ if (!AMDGPU::SReg_64RegClass.contains(SrcReg)) {
+ reportIllegalCopy(this, MBB, MI, DL, DestReg, SrcReg, KillSrc);
+ return;
+ }
+
BuildMI(MBB, MI, DL, get(AMDGPU::S_MOV_B64), DestReg)
.addReg(SrcReg, getKillRegState(KillSrc));
return;
@@ -415,8 +440,14 @@ void SIInstrInfo::copyPhysReg(MachineBasicBlock &MBB,
Opcode = AMDGPU::S_MOV_B32;
EltSize = 4;
}
+
+ if (!RI.isSGPRClass(RI.getPhysRegClass(SrcReg))) {
+ reportIllegalCopy(this, MBB, MI, DL, DestReg, SrcReg, KillSrc);
+ return;
+ }
}
+
ArrayRef<int16_t> SubIndices = RI.getRegSplitParts(RC, EltSize);
bool Forward = RI.getHWRegIndex(DestReg) <= RI.getHWRegIndex(SrcReg);
@@ -870,9 +901,10 @@ bool SIInstrInfo::expandPostRAPseudo(MachineInstr &MI) const {
MachineInstr *MovRel =
BuildMI(MBB, MI, DL, MovRelDesc)
.addReg(RI.getSubReg(VecReg, SubReg), RegState::Undef)
- .addOperand(MI.getOperand(2))
+ .add(MI.getOperand(2))
.addReg(VecReg, RegState::ImplicitDefine)
- .addReg(VecReg, RegState::Implicit | (IsUndef ? RegState::Undef : 0));
+ .addReg(VecReg,
+ RegState::Implicit | (IsUndef ? RegState::Undef : 0));
const int ImpDefIdx =
MovRelDesc.getNumOperands() + MovRelDesc.getNumImplicitUses();
@@ -897,14 +929,14 @@ bool SIInstrInfo::expandPostRAPseudo(MachineInstr &MI) const {
// constant data.
Bundler.append(BuildMI(MF, DL, get(AMDGPU::S_ADD_U32), RegLo)
.addReg(RegLo)
- .addOperand(MI.getOperand(1)));
+ .add(MI.getOperand(1)));
MachineInstrBuilder MIB = BuildMI(MF, DL, get(AMDGPU::S_ADDC_U32), RegHi)
.addReg(RegHi);
if (MI.getOperand(2).getTargetFlags() == SIInstrInfo::MO_NONE)
MIB.addImm(0);
else
- MIB.addOperand(MI.getOperand(2));
+ MIB.add(MI.getOperand(2));
Bundler.append(MIB);
llvm::finalizeBundle(MBB, Bundler.begin());
@@ -1290,6 +1322,13 @@ unsigned SIInstrInfo::removeBranch(MachineBasicBlock &MBB,
return Count;
}
+// Copy the flags onto the implicit condition register operand.
+static void preserveCondRegFlags(MachineOperand &CondReg,
+ const MachineOperand &OrigCond) {
+ CondReg.setIsUndef(OrigCond.isUndef());
+ CondReg.setIsKill(OrigCond.isKill());
+}
+
unsigned SIInstrInfo::insertBranch(MachineBasicBlock &MBB,
MachineBasicBlock *TBB,
MachineBasicBlock *FBB,
@@ -1317,9 +1356,7 @@ unsigned SIInstrInfo::insertBranch(MachineBasicBlock &MBB,
.addMBB(TBB);
// Copy the flags onto the implicit condition register operand.
- MachineOperand &CondReg = CondBr->getOperand(1);
- CondReg.setIsUndef(Cond[1].isUndef());
- CondReg.setIsKill(Cond[1].isKill());
+ preserveCondRegFlags(CondBr->getOperand(1), Cond[1]);
if (BytesAdded)
*BytesAdded = 4;
@@ -1351,6 +1388,157 @@ bool SIInstrInfo::reverseBranchCondition(
return false;
}
+bool SIInstrInfo::canInsertSelect(const MachineBasicBlock &MBB,
+ ArrayRef<MachineOperand> Cond,
+ unsigned TrueReg, unsigned FalseReg,
+ int &CondCycles,
+ int &TrueCycles, int &FalseCycles) const {
+ switch (Cond[0].getImm()) {
+ case VCCNZ:
+ case VCCZ: {
+ const MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
+ const TargetRegisterClass *RC = MRI.getRegClass(TrueReg);
+ assert(MRI.getRegClass(FalseReg) == RC);
+
+ int NumInsts = AMDGPU::getRegBitWidth(RC->getID()) / 32;
+ CondCycles = TrueCycles = FalseCycles = NumInsts; // ???
+
+ // Limit to equal cost for branch vs. N v_cndmask_b32s.
+ return !RI.isSGPRClass(RC) && NumInsts <= 6;
+ }
+ case SCC_TRUE:
+ case SCC_FALSE: {
+ // FIXME: We could insert for VGPRs if we could replace the original compare
+ // with a vector one.
+ const MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
+ const TargetRegisterClass *RC = MRI.getRegClass(TrueReg);
+ assert(MRI.getRegClass(FalseReg) == RC);
+
+ int NumInsts = AMDGPU::getRegBitWidth(RC->getID()) / 32;
+
+    // Multiples of 8 bytes can use s_cselect_b64.
+ if (NumInsts % 2 == 0)
+ NumInsts /= 2;
+
+ CondCycles = TrueCycles = FalseCycles = NumInsts; // ???
+ return RI.isSGPRClass(RC);
+ }
+ default:
+ return false;
+ }
+}
+
+void SIInstrInfo::insertSelect(MachineBasicBlock &MBB,
+ MachineBasicBlock::iterator I, const DebugLoc &DL,
+ unsigned DstReg, ArrayRef<MachineOperand> Cond,
+ unsigned TrueReg, unsigned FalseReg) const {
+ BranchPredicate Pred = static_cast<BranchPredicate>(Cond[0].getImm());
+ if (Pred == VCCZ || Pred == SCC_FALSE) {
+ Pred = static_cast<BranchPredicate>(-Pred);
+ std::swap(TrueReg, FalseReg);
+ }
+
+ MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
+ const TargetRegisterClass *DstRC = MRI.getRegClass(DstReg);
+ unsigned DstSize = DstRC->getSize();
+
+ if (DstSize == 4) {
+ unsigned SelOp = Pred == SCC_TRUE ?
+ AMDGPU::S_CSELECT_B32 : AMDGPU::V_CNDMASK_B32_e32;
+
+ // Instruction's operands are backwards from what is expected.
+ MachineInstr *Select =
+ BuildMI(MBB, I, DL, get(SelOp), DstReg)
+ .addReg(FalseReg)
+ .addReg(TrueReg);
+
+ preserveCondRegFlags(Select->getOperand(3), Cond[1]);
+ return;
+ }
+
+ if (DstSize == 8 && Pred == SCC_TRUE) {
+ MachineInstr *Select =
+ BuildMI(MBB, I, DL, get(AMDGPU::S_CSELECT_B64), DstReg)
+ .addReg(FalseReg)
+ .addReg(TrueReg);
+
+ preserveCondRegFlags(Select->getOperand(3), Cond[1]);
+ return;
+ }
+
+ static const int16_t Sub0_15[] = {
+ AMDGPU::sub0, AMDGPU::sub1, AMDGPU::sub2, AMDGPU::sub3,
+ AMDGPU::sub4, AMDGPU::sub5, AMDGPU::sub6, AMDGPU::sub7,
+ AMDGPU::sub8, AMDGPU::sub9, AMDGPU::sub10, AMDGPU::sub11,
+ AMDGPU::sub12, AMDGPU::sub13, AMDGPU::sub14, AMDGPU::sub15,
+ };
+
+ static const int16_t Sub0_15_64[] = {
+ AMDGPU::sub0_sub1, AMDGPU::sub2_sub3,
+ AMDGPU::sub4_sub5, AMDGPU::sub6_sub7,
+ AMDGPU::sub8_sub9, AMDGPU::sub10_sub11,
+ AMDGPU::sub12_sub13, AMDGPU::sub14_sub15,
+ };
+
+ unsigned SelOp = AMDGPU::V_CNDMASK_B32_e32;
+ const TargetRegisterClass *EltRC = &AMDGPU::VGPR_32RegClass;
+ const int16_t *SubIndices = Sub0_15;
+ int NElts = DstSize / 4;
+
+  // 64-bit select is only available for SALU.
+ if (Pred == SCC_TRUE) {
+ SelOp = AMDGPU::S_CSELECT_B64;
+ EltRC = &AMDGPU::SGPR_64RegClass;
+ SubIndices = Sub0_15_64;
+
+ assert(NElts % 2 == 0);
+ NElts /= 2;
+ }
+
+ MachineInstrBuilder MIB = BuildMI(
+ MBB, I, DL, get(AMDGPU::REG_SEQUENCE), DstReg);
+
+ I = MIB->getIterator();
+
+ SmallVector<unsigned, 8> Regs;
+ for (int Idx = 0; Idx != NElts; ++Idx) {
+ unsigned DstElt = MRI.createVirtualRegister(EltRC);
+ Regs.push_back(DstElt);
+
+ unsigned SubIdx = SubIndices[Idx];
+
+ MachineInstr *Select =
+ BuildMI(MBB, I, DL, get(SelOp), DstElt)
+ .addReg(FalseReg, 0, SubIdx)
+ .addReg(TrueReg, 0, SubIdx);
+ preserveCondRegFlags(Select->getOperand(3), Cond[1]);
+
+ MIB.addReg(DstElt)
+ .addImm(SubIdx);
+ }
+}
+
+bool SIInstrInfo::isFoldableCopy(const MachineInstr &MI) const {
+ switch (MI.getOpcode()) {
+ case AMDGPU::V_MOV_B32_e32:
+ case AMDGPU::V_MOV_B32_e64:
+ case AMDGPU::V_MOV_B64_PSEUDO: {
+    // If there are additional implicit register operands, this may be used for
+    // register indexing, so the source register operand isn't simply copied.
+ unsigned NumOps = MI.getDesc().getNumOperands() +
+ MI.getDesc().getNumImplicitUses();
+
+ return MI.getNumOperands() == NumOps;
+ }
+ case AMDGPU::S_MOV_B32:
+ case AMDGPU::S_MOV_B64:
+ case AMDGPU::COPY:
+ return true;
+ default:
+ return false;
+ }
+}
+
static void removeModOperands(MachineInstr &MI) {
unsigned Opc = MI.getOpcode();
int Src0ModIdx = AMDGPU::getNamedOperandIdx(Opc,
@@ -1400,15 +1588,10 @@ bool SIInstrInfo::FoldImmediate(MachineInstr &UseMI, MachineInstr &DefMI,
if (Opc == AMDGPU::V_MAD_F32 || Opc == AMDGPU::V_MAC_F32_e64 ||
Opc == AMDGPU::V_MAD_F16 || Opc == AMDGPU::V_MAC_F16_e64) {
- bool IsF32 = Opc == AMDGPU::V_MAD_F32 || Opc == AMDGPU::V_MAC_F32_e64;
-
- // Don't fold if we are using source modifiers. The new VOP2 instructions
- // don't have them.
- if (hasModifiersSet(UseMI, AMDGPU::OpName::src0_modifiers) ||
- hasModifiersSet(UseMI, AMDGPU::OpName::src1_modifiers) ||
- hasModifiersSet(UseMI, AMDGPU::OpName::src2_modifiers)) {
+ // Don't fold if we are using source or output modifiers. The new VOP2
+ // instructions don't have them.
+ if (hasAnyModifiersSet(UseMI))
return false;
- }
const MachineOperand &ImmOp = DefMI.getOperand(1);
@@ -1421,6 +1604,7 @@ bool SIInstrInfo::FoldImmediate(MachineInstr &UseMI, MachineInstr &DefMI,
if (isInlineConstant(UseMI, *Src0, ImmOp))
return false;
+ bool IsF32 = Opc == AMDGPU::V_MAD_F32 || Opc == AMDGPU::V_MAC_F32_e64;
MachineOperand *Src1 = getNamedOperand(UseMI, AMDGPU::OpName::src1);
MachineOperand *Src2 = getNamedOperand(UseMI, AMDGPU::OpName::src2);
@@ -1633,20 +1817,26 @@ MachineInstr *SIInstrInfo::convertToThreeAddress(MachineFunction::iterator &MBB,
const MachineOperand *Dst = getNamedOperand(MI, AMDGPU::OpName::vdst);
const MachineOperand *Src0 = getNamedOperand(MI, AMDGPU::OpName::src0);
+ const MachineOperand *Src0Mods =
+ getNamedOperand(MI, AMDGPU::OpName::src0_modifiers);
const MachineOperand *Src1 = getNamedOperand(MI, AMDGPU::OpName::src1);
+ const MachineOperand *Src1Mods =
+ getNamedOperand(MI, AMDGPU::OpName::src1_modifiers);
const MachineOperand *Src2 = getNamedOperand(MI, AMDGPU::OpName::src2);
+ const MachineOperand *Clamp = getNamedOperand(MI, AMDGPU::OpName::clamp);
+ const MachineOperand *Omod = getNamedOperand(MI, AMDGPU::OpName::omod);
return BuildMI(*MBB, MI, MI.getDebugLoc(),
get(IsF16 ? AMDGPU::V_MAD_F16 : AMDGPU::V_MAD_F32))
- .addOperand(*Dst)
- .addImm(0) // Src0 mods
- .addOperand(*Src0)
- .addImm(0) // Src1 mods
- .addOperand(*Src1)
+ .add(*Dst)
+ .addImm(Src0Mods ? Src0Mods->getImm() : 0)
+ .add(*Src0)
+ .addImm(Src1Mods ? Src1Mods->getImm() : 0)
+ .add(*Src1)
.addImm(0) // Src mods
- .addOperand(*Src2)
- .addImm(0) // clamp
- .addImm(0); // omod
+ .add(*Src2)
+ .addImm(Clamp ? Clamp->getImm() : 0)
+ .addImm(Omod ? Omod->getImm() : 0);
}
// It's not generally safe to move VALU instructions across these since it will
@@ -1687,7 +1877,8 @@ bool SIInstrInfo::isInlineConstant(const APInt &Imm) const {
return AMDGPU::isInlinableLiteral64(Imm.getSExtValue(),
ST.hasInv2PiInlineImm());
case 16:
- return AMDGPU::isInlinableLiteral16(Imm.getSExtValue(),
+ return ST.has16BitInsts() &&
+ AMDGPU::isInlinableLiteral16(Imm.getSExtValue(),
ST.hasInv2PiInlineImm());
default:
llvm_unreachable("invalid bitwidth");
@@ -1705,24 +1896,43 @@ bool SIInstrInfo::isInlineConstant(const MachineOperand &MO,
// would be for any 32-bit integer operand, but would not be for a 64-bit one.
int64_t Imm = MO.getImm();
- switch (operandBitWidth(OperandType)) {
- case 32: {
+ switch (OperandType) {
+ case AMDGPU::OPERAND_REG_IMM_INT32:
+ case AMDGPU::OPERAND_REG_IMM_FP32:
+ case AMDGPU::OPERAND_REG_INLINE_C_INT32:
+ case AMDGPU::OPERAND_REG_INLINE_C_FP32: {
int32_t Trunc = static_cast<int32_t>(Imm);
return Trunc == Imm &&
AMDGPU::isInlinableLiteral32(Trunc, ST.hasInv2PiInlineImm());
}
- case 64: {
+ case AMDGPU::OPERAND_REG_IMM_INT64:
+ case AMDGPU::OPERAND_REG_IMM_FP64:
+ case AMDGPU::OPERAND_REG_INLINE_C_INT64:
+ case AMDGPU::OPERAND_REG_INLINE_C_FP64: {
return AMDGPU::isInlinableLiteral64(MO.getImm(),
ST.hasInv2PiInlineImm());
}
- case 16: {
+ case AMDGPU::OPERAND_REG_IMM_INT16:
+ case AMDGPU::OPERAND_REG_IMM_FP16:
+ case AMDGPU::OPERAND_REG_INLINE_C_INT16:
+ case AMDGPU::OPERAND_REG_INLINE_C_FP16: {
if (isInt<16>(Imm) || isUInt<16>(Imm)) {
+ // A few special case instructions have 16-bit operands on subtargets
+ // where 16-bit instructions are not legal.
+ // TODO: Do the 32-bit immediates work? We shouldn't really need to handle
+ // constants in these cases
int16_t Trunc = static_cast<int16_t>(Imm);
- return AMDGPU::isInlinableLiteral16(Trunc, ST.hasInv2PiInlineImm());
+ return ST.has16BitInsts() &&
+ AMDGPU::isInlinableLiteral16(Trunc, ST.hasInv2PiInlineImm());
}
return false;
}
+ case AMDGPU::OPERAND_REG_INLINE_C_V2INT16:
+ case AMDGPU::OPERAND_REG_INLINE_C_V2FP16: {
+ uint32_t Trunc = static_cast<uint32_t>(Imm);
+ return AMDGPU::isInlinableLiteralV216(Trunc, ST.hasInv2PiInlineImm());
+ }
default:
llvm_unreachable("invalid bitwidth");
}
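
For reference, the 32-bit case above truncates and round-trips the immediate before asking AMDGPU::isInlinableLiteral32. A minimal sketch of what that predicate accepts, assuming the usual GCN inline-constant set (small integers plus a fixed set of FP bit patterns; the names below are illustrative, not in-tree code):

```cpp
#include <cstdint>

// Sketch: inline constants cover the integers -16..64 plus a handful of
// FP bit patterns; 1/(2*pi) is only inlinable with hasInv2PiInlineImm().
constexpr bool isInlinableLiteral32Sketch(int32_t v, bool hasInv2Pi) {
  if (v >= -16 && v <= 64)
    return true;
  switch (static_cast<uint32_t>(v)) {
  case 0x3f000000u: case 0xbf000000u: // +/-0.5
  case 0x3f800000u: case 0xbf800000u: // +/-1.0
  case 0x40000000u: case 0xc0000000u: // +/-2.0
  case 0x40800000u: case 0xc0800000u: // +/-4.0
    return true;
  case 0x3e22f983u:                   // 1/(2*pi)
    return hasInv2Pi;
  default:
    return false;
  }
}
static_assert(isInlinableLiteral32Sketch(64, false), "64 is inline");
static_assert(!isInlinableLiteral32Sketch(65, false), "65 needs a literal");
```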
@@ -1801,6 +2011,14 @@ bool SIInstrInfo::hasModifiersSet(const MachineInstr &MI,
return Mods && Mods->getImm();
}
+bool SIInstrInfo::hasAnyModifiersSet(const MachineInstr &MI) const {
+ return hasModifiersSet(MI, AMDGPU::OpName::src0_modifiers) ||
+ hasModifiersSet(MI, AMDGPU::OpName::src1_modifiers) ||
+ hasModifiersSet(MI, AMDGPU::OpName::src2_modifiers) ||
+ hasModifiersSet(MI, AMDGPU::OpName::clamp) ||
+ hasModifiersSet(MI, AMDGPU::OpName::omod);
+}
+
bool SIInstrInfo::usesConstantBus(const MachineRegisterInfo &MRI,
const MachineOperand &MO,
const MCOperandInfo &OpInfo) const {
@@ -2238,7 +2456,7 @@ void SIInstrInfo::legalizeOpWithMove(MachineInstr &MI, unsigned OpIdx) const {
unsigned Reg = MRI.createVirtualRegister(VRC);
DebugLoc DL = MBB->findDebugLoc(I);
- BuildMI(*MI.getParent(), I, DL, get(Opcode), Reg).addOperand(MO);
+ BuildMI(*MI.getParent(), I, DL, get(Opcode), Reg).add(MO);
MO.ChangeToRegister(Reg, false);
}
@@ -2564,8 +2782,8 @@ void SIInstrInfo::legalizeGenericOperand(MachineBasicBlock &InsertMBB,
return;
unsigned DstReg = MRI.createVirtualRegister(DstRC);
- MachineInstr *Copy = BuildMI(InsertMBB, I, DL, get(AMDGPU::COPY), DstReg)
- .addOperand(Op);
+ MachineInstr *Copy =
+ BuildMI(InsertMBB, I, DL, get(AMDGPU::COPY), DstReg).add(Op);
Op.setReg(DstReg);
Op.setSubReg(0);
@@ -2810,13 +3028,13 @@ void SIInstrInfo::legalizeOperands(MachineInstr &MI) const {
// Regular buffer load / store.
MachineInstrBuilder MIB =
BuildMI(MBB, MI, MI.getDebugLoc(), get(Addr64Opcode))
- .addOperand(*VData)
+ .add(*VData)
.addReg(AMDGPU::NoRegister) // Dummy value for vaddr.
// This will be replaced later
// with the new value of vaddr.
- .addOperand(*SRsrc)
- .addOperand(*SOffset)
- .addOperand(*Offset);
+ .add(*SRsrc)
+ .add(*SOffset)
+ .add(*Offset);
// Atomics do not have this operand.
if (const MachineOperand *GLC =
@@ -2836,14 +3054,14 @@ void SIInstrInfo::legalizeOperands(MachineInstr &MI) const {
} else {
// Atomics with return.
Addr64 = BuildMI(MBB, MI, MI.getDebugLoc(), get(Addr64Opcode))
- .addOperand(*VData)
- .addOperand(*VDataIn)
+ .add(*VData)
+ .add(*VDataIn)
.addReg(AMDGPU::NoRegister) // Dummy value for vaddr.
// This will be replaced later
// with the new value of vaddr.
- .addOperand(*SRsrc)
- .addOperand(*SOffset)
- .addOperand(*Offset)
+ .add(*SRsrc)
+ .add(*SOffset)
+ .add(*Offset)
.addImm(getNamedImmOperand(MI, AMDGPU::OpName::slc))
.setMemRefs(MI.memoperands_begin(), MI.memoperands_end());
}
@@ -2970,6 +3188,14 @@ void SIInstrInfo::moveToVALU(MachineInstr &TopInst) const {
case AMDGPU::S_BFE_U64:
case AMDGPU::S_BFM_B64:
llvm_unreachable("Moving this op to VALU not implemented");
+
+ case AMDGPU::S_PACK_LL_B32_B16:
+ case AMDGPU::S_PACK_LH_B32_B16:
+ case AMDGPU::S_PACK_HH_B32_B16: {
+ movePackToVALU(Worklist, MRI, Inst);
+ Inst.eraseFromParent();
+ continue;
+ }
}
if (NewOpcode == AMDGPU::INSTRUCTION_LIST_END) {
@@ -3027,12 +3253,15 @@ void SIInstrInfo::moveToVALU(MachineInstr &TopInst) const {
bool HasDst = Inst.getOperand(0).isReg() && Inst.getOperand(0).isDef();
unsigned NewDstReg = AMDGPU::NoRegister;
if (HasDst) {
+ unsigned DstReg = Inst.getOperand(0).getReg();
+ if (TargetRegisterInfo::isPhysicalRegister(DstReg))
+ continue;
+
// Update the destination register class.
const TargetRegisterClass *NewDstRC = getDestEquivalentVGPRClass(Inst);
if (!NewDstRC)
continue;
- unsigned DstReg = Inst.getOperand(0).getReg();
if (Inst.isCopy() &&
TargetRegisterInfo::isVirtualRegister(Inst.getOperand(1).getReg()) &&
NewDstRC == RI.getRegClassForReg(MRI, Inst.getOperand(1).getReg())) {
@@ -3112,15 +3341,13 @@ void SIInstrInfo::splitScalar64BitUnaryOp(
const TargetRegisterClass *NewDestSubRC = RI.getSubRegClass(NewDestRC, AMDGPU::sub0);
unsigned DestSub0 = MRI.createVirtualRegister(NewDestSubRC);
- BuildMI(MBB, MII, DL, InstDesc, DestSub0)
- .addOperand(SrcReg0Sub0);
+ BuildMI(MBB, MII, DL, InstDesc, DestSub0).add(SrcReg0Sub0);
MachineOperand SrcReg0Sub1 = buildExtractSubRegOrImm(MII, MRI, Src0, Src0RC,
AMDGPU::sub1, Src0SubRC);
unsigned DestSub1 = MRI.createVirtualRegister(NewDestSubRC);
- BuildMI(MBB, MII, DL, InstDesc, DestSub1)
- .addOperand(SrcReg0Sub1);
+ BuildMI(MBB, MII, DL, InstDesc, DestSub1).add(SrcReg0Sub1);
unsigned FullDestReg = MRI.createVirtualRegister(NewDestRC);
BuildMI(MBB, MII, DL, get(TargetOpcode::REG_SEQUENCE), FullDestReg)
@@ -3174,8 +3401,8 @@ void SIInstrInfo::splitScalar64BitBinaryOp(
unsigned DestSub0 = MRI.createVirtualRegister(NewDestSubRC);
MachineInstr &LoHalf = *BuildMI(MBB, MII, DL, InstDesc, DestSub0)
- .addOperand(SrcReg0Sub0)
- .addOperand(SrcReg1Sub0);
+ .add(SrcReg0Sub0)
+ .add(SrcReg1Sub0);
MachineOperand SrcReg0Sub1 = buildExtractSubRegOrImm(MII, MRI, Src0, Src0RC,
AMDGPU::sub1, Src0SubRC);
@@ -3184,8 +3411,8 @@ void SIInstrInfo::splitScalar64BitBinaryOp(
unsigned DestSub1 = MRI.createVirtualRegister(NewDestSubRC);
MachineInstr &HiHalf = *BuildMI(MBB, MII, DL, InstDesc, DestSub1)
- .addOperand(SrcReg0Sub1)
- .addOperand(SrcReg1Sub1);
+ .add(SrcReg0Sub1)
+ .add(SrcReg1Sub1);
unsigned FullDestReg = MRI.createVirtualRegister(NewDestRC);
BuildMI(MBB, MII, DL, get(TargetOpcode::REG_SEQUENCE), FullDestReg)
@@ -3231,13 +3458,9 @@ void SIInstrInfo::splitScalar64BitBCNT(
MachineOperand SrcRegSub1 = buildExtractSubRegOrImm(MII, MRI, Src, SrcRC,
AMDGPU::sub1, SrcSubRC);
- BuildMI(MBB, MII, DL, InstDesc, MidReg)
- .addOperand(SrcRegSub0)
- .addImm(0);
+ BuildMI(MBB, MII, DL, InstDesc, MidReg).add(SrcRegSub0).addImm(0);
- BuildMI(MBB, MII, DL, InstDesc, ResultReg)
- .addOperand(SrcRegSub1)
- .addReg(MidReg);
+ BuildMI(MBB, MII, DL, InstDesc, ResultReg).add(SrcRegSub1).addReg(MidReg);
MRI.replaceRegWith(Dest.getReg(), ResultReg);
@@ -3326,6 +3549,68 @@ void SIInstrInfo::addUsersToMoveToVALUWorklist(
}
}
+void SIInstrInfo::movePackToVALU(SmallVectorImpl<MachineInstr *> &Worklist,
+ MachineRegisterInfo &MRI,
+ MachineInstr &Inst) const {
+ unsigned ResultReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
+ MachineBasicBlock *MBB = Inst.getParent();
+ MachineOperand &Src0 = Inst.getOperand(1);
+ MachineOperand &Src1 = Inst.getOperand(2);
+ const DebugLoc &DL = Inst.getDebugLoc();
+
+ switch (Inst.getOpcode()) {
+ case AMDGPU::S_PACK_LL_B32_B16: {
+ unsigned ImmReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
+ unsigned TmpReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
+
+ // FIXME: Can do a lot better if we know the high bits of src0 or src1 are
+ // 0.
+ BuildMI(*MBB, Inst, DL, get(AMDGPU::V_MOV_B32_e32), ImmReg)
+ .addImm(0xffff);
+
+ BuildMI(*MBB, Inst, DL, get(AMDGPU::V_AND_B32_e64), TmpReg)
+ .addReg(ImmReg, RegState::Kill)
+ .add(Src0);
+
+ BuildMI(*MBB, Inst, DL, get(AMDGPU::V_LSHL_OR_B32), ResultReg)
+ .add(Src1)
+ .addImm(16)
+ .addReg(TmpReg, RegState::Kill);
+ break;
+ }
+ case AMDGPU::S_PACK_LH_B32_B16: {
+ unsigned ImmReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
+ BuildMI(*MBB, Inst, DL, get(AMDGPU::V_MOV_B32_e32), ImmReg)
+ .addImm(0xffff);
+ BuildMI(*MBB, Inst, DL, get(AMDGPU::V_BFI_B32), ResultReg)
+ .addReg(ImmReg, RegState::Kill)
+ .add(Src0)
+ .add(Src1);
+ break;
+ }
+ case AMDGPU::S_PACK_HH_B32_B16: {
+ unsigned ImmReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
+ unsigned TmpReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
+ BuildMI(*MBB, Inst, DL, get(AMDGPU::V_LSHRREV_B32_e64), TmpReg)
+ .addImm(16)
+ .add(Src0);
+ BuildMI(*MBB, Inst, DL, get(AMDGPU::V_MOV_B32_e32), ImmReg)
+      .addImm(0xffff0000);
+ BuildMI(*MBB, Inst, DL, get(AMDGPU::V_AND_OR_B32), ResultReg)
+ .add(Src1)
+ .addReg(ImmReg, RegState::Kill)
+ .addReg(TmpReg, RegState::Kill);
+ break;
+ }
+ default:
+ llvm_unreachable("unhandled s_pack_* instruction");
+ }
+
+ MachineOperand &Dest = Inst.getOperand(0);
+ MRI.replaceRegWith(Dest.getReg(), ResultReg);
+ addUsersToMoveToVALUWorklist(ResultReg, MRI, Worklist);
+}
+
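The three expansions are easiest to verify against a plain reference model: v_lshl_or_b32 computes (s0 << s1) | s2, v_bfi_b32 computes (s0 & s1) | (~s0 & s2), and v_and_or_b32 computes (s0 & s1) | s2. A sketch with illustrative names:

```cpp
#include <cstdint>

// Reference semantics for the VALU expansions of the s_pack_* ops.
constexpr uint32_t packLL(uint32_t s0, uint32_t s1) {
  return (s1 << 16) | (s0 & 0xffffu);       // v_lshl_or_b32
}
constexpr uint32_t packLH(uint32_t s0, uint32_t s1) {
  return (0xffffu & s0) | (~0xffffu & s1);  // v_bfi_b32, mask 0xffff
}
constexpr uint32_t packHH(uint32_t s0, uint32_t s1) {
  return (s1 & 0xffff0000u) | (s0 >> 16);   // v_and_or_b32, mask 0xffff0000
}
static_assert(packLL(0xaaaa1111u, 0xbbbb2222u) == 0x22221111u, "lo|lo");
static_assert(packLH(0xaaaa1111u, 0xbbbb2222u) == 0xbbbb1111u, "lo|hi");
static_assert(packHH(0xaaaa1111u, 0xbbbb2222u) == 0xbbbbaaaau, "hi|hi");
```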
void SIInstrInfo::addSCCDefUsersToVALUWorklist(
MachineInstr &SCCDefInst, SmallVectorImpl<MachineInstr *> &Worklist) const {
// This assumes that all the users of SCC are in the same block
@@ -3448,10 +3733,13 @@ MachineOperand *SIInstrInfo::getNamedOperand(MachineInstr &MI,
uint64_t SIInstrInfo::getDefaultRsrcDataFormat() const {
uint64_t RsrcDataFormat = AMDGPU::RSRC_DATA_FORMAT;
if (ST.isAmdHsaOS()) {
- RsrcDataFormat |= (1ULL << 56);
+ // Set ATC = 1. GFX9 doesn't have this bit.
+ if (ST.getGeneration() <= SISubtarget::VOLCANIC_ISLANDS)
+ RsrcDataFormat |= (1ULL << 56);
- if (ST.getGeneration() >= SISubtarget::VOLCANIC_ISLANDS)
- // Set MTYPE = 2
+ // Set MTYPE = 2 (MTYPE_UC = uncached). GFX9 doesn't have this.
+    // Note that it also disables the TC L2 cache and therefore decreases performance.
+ if (ST.getGeneration() == SISubtarget::VOLCANIC_ISLANDS)
RsrcDataFormat |= (2ULL << 59);
}
@@ -3463,11 +3751,14 @@ uint64_t SIInstrInfo::getScratchRsrcWords23() const {
AMDGPU::RSRC_TID_ENABLE |
0xffffffff; // Size;
- uint64_t EltSizeValue = Log2_32(ST.getMaxPrivateElementSize()) - 1;
+ // GFX9 doesn't have ELEMENT_SIZE.
+ if (ST.getGeneration() <= SISubtarget::VOLCANIC_ISLANDS) {
+ uint64_t EltSizeValue = Log2_32(ST.getMaxPrivateElementSize()) - 1;
+ Rsrc23 |= EltSizeValue << AMDGPU::RSRC_ELEMENT_SIZE_SHIFT;
+ }
- Rsrc23 |= (EltSizeValue << AMDGPU::RSRC_ELEMENT_SIZE_SHIFT) |
- // IndexStride = 64
- (UINT64_C(3) << AMDGPU::RSRC_INDEX_STRIDE_SHIFT);
+ // IndexStride = 64.
+ Rsrc23 |= UINT64_C(3) << AMDGPU::RSRC_INDEX_STRIDE_SHIFT;
// If TID_ENABLE is set, DATA_FORMAT specifies stride bits [14:17].
// Clear them unless we want a huge stride.
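
The ELEMENT_SIZE field encodes log2(bytes) - 1, so the supported 4/8/16-byte private element sizes map to 1/2/3. A sketch, with the shift left as a parameter since the exact RSRC_ELEMENT_SIZE_SHIFT value is defined elsewhere:

```cpp
#include <cstdint>

// Sketch of the ELEMENT_SIZE computation: log2(bytes) - 1, shifted into
// place. GFX9 drops this field entirely, as the change above notes.
constexpr uint32_t log2u(uint32_t v) {        // Log2_32 equivalent
  return v <= 1 ? 0 : 1 + log2u(v >> 1);
}
constexpr uint64_t eltSizeField(uint32_t maxPrivateElementSize,
                                uint32_t shift /* hypothetical */) {
  return static_cast<uint64_t>(log2u(maxPrivateElementSize) - 1) << shift;
}
static_assert(eltSizeField(4, 0) == 1, "4-byte elements encode as 1");
static_assert(eltSizeField(16, 0) == 3, "16-byte elements encode as 3");
```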
@@ -3496,7 +3787,7 @@ unsigned SIInstrInfo::isStackAccess(const MachineInstr &MI,
return AMDGPU::NoRegister;
assert(!MI.memoperands_empty() &&
- (*MI.memoperands_begin())->getAddrSpace() == AMDGPUAS::PRIVATE_ADDRESS);
+ (*MI.memoperands_begin())->getAddrSpace() == AMDGPUASI.PRIVATE_ADDRESS);
FrameIndex = Addr->getIndex();
return getNamedOperand(MI, AMDGPU::OpName::vdata)->getReg();
@@ -3552,16 +3843,11 @@ unsigned SIInstrInfo::getInstSizeInBytes(const MachineInstr &MI) const {
if (DescSize != 0 && DescSize != 4)
return DescSize;
- if (Opc == AMDGPU::WAVE_BARRIER)
- return 0;
-
// 4-byte instructions may have a 32-bit literal encoded after them. Check
// operands that could ever be literals.
if (isVALU(MI) || isSALU(MI)) {
- if (isFixedSize(MI)) {
- assert(DescSize == 4);
+ if (isFixedSize(MI))
return DescSize;
- }
int Src0Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src0);
if (Src0Idx == -1)
@@ -3584,7 +3870,6 @@ unsigned SIInstrInfo::getInstSizeInBytes(const MachineInstr &MI) const {
return 4;
switch (Opc) {
- case AMDGPU::SI_MASK_BRANCH:
case TargetOpcode::IMPLICIT_DEF:
case TargetOpcode::KILL:
case TargetOpcode::DBG_VALUE:
@@ -3609,7 +3894,7 @@ bool SIInstrInfo::mayAccessFlatAddressSpace(const MachineInstr &MI) const {
return true;
for (const MachineMemOperand *MMO : MI.memoperands()) {
- if (MMO->getAddrSpace() == AMDGPUAS::FLAT_ADDRESS)
+ if (MMO->getAddrSpace() == AMDGPUASI.FLAT_ADDRESS)
return true;
}
return false;
@@ -3640,3 +3925,21 @@ ScheduleHazardRecognizer *
SIInstrInfo::CreateTargetPostRAHazardRecognizer(const MachineFunction &MF) const {
return new GCNHazardRecognizer(MF);
}
+
+bool SIInstrInfo::isBasicBlockPrologue(const MachineInstr &MI) const {
+ return !MI.isTerminator() && MI.getOpcode() != AMDGPU::COPY &&
+ MI.modifiesRegister(AMDGPU::EXEC, &RI);
+}
+
+MachineInstrBuilder
+SIInstrInfo::getAddNoCarry(MachineBasicBlock &MBB,
+ MachineBasicBlock::iterator I,
+ const DebugLoc &DL,
+ unsigned DestReg) const {
+ MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
+
+ unsigned UnusedCarry = MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass);
+
+ return BuildMI(MBB, I, DL, get(AMDGPU::V_ADD_I32_e64), DestReg)
+ .addReg(UnusedCarry, RegState::Define | RegState::Dead);
+}
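V_ADD_I32_e64 defines both a 32-bit sum and a carry-out; the builder above pre-declares the carry as dead, so the caller only appends the two source operands. A rough model of the value semantics, purely for illustration:

```cpp
#include <cstdint>

// Model of the no-carry add: the hardware op still produces a carry-out,
// but the builder marks that def Define | Dead so nothing consumes it.
constexpr uint32_t addNoCarry(uint32_t a, uint32_t b) {
  // Wrapping 32-bit add; the carry-out (bit 32) is discarded.
  return static_cast<uint32_t>((static_cast<uint64_t>(a) + b) & 0xffffffffu);
}
static_assert(addNoCarry(0xffffffffu, 1u) == 0u, "carry is dropped");
```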
diff --git a/lib/Target/AMDGPU/SIInstrInfo.h b/lib/Target/AMDGPU/SIInstrInfo.h
index e68f6f92ba96..659473ca6a47 100644
--- a/lib/Target/AMDGPU/SIInstrInfo.h
+++ b/lib/Target/AMDGPU/SIInstrInfo.h
@@ -69,6 +69,9 @@ private:
MachineInstr &Inst) const;
void splitScalar64BitBFE(SmallVectorImpl<MachineInstr *> &Worklist,
MachineInstr &Inst) const;
+ void movePackToVALU(SmallVectorImpl<MachineInstr *> &Worklist,
+ MachineRegisterInfo &MRI,
+ MachineInstr &Inst) const;
void addUsersToMoveToVALUWorklist(
unsigned Reg, MachineRegisterInfo &MRI,
@@ -203,10 +206,24 @@ public:
bool reverseBranchCondition(
SmallVectorImpl<MachineOperand> &Cond) const override;
+
+ bool canInsertSelect(const MachineBasicBlock &MBB,
+ ArrayRef<MachineOperand> Cond,
+ unsigned TrueReg, unsigned FalseReg,
+ int &CondCycles,
+ int &TrueCycles, int &FalseCycles) const override;
+
+ void insertSelect(MachineBasicBlock &MBB,
+ MachineBasicBlock::iterator I, const DebugLoc &DL,
+ unsigned DstReg, ArrayRef<MachineOperand> Cond,
+ unsigned TrueReg, unsigned FalseReg) const override;
+
bool
areMemAccessesTriviallyDisjoint(MachineInstr &MIa, MachineInstr &MIb,
AliasAnalysis *AA = nullptr) const override;
+ bool isFoldableCopy(const MachineInstr &MI) const;
+
bool FoldImmediate(MachineInstr &UseMI, MachineInstr &DefMI, unsigned Reg,
MachineRegisterInfo *MRI) const final;
@@ -308,6 +325,14 @@ public:
return get(Opcode).TSFlags & SIInstrFlags::VOP3;
}
+ static bool isSDWA(const MachineInstr &MI) {
+ return MI.getDesc().TSFlags & SIInstrFlags::SDWA;
+ }
+
+ bool isSDWA(uint16_t Opcode) const {
+ return get(Opcode).TSFlags & SIInstrFlags::SDWA;
+ }
+
static bool isVOPC(const MachineInstr &MI) {
return MI.getDesc().TSFlags & SIInstrFlags::VOPC;
}
@@ -420,6 +445,22 @@ public:
return get(Opcode).TSFlags & SIInstrFlags::DPP;
}
+ static bool isVOP3P(const MachineInstr &MI) {
+ return MI.getDesc().TSFlags & SIInstrFlags::VOP3P;
+ }
+
+ bool isVOP3P(uint16_t Opcode) const {
+ return get(Opcode).TSFlags & SIInstrFlags::VOP3P;
+ }
+
+ static bool isVINTRP(const MachineInstr &MI) {
+ return MI.getDesc().TSFlags & SIInstrFlags::VINTRP;
+ }
+
+ bool isVINTRP(uint16_t Opcode) const {
+ return get(Opcode).TSFlags & SIInstrFlags::VINTRP;
+ }
+
static bool isScalarUnit(const MachineInstr &MI) {
return MI.getDesc().TSFlags & (SIInstrFlags::SALU | SIInstrFlags::SMRD);
}
@@ -454,6 +495,14 @@ public:
return get(Opcode).TSFlags & SIInstrFlags::FIXED_SIZE;
}
+ static bool hasFPClamp(const MachineInstr &MI) {
+ return MI.getDesc().TSFlags & SIInstrFlags::HasFPClamp;
+ }
+
+ bool hasFPClamp(uint16_t Opcode) const {
+ return get(Opcode).TSFlags & SIInstrFlags::HasFPClamp;
+ }
+
bool isVGPRCopy(const MachineInstr &MI) const {
assert(MI.isCopy());
unsigned Dest = MI.getOperand(0).getReg();
@@ -462,28 +511,6 @@ public:
return !RI.isSGPRReg(MRI, Dest);
}
- static int operandBitWidth(uint8_t OperandType) {
- switch (OperandType) {
- case AMDGPU::OPERAND_REG_IMM_INT32:
- case AMDGPU::OPERAND_REG_IMM_FP32:
- case AMDGPU::OPERAND_REG_INLINE_C_INT32:
- case AMDGPU::OPERAND_REG_INLINE_C_FP32:
- return 32;
- case AMDGPU::OPERAND_REG_IMM_INT64:
- case AMDGPU::OPERAND_REG_IMM_FP64:
- case AMDGPU::OPERAND_REG_INLINE_C_INT64:
- case AMDGPU::OPERAND_REG_INLINE_C_FP64:
- return 64;
- case AMDGPU::OPERAND_REG_INLINE_C_INT16:
- case AMDGPU::OPERAND_REG_INLINE_C_FP16:
- case AMDGPU::OPERAND_REG_IMM_INT16:
- case AMDGPU::OPERAND_REG_IMM_FP16:
- return 16;
- default:
- llvm_unreachable("unexpected operand type");
- }
- }
-
bool isInlineConstant(const APInt &Imm) const;
bool isInlineConstant(const MachineOperand &MO, uint8_t OperandType) const;
@@ -571,6 +598,7 @@ public:
bool hasModifiersSet(const MachineInstr &MI,
unsigned OpName) const;
+ bool hasAnyModifiersSet(const MachineInstr &MI) const;
bool verifyInstruction(const MachineInstr &MI,
StringRef &ErrInfo) const override;
@@ -731,6 +759,17 @@ public:
ScheduleHazardRecognizer *
CreateTargetPostRAHazardRecognizer(const MachineFunction &MF) const override;
+
+ bool isBasicBlockPrologue(const MachineInstr &MI) const override;
+
+ /// \brief Return a partially built integer add instruction without carry.
+ /// Caller must add source operands.
+  /// For pre-GFX9 it will generate an unused carry destination operand.
+ /// TODO: After GFX9 it should return a no-carry operation.
+ MachineInstrBuilder getAddNoCarry(MachineBasicBlock &MBB,
+ MachineBasicBlock::iterator I,
+ const DebugLoc &DL,
+ unsigned DestReg) const;
};
namespace AMDGPU {
@@ -741,6 +780,9 @@ namespace AMDGPU {
int getVOPe32(uint16_t Opcode);
LLVM_READONLY
+ int getSDWAOp(uint16_t Opcode);
+
+ LLVM_READONLY
int getCommuteRev(uint16_t Opcode);
LLVM_READONLY
diff --git a/lib/Target/AMDGPU/SIInstrInfo.td b/lib/Target/AMDGPU/SIInstrInfo.td
index ebaefae3bfef..c6daf743f3ac 100644
--- a/lib/Target/AMDGPU/SIInstrInfo.td
+++ b/lib/Target/AMDGPU/SIInstrInfo.td
@@ -71,11 +71,6 @@ def SIbuffer_load : SDNode <"AMDGPUISD::BUFFER_LOAD", SDTBufferLoad,
def SIbuffer_load_format : SDNode <"AMDGPUISD::BUFFER_LOAD_FORMAT", SDTBufferLoad,
[SDNPMemOperand, SDNPHasChain, SDNPMayLoad]>;
-def SIload_input : SDNode<"AMDGPUISD::LOAD_INPUT",
- SDTypeProfile<1, 3, [SDTCisVT<0, v4f32>, SDTCisVT<1, v4i32>, SDTCisVT<2, i16>,
- SDTCisVT<3, i32>]>
->;
-
class SDSample<string opcode> : SDNode <opcode,
SDTypeProfile<1, 4, [SDTCisVT<0, v4f32>, SDTCisVT<2, v8i32>,
SDTCisVT<3, v4i32>, SDTCisVT<4, i32>]>
@@ -107,7 +102,7 @@ def SIld_local : SDNode <"ISD::LOAD", SDTLoad,
>;
def si_ld_local : PatFrag <(ops node:$ptr), (SIld_local node:$ptr), [{
- return cast<LoadSDNode>(N)->getAddressSpace() == AMDGPUAS::LOCAL_ADDRESS;
+ return cast<LoadSDNode>(N)->getAddressSpace() == AMDGPUASI.LOCAL_ADDRESS;
}]>;
def si_load_local : PatFrag <(ops node:$ptr), (si_ld_local node:$ptr), [{
@@ -144,7 +139,7 @@ def SIst_local : SDNode <"ISD::STORE", SDTStore,
def si_st_local : PatFrag <
(ops node:$val, node:$ptr), (SIst_local node:$val, node:$ptr), [{
- return cast<StoreSDNode>(N)->getAddressSpace() == AMDGPUAS::LOCAL_ADDRESS;
+ return cast<StoreSDNode>(N)->getAddressSpace() == AMDGPUASI.LOCAL_ADDRESS;
}]>;
def si_store_local : PatFrag <
@@ -196,6 +191,21 @@ def si_uniform_br_scc : PatFrag <
return isCBranchSCC(N);
}]>;
+def lshr_rev : PatFrag <
+ (ops node:$src1, node:$src0),
+ (srl $src0, $src1)
+>;
+
+def ashr_rev : PatFrag <
+ (ops node:$src1, node:$src0),
+ (sra $src0, $src1)
+>;
+
+def lshl_rev : PatFrag <
+ (ops node:$src1, node:$src0),
+ (shl $src0, $src1)
+>;
+
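These fragments exist because the rev-form VALU shifts take the shift amount in src0 and the value in src1, the reverse of the generic ISD shift nodes. A reference model (illustrative only):

```cpp
#include <cstdint>

// v_lshlrev/v_lshrrev/v_ashrrev take (amount, value) rather than
// (value, amount); the swapped PatFrag operands express exactly that.
constexpr uint32_t lshlrev(uint32_t amt, uint32_t val) { return val << amt; }
constexpr uint32_t lshrrev(uint32_t amt, uint32_t val) { return val >> amt; }
constexpr int32_t  ashrrev(uint32_t amt, int32_t val)  { return val >> amt; }
static_assert(lshlrev(4, 1) == 16, "the shift amount comes first");
```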
multiclass SIAtomicM0Glue2 <string op_name, bit is_amdgpu = 0> {
def _glue : SDNode <
@@ -266,10 +276,6 @@ def SIMM16bit : PatLeaf <(imm),
[{return isInt<16>(N->getSExtValue());}]
>;
-def IMM20bit : PatLeaf <(imm),
- [{return isUInt<20>(N->getZExtValue());}]
->;
-
class InlineImm <ValueType vt> : PatLeaf <(vt imm), [{
return isInlineImmediate(N);
}]>;
@@ -299,6 +305,19 @@ class VGPRImm <dag frag> : PatLeaf<frag, [{
return Limit < 10;
}]>;
+def NegateImm : SDNodeXForm<imm, [{
+ return CurDAG->getConstant(-N->getSExtValue(), SDLoc(N), MVT::i32);
+}]>;
+
+// TODO: Extend this to FP once inline FP immediate values work.
+def NegSubInlineConst32 : ImmLeaf<i32, [{
+ return Imm < -16 && Imm >= -64;
+}], NegateImm>;
+
+def NegSubInlineConst16 : ImmLeaf<i16, [{
+ return Imm < -16 && Imm >= -64;
+}], NegateImm>;
+
//===----------------------------------------------------------------------===//
// Custom Operands
//===----------------------------------------------------------------------===//
@@ -449,6 +468,12 @@ class NamedOperandU32<string Name, AsmOperandClass MatchClass> : Operand<i32> {
let ParserMatchClass = MatchClass;
}
+class NamedOperandU32Default0<string Name, AsmOperandClass MatchClass> :
+ OperandWithDefaultOps<i32, (ops (i32 0))> {
+ let PrintMethod = "print"#Name;
+ let ParserMatchClass = MatchClass;
+}
+
let OperandType = "OPERAND_IMMEDIATE" in {
def offen : NamedOperandBit<"Offen", NamedMatchClass<"Offen">>;
@@ -486,6 +511,11 @@ def src0_sel : NamedOperandU32<"SDWASrc0Sel", NamedMatchClass<"SDWASrc0Sel">>;
def src1_sel : NamedOperandU32<"SDWASrc1Sel", NamedMatchClass<"SDWASrc1Sel">>;
def dst_unused : NamedOperandU32<"SDWADstUnused", NamedMatchClass<"SDWADstUnused">>;
+def op_sel : NamedOperandU32Default0<"OpSel", NamedMatchClass<"OpSel">>;
+def op_sel_hi : NamedOperandU32Default0<"OpSelHi", NamedMatchClass<"OpSelHi">>;
+def neg_lo : NamedOperandU32Default0<"NegLo", NamedMatchClass<"NegLo">>;
+def neg_hi : NamedOperandU32Default0<"NegHi", NamedMatchClass<"NegHi">>;
+
def hwreg : NamedOperandU16<"Hwreg", NamedMatchClass<"Hwreg", 0>>;
def exp_tgt : NamedOperandU8<"ExpTgt", NamedMatchClass<"ExpTgt", 0>> {
@@ -525,6 +555,7 @@ class FPInputModsMatchClass <int opSize> : AsmOperandClass {
let ParserMethod = "parseRegOrImmWithFPInputMods";
let PredicateMethod = "isRegOrImmWithFP"#opSize#"InputMods";
}
+
def FP16InputModsMatchClass : FPInputModsMatchClass<16>;
def FP32InputModsMatchClass : FPInputModsMatchClass<32>;
def FP64InputModsMatchClass : FPInputModsMatchClass<64>;
@@ -577,6 +608,33 @@ def IntVRegInputMods : InputMods <IntVRegInputModsMatchClass> {
let PrintMethod = "printOperandAndIntInputMods";
}
+class PackedFPInputModsMatchClass <int opSize> : AsmOperandClass {
+ let Name = "PackedFP"#opSize#"InputMods";
+ let ParserMethod = "parseRegOrImm";
+ let PredicateMethod = "isRegOrImm";
+// let PredicateMethod = "isPackedFP"#opSize#"InputMods";
+}
+
+class PackedIntInputModsMatchClass <int opSize> : AsmOperandClass {
+ let Name = "PackedInt"#opSize#"InputMods";
+ let ParserMethod = "parseRegOrImm";
+ let PredicateMethod = "isRegOrImm";
+// let PredicateMethod = "isPackedInt"#opSize#"InputMods";
+}
+
+def PackedF16InputModsMatchClass : PackedFPInputModsMatchClass<16>;
+def PackedI16InputModsMatchClass : PackedIntInputModsMatchClass<16>;
+
+class PackedFPInputMods <PackedFPInputModsMatchClass matchClass> : InputMods <matchClass> {
+// let PrintMethod = "printPackedFPInputMods";
+}
+
+class PackedIntInputMods <PackedIntInputModsMatchClass matchClass> : InputMods <matchClass> {
+  // let PrintMethod = "printPackedIntInputMods";
+}
+
+def PackedF16InputMods : PackedFPInputMods<PackedF16InputModsMatchClass>;
+def PackedI16InputMods : PackedIntInputMods<PackedI16InputModsMatchClass>;
//===----------------------------------------------------------------------===//
// Complex patterns
@@ -593,6 +651,14 @@ def VOP3Mods0Clamp : ComplexPattern<untyped, 3, "SelectVOP3Mods0Clamp">;
def VOP3Mods0Clamp0OMod : ComplexPattern<untyped, 4, "SelectVOP3Mods0Clamp0OMod">;
def VOP3Mods : ComplexPattern<untyped, 2, "SelectVOP3Mods">;
def VOP3NoMods : ComplexPattern<untyped, 2, "SelectVOP3NoMods">;
+// VOP3Mods, but the input source is known to never be NaN.
+def VOP3Mods_nnan : ComplexPattern<fAny, 2, "SelectVOP3Mods_NNaN">;
+
+def VOP3OMods : ComplexPattern<untyped, 3, "SelectVOP3OMods">;
+
+def VOP3PMods : ComplexPattern<untyped, 2, "SelectVOP3PMods">;
+def VOP3PMods0 : ComplexPattern<untyped, 3, "SelectVOP3PMods0">;
+
//===----------------------------------------------------------------------===//
// SI assembler operands
@@ -604,19 +670,32 @@ def SIOperand {
int FLAT_SCR = 0x68;
}
+// This should be kept in sync with SISrcMods enum
def SRCMODS {
int NONE = 0;
int NEG = 1;
+ int ABS = 2;
+ int NEG_ABS = 3;
+
+ int NEG_HI = ABS;
+ int OP_SEL_0 = 4;
+ int OP_SEL_1 = 8;
}
def DSTCLAMP {
int NONE = 0;
+ int ENABLE = 1;
}
def DSTOMOD {
int NONE = 0;
}
+def TRAPID {
+ int LLVM_TRAP = 2;
+ int LLVM_DEBUG_TRAP = 3;
+}
+
//===----------------------------------------------------------------------===//
//
// SI Instruction multiclass helpers.
@@ -648,8 +727,9 @@ class EXP_Helper<bit done, SDPatternOperator node = null_frag> : EXPCommon<
ExpSrc0:$src0, ExpSrc1:$src1, ExpSrc2:$src2, ExpSrc3:$src3,
exp_vm:$vm, exp_compr:$compr, i8imm:$en),
"exp$tgt $src0, $src1, $src2, $src3"#!if(done, " done", "")#"$compr$vm",
- [(node (i8 timm:$en), (i1 timm:$vm), (i8 timm:$tgt), (i1 timm:$compr),
- f32:$src0, f32:$src1, f32:$src2, f32:$src3)]> {
+ [(node (i8 timm:$tgt), (i8 timm:$en),
+ f32:$src0, f32:$src1, f32:$src2, f32:$src3,
+ (i1 timm:$compr), (i1 timm:$vm))]> {
let AsmMatchConverter = "cvtExp";
}
@@ -666,6 +746,7 @@ multiclass EXP_m<bit done, SDPatternOperator node> {
def _si : EXP_Helper<done>,
SIMCInstr <"exp"#!if(done, "_done", ""), SIEncodingFamily.SI>,
EXPe {
+ let AssemblerPredicates = [isSICI];
let DecoderNamespace = "SICI";
let DisableDecoder = DisableSIDecoder;
}
@@ -673,6 +754,7 @@ multiclass EXP_m<bit done, SDPatternOperator node> {
def _vi : EXP_Helper<done>,
SIMCInstr <"exp"#!if(done, "_done", ""), SIEncodingFamily.VI>,
EXPe_vi {
+ let AssemblerPredicates = [isVI];
let DecoderNamespace = "VI";
let DisableDecoder = DisableVIDecoder;
}
@@ -706,12 +788,34 @@ class getVALUDstForVT<ValueType VT> {
// instructions for the given VT.
class getVOPSrc0ForVT<ValueType VT> {
bit isFP = !if(!eq(VT.Value, f16.Value), 1,
+ !if(!eq(VT.Value, v2f16.Value), 1,
!if(!eq(VT.Value, f32.Value), 1,
!if(!eq(VT.Value, f64.Value), 1,
- 0)));
- RegisterOperand ret = !if(isFP,
- !if(!eq(VT.Size, 64), VSrc_f64, !if(!eq(VT.Size, 16), VSrc_f16, VSrc_f32)),
- !if(!eq(VT.Size, 64), VSrc_b64, !if(!eq(VT.Size, 16), VSrc_b16, VSrc_b32)));
+ 0))));
+
+ RegisterOperand ret =
+ !if(isFP,
+ !if(!eq(VT.Size, 64),
+ VSrc_f64,
+ !if(!eq(VT.Value, f16.Value),
+ VSrc_f16,
+ !if(!eq(VT.Value, v2f16.Value),
+ VCSrc_v2f16,
+ VSrc_f32
+ )
+ )
+ ),
+ !if(!eq(VT.Size, 64),
+ VSrc_b64,
+ !if(!eq(VT.Value, i16.Value),
+ VSrc_b16,
+ !if(!eq(VT.Value, v2i16.Value),
+ VCSrc_v2b16,
+ VSrc_b32
+ )
+ )
+ )
+ );
}
// Returns the vreg register class to use for source operand given VT
@@ -725,25 +829,38 @@ class getVregSrcForVT<ValueType VT> {
// given VT.
class getVOP3SrcForVT<ValueType VT> {
bit isFP = !if(!eq(VT.Value, f16.Value), 1,
+ !if(!eq(VT.Value, v2f16.Value), 1,
!if(!eq(VT.Value, f32.Value), 1,
!if(!eq(VT.Value, f64.Value), 1,
- 0)));
+ 0))));
RegisterOperand ret =
!if(!eq(VT.Size, 128),
- VSrc_128,
- !if(!eq(VT.Size, 64),
+ VSrc_128,
+ !if(!eq(VT.Size, 64),
!if(isFP,
- VCSrc_f64,
- VCSrc_b64),
+ VCSrc_f64,
+ VCSrc_b64),
!if(!eq(VT.Value, i1.Value),
- SCSrc_b64,
- !if(isFP,
- !if(!eq(VT.Size, 16), VCSrc_f16, VCSrc_f32),
- !if(!eq(VT.Size, 16), VCSrc_b16, VCSrc_b32)
- )
- )
- )
- );
+ SCSrc_b64,
+ !if(isFP,
+ !if(!eq(VT.Value, f16.Value),
+ VCSrc_f16,
+ !if(!eq(VT.Value, v2f16.Value),
+ VCSrc_v2f16,
+ VCSrc_f32
+ )
+ ),
+ !if(!eq(VT.Value, i16.Value),
+ VCSrc_b16,
+ !if(!eq(VT.Value, v2i16.Value),
+ VCSrc_v2b16,
+ VCSrc_b32
+ )
+ )
+ )
+ )
+ )
+ );
}
// Returns 1 if the source arguments have modifiers, 0 if they do not.
@@ -753,7 +870,8 @@ class isFloatType<ValueType SrcVT> {
!if(!eq(SrcVT.Value, f16.Value), 1,
!if(!eq(SrcVT.Value, f32.Value), 1,
!if(!eq(SrcVT.Value, f64.Value), 1,
- 0)));
+ !if(!eq(SrcVT.Value, v2f16.Value), 1,
+ 0))));
}
class isIntType<ValueType SrcVT> {
@@ -764,6 +882,23 @@ class isIntType<ValueType SrcVT> {
0)));
}
+class isPackedType<ValueType SrcVT> {
+ bit ret =
+ !if(!eq(SrcVT.Value, v2i16.Value), 1,
+ !if(!eq(SrcVT.Value, v2f16.Value), 1, 0)
+ );
+}
+
+// Float or packed int
+class isModifierType<ValueType SrcVT> {
+ bit ret =
+ !if(!eq(SrcVT.Value, f16.Value), 1,
+ !if(!eq(SrcVT.Value, f32.Value), 1,
+ !if(!eq(SrcVT.Value, f64.Value), 1,
+ !if(!eq(SrcVT.Value, v2f16.Value), 1,
+ !if(!eq(SrcVT.Value, v2i16.Value), 1,
+ 0)))));
+}
// Return type of input modifiers operand for specified input operand
class getSrcMod <ValueType VT> {
@@ -771,6 +906,7 @@ class getSrcMod <ValueType VT> {
!if(!eq(VT.Value, f32.Value), 1,
!if(!eq(VT.Value, f64.Value), 1,
0)));
+ bit isPacked = isPackedType<VT>.ret;
Operand ret = !if(!eq(VT.Size, 64),
!if(isFP, FP64InputMods, Int64InputMods),
!if(isFP,
@@ -801,8 +937,8 @@ class getIns32 <RegisterOperand Src0RC, RegisterClass Src1RC, int NumSrcArgs> {
// Returns the input arguments for VOP3 instructions for the given SrcVT.
class getIns64 <RegisterOperand Src0RC, RegisterOperand Src1RC,
RegisterOperand Src2RC, int NumSrcArgs,
- bit HasModifiers, Operand Src0Mod, Operand Src1Mod,
- Operand Src2Mod> {
+ bit HasModifiers, bit HasOMod,
+ Operand Src0Mod, Operand Src1Mod, Operand Src2Mod> {
dag ret =
!if (!eq(NumSrcArgs, 0),
@@ -821,9 +957,13 @@ class getIns64 <RegisterOperand Src0RC, RegisterOperand Src1RC,
!if (!eq(NumSrcArgs, 2),
!if (!eq(HasModifiers, 1),
// VOP 2 with modifiers
- (ins Src0Mod:$src0_modifiers, Src0RC:$src0,
- Src1Mod:$src1_modifiers, Src1RC:$src1,
- clampmod:$clamp, omod:$omod)
+ !if( !eq(HasOMod, 1),
+ (ins Src0Mod:$src0_modifiers, Src0RC:$src0,
+ Src1Mod:$src1_modifiers, Src1RC:$src1,
+ clampmod:$clamp, omod:$omod),
+ (ins Src0Mod:$src0_modifiers, Src0RC:$src0,
+ Src1Mod:$src1_modifiers, Src1RC:$src1,
+ clampmod:$clamp))
/* else */,
// VOP2 without modifiers
(ins Src0RC:$src0, Src1RC:$src1)
@@ -831,16 +971,57 @@ class getIns64 <RegisterOperand Src0RC, RegisterOperand Src1RC,
/* NumSrcArgs == 3 */,
!if (!eq(HasModifiers, 1),
// VOP3 with modifiers
- (ins Src0Mod:$src0_modifiers, Src0RC:$src0,
- Src1Mod:$src1_modifiers, Src1RC:$src1,
- Src2Mod:$src2_modifiers, Src2RC:$src2,
- clampmod:$clamp, omod:$omod)
+ !if (!eq(HasOMod, 1),
+ (ins Src0Mod:$src0_modifiers, Src0RC:$src0,
+ Src1Mod:$src1_modifiers, Src1RC:$src1,
+ Src2Mod:$src2_modifiers, Src2RC:$src2,
+ clampmod:$clamp, omod:$omod),
+ (ins Src0Mod:$src0_modifiers, Src0RC:$src0,
+ Src1Mod:$src1_modifiers, Src1RC:$src1,
+ Src2Mod:$src2_modifiers, Src2RC:$src2,
+ clampmod:$clamp))
/* else */,
// VOP3 without modifiers
(ins Src0RC:$src0, Src1RC:$src1, Src2RC:$src2)
/* endif */ ))));
}
+/// XXX - src1 may only allow VGPRs?
+
+// The modifiers (except clamp) are dummy operands for the benefit of
+// printing and parsing. Their printed values are taken from the
+// srcN_modifiers operands instead.
+class getInsVOP3P <RegisterOperand Src0RC, RegisterOperand Src1RC,
+ RegisterOperand Src2RC, int NumSrcArgs,
+ bit HasClamp,
+ Operand Src0Mod, Operand Src1Mod, Operand Src2Mod> {
+ dag ret = !if (!eq(NumSrcArgs, 2),
+ !if (HasClamp,
+ (ins Src0Mod:$src0_modifiers, Src0RC:$src0,
+ Src1Mod:$src1_modifiers, Src1RC:$src1,
+ clampmod:$clamp,
+ op_sel:$op_sel, op_sel_hi:$op_sel_hi,
+ neg_lo:$neg_lo, neg_hi:$neg_hi),
+ (ins Src0Mod:$src0_modifiers, Src0RC:$src0,
+ Src1Mod:$src1_modifiers, Src1RC:$src1,
+ op_sel:$op_sel, op_sel_hi:$op_sel_hi,
+ neg_lo:$neg_lo, neg_hi:$neg_hi)),
+ // else NumSrcArgs == 3
+ !if (HasClamp,
+ (ins Src0Mod:$src0_modifiers, Src0RC:$src0,
+ Src1Mod:$src1_modifiers, Src1RC:$src1,
+ Src2Mod:$src2_modifiers, Src2RC:$src2,
+ clampmod:$clamp,
+ op_sel:$op_sel, op_sel_hi:$op_sel_hi,
+ neg_lo:$neg_lo, neg_hi:$neg_hi),
+ (ins Src0Mod:$src0_modifiers, Src0RC:$src0,
+ Src1Mod:$src1_modifiers, Src1RC:$src1,
+ Src2Mod:$src2_modifiers, Src2RC:$src2,
+ op_sel:$op_sel, op_sel_hi:$op_sel_hi,
+ neg_lo:$neg_lo, neg_hi:$neg_hi))
+ );
+}
+
class getInsDPP <RegisterClass Src0RC, RegisterClass Src1RC, int NumSrcArgs,
bit HasModifiers, Operand Src0Mod, Operand Src1Mod> {
@@ -924,7 +1105,8 @@ class getAsm32 <bit HasDst, int NumSrcArgs, ValueType DstVT = i32> {
// Returns the assembly string for the inputs and outputs of a VOP3
// instruction.
-class getAsm64 <bit HasDst, int NumSrcArgs, bit HasModifiers, ValueType DstVT = i32> {
+class getAsm64 <bit HasDst, int NumSrcArgs, bit HasModifiers,
+ bit HasOMod, ValueType DstVT = i32> {
string dst = !if(!eq(DstVT.Size, 1), "$sdst", "$vdst"); // use $sdst for VOPC
string src0 = !if(!eq(NumSrcArgs, 1), "$src0_modifiers", "$src0_modifiers,");
string src1 = !if(!eq(NumSrcArgs, 1), "",
@@ -934,7 +1116,26 @@ class getAsm64 <bit HasDst, int NumSrcArgs, bit HasModifiers, ValueType DstVT =
string ret =
!if(!eq(HasModifiers, 0),
getAsm32<HasDst, NumSrcArgs, DstVT>.ret,
- dst#", "#src0#src1#src2#"$clamp"#"$omod");
+ dst#", "#src0#src1#src2#"$clamp"#!if(HasOMod, "$omod", ""));
+}
+
+// Returns the assembly string for the inputs and outputs of a VOP3P
+// instruction.
+class getAsmVOP3P <bit HasDst, int NumSrcArgs, bit HasModifiers,
+ bit HasClamp, ValueType DstVT = i32> {
+ string dst = " $vdst";
+ string src0 = !if(!eq(NumSrcArgs, 1), "$src0", "$src0,");
+ string src1 = !if(!eq(NumSrcArgs, 1), "",
+ !if(!eq(NumSrcArgs, 2), " $src1",
+ " $src1,"));
+ string src2 = !if(!eq(NumSrcArgs, 3), " $src2", "");
+
+ string mods = !if(HasModifiers, "$neg_lo$neg_hi", "");
+ string clamp = !if(HasClamp, "$clamp", "");
+
+ // Each modifier is printed as an array of bits for each operand, so
+ // all operands are printed as part of src0_modifiers.
+ string ret = dst#", "#src0#src1#src2#"$op_sel$op_sel_hi"#mods#clamp;
}
class getAsmDPP <bit HasDst, int NumSrcArgs, bit HasModifiers, ValueType DstVT = i32> {
@@ -1035,7 +1236,7 @@ class VOPProfile <list<ValueType> _ArgVT> {
field Operand Src1ModDPP = getSrcModExt<Src1VT>.ret;
field Operand Src0ModSDWA = getSrcModExt<Src0VT>.ret;
field Operand Src1ModSDWA = getSrcModExt<Src1VT>.ret;
-
+
field bit HasDst = !if(!eq(DstVT.Value, untyped.Value), 0, 1);
field bit HasDst32 = HasDst;
@@ -1046,7 +1247,7 @@ class VOPProfile <list<ValueType> _ArgVT> {
field bit HasSrc2 = !if(!eq(Src2VT.Value, untyped.Value), 0, 1);
// TODO: Modifiers logic is somewhat ad hoc here; to be refined later.
- field bit HasModifiers = isFloatType<Src0VT>.ret;
+ field bit HasModifiers = isModifierType<Src0VT>.ret;
field bit HasSrc0FloatMods = isFloatType<Src0VT>.ret;
field bit HasSrc1FloatMods = isFloatType<Src1VT>.ret;
@@ -1060,12 +1261,20 @@ class VOPProfile <list<ValueType> _ArgVT> {
field bit HasSrc1Mods = !if(HasModifiers, BitOr<HasSrc1FloatMods, HasSrc1IntMods>.ret, 0);
field bit HasSrc2Mods = !if(HasModifiers, BitOr<HasSrc2FloatMods, HasSrc2IntMods>.ret, 0);
- field bit HasOMod = HasModifiers;
field bit HasClamp = HasModifiers;
field bit HasSDWAClamp = HasSrc0;
+ field bit HasFPClamp = BitAnd<isFloatType<DstVT>.ret, HasClamp>.ret;
+
+ field bit IsPacked = isPackedType<Src0VT>.ret;
+ field bit HasOpSel = IsPacked;
+ field bit HasOMod = !if(HasOpSel, 0, HasModifiers);
field bit HasExt = getHasExt<NumSrcArgs, DstVT, Src0VT, Src1VT>.ret;
+ field Operand Src0PackedMod = !if(HasSrc0FloatMods, PackedF16InputMods, PackedI16InputMods);
+ field Operand Src1PackedMod = !if(HasSrc1FloatMods, PackedF16InputMods, PackedI16InputMods);
+ field Operand Src2PackedMod = !if(HasSrc2FloatMods, PackedF16InputMods, PackedI16InputMods);
+
field dag Outs = !if(HasDst,(outs DstRC:$vdst),(outs));
// VOP3b instructions are a special case with a second explicit
@@ -1077,7 +1286,12 @@ class VOPProfile <list<ValueType> _ArgVT> {
field dag Ins32 = getIns32<Src0RC32, Src1RC32, NumSrcArgs>.ret;
field dag Ins64 = getIns64<Src0RC64, Src1RC64, Src2RC64, NumSrcArgs,
- HasModifiers, Src0Mod, Src1Mod, Src2Mod>.ret;
+ HasModifiers, HasOMod, Src0Mod, Src1Mod,
+ Src2Mod>.ret;
+ field dag InsVOP3P = getInsVOP3P<Src0RC64, Src1RC64, Src2RC64,
+ NumSrcArgs, HasClamp,
+ Src0PackedMod, Src1PackedMod, Src2PackedMod>.ret;
+
field dag InsDPP = getInsDPP<Src0DPP, Src1DPP, NumSrcArgs,
HasModifiers, Src0ModDPP, Src1ModDPP>.ret;
field dag InsSDWA = getInsSDWA<Src0SDWA, Src1SDWA, NumSrcArgs,
@@ -1085,7 +1299,8 @@ class VOPProfile <list<ValueType> _ArgVT> {
DstVT>.ret;
field string Asm32 = getAsm32<HasDst, NumSrcArgs, DstVT>.ret;
- field string Asm64 = getAsm64<HasDst, NumSrcArgs, HasModifiers, DstVT>.ret;
+ field string Asm64 = getAsm64<HasDst, NumSrcArgs, HasModifiers, HasOMod, DstVT>.ret;
+ field string AsmVOP3P = getAsmVOP3P<HasDst, NumSrcArgs, HasModifiers, HasClamp, DstVT>.ret;
field string AsmDPP = getAsmDPP<HasDst, NumSrcArgs, HasModifiers, DstVT>.ret;
field string AsmSDWA = getAsmSDWA<HasDst, NumSrcArgs, HasModifiers, DstVT>.ret;
}
@@ -1101,11 +1316,18 @@ def VOP_I16_F16 : VOPProfile <[i16, f16, untyped, untyped]>;
def VOP_F16_F16_F16 : VOPProfile <[f16, f16, f16, untyped]>;
def VOP_F16_F16_I16 : VOPProfile <[f16, f16, i16, untyped]>;
def VOP_F16_F16_I32 : VOPProfile <[f16, f16, i32, untyped]>;
-def VOP_I16_I16_I16 : VOPProfile <[i32, i32, i32, untyped]>;
+def VOP_I16_I16_I16 : VOPProfile <[i16, i16, i16, untyped]>;
-def VOP_I16_I16_I16_I16 : VOPProfile <[i32, i32, i32, i32, untyped]>;
+def VOP_I16_I16_I16_I16 : VOPProfile <[i16, i16, i16, i16, untyped]>;
def VOP_F16_F16_F16_F16 : VOPProfile <[f16, f16, f16, f16, untyped]>;
+def VOP_V2F16_V2F16_V2F16 : VOPProfile <[v2f16, v2f16, v2f16, untyped]>;
+def VOP_V2I16_V2I16_V2I16 : VOPProfile <[v2i16, v2i16, v2i16, untyped]>;
+def VOP_B32_F16_F16 : VOPProfile <[i32, f16, f16, untyped]>;
+
+def VOP_V2F16_V2F16_V2F16_V2F16 : VOPProfile <[v2f16, v2f16, v2f16, v2f16]>;
+def VOP_V2I16_V2I16_V2I16_V2I16 : VOPProfile <[v2i16, v2i16, v2i16, v2i16]>;
+
def VOP_NONE : VOPProfile <[untyped, untyped, untyped, untyped]>;
def VOP_F32_F32 : VOPProfile <[f32, f32, untyped, untyped]>;
@@ -1117,6 +1339,8 @@ def VOP_F64_I32 : VOPProfile <[f64, i32, untyped, untyped]>;
def VOP_I32_F32 : VOPProfile <[i32, f32, untyped, untyped]>;
def VOP_I32_F64 : VOPProfile <[i32, f64, untyped, untyped]>;
def VOP_I32_I32 : VOPProfile <[i32, i32, untyped, untyped]>;
+def VOP_F16_F32 : VOPProfile <[f16, f32, untyped, untyped]>;
+def VOP_F32_F16 : VOPProfile <[f32, f16, untyped, untyped]>;
def VOP_F32_F32_F16 : VOPProfile <[f32, f32, f16, untyped]>;
def VOP_F32_F32_F32 : VOPProfile <[f32, f32, f32, untyped]>;
@@ -1126,6 +1350,7 @@ def VOP_F64_F64_I32 : VOPProfile <[f64, f64, i32, untyped]>;
def VOP_I32_F32_F32 : VOPProfile <[i32, f32, f32, untyped]>;
def VOP_I32_F32_I32 : VOPProfile <[i32, f32, i32, untyped]>;
def VOP_I32_I32_I32 : VOPProfile <[i32, i32, i32, untyped]>;
+def VOP_V2F16_F32_F32 : VOPProfile <[v2f16, f32, f32, untyped]>;
def VOP_I64_I64_I32 : VOPProfile <[i64, i64, i32, untyped]>;
def VOP_I64_I32_I64 : VOPProfile <[i64, i32, i64, untyped]>;
@@ -1213,6 +1438,15 @@ def getVOPe32 : InstrMapping {
let ValueCols = [["4", "0"]];
}
+// Maps ordinary instructions to their SDWA counterparts
+def getSDWAOp : InstrMapping {
+ let FilterClass = "VOP";
+ let RowFields = ["OpName"];
+ let ColFields = ["AsmVariantName"];
+ let KeyCol = ["Default"];
+ let ValueCols = [["SDWA"]];
+}
+
def getMaskedMIMGOp : InstrMapping {
let FilterClass = "MIMG_Mask";
let RowFields = ["Op"];
diff --git a/lib/Target/AMDGPU/SIInstructions.td b/lib/Target/AMDGPU/SIInstructions.td
index 38e31e75ee67..2f89503e129a 100644
--- a/lib/Target/AMDGPU/SIInstructions.td
+++ b/lib/Target/AMDGPU/SIInstructions.td
@@ -111,6 +111,12 @@ def V_MOV_B64_PSEUDO : VPseudoInstSI <(outs VReg_64:$vdst),
(ins VSrc_b64:$src0)>;
} // End let hasSideEffects = 0, mayLoad = 0, mayStore = 0, Uses = [EXEC]
+def S_TRAP_PSEUDO : SPseudoInstSI <(outs), (ins i16imm:$simm16)> {
+ let hasSideEffects = 1;
+ let SALU = 1;
+ let usesCustomInserter = 1;
+}
+
let usesCustomInserter = 1, SALU = 1 in {
def GET_GROUPSTATICSIZE : PseudoInstSI <(outs SReg_32:$sdst), (ins),
[(set SReg_32:$sdst, (int_amdgcn_groupstaticsize))]>;
@@ -146,6 +152,8 @@ def WAVE_BARRIER : SPseudoInstSI<(outs), (ins),
let mayStore = 1;
let isBarrier = 1;
let isConvergent = 1;
+ let FixedSize = 1;
+ let Size = 0;
}
// SI pseudo instructions. These are used by the CFG structurizer pass
@@ -153,48 +161,44 @@ def WAVE_BARRIER : SPseudoInstSI<(outs), (ins),
// Dummy terminator instruction to use after control flow instructions
// replaced with exec mask operations.
-def SI_MASK_BRANCH : PseudoInstSI <
+def SI_MASK_BRANCH : VPseudoInstSI <
(outs), (ins brtarget:$target)> {
let isBranch = 0;
let isTerminator = 1;
let isBarrier = 0;
- let Uses = [EXEC];
let SchedRW = [];
let hasNoSchedulingInfo = 1;
+ let FixedSize = 1;
+ let Size = 0;
}
let isTerminator = 1 in {
def SI_IF: CFPseudoInstSI <
(outs SReg_64:$dst), (ins SReg_64:$vcc, brtarget:$target),
- [(set i64:$dst, (int_amdgcn_if i1:$vcc, bb:$target))], 1, 1> {
+ [(set i64:$dst, (AMDGPUif i1:$vcc, bb:$target))], 1, 1> {
let Constraints = "";
let Size = 12;
- let mayLoad = 1;
- let mayStore = 1;
let hasSideEffects = 1;
}
def SI_ELSE : CFPseudoInstSI <
- (outs SReg_64:$dst), (ins SReg_64:$src, brtarget:$target, i1imm:$execfix), [], 1, 1> {
+ (outs SReg_64:$dst),
+ (ins SReg_64:$src, brtarget:$target, i1imm:$execfix), [], 1, 1> {
let Constraints = "$src = $dst";
let Size = 12;
- let mayStore = 1;
- let mayLoad = 1;
let hasSideEffects = 1;
}
def SI_LOOP : CFPseudoInstSI <
(outs), (ins SReg_64:$saved, brtarget:$target),
- [(int_amdgcn_loop i64:$saved, bb:$target)], 1, 1> {
+ [(AMDGPUloop i64:$saved, bb:$target)], 1, 1> {
let Size = 8;
- let isBranch = 1;
+ let isBranch = 0;
let hasSideEffects = 1;
- let mayLoad = 1;
- let mayStore = 1;
}
-} // End isBranch = 1, isTerminator = 1
+} // End isTerminator = 1
def SI_END_CF : CFPseudoInstSI <
(outs), (ins SReg_64:$saved),
@@ -202,9 +206,9 @@ def SI_END_CF : CFPseudoInstSI <
let Size = 4;
let isAsCheapAsAMove = 1;
let isReMaterializable = 1;
- let mayLoad = 1;
- let mayStore = 1;
let hasSideEffects = 1;
+ let mayLoad = 1; // FIXME: Should not need memory flags
+ let mayStore = 1;
}
def SI_BREAK : CFPseudoInstSI <
@@ -244,6 +248,10 @@ def SI_KILL_TERMINATOR : SPseudoInstSI <
let isTerminator = 1;
}
+def SI_ILLEGAL_COPY : SPseudoInstSI <
+ (outs unknown:$dst), (ins unknown:$src),
+ [], " ; illegal copy $src to $dst">;
+
} // End Uses = [EXEC], Defs = [EXEC,VCC]
// Branch on undef scc. Used to avoid intermediate copy from
@@ -259,6 +267,14 @@ def SI_PS_LIVE : PseudoInstSI <
let SALU = 1;
}
+def SI_MASKED_UNREACHABLE : SPseudoInstSI <(outs), (ins),
+ [(int_amdgcn_unreachable)],
+ "; divergent unreachable"> {
+ let Size = 0;
+ let hasNoSchedulingInfo = 1;
+ let FixedSize = 1;
+}
+
// Used as an isel pseudo to directly emit initialization with an
// s_mov_b32 rather than a copy of another initialized
// register. MachineCSE skips copies, and we don't want to have to
@@ -270,12 +286,12 @@ def SI_INIT_M0 : SPseudoInstSI <(outs), (ins SSrc_b32:$src)> {
let isReMaterializable = 1;
}
-def SI_RETURN : SPseudoInstSI <
- (outs), (ins variable_ops), [(AMDGPUreturn)]> {
+// Return for returning shaders to a shader variant epilog.
+def SI_RETURN_TO_EPILOG : SPseudoInstSI <
+ (outs), (ins variable_ops), [(AMDGPUreturn_to_epilog)]> {
let isTerminator = 1;
let isBarrier = 1;
let isReturn = 1;
- let hasSideEffects = 1;
let hasNoSchedulingInfo = 1;
let DisableWQM = 1;
}
@@ -383,9 +399,18 @@ def SI_PC_ADD_REL_OFFSET : SPseudoInstSI <
} // End SubtargetPredicate = isGCN
let Predicates = [isGCN] in {
+def : Pat<
+ (trap),
+ (S_TRAP_PSEUDO TRAPID.LLVM_TRAP)
+>;
def : Pat<
- (int_amdgcn_else i64:$src, bb:$target),
+ (debugtrap),
+ (S_TRAP_PSEUDO TRAPID.LLVM_DEBUG_TRAP)
+>;
+
+def : Pat<
+ (AMDGPUelse i64:$src, bb:$target),
(SI_ELSE $src, $target, 0)
>;
@@ -423,24 +448,37 @@ def : Pat <
} // End Predicates = [UnsafeFPMath]
+
+// f16_to_fp patterns
def : Pat <
- (f32 (fpextend f16:$src)),
- (V_CVT_F32_F16_e32 $src)
+ (f32 (f16_to_fp i32:$src0)),
+ (V_CVT_F32_F16_e64 SRCMODS.NONE, $src0, DSTCLAMP.NONE, DSTOMOD.NONE)
>;
def : Pat <
- (f64 (fpextend f16:$src)),
- (V_CVT_F64_F32_e32 (V_CVT_F32_F16_e32 $src))
+ (f32 (f16_to_fp (and_oneuse i32:$src0, 0x7fff))),
+ (V_CVT_F32_F16_e64 SRCMODS.ABS, $src0, DSTCLAMP.NONE, DSTOMOD.NONE)
+>;
+
+def : Pat <
+ (f32 (f16_to_fp (or_oneuse i32:$src0, 0x8000))),
+ (V_CVT_F32_F16_e64 SRCMODS.NEG_ABS, $src0, DSTCLAMP.NONE, DSTOMOD.NONE)
+>;
+
+def : Pat <
+ (f32 (f16_to_fp (xor_oneuse i32:$src0, 0x8000))),
+ (V_CVT_F32_F16_e64 SRCMODS.NEG, $src0, DSTCLAMP.NONE, DSTOMOD.NONE)
>;
def : Pat <
- (f16 (fpround f32:$src)),
- (V_CVT_F16_F32_e32 $src)
+ (f64 (fpextend f16:$src)),
+ (V_CVT_F64_F32_e32 (V_CVT_F32_F16_e32 $src))
>;
+// fp_to_fp16 patterns
def : Pat <
- (f16 (fpround f64:$src)),
- (V_CVT_F16_F32_e32 (V_CVT_F32_F64_e32 $src))
+ (i32 (AMDGPUfp_to_f16 (f32 (VOP3Mods0 f32:$src0, i32:$src0_modifiers, i1:$clamp, i32:$omod)))),
+ (V_CVT_F16_F32_e64 $src0_modifiers, f32:$src0, $clamp, $omod)
>;
def : Pat <
@@ -480,6 +518,16 @@ multiclass FMADPat <ValueType vt, Instruction inst> {
defm : FMADPat <f16, V_MAC_F16_e64>;
defm : FMADPat <f32, V_MAC_F32_e64>;
+class FMADModsPat<Instruction inst, SDPatternOperator mad_opr> : Pat<
+ (f32 (mad_opr (VOP3Mods f32:$src0, i32:$src0_mod),
+ (VOP3Mods f32:$src1, i32:$src1_mod),
+ (VOP3Mods f32:$src2, i32:$src2_mod))),
+ (inst $src0_mod, $src0, $src1_mod, $src1,
+ $src2_mod, $src2, DSTCLAMP.NONE, DSTOMOD.NONE)
+>;
+
+def : FMADModsPat<V_MAD_F32, AMDGPUfmad_ftz>;
+
multiclass SelectPat <ValueType vt, Instruction inst> {
def : Pat <
(vt (select i1:$src0, vt:$src1, vt:$src2)),
@@ -578,6 +626,16 @@ def : BitConvert <i32, f32, VGPR_32>;
def : BitConvert <f32, i32, VGPR_32>;
def : BitConvert <i32, f32, SReg_32>;
def : BitConvert <f32, i32, SReg_32>;
+def : BitConvert <v2i16, i32, SReg_32>;
+def : BitConvert <i32, v2i16, SReg_32>;
+def : BitConvert <v2f16, i32, SReg_32>;
+def : BitConvert <i32, v2f16, SReg_32>;
+def : BitConvert <v2i16, v2f16, SReg_32>;
+def : BitConvert <v2f16, v2i16, SReg_32>;
+def : BitConvert <v2f16, f32, SReg_32>;
+def : BitConvert <f32, v2f16, SReg_32>;
+def : BitConvert <v2i16, f32, SReg_32>;
+def : BitConvert <f32, v2i16, SReg_32>;
// 64-bit bitcast
def : BitConvert <i64, f64, VReg_64>;
@@ -619,12 +677,20 @@ def : BitConvert <v16f32, v16i32, VReg_512>;
/********** Src & Dst modifiers **********/
/********** =================== **********/
-def : Pat <
- (AMDGPUclamp (VOP3Mods0Clamp f32:$src0, i32:$src0_modifiers, i32:$omod),
- (f32 FP_ZERO), (f32 FP_ONE)),
- (V_ADD_F32_e64 $src0_modifiers, $src0, 0, (i32 0), 1, $omod)
+
+// If denormals are not enabled, denormal flushing only affects the
+// comparison of the inputs. The output result is not flushed.
+class ClampPat<Instruction inst, ValueType vt> : Pat <
+ (vt (AMDGPUclamp
+ (VOP3Mods0Clamp vt:$src0, i32:$src0_modifiers, i32:$omod))),
+ (inst i32:$src0_modifiers, vt:$src0,
+ i32:$src0_modifiers, vt:$src0, DSTCLAMP.ENABLE, $omod)
>;
+def : ClampPat<V_MAX_F32_e64, f32>;
+def : ClampPat<V_MAX_F64, f64>;
+def : ClampPat<V_MAX_F16_e64, f16>;
+
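+// The clamp bit on an FP VALU result saturates it to [0, 1], and
+// v_max x, x is otherwise an identity, so the pattern is effectively a
+// clamped move. A sketch of the result semantics, ignoring NaN handling:
+//
+//   constexpr float clamp01(float x) {
+//     return x < 0.0f ? 0.0f : (x > 1.0f ? 1.0f : x);
+//   }
+//   static_assert(clamp01(1.5f) == 1.0f, "saturated to 1");
+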
/********** ================================ **********/
/********** Floating point absolute/negative **********/
/********** ================================ **********/
@@ -678,6 +744,37 @@ def : Pat <
>;
def : Pat <
+ (fcopysign f16:$src0, f16:$src1),
+ (V_BFI_B32 (S_MOV_B32 (i32 0x00007fff)), $src0, $src1)
+>;
+
+def : Pat <
+ (fcopysign f32:$src0, f16:$src1),
+ (V_BFI_B32 (S_MOV_B32 (i32 0x7fffffff)), $src0,
+ (V_LSHLREV_B32_e64 (i32 16), $src1))
+>;
+
+def : Pat <
+ (fcopysign f64:$src0, f16:$src1),
+ (REG_SEQUENCE SReg_64,
+ (i32 (EXTRACT_SUBREG $src0, sub0)), sub0,
+ (V_BFI_B32 (S_MOV_B32 (i32 0x7fffffff)), (i32 (EXTRACT_SUBREG $src0, sub1)),
+ (V_LSHLREV_B32_e64 (i32 16), $src1)), sub1)
+>;
+
+def : Pat <
+ (fcopysign f16:$src0, f32:$src1),
+ (V_BFI_B32 (S_MOV_B32 (i32 0x00007fff)), $src0,
+ (V_LSHRREV_B32_e64 (i32 16), $src1))
+>;
+
+def : Pat <
+ (fcopysign f16:$src0, f64:$src1),
+ (V_BFI_B32 (S_MOV_B32 (i32 0x00007fff)), $src0,
+ (V_LSHRREV_B32_e64 (i32 16), (EXTRACT_SUBREG $src1, sub1)))
+>;
+
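+// All of the above lower to v_bfi_b32, a bitfield insert: bits selected
+// by the mask come from the second operand, the rest from the third.
+// With mask 0x7fff the magnitude comes from $src0 and bit 15 (the f16
+// sign) from $src1; the shifts only move differently positioned sign
+// bits into place. A sketch (illustrative names):
+//
+//   constexpr uint32_t bfi(uint32_t m, uint32_t x, uint32_t y) {
+//     return (m & x) | (~m & y);
+//   }
+//   constexpr uint32_t copysignF16(uint32_t mag, uint32_t sgn) {
+//     return bfi(0x7fffu, mag, sgn);
+//   }
+//   static_assert(copysignF16(0x3c00u, 0x8000u) == 0xbc00u,
+//                 "copysign(1.0h, -0.0h) == -1.0h");
+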
+def : Pat <
(fneg f16:$src),
(V_XOR_B32_e32 $src, (V_MOV_B32_e32 (i32 0x00008000)))
>;
@@ -692,6 +789,25 @@ def : Pat <
(S_OR_B32 $src, (S_MOV_B32 (i32 0x00008000))) // Set sign bit
>;
+def : Pat <
+ (fneg v2f16:$src),
+ (V_XOR_B32_e64 (S_MOV_B32 (i32 0x80008000)), $src)
+>;
+
+def : Pat <
+ (fabs v2f16:$src),
+ (V_AND_B32_e64 (S_MOV_B32 (i32 0x7fff7fff)), $src)
+>;
+
+// This is really (fneg (fabs v2f16:$src))
+//
+// fabs is not reported as free because there is a modifier for it in
+// VOP3P instructions, so it is turned into the bit op.
+def : Pat <
+ (fneg (v2f16 (bitconvert (and_oneuse i32:$src, 0x7fff7fff)))),
+ (S_OR_B32 (S_MOV_B32 (i32 0x80008000)), $src) // Set sign bit
+>;
+
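+// Because each v2f16 lane keeps its sign at bit 15 of its own half, a
+// single 32-bit bit operation handles both lanes at once. A reference
+// model (illustrative only):
+//
+//   constexpr uint32_t v2f16Neg(uint32_t x)    { return x ^ 0x80008000u; }
+//   constexpr uint32_t v2f16Abs(uint32_t x)    { return x & 0x7fff7fffu; }
+//   constexpr uint32_t v2f16NegAbs(uint32_t x) { return x | 0x80008000u; }
+//   static_assert(v2f16Neg(0x3c00bc00u) == 0xbc003c00u,
+//                 "negates both halves");
+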
/********** ================== **********/
/********** Immediate Patterns **********/
/********** ================== **********/
@@ -759,27 +875,6 @@ def : Pat <
def : POW_Common <V_LOG_F32_e32, V_EXP_F32_e32, V_MUL_LEGACY_F32_e32>;
def : Pat <
- (int_AMDGPU_cube v4f32:$src),
- (REG_SEQUENCE VReg_128,
- (V_CUBETC_F32 0 /* src0_modifiers */, (f32 (EXTRACT_SUBREG $src, sub0)),
- 0 /* src1_modifiers */, (f32 (EXTRACT_SUBREG $src, sub1)),
- 0 /* src2_modifiers */, (f32 (EXTRACT_SUBREG $src, sub2)),
- 0 /* clamp */, 0 /* omod */), sub0,
- (V_CUBESC_F32 0 /* src0_modifiers */, (f32 (EXTRACT_SUBREG $src, sub0)),
- 0 /* src1_modifiers */,(f32 (EXTRACT_SUBREG $src, sub1)),
- 0 /* src2_modifiers */,(f32 (EXTRACT_SUBREG $src, sub2)),
- 0 /* clamp */, 0 /* omod */), sub1,
- (V_CUBEMA_F32 0 /* src1_modifiers */,(f32 (EXTRACT_SUBREG $src, sub0)),
- 0 /* src1_modifiers */,(f32 (EXTRACT_SUBREG $src, sub1)),
- 0 /* src1_modifiers */,(f32 (EXTRACT_SUBREG $src, sub2)),
- 0 /* clamp */, 0 /* omod */), sub2,
- (V_CUBEID_F32 0 /* src1_modifiers */,(f32 (EXTRACT_SUBREG $src, sub0)),
- 0 /* src1_modifiers */,(f32 (EXTRACT_SUBREG $src, sub1)),
- 0 /* src1_modifiers */,(f32 (EXTRACT_SUBREG $src, sub2)),
- 0 /* clamp */, 0 /* omod */), sub3)
->;
-
-def : Pat <
(i32 (sext i1:$src0)),
(V_CNDMASK_B32_e64 (i32 0), (i32 -1), $src0)
>;
@@ -985,6 +1080,11 @@ def : Pat <
//===----------------------------------------------------------------------===//
// Miscellaneous Patterns
//===----------------------------------------------------------------------===//
+def : Pat <
+ (i32 (AMDGPUfp16_zext f16:$src)),
+ (COPY $src)
+>;
+
def : Pat <
(i32 (trunc i64:$a)),
@@ -1028,24 +1128,72 @@ multiclass BFMPatterns <ValueType vt, InstSI BFM, InstSI MOV> {
defm : BFMPatterns <i32, S_BFM_B32, S_MOV_B32>;
// FIXME: defm : BFMPatterns <i64, S_BFM_B64, S_MOV_B64>;
+defm : BFEPattern <V_BFE_U32, V_BFE_I32, S_MOV_B32>;
-def : BFEPattern <V_BFE_U32, S_MOV_B32>;
+def : Pat<
+ (fcanonicalize (f16 (VOP3Mods f16:$src, i32:$src_mods))),
+ (V_MUL_F16_e64 0, (i32 CONST.FP16_ONE), $src_mods, $src, 0, 0)
+>;
def : Pat<
- (fcanonicalize f16:$src),
- (V_MUL_F16_e64 0, (i32 CONST.FP16_ONE), 0, $src, 0, 0)
+ (fcanonicalize (f32 (VOP3Mods f32:$src, i32:$src_mods))),
+ (V_MUL_F32_e64 0, (i32 CONST.FP32_ONE), $src_mods, $src, 0, 0)
>;
def : Pat<
- (fcanonicalize f32:$src),
- (V_MUL_F32_e64 0, (i32 CONST.FP32_ONE), 0, $src, 0, 0)
+ (fcanonicalize (f64 (VOP3Mods f64:$src, i32:$src_mods))),
+ (V_MUL_F64 0, CONST.FP64_ONE, $src_mods, $src, 0, 0)
>;
def : Pat<
- (fcanonicalize f64:$src),
- (V_MUL_F64 0, CONST.FP64_ONE, 0, $src, 0, 0)
+ (fcanonicalize (v2f16 (VOP3PMods v2f16:$src, i32:$src_mods))),
+ (V_PK_MUL_F16 SRCMODS.OP_SEL_1, (i32 CONST.V2FP16_ONE), $src_mods, $src, DSTCLAMP.NONE)
+>;
+
+
+// Allow integer inputs
+class ExpPattern<SDPatternOperator node, ValueType vt, Instruction Inst> : Pat<
+ (node (i8 timm:$tgt), (i8 timm:$en), vt:$src0, vt:$src1, vt:$src2, vt:$src3, (i1 timm:$compr), (i1 timm:$vm)),
+ (Inst i8:$tgt, vt:$src0, vt:$src1, vt:$src2, vt:$src3, i1:$vm, i1:$compr, i8:$en)
+>;
+
+def : ExpPattern<AMDGPUexport, i32, EXP>;
+def : ExpPattern<AMDGPUexport_done, i32, EXP_DONE>;
+
+def : Pat <
+ (v2i16 (build_vector i16:$src0, i16:$src1)),
+ (v2i16 (S_PACK_LL_B32_B16 $src0, $src1))
+>;
+
+// With multiple uses of the shift, this will duplicate the shift and
+// increase register pressure.
+def : Pat <
+ (v2i16 (build_vector i16:$src0, (i16 (trunc (srl_oneuse i32:$src1, (i32 16)))))),
+ (v2i16 (S_PACK_LH_B32_B16 i16:$src0, i32:$src1))
>;
+def : Pat <
+ (v2i16 (build_vector (i16 (trunc (srl_oneuse i32:$src0, (i32 16)))),
+ (i16 (trunc (srl_oneuse i32:$src1, (i32 16)))))),
+ (v2i16 (S_PACK_HH_B32_B16 $src0, $src1))
+>;
+
+// TODO: Should source modifiers be matched to v_pack_b32_f16?
+def : Pat <
+ (v2f16 (build_vector f16:$src0, f16:$src1)),
+ (v2f16 (S_PACK_LL_B32_B16 $src0, $src1))
+>;
+
+// def : Pat <
+// (v2f16 (scalar_to_vector f16:$src0)),
+// (COPY $src0)
+// >;
+
+// def : Pat <
+// (v2i16 (scalar_to_vector i16:$src0)),
+// (COPY $src0)
+// >;
+
//===----------------------------------------------------------------------===//
// Fract Patterns
//===----------------------------------------------------------------------===//
@@ -1083,11 +1231,39 @@ def : Pat <
// Miscellaneous Optimization Patterns
//============================================================================//
+// Undo sub x, c -> add x, -c canonicalization since c is more likely
+// an inline immediate than -c.
+// TODO: Also do for 64-bit.
+def : Pat<
+ (add i32:$src0, (i32 NegSubInlineConst32:$src1)),
+ (S_SUB_I32 $src0, NegSubInlineConst32:$src1)
+>;
+
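+// Concretely: inline immediates cover the integers -16..64, so for
+// add x, c with c in [-64, -17] the negated constant is inline while c
+// itself needs a 32-bit literal. A sketch of the predicate the ImmLeaf
+// implements (illustrative name):
+//
+//   constexpr bool isNegSubInlineConst32(int32_t c) {
+//     return c < -16 && c >= -64;
+//   }
+//   static_assert(isNegSubInlineConst32(-20),
+//                 "add x, -20 -> s_sub_i32 x, 20");
+//   static_assert(!isNegSubInlineConst32(-16), "-16 is already inline");
+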
def : SHA256MaPattern <V_BFI_B32, V_XOR_B32_e64>;
def : IntMed3Pat<V_MED3_I32, smax, smax_oneuse, smin_oneuse>;
def : IntMed3Pat<V_MED3_U32, umax, umax_oneuse, umin_oneuse>;
+// This matches 16 permutations of
+// max(min(x, y), min(max(x, y), z))
+class FPMed3Pat<ValueType vt,
+ Instruction med3Inst> : Pat<
+ (fmaxnum (fminnum_oneuse (VOP3Mods_nnan vt:$src0, i32:$src0_mods),
+ (VOP3Mods_nnan vt:$src1, i32:$src1_mods)),
+ (fminnum_oneuse (fmaxnum_oneuse (VOP3Mods_nnan vt:$src0, i32:$src0_mods),
+ (VOP3Mods_nnan vt:$src1, i32:$src1_mods)),
+ (vt (VOP3Mods_nnan vt:$src2, i32:$src2_mods)))),
+ (med3Inst $src0_mods, $src0, $src1_mods, $src1, $src2_mods, $src2, DSTCLAMP.NONE, DSTOMOD.NONE)
+>;
+
+def : FPMed3Pat<f32, V_MED3_F32>;
+
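+// For NaN-free inputs, max(min(x, y), min(max(x, y), z)) is the median
+// of the three values, which is why the sources must be nnan (NaNs break
+// the min/max identities). A reference model, for illustration:
+//
+//   constexpr float med3(float x, float y, float z) {
+//     float lo = x < y ? x : y;
+//     float hi = x < y ? y : x;
+//     float m  = hi < z ? hi : z;  // min(max(x, y), z)
+//     return lo < m ? m : lo;      // max(min(x, y), ...)
+//   }
+//   static_assert(med3(3.0f, 1.0f, 2.0f) == 2.0f, "median of three");
+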
+let Predicates = [isGFX9] in {
+def : FPMed3Pat<f16, V_MED3_F16>;
+def : IntMed3Pat<V_MED3_I16, smax, smax_oneuse, smin_oneuse, i16>;
+def : IntMed3Pat<V_MED3_U16, umax, umax_oneuse, umin_oneuse, i16>;
+} // End Predicates = [isGFX9]
+
//============================================================================//
// Assembler aliases
//============================================================================//
diff --git a/lib/Target/AMDGPU/SIIntrinsics.td b/lib/Target/AMDGPU/SIIntrinsics.td
index 5da375468713..7b7cf1635050 100644
--- a/lib/Target/AMDGPU/SIIntrinsics.td
+++ b/lib/Target/AMDGPU/SIIntrinsics.td
@@ -14,23 +14,7 @@
let TargetPrefix = "SI", isTarget = 1 in {
- def int_SI_packf16 : Intrinsic <[llvm_i32_ty], [llvm_float_ty, llvm_float_ty], [IntrNoMem]>;
-
- def int_SI_export : Intrinsic <[],
- [llvm_i32_ty, // en
- llvm_i32_ty, // vm (FIXME: should be i1)
- llvm_i32_ty, // done (FIXME: should be i1)
- llvm_i32_ty, // tgt
- llvm_i32_ty, // compr (FIXME: should be i1)
- llvm_float_ty, // src0
- llvm_float_ty, // src1
- llvm_float_ty, // src2
- llvm_float_ty], // src3
- []
- >;
-
def int_SI_load_const : Intrinsic <[llvm_float_ty], [llvm_anyint_ty, llvm_i32_ty], [IntrNoMem]>;
- def int_SI_vs_load_input : Intrinsic <[llvm_v4f32_ty], [llvm_anyint_ty, llvm_i16_ty, llvm_i32_ty], [IntrNoMem]> ;
// Fully-flexible TBUFFER_STORE_FORMAT_* except for the ADDR64 bit, which is not exposed
def int_SI_tbuffer_store : Intrinsic <
@@ -64,146 +48,4 @@ let TargetPrefix = "SI", isTarget = 1 in {
llvm_i32_ty], // tfe(imm)
[IntrReadMem, IntrArgMemOnly]>;
- def int_SI_sendmsg : Intrinsic <[], [llvm_i32_ty, llvm_i32_ty], []>;
-
- // Fully-flexible SAMPLE instruction.
- class SampleRaw : Intrinsic <
- [llvm_v4f32_ty], // vdata(VGPR)
- [llvm_anyint_ty, // vaddr(VGPR)
- llvm_v8i32_ty, // rsrc(SGPR)
- llvm_v4i32_ty, // sampler(SGPR)
- llvm_i32_ty, // dmask(imm)
- llvm_i32_ty, // unorm(imm)
- llvm_i32_ty, // r128(imm)
- llvm_i32_ty, // da(imm)
- llvm_i32_ty, // glc(imm)
- llvm_i32_ty, // slc(imm)
- llvm_i32_ty, // tfe(imm)
- llvm_i32_ty], // lwe(imm)
- [IntrNoMem]>;
-
- // Image instruction without a sampler.
- class Image : Intrinsic <
- [llvm_v4f32_ty], // vdata(VGPR)
- [llvm_anyint_ty, // vaddr(VGPR)
- llvm_v8i32_ty, // rsrc(SGPR)
- llvm_i32_ty, // dmask(imm)
- llvm_i32_ty, // unorm(imm)
- llvm_i32_ty, // r128(imm)
- llvm_i32_ty, // da(imm)
- llvm_i32_ty, // glc(imm)
- llvm_i32_ty, // slc(imm)
- llvm_i32_ty, // tfe(imm)
- llvm_i32_ty], // lwe(imm)
- [IntrNoMem]>;
-
- // Basic sample
- def int_SI_image_sample : SampleRaw;
- def int_SI_image_sample_cl : SampleRaw;
- def int_SI_image_sample_d : SampleRaw;
- def int_SI_image_sample_d_cl : SampleRaw;
- def int_SI_image_sample_l : SampleRaw;
- def int_SI_image_sample_b : SampleRaw;
- def int_SI_image_sample_b_cl : SampleRaw;
- def int_SI_image_sample_lz : SampleRaw;
- def int_SI_image_sample_cd : SampleRaw;
- def int_SI_image_sample_cd_cl : SampleRaw;
-
- // Sample with comparison
- def int_SI_image_sample_c : SampleRaw;
- def int_SI_image_sample_c_cl : SampleRaw;
- def int_SI_image_sample_c_d : SampleRaw;
- def int_SI_image_sample_c_d_cl : SampleRaw;
- def int_SI_image_sample_c_l : SampleRaw;
- def int_SI_image_sample_c_b : SampleRaw;
- def int_SI_image_sample_c_b_cl : SampleRaw;
- def int_SI_image_sample_c_lz : SampleRaw;
- def int_SI_image_sample_c_cd : SampleRaw;
- def int_SI_image_sample_c_cd_cl : SampleRaw;
-
- // Sample with offsets
- def int_SI_image_sample_o : SampleRaw;
- def int_SI_image_sample_cl_o : SampleRaw;
- def int_SI_image_sample_d_o : SampleRaw;
- def int_SI_image_sample_d_cl_o : SampleRaw;
- def int_SI_image_sample_l_o : SampleRaw;
- def int_SI_image_sample_b_o : SampleRaw;
- def int_SI_image_sample_b_cl_o : SampleRaw;
- def int_SI_image_sample_lz_o : SampleRaw;
- def int_SI_image_sample_cd_o : SampleRaw;
- def int_SI_image_sample_cd_cl_o : SampleRaw;
-
- // Sample with comparison and offsets
- def int_SI_image_sample_c_o : SampleRaw;
- def int_SI_image_sample_c_cl_o : SampleRaw;
- def int_SI_image_sample_c_d_o : SampleRaw;
- def int_SI_image_sample_c_d_cl_o : SampleRaw;
- def int_SI_image_sample_c_l_o : SampleRaw;
- def int_SI_image_sample_c_b_o : SampleRaw;
- def int_SI_image_sample_c_b_cl_o : SampleRaw;
- def int_SI_image_sample_c_lz_o : SampleRaw;
- def int_SI_image_sample_c_cd_o : SampleRaw;
- def int_SI_image_sample_c_cd_cl_o : SampleRaw;
-
- // Basic gather4
- def int_SI_gather4 : SampleRaw;
- def int_SI_gather4_cl : SampleRaw;
- def int_SI_gather4_l : SampleRaw;
- def int_SI_gather4_b : SampleRaw;
- def int_SI_gather4_b_cl : SampleRaw;
- def int_SI_gather4_lz : SampleRaw;
-
- // Gather4 with comparison
- def int_SI_gather4_c : SampleRaw;
- def int_SI_gather4_c_cl : SampleRaw;
- def int_SI_gather4_c_l : SampleRaw;
- def int_SI_gather4_c_b : SampleRaw;
- def int_SI_gather4_c_b_cl : SampleRaw;
- def int_SI_gather4_c_lz : SampleRaw;
-
- // Gather4 with offsets
- def int_SI_gather4_o : SampleRaw;
- def int_SI_gather4_cl_o : SampleRaw;
- def int_SI_gather4_l_o : SampleRaw;
- def int_SI_gather4_b_o : SampleRaw;
- def int_SI_gather4_b_cl_o : SampleRaw;
- def int_SI_gather4_lz_o : SampleRaw;
-
- // Gather4 with comparison and offsets
- def int_SI_gather4_c_o : SampleRaw;
- def int_SI_gather4_c_cl_o : SampleRaw;
- def int_SI_gather4_c_l_o : SampleRaw;
- def int_SI_gather4_c_b_o : SampleRaw;
- def int_SI_gather4_c_b_cl_o : SampleRaw;
- def int_SI_gather4_c_lz_o : SampleRaw;
-
- def int_SI_getlod : SampleRaw;
-
- // Image instrinsics.
- def int_SI_image_load : Image;
- def int_SI_image_load_mip : Image;
- def int_SI_getresinfo : Image;
-
- /* Interpolation Intrinsics */
-
- def int_SI_fs_constant : Intrinsic <[llvm_float_ty], [llvm_i32_ty, llvm_i32_ty, llvm_i32_ty], [IntrNoMem]>;
- def int_SI_fs_interp : Intrinsic <[llvm_float_ty], [llvm_i32_ty, llvm_i32_ty, llvm_i32_ty, llvm_v2i32_ty], [IntrNoMem]>;
} // End TargetPrefix = "SI", isTarget = 1
-
-let TargetPrefix = "amdgcn", isTarget = 1 in {
- // Emit 2.5 ulp, no denormal division. Should only be inserted by
- // pass based on !fpmath metadata.
- def int_amdgcn_fdiv_fast : Intrinsic<
- [llvm_float_ty], [llvm_float_ty], [IntrNoMem]
- >;
-
- /* Control flow Intrinsics */
-
- def int_amdgcn_if : Intrinsic<[llvm_i64_ty], [llvm_i1_ty, llvm_empty_ty], [IntrConvergent]>;
- def int_amdgcn_else : Intrinsic<[llvm_i64_ty], [llvm_i64_ty, llvm_empty_ty], [IntrConvergent]>;
- def int_amdgcn_break : Intrinsic<[llvm_i64_ty], [llvm_i64_ty], [IntrNoMem, IntrConvergent]>;
- def int_amdgcn_if_break : Intrinsic<[llvm_i64_ty], [llvm_i1_ty, llvm_i64_ty], [IntrNoMem, IntrConvergent]>;
- def int_amdgcn_else_break : Intrinsic<[llvm_i64_ty], [llvm_i64_ty, llvm_i64_ty], [IntrNoMem, IntrConvergent]>;
- def int_amdgcn_loop : Intrinsic<[], [llvm_i64_ty, llvm_empty_ty], [IntrConvergent]>;
- def int_amdgcn_end_cf : Intrinsic<[], [llvm_i64_ty], [IntrConvergent]>;
-}
diff --git a/lib/Target/AMDGPU/SILoadStoreOptimizer.cpp b/lib/Target/AMDGPU/SILoadStoreOptimizer.cpp
index 99fe96c0be22..933a16646746 100644
--- a/lib/Target/AMDGPU/SILoadStoreOptimizer.cpp
+++ b/lib/Target/AMDGPU/SILoadStoreOptimizer.cpp
@@ -39,15 +39,27 @@
#include "AMDGPUSubtarget.h"
#include "SIInstrInfo.h"
#include "SIRegisterInfo.h"
-#include "llvm/CodeGen/LiveIntervalAnalysis.h"
-#include "llvm/CodeGen/LiveVariables.h"
+#include "Utils/AMDGPUBaseInfo.h"
+#include "llvm/ADT/ArrayRef.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/ADT/StringRef.h"
+#include "llvm/Analysis/AliasAnalysis.h"
+#include "llvm/CodeGen/MachineBasicBlock.h"
#include "llvm/CodeGen/MachineFunction.h"
#include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/CodeGen/MachineInstr.h"
#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/MachineOperand.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/IR/DebugLoc.h"
+#include "llvm/Pass.h"
#include "llvm/Support/Debug.h"
+#include "llvm/Support/MathExtras.h"
#include "llvm/Support/raw_ostream.h"
#include "llvm/Target/TargetMachine.h"
+#include <cassert>
+#include <iterator>
+#include <utility>
using namespace llvm;
@@ -56,39 +68,36 @@ using namespace llvm;
namespace {
class SILoadStoreOptimizer : public MachineFunctionPass {
+
+  struct CombineInfo {
+ MachineBasicBlock::iterator I;
+ MachineBasicBlock::iterator Paired;
+ unsigned EltSize;
+ unsigned Offset0;
+ unsigned Offset1;
+ unsigned BaseOff;
+ bool UseST64;
+ SmallVector<MachineInstr*, 8> InstsToMove;
+  };
+
private:
- const SIInstrInfo *TII;
- const SIRegisterInfo *TRI;
- MachineRegisterInfo *MRI;
- AliasAnalysis *AA;
-
- static bool offsetsCanBeCombined(unsigned Offset0,
- unsigned Offset1,
- unsigned EltSize);
-
- MachineBasicBlock::iterator findMatchingDSInst(
- MachineBasicBlock::iterator I,
- unsigned EltSize,
- SmallVectorImpl<MachineInstr*> &InstsToMove);
-
- MachineBasicBlock::iterator mergeRead2Pair(
- MachineBasicBlock::iterator I,
- MachineBasicBlock::iterator Paired,
- unsigned EltSize,
- ArrayRef<MachineInstr*> InstsToMove);
-
- MachineBasicBlock::iterator mergeWrite2Pair(
- MachineBasicBlock::iterator I,
- MachineBasicBlock::iterator Paired,
- unsigned EltSize,
- ArrayRef<MachineInstr*> InstsToMove);
+ const SIInstrInfo *TII = nullptr;
+ const SIRegisterInfo *TRI = nullptr;
+ MachineRegisterInfo *MRI = nullptr;
+ AliasAnalysis *AA = nullptr;
+
+ static bool offsetsCanBeCombined(CombineInfo &CI);
+
+ bool findMatchingDSInst(CombineInfo &CI);
+
+ MachineBasicBlock::iterator mergeRead2Pair(CombineInfo &CI);
+
+ MachineBasicBlock::iterator mergeWrite2Pair(CombineInfo &CI);
public:
static char ID;
- SILoadStoreOptimizer()
- : MachineFunctionPass(ID), TII(nullptr), TRI(nullptr), MRI(nullptr),
- AA(nullptr) {}
+ SILoadStoreOptimizer() : MachineFunctionPass(ID) {}
SILoadStoreOptimizer(const TargetMachine &TM_) : MachineFunctionPass(ID) {
initializeSILoadStoreOptimizerPass(*PassRegistry::getPassRegistry());
@@ -108,7 +117,7 @@ public:
}
};
-} // End anonymous namespace.
+} // end anonymous namespace
INITIALIZE_PASS_BEGIN(SILoadStoreOptimizer, DEBUG_TYPE,
"SI Load / Store Optimizer", false, false)
@@ -141,11 +150,10 @@ static void addDefsToList(const MachineInstr &MI,
}
}
-static bool memAccessesCanBeReordered(
- MachineBasicBlock::iterator A,
- MachineBasicBlock::iterator B,
- const SIInstrInfo *TII,
- llvm::AliasAnalysis * AA) {
+static bool memAccessesCanBeReordered(MachineBasicBlock::iterator A,
+ MachineBasicBlock::iterator B,
+ const SIInstrInfo *TII,
+                                      AliasAnalysis *AA) {
return (TII->areMemAccessesTriviallyDisjoint(*A, *B, AA) ||
// RAW or WAR - cannot reorder
// WAW - cannot reorder
@@ -179,7 +187,6 @@ canMoveInstsAcrossMemOp(MachineInstr &MemOp,
ArrayRef<MachineInstr*> InstsToMove,
const SIInstrInfo *TII,
AliasAnalysis *AA) {
-
assert(MemOp.mayLoadOrStore());
for (MachineInstr *InstToMove : InstsToMove) {
@@ -191,47 +198,68 @@ canMoveInstsAcrossMemOp(MachineInstr &MemOp,
return true;
}
-bool SILoadStoreOptimizer::offsetsCanBeCombined(unsigned Offset0,
- unsigned Offset1,
- unsigned Size) {
+bool SILoadStoreOptimizer::offsetsCanBeCombined(CombineInfo &CI) {
// XXX - Would the same offset be OK? Is there any reason this would happen or
// be useful?
- if (Offset0 == Offset1)
+ if (CI.Offset0 == CI.Offset1)
return false;
// This won't be valid if the offset isn't aligned.
- if ((Offset0 % Size != 0) || (Offset1 % Size != 0))
+ if ((CI.Offset0 % CI.EltSize != 0) || (CI.Offset1 % CI.EltSize != 0))
return false;
- unsigned EltOffset0 = Offset0 / Size;
- unsigned EltOffset1 = Offset1 / Size;
+ unsigned EltOffset0 = CI.Offset0 / CI.EltSize;
+ unsigned EltOffset1 = CI.Offset1 / CI.EltSize;
+ CI.UseST64 = false;
+ CI.BaseOff = 0;
+
+  // Prefer the st64 form when both element offsets are multiples of 64 and
+  // fit in 8 bits after scaling, even if they would also fit unscaled.
+ if ((EltOffset0 % 64 == 0) && (EltOffset1 % 64) == 0 &&
+ isUInt<8>(EltOffset0 / 64) && isUInt<8>(EltOffset1 / 64)) {
+ CI.Offset0 = EltOffset0 / 64;
+ CI.Offset1 = EltOffset1 / 64;
+ CI.UseST64 = true;
+ return true;
+ }
// Check if the new offsets fit in the reduced 8-bit range.
- if (isUInt<8>(EltOffset0) && isUInt<8>(EltOffset1))
+ if (isUInt<8>(EltOffset0) && isUInt<8>(EltOffset1)) {
+ CI.Offset0 = EltOffset0;
+ CI.Offset1 = EltOffset1;
return true;
+ }
- // If the offset in elements doesn't fit in 8-bits, we might be able to use
- // the stride 64 versions.
- if ((EltOffset0 % 64 != 0) || (EltOffset1 % 64) != 0)
- return false;
+ // Try to shift base address to decrease offsets.
+ unsigned OffsetDiff = std::abs((int)EltOffset1 - (int)EltOffset0);
+ CI.BaseOff = std::min(CI.Offset0, CI.Offset1);
+
+ if ((OffsetDiff % 64 == 0) && isUInt<8>(OffsetDiff / 64)) {
+ CI.Offset0 = (EltOffset0 - CI.BaseOff / CI.EltSize) / 64;
+ CI.Offset1 = (EltOffset1 - CI.BaseOff / CI.EltSize) / 64;
+ CI.UseST64 = true;
+ return true;
+ }
+
+ if (isUInt<8>(OffsetDiff)) {
+ CI.Offset0 = EltOffset0 - CI.BaseOff / CI.EltSize;
+ CI.Offset1 = EltOffset1 - CI.BaseOff / CI.EltSize;
+ return true;
+ }
- return isUInt<8>(EltOffset0 / 64) && isUInt<8>(EltOffset1 / 64);
+ return false;
}
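
For reference, a minimal standalone sketch of the offset arithmetic above (plain C++ with hypothetical byte offsets, not the LLVM API): DS_READ2/DS_WRITE2 encode two 8-bit element offsets, and the st64 variants scale both by a further factor of 64.

    #include <cstdio>

    static bool fitsU8(unsigned V) { return V < 256; }

    int main() {
      // Two hypothetical DS_READ_B32 accesses at byte offsets 1024 and 1280.
      unsigned EltSize = 4;
      unsigned Elt0 = 1024 / EltSize; // 256 elements: too big for 8 bits
      unsigned Elt1 = 1280 / EltSize; // 320 elements: too big for 8 bits
      if (Elt0 % 64 == 0 && Elt1 % 64 == 0 &&
          fitsU8(Elt0 / 64) && fitsU8(Elt1 / 64))
        printf("st64 form: offset0=%u offset1=%u\n", Elt0 / 64, Elt1 / 64);
      // Prints: st64 form: offset0=4 offset1=5
      return 0;
    }
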
-MachineBasicBlock::iterator
-SILoadStoreOptimizer::findMatchingDSInst(MachineBasicBlock::iterator I,
- unsigned EltSize,
- SmallVectorImpl<MachineInstr*> &InstsToMove) {
- MachineBasicBlock::iterator E = I->getParent()->end();
- MachineBasicBlock::iterator MBBI = I;
+bool SILoadStoreOptimizer::findMatchingDSInst(CombineInfo &CI) {
+ MachineBasicBlock::iterator E = CI.I->getParent()->end();
+ MachineBasicBlock::iterator MBBI = CI.I;
++MBBI;
SmallVector<const MachineOperand *, 8> DefsToMove;
- addDefsToList(*I, DefsToMove);
+ addDefsToList(*CI.I, DefsToMove);
for ( ; MBBI != E; ++MBBI) {
-
- if (MBBI->getOpcode() != I->getOpcode()) {
+ if (MBBI->getOpcode() != CI.I->getOpcode()) {
// This is not a matching DS instruction, but we can keep looking as
// long as one of these conditions are met:
@@ -242,14 +270,14 @@ SILoadStoreOptimizer::findMatchingDSInst(MachineBasicBlock::iterator I,
if (MBBI->hasUnmodeledSideEffects())
// We can't re-order this instruction with respect to other memory
// opeations, so we fail both conditions mentioned above.
- return E;
+ return false;
if (MBBI->mayLoadOrStore() &&
- !memAccessesCanBeReordered(*I, *MBBI, TII, AA)) {
+ !memAccessesCanBeReordered(*CI.I, *MBBI, TII, AA)) {
// We fail condition #1, but we may still be able to satisfy condition
// #2. Add this instruction to the move list and then we will check
// if condition #2 holds once we have selected the matching instruction.
- InstsToMove.push_back(&*MBBI);
+ CI.InstsToMove.push_back(&*MBBI);
addDefsToList(*MBBI, DefsToMove);
continue;
}
@@ -257,13 +285,13 @@ SILoadStoreOptimizer::findMatchingDSInst(MachineBasicBlock::iterator I,
// When we match I with another DS instruction we will be moving I down
// to the location of the matched instruction any uses of I will need to
// be moved down as well.
- addToListsIfDependent(*MBBI, DefsToMove, InstsToMove);
+ addToListsIfDependent(*MBBI, DefsToMove, CI.InstsToMove);
continue;
}
// Don't merge volatiles.
if (MBBI->hasOrderedMemoryRef())
- return E;
+ return false;
// Handle a case like
// DS_WRITE_B32 addr, v, idx0
@@ -271,77 +299,67 @@ SILoadStoreOptimizer::findMatchingDSInst(MachineBasicBlock::iterator I,
// DS_WRITE_B32 addr, f(w), idx1
// where the DS_READ_B32 ends up in InstsToMove and therefore prevents
// merging of the two writes.
- if (addToListsIfDependent(*MBBI, DefsToMove, InstsToMove))
+ if (addToListsIfDependent(*MBBI, DefsToMove, CI.InstsToMove))
continue;
- int AddrIdx = AMDGPU::getNamedOperandIdx(I->getOpcode(), AMDGPU::OpName::addr);
- const MachineOperand &AddrReg0 = I->getOperand(AddrIdx);
+ int AddrIdx = AMDGPU::getNamedOperandIdx(CI.I->getOpcode(),
+ AMDGPU::OpName::addr);
+ const MachineOperand &AddrReg0 = CI.I->getOperand(AddrIdx);
const MachineOperand &AddrReg1 = MBBI->getOperand(AddrIdx);
// Check same base pointer. Be careful of subregisters, which can occur with
// vectors of pointers.
if (AddrReg0.getReg() == AddrReg1.getReg() &&
AddrReg0.getSubReg() == AddrReg1.getSubReg()) {
- int OffsetIdx = AMDGPU::getNamedOperandIdx(I->getOpcode(),
+ int OffsetIdx = AMDGPU::getNamedOperandIdx(CI.I->getOpcode(),
AMDGPU::OpName::offset);
- unsigned Offset0 = I->getOperand(OffsetIdx).getImm() & 0xffff;
- unsigned Offset1 = MBBI->getOperand(OffsetIdx).getImm() & 0xffff;
+ CI.Offset0 = CI.I->getOperand(OffsetIdx).getImm() & 0xffff;
+ CI.Offset1 = MBBI->getOperand(OffsetIdx).getImm() & 0xffff;
+ CI.Paired = MBBI;
// Check both offsets fit in the reduced range.
// We also need to go through the list of instructions that we plan to
// move and make sure they are all safe to move down past the merged
// instruction.
- if (offsetsCanBeCombined(Offset0, Offset1, EltSize) &&
- canMoveInstsAcrossMemOp(*MBBI, InstsToMove, TII, AA))
- return MBBI;
+ if (offsetsCanBeCombined(CI))
+ if (canMoveInstsAcrossMemOp(*MBBI, CI.InstsToMove, TII, AA))
+ return true;
}
// We've found a load/store that we couldn't merge for some reason.
// We could potentially keep looking, but we'd need to make sure that
// it was safe to move I and also all the instruction in InstsToMove
// down past this instruction.
- if (!memAccessesCanBeReordered(*I, *MBBI, TII, AA) || // check if we can move I across MBBI
- !canMoveInstsAcrossMemOp(*MBBI, InstsToMove, TII, AA) // check if we can move all I's users
- )
+    // Check if we can move I across MBBI and if we can move all I's users.
+ if (!memAccessesCanBeReordered(*CI.I, *MBBI, TII, AA) ||
+ !canMoveInstsAcrossMemOp(*MBBI, CI.InstsToMove, TII, AA))
break;
}
- return E;
+ return false;
}
MachineBasicBlock::iterator SILoadStoreOptimizer::mergeRead2Pair(
- MachineBasicBlock::iterator I,
- MachineBasicBlock::iterator Paired,
- unsigned EltSize,
- ArrayRef<MachineInstr*> InstsToMove) {
- MachineBasicBlock *MBB = I->getParent();
+ CombineInfo &CI) {
+ MachineBasicBlock *MBB = CI.I->getParent();
// Be careful, since the addresses could be subregisters themselves in weird
// cases, like vectors of pointers.
- const MachineOperand *AddrReg = TII->getNamedOperand(*I, AMDGPU::OpName::addr);
-
- const MachineOperand *Dest0 = TII->getNamedOperand(*I, AMDGPU::OpName::vdst);
- const MachineOperand *Dest1 = TII->getNamedOperand(*Paired, AMDGPU::OpName::vdst);
-
- unsigned Offset0
- = TII->getNamedOperand(*I, AMDGPU::OpName::offset)->getImm() & 0xffff;
- unsigned Offset1
- = TII->getNamedOperand(*Paired, AMDGPU::OpName::offset)->getImm() & 0xffff;
-
- unsigned NewOffset0 = Offset0 / EltSize;
- unsigned NewOffset1 = Offset1 / EltSize;
- unsigned Opc = (EltSize == 4) ? AMDGPU::DS_READ2_B32 : AMDGPU::DS_READ2_B64;
-
- // Prefer the st64 form if we can use it, even if we can fit the offset in the
- // non st64 version. I'm not sure if there's any real reason to do this.
- bool UseST64 = (NewOffset0 % 64 == 0) && (NewOffset1 % 64 == 0);
- if (UseST64) {
- NewOffset0 /= 64;
- NewOffset1 /= 64;
- Opc = (EltSize == 4) ? AMDGPU::DS_READ2ST64_B32 : AMDGPU::DS_READ2ST64_B64;
- }
+ const auto *AddrReg = TII->getNamedOperand(*CI.I, AMDGPU::OpName::addr);
+
+ const auto *Dest0 = TII->getNamedOperand(*CI.I, AMDGPU::OpName::vdst);
+ const auto *Dest1 = TII->getNamedOperand(*CI.Paired, AMDGPU::OpName::vdst);
+
+ unsigned NewOffset0 = CI.Offset0;
+ unsigned NewOffset1 = CI.Offset1;
+ unsigned Opc = (CI.EltSize == 4) ? AMDGPU::DS_READ2_B32
+ : AMDGPU::DS_READ2_B64;
+
+ if (CI.UseST64)
+ Opc = (CI.EltSize == 4) ? AMDGPU::DS_READ2ST64_B32
+ : AMDGPU::DS_READ2ST64_B64;
- unsigned SubRegIdx0 = (EltSize == 4) ? AMDGPU::sub0 : AMDGPU::sub0_sub1;
- unsigned SubRegIdx1 = (EltSize == 4) ? AMDGPU::sub1 : AMDGPU::sub2_sub3;
+ unsigned SubRegIdx0 = (CI.EltSize == 4) ? AMDGPU::sub0 : AMDGPU::sub0_sub1;
+ unsigned SubRegIdx1 = (CI.EltSize == 4) ? AMDGPU::sub1 : AMDGPU::sub2_sub3;
if (NewOffset0 > NewOffset1) {
// Canonicalize the merged instruction so the smaller offset comes first.
@@ -356,72 +374,70 @@ MachineBasicBlock::iterator SILoadStoreOptimizer::mergeRead2Pair(
const MCInstrDesc &Read2Desc = TII->get(Opc);
const TargetRegisterClass *SuperRC
- = (EltSize == 4) ? &AMDGPU::VReg_64RegClass : &AMDGPU::VReg_128RegClass;
+ = (CI.EltSize == 4) ? &AMDGPU::VReg_64RegClass : &AMDGPU::VReg_128RegClass;
unsigned DestReg = MRI->createVirtualRegister(SuperRC);
- DebugLoc DL = I->getDebugLoc();
- MachineInstrBuilder Read2
- = BuildMI(*MBB, Paired, DL, Read2Desc, DestReg)
- .addOperand(*AddrReg) // addr
- .addImm(NewOffset0) // offset0
- .addImm(NewOffset1) // offset1
- .addImm(0) // gds
- .addMemOperand(*I->memoperands_begin())
- .addMemOperand(*Paired->memoperands_begin());
+ DebugLoc DL = CI.I->getDebugLoc();
+
+ unsigned BaseReg = AddrReg->getReg();
+ unsigned BaseRegFlags = 0;
+ if (CI.BaseOff) {
+ BaseReg = MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass);
+ BaseRegFlags = RegState::Kill;
+ BuildMI(*MBB, CI.Paired, DL, TII->get(AMDGPU::V_ADD_I32_e32), BaseReg)
+ .addImm(CI.BaseOff)
+ .addReg(AddrReg->getReg());
+ }
+
+ MachineInstrBuilder Read2 =
+ BuildMI(*MBB, CI.Paired, DL, Read2Desc, DestReg)
+ .addReg(BaseReg, BaseRegFlags) // addr
+ .addImm(NewOffset0) // offset0
+ .addImm(NewOffset1) // offset1
+ .addImm(0) // gds
+ .setMemRefs(CI.I->mergeMemRefsWith(*CI.Paired));
+
(void)Read2;
const MCInstrDesc &CopyDesc = TII->get(TargetOpcode::COPY);
// Copy to the old destination registers.
- BuildMI(*MBB, Paired, DL, CopyDesc)
- .addOperand(*Dest0) // Copy to same destination including flags and sub reg.
- .addReg(DestReg, 0, SubRegIdx0);
- MachineInstr *Copy1 = BuildMI(*MBB, Paired, DL, CopyDesc)
- .addOperand(*Dest1)
- .addReg(DestReg, RegState::Kill, SubRegIdx1);
+ BuildMI(*MBB, CI.Paired, DL, CopyDesc)
+ .add(*Dest0) // Copy to same destination including flags and sub reg.
+ .addReg(DestReg, 0, SubRegIdx0);
+ MachineInstr *Copy1 = BuildMI(*MBB, CI.Paired, DL, CopyDesc)
+ .add(*Dest1)
+ .addReg(DestReg, RegState::Kill, SubRegIdx1);
- moveInstsAfter(Copy1, InstsToMove);
+ moveInstsAfter(Copy1, CI.InstsToMove);
- MachineBasicBlock::iterator Next = std::next(I);
- I->eraseFromParent();
- Paired->eraseFromParent();
+ MachineBasicBlock::iterator Next = std::next(CI.I);
+ CI.I->eraseFromParent();
+ CI.Paired->eraseFromParent();
DEBUG(dbgs() << "Inserted read2: " << *Read2 << '\n');
return Next;
}
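
When offsetsCanBeCombined settles on a nonzero BaseOff, the merge above first materializes addr + BaseOff with V_ADD_I32_e32 and encodes only the small per-element deltas. A standalone sketch of that arithmetic with hypothetical offsets (plain C++, not the emitted MIR):

    #include <cstdio>

    int main() {
      // Hypothetical DS_READ_B32 pair at byte offsets 4096 and 4100.
      unsigned EltSize = 4;
      unsigned Elt0 = 4096 / EltSize, Elt1 = 4100 / EltSize; // 1024, 1025
      unsigned BaseOff = 4096; // the smaller of the two byte offsets
      // Conceptually emitted:
      //   V_ADD_I32_e32 newbase, 4096, addr
      //   DS_READ2_B32  dst, newbase, offset0=0, offset1=1
      printf("offset0=%u offset1=%u\n",
             Elt0 - BaseOff / EltSize, Elt1 - BaseOff / EltSize);
      return 0;
    }
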
MachineBasicBlock::iterator SILoadStoreOptimizer::mergeWrite2Pair(
- MachineBasicBlock::iterator I,
- MachineBasicBlock::iterator Paired,
- unsigned EltSize,
- ArrayRef<MachineInstr*> InstsToMove) {
- MachineBasicBlock *MBB = I->getParent();
+ CombineInfo &CI) {
+ MachineBasicBlock *MBB = CI.I->getParent();
// Be sure to use .addOperand(), and not .addReg() with these. We want to be
// sure we preserve the subregister index and any register flags set on them.
- const MachineOperand *Addr = TII->getNamedOperand(*I, AMDGPU::OpName::addr);
- const MachineOperand *Data0 = TII->getNamedOperand(*I, AMDGPU::OpName::data0);
+ const MachineOperand *Addr = TII->getNamedOperand(*CI.I, AMDGPU::OpName::addr);
+ const MachineOperand *Data0 = TII->getNamedOperand(*CI.I, AMDGPU::OpName::data0);
const MachineOperand *Data1
- = TII->getNamedOperand(*Paired, AMDGPU::OpName::data0);
+ = TII->getNamedOperand(*CI.Paired, AMDGPU::OpName::data0);
+ unsigned NewOffset0 = CI.Offset0;
+ unsigned NewOffset1 = CI.Offset1;
+ unsigned Opc = (CI.EltSize == 4) ? AMDGPU::DS_WRITE2_B32
+ : AMDGPU::DS_WRITE2_B64;
- unsigned Offset0
- = TII->getNamedOperand(*I, AMDGPU::OpName::offset)->getImm() & 0xffff;
- unsigned Offset1
- = TII->getNamedOperand(*Paired, AMDGPU::OpName::offset)->getImm() & 0xffff;
-
- unsigned NewOffset0 = Offset0 / EltSize;
- unsigned NewOffset1 = Offset1 / EltSize;
- unsigned Opc = (EltSize == 4) ? AMDGPU::DS_WRITE2_B32 : AMDGPU::DS_WRITE2_B64;
-
- // Prefer the st64 form if we can use it, even if we can fit the offset in the
- // non st64 version. I'm not sure if there's any real reason to do this.
- bool UseST64 = (NewOffset0 % 64 == 0) && (NewOffset1 % 64 == 0);
- if (UseST64) {
- NewOffset0 /= 64;
- NewOffset1 /= 64;
- Opc = (EltSize == 4) ? AMDGPU::DS_WRITE2ST64_B32 : AMDGPU::DS_WRITE2ST64_B64;
- }
+ if (CI.UseST64)
+ Opc = (CI.EltSize == 4) ? AMDGPU::DS_WRITE2ST64_B32
+ : AMDGPU::DS_WRITE2ST64_B64;
if (NewOffset0 > NewOffset1) {
// Canonicalize the merged instruction so the smaller offset comes first.
@@ -434,24 +450,33 @@ MachineBasicBlock::iterator SILoadStoreOptimizer::mergeWrite2Pair(
"Computed offset doesn't fit");
const MCInstrDesc &Write2Desc = TII->get(Opc);
- DebugLoc DL = I->getDebugLoc();
+ DebugLoc DL = CI.I->getDebugLoc();
+
+ unsigned BaseReg = Addr->getReg();
+ unsigned BaseRegFlags = 0;
+ if (CI.BaseOff) {
+ BaseReg = MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass);
+ BaseRegFlags = RegState::Kill;
+ BuildMI(*MBB, CI.Paired, DL, TII->get(AMDGPU::V_ADD_I32_e32), BaseReg)
+ .addImm(CI.BaseOff)
+ .addReg(Addr->getReg());
+ }
- MachineInstrBuilder Write2
- = BuildMI(*MBB, Paired, DL, Write2Desc)
- .addOperand(*Addr) // addr
- .addOperand(*Data0) // data0
- .addOperand(*Data1) // data1
- .addImm(NewOffset0) // offset0
- .addImm(NewOffset1) // offset1
- .addImm(0) // gds
- .addMemOperand(*I->memoperands_begin())
- .addMemOperand(*Paired->memoperands_begin());
+ MachineInstrBuilder Write2 =
+ BuildMI(*MBB, CI.Paired, DL, Write2Desc)
+ .addReg(BaseReg, BaseRegFlags) // addr
+ .add(*Data0) // data0
+ .add(*Data1) // data1
+ .addImm(NewOffset0) // offset0
+ .addImm(NewOffset1) // offset1
+ .addImm(0) // gds
+ .setMemRefs(CI.I->mergeMemRefsWith(*CI.Paired));
- moveInstsAfter(Write2, InstsToMove);
+ moveInstsAfter(Write2, CI.InstsToMove);
- MachineBasicBlock::iterator Next = std::next(I);
- I->eraseFromParent();
- Paired->eraseFromParent();
+ MachineBasicBlock::iterator Next = std::next(CI.I);
+ CI.I->eraseFromParent();
+ CI.Paired->eraseFromParent();
DEBUG(dbgs() << "Inserted write2 inst: " << *Write2 << '\n');
return Next;
@@ -472,27 +497,24 @@ bool SILoadStoreOptimizer::optimizeBlock(MachineBasicBlock &MBB) {
continue;
}
- SmallVector<MachineInstr*, 8> InstsToMove;
+ CombineInfo CI;
+ CI.I = I;
unsigned Opc = MI.getOpcode();
if (Opc == AMDGPU::DS_READ_B32 || Opc == AMDGPU::DS_READ_B64) {
- unsigned Size = (Opc == AMDGPU::DS_READ_B64) ? 8 : 4;
- MachineBasicBlock::iterator Match = findMatchingDSInst(I, Size,
- InstsToMove);
- if (Match != E) {
+ CI.EltSize = (Opc == AMDGPU::DS_READ_B64) ? 8 : 4;
+ if (findMatchingDSInst(CI)) {
Modified = true;
- I = mergeRead2Pair(I, Match, Size, InstsToMove);
+ I = mergeRead2Pair(CI);
} else {
++I;
}
continue;
} else if (Opc == AMDGPU::DS_WRITE_B32 || Opc == AMDGPU::DS_WRITE_B64) {
- unsigned Size = (Opc == AMDGPU::DS_WRITE_B64) ? 8 : 4;
- MachineBasicBlock::iterator Match = findMatchingDSInst(I, Size,
- InstsToMove);
- if (Match != E) {
+ CI.EltSize = (Opc == AMDGPU::DS_WRITE_B64) ? 8 : 4;
+ if (findMatchingDSInst(CI)) {
Modified = true;
- I = mergeWrite2Pair(I, Match, Size, InstsToMove);
+ I = mergeWrite2Pair(CI);
} else {
++I;
}
diff --git a/lib/Target/AMDGPU/SILowerControlFlow.cpp b/lib/Target/AMDGPU/SILowerControlFlow.cpp
index 7ed18f27e591..35d3a93d8710 100644
--- a/lib/Target/AMDGPU/SILowerControlFlow.cpp
+++ b/lib/Target/AMDGPU/SILowerControlFlow.cpp
@@ -51,13 +51,23 @@
#include "AMDGPU.h"
#include "AMDGPUSubtarget.h"
#include "SIInstrInfo.h"
-#include "SIMachineFunctionInfo.h"
-#include "llvm/CodeGen/LivePhysRegs.h"
-#include "llvm/CodeGen/MachineFrameInfo.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/ADT/StringRef.h"
+#include "llvm/CodeGen/LiveIntervalAnalysis.h"
+#include "llvm/CodeGen/MachineBasicBlock.h"
#include "llvm/CodeGen/MachineFunction.h"
#include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/CodeGen/MachineInstr.h"
#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/MachineOperand.h"
+#include "llvm/CodeGen/Passes.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/CodeGen/SlotIndexes.h"
+#include "llvm/MC/MCRegisterInfo.h"
+#include "llvm/Pass.h"
+#include "llvm/Target/TargetRegisterInfo.h"
+#include <cassert>
+#include <iterator>
using namespace llvm;
@@ -67,10 +77,10 @@ namespace {
class SILowerControlFlow : public MachineFunctionPass {
private:
- const SIRegisterInfo *TRI;
- const SIInstrInfo *TII;
- LiveIntervals *LIS;
- MachineRegisterInfo *MRI;
+ const SIRegisterInfo *TRI = nullptr;
+ const SIInstrInfo *TII = nullptr;
+ LiveIntervals *LIS = nullptr;
+ MachineRegisterInfo *MRI = nullptr;
void emitIf(MachineInstr &MI);
void emitElse(MachineInstr &MI);
@@ -88,12 +98,7 @@ private:
public:
static char ID;
- SILowerControlFlow() :
- MachineFunctionPass(ID),
- TRI(nullptr),
- TII(nullptr),
- LIS(nullptr),
- MRI(nullptr) {}
+ SILowerControlFlow() : MachineFunctionPass(ID) {}
bool runOnMachineFunction(MachineFunction &MF) override;
@@ -113,7 +118,7 @@ public:
}
};
-} // End anonymous namespace
+} // end anonymous namespace
char SILowerControlFlow::ID = 0;
@@ -175,9 +180,8 @@ void SILowerControlFlow::emitIf(MachineInstr &MI) {
// Insert a pseudo terminator to help keep the verifier happy. This will also
// be used later when inserting skips.
- MachineInstr *NewBr =
- BuildMI(MBB, I, DL, TII->get(AMDGPU::SI_MASK_BRANCH))
- .addOperand(MI.getOperand(2));
+ MachineInstr *NewBr = BuildMI(MBB, I, DL, TII->get(AMDGPU::SI_MASK_BRANCH))
+ .add(MI.getOperand(2));
if (!LIS) {
MI.eraseFromParent();
@@ -220,8 +224,9 @@ void SILowerControlFlow::emitElse(MachineInstr &MI) {
// tied. In order to correctly tie the registers, split this into a copy of
// the src like it does.
unsigned CopyReg = MRI->createVirtualRegister(&AMDGPU::SReg_64RegClass);
- BuildMI(MBB, Start, DL, TII->get(AMDGPU::COPY), CopyReg)
- .addOperand(MI.getOperand(1)); // Saved EXEC
+ MachineInstr *CopyExec =
+ BuildMI(MBB, Start, DL, TII->get(AMDGPU::COPY), CopyReg)
+ .add(MI.getOperand(1)); // Saved EXEC
// This must be inserted before phis and any spill code inserted before the
// else.
@@ -262,6 +267,7 @@ void SILowerControlFlow::emitElse(MachineInstr &MI) {
LIS->RemoveMachineInstrFromMaps(MI);
MI.eraseFromParent();
+ LIS->InsertMachineInstrInMaps(*CopyExec);
LIS->InsertMachineInstrInMaps(*OrSaveExec);
LIS->InsertMachineInstrInMaps(*Xor);
@@ -283,10 +289,9 @@ void SILowerControlFlow::emitBreak(MachineInstr &MI) {
const DebugLoc &DL = MI.getDebugLoc();
unsigned Dst = MI.getOperand(0).getReg();
- MachineInstr *Or =
- BuildMI(MBB, &MI, DL, TII->get(AMDGPU::S_OR_B64), Dst)
- .addReg(AMDGPU::EXEC)
- .addOperand(MI.getOperand(1));
+ MachineInstr *Or = BuildMI(MBB, &MI, DL, TII->get(AMDGPU::S_OR_B64), Dst)
+ .addReg(AMDGPU::EXEC)
+ .add(MI.getOperand(1));
if (LIS)
LIS->ReplaceMachineInstrInMaps(MI, *Or);
@@ -306,13 +311,13 @@ void SILowerControlFlow::emitLoop(MachineInstr &MI) {
const DebugLoc &DL = MI.getDebugLoc();
MachineInstr *AndN2 =
- BuildMI(MBB, &MI, DL, TII->get(AMDGPU::S_ANDN2_B64_term), AMDGPU::EXEC)
- .addReg(AMDGPU::EXEC)
- .addOperand(MI.getOperand(0));
+ BuildMI(MBB, &MI, DL, TII->get(AMDGPU::S_ANDN2_B64_term), AMDGPU::EXEC)
+ .addReg(AMDGPU::EXEC)
+ .add(MI.getOperand(0));
MachineInstr *Branch =
- BuildMI(MBB, &MI, DL, TII->get(AMDGPU::S_CBRANCH_EXECNZ))
- .addOperand(MI.getOperand(1));
+ BuildMI(MBB, &MI, DL, TII->get(AMDGPU::S_CBRANCH_EXECNZ))
+ .add(MI.getOperand(1));
if (LIS) {
LIS->ReplaceMachineInstrInMaps(MI, *AndN2);
@@ -328,9 +333,9 @@ void SILowerControlFlow::emitEndCf(MachineInstr &MI) {
MachineBasicBlock::iterator InsPt = MBB.begin();
MachineInstr *NewMI =
- BuildMI(MBB, InsPt, DL, TII->get(AMDGPU::S_OR_B64), AMDGPU::EXEC)
- .addReg(AMDGPU::EXEC)
- .addOperand(MI.getOperand(0));
+ BuildMI(MBB, InsPt, DL, TII->get(AMDGPU::S_OR_B64), AMDGPU::EXEC)
+ .addReg(AMDGPU::EXEC)
+ .add(MI.getOperand(0));
if (LIS)
LIS->ReplaceMachineInstrInMaps(MI, *NewMI);
diff --git a/lib/Target/AMDGPU/SILowerI1Copies.cpp b/lib/Target/AMDGPU/SILowerI1Copies.cpp
index be2e14fd4623..3680e02da576 100644
--- a/lib/Target/AMDGPU/SILowerI1Copies.cpp
+++ b/lib/Target/AMDGPU/SILowerI1Copies.cpp
@@ -114,18 +114,18 @@ bool SILowerI1Copies::runOnMachineFunction(MachineFunction &MF) {
assert(Val == 0 || Val == -1);
BuildMI(MBB, &MI, DL, TII->get(AMDGPU::V_MOV_B32_e32))
- .addOperand(Dst)
- .addImm(Val);
+ .add(Dst)
+ .addImm(Val);
MI.eraseFromParent();
continue;
}
}
BuildMI(MBB, &MI, DL, TII->get(AMDGPU::V_CNDMASK_B32_e64))
- .addOperand(Dst)
- .addImm(0)
- .addImm(-1)
- .addOperand(Src);
+ .add(Dst)
+ .addImm(0)
+ .addImm(-1)
+ .add(Src);
MI.eraseFromParent();
} else if (TRI->getCommonSubClass(DstRC, &AMDGPU::SGPR_64RegClass) &&
SrcRC == &AMDGPU::VReg_1RegClass) {
@@ -140,14 +140,14 @@ bool SILowerI1Copies::runOnMachineFunction(MachineFunction &MF) {
MRI.getRegClass(DefInst->getOperand(3).getReg()),
&AMDGPU::SGPR_64RegClass)) {
BuildMI(MBB, &MI, DL, TII->get(AMDGPU::S_AND_B64))
- .addOperand(Dst)
- .addReg(AMDGPU::EXEC)
- .addOperand(DefInst->getOperand(3));
+ .add(Dst)
+ .addReg(AMDGPU::EXEC)
+ .add(DefInst->getOperand(3));
} else {
BuildMI(MBB, &MI, DL, TII->get(AMDGPU::V_CMP_NE_U32_e64))
- .addOperand(Dst)
- .addOperand(Src)
- .addImm(0);
+ .add(Dst)
+ .add(Src)
+ .addImm(0);
}
MI.eraseFromParent();
}
diff --git a/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp b/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp
index ecd46b95ca6f..8e612d2ddfda 100644
--- a/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp
+++ b/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp
@@ -20,12 +20,6 @@
using namespace llvm;
-static cl::opt<bool> EnableSpillSGPRToVGPR(
- "amdgpu-spill-sgpr-to-vgpr",
- cl::desc("Enable spilling VGPRs to SGPRs"),
- cl::ReallyHidden,
- cl::init(true));
-
SIMachineFunctionInfo::SIMachineFunctionInfo(const MachineFunction &MF)
: AMDGPUMachineFunction(MF),
TIDReg(AMDGPU::NoRegister),
@@ -47,13 +41,13 @@ SIMachineFunctionInfo::SIMachineFunctionInfo(const MachineFunction &MF)
WorkGroupInfoSystemSGPR(AMDGPU::NoRegister),
PrivateSegmentWaveByteOffsetSystemSGPR(AMDGPU::NoRegister),
PSInputAddr(0),
+ PSInputEnable(0),
ReturnsVoid(true),
FlatWorkGroupSizes(0, 0),
WavesPerEU(0, 0),
DebuggerWorkGroupIDStackObjectIndices({{0, 0, 0}}),
DebuggerWorkItemIDStackObjectIndices({{0, 0, 0}}),
LDSWaveSpillSize(0),
- PSInputEna(0),
NumUserSGPRs(0),
NumSystemSGPRs(0),
HasSpilledSGPRs(false),
@@ -81,34 +75,48 @@ SIMachineFunctionInfo::SIMachineFunctionInfo(const MachineFunction &MF)
PrivateMemoryInputPtr(false) {
const SISubtarget &ST = MF.getSubtarget<SISubtarget>();
const Function *F = MF.getFunction();
+ FlatWorkGroupSizes = ST.getFlatWorkGroupSizes(*F);
+ WavesPerEU = ST.getWavesPerEU(*F);
- PSInputAddr = AMDGPU::getInitialPSInputAddr(*F);
+ // Non-entry functions have no special inputs for now.
+ // TODO: Return early for non-entry CCs.
- const MachineFrameInfo &FrameInfo = MF.getFrameInfo();
+ CallingConv::ID CC = F->getCallingConv();
+ if (CC == CallingConv::AMDGPU_PS)
+ PSInputAddr = AMDGPU::getInitialPSInputAddr(*F);
- if (!AMDGPU::isShader(F->getCallingConv())) {
+ if (AMDGPU::isKernel(CC)) {
KernargSegmentPtr = true;
WorkGroupIDX = true;
WorkItemIDX = true;
}
- if (F->hasFnAttribute("amdgpu-work-group-id-y") || ST.debuggerEmitPrologue())
+ if (ST.debuggerEmitPrologue()) {
+ // Enable everything.
WorkGroupIDY = true;
-
- if (F->hasFnAttribute("amdgpu-work-group-id-z") || ST.debuggerEmitPrologue())
WorkGroupIDZ = true;
-
- if (F->hasFnAttribute("amdgpu-work-item-id-y") || ST.debuggerEmitPrologue())
WorkItemIDY = true;
-
- if (F->hasFnAttribute("amdgpu-work-item-id-z") || ST.debuggerEmitPrologue())
WorkItemIDZ = true;
+ } else {
+ if (F->hasFnAttribute("amdgpu-work-group-id-y"))
+ WorkGroupIDY = true;
+
+ if (F->hasFnAttribute("amdgpu-work-group-id-z"))
+ WorkGroupIDZ = true;
+
+ if (F->hasFnAttribute("amdgpu-work-item-id-y"))
+ WorkItemIDY = true;
+
+ if (F->hasFnAttribute("amdgpu-work-item-id-z"))
+ WorkItemIDZ = true;
+ }
// X, XY, and XYZ are the only supported combinations, so make sure Y is
// enabled if Z is.
if (WorkItemIDZ)
WorkItemIDY = true;
+ const MachineFrameInfo &FrameInfo = MF.getFrameInfo();
bool MaySpill = ST.isVGPRSpillingEnabled(*F);
bool HasStackObjects = FrameInfo.hasStackObjects();
@@ -135,12 +143,8 @@ SIMachineFunctionInfo::SIMachineFunctionInfo(const MachineFunction &MF)
// We don't need to worry about accessing spills with flat instructions.
// TODO: On VI where we must use flat for global, we should be able to omit
// this if it is never used for generic access.
- if (HasStackObjects && ST.getGeneration() >= SISubtarget::SEA_ISLANDS &&
- ST.isAmdHsaOS())
+ if (HasStackObjects && ST.hasFlatAddressSpace() && ST.isAmdHsaOS())
FlatScratchInit = true;
-
- FlatWorkGroupSizes = ST.getFlatWorkGroupSizes(*F);
- WavesPerEU = ST.getWavesPerEU(*F);
}
unsigned SIMachineFunctionInfo::addPrivateSegmentBuffer(
@@ -193,45 +197,60 @@ unsigned SIMachineFunctionInfo::addPrivateMemoryPtr(const SIRegisterInfo &TRI) {
return PrivateMemoryPtrUserSGPR;
}
-SIMachineFunctionInfo::SpilledReg SIMachineFunctionInfo::getSpilledReg (
- MachineFunction *MF,
- unsigned FrameIndex,
- unsigned SubIdx) {
- if (!EnableSpillSGPRToVGPR)
- return SpilledReg();
-
- const SISubtarget &ST = MF->getSubtarget<SISubtarget>();
- const SIRegisterInfo *TRI = ST.getRegisterInfo();
-
- MachineFrameInfo &FrameInfo = MF->getFrameInfo();
- MachineRegisterInfo &MRI = MF->getRegInfo();
- int64_t Offset = FrameInfo.getObjectOffset(FrameIndex);
- Offset += SubIdx * 4;
-
- unsigned LaneVGPRIdx = Offset / (64 * 4);
- unsigned Lane = (Offset / 4) % 64;
-
- struct SpilledReg Spill;
- Spill.Lane = Lane;
-
- if (!LaneVGPRs.count(LaneVGPRIdx)) {
- unsigned LaneVGPR = TRI->findUnusedRegister(MRI, &AMDGPU::VGPR_32RegClass,
- *MF);
+/// Reserve a slice of a VGPR to support spilling for FrameIndex \p FI.
+bool SIMachineFunctionInfo::allocateSGPRSpillToVGPR(MachineFunction &MF,
+ int FI) {
+ std::vector<SpilledReg> &SpillLanes = SGPRToVGPRSpills[FI];
- if (LaneVGPR == AMDGPU::NoRegister)
- // We have no VGPRs left for spilling SGPRs.
- return Spill;
+ // This has already been allocated.
+ if (!SpillLanes.empty())
+ return true;
- LaneVGPRs[LaneVGPRIdx] = LaneVGPR;
-
- // Add this register as live-in to all blocks to avoid machine verifer
- // complaining about use of an undefined physical register.
- for (MachineFunction::iterator BI = MF->begin(), BE = MF->end();
- BI != BE; ++BI) {
- BI->addLiveIn(LaneVGPR);
+ const SISubtarget &ST = MF.getSubtarget<SISubtarget>();
+ const SIRegisterInfo *TRI = ST.getRegisterInfo();
+ MachineFrameInfo &FrameInfo = MF.getFrameInfo();
+ MachineRegisterInfo &MRI = MF.getRegInfo();
+ unsigned WaveSize = ST.getWavefrontSize();
+
+ unsigned Size = FrameInfo.getObjectSize(FI);
+ assert(Size >= 4 && Size <= 64 && "invalid sgpr spill size");
+ assert(TRI->spillSGPRToVGPR() && "not spilling SGPRs to VGPRs");
+
+ int NumLanes = Size / 4;
+
+ // Make sure to handle the case where a wide SGPR spill may span between two
+ // VGPRs.
+ for (int I = 0; I < NumLanes; ++I, ++NumVGPRSpillLanes) {
+ unsigned LaneVGPR;
+ unsigned VGPRIndex = (NumVGPRSpillLanes % WaveSize);
+
+ if (VGPRIndex == 0) {
+ LaneVGPR = TRI->findUnusedRegister(MRI, &AMDGPU::VGPR_32RegClass, MF);
+ if (LaneVGPR == AMDGPU::NoRegister) {
+ // We have no VGPRs left for spilling SGPRs. Reset because we won't
+ // partially spill the SGPR to VGPRs.
+ SGPRToVGPRSpills.erase(FI);
+ NumVGPRSpillLanes -= I;
+ return false;
+ }
+
+ SpillVGPRs.push_back(LaneVGPR);
+
+      // Add this register as live-in to all blocks to avoid machine verifier
+ // complaining about use of an undefined physical register.
+ for (MachineBasicBlock &BB : MF)
+ BB.addLiveIn(LaneVGPR);
+ } else {
+ LaneVGPR = SpillVGPRs.back();
}
+
+ SpillLanes.push_back(SpilledReg(LaneVGPR, VGPRIndex));
}
- Spill.VGPR = LaneVGPRs[LaneVGPRIdx];
- return Spill;
+ return true;
+}
+
+void SIMachineFunctionInfo::removeSGPRToVGPRFrameIndices(MachineFrameInfo &MFI) {
+ for (auto &R : SGPRToVGPRSpills)
+ MFI.RemoveStackObject(R.first);
}
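
The lane bookkeeping in allocateSGPRSpillToVGPR packs one 32-bit SGPR subregister per VGPR lane and allocates a fresh VGPR whenever the running lane counter wraps at the wavefront size, which is how a wide spill ends up spanning two VGPRs. A standalone toy with hypothetical numbers:

    #include <cstdio>

    int main() {
      const unsigned WaveSize = 64;    // lanes per VGPR
      unsigned NumVGPRSpillLanes = 62; // lanes already in use (hypothetical)
      unsigned Size = 16;              // an SGPR_128 spill: four 32-bit parts
      for (unsigned I = 0, E = Size / 4; I < E; ++I, ++NumVGPRSpillLanes) {
        unsigned VGPRIndex = NumVGPRSpillLanes % WaveSize;
        if (VGPRIndex == 0)
          printf("(allocate a fresh VGPR)\n");
        printf("part %u -> lane %u\n", I, VGPRIndex);
      }
      // Parts 0-1 land in lanes 62-63 of the current VGPR; parts 2-3 wrap
      // into lanes 0-1 of a newly allocated one.
      return 0;
    }
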
diff --git a/lib/Target/AMDGPU/SIMachineFunctionInfo.h b/lib/Target/AMDGPU/SIMachineFunctionInfo.h
index 6fc8d18bceba..a84f3e274f82 100644
--- a/lib/Target/AMDGPU/SIMachineFunctionInfo.h
+++ b/lib/Target/AMDGPU/SIMachineFunctionInfo.h
@@ -16,13 +16,17 @@
#include "AMDGPUMachineFunction.h"
#include "SIRegisterInfo.h"
+#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
+#include "llvm/CodeGen/PseudoSourceValue.h"
+#include "llvm/MC/MCRegisterInfo.h"
+#include "llvm/Support/ErrorHandling.h"
#include <array>
+#include <cassert>
#include <map>
+#include <utility>
namespace llvm {
-class MachineRegisterInfo;
-
class AMDGPUImagePseudoSourceValue : public PseudoSourceValue {
public:
explicit AMDGPUImagePseudoSourceValue() :
@@ -109,6 +113,8 @@ class SIMachineFunctionInfo final : public AMDGPUMachineFunction {
// Graphics info.
unsigned PSInputAddr;
+ unsigned PSInputEnable;
+
bool ReturnsVoid;
// A pair of default/requested minimum/maximum flat work group sizes.
@@ -130,8 +136,6 @@ class SIMachineFunctionInfo final : public AMDGPUMachineFunction {
public:
// FIXME: Make private
unsigned LDSWaveSpillSize;
- unsigned PSInputEna;
- std::map<unsigned, unsigned> LaneVGPRs;
unsigned ScratchOffsetReg;
unsigned NumUserSGPRs;
unsigned NumSystemSGPRs;
@@ -182,19 +186,39 @@ private:
public:
struct SpilledReg {
- unsigned VGPR;
- int Lane;
+ unsigned VGPR = AMDGPU::NoRegister;
+ int Lane = -1;
+
+ SpilledReg() = default;
SpilledReg(unsigned R, int L) : VGPR (R), Lane (L) { }
- SpilledReg() : VGPR(AMDGPU::NoRegister), Lane(-1) { }
+
bool hasLane() { return Lane != -1;}
bool hasReg() { return VGPR != AMDGPU::NoRegister;}
};
- // SIMachineFunctionInfo definition
+private:
+ // SGPR->VGPR spilling support.
+ typedef std::pair<unsigned, unsigned> SpillRegMask;
+
+  // Track the VGPR and lane index for each subregister of an SGPR spilled
+  // to the frame index key.
+ DenseMap<int, std::vector<SpilledReg>> SGPRToVGPRSpills;
+ unsigned NumVGPRSpillLanes = 0;
+ SmallVector<unsigned, 2> SpillVGPRs;
+
+public:
SIMachineFunctionInfo(const MachineFunction &MF);
- SpilledReg getSpilledReg(MachineFunction *MF, unsigned FrameIndex,
- unsigned SubIdx);
+
+ ArrayRef<SpilledReg> getSGPRToVGPRSpills(int FrameIndex) const {
+ auto I = SGPRToVGPRSpills.find(FrameIndex);
+ return (I == SGPRToVGPRSpills.end()) ?
+ ArrayRef<SpilledReg>() : makeArrayRef(I->second);
+ }
+
+ bool allocateSGPRSpillToVGPR(MachineFunction &MF, int FI);
+ void removeSGPRToVGPRFrameIndices(MachineFrameInfo &MFI);
+
bool hasCalculatedTID() const { return TIDReg != AMDGPU::NoRegister; };
unsigned getTIDReg() const { return TIDReg; };
void setTIDReg(unsigned Reg) { TIDReg = Reg; }
@@ -399,6 +423,10 @@ public:
return PSInputAddr;
}
+ unsigned getPSInputEnable() const {
+ return PSInputEnable;
+ }
+
bool isPSInputAllocated(unsigned Index) const {
return PSInputAddr & (1 << Index);
}
@@ -407,6 +435,10 @@ public:
PSInputAddr |= 1 << Index;
}
+ void markPSInputEnabled(unsigned Index) {
+ PSInputEnable |= 1 << Index;
+ }
+
bool returnsVoid() const {
return ReturnsVoid;
}
@@ -512,6 +544,6 @@ public:
}
};
-} // End namespace llvm
+} // end namespace llvm
-#endif
+#endif // LLVM_LIB_TARGET_AMDGPU_SIMACHINEFUNCTIONINFO_H
diff --git a/lib/Target/AMDGPU/SIMachineScheduler.cpp b/lib/Target/AMDGPU/SIMachineScheduler.cpp
index da86bbf9dd2a..9d4e677400e6 100644
--- a/lib/Target/AMDGPU/SIMachineScheduler.cpp
+++ b/lib/Target/AMDGPU/SIMachineScheduler.cpp
@@ -539,21 +539,30 @@ void SIScheduleBlock::addPred(SIScheduleBlock *Pred) {
Preds.push_back(Pred);
assert(none_of(Succs,
- [=](SIScheduleBlock *S) { return PredID == S->getID(); }) &&
+ [=](std::pair<SIScheduleBlock*,
+ SIScheduleBlockLinkKind> S) {
+ return PredID == S.first->getID();
+ }) &&
"Loop in the Block Graph!");
}
-void SIScheduleBlock::addSucc(SIScheduleBlock *Succ) {
+void SIScheduleBlock::addSucc(SIScheduleBlock *Succ,
+ SIScheduleBlockLinkKind Kind) {
unsigned SuccID = Succ->getID();
// Check if not already predecessor.
- for (SIScheduleBlock* S : Succs) {
- if (SuccID == S->getID())
+ for (std::pair<SIScheduleBlock*, SIScheduleBlockLinkKind> &S : Succs) {
+ if (SuccID == S.first->getID()) {
+ if (S.second == SIScheduleBlockLinkKind::NoData &&
+ Kind == SIScheduleBlockLinkKind::Data)
+ S.second = Kind;
return;
+ }
}
if (Succ->isHighLatencyBlock())
++NumHighLatencySuccessors;
- Succs.push_back(Succ);
+ Succs.push_back(std::make_pair(Succ, Kind));
+
assert(none_of(Preds,
[=](SIScheduleBlock *P) { return SuccID == P->getID(); }) &&
"Loop in the Block Graph!");
@@ -573,8 +582,10 @@ void SIScheduleBlock::printDebug(bool full) {
}
dbgs() << "\nSuccessors:\n";
- for (SIScheduleBlock* S : Succs) {
- S->printDebug(false);
+ for (std::pair<SIScheduleBlock*, SIScheduleBlockLinkKind> S : Succs) {
+ if (S.second == SIScheduleBlockLinkKind::Data)
+ dbgs() << "(Data Dep) ";
+ S.first->printDebug(false);
}
if (Scheduled) {
@@ -651,11 +662,21 @@ void SIScheduleBlockCreator::colorHighLatenciesAlone() {
}
}
+static bool
+hasDataDependencyPred(const SUnit &SU, const SUnit &FromSU) {
+ for (const auto &PredDep : SU.Preds) {
+ if (PredDep.getSUnit() == &FromSU &&
+ PredDep.getKind() == llvm::SDep::Data)
+ return true;
+ }
+ return false;
+}
+
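
The helper above distinguishes data edges from order-only edges when deciding whether two high latency instructions may share a block. A self-contained toy of the same predicate, using stand-in types instead of llvm::SDep:

    #include <cstdio>
    #include <vector>

    enum DepKind { Order, Data };
    struct Edge { int FromSU; DepKind Kind; }; // stand-in for llvm::SDep

    static bool hasDataDepPred(const std::vector<Edge> &Preds, int FromSU) {
      for (const Edge &E : Preds)
        if (E.FromSU == FromSU && E.Kind == Data)
          return true;
      return false;
    }

    int main() {
      // An SU ordered after SU 3 that only consumes data produced by SU 5.
      std::vector<Edge> Preds = {{3, Order}, {5, Data}};
      printf("%d %d\n", hasDataDepPred(Preds, 3), hasDataDepPred(Preds, 5));
      // Prints: 0 1 -- grouping with SU 3 is allowed, with SU 5 it is not.
      return 0;
    }
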
void SIScheduleBlockCreator::colorHighLatenciesGroups() {
unsigned DAGSize = DAG->SUnits.size();
unsigned NumHighLatencies = 0;
unsigned GroupSize;
- unsigned Color = NextReservedID;
+ int Color = NextReservedID;
unsigned Count = 0;
std::set<unsigned> FormingGroup;
@@ -675,35 +696,102 @@ void SIScheduleBlockCreator::colorHighLatenciesGroups() {
else
GroupSize = 4;
- for (unsigned i = 0, e = DAGSize; i != e; ++i) {
- SUnit *SU = &DAG->SUnits[i];
- if (DAG->IsHighLatencySU[SU->NodeNum]) {
+ for (unsigned SUNum : DAG->TopDownIndex2SU) {
+ const SUnit &SU = DAG->SUnits[SUNum];
+ if (DAG->IsHighLatencySU[SU.NodeNum]) {
unsigned CompatibleGroup = true;
- unsigned ProposedColor = Color;
+ int ProposedColor = Color;
+ std::vector<int> AdditionalElements;
+
+      // We don't want to put two high latency instructions that depend
+      // on each other in the same block. One way would be to check
+      // canAddEdge in both directions, but that currently is not enough
+      // because the graph enforces the high latency order (via links).
+      // Instead, look at the dependencies between the high latency
+      // instructions and deduce whether there is a data dependency.
for (unsigned j : FormingGroup) {
- // TODO: Currently CompatibleGroup will always be false,
- // because the graph enforces the load order. This
- // can be fixed, but as keeping the load order is often
- // good for performance that causes a performance hit (both
- // the default scheduler and this scheduler).
- // When this scheduler determines a good load order,
- // this can be fixed.
- if (!DAG->canAddEdge(SU, &DAG->SUnits[j]) ||
- !DAG->canAddEdge(&DAG->SUnits[j], SU))
+ bool HasSubGraph;
+ std::vector<int> SubGraph;
+          // By construction (topological order), if SU and
+          // DAG->SUnits[j] are linked, DAG->SUnits[j] is necessarily
+          // in the parent graph of SU.
+#ifndef NDEBUG
+ SubGraph = DAG->GetTopo()->GetSubGraph(SU, DAG->SUnits[j],
+ HasSubGraph);
+ assert(!HasSubGraph);
+#endif
+ SubGraph = DAG->GetTopo()->GetSubGraph(DAG->SUnits[j], SU,
+ HasSubGraph);
+ if (!HasSubGraph)
+ continue; // No dependencies between each other
+ else if (SubGraph.size() > 5) {
+ // Too many elements would be required to be added to the block.
CompatibleGroup = false;
+ break;
+          } else {
+ // Check the type of dependency
+ for (unsigned k : SubGraph) {
+ // If in the path to join the two instructions,
+ // there is another high latency instruction,
+ // or instructions colored for another block
+ // abort the merge.
+ if (DAG->IsHighLatencySU[k] ||
+ (CurrentColoring[k] != ProposedColor &&
+ CurrentColoring[k] != 0)) {
+ CompatibleGroup = false;
+ break;
+ }
+          // If one of the SUs in the subgraph depends on the result of SU j,
+          // there will be a data dependency.
+ if (hasDataDependencyPred(DAG->SUnits[k], DAG->SUnits[j])) {
+ CompatibleGroup = false;
+ break;
+ }
+ }
+ if (!CompatibleGroup)
+ break;
+ // Same check for the SU
+ if (hasDataDependencyPred(SU, DAG->SUnits[j])) {
+ CompatibleGroup = false;
+ break;
+ }
+          // Add all the required instructions to the block. These cannot
+          // live in another block, because they depend (order dependency)
+          // on one of the instructions in the block, and they are required
+          // for the high latency instruction we add.
+ AdditionalElements.insert(AdditionalElements.end(),
+ SubGraph.begin(), SubGraph.end());
+ }
}
- if (!CompatibleGroup || ++Count == GroupSize) {
+ if (CompatibleGroup) {
+ FormingGroup.insert(SU.NodeNum);
+ for (unsigned j : AdditionalElements)
+ CurrentColoring[j] = ProposedColor;
+ CurrentColoring[SU.NodeNum] = ProposedColor;
+ ++Count;
+ }
+      // If we found an incompatible instruction, or if the group is
+      // full, start a new one.
+ if (!CompatibleGroup) {
FormingGroup.clear();
Color = ++NextReservedID;
- if (!CompatibleGroup) {
- ProposedColor = Color;
- FormingGroup.insert(SU->NodeNum);
- }
+ ProposedColor = Color;
+ FormingGroup.insert(SU.NodeNum);
+ CurrentColoring[SU.NodeNum] = ProposedColor;
+ Count = 0;
+ } else if (Count == GroupSize) {
+ FormingGroup.clear();
+ Color = ++NextReservedID;
+ ProposedColor = Color;
Count = 0;
- } else {
- FormingGroup.insert(SU->NodeNum);
}
- CurrentColoring[SU->NodeNum] = ProposedColor;
}
}
}
@@ -835,6 +923,17 @@ void SIScheduleBlockCreator::colorEndsAccordingToDependencies() {
unsigned DAGSize = DAG->SUnits.size();
std::vector<int> PendingColoring = CurrentColoring;
+ assert(DAGSize >= 1 &&
+ CurrentBottomUpReservedDependencyColoring.size() == DAGSize &&
+ CurrentTopDownReservedDependencyColoring.size() == DAGSize);
+ // If there is no reserved block at all, do nothing. We don't want
+ // everything in one block.
+ if (*std::max_element(CurrentBottomUpReservedDependencyColoring.begin(),
+ CurrentBottomUpReservedDependencyColoring.end()) == 0 &&
+ *std::max_element(CurrentTopDownReservedDependencyColoring.begin(),
+ CurrentTopDownReservedDependencyColoring.end()) == 0)
+ return;
+
for (unsigned SUNum : DAG->BottomUpIndex2SU) {
SUnit *SU = &DAG->SUnits[SUNum];
std::set<unsigned> SUColors;
@@ -856,6 +955,9 @@ void SIScheduleBlockCreator::colorEndsAccordingToDependencies() {
SUColors.insert(CurrentColoring[Succ->NodeNum]);
SUColorsPending.insert(PendingColoring[Succ->NodeNum]);
}
+      // If there is only one child/parent block, and that block is not
+      // among the ones we are removing in this path, then merge the
+      // instruction into that block.
if (SUColors.size() == 1 && SUColorsPending.size() == 1)
PendingColoring[SU->NodeNum] = *SUColors.begin();
else // TODO: Attribute new colors depending on color
@@ -974,12 +1076,7 @@ void SIScheduleBlockCreator::colorMergeIfPossibleSmallGroupsToNextGroup() {
for (unsigned SUNum : DAG->BottomUpIndex2SU) {
SUnit *SU = &DAG->SUnits[SUNum];
unsigned color = CurrentColoring[SU->NodeNum];
- std::map<unsigned, unsigned>::iterator Pos = ColorCount.find(color);
- if (Pos != ColorCount.end()) {
- ++ColorCount[color];
- } else {
- ColorCount[color] = 1;
- }
+ ++ColorCount[color];
}
for (unsigned SUNum : DAG->BottomUpIndex2SU) {
@@ -1087,7 +1184,8 @@ void SIScheduleBlockCreator::createBlocksForVariant(SISchedulerBlockCreatorVaria
if (SuccDep.isWeak() || Succ->NodeNum >= DAGSize)
continue;
if (Node2CurrentBlock[Succ->NodeNum] != SUID)
- CurrentBlocks[SUID]->addSucc(CurrentBlocks[Node2CurrentBlock[Succ->NodeNum]]);
+ CurrentBlocks[SUID]->addSucc(CurrentBlocks[Node2CurrentBlock[Succ->NodeNum]],
+ SuccDep.isCtrl() ? NoData : Data);
}
for (SDep& PredDep : SU->Preds) {
SUnit *Pred = PredDep.getSUnit();
@@ -1281,10 +1379,8 @@ void SIScheduleBlockCreator::fillStats() {
Block->Height = 0;
else {
unsigned Height = 0;
- for (SIScheduleBlock *Succ : Block->getSuccs()) {
- if (Height < Succ->Height + 1)
- Height = Succ->Height + 1;
- }
+ for (const auto &Succ : Block->getSuccs())
+        Height = std::max(Height, Succ.first->Height + 1);
Block->Height = Height;
}
}
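
The Height recurrence above takes a maximum over successors: a block with no successors has Height 0, and every other block is one taller than its tallest successor. A standalone toy over a hypothetical three-block chain, processed in reverse topological order so every successor already has its final Height:

    #include <algorithm>
    #include <cstdio>
    #include <vector>

    int main() {
      // Blocks 0 -> 1 -> 2.
      std::vector<std::vector<int>> Succs = {{1}, {2}, {}};
      std::vector<unsigned> Height(3, 0);
      for (int B = 2; B >= 0; --B) {
        unsigned H = 0;
        for (int S : Succs[B])
          H = std::max(H, Height[S] + 1);
        Height[B] = H;
      }
      printf("heights: %u %u %u\n", Height[0], Height[1], Height[2]);
      // Prints: heights: 2 1 0
      return 0;
    }
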
@@ -1331,13 +1427,7 @@ SIScheduleBlockScheduler::SIScheduleBlockScheduler(SIScheduleDAGMI *DAG,
continue;
int PredID = BlocksStruct.TopDownIndex2Block[topoInd];
- std::map<unsigned, unsigned>::iterator RegPos =
- LiveOutRegsNumUsages[PredID].find(Reg);
- if (RegPos != LiveOutRegsNumUsages[PredID].end()) {
- ++LiveOutRegsNumUsages[PredID][Reg];
- } else {
- LiveOutRegsNumUsages[PredID][Reg] = 1;
- }
+ ++LiveOutRegsNumUsages[PredID][Reg];
}
}
@@ -1361,6 +1451,24 @@ SIScheduleBlockScheduler::SIScheduleBlockScheduler(SIScheduleDAGMI *DAG,
std::set<unsigned> InRegs = DAG->getInRegs();
addLiveRegs(InRegs);
+ // Increase LiveOutRegsNumUsages for blocks
+ // producing registers consumed in another
+ // scheduling region.
+ for (unsigned Reg : DAG->getOutRegs()) {
+ for (unsigned i = 0, e = Blocks.size(); i != e; ++i) {
+ // Do reverse traversal
+ int ID = BlocksStruct.TopDownIndex2Block[Blocks.size()-1-i];
+ SIScheduleBlock *Block = Blocks[ID];
+ const std::set<unsigned> &OutRegs = Block->getOutRegs();
+
+ if (OutRegs.find(Reg) == OutRegs.end())
+ continue;
+
+ ++LiveOutRegsNumUsages[ID][Reg];
+ break;
+ }
+ }
+
// Fill LiveRegsConsumers for regs that were already
// defined before scheduling.
for (unsigned i = 0, e = Blocks.size(); i != e; ++i) {
@@ -1377,12 +1485,8 @@ SIScheduleBlockScheduler::SIScheduleBlockScheduler(SIScheduleDAGMI *DAG,
}
}
- if (!Found) {
- if (LiveRegsConsumers.find(Reg) == LiveRegsConsumers.end())
- LiveRegsConsumers[Reg] = 1;
- else
- ++LiveRegsConsumers[Reg];
- }
+ if (!Found)
+ ++LiveRegsConsumers[Reg];
}
}
@@ -1403,6 +1507,7 @@ SIScheduleBlockScheduler::SIScheduleBlockScheduler(SIScheduleDAGMI *DAG,
for (SIScheduleBlock* Block : BlocksScheduled) {
dbgs() << ' ' << Block->getID();
}
+ dbgs() << '\n';
);
}
@@ -1464,8 +1569,8 @@ SIScheduleBlock *SIScheduleBlockScheduler::pickBlock() {
VregCurrentUsage, SregCurrentUsage);
if (VregCurrentUsage > maxVregUsage)
maxVregUsage = VregCurrentUsage;
- if (VregCurrentUsage > maxSregUsage)
- maxSregUsage = VregCurrentUsage;
+ if (SregCurrentUsage > maxSregUsage)
+ maxSregUsage = SregCurrentUsage;
DEBUG(
dbgs() << "Picking New Blocks\n";
dbgs() << "Available: ";
@@ -1556,17 +1661,13 @@ void SIScheduleBlockScheduler::decreaseLiveRegs(SIScheduleBlock *Block,
}
void SIScheduleBlockScheduler::releaseBlockSuccs(SIScheduleBlock *Parent) {
- for (SIScheduleBlock* Block : Parent->getSuccs()) {
- --BlockNumPredsLeft[Block->getID()];
- if (BlockNumPredsLeft[Block->getID()] == 0) {
- ReadyBlocks.push_back(Block);
- }
- // TODO: Improve check. When the dependency between the high latency
- // instructions and the instructions of the other blocks are WAR or WAW
- // there will be no wait triggered. We would like these cases to not
- // update LastPosHighLatencyParentScheduled.
- if (Parent->isHighLatencyBlock())
- LastPosHighLatencyParentScheduled[Block->getID()] = NumBlockScheduled;
+ for (const auto &Block : Parent->getSuccs()) {
+ if (--BlockNumPredsLeft[Block.first->getID()] == 0)
+ ReadyBlocks.push_back(Block.first);
+
+ if (Parent->isHighLatencyBlock() &&
+ Block.second == SIScheduleBlockLinkKind::Data)
+ LastPosHighLatencyParentScheduled[Block.first->getID()] = NumBlockScheduled;
}
}
@@ -1578,12 +1679,10 @@ void SIScheduleBlockScheduler::blockScheduled(SIScheduleBlock *Block) {
LiveOutRegsNumUsages[Block->getID()].begin(),
E = LiveOutRegsNumUsages[Block->getID()].end(); RegI != E; ++RegI) {
std::pair<unsigned, unsigned> RegP = *RegI;
- if (LiveRegsConsumers.find(RegP.first) == LiveRegsConsumers.end())
- LiveRegsConsumers[RegP.first] = RegP.second;
- else {
- assert(LiveRegsConsumers[RegP.first] == 0);
- LiveRegsConsumers[RegP.first] += RegP.second;
- }
+ // We produce this register, thus it must not be previously alive.
+ assert(LiveRegsConsumers.find(RegP.first) == LiveRegsConsumers.end() ||
+ LiveRegsConsumers[RegP.first] == 0);
+ LiveRegsConsumers[RegP.first] += RegP.second;
}
if (LastPosHighLatencyParentScheduled[Block->getID()] >
(unsigned)LastPosWaitedHighLatency)
@@ -1825,7 +1924,9 @@ void SIScheduleDAGMI::schedule()
// if VGPR usage is extremely high, try other good performing variants
// which could lead to lower VGPR usage
if (Best.MaxVGPRUsage > 180) {
- std::vector<std::pair<SISchedulerBlockCreatorVariant, SISchedulerBlockSchedulerVariant>> Variants = {
+ static const std::pair<SISchedulerBlockCreatorVariant,
+ SISchedulerBlockSchedulerVariant>
+ Variants[] = {
{ LatenciesAlone, BlockRegUsageLatency },
// { LatenciesAlone, BlockRegUsage },
{ LatenciesGrouped, BlockLatencyRegUsage },
@@ -1844,7 +1945,9 @@ void SIScheduleDAGMI::schedule()
// if VGPR usage is still extremely high, we may spill. Try other variants
// which are less performing, but that could lead to lower VGPR usage.
if (Best.MaxVGPRUsage > 200) {
- std::vector<std::pair<SISchedulerBlockCreatorVariant, SISchedulerBlockSchedulerVariant>> Variants = {
+ static const std::pair<SISchedulerBlockCreatorVariant,
+ SISchedulerBlockSchedulerVariant>
+ Variants[] = {
// { LatenciesAlone, BlockRegUsageLatency },
{ LatenciesAlone, BlockRegUsage },
// { LatenciesGrouped, BlockLatencyRegUsage },
diff --git a/lib/Target/AMDGPU/SIMachineScheduler.h b/lib/Target/AMDGPU/SIMachineScheduler.h
index 77c07350d325..122d0f67ca8c 100644
--- a/lib/Target/AMDGPU/SIMachineScheduler.h
+++ b/lib/Target/AMDGPU/SIMachineScheduler.h
@@ -40,13 +40,12 @@ enum SIScheduleCandReason {
struct SISchedulerCandidate {
// The reason for this candidate.
- SIScheduleCandReason Reason;
+ SIScheduleCandReason Reason = NoCand;
// Set of reasons that apply to multiple candidates.
- uint32_t RepeatReasonSet;
+ uint32_t RepeatReasonSet = 0;
- SISchedulerCandidate()
- : Reason(NoCand), RepeatReasonSet(0) {}
+ SISchedulerCandidate() = default;
bool isRepeat(SIScheduleCandReason R) { return RepeatReasonSet & (1 << R); }
void setRepeat(SIScheduleCandReason R) { RepeatReasonSet |= (1 << R); }
@@ -55,6 +54,11 @@ struct SISchedulerCandidate {
class SIScheduleDAGMI;
class SIScheduleBlockCreator;
+enum SIScheduleBlockLinkKind {
+ NoData,
+ Data
+};
+
class SIScheduleBlock {
SIScheduleDAGMI *DAG;
SIScheduleBlockCreator *BC;
@@ -84,8 +88,8 @@ class SIScheduleBlock {
std::set<unsigned> LiveInRegs;
std::set<unsigned> LiveOutRegs;
- bool Scheduled;
- bool HighLatencyBlock;
+ bool Scheduled = false;
+ bool HighLatencyBlock = false;
std::vector<unsigned> HasLowLatencyNonWaitedParent;
@@ -93,14 +97,14 @@ class SIScheduleBlock {
unsigned ID;
std::vector<SIScheduleBlock*> Preds; // All blocks predecessors.
- std::vector<SIScheduleBlock*> Succs; // All blocks successors.
- unsigned NumHighLatencySuccessors;
+ // All blocks successors, and the kind of link
+ std::vector<std::pair<SIScheduleBlock*, SIScheduleBlockLinkKind>> Succs;
+ unsigned NumHighLatencySuccessors = 0;
public:
SIScheduleBlock(SIScheduleDAGMI *DAG, SIScheduleBlockCreator *BC,
unsigned ID):
- DAG(DAG), BC(BC), TopRPTracker(TopPressure), Scheduled(false),
- HighLatencyBlock(false), ID(ID), NumHighLatencySuccessors(0) {}
+ DAG(DAG), BC(BC), TopRPTracker(TopPressure), ID(ID) {}
~SIScheduleBlock() = default;
@@ -114,10 +118,11 @@ public:
// Add block pred, which has instruction predecessor of SU.
void addPred(SIScheduleBlock *Pred);
- void addSucc(SIScheduleBlock *Succ);
+ void addSucc(SIScheduleBlock *Succ, SIScheduleBlockLinkKind Kind);
const std::vector<SIScheduleBlock*>& getPreds() const { return Preds; }
- const std::vector<SIScheduleBlock*>& getSuccs() const { return Succs; }
+ ArrayRef<std::pair<SIScheduleBlock*, SIScheduleBlockLinkKind>>
+ getSuccs() const { return Succs; }
unsigned Height; // Maximum topdown path length to block without outputs
unsigned Depth; // Maximum bottomup path length to block without inputs
@@ -213,9 +218,9 @@ struct SIScheduleBlocks {
};
enum SISchedulerBlockCreatorVariant {
- LatenciesAlone,
- LatenciesGrouped,
- LatenciesAlonePlusConsecutive
+ LatenciesAlone,
+ LatenciesGrouped,
+ LatenciesAlonePlusConsecutive
};
class SIScheduleBlockCreator {
@@ -451,6 +456,7 @@ public:
LiveIntervals *getLIS() { return LIS; }
MachineRegisterInfo *getMRI() { return &MRI; }
const TargetRegisterInfo *getTRI() { return TRI; }
+ ScheduleDAGTopologicalSort *GetTopo() { return &Topo; }
SUnit& getEntrySU() { return EntrySU; }
SUnit& getExitSU() { return ExitSU; }
@@ -469,6 +475,14 @@ public:
return InRegs;
}
+ std::set<unsigned> getOutRegs() {
+ std::set<unsigned> OutRegs;
+ for (const auto &RegMaskPair : RPTracker.getPressure().LiveOutRegs) {
+ OutRegs.insert(RegMaskPair.RegUnit);
+ }
+ return OutRegs;
+  }
+
unsigned getVGPRSetID() const { return VGPRSetID; }
unsigned getSGPRSetID() const { return SGPRSetID; }
diff --git a/lib/Target/AMDGPU/SIPeepholeSDWA.cpp b/lib/Target/AMDGPU/SIPeepholeSDWA.cpp
new file mode 100644
index 000000000000..e02c2e3240e8
--- /dev/null
+++ b/lib/Target/AMDGPU/SIPeepholeSDWA.cpp
@@ -0,0 +1,713 @@
+//===-- SIPeepholeSDWA.cpp - Peephole optimization for SDWA instructions --===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+/// \file This pass tries to apply several peephole SDWA patterns.
+///
+/// E.g. original:
+/// V_LSHRREV_B32_e32 %vreg0, 16, %vreg1
+/// V_ADD_I32_e32 %vreg2, %vreg0, %vreg3
+/// V_LSHLREV_B32_e32 %vreg4, 16, %vreg2
+///
+/// Replaced with:
+/// V_ADD_I32_sdwa %vreg4, %vreg1, %vreg3
+/// dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+///
+//===----------------------------------------------------------------------===//
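The header example folds the shift-add-shift sequence into a single SDWA add: src0_sel:WORD_1 reads the high word of %vreg1, src1_sel:DWORD reads all of %vreg3, and dst_sel:WORD_1 with dst_unused:UNUSED_PAD writes the low word of the sum into the high word of the result, zeroing the rest. A minimal standalone sketch of that equivalence with illustrative values (not part of the pass itself):

    #include <cassert>
    #include <cstdint>

    int main() {
      uint32_t V1 = 0xDEADBEEF, V3 = 0x00001234;

      // Original: v0 = v1 >> 16; v2 = v0 + v3; v4 = v2 << 16
      uint32_t V0 = V1 >> 16;
      uint32_t V2 = V0 + V3;
      uint32_t V4Orig = V2 << 16;

      // SDWA form: add the high word of v1 to all of v3, then place the low
      // word of the sum into the high word of the destination, padding with 0.
      uint32_t Sum = (V1 >> 16) + V3;
      uint32_t V4Sdwa = (Sum & 0xFFFF) << 16;

      assert(V4Orig == V4Sdwa);
      return 0;
    }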
+
+
+#include "AMDGPU.h"
+#include "AMDGPUSubtarget.h"
+#include "SIDefines.h"
+#include "SIInstrInfo.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/ADT/STLExtras.h"
+#include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include <unordered_map>
+
+using namespace llvm;
+
+#define DEBUG_TYPE "si-peephole-sdwa"
+
+STATISTIC(NumSDWAPatternsFound, "Number of SDWA patterns found.");
+STATISTIC(NumSDWAInstructionsPeepholed,
+ "Number of instruction converted to SDWA.");
+
+namespace {
+
+class SDWAOperand;
+
+class SIPeepholeSDWA : public MachineFunctionPass {
+private:
+ MachineRegisterInfo *MRI;
+ const SIRegisterInfo *TRI;
+ const SIInstrInfo *TII;
+
+ std::unordered_map<MachineInstr *, std::unique_ptr<SDWAOperand>> SDWAOperands;
+
+ Optional<int64_t> foldToImm(const MachineOperand &Op) const;
+
+public:
+ static char ID;
+
+ typedef SmallVector<std::unique_ptr<SDWAOperand>, 4> SDWAOperandsVector;
+
+ SIPeepholeSDWA() : MachineFunctionPass(ID) {
+ initializeSIPeepholeSDWAPass(*PassRegistry::getPassRegistry());
+ }
+
+ bool runOnMachineFunction(MachineFunction &MF) override;
+ void matchSDWAOperands(MachineFunction &MF);
+ bool convertToSDWA(MachineInstr &MI, const SDWAOperandsVector &SDWAOperands);
+
+ StringRef getPassName() const override { return "SI Peephole SDWA"; }
+
+ void getAnalysisUsage(AnalysisUsage &AU) const override {
+ AU.setPreservesCFG();
+ MachineFunctionPass::getAnalysisUsage(AU);
+ }
+};
+
+class SDWAOperand {
+private:
+ MachineOperand *Target; // Operand that would be used in the converted instruction
+ MachineOperand *Replaced; // Operand that would be replaced by Target
+
+public:
+ SDWAOperand(MachineOperand *TargetOp, MachineOperand *ReplacedOp)
+ : Target(TargetOp), Replaced(ReplacedOp) {
+ assert(Target->isReg());
+ assert(Replaced->isReg());
+ }
+
+ virtual ~SDWAOperand() {}
+
+ virtual MachineInstr *potentialToConvert(const SIInstrInfo *TII) = 0;
+ virtual bool convertToSDWA(MachineInstr &MI, const SIInstrInfo *TII) = 0;
+
+ MachineOperand *getTargetOperand() const { return Target; }
+ MachineOperand *getReplacedOperand() const { return Replaced; }
+ MachineInstr *getParentInst() const { return Target->getParent(); }
+ MachineRegisterInfo *getMRI() const {
+ return &getParentInst()->getParent()->getParent()->getRegInfo();
+ }
+};
+
+using namespace AMDGPU::SDWA;
+
+class SDWASrcOperand : public SDWAOperand {
+private:
+ SdwaSel SrcSel;
+ bool Abs;
+ bool Neg;
+ bool Sext;
+
+public:
+ SDWASrcOperand(MachineOperand *TargetOp, MachineOperand *ReplacedOp,
+ SdwaSel SrcSel_ = DWORD, bool Abs_ = false, bool Neg_ = false,
+ bool Sext_ = false)
+ : SDWAOperand(TargetOp, ReplacedOp), SrcSel(SrcSel_), Abs(Abs_),
+ Neg(Neg_), Sext(Sext_) {}
+
+ virtual MachineInstr *potentialToConvert(const SIInstrInfo *TII) override;
+ virtual bool convertToSDWA(MachineInstr &MI, const SIInstrInfo *TII) override;
+
+ SdwaSel getSrcSel() const { return SrcSel; }
+ bool getAbs() const { return Abs; }
+ bool getNeg() const { return Neg; }
+ bool getSext() const { return Sext; }
+
+ uint64_t getSrcMods() const;
+};
+
+class SDWADstOperand : public SDWAOperand {
+private:
+ SdwaSel DstSel;
+ DstUnused DstUn;
+
+public:
+ SDWADstOperand(MachineOperand *TargetOp, MachineOperand *ReplacedOp,
+ SdwaSel DstSel_ = DWORD, DstUnused DstUn_ = UNUSED_PAD)
+ : SDWAOperand(TargetOp, ReplacedOp), DstSel(DstSel_), DstUn(DstUn_) {}
+
+ virtual MachineInstr *potentialToConvert(const SIInstrInfo *TII) override;
+ virtual bool convertToSDWA(MachineInstr &MI, const SIInstrInfo *TII) override;
+
+ SdwaSel getDstSel() const { return DstSel; }
+ DstUnused getDstUnused() const { return DstUn; }
+};
+
+} // End anonymous namespace.
+
+INITIALIZE_PASS(SIPeepholeSDWA, DEBUG_TYPE, "SI Peephole SDWA", false, false)
+
+char SIPeepholeSDWA::ID = 0;
+
+char &llvm::SIPeepholeSDWAID = SIPeepholeSDWA::ID;
+
+FunctionPass *llvm::createSIPeepholeSDWAPass() {
+ return new SIPeepholeSDWA();
+}
+
+#ifndef NDEBUG
+
+static raw_ostream& operator<<(raw_ostream &OS, const SdwaSel &Sel) {
+ switch(Sel) {
+ case BYTE_0: OS << "BYTE_0"; break;
+ case BYTE_1: OS << "BYTE_1"; break;
+ case BYTE_2: OS << "BYTE_2"; break;
+ case BYTE_3: OS << "BYTE_3"; break;
+ case WORD_0: OS << "WORD_0"; break;
+ case WORD_1: OS << "WORD_1"; break;
+ case DWORD: OS << "DWORD"; break;
+ }
+ return OS;
+}
+
+static raw_ostream& operator<<(raw_ostream &OS, const DstUnused &Un) {
+ switch(Un) {
+ case UNUSED_PAD: OS << "UNUSED_PAD"; break;
+ case UNUSED_SEXT: OS << "UNUSED_SEXT"; break;
+ case UNUSED_PRESERVE: OS << "UNUSED_PRESERVE"; break;
+ }
+ return OS;
+}
+
+static raw_ostream& operator<<(raw_ostream &OS, const SDWASrcOperand &Src) {
+ OS << "SDWA src: " << *Src.getTargetOperand()
+ << " src_sel:" << Src.getSrcSel()
+ << " abs:" << Src.getAbs() << " neg:" << Src.getNeg()
+ << " sext:" << Src.getSext() << '\n';
+ return OS;
+}
+
+static raw_ostream& operator<<(raw_ostream &OS, const SDWADstOperand &Dst) {
+ OS << "SDWA dst: " << *Dst.getTargetOperand()
+ << " dst_sel:" << Dst.getDstSel()
+ << " dst_unused:" << Dst.getDstUnused() << '\n';
+ return OS;
+}
+
+#endif
+
+static void copyRegOperand(MachineOperand &To, const MachineOperand &From) {
+ assert(To.isReg() && From.isReg());
+ To.setReg(From.getReg());
+ To.setSubReg(From.getSubReg());
+ To.setIsUndef(From.isUndef());
+ if (To.isUse()) {
+ To.setIsKill(From.isKill());
+ } else {
+ To.setIsDead(From.isDead());
+ }
+}
+
+static bool isSameReg(const MachineOperand &LHS, const MachineOperand &RHS) {
+ return LHS.isReg() &&
+ RHS.isReg() &&
+ LHS.getReg() == RHS.getReg() &&
+ LHS.getSubReg() == RHS.getSubReg();
+}
+
+static bool isSubregOf(const MachineOperand &SubReg,
+ const MachineOperand &SuperReg,
+ const TargetRegisterInfo *TRI) {
+
+ if (!SuperReg.isReg() || !SubReg.isReg())
+ return false;
+
+ if (isSameReg(SuperReg, SubReg))
+ return true;
+
+ if (SuperReg.getReg() != SubReg.getReg())
+ return false;
+
+ LaneBitmask SuperMask = TRI->getSubRegIndexLaneMask(SuperReg.getSubReg());
+ LaneBitmask SubMask = TRI->getSubRegIndexLaneMask(SubReg.getSubReg());
+ SuperMask |= ~SubMask;
+ return SuperMask.all();
+}
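The OR-with-complement trick above is a lane-mask subset test: SubReg lies within SuperReg exactly when every lane of the sub-register's mask is also set in the super-register's mask. A standalone sketch with plain integer masks (lane values are illustrative, assuming the usual LaneBitmask semantics):

    #include <cassert>
    #include <cstdint>

    // (SuperMask | ~SubMask) being all-ones is the subset test
    // (SubMask & ~SuperMask) == 0 in disguise.
    static bool isLaneSubset(uint64_t SubMask, uint64_t SuperMask) {
      return (SuperMask | ~SubMask) == ~0ull;
    }

    int main() {
      // Illustrative masks: sub0 covers lane 0, sub0_sub1 covers lanes 0-1.
      assert(isLaneSubset(0b01, 0b11));  // sub0 lies inside sub0_sub1
      assert(!isLaneSubset(0b11, 0b01)); // the pair does not fit in sub0
      return 0;
    }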
+
+uint64_t SDWASrcOperand::getSrcMods() const {
+ uint64_t Mods = 0;
+ if (Abs || Neg) {
+ assert(!Sext &&
+ "Float and integer src modifiers can't be set simulteniously");
+ Mods |= Abs ? SISrcMods::ABS : 0;
+ Mods |= Neg ? SISrcMods::NEG : 0;
+ } else if (Sext) {
+ Mods |= SISrcMods::SEXT;
+ }
+
+ return Mods;
+}
+
+MachineInstr *SDWASrcOperand::potentialToConvert(const SIInstrInfo *TII) {
+ // For an SDWA src operand, the potential instruction is the one that uses
+ // the register defined by the parent instruction
+ MachineRegisterInfo *MRI = getMRI();
+ MachineOperand *Replaced = getReplacedOperand();
+ assert(Replaced->isReg());
+
+ MachineInstr *PotentialMI = nullptr;
+ for (MachineOperand &PotentialMO : MRI->use_operands(Replaced->getReg())) {
+ // If this is a use of another subreg of the dst reg then do nothing
+ if (!isSubregOf(*Replaced, PotentialMO, MRI->getTargetRegisterInfo()))
+ continue;
+
+ // If there exists a use of a superreg of dst then we should not combine
+ // this operand
+ if (!isSameReg(PotentialMO, *Replaced))
+ return nullptr;
+
+ // Check that PotentialMI is the only instruction that uses the dst reg
+ if (PotentialMI == nullptr) {
+ PotentialMI = PotentialMO.getParent();
+ } else if (PotentialMI != PotentialMO.getParent()) {
+ return nullptr;
+ }
+ }
+
+ return PotentialMI;
+}
+
+bool SDWASrcOperand::convertToSDWA(MachineInstr &MI, const SIInstrInfo *TII) {
+ // Find the operand in the instruction that matches the source operand and
+ // replace it with the target operand. Set the corresponding src_sel
+
+ MachineOperand *Src = TII->getNamedOperand(MI, AMDGPU::OpName::src0);
+ MachineOperand *SrcSel = TII->getNamedOperand(MI, AMDGPU::OpName::src0_sel);
+ MachineOperand *SrcMods =
+ TII->getNamedOperand(MI, AMDGPU::OpName::src0_modifiers);
+ assert(Src && Src->isReg());
+ if (!isSameReg(*Src, *getReplacedOperand())) {
+ // If this is not src0 then it should be src1
+ Src = TII->getNamedOperand(MI, AMDGPU::OpName::src1);
+ SrcSel = TII->getNamedOperand(MI, AMDGPU::OpName::src1_sel);
+ SrcMods = TII->getNamedOperand(MI, AMDGPU::OpName::src1_modifiers);
+
+ assert(Src && Src->isReg());
+
+ if ((MI.getOpcode() == AMDGPU::V_MAC_F16_sdwa ||
+ MI.getOpcode() == AMDGPU::V_MAC_F32_sdwa) &&
+ !isSameReg(*Src, *getReplacedOperand())) {
+ // In case of v_mac_f16/32_sdwa this pass can try to apply src operand to
+ // src2. This is not allowed.
+ return false;
+ }
+
+ assert(isSameReg(*Src, *getReplacedOperand()) && SrcSel && SrcMods);
+ }
+ copyRegOperand(*Src, *getTargetOperand());
+ SrcSel->setImm(getSrcSel());
+ SrcMods->setImm(getSrcMods());
+ getTargetOperand()->setIsKill(false);
+ return true;
+}
+
+MachineInstr *SDWADstOperand::potentialToConvert(const SIInstrInfo *TII) {
+ // For an SDWA dst operand, the potential instruction is the one that defines
+ // the register that this operand uses
+ MachineRegisterInfo *MRI = getMRI();
+ MachineInstr *ParentMI = getParentInst();
+ MachineOperand *Replaced = getReplacedOperand();
+ assert(Replaced->isReg());
+
+ for (MachineOperand &PotentialMO : MRI->def_operands(Replaced->getReg())) {
+ if (!isSubregOf(*Replaced, PotentialMO, MRI->getTargetRegisterInfo()))
+ continue;
+
+ if (!isSameReg(*Replaced, PotentialMO))
+ return nullptr;
+
+ // Check that ParentMI is the only instruction that uses the replaced register
+ for (MachineOperand &UseMO : MRI->use_operands(PotentialMO.getReg())) {
+ if (isSubregOf(UseMO, PotentialMO, MRI->getTargetRegisterInfo()) &&
+ UseMO.getParent() != ParentMI) {
+ return nullptr;
+ }
+ }
+
+ // Due to SSA this should be the only def of the replaced register, so
+ // return it
+ return PotentialMO.getParent();
+ }
+
+ return nullptr;
+}
+
+bool SDWADstOperand::convertToSDWA(MachineInstr &MI, const SIInstrInfo *TII) {
+ // Replace vdst operand in MI with target operand. Set dst_sel and dst_unused
+
+ if ((MI.getOpcode() == AMDGPU::V_MAC_F16_sdwa ||
+ MI.getOpcode() == AMDGPU::V_MAC_F32_sdwa) &&
+ getDstSel() != AMDGPU::SDWA::DWORD) {
+ // v_mac_f16/32_sdwa allows dst_sel to be DWORD only
+ return false;
+ }
+
+ MachineOperand *Operand = TII->getNamedOperand(MI, AMDGPU::OpName::vdst);
+ assert(Operand &&
+ Operand->isReg() &&
+ isSameReg(*Operand, *getReplacedOperand()));
+ copyRegOperand(*Operand, *getTargetOperand());
+ MachineOperand *DstSel = TII->getNamedOperand(MI, AMDGPU::OpName::dst_sel);
+ assert(DstSel);
+ DstSel->setImm(getDstSel());
+ MachineOperand *DstUnused = TII->getNamedOperand(MI, AMDGPU::OpName::dst_unused);
+ assert(DstUnused);
+ DstUnused->setImm(getDstUnused());
+
+ // Remove the original instruction because its register definition would
+ // conflict with the new instruction
+ getParentInst()->eraseFromParent();
+ return true;
+}
+
+Optional<int64_t> SIPeepholeSDWA::foldToImm(const MachineOperand &Op) const {
+ if (Op.isImm()) {
+ return Op.getImm();
+ }
+
+ // If this is not an immediate then it can be a copy of an immediate
+ // value, e.g.:
+ // %vreg1<def> = S_MOV_B32 255;
+ if (Op.isReg()) {
+ for (const MachineOperand &Def : MRI->def_operands(Op.getReg())) {
+ if (!isSameReg(Op, Def))
+ continue;
+
+ const MachineInstr *DefInst = Def.getParent();
+ if (!TII->isFoldableCopy(*DefInst))
+ return None;
+
+ const MachineOperand &Copied = DefInst->getOperand(1);
+ if (!Copied.isImm())
+ return None;
+
+ return Copied.getImm();
+ }
+ }
+
+ return None;
+}
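foldToImm accepts two shapes: a literal immediate, and a register whose single SSA def is a foldable copy of an immediate (e.g. S_MOV_B32 255). A toy model of that logic with simplified stand-ins for MachineOperand/MachineInstr (all names illustrative):

    #include <cassert>
    #include <cstdint>
    #include <optional>

    struct ToyInst;

    struct ToyOperand {
      bool IsImm = false;
      int64_t Imm = 0;
      const ToyInst *Def = nullptr; // defining instruction if a register
    };

    struct ToyInst {
      bool IsFoldableCopy = false;
      ToyOperand Src; // operand 1 of the copy
    };

    static std::optional<int64_t> foldToImm(const ToyOperand &Op) {
      if (Op.IsImm)
        return Op.Imm;                              // shape 1: literal
      if (Op.Def && Op.Def->IsFoldableCopy && Op.Def->Src.IsImm)
        return Op.Def->Src.Imm;                     // shape 2: copy of literal
      return std::nullopt;
    }

    int main() {
      ToyInst Mov{/*IsFoldableCopy=*/true, ToyOperand{true, 255, nullptr}};
      ToyOperand Reg{false, 0, &Mov};               // %vreg1 = S_MOV_B32 255
      assert(*foldToImm(Reg) == 255);
      return 0;
    }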
+
+void SIPeepholeSDWA::matchSDWAOperands(MachineFunction &MF) {
+ for (MachineBasicBlock &MBB : MF) {
+ for (MachineInstr &MI : MBB) {
+ unsigned Opcode = MI.getOpcode();
+ switch (Opcode) {
+ case AMDGPU::V_LSHRREV_B32_e32:
+ case AMDGPU::V_ASHRREV_I32_e32:
+ case AMDGPU::V_LSHLREV_B32_e32: {
+ // from: v_lshrrev_b32_e32 v1, 16/24, v0
+ // to SDWA src:v0 src_sel:WORD_1/BYTE_3
+
+ // from: v_ashrrev_i32_e32 v1, 16/24, v0
+ // to SDWA src:v0 src_sel:WORD_1/BYTE_3 sext:1
+
+ // from: v_lshlrev_b32_e32 v1, 16/24, v0
+ // to SDWA dst:v1 dst_sel:WORD_1/BYTE_3 dst_unused:UNUSED_PAD
+ MachineOperand *Src0 = TII->getNamedOperand(MI, AMDGPU::OpName::src0);
+ auto Imm = foldToImm(*Src0);
+ if (!Imm)
+ break;
+
+ if (*Imm != 16 && *Imm != 24)
+ break;
+
+ MachineOperand *Src1 = TII->getNamedOperand(MI, AMDGPU::OpName::src1);
+ MachineOperand *Dst = TII->getNamedOperand(MI, AMDGPU::OpName::vdst);
+ if (TRI->isPhysicalRegister(Src1->getReg()) ||
+ TRI->isPhysicalRegister(Dst->getReg()))
+ break;
+
+ if (Opcode == AMDGPU::V_LSHLREV_B32_e32) {
+ auto SDWADst = make_unique<SDWADstOperand>(
+ Dst, Src1, *Imm == 16 ? WORD_1 : BYTE_3, UNUSED_PAD);
+ DEBUG(dbgs() << "Match: " << MI << "To: " << *SDWADst << '\n');
+ SDWAOperands[&MI] = std::move(SDWADst);
+ ++NumSDWAPatternsFound;
+ } else {
+ auto SDWASrc = make_unique<SDWASrcOperand>(
+ Src1, Dst, *Imm == 16 ? WORD_1 : BYTE_3, false, false,
+ Opcode != AMDGPU::V_LSHRREV_B32_e32);
+ DEBUG(dbgs() << "Match: " << MI << "To: " << *SDWASrc << '\n');
+ SDWAOperands[&MI] = std::move(SDWASrc);
+ ++NumSDWAPatternsFound;
+ }
+ break;
+ }
+
+ case AMDGPU::V_LSHRREV_B16_e32:
+ case AMDGPU::V_ASHRREV_I16_e32:
+ case AMDGPU::V_LSHLREV_B16_e32: {
+ // from: v_lshrrev_b16_e32 v1, 8, v0
+ // to SDWA src:v0 src_sel:BYTE_1
+
+ // from: v_ashrrev_i16_e32 v1, 8, v0
+ // to SDWA src:v0 src_sel:BYTE_1 sext:1
+
+ // from: v_lshlrev_b16_e32 v1, 8, v0
+ // to SDWA dst:v1 dst_sel:BYTE_1 dst_unused:UNUSED_PAD
+ MachineOperand *Src0 = TII->getNamedOperand(MI, AMDGPU::OpName::src0);
+ auto Imm = foldToImm(*Src0);
+ if (!Imm || *Imm != 8)
+ break;
+
+ MachineOperand *Src1 = TII->getNamedOperand(MI, AMDGPU::OpName::src1);
+ MachineOperand *Dst = TII->getNamedOperand(MI, AMDGPU::OpName::vdst);
+
+ if (TRI->isPhysicalRegister(Src1->getReg()) ||
+ TRI->isPhysicalRegister(Dst->getReg()))
+ break;
+
+ if (Opcode == AMDGPU::V_LSHLREV_B16_e32) {
+ auto SDWADst =
+ make_unique<SDWADstOperand>(Dst, Src1, BYTE_1, UNUSED_PAD);
+ DEBUG(dbgs() << "Match: " << MI << "To: " << *SDWADst << '\n');
+ SDWAOperands[&MI] = std::move(SDWADst);
+ ++NumSDWAPatternsFound;
+ } else {
+ auto SDWASrc = make_unique<SDWASrcOperand>(
+ Src1, Dst, BYTE_1, false, false,
+ Opcode != AMDGPU::V_LSHRREV_B16_e32);
+ DEBUG(dbgs() << "Match: " << MI << "To: " << *SDWASrc << '\n');
+ SDWAOperands[&MI] = std::move(SDWASrc);
+ ++NumSDWAPatternsFound;
+ }
+ break;
+ }
+
+ case AMDGPU::V_BFE_I32:
+ case AMDGPU::V_BFE_U32: {
+ // e.g.:
+ // from: v_bfe_u32 v1, v0, 8, 8
+ // to SDWA src:v0 src_sel:BYTE_1
+
+ // offset | width | src_sel
+ // ------------------------
+ // 0 | 8 | BYTE_0
+ // 0 | 16 | WORD_0
+ // 0 | 32 | DWORD ?
+ // 8 | 8 | BYTE_1
+ // 16 | 8 | BYTE_2
+ // 16 | 16 | WORD_1
+ // 24 | 8 | BYTE_3
+
+ MachineOperand *Src1 = TII->getNamedOperand(MI, AMDGPU::OpName::src1);
+ auto Offset = foldToImm(*Src1);
+ if (!Offset)
+ break;
+
+ MachineOperand *Src2 = TII->getNamedOperand(MI, AMDGPU::OpName::src2);
+ auto Width = foldToImm(*Src2);
+ if (!Width)
+ break;
+
+ SdwaSel SrcSel = DWORD;
+
+ if (*Offset == 0 && *Width == 8)
+ SrcSel = BYTE_0;
+ else if (*Offset == 0 && *Width == 16)
+ SrcSel = WORD_0;
+ else if (*Offset == 0 && *Width == 32)
+ SrcSel = DWORD;
+ else if (*Offset == 8 && *Width == 8)
+ SrcSel = BYTE_1;
+ else if (*Offset == 16 && *Width == 8)
+ SrcSel = BYTE_2;
+ else if (*Offset == 16 && *Width == 16)
+ SrcSel = WORD_1;
+ else if (*Offset == 24 && *Width == 8)
+ SrcSel = BYTE_3;
+ else
+ break;
+
+ MachineOperand *Src0 = TII->getNamedOperand(MI, AMDGPU::OpName::src0);
+ MachineOperand *Dst = TII->getNamedOperand(MI, AMDGPU::OpName::vdst);
+
+ if (TRI->isPhysicalRegister(Src0->getReg()) ||
+ TRI->isPhysicalRegister(Dst->getReg()))
+ break;
+
+ auto SDWASrc = make_unique<SDWASrcOperand>(
+ Src0, Dst, SrcSel, false, false,
+ Opcode != AMDGPU::V_BFE_U32);
+ DEBUG(dbgs() << "Match: " << MI << "To: " << *SDWASrc << '\n');
+ SDWAOperands[&MI] = std::move(SDWASrc);
+ ++NumSDWAPatternsFound;
+ break;
+ }
+ case AMDGPU::V_AND_B32_e32: {
+ // e.g.:
+ // from: v_and_b32_e32 v1, 0x0000ffff/0x000000ff, v0
+ // to SDWA src:v0 src_sel:WORD_0/BYTE_0
+
+ MachineOperand *Src0 = TII->getNamedOperand(MI, AMDGPU::OpName::src0);
+ auto Imm = foldToImm(*Src0);
+ if (!Imm)
+ break;
+
+ if (*Imm != 0x0000ffff && *Imm != 0x000000ff)
+ break;
+
+ MachineOperand *Src1 = TII->getNamedOperand(MI, AMDGPU::OpName::src1);
+ MachineOperand *Dst = TII->getNamedOperand(MI, AMDGPU::OpName::vdst);
+
+ if (TRI->isPhysicalRegister(Src1->getReg()) ||
+ TRI->isPhysicalRegister(Dst->getReg()))
+ break;
+
+ auto SDWASrc = make_unique<SDWASrcOperand>(
+ Src1, Dst, *Imm == 0x0000ffff ? WORD_0 : BYTE_0);
+ DEBUG(dbgs() << "Match: " << MI << "To: " << *SDWASrc << '\n');
+ SDWAOperands[&MI] = std::move(SDWASrc);
+ ++NumSDWAPatternsFound;
+ break;
+ }
+ }
+ }
+ }
+}
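The BFE table above is the entire selector space: only byte-aligned 8-bit, word-aligned 16-bit, and the full 32-bit extracts have an SDWA encoding; everything else is left alone. A small standalone helper restating the mapping (a sketch, not the pass's code):

    #include <cassert>
    #include <string>

    // Returns the SDWA selector name for an (offset, width) pair, or "" when
    // no byte/word selector matches and the BFE cannot be converted.
    static std::string bfeToSel(int Offset, int Width) {
      if (Width == 8) {
        switch (Offset) {
        case 0:  return "BYTE_0";
        case 8:  return "BYTE_1";
        case 16: return "BYTE_2";
        case 24: return "BYTE_3";
        }
      } else if (Width == 16 && (Offset == 0 || Offset == 16)) {
        return Offset == 0 ? "WORD_0" : "WORD_1";
      } else if (Width == 32 && Offset == 0) {
        return "DWORD";
      }
      return "";
    }

    int main() {
      assert(bfeToSel(8, 8) == "BYTE_1");   // v_bfe_u32 v1, v0, 8, 8
      assert(bfeToSel(16, 16) == "WORD_1");
      assert(bfeToSel(4, 8).empty());       // unaligned extract: no selector
      return 0;
    }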
+
+bool SIPeepholeSDWA::convertToSDWA(MachineInstr &MI,
+ const SDWAOperandsVector &SDWAOperands) {
+ // Check if this instruction can be converted to SDWA:
+ // 1. Does this opcode support SDWA?
+ if (AMDGPU::getSDWAOp(MI.getOpcode()) == -1)
+ return false;
+
+ // 2. Are all operands VGPRs?
+ for (const MachineOperand &Operand : MI.explicit_operands()) {
+ if (!Operand.isReg() || !TRI->isVGPR(*MRI, Operand.getReg()))
+ return false;
+ }
+
+ // Convert to sdwa
+ int SDWAOpcode = AMDGPU::getSDWAOp(MI.getOpcode());
+ assert(SDWAOpcode != -1);
+
+ const MCInstrDesc &SDWADesc = TII->get(SDWAOpcode);
+
+ // Create SDWA version of instruction MI and initialize its operands
+ MachineInstrBuilder SDWAInst =
+ BuildMI(*MI.getParent(), MI, MI.getDebugLoc(), SDWADesc);
+
+ // Copy dst; if it is present in the original, it should also be present in
+ // the SDWA instruction
+ MachineOperand *Dst = TII->getNamedOperand(MI, AMDGPU::OpName::vdst);
+ if (Dst) {
+ assert(AMDGPU::getNamedOperandIdx(SDWAOpcode, AMDGPU::OpName::vdst) != -1);
+ SDWAInst.add(*Dst);
+ } else {
+ assert(TII->isVOPC(MI));
+ }
+
+ // Copy src0 and initialize src0_modifiers. All SDWA instructions have src0
+ // and src0_modifiers (except for v_nop_sdwa, which can't get here)
+ MachineOperand *Src0 = TII->getNamedOperand(MI, AMDGPU::OpName::src0);
+ assert(
+ Src0 &&
+ AMDGPU::getNamedOperandIdx(SDWAOpcode, AMDGPU::OpName::src0) != -1 &&
+ AMDGPU::getNamedOperandIdx(SDWAOpcode, AMDGPU::OpName::src0_modifiers) != -1);
+ SDWAInst.addImm(0);
+ SDWAInst.add(*Src0);
+
+ // Copy src1 if present, initialize src1_modifiers.
+ MachineOperand *Src1 = TII->getNamedOperand(MI, AMDGPU::OpName::src1);
+ if (Src1) {
+ assert(
+ AMDGPU::getNamedOperandIdx(SDWAOpcode, AMDGPU::OpName::src1) != -1 &&
+ AMDGPU::getNamedOperandIdx(SDWAOpcode, AMDGPU::OpName::src1_modifiers) != -1);
+ SDWAInst.addImm(0);
+ SDWAInst.add(*Src1);
+ } else {
+ assert(TII->isVOP1(MI));
+ }
+
+ if (SDWAOpcode == AMDGPU::V_MAC_F16_sdwa ||
+ SDWAOpcode == AMDGPU::V_MAC_F32_sdwa) {
+ // v_mac_f16/32 has an additional src2 operand tied to vdst
+ MachineOperand *Src2 = TII->getNamedOperand(MI, AMDGPU::OpName::src2);
+ assert(Src2);
+ SDWAInst.add(*Src2);
+ }
+
+ // Initialize clamp.
+ assert(AMDGPU::getNamedOperandIdx(SDWAOpcode, AMDGPU::OpName::clamp) != -1);
+ SDWAInst.addImm(0);
+
+ // Initialize dst_sel and dst_unused if present
+ if (Dst) {
+ assert(
+ AMDGPU::getNamedOperandIdx(SDWAOpcode, AMDGPU::OpName::dst_sel) != -1 &&
+ AMDGPU::getNamedOperandIdx(SDWAOpcode, AMDGPU::OpName::dst_unused) != -1);
+ SDWAInst.addImm(AMDGPU::SDWA::SdwaSel::DWORD);
+ SDWAInst.addImm(AMDGPU::SDWA::DstUnused::UNUSED_PAD);
+ }
+
+ // Initialize src0_sel
+ assert(AMDGPU::getNamedOperandIdx(SDWAOpcode, AMDGPU::OpName::src0_sel) != -1);
+ SDWAInst.addImm(AMDGPU::SDWA::SdwaSel::DWORD);
+
+ // Initialize src1_sel if present
+ if (Src1) {
+ assert(AMDGPU::getNamedOperandIdx(SDWAOpcode, AMDGPU::OpName::src1_sel) != -1);
+ SDWAInst.addImm(AMDGPU::SDWA::SdwaSel::DWORD);
+ }
+
+ // Apply all SDWA operand patterns
+ bool Converted = false;
+ for (auto &Operand : SDWAOperands) {
+ Converted |= Operand->convertToSDWA(*SDWAInst, TII);
+ }
+ if (!Converted) {
+ SDWAInst->eraseFromParent();
+ return false;
+ }
+
+ DEBUG(dbgs() << "Convert instruction:" << MI
+ << "Into:" << *SDWAInst << '\n');
+ ++NumSDWAInstructionsPeepholed;
+
+ MI.eraseFromParent();
+ return true;
+}
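Taken together, the asserts above pin down the operand order of the freshly built SDWA instruction. A summary of the sequence as a reading aid (which entries appear depends on the opcode's MCInstrDesc; this is not generated code):

    // vdst                   - only if the original defines one (absent for VOPC)
    // src0_modifiers, src0   - always present (v_nop_sdwa never reaches here)
    // src1_modifiers, src1   - only for two-source opcodes (absent for VOP1)
    // src2                   - only for v_mac_f16/f32, tied to vdst
    // clamp                  - always present, initialized to 0
    // dst_sel, dst_unused    - only if vdst is present
    // src0_sel               - always present
    // src1_sel               - only if src1 is present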
+
+bool SIPeepholeSDWA::runOnMachineFunction(MachineFunction &MF) {
+ const SISubtarget &ST = MF.getSubtarget<SISubtarget>();
+
+ if (!ST.hasSDWA() ||
+ !AMDGPU::isVI(ST)) { // TODO: Add support for SDWA on gfx9
+ return false;
+ }
+
+ MRI = &MF.getRegInfo();
+ TRI = ST.getRegisterInfo();
+ TII = ST.getInstrInfo();
+
+ std::unordered_map<MachineInstr *, SDWAOperandsVector> PotentialMatches;
+
+ matchSDWAOperands(MF);
+
+ for (auto &OperandPair : SDWAOperands) {
+ auto &Operand = OperandPair.second;
+ MachineInstr *PotentialMI = Operand->potentialToConvert(TII);
+ if (PotentialMI) {
+ PotentialMatches[PotentialMI].push_back(std::move(Operand));
+ }
+ }
+
+ for (auto &PotentialPair : PotentialMatches) {
+ MachineInstr &PotentialMI = *PotentialPair.first;
+ convertToSDWA(PotentialMI, PotentialPair.second);
+ }
+
+ SDWAOperands.clear();
+ return false;
+}
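The driver runs in two phases: matchSDWAOperands records one SDWA operand per pattern instruction, then the loop above re-keys those operands by the instruction they can convert, so several operands targeting the same instruction are applied as one batch. A standalone sketch of the grouping idiom (toy types, assuming single ownership via unique_ptr):

    #include <memory>
    #include <unordered_map>
    #include <vector>

    struct Inst;
    struct Operand { Inst *ConvertTarget; };

    int main() {
      std::unordered_map<Inst *, std::unique_ptr<Operand>> Matched; // per pattern
      std::unordered_map<Inst *, std::vector<std::unique_ptr<Operand>>> Groups;

      // Re-key each matched operand by the instruction it can convert
      // (the potentialToConvert analogue), moving ownership into the group.
      for (auto &P : Matched)
        if (Inst *MI = P.second->ConvertTarget)
          Groups[MI].push_back(std::move(P.second));

      // Each group is then handed to a convertToSDWA-style routine as one batch.
      return 0;
    }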
diff --git a/lib/Target/AMDGPU/SIRegisterInfo.cpp b/lib/Target/AMDGPU/SIRegisterInfo.cpp
index a1ed5e8441df..36d4df52ff0e 100644
--- a/lib/Target/AMDGPU/SIRegisterInfo.cpp
+++ b/lib/Target/AMDGPU/SIRegisterInfo.cpp
@@ -24,12 +24,6 @@
using namespace llvm;
-static cl::opt<bool> EnableSpillSGPRToSMEM(
- "amdgpu-spill-sgpr-to-smem",
- cl::desc("Use scalar stores to spill SGPRs if supported by subtarget"),
- cl::init(false));
-
-
static bool hasPressureSet(const int *PSets, unsigned PSetID) {
for (unsigned i = 0; PSets[i] != -1; ++i) {
if (PSets[i] == (int)PSetID)
@@ -49,9 +43,28 @@ void SIRegisterInfo::classifyPressureSet(unsigned PSetID, unsigned Reg,
}
}
-SIRegisterInfo::SIRegisterInfo() : AMDGPURegisterInfo(),
- SGPRPressureSets(getNumRegPressureSets()),
- VGPRPressureSets(getNumRegPressureSets()) {
+static cl::opt<bool> EnableSpillSGPRToSMEM(
+ "amdgpu-spill-sgpr-to-smem",
+ cl::desc("Use scalar stores to spill SGPRs if supported by subtarget"),
+ cl::init(false));
+
+static cl::opt<bool> EnableSpillSGPRToVGPR(
+ "amdgpu-spill-sgpr-to-vgpr",
+ cl::desc("Enable spilling VGPRs to SGPRs"),
+ cl::ReallyHidden,
+ cl::init(true));
+
+SIRegisterInfo::SIRegisterInfo(const SISubtarget &ST) :
+ AMDGPURegisterInfo(),
+ SGPRPressureSets(getNumRegPressureSets()),
+ VGPRPressureSets(getNumRegPressureSets()),
+ SpillSGPRToVGPR(false),
+ SpillSGPRToSMEM(false) {
+ if (EnableSpillSGPRToSMEM && ST.hasScalarStores())
+ SpillSGPRToSMEM = true;
+ else if (EnableSpillSGPRToVGPR)
+ SpillSGPRToVGPR = true;
+
unsigned NumRegPressureSets = getNumRegPressureSets();
SGPRSetID = NumRegPressureSets;
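The constructor now resolves the two spill flags into a fixed priority: SMEM spilling when requested and the subtarget has scalar stores, otherwise VGPR spilling when enabled, otherwise plain memory spills through the scratch buffer. A minimal restatement of that decision (a sketch, not the constructor itself):

    #include <cassert>

    struct SpillChoice { bool ToSMEM, ToVGPR; };

    static SpillChoice pickSpillTarget(bool WantSMEM, bool HasScalarStores,
                                       bool WantVGPR) {
      if (WantSMEM && HasScalarStores)
        return {true, false};     // scalar stores available and requested
      if (WantVGPR)
        return {false, true};     // default: lanes of a VGPR
      return {false, false};      // fall back to memory
    }

    int main() {
      assert(pickSpillTarget(true, true, true).ToSMEM);
      assert(pickSpillTarget(true, false, true).ToVGPR); // SMEM unsupported
      return 0;
    }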
@@ -97,14 +110,18 @@ void SIRegisterInfo::reserveRegisterTuples(BitVector &Reserved, unsigned Reg) co
unsigned SIRegisterInfo::reservedPrivateSegmentBufferReg(
const MachineFunction &MF) const {
- unsigned BaseIdx = alignDown(getMaxNumSGPRs(MF), 4) - 4;
+
+ const SISubtarget &ST = MF.getSubtarget<SISubtarget>();
+ unsigned BaseIdx = alignDown(ST.getMaxNumSGPRs(MF), 4) - 4;
unsigned BaseReg(AMDGPU::SGPR_32RegClass.getRegister(BaseIdx));
return getMatchingSuperReg(BaseReg, AMDGPU::sub0, &AMDGPU::SReg_128RegClass);
}
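The index arithmetic picks the last 4-aligned quadruple of SGPRs below the addressable limit. Assuming the VI figure of 102 addressable SGPRs (from the table removed later in this patch), the reserved scratch buffer descriptor lands at the 128-bit tuple SGPR96-SGPR99:

    #include <cassert>

    static unsigned alignDown(unsigned V, unsigned A) { return V - V % A; }

    int main() {
      // alignDown(102, 4) = 100; 100 - 4 = 96, i.e. the SGPR96_99 tuple.
      unsigned BaseIdx = alignDown(102, 4) - 4;
      assert(BaseIdx == 96);
      return 0;
    }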
unsigned SIRegisterInfo::reservedPrivateSegmentWaveByteOffsetReg(
const MachineFunction &MF) const {
- unsigned RegCount = getMaxNumSGPRs(MF);
+
+ const SISubtarget &ST = MF.getSubtarget<SISubtarget>();
+ unsigned RegCount = ST.getMaxNumSGPRs(MF);
unsigned Reg;
// Try to place it in a hole after PrivateSegmentbufferReg.
@@ -129,6 +146,12 @@ BitVector SIRegisterInfo::getReservedRegs(const MachineFunction &MF) const {
reserveRegisterTuples(Reserved, AMDGPU::EXEC);
reserveRegisterTuples(Reserved, AMDGPU::FLAT_SCR);
+ // Reserve the memory aperture registers.
+ reserveRegisterTuples(Reserved, AMDGPU::SRC_SHARED_BASE);
+ reserveRegisterTuples(Reserved, AMDGPU::SRC_SHARED_LIMIT);
+ reserveRegisterTuples(Reserved, AMDGPU::SRC_PRIVATE_BASE);
+ reserveRegisterTuples(Reserved, AMDGPU::SRC_PRIVATE_LIMIT);
+
// Reserve Trap Handler registers - support is not implemented in Codegen.
reserveRegisterTuples(Reserved, AMDGPU::TBA);
reserveRegisterTuples(Reserved, AMDGPU::TMA);
@@ -139,14 +162,16 @@ BitVector SIRegisterInfo::getReservedRegs(const MachineFunction &MF) const {
reserveRegisterTuples(Reserved, AMDGPU::TTMP8_TTMP9);
reserveRegisterTuples(Reserved, AMDGPU::TTMP10_TTMP11);
- unsigned MaxNumSGPRs = getMaxNumSGPRs(MF);
+ const SISubtarget &ST = MF.getSubtarget<SISubtarget>();
+
+ unsigned MaxNumSGPRs = ST.getMaxNumSGPRs(MF);
unsigned TotalNumSGPRs = AMDGPU::SGPR_32RegClass.getNumRegs();
for (unsigned i = MaxNumSGPRs; i < TotalNumSGPRs; ++i) {
unsigned Reg = AMDGPU::SGPR_32RegClass.getRegister(i);
reserveRegisterTuples(Reserved, Reg);
}
- unsigned MaxNumVGPRs = getMaxNumVGPRs(MF);
+ unsigned MaxNumVGPRs = ST.getMaxNumVGPRs(MF);
unsigned TotalNumVGPRs = AMDGPU::VGPR_32RegClass.getNumRegs();
for (unsigned i = MaxNumVGPRs; i < TotalNumVGPRs; ++i) {
unsigned Reg = AMDGPU::VGPR_32RegClass.getRegister(i);
@@ -253,7 +278,6 @@ void SIRegisterInfo::materializeFrameBaseRegister(MachineBasicBlock *MBB,
}
MachineRegisterInfo &MRI = MF->getRegInfo();
- unsigned UnusedCarry = MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass);
unsigned OffsetReg = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
unsigned FIReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
@@ -263,8 +287,7 @@ void SIRegisterInfo::materializeFrameBaseRegister(MachineBasicBlock *MBB,
BuildMI(*MBB, Ins, DL, TII->get(AMDGPU::V_MOV_B32_e32), FIReg)
.addFrameIndex(FrameIdx);
- BuildMI(*MBB, Ins, DL, TII->get(AMDGPU::V_ADD_I32_e64), BaseReg)
- .addReg(UnusedCarry, RegState::Define | RegState::Dead)
+ TII->getAddNoCarry(*MBB, Ins, DL, BaseReg)
.addReg(OffsetReg, RegState::Kill)
.addReg(FIReg);
}
@@ -415,14 +438,14 @@ static bool buildMUBUFOffsetLoadStore(const SIInstrInfo *TII,
unsigned Reg = TII->getNamedOperand(*MI, AMDGPU::OpName::vdata)->getReg();
BuildMI(*MBB, MI, DL, TII->get(LoadStoreOp))
- .addReg(Reg, getDefRegState(!IsStore))
- .addOperand(*TII->getNamedOperand(*MI, AMDGPU::OpName::srsrc))
- .addOperand(*TII->getNamedOperand(*MI, AMDGPU::OpName::soffset))
- .addImm(Offset)
- .addImm(0) // glc
- .addImm(0) // slc
- .addImm(0) // tfe
- .setMemRefs(MI->memoperands_begin(), MI->memoperands_end());
+ .addReg(Reg, getDefRegState(!IsStore))
+ .add(*TII->getNamedOperand(*MI, AMDGPU::OpName::srsrc))
+ .add(*TII->getNamedOperand(*MI, AMDGPU::OpName::soffset))
+ .addImm(Offset)
+ .addImm(0) // glc
+ .addImm(0) // slc
+ .addImm(0) // tfe
+ .setMemRefs(MI->memoperands_begin(), MI->memoperands_end());
return true;
}
@@ -545,11 +568,20 @@ static std::pair<unsigned, unsigned> getSpillEltSize(unsigned SuperRegSize,
AMDGPU::S_BUFFER_LOAD_DWORD_SGPR};
}
-void SIRegisterInfo::spillSGPR(MachineBasicBlock::iterator MI,
+bool SIRegisterInfo::spillSGPR(MachineBasicBlock::iterator MI,
int Index,
- RegScavenger *RS) const {
+ RegScavenger *RS,
+ bool OnlyToVGPR) const {
MachineBasicBlock *MBB = MI->getParent();
MachineFunction *MF = MBB->getParent();
+ SIMachineFunctionInfo *MFI = MF->getInfo<SIMachineFunctionInfo>();
+
+ ArrayRef<SIMachineFunctionInfo::SpilledReg> VGPRSpills
+ = MFI->getSGPRToVGPRSpills(Index);
+ bool SpillToVGPR = !VGPRSpills.empty();
+ if (OnlyToVGPR && !SpillToVGPR)
+ return false;
+
MachineRegisterInfo &MRI = MF->getRegInfo();
const SISubtarget &ST = MF->getSubtarget<SISubtarget>();
const SIInstrInfo *TII = ST.getInstrInfo();
@@ -558,10 +590,11 @@ void SIRegisterInfo::spillSGPR(MachineBasicBlock::iterator MI,
bool IsKill = MI->getOperand(0).isKill();
const DebugLoc &DL = MI->getDebugLoc();
- SIMachineFunctionInfo *MFI = MF->getInfo<SIMachineFunctionInfo>();
MachineFrameInfo &FrameInfo = MF->getFrameInfo();
- bool SpillToSMEM = ST.hasScalarStores() && EnableSpillSGPRToSMEM;
+ bool SpillToSMEM = spillSGPRToSMEM();
+ if (SpillToSMEM && OnlyToVGPR)
+ return false;
assert(SuperReg != AMDGPU::M0 && "m0 should never spill");
@@ -634,9 +667,9 @@ void SIRegisterInfo::spillSGPR(MachineBasicBlock::iterator MI,
continue;
}
- struct SIMachineFunctionInfo::SpilledReg Spill =
- MFI->getSpilledReg(MF, Index, i);
- if (Spill.hasReg()) {
+ if (SpillToVGPR) {
+ SIMachineFunctionInfo::SpilledReg Spill = VGPRSpills[i];
+
BuildMI(*MBB, MI, DL,
TII->getMCOpcodeFromPseudo(AMDGPU::V_WRITELANE_B32),
Spill.VGPR)
@@ -647,6 +680,10 @@ void SIRegisterInfo::spillSGPR(MachineBasicBlock::iterator MI,
// frame index, we should delete the frame index when all references to
// it are fixed.
} else {
+ // XXX - Can the spill to VGPR fail for some subregisters but not others?
+ if (OnlyToVGPR)
+ return false;
+
// Spill SGPR to a frame index.
// TODO: Should VI try to spill to VGPR and then spill to SMEM?
unsigned TmpReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
@@ -690,22 +727,33 @@ void SIRegisterInfo::spillSGPR(MachineBasicBlock::iterator MI,
MI->eraseFromParent();
MFI->addToSpilledSGPRs(NumSubRegs);
+ return true;
}
-void SIRegisterInfo::restoreSGPR(MachineBasicBlock::iterator MI,
+bool SIRegisterInfo::restoreSGPR(MachineBasicBlock::iterator MI,
int Index,
- RegScavenger *RS) const {
+ RegScavenger *RS,
+ bool OnlyToVGPR) const {
MachineFunction *MF = MI->getParent()->getParent();
MachineRegisterInfo &MRI = MF->getRegInfo();
MachineBasicBlock *MBB = MI->getParent();
SIMachineFunctionInfo *MFI = MF->getInfo<SIMachineFunctionInfo>();
+
+ ArrayRef<SIMachineFunctionInfo::SpilledReg> VGPRSpills
+ = MFI->getSGPRToVGPRSpills(Index);
+ bool SpillToVGPR = !VGPRSpills.empty();
+ if (OnlyToVGPR && !SpillToVGPR)
+ return false;
+
MachineFrameInfo &FrameInfo = MF->getFrameInfo();
const SISubtarget &ST = MF->getSubtarget<SISubtarget>();
const SIInstrInfo *TII = ST.getInstrInfo();
const DebugLoc &DL = MI->getDebugLoc();
unsigned SuperReg = MI->getOperand(0).getReg();
- bool SpillToSMEM = ST.hasScalarStores() && EnableSpillSGPRToSMEM;
+ bool SpillToSMEM = spillSGPRToSMEM();
+ if (SpillToSMEM && OnlyToVGPR)
+ return false;
assert(SuperReg != AMDGPU::M0 && "m0 should never spill");
@@ -773,10 +821,8 @@ void SIRegisterInfo::restoreSGPR(MachineBasicBlock::iterator MI,
continue;
}
- SIMachineFunctionInfo::SpilledReg Spill
- = MFI->getSpilledReg(MF, Index, i);
-
- if (Spill.hasReg()) {
+ if (SpillToVGPR) {
+ SIMachineFunctionInfo::SpilledReg Spill = VGPRSpills[i];
auto MIB =
BuildMI(*MBB, MI, DL, TII->getMCOpcodeFromPseudo(AMDGPU::V_READLANE_B32),
SubReg)
@@ -786,6 +832,9 @@ void SIRegisterInfo::restoreSGPR(MachineBasicBlock::iterator MI,
if (NumSubRegs > 1)
MIB.addReg(SuperReg, RegState::ImplicitDefine);
} else {
+ if (OnlyToVGPR)
+ return false;
+
// Restore SGPR from a stack slot.
// FIXME: We should use S_LOAD_DWORD here for VI.
unsigned TmpReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
@@ -820,6 +869,32 @@ void SIRegisterInfo::restoreSGPR(MachineBasicBlock::iterator MI,
}
MI->eraseFromParent();
+ return true;
+}
+
+/// Special case of eliminateFrameIndex. Returns true if the SGPR was spilled to
+/// a VGPR and the stack slot can be safely eliminated when all other users are
+/// handled.
+bool SIRegisterInfo::eliminateSGPRToVGPRSpillFrameIndex(
+ MachineBasicBlock::iterator MI,
+ int FI,
+ RegScavenger *RS) const {
+ switch (MI->getOpcode()) {
+ case AMDGPU::SI_SPILL_S512_SAVE:
+ case AMDGPU::SI_SPILL_S256_SAVE:
+ case AMDGPU::SI_SPILL_S128_SAVE:
+ case AMDGPU::SI_SPILL_S64_SAVE:
+ case AMDGPU::SI_SPILL_S32_SAVE:
+ return spillSGPR(MI, FI, RS, true);
+ case AMDGPU::SI_SPILL_S512_RESTORE:
+ case AMDGPU::SI_SPILL_S256_RESTORE:
+ case AMDGPU::SI_SPILL_S128_RESTORE:
+ case AMDGPU::SI_SPILL_S64_RESTORE:
+ case AMDGPU::SI_SPILL_S32_RESTORE:
+ return restoreSGPR(MI, FI, RS, true);
+ default:
+ llvm_unreachable("not an SGPR spill instruction");
+ }
}
void SIRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator MI,
@@ -1156,210 +1231,6 @@ SIRegisterInfo::findUnusedRegister(const MachineRegisterInfo &MRI,
return AMDGPU::NoRegister;
}
-unsigned SIRegisterInfo::getTotalNumSGPRs(const SISubtarget &ST) const {
- if (ST.getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS)
- return 800;
- return 512;
-}
-
-unsigned SIRegisterInfo::getNumAddressableSGPRs(const SISubtarget &ST) const {
- if (ST.getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS)
- return 102;
- return 104;
-}
-
-unsigned SIRegisterInfo::getNumReservedSGPRs(const SISubtarget &ST,
- const SIMachineFunctionInfo &MFI) const {
- if (MFI.hasFlatScratchInit()) {
- if (ST.getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS)
- return 6; // FLAT_SCRATCH, XNACK, VCC (in that order)
-
- if (ST.getGeneration() == AMDGPUSubtarget::SEA_ISLANDS)
- return 4; // FLAT_SCRATCH, VCC (in that order)
- }
-
- if (ST.isXNACKEnabled())
- return 4; // XNACK, VCC (in that order)
-
- return 2; // VCC.
-}
-
-unsigned SIRegisterInfo::getMinNumSGPRs(const SISubtarget &ST,
- unsigned WavesPerEU) const {
- if (ST.getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS) {
- switch (WavesPerEU) {
- case 0: return 0;
- case 10: return 0;
- case 9: return 0;
- case 8: return 81;
- default: return 97;
- }
- } else {
- switch (WavesPerEU) {
- case 0: return 0;
- case 10: return 0;
- case 9: return 49;
- case 8: return 57;
- case 7: return 65;
- case 6: return 73;
- case 5: return 81;
- default: return 97;
- }
- }
-}
-
-unsigned SIRegisterInfo::getMaxNumSGPRs(const SISubtarget &ST,
- unsigned WavesPerEU,
- bool Addressable) const {
- if (ST.getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS) {
- switch (WavesPerEU) {
- case 0: return 80;
- case 10: return 80;
- case 9: return 80;
- case 8: return 96;
- default: return Addressable ? getNumAddressableSGPRs(ST) : 112;
- }
- } else {
- switch (WavesPerEU) {
- case 0: return 48;
- case 10: return 48;
- case 9: return 56;
- case 8: return 64;
- case 7: return 72;
- case 6: return 80;
- case 5: return 96;
- default: return getNumAddressableSGPRs(ST);
- }
- }
-}
-
-unsigned SIRegisterInfo::getMaxNumSGPRs(const MachineFunction &MF) const {
- const Function &F = *MF.getFunction();
-
- const SISubtarget &ST = MF.getSubtarget<SISubtarget>();
- const SIMachineFunctionInfo &MFI = *MF.getInfo<SIMachineFunctionInfo>();
-
- // Compute maximum number of SGPRs function can use using default/requested
- // minimum number of waves per execution unit.
- std::pair<unsigned, unsigned> WavesPerEU = MFI.getWavesPerEU();
- unsigned MaxNumSGPRs = getMaxNumSGPRs(ST, WavesPerEU.first, false);
- unsigned MaxNumAddressableSGPRs = getMaxNumSGPRs(ST, WavesPerEU.first, true);
-
- // Check if maximum number of SGPRs was explicitly requested using
- // "amdgpu-num-sgpr" attribute.
- if (F.hasFnAttribute("amdgpu-num-sgpr")) {
- unsigned Requested = AMDGPU::getIntegerAttribute(
- F, "amdgpu-num-sgpr", MaxNumSGPRs);
-
- // Make sure requested value does not violate subtarget's specifications.
- if (Requested && (Requested <= getNumReservedSGPRs(ST, MFI)))
- Requested = 0;
-
- // If more SGPRs are required to support the input user/system SGPRs,
- // increase to accommodate them.
- //
- // FIXME: This really ends up using the requested number of SGPRs + number
- // of reserved special registers in total. Theoretically you could re-use
- // the last input registers for these special registers, but this would
- // require a lot of complexity to deal with the weird aliasing.
- unsigned NumInputSGPRs = MFI.getNumPreloadedSGPRs();
- if (Requested && Requested < NumInputSGPRs)
- Requested = NumInputSGPRs;
-
- // Make sure requested value is compatible with values implied by
- // default/requested minimum/maximum number of waves per execution unit.
- if (Requested && Requested > getMaxNumSGPRs(ST, WavesPerEU.first, false))
- Requested = 0;
- if (WavesPerEU.second &&
- Requested && Requested < getMinNumSGPRs(ST, WavesPerEU.second))
- Requested = 0;
-
- if (Requested)
- MaxNumSGPRs = Requested;
- }
-
- if (ST.hasSGPRInitBug())
- MaxNumSGPRs = SISubtarget::FIXED_SGPR_COUNT_FOR_INIT_BUG;
-
- return std::min(MaxNumSGPRs - getNumReservedSGPRs(ST, MFI),
- MaxNumAddressableSGPRs);
-}
-
-unsigned SIRegisterInfo::getNumDebuggerReservedVGPRs(
- const SISubtarget &ST) const {
- if (ST.debuggerReserveRegs())
- return 4;
- return 0;
-}
-
-unsigned SIRegisterInfo::getMinNumVGPRs(unsigned WavesPerEU) const {
- switch (WavesPerEU) {
- case 0: return 0;
- case 10: return 0;
- case 9: return 25;
- case 8: return 29;
- case 7: return 33;
- case 6: return 37;
- case 5: return 41;
- case 4: return 49;
- case 3: return 65;
- case 2: return 85;
- default: return 129;
- }
-}
-
-unsigned SIRegisterInfo::getMaxNumVGPRs(unsigned WavesPerEU) const {
- switch (WavesPerEU) {
- case 0: return 24;
- case 10: return 24;
- case 9: return 28;
- case 8: return 32;
- case 7: return 36;
- case 6: return 40;
- case 5: return 48;
- case 4: return 64;
- case 3: return 84;
- case 2: return 128;
- default: return getTotalNumVGPRs();
- }
-}
-
-unsigned SIRegisterInfo::getMaxNumVGPRs(const MachineFunction &MF) const {
- const Function &F = *MF.getFunction();
-
- const SISubtarget &ST = MF.getSubtarget<SISubtarget>();
- const SIMachineFunctionInfo &MFI = *MF.getInfo<SIMachineFunctionInfo>();
-
- // Compute maximum number of VGPRs function can use using default/requested
- // minimum number of waves per execution unit.
- std::pair<unsigned, unsigned> WavesPerEU = MFI.getWavesPerEU();
- unsigned MaxNumVGPRs = getMaxNumVGPRs(WavesPerEU.first);
-
- // Check if maximum number of VGPRs was explicitly requested using
- // "amdgpu-num-vgpr" attribute.
- if (F.hasFnAttribute("amdgpu-num-vgpr")) {
- unsigned Requested = AMDGPU::getIntegerAttribute(
- F, "amdgpu-num-vgpr", MaxNumVGPRs);
-
- // Make sure requested value does not violate subtarget's specifications.
- if (Requested && Requested <= getNumDebuggerReservedVGPRs(ST))
- Requested = 0;
-
- // Make sure requested value is compatible with values implied by
- // default/requested minimum/maximum number of waves per execution unit.
- if (Requested && Requested > getMaxNumVGPRs(WavesPerEU.first))
- Requested = 0;
- if (WavesPerEU.second &&
- Requested && Requested < getMinNumVGPRs(WavesPerEU.second))
- Requested = 0;
-
- if (Requested)
- MaxNumVGPRs = Requested;
- }
-
- return MaxNumVGPRs - getNumDebuggerReservedVGPRs(ST);
-}
-
ArrayRef<int16_t> SIRegisterInfo::getRegSplitParts(const TargetRegisterClass *RC,
unsigned EltSize) const {
if (EltSize == 4) {
@@ -1476,3 +1347,62 @@ bool SIRegisterInfo::isVGPR(const MachineRegisterInfo &MRI,
unsigned Reg) const {
return hasVGPRs(getRegClassForReg(MRI, Reg));
}
+
+bool SIRegisterInfo::shouldCoalesce(MachineInstr *MI,
+ const TargetRegisterClass *SrcRC,
+ unsigned SubReg,
+ const TargetRegisterClass *DstRC,
+ unsigned DstSubReg,
+ const TargetRegisterClass *NewRC) const {
+ unsigned SrcSize = SrcRC->getSize();
+ unsigned DstSize = DstRC->getSize();
+ unsigned NewSize = NewRC->getSize();
+
+ // Do not increase the size of registers beyond a dword; we would need to
+ // allocate adjacent registers and constrain regalloc more than needed.
+
+ // Always allow dword coalescing.
+ if (SrcSize <= 4 || DstSize <= 4)
+ return true;
+
+ return NewSize <= DstSize || NewSize <= SrcSize;
+}
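The policy reads: dword-sized copies always coalesce, and wider tuples coalesce only when the merged class is no larger than one of the inputs. A quick standalone check with illustrative byte sizes:

    #include <cassert>

    static bool shouldCoalesce(unsigned SrcSize, unsigned DstSize,
                               unsigned NewSize) {
      if (SrcSize <= 4 || DstSize <= 4)
        return true;                              // dword: always allowed
      return NewSize <= DstSize || NewSize <= SrcSize;
    }

    int main() {
      assert(shouldCoalesce(4, 8, 8));    // dword source: always allowed
      assert(shouldCoalesce(8, 8, 8));    // no growth in register size
      assert(!shouldCoalesce(8, 8, 16));  // would force a wider tuple
      return 0;
    }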
+
+unsigned SIRegisterInfo::getRegPressureLimit(const TargetRegisterClass *RC,
+ MachineFunction &MF) const {
+
+ const SISubtarget &ST = MF.getSubtarget<SISubtarget>();
+ const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
+
+ unsigned Occupancy = ST.getOccupancyWithLocalMemSize(MFI->getLDSSize(),
+ *MF.getFunction());
+ switch (RC->getID()) {
+ default:
+ return AMDGPURegisterInfo::getRegPressureLimit(RC, MF);
+ case AMDGPU::VGPR_32RegClassID:
+ return std::min(ST.getMaxNumVGPRs(Occupancy), ST.getMaxNumVGPRs(MF));
+ case AMDGPU::SGPR_32RegClassID:
+ return std::min(ST.getMaxNumSGPRs(Occupancy, true), ST.getMaxNumSGPRs(MF));
+ }
+}
+
+unsigned SIRegisterInfo::getRegPressureSetLimit(const MachineFunction &MF,
+ unsigned Idx) const {
+ if (Idx == getVGPRPressureSet())
+ return getRegPressureLimit(&AMDGPU::VGPR_32RegClass,
+ const_cast<MachineFunction &>(MF));
+
+ if (Idx == getSGPRPressureSet())
+ return getRegPressureLimit(&AMDGPU::SGPR_32RegClass,
+ const_cast<MachineFunction &>(MF));
+
+ return AMDGPURegisterInfo::getRegPressureSetLimit(MF, Idx);
+}
+
+const int *SIRegisterInfo::getRegUnitPressureSets(unsigned RegUnit) const {
+ static const int Empty[] = { -1 };
+
+ if (hasRegUnit(AMDGPU::M0, RegUnit))
+ return Empty;
+ return AMDGPURegisterInfo::getRegUnitPressureSets(RegUnit);
+}
diff --git a/lib/Target/AMDGPU/SIRegisterInfo.h b/lib/Target/AMDGPU/SIRegisterInfo.h
index 0bcae7d9840c..679ed229758a 100644
--- a/lib/Target/AMDGPU/SIRegisterInfo.h
+++ b/lib/Target/AMDGPU/SIRegisterInfo.h
@@ -21,8 +21,8 @@
namespace llvm {
-class SISubtarget;
class MachineRegisterInfo;
+class SISubtarget;
class SIMachineFunctionInfo;
class SIRegisterInfo final : public AMDGPURegisterInfo {
@@ -31,13 +31,22 @@ private:
unsigned VGPRSetID;
BitVector SGPRPressureSets;
BitVector VGPRPressureSets;
+ bool SpillSGPRToVGPR;
+ bool SpillSGPRToSMEM;
void reserveRegisterTuples(BitVector &, unsigned Reg) const;
void classifyPressureSet(unsigned PSetID, unsigned Reg,
BitVector &PressureSets) const;
-
public:
- SIRegisterInfo();
+ SIRegisterInfo(const SISubtarget &ST);
+
+ bool spillSGPRToVGPR() const {
+ return SpillSGPRToVGPR;
+ }
+
+ bool spillSGPRToSMEM() const {
+ return SpillSGPRToSMEM;
+ }
/// Return the end register initially reserved for the scratch buffer in case
/// spilling is needed.
@@ -78,16 +87,22 @@ public:
const TargetRegisterClass *getPointerRegClass(
const MachineFunction &MF, unsigned Kind = 0) const override;
- void spillSGPR(MachineBasicBlock::iterator MI,
- int FI, RegScavenger *RS) const;
+ /// If \p OnlyToVGPR is true, this will only succeed if the spill can be
+ /// performed entirely to a VGPR, and returns false otherwise.
+ bool spillSGPR(MachineBasicBlock::iterator MI,
+ int FI, RegScavenger *RS,
+ bool OnlyToVGPR = false) const;
- void restoreSGPR(MachineBasicBlock::iterator MI,
- int FI, RegScavenger *RS) const;
+ bool restoreSGPR(MachineBasicBlock::iterator MI,
+ int FI, RegScavenger *RS,
+ bool OnlyToVGPR = false) const;
void eliminateFrameIndex(MachineBasicBlock::iterator MI, int SPAdj,
unsigned FIOperandNum,
RegScavenger *RS) const override;
+ bool eliminateSGPRToVGPRSpillFrameIndex(MachineBasicBlock::iterator MI,
+ int FI, RegScavenger *RS) const;
+
unsigned getHWRegIndex(unsigned Reg) const {
return getEncodingValue(Reg) & 0xff;
}
@@ -195,74 +210,23 @@ public:
return VGPRPressureSets.test(SetID) && !SGPRPressureSets.test(SetID);
}
- /// \returns SGPR allocation granularity supported by the subtarget.
- unsigned getSGPRAllocGranule() const {
- return 8;
- }
-
- /// \returns Total number of SGPRs supported by the subtarget.
- unsigned getTotalNumSGPRs(const SISubtarget &ST) const;
-
- /// \returns Number of addressable SGPRs supported by the subtarget.
- unsigned getNumAddressableSGPRs(const SISubtarget &ST) const;
-
- /// \returns Number of reserved SGPRs supported by the subtarget.
- unsigned getNumReservedSGPRs(const SISubtarget &ST,
- const SIMachineFunctionInfo &MFI) const;
-
- /// \returns Minimum number of SGPRs that meets given number of waves per
- /// execution unit requirement for given subtarget.
- unsigned getMinNumSGPRs(const SISubtarget &ST, unsigned WavesPerEU) const;
-
- /// \returns Maximum number of SGPRs that meets given number of waves per
- /// execution unit requirement for given subtarget.
- unsigned getMaxNumSGPRs(const SISubtarget &ST, unsigned WavesPerEU,
- bool Addressable) const;
-
- /// \returns Maximum number of SGPRs that meets number of waves per execution
- /// unit requirement for function \p MF, or number of SGPRs explicitly
- /// requested using "amdgpu-num-sgpr" attribute attached to function \p MF.
- ///
- /// \returns Value that meets number of waves per execution unit requirement
- /// if explicitly requested value cannot be converted to integer, violates
- /// subtarget's specifications, or does not meet number of waves per execution
- /// unit requirement.
- unsigned getMaxNumSGPRs(const MachineFunction &MF) const;
-
- /// \returns VGPR allocation granularity supported by the subtarget.
- unsigned getVGPRAllocGranule() const {
- return 4;
- }
-
- /// \returns Total number of VGPRs supported by the subtarget.
- unsigned getTotalNumVGPRs() const {
- return 256;
- }
-
- /// \returns Number of reserved VGPRs for debugger use supported by the
- /// subtarget.
- unsigned getNumDebuggerReservedVGPRs(const SISubtarget &ST) const;
+ ArrayRef<int16_t> getRegSplitParts(const TargetRegisterClass *RC,
+ unsigned EltSize) const;
- /// \returns Minimum number of SGPRs that meets given number of waves per
- /// execution unit requirement.
- unsigned getMinNumVGPRs(unsigned WavesPerEU) const;
+ bool shouldCoalesce(MachineInstr *MI,
+ const TargetRegisterClass *SrcRC,
+ unsigned SubReg,
+ const TargetRegisterClass *DstRC,
+ unsigned DstSubReg,
+ const TargetRegisterClass *NewRC) const override;
- /// \returns Maximum number of VGPRs that meets given number of waves per
- /// execution unit requirement.
- unsigned getMaxNumVGPRs(unsigned WavesPerEU) const;
+ unsigned getRegPressureLimit(const TargetRegisterClass *RC,
+ MachineFunction &MF) const override;
- /// \returns Maximum number of VGPRs that meets number of waves per execution
- /// unit requirement for function \p MF, or number of VGPRs explicitly
- /// requested using "amdgpu-num-vgpr" attribute attached to function \p MF.
- ///
- /// \returns Value that meets number of waves per execution unit requirement
- /// if explicitly requested value cannot be converted to integer, violates
- /// subtarget's specifications, or does not meet number of waves per execution
- /// unit requirement.
- unsigned getMaxNumVGPRs(const MachineFunction &MF) const;
+ unsigned getRegPressureSetLimit(const MachineFunction &MF,
+ unsigned Idx) const override;
- ArrayRef<int16_t> getRegSplitParts(const TargetRegisterClass *RC,
- unsigned EltSize) const;
+ const int *getRegUnitPressureSets(unsigned RegUnit) const override;
private:
void buildSpillLoadStore(MachineBasicBlock::iterator MI,
diff --git a/lib/Target/AMDGPU/SIRegisterInfo.td b/lib/Target/AMDGPU/SIRegisterInfo.td
index 31e714b9f6b9..fc808011cd88 100644
--- a/lib/Target/AMDGPU/SIRegisterInfo.td
+++ b/lib/Target/AMDGPU/SIRegisterInfo.td
@@ -44,6 +44,11 @@ def EXEC : RegisterWithSubRegs<"EXEC", [EXEC_LO, EXEC_HI]>,
def SCC : SIReg<"scc", 253>;
def M0 : SIReg <"m0", 124>;
+def SRC_SHARED_BASE : SIReg<"src_shared_base", 235>;
+def SRC_SHARED_LIMIT : SIReg<"src_shared_limit", 236>;
+def SRC_PRIVATE_BASE : SIReg<"src_private_base", 237>;
+def SRC_PRIVATE_LIMIT : SIReg<"src_private_limit", 238>;
+
// Trap handler registers
def TBA_LO : SIReg<"tba_lo", 108>;
def TBA_HI : SIReg<"tba_hi", 109>;
@@ -128,7 +133,7 @@ def M0_CLASS : RegisterClass<"AMDGPU", [i32], 32, (add M0)> {
// TODO: Do we need to set DwarfRegAlias on register tuples?
// SGPR 32-bit registers
-def SGPR_32 : RegisterClass<"AMDGPU", [i32, f32, i16, f16], 32,
+def SGPR_32 : RegisterClass<"AMDGPU", [i32, f32, i16, f16, v2i16, v2f16], 32,
(add (sequence "SGPR%u", 0, 103))> {
// Give all SGPR classes higher priority than VGPR classes, because
// we want to spill SGPRs to VGPRs.
@@ -179,7 +184,7 @@ def SGPR_512 : RegisterTuples<[sub0, sub1, sub2, sub3, sub4, sub5, sub6, sub7,
(add (decimate (shl SGPR_32, 15), 4))]>;
// Trap handler TMP 32-bit registers
-def TTMP_32 : RegisterClass<"AMDGPU", [i32, f32], 32,
+def TTMP_32 : RegisterClass<"AMDGPU", [i32, f32, v2i16, v2f16], 32,
(add (sequence "TTMP%u", 0, 11))> {
let isAllocatable = 0;
}
@@ -197,7 +202,8 @@ def TTMP_128Regs : RegisterTuples<[sub0, sub1, sub2, sub3],
(add (decimate (shl TTMP_32, 3), 4))]>;
// VGPR 32-bit registers
-def VGPR_32 : RegisterClass<"AMDGPU", [i32, f32, i16, f16], 32,
+// i16/f16 only on VI+
+def VGPR_32 : RegisterClass<"AMDGPU", [i32, f32, i16, f16, v2i16, v2f16], 32,
(add (sequence "VGPR%u", 0, 255))> {
let AllocationPriority = 1;
let Size = 32;
@@ -258,19 +264,20 @@ def VGPR_512 : RegisterTuples<[sub0, sub1, sub2, sub3, sub4, sub5, sub6, sub7,
// Subset of SReg_32 without M0 for SMRD instructions and alike.
// See comments in SIInstructions.td for more info.
-def SReg_32_XM0_XEXEC : RegisterClass<"AMDGPU", [i32, f32, i16, f16], 32,
+def SReg_32_XM0_XEXEC : RegisterClass<"AMDGPU", [i32, f32, i16, f16, v2i16, v2f16], 32,
(add SGPR_32, VCC_LO, VCC_HI, FLAT_SCR_LO, FLAT_SCR_HI,
- TTMP_32, TMA_LO, TMA_HI, TBA_LO, TBA_HI)> {
+ TTMP_32, TMA_LO, TMA_HI, TBA_LO, TBA_HI, SRC_SHARED_BASE, SRC_SHARED_LIMIT,
+ SRC_PRIVATE_BASE, SRC_PRIVATE_LIMIT)> {
let AllocationPriority = 7;
}
-def SReg_32_XM0 : RegisterClass<"AMDGPU", [i32, f32, i16, f16], 32,
+def SReg_32_XM0 : RegisterClass<"AMDGPU", [i32, f32, i16, f16, v2i16, v2f16], 32,
(add SReg_32_XM0_XEXEC, EXEC_LO, EXEC_HI)> {
let AllocationPriority = 7;
}
// Register class for all scalar registers (SGPRs + Special Registers)
-def SReg_32 : RegisterClass<"AMDGPU", [i32, f32, i16, f16], 32,
+def SReg_32 : RegisterClass<"AMDGPU", [i32, f32, i16, f16, v2i16, v2f16], 32,
(add SReg_32_XM0, M0_CLASS, EXEC_LO, EXEC_HI)> {
let AllocationPriority = 7;
}
@@ -319,7 +326,7 @@ def SReg_256 : RegisterClass<"AMDGPU", [v8i32, v8f32], 32, (add SGPR_256)> {
let AllocationPriority = 11;
}
-def SReg_512 : RegisterClass<"AMDGPU", [v64i8, v16i32], 32, (add SGPR_512)> {
+def SReg_512 : RegisterClass<"AMDGPU", [v16i32, v16f32], 32, (add SGPR_512)> {
// Requires 8 s_mov_b64 to copy
let CopyCost = 8;
let AllocationPriority = 12;
@@ -366,7 +373,7 @@ def VReg_1 : RegisterClass<"AMDGPU", [i1], 32, (add VGPR_32)> {
let Size = 32;
}
-def VS_32 : RegisterClass<"AMDGPU", [i32, f32, i16, f16], 32,
+def VS_32 : RegisterClass<"AMDGPU", [i32, f32, i16, f16, v2i16, v2f16], 32,
(add VGPR_32, SReg_32)> {
let isAllocatable = 0;
}
@@ -417,6 +424,18 @@ multiclass SIRegOperand <string rc, string MatchName, string opType> {
let OperandType = opType#"_FP64";
let ParserMatchClass = RegImmMatcher<MatchName#"F64">;
}
+
+ def _v2b16 : RegisterOperand<!cast<RegisterClass>(rc#"_32")> {
+ let OperandType = opType#"_V2INT16";
+ let ParserMatchClass = RegImmMatcher<MatchName#"V2B16">;
+ let DecoderMethod = "decodeOperand_VSrcV216";
+ }
+
+ def _v2f16 : RegisterOperand<!cast<RegisterClass>(rc#"_32")> {
+ let OperandType = opType#"_V2FP16";
+ let ParserMatchClass = RegImmMatcher<MatchName#"V2F16">;
+ let DecoderMethod = "decodeOperand_VSrcV216";
+ }
}
}
diff --git a/lib/Target/AMDGPU/SISchedule.td b/lib/Target/AMDGPU/SISchedule.td
index be27966fd5f1..0f02f5825cb0 100644
--- a/lib/Target/AMDGPU/SISchedule.td
+++ b/lib/Target/AMDGPU/SISchedule.td
@@ -53,6 +53,11 @@ class SISchedMachineModel : SchedMachineModel {
let MicroOpBufferSize = 1;
let IssueWidth = 1;
let PostRAScheduler = 1;
+
+ // FIXME: Approximate 2 * branch cost. Try to hack around bad
+ // early-ifcvt heuristics. These need improvement to avoid the
+ // out-of-order execution heuristics.
+ int MispredictPenalty = 20;
}
def SIFullSpeedModel : SISchedMachineModel;
diff --git a/lib/Target/AMDGPU/SIShrinkInstructions.cpp b/lib/Target/AMDGPU/SIShrinkInstructions.cpp
index dd31dc690840..c5f121757e62 100644
--- a/lib/Target/AMDGPU/SIShrinkInstructions.cpp
+++ b/lib/Target/AMDGPU/SIShrinkInstructions.cpp
@@ -497,24 +497,24 @@ bool SIShrinkInstructions::runOnMachineFunction(MachineFunction &MF) {
int Op32DstIdx = AMDGPU::getNamedOperandIdx(Op32, AMDGPU::OpName::vdst);
if (Op32DstIdx != -1) {
// dst
- Inst32.addOperand(MI.getOperand(0));
+ Inst32.add(MI.getOperand(0));
} else {
assert(MI.getOperand(0).getReg() == AMDGPU::VCC &&
"Unexpected case");
}
- Inst32.addOperand(*TII->getNamedOperand(MI, AMDGPU::OpName::src0));
+ Inst32.add(*TII->getNamedOperand(MI, AMDGPU::OpName::src0));
const MachineOperand *Src1 =
TII->getNamedOperand(MI, AMDGPU::OpName::src1);
if (Src1)
- Inst32.addOperand(*Src1);
+ Inst32.add(*Src1);
if (Src2) {
int Op32Src2Idx = AMDGPU::getNamedOperandIdx(Op32, AMDGPU::OpName::src2);
if (Op32Src2Idx != -1) {
- Inst32.addOperand(*Src2);
+ Inst32.add(*Src2);
} else {
// In the case of V_CNDMASK_B32_e32, the explicit operand src2 is
// replaced with an implicit read of vcc. This was already added
diff --git a/lib/Target/AMDGPU/SMInstructions.td b/lib/Target/AMDGPU/SMInstructions.td
index 02656483cd74..5b840a14dbc3 100644
--- a/lib/Target/AMDGPU/SMInstructions.td
+++ b/lib/Target/AMDGPU/SMInstructions.td
@@ -226,9 +226,9 @@ def S_MEMREALTIME : SM_Time_Pseudo <"s_memrealtime", int_amdgcn_s_memrealtime>
def smrd_load : PatFrag <(ops node:$ptr), (load node:$ptr), [{
auto Ld = cast<LoadSDNode>(N);
return Ld->getAlignment() >= 4 &&
- ((Ld->getAddressSpace() == AMDGPUAS::CONSTANT_ADDRESS &&
+ ((Ld->getAddressSpace() == AMDGPUASI.CONSTANT_ADDRESS &&
static_cast<const SITargetLowering *>(getTargetLowering())->isMemOpUniform(N)) ||
- (Subtarget->getScalarizeGlobalBehavior() && Ld->getAddressSpace() == AMDGPUAS::GLOBAL_ADDRESS &&
+ (Subtarget->getScalarizeGlobalBehavior() && Ld->getAddressSpace() == AMDGPUASI.GLOBAL_ADDRESS &&
static_cast<const SITargetLowering *>(getTargetLowering())->isMemOpUniform(N) &&
static_cast<const SITargetLowering *>(getTargetLowering())->isMemOpHasNoClobberedMemOperand(N)));
}]>;
@@ -293,12 +293,6 @@ def : Pat <
let Predicates = [isVI] in {
-// 1. Offset as 20bit DWORD immediate
-def : Pat <
- (SIload_constant v4i32:$sbase, IMM20bit:$offset),
- (S_BUFFER_LOAD_DWORD_IMM $sbase, (as_i32imm $offset), 0)
->;
-
def : Pat <
(i64 (readcyclecounter)),
(S_MEMREALTIME)
diff --git a/lib/Target/AMDGPU/SOPInstructions.td b/lib/Target/AMDGPU/SOPInstructions.td
index 73cd5774128e..b4adbdd1df07 100644
--- a/lib/Target/AMDGPU/SOPInstructions.td
+++ b/lib/Target/AMDGPU/SOPInstructions.td
@@ -82,6 +82,12 @@ class SOP1_0_32 <string opName, list<dag> pattern = []> : SOP1_Pseudo <
let has_sdst = 0;
}
+class SOP1_0_32R <string opName, list<dag> pattern = []> : SOP1_Pseudo <
+ opName, (outs), (ins SReg_32:$src0),
+ "$src0", pattern> {
+ let has_sdst = 0;
+}
+
class SOP1_64 <string opName, list<dag> pattern=[]> : SOP1_Pseudo <
opName, (outs SReg_64:$sdst), (ins SSrc_b64:$src0),
"$sdst, $src0", pattern
@@ -210,7 +216,7 @@ def S_MOVRELD_B32 : SOP1_32 <"s_movreld_b32">;
def S_MOVRELD_B64 : SOP1_64 <"s_movreld_b64">;
} // End Uses = [M0]
-def S_CBRANCH_JOIN : SOP1_1 <"s_cbranch_join">;
+def S_CBRANCH_JOIN : SOP1_0_32R <"s_cbranch_join">;
def S_MOV_REGRD_B32 : SOP1_32 <"s_mov_regrd_b32">;
let Defs = [SCC] in {
def S_ABS_I32 : SOP1_32 <"s_abs_i32">;
@@ -428,7 +434,7 @@ def S_BFE_I64 : SOP2_64_32 <"s_bfe_i64">;
def S_CBRANCH_G_FORK : SOP2_Pseudo <
"s_cbranch_g_fork", (outs),
- (ins SReg_64:$src0, SReg_64:$src1),
+ (ins SCSrc_b64:$src0, SCSrc_b64:$src1),
"$src0, $src1"
> {
let has_sdst = 0;
@@ -438,6 +444,22 @@ let Defs = [SCC] in {
def S_ABSDIFF_I32 : SOP2_32 <"s_absdiff_i32">;
} // End Defs = [SCC]
+let SubtargetPredicate = isVI in {
+ def S_RFE_RESTORE_B64 : SOP2_Pseudo <
+ "s_rfe_restore_b64", (outs),
+ (ins SSrc_b64:$src0, SSrc_b32:$src1),
+ "$src0, $src1"
+ > {
+ let hasSideEffects = 1;
+ let has_sdst = 0;
+ }
+}
+
+let SubtargetPredicate = isGFX9 in {
+ def S_PACK_LL_B32_B16 : SOP2_32<"s_pack_ll_b32_b16">;
+ def S_PACK_LH_B32_B16 : SOP2_32<"s_pack_lh_b32_b16">;
+ def S_PACK_HH_B32_B16 : SOP2_32<"s_pack_hh_b32_b16">;
+}
//===----------------------------------------------------------------------===//
// SOPK Instructions
@@ -751,6 +773,14 @@ def S_ENDPGM : SOPP <0x00000001, (ins), "s_endpgm",
let isReturn = 1;
}
+let SubtargetPredicate = isVI in {
+def S_ENDPGM_SAVED : SOPP <0x0000001B, (ins), "s_endpgm_saved"> {
+ let simm16 = 0;
+ let isBarrier = 1;
+ let isReturn = 1;
+}
+}
+
let isBranch = 1, SchedRW = [WriteBranch] in {
def S_BRANCH : SOPP <
0x00000002, (ins sopp_brtarget:$simm16), "s_branch $simm16",
@@ -792,6 +822,25 @@ def S_CBRANCH_EXECNZ : SOPP <
>;
} // End Uses = [EXEC]
+def S_CBRANCH_CDBGSYS : SOPP <
+ 0x00000017, (ins sopp_brtarget:$simm16),
+ "s_cbranch_cdbgsys $simm16"
+>;
+
+def S_CBRANCH_CDBGSYS_AND_USER : SOPP <
+ 0x0000001A, (ins sopp_brtarget:$simm16),
+ "s_cbranch_cdbgsys_and_user $simm16"
+>;
+
+def S_CBRANCH_CDBGSYS_OR_USER : SOPP <
+ 0x00000019, (ins sopp_brtarget:$simm16),
+ "s_cbranch_cdbgsys_or_user $simm16"
+>;
+
+def S_CBRANCH_CDBGUSER : SOPP <
+ 0x00000018, (ins sopp_brtarget:$simm16),
+ "s_cbranch_cdbguser $simm16"
+>;
} // End isBranch = 1
} // End isTerminator = 1
@@ -806,9 +855,18 @@ def S_BARRIER : SOPP <0x0000000a, (ins), "s_barrier",
let isConvergent = 1;
}
+let SubtargetPredicate = isVI in {
+def S_WAKEUP : SOPP <0x00000003, (ins), "s_wakeup"> {
+ let simm16 = 0;
+ let mayLoad = 1;
+ let mayStore = 1;
+}
+}
+
let mayLoad = 1, mayStore = 1, hasSideEffects = 1 in
def S_WAITCNT : SOPP <0x0000000c, (ins WAIT_FLAG:$simm16), "s_waitcnt $simm16">;
def S_SETHALT : SOPP <0x0000000d, (ins i16imm:$simm16), "s_sethalt $simm16">;
+def S_SETKILL : SOPP <0x0000000b, (ins i16imm:$simm16), "s_setkill $simm16">;
// On SI the documentation says sleep for approximately 64 * low 2
// bits, consistent with the reported maximum of 448. On VI the
@@ -1207,6 +1265,10 @@ def S_BFE_U64_vi : SOP2_Real_vi <0x27, S_BFE_U64>;
def S_BFE_I64_vi : SOP2_Real_vi <0x28, S_BFE_I64>;
def S_CBRANCH_G_FORK_vi : SOP2_Real_vi <0x29, S_CBRANCH_G_FORK>;
def S_ABSDIFF_I32_vi : SOP2_Real_vi <0x2a, S_ABSDIFF_I32>;
+def S_PACK_LL_B32_B16_vi : SOP2_Real_vi <0x32, S_PACK_LL_B32_B16>;
+def S_PACK_LH_B32_B16_vi : SOP2_Real_vi <0x33, S_PACK_LH_B32_B16>;
+def S_PACK_HH_B32_B16_vi : SOP2_Real_vi <0x34, S_PACK_HH_B32_B16>;
+def S_RFE_RESTORE_B64_vi : SOP2_Real_vi <0x2b, S_RFE_RESTORE_B64>;
def S_MOVK_I32_vi : SOPK_Real_vi <0x00, S_MOVK_I32>;
def S_CMOVK_I32_vi : SOPK_Real_vi <0x01, S_CMOVK_I32>;
diff --git a/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp b/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp
index 5f651d4da5d2..86095a8e1142 100644
--- a/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp
+++ b/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp
@@ -1,4 +1,4 @@
-//===-- AMDGPUBaseInfo.cpp - AMDGPU Base encoding information--------------===//
+//===- AMDGPUBaseInfo.cpp - AMDGPU Base encoding information --------------===//
//
// The LLVM Compiler Infrastructure
//
@@ -6,32 +6,42 @@
// License. See LICENSE.TXT for details.
//
//===----------------------------------------------------------------------===//
-#include "AMDGPUBaseInfo.h"
+
#include "AMDGPU.h"
+#include "AMDGPUBaseInfo.h"
#include "SIDefines.h"
-#include "llvm/IR/LLVMContext.h"
+#include "llvm/ADT/StringRef.h"
+#include "llvm/ADT/Triple.h"
+#include "llvm/CodeGen/MachineMemOperand.h"
+#include "llvm/IR/Attributes.h"
+#include "llvm/IR/Constants.h"
#include "llvm/IR/Function.h"
#include "llvm/IR/GlobalValue.h"
+#include "llvm/IR/Instruction.h"
+#include "llvm/IR/LLVMContext.h"
+#include "llvm/IR/Module.h"
#include "llvm/MC/MCContext.h"
-#include "llvm/MC/MCInstrInfo.h"
+#include "llvm/MC/MCInstrDesc.h"
#include "llvm/MC/MCRegisterInfo.h"
#include "llvm/MC/MCSectionELF.h"
#include "llvm/MC/MCSubtargetInfo.h"
#include "llvm/MC/SubtargetFeature.h"
+#include "llvm/Support/Casting.h"
+#include "llvm/Support/ELF.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/MathExtras.h"
+#include <algorithm>
+#include <cassert>
+#include <cstdint>
+#include <cstring>
+#include <utility>
-#define GET_SUBTARGETINFO_ENUM
-#include "AMDGPUGenSubtargetInfo.inc"
-#undef GET_SUBTARGETINFO_ENUM
+#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
-#define GET_REGINFO_ENUM
-#include "AMDGPUGenRegisterInfo.inc"
-#undef GET_REGINFO_ENUM
#define GET_INSTRINFO_NAMED_OPS
-#define GET_INSTRINFO_ENUM
#include "AMDGPUGenInstrInfo.inc"
#undef GET_INSTRINFO_NAMED_OPS
-#undef GET_INSTRINFO_ENUM
namespace {
@@ -56,11 +66,11 @@ unsigned unpackBits(unsigned Src, unsigned Shift, unsigned Width) {
return (Src & getBitMask(Shift, Width)) >> Shift;
}
-/// \returns Vmcnt bit shift.
-unsigned getVmcntBitShift() { return 0; }
+/// \returns Vmcnt bit shift (lower bits).
+unsigned getVmcntBitShiftLo() { return 0; }
-/// \returns Vmcnt bit width.
-unsigned getVmcntBitWidth() { return 4; }
+/// \returns Vmcnt bit width (lower bits).
+unsigned getVmcntBitWidthLo() { return 4; }
/// \returns Expcnt bit shift.
unsigned getExpcntBitShift() { return 4; }
@@ -74,52 +84,224 @@ unsigned getLgkmcntBitShift() { return 8; }
/// \returns Lgkmcnt bit width.
unsigned getLgkmcntBitWidth() { return 4; }
-} // anonymous namespace
+/// \returns Vmcnt bit shift (higher bits).
+unsigned getVmcntBitShiftHi() { return 14; }
+
+/// \returns Vmcnt bit width (higher bits).
+unsigned getVmcntBitWidthHi() { return 2; }
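+
+// A minimal sketch, assuming only the shift/width helpers above;
+// sketchPackVmcnt9 is a hypothetical illustration, not part of this API.
+// On gfx9, Waitcnt[3:0] holds vmcnt[3:0] and Waitcnt[15:14] holds
+// vmcnt[5:4].
+unsigned sketchPackVmcnt9(unsigned Vmcnt) {
+  unsigned Lo = Vmcnt & 0xf;                // -> Waitcnt[3:0]
+  unsigned Hi = ((Vmcnt >> 4) & 0x3) << 14; // -> Waitcnt[15:14]
+  return Lo | Hi;
+}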
+
+} // end anonymous namespace
namespace llvm {
namespace AMDGPU {
-IsaVersion getIsaVersion(const FeatureBitset &Features) {
+namespace IsaInfo {
+IsaVersion getIsaVersion(const FeatureBitset &Features) {
+ // CI.
if (Features.test(FeatureISAVersion7_0_0))
return {7, 0, 0};
-
if (Features.test(FeatureISAVersion7_0_1))
return {7, 0, 1};
-
if (Features.test(FeatureISAVersion7_0_2))
return {7, 0, 2};
+ // VI.
if (Features.test(FeatureISAVersion8_0_0))
return {8, 0, 0};
-
if (Features.test(FeatureISAVersion8_0_1))
return {8, 0, 1};
-
if (Features.test(FeatureISAVersion8_0_2))
return {8, 0, 2};
-
if (Features.test(FeatureISAVersion8_0_3))
return {8, 0, 3};
-
if (Features.test(FeatureISAVersion8_0_4))
return {8, 0, 4};
-
if (Features.test(FeatureISAVersion8_1_0))
return {8, 1, 0};
- return {0, 0, 0};
+ // GFX9.
+ if (Features.test(FeatureISAVersion9_0_0))
+ return {9, 0, 0};
+ if (Features.test(FeatureISAVersion9_0_1))
+ return {9, 0, 1};
+
+ if (!Features.test(FeatureGCN) || Features.test(FeatureSouthernIslands))
+ return {0, 0, 0};
+ return {7, 0, 0};
+}
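+
+// For example: a subtarget with FeatureISAVersion8_0_1 set yields {8, 0, 1};
+// a GCN subtarget with none of the ISA-version features set (and not
+// Southern Islands) falls back to {7, 0, 0}; anything else yields {0, 0, 0}.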
+
+unsigned getWavefrontSize(const FeatureBitset &Features) {
+ if (Features.test(FeatureWavefrontSize16))
+ return 16;
+ if (Features.test(FeatureWavefrontSize32))
+ return 32;
+
+ return 64;
+}
+
+unsigned getLocalMemorySize(const FeatureBitset &Features) {
+ if (Features.test(FeatureLocalMemorySize32768))
+ return 32768;
+ if (Features.test(FeatureLocalMemorySize65536))
+ return 65536;
+
+ return 0;
+}
+
+unsigned getEUsPerCU(const FeatureBitset &Features) {
+ return 4;
+}
+
+unsigned getMaxWorkGroupsPerCU(const FeatureBitset &Features,
+ unsigned FlatWorkGroupSize) {
+ if (!Features.test(FeatureGCN))
+ return 8;
+ unsigned N = getWavesPerWorkGroup(Features, FlatWorkGroupSize);
+ if (N == 1)
+ return 40;
+ N = 40 / N;
+ return std::min(N, 16u);
+}
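+
+// Worked example, assuming the default 64-wide wavefront and
+// getWavesPerWorkGroup below: FlatWorkGroupSize = 256 needs
+// N = alignTo(256, 64) / 64 = 4 waves per work group, so a GCN CU fits
+// min(40 / 4, 16) = 10 work groups.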
+
+unsigned getMaxWavesPerCU(const FeatureBitset &Features) {
+ return getMaxWavesPerEU(Features) * getEUsPerCU(Features);
+}
+
+unsigned getMaxWavesPerCU(const FeatureBitset &Features,
+ unsigned FlatWorkGroupSize) {
+ return getWavesPerWorkGroup(Features, FlatWorkGroupSize);
+}
+
+unsigned getMinWavesPerEU(const FeatureBitset &Features) {
+ return 1;
+}
+
+unsigned getMaxWavesPerEU(const FeatureBitset &Features) {
+ if (!Features.test(FeatureGCN))
+ return 8;
+ // FIXME: Need to take scratch memory into account.
+ return 10;
+}
+
+unsigned getMaxWavesPerEU(const FeatureBitset &Features,
+ unsigned FlatWorkGroupSize) {
+ return alignTo(getMaxWavesPerCU(Features, FlatWorkGroupSize),
+ getEUsPerCU(Features)) / getEUsPerCU(Features);
+}
+
+unsigned getMinFlatWorkGroupSize(const FeatureBitset &Features) {
+ return 1;
+}
+
+unsigned getMaxFlatWorkGroupSize(const FeatureBitset &Features) {
+ return 2048;
+}
+
+unsigned getWavesPerWorkGroup(const FeatureBitset &Features,
+ unsigned FlatWorkGroupSize) {
+ return alignTo(FlatWorkGroupSize, getWavefrontSize(Features)) /
+ getWavefrontSize(Features);
+}
+
+unsigned getSGPRAllocGranule(const FeatureBitset &Features) {
+ IsaVersion Version = getIsaVersion(Features);
+ if (Version.Major >= 8)
+ return 16;
+ return 8;
+}
+
+unsigned getSGPREncodingGranule(const FeatureBitset &Features) {
+ return 8;
+}
+
+unsigned getTotalNumSGPRs(const FeatureBitset &Features) {
+ IsaVersion Version = getIsaVersion(Features);
+ if (Version.Major >= 8)
+ return 800;
+ return 512;
+}
+
+unsigned getAddressableNumSGPRs(const FeatureBitset &Features) {
+ if (Features.test(FeatureSGPRInitBug))
+ return FIXED_NUM_SGPRS_FOR_INIT_BUG;
+
+ IsaVersion Version = getIsaVersion(Features);
+ if (Version.Major >= 8)
+ return 102;
+ return 104;
+}
+
+unsigned getMinNumSGPRs(const FeatureBitset &Features, unsigned WavesPerEU) {
+ assert(WavesPerEU != 0);
+
+ if (WavesPerEU >= getMaxWavesPerEU(Features))
+ return 0;
+ unsigned MinNumSGPRs =
+ alignDown(getTotalNumSGPRs(Features) / (WavesPerEU + 1),
+ getSGPRAllocGranule(Features)) + 1;
+ return std::min(MinNumSGPRs, getAddressableNumSGPRs(Features));
+}
+
+unsigned getMaxNumSGPRs(const FeatureBitset &Features, unsigned WavesPerEU,
+ bool Addressable) {
+ assert(WavesPerEU != 0);
+
+ IsaVersion Version = getIsaVersion(Features);
+ unsigned MaxNumSGPRs = alignDown(getTotalNumSGPRs(Features) / WavesPerEU,
+ getSGPRAllocGranule(Features));
+ unsigned AddressableNumSGPRs = getAddressableNumSGPRs(Features);
+ if (Version.Major >= 8 && !Addressable)
+ AddressableNumSGPRs = 112;
+ return std::min(MaxNumSGPRs, AddressableNumSGPRs);
+}
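+
+// Worked example for a VI-class target (800 total SGPRs, granule 16,
+// 102 addressable): at WavesPerEU = 8, getMaxNumSGPRs =
+// min(alignDown(800 / 8, 16), 102) = 96 and getMinNumSGPRs =
+// alignDown(800 / 9, 16) + 1 = 81, i.e. allocations of 81..96 SGPRs
+// sustain exactly 8 waves per EU.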
+
+unsigned getVGPRAllocGranule(const FeatureBitset &Features) {
+ return 4;
+}
+
+unsigned getVGPREncodingGranule(const FeatureBitset &Features) {
+ return getVGPRAllocGranule(Features);
+}
+
+unsigned getTotalNumVGPRs(const FeatureBitset &Features) {
+ return 256;
}
+unsigned getAddressableNumVGPRs(const FeatureBitset &Features) {
+ return getTotalNumVGPRs(Features);
+}
+
+unsigned getMinNumVGPRs(const FeatureBitset &Features, unsigned WavesPerEU) {
+ assert(WavesPerEU != 0);
+
+ if (WavesPerEU >= getMaxWavesPerEU(Features))
+ return 0;
+ unsigned MinNumVGPRs =
+ alignDown(getTotalNumVGPRs(Features) / (WavesPerEU + 1),
+ getVGPRAllocGranule(Features)) + 1;
+ return std::min(MinNumVGPRs, getAddressableNumVGPRs(Features));
+}
+
+unsigned getMaxNumVGPRs(const FeatureBitset &Features, unsigned WavesPerEU) {
+ assert(WavesPerEU != 0);
+
+ unsigned MaxNumVGPRs = alignDown(getTotalNumVGPRs(Features) / WavesPerEU,
+ getVGPRAllocGranule(Features));
+ unsigned AddressableNumVGPRs = getAddressableNumVGPRs(Features);
+ return std::min(MaxNumVGPRs, AddressableNumVGPRs);
+}
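+
+// Similarly for VGPRs (256 total, granule 4): WavesPerEU = 4 gives
+// getMaxNumVGPRs = alignDown(256 / 4, 4) = 64 and getMinNumVGPRs =
+// alignDown(256 / 5, 4) + 1 = 49, so 49..64 VGPRs sustain exactly 4 waves.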
+
+} // end namespace IsaInfo
+
void initDefaultAMDKernelCodeT(amd_kernel_code_t &Header,
const FeatureBitset &Features) {
-
- IsaVersion ISA = getIsaVersion(Features);
+ IsaInfo::IsaVersion ISA = IsaInfo::getIsaVersion(Features);
memset(&Header, 0, sizeof(Header));
Header.amd_kernel_code_version_major = 1;
- Header.amd_kernel_code_version_minor = 0;
+ Header.amd_kernel_code_version_minor = 1;
Header.amd_machine_kind = 1; // AMD_MACHINE_KIND_AMDGPU
Header.amd_machine_version_major = ISA.Major;
Header.amd_machine_version_minor = ISA.Minor;
@@ -127,6 +309,11 @@ void initDefaultAMDKernelCodeT(amd_kernel_code_t &Header,
Header.kernel_code_entry_byte_offset = sizeof(Header);
// wavefront_size is specified as a power of 2: 2^6 = 64 threads.
Header.wavefront_size = 6;
+
+ // If the code object does not support indirect functions, then the value must
+ // be 0xffffffff.
+ Header.call_convention = -1;
+
// These alignment values are specified in powers of two, so alignment =
// 2^n. The minimum alignment is 2^4 = 16.
Header.kernarg_segment_alignment = 4;
@@ -161,16 +348,16 @@ MCSection *getHSARodataReadonlyAgentSection(MCContext &Ctx) {
ELF::SHF_AMDGPU_HSA_AGENT);
}
-bool isGroupSegment(const GlobalValue *GV) {
- return GV->getType()->getAddressSpace() == AMDGPUAS::LOCAL_ADDRESS;
+bool isGroupSegment(const GlobalValue *GV, AMDGPUAS AS) {
+ return GV->getType()->getAddressSpace() == AS.LOCAL_ADDRESS;
}
-bool isGlobalSegment(const GlobalValue *GV) {
- return GV->getType()->getAddressSpace() == AMDGPUAS::GLOBAL_ADDRESS;
+bool isGlobalSegment(const GlobalValue *GV, AMDGPUAS AS) {
+ return GV->getType()->getAddressSpace() == AS.GLOBAL_ADDRESS;
}
-bool isReadOnlySegment(const GlobalValue *GV) {
- return GV->getType()->getAddressSpace() == AMDGPUAS::CONSTANT_ADDRESS;
+bool isReadOnlySegment(const GlobalValue *GV, AMDGPUAS AS) {
+ return GV->getType()->getAddressSpace() == AS.CONSTANT_ADDRESS;
}
bool shouldEmitConstantsToTextSection(const Triple &TT) {
@@ -208,7 +395,7 @@ std::pair<int, int> getIntegerPairAttribute(const Function &F,
return Default;
}
if (Strs.second.trim().getAsInteger(0, Ints.second)) {
- if (!OnlyFirstRequired || Strs.second.trim().size()) {
+ if (!OnlyFirstRequired || !Strs.second.trim().empty()) {
Ctx.emitError("can't parse second integer attribute " + Name);
return Default;
}
@@ -217,57 +404,84 @@ std::pair<int, int> getIntegerPairAttribute(const Function &F,
return Ints;
}
-unsigned getWaitcntBitMask(IsaVersion Version) {
- unsigned Vmcnt = getBitMask(getVmcntBitShift(), getVmcntBitWidth());
- unsigned Expcnt = getBitMask(getExpcntBitShift(), getExpcntBitWidth());
- unsigned Lgkmcnt = getBitMask(getLgkmcntBitShift(), getLgkmcntBitWidth());
- return Vmcnt | Expcnt | Lgkmcnt;
-}
+unsigned getVmcntBitMask(const IsaInfo::IsaVersion &Version) {
+ unsigned VmcntLo = (1 << getVmcntBitWidthLo()) - 1;
+ if (Version.Major < 9)
+ return VmcntLo;
-unsigned getVmcntBitMask(IsaVersion Version) {
- return (1 << getVmcntBitWidth()) - 1;
+ unsigned VmcntHi = ((1 << getVmcntBitWidthHi()) - 1) << getVmcntBitWidthLo();
+ return VmcntLo | VmcntHi;
}
-unsigned getExpcntBitMask(IsaVersion Version) {
+unsigned getExpcntBitMask(const IsaInfo::IsaVersion &Version) {
return (1 << getExpcntBitWidth()) - 1;
}
-unsigned getLgkmcntBitMask(IsaVersion Version) {
+unsigned getLgkmcntBitMask(const IsaInfo::IsaVersion &Version) {
return (1 << getLgkmcntBitWidth()) - 1;
}
-unsigned decodeVmcnt(IsaVersion Version, unsigned Waitcnt) {
- return unpackBits(Waitcnt, getVmcntBitShift(), getVmcntBitWidth());
+unsigned getWaitcntBitMask(const IsaInfo::IsaVersion &Version) {
+ unsigned VmcntLo = getBitMask(getVmcntBitShiftLo(), getVmcntBitWidthLo());
+ unsigned Expcnt = getBitMask(getExpcntBitShift(), getExpcntBitWidth());
+ unsigned Lgkmcnt = getBitMask(getLgkmcntBitShift(), getLgkmcntBitWidth());
+ unsigned Waitcnt = VmcntLo | Expcnt | Lgkmcnt;
+ if (Version.Major < 9)
+ return Waitcnt;
+
+ unsigned VmcntHi = getBitMask(getVmcntBitShiftHi(), getVmcntBitWidthHi());
+ return Waitcnt | VmcntHi;
+}
+
+unsigned decodeVmcnt(const IsaInfo::IsaVersion &Version, unsigned Waitcnt) {
+ unsigned VmcntLo =
+ unpackBits(Waitcnt, getVmcntBitShiftLo(), getVmcntBitWidthLo());
+ if (Version.Major < 9)
+ return VmcntLo;
+
+ unsigned VmcntHi =
+ unpackBits(Waitcnt, getVmcntBitShiftHi(), getVmcntBitWidthHi());
+ VmcntHi <<= getVmcntBitWidthLo();
+ return VmcntLo | VmcntHi;
}
-unsigned decodeExpcnt(IsaVersion Version, unsigned Waitcnt) {
+unsigned decodeExpcnt(const IsaInfo::IsaVersion &Version, unsigned Waitcnt) {
return unpackBits(Waitcnt, getExpcntBitShift(), getExpcntBitWidth());
}
-unsigned decodeLgkmcnt(IsaVersion Version, unsigned Waitcnt) {
+unsigned decodeLgkmcnt(const IsaInfo::IsaVersion &Version, unsigned Waitcnt) {
return unpackBits(Waitcnt, getLgkmcntBitShift(), getLgkmcntBitWidth());
}
-void decodeWaitcnt(IsaVersion Version, unsigned Waitcnt,
+void decodeWaitcnt(const IsaInfo::IsaVersion &Version, unsigned Waitcnt,
unsigned &Vmcnt, unsigned &Expcnt, unsigned &Lgkmcnt) {
Vmcnt = decodeVmcnt(Version, Waitcnt);
Expcnt = decodeExpcnt(Version, Waitcnt);
Lgkmcnt = decodeLgkmcnt(Version, Waitcnt);
}
-unsigned encodeVmcnt(IsaVersion Version, unsigned Waitcnt, unsigned Vmcnt) {
- return packBits(Vmcnt, Waitcnt, getVmcntBitShift(), getVmcntBitWidth());
+unsigned encodeVmcnt(const IsaInfo::IsaVersion &Version, unsigned Waitcnt,
+ unsigned Vmcnt) {
+ Waitcnt =
+ packBits(Vmcnt, Waitcnt, getVmcntBitShiftLo(), getVmcntBitWidthLo());
+ if (Version.Major < 9)
+ return Waitcnt;
+
+ Vmcnt >>= getVmcntBitWidthLo();
+ return packBits(Vmcnt, Waitcnt, getVmcntBitShiftHi(), getVmcntBitWidthHi());
}
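+
+// A round-trip sketch with illustrative values; sketchWaitcntRoundTrip is a
+// hypothetical check, not part of the API. Vmcnt = 37 (0b100101) lands as
+// Waitcnt[3:0] = 0b0101 and Waitcnt[15:14] = 0b10 on gfx9.
+void sketchWaitcntRoundTrip() {
+  IsaInfo::IsaVersion GFX9 = {9, 0, 0};
+  unsigned W = encodeVmcnt(GFX9, getWaitcntBitMask(GFX9), 37);
+  assert(decodeVmcnt(GFX9, W) == 37);
+}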
-unsigned encodeExpcnt(IsaVersion Version, unsigned Waitcnt, unsigned Expcnt) {
+unsigned encodeExpcnt(const IsaInfo::IsaVersion &Version, unsigned Waitcnt,
+ unsigned Expcnt) {
return packBits(Expcnt, Waitcnt, getExpcntBitShift(), getExpcntBitWidth());
}
-unsigned encodeLgkmcnt(IsaVersion Version, unsigned Waitcnt, unsigned Lgkmcnt) {
+unsigned encodeLgkmcnt(const IsaInfo::IsaVersion &Version, unsigned Waitcnt,
+ unsigned Lgkmcnt) {
return packBits(Lgkmcnt, Waitcnt, getLgkmcntBitShift(), getLgkmcntBitWidth());
}
-unsigned encodeWaitcnt(IsaVersion Version,
+unsigned encodeWaitcnt(const IsaInfo::IsaVersion &Version,
unsigned Vmcnt, unsigned Expcnt, unsigned Lgkmcnt) {
unsigned Waitcnt = getWaitcntBitMask(Version);
Waitcnt = encodeVmcnt(Version, Waitcnt, Vmcnt);
@@ -296,6 +510,10 @@ bool isCompute(CallingConv::ID cc) {
return !isShader(cc) || cc == CallingConv::AMDGPU_CS;
}
+bool isEntryFunctionCC(CallingConv::ID CC) {
+ return true;
+}
+
bool isSI(const MCSubtargetInfo &STI) {
return STI.getFeatureBits()[AMDGPU::FeatureSouthernIslands];
}
@@ -327,13 +545,34 @@ unsigned getMCReg(unsigned Reg, const MCSubtargetInfo &STI) {
return Reg;
}
+unsigned mc2PseudoReg(unsigned Reg) {
+ switch (Reg) {
+ case AMDGPU::FLAT_SCR_ci:
+ case AMDGPU::FLAT_SCR_vi:
+ return FLAT_SCR;
+
+ case AMDGPU::FLAT_SCR_LO_ci:
+ case AMDGPU::FLAT_SCR_LO_vi:
+ return AMDGPU::FLAT_SCR_LO;
+
+ case AMDGPU::FLAT_SCR_HI_ci:
+ case AMDGPU::FLAT_SCR_HI_vi:
+ return AMDGPU::FLAT_SCR_HI;
+
+ default:
+ return Reg;
+ }
+}
+
bool isSISrcOperand(const MCInstrDesc &Desc, unsigned OpNo) {
+ assert(OpNo < Desc.NumOperands);
unsigned OpType = Desc.OpInfo[OpNo].OperandType;
return OpType >= AMDGPU::OPERAND_SRC_FIRST &&
OpType <= AMDGPU::OPERAND_SRC_LAST;
}
bool isSISrcFPOperand(const MCInstrDesc &Desc, unsigned OpNo) {
+ assert(OpNo < Desc.NumOperands);
unsigned OpType = Desc.OpInfo[OpNo].OperandType;
switch (OpType) {
case AMDGPU::OPERAND_REG_IMM_FP32:
@@ -342,6 +581,7 @@ bool isSISrcFPOperand(const MCInstrDesc &Desc, unsigned OpNo) {
case AMDGPU::OPERAND_REG_INLINE_C_FP32:
case AMDGPU::OPERAND_REG_INLINE_C_FP64:
case AMDGPU::OPERAND_REG_INLINE_C_FP16:
+ case AMDGPU::OPERAND_REG_INLINE_C_V2FP16:
return true;
default:
return false;
@@ -349,6 +589,7 @@ bool isSISrcFPOperand(const MCInstrDesc &Desc, unsigned OpNo) {
}
bool isSISrcInlinableOperand(const MCInstrDesc &Desc, unsigned OpNo) {
+ assert(OpNo < Desc.NumOperands);
unsigned OpType = Desc.OpInfo[OpNo].OperandType;
return OpType >= AMDGPU::OPERAND_REG_INLINE_C_FIRST &&
OpType <= AMDGPU::OPERAND_REG_INLINE_C_LAST;
@@ -392,6 +633,7 @@ unsigned getRegBitWidth(const MCRegisterClass &RC) {
unsigned getRegOperandSize(const MCRegisterInfo *MRI, const MCInstrDesc &Desc,
unsigned OpNo) {
+ assert(OpNo < Desc.NumOperands);
unsigned RCID = Desc.OpInfo[OpNo].RegClass;
return getRegBitWidth(MRI->getRegClass(RCID)) / 8;
}
@@ -440,7 +682,8 @@ bool isInlinableLiteral32(int32_t Literal, bool HasInv2Pi) {
}
bool isInlinableLiteral16(int16_t Literal, bool HasInv2Pi) {
- assert(HasInv2Pi);
+ if (!HasInv2Pi)
+ return false;
if (Literal >= -16 && Literal <= 64)
return true;
@@ -457,5 +700,92 @@ bool isInlinableLiteral16(int16_t Literal, bool HasInv2Pi) {
Val == 0x3118; // 1/2pi
}
-} // End namespace AMDGPU
-} // End namespace llvm
+bool isInlinableLiteralV216(int32_t Literal, bool HasInv2Pi) {
+ assert(HasInv2Pi);
+
+ int16_t Lo16 = static_cast<int16_t>(Literal);
+ int16_t Hi16 = static_cast<int16_t>(Literal >> 16);
+ return Lo16 == Hi16 && isInlinableLiteral16(Lo16, HasInv2Pi);
+}
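+
+// For example: 0x3C003C00 packs half 1.0 in both 16-bit lanes, so
+// Lo16 == Hi16 == 0x3C00 and the v2f16 literal is inlinable; 0x3C004000
+// differs between lanes and is not.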
+
+bool isUniformMMO(const MachineMemOperand *MMO) {
+ const Value *Ptr = MMO->getValue();
+ // UndefValue means this is a load of a kernel input. These are uniform.
+ // Sometimes LDS instructions have constant pointers.
+  // If Ptr is null, this mem operand contains a PseudoSourceValue such as
+  // the GOT.
+ if (!Ptr || isa<UndefValue>(Ptr) || isa<Argument>(Ptr) ||
+ isa<Constant>(Ptr) || isa<GlobalValue>(Ptr))
+ return true;
+
+ const Instruction *I = dyn_cast<Instruction>(Ptr);
+ return I && I->getMetadata("amdgpu.uniform");
+}
+
+int64_t getSMRDEncodedOffset(const MCSubtargetInfo &ST, int64_t ByteOffset) {
+ if (isSI(ST) || isCI(ST))
+ return ByteOffset >> 2;
+
+ return ByteOffset;
+}
+
+bool isLegalSMRDImmOffset(const MCSubtargetInfo &ST, int64_t ByteOffset) {
+ int64_t EncodedOffset = getSMRDEncodedOffset(ST, ByteOffset);
+ return isSI(ST) || isCI(ST) ? isUInt<8>(EncodedOffset) :
+ isUInt<20>(EncodedOffset);
+}
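+
+// Worked example: on SI/CI the SMRD offset field holds a dword count, so a
+// byte offset of 1020 encodes as 1020 >> 2 = 255, the largest value that
+// still passes isUInt<8>; on VI the byte offset itself must pass isUInt<20>.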
+} // end namespace AMDGPU
+
+} // end namespace llvm
+
+const unsigned AMDGPUAS::MAX_COMMON_ADDRESS;
+const unsigned AMDGPUAS::GLOBAL_ADDRESS;
+const unsigned AMDGPUAS::LOCAL_ADDRESS;
+const unsigned AMDGPUAS::PARAM_D_ADDRESS;
+const unsigned AMDGPUAS::PARAM_I_ADDRESS;
+const unsigned AMDGPUAS::CONSTANT_BUFFER_0;
+const unsigned AMDGPUAS::CONSTANT_BUFFER_1;
+const unsigned AMDGPUAS::CONSTANT_BUFFER_2;
+const unsigned AMDGPUAS::CONSTANT_BUFFER_3;
+const unsigned AMDGPUAS::CONSTANT_BUFFER_4;
+const unsigned AMDGPUAS::CONSTANT_BUFFER_5;
+const unsigned AMDGPUAS::CONSTANT_BUFFER_6;
+const unsigned AMDGPUAS::CONSTANT_BUFFER_7;
+const unsigned AMDGPUAS::CONSTANT_BUFFER_8;
+const unsigned AMDGPUAS::CONSTANT_BUFFER_9;
+const unsigned AMDGPUAS::CONSTANT_BUFFER_10;
+const unsigned AMDGPUAS::CONSTANT_BUFFER_11;
+const unsigned AMDGPUAS::CONSTANT_BUFFER_12;
+const unsigned AMDGPUAS::CONSTANT_BUFFER_13;
+const unsigned AMDGPUAS::CONSTANT_BUFFER_14;
+const unsigned AMDGPUAS::CONSTANT_BUFFER_15;
+const unsigned AMDGPUAS::UNKNOWN_ADDRESS_SPACE;
+
+namespace llvm {
+namespace AMDGPU {
+
+AMDGPUAS getAMDGPUAS(Triple T) {
+ auto Env = T.getEnvironmentName();
+ AMDGPUAS AS;
+ if (Env == "amdgiz" || Env == "amdgizcl") {
+ AS.FLAT_ADDRESS = 0;
+ AS.PRIVATE_ADDRESS = 5;
+ AS.REGION_ADDRESS = 4;
+  } else {
+ AS.FLAT_ADDRESS = 4;
+ AS.PRIVATE_ADDRESS = 0;
+ AS.REGION_ADDRESS = 5;
+ }
+ return AS;
+}
+
+AMDGPUAS getAMDGPUAS(const TargetMachine &M) {
+ return getAMDGPUAS(M.getTargetTriple());
+}
+
+AMDGPUAS getAMDGPUAS(const Module &M) {
+ return getAMDGPUAS(Triple(M.getTargetTriple()));
+}
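+
+// Usage sketch (the triple string is an illustrative assumption): an
+// "amdgiz" environment, e.g. Triple("amdgcn--amdhsa-amdgiz"), yields
+// FLAT_ADDRESS == 0 and PRIVATE_ADDRESS == 5; any other environment yields
+// the legacy mapping with FLAT_ADDRESS == 4 and PRIVATE_ADDRESS == 0.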
+} // namespace AMDGPU
+} // namespace llvm
diff --git a/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h b/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h
index ea5fc366d205..d6c836eb748b 100644
--- a/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h
+++ b/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h
@@ -1,4 +1,4 @@
-//===-- AMDGPUBaseInfo.h - Top level definitions for AMDGPU -----*- C++ -*-===//
+//===- AMDGPUBaseInfo.h - Top level definitions for AMDGPU ------*- C++ -*-===//
//
// The LLVM Compiler Infrastructure
//
@@ -10,39 +10,143 @@
#ifndef LLVM_LIB_TARGET_AMDGPU_UTILS_AMDGPUBASEINFO_H
#define LLVM_LIB_TARGET_AMDGPU_UTILS_AMDGPUBASEINFO_H
+#include "AMDGPU.h"
#include "AMDKernelCodeT.h"
-#include "llvm/IR/CallingConv.h"
-
#include "SIDefines.h"
-
-#define GET_INSTRINFO_OPERAND_ENUM
-#include "AMDGPUGenInstrInfo.inc"
-#undef GET_INSTRINFO_OPERAND_ENUM
+#include "llvm/ADT/StringRef.h"
+#include "llvm/IR/CallingConv.h"
+#include "llvm/MC/MCInstrDesc.h"
+#include "llvm/Support/Compiler.h"
+#include "llvm/Support/ErrorHandling.h"
+#include <cstdint>
+#include <utility>
namespace llvm {
class FeatureBitset;
class Function;
class GlobalValue;
+class MachineMemOperand;
class MCContext;
-class MCInstrDesc;
class MCRegisterClass;
class MCRegisterInfo;
class MCSection;
class MCSubtargetInfo;
+class Triple;
namespace AMDGPU {
+namespace IsaInfo {
-LLVM_READONLY
-int16_t getNamedOperandIdx(uint16_t Opcode, uint16_t NamedIdx);
+enum {
+ // The closed Vulkan driver sets 96, which limits the wave count to 8 but
+ // doesn't spill SGPRs as much as when 80 is set.
+ FIXED_NUM_SGPRS_FOR_INIT_BUG = 96
+};
+/// \brief Instruction set architecture version.
struct IsaVersion {
unsigned Major;
unsigned Minor;
unsigned Stepping;
};
+/// \returns Isa version for given subtarget \p Features.
IsaVersion getIsaVersion(const FeatureBitset &Features);
+
+/// \returns Wavefront size for given subtarget \p Features.
+unsigned getWavefrontSize(const FeatureBitset &Features);
+
+/// \returns Local memory size in bytes for given subtarget \p Features.
+unsigned getLocalMemorySize(const FeatureBitset &Features);
+
+/// \returns Number of execution units per compute unit for given subtarget \p
+/// Features.
+unsigned getEUsPerCU(const FeatureBitset &Features);
+
+/// \returns Maximum number of work groups per compute unit for given subtarget
+/// \p Features and limited by given \p FlatWorkGroupSize.
+unsigned getMaxWorkGroupsPerCU(const FeatureBitset &Features,
+ unsigned FlatWorkGroupSize);
+
+/// \returns Maximum number of waves per compute unit for given subtarget \p
+/// Features without any kind of limitation.
+unsigned getMaxWavesPerCU(const FeatureBitset &Features);
+
+/// \returns Maximum number of waves per compute unit for given subtarget \p
+/// Features and limited by given \p FlatWorkGroupSize.
+unsigned getMaxWavesPerCU(const FeatureBitset &Features,
+ unsigned FlatWorkGroupSize);
+
+/// \returns Minimum number of waves per execution unit for given subtarget \p
+/// Features.
+unsigned getMinWavesPerEU(const FeatureBitset &Features);
+
+/// \returns Maximum number of waves per execution unit for given subtarget \p
+/// Features without any kind of limitation.
+unsigned getMaxWavesPerEU(const FeatureBitset &Features);
+
+/// \returns Maximum number of waves per execution unit for given subtarget \p
+/// Features and limited by given \p FlatWorkGroupSize.
+unsigned getMaxWavesPerEU(const FeatureBitset &Features,
+ unsigned FlatWorkGroupSize);
+
+/// \returns Minimum flat work group size for given subtarget \p Features.
+unsigned getMinFlatWorkGroupSize(const FeatureBitset &Features);
+
+/// \returns Maximum flat work group size for given subtarget \p Features.
+unsigned getMaxFlatWorkGroupSize(const FeatureBitset &Features);
+
+/// \returns Number of waves per work group for given subtarget \p Features and
+/// limited by given \p FlatWorkGroupSize.
+unsigned getWavesPerWorkGroup(const FeatureBitset &Features,
+ unsigned FlatWorkGroupSize);
+
+/// \returns SGPR allocation granularity for given subtarget \p Features.
+unsigned getSGPRAllocGranule(const FeatureBitset &Features);
+
+/// \returns SGPR encoding granularity for given subtarget \p Features.
+unsigned getSGPREncodingGranule(const FeatureBitset &Features);
+
+/// \returns Total number of SGPRs for given subtarget \p Features.
+unsigned getTotalNumSGPRs(const FeatureBitset &Features);
+
+/// \returns Addressable number of SGPRs for given subtarget \p Features.
+unsigned getAddressableNumSGPRs(const FeatureBitset &Features);
+
+/// \returns Minimum number of SGPRs that meets the given number of waves per
+/// execution unit requirement for given subtarget \p Features.
+unsigned getMinNumSGPRs(const FeatureBitset &Features, unsigned WavesPerEU);
+
+/// \returns Maximum number of SGPRs that meets the given number of waves per
+/// execution unit requirement for given subtarget \p Features.
+unsigned getMaxNumSGPRs(const FeatureBitset &Features, unsigned WavesPerEU,
+ bool Addressable);
+
+/// \returns VGPR allocation granularity for given subtarget \p Features.
+unsigned getVGPRAllocGranule(const FeatureBitset &Features);
+
+/// \returns VGPR encoding granularity for given subtarget \p Features.
+unsigned getVGPREncodingGranule(const FeatureBitset &Features);
+
+/// \returns Total number of VGPRs for given subtarget \p Features.
+unsigned getTotalNumVGPRs(const FeatureBitset &Features);
+
+/// \returns Addressable number of VGPRs for given subtarget \p Features.
+unsigned getAddressableNumVGPRs(const FeatureBitset &Features);
+
+/// \returns Minimum number of VGPRs that meets given number of waves per
+/// execution unit requirement for given subtarget \p Features.
+unsigned getMinNumVGPRs(const FeatureBitset &Features, unsigned WavesPerEU);
+
+/// \returns Maximum number of VGPRs that meets given number of waves per
+/// execution unit requirement for given subtarget \p Features.
+unsigned getMaxNumVGPRs(const FeatureBitset &Features, unsigned WavesPerEU);
+
+} // end namespace IsaInfo
+
+LLVM_READONLY
+int16_t getNamedOperandIdx(uint16_t Opcode, uint16_t NamedIdx);
+
void initDefaultAMDKernelCodeT(amd_kernel_code_t &Header,
const FeatureBitset &Features);
MCSection *getHSATextSection(MCContext &Ctx);
@@ -53,9 +157,9 @@ MCSection *getHSADataGlobalProgramSection(MCContext &Ctx);
MCSection *getHSARodataReadonlyAgentSection(MCContext &Ctx);
-bool isGroupSegment(const GlobalValue *GV);
-bool isGlobalSegment(const GlobalValue *GV);
-bool isReadOnlySegment(const GlobalValue *GV);
+bool isGroupSegment(const GlobalValue *GV, AMDGPUAS AS);
+bool isGlobalSegment(const GlobalValue *GV, AMDGPUAS AS);
+bool isReadOnlySegment(const GlobalValue *GV, AMDGPUAS AS);
/// \returns True if constants should be emitted to .text section for given
/// target triple \p TT, false otherwise.
@@ -83,64 +187,89 @@ std::pair<int, int> getIntegerPairAttribute(const Function &F,
std::pair<int, int> Default,
bool OnlyFirstRequired = false);
-/// \returns Waitcnt bit mask for given isa \p Version.
-unsigned getWaitcntBitMask(IsaVersion Version);
-
/// \returns Vmcnt bit mask for given isa \p Version.
-unsigned getVmcntBitMask(IsaVersion Version);
+unsigned getVmcntBitMask(const IsaInfo::IsaVersion &Version);
/// \returns Expcnt bit mask for given isa \p Version.
-unsigned getExpcntBitMask(IsaVersion Version);
+unsigned getExpcntBitMask(const IsaInfo::IsaVersion &Version);
/// \returns Lgkmcnt bit mask for given isa \p Version.
-unsigned getLgkmcntBitMask(IsaVersion Version);
+unsigned getLgkmcntBitMask(const IsaInfo::IsaVersion &Version);
+
+/// \returns Waitcnt bit mask for given isa \p Version.
+unsigned getWaitcntBitMask(const IsaInfo::IsaVersion &Version);
/// \returns Decoded Vmcnt from given \p Waitcnt for given isa \p Version.
-unsigned decodeVmcnt(IsaVersion Version, unsigned Waitcnt);
+unsigned decodeVmcnt(const IsaInfo::IsaVersion &Version, unsigned Waitcnt);
/// \returns Decoded Expcnt from given \p Waitcnt for given isa \p Version.
-unsigned decodeExpcnt(IsaVersion Version, unsigned Waitcnt);
+unsigned decodeExpcnt(const IsaInfo::IsaVersion &Version, unsigned Waitcnt);
/// \returns Decoded Lgkmcnt from given \p Waitcnt for given isa \p Version.
-unsigned decodeLgkmcnt(IsaVersion Version, unsigned Waitcnt);
+unsigned decodeLgkmcnt(const IsaInfo::IsaVersion &Version, unsigned Waitcnt);
/// \brief Decodes Vmcnt, Expcnt and Lgkmcnt from given \p Waitcnt for given isa
/// \p Version, and writes decoded values into \p Vmcnt, \p Expcnt and
/// \p Lgkmcnt respectively.
///
/// \details \p Vmcnt, \p Expcnt and \p Lgkmcnt are decoded as follows:
-/// \p Vmcnt = \p Waitcnt[3:0]
+/// \p Vmcnt = \p Waitcnt[3:0] (pre-gfx9 only)
+/// \p Vmcnt = \p Waitcnt[3:0] | (\p Waitcnt[15:14] << 4)  (gfx9+ only)
/// \p Expcnt = \p Waitcnt[6:4]
/// \p Lgkmcnt = \p Waitcnt[11:8]
-void decodeWaitcnt(IsaVersion Version, unsigned Waitcnt,
+void decodeWaitcnt(const IsaInfo::IsaVersion &Version, unsigned Waitcnt,
unsigned &Vmcnt, unsigned &Expcnt, unsigned &Lgkmcnt);
/// \returns \p Waitcnt with encoded \p Vmcnt for given isa \p Version.
-unsigned encodeVmcnt(IsaVersion Version, unsigned Waitcnt, unsigned Vmcnt);
+unsigned encodeVmcnt(const IsaInfo::IsaVersion &Version, unsigned Waitcnt,
+ unsigned Vmcnt);
/// \returns \p Waitcnt with encoded \p Expcnt for given isa \p Version.
-unsigned encodeExpcnt(IsaVersion Version, unsigned Waitcnt, unsigned Expcnt);
+unsigned encodeExpcnt(const IsaInfo::IsaVersion &Version, unsigned Waitcnt,
+ unsigned Expcnt);
/// \returns \p Waitcnt with encoded \p Lgkmcnt for given isa \p Version.
-unsigned encodeLgkmcnt(IsaVersion Version, unsigned Waitcnt, unsigned Lgkmcnt);
+unsigned encodeLgkmcnt(const IsaInfo::IsaVersion &Version, unsigned Waitcnt,
+ unsigned Lgkmcnt);
/// \brief Encodes \p Vmcnt, \p Expcnt and \p Lgkmcnt into Waitcnt for given isa
/// \p Version.
///
/// \details \p Vmcnt, \p Expcnt and \p Lgkmcnt are encoded as follows:
-/// Waitcnt[3:0] = \p Vmcnt
-/// Waitcnt[6:4] = \p Expcnt
-/// Waitcnt[11:8] = \p Lgkmcnt
+/// Waitcnt[3:0] = \p Vmcnt (pre-gfx9 only)
+/// Waitcnt[3:0] = \p Vmcnt[3:0] (gfx9+ only)
+/// Waitcnt[6:4] = \p Expcnt
+/// Waitcnt[11:8] = \p Lgkmcnt
+/// Waitcnt[15:14] = \p Vmcnt[5:4] (gfx9+ only)
///
/// \returns Waitcnt with encoded \p Vmcnt, \p Expcnt and \p Lgkmcnt for given
/// isa \p Version.
-unsigned encodeWaitcnt(IsaVersion Version,
+unsigned encodeWaitcnt(const IsaInfo::IsaVersion &Version,
unsigned Vmcnt, unsigned Expcnt, unsigned Lgkmcnt);
unsigned getInitialPSInputAddr(const Function &F);
-bool isShader(CallingConv::ID cc);
-bool isCompute(CallingConv::ID cc);
+LLVM_READNONE
+bool isShader(CallingConv::ID CC);
+
+LLVM_READNONE
+bool isCompute(CallingConv::ID CC);
+
+LLVM_READNONE
+bool isEntryFunctionCC(CallingConv::ID CC);
+
+// FIXME: Remove this when the calling conventions are cleaned up.
+LLVM_READNONE
+inline bool isKernel(CallingConv::ID CC) {
+ switch (CC) {
+ case CallingConv::C:
+ case CallingConv::AMDGPU_KERNEL:
+ case CallingConv::SPIR_KERNEL:
+ return true;
+ default:
+ return false;
+ }
+}
bool isSI(const MCSubtargetInfo &STI);
bool isCI(const MCSubtargetInfo &STI);
@@ -150,6 +279,10 @@ bool isVI(const MCSubtargetInfo &STI);
/// \p STI otherwise return \p Reg.
unsigned getMCReg(unsigned Reg, const MCSubtargetInfo &STI);
+/// \brief Convert hardware register \p Reg to a pseudo register
+LLVM_READNONE
+unsigned mc2PseudoReg(unsigned Reg);
+
/// \brief Can this operand also contain immediate values?
bool isSISrcOperand(const MCInstrDesc &Desc, unsigned OpNo);
@@ -188,6 +321,8 @@ inline unsigned getOperandSize(const MCOperandInfo &OpInfo) {
case AMDGPU::OPERAND_REG_IMM_FP16:
case AMDGPU::OPERAND_REG_INLINE_C_INT16:
case AMDGPU::OPERAND_REG_INLINE_C_FP16:
+ case AMDGPU::OPERAND_REG_INLINE_C_V2INT16:
+ case AMDGPU::OPERAND_REG_INLINE_C_V2FP16:
return 2;
default:
@@ -210,7 +345,21 @@ bool isInlinableLiteral32(int32_t Literal, bool HasInv2Pi);
LLVM_READNONE
bool isInlinableLiteral16(int16_t Literal, bool HasInv2Pi);
+LLVM_READNONE
+bool isInlinableLiteralV216(int32_t Literal, bool HasInv2Pi);
+
+bool isUniformMMO(const MachineMemOperand *MMO);
+
+/// \returns The encoding that will be used for \p ByteOffset in the SMRD
+/// offset field.
+int64_t getSMRDEncodedOffset(const MCSubtargetInfo &ST, int64_t ByteOffset);
+
+/// \returns true if this offset is small enough to fit in the SMRD
+/// offset field. \p ByteOffset should be the offset in bytes and
+/// not the encoded offset.
+bool isLegalSMRDImmOffset(const MCSubtargetInfo &ST, int64_t ByteOffset);
+
} // end namespace AMDGPU
} // end namespace llvm
-#endif
+#endif // LLVM_LIB_TARGET_AMDGPU_UTILS_AMDGPUBASEINFO_H
diff --git a/lib/Target/AMDGPU/Utils/AMDKernelCodeTInfo.h b/lib/Target/AMDGPU/Utils/AMDKernelCodeTInfo.h
index c55eaab077d1..991408c81c92 100644
--- a/lib/Target/AMDGPU/Utils/AMDKernelCodeTInfo.h
+++ b/lib/Target/AMDGPU/Utils/AMDKernelCodeTInfo.h
@@ -87,7 +87,7 @@ COMPPGM1(enable_ieee_mode, compute_pgm_rsrc1_ieee_mode, IEEE
// TODO: cdbg_user
COMPPGM2(enable_sgpr_private_segment_wave_byte_offset, compute_pgm_rsrc2_scratch_en, SCRATCH_EN),
COMPPGM2(user_sgpr_count, compute_pgm_rsrc2_user_sgpr, USER_SGPR),
-// TODO: enable_trap_handler
+COMPPGM2(enable_trap_handler, compute_pgm_rsrc2_trap_handler, TRAP_HANDLER),
COMPPGM2(enable_sgpr_workgroup_id_x, compute_pgm_rsrc2_tgid_x_en, TGID_X_EN),
COMPPGM2(enable_sgpr_workgroup_id_y, compute_pgm_rsrc2_tgid_y_en, TGID_Y_EN),
COMPPGM2(enable_sgpr_workgroup_id_z, compute_pgm_rsrc2_tgid_z_en, TGID_Z_EN),
diff --git a/lib/Target/AMDGPU/VOP1Instructions.td b/lib/Target/AMDGPU/VOP1Instructions.td
index 8cae83cd9d1a..1febc6bf8ec2 100644
--- a/lib/Target/AMDGPU/VOP1Instructions.td
+++ b/lib/Target/AMDGPU/VOP1Instructions.td
@@ -23,18 +23,18 @@ class VOP1e <bits<8> op, VOPProfile P> : Enc32 {
class VOP1_SDWAe <bits<8> op, VOPProfile P> : VOP_SDWAe <P> {
bits<8> vdst;
-
+
let Inst{8-0} = 0xf9; // sdwa
let Inst{16-9} = op;
let Inst{24-17} = !if(P.EmitDst, vdst{7-0}, 0);
let Inst{31-25} = 0x3f; // encoding
}
-class VOP1_Pseudo <string opName, VOPProfile P, list<dag> pattern=[]> :
+class VOP1_Pseudo <string opName, VOPProfile P, list<dag> pattern=[], bit VOP1Only = 0> :
InstSI <P.Outs32, P.Ins32, "", pattern>,
VOP <opName>,
- SIMCInstr <opName#"_e32", SIEncodingFamily.NONE>,
- MnemonicAlias<opName#"_e32", opName> {
+ SIMCInstr <!if(VOP1Only, opName, opName#"_e32"), SIEncodingFamily.NONE>,
+ MnemonicAlias<!if(VOP1Only, opName, opName#"_e32"), opName> {
let isPseudo = 1;
let isCodeGenOnly = 1;
@@ -75,6 +75,8 @@ class VOP1_Real <VOP1_Pseudo ps, int EncodingFamily> :
let Constraints = ps.Constraints;
let DisableEncoding = ps.DisableEncoding;
let TSFlags = ps.TSFlags;
+ let UseNamedOperandTable = ps.UseNamedOperandTable;
+ let Uses = ps.Uses;
}
class VOP1_SDWA_Pseudo <string OpName, VOPProfile P, list<dag> pattern=[]> :
@@ -83,10 +85,17 @@ class VOP1_SDWA_Pseudo <string OpName, VOPProfile P, list<dag> pattern=[]> :
}
class getVOP1Pat64 <SDPatternOperator node, VOPProfile P> : LetDummies {
- list<dag> ret = !if(P.HasModifiers,
- [(set P.DstVT:$vdst, (node (P.Src0VT (VOP3Mods0 P.Src0VT:$src0,
- i32:$src0_modifiers, i1:$clamp, i32:$omod))))],
- [(set P.DstVT:$vdst, (node P.Src0VT:$src0))]);
+ list<dag> ret =
+ !if(P.HasModifiers,
+ [(set P.DstVT:$vdst, (node (P.Src0VT (VOP3Mods0 P.Src0VT:$src0,
+ i32:$src0_modifiers,
+ i1:$clamp, i32:$omod))))],
+ !if(P.HasOMod,
+ [(set P.DstVT:$vdst, (node (P.Src0VT (VOP3OMods P.Src0VT:$src0,
+ i1:$clamp, i32:$omod))))],
+ [(set P.DstVT:$vdst, (node P.Src0VT:$src0))]
+ )
+ );
}
multiclass VOP1Inst <string opName, VOPProfile P,
@@ -96,6 +105,23 @@ multiclass VOP1Inst <string opName, VOPProfile P,
def _sdwa : VOP1_SDWA_Pseudo <opName, P>;
}
+// Special profile for instructions which have clamp
+// and output modifiers (but have no input modifiers)
+class VOPProfileI2F<ValueType dstVt, ValueType srcVt> :
+ VOPProfile<[dstVt, srcVt, untyped, untyped]> {
+
+ let Ins64 = (ins Src0RC64:$src0, clampmod:$clamp, omod:$omod);
+ let Asm64 = "$vdst, $src0$clamp$omod";
+
+ let HasModifiers = 0;
+ let HasClamp = 1;
+ let HasOMod = 1;
+}
+
+def VOP1_F64_I32 : VOPProfileI2F <f64, i32>;
+def VOP1_F32_I32 : VOPProfileI2F <f32, i32>;
+def VOP1_F16_I16 : VOPProfileI2F <f16, i16>;
+
//===----------------------------------------------------------------------===//
// VOP1 Instructions
//===----------------------------------------------------------------------===//
@@ -142,24 +168,24 @@ def V_READFIRSTLANE_B32 :
let SchedRW = [WriteQuarterRate32] in {
defm V_CVT_I32_F64 : VOP1Inst <"v_cvt_i32_f64", VOP_I32_F64, fp_to_sint>;
-defm V_CVT_F64_I32 : VOP1Inst <"v_cvt_f64_i32", VOP_F64_I32, sint_to_fp>;
-defm V_CVT_F32_I32 : VOP1Inst <"v_cvt_f32_i32", VOP_F32_I32, sint_to_fp>;
-defm V_CVT_F32_U32 : VOP1Inst <"v_cvt_f32_u32", VOP_F32_I32, uint_to_fp>;
+defm V_CVT_F64_I32 : VOP1Inst <"v_cvt_f64_i32", VOP1_F64_I32, sint_to_fp>;
+defm V_CVT_F32_I32 : VOP1Inst <"v_cvt_f32_i32", VOP1_F32_I32, sint_to_fp>;
+defm V_CVT_F32_U32 : VOP1Inst <"v_cvt_f32_u32", VOP1_F32_I32, uint_to_fp>;
defm V_CVT_U32_F32 : VOP1Inst <"v_cvt_u32_f32", VOP_I32_F32, fp_to_uint>;
defm V_CVT_I32_F32 : VOP1Inst <"v_cvt_i32_f32", VOP_I32_F32, fp_to_sint>;
-defm V_CVT_F16_F32 : VOP1Inst <"v_cvt_f16_f32", VOP_I32_F32, fp_to_f16>;
-defm V_CVT_F32_F16 : VOP1Inst <"v_cvt_f32_f16", VOP_F32_I32, f16_to_fp>;
+defm V_CVT_F16_F32 : VOP1Inst <"v_cvt_f16_f32", VOP_F16_F32, fpround>;
+defm V_CVT_F32_F16 : VOP1Inst <"v_cvt_f32_f16", VOP_F32_F16, fpextend>;
defm V_CVT_RPI_I32_F32 : VOP1Inst <"v_cvt_rpi_i32_f32", VOP_I32_F32, cvt_rpi_i32_f32>;
defm V_CVT_FLR_I32_F32 : VOP1Inst <"v_cvt_flr_i32_f32", VOP_I32_F32, cvt_flr_i32_f32>;
-defm V_CVT_OFF_F32_I4 : VOP1Inst <"v_cvt_off_f32_i4", VOP_F32_I32>;
+defm V_CVT_OFF_F32_I4 : VOP1Inst <"v_cvt_off_f32_i4", VOP1_F32_I32>;
defm V_CVT_F32_F64 : VOP1Inst <"v_cvt_f32_f64", VOP_F32_F64, fpround>;
defm V_CVT_F64_F32 : VOP1Inst <"v_cvt_f64_f32", VOP_F64_F32, fpextend>;
-defm V_CVT_F32_UBYTE0 : VOP1Inst <"v_cvt_f32_ubyte0", VOP_F32_I32, AMDGPUcvt_f32_ubyte0>;
-defm V_CVT_F32_UBYTE1 : VOP1Inst <"v_cvt_f32_ubyte1", VOP_F32_I32, AMDGPUcvt_f32_ubyte1>;
-defm V_CVT_F32_UBYTE2 : VOP1Inst <"v_cvt_f32_ubyte2", VOP_F32_I32, AMDGPUcvt_f32_ubyte2>;
-defm V_CVT_F32_UBYTE3 : VOP1Inst <"v_cvt_f32_ubyte3", VOP_F32_I32, AMDGPUcvt_f32_ubyte3>;
+defm V_CVT_F32_UBYTE0 : VOP1Inst <"v_cvt_f32_ubyte0", VOP1_F32_I32, AMDGPUcvt_f32_ubyte0>;
+defm V_CVT_F32_UBYTE1 : VOP1Inst <"v_cvt_f32_ubyte1", VOP1_F32_I32, AMDGPUcvt_f32_ubyte1>;
+defm V_CVT_F32_UBYTE2 : VOP1Inst <"v_cvt_f32_ubyte2", VOP1_F32_I32, AMDGPUcvt_f32_ubyte2>;
+defm V_CVT_F32_UBYTE3 : VOP1Inst <"v_cvt_f32_ubyte3", VOP1_F32_I32, AMDGPUcvt_f32_ubyte3>;
defm V_CVT_U32_F64 : VOP1Inst <"v_cvt_u32_f64", VOP_I32_F64, fp_to_uint>;
-defm V_CVT_F64_U32 : VOP1Inst <"v_cvt_f64_u32", VOP_F64_I32, uint_to_fp>;
+defm V_CVT_F64_U32 : VOP1Inst <"v_cvt_f64_u32", VOP1_F64_I32, uint_to_fp>;
} // End SchedRW = [WriteQuarterRate32]
defm V_FRACT_F32 : VOP1Inst <"v_fract_f32", VOP_F32_F32, AMDGPUfract>;
@@ -237,7 +263,7 @@ def VOP_MOVRELD : VOPProfile<[untyped, i32, untyped, untyped]> {
src0_sel:$src0_sel);
let Asm32 = getAsm32<1, 1>.ret;
- let Asm64 = getAsm64<1, 1, 0>.ret;
+ let Asm64 = getAsm64<1, 1, 0, 1>.ret;
let AsmDPP = getAsmDPP<1, 1, 0>.ret;
let AsmSDWA = getAsmSDWA<1, 1, 0>.ret;
@@ -258,11 +284,14 @@ defm V_MOVRELS_B32 : VOP1Inst <"v_movrels_b32", VOP_I32_VI32_NO_EXT>;
defm V_MOVRELSD_B32 : VOP1Inst <"v_movrelsd_b32", VOP_NO_EXT<VOP_I32_I32>>;
} // End Uses = [M0, EXEC]
+let SchedRW = [WriteQuarterRate32] in {
+defm V_MOV_FED_B32 : VOP1Inst <"v_mov_fed_b32", VOP_I32_I32>;
+}
+
// These instruction only exist on SI and CI
let SubtargetPredicate = isSICI in {
let SchedRW = [WriteQuarterRate32] in {
-defm V_MOV_FED_B32 : VOP1Inst <"v_mov_fed_b32", VOP_I32_I32>;
defm V_LOG_CLAMP_F32 : VOP1Inst <"v_log_clamp_f32", VOP_F32_F32, int_amdgcn_log_clamp>;
defm V_RCP_CLAMP_F32 : VOP1Inst <"v_rcp_clamp_f32", VOP_F32_F32>;
defm V_RCP_LEGACY_F32 : VOP1Inst <"v_rcp_legacy_f32", VOP_F32_F32, AMDGPUrcp_legacy>;
@@ -297,8 +326,8 @@ defm V_EXP_LEGACY_F32 : VOP1Inst <"v_exp_legacy_f32", VOP_F32_F32>;
let SubtargetPredicate = isVI in {
-defm V_CVT_F16_U16 : VOP1Inst <"v_cvt_f16_u16", VOP_F16_I16, uint_to_fp>;
-defm V_CVT_F16_I16 : VOP1Inst <"v_cvt_f16_i16", VOP_F16_I16, sint_to_fp>;
+defm V_CVT_F16_U16 : VOP1Inst <"v_cvt_f16_u16", VOP1_F16_I16, uint_to_fp>;
+defm V_CVT_F16_I16 : VOP1Inst <"v_cvt_f16_i16", VOP1_F16_I16, sint_to_fp>;
defm V_CVT_U16_F16 : VOP1Inst <"v_cvt_u16_f16", VOP_I16_F16, fp_to_uint>;
defm V_CVT_I16_F16 : VOP1Inst <"v_cvt_i16_f16", VOP_I16_F16, fp_to_sint>;
defm V_RCP_F16 : VOP1Inst <"v_rcp_f16", VOP_F16_F16, AMDGPUrcp>;
@@ -326,12 +355,31 @@ def : Pat<
>;
def : Pat<
- (i16 (fp_to_f16 f32:$src)),
+ (i16 (AMDGPUfp_to_f16 f32:$src)),
(V_CVT_F16_F32_e32 $src)
>;
}
+def VOP_SWAP_I32 : VOPProfile<[i32, i32, i32, untyped]> {
+ let Outs32 = (outs VGPR_32:$vdst, VGPR_32:$vdst1);
+ let Ins32 = (ins VGPR_32:$src0, VGPR_32:$src1);
+ let Outs64 = Outs32;
+ let Asm32 = " $vdst, $src0";
+ let Asm64 = "";
+ let Ins64 = (ins);
+}
+
+let SubtargetPredicate = isGFX9 in {
+ let Constraints = "$vdst = $src1, $vdst1 = $src0",
+ DisableEncoding="$vdst1,$src1",
+ SchedRW = [Write64Bit, Write64Bit] in {
+// Never VOP3. Takes as long as 2 v_mov_b32s
+def V_SWAP_B32 : VOP1_Pseudo <"v_swap_b32", VOP_SWAP_I32, [], 1>;
+}
+
+} // End SubtargetPredicate = isGFX9
+
//===----------------------------------------------------------------------===//
// Target
//===----------------------------------------------------------------------===//
@@ -453,6 +501,14 @@ class VOP1_DPP <bits<8> op, VOP1_Pseudo ps, VOPProfile P = ps.Pfl> :
let Inst{31-25} = 0x3f; //encoding
}
+multiclass VOP1Only_Real_vi <bits<10> op> {
+ let AssemblerPredicates = [isVI], DecoderNamespace = "VI" in {
+ def _vi :
+ VOP1_Real<!cast<VOP1_Pseudo>(NAME), SIEncodingFamily.VI>,
+ VOP1e<op{7-0}, !cast<VOP1_Pseudo>(NAME).Pfl>;
+ }
+}
+
multiclass VOP1_Real_vi <bits<10> op> {
let AssemblerPredicates = [isVI], DecoderNamespace = "VI" in {
def _e32_vi :
@@ -480,6 +536,7 @@ defm V_CVT_F32_I32 : VOP1_Real_vi <0x5>;
defm V_CVT_F32_U32 : VOP1_Real_vi <0x6>;
defm V_CVT_U32_F32 : VOP1_Real_vi <0x7>;
defm V_CVT_I32_F32 : VOP1_Real_vi <0x8>;
+defm V_MOV_FED_B32 : VOP1_Real_vi <0x9>;
defm V_CVT_F16_F32 : VOP1_Real_vi <0xa>;
defm V_CVT_F32_F16 : VOP1_Real_vi <0xb>;
defm V_CVT_RPI_I32_F32 : VOP1_Real_vi <0xc>;
@@ -547,7 +604,7 @@ defm V_RNDNE_F16 : VOP1_Real_vi <0x47>;
defm V_FRACT_F16 : VOP1_Real_vi <0x48>;
defm V_SIN_F16 : VOP1_Real_vi <0x49>;
defm V_COS_F16 : VOP1_Real_vi <0x4a>;
-
+defm V_SWAP_B32 : VOP1Only_Real_vi <0x51>;
// Copy of v_mov_b32 with $vdst as a use operand for use with VGPR
// indexing mode. vdst can't be treated as a def for codegen purposes,
diff --git a/lib/Target/AMDGPU/VOP2Instructions.td b/lib/Target/AMDGPU/VOP2Instructions.td
index 00e5ab3db0b7..2281f338ab45 100644
--- a/lib/Target/AMDGPU/VOP2Instructions.td
+++ b/lib/Target/AMDGPU/VOP2Instructions.td
@@ -40,7 +40,7 @@ class VOP2_MADKe <bits<6> op, VOPProfile P> : Enc64 {
class VOP2_SDWAe <bits<6> op, VOPProfile P> : VOP_SDWAe <P> {
bits<8> vdst;
bits<8> src1;
-
+
let Inst{8-0} = 0xf9; // sdwa
let Inst{16-9} = !if(P.HasSrc1, src1{7-0}, 0);
let Inst{24-17} = !if(P.EmitDst, vdst{7-0}, 0);
@@ -93,6 +93,8 @@ class VOP2_Real <VOP2_Pseudo ps, int EncodingFamily> :
let Constraints = ps.Constraints;
let DisableEncoding = ps.DisableEncoding;
let TSFlags = ps.TSFlags;
+ let UseNamedOperandTable = ps.UseNamedOperandTable;
+ let Uses = ps.Uses;
}
class VOP2_SDWA_Pseudo <string OpName, VOPProfile P, list<dag> pattern=[]> :
@@ -119,8 +121,7 @@ multiclass VOP2Inst <string opName,
def _e64 : VOP3_Pseudo <opName, P, getVOP2Pat64<node, P>.ret>,
Commutable_REV<revOp#"_e64", !eq(revOp, opName)>;
- def _sdwa : VOP2_SDWA_Pseudo <opName, P>,
- Commutable_REV<revOp#"_sdwa", !eq(revOp, opName)>;
+ def _sdwa : VOP2_SDWA_Pseudo <opName, P>;
}
// TODO: add SDWA pseudo instructions for VOP2bInst and VOP2eInst
@@ -134,10 +135,10 @@ multiclass VOP2bInst <string opName,
let Uses = !if(useSGPRInput, [VCC, EXEC], [EXEC]), Defs = [VCC] in {
def _e32 : VOP2_Pseudo <opName, P>,
Commutable_REV<revOp#"_e32", !eq(revOp, opName)>;
-
- def _sdwa : VOP2_SDWA_Pseudo <opName, P>,
- Commutable_REV<revOp#"_sdwa", !eq(revOp, opName)>;
+
+ def _sdwa : VOP2_SDWA_Pseudo <opName, P>;
}
+
def _e64 : VOP3_Pseudo <opName, P, getVOP2Pat64<node, P>.ret>,
Commutable_REV<revOp#"_e64", !eq(revOp, opName)>;
}
@@ -154,6 +155,7 @@ multiclass VOP2eInst <string opName,
def _e32 : VOP2_Pseudo <opName, P>,
Commutable_REV<revOp#"_e32", !eq(revOp, opName)>;
}
+
def _e64 : VOP3_Pseudo <opName, P, getVOP2Pat64<node, P>.ret>,
Commutable_REV<revOp#"_e64", !eq(revOp, opName)>;
}
@@ -179,10 +181,12 @@ class VOP_MADMK <ValueType vt> : VOPProfile <[vt, vt, vt, vt]> {
def VOP_MADMK_F16 : VOP_MADMK <f16>;
def VOP_MADMK_F32 : VOP_MADMK <f32>;
+// FIXME: Remove src2_modifiers. It isn't used, so it wastes memory and
+// processing time, but it makes it easier to convert to mad.
class VOP_MAC <ValueType vt> : VOPProfile <[vt, vt, vt, vt]> {
let Ins32 = (ins Src0RC32:$src0, Src1RC32:$src1, VGPR_32:$src2);
let Ins64 = getIns64<Src0RC64, Src1RC64, RegisterOperand<VGPR_32>, 3,
- HasModifiers, Src0Mod, Src1Mod, Src2Mod>.ret;
+ HasModifiers, HasOMod, Src0Mod, Src1Mod, Src2Mod>.ret;
let InsDPP = (ins Src0ModDPP:$src0_modifiers, Src0DPP:$src0,
Src1ModDPP:$src1_modifiers, Src1DPP:$src1,
VGPR_32:$src2, // stub argument
@@ -194,6 +198,7 @@ class VOP_MAC <ValueType vt> : VOPProfile <[vt, vt, vt, vt]> {
clampmod:$clamp, dst_sel:$dst_sel, dst_unused:$dst_unused,
src0_sel:$src0_sel, src1_sel:$src1_sel);
let Asm32 = getAsm32<1, 2, vt>.ret;
+ let Asm64 = getAsm64<1, 2, HasModifiers, HasOMod, vt>.ret;
let AsmDPP = getAsmDPP<1, 2, HasModifiers, vt>.ret;
let AsmSDWA = getAsmSDWA<1, 2, HasModifiers, vt>.ret;
let HasSrc2 = 0;
@@ -204,13 +209,13 @@ class VOP_MAC <ValueType vt> : VOPProfile <[vt, vt, vt, vt]> {
def VOP_MAC_F16 : VOP_MAC <f16> {
// FIXME: Move 'Asm64' definition to VOP_MAC, and use 'vt'. Currently it gives
// 'not a string initializer' error.
- let Asm64 = getAsm64<1, 2, HasModifiers, f16>.ret;
+ let Asm64 = getAsm64<1, 2, HasModifiers, HasOMod, f16>.ret;
}
def VOP_MAC_F32 : VOP_MAC <f32> {
// FIXME: Move 'Asm64' definition to VOP_MAC, and use 'vt'. Currently it gives
// 'not a string initializer' error.
- let Asm64 = getAsm64<1, 2, HasModifiers, f32>.ret;
+ let Asm64 = getAsm64<1, 2, HasModifiers, HasOMod, f32>.ret;
}
// Write out to vcc or arbitrary SGPR.
@@ -280,7 +285,7 @@ def VOP_READLANE : VOPProfile<[i32, i32, i32]> {
def VOP_WRITELANE : VOPProfile<[i32, i32, i32]> {
let Outs32 = (outs VGPR_32:$vdst);
let Outs64 = Outs32;
- let Ins32 = (ins SReg_32:$src0, SCSrc_b32:$src1);
+ let Ins32 = (ins SCSrc_b32:$src0, SCSrc_b32:$src1);
let Ins64 = Ins32;
let Asm32 = " $vdst, $src0, $src1";
let Asm64 = Asm32;
@@ -354,7 +359,7 @@ defm V_LDEXP_F32 : VOP2Inst <"v_ldexp_f32", VOP_F32_F32_I32, AMDGPUldexp>;
defm V_CVT_PKACCUM_U8_F32 : VOP2Inst <"v_cvt_pkaccum_u8_f32", VOP_I32_F32_I32>; // TODO: set "Uses = dst"
defm V_CVT_PKNORM_I16_F32 : VOP2Inst <"v_cvt_pknorm_i16_f32", VOP_I32_F32_F32>;
defm V_CVT_PKNORM_U16_F32 : VOP2Inst <"v_cvt_pknorm_u16_f32", VOP_I32_F32_F32>;
-defm V_CVT_PKRTZ_F16_F32 : VOP2Inst <"v_cvt_pkrtz_f16_f32", VOP_I32_F32_F32, int_SI_packf16>;
+defm V_CVT_PKRTZ_F16_F32 : VOP2Inst <"v_cvt_pkrtz_f16_f32", VOP_I32_F32_F32, AMDGPUpkrtz_f16_f32>;
defm V_CVT_PK_U16_U32 : VOP2Inst <"v_cvt_pk_u16_u32", VOP_I32_I32_I32>;
defm V_CVT_PK_I16_I32 : VOP2Inst <"v_cvt_pk_i16_i32", VOP_I32_I32_I32>;
@@ -494,6 +499,14 @@ def : Pat <
(V_CNDMASK_B32_e64 (i32 0), (i32 -1), $src)
>;
+// Undo sub x, c -> add x, -c canonicalization since c is more likely
+// an inline immediate than -c.
+// TODO: Also do for 64-bit.
+def : Pat<
+ (add i16:$src0, (i16 NegSubInlineConst16:$src1)),
+ (V_SUB_U16_e64 $src0, NegSubInlineConst16:$src1)
+>;
+
} // End Predicates = [isVI]
//===----------------------------------------------------------------------===//
@@ -566,7 +579,10 @@ defm V_SUBB_U32 : VOP2be_Real_e32e64_si <0x29>;
defm V_SUBBREV_U32 : VOP2be_Real_e32e64_si <0x2a>;
defm V_READLANE_B32 : VOP2_Real_si <0x01>;
+
+let InOperandList = (ins SSrc_b32:$src0, SCSrc_b32:$src1) in {
defm V_WRITELANE_B32 : VOP2_Real_si <0x02>;
+}
defm V_MAC_LEGACY_F32 : VOP2_Real_e32e64_si <0x6>;
defm V_MIN_LEGACY_F32 : VOP2_Real_e32e64_si <0xd>;
@@ -646,7 +662,7 @@ multiclass Base_VOP2_Real_e32e64_vi <bits<6> op> :
VOP2_Real_e64_vi<{0, 1, 0, 0, op{5-0}}>;
} // End AssemblerPredicates = [isVI], DecoderNamespace = "VI"
-
+
multiclass VOP2_SDWA_Real <bits<6> op> {
def _sdwa_vi :
VOP_SDWA_Real <!cast<VOP2_SDWA_Pseudo>(NAME#"_sdwa")>,
diff --git a/lib/Target/AMDGPU/VOP3Instructions.td b/lib/Target/AMDGPU/VOP3Instructions.td
index c2a4d4ba99b1..217a07488853 100644
--- a/lib/Target/AMDGPU/VOP3Instructions.td
+++ b/lib/Target/AMDGPU/VOP3Instructions.td
@@ -29,6 +29,26 @@ class getVOP3ModPat<VOPProfile P, SDPatternOperator node> {
ret1));
}
+class getVOP3PModPat<VOPProfile P, SDPatternOperator node> {
+ list<dag> ret3 = [(set P.DstVT:$vdst,
+ (node (P.Src0VT !if(P.HasClamp, (VOP3PMods0 P.Src0VT:$src0, i32:$src0_modifiers, i1:$clamp),
+ (VOP3PMods P.Src0VT:$src0, i32:$src0_modifiers))),
+ (P.Src1VT (VOP3PMods P.Src1VT:$src1, i32:$src1_modifiers)),
+ (P.Src2VT (VOP3PMods P.Src2VT:$src2, i32:$src2_modifiers))))];
+
+ list<dag> ret2 = [(set P.DstVT:$vdst,
+ (node !if(P.HasClamp, (P.Src0VT (VOP3PMods0 P.Src0VT:$src0, i32:$src0_modifiers, i1:$clamp)),
+ (P.Src0VT (VOP3PMods P.Src0VT:$src0, i32:$src0_modifiers))),
+ (P.Src1VT (VOP3PMods P.Src1VT:$src1, i32:$src1_modifiers))))];
+
+ list<dag> ret1 = [(set P.DstVT:$vdst,
+ (node (P.Src0VT (VOP3PMods0 P.Src0VT:$src0, i32:$src0_modifiers, i1:$clamp))))];
+
+ list<dag> ret = !if(!eq(P.NumSrcArgs, 3), ret3,
+ !if(!eq(P.NumSrcArgs, 2), ret2,
+ ret1));
+}
+
class getVOP3Pat<VOPProfile P, SDPatternOperator node> {
list<dag> ret3 = [(set P.DstVT:$vdst, (node P.Src0VT:$src0, P.Src1VT:$src1, P.Src2VT:$src2))];
list<dag> ret2 = [(set P.DstVT:$vdst, (node P.Src0VT:$src0, P.Src1VT:$src1))];
@@ -86,6 +106,14 @@ def VOP3b_F64_I1_F64_F64_F64 : VOP3b_Profile<f64> {
let DstRC = RegisterOperand<VReg_64>;
}
+def VOP3b_I64_I1_I32_I32_I64 : VOPProfile<[i64, i32, i32, i64]> {
+ // FIXME: Hack to stop printing _e64
+ let DstRC = RegisterOperand<VReg_64>;
+
+ let Outs64 = (outs DstRC:$vdst, SReg_64:$sdst);
+ let Asm64 = " $vdst, $sdst, $src0, $src1, $src2";
+}
+
//===----------------------------------------------------------------------===//
// VOP3 Instructions
//===----------------------------------------------------------------------===//
@@ -209,10 +237,8 @@ def V_QSAD_PK_U16_U8 : VOP3Inst <"v_qsad_pk_u16_u8", VOP3_Profile<VOP_I64_I64_I3
def V_MQSAD_U32_U8 : VOP3Inst <"v_mqsad_u32_u8", VOP3_Profile<VOP_V4I32_I64_I32_V4I32>, int_amdgcn_mqsad_u32_u8>;
let isCommutable = 1 in {
-def V_MAD_U64_U32 : VOP3Inst <"v_mad_u64_u32", VOP3_Profile<VOP_I64_I32_I32_I64>>;
-
-// XXX - Does this set VCC?
-def V_MAD_I64_I32 : VOP3Inst <"v_mad_i64_i32", VOP3_Profile<VOP_I64_I32_I32_I64>>;
+def V_MAD_U64_U32 : VOP3Inst <"v_mad_u64_u32", VOP3b_I64_I1_I32_I32_I64>;
+def V_MAD_I64_I32 : VOP3Inst <"v_mad_i64_i32", VOP3b_I64_I1_I32_I32_I64>;
} // End isCommutable = 1
} // End SubtargetPredicate = isCIVI
@@ -234,12 +260,14 @@ def V_MAD_I16 : VOP3Inst <"v_mad_i16", VOP3_Profile<VOP_I16_I16_I16_I16>>;
} // End isCommutable = 1
+def V_PERM_B32 : VOP3Inst <"v_perm_b32", VOP3_Profile<VOP_I32_I32_I32_I32>>;
+
} // End SubtargetPredicate = isVI
let Predicates = [isVI] in {
-multiclass Tenary_i16_Pats <SDPatternOperator op1, SDPatternOperator op2,
- Instruction inst, SDPatternOperator op3> {
+multiclass Ternary_i16_Pats <SDPatternOperator op1, SDPatternOperator op2,
+ Instruction inst, SDPatternOperator op3> {
def : Pat<
(op2 (op1 i16:$src0, i16:$src1), i16:$src2),
(inst i16:$src0, i16:$src1, i16:$src2)
@@ -258,11 +286,26 @@ def : Pat<
>;
}
-defm: Tenary_i16_Pats<mul, add, V_MAD_U16, zext>;
-defm: Tenary_i16_Pats<mul, add, V_MAD_I16, sext>;
+defm: Ternary_i16_Pats<mul, add, V_MAD_U16, zext>;
+defm: Ternary_i16_Pats<mul, add, V_MAD_I16, sext>;
} // End Predicates = [isVI]
+let SubtargetPredicate = isGFX9 in {
+def V_PACK_B32_F16 : VOP3Inst <"v_pack_b32_f16", VOP3_Profile<VOP_B32_F16_F16>>;
+def V_LSHL_ADD_U32 : VOP3Inst <"v_lshl_add_u32", VOP3_Profile<VOP_I32_I32_I32_I32>>;
+def V_ADD_LSHL_U32 : VOP3Inst <"v_add_lshl_u32", VOP3_Profile<VOP_I32_I32_I32_I32>>;
+def V_ADD3_U32 : VOP3Inst <"v_add3_u32", VOP3_Profile<VOP_I32_I32_I32_I32>>;
+def V_LSHL_OR_B32 : VOP3Inst <"v_lshl_or_b32", VOP3_Profile<VOP_I32_I32_I32_I32>>;
+def V_AND_OR_B32 : VOP3Inst <"v_and_or_b32", VOP3_Profile<VOP_I32_I32_I32_I32>>;
+def V_OR3_B32 : VOP3Inst <"v_or3_b32", VOP3_Profile<VOP_I32_I32_I32_I32>>;
+
+def V_XAD_U32 : VOP3Inst <"v_xad_u32", VOP3_Profile<VOP_I32_I32_I32_I32>>;
+def V_MED3_F16 : VOP3Inst <"v_med3_f16", VOP3_Profile<VOP_F16_F16_F16_F16>, AMDGPUfmed3>;
+def V_MED3_I16 : VOP3Inst <"v_med3_i16", VOP3_Profile<VOP_I16_I16_I16_I16>, AMDGPUsmed3>;
+def V_MED3_U16 : VOP3Inst <"v_med3_u16", VOP3_Profile<VOP_I16_I16_I16_I16>, AMDGPUumed3>;
+}
+
//===----------------------------------------------------------------------===//
// Target
@@ -351,11 +394,19 @@ multiclass VOP3_Real_ci<bits<9> op> {
}
}
+multiclass VOP3be_Real_ci<bits<9> op> {
+ def _ci : VOP3_Real<!cast<VOP3_Pseudo>(NAME), SIEncodingFamily.SI>,
+ VOP3be_si <op, !cast<VOP3_Pseudo>(NAME).Pfl> {
+ let AssemblerPredicates = [isCIOnly];
+ let DecoderNamespace = "CI";
+ }
+}
+
defm V_MQSAD_U16_U8 : VOP3_Real_ci <0x172>;
defm V_QSAD_PK_U16_U8 : VOP3_Real_ci <0x172>;
-defm V_MQSAD_U32_U8 : VOP3_Real_ci <0x174>;
-defm V_MAD_U64_U32 : VOP3_Real_ci <0x176>;
-defm V_MAD_I64_I32 : VOP3_Real_ci <0x177>;
+defm V_MQSAD_U32_U8 : VOP3_Real_ci <0x175>;
+defm V_MAD_U64_U32 : VOP3be_Real_ci <0x176>;
+defm V_MAD_I64_I32 : VOP3be_Real_ci <0x177>;
//===----------------------------------------------------------------------===//
// VI
@@ -376,8 +427,8 @@ multiclass VOP3be_Real_vi<bits<10> op> {
} // End AssemblerPredicates = [isVI], DecoderNamespace = "VI"
defm V_MQSAD_U16_U8 : VOP3_Real_vi <0x172>;
-defm V_MAD_U64_U32 : VOP3_Real_vi <0x176>;
-defm V_MAD_I64_I32 : VOP3_Real_vi <0x177>;
+defm V_MAD_U64_U32 : VOP3be_Real_vi <0x1E8>;
+defm V_MAD_I64_I32 : VOP3be_Real_vi <0x1E9>;
defm V_MAD_LEGACY_F32 : VOP3_Real_vi <0x1c0>;
defm V_MAD_F32 : VOP3_Real_vi <0x1c1>;
@@ -424,6 +475,8 @@ defm V_MAD_F16 : VOP3_Real_vi <0x1ea>;
defm V_MAD_U16 : VOP3_Real_vi <0x1eb>;
defm V_MAD_I16 : VOP3_Real_vi <0x1ec>;
+defm V_PERM_B32 : VOP3_Real_vi <0x1ed>;
+
defm V_FMA_F16 : VOP3_Real_vi <0x1ee>;
defm V_DIV_FIXUP_F16 : VOP3_Real_vi <0x1ef>;
@@ -449,3 +502,16 @@ defm V_LSHLREV_B64 : VOP3_Real_vi <0x28f>;
defm V_LSHRREV_B64 : VOP3_Real_vi <0x290>;
defm V_ASHRREV_I64 : VOP3_Real_vi <0x291>;
defm V_TRIG_PREOP_F64 : VOP3_Real_vi <0x292>;
+
+defm V_LSHL_ADD_U32 : VOP3_Real_vi <0x1fd>;
+defm V_ADD_LSHL_U32 : VOP3_Real_vi <0x1fe>;
+defm V_ADD3_U32 : VOP3_Real_vi <0x1ff>;
+defm V_LSHL_OR_B32 : VOP3_Real_vi <0x200>;
+defm V_AND_OR_B32 : VOP3_Real_vi <0x201>;
+defm V_OR3_B32 : VOP3_Real_vi <0x202>;
+defm V_PACK_B32_F16 : VOP3_Real_vi <0x2a0>;
+
+defm V_XAD_U32 : VOP3_Real_vi <0x1f3>;
+defm V_MED3_F16 : VOP3_Real_vi <0x1fa>;
+defm V_MED3_I16 : VOP3_Real_vi <0x1fb>;
+defm V_MED3_U16 : VOP3_Real_vi <0x1fc>;
diff --git a/lib/Target/AMDGPU/VOP3PInstructions.td b/lib/Target/AMDGPU/VOP3PInstructions.td
new file mode 100644
index 000000000000..96d343099132
--- /dev/null
+++ b/lib/Target/AMDGPU/VOP3PInstructions.td
@@ -0,0 +1,82 @@
+//===-- VOP3PInstructions.td - Vector Instruction Definitions -------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+//===----------------------------------------------------------------------===//
+// VOP3P Classes
+//===----------------------------------------------------------------------===//
+
+class VOP3PInst<string OpName, VOPProfile P, SDPatternOperator node = null_frag> :
+ VOP3P_Pseudo<OpName, P,
+ !if(P.HasModifiers, getVOP3PModPat<P, node>.ret, getVOP3Pat<P, node>.ret)
+>;
+
+// Non-packed instructions that use the VOP3P encoding, i.e. where
+// omod/abs are used.
+class VOP3_VOP3PInst<string OpName, VOPProfile P, SDPatternOperator node = null_frag> :
+ VOP3P_Pseudo<OpName, P,
+ !if(P.HasModifiers, getVOP3ModPat<P, node>.ret, getVOP3Pat<P, node>.ret)
+>;
+
+let isCommutable = 1 in {
+def V_PK_FMA_F16 : VOP3PInst<"v_pk_fma_f16", VOP3_Profile<VOP_V2F16_V2F16_V2F16_V2F16>, fma>;
+def V_PK_ADD_F16 : VOP3PInst<"v_pk_add_f16", VOP3_Profile<VOP_V2F16_V2F16_V2F16>, fadd>;
+def V_PK_MUL_F16 : VOP3PInst<"v_pk_mul_f16", VOP3_Profile<VOP_V2F16_V2F16_V2F16>, fmul>;
+def V_PK_MAX_F16 : VOP3PInst<"v_pk_max_f16", VOP3_Profile<VOP_V2F16_V2F16_V2F16>, fmaxnum>;
+def V_PK_MIN_F16 : VOP3PInst<"v_pk_min_f16", VOP3_Profile<VOP_V2F16_V2F16_V2F16>, fminnum>;
+
+def V_PK_ADD_U16 : VOP3PInst<"v_pk_add_u16", VOP3_Profile<VOP_V2I16_V2I16_V2I16>, add>;
+def V_PK_ADD_I16 : VOP3PInst<"v_pk_add_i16", VOP3_Profile<VOP_V2I16_V2I16_V2I16>>;
+def V_PK_SUB_I16 : VOP3PInst<"v_pk_sub_i16", VOP3_Profile<VOP_V2I16_V2I16_V2I16>, sub>;
+def V_PK_MUL_LO_U16 : VOP3PInst<"v_pk_mul_lo_u16", VOP3_Profile<VOP_V2I16_V2I16_V2I16>, mul>;
+
+def V_PK_MIN_I16 : VOP3PInst<"v_pk_min_i16", VOP3_Profile<VOP_V2I16_V2I16_V2I16>, smin>;
+def V_PK_MIN_U16 : VOP3PInst<"v_pk_min_u16", VOP3_Profile<VOP_V2I16_V2I16_V2I16>, umin>;
+def V_PK_MAX_I16 : VOP3PInst<"v_pk_max_i16", VOP3_Profile<VOP_V2I16_V2I16_V2I16>, smax>;
+def V_PK_MAX_U16 : VOP3PInst<"v_pk_max_u16", VOP3_Profile<VOP_V2I16_V2I16_V2I16>, umax>;
+}
+
+def V_PK_LSHLREV_B16 : VOP3PInst<"v_pk_lshlrev_b16", VOP3_Profile<VOP_V2I16_V2I16_V2I16>, lshl_rev>;
+def V_PK_ASHRREV_I16 : VOP3PInst<"v_pk_ashrrev_i16", VOP3_Profile<VOP_V2I16_V2I16_V2I16>, ashr_rev>;
+def V_PK_LSHRREV_B16 : VOP3PInst<"v_pk_lshrrev_b16", VOP3_Profile<VOP_V2I16_V2I16_V2I16>, lshr_rev>;
+
+// XXX - Commutable?
+def V_MAD_MIX_F32 : VOP3_VOP3PInst<"v_mad_mix_f32", VOP3_Profile<VOP_F32_F32_F32_F32>>;
+def V_MAD_MIXLO_F16 : VOP3_VOP3PInst<"v_mad_mixlo_f16", VOP3_Profile<VOP_F16_F16_F16_F16>>;
+def V_MAD_MIXHI_F16 : VOP3_VOP3PInst<"v_mad_mixhi_f16", VOP3_Profile<VOP_F16_F16_F16_F16>>;
+
+
+multiclass VOP3P_Real_vi<bits<10> op> {
+ def _vi : VOP3P_Real<!cast<VOP3P_Pseudo>(NAME), SIEncodingFamily.VI>,
+ VOP3Pe <op, !cast<VOP3P_Pseudo>(NAME).Pfl> {
+ let AssemblerPredicates = [HasVOP3PInsts];
+ let DecoderNamespace = "VI";
+ }
+}
+
+defm V_PK_MUL_LO_U16 : VOP3P_Real_vi <0x381>;
+defm V_PK_ADD_I16 : VOP3P_Real_vi <0x382>;
+defm V_PK_SUB_I16 : VOP3P_Real_vi <0x383>;
+defm V_PK_LSHLREV_B16 : VOP3P_Real_vi <0x384>;
+defm V_PK_LSHRREV_B16 : VOP3P_Real_vi <0x385>;
+defm V_PK_ASHRREV_I16 : VOP3P_Real_vi <0x386>;
+defm V_PK_MAX_I16 : VOP3P_Real_vi <0x387>;
+defm V_PK_MIN_I16 : VOP3P_Real_vi <0x388>;
+
+defm V_PK_ADD_U16 : VOP3P_Real_vi <0x38a>;
+defm V_PK_MAX_U16 : VOP3P_Real_vi <0x38c>;
+defm V_PK_MIN_U16 : VOP3P_Real_vi <0x38d>;
+defm V_PK_FMA_F16 : VOP3P_Real_vi <0x38e>;
+defm V_PK_ADD_F16 : VOP3P_Real_vi <0x38f>;
+defm V_PK_MUL_F16 : VOP3P_Real_vi <0x390>;
+defm V_PK_MIN_F16 : VOP3P_Real_vi <0x391>;
+defm V_PK_MAX_F16 : VOP3P_Real_vi <0x392>;
+
+defm V_MAD_MIX_F32 : VOP3P_Real_vi <0x3a0>;
+defm V_MAD_MIXLO_F16 : VOP3P_Real_vi <0x3a1>;
+defm V_MAD_MIXHI_F16 : VOP3P_Real_vi <0x3a2>;
diff --git a/lib/Target/AMDGPU/VOPCInstructions.td b/lib/Target/AMDGPU/VOPCInstructions.td
index 16a456da3c67..a3550a63677b 100644
--- a/lib/Target/AMDGPU/VOPCInstructions.td
+++ b/lib/Target/AMDGPU/VOPCInstructions.td
@@ -93,6 +93,8 @@ class VOPC_Real <VOPC_Pseudo ps, int EncodingFamily> :
let Constraints = ps.Constraints;
let DisableEncoding = ps.DisableEncoding;
let TSFlags = ps.TSFlags;
+ let UseNamedOperandTable = ps.UseNamedOperandTable;
+ let Uses = ps.Uses;
}
class VOPC_SDWA_Pseudo <string OpName, VOPProfile P, list<dag> pattern=[]> :
@@ -165,13 +167,11 @@ multiclass VOPC_Pseudos <string opName,
let isCommutable = 1;
}
- def _sdwa : VOPC_SDWA_Pseudo <opName, P>,
- Commutable_REV<revOp#"_sdwa", !eq(revOp, opName)> {
+ def _sdwa : VOPC_SDWA_Pseudo <opName, P> {
let Defs = !if(DefExec, [VCC, EXEC], [VCC]);
let SchedRW = P.Schedule;
let isConvergent = DefExec;
let isCompare = 1;
- let isCommutable = 1;
}
}
@@ -563,7 +563,7 @@ multiclass VOPC_CLASS_F16 <string opName> :
VOPC_Class_Pseudos <opName, VOPC_I1_F16_I32, 0>;
multiclass VOPCX_CLASS_F16 <string opName> :
- VOPC_Class_Pseudos <opName, VOPC_I1_F32_I32, 1>;
+ VOPC_Class_Pseudos <opName, VOPC_I1_F16_I32, 1>;
multiclass VOPC_CLASS_F32 <string opName> :
VOPC_Class_Pseudos <opName, VOPC_I1_F32_I32, 0>;
diff --git a/lib/Target/AMDGPU/VOPInstructions.td b/lib/Target/AMDGPU/VOPInstructions.td
index 5f72f97d9e28..69906c419db3 100644
--- a/lib/Target/AMDGPU/VOPInstructions.td
+++ b/lib/Target/AMDGPU/VOPInstructions.td
@@ -68,8 +68,9 @@ class VOP3Common <dag outs, dag ins, string asm = "",
let hasPostISelHook = 1;
}
-class VOP3_Pseudo <string opName, VOPProfile P, list<dag> pattern=[], bit VOP3Only = 0> :
- InstSI <P.Outs64, P.Ins64, "", pattern>,
+class VOP3_Pseudo <string opName, VOPProfile P, list<dag> pattern = [],
+ bit VOP3Only = 0, bit isVOP3P = 0> :
+ InstSI <P.Outs64, !if(!and(isVOP3P, P.IsPacked), P.InsVOP3P, P.Ins64), "", pattern>,
VOP <opName>,
SIMCInstr<opName#"_e64", SIEncodingFamily.NONE>,
MnemonicAlias<opName#"_e64", opName> {
@@ -79,7 +80,7 @@ class VOP3_Pseudo <string opName, VOPProfile P, list<dag> pattern=[], bit VOP3On
let UseNamedOperandTable = 1;
string Mnemonic = opName;
- string AsmOperands = P.Asm64;
+ string AsmOperands = !if(!and(isVOP3P, P.IsPacked), P.AsmVOP3P, P.Asm64);
let Size = 8;
let mayLoad = 0;
@@ -100,23 +101,34 @@ class VOP3_Pseudo <string opName, VOPProfile P, list<dag> pattern=[], bit VOP3On
let VOP3 = 1;
let VALU = 1;
+ let FPClamp = P.HasFPClamp;
let Uses = [EXEC];
let AsmVariantName = AMDGPUAsmVariants.VOP3;
let AsmMatchConverter =
!if(!eq(VOP3Only,1),
- "cvtVOP3",
- !if(!eq(P.HasModifiers, 1), "cvtVOP3_2_mod", ""));
+ !if(!and(P.IsPacked, isVOP3P), "cvtVOP3P", "cvtVOP3"),
+ !if(!eq(P.HasModifiers, 1),
+ "cvtVOP3_2_mod",
+ !if(!eq(P.HasOMod, 1), "cvtVOP3OMod", "")
+ )
+ );
VOPProfile Pfl = P;
}
+class VOP3P_Pseudo <string opName, VOPProfile P, list<dag> pattern = []> :
+ VOP3_Pseudo<opName, P, pattern, 1, 1> {
+ let VOP3P = 1;
+}
+
class VOP3_Real <VOP3_Pseudo ps, int EncodingFamily> :
InstSI <ps.OutOperandList, ps.InOperandList, ps.Mnemonic # ps.AsmOperands, []>,
SIMCInstr <ps.PseudoInstr, EncodingFamily> {
let isPseudo = 0;
let isCodeGenOnly = 0;
+ let UseNamedOperandTable = 1;
let Constraints = ps.Constraints;
let DisableEncoding = ps.DisableEncoding;
@@ -128,8 +140,15 @@ class VOP3_Real <VOP3_Pseudo ps, int EncodingFamily> :
let Constraints = ps.Constraints;
let DisableEncoding = ps.DisableEncoding;
let TSFlags = ps.TSFlags;
+ let UseNamedOperandTable = ps.UseNamedOperandTable;
+ let Uses = ps.Uses;
}
+// XXX - Is there any reason to distinguish this from regular VOP3
+// here?
+class VOP3P_Real<VOP3P_Pseudo ps, int EncodingFamily> :
+ VOP3_Real<ps, EncodingFamily>;
+
class VOP3a<VOPProfile P> : Enc64 {
bits<2> src0_modifiers;
bits<9> src0;
@@ -197,6 +216,42 @@ class VOP3be <VOPProfile P> : Enc64 {
let Inst{63} = !if(P.HasSrc2Mods, src2_modifiers{0}, 0);
}
+class VOP3Pe <bits<10> op, VOPProfile P> : Enc64 {
+ bits<8> vdst;
+ // neg, neg_hi, op_sel put in srcN_modifiers
+ bits<4> src0_modifiers;
+ bits<9> src0;
+ bits<4> src1_modifiers;
+ bits<9> src1;
+ bits<4> src2_modifiers;
+ bits<9> src2;
+ bits<1> clamp;
+
+ let Inst{7-0} = vdst;
+ let Inst{8} = !if(P.HasSrc0Mods, src0_modifiers{1}, 0); // neg_hi src0
+ let Inst{9} = !if(P.HasSrc1Mods, src1_modifiers{1}, 0); // neg_hi src1
+ let Inst{10} = !if(P.HasSrc2Mods, src2_modifiers{1}, 0); // neg_hi src2
+
+ let Inst{11} = !if(P.HasOpSel, src0_modifiers{2}, 0); // op_sel(0)
+ let Inst{12} = !if(P.HasOpSel, src1_modifiers{2}, 0); // op_sel(1)
+ let Inst{13} = !if(P.HasOpSel, src2_modifiers{2}, 0); // op_sel(2)
+
+ let Inst{14} = !if(P.HasOpSel, src2_modifiers{3}, 0); // op_sel_hi(2)
+
+ let Inst{15} = !if(P.HasClamp, clamp{0}, 0);
+
+ let Inst{25-16} = op;
+  let Inst{31-26} = 0x34; // encoding
+ let Inst{40-32} = !if(P.HasSrc0, src0, 0);
+ let Inst{49-41} = !if(P.HasSrc1, src1, 0);
+ let Inst{58-50} = !if(P.HasSrc2, src2, 0);
+ let Inst{59} = !if(P.HasOpSel, src0_modifiers{3}, 0); // op_sel_hi(0)
+ let Inst{60} = !if(P.HasOpSel, src1_modifiers{3}, 0); // op_sel_hi(1)
+ let Inst{61} = !if(P.HasSrc0Mods, src0_modifiers{0}, 0); // neg (lo)
+ let Inst{62} = !if(P.HasSrc1Mods, src1_modifiers{0}, 0); // neg (lo)
+ let Inst{63} = !if(P.HasSrc2Mods, src2_modifiers{0}, 0); // neg (lo)
+}
+
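For reference, the 64-bit word VOP3Pe describes can be packed as below; a minimal C++ sketch derived directly from the Inst{} assignments above. encodeVOP3P is a made-up helper, not an LLVM API, and the modifier bits (neg, neg_hi, op_sel, op_sel_hi) are left at zero for brevity.

    #include <cstdint>
    #include <cstdio>

    // Packs a VOP3P instruction word per the VOP3Pe field layout above.
    uint64_t encodeVOP3P(uint16_t op, uint8_t vdst, uint16_t src0,
                         uint16_t src1, uint16_t src2, bool clamp) {
      uint64_t Inst = 0;
      Inst |= uint64_t(vdst);                  // Inst{7-0}   vdst
      Inst |= uint64_t(clamp) << 15;           // Inst{15}    clamp
      Inst |= uint64_t(op & 0x3ff) << 16;      // Inst{25-16} opcode
      Inst |= uint64_t(0x34) << 26;            // Inst{31-26} VOP3P encoding
      Inst |= uint64_t(src0 & 0x1ff) << 32;    // Inst{40-32} src0
      Inst |= uint64_t(src1 & 0x1ff) << 41;    // Inst{49-41} src1
      Inst |= uint64_t(src2 & 0x1ff) << 50;    // Inst{58-50} src2
      return Inst;
    }

    int main() {
      // Arbitrary operand fields; opcode 0x38f is v_pk_add_f16 in the
      // VOP3P_Real_vi list earlier in VOP3PInstructions.td.
      std::printf("%016llx\n",
          (unsigned long long)encodeVOP3P(0x38f, 1, 0x100, 0x101, 0, false));
    }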
class VOP3be_si <bits<9> op, VOPProfile P> : VOP3be<P> {
let Inst{25-17} = op;
}
@@ -250,7 +305,7 @@ class VOP_SDWA_Pseudo <string opName, VOPProfile P, list<dag> pattern=[]> :
VOP <opName>,
SIMCInstr <opName#"_sdwa", SIEncodingFamily.NONE>,
MnemonicAlias <opName#"_sdwa", opName> {
-
+
let isPseudo = 1;
let isCodeGenOnly = 1;
let UseNamedOperandTable = 1;
@@ -261,14 +316,14 @@ class VOP_SDWA_Pseudo <string opName, VOPProfile P, list<dag> pattern=[]> :
let Size = 8;
let mayLoad = 0;
let mayStore = 0;
- let hasSideEffects = 0;
+ let hasSideEffects = 0;
let VALU = 1;
let SDWA = 1;
let Uses = [EXEC];
-
- let SubtargetPredicate = isVI;
- let AssemblerPredicate = !if(P.HasExt, isVI, DisableInst);
+
+ let SubtargetPredicate = !if(P.HasExt, HasSDWA, DisableInst);
+ let AssemblerPredicate = !if(P.HasExt, HasSDWA, DisableInst);
let AsmVariantName = !if(P.HasExt, AMDGPUAsmVariants.SDWA,
AMDGPUAsmVariants.Disable);
let DecoderNamespace = "SDWA";
@@ -337,8 +392,8 @@ class VOP_DPP <string OpName, VOPProfile P> :
let Size = 8;
let AsmMatchConverter = !if(!eq(P.HasModifiers,1), "cvtDPP", "");
- let SubtargetPredicate = isVI;
- let AssemblerPredicate = !if(P.HasExt, isVI, DisableInst);
+ let SubtargetPredicate = HasDPP;
+ let AssemblerPredicate = !if(P.HasExt, HasDPP, DisableInst);
let AsmVariantName = !if(P.HasExt, AMDGPUAsmVariants.DPP,
AMDGPUAsmVariants.Disable);
let DecoderNamespace = "DPP";
@@ -348,3 +403,4 @@ include "VOPCInstructions.td"
include "VOP1Instructions.td"
include "VOP2Instructions.td"
include "VOP3Instructions.td"
+include "VOP3PInstructions.td"
diff --git a/lib/Target/ARM/A15SDOptimizer.cpp b/lib/Target/ARM/A15SDOptimizer.cpp
index 89859ba063d9..8640c873f441 100644
--- a/lib/Target/ARM/A15SDOptimizer.cpp
+++ b/lib/Target/ARM/A15SDOptimizer.cpp
@@ -427,13 +427,11 @@ unsigned A15SDOptimizer::createDupLane(MachineBasicBlock &MBB,
unsigned Lane, bool QPR) {
unsigned Out = MRI->createVirtualRegister(QPR ? &ARM::QPRRegClass :
&ARM::DPRRegClass);
- AddDefaultPred(BuildMI(MBB,
- InsertBefore,
- DL,
- TII->get(QPR ? ARM::VDUPLN32q : ARM::VDUPLN32d),
- Out)
- .addReg(Reg)
- .addImm(Lane));
+ BuildMI(MBB, InsertBefore, DL,
+ TII->get(QPR ? ARM::VDUPLN32q : ARM::VDUPLN32d), Out)
+ .addReg(Reg)
+ .addImm(Lane)
+ .add(predOps(ARMCC::AL));
return Out;
}
@@ -476,13 +474,11 @@ unsigned A15SDOptimizer::createVExt(MachineBasicBlock &MBB,
const DebugLoc &DL, unsigned Ssub0,
unsigned Ssub1) {
unsigned Out = MRI->createVirtualRegister(&ARM::DPRRegClass);
- AddDefaultPred(BuildMI(MBB,
- InsertBefore,
- DL,
- TII->get(ARM::VEXTd32), Out)
- .addReg(Ssub0)
- .addReg(Ssub1)
- .addImm(1));
+ BuildMI(MBB, InsertBefore, DL, TII->get(ARM::VEXTd32), Out)
+ .addReg(Ssub0)
+ .addReg(Ssub1)
+ .addImm(1)
+ .add(predOps(ARMCC::AL));
return Out;
}
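A recurring change in the ARM parts of this patch: chained AddDefaultPred(...) / AddDefaultCC(...) wrappers become explicit .add(predOps(...)) and .add(condCodeOp()) calls on the instruction builder. A rough, self-contained illustration of the idiom follows, with stand-in types rather than the real LLVM MachineOperand/MachineInstrBuilder classes; the shape (predOps appending a condition-code immediate plus a predicate register, condCodeOp appending the optional flag-setting register) mirrors how the patch uses the helpers.

    #include <array>
    #include <cstddef>
    #include <cstdio>
    #include <vector>

    struct Operand { long Imm; unsigned Reg; bool IsImm; };

    // predOps(Pred): condition-code immediate + predicate register (0 = none).
    std::array<Operand, 2> predOps(long Pred, unsigned PredReg = 0) {
      return {{{Pred, 0, true}, {0, PredReg, false}}};
    }
    // condCodeOp(): the optional CPSR-def operand (%noreg here).
    Operand condCodeOp() { return {0, 0, false}; }

    struct Builder {
      std::vector<Operand> Ops;
      Builder &add(const Operand &Op) { Ops.push_back(Op); return *this; }
      template <std::size_t N> Builder &add(const std::array<Operand, N> &A) {
        for (const Operand &Op : A) Ops.push_back(Op);
        return *this;
      }
    };

    int main() {
      Builder B;
      // Replaces the old .addImm(Pred).addReg(0).addReg(0) tail:
      B.add(predOps(/*ARMCC::AL*/ 14)).add(condCodeOp());
      std::printf("%zu trailing operands appended\n", B.Ops.size()); // prints 3
    }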
diff --git a/lib/Target/ARM/ARM.h b/lib/Target/ARM/ARM.h
index be3048252bbc..39f7988200ea 100644
--- a/lib/Target/ARM/ARM.h
+++ b/lib/Target/ARM/ARM.h
@@ -16,21 +16,21 @@
#define LLVM_LIB_TARGET_ARM_ARM_H
#include "llvm/Support/CodeGen.h"
-#include "ARMBasicBlockInfo.h"
#include <functional>
+#include <vector>
namespace llvm {
class ARMAsmPrinter;
class ARMBaseTargetMachine;
+struct BasicBlockInfo;
class Function;
class FunctionPass;
-class ImmutablePass;
+class MachineBasicBlock;
+class MachineFunction;
class MachineInstr;
class MCInst;
class PassRegistry;
-class TargetLowering;
-class TargetMachine;
FunctionPass *createARMISelDag(ARMBaseTargetMachine &TM,
CodeGenOpt::Level OptLevel);
@@ -53,7 +53,8 @@ std::vector<BasicBlockInfo> computeAllBlockSizes(MachineFunction *MF);
void initializeARMLoadStoreOptPass(PassRegistry &);
void initializeARMPreAllocLoadStoreOptPass(PassRegistry &);
+void initializeARMConstantIslandsPass(PassRegistry &);
-} // end namespace llvm;
+} // end namespace llvm
-#endif
+#endif // LLVM_LIB_TARGET_ARM_ARM_H
diff --git a/lib/Target/ARM/ARM.td b/lib/Target/ARM/ARM.td
index 2a090faeee6a..57f9d1c6b610 100644
--- a/lib/Target/ARM/ARM.td
+++ b/lib/Target/ARM/ARM.td
@@ -72,8 +72,6 @@ def FeatureHWDiv : SubtargetFeature<"hwdiv", "HasHardwareDivide", "true",
def FeatureHWDivARM : SubtargetFeature<"hwdiv-arm",
"HasHardwareDivideInARM", "true",
"Enable divide instructions in ARM mode">;
-def FeatureT2XtPk : SubtargetFeature<"t2xtpk", "HasT2ExtractPack", "true",
- "Enable Thumb2 extract and pack instructions">;
def FeatureDB : SubtargetFeature<"db", "HasDataBarrier", "true",
"Has data barrier (dmb / dsb) instructions">;
def FeatureV7Clrex : SubtargetFeature<"v7clrex", "HasV7Clrex", "true",
@@ -263,6 +261,12 @@ def FeatureNoMovt : SubtargetFeature<"no-movt", "NoMovt", "true",
"Don't use movt/movw pairs for 32-bit "
"imms">;
+def FeatureNoNegativeImmediates : SubtargetFeature<"no-neg-immediates",
+ "NegativeImmediates", "false",
+ "Convert immediates and instructions "
+ "to their negated or complemented "
+ "equivalent when the immediate does "
+ "not fit in the encoding.">;
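The rewrite this feature description refers to, in miniature (illustrative C++ only; encodable() with a 0..4095 range is a stand-in check, not ARM's actual immediate encoder, and lower() is a hypothetical helper):

    #include <cstdint>
    #include <cstdio>
    #include <string>

    bool encodable(int64_t Imm) { return Imm >= 0 && Imm < 4096; } // assumed range

    // Pick ADD, or the negated SUB form when ADD's immediate does not fit --
    // the kind of conversion that FeatureNoNegativeImmediates disables.
    std::string lower(int64_t Imm) {
      if (encodable(Imm))  return "add r0, r1, #" + std::to_string(Imm);
      if (encodable(-Imm)) return "sub r0, r1, #" + std::to_string(-Imm);
      return "<materialize the constant in a register>";
    }

    int main() { std::puts(lower(-1).c_str()); } // prints: sub r0, r1, #1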
//===----------------------------------------------------------------------===//
// ARM ISAs.
@@ -297,8 +301,7 @@ def HasV7Ops : SubtargetFeature<"v7", "HasV7Ops", "true",
FeatureV7Clrex]>;
def HasV8Ops : SubtargetFeature<"v8", "HasV8Ops", "true",
"Support ARM v8 instructions",
- [HasV7Ops, FeatureAcquireRelease,
- FeatureT2XtPk]>;
+ [HasV7Ops, FeatureAcquireRelease]>;
def HasV8_1aOps : SubtargetFeature<"v8.1a", "HasV8_1aOps", "true",
"Support ARM v8.1a instructions",
[HasV8Ops]>;
@@ -342,7 +345,9 @@ def ProcA73 : SubtargetFeature<"a73", "ARMProcFamily", "CortexA73",
"Cortex-A73 ARM processors", []>;
def ProcKrait : SubtargetFeature<"krait", "ARMProcFamily", "Krait",
- "Qualcomm ARM processors", []>;
+ "Qualcomm Krait processors", []>;
+def ProcKryo : SubtargetFeature<"kryo", "ARMProcFamily", "Kryo",
+ "Qualcomm Kryo processors", []>;
def ProcSwift : SubtargetFeature<"swift", "ARMProcFamily", "Swift",
"Swift ARM processors", []>;
@@ -393,8 +398,7 @@ def ARMv5tej : Architecture<"armv5tej", "ARMv5tej", [HasV5TEOps]>;
def ARMv6 : Architecture<"armv6", "ARMv6", [HasV6Ops]>;
def ARMv6t2 : Architecture<"armv6t2", "ARMv6t2", [HasV6T2Ops,
- FeatureDSP,
- FeatureT2XtPk]>;
+ FeatureDSP]>;
def ARMv6k : Architecture<"armv6k", "ARMv6k", [HasV6KOps]>;
@@ -415,15 +419,22 @@ def ARMv7a : Architecture<"armv7-a", "ARMv7a", [HasV7Ops,
FeatureNEON,
FeatureDB,
FeatureDSP,
- FeatureAClass,
- FeatureT2XtPk]>;
+ FeatureAClass]>;
+
+def ARMv7ve : Architecture<"armv7ve", "ARMv7ve", [HasV7Ops,
+ FeatureNEON,
+ FeatureDB,
+ FeatureDSP,
+ FeatureTrustZone,
+ FeatureMP,
+ FeatureVirtualization,
+ FeatureAClass]>;
def ARMv7r : Architecture<"armv7-r", "ARMv7r", [HasV7Ops,
FeatureDB,
FeatureDSP,
FeatureHWDiv,
- FeatureRClass,
- FeatureT2XtPk]>;
+ FeatureRClass]>;
def ARMv7m : Architecture<"armv7-m", "ARMv7m", [HasV7Ops,
FeatureThumb2,
@@ -438,8 +449,7 @@ def ARMv7em : Architecture<"armv7e-m", "ARMv7em", [HasV7Ops,
FeatureDB,
FeatureHWDiv,
FeatureMClass,
- FeatureDSP,
- FeatureT2XtPk]>;
+ FeatureDSP]>;
def ARMv8a : Architecture<"armv8-a", "ARMv8a", [HasV8Ops,
FeatureAClass,
@@ -481,9 +491,6 @@ def ARMv82a : Architecture<"armv8.2-a", "ARMv82a", [HasV8_2aOps,
def ARMv8r : Architecture<"armv8-r", "ARMv8r", [HasV8Ops,
FeatureRClass,
FeatureDB,
- FeatureHWDiv,
- FeatureHWDivARM,
- FeatureT2XtPk,
FeatureDSP,
FeatureCRC,
FeatureMP,
@@ -603,8 +610,6 @@ def : ProcessorModel<"cortex-a7", CortexA8Model, [ARMv7a, ProcA7,
FeatureVMLxForwarding,
FeatureMP,
FeatureVFP4,
- FeatureHWDiv,
- FeatureHWDivARM,
FeatureVirtualization]>;
def : ProcessorModel<"cortex-a8", CortexA8Model, [ARMv7a, ProcA8,
@@ -636,8 +641,6 @@ def : ProcessorModel<"cortex-a12", CortexA9Model, [ARMv7a, ProcA12,
FeatureTrustZone,
FeatureVMLxForwarding,
FeatureVFP4,
- FeatureHWDiv,
- FeatureHWDivARM,
FeatureAvoidPartialCPSR,
FeatureVirtualization,
FeatureMP]>;
@@ -651,8 +654,6 @@ def : ProcessorModel<"cortex-a15", CortexA9Model, [ARMv7a, ProcA15,
FeatureVFP4,
FeatureMP,
FeatureCheckVLDnAlign,
- FeatureHWDiv,
- FeatureHWDivARM,
FeatureAvoidPartialCPSR,
FeatureVirtualization]>;
@@ -663,8 +664,6 @@ def : ProcessorModel<"cortex-a17", CortexA9Model, [ARMv7a, ProcA17,
FeatureMP,
FeatureVMLxForwarding,
FeatureVFP4,
- FeatureHWDiv,
- FeatureHWDivARM,
FeatureAvoidPartialCPSR,
FeatureVirtualization]>;
@@ -759,6 +758,15 @@ def : ProcNoItin<"cortex-m7", [ARMv7em,
FeatureFPARMv8,
FeatureD16]>;
+def : ProcNoItin<"cortex-m23", [ARMv8mBaseline,
+ FeatureNoMovt]>;
+
+def : ProcNoItin<"cortex-m33", [ARMv8mMainline,
+ FeatureDSP,
+ FeatureFPARMv8,
+ FeatureD16,
+ FeatureVFPOnlySP]>;
+
def : ProcNoItin<"cortex-a32", [ARMv8a,
FeatureHWDiv,
FeatureHWDivARM,
@@ -829,6 +837,12 @@ def : ProcNoItin<"exynos-m3", [ARMv8a, ProcExynosM1,
FeatureCrypto,
FeatureCRC]>;
+def : ProcNoItin<"kryo", [ARMv8a, ProcKryo,
+ FeatureHWDiv,
+ FeatureHWDivARM,
+ FeatureCrypto,
+ FeatureCRC]>;
+
def : ProcessorModel<"cortex-r52", CortexR52Model, [ARMv8r, ProcR52,
FeatureFPAO]>;
@@ -838,6 +852,8 @@ def : ProcessorModel<"cortex-r52", CortexR52Model, [ARMv8r, ProcR52,
include "ARMRegisterInfo.td"
+include "ARMRegisterBanks.td"
+
include "ARMCallingConv.td"
//===----------------------------------------------------------------------===//
diff --git a/lib/Target/ARM/ARMAsmPrinter.cpp b/lib/Target/ARM/ARMAsmPrinter.cpp
index 95db35ce8ffb..eb0d410b596b 100644
--- a/lib/Target/ARM/ARMAsmPrinter.cpp
+++ b/lib/Target/ARM/ARMAsmPrinter.cpp
@@ -844,7 +844,7 @@ void ARMAsmPrinter::emitAttributes() {
ARMBuildAttrs::Allowed);
else
ATS.emitAttribute(ARMBuildAttrs::ABI_FP_number_model,
- ARMBuildAttrs::AllowIEE754);
+ ARMBuildAttrs::AllowIEEE754);
if (STI.allowsUnalignedMem())
ATS.emitAttribute(ARMBuildAttrs::CPU_unaligned_access,
@@ -1142,6 +1142,11 @@ void ARMAsmPrinter::EmitJumpTableInsts(const MachineInstr *MI) {
const MachineOperand &MO1 = MI->getOperand(1);
unsigned JTI = MO1.getIndex();
+ // Make sure the Thumb jump table is 4-byte aligned. This will be a nop for
+ // ARM mode tables.
+ EmitAlignment(2);
+
+ // Emit a label for the jump table.
MCSymbol *JTISymbol = GetARMJTIPICJumpTableLabel(JTI);
OutStreamer->EmitLabel(JTISymbol);
@@ -1255,7 +1260,7 @@ void ARMAsmPrinter::EmitUnwindingInstruction(const MachineInstr *MI) {
switch (Opc) {
default:
- MI->dump();
+ MI->print(errs());
llvm_unreachable("Unsupported opcode for unwinding information");
case ARM::tPUSH:
// Special case here: no src & dst reg, but two extra imp ops.
@@ -1291,7 +1296,7 @@ void ARMAsmPrinter::EmitUnwindingInstruction(const MachineInstr *MI) {
int64_t Offset = 0;
switch (Opc) {
default:
- MI->dump();
+ MI->print(errs());
llvm_unreachable("Unsupported opcode for unwinding information");
case ARM::MOVr:
case ARM::tMOVr:
@@ -1346,11 +1351,11 @@ void ARMAsmPrinter::EmitUnwindingInstruction(const MachineInstr *MI) {
}
}
} else if (DstReg == ARM::SP) {
- MI->dump();
+ MI->print(errs());
llvm_unreachable("Unsupported opcode for unwinding information");
}
else {
- MI->dump();
+ MI->print(errs());
llvm_unreachable("Unsupported opcode for unwinding information");
}
}
diff --git a/lib/Target/ARM/ARMBaseInstrInfo.cpp b/lib/Target/ARM/ARMBaseInstrInfo.cpp
index 70a3246e34f1..4f5711ca9a79 100644
--- a/lib/Target/ARM/ARMBaseInstrInfo.cpp
+++ b/lib/Target/ARM/ARMBaseInstrInfo.cpp
@@ -11,34 +11,58 @@
//
//===----------------------------------------------------------------------===//
-#include "ARM.h"
#include "ARMBaseInstrInfo.h"
#include "ARMBaseRegisterInfo.h"
#include "ARMConstantPoolValue.h"
#include "ARMFeatures.h"
#include "ARMHazardRecognizer.h"
#include "ARMMachineFunctionInfo.h"
+#include "ARMSubtarget.h"
#include "MCTargetDesc/ARMAddressingModes.h"
+#include "MCTargetDesc/ARMBaseInfo.h"
+#include "llvm/ADT/DenseMap.h"
+#include "llvm/ADT/SmallSet.h"
+#include "llvm/ADT/SmallVector.h"
#include "llvm/ADT/STLExtras.h"
+#include "llvm/ADT/Triple.h"
#include "llvm/CodeGen/LiveVariables.h"
+#include "llvm/CodeGen/MachineBasicBlock.h"
#include "llvm/CodeGen/MachineConstantPool.h"
#include "llvm/CodeGen/MachineFrameInfo.h"
+#include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/CodeGen/MachineInstr.h"
#include "llvm/CodeGen/MachineInstrBuilder.h"
-#include "llvm/CodeGen/MachineJumpTableInfo.h"
#include "llvm/CodeGen/MachineMemOperand.h"
+#include "llvm/CodeGen/MachineOperand.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/CodeGen/ScoreboardHazardRecognizer.h"
#include "llvm/CodeGen/SelectionDAGNodes.h"
#include "llvm/CodeGen/TargetSchedule.h"
+#include "llvm/IR/Attributes.h"
#include "llvm/IR/Constants.h"
+#include "llvm/IR/DebugLoc.h"
#include "llvm/IR/Function.h"
#include "llvm/IR/GlobalValue.h"
#include "llvm/MC/MCAsmInfo.h"
-#include "llvm/MC/MCExpr.h"
+#include "llvm/MC/MCInstrDesc.h"
+#include "llvm/MC/MCInstrItineraries.h"
#include "llvm/Support/BranchProbability.h"
+#include "llvm/Support/Casting.h"
#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/Compiler.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/ErrorHandling.h"
#include "llvm/Support/raw_ostream.h"
+#include "llvm/Target/TargetInstrInfo.h"
+#include "llvm/Target/TargetMachine.h"
+#include "llvm/Target/TargetRegisterInfo.h"
+#include <algorithm>
+#include <cassert>
+#include <cstdint>
+#include <iterator>
+#include <new>
+#include <utility>
+#include <vector>
using namespace llvm;
@@ -168,9 +192,8 @@ MachineInstr *ARMBaseInstrInfo::convertToThreeAddress(
get(isSub ? ARM::SUBri : ARM::ADDri), WBReg)
.addReg(BaseReg)
.addImm(Amt)
- .addImm(Pred)
- .addReg(0)
- .addReg(0);
+ .add(predOps(Pred))
+ .add(condCodeOp());
} else if (Amt != 0) {
ARM_AM::ShiftOpc ShOpc = ARM_AM::getAM2ShiftOpc(OffImm);
unsigned SOOpc = ARM_AM::getSORegOpc(ShOpc, Amt);
@@ -180,17 +203,15 @@ MachineInstr *ARMBaseInstrInfo::convertToThreeAddress(
.addReg(OffReg)
.addReg(0)
.addImm(SOOpc)
- .addImm(Pred)
- .addReg(0)
- .addReg(0);
+ .add(predOps(Pred))
+ .add(condCodeOp());
} else
UpdateMI = BuildMI(MF, MI.getDebugLoc(),
get(isSub ? ARM::SUBrr : ARM::ADDrr), WBReg)
.addReg(BaseReg)
.addReg(OffReg)
- .addImm(Pred)
- .addReg(0)
- .addReg(0);
+ .add(predOps(Pred))
+ .add(condCodeOp());
break;
}
case ARMII::AddrMode3 : {
@@ -202,17 +223,15 @@ MachineInstr *ARMBaseInstrInfo::convertToThreeAddress(
get(isSub ? ARM::SUBri : ARM::ADDri), WBReg)
.addReg(BaseReg)
.addImm(Amt)
- .addImm(Pred)
- .addReg(0)
- .addReg(0);
+ .add(predOps(Pred))
+ .add(condCodeOp());
else
UpdateMI = BuildMI(MF, MI.getDebugLoc(),
get(isSub ? ARM::SUBrr : ARM::ADDrr), WBReg)
.addReg(BaseReg)
.addReg(OffReg)
- .addImm(Pred)
- .addReg(0)
- .addReg(0);
+ .add(predOps(Pred))
+ .add(condCodeOp());
break;
}
}
@@ -306,7 +325,6 @@ bool ARMBaseInstrInfo::analyzeBranch(MachineBasicBlock &MBB,
// Walk backwards from the end of the basic block until the branch is
// analyzed or we give up.
while (isPredicated(*I) || I->isTerminator() || I->isDebugValue()) {
-
// Flag to be raised on unanalyzeable instructions. This is useful in cases
// where we want to clean up on the end of the basic block before we bail
// out.
@@ -381,7 +399,6 @@ bool ARMBaseInstrInfo::analyzeBranch(MachineBasicBlock &MBB,
return false;
}
-
unsigned ARMBaseInstrInfo::removeBranch(MachineBasicBlock &MBB,
int *BytesRemoved) const {
assert(!BytesRemoved && "code size not handled");
@@ -433,20 +450,24 @@ unsigned ARMBaseInstrInfo::insertBranch(MachineBasicBlock &MBB,
if (!FBB) {
if (Cond.empty()) { // Unconditional branch?
if (isThumb)
- BuildMI(&MBB, DL, get(BOpc)).addMBB(TBB).addImm(ARMCC::AL).addReg(0);
+ BuildMI(&MBB, DL, get(BOpc)).addMBB(TBB).add(predOps(ARMCC::AL));
else
BuildMI(&MBB, DL, get(BOpc)).addMBB(TBB);
} else
- BuildMI(&MBB, DL, get(BccOpc)).addMBB(TBB)
- .addImm(Cond[0].getImm()).addOperand(Cond[1]);
+ BuildMI(&MBB, DL, get(BccOpc))
+ .addMBB(TBB)
+ .addImm(Cond[0].getImm())
+ .add(Cond[1]);
return 1;
}
// Two-way conditional branch.
- BuildMI(&MBB, DL, get(BccOpc)).addMBB(TBB)
- .addImm(Cond[0].getImm()).addOperand(Cond[1]);
+ BuildMI(&MBB, DL, get(BccOpc))
+ .addMBB(TBB)
+ .addImm(Cond[0].getImm())
+ .add(Cond[1]);
if (isThumb)
- BuildMI(&MBB, DL, get(BOpc)).addMBB(FBB).addImm(ARMCC::AL).addReg(0);
+ BuildMI(&MBB, DL, get(BOpc)).addMBB(FBB).add(predOps(ARMCC::AL));
else
BuildMI(&MBB, DL, get(BOpc)).addMBB(FBB);
return 2;
@@ -576,7 +597,7 @@ static bool isEligibleForITBlock(const MachineInstr *MI) {
/// isPredicable - Return true if the specified instruction can be predicated.
/// By default, this returns true for every instruction with a
/// PredicateOperand.
-bool ARMBaseInstrInfo::isPredicable(MachineInstr &MI) const {
+bool ARMBaseInstrInfo::isPredicable(const MachineInstr &MI) const {
if (!MI.isPredicable())
return false;
@@ -586,7 +607,7 @@ bool ARMBaseInstrInfo::isPredicable(MachineInstr &MI) const {
if (!isEligibleForITBlock(&MI))
return false;
- ARMFunctionInfo *AFI =
+ const ARMFunctionInfo *AFI =
MI.getParent()->getParent()->getInfo<ARMFunctionInfo>();
if (AFI->isThumb2Function()) {
@@ -601,7 +622,8 @@ bool ARMBaseInstrInfo::isPredicable(MachineInstr &MI) const {
}
namespace llvm {
-template <> bool IsCPSRDead<MachineInstr>(MachineInstr *MI) {
+
+template <> bool IsCPSRDead<MachineInstr>(const MachineInstr *MI) {
for (unsigned i = 0, e = MI->getNumOperands(); i != e; ++i) {
const MachineOperand &MO = MI->getOperand(i);
if (!MO.isReg() || MO.isUndef() || MO.isUse())
@@ -614,7 +636,8 @@ template <> bool IsCPSRDead<MachineInstr>(MachineInstr *MI) {
// all definitions of CPSR are dead
return true;
}
-}
+
+} // end namespace llvm
/// GetInstSize - Return the size of the specified MachineInstr.
///
@@ -698,9 +721,8 @@ void ARMBaseInstrInfo::copyFromCPSR(MachineBasicBlock &MBB,
if (Subtarget.isMClass())
MIB.addImm(0x800);
- AddDefaultPred(MIB);
-
- MIB.addReg(ARM::CPSR, RegState::Implicit | getKillRegState(KillSrc));
+ MIB.add(predOps(ARMCC::AL))
+ .addReg(ARM::CPSR, RegState::Implicit | getKillRegState(KillSrc));
}
void ARMBaseInstrInfo::copyToCPSR(MachineBasicBlock &MBB,
@@ -718,11 +740,9 @@ void ARMBaseInstrInfo::copyToCPSR(MachineBasicBlock &MBB,
else
MIB.addImm(8);
- MIB.addReg(SrcReg, getKillRegState(KillSrc));
-
- AddDefaultPred(MIB);
-
- MIB.addReg(ARM::CPSR, RegState::Implicit | RegState::Define);
+ MIB.addReg(SrcReg, getKillRegState(KillSrc))
+ .add(predOps(ARMCC::AL))
+ .addReg(ARM::CPSR, RegState::Implicit | RegState::Define);
}
void ARMBaseInstrInfo::copyPhysReg(MachineBasicBlock &MBB,
@@ -733,8 +753,10 @@ void ARMBaseInstrInfo::copyPhysReg(MachineBasicBlock &MBB,
bool GPRSrc = ARM::GPRRegClass.contains(SrcReg);
if (GPRDest && GPRSrc) {
- AddDefaultCC(AddDefaultPred(BuildMI(MBB, I, DL, get(ARM::MOVr), DestReg)
- .addReg(SrcReg, getKillRegState(KillSrc))));
+ BuildMI(MBB, I, DL, get(ARM::MOVr), DestReg)
+ .addReg(SrcReg, getKillRegState(KillSrc))
+ .add(predOps(ARMCC::AL))
+ .add(condCodeOp());
return;
}
@@ -758,7 +780,7 @@ void ARMBaseInstrInfo::copyPhysReg(MachineBasicBlock &MBB,
MIB.addReg(SrcReg, getKillRegState(KillSrc));
if (Opc == ARM::VORRq)
MIB.addReg(SrcReg, getKillRegState(KillSrc));
- AddDefaultPred(MIB);
+ MIB.add(predOps(ARMCC::AL));
return;
}
@@ -845,10 +867,10 @@ void ARMBaseInstrInfo::copyPhysReg(MachineBasicBlock &MBB,
// VORR takes two source operands.
if (Opc == ARM::VORRq)
Mov.addReg(Src);
- Mov = AddDefaultPred(Mov);
+ Mov = Mov.add(predOps(ARMCC::AL));
// MOVr can set CC.
if (Opc == ARM::MOVr)
- Mov = AddDefaultCC(Mov);
+ Mov = Mov.add(condCodeOp());
}
// Add implicit super-register defs and kills to the last instruction.
Mov->addRegisterDefined(DestReg, TRI);
@@ -886,35 +908,44 @@ storeRegToStackSlot(MachineBasicBlock &MBB, MachineBasicBlock::iterator I,
switch (RC->getSize()) {
case 4:
if (ARM::GPRRegClass.hasSubClassEq(RC)) {
- AddDefaultPred(BuildMI(MBB, I, DL, get(ARM::STRi12))
- .addReg(SrcReg, getKillRegState(isKill))
- .addFrameIndex(FI).addImm(0).addMemOperand(MMO));
+ BuildMI(MBB, I, DL, get(ARM::STRi12))
+ .addReg(SrcReg, getKillRegState(isKill))
+ .addFrameIndex(FI)
+ .addImm(0)
+ .addMemOperand(MMO)
+ .add(predOps(ARMCC::AL));
} else if (ARM::SPRRegClass.hasSubClassEq(RC)) {
- AddDefaultPred(BuildMI(MBB, I, DL, get(ARM::VSTRS))
- .addReg(SrcReg, getKillRegState(isKill))
- .addFrameIndex(FI).addImm(0).addMemOperand(MMO));
+ BuildMI(MBB, I, DL, get(ARM::VSTRS))
+ .addReg(SrcReg, getKillRegState(isKill))
+ .addFrameIndex(FI)
+ .addImm(0)
+ .addMemOperand(MMO)
+ .add(predOps(ARMCC::AL));
} else
llvm_unreachable("Unknown reg class!");
break;
case 8:
if (ARM::DPRRegClass.hasSubClassEq(RC)) {
- AddDefaultPred(BuildMI(MBB, I, DL, get(ARM::VSTRD))
- .addReg(SrcReg, getKillRegState(isKill))
- .addFrameIndex(FI).addImm(0).addMemOperand(MMO));
+ BuildMI(MBB, I, DL, get(ARM::VSTRD))
+ .addReg(SrcReg, getKillRegState(isKill))
+ .addFrameIndex(FI)
+ .addImm(0)
+ .addMemOperand(MMO)
+ .add(predOps(ARMCC::AL));
} else if (ARM::GPRPairRegClass.hasSubClassEq(RC)) {
if (Subtarget.hasV5TEOps()) {
MachineInstrBuilder MIB = BuildMI(MBB, I, DL, get(ARM::STRD));
AddDReg(MIB, SrcReg, ARM::gsub_0, getKillRegState(isKill), TRI);
AddDReg(MIB, SrcReg, ARM::gsub_1, 0, TRI);
- MIB.addFrameIndex(FI).addReg(0).addImm(0).addMemOperand(MMO);
-
- AddDefaultPred(MIB);
+ MIB.addFrameIndex(FI).addReg(0).addImm(0).addMemOperand(MMO)
+ .add(predOps(ARMCC::AL));
} else {
// Fallback to STM instruction, which has existed since the dawn of
// time.
- MachineInstrBuilder MIB =
- AddDefaultPred(BuildMI(MBB, I, DL, get(ARM::STMIA))
- .addFrameIndex(FI).addMemOperand(MMO));
+ MachineInstrBuilder MIB = BuildMI(MBB, I, DL, get(ARM::STMIA))
+ .addFrameIndex(FI)
+ .addMemOperand(MMO)
+ .add(predOps(ARMCC::AL));
AddDReg(MIB, SrcReg, ARM::gsub_0, getKillRegState(isKill), TRI);
AddDReg(MIB, SrcReg, ARM::gsub_1, 0, TRI);
}
@@ -925,15 +956,18 @@ storeRegToStackSlot(MachineBasicBlock &MBB, MachineBasicBlock::iterator I,
if (ARM::DPairRegClass.hasSubClassEq(RC)) {
// Use aligned spills if the stack can be realigned.
if (Align >= 16 && getRegisterInfo().canRealignStack(MF)) {
- AddDefaultPred(BuildMI(MBB, I, DL, get(ARM::VST1q64))
- .addFrameIndex(FI).addImm(16)
- .addReg(SrcReg, getKillRegState(isKill))
- .addMemOperand(MMO));
+ BuildMI(MBB, I, DL, get(ARM::VST1q64))
+ .addFrameIndex(FI)
+ .addImm(16)
+ .addReg(SrcReg, getKillRegState(isKill))
+ .addMemOperand(MMO)
+ .add(predOps(ARMCC::AL));
} else {
- AddDefaultPred(BuildMI(MBB, I, DL, get(ARM::VSTMQIA))
- .addReg(SrcReg, getKillRegState(isKill))
- .addFrameIndex(FI)
- .addMemOperand(MMO));
+ BuildMI(MBB, I, DL, get(ARM::VSTMQIA))
+ .addReg(SrcReg, getKillRegState(isKill))
+ .addFrameIndex(FI)
+ .addMemOperand(MMO)
+ .add(predOps(ARMCC::AL));
}
} else
llvm_unreachable("Unknown reg class!");
@@ -942,15 +976,17 @@ storeRegToStackSlot(MachineBasicBlock &MBB, MachineBasicBlock::iterator I,
if (ARM::DTripleRegClass.hasSubClassEq(RC)) {
// Use aligned spills if the stack can be realigned.
if (Align >= 16 && getRegisterInfo().canRealignStack(MF)) {
- AddDefaultPred(BuildMI(MBB, I, DL, get(ARM::VST1d64TPseudo))
- .addFrameIndex(FI).addImm(16)
- .addReg(SrcReg, getKillRegState(isKill))
- .addMemOperand(MMO));
+ BuildMI(MBB, I, DL, get(ARM::VST1d64TPseudo))
+ .addFrameIndex(FI)
+ .addImm(16)
+ .addReg(SrcReg, getKillRegState(isKill))
+ .addMemOperand(MMO)
+ .add(predOps(ARMCC::AL));
} else {
- MachineInstrBuilder MIB =
- AddDefaultPred(BuildMI(MBB, I, DL, get(ARM::VSTMDIA))
- .addFrameIndex(FI))
- .addMemOperand(MMO);
+ MachineInstrBuilder MIB = BuildMI(MBB, I, DL, get(ARM::VSTMDIA))
+ .addFrameIndex(FI)
+ .add(predOps(ARMCC::AL))
+ .addMemOperand(MMO);
MIB = AddDReg(MIB, SrcReg, ARM::dsub_0, getKillRegState(isKill), TRI);
MIB = AddDReg(MIB, SrcReg, ARM::dsub_1, 0, TRI);
AddDReg(MIB, SrcReg, ARM::dsub_2, 0, TRI);
@@ -963,15 +999,17 @@ storeRegToStackSlot(MachineBasicBlock &MBB, MachineBasicBlock::iterator I,
if (Align >= 16 && getRegisterInfo().canRealignStack(MF)) {
// FIXME: It's possible to only store part of the QQ register if the
// spilled def has a sub-register index.
- AddDefaultPred(BuildMI(MBB, I, DL, get(ARM::VST1d64QPseudo))
- .addFrameIndex(FI).addImm(16)
- .addReg(SrcReg, getKillRegState(isKill))
- .addMemOperand(MMO));
+ BuildMI(MBB, I, DL, get(ARM::VST1d64QPseudo))
+ .addFrameIndex(FI)
+ .addImm(16)
+ .addReg(SrcReg, getKillRegState(isKill))
+ .addMemOperand(MMO)
+ .add(predOps(ARMCC::AL));
} else {
- MachineInstrBuilder MIB =
- AddDefaultPred(BuildMI(MBB, I, DL, get(ARM::VSTMDIA))
- .addFrameIndex(FI))
- .addMemOperand(MMO);
+ MachineInstrBuilder MIB = BuildMI(MBB, I, DL, get(ARM::VSTMDIA))
+ .addFrameIndex(FI)
+ .add(predOps(ARMCC::AL))
+ .addMemOperand(MMO);
MIB = AddDReg(MIB, SrcReg, ARM::dsub_0, getKillRegState(isKill), TRI);
MIB = AddDReg(MIB, SrcReg, ARM::dsub_1, 0, TRI);
MIB = AddDReg(MIB, SrcReg, ARM::dsub_2, 0, TRI);
@@ -982,10 +1020,10 @@ storeRegToStackSlot(MachineBasicBlock &MBB, MachineBasicBlock::iterator I,
break;
case 64:
if (ARM::QQQQPRRegClass.hasSubClassEq(RC)) {
- MachineInstrBuilder MIB =
- AddDefaultPred(BuildMI(MBB, I, DL, get(ARM::VSTMDIA))
- .addFrameIndex(FI))
- .addMemOperand(MMO);
+ MachineInstrBuilder MIB = BuildMI(MBB, I, DL, get(ARM::VSTMDIA))
+ .addFrameIndex(FI)
+ .add(predOps(ARMCC::AL))
+ .addMemOperand(MMO);
MIB = AddDReg(MIB, SrcReg, ARM::dsub_0, getKillRegState(isKill), TRI);
MIB = AddDReg(MIB, SrcReg, ARM::dsub_1, 0, TRI);
MIB = AddDReg(MIB, SrcReg, ARM::dsub_2, 0, TRI);
@@ -1068,19 +1106,28 @@ loadRegFromStackSlot(MachineBasicBlock &MBB, MachineBasicBlock::iterator I,
switch (RC->getSize()) {
case 4:
if (ARM::GPRRegClass.hasSubClassEq(RC)) {
- AddDefaultPred(BuildMI(MBB, I, DL, get(ARM::LDRi12), DestReg)
- .addFrameIndex(FI).addImm(0).addMemOperand(MMO));
+ BuildMI(MBB, I, DL, get(ARM::LDRi12), DestReg)
+ .addFrameIndex(FI)
+ .addImm(0)
+ .addMemOperand(MMO)
+ .add(predOps(ARMCC::AL));
} else if (ARM::SPRRegClass.hasSubClassEq(RC)) {
- AddDefaultPred(BuildMI(MBB, I, DL, get(ARM::VLDRS), DestReg)
- .addFrameIndex(FI).addImm(0).addMemOperand(MMO));
+ BuildMI(MBB, I, DL, get(ARM::VLDRS), DestReg)
+ .addFrameIndex(FI)
+ .addImm(0)
+ .addMemOperand(MMO)
+ .add(predOps(ARMCC::AL));
} else
llvm_unreachable("Unknown reg class!");
break;
case 8:
if (ARM::DPRRegClass.hasSubClassEq(RC)) {
- AddDefaultPred(BuildMI(MBB, I, DL, get(ARM::VLDRD), DestReg)
- .addFrameIndex(FI).addImm(0).addMemOperand(MMO));
+ BuildMI(MBB, I, DL, get(ARM::VLDRD), DestReg)
+ .addFrameIndex(FI)
+ .addImm(0)
+ .addMemOperand(MMO)
+ .add(predOps(ARMCC::AL));
} else if (ARM::GPRPairRegClass.hasSubClassEq(RC)) {
MachineInstrBuilder MIB;
@@ -1088,14 +1135,15 @@ loadRegFromStackSlot(MachineBasicBlock &MBB, MachineBasicBlock::iterator I,
MIB = BuildMI(MBB, I, DL, get(ARM::LDRD));
AddDReg(MIB, DestReg, ARM::gsub_0, RegState::DefineNoRead, TRI);
AddDReg(MIB, DestReg, ARM::gsub_1, RegState::DefineNoRead, TRI);
- MIB.addFrameIndex(FI).addReg(0).addImm(0).addMemOperand(MMO);
-
- AddDefaultPred(MIB);
+ MIB.addFrameIndex(FI).addReg(0).addImm(0).addMemOperand(MMO)
+ .add(predOps(ARMCC::AL));
} else {
// Fallback to LDM instruction, which has existed since the dawn of
// time.
- MIB = AddDefaultPred(BuildMI(MBB, I, DL, get(ARM::LDMIA))
- .addFrameIndex(FI).addMemOperand(MMO));
+ MIB = BuildMI(MBB, I, DL, get(ARM::LDMIA))
+ .addFrameIndex(FI)
+ .addMemOperand(MMO)
+ .add(predOps(ARMCC::AL));
MIB = AddDReg(MIB, DestReg, ARM::gsub_0, RegState::DefineNoRead, TRI);
MIB = AddDReg(MIB, DestReg, ARM::gsub_1, RegState::DefineNoRead, TRI);
}
@@ -1108,13 +1156,16 @@ loadRegFromStackSlot(MachineBasicBlock &MBB, MachineBasicBlock::iterator I,
case 16:
if (ARM::DPairRegClass.hasSubClassEq(RC)) {
if (Align >= 16 && getRegisterInfo().canRealignStack(MF)) {
- AddDefaultPred(BuildMI(MBB, I, DL, get(ARM::VLD1q64), DestReg)
- .addFrameIndex(FI).addImm(16)
- .addMemOperand(MMO));
+ BuildMI(MBB, I, DL, get(ARM::VLD1q64), DestReg)
+ .addFrameIndex(FI)
+ .addImm(16)
+ .addMemOperand(MMO)
+ .add(predOps(ARMCC::AL));
} else {
- AddDefaultPred(BuildMI(MBB, I, DL, get(ARM::VLDMQIA), DestReg)
- .addFrameIndex(FI)
- .addMemOperand(MMO));
+ BuildMI(MBB, I, DL, get(ARM::VLDMQIA), DestReg)
+ .addFrameIndex(FI)
+ .addMemOperand(MMO)
+ .add(predOps(ARMCC::AL));
}
} else
llvm_unreachable("Unknown reg class!");
@@ -1122,14 +1173,16 @@ loadRegFromStackSlot(MachineBasicBlock &MBB, MachineBasicBlock::iterator I,
case 24:
if (ARM::DTripleRegClass.hasSubClassEq(RC)) {
if (Align >= 16 && getRegisterInfo().canRealignStack(MF)) {
- AddDefaultPred(BuildMI(MBB, I, DL, get(ARM::VLD1d64TPseudo), DestReg)
- .addFrameIndex(FI).addImm(16)
- .addMemOperand(MMO));
+ BuildMI(MBB, I, DL, get(ARM::VLD1d64TPseudo), DestReg)
+ .addFrameIndex(FI)
+ .addImm(16)
+ .addMemOperand(MMO)
+ .add(predOps(ARMCC::AL));
} else {
- MachineInstrBuilder MIB =
- AddDefaultPred(BuildMI(MBB, I, DL, get(ARM::VLDMDIA))
- .addFrameIndex(FI)
- .addMemOperand(MMO));
+ MachineInstrBuilder MIB = BuildMI(MBB, I, DL, get(ARM::VLDMDIA))
+ .addFrameIndex(FI)
+ .addMemOperand(MMO)
+ .add(predOps(ARMCC::AL));
MIB = AddDReg(MIB, DestReg, ARM::dsub_0, RegState::DefineNoRead, TRI);
MIB = AddDReg(MIB, DestReg, ARM::dsub_1, RegState::DefineNoRead, TRI);
MIB = AddDReg(MIB, DestReg, ARM::dsub_2, RegState::DefineNoRead, TRI);
@@ -1142,14 +1195,16 @@ loadRegFromStackSlot(MachineBasicBlock &MBB, MachineBasicBlock::iterator I,
case 32:
if (ARM::QQPRRegClass.hasSubClassEq(RC) || ARM::DQuadRegClass.hasSubClassEq(RC)) {
if (Align >= 16 && getRegisterInfo().canRealignStack(MF)) {
- AddDefaultPred(BuildMI(MBB, I, DL, get(ARM::VLD1d64QPseudo), DestReg)
- .addFrameIndex(FI).addImm(16)
- .addMemOperand(MMO));
+ BuildMI(MBB, I, DL, get(ARM::VLD1d64QPseudo), DestReg)
+ .addFrameIndex(FI)
+ .addImm(16)
+ .addMemOperand(MMO)
+ .add(predOps(ARMCC::AL));
} else {
- MachineInstrBuilder MIB =
- AddDefaultPred(BuildMI(MBB, I, DL, get(ARM::VLDMDIA))
- .addFrameIndex(FI))
- .addMemOperand(MMO);
+ MachineInstrBuilder MIB = BuildMI(MBB, I, DL, get(ARM::VLDMDIA))
+ .addFrameIndex(FI)
+ .add(predOps(ARMCC::AL))
+ .addMemOperand(MMO);
MIB = AddDReg(MIB, DestReg, ARM::dsub_0, RegState::DefineNoRead, TRI);
MIB = AddDReg(MIB, DestReg, ARM::dsub_1, RegState::DefineNoRead, TRI);
MIB = AddDReg(MIB, DestReg, ARM::dsub_2, RegState::DefineNoRead, TRI);
@@ -1162,10 +1217,10 @@ loadRegFromStackSlot(MachineBasicBlock &MBB, MachineBasicBlock::iterator I,
break;
case 64:
if (ARM::QQQQPRRegClass.hasSubClassEq(RC)) {
- MachineInstrBuilder MIB =
- AddDefaultPred(BuildMI(MBB, I, DL, get(ARM::VLDMDIA))
- .addFrameIndex(FI))
- .addMemOperand(MMO);
+ MachineInstrBuilder MIB = BuildMI(MBB, I, DL, get(ARM::VLDMDIA))
+ .addFrameIndex(FI)
+ .add(predOps(ARMCC::AL))
+ .addMemOperand(MMO);
MIB = AddDReg(MIB, DestReg, ARM::dsub_0, RegState::DefineNoRead, TRI);
MIB = AddDReg(MIB, DestReg, ARM::dsub_1, RegState::DefineNoRead, TRI);
MIB = AddDReg(MIB, DestReg, ARM::dsub_2, RegState::DefineNoRead, TRI);
@@ -1248,7 +1303,7 @@ void ARMBaseInstrInfo::expandMEMCPY(MachineBasicBlock::iterator MI) const {
LDM = BuildMI(*BB, MI, dl, TII->get(isThumb2 ? ARM::t2LDMIA_UPD
: isThumb1 ? ARM::tLDMIA_UPD
: ARM::LDMIA_UPD))
- .addOperand(MI->getOperand(1));
+ .add(MI->getOperand(1));
} else {
LDM = BuildMI(*BB, MI, dl, TII->get(isThumb2 ? ARM::t2LDMIA : ARM::LDMIA));
}
@@ -1257,17 +1312,17 @@ void ARMBaseInstrInfo::expandMEMCPY(MachineBasicBlock::iterator MI) const {
STM = BuildMI(*BB, MI, dl, TII->get(isThumb2 ? ARM::t2STMIA_UPD
: isThumb1 ? ARM::tSTMIA_UPD
: ARM::STMIA_UPD))
- .addOperand(MI->getOperand(0));
+ .add(MI->getOperand(0));
} else {
STM = BuildMI(*BB, MI, dl, TII->get(isThumb2 ? ARM::t2STMIA : ARM::STMIA));
}
- AddDefaultPred(LDM.addOperand(MI->getOperand(3)));
- AddDefaultPred(STM.addOperand(MI->getOperand(2)));
+ LDM.add(MI->getOperand(3)).add(predOps(ARMCC::AL));
+ STM.add(MI->getOperand(2)).add(predOps(ARMCC::AL));
// Sort the scratch registers into ascending order.
const TargetRegisterInfo &TRI = getRegisterInfo();
- llvm::SmallVector<unsigned, 6> ScratchRegs;
+ SmallVector<unsigned, 6> ScratchRegs;
for(unsigned I = 5; I < MI->getNumOperands(); ++I)
ScratchRegs.push_back(MI->getOperand(I).getReg());
std::sort(ScratchRegs.begin(), ScratchRegs.end(),
@@ -1285,7 +1340,6 @@ void ARMBaseInstrInfo::expandMEMCPY(MachineBasicBlock::iterator MI) const {
BB->erase(MI);
}
-
bool ARMBaseInstrInfo::expandPostRAPseudo(MachineInstr &MI) const {
if (MI.getOpcode() == TargetOpcode::LOAD_STACK_GUARD) {
assert(getSubtarget().getTargetTriple().isOSBinFormatMachO() &&
@@ -1346,7 +1400,7 @@ bool ARMBaseInstrInfo::expandPostRAPseudo(MachineInstr &MI) const {
MI.setDesc(get(ARM::VMOVD));
MI.getOperand(0).setReg(DstRegD);
MI.getOperand(1).setReg(SrcRegD);
- AddDefaultPred(MIB);
+ MIB.add(predOps(ARMCC::AL));
// We are now reading SrcRegD instead of SrcRegS. This may upset the
// register scavenger and machine verifier, so we need to indicate that we
@@ -1735,25 +1789,17 @@ isProfitableToIfCvt(MachineBasicBlock &MBB,
}
}
}
-
- // Attempt to estimate the relative costs of predication versus branching.
- // Here we scale up each component of UnpredCost to avoid precision issue when
- // scaling NumCycles by Probability.
- const unsigned ScalingUpFactor = 1024;
- unsigned UnpredCost = Probability.scale(NumCycles * ScalingUpFactor);
- UnpredCost += ScalingUpFactor; // The branch itself
- UnpredCost += Subtarget.getMispredictionPenalty() * ScalingUpFactor / 10;
-
- return (NumCycles + ExtraPredCycles) * ScalingUpFactor <= UnpredCost;
+ return isProfitableToIfCvt(MBB, NumCycles, ExtraPredCycles,
+ MBB, 0, 0, Probability);
}
bool ARMBaseInstrInfo::
-isProfitableToIfCvt(MachineBasicBlock &TMBB,
+isProfitableToIfCvt(MachineBasicBlock &,
unsigned TCycles, unsigned TExtra,
- MachineBasicBlock &FMBB,
+ MachineBasicBlock &,
unsigned FCycles, unsigned FExtra,
BranchProbability Probability) const {
- if (!TCycles || !FCycles)
+ if (!TCycles)
return false;
// Attempt to estimate the relative costs of predication versus branching.
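For intuition, a worked run of the single-block check removed above, with made-up numbers: the misprediction penalty and the 0.5 probability are assumptions, and a plain double stands in for BranchProbability.

    #include <cstdio>

    int main() {
      const unsigned ScalingUpFactor = 1024;
      const unsigned NumCycles = 2, ExtraPredCycles = 1;
      const unsigned MispredictPenalty = 10; // assumed subtarget value
      const double TakenProb = 0.5;          // stand-in for BranchProbability

      // Scale everything up so scaling an integer cycle count by a
      // probability does not lose precision.
      unsigned UnpredCost =
          static_cast<unsigned>(TakenProb * NumCycles * ScalingUpFactor);
      UnpredCost += ScalingUpFactor;                          // the branch itself
      UnpredCost += MispredictPenalty * ScalingUpFactor / 10; // mispredict cost

      const bool Profitable =
          (NumCycles + ExtraPredCycles) * ScalingUpFactor <= UnpredCost;
      std::printf("UnpredCost=%u profitable=%d\n", UnpredCost, Profitable);
      // prints: UnpredCost=3072 profitable=1
    }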
@@ -1793,7 +1839,6 @@ ARMCC::CondCodes llvm::getInstrPredicate(const MachineInstr &MI,
return (ARMCC::CondCodes)MI.getOperand(PIdx).getImm();
}
-
unsigned llvm::getMatchingCondBranchOpcode(unsigned Opc) {
if (Opc == ARM::B)
return ARM::Bcc;
@@ -1920,25 +1965,25 @@ ARMBaseInstrInfo::optimizeSelect(MachineInstr &MI,
const MCInstrDesc &DefDesc = DefMI->getDesc();
for (unsigned i = 1, e = DefDesc.getNumOperands();
i != e && !DefDesc.OpInfo[i].isPredicate(); ++i)
- NewMI.addOperand(DefMI->getOperand(i));
+ NewMI.add(DefMI->getOperand(i));
unsigned CondCode = MI.getOperand(3).getImm();
if (Invert)
NewMI.addImm(ARMCC::getOppositeCondition(ARMCC::CondCodes(CondCode)));
else
NewMI.addImm(CondCode);
- NewMI.addOperand(MI.getOperand(4));
+ NewMI.add(MI.getOperand(4));
// DefMI is not the -S version that sets CPSR, so add an optional %noreg.
if (NewMI->hasOptionalDef())
- AddDefaultCC(NewMI);
+ NewMI.add(condCodeOp());
// The output register value when the predicate is false is an implicit
// register operand tied to the first def.
// The tie makes the register allocator ensure the FalseReg is allocated the
// same register as operand 0.
FalseReg.setImplicit();
- NewMI.addOperand(FalseReg);
+ NewMI.add(FalseReg);
NewMI->tieOperands(0, NewMI->getNumOperands() - 1);
// Update SeenMIs set: register newly created MI and erase removed DefMI.
@@ -1983,6 +2028,16 @@ static const AddSubFlagsOpcodePair AddSubFlagsOpcodeMap[] = {
{ARM::RSBSrsi, ARM::RSBrsi},
{ARM::RSBSrsr, ARM::RSBrsr},
+ {ARM::tADDSi3, ARM::tADDi3},
+ {ARM::tADDSi8, ARM::tADDi8},
+ {ARM::tADDSrr, ARM::tADDrr},
+ {ARM::tADCS, ARM::tADC},
+
+ {ARM::tSUBSi3, ARM::tSUBi3},
+ {ARM::tSUBSi8, ARM::tSUBi8},
+ {ARM::tSUBSrr, ARM::tSUBrr},
+ {ARM::tSBCS, ARM::tSBC},
+
{ARM::t2ADDSri, ARM::t2ADDri},
{ARM::t2ADDSrr, ARM::t2ADDrr},
{ARM::t2ADDSrs, ARM::t2ADDrs},
@@ -2011,9 +2066,10 @@ void llvm::emitARMRegPlusImmediate(MachineBasicBlock &MBB,
unsigned MIFlags) {
if (NumBytes == 0 && DestReg != BaseReg) {
BuildMI(MBB, MBBI, dl, TII.get(ARM::MOVr), DestReg)
- .addReg(BaseReg, RegState::Kill)
- .addImm((unsigned)Pred).addReg(PredReg).addReg(0)
- .setMIFlags(MIFlags);
+ .addReg(BaseReg, RegState::Kill)
+ .add(predOps(Pred, PredReg))
+ .add(condCodeOp())
+ .setMIFlags(MIFlags);
return;
}
@@ -2033,9 +2089,11 @@ void llvm::emitARMRegPlusImmediate(MachineBasicBlock &MBB,
// Build the new ADD / SUB.
unsigned Opc = isSub ? ARM::SUBri : ARM::ADDri;
BuildMI(MBB, MBBI, dl, TII.get(Opc), DestReg)
- .addReg(BaseReg, RegState::Kill).addImm(ThisVal)
- .addImm((unsigned)Pred).addReg(PredReg).addReg(0)
- .setMIFlags(MIFlags);
+ .addReg(BaseReg, RegState::Kill)
+ .addImm(ThisVal)
+ .add(predOps(Pred, PredReg))
+ .add(condCodeOp())
+ .setMIFlags(MIFlags);
BaseReg = DestReg;
}
}
@@ -2154,7 +2212,7 @@ bool llvm::tryFoldSPUpdateIntoPushPop(const ARMSubtarget &Subtarget,
// Add the complete list back in.
MachineInstrBuilder MIB(MF, &*MI);
for (int i = RegList.size() - 1; i >= 0; --i)
- MIB.addOperand(RegList[i]);
+ MIB.add(RegList[i]);
return true;
}
@@ -2213,33 +2271,30 @@ bool llvm::rewriteARMFrameIndex(MachineInstr &MI, unsigned FrameRegIdx,
unsigned NumBits = 0;
unsigned Scale = 1;
switch (AddrMode) {
- case ARMII::AddrMode_i12: {
+ case ARMII::AddrMode_i12:
ImmIdx = FrameRegIdx + 1;
InstrOffs = MI.getOperand(ImmIdx).getImm();
NumBits = 12;
break;
- }
- case ARMII::AddrMode2: {
+ case ARMII::AddrMode2:
ImmIdx = FrameRegIdx+2;
InstrOffs = ARM_AM::getAM2Offset(MI.getOperand(ImmIdx).getImm());
if (ARM_AM::getAM2Op(MI.getOperand(ImmIdx).getImm()) == ARM_AM::sub)
InstrOffs *= -1;
NumBits = 12;
break;
- }
- case ARMII::AddrMode3: {
+ case ARMII::AddrMode3:
ImmIdx = FrameRegIdx+2;
InstrOffs = ARM_AM::getAM3Offset(MI.getOperand(ImmIdx).getImm());
if (ARM_AM::getAM3Op(MI.getOperand(ImmIdx).getImm()) == ARM_AM::sub)
InstrOffs *= -1;
NumBits = 8;
break;
- }
case ARMII::AddrMode4:
case ARMII::AddrMode6:
// Can't fold any offset even if it's zero.
return false;
- case ARMII::AddrMode5: {
+ case ARMII::AddrMode5:
ImmIdx = FrameRegIdx+1;
InstrOffs = ARM_AM::getAM5Offset(MI.getOperand(ImmIdx).getImm());
if (ARM_AM::getAM5Op(MI.getOperand(ImmIdx).getImm()) == ARM_AM::sub)
@@ -2247,7 +2302,6 @@ bool llvm::rewriteARMFrameIndex(MachineInstr &MI, unsigned FrameRegIdx,
NumBits = 8;
Scale = 4;
break;
- }
default:
llvm_unreachable("Unsupported addressing mode!");
}
@@ -2401,6 +2455,63 @@ inline static bool isRedundantFlagInstr(MachineInstr *CmpI, unsigned SrcReg,
return false;
}
+static bool isOptimizeCompareCandidate(MachineInstr *MI, bool &IsThumb1) {
+ switch (MI->getOpcode()) {
+ default: return false;
+ case ARM::tLSLri:
+ case ARM::tLSRri:
+ case ARM::tLSLrr:
+ case ARM::tLSRrr:
+ case ARM::tSUBrr:
+ case ARM::tADDrr:
+ case ARM::tADDi3:
+ case ARM::tADDi8:
+ case ARM::tSUBi3:
+ case ARM::tSUBi8:
+ case ARM::tMUL:
+ IsThumb1 = true;
+ LLVM_FALLTHROUGH;
+ case ARM::RSBrr:
+ case ARM::RSBri:
+ case ARM::RSCrr:
+ case ARM::RSCri:
+ case ARM::ADDrr:
+ case ARM::ADDri:
+ case ARM::ADCrr:
+ case ARM::ADCri:
+ case ARM::SUBrr:
+ case ARM::SUBri:
+ case ARM::SBCrr:
+ case ARM::SBCri:
+ case ARM::t2RSBri:
+ case ARM::t2ADDrr:
+ case ARM::t2ADDri:
+ case ARM::t2ADCrr:
+ case ARM::t2ADCri:
+ case ARM::t2SUBrr:
+ case ARM::t2SUBri:
+ case ARM::t2SBCrr:
+ case ARM::t2SBCri:
+ case ARM::ANDrr:
+ case ARM::ANDri:
+ case ARM::t2ANDrr:
+ case ARM::t2ANDri:
+ case ARM::ORRrr:
+ case ARM::ORRri:
+ case ARM::t2ORRrr:
+ case ARM::t2ORRri:
+ case ARM::EORrr:
+ case ARM::EORri:
+ case ARM::t2EORrr:
+ case ARM::t2EORri:
+ case ARM::t2LSRri:
+ case ARM::t2LSRrr:
+ case ARM::t2LSLri:
+ case ARM::t2LSLrr:
+ return true;
+ }
+}
+
/// optimizeCompareInstr - Convert the instruction supplying the argument to the
/// comparison into one that sets the zero bit in the flags register;
/// Remove a redundant Compare instruction if an earlier instruction can set the
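At the source level, the peephole targets patterns like the following; illustrative only, and the actual selection depends on the subtarget.

    #include <cstdio>

    int f(int a, int b) {
      // `a - b` can be emitted as flag-setting SUBS, which already computes
      // the Z bit that the `== 0` test needs...
      int d = a - b;
      // ...so the separate CMP d, #0 against zero becomes redundant.
      return d == 0;
    }

    int main() { std::printf("%d\n", f(3, 3)); } // prints 1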
@@ -2462,6 +2573,41 @@ bool ARMBaseInstrInfo::optimizeCompareInstr(
return false;
}
+ bool IsThumb1 = false;
+ if (MI && !isOptimizeCompareCandidate(MI, IsThumb1))
+ return false;
+
+ // We also want to do this peephole for cases like this: if (a*b == 0),
+ // and optimise away the CMP instruction from the generated code sequence:
+ // MULS, MOVS, MOVS, CMP. Here the MOVS instructions load the boolean values
+ // resulting from the select instruction, but these MOVS instructions for
+ // Thumb1 (V6M) are flag setting and are thus preventing this optimisation.
+ // However, if we only have MOVS instructions in between the CMP and the
+ // other instruction (the MULS in this example), then the CPSR is dead so we
+ // can safely reorder the sequence into: MOVS, MOVS, MULS, CMP. We do this
+ // reordering and then continue the analysis hoping we can eliminate the
+ // CMP. This peephole works on the vregs, so is still in SSA form. As a
+ // consequence, the movs won't redefine/kill the MUL operands which would
+ // make this reordering illegal.
+ if (MI && IsThumb1) {
+ --I;
+ bool CanReorder = true;
+ const bool HasStmts = I != E;
+ for (; I != E; --I) {
+ if (I->getOpcode() != ARM::tMOVi8) {
+ CanReorder = false;
+ break;
+ }
+ }
+ if (HasStmts && CanReorder) {
+ MI = MI->removeFromParent();
+ E = CmpInstr;
+ CmpInstr.getParent()->insert(E, MI);
+ }
+ I = CmpInstr;
+ E = MI;
+ }
+
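A toy model of the reordering described in the comment above, with strings standing in for MachineInstrs (not MIR): if every instruction between the flag-setting candidate and the compare is a tMOVi8-style move, hoist the candidate so it sits right before the compare and its flags survive.

    #include <algorithm>
    #include <cstdio>
    #include <string>
    #include <vector>

    int main() {
      std::vector<std::string> BB = {"muls", "movs", "movs", "cmp"};
      auto Cmp = BB.begin() + 3;
      const bool AllMovs = std::all_of(
          BB.begin() + 1, Cmp,
          [](const std::string &I) { return I == "movs"; });
      if (AllMovs)
        std::rotate(BB.begin(), BB.begin() + 1, Cmp); // muls now precedes cmp
      for (const auto &I : BB) std::printf("%s ", I.c_str());
      std::printf("\n"); // prints: movs movs muls cmp
    }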
// Check that CPSR isn't set between the comparison instruction and the one we
// want to change. At the same time, search for Sub.
const TargetRegisterInfo *TRI = &getRegisterInfo();
@@ -2497,183 +2643,128 @@ bool ARMBaseInstrInfo::optimizeCompareInstr(
if (isPredicated(*MI))
return false;
- bool IsThumb1 = false;
- switch (MI->getOpcode()) {
- default: break;
- case ARM::tLSLri:
- case ARM::tLSRri:
- case ARM::tLSLrr:
- case ARM::tLSRrr:
- case ARM::tSUBrr:
- case ARM::tADDrr:
- case ARM::tADDi3:
- case ARM::tADDi8:
- case ARM::tSUBi3:
- case ARM::tSUBi8:
- IsThumb1 = true;
- LLVM_FALLTHROUGH;
- case ARM::RSBrr:
- case ARM::RSBri:
- case ARM::RSCrr:
- case ARM::RSCri:
- case ARM::ADDrr:
- case ARM::ADDri:
- case ARM::ADCrr:
- case ARM::ADCri:
- case ARM::SUBrr:
- case ARM::SUBri:
- case ARM::SBCrr:
- case ARM::SBCri:
- case ARM::t2RSBri:
- case ARM::t2ADDrr:
- case ARM::t2ADDri:
- case ARM::t2ADCrr:
- case ARM::t2ADCri:
- case ARM::t2SUBrr:
- case ARM::t2SUBri:
- case ARM::t2SBCrr:
- case ARM::t2SBCri:
- case ARM::ANDrr:
- case ARM::ANDri:
- case ARM::t2ANDrr:
- case ARM::t2ANDri:
- case ARM::ORRrr:
- case ARM::ORRri:
- case ARM::t2ORRrr:
- case ARM::t2ORRri:
- case ARM::EORrr:
- case ARM::EORri:
- case ARM::t2EORrr:
- case ARM::t2EORri:
- case ARM::t2LSRri:
- case ARM::t2LSRrr:
- case ARM::t2LSLri:
- case ARM::t2LSLrr: {
- // Scan forward for the use of CPSR
- // When checking against MI: if it's a conditional code that requires
- // checking of the V bit or C bit, then this is not safe to do.
- // It is safe to remove CmpInstr if CPSR is redefined or killed.
- // If we are done with the basic block, we need to check whether CPSR is
- // live-out.
- SmallVector<std::pair<MachineOperand*, ARMCC::CondCodes>, 4>
- OperandsToUpdate;
- bool isSafe = false;
- I = CmpInstr;
- E = CmpInstr.getParent()->end();
- while (!isSafe && ++I != E) {
- const MachineInstr &Instr = *I;
- for (unsigned IO = 0, EO = Instr.getNumOperands();
- !isSafe && IO != EO; ++IO) {
- const MachineOperand &MO = Instr.getOperand(IO);
- if (MO.isRegMask() && MO.clobbersPhysReg(ARM::CPSR)) {
- isSafe = true;
- break;
- }
- if (!MO.isReg() || MO.getReg() != ARM::CPSR)
- continue;
- if (MO.isDef()) {
- isSafe = true;
- break;
- }
- // Condition code is after the operand before CPSR except for VSELs.
- ARMCC::CondCodes CC;
- bool IsInstrVSel = true;
- switch (Instr.getOpcode()) {
- default:
- IsInstrVSel = false;
- CC = (ARMCC::CondCodes)Instr.getOperand(IO - 1).getImm();
- break;
- case ARM::VSELEQD:
- case ARM::VSELEQS:
- CC = ARMCC::EQ;
- break;
- case ARM::VSELGTD:
- case ARM::VSELGTS:
- CC = ARMCC::GT;
- break;
- case ARM::VSELGED:
- case ARM::VSELGES:
- CC = ARMCC::GE;
- break;
- case ARM::VSELVSS:
- case ARM::VSELVSD:
- CC = ARMCC::VS;
- break;
- }
+ // Scan forward for the use of CPSR
+ // When checking against MI: if it's a conditional code that requires
+ // checking of the V bit or C bit, then this is not safe to do.
+ // It is safe to remove CmpInstr if CPSR is redefined or killed.
+ // If we are done with the basic block, we need to check whether CPSR is
+ // live-out.
+ SmallVector<std::pair<MachineOperand*, ARMCC::CondCodes>, 4>
+ OperandsToUpdate;
+ bool isSafe = false;
+ I = CmpInstr;
+ E = CmpInstr.getParent()->end();
+ while (!isSafe && ++I != E) {
+ const MachineInstr &Instr = *I;
+ for (unsigned IO = 0, EO = Instr.getNumOperands();
+ !isSafe && IO != EO; ++IO) {
+ const MachineOperand &MO = Instr.getOperand(IO);
+ if (MO.isRegMask() && MO.clobbersPhysReg(ARM::CPSR)) {
+ isSafe = true;
+ break;
+ }
+ if (!MO.isReg() || MO.getReg() != ARM::CPSR)
+ continue;
+ if (MO.isDef()) {
+ isSafe = true;
+ break;
+ }
+ // Condition code is after the operand before CPSR except for VSELs.
+ ARMCC::CondCodes CC;
+ bool IsInstrVSel = true;
+ switch (Instr.getOpcode()) {
+ default:
+ IsInstrVSel = false;
+ CC = (ARMCC::CondCodes)Instr.getOperand(IO - 1).getImm();
+ break;
+ case ARM::VSELEQD:
+ case ARM::VSELEQS:
+ CC = ARMCC::EQ;
+ break;
+ case ARM::VSELGTD:
+ case ARM::VSELGTS:
+ CC = ARMCC::GT;
+ break;
+ case ARM::VSELGED:
+ case ARM::VSELGES:
+ CC = ARMCC::GE;
+ break;
+ case ARM::VSELVSS:
+ case ARM::VSELVSD:
+ CC = ARMCC::VS;
+ break;
+ }
- if (Sub) {
- ARMCC::CondCodes NewCC = getSwappedCondition(CC);
- if (NewCC == ARMCC::AL)
- return false;
- // If we have SUB(r1, r2) and CMP(r2, r1), the condition code based
- // on CMP needs to be updated to be based on SUB.
- // Push the condition code operands to OperandsToUpdate.
- // If it is safe to remove CmpInstr, the condition code of these
- // operands will be modified.
- if (SrcReg2 != 0 && Sub->getOperand(1).getReg() == SrcReg2 &&
- Sub->getOperand(2).getReg() == SrcReg) {
- // VSel doesn't support condition code update.
- if (IsInstrVSel)
- return false;
- OperandsToUpdate.push_back(
- std::make_pair(&((*I).getOperand(IO - 1)), NewCC));
- }
- } else {
- // No Sub, so this is x = <op> y, z; cmp x, 0.
- switch (CC) {
- case ARMCC::EQ: // Z
- case ARMCC::NE: // Z
- case ARMCC::MI: // N
- case ARMCC::PL: // N
- case ARMCC::AL: // none
- // CPSR can be used multiple times, we should continue.
- break;
- case ARMCC::HS: // C
- case ARMCC::LO: // C
- case ARMCC::VS: // V
- case ARMCC::VC: // V
- case ARMCC::HI: // C Z
- case ARMCC::LS: // C Z
- case ARMCC::GE: // N V
- case ARMCC::LT: // N V
- case ARMCC::GT: // Z N V
- case ARMCC::LE: // Z N V
- // The instruction uses the V bit or C bit which is not safe.
+ if (Sub) {
+ ARMCC::CondCodes NewCC = getSwappedCondition(CC);
+ if (NewCC == ARMCC::AL)
+ return false;
+ // If we have SUB(r1, r2) and CMP(r2, r1), the condition code based
+ // on CMP needs to be updated to be based on SUB.
+ // Push the condition code operands to OperandsToUpdate.
+ // If it is safe to remove CmpInstr, the condition code of these
+ // operands will be modified.
+ if (SrcReg2 != 0 && Sub->getOperand(1).getReg() == SrcReg2 &&
+ Sub->getOperand(2).getReg() == SrcReg) {
+ // VSel doesn't support condition code update.
+ if (IsInstrVSel)
return false;
- }
+ OperandsToUpdate.push_back(
+ std::make_pair(&((*I).getOperand(IO - 1)), NewCC));
}
- }
- }
-
- // If CPSR is not killed nor re-defined, we should check whether it is
- // live-out. If it is live-out, do not optimize.
- if (!isSafe) {
- MachineBasicBlock *MBB = CmpInstr.getParent();
- for (MachineBasicBlock::succ_iterator SI = MBB->succ_begin(),
- SE = MBB->succ_end(); SI != SE; ++SI)
- if ((*SI)->isLiveIn(ARM::CPSR))
+ } else {
+ // No Sub, so this is x = <op> y, z; cmp x, 0.
+ switch (CC) {
+ case ARMCC::EQ: // Z
+ case ARMCC::NE: // Z
+ case ARMCC::MI: // N
+ case ARMCC::PL: // N
+ case ARMCC::AL: // none
+ // CPSR can be used multiple times, we should continue.
+ break;
+ case ARMCC::HS: // C
+ case ARMCC::LO: // C
+ case ARMCC::VS: // V
+ case ARMCC::VC: // V
+ case ARMCC::HI: // C Z
+ case ARMCC::LS: // C Z
+ case ARMCC::GE: // N V
+ case ARMCC::LT: // N V
+ case ARMCC::GT: // Z N V
+ case ARMCC::LE: // Z N V
+ // The instruction uses the V bit or C bit which is not safe.
return false;
+ }
+ }
}
+ }
- // Toggle the optional operand to CPSR (if it exists - in Thumb1 we always
- // set CPSR so this is represented as an explicit output)
- if (!IsThumb1) {
- MI->getOperand(5).setReg(ARM::CPSR);
- MI->getOperand(5).setIsDef(true);
- }
- assert(!isPredicated(*MI) && "Can't use flags from predicated instruction");
- CmpInstr.eraseFromParent();
-
- // Modify the condition code of operands in OperandsToUpdate.
- // Since we have SUB(r1, r2) and CMP(r2, r1), the condition code needs to
- // be changed from r2 > r1 to r1 < r2, from r2 < r1 to r1 > r2, etc.
- for (unsigned i = 0, e = OperandsToUpdate.size(); i < e; i++)
- OperandsToUpdate[i].first->setImm(OperandsToUpdate[i].second);
- return true;
+  // If CPSR is neither killed nor re-defined, we should check whether it is
+ // live-out. If it is live-out, do not optimize.
+ if (!isSafe) {
+ MachineBasicBlock *MBB = CmpInstr.getParent();
+ for (MachineBasicBlock::succ_iterator SI = MBB->succ_begin(),
+ SE = MBB->succ_end(); SI != SE; ++SI)
+ if ((*SI)->isLiveIn(ARM::CPSR))
+ return false;
}
+
+ // Toggle the optional operand to CPSR (if it exists - in Thumb1 we always
+ // set CPSR so this is represented as an explicit output)
+ if (!IsThumb1) {
+ MI->getOperand(5).setReg(ARM::CPSR);
+ MI->getOperand(5).setIsDef(true);
}
-
- return false;
+ assert(!isPredicated(*MI) && "Can't use flags from predicated instruction");
+ CmpInstr.eraseFromParent();
+
+ // Modify the condition code of operands in OperandsToUpdate.
+ // Since we have SUB(r1, r2) and CMP(r2, r1), the condition code needs to
+ // be changed from r2 > r1 to r1 < r2, from r2 < r1 to r1 > r2, etc.
+ for (unsigned i = 0, e = OperandsToUpdate.size(); i < e; i++)
+ OperandsToUpdate[i].first->setImm(OperandsToUpdate[i].second);
+
+ return true;
}
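
A worked example of the swap handled above, with made-up registers (this sketch is not part of the patch): subs r0, r1, r2 already sets the flags for r1 - r2, so a following cmp r2, r1 is redundant once every consumer of the CMP's flags is rewritten with the swapped condition; a movgt keyed to the CMP becomes a movlt keyed to the SUB. The mapping itself, as a standalone sketch (illustrative enum, not the in-tree ARMCC values; AL doubles as the "no swap exists" answer, which is why the code above bails out when it sees AL):

enum Cond { EQ, NE, GT, GE, LT, LE, HI, HS, LO, LS, AL };

Cond swappedCond(Cond CC) {
  switch (CC) {
  case EQ:
  case NE: return CC; // equality is symmetric under operand swap
  case GT: return LT; // "r2 > r1" becomes "r1 < r2"
  case GE: return LE;
  case LT: return GT;
  case LE: return GE;
  case HI: return LO; // unsigned comparisons swap the same way
  case HS: return LS;
  case LO: return HI;
  case LS: return HS;
  default: return AL; // no usable swapped condition
  }
}
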
bool ARMBaseInstrInfo::FoldImmediate(MachineInstr &UseMI, MachineInstr &DefMI,
@@ -2728,7 +2819,7 @@ bool ARMBaseInstrInfo::FoldImmediate(MachineInstr &UseMI, MachineInstr &DefMI,
switch (UseOpc) {
default: break;
case ARM::ADDrr:
- case ARM::SUBrr: {
+ case ARM::SUBrr:
if (UseOpc == ARM::SUBrr && Commute)
return false;
@@ -2744,9 +2835,8 @@ bool ARMBaseInstrInfo::FoldImmediate(MachineInstr &UseMI, MachineInstr &DefMI,
SOImmValV1 = (uint32_t)ARM_AM::getSOImmTwoPartFirst(ImmVal);
SOImmValV2 = (uint32_t)ARM_AM::getSOImmTwoPartSecond(ImmVal);
break;
- }
case ARM::ORRrr:
- case ARM::EORrr: {
+ case ARM::EORrr:
if (!ARM_AM::isSOImmTwoPartVal(ImmVal))
return false;
SOImmValV1 = (uint32_t)ARM_AM::getSOImmTwoPartFirst(ImmVal);
@@ -2757,9 +2847,8 @@ bool ARMBaseInstrInfo::FoldImmediate(MachineInstr &UseMI, MachineInstr &DefMI,
case ARM::EORrr: NewUseOpc = ARM::EORri; break;
}
break;
- }
case ARM::t2ADDrr:
- case ARM::t2SUBrr: {
+ case ARM::t2SUBrr:
if (UseOpc == ARM::t2SUBrr && Commute)
return false;
@@ -2775,9 +2864,8 @@ bool ARMBaseInstrInfo::FoldImmediate(MachineInstr &UseMI, MachineInstr &DefMI,
SOImmValV1 = (uint32_t)ARM_AM::getT2SOImmTwoPartFirst(ImmVal);
SOImmValV2 = (uint32_t)ARM_AM::getT2SOImmTwoPartSecond(ImmVal);
break;
- }
case ARM::t2ORRrr:
- case ARM::t2EORrr: {
+ case ARM::t2EORrr:
if (!ARM_AM::isT2SOImmTwoPartVal(ImmVal))
return false;
SOImmValV1 = (uint32_t)ARM_AM::getT2SOImmTwoPartFirst(ImmVal);
@@ -2789,7 +2877,6 @@ bool ARMBaseInstrInfo::FoldImmediate(MachineInstr &UseMI, MachineInstr &DefMI,
}
break;
}
- }
}
}
@@ -2797,11 +2884,12 @@ bool ARMBaseInstrInfo::FoldImmediate(MachineInstr &UseMI, MachineInstr &DefMI,
unsigned Reg1 = UseMI.getOperand(OpIdx).getReg();
bool isKill = UseMI.getOperand(OpIdx).isKill();
unsigned NewReg = MRI->createVirtualRegister(MRI->getRegClass(Reg));
- AddDefaultCC(
- AddDefaultPred(BuildMI(*UseMI.getParent(), UseMI, UseMI.getDebugLoc(),
- get(NewUseOpc), NewReg)
- .addReg(Reg1, getKillRegState(isKill))
- .addImm(SOImmValV1)));
+ BuildMI(*UseMI.getParent(), UseMI, UseMI.getDebugLoc(), get(NewUseOpc),
+ NewReg)
+ .addReg(Reg1, getKillRegState(isKill))
+ .addImm(SOImmValV1)
+ .add(predOps(ARMCC::AL))
+ .add(condCodeOp());
UseMI.setDesc(get(NewUseOpc));
UseMI.getOperand(1).setReg(NewReg);
UseMI.getOperand(1).setIsKill();
@@ -3413,7 +3501,7 @@ ARMBaseInstrInfo::getOperandLatency(const InstrItineraryData *ItinData,
case ARM::t2LDMDB:
case ARM::t2LDMIA_UPD:
case ARM::t2LDMDB_UPD:
- LdmBypass = 1;
+ LdmBypass = true;
DefCycle = getLDMDefCycle(ItinData, DefMCID, DefClass, DefIdx, DefAlign);
break;
}
@@ -3888,12 +3976,11 @@ ARMBaseInstrInfo::getOperandLatency(const InstrItineraryData *ItinData,
case ARM::t2LDRs:
case ARM::t2LDRBs:
case ARM::t2LDRHs:
- case ARM::t2LDRSHs: {
+ case ARM::t2LDRSHs:
// Thumb2 mode: lsl 0-3 only.
Latency -= 2;
break;
}
- }
}
if (DefAlign < 8 && Subtarget.checkVLDnAccessAlignment())
@@ -4180,14 +4267,14 @@ void ARMBaseInstrInfo::expandLoadStackGuardBase(MachineBasicBlock::iterator MI,
MachineMemOperand::MOInvariant;
MachineMemOperand *MMO = MBB.getParent()->getMachineMemOperand(
MachinePointerInfo::getGOT(*MBB.getParent()), Flags, 4, 4);
- MIB.addMemOperand(MMO);
- AddDefaultPred(MIB);
+ MIB.addMemOperand(MMO).add(predOps(ARMCC::AL));
}
MIB = BuildMI(MBB, MI, DL, get(LoadOpc), Reg);
- MIB.addReg(Reg, RegState::Kill).addImm(0);
- MIB.setMemRefs(MI->memoperands_begin(), MI->memoperands_end());
- AddDefaultPred(MIB);
+ MIB.addReg(Reg, RegState::Kill)
+ .addImm(0)
+ .setMemRefs(MI->memoperands_begin(), MI->memoperands_end())
+ .add(predOps(ARMCC::AL));
}
bool
@@ -4222,6 +4309,7 @@ enum ARMExeDomain {
ExeVFP = 1,
ExeNEON = 2
};
+
//
// Also see ARMInstrFormats.td and Domain* enums in ARMBaseInfo.h
//
@@ -4345,8 +4433,10 @@ void ARMBaseInstrInfo::setExecutionDomain(MachineInstr &MI,
// Change to a %DDst = VORRd %DSrc, %DSrc, 14, %noreg (; implicits)
MI.setDesc(get(ARM::VORRd));
- AddDefaultPred(
- MIB.addReg(DstReg, RegState::Define).addReg(SrcReg).addReg(SrcReg));
+ MIB.addReg(DstReg, RegState::Define)
+ .addReg(SrcReg)
+ .addReg(SrcReg)
+ .add(predOps(ARMCC::AL));
break;
case ARM::VMOVRS:
if (Domain != ExeNEON)
@@ -4366,9 +4456,10 @@ void ARMBaseInstrInfo::setExecutionDomain(MachineInstr &MI,
// Note that DSrc has been widened and the other lane may be undef, which
// contaminates the entire register.
MI.setDesc(get(ARM::VGETLNi32));
- AddDefaultPred(MIB.addReg(DstReg, RegState::Define)
- .addReg(DReg, RegState::Undef)
- .addImm(Lane));
+ MIB.addReg(DstReg, RegState::Define)
+ .addReg(DReg, RegState::Undef)
+ .addImm(Lane)
+ .add(predOps(ARMCC::AL));
  // The old source should be an implicit use; otherwise we might think it
// was dead before here.
@@ -4398,8 +4489,8 @@ void ARMBaseInstrInfo::setExecutionDomain(MachineInstr &MI,
MIB.addReg(DReg, RegState::Define)
.addReg(DReg, getUndefRegState(!MI.readsRegister(DReg, TRI)))
.addReg(SrcReg)
- .addImm(Lane);
- AddDefaultPred(MIB);
+ .addImm(Lane)
+ .add(predOps(ARMCC::AL));
// The narrower destination must be marked as set to keep previous chains
// in place.
@@ -4433,8 +4524,8 @@ void ARMBaseInstrInfo::setExecutionDomain(MachineInstr &MI,
MI.setDesc(get(ARM::VDUPLN32d));
MIB.addReg(DDst, RegState::Define)
.addReg(DDst, getUndefRegState(!MI.readsRegister(DDst, TRI)))
- .addImm(SrcLane);
- AddDefaultPred(MIB);
+ .addImm(SrcLane)
+ .add(predOps(ARMCC::AL));
  // Neither the source nor the destination is naturally represented any
// more, so add them in manually.
@@ -4470,10 +4561,9 @@ void ARMBaseInstrInfo::setExecutionDomain(MachineInstr &MI,
CurReg = SrcLane == 0 && DstLane == 0 ? DSrc : DDst;
CurUndef = !MI.readsRegister(CurReg, TRI);
- NewMIB.addReg(CurReg, getUndefRegState(CurUndef));
-
- NewMIB.addImm(1);
- AddDefaultPred(NewMIB);
+ NewMIB.addReg(CurReg, getUndefRegState(CurUndef))
+ .addImm(1)
+ .add(predOps(ARMCC::AL));
if (SrcLane == DstLane)
NewMIB.addReg(SrcReg, RegState::Implicit);
@@ -4489,10 +4579,9 @@ void ARMBaseInstrInfo::setExecutionDomain(MachineInstr &MI,
CurReg = SrcLane == 0 && DstLane == 1 ? DSrc : DDst;
CurUndef = CurReg == DSrc && !MI.readsRegister(CurReg, TRI);
- MIB.addReg(CurReg, getUndefRegState(CurUndef));
-
- MIB.addImm(1);
- AddDefaultPred(MIB);
+ MIB.addReg(CurReg, getUndefRegState(CurUndef))
+ .addImm(1)
+ .add(predOps(ARMCC::AL));
if (SrcLane != DstLane)
MIB.addReg(SrcReg, RegState::Implicit);
@@ -4505,7 +4594,6 @@ void ARMBaseInstrInfo::setExecutionDomain(MachineInstr &MI,
break;
}
}
-
}
//===----------------------------------------------------------------------===//
@@ -4613,9 +4701,9 @@ void ARMBaseInstrInfo::breakPartialRegDependency(
// Insert the dependency-breaking FCONSTD before MI.
// 96 is the encoding of 0.5, but the actual value doesn't matter here.
- AddDefaultPred(
- BuildMI(*MI.getParent(), MI, MI.getDebugLoc(), get(ARM::FCONSTD), DReg)
- .addImm(96));
+ BuildMI(*MI.getParent(), MI, MI.getDebugLoc(), get(ARM::FCONSTD), DReg)
+ .addImm(96)
+ .add(predOps(ARMCC::AL));
MI.addRegisterKilled(DReg, TRI, true);
}
diff --git a/lib/Target/ARM/ARMBaseInstrInfo.h b/lib/Target/ARM/ARMBaseInstrInfo.h
index b01d5c8ec85f..23777b821f9f 100644
--- a/lib/Target/ARM/ARMBaseInstrInfo.h
+++ b/lib/Target/ARM/ARMBaseInstrInfo.h
@@ -17,16 +17,21 @@
#include "MCTargetDesc/ARMBaseInfo.h"
#include "llvm/ADT/DenseMap.h"
#include "llvm/ADT/SmallSet.h"
+#include "llvm/CodeGen/MachineBasicBlock.h"
+#include "llvm/CodeGen/MachineInstr.h"
#include "llvm/CodeGen/MachineInstrBuilder.h"
-#include "llvm/Support/CodeGen.h"
+#include "llvm/CodeGen/MachineOperand.h"
#include "llvm/Target/TargetInstrInfo.h"
+#include <array>
+#include <cstdint>
#define GET_INSTRINFO_HEADER
#include "ARMGenInstrInfo.inc"
namespace llvm {
- class ARMSubtarget;
- class ARMBaseRegisterInfo;
+
+class ARMBaseRegisterInfo;
+class ARMSubtarget;
class ARMBaseInstrInfo : public ARMGenInstrInfo {
const ARMSubtarget &Subtarget;
@@ -106,7 +111,7 @@ public:
// Return the non-pre/post incrementing version of 'Opc'. Return 0
// if there is no such opcode.
- virtual unsigned getUnindexedOpcode(unsigned Opc) const =0;
+ virtual unsigned getUnindexedOpcode(unsigned Opc) const = 0;
MachineInstr *convertToThreeAddress(MachineFunction::iterator &MFI,
MachineInstr &MI,
@@ -156,7 +161,7 @@ public:
bool DefinesPredicate(MachineInstr &MI,
std::vector<MachineOperand> &Pred) const override;
- bool isPredicable(MachineInstr &MI) const override;
+ bool isPredicable(const MachineInstr &MI) const override;
/// GetInstSize - Returns the size of the specified MachineInstr.
///
@@ -401,25 +406,28 @@ public:
bool isSwiftFastImmShift(const MachineInstr *MI) const;
};
-static inline
-const MachineInstrBuilder &AddDefaultPred(const MachineInstrBuilder &MIB) {
- return MIB.addImm((int64_t)ARMCC::AL).addReg(0);
+/// Get the operands corresponding to the given \p Pred value. By default, the
+/// predicate register is assumed to be 0 (no register), but you can pass in a
+/// \p PredReg if that is not the case.
+static inline std::array<MachineOperand, 2> predOps(ARMCC::CondCodes Pred,
+ unsigned PredReg = 0) {
+ return {{MachineOperand::CreateImm(static_cast<int64_t>(Pred)),
+ MachineOperand::CreateReg(PredReg, false)}};
}
-static inline
-const MachineInstrBuilder &AddDefaultCC(const MachineInstrBuilder &MIB) {
- return MIB.addReg(0);
+/// Get the operand corresponding to the conditional code result. By default,
+/// this is 0 (no register).
+static inline MachineOperand condCodeOp(unsigned CCReg = 0) {
+ return MachineOperand::CreateReg(CCReg, false);
}
-static inline
-const MachineInstrBuilder &AddDefaultT1CC(const MachineInstrBuilder &MIB,
- bool isDead = false) {
- return MIB.addReg(ARM::CPSR, getDefRegState(true) | getDeadRegState(isDead));
-}
-
-static inline
-const MachineInstrBuilder &AddNoT1CC(const MachineInstrBuilder &MIB) {
- return MIB.addReg(0);
+/// Get the operand corresponding to the conditional code result for Thumb1.
+/// This operand will always refer to CPSR and it will have the Define flag set.
+/// You can optionally set the Dead flag by means of \p isDead.
+static inline MachineOperand t1CondCodeOp(bool isDead = false) {
+ return MachineOperand::CreateReg(ARM::CPSR,
+ /*Define*/ true, /*Implicit*/ false,
+ /*Kill*/ false, isDead);
}
static inline
@@ -517,6 +525,6 @@ bool rewriteT2FrameIndex(MachineInstr &MI, unsigned FrameRegIdx,
unsigned FrameReg, int &Offset,
const ARMBaseInstrInfo &TII);
-} // End llvm namespace
+} // end namespace llvm
-#endif
+#endif // LLVM_LIB_TARGET_ARM_ARMBASEINSTRINFO_H
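
For readers tracking the API migration in this patch: the predOps/condCodeOp helpers introduced above compose with MachineInstrBuilder::add, which accepts either a single MachineOperand or an array of them. A minimal before/after sketch (opcode and register names are placeholders, not taken from the patch):

// Old style: wrap the whole builder chain.
//   AddDefaultCC(AddDefaultPred(
//       BuildMI(MBB, MI, DL, TII.get(ARM::ADDri), Dst)
//           .addReg(Src)
//           .addImm(Imm)));
// New style: append the operands where they belong in the chain.
BuildMI(MBB, MI, DL, TII.get(ARM::ADDri), Dst)
    .addReg(Src)
    .addImm(Imm)
    .add(predOps(ARMCC::AL)) // {AL, no predicate register}
    .add(condCodeOp());      // empty optional CPSR def: flags unused
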
diff --git a/lib/Target/ARM/ARMBaseRegisterInfo.cpp b/lib/Target/ARM/ARMBaseRegisterInfo.cpp
index d995c631dd1c..70a44eaaceb8 100644
--- a/lib/Target/ARM/ARMBaseRegisterInfo.cpp
+++ b/lib/Target/ARM/ARMBaseRegisterInfo.cpp
@@ -11,32 +11,42 @@
//
//===----------------------------------------------------------------------===//
-#include "ARMBaseRegisterInfo.h"
#include "ARM.h"
#include "ARMBaseInstrInfo.h"
+#include "ARMBaseRegisterInfo.h"
#include "ARMFrameLowering.h"
#include "ARMMachineFunctionInfo.h"
#include "ARMSubtarget.h"
#include "MCTargetDesc/ARMAddressingModes.h"
+#include "MCTargetDesc/ARMBaseInfo.h"
#include "llvm/ADT/BitVector.h"
#include "llvm/ADT/SmallVector.h"
+#include "llvm/ADT/STLExtras.h"
+#include "llvm/CodeGen/MachineBasicBlock.h"
#include "llvm/CodeGen/MachineConstantPool.h"
#include "llvm/CodeGen/MachineFrameInfo.h"
#include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/CodeGen/MachineInstr.h"
#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/MachineOperand.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
#include "llvm/CodeGen/RegisterScavenging.h"
#include "llvm/CodeGen/VirtRegMap.h"
+#include "llvm/IR/Attributes.h"
#include "llvm/IR/Constants.h"
-#include "llvm/IR/DerivedTypes.h"
+#include "llvm/IR/DebugLoc.h"
#include "llvm/IR/Function.h"
-#include "llvm/IR/LLVMContext.h"
+#include "llvm/IR/Type.h"
+#include "llvm/MC/MCInstrDesc.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/ErrorHandling.h"
#include "llvm/Support/raw_ostream.h"
-#include "llvm/Target/TargetFrameLowering.h"
+#include "llvm/Target/TargetInstrInfo.h"
#include "llvm/Target/TargetMachine.h"
#include "llvm/Target/TargetOptions.h"
+#include "llvm/Target/TargetRegisterInfo.h"
+#include <cassert>
+#include <utility>
#define DEBUG_TYPE "arm-register-info"
@@ -46,7 +56,7 @@
using namespace llvm;
ARMBaseRegisterInfo::ARMBaseRegisterInfo()
- : ARMGenRegisterInfo(ARM::LR, 0, 0, ARM::PC), BasePtr(ARM::R6) {}
+ : ARMGenRegisterInfo(ARM::LR, 0, 0, ARM::PC) {}
static unsigned getFramePointerReg(const ARMSubtarget &STI) {
return STI.useR7AsFramePointer() ? ARM::R7 : ARM::R11;
@@ -140,7 +150,6 @@ ARMBaseRegisterInfo::getSjLjDispatchPreservedMask(const MachineFunction &MF) con
return CSR_FPRegs_RegMask;
}
-
const uint32_t *
ARMBaseRegisterInfo::getThisReturnPreservedMask(const MachineFunction &MF,
CallingConv::ID CC) const {
@@ -425,10 +434,11 @@ void ARMBaseRegisterInfo::emitLoadConstPool(
unsigned Idx = ConstantPool->getConstantPoolIndex(C, 4);
BuildMI(MBB, MBBI, dl, TII.get(ARM::LDRcp))
- .addReg(DestReg, getDefRegState(true), SubIdx)
- .addConstantPoolIndex(Idx)
- .addImm(0).addImm(Pred).addReg(PredReg)
- .setMIFlags(MIFlags);
+ .addReg(DestReg, getDefRegState(true), SubIdx)
+ .addConstantPoolIndex(Idx)
+ .addImm(0)
+ .add(predOps(Pred, PredReg))
+ .setMIFlags(MIFlags);
}
bool ARMBaseRegisterInfo::
@@ -474,26 +484,23 @@ getFrameIndexInstrOffset(const MachineInstr *MI, int Idx) const {
Scale = 4;
break;
}
- case ARMII::AddrMode2: {
+ case ARMII::AddrMode2:
ImmIdx = Idx+2;
InstrOffs = ARM_AM::getAM2Offset(MI->getOperand(ImmIdx).getImm());
if (ARM_AM::getAM2Op(MI->getOperand(ImmIdx).getImm()) == ARM_AM::sub)
InstrOffs = -InstrOffs;
break;
- }
- case ARMII::AddrMode3: {
+ case ARMII::AddrMode3:
ImmIdx = Idx+2;
InstrOffs = ARM_AM::getAM3Offset(MI->getOperand(ImmIdx).getImm());
if (ARM_AM::getAM3Op(MI->getOperand(ImmIdx).getImm()) == ARM_AM::sub)
InstrOffs = -InstrOffs;
break;
- }
- case ARMII::AddrModeT1_s: {
+ case ARMII::AddrModeT1_s:
ImmIdx = Idx+1;
InstrOffs = MI->getOperand(ImmIdx).getImm();
Scale = 4;
break;
- }
default:
llvm_unreachable("Unsupported addressing mode!");
}
@@ -609,7 +616,7 @@ materializeFrameBaseRegister(MachineBasicBlock *MBB,
.addFrameIndex(FrameIdx).addImm(Offset);
if (!AFI->isThumb1OnlyFunction())
- AddDefaultCC(AddDefaultPred(MIB));
+ MIB.add(predOps(ARMCC::AL)).add(condCodeOp());
}
void ARMBaseRegisterInfo::resolveFrameIndex(MachineInstr &MI, unsigned BaseReg,
@@ -636,7 +643,7 @@ void ARMBaseRegisterInfo::resolveFrameIndex(MachineInstr &MI, unsigned BaseReg,
assert(AFI->isThumb2Function());
Done = rewriteT2FrameIndex(MI, i, BaseReg, Off, TII);
}
- assert (Done && "Unable to resolve frame index!");
+ assert(Done && "Unable to resolve frame index!");
(void)Done;
}
diff --git a/lib/Target/ARM/ARMBaseRegisterInfo.h b/lib/Target/ARM/ARMBaseRegisterInfo.h
index 330e1535e863..2e91d9d4be24 100644
--- a/lib/Target/ARM/ARMBaseRegisterInfo.h
+++ b/lib/Target/ARM/ARMBaseRegisterInfo.h
@@ -15,24 +15,33 @@
#define LLVM_LIB_TARGET_ARM_ARMBASEREGISTERINFO_H
#include "MCTargetDesc/ARMBaseInfo.h"
+#include "llvm/CodeGen/MachineBasicBlock.h"
+#include "llvm/CodeGen/MachineInstr.h"
+#include "llvm/IR/CallingConv.h"
+#include "llvm/MC/MCRegisterInfo.h"
#include "llvm/Target/TargetRegisterInfo.h"
+#include <cstdint>
#define GET_REGINFO_HEADER
#include "ARMGenRegisterInfo.inc"
namespace llvm {
+
/// Register allocation hints.
namespace ARMRI {
+
enum {
RegPairOdd = 1,
RegPairEven = 2
};
-}
+
+} // end namespace ARMRI
/// isARMArea1Register - Returns true if the register is a low register (r0-r7)
/// or a stack/pc register that we should push/pop.
static inline bool isARMArea1Register(unsigned Reg, bool isIOS) {
using namespace ARM;
+
switch (Reg) {
case R0: case R1: case R2: case R3:
case R4: case R5: case R6: case R7:
@@ -48,6 +57,7 @@ static inline bool isARMArea1Register(unsigned Reg, bool isIOS) {
static inline bool isARMArea2Register(unsigned Reg, bool isIOS) {
using namespace ARM;
+
switch (Reg) {
case R8: case R9: case R10: case R11: case R12:
// iOS has this second area.
@@ -59,6 +69,7 @@ static inline bool isARMArea2Register(unsigned Reg, bool isIOS) {
static inline bool isARMArea3Register(unsigned Reg, bool isIOS) {
using namespace ARM;
+
switch (Reg) {
case D15: case D14: case D13: case D12:
case D11: case D10: case D9: case D8:
@@ -87,7 +98,7 @@ protected:
/// BasePtr - ARM physical register used as a base ptr in complex stack
/// frames. I.e., when we need a 3rd base, not just SP and FP, due to
/// variable size stack objects.
- unsigned BasePtr;
+ unsigned BasePtr = ARM::R6;
// Can be only subclassed.
explicit ARMBaseRegisterInfo();
@@ -198,4 +209,4 @@ public:
} // end namespace llvm
-#endif
+#endif // LLVM_LIB_TARGET_ARM_ARMBASEREGISTERINFO_H
diff --git a/lib/Target/ARM/ARMBasicBlockInfo.h b/lib/Target/ARM/ARMBasicBlockInfo.h
index 780544f865df..e0cb0aa676a6 100644
--- a/lib/Target/ARM/ARMBasicBlockInfo.h
+++ b/lib/Target/ARM/ARMBasicBlockInfo.h
@@ -14,9 +14,9 @@
#ifndef LLVM_LIB_TARGET_ARM_ARMBASICBLOCKINFO_H
#define LLVM_LIB_TARGET_ARM_ARMBASICBLOCKINFO_H
-#include "ARM.h"
-#include "ARMMachineFunctionInfo.h"
-using namespace llvm;
+#include "llvm/Support/MathExtras.h"
+#include <algorithm>
+#include <cstdint>
namespace llvm {
@@ -44,31 +44,30 @@ struct BasicBlockInfo {
///
/// Because worst case padding is used, the computed offset of an aligned
/// block may not actually be aligned.
- unsigned Offset;
+ unsigned Offset = 0;
/// Size - Size of the basic block in bytes. If the block contains
/// inline assembly, this is a worst case estimate.
///
/// The size does not include any alignment padding whether from the
/// beginning of the block, or from an aligned jump table at the end.
- unsigned Size;
+ unsigned Size = 0;
/// KnownBits - The number of low bits in Offset that are known to be
/// exact. The remaining bits of Offset are an upper bound.
- uint8_t KnownBits;
+ uint8_t KnownBits = 0;
/// Unalign - When non-zero, the block contains instructions (inline asm)
/// of unknown size. The real size may be smaller than Size bytes by a
/// multiple of 1 << Unalign.
- uint8_t Unalign;
+ uint8_t Unalign = 0;
/// PostAlign - When non-zero, the block terminator contains a .align
/// directive, so the end of the block is aligned to 1 << PostAlign
/// bytes.
- uint8_t PostAlign;
+ uint8_t PostAlign = 0;
- BasicBlockInfo() : Offset(0), Size(0), KnownBits(0), Unalign(0),
- PostAlign(0) {}
+ BasicBlockInfo() = default;
/// Compute the number of known offset bits internally to this block.
/// This number should be used to predict worst case padding when
@@ -107,4 +106,4 @@ struct BasicBlockInfo {
} // end namespace llvm
-#endif
+#endif // LLVM_LIB_TARGET_ARM_ARMBASICBLOCKINFO_H
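
The BasicBlockInfo hunk above is a mechanical C++11 modernization: in-class default member initializers replace the hand-written member-initializer list, so the constructor can be defaulted. The pattern in isolation (a generic sketch, not ARM-specific):

#include <cstdint>

struct Example {
  unsigned Offset = 0;   // defaults now live next to the declarations...
  uint8_t KnownBits = 0;
  Example() = default;   // ...so the constructor no longer repeats them
};
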
diff --git a/lib/Target/ARM/ARMCallLowering.cpp b/lib/Target/ARM/ARMCallLowering.cpp
index 52c95b6244ac..94b317a8f986 100644
--- a/lib/Target/ARM/ARMCallLowering.cpp
+++ b/lib/Target/ARM/ARMCallLowering.cpp
@@ -17,7 +17,9 @@
#include "ARMBaseInstrInfo.h"
#include "ARMISelLowering.h"
+#include "ARMSubtarget.h"
+#include "llvm/CodeGen/Analysis.h"
#include "llvm/CodeGen/GlobalISel/MachineIRBuilder.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
@@ -30,25 +32,47 @@ using namespace llvm;
ARMCallLowering::ARMCallLowering(const ARMTargetLowering &TLI)
: CallLowering(&TLI) {}
-static bool isSupportedType(const DataLayout DL, const ARMTargetLowering &TLI,
+static bool isSupportedType(const DataLayout &DL, const ARMTargetLowering &TLI,
Type *T) {
- EVT VT = TLI.getValueType(DL, T);
- if (!VT.isSimple() || !VT.isInteger() || VT.isVector())
+ EVT VT = TLI.getValueType(DL, T, true);
+ if (!VT.isSimple() || VT.isVector())
return false;
unsigned VTSize = VT.getSimpleVT().getSizeInBits();
- return VTSize == 8 || VTSize == 16 || VTSize == 32;
+
+ if (VTSize == 64)
+ // FIXME: Support i64 too
+ return VT.isFloatingPoint();
+
+ return VTSize == 1 || VTSize == 8 || VTSize == 16 || VTSize == 32;
}
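
Put differently, the widened check admits i1/i8/i16/i32 scalars plus 64-bit floating point, while i64 and all vector types still fall back to SelectionDAG. Expected outcomes, spelled out as a summary (inferred from the logic above, not a test from the patch):

// isSupportedType(DL, TLI, Type::getInt32Ty(Ctx))  -> true
// isSupportedType(DL, TLI, Type::getInt1Ty(Ctx))   -> true
// isSupportedType(DL, TLI, Type::getDoubleTy(Ctx)) -> true  (f64: VTSize == 64)
// isSupportedType(DL, TLI, Type::getInt64Ty(Ctx))  -> false (see FIXME above)
// isSupportedType(DL, TLI, any vector type)        -> false (VT.isVector())
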
namespace {
-struct FuncReturnHandler : public CallLowering::ValueHandler {
- FuncReturnHandler(MachineIRBuilder &MIRBuilder, MachineRegisterInfo &MRI,
- MachineInstrBuilder &MIB)
- : ValueHandler(MIRBuilder, MRI), MIB(MIB) {}
+/// Helper class for values going out through an ABI boundary (used for handling
+/// function return values and call parameters).
+struct OutgoingValueHandler : public CallLowering::ValueHandler {
+ OutgoingValueHandler(MachineIRBuilder &MIRBuilder, MachineRegisterInfo &MRI,
+ MachineInstrBuilder &MIB, CCAssignFn *AssignFn)
+ : ValueHandler(MIRBuilder, MRI, AssignFn), MIB(MIB), StackSize(0) {}
unsigned getStackAddress(uint64_t Size, int64_t Offset,
MachinePointerInfo &MPO) override {
- llvm_unreachable("Don't know how to get a stack address yet");
+ assert((Size == 1 || Size == 2 || Size == 4 || Size == 8) &&
+ "Unsupported size");
+
+ LLT p0 = LLT::pointer(0, 32);
+ LLT s32 = LLT::scalar(32);
+ unsigned SPReg = MRI.createGenericVirtualRegister(p0);
+ MIRBuilder.buildCopy(SPReg, ARM::SP);
+
+ unsigned OffsetReg = MRI.createGenericVirtualRegister(s32);
+ MIRBuilder.buildConstant(OffsetReg, Offset);
+
+ unsigned AddrReg = MRI.createGenericVirtualRegister(p0);
+ MIRBuilder.buildGEP(AddrReg, SPReg, OffsetReg);
+
+ MPO = MachinePointerInfo::getStack(MIRBuilder.getMF(), Offset);
+ return AddrReg;
}
void assignValueToReg(unsigned ValVReg, unsigned PhysReg,
@@ -56,26 +80,92 @@ struct FuncReturnHandler : public CallLowering::ValueHandler {
assert(VA.isRegLoc() && "Value shouldn't be assigned to reg");
assert(VA.getLocReg() == PhysReg && "Assigning to the wrong reg?");
- assert(VA.getValVT().getSizeInBits() <= 32 && "Unsupported value size");
- assert(VA.getLocVT().getSizeInBits() == 32 && "Unsupported location size");
-
- assert(VA.getLocInfo() != CCValAssign::SExt &&
- VA.getLocInfo() != CCValAssign::ZExt &&
- "ABI extensions not supported yet");
+ assert(VA.getValVT().getSizeInBits() <= 64 && "Unsupported value size");
+ assert(VA.getLocVT().getSizeInBits() <= 64 && "Unsupported location size");
- MIRBuilder.buildCopy(PhysReg, ValVReg);
+ unsigned ExtReg = extendRegister(ValVReg, VA);
+ MIRBuilder.buildCopy(PhysReg, ExtReg);
MIB.addUse(PhysReg, RegState::Implicit);
}
void assignValueToAddress(unsigned ValVReg, unsigned Addr, uint64_t Size,
MachinePointerInfo &MPO, CCValAssign &VA) override {
- llvm_unreachable("Don't know how to assign a value to an address yet");
+ assert((Size == 1 || Size == 2 || Size == 4 || Size == 8) &&
+ "Unsupported size");
+
+ unsigned ExtReg = extendRegister(ValVReg, VA);
+ auto MMO = MIRBuilder.getMF().getMachineMemOperand(
+ MPO, MachineMemOperand::MOStore, VA.getLocVT().getStoreSize(),
+ /* Alignment */ 0);
+ MIRBuilder.buildStore(ExtReg, Addr, *MMO);
+ }
+
+ unsigned assignCustomValue(const CallLowering::ArgInfo &Arg,
+ ArrayRef<CCValAssign> VAs) override {
+ CCValAssign VA = VAs[0];
+ assert(VA.needsCustom() && "Value doesn't need custom handling");
+ assert(VA.getValVT() == MVT::f64 && "Unsupported type");
+
+ CCValAssign NextVA = VAs[1];
+ assert(NextVA.needsCustom() && "Value doesn't need custom handling");
+ assert(NextVA.getValVT() == MVT::f64 && "Unsupported type");
+
+ assert(VA.getValNo() == NextVA.getValNo() &&
+ "Values belong to different arguments");
+
+ assert(VA.isRegLoc() && "Value should be in reg");
+ assert(NextVA.isRegLoc() && "Value should be in reg");
+
+ unsigned NewRegs[] = {MRI.createGenericVirtualRegister(LLT::scalar(32)),
+ MRI.createGenericVirtualRegister(LLT::scalar(32))};
+ MIRBuilder.buildExtract(NewRegs[0], Arg.Reg, 0);
+ MIRBuilder.buildExtract(NewRegs[1], Arg.Reg, 32);
+
+ bool IsLittle = MIRBuilder.getMF().getSubtarget<ARMSubtarget>().isLittle();
+ if (!IsLittle)
+ std::swap(NewRegs[0], NewRegs[1]);
+
+ assignValueToReg(NewRegs[0], VA.getLocReg(), VA);
+ assignValueToReg(NewRegs[1], NextVA.getLocReg(), NextVA);
+
+ return 1;
+ }
+
+ bool assignArg(unsigned ValNo, MVT ValVT, MVT LocVT,
+ CCValAssign::LocInfo LocInfo,
+ const CallLowering::ArgInfo &Info, CCState &State) override {
+ if (AssignFn(ValNo, ValVT, LocVT, LocInfo, Info.Flags, State))
+ return true;
+
+ StackSize =
+ std::max(StackSize, static_cast<uint64_t>(State.getNextStackOffset()));
+ return false;
}
MachineInstrBuilder &MIB;
+ uint64_t StackSize;
};
} // End anonymous namespace.
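
The assignCustomValue override above implements the soft-float calling-convention treatment of f64: the value is split into two 32-bit halves, each assigned to its own location (a pair of GPRs), with the halves swapped on big-endian subtargets. Condensed, the flow is (a recap of the handler above, not new behavior):

// f64 held in Arg.Reg:
//   buildExtract(NewRegs[0], Arg.Reg, 0);   // low 32 bits
//   buildExtract(NewRegs[1], Arg.Reg, 32);  // high 32 bits
//   if (!IsLittle) std::swap(NewRegs[0], NewRegs[1]);
//   assignValueToReg(NewRegs[0], VA.getLocReg(), VA);
//   assignValueToReg(NewRegs[1], NextVA.getLocReg(), NextVA);
// VA and NextVA are consumed together as one logical argument.
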
+void ARMCallLowering::splitToValueTypes(const ArgInfo &OrigArg,
+ SmallVectorImpl<ArgInfo> &SplitArgs,
+ const DataLayout &DL,
+ MachineRegisterInfo &MRI) const {
+ const ARMTargetLowering &TLI = *getTLI<ARMTargetLowering>();
+ LLVMContext &Ctx = OrigArg.Ty->getContext();
+
+ SmallVector<EVT, 4> SplitVTs;
+ SmallVector<uint64_t, 4> Offsets;
+ ComputeValueVTs(TLI, DL, OrigArg.Ty, SplitVTs, &Offsets, 0);
+
+ assert(SplitVTs.size() == 1 && "Unsupported type");
+
+ // Even if there is no splitting to do, we still want to replace the original
+ // type (e.g. pointer type -> integer).
+ SplitArgs.emplace_back(OrigArg.Reg, SplitVTs[0].getTypeForEVT(Ctx),
+ OrigArg.Flags, OrigArg.IsFixed);
+}
+
/// Lower the return value for the already existing \p Ret. This assumes that
/// \p MIRBuilder's insertion point is correct.
bool ARMCallLowering::lowerReturnVal(MachineIRBuilder &MIRBuilder,
@@ -93,21 +183,23 @@ bool ARMCallLowering::lowerReturnVal(MachineIRBuilder &MIRBuilder,
if (!isSupportedType(DL, TLI, Val->getType()))
return false;
+ SmallVector<ArgInfo, 4> SplitVTs;
+ ArgInfo RetInfo(VReg, Val->getType());
+ setArgFlags(RetInfo, AttributeList::ReturnIndex, DL, F);
+ splitToValueTypes(RetInfo, SplitVTs, DL, MF.getRegInfo());
+
CCAssignFn *AssignFn =
TLI.CCAssignFnForReturn(F.getCallingConv(), F.isVarArg());
- ArgInfo RetInfo(VReg, Val->getType());
- setArgFlags(RetInfo, AttributeSet::ReturnIndex, DL, F);
-
- FuncReturnHandler RetHandler(MIRBuilder, MF.getRegInfo(), Ret);
- return handleAssignments(MIRBuilder, AssignFn, RetInfo, RetHandler);
+ OutgoingValueHandler RetHandler(MIRBuilder, MF.getRegInfo(), Ret, AssignFn);
+ return handleAssignments(MIRBuilder, SplitVTs, RetHandler);
}
bool ARMCallLowering::lowerReturn(MachineIRBuilder &MIRBuilder,
const Value *Val, unsigned VReg) const {
assert(!Val == !VReg && "Return value without a vreg");
- auto Ret = AddDefaultPred(MIRBuilder.buildInstrNoInsert(ARM::BX_RET));
+ auto Ret = MIRBuilder.buildInstrNoInsert(ARM::BX_RET).add(predOps(ARMCC::AL));
if (!lowerReturnVal(MIRBuilder, Val, VReg, Ret))
return false;
@@ -117,13 +209,17 @@ bool ARMCallLowering::lowerReturn(MachineIRBuilder &MIRBuilder,
}
namespace {
-struct FormalArgHandler : public CallLowering::ValueHandler {
- FormalArgHandler(MachineIRBuilder &MIRBuilder, MachineRegisterInfo &MRI)
- : ValueHandler(MIRBuilder, MRI) {}
+/// Helper class for values coming in through an ABI boundary (used for handling
+/// formal arguments and call return values).
+struct IncomingValueHandler : public CallLowering::ValueHandler {
+ IncomingValueHandler(MachineIRBuilder &MIRBuilder, MachineRegisterInfo &MRI,
+ CCAssignFn AssignFn)
+ : ValueHandler(MIRBuilder, MRI, AssignFn) {}
unsigned getStackAddress(uint64_t Size, int64_t Offset,
MachinePointerInfo &MPO) override {
- assert(Size == 4 && "Unsupported size");
+ assert((Size == 1 || Size == 2 || Size == 4 || Size == 8) &&
+ "Unsupported size");
auto &MFI = MIRBuilder.getMF().getFrameInfo();
@@ -139,7 +235,17 @@ struct FormalArgHandler : public CallLowering::ValueHandler {
void assignValueToAddress(unsigned ValVReg, unsigned Addr, uint64_t Size,
MachinePointerInfo &MPO, CCValAssign &VA) override {
- assert(Size == 4 && "Unsupported size");
+ assert((Size == 1 || Size == 2 || Size == 4 || Size == 8) &&
+ "Unsupported size");
+
+ if (VA.getLocInfo() == CCValAssign::SExt ||
+ VA.getLocInfo() == CCValAssign::ZExt) {
+ // If the value is zero- or sign-extended, its size becomes 4 bytes, so
+ // that's what we should load.
+ Size = 4;
+ assert(MRI.getType(ValVReg).isScalar() && "Only scalars supported atm");
+ MRI.setType(ValVReg, LLT::scalar(32));
+ }
auto MMO = MIRBuilder.getMF().getMachineMemOperand(
MPO, MachineMemOperand::MOLoad, Size, /* Alignment */ 0);
@@ -151,12 +257,60 @@ struct FormalArgHandler : public CallLowering::ValueHandler {
assert(VA.isRegLoc() && "Value shouldn't be assigned to reg");
assert(VA.getLocReg() == PhysReg && "Assigning to the wrong reg?");
- assert(VA.getValVT().getSizeInBits() <= 32 && "Unsupported value size");
- assert(VA.getLocVT().getSizeInBits() == 32 && "Unsupported location size");
+ assert(VA.getValVT().getSizeInBits() <= 64 && "Unsupported value size");
+ assert(VA.getLocVT().getSizeInBits() <= 64 && "Unsupported location size");
- MIRBuilder.getMBB().addLiveIn(PhysReg);
+    // The necessary extensions are handled on the other side of the ABI
+ // boundary.
+ markPhysRegUsed(PhysReg);
MIRBuilder.buildCopy(ValVReg, PhysReg);
}
+
+ unsigned assignCustomValue(const ARMCallLowering::ArgInfo &Arg,
+ ArrayRef<CCValAssign> VAs) override {
+ CCValAssign VA = VAs[0];
+ assert(VA.needsCustom() && "Value doesn't need custom handling");
+ assert(VA.getValVT() == MVT::f64 && "Unsupported type");
+
+ CCValAssign NextVA = VAs[1];
+ assert(NextVA.needsCustom() && "Value doesn't need custom handling");
+ assert(NextVA.getValVT() == MVT::f64 && "Unsupported type");
+
+ assert(VA.getValNo() == NextVA.getValNo() &&
+ "Values belong to different arguments");
+
+ assert(VA.isRegLoc() && "Value should be in reg");
+ assert(NextVA.isRegLoc() && "Value should be in reg");
+
+ unsigned NewRegs[] = {MRI.createGenericVirtualRegister(LLT::scalar(32)),
+ MRI.createGenericVirtualRegister(LLT::scalar(32))};
+
+ assignValueToReg(NewRegs[0], VA.getLocReg(), VA);
+ assignValueToReg(NewRegs[1], NextVA.getLocReg(), NextVA);
+
+ bool IsLittle = MIRBuilder.getMF().getSubtarget<ARMSubtarget>().isLittle();
+ if (!IsLittle)
+ std::swap(NewRegs[0], NewRegs[1]);
+
+ MIRBuilder.buildSequence(Arg.Reg, NewRegs, {0, 32});
+
+ return 1;
+ }
+
+ /// Marking a physical register as used is different between formal
+ /// parameters, where it's a basic block live-in, and call returns, where it's
+ /// an implicit-def of the call instruction.
+ virtual void markPhysRegUsed(unsigned PhysReg) = 0;
+};
+
+struct FormalArgHandler : public IncomingValueHandler {
+ FormalArgHandler(MachineIRBuilder &MIRBuilder, MachineRegisterInfo &MRI,
+ CCAssignFn AssignFn)
+ : IncomingValueHandler(MIRBuilder, MRI, AssignFn) {}
+
+ void markPhysRegUsed(unsigned PhysReg) override {
+ MIRBuilder.getMBB().addLiveIn(PhysReg);
+ }
};
} // End anonymous namespace
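
markPhysRegUsed is the one point where the two incoming-value cases differ, and the contrast is worth spelling out (both overrides appear in this patch; nothing new here):

// FormalArgHandler: the register is live into the function's entry block.
//   void markPhysRegUsed(unsigned PhysReg) override {
//     MIRBuilder.getMBB().addLiveIn(PhysReg);
//   }
// CallReturnHandler, further down: the register is defined by the call.
//   void markPhysRegUsed(unsigned PhysReg) override {
//     MIB.addDef(PhysReg, RegState::Implicit);
//   }
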
@@ -170,34 +324,111 @@ bool ARMCallLowering::lowerFormalArguments(MachineIRBuilder &MIRBuilder,
if (F.isVarArg())
return false;
- auto DL = MIRBuilder.getMF().getDataLayout();
+ auto &MF = MIRBuilder.getMF();
+ auto DL = MF.getDataLayout();
auto &TLI = *getTLI<ARMTargetLowering>();
- auto &Args = F.getArgumentList();
- unsigned ArgIdx = 0;
- for (auto &Arg : Args) {
- ArgIdx++;
- if (!isSupportedType(DL, TLI, Arg.getType()))
- return false;
+ auto Subtarget = TLI.getSubtarget();
- // FIXME: This check as well as ArgIdx are going away as soon as we support
- // loading values < 32 bits.
- if (ArgIdx > 4 && Arg.getType()->getIntegerBitWidth() != 32)
+ if (Subtarget->isThumb())
+ return false;
+
+ for (auto &Arg : F.args())
+ if (!isSupportedType(DL, TLI, Arg.getType()))
return false;
- }
CCAssignFn *AssignFn =
TLI.CCAssignFnForCall(F.getCallingConv(), F.isVarArg());
SmallVector<ArgInfo, 8> ArgInfos;
unsigned Idx = 0;
- for (auto &Arg : Args) {
+ for (auto &Arg : F.args()) {
ArgInfo AInfo(VRegs[Idx], Arg.getType());
setArgFlags(AInfo, Idx + 1, DL, F);
- ArgInfos.push_back(AInfo);
+ splitToValueTypes(AInfo, ArgInfos, DL, MF.getRegInfo());
Idx++;
}
- FormalArgHandler ArgHandler(MIRBuilder, MIRBuilder.getMF().getRegInfo());
- return handleAssignments(MIRBuilder, AssignFn, ArgInfos, ArgHandler);
+ FormalArgHandler ArgHandler(MIRBuilder, MIRBuilder.getMF().getRegInfo(),
+ AssignFn);
+ return handleAssignments(MIRBuilder, ArgInfos, ArgHandler);
+}
+
+namespace {
+struct CallReturnHandler : public IncomingValueHandler {
+ CallReturnHandler(MachineIRBuilder &MIRBuilder, MachineRegisterInfo &MRI,
+ MachineInstrBuilder MIB, CCAssignFn *AssignFn)
+ : IncomingValueHandler(MIRBuilder, MRI, AssignFn), MIB(MIB) {}
+
+ void markPhysRegUsed(unsigned PhysReg) override {
+ MIB.addDef(PhysReg, RegState::Implicit);
+ }
+
+ MachineInstrBuilder MIB;
+};
+} // End anonymous namespace.
+
+bool ARMCallLowering::lowerCall(MachineIRBuilder &MIRBuilder,
+ CallingConv::ID CallConv,
+ const MachineOperand &Callee,
+ const ArgInfo &OrigRet,
+ ArrayRef<ArgInfo> OrigArgs) const {
+ MachineFunction &MF = MIRBuilder.getMF();
+ const auto &TLI = *getTLI<ARMTargetLowering>();
+ const auto &DL = MF.getDataLayout();
+ const TargetRegisterInfo *TRI = MF.getSubtarget().getRegisterInfo();
+ MachineRegisterInfo &MRI = MF.getRegInfo();
+
+ if (MF.getSubtarget<ARMSubtarget>().genLongCalls())
+ return false;
+
+ auto CallSeqStart = MIRBuilder.buildInstr(ARM::ADJCALLSTACKDOWN);
+
+ // Create the call instruction so we can add the implicit uses of arg
+ // registers, but don't insert it yet.
+ auto MIB = MIRBuilder.buildInstrNoInsert(ARM::BLX).add(Callee).addRegMask(
+ TRI->getCallPreservedMask(MF, CallConv));
+
+ SmallVector<ArgInfo, 8> ArgInfos;
+ for (auto Arg : OrigArgs) {
+ if (!isSupportedType(DL, TLI, Arg.Ty))
+ return false;
+
+ if (!Arg.IsFixed)
+ return false;
+
+ splitToValueTypes(Arg, ArgInfos, DL, MRI);
+ }
+
+ auto ArgAssignFn = TLI.CCAssignFnForCall(CallConv, /*IsVarArg=*/false);
+ OutgoingValueHandler ArgHandler(MIRBuilder, MRI, MIB, ArgAssignFn);
+ if (!handleAssignments(MIRBuilder, ArgInfos, ArgHandler))
+ return false;
+
+ // Now we can add the actual call instruction to the correct basic block.
+ MIRBuilder.insertInstr(MIB);
+
+ if (!OrigRet.Ty->isVoidTy()) {
+ if (!isSupportedType(DL, TLI, OrigRet.Ty))
+ return false;
+
+ ArgInfos.clear();
+ splitToValueTypes(OrigRet, ArgInfos, DL, MRI);
+
+ auto RetAssignFn = TLI.CCAssignFnForReturn(CallConv, /*IsVarArg=*/false);
+ CallReturnHandler RetHandler(MIRBuilder, MRI, MIB, RetAssignFn);
+ if (!handleAssignments(MIRBuilder, ArgInfos, RetHandler))
+ return false;
+ }
+
+ // We now know the size of the stack - update the ADJCALLSTACKDOWN
+ // accordingly.
+ CallSeqStart.addImm(ArgHandler.StackSize).add(predOps(ARMCC::AL));
+
+ MIRBuilder.buildInstr(ARM::ADJCALLSTACKUP)
+ .addImm(ArgHandler.StackSize)
+ .addImm(0)
+ .add(predOps(ARMCC::AL));
+
+ return true;
}
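
Note the ordering constraint lowerCall works around: the outgoing stack size is only known after the argument handler has run, so ADJCALLSTACKDOWN is built first and its immediate appended afterwards, bracketing the call roughly as follows (simplified pseudo-MIR, a sketch rather than exact output):

// ADJCALLSTACKDOWN StackSize, pred:AL    ; reserve the outgoing-arg area
//   ...argument copies/stores from OutgoingValueHandler...
// BLX callee, regmask, implicit defs     ; the call itself
// ADJCALLSTACKUP StackSize, 0, pred:AL   ; release the area
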
diff --git a/lib/Target/ARM/ARMCallLowering.h b/lib/Target/ARM/ARMCallLowering.h
index 6a1b886b501f..6404c7a2689e 100644
--- a/lib/Target/ARM/ARMCallLowering.h
+++ b/lib/Target/ARM/ARMCallLowering.h
@@ -34,9 +34,19 @@ public:
bool lowerFormalArguments(MachineIRBuilder &MIRBuilder, const Function &F,
ArrayRef<unsigned> VRegs) const override;
+ bool lowerCall(MachineIRBuilder &MIRBuilder, CallingConv::ID CallConv,
+ const MachineOperand &Callee, const ArgInfo &OrigRet,
+ ArrayRef<ArgInfo> OrigArgs) const override;
+
private:
bool lowerReturnVal(MachineIRBuilder &MIRBuilder, const Value *Val,
unsigned VReg, MachineInstrBuilder &Ret) const;
+
+ /// Split an argument into one or more arguments that the CC lowering can cope
+ /// with (e.g. replace pointers with integers).
+ void splitToValueTypes(const ArgInfo &OrigArg,
+ SmallVectorImpl<ArgInfo> &SplitArgs,
+ const DataLayout &DL, MachineRegisterInfo &MRI) const;
};
} // End of namespace llvm
#endif
diff --git a/lib/Target/ARM/ARMComputeBlockSize.cpp b/lib/Target/ARM/ARMComputeBlockSize.cpp
index 64f187d17e64..e145d0a49ae6 100644
--- a/lib/Target/ARM/ARMComputeBlockSize.cpp
+++ b/lib/Target/ARM/ARMComputeBlockSize.cpp
@@ -8,7 +8,15 @@
//===----------------------------------------------------------------------===//
#include "ARM.h"
+#include "ARMBaseInstrInfo.h"
#include "ARMBasicBlockInfo.h"
+#include "ARMMachineFunctionInfo.h"
+#include "llvm/CodeGen/MachineBasicBlock.h"
+#include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/CodeGen/MachineInstr.h"
+#include "llvm/Target/TargetSubtargetInfo.h"
+#include <vector>
+
using namespace llvm;
namespace llvm {
@@ -69,4 +77,4 @@ std::vector<BasicBlockInfo> computeAllBlockSizes(MachineFunction *MF) {
return BBInfo;
}
-} // end namespace
+} // end namespace llvm
diff --git a/lib/Target/ARM/ARMConstantIslandPass.cpp b/lib/Target/ARM/ARMConstantIslandPass.cpp
index be1a37e3e362..23722f1b7f3f 100644
--- a/lib/Target/ARM/ARMConstantIslandPass.cpp
+++ b/lib/Target/ARM/ARMConstantIslandPass.cpp
@@ -14,30 +14,50 @@
//===----------------------------------------------------------------------===//
#include "ARM.h"
+#include "ARMBaseInstrInfo.h"
#include "ARMBasicBlockInfo.h"
#include "ARMMachineFunctionInfo.h"
-#include "MCTargetDesc/ARMAddressingModes.h"
+#include "ARMSubtarget.h"
+#include "MCTargetDesc/ARMBaseInfo.h"
#include "Thumb2InstrInfo.h"
-#include "llvm/ADT/STLExtras.h"
+#include "llvm/ADT/DenseMap.h"
#include "llvm/ADT/SmallSet.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/ADT/Statistic.h"
+#include "llvm/ADT/STLExtras.h"
+#include "llvm/ADT/StringRef.h"
+#include "llvm/CodeGen/MachineBasicBlock.h"
#include "llvm/CodeGen/MachineConstantPool.h"
+#include "llvm/CodeGen/MachineFunction.h"
#include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/CodeGen/MachineInstr.h"
#include "llvm/CodeGen/MachineJumpTableInfo.h"
+#include "llvm/CodeGen/MachineOperand.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
#include "llvm/IR/DataLayout.h"
+#include "llvm/IR/DebugLoc.h"
+#include "llvm/MC/MCInstrDesc.h"
#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/Compiler.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/ErrorHandling.h"
#include "llvm/Support/Format.h"
+#include "llvm/Support/MathExtras.h"
#include "llvm/Support/raw_ostream.h"
-#include "llvm/Target/TargetMachine.h"
#include <algorithm>
+#include <cassert>
+#include <cstdint>
+#include <iterator>
+#include <new>
+#include <utility>
+#include <vector>
+
using namespace llvm;
#define DEBUG_TYPE "arm-cp-islands"
+#define ARM_CP_ISLANDS_OPT_NAME \
+ "ARM constant island placement and branch shortening pass"
STATISTIC(NumCPEs, "Number of constpool entries");
STATISTIC(NumSplit, "Number of uncond branches inserted");
STATISTIC(NumCBrFixed, "Number of cond branches fixed");
@@ -49,7 +69,6 @@ STATISTIC(NumCBZ, "Number of CBZ / CBNZ formed");
STATISTIC(NumJTMoved, "Number of jump table destination blocks moved");
STATISTIC(NumJTInserted, "Number of jump table intermediate blocks inserted");
-
static cl::opt<bool>
AdjustJumpTableBlocks("arm-adjust-jump-tables", cl::Hidden, cl::init(true),
cl::desc("Adjust basic block layout to better use TB[BH]"));
@@ -64,6 +83,7 @@ static cl::opt<bool> SynthesizeThumb1TBB(
"equivalent to the TBB/TBH instructions"));
namespace {
+
/// ARMConstantIslands - Due to limited PC-relative displacements, ARM
/// requires constant pool entries to be scattered among the instructions
/// inside a function. To do this, it completely ignores the normal LLVM
@@ -76,7 +96,6 @@ namespace {
/// CPE - A constant pool entry that has been placed somewhere, which
/// tracks a list of users.
class ARMConstantIslands : public MachineFunctionPass {
-
std::vector<BasicBlockInfo> BBInfo;
/// WaterList - A sorted list of basic blocks where islands could be placed
@@ -110,12 +129,14 @@ namespace {
bool NegOk;
bool IsSoImm;
bool KnownAlignment;
+
CPUser(MachineInstr *mi, MachineInstr *cpemi, unsigned maxdisp,
bool neg, bool soimm)
: MI(mi), CPEMI(cpemi), MaxDisp(maxdisp), NegOk(neg), IsSoImm(soimm),
KnownAlignment(false) {
HighWaterMark = CPEMI->getParent();
}
+
/// getMaxDisp - Returns the maximum displacement supported by MI.
/// Correct for unknown alignment.
/// Conservatively subtract 2 bytes to handle weird alignment effects.
@@ -135,6 +156,7 @@ namespace {
MachineInstr *CPEMI;
unsigned CPI;
unsigned RefCount;
+
CPEntry(MachineInstr *cpemi, unsigned cpi, unsigned rc = 0)
: CPEMI(cpemi), CPI(cpi), RefCount(rc) {}
};
@@ -148,7 +170,7 @@ namespace {
/// The first half of CPEntries contains generic constants, the second half
/// contains jump tables. Use getCombinedIndex on a generic CPEMI to look up
/// which vector it will be in here.
- std::vector<std::vector<CPEntry> > CPEntries;
+ std::vector<std::vector<CPEntry>> CPEntries;
/// Maps a JT index to the offset in CPEntries containing copies of that
/// table. The equivalent map for a CONSTPOOL_ENTRY is the identity.
@@ -167,6 +189,7 @@ namespace {
unsigned MaxDisp : 31;
bool isCond : 1;
unsigned UncondBr;
+
ImmBranch(MachineInstr *mi, unsigned maxdisp, bool cond, unsigned ubr)
: MI(mi), MaxDisp(maxdisp), isCond(cond), UncondBr(ubr) {}
};
@@ -195,8 +218,10 @@ namespace {
bool isThumb1;
bool isThumb2;
bool isPositionIndependentOrROPI;
+
public:
static char ID;
+
ARMConstantIslands() : MachineFunctionPass(ID) {}
bool runOnMachineFunction(MachineFunction &MF) override;
@@ -207,7 +232,7 @@ namespace {
}
StringRef getPassName() const override {
- return "ARM constant island placement and branch shortening pass";
+ return ARM_CP_ISLANDS_OPT_NAME;
}
private:
@@ -264,8 +289,10 @@ namespace {
U.getMaxDisp(), U.NegOk, U.IsSoImm);
}
};
+
char ARMConstantIslands::ID = 0;
-}
+
+} // end anonymous namespace
/// verify - check BBOffsets, BBSizes, alignment of islands
void ARMConstantIslands::verify() {
@@ -295,8 +322,9 @@ void ARMConstantIslands::verify() {
#endif
}
+#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
/// print block size and offset information - debugging
-void ARMConstantIslands::dumpBBs() {
+LLVM_DUMP_METHOD void ARMConstantIslands::dumpBBs() {
DEBUG({
for (unsigned J = 0, E = BBInfo.size(); J !=E; ++J) {
const BasicBlockInfo &BBI = BBInfo[J];
@@ -308,12 +336,7 @@ void ARMConstantIslands::dumpBBs() {
}
});
}
-
-/// createARMConstantIslandPass - returns an instance of the constpool
-/// island pass.
-FunctionPass *llvm::createARMConstantIslandPass() {
- return new ARMConstantIslands();
-}
+#endif
bool ARMConstantIslands::runOnMachineFunction(MachineFunction &mf) {
MF = &mf;
@@ -782,6 +805,7 @@ initializeFunctionInfo(const std::vector<MachineInstr*> &CPEMIs) {
case ARM::LDRcp:
case ARM::t2LDRpci:
case ARM::t2LDRHpci:
+ case ARM::t2LDRBpci:
Bits = 12; // +-offset_12
NegOk = true;
break;
@@ -873,7 +897,6 @@ void ARMConstantIslands::updateForInsertedWaterBlock(MachineBasicBlock *NewBB) {
WaterList.insert(IP, NewBB);
}
-
/// Split the basic block containing MI into two blocks, which are joined by
/// an unconditional branch. Update data structures and renumber blocks to
/// account for this change and returns the newly created block.
@@ -897,8 +920,9 @@ MachineBasicBlock *ARMConstantIslands::splitBlockBeforeInstr(MachineInstr *MI) {
if (!isThumb)
BuildMI(OrigBB, DebugLoc(), TII->get(Opc)).addMBB(NewBB);
else
- BuildMI(OrigBB, DebugLoc(), TII->get(Opc)).addMBB(NewBB)
- .addImm(ARMCC::AL).addReg(0);
+ BuildMI(OrigBB, DebugLoc(), TII->get(Opc))
+ .addMBB(NewBB)
+ .add(predOps(ARMCC::AL));
++NumSplit;
// Update the CFG. All succs of OrigBB are now succs of NewBB.
@@ -1296,8 +1320,9 @@ void ARMConstantIslands::createNewWater(unsigned CPUserIndex,
if (!isThumb)
BuildMI(UserMBB, DebugLoc(), TII->get(UncondBr)).addMBB(NewMBB);
else
- BuildMI(UserMBB, DebugLoc(), TII->get(UncondBr)).addMBB(NewMBB)
- .addImm(ARMCC::AL).addReg(0);
+ BuildMI(UserMBB, DebugLoc(), TII->get(UncondBr))
+ .addMBB(NewMBB)
+ .add(predOps(ARMCC::AL));
unsigned MaxDisp = getUnconditionalBrDisp(UncondBr);
ImmBranches.push_back(ImmBranch(&UserMBB->back(),
MaxDisp, false, UncondBr));
@@ -1477,7 +1502,9 @@ bool ARMConstantIslands::handleConstantPoolUser(unsigned CPUserIndex,
// add it to the island.
U.HighWaterMark = NewIsland;
U.CPEMI = BuildMI(NewIsland, DebugLoc(), CPEMI->getDesc())
- .addImm(ID).addOperand(CPEMI->getOperand(1)).addImm(Size);
+ .addImm(ID)
+ .add(CPEMI->getOperand(1))
+ .addImm(Size);
CPEntries[CPI].push_back(CPEntry(U.CPEMI, ID, 1));
++NumCPEs;
@@ -1681,8 +1708,9 @@ ARMConstantIslands::fixupConditionalBr(ImmBranch &Br) {
Br.MI = &MBB->back();
BBInfo[MBB->getNumber()].Size += TII->getInstSizeInBytes(MBB->back());
if (isThumb)
- BuildMI(MBB, DebugLoc(), TII->get(Br.UncondBr)).addMBB(DestBB)
- .addImm(ARMCC::AL).addReg(0);
+ BuildMI(MBB, DebugLoc(), TII->get(Br.UncondBr))
+ .addMBB(DestBB)
+ .add(predOps(ARMCC::AL));
else
BuildMI(MBB, DebugLoc(), TII->get(Br.UncondBr)).addMBB(DestBB);
BBInfo[MBB->getNumber()].Size += TII->getInstSizeInBytes(MBB->back());
@@ -1709,8 +1737,15 @@ bool ARMConstantIslands::undoLRSpillRestore() {
MI->getNumExplicitOperands() == 3) {
// Create the new insn and copy the predicate from the old.
BuildMI(MI->getParent(), MI->getDebugLoc(), TII->get(ARM::tBX_RET))
- .addOperand(MI->getOperand(0))
- .addOperand(MI->getOperand(1));
+ .add(MI->getOperand(0))
+ .add(MI->getOperand(1));
+ MI->eraseFromParent();
+ MadeChange = true;
+ }
+ if (MI->getOpcode() == ARM::tPUSH &&
+ MI->getOperand(2).getReg() == ARM::LR &&
+ MI->getNumExplicitOperands() == 3) {
+ // Just remove the push.
MI->eraseFromParent();
MadeChange = true;
}
@@ -1792,13 +1827,12 @@ bool ARMConstantIslands::optimizeThumb2Branches() {
Bits = 11;
Scale = 2;
break;
- case ARM::t2Bcc: {
+ case ARM::t2Bcc:
NewOpc = ARM::tBcc;
Bits = 8;
Scale = 2;
break;
}
- }
if (NewOpc) {
unsigned MaxOffs = ((1 << (Bits-1))-1) * Scale;
MachineBasicBlock *DestBB = Br.MI->getOperand(0).getMBB();
@@ -1983,6 +2017,54 @@ static bool jumpTableFollowsTB(MachineInstr *JTMI, MachineInstr *CPEMI) {
&*MBB->begin() == CPEMI;
}
+static void RemoveDeadAddBetweenLEAAndJT(MachineInstr *LEAMI,
+ MachineInstr *JumpMI,
+ unsigned &DeadSize) {
+ // Remove a dead add between the LEA and JT, which used to compute EntryReg,
+ // but the JT now uses PC. Finds the last ADD (if any) that def's EntryReg
+ // and is not clobbered / used.
+ MachineInstr *RemovableAdd = nullptr;
+ unsigned EntryReg = JumpMI->getOperand(0).getReg();
+
+ // Find the last ADD to set EntryReg
+ MachineBasicBlock::iterator I(LEAMI);
+ for (++I; &*I != JumpMI; ++I) {
+ if (I->getOpcode() == ARM::t2ADDrs && I->getOperand(0).getReg() == EntryReg)
+ RemovableAdd = &*I;
+ }
+
+ if (!RemovableAdd)
+ return;
+
+ // Ensure EntryReg is not clobbered or used.
+ MachineBasicBlock::iterator J(RemovableAdd);
+ for (++J; &*J != JumpMI; ++J) {
+ for (unsigned K = 0, E = J->getNumOperands(); K != E; ++K) {
+ const MachineOperand &MO = J->getOperand(K);
+ if (!MO.isReg() || !MO.getReg())
+ continue;
+ if (MO.isDef() && MO.getReg() == EntryReg)
+ return;
+ if (MO.isUse() && MO.getReg() == EntryReg)
+ return;
+ }
+ }
+
+ DEBUG(dbgs() << "Removing Dead Add: " << *RemovableAdd);
+ RemovableAdd->eraseFromParent();
+ DeadSize += 4;
+}
+
+static bool registerDefinedBetween(unsigned Reg,
+ MachineBasicBlock::iterator From,
+ MachineBasicBlock::iterator To,
+ const TargetRegisterInfo *TRI) {
+ for (auto I = From; I != To; ++I)
+ if (I->modifiesRegister(Reg, TRI))
+ return true;
+ return false;
+}
+
/// optimizeThumb2JumpTables - Use tbb / tbh instructions to generate smaller
/// jumptables when it's possible.
bool ARMConstantIslands::optimizeThumb2JumpTables() {
@@ -2060,6 +2142,12 @@ bool ARMConstantIslands::optimizeThumb2JumpTables() {
IdxReg = Shift->getOperand(2).getReg();
unsigned ShiftedIdxReg = Shift->getOperand(0).getReg();
+ // It's important that IdxReg is live until the actual TBB/TBH. Most of
+ // the range is checked later, but the LEA might still clobber it and not
+ // actually get removed.
+ if (BaseReg == IdxReg && !jumpTableFollowsTB(MI, User.CPEMI))
+ continue;
+
MachineInstr *Load = User.MI->getNextNode();
if (Load->getOpcode() != ARM::tLDRr)
continue;
@@ -2069,6 +2157,7 @@ bool ARMConstantIslands::optimizeThumb2JumpTables() {
continue;
// If we're in PIC mode, there should be another ADD following.
+ auto *TRI = STI->getRegisterInfo();
if (isPositionIndependentOrROPI) {
MachineInstr *Add = Load->getNextNode();
if (Add->getOpcode() != ARM::tADDrr ||
@@ -2078,22 +2167,26 @@ bool ARMConstantIslands::optimizeThumb2JumpTables() {
continue;
if (Add->getOperand(0).getReg() != MI->getOperand(0).getReg())
continue;
-
+ if (registerDefinedBetween(IdxReg, Add->getNextNode(), MI, TRI))
+ // IdxReg gets redefined in the middle of the sequence.
+ continue;
Add->eraseFromParent();
DeadSize += 2;
} else {
if (Load->getOperand(0).getReg() != MI->getOperand(0).getReg())
continue;
+ if (registerDefinedBetween(IdxReg, Load->getNextNode(), MI, TRI))
+ // IdxReg gets redefined in the middle of the sequence.
+ continue;
}
-
-
+
// Now safe to delete the load and lsl. The LEA will be removed later.
CanDeleteLEA = true;
Shift->eraseFromParent();
Load->eraseFromParent();
DeadSize += 4;
}
-
+
DEBUG(dbgs() << "Shrink JT: " << *MI);
MachineInstr *CPEMI = User.CPEMI;
unsigned Opc = ByteOk ? ARM::t2TBB_JT : ARM::t2TBH_JT;
@@ -2117,7 +2210,10 @@ bool ARMConstantIslands::optimizeThumb2JumpTables() {
NewJTMI->getOperand(0).setReg(ARM::PC);
NewJTMI->getOperand(0).setIsKill(false);
- if (CanDeleteLEA) {
+ if (CanDeleteLEA) {
+ if (isThumb2)
+ RemoveDeadAddBetweenLEAAndJT(User.MI, MI, DeadSize);
+
User.MI->eraseFromParent();
DeadSize += isThumb2 ? 4 : 2;
@@ -2238,13 +2334,11 @@ adjustJTTargetBlockForward(MachineBasicBlock *BB, MachineBasicBlock *JTBB) {
if (isThumb2)
BuildMI(NewBB, DebugLoc(), TII->get(ARM::t2B))
.addMBB(BB)
- .addImm(ARMCC::AL)
- .addReg(0);
+ .add(predOps(ARMCC::AL));
else
BuildMI(NewBB, DebugLoc(), TII->get(ARM::tB))
.addMBB(BB)
- .addImm(ARMCC::AL)
- .addReg(0);
+ .add(predOps(ARMCC::AL));
// Update internal data structures to account for the newly inserted MBB.
MF->RenumberBlocks(NewBB);
@@ -2256,3 +2350,12 @@ adjustJTTargetBlockForward(MachineBasicBlock *BB, MachineBasicBlock *JTBB) {
++NumJTInserted;
return NewBB;
}
+
+/// createARMConstantIslandPass - returns an instance of the constpool
+/// island pass.
+FunctionPass *llvm::createARMConstantIslandPass() {
+ return new ARMConstantIslands();
+}
+
+INITIALIZE_PASS(ARMConstantIslands, "arm-cp-islands", ARM_CP_ISLANDS_OPT_NAME,
+ false, false)
diff --git a/lib/Target/ARM/ARMConstantPoolValue.cpp b/lib/Target/ARM/ARMConstantPoolValue.cpp
index 2d1602873ce0..9705c8b718b7 100644
--- a/lib/Target/ARM/ARMConstantPoolValue.cpp
+++ b/lib/Target/ARM/ARMConstantPoolValue.cpp
@@ -13,13 +13,17 @@
#include "ARMConstantPoolValue.h"
#include "llvm/ADT/FoldingSet.h"
+#include "llvm/ADT/StringRef.h"
#include "llvm/CodeGen/MachineBasicBlock.h"
#include "llvm/IR/Constant.h"
#include "llvm/IR/Constants.h"
#include "llvm/IR/GlobalValue.h"
#include "llvm/IR/Type.h"
+#include "llvm/Support/Casting.h"
+#include "llvm/Support/Compiler.h"
+#include "llvm/Support/ErrorHandling.h"
#include "llvm/Support/raw_ostream.h"
-#include <cstdlib>
+
using namespace llvm;
//===----------------------------------------------------------------------===//
@@ -44,7 +48,7 @@ ARMConstantPoolValue::ARMConstantPoolValue(LLVMContext &C, unsigned id,
LabelId(id), Kind(kind), PCAdjust(PCAdj), Modifier(modifier),
AddCurrentAddress(addCurrentAddress) {}
-ARMConstantPoolValue::~ARMConstantPoolValue() {}
+ARMConstantPoolValue::~ARMConstantPoolValue() = default;
StringRef ARMConstantPoolValue::getModifierText() const {
switch (Modifier) {
@@ -94,9 +98,11 @@ ARMConstantPoolValue::hasSameValue(ARMConstantPoolValue *ACPV) {
return false;
}
+#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
LLVM_DUMP_METHOD void ARMConstantPoolValue::dump() const {
errs() << " " << *this;
}
+#endif
void ARMConstantPoolValue::print(raw_ostream &O) const {
if (Modifier) O << "(" << getModifierText() << ")";
diff --git a/lib/Target/ARM/ARMConstantPoolValue.h b/lib/Target/ARM/ARMConstantPoolValue.h
index 5f61832aa740..61c521581f79 100644
--- a/lib/Target/ARM/ARMConstantPoolValue.h
+++ b/lib/Target/ARM/ARMConstantPoolValue.h
@@ -14,10 +14,11 @@
#ifndef LLVM_LIB_TARGET_ARM_ARMCONSTANTPOOLVALUE_H
#define LLVM_LIB_TARGET_ARM_ARMCONSTANTPOOLVALUE_H
+#include "llvm/ADT/StringRef.h"
#include "llvm/CodeGen/MachineConstantPool.h"
#include "llvm/Support/Casting.h"
-#include "llvm/Support/ErrorHandling.h"
-#include <cstddef>
+#include <string>
+#include <vector>
namespace llvm {
@@ -29,6 +30,7 @@ class LLVMContext;
class MachineBasicBlock;
namespace ARMCP {
+
enum ARMCPKind {
CPValue,
CPExtSymbol,
@@ -47,7 +49,8 @@ namespace ARMCP {
SECREL, /// Section Relative (Windows TLS)
SBREL, /// Static Base Relative (RWPI)
};
-}
+
+} // end namespace ARMCP
/// ARMConstantPoolValue - ARM specific constantpool value. This is used to
/// represent PC-relative displacement between the address of the load
@@ -169,9 +172,11 @@ public:
const GlobalValue *getGV() const;
const BlockAddress *getBlockAddress() const;
+
const GlobalVariable *getPromotedGlobal() const {
return dyn_cast_or_null<GlobalVariable>(GVar);
}
+
const Constant *getPromotedGlobalInit() const {
return CVal;
}
@@ -186,6 +191,7 @@ public:
void addSelectionDAGCSEId(FoldingSetNodeID &ID) override;
void print(raw_ostream &O) const override;
+
static bool classof(const ARMConstantPoolValue *APV) {
return APV->isGlobalValue() || APV->isBlockAddress() || APV->isLSDA() ||
APV->isPromotedGlobal();
@@ -267,6 +273,6 @@ public:
}
};
-} // End llvm namespace
+} // end namespace llvm
-#endif
+#endif // LLVM_LIB_TARGET_ARM_ARMCONSTANTPOOLVALUE_H
diff --git a/lib/Target/ARM/ARMExpandPseudoInsts.cpp b/lib/Target/ARM/ARMExpandPseudoInsts.cpp
index baa4e0330cf4..e0aecff2633b 100644
--- a/lib/Target/ARM/ARMExpandPseudoInsts.cpp
+++ b/lib/Target/ARM/ARMExpandPseudoInsts.cpp
@@ -19,6 +19,7 @@
#include "ARMBaseRegisterInfo.h"
#include "ARMConstantPoolValue.h"
#include "ARMMachineFunctionInfo.h"
+#include "ARMSubtarget.h"
#include "MCTargetDesc/ARMAddressingModes.h"
#include "llvm/CodeGen/LivePhysRegs.h"
#include "llvm/CodeGen/MachineFrameInfo.h"
@@ -30,6 +31,7 @@
#include "llvm/Support/raw_ostream.h" // FIXME: for debug only. remove!
#include "llvm/Target/TargetFrameLowering.h"
#include "llvm/Target/TargetRegisterInfo.h"
+
using namespace llvm;
#define DEBUG_TYPE "arm-pseudo"
@@ -97,9 +99,9 @@ void ARMExpandPseudo::TransferImpOps(MachineInstr &OldMI,
const MachineOperand &MO = OldMI.getOperand(i);
assert(MO.isReg() && MO.getReg());
if (MO.isUse())
- UseMI.addOperand(MO);
+ UseMI.add(MO);
else
- DefMI.addOperand(MO);
+ DefMI.add(MO);
}
}
@@ -415,14 +417,14 @@ void ARMExpandPseudo::ExpandVLD(MachineBasicBlock::iterator &MBBI) {
MIB.addReg(D3, RegState::Define | getDeadRegState(DstIsDead));
if (TableEntry->isUpdating)
- MIB.addOperand(MI.getOperand(OpIdx++));
+ MIB.add(MI.getOperand(OpIdx++));
// Copy the addrmode6 operands.
- MIB.addOperand(MI.getOperand(OpIdx++));
- MIB.addOperand(MI.getOperand(OpIdx++));
+ MIB.add(MI.getOperand(OpIdx++));
+ MIB.add(MI.getOperand(OpIdx++));
// Copy the am6offset operand.
if (TableEntry->hasWritebackOperand)
- MIB.addOperand(MI.getOperand(OpIdx++));
+ MIB.add(MI.getOperand(OpIdx++));
// For an instruction writing double-spaced subregs, the pseudo instruction
// has an extra operand that is a use of the super-register. Record the
@@ -432,15 +434,15 @@ void ARMExpandPseudo::ExpandVLD(MachineBasicBlock::iterator &MBBI) {
SrcOpIdx = OpIdx++;
// Copy the predicate operands.
- MIB.addOperand(MI.getOperand(OpIdx++));
- MIB.addOperand(MI.getOperand(OpIdx++));
+ MIB.add(MI.getOperand(OpIdx++));
+ MIB.add(MI.getOperand(OpIdx++));
// Copy the super-register source operand used for double-spaced subregs over
// to the new instruction as an implicit operand.
if (SrcOpIdx != 0) {
MachineOperand MO = MI.getOperand(SrcOpIdx);
MO.setImplicit(true);
- MIB.addOperand(MO);
+ MIB.add(MO);
}
// Add an implicit def for the super-register.
MIB.addReg(DstReg, RegState::ImplicitDefine | getDeadRegState(DstIsDead));
@@ -467,14 +469,14 @@ void ARMExpandPseudo::ExpandVST(MachineBasicBlock::iterator &MBBI) {
TII->get(TableEntry->RealOpc));
unsigned OpIdx = 0;
if (TableEntry->isUpdating)
- MIB.addOperand(MI.getOperand(OpIdx++));
+ MIB.add(MI.getOperand(OpIdx++));
// Copy the addrmode6 operands.
- MIB.addOperand(MI.getOperand(OpIdx++));
- MIB.addOperand(MI.getOperand(OpIdx++));
+ MIB.add(MI.getOperand(OpIdx++));
+ MIB.add(MI.getOperand(OpIdx++));
// Copy the am6offset operand.
if (TableEntry->hasWritebackOperand)
- MIB.addOperand(MI.getOperand(OpIdx++));
+ MIB.add(MI.getOperand(OpIdx++));
bool SrcIsKill = MI.getOperand(OpIdx).isKill();
bool SrcIsUndef = MI.getOperand(OpIdx).isUndef();
@@ -490,8 +492,8 @@ void ARMExpandPseudo::ExpandVST(MachineBasicBlock::iterator &MBBI) {
MIB.addReg(D3, getUndefRegState(SrcIsUndef));
// Copy the predicate operands.
- MIB.addOperand(MI.getOperand(OpIdx++));
- MIB.addOperand(MI.getOperand(OpIdx++));
+ MIB.add(MI.getOperand(OpIdx++));
+ MIB.add(MI.getOperand(OpIdx++));
if (SrcIsKill && !SrcIsUndef) // Add an implicit kill for the super-reg.
MIB->addRegisterKilled(SrcReg, TRI, true);
@@ -549,14 +551,14 @@ void ARMExpandPseudo::ExpandLaneOp(MachineBasicBlock::iterator &MBBI) {
}
if (TableEntry->isUpdating)
- MIB.addOperand(MI.getOperand(OpIdx++));
+ MIB.add(MI.getOperand(OpIdx++));
// Copy the addrmode6 operands.
- MIB.addOperand(MI.getOperand(OpIdx++));
- MIB.addOperand(MI.getOperand(OpIdx++));
+ MIB.add(MI.getOperand(OpIdx++));
+ MIB.add(MI.getOperand(OpIdx++));
// Copy the am6offset operand.
if (TableEntry->hasWritebackOperand)
- MIB.addOperand(MI.getOperand(OpIdx++));
+ MIB.add(MI.getOperand(OpIdx++));
// Grab the super-register source.
MachineOperand MO = MI.getOperand(OpIdx++);
@@ -579,12 +581,12 @@ void ARMExpandPseudo::ExpandLaneOp(MachineBasicBlock::iterator &MBBI) {
OpIdx += 1;
// Copy the predicate operands.
- MIB.addOperand(MI.getOperand(OpIdx++));
- MIB.addOperand(MI.getOperand(OpIdx++));
+ MIB.add(MI.getOperand(OpIdx++));
+ MIB.add(MI.getOperand(OpIdx++));
// Copy the super-register source to be an implicit source.
MO.setImplicit(true);
- MIB.addOperand(MO);
+ MIB.add(MO);
if (TableEntry->IsLoad)
// Add an implicit def for the super-register.
MIB.addReg(DstReg, RegState::ImplicitDefine | getDeadRegState(DstIsDead));
@@ -605,9 +607,9 @@ void ARMExpandPseudo::ExpandVTBL(MachineBasicBlock::iterator &MBBI,
unsigned OpIdx = 0;
// Transfer the destination register operand.
- MIB.addOperand(MI.getOperand(OpIdx++));
+ MIB.add(MI.getOperand(OpIdx++));
if (IsExt)
- MIB.addOperand(MI.getOperand(OpIdx++));
+ MIB.add(MI.getOperand(OpIdx++));
bool SrcIsKill = MI.getOperand(OpIdx).isKill();
unsigned SrcReg = MI.getOperand(OpIdx++).getReg();
@@ -616,11 +618,11 @@ void ARMExpandPseudo::ExpandVTBL(MachineBasicBlock::iterator &MBBI,
MIB.addReg(D0);
// Copy the other source register operand.
- MIB.addOperand(MI.getOperand(OpIdx++));
+ MIB.add(MI.getOperand(OpIdx++));
// Copy the predicate operands.
- MIB.addOperand(MI.getOperand(OpIdx++));
- MIB.addOperand(MI.getOperand(OpIdx++));
+ MIB.add(MI.getOperand(OpIdx++));
+ MIB.add(MI.getOperand(OpIdx++));
// Add an implicit kill and use for the super-reg.
MIB.addReg(SrcReg, RegState::Implicit | getKillRegState(SrcIsKill));
@@ -659,6 +661,7 @@ static bool IsAnAddressOperand(const MachineOperand &MO) {
return false;
case MachineOperand::MO_IntrinsicID:
case MachineOperand::MO_Predicate:
+ case MachineOperand::MO_Placeholder:
llvm_unreachable("should not exist post-isel");
}
llvm_unreachable("unhandled machine operand type");
@@ -696,8 +699,8 @@ void ARMExpandPseudo::ExpandMOV32BitImm(MachineBasicBlock &MBB,
HI16 = HI16.addImm(SOImmValV2);
LO16->setMemRefs(MI.memoperands_begin(), MI.memoperands_end());
HI16->setMemRefs(MI.memoperands_begin(), MI.memoperands_end());
- LO16.addImm(Pred).addReg(PredReg).addReg(0);
- HI16.addImm(Pred).addReg(PredReg).addReg(0);
+ LO16.addImm(Pred).addReg(PredReg).add(condCodeOp());
+ HI16.addImm(Pred).addReg(PredReg).add(condCodeOp());
TransferImpOps(MI, LO16, HI16);
MI.eraseFromParent();
return;
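
The predOps() and condCodeOp() helpers used above package the operand pairs that the old AddDefaultPred/AddDefaultCC wrappers appended. A sketch of the correspondence, assuming the helpers declared in ARMBaseInstrInfo.h at this revision:

    // AddDefaultPred(MIB)               ==> MIB.add(predOps(ARMCC::AL))
    // .addImm(ARMCC::AL).addReg(0)      ==> .add(predOps(ARMCC::AL))
    // AddDefaultCC(MIB), trailing .addReg(0) for the 's' bit
    //                                   ==> .add(condCodeOp())
    BuildMI(MBB, MBBI, DL, TII->get(ARM::MOVr), DestReg)
        .addReg(SrcReg)
        .add(predOps(ARMCC::AL)) // always-execute predicate, no pred reg
        .add(condCodeOp());      // optional CPSR def left empty ('s' clear)
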
@@ -797,7 +800,7 @@ bool ARMExpandPseudo::ExpandCMP_SWAP(MachineBasicBlock &MBB,
.addReg(Desired.getReg(), RegState::Kill);
if (!IsThumb)
MIB.addImm(0);
- AddDefaultPred(MIB);
+ MIB.add(predOps(ARMCC::AL));
}
// .Lloadcmp:
@@ -814,12 +817,13 @@ bool ARMExpandPseudo::ExpandCMP_SWAP(MachineBasicBlock &MBB,
MIB.addReg(Addr.getReg());
if (LdrexOp == ARM::t2LDREX)
MIB.addImm(0); // a 32-bit Thumb ldrex (only) allows an offset.
- AddDefaultPred(MIB);
+ MIB.add(predOps(ARMCC::AL));
unsigned CMPrr = IsThumb ? ARM::tCMPhir : ARM::CMPrr;
- AddDefaultPred(BuildMI(LoadCmpBB, DL, TII->get(CMPrr))
- .addReg(Dest.getReg(), getKillRegState(Dest.isDead()))
- .addOperand(Desired));
+ BuildMI(LoadCmpBB, DL, TII->get(CMPrr))
+ .addReg(Dest.getReg(), getKillRegState(Dest.isDead()))
+ .add(Desired)
+ .add(predOps(ARMCC::AL));
unsigned Bcc = IsThumb ? ARM::tBcc : ARM::Bcc;
BuildMI(LoadCmpBB, DL, TII->get(Bcc))
.addMBB(DoneBB)
@@ -838,16 +842,17 @@ bool ARMExpandPseudo::ExpandCMP_SWAP(MachineBasicBlock &MBB,
MIB = BuildMI(StoreBB, DL, TII->get(StrexOp), StatusReg);
- MIB.addOperand(New);
- MIB.addOperand(Addr);
+ MIB.add(New);
+ MIB.add(Addr);
if (StrexOp == ARM::t2STREX)
MIB.addImm(0); // a 32-bit Thumb strex (only) allows an offset.
- AddDefaultPred(MIB);
+ MIB.add(predOps(ARMCC::AL));
unsigned CMPri = IsThumb ? ARM::t2CMPri : ARM::CMPri;
- AddDefaultPred(BuildMI(StoreBB, DL, TII->get(CMPri))
- .addReg(StatusReg, RegState::Kill)
- .addImm(0));
+ BuildMI(StoreBB, DL, TII->get(CMPri))
+ .addReg(StatusReg, RegState::Kill)
+ .addImm(0)
+ .add(predOps(ARMCC::AL));
BuildMI(StoreBB, DL, TII->get(Bcc))
.addMBB(LoadCmpBB)
.addImm(ARMCC::NE)
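
For orientation, the blocks ExpandCMP_SWAP assembles above implement the usual load-exclusive/store-exclusive retry loop. Roughly, in portable terms (an illustrative sketch only, not the emitted code):

    #include <atomic>
    #include <cstdint>
    // .Lloadcmp: ldrex Dest, [Addr]; cmp Dest, Desired; bne .Ldone
    // .Lstore:   strex Status, New, [Addr]; cmp Status, #0; bne .Lloadcmp
    bool cmpSwap(std::atomic<uint32_t> &Addr, uint32_t &Expected,
                 uint32_t New) {
      return Addr.compare_exchange_strong(Expected, New);
    }
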
@@ -927,13 +932,13 @@ bool ARMExpandPseudo::ExpandCMP_SWAP_64(MachineBasicBlock &MBB,
MachineInstrBuilder MIB;
MIB = BuildMI(LoadCmpBB, DL, TII->get(LDREXD));
addExclusiveRegPair(MIB, Dest, RegState::Define, IsThumb, TRI);
- MIB.addReg(Addr.getReg());
- AddDefaultPred(MIB);
+ MIB.addReg(Addr.getReg()).add(predOps(ARMCC::AL));
unsigned CMPrr = IsThumb ? ARM::tCMPhir : ARM::CMPrr;
- AddDefaultPred(BuildMI(LoadCmpBB, DL, TII->get(CMPrr))
- .addReg(DestLo, getKillRegState(Dest.isDead()))
- .addReg(DesiredLo, getKillRegState(Desired.isDead())));
+ BuildMI(LoadCmpBB, DL, TII->get(CMPrr))
+ .addReg(DestLo, getKillRegState(Dest.isDead()))
+ .addReg(DesiredLo, getKillRegState(Desired.isDead()))
+ .add(predOps(ARMCC::AL));
BuildMI(LoadCmpBB, DL, TII->get(CMPrr))
.addReg(DestHi, getKillRegState(Dest.isDead()))
@@ -959,13 +964,13 @@ bool ARMExpandPseudo::ExpandCMP_SWAP_64(MachineBasicBlock &MBB,
unsigned STREXD = IsThumb ? ARM::t2STREXD : ARM::STREXD;
MIB = BuildMI(StoreBB, DL, TII->get(STREXD), StatusReg);
addExclusiveRegPair(MIB, New, 0, IsThumb, TRI);
- MIB.addOperand(Addr);
- AddDefaultPred(MIB);
+ MIB.add(Addr).add(predOps(ARMCC::AL));
unsigned CMPri = IsThumb ? ARM::t2CMPri : ARM::CMPri;
- AddDefaultPred(BuildMI(StoreBB, DL, TII->get(CMPri))
- .addReg(StatusReg, RegState::Kill)
- .addImm(0));
+ BuildMI(StoreBB, DL, TII->get(CMPri))
+ .addReg(StatusReg, RegState::Kill)
+ .addImm(0)
+ .add(predOps(ARMCC::AL));
BuildMI(StoreBB, DL, TII->get(Bcc))
.addMBB(LoadCmpBB)
.addImm(ARMCC::NE)
@@ -1026,7 +1031,7 @@ bool ARMExpandPseudo::ExpandMI(MachineBasicBlock &MBB,
// Add the default predicate in Thumb mode.
if (STI->isThumb())
- MIB.addImm(ARMCC::AL).addReg(0);
+ MIB.add(predOps(ARMCC::AL));
} else if (RetOpcode == ARM::TCRETURNri) {
BuildMI(MBB, MBBI, dl,
TII.get(STI->isThumb() ? ARM::tTAILJMPr : ARM::TAILJMPr))
@@ -1047,9 +1052,9 @@ bool ARMExpandPseudo::ExpandMI(MachineBasicBlock &MBB,
unsigned newOpc = Opcode == ARM::VMOVScc ? ARM::VMOVS : ARM::VMOVD;
BuildMI(MBB, MBBI, MI.getDebugLoc(), TII->get(newOpc),
MI.getOperand(1).getReg())
- .addOperand(MI.getOperand(2))
- .addImm(MI.getOperand(3).getImm()) // 'pred'
- .addOperand(MI.getOperand(4));
+ .add(MI.getOperand(2))
+ .addImm(MI.getOperand(3).getImm()) // 'pred'
+ .add(MI.getOperand(4));
MI.eraseFromParent();
return true;
@@ -1059,10 +1064,10 @@ bool ARMExpandPseudo::ExpandMI(MachineBasicBlock &MBB,
unsigned Opc = AFI->isThumbFunction() ? ARM::t2MOVr : ARM::MOVr;
BuildMI(MBB, MBBI, MI.getDebugLoc(), TII->get(Opc),
MI.getOperand(1).getReg())
- .addOperand(MI.getOperand(2))
- .addImm(MI.getOperand(3).getImm()) // 'pred'
- .addOperand(MI.getOperand(4))
- .addReg(0); // 's' bit
+ .add(MI.getOperand(2))
+ .addImm(MI.getOperand(3).getImm()) // 'pred'
+ .add(MI.getOperand(4))
+ .add(condCodeOp()); // 's' bit
MI.eraseFromParent();
return true;
@@ -1070,11 +1075,11 @@ bool ARMExpandPseudo::ExpandMI(MachineBasicBlock &MBB,
case ARM::MOVCCsi: {
BuildMI(MBB, MBBI, MI.getDebugLoc(), TII->get(ARM::MOVsi),
(MI.getOperand(1).getReg()))
- .addOperand(MI.getOperand(2))
- .addImm(MI.getOperand(3).getImm())
- .addImm(MI.getOperand(4).getImm()) // 'pred'
- .addOperand(MI.getOperand(5))
- .addReg(0); // 's' bit
+ .add(MI.getOperand(2))
+ .addImm(MI.getOperand(3).getImm())
+ .addImm(MI.getOperand(4).getImm()) // 'pred'
+ .add(MI.getOperand(5))
+ .add(condCodeOp()); // 's' bit
MI.eraseFromParent();
return true;
@@ -1082,12 +1087,12 @@ bool ARMExpandPseudo::ExpandMI(MachineBasicBlock &MBB,
case ARM::MOVCCsr: {
BuildMI(MBB, MBBI, MI.getDebugLoc(), TII->get(ARM::MOVsr),
(MI.getOperand(1).getReg()))
- .addOperand(MI.getOperand(2))
- .addOperand(MI.getOperand(3))
- .addImm(MI.getOperand(4).getImm())
- .addImm(MI.getOperand(5).getImm()) // 'pred'
- .addOperand(MI.getOperand(6))
- .addReg(0); // 's' bit
+ .add(MI.getOperand(2))
+ .add(MI.getOperand(3))
+ .addImm(MI.getOperand(4).getImm())
+ .addImm(MI.getOperand(5).getImm()) // 'pred'
+ .add(MI.getOperand(6))
+ .add(condCodeOp()); // 's' bit
MI.eraseFromParent();
return true;
@@ -1097,9 +1102,9 @@ bool ARMExpandPseudo::ExpandMI(MachineBasicBlock &MBB,
unsigned NewOpc = AFI->isThumbFunction() ? ARM::t2MOVi16 : ARM::MOVi16;
BuildMI(MBB, MBBI, MI.getDebugLoc(), TII->get(NewOpc),
MI.getOperand(1).getReg())
- .addImm(MI.getOperand(2).getImm())
- .addImm(MI.getOperand(3).getImm()) // 'pred'
- .addOperand(MI.getOperand(4));
+ .addImm(MI.getOperand(2).getImm())
+ .addImm(MI.getOperand(3).getImm()) // 'pred'
+ .add(MI.getOperand(4));
MI.eraseFromParent();
return true;
}
@@ -1108,10 +1113,10 @@ bool ARMExpandPseudo::ExpandMI(MachineBasicBlock &MBB,
unsigned Opc = AFI->isThumbFunction() ? ARM::t2MOVi : ARM::MOVi;
BuildMI(MBB, MBBI, MI.getDebugLoc(), TII->get(Opc),
MI.getOperand(1).getReg())
- .addImm(MI.getOperand(2).getImm())
- .addImm(MI.getOperand(3).getImm()) // 'pred'
- .addOperand(MI.getOperand(4))
- .addReg(0); // 's' bit
+ .addImm(MI.getOperand(2).getImm())
+ .addImm(MI.getOperand(3).getImm()) // 'pred'
+ .add(MI.getOperand(4))
+ .add(condCodeOp()); // 's' bit
MI.eraseFromParent();
return true;
@@ -1121,10 +1126,10 @@ bool ARMExpandPseudo::ExpandMI(MachineBasicBlock &MBB,
unsigned Opc = AFI->isThumbFunction() ? ARM::t2MVNi : ARM::MVNi;
BuildMI(MBB, MBBI, MI.getDebugLoc(), TII->get(Opc),
MI.getOperand(1).getReg())
- .addImm(MI.getOperand(2).getImm())
- .addImm(MI.getOperand(3).getImm()) // 'pred'
- .addOperand(MI.getOperand(4))
- .addReg(0); // 's' bit
+ .addImm(MI.getOperand(2).getImm())
+ .addImm(MI.getOperand(3).getImm()) // 'pred'
+ .add(MI.getOperand(4))
+ .add(condCodeOp()); // 's' bit
MI.eraseFromParent();
return true;
@@ -1143,11 +1148,11 @@ bool ARMExpandPseudo::ExpandMI(MachineBasicBlock &MBB,
}
BuildMI(MBB, MBBI, MI.getDebugLoc(), TII->get(NewOpc),
MI.getOperand(1).getReg())
- .addOperand(MI.getOperand(2))
- .addImm(MI.getOperand(3).getImm())
- .addImm(MI.getOperand(4).getImm()) // 'pred'
- .addOperand(MI.getOperand(5))
- .addReg(0); // 's' bit
+ .add(MI.getOperand(2))
+ .addImm(MI.getOperand(3).getImm())
+ .addImm(MI.getOperand(4).getImm()) // 'pred'
+ .add(MI.getOperand(5))
+ .add(condCodeOp()); // 's' bit
MI.eraseFromParent();
return true;
}
@@ -1187,10 +1192,11 @@ bool ARMExpandPseudo::ExpandMI(MachineBasicBlock &MBB,
"bits set.");
unsigned bicOpc = AFI->isThumbFunction() ?
ARM::t2BICri : ARM::BICri;
- AddDefaultCC(AddDefaultPred(BuildMI(MBB, MBBI, MI.getDebugLoc(),
- TII->get(bicOpc), ARM::R6)
- .addReg(ARM::R6, RegState::Kill)
- .addImm(MaxAlign-1)));
+ BuildMI(MBB, MBBI, MI.getDebugLoc(), TII->get(bicOpc), ARM::R6)
+ .addReg(ARM::R6, RegState::Kill)
+ .addImm(MaxAlign - 1)
+ .add(predOps(ARMCC::AL))
+ .add(condCodeOp());
}
}
@@ -1201,24 +1207,25 @@ bool ARMExpandPseudo::ExpandMI(MachineBasicBlock &MBB,
case ARM::MOVsrl_flag:
case ARM::MOVsra_flag: {
// These are just fancy MOVs instructions.
- AddDefaultPred(BuildMI(MBB, MBBI, MI.getDebugLoc(), TII->get(ARM::MOVsi),
- MI.getOperand(0).getReg())
- .addOperand(MI.getOperand(1))
- .addImm(ARM_AM::getSORegOpc((Opcode == ARM::MOVsrl_flag ?
- ARM_AM::lsr : ARM_AM::asr),
- 1)))
- .addReg(ARM::CPSR, RegState::Define);
+ BuildMI(MBB, MBBI, MI.getDebugLoc(), TII->get(ARM::MOVsi),
+ MI.getOperand(0).getReg())
+ .add(MI.getOperand(1))
+ .addImm(ARM_AM::getSORegOpc(
+ (Opcode == ARM::MOVsrl_flag ? ARM_AM::lsr : ARM_AM::asr), 1))
+ .add(predOps(ARMCC::AL))
+ .addReg(ARM::CPSR, RegState::Define);
MI.eraseFromParent();
return true;
}
case ARM::RRX: {
// This encodes as "MOVs Rd, Rm, rrx".
MachineInstrBuilder MIB =
- AddDefaultPred(BuildMI(MBB, MBBI, MI.getDebugLoc(),TII->get(ARM::MOVsi),
- MI.getOperand(0).getReg())
- .addOperand(MI.getOperand(1))
- .addImm(ARM_AM::getSORegOpc(ARM_AM::rrx, 0)))
- .addReg(0);
+ BuildMI(MBB, MBBI, MI.getDebugLoc(), TII->get(ARM::MOVsi),
+ MI.getOperand(0).getReg())
+ .add(MI.getOperand(1))
+ .addImm(ARM_AM::getSORegOpc(ARM_AM::rrx, 0))
+ .add(predOps(ARMCC::AL))
+ .add(condCodeOp());
TransferImpOps(MI, MIB, MIB);
MI.eraseFromParent();
return true;
@@ -1241,18 +1248,18 @@ bool ARMExpandPseudo::ExpandMI(MachineBasicBlock &MBB,
.addConstantPoolIndex(MCP->getConstantPoolIndex(CPV, 4));
if (!Thumb)
MIB.addImm(0);
- MIB.addImm(static_cast<unsigned>(ARMCC::AL)).addReg(0);
+ MIB.add(predOps(ARMCC::AL));
MIB = BuildMI(MBB, MBBI, MI.getDebugLoc(),
TII->get(Thumb ? ARM::tBLXr : ARM::BLX));
if (Thumb)
- MIB.addImm(static_cast<unsigned>(ARMCC::AL)).addReg(0);
+ MIB.add(predOps(ARMCC::AL));
MIB.addReg(Reg, RegState::Kill);
} else {
MIB = BuildMI(MBB, MBBI, MI.getDebugLoc(),
TII->get(Thumb ? ARM::tBL : ARM::BL));
if (Thumb)
- MIB.addImm(static_cast<unsigned>(ARMCC::AL)).addReg(0);
+ MIB.add(predOps(ARMCC::AL));
MIB.addExternalSymbol("__aeabi_read_tp", 0);
}
@@ -1268,15 +1275,15 @@ bool ARMExpandPseudo::ExpandMI(MachineBasicBlock &MBB,
unsigned DstReg = MI.getOperand(0).getReg();
bool DstIsDead = MI.getOperand(0).isDead();
MachineInstrBuilder MIB1 =
- AddDefaultPred(BuildMI(MBB, MBBI, MI.getDebugLoc(),
- TII->get(NewLdOpc), DstReg)
- .addOperand(MI.getOperand(1)));
+ BuildMI(MBB, MBBI, MI.getDebugLoc(), TII->get(NewLdOpc), DstReg)
+ .add(MI.getOperand(1))
+ .add(predOps(ARMCC::AL));
MIB1->setMemRefs(MI.memoperands_begin(), MI.memoperands_end());
- MachineInstrBuilder MIB2 = BuildMI(MBB, MBBI, MI.getDebugLoc(),
- TII->get(ARM::tPICADD))
- .addReg(DstReg, RegState::Define | getDeadRegState(DstIsDead))
- .addReg(DstReg)
- .addOperand(MI.getOperand(2));
+ MachineInstrBuilder MIB2 =
+ BuildMI(MBB, MBBI, MI.getDebugLoc(), TII->get(ARM::tPICADD))
+ .addReg(DstReg, RegState::Define | getDeadRegState(DstIsDead))
+ .addReg(DstReg)
+ .add(MI.getOperand(2));
TransferImpOps(MI, MIB1, MIB2);
MI.eraseFromParent();
return true;
@@ -1319,7 +1326,7 @@ bool ARMExpandPseudo::ExpandMI(MachineBasicBlock &MBB,
.addConstantPoolIndex(MCP->getConstantPoolIndex(CPV, 4));
if (IsARM)
MIB.addImm(0);
- AddDefaultPred(MIB);
+ MIB.add(predOps(ARMCC::AL));
if (IsPIC) {
MachineInstrBuilder MIB =
@@ -1329,7 +1336,7 @@ bool ARMExpandPseudo::ExpandMI(MachineBasicBlock &MBB,
.addImm(ARMPCLabelIndex);
if (IsARM)
- AddDefaultPred(MIB);
+ MIB.add(predOps(ARMCC::AL));
}
MI.eraseFromParent();
@@ -1368,7 +1375,7 @@ bool ARMExpandPseudo::ExpandMI(MachineBasicBlock &MBB,
.addReg(DstReg, RegState::Define | getDeadRegState(DstIsDead))
.addReg(DstReg).addImm(LabelId);
if (isARM) {
- AddDefaultPred(MIB3);
+ MIB3.add(predOps(ARMCC::AL));
if (Opcode == ARM::MOV_ga_pcrel_ldr)
MIB3->setMemRefs(MI.memoperands_begin(), MI.memoperands_end());
}
@@ -1388,9 +1395,9 @@ bool ARMExpandPseudo::ExpandMI(MachineBasicBlock &MBB,
MachineInstrBuilder MIB =
BuildMI(MBB, MBBI, MI.getDebugLoc(), TII->get(ARM::SUBri), ARM::PC)
.addReg(ARM::LR)
- .addOperand(MI.getOperand(0))
- .addOperand(MI.getOperand(1))
- .addOperand(MI.getOperand(2))
+ .add(MI.getOperand(0))
+ .add(MI.getOperand(1))
+ .add(MI.getOperand(2))
.addReg(ARM::CPSR, RegState::Undef);
TransferImpOps(MI, MIB, MIB);
MI.eraseFromParent();
@@ -1407,11 +1414,11 @@ bool ARMExpandPseudo::ExpandMI(MachineBasicBlock &MBB,
unsigned DstReg = MI.getOperand(OpIdx++).getReg();
// Copy the source register.
- MIB.addOperand(MI.getOperand(OpIdx++));
+ MIB.add(MI.getOperand(OpIdx++));
// Copy the predicate operands.
- MIB.addOperand(MI.getOperand(OpIdx++));
- MIB.addOperand(MI.getOperand(OpIdx++));
+ MIB.add(MI.getOperand(OpIdx++));
+ MIB.add(MI.getOperand(OpIdx++));
// Add the destination operands (D subregs).
unsigned D0 = TRI->getSubReg(DstReg, ARM::dsub_0);
@@ -1438,11 +1445,11 @@ bool ARMExpandPseudo::ExpandMI(MachineBasicBlock &MBB,
unsigned SrcReg = MI.getOperand(OpIdx++).getReg();
// Copy the destination register.
- MIB.addOperand(MI.getOperand(OpIdx++));
+ MIB.add(MI.getOperand(OpIdx++));
// Copy the predicate operands.
- MIB.addOperand(MI.getOperand(OpIdx++));
- MIB.addOperand(MI.getOperand(OpIdx++));
+ MIB.add(MI.getOperand(OpIdx++));
+ MIB.add(MI.getOperand(OpIdx++));
// Add the source operands (D subregs).
unsigned D0 = TRI->getSubReg(SrcReg, ARM::dsub_0);
diff --git a/lib/Target/ARM/ARMFastISel.cpp b/lib/Target/ARM/ARMFastISel.cpp
index df4dcb375750..01e062bd185c 100644
--- a/lib/Target/ARM/ARMFastISel.cpp
+++ b/lib/Target/ARM/ARMFastISel.cpp
@@ -14,6 +14,7 @@
//===----------------------------------------------------------------------===//
#include "ARM.h"
+#include "ARMBaseInstrInfo.h"
#include "ARMBaseRegisterInfo.h"
#include "ARMCallingConv.h"
#include "ARMConstantPoolValue.h"
@@ -21,30 +22,61 @@
#include "ARMMachineFunctionInfo.h"
#include "ARMSubtarget.h"
#include "MCTargetDesc/ARMAddressingModes.h"
+#include "MCTargetDesc/ARMBaseInfo.h"
+#include "llvm/ADT/APFloat.h"
+#include "llvm/ADT/APInt.h"
+#include "llvm/ADT/DenseMap.h"
+#include "llvm/ADT/SmallVector.h"
#include "llvm/ADT/STLExtras.h"
+#include "llvm/CodeGen/CallingConvLower.h"
#include "llvm/CodeGen/FastISel.h"
#include "llvm/CodeGen/FunctionLoweringInfo.h"
+#include "llvm/CodeGen/ISDOpcodes.h"
#include "llvm/CodeGen/MachineConstantPool.h"
#include "llvm/CodeGen/MachineFrameInfo.h"
+#include "llvm/CodeGen/MachineInstr.h"
#include "llvm/CodeGen/MachineInstrBuilder.h"
#include "llvm/CodeGen/MachineMemOperand.h"
-#include "llvm/CodeGen/MachineModuleInfo.h"
+#include "llvm/CodeGen/MachineOperand.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/CodeGen/MachineValueType.h"
+#include "llvm/CodeGen/RuntimeLibcalls.h"
+#include "llvm/CodeGen/ValueTypes.h"
+#include "llvm/IR/Argument.h"
+#include "llvm/IR/Attributes.h"
#include "llvm/IR/CallSite.h"
#include "llvm/IR/CallingConv.h"
+#include "llvm/IR/Constant.h"
+#include "llvm/IR/Constants.h"
#include "llvm/IR/DataLayout.h"
#include "llvm/IR/DerivedTypes.h"
+#include "llvm/IR/Function.h"
#include "llvm/IR/GetElementPtrTypeIterator.h"
+#include "llvm/IR/GlobalValue.h"
#include "llvm/IR/GlobalVariable.h"
+#include "llvm/IR/InstrTypes.h"
+#include "llvm/IR/Instruction.h"
#include "llvm/IR/Instructions.h"
#include "llvm/IR/IntrinsicInst.h"
#include "llvm/IR/Module.h"
#include "llvm/IR/Operator.h"
+#include "llvm/IR/Type.h"
+#include "llvm/IR/User.h"
+#include "llvm/IR/Value.h"
+#include "llvm/MC/MCInstrDesc.h"
+#include "llvm/MC/MCRegisterInfo.h"
+#include "llvm/Support/Casting.h"
+#include "llvm/Support/Compiler.h"
#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/MathExtras.h"
#include "llvm/Target/TargetInstrInfo.h"
#include "llvm/Target/TargetLowering.h"
#include "llvm/Target/TargetMachine.h"
#include "llvm/Target/TargetOptions.h"
+#include <cassert>
+#include <cstdint>
+#include <utility>
+
using namespace llvm;
namespace {
@@ -54,24 +86,22 @@ namespace {
enum {
RegBase,
FrameIndexBase
- } BaseType;
+ } BaseType = RegBase;
union {
unsigned Reg;
int FI;
} Base;
- int Offset;
+ int Offset = 0;
// Innocuous defaults for our address.
- Address()
- : BaseType(RegBase), Offset(0) {
- Base.Reg = 0;
- }
+ Address() {
+ Base.Reg = 0;
+ }
} Address;
class ARMFastISel final : public FastISel {
-
/// Subtarget - Keep a pointer to the ARMSubtarget around so that we can
/// make the right decision when generating code for different targets.
const ARMSubtarget *Subtarget;
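
The Address struct in the hunk above now relies on C++11 default member initializers instead of a hand-written constructor init list. A minimal standalone illustration (names hypothetical):

    struct Addr {
      enum { RegBase, FrameIndexBase } BaseType = RegBase; // was ctor-set
      union {
        unsigned Reg;
        int FI;
      } Base;
      int Offset = 0;
      // The union member keeps its constructor initialization, matching
      // the patch; a default member initializer could cover only one
      // union alternative anyway.
      Addr() { Base.Reg = 0; }
    };
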
@@ -99,8 +129,9 @@ class ARMFastISel final : public FastISel {
Context = &funcInfo.Fn->getContext();
}
- // Code from FastISel.cpp.
private:
+ // Code from FastISel.cpp.
+
unsigned fastEmitInst_r(unsigned MachineInstOpcode,
const TargetRegisterClass *RC,
unsigned Op0, bool Op0IsKill);
@@ -117,18 +148,18 @@ class ARMFastISel final : public FastISel {
uint64_t Imm);
// Backend specific FastISel code.
- private:
+
bool fastSelectInstruction(const Instruction *I) override;
unsigned fastMaterializeConstant(const Constant *C) override;
unsigned fastMaterializeAlloca(const AllocaInst *AI) override;
bool tryToFoldLoadIntoMI(MachineInstr *MI, unsigned OpNo,
const LoadInst *LI) override;
bool fastLowerArguments() override;
- private:
+
#include "ARMGenFastISel.inc"
// Instruction selection routines.
- private:
+
bool SelectLoad(const Instruction *I);
bool SelectStore(const Instruction *I);
bool SelectBranch(const Instruction *I);
@@ -151,12 +182,12 @@ class ARMFastISel final : public FastISel {
bool SelectShift(const Instruction *I, ARM_AM::ShiftOpc ShiftTy);
// Utility routines.
- private:
+
bool isPositionIndependent() const;
bool isTypeLegal(Type *Ty, MVT &VT);
bool isLoadTypeLegal(Type *Ty, MVT &VT);
bool ARMEmitCmp(const Value *Src1Value, const Value *Src2Value,
- bool isZExt);
+ bool isZExt, bool isEquality);
bool ARMEmitLoad(MVT VT, unsigned &ResultReg, Address &Addr,
unsigned Alignment = 0, bool isZExt = true,
bool allocReg = true);
@@ -179,7 +210,7 @@ class ARMFastISel final : public FastISel {
const TargetLowering *getTargetLowering() { return &TLI; }
// Call handling routines.
- private:
+
CCAssignFn *CCAssignFnForCall(CallingConv::ID CC,
bool Return,
bool isVarArg);
@@ -198,7 +229,7 @@ class ARMFastISel final : public FastISel {
bool ARMEmitLibcall(const Instruction *I, RTLIB::Libcall Call);
// OptionalDef handling routines.
- private:
+
bool isARMNEONPred(const MachineInstr *MI);
bool DefinesOptionalPredicate(MachineInstr *MI, bool *CPSR);
const MachineInstrBuilder &AddOptionalDefs(const MachineInstrBuilder &MIB);
@@ -256,17 +287,13 @@ ARMFastISel::AddOptionalDefs(const MachineInstrBuilder &MIB) {
// Are we NEON in ARM mode and have a predicate operand? If so, we know
// we're not predicable but add it anyway.
if (isARMNEONPred(MI))
- AddDefaultPred(MIB);
+ MIB.add(predOps(ARMCC::AL));
// Do we optionally set a predicate? Preds is size > 0 iff the predicate
// defines CPSR. All other OptionalDefines in ARM are the CCR register.
bool CPSR = false;
- if (DefinesOptionalPredicate(MI, &CPSR)) {
- if (CPSR)
- AddDefaultT1CC(MIB);
- else
- AddDefaultCC(MIB);
- }
+ if (DefinesOptionalPredicate(MI, &CPSR))
+ MIB.add(CPSR ? t1CondCodeOp() : condCodeOp());
return MIB;
}
@@ -434,7 +461,6 @@ unsigned ARMFastISel::ARMMaterializeFP(const ConstantFP *CFP, MVT VT) {
}
unsigned ARMFastISel::ARMMaterializeInt(const Constant *C, MVT VT) {
-
if (VT != MVT::i32 && VT != MVT::i16 && VT != MVT::i8 && VT != MVT::i1)
return 0;
@@ -739,7 +765,7 @@ bool ARMFastISel::ARMComputeAddress(const Value *Obj, Address &Addr) {
TmpOffset += SL->getElementOffset(Idx);
} else {
uint64_t S = DL.getTypeAllocSize(GTI.getIndexedType());
- for (;;) {
+ while (true) {
if (const ConstantInt *CI = dyn_cast<ConstantInt>(Op)) {
// Constant-offset addressing.
TmpOffset += CI->getSExtValue() * S;
@@ -971,7 +997,7 @@ bool ARMFastISel::ARMEmitLoad(MVT VT, unsigned &ResultReg, Address &Addr,
// Create the base instruction, then add the operands.
if (allocReg)
ResultReg = createResultReg(RC);
- assert (ResultReg > 255 && "Expected an allocated virtual register.");
+ assert(ResultReg > 255 && "Expected an allocated virtual register.");
MachineInstrBuilder MIB = BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
TII.get(Opc), ResultReg);
AddLoadStoreOperands(VT, Addr, MIB, MachineMemOperand::MOLoad, useAM3);
@@ -1216,7 +1242,6 @@ bool ARMFastISel::SelectBranch(const Instruction *I) {
// behavior.
if (const CmpInst *CI = dyn_cast<CmpInst>(BI->getCondition())) {
if (CI->hasOneUse() && (CI->getParent() == I->getParent())) {
-
// Get the compare predicate.
// Try to take advantage of fallthrough opportunities.
CmpInst::Predicate Predicate = CI->getPredicate();
@@ -1231,7 +1256,8 @@ bool ARMFastISel::SelectBranch(const Instruction *I) {
if (ARMPred == ARMCC::AL) return false;
// Emit the compare.
- if (!ARMEmitCmp(CI->getOperand(0), CI->getOperand(1), CI->isUnsigned()))
+ if (!ARMEmitCmp(CI->getOperand(0), CI->getOperand(1), CI->isUnsigned(),
+ CI->isEquality()))
return false;
unsigned BrOpc = isThumb2 ? ARM::t2Bcc : ARM::Bcc;
@@ -1318,14 +1344,16 @@ bool ARMFastISel::SelectIndirectBr(const Instruction *I) {
}
bool ARMFastISel::ARMEmitCmp(const Value *Src1Value, const Value *Src2Value,
- bool isZExt) {
+ bool isZExt, bool isEquality) {
Type *Ty = Src1Value->getType();
EVT SrcEVT = TLI.getValueType(DL, Ty, true);
if (!SrcEVT.isSimple()) return false;
MVT SrcVT = SrcEVT.getSimpleVT();
- bool isFloat = (Ty->isFloatTy() || Ty->isDoubleTy());
- if (isFloat && !Subtarget->hasVFP2())
+ if (Ty->isFloatTy() && !Subtarget->hasVFP2())
+ return false;
+
+ if (Ty->isDoubleTy() && (!Subtarget->hasVFP2() || Subtarget->isFPOnlySP()))
return false;
// Check to see if the 2nd operand is a constant that we can encode directly
@@ -1364,10 +1392,18 @@ bool ARMFastISel::ARMEmitCmp(const Value *Src1Value, const Value *Src2Value,
// TODO: Verify compares.
case MVT::f32:
isICmp = false;
- CmpOpc = UseImm ? ARM::VCMPEZS : ARM::VCMPES;
+      // Equality comparisons shouldn't raise Invalid on unordered inputs.
+ if (isEquality)
+ CmpOpc = UseImm ? ARM::VCMPZS : ARM::VCMPS;
+ else
+ CmpOpc = UseImm ? ARM::VCMPEZS : ARM::VCMPES;
break;
case MVT::f64:
isICmp = false;
+      // Equality comparisons shouldn't raise Invalid on unordered inputs.
+ if (isEquality)
+ CmpOpc = UseImm ? ARM::VCMPZD : ARM::VCMPD;
+ else
CmpOpc = UseImm ? ARM::VCMPEZD : ARM::VCMPED;
break;
case MVT::i1:
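
The VCMP-versus-VCMPE split above matters because IEEE 754 equality is a quiet predicate: comparing a quiet NaN with == must not raise Invalid, while ordered relational compares may; VCMPE implements the signaling variant. A small demonstration, assuming the compiler emits a quiet compare for == as the standard requires (the usual FENV_ACCESS caveats apply):

    #include <cfenv>
    #include <cmath>
    #include <cstdio>
    int main() {
      double x = std::nan(""), y = 1.0;
      std::feclearexcept(FE_INVALID);
      bool eq = (x == y); // quiet compare: false, and no Invalid raised
      std::printf("eq=%d invalid=%d\n", eq,
                  std::fetestexcept(FE_INVALID) != 0); // eq=0 invalid=0
      return 0;
    }
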
@@ -1444,7 +1480,8 @@ bool ARMFastISel::SelectCmp(const Instruction *I) {
if (ARMPred == ARMCC::AL) return false;
// Emit the compare.
- if (!ARMEmitCmp(CI->getOperand(0), CI->getOperand(1), CI->isUnsigned()))
+ if (!ARMEmitCmp(CI->getOperand(0), CI->getOperand(1), CI->isUnsigned(),
+ CI->isEquality()))
return false;
// Now set a register based on the comparison. Explicitly set the predicates
@@ -1466,7 +1503,7 @@ bool ARMFastISel::SelectCmp(const Instruction *I) {
bool ARMFastISel::SelectFPExt(const Instruction *I) {
// Make sure we have VFP and that we're extending float to double.
- if (!Subtarget->hasVFP2()) return false;
+ if (!Subtarget->hasVFP2() || Subtarget->isFPOnlySP()) return false;
Value *V = I->getOperand(0);
if (!I->getType()->isDoubleTy() ||
@@ -1485,7 +1522,7 @@ bool ARMFastISel::SelectFPExt(const Instruction *I) {
bool ARMFastISel::SelectFPTrunc(const Instruction *I) {
// Make sure we have VFP and that we're truncating double to float.
- if (!Subtarget->hasVFP2()) return false;
+ if (!Subtarget->hasVFP2() || Subtarget->isFPOnlySP()) return false;
Value *V = I->getOperand(0);
if (!(I->getType()->isFloatTy() &&
@@ -1536,7 +1573,8 @@ bool ARMFastISel::SelectIToFP(const Instruction *I, bool isSigned) {
unsigned Opc;
if (Ty->isFloatTy()) Opc = isSigned ? ARM::VSITOS : ARM::VUITOS;
- else if (Ty->isDoubleTy()) Opc = isSigned ? ARM::VSITOD : ARM::VUITOD;
+ else if (Ty->isDoubleTy() && !Subtarget->isFPOnlySP())
+ Opc = isSigned ? ARM::VSITOD : ARM::VUITOD;
else return false;
unsigned ResultReg = createResultReg(TLI.getRegClassFor(DstVT));
@@ -1561,7 +1599,8 @@ bool ARMFastISel::SelectFPToI(const Instruction *I, bool isSigned) {
unsigned Opc;
Type *OpTy = I->getOperand(0)->getType();
if (OpTy->isFloatTy()) Opc = isSigned ? ARM::VTOSIZS : ARM::VTOUIZS;
- else if (OpTy->isDoubleTy()) Opc = isSigned ? ARM::VTOSIZD : ARM::VTOUIZD;
+ else if (OpTy->isDoubleTy() && !Subtarget->isFPOnlySP())
+ Opc = isSigned ? ARM::VTOSIZD : ARM::VTOUIZD;
else return false;
// f64->s32/u32 or f32->s32/u32 both need an intermediate f32 reg.
@@ -1596,7 +1635,7 @@ bool ARMFastISel::SelectSelect(const Instruction *I) {
bool UseImm = false;
bool isNegativeImm = false;
if (const ConstantInt *ConstInt = dyn_cast<ConstantInt>(I->getOperand(2))) {
- assert (VT == MVT::i32 && "Expecting an i32.");
+ assert(VT == MVT::i32 && "Expecting an i32.");
Imm = (int)ConstInt->getValue().getZExtValue();
if (Imm < 0) {
isNegativeImm = true;
@@ -1765,8 +1804,9 @@ bool ARMFastISel::SelectBinaryFPOp(const Instruction *I, unsigned ISDOpcode) {
// if we have them.
// FIXME: It'd be nice to use NEON instructions.
Type *Ty = I->getType();
- bool isFloat = (Ty->isDoubleTy() || Ty->isFloatTy());
- if (isFloat && !Subtarget->hasVFP2())
+ if (Ty->isFloatTy() && !Subtarget->hasVFP2())
+ return false;
+ if (Ty->isDoubleTy() && (!Subtarget->hasVFP2() || Subtarget->isFPOnlySP()))
return false;
unsigned Opc;
@@ -1926,7 +1966,7 @@ bool ARMFastISel::ProcessCallArgs(SmallVectorImpl<Value*> &Args,
case CCValAssign::SExt: {
MVT DestVT = VA.getLocVT();
Arg = ARMEmitIntExt(ArgVT, Arg, DestVT, /*isZExt*/false);
- assert (Arg != 0 && "Failed to emit a sext");
+ assert(Arg != 0 && "Failed to emit a sext");
ArgVT = DestVT;
break;
}
@@ -1935,7 +1975,7 @@ bool ARMFastISel::ProcessCallArgs(SmallVectorImpl<Value*> &Args,
case CCValAssign::ZExt: {
MVT DestVT = VA.getLocVT();
Arg = ARMEmitIntExt(ArgVT, Arg, DestVT, /*isZExt*/true);
- assert (Arg != 0 && "Failed to emit a zext");
+ assert(Arg != 0 && "Failed to emit a zext");
ArgVT = DestVT;
break;
}
@@ -2230,7 +2270,7 @@ bool ARMFastISel::ARMEmitLibcall(const Instruction *I, RTLIB::Libcall Call) {
DbgLoc, TII.get(CallOpc));
// BL / BLX don't take a predicate, but tBL / tBLX do.
if (isThumb2)
- AddDefaultPred(MIB);
+ MIB.add(predOps(ARMCC::AL));
if (Subtarget->genLongCalls())
MIB.addReg(CalleeReg);
else
@@ -2311,19 +2351,19 @@ bool ARMFastISel::SelectCall(const Instruction *I,
break;
ISD::ArgFlagsTy Flags;
- unsigned AttrInd = i - CS.arg_begin() + 1;
- if (CS.paramHasAttr(AttrInd, Attribute::SExt))
+ unsigned ArgIdx = i - CS.arg_begin();
+ if (CS.paramHasAttr(ArgIdx, Attribute::SExt))
Flags.setSExt();
- if (CS.paramHasAttr(AttrInd, Attribute::ZExt))
+ if (CS.paramHasAttr(ArgIdx, Attribute::ZExt))
Flags.setZExt();
// FIXME: Only handle *easy* calls for now.
- if (CS.paramHasAttr(AttrInd, Attribute::InReg) ||
- CS.paramHasAttr(AttrInd, Attribute::StructRet) ||
- CS.paramHasAttr(AttrInd, Attribute::SwiftSelf) ||
- CS.paramHasAttr(AttrInd, Attribute::SwiftError) ||
- CS.paramHasAttr(AttrInd, Attribute::Nest) ||
- CS.paramHasAttr(AttrInd, Attribute::ByVal))
+ if (CS.paramHasAttr(ArgIdx, Attribute::InReg) ||
+ CS.paramHasAttr(ArgIdx, Attribute::StructRet) ||
+ CS.paramHasAttr(ArgIdx, Attribute::SwiftSelf) ||
+ CS.paramHasAttr(ArgIdx, Attribute::SwiftError) ||
+ CS.paramHasAttr(ArgIdx, Attribute::Nest) ||
+ CS.paramHasAttr(ArgIdx, Attribute::ByVal))
return false;
Type *ArgTy = (*i)->getType();
@@ -2373,7 +2413,7 @@ bool ARMFastISel::SelectCall(const Instruction *I,
// ARM calls don't take a predicate, but tBL / tBLX do.
if (isThumb2)
- AddDefaultPred(MIB);
+ MIB.add(predOps(ARMCC::AL));
if (UseReg)
MIB.addReg(CalleeReg);
else if (!IntrMemName)
@@ -2418,7 +2458,7 @@ bool ARMFastISel::ARMTryEmitSmallMemCpy(Address Dest, Address Src,
else if (Len >= 2)
VT = MVT::i16;
else {
- assert (Len == 1 && "Expected a length of 1!");
+ assert(Len == 1 && "Expected a length of 1!");
VT = MVT::i8;
}
} else {
@@ -2433,9 +2473,9 @@ bool ARMFastISel::ARMTryEmitSmallMemCpy(Address Dest, Address Src,
bool RV;
unsigned ResultReg;
RV = ARMEmitLoad(VT, ResultReg, Src);
- assert (RV == true && "Should be able to handle this load.");
+ assert(RV && "Should be able to handle this load.");
RV = ARMEmitStore(VT, ResultReg, Dest);
- assert (RV == true && "Should be able to handle this store.");
+ assert(RV && "Should be able to handle this store.");
(void)RV;
unsigned Size = VT.getSizeInBits()/8;
@@ -2687,9 +2727,11 @@ unsigned ARMFastISel::ARMEmitIntExt(MVT SrcVT, unsigned SrcReg, MVT DestVT,
if (setsCPSR)
MIB.addReg(ARM::CPSR, RegState::Define);
SrcReg = constrainOperandRegClass(TII.get(Opcode), SrcReg, 1 + setsCPSR);
- AddDefaultPred(MIB.addReg(SrcReg, isKill * RegState::Kill).addImm(ImmEnc));
+ MIB.addReg(SrcReg, isKill * RegState::Kill)
+ .addImm(ImmEnc)
+ .add(predOps(ARMCC::AL));
if (hasS)
- AddDefaultCC(MIB);
+ MIB.add(condCodeOp());
// Second instruction consumes the first's result.
SrcReg = ResultReg;
}
@@ -2779,7 +2821,6 @@ bool ARMFastISel::SelectShift(const Instruction *I,
// TODO: SoftFP support.
bool ARMFastISel::fastSelectInstruction(const Instruction *I) {
-
switch (I->getOpcode()) {
case Instruction::Load:
return SelectLoad(I);
@@ -2849,6 +2890,7 @@ bool ARMFastISel::fastSelectInstruction(const Instruction *I) {
}
namespace {
+
// This table describes sign- and zero-extend instructions which can be
// folded into a preceding load. All of these extends have an immediate
// (sometimes a mask and sometimes a shift) that's applied after
@@ -2865,7 +2907,8 @@ const struct FoldableLoadExtendsStruct {
{ { ARM::SXTB, ARM::t2SXTB }, 0, 0, MVT::i8 },
{ { ARM::UXTB, ARM::t2UXTB }, 0, 1, MVT::i8 }
};
-}
+
+} // end anonymous namespace
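
The FoldableLoadExtendsStruct table above drives tryToFoldLoadIntoMI (documented next). A standalone sketch of the shape of the lookup, not the LLVM code itself:

    #include <cstdint>
    // A zero/sign-extend whose only input is a freshly loaded vreg can
    // fold into the load when the extend's immediate matches the entry:
    //   ldrb r0, [..]; sxtb r1, r0  ==>  ldrsb r1, [..]
    struct FoldableLoadExtend {
      unsigned Opc[2];     // {ARM, Thumb2} extend opcode
      uint8_t ExpectedImm; // mask or shift the extend must carry
      uint8_t isZExt;
      int LoadType;        // stand-in for MVT::i8 / MVT::i16
    };
    bool matches(const FoldableLoadExtend &E, bool isThumb2, unsigned Opc,
                 uint64_t Imm, int LoadVT) {
      return E.Opc[isThumb2] == Opc && E.ExpectedImm == Imm &&
             E.LoadType == LoadVT;
    }
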
/// \brief The specified machine instr operand is a vreg, and that
/// vreg is being provided by the specified load instruction. If possible,
@@ -2933,7 +2976,7 @@ unsigned ARMFastISel::ARMLowerPICELF(const GlobalValue *GV,
.addConstantPoolIndex(Idx);
if (Opc == ARM::LDRcp)
MIB.addImm(0);
- AddDefaultPred(MIB);
+ MIB.add(predOps(ARMCC::AL));
// Fix the address by adding pc.
unsigned DestReg = createResultReg(TLI.getRegClassFor(VT));
@@ -2944,7 +2987,7 @@ unsigned ARMFastISel::ARMLowerPICELF(const GlobalValue *GV,
.addReg(TempReg)
.addImm(ARMPCLabelIndex);
if (!Subtarget->isThumb())
- AddDefaultPred(MIB);
+ MIB.add(predOps(ARMCC::AL));
if (UseGOT_PREL && Subtarget->isThumb()) {
unsigned NewDestReg = createResultReg(TLI.getRegClassFor(VT));
@@ -3010,7 +3053,6 @@ bool ARMFastISel::fastLowerArguments() {
}
}
-
static const MCPhysReg GPRArgRegs[] = {
ARM::R0, ARM::R1, ARM::R2, ARM::R3
};
@@ -3035,6 +3077,7 @@ bool ARMFastISel::fastLowerArguments() {
}
namespace llvm {
+
FastISel *ARM::createFastISel(FunctionLoweringInfo &funcInfo,
const TargetLibraryInfo *libInfo) {
if (funcInfo.MF->getSubtarget<ARMSubtarget>().useFastISel())
@@ -3042,4 +3085,5 @@ namespace llvm {
return nullptr;
}
-}
+
+} // end namespace llvm
diff --git a/lib/Target/ARM/ARMFeatures.h b/lib/Target/ARM/ARMFeatures.h
index 0c910ab6130f..8c0df4c2cbf9 100644
--- a/lib/Target/ARM/ARMFeatures.h
+++ b/lib/Target/ARM/ARMFeatures.h
@@ -19,10 +19,10 @@
namespace llvm {
template<typename InstrType> // could be MachineInstr or MCInst
-bool IsCPSRDead(InstrType *Instr);
+bool IsCPSRDead(const InstrType *Instr);
template<typename InstrType> // could be MachineInstr or MCInst
-inline bool isV8EligibleForIT(InstrType *Instr) {
+inline bool isV8EligibleForIT(const InstrType *Instr) {
switch (Instr->getOpcode()) {
default:
return false;
diff --git a/lib/Target/ARM/ARMFrameLowering.cpp b/lib/Target/ARM/ARMFrameLowering.cpp
index c72db8aca108..37be22bed540 100644
--- a/lib/Target/ARM/ARMFrameLowering.cpp
+++ b/lib/Target/ARM/ARMFrameLowering.cpp
@@ -16,19 +16,49 @@
#include "ARMBaseRegisterInfo.h"
#include "ARMConstantPoolValue.h"
#include "ARMMachineFunctionInfo.h"
+#include "ARMSubtarget.h"
#include "MCTargetDesc/ARMAddressingModes.h"
+#include "MCTargetDesc/ARMBaseInfo.h"
+#include "llvm/ADT/BitVector.h"
+#include "llvm/ADT/SmallPtrSet.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/ADT/STLExtras.h"
+#include "llvm/CodeGen/MachineBasicBlock.h"
+#include "llvm/CodeGen/MachineConstantPool.h"
#include "llvm/CodeGen/MachineFrameInfo.h"
#include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/CodeGen/MachineInstr.h"
#include "llvm/CodeGen/MachineInstrBuilder.h"
#include "llvm/CodeGen/MachineModuleInfo.h"
+#include "llvm/CodeGen/MachineOperand.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
#include "llvm/CodeGen/RegisterScavenging.h"
-#include "llvm/MC/MCAsmInfo.h"
+#include "llvm/IR/Attributes.h"
#include "llvm/IR/CallingConv.h"
+#include "llvm/IR/DebugLoc.h"
#include "llvm/IR/Function.h"
#include "llvm/MC/MCContext.h"
+#include "llvm/MC/MCDwarf.h"
+#include "llvm/MC/MCRegisterInfo.h"
+#include "llvm/Support/CodeGen.h"
#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/Compiler.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/MathExtras.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/Target/TargetInstrInfo.h"
+#include "llvm/Target/TargetMachine.h"
#include "llvm/Target/TargetOptions.h"
+#include "llvm/Target/TargetRegisterInfo.h"
+#include "llvm/Target/TargetSubtargetInfo.h"
+#include <algorithm>
+#include <cassert>
+#include <cstddef>
+#include <cstdint>
+#include <iterator>
+#include <utility>
+#include <vector>
#define DEBUG_TYPE "arm-frame-lowering"
@@ -180,6 +210,7 @@ static bool WindowsRequiresStackProbe(const MachineFunction &MF,
}
namespace {
+
struct StackAdjustingInsts {
struct InstInfo {
MachineBasicBlock::iterator I;
@@ -196,7 +227,8 @@ struct StackAdjustingInsts {
}
void addExtraBytes(const MachineBasicBlock::iterator I, unsigned ExtraBytes) {
- auto Info = find_if(Insts, [&](InstInfo &Info) { return Info.I == I; });
+ auto Info =
+ llvm::find_if(Insts, [&](InstInfo &Info) { return Info.I == I; });
assert(Info != Insts.end() && "invalid sp adjusting instruction");
Info->SPAdjust += ExtraBytes;
}
@@ -219,7 +251,8 @@ struct StackAdjustingInsts {
}
}
};
-}
+
+} // end anonymous namespace
/// Emit an instruction sequence that will align the address in
/// register Reg by zero-ing out the lower bits. For versions of the
@@ -252,35 +285,40 @@ static void emitAligningInstructions(MachineFunction &MF, ARMFunctionInfo *AFI,
// lsr Reg, Reg, log2(Alignment)
// lsl Reg, Reg, log2(Alignment)
if (CanUseBFC) {
- AddDefaultPred(BuildMI(MBB, MBBI, DL, TII.get(ARM::BFC), Reg)
- .addReg(Reg, RegState::Kill)
- .addImm(~AlignMask));
+ BuildMI(MBB, MBBI, DL, TII.get(ARM::BFC), Reg)
+ .addReg(Reg, RegState::Kill)
+ .addImm(~AlignMask)
+ .add(predOps(ARMCC::AL));
} else if (AlignMask <= 255) {
- AddDefaultCC(
- AddDefaultPred(BuildMI(MBB, MBBI, DL, TII.get(ARM::BICri), Reg)
- .addReg(Reg, RegState::Kill)
- .addImm(AlignMask)));
+ BuildMI(MBB, MBBI, DL, TII.get(ARM::BICri), Reg)
+ .addReg(Reg, RegState::Kill)
+ .addImm(AlignMask)
+ .add(predOps(ARMCC::AL))
+ .add(condCodeOp());
} else {
assert(!MustBeSingleInstruction &&
"Shouldn't call emitAligningInstructions demanding a single "
"instruction to be emitted for large stack alignment for a target "
"without BFC.");
- AddDefaultCC(AddDefaultPred(
- BuildMI(MBB, MBBI, DL, TII.get(ARM::MOVsi), Reg)
- .addReg(Reg, RegState::Kill)
- .addImm(ARM_AM::getSORegOpc(ARM_AM::lsr, NrBitsToZero))));
- AddDefaultCC(AddDefaultPred(
- BuildMI(MBB, MBBI, DL, TII.get(ARM::MOVsi), Reg)
- .addReg(Reg, RegState::Kill)
- .addImm(ARM_AM::getSORegOpc(ARM_AM::lsl, NrBitsToZero))));
+ BuildMI(MBB, MBBI, DL, TII.get(ARM::MOVsi), Reg)
+ .addReg(Reg, RegState::Kill)
+ .addImm(ARM_AM::getSORegOpc(ARM_AM::lsr, NrBitsToZero))
+ .add(predOps(ARMCC::AL))
+ .add(condCodeOp());
+ BuildMI(MBB, MBBI, DL, TII.get(ARM::MOVsi), Reg)
+ .addReg(Reg, RegState::Kill)
+ .addImm(ARM_AM::getSORegOpc(ARM_AM::lsl, NrBitsToZero))
+ .add(predOps(ARMCC::AL))
+ .add(condCodeOp());
}
} else {
// Since this is only reached for Thumb-2 targets, the BFC instruction
// should always be available.
assert(CanUseBFC);
- AddDefaultPred(BuildMI(MBB, MBBI, DL, TII.get(ARM::t2BFC), Reg)
- .addReg(Reg, RegState::Kill)
- .addImm(~AlignMask));
+ BuildMI(MBB, MBBI, DL, TII.get(ARM::t2BFC), Reg)
+ .addReg(Reg, RegState::Kill)
+ .addImm(~AlignMask)
+ .add(predOps(ARMCC::AL));
}
}
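
All three sequences emitted by emitAligningInstructions above compute the same thing: clear the low log2(Alignment) bits of the register. In plain C++:

    #include <cassert>
    // BFC clears the bit-field directly; BIC works when the mask encodes
    // as an ARM modified immediate (<= 255 here); lsr/lsl shifts the low
    // bits out and back in. All are equivalent to Reg & ~AlignMask.
    unsigned alignDown(unsigned Reg, unsigned Alignment) {
      assert(Alignment && (Alignment & (Alignment - 1)) == 0 &&
             "Alignment must be a power of two");
      unsigned AlignMask = Alignment - 1;
      return Reg & ~AlignMask;
    }
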
@@ -448,9 +486,10 @@ void ARMFrameLowering::emitPrologue(MachineFunction &MF,
uint32_t NumWords = NumBytes >> 2;
if (NumWords < 65536)
- AddDefaultPred(BuildMI(MBB, MBBI, dl, TII.get(ARM::t2MOVi16), ARM::R4)
- .addImm(NumWords)
- .setMIFlags(MachineInstr::FrameSetup));
+ BuildMI(MBB, MBBI, dl, TII.get(ARM::t2MOVi16), ARM::R4)
+ .addImm(NumWords)
+ .setMIFlags(MachineInstr::FrameSetup)
+ .add(predOps(ARMCC::AL));
else
BuildMI(MBB, MBBI, dl, TII.get(ARM::t2MOVi32imm), ARM::R4)
.addImm(NumWords)
@@ -462,10 +501,10 @@ void ARMFrameLowering::emitPrologue(MachineFunction &MF,
case CodeModel::Default:
case CodeModel::Kernel:
BuildMI(MBB, MBBI, dl, TII.get(ARM::tBL))
- .addImm((unsigned)ARMCC::AL).addReg(0)
- .addExternalSymbol("__chkstk")
- .addReg(ARM::R4, RegState::Implicit)
- .setMIFlags(MachineInstr::FrameSetup);
+ .add(predOps(ARMCC::AL))
+ .addExternalSymbol("__chkstk")
+ .addReg(ARM::R4, RegState::Implicit)
+ .setMIFlags(MachineInstr::FrameSetup);
break;
case CodeModel::Large:
case CodeModel::JITDefault:
@@ -474,18 +513,19 @@ void ARMFrameLowering::emitPrologue(MachineFunction &MF,
.setMIFlags(MachineInstr::FrameSetup);
BuildMI(MBB, MBBI, dl, TII.get(ARM::tBLXr))
- .addImm((unsigned)ARMCC::AL).addReg(0)
- .addReg(ARM::R12, RegState::Kill)
- .addReg(ARM::R4, RegState::Implicit)
- .setMIFlags(MachineInstr::FrameSetup);
+ .add(predOps(ARMCC::AL))
+ .addReg(ARM::R12, RegState::Kill)
+ .addReg(ARM::R4, RegState::Implicit)
+ .setMIFlags(MachineInstr::FrameSetup);
break;
}
- AddDefaultCC(AddDefaultPred(BuildMI(MBB, MBBI, dl, TII.get(ARM::t2SUBrr),
- ARM::SP)
- .addReg(ARM::SP, RegState::Kill)
- .addReg(ARM::R4, RegState::Kill)
- .setMIFlags(MachineInstr::FrameSetup)));
+ BuildMI(MBB, MBBI, dl, TII.get(ARM::t2SUBrr), ARM::SP)
+ .addReg(ARM::SP, RegState::Kill)
+ .addReg(ARM::R4, RegState::Kill)
+ .setMIFlags(MachineInstr::FrameSetup)
+ .add(predOps(ARMCC::AL))
+ .add(condCodeOp());
NumBytes = 0;
}
@@ -657,12 +697,14 @@ void ARMFrameLowering::emitPrologue(MachineFunction &MF,
// -- out lower bits in r4
// mov sp, r4
// FIXME: It would be better just to find a spare register here.
- AddDefaultPred(BuildMI(MBB, MBBI, dl, TII.get(ARM::tMOVr), ARM::R4)
- .addReg(ARM::SP, RegState::Kill));
+ BuildMI(MBB, MBBI, dl, TII.get(ARM::tMOVr), ARM::R4)
+ .addReg(ARM::SP, RegState::Kill)
+ .add(predOps(ARMCC::AL));
emitAligningInstructions(MF, AFI, TII, MBB, MBBI, dl, ARM::R4, MaxAlign,
false);
- AddDefaultPred(BuildMI(MBB, MBBI, dl, TII.get(ARM::tMOVr), ARM::SP)
- .addReg(ARM::R4, RegState::Kill));
+ BuildMI(MBB, MBBI, dl, TII.get(ARM::tMOVr), ARM::SP)
+ .addReg(ARM::R4, RegState::Kill)
+ .add(predOps(ARMCC::AL));
}
AFI->setShouldRestoreSPFromFP(true);
@@ -675,14 +717,14 @@ void ARMFrameLowering::emitPrologue(MachineFunction &MF,
// FIXME: Clarify FrameSetup flags here.
if (RegInfo->hasBasePointer(MF)) {
if (isARM)
- BuildMI(MBB, MBBI, dl,
- TII.get(ARM::MOVr), RegInfo->getBaseRegister())
- .addReg(ARM::SP)
- .addImm((unsigned)ARMCC::AL).addReg(0).addReg(0);
+ BuildMI(MBB, MBBI, dl, TII.get(ARM::MOVr), RegInfo->getBaseRegister())
+ .addReg(ARM::SP)
+ .add(predOps(ARMCC::AL))
+ .add(condCodeOp());
else
- AddDefaultPred(BuildMI(MBB, MBBI, dl, TII.get(ARM::tMOVr),
- RegInfo->getBaseRegister())
- .addReg(ARM::SP));
+ BuildMI(MBB, MBBI, dl, TII.get(ARM::tMOVr), RegInfo->getBaseRegister())
+ .addReg(ARM::SP)
+ .add(predOps(ARMCC::AL));
}
// If the frame has variable sized objects then the epilogue must restore
@@ -757,19 +799,21 @@ void ARMFrameLowering::emitEpilogue(MachineFunction &MF,
"No scratch register to restore SP from FP!");
emitT2RegPlusImmediate(MBB, MBBI, dl, ARM::R4, FramePtr, -NumBytes,
ARMCC::AL, 0, TII);
- AddDefaultPred(BuildMI(MBB, MBBI, dl, TII.get(ARM::tMOVr),
- ARM::SP)
- .addReg(ARM::R4));
+ BuildMI(MBB, MBBI, dl, TII.get(ARM::tMOVr), ARM::SP)
+ .addReg(ARM::R4)
+ .add(predOps(ARMCC::AL));
}
} else {
// Thumb2 or ARM.
if (isARM)
BuildMI(MBB, MBBI, dl, TII.get(ARM::MOVr), ARM::SP)
- .addReg(FramePtr).addImm((unsigned)ARMCC::AL).addReg(0).addReg(0);
+ .addReg(FramePtr)
+ .add(predOps(ARMCC::AL))
+ .add(condCodeOp());
else
- AddDefaultPred(BuildMI(MBB, MBBI, dl, TII.get(ARM::tMOVr),
- ARM::SP)
- .addReg(FramePtr));
+ BuildMI(MBB, MBBI, dl, TII.get(ARM::tMOVr), ARM::SP)
+ .addReg(FramePtr)
+ .add(predOps(ARMCC::AL));
}
} else if (NumBytes &&
!tryFoldSPUpdateIntoPushPop(STI, MF, &*MBBI, NumBytes))
@@ -829,7 +873,7 @@ ARMFrameLowering::ResolveFrameIndexReference(const MachineFunction &MF,
// When dynamically realigning the stack, use the frame pointer for
// parameters, and the stack/base pointer for locals.
if (RegInfo->needsStackRealignment(MF)) {
- assert (hasFP(MF) && "dynamic stack realignment without a FP!");
+ assert(hasFP(MF) && "dynamic stack realignment without a FP!");
if (isFixed) {
FrameReg = RegInfo->getFrameRegister(MF);
Offset = FPOffset;
@@ -936,18 +980,19 @@ void ARMFrameLowering::emitPushInst(MachineBasicBlock &MBB,
});
if (Regs.size() > 1 || StrOpc== 0) {
- MachineInstrBuilder MIB =
- AddDefaultPred(BuildMI(MBB, MI, DL, TII.get(StmOpc), ARM::SP)
- .addReg(ARM::SP).setMIFlags(MIFlags));
+ MachineInstrBuilder MIB = BuildMI(MBB, MI, DL, TII.get(StmOpc), ARM::SP)
+ .addReg(ARM::SP)
+ .setMIFlags(MIFlags)
+ .add(predOps(ARMCC::AL));
for (unsigned i = 0, e = Regs.size(); i < e; ++i)
MIB.addReg(Regs[i].first, getKillRegState(Regs[i].second));
} else if (Regs.size() == 1) {
- MachineInstrBuilder MIB = BuildMI(MBB, MI, DL, TII.get(StrOpc),
- ARM::SP)
- .addReg(Regs[0].first, getKillRegState(Regs[0].second))
- .addReg(ARM::SP).setMIFlags(MIFlags)
- .addImm(-4);
- AddDefaultPred(MIB);
+ BuildMI(MBB, MI, DL, TII.get(StrOpc), ARM::SP)
+ .addReg(Regs[0].first, getKillRegState(Regs[0].second))
+ .addReg(ARM::SP)
+ .setMIFlags(MIFlags)
+ .addImm(-4)
+ .add(predOps(ARMCC::AL));
}
Regs.clear();
@@ -1027,9 +1072,9 @@ void ARMFrameLowering::emitPopInst(MachineBasicBlock &MBB,
});
if (Regs.size() > 1 || LdrOpc == 0) {
- MachineInstrBuilder MIB =
- AddDefaultPred(BuildMI(MBB, MI, DL, TII.get(LdmOpc), ARM::SP)
- .addReg(ARM::SP));
+ MachineInstrBuilder MIB = BuildMI(MBB, MI, DL, TII.get(LdmOpc), ARM::SP)
+ .addReg(ARM::SP)
+ .add(predOps(ARMCC::AL));
for (unsigned i = 0, e = Regs.size(); i < e; ++i)
MIB.addReg(Regs[i], getDefRegState(true));
if (DeleteRet && MI != MBB.end()) {
@@ -1053,7 +1098,7 @@ void ARMFrameLowering::emitPopInst(MachineBasicBlock &MBB,
MIB.addImm(ARM_AM::getAM2Opc(ARM_AM::add, 4, ARM_AM::no_shift));
} else
MIB.addImm(4);
- AddDefaultPred(MIB);
+ MIB.add(predOps(ARMCC::AL));
}
Regs.clear();
@@ -1114,9 +1159,11 @@ static void emitAlignedDPRCS2Spills(MachineBasicBlock &MBB,
// sub r4, sp, #numregs * 8
// The immediate is <= 64, so it doesn't need any special encoding.
unsigned Opc = isThumb ? ARM::t2SUBri : ARM::SUBri;
- AddDefaultCC(AddDefaultPred(BuildMI(MBB, MI, DL, TII.get(Opc), ARM::R4)
- .addReg(ARM::SP)
- .addImm(8 * NumAlignedDPRCS2Regs)));
+ BuildMI(MBB, MI, DL, TII.get(Opc), ARM::R4)
+ .addReg(ARM::SP)
+ .addImm(8 * NumAlignedDPRCS2Regs)
+ .add(predOps(ARMCC::AL))
+ .add(condCodeOp());
unsigned MaxAlign = MF.getFrameInfo().getMaxAlignment();
// We must set parameter MustBeSingleInstruction to true, since
@@ -1132,10 +1179,10 @@ static void emitAlignedDPRCS2Spills(MachineBasicBlock &MBB,
// Leave r4 live, it is used below.
Opc = isThumb ? ARM::tMOVr : ARM::MOVr;
MachineInstrBuilder MIB = BuildMI(MBB, MI, DL, TII.get(Opc), ARM::SP)
- .addReg(ARM::R4);
- MIB = AddDefaultPred(MIB);
+ .addReg(ARM::R4)
+ .add(predOps(ARMCC::AL));
if (!isThumb)
- AddDefaultCC(MIB);
+ MIB.add(condCodeOp());
// Now spill NumAlignedDPRCS2Regs registers starting from d8.
// r4 holds the stack slot address.
@@ -1147,11 +1194,12 @@ static void emitAlignedDPRCS2Spills(MachineBasicBlock &MBB,
unsigned SupReg = TRI->getMatchingSuperReg(NextReg, ARM::dsub_0,
&ARM::QQPRRegClass);
MBB.addLiveIn(SupReg);
- AddDefaultPred(BuildMI(MBB, MI, DL, TII.get(ARM::VST1d64Qwb_fixed),
- ARM::R4)
- .addReg(ARM::R4, RegState::Kill).addImm(16)
- .addReg(NextReg)
- .addReg(SupReg, RegState::ImplicitKill));
+ BuildMI(MBB, MI, DL, TII.get(ARM::VST1d64Qwb_fixed), ARM::R4)
+ .addReg(ARM::R4, RegState::Kill)
+ .addImm(16)
+ .addReg(NextReg)
+ .addReg(SupReg, RegState::ImplicitKill)
+ .add(predOps(ARMCC::AL));
NextReg += 4;
NumAlignedDPRCS2Regs -= 4;
}
@@ -1165,9 +1213,12 @@ static void emitAlignedDPRCS2Spills(MachineBasicBlock &MBB,
unsigned SupReg = TRI->getMatchingSuperReg(NextReg, ARM::dsub_0,
&ARM::QQPRRegClass);
MBB.addLiveIn(SupReg);
- AddDefaultPred(BuildMI(MBB, MI, DL, TII.get(ARM::VST1d64Q))
- .addReg(ARM::R4).addImm(16).addReg(NextReg)
- .addReg(SupReg, RegState::ImplicitKill));
+ BuildMI(MBB, MI, DL, TII.get(ARM::VST1d64Q))
+ .addReg(ARM::R4)
+ .addImm(16)
+ .addReg(NextReg)
+ .addReg(SupReg, RegState::ImplicitKill)
+ .add(predOps(ARMCC::AL));
NextReg += 4;
NumAlignedDPRCS2Regs -= 4;
}
@@ -1177,8 +1228,11 @@ static void emitAlignedDPRCS2Spills(MachineBasicBlock &MBB,
unsigned SupReg = TRI->getMatchingSuperReg(NextReg, ARM::dsub_0,
&ARM::QPRRegClass);
MBB.addLiveIn(SupReg);
- AddDefaultPred(BuildMI(MBB, MI, DL, TII.get(ARM::VST1q64))
- .addReg(ARM::R4).addImm(16).addReg(SupReg));
+ BuildMI(MBB, MI, DL, TII.get(ARM::VST1q64))
+ .addReg(ARM::R4)
+ .addImm(16)
+ .addReg(SupReg)
+ .add(predOps(ARMCC::AL));
NextReg += 2;
NumAlignedDPRCS2Regs -= 2;
}
@@ -1187,9 +1241,11 @@ static void emitAlignedDPRCS2Spills(MachineBasicBlock &MBB,
if (NumAlignedDPRCS2Regs) {
MBB.addLiveIn(NextReg);
// vstr.64 uses addrmode5 which has an offset scale of 4.
- AddDefaultPred(BuildMI(MBB, MI, DL, TII.get(ARM::VSTRD))
- .addReg(NextReg)
- .addReg(ARM::R4).addImm((NextReg-R4BaseReg)*2));
+ BuildMI(MBB, MI, DL, TII.get(ARM::VSTRD))
+ .addReg(NextReg)
+ .addReg(ARM::R4)
+ .addImm((NextReg - R4BaseReg) * 2)
+ .add(predOps(ARMCC::AL));
}
// The last spill instruction inserted should kill the scratch register r4.
@@ -1254,8 +1310,11 @@ static void emitAlignedDPRCS2Restores(MachineBasicBlock &MBB,
assert(!AFI->isThumb1OnlyFunction() && "Can't realign stack for thumb1");
unsigned Opc = isThumb ? ARM::t2ADDri : ARM::ADDri;
- AddDefaultCC(AddDefaultPred(BuildMI(MBB, MI, DL, TII.get(Opc), ARM::R4)
- .addFrameIndex(D8SpillFI).addImm(0)));
+ BuildMI(MBB, MI, DL, TII.get(Opc), ARM::R4)
+ .addFrameIndex(D8SpillFI)
+ .addImm(0)
+ .add(predOps(ARMCC::AL))
+ .add(condCodeOp());
// Now restore NumAlignedDPRCS2Regs registers starting from d8.
unsigned NextReg = ARM::D8;
@@ -1264,10 +1323,12 @@ static void emitAlignedDPRCS2Restores(MachineBasicBlock &MBB,
if (NumAlignedDPRCS2Regs >= 6) {
unsigned SupReg = TRI->getMatchingSuperReg(NextReg, ARM::dsub_0,
&ARM::QQPRRegClass);
- AddDefaultPred(BuildMI(MBB, MI, DL, TII.get(ARM::VLD1d64Qwb_fixed), NextReg)
- .addReg(ARM::R4, RegState::Define)
- .addReg(ARM::R4, RegState::Kill).addImm(16)
- .addReg(SupReg, RegState::ImplicitDefine));
+ BuildMI(MBB, MI, DL, TII.get(ARM::VLD1d64Qwb_fixed), NextReg)
+ .addReg(ARM::R4, RegState::Define)
+ .addReg(ARM::R4, RegState::Kill)
+ .addImm(16)
+ .addReg(SupReg, RegState::ImplicitDefine)
+ .add(predOps(ARMCC::AL));
NextReg += 4;
NumAlignedDPRCS2Regs -= 4;
}
@@ -1280,9 +1341,11 @@ static void emitAlignedDPRCS2Restores(MachineBasicBlock &MBB,
if (NumAlignedDPRCS2Regs >= 4) {
unsigned SupReg = TRI->getMatchingSuperReg(NextReg, ARM::dsub_0,
&ARM::QQPRRegClass);
- AddDefaultPred(BuildMI(MBB, MI, DL, TII.get(ARM::VLD1d64Q), NextReg)
- .addReg(ARM::R4).addImm(16)
- .addReg(SupReg, RegState::ImplicitDefine));
+ BuildMI(MBB, MI, DL, TII.get(ARM::VLD1d64Q), NextReg)
+ .addReg(ARM::R4)
+ .addImm(16)
+ .addReg(SupReg, RegState::ImplicitDefine)
+ .add(predOps(ARMCC::AL));
NextReg += 4;
NumAlignedDPRCS2Regs -= 4;
}
@@ -1291,16 +1354,20 @@ static void emitAlignedDPRCS2Restores(MachineBasicBlock &MBB,
if (NumAlignedDPRCS2Regs >= 2) {
unsigned SupReg = TRI->getMatchingSuperReg(NextReg, ARM::dsub_0,
&ARM::QPRRegClass);
- AddDefaultPred(BuildMI(MBB, MI, DL, TII.get(ARM::VLD1q64), SupReg)
- .addReg(ARM::R4).addImm(16));
+ BuildMI(MBB, MI, DL, TII.get(ARM::VLD1q64), SupReg)
+ .addReg(ARM::R4)
+ .addImm(16)
+ .add(predOps(ARMCC::AL));
NextReg += 2;
NumAlignedDPRCS2Regs -= 2;
}
// Finally, use a vanilla vldr.64 for the remaining odd register.
if (NumAlignedDPRCS2Regs)
- AddDefaultPred(BuildMI(MBB, MI, DL, TII.get(ARM::VLDRD), NextReg)
- .addReg(ARM::R4).addImm(2*(NextReg-R4BaseReg)));
+ BuildMI(MBB, MI, DL, TII.get(ARM::VLDRD), NextReg)
+ .addReg(ARM::R4)
+ .addImm(2 * (NextReg - R4BaseReg))
+ .add(predOps(ARMCC::AL));
// Last store kills r4.
std::prev(MI)->addRegisterKilled(ARM::R4, TRI);
@@ -1633,13 +1700,14 @@ void ARMFrameLowering::determineCalleeSaves(MachineFunction &MF,
// worth the effort and added fragility?
unsigned EstimatedStackSize =
MFI.estimateStackSize(MF) + 4 * (NumGPRSpills + NumFPRSpills);
- if (hasFP(MF)) {
+ bool HasFP = hasFP(MF);
+ if (HasFP) {
if (AFI->hasStackFrame())
EstimatedStackSize += 4;
} else {
// If FP is not used, SP will be used to access arguments, so count the
// size of arguments into the estimation.
- EstimatedStackSize += MF.getInfo<ARMFunctionInfo>()->getArgumentStackSize();
+ EstimatedStackSize += AFI->getArgumentStackSize();
}
EstimatedStackSize += 16; // For possible paddings.
@@ -1650,7 +1718,7 @@ void ARMFrameLowering::determineCalleeSaves(MachineFunction &MF,
if (BigStack || !CanEliminateFrame || RegInfo->cannotEliminateFrame(MF)) {
AFI->setHasStackFrame(true);
- if (hasFP(MF)) {
+ if (HasFP) {
SavedRegs.set(FramePtr);
// If the frame pointer is required by the ABI, also spill LR so that we
// emit a complete frame record.
@@ -1658,11 +1726,11 @@ void ARMFrameLowering::determineCalleeSaves(MachineFunction &MF,
SavedRegs.set(ARM::LR);
LRSpilled = true;
NumGPRSpills++;
- auto LRPos = find(UnspilledCS1GPRs, ARM::LR);
+ auto LRPos = llvm::find(UnspilledCS1GPRs, ARM::LR);
if (LRPos != UnspilledCS1GPRs.end())
UnspilledCS1GPRs.erase(LRPos);
}
- auto FPPos = find(UnspilledCS1GPRs, FramePtr);
+ auto FPPos = llvm::find(UnspilledCS1GPRs, FramePtr);
if (FPPos != UnspilledCS1GPRs.end())
UnspilledCS1GPRs.erase(FPPos);
NumGPRSpills++;
@@ -1721,7 +1789,7 @@ void ARMFrameLowering::determineCalleeSaves(MachineFunction &MF,
}
// r7 can be used if it is not being used as the frame pointer.
- if (!hasFP(MF)) {
+ if (!HasFP) {
if (SavedRegs.test(ARM::R7)) {
--RegDeficit;
DEBUG(dbgs() << "%R7 is saved low register, RegDeficit = "
@@ -1773,7 +1841,7 @@ void ARMFrameLowering::determineCalleeSaves(MachineFunction &MF,
NumGPRSpills++;
CS1Spilled = true;
ExtraCSSpill = true;
- UnspilledCS1GPRs.erase(find(UnspilledCS1GPRs, Reg));
+ UnspilledCS1GPRs.erase(llvm::find(UnspilledCS1GPRs, Reg));
if (Reg == ARM::LR)
LRSpilled = true;
}
@@ -1786,7 +1854,7 @@ void ARMFrameLowering::determineCalleeSaves(MachineFunction &MF,
SavedRegs.set(ARM::LR);
NumGPRSpills++;
SmallVectorImpl<unsigned>::iterator LRPos;
- LRPos = find(UnspilledCS1GPRs, (unsigned)ARM::LR);
+ LRPos = llvm::find(UnspilledCS1GPRs, (unsigned)ARM::LR);
if (LRPos != UnspilledCS1GPRs.end())
UnspilledCS1GPRs.erase(LRPos);
@@ -2081,12 +2149,17 @@ void ARMFrameLowering::adjustForSegmentedStacks(
// SR1: Scratch Register #1
// push {SR0, SR1}
if (Thumb) {
- AddDefaultPred(BuildMI(PrevStackMBB, DL, TII.get(ARM::tPUSH)))
- .addReg(ScratchReg0).addReg(ScratchReg1);
+ BuildMI(PrevStackMBB, DL, TII.get(ARM::tPUSH))
+ .add(predOps(ARMCC::AL))
+ .addReg(ScratchReg0)
+ .addReg(ScratchReg1);
} else {
- AddDefaultPred(BuildMI(PrevStackMBB, DL, TII.get(ARM::STMDB_UPD))
- .addReg(ARM::SP, RegState::Define).addReg(ARM::SP))
- .addReg(ScratchReg0).addReg(ScratchReg1);
+ BuildMI(PrevStackMBB, DL, TII.get(ARM::STMDB_UPD))
+ .addReg(ARM::SP, RegState::Define)
+ .addReg(ARM::SP)
+ .add(predOps(ARMCC::AL))
+ .addReg(ScratchReg0)
+ .addReg(ScratchReg1);
}
// Emit the relevant DWARF information about the change in stack pointer as
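
The segmented-stack prologue assembled across these hunks reduces to a simple limit check. Rough semantics, with a hypothetical helper standing in for the emitted PrevStackMBB/McrMBB/GetMBB instructions:

    #include <cstddef>
    #include <cstdint>
    // SR1 = sp - AlignedStackSize; SR0 = *stack_limit; cmp SR0, SR1
    bool needsMoreStack(uintptr_t SP, uintptr_t StackLimit, size_t Size) {
      return SP - Size < StackLimit; // if so, AllocMBB calls __morestack
    }
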
@@ -2106,21 +2179,29 @@ void ARMFrameLowering::adjustForSegmentedStacks(
// mov SR1, sp
if (Thumb) {
- AddDefaultPred(BuildMI(McrMBB, DL, TII.get(ARM::tMOVr), ScratchReg1)
- .addReg(ARM::SP));
+ BuildMI(McrMBB, DL, TII.get(ARM::tMOVr), ScratchReg1)
+ .addReg(ARM::SP)
+ .add(predOps(ARMCC::AL));
} else if (CompareStackPointer) {
- AddDefaultPred(BuildMI(McrMBB, DL, TII.get(ARM::MOVr), ScratchReg1)
- .addReg(ARM::SP)).addReg(0);
+ BuildMI(McrMBB, DL, TII.get(ARM::MOVr), ScratchReg1)
+ .addReg(ARM::SP)
+ .add(predOps(ARMCC::AL))
+ .add(condCodeOp());
}
// sub SR1, sp, #StackSize
if (!CompareStackPointer && Thumb) {
- AddDefaultPred(
- AddDefaultCC(BuildMI(McrMBB, DL, TII.get(ARM::tSUBi8), ScratchReg1))
- .addReg(ScratchReg1).addImm(AlignedStackSize));
+ BuildMI(McrMBB, DL, TII.get(ARM::tSUBi8), ScratchReg1)
+ .add(condCodeOp())
+ .addReg(ScratchReg1)
+ .addImm(AlignedStackSize)
+ .add(predOps(ARMCC::AL));
} else if (!CompareStackPointer) {
- AddDefaultPred(BuildMI(McrMBB, DL, TII.get(ARM::SUBri), ScratchReg1)
- .addReg(ARM::SP).addImm(AlignedStackSize)).addReg(0);
+ BuildMI(McrMBB, DL, TII.get(ARM::SUBri), ScratchReg1)
+ .addReg(ARM::SP)
+ .addImm(AlignedStackSize)
+ .add(predOps(ARMCC::AL))
+ .add(condCodeOp());
}
if (Thumb && ST->isThumb1Only()) {
@@ -2131,21 +2212,25 @@ void ARMFrameLowering::adjustForSegmentedStacks(
unsigned CPI = MCP->getConstantPoolIndex(NewCPV, 4);
// ldr SR0, [pc, offset(STACK_LIMIT)]
- AddDefaultPred(BuildMI(GetMBB, DL, TII.get(ARM::tLDRpci), ScratchReg0)
- .addConstantPoolIndex(CPI));
+ BuildMI(GetMBB, DL, TII.get(ARM::tLDRpci), ScratchReg0)
+ .addConstantPoolIndex(CPI)
+ .add(predOps(ARMCC::AL));
// ldr SR0, [SR0]
- AddDefaultPred(BuildMI(GetMBB, DL, TII.get(ARM::tLDRi), ScratchReg0)
- .addReg(ScratchReg0).addImm(0));
+ BuildMI(GetMBB, DL, TII.get(ARM::tLDRi), ScratchReg0)
+ .addReg(ScratchReg0)
+ .addImm(0)
+ .add(predOps(ARMCC::AL));
} else {
// Get TLS base address from the coprocessor
// mrc p15, #0, SR0, c13, c0, #3
- AddDefaultPred(BuildMI(McrMBB, DL, TII.get(ARM::MRC), ScratchReg0)
- .addImm(15)
- .addImm(0)
- .addImm(13)
- .addImm(0)
- .addImm(3));
+ BuildMI(McrMBB, DL, TII.get(ARM::MRC), ScratchReg0)
+ .addImm(15)
+ .addImm(0)
+ .addImm(13)
+ .addImm(0)
+ .addImm(3)
+ .add(predOps(ARMCC::AL));
// Use the last tls slot on android and a private field of the TCB on linux.
assert(ST->isTargetAndroid() || ST->isTargetLinux());
@@ -2153,16 +2238,19 @@ void ARMFrameLowering::adjustForSegmentedStacks(
// Get the stack limit from the right offset
// ldr SR0, [sr0, #4 * TlsOffset]
- AddDefaultPred(BuildMI(GetMBB, DL, TII.get(ARM::LDRi12), ScratchReg0)
- .addReg(ScratchReg0).addImm(4 * TlsOffset));
+ BuildMI(GetMBB, DL, TII.get(ARM::LDRi12), ScratchReg0)
+ .addReg(ScratchReg0)
+ .addImm(4 * TlsOffset)
+ .add(predOps(ARMCC::AL));
}
// Compare stack limit with stack size requested.
// cmp SR0, SR1
Opcode = Thumb ? ARM::tCMPr : ARM::CMPrr;
- AddDefaultPred(BuildMI(GetMBB, DL, TII.get(Opcode))
- .addReg(ScratchReg0)
- .addReg(ScratchReg1));
+ BuildMI(GetMBB, DL, TII.get(Opcode))
+ .addReg(ScratchReg0)
+ .addReg(ScratchReg1)
+ .add(predOps(ARMCC::AL));
// This jump is taken if StackLimit < SP - stack required.
Opcode = Thumb ? ARM::tBcc : ARM::Bcc;
@@ -2178,32 +2266,40 @@ void ARMFrameLowering::adjustForSegmentedStacks(
// Pass the first argument to __morestack in Scratch Register #0:
// the amount of stack required.
if (Thumb) {
- AddDefaultPred(AddDefaultCC(BuildMI(AllocMBB, DL, TII.get(ARM::tMOVi8),
- ScratchReg0)).addImm(AlignedStackSize));
+ BuildMI(AllocMBB, DL, TII.get(ARM::tMOVi8), ScratchReg0)
+ .add(condCodeOp())
+ .addImm(AlignedStackSize)
+ .add(predOps(ARMCC::AL));
} else {
- AddDefaultPred(BuildMI(AllocMBB, DL, TII.get(ARM::MOVi), ScratchReg0)
- .addImm(AlignedStackSize)).addReg(0);
+ BuildMI(AllocMBB, DL, TII.get(ARM::MOVi), ScratchReg0)
+ .addImm(AlignedStackSize)
+ .add(predOps(ARMCC::AL))
+ .add(condCodeOp());
}
// Pass the second argument to __morestack in Scratch Register #1:
// the amount of stack consumed to save the function arguments.
if (Thumb) {
- AddDefaultPred(
- AddDefaultCC(BuildMI(AllocMBB, DL, TII.get(ARM::tMOVi8), ScratchReg1))
- .addImm(alignToARMConstant(ARMFI->getArgumentStackSize())));
+ BuildMI(AllocMBB, DL, TII.get(ARM::tMOVi8), ScratchReg1)
+ .add(condCodeOp())
+ .addImm(alignToARMConstant(ARMFI->getArgumentStackSize()))
+ .add(predOps(ARMCC::AL));
} else {
- AddDefaultPred(BuildMI(AllocMBB, DL, TII.get(ARM::MOVi), ScratchReg1)
- .addImm(alignToARMConstant(ARMFI->getArgumentStackSize())))
- .addReg(0);
+ BuildMI(AllocMBB, DL, TII.get(ARM::MOVi), ScratchReg1)
+ .addImm(alignToARMConstant(ARMFI->getArgumentStackSize()))
+ .add(predOps(ARMCC::AL))
+ .add(condCodeOp());
}
// push {lr} - Save return address of this function.
if (Thumb) {
- AddDefaultPred(BuildMI(AllocMBB, DL, TII.get(ARM::tPUSH)))
+ BuildMI(AllocMBB, DL, TII.get(ARM::tPUSH))
+ .add(predOps(ARMCC::AL))
.addReg(ARM::LR);
} else {
- AddDefaultPred(BuildMI(AllocMBB, DL, TII.get(ARM::STMDB_UPD))
- .addReg(ARM::SP, RegState::Define)
- .addReg(ARM::SP))
+ BuildMI(AllocMBB, DL, TII.get(ARM::STMDB_UPD))
+ .addReg(ARM::SP, RegState::Define)
+ .addReg(ARM::SP)
+ .add(predOps(ARMCC::AL))
.addReg(ARM::LR);
}
@@ -2220,7 +2316,8 @@ void ARMFrameLowering::adjustForSegmentedStacks(
// Call __morestack().
if (Thumb) {
- AddDefaultPred(BuildMI(AllocMBB, DL, TII.get(ARM::tBL)))
+ BuildMI(AllocMBB, DL, TII.get(ARM::tBL))
+ .add(predOps(ARMCC::AL))
.addExternalSymbol("__morestack");
} else {
BuildMI(AllocMBB, DL, TII.get(ARM::BL))
@@ -2230,22 +2327,26 @@ void ARMFrameLowering::adjustForSegmentedStacks(
// pop {lr} - Restore return address of the original function.
if (Thumb) {
if (ST->isThumb1Only()) {
- AddDefaultPred(BuildMI(AllocMBB, DL, TII.get(ARM::tPOP)))
- .addReg(ScratchReg0);
- AddDefaultPred(BuildMI(AllocMBB, DL, TII.get(ARM::tMOVr), ARM::LR)
- .addReg(ScratchReg0));
+ BuildMI(AllocMBB, DL, TII.get(ARM::tPOP))
+ .add(predOps(ARMCC::AL))
+ .addReg(ScratchReg0);
+ BuildMI(AllocMBB, DL, TII.get(ARM::tMOVr), ARM::LR)
+ .addReg(ScratchReg0)
+ .add(predOps(ARMCC::AL));
} else {
- AddDefaultPred(BuildMI(AllocMBB, DL, TII.get(ARM::t2LDR_POST))
- .addReg(ARM::LR, RegState::Define)
- .addReg(ARM::SP, RegState::Define)
- .addReg(ARM::SP)
- .addImm(4));
+ BuildMI(AllocMBB, DL, TII.get(ARM::t2LDR_POST))
+ .addReg(ARM::LR, RegState::Define)
+ .addReg(ARM::SP, RegState::Define)
+ .addReg(ARM::SP)
+ .addImm(4)
+ .add(predOps(ARMCC::AL));
}
} else {
- AddDefaultPred(BuildMI(AllocMBB, DL, TII.get(ARM::LDMIA_UPD))
- .addReg(ARM::SP, RegState::Define)
- .addReg(ARM::SP))
- .addReg(ARM::LR);
+ BuildMI(AllocMBB, DL, TII.get(ARM::LDMIA_UPD))
+ .addReg(ARM::SP, RegState::Define)
+ .addReg(ARM::SP)
+ .add(predOps(ARMCC::AL))
+ .addReg(ARM::LR);
}
// Restore SR0 and SR1 in case __morestack() was called.
@@ -2253,15 +2354,17 @@ void ARMFrameLowering::adjustForSegmentedStacks(
// scratch registers from here.
// pop {SR0, SR1}
if (Thumb) {
- AddDefaultPred(BuildMI(AllocMBB, DL, TII.get(ARM::tPOP)))
- .addReg(ScratchReg0)
- .addReg(ScratchReg1);
+ BuildMI(AllocMBB, DL, TII.get(ARM::tPOP))
+ .add(predOps(ARMCC::AL))
+ .addReg(ScratchReg0)
+ .addReg(ScratchReg1);
} else {
- AddDefaultPred(BuildMI(AllocMBB, DL, TII.get(ARM::LDMIA_UPD))
- .addReg(ARM::SP, RegState::Define)
- .addReg(ARM::SP))
- .addReg(ScratchReg0)
- .addReg(ScratchReg1);
+ BuildMI(AllocMBB, DL, TII.get(ARM::LDMIA_UPD))
+ .addReg(ARM::SP, RegState::Define)
+ .addReg(ARM::SP)
+ .add(predOps(ARMCC::AL))
+ .addReg(ScratchReg0)
+ .addReg(ScratchReg1);
}
// Update the CFA offset now that we've popped
@@ -2271,20 +2374,22 @@ void ARMFrameLowering::adjustForSegmentedStacks(
// bx lr - Return from this function.
Opcode = Thumb ? ARM::tBX_RET : ARM::BX_RET;
- AddDefaultPred(BuildMI(AllocMBB, DL, TII.get(Opcode)));
+ BuildMI(AllocMBB, DL, TII.get(Opcode)).add(predOps(ARMCC::AL));
// Restore SR0 and SR1 in case __morestack() was not called.
// pop {SR0, SR1}
if (Thumb) {
- AddDefaultPred(BuildMI(PostStackMBB, DL, TII.get(ARM::tPOP)))
- .addReg(ScratchReg0)
- .addReg(ScratchReg1);
+ BuildMI(PostStackMBB, DL, TII.get(ARM::tPOP))
+ .add(predOps(ARMCC::AL))
+ .addReg(ScratchReg0)
+ .addReg(ScratchReg1);
} else {
- AddDefaultPred(BuildMI(PostStackMBB, DL, TII.get(ARM::LDMIA_UPD))
- .addReg(ARM::SP, RegState::Define)
- .addReg(ARM::SP))
- .addReg(ScratchReg0)
- .addReg(ScratchReg1);
+ BuildMI(PostStackMBB, DL, TII.get(ARM::LDMIA_UPD))
+ .addReg(ARM::SP, RegState::Define)
+ .addReg(ARM::SP)
+ .add(predOps(ARMCC::AL))
+ .addReg(ScratchReg0)
+ .addReg(ScratchReg1);
}
// Update the CFA offset now that we've popped
diff --git a/lib/Target/ARM/ARMISelDAGToDAG.cpp b/lib/Target/ARM/ARMISelDAGToDAG.cpp
index c3e9591d5c70..b07b4e1f5cfb 100644
--- a/lib/Target/ARM/ARMISelDAGToDAG.cpp
+++ b/lib/Target/ARM/ARMISelDAGToDAG.cpp
@@ -244,11 +244,8 @@ private:
bool tryInlineAsm(SDNode *N);
- void SelectConcatVector(SDNode *N);
void SelectCMPZ(SDNode *N, bool &SwitchEQNEToPLMI);
- bool trySMLAWSMULW(SDNode *N);
-
void SelectCMP_SWAP(SDNode *N);
/// SelectInlineAsmMemoryOperand - Implement addressing mode selection for
@@ -2559,141 +2556,6 @@ bool ARMDAGToDAGISel::tryABSOp(SDNode *N){
return false;
}
-static bool SearchSignedMulShort(SDValue SignExt, unsigned *Opc, SDValue &Src1,
- bool Accumulate) {
- // For SM*WB, we need to some form of sext.
- // For SM*WT, we need to search for (sra X, 16)
- // Src1 then gets set to X.
- if ((SignExt.getOpcode() == ISD::SIGN_EXTEND ||
- SignExt.getOpcode() == ISD::SIGN_EXTEND_INREG ||
- SignExt.getOpcode() == ISD::AssertSext) &&
- SignExt.getValueType() == MVT::i32) {
-
- *Opc = Accumulate ? ARM::SMLAWB : ARM::SMULWB;
- Src1 = SignExt.getOperand(0);
- return true;
- }
-
- if (SignExt.getOpcode() != ISD::SRA)
- return false;
-
- ConstantSDNode *SRASrc1 = dyn_cast<ConstantSDNode>(SignExt.getOperand(1));
- if (!SRASrc1 || SRASrc1->getZExtValue() != 16)
- return false;
-
- SDValue Op0 = SignExt.getOperand(0);
-
- // The sign extend operand for SM*WB could be generated by a shl and ashr.
- if (Op0.getOpcode() == ISD::SHL) {
- SDValue SHL = Op0;
- ConstantSDNode *SHLSrc1 = dyn_cast<ConstantSDNode>(SHL.getOperand(1));
- if (!SHLSrc1 || SHLSrc1->getZExtValue() != 16)
- return false;
-
- *Opc = Accumulate ? ARM::SMLAWB : ARM::SMULWB;
- Src1 = Op0.getOperand(0);
- return true;
- }
- *Opc = Accumulate ? ARM::SMLAWT : ARM::SMULWT;
- Src1 = SignExt.getOperand(0);
- return true;
-}
-
-static bool SearchSignedMulLong(SDValue OR, unsigned *Opc, SDValue &Src0,
- SDValue &Src1, bool Accumulate) {
- // First we look for:
- // (add (or (srl ?, 16), (shl ?, 16)))
- if (OR.getOpcode() != ISD::OR)
- return false;
-
- SDValue SRL = OR.getOperand(0);
- SDValue SHL = OR.getOperand(1);
-
- if (SRL.getOpcode() != ISD::SRL || SHL.getOpcode() != ISD::SHL) {
- SRL = OR.getOperand(1);
- SHL = OR.getOperand(0);
- if (SRL.getOpcode() != ISD::SRL || SHL.getOpcode() != ISD::SHL)
- return false;
- }
-
- ConstantSDNode *SRLSrc1 = dyn_cast<ConstantSDNode>(SRL.getOperand(1));
- ConstantSDNode *SHLSrc1 = dyn_cast<ConstantSDNode>(SHL.getOperand(1));
- if (!SRLSrc1 || !SHLSrc1 || SRLSrc1->getZExtValue() != 16 ||
- SHLSrc1->getZExtValue() != 16)
- return false;
-
- // The first operands to the shifts need to be the two results from the
- // same smul_lohi node.
- if ((SRL.getOperand(0).getNode() != SHL.getOperand(0).getNode()) ||
- SRL.getOperand(0).getOpcode() != ISD::SMUL_LOHI)
- return false;
-
- SDNode *SMULLOHI = SRL.getOperand(0).getNode();
- if (SRL.getOperand(0) != SDValue(SMULLOHI, 0) ||
- SHL.getOperand(0) != SDValue(SMULLOHI, 1))
- return false;
-
- // Now we have:
- // (add (or (srl (smul_lohi ?, ?), 16), (shl (smul_lohi ?, ?), 16)))
- // For SMLAW[B|T] smul_lohi will take a 32-bit and a 16-bit arguments.
- // For SMLAWB the 16-bit value will signed extended somehow.
- // For SMLAWT only the SRA is required.
-
- // Check both sides of SMUL_LOHI
- if (SearchSignedMulShort(SMULLOHI->getOperand(0), Opc, Src1, Accumulate)) {
- Src0 = SMULLOHI->getOperand(1);
- } else if (SearchSignedMulShort(SMULLOHI->getOperand(1), Opc, Src1,
- Accumulate)) {
- Src0 = SMULLOHI->getOperand(0);
- } else {
- return false;
- }
- return true;
-}
-
-bool ARMDAGToDAGISel::trySMLAWSMULW(SDNode *N) {
- if (!Subtarget->hasV6Ops() ||
- (Subtarget->isThumb() && !Subtarget->hasThumb2()))
- return false;
-
- SDLoc dl(N);
- SDValue Src0 = N->getOperand(0);
- SDValue Src1 = N->getOperand(1);
- SDValue A, B;
- unsigned Opc = 0;
-
- if (N->getOpcode() == ISD::ADD) {
- if (Src0.getOpcode() != ISD::OR && Src1.getOpcode() != ISD::OR)
- return false;
-
- SDValue Acc;
- if (SearchSignedMulLong(Src0, &Opc, A, B, true)) {
- Acc = Src1;
- } else if (SearchSignedMulLong(Src1, &Opc, A, B, true)) {
- Acc = Src0;
- } else {
- return false;
- }
- if (Opc == 0)
- return false;
-
- SDValue Ops[] = { A, B, Acc, getAL(CurDAG, dl),
- CurDAG->getRegister(0, MVT::i32) };
- CurDAG->SelectNodeTo(N, Opc, MVT::i32, MVT::Other, Ops);
- return true;
- } else if (N->getOpcode() == ISD::OR &&
- SearchSignedMulLong(SDValue(N, 0), &Opc, A, B, false)) {
- if (Opc == 0)
- return false;
-
- SDValue Ops[] = { A, B, getAL(CurDAG, dl),
- CurDAG->getRegister(0, MVT::i32)};
- CurDAG->SelectNodeTo(N, Opc, MVT::i32, Ops);
- return true;
- }
- return false;
-}
-
/// We've got special pseudo-instructions for these
void ARMDAGToDAGISel::SelectCMP_SWAP(SDNode *N) {
unsigned Opcode;
@@ -2722,15 +2584,6 @@ void ARMDAGToDAGISel::SelectCMP_SWAP(SDNode *N) {
CurDAG->RemoveDeadNode(N);
}
-void ARMDAGToDAGISel::SelectConcatVector(SDNode *N) {
- // The only time a CONCAT_VECTORS operation can have legal types is when
- // two 64-bit vectors are concatenated to a 128-bit vector.
- EVT VT = N->getValueType(0);
- if (!VT.is128BitVector() || N->getNumOperands() != 2)
- llvm_unreachable("unexpected CONCAT_VECTORS");
- ReplaceNode(N, createDRegPairNode(VT, N->getOperand(0), N->getOperand(1)));
-}
-
static Optional<std::pair<unsigned, unsigned>>
getContiguousRangeOfSetBits(const APInt &A) {
unsigned FirstOne = A.getBitWidth() - A.countLeadingZeros() - 1;
@@ -2822,11 +2675,6 @@ void ARMDAGToDAGISel::Select(SDNode *N) {
switch (N->getOpcode()) {
default: break;
- case ISD::ADD:
- case ISD::OR:
- if (trySMLAWSMULW(N))
- return;
- break;
case ISD::WRITE_REGISTER:
if (tryWriteRegister(N))
return;
@@ -3042,49 +2890,6 @@ void ARMDAGToDAGISel::Select(SDNode *N) {
break;
}
- case ARMISD::VMOVRRD:
- ReplaceNode(N, CurDAG->getMachineNode(ARM::VMOVRRD, dl, MVT::i32, MVT::i32,
- N->getOperand(0), getAL(CurDAG, dl),
- CurDAG->getRegister(0, MVT::i32)));
- return;
- case ISD::UMUL_LOHI: {
- if (Subtarget->isThumb1Only())
- break;
- if (Subtarget->isThumb()) {
- SDValue Ops[] = { N->getOperand(0), N->getOperand(1),
- getAL(CurDAG, dl), CurDAG->getRegister(0, MVT::i32) };
- ReplaceNode(
- N, CurDAG->getMachineNode(ARM::t2UMULL, dl, MVT::i32, MVT::i32, Ops));
- return;
- } else {
- SDValue Ops[] = { N->getOperand(0), N->getOperand(1),
- getAL(CurDAG, dl), CurDAG->getRegister(0, MVT::i32),
- CurDAG->getRegister(0, MVT::i32) };
- ReplaceNode(N, CurDAG->getMachineNode(
- Subtarget->hasV6Ops() ? ARM::UMULL : ARM::UMULLv5, dl,
- MVT::i32, MVT::i32, Ops));
- return;
- }
- }
- case ISD::SMUL_LOHI: {
- if (Subtarget->isThumb1Only())
- break;
- if (Subtarget->isThumb()) {
- SDValue Ops[] = { N->getOperand(0), N->getOperand(1),
- getAL(CurDAG, dl), CurDAG->getRegister(0, MVT::i32) };
- ReplaceNode(
- N, CurDAG->getMachineNode(ARM::t2SMULL, dl, MVT::i32, MVT::i32, Ops));
- return;
- } else {
- SDValue Ops[] = { N->getOperand(0), N->getOperand(1),
- getAL(CurDAG, dl), CurDAG->getRegister(0, MVT::i32),
- CurDAG->getRegister(0, MVT::i32) };
- ReplaceNode(N, CurDAG->getMachineNode(
- Subtarget->hasV6Ops() ? ARM::SMULL : ARM::SMULLv5, dl,
- MVT::i32, MVT::i32, Ops));
- return;
- }
- }
case ARMISD::UMAAL: {
unsigned Opc = Subtarget->isThumb() ? ARM::t2UMAAL : ARM::UMAAL;
SDValue Ops[] = { N->getOperand(0), N->getOperand(1),
@@ -3095,38 +2900,6 @@ void ARMDAGToDAGISel::Select(SDNode *N) {
return;
}
case ARMISD::UMLAL:{
- // UMAAL is similar to UMLAL but it adds two 32-bit values to the
- // 64-bit multiplication result.
- if (Subtarget->hasV6Ops() && Subtarget->hasDSP() &&
- N->getOperand(2).getOpcode() == ARMISD::ADDC &&
- N->getOperand(3).getOpcode() == ARMISD::ADDE) {
-
- SDValue Addc = N->getOperand(2);
- SDValue Adde = N->getOperand(3);
-
- if (Adde.getOperand(2).getNode() == Addc.getNode()) {
-
- ConstantSDNode *Op0 = dyn_cast<ConstantSDNode>(Adde.getOperand(0));
- ConstantSDNode *Op1 = dyn_cast<ConstantSDNode>(Adde.getOperand(1));
-
- if (Op0 && Op1 && Op0->getZExtValue() == 0 && Op1->getZExtValue() == 0)
- {
- // Select UMAAL instead: UMAAL RdLo, RdHi, Rn, Rm
- // RdLo = one operand to be added, lower 32-bits of res
- // RdHi = other operand to be added, upper 32-bits of res
- // Rn = first multiply operand
- // Rm = second multiply operand
- SDValue Ops[] = { N->getOperand(0), N->getOperand(1),
- Addc.getOperand(0), Addc.getOperand(1),
- getAL(CurDAG, dl),
- CurDAG->getRegister(0, MVT::i32) };
- unsigned opc = Subtarget->isThumb() ? ARM::t2UMAAL : ARM::UMAAL;
- CurDAG->SelectNodeTo(N, opc, MVT::i32, MVT::i32, Ops);
- return;
- }
- }
- }
-
if (Subtarget->isThumb()) {
SDValue Ops[] = { N->getOperand(0), N->getOperand(1), N->getOperand(2),
N->getOperand(3), getAL(CurDAG, dl),
@@ -3277,26 +3050,23 @@ void ARMDAGToDAGISel::Select(SDNode *N) {
int64_t Addend = -C->getSExtValue();
SDNode *Add = nullptr;
- // In T2 mode, ADDS can be better than CMN if the immediate fits in a
+ // ADDS can be better than CMN if the immediate fits in a
// 16-bit ADDS, which means either [0,256) for tADDi8 or [0,8) for tADDi3.
// Outside that range we can just use a CMN which is 32-bit but has a
// 12-bit immediate range.
- if (Subtarget->isThumb2() && Addend < 1<<8) {
- SDValue Ops[] = { X, CurDAG->getTargetConstant(Addend, dl, MVT::i32),
- getAL(CurDAG, dl), CurDAG->getRegister(0, MVT::i32),
- CurDAG->getRegister(0, MVT::i32) };
- Add = CurDAG->getMachineNode(ARM::t2ADDri, dl, MVT::i32, Ops);
- } else if (!Subtarget->isThumb2() && Addend < 1<<8) {
- // FIXME: Add T1 tADDi8 code.
- SDValue Ops[] = {CurDAG->getRegister(ARM::CPSR, MVT::i32), X,
- CurDAG->getTargetConstant(Addend, dl, MVT::i32),
- getAL(CurDAG, dl), CurDAG->getRegister(0, MVT::i32)};
- Add = CurDAG->getMachineNode(ARM::tADDi8, dl, MVT::i32, Ops);
- } else if (!Subtarget->isThumb2() && Addend < 1<<3) {
- SDValue Ops[] = {CurDAG->getRegister(ARM::CPSR, MVT::i32), X,
- CurDAG->getTargetConstant(Addend, dl, MVT::i32),
- getAL(CurDAG, dl), CurDAG->getRegister(0, MVT::i32)};
- Add = CurDAG->getMachineNode(ARM::tADDi3, dl, MVT::i32, Ops);
+ if (Addend < 1<<8) {
+ if (Subtarget->isThumb2()) {
+ SDValue Ops[] = { X, CurDAG->getTargetConstant(Addend, dl, MVT::i32),
+ getAL(CurDAG, dl), CurDAG->getRegister(0, MVT::i32),
+ CurDAG->getRegister(0, MVT::i32) };
+ Add = CurDAG->getMachineNode(ARM::t2ADDri, dl, MVT::i32, Ops);
+ } else {
+ unsigned Opc = (Addend < 1<<3) ? ARM::tADDi3 : ARM::tADDi8;
+ SDValue Ops[] = {CurDAG->getRegister(ARM::CPSR, MVT::i32), X,
+ CurDAG->getTargetConstant(Addend, dl, MVT::i32),
+ getAL(CurDAG, dl), CurDAG->getRegister(0, MVT::i32)};
+ Add = CurDAG->getMachineNode(Opc, dl, MVT::i32, Ops);
+ }
}
if (Add) {
SDValue Ops2[] = {SDValue(Add, 0), CurDAG->getConstant(0, dl, MVT::i32)};
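
(A concrete instance of the restructured fold, assuming the usual Thumb immediate ranges, 3 bits for tADDi3 and 8 bits for tADDi8: a compare against a negative constant has no small Thumb encoding, so it is rewritten as a flag-setting add of the negated constant, whose result is then compared against zero:)

    //   cmp  r0, #-5         ; no Thumb1 encoding for a negative immediate
    // with Addend = 5 < 1<<3 becomes the tADDi3 form:
    //   adds r1, r0, #5      ; produces the same NZCV flags as the cmp
    // and with, say, Addend = 100 (under 1<<8 but not 1<<3), the tADDi8 form:
    //   adds r0, #100
    // Outside [0,256) the selection falls back to CMN, whose 12-bit
    // immediate range the comment above mentions.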
@@ -4013,10 +3783,6 @@ void ARMDAGToDAGISel::Select(SDNode *N) {
return;
}
- case ISD::CONCAT_VECTORS:
- SelectConcatVector(N);
- return;
-
case ISD::ATOMIC_CMP_SWAP:
SelectCMP_SWAP(N);
return;
@@ -4123,11 +3889,10 @@ static inline int getMClassRegisterSYSmValueMask(StringRef RegString) {
// The flags here are common to those allowed for apsr in the A class cores and
// those allowed for the special registers in the M class cores. Returns a
// value representing which flags were present, -1 if invalid.
-static inline int getMClassFlagsMask(StringRef Flags, bool hasDSP) {
- if (Flags.empty())
- return 0x2 | (int)hasDSP;
-
+static inline int getMClassFlagsMask(StringRef Flags) {
return StringSwitch<int>(Flags)
+ .Case("", 0x2) // no flags means nzcvq for psr registers, and 0x2 is
+ // correct when flags are not permitted
.Case("g", 0x1)
.Case("nzcvq", 0x2)
.Case("nzcvqg", 0x3)
@@ -4170,7 +3935,7 @@ static int getMClassRegisterMask(StringRef Reg, StringRef Flags, bool IsRead,
}
// We know we are now handling a write, so we need to get the mask for the flags.
- int Mask = getMClassFlagsMask(Flags, Subtarget->hasDSP());
+ int Mask = getMClassFlagsMask(Flags);
// Only apsr, iapsr, eapsr, xpsr can have flags. The other register values
// shouldn't have flags present.
@@ -4185,10 +3950,7 @@ static int getMClassRegisterMask(StringRef Reg, StringRef Flags, bool IsRead,
// The register was valid so need to put the mask in the correct place
// (the flags need to be in bits 11-10) and combine with the SYSmvalue to
// construct the operand for the instruction node.
- if (SYSmvalue < 0x4)
- return SYSmvalue | Mask << 10;
-
- return SYSmvalue;
+ return SYSmvalue | Mask << 10;
}
static int getARClassRegisterMask(StringRef Reg, StringRef Flags) {
@@ -4201,7 +3963,7 @@ static int getARClassRegisterMask(StringRef Reg, StringRef Flags) {
// The flags permitted for apsr are the same flags that are allowed in
// M class registers. We get the flag value and then shift the flags into
// the correct place to combine with the mask.
- Mask = getMClassFlagsMask(Flags, true);
+ Mask = getMClassFlagsMask(Flags);
if (Mask == -1)
return -1;
return Mask << 2;
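
(Both mask computations above lean on the MSR operand layouts: for M-profile special registers the write mask sits in bits <11:10> of the combined SYSm operand, while the A-profile apsr path shifts the same two flag bits into its mask field via Mask << 2. Schematically, as a summary rather than a quote from the ARM ARM:)

    //   flags      mask   M-profile operand = SYSmvalue | (mask << 10)
    //   "g"        0b01   write the GE bits only
    //   "nzcvq"    0b10   write N, Z, C, V, Q only
    //   "nzcvqg"   0b11   write both fields
    // The "" case added above also yields 0b10, which the patch treats as
    // correct both as the nzcvq default for psr registers and as a harmless
    // value where flags are not permitted.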
diff --git a/lib/Target/ARM/ARMISelLowering.cpp b/lib/Target/ARM/ARMISelLowering.cpp
index 0f84a2359160..e697c8ca5339 100644
--- a/lib/Target/ARM/ARMISelLowering.cpp
+++ b/lib/Target/ARM/ARMISelLowering.cpp
@@ -12,47 +12,101 @@
//
//===----------------------------------------------------------------------===//
-#include "ARMISelLowering.h"
+#include "ARMBaseInstrInfo.h"
+#include "ARMBaseRegisterInfo.h"
#include "ARMCallingConv.h"
#include "ARMConstantPoolValue.h"
+#include "ARMISelLowering.h"
#include "ARMMachineFunctionInfo.h"
#include "ARMPerfectShuffle.h"
+#include "ARMRegisterInfo.h"
+#include "ARMSelectionDAGInfo.h"
#include "ARMSubtarget.h"
-#include "ARMTargetMachine.h"
-#include "ARMTargetObjectFile.h"
#include "MCTargetDesc/ARMAddressingModes.h"
+#include "MCTargetDesc/ARMBaseInfo.h"
+#include "llvm/ADT/APFloat.h"
+#include "llvm/ADT/APInt.h"
+#include "llvm/ADT/ArrayRef.h"
+#include "llvm/ADT/BitVector.h"
+#include "llvm/ADT/DenseMap.h"
+#include "llvm/ADT/SmallPtrSet.h"
+#include "llvm/ADT/SmallVector.h"
#include "llvm/ADT/Statistic.h"
+#include "llvm/ADT/STLExtras.h"
#include "llvm/ADT/StringExtras.h"
#include "llvm/ADT/StringSwitch.h"
+#include "llvm/ADT/StringRef.h"
+#include "llvm/ADT/Triple.h"
+#include "llvm/ADT/Twine.h"
+#include "llvm/Analysis/VectorUtils.h"
#include "llvm/CodeGen/CallingConvLower.h"
+#include "llvm/CodeGen/ISDOpcodes.h"
#include "llvm/CodeGen/IntrinsicLowering.h"
#include "llvm/CodeGen/MachineBasicBlock.h"
+#include "llvm/CodeGen/MachineConstantPool.h"
#include "llvm/CodeGen/MachineFrameInfo.h"
#include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/CodeGen/MachineInstr.h"
#include "llvm/CodeGen/MachineInstrBuilder.h"
#include "llvm/CodeGen/MachineJumpTableInfo.h"
-#include "llvm/CodeGen/MachineModuleInfo.h"
+#include "llvm/CodeGen/MachineMemOperand.h"
+#include "llvm/CodeGen/MachineOperand.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/CodeGen/MachineValueType.h"
+#include "llvm/CodeGen/RuntimeLibcalls.h"
#include "llvm/CodeGen/SelectionDAG.h"
+#include "llvm/CodeGen/SelectionDAGNodes.h"
+#include "llvm/CodeGen/ValueTypes.h"
+#include "llvm/IR/Attributes.h"
#include "llvm/IR/CallingConv.h"
+#include "llvm/IR/Constant.h"
#include "llvm/IR/Constants.h"
#include "llvm/IR/Function.h"
-#include "llvm/IR/DebugInfoMetadata.h"
+#include "llvm/IR/DataLayout.h"
+#include "llvm/IR/DebugLoc.h"
+#include "llvm/IR/DerivedTypes.h"
+#include "llvm/IR/Function.h"
+#include "llvm/IR/GlobalAlias.h"
#include "llvm/IR/GlobalValue.h"
+#include "llvm/IR/GlobalVariable.h"
#include "llvm/IR/IRBuilder.h"
+#include "llvm/IR/InlineAsm.h"
#include "llvm/IR/Instruction.h"
#include "llvm/IR/Instructions.h"
#include "llvm/IR/IntrinsicInst.h"
#include "llvm/IR/Intrinsics.h"
+#include "llvm/IR/Module.h"
#include "llvm/IR/Type.h"
-#include "llvm/MC/MCSectionMachO.h"
+#include "llvm/IR/User.h"
+#include "llvm/IR/Value.h"
+#include "llvm/MC/MCInstrDesc.h"
+#include "llvm/MC/MCInstrItineraries.h"
+#include "llvm/MC/MCRegisterInfo.h"
+#include "llvm/MC/MCSchedule.h"
+#include "llvm/Support/AtomicOrdering.h"
+#include "llvm/Support/BranchProbability.h"
+#include "llvm/Support/Casting.h"
+#include "llvm/Support/CodeGen.h"
#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/Compiler.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/ErrorHandling.h"
#include "llvm/Support/MathExtras.h"
#include "llvm/Support/raw_ostream.h"
+#include "llvm/Target/TargetInstrInfo.h"
+#include "llvm/Target/TargetMachine.h"
#include "llvm/Target/TargetOptions.h"
+#include <algorithm>
+#include <cassert>
+#include <cstdint>
+#include <cstdlib>
+#include <iterator>
+#include <limits>
+#include <tuple>
+#include <string>
#include <utility>
+#include <vector>
+
using namespace llvm;
#define DEBUG_TYPE "arm-isel"
@@ -82,21 +136,6 @@ static cl::opt<unsigned> ConstpoolPromotionMaxTotal(
cl::desc("Maximum size of ALL constants to promote into a constant pool"),
cl::init(128));
-namespace {
- class ARMCCState : public CCState {
- public:
- ARMCCState(CallingConv::ID CC, bool isVarArg, MachineFunction &MF,
- SmallVectorImpl<CCValAssign> &locs, LLVMContext &C,
- ParmContext PC)
- : CCState(CC, isVarArg, MF, locs, C) {
- assert(((PC == Call) || (PC == Prologue)) &&
- "ARMCCState users must specify whether their context is call"
- "or prologue generation.");
- CallOrPrologue = PC;
- }
- };
-}
-
// The APCS parameter registers.
static const MCPhysReg GPRArgRegs[] = {
ARM::R0, ARM::R1, ARM::R2, ARM::R3
@@ -685,10 +724,6 @@ ARMTargetLowering::ARMTargetLowering(const TargetMachine &TM,
}
}
- // ARM and Thumb2 support UMLAL/SMLAL.
- if (!Subtarget->isThumb1Only())
- setTargetDAGCombine(ISD::ADDC);
-
if (Subtarget->isFPOnlySP()) {
// When targeting a floating-point unit with only single-precision
// operations, f64 is legal for the few double-precision instructions which
@@ -787,13 +822,10 @@ ARMTargetLowering::ARMTargetLowering(const TargetMachine &TM,
setOperationAction(ISD::SRL, MVT::i64, Custom);
setOperationAction(ISD::SRA, MVT::i64, Custom);
- if (!Subtarget->isThumb1Only()) {
- // FIXME: We should do this for Thumb1 as well.
- setOperationAction(ISD::ADDC, MVT::i32, Custom);
- setOperationAction(ISD::ADDE, MVT::i32, Custom);
- setOperationAction(ISD::SUBC, MVT::i32, Custom);
- setOperationAction(ISD::SUBE, MVT::i32, Custom);
- }
+ setOperationAction(ISD::ADDC, MVT::i32, Custom);
+ setOperationAction(ISD::ADDE, MVT::i32, Custom);
+ setOperationAction(ISD::SUBC, MVT::i32, Custom);
+ setOperationAction(ISD::SUBE, MVT::i32, Custom);
if (!Subtarget->isThumb1Only() && Subtarget->hasV6T2Ops())
setOperationAction(ISD::BITREVERSE, MVT::i32, Legal);
@@ -1305,6 +1337,12 @@ const char *ARMTargetLowering::getTargetNodeName(unsigned Opcode) const {
case ARMISD::UMAAL: return "ARMISD::UMAAL";
case ARMISD::UMLAL: return "ARMISD::UMLAL";
case ARMISD::SMLAL: return "ARMISD::SMLAL";
+ case ARMISD::SMLALBB: return "ARMISD::SMLALBB";
+ case ARMISD::SMLALBT: return "ARMISD::SMLALBT";
+ case ARMISD::SMLALTB: return "ARMISD::SMLALTB";
+ case ARMISD::SMLALTT: return "ARMISD::SMLALTT";
+ case ARMISD::SMULWB: return "ARMISD::SMULWB";
+ case ARMISD::SMULWT: return "ARMISD::SMULWT";
case ARMISD::BUILD_VECTOR: return "ARMISD::BUILD_VECTOR";
case ARMISD::BFI: return "ARMISD::BFI";
case ARMISD::VORRIMM: return "ARMISD::VORRIMM";
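
(For reference, the rough semantics of the DSP instructions these new nodes select to, where B and T pick the bottom and top halfword of a 32-bit operand; a paraphrase, not the ARM ARM wording:)

    //   SMLALBB: acc64 += sext(a[15:0])  * sext(b[15:0])
    //   SMLALBT: acc64 += sext(a[15:0])  * sext(b[31:16])
    //   SMLALTB: acc64 += sext(a[31:16]) * sext(b[15:0])
    //   SMLALTT: acc64 += sext(a[31:16]) * sext(b[31:16])
    //   SMULWB:  res32 = (sext(a) * sext(b[15:0]))  >> 16  ; top 32 bits of
    //   SMULWT:  res32 = (sext(a) * sext(b[31:16])) >> 16  ; a 48-bit product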
@@ -1414,6 +1452,40 @@ Sched::Preference ARMTargetLowering::getSchedulingPreference(SDNode *N) const {
// Lowering Code
//===----------------------------------------------------------------------===//
+static bool isSRL16(const SDValue &Op) {
+ if (Op.getOpcode() != ISD::SRL)
+ return false;
+ if (auto Const = dyn_cast<ConstantSDNode>(Op.getOperand(1)))
+ return Const->getZExtValue() == 16;
+ return false;
+}
+
+static bool isSRA16(const SDValue &Op) {
+ if (Op.getOpcode() != ISD::SRA)
+ return false;
+ if (auto Const = dyn_cast<ConstantSDNode>(Op.getOperand(1)))
+ return Const->getZExtValue() == 16;
+ return false;
+}
+
+static bool isSHL16(const SDValue &Op) {
+ if (Op.getOpcode() != ISD::SHL)
+ return false;
+ if (auto Const = dyn_cast<ConstantSDNode>(Op.getOperand(1)))
+ return Const->getZExtValue() == 16;
+ return false;
+}
+
+// Check for a signed 16-bit value. We special-case SRA because it makes
+// things simpler when we are also looking for SRAs that aren't
+// sign-extending a smaller value. Without the check, we'd need to take
+// extra care with the checking order for some operations.
+static bool isS16(const SDValue &Op, SelectionDAG &DAG) {
+ if (isSRA16(Op))
+ return isSHL16(Op.getOperand(0));
+ return DAG.ComputeNumSignBits(Op) == 17;
+}
+
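
(Why exactly 17 sign bits marks a 16-bit value: a value sign-extended from 16 bits into an i32 carries the sign bit in position 15 plus 16 copies of it above, and ComputeNumSignBits reports how many leading bits are known to equal the sign bit. A self-contained illustration of the counting, with numSignBits as a hypothetical stand-in for the DAG query:)

    #include <cstdint>

    static unsigned numSignBits(uint32_t V) {
      unsigned Sign = V >> 31, N = 1;   // the sign bit itself always counts
      while (N < 32 && ((V >> (31 - N)) & 1) == Sign)
        ++N;
      return N;
    }
    // numSignBits(0xFFFF8000) == 17   // -32768: sign-extends from 16 bits
    // numSignBits(0x00008000) == 16   // +32768: needs the full 32 bits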
/// IntCCToARMCC - Convert a DAG integer condition code to an ARM CC
static ARMCC::CondCodes IntCCToARMCC(ISD::CondCode CC) {
switch (CC) {
@@ -1433,22 +1505,34 @@ static ARMCC::CondCodes IntCCToARMCC(ISD::CondCode CC) {
/// FPCCToARMCC - Convert a DAG fp condition code to an ARM CC.
static void FPCCToARMCC(ISD::CondCode CC, ARMCC::CondCodes &CondCode,
- ARMCC::CondCodes &CondCode2) {
+ ARMCC::CondCodes &CondCode2, bool &InvalidOnQNaN) {
CondCode2 = ARMCC::AL;
+ InvalidOnQNaN = true;
switch (CC) {
default: llvm_unreachable("Unknown FP condition!");
case ISD::SETEQ:
- case ISD::SETOEQ: CondCode = ARMCC::EQ; break;
+ case ISD::SETOEQ:
+ CondCode = ARMCC::EQ;
+ InvalidOnQNaN = false;
+ break;
case ISD::SETGT:
case ISD::SETOGT: CondCode = ARMCC::GT; break;
case ISD::SETGE:
case ISD::SETOGE: CondCode = ARMCC::GE; break;
case ISD::SETOLT: CondCode = ARMCC::MI; break;
case ISD::SETOLE: CondCode = ARMCC::LS; break;
- case ISD::SETONE: CondCode = ARMCC::MI; CondCode2 = ARMCC::GT; break;
+ case ISD::SETONE:
+ CondCode = ARMCC::MI;
+ CondCode2 = ARMCC::GT;
+ InvalidOnQNaN = false;
+ break;
case ISD::SETO: CondCode = ARMCC::VC; break;
case ISD::SETUO: CondCode = ARMCC::VS; break;
- case ISD::SETUEQ: CondCode = ARMCC::EQ; CondCode2 = ARMCC::VS; break;
+ case ISD::SETUEQ:
+ CondCode = ARMCC::EQ;
+ CondCode2 = ARMCC::VS;
+ InvalidOnQNaN = false;
+ break;
case ISD::SETUGT: CondCode = ARMCC::HI; break;
case ISD::SETUGE: CondCode = ARMCC::PL; break;
case ISD::SETLT:
@@ -1456,7 +1540,10 @@ static void FPCCToARMCC(ISD::CondCode CC, ARMCC::CondCodes &CondCode,
case ISD::SETLE:
case ISD::SETULE: CondCode = ARMCC::LE; break;
case ISD::SETNE:
- case ISD::SETUNE: CondCode = ARMCC::NE; break;
+ case ISD::SETUNE:
+ CondCode = ARMCC::NE;
+ InvalidOnQNaN = false;
+ break;
}
}
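
(The new InvalidOnQNaN flag records whether the comparison is an IEEE-754 signaling predicate: ordering comparisons raise Invalid Operation on any NaN, while equality-style ones must stay quiet on a quiet NaN. Presumably the instruction patterns use it to pick VCMPE versus VCMP. A host-side sketch of the distinction, assuming the compiler honors FENV semantics at the chosen optimization level:)

    #include <cfenv>
    #include <cmath>
    #include <cstdio>

    int main() {
      volatile double QNaN = std::nan(""), One = 1.0; // volatile blocks folding
      volatile bool R;
      std::feclearexcept(FE_INVALID);
      R = (QNaN == One);  // equality: a quiet predicate, must not raise Invalid
      std::printf("== raises Invalid: %d\n", std::fetestexcept(FE_INVALID) != 0);
      std::feclearexcept(FE_INVALID);
      R = (QNaN < One);   // ordering: signals Invalid on any NaN
      std::printf("<  raises Invalid: %d\n", std::fetestexcept(FE_INVALID) != 0);
      return 0;
    }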
@@ -1549,8 +1636,8 @@ SDValue ARMTargetLowering::LowerCallResult(
// Assign locations to each value returned by this call.
SmallVector<CCValAssign, 16> RVLocs;
- ARMCCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(), RVLocs,
- *DAG.getContext(), Call);
+ CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(), RVLocs,
+ *DAG.getContext());
CCInfo.AnalyzeCallResult(Ins, CCAssignFnForReturn(CallConv, isVarArg));
// Copy all of the result registers out of their specified physreg.
@@ -1710,8 +1797,8 @@ ARMTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
// Analyze operands of the call, assigning locations to each operand.
SmallVector<CCValAssign, 16> ArgLocs;
- ARMCCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(), ArgLocs,
- *DAG.getContext(), Call);
+ CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(), ArgLocs,
+ *DAG.getContext());
CCInfo.AnalyzeCallOperands(Outs, CCAssignFnForCall(CallConv, isVarArg));
// Get a count of how many bytes are to be pushed on the stack.
@@ -2088,10 +2175,6 @@ ARMTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
/// this.
void ARMTargetLowering::HandleByVal(CCState *State, unsigned &Size,
unsigned Align) const {
- assert((State->getCallOrPrologue() == Prologue ||
- State->getCallOrPrologue() == Call) &&
- "unhandled ParmContext");
-
// Byval (as with any stack) slots are always at least 4 byte aligned.
Align = std::max(Align, 4U);
@@ -2148,7 +2231,7 @@ bool MatchingStackOffset(SDValue Arg, unsigned Offset, ISD::ArgFlagsTy Flags,
MachineFrameInfo &MFI, const MachineRegisterInfo *MRI,
const TargetInstrInfo *TII) {
unsigned Bytes = Arg.getValueSizeInBits() / 8;
- int FI = INT_MAX;
+ int FI = std::numeric_limits<int>::max();
if (Arg.getOpcode() == ISD::CopyFromReg) {
unsigned VR = cast<RegisterSDNode>(Arg.getOperand(1))->getReg();
if (!TargetRegisterInfo::isVirtualRegister(VR))
@@ -2178,7 +2261,7 @@ bool MatchingStackOffset(SDValue Arg, unsigned Offset, ISD::ArgFlagsTy Flags,
} else
return false;
- assert(FI != INT_MAX);
+ assert(FI != std::numeric_limits<int>::max());
if (!MFI.isFixedObjectIndex(FI))
return false;
return Offset == MFI.getObjectOffset(FI) && Bytes == MFI.getObjectSize(FI);
@@ -2260,7 +2343,7 @@ ARMTargetLowering::IsEligibleForTailCallOptimization(SDValue Callee,
// Check if stack adjustment is needed. For now, do not do this if any
// argument is passed on the stack.
SmallVector<CCValAssign, 16> ArgLocs;
- ARMCCState CCInfo(CalleeCC, isVarArg, MF, ArgLocs, C, Call);
+ CCState CCInfo(CalleeCC, isVarArg, MF, ArgLocs, C);
CCInfo.AnalyzeCallOperands(Outs, CCAssignFnForCall(CalleeCC, isVarArg));
if (CCInfo.getNextStackOffset()) {
// Check if the arguments are already laid out in the right way as
@@ -2362,8 +2445,8 @@ ARMTargetLowering::LowerReturn(SDValue Chain, CallingConv::ID CallConv,
SmallVector<CCValAssign, 16> RVLocs;
// CCState - Info about the registers and stack slots.
- ARMCCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(), RVLocs,
- *DAG.getContext(), Call);
+ CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(), RVLocs,
+ *DAG.getContext());
// Analyze outgoing return values.
CCInfo.AnalyzeReturn(Outs, CCAssignFnForReturn(CallConv, isVarArg));
@@ -2790,9 +2873,9 @@ ARMTargetLowering::LowerToTLSGeneralDynamicModel(GlobalAddressSDNode *GA,
// FIXME: is there useful debug info available here?
TargetLowering::CallLoweringInfo CLI(DAG);
- CLI.setDebugLoc(dl).setChain(Chain)
- .setCallee(CallingConv::C, Type::getInt32Ty(*DAG.getContext()),
- DAG.getExternalSymbol("__tls_get_addr", PtrVT), std::move(Args));
+ CLI.setDebugLoc(dl).setChain(Chain).setLibCallee(
+ CallingConv::C, Type::getInt32Ty(*DAG.getContext()),
+ DAG.getExternalSymbol("__tls_get_addr", PtrVT), std::move(Args));
std::pair<SDValue, SDValue> CallResult = LowerCallTo(CLI);
return CallResult.first;
@@ -2935,7 +3018,7 @@ static bool isSimpleType(Type *T) {
}
static SDValue promoteToConstantPool(const GlobalValue *GV, SelectionDAG &DAG,
- EVT PtrVT, SDLoc dl) {
+ EVT PtrVT, const SDLoc &dl) {
// If we're creating a pool entry for a constant global with unnamed address,
// and the global is small enough, we can emit it inline into the constant pool
// to save ourselves an indirection.
@@ -2980,7 +3063,8 @@ static SDValue promoteToConstantPool(const GlobalValue *GV, SelectionDAG &DAG,
unsigned RequiredPadding = 4 - (Size % 4);
bool PaddingPossible =
RequiredPadding == 4 || (CDAInit && CDAInit->isString());
- if (!PaddingPossible || Align > 4 || Size > ConstpoolPromotionMaxSize)
+ if (!PaddingPossible || Align > 4 || Size > ConstpoolPromotionMaxSize ||
+ Size == 0)
return SDValue();
unsigned PaddedSize = Size + ((RequiredPadding == 4) ? 0 : RequiredPadding);
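
(The new Size == 0 bail-out presumably covers zero-sized globals, for which the padding arithmetic above degenerates:)

    //   Size = 0  =>  RequiredPadding = 4 - (0 % 4) = 4
    //             =>  PaddingPossible = true, PaddedSize = 0
    // i.e. a zero-byte constant-pool entry, which is now rejected up front.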
@@ -3080,15 +3164,22 @@ SDValue ARMTargetLowering::LowerGlobalAddressELF(SDValue Op,
return Result;
} else if (Subtarget->isRWPI() && !IsRO) {
// SB-relative.
- ARMConstantPoolValue *CPV =
- ARMConstantPoolConstant::Create(GV, ARMCP::SBREL);
- SDValue CPAddr = DAG.getTargetConstantPool(CPV, PtrVT, 4);
- CPAddr = DAG.getNode(ARMISD::Wrapper, dl, MVT::i32, CPAddr);
- SDValue G = DAG.getLoad(
- PtrVT, dl, DAG.getEntryNode(), CPAddr,
- MachinePointerInfo::getConstantPool(DAG.getMachineFunction()));
+ SDValue RelAddr;
+ if (Subtarget->useMovt(DAG.getMachineFunction())) {
+ ++NumMovwMovt;
+ SDValue G = DAG.getTargetGlobalAddress(GV, dl, PtrVT, 0, ARMII::MO_SBREL);
+ RelAddr = DAG.getNode(ARMISD::Wrapper, dl, PtrVT, G);
+ } else { // use literal pool for address constant
+ ARMConstantPoolValue *CPV =
+ ARMConstantPoolConstant::Create(GV, ARMCP::SBREL);
+ SDValue CPAddr = DAG.getTargetConstantPool(CPV, PtrVT, 4);
+ CPAddr = DAG.getNode(ARMISD::Wrapper, dl, MVT::i32, CPAddr);
+ RelAddr = DAG.getLoad(
+ PtrVT, dl, DAG.getEntryNode(), CPAddr,
+ MachinePointerInfo::getConstantPool(DAG.getMachineFunction()));
+ }
SDValue SB = DAG.getCopyFromReg(DAG.getEntryNode(), dl, ARM::R9, PtrVT);
- SDValue Result = DAG.getNode(ISD::ADD, dl, PtrVT, SB, G);
+ SDValue Result = DAG.getNode(ISD::ADD, dl, PtrVT, SB, RelAddr);
return Result;
}
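
(Background for the RWPI branch: under read-write position independence, mutable globals are addressed relative to the static base register SB, which is R9 here. When MOVT is available, the offset can be materialized with a movw/movt pair carrying SB-relative relocations instead of being loaded from a literal pool; schematically, with the relocation spelling approximate:)

    // Before (literal pool):          After (movw/movt, MO_SBREL):
    //   ldr  r0, .LCPI0_0               movw r0, :lower16:sym(sbrel)
    //   add  r0, r9, r0                 movt r0, :upper16:sym(sbrel)
    //                                   add  r0, r9, r0
    // This trades a constant-pool data load for one extra move-immediate and
    // keeps the SB-relative offset visible to the linker.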
@@ -3462,8 +3553,8 @@ SDValue ARMTargetLowering::LowerFormalArguments(
// Assign locations to all of the incoming arguments.
SmallVector<CCValAssign, 16> ArgLocs;
- ARMCCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(), ArgLocs,
- *DAG.getContext(), Prologue);
+ CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(), ArgLocs,
+ *DAG.getContext());
CCInfo.AnalyzeFormalArguments(Ins, CCAssignFnForCall(CallConv, isVarArg));
SmallVector<SDValue, 16> ArgValues;
@@ -3595,7 +3686,6 @@ SDValue ARMTargetLowering::LowerFormalArguments(
InVals.push_back(ArgValue);
} else { // VA.isRegLoc()
-
// sanity check
assert(VA.isMemLoc());
assert(VA.getValVT() != MVT::i64 && "i64 should already be lowered");
@@ -3734,13 +3824,15 @@ SDValue ARMTargetLowering::getARMCmp(SDValue LHS, SDValue RHS, ISD::CondCode CC,
/// Returns an appropriate VFP CMP (fcmp{s|d}+fmstat) for the given operands.
SDValue ARMTargetLowering::getVFPCmp(SDValue LHS, SDValue RHS,
- SelectionDAG &DAG, const SDLoc &dl) const {
+ SelectionDAG &DAG, const SDLoc &dl,
+ bool InvalidOnQNaN) const {
assert(!Subtarget->isFPOnlySP() || RHS.getValueType() != MVT::f64);
SDValue Cmp;
+ SDValue C = DAG.getConstant(InvalidOnQNaN, dl, MVT::i32);
if (!isFloatingPointZero(RHS))
- Cmp = DAG.getNode(ARMISD::CMPFP, dl, MVT::Glue, LHS, RHS);
+ Cmp = DAG.getNode(ARMISD::CMPFP, dl, MVT::Glue, LHS, RHS, C);
else
- Cmp = DAG.getNode(ARMISD::CMPFPw0, dl, MVT::Glue, LHS);
+ Cmp = DAG.getNode(ARMISD::CMPFPw0, dl, MVT::Glue, LHS, C);
return DAG.getNode(ARMISD::FMSTAT, dl, MVT::Glue, Cmp);
}
@@ -3757,10 +3849,12 @@ ARMTargetLowering::duplicateCmp(SDValue Cmp, SelectionDAG &DAG) const {
Cmp = Cmp.getOperand(0);
Opc = Cmp.getOpcode();
if (Opc == ARMISD::CMPFP)
- Cmp = DAG.getNode(Opc, DL, MVT::Glue, Cmp.getOperand(0),Cmp.getOperand(1));
+ Cmp = DAG.getNode(Opc, DL, MVT::Glue, Cmp.getOperand(0),
+ Cmp.getOperand(1), Cmp.getOperand(2));
else {
assert(Opc == ARMISD::CMPFPw0 && "unexpected operand of FMSTAT");
- Cmp = DAG.getNode(Opc, DL, MVT::Glue, Cmp.getOperand(0));
+ Cmp = DAG.getNode(Opc, DL, MVT::Glue, Cmp.getOperand(0),
+ Cmp.getOperand(1));
}
return DAG.getNode(ARMISD::FMSTAT, DL, MVT::Glue, Cmp);
}
@@ -3808,7 +3902,6 @@ ARMTargetLowering::getARMXALUOOp(SDValue Op, SelectionDAG &DAG,
return std::make_pair(Value, OverflowCmp);
}
-
SDValue
ARMTargetLowering::LowerXALUO(SDValue Op, SelectionDAG &DAG) const {
// Let legalize expand this if it isn't a legal type yet.
@@ -3832,7 +3925,6 @@ ARMTargetLowering::LowerXALUO(SDValue Op, SelectionDAG &DAG) const {
return DAG.getNode(ISD::MERGE_VALUES, dl, VTs, Value, Overflow);
}
-
SDValue ARMTargetLowering::LowerSELECT(SDValue Op, SelectionDAG &DAG) const {
SDValue Cond = Op.getOperand(0);
SDValue SelectTrue = Op.getOperand(1);
@@ -4025,7 +4117,6 @@ static bool isUpperSaturate(const SDValue LHS, const SDValue RHS,
// Additionally, the variable is returned in parameter V and the constant in K.
static bool isSaturatingConditional(const SDValue &Op, SDValue &V,
uint64_t &K) {
-
SDValue LHS1 = Op.getOperand(0);
SDValue RHS1 = Op.getOperand(1);
SDValue TrueVal1 = Op.getOperand(2);
@@ -4046,10 +4137,10 @@ static bool isSaturatingConditional(const SDValue &Op, SDValue &V,
// in each conditional
SDValue *K1 = isa<ConstantSDNode>(LHS1) ? &LHS1 : isa<ConstantSDNode>(RHS1)
? &RHS1
- : NULL;
+ : nullptr;
SDValue *K2 = isa<ConstantSDNode>(LHS2) ? &LHS2 : isa<ConstantSDNode>(RHS2)
? &RHS2
- : NULL;
+ : nullptr;
SDValue K2Tmp = isa<ConstantSDNode>(TrueVal2) ? TrueVal2 : FalseVal2;
SDValue V1Tmp = (K1 && *K1 == LHS1) ? RHS1 : LHS1;
SDValue V2Tmp = (K2 && *K2 == LHS2) ? RHS2 : LHS2;
@@ -4073,13 +4164,15 @@ static bool isSaturatingConditional(const SDValue &Op, SDValue &V,
const SDValue *LowerCheckOp =
isLowerSaturate(LHS1, RHS1, TrueVal1, FalseVal1, CC1, *K1)
? &Op
- : isLowerSaturate(LHS2, RHS2, TrueVal2, FalseVal2, CC2, *K2) ? &Op2
- : NULL;
+ : isLowerSaturate(LHS2, RHS2, TrueVal2, FalseVal2, CC2, *K2)
+ ? &Op2
+ : nullptr;
const SDValue *UpperCheckOp =
isUpperSaturate(LHS1, RHS1, TrueVal1, FalseVal1, CC1, *K1)
? &Op
- : isUpperSaturate(LHS2, RHS2, TrueVal2, FalseVal2, CC2, *K2) ? &Op2
- : NULL;
+ : isUpperSaturate(LHS2, RHS2, TrueVal2, FalseVal2, CC2, *K2)
+ ? &Op2
+ : nullptr;
if (!UpperCheckOp || !LowerCheckOp || LowerCheckOp == UpperCheckOp)
return false;
@@ -4104,7 +4197,6 @@ static bool isSaturatingConditional(const SDValue &Op, SDValue &V,
}
SDValue ARMTargetLowering::LowerSELECT_CC(SDValue Op, SelectionDAG &DAG) const {
-
EVT VT = Op.getValueType();
SDLoc dl(Op);
@@ -4162,7 +4254,8 @@ SDValue ARMTargetLowering::LowerSELECT_CC(SDValue Op, SelectionDAG &DAG) const {
}
ARMCC::CondCodes CondCode, CondCode2;
- FPCCToARMCC(CC, CondCode, CondCode2);
+ bool InvalidOnQNaN;
+ FPCCToARMCC(CC, CondCode, CondCode2, InvalidOnQNaN);
// Try to generate VMAXNM/VMINNM on ARMv8.
if (Subtarget->hasFPARMv8() && (TrueVal.getValueType() == MVT::f32 ||
@@ -4181,13 +4274,13 @@ SDValue ARMTargetLowering::LowerSELECT_CC(SDValue Op, SelectionDAG &DAG) const {
}
SDValue ARMcc = DAG.getConstant(CondCode, dl, MVT::i32);
- SDValue Cmp = getVFPCmp(LHS, RHS, DAG, dl);
+ SDValue Cmp = getVFPCmp(LHS, RHS, DAG, dl, InvalidOnQNaN);
SDValue CCR = DAG.getRegister(ARM::CPSR, MVT::i32);
SDValue Result = getCMOV(dl, VT, FalseVal, TrueVal, ARMcc, CCR, Cmp, DAG);
if (CondCode2 != ARMCC::AL) {
SDValue ARMcc2 = DAG.getConstant(CondCode2, dl, MVT::i32);
// FIXME: Needs another CMP because flag can have but one use.
- SDValue Cmp2 = getVFPCmp(LHS, RHS, DAG, dl);
+ SDValue Cmp2 = getVFPCmp(LHS, RHS, DAG, dl, InvalidOnQNaN);
Result = getCMOV(dl, VT, Result, TrueVal, ARMcc2, CCR, Cmp2, DAG);
}
return Result;
@@ -4348,10 +4441,11 @@ SDValue ARMTargetLowering::LowerBR_CC(SDValue Op, SelectionDAG &DAG) const {
}
ARMCC::CondCodes CondCode, CondCode2;
- FPCCToARMCC(CC, CondCode, CondCode2);
+ bool InvalidOnQNaN;
+ FPCCToARMCC(CC, CondCode, CondCode2, InvalidOnQNaN);
SDValue ARMcc = DAG.getConstant(CondCode, dl, MVT::i32);
- SDValue Cmp = getVFPCmp(LHS, RHS, DAG, dl);
+ SDValue Cmp = getVFPCmp(LHS, RHS, DAG, dl, InvalidOnQNaN);
SDValue CCR = DAG.getRegister(ARM::CPSR, MVT::i32);
SDVTList VTList = DAG.getVTList(MVT::Other, MVT::Glue);
SDValue Ops[] = { Chain, Dest, ARMcc, CCR, Cmp };
@@ -4853,9 +4947,10 @@ SDValue ARMTargetLowering::LowerFLT_ROUNDS_(SDValue Op,
// The formula we use to implement this is (((FPSCR + (1 << 22)) >> 22) & 3),
// so that the shift and the AND get folded into a bitfield extract.
SDLoc dl(Op);
- SDValue FPSCR = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, MVT::i32,
- DAG.getConstant(Intrinsic::arm_get_fpscr, dl,
- MVT::i32));
+ SDValue Ops[] = { DAG.getEntryNode(),
+ DAG.getConstant(Intrinsic::arm_get_fpscr, dl, MVT::i32) };
+
+ SDValue FPSCR = DAG.getNode(ISD::INTRINSIC_W_CHAIN, dl, MVT::i32, Ops);
SDValue FltRounds = DAG.getNode(ISD::ADD, dl, MVT::i32, FPSCR,
DAG.getConstant(1U << 22, dl, MVT::i32));
SDValue RMODE = DAG.getNode(ISD::SRL, dl, MVT::i32, FltRounds,
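
(A worked check of that formula: ARM keeps RMode in FPSCR bits [23:22], while FLT_ROUNDS uses the C convention, and adding 1 << 22 simply increments the two-bit field modulo 4:)

    //   FPSCR[23:22]  ARM rounding mode    +1 mod 4   FLT_ROUNDS result
    //   0b00          to nearest           0b01       1 (to nearest)
    //   0b01          toward +infinity     0b10       2 (toward +infinity)
    //   0b10          toward -infinity     0b11       3 (toward -infinity)
    //   0b11          toward zero          0b00       0 (toward zero)
    // In the 0b11 case the carry escapes into bit 24 and is discarded by the
    // final "& 3".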
@@ -5584,7 +5679,6 @@ static bool isSingletonVEXTMask(ArrayRef<int> M, EVT VT, unsigned &Imm) {
return true;
}
-
static bool isVEXTMask(ArrayRef<int> M, EVT VT,
bool &ReverseVEXT, unsigned &Imm) {
unsigned NumElts = VT.getVectorNumElements();
@@ -6027,10 +6121,10 @@ SDValue ARMTargetLowering::LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG,
}
if (ValueCounts.size() != 1)
usesOnlyOneValue = false;
- if (!Value.getNode() && ValueCounts.size() > 0)
+ if (!Value.getNode() && !ValueCounts.empty())
Value = ValueCounts.begin()->first;
- if (ValueCounts.size() == 0)
+ if (ValueCounts.empty())
return DAG.getUNDEF(VT);
// Loads are better lowered with insert_vector_elt/ARMISD::BUILD_VECTOR.
@@ -6182,8 +6276,8 @@ SDValue ARMTargetLowering::ReconstructShuffle(SDValue Op,
struct ShuffleSourceInfo {
SDValue Vec;
- unsigned MinElt;
- unsigned MaxElt;
+ unsigned MinElt = std::numeric_limits<unsigned>::max();
+ unsigned MaxElt = 0;
// We may insert some combination of BITCASTs and VEXT nodes to force Vec to
// be compatible with the shuffle we intend to construct. As a result
@@ -6192,13 +6286,12 @@ SDValue ARMTargetLowering::ReconstructShuffle(SDValue Op,
// Code should guarantee that element i in Vec starts at element
// "WindowBase + i * WindowScale" in ShuffleVec.
- int WindowBase;
- int WindowScale;
+ int WindowBase = 0;
+ int WindowScale = 1;
+
+ ShuffleSourceInfo(SDValue Vec) : Vec(Vec), ShuffleVec(Vec) {}
bool operator ==(SDValue OtherVec) { return Vec == OtherVec; }
- ShuffleSourceInfo(SDValue Vec)
- : Vec(Vec), MinElt(UINT_MAX), MaxElt(0), ShuffleVec(Vec), WindowBase(0),
- WindowScale(1) {}
};
// First gather all vectors used as an immediate source for this BUILD_VECTOR
@@ -6220,7 +6313,7 @@ SDValue ARMTargetLowering::ReconstructShuffle(SDValue Op,
// Add this element source to the list if it's not already there.
SDValue SourceVec = V.getOperand(0);
- auto Source = find(Sources, SourceVec);
+ auto Source = llvm::find(Sources, SourceVec);
if (Source == Sources.end())
Source = Sources.insert(Sources.end(), ShuffleSourceInfo(SourceVec));
@@ -6336,7 +6429,7 @@ SDValue ARMTargetLowering::ReconstructShuffle(SDValue Op,
if (Entry.isUndef())
continue;
- auto Src = find(Sources, Entry.getOperand(0));
+ auto Src = llvm::find(Sources, Entry.getOperand(0));
int EltNo = cast<ConstantSDNode>(Entry.getOperand(1))->getSExtValue();
// EXTRACT_VECTOR_ELT performs an implicit any_ext; BUILD_VECTOR an implicit
@@ -6633,7 +6726,7 @@ static SDValue LowerVECTOR_SHUFFLE(SDValue Op, SelectionDAG &DAG) {
EVT SubVT = SubV1.getValueType();
// We expect these to have been canonicalized to -1.
- assert(all_of(ShuffleMask, [&](int i) {
+ assert(llvm::all_of(ShuffleMask, [&](int i) {
return i < (int)VT.getVectorNumElements();
}) && "Unexpected shuffle index into UNDEF operand!");
@@ -6896,8 +6989,19 @@ static SDValue SkipExtensionForVMULL(SDNode *N, SelectionDAG &DAG) {
N->getValueType(0),
N->getOpcode());
- if (LoadSDNode *LD = dyn_cast<LoadSDNode>(N))
- return SkipLoadExtensionForVMULL(LD, DAG);
+ if (LoadSDNode *LD = dyn_cast<LoadSDNode>(N)) {
+ assert((ISD::isSEXTLoad(LD) || ISD::isZEXTLoad(LD)) &&
+ "Expected extending load");
+
+ SDValue newLoad = SkipLoadExtensionForVMULL(LD, DAG);
+ DAG.ReplaceAllUsesOfValueWith(SDValue(LD, 1), newLoad.getValue(1));
+ unsigned Opcode = ISD::isSEXTLoad(LD) ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND;
+ SDValue extLoad =
+ DAG.getNode(Opcode, SDLoc(newLoad), LD->getValueType(0), newLoad);
+ DAG.ReplaceAllUsesOfValueWith(SDValue(LD, 0), extLoad);
+
+ return newLoad;
+ }
// Otherwise, the value must be a BUILD_VECTOR. For v2i64, it will
// have been legalized as a BITCAST from v4i32.
@@ -7258,9 +7362,9 @@ SDValue ARMTargetLowering::LowerFSINCOS(SDValue Op, SelectionDAG &DAG) const {
ArgListEntry Entry;
Entry.Node = SRet;
Entry.Ty = RetTy->getPointerTo();
- Entry.isSExt = false;
- Entry.isZExt = false;
- Entry.isSRet = true;
+ Entry.IsSExt = false;
+ Entry.IsZExt = false;
+ Entry.IsSRet = true;
Args.push_back(Entry);
RetTy = Type::getVoidTy(*DAG.getContext());
}
@@ -7268,8 +7372,8 @@ SDValue ARMTargetLowering::LowerFSINCOS(SDValue Op, SelectionDAG &DAG) const {
ArgListEntry Entry;
Entry.Node = Arg;
Entry.Ty = ArgTy;
- Entry.isSExt = false;
- Entry.isZExt = false;
+ Entry.IsSExt = false;
+ Entry.IsZExt = false;
Args.push_back(Entry);
const char *LibcallName =
@@ -7480,12 +7584,12 @@ static SDValue LowerFPOWI(SDValue Op, const ARMSubtarget &Subtarget,
Entry.Node = Val;
Entry.Ty = Val.getValueType().getTypeForEVT(*DAG.getContext());
- Entry.isZExt = true;
+ Entry.IsZExt = true;
Args.push_back(Entry);
Entry.Node = Exponent;
Entry.Ty = Exponent.getValueType().getTypeForEVT(*DAG.getContext());
- Entry.isZExt = true;
+ Entry.IsZExt = true;
Args.push_back(Entry);
Type *LCRTy = Val.getValueType().getTypeForEVT(*DAG.getContext());
@@ -7702,24 +7806,27 @@ void ARMTargetLowering::SetupEntryBlockForSjLj(MachineInstr &MI,
// add r5, pc
// str r5, [$jbuf, #+4] ; &jbuf[1]
unsigned NewVReg1 = MRI->createVirtualRegister(TRC);
- AddDefaultPred(BuildMI(*MBB, MI, dl, TII->get(ARM::t2LDRpci), NewVReg1)
- .addConstantPoolIndex(CPI)
- .addMemOperand(CPMMO));
+ BuildMI(*MBB, MI, dl, TII->get(ARM::t2LDRpci), NewVReg1)
+ .addConstantPoolIndex(CPI)
+ .addMemOperand(CPMMO)
+ .add(predOps(ARMCC::AL));
// Set the low bit because of thumb mode.
unsigned NewVReg2 = MRI->createVirtualRegister(TRC);
- AddDefaultCC(
- AddDefaultPred(BuildMI(*MBB, MI, dl, TII->get(ARM::t2ORRri), NewVReg2)
- .addReg(NewVReg1, RegState::Kill)
- .addImm(0x01)));
+ BuildMI(*MBB, MI, dl, TII->get(ARM::t2ORRri), NewVReg2)
+ .addReg(NewVReg1, RegState::Kill)
+ .addImm(0x01)
+ .add(predOps(ARMCC::AL))
+ .add(condCodeOp());
unsigned NewVReg3 = MRI->createVirtualRegister(TRC);
BuildMI(*MBB, MI, dl, TII->get(ARM::tPICADD), NewVReg3)
.addReg(NewVReg2, RegState::Kill)
.addImm(PCLabelId);
- AddDefaultPred(BuildMI(*MBB, MI, dl, TII->get(ARM::t2STRi12))
- .addReg(NewVReg3, RegState::Kill)
- .addFrameIndex(FI)
- .addImm(36) // &jbuf[1] :: pc
- .addMemOperand(FIMMOSt));
+ BuildMI(*MBB, MI, dl, TII->get(ARM::t2STRi12))
+ .addReg(NewVReg3, RegState::Kill)
+ .addFrameIndex(FI)
+ .addImm(36) // &jbuf[1] :: pc
+ .addMemOperand(FIMMOSt)
+ .add(predOps(ARMCC::AL));
} else if (isThumb) {
// Incoming value: jbuf
// ldr.n r1, LCPI1_4
@@ -7729,51 +7836,58 @@ void ARMTargetLowering::SetupEntryBlockForSjLj(MachineInstr &MI,
// add r2, $jbuf, #+4 ; &jbuf[1]
// str r1, [r2]
unsigned NewVReg1 = MRI->createVirtualRegister(TRC);
- AddDefaultPred(BuildMI(*MBB, MI, dl, TII->get(ARM::tLDRpci), NewVReg1)
- .addConstantPoolIndex(CPI)
- .addMemOperand(CPMMO));
+ BuildMI(*MBB, MI, dl, TII->get(ARM::tLDRpci), NewVReg1)
+ .addConstantPoolIndex(CPI)
+ .addMemOperand(CPMMO)
+ .add(predOps(ARMCC::AL));
unsigned NewVReg2 = MRI->createVirtualRegister(TRC);
BuildMI(*MBB, MI, dl, TII->get(ARM::tPICADD), NewVReg2)
.addReg(NewVReg1, RegState::Kill)
.addImm(PCLabelId);
// Set the low bit because of thumb mode.
unsigned NewVReg3 = MRI->createVirtualRegister(TRC);
- AddDefaultPred(BuildMI(*MBB, MI, dl, TII->get(ARM::tMOVi8), NewVReg3)
- .addReg(ARM::CPSR, RegState::Define)
- .addImm(1));
+ BuildMI(*MBB, MI, dl, TII->get(ARM::tMOVi8), NewVReg3)
+ .addReg(ARM::CPSR, RegState::Define)
+ .addImm(1)
+ .add(predOps(ARMCC::AL));
unsigned NewVReg4 = MRI->createVirtualRegister(TRC);
- AddDefaultPred(BuildMI(*MBB, MI, dl, TII->get(ARM::tORR), NewVReg4)
- .addReg(ARM::CPSR, RegState::Define)
- .addReg(NewVReg2, RegState::Kill)
- .addReg(NewVReg3, RegState::Kill));
+ BuildMI(*MBB, MI, dl, TII->get(ARM::tORR), NewVReg4)
+ .addReg(ARM::CPSR, RegState::Define)
+ .addReg(NewVReg2, RegState::Kill)
+ .addReg(NewVReg3, RegState::Kill)
+ .add(predOps(ARMCC::AL));
unsigned NewVReg5 = MRI->createVirtualRegister(TRC);
BuildMI(*MBB, MI, dl, TII->get(ARM::tADDframe), NewVReg5)
.addFrameIndex(FI)
.addImm(36); // &jbuf[1] :: pc
- AddDefaultPred(BuildMI(*MBB, MI, dl, TII->get(ARM::tSTRi))
- .addReg(NewVReg4, RegState::Kill)
- .addReg(NewVReg5, RegState::Kill)
- .addImm(0)
- .addMemOperand(FIMMOSt));
+ BuildMI(*MBB, MI, dl, TII->get(ARM::tSTRi))
+ .addReg(NewVReg4, RegState::Kill)
+ .addReg(NewVReg5, RegState::Kill)
+ .addImm(0)
+ .addMemOperand(FIMMOSt)
+ .add(predOps(ARMCC::AL));
} else {
// Incoming value: jbuf
// ldr r1, LCPI1_1
// add r1, pc, r1
// str r1, [$jbuf, #+4] ; &jbuf[1]
unsigned NewVReg1 = MRI->createVirtualRegister(TRC);
- AddDefaultPred(BuildMI(*MBB, MI, dl, TII->get(ARM::LDRi12), NewVReg1)
- .addConstantPoolIndex(CPI)
- .addImm(0)
- .addMemOperand(CPMMO));
+ BuildMI(*MBB, MI, dl, TII->get(ARM::LDRi12), NewVReg1)
+ .addConstantPoolIndex(CPI)
+ .addImm(0)
+ .addMemOperand(CPMMO)
+ .add(predOps(ARMCC::AL));
unsigned NewVReg2 = MRI->createVirtualRegister(TRC);
- AddDefaultPred(BuildMI(*MBB, MI, dl, TII->get(ARM::PICADD), NewVReg2)
- .addReg(NewVReg1, RegState::Kill)
- .addImm(PCLabelId));
- AddDefaultPred(BuildMI(*MBB, MI, dl, TII->get(ARM::STRi12))
- .addReg(NewVReg2, RegState::Kill)
- .addFrameIndex(FI)
- .addImm(36) // &jbuf[1] :: pc
- .addMemOperand(FIMMOSt));
+ BuildMI(*MBB, MI, dl, TII->get(ARM::PICADD), NewVReg2)
+ .addReg(NewVReg1, RegState::Kill)
+ .addImm(PCLabelId)
+ .add(predOps(ARMCC::AL));
+ BuildMI(*MBB, MI, dl, TII->get(ARM::STRi12))
+ .addReg(NewVReg2, RegState::Kill)
+ .addFrameIndex(FI)
+ .addImm(36) // &jbuf[1] :: pc
+ .addMemOperand(FIMMOSt)
+ .add(predOps(ARMCC::AL));
}
}
@@ -7791,7 +7905,7 @@ void ARMTargetLowering::EmitSjLjDispatchBlock(MachineInstr &MI,
// Get a mapping of the call site numbers to all of the landing pads they're
// associated with.
- DenseMap<unsigned, SmallVector<MachineBasicBlock*, 2> > CallSiteNumToLPad;
+ DenseMap<unsigned, SmallVector<MachineBasicBlock*, 2>> CallSiteNumToLPad;
unsigned MaxCSNum = 0;
for (MachineFunction::iterator BB = MF->begin(), E = MF->end(); BB != E;
++BB) {
@@ -7886,31 +8000,36 @@ void ARMTargetLowering::EmitSjLjDispatchBlock(MachineInstr &MI,
unsigned NumLPads = LPadList.size();
if (Subtarget->isThumb2()) {
unsigned NewVReg1 = MRI->createVirtualRegister(TRC);
- AddDefaultPred(BuildMI(DispatchBB, dl, TII->get(ARM::t2LDRi12), NewVReg1)
- .addFrameIndex(FI)
- .addImm(4)
- .addMemOperand(FIMMOLd));
+ BuildMI(DispatchBB, dl, TII->get(ARM::t2LDRi12), NewVReg1)
+ .addFrameIndex(FI)
+ .addImm(4)
+ .addMemOperand(FIMMOLd)
+ .add(predOps(ARMCC::AL));
if (NumLPads < 256) {
- AddDefaultPred(BuildMI(DispatchBB, dl, TII->get(ARM::t2CMPri))
- .addReg(NewVReg1)
- .addImm(LPadList.size()));
+ BuildMI(DispatchBB, dl, TII->get(ARM::t2CMPri))
+ .addReg(NewVReg1)
+ .addImm(LPadList.size())
+ .add(predOps(ARMCC::AL));
} else {
unsigned VReg1 = MRI->createVirtualRegister(TRC);
- AddDefaultPred(BuildMI(DispatchBB, dl, TII->get(ARM::t2MOVi16), VReg1)
- .addImm(NumLPads & 0xFFFF));
+ BuildMI(DispatchBB, dl, TII->get(ARM::t2MOVi16), VReg1)
+ .addImm(NumLPads & 0xFFFF)
+ .add(predOps(ARMCC::AL));
unsigned VReg2 = VReg1;
if ((NumLPads & 0xFFFF0000) != 0) {
VReg2 = MRI->createVirtualRegister(TRC);
- AddDefaultPred(BuildMI(DispatchBB, dl, TII->get(ARM::t2MOVTi16), VReg2)
- .addReg(VReg1)
- .addImm(NumLPads >> 16));
+ BuildMI(DispatchBB, dl, TII->get(ARM::t2MOVTi16), VReg2)
+ .addReg(VReg1)
+ .addImm(NumLPads >> 16)
+ .add(predOps(ARMCC::AL));
}
- AddDefaultPred(BuildMI(DispatchBB, dl, TII->get(ARM::t2CMPrr))
- .addReg(NewVReg1)
- .addReg(VReg2));
+ BuildMI(DispatchBB, dl, TII->get(ARM::t2CMPrr))
+ .addReg(NewVReg1)
+ .addReg(VReg2)
+ .add(predOps(ARMCC::AL));
}
BuildMI(DispatchBB, dl, TII->get(ARM::t2Bcc))
@@ -7919,16 +8038,17 @@ void ARMTargetLowering::EmitSjLjDispatchBlock(MachineInstr &MI,
.addReg(ARM::CPSR);
unsigned NewVReg3 = MRI->createVirtualRegister(TRC);
- AddDefaultPred(BuildMI(DispContBB, dl, TII->get(ARM::t2LEApcrelJT),NewVReg3)
- .addJumpTableIndex(MJTI));
+ BuildMI(DispContBB, dl, TII->get(ARM::t2LEApcrelJT), NewVReg3)
+ .addJumpTableIndex(MJTI)
+ .add(predOps(ARMCC::AL));
unsigned NewVReg4 = MRI->createVirtualRegister(TRC);
- AddDefaultCC(
- AddDefaultPred(
- BuildMI(DispContBB, dl, TII->get(ARM::t2ADDrs), NewVReg4)
+ BuildMI(DispContBB, dl, TII->get(ARM::t2ADDrs), NewVReg4)
.addReg(NewVReg3, RegState::Kill)
.addReg(NewVReg1)
- .addImm(ARM_AM::getSORegOpc(ARM_AM::lsl, 2))));
+ .addImm(ARM_AM::getSORegOpc(ARM_AM::lsl, 2))
+ .add(predOps(ARMCC::AL))
+ .add(condCodeOp());
BuildMI(DispContBB, dl, TII->get(ARM::t2BR_JT))
.addReg(NewVReg4, RegState::Kill)
@@ -7936,15 +8056,17 @@ void ARMTargetLowering::EmitSjLjDispatchBlock(MachineInstr &MI,
.addJumpTableIndex(MJTI);
} else if (Subtarget->isThumb()) {
unsigned NewVReg1 = MRI->createVirtualRegister(TRC);
- AddDefaultPred(BuildMI(DispatchBB, dl, TII->get(ARM::tLDRspi), NewVReg1)
- .addFrameIndex(FI)
- .addImm(1)
- .addMemOperand(FIMMOLd));
+ BuildMI(DispatchBB, dl, TII->get(ARM::tLDRspi), NewVReg1)
+ .addFrameIndex(FI)
+ .addImm(1)
+ .addMemOperand(FIMMOLd)
+ .add(predOps(ARMCC::AL));
if (NumLPads < 256) {
- AddDefaultPred(BuildMI(DispatchBB, dl, TII->get(ARM::tCMPi8))
- .addReg(NewVReg1)
- .addImm(NumLPads));
+ BuildMI(DispatchBB, dl, TII->get(ARM::tCMPi8))
+ .addReg(NewVReg1)
+ .addImm(NumLPads)
+ .add(predOps(ARMCC::AL));
} else {
MachineConstantPool *ConstantPool = MF->getConstantPool();
Type *Int32Ty = Type::getInt32Ty(MF->getFunction()->getContext());
@@ -7957,12 +8079,14 @@ void ARMTargetLowering::EmitSjLjDispatchBlock(MachineInstr &MI,
unsigned Idx = ConstantPool->getConstantPoolIndex(C, Align);
unsigned VReg1 = MRI->createVirtualRegister(TRC);
- AddDefaultPred(BuildMI(DispatchBB, dl, TII->get(ARM::tLDRpci))
- .addReg(VReg1, RegState::Define)
- .addConstantPoolIndex(Idx));
- AddDefaultPred(BuildMI(DispatchBB, dl, TII->get(ARM::tCMPr))
- .addReg(NewVReg1)
- .addReg(VReg1));
+ BuildMI(DispatchBB, dl, TII->get(ARM::tLDRpci))
+ .addReg(VReg1, RegState::Define)
+ .addConstantPoolIndex(Idx)
+ .add(predOps(ARMCC::AL));
+ BuildMI(DispatchBB, dl, TII->get(ARM::tCMPr))
+ .addReg(NewVReg1)
+ .addReg(VReg1)
+ .add(predOps(ARMCC::AL));
}
BuildMI(DispatchBB, dl, TII->get(ARM::tBcc))
@@ -7971,37 +8095,42 @@ void ARMTargetLowering::EmitSjLjDispatchBlock(MachineInstr &MI,
.addReg(ARM::CPSR);
unsigned NewVReg2 = MRI->createVirtualRegister(TRC);
- AddDefaultPred(BuildMI(DispContBB, dl, TII->get(ARM::tLSLri), NewVReg2)
- .addReg(ARM::CPSR, RegState::Define)
- .addReg(NewVReg1)
- .addImm(2));
+ BuildMI(DispContBB, dl, TII->get(ARM::tLSLri), NewVReg2)
+ .addReg(ARM::CPSR, RegState::Define)
+ .addReg(NewVReg1)
+ .addImm(2)
+ .add(predOps(ARMCC::AL));
unsigned NewVReg3 = MRI->createVirtualRegister(TRC);
- AddDefaultPred(BuildMI(DispContBB, dl, TII->get(ARM::tLEApcrelJT), NewVReg3)
- .addJumpTableIndex(MJTI));
+ BuildMI(DispContBB, dl, TII->get(ARM::tLEApcrelJT), NewVReg3)
+ .addJumpTableIndex(MJTI)
+ .add(predOps(ARMCC::AL));
unsigned NewVReg4 = MRI->createVirtualRegister(TRC);
- AddDefaultPred(BuildMI(DispContBB, dl, TII->get(ARM::tADDrr), NewVReg4)
- .addReg(ARM::CPSR, RegState::Define)
- .addReg(NewVReg2, RegState::Kill)
- .addReg(NewVReg3));
+ BuildMI(DispContBB, dl, TII->get(ARM::tADDrr), NewVReg4)
+ .addReg(ARM::CPSR, RegState::Define)
+ .addReg(NewVReg2, RegState::Kill)
+ .addReg(NewVReg3)
+ .add(predOps(ARMCC::AL));
MachineMemOperand *JTMMOLd = MF->getMachineMemOperand(
MachinePointerInfo::getJumpTable(*MF), MachineMemOperand::MOLoad, 4, 4);
unsigned NewVReg5 = MRI->createVirtualRegister(TRC);
- AddDefaultPred(BuildMI(DispContBB, dl, TII->get(ARM::tLDRi), NewVReg5)
- .addReg(NewVReg4, RegState::Kill)
- .addImm(0)
- .addMemOperand(JTMMOLd));
+ BuildMI(DispContBB, dl, TII->get(ARM::tLDRi), NewVReg5)
+ .addReg(NewVReg4, RegState::Kill)
+ .addImm(0)
+ .addMemOperand(JTMMOLd)
+ .add(predOps(ARMCC::AL));
unsigned NewVReg6 = NewVReg5;
if (IsPositionIndependent) {
NewVReg6 = MRI->createVirtualRegister(TRC);
- AddDefaultPred(BuildMI(DispContBB, dl, TII->get(ARM::tADDrr), NewVReg6)
- .addReg(ARM::CPSR, RegState::Define)
- .addReg(NewVReg5, RegState::Kill)
- .addReg(NewVReg3));
+ BuildMI(DispContBB, dl, TII->get(ARM::tADDrr), NewVReg6)
+ .addReg(ARM::CPSR, RegState::Define)
+ .addReg(NewVReg5, RegState::Kill)
+ .addReg(NewVReg3)
+ .add(predOps(ARMCC::AL));
}
BuildMI(DispContBB, dl, TII->get(ARM::tBR_JTr))
@@ -8009,31 +8138,36 @@ void ARMTargetLowering::EmitSjLjDispatchBlock(MachineInstr &MI,
.addJumpTableIndex(MJTI);
} else {
unsigned NewVReg1 = MRI->createVirtualRegister(TRC);
- AddDefaultPred(BuildMI(DispatchBB, dl, TII->get(ARM::LDRi12), NewVReg1)
- .addFrameIndex(FI)
- .addImm(4)
- .addMemOperand(FIMMOLd));
+ BuildMI(DispatchBB, dl, TII->get(ARM::LDRi12), NewVReg1)
+ .addFrameIndex(FI)
+ .addImm(4)
+ .addMemOperand(FIMMOLd)
+ .add(predOps(ARMCC::AL));
if (NumLPads < 256) {
- AddDefaultPred(BuildMI(DispatchBB, dl, TII->get(ARM::CMPri))
- .addReg(NewVReg1)
- .addImm(NumLPads));
+ BuildMI(DispatchBB, dl, TII->get(ARM::CMPri))
+ .addReg(NewVReg1)
+ .addImm(NumLPads)
+ .add(predOps(ARMCC::AL));
} else if (Subtarget->hasV6T2Ops() && isUInt<16>(NumLPads)) {
unsigned VReg1 = MRI->createVirtualRegister(TRC);
- AddDefaultPred(BuildMI(DispatchBB, dl, TII->get(ARM::MOVi16), VReg1)
- .addImm(NumLPads & 0xFFFF));
+ BuildMI(DispatchBB, dl, TII->get(ARM::MOVi16), VReg1)
+ .addImm(NumLPads & 0xFFFF)
+ .add(predOps(ARMCC::AL));
unsigned VReg2 = VReg1;
if ((NumLPads & 0xFFFF0000) != 0) {
VReg2 = MRI->createVirtualRegister(TRC);
- AddDefaultPred(BuildMI(DispatchBB, dl, TII->get(ARM::MOVTi16), VReg2)
- .addReg(VReg1)
- .addImm(NumLPads >> 16));
+ BuildMI(DispatchBB, dl, TII->get(ARM::MOVTi16), VReg2)
+ .addReg(VReg1)
+ .addImm(NumLPads >> 16)
+ .add(predOps(ARMCC::AL));
}
- AddDefaultPred(BuildMI(DispatchBB, dl, TII->get(ARM::CMPrr))
- .addReg(NewVReg1)
- .addReg(VReg2));
+ BuildMI(DispatchBB, dl, TII->get(ARM::CMPrr))
+ .addReg(NewVReg1)
+ .addReg(VReg2)
+ .add(predOps(ARMCC::AL));
} else {
MachineConstantPool *ConstantPool = MF->getConstantPool();
Type *Int32Ty = Type::getInt32Ty(MF->getFunction()->getContext());
@@ -8046,13 +8180,15 @@ void ARMTargetLowering::EmitSjLjDispatchBlock(MachineInstr &MI,
unsigned Idx = ConstantPool->getConstantPoolIndex(C, Align);
unsigned VReg1 = MRI->createVirtualRegister(TRC);
- AddDefaultPred(BuildMI(DispatchBB, dl, TII->get(ARM::LDRcp))
- .addReg(VReg1, RegState::Define)
- .addConstantPoolIndex(Idx)
- .addImm(0));
- AddDefaultPred(BuildMI(DispatchBB, dl, TII->get(ARM::CMPrr))
- .addReg(NewVReg1)
- .addReg(VReg1, RegState::Kill));
+ BuildMI(DispatchBB, dl, TII->get(ARM::LDRcp))
+ .addReg(VReg1, RegState::Define)
+ .addConstantPoolIndex(Idx)
+ .addImm(0)
+ .add(predOps(ARMCC::AL));
+ BuildMI(DispatchBB, dl, TII->get(ARM::CMPrr))
+ .addReg(NewVReg1)
+ .addReg(VReg1, RegState::Kill)
+ .add(predOps(ARMCC::AL));
}
BuildMI(DispatchBB, dl, TII->get(ARM::Bcc))
@@ -8061,23 +8197,25 @@ void ARMTargetLowering::EmitSjLjDispatchBlock(MachineInstr &MI,
.addReg(ARM::CPSR);
unsigned NewVReg3 = MRI->createVirtualRegister(TRC);
- AddDefaultCC(
- AddDefaultPred(BuildMI(DispContBB, dl, TII->get(ARM::MOVsi), NewVReg3)
- .addReg(NewVReg1)
- .addImm(ARM_AM::getSORegOpc(ARM_AM::lsl, 2))));
+ BuildMI(DispContBB, dl, TII->get(ARM::MOVsi), NewVReg3)
+ .addReg(NewVReg1)
+ .addImm(ARM_AM::getSORegOpc(ARM_AM::lsl, 2))
+ .add(predOps(ARMCC::AL))
+ .add(condCodeOp());
unsigned NewVReg4 = MRI->createVirtualRegister(TRC);
- AddDefaultPred(BuildMI(DispContBB, dl, TII->get(ARM::LEApcrelJT), NewVReg4)
- .addJumpTableIndex(MJTI));
+ BuildMI(DispContBB, dl, TII->get(ARM::LEApcrelJT), NewVReg4)
+ .addJumpTableIndex(MJTI)
+ .add(predOps(ARMCC::AL));
MachineMemOperand *JTMMOLd = MF->getMachineMemOperand(
MachinePointerInfo::getJumpTable(*MF), MachineMemOperand::MOLoad, 4, 4);
unsigned NewVReg5 = MRI->createVirtualRegister(TRC);
- AddDefaultPred(
- BuildMI(DispContBB, dl, TII->get(ARM::LDRrs), NewVReg5)
- .addReg(NewVReg3, RegState::Kill)
- .addReg(NewVReg4)
- .addImm(0)
- .addMemOperand(JTMMOLd));
+ BuildMI(DispContBB, dl, TII->get(ARM::LDRrs), NewVReg5)
+ .addReg(NewVReg3, RegState::Kill)
+ .addReg(NewVReg4)
+ .addImm(0)
+ .addMemOperand(JTMMOLd)
+ .add(predOps(ARMCC::AL));
if (IsPositionIndependent) {
BuildMI(DispContBB, dl, TII->get(ARM::BR_JTadd))
@@ -8222,26 +8360,35 @@ static void emitPostLd(MachineBasicBlock *BB, MachineBasicBlock::iterator Pos,
unsigned LdOpc = getLdOpcode(LdSize, IsThumb1, IsThumb2);
assert(LdOpc != 0 && "Should have a load opcode");
if (LdSize >= 8) {
- AddDefaultPred(BuildMI(*BB, Pos, dl, TII->get(LdOpc), Data)
- .addReg(AddrOut, RegState::Define).addReg(AddrIn)
- .addImm(0));
+ BuildMI(*BB, Pos, dl, TII->get(LdOpc), Data)
+ .addReg(AddrOut, RegState::Define)
+ .addReg(AddrIn)
+ .addImm(0)
+ .add(predOps(ARMCC::AL));
} else if (IsThumb1) {
// load + update AddrIn
- AddDefaultPred(BuildMI(*BB, Pos, dl, TII->get(LdOpc), Data)
- .addReg(AddrIn).addImm(0));
- MachineInstrBuilder MIB =
- BuildMI(*BB, Pos, dl, TII->get(ARM::tADDi8), AddrOut);
- MIB = AddDefaultT1CC(MIB);
- MIB.addReg(AddrIn).addImm(LdSize);
- AddDefaultPred(MIB);
+ BuildMI(*BB, Pos, dl, TII->get(LdOpc), Data)
+ .addReg(AddrIn)
+ .addImm(0)
+ .add(predOps(ARMCC::AL));
+ BuildMI(*BB, Pos, dl, TII->get(ARM::tADDi8), AddrOut)
+ .add(t1CondCodeOp())
+ .addReg(AddrIn)
+ .addImm(LdSize)
+ .add(predOps(ARMCC::AL));
} else if (IsThumb2) {
- AddDefaultPred(BuildMI(*BB, Pos, dl, TII->get(LdOpc), Data)
- .addReg(AddrOut, RegState::Define).addReg(AddrIn)
- .addImm(LdSize));
+ BuildMI(*BB, Pos, dl, TII->get(LdOpc), Data)
+ .addReg(AddrOut, RegState::Define)
+ .addReg(AddrIn)
+ .addImm(LdSize)
+ .add(predOps(ARMCC::AL));
} else { // arm
- AddDefaultPred(BuildMI(*BB, Pos, dl, TII->get(LdOpc), Data)
- .addReg(AddrOut, RegState::Define).addReg(AddrIn)
- .addReg(0).addImm(LdSize));
+ BuildMI(*BB, Pos, dl, TII->get(LdOpc), Data)
+ .addReg(AddrOut, RegState::Define)
+ .addReg(AddrIn)
+ .addReg(0)
+ .addImm(LdSize)
+ .add(predOps(ARMCC::AL));
}
}
@@ -8254,24 +8401,36 @@ static void emitPostSt(MachineBasicBlock *BB, MachineBasicBlock::iterator Pos,
unsigned StOpc = getStOpcode(StSize, IsThumb1, IsThumb2);
assert(StOpc != 0 && "Should have a store opcode");
if (StSize >= 8) {
- AddDefaultPred(BuildMI(*BB, Pos, dl, TII->get(StOpc), AddrOut)
- .addReg(AddrIn).addImm(0).addReg(Data));
+ BuildMI(*BB, Pos, dl, TII->get(StOpc), AddrOut)
+ .addReg(AddrIn)
+ .addImm(0)
+ .addReg(Data)
+ .add(predOps(ARMCC::AL));
} else if (IsThumb1) {
// store + update AddrIn
- AddDefaultPred(BuildMI(*BB, Pos, dl, TII->get(StOpc)).addReg(Data)
- .addReg(AddrIn).addImm(0));
- MachineInstrBuilder MIB =
- BuildMI(*BB, Pos, dl, TII->get(ARM::tADDi8), AddrOut);
- MIB = AddDefaultT1CC(MIB);
- MIB.addReg(AddrIn).addImm(StSize);
- AddDefaultPred(MIB);
+ BuildMI(*BB, Pos, dl, TII->get(StOpc))
+ .addReg(Data)
+ .addReg(AddrIn)
+ .addImm(0)
+ .add(predOps(ARMCC::AL));
+ BuildMI(*BB, Pos, dl, TII->get(ARM::tADDi8), AddrOut)
+ .add(t1CondCodeOp())
+ .addReg(AddrIn)
+ .addImm(StSize)
+ .add(predOps(ARMCC::AL));
} else if (IsThumb2) {
- AddDefaultPred(BuildMI(*BB, Pos, dl, TII->get(StOpc), AddrOut)
- .addReg(Data).addReg(AddrIn).addImm(StSize));
+ BuildMI(*BB, Pos, dl, TII->get(StOpc), AddrOut)
+ .addReg(Data)
+ .addReg(AddrIn)
+ .addImm(StSize)
+ .add(predOps(ARMCC::AL));
} else { // arm
- AddDefaultPred(BuildMI(*BB, Pos, dl, TII->get(StOpc), AddrOut)
- .addReg(Data).addReg(AddrIn).addReg(0)
- .addImm(StSize));
+ BuildMI(*BB, Pos, dl, TII->get(StOpc), AddrOut)
+ .addReg(Data)
+ .addReg(AddrIn)
+ .addReg(0)
+ .addImm(StSize)
+ .add(predOps(ARMCC::AL));
}
}
@@ -8402,16 +8561,15 @@ ARMTargetLowering::EmitStructByval(MachineInstr &MI,
unsigned Vtmp = varEnd;
if ((LoopSize & 0xFFFF0000) != 0)
Vtmp = MRI.createVirtualRegister(TRC);
- AddDefaultPred(BuildMI(BB, dl,
- TII->get(IsThumb ? ARM::t2MOVi16 : ARM::MOVi16),
- Vtmp).addImm(LoopSize & 0xFFFF));
+ BuildMI(BB, dl, TII->get(IsThumb ? ARM::t2MOVi16 : ARM::MOVi16), Vtmp)
+ .addImm(LoopSize & 0xFFFF)
+ .add(predOps(ARMCC::AL));
if ((LoopSize & 0xFFFF0000) != 0)
- AddDefaultPred(BuildMI(BB, dl,
- TII->get(IsThumb ? ARM::t2MOVTi16 : ARM::MOVTi16),
- varEnd)
- .addReg(Vtmp)
- .addImm(LoopSize >> 16));
+ BuildMI(BB, dl, TII->get(IsThumb ? ARM::t2MOVTi16 : ARM::MOVTi16), varEnd)
+ .addReg(Vtmp)
+ .addImm(LoopSize >> 16)
+ .add(predOps(ARMCC::AL));
} else {
MachineConstantPool *ConstantPool = MF->getConstantPool();
Type *Int32Ty = Type::getInt32Ty(MF->getFunction()->getContext());
@@ -8424,11 +8582,16 @@ ARMTargetLowering::EmitStructByval(MachineInstr &MI,
unsigned Idx = ConstantPool->getConstantPoolIndex(C, Align);
if (IsThumb)
- AddDefaultPred(BuildMI(*BB, MI, dl, TII->get(ARM::tLDRpci)).addReg(
- varEnd, RegState::Define).addConstantPoolIndex(Idx));
+ BuildMI(*BB, MI, dl, TII->get(ARM::tLDRpci))
+ .addReg(varEnd, RegState::Define)
+ .addConstantPoolIndex(Idx)
+ .add(predOps(ARMCC::AL));
else
- AddDefaultPred(BuildMI(*BB, MI, dl, TII->get(ARM::LDRcp)).addReg(
- varEnd, RegState::Define).addConstantPoolIndex(Idx).addImm(0));
+ BuildMI(*BB, MI, dl, TII->get(ARM::LDRcp))
+ .addReg(varEnd, RegState::Define)
+ .addConstantPoolIndex(Idx)
+ .addImm(0)
+ .add(predOps(ARMCC::AL));
}
BB->addSuccessor(loopMBB);
@@ -8465,16 +8628,19 @@ ARMTargetLowering::EmitStructByval(MachineInstr &MI,
// Decrement loop variable by UnitSize.
if (IsThumb1) {
- MachineInstrBuilder MIB =
- BuildMI(*BB, BB->end(), dl, TII->get(ARM::tSUBi8), varLoop);
- MIB = AddDefaultT1CC(MIB);
- MIB.addReg(varPhi).addImm(UnitSize);
- AddDefaultPred(MIB);
+ BuildMI(*BB, BB->end(), dl, TII->get(ARM::tSUBi8), varLoop)
+ .add(t1CondCodeOp())
+ .addReg(varPhi)
+ .addImm(UnitSize)
+ .add(predOps(ARMCC::AL));
} else {
MachineInstrBuilder MIB =
BuildMI(*BB, BB->end(), dl,
TII->get(IsThumb2 ? ARM::t2SUBri : ARM::SUBri), varLoop);
- AddDefaultCC(AddDefaultPred(MIB.addReg(varPhi).addImm(UnitSize)));
+ MIB.addReg(varPhi)
+ .addImm(UnitSize)
+ .add(predOps(ARMCC::AL))
+ .add(condCodeOp());
MIB->getOperand(5).setReg(ARM::CPSR);
MIB->getOperand(5).setIsDef(true);
}
@@ -8545,11 +8711,12 @@ ARMTargetLowering::EmitLowered__chkstk(MachineInstr &MI,
case CodeModel::Default:
case CodeModel::Kernel:
BuildMI(*MBB, MI, DL, TII.get(ARM::tBL))
- .addImm((unsigned)ARMCC::AL).addReg(0)
- .addExternalSymbol("__chkstk")
- .addReg(ARM::R4, RegState::Implicit | RegState::Kill)
- .addReg(ARM::R4, RegState::Implicit | RegState::Define)
- .addReg(ARM::R12, RegState::Implicit | RegState::Define | RegState::Dead);
+ .add(predOps(ARMCC::AL))
+ .addExternalSymbol("__chkstk")
+ .addReg(ARM::R4, RegState::Implicit | RegState::Kill)
+ .addReg(ARM::R4, RegState::Implicit | RegState::Define)
+ .addReg(ARM::R12,
+ RegState::Implicit | RegState::Define | RegState::Dead);
break;
case CodeModel::Large:
case CodeModel::JITDefault: {
@@ -8559,20 +8726,22 @@ ARMTargetLowering::EmitLowered__chkstk(MachineInstr &MI,
BuildMI(*MBB, MI, DL, TII.get(ARM::t2MOVi32imm), Reg)
.addExternalSymbol("__chkstk");
BuildMI(*MBB, MI, DL, TII.get(ARM::tBLXr))
- .addImm((unsigned)ARMCC::AL).addReg(0)
- .addReg(Reg, RegState::Kill)
- .addReg(ARM::R4, RegState::Implicit | RegState::Kill)
- .addReg(ARM::R4, RegState::Implicit | RegState::Define)
- .addReg(ARM::R12, RegState::Implicit | RegState::Define | RegState::Dead);
+ .add(predOps(ARMCC::AL))
+ .addReg(Reg, RegState::Kill)
+ .addReg(ARM::R4, RegState::Implicit | RegState::Kill)
+ .addReg(ARM::R4, RegState::Implicit | RegState::Define)
+ .addReg(ARM::R12,
+ RegState::Implicit | RegState::Define | RegState::Dead);
break;
}
}
- AddDefaultCC(AddDefaultPred(BuildMI(*MBB, MI, DL, TII.get(ARM::t2SUBrr),
- ARM::SP)
- .addReg(ARM::SP, RegState::Kill)
- .addReg(ARM::R4, RegState::Kill)
- .setMIFlags(MachineInstr::FrameSetup)));
+ BuildMI(*MBB, MI, DL, TII.get(ARM::t2SUBrr), ARM::SP)
+ .addReg(ARM::SP, RegState::Kill)
+ .addReg(ARM::R4, RegState::Kill)
+ .setMIFlags(MachineInstr::FrameSetup)
+ .add(predOps(ARMCC::AL))
+ .add(condCodeOp());
MI.eraseFromParent();
return MBB;
@@ -8597,9 +8766,10 @@ ARMTargetLowering::EmitLowered__dbzchk(MachineInstr &MI,
MF->push_back(TrapBB);
MBB->addSuccessor(TrapBB);
- AddDefaultPred(BuildMI(*MBB, MI, DL, TII->get(ARM::tCMPi8))
- .addReg(MI.getOperand(0).getReg())
- .addImm(0));
+ BuildMI(*MBB, MI, DL, TII->get(ARM::tCMPi8))
+ .addReg(MI.getOperand(0).getReg())
+ .addImm(0)
+ .add(predOps(ARMCC::AL));
BuildMI(*MBB, MI, DL, TII->get(ARM::t2Bcc))
.addMBB(TrapBB)
.addImm(ARMCC::EQ)
@@ -8617,18 +8787,18 @@ ARMTargetLowering::EmitInstrWithCustomInserter(MachineInstr &MI,
bool isThumb2 = Subtarget->isThumb2();
switch (MI.getOpcode()) {
default: {
- MI.dump();
+ MI.print(errs());
llvm_unreachable("Unexpected instr type to insert");
}
// Thumb1 post-indexed loads are really just single-register LDMs.
case ARM::tLDR_postidx: {
BuildMI(*BB, MI, dl, TII->get(ARM::tLDMIA_UPD))
- .addOperand(MI.getOperand(1)) // Rn_wb
- .addOperand(MI.getOperand(2)) // Rn
- .addOperand(MI.getOperand(3)) // PredImm
- .addOperand(MI.getOperand(4)) // PredReg
- .addOperand(MI.getOperand(0)); // Rt
+ .add(MI.getOperand(1)) // Rn_wb
+ .add(MI.getOperand(2)) // Rn
+ .add(MI.getOperand(3)) // PredImm
+ .add(MI.getOperand(4)) // PredReg
+ .add(MI.getOperand(0)); // Rt
MI.eraseFromParent();
return BB;
}
@@ -8659,12 +8829,12 @@ ARMTargetLowering::EmitInstrWithCustomInserter(MachineInstr &MI,
MachineMemOperand *MMO = *MI.memoperands_begin();
BuildMI(*BB, MI, dl, TII->get(NewOpc))
- .addOperand(MI.getOperand(0)) // Rn_wb
- .addOperand(MI.getOperand(1)) // Rt
- .addOperand(MI.getOperand(2)) // Rn
- .addImm(Offset) // offset (skip GPR==zero_reg)
- .addOperand(MI.getOperand(5)) // pred
- .addOperand(MI.getOperand(6))
+ .add(MI.getOperand(0)) // Rn_wb
+ .add(MI.getOperand(1)) // Rt
+ .add(MI.getOperand(2)) // Rn
+ .addImm(Offset) // offset (skip GPR==zero_reg)
+ .add(MI.getOperand(5)) // pred
+ .add(MI.getOperand(6))
.addMemOperand(MMO);
MI.eraseFromParent();
return BB;
@@ -8681,7 +8851,7 @@ ARMTargetLowering::EmitInstrWithCustomInserter(MachineInstr &MI,
}
MachineInstrBuilder MIB = BuildMI(*BB, MI, dl, TII->get(NewOpc));
for (unsigned i = 0; i < MI.getNumOperands(); ++i)
- MIB.addOperand(MI.getOperand(i));
+ MIB.add(MI.getOperand(i));
MI.eraseFromParent();
return BB;
}
@@ -8754,18 +8924,20 @@ ARMTargetLowering::EmitInstrWithCustomInserter(MachineInstr &MI,
unsigned LHS1 = MI.getOperand(1).getReg();
unsigned LHS2 = MI.getOperand(2).getReg();
if (RHSisZero) {
- AddDefaultPred(BuildMI(BB, dl,
- TII->get(isThumb2 ? ARM::t2CMPri : ARM::CMPri))
- .addReg(LHS1).addImm(0));
+ BuildMI(BB, dl, TII->get(isThumb2 ? ARM::t2CMPri : ARM::CMPri))
+ .addReg(LHS1)
+ .addImm(0)
+ .add(predOps(ARMCC::AL));
BuildMI(BB, dl, TII->get(isThumb2 ? ARM::t2CMPri : ARM::CMPri))
.addReg(LHS2).addImm(0)
.addImm(ARMCC::EQ).addReg(ARM::CPSR);
} else {
unsigned RHS1 = MI.getOperand(3).getReg();
unsigned RHS2 = MI.getOperand(4).getReg();
- AddDefaultPred(BuildMI(BB, dl,
- TII->get(isThumb2 ? ARM::t2CMPrr : ARM::CMPrr))
- .addReg(LHS1).addReg(RHS1));
+ BuildMI(BB, dl, TII->get(isThumb2 ? ARM::t2CMPrr : ARM::CMPrr))
+ .addReg(LHS1)
+ .addReg(RHS1)
+ .add(predOps(ARMCC::AL));
BuildMI(BB, dl, TII->get(isThumb2 ? ARM::t2CMPrr : ARM::CMPrr))
.addReg(LHS2).addReg(RHS2)
.addImm(ARMCC::EQ).addReg(ARM::CPSR);
@@ -8779,7 +8951,9 @@ ARMTargetLowering::EmitInstrWithCustomInserter(MachineInstr &MI,
BuildMI(BB, dl, TII->get(isThumb2 ? ARM::t2Bcc : ARM::Bcc))
.addMBB(destMBB).addImm(ARMCC::EQ).addReg(ARM::CPSR);
if (isThumb2)
- AddDefaultPred(BuildMI(BB, dl, TII->get(ARM::t2B)).addMBB(exitMBB));
+ BuildMI(BB, dl, TII->get(ARM::t2B))
+ .addMBB(exitMBB)
+ .add(predOps(ARMCC::AL));
else
BuildMI(BB, dl, TII->get(ARM::B)) .addMBB(exitMBB);
@@ -8842,9 +9016,10 @@ ARMTargetLowering::EmitInstrWithCustomInserter(MachineInstr &MI,
RSBBB->addSuccessor(SinkBB);
// insert a cmp at the end of BB
- AddDefaultPred(BuildMI(BB, dl,
- TII->get(isThumb2 ? ARM::t2CMPri : ARM::CMPri))
- .addReg(ABSSrcReg).addImm(0));
+ BuildMI(BB, dl, TII->get(isThumb2 ? ARM::t2CMPri : ARM::CMPri))
+ .addReg(ABSSrcReg)
+ .addImm(0)
+ .add(predOps(ARMCC::AL));
// insert a bcc with opposite CC to ARMCC::MI at the end of BB
BuildMI(BB, dl,
@@ -8855,9 +9030,11 @@ ARMTargetLowering::EmitInstrWithCustomInserter(MachineInstr &MI,
// Note: BCC and rsbri will be converted into predicated rsbmi
// by if-conversion pass
BuildMI(*RSBBB, RSBBB->begin(), dl,
- TII->get(isThumb2 ? ARM::t2RSBri : ARM::RSBri), NewRsbDstReg)
- .addReg(ABSSrcReg, ABSSrcKIll ? RegState::Kill : 0)
- .addImm(0).addImm((unsigned)ARMCC::AL).addReg(0).addReg(0);
+ TII->get(isThumb2 ? ARM::t2RSBri : ARM::RSBri), NewRsbDstReg)
+ .addReg(ABSSrcReg, ABSSrcKIll ? RegState::Kill : 0)
+ .addImm(0)
+ .add(predOps(ARMCC::AL))
+ .add(condCodeOp());
// insert PHI in SinkBB,
// reuse ABSDstReg to not change uses of ABS instruction
@@ -8927,19 +9104,45 @@ void ARMTargetLowering::AdjustInstrPostInstrSelection(MachineInstr &MI,
// Rename pseudo opcodes.
unsigned NewOpc = convertAddSubFlagsOpcode(MI.getOpcode());
+ unsigned ccOutIdx;
if (NewOpc) {
const ARMBaseInstrInfo *TII = Subtarget->getInstrInfo();
MCID = &TII->get(NewOpc);
- assert(MCID->getNumOperands() == MI.getDesc().getNumOperands() + 1 &&
- "converted opcode should be the same except for cc_out");
+    assert(MCID->getNumOperands() ==
+           MI.getDesc().getNumOperands() + 5 - MI.getDesc().getSize() &&
+           "converted opcode should be the same except for cc_out"
+           " (and, on Thumb1, pred)");
MI.setDesc(*MCID);
// Add the optional cc_out operand
MI.addOperand(MachineOperand::CreateReg(0, /*isDef=*/true));
- }
- unsigned ccOutIdx = MCID->getNumOperands() - 1;
+
+ // On Thumb1, move all input operands to the end, then add the predicate
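+    // For example (a sketch for a generic three-address instruction): after
+    // the append above the operands are (Rd, Rn, Rm, cc_out); the rotation
+    // below yields (Rd, cc_out, Rn, Rm), and the predicate operands are then
+    // appended, giving (Rd, cc_out, Rn, Rm, pred:imm, pred:reg).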
+ if (Subtarget->isThumb1Only()) {
+ for (unsigned c = MCID->getNumOperands() - 4; c--;) {
+ MI.addOperand(MI.getOperand(1));
+ MI.RemoveOperand(1);
+ }
+
+ // Restore the ties
+ for (unsigned i = MI.getNumOperands(); i--;) {
+ const MachineOperand& op = MI.getOperand(i);
+ if (op.isReg() && op.isUse()) {
+ int DefIdx = MCID->getOperandConstraint(i, MCOI::TIED_TO);
+ if (DefIdx != -1)
+ MI.tieOperands(DefIdx, i);
+ }
+ }
+
+ MI.addOperand(MachineOperand::CreateImm(ARMCC::AL));
+ MI.addOperand(MachineOperand::CreateReg(0, /*isDef=*/false));
+ ccOutIdx = 1;
+ } else
+ ccOutIdx = MCID->getNumOperands() - 1;
+ } else
+ ccOutIdx = MCID->getNumOperands() - 1;
// Any ARM instruction that sets the 's' bit should specify an optional
// "cc_out" operand in the last operand position.
@@ -8970,7 +9173,9 @@ void ARMTargetLowering::AdjustInstrPostInstrSelection(MachineInstr &MI,
if (deadCPSR) {
assert(!MI.getOperand(ccOutIdx).getReg() &&
"expect uninitialized optional cc_out operand");
- return;
+ // Thumb1 instructions must have the S bit even if the CPSR is dead.
+ if (!Subtarget->isThumb1Only())
+ return;
}
// If this instruction was defined with an optional CPSR def and its dag node
@@ -9032,7 +9237,7 @@ static bool isConditionalZeroOrAllOnes(SDNode *N, bool AllOnes,
SDLoc dl(N);
EVT VT = N->getValueType(0);
CC = N->getOperand(0);
- if (CC.getValueType() != MVT::i1)
+ if (CC.getValueType() != MVT::i1 || CC.getOpcode() != ISD::SETCC)
return false;
Invert = !AllOnes;
if (AllOnes)
@@ -9308,10 +9513,90 @@ static SDValue findMUL_LOHI(SDValue V) {
return SDValue();
}
-static SDValue AddCombineTo64bitMLAL(SDNode *AddcNode,
+static SDValue AddCombineTo64BitSMLAL16(SDNode *AddcNode, SDNode *AddeNode,
+ TargetLowering::DAGCombinerInfo &DCI,
+ const ARMSubtarget *Subtarget) {
+
+ if (Subtarget->isThumb()) {
+ if (!Subtarget->hasDSP())
+ return SDValue();
+ } else if (!Subtarget->hasV5TEOps())
+ return SDValue();
+
+  // SMLALBB, SMLALBT, SMLALTB and SMLALTT multiply two 16-bit values and
+  // accumulate the product into a 64-bit value. The 16-bit values will
+  // have been sign-extended or SRA'd into 32-bit values:
+ // (addc (adde (mul 16bit, 16bit), lo), hi)
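+  // The exact shape matched below is (a sketch):
+  //   addc = ARMISD::ADDC (mul a, b), lo
+  //   adde = ARMISD::ADDE (sra (mul a, b), 31), hi, addc:1
+  // i.e. hi:lo plus the sign-extended 32-bit product of two 16-bit values.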
+ SDValue Mul = AddcNode->getOperand(0);
+ SDValue Lo = AddcNode->getOperand(1);
+ if (Mul.getOpcode() != ISD::MUL) {
+ Lo = AddcNode->getOperand(0);
+ Mul = AddcNode->getOperand(1);
+ if (Mul.getOpcode() != ISD::MUL)
+ return SDValue();
+ }
+
+ SDValue SRA = AddeNode->getOperand(0);
+ SDValue Hi = AddeNode->getOperand(1);
+ if (SRA.getOpcode() != ISD::SRA) {
+ SRA = AddeNode->getOperand(1);
+ Hi = AddeNode->getOperand(0);
+ if (SRA.getOpcode() != ISD::SRA)
+ return SDValue();
+ }
+ if (auto Const = dyn_cast<ConstantSDNode>(SRA.getOperand(1))) {
+ if (Const->getZExtValue() != 31)
+ return SDValue();
+ } else
+ return SDValue();
+
+ if (SRA.getOperand(0) != Mul)
+ return SDValue();
+
+ SelectionDAG &DAG = DCI.DAG;
+ SDLoc dl(AddcNode);
+ unsigned Opcode = 0;
+ SDValue Op0;
+ SDValue Op1;
+
+ if (isS16(Mul.getOperand(0), DAG) && isS16(Mul.getOperand(1), DAG)) {
+ Opcode = ARMISD::SMLALBB;
+ Op0 = Mul.getOperand(0);
+ Op1 = Mul.getOperand(1);
+ } else if (isS16(Mul.getOperand(0), DAG) && isSRA16(Mul.getOperand(1))) {
+ Opcode = ARMISD::SMLALBT;
+ Op0 = Mul.getOperand(0);
+ Op1 = Mul.getOperand(1).getOperand(0);
+ } else if (isSRA16(Mul.getOperand(0)) && isS16(Mul.getOperand(1), DAG)) {
+ Opcode = ARMISD::SMLALTB;
+ Op0 = Mul.getOperand(0).getOperand(0);
+ Op1 = Mul.getOperand(1);
+ } else if (isSRA16(Mul.getOperand(0)) && isSRA16(Mul.getOperand(1))) {
+ Opcode = ARMISD::SMLALTT;
+ Op0 = Mul->getOperand(0).getOperand(0);
+ Op1 = Mul->getOperand(1).getOperand(0);
+ }
+
+ if (!Op0 || !Op1)
+ return SDValue();
+
+ SDValue SMLAL = DAG.getNode(Opcode, dl, DAG.getVTList(MVT::i32, MVT::i32),
+ Op0, Op1, Lo, Hi);
+  // Replace the ADD nodes' uses with the SMLAL node's values.
+ SDValue HiMLALResult(SMLAL.getNode(), 1);
+ SDValue LoMLALResult(SMLAL.getNode(), 0);
+
+ DAG.ReplaceAllUsesOfValueWith(SDValue(AddcNode, 0), LoMLALResult);
+ DAG.ReplaceAllUsesOfValueWith(SDValue(AddeNode, 0), HiMLALResult);
+
+ // Return original node to notify the driver to stop replacing.
+ SDValue resNode(AddcNode, 0);
+ return resNode;
+}
+
+static SDValue AddCombineTo64bitMLAL(SDNode *AddeNode,
TargetLowering::DAGCombinerInfo &DCI,
const ARMSubtarget *Subtarget) {
-
// Look for multiply add opportunities.
// The pattern is a ISD::UMUL_LOHI followed by two add nodes, where
// each add nodes consumes a value from ISD::UMUL_LOHI and there is
@@ -9326,7 +9611,17 @@ static SDValue AddCombineTo64bitMLAL(SDNode *AddcNode,
// \ /
// ADDC <- hiAdd
//
- assert(AddcNode->getOpcode() == ISD::ADDC && "Expect an ADDC");
+ assert(AddeNode->getOpcode() == ARMISD::ADDE && "Expect an ADDE");
+
+ assert(AddeNode->getNumOperands() == 3 &&
+ AddeNode->getOperand(2).getValueType() == MVT::i32 &&
+ "ADDE node has the wrong inputs");
+
+  // Check that the ADDE's carry-in comes from an ARMISD::ADDC node.
+ SDNode* AddcNode = AddeNode->getOperand(2).getNode();
+ if (AddcNode->getOpcode() != ARMISD::ADDC)
+ return SDValue();
+
SDValue AddcOp0 = AddcNode->getOperand(0);
SDValue AddcOp1 = AddcNode->getOperand(1);
@@ -9338,29 +9633,13 @@ static SDValue AddCombineTo64bitMLAL(SDNode *AddcNode,
AddcNode->getValueType(0) == MVT::i32 &&
"Expect ADDC with two result values. First: i32");
- // Check that we have a glued ADDC node.
- if (AddcNode->getValueType(1) != MVT::Glue)
- return SDValue();
-
- // Check that the ADDC adds the low result of the S/UMUL_LOHI.
+  // Check that the ADDC adds the low result of the S/UMUL_LOHI. If not, it
+  // may be an SMLAL which multiplies two 16-bit values.
if (AddcOp0->getOpcode() != ISD::UMUL_LOHI &&
AddcOp0->getOpcode() != ISD::SMUL_LOHI &&
AddcOp1->getOpcode() != ISD::UMUL_LOHI &&
AddcOp1->getOpcode() != ISD::SMUL_LOHI)
- return SDValue();
-
- // Look for the glued ADDE.
- SDNode* AddeNode = AddcNode->getGluedUser();
- if (!AddeNode)
- return SDValue();
-
- // Make sure it is really an ADDE.
- if (AddeNode->getOpcode() != ISD::ADDE)
- return SDValue();
-
- assert(AddeNode->getNumOperands() == 3 &&
- AddeNode->getOperand(2).getValueType() == MVT::Glue &&
- "ADDE node has the wrong inputs");
+ return AddCombineTo64BitSMLAL16(AddcNode, AddeNode, DCI, Subtarget);
// Check for the triangle shape.
SDValue AddeOp0 = AddeNode->getOperand(0);
@@ -9435,38 +9714,25 @@ static SDValue AddCombineTo64bitMLAL(SDNode *AddcNode,
DAG.ReplaceAllUsesOfValueWith(SDValue(AddcNode, 0), LoMLALResult);
// Return original node to notify the driver to stop replacing.
- SDValue resNode(AddcNode, 0);
- return resNode;
+ return SDValue(AddeNode, 0);
}
-static SDValue AddCombineTo64bitUMAAL(SDNode *AddcNode,
+static SDValue AddCombineTo64bitUMAAL(SDNode *AddeNode,
TargetLowering::DAGCombinerInfo &DCI,
const ARMSubtarget *Subtarget) {
// UMAAL is similar to UMLAL except that it adds two unsigned values.
// While trying to combine for the other MLAL nodes, first search for the
- // chance to use UMAAL. Check if Addc uses another addc node which can first
- // be combined into a UMLAL. The other pattern is AddcNode being combined
- // into an UMLAL and then using another addc is handled in ISelDAGToDAG.
-
- if (!Subtarget->hasV6Ops() || !Subtarget->hasDSP() ||
- (Subtarget->isThumb() && !Subtarget->hasThumb2()))
- return AddCombineTo64bitMLAL(AddcNode, DCI, Subtarget);
-
- SDNode *PrevAddc = nullptr;
- if (AddcNode->getOperand(0).getOpcode() == ISD::ADDC)
- PrevAddc = AddcNode->getOperand(0).getNode();
- else if (AddcNode->getOperand(1).getOpcode() == ISD::ADDC)
- PrevAddc = AddcNode->getOperand(1).getNode();
-
- // If there's no addc chains, just return a search for any MLAL.
- if (PrevAddc == nullptr)
- return AddCombineTo64bitMLAL(AddcNode, DCI, Subtarget);
-
- // Try to convert the addc operand to an MLAL and if that fails try to
- // combine AddcNode.
- SDValue MLAL = AddCombineTo64bitMLAL(PrevAddc, DCI, Subtarget);
- if (MLAL != SDValue(PrevAddc, 0))
- return AddCombineTo64bitMLAL(AddcNode, DCI, Subtarget);
+ // chance to use UMAAL. Check if Addc uses a node which has already
+ // been combined into a UMLAL. The other pattern is UMLAL using Addc/Adde
+ // as the addend, and it's handled in PerformUMLALCombine.
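+  //
+  // Shape being matched here (a sketch; the UMLAL's hi accumulator input
+  // must be zero):
+  //   umlal = ARMISD::UMLAL a, b, lo, 0        // 64-bit a*b + lo
+  //   addc  = ARMISD::ADDC  umlal:0, addhi
+  //   adde  = ARMISD::ADDE  umlal:1, 0, addc:1
+  // which folds to ARMISD::UMAAL a, b, lo, addhi (= a*b + lo + addhi).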
+
+ if (!Subtarget->hasV6Ops() || !Subtarget->hasDSP())
+ return AddCombineTo64bitMLAL(AddeNode, DCI, Subtarget);
+
+  // Check that the ADDE's carry-in comes from an ARMISD::ADDC node.
+ SDNode* AddcNode = AddeNode->getOperand(2).getNode();
+ if (AddcNode->getOpcode() != ARMISD::ADDC)
+ return SDValue();
// Find the converted UMAAL or quit if it doesn't exist.
SDNode *UmlalNode = nullptr;
@@ -9478,29 +9744,18 @@ static SDValue AddCombineTo64bitUMAAL(SDNode *AddcNode,
UmlalNode = AddcNode->getOperand(1).getNode();
AddHi = AddcNode->getOperand(0);
} else {
- return SDValue();
+ return AddCombineTo64bitMLAL(AddeNode, DCI, Subtarget);
}
// The ADDC should be glued to an ADDE node, which uses the same UMLAL as
// the ADDC as well as Zero.
- auto *Zero = dyn_cast<ConstantSDNode>(UmlalNode->getOperand(3));
-
- if (!Zero || Zero->getZExtValue() != 0)
+ if (!isNullConstant(UmlalNode->getOperand(3)))
return SDValue();
- // Check that we have a glued ADDC node.
- if (AddcNode->getValueType(1) != MVT::Glue)
- return SDValue();
-
- // Look for the glued ADDE.
- SDNode* AddeNode = AddcNode->getGluedUser();
- if (!AddeNode)
- return SDValue();
-
- if ((AddeNode->getOperand(0).getNode() == Zero &&
+ if ((isNullConstant(AddeNode->getOperand(0)) &&
AddeNode->getOperand(1).getNode() == UmlalNode) ||
(AddeNode->getOperand(0).getNode() == UmlalNode &&
- AddeNode->getOperand(1).getNode() == Zero)) {
+ isNullConstant(AddeNode->getOperand(1)))) {
SelectionDAG &DAG = DCI.DAG;
SDValue Ops[] = { UmlalNode->getOperand(0), UmlalNode->getOperand(1),
@@ -9513,19 +9768,84 @@ static SDValue AddCombineTo64bitUMAAL(SDNode *AddcNode,
DAG.ReplaceAllUsesOfValueWith(SDValue(AddcNode, 0), SDValue(UMAAL.getNode(), 0));
// Return original node to notify the driver to stop replacing.
- return SDValue(AddcNode, 0);
+ return SDValue(AddeNode, 0);
}
return SDValue();
}
-/// PerformADDCCombine - Target-specific dag combine transform from
-/// ISD::ADDC, ISD::ADDE, and ISD::MUL_LOHI to MLAL or
-/// ISD::ADDC, ISD::ADDE and ARMISD::UMLAL to ARMISD::UMAAL
-static SDValue PerformADDCCombine(SDNode *N,
- TargetLowering::DAGCombinerInfo &DCI,
- const ARMSubtarget *Subtarget) {
+static SDValue PerformUMLALCombine(SDNode *N, SelectionDAG &DAG,
+ const ARMSubtarget *Subtarget) {
+ if (!Subtarget->hasV6Ops() || !Subtarget->hasDSP())
+ return SDValue();
+
+ // Check that we have a pair of ADDC and ADDE as operands.
+ // Both addends of the ADDE must be zero.
+ SDNode* AddcNode = N->getOperand(2).getNode();
+ SDNode* AddeNode = N->getOperand(3).getNode();
+ if ((AddcNode->getOpcode() == ARMISD::ADDC) &&
+ (AddeNode->getOpcode() == ARMISD::ADDE) &&
+ isNullConstant(AddeNode->getOperand(0)) &&
+ isNullConstant(AddeNode->getOperand(1)) &&
+ (AddeNode->getOperand(2).getNode() == AddcNode))
+ return DAG.getNode(ARMISD::UMAAL, SDLoc(N),
+ DAG.getVTList(MVT::i32, MVT::i32),
+ {N->getOperand(0), N->getOperand(1),
+ AddcNode->getOperand(0), AddcNode->getOperand(1)});
+ else
+ return SDValue();
+}
+
+static SDValue PerformAddcSubcCombine(SDNode *N, SelectionDAG &DAG,
+ const ARMSubtarget *Subtarget) {
+ if (Subtarget->isThumb1Only()) {
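+    // Thumb1 add/sub immediates are unsigned, so an addition of a negative
+    // constant is only selectable as a subtraction of its magnitude, e.g.
+    // (a sketch): (ARMISD::ADDC x, -5) --> (ARMISD::SUBC x, 5).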
+ SDValue RHS = N->getOperand(1);
+ if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(RHS)) {
+ int32_t imm = C->getSExtValue();
+ if (imm < 0 && imm > INT_MIN) {
+ SDLoc DL(N);
+ RHS = DAG.getConstant(-imm, DL, MVT::i32);
+ unsigned Opcode = (N->getOpcode() == ARMISD::ADDC) ? ARMISD::SUBC
+ : ARMISD::ADDC;
+ return DAG.getNode(Opcode, DL, N->getVTList(), N->getOperand(0), RHS);
+ }
+ }
+ }
+ return SDValue();
+}
- if (Subtarget->isThumb1Only()) return SDValue();
+static SDValue PerformAddeSubeCombine(SDNode *N, SelectionDAG &DAG,
+ const ARMSubtarget *Subtarget) {
+ if (Subtarget->isThumb1Only()) {
+ SDValue RHS = N->getOperand(1);
+ if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(RHS)) {
+ int64_t imm = C->getSExtValue();
+ if (imm < 0) {
+ SDLoc DL(N);
+
+ // The with-carry-in form matches bitwise not instead of the negation.
+ // Effectively, the inverse interpretation of the carry flag already
+ // accounts for part of the negation.
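+      //
+      // Worked sketch with imm = -5: adde(x, -5, c) = x - 5 + c, while
+      // sube(x, ~(-5), c) = x - 4 - (1 - c) = x - 5 + c, so replacing ADDE
+      // with SUBE on the complemented constant preserves the value.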
+ RHS = DAG.getConstant(~imm, DL, MVT::i32);
+
+ unsigned Opcode = (N->getOpcode() == ARMISD::ADDE) ? ARMISD::SUBE
+ : ARMISD::ADDE;
+ return DAG.getNode(Opcode, DL, N->getVTList(),
+ N->getOperand(0), RHS, N->getOperand(2));
+ }
+ }
+ }
+ return SDValue();
+}
+
+/// PerformADDECombine - Target-specific dag combine transform from
+/// ARMISD::ADDC, ARMISD::ADDE, and ISD::MUL_LOHI to MLAL or
+/// ARMISD::ADDC, ARMISD::ADDE and ARMISD::UMLAL to ARMISD::UMAAL
+static SDValue PerformADDECombine(SDNode *N,
+ TargetLowering::DAGCombinerInfo &DCI,
+ const ARMSubtarget *Subtarget) {
+ // Only ARM and Thumb2 support UMLAL/SMLAL.
+ if (Subtarget->isThumb1Only())
+ return PerformAddeSubeCombine(N, DCI.DAG, Subtarget);
// Only perform the checks after legalize when the pattern is available.
if (DCI.isBeforeLegalize()) return SDValue();
@@ -9722,7 +10042,6 @@ static SDValue PerformMULCombine(SDNode *N,
static SDValue PerformANDCombine(SDNode *N,
TargetLowering::DAGCombinerInfo &DCI,
const ARMSubtarget *Subtarget) {
-
// Attempt to use immediate-form VBIC
BuildVectorSDNode *BVN = dyn_cast<BuildVectorSDNode>(N->getOperand(1));
SDLoc dl(N);
@@ -9761,6 +10080,67 @@ static SDValue PerformANDCombine(SDNode *N,
return SDValue();
}
+// Try combining OR nodes to SMULWB, SMULWT.
+static SDValue PerformORCombineToSMULWBT(SDNode *OR,
+ TargetLowering::DAGCombinerInfo &DCI,
+ const ARMSubtarget *Subtarget) {
+ if (!Subtarget->hasV6Ops() ||
+ (Subtarget->isThumb() &&
+ (!Subtarget->hasThumb2() || !Subtarget->hasDSP())))
+ return SDValue();
+
+ SDValue SRL = OR->getOperand(0);
+ SDValue SHL = OR->getOperand(1);
+
+ if (SRL.getOpcode() != ISD::SRL || SHL.getOpcode() != ISD::SHL) {
+ SRL = OR->getOperand(1);
+ SHL = OR->getOperand(0);
+ }
+ if (!isSRL16(SRL) || !isSHL16(SHL))
+ return SDValue();
+
+ // The first operands to the shifts need to be the two results from the
+ // same smul_lohi node.
+ if ((SRL.getOperand(0).getNode() != SHL.getOperand(0).getNode()) ||
+ SRL.getOperand(0).getOpcode() != ISD::SMUL_LOHI)
+ return SDValue();
+
+ SDNode *SMULLOHI = SRL.getOperand(0).getNode();
+ if (SRL.getOperand(0) != SDValue(SMULLOHI, 0) ||
+ SHL.getOperand(0) != SDValue(SMULLOHI, 1))
+ return SDValue();
+
+ // Now we have:
+  // (or (srl (smul_lohi ?, ?), 16), (shl (smul_lohi ?, ?), 16))
+  // For SMULW[B|T], smul_lohi takes a 32-bit and a 16-bit argument.
+  // For SMULWB the 16-bit value must be sign-extended.
+ // For SMULWT only the SRA is required.
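+  //
+  // E.g. (a sketch): srl exposes bits [31:16] of the low product word and
+  // shl positions bits [47:32] above them, so the OR yields bits [47:16]
+  // of the full product, i.e. (a * sext(b)) >> 16 == SMULWB a, b.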
+ // Check both sides of SMUL_LOHI
+ SDValue OpS16 = SMULLOHI->getOperand(0);
+ SDValue OpS32 = SMULLOHI->getOperand(1);
+
+ SelectionDAG &DAG = DCI.DAG;
+ if (!isS16(OpS16, DAG) && !isSRA16(OpS16)) {
+ OpS16 = OpS32;
+ OpS32 = SMULLOHI->getOperand(0);
+ }
+
+ SDLoc dl(OR);
+ unsigned Opcode = 0;
+ if (isS16(OpS16, DAG))
+ Opcode = ARMISD::SMULWB;
+ else if (isSRA16(OpS16)) {
+ Opcode = ARMISD::SMULWT;
+ OpS16 = OpS16->getOperand(0);
+  } else
+    return SDValue();
+
+ SDValue Res = DAG.getNode(Opcode, dl, MVT::i32, OpS32, OpS16);
+ DAG.ReplaceAllUsesOfValueWith(SDValue(OR, 0), Res);
+ return SDValue(OR, 0);
+}
+
/// PerformORCombine - Target-specific dag combine xforms for ISD::OR
static SDValue PerformORCombine(SDNode *N,
TargetLowering::DAGCombinerInfo &DCI,
@@ -9798,6 +10178,8 @@ static SDValue PerformORCombine(SDNode *N,
// fold (or (select cc, 0, c), x) -> (select cc, x, (or, x, c))
if (SDValue Result = combineSelectAndUseCommutative(N, false, DCI))
return Result;
+ if (SDValue Result = PerformORCombineToSMULWBT(N, DCI, Subtarget))
+ return Result;
}
// The code below optimizes (or (and X, Y), Z).
@@ -9906,7 +10288,7 @@ static SDValue PerformORCombine(SDNode *N,
(Mask == ~Mask2)) {
// The pack halfword instruction works better for masks that fit it,
// so use that when it's available.
- if (Subtarget->hasT2ExtractPack() &&
+ if (Subtarget->hasDSP() &&
(Mask == 0xffff || Mask == 0xffff0000))
return SDValue();
// 2a
@@ -9922,7 +10304,7 @@ static SDValue PerformORCombine(SDNode *N,
(~Mask == Mask2)) {
// The pack halfword instruction works better for masks that fit it,
// so use that when it's available.
- if (Subtarget->hasT2ExtractPack() &&
+ if (Subtarget->hasDSP() &&
(Mask2 == 0xffff || Mask2 == 0xffff0000))
return SDValue();
// 2b
@@ -11324,8 +11706,8 @@ static void computeKnownBits(SelectionDAG &DAG, SDValue Op, APInt &KnownZero,
if (Op.getOpcode() == ARMISD::CMOV) {
APInt KZ2(KnownZero.getBitWidth(), 0);
APInt KO2(KnownOne.getBitWidth(), 0);
- computeKnownBits(DAG, Op.getOperand(1), KnownZero, KnownOne);
- computeKnownBits(DAG, Op.getOperand(2), KZ2, KO2);
+ computeKnownBits(DAG, Op.getOperand(0), KnownZero, KnownOne);
+ computeKnownBits(DAG, Op.getOperand(1), KZ2, KO2);
KnownZero &= KZ2;
KnownOne &= KO2;
@@ -11555,13 +11937,17 @@ SDValue ARMTargetLowering::PerformDAGCombine(SDNode *N,
DAGCombinerInfo &DCI) const {
switch (N->getOpcode()) {
default: break;
- case ISD::ADDC: return PerformADDCCombine(N, DCI, Subtarget);
+ case ARMISD::ADDE: return PerformADDECombine(N, DCI, Subtarget);
+ case ARMISD::UMLAL: return PerformUMLALCombine(N, DCI.DAG, Subtarget);
case ISD::ADD: return PerformADDCombine(N, DCI, Subtarget);
case ISD::SUB: return PerformSUBCombine(N, DCI);
case ISD::MUL: return PerformMULCombine(N, DCI, Subtarget);
case ISD::OR: return PerformORCombine(N, DCI, Subtarget);
case ISD::XOR: return PerformXORCombine(N, DCI, Subtarget);
case ISD::AND: return PerformANDCombine(N, DCI, Subtarget);
+ case ARMISD::ADDC:
+ case ARMISD::SUBC: return PerformAddcSubcCombine(N, DCI.DAG, Subtarget);
+ case ARMISD::SUBE: return PerformAddeSubeCombine(N, DCI.DAG, Subtarget);
case ARMISD::BFI: return PerformBFICombine(N, DCI);
case ARMISD::VMOVRRD: return PerformVMOVRRDCombine(N, DCI, Subtarget);
case ARMISD::VMOVDRR: return PerformVMOVDRRCombine(N, DCI.DAG);
@@ -11593,6 +11979,56 @@ SDValue ARMTargetLowering::PerformDAGCombine(SDNode *N,
return PerformVLDCombine(N, DCI);
case ARMISD::BUILD_VECTOR:
return PerformARMBUILD_VECTORCombine(N, DCI);
+ case ARMISD::SMULWB: {
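+    // SMULWB only reads the bottom 16 bits of its second operand, so the
+    // upper bits are not demanded there; e.g. an (and x, 0xFFFF) feeding
+    // that operand can be dropped (an illustrative note).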
+ unsigned BitWidth = N->getValueType(0).getSizeInBits();
+ APInt DemandedMask = APInt::getLowBitsSet(BitWidth, 16);
+ if (SimplifyDemandedBits(N->getOperand(1), DemandedMask, DCI))
+ return SDValue();
+ break;
+ }
+ case ARMISD::SMULWT: {
+ unsigned BitWidth = N->getValueType(0).getSizeInBits();
+ APInt DemandedMask = APInt::getHighBitsSet(BitWidth, 16);
+ if (SimplifyDemandedBits(N->getOperand(1), DemandedMask, DCI))
+ return SDValue();
+ break;
+ }
+ case ARMISD::SMLALBB: {
+ unsigned BitWidth = N->getValueType(0).getSizeInBits();
+ APInt DemandedMask = APInt::getLowBitsSet(BitWidth, 16);
+ if ((SimplifyDemandedBits(N->getOperand(0), DemandedMask, DCI)) ||
+ (SimplifyDemandedBits(N->getOperand(1), DemandedMask, DCI)))
+ return SDValue();
+ break;
+ }
+ case ARMISD::SMLALBT: {
+ unsigned LowWidth = N->getOperand(0).getValueType().getSizeInBits();
+ APInt LowMask = APInt::getLowBitsSet(LowWidth, 16);
+ unsigned HighWidth = N->getOperand(1).getValueType().getSizeInBits();
+ APInt HighMask = APInt::getHighBitsSet(HighWidth, 16);
+ if ((SimplifyDemandedBits(N->getOperand(0), LowMask, DCI)) ||
+ (SimplifyDemandedBits(N->getOperand(1), HighMask, DCI)))
+ return SDValue();
+ break;
+ }
+ case ARMISD::SMLALTB: {
+ unsigned HighWidth = N->getOperand(0).getValueType().getSizeInBits();
+ APInt HighMask = APInt::getHighBitsSet(HighWidth, 16);
+ unsigned LowWidth = N->getOperand(1).getValueType().getSizeInBits();
+ APInt LowMask = APInt::getLowBitsSet(LowWidth, 16);
+ if ((SimplifyDemandedBits(N->getOperand(0), HighMask, DCI)) ||
+ (SimplifyDemandedBits(N->getOperand(1), LowMask, DCI)))
+ return SDValue();
+ break;
+ }
+ case ARMISD::SMLALTT: {
+ unsigned BitWidth = N->getValueType(0).getSizeInBits();
+ APInt DemandedMask = APInt::getHighBitsSet(BitWidth, 16);
+ if ((SimplifyDemandedBits(N->getOperand(0), DemandedMask, DCI)) ||
+ (SimplifyDemandedBits(N->getOperand(1), DemandedMask, DCI)))
+ return SDValue();
+ break;
+ }
case ISD::INTRINSIC_VOID:
case ISD::INTRINSIC_W_CHAIN:
switch (cast<ConstantSDNode>(N->getOperand(1))->getZExtValue()) {
@@ -12180,6 +12616,7 @@ bool ARMTargetLowering::getPostIndexedAddressParts(SDNode *N, SDNode *Op,
void ARMTargetLowering::computeKnownBitsForTargetNode(const SDValue Op,
APInt &KnownZero,
APInt &KnownOne,
+ const APInt &DemandedElts,
const SelectionDAG &DAG,
unsigned Depth) const {
unsigned BitWidth = KnownOne.getBitWidth();
@@ -12588,8 +13025,8 @@ static TargetLowering::ArgListTy getDivRemArgList(
Type *ArgTy = ArgVT.getTypeForEVT(*Context);
Entry.Node = N->getOperand(i);
Entry.Ty = ArgTy;
- Entry.isSExt = isSigned;
- Entry.isZExt = !isSigned;
+ Entry.IsSExt = isSigned;
+ Entry.IsZExt = !isSigned;
Args.push_back(Entry);
}
if (Subtarget->isTargetWindows() && Args.size() >= 2)
@@ -12861,7 +13298,7 @@ bool ARMTargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info,
return true;
}
case Intrinsic::arm_stlexd:
- case Intrinsic::arm_strexd: {
+ case Intrinsic::arm_strexd:
Info.opc = ISD::INTRINSIC_W_CHAIN;
Info.memVT = MVT::i64;
Info.ptrVal = I.getArgOperand(2);
@@ -12871,9 +13308,9 @@ bool ARMTargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info,
Info.readMem = false;
Info.writeMem = true;
return true;
- }
+
case Intrinsic::arm_ldaexd:
- case Intrinsic::arm_ldrexd: {
+ case Intrinsic::arm_ldrexd:
Info.opc = ISD::INTRINSIC_W_CHAIN;
Info.memVT = MVT::i64;
Info.ptrVal = I.getArgOperand(0);
@@ -12883,7 +13320,7 @@ bool ARMTargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info,
Info.readMem = true;
Info.writeMem = false;
return true;
- }
+
default:
break;
}
@@ -12921,7 +13358,7 @@ Instruction* ARMTargetLowering::makeDMB(IRBuilder<> &Builder,
// Thumb1 and pre-v6 ARM mode use a libcall instead and should never get
// here.
if (Subtarget->hasV6Ops() && !Subtarget->isThumb()) {
- Function *MCR = llvm::Intrinsic::getDeclaration(M, Intrinsic::arm_mcr);
+ Function *MCR = Intrinsic::getDeclaration(M, Intrinsic::arm_mcr);
Value* args[6] = {Builder.getInt32(15), Builder.getInt32(0),
Builder.getInt32(0), Builder.getInt32(7),
Builder.getInt32(10), Builder.getInt32(5)};
@@ -12932,7 +13369,7 @@ Instruction* ARMTargetLowering::makeDMB(IRBuilder<> &Builder,
llvm_unreachable("makeDMB on a target so old that it has no barriers");
}
} else {
- Function *DMB = llvm::Intrinsic::getDeclaration(M, Intrinsic::arm_dmb);
+ Function *DMB = Intrinsic::getDeclaration(M, Intrinsic::arm_dmb);
// Only a full system barrier exists in the M-class architectures.
Domain = Subtarget->isMClass() ? ARM_MB::SY : Domain;
Constant *CDomain = Builder.getInt32(Domain);
@@ -13089,7 +13526,7 @@ Value *ARMTargetLowering::emitLoadLinked(IRBuilder<> &Builder, Value *Addr,
if (ValTy->getPrimitiveSizeInBits() == 64) {
Intrinsic::ID Int =
IsAcquire ? Intrinsic::arm_ldaexd : Intrinsic::arm_ldrexd;
- Function *Ldrex = llvm::Intrinsic::getDeclaration(M, Int);
+ Function *Ldrex = Intrinsic::getDeclaration(M, Int);
Addr = Builder.CreateBitCast(Addr, Type::getInt8PtrTy(M->getContext()));
Value *LoHi = Builder.CreateCall(Ldrex, Addr, "lohi");
@@ -13106,7 +13543,7 @@ Value *ARMTargetLowering::emitLoadLinked(IRBuilder<> &Builder, Value *Addr,
Type *Tys[] = { Addr->getType() };
Intrinsic::ID Int = IsAcquire ? Intrinsic::arm_ldaex : Intrinsic::arm_ldrex;
- Function *Ldrex = llvm::Intrinsic::getDeclaration(M, Int, Tys);
+ Function *Ldrex = Intrinsic::getDeclaration(M, Int, Tys);
return Builder.CreateTruncOrBitCast(
Builder.CreateCall(Ldrex, Addr),
@@ -13118,7 +13555,7 @@ void ARMTargetLowering::emitAtomicCmpXchgNoStoreLLBalance(
if (!Subtarget->hasV7Ops())
return;
Module *M = Builder.GetInsertBlock()->getParent()->getParent();
- Builder.CreateCall(llvm::Intrinsic::getDeclaration(M, Intrinsic::arm_clrex));
+ Builder.CreateCall(Intrinsic::getDeclaration(M, Intrinsic::arm_clrex));
}
Value *ARMTargetLowering::emitStoreConditional(IRBuilder<> &Builder, Value *Val,
@@ -13154,6 +13591,39 @@ Value *ARMTargetLowering::emitStoreConditional(IRBuilder<> &Builder, Value *Val,
Addr});
}
+/// A helper function for determining the number of interleaved accesses we
+/// will generate when lowering accesses of the given type.
+unsigned
+ARMTargetLowering::getNumInterleavedAccesses(VectorType *VecTy,
+ const DataLayout &DL) const {
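+  // Round up to the number of 128-bit chunks the vector occupies; e.g. a
+  // 256-bit <32 x i8> gives (256 + 127) / 128 = 2 accesses, and a 64-bit
+  // <8 x i8> gives 1 (a worked example of the formula below).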
+ return (DL.getTypeSizeInBits(VecTy) + 127) / 128;
+}
+
+bool ARMTargetLowering::isLegalInterleavedAccessType(
+ VectorType *VecTy, const DataLayout &DL) const {
+
+ unsigned VecSize = DL.getTypeSizeInBits(VecTy);
+ unsigned ElSize = DL.getTypeSizeInBits(VecTy->getElementType());
+
+ // Ensure the vector doesn't have f16 elements. Even though we could do an
+ // i16 vldN, we can't hold the f16 vectors and will end up converting via
+ // f32.
+ if (VecTy->getElementType()->isHalfTy())
+ return false;
+
+ // Ensure the number of vector elements is greater than 1.
+ if (VecTy->getNumElements() < 2)
+ return false;
+
+ // Ensure the element type is legal.
+ if (ElSize != 8 && ElSize != 16 && ElSize != 32)
+ return false;
+
+ // Ensure the total vector size is 64 or a multiple of 128. Types larger than
+ // 128 will be split into multiple interleaved accesses.
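+  // E.g. <8 x i8> (64 bits) and <32 x i8> (256 bits) pass this check,
+  // while <12 x i8> (96 bits) does not (an illustrative sketch).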
+ return VecSize == 64 || VecSize % 128 == 0;
+}
+
/// \brief Lower an interleaved load into a vldN intrinsic.
///
/// E.g. Lower an interleaved load (Factor = 2):
@@ -13178,64 +13648,97 @@ bool ARMTargetLowering::lowerInterleavedLoad(
Type *EltTy = VecTy->getVectorElementType();
const DataLayout &DL = LI->getModule()->getDataLayout();
- unsigned VecSize = DL.getTypeSizeInBits(VecTy);
- bool EltIs64Bits = DL.getTypeSizeInBits(EltTy) == 64;
- // Skip if we do not have NEON and skip illegal vector types and vector types
- // with i64/f64 elements (vldN doesn't support i64/f64 elements).
- if (!Subtarget->hasNEON() || (VecSize != 64 && VecSize != 128) || EltIs64Bits)
+  // Skip if we do not have NEON or if the vector type is illegal. We can
+  // "legalize" wide vector types into multiple interleaved accesses as long
+  // as the vector type size is divisible by 128 bits.
+ if (!Subtarget->hasNEON() || !isLegalInterleavedAccessType(VecTy, DL))
return false;
+ unsigned NumLoads = getNumInterleavedAccesses(VecTy, DL);
+
// A pointer vector can not be the return type of the ldN intrinsics. Need to
// load integer vectors first and then convert to pointer vectors.
if (EltTy->isPointerTy())
VecTy =
VectorType::get(DL.getIntPtrType(EltTy), VecTy->getVectorNumElements());
+ IRBuilder<> Builder(LI);
+
+ // The base address of the load.
+ Value *BaseAddr = LI->getPointerOperand();
+
+ if (NumLoads > 1) {
+ // If we're going to generate more than one load, reset the sub-vector type
+ // to something legal.
+ VecTy = VectorType::get(VecTy->getVectorElementType(),
+ VecTy->getVectorNumElements() / NumLoads);
+
+ // We will compute the pointer operand of each load from the original base
+ // address using GEPs. Cast the base address to a pointer to the scalar
+ // element type.
+ BaseAddr = Builder.CreateBitCast(
+ BaseAddr, VecTy->getVectorElementType()->getPointerTo(
+ LI->getPointerAddressSpace()));
+ }
+
+ assert(isTypeLegal(EVT::getEVT(VecTy)) && "Illegal vldN vector type!");
+
+ Type *Int8Ptr = Builder.getInt8PtrTy(LI->getPointerAddressSpace());
+ Type *Tys[] = {VecTy, Int8Ptr};
static const Intrinsic::ID LoadInts[3] = {Intrinsic::arm_neon_vld2,
Intrinsic::arm_neon_vld3,
Intrinsic::arm_neon_vld4};
+ Function *VldnFunc =
+ Intrinsic::getDeclaration(LI->getModule(), LoadInts[Factor - 2], Tys);
- IRBuilder<> Builder(LI);
- SmallVector<Value *, 2> Ops;
+ // Holds sub-vectors extracted from the load intrinsic return values. The
+ // sub-vectors are associated with the shufflevector instructions they will
+ // replace.
+ DenseMap<ShuffleVectorInst *, SmallVector<Value *, 4>> SubVecs;
- Type *Int8Ptr = Builder.getInt8PtrTy(LI->getPointerAddressSpace());
- Ops.push_back(Builder.CreateBitCast(LI->getPointerOperand(), Int8Ptr));
- Ops.push_back(Builder.getInt32(LI->getAlignment()));
+ for (unsigned LoadCount = 0; LoadCount < NumLoads; ++LoadCount) {
- Type *Tys[] = { VecTy, Int8Ptr };
- Function *VldnFunc =
- Intrinsic::getDeclaration(LI->getModule(), LoadInts[Factor - 2], Tys);
- CallInst *VldN = Builder.CreateCall(VldnFunc, Ops, "vldN");
+ // If we're generating more than one load, compute the base address of
+ // subsequent loads as an offset from the previous.
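+    // E.g. when a <32 x i8>, factor-2 load is split in two, each vld2
+    // covers 16 * 2 = 32 i8 elements, so the second load begins 32 elements
+    // past the base (a worked example; VecTy is the legalized sub-vector
+    // type here).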
+ if (LoadCount > 0)
+ BaseAddr = Builder.CreateConstGEP1_32(
+ BaseAddr, VecTy->getVectorNumElements() * Factor);
- // Replace uses of each shufflevector with the corresponding vector loaded
- // by ldN.
- for (unsigned i = 0; i < Shuffles.size(); i++) {
- ShuffleVectorInst *SV = Shuffles[i];
- unsigned Index = Indices[i];
+ SmallVector<Value *, 2> Ops;
+ Ops.push_back(Builder.CreateBitCast(BaseAddr, Int8Ptr));
+ Ops.push_back(Builder.getInt32(LI->getAlignment()));
- Value *SubVec = Builder.CreateExtractValue(VldN, Index);
+ CallInst *VldN = Builder.CreateCall(VldnFunc, Ops, "vldN");
- // Convert the integer vector to pointer vector if the element is pointer.
- if (EltTy->isPointerTy())
- SubVec = Builder.CreateIntToPtr(SubVec, SV->getType());
+ // Replace uses of each shufflevector with the corresponding vector loaded
+ // by ldN.
+ for (unsigned i = 0; i < Shuffles.size(); i++) {
+ ShuffleVectorInst *SV = Shuffles[i];
+ unsigned Index = Indices[i];
- SV->replaceAllUsesWith(SubVec);
- }
+ Value *SubVec = Builder.CreateExtractValue(VldN, Index);
- return true;
-}
+ // Convert the integer vector to pointer vector if the element is pointer.
+ if (EltTy->isPointerTy())
+ SubVec = Builder.CreateIntToPtr(SubVec, SV->getType());
-/// \brief Get a mask consisting of sequential integers starting from \p Start.
-///
-/// I.e. <Start, Start + 1, ..., Start + NumElts - 1>
-static Constant *getSequentialMask(IRBuilder<> &Builder, unsigned Start,
- unsigned NumElts) {
- SmallVector<Constant *, 16> Mask;
- for (unsigned i = 0; i < NumElts; i++)
- Mask.push_back(Builder.getInt32(Start + i));
+ SubVecs[SV].push_back(SubVec);
+ }
+ }
+
+ // Replace uses of the shufflevector instructions with the sub-vectors
+ // returned by the load intrinsic. If a shufflevector instruction is
+ // associated with more than one sub-vector, those sub-vectors will be
+ // concatenated into a single wide vector.
+ for (ShuffleVectorInst *SVI : Shuffles) {
+ auto &SubVec = SubVecs[SVI];
+ auto *WideVec =
+ SubVec.size() > 1 ? concatenateVectors(Builder, SubVec) : SubVec[0];
+ SVI->replaceAllUsesWith(WideVec);
+ }
- return ConstantVector::get(Mask);
+ return true;
}
/// \brief Lower an interleaved store into a vstN intrinsic.
@@ -13279,15 +13782,15 @@ bool ARMTargetLowering::lowerInterleavedStore(StoreInst *SI,
VectorType *SubVecTy = VectorType::get(EltTy, LaneLen);
const DataLayout &DL = SI->getModule()->getDataLayout();
- unsigned SubVecSize = DL.getTypeSizeInBits(SubVecTy);
- bool EltIs64Bits = DL.getTypeSizeInBits(EltTy) == 64;
- // Skip if we do not have NEON and skip illegal vector types and vector types
- // with i64/f64 elements (vstN doesn't support i64/f64 elements).
- if (!Subtarget->hasNEON() || (SubVecSize != 64 && SubVecSize != 128) ||
- EltIs64Bits)
+  // Skip if we do not have NEON or if the vector type is illegal. We can
+  // "legalize" wide vector types into multiple interleaved accesses as long
+  // as the vector type size is divisible by 128 bits.
+ if (!Subtarget->hasNEON() || !isLegalInterleavedAccessType(SubVecTy, DL))
return false;
+ unsigned NumStores = getNumInterleavedAccesses(SubVecTy, DL);
+
Value *Op0 = SVI->getOperand(0);
Value *Op1 = SVI->getOperand(1);
IRBuilder<> Builder(SI);
@@ -13306,44 +13809,75 @@ bool ARMTargetLowering::lowerInterleavedStore(StoreInst *SI,
SubVecTy = VectorType::get(IntTy, LaneLen);
}
+ // The base address of the store.
+ Value *BaseAddr = SI->getPointerOperand();
+
+ if (NumStores > 1) {
+ // If we're going to generate more than one store, reset the lane length
+ // and sub-vector type to something legal.
+ LaneLen /= NumStores;
+ SubVecTy = VectorType::get(SubVecTy->getVectorElementType(), LaneLen);
+
+ // We will compute the pointer operand of each store from the original base
+ // address using GEPs. Cast the base address to a pointer to the scalar
+ // element type.
+ BaseAddr = Builder.CreateBitCast(
+ BaseAddr, SubVecTy->getVectorElementType()->getPointerTo(
+ SI->getPointerAddressSpace()));
+ }
+
+ assert(isTypeLegal(EVT::getEVT(SubVecTy)) && "Illegal vstN vector type!");
+
+ auto Mask = SVI->getShuffleMask();
+
+ Type *Int8Ptr = Builder.getInt8PtrTy(SI->getPointerAddressSpace());
+ Type *Tys[] = {Int8Ptr, SubVecTy};
static const Intrinsic::ID StoreInts[3] = {Intrinsic::arm_neon_vst2,
Intrinsic::arm_neon_vst3,
Intrinsic::arm_neon_vst4};
- SmallVector<Value *, 6> Ops;
- Type *Int8Ptr = Builder.getInt8PtrTy(SI->getPointerAddressSpace());
- Ops.push_back(Builder.CreateBitCast(SI->getPointerOperand(), Int8Ptr));
+ for (unsigned StoreCount = 0; StoreCount < NumStores; ++StoreCount) {
- Type *Tys[] = { Int8Ptr, SubVecTy };
- Function *VstNFunc = Intrinsic::getDeclaration(
- SI->getModule(), StoreInts[Factor - 2], Tys);
+    // If we're generating more than one store, compute the base address of
+ // subsequent stores as an offset from the previous.
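+    // E.g. a factor-3 store split in two: each vst3 covers LaneLen * 3
+    // elements, so the second store begins LaneLen * 3 elements past the
+    // base (mirroring the load case above).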
+ if (StoreCount > 0)
+ BaseAddr = Builder.CreateConstGEP1_32(BaseAddr, LaneLen * Factor);
- // Split the shufflevector operands into sub vectors for the new vstN call.
- auto Mask = SVI->getShuffleMask();
- for (unsigned i = 0; i < Factor; i++) {
- if (Mask[i] >= 0) {
- Ops.push_back(Builder.CreateShuffleVector(
- Op0, Op1, getSequentialMask(Builder, Mask[i], LaneLen)));
- } else {
- unsigned StartMask = 0;
- for (unsigned j = 1; j < LaneLen; j++) {
- if (Mask[j*Factor + i] >= 0) {
- StartMask = Mask[j*Factor + i] - j;
- break;
+ SmallVector<Value *, 6> Ops;
+ Ops.push_back(Builder.CreateBitCast(BaseAddr, Int8Ptr));
+
+ Function *VstNFunc =
+ Intrinsic::getDeclaration(SI->getModule(), StoreInts[Factor - 2], Tys);
+
+ // Split the shufflevector operands into sub vectors for the new vstN call.
+ for (unsigned i = 0; i < Factor; i++) {
+ unsigned IdxI = StoreCount * LaneLen * Factor + i;
+ if (Mask[IdxI] >= 0) {
+ Ops.push_back(Builder.CreateShuffleVector(
+ Op0, Op1, createSequentialMask(Builder, Mask[IdxI], LaneLen, 0)));
+ } else {
+ unsigned StartMask = 0;
+ for (unsigned j = 1; j < LaneLen; j++) {
+ unsigned IdxJ = StoreCount * LaneLen * Factor + j;
+ if (Mask[IdxJ * Factor + IdxI] >= 0) {
+ StartMask = Mask[IdxJ * Factor + IdxI] - IdxJ;
+ break;
+ }
}
+        // Note: if all elements in a chunk are undef, StartMask = 0.
+        // Filling undef gaps with arbitrary elements is fine, since those
+        // elements were going to be written anyway (as undefs); in the
+        // all-undef case we default to elements starting at 0.
+        // StartMask cannot be negative; that is checked in
+        // isReInterleaveMask.
+ Ops.push_back(Builder.CreateShuffleVector(
+ Op0, Op1, createSequentialMask(Builder, StartMask, LaneLen, 0)));
}
- // Note: If all elements in a chunk are undefs, StartMask=0!
- // Note: Filling undef gaps with random elements is ok, since
- // those elements were being written anyway (with undefs).
- // In the case of all undefs we're defaulting to using elems from 0
- // Note: StartMask cannot be negative, it's checked in isReInterleaveMask
- Ops.push_back(Builder.CreateShuffleVector(
- Op0, Op1, getSequentialMask(Builder, StartMask, LaneLen)));
}
- }
- Ops.push_back(Builder.getInt32(SI->getAlignment()));
- Builder.CreateCall(VstNFunc, Ops);
+ Ops.push_back(Builder.getInt32(SI->getAlignment()));
+ Builder.CreateCall(VstNFunc, Ops);
+ }
return true;
}
diff --git a/lib/Target/ARM/ARMISelLowering.h b/lib/Target/ARM/ARMISelLowering.h
index 84c6eb845bb8..70a0b1380ec9 100644
--- a/lib/Target/ARM/ARMISelLowering.h
+++ b/lib/Target/ARM/ARMISelLowering.h
@@ -175,9 +175,15 @@ class InstrItineraryData;
VMULLs, // ...signed
VMULLu, // ...unsigned
+ SMULWB, // Signed multiply word by half word, bottom
+ SMULWT, // Signed multiply word by half word, top
UMLAL, // 64bit Unsigned Accumulate Multiply
SMLAL, // 64bit Signed Accumulate Multiply
UMAAL, // 64-bit Unsigned Accumulate Accumulate Multiply
+ SMLALBB, // 64-bit signed accumulate multiply bottom, bottom 16
+ SMLALBT, // 64-bit signed accumulate multiply bottom, top 16
+ SMLALTB, // 64-bit signed accumulate multiply top, bottom 16
+ SMLALTT, // 64-bit signed accumulate multiply top, top 16
// Operands of the standard BUILD_VECTOR node are not legalized, which
// is fine if BUILD_VECTORs are always lowered to shuffles or other
@@ -346,6 +352,7 @@ class InstrItineraryData;
void computeKnownBitsForTargetNode(const SDValue Op, APInt &KnownZero,
APInt &KnownOne,
+ const APInt &DemandedElts,
const SelectionDAG &DAG,
unsigned Depth) const override;
@@ -500,9 +507,18 @@ class InstrItineraryData;
bool canCombineStoreAndExtract(Type *VectorTy, Value *Idx,
unsigned &Cost) const override;
+ bool canMergeStoresTo(EVT MemVT) const override {
+ // Do not merge to larger than i32.
+ return (MemVT.getSizeInBits() <= 32);
+ }
+
bool isCheapToSpeculateCttz() const override;
bool isCheapToSpeculateCtlz() const override;
+ bool convertSetCCLogicToBitwiseLogic(EVT VT) const override {
+ return VT.isScalarInteger();
+ }
+
bool supportSwiftError() const override {
return true;
}
@@ -514,6 +530,17 @@ class InstrItineraryData;
CCAssignFn *CCAssignFnForCall(CallingConv::ID CC, bool isVarArg) const;
CCAssignFn *CCAssignFnForReturn(CallingConv::ID CC, bool isVarArg) const;
+ /// Returns true if \p VecTy is a legal interleaved access type. This
+ /// function checks the vector element type and the overall width of the
+ /// vector.
+ bool isLegalInterleavedAccessType(VectorType *VecTy,
+ const DataLayout &DL) const;
+
+ /// Returns the number of interleaved accesses that will be generated when
+ /// lowering accesses of the given type.
+ unsigned getNumInterleavedAccesses(VectorType *VecTy,
+ const DataLayout &DL) const;
+
protected:
std::pair<const TargetRegisterClass *, uint8_t>
findRepresentativeClass(const TargetRegisterInfo *TRI,
@@ -698,7 +725,7 @@ class InstrItineraryData;
SDValue getARMCmp(SDValue LHS, SDValue RHS, ISD::CondCode CC,
SDValue &ARMcc, SelectionDAG &DAG, const SDLoc &dl) const;
SDValue getVFPCmp(SDValue LHS, SDValue RHS, SelectionDAG &DAG,
- const SDLoc &dl) const;
+ const SDLoc &dl, bool InvalidOnQNaN) const;
SDValue duplicateCmp(SDValue Cmp, SelectionDAG &DAG) const;
SDValue OptimizeVFPBrcond(SDValue Op, SelectionDAG &DAG) const;
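A plausible sketch of what these two interleaved-access hooks check, given
that NEON vldN/vstN operate on 64-bit and 128-bit registers (the committed
bodies live in ARMISelLowering.cpp; treat the exact conditions below as
assumptions, not a quote from the patch):

    #include "llvm/IR/DataLayout.h"
    #include "llvm/IR/DerivedTypes.h"
    using namespace llvm;

    // Legal element sizes, and a total width of 64 bits or a multiple of 128.
    static bool isLegalInterleavedAccessTypeSketch(VectorType *VecTy,
                                                   const DataLayout &DL) {
      unsigned ElSize = DL.getTypeSizeInBits(VecTy->getElementType());
      unsigned VecSize = DL.getTypeSizeInBits(VecTy);
      if (ElSize != 8 && ElSize != 16 && ElSize != 32)
        return false;
      return VecSize == 64 || VecSize % 128 == 0;
    }

    // One NEON access per 128-bit chunk, rounding up.
    static unsigned getNumInterleavedAccessesSketch(VectorType *VecTy,
                                                    const DataLayout &DL) {
      return (DL.getTypeSizeInBits(VecTy) + 127) / 128;
    }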
diff --git a/lib/Target/ARM/ARMInstrFormats.td b/lib/Target/ARM/ARMInstrFormats.td
index 488439fc24e0..1bbe7f0d275e 100644
--- a/lib/Target/ARM/ARMInstrFormats.td
+++ b/lib/Target/ARM/ARMInstrFormats.td
@@ -184,7 +184,7 @@ def s_cc_out : OptionalDefOperand<OtherVT, (ops CCR), (ops (i32 CPSR))> {
// ARM special operands for disassembly only.
//
-def SetEndAsmOperand : ImmAsmOperand {
+def SetEndAsmOperand : ImmAsmOperand<0,1> {
let Name = "SetEndImm";
let ParserMethod = "parseSetEndImm";
}
@@ -221,25 +221,25 @@ def banked_reg : Operand<i32> {
// 16 imm6<5:4> = '01', 16 - <imm> is encoded in imm6<3:0>
// 32 imm6<5> = '1', 32 - <imm> is encoded in imm6<4:0>
// 64 64 - <imm> is encoded in imm6<5:0>
-def shr_imm8_asm_operand : ImmAsmOperand { let Name = "ShrImm8"; }
+def shr_imm8_asm_operand : ImmAsmOperand<1,8> { let Name = "ShrImm8"; }
def shr_imm8 : Operand<i32>, ImmLeaf<i32, [{ return Imm > 0 && Imm <= 8; }]> {
let EncoderMethod = "getShiftRight8Imm";
let DecoderMethod = "DecodeShiftRight8Imm";
let ParserMatchClass = shr_imm8_asm_operand;
}
-def shr_imm16_asm_operand : ImmAsmOperand { let Name = "ShrImm16"; }
+def shr_imm16_asm_operand : ImmAsmOperand<1,16> { let Name = "ShrImm16"; }
def shr_imm16 : Operand<i32>, ImmLeaf<i32, [{ return Imm > 0 && Imm <= 16; }]> {
let EncoderMethod = "getShiftRight16Imm";
let DecoderMethod = "DecodeShiftRight16Imm";
let ParserMatchClass = shr_imm16_asm_operand;
}
-def shr_imm32_asm_operand : ImmAsmOperand { let Name = "ShrImm32"; }
+def shr_imm32_asm_operand : ImmAsmOperand<1,32> { let Name = "ShrImm32"; }
def shr_imm32 : Operand<i32>, ImmLeaf<i32, [{ return Imm > 0 && Imm <= 32; }]> {
let EncoderMethod = "getShiftRight32Imm";
let DecoderMethod = "DecodeShiftRight32Imm";
let ParserMatchClass = shr_imm32_asm_operand;
}
-def shr_imm64_asm_operand : ImmAsmOperand { let Name = "ShrImm64"; }
+def shr_imm64_asm_operand : ImmAsmOperand<1,64> { let Name = "ShrImm64"; }
def shr_imm64 : Operand<i32>, ImmLeaf<i32, [{ return Imm > 0 && Imm <= 64; }]> {
let EncoderMethod = "getShiftRight64Imm";
let DecoderMethod = "DecodeShiftRight64Imm";
@@ -261,10 +261,19 @@ def const_pool_asm_imm : Operand<i32> {
// Note: When EmitPriority == 1, the alias will be used for printing
class ARMInstAlias<string Asm, dag Result, bit EmitPriority = 0>
: InstAlias<Asm, Result, EmitPriority>, Requires<[IsARM]>;
+class ARMInstSubst<string Asm, dag Result, bit EmitPriority = 0>
+ : InstAlias<Asm, Result, EmitPriority>,
+ Requires<[IsARM,UseNegativeImmediates]>;
class tInstAlias<string Asm, dag Result, bit EmitPriority = 0>
: InstAlias<Asm, Result, EmitPriority>, Requires<[IsThumb]>;
+class tInstSubst<string Asm, dag Result, bit EmitPriority = 0>
+ : InstAlias<Asm, Result, EmitPriority>,
+ Requires<[IsThumb,UseNegativeImmediates]>;
class t2InstAlias<string Asm, dag Result, bit EmitPriority = 0>
: InstAlias<Asm, Result, EmitPriority>, Requires<[IsThumb2]>;
+class t2InstSubst<string Asm, dag Result, bit EmitPriority = 0>
+ : InstAlias<Asm, Result, EmitPriority>,
+ Requires<[IsThumb2,UseNegativeImmediates]>;
class VFP2InstAlias<string Asm, dag Result, bit EmitPriority = 0>
: InstAlias<Asm, Result, EmitPriority>, Requires<[HasVFP2]>;
class VFP2DPInstAlias<string Asm, dag Result, bit EmitPriority = 0>
@@ -948,7 +957,7 @@ class ADivA1I<bits<3> opcod, dag oops, dag iops,
}
// PKH instructions
-def PKHLSLAsmOperand : ImmAsmOperand {
+def PKHLSLAsmOperand : ImmAsmOperand<0,31> {
let Name = "PKHLSLImm";
let ParserMethod = "parsePKHLSLImm";
}
@@ -1013,9 +1022,6 @@ class Thumb2DSPPat<dag pattern, dag result> : Pat<pattern, result> {
class Thumb2DSPMulPat<dag pattern, dag result> : Pat<pattern, result> {
list<Predicate> Predicates = [IsThumb2, UseMulOps, HasDSP];
}
-class Thumb2ExtractPat<dag pattern, dag result> : Pat<pattern, result> {
- list<Predicate> Predicates = [IsThumb2, HasT2ExtractPack];
-}
//===----------------------------------------------------------------------===//
// Thumb Instruction Format Definitions.
//
diff --git a/lib/Target/ARM/ARMInstrInfo.cpp b/lib/Target/ARM/ARMInstrInfo.cpp
index 27b64322dfa9..3b3606ef462a 100644
--- a/lib/Target/ARM/ARMInstrInfo.cpp
+++ b/lib/Target/ARM/ARMInstrInfo.cpp
@@ -129,8 +129,9 @@ void ARMInstrInfo::expandLoadStackGuard(MachineBasicBlock::iterator MI) const {
MachineMemOperand *MMO = MBB.getParent()->getMachineMemOperand(
MachinePointerInfo::getGOT(*MBB.getParent()), Flags, 4, 4);
MIB.addMemOperand(MMO);
- MIB = BuildMI(MBB, MI, DL, get(ARM::LDRi12), Reg);
- MIB.addReg(Reg, RegState::Kill).addImm(0);
- MIB.setMemRefs(MI->memoperands_begin(), MI->memoperands_end());
- AddDefaultPred(MIB);
+ BuildMI(MBB, MI, DL, get(ARM::LDRi12), Reg)
+ .addReg(Reg, RegState::Kill)
+ .addImm(0)
+ .setMemRefs(MI->memoperands_begin(), MI->memoperands_end())
+ .add(predOps(ARMCC::AL));
}
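The trailing .add(predOps(ARMCC::AL)) attaches the two predicate operands
that every predicable ARM instruction carries. As a fragment (assuming the
usual MachineInstrBuilder API), its long-hand equivalent is:

    // "Always" condition code plus an unused (zero) predicate register.
    static void addAlwaysPred(MachineInstrBuilder &MIB) {
      MIB.addImm(ARMCC::AL).addReg(0);
    }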
diff --git a/lib/Target/ARM/ARMInstrInfo.td b/lib/Target/ARM/ARMInstrInfo.td
index c47393990e97..cc0e7d4d9c35 100644
--- a/lib/Target/ARM/ARMInstrInfo.td
+++ b/lib/Target/ARM/ARMInstrInfo.td
@@ -51,6 +51,8 @@ def SDT_ARMAnd : SDTypeProfile<1, 2,
SDTCisVT<2, i32>]>;
def SDT_ARMCmp : SDTypeProfile<0, 2, [SDTCisSameAs<0, 1>]>;
+def SDT_ARMFCmp : SDTypeProfile<0, 3, [SDTCisSameAs<0, 1>,
+ SDTCisVT<2, i32>]>;
def SDT_ARMPICAdd : SDTypeProfile<1, 2, [SDTCisSameAs<0, 1>,
SDTCisPtrTy<1>, SDTCisVT<2, i32>]>;
@@ -90,6 +92,13 @@ def SDTBinaryArithWithFlagsInOut : SDTypeProfile<2, 3,
SDTCisVT<1, i32>,
SDTCisVT<4, i32>]>;
+def SDT_LongMac : SDTypeProfile<2, 4, [SDTCisVT<0, i32>,
+ SDTCisSameAs<0, 1>,
+ SDTCisSameAs<0, 2>,
+ SDTCisSameAs<0, 3>,
+ SDTCisSameAs<0, 4>,
+ SDTCisSameAs<0, 5>]>;
+
// Node definitions.
def ARMWrapper : SDNode<"ARMISD::Wrapper", SDTIntUnaryOp>;
def ARMWrapperPIC : SDNode<"ARMISD::WrapperPIC", SDTIntUnaryOp>;
@@ -181,6 +190,13 @@ def ARMmemcopy : SDNode<"ARMISD::MEMCPY", SDT_ARMMEMCPY,
[SDNPHasChain, SDNPInGlue, SDNPOutGlue,
SDNPMayStore, SDNPMayLoad]>;
+def ARMsmulwb : SDNode<"ARMISD::SMULWB", SDTIntBinOp, []>;
+def ARMsmulwt : SDNode<"ARMISD::SMULWT", SDTIntBinOp, []>;
+def ARMsmlalbb : SDNode<"ARMISD::SMLALBB", SDT_LongMac, []>;
+def ARMsmlalbt : SDNode<"ARMISD::SMLALBT", SDT_LongMac, []>;
+def ARMsmlaltb : SDNode<"ARMISD::SMLALTB", SDT_LongMac, []>;
+def ARMsmlaltt : SDNode<"ARMISD::SMLALTT", SDT_LongMac, []>;
+
//===----------------------------------------------------------------------===//
// ARM Instruction Predicate Definitions.
//
@@ -247,9 +263,6 @@ def HasDivide : Predicate<"Subtarget->hasDivide()">,
AssemblerPredicate<"FeatureHWDiv", "divide in THUMB">;
def HasDivideInARM : Predicate<"Subtarget->hasDivideInARMMode()">,
AssemblerPredicate<"FeatureHWDivARM", "divide in ARM">;
-def HasT2ExtractPack : Predicate<"Subtarget->hasT2ExtractPack()">,
- AssemblerPredicate<"FeatureT2XtPk",
- "pack/extract">;
def HasDSP : Predicate<"Subtarget->hasDSP()">,
AssemblerPredicate<"FeatureDSP", "dsp">;
def HasDB : Predicate<"Subtarget->hasDataBarrier()">,
@@ -298,6 +311,11 @@ def UseNaClTrap : Predicate<"Subtarget->useNaClTrap()">,
AssemblerPredicate<"FeatureNaClTrap", "NaCl">;
def DontUseNaClTrap : Predicate<"!Subtarget->useNaClTrap()">;
+def UseNegativeImmediates :
+ Predicate<"false">,
+ AssemblerPredicate<"!FeatureNoNegativeImmediates",
+ "NegativeImmediates">;
+
// FIXME: Eventually this will be just "hasV6T2Ops".
def UseMovt : Predicate<"Subtarget->useMovt(*MF)">;
def DontUseMovt : Predicate<"!Subtarget->useMovt(*MF)">;
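The *InstSubst aliases guarded by this predicate (added to ARMInstrFormats.td
above) let the assembler flip mov<->mvn, add<->sub, and<->bic, cmp<->cmn and
so on whenever the complemented or negated immediate is encodable. A
self-contained model of that encodability test; isModImm stands in for
ARM_AM::getSOImmVal(V) != -1 and is not the real helper:

    #include <cstdint>

    // ARM-mode modified immediate: an 8-bit value rotated right by an even
    // amount.
    static bool isModImm(uint32_t V) {
      for (unsigned Rot = 0; Rot < 32; Rot += 2) {
        uint32_t K = (V << Rot) | (V >> ((32 - Rot) & 31)); // V rol Rot
        if (K < 256)
          return true; // V == K ror Rot
      }
      return false;
    }
    static bool mvnCanReplaceMov(uint32_t Imm) { return isModImm(~Imm); }
    static bool subCanReplaceAdd(uint32_t Imm) { return isModImm(0u - Imm); }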
@@ -423,7 +441,16 @@ def fsub_mlx : PatFrag<(ops node:$lhs, node:$rhs),(fsub node:$lhs, node:$rhs),[{
//
// Immediate operands with a shared generic asm render method.
-class ImmAsmOperand : AsmOperandClass { let RenderMethod = "addImmOperands"; }
+class ImmAsmOperand<int Low, int High> : AsmOperandClass {
+ let RenderMethod = "addImmOperands";
+ let PredicateMethod = "isImmediate<" # Low # "," # High # ">";
+ let DiagnosticType = "ImmRange" # Low # "_" # High;
+}
+
+class ImmAsmOperandMinusOne<int Low, int High> : AsmOperandClass {
+ let PredicateMethod = "isImmediate<" # Low # "," # High # ">";
+ let DiagnosticType = "ImmRange" # Low # "_" # High;
+}
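The # concatenation builds a method name such as isImmediate<0,7>, which the
generated matcher calls on the parsed operand. On the parser side that
plausibly names a template along these lines (a sketch assuming the usual MC
operand API, not a quote from the patch):

    // Member of the parser's operand class.
    template <int Low, int High> bool isImmediate() const {
      if (!isImm())
        return false;
      const auto *CE = dyn_cast<MCConstantExpr>(getImm());
      if (!CE)
        return false;
      int64_t Value = CE->getValue();
      return Value >= Low && Value <= High;
    }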
// Operands that are part of a memory addressing mode.
class MemOperand : Operand<i32> { let OperandType = "OPERAND_MEMORY"; }
@@ -645,35 +672,45 @@ def arm_i32imm : PatLeaf<(imm), [{
}]>;
/// imm0_1 predicate - Immediate in the range [0,1].
-def Imm0_1AsmOperand: ImmAsmOperand { let Name = "Imm0_1"; }
+def Imm0_1AsmOperand: ImmAsmOperand<0,1> { let Name = "Imm0_1"; }
def imm0_1 : Operand<i32> { let ParserMatchClass = Imm0_1AsmOperand; }
/// imm0_3 predicate - Immediate in the range [0,3].
-def Imm0_3AsmOperand: ImmAsmOperand { let Name = "Imm0_3"; }
+def Imm0_3AsmOperand: ImmAsmOperand<0,3> { let Name = "Imm0_3"; }
def imm0_3 : Operand<i32> { let ParserMatchClass = Imm0_3AsmOperand; }
/// imm0_7 predicate - Immediate in the range [0,7].
-def Imm0_7AsmOperand: ImmAsmOperand { let Name = "Imm0_7"; }
+def Imm0_7AsmOperand: ImmAsmOperand<0,7> {
+ let Name = "Imm0_7";
+}
def imm0_7 : Operand<i32>, ImmLeaf<i32, [{
return Imm >= 0 && Imm < 8;
}]> {
let ParserMatchClass = Imm0_7AsmOperand;
}
+/// imm8_255 predicate - Immediate in the range [8,255].
+def Imm8_255AsmOperand: ImmAsmOperand<8,255> { let Name = "Imm8_255"; }
+def imm8_255 : Operand<i32>, ImmLeaf<i32, [{
+ return Imm >= 8 && Imm < 256;
+}]> {
+ let ParserMatchClass = Imm8_255AsmOperand;
+}
+
/// imm8 predicate - Immediate is exactly 8.
-def Imm8AsmOperand: ImmAsmOperand { let Name = "Imm8"; }
+def Imm8AsmOperand: ImmAsmOperand<8,8> { let Name = "Imm8"; }
def imm8 : Operand<i32>, ImmLeaf<i32, [{ return Imm == 8; }]> {
let ParserMatchClass = Imm8AsmOperand;
}
/// imm16 predicate - Immediate is exactly 16.
-def Imm16AsmOperand: ImmAsmOperand { let Name = "Imm16"; }
+def Imm16AsmOperand: ImmAsmOperand<16,16> { let Name = "Imm16"; }
def imm16 : Operand<i32>, ImmLeaf<i32, [{ return Imm == 16; }]> {
let ParserMatchClass = Imm16AsmOperand;
}
/// imm32 predicate - Immediate is exactly 32.
-def Imm32AsmOperand: ImmAsmOperand { let Name = "Imm32"; }
+def Imm32AsmOperand: ImmAsmOperand<32,32> { let Name = "Imm32"; }
def imm32 : Operand<i32>, ImmLeaf<i32, [{ return Imm == 32; }]> {
let ParserMatchClass = Imm32AsmOperand;
}
@@ -681,25 +718,25 @@ def imm32 : Operand<i32>, ImmLeaf<i32, [{ return Imm == 32; }]> {
def imm8_or_16 : ImmLeaf<i32, [{ return Imm == 8 || Imm == 16;}]>;
/// imm1_7 predicate - Immediate in the range [1,7].
-def Imm1_7AsmOperand: ImmAsmOperand { let Name = "Imm1_7"; }
+def Imm1_7AsmOperand: ImmAsmOperand<1,7> { let Name = "Imm1_7"; }
def imm1_7 : Operand<i32>, ImmLeaf<i32, [{ return Imm > 0 && Imm < 8; }]> {
let ParserMatchClass = Imm1_7AsmOperand;
}
/// imm1_15 predicate - Immediate in the range [1,15].
-def Imm1_15AsmOperand: ImmAsmOperand { let Name = "Imm1_15"; }
+def Imm1_15AsmOperand: ImmAsmOperand<1,15> { let Name = "Imm1_15"; }
def imm1_15 : Operand<i32>, ImmLeaf<i32, [{ return Imm > 0 && Imm < 16; }]> {
let ParserMatchClass = Imm1_15AsmOperand;
}
/// imm1_31 predicate - Immediate in the range [1,31].
-def Imm1_31AsmOperand: ImmAsmOperand { let Name = "Imm1_31"; }
+def Imm1_31AsmOperand: ImmAsmOperand<1,31> { let Name = "Imm1_31"; }
def imm1_31 : Operand<i32>, ImmLeaf<i32, [{ return Imm > 0 && Imm < 32; }]> {
let ParserMatchClass = Imm1_31AsmOperand;
}
/// imm0_15 predicate - Immediate in the range [0,15].
-def Imm0_15AsmOperand: ImmAsmOperand {
+def Imm0_15AsmOperand: ImmAsmOperand<0,15> {
let Name = "Imm0_15";
let DiagnosticType = "ImmRange0_15";
}
@@ -710,7 +747,7 @@ def imm0_15 : Operand<i32>, ImmLeaf<i32, [{
}
/// imm0_31 predicate - True if the 32-bit immediate is in the range [0,31].
-def Imm0_31AsmOperand: ImmAsmOperand { let Name = "Imm0_31"; }
+def Imm0_31AsmOperand: ImmAsmOperand<0,31> { let Name = "Imm0_31"; }
def imm0_31 : Operand<i32>, ImmLeaf<i32, [{
return Imm >= 0 && Imm < 32;
}]> {
@@ -718,15 +755,15 @@ def imm0_31 : Operand<i32>, ImmLeaf<i32, [{
}
/// imm0_32 predicate - True if the 32-bit immediate is in the range [0,32].
-def Imm0_32AsmOperand: ImmAsmOperand { let Name = "Imm0_32"; }
+def Imm0_32AsmOperand: ImmAsmOperand<0,32> { let Name = "Imm0_32"; }
def imm0_32 : Operand<i32>, ImmLeaf<i32, [{
- return Imm >= 0 && Imm < 32;
+ return Imm >= 0 && Imm < 33;
}]> {
let ParserMatchClass = Imm0_32AsmOperand;
}
/// imm0_63 predicate - True if the 32-bit immediate is in the range [0,63].
-def Imm0_63AsmOperand: ImmAsmOperand { let Name = "Imm0_63"; }
+def Imm0_63AsmOperand: ImmAsmOperand<0,63> { let Name = "Imm0_63"; }
def imm0_63 : Operand<i32>, ImmLeaf<i32, [{
return Imm >= 0 && Imm < 64;
}]> {
@@ -734,7 +771,7 @@ def imm0_63 : Operand<i32>, ImmLeaf<i32, [{
}
/// imm0_239 predicate - Immediate in the range [0,239].
-def Imm0_239AsmOperand : ImmAsmOperand {
+def Imm0_239AsmOperand : ImmAsmOperand<0,239> {
let Name = "Imm0_239";
let DiagnosticType = "ImmRange0_239";
}
@@ -743,13 +780,13 @@ def imm0_239 : Operand<i32>, ImmLeaf<i32, [{ return Imm >= 0 && Imm < 240; }]> {
}
/// imm0_255 predicate - Immediate in the range [0,255].
-def Imm0_255AsmOperand : ImmAsmOperand { let Name = "Imm0_255"; }
+def Imm0_255AsmOperand : ImmAsmOperand<0,255> { let Name = "Imm0_255"; }
def imm0_255 : Operand<i32>, ImmLeaf<i32, [{ return Imm >= 0 && Imm < 256; }]> {
let ParserMatchClass = Imm0_255AsmOperand;
}
-/// imm0_65535 - An immediate is in the range [0.65535].
-def Imm0_65535AsmOperand: ImmAsmOperand { let Name = "Imm0_65535"; }
+/// imm0_65535 - An immediate is in the range [0,65535].
+def Imm0_65535AsmOperand: ImmAsmOperand<0,65535> { let Name = "Imm0_65535"; }
def imm0_65535 : Operand<i32>, ImmLeaf<i32, [{
return Imm >= 0 && Imm < 65536;
}]> {
@@ -767,19 +804,23 @@ def imm0_65535_neg : Operand<i32>, ImmLeaf<i32, [{
// FIXME: This really needs a Thumb version separate from the ARM version.
// While the range is the same, and can thus use the same match class,
// the encoding is different so it should have a different encoder method.
-def Imm0_65535ExprAsmOperand: ImmAsmOperand { let Name = "Imm0_65535Expr"; }
+def Imm0_65535ExprAsmOperand: AsmOperandClass {
+ let Name = "Imm0_65535Expr";
+ let RenderMethod = "addImmOperands";
+}
+
def imm0_65535_expr : Operand<i32> {
let EncoderMethod = "getHiLo16ImmOpValue";
let ParserMatchClass = Imm0_65535ExprAsmOperand;
}
-def Imm256_65535ExprAsmOperand: ImmAsmOperand { let Name = "Imm256_65535Expr"; }
+def Imm256_65535ExprAsmOperand: ImmAsmOperand<256,65535> { let Name = "Imm256_65535Expr"; }
def imm256_65535_expr : Operand<i32> {
let ParserMatchClass = Imm256_65535ExprAsmOperand;
}
/// imm24b - True if the 32-bit immediate is encodable in 24 bits.
-def Imm24bitAsmOperand: ImmAsmOperand { let Name = "Imm24bit"; }
+def Imm24bitAsmOperand: ImmAsmOperand<0,0xffffff> { let Name = "Imm24bit"; }
def imm24b : Operand<i32>, ImmLeaf<i32, [{
return Imm >= 0 && Imm <= 0xffffff;
}]> {
@@ -808,7 +849,9 @@ def imm1_32_XFORM: SDNodeXForm<imm, [{
return CurDAG->getTargetConstant((int)N->getZExtValue() - 1, SDLoc(N),
MVT::i32);
}]>;
-def Imm1_32AsmOperand: AsmOperandClass { let Name = "Imm1_32"; }
+def Imm1_32AsmOperand: ImmAsmOperandMinusOne<1,32> {
+ let Name = "Imm1_32";
+}
def imm1_32 : Operand<i32>, PatLeaf<(imm), [{
uint64_t Imm = N->getZExtValue();
return Imm > 0 && Imm <= 32;
@@ -822,7 +865,7 @@ def imm1_16_XFORM: SDNodeXForm<imm, [{
return CurDAG->getTargetConstant((int)N->getZExtValue() - 1, SDLoc(N),
MVT::i32);
}]>;
-def Imm1_16AsmOperand: AsmOperandClass { let Name = "Imm1_16"; }
+def Imm1_16AsmOperand: ImmAsmOperandMinusOne<1,16> { let Name = "Imm1_16"; }
def imm1_16 : Operand<i32>, PatLeaf<(imm), [{ return Imm > 0 && Imm <= 16; }],
imm1_16_XFORM> {
let PrintMethod = "printImmPlusOneOperand";
@@ -3850,6 +3893,7 @@ def MVNi : AsI1<0b1111, (outs GPR:$Rd), (ins mod_imm:$imm), DPFrm,
let Inst{11-0} = imm;
}
+let AddedComplexity = 1 in
def : ARMPat<(and GPR:$src, mod_imm_not:$imm),
(BICri GPR:$src, mod_imm_not:$imm)>;
@@ -3899,7 +3943,8 @@ def MUL : AsMul1I32<0b0000000, (outs GPRnopc:$Rd),
(ins GPRnopc:$Rn, GPRnopc:$Rm),
IIC_iMUL32, "mul", "\t$Rd, $Rn, $Rm",
[(set GPRnopc:$Rd, (mul GPRnopc:$Rn, GPRnopc:$Rm))]>,
- Requires<[IsARM, HasV6]> {
+ Requires<[IsARM, HasV6]>,
+ Sched<[WriteMUL32, ReadMUL, ReadMUL]> {
let Inst{15-12} = 0b0000;
let Unpredictable{15-12} = 0b1111;
}
@@ -3910,14 +3955,16 @@ def MULv5: ARMPseudoExpand<(outs GPRnopc:$Rd), (ins GPRnopc:$Rn, GPRnopc:$Rm,
4, IIC_iMUL32,
[(set GPRnopc:$Rd, (mul GPRnopc:$Rn, GPRnopc:$Rm))],
(MUL GPRnopc:$Rd, GPRnopc:$Rn, GPRnopc:$Rm, pred:$p, cc_out:$s)>,
- Requires<[IsARM, NoV6, UseMulOps]>;
+ Requires<[IsARM, NoV6, UseMulOps]>,
+ Sched<[WriteMUL32, ReadMUL, ReadMUL]>;
}
def MLA : AsMul1I32<0b0000001, (outs GPRnopc:$Rd),
(ins GPRnopc:$Rn, GPRnopc:$Rm, GPRnopc:$Ra),
IIC_iMAC32, "mla", "\t$Rd, $Rn, $Rm, $Ra",
[(set GPRnopc:$Rd, (add (mul GPRnopc:$Rn, GPRnopc:$Rm), GPRnopc:$Ra))]>,
- Requires<[IsARM, HasV6, UseMulOps]> {
+ Requires<[IsARM, HasV6, UseMulOps]>,
+ Sched<[WriteMAC32, ReadMUL, ReadMUL, ReadMAC]> {
bits<4> Ra;
let Inst{15-12} = Ra;
}
@@ -3928,12 +3975,14 @@ def MLAv5: ARMPseudoExpand<(outs GPRnopc:$Rd),
pred:$p, cc_out:$s), 4, IIC_iMAC32,
[(set GPRnopc:$Rd, (add (mul GPRnopc:$Rn, GPRnopc:$Rm), GPRnopc:$Ra))],
(MLA GPRnopc:$Rd, GPRnopc:$Rn, GPRnopc:$Rm, GPRnopc:$Ra, pred:$p, cc_out:$s)>,
- Requires<[IsARM, NoV6]>;
+ Requires<[IsARM, NoV6]>,
+ Sched<[WriteMAC32, ReadMUL, ReadMUL, ReadMAC]>;
def MLS : AMul1I<0b0000011, (outs GPR:$Rd), (ins GPR:$Rn, GPR:$Rm, GPR:$Ra),
IIC_iMAC32, "mls", "\t$Rd, $Rn, $Rm, $Ra",
[(set GPR:$Rd, (sub GPR:$Ra, (mul GPR:$Rn, GPR:$Rm)))]>,
- Requires<[IsARM, HasV6T2, UseMulOps]> {
+ Requires<[IsARM, HasV6T2, UseMulOps]>,
+ Sched<[WriteMAC32, ReadMUL, ReadMUL, ReadMAC]> {
bits<4> Rd;
bits<4> Rm;
bits<4> Rn;
@@ -3949,26 +3998,38 @@ let hasSideEffects = 0 in {
let isCommutable = 1 in {
def SMULL : AsMul1I64<0b0000110, (outs GPR:$RdLo, GPR:$RdHi),
(ins GPR:$Rn, GPR:$Rm), IIC_iMUL64,
- "smull", "\t$RdLo, $RdHi, $Rn, $Rm", []>,
- Requires<[IsARM, HasV6]>;
+ "smull", "\t$RdLo, $RdHi, $Rn, $Rm",
+ [(set GPR:$RdLo, GPR:$RdHi,
+ (smullohi GPR:$Rn, GPR:$Rm))]>,
+ Requires<[IsARM, HasV6]>,
+ Sched<[WriteMUL64Lo, WriteMUL64Hi, ReadMUL, ReadMUL]>;
def UMULL : AsMul1I64<0b0000100, (outs GPR:$RdLo, GPR:$RdHi),
(ins GPR:$Rn, GPR:$Rm), IIC_iMUL64,
- "umull", "\t$RdLo, $RdHi, $Rn, $Rm", []>,
- Requires<[IsARM, HasV6]>;
+ "umull", "\t$RdLo, $RdHi, $Rn, $Rm",
+ [(set GPR:$RdLo, GPR:$RdHi,
+ (umullohi GPR:$Rn, GPR:$Rm))]>,
+ Requires<[IsARM, HasV6]>,
+ Sched<[WriteMAC64Lo, WriteMAC64Hi, ReadMUL, ReadMUL]>;
let Constraints = "@earlyclobber $RdLo,@earlyclobber $RdHi" in {
def SMULLv5 : ARMPseudoExpand<(outs GPR:$RdLo, GPR:$RdHi),
(ins GPR:$Rn, GPR:$Rm, pred:$p, cc_out:$s),
- 4, IIC_iMUL64, [],
+ 4, IIC_iMUL64,
+ [(set GPR:$RdLo, GPR:$RdHi,
+ (smullohi GPR:$Rn, GPR:$Rm))],
(SMULL GPR:$RdLo, GPR:$RdHi, GPR:$Rn, GPR:$Rm, pred:$p, cc_out:$s)>,
- Requires<[IsARM, NoV6]>;
+ Requires<[IsARM, NoV6]>,
+ Sched<[WriteMUL64Lo, WriteMUL64Hi, ReadMUL, ReadMUL]>;
def UMULLv5 : ARMPseudoExpand<(outs GPR:$RdLo, GPR:$RdHi),
(ins GPR:$Rn, GPR:$Rm, pred:$p, cc_out:$s),
- 4, IIC_iMUL64, [],
+ 4, IIC_iMUL64,
+ [(set GPR:$RdLo, GPR:$RdHi,
+ (umullohi GPR:$Rn, GPR:$Rm))],
(UMULL GPR:$RdLo, GPR:$RdHi, GPR:$Rn, GPR:$Rm, pred:$p, cc_out:$s)>,
- Requires<[IsARM, NoV6]>;
+ Requires<[IsARM, NoV6]>,
+ Sched<[WriteMUL64Lo, WriteMUL64Hi, ReadMUL, ReadMUL]>;
}
}
@@ -3976,17 +4037,20 @@ def UMULLv5 : ARMPseudoExpand<(outs GPR:$RdLo, GPR:$RdHi),
def SMLAL : AsMla1I64<0b0000111, (outs GPR:$RdLo, GPR:$RdHi),
(ins GPR:$Rn, GPR:$Rm, GPR:$RLo, GPR:$RHi), IIC_iMAC64,
"smlal", "\t$RdLo, $RdHi, $Rn, $Rm", []>,
- RegConstraint<"$RLo = $RdLo, $RHi = $RdHi">, Requires<[IsARM, HasV6]>;
+ RegConstraint<"$RLo = $RdLo, $RHi = $RdHi">, Requires<[IsARM, HasV6]>,
+ Sched<[WriteMAC64Lo, WriteMAC64Hi, ReadMUL, ReadMUL, ReadMAC, ReadMAC]>;
def UMLAL : AsMla1I64<0b0000101, (outs GPR:$RdLo, GPR:$RdHi),
(ins GPR:$Rn, GPR:$Rm, GPR:$RLo, GPR:$RHi), IIC_iMAC64,
"umlal", "\t$RdLo, $RdHi, $Rn, $Rm", []>,
- RegConstraint<"$RLo = $RdLo, $RHi = $RdHi">, Requires<[IsARM, HasV6]>;
+ RegConstraint<"$RLo = $RdLo, $RHi = $RdHi">, Requires<[IsARM, HasV6]>,
+ Sched<[WriteMAC64Lo, WriteMAC64Hi, ReadMUL, ReadMUL, ReadMAC, ReadMAC]>;
def UMAAL : AMul1I <0b0000010, (outs GPR:$RdLo, GPR:$RdHi),
(ins GPR:$Rn, GPR:$Rm, GPR:$RLo, GPR:$RHi),
IIC_iMAC64,
"umaal", "\t$RdLo, $RdHi, $Rn, $Rm", []>,
- RegConstraint<"$RLo = $RdLo, $RHi = $RdHi">, Requires<[IsARM, HasV6]> {
+ RegConstraint<"$RLo = $RdLo, $RHi = $RdHi">, Requires<[IsARM, HasV6]>,
+ Sched<[WriteMAC64Lo, WriteMAC64Hi, ReadMUL, ReadMUL, ReadMAC, ReadMAC]> {
bits<4> RdLo;
bits<4> RdHi;
bits<4> Rm;
@@ -4004,13 +4068,15 @@ def SMLALv5 : ARMPseudoExpand<(outs GPR:$RdLo, GPR:$RdHi),
4, IIC_iMAC64, [],
(SMLAL GPR:$RdLo, GPR:$RdHi, GPR:$Rn, GPR:$Rm, GPR:$RLo, GPR:$RHi,
pred:$p, cc_out:$s)>,
- Requires<[IsARM, NoV6]>;
+ Requires<[IsARM, NoV6]>,
+ Sched<[WriteMAC64Lo, WriteMAC64Hi, ReadMUL, ReadMUL, ReadMAC, ReadMAC]>;
def UMLALv5 : ARMPseudoExpand<(outs GPR:$RdLo, GPR:$RdHi),
(ins GPR:$Rn, GPR:$Rm, GPR:$RLo, GPR:$RHi, pred:$p, cc_out:$s),
4, IIC_iMAC64, [],
(UMLAL GPR:$RdLo, GPR:$RdHi, GPR:$Rn, GPR:$Rm, GPR:$RLo, GPR:$RHi,
pred:$p, cc_out:$s)>,
- Requires<[IsARM, NoV6]>;
+ Requires<[IsARM, NoV6]>,
+ Sched<[WriteMAC64Lo, WriteMAC64Hi, ReadMUL, ReadMUL, ReadMAC, ReadMAC]>;
}
} // hasSideEffects
@@ -4019,13 +4085,15 @@ def UMLALv5 : ARMPseudoExpand<(outs GPR:$RdLo, GPR:$RdHi),
def SMMUL : AMul2I <0b0111010, 0b0001, (outs GPR:$Rd), (ins GPR:$Rn, GPR:$Rm),
IIC_iMUL32, "smmul", "\t$Rd, $Rn, $Rm",
[(set GPR:$Rd, (mulhs GPR:$Rn, GPR:$Rm))]>,
- Requires<[IsARM, HasV6]> {
+ Requires<[IsARM, HasV6]>,
+ Sched<[WriteMUL32, ReadMUL, ReadMUL]> {
let Inst{15-12} = 0b1111;
}
def SMMULR : AMul2I <0b0111010, 0b0011, (outs GPR:$Rd), (ins GPR:$Rn, GPR:$Rm),
IIC_iMUL32, "smmulr", "\t$Rd, $Rn, $Rm", []>,
- Requires<[IsARM, HasV6]> {
+ Requires<[IsARM, HasV6]>,
+ Sched<[WriteMUL32, ReadMUL, ReadMUL]> {
let Inst{15-12} = 0b1111;
}
@@ -4033,57 +4101,67 @@ def SMMLA : AMul2Ia <0b0111010, 0b0001, (outs GPR:$Rd),
(ins GPR:$Rn, GPR:$Rm, GPR:$Ra),
IIC_iMAC32, "smmla", "\t$Rd, $Rn, $Rm, $Ra",
[(set GPR:$Rd, (add (mulhs GPR:$Rn, GPR:$Rm), GPR:$Ra))]>,
- Requires<[IsARM, HasV6, UseMulOps]>;
+ Requires<[IsARM, HasV6, UseMulOps]>,
+ Sched<[WriteMAC32, ReadMUL, ReadMUL, ReadMAC]>;
def SMMLAR : AMul2Ia <0b0111010, 0b0011, (outs GPR:$Rd),
(ins GPR:$Rn, GPR:$Rm, GPR:$Ra),
IIC_iMAC32, "smmlar", "\t$Rd, $Rn, $Rm, $Ra", []>,
- Requires<[IsARM, HasV6]>;
+ Requires<[IsARM, HasV6]>,
+ Sched<[WriteMAC32, ReadMUL, ReadMUL, ReadMAC]>;
def SMMLS : AMul2Ia <0b0111010, 0b1101, (outs GPR:$Rd),
(ins GPR:$Rn, GPR:$Rm, GPR:$Ra),
IIC_iMAC32, "smmls", "\t$Rd, $Rn, $Rm, $Ra", []>,
- Requires<[IsARM, HasV6, UseMulOps]>;
+ Requires<[IsARM, HasV6, UseMulOps]>,
+ Sched<[WriteMAC32, ReadMUL, ReadMUL, ReadMAC]>;
def SMMLSR : AMul2Ia <0b0111010, 0b1111, (outs GPR:$Rd),
(ins GPR:$Rn, GPR:$Rm, GPR:$Ra),
IIC_iMAC32, "smmlsr", "\t$Rd, $Rn, $Rm, $Ra", []>,
- Requires<[IsARM, HasV6]>;
+ Requires<[IsARM, HasV6]>,
+ Sched<[WriteMAC32, ReadMUL, ReadMUL, ReadMAC]>;
multiclass AI_smul<string opc> {
def BB : AMulxyI<0b0001011, 0b00, (outs GPR:$Rd), (ins GPR:$Rn, GPR:$Rm),
IIC_iMUL16, !strconcat(opc, "bb"), "\t$Rd, $Rn, $Rm",
[(set GPR:$Rd, (mul (sext_inreg GPR:$Rn, i16),
(sext_inreg GPR:$Rm, i16)))]>,
- Requires<[IsARM, HasV5TE]>;
+ Requires<[IsARM, HasV5TE]>,
+ Sched<[WriteMUL16, ReadMUL, ReadMUL]>;
def BT : AMulxyI<0b0001011, 0b10, (outs GPR:$Rd), (ins GPR:$Rn, GPR:$Rm),
IIC_iMUL16, !strconcat(opc, "bt"), "\t$Rd, $Rn, $Rm",
[(set GPR:$Rd, (mul (sext_inreg GPR:$Rn, i16),
(sra GPR:$Rm, (i32 16))))]>,
- Requires<[IsARM, HasV5TE]>;
+ Requires<[IsARM, HasV5TE]>,
+ Sched<[WriteMUL16, ReadMUL, ReadMUL]>;
def TB : AMulxyI<0b0001011, 0b01, (outs GPR:$Rd), (ins GPR:$Rn, GPR:$Rm),
IIC_iMUL16, !strconcat(opc, "tb"), "\t$Rd, $Rn, $Rm",
[(set GPR:$Rd, (mul (sra GPR:$Rn, (i32 16)),
(sext_inreg GPR:$Rm, i16)))]>,
- Requires<[IsARM, HasV5TE]>;
+ Requires<[IsARM, HasV5TE]>,
+ Sched<[WriteMUL16, ReadMUL, ReadMUL]>;
def TT : AMulxyI<0b0001011, 0b11, (outs GPR:$Rd), (ins GPR:$Rn, GPR:$Rm),
IIC_iMUL16, !strconcat(opc, "tt"), "\t$Rd, $Rn, $Rm",
[(set GPR:$Rd, (mul (sra GPR:$Rn, (i32 16)),
(sra GPR:$Rm, (i32 16))))]>,
- Requires<[IsARM, HasV5TE]>;
+ Requires<[IsARM, HasV5TE]>,
+ Sched<[WriteMUL16, ReadMUL, ReadMUL]>;
def WB : AMulxyI<0b0001001, 0b01, (outs GPR:$Rd), (ins GPR:$Rn, GPR:$Rm),
IIC_iMUL16, !strconcat(opc, "wb"), "\t$Rd, $Rn, $Rm",
- []>,
- Requires<[IsARM, HasV5TE]>;
+ [(set GPR:$Rd, (ARMsmulwb GPR:$Rn, GPR:$Rm))]>,
+ Requires<[IsARM, HasV5TE]>,
+ Sched<[WriteMUL16, ReadMUL, ReadMUL]>;
def WT : AMulxyI<0b0001001, 0b11, (outs GPR:$Rd), (ins GPR:$Rn, GPR:$Rm),
IIC_iMUL16, !strconcat(opc, "wt"), "\t$Rd, $Rn, $Rm",
- []>,
- Requires<[IsARM, HasV5TE]>;
+ [(set GPR:$Rd, (ARMsmulwt GPR:$Rn, GPR:$Rm))]>,
+ Requires<[IsARM, HasV5TE]>,
+ Sched<[WriteMUL16, ReadMUL, ReadMUL]>;
}
@@ -4095,7 +4173,8 @@ multiclass AI_smla<string opc> {
[(set GPRnopc:$Rd, (add GPR:$Ra,
(mul (sext_inreg GPRnopc:$Rn, i16),
(sext_inreg GPRnopc:$Rm, i16))))]>,
- Requires<[IsARM, HasV5TE, UseMulOps]>;
+ Requires<[IsARM, HasV5TE, UseMulOps]>,
+ Sched<[WriteMAC16, ReadMUL, ReadMUL, ReadMAC]>;
def BT : AMulxyIa<0b0001000, 0b10, (outs GPRnopc:$Rd),
(ins GPRnopc:$Rn, GPRnopc:$Rm, GPR:$Ra),
@@ -4103,7 +4182,8 @@ multiclass AI_smla<string opc> {
[(set GPRnopc:$Rd,
(add GPR:$Ra, (mul (sext_inreg GPRnopc:$Rn, i16),
(sra GPRnopc:$Rm, (i32 16)))))]>,
- Requires<[IsARM, HasV5TE, UseMulOps]>;
+ Requires<[IsARM, HasV5TE, UseMulOps]>,
+ Sched<[WriteMAC16, ReadMUL, ReadMUL, ReadMAC]>;
def TB : AMulxyIa<0b0001000, 0b01, (outs GPRnopc:$Rd),
(ins GPRnopc:$Rn, GPRnopc:$Rm, GPR:$Ra),
@@ -4111,7 +4191,8 @@ multiclass AI_smla<string opc> {
[(set GPRnopc:$Rd,
(add GPR:$Ra, (mul (sra GPRnopc:$Rn, (i32 16)),
(sext_inreg GPRnopc:$Rm, i16))))]>,
- Requires<[IsARM, HasV5TE, UseMulOps]>;
+ Requires<[IsARM, HasV5TE, UseMulOps]>,
+ Sched<[WriteMAC16, ReadMUL, ReadMUL, ReadMAC]>;
def TT : AMulxyIa<0b0001000, 0b11, (outs GPRnopc:$Rd),
(ins GPRnopc:$Rn, GPRnopc:$Rm, GPR:$Ra),
@@ -4119,19 +4200,24 @@ multiclass AI_smla<string opc> {
[(set GPRnopc:$Rd,
(add GPR:$Ra, (mul (sra GPRnopc:$Rn, (i32 16)),
(sra GPRnopc:$Rm, (i32 16)))))]>,
- Requires<[IsARM, HasV5TE, UseMulOps]>;
+ Requires<[IsARM, HasV5TE, UseMulOps]>,
+ Sched<[WriteMAC16, ReadMUL, ReadMUL, ReadMAC]>;
def WB : AMulxyIa<0b0001001, 0b00, (outs GPRnopc:$Rd),
(ins GPRnopc:$Rn, GPRnopc:$Rm, GPR:$Ra),
IIC_iMAC16, !strconcat(opc, "wb"), "\t$Rd, $Rn, $Rm, $Ra",
- []>,
- Requires<[IsARM, HasV5TE, UseMulOps]>;
+ [(set GPRnopc:$Rd,
+ (add GPR:$Ra, (ARMsmulwb GPRnopc:$Rn, GPRnopc:$Rm)))]>,
+ Requires<[IsARM, HasV5TE, UseMulOps]>,
+ Sched<[WriteMAC16, ReadMUL, ReadMUL, ReadMAC]>;
def WT : AMulxyIa<0b0001001, 0b10, (outs GPRnopc:$Rd),
(ins GPRnopc:$Rn, GPRnopc:$Rm, GPR:$Ra),
IIC_iMAC16, !strconcat(opc, "wt"), "\t$Rd, $Rn, $Rm, $Ra",
- []>,
- Requires<[IsARM, HasV5TE, UseMulOps]>;
+ [(set GPRnopc:$Rd,
+ (add GPR:$Ra, (ARMsmulwt GPRnopc:$Rn, GPRnopc:$Rm)))]>,
+ Requires<[IsARM, HasV5TE, UseMulOps]>,
+ Sched<[WriteMAC16, ReadMUL, ReadMUL, ReadMAC]>;
}
}
@@ -4139,25 +4225,28 @@ defm SMUL : AI_smul<"smul">;
defm SMLA : AI_smla<"smla">;
// Halfword multiply accumulate long: SMLAL<x><y>.
-def SMLALBB : AMulxyI64<0b0001010, 0b00, (outs GPRnopc:$RdLo, GPRnopc:$RdHi),
- (ins GPRnopc:$Rn, GPRnopc:$Rm),
- IIC_iMAC64, "smlalbb", "\t$RdLo, $RdHi, $Rn, $Rm", []>,
- Requires<[IsARM, HasV5TE]>;
-
-def SMLALBT : AMulxyI64<0b0001010, 0b10, (outs GPRnopc:$RdLo, GPRnopc:$RdHi),
- (ins GPRnopc:$Rn, GPRnopc:$Rm),
- IIC_iMAC64, "smlalbt", "\t$RdLo, $RdHi, $Rn, $Rm", []>,
- Requires<[IsARM, HasV5TE]>;
-
-def SMLALTB : AMulxyI64<0b0001010, 0b01, (outs GPRnopc:$RdLo, GPRnopc:$RdHi),
- (ins GPRnopc:$Rn, GPRnopc:$Rm),
- IIC_iMAC64, "smlaltb", "\t$RdLo, $RdHi, $Rn, $Rm", []>,
- Requires<[IsARM, HasV5TE]>;
-
-def SMLALTT : AMulxyI64<0b0001010, 0b11, (outs GPRnopc:$RdLo, GPRnopc:$RdHi),
- (ins GPRnopc:$Rn, GPRnopc:$Rm),
- IIC_iMAC64, "smlaltt", "\t$RdLo, $RdHi, $Rn, $Rm", []>,
- Requires<[IsARM, HasV5TE]>;
+class SMLAL<bits<2> opc1, string asm>
+ : AMulxyI64<0b0001010, opc1,
+ (outs GPRnopc:$RdLo, GPRnopc:$RdHi),
+ (ins GPRnopc:$Rn, GPRnopc:$Rm, GPRnopc:$RLo, GPRnopc:$RHi),
+ IIC_iMAC64, asm, "\t$RdLo, $RdHi, $Rn, $Rm", []>,
+ RegConstraint<"$RLo = $RdLo, $RHi = $RdHi">,
+ Requires<[IsARM, HasV5TE]>,
+ Sched<[WriteMAC64Lo, WriteMAC64Hi, ReadMUL, ReadMUL, ReadMAC, ReadMAC]>;
+
+def SMLALBB : SMLAL<0b00, "smlalbb">;
+def SMLALBT : SMLAL<0b10, "smlalbt">;
+def SMLALTB : SMLAL<0b01, "smlaltb">;
+def SMLALTT : SMLAL<0b11, "smlaltt">;
+
+def : ARMV5TEPat<(ARMsmlalbb GPR:$Rn, GPR:$Rm, GPR:$RLo, GPR:$RHi),
+ (SMLALBB $Rn, $Rm, $RLo, $RHi)>;
+def : ARMV5TEPat<(ARMsmlalbt GPR:$Rn, GPR:$Rm, GPR:$RLo, GPR:$RHi),
+ (SMLALBT $Rn, $Rm, $RLo, $RHi)>;
+def : ARMV5TEPat<(ARMsmlaltb GPR:$Rn, GPR:$Rm, GPR:$RLo, GPR:$RHi),
+ (SMLALTB $Rn, $Rm, $RLo, $RHi)>;
+def : ARMV5TEPat<(ARMsmlaltt GPR:$Rn, GPR:$Rm, GPR:$RLo, GPR:$RHi),
+ (SMLALTT $Rn, $Rm, $RLo, $RHi)>;
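As a reference for what SDT_LongMac and these patterns describe, the smlalbt
case computes the following (model code only, not the backend): accumulate
the signed product of Rn's bottom halfword and Rm's top halfword into the
64-bit value held in RdHi:RdLo.

    #include <cstdint>

    uint64_t smlalbt(uint64_t Acc, int32_t Rn, int32_t Rm) {
      int32_t Prod = (int16_t)Rn * (int16_t)(Rm >> 16); // 16x16 -> 32 signed
      return Acc + (int64_t)Prod;                       // 64-bit accumulate
    }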
// Helper class for AI_smld.
class AMulDualIbase<bit long, bit sub, bit swap, dag oops, dag iops,
@@ -4203,19 +4292,23 @@ multiclass AI_smld<bit sub, string opc> {
def D : AMulDualIa<0, sub, 0, (outs GPRnopc:$Rd),
(ins GPRnopc:$Rn, GPRnopc:$Rm, GPR:$Ra),
- NoItinerary, !strconcat(opc, "d"), "\t$Rd, $Rn, $Rm, $Ra">;
+ NoItinerary, !strconcat(opc, "d"), "\t$Rd, $Rn, $Rm, $Ra">,
+ Sched<[WriteMAC32, ReadMUL, ReadMUL, ReadMAC]>;
def DX: AMulDualIa<0, sub, 1, (outs GPRnopc:$Rd),
(ins GPRnopc:$Rn, GPRnopc:$Rm, GPR:$Ra),
- NoItinerary, !strconcat(opc, "dx"), "\t$Rd, $Rn, $Rm, $Ra">;
+ NoItinerary, !strconcat(opc, "dx"), "\t$Rd, $Rn, $Rm, $Ra">,
+ Sched<[WriteMAC32, ReadMUL, ReadMUL, ReadMAC]>;
def LD: AMulDualI64<1, sub, 0, (outs GPRnopc:$RdLo, GPRnopc:$RdHi),
(ins GPRnopc:$Rn, GPRnopc:$Rm), NoItinerary,
- !strconcat(opc, "ld"), "\t$RdLo, $RdHi, $Rn, $Rm">;
+ !strconcat(opc, "ld"), "\t$RdLo, $RdHi, $Rn, $Rm">,
+ Sched<[WriteMAC64Lo, WriteMAC64Hi, ReadMUL, ReadMUL, ReadMAC, ReadMAC]>;
def LDX : AMulDualI64<1, sub, 1, (outs GPRnopc:$RdLo, GPRnopc:$RdHi),
(ins GPRnopc:$Rn, GPRnopc:$Rm), NoItinerary,
- !strconcat(opc, "ldx"),"\t$RdLo, $RdHi, $Rn, $Rm">;
+ !strconcat(opc, "ldx"),"\t$RdLo, $RdHi, $Rn, $Rm">,
+ Sched<[WriteMUL64Lo, WriteMUL64Hi, ReadMUL, ReadMUL]>;
}
@@ -4225,9 +4318,11 @@ defm SMLS : AI_smld<1, "smls">;
multiclass AI_sdml<bit sub, string opc> {
def D:AMulDualI<0, sub, 0, (outs GPRnopc:$Rd), (ins GPRnopc:$Rn, GPRnopc:$Rm),
- NoItinerary, !strconcat(opc, "d"), "\t$Rd, $Rn, $Rm">;
+ NoItinerary, !strconcat(opc, "d"), "\t$Rd, $Rn, $Rm">,
+ Sched<[WriteMUL32, ReadMUL, ReadMUL]>;
def DX:AMulDualI<0, sub, 1, (outs GPRnopc:$Rd),(ins GPRnopc:$Rn, GPRnopc:$Rm),
- NoItinerary, !strconcat(opc, "dx"), "\t$Rd, $Rn, $Rm">;
+ NoItinerary, !strconcat(opc, "dx"), "\t$Rd, $Rn, $Rm">,
+ Sched<[WriteMUL32, ReadMUL, ReadMUL]>;
}
defm SMUA : AI_sdml<0, "smua">;
@@ -4239,12 +4334,14 @@ defm SMUS : AI_sdml<1, "smus">;
def SDIV : ADivA1I<0b001, (outs GPR:$Rd), (ins GPR:$Rn, GPR:$Rm), IIC_iDIV,
"sdiv", "\t$Rd, $Rn, $Rm",
[(set GPR:$Rd, (sdiv GPR:$Rn, GPR:$Rm))]>,
- Requires<[IsARM, HasDivideInARM]>;
+ Requires<[IsARM, HasDivideInARM]>,
+ Sched<[WriteDIV]>;
def UDIV : ADivA1I<0b011, (outs GPR:$Rd), (ins GPR:$Rn, GPR:$Rm), IIC_iDIV,
"udiv", "\t$Rd, $Rn, $Rm",
[(set GPR:$Rd, (udiv GPR:$Rn, GPR:$Rm))]>,
- Requires<[IsARM, HasDivideInARM]>;
+ Requires<[IsARM, HasDivideInARM]>,
+ Sched<[WriteDIV]>;
//===----------------------------------------------------------------------===//
// Misc. Arithmetic Instructions.
@@ -4831,14 +4928,15 @@ let AddedComplexity = 8 in {
def : ARMPat<(atomic_store_release_32 addr_offset_none:$addr, GPR:$val), (STL GPR:$val, addr_offset_none:$addr)>;
}
-// SWP/SWPB are deprecated in V6/V7.
+// SWP/SWPB are deprecated in V6/V7 and optional in v7VE.
+// FIXME Use InstAlias to generate LDREX/STREX pairs instead.
let mayLoad = 1, mayStore = 1 in {
def SWP : AIswp<0, (outs GPRnopc:$Rt),
(ins GPRnopc:$Rt2, addr_offset_none:$addr), "swp", []>,
- Requires<[PreV8]>;
+ Requires<[IsARM,PreV8]>;
def SWPB: AIswp<1, (outs GPRnopc:$Rt),
(ins GPRnopc:$Rt2, addr_offset_none:$addr), "swpb", []>,
- Requires<[PreV8]>;
+ Requires<[IsARM,PreV8]>;
}
//===----------------------------------------------------------------------===//
@@ -4850,7 +4948,7 @@ def CDP : ABI<0b1110, (outs), (ins p_imm:$cop, imm0_15:$opc1,
NoItinerary, "cdp", "\t$cop, $opc1, $CRd, $CRn, $CRm, $opc2",
[(int_arm_cdp imm:$cop, imm:$opc1, imm:$CRd, imm:$CRn,
imm:$CRm, imm:$opc2)]>,
- Requires<[PreV8]> {
+ Requires<[IsARM,PreV8]> {
bits<4> opc1;
bits<4> CRn;
bits<4> CRd;
@@ -4872,7 +4970,7 @@ def CDP2 : ABXI<0b1110, (outs), (ins p_imm:$cop, imm0_15:$opc1,
NoItinerary, "cdp2\t$cop, $opc1, $CRd, $CRn, $CRm, $opc2",
[(int_arm_cdp2 imm:$cop, imm:$opc1, imm:$CRd, imm:$CRn,
imm:$CRm, imm:$opc2)]>,
- Requires<[PreV8]> {
+ Requires<[IsARM,PreV8]> {
let Inst{31-28} = 0b1111;
bits<4> opc1;
bits<4> CRn;
@@ -5048,13 +5146,13 @@ multiclass LdSt2Cop<bit load, bit Dbit, string asm, list<dag> pattern> {
defm LDC : LdStCop <1, 0, "ldc", [(int_arm_ldc imm:$cop, imm:$CRd, addrmode5:$addr)]>;
defm LDCL : LdStCop <1, 1, "ldcl", [(int_arm_ldcl imm:$cop, imm:$CRd, addrmode5:$addr)]>;
-defm LDC2 : LdSt2Cop<1, 0, "ldc2", [(int_arm_ldc2 imm:$cop, imm:$CRd, addrmode5:$addr)]>, Requires<[PreV8]>;
-defm LDC2L : LdSt2Cop<1, 1, "ldc2l", [(int_arm_ldc2l imm:$cop, imm:$CRd, addrmode5:$addr)]>, Requires<[PreV8]>;
+defm LDC2 : LdSt2Cop<1, 0, "ldc2", [(int_arm_ldc2 imm:$cop, imm:$CRd, addrmode5:$addr)]>, Requires<[IsARM,PreV8]>;
+defm LDC2L : LdSt2Cop<1, 1, "ldc2l", [(int_arm_ldc2l imm:$cop, imm:$CRd, addrmode5:$addr)]>, Requires<[IsARM,PreV8]>;
defm STC : LdStCop <0, 0, "stc", [(int_arm_stc imm:$cop, imm:$CRd, addrmode5:$addr)]>;
defm STCL : LdStCop <0, 1, "stcl", [(int_arm_stcl imm:$cop, imm:$CRd, addrmode5:$addr)]>;
-defm STC2 : LdSt2Cop<0, 0, "stc2", [(int_arm_stc2 imm:$cop, imm:$CRd, addrmode5:$addr)]>, Requires<[PreV8]>;
-defm STC2L : LdSt2Cop<0, 1, "stc2l", [(int_arm_stc2l imm:$cop, imm:$CRd, addrmode5:$addr)]>, Requires<[PreV8]>;
+defm STC2 : LdSt2Cop<0, 0, "stc2", [(int_arm_stc2 imm:$cop, imm:$CRd, addrmode5:$addr)]>, Requires<[IsARM,PreV8]>;
+defm STC2L : LdSt2Cop<0, 1, "stc2l", [(int_arm_stc2l imm:$cop, imm:$CRd, addrmode5:$addr)]>, Requires<[IsARM,PreV8]>;
//===----------------------------------------------------------------------===//
// Move between coprocessor and ARM core register.
@@ -5132,7 +5230,7 @@ def MCR2 : MovRCopro2<"mcr2", 0 /* from ARM core register to coprocessor */,
c_imm:$CRm, imm0_7:$opc2),
[(int_arm_mcr2 imm:$cop, imm:$opc1, GPR:$Rt, imm:$CRn,
imm:$CRm, imm:$opc2)]>,
- Requires<[PreV8]>;
+ Requires<[IsARM,PreV8]>;
def : ARMInstAlias<"mcr2 $cop, $opc1, $Rt, $CRn, $CRm",
(MCR2 p_imm:$cop, imm0_7:$opc1, GPR:$Rt, c_imm:$CRn,
c_imm:$CRm, 0)>;
@@ -5140,7 +5238,7 @@ def MRC2 : MovRCopro2<"mrc2", 1 /* from coprocessor to ARM core register */,
(outs GPRwithAPSR:$Rt),
(ins p_imm:$cop, imm0_7:$opc1, c_imm:$CRn, c_imm:$CRm,
imm0_7:$opc2), []>,
- Requires<[PreV8]>;
+ Requires<[IsARM,PreV8]>;
def : ARMInstAlias<"mrc2 $cop, $opc1, $Rt, $CRn, $CRm",
(MRC2 GPRwithAPSR:$Rt, p_imm:$cop, imm0_7:$opc1, c_imm:$CRn,
c_imm:$CRm, 0)>;
@@ -5183,7 +5281,7 @@ class MovRRCopro2<string opc, bit direction, dag oops, dag iops,
list<dag> pattern = []>
: ABXI<0b1100, oops, iops, NoItinerary,
!strconcat(opc, "\t$cop, $opc1, $Rt, $Rt2, $CRm"), pattern>,
- Requires<[PreV8]> {
+ Requires<[IsARM,PreV8]> {
let Inst{31-28} = 0b1111;
let Inst{23-21} = 0b010;
let Inst{20} = direction;
@@ -5525,20 +5623,26 @@ def : ARMPat<(extloadi16 addrmodepc:$addr), (PICLDRH addrmodepc:$addr)>;
// smul* and smla*
def : ARMV5TEPat<(mul sext_16_node:$a, sext_16_node:$b),
- (SMULBB GPR:$a, GPR:$b)>;
+ (SMULBB GPR:$a, GPR:$b)>,
+ Sched<[WriteMUL32, ReadMUL, ReadMUL]>;
def : ARMV5TEPat<(mul sext_16_node:$a, (sra GPR:$b, (i32 16))),
- (SMULBT GPR:$a, GPR:$b)>;
+ (SMULBT GPR:$a, GPR:$b)>,
+ Sched<[WriteMUL32, ReadMUL, ReadMUL]>;
def : ARMV5TEPat<(mul (sra GPR:$a, (i32 16)), sext_16_node:$b),
- (SMULTB GPR:$a, GPR:$b)>;
+ (SMULTB GPR:$a, GPR:$b)>,
+ Sched<[WriteMUL32, ReadMUL, ReadMUL]>;
def : ARMV5MOPat<(add GPR:$acc,
(mul sext_16_node:$a, sext_16_node:$b)),
- (SMLABB GPR:$a, GPR:$b, GPR:$acc)>;
+ (SMLABB GPR:$a, GPR:$b, GPR:$acc)>,
+ Sched<[WriteMUL32, ReadMUL, ReadMUL]>;
def : ARMV5MOPat<(add GPR:$acc,
(mul sext_16_node:$a, (sra GPR:$b, (i32 16)))),
- (SMLABT GPR:$a, GPR:$b, GPR:$acc)>;
+ (SMLABT GPR:$a, GPR:$b, GPR:$acc)>,
+ Sched<[WriteMUL32, ReadMUL, ReadMUL]>;
def : ARMV5MOPat<(add GPR:$acc,
(mul (sra GPR:$a, (i32 16)), sext_16_node:$b)),
- (SMLATB GPR:$a, GPR:$b, GPR:$acc)>;
+ (SMLATB GPR:$a, GPR:$b, GPR:$acc)>,
+ Sched<[WriteMUL32, ReadMUL, ReadMUL]>;
// Pre-v7 uses MCR for synchronization barriers.
def : ARMPat<(ARMMemBarrierMCR GPR:$zero), (MCR 15, 0, GPR:$zero, 7, 10, 5)>,
@@ -5717,33 +5821,49 @@ def : MnemonicAlias<"usubaddx", "usax">;
// "mov Rd, mod_imm_not" can be handled via "mvn" in assembly, just like
// for isel.
-def : ARMInstAlias<"mov${s}${p} $Rd, $imm",
+def : ARMInstSubst<"mov${s}${p} $Rd, $imm",
(MVNi rGPR:$Rd, mod_imm_not:$imm, pred:$p, cc_out:$s)>;
-def : ARMInstAlias<"mvn${s}${p} $Rd, $imm",
+def : ARMInstSubst<"mvn${s}${p} $Rd, $imm",
(MOVi rGPR:$Rd, mod_imm_not:$imm, pred:$p, cc_out:$s)>;
// Same for AND <--> BIC
-def : ARMInstAlias<"bic${s}${p} $Rd, $Rn, $imm",
+def : ARMInstSubst<"bic${s}${p} $Rd, $Rn, $imm",
(ANDri GPR:$Rd, GPR:$Rn, mod_imm_not:$imm,
pred:$p, cc_out:$s)>;
-def : ARMInstAlias<"bic${s}${p} $Rdn, $imm",
+def : ARMInstSubst<"bic${s}${p} $Rdn, $imm",
(ANDri GPR:$Rdn, GPR:$Rdn, mod_imm_not:$imm,
pred:$p, cc_out:$s)>;
-def : ARMInstAlias<"and${s}${p} $Rd, $Rn, $imm",
+def : ARMInstSubst<"and${s}${p} $Rd, $Rn, $imm",
(BICri GPR:$Rd, GPR:$Rn, mod_imm_not:$imm,
pred:$p, cc_out:$s)>;
-def : ARMInstAlias<"and${s}${p} $Rdn, $imm",
+def : ARMInstSubst<"and${s}${p} $Rdn, $imm",
(BICri GPR:$Rdn, GPR:$Rdn, mod_imm_not:$imm,
pred:$p, cc_out:$s)>;
// Likewise, "add Rd, mod_imm_neg" -> sub
-def : ARMInstAlias<"add${s}${p} $Rd, $Rn, $imm",
+def : ARMInstSubst<"add${s}${p} $Rd, $Rn, $imm",
(SUBri GPR:$Rd, GPR:$Rn, mod_imm_neg:$imm, pred:$p, cc_out:$s)>;
-def : ARMInstAlias<"add${s}${p} $Rd, $imm",
+def : ARMInstSubst<"add${s}${p} $Rd, $imm",
(SUBri GPR:$Rd, GPR:$Rd, mod_imm_neg:$imm, pred:$p, cc_out:$s)>;
+// Likewise, "sub Rd, mod_imm_neg" -> add
+def : ARMInstSubst<"sub${s}${p} $Rd, $Rn, $imm",
+ (ADDri GPR:$Rd, GPR:$Rn, mod_imm_neg:$imm, pred:$p, cc_out:$s)>;
+def : ARMInstSubst<"sub${s}${p} $Rd, $imm",
+ (ADDri GPR:$Rd, GPR:$Rd, mod_imm_neg:$imm, pred:$p, cc_out:$s)>;
+
+def : ARMInstSubst<"adc${s}${p} $Rd, $Rn, $imm",
+ (SBCri GPR:$Rd, GPR:$Rn, mod_imm_not:$imm, pred:$p, cc_out:$s)>;
+def : ARMInstSubst<"adc${s}${p} $Rdn, $imm",
+ (SBCri GPR:$Rdn, GPR:$Rdn, mod_imm_not:$imm, pred:$p, cc_out:$s)>;
+def : ARMInstSubst<"sbc${s}${p} $Rd, $Rn, $imm",
+ (ADCri GPR:$Rd, GPR:$Rn, mod_imm_not:$imm, pred:$p, cc_out:$s)>;
+def : ARMInstSubst<"sbc${s}${p} $Rdn, $imm",
+ (ADCri GPR:$Rdn, GPR:$Rdn, mod_imm_not:$imm, pred:$p, cc_out:$s)>;
+
// Same for CMP <--> CMN via mod_imm_neg
-def : ARMInstAlias<"cmp${p} $Rd, $imm",
+def : ARMInstSubst<"cmp${p} $Rd, $imm",
(CMNri rGPR:$Rd, mod_imm_neg:$imm, pred:$p)>;
-def : ARMInstAlias<"cmn${p} $Rd, $imm",
+def : ARMInstSubst<"cmn${p} $Rd, $imm",
(CMPri rGPR:$Rd, mod_imm_neg:$imm, pred:$p)>;
// The shifter forms of the MOV instruction are aliased to the ASR, LSL,
diff --git a/lib/Target/ARM/ARMInstrNEON.td b/lib/Target/ARM/ARMInstrNEON.td
index b5fa8e999e2a..681e235d78f0 100644
--- a/lib/Target/ARM/ARMInstrNEON.td
+++ b/lib/Target/ARM/ARMInstrNEON.td
@@ -7139,6 +7139,17 @@ let Predicates = [IsBE] in {
(f64 (IMPLICIT_DEF)), (i32 0)))), dsub_0)), dsub_0))>;
}
+def : Pat<(v2i64 (concat_vectors DPR:$Dn, DPR:$Dm)),
+ (REG_SEQUENCE QPR, DPR:$Dn, dsub_0, DPR:$Dm, dsub_1)>;
+def : Pat<(v4i32 (concat_vectors DPR:$Dn, DPR:$Dm)),
+ (REG_SEQUENCE QPR, DPR:$Dn, dsub_0, DPR:$Dm, dsub_1)>;
+def : Pat<(v8i16 (concat_vectors DPR:$Dn, DPR:$Dm)),
+ (REG_SEQUENCE QPR, DPR:$Dn, dsub_0, DPR:$Dm, dsub_1)>;
+def : Pat<(v16i8 (concat_vectors DPR:$Dn, DPR:$Dm)),
+ (REG_SEQUENCE QPR, DPR:$Dn, dsub_0, DPR:$Dm, dsub_1)>;
+def : Pat<(v4f32 (concat_vectors DPR:$Dn, DPR:$Dm)),
+ (REG_SEQUENCE QPR, DPR:$Dn, dsub_0, DPR:$Dm, dsub_1)>;
+
//===----------------------------------------------------------------------===//
// Assembler aliases
//
diff --git a/lib/Target/ARM/ARMInstrThumb.td b/lib/Target/ARM/ARMInstrThumb.td
index a681f64b05e6..f2f426e86701 100644
--- a/lib/Target/ARM/ARMInstrThumb.td
+++ b/lib/Target/ARM/ARMInstrThumb.td
@@ -19,7 +19,7 @@ def imm_sr_XFORM: SDNodeXForm<imm, [{
unsigned Imm = N->getZExtValue();
return CurDAG->getTargetConstant((Imm == 32 ? 0 : Imm), SDLoc(N), MVT::i32);
}]>;
-def ThumbSRImmAsmOperand: AsmOperandClass { let Name = "ImmThumbSR"; }
+def ThumbSRImmAsmOperand: ImmAsmOperand<1,32> { let Name = "ImmThumbSR"; }
def imm_sr : Operand<i32>, PatLeaf<(imm), [{
uint64_t Imm = N->getZExtValue();
return Imm > 0 && Imm <= 32;
@@ -28,22 +28,31 @@ def imm_sr : Operand<i32>, PatLeaf<(imm), [{
let ParserMatchClass = ThumbSRImmAsmOperand;
}
-def imm_comp_XFORM : SDNodeXForm<imm, [{
- return CurDAG->getTargetConstant(~((uint32_t)N->getZExtValue()), SDLoc(N),
- MVT::i32);
-}]>;
-
def imm0_7_neg : PatLeaf<(i32 imm), [{
return (uint32_t)-N->getZExtValue() < 8;
}], imm_neg_XFORM>;
+def ThumbModImmNeg1_7AsmOperand : AsmOperandClass { let Name = "ThumbModImmNeg1_7"; }
+def mod_imm1_7_neg : Operand<i32>, PatLeaf<(imm), [{
+ unsigned Value = -(unsigned)N->getZExtValue();
+ return 0 < Value && Value < 8;
+ }], imm_neg_XFORM> {
+ let ParserMatchClass = ThumbModImmNeg1_7AsmOperand;
+}
+
+def ThumbModImmNeg8_255AsmOperand : AsmOperandClass { let Name = "ThumbModImmNeg8_255"; }
+def mod_imm8_255_neg : Operand<i32>, PatLeaf<(imm), [{
+ unsigned Value = -(unsigned)N->getZExtValue();
+ return 7 < Value && Value < 256;
+ }], imm_neg_XFORM> {
+ let ParserMatchClass = ThumbModImmNeg8_255AsmOperand;
+}
+
def imm0_255_comp : PatLeaf<(i32 imm), [{
return ~((uint32_t)N->getZExtValue()) < 256;
}]>;
-def imm8_255 : ImmLeaf<i32, [{
- return Imm >= 8 && Imm < 256;
-}]>;
def imm8_255_neg : PatLeaf<(i32 imm), [{
unsigned Val = -N->getZExtValue();
return Val >= 8 && Val < 256;
@@ -407,9 +416,9 @@ def tSUBspi : T1pIt<(outs GPRsp:$Rdn), (ins GPRsp:$Rn, t_imm0_508s4:$imm),
let DecoderMethod = "DecodeThumbAddSPImm";
}
-def : tInstAlias<"add${p} sp, $imm",
+def : tInstSubst<"add${p} sp, $imm",
(tSUBspi SP, t_imm0_508s4_neg:$imm, pred:$p)>;
-def : tInstAlias<"add${p} sp, sp, $imm",
+def : tInstSubst<"add${p} sp, sp, $imm",
(tSUBspi SP, t_imm0_508s4_neg:$imm, pred:$p)>;
// Can optionally specify SP as a three operand instruction.
@@ -910,7 +919,7 @@ let isAdd = 1 in {
def tADC : // A8.6.2
T1sItDPEncode<0b0101, (outs tGPR:$Rdn), (ins tGPR:$Rn, tGPR:$Rm), IIC_iALUr,
"adc", "\t$Rdn, $Rm",
- [(set tGPR:$Rdn, (adde tGPR:$Rn, tGPR:$Rm))]>, Sched<[WriteALU]>;
+ []>, Sched<[WriteALU]>;
// Add immediate
def tADDi3 : // A8.6.4 T1
@@ -938,6 +947,43 @@ let isAdd = 1 in {
"add", "\t$Rd, $Rn, $Rm",
[(set tGPR:$Rd, (add tGPR:$Rn, tGPR:$Rm))]>, Sched<[WriteALU]>;
+ /// Similar to the above except these set the 's' bit so the
+ /// instruction modifies the CPSR register.
+ ///
+ /// These opcodes will be converted to the real non-S opcodes by
+ /// AdjustInstrPostInstrSelection after giving them an optional CPSR operand.
+ let hasPostISelHook = 1, Defs = [CPSR] in {
+ let isCommutable = 1 in
+ def tADCS : tPseudoInst<(outs tGPR:$Rdn), (ins tGPR:$Rn, tGPR:$Rm),
+ 2, IIC_iALUr,
+ [(set tGPR:$Rdn, CPSR, (ARMadde tGPR:$Rn, tGPR:$Rm,
+ CPSR))]>,
+ Requires<[IsThumb1Only]>,
+ Sched<[WriteALU]>;
+
+ def tADDSi3 : tPseudoInst<(outs tGPR:$Rd), (ins tGPR:$Rm, imm0_7:$imm3),
+ 2, IIC_iALUi,
+ [(set tGPR:$Rd, CPSR, (ARMaddc tGPR:$Rm,
+ imm0_7:$imm3))]>,
+ Requires<[IsThumb1Only]>,
+ Sched<[WriteALU]>;
+
+ def tADDSi8 : tPseudoInst<(outs tGPR:$Rdn), (ins tGPR:$Rn, imm0_255:$imm8),
+ 2, IIC_iALUi,
+ [(set tGPR:$Rdn, CPSR, (ARMaddc tGPR:$Rn,
+ imm8_255:$imm8))]>,
+ Requires<[IsThumb1Only]>,
+ Sched<[WriteALU]>;
+
+ let isCommutable = 1 in
+ def tADDSrr : tPseudoInst<(outs tGPR:$Rd), (ins tGPR:$Rn, tGPR:$Rm),
+ 2, IIC_iALUr,
+ [(set tGPR:$Rd, CPSR, (ARMaddc tGPR:$Rn,
+ tGPR:$Rm))]>,
+ Requires<[IsThumb1Only]>,
+ Sched<[WriteALU]>;
+ }
+
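The ARMaddc/ARMadde chain these pseudos select implements ordinary carry
propagation. As plain arithmetic (a model of the node semantics, not the
ISel code):

    #include <cstdint>

    struct AddResult { uint32_t Value; bool Carry; };

    // tADDSrr: Rd = Rn + Rm, defining the carry flag (ARMaddc).
    AddResult adds(uint32_t A, uint32_t B) {
      uint32_t Sum = A + B;
      return {Sum, Sum < A}; // unsigned overflow sets C
    }
    // tADCS: Rd = Rn + Rm + C, reading and defining the flag (ARMadde).
    AddResult adcs(uint32_t A, uint32_t B, bool C) {
      uint64_t Wide = (uint64_t)A + B + C;
      return {(uint32_t)Wide, (Wide >> 32) != 0};
    }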
let hasSideEffects = 0 in
def tADDhirr : T1pIt<(outs GPR:$Rdn), (ins GPR:$Rn, GPR:$Rm), IIC_iALUr,
"add", "\t$Rdn, $Rm", []>,
@@ -951,6 +997,12 @@ let isAdd = 1 in {
}
}
+def : tInstSubst<"sub${s}${p} $rd, $rn, $imm",
+ (tADDi3 tGPR:$rd, s_cc_out:$s, tGPR:$rn, mod_imm1_7_neg:$imm, pred:$p)>;
+def : tInstSubst<"sub${s}${p} $rdn, $imm",
+ (tADDi8 tGPR:$rdn, s_cc_out:$s, mod_imm8_255_neg:$imm, pred:$p)>;
+
// AND register
let isCommutable = 1 in
def tAND : // A8.6.12
@@ -1197,7 +1249,7 @@ def tSBC : // A8.6.151
T1sItDPEncode<0b0110, (outs tGPR:$Rdn), (ins tGPR:$Rn, tGPR:$Rm),
IIC_iALUr,
"sbc", "\t$Rdn, $Rm",
- [(set tGPR:$Rdn, (sube tGPR:$Rn, tGPR:$Rm))]>,
+ []>,
Sched<[WriteALU]>;
// Subtract immediate
@@ -1218,6 +1270,14 @@ def tSUBi8 : // A8.6.210 T2
[(set tGPR:$Rdn, (add tGPR:$Rn, imm8_255_neg:$imm8))]>,
Sched<[WriteALU]>;
+def : tInstSubst<"add${s}${p} $rd, $rn, $imm",
+ (tSUBi3 tGPR:$rd, s_cc_out:$s, tGPR:$rn, mod_imm1_7_neg:$imm, pred:$p)>;
+
+def : tInstSubst<"add${s}${p} $rdn, $imm",
+ (tSUBi8 tGPR:$rdn, s_cc_out:$s, mod_imm8_255_neg:$imm, pred:$p)>;
+
// Subtract register
def tSUBrr : // A8.6.212
T1sIGenEncode<0b01101, (outs tGPR:$Rd), (ins tGPR:$Rn, tGPR:$Rm),
@@ -1226,6 +1286,41 @@ def tSUBrr : // A8.6.212
[(set tGPR:$Rd, (sub tGPR:$Rn, tGPR:$Rm))]>,
Sched<[WriteALU]>;
+/// Similar to the above except these set the 's' bit so the
+/// instruction modifies the CPSR register.
+///
+/// These opcodes will be converted to the real non-S opcodes by
+/// AdjustInstrPostInstrSelection after giving them an optional CPSR operand.
+let hasPostISelHook = 1, Defs = [CPSR] in {
+ def tSBCS : tPseudoInst<(outs tGPR:$Rdn), (ins tGPR:$Rn, tGPR:$Rm),
+ 2, IIC_iALUr,
+ [(set tGPR:$Rdn, CPSR, (ARMsube tGPR:$Rn, tGPR:$Rm,
+ CPSR))]>,
+ Requires<[IsThumb1Only]>,
+ Sched<[WriteALU]>;
+
+ def tSUBSi3 : tPseudoInst<(outs tGPR:$Rd), (ins tGPR:$Rm, imm0_7:$imm3),
+ 2, IIC_iALUi,
+ [(set tGPR:$Rd, CPSR, (ARMsubc tGPR:$Rm,
+ imm0_7:$imm3))]>,
+ Requires<[IsThumb1Only]>,
+ Sched<[WriteALU]>;
+
+ def tSUBSi8 : tPseudoInst<(outs tGPR:$Rdn), (ins tGPR:$Rn, imm0_255:$imm8),
+ 2, IIC_iALUi,
+ [(set tGPR:$Rdn, CPSR, (ARMsubc tGPR:$Rn,
+ imm8_255:$imm8))]>,
+ Requires<[IsThumb1Only]>,
+ Sched<[WriteALU]>;
+
+ def tSUBSrr : tPseudoInst<(outs tGPR:$Rd), (ins tGPR:$Rn, tGPR:$Rm),
+ 2, IIC_iALUr,
+ [(set tGPR:$Rd, CPSR, (ARMsubc tGPR:$Rn,
+ tGPR:$Rm))]>,
+ Requires<[IsThumb1Only]>,
+ Sched<[WriteALU]>;
+}
+
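The subtracting pseudos follow the same shape, with the ARM convention that
C means "no borrow" (again a model of the node semantics only):

    #include <cstdint>

    struct SubResult { uint32_t Value; bool Carry; };

    // tSUBSrr: Rd = Rn - Rm; C is set when Rn >= Rm (ARMsubc).
    SubResult subs(uint32_t A, uint32_t B) { return {A - B, A >= B}; }
    // tSBCS: Rd = Rn - Rm - (1 - C) (ARMsube).
    SubResult sbcs(uint32_t A, uint32_t B, bool C) {
      uint64_t Wide = (uint64_t)A - B - (C ? 0 : 1);
      return {(uint32_t)Wide, (Wide >> 32) == 0}; // borrow clears C
    }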
// Sign-extend byte
def tSXTB : // A8.6.222
T1pIMiscEncode<{0,0,1,0,0,1,?}, (outs tGPR:$Rd), (ins tGPR:$Rm),
@@ -1386,22 +1481,6 @@ def : T1Pat<(ARMcmpZ tGPR:$Rn, imm0_255:$imm8),
def : T1Pat<(ARMcmpZ tGPR:$Rn, tGPR:$Rm),
(tCMPr tGPR:$Rn, tGPR:$Rm)>;
-// Add with carry
-def : T1Pat<(addc tGPR:$lhs, imm0_7:$rhs),
- (tADDi3 tGPR:$lhs, imm0_7:$rhs)>;
-def : T1Pat<(addc tGPR:$lhs, imm8_255:$rhs),
- (tADDi8 tGPR:$lhs, imm8_255:$rhs)>;
-def : T1Pat<(addc tGPR:$lhs, tGPR:$rhs),
- (tADDrr tGPR:$lhs, tGPR:$rhs)>;
-
-// Subtract with carry
-def : T1Pat<(addc tGPR:$lhs, imm0_7_neg:$rhs),
- (tSUBi3 tGPR:$lhs, imm0_7_neg:$rhs)>;
-def : T1Pat<(addc tGPR:$lhs, imm8_255_neg:$rhs),
- (tSUBi8 tGPR:$lhs, imm8_255_neg:$rhs)>;
-def : T1Pat<(subc tGPR:$lhs, tGPR:$rhs),
- (tSUBrr tGPR:$lhs, tGPR:$rhs)>;
-
// Bswap 16 with load/store
def : T1Pat<(srl (bswap (extloadi16 t_addrmode_is2:$addr)), (i32 16)),
(tREV16 (tLDRHi t_addrmode_is2:$addr))>;
@@ -1477,7 +1556,7 @@ def : T1Pat<(extloadi16 t_addrmode_rr:$addr), (tLDRHr t_addrmode_rr:$addr)>;
// post-inc LDR -> LDM r0!, {r1}. The way operands are laid out in LDMs is
// different to how ISel expects them for a post-inc load, so use a pseudo
// and expand it just after ISel.
-let usesCustomInserter = 1,
+let usesCustomInserter = 1, mayLoad = 1,
Constraints = "$Rn = $Rn_wb,@earlyclobber $Rn_wb" in
def tLDR_postidx: tPseudoInst<(outs rGPR:$Rt, rGPR:$Rn_wb),
(ins rGPR:$Rn, pred:$p),
@@ -1547,7 +1626,7 @@ def : T1Pat<(i32 thumb_immshifted:$src),
(thumb_immshifted_shamt imm:$src))>;
def : T1Pat<(i32 imm0_255_comp:$src),
- (tMVN (tMOVi8 (imm_comp_XFORM imm:$src)))>;
+ (tMVN (tMOVi8 (imm_not_XFORM imm:$src)))>;
def : T1Pat<(i32 imm256_510:$src),
(tADDi8 (tMOVi8 255),
diff --git a/lib/Target/ARM/ARMInstrThumb2.td b/lib/Target/ARM/ARMInstrThumb2.td
index 603d66403e65..f5b673b78ad7 100644
--- a/lib/Target/ARM/ARMInstrThumb2.td
+++ b/lib/Target/ARM/ARMInstrThumb2.td
@@ -76,7 +76,11 @@ def t2_so_imm_notSext16_XFORM : SDNodeXForm<imm, [{
// t2_so_imm - Match a 32-bit immediate operand, which is an
// 8-bit immediate rotated by an arbitrary number of bits, or an 8-bit
// immediate splatted into multiple bytes of the word.
-def t2_so_imm_asmoperand : ImmAsmOperand { let Name = "T2SOImm"; }
+def t2_so_imm_asmoperand : AsmOperandClass {
+ let Name = "T2SOImm";
+ let RenderMethod = "addImmOperands";
+}
def t2_so_imm : Operand<i32>, ImmLeaf<i32, [{
return ARM_AM::getT2SOImmVal(Imm) != -1;
}]> {
@@ -110,15 +114,14 @@ def t2_so_imm_notSext : Operand<i32>, PatLeaf<(imm), [{
// t2_so_imm_neg - Match an immediate that is a negation of a t2_so_imm.
def t2_so_imm_neg_asmoperand : AsmOperandClass { let Name = "T2SOImmNeg"; }
-def t2_so_imm_neg : Operand<i32>, PatLeaf<(imm), [{
- int64_t Value = -(int)N->getZExtValue();
- return Value && ARM_AM::getT2SOImmVal(Value) != -1;
+def t2_so_imm_neg : Operand<i32>, ImmLeaf<i32, [{
+ return Imm && ARM_AM::getT2SOImmVal(-(uint32_t)Imm) != -1;
}], t2_so_imm_neg_XFORM> {
let ParserMatchClass = t2_so_imm_neg_asmoperand;
}
-/// imm0_4095 predicate - True if the 32-bit immediate is in the range [0.4095].
-def imm0_4095_asmoperand: ImmAsmOperand { let Name = "Imm0_4095"; }
+/// imm0_4095 predicate - True if the 32-bit immediate is in the range [0,4095].
+def imm0_4095_asmoperand: ImmAsmOperand<0,4095> { let Name = "Imm0_4095"; }
def imm0_4095 : Operand<i32>, ImmLeaf<i32, [{
return Imm >= 0 && Imm < 4096;
}]> {
@@ -139,7 +142,7 @@ def imm1_255_neg : PatLeaf<(i32 imm), [{
def imm0_255_not : PatLeaf<(i32 imm), [{
return (uint32_t)(~N->getZExtValue()) < 255;
-}], imm_comp_XFORM>;
+}], imm_not_XFORM>;
def lo5AllOne : PatLeaf<(i32 imm), [{
// Returns true if all low 5-bits are 1.
@@ -538,7 +541,8 @@ class T2FourReg<dag oops, dag iops, InstrItinClass itin,
class T2MulLong<bits<3> opc22_20, bits<4> opc7_4,
string opc, list<dag> pattern>
: T2I<(outs rGPR:$RdLo, rGPR:$RdHi), (ins rGPR:$Rn, rGPR:$Rm), IIC_iMUL64,
- opc, "\t$RdLo, $RdHi, $Rn, $Rm", pattern> {
+ opc, "\t$RdLo, $RdHi, $Rn, $Rm", pattern>,
+ Sched<[WriteMUL64Lo, WriteMUL64Hi, ReadMUL, ReadMUL]> {
bits<4> RdLo;
bits<4> RdHi;
bits<4> Rn;
@@ -556,7 +560,8 @@ class T2MlaLong<bits<3> opc22_20, bits<4> opc7_4, string opc>
: T2I<(outs rGPR:$RdLo, rGPR:$RdHi),
(ins rGPR:$Rn, rGPR:$Rm, rGPR:$RLo, rGPR:$RHi), IIC_iMAC64,
opc, "\t$RdLo, $RdHi, $Rn, $Rm", []>,
- RegConstraint<"$RLo = $RdLo, $RHi = $RdHi"> {
+ RegConstraint<"$RLo = $RdLo, $RHi = $RdHi">,
+ Sched<[WriteMAC64Lo, WriteMAC64Hi, ReadMUL, ReadMUL, ReadMAC, ReadMAC]> {
bits<4> RdLo;
bits<4> RdHi;
bits<4> Rn;
@@ -977,7 +982,8 @@ multiclass T2I_ld<bit signed, bits<2> opcod, string opc,
PatFrag opnode> {
def i12 : T2Ii12<(outs target:$Rt), (ins t2addrmode_imm12:$addr), iii,
opc, ".w\t$Rt, $addr",
- [(set target:$Rt, (opnode t2addrmode_imm12:$addr))]> {
+ [(set target:$Rt, (opnode t2addrmode_imm12:$addr))]>,
+ Sched<[WriteLd]> {
bits<4> Rt;
bits<17> addr;
let Inst{31-25} = 0b1111100;
@@ -993,7 +999,8 @@ multiclass T2I_ld<bit signed, bits<2> opcod, string opc,
}
def i8 : T2Ii8 <(outs target:$Rt), (ins t2addrmode_negimm8:$addr), iii,
opc, "\t$Rt, $addr",
- [(set target:$Rt, (opnode t2addrmode_negimm8:$addr))]> {
+ [(set target:$Rt, (opnode t2addrmode_negimm8:$addr))]>,
+ Sched<[WriteLd]> {
bits<4> Rt;
bits<13> addr;
let Inst{31-27} = 0b11111;
@@ -1015,7 +1022,8 @@ multiclass T2I_ld<bit signed, bits<2> opcod, string opc,
}
def s : T2Iso <(outs target:$Rt), (ins t2addrmode_so_reg:$addr), iis,
opc, ".w\t$Rt, $addr",
- [(set target:$Rt, (opnode t2addrmode_so_reg:$addr))]> {
+ [(set target:$Rt, (opnode t2addrmode_so_reg:$addr))]>,
+ Sched<[WriteLd]> {
let Inst{31-27} = 0b11111;
let Inst{26-25} = 0b00;
let Inst{24} = signed;
@@ -1039,7 +1047,8 @@ multiclass T2I_ld<bit signed, bits<2> opcod, string opc,
// from the PC.
def pci : T2Ipc <(outs target:$Rt), (ins t2ldrlabel:$addr), iii,
opc, ".w\t$Rt, $addr",
- [(set target:$Rt, (opnode (ARMWrapper tconstpool:$addr)))]> {
+ [(set target:$Rt, (opnode (ARMWrapper tconstpool:$addr)))]>,
+ Sched<[WriteLd]> {
let isReMaterializable = 1;
let Inst{31-27} = 0b11111;
let Inst{26-25} = 0b00;
@@ -1065,7 +1074,8 @@ multiclass T2I_st<bits<2> opcod, string opc,
PatFrag opnode> {
def i12 : T2Ii12<(outs), (ins target:$Rt, t2addrmode_imm12:$addr), iii,
opc, ".w\t$Rt, $addr",
- [(opnode target:$Rt, t2addrmode_imm12:$addr)]> {
+ [(opnode target:$Rt, t2addrmode_imm12:$addr)]>,
+ Sched<[WriteST]> {
let Inst{31-27} = 0b11111;
let Inst{26-23} = 0b0001;
let Inst{22-21} = opcod;
@@ -1082,7 +1092,8 @@ multiclass T2I_st<bits<2> opcod, string opc,
}
def i8 : T2Ii8 <(outs), (ins target:$Rt, t2addrmode_negimm8:$addr), iii,
opc, "\t$Rt, $addr",
- [(opnode target:$Rt, t2addrmode_negimm8:$addr)]> {
+ [(opnode target:$Rt, t2addrmode_negimm8:$addr)]>,
+ Sched<[WriteST]> {
let Inst{31-27} = 0b11111;
let Inst{26-23} = 0b0000;
let Inst{22-21} = opcod;
@@ -1102,7 +1113,8 @@ multiclass T2I_st<bits<2> opcod, string opc,
}
def s : T2Iso <(outs), (ins target:$Rt, t2addrmode_so_reg:$addr), iis,
opc, ".w\t$Rt, $addr",
- [(opnode target:$Rt, t2addrmode_so_reg:$addr)]> {
+ [(opnode target:$Rt, t2addrmode_so_reg:$addr)]>,
+ Sched<[WriteST]> {
let Inst{31-27} = 0b11111;
let Inst{26-23} = 0b0000;
let Inst{22-21} = opcod;
@@ -1121,28 +1133,10 @@ multiclass T2I_st<bits<2> opcod, string opc,
/// T2I_ext_rrot - A unary operation with two forms: one whose operand is a
/// register and one whose operand is a register rotated by 8/16/24.
-class T2I_ext_rrot<bits<3> opcod, string opc, PatFrag opnode>
- : T2TwoReg<(outs rGPR:$Rd), (ins rGPR:$Rm, rot_imm:$rot), IIC_iEXTr,
- opc, ".w\t$Rd, $Rm$rot",
- [(set rGPR:$Rd, (opnode (rotr rGPR:$Rm, rot_imm:$rot)))]>,
- Requires<[IsThumb2]> {
- let Inst{31-27} = 0b11111;
- let Inst{26-23} = 0b0100;
- let Inst{22-20} = opcod;
- let Inst{19-16} = 0b1111; // Rn
- let Inst{15-12} = 0b1111;
- let Inst{7} = 1;
-
- bits<2> rot;
- let Inst{5-4} = rot{1-0}; // rotate
-}
-
-// UXTB16 - Requres T2ExtractPack, does not need the .w qualifier.
-class T2I_ext_rrot_uxtb16<bits<3> opcod, string opc, PatFrag opnode>
- : T2TwoReg<(outs rGPR:$Rd), (ins rGPR:$Rm, rot_imm:$rot),
- IIC_iEXTr, opc, "\t$Rd, $Rm$rot",
- [(set rGPR:$Rd, (opnode (rotr rGPR:$Rm, rot_imm:$rot)))]>,
- Requires<[HasT2ExtractPack, IsThumb2]> {
+class T2I_ext_rrot_base<bits<3> opcod, dag iops, dag oops,
+ string opc, string oprs,
+ list<dag> pattern>
+ : T2TwoReg<iops, oops, IIC_iEXTr, opc, oprs, pattern> {
bits<2> rot;
let Inst{31-27} = 0b11111;
let Inst{26-23} = 0b0100;
@@ -1150,46 +1144,34 @@ class T2I_ext_rrot_uxtb16<bits<3> opcod, string opc, PatFrag opnode>
let Inst{19-16} = 0b1111; // Rn
let Inst{15-12} = 0b1111;
let Inst{7} = 1;
- let Inst{5-4} = rot;
-}
-
-// SXTB16 - Requres T2ExtractPack, does not need the .w qualifier, no pattern
-// supported yet.
-class T2I_ext_rrot_sxtb16<bits<3> opcod, string opc>
- : T2TwoReg<(outs rGPR:$Rd), (ins rGPR:$Rm, rot_imm:$rot), IIC_iEXTr,
- opc, "\t$Rd, $Rm$rot", []>,
- Requires<[IsThumb2, HasT2ExtractPack]> {
- bits<2> rot;
- let Inst{31-27} = 0b11111;
- let Inst{26-23} = 0b0100;
- let Inst{22-20} = opcod;
- let Inst{19-16} = 0b1111; // Rn
- let Inst{15-12} = 0b1111;
- let Inst{7} = 1;
- let Inst{5-4} = rot;
-}
+ let Inst{5-4} = rot; // rotate
+}
+
+class T2I_ext_rrot<bits<3> opcod, string opc>
+ : T2I_ext_rrot_base<opcod,
+ (outs rGPR:$Rd),
+ (ins rGPR:$Rm, rot_imm:$rot),
+ opc, ".w\t$Rd, $Rm$rot", []>,
+ Requires<[IsThumb2]>,
+ Sched<[WriteALU, ReadALU]>;
+
+// UXTB16, SXTB16 - Requires HasDSP, does not need the .w qualifier.
+class T2I_ext_rrot_xtb16<bits<3> opcod, string opc>
+ : T2I_ext_rrot_base<opcod,
+ (outs rGPR:$Rd),
+ (ins rGPR:$Rm, rot_imm:$rot),
+ opc, "\t$Rd, $Rm$rot", []>,
+ Requires<[HasDSP, IsThumb2]>,
+ Sched<[WriteALU, ReadALU]>;
/// T2I_exta_rrot - A binary operation with two forms: one whose operand is a
/// register and one whose operand is a register rotated by 8/16/24.
-class T2I_exta_rrot<bits<3> opcod, string opc, PatFrag opnode>
+class T2I_exta_rrot<bits<3> opcod, string opc>
: T2ThreeReg<(outs rGPR:$Rd),
(ins rGPR:$Rn, rGPR:$Rm, rot_imm:$rot),
- IIC_iEXTAsr, opc, "\t$Rd, $Rn, $Rm$rot",
- [(set rGPR:$Rd, (opnode rGPR:$Rn, (rotr rGPR:$Rm,rot_imm:$rot)))]>,
- Requires<[HasT2ExtractPack, IsThumb2]> {
- bits<2> rot;
- let Inst{31-27} = 0b11111;
- let Inst{26-23} = 0b0100;
- let Inst{22-20} = opcod;
- let Inst{15-12} = 0b1111;
- let Inst{7} = 1;
- let Inst{5-4} = rot;
-}
-
-class T2I_exta_rrot_np<bits<3> opcod, string opc>
- : T2ThreeReg<(outs rGPR:$Rd), (ins rGPR:$Rn, rGPR:$Rm,rot_imm:$rot),
IIC_iEXTAsr, opc, "\t$Rd, $Rn, $Rm$rot", []>,
- Requires<[HasT2ExtractPack, IsThumb2]> {
+ Requires<[HasDSP, IsThumb2]>,
+ Sched<[WriteALU, ReadALU]> {
bits<2> rot;
let Inst{31-27} = 0b11111;
let Inst{26-23} = 0b0100;
@@ -1279,7 +1261,8 @@ let mayLoad = 1, hasSideEffects = 0, hasExtraDefRegAllocReq = 1 in {
// Load doubleword
def t2LDRDi8 : T2Ii8s4<1, 0, 1, (outs rGPR:$Rt, rGPR:$Rt2),
(ins t2addrmode_imm8s4:$addr),
- IIC_iLoad_d_i, "ldrd", "\t$Rt, $Rt2, $addr", "", []>;
+ IIC_iLoad_d_i, "ldrd", "\t$Rt, $Rt2, $addr", "", []>,
+ Sched<[WriteLd]>;
} // mayLoad = 1, hasSideEffects = 0, hasExtraDefRegAllocReq = 1
// zextload i1 -> zextload i8
@@ -1333,17 +1316,20 @@ let mayLoad = 1, hasSideEffects = 0 in {
def t2LDR_PRE : T2Ipreldst<0, 0b10, 1, 1, (outs GPR:$Rt, GPR:$Rn_wb),
(ins t2addrmode_imm8_pre:$addr),
AddrModeT2_i8, IndexModePre, IIC_iLoad_iu,
- "ldr", "\t$Rt, $addr!", "$addr.base = $Rn_wb", []>;
+ "ldr", "\t$Rt, $addr!", "$addr.base = $Rn_wb", []>,
+ Sched<[WriteLd]>;
def t2LDR_POST : T2Ipostldst<0, 0b10, 1, 0, (outs GPR:$Rt, GPR:$Rn_wb),
(ins addr_offset_none:$Rn, t2am_imm8_offset:$offset),
AddrModeT2_i8, IndexModePost, IIC_iLoad_iu,
- "ldr", "\t$Rt, $Rn$offset", "$Rn = $Rn_wb", []>;
+ "ldr", "\t$Rt, $Rn$offset", "$Rn = $Rn_wb", []>,
+ Sched<[WriteLd]>;
def t2LDRB_PRE : T2Ipreldst<0, 0b00, 1, 1, (outs GPR:$Rt, GPR:$Rn_wb),
(ins t2addrmode_imm8_pre:$addr),
AddrModeT2_i8, IndexModePre, IIC_iLoad_bh_iu,
- "ldrb", "\t$Rt, $addr!", "$addr.base = $Rn_wb", []>;
+ "ldrb", "\t$Rt, $addr!", "$addr.base = $Rn_wb", []>,
+ Sched<[WriteLd]>;
def t2LDRB_POST : T2Ipostldst<0, 0b00, 1, 0, (outs GPR:$Rt, GPR:$Rn_wb),
(ins addr_offset_none:$Rn, t2am_imm8_offset:$offset),
@@ -1353,41 +1339,45 @@ def t2LDRB_POST : T2Ipostldst<0, 0b00, 1, 0, (outs GPR:$Rt, GPR:$Rn_wb),
def t2LDRH_PRE : T2Ipreldst<0, 0b01, 1, 1, (outs GPR:$Rt, GPR:$Rn_wb),
(ins t2addrmode_imm8_pre:$addr),
AddrModeT2_i8, IndexModePre, IIC_iLoad_bh_iu,
- "ldrh", "\t$Rt, $addr!", "$addr.base = $Rn_wb", []>;
+ "ldrh", "\t$Rt, $addr!", "$addr.base = $Rn_wb", []>,
+ Sched<[WriteLd]>;
def t2LDRH_POST : T2Ipostldst<0, 0b01, 1, 0, (outs GPR:$Rt, GPR:$Rn_wb),
(ins addr_offset_none:$Rn, t2am_imm8_offset:$offset),
AddrModeT2_i8, IndexModePost, IIC_iLoad_bh_iu,
- "ldrh", "\t$Rt, $Rn$offset", "$Rn = $Rn_wb", []>;
+ "ldrh", "\t$Rt, $Rn$offset", "$Rn = $Rn_wb", []>,
+ Sched<[WriteLd]>;
def t2LDRSB_PRE : T2Ipreldst<1, 0b00, 1, 1, (outs GPR:$Rt, GPR:$Rn_wb),
(ins t2addrmode_imm8_pre:$addr),
AddrModeT2_i8, IndexModePre, IIC_iLoad_bh_iu,
"ldrsb", "\t$Rt, $addr!", "$addr.base = $Rn_wb",
- []>;
+ []>, Sched<[WriteLd]>;
def t2LDRSB_POST : T2Ipostldst<1, 0b00, 1, 0, (outs GPR:$Rt, GPR:$Rn_wb),
(ins addr_offset_none:$Rn, t2am_imm8_offset:$offset),
AddrModeT2_i8, IndexModePost, IIC_iLoad_bh_iu,
- "ldrsb", "\t$Rt, $Rn$offset", "$Rn = $Rn_wb", []>;
+ "ldrsb", "\t$Rt, $Rn$offset", "$Rn = $Rn_wb", []>,
+ Sched<[WriteLd]>;
def t2LDRSH_PRE : T2Ipreldst<1, 0b01, 1, 1, (outs GPR:$Rt, GPR:$Rn_wb),
(ins t2addrmode_imm8_pre:$addr),
AddrModeT2_i8, IndexModePre, IIC_iLoad_bh_iu,
"ldrsh", "\t$Rt, $addr!", "$addr.base = $Rn_wb",
- []>;
+ []>, Sched<[WriteLd]>;
def t2LDRSH_POST : T2Ipostldst<1, 0b01, 1, 0, (outs GPR:$Rt, GPR:$Rn_wb),
(ins addr_offset_none:$Rn, t2am_imm8_offset:$offset),
AddrModeT2_i8, IndexModePost, IIC_iLoad_bh_iu,
- "ldrsh", "\t$Rt, $Rn$offset", "$Rn = $Rn_wb", []>;
+ "ldrsh", "\t$Rt, $Rn$offset", "$Rn = $Rn_wb", []>,
+ Sched<[WriteLd]>;
} // mayLoad = 1, hasSideEffects = 0
// LDRT, LDRBT, LDRHT, LDRSBT, LDRSHT all have offset mode (PUW=0b110).
// Ref: A8.6.57 LDR (immediate, Thumb) Encoding T4
class T2IldT<bit signed, bits<2> type, string opc, InstrItinClass ii>
: T2Ii8<(outs rGPR:$Rt), (ins t2addrmode_posimm8:$addr), ii, opc,
- "\t$Rt, $addr", []> {
+ "\t$Rt, $addr", []>, Sched<[WriteLd]> {
bits<4> Rt;
bits<13> addr;
let Inst{31-27} = 0b11111;
@@ -1431,11 +1421,14 @@ class T2Ildacq<bits<4> bits23_20, bits<2> bit54, dag oops, dag iops,
}
def t2LDA : T2Ildacq<0b1101, 0b10, (outs rGPR:$Rt),
- (ins addr_offset_none:$addr), "lda", "\t$Rt, $addr", []>;
+ (ins addr_offset_none:$addr), "lda", "\t$Rt, $addr", []>,
+ Sched<[WriteLd]>;
def t2LDAB : T2Ildacq<0b1101, 0b00, (outs rGPR:$Rt),
- (ins addr_offset_none:$addr), "ldab", "\t$Rt, $addr", []>;
+ (ins addr_offset_none:$addr), "ldab", "\t$Rt, $addr", []>,
+ Sched<[WriteLd]>;
def t2LDAH : T2Ildacq<0b1101, 0b01, (outs rGPR:$Rt),
- (ins addr_offset_none:$addr), "ldah", "\t$Rt, $addr", []>;
+ (ins addr_offset_none:$addr), "ldah", "\t$Rt, $addr", []>,
+ Sched<[WriteLd]>;
// Store
defm t2STR :T2I_st<0b10,"str", IIC_iStore_i, IIC_iStore_si, GPR, store>;
@@ -1448,7 +1441,8 @@ defm t2STRH:T2I_st<0b01,"strh", IIC_iStore_bh_i, IIC_iStore_bh_si,
let mayStore = 1, hasSideEffects = 0, hasExtraSrcRegAllocReq = 1 in
def t2STRDi8 : T2Ii8s4<1, 0, 0, (outs),
(ins rGPR:$Rt, rGPR:$Rt2, t2addrmode_imm8s4:$addr),
- IIC_iStore_d_r, "strd", "\t$Rt, $Rt2, $addr", "", []>;
+ IIC_iStore_d_r, "strd", "\t$Rt, $Rt2, $addr", "", []>,
+ Sched<[WriteST]>;
// Indexed stores
@@ -1457,19 +1451,22 @@ def t2STR_PRE : T2Ipreldst<0, 0b10, 0, 1, (outs GPRnopc:$Rn_wb),
(ins GPRnopc:$Rt, t2addrmode_imm8_pre:$addr),
AddrModeT2_i8, IndexModePre, IIC_iStore_iu,
"str", "\t$Rt, $addr!",
- "$addr.base = $Rn_wb,@earlyclobber $Rn_wb", []>;
+ "$addr.base = $Rn_wb,@earlyclobber $Rn_wb", []>,
+ Sched<[WriteST]>;
def t2STRH_PRE : T2Ipreldst<0, 0b01, 0, 1, (outs GPRnopc:$Rn_wb),
(ins rGPR:$Rt, t2addrmode_imm8_pre:$addr),
AddrModeT2_i8, IndexModePre, IIC_iStore_iu,
"strh", "\t$Rt, $addr!",
- "$addr.base = $Rn_wb,@earlyclobber $Rn_wb", []>;
+ "$addr.base = $Rn_wb,@earlyclobber $Rn_wb", []>,
+ Sched<[WriteST]>;
def t2STRB_PRE : T2Ipreldst<0, 0b00, 0, 1, (outs GPRnopc:$Rn_wb),
(ins rGPR:$Rt, t2addrmode_imm8_pre:$addr),
AddrModeT2_i8, IndexModePre, IIC_iStore_bh_iu,
"strb", "\t$Rt, $addr!",
- "$addr.base = $Rn_wb,@earlyclobber $Rn_wb", []>;
+ "$addr.base = $Rn_wb,@earlyclobber $Rn_wb", []>,
+ Sched<[WriteST]>;
} // mayStore = 1, hasSideEffects = 0
def t2STR_POST : T2Ipostldst<0, 0b10, 0, 0, (outs GPRnopc:$Rn_wb),
@@ -1480,7 +1477,8 @@ def t2STR_POST : T2Ipostldst<0, 0b10, 0, 0, (outs GPRnopc:$Rn_wb),
"$Rn = $Rn_wb,@earlyclobber $Rn_wb",
[(set GPRnopc:$Rn_wb,
(post_store GPRnopc:$Rt, addr_offset_none:$Rn,
- t2am_imm8_offset:$offset))]>;
+ t2am_imm8_offset:$offset))]>,
+ Sched<[WriteST]>;
def t2STRH_POST : T2Ipostldst<0, 0b01, 0, 0, (outs GPRnopc:$Rn_wb),
(ins rGPR:$Rt, addr_offset_none:$Rn,
@@ -1490,7 +1488,8 @@ def t2STRH_POST : T2Ipostldst<0, 0b01, 0, 0, (outs GPRnopc:$Rn_wb),
"$Rn = $Rn_wb,@earlyclobber $Rn_wb",
[(set GPRnopc:$Rn_wb,
(post_truncsti16 rGPR:$Rt, addr_offset_none:$Rn,
- t2am_imm8_offset:$offset))]>;
+ t2am_imm8_offset:$offset))]>,
+ Sched<[WriteST]>;
def t2STRB_POST : T2Ipostldst<0, 0b00, 0, 0, (outs GPRnopc:$Rn_wb),
(ins rGPR:$Rt, addr_offset_none:$Rn,
@@ -1500,7 +1499,8 @@ def t2STRB_POST : T2Ipostldst<0, 0b00, 0, 0, (outs GPRnopc:$Rn_wb),
"$Rn = $Rn_wb,@earlyclobber $Rn_wb",
[(set GPRnopc:$Rn_wb,
(post_truncsti8 rGPR:$Rt, addr_offset_none:$Rn,
- t2am_imm8_offset:$offset))]>;
+ t2am_imm8_offset:$offset))]>,
+ Sched<[WriteST]>;
// Pseudo-instructions for pattern matching the pre-indexed stores. We can't
// put the patterns on the instruction definitions directly as ISel wants
@@ -1513,17 +1513,20 @@ def t2STR_preidx: t2PseudoInst<(outs GPRnopc:$Rn_wb),
(ins rGPR:$Rt, GPRnopc:$Rn, t2am_imm8_offset:$offset, pred:$p),
4, IIC_iStore_ru,
[(set GPRnopc:$Rn_wb,
- (pre_store rGPR:$Rt, GPRnopc:$Rn, t2am_imm8_offset:$offset))]>;
+ (pre_store rGPR:$Rt, GPRnopc:$Rn, t2am_imm8_offset:$offset))]>,
+ Sched<[WriteST]>;
def t2STRB_preidx: t2PseudoInst<(outs GPRnopc:$Rn_wb),
(ins rGPR:$Rt, GPRnopc:$Rn, t2am_imm8_offset:$offset, pred:$p),
4, IIC_iStore_ru,
[(set GPRnopc:$Rn_wb,
- (pre_truncsti8 rGPR:$Rt, GPRnopc:$Rn, t2am_imm8_offset:$offset))]>;
+ (pre_truncsti8 rGPR:$Rt, GPRnopc:$Rn, t2am_imm8_offset:$offset))]>,
+ Sched<[WriteST]>;
def t2STRH_preidx: t2PseudoInst<(outs GPRnopc:$Rn_wb),
(ins rGPR:$Rt, GPRnopc:$Rn, t2am_imm8_offset:$offset, pred:$p),
4, IIC_iStore_ru,
[(set GPRnopc:$Rn_wb,
- (pre_truncsti16 rGPR:$Rt, GPRnopc:$Rn, t2am_imm8_offset:$offset))]>;
+ (pre_truncsti16 rGPR:$Rt, GPRnopc:$Rn, t2am_imm8_offset:$offset))]>,
+ Sched<[WriteST]>;
}
// STRT, STRBT, STRHT all have offset mode (PUW=0b110) and are for disassembly
@@ -1531,7 +1534,7 @@ def t2STRH_preidx: t2PseudoInst<(outs GPRnopc:$Rn_wb),
// Ref: A8.6.193 STR (immediate, Thumb) Encoding T4
class T2IstT<bits<2> type, string opc, InstrItinClass ii>
: T2Ii8<(outs rGPR:$Rt), (ins t2addrmode_imm8:$addr), ii, opc,
- "\t$Rt, $addr", []> {
+ "\t$Rt, $addr", []>, Sched<[WriteST]> {
let Inst{31-27} = 0b11111;
let Inst{26-25} = 0b00;
let Inst{24} = 0; // not signed
@@ -1557,7 +1560,8 @@ def t2STRHT : T2IstT<0b01, "strht", IIC_iStore_bh_i>;
let mayLoad = 1 in
def t2LDRD_PRE : T2Ii8s4<1, 1, 1, (outs rGPR:$Rt, rGPR:$Rt2, GPR:$wb),
(ins t2addrmode_imm8s4_pre:$addr), IIC_iLoad_d_ru,
- "ldrd", "\t$Rt, $Rt2, $addr!", "$addr.base = $wb", []> {
+ "ldrd", "\t$Rt, $Rt2, $addr!", "$addr.base = $wb", []>,
+ Sched<[WriteLd]> {
let DecoderMethod = "DecodeT2LDRDPreInstruction";
}
@@ -1565,13 +1569,13 @@ let mayLoad = 1 in
def t2LDRD_POST : T2Ii8s4post<0, 1, 1, (outs rGPR:$Rt, rGPR:$Rt2, GPR:$wb),
(ins addr_offset_none:$addr, t2am_imm8s4_offset:$imm),
IIC_iLoad_d_ru, "ldrd", "\t$Rt, $Rt2, $addr$imm",
- "$addr.base = $wb", []>;
+ "$addr.base = $wb", []>, Sched<[WriteLd]>;
let mayStore = 1 in
def t2STRD_PRE : T2Ii8s4<1, 1, 0, (outs GPR:$wb),
(ins rGPR:$Rt, rGPR:$Rt2, t2addrmode_imm8s4_pre:$addr),
IIC_iStore_d_ru, "strd", "\t$Rt, $Rt2, $addr!",
- "$addr.base = $wb", []> {
+ "$addr.base = $wb", []>, Sched<[WriteST]> {
let DecoderMethod = "DecodeT2STRDPreInstruction";
}
@@ -1580,12 +1584,13 @@ def t2STRD_POST : T2Ii8s4post<0, 1, 0, (outs GPR:$wb),
(ins rGPR:$Rt, rGPR:$Rt2, addr_offset_none:$addr,
t2am_imm8s4_offset:$imm),
IIC_iStore_d_ru, "strd", "\t$Rt, $Rt2, $addr$imm",
- "$addr.base = $wb", []>;
+ "$addr.base = $wb", []>, Sched<[WriteST]>;
class T2Istrrel<bits<2> bit54, dag oops, dag iops,
string opc, string asm, list<dag> pattern>
: Thumb2I<oops, iops, AddrModeNone, 4, NoItinerary, opc,
- asm, "", pattern>, Requires<[IsThumb, HasAcquireRelease]> {
+ asm, "", pattern>, Requires<[IsThumb, HasAcquireRelease]>,
+ Sched<[WriteST]> {
bits<4> Rt;
bits<4> addr;
@@ -1861,7 +1866,7 @@ defm t2STM : thumb2_st_mult<"stm", IIC_iStore_m, IIC_iStore_mu, 0>;
//
let hasSideEffects = 0 in
-def t2MOVr : T2sTwoReg<(outs GPRnopc:$Rd), (ins GPR:$Rm), IIC_iMOVr,
+def t2MOVr : T2sTwoReg<(outs GPRnopc:$Rd), (ins GPRnopc:$Rm), IIC_iMOVr,
"mov", ".w\t$Rd, $Rm", []>, Sched<[WriteALU]> {
let Inst{31-27} = 0b11101;
let Inst{26-25} = 0b01;
@@ -1870,11 +1875,11 @@ def t2MOVr : T2sTwoReg<(outs GPRnopc:$Rd), (ins GPR:$Rm), IIC_iMOVr,
let Inst{14-12} = 0b000;
let Inst{7-4} = 0b0000;
}
-def : t2InstAlias<"mov${p}.w $Rd, $Rm", (t2MOVr GPRnopc:$Rd, GPR:$Rm,
+def : t2InstAlias<"mov${p}.w $Rd, $Rm", (t2MOVr GPRnopc:$Rd, GPRnopc:$Rm,
pred:$p, zero_reg)>;
-def : t2InstAlias<"movs${p}.w $Rd, $Rm", (t2MOVr GPRnopc:$Rd, GPR:$Rm,
+def : t2InstAlias<"movs${p}.w $Rd, $Rm", (t2MOVr GPRnopc:$Rd, GPRnopc:$Rm,
pred:$p, CPSR)>;
-def : t2InstAlias<"movs${p} $Rd, $Rm", (t2MOVr GPRnopc:$Rd, GPR:$Rm,
+def : t2InstAlias<"movs${p} $Rd, $Rm", (t2MOVr GPRnopc:$Rd, GPRnopc:$Rm,
pred:$p, CPSR)>;
// AddedComplexity to ensure isel tries t2MOVi before t2MOVi16.
@@ -1926,10 +1931,11 @@ def t2MOVi16 : T2I<(outs rGPR:$Rd), (ins imm0_65535_expr:$imm), IIC_iMOVi,
def : InstAlias<"mov${p} $Rd, $imm",
(t2MOVi16 rGPR:$Rd, imm256_65535_expr:$imm, pred:$p), 0>,
- Requires<[IsThumb, HasV8MBaseline]>;
+ Requires<[IsThumb, HasV8MBaseline]>, Sched<[WriteALU]>;
def t2MOVi16_ga_pcrel : PseudoInst<(outs rGPR:$Rd),
- (ins i32imm:$addr, pclabel:$id), IIC_iMOVi, []>;
+ (ins i32imm:$addr, pclabel:$id), IIC_iMOVi, []>,
+ Sched<[WriteALU]>;
let Constraints = "$src = $Rd" in {
def t2MOVTi16 : T2I<(outs rGPR:$Rd),
@@ -1969,31 +1975,39 @@ def : T2Pat<(or rGPR:$src, 0xffff0000), (t2MOVTi16 rGPR:$src, 0xffff)>;
// Sign extenders
-def t2SXTB : T2I_ext_rrot<0b100, "sxtb",
- UnOpFrag<(sext_inreg node:$Src, i8)>>;
-def t2SXTH : T2I_ext_rrot<0b000, "sxth",
- UnOpFrag<(sext_inreg node:$Src, i16)>>;
-def t2SXTB16 : T2I_ext_rrot_sxtb16<0b010, "sxtb16">;
+def t2SXTB : T2I_ext_rrot<0b100, "sxtb">;
+def t2SXTH : T2I_ext_rrot<0b000, "sxth">;
+def t2SXTB16 : T2I_ext_rrot_xtb16<0b010, "sxtb16">;
+
+def t2SXTAB : T2I_exta_rrot<0b100, "sxtab">;
+def t2SXTAH : T2I_exta_rrot<0b000, "sxtah">;
+def t2SXTAB16 : T2I_exta_rrot<0b010, "sxtab16">;
+
+def : T2Pat<(sext_inreg (rotr rGPR:$Rn, rot_imm:$rot), i8),
+ (t2SXTB rGPR:$Rn, rot_imm:$rot)>;
+def : T2Pat<(sext_inreg (rotr rGPR:$Rn, rot_imm:$rot), i16),
+ (t2SXTH rGPR:$Rn, rot_imm:$rot)>;
+def : Thumb2DSPPat<(add rGPR:$Rn,
+ (sext_inreg (rotr rGPR:$Rm, rot_imm:$rot), i8)),
+ (t2SXTAB rGPR:$Rn, rGPR:$Rm, rot_imm:$rot)>;
+def : Thumb2DSPPat<(add rGPR:$Rn,
+ (sext_inreg (rotr rGPR:$Rm, rot_imm:$rot), i16)),
+ (t2SXTAH rGPR:$Rn, rGPR:$Rm, rot_imm:$rot)>;
-def t2SXTAB : T2I_exta_rrot<0b100, "sxtab",
- BinOpFrag<(add node:$LHS, (sext_inreg node:$RHS, i8))>>;
-def t2SXTAH : T2I_exta_rrot<0b000, "sxtah",
- BinOpFrag<(add node:$LHS, (sext_inreg node:$RHS,i16))>>;
-def t2SXTAB16 : T2I_exta_rrot_np<0b010, "sxtab16">;
// A simple right-shift can also be used in most cases (the exception is the
// SXTH operations with a rotate of 24: there the non-contiguous bits are
// relevant).
-def : Thumb2ExtractPat<(add rGPR:$Rn, (sext_inreg
+def : Thumb2DSPPat<(add rGPR:$Rn, (sext_inreg
(srl rGPR:$Rm, rot_imm:$rot), i8)),
(t2SXTAB rGPR:$Rn, rGPR:$Rm, rot_imm:$rot)>;
-def : Thumb2ExtractPat<(add rGPR:$Rn, (sext_inreg
+def : Thumb2DSPPat<(add rGPR:$Rn, (sext_inreg
(srl rGPR:$Rm, imm8_or_16:$rot), i16)),
(t2SXTAH rGPR:$Rn, rGPR:$Rm, rot_imm:$rot)>;
-def : Thumb2ExtractPat<(add rGPR:$Rn, (sext_inreg
+def : Thumb2DSPPat<(add rGPR:$Rn, (sext_inreg
(rotr rGPR:$Rm, (i32 24)), i16)),
(t2SXTAH rGPR:$Rn, rGPR:$Rm, (i32 3))>;
-def : Thumb2ExtractPat<(add rGPR:$Rn, (sext_inreg
+def : Thumb2DSPPat<(add rGPR:$Rn, (sext_inreg
(or (srl rGPR:$Rm, (i32 24)),
(shl rGPR:$Rm, (i32 8))), i16)),
(t2SXTAH rGPR:$Rn, rGPR:$Rm, (i32 3))>;
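The distinction drawn in the comment above is easiest to see with a small standalone sketch (illustrative only; ror32/sext16 are invented helper names, and this code is not part of the patch):

  #include <cstdint>
  #include <cstdio>

  // Illustrative sketch only, not patch code.
  static uint32_t ror32(uint32_t x, unsigned n) { // n in 1..31
    return (x >> n) | (x << (32 - n));
  }
  static int32_t sext16(uint32_t x) { return (int32_t)(int16_t)x; }

  int main() {
    uint32_t x = 0x12345678;
    // rot = 8 or 16: the low halfword of (x ror rot) equals that of
    // (x srl rot), so either shifter feeds SXTAH the same value.
    printf("%x %x\n", (unsigned)sext16(ror32(x, 16)), (unsigned)sext16(x >> 16));
    // rot = 24: the low halfword of (x ror 24) is x[7:0]:x[31:24], which is
    // non-contiguous in x, while (x srl 24) zero-fills bits 15:8.
    printf("%x %x\n", (unsigned)sext16(ror32(x, 24)), (unsigned)sext16(x >> 24));
  }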
@@ -2001,12 +2015,16 @@ def : Thumb2ExtractPat<(add rGPR:$Rn, (sext_inreg
// Zero extenders
let AddedComplexity = 16 in {
-def t2UXTB : T2I_ext_rrot<0b101, "uxtb",
- UnOpFrag<(and node:$Src, 0x000000FF)>>;
-def t2UXTH : T2I_ext_rrot<0b001, "uxth",
- UnOpFrag<(and node:$Src, 0x0000FFFF)>>;
-def t2UXTB16 : T2I_ext_rrot_uxtb16<0b011, "uxtb16",
- UnOpFrag<(and node:$Src, 0x00FF00FF)>>;
+def t2UXTB : T2I_ext_rrot<0b101, "uxtb">;
+def t2UXTH : T2I_ext_rrot<0b001, "uxth">;
+def t2UXTB16 : T2I_ext_rrot_xtb16<0b011, "uxtb16">;
+
+def : Thumb2DSPPat<(and (rotr rGPR:$Rm, rot_imm:$rot), 0x000000FF),
+ (t2UXTB rGPR:$Rm, rot_imm:$rot)>;
+def : Thumb2DSPPat<(and (rotr rGPR:$Rm, rot_imm:$rot), 0x0000FFFF),
+ (t2UXTH rGPR:$Rm, rot_imm:$rot)>;
+def : Thumb2DSPPat<(and (rotr rGPR:$Rm, rot_imm:$rot), 0x00FF00FF),
+ (t2UXTB16 rGPR:$Rm, rot_imm:$rot)>;
// FIXME: This pattern incorrectly assumes the shl operator is a rotate.
// The transformation should probably be done as a combiner action
@@ -2014,21 +2032,25 @@ def t2UXTB16 : T2I_ext_rrot_uxtb16<0b011, "uxtb16",
// eight bits of the source into the lower eight bits of the result.
//def : T2Pat<(and (shl rGPR:$Src, (i32 8)), 0xFF00FF),
// (t2UXTB16 rGPR:$Src, 3)>,
-// Requires<[HasT2ExtractPack, IsThumb2]>;
+// Requires<[HasDSP, IsThumb2]>;
def : T2Pat<(and (srl rGPR:$Src, (i32 8)), 0xFF00FF),
(t2UXTB16 rGPR:$Src, 1)>,
- Requires<[HasT2ExtractPack, IsThumb2]>;
+ Requires<[HasDSP, IsThumb2]>;
-def t2UXTAB : T2I_exta_rrot<0b101, "uxtab",
- BinOpFrag<(add node:$LHS, (and node:$RHS, 0x00FF))>>;
-def t2UXTAH : T2I_exta_rrot<0b001, "uxtah",
- BinOpFrag<(add node:$LHS, (and node:$RHS, 0xFFFF))>>;
-def t2UXTAB16 : T2I_exta_rrot_np<0b011, "uxtab16">;
+def t2UXTAB : T2I_exta_rrot<0b101, "uxtab">;
+def t2UXTAH : T2I_exta_rrot<0b001, "uxtah">;
+def t2UXTAB16 : T2I_exta_rrot<0b011, "uxtab16">;
-def : Thumb2ExtractPat<(add rGPR:$Rn, (and (srl rGPR:$Rm, rot_imm:$rot),
+def : Thumb2DSPPat<(add rGPR:$Rn, (and (rotr rGPR:$Rm, rot_imm:$rot),
+ 0x00FF)),
+ (t2UXTAB rGPR:$Rn, rGPR:$Rm, rot_imm:$rot)>;
+def : Thumb2DSPPat<(add rGPR:$Rn, (and (rotr rGPR:$Rm, rot_imm:$rot),
+ 0xFFFF)),
+ (t2UXTAH rGPR:$Rn, rGPR:$Rm, rot_imm:$rot)>;
+def : Thumb2DSPPat<(add rGPR:$Rn, (and (srl rGPR:$Rm, rot_imm:$rot),
0xFF)),
(t2UXTAB rGPR:$Rn, rGPR:$Rm, rot_imm:$rot)>;
-def : Thumb2ExtractPat<(add rGPR:$Rn, (and (srl rGPR:$Rm, imm8_or_16:$rot),
+def : Thumb2DSPPat<(add rGPR:$Rn, (and (srl rGPR:$Rm, imm8_or_16:$rot),
0xFFFF)),
(t2UXTAH rGPR:$Rn, rGPR:$Rm, rot_imm:$rot)>;
}
@@ -2060,6 +2082,19 @@ defm t2ADC : T2I_adde_sube_irs<0b1010, "adc", ARMadde, 1>;
defm t2SBC : T2I_adde_sube_irs<0b1011, "sbc", ARMsube>;
}
+def : t2InstSubst<"adc${s}${p} $rd, $rn, $imm",
+ (t2SBCri rGPR:$rd, rGPR:$rn, t2_so_imm_not:$imm, pred:$p, s_cc_out:$s)>;
+def : t2InstSubst<"sbc${s}${p} $rd, $rn, $imm",
+ (t2ADCri rGPR:$rd, rGPR:$rn, t2_so_imm_not:$imm, pred:$p, s_cc_out:$s)>;
+
+def : t2InstSubst<"add${s}${p}.w $rd, $rn, $imm",
+ (t2SUBri GPRnopc:$rd, GPRnopc:$rn, t2_so_imm_neg:$imm, pred:$p, s_cc_out:$s)>;
+def : t2InstSubst<"addw${p} $rd, $rn, $imm",
+ (t2SUBri12 GPRnopc:$rd, GPR:$rn, t2_so_imm_neg:$imm, pred:$p)>;
+def : t2InstSubst<"sub${s}${p}.w $rd, $rn, $imm",
+ (t2ADDri GPRnopc:$rd, GPRnopc:$rn, t2_so_imm_neg:$imm, pred:$p, s_cc_out:$s)>;
+def : t2InstSubst<"subw${p} $rd, $rn, $imm",
+ (t2ADDri12 GPRnopc:$rd, GPR:$rn, t2_so_imm_neg:$imm, pred:$p)>;
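These t2InstSubst records are assembler rewrites rather than printable aliases: they accept an immediate whose complement or negation is encodable. Illustratively (these examples are not from the patch), "adc r0, r1, #0xfffffffe" should assemble as "sbc r0, r1, #1", and "add.w r0, r1, #-4" as "sub.w r0, r1, #4".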
// RSB
defm t2RSB : T2I_rbin_irs <0b1110, "rsb", sub>;
@@ -2230,70 +2265,52 @@ def t2USADA8 : T2FourReg_mac<0, 0b111, 0b0000, (outs rGPR:$Rd),
Requires<[IsThumb2, HasDSP]>;
// Signed/Unsigned saturate.
-class T2SatI<dag oops, dag iops, InstrItinClass itin,
- string opc, string asm, list<dag> pattern>
- : T2I<oops, iops, itin, opc, asm, pattern> {
+class T2SatI<dag iops, string opc, string asm>
+ : T2I<(outs rGPR:$Rd), iops, NoItinerary, opc, asm, []> {
bits<4> Rd;
bits<4> Rn;
bits<5> sat_imm;
- bits<7> sh;
+ bits<6> sh;
- let Inst{11-8} = Rd;
+ let Inst{31-24} = 0b11110011;
+ let Inst{21} = sh{5};
+ let Inst{20} = 0;
let Inst{19-16} = Rn;
- let Inst{4-0} = sat_imm;
- let Inst{21} = sh{5};
+ let Inst{15} = 0;
let Inst{14-12} = sh{4-2};
- let Inst{7-6} = sh{1-0};
+ let Inst{11-8} = Rd;
+ let Inst{7-6} = sh{1-0};
+ let Inst{5} = 0;
+ let Inst{4-0} = sat_imm;
}
-def t2SSAT: T2SatI<
- (outs rGPR:$Rd),
- (ins imm1_32:$sat_imm, rGPR:$Rn, t2_shift_imm:$sh),
- NoItinerary, "ssat", "\t$Rd, $sat_imm, $Rn$sh", []>,
- Requires<[IsThumb2]> {
- let Inst{31-27} = 0b11110;
- let Inst{25-22} = 0b1100;
- let Inst{20} = 0;
- let Inst{15} = 0;
+def t2SSAT: T2SatI<(ins imm1_32:$sat_imm, rGPR:$Rn, t2_shift_imm:$sh),
+ "ssat", "\t$Rd, $sat_imm, $Rn$sh">,
+ Requires<[IsThumb2]> {
+ let Inst{23-22} = 0b00;
let Inst{5} = 0;
}
-def t2SSAT16: T2SatI<
- (outs rGPR:$Rd), (ins imm1_16:$sat_imm, rGPR:$Rn), NoItinerary,
- "ssat16", "\t$Rd, $sat_imm, $Rn", []>,
- Requires<[IsThumb2, HasDSP]> {
- let Inst{31-27} = 0b11110;
- let Inst{25-22} = 0b1100;
- let Inst{20} = 0;
- let Inst{15} = 0;
- let Inst{21} = 1; // sh = '1'
- let Inst{14-12} = 0b000; // imm3 = '000'
- let Inst{7-6} = 0b00; // imm2 = '00'
- let Inst{5-4} = 0b00;
+def t2SSAT16: T2SatI<(ins imm1_16:$sat_imm, rGPR:$Rn),
+ "ssat16", "\t$Rd, $sat_imm, $Rn">,
+ Requires<[IsThumb2, HasDSP]> {
+ let Inst{23-22} = 0b00;
+ let sh = 0b100000;
+ let Inst{4} = 0;
}
-def t2USAT: T2SatI<
- (outs rGPR:$Rd),
- (ins imm0_31:$sat_imm, rGPR:$Rn, t2_shift_imm:$sh),
- NoItinerary, "usat", "\t$Rd, $sat_imm, $Rn$sh", []>,
- Requires<[IsThumb2]> {
- let Inst{31-27} = 0b11110;
- let Inst{25-22} = 0b1110;
- let Inst{20} = 0;
- let Inst{15} = 0;
+def t2USAT: T2SatI<(ins imm0_31:$sat_imm, rGPR:$Rn, t2_shift_imm:$sh),
+ "usat", "\t$Rd, $sat_imm, $Rn$sh">,
+ Requires<[IsThumb2]> {
+ let Inst{23-22} = 0b10;
}
-def t2USAT16: T2SatI<(outs rGPR:$Rd), (ins imm0_15:$sat_imm, rGPR:$Rn),
- NoItinerary,
- "usat16", "\t$Rd, $sat_imm, $Rn", []>,
+def t2USAT16: T2SatI<(ins imm0_15:$sat_imm, rGPR:$Rn),
+ "usat16", "\t$Rd, $sat_imm, $Rn">,
Requires<[IsThumb2, HasDSP]> {
- let Inst{31-22} = 0b1111001110;
- let Inst{20} = 0;
- let Inst{15} = 0;
- let Inst{21} = 1; // sh = '1'
- let Inst{14-12} = 0b000; // imm3 = '000'
- let Inst{7-6} = 0b00; // imm2 = '00'
- let Inst{5-4} = 0b00;
+ let Inst{23-22} = 0b10;
+ let sh = 0b100000;
+ let Inst{4} = 0;
}
def : T2Pat<(int_arm_ssat GPR:$a, imm1_32:$pos), (t2SSAT imm1_32:$pos, GPR:$a, 0)>;
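For reference, a sketch of the saturation semantics these instructions implement under the usual architectural definition (illustrative C++ only; ssat/usat are made-up helper names, not patch code):

  #include <algorithm>
  #include <cstdint>

  // Illustrative only: clamp to a signed bits-wide range (SSAT, bits in
  // 1..32) or an unsigned one (USAT, bits in 0..31).
  int32_t ssat(int64_t x, unsigned bits) {
    int64_t hi = (INT64_C(1) << (bits - 1)) - 1;
    return (int32_t)std::min(std::max(x, -hi - 1), hi);
  }
  uint32_t usat(int64_t x, unsigned bits) {
    int64_t hi = (INT64_C(1) << bits) - 1;
    return (uint32_t)std::min(std::max(x, INT64_C(0)), hi);
  }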
@@ -2305,11 +2322,18 @@ def : T2Pat<(ARMssatnoshift GPRnopc:$Rn, imm0_31:$imm),
// Shift and rotate Instructions.
//
-defm t2LSL : T2I_sh_ir<0b00, "lsl", imm0_31, shl>;
+defm t2LSL : T2I_sh_ir<0b00, "lsl", imm1_31, shl>;
defm t2LSR : T2I_sh_ir<0b01, "lsr", imm_sr, srl>;
defm t2ASR : T2I_sh_ir<0b10, "asr", imm_sr, sra>;
defm t2ROR : T2I_sh_ir<0b11, "ror", imm0_31, rotr>;
+// LSL #0 is actually MOV, and permits a slightly different set of registers
+// than LSL with a non-zero shift.
+def : t2InstAlias<"lsl${s}${p} $Rd, $Rm, #0",
+ (t2MOVr GPRnopc:$Rd, GPRnopc:$Rm, pred:$p, cc_out:$s)>;
+def : t2InstAlias<"lsl${s}${p}.w $Rd, $Rm, #0",
+ (t2MOVr GPRnopc:$Rd, GPRnopc:$Rm, pred:$p, cc_out:$s)>;
+
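Concretely (illustrative): "lsls r0, r1, #0" is accepted and encoded as "movs r0, r1"; routing the #0 case through t2MOVr is what permits SP, which the rGPR class used by the real shift encodings excludes.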
// (rotr x, (and y, 0x...1f)) ==> (ROR x, y)
def : T2Pat<(rotr rGPR:$lhs, (and rGPR:$rhs, lo5AllOne)),
(t2RORrr rGPR:$lhs, rGPR:$rhs)>;
@@ -2547,7 +2571,8 @@ def : T2Pat<(t2_so_imm_not:$src),
let isCommutable = 1 in
def t2MUL: T2ThreeReg<(outs rGPR:$Rd), (ins rGPR:$Rn, rGPR:$Rm), IIC_iMUL32,
"mul", "\t$Rd, $Rn, $Rm",
- [(set rGPR:$Rd, (mul rGPR:$Rn, rGPR:$Rm))]> {
+ [(set rGPR:$Rd, (mul rGPR:$Rn, rGPR:$Rm))]>,
+ Sched<[WriteMUL32, ReadMUL, ReadMUL]> {
let Inst{31-27} = 0b11111;
let Inst{26-23} = 0b0110;
let Inst{22-20} = 0b000;
@@ -2558,7 +2583,8 @@ def t2MUL: T2ThreeReg<(outs rGPR:$Rd), (ins rGPR:$Rn, rGPR:$Rm), IIC_iMUL32,
class T2FourRegMLA<bits<4> op7_4, string opc, list<dag> pattern>
: T2FourReg<(outs rGPR:$Rd), (ins rGPR:$Rn, rGPR:$Rm, rGPR:$Ra), IIC_iMAC32,
opc, "\t$Rd, $Rn, $Rm, $Ra", pattern>,
- Requires<[IsThumb2, UseMulOps]> {
+ Requires<[IsThumb2, UseMulOps]>,
+ Sched<[WriteMAC32, ReadMUL, ReadMUL, ReadMAC]> {
let Inst{31-27} = 0b11111;
let Inst{26-23} = 0b0110;
let Inst{22-20} = 0b000;
@@ -2575,8 +2601,12 @@ def t2MLS: T2FourRegMLA<0b0001, "mls",
// Extra precision multiplies with low / high results
let hasSideEffects = 0 in {
let isCommutable = 1 in {
-def t2SMULL : T2MulLong<0b000, 0b0000, "smull", []>;
-def t2UMULL : T2MulLong<0b010, 0b0000, "umull", []>;
+def t2SMULL : T2MulLong<0b000, 0b0000, "smull",
+ [(set rGPR:$RdLo, rGPR:$RdHi,
+ (smullohi rGPR:$Rn, rGPR:$Rm))]>;
+def t2UMULL : T2MulLong<0b010, 0b0000, "umull",
+ [(set rGPR:$RdLo, rGPR:$RdHi,
+ (umullohi rGPR:$Rn, rGPR:$Rm))]>;
} // isCommutable
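The new patterns use a multi-result set with the smullohi/umullohi nodes; what they compute is simply a full 64-bit product split across RdLo/RdHi, roughly (illustrative C++ sketch, not patch code):

  #include <cstdint>

  // Illustrative only: SMULL/UMULL write the low and high halves of the
  // 64-bit product into RdLo and RdHi.
  void smull(int32_t n, int32_t m, uint32_t &rdlo, uint32_t &rdhi) {
    uint64_t p = (uint64_t)((int64_t)n * (int64_t)m);
    rdlo = (uint32_t)p;
    rdhi = (uint32_t)(p >> 32);
  }
  void umull(uint32_t n, uint32_t m, uint32_t &rdlo, uint32_t &rdhi) {
    uint64_t p = (uint64_t)n * m;
    rdlo = (uint32_t)p;
    rdhi = (uint32_t)(p >> 32);
  }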
// Multiply + accumulate
@@ -2592,7 +2622,8 @@ class T2SMMUL<bits<4> op7_4, string opc, list<dag> pattern>
: T2ThreeReg<(outs rGPR:$Rd),
(ins rGPR:$Rn, rGPR:$Rm), IIC_iMUL32,
opc, "\t$Rd, $Rn, $Rm", pattern>,
- Requires<[IsThumb2, HasDSP]> {
+ Requires<[IsThumb2, HasDSP]>,
+ Sched<[WriteMUL32, ReadMUL, ReadMUL]> {
let Inst{31-27} = 0b11111;
let Inst{26-23} = 0b0110;
let Inst{22-20} = 0b101;
@@ -2607,7 +2638,8 @@ class T2FourRegSMMLA<bits<3> op22_20, bits<4> op7_4, string opc,
list<dag> pattern>
: T2FourReg<(outs rGPR:$Rd), (ins rGPR:$Rn, rGPR:$Rm, rGPR:$Ra), IIC_iMAC32,
opc, "\t$Rd, $Rn, $Rm, $Ra", pattern>,
- Requires<[IsThumb2, HasDSP, UseMulOps]> {
+ Requires<[IsThumb2, HasDSP, UseMulOps]>,
+ Sched<[WriteMAC32, ReadMUL, ReadMUL, ReadMAC]> {
let Inst{31-27} = 0b11111;
let Inst{26-23} = 0b0110;
let Inst{22-20} = op22_20;
@@ -2624,7 +2656,8 @@ class T2ThreeRegSMUL<bits<3> op22_20, bits<2> op5_4, string opc,
list<dag> pattern>
: T2ThreeReg<(outs rGPR:$Rd), (ins rGPR:$Rn, rGPR:$Rm), IIC_iMUL16, opc,
"\t$Rd, $Rn, $Rm", pattern>,
- Requires<[IsThumb2, HasDSP]> {
+ Requires<[IsThumb2, HasDSP]>,
+ Sched<[WriteMUL16, ReadMUL, ReadMUL]> {
let Inst{31-27} = 0b11111;
let Inst{26-23} = 0b0110;
let Inst{22-20} = op22_20;
@@ -2645,8 +2678,10 @@ def t2SMULTB : T2ThreeRegSMUL<0b001, 0b10, "smultb",
def t2SMULTT : T2ThreeRegSMUL<0b001, 0b11, "smultt",
[(set rGPR:$Rd, (mul (sra rGPR:$Rn, (i32 16)),
(sra rGPR:$Rm, (i32 16))))]>;
-def t2SMULWB : T2ThreeRegSMUL<0b011, 0b00, "smulwb", []>;
-def t2SMULWT : T2ThreeRegSMUL<0b011, 0b01, "smulwt", []>;
+def t2SMULWB : T2ThreeRegSMUL<0b011, 0b00, "smulwb",
+ [(set rGPR:$Rd, (ARMsmulwb rGPR:$Rn, rGPR:$Rm))]>;
+def t2SMULWT : T2ThreeRegSMUL<0b011, 0b01, "smulwt",
+ [(set rGPR:$Rd, (ARMsmulwt rGPR:$Rn, rGPR:$Rm))]>;
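ARMsmulwb/ARMsmulwt give these word-by-halfword multiplies a selectable node; architecturally they keep bits [47:16] of the 48-bit product, roughly (illustrative sketch with invented helper names):

  #include <cstdint>

  // Illustrative only: multiply Rn by the sign-extended bottom (WB) or top
  // (WT) halfword of Rm and keep the top 32 bits of the 48-bit product.
  int32_t smulwb(int32_t rn, uint32_t rm) {
    return (int32_t)(((int64_t)rn * (int16_t)(rm & 0xFFFF)) >> 16);
  }
  int32_t smulwt(int32_t rn, uint32_t rm) {
    return (int32_t)(((int64_t)rn * (int16_t)(rm >> 16)) >> 16);
  }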
def : Thumb2DSPPat<(mul sext_16_node:$Rm, sext_16_node:$Rn),
(t2SMULBB rGPR:$Rm, rGPR:$Rn)>;
@@ -2659,7 +2694,8 @@ class T2FourRegSMLA<bits<3> op22_20, bits<2> op5_4, string opc,
list<dag> pattern>
: T2FourReg<(outs rGPR:$Rd), (ins rGPR:$Rn, rGPR:$Rm, rGPR:$Ra), IIC_iMUL16,
opc, "\t$Rd, $Rn, $Rm, $Ra", pattern>,
- Requires<[IsThumb2, HasDSP, UseMulOps]> {
+ Requires<[IsThumb2, HasDSP, UseMulOps]>,
+ Sched<[WriteMAC16, ReadMUL, ReadMUL, ReadMAC]> {
let Inst{31-27} = 0b11111;
let Inst{26-23} = 0b0110;
let Inst{22-20} = op22_20;
@@ -2680,8 +2716,10 @@ def t2SMLATB : T2FourRegSMLA<0b001, 0b10, "smlatb",
def t2SMLATT : T2FourRegSMLA<0b001, 0b11, "smlatt",
[(set rGPR:$Rd, (add rGPR:$Ra, (mul (sra rGPR:$Rn, (i32 16)),
(sra rGPR:$Rm, (i32 16)))))]>;
-def t2SMLAWB : T2FourRegSMLA<0b011, 0b00, "smlawb", []>;
-def t2SMLAWT : T2FourRegSMLA<0b011, 0b01, "smlawt", []>;
+def t2SMLAWB : T2FourRegSMLA<0b011, 0b00, "smlawb",
+ [(set rGPR:$Rd, (add rGPR:$Ra, (ARMsmulwb rGPR:$Rn, rGPR:$Rm)))]>;
+def t2SMLAWT : T2FourRegSMLA<0b011, 0b01, "smlawt",
+ [(set rGPR:$Rd, (add rGPR:$Ra, (ARMsmulwt rGPR:$Rn, rGPR:$Rm)))]>;
def : Thumb2DSPMulPat<(add rGPR:$Ra, (mul sext_16_node:$Rn, sext_16_node:$Rm)),
(t2SMLABB rGPR:$Rn, rGPR:$Rm, rGPR:$Ra)>;
@@ -2692,25 +2730,32 @@ def : Thumb2DSPMulPat<(add rGPR:$Ra,
(mul (sra rGPR:$Rn, (i32 16)), sext_16_node:$Rm)),
(t2SMLATB rGPR:$Rn, rGPR:$Rm, rGPR:$Ra)>;
-class T2SMLAL<bits<3> op22_20, bits<4> op7_4, string opc, list<dag> pattern>
- : T2FourReg_mac<1, op22_20, op7_4,
- (outs rGPR:$Ra, rGPR:$Rd),
- (ins rGPR:$Rn, rGPR:$Rm),
- IIC_iMAC64, opc, "\t$Ra, $Rd, $Rn, $Rm", []>,
- Requires<[IsThumb2, HasDSP]>;
-
// Halfword multiple accumulate long: SMLAL<x><y>
-def t2SMLALBB : T2SMLAL<0b100, 0b1000, "smlalbb", []>;
-def t2SMLALBT : T2SMLAL<0b100, 0b1001, "smlalbt", []>;
-def t2SMLALTB : T2SMLAL<0b100, 0b1010, "smlaltb", []>;
-def t2SMLALTT : T2SMLAL<0b100, 0b1011, "smlaltt", []>;
+def t2SMLALBB : T2MlaLong<0b100, 0b1000, "smlalbb">,
+ Requires<[IsThumb2, HasDSP]>;
+def t2SMLALBT : T2MlaLong<0b100, 0b1001, "smlalbt">,
+ Requires<[IsThumb2, HasDSP]>;
+def t2SMLALTB : T2MlaLong<0b100, 0b1010, "smlaltb">,
+ Requires<[IsThumb2, HasDSP]>;
+def t2SMLALTT : T2MlaLong<0b100, 0b1011, "smlaltt">,
+ Requires<[IsThumb2, HasDSP]>;
+
+def : Thumb2DSPPat<(ARMsmlalbb GPR:$Rn, GPR:$Rm, GPR:$RLo, GPR:$RHi),
+ (t2SMLALBB $Rn, $Rm, $RLo, $RHi)>;
+def : Thumb2DSPPat<(ARMsmlalbt GPR:$Rn, GPR:$Rm, GPR:$RLo, GPR:$RHi),
+ (t2SMLALBT $Rn, $Rm, $RLo, $RHi)>;
+def : Thumb2DSPPat<(ARMsmlaltb GPR:$Rn, GPR:$Rm, GPR:$RLo, GPR:$RHi),
+ (t2SMLALTB $Rn, $Rm, $RLo, $RHi)>;
+def : Thumb2DSPPat<(ARMsmlaltt GPR:$Rn, GPR:$Rm, GPR:$RLo, GPR:$RHi),
+ (t2SMLALTT $Rn, $Rm, $RLo, $RHi)>;
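The ARMsmlalbb family (B/T pick the bottom/top halfword of each operand) accumulates a 16x16 product into the 64-bit value held in RLo:RHi, roughly (illustrative sketch, not patch code):

  #include <cstdint>

  // Illustrative only: SMLALBB adds the product of the two bottom
  // halfwords into the 64-bit accumulator in RdLo:RdHi.
  void smlalbb(uint32_t rn, uint32_t rm, uint32_t &rlo, uint32_t &rhi) {
    int64_t acc = (int64_t)(((uint64_t)rhi << 32) | rlo);
    acc += (int64_t)(int16_t)rn * (int16_t)rm;
    rlo = (uint32_t)acc;
    rhi = (uint32_t)((uint64_t)acc >> 32);
  }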
class T2DualHalfMul<bits<3> op22_20, bits<4> op7_4, string opc>
: T2ThreeReg_mac<0, op22_20, op7_4,
(outs rGPR:$Rd),
(ins rGPR:$Rn, rGPR:$Rm),
IIC_iMAC32, opc, "\t$Rd, $Rn, $Rm", []>,
- Requires<[IsThumb2, HasDSP]> {
+ Requires<[IsThumb2, HasDSP]>,
+ Sched<[WriteMAC32, ReadMUL, ReadMUL, ReadMAC]> {
let Inst{15-12} = 0b1111;
}
@@ -2737,7 +2782,8 @@ class T2DualHalfMulAddLong<bits<3> op22_20, bits<4> op7_4, string opc>
(outs rGPR:$Ra, rGPR:$Rd),
(ins rGPR:$Rn, rGPR:$Rm),
IIC_iMAC64, opc, "\t$Ra, $Rd, $Rn, $Rm", []>,
- Requires<[IsThumb2, HasDSP]>;
+ Requires<[IsThumb2, HasDSP]>,
+ Sched<[WriteMAC64Lo, WriteMAC64Hi, ReadMUL, ReadMUL, ReadMAC, ReadMAC]>;
def t2SMLALD : T2DualHalfMulAddLong<0b100, 0b1100, "smlald">;
def t2SMLALDX : T2DualHalfMulAddLong<0b100, 0b1101, "smlaldx">;
@@ -2751,7 +2797,8 @@ def t2SMLSLDX : T2DualHalfMulAddLong<0b101, 0b1101, "smlsldx">;
def t2SDIV : T2ThreeReg<(outs rGPR:$Rd), (ins rGPR:$Rn, rGPR:$Rm), IIC_iDIV,
"sdiv", "\t$Rd, $Rn, $Rm",
[(set rGPR:$Rd, (sdiv rGPR:$Rn, rGPR:$Rm))]>,
- Requires<[HasDivide, IsThumb, HasV8MBaseline]> {
+ Requires<[HasDivide, IsThumb, HasV8MBaseline]>,
+ Sched<[WriteDIV]> {
let Inst{31-27} = 0b11111;
let Inst{26-21} = 0b011100;
let Inst{20} = 0b1;
@@ -2762,7 +2809,8 @@ def t2SDIV : T2ThreeReg<(outs rGPR:$Rd), (ins rGPR:$Rn, rGPR:$Rm), IIC_iDIV,
def t2UDIV : T2ThreeReg<(outs rGPR:$Rd), (ins rGPR:$Rn, rGPR:$Rm), IIC_iDIV,
"udiv", "\t$Rd, $Rn, $Rm",
[(set rGPR:$Rd, (udiv rGPR:$Rn, rGPR:$Rm))]>,
- Requires<[HasDivide, IsThumb, HasV8MBaseline]> {
+ Requires<[HasDivide, IsThumb, HasV8MBaseline]>,
+ Sched<[WriteDIV]> {
let Inst{31-27} = 0b11111;
let Inst{26-21} = 0b011101;
let Inst{20} = 0b1;
@@ -2819,7 +2867,7 @@ def t2PKHBT : T2ThreeReg<
[(set rGPR:$Rd, (or (and rGPR:$Rn, 0xFFFF),
(and (shl rGPR:$Rm, pkh_lsl_amt:$sh),
0xFFFF0000)))]>,
- Requires<[HasT2ExtractPack, IsThumb2]>,
+ Requires<[HasDSP, IsThumb2]>,
Sched<[WriteALUsi, ReadALU]> {
let Inst{31-27} = 0b11101;
let Inst{26-25} = 0b01;
@@ -2835,10 +2883,10 @@ def t2PKHBT : T2ThreeReg<
// Alternate cases for PKHBT where identities eliminate some nodes.
def : T2Pat<(or (and rGPR:$src1, 0xFFFF), (and rGPR:$src2, 0xFFFF0000)),
(t2PKHBT rGPR:$src1, rGPR:$src2, 0)>,
- Requires<[HasT2ExtractPack, IsThumb2]>;
+ Requires<[HasDSP, IsThumb2]>;
def : T2Pat<(or (and rGPR:$src1, 0xFFFF), (shl rGPR:$src2, imm16_31:$sh)),
(t2PKHBT rGPR:$src1, rGPR:$src2, imm16_31:$sh)>,
- Requires<[HasT2ExtractPack, IsThumb2]>;
+ Requires<[HasDSP, IsThumb2]>;
// Note: Shifts of 1-15 bits will be transformed to srl instead of sra and
// will match the pattern below.
@@ -2848,7 +2896,7 @@ def t2PKHTB : T2ThreeReg<
[(set rGPR:$Rd, (or (and rGPR:$Rn, 0xFFFF0000),
(and (sra rGPR:$Rm, pkh_asr_amt:$sh),
0xFFFF)))]>,
- Requires<[HasT2ExtractPack, IsThumb2]>,
+ Requires<[HasDSP, IsThumb2]>,
Sched<[WriteALUsi, ReadALU]> {
let Inst{31-27} = 0b11101;
let Inst{26-25} = 0b01;
@@ -2867,14 +2915,14 @@ def t2PKHTB : T2ThreeReg<
// pkhtb src1, src2, asr (17..31).
def : T2Pat<(or (and rGPR:$src1, 0xFFFF0000), (srl rGPR:$src2, imm16:$sh)),
(t2PKHTB rGPR:$src1, rGPR:$src2, imm16:$sh)>,
- Requires<[HasT2ExtractPack, IsThumb2]>;
+ Requires<[HasDSP, IsThumb2]>;
def : T2Pat<(or (and rGPR:$src1, 0xFFFF0000), (sra rGPR:$src2, imm16_31:$sh)),
(t2PKHTB rGPR:$src1, rGPR:$src2, imm16_31:$sh)>,
- Requires<[HasT2ExtractPack, IsThumb2]>;
+ Requires<[HasDSP, IsThumb2]>;
def : T2Pat<(or (and rGPR:$src1, 0xFFFF0000),
(and (srl rGPR:$src2, imm1_15:$sh), 0xFFFF)),
(t2PKHTB rGPR:$src1, rGPR:$src2, imm1_15:$sh)>,
- Requires<[HasT2ExtractPack, IsThumb2]>;
+ Requires<[HasDSP, IsThumb2]>;
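All of the PKHTB patterns above describe the same packing operation; as a standalone sketch (illustrative only; sh is kept in 1..31 here to sidestep the architectural asr #32 edge case):

  #include <cstdint>

  // Illustrative only: keep the top halfword of Rn and pack in the bottom
  // halfword of Rm shifted right arithmetically by sh (1..31 here).
  uint32_t pkhtb(uint32_t rn, uint32_t rm, unsigned sh) {
    uint32_t bottom = (uint32_t)((int32_t)rm >> sh) & 0xFFFFu;
    return (rn & 0xFFFF0000u) | bottom;
  }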
//===----------------------------------------------------------------------===//
// CRC32 Instructions
@@ -4216,13 +4264,13 @@ def : T2Pat<(and rGPR:$Rm, 0x000000FF), (t2UXTB rGPR:$Rm, 0)>,
def : T2Pat<(and rGPR:$Rm, 0x0000FFFF), (t2UXTH rGPR:$Rm, 0)>,
Requires<[IsThumb2]>;
def : T2Pat<(and rGPR:$Rm, 0x00FF00FF), (t2UXTB16 rGPR:$Rm, 0)>,
- Requires<[HasT2ExtractPack, IsThumb2]>;
+ Requires<[HasDSP, IsThumb2]>;
def : T2Pat<(add rGPR:$Rn, (and rGPR:$Rm, 0x00FF)),
(t2UXTAB rGPR:$Rn, rGPR:$Rm, 0)>,
- Requires<[HasT2ExtractPack, IsThumb2]>;
+ Requires<[HasDSP, IsThumb2]>;
def : T2Pat<(add rGPR:$Rn, (and rGPR:$Rm, 0xFFFF)),
(t2UXTAH rGPR:$Rn, rGPR:$Rm, 0)>,
- Requires<[HasT2ExtractPack, IsThumb2]>;
+ Requires<[HasDSP, IsThumb2]>;
}
def : T2Pat<(sext_inreg rGPR:$Src, i8), (t2SXTB rGPR:$Src, 0)>,
@@ -4231,10 +4279,10 @@ def : T2Pat<(sext_inreg rGPR:$Src, i16), (t2SXTH rGPR:$Src, 0)>,
Requires<[IsThumb2]>;
def : T2Pat<(add rGPR:$Rn, (sext_inreg rGPR:$Rm, i8)),
(t2SXTAB rGPR:$Rn, rGPR:$Rm, 0)>,
- Requires<[HasT2ExtractPack, IsThumb2]>;
+ Requires<[HasDSP, IsThumb2]>;
def : T2Pat<(add rGPR:$Rn, (sext_inreg rGPR:$Rm, i16)),
(t2SXTAH rGPR:$Rn, rGPR:$Rm, 0)>,
- Requires<[HasT2ExtractPack, IsThumb2]>;
+ Requires<[HasDSP, IsThumb2]>;
// Atomic load/store patterns
def : T2Pat<(atomic_load_8 t2addrmode_imm12:$addr),
@@ -4325,26 +4373,26 @@ def : t2InstAlias<"add${s}${p} $Rdn, $ShiftedRm",
pred:$p, cc_out:$s)>;
// add w/ negative immediates is just a sub.
-def : t2InstAlias<"add${s}${p} $Rd, $Rn, $imm",
+def : t2InstSubst<"add${s}${p} $Rd, $Rn, $imm",
(t2SUBri GPRnopc:$Rd, GPRnopc:$Rn, t2_so_imm_neg:$imm, pred:$p,
cc_out:$s)>;
-def : t2InstAlias<"add${p} $Rd, $Rn, $imm",
+def : t2InstSubst<"add${p} $Rd, $Rn, $imm",
(t2SUBri12 GPRnopc:$Rd, GPR:$Rn, imm0_4095_neg:$imm, pred:$p)>;
-def : t2InstAlias<"add${s}${p} $Rdn, $imm",
+def : t2InstSubst<"add${s}${p} $Rdn, $imm",
(t2SUBri GPRnopc:$Rdn, GPRnopc:$Rdn, t2_so_imm_neg:$imm, pred:$p,
cc_out:$s)>;
-def : t2InstAlias<"add${p} $Rdn, $imm",
+def : t2InstSubst<"add${p} $Rdn, $imm",
(t2SUBri12 GPRnopc:$Rdn, GPRnopc:$Rdn, imm0_4095_neg:$imm, pred:$p)>;
-def : t2InstAlias<"add${s}${p}.w $Rd, $Rn, $imm",
+def : t2InstSubst<"add${s}${p}.w $Rd, $Rn, $imm",
(t2SUBri GPRnopc:$Rd, GPRnopc:$Rn, t2_so_imm_neg:$imm, pred:$p,
cc_out:$s)>;
-def : t2InstAlias<"addw${p} $Rd, $Rn, $imm",
+def : t2InstSubst<"addw${p} $Rd, $Rn, $imm",
(t2SUBri12 GPRnopc:$Rd, GPR:$Rn, imm0_4095_neg:$imm, pred:$p)>;
-def : t2InstAlias<"add${s}${p}.w $Rdn, $imm",
+def : t2InstSubst<"add${s}${p}.w $Rdn, $imm",
(t2SUBri GPRnopc:$Rdn, GPRnopc:$Rdn, t2_so_imm_neg:$imm, pred:$p,
cc_out:$s)>;
-def : t2InstAlias<"addw${p} $Rdn, $imm",
+def : t2InstSubst<"addw${p} $Rdn, $imm",
(t2SUBri12 GPRnopc:$Rdn, GPRnopc:$Rdn, imm0_4095_neg:$imm, pred:$p)>;
@@ -4431,10 +4479,10 @@ def : t2InstAlias<"mvn${s}${p} $Rd, $ShiftedRm",
// input operands swapped when the shift amount is zero (i.e., unspecified).
def : InstAlias<"pkhbt${p} $Rd, $Rn, $Rm",
(t2PKHBT rGPR:$Rd, rGPR:$Rn, rGPR:$Rm, 0, pred:$p), 0>,
- Requires<[HasT2ExtractPack, IsThumb2]>;
+ Requires<[HasDSP, IsThumb2]>;
def : InstAlias<"pkhtb${p} $Rd, $Rn, $Rm",
(t2PKHBT rGPR:$Rd, rGPR:$Rm, rGPR:$Rn, 0, pred:$p), 0>,
- Requires<[HasT2ExtractPack, IsThumb2]>;
+ Requires<[HasDSP, IsThumb2]>;
// PUSH/POP aliases for STM/LDM
def : t2InstAlias<"push${p}.w $regs", (t2STMDB_UPD SP, pred:$p, reglist:$regs)>;
@@ -4513,16 +4561,16 @@ def : t2InstAlias<"strh${p} $Rt, $addr",
// Extend instruction optional rotate operand.
def : InstAlias<"sxtab${p} $Rd, $Rn, $Rm",
(t2SXTAB rGPR:$Rd, rGPR:$Rn, rGPR:$Rm, 0, pred:$p), 0>,
- Requires<[HasT2ExtractPack, IsThumb2]>;
+ Requires<[HasDSP, IsThumb2]>;
def : InstAlias<"sxtah${p} $Rd, $Rn, $Rm",
(t2SXTAH rGPR:$Rd, rGPR:$Rn, rGPR:$Rm, 0, pred:$p), 0>,
- Requires<[HasT2ExtractPack, IsThumb2]>;
+ Requires<[HasDSP, IsThumb2]>;
def : InstAlias<"sxtab16${p} $Rd, $Rn, $Rm",
(t2SXTAB16 rGPR:$Rd, rGPR:$Rn, rGPR:$Rm, 0, pred:$p), 0>,
- Requires<[HasT2ExtractPack, IsThumb2]>;
+ Requires<[HasDSP, IsThumb2]>;
def : InstAlias<"sxtb16${p} $Rd, $Rm",
(t2SXTB16 rGPR:$Rd, rGPR:$Rm, 0, pred:$p), 0>,
- Requires<[HasT2ExtractPack, IsThumb2]>;
+ Requires<[HasDSP, IsThumb2]>;
def : t2InstAlias<"sxtb${p} $Rd, $Rm",
(t2SXTB rGPR:$Rd, rGPR:$Rm, 0, pred:$p)>;
@@ -4535,16 +4583,16 @@ def : t2InstAlias<"sxth${p}.w $Rd, $Rm",
def : InstAlias<"uxtab${p} $Rd, $Rn, $Rm",
(t2UXTAB rGPR:$Rd, rGPR:$Rn, rGPR:$Rm, 0, pred:$p), 0>,
- Requires<[HasT2ExtractPack, IsThumb2]>;
+ Requires<[HasDSP, IsThumb2]>;
def : InstAlias<"uxtah${p} $Rd, $Rn, $Rm",
(t2UXTAH rGPR:$Rd, rGPR:$Rn, rGPR:$Rm, 0, pred:$p), 0>,
- Requires<[HasT2ExtractPack, IsThumb2]>;
+ Requires<[HasDSP, IsThumb2]>;
def : InstAlias<"uxtab16${p} $Rd, $Rn, $Rm",
(t2UXTAB16 rGPR:$Rd, rGPR:$Rn, rGPR:$Rm, 0, pred:$p), 0>,
- Requires<[HasT2ExtractPack, IsThumb2]>;
+ Requires<[HasDSP, IsThumb2]>;
def : InstAlias<"uxtb16${p} $Rd, $Rm",
(t2UXTB16 rGPR:$Rd, rGPR:$Rm, 0, pred:$p), 0>,
- Requires<[HasT2ExtractPack, IsThumb2]>;
+ Requires<[HasDSP, IsThumb2]>;
def : t2InstAlias<"uxtb${p} $Rd, $Rm",
(t2UXTB rGPR:$Rd, rGPR:$Rm, 0, pred:$p)>;
@@ -4560,7 +4608,7 @@ def : t2InstAlias<"uxtb${p} $Rd, $Rm$rot",
(t2UXTB rGPR:$Rd, rGPR:$Rm, rot_imm:$rot, pred:$p)>;
def : InstAlias<"uxtb16${p} $Rd, $Rm$rot",
(t2UXTB16 rGPR:$Rd, rGPR:$Rm, rot_imm:$rot, pred:$p), 0>,
- Requires<[HasT2ExtractPack, IsThumb2]>;
+ Requires<[HasDSP, IsThumb2]>;
def : t2InstAlias<"uxth${p} $Rd, $Rm$rot",
(t2UXTH rGPR:$Rd, rGPR:$Rm, rot_imm:$rot, pred:$p)>;
@@ -4568,41 +4616,41 @@ def : t2InstAlias<"sxtb${p} $Rd, $Rm$rot",
(t2SXTB rGPR:$Rd, rGPR:$Rm, rot_imm:$rot, pred:$p)>;
def : InstAlias<"sxtb16${p} $Rd, $Rm$rot",
(t2SXTB16 rGPR:$Rd, rGPR:$Rm, rot_imm:$rot, pred:$p), 0>,
- Requires<[HasT2ExtractPack, IsThumb2]>;
+ Requires<[HasDSP, IsThumb2]>;
def : t2InstAlias<"sxth${p} $Rd, $Rm$rot",
(t2SXTH rGPR:$Rd, rGPR:$Rm, rot_imm:$rot, pred:$p)>;
// "mov Rd, t2_so_imm_not" can be handled via "mvn" in assembly, just like
// for isel.
-def : t2InstAlias<"mov${p} $Rd, $imm",
+def : t2InstSubst<"mov${p} $Rd, $imm",
(t2MVNi rGPR:$Rd, t2_so_imm_not:$imm, pred:$p, zero_reg)>;
-def : t2InstAlias<"mvn${p} $Rd, $imm",
- (t2MOVi rGPR:$Rd, t2_so_imm_not:$imm, pred:$p, zero_reg)>;
+def : t2InstSubst<"mvn${s}${p} $Rd, $imm",
+ (t2MOVi rGPR:$Rd, t2_so_imm_not:$imm, pred:$p, s_cc_out:$s)>;
// Same for AND <--> BIC
-def : t2InstAlias<"bic${s}${p} $Rd, $Rn, $imm",
+def : t2InstSubst<"bic${s}${p} $Rd, $Rn, $imm",
(t2ANDri rGPR:$Rd, rGPR:$Rn, t2_so_imm_not:$imm,
pred:$p, cc_out:$s)>;
-def : t2InstAlias<"bic${s}${p} $Rdn, $imm",
+def : t2InstSubst<"bic${s}${p} $Rdn, $imm",
(t2ANDri rGPR:$Rdn, rGPR:$Rdn, t2_so_imm_not:$imm,
pred:$p, cc_out:$s)>;
-def : t2InstAlias<"and${s}${p} $Rd, $Rn, $imm",
+def : t2InstSubst<"and${s}${p} $Rd, $Rn, $imm",
(t2BICri rGPR:$Rd, rGPR:$Rn, t2_so_imm_not:$imm,
pred:$p, cc_out:$s)>;
-def : t2InstAlias<"and${s}${p} $Rdn, $imm",
+def : t2InstSubst<"and${s}${p} $Rdn, $imm",
(t2BICri rGPR:$Rdn, rGPR:$Rdn, t2_so_imm_not:$imm,
pred:$p, cc_out:$s)>;
// Likewise, "add Rd, t2_so_imm_neg" -> sub
-def : t2InstAlias<"add${s}${p} $Rd, $Rn, $imm",
+def : t2InstSubst<"add${s}${p} $Rd, $Rn, $imm",
(t2SUBri GPRnopc:$Rd, GPRnopc:$Rn, t2_so_imm_neg:$imm,
pred:$p, cc_out:$s)>;
-def : t2InstAlias<"add${s}${p} $Rd, $imm",
+def : t2InstSubst<"add${s}${p} $Rd, $imm",
(t2SUBri GPRnopc:$Rd, GPRnopc:$Rd, t2_so_imm_neg:$imm,
pred:$p, cc_out:$s)>;
// Same for CMP <--> CMN via t2_so_imm_neg
-def : t2InstAlias<"cmp${p} $Rd, $imm",
+def : t2InstSubst<"cmp${p} $Rd, $imm",
(t2CMNri rGPR:$Rd, t2_so_imm_neg:$imm, pred:$p)>;
-def : t2InstAlias<"cmn${p} $Rd, $imm",
+def : t2InstSubst<"cmn${p} $Rd, $imm",
(t2CMPri rGPR:$Rd, t2_so_imm_neg:$imm, pred:$p)>;
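So, illustratively, "cmp r0, #-2" should now assemble as "cmn r0, #2", and "cmn r0, #-2" as "cmp r0, #2".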
@@ -4616,6 +4664,8 @@ def : t2InstAlias<"neg${s}${p} $Rd, $Rm",
// MOV so_reg assembler pseudos. InstAlias isn't expressive enough for
// these, unfortunately.
+// FIXME: LSL #0 in the shift should allow SP to be used as either the
+// source or destination (but not both).
def t2MOVsi: t2AsmPseudo<"mov${p} $Rd, $shift",
(ins rGPR:$Rd, t2_so_reg:$shift, pred:$p)>;
def t2MOVSsi: t2AsmPseudo<"movs${p} $Rd, $shift",
diff --git a/lib/Target/ARM/ARMInstrVFP.td b/lib/Target/ARM/ARMInstrVFP.td
index e99048645685..0f225156d4ca 100644
--- a/lib/Target/ARM/ARMInstrVFP.td
+++ b/lib/Target/ARM/ARMInstrVFP.td
@@ -11,14 +11,17 @@
//
//===----------------------------------------------------------------------===//
-def SDT_CMPFP0 : SDTypeProfile<0, 1, [SDTCisFP<0>]>;
+def SDT_CMPFP0 : SDTypeProfile<0, 2, [SDTCisFP<0>, SDTCisVT<1, i32>]>;
def SDT_VMOVDRR : SDTypeProfile<1, 2, [SDTCisVT<0, f64>, SDTCisVT<1, i32>,
SDTCisSameAs<1, 2>]>;
+def SDT_VMOVRRD : SDTypeProfile<2, 1, [SDTCisVT<0, i32>, SDTCisSameAs<0, 1>,
+ SDTCisVT<2, f64>]>;
def arm_fmstat : SDNode<"ARMISD::FMSTAT", SDTNone, [SDNPInGlue, SDNPOutGlue]>;
-def arm_cmpfp : SDNode<"ARMISD::CMPFP", SDT_ARMCmp, [SDNPOutGlue]>;
+def arm_cmpfp : SDNode<"ARMISD::CMPFP", SDT_ARMFCmp, [SDNPOutGlue]>;
def arm_cmpfp0 : SDNode<"ARMISD::CMPFPw0", SDT_CMPFP0, [SDNPOutGlue]>;
def arm_fmdrr : SDNode<"ARMISD::VMOVDRR", SDT_VMOVDRR>;
+def arm_fmrrd : SDNode<"ARMISD::VMOVRRD", SDT_VMOVRRD>;
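Judging from the patterns updated below, the new i32 operand on CMPFP/CMPFPw0 selects between the exception-raising compare (vcmpe, i32 1) and the quiet compare (vcmp, i32 0), so both encodings can be selected from the same node.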
//===----------------------------------------------------------------------===//
// Operand Definitions.
@@ -336,13 +339,15 @@ let TwoOperandAliasConstraint = "$Dn = $Dd" in
def VADDD : ADbI<0b11100, 0b11, 0, 0,
(outs DPR:$Dd), (ins DPR:$Dn, DPR:$Dm),
IIC_fpALU64, "vadd", ".f64\t$Dd, $Dn, $Dm",
- [(set DPR:$Dd, (fadd DPR:$Dn, (f64 DPR:$Dm)))]>;
+ [(set DPR:$Dd, (fadd DPR:$Dn, (f64 DPR:$Dm)))]>,
+ Sched<[WriteFPALU64]>;
let TwoOperandAliasConstraint = "$Sn = $Sd" in
def VADDS : ASbIn<0b11100, 0b11, 0, 0,
(outs SPR:$Sd), (ins SPR:$Sn, SPR:$Sm),
IIC_fpALU32, "vadd", ".f32\t$Sd, $Sn, $Sm",
- [(set SPR:$Sd, (fadd SPR:$Sn, SPR:$Sm))]> {
+ [(set SPR:$Sd, (fadd SPR:$Sn, SPR:$Sm))]>,
+ Sched<[WriteFPALU32]> {
// Some single precision VFP instructions may be executed on both NEON and
// VFP pipelines on A8.
let D = VFPNeonA8Domain;
@@ -352,19 +357,22 @@ let TwoOperandAliasConstraint = "$Sn = $Sd" in
def VADDH : AHbI<0b11100, 0b11, 0, 0,
(outs SPR:$Sd), (ins SPR:$Sn, SPR:$Sm),
IIC_fpALU16, "vadd", ".f16\t$Sd, $Sn, $Sm",
- []>;
+ []>,
+ Sched<[WriteFPALU32]>;
let TwoOperandAliasConstraint = "$Dn = $Dd" in
def VSUBD : ADbI<0b11100, 0b11, 1, 0,
(outs DPR:$Dd), (ins DPR:$Dn, DPR:$Dm),
IIC_fpALU64, "vsub", ".f64\t$Dd, $Dn, $Dm",
- [(set DPR:$Dd, (fsub DPR:$Dn, (f64 DPR:$Dm)))]>;
+ [(set DPR:$Dd, (fsub DPR:$Dn, (f64 DPR:$Dm)))]>,
+ Sched<[WriteFPALU64]>;
let TwoOperandAliasConstraint = "$Sn = $Sd" in
def VSUBS : ASbIn<0b11100, 0b11, 1, 0,
(outs SPR:$Sd), (ins SPR:$Sn, SPR:$Sm),
IIC_fpALU32, "vsub", ".f32\t$Sd, $Sn, $Sm",
- [(set SPR:$Sd, (fsub SPR:$Sn, SPR:$Sm))]> {
+ [(set SPR:$Sd, (fsub SPR:$Sn, SPR:$Sm))]>,
+ Sched<[WriteFPALU32]> {
// Some single precision VFP instructions may be executed on both NEON and
// VFP pipelines on A8.
let D = VFPNeonA8Domain;
@@ -374,37 +382,43 @@ let TwoOperandAliasConstraint = "$Sn = $Sd" in
def VSUBH : AHbI<0b11100, 0b11, 1, 0,
(outs SPR:$Sd), (ins SPR:$Sn, SPR:$Sm),
IIC_fpALU16, "vsub", ".f16\t$Sd, $Sn, $Sm",
- []>;
+ []>,
+ Sched<[WriteFPALU32]>;
let TwoOperandAliasConstraint = "$Dn = $Dd" in
def VDIVD : ADbI<0b11101, 0b00, 0, 0,
(outs DPR:$Dd), (ins DPR:$Dn, DPR:$Dm),
IIC_fpDIV64, "vdiv", ".f64\t$Dd, $Dn, $Dm",
- [(set DPR:$Dd, (fdiv DPR:$Dn, (f64 DPR:$Dm)))]>;
+ [(set DPR:$Dd, (fdiv DPR:$Dn, (f64 DPR:$Dm)))]>,
+ Sched<[WriteFPDIV64]>;
let TwoOperandAliasConstraint = "$Sn = $Sd" in
def VDIVS : ASbI<0b11101, 0b00, 0, 0,
(outs SPR:$Sd), (ins SPR:$Sn, SPR:$Sm),
IIC_fpDIV32, "vdiv", ".f32\t$Sd, $Sn, $Sm",
- [(set SPR:$Sd, (fdiv SPR:$Sn, SPR:$Sm))]>;
+ [(set SPR:$Sd, (fdiv SPR:$Sn, SPR:$Sm))]>,
+ Sched<[WriteFPDIV32]>;
let TwoOperandAliasConstraint = "$Sn = $Sd" in
def VDIVH : AHbI<0b11101, 0b00, 0, 0,
(outs SPR:$Sd), (ins SPR:$Sn, SPR:$Sm),
IIC_fpDIV16, "vdiv", ".f16\t$Sd, $Sn, $Sm",
- []>;
+ []>,
+ Sched<[WriteFPDIV32]>;
let TwoOperandAliasConstraint = "$Dn = $Dd" in
def VMULD : ADbI<0b11100, 0b10, 0, 0,
(outs DPR:$Dd), (ins DPR:$Dn, DPR:$Dm),
IIC_fpMUL64, "vmul", ".f64\t$Dd, $Dn, $Dm",
- [(set DPR:$Dd, (fmul DPR:$Dn, (f64 DPR:$Dm)))]>;
+ [(set DPR:$Dd, (fmul DPR:$Dn, (f64 DPR:$Dm)))]>,
+ Sched<[WriteFPMUL64, ReadFPMUL, ReadFPMUL]>;
let TwoOperandAliasConstraint = "$Sn = $Sd" in
def VMULS : ASbIn<0b11100, 0b10, 0, 0,
(outs SPR:$Sd), (ins SPR:$Sn, SPR:$Sm),
IIC_fpMUL32, "vmul", ".f32\t$Sd, $Sn, $Sm",
- [(set SPR:$Sd, (fmul SPR:$Sn, SPR:$Sm))]> {
+ [(set SPR:$Sd, (fmul SPR:$Sn, SPR:$Sm))]>,
+ Sched<[WriteFPMUL32, ReadFPMUL, ReadFPMUL]> {
// Some single precision VFP instructions may be executed on both NEON and
// VFP pipelines on A8.
let D = VFPNeonA8Domain;
@@ -414,17 +428,20 @@ let TwoOperandAliasConstraint = "$Sn = $Sd" in
def VMULH : AHbI<0b11100, 0b10, 0, 0,
(outs SPR:$Sd), (ins SPR:$Sn, SPR:$Sm),
IIC_fpMUL16, "vmul", ".f16\t$Sd, $Sn, $Sm",
- []>;
+ []>,
+ Sched<[WriteFPMUL32, ReadFPMUL, ReadFPMUL]>;
def VNMULD : ADbI<0b11100, 0b10, 1, 0,
(outs DPR:$Dd), (ins DPR:$Dn, DPR:$Dm),
IIC_fpMUL64, "vnmul", ".f64\t$Dd, $Dn, $Dm",
- [(set DPR:$Dd, (fneg (fmul DPR:$Dn, (f64 DPR:$Dm))))]>;
+ [(set DPR:$Dd, (fneg (fmul DPR:$Dn, (f64 DPR:$Dm))))]>,
+ Sched<[WriteFPMUL64, ReadFPMUL, ReadFPMUL]>;
def VNMULS : ASbI<0b11100, 0b10, 1, 0,
(outs SPR:$Sd), (ins SPR:$Sn, SPR:$Sm),
IIC_fpMUL32, "vnmul", ".f32\t$Sd, $Sn, $Sm",
- [(set SPR:$Sd, (fneg (fmul SPR:$Sn, SPR:$Sm)))]> {
+ [(set SPR:$Sd, (fneg (fmul SPR:$Sn, SPR:$Sm)))]>,
+ Sched<[WriteFPMUL32, ReadFPMUL, ReadFPMUL]> {
// Some single precision VFP instructions may be executed on both NEON and
// VFP pipelines on A8.
let D = VFPNeonA8Domain;
@@ -433,7 +450,8 @@ def VNMULS : ASbI<0b11100, 0b10, 1, 0,
def VNMULH : AHbI<0b11100, 0b10, 1, 0,
(outs SPR:$Sd), (ins SPR:$Sn, SPR:$Sm),
IIC_fpMUL16, "vnmul", ".f16\t$Sd, $Sn, $Sm",
- []>;
+ []>,
+ Sched<[WriteFPMUL32, ReadFPMUL, ReadFPMUL]>;
multiclass vsel_inst<string op, bits<2> opc, int CC> {
let DecoderNamespace = "VFPV8", PostEncoderMethod = "",
@@ -501,12 +519,12 @@ let Defs = [FPSCR_NZCV] in {
def VCMPED : ADuI<0b11101, 0b11, 0b0100, 0b11, 0,
(outs), (ins DPR:$Dd, DPR:$Dm),
IIC_fpCMP64, "vcmpe", ".f64\t$Dd, $Dm",
- [(arm_cmpfp DPR:$Dd, (f64 DPR:$Dm))]>;
+ [(arm_cmpfp DPR:$Dd, (f64 DPR:$Dm), (i32 1))]>;
def VCMPES : ASuI<0b11101, 0b11, 0b0100, 0b11, 0,
(outs), (ins SPR:$Sd, SPR:$Sm),
IIC_fpCMP32, "vcmpe", ".f32\t$Sd, $Sm",
- [(arm_cmpfp SPR:$Sd, SPR:$Sm)]> {
+ [(arm_cmpfp SPR:$Sd, SPR:$Sm, (i32 1))]> {
// Some single precision VFP instructions may be executed on both NEON and
// VFP pipelines on A8.
let D = VFPNeonA8Domain;
@@ -517,17 +535,15 @@ def VCMPEH : AHuI<0b11101, 0b11, 0b0100, 0b11, 0,
IIC_fpCMP16, "vcmpe", ".f16\t$Sd, $Sm",
[]>;
-
-// FIXME: Verify encoding after integrated assembler is working.
def VCMPD : ADuI<0b11101, 0b11, 0b0100, 0b01, 0,
(outs), (ins DPR:$Dd, DPR:$Dm),
IIC_fpCMP64, "vcmp", ".f64\t$Dd, $Dm",
- [/* For disassembly only; pattern left blank */]>;
+ [(arm_cmpfp DPR:$Dd, (f64 DPR:$Dm), (i32 0))]>;
def VCMPS : ASuI<0b11101, 0b11, 0b0100, 0b01, 0,
(outs), (ins SPR:$Sd, SPR:$Sm),
IIC_fpCMP32, "vcmp", ".f32\t$Sd, $Sm",
- [/* For disassembly only; pattern left blank */]> {
+ [(arm_cmpfp SPR:$Sd, SPR:$Sm, (i32 0))]> {
// Some single precision VFP instructions may be executed on both NEON and
// VFP pipelines on A8.
let D = VFPNeonA8Domain;
@@ -566,7 +582,7 @@ let Defs = [FPSCR_NZCV] in {
def VCMPEZD : ADuI<0b11101, 0b11, 0b0101, 0b11, 0,
(outs), (ins DPR:$Dd),
IIC_fpCMP64, "vcmpe", ".f64\t$Dd, #0",
- [(arm_cmpfp0 (f64 DPR:$Dd))]> {
+ [(arm_cmpfp0 (f64 DPR:$Dd), (i32 1))]> {
let Inst{3-0} = 0b0000;
let Inst{5} = 0;
}
@@ -574,7 +590,7 @@ def VCMPEZD : ADuI<0b11101, 0b11, 0b0101, 0b11, 0,
def VCMPEZS : ASuI<0b11101, 0b11, 0b0101, 0b11, 0,
(outs), (ins SPR:$Sd),
IIC_fpCMP32, "vcmpe", ".f32\t$Sd, #0",
- [(arm_cmpfp0 SPR:$Sd)]> {
+ [(arm_cmpfp0 SPR:$Sd, (i32 1))]> {
let Inst{3-0} = 0b0000;
let Inst{5} = 0;
@@ -591,11 +607,10 @@ def VCMPEZH : AHuI<0b11101, 0b11, 0b0101, 0b11, 0,
let Inst{5} = 0;
}
-// FIXME: Verify encoding after integrated assembler is working.
def VCMPZD : ADuI<0b11101, 0b11, 0b0101, 0b01, 0,
(outs), (ins DPR:$Dd),
IIC_fpCMP64, "vcmp", ".f64\t$Dd, #0",
- [/* For disassembly only; pattern left blank */]> {
+ [(arm_cmpfp0 (f64 DPR:$Dd), (i32 0))]> {
let Inst{3-0} = 0b0000;
let Inst{5} = 0;
}
@@ -603,7 +618,7 @@ def VCMPZD : ADuI<0b11101, 0b11, 0b0101, 0b01, 0,
def VCMPZS : ASuI<0b11101, 0b11, 0b0101, 0b01, 0,
(outs), (ins SPR:$Sd),
IIC_fpCMP32, "vcmp", ".f32\t$Sd, #0",
- [/* For disassembly only; pattern left blank */]> {
+ [(arm_cmpfp0 SPR:$Sd, (i32 0))]> {
let Inst{3-0} = 0b0000;
let Inst{5} = 0;
@@ -624,7 +639,8 @@ def VCMPZH : AHuI<0b11101, 0b11, 0b0101, 0b01, 0,
def VCVTDS : ASuI<0b11101, 0b11, 0b0111, 0b11, 0,
(outs DPR:$Dd), (ins SPR:$Sm),
IIC_fpCVTDS, "vcvt", ".f64.f32\t$Dd, $Sm",
- [(set DPR:$Dd, (fpextend SPR:$Sm))]> {
+ [(set DPR:$Dd, (fpextend SPR:$Sm))]>,
+ Sched<[WriteFPCVT]> {
// Instruction operands.
bits<5> Dd;
bits<5> Sm;
@@ -641,7 +657,8 @@ def VCVTDS : ASuI<0b11101, 0b11, 0b0111, 0b11, 0,
// Special case encoding: bits 11-8 is 0b1011.
def VCVTSD : VFPAI<(outs SPR:$Sd), (ins DPR:$Dm), VFPUnaryFrm,
IIC_fpCVTSD, "vcvt", ".f32.f64\t$Sd, $Dm",
- [(set SPR:$Sd, (fpround DPR:$Dm))]> {
+ [(set SPR:$Sd, (fpround DPR:$Dm))]>,
+ Sched<[WriteFPCVT]> {
// Instruction operands.
bits<5> Sd;
bits<5> Dm;
@@ -663,31 +680,35 @@ def VCVTSD : VFPAI<(outs SPR:$Sd), (ins DPR:$Dm), VFPUnaryFrm,
// Between half, single and double-precision. For disassembly only.
-// FIXME: Verify encoding after integrated assembler is working.
def VCVTBHS: ASuI<0b11101, 0b11, 0b0010, 0b01, 0, (outs SPR:$Sd), (ins SPR:$Sm),
/* FIXME */ IIC_fpCVTSH, "vcvtb", ".f32.f16\t$Sd, $Sm",
[/* For disassembly only; pattern left blank */]>,
- Requires<[HasFP16]>;
+ Requires<[HasFP16]>,
+ Sched<[WriteFPCVT]>;
def VCVTBSH: ASuI<0b11101, 0b11, 0b0011, 0b01, 0, (outs SPR:$Sd), (ins SPR:$Sm),
/* FIXME */ IIC_fpCVTHS, "vcvtb", ".f16.f32\t$Sd, $Sm",
[/* For disassembly only; pattern left blank */]>,
- Requires<[HasFP16]>;
+ Requires<[HasFP16]>,
+ Sched<[WriteFPCVT]>;
def VCVTTHS: ASuI<0b11101, 0b11, 0b0010, 0b11, 0, (outs SPR:$Sd), (ins SPR:$Sm),
/* FIXME */ IIC_fpCVTSH, "vcvtt", ".f32.f16\t$Sd, $Sm",
[/* For disassembly only; pattern left blank */]>,
- Requires<[HasFP16]>;
+ Requires<[HasFP16]>,
+ Sched<[WriteFPCVT]>;
def VCVTTSH: ASuI<0b11101, 0b11, 0b0011, 0b11, 0, (outs SPR:$Sd), (ins SPR:$Sm),
/* FIXME */ IIC_fpCVTHS, "vcvtt", ".f16.f32\t$Sd, $Sm",
[/* For disassembly only; pattern left blank */]>,
- Requires<[HasFP16]>;
+ Requires<[HasFP16]>,
+ Sched<[WriteFPCVT]>;
def VCVTBHD : ADuI<0b11101, 0b11, 0b0010, 0b01, 0,
(outs DPR:$Dd), (ins SPR:$Sm),
NoItinerary, "vcvtb", ".f64.f16\t$Dd, $Sm",
- []>, Requires<[HasFPARMv8, HasDPVFP]> {
+ []>, Requires<[HasFPARMv8, HasDPVFP]>,
+ Sched<[WriteFPCVT]> {
// Instruction operands.
bits<5> Sm;
@@ -946,12 +967,14 @@ defm VRINTM : vrint_inst_anpm<"m", 0b11, ffloor>;
def VSQRTD : ADuI<0b11101, 0b11, 0b0001, 0b11, 0,
(outs DPR:$Dd), (ins DPR:$Dm),
IIC_fpSQRT64, "vsqrt", ".f64\t$Dd, $Dm",
- [(set DPR:$Dd, (fsqrt (f64 DPR:$Dm)))]>;
+ [(set DPR:$Dd, (fsqrt (f64 DPR:$Dm)))]>,
+ Sched<[WriteFPSQRT64]>;
def VSQRTS : ASuI<0b11101, 0b11, 0b0001, 0b11, 0,
(outs SPR:$Sd), (ins SPR:$Sm),
IIC_fpSQRT32, "vsqrt", ".f32\t$Sd, $Sm",
- [(set SPR:$Sd, (fsqrt SPR:$Sm))]>;
+ [(set SPR:$Sd, (fsqrt SPR:$Sm))]>,
+ Sched<[WriteFPSQRT32]>;
def VSQRTH : AHuI<0b11101, 0b11, 0b0001, 0b11, 0,
(outs SPR:$Sd), (ins SPR:$Sm),
@@ -987,7 +1010,8 @@ def VINSH : ASuInp<0b11101, 0b11, 0b0000, 0b11, 0,
def VMOVRS : AVConv2I<0b11100001, 0b1010,
(outs GPR:$Rt), (ins SPR:$Sn),
IIC_fpMOVSI, "vmov", "\t$Rt, $Sn",
- [(set GPR:$Rt, (bitconvert SPR:$Sn))]> {
+ [(set GPR:$Rt, (bitconvert SPR:$Sn))]>,
+ Sched<[WriteFPMOV]> {
// Instruction operands.
bits<4> Rt;
bits<5> Sn;
@@ -1010,7 +1034,8 @@ def VMOVSR : AVConv4I<0b11100000, 0b1010,
(outs SPR:$Sn), (ins GPR:$Rt),
IIC_fpMOVIS, "vmov", "\t$Sn, $Rt",
[(set SPR:$Sn, (bitconvert GPR:$Rt))]>,
- Requires<[HasVFP2, UseVMOVSR]> {
+ Requires<[HasVFP2, UseVMOVSR]>,
+ Sched<[WriteFPMOV]> {
// Instruction operands.
bits<5> Sn;
bits<4> Rt;
@@ -1032,7 +1057,8 @@ let hasSideEffects = 0 in {
def VMOVRRD : AVConv3I<0b11000101, 0b1011,
(outs GPR:$Rt, GPR:$Rt2), (ins DPR:$Dm),
IIC_fpMOVDI, "vmov", "\t$Rt, $Rt2, $Dm",
- [/* FIXME: Can't write pattern for multiple result instr*/]> {
+ [(set GPR:$Rt, GPR:$Rt2, (arm_fmrrd DPR:$Dm))]>,
+ Sched<[WriteFPMOV]> {
// Instruction operands.
bits<5> Dm;
bits<4> Rt;
@@ -1059,7 +1085,8 @@ def VMOVRRD : AVConv3I<0b11000101, 0b1011,
def VMOVRRS : AVConv3I<0b11000101, 0b1010,
(outs GPR:$Rt, GPR:$Rt2), (ins SPR:$src1, SPR:$src2),
IIC_fpMOVDI, "vmov", "\t$Rt, $Rt2, $src1, $src2",
- [/* For disassembly only; pattern left blank */]> {
+ [/* For disassembly only; pattern left blank */]>,
+ Sched<[WriteFPMOV]> {
bits<5> src1;
bits<4> Rt;
bits<4> Rt2;
@@ -1085,7 +1112,8 @@ def VMOVRRS : AVConv3I<0b11000101, 0b1010,
def VMOVDRR : AVConv5I<0b11000100, 0b1011,
(outs DPR:$Dm), (ins GPR:$Rt, GPR:$Rt2),
IIC_fpMOVID, "vmov", "\t$Dm, $Rt, $Rt2",
- [(set DPR:$Dm, (arm_fmdrr GPR:$Rt, GPR:$Rt2))]> {
+ [(set DPR:$Dm, (arm_fmdrr GPR:$Rt, GPR:$Rt2))]>,
+ Sched<[WriteFPMOV]> {
// Instruction operands.
bits<5> Dm;
bits<4> Rt;
@@ -1128,7 +1156,8 @@ let hasSideEffects = 0 in
def VMOVSRR : AVConv5I<0b11000100, 0b1010,
(outs SPR:$dst1, SPR:$dst2), (ins GPR:$src1, GPR:$src2),
IIC_fpMOVID, "vmov", "\t$dst1, $dst2, $src1, $src2",
- [/* For disassembly only; pattern left blank */]> {
+ [/* For disassembly only; pattern left blank */]>,
+ Sched<[WriteFPMOV]> {
// Instruction operands.
bits<5> dst1;
bits<4> src1;
@@ -1154,7 +1183,8 @@ def VMOVRH : AVConv2I<0b11100001, 0b1001,
(outs GPR:$Rt), (ins SPR:$Sn),
IIC_fpMOVSI, "vmov", ".f16\t$Rt, $Sn",
[]>,
- Requires<[HasFullFP16]> {
+ Requires<[HasFullFP16]>,
+ Sched<[WriteFPMOV]> {
// Instruction operands.
bits<4> Rt;
bits<5> Sn;
@@ -1173,7 +1203,8 @@ def VMOVHR : AVConv4I<0b11100000, 0b1001,
(outs SPR:$Sn), (ins GPR:$Rt),
IIC_fpMOVIS, "vmov", ".f16\t$Sn, $Rt",
[]>,
- Requires<[HasFullFP16]> {
+ Requires<[HasFullFP16]>,
+ Sched<[WriteFPMOV]> {
// Instruction operands.
bits<5> Sn;
bits<4> Rt;
@@ -1254,7 +1285,8 @@ class AVConv1IHs_Encode<bits<5> opcod1, bits<2> opcod2, bits<4> opcod3,
def VSITOD : AVConv1IDs_Encode<0b11101, 0b11, 0b1000, 0b1011,
(outs DPR:$Dd), (ins SPR:$Sm),
IIC_fpCVTID, "vcvt", ".f64.s32\t$Dd, $Sm",
- []> {
+ []>,
+ Sched<[WriteFPCVT]> {
let Inst{7} = 1; // s32
}
@@ -1269,7 +1301,8 @@ let Predicates=[HasVFP2, HasDPVFP] in {
def VSITOS : AVConv1InSs_Encode<0b11101, 0b11, 0b1000, 0b1010,
(outs SPR:$Sd),(ins SPR:$Sm),
IIC_fpCVTIS, "vcvt", ".f32.s32\t$Sd, $Sm",
- []> {
+ []>,
+ Sched<[WriteFPCVT]> {
let Inst{7} = 1; // s32
// Some single precision VFP instructions may be executed on both NEON and
@@ -1286,14 +1319,16 @@ def : VFPNoNEONPat<(f32 (sint_to_fp (i32 (alignedload32 addrmode5:$a)))),
def VSITOH : AVConv1IHs_Encode<0b11101, 0b11, 0b1000, 0b1001,
(outs SPR:$Sd), (ins SPR:$Sm),
IIC_fpCVTIH, "vcvt", ".f16.s32\t$Sd, $Sm",
- []> {
+ []>,
+ Sched<[WriteFPCVT]> {
let Inst{7} = 1; // s32
}
def VUITOD : AVConv1IDs_Encode<0b11101, 0b11, 0b1000, 0b1011,
(outs DPR:$Dd), (ins SPR:$Sm),
IIC_fpCVTID, "vcvt", ".f64.u32\t$Dd, $Sm",
- []> {
+ []>,
+ Sched<[WriteFPCVT]> {
let Inst{7} = 0; // u32
}
@@ -1308,7 +1343,8 @@ let Predicates=[HasVFP2, HasDPVFP] in {
def VUITOS : AVConv1InSs_Encode<0b11101, 0b11, 0b1000, 0b1010,
(outs SPR:$Sd), (ins SPR:$Sm),
IIC_fpCVTIS, "vcvt", ".f32.u32\t$Sd, $Sm",
- []> {
+ []>,
+ Sched<[WriteFPCVT]> {
let Inst{7} = 0; // u32
// Some single precision VFP instructions may be executed on both NEON and
@@ -1325,7 +1361,8 @@ def : VFPNoNEONPat<(f32 (uint_to_fp (i32 (alignedload32 addrmode5:$a)))),
def VUITOH : AVConv1IHs_Encode<0b11101, 0b11, 0b1000, 0b1001,
(outs SPR:$Sd), (ins SPR:$Sm),
IIC_fpCVTIH, "vcvt", ".f16.u32\t$Sd, $Sm",
- []> {
+ []>,
+ Sched<[WriteFPCVT]> {
let Inst{7} = 0; // u32
}
@@ -1390,7 +1427,8 @@ class AVConv1IsH_Encode<bits<5> opcod1, bits<2> opcod2, bits<4> opcod3,
def VTOSIZD : AVConv1IsD_Encode<0b11101, 0b11, 0b1101, 0b1011,
(outs SPR:$Sd), (ins DPR:$Dm),
IIC_fpCVTDI, "vcvt", ".s32.f64\t$Sd, $Dm",
- []> {
+ []>,
+ Sched<[WriteFPCVT]> {
let Inst{7} = 1; // Z bit
}
@@ -1405,7 +1443,8 @@ let Predicates=[HasVFP2, HasDPVFP] in {
def VTOSIZS : AVConv1InsS_Encode<0b11101, 0b11, 0b1101, 0b1010,
(outs SPR:$Sd), (ins SPR:$Sm),
IIC_fpCVTSI, "vcvt", ".s32.f32\t$Sd, $Sm",
- []> {
+ []>,
+ Sched<[WriteFPCVT]> {
let Inst{7} = 1; // Z bit
// Some single precision VFP instructions may be executed on both NEON and
@@ -1423,14 +1462,16 @@ def : VFPNoNEONPat<(alignedstore32 (i32 (fp_to_sint (f32 SPR:$a))),
def VTOSIZH : AVConv1IsH_Encode<0b11101, 0b11, 0b1101, 0b1001,
(outs SPR:$Sd), (ins SPR:$Sm),
IIC_fpCVTHI, "vcvt", ".s32.f16\t$Sd, $Sm",
- []> {
+ []>,
+ Sched<[WriteFPCVT]> {
let Inst{7} = 1; // Z bit
}
def VTOUIZD : AVConv1IsD_Encode<0b11101, 0b11, 0b1100, 0b1011,
(outs SPR:$Sd), (ins DPR:$Dm),
IIC_fpCVTDI, "vcvt", ".u32.f64\t$Sd, $Dm",
- []> {
+ []>,
+ Sched<[WriteFPCVT]> {
let Inst{7} = 1; // Z bit
}
@@ -1445,7 +1486,8 @@ let Predicates=[HasVFP2, HasDPVFP] in {
def VTOUIZS : AVConv1InsS_Encode<0b11101, 0b11, 0b1100, 0b1010,
(outs SPR:$Sd), (ins SPR:$Sm),
IIC_fpCVTSI, "vcvt", ".u32.f32\t$Sd, $Sm",
- []> {
+ []>,
+ Sched<[WriteFPCVT]> {
let Inst{7} = 1; // Z bit
// Some single precision VFP instructions may be executed on both NEON and
@@ -1463,52 +1505,58 @@ def : VFPNoNEONPat<(alignedstore32 (i32 (fp_to_uint (f32 SPR:$a))),
def VTOUIZH : AVConv1IsH_Encode<0b11101, 0b11, 0b1100, 0b1001,
(outs SPR:$Sd), (ins SPR:$Sm),
IIC_fpCVTHI, "vcvt", ".u32.f16\t$Sd, $Sm",
- []> {
+ []>,
+ Sched<[WriteFPCVT]> {
let Inst{7} = 1; // Z bit
}
// And the Z bit '0' variants, i.e. use the rounding mode specified by FPSCR.
let Uses = [FPSCR] in {
-// FIXME: Verify encoding after integrated assembler is working.
def VTOSIRD : AVConv1IsD_Encode<0b11101, 0b11, 0b1101, 0b1011,
(outs SPR:$Sd), (ins DPR:$Dm),
IIC_fpCVTDI, "vcvtr", ".s32.f64\t$Sd, $Dm",
- [(set SPR:$Sd, (int_arm_vcvtr (f64 DPR:$Dm)))]>{
+ [(set SPR:$Sd, (int_arm_vcvtr (f64 DPR:$Dm)))]>,
+ Sched<[WriteFPCVT]> {
let Inst{7} = 0; // Z bit
}
def VTOSIRS : AVConv1InsS_Encode<0b11101, 0b11, 0b1101, 0b1010,
(outs SPR:$Sd), (ins SPR:$Sm),
IIC_fpCVTSI, "vcvtr", ".s32.f32\t$Sd, $Sm",
- [(set SPR:$Sd, (int_arm_vcvtr SPR:$Sm))]> {
+ [(set SPR:$Sd, (int_arm_vcvtr SPR:$Sm))]>,
+ Sched<[WriteFPCVT]> {
let Inst{7} = 0; // Z bit
}
def VTOSIRH : AVConv1IsH_Encode<0b11101, 0b11, 0b1101, 0b1001,
(outs SPR:$Sd), (ins SPR:$Sm),
IIC_fpCVTHI, "vcvtr", ".s32.f16\t$Sd, $Sm",
- []> {
+ []>,
+ Sched<[WriteFPCVT]> {
let Inst{7} = 0; // Z bit
}
def VTOUIRD : AVConv1IsD_Encode<0b11101, 0b11, 0b1100, 0b1011,
(outs SPR:$Sd), (ins DPR:$Dm),
IIC_fpCVTDI, "vcvtr", ".u32.f64\t$Sd, $Dm",
- [(set SPR:$Sd, (int_arm_vcvtru(f64 DPR:$Dm)))]>{
+ [(set SPR:$Sd, (int_arm_vcvtru (f64 DPR:$Dm)))]>,
+ Sched<[WriteFPCVT]> {
let Inst{7} = 0; // Z bit
}
def VTOUIRS : AVConv1InsS_Encode<0b11101, 0b11, 0b1100, 0b1010,
(outs SPR:$Sd), (ins SPR:$Sm),
IIC_fpCVTSI, "vcvtr", ".u32.f32\t$Sd, $Sm",
- [(set SPR:$Sd, (int_arm_vcvtru SPR:$Sm))]> {
+ [(set SPR:$Sd, (int_arm_vcvtru SPR:$Sm))]>,
+ Sched<[WriteFPCVT]> {
let Inst{7} = 0; // Z bit
}
def VTOUIRH : AVConv1IsH_Encode<0b11101, 0b11, 0b1100, 0b1001,
(outs SPR:$Sd), (ins SPR:$Sm),
IIC_fpCVTHI, "vcvtr", ".u32.f16\t$Sd, $Sm",
- []> {
+ []>,
+ Sched<[WriteFPCVT]> {
let Inst{7} = 0; // Z bit
}
}
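// A minimal sketch (hypothetical resource name and latency, not part of this
// patch) of how a subtarget schedule model would bind the WriteFPCVT class
// used above to concrete hardware:
//
//   def HypoFPConvUnit : ProcResource<1>;
//   def : WriteRes<WriteFPCVT, [HypoFPConvUnit]> { let Latency = 5; }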
@@ -1528,8 +1576,7 @@ let Constraints = "$a = $dst" in {
class AVConv1XInsS_Encode<bits<5> op1, bits<2> op2, bits<4> op3, bits<4> op4,
bit op5, dag oops, dag iops, InstrItinClass itin,
string opc, string asm, list<dag> pattern>
- : AVConv1XI<op1, op2, op3, op4, op5, oops, iops, itin, opc, asm, pattern>,
- Sched<[WriteCvtFP]> {
+ : AVConv1XI<op1, op2, op3, op4, op5, oops, iops, itin, opc, asm, pattern> {
bits<5> dst;
// if dp_operation then UInt(D:Vd) else UInt(Vd:D);
let Inst{22} = dst{0};
@@ -1540,8 +1587,7 @@ class AVConv1XInsS_Encode<bits<5> op1, bits<2> op2, bits<4> op3, bits<4> op4,
class AVConv1XInsD_Encode<bits<5> op1, bits<2> op2, bits<4> op3, bits<4> op4,
bit op5, dag oops, dag iops, InstrItinClass itin,
string opc, string asm, list<dag> pattern>
- : AVConv1XI<op1, op2, op3, op4, op5, oops, iops, itin, opc, asm, pattern>,
- Sched<[WriteCvtFP]> {
+ : AVConv1XI<op1, op2, op3, op4, op5, oops, iops, itin, opc, asm, pattern> {
bits<5> dst;
// if dp_operation then UInt(D:Vd) else UInt(Vd:D);
let Inst{22} = dst{4};
@@ -1553,26 +1599,31 @@ class AVConv1XInsD_Encode<bits<5> op1, bits<2> op2, bits<4> op3, bits<4> op4,
def VTOSHH : AVConv1XInsS_Encode<0b11101, 0b11, 0b1110, 0b1001, 0,
(outs SPR:$dst), (ins SPR:$a, fbits16:$fbits),
IIC_fpCVTHI, "vcvt", ".s16.f16\t$dst, $a, $fbits", []>,
- Requires<[HasFullFP16]>;
+ Requires<[HasFullFP16]>,
+ Sched<[WriteFPCVT]>;
def VTOUHH : AVConv1XInsS_Encode<0b11101, 0b11, 0b1111, 0b1001, 0,
(outs SPR:$dst), (ins SPR:$a, fbits16:$fbits),
IIC_fpCVTHI, "vcvt", ".u16.f16\t$dst, $a, $fbits", []>,
- Requires<[HasFullFP16]>;
+ Requires<[HasFullFP16]>,
+ Sched<[WriteFPCVT]>;
def VTOSLH : AVConv1XInsS_Encode<0b11101, 0b11, 0b1110, 0b1001, 1,
(outs SPR:$dst), (ins SPR:$a, fbits32:$fbits),
IIC_fpCVTHI, "vcvt", ".s32.f16\t$dst, $a, $fbits", []>,
- Requires<[HasFullFP16]>;
+ Requires<[HasFullFP16]>,
+ Sched<[WriteFPCVT]>;
def VTOULH : AVConv1XInsS_Encode<0b11101, 0b11, 0b1111, 0b1001, 1,
(outs SPR:$dst), (ins SPR:$a, fbits32:$fbits),
IIC_fpCVTHI, "vcvt", ".u32.f16\t$dst, $a, $fbits", []>,
- Requires<[HasFullFP16]>;
+ Requires<[HasFullFP16]>,
+ Sched<[WriteFPCVT]>;
def VTOSHS : AVConv1XInsS_Encode<0b11101, 0b11, 0b1110, 0b1010, 0,
(outs SPR:$dst), (ins SPR:$a, fbits16:$fbits),
- IIC_fpCVTSI, "vcvt", ".s16.f32\t$dst, $a, $fbits", []> {
+ IIC_fpCVTSI, "vcvt", ".s16.f32\t$dst, $a, $fbits", []>,
+ Sched<[WriteFPCVT]> {
// Some single precision VFP instructions may be executed on both NEON and
// VFP pipelines on A8.
let D = VFPNeonA8Domain;
@@ -1604,45 +1655,54 @@ def VTOULS : AVConv1XInsS_Encode<0b11101, 0b11, 0b1111, 0b1010, 1,
def VTOSHD : AVConv1XInsD_Encode<0b11101, 0b11, 0b1110, 0b1011, 0,
(outs DPR:$dst), (ins DPR:$a, fbits16:$fbits),
- IIC_fpCVTDI, "vcvt", ".s16.f64\t$dst, $a, $fbits", []>;
+ IIC_fpCVTDI, "vcvt", ".s16.f64\t$dst, $a, $fbits", []>,
+ Sched<[WriteFPCVT]>;
def VTOUHD : AVConv1XInsD_Encode<0b11101, 0b11, 0b1111, 0b1011, 0,
(outs DPR:$dst), (ins DPR:$a, fbits16:$fbits),
- IIC_fpCVTDI, "vcvt", ".u16.f64\t$dst, $a, $fbits", []>;
+ IIC_fpCVTDI, "vcvt", ".u16.f64\t$dst, $a, $fbits", []>,
+ Sched<[WriteFPCVT]>;
def VTOSLD : AVConv1XInsD_Encode<0b11101, 0b11, 0b1110, 0b1011, 1,
(outs DPR:$dst), (ins DPR:$a, fbits32:$fbits),
- IIC_fpCVTDI, "vcvt", ".s32.f64\t$dst, $a, $fbits", []>;
+ IIC_fpCVTDI, "vcvt", ".s32.f64\t$dst, $a, $fbits", []>,
+ Sched<[WriteFPCVT]>;
def VTOULD : AVConv1XInsD_Encode<0b11101, 0b11, 0b1111, 0b1011, 1,
(outs DPR:$dst), (ins DPR:$a, fbits32:$fbits),
- IIC_fpCVTDI, "vcvt", ".u32.f64\t$dst, $a, $fbits", []>;
+ IIC_fpCVTDI, "vcvt", ".u32.f64\t$dst, $a, $fbits", []>,
+ Sched<[WriteFPCVT]>;
// Fixed-Point to FP:
def VSHTOH : AVConv1XInsS_Encode<0b11101, 0b11, 0b1010, 0b1001, 0,
(outs SPR:$dst), (ins SPR:$a, fbits16:$fbits),
IIC_fpCVTIH, "vcvt", ".f16.s16\t$dst, $a, $fbits", []>,
- Requires<[HasFullFP16]>;
+ Requires<[HasFullFP16]>,
+ Sched<[WriteFPCVT]>;
def VUHTOH : AVConv1XInsS_Encode<0b11101, 0b11, 0b1011, 0b1001, 0,
(outs SPR:$dst), (ins SPR:$a, fbits16:$fbits),
IIC_fpCVTIH, "vcvt", ".f16.u16\t$dst, $a, $fbits", []>,
- Requires<[HasFullFP16]>;
+ Requires<[HasFullFP16]>,
+ Sched<[WriteFPCVT]>;
def VSLTOH : AVConv1XInsS_Encode<0b11101, 0b11, 0b1010, 0b1001, 1,
(outs SPR:$dst), (ins SPR:$a, fbits32:$fbits),
IIC_fpCVTIH, "vcvt", ".f16.s32\t$dst, $a, $fbits", []>,
- Requires<[HasFullFP16]>;
+ Requires<[HasFullFP16]>,
+ Sched<[WriteFPCVT]>;
def VULTOH : AVConv1XInsS_Encode<0b11101, 0b11, 0b1011, 0b1001, 1,
(outs SPR:$dst), (ins SPR:$a, fbits32:$fbits),
IIC_fpCVTIH, "vcvt", ".f16.u32\t$dst, $a, $fbits", []>,
- Requires<[HasFullFP16]>;
+ Requires<[HasFullFP16]>,
+ Sched<[WriteFPCVT]>;
def VSHTOS : AVConv1XInsS_Encode<0b11101, 0b11, 0b1010, 0b1010, 0,
(outs SPR:$dst), (ins SPR:$a, fbits16:$fbits),
- IIC_fpCVTIS, "vcvt", ".f32.s16\t$dst, $a, $fbits", []> {
+ IIC_fpCVTIS, "vcvt", ".f32.s16\t$dst, $a, $fbits", []>,
+ Sched<[WriteFPCVT]> {
// Some single precision VFP instructions may be executed on both NEON and
// VFP pipelines on A8.
let D = VFPNeonA8Domain;
@@ -1650,7 +1710,8 @@ def VSHTOS : AVConv1XInsS_Encode<0b11101, 0b11, 0b1010, 0b1010, 0,
def VUHTOS : AVConv1XInsS_Encode<0b11101, 0b11, 0b1011, 0b1010, 0,
(outs SPR:$dst), (ins SPR:$a, fbits16:$fbits),
- IIC_fpCVTIS, "vcvt", ".f32.u16\t$dst, $a, $fbits", []> {
+ IIC_fpCVTIS, "vcvt", ".f32.u16\t$dst, $a, $fbits", []>,
+ Sched<[WriteFPCVT]> {
// Some single precision VFP instructions may be executed on both NEON and
// VFP pipelines on A8.
let D = VFPNeonA8Domain;
@@ -1658,7 +1719,8 @@ def VUHTOS : AVConv1XInsS_Encode<0b11101, 0b11, 0b1011, 0b1010, 0,
def VSLTOS : AVConv1XInsS_Encode<0b11101, 0b11, 0b1010, 0b1010, 1,
(outs SPR:$dst), (ins SPR:$a, fbits32:$fbits),
- IIC_fpCVTIS, "vcvt", ".f32.s32\t$dst, $a, $fbits", []> {
+ IIC_fpCVTIS, "vcvt", ".f32.s32\t$dst, $a, $fbits", []>,
+ Sched<[WriteFPCVT]> {
// Some single precision VFP instructions may be executed on both NEON and
// VFP pipelines on A8.
let D = VFPNeonA8Domain;
@@ -1666,7 +1728,8 @@ def VSLTOS : AVConv1XInsS_Encode<0b11101, 0b11, 0b1010, 0b1010, 1,
def VULTOS : AVConv1XInsS_Encode<0b11101, 0b11, 0b1011, 0b1010, 1,
(outs SPR:$dst), (ins SPR:$a, fbits32:$fbits),
- IIC_fpCVTIS, "vcvt", ".f32.u32\t$dst, $a, $fbits", []> {
+ IIC_fpCVTIS, "vcvt", ".f32.u32\t$dst, $a, $fbits", []>,
+ Sched<[WriteFPCVT]> {
// Some single precision VFP instructions may be executed on both NEON and
// VFP pipelines on A8.
let D = VFPNeonA8Domain;
@@ -1674,19 +1737,23 @@ def VULTOS : AVConv1XInsS_Encode<0b11101, 0b11, 0b1011, 0b1010, 1,
def VSHTOD : AVConv1XInsD_Encode<0b11101, 0b11, 0b1010, 0b1011, 0,
(outs DPR:$dst), (ins DPR:$a, fbits16:$fbits),
- IIC_fpCVTID, "vcvt", ".f64.s16\t$dst, $a, $fbits", []>;
+ IIC_fpCVTID, "vcvt", ".f64.s16\t$dst, $a, $fbits", []>,
+ Sched<[WriteFPCVT]>;
def VUHTOD : AVConv1XInsD_Encode<0b11101, 0b11, 0b1011, 0b1011, 0,
(outs DPR:$dst), (ins DPR:$a, fbits16:$fbits),
- IIC_fpCVTID, "vcvt", ".f64.u16\t$dst, $a, $fbits", []>;
+ IIC_fpCVTID, "vcvt", ".f64.u16\t$dst, $a, $fbits", []>,
+ Sched<[WriteFPCVT]>;
def VSLTOD : AVConv1XInsD_Encode<0b11101, 0b11, 0b1010, 0b1011, 1,
(outs DPR:$dst), (ins DPR:$a, fbits32:$fbits),
- IIC_fpCVTID, "vcvt", ".f64.s32\t$dst, $a, $fbits", []>;
+ IIC_fpCVTID, "vcvt", ".f64.s32\t$dst, $a, $fbits", []>,
+ Sched<[WriteFPCVT]>;
def VULTOD : AVConv1XInsD_Encode<0b11101, 0b11, 0b1011, 0b1011, 1,
(outs DPR:$dst), (ins DPR:$a, fbits32:$fbits),
- IIC_fpCVTID, "vcvt", ".f64.u32\t$dst, $a, $fbits", []>;
+ IIC_fpCVTID, "vcvt", ".f64.u32\t$dst, $a, $fbits", []>,
+ Sched<[WriteFPCVT]>;
} // End of 'let Constraints = "$a = $dst" in'
@@ -1700,7 +1767,8 @@ def VMLAD : ADbI<0b11100, 0b00, 0, 0,
[(set DPR:$Dd, (fadd_mlx (fmul_su DPR:$Dn, DPR:$Dm),
(f64 DPR:$Ddin)))]>,
RegConstraint<"$Ddin = $Dd">,
- Requires<[HasVFP2,HasDPVFP,UseFPVMLx,DontUseFusedMAC]>;
+ Requires<[HasVFP2,HasDPVFP,UseFPVMLx,DontUseFusedMAC]>,
+ Sched<[WriteFPMAC64, ReadFPMAC, ReadFPMUL, ReadFPMUL]>;
def VMLAS : ASbIn<0b11100, 0b00, 0, 0,
(outs SPR:$Sd), (ins SPR:$Sdin, SPR:$Sn, SPR:$Sm),
@@ -1708,7 +1776,8 @@ def VMLAS : ASbIn<0b11100, 0b00, 0, 0,
[(set SPR:$Sd, (fadd_mlx (fmul_su SPR:$Sn, SPR:$Sm),
SPR:$Sdin))]>,
RegConstraint<"$Sdin = $Sd">,
- Requires<[HasVFP2,DontUseNEONForFP,UseFPVMLx,DontUseFusedMAC]> {
+ Requires<[HasVFP2,DontUseNEONForFP,UseFPVMLx,DontUseFusedMAC]>,
+ Sched<[WriteFPMAC32, ReadFPMAC, ReadFPMUL, ReadFPMUL]> {
// Some single precision VFP instructions may be executed on both NEON and
// VFP pipelines on A8.
let D = VFPNeonA8Domain;
@@ -1734,7 +1803,8 @@ def VMLSD : ADbI<0b11100, 0b00, 1, 0,
[(set DPR:$Dd, (fadd_mlx (fneg (fmul_su DPR:$Dn,DPR:$Dm)),
(f64 DPR:$Ddin)))]>,
RegConstraint<"$Ddin = $Dd">,
- Requires<[HasVFP2,HasDPVFP,UseFPVMLx,DontUseFusedMAC]>;
+ Requires<[HasVFP2,HasDPVFP,UseFPVMLx,DontUseFusedMAC]>,
+ Sched<[WriteFPMAC64, ReadFPMAC, ReadFPMUL, ReadFPMUL]>;
def VMLSS : ASbIn<0b11100, 0b00, 1, 0,
(outs SPR:$Sd), (ins SPR:$Sdin, SPR:$Sn, SPR:$Sm),
@@ -1742,7 +1812,8 @@ def VMLSS : ASbIn<0b11100, 0b00, 1, 0,
[(set SPR:$Sd, (fadd_mlx (fneg (fmul_su SPR:$Sn, SPR:$Sm)),
SPR:$Sdin))]>,
RegConstraint<"$Sdin = $Sd">,
- Requires<[HasVFP2,DontUseNEONForFP,UseFPVMLx,DontUseFusedMAC]> {
+ Requires<[HasVFP2,DontUseNEONForFP,UseFPVMLx,DontUseFusedMAC]>,
+ Sched<[WriteFPMAC32, ReadFPMAC, ReadFPMUL, ReadFPMUL]> {
// Some single precision VFP instructions may be executed on both NEON and
// VFP pipelines on A8.
let D = VFPNeonA8Domain;
@@ -1768,7 +1839,8 @@ def VNMLAD : ADbI<0b11100, 0b01, 1, 0,
[(set DPR:$Dd,(fsub_mlx (fneg (fmul_su DPR:$Dn,DPR:$Dm)),
(f64 DPR:$Ddin)))]>,
RegConstraint<"$Ddin = $Dd">,
- Requires<[HasVFP2,HasDPVFP,UseFPVMLx,DontUseFusedMAC]>;
+ Requires<[HasVFP2,HasDPVFP,UseFPVMLx,DontUseFusedMAC]>,
+ Sched<[WriteFPMAC64, ReadFPMAC, ReadFPMUL, ReadFPMUL]>;
def VNMLAS : ASbI<0b11100, 0b01, 1, 0,
(outs SPR:$Sd), (ins SPR:$Sdin, SPR:$Sn, SPR:$Sm),
@@ -1776,7 +1848,8 @@ def VNMLAS : ASbI<0b11100, 0b01, 1, 0,
[(set SPR:$Sd, (fsub_mlx (fneg (fmul_su SPR:$Sn, SPR:$Sm)),
SPR:$Sdin))]>,
RegConstraint<"$Sdin = $Sd">,
- Requires<[HasVFP2,DontUseNEONForFP,UseFPVMLx,DontUseFusedMAC]> {
+ Requires<[HasVFP2,DontUseNEONForFP,UseFPVMLx,DontUseFusedMAC]>,
+ Sched<[WriteFPMAC32, ReadFPMAC, ReadFPMUL, ReadFPMUL]> {
// Some single precision VFP instructions may be executed on both NEON and
// VFP pipelines on A8.
let D = VFPNeonA8Domain;
@@ -1802,14 +1875,16 @@ def VNMLSD : ADbI<0b11100, 0b01, 0, 0,
[(set DPR:$Dd, (fsub_mlx (fmul_su DPR:$Dn, DPR:$Dm),
(f64 DPR:$Ddin)))]>,
RegConstraint<"$Ddin = $Dd">,
- Requires<[HasVFP2,HasDPVFP,UseFPVMLx,DontUseFusedMAC]>;
+ Requires<[HasVFP2,HasDPVFP,UseFPVMLx,DontUseFusedMAC]>,
+ Sched<[WriteFPMAC64, ReadFPMAC, ReadFPMUL, ReadFPMUL]>;
def VNMLSS : ASbI<0b11100, 0b01, 0, 0,
(outs SPR:$Sd), (ins SPR:$Sdin, SPR:$Sn, SPR:$Sm),
IIC_fpMAC32, "vnmls", ".f32\t$Sd, $Sn, $Sm",
[(set SPR:$Sd, (fsub_mlx (fmul_su SPR:$Sn, SPR:$Sm), SPR:$Sdin))]>,
RegConstraint<"$Sdin = $Sd">,
- Requires<[HasVFP2,DontUseNEONForFP,UseFPVMLx,DontUseFusedMAC]> {
+ Requires<[HasVFP2,DontUseNEONForFP,UseFPVMLx,DontUseFusedMAC]>,
+ Sched<[WriteFPMAC32, ReadFPMAC, ReadFPMUL, ReadFPMUL]> {
// Some single precision VFP instructions may be executed on both NEON and
// VFP pipelines on A8.
let D = VFPNeonA8Domain;
@@ -1838,7 +1913,8 @@ def VFMAD : ADbI<0b11101, 0b10, 0, 0,
[(set DPR:$Dd, (fadd_mlx (fmul_su DPR:$Dn, DPR:$Dm),
(f64 DPR:$Ddin)))]>,
RegConstraint<"$Ddin = $Dd">,
- Requires<[HasVFP4,HasDPVFP,UseFusedMAC]>;
+ Requires<[HasVFP4,HasDPVFP,UseFusedMAC]>,
+ Sched<[WriteFPMAC64, ReadFPMAC, ReadFPMUL, ReadFPMUL]>;
def VFMAS : ASbIn<0b11101, 0b10, 0, 0,
(outs SPR:$Sd), (ins SPR:$Sdin, SPR:$Sn, SPR:$Sm),
@@ -1846,7 +1922,8 @@ def VFMAS : ASbIn<0b11101, 0b10, 0, 0,
[(set SPR:$Sd, (fadd_mlx (fmul_su SPR:$Sn, SPR:$Sm),
SPR:$Sdin))]>,
RegConstraint<"$Sdin = $Sd">,
- Requires<[HasVFP4,DontUseNEONForFP,UseFusedMAC]> {
+ Requires<[HasVFP4,DontUseNEONForFP,UseFusedMAC]>,
+ Sched<[WriteFPMAC32, ReadFPMAC, ReadFPMUL, ReadFPMUL]> {
// Some single precision VFP instructions may be executed on both NEON and
// VFP pipelines.
}
@@ -1856,7 +1933,8 @@ def VFMAH : AHbI<0b11101, 0b10, 0, 0,
IIC_fpFMAC16, "vfma", ".f16\t$Sd, $Sn, $Sm",
[]>,
RegConstraint<"$Sdin = $Sd">,
- Requires<[HasFullFP16,UseFusedMAC]>;
+ Requires<[HasFullFP16,UseFusedMAC]>,
+ Sched<[WriteFPMAC32, ReadFPMAC, ReadFPMUL, ReadFPMUL]>;
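// A minimal sketch (hypothetical unit and latencies, not part of this patch)
// of how a processor model can use the separate MAC read classes to model
// accumulator forwarding:
//
//   def : WriteRes<WriteFPMAC32, [HypoFPMACUnit]> { let Latency = 8; }
//   def : ReadAdvance<ReadFPMAC, 4>; // accumulator is read 4 cycles late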
def : Pat<(fadd_mlx DPR:$dstin, (fmul_su DPR:$a, (f64 DPR:$b))),
(VFMAD DPR:$dstin, DPR:$a, DPR:$b)>,
diff --git a/lib/Target/ARM/ARMInstructionSelector.cpp b/lib/Target/ARM/ARMInstructionSelector.cpp
index 2bdbe4fca3de..8d224d6a70fa 100644
--- a/lib/Target/ARM/ARMInstructionSelector.cpp
+++ b/lib/Target/ARM/ARMInstructionSelector.cpp
@@ -54,9 +54,21 @@ static bool selectCopy(MachineInstr &I, const TargetInstrInfo &TII,
DstSize <= SrcSize)) &&
"Copy with different width?!");
- assert(RegBank->getID() == ARM::GPRRegBankID && "Unsupported reg bank");
+ assert((RegBank->getID() == ARM::GPRRegBankID ||
+ RegBank->getID() == ARM::FPRRegBankID) &&
+ "Unsupported reg bank");
+
const TargetRegisterClass *RC = &ARM::GPRRegClass;
+ if (RegBank->getID() == ARM::FPRRegBankID) {
+ if (DstSize == 32)
+ RC = &ARM::SPRRegClass;
+ else if (DstSize == 64)
+ RC = &ARM::DPRRegClass;
+ else
+ llvm_unreachable("Unsupported destination size");
+ }
+
// No need to constrain SrcReg. It will get constrained when
// we hit another of its uses or its defs.
// Copies do not have constraints.
@@ -68,6 +80,146 @@ static bool selectCopy(MachineInstr &I, const TargetInstrInfo &TII,
return true;
}
+static bool selectFAdd(MachineInstrBuilder &MIB, const ARMBaseInstrInfo &TII,
+ MachineRegisterInfo &MRI) {
+ assert(TII.getSubtarget().hasVFP2() && "Can't select fp add without vfp");
+
+ LLT Ty = MRI.getType(MIB->getOperand(0).getReg());
+ unsigned ValSize = Ty.getSizeInBits();
+
+ if (ValSize == 32) {
+ if (TII.getSubtarget().useNEONForSinglePrecisionFP())
+ return false;
+ MIB->setDesc(TII.get(ARM::VADDS));
+ } else {
+ assert(ValSize == 64 && "Unsupported size for floating point value");
+ if (TII.getSubtarget().isFPOnlySP())
+ return false;
+ MIB->setDesc(TII.get(ARM::VADDD));
+ }
+ MIB.add(predOps(ARMCC::AL));
+
+ return true;
+}
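// For illustration only (a sketch, not part of this patch): on a 32-bit
// value, selectFAdd rewrites the generic instruction in place, roughly
//   %res(s32) = G_FADD %a, %b  -->  %res = VADDS %a, %b, <AL pred ops>
// with the always-execute predicate appended by predOps(ARMCC::AL).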
+
+static bool selectSequence(MachineInstrBuilder &MIB,
+ const ARMBaseInstrInfo &TII,
+ MachineRegisterInfo &MRI,
+ const TargetRegisterInfo &TRI,
+ const RegisterBankInfo &RBI) {
+ assert(TII.getSubtarget().hasVFP2() && "Can't select sequence without VFP");
+
+ // We only support G_SEQUENCE as a way to stick together two scalar GPRs
+ // into one DPR.
+ unsigned VReg0 = MIB->getOperand(0).getReg();
+ (void)VReg0;
+ assert(MRI.getType(VReg0).getSizeInBits() == 64 &&
+ RBI.getRegBank(VReg0, MRI, TRI)->getID() == ARM::FPRRegBankID &&
+ "Unsupported operand for G_SEQUENCE");
+ unsigned VReg1 = MIB->getOperand(1).getReg();
+ (void)VReg1;
+ assert(MRI.getType(VReg1).getSizeInBits() == 32 &&
+ RBI.getRegBank(VReg1, MRI, TRI)->getID() == ARM::GPRRegBankID &&
+ "Unsupported operand for G_SEQUENCE");
+ unsigned VReg2 = MIB->getOperand(3).getReg();
+ (void)VReg2;
+ assert(MRI.getType(VReg2).getSizeInBits() == 32 &&
+ RBI.getRegBank(VReg2, MRI, TRI)->getID() == ARM::GPRRegBankID &&
+ "Unsupported operand for G_SEQUENCE");
+
+ // Remove the operands corresponding to the offsets.
+ MIB->RemoveOperand(4);
+ MIB->RemoveOperand(2);
+
+ MIB->setDesc(TII.get(ARM::VMOVDRR));
+ MIB.add(predOps(ARMCC::AL));
+
+ return true;
+}
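// For illustration only (a sketch, not part of this patch): after the
// operand surgery above, a merge such as
//   %d(s64) = G_SEQUENCE %lo(s32), 0, %hi(s32), 32
// becomes the VFP move that packs two core registers into one D register:
//   %d = VMOVDRR %lo, %hi, <AL pred ops>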
+
+static bool selectExtract(MachineInstrBuilder &MIB, const ARMBaseInstrInfo &TII,
+ MachineRegisterInfo &MRI,
+ const TargetRegisterInfo &TRI,
+ const RegisterBankInfo &RBI) {
+ assert(TII.getSubtarget().hasVFP2() && "Can't select extract without VFP");
+
+ // We only support G_EXTRACT as a way to break up one DPR into two GPRs.
+ unsigned VReg0 = MIB->getOperand(0).getReg();
+ (void)VReg0;
+ assert(MRI.getType(VReg0).getSizeInBits() == 32 &&
+ RBI.getRegBank(VReg0, MRI, TRI)->getID() == ARM::GPRRegBankID &&
+ "Unsupported operand for G_EXTRACT");
+ unsigned VReg1 = MIB->getOperand(1).getReg();
+ (void)VReg1;
+ assert(MRI.getType(VReg1).getSizeInBits() == 64 &&
+ RBI.getRegBank(VReg1, MRI, TRI)->getID() == ARM::FPRRegBankID &&
+ "Unsupported operand for G_EXTRACT");
+ assert(MIB->getOperand(2).getImm() % 32 == 0 &&
+ "Unsupported operand for G_EXTRACT");
+
+ // Rescale the offset operand: turn the bit offset into a lane index.
+ MIB->getOperand(2).setImm(MIB->getOperand(2).getImm() / 32);
+
+ MIB->setDesc(TII.get(ARM::VGETLNi32));
+ MIB.add(predOps(ARMCC::AL));
+
+ return true;
+}
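// For illustration only (a sketch, not part of this patch): dividing the
// bit offset by 32 turns it into a lane index, so
//   %hi(s32) = G_EXTRACT %d(s64), 32
// becomes
//   %hi = VGETLNi32 %d, 1, <AL pred ops>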
+
+/// Select the opcode for simple extensions (that translate to a single SXT/UXT
+/// instruction). Extension operations more complicated than that should not
+/// invoke this. Returns the original opcode if it doesn't know how to select a
+/// better one.
+static unsigned selectSimpleExtOpc(unsigned Opc, unsigned Size) {
+ using namespace TargetOpcode;
+
+ if (Size != 8 && Size != 16)
+ return Opc;
+
+ if (Opc == G_SEXT)
+ return Size == 8 ? ARM::SXTB : ARM::SXTH;
+
+ if (Opc == G_ZEXT)
+ return Size == 8 ? ARM::UXTB : ARM::UXTH;
+
+ return Opc;
+}
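// For illustration (not part of the patch), the mapping this implements:
//   selectSimpleExtOpc(G_SEXT, 8)  -> ARM::SXTB
//   selectSimpleExtOpc(G_ZEXT, 16) -> ARM::UXTH
//   selectSimpleExtOpc(G_ZEXT, 32) -> G_ZEXT (unchanged; callers bail out)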
+
+/// Select the opcode for simple loads and stores. For types smaller than 32
+/// bits, the value will be zero extended. Returns the original opcode if it
+/// doesn't know how to select a better one.
+static unsigned selectLoadStoreOpCode(unsigned Opc, unsigned RegBank,
+ unsigned Size) {
+ bool isStore = Opc == TargetOpcode::G_STORE;
+
+ if (RegBank == ARM::GPRRegBankID) {
+ switch (Size) {
+ case 1:
+ case 8:
+ return isStore ? ARM::STRBi12 : ARM::LDRBi12;
+ case 16:
+ return isStore ? ARM::STRH : ARM::LDRH;
+ case 32:
+ return isStore ? ARM::STRi12 : ARM::LDRi12;
+ default:
+ return Opc;
+ }
+ }
+
+ if (RegBank == ARM::FPRRegBankID) {
+ switch (Size) {
+ case 32:
+ return isStore ? ARM::VSTRS : ARM::VLDRS;
+ case 64:
+ return isStore ? ARM::VSTRD : ARM::VLDRD;
+ default:
+ return Opc;
+ }
+ }
+
+ return Opc;
+}
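// For illustration (not part of the patch): a 32-bit store on the FPR bank
// selects ARM::VSTRS, while an unhandled size echoes the generic opcode
// back, which callers treat as a selection failure:
//   selectLoadStoreOpCode(G_STORE, ARM::FPRRegBankID, 32) -> ARM::VSTRS
//   selectLoadStoreOpCode(G_LOAD, ARM::GPRRegBankID, 64)  -> G_LOAD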
+
bool ARMInstructionSelector::select(MachineInstr &I) const {
assert(I.getParent() && "Instruction should be in a basic block!");
assert(I.getParent()->getParent() && "Instruction should be in a function!");
@@ -84,23 +236,129 @@ bool ARMInstructionSelector::select(MachineInstr &I) const {
}
MachineInstrBuilder MIB{MF, I};
+ bool isSExt = false;
using namespace TargetOpcode;
switch (I.getOpcode()) {
+ case G_SEXT:
+ isSExt = true;
+ LLVM_FALLTHROUGH;
+ case G_ZEXT: {
+ LLT DstTy = MRI.getType(I.getOperand(0).getReg());
+ // FIXME: Smaller destination sizes coming soon!
+ if (DstTy.getSizeInBits() != 32) {
+ DEBUG(dbgs() << "Unsupported destination size for extension");
+ return false;
+ }
+
+ LLT SrcTy = MRI.getType(I.getOperand(1).getReg());
+ unsigned SrcSize = SrcTy.getSizeInBits();
+ switch (SrcSize) {
+ case 1: {
+ // ZExt boils down to & 0x1; for SExt we also subtract that from 0
+ I.setDesc(TII.get(ARM::ANDri));
+ MIB.addImm(1).add(predOps(ARMCC::AL)).add(condCodeOp());
+
+ if (isSExt) {
+ unsigned SExtResult = I.getOperand(0).getReg();
+
+ // Use a new virtual register for the result of the AND
+ unsigned AndResult = MRI.createVirtualRegister(&ARM::GPRRegClass);
+ I.getOperand(0).setReg(AndResult);
+
+ auto InsertBefore = std::next(I.getIterator());
+ auto SubI =
+ BuildMI(MBB, InsertBefore, I.getDebugLoc(), TII.get(ARM::RSBri))
+ .addDef(SExtResult)
+ .addUse(AndResult)
+ .addImm(0)
+ .add(predOps(ARMCC::AL))
+ .add(condCodeOp());
+ if (!constrainSelectedInstRegOperands(*SubI, TII, TRI, RBI))
+ return false;
+ }
+ break;
+ }
+ case 8:
+ case 16: {
+ unsigned NewOpc = selectSimpleExtOpc(I.getOpcode(), SrcSize);
+ if (NewOpc == I.getOpcode())
+ return false;
+ I.setDesc(TII.get(NewOpc));
+ MIB.addImm(0).add(predOps(ARMCC::AL));
+ break;
+ }
+ default:
+ DEBUG(dbgs() << "Unsupported source size for extension");
+ return false;
+ }
+ break;
+ }
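// For illustration only (a sketch, not part of this patch): the s1 path
// above emits two instructions for G_SEXT, roughly
//   %and = ANDri %src, 1, <AL pred ops>, <no CPSR def>  ; keep only bit 0
//   %res = RSBri %and, 0, <AL pred ops>, <no CPSR def>  ; 0 - %and = 0 or -1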
case G_ADD:
+ case G_GEP:
I.setDesc(TII.get(ARM::ADDrr));
- AddDefaultCC(AddDefaultPred(MIB));
+ MIB.add(predOps(ARMCC::AL)).add(condCodeOp());
+ break;
+ case G_FADD:
+ if (!selectFAdd(MIB, TII, MRI))
+ return false;
break;
case G_FRAME_INDEX:
// Add 0 to the given frame index and hope it will eventually be folded into
// the user(s).
I.setDesc(TII.get(ARM::ADDri));
- AddDefaultCC(AddDefaultPred(MIB.addImm(0)));
+ MIB.addImm(0).add(predOps(ARMCC::AL)).add(condCodeOp());
break;
- case G_LOAD:
- I.setDesc(TII.get(ARM::LDRi12));
- AddDefaultPred(MIB.addImm(0));
+ case G_CONSTANT: {
+ unsigned Reg = I.getOperand(0).getReg();
+ if (MRI.getType(Reg).getSizeInBits() != 32)
+ return false;
+
+ assert(RBI.getRegBank(Reg, MRI, TRI)->getID() == ARM::GPRRegBankID &&
+ "Expected constant to live in a GPR");
+ I.setDesc(TII.get(ARM::MOVi));
+ MIB.add(predOps(ARMCC::AL)).add(condCodeOp());
+ break;
+ }
+ case G_STORE:
+ case G_LOAD: {
+ const auto &MemOp = **I.memoperands_begin();
+ if (MemOp.getOrdering() != AtomicOrdering::NotAtomic) {
+ DEBUG(dbgs() << "Atomic load/store not supported yet\n");
+ return false;
+ }
+
+ unsigned Reg = I.getOperand(0).getReg();
+ unsigned RegBank = RBI.getRegBank(Reg, MRI, TRI)->getID();
+
+ LLT ValTy = MRI.getType(Reg);
+ const auto ValSize = ValTy.getSizeInBits();
+
+ assert((ValSize != 64 || TII.getSubtarget().hasVFP2()) &&
+ "Don't know how to load/store 64-bit value without VFP");
+
+ const auto NewOpc = selectLoadStoreOpCode(I.getOpcode(), RegBank, ValSize);
+ if (NewOpc == G_LOAD || NewOpc == G_STORE)
+ return false;
+
+ I.setDesc(TII.get(NewOpc));
+
+ if (NewOpc == ARM::LDRH || NewOpc == ARM::STRH)
+ // LDRH has a funny addressing mode (there's already a FIXME for it).
+ MIB.addReg(0);
+ MIB.addImm(0).add(predOps(ARMCC::AL));
+ break;
+ }
+ case G_SEQUENCE: {
+ if (!selectSequence(MIB, TII, MRI, TRI, RBI))
+ return false;
break;
+ }
+ case G_EXTRACT: {
+ if (!selectExtract(MIB, TII, MRI, TRI, RBI))
+ return false;
+ break;
+ }
default:
return false;
}
diff --git a/lib/Target/ARM/ARMInstructionSelector.h b/lib/Target/ARM/ARMInstructionSelector.h
index 5072cdd60ce4..530141d92c2c 100644
--- a/lib/Target/ARM/ARMInstructionSelector.h
+++ b/lib/Target/ARM/ARMInstructionSelector.h
@@ -1,4 +1,4 @@
-//===- ARMInstructionSelector ------------------------------------*- C++ -*-==//
+//===- ARMInstructionSelector -----------------------------------*- C++ -*-===//
//
// The LLVM Compiler Infrastructure
//
@@ -6,8 +6,10 @@
// License. See LICENSE.TXT for details.
//
//===----------------------------------------------------------------------===//
+//
/// \file
/// This file declares the targeting of the InstructionSelector class for ARM.
+//
//===----------------------------------------------------------------------===//
#ifndef LLVM_LIB_TARGET_ARM_ARMINSTRUCTIONSELECTOR_H
@@ -16,9 +18,9 @@
#include "llvm/CodeGen/GlobalISel/InstructionSelector.h"
namespace llvm {
+
class ARMBaseInstrInfo;
class ARMBaseRegisterInfo;
-class ARMBaseTargetMachine;
class ARMRegisterBankInfo;
class ARMSubtarget;
@@ -27,7 +29,7 @@ public:
ARMInstructionSelector(const ARMSubtarget &STI,
const ARMRegisterBankInfo &RBI);
- virtual bool select(MachineInstr &I) const override;
+ bool select(MachineInstr &I) const override;
private:
const ARMBaseInstrInfo &TII;
@@ -35,5 +37,6 @@ private:
const ARMRegisterBankInfo &RBI;
};
-} // End llvm namespace.
-#endif
+} // end namespace llvm
+
+#endif // LLVM_LIB_TARGET_ARM_ARMINSTRUCTIONSELECTOR_H
diff --git a/lib/Target/ARM/ARMLegalizerInfo.cpp b/lib/Target/ARM/ARMLegalizerInfo.cpp
index 255ea4bc7198..994bbd673dd8 100644
--- a/lib/Target/ARM/ARMLegalizerInfo.cpp
+++ b/lib/Target/ARM/ARMLegalizerInfo.cpp
@@ -12,6 +12,7 @@
//===----------------------------------------------------------------------===//
#include "ARMLegalizerInfo.h"
+#include "ARMSubtarget.h"
#include "llvm/CodeGen/ValueTypes.h"
#include "llvm/IR/DerivedTypes.h"
#include "llvm/IR/Type.h"
@@ -23,22 +24,53 @@ using namespace llvm;
#error "You shouldn't build this"
#endif
-ARMLegalizerInfo::ARMLegalizerInfo() {
+ARMLegalizerInfo::ARMLegalizerInfo(const ARMSubtarget &ST) {
using namespace TargetOpcode;
const LLT p0 = LLT::pointer(0, 32);
+ const LLT s1 = LLT::scalar(1);
const LLT s8 = LLT::scalar(8);
const LLT s16 = LLT::scalar(16);
const LLT s32 = LLT::scalar(32);
+ const LLT s64 = LLT::scalar(64);
setAction({G_FRAME_INDEX, p0}, Legal);
- setAction({G_LOAD, s32}, Legal);
- setAction({G_LOAD, 1, p0}, Legal);
+ for (unsigned Op : {G_LOAD, G_STORE}) {
+ for (auto Ty : {s1, s8, s16, s32, p0})
+ setAction({Op, Ty}, Legal);
+ setAction({Op, 1, p0}, Legal);
+ }
- for (auto Ty : {s8, s16, s32})
+ for (auto Ty : {s1, s8, s16, s32})
setAction({G_ADD, Ty}, Legal);
+ for (unsigned Op : {G_SEXT, G_ZEXT}) {
+ setAction({Op, s32}, Legal);
+ for (auto Ty : {s1, s8, s16})
+ setAction({Op, 1, Ty}, Legal);
+ }
+
+ setAction({G_GEP, p0}, Legal);
+ setAction({G_GEP, 1, s32}, Legal);
+
+ setAction({G_CONSTANT, s32}, Legal);
+
+ if (!ST.useSoftFloat() && ST.hasVFP2()) {
+ setAction({G_FADD, s32}, Legal);
+ setAction({G_FADD, s64}, Legal);
+
+ setAction({G_LOAD, s64}, Legal);
+ setAction({G_STORE, s64}, Legal);
+ } else {
+ for (auto Ty : {s32, s64})
+ setAction({G_FADD, Ty}, Libcall);
+ }
+
+ for (unsigned Op : {G_FREM, G_FPOW})
+ for (auto Ty : {s32, s64})
+ setAction({Op, Ty}, Libcall);
+
computeTables();
}
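// For illustration (not part of the patch): with the table above, a 64-bit
// G_FADD on a soft-float target is marked Libcall, so the legalizer will
// rewrite it into a call to a runtime routine (e.g. __adddf3 or the AEABI
// equivalent); on a hard-float VFP2 target the same operation stays Legal
// and flows through to the instruction selector.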
diff --git a/lib/Target/ARM/ARMLegalizerInfo.h b/lib/Target/ARM/ARMLegalizerInfo.h
index ca3eea81271b..0b8a608a6bde 100644
--- a/lib/Target/ARM/ARMLegalizerInfo.h
+++ b/lib/Target/ARM/ARMLegalizerInfo.h
@@ -18,12 +18,12 @@
namespace llvm {
-class LLVMContext;
+class ARMSubtarget;
/// This class provides the information for the target register banks.
class ARMLegalizerInfo : public LegalizerInfo {
public:
- ARMLegalizerInfo();
+ ARMLegalizerInfo(const ARMSubtarget &ST);
};
} // End llvm namespace.
#endif
diff --git a/lib/Target/ARM/ARMLoadStoreOptimizer.cpp b/lib/Target/ARM/ARMLoadStoreOptimizer.cpp
index 48ab491b5be9..72fcf7cd6a4f 100644
--- a/lib/Target/ARM/ARMLoadStoreOptimizer.cpp
+++ b/lib/Target/ARM/ARMLoadStoreOptimizer.cpp
@@ -517,8 +517,12 @@ void ARMLoadStoreOpt::UpdateBaseRegUses(MachineBasicBlock &MBB,
if (InsertSub) {
// An instruction above couldn't be updated, so insert a sub.
- AddDefaultT1CC(BuildMI(MBB, MBBI, DL, TII->get(ARM::tSUBi8), Base), true)
- .addReg(Base).addImm(WordOffset * 4).addImm(Pred).addReg(PredReg);
+ BuildMI(MBB, MBBI, DL, TII->get(ARM::tSUBi8), Base)
+ .add(t1CondCodeOp(true))
+ .addReg(Base)
+ .addImm(WordOffset * 4)
+ .addImm(Pred)
+ .addReg(PredReg);
return;
}
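// For illustration (hypothetical operands, not part of this patch): the new
// fluent style spells out each operand in emission order, the Thumb1 CPSR
// def first and the predicate last:
//   BuildMI(MBB, MBBI, DL, TII->get(ARM::tSUBi8), DestReg)
//       .add(t1CondCodeOp(/*isDead=*/true))
//       .addReg(SrcReg)
//       .addImm(Imm)
//       .add(predOps(Pred, PredReg));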
@@ -534,9 +538,12 @@ void ARMLoadStoreOpt::UpdateBaseRegUses(MachineBasicBlock &MBB,
// information and *always* have to reset at the end of a block.
// See PR21029.
if (MBBI != MBB.end()) --MBBI;
- AddDefaultT1CC(
- BuildMI(MBB, MBBI, DL, TII->get(ARM::tSUBi8), Base), true)
- .addReg(Base).addImm(WordOffset * 4).addImm(Pred).addReg(PredReg);
+ BuildMI(MBB, MBBI, DL, TII->get(ARM::tSUBi8), Base)
+ .add(t1CondCodeOp(true))
+ .addReg(Base)
+ .addImm(WordOffset * 4)
+ .addImm(Pred)
+ .addReg(PredReg);
}
}
@@ -602,13 +609,12 @@ MachineInstr *ARMLoadStoreOpt::CreateLoadStoreMulti(
// Exception: If the base register is in the input reglist, Thumb1 LDM is
// non-writeback.
// It's also not possible to merge an STR of the base register in Thumb1.
- if (isThumb1 && isi32Load(Opcode) && ContainsReg(Regs, Base)) {
+ if (isThumb1 && ContainsReg(Regs, Base)) {
assert(Base != ARM::SP && "Thumb1 does not allow SP in register list");
- if (Opcode == ARM::tLDRi) {
+ if (Opcode == ARM::tLDRi)
Writeback = false;
- } else if (Opcode == ARM::tSTRi) {
+ else if (Opcode == ARM::tSTRi)
return nullptr;
- }
}
ARM_AM::AMSubMode Mode = ARM_AM::ia;
@@ -700,8 +706,8 @@ MachineInstr *ARMLoadStoreOpt::CreateLoadStoreMulti(
.addReg(Base, getKillRegState(KillOldBase));
} else
BuildMI(MBB, InsertBefore, DL, TII->get(ARM::tMOVr), NewBase)
- .addReg(Base, getKillRegState(KillOldBase))
- .addImm(Pred).addReg(PredReg);
+ .addReg(Base, getKillRegState(KillOldBase))
+ .add(predOps(Pred, PredReg));
// The following ADDS/SUBS becomes an update.
Base = NewBase;
@@ -710,17 +716,21 @@ MachineInstr *ARMLoadStoreOpt::CreateLoadStoreMulti(
if (BaseOpc == ARM::tADDrSPi) {
assert(Offset % 4 == 0 && "tADDrSPi offset is scaled by 4");
BuildMI(MBB, InsertBefore, DL, TII->get(BaseOpc), NewBase)
- .addReg(Base, getKillRegState(KillOldBase)).addImm(Offset/4)
- .addImm(Pred).addReg(PredReg);
+ .addReg(Base, getKillRegState(KillOldBase))
+ .addImm(Offset / 4)
+ .add(predOps(Pred, PredReg));
} else
- AddDefaultT1CC(
- BuildMI(MBB, InsertBefore, DL, TII->get(BaseOpc), NewBase), true)
- .addReg(Base, getKillRegState(KillOldBase)).addImm(Offset)
- .addImm(Pred).addReg(PredReg);
+ BuildMI(MBB, InsertBefore, DL, TII->get(BaseOpc), NewBase)
+ .add(t1CondCodeOp(true))
+ .addReg(Base, getKillRegState(KillOldBase))
+ .addImm(Offset)
+ .add(predOps(Pred, PredReg));
} else {
BuildMI(MBB, InsertBefore, DL, TII->get(BaseOpc), NewBase)
- .addReg(Base, getKillRegState(KillOldBase)).addImm(Offset)
- .addImm(Pred).addReg(PredReg).addReg(0);
+ .addReg(Base, getKillRegState(KillOldBase))
+ .addImm(Offset)
+ .add(predOps(Pred, PredReg))
+ .add(condCodeOp());
}
Base = NewBase;
BaseKill = true; // New base is always killed straight away.
@@ -1259,7 +1269,7 @@ bool ARMLoadStoreOpt::MergeBaseUpdateLSMultiple(MachineInstr *MI) {
// Transfer the rest of operands.
for (unsigned OpNum = 3, e = MI->getNumOperands(); OpNum != e; ++OpNum)
- MIB.addOperand(MI->getOperand(OpNum));
+ MIB.add(MI->getOperand(OpNum));
// Transfer memoperands.
MIB->setMemRefs(MI->memoperands_begin(), MI->memoperands_end());
@@ -1392,14 +1402,19 @@ bool ARMLoadStoreOpt::MergeBaseUpdateLoadStore(MachineInstr *MI) {
} else {
int Imm = ARM_AM::getAM2Opc(AddSub, Bytes, ARM_AM::no_shift);
BuildMI(MBB, MBBI, DL, TII->get(NewOpc), MI->getOperand(0).getReg())
- .addReg(Base, RegState::Define)
- .addReg(Base).addReg(0).addImm(Imm).addImm(Pred).addReg(PredReg);
+ .addReg(Base, RegState::Define)
+ .addReg(Base)
+ .addReg(0)
+ .addImm(Imm)
+ .add(predOps(Pred, PredReg));
}
} else {
// t2LDR_PRE, t2LDR_POST
BuildMI(MBB, MBBI, DL, TII->get(NewOpc), MI->getOperand(0).getReg())
- .addReg(Base, RegState::Define)
- .addReg(Base).addImm(Offset).addImm(Pred).addReg(PredReg);
+ .addReg(Base, RegState::Define)
+ .addReg(Base)
+ .addImm(Offset)
+ .add(predOps(Pred, PredReg));
}
} else {
MachineOperand &MO = MI->getOperand(0);
@@ -1410,13 +1425,18 @@ bool ARMLoadStoreOpt::MergeBaseUpdateLoadStore(MachineInstr *MI) {
int Imm = ARM_AM::getAM2Opc(AddSub, Bytes, ARM_AM::no_shift);
// STR_PRE, STR_POST
BuildMI(MBB, MBBI, DL, TII->get(NewOpc), Base)
- .addReg(MO.getReg(), getKillRegState(MO.isKill()))
- .addReg(Base).addReg(0).addImm(Imm).addImm(Pred).addReg(PredReg);
+ .addReg(MO.getReg(), getKillRegState(MO.isKill()))
+ .addReg(Base)
+ .addReg(0)
+ .addImm(Imm)
+ .add(predOps(Pred, PredReg));
} else {
// t2STR_PRE, t2STR_POST
BuildMI(MBB, MBBI, DL, TII->get(NewOpc), Base)
- .addReg(MO.getReg(), getKillRegState(MO.isKill()))
- .addReg(Base).addImm(Offset).addImm(Pred).addReg(PredReg);
+ .addReg(MO.getReg(), getKillRegState(MO.isKill()))
+ .addReg(Base)
+ .addImm(Offset)
+ .add(predOps(Pred, PredReg));
}
}
MBB.erase(MBBI);
@@ -1462,12 +1482,10 @@ bool ARMLoadStoreOpt::MergeBaseUpdateLSDouble(MachineInstr &MI) const {
DebugLoc DL = MI.getDebugLoc();
MachineInstrBuilder MIB = BuildMI(MBB, MBBI, DL, TII->get(NewOpc));
if (NewOpc == ARM::t2LDRD_PRE || NewOpc == ARM::t2LDRD_POST) {
- MIB.addOperand(Reg0Op).addOperand(Reg1Op)
- .addReg(BaseOp.getReg(), RegState::Define);
+ MIB.add(Reg0Op).add(Reg1Op).addReg(BaseOp.getReg(), RegState::Define);
} else {
assert(NewOpc == ARM::t2STRD_PRE || NewOpc == ARM::t2STRD_POST);
- MIB.addReg(BaseOp.getReg(), RegState::Define)
- .addOperand(Reg0Op).addOperand(Reg1Op);
+ MIB.addReg(BaseOp.getReg(), RegState::Define).add(Reg0Op).add(Reg1Op);
}
MIB.addReg(BaseOp.getReg(), RegState::Kill)
.addImm(Offset).addImm(Pred).addReg(PredReg);
@@ -1477,7 +1495,7 @@ bool ARMLoadStoreOpt::MergeBaseUpdateLSDouble(MachineInstr &MI) const {
// Transfer implicit operands.
for (const MachineOperand &MO : MI.implicit_operands())
- MIB.addOperand(MO);
+ MIB.add(MO);
MIB->setMemRefs(MI.memoperands_begin(), MI.memoperands_end());
MBB.erase(MBBI);
@@ -1891,8 +1909,9 @@ bool ARMLoadStoreOpt::CombineMovBx(MachineBasicBlock &MBB) {
for (auto Use : Prev->uses())
if (Use.isKill()) {
- AddDefaultPred(BuildMI(MBB, MBBI, MBBI->getDebugLoc(), TII->get(ARM::tBX))
- .addReg(Use.getReg(), RegState::Kill))
+ BuildMI(MBB, MBBI, MBBI->getDebugLoc(), TII->get(ARM::tBX))
+ .addReg(Use.getReg(), RegState::Kill)
+ .add(predOps(ARMCC::AL))
.copyImplicitOps(*MBBI);
MBB.erase(MBBI);
MBB.erase(Prev);
@@ -1942,6 +1961,7 @@ namespace {
static char ID;
ARMPreAllocLoadStoreOpt() : MachineFunctionPass(ID) {}
+ AliasAnalysis *AA;
const DataLayout *TD;
const TargetInstrInfo *TII;
const TargetRegisterInfo *TRI;
@@ -1955,6 +1975,11 @@ namespace {
return ARM_PREALLOC_LOAD_STORE_OPT_NAME;
}
+ void getAnalysisUsage(AnalysisUsage &AU) const override {
+ AU.addRequired<AAResultsWrapperPass>();
+ MachineFunctionPass::getAnalysisUsage(AU);
+ }
+
private:
bool CanFormLdStDWord(MachineInstr *Op0, MachineInstr *Op1, DebugLoc &dl,
unsigned &NewOpc, unsigned &EvenReg,
@@ -1984,6 +2009,7 @@ bool ARMPreAllocLoadStoreOpt::runOnMachineFunction(MachineFunction &Fn) {
TRI = STI->getRegisterInfo();
MRI = &Fn.getRegInfo();
MF = &Fn;
+ AA = &getAnalysis<AAResultsWrapperPass>().getAAResults();
bool Modified = false;
for (MachineBasicBlock &MFI : Fn)
@@ -1997,28 +2023,19 @@ static bool IsSafeAndProfitableToMove(bool isLd, unsigned Base,
MachineBasicBlock::iterator E,
SmallPtrSetImpl<MachineInstr*> &MemOps,
SmallSet<unsigned, 4> &MemRegs,
- const TargetRegisterInfo *TRI) {
+ const TargetRegisterInfo *TRI,
+ AliasAnalysis *AA) {
// Are there stores / loads / calls between them?
- // FIXME: This is overly conservative. We should make use of alias information
- // some day.
SmallSet<unsigned, 4> AddedRegPressure;
while (++I != E) {
if (I->isDebugValue() || MemOps.count(&*I))
continue;
if (I->isCall() || I->isTerminator() || I->hasUnmodeledSideEffects())
return false;
- if (isLd && I->mayStore())
- return false;
- if (!isLd) {
- if (I->mayLoad())
- return false;
- // It's not safe to move the first 'str' down.
- // str r1, [r0]
- // strh r5, [r0]
- // str r4, [r0, #+4]
- if (I->mayStore())
- return false;
- }
+ if (I->mayStore() || (!isLd && I->mayLoad()))
+ for (MachineInstr *MemOp : MemOps)
+ if (I->mayAlias(AA, *MemOp, /*UseTBAA*/ false))
+ return false;
for (unsigned j = 0, NumOps = I->getNumOperands(); j != NumOps; ++j) {
MachineOperand &MO = I->getOperand(j);
if (!MO.isReg())
@@ -2142,33 +2159,40 @@ bool ARMPreAllocLoadStoreOpt::RescheduleOps(MachineBasicBlock *MBB,
unsigned LastBytes = 0;
unsigned NumMove = 0;
for (int i = Ops.size() - 1; i >= 0; --i) {
+ // Make sure all the operations are of the same kind.
MachineInstr *Op = Ops[i];
- unsigned Loc = MI2LocMap[Op];
- if (Loc <= FirstLoc) {
- FirstLoc = Loc;
- FirstOp = Op;
- }
- if (Loc >= LastLoc) {
- LastLoc = Loc;
- LastOp = Op;
- }
-
unsigned LSMOpcode
= getLoadStoreMultipleOpcode(Op->getOpcode(), ARM_AM::ia);
if (LastOpcode && LSMOpcode != LastOpcode)
break;
+ // Check that the offsets form a contiguous sequence.
int Offset = getMemoryOpOffset(*Op);
unsigned Bytes = getLSMultipleTransferSize(Op);
if (LastBytes) {
if (Bytes != LastBytes || Offset != (LastOffset + (int)Bytes))
break;
}
+
+ // Don't try to reschedule too many instructions.
+ if (NumMove == 8) // FIXME: Tune this limit.
+ break;
+
+ // Found a mergeable instruction; save information about it.
+ ++NumMove;
LastOffset = Offset;
LastBytes = Bytes;
LastOpcode = LSMOpcode;
- if (++NumMove == 8) // FIXME: Tune this limit.
- break;
+
+ unsigned Loc = MI2LocMap[Op];
+ if (Loc <= FirstLoc) {
+ FirstLoc = Loc;
+ FirstOp = Op;
+ }
+ if (Loc >= LastLoc) {
+ LastLoc = Loc;
+ LastOp = Op;
+ }
}
if (NumMove <= 1)
@@ -2176,7 +2200,7 @@ bool ARMPreAllocLoadStoreOpt::RescheduleOps(MachineBasicBlock *MBB,
else {
SmallPtrSet<MachineInstr*, 4> MemOps;
SmallSet<unsigned, 4> MemRegs;
- for (int i = NumMove-1; i >= 0; --i) {
+ for (size_t i = Ops.size() - NumMove, e = Ops.size(); i != e; ++i) {
MemOps.insert(Ops[i]);
MemRegs.insert(Ops[i]->getOperand(0).getReg());
}
@@ -2186,7 +2210,7 @@ bool ARMPreAllocLoadStoreOpt::RescheduleOps(MachineBasicBlock *MBB,
bool DoMove = (LastLoc - FirstLoc) <= NumMove*4; // FIXME: Tune this.
if (DoMove)
DoMove = IsSafeAndProfitableToMove(isLd, Base, FirstOp, LastOp,
- MemOps, MemRegs, TRI);
+ MemOps, MemRegs, TRI, AA);
if (!DoMove) {
for (unsigned i = 0; i != NumMove; ++i)
Ops.pop_back();
diff --git a/lib/Target/ARM/ARMMCInstLower.cpp b/lib/Target/ARM/ARMMCInstLower.cpp
index 07044b9697b6..0fd98268723a 100644
--- a/lib/Target/ARM/ARMMCInstLower.cpp
+++ b/lib/Target/ARM/ARMMCInstLower.cpp
@@ -14,23 +14,36 @@
#include "ARM.h"
#include "ARMAsmPrinter.h"
+#include "ARMBaseInstrInfo.h"
+#include "ARMMachineFunctionInfo.h"
+#include "ARMSubtarget.h"
+#include "MCTargetDesc/ARMAddressingModes.h"
#include "MCTargetDesc/ARMBaseInfo.h"
#include "MCTargetDesc/ARMMCExpr.h"
+#include "llvm/ADT/APFloat.h"
#include "llvm/CodeGen/MachineBasicBlock.h"
+#include "llvm/CodeGen/MachineInstr.h"
+#include "llvm/CodeGen/MachineOperand.h"
#include "llvm/IR/Constants.h"
-#include "llvm/IR/Mangler.h"
#include "llvm/MC/MCExpr.h"
#include "llvm/MC/MCInst.h"
#include "llvm/MC/MCContext.h"
#include "llvm/MC/MCInstBuilder.h"
#include "llvm/MC/MCStreamer.h"
-using namespace llvm;
+#include "llvm/Support/ErrorHandling.h"
+#include <cassert>
+#include <cstdint>
+using namespace llvm;
MCOperand ARMAsmPrinter::GetSymbolRef(const MachineOperand &MO,
const MCSymbol *Symbol) {
+ MCSymbolRefExpr::VariantKind SymbolVariant = MCSymbolRefExpr::VK_None;
+ if (MO.getTargetFlags() & ARMII::MO_SBREL)
+ SymbolVariant = MCSymbolRefExpr::VK_ARM_SBREL;
+
const MCExpr *Expr =
- MCSymbolRefExpr::create(Symbol, MCSymbolRefExpr::VK_None, OutContext);
+ MCSymbolRefExpr::create(Symbol, SymbolVariant, OutContext);
switch (MO.getTargetFlags() & ARMII::MO_OPTION_MASK) {
default:
llvm_unreachable("Unknown target flag on symbol operand");
@@ -38,12 +51,12 @@ MCOperand ARMAsmPrinter::GetSymbolRef(const MachineOperand &MO,
break;
case ARMII::MO_LO16:
Expr =
- MCSymbolRefExpr::create(Symbol, MCSymbolRefExpr::VK_None, OutContext);
+ MCSymbolRefExpr::create(Symbol, SymbolVariant, OutContext);
Expr = ARMMCExpr::createLower16(Expr, OutContext);
break;
case ARMII::MO_HI16:
Expr =
- MCSymbolRefExpr::create(Symbol, MCSymbolRefExpr::VK_None, OutContext);
+ MCSymbolRefExpr::create(Symbol, SymbolVariant, OutContext);
Expr = ARMMCExpr::createUpper16(Expr, OutContext);
break;
}
@@ -75,11 +88,10 @@ bool ARMAsmPrinter::lowerOperand(const MachineOperand &MO,
MCOp = MCOperand::createExpr(MCSymbolRefExpr::create(
MO.getMBB()->getSymbol(), OutContext));
break;
- case MachineOperand::MO_GlobalAddress: {
+ case MachineOperand::MO_GlobalAddress:
MCOp = GetSymbolRef(MO,
GetARMGVSymbol(MO.getGlobal(), MO.getTargetFlags()));
break;
- }
case MachineOperand::MO_ExternalSymbol:
MCOp = GetSymbolRef(MO,
GetExternalSymbolSymbol(MO.getSymbolName()));
diff --git a/lib/Target/ARM/ARMMachineFunctionInfo.cpp b/lib/Target/ARM/ARMMachineFunctionInfo.cpp
index 50d8f0941460..e25d36b57616 100644
--- a/lib/Target/ARM/ARMMachineFunctionInfo.cpp
+++ b/lib/Target/ARM/ARMMachineFunctionInfo.cpp
@@ -8,6 +8,7 @@
//===----------------------------------------------------------------------===//
#include "ARMMachineFunctionInfo.h"
+#include "ARMSubtarget.h"
using namespace llvm;
@@ -15,10 +16,4 @@ void ARMFunctionInfo::anchor() {}
ARMFunctionInfo::ARMFunctionInfo(MachineFunction &MF)
: isThumb(MF.getSubtarget<ARMSubtarget>().isThumb()),
- hasThumb2(MF.getSubtarget<ARMSubtarget>().hasThumb2()),
- StByValParamsPadding(0), ArgRegsSaveSize(0), ReturnRegsCount(0),
- HasStackFrame(false), RestoreSPFromFP(false), LRSpilledForFarJump(false),
- FramePtrSpillOffset(0), GPRCS1Offset(0), GPRCS2Offset(0), DPRCSOffset(0),
- GPRCS1Size(0), GPRCS2Size(0), DPRCSSize(0), PICLabelUId(0),
- VarArgsFrameIndex(0), HasITBlocks(false), ArgumentStackSize(0),
- IsSplitCSR(false), PromotedGlobalsIncrease(0) {}
+ hasThumb2(MF.getSubtarget<ARMSubtarget>().hasThumb2()) {}
diff --git a/lib/Target/ARM/ARMMachineFunctionInfo.h b/lib/Target/ARM/ARMMachineFunctionInfo.h
index 8c485e89bf54..816116772995 100644
--- a/lib/Target/ARM/ARMMachineFunctionInfo.h
+++ b/lib/Target/ARM/ARMMachineFunctionInfo.h
@@ -14,11 +14,11 @@
#ifndef LLVM_LIB_TARGET_ARM_ARMMACHINEFUNCTIONINFO_H
#define LLVM_LIB_TARGET_ARM_ARMMACHINEFUNCTIONINFO_H
-#include "ARMSubtarget.h"
#include "llvm/ADT/DenseMap.h"
+#include "llvm/ADT/SmallPtrSet.h"
#include "llvm/CodeGen/MachineFunction.h"
-#include "llvm/Target/TargetMachine.h"
-#include "llvm/Target/TargetRegisterInfo.h"
+#include "llvm/Support/ErrorHandling.h"
+#include <utility>
namespace llvm {
@@ -29,42 +29,42 @@ class ARMFunctionInfo : public MachineFunctionInfo {
/// isThumb - True if this function is compiled under Thumb mode.
  /// Used to initialize Align, so it must precede it.
- bool isThumb;
+ bool isThumb = false;
  /// hasThumb2 - True if the target architecture supports Thumb2. Do not
  /// use this to determine whether the function is compiled under Thumb
  /// mode; for that, use 'isThumb'.
- bool hasThumb2;
+ bool hasThumb2 = false;
  /// StByValParamsPadding - For a parameter that is split between
  /// GPRs and memory: when recovering the GPR part, if StackAlignment > 4
  /// and the size of the GPR part mod StackAlignment != 0, we need to
  /// insert a gap before the parameter's start address. This allows the
  /// GPR part to be "attached" to the part that was passed via the stack.
- unsigned StByValParamsPadding;
+ unsigned StByValParamsPadding = 0;
  /// ArgRegsSaveSize - Size of the register save area for vararg functions.
///
- unsigned ArgRegsSaveSize;
+ unsigned ArgRegsSaveSize = 0;
/// ReturnRegsCount - Number of registers used up in the return.
- unsigned ReturnRegsCount;
+ unsigned ReturnRegsCount = 0;
/// HasStackFrame - True if this function has a stack frame. Set by
/// determineCalleeSaves().
- bool HasStackFrame;
+ bool HasStackFrame = false;
/// RestoreSPFromFP - True if epilogue should restore SP from FP. Set by
/// emitPrologue.
- bool RestoreSPFromFP;
+ bool RestoreSPFromFP = false;
  /// LRSpilledForFarJump - True if the LR register has been spilled to
  /// enable a far jump.
- bool LRSpilledForFarJump;
+ bool LRSpilledForFarJump = false;
/// FramePtrSpillOffset - If HasStackFrame, this records the frame pointer
/// spill stack offset.
- unsigned FramePtrSpillOffset;
+ unsigned FramePtrSpillOffset = 0;
/// GPRCS1Offset, GPRCS2Offset, DPRCSOffset - Starting offset of callee saved
/// register spills areas. For Mac OS X:
@@ -77,16 +77,16 @@ class ARMFunctionInfo : public MachineFunctionInfo {
///
/// Also see AlignedDPRCSRegs below. Not all D-regs need to go in area 3.
/// Some may be spilled after the stack has been realigned.
- unsigned GPRCS1Offset;
- unsigned GPRCS2Offset;
- unsigned DPRCSOffset;
+ unsigned GPRCS1Offset = 0;
+ unsigned GPRCS2Offset = 0;
+ unsigned DPRCSOffset = 0;
/// GPRCS1Size, GPRCS2Size, DPRCSSize - Sizes of callee saved register spills
/// areas.
- unsigned GPRCS1Size;
- unsigned GPRCS2Size;
- unsigned DPRCSAlignGapSize;
- unsigned DPRCSSize;
+ unsigned GPRCS1Size = 0;
+ unsigned GPRCS2Size = 0;
+ unsigned DPRCSAlignGapSize = 0;
+ unsigned DPRCSSize = 0;
/// NumAlignedDPRCS2Regs - The number of callee-saved DPRs that are saved in
/// the aligned portion of the stack frame. This is always a contiguous
@@ -95,15 +95,15 @@ class ARMFunctionInfo : public MachineFunctionInfo {
/// We do not keep track of the frame indices used for these registers - they
/// behave like any other frame index in the aligned stack frame. These
/// registers also aren't included in DPRCSSize above.
- unsigned NumAlignedDPRCS2Regs;
+ unsigned NumAlignedDPRCS2Regs = 0;
- unsigned PICLabelUId;
+ unsigned PICLabelUId = 0;
/// VarArgsFrameIndex - FrameIndex for start of varargs area.
- int VarArgsFrameIndex;
+ int VarArgsFrameIndex = 0;
/// HasITBlocks - True if IT blocks have been inserted.
- bool HasITBlocks;
+ bool HasITBlocks = false;
  /// CPEClones - Track constant pool entry clones created by the Constant
  /// Island pass.
@@ -111,7 +111,7 @@ class ARMFunctionInfo : public MachineFunctionInfo {
  /// ArgumentStackSize - the number of bytes on the stack consumed by the
  /// arguments being passed on the stack.
- unsigned ArgumentStackSize;
+ unsigned ArgumentStackSize = 0;
/// CoalescedWeights - mapping of basic blocks to the rolling counter of
/// coalesced weights.
@@ -119,26 +119,16 @@ class ARMFunctionInfo : public MachineFunctionInfo {
/// True if this function has a subset of CSRs that is handled explicitly via
/// copies.
- bool IsSplitCSR;
+ bool IsSplitCSR = false;
/// Globals that have had their storage promoted into the constant pool.
SmallPtrSet<const GlobalVariable*,2> PromotedGlobals;
  /// The amount by which the literal pool has been increased due to promoted globals.
- int PromotedGlobalsIncrease;
+ int PromotedGlobalsIncrease = 0;
public:
- ARMFunctionInfo() :
- isThumb(false),
- hasThumb2(false),
- ArgRegsSaveSize(0), ReturnRegsCount(0), HasStackFrame(false),
- RestoreSPFromFP(false),
- LRSpilledForFarJump(false),
- FramePtrSpillOffset(0), GPRCS1Offset(0), GPRCS2Offset(0), DPRCSOffset(0),
- GPRCS1Size(0), GPRCS2Size(0), DPRCSAlignGapSize(0), DPRCSSize(0),
- NumAlignedDPRCS2Regs(0), PICLabelUId(0),
- VarArgsFrameIndex(0), HasITBlocks(false), IsSplitCSR(false),
- PromotedGlobalsIncrease(0) {}
+ ARMFunctionInfo() = default;
explicit ARMFunctionInfo(MachineFunction &MF);
@@ -250,6 +240,7 @@ public:
PromotedGlobalsIncrease = Sz;
}
};
-} // End llvm namespace
-#endif
+} // end namespace llvm
+
+#endif // LLVM_LIB_TARGET_ARM_ARMMACHINEFUNCTIONINFO_H
diff --git a/lib/Target/ARM/ARMRegisterBankInfo.cpp b/lib/Target/ARM/ARMRegisterBankInfo.cpp
index 324087d670b5..08f3da738868 100644
--- a/lib/Target/ARM/ARMRegisterBankInfo.cpp
+++ b/lib/Target/ARM/ARMRegisterBankInfo.cpp
@@ -13,11 +13,15 @@
#include "ARMRegisterBankInfo.h"
#include "ARMInstrInfo.h" // For the register classes
+#include "ARMSubtarget.h"
#include "llvm/CodeGen/GlobalISel/RegisterBank.h"
#include "llvm/CodeGen/GlobalISel/RegisterBankInfo.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
#include "llvm/Target/TargetRegisterInfo.h"
+#define GET_TARGET_REGBANK_IMPL
+#include "ARMGenRegisterBank.inc"
+
using namespace llvm;
#ifndef LLVM_BUILD_GLOBAL_ISEL
@@ -29,44 +33,109 @@ using namespace llvm;
// into an ARMGenRegisterBankInfo.def (similar to AArch64).
namespace llvm {
namespace ARM {
-const uint32_t GPRCoverageData[] = {
- // Classes 0-31
- (1u << ARM::GPRRegClassID) | (1u << ARM::GPRwithAPSRRegClassID) |
- (1u << ARM::GPRnopcRegClassID) | (1u << ARM::rGPRRegClassID) |
- (1u << ARM::hGPRRegClassID) | (1u << ARM::tGPRRegClassID) |
- (1u << ARM::GPRnopc_and_hGPRRegClassID) |
- (1u << ARM::hGPR_and_rGPRRegClassID) | (1u << ARM::tcGPRRegClassID) |
- (1u << ARM::tGPR_and_tcGPRRegClassID) | (1u << ARM::GPRspRegClassID) |
- (1u << ARM::hGPR_and_tcGPRRegClassID),
- // Classes 32-63
- 0,
- // Classes 64-96
- 0,
- // FIXME: Some of the entries below this point can be safely removed once
- // this is tablegenerated. It's only needed because of the hardcoded
- // register class limit.
- // Classes 97-128
- 0,
- // Classes 129-160
- 0,
- // Classes 161-192
- 0,
- // Classes 193-224
- 0,
+enum PartialMappingIdx {
+ PMI_GPR,
+ PMI_SPR,
+ PMI_DPR,
+ PMI_Min = PMI_GPR,
+};
+
+RegisterBankInfo::PartialMapping PartMappings[]{
+ // GPR Partial Mapping
+ {0, 32, GPRRegBank},
+ // SPR Partial Mapping
+ {0, 32, FPRRegBank},
+ // DPR Partial Mapping
+ {0, 64, FPRRegBank},
};
-RegisterBank GPRRegBank(ARM::GPRRegBankID, "GPRB", 32, ARM::GPRCoverageData);
-RegisterBank *RegBanks[] = {&GPRRegBank};
+#ifndef NDEBUG
+static bool checkPartMapping(const RegisterBankInfo::PartialMapping &PM,
+ unsigned Start, unsigned Length,
+ unsigned RegBankID) {
+ return PM.StartIdx == Start && PM.Length == Length &&
+ PM.RegBank->getID() == RegBankID;
+}
+
+static void checkPartialMappings() {
+ assert(
+ checkPartMapping(PartMappings[PMI_GPR - PMI_Min], 0, 32, GPRRegBankID) &&
+ "Wrong mapping for GPR");
+ assert(
+ checkPartMapping(PartMappings[PMI_SPR - PMI_Min], 0, 32, FPRRegBankID) &&
+ "Wrong mapping for SPR");
+ assert(
+ checkPartMapping(PartMappings[PMI_DPR - PMI_Min], 0, 64, FPRRegBankID) &&
+ "Wrong mapping for DPR");
+}
+#endif
-RegisterBankInfo::PartialMapping GPRPartialMapping{0, 32, GPRRegBank};
+enum ValueMappingIdx {
+ InvalidIdx = 0,
+ GPR3OpsIdx = 1,
+ SPR3OpsIdx = 4,
+ DPR3OpsIdx = 7,
+};
RegisterBankInfo::ValueMapping ValueMappings[] = {
- {&GPRPartialMapping, 1}, {&GPRPartialMapping, 1}, {&GPRPartialMapping, 1}};
+ // invalid
+ {nullptr, 0},
+ // 3 ops in GPRs
+ {&PartMappings[PMI_GPR - PMI_Min], 1},
+ {&PartMappings[PMI_GPR - PMI_Min], 1},
+ {&PartMappings[PMI_GPR - PMI_Min], 1},
+ // 3 ops in SPRs
+ {&PartMappings[PMI_SPR - PMI_Min], 1},
+ {&PartMappings[PMI_SPR - PMI_Min], 1},
+ {&PartMappings[PMI_SPR - PMI_Min], 1},
+ // 3 ops in DPRs
+ {&PartMappings[PMI_DPR - PMI_Min], 1},
+ {&PartMappings[PMI_DPR - PMI_Min], 1},
+ {&PartMappings[PMI_DPR - PMI_Min], 1}};
+
+#ifndef NDEBUG
+static bool checkValueMapping(const RegisterBankInfo::ValueMapping &VM,
+ RegisterBankInfo::PartialMapping *BreakDown) {
+ return VM.NumBreakDowns == 1 && VM.BreakDown == BreakDown;
+}
+
+static void checkValueMappings() {
+ assert(checkValueMapping(ValueMappings[GPR3OpsIdx],
+ &PartMappings[PMI_GPR - PMI_Min]) &&
+ "Wrong value mapping for 3 GPR ops instruction");
+ assert(checkValueMapping(ValueMappings[GPR3OpsIdx + 1],
+ &PartMappings[PMI_GPR - PMI_Min]) &&
+ "Wrong value mapping for 3 GPR ops instruction");
+ assert(checkValueMapping(ValueMappings[GPR3OpsIdx + 2],
+ &PartMappings[PMI_GPR - PMI_Min]) &&
+ "Wrong value mapping for 3 GPR ops instruction");
+
+ assert(checkValueMapping(ValueMappings[SPR3OpsIdx],
+ &PartMappings[PMI_SPR - PMI_Min]) &&
+ "Wrong value mapping for 3 SPR ops instruction");
+ assert(checkValueMapping(ValueMappings[SPR3OpsIdx + 1],
+ &PartMappings[PMI_SPR - PMI_Min]) &&
+ "Wrong value mapping for 3 SPR ops instruction");
+ assert(checkValueMapping(ValueMappings[SPR3OpsIdx + 2],
+ &PartMappings[PMI_SPR - PMI_Min]) &&
+ "Wrong value mapping for 3 SPR ops instruction");
+
+ assert(checkValueMapping(ValueMappings[DPR3OpsIdx],
+ &PartMappings[PMI_DPR - PMI_Min]) &&
+ "Wrong value mapping for 3 DPR ops instruction");
+ assert(checkValueMapping(ValueMappings[DPR3OpsIdx + 1],
+ &PartMappings[PMI_DPR - PMI_Min]) &&
+ "Wrong value mapping for 3 DPR ops instruction");
+ assert(checkValueMapping(ValueMappings[DPR3OpsIdx + 2],
+ &PartMappings[PMI_DPR - PMI_Min]) &&
+ "Wrong value mapping for 3 DPR ops instruction");
+}
+#endif
} // end namespace ARM
} // end namespace llvm
ARMRegisterBankInfo::ARMRegisterBankInfo(const TargetRegisterInfo &TRI)
- : RegisterBankInfo(ARM::RegBanks, ARM::NumRegisterBanks) {
+ : ARMGenRegisterBankInfo() {
static bool AlreadyInit = false;
// We have only one set of register banks, whatever the subtarget
// is. Therefore, the initialization of the RegBanks table should be
@@ -97,6 +166,11 @@ ARMRegisterBankInfo::ARMRegisterBankInfo(const TargetRegisterInfo &TRI)
assert(RBGPR.covers(*TRI.getRegClass(ARM::tGPR_and_tcGPRRegClassID)) &&
"Subclass not added?");
assert(RBGPR.getSize() == 32 && "GPRs should hold up to 32-bit");
+
+#ifndef NDEBUG
+ ARM::checkPartialMappings();
+ ARM::checkValueMappings();
+#endif
}
const RegisterBank &ARMRegisterBankInfo::getRegBankFromRegClass(
@@ -105,8 +179,16 @@ const RegisterBank &ARMRegisterBankInfo::getRegBankFromRegClass(
switch (RC.getID()) {
case GPRRegClassID:
+ case GPRnopcRegClassID:
+ case GPRspRegClassID:
case tGPR_and_tcGPRRegClassID:
+ case tGPRRegClassID:
return getRegBank(ARM::GPRRegBankID);
+ case SPR_8RegClassID:
+ case SPRRegClassID:
+ case DPR_8RegClassID:
+ case DPRRegClassID:
+ return getRegBank(ARM::FPRRegBankID);
default:
llvm_unreachable("Unsupported register kind");
}
@@ -128,23 +210,83 @@ ARMRegisterBankInfo::getInstrMapping(const MachineInstr &MI) const {
using namespace TargetOpcode;
+ const MachineFunction &MF = *MI.getParent()->getParent();
+ const MachineRegisterInfo &MRI = MF.getRegInfo();
+ LLT Ty = MRI.getType(MI.getOperand(0).getReg());
+
unsigned NumOperands = MI.getNumOperands();
- const ValueMapping *OperandsMapping = &ARM::ValueMappings[0];
+ const ValueMapping *OperandsMapping = &ARM::ValueMappings[ARM::GPR3OpsIdx];
switch (Opc) {
case G_ADD:
- case G_LOAD:
+ case G_SEXT:
+ case G_ZEXT:
+ case G_GEP:
// FIXME: We're abusing the fact that everything lives in a GPR for now; in
// the real world we would use different mappings.
- OperandsMapping = &ARM::ValueMappings[0];
+ OperandsMapping = &ARM::ValueMappings[ARM::GPR3OpsIdx];
+ break;
+ case G_LOAD:
+ case G_STORE:
+ OperandsMapping =
+ Ty.getSizeInBits() == 64
+ ? getOperandsMapping({&ARM::ValueMappings[ARM::DPR3OpsIdx],
+ &ARM::ValueMappings[ARM::GPR3OpsIdx]})
+ : &ARM::ValueMappings[ARM::GPR3OpsIdx];
+ break;
+ case G_FADD:
+ assert((Ty.getSizeInBits() == 32 || Ty.getSizeInBits() == 64) &&
+ "Unsupported size for G_FADD");
+ OperandsMapping = Ty.getSizeInBits() == 64
+ ? &ARM::ValueMappings[ARM::DPR3OpsIdx]
+ : &ARM::ValueMappings[ARM::SPR3OpsIdx];
break;
+ case G_CONSTANT:
case G_FRAME_INDEX:
- OperandsMapping = getOperandsMapping({&ARM::ValueMappings[0], nullptr});
+ OperandsMapping =
+ getOperandsMapping({&ARM::ValueMappings[ARM::GPR3OpsIdx], nullptr});
break;
+ case G_SEQUENCE: {
+ // We only support G_SEQUENCE for creating a double precision floating point
+ // value out of two GPRs.
+ LLT Ty1 = MRI.getType(MI.getOperand(1).getReg());
+ LLT Ty2 = MRI.getType(MI.getOperand(3).getReg());
+ if (Ty.getSizeInBits() != 64 || Ty1.getSizeInBits() != 32 ||
+ Ty2.getSizeInBits() != 32)
+ return InstructionMapping{};
+ OperandsMapping =
+ getOperandsMapping({&ARM::ValueMappings[ARM::DPR3OpsIdx],
+ &ARM::ValueMappings[ARM::GPR3OpsIdx], nullptr,
+ &ARM::ValueMappings[ARM::GPR3OpsIdx], nullptr});
+ break;
+ }
+ case G_EXTRACT: {
+ // We only support G_EXTRACT for splitting a double precision floating point
+ // value into two GPRs.
+ LLT Ty1 = MRI.getType(MI.getOperand(1).getReg());
+ if (Ty.getSizeInBits() != 32 || Ty1.getSizeInBits() != 64 ||
+ MI.getOperand(2).getImm() % 32 != 0)
+ return InstructionMapping{};
+ OperandsMapping = getOperandsMapping({&ARM::ValueMappings[ARM::GPR3OpsIdx],
+ &ARM::ValueMappings[ARM::DPR3OpsIdx],
+ nullptr, nullptr});
+ break;
+ }
default:
return InstructionMapping{};
}
+#ifndef NDEBUG
+ for (unsigned i = 0; i < NumOperands; i++) {
+ for (const auto &Mapping : OperandsMapping[i]) {
+ assert(
+ (Mapping.RegBank->getID() != ARM::FPRRegBankID ||
+ MF.getSubtarget<ARMSubtarget>().hasVFP2()) &&
+ "Trying to use floating point register bank on target without vfp");
+ }
+ }
+#endif
+
return InstructionMapping{DefaultMappingID, /*Cost=*/1, OperandsMapping,
NumOperands};
}
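
For reference, the size-driven mapping choice in getInstrMapping above reduces to the following standalone sketch; the enum and helper names are hypothetical stand-ins for the ARM::*OpsIdx constants, not the RegisterBankInfo API:

    #include <cassert>

    // Hypothetical stand-ins for ARM::GPR3OpsIdx / ARM::SPR3OpsIdx / ARM::DPR3OpsIdx.
    enum MappingIdx { GPR3OpsIdx, SPR3OpsIdx, DPR3OpsIdx };

    // Mirrors the G_LOAD/G_STORE case: only 64-bit values take the FP (DPR) bank.
    MappingIdx pickLoadStoreMapping(unsigned SizeInBits) {
      return SizeInBits == 64 ? DPR3OpsIdx : GPR3OpsIdx;
    }

    // Mirrors the G_FADD case: 32-bit operands map to SPR, 64-bit to DPR.
    MappingIdx pickFAddMapping(unsigned SizeInBits) {
      assert((SizeInBits == 32 || SizeInBits == 64) && "Unsupported size for G_FADD");
      return SizeInBits == 64 ? DPR3OpsIdx : SPR3OpsIdx;
    }

The G_SEQUENCE/G_EXTRACT cases follow the same shape, additionally checking that the pieces are exactly two 32-bit GPR halves of a 64-bit DPR value.
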
diff --git a/lib/Target/ARM/ARMRegisterBankInfo.h b/lib/Target/ARM/ARMRegisterBankInfo.h
index 773920ee57a7..5222c1e6389f 100644
--- a/lib/Target/ARM/ARMRegisterBankInfo.h
+++ b/lib/Target/ARM/ARMRegisterBankInfo.h
@@ -16,19 +16,20 @@
#include "llvm/CodeGen/GlobalISel/RegisterBankInfo.h"
+#define GET_REGBANK_DECLARATIONS
+#include "ARMGenRegisterBank.inc"
+
namespace llvm {
class TargetRegisterInfo;
-namespace ARM {
-enum {
- GPRRegBankID = 0, // General purpose registers
- NumRegisterBanks,
+class ARMGenRegisterBankInfo : public RegisterBankInfo {
+#define GET_TARGET_REGBANK_CLASS
+#include "ARMGenRegisterBank.inc"
};
-} // end namespace ARM
/// This class provides the information for the target register banks.
-class ARMRegisterBankInfo final : public RegisterBankInfo {
+class ARMRegisterBankInfo final : public ARMGenRegisterBankInfo {
public:
ARMRegisterBankInfo(const TargetRegisterInfo &TRI);
diff --git a/lib/Target/ARM/ARMRegisterBanks.td b/lib/Target/ARM/ARMRegisterBanks.td
new file mode 100644
index 000000000000..7cd2d60d36a4
--- /dev/null
+++ b/lib/Target/ARM/ARMRegisterBanks.td
@@ -0,0 +1,14 @@
+//=- ARMRegisterBanks.td - Describe the ARM Register Banks --*- tablegen -*-=//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+//
+//===----------------------------------------------------------------------===//
+
+def GPRRegBank : RegisterBank<"GPRB", [GPR, GPRwithAPSR]>;
+def FPRRegBank : RegisterBank<"FPRB", [SPR, DPR]>;
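
The GET_REGBANK_DECLARATIONS and GET_TARGET_REGBANK_CLASS stanzas pulled in from ARMGenRegisterBank.inc (see the ARMRegisterBankInfo.h hunk above) are generated from these two defs. A sketch of the shape of that generated code, under the assumption that TableGen emits one <Name>ID enumerator per RegisterBank def (illustrative only, not the verbatim output):

    // Rough shape of the generated declarations (illustrative only).
    namespace llvm {
    namespace ARM {
    enum {
      GPRRegBankID = 0, // from def GPRRegBank
      FPRRegBankID = 1, // from def FPRRegBank
      NumRegisterBanks,
    };
    } // end namespace ARM
    } // end namespace llvm
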
diff --git a/lib/Target/ARM/ARMSchedule.td b/lib/Target/ARM/ARMSchedule.td
index b7d2d34614df..87eb4c2b9074 100644
--- a/lib/Target/ARM/ARMSchedule.td
+++ b/lib/Target/ARM/ARMSchedule.td
@@ -7,7 +7,7 @@
//
//===----------------------------------------------------------------------===//
//===----------------------------------------------------------------------===//
-// Instruction scheduling annotations for out-of-order CPUs.
+// Instruction scheduling annotations for in-order and out-of-order CPUs.
// These annotations are independent of the itinerary class defined below.
// Here we define the subtarget independent read/write per-operand resources.
// The subtarget schedule definitions will then map these to the subtarget's
@@ -54,6 +54,9 @@
// }
// def : ReadAdvance<ReadAdvanceALUsr, 3>;
+//===----------------------------------------------------------------------===//
+// Sched definitions for integer pipeline instructions
+//
// Basic ALU operation.
def WriteALU : SchedWrite;
def ReadALU : SchedRead;
@@ -69,24 +72,65 @@ def WriteCMP : SchedWrite;
def WriteCMPsi : SchedWrite;
def WriteCMPsr : SchedWrite;
-// Division.
-def WriteDiv : SchedWrite;
+// Multiplies.
+def WriteMUL16 : SchedWrite; // 16-bit multiply.
+def WriteMUL32 : SchedWrite; // 32-bit multiply.
+def WriteMUL64Lo : SchedWrite; // 64-bit result. Low reg.
+def WriteMUL64Hi : SchedWrite; // 64-bit result. High reg.
+def ReadMUL : SchedRead;
+
+// Multiply-accumulates.
+def WriteMAC16 : SchedWrite; // 16-bit mac.
+def WriteMAC32 : SchedWrite; // 32-bit mac.
+def WriteMAC64Lo : SchedWrite; // 64-bit mac. Low reg.
+def WriteMAC64Hi : SchedWrite; // 64-bit mac. High reg.
+def ReadMAC : SchedRead;
+
+// Divisions.
+def WriteDIV : SchedWrite;
-// Loads.
+// Loads/Stores.
def WriteLd : SchedWrite;
def WritePreLd : SchedWrite;
+def WriteST : SchedWrite;
// Branches.
def WriteBr : SchedWrite;
def WriteBrL : SchedWrite;
def WriteBrTbl : SchedWrite;
-// Fixpoint conversions.
-def WriteCvtFP : SchedWrite;
-
// Noop.
def WriteNoop : SchedWrite;
+//===----------------------------------------------------------------------===//
+// Sched definitions for floating-point and NEON instructions
+//
+// Floating point conversions
+def WriteFPCVT : SchedWrite;
+def WriteFPMOV : SchedWrite; // FP -> GPR and vice-versa
+
+// ALU operations (32/64-bit)
+def WriteFPALU32 : SchedWrite;
+def WriteFPALU64 : SchedWrite;
+
+// Multiplication
+def WriteFPMUL32 : SchedWrite;
+def WriteFPMUL64 : SchedWrite;
+def ReadFPMUL : SchedRead; // multiplier read
+def ReadFPMAC : SchedRead; // accumulator read
+
+// Multiply-accumulate
+def WriteFPMAC32 : SchedWrite;
+def WriteFPMAC64 : SchedWrite;
+
+// Division
+def WriteFPDIV32 : SchedWrite;
+def WriteFPDIV64 : SchedWrite;
+
+// Square-root
+def WriteFPSQRT32 : SchedWrite;
+def WriteFPSQRT64 : SchedWrite;
+
// Define TII for use in SchedVariant Predicates.
def : PredicateProlog<[{
const ARMBaseInstrInfo *TII =
diff --git a/lib/Target/ARM/ARMScheduleA9.td b/lib/Target/ARM/ARMScheduleA9.td
index 519e595bd184..8fb8a2a3b6d2 100644
--- a/lib/Target/ARM/ARMScheduleA9.td
+++ b/lib/Target/ARM/ARMScheduleA9.td
@@ -1944,6 +1944,16 @@ def A9WriteMHi : SchedWriteRes<[A9UnitMul]> { let Latency = 5;
def A9WriteM16 : SchedWriteRes<[A9UnitMul]> { let Latency = 3; }
def A9WriteM16Hi : SchedWriteRes<[A9UnitMul]> { let Latency = 4;
let NumMicroOps = 0; }
+def : SchedAlias<WriteMUL16, A9WriteM16>;
+def : SchedAlias<WriteMUL32, A9WriteM>;
+def : SchedAlias<WriteMUL64Lo, A9WriteM>;
+def : SchedAlias<WriteMUL64Hi, A9WriteMHi>;
+def : SchedAlias<WriteMAC16, A9WriteM16>;
+def : SchedAlias<WriteMAC32, A9WriteM>;
+def : SchedAlias<WriteMAC64Lo, A9WriteM>;
+def : SchedAlias<WriteMAC64Hi, A9WriteMHi>;
+def : ReadAdvance<ReadMUL, 0>;
+def : ReadAdvance<ReadMAC, 0>;
// Floating-point
// Only one FP or AGU instruction may issue per cycle. We model this
@@ -1953,6 +1963,7 @@ def A9WriteFMov : SchedWriteRes<[A9UnitFP, A9UnitAGU]> { let Latency = 1; }
def A9WriteFMulS : SchedWriteRes<[A9UnitFP, A9UnitAGU]> { let Latency = 5; }
def A9WriteFMulD : SchedWriteRes<[A9UnitFP, A9UnitAGU]> { let Latency = 6; }
def A9WriteFMAS : SchedWriteRes<[A9UnitFP, A9UnitAGU]> { let Latency = 8; }
+
def A9WriteFMAD : SchedWriteRes<[A9UnitFP, A9UnitAGU]> { let Latency = 9; }
def A9WriteFDivS : SchedWriteRes<[A9UnitFP, A9UnitAGU]> { let Latency = 15; }
def A9WriteFDivD : SchedWriteRes<[A9UnitFP, A9UnitAGU]> { let Latency = 25; }
@@ -1992,6 +2003,7 @@ def A9WriteAdr : SchedWriteRes<[A9UnitAGU]> { let NumMicroOps = 0; }
// Load Integer.
def A9WriteL : SchedWriteRes<[A9UnitLS]> { let Latency = 3; }
+def : SchedAlias<WriteLd, A9WriteL>;
// Load the upper 32-bits using the same micro-op.
def A9WriteLHi : SchedWriteRes<[]> { let Latency = 3;
let NumMicroOps = 0; }
@@ -2471,6 +2483,34 @@ def : SchedAlias<WriteALUsr, A9WriteALUsr>;
def : SchedAlias<WriteALUSsr, A9WriteALUsr>;
def : SchedAlias<ReadALU, A9ReadALU>;
def : SchedAlias<ReadALUsr, A9ReadALU>;
+def : SchedAlias<WriteST, A9WriteS>;
+
+//===---------------------------------------------------------------------===//
+// Floating-point. Map target-defined SchedReadWrites to processor-specific ones
+//
+def : WriteRes<WriteFPCVT, [A9UnitFP, A9UnitAGU]> { let Latency = 4; }
+def : SchedAlias<WriteFPMOV, A9WriteFMov>;
+
+def : SchedAlias<WriteFPALU32, A9WriteF>;
+def : SchedAlias<WriteFPALU64, A9WriteF>;
+
+def : SchedAlias<WriteFPMUL32, A9WriteFMulS>;
+def : SchedAlias<WriteFPMUL64, A9WriteFMulD>;
+
+def : SchedAlias<WriteFPMAC32, A9WriteFMAS>;
+def : SchedAlias<WriteFPMAC64, A9WriteFMAD>;
+
+def : SchedAlias<WriteFPDIV32, A9WriteFDivS>;
+def : SchedAlias<WriteFPDIV64, A9WriteFDivD>;
+def : SchedAlias<WriteFPSQRT32, A9WriteFSqrtS>;
+def : SchedAlias<WriteFPSQRT64, A9WriteFSqrtD>;
+
+def : ReadAdvance<ReadFPMUL, 0>;
+def : ReadAdvance<ReadFPMAC, 0>;
+
+//===---------------------------------------------------------------------===//
+// Subtarget-specific overrides. Map opcodes to lists of SchedReadWrite types.
+//
def : InstRW< [WriteALU],
(instregex "ANDri", "ORRri", "EORri", "BICri", "ANDrr", "ORRrr", "EORrr",
"BICrr")>;
@@ -2518,12 +2558,11 @@ def : InstRW<[A9WriteLb],
"LDRH", "LDRSH", "LDRSB")>;
def : InstRW<[A9WriteLbsi], (instregex "LDRrs")>;
-def : WriteRes<WriteDiv, []> { let Latency = 0; }
+def : WriteRes<WriteDIV, []> { let Latency = 0; }
def : WriteRes<WriteBr, [A9UnitB]>;
def : WriteRes<WriteBrL, [A9UnitB]>;
def : WriteRes<WriteBrTbl, [A9UnitB]>;
def : WriteRes<WritePreLd, []>;
-def : SchedAlias<WriteCvtFP, A9WriteF>;
def : WriteRes<WriteNoop, []> { let Latency = 0; let NumMicroOps = 0; }
} // SchedModel = CortexA9Model
diff --git a/lib/Target/ARM/ARMScheduleR52.td b/lib/Target/ARM/ARMScheduleR52.td
index 1b40742a093b..537e5da9669f 100644
--- a/lib/Target/ARM/ARMScheduleR52.td
+++ b/lib/Target/ARM/ARMScheduleR52.td
@@ -70,15 +70,13 @@ def : WriteRes<WriteCMP, [R52UnitALU]> { let Latency = 0; }
def : WriteRes<WriteCMPsi, [R52UnitALU]> { let Latency = 0; }
def : WriteRes<WriteCMPsr, [R52UnitALU]> { let Latency = 0; }
+// Multiplies - aliased to subtarget-specific SchedWrites later.
+
// Div - may stall 0-9 cycles depending on input (i.e. WRI+(0-9)/2)
-def : WriteRes<WriteDiv, [R52UnitDiv]> {
- let Latency = 8; let ResourceCycles = [8]; // not pipelined
+def : WriteRes<WriteDIV, [R52UnitDiv]> {
+ let Latency = 8; let ResourceCycles = [8]; // non-pipelined
}
-// Loads
-def : WriteRes<WriteLd, [R52UnitLd]> { let Latency = 4; }
-def : WriteRes<WritePreLd, [R52UnitLd]> { let Latency = 4; }
-
// Branches - LR written in Late EX2
def : WriteRes<WriteBr, [R52UnitB]> { let Latency = 0; }
def : WriteRes<WriteBrL, [R52UnitB]> { let Latency = 0; }
@@ -86,11 +84,44 @@ def : WriteRes<WriteBrTbl, [R52UnitALU]> { let Latency = 0; }
// Misc
def : WriteRes<WriteNoop, []> { let Latency = 0; let NumMicroOps = 0; }
-def : WriteRes<WriteCvtFP, [R52UnitALU]> { let Latency = 3; }
+// Integer pipeline by-passes
def : ReadAdvance<ReadALU, 1>; // Operand needed in EX1 stage
def : ReadAdvance<ReadALUsr, 0>; // Shift operands needed in ISS
+def : ReadAdvance<ReadMUL, 0>;
+def : ReadAdvance<ReadMAC, 0>;
+
+// Floating-point. Map target-defined SchedReadWrites to subtarget-specific ones.
+def : WriteRes<WriteFPMUL32, [R52UnitFPMUL]> { let Latency = 6; }
+
+def : WriteRes<WriteFPMUL64, [R52UnitFPMUL, R52UnitFPMUL]> {
+ let Latency = 6;
+}
+
+def : WriteRes<WriteFPMAC32, [R52UnitFPMUL, R52UnitFPALU]> {
+ let Latency = 11; // internally two instructions (MUL then ADD)
+}
+def : WriteRes<WriteFPMAC64, [R52UnitFPMUL, R52UnitFPMUL,
+ R52UnitFPALU, R52UnitFPALU]> {
+ let Latency = 11;
+}
+
+def : WriteRes<WriteFPDIV32, [R52UnitDiv]> {
+ let Latency = 7; // FP div takes fixed #cycles
+ let ResourceCycles = [7]; // is not pipelined
+}
+
+def : WriteRes<WriteFPDIV64, [R52UnitDiv]> {
+ let Latency = 17;
+ let ResourceCycles = [17];
+}
+
+def : WriteRes<WriteFPSQRT32, [R52UnitDiv]> { let Latency = 7; }
+def : WriteRes<WriteFPSQRT64, [R52UnitDiv]> { let Latency = 17; }
+
+def : ReadAdvance<ReadFPMUL, 1>; // mul operand read in F1
+def : ReadAdvance<ReadFPMAC, 1>; // fp-mac operand read in F1
//===----------------------------------------------------------------------===//
// Subtarget-specific SchedReadWrites.
@@ -106,6 +137,9 @@ def : ReadAdvance<R52Read_F2, 2>;
// Cortex-R52 specific SchedWrites for use with InstRW
def R52WriteMAC : SchedWriteRes<[R52UnitMAC]> { let Latency = 4; }
+def R52WriteMACHi : SchedWriteRes<[R52UnitMAC]> {
+ let Latency = 4; let NumMicroOps = 0;
+}
def R52WriteDIV : SchedWriteRes<[R52UnitDiv]> {
let Latency = 8; let ResourceCycles = [8]; // not pipelined
}
@@ -120,6 +154,19 @@ def R52WriteALU_WRI : SchedWriteRes<[R52UnitALU]> { let Latency = 4; }
def R52WriteNoRSRC_EX2 : SchedWriteRes<[]> { let Latency = 3; }
def R52WriteNoRSRC_WRI : SchedWriteRes<[]> { let Latency = 4; }
+// Alias generic SchedReadWrites to subtarget-specific ones.
+def : SchedAlias<WriteMUL16, R52WriteMAC>;
+def : SchedAlias<WriteMUL32, R52WriteMAC>;
+def : SchedAlias<WriteMUL64Lo, R52WriteMAC>;
+def : SchedAlias<WriteMUL64Hi, R52WriteMACHi>;
+def : SchedAlias<WriteMAC16, R52WriteMAC>;
+def : SchedAlias<WriteMAC32, R52WriteMAC>;
+def : SchedAlias<WriteMAC64Lo, R52WriteMAC>;
+def : SchedAlias<WriteMAC64Hi, R52WriteMACHi>;
+def : SchedAlias<WritePreLd, R52WriteLd>;
+def : SchedAlias<WriteLd, R52WriteLd>;
+def : SchedAlias<WriteST, R52WriteST>;
+
def R52WriteFPALU_F3 : SchedWriteRes<[R52UnitFPALU]> { let Latency = 4; }
def R52Write2FPALU_F3 : SchedWriteRes<[R52UnitFPALU, R52UnitFPALU]> {
let Latency = 4;
@@ -147,19 +194,17 @@ def R52Write2FPMAC_F5 : SchedWriteRes<[R52UnitFPMUL, R52UnitFPMUL,
def R52WriteFPLd_F4 : SchedWriteRes<[R52UnitLd]> { let Latency = 5; }
def R52WriteFPST_F4 : SchedWriteRes<[R52UnitLd]> { let Latency = 5; }
-def R52WriteFPDIV_SP : SchedWriteRes<[R52UnitFPDIV]> {
- let Latency = 7; // FP div takes fixed #cycles
- let ResourceCycles = [7]; // is not pipelined
- }
-def R52WriteFPDIV_DP : SchedWriteRes<[R52UnitFPDIV]> {
- let Latency = 17;
- let ResourceCycles = [17];
-}
-
-
//===----------------------------------------------------------------------===//
-// Subtarget-specific - map operands to SchedReadWrites
+// Floating-point. Map target-defined SchedReadWrites to processor-specific ones
+//
+def : SchedAlias<WriteFPCVT, R52WriteFPALU_F5>;
+def : SchedAlias<WriteFPMOV, R52WriteFPALU_F3>;
+def : SchedAlias<WriteFPALU32, R52WriteFPALU_F5>;
+def : SchedAlias<WriteFPALU64, R52WriteFPALU_F5>;
+//===----------------------------------------------------------------------===//
+// Subtarget-specific overrides. Map opcodes to lists of SchedReadWrite types.
+//
def : InstRW<[WriteALU], (instrs COPY)>;
def : InstRW<[R52WriteALU_EX2, R52Read_EX1, R52Read_ISS],
@@ -235,7 +280,7 @@ def : InstRW<[R52WriteMAC, R52Read_ISS, R52Read_ISS, R52Read_ISS],
"t2SMLSLD", "t2SMLSLDX", "t2UMAAL")>;
def : InstRW <[R52WriteDIV, R52Read_ISS, R52Read_ISS],
- (instregex "SDIV", "UDIV", "t2SDIV", "t2UDIV")>;
+ (instregex "t2SDIV", "t2UDIV")>;
// Loads (except POST) with SHL > 2, or ror, require 2 extra cycles.
// However, that's non-trivial to specify, so we keep it uniform
@@ -294,15 +339,6 @@ def : InstRW<[R52WriteCC, R52Read_ISS], (instregex "TST")>;
def : InstRW<[R52WriteLd], (instregex "MRS", "MRSbanked")>;
def : InstRW<[R52WriteLd, R52Read_EX1], (instregex "MSR", "MSRbanked")>;
-//def : InstRW<[R52WriteLd, R52Read_ISS], (instregex "^LDRB?(_PRE_IMM|_POST_IMM)", "LDRrs")>;
-//def : InstRW<[R52WriteLd, R52Read_ISS, R52Read_ISS], (instregex "^LDRB?_PRE_REG", "LDRB?rr")>;
-//def : InstRW<[R52WriteLd, R52Read_ISS, R52Read_ISS], (instregex "^LDRB?_POST_REG")>;
-
-//def : InstRW<[R52WriteST, R52Read_ISS], (instregex "STRi12", "PICSTR")>;
-//def : InstRW<[R52WriteST, R52WriteAdr, R52Read_ISS, R52Read_EX2], (instregex "t2STRB?_PRE_REG", "STRB?_PRE_REG")>;
-//def : InstRW<[R52WriteST, R52WriteAdr, R52Read_ISS, R52Read_EX2], (instregex "t2STRB?_POST_REG", "STRB?_POST_REG")>;
-
-
// Integer Load, Multiple.
foreach Lat = 3-25 in {
def R52WriteILDM#Lat#Cy : SchedWriteRes<[R52UnitLd]> {
@@ -492,12 +528,6 @@ def : InstRW<[R52Write2FPALU_F3, R52Read_F1, R52Read_F1], (instregex "(VACGE|VAC
def : InstRW<[R52WriteFPALU_F5, R52Read_F1, R52Read_F1], (instregex "(VADD|VSUB)(D|S|H|fd|hd)")>;
def : InstRW<[R52Write2FPALU_F5, R52Read_F1, R52Read_F1], (instregex "(VADD|VSUB)(fq|hq)")>;
-def : InstRW<[R52WriteFPDIV_SP, R52Read_F0, R52Read_F0], (instregex "VDIV(S|H)")>;
-def : InstRW<[R52WriteFPDIV_DP, R52Read_F0, R52Read_F0], (instregex "VDIVD")>;
-
-def : InstRW<[R52WriteFPMAC_F5, R52Read_F1, R52Read_F1, R52Read_F1],
- (instregex "(VFMA|VFMS|VFNMA|VFNMS)(D|H|S)")>;
-
def : InstRW<[R52WriteFPLd_F4, R52Read_ISS, R52Read_F1], (instregex "VLDR")>;
def : InstRW<[R52WriteFPST_F4, R52Read_ISS, R52Read_F1], (instregex "VSTR")>;
@@ -687,16 +717,19 @@ def R52WriteVLD2Mem : SchedWriteRes<[R52UnitLd]> {
let Latency = 6;
let NumMicroOps = 3;
let ResourceCycles = [2];
+ let SingleIssue = 1;
}
def R52WriteVLD3Mem : SchedWriteRes<[R52UnitLd]> {
let Latency = 7;
let NumMicroOps = 5;
let ResourceCycles = [3];
+ let SingleIssue = 1;
}
def R52WriteVLD4Mem : SchedWriteRes<[R52UnitLd]> {
let Latency = 8;
let NumMicroOps = 7;
let ResourceCycles = [4];
+ let SingleIssue = 1;
}
def R52WriteVST1Mem : SchedWriteRes<[R52UnitLd]> {
let Latency = 5;
@@ -777,9 +810,8 @@ def : InstRW<[R52Write2FPALU_F4, R52Read_F2, R52Read_F2], (instregex "(VHADD|VHS
def : InstRW<[R52WriteVLDM], (instregex "VLDM[SD](IA|DB)$")>;
def : InstRW<[R52WriteFPALU_F4, R52Read_F1, R52Read_F1], (instregex "VMAX", "VMIN", "VPMAX", "VPMIN")>;
-def : InstRW<[R52WriteFPALU_F3, R52Read_F1, R52Read_F1], (instregex "VMOV", "VORR", "VORN", "VREV")>;
+def : InstRW<[R52WriteFPALU_F3, R52Read_F1, R52Read_F1], (instregex "VORR", "VORN", "VREV")>;
def : InstRW<[R52WriteNoRSRC_WRI], (instregex "VMRS")>;
-def : InstRW<[R52WriteFPMUL_F5, R52Read_F1, R52Read_F1, R52Read_F1], (instregex "VMUL", "VNMUL", "VMLA")>;
def : InstRW<[R52WriteFPALU_F5, R52Read_F1], (instregex "VNEG")>;
def : InstRW<[R52WriteFPALU_F4, R52Read_F1, R52Read_F1], (instregex "VPADDi")>;
def : InstRW<[R52Write2FPALU_F4, R52Read_F1, R52Read_F1], (instregex "VPADAL", "VPADDL")>;
diff --git a/lib/Target/ARM/ARMScheduleSwift.td b/lib/Target/ARM/ARMScheduleSwift.td
index ea2bf4b578f0..dc041c6c6006 100644
--- a/lib/Target/ARM/ARMScheduleSwift.td
+++ b/lib/Target/ARM/ARMScheduleSwift.td
@@ -133,6 +133,8 @@ let SchedModel = SwiftModel in {
def : SchedAlias<WriteALUSsr, SwiftWriteALUSsr>;
def : ReadAdvance<ReadALU, 0>;
def : SchedAlias<ReadALUsr, SwiftReadAdvanceALUsr>;
+ def : SchedAlias<WriteLd, SwiftWriteP2ThreeCycle>;
+ def : SchedAlias<WriteST, SwiftWriteP2>;
def SwiftChooseShiftKindP01OneOrTwoCycle : SchedWriteVariant<[
@@ -166,10 +168,10 @@ let SchedModel = SwiftModel in {
def : InstRW<[SwiftWriteP01OneCycle2x_load],
(instregex "MOV_ga_pcrel_ldr", "t2MOV_ga_pcrel_ldr")>;
- def SwiftWriteP0TwoCyleTwoUops : WriteSequence<[SwiftWriteP0OneCycle], 2>;
+ def SwiftWriteP0TwoCycleTwoUops : WriteSequence<[SwiftWriteP0OneCycle], 2>;
def SwiftPredP0OneOrTwoCycle : SchedWriteVariant<[
- SchedVar<IsPredicatedPred, [ SwiftWriteP0TwoCyleTwoUops ]>,
+ SchedVar<IsPredicatedPred, [ SwiftWriteP0TwoCycleTwoUops ]>,
SchedVar<NoSchedPred, [ SwiftWriteP0OneCycle ]>
]>;
@@ -282,6 +284,18 @@ let SchedModel = SwiftModel in {
let ResourceCycles = [2, 3];
}
+ // Alias generic SchedReadWrites to the subtarget-specific ones.
+ def : SchedAlias<WriteMUL16, SwiftWriteP0FourCycle>;
+ def : SchedAlias<WriteMUL32, SwiftWriteP0FourCycle>;
+ def : SchedAlias<WriteMUL64Lo, SwiftP0P0P01FiveCycle>;
+ def : SchedAlias<WriteMUL64Hi, SwiftWrite5Cycle>;
+ def : SchedAlias<WriteMAC16, SwiftPredP0P01FourFiveCycle>;
+ def : SchedAlias<WriteMAC32, SwiftPredP0P01FourFiveCycle>;
+ def : SchedAlias<WriteMAC64Lo, SwiftWrite5Cycle>;
+ def : SchedAlias<WriteMAC64Hi, Swift2P03P01FiveCycle>;
+ def : ReadAdvance<ReadMUL, 0>;
+ def : SchedAlias<ReadMAC, SwiftReadAdvanceFourCyclesPred>;
+
// 4.2.15 Integer Multiply Accumulate, Long
// 4.2.16 Integer Multiply Accumulate, Dual
// 4.2.17 Integer Multiply Accumulate Accumulate, Long
@@ -300,7 +314,7 @@ let SchedModel = SwiftModel in {
let ResourceCycles = [1, 14];
}
// 4.2.18 Integer Divide
- def : WriteRes<WriteDiv, [SwiftUnitDiv]>; // Workaround.
+ def : WriteRes<WriteDIV, [SwiftUnitDiv]>; // Workaround.
def : InstRW <[SwiftDiv],
(instregex "SDIV", "UDIV", "t2SDIV", "t2UDIV")>;
@@ -310,7 +324,7 @@ let SchedModel = SwiftModel in {
let Latency = 3;
let NumMicroOps = 2;
}
- def SwiftWriteP2P01FourCyle : SchedWriteRes<[SwiftUnitP2, SwiftUnitP01]> {
+ def SwiftWriteP2P01FourCycle : SchedWriteRes<[SwiftUnitP2, SwiftUnitP01]> {
let Latency = 4;
let NumMicroOps = 2;
}
@@ -343,7 +357,7 @@ let SchedModel = SwiftModel in {
"tLDR(r|i|spi|pci|pciASM)")>;
def : InstRW<[SwiftWriteP2ThreeCycle],
(instregex "LDRH$", "PICLDR$", "PICLDR(H|B)$", "LDRcp$")>;
- def : InstRW<[SwiftWriteP2P01FourCyle],
+ def : InstRW<[SwiftWriteP2P01FourCycle],
(instregex "PICLDRS(H|B)$", "t2LDRS(H|B)(i|r|p|s)", "LDRS(H|B)$",
"t2LDRpci_pic", "tLDRS(B|H)")>;
def : InstRW<[SwiftWriteP2P01ThreeCycle, SwiftWrBackOne],
@@ -597,8 +611,6 @@ let SchedModel = SwiftModel in {
def : InstRW<[SwiftWriteP1FourCycle],
(instregex "VMUL(S|v|p|f|s)", "VNMULS", "VQDMULH", "VQRDMULH",
"VMULL", "VQDMULL")>;
- def : InstRW<[SwiftWriteP1SixCycle],
- (instregex "VMULD", "VNMULD")>;
def : InstRW<[SwiftWriteP1FourCycle],
(instregex "VMLA", "VMLS", "VNMLA", "VNMLS", "VFMA(S|D)", "VFMS(S|D)",
"VFNMA", "VFNMS", "VMLAL", "VMLSL","VQDMLAL", "VQDMLSL")>;
@@ -607,8 +619,6 @@ let SchedModel = SwiftModel in {
// 4.2.36 Advanced SIMD and VFP, Convert
def : InstRW<[SwiftWriteP1FourCycle], (instregex "VCVT", "V(S|U)IT", "VTO(S|U)")>;
- // Fixpoint conversions.
- def : WriteRes<WriteCvtFP, [SwiftUnitP1]> { let Latency = 4; }
// 4.2.37 Advanced SIMD and VFP, Move
def : InstRW<[SwiftWriteP0TwoCycle],
@@ -1036,6 +1046,30 @@ let SchedModel = SwiftModel in {
def : InstRW<[SwiftDiv17], (instregex "VDIVS", "VSQRTS")>;
def : InstRW<[SwiftDiv32], (instregex "VDIVD", "VSQRTD")>;
+ //===--------------------------------------------------------------------===//
+ // Floating-point. Map target-defined SchedReadWrites to processor-specific ones
+ //
+ def : SchedAlias<WriteFPCVT, SwiftWriteP1FourCycle>;
+ def : SchedAlias<WriteFPMOV, SwiftWriteP2ThreeCycle>;
+
+ def : SchedAlias<WriteFPALU32, SwiftWriteP0FourCycle>;
+ def : SchedAlias<WriteFPALU64, SwiftWriteP0SixCycle>;
+
+ def : SchedAlias<WriteFPMUL32, SwiftWriteP1FourCycle>;
+ def : SchedAlias<WriteFPMUL64, SwiftWriteP1SixCycle>;
+
+ def : SchedAlias<WriteFPMAC32, SwiftWriteP1FourCycle>;
+ def : SchedAlias<WriteFPMAC64, SwiftWriteP1FourCycle>;
+
+ def : SchedAlias<WriteFPDIV32, SwiftDiv17>;
+ def : SchedAlias<WriteFPSQRT32, SwiftDiv17>;
+
+ def : SchedAlias<WriteFPDIV64, SwiftDiv32>;
+ def : SchedAlias<WriteFPSQRT64, SwiftDiv32>;
+
+ def : ReadAdvance<ReadFPMUL, 0>;
+ def : ReadAdvance<ReadFPMAC, 0>;
+
// Not specified.
def : InstRW<[SwiftWriteP01OneCycle2x], (instregex "ABS")>;
// Preload.
diff --git a/lib/Target/ARM/ARMSelectionDAGInfo.cpp b/lib/Target/ARM/ARMSelectionDAGInfo.cpp
index 3b99762f7157..33dcf9b8fef0 100644
--- a/lib/Target/ARM/ARMSelectionDAGInfo.cpp
+++ b/lib/Target/ARM/ARMSelectionDAGInfo.cpp
@@ -95,7 +95,7 @@ SDValue ARMSelectionDAGInfo::EmitSpecializedLibcall(
Entry.Node = Src;
Entry.Ty = Type::getInt32Ty(*DAG.getContext());
- Entry.isSExt = false;
+ Entry.IsSExt = false;
Args.push_back(Entry);
} else {
Entry.Node = Src;
@@ -114,11 +114,11 @@ SDValue ARMSelectionDAGInfo::EmitSpecializedLibcall(
TargetLowering::CallLoweringInfo CLI(DAG);
CLI.setDebugLoc(dl)
.setChain(Chain)
- .setCallee(
- TLI->getLibcallCallingConv(LC), Type::getVoidTy(*DAG.getContext()),
- DAG.getExternalSymbol(FunctionNames[AEABILibcall][AlignVariant],
- TLI->getPointerTy(DAG.getDataLayout())),
- std::move(Args))
+ .setLibCallee(
+ TLI->getLibcallCallingConv(LC), Type::getVoidTy(*DAG.getContext()),
+ DAG.getExternalSymbol(FunctionNames[AEABILibcall][AlignVariant],
+ TLI->getPointerTy(DAG.getDataLayout())),
+ std::move(Args))
.setDiscardResult();
std::pair<SDValue,SDValue> CallResult = TLI->LowerCallTo(CLI);
@@ -198,17 +198,18 @@ SDValue ARMSelectionDAGInfo::EmitTargetCodeForMemcpy(
return Chain;
// Issue loads / stores for the trailing (1 - 3) bytes.
+ auto getRemainingValueType = [](unsigned BytesLeft) {
+ return (BytesLeft >= 2) ? MVT::i16 : MVT::i8;
+ };
+ auto getRemainingSize = [](unsigned BytesLeft) {
+ return (BytesLeft >= 2) ? 2 : 1;
+ };
+
unsigned BytesLeftSave = BytesLeft;
i = 0;
while (BytesLeft) {
- if (BytesLeft >= 2) {
- VT = MVT::i16;
- VTSize = 2;
- } else {
- VT = MVT::i8;
- VTSize = 1;
- }
-
+ VT = getRemainingValueType(BytesLeft);
+ VTSize = getRemainingSize(BytesLeft);
Loads[i] = DAG.getLoad(VT, dl, Chain,
DAG.getNode(ISD::ADD, dl, MVT::i32, Src,
DAG.getConstant(SrcOff, dl, MVT::i32)),
@@ -224,14 +225,8 @@ SDValue ARMSelectionDAGInfo::EmitTargetCodeForMemcpy(
i = 0;
BytesLeft = BytesLeftSave;
while (BytesLeft) {
- if (BytesLeft >= 2) {
- VT = MVT::i16;
- VTSize = 2;
- } else {
- VT = MVT::i8;
- VTSize = 1;
- }
-
+ VT = getRemainingValueType(BytesLeft);
+ VTSize = getRemainingSize(BytesLeft);
TFOps[i] = DAG.getStore(Chain, dl, Loads[i],
DAG.getNode(ISD::ADD, dl, MVT::i32, Dst,
DAG.getConstant(DstOff, dl, MVT::i32)),
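
Both rewritten loops apply the same chunking rule, which the new lambdas make explicit: emit i16-sized pieces while at least two trailing bytes remain, then a final i8. A standalone sketch of that rule (hypothetical helper, not the SelectionDAG code itself):

    #include <vector>

    // Sizes of the loads/stores used for the trailing 1-3 bytes of the
    // memcpy, e.g. 3 trailing bytes -> {2, 1}.
    std::vector<unsigned> chunkTrailingBytes(unsigned BytesLeft) {
      std::vector<unsigned> Sizes;
      while (BytesLeft) {
        unsigned VTSize = BytesLeft >= 2 ? 2 : 1; // i16 while possible, else i8
        Sizes.push_back(VTSize);
        BytesLeft -= VTSize;
      }
      return Sizes;
    }
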
diff --git a/lib/Target/ARM/ARMSubtarget.cpp b/lib/Target/ARM/ARMSubtarget.cpp
index e2df0bddd0d1..b8a708a20a95 100644
--- a/lib/Target/ARM/ARMSubtarget.cpp
+++ b/lib/Target/ARM/ARMSubtarget.cpp
@@ -13,25 +13,27 @@
#include "ARMSubtarget.h"
#include "ARMFrameLowering.h"
-#include "ARMISelLowering.h"
#include "ARMInstrInfo.h"
-#include "ARMMachineFunctionInfo.h"
-#include "ARMSelectionDAGInfo.h"
#include "ARMSubtarget.h"
#include "ARMTargetMachine.h"
+#include "MCTargetDesc/ARMMCTargetDesc.h"
#include "Thumb1FrameLowering.h"
#include "Thumb1InstrInfo.h"
#include "Thumb2InstrInfo.h"
-#include "llvm/CodeGen/MachineRegisterInfo.h"
-#include "llvm/IR/Attributes.h"
+#include "llvm/ADT/StringRef.h"
+#include "llvm/ADT/Triple.h"
+#include "llvm/ADT/Twine.h"
+#include "llvm/CodeGen/MachineFunction.h"
#include "llvm/IR/Function.h"
#include "llvm/IR/GlobalValue.h"
#include "llvm/MC/MCAsmInfo.h"
+#include "llvm/MC/MCTargetOptions.h"
#include "llvm/Support/CommandLine.h"
-#include "llvm/Target/TargetInstrInfo.h"
#include "llvm/Target/TargetOptions.h"
-#include "llvm/Target/TargetRegisterInfo.h"
+#include "llvm/Support/CodeGen.h"
#include "llvm/Support/TargetParser.h"
+#include <cassert>
+#include <string>
using namespace llvm;
@@ -104,7 +106,7 @@ ARMSubtarget::ARMSubtarget(const Triple &TT, const std::string &CPU,
: !isThumb()
? (ARMBaseInstrInfo *)new ARMInstrInfo(*this)
: (ARMBaseInstrInfo *)new Thumb2InstrInfo(*this)),
- TLInfo(TM, *this), GISel() {}
+ TLInfo(TM, *this) {}
const CallLowering *ARMSubtarget::getCallLowering() const {
assert(GISel && "Access to GlobalISel APIs not set");
@@ -148,11 +150,11 @@ void ARMSubtarget::initSubtargetFeatures(StringRef CPU, StringRef FS) {
if (isTargetDarwin()) {
StringRef ArchName = TargetTriple.getArchName();
- unsigned ArchKind = llvm::ARM::parseArch(ArchName);
- if (ArchKind == llvm::ARM::AK_ARMV7S)
+ unsigned ArchKind = ARM::parseArch(ArchName);
+ if (ArchKind == ARM::AK_ARMV7S)
// Default to the Swift CPU when targeting armv7s/thumbv7s.
CPUString = "swift";
- else if (ArchKind == llvm::ARM::AK_ARMV7K)
+ else if (ArchKind == ARM::AK_ARMV7K)
// Default to the Cortex-a7 CPU when targeting armv7k/thumbv7k.
// ARMv7k does not use SjLj exception handling.
CPUString = "cortex-a7";
@@ -200,12 +202,12 @@ void ARMSubtarget::initSubtargetFeatures(StringRef CPU, StringRef FS) {
// support in the assembler and linker to be used. This would need to be
// fixed to fully support tail calls in Thumb1.
//
- // Doing this is tricky, since the LDM/POP instruction on Thumb doesn't take
- // LR. This means if we need to reload LR, it takes an extra instructions,
- // which outweighs the value of the tail call; but here we don't know yet
- // whether LR is going to be used. Probably the right approach is to
- // generate the tail call here and turn it back into CALL/RET in
- // emitEpilogue if LR is used.
+ // For ARMv8-M, we /do/ implement tail calls. Doing this is tricky for v8-M
+ // baseline, since the LDM/POP instruction on Thumb doesn't take LR. This
+ // means if we need to reload LR, it takes extra instructions, which outweighs
+ // the value of the tail call; but here we don't know yet whether LR is going
+ // to be used. We generate the tail call here and turn it back into CALL/RET
+ // in emitEpilogue if LR is used.
// Thumb1 PIC calls to external symbols use BX, so they can be tail calls,
// but we need to make sure there are enough registers; the only valid
@@ -274,6 +276,7 @@ void ARMSubtarget::initSubtargetFeatures(StringRef CPU, StringRef FS) {
case CortexM3:
case ExynosM1:
case CortexR52:
+ case Kryo:
break;
case Krait:
PreISelOperandLatencyAdjustment = 1;
diff --git a/lib/Target/ARM/ARMSubtarget.h b/lib/Target/ARM/ARMSubtarget.h
index 8c8218d0f432..40993fc0aa8a 100644
--- a/lib/Target/ARM/ARMSubtarget.h
+++ b/lib/Target/ARM/ARMSubtarget.h
@@ -14,47 +14,93 @@
#ifndef LLVM_LIB_TARGET_ARM_ARMSUBTARGET_H
#define LLVM_LIB_TARGET_ARM_ARMSUBTARGET_H
-
+#include "ARMBaseInstrInfo.h"
+#include "ARMBaseRegisterInfo.h"
#include "ARMFrameLowering.h"
#include "ARMISelLowering.h"
-#include "ARMInstrInfo.h"
#include "ARMSelectionDAGInfo.h"
-#include "ARMSubtarget.h"
-#include "MCTargetDesc/ARMMCTargetDesc.h"
-#include "Thumb1FrameLowering.h"
-#include "Thumb1InstrInfo.h"
-#include "Thumb2InstrInfo.h"
#include "llvm/ADT/Triple.h"
#include "llvm/CodeGen/GlobalISel/GISelAccessor.h"
-#include "llvm/IR/DataLayout.h"
+#include "llvm/CodeGen/MachineFunction.h"
#include "llvm/MC/MCInstrItineraries.h"
+#include "llvm/MC/MCSchedule.h"
+#include "llvm/Target/TargetOptions.h"
#include "llvm/Target/TargetSubtargetInfo.h"
+#include <memory>
#include <string>
#define GET_SUBTARGETINFO_HEADER
#include "ARMGenSubtargetInfo.inc"
namespace llvm {
+
+class ARMBaseTargetMachine;
class GlobalValue;
class StringRef;
-class TargetOptions;
-class ARMBaseTargetMachine;
class ARMSubtarget : public ARMGenSubtargetInfo {
protected:
enum ARMProcFamilyEnum {
- Others, CortexA5, CortexA7, CortexA8, CortexA9, CortexA12, CortexA15,
- CortexA17, CortexR4, CortexR4F, CortexR5, CortexR7, CortexR52, CortexM3,
- CortexA32, CortexA35, CortexA53, CortexA57, CortexA72, CortexA73,
- Krait, Swift, ExynosM1
+ Others,
+
+ CortexA12,
+ CortexA15,
+ CortexA17,
+ CortexA32,
+ CortexA35,
+ CortexA5,
+ CortexA53,
+ CortexA57,
+ CortexA7,
+ CortexA72,
+ CortexA73,
+ CortexA8,
+ CortexA9,
+ CortexM3,
+ CortexR4,
+ CortexR4F,
+ CortexR5,
+ CortexR52,
+ CortexR7,
+ ExynosM1,
+ Krait,
+ Kryo,
+ Swift
};
enum ARMProcClassEnum {
- None, AClass, RClass, MClass
+ None,
+
+ AClass,
+ MClass,
+ RClass
};
enum ARMArchEnum {
- ARMv2, ARMv2a, ARMv3, ARMv3m, ARMv4, ARMv4t, ARMv5, ARMv5t, ARMv5te,
- ARMv5tej, ARMv6, ARMv6k, ARMv6kz, ARMv6t2, ARMv6m, ARMv6sm, ARMv7a, ARMv7r,
- ARMv7m, ARMv7em, ARMv8a, ARMv81a, ARMv82a, ARMv8mMainline, ARMv8mBaseline,
+ ARMv2,
+ ARMv2a,
+ ARMv3,
+ ARMv3m,
+ ARMv4,
+ ARMv4t,
+ ARMv5,
+ ARMv5t,
+ ARMv5te,
+ ARMv5tej,
+ ARMv6,
+ ARMv6k,
+ ARMv6kz,
+ ARMv6m,
+ ARMv6sm,
+ ARMv6t2,
+ ARMv7a,
+ ARMv7em,
+ ARMv7m,
+ ARMv7r,
+ ARMv7ve,
+ ARMv81a,
+ ARMv82a,
+ ARMv8a,
+ ARMv8mBaseline,
+ ARMv8mMainline,
ARMv8r
};
@@ -168,10 +214,6 @@ protected:
/// HasHardwareDivideInARM - True if subtarget supports [su]div in ARM mode
bool HasHardwareDivideInARM = false;
- /// HasT2ExtractPack - True if subtarget supports thumb2 extract/pack
- /// instructions.
- bool HasT2ExtractPack = false;
-
/// HasDataBarrier - True if the subtarget supports DMB / DSB data barrier
/// instructions.
bool HasDataBarrier = false;
@@ -310,6 +352,10 @@ protected:
/// UseSjLjEH - If true, the target uses SjLj exception handling (e.g. iOS).
bool UseSjLjEH = false;
+ /// Implicitly convert an instruction to a different one if its immediates
+ /// cannot be encoded. For example, ADD r0, r1, #0xFFFFFFFF -> SUB r0, r1, #1.
+ bool NegativeImmediates = true;
+
/// stackAlignment - The minimum alignment known to hold of the stack frame on
/// entry to the function and which must be maintained by every function.
unsigned stackAlignment = 4;
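
A minimal sketch of the conversion that NegativeImmediates enables, with a hypothetical isEncodableImm standing in for the target's real modified-immediate check:

    #include <cstdint>

    // Stand-in for the target's real immediate-encoding predicate.
    static bool isEncodableImm(uint32_t Imm) { return Imm <= 255; }

    enum Opc { ADDri, SUBri };

    // If Imm has no valid encoding but -Imm does, flip ADD<->SUB and negate:
    // ADD r0, r1, #0xFFFFFFFF becomes SUB r0, r1, #1.
    static bool tryNegatedImm(Opc &Op, uint32_t &Imm) {
      if (isEncodableImm(Imm) || !isEncodableImm(-Imm))
        return false;
      Op = (Op == ADDri) ? SUBri : ADDri;
      Imm = -Imm;
      return true;
    }
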
@@ -362,6 +408,7 @@ public:
unsigned getMaxInlineSizeThreshold() const {
return 64;
}
+
/// ParseSubtargetFeatures - Parses features string setting specified
/// subtarget options. Definition of function is auto generated by tblgen.
void ParseSubtargetFeatures(StringRef CPU, StringRef FS);
@@ -373,15 +420,19 @@ public:
const ARMSelectionDAGInfo *getSelectionDAGInfo() const override {
return &TSInfo;
}
+
const ARMBaseInstrInfo *getInstrInfo() const override {
return InstrInfo.get();
}
+
const ARMTargetLowering *getTargetLowering() const override {
return &TLInfo;
}
+
const ARMFrameLowering *getFrameLowering() const override {
return FrameLowering.get();
}
+
const ARMBaseRegisterInfo *getRegisterInfo() const override {
return &InstrInfo->getRegisterInfo();
}
@@ -451,19 +502,21 @@ public:
bool hasCRC() const { return HasCRC; }
bool hasRAS() const { return HasRAS; }
bool hasVirtualization() const { return HasVirtualization; }
+
bool useNEONForSinglePrecisionFP() const {
return hasNEON() && UseNEONForSinglePrecisionFP;
}
bool hasDivide() const { return HasHardwareDivide; }
bool hasDivideInARMMode() const { return HasHardwareDivideInARM; }
- bool hasT2ExtractPack() const { return HasT2ExtractPack; }
bool hasDataBarrier() const { return HasDataBarrier; }
bool hasV7Clrex() const { return HasV7Clrex; }
bool hasAcquireRelease() const { return HasAcquireRelease; }
+
bool hasAnyDataBarrier() const {
return HasDataBarrier || (hasV6Ops() && !isThumb());
}
+
bool useMulOps() const { return UseMulOps; }
bool useFPVMLx() const { return !SlowFPVMLx; }
bool hasVMLxForwarding() const { return HasVMLxForwarding; }
@@ -561,9 +614,10 @@ public:
TargetTriple.getEnvironment() == Triple::EABIHF ||
isTargetWindows() || isAAPCS16_ABI();
}
+
bool isTargetAndroid() const { return TargetTriple.isAndroid(); }
- virtual bool isXRaySupported() const override;
+ bool isXRaySupported() const override;
bool isAPCS_ABI() const;
bool isAAPCS_ABI() const;
@@ -588,6 +642,7 @@ public:
bool useR7AsFramePointer() const {
return isTargetDarwin() || (!isTargetWindows() && isThumb());
}
+
/// Returns true if the frame setup is split into two separate pushes (first
/// r0-r7,lr then r8-r11), principally so that the frame pointer is adjacent
/// to lr. This is always required on Thumb1-only targets, as the push and
@@ -656,6 +711,7 @@ public:
/// True if fast-isel is used.
bool useFastISel() const;
};
-} // End llvm namespace
-#endif // ARMSUBTARGET_H
+} // end namespace llvm
+
+#endif // LLVM_LIB_TARGET_ARM_ARMSUBTARGET_H
diff --git a/lib/Target/ARM/ARMTargetMachine.cpp b/lib/Target/ARM/ARMTargetMachine.cpp
index 70c9567d99f8..b8dadb331ecf 100644
--- a/lib/Target/ARM/ARMTargetMachine.cpp
+++ b/lib/Target/ARM/ARMTargetMachine.cpp
@@ -10,30 +10,50 @@
//
//===----------------------------------------------------------------------===//
-#include "ARMTargetMachine.h"
#include "ARM.h"
#include "ARMCallLowering.h"
-#include "ARMFrameLowering.h"
#include "ARMInstructionSelector.h"
#include "ARMLegalizerInfo.h"
#include "ARMRegisterBankInfo.h"
+#include "ARMSubtarget.h"
+#include "ARMTargetMachine.h"
#include "ARMTargetObjectFile.h"
#include "ARMTargetTransformInfo.h"
+#include "MCTargetDesc/ARMMCTargetDesc.h"
+#include "llvm/ADT/Optional.h"
+#include "llvm/ADT/STLExtras.h"
+#include "llvm/ADT/StringRef.h"
+#include "llvm/ADT/Triple.h"
+#include "llvm/Analysis/TargetTransformInfo.h"
+#include "llvm/CodeGen/ExecutionDepsFix.h"
+#include "llvm/CodeGen/GlobalISel/CallLowering.h"
+#include "llvm/CodeGen/GlobalISel/GISelAccessor.h"
#include "llvm/CodeGen/GlobalISel/IRTranslator.h"
#include "llvm/CodeGen/GlobalISel/InstructionSelect.h"
+#include "llvm/CodeGen/GlobalISel/InstructionSelector.h"
#include "llvm/CodeGen/GlobalISel/Legalizer.h"
+#include "llvm/CodeGen/GlobalISel/LegalizerInfo.h"
#include "llvm/CodeGen/GlobalISel/RegBankSelect.h"
+#include "llvm/CodeGen/GlobalISel/RegisterBankInfo.h"
+#include "llvm/CodeGen/MachineFunction.h"
#include "llvm/CodeGen/Passes.h"
#include "llvm/CodeGen/TargetPassConfig.h"
+#include "llvm/IR/Attributes.h"
+#include "llvm/IR/DataLayout.h"
#include "llvm/IR/Function.h"
-#include "llvm/IR/LegacyPassManager.h"
-#include "llvm/MC/MCAsmInfo.h"
+#include "llvm/Pass.h"
+#include "llvm/Support/CodeGen.h"
#include "llvm/Support/CommandLine.h"
-#include "llvm/Support/FormattedStream.h"
+#include "llvm/Support/ErrorHandling.h"
#include "llvm/Support/TargetParser.h"
#include "llvm/Support/TargetRegistry.h"
+#include "llvm/Target/TargetLoweringObjectFile.h"
#include "llvm/Target/TargetOptions.h"
#include "llvm/Transforms/Scalar.h"
+#include <cassert>
+#include <memory>
+#include <string>
+
using namespace llvm;
static cl::opt<bool>
@@ -57,6 +77,10 @@ static cl::opt<cl::boolOrDefault>
EnableGlobalMerge("arm-global-merge", cl::Hidden,
cl::desc("Enable the global merge pass"));
+namespace llvm {
+ void initializeARMExecutionDepsFixPass(PassRegistry&);
+}
+
extern "C" void LLVMInitializeARMTarget() {
// Register the target.
RegisterTargetMachine<ARMLETargetMachine> X(getTheARMLETarget());
@@ -68,14 +92,16 @@ extern "C" void LLVMInitializeARMTarget() {
initializeGlobalISel(Registry);
initializeARMLoadStoreOptPass(Registry);
initializeARMPreAllocLoadStoreOptPass(Registry);
+ initializeARMConstantIslandsPass(Registry);
+ initializeARMExecutionDepsFixPass(Registry);
}
static std::unique_ptr<TargetLoweringObjectFile> createTLOF(const Triple &TT) {
if (TT.isOSBinFormatMachO())
- return make_unique<TargetLoweringObjectFileMachO>();
+ return llvm::make_unique<TargetLoweringObjectFileMachO>();
if (TT.isOSWindows())
- return make_unique<TargetLoweringObjectFileCOFF>();
- return make_unique<ARMElfTargetObjectFile>();
+ return llvm::make_unique<TargetLoweringObjectFileCOFF>();
+ return llvm::make_unique<ARMElfTargetObjectFile>();
}
static ARMBaseTargetMachine::ARMABI
@@ -94,13 +120,13 @@ computeTargetABI(const Triple &TT, StringRef CPU,
ARMBaseTargetMachine::ARMABI TargetABI =
ARMBaseTargetMachine::ARM_ABI_UNKNOWN;
- unsigned ArchKind = llvm::ARM::parseCPUArch(CPU);
- StringRef ArchName = llvm::ARM::getArchName(ArchKind);
+ unsigned ArchKind = ARM::parseCPUArch(CPU);
+ StringRef ArchName = ARM::getArchName(ArchKind);
// FIXME: This is duplicated code from the front end and should be unified.
if (TT.isOSBinFormatMachO()) {
- if (TT.getEnvironment() == llvm::Triple::EABI ||
- (TT.getOS() == llvm::Triple::UnknownOS && TT.isOSBinFormatMachO()) ||
- llvm::ARM::parseArchProfile(ArchName) == llvm::ARM::PK_M) {
+ if (TT.getEnvironment() == Triple::EABI ||
+ (TT.getOS() == Triple::UnknownOS && TT.isOSBinFormatMachO()) ||
+ ARM::parseArchProfile(ArchName) == ARM::PK_M) {
TargetABI = ARMBaseTargetMachine::ARM_ABI_AAPCS;
} else if (TT.isWatchABI()) {
TargetABI = ARMBaseTargetMachine::ARM_ABI_AAPCS16;
@@ -113,16 +139,16 @@ computeTargetABI(const Triple &TT, StringRef CPU,
} else {
// Select the default based on the platform.
switch (TT.getEnvironment()) {
- case llvm::Triple::Android:
- case llvm::Triple::GNUEABI:
- case llvm::Triple::GNUEABIHF:
- case llvm::Triple::MuslEABI:
- case llvm::Triple::MuslEABIHF:
- case llvm::Triple::EABIHF:
- case llvm::Triple::EABI:
+ case Triple::Android:
+ case Triple::GNUEABI:
+ case Triple::GNUEABIHF:
+ case Triple::MuslEABI:
+ case Triple::MuslEABIHF:
+ case Triple::EABIHF:
+ case Triple::EABI:
TargetABI = ARMBaseTargetMachine::ARM_ABI_AAPCS;
break;
- case llvm::Triple::GNU:
+ case Triple::GNU:
TargetABI = ARMBaseTargetMachine::ARM_ABI_APCS;
break;
default:
@@ -141,7 +167,7 @@ static std::string computeDataLayout(const Triple &TT, StringRef CPU,
const TargetOptions &Options,
bool isLittle) {
auto ABI = computeTargetABI(TT, CPU, Options);
- std::string Ret = "";
+ std::string Ret;
if (isLittle)
// Little endian.
@@ -238,29 +264,35 @@ ARMBaseTargetMachine::ARMBaseTargetMachine(const Target &T, const Triple &TT,
}
}
-ARMBaseTargetMachine::~ARMBaseTargetMachine() {}
+ARMBaseTargetMachine::~ARMBaseTargetMachine() = default;
#ifdef LLVM_BUILD_GLOBAL_ISEL
namespace {
+
struct ARMGISelActualAccessor : public GISelAccessor {
std::unique_ptr<CallLowering> CallLoweringInfo;
std::unique_ptr<InstructionSelector> InstSelector;
std::unique_ptr<LegalizerInfo> Legalizer;
std::unique_ptr<RegisterBankInfo> RegBankInfo;
+
const CallLowering *getCallLowering() const override {
return CallLoweringInfo.get();
}
+
const InstructionSelector *getInstructionSelector() const override {
return InstSelector.get();
}
+
const LegalizerInfo *getLegalizerInfo() const override {
return Legalizer.get();
}
+
const RegisterBankInfo *getRegBankInfo() const override {
return RegBankInfo.get();
}
};
-} // End anonymous namespace.
+
+} // end anonymous namespace
#endif
const ARMSubtarget *
@@ -300,7 +332,7 @@ ARMBaseTargetMachine::getSubtargetImpl(const Function &F) const {
#else
ARMGISelActualAccessor *GISel = new ARMGISelActualAccessor();
GISel->CallLoweringInfo.reset(new ARMCallLowering(*I->getTargetLowering()));
- GISel->Legalizer.reset(new ARMLegalizerInfo());
+ GISel->Legalizer.reset(new ARMLegalizerInfo(*I));
auto *RBI = new ARMRegisterBankInfo(*I->getRegisterInfo());
@@ -390,6 +422,7 @@ ThumbBETargetMachine::ThumbBETargetMachine(const Target &T, const Triple &TT,
: ThumbTargetMachine(T, TT, CPU, FS, Options, RM, CM, OL, false) {}
namespace {
+
/// ARM Code Generator Pass Configuration Options.
class ARMPassConfig : public TargetPassConfig {
public:
@@ -413,7 +446,21 @@ public:
void addPreSched2() override;
void addPreEmitPass() override;
};
-} // namespace
+
+class ARMExecutionDepsFix : public ExecutionDepsFix {
+public:
+ static char ID;
+ ARMExecutionDepsFix() : ExecutionDepsFix(ID, ARM::DPRRegClass) {}
+ StringRef getPassName() const override {
+ return "ARM Execution Dependency Fix";
+ }
+};
+char ARMExecutionDepsFix::ID;
+
+} // end anonymous namespace
+
+INITIALIZE_PASS(ARMExecutionDepsFix, "arm-execution-deps-fix",
+ "ARM Execution Dependency Fix", false, false)
TargetPassConfig *ARMBaseTargetMachine::createPassConfig(PassManagerBase &PM) {
return new ARMPassConfig(this, PM);
@@ -508,7 +555,7 @@ void ARMPassConfig::addPreSched2() {
if (EnableARMLoadStoreOpt)
addPass(createARMLoadStoreOptimizationPass());
- addPass(createExecutionDependencyFixPass(&ARM::DPRRegClass));
+ addPass(new ARMExecutionDepsFix());
}
// Expand some pseudo instructions into multiple instructions to allow
diff --git a/lib/Target/ARM/ARMTargetMachine.h b/lib/Target/ARM/ARMTargetMachine.h
index c6b70b953162..f0ca9427d9fb 100644
--- a/lib/Target/ARM/ARMTargetMachine.h
+++ b/lib/Target/ARM/ARMTargetMachine.h
@@ -14,10 +14,14 @@
#ifndef LLVM_LIB_TARGET_ARM_ARMTARGETMACHINE_H
#define LLVM_LIB_TARGET_ARM_ARMTARGETMACHINE_H
-#include "ARMInstrInfo.h"
#include "ARMSubtarget.h"
-#include "llvm/IR/DataLayout.h"
+#include "llvm/ADT/Optional.h"
+#include "llvm/ADT/StringMap.h"
+#include "llvm/ADT/StringRef.h"
+#include "llvm/Analysis/TargetTransformInfo.h"
+#include "llvm/Support/CodeGen.h"
#include "llvm/Target/TargetMachine.h"
+#include <memory>
namespace llvm {
@@ -32,7 +36,7 @@ public:
protected:
std::unique_ptr<TargetLoweringObjectFile> TLOF;
- ARMSubtarget Subtarget;
+ ARMSubtarget Subtarget;
bool isLittle;
mutable StringMap<std::unique_ptr<ARMSubtarget>> SubtargetMap;
@@ -62,7 +66,8 @@ public:
///
class ARMTargetMachine : public ARMBaseTargetMachine {
virtual void anchor();
- public:
+
+public:
ARMTargetMachine(const Target &T, const Triple &TT, StringRef CPU,
StringRef FS, const TargetOptions &Options,
Optional<Reloc::Model> RM, CodeModel::Model CM,
@@ -73,6 +78,7 @@ class ARMTargetMachine : public ARMBaseTargetMachine {
///
class ARMLETargetMachine : public ARMTargetMachine {
void anchor() override;
+
public:
ARMLETargetMachine(const Target &T, const Triple &TT, StringRef CPU,
StringRef FS, const TargetOptions &Options,
@@ -84,6 +90,7 @@ public:
///
class ARMBETargetMachine : public ARMTargetMachine {
void anchor() override;
+
public:
ARMBETargetMachine(const Target &T, const Triple &TT, StringRef CPU,
StringRef FS, const TargetOptions &Options,
@@ -97,6 +104,7 @@ public:
///
class ThumbTargetMachine : public ARMBaseTargetMachine {
virtual void anchor();
+
public:
ThumbTargetMachine(const Target &T, const Triple &TT, StringRef CPU,
StringRef FS, const TargetOptions &Options,
@@ -108,6 +116,7 @@ public:
///
class ThumbLETargetMachine : public ThumbTargetMachine {
void anchor() override;
+
public:
ThumbLETargetMachine(const Target &T, const Triple &TT, StringRef CPU,
StringRef FS, const TargetOptions &Options,
@@ -119,6 +128,7 @@ public:
///
class ThumbBETargetMachine : public ThumbTargetMachine {
void anchor() override;
+
public:
ThumbBETargetMachine(const Target &T, const Triple &TT, StringRef CPU,
StringRef FS, const TargetOptions &Options,
@@ -128,4 +138,4 @@ public:
} // end namespace llvm
-#endif
+#endif // LLVM_LIB_TARGET_ARM_ARMTARGETMACHINE_H
diff --git a/lib/Target/ARM/ARMTargetObjectFile.cpp b/lib/Target/ARM/ARMTargetObjectFile.cpp
index 625c4280e1a6..94f9e8dfebbf 100644
--- a/lib/Target/ARM/ARMTargetObjectFile.cpp
+++ b/lib/Target/ARM/ARMTargetObjectFile.cpp
@@ -7,17 +7,20 @@
//
//===----------------------------------------------------------------------===//
-#include "ARMTargetObjectFile.h"
+#include "ARMSubtarget.h"
#include "ARMTargetMachine.h"
-#include "llvm/ADT/StringExtras.h"
-#include "llvm/IR/Mangler.h"
+#include "ARMTargetObjectFile.h"
#include "llvm/MC/MCAsmInfo.h"
#include "llvm/MC/MCContext.h"
#include "llvm/MC/MCExpr.h"
#include "llvm/MC/MCSectionELF.h"
+#include "llvm/MC/MCTargetOptions.h"
+#include "llvm/MC/SectionKind.h"
#include "llvm/Support/Dwarf.h"
#include "llvm/Support/ELF.h"
-#include "llvm/Target/TargetLowering.h"
+#include "llvm/Target/TargetMachine.h"
+#include <cassert>
+
using namespace llvm;
using namespace dwarf;
diff --git a/lib/Target/ARM/ARMTargetObjectFile.h b/lib/Target/ARM/ARMTargetObjectFile.h
index 24e755ddac27..dbb8128269dc 100644
--- a/lib/Target/ARM/ARMTargetObjectFile.h
+++ b/lib/Target/ARM/ARMTargetObjectFile.h
@@ -11,19 +11,19 @@
#define LLVM_LIB_TARGET_ARM_ARMTARGETOBJECTFILE_H
#include "llvm/CodeGen/TargetLoweringObjectFileImpl.h"
+#include "llvm/MC/MCExpr.h"
namespace llvm {
-class MCContext;
-class TargetMachine;
-
class ARMElfTargetObjectFile : public TargetLoweringObjectFileELF {
mutable bool genExecuteOnly = false;
+
protected:
- const MCSection *AttributesSection;
+ const MCSection *AttributesSection = nullptr;
+
public:
ARMElfTargetObjectFile()
- : TargetLoweringObjectFileELF(), AttributesSection(nullptr) {
+ : TargetLoweringObjectFileELF() {
PLTRelativeVariantKind = MCSymbolRefExpr::VK_ARM_PREL31;
}
@@ -47,4 +47,4 @@ public:
} // end namespace llvm
-#endif
+#endif // LLVM_LIB_TARGET_ARM_ARMTARGETOBJECTFILE_H
diff --git a/lib/Target/ARM/ARMTargetTransformInfo.cpp b/lib/Target/ARM/ARMTargetTransformInfo.cpp
index 2b6b36bc3e68..8eb9dbf5f9de 100644
--- a/lib/Target/ARM/ARMTargetTransformInfo.cpp
+++ b/lib/Target/ARM/ARMTargetTransformInfo.cpp
@@ -92,7 +92,8 @@ int ARMTTIImpl::getIntImmCost(unsigned Opcode, unsigned Idx, const APInt &Imm,
}
-int ARMTTIImpl::getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src) {
+int ARMTTIImpl::getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src,
+ const Instruction *I) {
int ISD = TLI->InstructionOpcodeToISD(Opcode);
assert(ISD && "Invalid opcode");
@@ -310,7 +311,8 @@ int ARMTTIImpl::getVectorInstrCost(unsigned Opcode, Type *ValTy,
return BaseT::getVectorInstrCost(Opcode, ValTy, Index);
}
-int ARMTTIImpl::getCmpSelInstrCost(unsigned Opcode, Type *ValTy, Type *CondTy) {
+int ARMTTIImpl::getCmpSelInstrCost(unsigned Opcode, Type *ValTy, Type *CondTy,
+ const Instruction *I) {
int ISD = TLI->InstructionOpcodeToISD(Opcode);
// On NEON, a vector select gets lowered to vbsl.
@@ -335,7 +337,7 @@ int ARMTTIImpl::getCmpSelInstrCost(unsigned Opcode, Type *ValTy, Type *CondTy) {
return LT.first;
}
- return BaseT::getCmpSelInstrCost(Opcode, ValTy, CondTy);
+ return BaseT::getCmpSelInstrCost(Opcode, ValTy, CondTy, I);
}
int ARMTTIImpl::getAddressComputationCost(Type *Ty, ScalarEvolution *SE,
@@ -504,7 +506,7 @@ int ARMTTIImpl::getArithmeticInstrCost(
}
int ARMTTIImpl::getMemoryOpCost(unsigned Opcode, Type *Src, unsigned Alignment,
- unsigned AddressSpace) {
+ unsigned AddressSpace, const Instruction *I) {
std::pair<int, MVT> LT = TLI->getTypeLegalizationCost(DL, Src);
if (Src->isVectorTy() && Alignment != 16 &&
@@ -529,12 +531,14 @@ int ARMTTIImpl::getInterleavedMemoryOpCost(unsigned Opcode, Type *VecTy,
if (Factor <= TLI->getMaxSupportedInterleaveFactor() && !EltIs64Bits) {
unsigned NumElts = VecTy->getVectorNumElements();
- Type *SubVecTy = VectorType::get(VecTy->getScalarType(), NumElts / Factor);
- unsigned SubVecSize = DL.getTypeSizeInBits(SubVecTy);
+ auto *SubVecTy = VectorType::get(VecTy->getScalarType(), NumElts / Factor);
// vldN/vstN only support legal vector types of size 64 or 128 in bits.
- if (NumElts % Factor == 0 && (SubVecSize == 64 || SubVecSize == 128))
- return Factor;
+ // Accesses having vector types that are a multiple of 128 bits can be
+ // matched to more than one vldN/vstN instruction.
+ if (NumElts % Factor == 0 &&
+ TLI->isLegalInterleavedAccessType(SubVecTy, DL))
+ return Factor * TLI->getNumInterleavedAccesses(SubVecTy, DL);
}
return BaseT::getInterleavedMemoryOpCost(Opcode, VecTy, Factor, Indices,
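
A worked example of the new interleaved-access costing, with a hypothetical helper mirroring what getNumInterleavedAccesses is assumed to compute: sub-vector types that are a multiple of 128 bits are now costed as several vldN/vstN operations instead of falling back to the generic (scalarized) cost.

    // Hypothetical mirror of the lowering rule: legal vldN/vstN types are
    // 64 or 128 bits wide; larger multiples of 128 bits split into several
    // accesses.
    unsigned numInterleavedAccesses(unsigned SubVecBits) {
      return SubVecBits <= 128 ? 1 : SubVecBits / 128;
    }
    // e.g. Factor = 2 on a <32 x i8> access: 128-bit sub-vectors, cost 2 * 1.
    //      Factor = 2 on a <64 x i8> access: 256-bit sub-vectors, cost 2 * 2.
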
diff --git a/lib/Target/ARM/ARMTargetTransformInfo.h b/lib/Target/ARM/ARMTargetTransformInfo.h
index 3c83cd92a61a..7de0543dfa5e 100644
--- a/lib/Target/ARM/ARMTargetTransformInfo.h
+++ b/lib/Target/ARM/ARMTargetTransformInfo.h
@@ -33,10 +33,6 @@ class ARMTTIImpl : public BasicTTIImplBase<ARMTTIImpl> {
const ARMSubtarget *ST;
const ARMTargetLowering *TLI;
- /// Estimate the overhead of scalarizing an instruction. Insert and Extract
- /// are set if the result needs to be inserted and/or extracted from vectors.
- unsigned getScalarizationOverhead(Type *Ty, bool Insert, bool Extract);
-
const ARMSubtarget *getST() const { return ST; }
const ARMTargetLowering *getTLI() const { return TLI; }
@@ -98,9 +94,11 @@ public:
int getShuffleCost(TTI::ShuffleKind Kind, Type *Tp, int Index, Type *SubTp);
- int getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src);
+ int getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src,
+ const Instruction *I = nullptr);
- int getCmpSelInstrCost(unsigned Opcode, Type *ValTy, Type *CondTy);
+ int getCmpSelInstrCost(unsigned Opcode, Type *ValTy, Type *CondTy,
+ const Instruction *I = nullptr);
int getVectorInstrCost(unsigned Opcode, Type *Val, unsigned Index);
@@ -118,7 +116,7 @@ public:
ArrayRef<const Value *> Args = ArrayRef<const Value *>());
int getMemoryOpCost(unsigned Opcode, Type *Src, unsigned Alignment,
- unsigned AddressSpace);
+ unsigned AddressSpace, const Instruction *I = nullptr);
int getInterleavedMemoryOpCost(unsigned Opcode, Type *VecTy, unsigned Factor,
ArrayRef<unsigned> Indices, unsigned Alignment,
diff --git a/lib/Target/ARM/AsmParser/ARMAsmParser.cpp b/lib/Target/ARM/AsmParser/ARMAsmParser.cpp
index c243a2d35979..f421d3ac1693 100644
--- a/lib/Target/ARM/AsmParser/ARMAsmParser.cpp
+++ b/lib/Target/ARM/AsmParser/ARMAsmParser.cpp
@@ -915,40 +915,37 @@ public:
int Val = ARM_AM::getFP32Imm(APInt(32, CE->getValue()));
return Val != -1;
}
- bool isFBits16() const {
+
+ template<int64_t N, int64_t M>
+ bool isImmediate() const {
if (!isImm()) return false;
const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(getImm());
if (!CE) return false;
int64_t Value = CE->getValue();
- return Value >= 0 && Value <= 16;
+ return Value >= N && Value <= M;
}
- bool isFBits32() const {
+ template<int64_t N, int64_t M>
+ bool isImmediateS4() const {
if (!isImm()) return false;
const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(getImm());
if (!CE) return false;
int64_t Value = CE->getValue();
- return Value >= 1 && Value <= 32;
+ return ((Value & 3) == 0) && Value >= N && Value <= M;
+ }
+ bool isFBits16() const {
+ return isImmediate<0, 16>();
+ }
+ bool isFBits32() const {
+ return isImmediate<1, 32>();
}
bool isImm8s4() const {
- if (!isImm()) return false;
- const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(getImm());
- if (!CE) return false;
- int64_t Value = CE->getValue();
- return ((Value & 3) == 0) && Value >= -1020 && Value <= 1020;
+ return isImmediateS4<-1020, 1020>();
}
bool isImm0_1020s4() const {
- if (!isImm()) return false;
- const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(getImm());
- if (!CE) return false;
- int64_t Value = CE->getValue();
- return ((Value & 3) == 0) && Value >= 0 && Value <= 1020;
+ return isImmediateS4<0, 1020>();
}
bool isImm0_508s4() const {
- if (!isImm()) return false;
- const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(getImm());
- if (!CE) return false;
- int64_t Value = CE->getValue();
- return ((Value & 3) == 0) && Value >= 0 && Value <= 508;
+ return isImmediateS4<0, 508>();
}
bool isImm0_508s4Neg() const {
if (!isImm()) return false;
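
Since many of the ad-hoc range checks above collapse into these two templates, it is worth being explicit that both template bounds are inclusive; a self-contained sketch with plain integers in place of the MCExpr operand:

    #include <cstdint>

    template <int64_t N, int64_t M>
    constexpr bool isImmediate(int64_t Value) {
      return Value >= N && Value <= M; // both bounds inclusive
    }

    template <int64_t N, int64_t M>
    constexpr bool isImmediateS4(int64_t Value) {
      return (Value & 3) == 0 && Value >= N && Value <= M; // also a multiple of 4
    }

    static_assert(isImmediate<0, 16>(16), "upper bound is inclusive");
    static_assert(!isImmediate<0, 16>(17), "<0, 16> matches the old '<= 16'");
    static_assert(isImmediateS4<0, 508>(508) && !isImmediateS4<0, 508>(2),
                  "in range and a multiple of 4");

This is why the surrounding call sites are written with inclusive upper bounds (e.g. isFBits16 uses isImmediate<0, 16>).
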
@@ -958,27 +955,6 @@ public:
// explicitly exclude zero. we want that to use the normal 0_508 version.
return ((Value & 3) == 0) && Value > 0 && Value <= 508;
}
- bool isImm0_239() const {
- if (!isImm()) return false;
- const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(getImm());
- if (!CE) return false;
- int64_t Value = CE->getValue();
- return Value >= 0 && Value < 240;
- }
- bool isImm0_255() const {
- if (!isImm()) return false;
- const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(getImm());
- if (!CE) return false;
- int64_t Value = CE->getValue();
- return Value >= 0 && Value < 256;
- }
- bool isImm0_4095() const {
- if (!isImm()) return false;
- const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(getImm());
- if (!CE) return false;
- int64_t Value = CE->getValue();
- return Value >= 0 && Value < 4096;
- }
bool isImm0_4095Neg() const {
if (!isImm()) return false;
const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(getImm());
@@ -986,145 +962,17 @@ public:
int64_t Value = -CE->getValue();
return Value > 0 && Value < 4096;
}
- bool isImm0_1() const {
- if (!isImm()) return false;
- const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(getImm());
- if (!CE) return false;
- int64_t Value = CE->getValue();
- return Value >= 0 && Value < 2;
- }
- bool isImm0_3() const {
- if (!isImm()) return false;
- const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(getImm());
- if (!CE) return false;
- int64_t Value = CE->getValue();
- return Value >= 0 && Value < 4;
- }
bool isImm0_7() const {
- if (!isImm()) return false;
- const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(getImm());
- if (!CE) return false;
- int64_t Value = CE->getValue();
- return Value >= 0 && Value < 8;
- }
- bool isImm0_15() const {
- if (!isImm()) return false;
- const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(getImm());
- if (!CE) return false;
- int64_t Value = CE->getValue();
- return Value >= 0 && Value < 16;
- }
- bool isImm0_31() const {
- if (!isImm()) return false;
- const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(getImm());
- if (!CE) return false;
- int64_t Value = CE->getValue();
- return Value >= 0 && Value < 32;
- }
- bool isImm0_63() const {
- if (!isImm()) return false;
- const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(getImm());
- if (!CE) return false;
- int64_t Value = CE->getValue();
- return Value >= 0 && Value < 64;
- }
- bool isImm8() const {
- if (!isImm()) return false;
- const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(getImm());
- if (!CE) return false;
- int64_t Value = CE->getValue();
- return Value == 8;
- }
- bool isImm16() const {
- if (!isImm()) return false;
- const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(getImm());
- if (!CE) return false;
- int64_t Value = CE->getValue();
- return Value == 16;
- }
- bool isImm32() const {
- if (!isImm()) return false;
- const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(getImm());
- if (!CE) return false;
- int64_t Value = CE->getValue();
- return Value == 32;
- }
- bool isShrImm8() const {
- if (!isImm()) return false;
- const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(getImm());
- if (!CE) return false;
- int64_t Value = CE->getValue();
- return Value > 0 && Value <= 8;
- }
- bool isShrImm16() const {
- if (!isImm()) return false;
- const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(getImm());
- if (!CE) return false;
- int64_t Value = CE->getValue();
- return Value > 0 && Value <= 16;
- }
- bool isShrImm32() const {
- if (!isImm()) return false;
- const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(getImm());
- if (!CE) return false;
- int64_t Value = CE->getValue();
- return Value > 0 && Value <= 32;
- }
- bool isShrImm64() const {
- if (!isImm()) return false;
- const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(getImm());
- if (!CE) return false;
- int64_t Value = CE->getValue();
- return Value > 0 && Value <= 64;
- }
- bool isImm1_7() const {
- if (!isImm()) return false;
- const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(getImm());
- if (!CE) return false;
- int64_t Value = CE->getValue();
- return Value > 0 && Value < 8;
- }
- bool isImm1_15() const {
- if (!isImm()) return false;
- const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(getImm());
- if (!CE) return false;
- int64_t Value = CE->getValue();
- return Value > 0 && Value < 16;
- }
- bool isImm1_31() const {
- if (!isImm()) return false;
- const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(getImm());
- if (!CE) return false;
- int64_t Value = CE->getValue();
- return Value > 0 && Value < 32;
+ return isImmediate<0, 7>();
}
bool isImm1_16() const {
- if (!isImm()) return false;
- const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(getImm());
- if (!CE) return false;
- int64_t Value = CE->getValue();
- return Value > 0 && Value < 17;
+ return isImmediate<1, 16>();
}
bool isImm1_32() const {
- if (!isImm()) return false;
- const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(getImm());
- if (!CE) return false;
- int64_t Value = CE->getValue();
- return Value > 0 && Value < 33;
- }
- bool isImm0_32() const {
- if (!isImm()) return false;
- const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(getImm());
- if (!CE) return false;
- int64_t Value = CE->getValue();
- return Value >= 0 && Value < 33;
+ return isImmediate<1, 32>();
}
- bool isImm0_65535() const {
- if (!isImm()) return false;
- const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(getImm());
- if (!CE) return false;
- int64_t Value = CE->getValue();
- return Value >= 0 && Value < 65536;
+ bool isImm8_255() const {
+ return isImmediate<8, 255>();
}
bool isImm256_65535Expr() const {
if (!isImm()) return false;
@@ -1145,32 +993,16 @@ public:
return Value >= 0 && Value < 65536;
}
bool isImm24bit() const {
- if (!isImm()) return false;
- const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(getImm());
- if (!CE) return false;
- int64_t Value = CE->getValue();
- return Value >= 0 && Value <= 0xffffff;
+ return isImmediate<0, 0xffffff + 1>();
}
bool isImmThumbSR() const {
- if (!isImm()) return false;
- const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(getImm());
- if (!CE) return false;
- int64_t Value = CE->getValue();
- return Value > 0 && Value < 33;
+ return isImmediate<1, 33>();
}
bool isPKHLSLImm() const {
- if (!isImm()) return false;
- const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(getImm());
- if (!CE) return false;
- int64_t Value = CE->getValue();
- return Value >= 0 && Value < 32;
+ return isImmediate<0, 32>();
}
bool isPKHASRImm() const {
- if (!isImm()) return false;
- const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(getImm());
- if (!CE) return false;
- int64_t Value = CE->getValue();
- return Value > 0 && Value <= 32;
+ return isImmediate<0, 33>();
}
bool isAdrLabel() const {
// If we have an immediate that's not a constant, treat it as a label
@@ -1245,6 +1077,20 @@ public:
return ARM_AM::getSOImmVal(Value) == -1 &&
ARM_AM::getSOImmVal(-Value) != -1;
}
+ bool isThumbModImmNeg1_7() const {
+ if (!isImm()) return false;
+ const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(getImm());
+ if (!CE) return false;
+ int32_t Value = -(int32_t)CE->getValue();
+ return 0 < Value && Value < 8;
+ }
+ bool isThumbModImmNeg8_255() const {
+ if (!isImm()) return false;
+ const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(getImm());
+ if (!CE) return false;
+ int32_t Value = -(int32_t)CE->getValue();
+ return 7 < Value && Value < 256;
+ }
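+
+  // For illustration: both predicates accept an immediate whose *negation*
+  // lands in the named range, so an operand such as "#-3" matches
+  // isThumbModImmNeg1_7 and is later emitted as the positive value 3 by the
+  // addThumbModImmNeg*Operands methods further down, e.g.:
+  //
+  //   int32_t Negated = -(int32_t)Imm;               // Imm = -3  -> 3
+  //   bool Neg1_7   = 0 < Negated && Negated < 8;    // matches -1..-7
+  //   bool Neg8_255 = 7 < Negated && Negated < 256;  // matches -8..-255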
bool isConstantPoolImm() const { return Kind == k_ConstantPoolImmediate; }
bool isBitfield() const { return Kind == k_BitfieldDescriptor; }
bool isPostIdxRegShifted() const { return Kind == k_PostIndexRegister; }
@@ -2035,6 +1881,20 @@ public:
Inst.addOperand(MCOperand::createImm(Enc));
}
+ void addThumbModImmNeg8_255Operands(MCInst &Inst, unsigned N) const {
+ assert(N == 1 && "Invalid number of operands!");
+ const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(getImm());
+ uint32_t Val = -CE->getValue();
+ Inst.addOperand(MCOperand::createImm(Val));
+ }
+
+ void addThumbModImmNeg1_7Operands(MCInst &Inst, unsigned N) const {
+ assert(N == 1 && "Invalid number of operands!");
+ const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(getImm());
+ uint32_t Val = -CE->getValue();
+ Inst.addOperand(MCOperand::createImm(Val));
+ }
+
void addBitfieldOperands(MCInst &Inst, unsigned N) const {
assert(N == 1 && "Invalid number of operands!");
// Munge the lsb/width into a bitfield mask.
@@ -2141,7 +2001,7 @@ public:
// The operand is actually a t2_so_imm, but we have its bitwise
// negation in the assembly source, so twiddle it here.
const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(getImm());
- Inst.addOperand(MCOperand::createImm(~CE->getValue()));
+ Inst.addOperand(MCOperand::createImm(~(uint32_t)CE->getValue()));
}
void addT2SOImmNegOperands(MCInst &Inst, unsigned N) const {
@@ -2149,7 +2009,7 @@ public:
// The operand is actually a t2_so_imm, but we have its
// negation in the assembly source, so twiddle it here.
const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(getImm());
- Inst.addOperand(MCOperand::createImm(-CE->getValue()));
+ Inst.addOperand(MCOperand::createImm(-(uint32_t)CE->getValue()));
}
void addImm0_4095NegOperands(MCInst &Inst, unsigned N) const {
@@ -4330,7 +4190,7 @@ ARMAsmParser::parseMSRMaskOperand(OperandVector &Operands) {
// If some specific flag is already set, it means that some letter is
  // present more than once; this is not acceptable.
- if (FlagsVal == ~0U || (FlagsVal & Flag))
+ if (Flag == ~0U || (FlagsVal & Flag))
return MatchOperand_NoMatch;
FlagsVal |= Flag;
}
@@ -5484,7 +5344,8 @@ bool ARMAsmParser::parsePrefix(ARMMCExpr::VariantKind &RefKind) {
enum {
COFF = (1 << MCObjectFileInfo::IsCOFF),
ELF = (1 << MCObjectFileInfo::IsELF),
- MACHO = (1 << MCObjectFileInfo::IsMachO)
+ MACHO = (1 << MCObjectFileInfo::IsMachO),
+ WASM = (1 << MCObjectFileInfo::IsWasm),
};
static const struct PrefixEntry {
const char *Spelling;
@@ -5518,6 +5379,9 @@ bool ARMAsmParser::parsePrefix(ARMMCExpr::VariantKind &RefKind) {
case MCObjectFileInfo::IsCOFF:
CurrentFormat = COFF;
break;
+ case MCObjectFileInfo::IsWasm:
+ CurrentFormat = WASM;
+ break;
}
if (~Prefix->SupportedFormats & CurrentFormat) {
@@ -6301,10 +6165,6 @@ bool ARMAsmParser::validatetLDMRegList(const MCInst &Inst,
else if (ListContainsPC && ListContainsLR)
return Error(Operands[ListNo + HasWritebackToken]->getStartLoc(),
"PC and LR may not be in the register list simultaneously");
- else if (inITBlock() && !lastInITBlock() && ListContainsPC)
- return Error(Operands[ListNo + HasWritebackToken]->getStartLoc(),
- "instruction must be outside of IT block or the last "
- "instruction in an IT block");
return false;
}
@@ -6366,6 +6226,12 @@ bool ARMAsmParser::validateInstruction(MCInst &Inst,
return Warning(Loc, "predicated instructions should be in IT block");
}
+ // PC-setting instructions in an IT block, but not the last instruction of
+ // the block, are UNPREDICTABLE.
+ if (inExplicitITBlock() && !lastInITBlock() && isITBlockTerminator(Inst)) {
+    return Error(Loc, "instruction must be outside of IT block or the last "
+                      "instruction in an IT block");
+ }
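+  // For example, a sequence like the following is now rejected at validation
+  // time, since the PC-writing load multiple is not the final instruction of
+  // the IT block (spelling illustrative):
+  //
+  //   itt eq
+  //   ldmeq.w sp!, {r4, pc}   @ error: instruction must be outside of IT
+  //                           @ block or the last instruction in an IT block
+  //   addeq r0, r0, #1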
+
const unsigned Opcode = Inst.getOpcode();
switch (Opcode) {
case ARM::LDRD:
@@ -6676,6 +6542,7 @@ bool ARMAsmParser::validateInstruction(MCInst &Inst,
break;
}
case ARM::MOVi16:
+ case ARM::MOVTi16:
case ARM::t2MOVi16:
case ARM::t2MOVTi16:
{
@@ -8232,7 +8099,7 @@ bool ARMAsmParser::processInstruction(MCInst &Inst,
case ARM::t2LSRri:
case ARM::t2ASRri: {
if (isARMLowRegister(Inst.getOperand(0).getReg()) &&
- Inst.getOperand(0).getReg() == Inst.getOperand(1).getReg() &&
+ isARMLowRegister(Inst.getOperand(1).getReg()) &&
Inst.getOperand(5).getReg() == (inITBlock() ? 0 : ARM::CPSR) &&
!(static_cast<ARMOperand &>(*Operands[3]).isToken() &&
static_cast<ARMOperand &>(*Operands[3]).getToken() == ".w")) {
@@ -8307,23 +8174,38 @@ bool ARMAsmParser::processInstruction(MCInst &Inst,
isNarrow = true;
MCInst TmpInst;
unsigned newOpc;
- switch(ARM_AM::getSORegShOp(Inst.getOperand(2).getImm())) {
- default: llvm_unreachable("unexpected opcode!");
- case ARM_AM::asr: newOpc = isNarrow ? ARM::tASRri : ARM::t2ASRri; break;
- case ARM_AM::lsr: newOpc = isNarrow ? ARM::tLSRri : ARM::t2LSRri; break;
- case ARM_AM::lsl: newOpc = isNarrow ? ARM::tLSLri : ARM::t2LSLri; break;
- case ARM_AM::ror: newOpc = ARM::t2RORri; isNarrow = false; break;
- case ARM_AM::rrx: isNarrow = false; newOpc = ARM::t2RRX; break;
- }
+ unsigned Shift = ARM_AM::getSORegShOp(Inst.getOperand(2).getImm());
unsigned Amount = ARM_AM::getSORegOffset(Inst.getOperand(2).getImm());
+ bool isMov = false;
+ // MOV rd, rm, LSL #0 is actually a MOV instruction
+ if (Shift == ARM_AM::lsl && Amount == 0) {
+ isMov = true;
+      // The 16-bit encoding of MOV rd, rm, LSL #N is explicitly the T2
+      // encoding of MOV (register) in the ARMv8-A and ARMv8-M manuals, and
+      // an immediate of 0 is UNPREDICTABLE in an IT block, so the 32-bit
+      // encoding T3 has to be used instead.
+ if (inITBlock()) {
+ isNarrow = false;
+ }
+ newOpc = isNarrow ? ARM::tMOVSr : ARM::t2MOVr;
+ } else {
+ switch(Shift) {
+ default: llvm_unreachable("unexpected opcode!");
+ case ARM_AM::asr: newOpc = isNarrow ? ARM::tASRri : ARM::t2ASRri; break;
+ case ARM_AM::lsr: newOpc = isNarrow ? ARM::tLSRri : ARM::t2LSRri; break;
+ case ARM_AM::lsl: newOpc = isNarrow ? ARM::tLSLri : ARM::t2LSLri; break;
+ case ARM_AM::ror: newOpc = ARM::t2RORri; isNarrow = false; break;
+ case ARM_AM::rrx: isNarrow = false; newOpc = ARM::t2RRX; break;
+ }
+ }
if (Amount == 32) Amount = 0;
TmpInst.setOpcode(newOpc);
TmpInst.addOperand(Inst.getOperand(0)); // Rd
- if (isNarrow)
+ if (isNarrow && !isMov)
TmpInst.addOperand(MCOperand::createReg(
Inst.getOpcode() == ARM::t2MOVSsi ? ARM::CPSR : 0));
TmpInst.addOperand(Inst.getOperand(1)); // Rn
- if (newOpc != ARM::t2RRX)
+ if (newOpc != ARM::t2RRX && !isMov)
TmpInst.addOperand(MCOperand::createImm(Amount));
TmpInst.addOperand(Inst.getOperand(3)); // CondCode
TmpInst.addOperand(Inst.getOperand(4));
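
// Illustrative outcomes of the LSL #0 special case above (mnemonic selection
// follows the patch's own comments):
//   movs r0, r1, lsl #0  -> tMOVSr  (16-bit MOV (register), encoding T2)
//   same, in an IT block -> t2MOVr  (32-bit encoding T3, since immediate 0
//                           is UNPREDICTABLE in the 16-bit form there)
//   mov r0, r1, lsl #3   -> still takes the shift path (tLSLri/t2LSLri)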
@@ -8918,6 +8800,9 @@ unsigned ARMAsmParser::checkTargetMatchPredicate(MCInst &Inst) {
if (isThumbTwo() && Inst.getOperand(OpNo).getReg() == ARM::CPSR &&
inITBlock())
return Match_RequiresNotITBlock;
+ // LSL with zero immediate is not allowed in an IT block
+ if (Opc == ARM::tLSLri && Inst.getOperand(3).getImm() == 0 && inITBlock())
+ return Match_RequiresNotITBlock;
} else if (isThumbOne()) {
// Some high-register supporting Thumb1 encodings only allow both registers
// to be from r0-r7 when in Thumb2.
@@ -8932,6 +8817,22 @@ unsigned ARMAsmParser::checkTargetMatchPredicate(MCInst &Inst) {
return Match_RequiresV6;
}
+ // Before ARMv8 the rules for when SP is allowed in t2MOVr are more complex
+ // than the loop below can handle, so it uses the GPRnopc register class and
+ // we do SP handling here.
+  if (Opc == ARM::t2MOVr && !hasV8Ops()) {
+ // SP as both source and destination is not allowed
+ if (Inst.getOperand(0).getReg() == ARM::SP &&
+ Inst.getOperand(1).getReg() == ARM::SP)
+ return Match_RequiresV8;
+    // When flag-setting, SP as either source or destination is not allowed
+ if (Inst.getOperand(4).getReg() == ARM::CPSR &&
+ (Inst.getOperand(0).getReg() == ARM::SP ||
+ Inst.getOperand(1).getReg() == ARM::SP))
+ return Match_RequiresV8;
+ }
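+  // Illustrative rejections before ARMv8, via the Match_RequiresV8
+  // diagnostic below: "mov sp, sp" and "movs r0, sp" both fail with
+  // "instruction variant requires ARMv8 or later".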
+
for (unsigned I = 0; I < MCID.NumOperands; ++I)
if (MCID.OpInfo[I].RegClass == ARM::rGPRRegClassID) {
// rGPRRegClass excludes PC, and also excluded SP before ARMv8
@@ -8945,7 +8846,7 @@ unsigned ARMAsmParser::checkTargetMatchPredicate(MCInst &Inst) {
}
namespace llvm {
-template <> inline bool IsCPSRDead<MCInst>(MCInst *Instr) {
+template <> inline bool IsCPSRDead<MCInst>(const MCInst *Instr) {
return true; // In an assembly source, no need to second-guess
}
}
@@ -8975,6 +8876,7 @@ bool ARMAsmParser::isITBlockTerminator(MCInst &Inst) const {
// operands. We only care about Thumb instructions here, as ARM instructions
// obviously can't be in an IT block.
switch (Inst.getOpcode()) {
+ case ARM::tLDMIA:
case ARM::t2LDMIA:
case ARM::t2LDMIA_UPD:
case ARM::t2LDMDB:
@@ -9088,6 +8990,13 @@ bool ARMAsmParser::MatchAndEmitInstruction(SMLoc IDLoc, unsigned &Opcode,
MatchResult = MatchInstruction(Operands, Inst, ErrorInfo, MatchingInlineAsm,
PendConditionalInstruction, Out);
+ SMLoc ErrorLoc;
+ if (ErrorInfo < Operands.size()) {
+ ErrorLoc = ((ARMOperand &)*Operands[ErrorInfo]).getStartLoc();
+ if (ErrorLoc == SMLoc())
+ ErrorLoc = IDLoc;
+ }
+
switch (MatchResult) {
case Match_Success:
// Context sensitive operand constraints aren't handled by the matcher,
@@ -9177,16 +9086,52 @@ bool ARMAsmParser::MatchAndEmitInstruction(SMLoc IDLoc, unsigned &Opcode,
return Error(IDLoc, "instruction variant requires ARMv8 or later");
case Match_RequiresFlagSetting:
return Error(IDLoc, "no flag-preserving variant of this instruction available");
- case Match_ImmRange0_15: {
- SMLoc ErrorLoc = ((ARMOperand &)*Operands[ErrorInfo]).getStartLoc();
- if (ErrorLoc == SMLoc()) ErrorLoc = IDLoc;
+ case Match_ImmRange0_1:
+ return Error(ErrorLoc, "immediate operand must be in the range [0,1]");
+ case Match_ImmRange0_3:
+ return Error(ErrorLoc, "immediate operand must be in the range [0,3]");
+ case Match_ImmRange0_7:
+ return Error(ErrorLoc, "immediate operand must be in the range [0,7]");
+ case Match_ImmRange0_15:
return Error(ErrorLoc, "immediate operand must be in the range [0,15]");
- }
- case Match_ImmRange0_239: {
- SMLoc ErrorLoc = ((ARMOperand &)*Operands[ErrorInfo]).getStartLoc();
- if (ErrorLoc == SMLoc()) ErrorLoc = IDLoc;
+ case Match_ImmRange0_31:
+ return Error(ErrorLoc, "immediate operand must be in the range [0,31]");
+ case Match_ImmRange0_32:
+ return Error(ErrorLoc, "immediate operand must be in the range [0,32]");
+ case Match_ImmRange0_63:
+ return Error(ErrorLoc, "immediate operand must be in the range [0,63]");
+ case Match_ImmRange0_239:
return Error(ErrorLoc, "immediate operand must be in the range [0,239]");
- }
+ case Match_ImmRange0_255:
+ return Error(ErrorLoc, "immediate operand must be in the range [0,255]");
+ case Match_ImmRange0_4095:
+ return Error(ErrorLoc, "immediate operand must be in the range [0,4095]");
+ case Match_ImmRange0_65535:
+ return Error(ErrorLoc, "immediate operand must be in the range [0,65535]");
+ case Match_ImmRange1_7:
+ return Error(ErrorLoc, "immediate operand must be in the range [1,7]");
+ case Match_ImmRange1_8:
+ return Error(ErrorLoc, "immediate operand must be in the range [1,8]");
+ case Match_ImmRange1_15:
+ return Error(ErrorLoc, "immediate operand must be in the range [1,15]");
+ case Match_ImmRange1_16:
+ return Error(ErrorLoc, "immediate operand must be in the range [1,16]");
+ case Match_ImmRange1_31:
+ return Error(ErrorLoc, "immediate operand must be in the range [1,31]");
+ case Match_ImmRange1_32:
+ return Error(ErrorLoc, "immediate operand must be in the range [1,32]");
+ case Match_ImmRange1_64:
+ return Error(ErrorLoc, "immediate operand must be in the range [1,64]");
+  case Match_ImmRange8_8:
+    return Error(ErrorLoc, "immediate operand must be 8");
+  case Match_ImmRange16_16:
+    return Error(ErrorLoc, "immediate operand must be 16");
+  case Match_ImmRange32_32:
+    return Error(ErrorLoc, "immediate operand must be 32");
+ case Match_ImmRange256_65535:
+ return Error(ErrorLoc, "immediate operand must be in the range [255,65535]");
+ case Match_ImmRange0_16777215:
+ return Error(ErrorLoc, "immediate operand must be in the range [0,0xffffff]");
case Match_AlignedMemoryRequiresNone:
case Match_DupAlignedMemoryRequiresNone:
case Match_AlignedMemoryRequires16:
diff --git a/lib/Target/ARM/CMakeLists.txt b/lib/Target/ARM/CMakeLists.txt
index 0c57a3e3166b..1062c7943201 100644
--- a/lib/Target/ARM/CMakeLists.txt
+++ b/lib/Target/ARM/CMakeLists.txt
@@ -1,5 +1,6 @@
set(LLVM_TARGET_DEFINITIONS ARM.td)
+tablegen(LLVM ARMGenRegisterBank.inc -gen-register-bank)
tablegen(LLVM ARMGenRegisterInfo.inc -gen-register-info)
tablegen(LLVM ARMGenInstrInfo.inc -gen-instr-info)
tablegen(LLVM ARMGenMCCodeEmitter.inc -gen-emitter)
diff --git a/lib/Target/ARM/Disassembler/ARMDisassembler.cpp b/lib/Target/ARM/Disassembler/ARMDisassembler.cpp
index ac3d8c780af2..e812d32cc76f 100644
--- a/lib/Target/ARM/Disassembler/ARMDisassembler.cpp
+++ b/lib/Target/ARM/Disassembler/ARMDisassembler.cpp
@@ -7,21 +7,24 @@
//
//===----------------------------------------------------------------------===//
-#include "llvm/MC/MCDisassembler/MCDisassembler.h"
#include "MCTargetDesc/ARMAddressingModes.h"
#include "MCTargetDesc/ARMBaseInfo.h"
-#include "MCTargetDesc/ARMMCExpr.h"
+#include "MCTargetDesc/ARMMCTargetDesc.h"
#include "llvm/MC/MCContext.h"
-#include "llvm/MC/MCExpr.h"
+#include "llvm/MC/MCDisassembler/MCDisassembler.h"
#include "llvm/MC/MCFixedLenDisassembler.h"
#include "llvm/MC/MCInst.h"
#include "llvm/MC/MCInstrDesc.h"
#include "llvm/MC/MCSubtargetInfo.h"
-#include "llvm/Support/Debug.h"
+#include "llvm/MC/SubtargetFeature.h"
+#include "llvm/Support/Compiler.h"
#include "llvm/Support/ErrorHandling.h"
-#include "llvm/Support/LEB128.h"
-#include "llvm/Support/TargetRegistry.h"
+#include "llvm/Support/MathExtras.h"
#include "llvm/Support/raw_ostream.h"
+#include "llvm/Support/TargetRegistry.h"
+#include <algorithm>
+#include <cassert>
+#include <cstdint>
#include <vector>
using namespace llvm;
@@ -31,6 +34,7 @@ using namespace llvm;
typedef MCDisassembler::DecodeStatus DecodeStatus;
namespace {
+
// Handles the condition code status of instructions in IT blocks
class ITStatus
{
@@ -81,9 +85,7 @@ namespace {
private:
std::vector<unsigned char> ITStates;
};
-}
-namespace {
/// ARM disassembler for all ARM platforms.
class ARMDisassembler : public MCDisassembler {
public:
@@ -91,7 +93,7 @@ public:
MCDisassembler(STI, Ctx) {
}
- ~ARMDisassembler() override {}
+ ~ARMDisassembler() override = default;
DecodeStatus getInstruction(MCInst &Instr, uint64_t &Size,
ArrayRef<uint8_t> Bytes, uint64_t Address,
@@ -106,7 +108,7 @@ public:
MCDisassembler(STI, Ctx) {
}
- ~ThumbDisassembler() override {}
+ ~ThumbDisassembler() override = default;
DecodeStatus getInstruction(MCInst &Instr, uint64_t &Size,
ArrayRef<uint8_t> Bytes, uint64_t Address,
@@ -118,7 +120,8 @@ private:
DecodeStatus AddThumbPredicate(MCInst&) const;
void UpdateThumbVFPPredicate(MCInst&) const;
};
-}
+
+} // end anonymous namespace
static bool Check(DecodeStatus &Out, DecodeStatus In) {
switch (In) {
@@ -135,7 +138,6 @@ static bool Check(DecodeStatus &Out, DecodeStatus In) {
llvm_unreachable("Invalid DecodeStatus!");
}
-
// Forward declare these because the autogenerated code will reference them.
// Definitions are further down.
static DecodeStatus DecodeGPRRegisterClass(MCInst &Inst, unsigned RegNo,
@@ -319,7 +321,6 @@ static DecodeStatus DecodeVCVTD(MCInst &Inst, unsigned Insn,
static DecodeStatus DecodeVCVTQ(MCInst &Inst, unsigned Insn,
uint64_t Address, const void *Decoder);
-
static DecodeStatus DecodeThumbAddSpecialReg(MCInst &Inst, uint16_t Insn,
uint64_t Address, const void *Decoder);
static DecodeStatus DecodeThumbBROperand(MCInst &Inst, unsigned Val,
@@ -395,8 +396,9 @@ static DecodeStatus DecodeT2ShifterImmOperand(MCInst &Inst, unsigned Val,
static DecodeStatus DecodeLDR(MCInst &Inst, unsigned Val,
uint64_t Address, const void *Decoder);
-static DecodeStatus DecoderForMRRC2AndMCRR2(llvm::MCInst &Inst, unsigned Val,
+static DecodeStatus DecoderForMRRC2AndMCRR2(MCInst &Inst, unsigned Val,
uint64_t Address, const void *Decoder);
+
#include "ARMGenDisassemblerTables.inc"
static MCDisassembler *createARMDisassembler(const Target &T,
@@ -416,8 +418,7 @@ static DecodeStatus checkDecodedInstruction(MCInst &MI, uint64_t &Size,
uint64_t Address, raw_ostream &OS,
raw_ostream &CS,
uint32_t Insn,
- DecodeStatus Result)
-{
+ DecodeStatus Result) {
switch (MI.getOpcode()) {
case ARM::HVC: {
    // HVC is undefined if condition = 0xf, otherwise unpredictable
@@ -461,65 +462,28 @@ DecodeStatus ARMDisassembler::getInstruction(MCInst &MI, uint64_t &Size,
return checkDecodedInstruction(MI, Size, Address, OS, CS, Insn, Result);
}
- // VFP and NEON instructions, similarly, are shared between ARM
- // and Thumb modes.
- Result = decodeInstruction(DecoderTableVFP32, MI, Insn, Address, this, STI);
- if (Result != MCDisassembler::Fail) {
- Size = 4;
- return Result;
- }
-
- Result = decodeInstruction(DecoderTableVFPV832, MI, Insn, Address, this, STI);
- if (Result != MCDisassembler::Fail) {
- Size = 4;
- return Result;
- }
-
- Result =
- decodeInstruction(DecoderTableNEONData32, MI, Insn, Address, this, STI);
- if (Result != MCDisassembler::Fail) {
- Size = 4;
- // Add a fake predicate operand, because we share these instruction
- // definitions with Thumb2 where these instructions are predicable.
- if (!DecodePredicateOperand(MI, 0xE, Address, this))
- return MCDisassembler::Fail;
- return Result;
- }
-
- Result = decodeInstruction(DecoderTableNEONLoadStore32, MI, Insn, Address,
- this, STI);
- if (Result != MCDisassembler::Fail) {
- Size = 4;
- // Add a fake predicate operand, because we share these instruction
- // definitions with Thumb2 where these instructions are predicable.
- if (!DecodePredicateOperand(MI, 0xE, Address, this))
- return MCDisassembler::Fail;
- return Result;
- }
-
- Result =
- decodeInstruction(DecoderTableNEONDup32, MI, Insn, Address, this, STI);
- if (Result != MCDisassembler::Fail) {
- Size = 4;
- // Add a fake predicate operand, because we share these instruction
- // definitions with Thumb2 where these instructions are predicable.
- if (!DecodePredicateOperand(MI, 0xE, Address, this))
- return MCDisassembler::Fail;
- return Result;
- }
+ struct DecodeTable {
+ const uint8_t *P;
+ bool DecodePred;
+ };
- Result =
- decodeInstruction(DecoderTablev8NEON32, MI, Insn, Address, this, STI);
- if (Result != MCDisassembler::Fail) {
- Size = 4;
- return Result;
- }
+ const DecodeTable Tables[] = {
+ {DecoderTableVFP32, false}, {DecoderTableVFPV832, false},
+ {DecoderTableNEONData32, true}, {DecoderTableNEONLoadStore32, true},
+ {DecoderTableNEONDup32, true}, {DecoderTablev8NEON32, false},
+ {DecoderTablev8Crypto32, false},
+ };
- Result =
- decodeInstruction(DecoderTablev8Crypto32, MI, Insn, Address, this, STI);
- if (Result != MCDisassembler::Fail) {
- Size = 4;
- return Result;
+ for (auto Table : Tables) {
+ Result = decodeInstruction(Table.P, MI, Insn, Address, this, STI);
+ if (Result != MCDisassembler::Fail) {
+ Size = 4;
+ // Add a fake predicate operand, because we share these instruction
+ // definitions with Thumb2 where these instructions are predicable.
+ if (Table.DecodePred && !DecodePredicateOperand(MI, 0xE, Address, this))
+ return MCDisassembler::Fail;
+ return Result;
+ }
}
Size = 0;
@@ -527,8 +491,10 @@ DecodeStatus ARMDisassembler::getInstruction(MCInst &MI, uint64_t &Size,
}
namespace llvm {
+
extern const MCInstrDesc ARMInsts[];
-}
+
+} // end namespace llvm
/// tryAddingSymbolicOperand - tries to add a symbolic operand in place of the
/// immediate Value in the MCInst. The immediate Value has had any PC
@@ -859,7 +825,6 @@ DecodeStatus ThumbDisassembler::getInstruction(MCInst &MI, uint64_t &Size,
return MCDisassembler::Fail;
}
-
extern "C" void LLVMInitializeARMDisassembler() {
TargetRegistry::RegisterMCDisassembler(getTheARMLETarget(),
createARMDisassembler);
@@ -1056,7 +1021,6 @@ static const uint16_t QPRDecoderTable[] = {
ARM::Q12, ARM::Q13, ARM::Q14, ARM::Q15
};
-
static DecodeStatus DecodeQPRRegisterClass(MCInst &Inst, unsigned RegNo,
uint64_t Address, const void *Decoder) {
if (RegNo > 31 || (RegNo & 1) != 0)
@@ -1676,7 +1640,7 @@ DecodeAddrMode3Instruction(MCInst &Inst, unsigned Insn,
case ARM::LDRD:
case ARM::LDRD_PRE:
case ARM::LDRD_POST:
- if (type && Rn == 15){
+ if (type && Rn == 15) {
if (Rt2 == 15)
S = MCDisassembler::SoftFail;
break;
@@ -1693,7 +1657,7 @@ DecodeAddrMode3Instruction(MCInst &Inst, unsigned Insn,
case ARM::LDRH:
case ARM::LDRH_PRE:
case ARM::LDRH_POST:
- if (type && Rn == 15){
+ if (type && Rn == 15) {
if (Rt == 15)
S = MCDisassembler::SoftFail;
break;
@@ -1711,7 +1675,7 @@ DecodeAddrMode3Instruction(MCInst &Inst, unsigned Insn,
case ARM::LDRSB:
case ARM::LDRSB_PRE:
case ARM::LDRSB_POST:
- if (type && Rn == 15){
+ if (type && Rn == 15) {
if (Rt == 15)
S = MCDisassembler::SoftFail;
break;
@@ -2309,7 +2273,6 @@ DecodeBranchImmInstruction(MCInst &Inst, unsigned Insn,
return S;
}
-
static DecodeStatus DecodeAddrMode6Operand(MCInst &Inst, unsigned Val,
uint64_t Address, const void *Decoder) {
DecodeStatus S = MCDisassembler::Success;
@@ -3748,7 +3711,6 @@ static DecodeStatus DecodeT2Imm8(MCInst &Inst, unsigned Val,
return MCDisassembler::Success;
}
-
static DecodeStatus DecodeT2AddrModeImm8(MCInst &Inst, unsigned Val,
uint64_t Address, const void *Decoder) {
DecodeStatus S = MCDisassembler::Success;
@@ -4073,7 +4035,7 @@ static DecodeStatus DecodeT2SOImm(MCInst &Inst, unsigned Val,
static DecodeStatus
DecodeThumbBCCTargetOperand(MCInst &Inst, unsigned Val,
- uint64_t Address, const void *Decoder){
+ uint64_t Address, const void *Decoder) {
if (!tryAddingSymbolicOperand(Address, Address + SignExtend32<9>(Val<<1) + 4,
true, 2, Inst, Decoder))
Inst.addOperand(MCOperand::createImm(SignExtend32<9>(Val << 1)));
@@ -4081,7 +4043,8 @@ DecodeThumbBCCTargetOperand(MCInst &Inst, unsigned Val,
}
static DecodeStatus DecodeThumbBLTargetOperand(MCInst &Inst, unsigned Val,
- uint64_t Address, const void *Decoder){
+ uint64_t Address,
+ const void *Decoder) {
// Val is passed in as S:J1:J2:imm10:imm11
// Note no trailing zero after imm11. Also the J1 and J2 values are from
// the encoded instruction. So here change to I1 and I2 values via:
@@ -4247,7 +4210,8 @@ static DecodeStatus DecodeDoubleRegLoad(MCInst &Inst, unsigned Insn,
}
static DecodeStatus DecodeDoubleRegStore(MCInst &Inst, unsigned Insn,
- uint64_t Address, const void *Decoder){
+ uint64_t Address,
+ const void *Decoder) {
DecodeStatus S = MCDisassembler::Success;
unsigned Rd = fieldFromInstruction(Insn, 12, 4);
@@ -4323,7 +4287,6 @@ static DecodeStatus DecodeLDRPreReg(MCInst &Inst, unsigned Insn,
return S;
}
-
static DecodeStatus DecodeSTRPreImm(MCInst &Inst, unsigned Insn,
uint64_t Address, const void *Decoder) {
DecodeStatus S = MCDisassembler::Success;
@@ -4506,7 +4469,6 @@ static DecodeStatus DecodeVST1LN(MCInst &Inst, unsigned Insn,
return S;
}
-
static DecodeStatus DecodeVLD2LN(MCInst &Inst, unsigned Insn,
uint64_t Address, const void *Decoder) {
DecodeStatus S = MCDisassembler::Success;
@@ -4637,7 +4599,6 @@ static DecodeStatus DecodeVST2LN(MCInst &Inst, unsigned Insn,
return S;
}
-
static DecodeStatus DecodeVLD3LN(MCInst &Inst, unsigned Insn,
uint64_t Address, const void *Decoder) {
DecodeStatus S = MCDisassembler::Success;
@@ -4771,7 +4732,6 @@ static DecodeStatus DecodeVST3LN(MCInst &Inst, unsigned Insn,
return S;
}
-
static DecodeStatus DecodeVLD4LN(MCInst &Inst, unsigned Insn,
uint64_t Address, const void *Decoder) {
DecodeStatus S = MCDisassembler::Success;
@@ -5266,9 +5226,8 @@ static DecodeStatus DecodeLDR(MCInst &Inst, unsigned Val,
return S;
}
-static DecodeStatus DecoderForMRRC2AndMCRR2(llvm::MCInst &Inst, unsigned Val,
+static DecodeStatus DecoderForMRRC2AndMCRR2(MCInst &Inst, unsigned Val,
uint64_t Address, const void *Decoder) {
-
DecodeStatus S = MCDisassembler::Success;
unsigned CRm = fieldFromInstruction(Val, 0, 4);
diff --git a/lib/Target/ARM/InstPrinter/ARMInstPrinter.cpp b/lib/Target/ARM/InstPrinter/ARMInstPrinter.cpp
index 3667952d44c0..57b91366a085 100644
--- a/lib/Target/ARM/InstPrinter/ARMInstPrinter.cpp
+++ b/lib/Target/ARM/InstPrinter/ARMInstPrinter.cpp
@@ -20,7 +20,15 @@
#include "llvm/MC/MCInstrInfo.h"
#include "llvm/MC/MCRegisterInfo.h"
#include "llvm/MC/MCSubtargetInfo.h"
+#include "llvm/MC/SubtargetFeature.h"
+#include "llvm/Support/Casting.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/MathExtras.h"
#include "llvm/Support/raw_ostream.h"
+#include <algorithm>
+#include <cassert>
+#include <cstdint>
+
using namespace llvm;
#define DEBUG_TYPE "asm-printer"
@@ -73,7 +81,6 @@ void ARMInstPrinter::printInst(const MCInst *MI, raw_ostream &O,
unsigned Opcode = MI->getOpcode();
switch (Opcode) {
-
// Check for MOVs and print canonical forms, instead.
case ARM::MOVsr: {
// FIXME: Thumb variants?
diff --git a/lib/Target/ARM/InstPrinter/ARMInstPrinter.h b/lib/Target/ARM/InstPrinter/ARMInstPrinter.h
index 9d80eed84dc2..86873a3a6ccb 100644
--- a/lib/Target/ARM/InstPrinter/ARMInstPrinter.h
+++ b/lib/Target/ARM/InstPrinter/ARMInstPrinter.h
@@ -235,4 +235,4 @@ public:
} // end namespace llvm
-#endif
+#endif // LLVM_LIB_TARGET_ARM_INSTPRINTER_ARMINSTPRINTER_H
diff --git a/lib/Target/ARM/MCTargetDesc/ARMAsmBackend.cpp b/lib/Target/ARM/MCTargetDesc/ARMAsmBackend.cpp
index a58d5b34131b..40bf545e8322 100644
--- a/lib/Target/ARM/MCTargetDesc/ARMAsmBackend.cpp
+++ b/lib/Target/ARM/MCTargetDesc/ARMAsmBackend.cpp
@@ -357,13 +357,14 @@ static uint32_t joinHalfWords(uint32_t FirstHalf, uint32_t SecondHalf,
}
unsigned ARMAsmBackend::adjustFixupValue(const MCFixup &Fixup, uint64_t Value,
- bool IsPCRel, MCContext *Ctx,
+ bool IsPCRel, MCContext &Ctx,
bool IsLittleEndian,
bool IsResolved) const {
unsigned Kind = Fixup.getKind();
switch (Kind) {
default:
- llvm_unreachable("Unknown fixup kind!");
+ Ctx.reportError(Fixup.getLoc(), "bad relocation fixup type");
+ return 0;
case FK_Data_1:
case FK_Data_2:
case FK_Data_4:
@@ -412,8 +413,8 @@ unsigned ARMAsmBackend::adjustFixupValue(const MCFixup &Fixup, uint64_t Value,
Value = -Value;
isAdd = false;
}
- if (Ctx && Value >= 4096) {
- Ctx->reportError(Fixup.getLoc(), "out of range pc-relative fixup value");
+ if (Value >= 4096) {
+ Ctx.reportError(Fixup.getLoc(), "out of range pc-relative fixup value");
return 0;
}
Value |= isAdd << 23;
@@ -433,8 +434,8 @@ unsigned ARMAsmBackend::adjustFixupValue(const MCFixup &Fixup, uint64_t Value,
Value = -Value;
opc = 2; // 0b0010
}
- if (Ctx && ARM_AM::getSOImmVal(Value) == -1) {
- Ctx->reportError(Fixup.getLoc(), "out of range pc-relative fixup value");
+ if (ARM_AM::getSOImmVal(Value) == -1) {
+ Ctx.reportError(Fixup.getLoc(), "out of range pc-relative fixup value");
return 0;
}
// Encode the immediate and shift the opcode into place.
@@ -541,8 +542,8 @@ unsigned ARMAsmBackend::adjustFixupValue(const MCFixup &Fixup, uint64_t Value,
//
// Note that the halfwords are stored high first, low second; so we need
// to transpose the fixup value here to map properly.
- if (Ctx && Value % 4 != 0) {
- Ctx->reportError(Fixup.getLoc(), "misaligned ARM call destination");
+ if (Value % 4 != 0) {
+ Ctx.reportError(Fixup.getLoc(), "misaligned ARM call destination");
return 0;
}
@@ -568,10 +569,10 @@ unsigned ARMAsmBackend::adjustFixupValue(const MCFixup &Fixup, uint64_t Value,
case ARM::fixup_arm_thumb_cp:
// On CPUs supporting Thumb2, this will be relaxed to an ldr.w, otherwise we
// could have an error on our hands.
- if (Ctx && !STI->getFeatureBits()[ARM::FeatureThumb2] && IsResolved) {
+ if (!STI->getFeatureBits()[ARM::FeatureThumb2] && IsResolved) {
const char *FixupDiagnostic = reasonForFixupRelaxation(Fixup, Value);
if (FixupDiagnostic) {
- Ctx->reportError(Fixup.getLoc(), FixupDiagnostic);
+ Ctx.reportError(Fixup.getLoc(), FixupDiagnostic);
return 0;
}
}
@@ -581,8 +582,8 @@ unsigned ARMAsmBackend::adjustFixupValue(const MCFixup &Fixup, uint64_t Value,
// CB instructions can only branch to offsets in [4, 126] in multiples of 2
// so ensure that the raw value LSB is zero and it lies in [2, 130].
// An offset of 2 will be relaxed to a NOP.
- if (Ctx && ((int64_t)Value < 2 || Value > 0x82 || Value & 1)) {
- Ctx->reportError(Fixup.getLoc(), "out of range pc-relative fixup value");
+ if ((int64_t)Value < 2 || Value > 0x82 || Value & 1) {
+ Ctx.reportError(Fixup.getLoc(), "out of range pc-relative fixup value");
return 0;
}
// Offset by 4 and don't encode the lower bit, which is always 0.
@@ -592,21 +593,21 @@ unsigned ARMAsmBackend::adjustFixupValue(const MCFixup &Fixup, uint64_t Value,
}
case ARM::fixup_arm_thumb_br:
// Offset by 4 and don't encode the lower bit, which is always 0.
- if (Ctx && !STI->getFeatureBits()[ARM::FeatureThumb2] &&
- !STI->getFeatureBits()[ARM::HasV8MBaselineOps]) {
+ if (!STI->getFeatureBits()[ARM::FeatureThumb2] &&
+ !STI->getFeatureBits()[ARM::HasV8MBaselineOps]) {
const char *FixupDiagnostic = reasonForFixupRelaxation(Fixup, Value);
if (FixupDiagnostic) {
- Ctx->reportError(Fixup.getLoc(), FixupDiagnostic);
+ Ctx.reportError(Fixup.getLoc(), FixupDiagnostic);
return 0;
}
}
return ((Value - 4) >> 1) & 0x7ff;
case ARM::fixup_arm_thumb_bcc:
// Offset by 4 and don't encode the lower bit, which is always 0.
- if (Ctx && !STI->getFeatureBits()[ARM::FeatureThumb2]) {
+ if (!STI->getFeatureBits()[ARM::FeatureThumb2]) {
const char *FixupDiagnostic = reasonForFixupRelaxation(Fixup, Value);
if (FixupDiagnostic) {
- Ctx->reportError(Fixup.getLoc(), FixupDiagnostic);
+ Ctx.reportError(Fixup.getLoc(), FixupDiagnostic);
return 0;
}
}
@@ -620,8 +621,8 @@ unsigned ARMAsmBackend::adjustFixupValue(const MCFixup &Fixup, uint64_t Value,
isAdd = false;
}
// The value has the low 4 bits encoded in [3:0] and the high 4 in [11:8].
- if (Ctx && Value >= 256) {
- Ctx->reportError(Fixup.getLoc(), "out of range pc-relative fixup value");
+ if (Value >= 256) {
+ Ctx.reportError(Fixup.getLoc(), "out of range pc-relative fixup value");
return 0;
}
Value = (Value & 0xf) | ((Value & 0xf0) << 4);
@@ -641,8 +642,8 @@ unsigned ARMAsmBackend::adjustFixupValue(const MCFixup &Fixup, uint64_t Value,
}
// These values don't encode the low two bits since they're always zero.
Value >>= 2;
- if (Ctx && Value >= 256) {
- Ctx->reportError(Fixup.getLoc(), "out of range pc-relative fixup value");
+ if (Value >= 256) {
+ Ctx.reportError(Fixup.getLoc(), "out of range pc-relative fixup value");
return 0;
}
Value |= isAdd << 23;
@@ -667,13 +668,13 @@ unsigned ARMAsmBackend::adjustFixupValue(const MCFixup &Fixup, uint64_t Value,
isAdd = false;
}
// These values don't encode the low bit since it's always zero.
- if (Ctx && (Value & 1)) {
- Ctx->reportError(Fixup.getLoc(), "invalid value for this fixup");
+ if (Value & 1) {
+ Ctx.reportError(Fixup.getLoc(), "invalid value for this fixup");
return 0;
}
Value >>= 1;
- if (Ctx && Value >= 256) {
- Ctx->reportError(Fixup.getLoc(), "out of range pc-relative fixup value");
+ if (Value >= 256) {
+ Ctx.reportError(Fixup.getLoc(), "out of range pc-relative fixup value");
return 0;
}
Value |= isAdd << 23;
@@ -687,8 +688,8 @@ unsigned ARMAsmBackend::adjustFixupValue(const MCFixup &Fixup, uint64_t Value,
}
case ARM::fixup_arm_mod_imm:
Value = ARM_AM::getSOImmVal(Value);
- if (Ctx && Value >> 12) {
- Ctx->reportError(Fixup.getLoc(), "out of range immediate fixup value");
+ if (Value >> 12) {
+ Ctx.reportError(Fixup.getLoc(), "out of range immediate fixup value");
return 0;
}
return Value;
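
// Worked example for the modified-immediate check above: 0x3FC00 is 0xFF
// rotated right by 22 bits, so getSOImmVal(0x3FC00) yields (11 << 8) | 0xFF
// = 0xBFF, which fits in 12 bits; an unencodable value yields -1, whose set
// high bits make "Value >> 12" nonzero and trigger the diagnostic.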
@@ -737,12 +738,6 @@ void ARMAsmBackend::processFixupValue(const MCAssembler &Asm,
(unsigned)Fixup.getKind() == ARM::fixup_arm_uncondbl ||
(unsigned)Fixup.getKind() == ARM::fixup_arm_condbl))
IsResolved = false;
-
- // Try to get the encoded value for the fixup as-if we're mapping it into
- // the instruction. This allows adjustFixupValue() to issue a diagnostic
-  // if the value is invalid.
- (void)adjustFixupValue(Fixup, Value, false, &Asm.getContext(),
- IsLittleEndian, IsResolved);
}
/// getFixupKindNumBytes - The number of bytes the fixup may change.
@@ -846,11 +841,10 @@ static unsigned getFixupKindContainerSizeBytes(unsigned Kind) {
}
void ARMAsmBackend::applyFixup(const MCFixup &Fixup, char *Data,
- unsigned DataSize, uint64_t Value,
- bool IsPCRel) const {
+ unsigned DataSize, uint64_t Value, bool IsPCRel,
+ MCContext &Ctx) const {
unsigned NumBytes = getFixupKindNumBytes(Fixup.getKind());
- Value =
- adjustFixupValue(Fixup, Value, IsPCRel, nullptr, IsLittleEndian, true);
+ Value = adjustFixupValue(Fixup, Value, IsPCRel, Ctx, IsLittleEndian, true);
if (!Value)
return; // Doesn't change encoding.
diff --git a/lib/Target/ARM/MCTargetDesc/ARMAsmBackend.h b/lib/Target/ARM/MCTargetDesc/ARMAsmBackend.h
index 84caaacc47d3..2ddedb5d6105 100644
--- a/lib/Target/ARM/MCTargetDesc/ARMAsmBackend.h
+++ b/lib/Target/ARM/MCTargetDesc/ARMAsmBackend.h
@@ -46,11 +46,11 @@ public:
bool &IsResolved) override;
unsigned adjustFixupValue(const MCFixup &Fixup, uint64_t Value, bool IsPCRel,
- MCContext *Ctx, bool IsLittleEndian,
+ MCContext &Ctx, bool IsLittleEndian,
bool IsResolved) const;
void applyFixup(const MCFixup &Fixup, char *Data, unsigned DataSize,
- uint64_t Value, bool IsPCRel) const override;
+ uint64_t Value, bool IsPCRel, MCContext &Ctx) const override;
unsigned getRelaxedOpcode(unsigned Op) const;
diff --git a/lib/Target/ARM/MCTargetDesc/ARMBaseInfo.h b/lib/Target/ARM/MCTargetDesc/ARMBaseInfo.h
index 088b4205ed62..92e553f21f14 100644
--- a/lib/Target/ARM/MCTargetDesc/ARMBaseInfo.h
+++ b/lib/Target/ARM/MCTargetDesc/ARMBaseInfo.h
@@ -291,7 +291,11 @@ namespace ARMII {
/// MO_OPTION_MASK - Most flags are mutually exclusive; this mask selects
/// just that part of the flag set.
- MO_OPTION_MASK = 0x1f,
+ MO_OPTION_MASK = 0x0f,
+
+ /// MO_SBREL - On a symbol operand, this represents a static base relative
+ /// relocation. Used in movw and movt instructions.
+ MO_SBREL = 0x10,
/// MO_DLLIMPORT - On a symbol operand, this represents that the reference
/// to the symbol is for an import stub. This is used for DLL import
diff --git a/lib/Target/ARM/MCTargetDesc/ARMELFObjectWriter.cpp b/lib/Target/ARM/MCTargetDesc/ARMELFObjectWriter.cpp
index 6f19754b899e..e1fa24571820 100644
--- a/lib/Target/ARM/MCTargetDesc/ARMELFObjectWriter.cpp
+++ b/lib/Target/ARM/MCTargetDesc/ARMELFObjectWriter.cpp
@@ -7,32 +7,32 @@
//
//===----------------------------------------------------------------------===//
-#include "MCTargetDesc/ARMMCTargetDesc.h"
#include "MCTargetDesc/ARMFixupKinds.h"
-#include "llvm/ADT/Statistic.h"
-#include "llvm/ADT/StringSwitch.h"
+#include "MCTargetDesc/ARMMCTargetDesc.h"
+#include "llvm/MC/MCContext.h"
#include "llvm/MC/MCELFObjectWriter.h"
#include "llvm/MC/MCExpr.h"
-#include "llvm/MC/MCSectionELF.h"
+#include "llvm/MC/MCFixup.h"
#include "llvm/MC/MCValue.h"
-#include "llvm/Support/Debug.h"
+#include "llvm/Support/ELF.h"
#include "llvm/Support/ErrorHandling.h"
#include "llvm/Support/raw_ostream.h"
+#include <cstdint>
using namespace llvm;
namespace {
+
class ARMELFObjectWriter : public MCELFObjectTargetWriter {
enum { DefaultEABIVersion = 0x05000000U };
- unsigned GetRelocTypeInner(const MCValue &Target,
- const MCFixup &Fixup,
- bool IsPCRel) const;
+ unsigned GetRelocTypeInner(const MCValue &Target, const MCFixup &Fixup,
+ bool IsPCRel, MCContext &Ctx) const;
public:
ARMELFObjectWriter(uint8_t OSABI);
- ~ARMELFObjectWriter() override;
+ ~ARMELFObjectWriter() override = default;
unsigned getRelocType(MCContext &Ctx, const MCValue &Target,
const MCFixup &Fixup, bool IsPCRel) const override;
@@ -40,15 +40,14 @@ namespace {
bool needsRelocateWithSymbol(const MCSymbol &Sym,
unsigned Type) const override;
};
-}
+
+} // end anonymous namespace
ARMELFObjectWriter::ARMELFObjectWriter(uint8_t OSABI)
: MCELFObjectTargetWriter(/*Is64Bit*/ false, OSABI,
ELF::EM_ARM,
/*HasRelocationAddend*/ false) {}
-ARMELFObjectWriter::~ARMELFObjectWriter() {}
-
bool ARMELFObjectWriter::needsRelocateWithSymbol(const MCSymbol &Sym,
unsigned Type) const {
// FIXME: This is extremely conservative. This really needs to use a
@@ -70,19 +69,20 @@ bool ARMELFObjectWriter::needsRelocateWithSymbol(const MCSymbol &Sym,
unsigned ARMELFObjectWriter::getRelocType(MCContext &Ctx, const MCValue &Target,
const MCFixup &Fixup,
bool IsPCRel) const {
- return GetRelocTypeInner(Target, Fixup, IsPCRel);
+ return GetRelocTypeInner(Target, Fixup, IsPCRel, Ctx);
}
unsigned ARMELFObjectWriter::GetRelocTypeInner(const MCValue &Target,
const MCFixup &Fixup,
- bool IsPCRel) const {
+ bool IsPCRel,
+ MCContext &Ctx) const {
MCSymbolRefExpr::VariantKind Modifier = Target.getAccessVariant();
unsigned Type = 0;
if (IsPCRel) {
switch ((unsigned)Fixup.getKind()) {
default:
- report_fatal_error("unsupported relocation on symbol");
+ Ctx.reportFatalError(Fixup.getLoc(), "unsupported relocation on symbol");
return ELF::R_ARM_NONE;
case FK_Data_4:
switch (Modifier) {
@@ -161,7 +161,7 @@ unsigned ARMELFObjectWriter::GetRelocTypeInner(const MCValue &Target,
} else {
switch ((unsigned)Fixup.getKind()) {
default:
- report_fatal_error("unsupported relocation on symbol");
+ Ctx.reportFatalError(Fixup.getLoc(), "unsupported relocation on symbol");
return ELF::R_ARM_NONE;
case FK_Data_1:
switch (Modifier) {
@@ -270,10 +270,26 @@ unsigned ARMELFObjectWriter::GetRelocTypeInner(const MCValue &Target,
}
break;
case ARM::fixup_t2_movt_hi16:
- Type = ELF::R_ARM_THM_MOVT_ABS;
+ switch (Modifier) {
+ default: llvm_unreachable("Unsupported Modifier");
+ case MCSymbolRefExpr::VK_None:
+ Type = ELF::R_ARM_THM_MOVT_ABS;
+ break;
+ case MCSymbolRefExpr::VK_ARM_SBREL:
+        Type = ELF::R_ARM_THM_MOVT_BREL;
+ break;
+ }
break;
case ARM::fixup_t2_movw_lo16:
- Type = ELF::R_ARM_THM_MOVW_ABS_NC;
+ switch (Modifier) {
+ default: llvm_unreachable("Unsupported Modifier");
+ case MCSymbolRefExpr::VK_None:
+ Type = ELF::R_ARM_THM_MOVW_ABS_NC;
+ break;
+ case MCSymbolRefExpr::VK_ARM_SBREL:
+        Type = ELF::R_ARM_THM_MOVW_BREL_NC;
+ break;
+ }
break;
}
}
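
// Illustrative use of the new SBREL cases (directive spelling assumed):
//   movw r0, :lower16:sym(sbrel)   @ -> R_ARM_THM_MOVW_BREL_NC
//   movt r0, :upper16:sym(sbrel)   @ -> R_ARM_THM_MOVT_BREL
// Without the modifier, the _ABS relocation types are selected as before.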
diff --git a/lib/Target/ARM/MCTargetDesc/ARMELFStreamer.cpp b/lib/Target/ARM/MCTargetDesc/ARMELFStreamer.cpp
index f6bb35d2326b..6fa890ba1cd5 100644
--- a/lib/Target/ARM/MCTargetDesc/ARMELFStreamer.cpp
+++ b/lib/Target/ARM/MCTargetDesc/ARMELFStreamer.cpp
@@ -15,7 +15,11 @@
#include "ARMRegisterInfo.h"
#include "ARMUnwindOpAsm.h"
-#include "llvm/ADT/StringExtras.h"
+#include "llvm/ADT/DenseMap.h"
+#include "llvm/ADT/SmallString.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/ADT/StringRef.h"
+#include "llvm/ADT/Triple.h"
#include "llvm/ADT/Twine.h"
#include "llvm/MC/MCAsmBackend.h"
#include "llvm/MC/MCAsmInfo.h"
@@ -24,25 +28,33 @@
#include "llvm/MC/MCContext.h"
#include "llvm/MC/MCELFStreamer.h"
#include "llvm/MC/MCExpr.h"
+#include "llvm/MC/MCFixup.h"
+#include "llvm/MC/MCFragment.h"
#include "llvm/MC/MCInst.h"
#include "llvm/MC/MCInstPrinter.h"
-#include "llvm/MC/MCObjectFileInfo.h"
-#include "llvm/MC/MCObjectStreamer.h"
#include "llvm/MC/MCRegisterInfo.h"
#include "llvm/MC/MCSection.h"
#include "llvm/MC/MCSectionELF.h"
#include "llvm/MC/MCStreamer.h"
+#include "llvm/MC/MCSubtargetInfo.h"
+#include "llvm/MC/MCSymbol.h"
#include "llvm/MC/MCSymbolELF.h"
-#include "llvm/MC/MCValue.h"
+#include "llvm/MC/SectionKind.h"
#include "llvm/Support/ARMBuildAttributes.h"
#include "llvm/Support/ARMEHABI.h"
-#include "llvm/Support/TargetParser.h"
-#include "llvm/Support/Debug.h"
+#include "llvm/Support/Casting.h"
#include "llvm/Support/ELF.h"
+#include "llvm/Support/ErrorHandling.h"
#include "llvm/Support/FormattedStream.h"
#include "llvm/Support/LEB128.h"
#include "llvm/Support/raw_ostream.h"
+#include "llvm/Support/TargetParser.h"
#include <algorithm>
+#include <cassert>
+#include <climits>
+#include <cstddef>
+#include <cstdint>
+#include <string>
using namespace llvm;
@@ -101,16 +113,21 @@ ARMTargetAsmStreamer::ARMTargetAsmStreamer(MCStreamer &S,
bool VerboseAsm)
: ARMTargetStreamer(S), OS(OS), InstPrinter(InstPrinter),
IsVerboseAsm(VerboseAsm) {}
+
void ARMTargetAsmStreamer::emitFnStart() { OS << "\t.fnstart\n"; }
void ARMTargetAsmStreamer::emitFnEnd() { OS << "\t.fnend\n"; }
void ARMTargetAsmStreamer::emitCantUnwind() { OS << "\t.cantunwind\n"; }
+
void ARMTargetAsmStreamer::emitPersonality(const MCSymbol *Personality) {
OS << "\t.personality " << Personality->getName() << '\n';
}
+
void ARMTargetAsmStreamer::emitPersonalityIndex(unsigned Index) {
OS << "\t.personalityindex " << Index << '\n';
}
+
void ARMTargetAsmStreamer::emitHandlerData() { OS << "\t.handlerdata\n"; }
+
void ARMTargetAsmStreamer::emitSetFP(unsigned FpReg, unsigned SpReg,
int64_t Offset) {
OS << "\t.setfp\t";
@@ -121,6 +138,7 @@ void ARMTargetAsmStreamer::emitSetFP(unsigned FpReg, unsigned SpReg,
OS << ", #" << Offset;
OS << '\n';
}
+
void ARMTargetAsmStreamer::emitMovSP(unsigned Reg, int64_t Offset) {
assert((Reg != ARM::SP && Reg != ARM::PC) &&
"the operand of .movsp cannot be either sp or pc");
@@ -131,9 +149,11 @@ void ARMTargetAsmStreamer::emitMovSP(unsigned Reg, int64_t Offset) {
OS << ", #" << Offset;
OS << '\n';
}
+
void ARMTargetAsmStreamer::emitPad(int64_t Offset) {
OS << "\t.pad\t#" << Offset << '\n';
}
+
void ARMTargetAsmStreamer::emitRegSave(const SmallVectorImpl<unsigned> &RegList,
bool isVector) {
assert(RegList.size() && "RegList should not be empty");
@@ -151,8 +171,9 @@ void ARMTargetAsmStreamer::emitRegSave(const SmallVectorImpl<unsigned> &RegList,
OS << "}\n";
}
-void ARMTargetAsmStreamer::switchVendor(StringRef Vendor) {
-}
+
+void ARMTargetAsmStreamer::switchVendor(StringRef Vendor) {}
+
void ARMTargetAsmStreamer::emitAttribute(unsigned Attribute, unsigned Value) {
OS << "\t.eabi_attribute\t" << Attribute << ", " << Twine(Value);
if (IsVerboseAsm) {
@@ -162,6 +183,7 @@ void ARMTargetAsmStreamer::emitAttribute(unsigned Attribute, unsigned Value) {
}
OS << "\n";
}
+
void ARMTargetAsmStreamer::emitTextAttribute(unsigned Attribute,
StringRef String) {
switch (Attribute) {
@@ -179,6 +201,7 @@ void ARMTargetAsmStreamer::emitTextAttribute(unsigned Attribute,
}
OS << "\n";
}
+
void ARMTargetAsmStreamer::emitIntTextAttribute(unsigned Attribute,
unsigned IntValue,
StringRef StringValue) {
@@ -194,20 +217,25 @@ void ARMTargetAsmStreamer::emitIntTextAttribute(unsigned Attribute,
}
OS << "\n";
}
+
void ARMTargetAsmStreamer::emitArch(unsigned Arch) {
OS << "\t.arch\t" << ARM::getArchName(Arch) << "\n";
}
+
void ARMTargetAsmStreamer::emitArchExtension(unsigned ArchExt) {
OS << "\t.arch_extension\t" << ARM::getArchExtName(ArchExt) << "\n";
}
+
void ARMTargetAsmStreamer::emitObjectArch(unsigned Arch) {
OS << "\t.object_arch\t" << ARM::getArchName(Arch) << '\n';
}
+
void ARMTargetAsmStreamer::emitFPU(unsigned FPU) {
OS << "\t.fpu\t" << ARM::getFPUName(FPU) << "\n";
}
-void ARMTargetAsmStreamer::finishAttributeSection() {
-}
+
+void ARMTargetAsmStreamer::finishAttributeSection() {}
+
void
ARMTargetAsmStreamer::AnnotateTLSDescriptorSequence(const MCSymbolRefExpr *S) {
OS << "\t.tlsdescseq\t" << S->getSymbol().getName();
@@ -274,12 +302,12 @@ private:
};
StringRef CurrentVendor;
- unsigned FPU;
- unsigned Arch;
- unsigned EmittedArch;
+ unsigned FPU = ARM::FK_INVALID;
+ unsigned Arch = ARM::AK_INVALID;
+ unsigned EmittedArch = ARM::AK_INVALID;
SmallVector<AttributeItem, 64> Contents;
- MCSection *AttributeSection;
+ MCSection *AttributeSection = nullptr;
AttributeItem *getAttributeItem(unsigned Attribute) {
for (size_t i = 0; i < Contents.size(); ++i)
@@ -393,9 +421,7 @@ private:
public:
ARMTargetELFStreamer(MCStreamer &S)
- : ARMTargetStreamer(S), CurrentVendor("aeabi"), FPU(ARM::FK_INVALID),
- Arch(ARM::AK_INVALID), EmittedArch(ARM::AK_INVALID),
- AttributeSection(nullptr) {}
+ : ARMTargetStreamer(S), CurrentVendor("aeabi") {}
};
/// Extend the generic ELFStreamer class so that it can emit mapping symbols at
@@ -416,12 +442,11 @@ public:
ARMELFStreamer(MCContext &Context, MCAsmBackend &TAB, raw_pwrite_stream &OS,
MCCodeEmitter *Emitter, bool IsThumb)
- : MCELFStreamer(Context, TAB, OS, Emitter), IsThumb(IsThumb),
- MappingSymbolCounter(0), LastEMS(EMS_None) {
+ : MCELFStreamer(Context, TAB, OS, Emitter), IsThumb(IsThumb) {
EHReset();
}
- ~ARMELFStreamer() {}
+ ~ARMELFStreamer() override = default;
void FinishImpl() override;
@@ -439,20 +464,21 @@ public:
void emitUnwindRaw(int64_t Offset, const SmallVectorImpl<uint8_t> &Opcodes);
void ChangeSection(MCSection *Section, const MCExpr *Subsection) override {
- // We have to keep track of the mapping symbol state of any sections we
- // use. Each one should start off as EMS_None, which is provided as the
- // default constructor by DenseMap::lookup.
- LastMappingSymbols[getPreviousSection().first] = LastEMS;
- LastEMS = LastMappingSymbols.lookup(Section);
-
+ LastMappingSymbols[getPreviousSection().first] = std::move(LastEMSInfo);
MCELFStreamer::ChangeSection(Section, Subsection);
+ auto LastMappingSymbol = LastMappingSymbols.find(Section);
+ if (LastMappingSymbol != LastMappingSymbols.end()) {
+ LastEMSInfo = std::move(LastMappingSymbol->second);
+ return;
+ }
+ LastEMSInfo.reset(new ElfMappingSymbolInfo(SMLoc(), nullptr, 0));
}
/// This function is the one used to emit instruction data into the ELF
/// streamer. We override it to add the appropriate mapping symbol if
/// necessary.
- void EmitInstruction(const MCInst& Inst,
- const MCSubtargetInfo &STI) override {
+ void EmitInstruction(const MCInst &Inst, const MCSubtargetInfo &STI,
+ bool) override {
if (IsThumb)
EmitThumbMappingSymbol();
else
@@ -507,15 +533,25 @@ public:
MCELFStreamer::EmitBytes(Data);
}
+ void FlushPendingMappingSymbol() {
+ if (!LastEMSInfo->hasInfo())
+ return;
+ ElfMappingSymbolInfo *EMS = LastEMSInfo.get();
+ EmitMappingSymbol("$d", EMS->Loc, EMS->F, EMS->Offset);
+ EMS->resetInfo();
+ }
+
/// This is one of the functions used to emit data into an ELF section, so the
/// ARM streamer overrides it to add the appropriate mapping symbol ($d) if
/// necessary.
void EmitValueImpl(const MCExpr *Value, unsigned Size, SMLoc Loc) override {
- if (const MCSymbolRefExpr *SRE = dyn_cast_or_null<MCSymbolRefExpr>(Value))
+ if (const MCSymbolRefExpr *SRE = dyn_cast_or_null<MCSymbolRefExpr>(Value)) {
if (SRE->getKind() == MCSymbolRefExpr::VK_ARM_SBREL && !(Size == 4)) {
getContext().reportError(Loc, "relocated expression must be 32-bit");
return;
}
+ getOrCreateDataFragment();
+ }
EmitDataMappingSymbol();
MCELFStreamer::EmitValueImpl(Value, Size, Loc);
@@ -548,22 +584,54 @@ private:
EMS_Data
};
+ struct ElfMappingSymbolInfo {
+ explicit ElfMappingSymbolInfo(SMLoc Loc, MCFragment *F, uint64_t O)
+ : Loc(Loc), F(F), Offset(O), State(EMS_None) {}
+ void resetInfo() {
+ F = nullptr;
+ Offset = 0;
+ }
+ bool hasInfo() { return F != nullptr; }
+ SMLoc Loc;
+ MCFragment *F;
+ uint64_t Offset;
+ ElfMappingSymbol State;
+ };
+
void EmitDataMappingSymbol() {
- if (LastEMS == EMS_Data) return;
+ if (LastEMSInfo->State == EMS_Data)
+ return;
+ else if (LastEMSInfo->State == EMS_None) {
+ // This is a tentative symbol, it won't really be emitted until it's
+ // actually needed.
+ ElfMappingSymbolInfo *EMS = LastEMSInfo.get();
+ auto *DF = dyn_cast_or_null<MCDataFragment>(getCurrentFragment());
+ if (!DF)
+ return;
+ EMS->Loc = SMLoc();
+ EMS->F = getCurrentFragment();
+ EMS->Offset = DF->getContents().size();
+ LastEMSInfo->State = EMS_Data;
+ return;
+ }
EmitMappingSymbol("$d");
- LastEMS = EMS_Data;
+ LastEMSInfo->State = EMS_Data;
}
void EmitThumbMappingSymbol() {
- if (LastEMS == EMS_Thumb) return;
+ if (LastEMSInfo->State == EMS_Thumb)
+ return;
+ FlushPendingMappingSymbol();
EmitMappingSymbol("$t");
- LastEMS = EMS_Thumb;
+ LastEMSInfo->State = EMS_Thumb;
}
void EmitARMMappingSymbol() {
- if (LastEMS == EMS_ARM) return;
+ if (LastEMSInfo->State == EMS_ARM)
+ return;
+ FlushPendingMappingSymbol();
EmitMappingSymbol("$a");
- LastEMS = EMS_ARM;
+ LastEMSInfo->State = EMS_ARM;
}
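
  // For reference: "$a", "$t" and "$d" are the ELF mapping symbols the ARM
  // ABI uses to mark ARM code, Thumb code and inline data. With the tentative
  // state above, a "$d" for data at the start of a fragment, e.g.
  //   .word 42
  // is only materialized by FlushPendingMappingSymbol once a code mapping
  // symbol is needed in the same section, rather than being emitted eagerly.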
void EmitMappingSymbol(StringRef Name) {
@@ -576,6 +644,17 @@ private:
Symbol->setExternal(false);
}
+ void EmitMappingSymbol(StringRef Name, SMLoc Loc, MCFragment *F,
+ uint64_t Offset) {
+ auto *Symbol = cast<MCSymbolELF>(getContext().getOrCreateSymbol(
+ Name + "." + Twine(MappingSymbolCounter++)));
+ EmitLabel(Symbol, Loc, F);
+ Symbol->setType(ELF::STT_NOTYPE);
+ Symbol->setBinding(ELF::STB_LOCAL);
+ Symbol->setExternal(false);
+ Symbol->setOffset(Offset);
+ }
+
void EmitThumbFunc(MCSymbol *Func) override {
getAssembler().setIsThumbFunc(Func);
EmitSymbolAttribute(Func, MCSA_ELF_TypeFunction);
@@ -599,10 +678,12 @@ private:
void EmitFixup(const MCExpr *Expr, MCFixupKind Kind);
bool IsThumb;
- int64_t MappingSymbolCounter;
+ int64_t MappingSymbolCounter = 0;
+
+ DenseMap<const MCSection *, std::unique_ptr<ElfMappingSymbolInfo>>
+ LastMappingSymbols;
- DenseMap<const MCSection *, ElfMappingSymbol> LastMappingSymbols;
- ElfMappingSymbol LastEMS;
+ std::unique_ptr<ElfMappingSymbolInfo> LastEMSInfo;
// ARM Exception Handling Frame Information
MCSymbol *ExTab;
@@ -618,6 +699,7 @@ private:
SmallVector<uint8_t, 64> Opcodes;
UnwindOpcodeAssembler UnwindOpAsm;
};
+
} // end anonymous namespace
ARMELFStreamer &ARMTargetELFStreamer::getStreamer() {
@@ -627,33 +709,42 @@ ARMELFStreamer &ARMTargetELFStreamer::getStreamer() {
void ARMTargetELFStreamer::emitFnStart() { getStreamer().emitFnStart(); }
void ARMTargetELFStreamer::emitFnEnd() { getStreamer().emitFnEnd(); }
void ARMTargetELFStreamer::emitCantUnwind() { getStreamer().emitCantUnwind(); }
+
void ARMTargetELFStreamer::emitPersonality(const MCSymbol *Personality) {
getStreamer().emitPersonality(Personality);
}
+
void ARMTargetELFStreamer::emitPersonalityIndex(unsigned Index) {
getStreamer().emitPersonalityIndex(Index);
}
+
void ARMTargetELFStreamer::emitHandlerData() {
getStreamer().emitHandlerData();
}
+
void ARMTargetELFStreamer::emitSetFP(unsigned FpReg, unsigned SpReg,
int64_t Offset) {
getStreamer().emitSetFP(FpReg, SpReg, Offset);
}
+
void ARMTargetELFStreamer::emitMovSP(unsigned Reg, int64_t Offset) {
getStreamer().emitMovSP(Reg, Offset);
}
+
void ARMTargetELFStreamer::emitPad(int64_t Offset) {
getStreamer().emitPad(Offset);
}
+
void ARMTargetELFStreamer::emitRegSave(const SmallVectorImpl<unsigned> &RegList,
bool isVector) {
getStreamer().emitRegSave(RegList, isVector);
}
+
void ARMTargetELFStreamer::emitUnwindRaw(int64_t Offset,
const SmallVectorImpl<uint8_t> &Opcodes) {
getStreamer().emitUnwindRaw(Offset, Opcodes);
}
+
void ARMTargetELFStreamer::switchVendor(StringRef Vendor) {
assert(!Vendor.empty() && "Vendor cannot be empty.");
@@ -668,25 +759,31 @@ void ARMTargetELFStreamer::switchVendor(StringRef Vendor) {
CurrentVendor = Vendor;
}
+
void ARMTargetELFStreamer::emitAttribute(unsigned Attribute, unsigned Value) {
setAttributeItem(Attribute, Value, /* OverwriteExisting= */ true);
}
+
void ARMTargetELFStreamer::emitTextAttribute(unsigned Attribute,
StringRef Value) {
setAttributeItem(Attribute, Value, /* OverwriteExisting= */ true);
}
+
void ARMTargetELFStreamer::emitIntTextAttribute(unsigned Attribute,
unsigned IntValue,
StringRef StringValue) {
setAttributeItems(Attribute, IntValue, StringValue,
/* OverwriteExisting= */ true);
}
+
void ARMTargetELFStreamer::emitArch(unsigned Value) {
Arch = Value;
}
+
void ARMTargetELFStreamer::emitObjectArch(unsigned Value) {
EmittedArch = Value;
}
+
void ARMTargetELFStreamer::emitArchDefaultAttributes() {
using namespace ARMBuildAttrs;
@@ -786,9 +883,11 @@ void ARMTargetELFStreamer::emitArchDefaultAttributes() {
break;
}
}
+
void ARMTargetELFStreamer::emitFPU(unsigned Value) {
FPU = Value;
}
+
void ARMTargetELFStreamer::emitFPUDefaultAttributes() {
switch (FPU) {
case ARM::FK_VFP:
@@ -920,6 +1019,7 @@ void ARMTargetELFStreamer::emitFPUDefaultAttributes() {
break;
}
}
+
size_t ARMTargetELFStreamer::calculateContentSize() const {
size_t Result = 0;
for (size_t i = 0; i < Contents.size(); ++i) {
@@ -944,6 +1044,7 @@ size_t ARMTargetELFStreamer::calculateContentSize() const {
}
return Result;
}
+
void ARMTargetELFStreamer::finishAttributeSection() {
// <format-version>
// [ <section-length> "vendor-name"
@@ -1093,9 +1194,9 @@ inline void ARMELFStreamer::SwitchToEHSection(StringRef Prefix,
const MCSymbolELF *Group = FnSection.getGroup();
if (Group)
Flags |= ELF::SHF_GROUP;
- MCSectionELF *EHSection =
- getContext().getELFSection(EHSecName, Type, Flags, 0, Group,
- FnSection.getUniqueID(), nullptr, &FnSection);
+ MCSectionELF *EHSection = getContext().getELFSection(
+ EHSecName, Type, Flags, 0, Group, FnSection.getUniqueID(),
+ static_cast<const MCSymbolELF *>(&Fn));
assert(EHSection && "Failed to get the required EH section");
@@ -1114,6 +1215,7 @@ inline void ARMELFStreamer::SwitchToExIdxSection(const MCSymbol &FnStart) {
ELF::SHF_ALLOC | ELF::SHF_LINK_ORDER,
SectionKind::getData(), FnStart);
}
+
void ARMELFStreamer::EmitFixup(const MCExpr *Expr, MCFixupKind Kind) {
MCDataFragment *Frag = getOrCreateDataFragment();
Frag->getFixups().push_back(MCFixup::create(Frag->getContents().size(), Expr,
@@ -1396,8 +1498,6 @@ MCELFStreamer *createARMELFStreamer(MCContext &Context, MCAsmBackend &TAB,
if (RelaxAll)
S->getAssembler().setRelaxAll(true);
return S;
- }
-
}
-
+} // end namespace llvm
diff --git a/lib/Target/ARM/MCTargetDesc/ARMMCCodeEmitter.cpp b/lib/Target/ARM/MCTargetDesc/ARMMCCodeEmitter.cpp
index 559a4f8de75f..d9df2c6da7ec 100644
--- a/lib/Target/ARM/MCTargetDesc/ARMMCCodeEmitter.cpp
+++ b/lib/Target/ARM/MCTargetDesc/ARMMCCodeEmitter.cpp
@@ -11,22 +11,33 @@
//
//===----------------------------------------------------------------------===//
-#include "MCTargetDesc/ARMMCTargetDesc.h"
#include "MCTargetDesc/ARMAddressingModes.h"
#include "MCTargetDesc/ARMBaseInfo.h"
#include "MCTargetDesc/ARMFixupKinds.h"
#include "MCTargetDesc/ARMMCExpr.h"
#include "llvm/ADT/APFloat.h"
+#include "llvm/ADT/APInt.h"
+#include "llvm/ADT/SmallVector.h"
#include "llvm/ADT/Statistic.h"
+#include "llvm/ADT/Triple.h"
#include "llvm/MC/MCCodeEmitter.h"
#include "llvm/MC/MCContext.h"
#include "llvm/MC/MCExpr.h"
+#include "llvm/MC/MCFixup.h"
#include "llvm/MC/MCInst.h"
+#include "llvm/MC/MCInstrDesc.h"
#include "llvm/MC/MCInstrInfo.h"
#include "llvm/MC/MCRegisterInfo.h"
#include "llvm/MC/MCSubtargetInfo.h"
+#include "llvm/Support/Casting.h"
+#include "llvm/Support/Compiler.h"
#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/MathExtras.h"
#include "llvm/Support/raw_ostream.h"
+#include <algorithm>
+#include <cassert>
+#include <cstdint>
+#include <cstdlib>
using namespace llvm;
@@ -36,9 +47,8 @@ STATISTIC(MCNumEmitted, "Number of MC instructions emitted.");
STATISTIC(MCNumCPRelocations, "Number of constant pool relocations created.");
namespace {
+
class ARMMCCodeEmitter : public MCCodeEmitter {
- ARMMCCodeEmitter(const ARMMCCodeEmitter &) = delete;
- void operator=(const ARMMCCodeEmitter &) = delete;
const MCInstrInfo &MCII;
const MCContext &CTX;
bool IsLittleEndian;
@@ -47,15 +57,18 @@ public:
ARMMCCodeEmitter(const MCInstrInfo &mcii, MCContext &ctx, bool IsLittle)
: MCII(mcii), CTX(ctx), IsLittleEndian(IsLittle) {
}
-
- ~ARMMCCodeEmitter() override {}
+ ARMMCCodeEmitter(const ARMMCCodeEmitter &) = delete;
+ ARMMCCodeEmitter &operator=(const ARMMCCodeEmitter &) = delete;
+ ~ARMMCCodeEmitter() override = default;
bool isThumb(const MCSubtargetInfo &STI) const {
return STI.getFeatureBits()[ARM::ModeThumb];
}
+
bool isThumb2(const MCSubtargetInfo &STI) const {
return isThumb(STI) && STI.getFeatureBits()[ARM::FeatureThumb2];
}
+
bool isTargetMachO(const MCSubtargetInfo &STI) const {
const Triple &TT = STI.getTargetTriple();
return TT.isOSBinFormatMachO();
@@ -200,6 +213,7 @@ public:
case ARM_AM::ib: return 3;
}
}
+
/// getShiftOp - Return the shift opcode (bit[6:5]) of the immediate value.
///
unsigned getShiftOp(ARM_AM::ShiftOpc ShOpc) const {
@@ -273,7 +287,6 @@ public:
unsigned getSOImmOpValue(const MCInst &MI, unsigned Op,
SmallVectorImpl<MCFixup> &Fixups,
const MCSubtargetInfo &STI) const {
-
const MCOperand &MO = MI.getOperand(Op);
// We expect MO to be an immediate or an expression,
@@ -432,18 +445,6 @@ public:
} // end anonymous namespace
-MCCodeEmitter *llvm::createARMLEMCCodeEmitter(const MCInstrInfo &MCII,
- const MCRegisterInfo &MRI,
- MCContext &Ctx) {
- return new ARMMCCodeEmitter(MCII, Ctx, true);
-}
-
-MCCodeEmitter *llvm::createARMBEMCCodeEmitter(const MCInstrInfo &MCII,
- const MCRegisterInfo &MRI,
- MCContext &Ctx) {
- return new ARMMCCodeEmitter(MCII, Ctx, false);
-}
-
/// NEONThumb2DataIPostEncoder - Post-process encoded NEON data-processing
/// instructions, and rewrite them to their Thumb2 form if we are currently in
/// Thumb2 mode.
@@ -550,7 +551,7 @@ getMachineOpValue(const MCInst &MI, const MCOperand &MO,
bool ARMMCCodeEmitter::
EncodeAddrModeOpValues(const MCInst &MI, unsigned OpIdx, unsigned &Reg,
unsigned &Imm, SmallVectorImpl<MCFixup> &Fixups,
- const MCSubtargetInfo &STI) const {
+ const MCSubtargetInfo &STI) const {
const MCOperand &MO = MI.getOperand(OpIdx);
const MCOperand &MO1 = MI.getOperand(OpIdx + 1);
@@ -1515,7 +1516,7 @@ getBitfieldInvertedMaskOpValue(const MCInst &MI, unsigned Op,
uint32_t v = ~MO.getImm();
uint32_t lsb = countTrailingZeros(v);
uint32_t msb = (32 - countLeadingZeros (v)) - 1;
- assert (v != 0 && lsb < 32 && msb < 32 && "Illegal bitfield mask!");
+ assert(v != 0 && lsb < 32 && msb < 32 && "Illegal bitfield mask!");
return lsb | (msb << 5);
}
@@ -1700,3 +1701,15 @@ encodeInstruction(const MCInst &MI, raw_ostream &OS,
}
#include "ARMGenMCCodeEmitter.inc"
+
+MCCodeEmitter *llvm::createARMLEMCCodeEmitter(const MCInstrInfo &MCII,
+ const MCRegisterInfo &MRI,
+ MCContext &Ctx) {
+ return new ARMMCCodeEmitter(MCII, Ctx, true);
+}
+
+MCCodeEmitter *llvm::createARMBEMCCodeEmitter(const MCInstrInfo &MCII,
+ const MCRegisterInfo &MRI,
+ MCContext &Ctx) {
+ return new ARMMCCodeEmitter(MCII, Ctx, false);
+}
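
The special-member changes in this file follow a common C++11 cleanup: the deleted copy operations move into the public section next to the constructor, and the empty destructor body becomes "= default". A minimal standalone sketch of the idiom; the class name is illustrative, not the real emitter type:

// Sketch only: "Emitter" stands in for a non-copyable MC class.
class Emitter {
public:
  Emitter() = default;
  // Publicly deleted copy operations document intent up front and give
  // clearer compiler diagnostics than declarations buried in a private
  // section.
  Emitter(const Emitter &) = delete;
  Emitter &operator=(const Emitter &) = delete;
  // "= default" replaces an empty "~Emitter() override {}" body.
  virtual ~Emitter() = default;
};

int main() { Emitter E; (void)E; return 0; }
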
diff --git a/lib/Target/ARM/MCTargetDesc/ARMMCTargetDesc.cpp b/lib/Target/ARM/MCTargetDesc/ARMMCTargetDesc.cpp
index 9e4d202321e6..477755157040 100644
--- a/lib/Target/ARM/MCTargetDesc/ARMMCTargetDesc.cpp
+++ b/lib/Target/ARM/MCTargetDesc/ARMMCTargetDesc.cpp
@@ -260,18 +260,37 @@ public:
return false;
int64_t Imm = Inst.getOperand(0).getImm();
- // FIXME: This is not right for thumb.
Target = Addr+Imm+8; // In ARM mode the PC is always off by 8 bytes.
return true;
}
};
+class ThumbMCInstrAnalysis : public ARMMCInstrAnalysis {
+public:
+ ThumbMCInstrAnalysis(const MCInstrInfo *Info) : ARMMCInstrAnalysis(Info) {}
+
+ bool evaluateBranch(const MCInst &Inst, uint64_t Addr,
+ uint64_t Size, uint64_t &Target) const override {
+ // We only handle PCRel branches for now.
+ if (Info->get(Inst.getOpcode()).OpInfo[0].OperandType!=MCOI::OPERAND_PCREL)
+ return false;
+
+ int64_t Imm = Inst.getOperand(0).getImm();
+ Target = Addr+Imm+4; // In Thumb mode the PC is always off by 4 bytes.
+ return true;
+ }
+};
+
} // end anonymous namespace
static MCInstrAnalysis *createARMMCInstrAnalysis(const MCInstrInfo *Info) {
return new ARMMCInstrAnalysis(Info);
}
+static MCInstrAnalysis *createThumbMCInstrAnalysis(const MCInstrInfo *Info) {
+ return new ThumbMCInstrAnalysis(Info);
+}
+
// Force static initialization.
extern "C" void LLVMInitializeARMTargetMC() {
for (Target *T : {&getTheARMLETarget(), &getTheARMBETarget(),
@@ -289,9 +308,6 @@ extern "C" void LLVMInitializeARMTargetMC() {
TargetRegistry::RegisterMCSubtargetInfo(*T,
ARM_MC::createARMMCSubtargetInfo);
- // Register the MC instruction analyzer.
- TargetRegistry::RegisterMCInstrAnalysis(*T, createARMMCInstrAnalysis);
-
TargetRegistry::RegisterELFStreamer(*T, createELFStreamer);
TargetRegistry::RegisterCOFFStreamer(*T, createARMWinCOFFStreamer);
TargetRegistry::RegisterMachOStreamer(*T, createARMMachOStreamer);
@@ -313,6 +329,12 @@ extern "C" void LLVMInitializeARMTargetMC() {
TargetRegistry::RegisterMCRelocationInfo(*T, createARMMCRelocationInfo);
}
+ // Register the MC instruction analyzer.
+ for (Target *T : {&getTheARMLETarget(), &getTheARMBETarget()})
+ TargetRegistry::RegisterMCInstrAnalysis(*T, createARMMCInstrAnalysis);
+ for (Target *T : {&getTheThumbLETarget(), &getTheThumbBETarget()})
+ TargetRegistry::RegisterMCInstrAnalysis(*T, createThumbMCInstrAnalysis);
+
// Register the MC Code Emitter
for (Target *T : {&getTheARMLETarget(), &getTheThumbLETarget()})
TargetRegistry::RegisterMCCodeEmitter(*T, createARMLEMCCodeEmitter);
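
The split between ARMMCInstrAnalysis and ThumbMCInstrAnalysis above exists because the architectural PC reads ahead of the branch: 8 bytes in ARM state, 4 in Thumb state. A standalone sketch of the target arithmetic; the helper name is hypothetical, not LLVM API:

#include <cassert>
#include <cstdint>

// Mirrors evaluateBranch: the encoded immediate is relative to the
// pipeline-visible PC, not to the branch instruction's own address.
static uint64_t branchTarget(uint64_t Addr, int64_t Imm, bool IsThumb) {
  return Addr + Imm + (IsThumb ? 4 : 8);
}

int main() {
  // A PC-relative branch at 0x1000 with encoded immediate -16:
  assert(branchTarget(0x1000, -16, /*IsThumb=*/false) == 0xFF8); // ARM
  assert(branchTarget(0x1000, -16, /*IsThumb=*/true) == 0xFF4);  // Thumb
  return 0;
}
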
diff --git a/lib/Target/ARM/MCTargetDesc/ARMMachORelocationInfo.cpp b/lib/Target/ARM/MCTargetDesc/ARMMachORelocationInfo.cpp
index 482bcf902518..34c770440e1b 100644
--- a/lib/Target/ARM/MCTargetDesc/ARMMachORelocationInfo.cpp
+++ b/lib/Target/ARM/MCTargetDesc/ARMMachORelocationInfo.cpp
@@ -1,4 +1,4 @@
-//===-- ARMMachORelocationInfo.cpp ----------------------------------------===//
+//===- ARMMachORelocationInfo.cpp -----------------------------------------===//
//
// The LLVM Compiler Infrastructure
//
@@ -7,17 +7,17 @@
//
//===----------------------------------------------------------------------===//
-#include "MCTargetDesc/ARMMCTargetDesc.h"
#include "ARMMCExpr.h"
-#include "llvm-c/Disassembler.h"
+#include "MCTargetDesc/ARMMCTargetDesc.h"
#include "llvm/MC/MCContext.h"
-#include "llvm/MC/MCExpr.h"
#include "llvm/MC/MCDisassembler/MCRelocationInfo.h"
+#include "llvm/MC/MCExpr.h"
+#include "llvm-c/Disassembler.h"
using namespace llvm;
-using namespace object;
namespace {
+
class ARMMachORelocationInfo : public MCRelocationInfo {
public:
ARMMachORelocationInfo(MCContext &Ctx) : MCRelocationInfo(Ctx) {}
@@ -35,7 +35,8 @@ public:
}
}
};
-} // End unnamed namespace
+
+} // end anonymous namespace
/// createARMMachORelocationInfo - Construct an ARM Mach-O RelocationInfo.
MCRelocationInfo *llvm::createARMMachORelocationInfo(MCContext &Ctx) {
diff --git a/lib/Target/ARM/MCTargetDesc/ARMTargetStreamer.cpp b/lib/Target/ARM/MCTargetDesc/ARMTargetStreamer.cpp
index c0d10c896354..73e563890dd9 100644
--- a/lib/Target/ARM/MCTargetDesc/ARMTargetStreamer.cpp
+++ b/lib/Target/ARM/MCTargetDesc/ARMTargetStreamer.cpp
@@ -10,20 +10,21 @@
// This file implements the ARMTargetStreamer class.
//
//===----------------------------------------------------------------------===//
-#include "llvm/ADT/MapVector.h"
+
#include "llvm/MC/ConstantPools.h"
-#include "llvm/MC/MCContext.h"
#include "llvm/MC/MCExpr.h"
#include "llvm/MC/MCStreamer.h"
using namespace llvm;
+
//
// ARMTargetStreamer Implementation
//
+
ARMTargetStreamer::ARMTargetStreamer(MCStreamer &S)
: MCTargetStreamer(S), ConstantPools(new AssemblerConstantPools()) {}
-ARMTargetStreamer::~ARMTargetStreamer() {}
+ARMTargetStreamer::~ARMTargetStreamer() = default;
// The constant pool handling is shared by all ARMTargetStreamer
// implementations.
@@ -73,5 +74,4 @@ void ARMTargetStreamer::finishAttributeSection() {}
void ARMTargetStreamer::emitInst(uint32_t Inst, char Suffix) {}
void
ARMTargetStreamer::AnnotateTLSDescriptorSequence(const MCSymbolRefExpr *SRE) {}
-
void ARMTargetStreamer::emitThumbSet(MCSymbol *Symbol, const MCExpr *Value) {}
diff --git a/lib/Target/ARM/MCTargetDesc/ARMUnwindOpAsm.cpp b/lib/Target/ARM/MCTargetDesc/ARMUnwindOpAsm.cpp
index 173cc93d44fb..d3ab83bbccbc 100644
--- a/lib/Target/ARM/MCTargetDesc/ARMUnwindOpAsm.cpp
+++ b/lib/Target/ARM/MCTargetDesc/ARMUnwindOpAsm.cpp
@@ -14,12 +14,14 @@
#include "ARMUnwindOpAsm.h"
#include "llvm/Support/ARMEHABI.h"
-#include "llvm/Support/ErrorHandling.h"
#include "llvm/Support/LEB128.h"
+#include "llvm/Support/MathExtras.h"
+#include <cassert>
using namespace llvm;
namespace {
+
/// UnwindOpcodeStreamer - The simple wrapper over SmallVector to emit bytes
/// with MSB to LSB per uint32_t ordering. For example, the first byte will
/// be placed in Vec[3], and the following bytes will be placed in 2, 1, 0,
@@ -27,20 +29,19 @@ namespace {
class UnwindOpcodeStreamer {
private:
SmallVectorImpl<uint8_t> &Vec;
- size_t Pos;
+ size_t Pos = 3;
public:
- UnwindOpcodeStreamer(SmallVectorImpl<uint8_t> &V) : Vec(V), Pos(3) {
- }
+ UnwindOpcodeStreamer(SmallVectorImpl<uint8_t> &V) : Vec(V) {}
/// Emit the byte in MSB to LSB per uint32_t order.
- inline void EmitByte(uint8_t elem) {
+ void EmitByte(uint8_t elem) {
Vec[Pos] = elem;
Pos = (((Pos ^ 0x3u) + 1) ^ 0x3u);
}
/// Emit the size prefix.
- inline void EmitSize(size_t Size) {
+ void EmitSize(size_t Size) {
size_t SizeInWords = (Size + 3) / 4;
assert(SizeInWords <= 0x100u &&
"Only 256 additional words are allowed for unwind opcodes");
@@ -48,19 +49,20 @@ namespace {
}
/// Emit the personality index prefix.
- inline void EmitPersonalityIndex(unsigned PI) {
+ void EmitPersonalityIndex(unsigned PI) {
assert(PI < ARM::EHABI::NUM_PERSONALITY_INDEX &&
"Invalid personality prefix");
EmitByte(ARM::EHABI::EHT_COMPACT | PI);
}
/// Fill the rest of the bytes with the FINISH opcode.
- inline void FillFinishOpcode() {
+ void FillFinishOpcode() {
while (Pos < Vec.size())
EmitByte(ARM::EHABI::UNWIND_OPCODE_FINISH);
}
};
-}
+
+} // end anonymous namespace
void UnwindOpcodeAssembler::EmitRegSave(uint32_t RegSave) {
if (RegSave == 0u)
@@ -153,7 +155,6 @@ void UnwindOpcodeAssembler::EmitSPOffset(int64_t Offset) {
void UnwindOpcodeAssembler::Finalize(unsigned &PersonalityIndex,
SmallVectorImpl<uint8_t> &Result) {
-
UnwindOpcodeStreamer OpStreamer(Result);
if (HasPersonality) {
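
The Pos update in UnwindOpcodeStreamer is worth unpacking: starting from 3, the expression ((Pos ^ 0x3) + 1) ^ 0x3 visits 3,2,1,0 then 7,6,5,4 and so on, which is exactly the MSB-to-LSB-within-each-word order the class comment promises. A standalone demonstration:

#include <cstddef>
#include <cstdio>

int main() {
  std::size_t Pos = 3;
  for (int i = 0; i < 8; ++i) {
    std::printf("%zu ", Pos); // prints: 3 2 1 0 7 6 5 4
    // XOR with 3 converts the within-word index to ascending order,
    // +1 steps to the next byte, and the second XOR converts back.
    Pos = ((Pos ^ 0x3u) + 1) ^ 0x3u;
  }
  std::printf("\n");
  return 0;
}
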
diff --git a/lib/Target/ARM/MCTargetDesc/ARMUnwindOpAsm.h b/lib/Target/ARM/MCTargetDesc/ARMUnwindOpAsm.h
index e0c113ecfaa3..a7bfbdf4938e 100644
--- a/lib/Target/ARM/MCTargetDesc/ARMUnwindOpAsm.h
+++ b/lib/Target/ARM/MCTargetDesc/ARMUnwindOpAsm.h
@@ -16,8 +16,8 @@
#define LLVM_LIB_TARGET_ARM_MCTARGETDESC_ARMUNWINDOPASM_H
#include "llvm/ADT/SmallVector.h"
-#include "llvm/Support/ARMEHABI.h"
-#include "llvm/Support/DataTypes.h"
+#include <cstddef>
+#include <cstdint>
namespace llvm {
@@ -25,13 +25,12 @@ class MCSymbol;
class UnwindOpcodeAssembler {
private:
- llvm::SmallVector<uint8_t, 32> Ops;
- llvm::SmallVector<unsigned, 8> OpBegins;
- bool HasPersonality;
+ SmallVector<uint8_t, 32> Ops;
+ SmallVector<unsigned, 8> OpBegins;
+ bool HasPersonality = false;
public:
- UnwindOpcodeAssembler()
- : HasPersonality(0) {
+ UnwindOpcodeAssembler() {
OpBegins.push_back(0);
}
@@ -40,12 +39,12 @@ public:
Ops.clear();
OpBegins.clear();
OpBegins.push_back(0);
- HasPersonality = 0;
+ HasPersonality = false;
}
/// Set the personality
void setPersonality(const MCSymbol *Per) {
- HasPersonality = 1;
+ HasPersonality = true;
}
/// Emit unwind opcodes for .save directives
@@ -88,6 +87,6 @@ private:
}
};
-} // namespace llvm
+} // end namespace llvm
-#endif
+#endif // LLVM_LIB_TARGET_ARM_MCTARGETDESC_ARMUNWINDOPASM_H
diff --git a/lib/Target/ARM/MCTargetDesc/ARMWinCOFFObjectWriter.cpp b/lib/Target/ARM/MCTargetDesc/ARMWinCOFFObjectWriter.cpp
index 166c04b41a77..7ae2f864d79d 100644
--- a/lib/Target/ARM/MCTargetDesc/ARMWinCOFFObjectWriter.cpp
+++ b/lib/Target/ARM/MCTargetDesc/ARMWinCOFFObjectWriter.cpp
@@ -10,23 +10,28 @@
#include "MCTargetDesc/ARMFixupKinds.h"
#include "llvm/ADT/Twine.h"
#include "llvm/MC/MCAsmBackend.h"
+#include "llvm/MC/MCExpr.h"
#include "llvm/MC/MCFixup.h"
#include "llvm/MC/MCFixupKindInfo.h"
#include "llvm/MC/MCValue.h"
#include "llvm/MC/MCWinCOFFObjectWriter.h"
#include "llvm/Support/COFF.h"
-#include "llvm/Support/Debug.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/raw_ostream.h"
+#include <cassert>
using namespace llvm;
namespace {
+
class ARMWinCOFFObjectWriter : public MCWinCOFFObjectTargetWriter {
public:
ARMWinCOFFObjectWriter(bool Is64Bit)
: MCWinCOFFObjectTargetWriter(COFF::IMAGE_FILE_MACHINE_ARMNT) {
assert(!Is64Bit && "AArch64 support not yet implemented");
}
- ~ARMWinCOFFObjectWriter() override {}
+
+ ~ARMWinCOFFObjectWriter() override = default;
unsigned getRelocType(const MCValue &Target, const MCFixup &Fixup,
bool IsCrossSection,
@@ -35,6 +40,8 @@ public:
bool recordRelocation(const MCFixup &) const override;
};
+} // end anonymous namespace
+
unsigned ARMWinCOFFObjectWriter::getRelocType(const MCValue &Target,
const MCFixup &Fixup,
bool IsCrossSection,
@@ -79,13 +86,13 @@ unsigned ARMWinCOFFObjectWriter::getRelocType(const MCValue &Target,
bool ARMWinCOFFObjectWriter::recordRelocation(const MCFixup &Fixup) const {
return static_cast<unsigned>(Fixup.getKind()) != ARM::fixup_t2_movt_hi16;
}
-}
namespace llvm {
+
MCObjectWriter *createARMWinCOFFObjectWriter(raw_pwrite_stream &OS,
bool Is64Bit) {
MCWinCOFFObjectTargetWriter *MOTW = new ARMWinCOFFObjectWriter(Is64Bit);
return createWinCOFFObjectWriter(MOTW, OS);
}
-}
+} // end namespace llvm
diff --git a/lib/Target/ARM/Thumb1FrameLowering.cpp b/lib/Target/ARM/Thumb1FrameLowering.cpp
index 9953c61cd89c..fc083b98395b 100644
--- a/lib/Target/ARM/Thumb1FrameLowering.cpp
+++ b/lib/Target/ARM/Thumb1FrameLowering.cpp
@@ -11,14 +11,36 @@
//
//===----------------------------------------------------------------------===//
-#include "Thumb1FrameLowering.h"
+#include "ARMBaseInstrInfo.h"
+#include "ARMBaseRegisterInfo.h"
#include "ARMMachineFunctionInfo.h"
+#include "ARMSubtarget.h"
+#include "MCTargetDesc/ARMBaseInfo.h"
+#include "Thumb1FrameLowering.h"
+#include "Thumb1InstrInfo.h"
+#include "ThumbRegisterInfo.h"
+#include "llvm/ADT/BitVector.h"
+#include "llvm/ADT/SmallSet.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/ADT/STLExtras.h"
#include "llvm/CodeGen/LivePhysRegs.h"
+#include "llvm/CodeGen/MachineBasicBlock.h"
#include "llvm/CodeGen/MachineFrameInfo.h"
#include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/CodeGen/MachineInstr.h"
#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/MachineOperand.h"
#include "llvm/CodeGen/MachineModuleInfo.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/IR/DebugLoc.h"
+#include "llvm/MC/MCDwarf.h"
+#include "llvm/Support/Compiler.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Target/TargetInstrInfo.h"
+#include "llvm/Target/TargetSubtargetInfo.h"
+#include <cassert>
+#include <iterator>
+#include <vector>
using namespace llvm;
@@ -238,9 +260,11 @@ void Thumb1FrameLowering::emitPrologue(MachineFunction &MF,
if (HasFP) {
FramePtrOffsetInBlock +=
MFI.getObjectOffset(FramePtrSpillFI) + GPRCS1Size + ArgRegsSaveSize;
- AddDefaultPred(BuildMI(MBB, MBBI, dl, TII.get(ARM::tADDrSPi), FramePtr)
- .addReg(ARM::SP).addImm(FramePtrOffsetInBlock / 4)
- .setMIFlags(MachineInstr::FrameSetup));
+ BuildMI(MBB, MBBI, dl, TII.get(ARM::tADDrSPi), FramePtr)
+ .addReg(ARM::SP)
+ .addImm(FramePtrOffsetInBlock / 4)
+ .setMIFlags(MachineInstr::FrameSetup)
+ .add(predOps(ARMCC::AL));
if(FramePtrOffsetInBlock) {
CFAOffset += FramePtrOffsetInBlock;
unsigned CFIIndex = MF.addFrameInst(MCCFIInstruction::createDefCfa(
@@ -336,14 +360,19 @@ void Thumb1FrameLowering::emitPrologue(MachineFunction &MF,
// will be allocated after this, so we can still use the base pointer
// to reference locals.
if (RegInfo->hasBasePointer(MF))
- AddDefaultPred(BuildMI(MBB, MBBI, dl, TII.get(ARM::tMOVr), BasePtr)
- .addReg(ARM::SP));
+ BuildMI(MBB, MBBI, dl, TII.get(ARM::tMOVr), BasePtr)
+ .addReg(ARM::SP)
+ .add(predOps(ARMCC::AL));
// If the frame has variable sized objects then the epilogue must restore
// the sp from fp. We can assume there's an FP here since hasFP already
// checks for hasVarSizedObjects.
if (MFI.hasVarSizedObjects())
AFI->setShouldRestoreSPFromFP(true);
+
+ // In some cases, virtual registers have been introduced, e.g. by uses of
+ // emitThumbRegPlusImmInReg.
+ MF.getProperties().reset(MachineFunctionProperties::Property::NoVRegs);
}
static bool isCSRestore(MachineInstr &MI, const MCPhysReg *CSRegs) {
@@ -408,13 +437,13 @@ void Thumb1FrameLowering::emitEpilogue(MachineFunction &MF,
"No scratch register to restore SP from FP!");
emitThumbRegPlusImmediate(MBB, MBBI, dl, ARM::R4, FramePtr, -NumBytes,
TII, *RegInfo);
- AddDefaultPred(BuildMI(MBB, MBBI, dl, TII.get(ARM::tMOVr),
- ARM::SP)
- .addReg(ARM::R4));
+ BuildMI(MBB, MBBI, dl, TII.get(ARM::tMOVr), ARM::SP)
+ .addReg(ARM::R4)
+ .add(predOps(ARMCC::AL));
} else
- AddDefaultPred(BuildMI(MBB, MBBI, dl, TII.get(ARM::tMOVr),
- ARM::SP)
- .addReg(FramePtr));
+ BuildMI(MBB, MBBI, dl, TII.get(ARM::tMOVr), ARM::SP)
+ .addReg(FramePtr)
+ .add(predOps(ARMCC::AL));
} else {
if (MBBI != MBB.end() && MBBI->getOpcode() == ARM::tBX_RET &&
&MBB.front() != &*MBBI && std::prev(MBBI)->getOpcode() == ARM::tPOP) {
@@ -493,12 +522,12 @@ bool Thumb1FrameLowering::emitPopSpecialFixUp(MachineBasicBlock &MBB,
if (!DoIt || MBBI->getOpcode() == ARM::tPOP_RET)
return true;
MachineInstrBuilder MIB =
- AddDefaultPred(
- BuildMI(MBB, MBBI, MBBI->getDebugLoc(), TII.get(ARM::tPOP_RET)));
+ BuildMI(MBB, MBBI, MBBI->getDebugLoc(), TII.get(ARM::tPOP_RET))
+ .add(predOps(ARMCC::AL));
// Copy implicit ops and popped registers, if any.
for (auto MO: MBBI->operands())
if (MO.isReg() && (MO.isImplicit() || MO.isDef()))
- MIB.addOperand(MO);
+ MIB.add(MO);
MIB.addReg(ARM::PC, RegState::Define);
// Erase the old instruction (tBX_RET or tPOP).
MBB.erase(MBBI);
@@ -566,22 +595,23 @@ bool Thumb1FrameLowering::emitPopSpecialFixUp(MachineBasicBlock &MBB,
if (TemporaryReg) {
assert(!PopReg && "Unnecessary MOV is about to be inserted");
PopReg = PopFriendly.find_first();
- AddDefaultPred(BuildMI(MBB, MBBI, dl, TII.get(ARM::tMOVr))
- .addReg(TemporaryReg, RegState::Define)
- .addReg(PopReg, RegState::Kill));
+ BuildMI(MBB, MBBI, dl, TII.get(ARM::tMOVr))
+ .addReg(TemporaryReg, RegState::Define)
+ .addReg(PopReg, RegState::Kill)
+ .add(predOps(ARMCC::AL));
}
if (MBBI != MBB.end() && MBBI->getOpcode() == ARM::tPOP_RET) {
// We couldn't use the direct restoration above, so
// perform the opposite conversion: tPOP_RET to tPOP.
MachineInstrBuilder MIB =
- AddDefaultPred(
- BuildMI(MBB, MBBI, MBBI->getDebugLoc(), TII.get(ARM::tPOP)));
+ BuildMI(MBB, MBBI, MBBI->getDebugLoc(), TII.get(ARM::tPOP))
+ .add(predOps(ARMCC::AL));
bool Popped = false;
for (auto MO: MBBI->operands())
if (MO.isReg() && (MO.isImplicit() || MO.isDef()) &&
MO.getReg() != ARM::PC) {
- MIB.addOperand(MO);
+ MIB.add(MO);
if (!MO.isImplicit())
Popped = true;
}
@@ -590,23 +620,27 @@ bool Thumb1FrameLowering::emitPopSpecialFixUp(MachineBasicBlock &MBB,
MBB.erase(MIB.getInstr());
// Erase the old instruction.
MBB.erase(MBBI);
- MBBI = AddDefaultPred(BuildMI(MBB, MBB.end(), dl, TII.get(ARM::tBX_RET)));
+ MBBI = BuildMI(MBB, MBB.end(), dl, TII.get(ARM::tBX_RET))
+ .add(predOps(ARMCC::AL));
}
assert(PopReg && "Do not know how to get LR");
- AddDefaultPred(BuildMI(MBB, MBBI, dl, TII.get(ARM::tPOP)))
+ BuildMI(MBB, MBBI, dl, TII.get(ARM::tPOP))
+ .add(predOps(ARMCC::AL))
.addReg(PopReg, RegState::Define);
emitSPUpdate(MBB, MBBI, TII, dl, *RegInfo, ArgRegsSaveSize);
- AddDefaultPred(BuildMI(MBB, MBBI, dl, TII.get(ARM::tMOVr))
- .addReg(ARM::LR, RegState::Define)
- .addReg(PopReg, RegState::Kill));
+ BuildMI(MBB, MBBI, dl, TII.get(ARM::tMOVr))
+ .addReg(ARM::LR, RegState::Define)
+ .addReg(PopReg, RegState::Kill)
+ .add(predOps(ARMCC::AL));
if (TemporaryReg)
- AddDefaultPred(BuildMI(MBB, MBBI, dl, TII.get(ARM::tMOVr))
- .addReg(PopReg, RegState::Define)
- .addReg(TemporaryReg, RegState::Kill));
+ BuildMI(MBB, MBBI, dl, TII.get(ARM::tMOVr))
+ .addReg(PopReg, RegState::Define)
+ .addReg(TemporaryReg, RegState::Kill)
+ .add(predOps(ARMCC::AL));
return true;
}
@@ -667,8 +701,8 @@ spillCalleeSavedRegisters(MachineBasicBlock &MBB,
// Push the low registers and lr
if (!LoRegsToSave.empty()) {
- MachineInstrBuilder MIB = BuildMI(MBB, MI, DL, TII.get(ARM::tPUSH));
- AddDefaultPred(MIB);
+ MachineInstrBuilder MIB =
+ BuildMI(MBB, MI, DL, TII.get(ARM::tPUSH)).add(predOps(ARMCC::AL));
for (unsigned Reg : {ARM::R4, ARM::R5, ARM::R6, ARM::R7, ARM::LR}) {
if (LoRegsToSave.count(Reg)) {
bool isKill = !MF.getRegInfo().isLiveIn(Reg);
@@ -708,8 +742,8 @@ spillCalleeSavedRegisters(MachineBasicBlock &MBB,
findNextOrderedReg(std::begin(AllCopyRegs), CopyRegs, AllCopyRegsEnd);
// Create the PUSH, but don't insert it yet (the MOVs need to come first).
- MachineInstrBuilder PushMIB = BuildMI(MF, DL, TII.get(ARM::tPUSH));
- AddDefaultPred(PushMIB);
+ MachineInstrBuilder PushMIB =
+ BuildMI(MF, DL, TII.get(ARM::tPUSH)).add(predOps(ARMCC::AL));
SmallVector<unsigned, 4> RegsToPush;
while (HiRegToSave != AllHighRegsEnd && CopyReg != AllCopyRegsEnd) {
@@ -719,11 +753,10 @@ spillCalleeSavedRegisters(MachineBasicBlock &MBB,
MBB.addLiveIn(*HiRegToSave);
// Emit a MOV from the high reg to the low reg.
- MachineInstrBuilder MIB =
- BuildMI(MBB, MI, DL, TII.get(ARM::tMOVr));
- MIB.addReg(*CopyReg, RegState::Define);
- MIB.addReg(*HiRegToSave, getKillRegState(isKill));
- AddDefaultPred(MIB);
+ BuildMI(MBB, MI, DL, TII.get(ARM::tMOVr))
+ .addReg(*CopyReg, RegState::Define)
+ .addReg(*HiRegToSave, getKillRegState(isKill))
+ .add(predOps(ARMCC::AL));
// Record the register that must be added to the PUSH.
RegsToPush.push_back(*CopyReg);
@@ -735,7 +768,7 @@ spillCalleeSavedRegisters(MachineBasicBlock &MBB,
}
// Add the low registers to the PUSH, in ascending order.
- for (unsigned Reg : reverse(RegsToPush))
+ for (unsigned Reg : llvm::reverse(RegsToPush))
PushMIB.addReg(Reg, RegState::Kill);
// Insert the PUSH instruction after the MOVs.
@@ -817,19 +850,18 @@ restoreCalleeSavedRegisters(MachineBasicBlock &MBB,
findNextOrderedReg(std::begin(AllCopyRegs), CopyRegs, AllCopyRegsEnd);
// Create the POP instruction.
- MachineInstrBuilder PopMIB = BuildMI(MBB, MI, DL, TII.get(ARM::tPOP));
- AddDefaultPred(PopMIB);
+ MachineInstrBuilder PopMIB =
+ BuildMI(MBB, MI, DL, TII.get(ARM::tPOP)).add(predOps(ARMCC::AL));
while (HiRegToRestore != AllHighRegsEnd && CopyReg != AllCopyRegsEnd) {
// Add the low register to the POP.
PopMIB.addReg(*CopyReg, RegState::Define);
// Create the MOV from low to high register.
- MachineInstrBuilder MIB =
- BuildMI(MBB, MI, DL, TII.get(ARM::tMOVr));
- MIB.addReg(*HiRegToRestore, RegState::Define);
- MIB.addReg(*CopyReg, RegState::Kill);
- AddDefaultPred(MIB);
+ BuildMI(MBB, MI, DL, TII.get(ARM::tMOVr))
+ .addReg(*HiRegToRestore, RegState::Define)
+ .addReg(*CopyReg, RegState::Kill)
+ .add(predOps(ARMCC::AL));
CopyReg = findNextOrderedReg(++CopyReg, CopyRegs, AllCopyRegsEnd);
HiRegToRestore =
@@ -837,11 +869,8 @@ restoreCalleeSavedRegisters(MachineBasicBlock &MBB,
}
}
-
-
-
- MachineInstrBuilder MIB = BuildMI(MF, DL, TII.get(ARM::tPOP));
- AddDefaultPred(MIB);
+ MachineInstrBuilder MIB =
+ BuildMI(MF, DL, TII.get(ARM::tPOP)).add(predOps(ARMCC::AL));
bool NeedsPop = false;
for (unsigned i = CSI.size(); i != 0; --i) {
@@ -859,6 +888,16 @@ restoreCalleeSavedRegisters(MachineBasicBlock &MBB,
// ARMv4T requires BX, see emitEpilogue
if (!STI.hasV5TOps())
continue;
+ // Tailcall optimization failed; change TCRETURN to a tBL
+ if (MI->getOpcode() == ARM::TCRETURNdi ||
+ MI->getOpcode() == ARM::TCRETURNri) {
+ unsigned Opcode = MI->getOpcode() == ARM::TCRETURNdi
+ ? ARM::tBL : ARM::tBLXr;
+ MachineInstrBuilder BL = BuildMI(MF, DL, TII.get(Opcode));
+ BL.add(predOps(ARMCC::AL));
+ BL.add(MI->getOperand(0));
+ MBB.insert(MI, &*BL);
+ }
Reg = ARM::PC;
(*MIB).setDesc(TII.get(ARM::tPOP_RET));
if (MI != MBB.end())
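
The dominant change in this file is mechanical: AddDefaultPred(MIB) appended the two predicate operands (the ARMCC::AL condition immediate and a null predicate register) by wrapping the builder expression, while the new predOps(ARMCC::AL) returns those operands so they read in source order at the end of the .add* chain. A reduced sketch under assumed shapes; Operand, Builder, and both helpers below are illustrative stand-ins, not the LLVM types:

#include <array>
#include <cstdint>
#include <vector>

struct Operand { bool IsReg; int64_t Val; };

struct Builder {
  std::vector<Operand> Ops;
  Builder &addImm(int64_t V) { Ops.push_back({false, V}); return *this; }
  Builder &addReg(unsigned R) { Ops.push_back({true, R}); return *this; }
  Builder &add(const std::array<Operand, 2> &A) {
    for (const Operand &O : A) Ops.push_back(O);
    return *this;
  }
};

constexpr unsigned CondAL = 14; // ARMCC::AL encoding

// Old style: the helper wraps the whole builder expression.
Builder &AddDefaultPred(Builder &B) { return B.addImm(CondAL).addReg(0); }

// New style: the helper returns operands; the builder stays on the left.
std::array<Operand, 2> predOps(unsigned Cond, unsigned PredReg = 0) {
  return {{{false, Cond}, {true, PredReg}}};
}

int main() {
  Builder Old, New;
  AddDefaultPred(Old.addReg(13));      // reads inside-out
  New.addReg(13).add(predOps(CondAL)); // reads left-to-right
  return Old.Ops.size() == New.Ops.size() ? 0 : 1;
}
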
diff --git a/lib/Target/ARM/Thumb1InstrInfo.cpp b/lib/Target/ARM/Thumb1InstrInfo.cpp
index 4b4fbaab28d9..27bff4d75acf 100644
--- a/lib/Target/ARM/Thumb1InstrInfo.cpp
+++ b/lib/Target/ARM/Thumb1InstrInfo.cpp
@@ -50,20 +50,29 @@ void Thumb1InstrInfo::copyPhysReg(MachineBasicBlock &MBB,
if (st.hasV6Ops() || ARM::hGPRRegClass.contains(SrcReg)
|| !ARM::tGPRRegClass.contains(DestReg))
- AddDefaultPred(BuildMI(MBB, I, DL, get(ARM::tMOVr), DestReg)
- .addReg(SrcReg, getKillRegState(KillSrc)));
+ BuildMI(MBB, I, DL, get(ARM::tMOVr), DestReg)
+ .addReg(SrcReg, getKillRegState(KillSrc))
+ .add(predOps(ARMCC::AL));
else {
- // FIXME: The performance consequences of this are going to be atrocious.
- // Some things to try that should be better:
- // * 'mov hi, $src; mov $dst, hi', with hi as either r10 or r11
- // * 'movs $dst, $src' if cpsr isn't live
- // See: http://lists.llvm.org/pipermail/llvm-dev/2014-August/075998.html
+ // FIXME: Can also use 'mov hi, $src; mov $dst, hi',
+ // with hi as either r10 or r11.
+
+ const TargetRegisterInfo *RegInfo = st.getRegisterInfo();
+ if (MBB.computeRegisterLiveness(RegInfo, ARM::CPSR, I)
+ == MachineBasicBlock::LQR_Dead) {
+ BuildMI(MBB, I, DL, get(ARM::tMOVSr), DestReg)
+ .addReg(SrcReg, getKillRegState(KillSrc))
+ ->addRegisterDead(ARM::CPSR, RegInfo);
+ return;
+ }
// 'MOV lo, lo' is unpredictable on < v6, so use the stack to do it
- AddDefaultPred(BuildMI(MBB, I, DL, get(ARM::tPUSH)))
- .addReg(SrcReg, getKillRegState(KillSrc));
- AddDefaultPred(BuildMI(MBB, I, DL, get(ARM::tPOP)))
- .addReg(DestReg, getDefRegState(true));
+ BuildMI(MBB, I, DL, get(ARM::tPUSH))
+ .add(predOps(ARMCC::AL))
+ .addReg(SrcReg, getKillRegState(KillSrc));
+ BuildMI(MBB, I, DL, get(ARM::tPOP))
+ .add(predOps(ARMCC::AL))
+ .addReg(DestReg, getDefRegState(true));
}
}
@@ -87,9 +96,12 @@ storeRegToStackSlot(MachineBasicBlock &MBB, MachineBasicBlock::iterator I,
MachineMemOperand *MMO = MF.getMachineMemOperand(
MachinePointerInfo::getFixedStack(MF, FI), MachineMemOperand::MOStore,
MFI.getObjectSize(FI), MFI.getObjectAlignment(FI));
- AddDefaultPred(BuildMI(MBB, I, DL, get(ARM::tSTRspi))
- .addReg(SrcReg, getKillRegState(isKill))
- .addFrameIndex(FI).addImm(0).addMemOperand(MMO));
+ BuildMI(MBB, I, DL, get(ARM::tSTRspi))
+ .addReg(SrcReg, getKillRegState(isKill))
+ .addFrameIndex(FI)
+ .addImm(0)
+ .addMemOperand(MMO)
+ .add(predOps(ARMCC::AL));
}
}
@@ -113,8 +125,11 @@ loadRegFromStackSlot(MachineBasicBlock &MBB, MachineBasicBlock::iterator I,
MachineMemOperand *MMO = MF.getMachineMemOperand(
MachinePointerInfo::getFixedStack(MF, FI), MachineMemOperand::MOLoad,
MFI.getObjectSize(FI), MFI.getObjectAlignment(FI));
- AddDefaultPred(BuildMI(MBB, I, DL, get(ARM::tLDRspi), DestReg)
- .addFrameIndex(FI).addImm(0).addMemOperand(MMO));
+ BuildMI(MBB, I, DL, get(ARM::tLDRspi), DestReg)
+ .addFrameIndex(FI)
+ .addImm(0)
+ .addMemOperand(MMO)
+ .add(predOps(ARMCC::AL));
}
}
diff --git a/lib/Target/ARM/Thumb2ITBlockPass.cpp b/lib/Target/ARM/Thumb2ITBlockPass.cpp
index d01fc8c40ddf..04bdd91b53e6 100644
--- a/lib/Target/ARM/Thumb2ITBlockPass.cpp
+++ b/lib/Target/ARM/Thumb2ITBlockPass.cpp
@@ -9,13 +9,26 @@
#include "ARM.h"
#include "ARMMachineFunctionInfo.h"
+#include "ARMSubtarget.h"
+#include "MCTargetDesc/ARMBaseInfo.h"
#include "Thumb2InstrInfo.h"
#include "llvm/ADT/SmallSet.h"
+#include "llvm/ADT/SmallVector.h"
#include "llvm/ADT/Statistic.h"
+#include "llvm/ADT/StringRef.h"
+#include "llvm/CodeGen/MachineBasicBlock.h"
+#include "llvm/CodeGen/MachineFunction.h"
#include "llvm/CodeGen/MachineFunctionPass.h"
#include "llvm/CodeGen/MachineInstr.h"
#include "llvm/CodeGen/MachineInstrBuilder.h"
#include "llvm/CodeGen/MachineInstrBundle.h"
+#include "llvm/CodeGen/MachineOperand.h"
+#include "llvm/IR/DebugLoc.h"
+#include "llvm/MC/MCInstrDesc.h"
+#include "llvm/MC/MCRegisterInfo.h"
+#include <cassert>
+#include <new>
+
using namespace llvm;
#define DEBUG_TYPE "thumb2-it"
@@ -24,16 +37,18 @@ STATISTIC(NumITs, "Number of IT blocks inserted");
STATISTIC(NumMovedInsts, "Number of predicated instructions moved");
namespace {
+
class Thumb2ITBlockPass : public MachineFunctionPass {
public:
static char ID;
- Thumb2ITBlockPass() : MachineFunctionPass(ID) {}
bool restrictIT;
const Thumb2InstrInfo *TII;
const TargetRegisterInfo *TRI;
ARMFunctionInfo *AFI;
+ Thumb2ITBlockPass() : MachineFunctionPass(ID) {}
+
bool runOnMachineFunction(MachineFunction &Fn) override;
MachineFunctionProperties getRequiredProperties() const override {
@@ -52,8 +67,10 @@ namespace {
SmallSet<unsigned, 4> &Uses);
bool InsertITInstructions(MachineBasicBlock &MBB);
};
+
char Thumb2ITBlockPass::ID = 0;
-}
+
+} // end anonymous namespace
/// TrackDefUses - Tracking what registers are being defined and used by
/// instructions in the IT block. This also tracks "dependencies", i.e. uses
diff --git a/lib/Target/ARM/Thumb2InstrInfo.cpp b/lib/Target/ARM/Thumb2InstrInfo.cpp
index 1c731d669eda..818ba85c7d40 100644
--- a/lib/Target/ARM/Thumb2InstrInfo.cpp
+++ b/lib/Target/ARM/Thumb2InstrInfo.cpp
@@ -117,8 +117,9 @@ void Thumb2InstrInfo::copyPhysReg(MachineBasicBlock &MBB,
if (!ARM::GPRRegClass.contains(DestReg, SrcReg))
return ARMBaseInstrInfo::copyPhysReg(MBB, I, DL, DestReg, SrcReg, KillSrc);
- AddDefaultPred(BuildMI(MBB, I, DL, get(ARM::tMOVr), DestReg)
- .addReg(SrcReg, getKillRegState(KillSrc)));
+ BuildMI(MBB, I, DL, get(ARM::tMOVr), DestReg)
+ .addReg(SrcReg, getKillRegState(KillSrc))
+ .add(predOps(ARMCC::AL));
}
void Thumb2InstrInfo::
@@ -138,9 +139,12 @@ storeRegToStackSlot(MachineBasicBlock &MBB, MachineBasicBlock::iterator I,
if (RC == &ARM::GPRRegClass || RC == &ARM::tGPRRegClass ||
RC == &ARM::tcGPRRegClass || RC == &ARM::rGPRRegClass ||
RC == &ARM::GPRnopcRegClass) {
- AddDefaultPred(BuildMI(MBB, I, DL, get(ARM::t2STRi12))
- .addReg(SrcReg, getKillRegState(isKill))
- .addFrameIndex(FI).addImm(0).addMemOperand(MMO));
+ BuildMI(MBB, I, DL, get(ARM::t2STRi12))
+ .addReg(SrcReg, getKillRegState(isKill))
+ .addFrameIndex(FI)
+ .addImm(0)
+ .addMemOperand(MMO)
+ .add(predOps(ARMCC::AL));
return;
}
@@ -156,8 +160,7 @@ storeRegToStackSlot(MachineBasicBlock &MBB, MachineBasicBlock::iterator I,
MachineInstrBuilder MIB = BuildMI(MBB, I, DL, get(ARM::t2STRDi8));
AddDReg(MIB, SrcReg, ARM::gsub_0, getKillRegState(isKill), TRI);
AddDReg(MIB, SrcReg, ARM::gsub_1, 0, TRI);
- MIB.addFrameIndex(FI).addImm(0).addMemOperand(MMO);
- AddDefaultPred(MIB);
+ MIB.addFrameIndex(FI).addImm(0).addMemOperand(MMO).add(predOps(ARMCC::AL));
return;
}
@@ -180,8 +183,11 @@ loadRegFromStackSlot(MachineBasicBlock &MBB, MachineBasicBlock::iterator I,
if (RC == &ARM::GPRRegClass || RC == &ARM::tGPRRegClass ||
RC == &ARM::tcGPRRegClass || RC == &ARM::rGPRRegClass ||
RC == &ARM::GPRnopcRegClass) {
- AddDefaultPred(BuildMI(MBB, I, DL, get(ARM::t2LDRi12), DestReg)
- .addFrameIndex(FI).addImm(0).addMemOperand(MMO));
+ BuildMI(MBB, I, DL, get(ARM::t2LDRi12), DestReg)
+ .addFrameIndex(FI)
+ .addImm(0)
+ .addMemOperand(MMO)
+ .add(predOps(ARMCC::AL));
return;
}
@@ -198,8 +204,7 @@ loadRegFromStackSlot(MachineBasicBlock &MBB, MachineBasicBlock::iterator I,
MachineInstrBuilder MIB = BuildMI(MBB, I, DL, get(ARM::t2LDRDi8));
AddDReg(MIB, DestReg, ARM::gsub_0, RegState::DefineNoRead, TRI);
AddDReg(MIB, DestReg, ARM::gsub_1, RegState::DefineNoRead, TRI);
- MIB.addFrameIndex(FI).addImm(0).addMemOperand(MMO);
- AddDefaultPred(MIB);
+ MIB.addFrameIndex(FI).addImm(0).addMemOperand(MMO).add(predOps(ARMCC::AL));
if (TargetRegisterInfo::isPhysicalRegister(DestReg))
MIB.addReg(DestReg, RegState::ImplicitDefine);
@@ -259,10 +264,11 @@ void llvm::emitT2RegPlusImmediate(MachineBasicBlock &MBB,
if (Fits) {
if (isSub) {
BuildMI(MBB, MBBI, dl, TII.get(ARM::t2SUBrr), DestReg)
- .addReg(BaseReg)
- .addReg(DestReg, RegState::Kill)
- .addImm((unsigned)Pred).addReg(PredReg).addReg(0)
- .setMIFlags(MIFlags);
+ .addReg(BaseReg)
+ .addReg(DestReg, RegState::Kill)
+ .add(predOps(Pred, PredReg))
+ .add(condCodeOp())
+ .setMIFlags(MIFlags);
} else {
// Here we know that DestReg is not SP but we do not
// know anything about BaseReg. t2ADDrr is an invalid
@@ -270,10 +276,11 @@ void llvm::emitT2RegPlusImmediate(MachineBasicBlock &MBB,
// is fine if SP is the first argument. To be sure we
// do not generate invalid encoding, put BaseReg first.
BuildMI(MBB, MBBI, dl, TII.get(ARM::t2ADDrr), DestReg)
- .addReg(BaseReg)
- .addReg(DestReg, RegState::Kill)
- .addImm((unsigned)Pred).addReg(PredReg).addReg(0)
- .setMIFlags(MIFlags);
+ .addReg(BaseReg)
+ .addReg(DestReg, RegState::Kill)
+ .add(predOps(Pred, PredReg))
+ .add(condCodeOp())
+ .setMIFlags(MIFlags);
}
return;
}
@@ -284,8 +291,10 @@ void llvm::emitT2RegPlusImmediate(MachineBasicBlock &MBB,
unsigned Opc = 0;
if (DestReg == ARM::SP && BaseReg != ARM::SP) {
// mov sp, rn. Note t2MOVr cannot be used.
- AddDefaultPred(BuildMI(MBB, MBBI, dl, TII.get(ARM::tMOVr),DestReg)
- .addReg(BaseReg).setMIFlags(MIFlags));
+ BuildMI(MBB, MBBI, dl, TII.get(ARM::tMOVr), DestReg)
+ .addReg(BaseReg)
+ .setMIFlags(MIFlags)
+ .add(predOps(ARMCC::AL));
BaseReg = ARM::SP;
continue;
}
@@ -296,8 +305,11 @@ void llvm::emitT2RegPlusImmediate(MachineBasicBlock &MBB,
if (DestReg == ARM::SP && (ThisVal < ((1 << 7)-1) * 4)) {
assert((ThisVal & 3) == 0 && "Stack update is not multiple of 4?");
Opc = isSub ? ARM::tSUBspi : ARM::tADDspi;
- AddDefaultPred(BuildMI(MBB, MBBI, dl, TII.get(Opc), DestReg)
- .addReg(BaseReg).addImm(ThisVal/4).setMIFlags(MIFlags));
+ BuildMI(MBB, MBBI, dl, TII.get(Opc), DestReg)
+ .addReg(BaseReg)
+ .addImm(ThisVal / 4)
+ .setMIFlags(MIFlags)
+ .add(predOps(ARMCC::AL));
NumBytes = 0;
continue;
}
@@ -334,12 +346,13 @@ void llvm::emitT2RegPlusImmediate(MachineBasicBlock &MBB,
}
// Build the new ADD / SUB.
- MachineInstrBuilder MIB =
- AddDefaultPred(BuildMI(MBB, MBBI, dl, TII.get(Opc), DestReg)
- .addReg(BaseReg, RegState::Kill)
- .addImm(ThisVal)).setMIFlags(MIFlags);
+ MachineInstrBuilder MIB = BuildMI(MBB, MBBI, dl, TII.get(Opc), DestReg)
+ .addReg(BaseReg, RegState::Kill)
+ .addImm(ThisVal)
+ .add(predOps(ARMCC::AL))
+ .setMIFlags(MIFlags);
if (HasCCOut)
- AddDefaultCC(MIB);
+ MIB.add(condCodeOp());
BaseReg = DestReg;
}
@@ -474,7 +487,7 @@ bool llvm::rewriteT2FrameIndex(MachineInstr &MI, unsigned FrameRegIdx,
do MI.RemoveOperand(FrameRegIdx+1);
while (MI.getNumOperands() > FrameRegIdx+1);
MachineInstrBuilder MIB(*MI.getParent()->getParent(), &MI);
- AddDefaultPred(MIB);
+ MIB.add(predOps(ARMCC::AL));
return true;
}
diff --git a/lib/Target/ARM/Thumb2SizeReduction.cpp b/lib/Target/ARM/Thumb2SizeReduction.cpp
index 8208e7e24770..c90475c28db7 100644
--- a/lib/Target/ARM/Thumb2SizeReduction.cpp
+++ b/lib/Target/ARM/Thumb2SizeReduction.cpp
@@ -10,20 +10,38 @@
#include "ARM.h"
#include "ARMBaseInstrInfo.h"
#include "ARMSubtarget.h"
-#include "MCTargetDesc/ARMAddressingModes.h"
+#include "MCTargetDesc/ARMBaseInfo.h"
#include "Thumb2InstrInfo.h"
#include "llvm/ADT/DenseMap.h"
#include "llvm/ADT/PostOrderIterator.h"
+#include "llvm/ADT/SmallSet.h"
+#include "llvm/ADT/SmallVector.h"
#include "llvm/ADT/Statistic.h"
+#include "llvm/ADT/STLExtras.h"
+#include "llvm/ADT/StringRef.h"
+#include "llvm/CodeGen/MachineBasicBlock.h"
+#include "llvm/CodeGen/MachineFunction.h"
#include "llvm/CodeGen/MachineFunctionPass.h"
#include "llvm/CodeGen/MachineInstr.h"
#include "llvm/CodeGen/MachineInstrBuilder.h"
-#include "llvm/IR/Function.h" // To access Function attributes
+#include "llvm/CodeGen/MachineOperand.h"
+#include "llvm/IR/DebugLoc.h"
+#include "llvm/IR/Function.h"
+#include "llvm/MC/MCInstrDesc.h"
+#include "llvm/MC/MCRegisterInfo.h"
#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/Compiler.h"
#include "llvm/Support/Debug.h"
+#include "llvm/Support/ErrorHandling.h"
#include "llvm/Support/raw_ostream.h"
-#include "llvm/Target/TargetMachine.h"
+#include "llvm/Target/TargetInstrInfo.h"
+#include <algorithm>
+#include <cassert>
+#include <cstdint>
+#include <functional>
+#include <iterator>
#include <utility>
+
using namespace llvm;
#define DEBUG_TYPE "t2-reduce-size"
@@ -40,6 +58,7 @@ static cl::opt<int> ReduceLimitLdSt("t2-reduce-limit3",
cl::init(-1), cl::Hidden);
namespace {
+
/// ReduceTable - A static table with information on mapping from wide
/// opcodes to narrow
struct ReduceEntry {
@@ -139,11 +158,12 @@ namespace {
class Thumb2SizeReduce : public MachineFunctionPass {
public:
static char ID;
- Thumb2SizeReduce(std::function<bool(const Function &)> Ftor);
const Thumb2InstrInfo *TII;
const ARMSubtarget *STI;
+ Thumb2SizeReduce(std::function<bool(const Function &)> Ftor);
+
bool runOnMachineFunction(MachineFunction &MF) override;
MachineFunctionProperties getRequiredProperties() const override {
@@ -201,19 +221,21 @@ namespace {
struct MBBInfo {
// The flags leaving this block have high latency.
- bool HighLatencyCPSR;
+ bool HighLatencyCPSR = false;
// Has this block been visited yet?
- bool Visited;
+ bool Visited = false;
- MBBInfo() : HighLatencyCPSR(false), Visited(false) {}
+ MBBInfo() = default;
};
SmallVector<MBBInfo, 8> BlockInfo;
std::function<bool(const Function &)> PredicateFtor;
};
+
char Thumb2SizeReduce::ID = 0;
-}
+
+} // end anonymous namespace
Thumb2SizeReduce::Thumb2SizeReduce(std::function<bool(const Function &)> Ftor)
: MachineFunctionPass(ID), PredicateFtor(std::move(Ftor)) {
@@ -490,14 +512,13 @@ Thumb2SizeReduce::ReduceLoadStore(MachineBasicBlock &MBB, MachineInstr *MI,
isLdStMul = true;
break;
}
- case ARM::t2STMIA: {
+ case ARM::t2STMIA:
// If the base register is killed, we don't care what its value is after the
// instruction, so we can use an updating STMIA.
if (!MI->getOperand(0).isKill())
return false;
break;
- }
case ARM::t2LDMIA_RET: {
unsigned BaseReg = MI->getOperand(1).getReg();
if (BaseReg != ARM::SP)
@@ -562,8 +583,8 @@ Thumb2SizeReduce::ReduceLoadStore(MachineBasicBlock &MBB, MachineInstr *MI,
MIB.addReg(MI->getOperand(0).getReg(), RegState::Define | RegState::Dead);
if (!isLdStMul) {
- MIB.addOperand(MI->getOperand(0));
- MIB.addOperand(MI->getOperand(1));
+ MIB.add(MI->getOperand(0));
+ MIB.add(MI->getOperand(1));
if (HasImmOffset)
MIB.addImm(OffsetImm / Scale);
@@ -577,7 +598,7 @@ Thumb2SizeReduce::ReduceLoadStore(MachineBasicBlock &MBB, MachineInstr *MI,
// Transfer the rest of operands.
for (unsigned e = MI->getNumOperands(); OpNum != e; ++OpNum)
- MIB.addOperand(MI->getOperand(OpNum));
+ MIB.add(MI->getOperand(OpNum));
// Transfer memoperands.
MIB->setMemRefs(MI->memoperands_begin(), MI->memoperands_end());
@@ -621,12 +642,13 @@ Thumb2SizeReduce::ReduceSpecial(MachineBasicBlock &MBB, MachineInstr *MI,
MI->getOperand(MCID.getNumOperands()-1).getReg() == ARM::CPSR)
return false;
- MachineInstrBuilder MIB = BuildMI(MBB, MI, MI->getDebugLoc(),
- TII->get(ARM::tADDrSPi))
- .addOperand(MI->getOperand(0))
- .addOperand(MI->getOperand(1))
- .addImm(Imm / 4); // The tADDrSPi has an implied scale by four.
- AddDefaultPred(MIB);
+ MachineInstrBuilder MIB =
+ BuildMI(MBB, MI, MI->getDebugLoc(),
+ TII->get(ARM::tADDrSPi))
+ .add(MI->getOperand(0))
+ .add(MI->getOperand(1))
+ .addImm(Imm / 4) // The tADDrSPi has an implied scale by four.
+ .add(predOps(ARMCC::AL));
// Transfer MI flags.
MIB.setMIFlags(MI->getFlags());
@@ -652,11 +674,10 @@ Thumb2SizeReduce::ReduceSpecial(MachineBasicBlock &MBB, MachineInstr *MI,
if (getInstrPredicate(*MI, PredReg) == ARMCC::AL) {
switch (Opc) {
default: break;
- case ARM::t2ADDSri: {
+ case ARM::t2ADDSri:
if (ReduceTo2Addr(MBB, MI, Entry, LiveCPSR, IsSelfLoop))
return true;
LLVM_FALLTHROUGH;
- }
case ARM::t2ADDSrr:
return ReduceToNarrow(MBB, MI, Entry, LiveCPSR, IsSelfLoop);
}
@@ -698,7 +719,6 @@ bool
Thumb2SizeReduce::ReduceTo2Addr(MachineBasicBlock &MBB, MachineInstr *MI,
const ReduceEntry &Entry,
bool LiveCPSR, bool IsSelfLoop) {
-
if (ReduceLimit2Addr != -1 && ((int)Num2Addrs >= ReduceLimit2Addr))
return false;
@@ -785,13 +805,9 @@ Thumb2SizeReduce::ReduceTo2Addr(MachineBasicBlock &MBB, MachineInstr *MI,
// Add the 16-bit instruction.
DebugLoc dl = MI->getDebugLoc();
MachineInstrBuilder MIB = BuildMI(MBB, MI, dl, NewMCID);
- MIB.addOperand(MI->getOperand(0));
- if (NewMCID.hasOptionalDef()) {
- if (HasCC)
- AddDefaultT1CC(MIB, CCDead);
- else
- AddNoT1CC(MIB);
- }
+ MIB.add(MI->getOperand(0));
+ if (NewMCID.hasOptionalDef())
+ MIB.add(HasCC ? t1CondCodeOp(CCDead) : condCodeOp());
// Transfer the rest of operands.
unsigned NumOps = MCID.getNumOperands();
@@ -800,7 +816,7 @@ Thumb2SizeReduce::ReduceTo2Addr(MachineBasicBlock &MBB, MachineInstr *MI,
continue;
if (SkipPred && MCID.OpInfo[i].isPredicate())
continue;
- MIB.addOperand(MI->getOperand(i));
+ MIB.add(MI->getOperand(i));
}
// Transfer MI flags.
@@ -880,13 +896,9 @@ Thumb2SizeReduce::ReduceToNarrow(MachineBasicBlock &MBB, MachineInstr *MI,
// Add the 16-bit instruction.
DebugLoc dl = MI->getDebugLoc();
MachineInstrBuilder MIB = BuildMI(MBB, MI, dl, NewMCID);
- MIB.addOperand(MI->getOperand(0));
- if (NewMCID.hasOptionalDef()) {
- if (HasCC)
- AddDefaultT1CC(MIB, CCDead);
- else
- AddNoT1CC(MIB);
- }
+ MIB.add(MI->getOperand(0));
+ if (NewMCID.hasOptionalDef())
+ MIB.add(HasCC ? t1CondCodeOp(CCDead) : condCodeOp());
// Transfer the rest of operands.
unsigned NumOps = MCID.getNumOperands();
@@ -909,10 +921,10 @@ Thumb2SizeReduce::ReduceToNarrow(MachineBasicBlock &MBB, MachineInstr *MI,
// Skip implicit def of CPSR. Either it's modeled as an optional
// def now or it's already an implicit def on the new instruction.
continue;
- MIB.addOperand(MO);
+ MIB.add(MO);
}
if (!MCID.isPredicable() && NewMCID.isPredicable())
- AddDefaultPred(MIB);
+ MIB.add(predOps(ARMCC::AL));
// Transfer MI flags.
MIB.setMIFlags(MI->getFlags());
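
Two hunks above collapse the old AddDefaultT1CC/AddNoT1CC branch into a single conditional operand: when the narrow opcode has an optional CPSR definition, the builder receives either a (possibly dead) CPSR def from t1CondCodeOp or a no-register placeholder from condCodeOp. A sketch under assumed operand semantics; the struct and both helpers are illustrative, not the LLVM definitions:

#include <cassert>

struct Operand {
  unsigned Reg;  // 0 stands for "no register"
  bool IsDead;
};

// Assumed shapes of the helpers used in the hunks above.
Operand t1CondCodeOp(bool IsDead) { return {/*CPSR*/ 3, IsDead}; }
Operand condCodeOp() { return {/*NoRegister*/ 0, false}; }

int main() {
  bool HasCC = true, CCDead = true;
  // One expression replaces the old four-line if/else around the builder.
  Operand CC = HasCC ? t1CondCodeOp(CCDead) : condCodeOp();
  assert(CC.Reg != 0 && CC.IsDead);
  return 0;
}
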
diff --git a/lib/Target/ARM/ThumbRegisterInfo.cpp b/lib/Target/ARM/ThumbRegisterInfo.cpp
index 2efd63b84a2c..15a567523336 100644
--- a/lib/Target/ARM/ThumbRegisterInfo.cpp
+++ b/lib/Target/ARM/ThumbRegisterInfo.cpp
@@ -93,9 +93,10 @@ static void emitThumb2LoadConstPool(MachineBasicBlock &MBB,
unsigned Idx = ConstantPool->getConstantPoolIndex(C, 4);
BuildMI(MBB, MBBI, dl, TII.get(ARM::t2LDRpci))
- .addReg(DestReg, getDefRegState(true), SubIdx)
- .addConstantPoolIndex(Idx).addImm((int64_t)ARMCC::AL).addReg(0)
- .setMIFlags(MIFlags);
+ .addReg(DestReg, getDefRegState(true), SubIdx)
+ .addConstantPoolIndex(Idx)
+ .add(predOps(ARMCC::AL))
+ .setMIFlags(MIFlags);
}
/// emitLoadConstPool - Emits a load from constpool to materialize the
@@ -145,14 +146,17 @@ static void emitThumbRegPlusImmInReg(
LdReg = MF.getRegInfo().createVirtualRegister(&ARM::tGPRRegClass);
if (NumBytes <= 255 && NumBytes >= 0 && CanChangeCC) {
- AddDefaultT1CC(BuildMI(MBB, MBBI, dl, TII.get(ARM::tMOVi8), LdReg))
+ BuildMI(MBB, MBBI, dl, TII.get(ARM::tMOVi8), LdReg)
+ .add(t1CondCodeOp())
.addImm(NumBytes)
.setMIFlags(MIFlags);
} else if (NumBytes < 0 && NumBytes >= -255 && CanChangeCC) {
- AddDefaultT1CC(BuildMI(MBB, MBBI, dl, TII.get(ARM::tMOVi8), LdReg))
+ BuildMI(MBB, MBBI, dl, TII.get(ARM::tMOVi8), LdReg)
+ .add(t1CondCodeOp())
.addImm(NumBytes)
.setMIFlags(MIFlags);
- AddDefaultT1CC(BuildMI(MBB, MBBI, dl, TII.get(ARM::tRSB), LdReg))
+ BuildMI(MBB, MBBI, dl, TII.get(ARM::tRSB), LdReg)
+ .add(t1CondCodeOp())
.addReg(LdReg, RegState::Kill)
.setMIFlags(MIFlags);
} else if (ST.genExecuteOnly()) {
@@ -167,12 +171,12 @@ static void emitThumbRegPlusImmInReg(
: ((isHigh || !CanChangeCC) ? ARM::tADDhirr : ARM::tADDrr);
MachineInstrBuilder MIB = BuildMI(MBB, MBBI, dl, TII.get(Opc), DestReg);
if (Opc != ARM::tADDhirr)
- MIB = AddDefaultT1CC(MIB);
+ MIB = MIB.add(t1CondCodeOp());
if (DestReg == ARM::SP || isSub)
MIB.addReg(BaseReg).addReg(LdReg, RegState::Kill);
else
MIB.addReg(LdReg).addReg(BaseReg, RegState::Kill);
- AddDefaultPred(MIB);
+ MIB.add(predOps(ARMCC::AL));
}
/// emitThumbRegPlusImmediate - Emits a series of instructions to materialize
@@ -307,12 +311,12 @@ void llvm::emitThumbRegPlusImmediate(MachineBasicBlock &MBB,
MachineInstrBuilder MIB = BuildMI(MBB, MBBI, dl, TII.get(CopyOpc), DestReg);
if (CopyNeedsCC)
- MIB = AddDefaultT1CC(MIB);
+ MIB = MIB.add(t1CondCodeOp());
MIB.addReg(BaseReg, RegState::Kill);
if (CopyOpc != ARM::tMOVr) {
MIB.addImm(CopyImm);
}
- AddDefaultPred(MIB.setMIFlags(MIFlags));
+ MIB.setMIFlags(MIFlags).add(predOps(ARMCC::AL));
BaseReg = DestReg;
}
@@ -324,10 +328,11 @@ void llvm::emitThumbRegPlusImmediate(MachineBasicBlock &MBB,
MachineInstrBuilder MIB = BuildMI(MBB, MBBI, dl, TII.get(ExtraOpc), DestReg);
if (ExtraNeedsCC)
- MIB = AddDefaultT1CC(MIB);
- MIB.addReg(BaseReg).addImm(ExtraImm);
- MIB = AddDefaultPred(MIB);
- MIB.setMIFlags(MIFlags);
+ MIB = MIB.add(t1CondCodeOp());
+ MIB.addReg(BaseReg)
+ .addImm(ExtraImm)
+ .add(predOps(ARMCC::AL))
+ .setMIFlags(MIFlags);
}
}
@@ -460,9 +465,10 @@ bool ThumbRegisterInfo::saveScavengerRegister(
// a call clobbered register that we know won't be used in Thumb1 mode.
const TargetInstrInfo &TII = *STI.getInstrInfo();
DebugLoc DL;
- AddDefaultPred(BuildMI(MBB, I, DL, TII.get(ARM::tMOVr))
- .addReg(ARM::R12, RegState::Define)
- .addReg(Reg, RegState::Kill));
+ BuildMI(MBB, I, DL, TII.get(ARM::tMOVr))
+ .addReg(ARM::R12, RegState::Define)
+ .addReg(Reg, RegState::Kill)
+ .add(predOps(ARMCC::AL));
// The UseMI is where we would like to restore the register. If there's
// interference with R12 before then, however, we'll need to restore it
@@ -490,8 +496,10 @@ bool ThumbRegisterInfo::saveScavengerRegister(
}
}
// Restore the register from R12
- AddDefaultPred(BuildMI(MBB, UseMI, DL, TII.get(ARM::tMOVr)).
- addReg(Reg, RegState::Define).addReg(ARM::R12, RegState::Kill));
+ BuildMI(MBB, UseMI, DL, TII.get(ARM::tMOVr))
+ .addReg(Reg, RegState::Define)
+ .addReg(ARM::R12, RegState::Kill)
+ .add(predOps(ARMCC::AL));
return true;
}
@@ -621,5 +629,5 @@ void ThumbRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator II,
// Add predicate back if it's needed.
if (MI.isPredicable())
- AddDefaultPred(MIB);
+ MIB.add(predOps(ARMCC::AL));
}
diff --git a/lib/Target/AVR/AVRAsmPrinter.cpp b/lib/Target/AVR/AVRAsmPrinter.cpp
index 4afdd3a0ec08..50bb50b44f27 100644
--- a/lib/Target/AVR/AVRAsmPrinter.cpp
+++ b/lib/Target/AVR/AVRAsmPrinter.cpp
@@ -130,7 +130,8 @@ bool AVRAsmPrinter::PrintAsmOperand(const MachineInstr *MI, unsigned OpNum,
}
}
- printOperand(MI, OpNum, O);
+ if (Error)
+ printOperand(MI, OpNum, O);
return false;
}
diff --git a/lib/Target/AVR/AVRExpandPseudoInsts.cpp b/lib/Target/AVR/AVRExpandPseudoInsts.cpp
index 1b2f2cec0bca..13080a5d72f0 100644
--- a/lib/Target/AVR/AVRExpandPseudoInsts.cpp
+++ b/lib/Target/AVR/AVRExpandPseudoInsts.cpp
@@ -509,8 +509,8 @@ bool AVRExpandPseudo::expand<AVR::LDIWRdK>(Block &MBB, BlockIt MBBI) {
const BlockAddress *BA = MI.getOperand(1).getBlockAddress();
unsigned TF = MI.getOperand(1).getTargetFlags();
- MIBLO.addOperand(MachineOperand::CreateBA(BA, TF | AVRII::MO_LO));
- MIBHI.addOperand(MachineOperand::CreateBA(BA, TF | AVRII::MO_HI));
+ MIBLO.add(MachineOperand::CreateBA(BA, TF | AVRII::MO_LO));
+ MIBHI.add(MachineOperand::CreateBA(BA, TF | AVRII::MO_HI));
break;
}
case MachineOperand::MO_Immediate: {
@@ -785,9 +785,8 @@ bool AVRExpandPseudo::expandAtomicBinaryOp(unsigned Opcode,
auto Op1 = MI.getOperand(0);
auto Op2 = MI.getOperand(1);
- MachineInstr &NewInst = *buildMI(MBB, MBBI, Opcode)
- .addOperand(Op1).addOperand(Op2)
- .getInstr();
+ MachineInstr &NewInst =
+ *buildMI(MBB, MBBI, Opcode).add(Op1).add(Op2).getInstr();
f(NewInst);
});
}
@@ -810,15 +809,13 @@ bool AVRExpandPseudo::expandAtomicArithmeticOp(unsigned Width,
unsigned StoreOpcode = (Width == 8) ? AVR::STPtrRr : AVR::STWPtrRr;
// Create the load
- buildMI(MBB, MBBI, LoadOpcode).addOperand(Op1).addOperand(Op2);
+ buildMI(MBB, MBBI, LoadOpcode).add(Op1).add(Op2);
// Create the arithmetic op
- buildMI(MBB, MBBI, ArithOpcode)
- .addOperand(Op1).addOperand(Op1)
- .addOperand(Op2);
+ buildMI(MBB, MBBI, ArithOpcode).add(Op1).add(Op1).add(Op2);
// Create the store
- buildMI(MBB, MBBI, StoreOpcode).addOperand(Op2).addOperand(Op1);
+ buildMI(MBB, MBBI, StoreOpcode).add(Op2).add(Op1);
});
}
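
The expansion above reduces an AVR atomic read-modify-write to a plain load/op/store triple; what makes that safe is the enclosing expandAtomic wrapper (not shown in this hunk), which is assumed to save SREG and disable interrupts around the callback. The shape of the emitted sequence, written out as illustrative AVR assembly with assumed register choices:

#include <cstdio>

int main() {
  const char *AtomicAdd8 =
      "in   r0, 0x3f   ; save SREG (assumed wrapper, not in this hunk)\n"
      "cli             ; interrupts off for the critical section\n"
      "ld   r24, X     ; load            (buildMI(LoadOpcode))\n"
      "add  r24, r22   ; arithmetic op   (buildMI(ArithOpcode))\n"
      "st   X, r24     ; store           (buildMI(StoreOpcode))\n"
      "out  0x3f, r0   ; restore SREG\n";
  std::puts(AtomicAdd8);
  return 0;
}
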
diff --git a/lib/Target/AVR/AVRISelLowering.cpp b/lib/Target/AVR/AVRISelLowering.cpp
index 07fc3f6890b8..0b95d3819399 100644
--- a/lib/Target/AVR/AVRISelLowering.cpp
+++ b/lib/Target/AVR/AVRISelLowering.cpp
@@ -48,6 +48,8 @@ AVRTargetLowering::AVRTargetLowering(AVRTargetMachine &tm)
setOperationAction(ISD::GlobalAddress, MVT::i16, Custom);
setOperationAction(ISD::BlockAddress, MVT::i16, Custom);
+ setOperationAction(ISD::STACKSAVE, MVT::Other, Expand);
+ setOperationAction(ISD::STACKRESTORE, MVT::Other, Expand);
setOperationAction(ISD::DYNAMIC_STACKALLOC, MVT::i8, Expand);
setOperationAction(ISD::DYNAMIC_STACKALLOC, MVT::i16, Expand);
@@ -311,7 +313,7 @@ SDValue AVRTargetLowering::LowerDivRem(SDValue Op, SelectionDAG &DAG) const {
unsigned Opcode = Op->getOpcode();
assert((Opcode == ISD::SDIVREM || Opcode == ISD::UDIVREM) &&
"Invalid opcode for Div/Rem lowering");
- bool isSigned = (Opcode == ISD::SDIVREM);
+ bool IsSigned = (Opcode == ISD::SDIVREM);
EVT VT = Op->getValueType(0);
Type *Ty = VT.getTypeForEVT(*DAG.getContext());
@@ -320,16 +322,16 @@ SDValue AVRTargetLowering::LowerDivRem(SDValue Op, SelectionDAG &DAG) const {
default:
llvm_unreachable("Unexpected request for libcall!");
case MVT::i8:
- LC = isSigned ? RTLIB::SDIVREM_I8 : RTLIB::UDIVREM_I8;
+ LC = IsSigned ? RTLIB::SDIVREM_I8 : RTLIB::UDIVREM_I8;
break;
case MVT::i16:
- LC = isSigned ? RTLIB::SDIVREM_I16 : RTLIB::UDIVREM_I16;
+ LC = IsSigned ? RTLIB::SDIVREM_I16 : RTLIB::UDIVREM_I16;
break;
case MVT::i32:
- LC = isSigned ? RTLIB::SDIVREM_I32 : RTLIB::UDIVREM_I32;
+ LC = IsSigned ? RTLIB::SDIVREM_I32 : RTLIB::UDIVREM_I32;
break;
case MVT::i64:
- LC = isSigned ? RTLIB::SDIVREM_I64 : RTLIB::UDIVREM_I64;
+ LC = IsSigned ? RTLIB::SDIVREM_I64 : RTLIB::UDIVREM_I64;
break;
}
@@ -340,8 +342,8 @@ SDValue AVRTargetLowering::LowerDivRem(SDValue Op, SelectionDAG &DAG) const {
for (SDValue const &Value : Op->op_values()) {
Entry.Node = Value;
Entry.Ty = Value.getValueType().getTypeForEVT(*DAG.getContext());
- Entry.isSExt = isSigned;
- Entry.isZExt = !isSigned;
+ Entry.IsSExt = IsSigned;
+ Entry.IsZExt = !IsSigned;
Args.push_back(Entry);
}
@@ -354,10 +356,10 @@ SDValue AVRTargetLowering::LowerDivRem(SDValue Op, SelectionDAG &DAG) const {
TargetLowering::CallLoweringInfo CLI(DAG);
CLI.setDebugLoc(dl)
.setChain(InChain)
- .setCallee(getLibcallCallingConv(LC), RetTy, Callee, std::move(Args))
+ .setLibCallee(getLibcallCallingConv(LC), RetTy, Callee, std::move(Args))
.setInRegister()
- .setSExtResult(isSigned)
- .setZExtResult(!isSigned);
+ .setSExtResult(IsSigned)
+ .setZExtResult(!IsSigned);
std::pair<SDValue, SDValue> CallInfo = LowerCallTo(CLI);
return CallInfo.first;
@@ -932,6 +934,12 @@ static void analyzeStandardArguments(TargetLowering::CallLoweringInfo *CLI,
bool UsesStack = false;
for (unsigned i = 0, pos = 0, e = Args.size(); i != e; ++i) {
unsigned Size = Args[i];
+
+ // If we have a zero-sized argument, don't attempt to lower it.
+ // AVR-GCC does not support zero-sized arguments and so we need not
+ // worry about ABI compatibility.
+ if (Size == 0) continue;
+
MVT LocVT = (IsCall) ? (*Outs)[pos].VT : (*Ins)[pos].VT;
// If we have plenty of regs to pass the whole argument do it.
@@ -1373,7 +1381,7 @@ AVRTargetLowering::LowerReturn(SDValue Chain, CallingConv::ID CallConv,
// Don't emit the ret/reti instruction when the naked attribute is present in
// the function being compiled.
if (MF.getFunction()->getAttributes().hasAttribute(
- AttributeSet::FunctionIndex, Attribute::Naked)) {
+ AttributeList::FunctionIndex, Attribute::Naked)) {
return Chain;
}
@@ -1975,4 +1983,3 @@ unsigned AVRTargetLowering::getRegisterByName(const char *RegName,
}
} // end of namespace llvm
-
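
The zero-size guard added to analyzeStandardArguments simply drops such arguments before any register or stack slot is assigned. A minimal sketch of the scan; the function name is hypothetical:

#include <cassert>
#include <vector>

static int countLoweredArgs(const std::vector<unsigned> &Sizes) {
  int Lowered = 0;
  for (unsigned Size : Sizes) {
    if (Size == 0)
      continue; // zero-sized argument: nothing to pass, no ABI slot needed
    ++Lowered;
  }
  return Lowered;
}

int main() {
  assert(countLoweredArgs({2, 0, 1}) == 2); // the empty argument is skipped
  return 0;
}
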
diff --git a/lib/Target/AVR/AVRInstrInfo.td b/lib/Target/AVR/AVRInstrInfo.td
index bc66379ab708..693d80a1c06f 100644
--- a/lib/Target/AVR/AVRInstrInfo.td
+++ b/lib/Target/AVR/AVRInstrInfo.td
@@ -694,7 +694,7 @@ Defs = [SREG] in
}
//===----------------------------------------------------------------------===//
-// One's/Two's Compliment
+// One's/Two's Complement
//===----------------------------------------------------------------------===//
let Constraints = "$src = $rd",
Defs = [SREG] in
@@ -1718,7 +1718,7 @@ Defs = [SREG] in
(implicit SREG)]>;
// CBR Rd, K
- // Alias for `ANDI Rd, COM(K)` where COM(K) is the compliment of K.
+ // Alias for `ANDI Rd, COM(K)` where COM(K) is the complement of K.
// FIXME: This uses the 'complement' encoder. We need it to also use the
// imm_ldi8 encoder. This will cause no fixups to be created on this instruction.
def CBRRdK : FRdK<0b0111,
diff --git a/lib/Target/AVR/AVRInstrumentFunctions.cpp b/lib/Target/AVR/AVRInstrumentFunctions.cpp
index 5553dc2da31b..e7fca74e1701 100644
--- a/lib/Target/AVR/AVRInstrumentFunctions.cpp
+++ b/lib/Target/AVR/AVRInstrumentFunctions.cpp
@@ -96,7 +96,7 @@ static void BuildSignatureCall(StringRef SymName, BasicBlock &BB, Function &F) {
Value *FunctionName = CreateStringPtr(BB, F.getName());
Value *Args[] = {FunctionName,
- ConstantInt::get(I16, F.getArgumentList().size())};
+ ConstantInt::get(I16, F.arg_size())};
CallInst::Create(Fn, Args, "", &BB);
}
diff --git a/lib/Target/AVR/AVRMCInstLower.cpp b/lib/Target/AVR/AVRMCInstLower.cpp
index 342fe558813a..475dda420e89 100644
--- a/lib/Target/AVR/AVRMCInstLower.cpp
+++ b/lib/Target/AVR/AVRMCInstLower.cpp
@@ -56,7 +56,7 @@ void AVRMCInstLower::lowerInstruction(const MachineInstr &MI, MCInst &OutMI) con
switch (MO.getType()) {
default:
- MI.dump();
+ MI.print(errs());
llvm_unreachable("unknown operand type");
case MachineOperand::MO_Register:
// Ignore all implicit register operands.
diff --git a/lib/Target/AVR/MCTargetDesc/AVRAsmBackend.cpp b/lib/Target/AVR/MCTargetDesc/AVRAsmBackend.cpp
index 081d8b5740ef..5c3b45ac2328 100644
--- a/lib/Target/AVR/MCTargetDesc/AVRAsmBackend.cpp
+++ b/lib/Target/AVR/MCTargetDesc/AVRAsmBackend.cpp
@@ -335,7 +335,7 @@ MCObjectWriter *AVRAsmBackend::createObjectWriter(raw_pwrite_stream &OS) const {
void AVRAsmBackend::applyFixup(const MCFixup &Fixup, char *Data,
unsigned DataSize, uint64_t Value,
- bool IsPCRel) const {
+ bool IsPCRel, MCContext &Ctx) const {
if (Value == 0)
return; // Doesn't change encoding.
diff --git a/lib/Target/AVR/MCTargetDesc/AVRAsmBackend.h b/lib/Target/AVR/MCTargetDesc/AVRAsmBackend.h
index 7ff4b8f350f6..f2be2494684a 100644
--- a/lib/Target/AVR/MCTargetDesc/AVRAsmBackend.h
+++ b/lib/Target/AVR/MCTargetDesc/AVRAsmBackend.h
@@ -41,7 +41,7 @@ public:
MCObjectWriter *createObjectWriter(raw_pwrite_stream &OS) const override;
void applyFixup(const MCFixup &Fixup, char *Data, unsigned DataSize,
- uint64_t Value, bool IsPCRel) const override;
+ uint64_t Value, bool IsPCRel, MCContext &Ctx) const override;
const MCFixupKindInfo &getFixupKindInfo(MCFixupKind Kind) const override;
diff --git a/lib/Target/AVR/MCTargetDesc/AVRELFStreamer.cpp b/lib/Target/AVR/MCTargetDesc/AVRELFStreamer.cpp
index 481de320b22f..713754821005 100644
--- a/lib/Target/AVR/MCTargetDesc/AVRELFStreamer.cpp
+++ b/lib/Target/AVR/MCTargetDesc/AVRELFStreamer.cpp
@@ -1,5 +1,7 @@
#include "AVRELFStreamer.h"
+#include "llvm/MC/MCSubtargetInfo.h"
+#include "llvm/MC/SubtargetFeature.h"
#include "llvm/Support/ELF.h"
#include "llvm/Support/FormattedStream.h"
diff --git a/lib/Target/AVR/MCTargetDesc/AVRMCAsmInfo.cpp b/lib/Target/AVR/MCTargetDesc/AVRMCAsmInfo.cpp
index cca3bcc4968a..9f2ee8cf8035 100644
--- a/lib/Target/AVR/MCTargetDesc/AVRMCAsmInfo.cpp
+++ b/lib/Target/AVR/MCTargetDesc/AVRMCAsmInfo.cpp
@@ -23,6 +23,7 @@ AVRMCAsmInfo::AVRMCAsmInfo(const Triple &TT) {
CommentString = ";";
PrivateGlobalPrefix = ".L";
UsesELFSectionDirectiveForBSS = true;
+ UseIntegratedAssembler = true;
}
} // end of namespace llvm
diff --git a/lib/Target/AVR/MCTargetDesc/AVRMCCodeEmitter.cpp b/lib/Target/AVR/MCTargetDesc/AVRMCCodeEmitter.cpp
index e6dc8868c705..c3d43ebb407e 100644
--- a/lib/Target/AVR/MCTargetDesc/AVRMCCodeEmitter.cpp
+++ b/lib/Target/AVR/MCTargetDesc/AVRMCCodeEmitter.cpp
@@ -25,6 +25,7 @@
#include "llvm/MC/MCInstrInfo.h"
#include "llvm/MC/MCRegisterInfo.h"
#include "llvm/MC/MCSubtargetInfo.h"
+#include "llvm/Support/Casting.h"
#include "llvm/Support/raw_ostream.h"
#define DEBUG_TYPE "mccodeemitter"
diff --git a/lib/Target/AVR/MCTargetDesc/AVRMCCodeEmitter.h b/lib/Target/AVR/MCTargetDesc/AVRMCCodeEmitter.h
index 5fa425c296a5..4cee8d904c9d 100644
--- a/lib/Target/AVR/MCTargetDesc/AVRMCCodeEmitter.h
+++ b/lib/Target/AVR/MCTargetDesc/AVRMCCodeEmitter.h
@@ -63,7 +63,7 @@ private:
SmallVectorImpl<MCFixup> &Fixups,
const MCSubtargetInfo &STI) const;
- /// Takes the compliment of a number (~0 - val).
+ /// Takes the complement of a number (~0 - val).
unsigned encodeComplement(const MCInst &MI, unsigned OpNo,
SmallVectorImpl<MCFixup> &Fixups,
const MCSubtargetInfo &STI) const;
diff --git a/lib/Target/BPF/BPFISelDAGToDAG.cpp b/lib/Target/BPF/BPFISelDAGToDAG.cpp
index 12091449cc11..279cdb1a89b4 100644
--- a/lib/Target/BPF/BPFISelDAGToDAG.cpp
+++ b/lib/Target/BPF/BPFISelDAGToDAG.cpp
@@ -71,7 +71,7 @@ bool BPFDAGToDAGISel::SelectAddr(SDValue Addr, SDValue &Base, SDValue &Offset) {
// Addresses of the form Addr+const or Addr|const
if (CurDAG->isBaseWithConstantOffset(Addr)) {
ConstantSDNode *CN = dyn_cast<ConstantSDNode>(Addr.getOperand(1));
- if (isInt<32>(CN->getSExtValue())) {
+ if (isInt<16>(CN->getSExtValue())) {
// If the first operand is a FI, get the TargetFI Node
if (FrameIndexSDNode *FIN =
@@ -99,7 +99,7 @@ bool BPFDAGToDAGISel::SelectFIAddr(SDValue Addr, SDValue &Base, SDValue &Offset)
// Addresses of the form Addr+const or Addr|const
ConstantSDNode *CN = dyn_cast<ConstantSDNode>(Addr.getOperand(1));
- if (isInt<32>(CN->getSExtValue())) {
+ if (isInt<16>(CN->getSExtValue())) {
// If the first operand is a FI, get the TargetFI Node
if (FrameIndexSDNode *FIN =
@@ -138,7 +138,7 @@ void BPFDAGToDAGISel::Select(SDNode *Node) {
else
errs() << "Error: ";
errs() << "Unsupport signed division for DAG: ";
- Node->dump(CurDAG);
+ Node->print(errs(), CurDAG);
errs() << "Please convert to unsigned div/mod.\n";
break;
}
diff --git a/lib/Target/BPF/BPFISelLowering.cpp b/lib/Target/BPF/BPFISelLowering.cpp
index cca3492a1992..b9b3dff95c0a 100644
--- a/lib/Target/BPF/BPFISelLowering.cpp
+++ b/lib/Target/BPF/BPFISelLowering.cpp
@@ -33,7 +33,7 @@ using namespace llvm;
#define DEBUG_TYPE "bpf-lower"
-static void fail(const SDLoc &DL, SelectionDAG &DAG, const char *Msg) {
+static void fail(const SDLoc &DL, SelectionDAG &DAG, const Twine &Msg) {
MachineFunction &MF = DAG.getMachineFunction();
DAG.getContext()->diagnose(
DiagnosticInfoUnsupported(*MF.getFunction(), Msg, DL.getDebugLoc()));
@@ -306,11 +306,23 @@ SDValue BPFTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
// If the callee is a GlobalAddress node (quite common, every direct call is)
// turn it into a TargetGlobalAddress node so that legalize doesn't hack it.
// Likewise ExternalSymbol -> TargetExternalSymbol.
- if (GlobalAddressSDNode *G = dyn_cast<GlobalAddressSDNode>(Callee))
+ if (GlobalAddressSDNode *G = dyn_cast<GlobalAddressSDNode>(Callee)) {
+ auto GV = G->getGlobal();
+ fail(CLI.DL, DAG,
+ "A call to global function '" + StringRef(GV->getName())
+ + "' is not supported. "
+ + (GV->isDeclaration() ?
+ "Only calls to predefined BPF helpers are allowed." :
+ "Please use __attribute__((always_inline) to make sure"
+ " this function is inlined."));
Callee = DAG.getTargetGlobalAddress(G->getGlobal(), CLI.DL, PtrVT,
G->getOffset(), 0);
- else if (ExternalSymbolSDNode *E = dyn_cast<ExternalSymbolSDNode>(Callee))
+ } else if (ExternalSymbolSDNode *E = dyn_cast<ExternalSymbolSDNode>(Callee)) {
Callee = DAG.getTargetExternalSymbol(E->getSymbol(), PtrVT, 0);
+ fail(CLI.DL, DAG, Twine("A call to built-in function '"
+ + StringRef(E->getSymbol())
+ + "' is not supported."));
+ }
// Returns a chain & a flag for retval copy to use.
SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Glue);
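
The two diagnostics above fire during call lowering. A minimal sketch, not part of this patch, of the source pattern they target; 'helper', 'fixed', and 'entry' are hypothetical names:

    // Compiled for the BPF target, the non-inlined call below now draws the
    // "is not supported" diagnostic; the always_inline version does not,
    // because no BPFcall node survives inlining.
    __attribute__((noinline)) static int helper(int x) {
      return x * 2;  // survives as a real call: rejected by the backend
    }
    __attribute__((always_inline)) static inline int fixed(int x) {
      return x * 2;  // inlined at every call site: no call is emitted
    }
    int entry(int v) { return helper(v) + fixed(v); }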
diff --git a/lib/Target/BPF/BPFInstrInfo.td b/lib/Target/BPF/BPFInstrInfo.td
index a7910dea98de..93ee24371c4d 100644
--- a/lib/Target/BPF/BPFInstrInfo.td
+++ b/lib/Target/BPF/BPFInstrInfo.td
@@ -470,6 +470,7 @@ def : Pat<(i64 (and (i64 GPR:$src), 0xffffFFFF)),
// Calls
def : Pat<(BPFcall tglobaladdr:$dst), (JAL tglobaladdr:$dst)>;
+def : Pat<(BPFcall texternalsym:$dst), (JAL texternalsym:$dst)>;
def : Pat<(BPFcall imm:$dst), (JAL imm:$dst)>;
// Loads
diff --git a/lib/Target/BPF/BPFMCInstLower.cpp b/lib/Target/BPF/BPFMCInstLower.cpp
index f64defecf3cc..c8528e867310 100644
--- a/lib/Target/BPF/BPFMCInstLower.cpp
+++ b/lib/Target/BPF/BPFMCInstLower.cpp
@@ -29,6 +29,11 @@ BPFMCInstLower::GetGlobalAddressSymbol(const MachineOperand &MO) const {
return Printer.getSymbol(MO.getGlobal());
}
+MCSymbol *
+BPFMCInstLower::GetExternalSymbolSymbol(const MachineOperand &MO) const {
+ return Printer.GetExternalSymbolSymbol(MO.getSymbolName());
+}
+
MCOperand BPFMCInstLower::LowerSymbolOperand(const MachineOperand &MO,
MCSymbol *Sym) const {
@@ -49,7 +54,7 @@ void BPFMCInstLower::Lower(const MachineInstr *MI, MCInst &OutMI) const {
MCOperand MCOp;
switch (MO.getType()) {
default:
- MI->dump();
+ MI->print(errs());
llvm_unreachable("unknown operand type");
case MachineOperand::MO_Register:
// Ignore all implicit register operands.
@@ -66,6 +71,9 @@ void BPFMCInstLower::Lower(const MachineInstr *MI, MCInst &OutMI) const {
break;
case MachineOperand::MO_RegisterMask:
continue;
+ case MachineOperand::MO_ExternalSymbol:
+ MCOp = LowerSymbolOperand(MO, GetExternalSymbolSymbol(MO));
+ break;
case MachineOperand::MO_GlobalAddress:
MCOp = LowerSymbolOperand(MO, GetGlobalAddressSymbol(MO));
break;
diff --git a/lib/Target/BPF/BPFMCInstLower.h b/lib/Target/BPF/BPFMCInstLower.h
index 054e89407db2..eac811f4cf88 100644
--- a/lib/Target/BPF/BPFMCInstLower.h
+++ b/lib/Target/BPF/BPFMCInstLower.h
@@ -37,6 +37,7 @@ public:
MCOperand LowerSymbolOperand(const MachineOperand &MO, MCSymbol *Sym) const;
MCSymbol *GetGlobalAddressSymbol(const MachineOperand &MO) const;
+ MCSymbol *GetExternalSymbolSymbol(const MachineOperand &MO) const;
};
}
diff --git a/lib/Target/BPF/BPFRegisterInfo.cpp b/lib/Target/BPF/BPFRegisterInfo.cpp
index 71846e3e92c9..7925bee9c587 100644
--- a/lib/Target/BPF/BPFRegisterInfo.cpp
+++ b/lib/Target/BPF/BPFRegisterInfo.cpp
@@ -21,6 +21,7 @@
#include "llvm/Support/ErrorHandling.h"
#include "llvm/Target/TargetFrameLowering.h"
#include "llvm/Target/TargetInstrInfo.h"
+#include "llvm/IR/DiagnosticInfo.h"
#define GET_REGINFO_TARGET_DESC
#include "BPFGenRegisterInfo.inc"
@@ -41,6 +42,18 @@ BitVector BPFRegisterInfo::getReservedRegs(const MachineFunction &MF) const {
return Reserved;
}
+static void WarnSize(int Offset, MachineFunction &MF, DebugLoc &DL) {
+ if (Offset <= -512) {
+ auto F = MF.getFunction();
+ DiagnosticInfoUnsupported DiagStackSize(*F,
+ "Looks like the BPF stack limit of 512 bytes is exceeded. "
+ "Please move large on stack variables into BPF per-cpu array map.\n",
+ DL);
+ F->getContext().diagnose(DiagStackSize);
+ }
+}
+
void BPFRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator II,
int SPAdj, unsigned FIOperandNum,
RegScavenger *RS) const {
@@ -48,9 +61,18 @@ void BPFRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator II,
unsigned i = 0;
MachineInstr &MI = *II;
- MachineFunction &MF = *MI.getParent()->getParent();
+ MachineBasicBlock &MBB = *MI.getParent();
+ MachineFunction &MF = *MBB.getParent();
DebugLoc DL = MI.getDebugLoc();
+ if (!DL)
+ // Try harder to find some debug location.
+ for (auto &I : MBB)
+ if (I.getDebugLoc()) {
+ DL = I.getDebugLoc();
+ break;
+ }
+
while (!MI.getOperand(i).isFI()) {
++i;
assert(i < MI.getNumOperands() && "Instr doesn't have FrameIndex operand!");
@@ -59,11 +81,11 @@ void BPFRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator II,
unsigned FrameReg = getFrameRegister(MF);
int FrameIndex = MI.getOperand(i).getIndex();
const TargetInstrInfo &TII = *MF.getSubtarget().getInstrInfo();
- MachineBasicBlock &MBB = *MI.getParent();
if (MI.getOpcode() == BPF::MOV_rr) {
int Offset = MF.getFrameInfo().getObjectOffset(FrameIndex);
+ WarnSize(Offset, MF, DL);
MI.getOperand(i).ChangeToRegister(FrameReg, false);
unsigned reg = MI.getOperand(i - 1).getReg();
BuildMI(MBB, ++II, DL, TII.get(BPF::ADD_ri), reg)
@@ -78,6 +100,8 @@ void BPFRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator II,
if (!isInt<32>(Offset))
llvm_unreachable("bug in frame offset");
+ WarnSize(Offset, MF, DL);
+
if (MI.getOpcode() == BPF::FI_ri) {
// architecture does not really support FI_ri, replace it with
// MOV_rr <target_reg>, frame_reg
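
For reference, a hedged sketch (names hypothetical, not from this patch) of a frame that would now trip WarnSize: the buffer forces a frame object at or below offset -512, past the 512-byte BPF stack budget.

    extern int consume(const char *p, int n);

    int entry() {
      char buf[600];                  // larger than the whole BPF stack
      for (int i = 0; i < 600; ++i)
        buf[i] = static_cast<char>(i);
      return consume(buf, 600);       // keeps buf live on the stack
    }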
diff --git a/lib/Target/BPF/MCTargetDesc/BPFAsmBackend.cpp b/lib/Target/BPF/MCTargetDesc/BPFAsmBackend.cpp
index afc321ea2c34..1f355171ebd3 100644
--- a/lib/Target/BPF/MCTargetDesc/BPFAsmBackend.cpp
+++ b/lib/Target/BPF/MCTargetDesc/BPFAsmBackend.cpp
@@ -28,7 +28,7 @@ public:
~BPFAsmBackend() override = default;
void applyFixup(const MCFixup &Fixup, char *Data, unsigned DataSize,
- uint64_t Value, bool IsPCRel) const override;
+ uint64_t Value, bool IsPCRel, MCContext &Ctx) const override;
MCObjectWriter *createObjectWriter(raw_pwrite_stream &OS) const override;
@@ -62,8 +62,8 @@ bool BPFAsmBackend::writeNopData(uint64_t Count, MCObjectWriter *OW) const {
}
void BPFAsmBackend::applyFixup(const MCFixup &Fixup, char *Data,
- unsigned DataSize, uint64_t Value,
- bool IsPCRel) const {
+ unsigned DataSize, uint64_t Value, bool IsPCRel,
+ MCContext &Ctx) const {
if (Fixup.getKind() == FK_SecRel_4 || Fixup.getKind() == FK_SecRel_8) {
assert(Value == 0);
} else if (Fixup.getKind() == FK_Data_4 || Fixup.getKind() == FK_Data_8) {
diff --git a/lib/Target/CMakeLists.txt b/lib/Target/CMakeLists.txt
index 044db10fb3fa..1e6abfacb792 100644
--- a/lib/Target/CMakeLists.txt
+++ b/lib/Target/CMakeLists.txt
@@ -17,3 +17,9 @@ foreach(t ${LLVM_TARGETS_TO_BUILD})
message(STATUS "Targeting ${t}")
add_subdirectory(${t})
endforeach()
+
+# Currently we do not allow libraries from lib to reference targets directly.
+# This property is used to enforce that convention. It is important because
+# the logic in llvm_map_components_to_libnames depends on the order in which
+# the target libraries are created.
+set_property(GLOBAL PROPERTY LLVM_TARGETS_CONFIGURED On)
diff --git a/lib/Target/Hexagon/AsmParser/HexagonAsmParser.cpp b/lib/Target/Hexagon/AsmParser/HexagonAsmParser.cpp
index becc086c81b0..4bbc36a86e5b 100644
--- a/lib/Target/Hexagon/AsmParser/HexagonAsmParser.cpp
+++ b/lib/Target/Hexagon/AsmParser/HexagonAsmParser.cpp
@@ -63,21 +63,25 @@ using namespace llvm;
static cl::opt<bool> EnableFutureRegs("mfuture-regs",
cl::desc("Enable future registers"));
-static cl::opt<bool> WarnMissingParenthesis("mwarn-missing-parenthesis",
-cl::desc("Warn for missing parenthesis around predicate registers"),
-cl::init(true));
-static cl::opt<bool> ErrorMissingParenthesis("merror-missing-parenthesis",
-cl::desc("Error for missing parenthesis around predicate registers"),
-cl::init(false));
-static cl::opt<bool> WarnSignedMismatch("mwarn-sign-mismatch",
-cl::desc("Warn for mismatching a signed and unsigned value"),
-cl::init(true));
-static cl::opt<bool> WarnNoncontigiousRegister("mwarn-noncontigious-register",
-cl::desc("Warn for register names that arent contigious"),
-cl::init(true));
-static cl::opt<bool> ErrorNoncontigiousRegister("merror-noncontigious-register",
-cl::desc("Error for register names that aren't contigious"),
-cl::init(false));
+static cl::opt<bool> WarnMissingParenthesis(
+ "mwarn-missing-parenthesis",
+ cl::desc("Warn for missing parenthesis around predicate registers"),
+ cl::init(true));
+static cl::opt<bool> ErrorMissingParenthesis(
+ "merror-missing-parenthesis",
+ cl::desc("Error for missing parenthesis around predicate registers"),
+ cl::init(false));
+static cl::opt<bool> WarnSignedMismatch(
+ "mwarn-sign-mismatch",
+ cl::desc("Warn for mismatching a signed and unsigned value"),
+ cl::init(true));
+static cl::opt<bool> WarnNoncontigiousRegister(
+ "mwarn-noncontigious-register",
+ cl::desc("Warn for register names that aren't contiguous"), cl::init(true));
+static cl::opt<bool> ErrorNoncontigiousRegister(
+ "merror-noncontigious-register",
+ cl::desc("Error for register names that aren't contiguous"),
+ cl::init(false));
namespace {
@@ -123,9 +127,11 @@ class HexagonAsmParser : public MCTargetAsmParser {
bool MatchAndEmitInstruction(SMLoc IDLoc, unsigned &Opcode,
OperandVector &Operands, MCStreamer &Out,
- uint64_t &ErrorInfo, bool MatchingInlineAsm) override;
+ uint64_t &ErrorInfo,
+ bool MatchingInlineAsm) override;
- unsigned validateTargetOperandClass(MCParsedAsmOperand &Op, unsigned Kind) override;
+ unsigned validateTargetOperandClass(MCParsedAsmOperand &Op,
+ unsigned Kind) override;
bool OutOfRange(SMLoc IDLoc, long long Val, long long Max);
int processInstruction(MCInst &Inst, OperandVector const &Operands,
SMLoc IDLoc);
@@ -168,11 +174,10 @@ public:
bool parseInstruction(OperandVector &Operands);
bool implicitExpressionLocation(OperandVector &Operands);
bool parseExpressionOrOperand(OperandVector &Operands);
- bool parseExpression(MCExpr const *& Expr);
+ bool parseExpression(MCExpr const *&Expr);
bool ParseInstruction(ParseInstructionInfo &Info, StringRef Name,
- SMLoc NameLoc, OperandVector &Operands) override
- {
+ SMLoc NameLoc, OperandVector &Operands) override {
llvm_unreachable("Unimplemented");
}
@@ -289,45 +294,63 @@ public:
return false;
}
- bool isf32Ext() const { return false; }
- bool iss32_0Imm() const { return CheckImmRange(32, 0, true, true, false); }
+ bool isa30_2Imm() const { return CheckImmRange(30, 2, true, true, true); }
+ bool isb30_2Imm() const { return CheckImmRange(30, 2, true, true, true); }
+ bool isb15_2Imm() const { return CheckImmRange(15, 2, true, true, false); }
+ bool isb13_2Imm() const { return CheckImmRange(13, 2, true, true, false); }
+
+ bool ism32_0Imm() const { return true; }
+
+ bool isf32Imm() const { return false; }
+ bool isf64Imm() const { return false; }
+ bool iss32_0Imm() const { return true; }
+ bool iss31_1Imm() const { return true; }
+ bool iss30_2Imm() const { return true; }
+ bool iss29_3Imm() const { return true; }
bool iss23_2Imm() const { return CheckImmRange(23, 2, true, true, false); }
+ bool iss10_0Imm() const { return CheckImmRange(10, 0, true, false, false); }
+ bool iss10_6Imm() const { return CheckImmRange(10, 6, true, false, false); }
+ bool iss9_0Imm() const { return CheckImmRange(9, 0, true, false, false); }
bool iss8_0Imm() const { return CheckImmRange(8, 0, true, false, false); }
bool iss8_0Imm64() const { return CheckImmRange(8, 0, true, true, false); }
bool iss7_0Imm() const { return CheckImmRange(7, 0, true, false, false); }
bool iss6_0Imm() const { return CheckImmRange(6, 0, true, false, false); }
+ bool iss6_3Imm() const { return CheckImmRange(6, 3, true, false, false); }
bool iss4_0Imm() const { return CheckImmRange(4, 0, true, false, false); }
bool iss4_1Imm() const { return CheckImmRange(4, 1, true, false, false); }
bool iss4_2Imm() const { return CheckImmRange(4, 2, true, false, false); }
bool iss4_3Imm() const { return CheckImmRange(4, 3, true, false, false); }
- bool iss4_6Imm() const { return CheckImmRange(4, 0, true, false, false); }
- bool iss3_6Imm() const { return CheckImmRange(3, 0, true, false, false); }
bool iss3_0Imm() const { return CheckImmRange(3, 0, true, false, false); }
bool isu64_0Imm() const { return CheckImmRange(64, 0, false, true, true); }
- bool isu32_0Imm() const { return CheckImmRange(32, 0, false, true, false); }
+ bool isu32_0Imm() const { return true; }
+ bool isu31_1Imm() const { return true; }
+ bool isu30_2Imm() const { return true; }
+ bool isu29_3Imm() const { return true; }
bool isu26_6Imm() const { return CheckImmRange(26, 6, false, true, false); }
bool isu16_0Imm() const { return CheckImmRange(16, 0, false, true, false); }
bool isu16_1Imm() const { return CheckImmRange(16, 1, false, true, false); }
bool isu16_2Imm() const { return CheckImmRange(16, 2, false, true, false); }
bool isu16_3Imm() const { return CheckImmRange(16, 3, false, true, false); }
bool isu11_3Imm() const { return CheckImmRange(11, 3, false, false, false); }
- bool isu6_1Imm() const { return CheckImmRange(6, 1, false, false, false); }
- bool isu6_2Imm() const { return CheckImmRange(6, 2, false, false, false); }
- bool isu6_3Imm() const { return CheckImmRange(6, 3, false, false, false); }
bool isu10_0Imm() const { return CheckImmRange(10, 0, false, false, false); }
bool isu9_0Imm() const { return CheckImmRange(9, 0, false, false, false); }
bool isu8_0Imm() const { return CheckImmRange(8, 0, false, false, false); }
bool isu7_0Imm() const { return CheckImmRange(7, 0, false, false, false); }
bool isu6_0Imm() const { return CheckImmRange(6, 0, false, false, false); }
+ bool isu6_1Imm() const { return CheckImmRange(6, 1, false, false, false); }
+ bool isu6_2Imm() const { return CheckImmRange(6, 2, false, false, false); }
+ bool isu6_3Imm() const { return CheckImmRange(6, 3, false, false, false); }
bool isu5_0Imm() const { return CheckImmRange(5, 0, false, false, false); }
+ bool isu5_2Imm() const { return CheckImmRange(5, 2, false, false, false); }
+ bool isu5_3Imm() const { return CheckImmRange(5, 3, false, false, false); }
bool isu4_0Imm() const { return CheckImmRange(4, 0, false, false, false); }
+ bool isu4_2Imm() const { return CheckImmRange(4, 2, false, false, false); }
bool isu3_0Imm() const { return CheckImmRange(3, 0, false, false, false); }
+ bool isu3_1Imm() const { return CheckImmRange(3, 1, false, false, false); }
bool isu2_0Imm() const { return CheckImmRange(2, 0, false, false, false); }
bool isu1_0Imm() const { return CheckImmRange(1, 0, false, false, false); }
- bool ism6_0Imm() const { return CheckImmRange(6, 0, false, false, false); }
- bool isn8_0Imm() const { return CheckImmRange(8, 0, false, false, false); }
bool isn1Const() const {
if (!isImm())
return false;
@@ -336,35 +359,18 @@ public:
return false;
return Value == -1;
}
-
- bool iss16_0Ext() const { return CheckImmRange(16 + 26, 0, true, true, true); }
- bool iss12_0Ext() const { return CheckImmRange(12 + 26, 0, true, true, true); }
- bool iss10_0Ext() const { return CheckImmRange(10 + 26, 0, true, true, true); }
- bool iss9_0Ext() const { return CheckImmRange(9 + 26, 0, true, true, true); }
- bool iss8_0Ext() const { return CheckImmRange(8 + 26, 0, true, true, true); }
- bool iss7_0Ext() const { return CheckImmRange(7 + 26, 0, true, true, true); }
- bool iss6_0Ext() const { return CheckImmRange(6 + 26, 0, true, true, true); }
- bool iss11_0Ext() const {
+ bool iss11_0Imm() const {
return CheckImmRange(11 + 26, 0, true, true, true);
}
- bool iss11_1Ext() const {
+ bool iss11_1Imm() const {
return CheckImmRange(11 + 26, 1, true, true, true);
}
- bool iss11_2Ext() const {
+ bool iss11_2Imm() const {
return CheckImmRange(11 + 26, 2, true, true, true);
}
- bool iss11_3Ext() const {
+ bool iss11_3Imm() const {
return CheckImmRange(11 + 26, 3, true, true, true);
}
-
- bool isu7_0Ext() const { return CheckImmRange(7 + 26, 0, false, true, true); }
- bool isu8_0Ext() const { return CheckImmRange(8 + 26, 0, false, true, true); }
- bool isu9_0Ext() const { return CheckImmRange(9 + 26, 0, false, true, true); }
- bool isu10_0Ext() const { return CheckImmRange(10 + 26, 0, false, true, true); }
- bool isu6_0Ext() const { return CheckImmRange(6 + 26, 0, false, true, true); }
- bool isu6_1Ext() const { return CheckImmRange(6 + 26, 1, false, true, true); }
- bool isu6_2Ext() const { return CheckImmRange(6 + 26, 2, false, true, true); }
- bool isu6_3Ext() const { return CheckImmRange(6 + 26, 3, false, true, true); }
bool isu32_0MustExt() const { return isImm(); }
void addRegOperands(MCInst &Inst, unsigned N) const {
@@ -392,188 +398,10 @@ public:
Inst.addOperand(MCOperand::createExpr(Expr));
}
- void addf32ExtOperands(MCInst &Inst, unsigned N) const {
- addImmOperands(Inst, N);
- }
-
- void adds32_0ImmOperands(MCInst &Inst, unsigned N) const {
- addSignedImmOperands(Inst, N);
- }
- void adds23_2ImmOperands(MCInst &Inst, unsigned N) const {
- addSignedImmOperands(Inst, N);
- }
- void adds8_0ImmOperands(MCInst &Inst, unsigned N) const {
- addSignedImmOperands(Inst, N);
- }
- void adds8_0Imm64Operands(MCInst &Inst, unsigned N) const {
- addSignedImmOperands(Inst, N);
- }
- void adds6_0ImmOperands(MCInst &Inst, unsigned N) const {
- addSignedImmOperands(Inst, N);
- }
- void adds4_0ImmOperands(MCInst &Inst, unsigned N) const {
- addSignedImmOperands(Inst, N);
- }
- void adds4_1ImmOperands(MCInst &Inst, unsigned N) const {
- addSignedImmOperands(Inst, N);
- }
- void adds4_2ImmOperands(MCInst &Inst, unsigned N) const {
- addSignedImmOperands(Inst, N);
- }
- void adds4_3ImmOperands(MCInst &Inst, unsigned N) const {
- addSignedImmOperands(Inst, N);
- }
- void adds3_0ImmOperands(MCInst &Inst, unsigned N) const {
- addSignedImmOperands(Inst, N);
- }
-
- void addu64_0ImmOperands(MCInst &Inst, unsigned N) const {
- addImmOperands(Inst, N);
- }
- void addu32_0ImmOperands(MCInst &Inst, unsigned N) const {
- addImmOperands(Inst, N);
- }
- void addu26_6ImmOperands(MCInst &Inst, unsigned N) const {
- addImmOperands(Inst, N);
- }
- void addu16_0ImmOperands(MCInst &Inst, unsigned N) const {
- addImmOperands(Inst, N);
- }
- void addu16_1ImmOperands(MCInst &Inst, unsigned N) const {
- addImmOperands(Inst, N);
- }
- void addu16_2ImmOperands(MCInst &Inst, unsigned N) const {
- addImmOperands(Inst, N);
- }
- void addu16_3ImmOperands(MCInst &Inst, unsigned N) const {
- addImmOperands(Inst, N);
- }
- void addu11_3ImmOperands(MCInst &Inst, unsigned N) const {
- addImmOperands(Inst, N);
- }
- void addu10_0ImmOperands(MCInst &Inst, unsigned N) const {
- addImmOperands(Inst, N);
- }
- void addu9_0ImmOperands(MCInst &Inst, unsigned N) const {
- addImmOperands(Inst, N);
- }
- void addu8_0ImmOperands(MCInst &Inst, unsigned N) const {
- addImmOperands(Inst, N);
- }
- void addu7_0ImmOperands(MCInst &Inst, unsigned N) const {
- addImmOperands(Inst, N);
- }
- void addu6_0ImmOperands(MCInst &Inst, unsigned N) const {
- addImmOperands(Inst, N);
- }
- void addu6_1ImmOperands(MCInst &Inst, unsigned N) const {
- addImmOperands(Inst, N);
- }
- void addu6_2ImmOperands(MCInst &Inst, unsigned N) const {
- addImmOperands(Inst, N);
- }
- void addu6_3ImmOperands(MCInst &Inst, unsigned N) const {
- addImmOperands(Inst, N);
- }
- void addu5_0ImmOperands(MCInst &Inst, unsigned N) const {
- addImmOperands(Inst, N);
- }
- void addu4_0ImmOperands(MCInst &Inst, unsigned N) const {
- addImmOperands(Inst, N);
- }
- void addu3_0ImmOperands(MCInst &Inst, unsigned N) const {
- addImmOperands(Inst, N);
- }
- void addu2_0ImmOperands(MCInst &Inst, unsigned N) const {
- addImmOperands(Inst, N);
- }
- void addu1_0ImmOperands(MCInst &Inst, unsigned N) const {
- addImmOperands(Inst, N);
- }
-
- void addm6_0ImmOperands(MCInst &Inst, unsigned N) const {
- addImmOperands(Inst, N);
- }
- void addn8_0ImmOperands(MCInst &Inst, unsigned N) const {
- addImmOperands(Inst, N);
- }
-
- void adds16_0ExtOperands(MCInst &Inst, unsigned N) const {
- addSignedImmOperands(Inst, N);
- }
- void adds12_0ExtOperands(MCInst &Inst, unsigned N) const {
- addSignedImmOperands(Inst, N);
- }
- void adds10_0ExtOperands(MCInst &Inst, unsigned N) const {
- addSignedImmOperands(Inst, N);
- }
- void adds9_0ExtOperands(MCInst &Inst, unsigned N) const {
- addSignedImmOperands(Inst, N);
- }
- void adds8_0ExtOperands(MCInst &Inst, unsigned N) const {
- addSignedImmOperands(Inst, N);
- }
- void adds6_0ExtOperands(MCInst &Inst, unsigned N) const {
- addSignedImmOperands(Inst, N);
- }
- void adds11_0ExtOperands(MCInst &Inst, unsigned N) const {
- addSignedImmOperands(Inst, N);
- }
- void adds11_1ExtOperands(MCInst &Inst, unsigned N) const {
- addSignedImmOperands(Inst, N);
- }
- void adds11_2ExtOperands(MCInst &Inst, unsigned N) const {
- addSignedImmOperands(Inst, N);
- }
- void adds11_3ExtOperands(MCInst &Inst, unsigned N) const {
- addSignedImmOperands(Inst, N);
- }
void addn1ConstOperands(MCInst &Inst, unsigned N) const {
addImmOperands(Inst, N);
}
- void addu7_0ExtOperands(MCInst &Inst, unsigned N) const {
- addImmOperands(Inst, N);
- }
- void addu8_0ExtOperands(MCInst &Inst, unsigned N) const {
- addImmOperands(Inst, N);
- }
- void addu9_0ExtOperands(MCInst &Inst, unsigned N) const {
- addImmOperands(Inst, N);
- }
- void addu10_0ExtOperands(MCInst &Inst, unsigned N) const {
- addImmOperands(Inst, N);
- }
- void addu6_0ExtOperands(MCInst &Inst, unsigned N) const {
- addImmOperands(Inst, N);
- }
- void addu6_1ExtOperands(MCInst &Inst, unsigned N) const {
- addImmOperands(Inst, N);
- }
- void addu6_2ExtOperands(MCInst &Inst, unsigned N) const {
- addImmOperands(Inst, N);
- }
- void addu6_3ExtOperands(MCInst &Inst, unsigned N) const {
- addImmOperands(Inst, N);
- }
- void addu32_0MustExtOperands(MCInst &Inst, unsigned N) const {
- addImmOperands(Inst, N);
- }
-
- void adds4_6ImmOperands(MCInst &Inst, unsigned N) const {
- assert(N == 1 && "Invalid number of operands!");
- const MCConstantExpr *CE =
- dyn_cast<MCConstantExpr>(&HexagonMCInstrInfo::getExpr(*getImm()));
- Inst.addOperand(MCOperand::createImm(CE->getValue() * 64));
- }
-
- void adds3_6ImmOperands(MCInst &Inst, unsigned N) const {
- assert(N == 1 && "Invalid number of operands!");
- const MCConstantExpr *CE =
- dyn_cast<MCConstantExpr>(&HexagonMCInstrInfo::getExpr(*getImm()));
- Inst.addOperand(MCOperand::createImm(CE->getValue() * 64));
- }
-
StringRef getToken() const {
assert(Kind == Token && "Invalid access!");
return StringRef(Tok.Data, Tok.Length);
@@ -749,10 +577,6 @@ bool HexagonAsmParser::matchBundleOptions() {
HexagonMCInstrInfo::setInnerLoop(MCB);
else if (Option.compare_lower("endloop1") == 0)
HexagonMCInstrInfo::setOuterLoop(MCB);
- else if (Option.compare_lower("mem_noshuf") == 0)
- HexagonMCInstrInfo::setMemReorderDisabled(MCB);
- else if (Option.compare_lower("mem_shuf") == 0)
- HexagonMCInstrInfo::setMemStoreReorderEnabled(MCB);
else
return true;
Lex();
@@ -770,8 +594,7 @@ void HexagonAsmParser::canonicalizeImmediates(MCInst &MCI) {
int64_t Value (I.getImm());
NewInst.addOperand(MCOperand::createExpr(HexagonMCExpr::create(
MCConstantExpr::create(Value, getContext()), getContext())));
- }
- else {
+ } else {
if (I.isExpr() && cast<HexagonMCExpr>(I.getExpr())->signMismatch() &&
WarnSignedMismatch)
Warning (MCI.getLoc(), "Signed/Unsigned mismatch");
@@ -1066,6 +889,9 @@ bool HexagonAsmParser::ParseDirectiveComm(bool IsLocal, SMLoc Loc) {
// validate register against architecture
bool HexagonAsmParser::RegisterMatchesArch(unsigned MatchNum) const {
+ if (HexagonMCRegisterClasses[Hexagon::V62RegsRegClassID].contains(MatchNum))
+ if (!getSTI().getFeatureBits()[Hexagon::ArchV62])
+ return false;
return true;
}
@@ -1171,11 +997,15 @@ bool HexagonAsmParser::parseOperand(OperandVector &Operands) {
bool HexagonAsmParser::isLabel(AsmToken &Token) {
MCAsmLexer &Lexer = getLexer();
AsmToken const &Second = Lexer.getTok();
- AsmToken Third = Lexer.peekTok();
+ AsmToken Third = Lexer.peekTok();
StringRef String = Token.getString();
if (Token.is(AsmToken::TokenKind::LCurly) ||
Token.is(AsmToken::TokenKind::RCurly))
return false;
+ // Special case for parsing vwhist256:sat.
+ if (String.lower() == "vwhist256" && Second.is(AsmToken::Colon) &&
+ Third.getString().lower() == "sat")
+ return false;
if (!Token.is(AsmToken::TokenKind::Identifier))
return true;
if (!matchRegister(String.lower()))
@@ -1756,8 +1586,8 @@ int HexagonAsmParser::processInstruction(MCInst &Inst,
TmpInst.setOpcode(Hexagon::L2_loadrdgp);
TmpInst.addOperand(MO_0);
- TmpInst.addOperand(
- MCOperand::createExpr(MCSymbolRefExpr::create(Sym, getContext())));
+ TmpInst.addOperand(MCOperand::createExpr(HexagonMCExpr::create(
+ MCSymbolRefExpr::create(Sym, getContext()), getContext())));
Inst = TmpInst;
}
}
@@ -2142,6 +1972,67 @@ int HexagonAsmParser::processInstruction(MCInst &Inst,
Inst = TmpInst;
break;
}
+ case Hexagon::PS_loadrubabs:
+ if (!HexagonMCInstrInfo::mustExtend(*Inst.getOperand(1).getExpr()))
+ Inst.setOpcode(Hexagon::L2_loadrubgp);
+ break;
+ case Hexagon::PS_loadrbabs:
+ if (!HexagonMCInstrInfo::mustExtend(*Inst.getOperand(1).getExpr()))
+ Inst.setOpcode(Hexagon::L2_loadrbgp);
+ break;
+ case Hexagon::PS_loadruhabs:
+ if (!HexagonMCInstrInfo::mustExtend(*Inst.getOperand(1).getExpr()))
+ Inst.setOpcode(Hexagon::L2_loadruhgp);
+ break;
+ case Hexagon::PS_loadrhabs:
+ if (!HexagonMCInstrInfo::mustExtend(*Inst.getOperand(1).getExpr()))
+ Inst.setOpcode(Hexagon::L2_loadrhgp);
+ break;
+ case Hexagon::PS_loadriabs:
+ if (!HexagonMCInstrInfo::mustExtend(*Inst.getOperand(1).getExpr()))
+ Inst.setOpcode(Hexagon::L2_loadrigp);
+ break;
+ case Hexagon::PS_loadrdabs:
+ if (!HexagonMCInstrInfo::mustExtend(*Inst.getOperand(1).getExpr()))
+ Inst.setOpcode(Hexagon::L2_loadrdgp);
+ break;
+ case Hexagon::PS_storerbabs:
+ if (!HexagonMCInstrInfo::mustExtend(*Inst.getOperand(0).getExpr()))
+ Inst.setOpcode(Hexagon::S2_storerbgp);
+ break;
+ case Hexagon::PS_storerhabs:
+ if (!HexagonMCInstrInfo::mustExtend(*Inst.getOperand(0).getExpr()))
+ Inst.setOpcode(Hexagon::S2_storerhgp);
+ break;
+ case Hexagon::PS_storerfabs:
+ if (!HexagonMCInstrInfo::mustExtend(*Inst.getOperand(0).getExpr()))
+ Inst.setOpcode(Hexagon::S2_storerfgp);
+ break;
+ case Hexagon::PS_storeriabs:
+ if (!HexagonMCInstrInfo::mustExtend(*Inst.getOperand(0).getExpr()))
+ Inst.setOpcode(Hexagon::S2_storerigp);
+ break;
+ case Hexagon::PS_storerdabs:
+ if (!HexagonMCInstrInfo::mustExtend(*Inst.getOperand(0).getExpr()))
+ Inst.setOpcode(Hexagon::S2_storerdgp);
+ break;
+ case Hexagon::PS_storerbnewabs:
+ if (!HexagonMCInstrInfo::mustExtend(*Inst.getOperand(0).getExpr()))
+ Inst.setOpcode(Hexagon::S2_storerbnewgp);
+ break;
+ case Hexagon::PS_storerhnewabs:
+ if (!HexagonMCInstrInfo::mustExtend(*Inst.getOperand(0).getExpr()))
+ Inst.setOpcode(Hexagon::S2_storerhnewgp);
+ break;
+ case Hexagon::PS_storerinewabs:
+ if (!HexagonMCInstrInfo::mustExtend(*Inst.getOperand(0).getExpr()))
+ Inst.setOpcode(Hexagon::S2_storerinewgp);
+ break;
+ case Hexagon::A2_zxtb: {
+ Inst.setOpcode(Hexagon::A2_andir);
+ Inst.addOperand(MCOperand::createExpr(MCConstantExpr::create(255, Context)));
+ break;
+ }
} // switch
return Match_Success;
diff --git a/lib/Target/Hexagon/BitTracker.cpp b/lib/Target/Hexagon/BitTracker.cpp
index 963fb99ce09b..61d3630ac095 100644
--- a/lib/Target/Hexagon/BitTracker.cpp
+++ b/lib/Target/Hexagon/BitTracker.cpp
@@ -317,6 +317,15 @@ bool BT::RegisterCell::operator== (const RegisterCell &RC) const {
return true;
}
+BT::RegisterCell &BT::RegisterCell::regify(unsigned R) {
+ for (unsigned i = 0, n = width(); i < n; ++i) {
+ const BitValue &V = Bits[i];
+ if (V.Type == BitValue::Ref && V.RefI.Reg == 0)
+ Bits[i].RefI = BitRef(R, i);
+ }
+ return *this;
+}
+
uint16_t BT::MachineEvaluator::getRegBitWidth(const RegisterRef &RR) const {
// The general problem is with finding a register class that corresponds
// to a given reference reg:sub. There can be several such classes, and
@@ -378,12 +387,7 @@ void BT::MachineEvaluator::putCell(const RegisterRef &RR, RegisterCell RC,
return;
assert(RR.Sub == 0 && "Unexpected sub-register in definition");
// Eliminate all ref-to-reg-0 bit values: replace them with "self".
- for (unsigned i = 0, n = RC.width(); i < n; ++i) {
- const BitValue &V = RC[i];
- if (V.Type == BitValue::Ref && V.RefI.Reg == 0)
- RC[i].RefI = BitRef(RR.Reg, i);
- }
- M[RR.Reg] = RC;
+ M[RR.Reg] = RC.regify(RR.Reg);
}
// Check if the cell represents a compile-time integer value.
diff --git a/lib/Target/Hexagon/BitTracker.h b/lib/Target/Hexagon/BitTracker.h
index 48c5f2266acf..a547b34e852f 100644
--- a/lib/Target/Hexagon/BitTracker.h
+++ b/lib/Target/Hexagon/BitTracker.h
@@ -283,6 +283,9 @@ struct BitTracker::RegisterCell {
return !operator==(RC);
}
+ // Replace the ref-to-reg-0 bit values with the given register.
+ RegisterCell &regify(unsigned R);
+
// Generate a "ref" cell for the corresponding register. In the resulting
// cell each bit will be described as being the same as the corresponding
// bit in register Reg (i.e. the cell is "defined" by register Reg).
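
A self-contained model of what regify does, using simplified stand-ins for BitTracker's BitValue and BitRef types (an assumption, not the real declarations): every bit recorded as a reference to register 0 becomes a reference to bit i of register R, i.e. to itself once the cell is installed as R's value.

    #include <vector>

    struct BitRef { unsigned Reg, Pos; };
    struct BitValue {
      enum Kind { Zero, One, Ref } Type;
      BitRef RefI;
    };

    void regify(std::vector<BitValue> &Bits, unsigned R) {
      for (unsigned i = 0, n = Bits.size(); i < n; ++i)
        if (Bits[i].Type == BitValue::Ref && Bits[i].RefI.Reg == 0) {
          Bits[i].RefI.Reg = R;  // rewrite ref-to-reg-0 as a self-reference
          Bits[i].RefI.Pos = i;  // ...to bit i of register R
        }
    }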
diff --git a/lib/Target/Hexagon/CMakeLists.txt b/lib/Target/Hexagon/CMakeLists.txt
index 2c527d598628..2f3dd3326fcc 100644
--- a/lib/Target/Hexagon/CMakeLists.txt
+++ b/lib/Target/Hexagon/CMakeLists.txt
@@ -35,6 +35,7 @@ add_llvm_target(HexagonCodeGen
HexagonInstrInfo.cpp
HexagonISelDAGToDAG.cpp
HexagonISelLowering.cpp
+ HexagonLoopIdiomRecognition.cpp
HexagonMachineFunctionInfo.cpp
HexagonMachineScheduler.cpp
HexagonMCInstLower.cpp
@@ -58,10 +59,10 @@ add_llvm_target(HexagonCodeGen
RDFDeadCode.cpp
RDFGraph.cpp
RDFLiveness.cpp
- )
+ RDFRegisters.cpp
+)
add_subdirectory(AsmParser)
add_subdirectory(TargetInfo)
add_subdirectory(MCTargetDesc)
add_subdirectory(Disassembler)
-
diff --git a/lib/Target/Hexagon/Disassembler/HexagonDisassembler.cpp b/lib/Target/Hexagon/Disassembler/HexagonDisassembler.cpp
index c05fbc1d7756..ae15ed0e9240 100644
--- a/lib/Target/Hexagon/Disassembler/HexagonDisassembler.cpp
+++ b/lib/Target/Hexagon/Disassembler/HexagonDisassembler.cpp
@@ -57,11 +57,38 @@ public:
ArrayRef<uint8_t> Bytes, uint64_t Address,
raw_ostream &VStream,
raw_ostream &CStream) const override;
-
- void adjustExtendedInstructions(MCInst &MCI, MCInst const &MCB) const;
void addSubinstOperands(MCInst *MI, unsigned opcode, unsigned inst) const;
};
+namespace {
+ uint32_t fullValue(MCInstrInfo const &MCII, MCInst &MCB, MCInst &MI,
+ int64_t Value) {
+ MCInst const *Extender = HexagonMCInstrInfo::extenderForIndex(
+ MCB, HexagonMCInstrInfo::bundleSize(MCB));
+ if (!Extender || MI.size() != HexagonMCInstrInfo::getExtendableOp(MCII, MI))
+ return Value;
+ unsigned Alignment = HexagonMCInstrInfo::getExtentAlignment(MCII, MI);
+ uint32_t Lower6 = static_cast<uint32_t>(Value >> Alignment) & 0x3f;
+ int64_t Bits;
+ bool Success = Extender->getOperand(0).getExpr()->evaluateAsAbsolute(Bits);
+ assert(Success); (void)Success;
+ uint32_t Upper26 = static_cast<uint32_t>(Bits);
+ uint32_t Operand = Upper26 | Lower6;
+ return Operand;
+ }
+ HexagonDisassembler const &disassembler(void const *Decoder) {
+ return *static_cast<HexagonDisassembler const *>(Decoder);
+ }
+ template <size_t T>
+ void signedDecoder(MCInst &MI, unsigned tmp, const void *Decoder) {
+ HexagonDisassembler const &Disassembler = disassembler(Decoder);
+ int64_t FullValue =
+ fullValue(*Disassembler.MCII, **Disassembler.CurrentBundle, MI,
+ SignExtend64<T>(tmp));
+ int64_t Extended = SignExtend64<32>(FullValue);
+ HexagonMCInstrInfo::addConstant(MI, Extended, Disassembler.getContext());
+ }
+}
} // end anonymous namespace
// Forward declare these because the auto-generated code will reference them.
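
A standalone model of the reassembly fullValue performs, assuming (as the encoding guarantees) that the extender's evaluated value already has its low six bits clear: the constant extender supplies the upper 26 bits, and the extendable operand supplies its low six bits taken above the alignment.

    #include <cassert>
    #include <cstdint>

    uint32_t reassemble(uint32_t upper26, int64_t value, unsigned alignment) {
      uint32_t lower6 = static_cast<uint32_t>(value >> alignment) & 0x3f;
      return upper26 | lower6;  // extender bits | operand's low six bits
    }

    int main() {
      // ##0x123456: the extender carries 0x123440, the insn field carries 0x16.
      assert(reassemble(0x123440u, 0x16, /*alignment=*/0) == 0x123456u);
      return 0;
    }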
@@ -70,6 +97,10 @@ public:
static DecodeStatus DecodeIntRegsRegisterClass(MCInst &Inst, unsigned RegNo,
uint64_t Address,
const void *Decoder);
+static DecodeStatus DecodeGeneralSubRegsRegisterClass(MCInst &Inst,
+ unsigned RegNo,
+ uint64_t Address,
+ const void *Decoder);
static DecodeStatus DecodeIntRegsLow8RegisterClass(MCInst &Inst, unsigned RegNo,
uint64_t Address,
const void *Decoder);
@@ -79,6 +110,9 @@ static DecodeStatus DecodeVectorRegsRegisterClass(MCInst &Inst, unsigned RegNo,
static DecodeStatus DecodeDoubleRegsRegisterClass(MCInst &Inst, unsigned RegNo,
uint64_t Address,
const void *Decoder);
+static DecodeStatus
+DecodeGeneralDoubleLow8RegsRegisterClass(MCInst &Inst, unsigned RegNo,
+ uint64_t Address, const void *Decoder);
static DecodeStatus DecodeVecDblRegsRegisterClass(MCInst &Inst, unsigned RegNo,
uint64_t Address,
const void *Decoder);
@@ -98,31 +132,10 @@ static DecodeStatus DecodeCtrRegs64RegisterClass(MCInst &Inst, unsigned RegNo,
uint64_t Address,
const void *Decoder);
-static DecodeStatus decodeSpecial(MCInst &MI, uint32_t insn);
-static DecodeStatus decodeImmext(MCInst &MI, uint32_t insn,
- void const *Decoder);
-
-static unsigned GetSubinstOpcode(unsigned IClass, unsigned inst, unsigned &op,
- raw_ostream &os);
-
-static unsigned getRegFromSubinstEncoding(unsigned encoded_reg);
-
static DecodeStatus unsignedImmDecoder(MCInst &MI, unsigned tmp,
uint64_t Address, const void *Decoder);
-static DecodeStatus s16_0ImmDecoder(MCInst &MI, unsigned tmp, uint64_t Address,
- const void *Decoder);
-static DecodeStatus s12_0ImmDecoder(MCInst &MI, unsigned tmp, uint64_t Address,
- const void *Decoder);
-static DecodeStatus s11_0ImmDecoder(MCInst &MI, unsigned tmp, uint64_t Address,
- const void *Decoder);
-static DecodeStatus s11_1ImmDecoder(MCInst &MI, unsigned tmp, uint64_t Address,
- const void *Decoder);
-static DecodeStatus s11_2ImmDecoder(MCInst &MI, unsigned tmp, uint64_t Address,
- const void *Decoder);
-static DecodeStatus s11_3ImmDecoder(MCInst &MI, unsigned tmp, uint64_t Address,
- const void *Decoder);
-static DecodeStatus s10_0ImmDecoder(MCInst &MI, unsigned tmp, uint64_t Address,
- const void *Decoder);
+static DecodeStatus s32_0ImmDecoder(MCInst &MI, unsigned tmp,
+ uint64_t /*Address*/, const void *Decoder);
static DecodeStatus s8_0ImmDecoder(MCInst &MI, unsigned tmp, uint64_t Address,
const void *Decoder);
static DecodeStatus s6_0ImmDecoder(MCInst &MI, unsigned tmp, uint64_t Address,
@@ -135,13 +148,12 @@ static DecodeStatus s4_2ImmDecoder(MCInst &MI, unsigned tmp, uint64_t Address,
const void *Decoder);
static DecodeStatus s4_3ImmDecoder(MCInst &MI, unsigned tmp, uint64_t Address,
const void *Decoder);
-static DecodeStatus s4_6ImmDecoder(MCInst &MI, unsigned tmp, uint64_t Address,
- const void *Decoder);
-static DecodeStatus s3_6ImmDecoder(MCInst &MI, unsigned tmp, uint64_t Address,
+static DecodeStatus s3_0ImmDecoder(MCInst &MI, unsigned tmp, uint64_t Address,
const void *Decoder);
static DecodeStatus brtargetDecoder(MCInst &MI, unsigned tmp, uint64_t Address,
const void *Decoder);
+#include "HexagonDepDecoders.h"
#include "HexagonGenDisassemblerTables.inc"
static MCDisassembler *createHexagonDisassembler(const Target &T,
@@ -175,20 +187,31 @@ DecodeStatus HexagonDisassembler::getInstruction(MCInst &MI, uint64_t &Size,
Size += HEXAGON_INSTR_SIZE;
Bytes = Bytes.slice(HEXAGON_INSTR_SIZE);
}
- if(Result == MCDisassembler::Fail)
+ if (Result == MCDisassembler::Fail)
return Result;
- HexagonMCChecker Checker (*MCII, STI, MI, MI, *getContext().getRegisterInfo());
- if(!Checker.check())
+ if (Size > HEXAGON_MAX_PACKET_SIZE)
+ return MCDisassembler::Fail;
+ HexagonMCChecker Checker(*MCII, STI, MI, MI, *getContext().getRegisterInfo());
+ if (!Checker.check())
return MCDisassembler::Fail;
return MCDisassembler::Success;
}
-static HexagonDisassembler const &disassembler(void const *Decoder) {
- return *static_cast<HexagonDisassembler const *>(Decoder);
+namespace {
+void adjustDuplex(MCInst &MI, MCContext &Context) {
+ switch (MI.getOpcode()) {
+ case Hexagon::SA1_setin1:
+ MI.insert(MI.begin() + 1,
+ MCOperand::createExpr(MCConstantExpr::create(-1, Context)));
+ break;
+ case Hexagon::SA1_dec:
+ MI.insert(MI.begin() + 2,
+ MCOperand::createExpr(MCConstantExpr::create(-1, Context)));
+ break;
+ default:
+ break;
+ }
}
-
-static MCContext &contextFromDecoder(void const *Decoder) {
- return disassembler(Decoder).getContext();
}
DecodeStatus HexagonDisassembler::getSingleInstruction(
@@ -196,8 +219,7 @@ DecodeStatus HexagonDisassembler::getSingleInstruction(
raw_ostream &os, raw_ostream &cs, bool &Complete) const {
assert(Bytes.size() >= HEXAGON_INSTR_SIZE);
- uint32_t Instruction =
- (Bytes[3] << 24) | (Bytes[2] << 16) | (Bytes[1] << 8) | (Bytes[0] << 0);
+ uint32_t Instruction = support::endian::read32le(Bytes.data());
auto BundleSize = HexagonMCInstrInfo::bundleSize(MCB);
if ((Instruction & HexagonII::INST_PARSE_MASK) ==
@@ -210,103 +232,92 @@ DecodeStatus HexagonDisassembler::getSingleInstruction(
return DecodeStatus::Fail;
}
- DecodeStatus Result = DecodeStatus::Success;
+ MCInst const *Extender = HexagonMCInstrInfo::extenderForIndex(
+ MCB, HexagonMCInstrInfo::bundleSize(MCB));
+
+ DecodeStatus Result = DecodeStatus::Fail;
if ((Instruction & HexagonII::INST_PARSE_MASK) ==
HexagonII::INST_PARSE_DUPLEX) {
- // Determine the instruction class of each instruction in the duplex.
- unsigned duplexIClass, IClassLow, IClassHigh;
-
+ unsigned duplexIClass;
+ uint8_t const *DecodeLow, *DecodeHigh;
duplexIClass = ((Instruction >> 28) & 0xe) | ((Instruction >> 13) & 0x1);
switch (duplexIClass) {
default:
return MCDisassembler::Fail;
case 0:
- IClassLow = HexagonII::HSIG_L1;
- IClassHigh = HexagonII::HSIG_L1;
+ DecodeLow = DecoderTableSUBINSN_L132;
+ DecodeHigh = DecoderTableSUBINSN_L132;
break;
case 1:
- IClassLow = HexagonII::HSIG_L2;
- IClassHigh = HexagonII::HSIG_L1;
+ DecodeLow = DecoderTableSUBINSN_L232;
+ DecodeHigh = DecoderTableSUBINSN_L132;
break;
case 2:
- IClassLow = HexagonII::HSIG_L2;
- IClassHigh = HexagonII::HSIG_L2;
+ DecodeLow = DecoderTableSUBINSN_L232;
+ DecodeHigh = DecoderTableSUBINSN_L232;
break;
case 3:
- IClassLow = HexagonII::HSIG_A;
- IClassHigh = HexagonII::HSIG_A;
+ DecodeLow = DecoderTableSUBINSN_A32;
+ DecodeHigh = DecoderTableSUBINSN_A32;
break;
case 4:
- IClassLow = HexagonII::HSIG_L1;
- IClassHigh = HexagonII::HSIG_A;
+ DecodeLow = DecoderTableSUBINSN_L132;
+ DecodeHigh = DecoderTableSUBINSN_A32;
break;
case 5:
- IClassLow = HexagonII::HSIG_L2;
- IClassHigh = HexagonII::HSIG_A;
+ DecodeLow = DecoderTableSUBINSN_L232;
+ DecodeHigh = DecoderTableSUBINSN_A32;
break;
case 6:
- IClassLow = HexagonII::HSIG_S1;
- IClassHigh = HexagonII::HSIG_A;
+ DecodeLow = DecoderTableSUBINSN_S132;
+ DecodeHigh = DecoderTableSUBINSN_A32;
break;
case 7:
- IClassLow = HexagonII::HSIG_S2;
- IClassHigh = HexagonII::HSIG_A;
+ DecodeLow = DecoderTableSUBINSN_S232;
+ DecodeHigh = DecoderTableSUBINSN_A32;
break;
case 8:
- IClassLow = HexagonII::HSIG_S1;
- IClassHigh = HexagonII::HSIG_L1;
+ DecodeLow = DecoderTableSUBINSN_S132;
+ DecodeHigh = DecoderTableSUBINSN_L132;
break;
case 9:
- IClassLow = HexagonII::HSIG_S1;
- IClassHigh = HexagonII::HSIG_L2;
+ DecodeLow = DecoderTableSUBINSN_S132;
+ DecodeHigh = DecoderTableSUBINSN_L232;
break;
case 10:
- IClassLow = HexagonII::HSIG_S1;
- IClassHigh = HexagonII::HSIG_S1;
+ DecodeLow = DecoderTableSUBINSN_S132;
+ DecodeHigh = DecoderTableSUBINSN_S132;
break;
case 11:
- IClassLow = HexagonII::HSIG_S2;
- IClassHigh = HexagonII::HSIG_S1;
+ DecodeLow = DecoderTableSUBINSN_S232;
+ DecodeHigh = DecoderTableSUBINSN_S132;
break;
case 12:
- IClassLow = HexagonII::HSIG_S2;
- IClassHigh = HexagonII::HSIG_L1;
+ DecodeLow = DecoderTableSUBINSN_S232;
+ DecodeHigh = DecoderTableSUBINSN_L132;
break;
case 13:
- IClassLow = HexagonII::HSIG_S2;
- IClassHigh = HexagonII::HSIG_L2;
+ DecodeLow = DecoderTableSUBINSN_S232;
+ DecodeHigh = DecoderTableSUBINSN_L232;
break;
case 14:
- IClassLow = HexagonII::HSIG_S2;
- IClassHigh = HexagonII::HSIG_S2;
+ DecodeLow = DecoderTableSUBINSN_S232;
+ DecodeHigh = DecoderTableSUBINSN_S232;
break;
}
-
- // Set the MCInst to be a duplex instruction. Which one doesn't matter.
- MI.setOpcode(Hexagon::DuplexIClass0);
-
- // Decode each instruction in the duplex.
- // Create an MCInst for each instruction.
- unsigned instLow = Instruction & 0x1fff;
- unsigned instHigh = (Instruction >> 16) & 0x1fff;
- unsigned opLow;
- if (GetSubinstOpcode(IClassLow, instLow, opLow, os) !=
- MCDisassembler::Success)
- return MCDisassembler::Fail;
- unsigned opHigh;
- if (GetSubinstOpcode(IClassHigh, instHigh, opHigh, os) !=
- MCDisassembler::Success)
- return MCDisassembler::Fail;
+ MI.setOpcode(Hexagon::DuplexIClass0 + duplexIClass);
MCInst *MILow = new (getContext()) MCInst;
- MILow->setOpcode(opLow);
MCInst *MIHigh = new (getContext()) MCInst;
- MIHigh->setOpcode(opHigh);
- addSubinstOperands(MILow, opLow, instLow);
- addSubinstOperands(MIHigh, opHigh, instHigh);
- // see ConvertToSubInst() in
- // lib/Target/Hexagon/MCTargetDesc/HexagonMCDuplexInfo.cpp
-
- // Add the duplex instruction MCInsts as operands to the passed in MCInst.
+ Result = decodeInstruction(DecodeLow, *MILow, Instruction & 0x1fff, Address,
+ this, STI);
+ if (Result != DecodeStatus::Success)
+ return DecodeStatus::Fail;
+ adjustDuplex(*MILow, getContext());
+ Result = decodeInstruction(
+ DecodeHigh, *MIHigh, (Instruction >> 16) & 0x1fff, Address, this, STI);
+ if (Result != DecodeStatus::Success)
+ return DecodeStatus::Fail;
+ adjustDuplex(*MIHigh, getContext());
MCOperand OPLow = MCOperand::createInst(MILow);
MCOperand OPHigh = MCOperand::createInst(MIHigh);
MI.addOperand(OPLow);
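
A minimal sketch of the iclass extraction driving the table selection above: the four-bit duplex class is split across the word, three bits from [31:29] and one from bit 13; classes 0 through 14 select the (low, high) sub-instruction decoder-table pair, and 15 falls to the default case and fails.

    #include <cassert>
    #include <cstdint>

    unsigned duplexIClass(uint32_t insn) {
      // Bits [31:29] land in positions [3:1]; bit 13 supplies position 0.
      return ((insn >> 28) & 0xe) | ((insn >> 13) & 0x1);
    }

    int main() {
      assert(duplexIClass(0x00000000u) == 0);   // class 0: L1/L1 pair
      assert(duplexIClass(0xe0002000u) == 15);  // reserved: decode fails
      return 0;
    }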
@@ -316,34 +327,23 @@ DecodeStatus HexagonDisassembler::getSingleInstruction(
if ((Instruction & HexagonII::INST_PARSE_MASK) ==
HexagonII::INST_PARSE_PACKET_END)
Complete = true;
- // Calling the auto-generated decoder function.
- Result =
- decodeInstruction(DecoderTable32, MI, Instruction, Address, this, STI);
- // If a, "standard" insn isn't found check special cases.
- if (MCDisassembler::Success != Result ||
- MI.getOpcode() == Hexagon::A4_ext) {
- Result = decodeImmext(MI, Instruction, this);
- if (MCDisassembler::Success != Result) {
- Result = decodeSpecial(MI, Instruction);
- }
- } else {
- // If the instruction is a compound instruction, register values will
- // follow the duplex model, so the register values in the MCInst are
- // incorrect. If the instruction is a compound, loop through the
- // operands and change registers appropriately.
- if (HexagonMCInstrInfo::getType(*MCII, MI) == HexagonII::TypeCOMPOUND) {
- for (MCInst::iterator i = MI.begin(), last = MI.end(); i < last; ++i) {
- if (i->isReg()) {
- unsigned reg = i->getReg() - Hexagon::R0;
- i->setReg(getRegFromSubinstEncoding(reg));
- }
- }
- }
- }
+ if (Extender != nullptr)
+ Result = decodeInstruction(DecoderTableMustExtend32, MI, Instruction,
+ Address, this, STI);
+
+ if (Result != MCDisassembler::Success)
+ Result = decodeInstruction(DecoderTable32, MI, Instruction, Address, this,
+ STI);
+
+ if (Result != MCDisassembler::Success &&
+ STI.getFeatureBits()[Hexagon::ExtensionHVX])
+ Result = decodeInstruction(DecoderTableEXT_mmvec32, MI, Instruction,
+ Address, this, STI);
+
}
- switch(MI.getOpcode()) {
+ switch (MI.getOpcode()) {
case Hexagon::J4_cmpeqn1_f_jumpnv_nt:
case Hexagon::J4_cmpeqn1_f_jumpnv_t:
case Hexagon::J4_cmpeqn1_fp0_jump_nt:
@@ -368,7 +368,8 @@ DecodeStatus HexagonDisassembler::getSingleInstruction(
case Hexagon::J4_cmpgtn1_tp0_jump_t:
case Hexagon::J4_cmpgtn1_tp1_jump_nt:
case Hexagon::J4_cmpgtn1_tp1_jump_t:
- MI.insert(MI.begin() + 1, MCOperand::createExpr(MCConstantExpr::create(-1, getContext())));
+ MI.insert(MI.begin() + 1,
+ MCOperand::createExpr(MCConstantExpr::create(-1, getContext())));
break;
default:
break;
@@ -423,13 +424,10 @@ DecodeStatus HexagonDisassembler::getSingleInstruction(
return MCDisassembler::Fail;
}
- adjustExtendedInstructions(MI, MCB);
- MCInst const *Extender =
- HexagonMCInstrInfo::extenderForIndex(MCB,
- HexagonMCInstrInfo::bundleSize(MCB));
- if(Extender != nullptr) {
- MCInst const & Inst = HexagonMCInstrInfo::isDuplex(*MCII, MI) ?
- *MI.getOperand(1).getInst() : MI;
+ if (Extender != nullptr) {
+ MCInst const &Inst = HexagonMCInstrInfo::isDuplex(*MCII, MI)
+ ? *MI.getOperand(1).getInst()
+ : MI;
if (!HexagonMCInstrInfo::isExtendable(*MCII, Inst) &&
!HexagonMCInstrInfo::isExtended(*MCII, Inst))
return MCDisassembler::Fail;
@@ -437,68 +435,6 @@ DecodeStatus HexagonDisassembler::getSingleInstruction(
return Result;
}
-void HexagonDisassembler::adjustExtendedInstructions(MCInst &MCI,
- MCInst const &MCB) const {
- if (!HexagonMCInstrInfo::hasExtenderForIndex(
- MCB, HexagonMCInstrInfo::bundleSize(MCB))) {
- unsigned opcode;
- // This code is used by the disassembler to disambiguate between GP
- // relative and absolute addressing instructions since they both have
- // same encoding bits. However, an absolute addressing instruction must
- // follow an immediate extender. Disassembler alwaus select absolute
- // addressing instructions first and uses this code to change them into
- // GP relative instruction in the absence of the corresponding immediate
- // extender.
- switch (MCI.getOpcode()) {
- case Hexagon::PS_storerbabs:
- opcode = Hexagon::S2_storerbgp;
- break;
- case Hexagon::PS_storerhabs:
- opcode = Hexagon::S2_storerhgp;
- break;
- case Hexagon::PS_storerfabs:
- opcode = Hexagon::S2_storerfgp;
- break;
- case Hexagon::PS_storeriabs:
- opcode = Hexagon::S2_storerigp;
- break;
- case Hexagon::PS_storerbnewabs:
- opcode = Hexagon::S2_storerbnewgp;
- break;
- case Hexagon::PS_storerhnewabs:
- opcode = Hexagon::S2_storerhnewgp;
- break;
- case Hexagon::PS_storerinewabs:
- opcode = Hexagon::S2_storerinewgp;
- break;
- case Hexagon::PS_storerdabs:
- opcode = Hexagon::S2_storerdgp;
- break;
- case Hexagon::PS_loadrbabs:
- opcode = Hexagon::L2_loadrbgp;
- break;
- case Hexagon::PS_loadrubabs:
- opcode = Hexagon::L2_loadrubgp;
- break;
- case Hexagon::PS_loadrhabs:
- opcode = Hexagon::L2_loadrhgp;
- break;
- case Hexagon::PS_loadruhabs:
- opcode = Hexagon::L2_loadruhgp;
- break;
- case Hexagon::PS_loadriabs:
- opcode = Hexagon::L2_loadrigp;
- break;
- case Hexagon::PS_loadrdabs:
- opcode = Hexagon::L2_loadrdgp;
- break;
- default:
- opcode = MCI.getOpcode();
- }
- MCI.setOpcode(opcode);
- }
-}
-
static DecodeStatus DecodeRegisterClass(MCInst &Inst, unsigned RegNo,
ArrayRef<MCPhysReg> Table) {
if (RegNo < Table.size()) {
@@ -530,6 +466,20 @@ static DecodeStatus DecodeIntRegsRegisterClass(MCInst &Inst, unsigned RegNo,
return DecodeRegisterClass(Inst, RegNo, IntRegDecoderTable);
}
+static DecodeStatus DecodeGeneralSubRegsRegisterClass(MCInst &Inst,
+ unsigned RegNo,
+ uint64_t Address,
+ const void *Decoder) {
+ static const MCPhysReg GeneralSubRegDecoderTable[] = {
+ Hexagon::R0, Hexagon::R1, Hexagon::R2, Hexagon::R3,
+ Hexagon::R4, Hexagon::R5, Hexagon::R6, Hexagon::R7,
+ Hexagon::R16, Hexagon::R17, Hexagon::R18, Hexagon::R19,
+ Hexagon::R20, Hexagon::R21, Hexagon::R22, Hexagon::R23,
+ };
+
+ return DecodeRegisterClass(Inst, RegNo, GeneralSubRegDecoderTable);
+}
+
static DecodeStatus DecodeVectorRegsRegisterClass(MCInst &Inst, unsigned RegNo,
uint64_t /*Address*/,
const void *Decoder) {
@@ -557,6 +507,15 @@ static DecodeStatus DecodeDoubleRegsRegisterClass(MCInst &Inst, unsigned RegNo,
return DecodeRegisterClass(Inst, RegNo >> 1, DoubleRegDecoderTable);
}
+static DecodeStatus DecodeGeneralDoubleLow8RegsRegisterClass(
+ MCInst &Inst, unsigned RegNo, uint64_t /*Address*/, const void *Decoder) {
+ static const MCPhysReg GeneralDoubleLow8RegDecoderTable[] = {
+ Hexagon::D0, Hexagon::D1, Hexagon::D2, Hexagon::D3,
+ Hexagon::D8, Hexagon::D9, Hexagon::D10, Hexagon::D11};
+
+ return DecodeRegisterClass(Inst, RegNo, GeneralDoubleLow8RegDecoderTable);
+}
+
static DecodeStatus DecodeVecDblRegsRegisterClass(MCInst &Inst, unsigned RegNo,
uint64_t /*Address*/,
const void *Decoder) {
@@ -590,17 +549,23 @@ static DecodeStatus DecodeVecPredRegsRegisterClass(MCInst &Inst, unsigned RegNo,
static DecodeStatus DecodeCtrRegsRegisterClass(MCInst &Inst, unsigned RegNo,
uint64_t /*Address*/,
const void *Decoder) {
+ using namespace Hexagon;
static const MCPhysReg CtrlRegDecoderTable[] = {
- Hexagon::SA0, Hexagon::LC0, Hexagon::SA1, Hexagon::LC1,
- Hexagon::P3_0, Hexagon::C5, Hexagon::C6, Hexagon::C7,
- Hexagon::USR, Hexagon::PC, Hexagon::UGP, Hexagon::GP,
- Hexagon::CS0, Hexagon::CS1, Hexagon::UPCL, Hexagon::UPC
+ /* 0 */ SA0, LC0, SA1, LC1,
+ /* 4 */ P3_0, C5, C6, C7,
+ /* 8 */ USR, PC, UGP, GP,
+ /* 12 */ CS0, CS1, UPCYCLELO, UPCYCLEHI,
+ /* 16 */ FRAMELIMIT, FRAMEKEY, PKTCOUNTLO, PKTCOUNTHI,
+ /* 20 */ 0, 0, 0, 0,
+ /* 24 */ 0, 0, 0, 0,
+ /* 28 */ 0, 0, UTIMERLO, UTIMERHI
};
if (RegNo >= array_lengthof(CtrlRegDecoderTable))
return MCDisassembler::Fail;
- if (CtrlRegDecoderTable[RegNo] == Hexagon::NoRegister)
+ static_assert(NoRegister == 0, "Expecting NoRegister to be 0");
+ if (CtrlRegDecoderTable[RegNo] == NoRegister)
return MCDisassembler::Fail;
unsigned Register = CtrlRegDecoderTable[RegNo];
@@ -611,20 +576,23 @@ static DecodeStatus DecodeCtrRegsRegisterClass(MCInst &Inst, unsigned RegNo,
static DecodeStatus DecodeCtrRegs64RegisterClass(MCInst &Inst, unsigned RegNo,
uint64_t /*Address*/,
const void *Decoder) {
+ using namespace Hexagon;
static const MCPhysReg CtrlReg64DecoderTable[] = {
- Hexagon::C1_0, Hexagon::NoRegister,
- Hexagon::C3_2, Hexagon::NoRegister,
- Hexagon::C7_6, Hexagon::NoRegister,
- Hexagon::C9_8, Hexagon::NoRegister,
- Hexagon::C11_10, Hexagon::NoRegister,
- Hexagon::CS, Hexagon::NoRegister,
- Hexagon::UPC, Hexagon::NoRegister
+ /* 0 */ C1_0, 0, C3_2, 0,
+ /* 4 */ C5_4, 0, C7_6, 0,
+ /* 8 */ C9_8, 0, C11_10, 0,
+ /* 12 */ CS, 0, UPCYCLE, 0,
+ /* 16 */ C17_16, 0, PKTCOUNT, 0,
+ /* 20 */ 0, 0, 0, 0,
+ /* 24 */ 0, 0, 0, 0,
+ /* 28 */ 0, 0, UTIMER, 0
};
if (RegNo >= array_lengthof(CtrlReg64DecoderTable))
return MCDisassembler::Fail;
- if (CtrlReg64DecoderTable[RegNo] == Hexagon::NoRegister)
+ static_assert(NoRegister == 0, "Expecting NoRegister to be 0");
+ if (CtrlReg64DecoderTable[RegNo] == NoRegister)
return MCDisassembler::Fail;
unsigned Register = CtrlReg64DecoderTable[RegNo];
@@ -650,132 +618,23 @@ static DecodeStatus DecodeModRegsRegisterClass(MCInst &Inst, unsigned RegNo,
return MCDisassembler::Success;
}
-static uint32_t fullValue(MCInstrInfo const &MCII, MCInst &MCB, MCInst &MI,
- int64_t Value) {
- MCInst const *Extender = HexagonMCInstrInfo::extenderForIndex(
- MCB, HexagonMCInstrInfo::bundleSize(MCB));
- if(!Extender || MI.size() != HexagonMCInstrInfo::getExtendableOp(MCII, MI))
- return Value;
- unsigned Alignment = HexagonMCInstrInfo::getExtentAlignment(MCII, MI);
- uint32_t Lower6 = static_cast<uint32_t>(Value >> Alignment) & 0x3f;
- int64_t Bits;
- bool Success = Extender->getOperand(0).getExpr()->evaluateAsAbsolute(Bits);
- assert(Success);(void)Success;
- uint32_t Upper26 = static_cast<uint32_t>(Bits);
- uint32_t Operand = Upper26 | Lower6;
- return Operand;
-}
-
-template <size_t T>
-static void signedDecoder(MCInst &MI, unsigned tmp, const void *Decoder) {
- HexagonDisassembler const &Disassembler = disassembler(Decoder);
- int64_t FullValue = fullValue(*Disassembler.MCII,
- **Disassembler.CurrentBundle,
- MI, SignExtend64<T>(tmp));
- int64_t Extended = SignExtend64<32>(FullValue);
- HexagonMCInstrInfo::addConstant(MI, Extended,
- Disassembler.getContext());
-}
-
static DecodeStatus unsignedImmDecoder(MCInst &MI, unsigned tmp,
uint64_t /*Address*/,
const void *Decoder) {
HexagonDisassembler const &Disassembler = disassembler(Decoder);
- int64_t FullValue = fullValue(*Disassembler.MCII,
- **Disassembler.CurrentBundle,
- MI, tmp);
+ int64_t FullValue =
+ fullValue(*Disassembler.MCII, **Disassembler.CurrentBundle, MI, tmp);
assert(FullValue >= 0 && "Negative in unsigned decoder");
HexagonMCInstrInfo::addConstant(MI, FullValue, Disassembler.getContext());
return MCDisassembler::Success;
}
-static DecodeStatus s16_0ImmDecoder(MCInst &MI, unsigned tmp,
- uint64_t /*Address*/, const void *Decoder) {
- signedDecoder<16>(MI, tmp, Decoder);
- return MCDisassembler::Success;
-}
-
-static DecodeStatus s12_0ImmDecoder(MCInst &MI, unsigned tmp,
- uint64_t /*Address*/, const void *Decoder) {
- signedDecoder<12>(MI, tmp, Decoder);
- return MCDisassembler::Success;
-}
-
-static DecodeStatus s11_0ImmDecoder(MCInst &MI, unsigned tmp,
- uint64_t /*Address*/, const void *Decoder) {
- signedDecoder<11>(MI, tmp, Decoder);
- return MCDisassembler::Success;
-}
-
-static DecodeStatus s11_1ImmDecoder(MCInst &MI, unsigned tmp,
- uint64_t /*Address*/, const void *Decoder) {
- HexagonMCInstrInfo::addConstant(MI, SignExtend64<12>(tmp), contextFromDecoder(Decoder));
- return MCDisassembler::Success;
-}
-
-static DecodeStatus s11_2ImmDecoder(MCInst &MI, unsigned tmp,
+static DecodeStatus s32_0ImmDecoder(MCInst &MI, unsigned tmp,
uint64_t /*Address*/, const void *Decoder) {
- signedDecoder<13>(MI, tmp, Decoder);
- return MCDisassembler::Success;
-}
-
-static DecodeStatus s11_3ImmDecoder(MCInst &MI, unsigned tmp,
- uint64_t /*Address*/, const void *Decoder) {
- signedDecoder<14>(MI, tmp, Decoder);
- return MCDisassembler::Success;
-}
-
-static DecodeStatus s10_0ImmDecoder(MCInst &MI, unsigned tmp,
- uint64_t /*Address*/, const void *Decoder) {
- signedDecoder<10>(MI, tmp, Decoder);
- return MCDisassembler::Success;
-}
-
-static DecodeStatus s8_0ImmDecoder(MCInst &MI, unsigned tmp, uint64_t /*Address*/,
- const void *Decoder) {
- signedDecoder<8>(MI, tmp, Decoder);
- return MCDisassembler::Success;
-}
-
-static DecodeStatus s6_0ImmDecoder(MCInst &MI, unsigned tmp,
- uint64_t /*Address*/, const void *Decoder) {
- signedDecoder<6>(MI, tmp, Decoder);
- return MCDisassembler::Success;
-}
-
-static DecodeStatus s4_0ImmDecoder(MCInst &MI, unsigned tmp,
- uint64_t /*Address*/, const void *Decoder) {
- signedDecoder<4>(MI, tmp, Decoder);
- return MCDisassembler::Success;
-}
-
-static DecodeStatus s4_1ImmDecoder(MCInst &MI, unsigned tmp,
- uint64_t /*Address*/, const void *Decoder) {
- signedDecoder<5>(MI, tmp, Decoder);
- return MCDisassembler::Success;
-}
-
-static DecodeStatus s4_2ImmDecoder(MCInst &MI, unsigned tmp,
- uint64_t /*Address*/, const void *Decoder) {
- signedDecoder<6>(MI, tmp, Decoder);
- return MCDisassembler::Success;
-}
-
-static DecodeStatus s4_3ImmDecoder(MCInst &MI, unsigned tmp,
- uint64_t /*Address*/, const void *Decoder) {
- signedDecoder<7>(MI, tmp, Decoder);
- return MCDisassembler::Success;
-}
-
-static DecodeStatus s4_6ImmDecoder(MCInst &MI, unsigned tmp,
- uint64_t /*Address*/, const void *Decoder) {
- signedDecoder<10>(MI, tmp, Decoder);
- return MCDisassembler::Success;
-}
-
-static DecodeStatus s3_6ImmDecoder(MCInst &MI, unsigned tmp,
- uint64_t /*Address*/, const void *Decoder) {
- signedDecoder<19>(MI, tmp, Decoder);
+ HexagonDisassembler const &Disassembler = disassembler(Decoder);
+ unsigned Bits = HexagonMCInstrInfo::getExtentBits(*Disassembler.MCII, MI);
+ tmp = SignExtend64(tmp, Bits);
+ signedDecoder<32>(MI, tmp, Decoder);
return MCDisassembler::Success;
}
@@ -787,838 +646,13 @@ static DecodeStatus brtargetDecoder(MCInst &MI, unsigned tmp, uint64_t Address,
// r13_2 is not extendable, so if there are no extent bits, it's r13_2
if (Bits == 0)
Bits = 15;
- uint32_t FullValue = fullValue(*Disassembler.MCII,
- **Disassembler.CurrentBundle,
- MI, SignExtend64(tmp, Bits));
+ uint32_t FullValue =
+ fullValue(*Disassembler.MCII, **Disassembler.CurrentBundle, MI,
+ SignExtend64(tmp, Bits));
int64_t Extended = SignExtend64<32>(FullValue) + Address;
- if (!Disassembler.tryAddingSymbolicOperand(MI, Extended, Address, true,
- 0, 4))
+ if (!Disassembler.tryAddingSymbolicOperand(MI, Extended, Address, true, 0, 4))
HexagonMCInstrInfo::addConstant(MI, Extended, Disassembler.getContext());
return MCDisassembler::Success;
}
-// Addressing-mode-dependent load/store opcode map.
-//   - If an insn is preceded by an extender, the address is absolute.
-//      - memw(##symbol) = r0
-//   - If an insn is not preceded by an extender, the address is GP-relative.
-//      - memw(gp + #symbol) = r0
-// Note that the entries must be ordered in descending order of their
-// opcodes; the first-match rule this ordering serves is sketched after the
-// tables below.
-// HexagonII::INST_ICLASS_ST
-static const unsigned int StoreConditionalOpcodeData[][2] = {
- {S4_pstorerdfnew_abs, 0xafc02084},
- {S4_pstorerdtnew_abs, 0xafc02080},
- {S4_pstorerdf_abs, 0xafc00084},
- {S4_pstorerdt_abs, 0xafc00080},
- {S4_pstorerinewfnew_abs, 0xafa03084},
- {S4_pstorerinewtnew_abs, 0xafa03080},
- {S4_pstorerhnewfnew_abs, 0xafa02884},
- {S4_pstorerhnewtnew_abs, 0xafa02880},
- {S4_pstorerbnewfnew_abs, 0xafa02084},
- {S4_pstorerbnewtnew_abs, 0xafa02080},
- {S4_pstorerinewf_abs, 0xafa01084},
- {S4_pstorerinewt_abs, 0xafa01080},
- {S4_pstorerhnewf_abs, 0xafa00884},
- {S4_pstorerhnewt_abs, 0xafa00880},
- {S4_pstorerbnewf_abs, 0xafa00084},
- {S4_pstorerbnewt_abs, 0xafa00080},
- {S4_pstorerifnew_abs, 0xaf802084},
- {S4_pstoreritnew_abs, 0xaf802080},
- {S4_pstorerif_abs, 0xaf800084},
- {S4_pstorerit_abs, 0xaf800080},
- {S4_pstorerhfnew_abs, 0xaf402084},
- {S4_pstorerhtnew_abs, 0xaf402080},
- {S4_pstorerhf_abs, 0xaf400084},
- {S4_pstorerht_abs, 0xaf400080},
- {S4_pstorerbfnew_abs, 0xaf002084},
- {S4_pstorerbtnew_abs, 0xaf002080},
- {S4_pstorerbf_abs, 0xaf000084},
- {S4_pstorerbt_abs, 0xaf000080}};
-// HexagonII::INST_ICLASS_LD
-
-// HexagonII::INST_ICLASS_LD_ST_2
-static unsigned int LoadStoreOpcodeData[][2] = {{PS_loadrdabs, 0x49c00000},
- {PS_loadriabs, 0x49800000},
- {PS_loadruhabs, 0x49600000},
- {PS_loadrhabs, 0x49400000},
- {PS_loadrubabs, 0x49200000},
- {PS_loadrbabs, 0x49000000},
- {PS_storerdabs, 0x48c00000},
- {PS_storerinewabs, 0x48a01000},
- {PS_storerhnewabs, 0x48a00800},
- {PS_storerbnewabs, 0x48a00000},
- {PS_storeriabs, 0x48800000},
- {PS_storerfabs, 0x48600000},
- {PS_storerhabs, 0x48400000},
- {PS_storerbabs, 0x48000000}};
-static const size_t NumCondS = array_lengthof(StoreConditionalOpcodeData);
-static const size_t NumLS = array_lengthof(LoadStoreOpcodeData);
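
Since each table entry above is matched with (insn & pattern) == pattern,
i.e. every bit set in the pattern must also be set in the instruction word,
a single word can satisfy more than one entry; scanning in descending opcode
order makes the most specific pattern win. A small illustration of that
first-match rule (the pattern values are arbitrary stand-ins):

    #include <cstdint>
    #include <cstdio>

    // Illustrative (opcode, pattern) pairs; the second entry's bits are a
    // strict subset of the first entry's, as in the tables above.
    static const uint32_t Patterns[][2] = {
        {2, 0xafc02084}, // more specific: tested first
        {1, 0xafc00080}, // subset pattern: tested last
    };

    static unsigned matchFirst(uint32_t Insn) {
      for (const auto &P : Patterns)
        if ((Insn & P[1]) == P[1])
          return P[0];
      return 0; // no match
    }

    int main() {
      printf("%u\n", matchFirst(0xafc02084)); // satisfies both; prints 2
      printf("%u\n", matchFirst(0xafc000c0)); // only the subset; prints 1
      return 0;
    }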
-
-static DecodeStatus decodeSpecial(MCInst &MI, uint32_t insn) {
- unsigned MachineOpcode = 0;
- unsigned LLVMOpcode = 0;
-
- if ((insn & HexagonII::INST_ICLASS_MASK) == HexagonII::INST_ICLASS_ST) {
- for (size_t i = 0; i < NumCondS; ++i) {
- if ((insn & StoreConditionalOpcodeData[i][1]) ==
- StoreConditionalOpcodeData[i][1]) {
- MachineOpcode = StoreConditionalOpcodeData[i][1];
- LLVMOpcode = StoreConditionalOpcodeData[i][0];
- break;
- }
- }
- }
- if ((insn & HexagonII::INST_ICLASS_MASK) == HexagonII::INST_ICLASS_LD_ST_2) {
- for (size_t i = 0; i < NumLS; ++i) {
- if ((insn & LoadStoreOpcodeData[i][1]) == LoadStoreOpcodeData[i][1]) {
- MachineOpcode = LoadStoreOpcodeData[i][1];
- LLVMOpcode = LoadStoreOpcodeData[i][0];
- break;
- }
- }
- }
-
- if (MachineOpcode) {
- unsigned Value = 0;
- unsigned shift = 0;
- MI.setOpcode(LLVMOpcode);
- // Remove the parse bits from the insn.
- insn &= ~HexagonII::INST_PARSE_MASK;
-
- switch (LLVMOpcode) {
- default:
-      return MCDisassembler::Fail;
-
- case Hexagon::S4_pstorerdf_abs:
- case Hexagon::S4_pstorerdt_abs:
- case Hexagon::S4_pstorerdfnew_abs:
- case Hexagon::S4_pstorerdtnew_abs:
- // op: Pv
- Value = insn & UINT64_C(3);
- DecodePredRegsRegisterClass(MI, Value, 0, nullptr);
- // op: u6
- Value = (insn >> 12) & UINT64_C(48);
- Value |= (insn >> 3) & UINT64_C(15);
- MI.addOperand(MCOperand::createImm(Value));
- // op: Rtt
- Value = (insn >> 8) & UINT64_C(31);
- DecodeDoubleRegsRegisterClass(MI, Value, 0, nullptr);
- break;
-
- case Hexagon::S4_pstorerbnewf_abs:
- case Hexagon::S4_pstorerbnewt_abs:
- case Hexagon::S4_pstorerbnewfnew_abs:
- case Hexagon::S4_pstorerbnewtnew_abs:
- case Hexagon::S4_pstorerhnewf_abs:
- case Hexagon::S4_pstorerhnewt_abs:
- case Hexagon::S4_pstorerhnewfnew_abs:
- case Hexagon::S4_pstorerhnewtnew_abs:
- case Hexagon::S4_pstorerinewf_abs:
- case Hexagon::S4_pstorerinewt_abs:
- case Hexagon::S4_pstorerinewfnew_abs:
- case Hexagon::S4_pstorerinewtnew_abs:
- // op: Pv
- Value = insn & UINT64_C(3);
- DecodePredRegsRegisterClass(MI, Value, 0, nullptr);
- // op: u6
- Value = (insn >> 12) & UINT64_C(48);
- Value |= (insn >> 3) & UINT64_C(15);
- MI.addOperand(MCOperand::createImm(Value));
- // op: Nt
- Value = (insn >> 8) & UINT64_C(7);
- DecodeIntRegsRegisterClass(MI, Value, 0, nullptr);
- break;
-
- case Hexagon::S4_pstorerbf_abs:
- case Hexagon::S4_pstorerbt_abs:
- case Hexagon::S4_pstorerbfnew_abs:
- case Hexagon::S4_pstorerbtnew_abs:
- case Hexagon::S4_pstorerhf_abs:
- case Hexagon::S4_pstorerht_abs:
- case Hexagon::S4_pstorerhfnew_abs:
- case Hexagon::S4_pstorerhtnew_abs:
- case Hexagon::S4_pstorerif_abs:
- case Hexagon::S4_pstorerit_abs:
- case Hexagon::S4_pstorerifnew_abs:
- case Hexagon::S4_pstoreritnew_abs:
- // op: Pv
- Value = insn & UINT64_C(3);
- DecodePredRegsRegisterClass(MI, Value, 0, nullptr);
- // op: u6
- Value = (insn >> 12) & UINT64_C(48);
- Value |= (insn >> 3) & UINT64_C(15);
- MI.addOperand(MCOperand::createImm(Value));
- // op: Rt
- Value = (insn >> 8) & UINT64_C(31);
- DecodeIntRegsRegisterClass(MI, Value, 0, nullptr);
- break;
-
- case Hexagon::L4_ploadrdf_abs:
- case Hexagon::L4_ploadrdt_abs:
- case Hexagon::L4_ploadrdfnew_abs:
- case Hexagon::L4_ploadrdtnew_abs:
- // op: Rdd
- Value = insn & UINT64_C(31);
- DecodeDoubleRegsRegisterClass(MI, Value, 0, nullptr);
- // op: Pt
- Value = ((insn >> 9) & UINT64_C(3));
- DecodePredRegsRegisterClass(MI, Value, 0, nullptr);
- // op: u6
- Value = ((insn >> 15) & UINT64_C(62));
- Value |= ((insn >> 8) & UINT64_C(1));
- MI.addOperand(MCOperand::createImm(Value));
- break;
-
- case Hexagon::L4_ploadrbf_abs:
- case Hexagon::L4_ploadrbt_abs:
- case Hexagon::L4_ploadrbfnew_abs:
- case Hexagon::L4_ploadrbtnew_abs:
- case Hexagon::L4_ploadrhf_abs:
- case Hexagon::L4_ploadrht_abs:
- case Hexagon::L4_ploadrhfnew_abs:
- case Hexagon::L4_ploadrhtnew_abs:
- case Hexagon::L4_ploadrubf_abs:
- case Hexagon::L4_ploadrubt_abs:
- case Hexagon::L4_ploadrubfnew_abs:
- case Hexagon::L4_ploadrubtnew_abs:
- case Hexagon::L4_ploadruhf_abs:
- case Hexagon::L4_ploadruht_abs:
- case Hexagon::L4_ploadruhfnew_abs:
- case Hexagon::L4_ploadruhtnew_abs:
- case Hexagon::L4_ploadrif_abs:
- case Hexagon::L4_ploadrit_abs:
- case Hexagon::L4_ploadrifnew_abs:
- case Hexagon::L4_ploadritnew_abs:
- // op: Rd
- Value = insn & UINT64_C(31);
- DecodeIntRegsRegisterClass(MI, Value, 0, nullptr);
- // op: Pt
- Value = (insn >> 9) & UINT64_C(3);
- DecodePredRegsRegisterClass(MI, Value, 0, nullptr);
- // op: u6
- Value = (insn >> 15) & UINT64_C(62);
- Value |= (insn >> 8) & UINT64_C(1);
- MI.addOperand(MCOperand::createImm(Value));
- break;
-
- // op: g16_2
-    case Hexagon::PS_loadriabs:
- ++shift;
- // op: g16_1
- case Hexagon::PS_loadrhabs:
- case Hexagon::PS_loadruhabs:
- ++shift;
- // op: g16_0
- case Hexagon::PS_loadrbabs:
- case Hexagon::PS_loadrubabs:
- // op: Rd
- Value |= insn & UINT64_C(31);
- DecodeIntRegsRegisterClass(MI, Value, 0, nullptr);
- Value = (insn >> 11) & UINT64_C(49152);
- Value |= (insn >> 7) & UINT64_C(15872);
- Value |= (insn >> 5) & UINT64_C(511);
- MI.addOperand(MCOperand::createImm(Value << shift));
- break;
-
- case Hexagon::PS_loadrdabs:
- Value = insn & UINT64_C(31);
- DecodeDoubleRegsRegisterClass(MI, Value, 0, nullptr);
- Value = (insn >> 11) & UINT64_C(49152);
- Value |= (insn >> 7) & UINT64_C(15872);
- Value |= (insn >> 5) & UINT64_C(511);
- MI.addOperand(MCOperand::createImm(Value << 3));
- break;
-
- case Hexagon::PS_storerdabs:
- // op: g16_3
- Value = (insn >> 11) & UINT64_C(49152);
- Value |= (insn >> 7) & UINT64_C(15872);
- Value |= (insn >> 5) & UINT64_C(256);
- Value |= insn & UINT64_C(255);
- MI.addOperand(MCOperand::createImm(Value << 3));
- // op: Rtt
- Value = (insn >> 8) & UINT64_C(31);
- DecodeDoubleRegsRegisterClass(MI, Value, 0, nullptr);
- break;
-
- // op: g16_2
- case Hexagon::PS_storerinewabs:
- ++shift;
- // op: g16_1
- case Hexagon::PS_storerhnewabs:
- ++shift;
- // op: g16_0
- case Hexagon::PS_storerbnewabs:
- Value = (insn >> 11) & UINT64_C(49152);
- Value |= (insn >> 7) & UINT64_C(15872);
- Value |= (insn >> 5) & UINT64_C(256);
- Value |= insn & UINT64_C(255);
- MI.addOperand(MCOperand::createImm(Value << shift));
- // op: Nt
- Value = (insn >> 8) & UINT64_C(7);
- DecodeIntRegsRegisterClass(MI, Value, 0, nullptr);
- break;
-
- // op: g16_2
- case Hexagon::PS_storeriabs:
- ++shift;
- // op: g16_1
- case Hexagon::PS_storerhabs:
- case Hexagon::PS_storerfabs:
- ++shift;
- // op: g16_0
- case Hexagon::PS_storerbabs:
- Value = (insn >> 11) & UINT64_C(49152);
- Value |= (insn >> 7) & UINT64_C(15872);
- Value |= (insn >> 5) & UINT64_C(256);
- Value |= insn & UINT64_C(255);
- MI.addOperand(MCOperand::createImm(Value << shift));
- // op: Rt
- Value = (insn >> 8) & UINT64_C(31);
- DecodeIntRegsRegisterClass(MI, Value, 0, nullptr);
- break;
- }
- return MCDisassembler::Success;
- }
- return MCDisassembler::Fail;
-}
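
The g16_0/g16_1/g16_2 cases above use deliberate case fall-through: each
more coarsely scaled variant increments 'shift' before reaching the shared
extraction code, so the assembled immediate is Value << shift. A compact
sketch of the idiom on made-up access kinds:

    #include <cassert>
    #include <cstdint>

    enum Kind { Byte, Half, Word };

    // Scale a raw immediate by the access size: <<0, <<1 or <<2.
    static uint64_t scaleImm(Kind K, uint64_t Raw) {
      unsigned Shift = 0;
      switch (K) {
      case Word:   // 4-byte access
        ++Shift;   // fall through
      case Half:   // 2-byte access
        ++Shift;   // fall through
      case Byte:   // 1-byte access
        return Raw << Shift;
      }
      return Raw;
    }

    int main() {
      assert(scaleImm(Byte, 5) == 5);
      assert(scaleImm(Half, 5) == 10);
      assert(scaleImm(Word, 5) == 20);
      return 0;
    }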
-
-static DecodeStatus decodeImmext(MCInst &MI, uint32_t insn,
- void const *Decoder) {
-  // Instruction Class for a constant extender: bits 31:28 = 0b0000
- if ((~insn & 0xf0000000) == 0xf0000000) {
- unsigned Value;
- // 27:16 High 12 bits of 26-bit extender.
- Value = (insn & 0x0fff0000) << 4;
- // 13:0 Low 14 bits of 26-bit extender.
- Value |= ((insn & 0x3fff) << 6);
- MI.setOpcode(Hexagon::A4_ext);
- HexagonMCInstrInfo::addConstant(MI, Value, contextFromDecoder(Decoder));
- return MCDisassembler::Success;
- }
- return MCDisassembler::Fail;
-}
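
decodeImmext stitches the 26-bit extender payload out of two instruction
fields and leaves the low six bits zero, since the extender supplies bits
31:6 of the final constant. A worked example of that reassembly (the input
word is fabricated):

    #include <cassert>
    #include <cstdint>

    // Rebuild the extender constant exactly as decodeImmext does:
    // insn[27:16] lands at bits 31:20, insn[13:0] at bits 19:6.
    static uint32_t extenderValue(uint32_t Insn) {
      uint32_t Value = (Insn & 0x0fff0000) << 4; // high 12 of the 26 bits
      Value |= (Insn & 0x3fff) << 6;             // low 14 of the 26 bits
      return Value;
    }

    int main() {
      uint32_t Insn = (0xabcu << 16) | 0x1234u;
      uint32_t V = extenderValue(Insn);
      assert(V == ((0xabcu << 20) | (0x1234u << 6)));
      assert((V & 0x3f) == 0); // low 6 bits come from the extended insn
      return 0;
    }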
-
-// These values are from HexagonGenMCCodeEmitter.inc and HexagonIsetDx.td
-enum subInstBinaryValues {
- SA1_addi_BITS = 0x0000,
- SA1_addi_MASK = 0x1800,
- SA1_addrx_BITS = 0x1800,
- SA1_addrx_MASK = 0x1f00,
- SA1_addsp_BITS = 0x0c00,
- SA1_addsp_MASK = 0x1c00,
- SA1_and1_BITS = 0x1200,
- SA1_and1_MASK = 0x1f00,
- SA1_clrf_BITS = 0x1a70,
- SA1_clrf_MASK = 0x1e70,
- SA1_clrfnew_BITS = 0x1a50,
- SA1_clrfnew_MASK = 0x1e70,
- SA1_clrt_BITS = 0x1a60,
- SA1_clrt_MASK = 0x1e70,
- SA1_clrtnew_BITS = 0x1a40,
- SA1_clrtnew_MASK = 0x1e70,
- SA1_cmpeqi_BITS = 0x1900,
- SA1_cmpeqi_MASK = 0x1f00,
- SA1_combine0i_BITS = 0x1c00,
- SA1_combine0i_MASK = 0x1d18,
- SA1_combine1i_BITS = 0x1c08,
- SA1_combine1i_MASK = 0x1d18,
- SA1_combine2i_BITS = 0x1c10,
- SA1_combine2i_MASK = 0x1d18,
- SA1_combine3i_BITS = 0x1c18,
- SA1_combine3i_MASK = 0x1d18,
- SA1_combinerz_BITS = 0x1d08,
- SA1_combinerz_MASK = 0x1d08,
- SA1_combinezr_BITS = 0x1d00,
- SA1_combinezr_MASK = 0x1d08,
- SA1_dec_BITS = 0x1300,
- SA1_dec_MASK = 0x1f00,
- SA1_inc_BITS = 0x1100,
- SA1_inc_MASK = 0x1f00,
- SA1_seti_BITS = 0x0800,
- SA1_seti_MASK = 0x1c00,
- SA1_setin1_BITS = 0x1a00,
- SA1_setin1_MASK = 0x1e40,
- SA1_sxtb_BITS = 0x1500,
- SA1_sxtb_MASK = 0x1f00,
- SA1_sxth_BITS = 0x1400,
- SA1_sxth_MASK = 0x1f00,
- SA1_tfr_BITS = 0x1000,
- SA1_tfr_MASK = 0x1f00,
- SA1_zxtb_BITS = 0x1700,
- SA1_zxtb_MASK = 0x1f00,
- SA1_zxth_BITS = 0x1600,
- SA1_zxth_MASK = 0x1f00,
- SL1_loadri_io_BITS = 0x0000,
- SL1_loadri_io_MASK = 0x1000,
- SL1_loadrub_io_BITS = 0x1000,
- SL1_loadrub_io_MASK = 0x1000,
- SL2_deallocframe_BITS = 0x1f00,
- SL2_deallocframe_MASK = 0x1fc0,
- SL2_jumpr31_BITS = 0x1fc0,
- SL2_jumpr31_MASK = 0x1fc4,
- SL2_jumpr31_f_BITS = 0x1fc5,
- SL2_jumpr31_f_MASK = 0x1fc7,
- SL2_jumpr31_fnew_BITS = 0x1fc7,
- SL2_jumpr31_fnew_MASK = 0x1fc7,
- SL2_jumpr31_t_BITS = 0x1fc4,
- SL2_jumpr31_t_MASK = 0x1fc7,
- SL2_jumpr31_tnew_BITS = 0x1fc6,
- SL2_jumpr31_tnew_MASK = 0x1fc7,
- SL2_loadrb_io_BITS = 0x1000,
- SL2_loadrb_io_MASK = 0x1800,
- SL2_loadrd_sp_BITS = 0x1e00,
- SL2_loadrd_sp_MASK = 0x1f00,
- SL2_loadrh_io_BITS = 0x0000,
- SL2_loadrh_io_MASK = 0x1800,
- SL2_loadri_sp_BITS = 0x1c00,
- SL2_loadri_sp_MASK = 0x1e00,
- SL2_loadruh_io_BITS = 0x0800,
- SL2_loadruh_io_MASK = 0x1800,
- SL2_return_BITS = 0x1f40,
- SL2_return_MASK = 0x1fc4,
- SL2_return_f_BITS = 0x1f45,
- SL2_return_f_MASK = 0x1fc7,
- SL2_return_fnew_BITS = 0x1f47,
- SL2_return_fnew_MASK = 0x1fc7,
- SL2_return_t_BITS = 0x1f44,
- SL2_return_t_MASK = 0x1fc7,
- SL2_return_tnew_BITS = 0x1f46,
- SL2_return_tnew_MASK = 0x1fc7,
- SS1_storeb_io_BITS = 0x1000,
- SS1_storeb_io_MASK = 0x1000,
- SS1_storew_io_BITS = 0x0000,
- SS1_storew_io_MASK = 0x1000,
- SS2_allocframe_BITS = 0x1c00,
- SS2_allocframe_MASK = 0x1e00,
- SS2_storebi0_BITS = 0x1200,
- SS2_storebi0_MASK = 0x1f00,
- SS2_storebi1_BITS = 0x1300,
- SS2_storebi1_MASK = 0x1f00,
- SS2_stored_sp_BITS = 0x0a00,
- SS2_stored_sp_MASK = 0x1e00,
- SS2_storeh_io_BITS = 0x0000,
- SS2_storeh_io_MASK = 0x1800,
- SS2_storew_sp_BITS = 0x0800,
- SS2_storew_sp_MASK = 0x1e00,
- SS2_storewi0_BITS = 0x1000,
- SS2_storewi0_MASK = 0x1f00,
- SS2_storewi1_BITS = 0x1100,
- SS2_storewi1_MASK = 0x1f00
-};
-static unsigned GetSubinstOpcode(unsigned IClass, unsigned inst, unsigned &op,
- raw_ostream &os) {
- switch (IClass) {
- case HexagonII::HSIG_L1:
- if ((inst & SL1_loadri_io_MASK) == SL1_loadri_io_BITS)
- op = Hexagon::SL1_loadri_io;
- else if ((inst & SL1_loadrub_io_MASK) == SL1_loadrub_io_BITS)
- op = Hexagon::SL1_loadrub_io;
- else {
- os << "<unknown subinstruction>";
- return MCDisassembler::Fail;
- }
- break;
- case HexagonII::HSIG_L2:
- if ((inst & SL2_deallocframe_MASK) == SL2_deallocframe_BITS)
- op = Hexagon::SL2_deallocframe;
- else if ((inst & SL2_jumpr31_MASK) == SL2_jumpr31_BITS)
- op = Hexagon::SL2_jumpr31;
- else if ((inst & SL2_jumpr31_f_MASK) == SL2_jumpr31_f_BITS)
- op = Hexagon::SL2_jumpr31_f;
- else if ((inst & SL2_jumpr31_fnew_MASK) == SL2_jumpr31_fnew_BITS)
- op = Hexagon::SL2_jumpr31_fnew;
- else if ((inst & SL2_jumpr31_t_MASK) == SL2_jumpr31_t_BITS)
- op = Hexagon::SL2_jumpr31_t;
- else if ((inst & SL2_jumpr31_tnew_MASK) == SL2_jumpr31_tnew_BITS)
- op = Hexagon::SL2_jumpr31_tnew;
- else if ((inst & SL2_loadrb_io_MASK) == SL2_loadrb_io_BITS)
- op = Hexagon::SL2_loadrb_io;
- else if ((inst & SL2_loadrd_sp_MASK) == SL2_loadrd_sp_BITS)
- op = Hexagon::SL2_loadrd_sp;
- else if ((inst & SL2_loadrh_io_MASK) == SL2_loadrh_io_BITS)
- op = Hexagon::SL2_loadrh_io;
- else if ((inst & SL2_loadri_sp_MASK) == SL2_loadri_sp_BITS)
- op = Hexagon::SL2_loadri_sp;
- else if ((inst & SL2_loadruh_io_MASK) == SL2_loadruh_io_BITS)
- op = Hexagon::SL2_loadruh_io;
- else if ((inst & SL2_return_MASK) == SL2_return_BITS)
- op = Hexagon::SL2_return;
- else if ((inst & SL2_return_f_MASK) == SL2_return_f_BITS)
- op = Hexagon::SL2_return_f;
- else if ((inst & SL2_return_fnew_MASK) == SL2_return_fnew_BITS)
- op = Hexagon::SL2_return_fnew;
- else if ((inst & SL2_return_t_MASK) == SL2_return_t_BITS)
- op = Hexagon::SL2_return_t;
- else if ((inst & SL2_return_tnew_MASK) == SL2_return_tnew_BITS)
- op = Hexagon::SL2_return_tnew;
- else {
- os << "<unknown subinstruction>";
- return MCDisassembler::Fail;
- }
- break;
- case HexagonII::HSIG_A:
- if ((inst & SA1_addi_MASK) == SA1_addi_BITS)
- op = Hexagon::SA1_addi;
- else if ((inst & SA1_addrx_MASK) == SA1_addrx_BITS)
- op = Hexagon::SA1_addrx;
- else if ((inst & SA1_addsp_MASK) == SA1_addsp_BITS)
- op = Hexagon::SA1_addsp;
- else if ((inst & SA1_and1_MASK) == SA1_and1_BITS)
- op = Hexagon::SA1_and1;
- else if ((inst & SA1_clrf_MASK) == SA1_clrf_BITS)
- op = Hexagon::SA1_clrf;
- else if ((inst & SA1_clrfnew_MASK) == SA1_clrfnew_BITS)
- op = Hexagon::SA1_clrfnew;
- else if ((inst & SA1_clrt_MASK) == SA1_clrt_BITS)
- op = Hexagon::SA1_clrt;
- else if ((inst & SA1_clrtnew_MASK) == SA1_clrtnew_BITS)
- op = Hexagon::SA1_clrtnew;
- else if ((inst & SA1_cmpeqi_MASK) == SA1_cmpeqi_BITS)
- op = Hexagon::SA1_cmpeqi;
- else if ((inst & SA1_combine0i_MASK) == SA1_combine0i_BITS)
- op = Hexagon::SA1_combine0i;
- else if ((inst & SA1_combine1i_MASK) == SA1_combine1i_BITS)
- op = Hexagon::SA1_combine1i;
- else if ((inst & SA1_combine2i_MASK) == SA1_combine2i_BITS)
- op = Hexagon::SA1_combine2i;
- else if ((inst & SA1_combine3i_MASK) == SA1_combine3i_BITS)
- op = Hexagon::SA1_combine3i;
- else if ((inst & SA1_combinerz_MASK) == SA1_combinerz_BITS)
- op = Hexagon::SA1_combinerz;
- else if ((inst & SA1_combinezr_MASK) == SA1_combinezr_BITS)
- op = Hexagon::SA1_combinezr;
- else if ((inst & SA1_dec_MASK) == SA1_dec_BITS)
- op = Hexagon::SA1_dec;
- else if ((inst & SA1_inc_MASK) == SA1_inc_BITS)
- op = Hexagon::SA1_inc;
- else if ((inst & SA1_seti_MASK) == SA1_seti_BITS)
- op = Hexagon::SA1_seti;
- else if ((inst & SA1_setin1_MASK) == SA1_setin1_BITS)
- op = Hexagon::SA1_setin1;
- else if ((inst & SA1_sxtb_MASK) == SA1_sxtb_BITS)
- op = Hexagon::SA1_sxtb;
- else if ((inst & SA1_sxth_MASK) == SA1_sxth_BITS)
- op = Hexagon::SA1_sxth;
- else if ((inst & SA1_tfr_MASK) == SA1_tfr_BITS)
- op = Hexagon::SA1_tfr;
- else if ((inst & SA1_zxtb_MASK) == SA1_zxtb_BITS)
- op = Hexagon::SA1_zxtb;
- else if ((inst & SA1_zxth_MASK) == SA1_zxth_BITS)
- op = Hexagon::SA1_zxth;
- else {
- os << "<unknown subinstruction>";
- return MCDisassembler::Fail;
- }
- break;
- case HexagonII::HSIG_S1:
- if ((inst & SS1_storeb_io_MASK) == SS1_storeb_io_BITS)
- op = Hexagon::SS1_storeb_io;
- else if ((inst & SS1_storew_io_MASK) == SS1_storew_io_BITS)
- op = Hexagon::SS1_storew_io;
- else {
- os << "<unknown subinstruction>";
- return MCDisassembler::Fail;
- }
- break;
- case HexagonII::HSIG_S2:
- if ((inst & SS2_allocframe_MASK) == SS2_allocframe_BITS)
- op = Hexagon::SS2_allocframe;
- else if ((inst & SS2_storebi0_MASK) == SS2_storebi0_BITS)
- op = Hexagon::SS2_storebi0;
- else if ((inst & SS2_storebi1_MASK) == SS2_storebi1_BITS)
- op = Hexagon::SS2_storebi1;
- else if ((inst & SS2_stored_sp_MASK) == SS2_stored_sp_BITS)
- op = Hexagon::SS2_stored_sp;
- else if ((inst & SS2_storeh_io_MASK) == SS2_storeh_io_BITS)
- op = Hexagon::SS2_storeh_io;
- else if ((inst & SS2_storew_sp_MASK) == SS2_storew_sp_BITS)
- op = Hexagon::SS2_storew_sp;
- else if ((inst & SS2_storewi0_MASK) == SS2_storewi0_BITS)
- op = Hexagon::SS2_storewi0;
- else if ((inst & SS2_storewi1_MASK) == SS2_storewi1_BITS)
- op = Hexagon::SS2_storewi1;
- else {
- os << "<unknown subinstruction>";
- return MCDisassembler::Fail;
- }
- break;
- default:
- os << "<unknown>";
- return MCDisassembler::Fail;
- }
- return MCDisassembler::Success;
-}
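
Every class in GetSubinstOpcode is recognized with the same idiom built
from the enum above: (inst & X_MASK) == X_BITS, where the mask selects the
fixed encoding bits and BITS gives their required values; whatever the mask
leaves out is operand payload. A minimal check for one entry:

    #include <cassert>
    #include <cstdint>

    // Values as listed in subInstBinaryValues for SA1_seti.
    static const uint16_t SA1_seti_BITS = 0x0800;
    static const uint16_t SA1_seti_MASK = 0x1c00;

    static bool isSA1Seti(uint16_t Inst) {
      // Only bits selected by the mask take part in the comparison.
      return (Inst & SA1_seti_MASK) == SA1_seti_BITS;
    }

    int main() {
      assert(isSA1Seti(0x0800));          // payload bits all zero
      assert(isSA1Seti(0x0800 | 0x03ff)); // payload bits are ignored
      assert(!isSA1Seti(0x0c00));         // wrong fixed bits
      return 0;
    }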
-
-static unsigned getRegFromSubinstEncoding(unsigned encoded_reg) {
- if (encoded_reg < 8)
- return Hexagon::R0 + encoded_reg;
- else if (encoded_reg < 16)
- return Hexagon::R0 + encoded_reg + 8;
-
- // patently false value
- return Hexagon::NoRegister;
-}
-
-static unsigned getDRegFromSubinstEncoding(unsigned encoded_dreg) {
- if (encoded_dreg < 4)
- return Hexagon::D0 + encoded_dreg;
- else if (encoded_dreg < 8)
- return Hexagon::D0 + encoded_dreg + 4;
-
- // patently false value
- return Hexagon::NoRegister;
-}
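
The two helpers above encode the duplex register file split: a 4-bit field
reaches R0-R7 directly and R16-R23 via the +8 adjustment, and a 3-bit
double-register field reaches D0-D3 and D8-D11 the same way. A standalone
sketch over plain register numbers (no LLVM types):

    #include <cassert>

    // Map a 4-bit duplex field to a GPR number, mirroring
    // getRegFromSubinstEncoding: 0-7 -> R0-R7, 8-15 -> R16-R23.
    static int subinstGPR(unsigned Enc) {
      if (Enc < 8)
        return Enc;     // R0..R7
      if (Enc < 16)
        return Enc + 8; // R16..R23
      return -1;        // invalid encoding
    }

    int main() {
      assert(subinstGPR(0) == 0);   // R0
      assert(subinstGPR(7) == 7);   // R7
      assert(subinstGPR(8) == 16);  // R16
      assert(subinstGPR(15) == 23); // R23
      assert(subinstGPR(16) == -1);
      return 0;
    }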
-
-void HexagonDisassembler::addSubinstOperands(MCInst *MI, unsigned opcode,
- unsigned inst) const {
- int64_t operand;
- MCOperand Op;
- switch (opcode) {
- case Hexagon::SL2_deallocframe:
- case Hexagon::SL2_jumpr31:
- case Hexagon::SL2_jumpr31_f:
- case Hexagon::SL2_jumpr31_fnew:
- case Hexagon::SL2_jumpr31_t:
- case Hexagon::SL2_jumpr31_tnew:
- case Hexagon::SL2_return:
- case Hexagon::SL2_return_f:
- case Hexagon::SL2_return_fnew:
- case Hexagon::SL2_return_t:
- case Hexagon::SL2_return_tnew:
- // no operands for these instructions
- break;
- case Hexagon::SS2_allocframe:
- // u 8-4{5_3}
- operand = ((inst & 0x1f0) >> 4) << 3;
- HexagonMCInstrInfo::addConstant(*MI, operand, getContext());
- break;
- case Hexagon::SL1_loadri_io:
- // Rd 3-0, Rs 7-4, u 11-8{4_2}
- operand = getRegFromSubinstEncoding(inst & 0xf);
- Op = MCOperand::createReg(operand);
- MI->addOperand(Op);
- operand = getRegFromSubinstEncoding((inst & 0xf0) >> 4);
- Op = MCOperand::createReg(operand);
- MI->addOperand(Op);
- operand = (inst & 0xf00) >> 6;
- HexagonMCInstrInfo::addConstant(*MI, operand, getContext());
- break;
- case Hexagon::SL1_loadrub_io:
- // Rd 3-0, Rs 7-4, u 11-8
- operand = getRegFromSubinstEncoding(inst & 0xf);
- Op = MCOperand::createReg(operand);
- MI->addOperand(Op);
- operand = getRegFromSubinstEncoding((inst & 0xf0) >> 4);
- Op = MCOperand::createReg(operand);
- MI->addOperand(Op);
- operand = (inst & 0xf00) >> 8;
- HexagonMCInstrInfo::addConstant(*MI, operand, getContext());
- break;
- case Hexagon::SL2_loadrb_io:
- // Rd 3-0, Rs 7-4, u 10-8
- operand = getRegFromSubinstEncoding(inst & 0xf);
- Op = MCOperand::createReg(operand);
- MI->addOperand(Op);
- operand = getRegFromSubinstEncoding((inst & 0xf0) >> 4);
- Op = MCOperand::createReg(operand);
- MI->addOperand(Op);
- operand = (inst & 0x700) >> 8;
- HexagonMCInstrInfo::addConstant(*MI, operand, getContext());
- break;
- case Hexagon::SL2_loadrh_io:
- case Hexagon::SL2_loadruh_io:
- // Rd 3-0, Rs 7-4, u 10-8{3_1}
- operand = getRegFromSubinstEncoding(inst & 0xf);
- Op = MCOperand::createReg(operand);
- MI->addOperand(Op);
- operand = getRegFromSubinstEncoding((inst & 0xf0) >> 4);
- Op = MCOperand::createReg(operand);
- MI->addOperand(Op);
- operand = ((inst & 0x700) >> 8) << 1;
- HexagonMCInstrInfo::addConstant(*MI, operand, getContext());
- break;
- case Hexagon::SL2_loadrd_sp:
- // Rdd 2-0, u 7-3{5_3}
- operand = getDRegFromSubinstEncoding(inst & 0x7);
- Op = MCOperand::createReg(operand);
- MI->addOperand(Op);
- operand = ((inst & 0x0f8) >> 3) << 3;
- HexagonMCInstrInfo::addConstant(*MI, operand, getContext());
- break;
- case Hexagon::SL2_loadri_sp:
- // Rd 3-0, u 8-4{5_2}
- operand = getRegFromSubinstEncoding(inst & 0xf);
- Op = MCOperand::createReg(operand);
- MI->addOperand(Op);
- operand = ((inst & 0x1f0) >> 4) << 2;
- HexagonMCInstrInfo::addConstant(*MI, operand, getContext());
- break;
- case Hexagon::SA1_addi:
- // Rx 3-0 (x2), s7 10-4
- operand = getRegFromSubinstEncoding(inst & 0xf);
- Op = MCOperand::createReg(operand);
- MI->addOperand(Op);
- MI->addOperand(Op);
- operand = SignExtend64<7>((inst & 0x7f0) >> 4);
- HexagonMCInstrInfo::addConstant(*MI, operand, getContext());
- break;
- case Hexagon::SA1_addrx:
- // Rx 3-0 (x2), Rs 7-4
- operand = getRegFromSubinstEncoding(inst & 0xf);
- Op = MCOperand::createReg(operand);
- MI->addOperand(Op);
- MI->addOperand(Op);
- operand = getRegFromSubinstEncoding((inst & 0xf0) >> 4);
- Op = MCOperand::createReg(operand);
- MI->addOperand(Op);
- break;
- case Hexagon::SA1_and1:
- case Hexagon::SA1_dec:
- case Hexagon::SA1_inc:
- case Hexagon::SA1_sxtb:
- case Hexagon::SA1_sxth:
- case Hexagon::SA1_tfr:
- case Hexagon::SA1_zxtb:
- case Hexagon::SA1_zxth:
- // Rd 3-0, Rs 7-4
- operand = getRegFromSubinstEncoding(inst & 0xf);
- Op = MCOperand::createReg(operand);
- MI->addOperand(Op);
- operand = getRegFromSubinstEncoding((inst & 0xf0) >> 4);
- Op = MCOperand::createReg(operand);
- MI->addOperand(Op);
- break;
- case Hexagon::SA1_addsp:
- // Rd 3-0, u 9-4{6_2}
- operand = getRegFromSubinstEncoding(inst & 0xf);
- Op = MCOperand::createReg(operand);
- MI->addOperand(Op);
- operand = ((inst & 0x3f0) >> 4) << 2;
- HexagonMCInstrInfo::addConstant(*MI, operand, getContext());
- break;
- case Hexagon::SA1_seti:
- // Rd 3-0, u 9-4
- operand = getRegFromSubinstEncoding(inst & 0xf);
- Op = MCOperand::createReg(operand);
- MI->addOperand(Op);
- operand = (inst & 0x3f0) >> 4;
- HexagonMCInstrInfo::addConstant(*MI, operand, getContext());
- break;
- case Hexagon::SA1_clrf:
- case Hexagon::SA1_clrfnew:
- case Hexagon::SA1_clrt:
- case Hexagon::SA1_clrtnew:
- case Hexagon::SA1_setin1:
- // Rd 3-0
- operand = getRegFromSubinstEncoding(inst & 0xf);
- Op = MCOperand::createReg(operand);
- MI->addOperand(Op);
- if (opcode == Hexagon::SA1_setin1)
- break;
- MI->addOperand(MCOperand::createReg(Hexagon::P0));
- break;
- case Hexagon::SA1_cmpeqi:
- // Rs 7-4, u 1-0
- operand = getRegFromSubinstEncoding((inst & 0xf0) >> 4);
- Op = MCOperand::createReg(operand);
- MI->addOperand(Op);
- operand = inst & 0x3;
- HexagonMCInstrInfo::addConstant(*MI, operand, getContext());
- break;
- case Hexagon::SA1_combine0i:
- case Hexagon::SA1_combine1i:
- case Hexagon::SA1_combine2i:
- case Hexagon::SA1_combine3i:
- // Rdd 2-0, u 6-5
- operand = getDRegFromSubinstEncoding(inst & 0x7);
- Op = MCOperand::createReg(operand);
- MI->addOperand(Op);
- operand = (inst & 0x060) >> 5;
- HexagonMCInstrInfo::addConstant(*MI, operand, getContext());
- break;
- case Hexagon::SA1_combinerz:
- case Hexagon::SA1_combinezr:
- // Rdd 2-0, Rs 7-4
- operand = getDRegFromSubinstEncoding(inst & 0x7);
- Op = MCOperand::createReg(operand);
- MI->addOperand(Op);
- operand = getRegFromSubinstEncoding((inst & 0xf0) >> 4);
- Op = MCOperand::createReg(operand);
- MI->addOperand(Op);
- break;
- case Hexagon::SS1_storeb_io:
- // Rs 7-4, u 11-8, Rt 3-0
- operand = getRegFromSubinstEncoding((inst & 0xf0) >> 4);
- Op = MCOperand::createReg(operand);
- MI->addOperand(Op);
- operand = (inst & 0xf00) >> 8;
- HexagonMCInstrInfo::addConstant(*MI, operand, getContext());
- operand = getRegFromSubinstEncoding(inst & 0xf);
- Op = MCOperand::createReg(operand);
- MI->addOperand(Op);
- break;
- case Hexagon::SS1_storew_io:
- // Rs 7-4, u 11-8{4_2}, Rt 3-0
- operand = getRegFromSubinstEncoding((inst & 0xf0) >> 4);
- Op = MCOperand::createReg(operand);
- MI->addOperand(Op);
- operand = ((inst & 0xf00) >> 8) << 2;
- HexagonMCInstrInfo::addConstant(*MI, operand, getContext());
- operand = getRegFromSubinstEncoding(inst & 0xf);
- Op = MCOperand::createReg(operand);
- MI->addOperand(Op);
- break;
- case Hexagon::SS2_storebi0:
- case Hexagon::SS2_storebi1:
- // Rs 7-4, u 3-0
- operand = getRegFromSubinstEncoding((inst & 0xf0) >> 4);
- Op = MCOperand::createReg(operand);
- MI->addOperand(Op);
- operand = inst & 0xf;
- HexagonMCInstrInfo::addConstant(*MI, operand, getContext());
- break;
- case Hexagon::SS2_storewi0:
- case Hexagon::SS2_storewi1:
- // Rs 7-4, u 3-0{4_2}
- operand = getRegFromSubinstEncoding((inst & 0xf0) >> 4);
- Op = MCOperand::createReg(operand);
- MI->addOperand(Op);
- operand = (inst & 0xf) << 2;
- HexagonMCInstrInfo::addConstant(*MI, operand, getContext());
- break;
- case Hexagon::SS2_stored_sp:
- // s 8-3{6_3}, Rtt 2-0
- operand = SignExtend64<9>(((inst & 0x1f8) >> 3) << 3);
- HexagonMCInstrInfo::addConstant(*MI, operand, getContext());
- operand = getDRegFromSubinstEncoding(inst & 0x7);
- Op = MCOperand::createReg(operand);
- MI->addOperand(Op);
- break;
- case Hexagon::SS2_storeh_io:
- // Rs 7-4, u 10-8{3_1}, Rt 3-0
- operand = getRegFromSubinstEncoding((inst & 0xf0) >> 4);
- Op = MCOperand::createReg(operand);
- MI->addOperand(Op);
- operand = ((inst & 0x700) >> 8) << 1;
- HexagonMCInstrInfo::addConstant(*MI, operand, getContext());
- operand = getRegFromSubinstEncoding(inst & 0xf);
- Op = MCOperand::createReg(operand);
- MI->addOperand(Op);
- break;
- case Hexagon::SS2_storew_sp:
- // u 8-4{5_2}, Rd 3-0
- operand = ((inst & 0x1f0) >> 4) << 2;
- HexagonMCInstrInfo::addConstant(*MI, operand, getContext());
- operand = getRegFromSubinstEncoding(inst & 0xf);
- Op = MCOperand::createReg(operand);
- MI->addOperand(Op);
- break;
- default:
- // don't crash with an invalid subinstruction
- // llvm_unreachable("Invalid subinstruction in duplex instruction");
- break;
- }
-}
diff --git a/lib/Target/Hexagon/Hexagon.td b/lib/Target/Hexagon/Hexagon.td
index 0b2b46387b6a..4767165141a3 100644
--- a/lib/Target/Hexagon/Hexagon.td
+++ b/lib/Target/Hexagon/Hexagon.td
@@ -22,14 +22,12 @@ include "llvm/Target/Target.td"
//===----------------------------------------------------------------------===//
// Hexagon Architectures
-def ArchV4: SubtargetFeature<"v4", "HexagonArchVersion", "V4", "Hexagon V4">;
-def ArchV5: SubtargetFeature<"v5", "HexagonArchVersion", "V5", "Hexagon V5">;
-def ArchV55: SubtargetFeature<"v55", "HexagonArchVersion", "V55", "Hexagon V55">;
-def ArchV60: SubtargetFeature<"v60", "HexagonArchVersion", "V60", "Hexagon V60">;
+include "HexagonDepArch.td"
-def FeatureHVX: SubtargetFeature<"hvx", "UseHVXOps", "true",
+// Hexagon ISA Extensions
+def ExtensionHVX: SubtargetFeature<"hvx", "UseHVXOps", "true",
"Hexagon HVX instructions">;
-def FeatureHVXDbl: SubtargetFeature<"hvx-double", "UseHVXDblOps", "true",
+def ExtensionHVXDbl: SubtargetFeature<"hvx-double", "UseHVXDblOps", "true",
"Hexagon HVX Double instructions">;
def FeatureLongCalls: SubtargetFeature<"long-calls", "UseLongCalls", "true",
"Use constant-extended calls">;
@@ -37,19 +35,14 @@ def FeatureLongCalls: SubtargetFeature<"long-calls", "UseLongCalls", "true",
//===----------------------------------------------------------------------===//
// Hexagon Instruction Predicate Definitions.
//===----------------------------------------------------------------------===//
-def HasV5T : Predicate<"HST->hasV5TOps()">;
-def NoV5T : Predicate<"!HST->hasV5TOps()">;
-def HasV55T : Predicate<"HST->hasV55TOps()">,
- AssemblerPredicate<"ArchV55">;
-def HasV60T : Predicate<"HST->hasV60TOps()">,
- AssemblerPredicate<"ArchV60">;
+
def UseMEMOP : Predicate<"HST->useMemOps()">;
def IEEERndNearV5T : Predicate<"HST->modeIEEERndNear()">;
def UseHVXDbl : Predicate<"HST->useHVXDblOps()">,
- AssemblerPredicate<"FeatureHVXDbl">;
+ AssemblerPredicate<"ExtensionHVXDbl">;
def UseHVXSgl : Predicate<"HST->useHVXSglOps()">;
def UseHVX : Predicate<"HST->useHVXSglOps() || HST->useHVXDblOps()">,
- AssemblerPredicate<"FeatureHVX">;
+ AssemblerPredicate<"ExtensionHVX">;
//===----------------------------------------------------------------------===//
// Classes used for relation maps.
@@ -81,7 +74,7 @@ class IntrinsicsRel;
def getPredOpcode : InstrMapping {
let FilterClass = "PredRel";
// Instructions with the same BaseOpcode and isNVStore values form a row.
- let RowFields = ["BaseOpcode", "isNVStore", "PNewValue", "isNT"];
+ let RowFields = ["BaseOpcode", "isNVStore", "PNewValue", "isBrTaken", "isNT"];
// Instructions with the same predicate sense form a column.
let ColFields = ["PredSense"];
// The key column is the unpredicated instructions.
@@ -132,7 +125,7 @@ def getPredNewOpcode : InstrMapping {
//
def getPredOldOpcode : InstrMapping {
let FilterClass = "PredNewRel";
- let RowFields = ["BaseOpcode", "PredSense", "isNVStore"];
+ let RowFields = ["BaseOpcode", "PredSense", "isNVStore", "isBrTaken"];
let ColFields = ["PNewValue"];
let KeyCol = ["new"];
let ValueCols = [[""]];
@@ -248,11 +241,18 @@ def getRealHWInstr : InstrMapping {
//===----------------------------------------------------------------------===//
include "HexagonSchedule.td"
include "HexagonRegisterInfo.td"
-include "HexagonCallingConv.td"
-include "HexagonInstrInfo.td"
+include "HexagonOperands.td"
+include "HexagonDepOperands.td"
+include "HexagonDepITypes.td"
+include "HexagonInstrFormats.td"
+include "HexagonDepInstrFormats.td"
+include "HexagonDepInstrInfo.td"
+include "HexagonPseudo.td"
include "HexagonPatterns.td"
+include "HexagonDepMappings.td"
include "HexagonIntrinsics.td"
include "HexagonIntrinsicsDerived.td"
+include "HexagonMapAsm2IntrinV62.gen.td"
def HexagonInstrInfo : InstrInfo;
@@ -271,7 +271,9 @@ def : Proc<"hexagonv5", HexagonModelV4,
def : Proc<"hexagonv55", HexagonModelV55,
[ArchV4, ArchV5, ArchV55]>;
def : Proc<"hexagonv60", HexagonModelV60,
- [ArchV4, ArchV5, ArchV55, ArchV60, FeatureHVX]>;
+ [ArchV4, ArchV5, ArchV55, ArchV60, ExtensionHVX]>;
+def : Proc<"hexagonv62", HexagonModelV62,
+ [ArchV4, ArchV5, ArchV55, ArchV60, ArchV62, ExtensionHVX]>;
//===----------------------------------------------------------------------===//
// Declare the target which we are implementing
diff --git a/lib/Target/Hexagon/HexagonAsmPrinter.cpp b/lib/Target/Hexagon/HexagonAsmPrinter.cpp
index 54db5ad4374b..fda23f8f6b05 100644
--- a/lib/Target/Hexagon/HexagonAsmPrinter.cpp
+++ b/lib/Target/Hexagon/HexagonAsmPrinter.cpp
@@ -261,10 +261,34 @@ static MCSymbol *smallData(AsmPrinter &AP, const MachineInstr &MI,
return Sym;
}
+static MCInst ScaleVectorOffset(MCInst &Inst, unsigned OpNo,
+ unsigned VectorSize, MCContext &Ctx) {
+ MCInst T;
+ T.setOpcode(Inst.getOpcode());
+ for (unsigned i = 0, n = Inst.getNumOperands(); i != n; ++i) {
+ if (i != OpNo) {
+ T.addOperand(Inst.getOperand(i));
+ continue;
+ }
+ MCOperand &ImmOp = Inst.getOperand(i);
+ const auto *HE = static_cast<const HexagonMCExpr*>(ImmOp.getExpr());
+ int32_t V = cast<MCConstantExpr>(HE->getExpr())->getValue();
+ auto *NewCE = MCConstantExpr::create(V / int32_t(VectorSize), Ctx);
+ auto *NewHE = HexagonMCExpr::create(NewCE, Ctx);
+ T.addOperand(MCOperand::createExpr(NewHE));
+ }
+ return T;
+}
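
ScaleVectorOffset rebuilds the instruction with one immediate divided by
the HVX vector size, so the printed offset is in vector units rather than
bytes. The arithmetic it applies, in isolation (a sketch; the vmem offset
convention is assumed from the Hexagon assembly syntax):

    #include <cassert>
    #include <cstdint>

    // vmem(r0 + #n) addresses r0 + n * VectorSize bytes, so the byte
    // offset carried in the MCInst must be divided before printing.
    static int32_t bytesToVectorUnits(int32_t ByteOff, unsigned VectorSize) {
      return ByteOff / int32_t(VectorSize);
    }

    int main() {
      assert(bytesToVectorUnits(256, 64) == 4);  // 64-byte single mode
      assert(bytesToVectorUnits(256, 128) == 2); // 128-byte double mode
      assert(bytesToVectorUnits(-64, 64) == -1); // negative offsets scale too
      return 0;
    }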
+
void HexagonAsmPrinter::HexagonProcessInstruction(MCInst &Inst,
const MachineInstr &MI) {
MCInst &MappedInst = static_cast <MCInst &>(Inst);
const MCRegisterInfo *RI = OutStreamer->getContext().getRegisterInfo();
+ const MachineFunction &MF = *MI.getParent()->getParent();
+ const auto &HST = MF.getSubtarget<HexagonSubtarget>();
+ unsigned VectorSize = HST.useHVXSglOps()
+ ? Hexagon::VectorRegsRegClass.getSize()
+ : Hexagon::VectorRegs128BRegClass.getSize();
switch (Inst.getOpcode()) {
default: return;
@@ -282,6 +306,36 @@ void HexagonAsmPrinter::HexagonProcessInstruction(MCInst &Inst,
break;
}
+ case Hexagon::A2_tfrf: {
+ Inst.setOpcode(Hexagon::A2_paddif);
+ Inst.addOperand(MCOperand::createExpr(MCConstantExpr::create(0, OutContext)));
+ break;
+ }
+
+ case Hexagon::A2_tfrt: {
+ Inst.setOpcode(Hexagon::A2_paddit);
+ Inst.addOperand(MCOperand::createExpr(MCConstantExpr::create(0, OutContext)));
+ break;
+ }
+
+ case Hexagon::A2_tfrfnew: {
+ Inst.setOpcode(Hexagon::A2_paddifnew);
+ Inst.addOperand(MCOperand::createExpr(MCConstantExpr::create(0, OutContext)));
+ break;
+ }
+
+ case Hexagon::A2_tfrtnew: {
+ Inst.setOpcode(Hexagon::A2_padditnew);
+ Inst.addOperand(MCOperand::createExpr(MCConstantExpr::create(0, OutContext)));
+ break;
+ }
+
+ case Hexagon::A2_zxtb: {
+ Inst.setOpcode(Hexagon::A2_andir);
+ Inst.addOperand(MCOperand::createExpr(MCConstantExpr::create(255, OutContext)));
+ break;
+ }
+
// "$dst = CONST64(#$src1)",
case Hexagon::CONST64:
if (!OutStreamer->hasRawTextSupport()) {
@@ -376,6 +430,9 @@ void HexagonAsmPrinter::HexagonProcessInstruction(MCInst &Inst,
Rs.setReg(getHexagonRegisterPair(Rs.getReg(), RI));
return;
}
+ case Hexagon::PS_call_nr:
+ Inst.setOpcode(Hexagon::J2_call);
+ break;
case Hexagon::S5_asrhub_rnd_sat_goodsyntax: {
MCOperand &MO = MappedInst.getOperand(2);
int64_t Imm;
@@ -564,6 +621,181 @@ void HexagonAsmPrinter::HexagonProcessInstruction(MCInst &Inst,
return;
}
+ case Hexagon::V6_vL32Ub_pi:
+ case Hexagon::V6_vL32b_cur_pi:
+ case Hexagon::V6_vL32b_nt_cur_pi:
+ case Hexagon::V6_vL32b_pi:
+ case Hexagon::V6_vL32b_nt_pi:
+ case Hexagon::V6_vL32b_nt_tmp_pi:
+ case Hexagon::V6_vL32b_tmp_pi:
+ case Hexagon::V6_vL32Ub_pi_128B:
+ case Hexagon::V6_vL32b_cur_pi_128B:
+ case Hexagon::V6_vL32b_nt_cur_pi_128B:
+ case Hexagon::V6_vL32b_pi_128B:
+ case Hexagon::V6_vL32b_nt_pi_128B:
+ case Hexagon::V6_vL32b_nt_tmp_pi_128B:
+ case Hexagon::V6_vL32b_tmp_pi_128B:
+ MappedInst = ScaleVectorOffset(Inst, 3, VectorSize, OutContext);
+ return;
+
+ case Hexagon::V6_vL32Ub_ai:
+ case Hexagon::V6_vL32b_ai:
+ case Hexagon::V6_vL32b_cur_ai:
+ case Hexagon::V6_vL32b_nt_ai:
+ case Hexagon::V6_vL32b_nt_cur_ai:
+ case Hexagon::V6_vL32b_nt_tmp_ai:
+ case Hexagon::V6_vL32b_tmp_ai:
+ case Hexagon::V6_vL32Ub_ai_128B:
+ case Hexagon::V6_vL32b_ai_128B:
+ case Hexagon::V6_vL32b_cur_ai_128B:
+ case Hexagon::V6_vL32b_nt_ai_128B:
+ case Hexagon::V6_vL32b_nt_cur_ai_128B:
+ case Hexagon::V6_vL32b_nt_tmp_ai_128B:
+ case Hexagon::V6_vL32b_tmp_ai_128B:
+ MappedInst = ScaleVectorOffset(Inst, 2, VectorSize, OutContext);
+ return;
+
+ case Hexagon::V6_vS32Ub_pi:
+ case Hexagon::V6_vS32b_new_pi:
+ case Hexagon::V6_vS32b_nt_new_pi:
+ case Hexagon::V6_vS32b_nt_pi:
+ case Hexagon::V6_vS32b_pi:
+ case Hexagon::V6_vS32Ub_pi_128B:
+ case Hexagon::V6_vS32b_new_pi_128B:
+ case Hexagon::V6_vS32b_nt_new_pi_128B:
+ case Hexagon::V6_vS32b_nt_pi_128B:
+ case Hexagon::V6_vS32b_pi_128B:
+ MappedInst = ScaleVectorOffset(Inst, 2, VectorSize, OutContext);
+ return;
+
+ case Hexagon::V6_vS32Ub_ai:
+ case Hexagon::V6_vS32b_ai:
+ case Hexagon::V6_vS32b_new_ai:
+ case Hexagon::V6_vS32b_nt_ai:
+ case Hexagon::V6_vS32b_nt_new_ai:
+ case Hexagon::V6_vS32Ub_ai_128B:
+ case Hexagon::V6_vS32b_ai_128B:
+ case Hexagon::V6_vS32b_new_ai_128B:
+ case Hexagon::V6_vS32b_nt_ai_128B:
+ case Hexagon::V6_vS32b_nt_new_ai_128B:
+ MappedInst = ScaleVectorOffset(Inst, 1, VectorSize, OutContext);
+ return;
+
+ case Hexagon::V6_vL32b_cur_npred_pi:
+ case Hexagon::V6_vL32b_cur_pred_pi:
+ case Hexagon::V6_vL32b_npred_pi:
+ case Hexagon::V6_vL32b_nt_cur_npred_pi:
+ case Hexagon::V6_vL32b_nt_cur_pred_pi:
+ case Hexagon::V6_vL32b_nt_npred_pi:
+ case Hexagon::V6_vL32b_nt_pred_pi:
+ case Hexagon::V6_vL32b_nt_tmp_npred_pi:
+ case Hexagon::V6_vL32b_nt_tmp_pred_pi:
+ case Hexagon::V6_vL32b_pred_pi:
+ case Hexagon::V6_vL32b_tmp_npred_pi:
+ case Hexagon::V6_vL32b_tmp_pred_pi:
+ case Hexagon::V6_vL32b_cur_npred_pi_128B:
+ case Hexagon::V6_vL32b_cur_pred_pi_128B:
+ case Hexagon::V6_vL32b_npred_pi_128B:
+ case Hexagon::V6_vL32b_nt_cur_npred_pi_128B:
+ case Hexagon::V6_vL32b_nt_cur_pred_pi_128B:
+ case Hexagon::V6_vL32b_nt_npred_pi_128B:
+ case Hexagon::V6_vL32b_nt_pred_pi_128B:
+ case Hexagon::V6_vL32b_nt_tmp_npred_pi_128B:
+ case Hexagon::V6_vL32b_nt_tmp_pred_pi_128B:
+ case Hexagon::V6_vL32b_pred_pi_128B:
+ case Hexagon::V6_vL32b_tmp_npred_pi_128B:
+ case Hexagon::V6_vL32b_tmp_pred_pi_128B:
+ MappedInst = ScaleVectorOffset(Inst, 4, VectorSize, OutContext);
+ return;
+
+ case Hexagon::V6_vL32b_cur_npred_ai:
+ case Hexagon::V6_vL32b_cur_pred_ai:
+ case Hexagon::V6_vL32b_npred_ai:
+ case Hexagon::V6_vL32b_nt_cur_npred_ai:
+ case Hexagon::V6_vL32b_nt_cur_pred_ai:
+ case Hexagon::V6_vL32b_nt_npred_ai:
+ case Hexagon::V6_vL32b_nt_pred_ai:
+ case Hexagon::V6_vL32b_nt_tmp_npred_ai:
+ case Hexagon::V6_vL32b_nt_tmp_pred_ai:
+ case Hexagon::V6_vL32b_pred_ai:
+ case Hexagon::V6_vL32b_tmp_npred_ai:
+ case Hexagon::V6_vL32b_tmp_pred_ai:
+ case Hexagon::V6_vL32b_cur_npred_ai_128B:
+ case Hexagon::V6_vL32b_cur_pred_ai_128B:
+ case Hexagon::V6_vL32b_npred_ai_128B:
+ case Hexagon::V6_vL32b_nt_cur_npred_ai_128B:
+ case Hexagon::V6_vL32b_nt_cur_pred_ai_128B:
+ case Hexagon::V6_vL32b_nt_npred_ai_128B:
+ case Hexagon::V6_vL32b_nt_pred_ai_128B:
+ case Hexagon::V6_vL32b_nt_tmp_npred_ai_128B:
+ case Hexagon::V6_vL32b_nt_tmp_pred_ai_128B:
+ case Hexagon::V6_vL32b_pred_ai_128B:
+ case Hexagon::V6_vL32b_tmp_npred_ai_128B:
+ case Hexagon::V6_vL32b_tmp_pred_ai_128B:
+ MappedInst = ScaleVectorOffset(Inst, 3, VectorSize, OutContext);
+ return;
+
+ case Hexagon::V6_vS32Ub_npred_pi:
+ case Hexagon::V6_vS32Ub_pred_pi:
+ case Hexagon::V6_vS32b_new_npred_pi:
+ case Hexagon::V6_vS32b_new_pred_pi:
+ case Hexagon::V6_vS32b_npred_pi:
+ case Hexagon::V6_vS32b_nqpred_pi:
+ case Hexagon::V6_vS32b_nt_new_npred_pi:
+ case Hexagon::V6_vS32b_nt_new_pred_pi:
+ case Hexagon::V6_vS32b_nt_npred_pi:
+ case Hexagon::V6_vS32b_nt_nqpred_pi:
+ case Hexagon::V6_vS32b_nt_pred_pi:
+ case Hexagon::V6_vS32b_nt_qpred_pi:
+ case Hexagon::V6_vS32b_pred_pi:
+ case Hexagon::V6_vS32b_qpred_pi:
+ case Hexagon::V6_vS32Ub_npred_pi_128B:
+ case Hexagon::V6_vS32Ub_pred_pi_128B:
+ case Hexagon::V6_vS32b_new_npred_pi_128B:
+ case Hexagon::V6_vS32b_new_pred_pi_128B:
+ case Hexagon::V6_vS32b_npred_pi_128B:
+ case Hexagon::V6_vS32b_nqpred_pi_128B:
+ case Hexagon::V6_vS32b_nt_new_npred_pi_128B:
+ case Hexagon::V6_vS32b_nt_new_pred_pi_128B:
+ case Hexagon::V6_vS32b_nt_npred_pi_128B:
+ case Hexagon::V6_vS32b_nt_nqpred_pi_128B:
+ case Hexagon::V6_vS32b_nt_pred_pi_128B:
+ case Hexagon::V6_vS32b_nt_qpred_pi_128B:
+ case Hexagon::V6_vS32b_pred_pi_128B:
+ case Hexagon::V6_vS32b_qpred_pi_128B:
+ MappedInst = ScaleVectorOffset(Inst, 3, VectorSize, OutContext);
+ return;
+
+ case Hexagon::V6_vS32Ub_npred_ai:
+ case Hexagon::V6_vS32Ub_pred_ai:
+ case Hexagon::V6_vS32b_new_npred_ai:
+ case Hexagon::V6_vS32b_new_pred_ai:
+ case Hexagon::V6_vS32b_npred_ai:
+ case Hexagon::V6_vS32b_nqpred_ai:
+ case Hexagon::V6_vS32b_nt_new_npred_ai:
+ case Hexagon::V6_vS32b_nt_new_pred_ai:
+ case Hexagon::V6_vS32b_nt_npred_ai:
+ case Hexagon::V6_vS32b_nt_nqpred_ai:
+ case Hexagon::V6_vS32b_nt_pred_ai:
+ case Hexagon::V6_vS32b_nt_qpred_ai:
+ case Hexagon::V6_vS32b_pred_ai:
+ case Hexagon::V6_vS32b_qpred_ai:
+ case Hexagon::V6_vS32Ub_npred_ai_128B:
+ case Hexagon::V6_vS32Ub_pred_ai_128B:
+ case Hexagon::V6_vS32b_new_npred_ai_128B:
+ case Hexagon::V6_vS32b_new_pred_ai_128B:
+ case Hexagon::V6_vS32b_npred_ai_128B:
+ case Hexagon::V6_vS32b_nqpred_ai_128B:
+ case Hexagon::V6_vS32b_nt_new_npred_ai_128B:
+ case Hexagon::V6_vS32b_nt_new_pred_ai_128B:
+ case Hexagon::V6_vS32b_nt_npred_ai_128B:
+ case Hexagon::V6_vS32b_nt_nqpred_ai_128B:
+ case Hexagon::V6_vS32b_nt_pred_ai_128B:
+ case Hexagon::V6_vS32b_nt_qpred_ai_128B:
+ case Hexagon::V6_vS32b_pred_ai_128B:
+ case Hexagon::V6_vS32b_qpred_ai_128B:
+ MappedInst = ScaleVectorOffset(Inst, 2, VectorSize, OutContext);
+ return;
}
}
@@ -578,13 +810,9 @@ void HexagonAsmPrinter::EmitInstruction(const MachineInstr *MI) {
if (MI->isBundle()) {
const MachineBasicBlock* MBB = MI->getParent();
MachineBasicBlock::const_instr_iterator MII = MI->getIterator();
- unsigned IgnoreCount = 0;
for (++MII; MII != MBB->instr_end() && MII->isInsideBundle(); ++MII)
- if (MII->getOpcode() == TargetOpcode::DBG_VALUE ||
- MII->getOpcode() == TargetOpcode::IMPLICIT_DEF)
- ++IgnoreCount;
- else
+ if (!MII->isDebugValue() && !MII->isImplicitDef())
HexagonLowerToMC(MCII, &*MII, MCB, *this);
}
else
diff --git a/lib/Target/Hexagon/HexagonBitSimplify.cpp b/lib/Target/Hexagon/HexagonBitSimplify.cpp
index fe7278fde1b1..61f290ca98d7 100644
--- a/lib/Target/Hexagon/HexagonBitSimplify.cpp
+++ b/lib/Target/Hexagon/HexagonBitSimplify.cpp
@@ -46,6 +46,17 @@ using namespace llvm;
static cl::opt<bool> PreserveTiedOps("hexbit-keep-tied", cl::Hidden,
cl::init(true), cl::desc("Preserve subregisters in tied operands"));
+static cl::opt<bool> GenExtract("hexbit-extract", cl::Hidden,
+ cl::init(true), cl::desc("Generate extract instructions"));
+static cl::opt<bool> GenBitSplit("hexbit-bitsplit", cl::Hidden,
+ cl::init(true), cl::desc("Generate bitsplit instructions"));
+
+static cl::opt<unsigned> MaxExtract("hexbit-max-extract", cl::Hidden,
+ cl::init(UINT_MAX));
+static unsigned CountExtract = 0;
+static cl::opt<unsigned> MaxBitSplit("hexbit-max-bitsplit", cl::Hidden,
+ cl::init(UINT_MAX));
+static unsigned CountBitSplit = 0;
namespace llvm {
@@ -249,8 +260,6 @@ INITIALIZE_PASS_END(HexagonBitSimplify, "hexbit",
bool HexagonBitSimplify::visitBlock(MachineBasicBlock &B, Transformation &T,
RegisterSet &AVs) {
- MachineDomTreeNode *N = MDT->getNode(&B);
- typedef GraphTraits<MachineDomTreeNode*> GTN;
bool Changed = false;
if (T.TopDown)
@@ -262,10 +271,9 @@ bool HexagonBitSimplify::visitBlock(MachineBasicBlock &B, Transformation &T,
RegisterSet NewAVs = AVs;
NewAVs.insert(Defs);
- for (auto I = GTN::child_begin(N), E = GTN::child_end(N); I != E; ++I) {
- MachineBasicBlock *SB = (*I)->getBlock();
- Changed |= visitBlock(*SB, T, NewAVs);
- }
+ for (auto *DTN : children<MachineDomTreeNode*>(MDT->getNode(&B)))
+ Changed |= visitBlock(*(DTN->getBlock()), T, NewAVs);
+
if (!T.TopDown)
Changed |= T.processBlock(B, AVs);
@@ -896,6 +904,7 @@ const TargetRegisterClass *HexagonBitSimplify::getFinalVRegClass(
*MRI.getTargetRegisterInfo());
auto VerifySR = [&HRI] (const TargetRegisterClass *RC, unsigned Sub) -> void {
+ (void)HRI;
assert(Sub == HRI.getHexagonSubRegIndex(RC, Hexagon::ps_sub_lo) ||
Sub == HRI.getHexagonSubRegIndex(RC, Hexagon::ps_sub_hi));
};
@@ -983,9 +992,9 @@ bool DeadCodeElimination::isDead(unsigned R) const {
bool DeadCodeElimination::runOnNode(MachineDomTreeNode *N) {
bool Changed = false;
- typedef GraphTraits<MachineDomTreeNode*> GTN;
- for (auto I = GTN::child_begin(N), E = GTN::child_end(N); I != E; ++I)
- Changed |= runOnNode(*I);
+
+ for (auto *DTN : children<MachineDomTreeNode*>(N))
+ Changed |= runOnNode(DTN);
MachineBasicBlock *B = N->getBlock();
std::vector<MachineInstr*> Instrs;
@@ -1735,10 +1744,11 @@ namespace {
// This is by no means complete
class BitSimplification : public Transformation {
public:
- BitSimplification(BitTracker &bt, const HexagonInstrInfo &hii,
- const HexagonRegisterInfo &hri, MachineRegisterInfo &mri,
- MachineFunction &mf)
- : Transformation(true), HII(hii), HRI(hri), MRI(mri), MF(mf), BT(bt) {}
+ BitSimplification(BitTracker &bt, const MachineDominatorTree &mdt,
+ const HexagonInstrInfo &hii, const HexagonRegisterInfo &hri,
+ MachineRegisterInfo &mri, MachineFunction &mf)
+ : Transformation(true), MDT(mdt), HII(hii), HRI(hri), MRI(mri),
+ MF(mf), BT(bt) {}
bool processBlock(MachineBasicBlock &B, const RegisterSet &AVs) override;
@@ -1765,9 +1775,18 @@ namespace {
const BitTracker::RegisterCell &RC);
bool genExtractLow(MachineInstr *MI, BitTracker::RegisterRef RD,
const BitTracker::RegisterCell &RC);
+ bool genBitSplit(MachineInstr *MI, BitTracker::RegisterRef RD,
+ const BitTracker::RegisterCell &RC, const RegisterSet &AVs);
bool simplifyTstbit(MachineInstr *MI, BitTracker::RegisterRef RD,
const BitTracker::RegisterCell &RC);
+ bool simplifyExtractLow(MachineInstr *MI, BitTracker::RegisterRef RD,
+ const BitTracker::RegisterCell &RC, const RegisterSet &AVs);
+
+ // Cache of created instructions to avoid creating duplicates.
+ // XXX Currently only used by genBitSplit.
+ std::vector<MachineInstr*> NewMIs;
+ const MachineDominatorTree &MDT;
const HexagonInstrInfo &HII;
const HexagonRegisterInfo &HRI;
MachineRegisterInfo &MRI;
@@ -2149,6 +2168,146 @@ bool BitSimplification::genExtractLow(MachineInstr *MI,
return false;
}
+bool BitSimplification::genBitSplit(MachineInstr *MI,
+ BitTracker::RegisterRef RD, const BitTracker::RegisterCell &RC,
+ const RegisterSet &AVs) {
+ if (!GenBitSplit)
+ return false;
+ if (CountBitSplit >= MaxBitSplit)
+ return false;
+
+ unsigned Opc = MI->getOpcode();
+ switch (Opc) {
+ case Hexagon::A4_bitsplit:
+ case Hexagon::A4_bitspliti:
+ return false;
+ }
+
+ unsigned W = RC.width();
+ if (W != 32)
+ return false;
+
+ auto ctlz = [] (const BitTracker::RegisterCell &C) -> unsigned {
+ unsigned Z = C.width();
+ while (Z > 0 && C[Z-1].is(0))
+ --Z;
+ return C.width() - Z;
+ };
+
+ // Count the number of leading zeros in the target RC.
+ unsigned Z = ctlz(RC);
+ if (Z == 0 || Z == W)
+ return false;
+
+ // A simplistic analysis: assume the source register (the one being split)
+ // is fully unknown, and that all its bits are self-references.
+ const BitTracker::BitValue &B0 = RC[0];
+ if (B0.Type != BitTracker::BitValue::Ref)
+ return false;
+
+ unsigned SrcR = B0.RefI.Reg;
+ unsigned SrcSR = 0;
+ unsigned Pos = B0.RefI.Pos;
+
+ // All the non-zero bits should be consecutive bits from the same register.
+ for (unsigned i = 1; i < W-Z; ++i) {
+ const BitTracker::BitValue &V = RC[i];
+ if (V.Type != BitTracker::BitValue::Ref)
+ return false;
+ if (V.RefI.Reg != SrcR || V.RefI.Pos != Pos+i)
+ return false;
+ }
+
+ // Now, find the other bitfield among AVs.
+ for (unsigned S = AVs.find_first(); S; S = AVs.find_next(S)) {
+ // The number of leading zeros here should be the number of trailing
+ // non-zeros in RC.
+ if (!BT.has(S))
+ continue;
+ const BitTracker::RegisterCell &SC = BT.lookup(S);
+ if (SC.width() != W || ctlz(SC) != W-Z)
+ continue;
+ // The Z lower bits should now match SrcR.
+ const BitTracker::BitValue &S0 = SC[0];
+ if (S0.Type != BitTracker::BitValue::Ref || S0.RefI.Reg != SrcR)
+ continue;
+ unsigned P = S0.RefI.Pos;
+
+ if (Pos <= P && (Pos + W-Z) != P)
+ continue;
+ if (P < Pos && (P + Z) != Pos)
+ continue;
+ // The starting bitfield position must be at a subregister boundary.
+ if (std::min(P, Pos) != 0 && std::min(P, Pos) != 32)
+ continue;
+
+ unsigned I;
+ for (I = 1; I < Z; ++I) {
+ const BitTracker::BitValue &V = SC[I];
+ if (V.Type != BitTracker::BitValue::Ref)
+ break;
+ if (V.RefI.Reg != SrcR || V.RefI.Pos != P+I)
+ break;
+ }
+ if (I != Z)
+ continue;
+
+ // Generate bitsplit where S is defined.
+ CountBitSplit++;
+ MachineInstr *DefS = MRI.getVRegDef(S);
+ assert(DefS != nullptr);
+ DebugLoc DL = DefS->getDebugLoc();
+ MachineBasicBlock &B = *DefS->getParent();
+ auto At = DefS->isPHI() ? B.getFirstNonPHI()
+ : MachineBasicBlock::iterator(DefS);
+ if (MRI.getRegClass(SrcR)->getID() == Hexagon::DoubleRegsRegClassID)
+ SrcSR = (std::min(Pos, P) == 32) ? Hexagon::isub_hi : Hexagon::isub_lo;
+ if (!validateReg({SrcR,SrcSR}, Hexagon::A4_bitspliti, 1))
+ continue;
+ unsigned ImmOp = Pos <= P ? W-Z : Z;
+
+ // Find an existing bitsplit instruction if one already exists.
+ unsigned NewR = 0;
+ for (MachineInstr *In : NewMIs) {
+ if (In->getOpcode() != Hexagon::A4_bitspliti)
+ continue;
+ MachineOperand &Op1 = In->getOperand(1);
+ if (Op1.getReg() != SrcR || Op1.getSubReg() != SrcSR)
+ continue;
+ if (In->getOperand(2).getImm() != ImmOp)
+ continue;
+ // Check if the target register is available here.
+ MachineOperand &Op0 = In->getOperand(0);
+ MachineInstr *DefI = MRI.getVRegDef(Op0.getReg());
+ assert(DefI != nullptr);
+ if (!MDT.dominates(DefI, &*At))
+ continue;
+
+ // Found one that can be reused.
+ assert(Op0.getSubReg() == 0);
+ NewR = Op0.getReg();
+ break;
+ }
+ if (!NewR) {
+ NewR = MRI.createVirtualRegister(&Hexagon::DoubleRegsRegClass);
+ auto NewBS = BuildMI(B, At, DL, HII.get(Hexagon::A4_bitspliti), NewR)
+ .addReg(SrcR, 0, SrcSR)
+ .addImm(ImmOp);
+ NewMIs.push_back(NewBS);
+ }
+ if (Pos <= P) {
+ HBS::replaceRegWithSub(RD.Reg, NewR, Hexagon::isub_lo, MRI);
+ HBS::replaceRegWithSub(S, NewR, Hexagon::isub_hi, MRI);
+ } else {
+ HBS::replaceRegWithSub(S, NewR, Hexagon::isub_lo, MRI);
+ HBS::replaceRegWithSub(RD.Reg, NewR, Hexagon::isub_hi, MRI);
+ }
+ return true;
+ }
+
+ return false;
+}
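
The transformation above rests on the bitsplit semantics: rdd =
bitsplit(rs, #n) places the n low bits of rs in the low word of the pair
and the remaining bits in the high word, so both matched bitfields become
subregister reads of one result. A scalar model of that behavior (inferred
from the replacement logic here, not quoted from a reference manual):

    #include <cassert>
    #include <cstdint>

    // Model of rdd = bitsplit(rs, #n) as genBitSplit uses it:
    // low word = the n low bits, high word = the remaining bits.
    static uint64_t bitsplit(uint32_t Rs, unsigned N) {
      uint32_t Lo = Rs & ((1u << N) - 1);
      uint32_t Hi = Rs >> N;
      return (uint64_t(Hi) << 32) | Lo;
    }

    int main() {
      uint64_t P = bitsplit(0x12345678, 8);
      assert(uint32_t(P) == 0x78);           // isub_lo
      assert(uint32_t(P >> 32) == 0x123456); // isub_hi
      return 0;
    }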
+
// Check for tstbit simplification opportunity, where the bit being checked
// can be tracked back to another register. For example:
// vreg2 = S2_lsr_i_r vreg1, 5
@@ -2210,6 +2369,201 @@ bool BitSimplification::simplifyTstbit(MachineInstr *MI,
return false;
}
+// Detect whether RD is a bitfield extract (sign- or zero-extended) of
+// some register from the AVs set. Create a new corresponding instruction
+// at the location of MI. The intent is to recognize situations where
+// a sequence of instructions performs an operation that is equivalent to
+// an extract operation, such as a shift left followed by a shift right.
+bool BitSimplification::simplifyExtractLow(MachineInstr *MI,
+ BitTracker::RegisterRef RD, const BitTracker::RegisterCell &RC,
+ const RegisterSet &AVs) {
+ if (!GenExtract)
+ return false;
+ if (CountExtract >= MaxExtract)
+ return false;
+ CountExtract++;
+
+ unsigned W = RC.width();
+ unsigned RW = W;
+ unsigned Len;
+ bool Signed;
+
+  // The code is mostly class-independent, except for the part that
+  // generates the extract instruction and establishes the source register
+  // (in case it needs to use a subregister).
+ const TargetRegisterClass *FRC = HBS::getFinalVRegClass(RD, MRI);
+ if (FRC != &Hexagon::IntRegsRegClass && FRC != &Hexagon::DoubleRegsRegClass)
+ return false;
+ assert(RD.Sub == 0);
+
+ // Observation:
+ // If the cell has a form of 00..0xx..x with k zeros and n remaining
+ // bits, this could be an extractu of the n bits, but it could also be
+ // an extractu of a longer field which happens to have 0s in the top
+ // bit positions.
+ // The same logic applies to sign-extended fields.
+ //
+ // Do not check for the extended extracts, since it would expand the
+ // search space quite a bit. The search may be expensive as it is.
+
+ const BitTracker::BitValue &TopV = RC[W-1];
+
+ // Eliminate candidates that have self-referential bits, since they
+ // cannot be extracts from other registers. Also, skip registers that
+ // have compile-time constant values.
+ bool IsConst = true;
+ for (unsigned I = 0; I != W; ++I) {
+ const BitTracker::BitValue &V = RC[I];
+ if (V.Type == BitTracker::BitValue::Ref && V.RefI.Reg == RD.Reg)
+ return false;
+ IsConst = IsConst && (V.is(0) || V.is(1));
+ }
+ if (IsConst)
+ return false;
+
+ if (TopV.is(0) || TopV.is(1)) {
+ bool S = TopV.is(1);
+ for (--W; W > 0 && RC[W-1].is(S); --W)
+ ;
+ Len = W;
+ Signed = S;
+ // The sign bit must be a part of the field being extended.
+ if (Signed)
+ ++Len;
+ } else {
+ // This could still be a sign-extended extract.
+ assert(TopV.Type == BitTracker::BitValue::Ref);
+ if (TopV.RefI.Reg == RD.Reg || TopV.RefI.Pos == W-1)
+ return false;
+ for (--W; W > 0 && RC[W-1] == TopV; --W)
+ ;
+ // The top bits of RC are copies of TopV. One occurrence of TopV will
+ // be a part of the field.
+ Len = W + 1;
+ Signed = true;
+ }
+
+ // This would be just a copy. It should be handled elsewhere.
+ if (Len == RW)
+ return false;
+
+ DEBUG({
+ dbgs() << __func__ << " on reg: " << PrintReg(RD.Reg, &HRI, RD.Sub)
+ << ", MI: " << *MI;
+ dbgs() << "Cell: " << RC << '\n';
+ dbgs() << "Expected bitfield size: " << Len << " bits, "
+ << (Signed ? "sign" : "zero") << "-extended\n";
+ });
+
+ bool Changed = false;
+
+ for (unsigned R = AVs.find_first(); R != 0; R = AVs.find_next(R)) {
+ if (!BT.has(R))
+ continue;
+ const BitTracker::RegisterCell &SC = BT.lookup(R);
+ unsigned SW = SC.width();
+
+ // The source can be longer than the destination, as long as its size is
+ // a multiple of the size of the destination. Also, we would need to be
+ // able to refer to the subregister in the source that would be of the
+ // same size as the destination, but only check the sizes here.
+ if (SW < RW || (SW % RW) != 0)
+ continue;
+
+ // The field can start at any offset in SC as long as it contains Len
+ // bits and does not cross subregister boundary (if the source register
+ // is longer than the destination).
+ unsigned Off = 0;
+ while (Off <= SW-Len) {
+ unsigned OE = (Off+Len)/RW;
+ if (OE != Off/RW) {
+        // The assumption here is that if the source (R) is longer than the
+        // destination, then the source is a sequence of words of size RW,
+        // and each such word in R can be accessed via a subregister.
+ //
+ // If the beginning and the end of the field cross the subregister
+ // boundary, advance to the next subregister.
+ Off = OE*RW;
+ continue;
+ }
+ if (HBS::isEqual(RC, 0, SC, Off, Len))
+ break;
+ ++Off;
+ }
+
+ if (Off > SW-Len)
+ continue;
+
+ // Found match.
+ unsigned ExtOpc = 0;
+ if (Off == 0) {
+ if (Len == 8)
+ ExtOpc = Signed ? Hexagon::A2_sxtb : Hexagon::A2_zxtb;
+ else if (Len == 16)
+ ExtOpc = Signed ? Hexagon::A2_sxth : Hexagon::A2_zxth;
+ else if (Len < 10 && !Signed)
+ ExtOpc = Hexagon::A2_andir;
+ }
+ if (ExtOpc == 0) {
+ ExtOpc =
+ Signed ? (RW == 32 ? Hexagon::S4_extract : Hexagon::S4_extractp)
+ : (RW == 32 ? Hexagon::S2_extractu : Hexagon::S2_extractup);
+ }
+ unsigned SR = 0;
+ // This only recognizes isub_lo and isub_hi.
+ if (RW != SW && RW*2 != SW)
+ continue;
+ if (RW != SW)
+ SR = (Off/RW == 0) ? Hexagon::isub_lo : Hexagon::isub_hi;
+ Off = Off % RW;
+
+ if (!validateReg({R,SR}, ExtOpc, 1))
+ continue;
+
+ // Don't generate the same instruction as the one being optimized.
+ if (MI->getOpcode() == ExtOpc) {
+ // All possible ExtOpc's have the source in operand(1).
+ const MachineOperand &SrcOp = MI->getOperand(1);
+ if (SrcOp.getReg() == R)
+ continue;
+ }
+
+ DebugLoc DL = MI->getDebugLoc();
+ MachineBasicBlock &B = *MI->getParent();
+ unsigned NewR = MRI.createVirtualRegister(FRC);
+ auto At = MI->isPHI() ? B.getFirstNonPHI()
+ : MachineBasicBlock::iterator(MI);
+ auto MIB = BuildMI(B, At, DL, HII.get(ExtOpc), NewR)
+ .addReg(R, 0, SR);
+ switch (ExtOpc) {
+ case Hexagon::A2_sxtb:
+ case Hexagon::A2_zxtb:
+ case Hexagon::A2_sxth:
+ case Hexagon::A2_zxth:
+ break;
+ case Hexagon::A2_andir:
+ MIB.addImm((1u << Len) - 1);
+ break;
+ case Hexagon::S4_extract:
+ case Hexagon::S2_extractu:
+ case Hexagon::S4_extractp:
+ case Hexagon::S2_extractup:
+ MIB.addImm(Len)
+ .addImm(Off);
+ break;
+ default:
+ llvm_unreachable("Unexpected opcode");
+ }
+
+ HBS::replaceReg(RD.Reg, NewR, MRI);
+ BT.put(BitTracker::RegisterRef(NewR), RC);
+ Changed = true;
+ break;
+ }
+
+ return Changed;
+}
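
The classic input pattern here is a shift pair: for 32-bit values,
(x << (32-len-off)) >> (32-len) with an arithmetic right shift is the
sign-extended extract of len bits at offset off, and the logical-shift form
is the zero-extended one. A scalar check of that equivalence (pure
illustration of what S2_extractu and S4_extract compute):

    #include <cassert>
    #include <cstdint>

    // Zero-extended bitfield, as S2_extractu computes.
    static uint32_t extractU(uint32_t X, unsigned Len, unsigned Off) {
      return (X << (32 - Len - Off)) >> (32 - Len); // logical shifts
    }

    // Sign-extended bitfield, as S4_extract computes.
    static int32_t extractS(uint32_t X, unsigned Len, unsigned Off) {
      return int32_t(X << (32 - Len - Off)) >> (32 - Len); // arithmetic >>
    }

    int main() {
      uint32_t X = 0x0000ab00; // the byte 0xab stored at offset 8
      assert(extractU(X, 8, 8) == 0xabu);
      assert(extractS(X, 8, 8) == -85); // 0xab sign-extends to -85
      return 0;
    }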
+
bool BitSimplification::processBlock(MachineBasicBlock &B,
const RegisterSet &AVs) {
if (!BT.reached(&B))
@@ -2247,12 +2601,15 @@ bool BitSimplification::processBlock(MachineBasicBlock &B,
if (FRC->getID() == Hexagon::DoubleRegsRegClassID) {
bool T = genPackhl(MI, RD, RC);
+ T = T || simplifyExtractLow(MI, RD, RC, AVB);
Changed |= T;
continue;
}
if (FRC->getID() == Hexagon::IntRegsRegClassID) {
- bool T = genExtractHalf(MI, RD, RC);
+ bool T = genBitSplit(MI, RD, RC, AVB);
+ T = T || simplifyExtractLow(MI, RD, RC, AVB);
+ T = T || genExtractHalf(MI, RD, RC);
T = T || genCombineHalf(MI, RD, RC);
T = T || genExtractLow(MI, RD, RC);
Changed |= T;
@@ -2313,7 +2670,7 @@ bool HexagonBitSimplify::runOnMachineFunction(MachineFunction &MF) {
BT.run();
RegisterSet ABS; // Available registers for BS.
- BitSimplification BitS(BT, HII, HRI, MRI, MF);
+ BitSimplification BitS(BT, *MDT, HII, HRI, MRI, MF);
Changed |= visitBlock(Entry, BitS, ABS);
Changed = DeadCodeElimination(MF, *MDT).run() || Changed;
@@ -2599,7 +2956,7 @@ void HexagonLoopRescheduling::moveGroup(InstrGroup &G, MachineBasicBlock &LB,
for (unsigned j = 0, m = SI->getNumOperands(); j < m; ++j) {
const MachineOperand &Op = SI->getOperand(j);
if (!Op.isReg()) {
- MIB.addOperand(Op);
+ MIB.add(Op);
continue;
}
if (!Op.isUse())
diff --git a/lib/Target/Hexagon/HexagonBitTracker.cpp b/lib/Target/Hexagon/HexagonBitTracker.cpp
index 436f88dcd450..90ccecb6629a 100644
--- a/lib/Target/Hexagon/HexagonBitTracker.cpp
+++ b/lib/Target/Hexagon/HexagonBitTracker.cpp
@@ -74,7 +74,7 @@ HexagonEvaluator::HexagonEvaluator(const HexagonRegisterInfo &tri,
// Module::AnyPointerSize.
if (Width == 0 || Width > 64)
break;
- AttributeSet Attrs = F.getAttributes();
+ AttributeList Attrs = F.getAttributes();
if (Attrs.hasAttribute(AttrIdx, Attribute::ByVal))
continue;
InPhysReg = getNextPhysReg(InPhysReg, Width);
@@ -272,6 +272,9 @@ bool HexagonEvaluator::evaluate(const MachineInstr &MI,
// cases below.
uint16_t W0 = (Reg[0].Reg != 0) ? getRegBitWidth(Reg[0]) : 0;
+ // Register id of the 0th operand. It can be 0 (no register).
+ unsigned Reg0 = Reg[0].Reg;
+
switch (Opc) {
// Transfer immediate:
@@ -792,6 +795,17 @@ bool HexagonEvaluator::evaluate(const MachineInstr &MI,
case A2_zxth:
return rr0(eZXT(rc(1), 16), Outputs);
+ // Saturations
+
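+ // A saturated value always fits in the low 8/16 bits, so the remaining
+ // bits are known: copies of the sign bit for signed saturation, zeros
+ // for unsigned. This is modeled below as a sign/zero-extension of the
+ // result's own low bits.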
+ case A2_satb:
+ return rr0(eSXT(RegisterCell::self(0, W0).regify(Reg0), 8), Outputs);
+ case A2_sath:
+ return rr0(eSXT(RegisterCell::self(0, W0).regify(Reg0), 16), Outputs);
+ case A2_satub:
+ return rr0(eZXT(RegisterCell::self(0, W0).regify(Reg0), 8), Outputs);
+ case A2_satuh:
+ return rr0(eZXT(RegisterCell::self(0, W0).regify(Reg0), 16), Outputs);
+
// Bit count:
case S2_cl0:
diff --git a/lib/Target/Hexagon/HexagonBlockRanges.cpp b/lib/Target/Hexagon/HexagonBlockRanges.cpp
index adc213c3d438..1640b40c164f 100644
--- a/lib/Target/Hexagon/HexagonBlockRanges.cpp
+++ b/lib/Target/Hexagon/HexagonBlockRanges.cpp
@@ -219,8 +219,7 @@ HexagonBlockRanges::HexagonBlockRanges(MachineFunction &mf)
TII(*HST.getInstrInfo()), TRI(*HST.getRegisterInfo()),
Reserved(TRI.getReservedRegs(mf)) {
// Consider all non-allocatable registers as reserved.
- for (auto I = TRI.regclass_begin(), E = TRI.regclass_end(); I != E; ++I) {
- auto *RC = *I;
+ for (const TargetRegisterClass *RC : TRI.regclasses()) {
if (RC->isAllocatable())
continue;
for (unsigned R : *RC)
@@ -233,14 +232,16 @@ HexagonBlockRanges::RegisterSet HexagonBlockRanges::getLiveIns(
const TargetRegisterInfo &TRI) {
RegisterSet LiveIns;
RegisterSet Tmp;
+
for (auto I : B.liveins()) {
- if (I.LaneMask.all()) {
- Tmp.insert({I.PhysReg,0});
+ MCSubRegIndexIterator S(I.PhysReg, &TRI);
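+ // Treat a live-in as fully live if all lanes are live, or if some lane
+ // is live but the register has no subregisters to attribute lanes to.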
+ if (I.LaneMask.all() || (I.LaneMask.any() && !S.isValid())) {
+ Tmp.insert({I.PhysReg, 0});
continue;
}
- for (MCSubRegIndexIterator S(I.PhysReg, &TRI); S.isValid(); ++S) {
- LaneBitmask M = TRI.getSubRegIndexLaneMask(S.getSubRegIndex());
- if ((M & I.LaneMask).any())
+ for (; S.isValid(); ++S) {
+ unsigned SI = S.getSubRegIndex();
+ if ((I.LaneMask & TRI.getSubRegIndexLaneMask(SI)).any())
Tmp.insert({S.getSubReg(), 0});
}
}
@@ -307,6 +308,8 @@ void HexagonBlockRanges::computeInitialLiveRanges(InstrIndexMap &IndexMap,
LastUse[R] = LastDef[R] = IndexType::None;
};
+ RegisterSet Defs, Clobbers;
+
for (auto &In : B) {
if (In.isDebugValue())
continue;
@@ -325,19 +328,67 @@ void HexagonBlockRanges::computeInitialLiveRanges(InstrIndexMap &IndexMap,
closeRange(S);
}
}
- // Process defs.
+ // Process defs and clobbers.
+ Defs.clear();
+ Clobbers.clear();
for (auto &Op : In.operands()) {
if (!Op.isReg() || !Op.isDef() || Op.isUndef())
continue;
RegisterRef R = { Op.getReg(), Op.getSubReg() };
- if (TargetRegisterInfo::isPhysicalRegister(R.Reg) && Reserved[R.Reg])
- continue;
for (auto S : expandToSubRegs(R, MRI, TRI)) {
- if (LastDef[S] != IndexType::None || LastUse[S] != IndexType::None)
- closeRange(S);
- LastDef[S] = Index;
+ if (TargetRegisterInfo::isPhysicalRegister(S.Reg) && Reserved[S.Reg])
+ continue;
+ if (Op.isDead())
+ Clobbers.insert(S);
+ else
+ Defs.insert(S);
+ }
+ }
+
+ for (auto &Op : In.operands()) {
+ if (!Op.isRegMask())
+ continue;
+ const uint32_t *BM = Op.getRegMask();
+ for (unsigned PR = 1, N = TRI.getNumRegs(); PR != N; ++PR) {
+ // Skip registers that have subregisters. A register is preserved
+ // iff its bit is set in the regmask, so if the pair R1:0 were
+ // preserved, the bits for both R1 and R0 would be set as well.
+ if (MCSubRegIterator(PR, &TRI, false).isValid())
+ continue;
+ if (Reserved[PR])
+ continue;
+ if (BM[PR/32] & (1u << (PR%32)))
+ continue;
+ RegisterRef R = { PR, 0 };
+ if (!Defs.count(R))
+ Clobbers.insert(R);
}
}
+ // Defs and clobbers can overlap, e.g.
+ // %D0<def,dead> = COPY %vreg5, %R0<imp-def>, %R1<imp-def>
+ for (RegisterRef R : Defs)
+ Clobbers.erase(R);
+
+ // Update maps for defs.
+ for (RegisterRef S : Defs) {
+ // Defs should already be expanded into subregs.
+ assert(!TargetRegisterInfo::isPhysicalRegister(S.Reg) ||
+ !MCSubRegIterator(S.Reg, &TRI, false).isValid());
+ if (LastDef[S] != IndexType::None || LastUse[S] != IndexType::None)
+ closeRange(S);
+ LastDef[S] = Index;
+ }
+ // Update maps for clobbers.
+ for (RegisterRef S : Clobbers) {
+ // Clobbers should already be expanded into subregs.
+ assert(!TargetRegisterInfo::isPhysicalRegister(S.Reg) ||
+ !MCSubRegIterator(S.Reg, &TRI, false).isValid());
+ if (LastDef[S] != IndexType::None || LastUse[S] != IndexType::None)
+ closeRange(S);
+ // Create a single-instruction range.
+ LastDef[S] = LastUse[S] = Index;
+ closeRange(S);
+ }
}
// Collect live-on-exit.
diff --git a/lib/Target/Hexagon/HexagonCallingConv.td b/lib/Target/Hexagon/HexagonCallingConv.td
deleted file mode 100644
index e61b2a7a58ac..000000000000
--- a/lib/Target/Hexagon/HexagonCallingConv.td
+++ /dev/null
@@ -1,35 +0,0 @@
-//===- HexagonCallingConv.td - Calling Conventions Hexagon -*- tablegen -*-===//
-//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
-//
-//===----------------------------------------------------------------------===//
-//
-// This describes the calling conventions for the Hexagon architectures.
-//
-//===----------------------------------------------------------------------===//
-
-//===----------------------------------------------------------------------===//
-// Return Value Calling Conventions
-//===----------------------------------------------------------------------===//
-
-// Hexagon 32-bit C return-value convention.
-def RetCC_Hexagon32 : CallingConv<[
- CCIfType<[i32, f32], CCAssignToReg<[R0, R1, R2, R3, R4, R5]>>,
- CCIfType<[i64, f64], CCAssignToReg<[D0, D1, D2]>>,
-
- // Alternatively, they are assigned to the stack in 4-byte aligned units.
- CCAssignToStack<4, 4>
-]>;
-
-// Hexagon 32-bit C Calling convention.
-def CC_Hexagon32 : CallingConv<[
- // All arguments get passed in integer registers if there is space.
- CCIfType<[f32, i32, i16, i8], CCAssignToReg<[R0, R1, R2, R3, R4, R5]>>,
- CCIfType<[f64, i64], CCAssignToReg<[D0, D1, D2]>>,
-
- // Alternatively, they are assigned to the stack in 4-byte aligned units.
- CCAssignToStack<4, 4>
-]>;
diff --git a/lib/Target/Hexagon/HexagonCommonGEP.cpp b/lib/Target/Hexagon/HexagonCommonGEP.cpp
index 489da6be923d..a07ba77e6f3e 100644
--- a/lib/Target/Hexagon/HexagonCommonGEP.cpp
+++ b/lib/Target/Hexagon/HexagonCommonGEP.cpp
@@ -315,11 +315,8 @@ void HexagonCommonGEP::getBlockTraversalOrder(BasicBlock *Root,
// visited".
Order.push_back(Root);
- DomTreeNode *DTN = DT->getNode(Root);
- typedef GraphTraits<DomTreeNode*> GTN;
- typedef GTN::ChildIteratorType Iter;
- for (Iter I = GTN::child_begin(DTN), E = GTN::child_end(DTN); I != E; ++I)
- getBlockTraversalOrder((*I)->getBlock(), Order);
+ for (auto *DTN : children<DomTreeNode*>(DT->getNode(Root)))
+ getBlockTraversalOrder(DTN->getBlock(), Order);
}
bool HexagonCommonGEP::isHandledGepForm(GetElementPtrInst *GepI) {
@@ -1235,11 +1232,8 @@ void HexagonCommonGEP::removeDeadCode() {
for (unsigned i = 0; i < BO.size(); ++i) {
BasicBlock *B = cast<BasicBlock>(BO[i]);
- DomTreeNode *N = DT->getNode(B);
- typedef GraphTraits<DomTreeNode*> GTN;
- typedef GTN::ChildIteratorType Iter;
- for (Iter I = GTN::child_begin(N), E = GTN::child_end(N); I != E; ++I)
- BO.push_back((*I)->getBlock());
+ for (auto DTN : children<DomTreeNode*>(DT->getNode(B)))
+ BO.push_back(DTN->getBlock());
}
for (unsigned i = BO.size(); i > 0; --i) {
diff --git a/lib/Target/Hexagon/HexagonCopyToCombine.cpp b/lib/Target/Hexagon/HexagonCopyToCombine.cpp
index 36080997ec6b..8118c8eb149d 100644
--- a/lib/Target/Hexagon/HexagonCopyToCombine.cpp
+++ b/lib/Target/Hexagon/HexagonCopyToCombine.cpp
@@ -440,22 +440,28 @@ HexagonCopyToCombine::findPotentialNewifiableTFRs(MachineBasicBlock &BB) {
// Put instructions that last defined integer or double registers into the
// map.
- for (unsigned I = 0, E = MI.getNumOperands(); I != E; ++I) {
- MachineOperand &Op = MI.getOperand(I);
- if (!Op.isReg() || !Op.isDef() || !Op.getReg())
- continue;
- unsigned Reg = Op.getReg();
- if (Hexagon::DoubleRegsRegClass.contains(Reg)) {
- for (MCSubRegIterator SubRegs(Reg, TRI); SubRegs.isValid(); ++SubRegs) {
- LastDef[*SubRegs] = &MI;
- }
- } else if (Hexagon::IntRegsRegClass.contains(Reg))
- LastDef[Reg] = &MI;
+ for (MachineOperand &Op : MI.operands()) {
+ if (Op.isReg()) {
+ if (!Op.isDef() || !Op.getReg())
+ continue;
+ unsigned Reg = Op.getReg();
+ if (Hexagon::DoubleRegsRegClass.contains(Reg)) {
+ for (MCSubRegIterator SubRegs(Reg, TRI); SubRegs.isValid(); ++SubRegs)
+ LastDef[*SubRegs] = &MI;
+ } else if (Hexagon::IntRegsRegClass.contains(Reg))
+ LastDef[Reg] = &MI;
+ } else if (Op.isRegMask()) {
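+ // A regmask operand (e.g. on a call) clobbers registers in bulk;
+ // record this instruction as the last def of every integer register
+ // it clobbers.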
+ for (unsigned Reg : Hexagon::IntRegsRegClass)
+ if (Op.clobbersPhysReg(Reg))
+ LastDef[Reg] = &MI;
+ }
}
}
}
bool HexagonCopyToCombine::runOnMachineFunction(MachineFunction &MF) {
+ if (skipFunction(*MF.getFunction()))
+ return false;
if (IsCombinesDisabled) return false;
diff --git a/lib/Target/Hexagon/HexagonDepArch.h b/lib/Target/Hexagon/HexagonDepArch.h
new file mode 100644
index 000000000000..1009aa39cefb
--- /dev/null
+++ b/lib/Target/Hexagon/HexagonDepArch.h
@@ -0,0 +1,10 @@
+//===--- HexagonDepArch.h -------------------------------------------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+enum HexagonArchEnum { V4, V5, V55, V60, V62 };
diff --git a/lib/Target/Hexagon/HexagonDepArch.td b/lib/Target/Hexagon/HexagonDepArch.td
new file mode 100644
index 000000000000..5b1d02c136f0
--- /dev/null
+++ b/lib/Target/Hexagon/HexagonDepArch.td
@@ -0,0 +1,19 @@
+//===--- HexagonDepArch.td ------------------------------------------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
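+// One SubtargetFeature and matching assembler predicate per Hexagon
+// architecture version.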
+def ArchV62: SubtargetFeature<"v62", "HexagonArchVersion", "V62", "Enable Hexagon V62 architecture">;
+def HasV62T : Predicate<"HST->hasV62TOps()">, AssemblerPredicate<"ArchV62">;
+def ArchV60: SubtargetFeature<"v60", "HexagonArchVersion", "V60", "Enable Hexagon V60 architecture">;
+def HasV60T : Predicate<"HST->hasV60TOps()">, AssemblerPredicate<"ArchV60">;
+def ArchV55: SubtargetFeature<"v55", "HexagonArchVersion", "V55", "Enable Hexagon V55 architecture">;
+def HasV55T : Predicate<"HST->hasV55TOps()">, AssemblerPredicate<"ArchV55">;
+def ArchV4: SubtargetFeature<"v4", "HexagonArchVersion", "V4", "Enable Hexagon V4 architecture">;
+def HasV4T : Predicate<"HST->hasV4TOps()">, AssemblerPredicate<"ArchV4">;
+def ArchV5: SubtargetFeature<"v5", "HexagonArchVersion", "V5", "Enable Hexagon V5 architecture">;
+def HasV5T : Predicate<"HST->hasV5TOps()">, AssemblerPredicate<"ArchV5">;
diff --git a/lib/Target/Hexagon/HexagonDepDecoders.h b/lib/Target/Hexagon/HexagonDepDecoders.h
new file mode 100644
index 000000000000..aa9787ecf0c8
--- /dev/null
+++ b/lib/Target/Hexagon/HexagonDepDecoders.h
@@ -0,0 +1,64 @@
+//===--- HexagonDepDecoders.h ---------------------------------------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
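+// Immediate-operand decoder stubs: each forwards the extracted field to
+// signedDecoder<N>, treating it as an N-bit signed immediate.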
+static DecodeStatus s4_0ImmDecoder(MCInst &MI, unsigned tmp,
+ uint64_t, const void *Decoder) {
+ signedDecoder<4>(MI, tmp, Decoder);
+ return MCDisassembler::Success;
+}
+static DecodeStatus s29_3ImmDecoder(MCInst &MI, unsigned tmp,
+ uint64_t, const void *Decoder) {
+ signedDecoder<14>(MI, tmp, Decoder);
+ return MCDisassembler::Success;
+}
+static DecodeStatus s8_0ImmDecoder(MCInst &MI, unsigned tmp,
+ uint64_t, const void *Decoder) {
+ signedDecoder<8>(MI, tmp, Decoder);
+ return MCDisassembler::Success;
+}
+static DecodeStatus s4_3ImmDecoder(MCInst &MI, unsigned tmp,
+ uint64_t, const void *Decoder) {
+ signedDecoder<7>(MI, tmp, Decoder);
+ return MCDisassembler::Success;
+}
+static DecodeStatus s31_1ImmDecoder(MCInst &MI, unsigned tmp,
+ uint64_t, const void *Decoder) {
+ signedDecoder<12>(MI, tmp, Decoder);
+ return MCDisassembler::Success;
+}
+static DecodeStatus s3_0ImmDecoder(MCInst &MI, unsigned tmp,
+ uint64_t, const void *Decoder) {
+ signedDecoder<3>(MI, tmp, Decoder);
+ return MCDisassembler::Success;
+}
+static DecodeStatus s30_2ImmDecoder(MCInst &MI, unsigned tmp,
+ uint64_t, const void *Decoder) {
+ signedDecoder<13>(MI, tmp, Decoder);
+ return MCDisassembler::Success;
+}
+static DecodeStatus s6_0ImmDecoder(MCInst &MI, unsigned tmp,
+ uint64_t, const void *Decoder) {
+ signedDecoder<6>(MI, tmp, Decoder);
+ return MCDisassembler::Success;
+}
+static DecodeStatus s6_3ImmDecoder(MCInst &MI, unsigned tmp,
+ uint64_t, const void *Decoder) {
+ signedDecoder<9>(MI, tmp, Decoder);
+ return MCDisassembler::Success;
+}
+static DecodeStatus s4_1ImmDecoder(MCInst &MI, unsigned tmp,
+ uint64_t, const void *Decoder) {
+ signedDecoder<5>(MI, tmp, Decoder);
+ return MCDisassembler::Success;
+}
+static DecodeStatus s4_2ImmDecoder(MCInst &MI, unsigned tmp,
+ uint64_t, const void *Decoder) {
+ signedDecoder<6>(MI, tmp, Decoder);
+ return MCDisassembler::Success;
+}
diff --git a/lib/Target/Hexagon/HexagonDepITypes.h b/lib/Target/Hexagon/HexagonDepITypes.h
new file mode 100644
index 000000000000..f8ae39a37994
--- /dev/null
+++ b/lib/Target/Hexagon/HexagonDepITypes.h
@@ -0,0 +1,53 @@
+//===--- HexagonDepITypes.h -----------------------------------------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+namespace llvm {
+namespace HexagonII {
+enum Type {
+ TypeALU32_2op = 0,
+ TypeALU32_3op = 1,
+ TypeALU32_ADDI = 2,
+ TypeALU64 = 3,
+ TypeCJ = 4,
+ TypeCOPROC_VMEM = 5,
+ TypeCR = 7,
+ TypeCVI_HIST = 10,
+ TypeCVI_VA = 16,
+ TypeCVI_VA_DV = 17,
+ TypeCVI_VINLANESAT = 18,
+ TypeCVI_VM_CUR_LD = 19,
+ TypeCVI_VM_LD = 20,
+ TypeCVI_VM_NEW_ST = 21,
+ TypeCVI_VM_ST = 22,
+ TypeCVI_VM_STU = 23,
+ TypeCVI_VM_TMP_LD = 24,
+ TypeCVI_VM_VP_LDU = 25,
+ TypeCVI_VP = 26,
+ TypeCVI_VP_VS = 27,
+ TypeCVI_VS = 28,
+ TypeCVI_VX = 30,
+ TypeCVI_VX_DV = 31,
+ TypeDUPLEX = 32,
+ TypeENDLOOP = 33,
+ TypeEXTENDER = 34,
+ TypeJ = 35,
+ TypeLD = 36,
+ TypeM = 37,
+ TypeMAPPING = 38,
+ TypeNCJ = 39,
+ TypePSEUDO = 40,
+ TypeST = 41,
+ TypeSUBINSN = 42,
+ TypeS_2op = 43,
+ TypeS_3op = 44,
+ TypeV2LDST = 47,
+ TypeV4LDST = 48
+};
+}
+}
diff --git a/lib/Target/Hexagon/HexagonDepITypes.td b/lib/Target/Hexagon/HexagonDepITypes.td
new file mode 100644
index 000000000000..f1d689ce12f4
--- /dev/null
+++ b/lib/Target/Hexagon/HexagonDepITypes.td
@@ -0,0 +1,48 @@
+//===--- HexagonDepITypes.td ----------------------------------------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
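+// Instruction type tags; the numeric values match the HexagonII::Type
+// enum in HexagonDepITypes.h.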
+class IType<bits<6> t> { bits<6> Value = t; }
+def TypeALU32_2op : IType<0>;
+def TypeALU32_3op : IType<1>;
+def TypeALU32_ADDI : IType<2>;
+def TypeALU64 : IType<3>;
+def TypeCJ : IType<4>;
+def TypeCOPROC_VMEM : IType<5>;
+def TypeCR : IType<7>;
+def TypeCVI_HIST : IType<10>;
+def TypeCVI_VA : IType<16>;
+def TypeCVI_VA_DV : IType<17>;
+def TypeCVI_VINLANESAT : IType<18>;
+def TypeCVI_VM_CUR_LD : IType<19>;
+def TypeCVI_VM_LD : IType<20>;
+def TypeCVI_VM_NEW_ST : IType<21>;
+def TypeCVI_VM_ST : IType<22>;
+def TypeCVI_VM_STU : IType<23>;
+def TypeCVI_VM_TMP_LD : IType<24>;
+def TypeCVI_VM_VP_LDU : IType<25>;
+def TypeCVI_VP : IType<26>;
+def TypeCVI_VP_VS : IType<27>;
+def TypeCVI_VS : IType<28>;
+def TypeCVI_VX : IType<30>;
+def TypeCVI_VX_DV : IType<31>;
+def TypeDUPLEX : IType<32>;
+def TypeENDLOOP : IType<33>;
+def TypeEXTENDER : IType<34>;
+def TypeJ : IType<35>;
+def TypeLD : IType<36>;
+def TypeM : IType<37>;
+def TypeMAPPING : IType<38>;
+def TypeNCJ : IType<39>;
+def TypePSEUDO : IType<40>;
+def TypeST : IType<41>;
+def TypeSUBINSN : IType<42>;
+def TypeS_2op : IType<43>;
+def TypeS_3op : IType<44>;
+def TypeV2LDST : IType<47>;
+def TypeV4LDST : IType<48>;
diff --git a/lib/Target/Hexagon/HexagonDepInstrFormats.td b/lib/Target/Hexagon/HexagonDepInstrFormats.td
new file mode 100644
index 000000000000..d7a99f48803b
--- /dev/null
+++ b/lib/Target/Hexagon/HexagonDepInstrFormats.td
@@ -0,0 +1,4182 @@
+//===--- HexagonDepInstrFormats.td ----------------------------------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
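+// Each Enc_* class describes an instruction encoding: the operand fields
+// the instruction carries and the Inst{...} bit ranges each field occupies.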
+class Enc_12122225 : OpcodeHexagon {
+ bits <5> Rt32;
+ let Inst{20-16} = Rt32{4-0};
+ bits <5> Vx32;
+ let Inst{7-3} = Vx32{4-0};
+ bits <3> Qd8;
+ let Inst{2-0} = Qd8{2-0};
+}
+class Enc_16626097 : OpcodeHexagon {
+ bits <2> Qs4;
+ let Inst{6-5} = Qs4{1-0};
+ bits <5> Rt32;
+ let Inst{20-16} = Rt32{4-0};
+ bits <1> Mu2;
+ let Inst{13-13} = Mu2{0-0};
+ bits <5> Vv32;
+ let Inst{12-8} = Vv32{4-0};
+ bits <5> Vw32;
+ let Inst{4-0} = Vw32{4-0};
+}
+class Enc_13397056 : OpcodeHexagon {
+ bits <3> Ii;
+ let Inst{10-8} = Ii{2-0};
+ bits <2> Qv4;
+ let Inst{12-11} = Qv4{1-0};
+ bits <5> Vs32;
+ let Inst{4-0} = Vs32{4-0};
+ bits <5> Rx32;
+ let Inst{20-16} = Rx32{4-0};
+}
+class Enc_7315939 : OpcodeHexagon {
+ bits <11> Ii;
+ let Inst{21-20} = Ii{10-9};
+ let Inst{7-1} = Ii{8-2};
+ bits <4> Rs16;
+ let Inst{19-16} = Rs16{3-0};
+ bits <6> n1;
+ let Inst{28-28} = n1{5-5};
+ let Inst{24-22} = n1{4-2};
+ let Inst{13-13} = n1{1-1};
+ let Inst{8-8} = n1{0-0};
+}
+class Enc_15275738 : OpcodeHexagon {
+ bits <12> Ii;
+ let Inst{26-25} = Ii{11-10};
+ let Inst{13-5} = Ii{9-1};
+ bits <5> Rs32;
+ let Inst{20-16} = Rs32{4-0};
+ bits <5> Rd32;
+ let Inst{4-0} = Rd32{4-0};
+}
+class Enc_12822813 : OpcodeHexagon {
+ bits <5> Rss32;
+ let Inst{20-16} = Rss32{4-0};
+ bits <5> Rtt32;
+ let Inst{12-8} = Rtt32{4-0};
+ bits <5> Rxx32;
+ let Inst{4-0} = Rxx32{4-0};
+ bits <2> Pe4;
+ let Inst{6-5} = Pe4{1-0};
+}
+class Enc_10282127 : OpcodeHexagon {
+ bits <7> Ii;
+ let Inst{12-7} = Ii{6-1};
+ bits <8> II;
+ let Inst{13-13} = II{7-7};
+ let Inst{6-0} = II{6-0};
+ bits <5> Rs32;
+ let Inst{20-16} = Rs32{4-0};
+}
+class Enc_14264243 : OpcodeHexagon {
+ bits <11> Ii;
+ let Inst{21-20} = Ii{10-9};
+ let Inst{7-1} = Ii{8-2};
+ bits <4> Rs16;
+ let Inst{19-16} = Rs16{3-0};
+ bits <4> Rt16;
+ let Inst{11-8} = Rt16{3-0};
+}
+class Enc_6778937 : OpcodeHexagon {
+ bits <5> Rxx32;
+ let Inst{20-16} = Rxx32{4-0};
+ bits <0> sgp10;
+}
+class Enc_5480539 : OpcodeHexagon {
+ bits <5> Vu32;
+ let Inst{20-16} = Vu32{4-0};
+ bits <5> Vv32;
+ let Inst{12-8} = Vv32{4-0};
+ bits <3> Rt8;
+ let Inst{2-0} = Rt8{2-0};
+ bits <5> Vxx32;
+ let Inst{7-3} = Vxx32{4-0};
+}
+class Enc_11422009 : OpcodeHexagon {
+ bits <5> Rt32;
+ let Inst{20-16} = Rt32{4-0};
+ bits <5> Vy32;
+ let Inst{12-8} = Vy32{4-0};
+ bits <5> Vx32;
+ let Inst{4-0} = Vx32{4-0};
+}
+class Enc_16357011 : OpcodeHexagon {
+ bits <5> Vu32;
+ let Inst{20-16} = Vu32{4-0};
+ bits <5> Vv32;
+ let Inst{8-4} = Vv32{4-0};
+ bits <5> Vt32;
+ let Inst{13-9} = Vt32{4-0};
+ bits <4> Vdd16;
+ let Inst{3-0} = Vdd16{3-0};
+}
+class Enc_4975051 : OpcodeHexagon {
+ bits <19> Ii;
+ let Inst{26-25} = Ii{18-17};
+ let Inst{20-16} = Ii{16-12};
+ let Inst{13-5} = Ii{11-3};
+ bits <5> Rdd32;
+ let Inst{4-0} = Rdd32{4-0};
+}
+class Enc_14786238 : OpcodeHexagon {
+ bits <5> Vu32;
+ let Inst{12-8} = Vu32{4-0};
+ bits <5> Rtt32;
+ let Inst{20-16} = Rtt32{4-0};
+ bits <5> Vx32;
+ let Inst{7-3} = Vx32{4-0};
+}
+class Enc_15472748 : OpcodeHexagon {
+ bits <5> Rs32;
+ let Inst{20-16} = Rs32{4-0};
+ bits <5> Rtt32;
+ let Inst{12-8} = Rtt32{4-0};
+ bits <5> Rd32;
+ let Inst{4-0} = Rd32{4-0};
+}
+class Enc_6773159 : OpcodeHexagon {
+ bits <6> Ii;
+ let Inst{12-7} = Ii{5-0};
+ bits <5> II;
+ let Inst{4-0} = II{4-0};
+ bits <5> Rs32;
+ let Inst{20-16} = Rs32{4-0};
+}
+class Enc_12535811 : OpcodeHexagon {
+ bits <2> Qv4;
+ let Inst{23-22} = Qv4{1-0};
+ bits <5> Vu32;
+ let Inst{12-8} = Vu32{4-0};
+ bits <5> Vx32;
+ let Inst{4-0} = Vx32{4-0};
+}
+class Enc_14007201 : OpcodeHexagon {
+ bits <8> Ii;
+ let Inst{12-5} = Ii{7-0};
+ bits <8> II;
+ let Inst{22-16} = II{7-1};
+ let Inst{13-13} = II{0-0};
+ bits <5> Rdd32;
+ let Inst{4-0} = Rdd32{4-0};
+}
+class Enc_2577026 : OpcodeHexagon {
+ bits <3> Qt8;
+ let Inst{2-0} = Qt8{2-0};
+ bits <5> Vu32;
+ let Inst{20-16} = Vu32{4-0};
+ bits <5> Vv32;
+ let Inst{12-8} = Vv32{4-0};
+ bits <5> Vdd32;
+ let Inst{7-3} = Vdd32{4-0};
+}
+class Enc_7305764 : OpcodeHexagon {
+ bits <5> II;
+ let Inst{12-8} = II{4-0};
+ bits <11> Ii;
+ let Inst{21-20} = Ii{10-9};
+ let Inst{7-1} = Ii{8-2};
+ bits <4> Rs16;
+ let Inst{19-16} = Rs16{3-0};
+}
+class Enc_11682941 : OpcodeHexagon {
+ bits <19> Ii;
+ let Inst{26-25} = Ii{18-17};
+ let Inst{20-16} = Ii{16-12};
+ let Inst{13-13} = Ii{11-11};
+ let Inst{7-0} = Ii{10-3};
+ bits <5> Rtt32;
+ let Inst{12-8} = Rtt32{4-0};
+}
+class Enc_16376009 : OpcodeHexagon {
+ bits <6> Ii;
+ let Inst{8-5} = Ii{5-2};
+ bits <5> Rd32;
+ let Inst{4-0} = Rd32{4-0};
+ bits <5> Rx32;
+ let Inst{20-16} = Rx32{4-0};
+}
+class Enc_13249928 : OpcodeHexagon {
+ bits <9> Ii;
+ let Inst{13-5} = Ii{8-0};
+ bits <5> Rs32;
+ let Inst{20-16} = Rs32{4-0};
+ bits <2> Pd4;
+ let Inst{1-0} = Pd4{1-0};
+}
+class Enc_1971351 : OpcodeHexagon {
+ bits <5> Ii;
+ let Inst{8-5} = Ii{4-1};
+ bits <1> Mu2;
+ let Inst{13-13} = Mu2{0-0};
+ bits <5> Ryy32;
+ let Inst{4-0} = Ryy32{4-0};
+ bits <5> Rx32;
+ let Inst{20-16} = Rx32{4-0};
+}
+class Enc_13715847 : OpcodeHexagon {
+ bits <6> Ii;
+ let Inst{17-16} = Ii{5-4};
+ let Inst{6-3} = Ii{3-0};
+ bits <2> Pv4;
+ let Inst{1-0} = Pv4{1-0};
+ bits <5> Rtt32;
+ let Inst{12-8} = Rtt32{4-0};
+}
+class Enc_13303422 : OpcodeHexagon {
+ bits <5> Ii;
+ let Inst{8-5} = Ii{4-1};
+ bits <1> Mu2;
+ let Inst{13-13} = Mu2{0-0};
+ bits <5> Rd32;
+ let Inst{4-0} = Rd32{4-0};
+ bits <5> Rx32;
+ let Inst{20-16} = Rx32{4-0};
+}
+class Enc_14574598 : OpcodeHexagon {
+ bits <6> Ii;
+ let Inst{13-8} = Ii{5-0};
+ bits <5> Rs32;
+ let Inst{20-16} = Rs32{4-0};
+ bits <2> Pd4;
+ let Inst{1-0} = Pd4{1-0};
+}
+class Enc_13094118 : OpcodeHexagon {
+ bits <5> Css32;
+ let Inst{20-16} = Css32{4-0};
+ bits <5> Rdd32;
+ let Inst{4-0} = Rdd32{4-0};
+}
+class Enc_4231995 : OpcodeHexagon {
+ bits <6> Ii;
+ let Inst{13-8} = Ii{5-0};
+ bits <5> Rss32;
+ let Inst{20-16} = Rss32{4-0};
+ bits <5> Rdd32;
+ let Inst{4-0} = Rdd32{4-0};
+}
+class Enc_844699 : OpcodeHexagon {
+ bits <11> Ii;
+ let Inst{21-20} = Ii{10-9};
+ let Inst{7-1} = Ii{8-2};
+ bits <4> Rs16;
+ let Inst{19-16} = Rs16{3-0};
+ bits <4> n1;
+ let Inst{28-28} = n1{3-3};
+ let Inst{24-22} = n1{2-0};
+}
+class Enc_8752140 : OpcodeHexagon {
+ bits <6> Ii;
+ let Inst{8-5} = Ii{5-2};
+ bits <5> Rdd32;
+ let Inst{4-0} = Rdd32{4-0};
+ bits <5> Rx32;
+ let Inst{20-16} = Rx32{4-0};
+}
+class Enc_7978128 : OpcodeHexagon {
+ bits <1> Ii;
+ let Inst{8-8} = Ii{0-0};
+ bits <2> Qv4;
+ let Inst{23-22} = Qv4{1-0};
+}
+class Enc_10492541 : OpcodeHexagon {
+ bits <6> Ii;
+ let Inst{6-3} = Ii{5-2};
+ bits <5> Rt32;
+ let Inst{12-8} = Rt32{4-0};
+ bits <5> Rx32;
+ let Inst{20-16} = Rx32{4-0};
+}
+class Enc_0 : OpcodeHexagon {
+}
+class Enc_15733946 : OpcodeHexagon {
+ bits <2> Pv4;
+ let Inst{12-11} = Pv4{1-0};
+ bits <1> Mu2;
+ let Inst{13-13} = Mu2{0-0};
+ bits <5> Vs32;
+ let Inst{4-0} = Vs32{4-0};
+ bits <5> Rx32;
+ let Inst{20-16} = Rx32{4-0};
+}
+class Enc_738356 : OpcodeHexagon {
+ bits <4> Ii;
+ let Inst{13-13} = Ii{3-3};
+ let Inst{10-8} = Ii{2-0};
+ bits <2> Pv4;
+ let Inst{12-11} = Pv4{1-0};
+ bits <5> Rt32;
+ let Inst{20-16} = Rt32{4-0};
+ bits <5> Vd32;
+ let Inst{4-0} = Vd32{4-0};
+}
+class Enc_14400220 : OpcodeHexagon {
+ bits <5> Ii;
+ let Inst{9-5} = Ii{4-0};
+ bits <5> Rss32;
+ let Inst{20-16} = Rss32{4-0};
+ bits <2> Pd4;
+ let Inst{1-0} = Pd4{1-0};
+}
+class Enc_15194851 : OpcodeHexagon {
+ bits <5> Rs32;
+ let Inst{20-16} = Rs32{4-0};
+ bits <5> Rt32;
+ let Inst{12-8} = Rt32{4-0};
+ bits <2> Pu4;
+ let Inst{6-5} = Pu4{1-0};
+ bits <5> Rx32;
+ let Inst{4-0} = Rx32{4-0};
+}
+class Enc_14172170 : OpcodeHexagon {
+ bits <1> Ii;
+ let Inst{5-5} = Ii{0-0};
+ bits <5> Vuu32;
+ let Inst{12-8} = Vuu32{4-0};
+ bits <5> Rt32;
+ let Inst{20-16} = Rt32{4-0};
+ bits <5> Vdd32;
+ let Inst{4-0} = Vdd32{4-0};
+}
+class Enc_10065510 : OpcodeHexagon {
+ bits <6> Ii;
+ let Inst{6-3} = Ii{5-2};
+ bits <2> Pv4;
+ let Inst{1-0} = Pv4{1-0};
+ bits <5> Rt32;
+ let Inst{12-8} = Rt32{4-0};
+ bits <5> Rx32;
+ let Inst{20-16} = Rx32{4-0};
+}
+class Enc_14998517 : OpcodeHexagon {
+ bits <11> Ii;
+ let Inst{21-20} = Ii{10-9};
+ let Inst{7-1} = Ii{8-2};
+ bits <3> Ns8;
+ let Inst{18-16} = Ns8{2-0};
+ bits <3> n1;
+ let Inst{29-29} = n1{2-2};
+ let Inst{26-25} = n1{1-0};
+}
+class Enc_16657398 : OpcodeHexagon {
+ bits <6> Ii;
+ let Inst{17-16} = Ii{5-4};
+ let Inst{6-3} = Ii{3-0};
+ bits <2> Pv4;
+ let Inst{1-0} = Pv4{1-0};
+ bits <5> Rt32;
+ let Inst{12-8} = Rt32{4-0};
+}
+class Enc_14620934 : OpcodeHexagon {
+ bits <5> Rs32;
+ let Inst{20-16} = Rs32{4-0};
+ bits <5> Rt32;
+ let Inst{12-8} = Rt32{4-0};
+}
+class Enc_10075393 : OpcodeHexagon {
+ bits <4> Ii;
+ let Inst{13-13} = Ii{3-3};
+ let Inst{10-8} = Ii{2-0};
+ bits <2> Pv4;
+ let Inst{12-11} = Pv4{1-0};
+ bits <5> Rt32;
+ let Inst{20-16} = Rt32{4-0};
+ bits <5> Vs32;
+ let Inst{4-0} = Vs32{4-0};
+}
+class Enc_8638014 : OpcodeHexagon {
+ bits <16> Ii;
+ let Inst{21-21} = Ii{15-15};
+ let Inst{13-8} = Ii{14-9};
+ let Inst{2-0} = Ii{8-6};
+ bits <5> Vss32;
+ let Inst{7-3} = Vss32{4-0};
+ bits <5> Rx32;
+ let Inst{20-16} = Rx32{4-0};
+}
+class Enc_13261538 : OpcodeHexagon {
+ bits <3> Ii;
+ let Inst{7-5} = Ii{2-0};
+ bits <5> Vu32;
+ let Inst{12-8} = Vu32{4-0};
+ bits <5> Vv32;
+ let Inst{20-16} = Vv32{4-0};
+ bits <5> Vdd32;
+ let Inst{4-0} = Vdd32{4-0};
+}
+class Enc_8990840 : OpcodeHexagon {
+ bits <13> Ii;
+ let Inst{26-25} = Ii{12-11};
+ let Inst{13-5} = Ii{10-2};
+ bits <5> Rs32;
+ let Inst{20-16} = Rs32{4-0};
+ bits <5> Rd32;
+ let Inst{4-0} = Rd32{4-0};
+}
+class Enc_5974204 : OpcodeHexagon {
+ bits <5> Vu32;
+ let Inst{20-16} = Vu32{4-0};
+ bits <5> Vvv32;
+ let Inst{12-8} = Vvv32{4-0};
+ bits <5> Vd32;
+ let Inst{7-3} = Vd32{4-0};
+}
+class Enc_4711514 : OpcodeHexagon {
+ bits <2> Qu4;
+ let Inst{9-8} = Qu4{1-0};
+ bits <5> Rt32;
+ let Inst{20-16} = Rt32{4-0};
+ bits <5> Vd32;
+ let Inst{4-0} = Vd32{4-0};
+}
+class Enc_11492529 : OpcodeHexagon {
+ bits <5> Ii;
+ let Inst{6-3} = Ii{4-1};
+ bits <5> Rt32;
+ let Inst{12-8} = Rt32{4-0};
+ bits <5> Rx32;
+ let Inst{20-16} = Rx32{4-0};
+}
+class Enc_9277990 : OpcodeHexagon {
+ bits <5> Rss32;
+ let Inst{20-16} = Rss32{4-0};
+ bits <5> Rtt32;
+ let Inst{12-8} = Rtt32{4-0};
+ bits <5> Rd32;
+ let Inst{4-0} = Rd32{4-0};
+}
+class Enc_6690615 : OpcodeHexagon {
+ bits <7> Ii;
+ let Inst{8-4} = Ii{6-2};
+ bits <4> Rt16;
+ let Inst{3-0} = Rt16{3-0};
+}
+class Enc_1220199 : OpcodeHexagon {
+ bits <2> Qv4;
+ let Inst{23-22} = Qv4{1-0};
+ bits <5> Vu32;
+ let Inst{12-8} = Vu32{4-0};
+ bits <5> Vd32;
+ let Inst{4-0} = Vd32{4-0};
+}
+class Enc_7785569 : OpcodeHexagon {
+ bits <11> Ii;
+ let Inst{21-20} = Ii{10-9};
+ let Inst{7-1} = Ii{8-2};
+ bits <4> Rs16;
+ let Inst{19-16} = Rs16{3-0};
+ bits <6> n1;
+ let Inst{28-28} = n1{5-5};
+ let Inst{25-22} = n1{4-1};
+ let Inst{8-8} = n1{0-0};
+}
+class Enc_2880796 : OpcodeHexagon {
+ bits <5> Ii;
+ let Inst{12-8} = Ii{4-0};
+ bits <5> II;
+ let Inst{22-21} = II{4-3};
+ let Inst{7-5} = II{2-0};
+ bits <5> Rs32;
+ let Inst{20-16} = Rs32{4-0};
+ bits <5> Rx32;
+ let Inst{4-0} = Rx32{4-0};
+}
+class Enc_6858527 : OpcodeHexagon {
+ bits <2> Qs4;
+ let Inst{6-5} = Qs4{1-0};
+ bits <5> Rt32;
+ let Inst{20-16} = Rt32{4-0};
+ bits <1> Mu2;
+ let Inst{13-13} = Mu2{0-0};
+ bits <5> Vv32;
+ let Inst{4-0} = Vv32{4-0};
+}
+class Enc_11863656 : OpcodeHexagon {
+ bits <5> Vu32;
+ let Inst{12-8} = Vu32{4-0};
+ bits <5> Rtt32;
+ let Inst{20-16} = Rtt32{4-0};
+ bits <5> Vx32;
+ let Inst{4-0} = Vx32{4-0};
+}
+class Enc_151014 : OpcodeHexagon {
+ bits <5> Rss32;
+ let Inst{20-16} = Rss32{4-0};
+ bits <5> Rtt32;
+ let Inst{12-8} = Rtt32{4-0};
+ bits <5> Rdd32;
+ let Inst{4-0} = Rdd32{4-0};
+ bits <2> Px4;
+ let Inst{6-5} = Px4{1-0};
+}
+class Enc_10333841 : OpcodeHexagon {
+ bits <16> Ii;
+ let Inst{21-21} = Ii{15-15};
+ let Inst{13-8} = Ii{14-9};
+ let Inst{2-0} = Ii{8-6};
+ bits <5> Rt32;
+ let Inst{20-16} = Rt32{4-0};
+ bits <5> Vd32;
+ let Inst{7-3} = Vd32{4-0};
+}
+class Enc_14044877 : OpcodeHexagon {
+ bits <6> Ii;
+ let Inst{13-13} = Ii{5-5};
+ let Inst{7-3} = Ii{4-0};
+ bits <2> Pv4;
+ let Inst{1-0} = Pv4{1-0};
+ bits <5> Rs32;
+ let Inst{20-16} = Rs32{4-0};
+ bits <5> Rt32;
+ let Inst{12-8} = Rt32{4-0};
+}
+class Enc_13691337 : OpcodeHexagon {
+ bits <5> Vu32;
+ let Inst{12-8} = Vu32{4-0};
+ bits <5> Vv32;
+ let Inst{20-16} = Vv32{4-0};
+ bits <5> Vd32;
+ let Inst{4-0} = Vd32{4-0};
+ bits <2> Qx4;
+ let Inst{6-5} = Qx4{1-0};
+}
+class Enc_3817033 : OpcodeHexagon {
+ bits <5> Vuu32;
+ let Inst{20-16} = Vuu32{4-0};
+ bits <3> Qt8;
+ let Inst{10-8} = Qt8{2-0};
+ bits <5> Vdd32;
+ let Inst{7-3} = Vdd32{4-0};
+}
+class Enc_3540372 : OpcodeHexagon {
+ bits <5> Rtt32;
+ let Inst{20-16} = Rtt32{4-0};
+ bits <5> Vd32;
+ let Inst{7-3} = Vd32{4-0};
+}
+class Enc_5200852 : OpcodeHexagon {
+ bits <1> Mu2;
+ let Inst{13-13} = Mu2{0-0};
+ bits <5> Vd32;
+ let Inst{7-3} = Vd32{4-0};
+ bits <5> Rx32;
+ let Inst{20-16} = Rx32{4-0};
+}
+class Enc_15949334 : OpcodeHexagon {
+ bits <1> Mu2;
+ let Inst{13-13} = Mu2{0-0};
+ bits <5> Vd32;
+ let Inst{4-0} = Vd32{4-0};
+ bits <5> Rx32;
+ let Inst{20-16} = Rx32{4-0};
+}
+class Enc_3831744 : OpcodeHexagon {
+ bits <5> Rss32;
+ let Inst{20-16} = Rss32{4-0};
+ bits <5> Rtt32;
+ let Inst{12-8} = Rtt32{4-0};
+ bits <2> Pd4;
+ let Inst{1-0} = Pd4{1-0};
+}
+class Enc_8280533 : OpcodeHexagon {
+ bits <3> Ii;
+ let Inst{7-5} = Ii{2-0};
+ bits <5> Vu32;
+ let Inst{12-8} = Vu32{4-0};
+ bits <5> Vv32;
+ let Inst{20-16} = Vv32{4-0};
+ bits <5> Vx32;
+ let Inst{4-0} = Vx32{4-0};
+}
+class Enc_10969213 : OpcodeHexagon {
+ bits <5> Rt32;
+ let Inst{20-16} = Rt32{4-0};
+ bits <1> Mu2;
+ let Inst{13-13} = Mu2{0-0};
+ bits <5> Vvv32;
+ let Inst{12-8} = Vvv32{4-0};
+ bits <5> Vw32;
+ let Inst{4-0} = Vw32{4-0};
+}
+class Enc_3974695 : OpcodeHexagon {
+ bits <7> Ii;
+ let Inst{10-4} = Ii{6-0};
+ bits <4> Rx16;
+ let Inst{3-0} = Rx16{3-0};
+}
+class Enc_7255914 : OpcodeHexagon {
+ bits <1> Mu2;
+ let Inst{13-13} = Mu2{0-0};
+ bits <5> Rt32;
+ let Inst{12-8} = Rt32{4-0};
+ bits <5> Rx32;
+ let Inst{20-16} = Rx32{4-0};
+}
+class Enc_7212930 : OpcodeHexagon {
+ bits <5> Ii;
+ let Inst{8-5} = Ii{4-1};
+ bits <2> Pt4;
+ let Inst{10-9} = Pt4{1-0};
+ bits <5> Rd32;
+ let Inst{4-0} = Rd32{4-0};
+ bits <5> Rx32;
+ let Inst{20-16} = Rx32{4-0};
+}
+class Enc_12781442 : OpcodeHexagon {
+ bits <5> Rt32;
+ let Inst{20-16} = Rt32{4-0};
+ bits <2> Qd4;
+ let Inst{1-0} = Qd4{1-0};
+}
+class Enc_799555 : OpcodeHexagon {
+ bits <5> Vd32;
+ let Inst{7-3} = Vd32{4-0};
+}
+class Enc_11083408 : OpcodeHexagon {
+ bits <5> Vu32;
+ let Inst{12-8} = Vu32{4-0};
+ bits <5> Vv32;
+ let Inst{23-19} = Vv32{4-0};
+ bits <3> Rt8;
+ let Inst{18-16} = Rt8{2-0};
+ bits <5> Vd32;
+ let Inst{4-0} = Vd32{4-0};
+}
+class Enc_900013 : OpcodeHexagon {
+ bits <5> Vu32;
+ let Inst{12-8} = Vu32{4-0};
+ bits <5> Vd32;
+ let Inst{4-0} = Vd32{4-0};
+}
+class Enc_9487067 : OpcodeHexagon {
+ bits <12> Ii;
+ let Inst{19-16} = Ii{11-8};
+ let Inst{12-5} = Ii{7-0};
+ bits <2> Pu4;
+ let Inst{22-21} = Pu4{1-0};
+ bits <5> Rd32;
+ let Inst{4-0} = Rd32{4-0};
+}
+class Enc_16014536 : OpcodeHexagon {
+ bits <10> Ii;
+ let Inst{21-21} = Ii{9-9};
+ let Inst{13-5} = Ii{8-0};
+ bits <5> Rs32;
+ let Inst{20-16} = Rs32{4-0};
+ bits <2> Pd4;
+ let Inst{1-0} = Pd4{1-0};
+}
+class Enc_12419313 : OpcodeHexagon {
+ bits <11> Ii;
+ let Inst{21-20} = Ii{10-9};
+ let Inst{7-1} = Ii{8-2};
+ bits <4> Rs16;
+ let Inst{19-16} = Rs16{3-0};
+ bits <4> n1;
+ let Inst{28-28} = n1{3-3};
+ let Inst{24-23} = n1{2-1};
+ let Inst{13-13} = n1{0-0};
+}
+class Enc_5503430 : OpcodeHexagon {
+ bits <5> Vuu32;
+ let Inst{12-8} = Vuu32{4-0};
+ bits <5> Rt32;
+ let Inst{20-16} = Rt32{4-0};
+ bits <5> Vdd32;
+ let Inst{7-3} = Vdd32{4-0};
+}
+class Enc_14767681 : OpcodeHexagon {
+ bits <5> Vu32;
+ let Inst{12-8} = Vu32{4-0};
+ bits <5> Vv32;
+ let Inst{23-19} = Vv32{4-0};
+ bits <3> Rt8;
+ let Inst{18-16} = Rt8{2-0};
+ bits <5> Vdd32;
+ let Inst{4-0} = Vdd32{4-0};
+}
+class Enc_9093094 : OpcodeHexagon {
+ bits <8> Ii;
+ let Inst{12-5} = Ii{7-0};
+ bits <8> II;
+ let Inst{22-16} = II{7-1};
+ let Inst{13-13} = II{0-0};
+ bits <2> Pu4;
+ let Inst{24-23} = Pu4{1-0};
+ bits <5> Rd32;
+ let Inst{4-0} = Rd32{4-0};
+}
+class Enc_11542684 : OpcodeHexagon {
+ bits <16> Ii;
+ let Inst{27-21} = Ii{15-9};
+ let Inst{13-5} = Ii{8-0};
+ bits <5> Rs32;
+ let Inst{20-16} = Rs32{4-0};
+ bits <5> Rd32;
+ let Inst{4-0} = Rd32{4-0};
+}
+class Enc_8877260 : OpcodeHexagon {
+ bits <5> Vu32;
+ let Inst{12-8} = Vu32{4-0};
+ bits <5> Vv32;
+ let Inst{23-19} = Vv32{4-0};
+ bits <3> Rt8;
+ let Inst{18-16} = Rt8{2-0};
+ bits <5> Vx32;
+ let Inst{4-0} = Vx32{4-0};
+}
+class Enc_1737833 : OpcodeHexagon {
+ bits <6> Ii;
+ let Inst{13-13} = Ii{5-5};
+ let Inst{7-3} = Ii{4-0};
+ bits <2> Pv4;
+ let Inst{1-0} = Pv4{1-0};
+ bits <5> Rs32;
+ let Inst{20-16} = Rs32{4-0};
+ bits <3> Nt8;
+ let Inst{10-8} = Nt8{2-0};
+}
+class Enc_255516 : OpcodeHexagon {
+ bits <5> Vuu32;
+ let Inst{20-16} = Vuu32{4-0};
+ bits <5> Vv32;
+ let Inst{12-8} = Vv32{4-0};
+ bits <5> Vdd32;
+ let Inst{7-3} = Vdd32{4-0};
+}
+class Enc_10721363 : OpcodeHexagon {
+ bits <2> Ii;
+ let Inst{13-13} = Ii{1-1};
+ let Inst{7-7} = Ii{0-0};
+ bits <5> Rs32;
+ let Inst{20-16} = Rs32{4-0};
+ bits <5> Rt32;
+ let Inst{12-8} = Rt32{4-0};
+ bits <5> Rd32;
+ let Inst{4-0} = Rd32{4-0};
+}
+class Enc_7076358 : OpcodeHexagon {
+ bits <5> Zdd8;
+ let Inst{4-0} = Zdd8{4-0};
+ bits <5> Rx32;
+ let Inst{20-16} = Rx32{4-0};
+}
+class Enc_11930928 : OpcodeHexagon {
+ bits <5> Ii;
+ let Inst{12-8} = Ii{4-0};
+ bits <5> II;
+ let Inst{22-21} = II{4-3};
+ let Inst{7-5} = II{2-0};
+ bits <5> Rs32;
+ let Inst{20-16} = Rs32{4-0};
+ bits <5> Rd32;
+ let Inst{4-0} = Rd32{4-0};
+}
+class Enc_2410156 : OpcodeHexagon {
+ bits <5> Ii;
+ let Inst{12-8} = Ii{4-0};
+ bits <5> Rs32;
+ let Inst{20-16} = Rs32{4-0};
+ bits <5> Rx32;
+ let Inst{4-0} = Rx32{4-0};
+}
+class Enc_6735062 : OpcodeHexagon {
+ bits <2> Ps4;
+ let Inst{17-16} = Ps4{1-0};
+ bits <2> Pt4;
+ let Inst{9-8} = Pt4{1-0};
+ bits <5> Rd32;
+ let Inst{4-0} = Rd32{4-0};
+}
+class Enc_7965855 : OpcodeHexagon {
+ bits <5> Vu32;
+ let Inst{20-16} = Vu32{4-0};
+ bits <5> Vv32;
+ let Inst{12-8} = Vv32{4-0};
+ bits <5> Vd32;
+ let Inst{7-3} = Vd32{4-0};
+}
+class Enc_5202340 : OpcodeHexagon {
+ bits <5> Vu32;
+ let Inst{12-8} = Vu32{4-0};
+ bits <5> Vyy32;
+ let Inst{4-0} = Vyy32{4-0};
+ bits <5> Rx32;
+ let Inst{20-16} = Rx32{4-0};
+}
+class Enc_10568534 : OpcodeHexagon {
+ bits <8> Ii;
+ let Inst{12-5} = Ii{7-0};
+ bits <2> Pu4;
+ let Inst{22-21} = Pu4{1-0};
+ bits <5> Rs32;
+ let Inst{20-16} = Rs32{4-0};
+ bits <5> Rd32;
+ let Inst{4-0} = Rd32{4-0};
+}
+class Enc_16730127 : OpcodeHexagon {
+ bits <3> Ii;
+ let Inst{7-5} = Ii{2-0};
+ bits <5> Rss32;
+ let Inst{20-16} = Rss32{4-0};
+ bits <5> Rtt32;
+ let Inst{12-8} = Rtt32{4-0};
+ bits <5> Rdd32;
+ let Inst{4-0} = Rdd32{4-0};
+}
+class Enc_11224149 : OpcodeHexagon {
+ bits <8> Ii;
+ let Inst{13-13} = Ii{7-7};
+ let Inst{7-3} = Ii{6-2};
+ bits <2> Pv4;
+ let Inst{1-0} = Pv4{1-0};
+ bits <5> Rs32;
+ let Inst{20-16} = Rs32{4-0};
+ bits <3> Nt8;
+ let Inst{10-8} = Nt8{2-0};
+}
+class Enc_9772987 : OpcodeHexagon {
+ bits <2> Ii;
+ let Inst{13-13} = Ii{1-1};
+ let Inst{7-7} = Ii{0-0};
+ bits <5> Rs32;
+ let Inst{20-16} = Rs32{4-0};
+ bits <5> Ru32;
+ let Inst{12-8} = Ru32{4-0};
+ bits <5> Rtt32;
+ let Inst{4-0} = Rtt32{4-0};
+}
+class Enc_9238139 : OpcodeHexagon {
+ bits <1> Mu2;
+ let Inst{13-13} = Mu2{0-0};
+ bits <5> Zdd8;
+ let Inst{4-0} = Zdd8{4-0};
+ bits <5> Rx32;
+ let Inst{20-16} = Rx32{4-0};
+}
+class Enc_2082775 : OpcodeHexagon {
+ bits <4> Ii;
+ let Inst{11-8} = Ii{3-0};
+ bits <5> Rss32;
+ let Inst{20-16} = Rss32{4-0};
+ bits <5> Rdd32;
+ let Inst{4-0} = Rdd32{4-0};
+}
+class Enc_5790679 : OpcodeHexagon {
+ bits <9> Ii;
+ let Inst{12-8} = Ii{8-4};
+ let Inst{4-3} = Ii{3-2};
+ bits <5> Rs32;
+ let Inst{20-16} = Rs32{4-0};
+}
+class Enc_9305257 : OpcodeHexagon {
+ bits <5> Zu8;
+ let Inst{12-8} = Zu8{4-0};
+ bits <5> Vd32;
+ let Inst{4-0} = Vd32{4-0};
+}
+class Enc_3735566 : OpcodeHexagon {
+ bits <3> Ii;
+ let Inst{10-8} = Ii{2-0};
+ bits <2> Pv4;
+ let Inst{12-11} = Pv4{1-0};
+ bits <3> Os8;
+ let Inst{2-0} = Os8{2-0};
+ bits <5> Rx32;
+ let Inst{20-16} = Rx32{4-0};
+}
+class Enc_12654528 : OpcodeHexagon {
+ bits <2> Qs4;
+ let Inst{6-5} = Qs4{1-0};
+ bits <5> Rt32;
+ let Inst{20-16} = Rt32{4-0};
+ bits <1> Mu2;
+ let Inst{13-13} = Mu2{0-0};
+ bits <5> Vvv32;
+ let Inst{4-0} = Vvv32{4-0};
+}
+class Enc_15290236 : OpcodeHexagon {
+ bits <5> Vu32;
+ let Inst{12-8} = Vu32{4-0};
+ bits <5> Vv32;
+ let Inst{20-16} = Vv32{4-0};
+ bits <5> Vdd32;
+ let Inst{4-0} = Vdd32{4-0};
+}
+class Enc_11139981 : OpcodeHexagon {
+ bits <2> Ps4;
+ let Inst{17-16} = Ps4{1-0};
+ bits <5> Rd32;
+ let Inst{4-0} = Rd32{4-0};
+}
+class Enc_15546666 : OpcodeHexagon {
+ bits <9> Ii;
+ let Inst{10-8} = Ii{8-6};
+ bits <5> Rx32;
+ let Inst{20-16} = Rx32{4-0};
+}
+class Enc_486163 : OpcodeHexagon {
+ bits <2> Ii;
+ let Inst{13-13} = Ii{1-1};
+ let Inst{7-7} = Ii{0-0};
+ bits <6> II;
+ let Inst{11-8} = II{5-2};
+ let Inst{6-5} = II{1-0};
+ bits <5> Rt32;
+ let Inst{20-16} = Rt32{4-0};
+ bits <5> Rd32;
+ let Inst{4-0} = Rd32{4-0};
+}
+class Enc_2079016 : OpcodeHexagon {
+ bits <2> Ii;
+ let Inst{1-0} = Ii{1-0};
+ bits <4> Rs16;
+ let Inst{7-4} = Rs16{3-0};
+}
+class Enc_10095813 : OpcodeHexagon {
+ bits <5> Vu32;
+ let Inst{12-8} = Vu32{4-0};
+ bits <5> Rtt32;
+ let Inst{20-16} = Rtt32{4-0};
+ bits <5> Vdd32;
+ let Inst{4-0} = Vdd32{4-0};
+}
+class Enc_13133322 : OpcodeHexagon {
+ bits <5> Vu32;
+ let Inst{20-16} = Vu32{4-0};
+ bits <5> Vx32;
+ let Inst{7-3} = Vx32{4-0};
+}
+class Enc_9422954 : OpcodeHexagon {
+ bits <2> Pu4;
+ let Inst{9-8} = Pu4{1-0};
+ bits <5> Rs32;
+ let Inst{20-16} = Rs32{4-0};
+ bits <5> Rd32;
+ let Inst{4-0} = Rd32{4-0};
+}
+class Enc_10642833 : OpcodeHexagon {
+ bits <1> Mu2;
+ let Inst{13-13} = Mu2{0-0};
+ bits <5> Vs32;
+ let Inst{7-3} = Vs32{4-0};
+ bits <5> Rx32;
+ let Inst{20-16} = Rx32{4-0};
+}
+class Enc_14989332 : OpcodeHexagon {
+ bits <5> Rt32;
+ let Inst{20-16} = Rt32{4-0};
+ bits <1> Mu2;
+ let Inst{13-13} = Mu2{0-0};
+ bits <5> Vv32;
+ let Inst{4-0} = Vv32{4-0};
+}
+class Enc_10263630 : OpcodeHexagon {
+ bits <5> Vu32;
+ let Inst{20-16} = Vu32{4-0};
+ bits <5> Vv32;
+ let Inst{12-8} = Vv32{4-0};
+ bits <3> Rt8;
+ let Inst{2-0} = Rt8{2-0};
+ bits <5> Vx32;
+ let Inst{7-3} = Vx32{4-0};
+}
+class Enc_13937564 : OpcodeHexagon {
+ bits <4> Ii;
+ let Inst{13-13} = Ii{3-3};
+ let Inst{10-8} = Ii{2-0};
+ bits <2> Pv4;
+ let Inst{12-11} = Pv4{1-0};
+ bits <5> Rt32;
+ let Inst{20-16} = Rt32{4-0};
+ bits <3> Os8;
+ let Inst{2-0} = Os8{2-0};
+}
+class Enc_7171569 : OpcodeHexagon {
+ bits <3> Ii;
+ let Inst{7-5} = Ii{2-0};
+ bits <5> Vu32;
+ let Inst{12-8} = Vu32{4-0};
+ bits <5> Vv32;
+ let Inst{20-16} = Vv32{4-0};
+ bits <5> Vd32;
+ let Inst{4-0} = Vd32{4-0};
+}
+class Enc_2702036 : OpcodeHexagon {
+ bits <10> Ii;
+ let Inst{21-21} = Ii{9-9};
+ let Inst{13-5} = Ii{8-0};
+ bits <5> Rdd32;
+ let Inst{4-0} = Rdd32{4-0};
+}
+class Enc_1928953 : OpcodeHexagon {
+ bits <2> Pu4;
+ let Inst{9-8} = Pu4{1-0};
+ bits <5> Rs32;
+ let Inst{20-16} = Rs32{4-0};
+}
+class Enc_5853469 : OpcodeHexagon {
+ bits <5> Rs32;
+ let Inst{20-16} = Rs32{4-0};
+ bits <5> Rt32;
+ let Inst{12-8} = Rt32{4-0};
+ bits <5> Rd32;
+ let Inst{4-0} = Rd32{4-0};
+ bits <2> Pe4;
+ let Inst{6-5} = Pe4{1-0};
+}
+class Enc_7692963 : OpcodeHexagon {
+ bits <5> Rt32;
+ let Inst{12-8} = Rt32{4-0};
+ bits <5> Rs32;
+ let Inst{20-16} = Rs32{4-0};
+ bits <5> Rx32;
+ let Inst{4-0} = Rx32{4-0};
+}
+class Enc_15140689 : OpcodeHexagon {
+ bits <11> Ii;
+ let Inst{21-20} = Ii{10-9};
+ let Inst{7-1} = Ii{8-2};
+ bits <3> Ns8;
+ let Inst{18-16} = Ns8{2-0};
+ bits <5> Rt32;
+ let Inst{12-8} = Rt32{4-0};
+}
+class Enc_748676 : OpcodeHexagon {
+ bits <12> Ii;
+ let Inst{26-25} = Ii{11-10};
+ let Inst{13-13} = Ii{9-9};
+ let Inst{7-0} = Ii{8-1};
+ bits <5> Rs32;
+ let Inst{20-16} = Rs32{4-0};
+ bits <3> Nt8;
+ let Inst{10-8} = Nt8{2-0};
+}
+class Enc_3372766 : OpcodeHexagon {
+ bits <5> Ii;
+ let Inst{8-5} = Ii{4-1};
+ bits <5> Ryy32;
+ let Inst{4-0} = Ryy32{4-0};
+ bits <5> Rx32;
+ let Inst{20-16} = Rx32{4-0};
+}
+class Enc_7900405 : OpcodeHexagon {
+ bits <6> Ii;
+ let Inst{6-3} = Ii{5-2};
+ bits <3> Nt8;
+ let Inst{10-8} = Nt8{2-0};
+ bits <5> Rx32;
+ let Inst{20-16} = Rx32{4-0};
+}
+class Enc_11930027 : OpcodeHexagon {
+ bits <12> Ii;
+ let Inst{26-25} = Ii{11-10};
+ let Inst{13-5} = Ii{9-1};
+ bits <5> Rs32;
+ let Inst{20-16} = Rs32{4-0};
+ bits <5> Ryy32;
+ let Inst{4-0} = Ryy32{4-0};
+}
+class Enc_971574 : OpcodeHexagon {
+ bits <6> Ii;
+ let Inst{22-21} = Ii{5-4};
+ let Inst{13-13} = Ii{3-3};
+ let Inst{7-5} = Ii{2-0};
+ bits <6> II;
+ let Inst{23-23} = II{5-5};
+ let Inst{4-0} = II{4-0};
+ bits <5> Rs32;
+ let Inst{20-16} = Rs32{4-0};
+ bits <5> Rd32;
+ let Inst{12-8} = Rd32{4-0};
+}
+class Enc_13453446 : OpcodeHexagon {
+ bits <24> Ii;
+ let Inst{24-16} = Ii{23-15};
+ let Inst{13-1} = Ii{14-2};
+}
+class Enc_6356866 : OpcodeHexagon {
+ bits <10> Ii;
+ let Inst{21-21} = Ii{9-9};
+ let Inst{13-5} = Ii{8-0};
+ bits <5> Rs32;
+ let Inst{20-16} = Rs32{4-0};
+ bits <5> Rx32;
+ let Inst{4-0} = Rx32{4-0};
+}
+class Enc_16246706 : OpcodeHexagon {
+ bits <5> Vdd32;
+ let Inst{7-3} = Vdd32{4-0};
+}
+class Enc_5326450 : OpcodeHexagon {
+ bits <4> Ii;
+ let Inst{6-3} = Ii{3-0};
+ bits <1> Mu2;
+ let Inst{13-13} = Mu2{0-0};
+ bits <3> Nt8;
+ let Inst{10-8} = Nt8{2-0};
+ bits <5> Rx32;
+ let Inst{20-16} = Rx32{4-0};
+}
+class Enc_11687333 : OpcodeHexagon {
+ bits <5> Rtt32;
+ let Inst{12-8} = Rtt32{4-0};
+ bits <5> Rss32;
+ let Inst{20-16} = Rss32{4-0};
+ bits <5> Rdd32;
+ let Inst{4-0} = Rdd32{4-0};
+}
+class Enc_2771456 : OpcodeHexagon {
+ bits <5> Ii;
+ let Inst{12-8} = Ii{4-0};
+ bits <5> Rs32;
+ let Inst{20-16} = Rs32{4-0};
+ bits <5> Rd32;
+ let Inst{4-0} = Rd32{4-0};
+}
+class Enc_11282123 : OpcodeHexagon {
+ bits <6> Ii;
+ let Inst{12-7} = Ii{5-0};
+ bits <8> II;
+ let Inst{13-13} = II{7-7};
+ let Inst{6-0} = II{6-0};
+ bits <5> Rs32;
+ let Inst{20-16} = Rs32{4-0};
+}
+class Enc_518319 : OpcodeHexagon {
+ bits <6> Ii;
+ let Inst{20-16} = Ii{5-1};
+ let Inst{5-5} = Ii{0-0};
+ bits <5> Rt32;
+ let Inst{12-8} = Rt32{4-0};
+ bits <5> Rd32;
+ let Inst{4-0} = Rd32{4-0};
+}
+class Enc_16104442 : OpcodeHexagon {
+ bits <5> Vu32;
+ let Inst{12-8} = Vu32{4-0};
+ bits <5> Rtt32;
+ let Inst{20-16} = Rtt32{4-0};
+ bits <5> Vd32;
+ let Inst{7-3} = Vd32{4-0};
+}
+class Enc_7912540 : OpcodeHexagon {
+ bits <5> Rss32;
+ let Inst{20-16} = Rss32{4-0};
+ bits <5> Rt32;
+ let Inst{12-8} = Rt32{4-0};
+ bits <5> Rxx32;
+ let Inst{4-0} = Rxx32{4-0};
+}
+class Enc_15560488 : OpcodeHexagon {
+ bits <3> Ii;
+ let Inst{10-8} = Ii{2-0};
+ bits <2> Pv4;
+ let Inst{12-11} = Pv4{1-0};
+ bits <5> Vd32;
+ let Inst{4-0} = Vd32{4-0};
+ bits <5> Rx32;
+ let Inst{20-16} = Rx32{4-0};
+}
+class Enc_7581852 : OpcodeHexagon {
+ bits <2> Ii;
+ let Inst{13-13} = Ii{1-1};
+ let Inst{7-7} = Ii{0-0};
+ bits <5> Rs32;
+ let Inst{20-16} = Rs32{4-0};
+ bits <5> Rt32;
+ let Inst{12-8} = Rt32{4-0};
+ bits <5> Rdd32;
+ let Inst{4-0} = Rdd32{4-0};
+}
+class Enc_10030031 : OpcodeHexagon {
+ bits <5> Vu32;
+ let Inst{12-8} = Vu32{4-0};
+ bits <5> Rt32;
+ let Inst{20-16} = Rt32{4-0};
+ bits <5> Vd32;
+ let Inst{7-3} = Vd32{4-0};
+}
+class Enc_3915770 : OpcodeHexagon {
+ bits <4> Ii;
+ let Inst{6-3} = Ii{3-0};
+ bits <1> Mu2;
+ let Inst{13-13} = Mu2{0-0};
+ bits <5> Rt32;
+ let Inst{12-8} = Rt32{4-0};
+ bits <5> Rx32;
+ let Inst{20-16} = Rx32{4-0};
+}
+class Enc_4075554 : OpcodeHexagon {
+ bits <5> Rs32;
+ let Inst{20-16} = Rs32{4-0};
+ bits <5> Rd32;
+ let Inst{4-0} = Rd32{4-0};
+}
+class Enc_11326438 : OpcodeHexagon {
+ bits <6> Ii;
+ let Inst{6-3} = Ii{5-2};
+ bits <1> Mu2;
+ let Inst{13-13} = Mu2{0-0};
+ bits <3> Nt8;
+ let Inst{10-8} = Nt8{2-0};
+ bits <5> Rx32;
+ let Inst{20-16} = Rx32{4-0};
+}
+class Enc_4050532 : OpcodeHexagon {
+ bits <16> Ii;
+ let Inst{26-25} = Ii{15-14};
+ let Inst{20-16} = Ii{13-9};
+ let Inst{13-13} = Ii{8-8};
+ let Inst{7-0} = Ii{7-0};
+ bits <3> Nt8;
+ let Inst{10-8} = Nt8{2-0};
+}
+class Enc_14461004 : OpcodeHexagon {
+ bits <11> Ii;
+ let Inst{26-25} = Ii{10-9};
+ let Inst{13-5} = Ii{8-0};
+ bits <5> Rs32;
+ let Inst{20-16} = Rs32{4-0};
+ bits <5> Rd32;
+ let Inst{4-0} = Rd32{4-0};
+}
+class Enc_13344657 : OpcodeHexagon {
+ bits <6> Ii;
+ let Inst{20-16} = Ii{5-1};
+ let Inst{8-8} = Ii{0-0};
+ bits <2> Pt4;
+ let Inst{10-9} = Pt4{1-0};
+ bits <5> Rd32;
+ let Inst{4-0} = Rd32{4-0};
+}
+class Enc_13114546 : OpcodeHexagon {
+ bits <2> Ii;
+ let Inst{13-13} = Ii{1-1};
+ let Inst{5-5} = Ii{0-0};
+ bits <5> Rss32;
+ let Inst{20-16} = Rss32{4-0};
+ bits <5> Rt32;
+ let Inst{12-8} = Rt32{4-0};
+ bits <5> Rxx32;
+ let Inst{4-0} = Rxx32{4-0};
+}
+class Enc_14530015 : OpcodeHexagon {
+ bits <11> Ii;
+ let Inst{21-20} = Ii{10-9};
+ let Inst{7-1} = Ii{8-2};
+ bits <4> Rs16;
+ let Inst{19-16} = Rs16{3-0};
+ bits <6> n1;
+ let Inst{28-28} = n1{5-5};
+ let Inst{25-23} = n1{4-2};
+ let Inst{13-13} = n1{1-1};
+ let Inst{8-8} = n1{0-0};
+}
+class Enc_5967898 : OpcodeHexagon {
+ bits <6> Ii;
+ let Inst{12-7} = Ii{5-0};
+ bits <6> II;
+ let Inst{13-13} = II{5-5};
+ let Inst{4-0} = II{4-0};
+ bits <2> Pv4;
+ let Inst{6-5} = Pv4{1-0};
+ bits <5> Rs32;
+ let Inst{20-16} = Rs32{4-0};
+}
+class Enc_15450971 : OpcodeHexagon {
+ bits <11> Ii;
+ let Inst{21-20} = Ii{10-9};
+ let Inst{7-1} = Ii{8-2};
+ bits <4> Rs16;
+ let Inst{19-16} = Rs16{3-0};
+ bits <6> n1;
+ let Inst{28-28} = n1{5-5};
+ let Inst{25-22} = n1{4-1};
+ let Inst{13-13} = n1{0-0};
+}
+class Enc_15536400 : OpcodeHexagon {
+ bits <6> Ii;
+ let Inst{3-0} = Ii{5-2};
+ bits <4> Rs16;
+ let Inst{7-4} = Rs16{3-0};
+}
+class Enc_1291652 : OpcodeHexagon {
+ bits <1> Ii;
+ let Inst{8-8} = Ii{0-0};
+}
+class Enc_5636753 : OpcodeHexagon {
+ bits <5> Vu32;
+ let Inst{20-16} = Vu32{4-0};
+}
+class Enc_5757366 : OpcodeHexagon {
+ bits <4> Ii;
+ let Inst{13-13} = Ii{3-3};
+ let Inst{10-8} = Ii{2-0};
+ bits <5> Rt32;
+ let Inst{20-16} = Rt32{4-0};
+ bits <5> Vs32;
+ let Inst{4-0} = Vs32{4-0};
+}
+class Enc_9752128 : OpcodeHexagon {
+ bits <7> Ii;
+ let Inst{8-5} = Ii{6-3};
+ bits <5> Rdd32;
+ let Inst{4-0} = Rdd32{4-0};
+ bits <5> Rx32;
+ let Inst{20-16} = Rx32{4-0};
+}
+class Enc_13618890 : OpcodeHexagon {
+ bits <17> Ii;
+ let Inst{26-25} = Ii{16-15};
+ let Inst{20-16} = Ii{14-10};
+ let Inst{13-13} = Ii{9-9};
+ let Inst{7-0} = Ii{8-1};
+ bits <3> Nt8;
+ let Inst{10-8} = Nt8{2-0};
+}
+class Enc_5890213 : OpcodeHexagon {
+ bits <5> Vuu32;
+ let Inst{12-8} = Vuu32{4-0};
+ bits <5> Rt32;
+ let Inst{20-16} = Rt32{4-0};
+ bits <5> Vx32;
+ let Inst{4-0} = Vx32{4-0};
+}
+class Enc_5582416 : OpcodeHexagon {
+ bits <2> Ii;
+ let Inst{13-13} = Ii{1-1};
+ let Inst{7-7} = Ii{0-0};
+ bits <6> II;
+ let Inst{11-8} = II{5-2};
+ let Inst{6-5} = II{1-0};
+ bits <5> Rt32;
+ let Inst{20-16} = Rt32{4-0};
+ bits <5> Rdd32;
+ let Inst{4-0} = Rdd32{4-0};
+}
+class Enc_13536408 : OpcodeHexagon {
+ bits <4> Ii;
+ let Inst{3-0} = Ii{3-0};
+ bits <4> Rs16;
+ let Inst{7-4} = Rs16{3-0};
+}
+class Enc_9773189 : OpcodeHexagon {
+ bits <5> Rss32;
+ let Inst{20-16} = Rss32{4-0};
+ bits <5> Ru32;
+ let Inst{4-0} = Ru32{4-0};
+ bits <5> Rxx32;
+ let Inst{12-8} = Rxx32{4-0};
+}
+class Enc_2152247 : OpcodeHexagon {
+ bits <4> Ii;
+ let Inst{13-13} = Ii{3-3};
+ let Inst{10-8} = Ii{2-0};
+ bits <5> Rt32;
+ let Inst{20-16} = Rt32{4-0};
+ bits <3> Os8;
+ let Inst{2-0} = Os8{2-0};
+}
+class Enc_12848507 : OpcodeHexagon {
+ bits <2> Ii;
+ let Inst{13-13} = Ii{1-1};
+ let Inst{6-6} = Ii{0-0};
+ bits <6> II;
+ let Inst{5-0} = II{5-0};
+ bits <5> Ru32;
+ let Inst{20-16} = Ru32{4-0};
+ bits <5> Rtt32;
+ let Inst{12-8} = Rtt32{4-0};
+}
+class Enc_16279406 : OpcodeHexagon {
+ bits <4> Ii;
+ let Inst{13-13} = Ii{3-3};
+ let Inst{10-8} = Ii{2-0};
+ bits <2> Qv4;
+ let Inst{12-11} = Qv4{1-0};
+ bits <5> Rt32;
+ let Inst{20-16} = Rt32{4-0};
+ bits <5> Vs32;
+ let Inst{4-0} = Vs32{4-0};
+}
+class Enc_1734121 : OpcodeHexagon {
+ bits <4> Ii;
+ let Inst{10-8} = Ii{3-1};
+ bits <4> Rs16;
+ let Inst{7-4} = Rs16{3-0};
+ bits <4> Rt16;
+ let Inst{3-0} = Rt16{3-0};
+}
+class Enc_766909 : OpcodeHexagon {
+ bits <5> Rtt32;
+ let Inst{12-8} = Rtt32{4-0};
+ bits <5> Rss32;
+ let Inst{20-16} = Rss32{4-0};
+ bits <5> Rdd32;
+ let Inst{4-0} = Rdd32{4-0};
+ bits <2> Pe4;
+ let Inst{6-5} = Pe4{1-0};
+}
+class Enc_4527648 : OpcodeHexagon {
+ bits <5> Rs32;
+ let Inst{20-16} = Rs32{4-0};
+ bits <2> Pd4;
+ let Inst{1-0} = Pd4{1-0};
+}
+class Enc_8849208 : OpcodeHexagon {
+ bits <7> Ii;
+ let Inst{12-7} = Ii{6-1};
+ bits <5> Rs32;
+ let Inst{20-16} = Rs32{4-0};
+ bits <5> Rt32;
+ let Inst{4-0} = Rt32{4-0};
+}
+class Enc_9894557 : OpcodeHexagon {
+ bits <6> Ii;
+ let Inst{13-8} = Ii{5-0};
+ bits <6> II;
+ let Inst{23-21} = II{5-3};
+ let Inst{7-5} = II{2-0};
+ bits <5> Rss32;
+ let Inst{20-16} = Rss32{4-0};
+ bits <5> Rdd32;
+ let Inst{4-0} = Rdd32{4-0};
+}
+class Enc_4109168 : OpcodeHexagon {
+ bits <2> Qv4;
+ let Inst{23-22} = Qv4{1-0};
+}
+class Enc_14560494 : OpcodeHexagon {
+ bits <3> Ii;
+ let Inst{10-8} = Ii{2-0};
+ bits <2> Pv4;
+ let Inst{12-11} = Pv4{1-0};
+ bits <5> Vd32;
+ let Inst{4-0} = Vd32{4-0};
+ bits <5> Rx32;
+ let Inst{20-16} = Rx32{4-0};
+}
+class Enc_9773167 : OpcodeHexagon {
+ bits <7> Ii;
+ let Inst{12-7} = Ii{6-1};
+ bits <5> II;
+ let Inst{4-0} = II{4-0};
+ bits <5> Rs32;
+ let Inst{20-16} = Rs32{4-0};
+}
+class Enc_1898420 : OpcodeHexagon {
+ bits <11> Ii;
+ let Inst{21-20} = Ii{10-9};
+ let Inst{7-1} = Ii{8-2};
+ bits <3> Ns8;
+ let Inst{18-16} = Ns8{2-0};
+}
+class Enc_11498120 : OpcodeHexagon {
+ bits <5> Vu32;
+ let Inst{12-8} = Vu32{4-0};
+ bits <5> Rt32;
+ let Inst{20-16} = Rt32{4-0};
+ bits <2> Qd4;
+ let Inst{1-0} = Qd4{1-0};
+}
+class Enc_15459921 : OpcodeHexagon {
+ bits <3> Ii;
+ let Inst{10-8} = Ii{2-0};
+ bits <2> Pv4;
+ let Inst{12-11} = Pv4{1-0};
+ bits <5> Vs32;
+ let Inst{4-0} = Vs32{4-0};
+ bits <5> Rx32;
+ let Inst{20-16} = Rx32{4-0};
+}
+class Enc_10058269 : OpcodeHexagon {
+ bits <5> Vu32;
+ let Inst{12-8} = Vu32{4-0};
+ bits <5> Rt32;
+ let Inst{20-16} = Rt32{4-0};
+ bits <5> Vx32;
+ let Inst{4-0} = Vx32{4-0};
+}
+class Enc_10197700 : OpcodeHexagon {
+ bits <5> Vuu32;
+ let Inst{20-16} = Vuu32{4-0};
+ bits <5> Vvv32;
+ let Inst{12-8} = Vvv32{4-0};
+ bits <3> Rt8;
+ let Inst{2-0} = Rt8{2-0};
+ bits <5> Vdd32;
+ let Inst{7-3} = Vdd32{4-0};
+}
+class Enc_12608570 : OpcodeHexagon {
+ bits <17> Ii;
+ let Inst{26-25} = Ii{16-15};
+ let Inst{20-16} = Ii{14-10};
+ let Inst{13-5} = Ii{9-1};
+ bits <5> Rd32;
+ let Inst{4-0} = Rd32{4-0};
+}
+class Enc_4804090 : OpcodeHexagon {
+ bits <6> Ss64;
+ let Inst{21-16} = Ss64{5-0};
+ bits <5> Rd32;
+ let Inst{4-0} = Rd32{4-0};
+}
+class Enc_14973146 : OpcodeHexagon {
+ bits <5> Vu32;
+ let Inst{20-16} = Vu32{4-0};
+ bits <5> Vv32;
+ let Inst{12-8} = Vv32{4-0};
+ bits <3> Qd8;
+ let Inst{5-3} = Qd8{2-0};
+}
+class Enc_5718302 : OpcodeHexagon {
+ bits <5> Rs32;
+ let Inst{20-16} = Rs32{4-0};
+ bits <5> Rd32;
+ let Inst{4-0} = Rd32{4-0};
+ bits <2> Pe4;
+ let Inst{6-5} = Pe4{1-0};
+}
+class Enc_2103742 : OpcodeHexagon {
+ bits <5> Ii;
+ let Inst{12-8} = Ii{4-0};
+ bits <5> Rs32;
+ let Inst{20-16} = Rs32{4-0};
+ bits <2> Pd4;
+ let Inst{1-0} = Pd4{1-0};
+}
+class Enc_7564330 : OpcodeHexagon {
+ bits <5> Vu32;
+ let Inst{20-16} = Vu32{4-0};
+ bits <5> Vv32;
+ let Inst{12-8} = Vv32{4-0};
+ bits <3> Rt8;
+ let Inst{2-0} = Rt8{2-0};
+ bits <5> Vd32;
+ let Inst{7-3} = Vd32{4-0};
+}
+class Enc_2176383 : OpcodeHexagon {
+ bits <6> Ii;
+ let Inst{9-4} = Ii{5-0};
+ bits <4> Rd16;
+ let Inst{3-0} = Rd16{3-0};
+}
+class Enc_7736768 : OpcodeHexagon {
+ bits <12> Ii;
+ let Inst{26-25} = Ii{11-10};
+ let Inst{13-13} = Ii{9-9};
+ let Inst{7-0} = Ii{8-1};
+ bits <5> Rs32;
+ let Inst{20-16} = Rs32{4-0};
+ bits <5> Rt32;
+ let Inst{12-8} = Rt32{4-0};
+}
+class Enc_13189194 : OpcodeHexagon {
+ bits <1> Ii;
+ let Inst{5-5} = Ii{0-0};
+ bits <5> Vuu32;
+ let Inst{12-8} = Vuu32{4-0};
+ bits <5> Rt32;
+ let Inst{20-16} = Rt32{4-0};
+ bits <5> Vxx32;
+ let Inst{4-0} = Vxx32{4-0};
+}
+class Enc_5154851 : OpcodeHexagon {
+ bits <5> Rtt32;
+ let Inst{20-16} = Rtt32{4-0};
+ bits <5> Vdd32;
+ let Inst{7-3} = Vdd32{4-0};
+}
+class Enc_1329520 : OpcodeHexagon {
+ bits <5> Rss32;
+ let Inst{20-16} = Rss32{4-0};
+ bits <5> Cdd32;
+ let Inst{4-0} = Cdd32{4-0};
+}
+class Enc_14057553 : OpcodeHexagon {
+ bits <16> Ii;
+ let Inst{21-21} = Ii{15-15};
+ let Inst{13-8} = Ii{14-9};
+ let Inst{2-0} = Ii{8-6};
+ bits <5> Vd32;
+ let Inst{7-3} = Vd32{4-0};
+ bits <5> Rx32;
+ let Inst{20-16} = Rx32{4-0};
+}
+class Enc_9223889 : OpcodeHexagon {
+ bits <5> Rs32;
+ let Inst{20-16} = Rs32{4-0};
+ bits <5> Rt32;
+ let Inst{12-8} = Rt32{4-0};
+ bits <5> Rx32;
+ let Inst{4-0} = Rx32{4-0};
+}
+class Enc_10979813 : OpcodeHexagon {
+ bits <7> Ii;
+ let Inst{13-13} = Ii{6-6};
+ let Inst{7-3} = Ii{5-1};
+ bits <2> Pv4;
+ let Inst{1-0} = Pv4{1-0};
+ bits <5> Rs32;
+ let Inst{20-16} = Rs32{4-0};
+ bits <5> Rt32;
+ let Inst{12-8} = Rt32{4-0};
+}
+class Enc_13490067 : OpcodeHexagon {
+ bits <3> Qt8;
+ let Inst{2-0} = Qt8{2-0};
+ bits <5> Vu32;
+ let Inst{20-16} = Vu32{4-0};
+ bits <5> Vv32;
+ let Inst{12-8} = Vv32{4-0};
+ bits <5> Vd32;
+ let Inst{7-3} = Vd32{4-0};
+}
+class Enc_10076500 : OpcodeHexagon {
+ bits <2> Ii;
+ let Inst{13-13} = Ii{1-1};
+ let Inst{6-6} = Ii{0-0};
+ bits <6> II;
+ let Inst{5-0} = II{5-0};
+ bits <5> Ru32;
+ let Inst{20-16} = Ru32{4-0};
+ bits <3> Nt8;
+ let Inst{10-8} = Nt8{2-0};
+}
+class Enc_163381 : OpcodeHexagon {
+ bits <14> Ii;
+ let Inst{26-25} = Ii{13-12};
+ let Inst{13-5} = Ii{11-3};
+ bits <5> Rs32;
+ let Inst{20-16} = Rs32{4-0};
+ bits <5> Rdd32;
+ let Inst{4-0} = Rdd32{4-0};
+}
+class Enc_10328975 : OpcodeHexagon {
+ bits <2> Pt4;
+ let Inst{9-8} = Pt4{1-0};
+ bits <5> Rdd32;
+ let Inst{4-0} = Rdd32{4-0};
+}
+class Enc_14939491 : OpcodeHexagon {
+ bits <4> Rs16;
+ let Inst{7-4} = Rs16{3-0};
+ bits <4> Rd16;
+ let Inst{3-0} = Rd16{3-0};
+}
+class Enc_8891794 : OpcodeHexagon {
+ bits <2> Pt4;
+ let Inst{9-8} = Pt4{1-0};
+ bits <2> Ps4;
+ let Inst{17-16} = Ps4{1-0};
+ bits <2> Pd4;
+ let Inst{1-0} = Pd4{1-0};
+}
+class Enc_7723767 : OpcodeHexagon {
+ bits <5> Vuu32;
+ let Inst{12-8} = Vuu32{4-0};
+ bits <5> Rt32;
+ let Inst{20-16} = Rt32{4-0};
+ bits <5> Vd32;
+ let Inst{7-3} = Vd32{4-0};
+}
+class Enc_2639299 : OpcodeHexagon {
+ bits <11> Ii;
+ let Inst{21-20} = Ii{10-9};
+ let Inst{7-1} = Ii{8-2};
+ bits <4> Rs16;
+ let Inst{19-16} = Rs16{3-0};
+ bits <4> Rd16;
+ let Inst{11-8} = Rd16{3-0};
+}
+class Enc_11552785 : OpcodeHexagon {
+ bits <5> Rtt32;
+ let Inst{12-8} = Rtt32{4-0};
+ bits <5> Rss32;
+ let Inst{20-16} = Rss32{4-0};
+ bits <2> Pu4;
+ let Inst{6-5} = Pu4{1-0};
+ bits <5> Rdd32;
+ let Inst{4-0} = Rdd32{4-0};
+}
+class Enc_11849200 : OpcodeHexagon {
+ bits <6> Ii;
+ let Inst{12-7} = Ii{5-0};
+ bits <5> Rs32;
+ let Inst{20-16} = Rs32{4-0};
+ bits <5> Rt32;
+ let Inst{4-0} = Rt32{4-0};
+}
+class Enc_14868535 : OpcodeHexagon {
+ bits <17> Ii;
+ let Inst{23-22} = Ii{16-15};
+ let Inst{20-16} = Ii{14-10};
+ let Inst{13-13} = Ii{9-9};
+ let Inst{7-1} = Ii{8-2};
+ bits <2> Pu4;
+ let Inst{9-8} = Pu4{1-0};
+}
+class Enc_48594 : OpcodeHexagon {
+ bits <1> Mu2;
+ let Inst{13-13} = Mu2{0-0};
+ bits <5> Rd32;
+ let Inst{4-0} = Rd32{4-0};
+ bits <5> Rx32;
+ let Inst{20-16} = Rx32{4-0};
+}
+class Enc_6608821 : OpcodeHexagon {
+ bits <4> Ii;
+ let Inst{13-13} = Ii{3-3};
+ let Inst{10-8} = Ii{2-0};
+ bits <5> Rt32;
+ let Inst{20-16} = Rt32{4-0};
+ bits <3> Os8;
+ let Inst{2-0} = Os8{2-0};
+}
+class Enc_11049656 : OpcodeHexagon {
+ bits <9> Ii;
+ let Inst{13-13} = Ii{8-8};
+ let Inst{7-3} = Ii{7-3};
+ bits <2> Pv4;
+ let Inst{1-0} = Pv4{1-0};
+ bits <5> Rs32;
+ let Inst{20-16} = Rs32{4-0};
+ bits <5> Rtt32;
+ let Inst{12-8} = Rtt32{4-0};
+}
+class Enc_117962 : OpcodeHexagon {
+ bits <8> Ii;
+ let Inst{23-21} = Ii{7-5};
+ let Inst{13-13} = Ii{4-4};
+ let Inst{7-5} = Ii{3-1};
+ let Inst{3-3} = Ii{0-0};
+ bits <5> II;
+ let Inst{12-8} = II{4-0};
+ bits <5> Rx32;
+ let Inst{20-16} = Rx32{4-0};
+}
+class Enc_5900401 : OpcodeHexagon {
+ bits <4> Ii;
+ let Inst{6-3} = Ii{3-0};
+ bits <3> Nt8;
+ let Inst{10-8} = Nt8{2-0};
+ bits <5> Rx32;
+ let Inst{20-16} = Rx32{4-0};
+}
+class Enc_36641 : OpcodeHexagon {
+ bits <5> Vuu32;
+ let Inst{12-8} = Vuu32{4-0};
+ bits <5> Rt32;
+ let Inst{20-16} = Rt32{4-0};
+ bits <5> Vd32;
+ let Inst{4-0} = Vd32{4-0};
+}
+class Enc_9626139 : OpcodeHexagon {
+ bits <2> Pu4;
+ let Inst{6-5} = Pu4{1-0};
+ bits <5> Rs32;
+ let Inst{20-16} = Rs32{4-0};
+ bits <5> Rt32;
+ let Inst{12-8} = Rt32{4-0};
+ bits <5> Rd32;
+ let Inst{4-0} = Rd32{4-0};
+}
+class Enc_11971407 : OpcodeHexagon {
+ bits <3> Ii;
+ let Inst{7-5} = Ii{2-0};
+ bits <5> Rtt32;
+ let Inst{12-8} = Rtt32{4-0};
+ bits <5> Rss32;
+ let Inst{20-16} = Rss32{4-0};
+ bits <5> Rdd32;
+ let Inst{4-0} = Rdd32{4-0};
+}
+class Enc_9852473 : OpcodeHexagon {
+ bits <13> Ii;
+ let Inst{26-25} = Ii{12-11};
+ let Inst{13-5} = Ii{10-2};
+ bits <5> Rs32;
+ let Inst{20-16} = Rs32{4-0};
+ bits <5> Rdd32;
+ let Inst{4-0} = Rdd32{4-0};
+}
+class Enc_6495334 : OpcodeHexagon {
+ bits <6> Ii;
+ let Inst{22-21} = Ii{5-4};
+ let Inst{13-13} = Ii{3-3};
+ let Inst{7-5} = Ii{2-0};
+ bits <5> Rs32;
+ let Inst{20-16} = Rs32{4-0};
+ bits <5> Ru32;
+ let Inst{4-0} = Ru32{4-0};
+ bits <5> Rd32;
+ let Inst{12-8} = Rd32{4-0};
+}
+class Enc_1186018 : OpcodeHexagon {
+ bits <17> Ii;
+ let Inst{26-25} = Ii{16-15};
+ let Inst{20-16} = Ii{14-10};
+ let Inst{13-13} = Ii{9-9};
+ let Inst{7-0} = Ii{8-1};
+ bits <5> Rt32;
+ let Inst{12-8} = Rt32{4-0};
+}
+class Enc_15999208 : OpcodeHexagon {
+ bits <18> Ii;
+ let Inst{26-25} = Ii{17-16};
+ let Inst{20-16} = Ii{15-11};
+ let Inst{13-13} = Ii{10-10};
+ let Inst{7-0} = Ii{9-2};
+ bits <5> Rt32;
+ let Inst{12-8} = Rt32{4-0};
+}
+class Enc_11477246 : OpcodeHexagon {
+ bits <6> II;
+ let Inst{5-0} = II{5-0};
+ bits <5> Rt32;
+ let Inst{12-8} = Rt32{4-0};
+ bits <5> Re32;
+ let Inst{20-16} = Re32{4-0};
+}
+class Enc_7971062 : OpcodeHexagon {
+ bits <16> Ii;
+ let Inst{23-22} = Ii{15-14};
+ let Inst{20-16} = Ii{13-9};
+ let Inst{13-5} = Ii{8-0};
+ bits <5> Rd32;
+ let Inst{4-0} = Rd32{4-0};
+}
+class Enc_4327792 : OpcodeHexagon {
+ bits <5> Vuu32;
+ let Inst{12-8} = Vuu32{4-0};
+ bits <5> Rt32;
+ let Inst{20-16} = Rt32{4-0};
+ bits <5> Vxx32;
+ let Inst{4-0} = Vxx32{4-0};
+}
+class Enc_10326434 : OpcodeHexagon {
+ bits <5> Ii;
+ let Inst{6-3} = Ii{4-1};
+ bits <1> Mu2;
+ let Inst{13-13} = Mu2{0-0};
+ bits <3> Nt8;
+ let Inst{10-8} = Nt8{2-0};
+ bits <5> Rx32;
+ let Inst{20-16} = Rx32{4-0};
+}
+class Enc_1572239 : OpcodeHexagon {
+ bits <2> Qt4;
+ let Inst{6-5} = Qt4{1-0};
+ bits <5> Vu32;
+ let Inst{12-8} = Vu32{4-0};
+ bits <5> Vv32;
+ let Inst{20-16} = Vv32{4-0};
+ bits <5> Vd32;
+ let Inst{4-0} = Vd32{4-0};
+}
+class Enc_6372758 : OpcodeHexagon {
+ bits <4> Ii;
+ let Inst{8-5} = Ii{3-0};
+ bits <5> Ryy32;
+ let Inst{4-0} = Ryy32{4-0};
+ bits <5> Rx32;
+ let Inst{20-16} = Rx32{4-0};
+}
+class Enc_15793331 : OpcodeHexagon {
+ bits <5> Vu32;
+ let Inst{20-16} = Vu32{4-0};
+ bits <5> Vv32;
+ let Inst{12-8} = Vv32{4-0};
+ bits <5> Vx32;
+ let Inst{7-3} = Vx32{4-0};
+}
+class Enc_11424254 : OpcodeHexagon {
+ bits <2> Qt4;
+ let Inst{6-5} = Qt4{1-0};
+ bits <5> Vu32;
+ let Inst{12-8} = Vu32{4-0};
+ bits <5> Vv32;
+ let Inst{20-16} = Vv32{4-0};
+ bits <5> Vdd32;
+ let Inst{4-0} = Vdd32{4-0};
+}
+class Enc_4983213 : OpcodeHexagon {
+ bits <14> Ii;
+ let Inst{10-0} = Ii{13-3};
+ bits <5> Rs32;
+ let Inst{20-16} = Rs32{4-0};
+}
+class Enc_16035138 : OpcodeHexagon {
+ bits <5> Vu32;
+ let Inst{12-8} = Vu32{4-0};
+ bits <5> Rt32;
+ let Inst{20-16} = Rt32{4-0};
+}
+class Enc_8225953 : OpcodeHexagon {
+ bits <8> Ii;
+ let Inst{13-13} = Ii{7-7};
+ let Inst{7-3} = Ii{6-2};
+ bits <2> Pv4;
+ let Inst{1-0} = Pv4{1-0};
+ bits <5> Rs32;
+ let Inst{20-16} = Rs32{4-0};
+ bits <5> Rt32;
+ let Inst{12-8} = Rt32{4-0};
+}
+class Enc_4397470 : OpcodeHexagon {
+ bits <5> II;
+ let Inst{12-8} = II{4-0};
+ bits <11> Ii;
+ let Inst{21-20} = Ii{10-9};
+ let Inst{7-1} = Ii{8-2};
+ bits <3> Ns8;
+ let Inst{18-16} = Ns8{2-0};
+}
+class Enc_1004392 : OpcodeHexagon {
+ bits <5> Vu32;
+ let Inst{20-16} = Vu32{4-0};
+ bits <5> Vv32;
+ let Inst{12-8} = Vv32{4-0};
+ bits <5> Vxx32;
+ let Inst{7-3} = Vxx32{4-0};
+}
+class Enc_16319737 : OpcodeHexagon {
+ bits <14> Ii;
+ let Inst{26-25} = Ii{13-12};
+ let Inst{13-13} = Ii{11-11};
+ let Inst{7-0} = Ii{10-3};
+ bits <5> Rs32;
+ let Inst{20-16} = Rs32{4-0};
+ bits <5> Rtt32;
+ let Inst{12-8} = Rtt32{4-0};
+}
+class Enc_2296022 : OpcodeHexagon {
+ bits <3> Ii;
+ let Inst{10-8} = Ii{2-0};
+ bits <5> Vs32;
+ let Inst{4-0} = Vs32{4-0};
+ bits <5> Rx32;
+ let Inst{20-16} = Rx32{4-0};
+}
+class Enc_9664427 : OpcodeHexagon {
+ bits <5> Vuu32;
+ let Inst{20-16} = Vuu32{4-0};
+ bits <5> Vvv32;
+ let Inst{12-8} = Vvv32{4-0};
+ bits <3> Qss8;
+ let Inst{2-0} = Qss8{2-0};
+ bits <5> Vd32;
+ let Inst{7-3} = Vd32{4-0};
+}
+class Enc_877823 : OpcodeHexagon {
+ bits <6> II;
+ let Inst{11-8} = II{5-2};
+ let Inst{6-5} = II{1-0};
+ bits <5> Rdd32;
+ let Inst{4-0} = Rdd32{4-0};
+ bits <5> Re32;
+ let Inst{20-16} = Re32{4-0};
+}
+class Enc_1589406 : OpcodeHexagon {
+ bits <1> Mu2;
+ let Inst{13-13} = Mu2{0-0};
+ bits <3> Os8;
+ let Inst{2-0} = Os8{2-0};
+ bits <5> Rx32;
+ let Inst{20-16} = Rx32{4-0};
+}
+class Enc_6900405 : OpcodeHexagon {
+ bits <5> Ii;
+ let Inst{6-3} = Ii{4-1};
+ bits <3> Nt8;
+ let Inst{10-8} = Nt8{2-0};
+ bits <5> Rx32;
+ let Inst{20-16} = Rx32{4-0};
+}
+class Enc_14150875 : OpcodeHexagon {
+ bits <11> Ii;
+ let Inst{21-20} = Ii{10-9};
+ let Inst{7-1} = Ii{8-2};
+ bits <4> Rs16;
+ let Inst{19-16} = Rs16{3-0};
+ bits <5> n1;
+ let Inst{28-28} = n1{4-4};
+ let Inst{25-22} = n1{3-0};
+}
+class Enc_15707793 : OpcodeHexagon {
+ bits <5> Rs32;
+ let Inst{20-16} = Rs32{4-0};
+ bits <5> Gd32;
+ let Inst{4-0} = Gd32{4-0};
+}
+class Enc_14689096 : OpcodeHexagon {
+ bits <2> Ii;
+ let Inst{13-13} = Ii{1-1};
+ let Inst{6-6} = Ii{0-0};
+ bits <6> II;
+ let Inst{5-0} = II{5-0};
+ bits <5> Ru32;
+ let Inst{20-16} = Ru32{4-0};
+ bits <5> Rt32;
+ let Inst{12-8} = Rt32{4-0};
+}
+class Enc_9915754 : OpcodeHexagon {
+ bits <6> Ii;
+ let Inst{6-3} = Ii{5-2};
+ bits <1> Mu2;
+ let Inst{13-13} = Mu2{0-0};
+ bits <5> Rt32;
+ let Inst{12-8} = Rt32{4-0};
+ bits <5> Rx32;
+ let Inst{20-16} = Rx32{4-0};
+}
+class Enc_7470998 : OpcodeHexagon {
+ bits <5> Vu32;
+ let Inst{12-8} = Vu32{4-0};
+ bits <5> Vv32;
+ let Inst{20-16} = Vv32{4-0};
+ bits <2> Qx4;
+ let Inst{1-0} = Qx4{1-0};
+}
+class Enc_11471622 : OpcodeHexagon {
+ bits <5> Vu32;
+ let Inst{12-8} = Vu32{4-0};
+ bits <5> Rt32;
+ let Inst{20-16} = Rt32{4-0};
+ bits <5> Vdd32;
+ let Inst{4-0} = Vdd32{4-0};
+}
+class Enc_14363183 : OpcodeHexagon {
+ bits <2> Qv4;
+ let Inst{23-22} = Qv4{1-0};
+ bits <5> Vd32;
+ let Inst{4-0} = Vd32{4-0};
+}
+class Enc_15816255 : OpcodeHexagon {
+ bits <1> Mu2;
+ let Inst{13-13} = Mu2{0-0};
+ bits <5> Rtt32;
+ let Inst{12-8} = Rtt32{4-0};
+ bits <5> Rx32;
+ let Inst{20-16} = Rx32{4-0};
+}
+class Enc_5321335 : OpcodeHexagon {
+ bits <5> Vu32;
+ let Inst{20-16} = Vu32{4-0};
+ bits <5> Vv32;
+ let Inst{12-8} = Vv32{4-0};
+ bits <3> Rt8;
+ let Inst{2-0} = Rt8{2-0};
+ bits <4> Vdd16;
+ let Inst{7-4} = Vdd16{3-0};
+}
+class Enc_12702821 : OpcodeHexagon {
+ bits <5> Rss32;
+ let Inst{20-16} = Rss32{4-0};
+ bits <5> Rtt32;
+ let Inst{12-8} = Rtt32{4-0};
+ bits <5> Rxx32;
+ let Inst{4-0} = Rxx32{4-0};
+}
+class Enc_449439 : OpcodeHexagon {
+ bits <11> Ii;
+ let Inst{26-25} = Ii{10-9};
+ let Inst{13-5} = Ii{8-0};
+ bits <5> Rs32;
+ let Inst{20-16} = Rs32{4-0};
+ bits <5> Ryy32;
+ let Inst{4-0} = Ryy32{4-0};
+}
+class Enc_2054304 : OpcodeHexagon {
+ bits <5> Rs32;
+ let Inst{20-16} = Rs32{4-0};
+ bits <6> Sd64;
+ let Inst{5-0} = Sd64{5-0};
+}
+class Enc_236434 : OpcodeHexagon {
+ bits <6> Ii;
+ let Inst{22-21} = Ii{5-4};
+ let Inst{13-13} = Ii{3-3};
+ let Inst{7-5} = Ii{2-0};
+ bits <5> Ru32;
+ let Inst{4-0} = Ru32{4-0};
+ bits <5> Rs32;
+ let Inst{20-16} = Rs32{4-0};
+ bits <5> Rd32;
+ let Inst{12-8} = Rd32{4-0};
+}
+class Enc_5598813 : OpcodeHexagon {
+ bits <4> Ii;
+ let Inst{8-5} = Ii{3-0};
+ bits <5> Rd32;
+ let Inst{4-0} = Rd32{4-0};
+ bits <5> Rx32;
+ let Inst{20-16} = Rx32{4-0};
+}
+class Enc_8409782 : OpcodeHexagon {
+ bits <13> Ii;
+ let Inst{26-25} = Ii{12-11};
+ let Inst{13-13} = Ii{10-10};
+ let Inst{7-0} = Ii{9-2};
+ bits <5> Rs32;
+ let Inst{20-16} = Rs32{4-0};
+ bits <3> Nt8;
+ let Inst{10-8} = Nt8{2-0};
+}
+class Enc_15182416 : OpcodeHexagon {
+ bits <6> Ii;
+ let Inst{20-16} = Ii{5-1};
+ let Inst{8-8} = Ii{0-0};
+ bits <2> Pt4;
+ let Inst{10-9} = Pt4{1-0};
+ bits <5> Rdd32;
+ let Inst{4-0} = Rdd32{4-0};
+}
+class Enc_4501395 : OpcodeHexagon {
+ bits <7> Ii;
+ let Inst{6-3} = Ii{6-3};
+ bits <1> Mu2;
+ let Inst{13-13} = Mu2{0-0};
+ bits <5> Rtt32;
+ let Inst{12-8} = Rtt32{4-0};
+ bits <5> Rx32;
+ let Inst{20-16} = Rx32{4-0};
+}
+class Enc_6039436 : OpcodeHexagon {
+ bits <3> Qtt8;
+ let Inst{2-0} = Qtt8{2-0};
+ bits <5> Vuu32;
+ let Inst{20-16} = Vuu32{4-0};
+ bits <5> Vvv32;
+ let Inst{12-8} = Vvv32{4-0};
+ bits <5> Vdd32;
+ let Inst{7-3} = Vdd32{4-0};
+}
+class Enc_476163 : OpcodeHexagon {
+ bits <5> Vu32;
+ let Inst{20-16} = Vu32{4-0};
+ bits <3> Rt8;
+ let Inst{2-0} = Rt8{2-0};
+ bits <5> Vd32;
+ let Inst{7-3} = Vd32{4-0};
+ bits <5> Vy32;
+ let Inst{12-8} = Vy32{4-0};
+}
+class Enc_11281763 : OpcodeHexagon {
+ bits <1> Mu2;
+ let Inst{13-13} = Mu2{0-0};
+ bits <5> Vs32;
+ let Inst{4-0} = Vs32{4-0};
+ bits <5> Rx32;
+ let Inst{20-16} = Rx32{4-0};
+}
+class Enc_9929262 : OpcodeHexagon {
+ bits <16> Ii;
+ let Inst{21-21} = Ii{15-15};
+ let Inst{13-8} = Ii{14-9};
+ let Inst{2-0} = Ii{8-6};
+ bits <5> Rt32;
+ let Inst{20-16} = Rt32{4-0};
+ bits <5> Vs32;
+ let Inst{7-3} = Vs32{4-0};
+}
+class Enc_13174858 : OpcodeHexagon {
+ bits <16> Ii;
+ let Inst{21-21} = Ii{15-15};
+ let Inst{13-8} = Ii{14-9};
+ let Inst{2-0} = Ii{8-6};
+ bits <5> Vs32;
+ let Inst{7-3} = Vs32{4-0};
+ bits <5> Rx32;
+ let Inst{20-16} = Rx32{4-0};
+}
+class Enc_8437395 : OpcodeHexagon {
+ bits <4> Ii;
+ let Inst{13-13} = Ii{3-3};
+ let Inst{10-8} = Ii{2-0};
+ bits <5> Rt32;
+ let Inst{20-16} = Rt32{4-0};
+ bits <5> Vd32;
+ let Inst{4-0} = Vd32{4-0};
+}
+class Enc_16578332 : OpcodeHexagon {
+ bits <9> Ii;
+ let Inst{10-8} = Ii{8-6};
+ bits <5> Zdd8;
+ let Inst{4-0} = Zdd8{4-0};
+ bits <5> Rx32;
+ let Inst{20-16} = Rx32{4-0};
+}
+class Enc_12829314 : OpcodeHexagon {
+ bits <11> Ii;
+ let Inst{21-20} = Ii{10-9};
+ let Inst{7-1} = Ii{8-2};
+ bits <4> Rs16;
+ let Inst{19-16} = Rs16{3-0};
+}
+class Enc_9744403 : OpcodeHexagon {
+ bits <5> Vu32;
+ let Inst{13-9} = Vu32{4-0};
+ bits <5> Vv32;
+ let Inst{8-4} = Vv32{4-0};
+ bits <4> Vdd16;
+ let Inst{3-0} = Vdd16{3-0};
+ bits <5> Rx32;
+ let Inst{20-16} = Rx32{4-0};
+}
+class Enc_10968391 : OpcodeHexagon {
+ bits <11> Ii;
+ let Inst{21-20} = Ii{10-9};
+ let Inst{7-1} = Ii{8-2};
+ bits <4> Rs16;
+ let Inst{19-16} = Rs16{3-0};
+ bits <7> n1;
+ let Inst{28-28} = n1{6-6};
+ let Inst{25-22} = n1{5-2};
+ let Inst{13-13} = n1{1-1};
+ let Inst{8-8} = n1{0-0};
+}
+class Enc_64199 : OpcodeHexagon {
+ bits <7> Ii;
+ let Inst{8-4} = Ii{6-2};
+ bits <4> Rd16;
+ let Inst{3-0} = Rd16{3-0};
+}
+class Enc_11039423 : OpcodeHexagon {
+ bits <3> Ii;
+ let Inst{10-8} = Ii{2-0};
+ bits <5> Vd32;
+ let Inst{4-0} = Vd32{4-0};
+ bits <5> Rx32;
+ let Inst{20-16} = Rx32{4-0};
+}
+class Enc_6730375 : OpcodeHexagon {
+ bits <11> Ii;
+ let Inst{21-20} = Ii{10-9};
+ let Inst{7-1} = Ii{8-2};
+ bits <5> Rt32;
+ let Inst{12-8} = Rt32{4-0};
+ bits <3> Ns8;
+ let Inst{18-16} = Ns8{2-0};
+}
+class Enc_16213761 : OpcodeHexagon {
+ bits <5> Vu32;
+ let Inst{12-8} = Vu32{4-0};
+ bits <5> Vv32;
+ let Inst{23-19} = Vv32{4-0};
+ bits <3> Rt8;
+ let Inst{18-16} = Rt8{2-0};
+ bits <5> Vxx32;
+ let Inst{4-0} = Vxx32{4-0};
+}
+class Enc_13204995 : OpcodeHexagon {
+ bits <4> Ii;
+ let Inst{11-8} = Ii{3-0};
+ bits <4> Rs16;
+ let Inst{7-4} = Rs16{3-0};
+ bits <4> Rt16;
+ let Inst{3-0} = Rt16{3-0};
+}
+class Enc_13338314 : OpcodeHexagon {
+ bits <4> Ii;
+ let Inst{13-13} = Ii{3-3};
+ let Inst{10-8} = Ii{2-0};
+ bits <2> Pv4;
+ let Inst{12-11} = Pv4{1-0};
+ bits <5> Rt32;
+ let Inst{20-16} = Rt32{4-0};
+ bits <5> Vd32;
+ let Inst{4-0} = Vd32{4-0};
+}
+class Enc_9920336 : OpcodeHexagon {
+ bits <2> Ii;
+ let Inst{13-13} = Ii{1-1};
+ let Inst{7-7} = Ii{0-0};
+ bits <2> Pv4;
+ let Inst{6-5} = Pv4{1-0};
+ bits <5> Rs32;
+ let Inst{20-16} = Rs32{4-0};
+ bits <5> Ru32;
+ let Inst{12-8} = Ru32{4-0};
+ bits <5> Rtt32;
+ let Inst{4-0} = Rtt32{4-0};
+}
+class Enc_15380240 : OpcodeHexagon {
+ bits <5> Vu32;
+ let Inst{20-16} = Vu32{4-0};
+ bits <3> Rt8;
+ let Inst{2-0} = Rt8{2-0};
+ bits <5> Vdd32;
+ let Inst{7-3} = Vdd32{4-0};
+ bits <5> Vy32;
+ let Inst{12-8} = Vy32{4-0};
+}
+class Enc_3296020 : OpcodeHexagon {
+ bits <3> Ii;
+ let Inst{10-8} = Ii{2-0};
+ bits <5> Vs32;
+ let Inst{4-0} = Vs32{4-0};
+ bits <5> Rx32;
+ let Inst{20-16} = Rx32{4-0};
+}
+class Enc_2428539 : OpcodeHexagon {
+ bits <11> Ii;
+ let Inst{21-20} = Ii{10-9};
+ let Inst{7-1} = Ii{8-2};
+ bits <4> Rs16;
+ let Inst{19-16} = Rs16{3-0};
+ bits <4> n1;
+ let Inst{28-28} = n1{3-3};
+ let Inst{24-23} = n1{2-1};
+ let Inst{8-8} = n1{0-0};
+}
+class Enc_10039393 : OpcodeHexagon {
+ bits <3> Ii;
+ let Inst{10-8} = Ii{2-0};
+ bits <5> Vd32;
+ let Inst{4-0} = Vd32{4-0};
+ bits <5> Rx32;
+ let Inst{20-16} = Rx32{4-0};
+}
+class Enc_9372046 : OpcodeHexagon {
+ bits <4> Ii;
+ let Inst{13-13} = Ii{3-3};
+ let Inst{10-8} = Ii{2-0};
+ bits <2> Pv4;
+ let Inst{12-11} = Pv4{1-0};
+ bits <5> Rt32;
+ let Inst{20-16} = Rt32{4-0};
+ bits <3> Os8;
+ let Inst{2-0} = Os8{2-0};
+}
+class Enc_2901241 : OpcodeHexagon {
+ bits <1> Mu2;
+ let Inst{13-13} = Mu2{0-0};
+ bits <5> Rdd32;
+ let Inst{4-0} = Rdd32{4-0};
+ bits <5> Rx32;
+ let Inst{20-16} = Rx32{4-0};
+}
+class Enc_16145290 : OpcodeHexagon {
+ bits <2> Ps4;
+ let Inst{6-5} = Ps4{1-0};
+ bits <5> Vu32;
+ let Inst{12-8} = Vu32{4-0};
+ bits <5> Vv32;
+ let Inst{20-16} = Vv32{4-0};
+ bits <5> Vdd32;
+ let Inst{4-0} = Vdd32{4-0};
+}
+class Enc_13783220 : OpcodeHexagon {
+ bits <5> Vu32;
+ let Inst{12-8} = Vu32{4-0};
+ bits <5> Rtt32;
+ let Inst{20-16} = Rtt32{4-0};
+ bits <5> Vd32;
+ let Inst{4-0} = Vd32{4-0};
+}
+class Enc_12261611 : OpcodeHexagon {
+ bits <1> Mu2;
+ let Inst{13-13} = Mu2{0-0};
+ bits <5> Ryy32;
+ let Inst{4-0} = Ryy32{4-0};
+ bits <5> Rx32;
+ let Inst{20-16} = Rx32{4-0};
+}
+class Enc_6135183 : OpcodeHexagon {
+ bits <4> Rs16;
+ let Inst{7-4} = Rs16{3-0};
+ bits <4> Rx16;
+ let Inst{3-0} = Rx16{3-0};
+}
+class Enc_5523416 : OpcodeHexagon {
+ bits <6> Ii;
+ let Inst{13-8} = Ii{5-0};
+ bits <5> Rs32;
+ let Inst{20-16} = Rs32{4-0};
+ bits <5> Rd32;
+ let Inst{4-0} = Rd32{4-0};
+}
+class Enc_13472494 : OpcodeHexagon {
+ bits <10> Ii;
+ let Inst{21-21} = Ii{9-9};
+ let Inst{13-5} = Ii{8-0};
+ bits <5> Rs32;
+ let Inst{20-16} = Rs32{4-0};
+ bits <5> Rd32;
+ let Inst{4-0} = Rd32{4-0};
+}
+class Enc_16303398 : OpcodeHexagon {
+ bits <4> Ii;
+ let Inst{8-5} = Ii{3-0};
+ bits <1> Mu2;
+ let Inst{13-13} = Mu2{0-0};
+ bits <5> Rd32;
+ let Inst{4-0} = Rd32{4-0};
+ bits <5> Rx32;
+ let Inst{20-16} = Rx32{4-0};
+}
+class Enc_3494181 : OpcodeHexagon {
+ bits <3> Ii;
+ let Inst{7-5} = Ii{2-0};
+ bits <5> Rt32;
+ let Inst{12-8} = Rt32{4-0};
+ bits <5> Rs32;
+ let Inst{20-16} = Rs32{4-0};
+ bits <5> Rd32;
+ let Inst{4-0} = Rd32{4-0};
+}
+class Enc_13983714 : OpcodeHexagon {
+ bits <5> Vu32;
+ let Inst{12-8} = Vu32{4-0};
+ bits <5> Vv32;
+ let Inst{20-16} = Vv32{4-0};
+ bits <2> Qd4;
+ let Inst{1-0} = Qd4{1-0};
+}
+class Enc_931653 : OpcodeHexagon {
+ bits <7> Ii;
+ let Inst{8-5} = Ii{6-3};
+ bits <1> Mu2;
+ let Inst{13-13} = Mu2{0-0};
+ bits <5> Rdd32;
+ let Inst{4-0} = Rdd32{4-0};
+ bits <5> Rx32;
+ let Inst{20-16} = Rx32{4-0};
+}
+class Enc_7622936 : OpcodeHexagon {
+ bits <5> Vu32;
+ let Inst{20-16} = Vu32{4-0};
+ bits <3> Rt8;
+ let Inst{2-0} = Rt8{2-0};
+ bits <5> Vxx32;
+ let Inst{7-3} = Vxx32{4-0};
+ bits <5> Vy32;
+ let Inst{12-8} = Vy32{4-0};
+}
+class Enc_8773155 : OpcodeHexagon {
+ bits <8> Ii;
+ let Inst{12-7} = Ii{7-2};
+ bits <5> II;
+ let Inst{4-0} = II{4-0};
+ bits <5> Rs32;
+ let Inst{20-16} = Rs32{4-0};
+}
+class Enc_5401217 : OpcodeHexagon {
+ bits <11> Ii;
+ let Inst{21-20} = Ii{10-9};
+ let Inst{7-1} = Ii{8-2};
+ bits <4> Rs16;
+ let Inst{19-16} = Rs16{3-0};
+ bits <3> n1;
+ let Inst{28-28} = n1{2-2};
+ let Inst{24-23} = n1{1-0};
+}
+class Enc_6736678 : OpcodeHexagon {
+ bits <8> Ii;
+ let Inst{12-5} = Ii{7-0};
+ bits <5> Rs32;
+ let Inst{20-16} = Rs32{4-0};
+ bits <2> Pd4;
+ let Inst{1-0} = Pd4{1-0};
+}
+class Enc_3457570 : OpcodeHexagon {
+ bits <3> Ii;
+ let Inst{7-5} = Ii{2-0};
+ bits <5> Vu32;
+ let Inst{12-8} = Vu32{4-0};
+ bits <5> Vv32;
+ let Inst{20-16} = Vv32{4-0};
+ bits <5> Vxx32;
+ let Inst{4-0} = Vxx32{4-0};
+}
+class Enc_3813442 : OpcodeHexagon {
+ bits <5> Ii;
+ let Inst{6-3} = Ii{4-1};
+ bits <2> Pv4;
+ let Inst{1-0} = Pv4{1-0};
+ bits <3> Nt8;
+ let Inst{10-8} = Nt8{2-0};
+ bits <5> Rx32;
+ let Inst{20-16} = Rx32{4-0};
+}
+class Enc_3135259 : OpcodeHexagon {
+ bits <3> Ii;
+ let Inst{10-8} = Ii{2-0};
+ bits <4> Rs16;
+ let Inst{7-4} = Rs16{3-0};
+ bits <4> Rd16;
+ let Inst{3-0} = Rd16{3-0};
+}
+class Enc_5486172 : OpcodeHexagon {
+ bits <2> Ii;
+ let Inst{13-13} = Ii{1-1};
+ let Inst{7-7} = Ii{0-0};
+ bits <5> Rs32;
+ let Inst{20-16} = Rs32{4-0};
+ bits <5> Ru32;
+ let Inst{12-8} = Ru32{4-0};
+ bits <3> Nt8;
+ let Inst{2-0} = Nt8{2-0};
+}
+class Enc_11081334 : OpcodeHexagon {
+ bits <16> Ii;
+ let Inst{21-21} = Ii{15-15};
+ let Inst{13-8} = Ii{14-9};
+ let Inst{2-0} = Ii{8-6};
+ bits <5> Rt32;
+ let Inst{20-16} = Rt32{4-0};
+ bits <5> Vss32;
+ let Inst{7-3} = Vss32{4-0};
+}
+class Enc_9470751 : OpcodeHexagon {
+ bits <4> Ii;
+ let Inst{13-13} = Ii{3-3};
+ let Inst{10-8} = Ii{2-0};
+ bits <2> Pv4;
+ let Inst{12-11} = Pv4{1-0};
+ bits <5> Rt32;
+ let Inst{20-16} = Rt32{4-0};
+ bits <5> Vs32;
+ let Inst{4-0} = Vs32{4-0};
+}
+class Enc_2683366 : OpcodeHexagon {
+ bits <3> Quu8;
+ let Inst{10-8} = Quu8{2-0};
+ bits <5> Rt32;
+ let Inst{20-16} = Rt32{4-0};
+ bits <3> Qdd8;
+ let Inst{5-3} = Qdd8{2-0};
+}
+class Enc_15830826 : OpcodeHexagon {
+ bits <14> Ii;
+ let Inst{10-0} = Ii{13-3};
+}
+class Enc_4967902 : OpcodeHexagon {
+ bits <7> Ii;
+ let Inst{12-7} = Ii{6-1};
+ bits <6> II;
+ let Inst{13-13} = II{5-5};
+ let Inst{4-0} = II{4-0};
+ bits <2> Pv4;
+ let Inst{6-5} = Pv4{1-0};
+ bits <5> Rs32;
+ let Inst{20-16} = Rs32{4-0};
+}
+class Enc_14287645 : OpcodeHexagon {
+ bits <5> Rss32;
+ let Inst{20-16} = Rss32{4-0};
+ bits <5> Rt32;
+ let Inst{12-8} = Rt32{4-0};
+ bits <5> Rd32;
+ let Inst{4-0} = Rd32{4-0};
+}
+class Enc_8324216 : OpcodeHexagon {
+ bits <2> Ps4;
+ let Inst{17-16} = Ps4{1-0};
+ bits <2> Pt4;
+ let Inst{9-8} = Pt4{1-0};
+ bits <2> Pd4;
+ let Inst{1-0} = Pd4{1-0};
+}
+class Enc_913538 : OpcodeHexagon {
+ bits <5> Vu32;
+ let Inst{12-8} = Vu32{4-0};
+ bits <5> Rt32;
+ let Inst{20-16} = Rt32{4-0};
+ bits <3> Qd8;
+ let Inst{5-3} = Qd8{2-0};
+}
+class Enc_16311032 : OpcodeHexagon {
+ bits <5> Rs32;
+ let Inst{20-16} = Rs32{4-0};
+ bits <5> Rtt32;
+ let Inst{12-8} = Rtt32{4-0};
+ bits <5> Rx32;
+ let Inst{4-0} = Rx32{4-0};
+}
+class Enc_9864697 : OpcodeHexagon {
+ bits <8> Ii;
+ let Inst{12-5} = Ii{7-0};
+ bits <6> II;
+ let Inst{20-16} = II{5-1};
+ let Inst{13-13} = II{0-0};
+ bits <5> Rdd32;
+ let Inst{4-0} = Rdd32{4-0};
+}
+class Enc_11205051 : OpcodeHexagon {
+ bits <6> Ii;
+ let Inst{11-8} = Ii{5-2};
+ bits <4> Rs16;
+ let Inst{7-4} = Rs16{3-0};
+ bits <4> Rt16;
+ let Inst{3-0} = Rt16{3-0};
+}
+class Enc_5611087 : OpcodeHexagon {
+ bits <7> Ii;
+ let Inst{8-5} = Ii{6-3};
+ bits <2> Pt4;
+ let Inst{10-9} = Pt4{1-0};
+ bits <5> Rdd32;
+ let Inst{4-0} = Rdd32{4-0};
+ bits <5> Rx32;
+ let Inst{20-16} = Rx32{4-0};
+}
+class Enc_10915758 : OpcodeHexagon {
+ bits <5> Ii;
+ let Inst{6-3} = Ii{4-1};
+ bits <1> Mu2;
+ let Inst{13-13} = Mu2{0-0};
+ bits <5> Rt32;
+ let Inst{12-8} = Rt32{4-0};
+ bits <5> Rx32;
+ let Inst{20-16} = Rx32{4-0};
+}
+class Enc_8943121 : OpcodeHexagon {
+ bits <5> Rs32;
+ let Inst{20-16} = Rs32{4-0};
+ bits <5> Rtt32;
+ let Inst{12-8} = Rtt32{4-0};
+}
+class Enc_1539665 : OpcodeHexagon {
+ bits <5> Cs32;
+ let Inst{20-16} = Cs32{4-0};
+ bits <5> Rd32;
+ let Inst{4-0} = Rd32{4-0};
+}
+class Enc_8479583 : OpcodeHexagon {
+ bits <11> Ii;
+ let Inst{21-20} = Ii{10-9};
+ let Inst{7-1} = Ii{8-2};
+ bits <3> Ns8;
+ let Inst{18-16} = Ns8{2-0};
+ bits <5> n1;
+ let Inst{29-29} = n1{4-4};
+ let Inst{26-25} = n1{3-2};
+ let Inst{23-23} = n1{1-1};
+ let Inst{13-13} = n1{0-0};
+}
+class Enc_313333 : OpcodeHexagon {
+ bits <5> Rt32;
+ let Inst{20-16} = Rt32{4-0};
+ bits <5> Vx32;
+ let Inst{4-0} = Vx32{4-0};
+}
+class Enc_11544269 : OpcodeHexagon {
+ bits <11> Ii;
+ let Inst{21-20} = Ii{10-9};
+ let Inst{7-1} = Ii{8-2};
+ bits <3> Ns8;
+ let Inst{18-16} = Ns8{2-0};
+ bits <4> n1;
+ let Inst{29-29} = n1{3-3};
+ let Inst{26-25} = n1{2-1};
+ let Inst{13-13} = n1{0-0};
+}
+class Enc_9018141 : OpcodeHexagon {
+ bits <5> Rs32;
+ let Inst{20-16} = Rs32{4-0};
+ bits <5> Cd32;
+ let Inst{4-0} = Cd32{4-0};
+}
+class Enc_6152036 : OpcodeHexagon {
+ bits <5> Rss32;
+ let Inst{20-16} = Rss32{4-0};
+ bits <5> Gdd32;
+ let Inst{4-0} = Gdd32{4-0};
+}
+class Enc_1954437 : OpcodeHexagon {
+ bits <6> Sss64;
+ let Inst{21-16} = Sss64{5-0};
+ bits <5> Rdd32;
+ let Inst{4-0} = Rdd32{4-0};
+}
+class Enc_3742184 : OpcodeHexagon {
+ bits <5> Rss32;
+ let Inst{20-16} = Rss32{4-0};
+ bits <5> Rd32;
+ let Inst{4-0} = Rd32{4-0};
+}
+class Enc_1835415 : OpcodeHexagon {
+ bits <7> Ii;
+ let Inst{10-5} = Ii{6-1};
+ bits <2> Pt4;
+ let Inst{12-11} = Pt4{1-0};
+ bits <5> Rs32;
+ let Inst{20-16} = Rs32{4-0};
+ bits <5> Rd32;
+ let Inst{4-0} = Rd32{4-0};
+}
+class Enc_1085466 : OpcodeHexagon {
+ bits <5> Rt32;
+ let Inst{20-16} = Rt32{4-0};
+ bits <5> Vdd32;
+ let Inst{7-3} = Vdd32{4-0};
+}
+class Enc_13150110 : OpcodeHexagon {
+ bits <11> Ii;
+ let Inst{26-25} = Ii{10-9};
+ let Inst{13-13} = Ii{8-8};
+ let Inst{7-0} = Ii{7-0};
+ bits <5> Rs32;
+ let Inst{20-16} = Rs32{4-0};
+ bits <5> Rt32;
+ let Inst{12-8} = Rt32{4-0};
+}
+class Enc_6772177 : OpcodeHexagon {
+ bits <5> Zu8;
+ let Inst{12-8} = Zu8{4-0};
+ bits <5> Zd8;
+ let Inst{4-0} = Zd8{4-0};
+}
+class Enc_6616512 : OpcodeHexagon {
+ bits <16> Ii;
+ let Inst{21-21} = Ii{15-15};
+ let Inst{13-8} = Ii{14-9};
+ let Inst{2-0} = Ii{8-6};
+ bits <5> Rt32;
+ let Inst{20-16} = Rt32{4-0};
+ bits <5> Vdd32;
+ let Inst{7-3} = Vdd32{4-0};
+}
+class Enc_1886960 : OpcodeHexagon {
+ bits <16> Ii;
+ let Inst{26-25} = Ii{15-14};
+ let Inst{20-16} = Ii{13-9};
+ let Inst{13-5} = Ii{8-0};
+ bits <5> Rd32;
+ let Inst{4-0} = Rd32{4-0};
+}
+class Enc_2835415 : OpcodeHexagon {
+ bits <8> Ii;
+ let Inst{10-5} = Ii{7-2};
+ bits <2> Pt4;
+ let Inst{12-11} = Pt4{1-0};
+ bits <5> Rs32;
+ let Inst{20-16} = Rs32{4-0};
+ bits <5> Rd32;
+ let Inst{4-0} = Rd32{4-0};
+}
+class Enc_14024197 : OpcodeHexagon {
+ bits <5> Vu32;
+ let Inst{12-8} = Vu32{4-0};
+ bits <5> Rtt32;
+ let Inst{20-16} = Rtt32{4-0};
+ bits <5> Vxx32;
+ let Inst{4-0} = Vxx32{4-0};
+}
+class Enc_12297800 : OpcodeHexagon {
+ bits <18> Ii;
+ let Inst{26-25} = Ii{17-16};
+ let Inst{20-16} = Ii{15-11};
+ let Inst{13-13} = Ii{10-10};
+ let Inst{7-0} = Ii{9-2};
+ bits <3> Nt8;
+ let Inst{10-8} = Nt8{2-0};
+}
+class Enc_7254313 : OpcodeHexagon {
+ bits <2> Ii;
+ let Inst{13-13} = Ii{1-1};
+ let Inst{7-7} = Ii{0-0};
+ bits <2> Pv4;
+ let Inst{6-5} = Pv4{1-0};
+ bits <5> Rs32;
+ let Inst{20-16} = Rs32{4-0};
+ bits <5> Rt32;
+ let Inst{12-8} = Rt32{4-0};
+ bits <5> Rdd32;
+ let Inst{4-0} = Rdd32{4-0};
+}
+class Enc_677558 : OpcodeHexagon {
+ bits <9> Ii;
+ let Inst{10-5} = Ii{8-3};
+ bits <2> Pt4;
+ let Inst{12-11} = Pt4{1-0};
+ bits <5> Rs32;
+ let Inst{20-16} = Rs32{4-0};
+ bits <5> Rdd32;
+ let Inst{4-0} = Rdd32{4-0};
+}
+class Enc_6223403 : OpcodeHexagon {
+ bits <5> Vu32;
+ let Inst{12-8} = Vu32{4-0};
+ bits <5> Vv32;
+ let Inst{20-16} = Vv32{4-0};
+ bits <5> Vd32;
+ let Inst{4-0} = Vd32{4-0};
+}
+class Enc_674613 : OpcodeHexagon {
+ bits <5> Vuu32;
+ let Inst{20-16} = Vuu32{4-0};
+ bits <5> Vdd32;
+ let Inst{7-3} = Vdd32{4-0};
+}
+class Enc_16479122 : OpcodeHexagon {
+ bits <8> Ii;
+ let Inst{7-3} = Ii{7-3};
+ bits <3> Rdd8;
+ let Inst{2-0} = Rdd8{2-0};
+}
+class Enc_11704059 : OpcodeHexagon {
+ bits <5> Rs32;
+ let Inst{20-16} = Rs32{4-0};
+}
+class Enc_9165078 : OpcodeHexagon {
+ bits <9> Ii;
+ let Inst{8-3} = Ii{8-3};
+ bits <3> Rtt8;
+ let Inst{2-0} = Rtt8{2-0};
+}
+class Enc_15376009 : OpcodeHexagon {
+ bits <5> Ii;
+ let Inst{8-5} = Ii{4-1};
+ bits <5> Rd32;
+ let Inst{4-0} = Rd32{4-0};
+ bits <5> Rx32;
+ let Inst{20-16} = Rx32{4-0};
+}
+class Enc_8838398 : OpcodeHexagon {
+ bits <4> Ii;
+ let Inst{21-21} = Ii{3-3};
+ let Inst{7-5} = Ii{2-0};
+ bits <6> II;
+ let Inst{13-8} = II{5-0};
+ bits <5> Rs32;
+ let Inst{20-16} = Rs32{4-0};
+ bits <5> Rx32;
+ let Inst{4-0} = Rx32{4-0};
+}
+class Enc_2328527 : OpcodeHexagon {
+ bits <5> Vu32;
+ let Inst{12-8} = Vu32{4-0};
+ bits <5> Vv32;
+ let Inst{20-16} = Vv32{4-0};
+ bits <5> Vx32;
+ let Inst{4-0} = Vx32{4-0};
+}
+class Enc_1451363 : OpcodeHexagon {
+ bits <4> Rd16;
+ let Inst{3-0} = Rd16{3-0};
+}
+class Enc_4030179 : OpcodeHexagon {
+ bits <5> Rs32;
+ let Inst{20-16} = Rs32{4-0};
+ bits <5> Rdd32;
+ let Inst{4-0} = Rdd32{4-0};
+}
+class Enc_13770697 : OpcodeHexagon {
+ bits <5> Ru32;
+ let Inst{4-0} = Ru32{4-0};
+ bits <5> Rs32;
+ let Inst{20-16} = Rs32{4-0};
+ bits <5> Ry32;
+ let Inst{12-8} = Ry32{4-0};
+}
+class Enc_12212978 : OpcodeHexagon {
+ bits <4> Ii;
+ let Inst{8-5} = Ii{3-0};
+ bits <2> Pt4;
+ let Inst{10-9} = Pt4{1-0};
+ bits <5> Rd32;
+ let Inst{4-0} = Rd32{4-0};
+ bits <5> Rx32;
+ let Inst{20-16} = Rx32{4-0};
+}
+class Enc_12665927 : OpcodeHexagon {
+ bits <1> Mu2;
+ let Inst{13-13} = Mu2{0-0};
+ bits <5> Vdd32;
+ let Inst{7-3} = Vdd32{4-0};
+ bits <5> Rx32;
+ let Inst{20-16} = Rx32{4-0};
+}
+class Enc_2082956 : OpcodeHexagon {
+ bits <32> Ii;
+ let Inst{27-16} = Ii{31-20};
+ let Inst{13-0} = Ii{19-6};
+}
+class Enc_220949 : OpcodeHexagon {
+ bits <11> Ii;
+ let Inst{21-20} = Ii{10-9};
+ let Inst{7-1} = Ii{8-2};
+ bits <4> Rs16;
+ let Inst{19-16} = Rs16{3-0};
+ bits <5> n1;
+ let Inst{28-28} = n1{4-4};
+ let Inst{25-23} = n1{3-1};
+ let Inst{13-13} = n1{0-0};
+}
+class Enc_9939385 : OpcodeHexagon {
+ bits <9> Ii;
+ let Inst{12-8} = Ii{8-4};
+ let Inst{4-3} = Ii{3-2};
+ bits <10> II;
+ let Inst{20-16} = II{9-5};
+ let Inst{7-5} = II{4-2};
+ let Inst{1-0} = II{1-0};
+}
+class Enc_2117024 : OpcodeHexagon {
+ bits <8> Ii;
+ let Inst{12-8} = Ii{7-3};
+ let Inst{4-2} = Ii{2-0};
+ bits <5> Rx32;
+ let Inst{20-16} = Rx32{4-0};
+}
+class Enc_8390029 : OpcodeHexagon {
+ bits <5> Vuu32;
+ let Inst{20-16} = Vuu32{4-0};
+ bits <5> Vv32;
+ let Inst{12-8} = Vv32{4-0};
+ bits <5> Vd32;
+ let Inst{7-3} = Vd32{4-0};
+}
+class Enc_10989558 : OpcodeHexagon {
+ bits <5> Vu32;
+ let Inst{20-16} = Vu32{4-0};
+ bits <5> Vd32;
+ let Inst{7-3} = Vd32{4-0};
+}
+class Enc_5972412 : OpcodeHexagon {
+ bits <5> Vu32;
+ let Inst{12-8} = Vu32{4-0};
+ bits <5> Vv32;
+ let Inst{20-16} = Vv32{4-0};
+ bits <5> Vxx32;
+ let Inst{4-0} = Vxx32{4-0};
+}
+class Enc_12851489 : OpcodeHexagon {
+ bits <1> Mu2;
+ let Inst{13-13} = Mu2{0-0};
+ bits <5> Vss32;
+ let Inst{7-3} = Vss32{4-0};
+ bits <5> Rx32;
+ let Inst{20-16} = Rx32{4-0};
+}
+class Enc_9554661 : OpcodeHexagon {
+ bits <6> Ii;
+ let Inst{12-7} = Ii{5-0};
+ bits <5> Rd32;
+ let Inst{4-0} = Rd32{4-0};
+}
+class Enc_4202401 : OpcodeHexagon {
+ bits <1> Mu2;
+ let Inst{13-13} = Mu2{0-0};
+ bits <5> Rt32;
+ let Inst{12-8} = Rt32{4-0};
+ bits <5> Vd32;
+ let Inst{7-3} = Vd32{4-0};
+ bits <5> Rx32;
+ let Inst{20-16} = Rx32{4-0};
+}
+class Enc_6091631 : OpcodeHexagon {
+ bits <2> Qs4;
+ let Inst{9-8} = Qs4{1-0};
+ bits <2> Qt4;
+ let Inst{23-22} = Qt4{1-0};
+ bits <2> Qd4;
+ let Inst{1-0} = Qd4{1-0};
+}
+class Enc_10157519 : OpcodeHexagon {
+ bits <5> Rs32;
+ let Inst{20-16} = Rs32{4-0};
+ bits <5> Rt32;
+ let Inst{12-8} = Rt32{4-0};
+ bits <2> Pd4;
+ let Inst{1-0} = Pd4{1-0};
+}
+class Enc_4835423 : OpcodeHexagon {
+ bits <6> Ii;
+ let Inst{10-5} = Ii{5-0};
+ bits <2> Pt4;
+ let Inst{12-11} = Pt4{1-0};
+ bits <5> Rs32;
+ let Inst{20-16} = Rs32{4-0};
+ bits <5> Rd32;
+ let Inst{4-0} = Rd32{4-0};
+}
+class Enc_14046916 : OpcodeHexagon {
+ bits <2> Ii;
+ let Inst{13-13} = Ii{1-1};
+ let Inst{7-7} = Ii{0-0};
+ bits <5> Rs32;
+ let Inst{20-16} = Rs32{4-0};
+ bits <5> Ru32;
+ let Inst{12-8} = Ru32{4-0};
+ bits <5> Rt32;
+ let Inst{4-0} = Rt32{4-0};
+}
+class Enc_2921694 : OpcodeHexagon {
+ bits <5> Rs32;
+ let Inst{20-16} = Rs32{4-0};
+ bits <5> Rtt32;
+ let Inst{12-8} = Rtt32{4-0};
+ bits <2> Pd4;
+ let Inst{1-0} = Pd4{1-0};
+}
+class Enc_8732960 : OpcodeHexagon {
+ bits <8> Ii;
+ let Inst{12-8} = Ii{7-3};
+ let Inst{4-2} = Ii{2-0};
+}
+class Enc_5338033 : OpcodeHexagon {
+ bits <11> Ii;
+ let Inst{21-20} = Ii{10-9};
+ let Inst{7-1} = Ii{8-2};
+ bits <4> Rs16;
+ let Inst{19-16} = Rs16{3-0};
+ bits <5> n1;
+ let Inst{28-28} = n1{4-4};
+ let Inst{24-22} = n1{3-1};
+ let Inst{13-13} = n1{0-0};
+}
+class Enc_6956613 : OpcodeHexagon {
+ bits <1> Mu2;
+ let Inst{13-13} = Mu2{0-0};
+ bits <5> Rx32;
+ let Inst{20-16} = Rx32{4-0};
+}
+class Enc_2153798 : OpcodeHexagon {
+ bits <5> Vu32;
+ let Inst{12-8} = Vu32{4-0};
+ bits <5> Rt32;
+ let Inst{20-16} = Rt32{4-0};
+ bits <5> Vxx32;
+ let Inst{4-0} = Vxx32{4-0};
+}
+class Enc_16210172 : OpcodeHexagon {
+ bits <3> Qt8;
+ let Inst{10-8} = Qt8{2-0};
+ bits <3> Qd8;
+ let Inst{5-3} = Qd8{2-0};
+}
+class Enc_5023792 : OpcodeHexagon {
+ bits <5> Vuu32;
+ let Inst{12-8} = Vuu32{4-0};
+ bits <5> Rt32;
+ let Inst{20-16} = Rt32{4-0};
+ bits <5> Vdd32;
+ let Inst{4-0} = Vdd32{4-0};
+}
+class Enc_1244745 : OpcodeHexagon {
+ bits <4> Ii;
+ let Inst{13-13} = Ii{3-3};
+ let Inst{10-8} = Ii{2-0};
+ bits <5> Rt32;
+ let Inst{20-16} = Rt32{4-0};
+ bits <5> Vd32;
+ let Inst{4-0} = Vd32{4-0};
+}
+class Enc_10002182 : OpcodeHexagon {
+ bits <11> Ii;
+ let Inst{26-25} = Ii{10-9};
+ let Inst{13-13} = Ii{8-8};
+ let Inst{7-0} = Ii{7-0};
+ bits <5> Rs32;
+ let Inst{20-16} = Rs32{4-0};
+ bits <3> Nt8;
+ let Inst{10-8} = Nt8{2-0};
+}
+class Enc_12492533 : OpcodeHexagon {
+ bits <4> Ii;
+ let Inst{6-3} = Ii{3-0};
+ bits <5> Rt32;
+ let Inst{12-8} = Rt32{4-0};
+ bits <5> Rx32;
+ let Inst{20-16} = Rx32{4-0};
+}
+class Enc_1774350 : OpcodeHexagon {
+ bits <6> Ii;
+ let Inst{17-16} = Ii{5-4};
+ let Inst{6-3} = Ii{3-0};
+ bits <2> Pv4;
+ let Inst{1-0} = Pv4{1-0};
+ bits <3> Nt8;
+ let Inst{10-8} = Nt8{2-0};
+}
+class Enc_2703240 : OpcodeHexagon {
+ bits <4> Ii;
+ let Inst{13-13} = Ii{3-3};
+ let Inst{10-8} = Ii{2-0};
+ bits <2> Qv4;
+ let Inst{12-11} = Qv4{1-0};
+ bits <5> Rt32;
+ let Inst{20-16} = Rt32{4-0};
+ bits <5> Vs32;
+ let Inst{4-0} = Vs32{4-0};
+}
+class Enc_6975103 : OpcodeHexagon {
+ bits <2> Ps4;
+ let Inst{17-16} = Ps4{1-0};
+ bits <2> Pd4;
+ let Inst{1-0} = Pd4{1-0};
+}
+class Enc_9789480 : OpcodeHexagon {
+ bits <5> Vu32;
+ let Inst{20-16} = Vu32{4-0};
+ bits <5> Vv32;
+ let Inst{12-8} = Vv32{4-0};
+ bits <5> Vdd32;
+ let Inst{7-3} = Vdd32{4-0};
+}
+class Enc_12244921 : OpcodeHexagon {
+ bits <6> Ii;
+ let Inst{10-8} = Ii{2-0};
+ bits <3> Os8;
+ let Inst{2-0} = Os8{2-0};
+ bits <5> Rx32;
+ let Inst{20-16} = Rx32{4-0};
+}
+class Enc_8674673 : OpcodeHexagon {
+ bits <11> Ii;
+ let Inst{21-20} = Ii{10-9};
+ let Inst{7-1} = Ii{8-2};
+ bits <3> Ns8;
+ let Inst{18-16} = Ns8{2-0};
+ bits <5> n1;
+ let Inst{29-29} = n1{4-4};
+ let Inst{26-25} = n1{3-2};
+ let Inst{23-22} = n1{1-0};
+}
+class Enc_8514936 : OpcodeHexagon {
+ bits <5> Rx32;
+ let Inst{20-16} = Rx32{4-0};
+}
+class Enc_13455308 : OpcodeHexagon {
+ bits <8> Ii;
+ let Inst{12-5} = Ii{7-0};
+ bits <5> Rss32;
+ let Inst{20-16} = Rss32{4-0};
+ bits <2> Pd4;
+ let Inst{1-0} = Pd4{1-0};
+}
+class Enc_10188026 : OpcodeHexagon {
+ bits <6> Ii;
+ let Inst{13-8} = Ii{5-0};
+ bits <5> Rss32;
+ let Inst{20-16} = Rss32{4-0};
+ bits <5> Rd32;
+ let Inst{4-0} = Rd32{4-0};
+}
+class Enc_3158657 : OpcodeHexagon {
+ bits <2> Pv4;
+ let Inst{12-11} = Pv4{1-0};
+ bits <1> Mu2;
+ let Inst{13-13} = Mu2{0-0};
+ bits <5> Vd32;
+ let Inst{4-0} = Vd32{4-0};
+ bits <5> Rx32;
+ let Inst{20-16} = Rx32{4-0};
+}
+class Enc_10597934 : OpcodeHexagon {
+ bits <4> Rs16;
+ let Inst{7-4} = Rs16{3-0};
+ bits <4> Rd16;
+ let Inst{3-0} = Rd16{3-0};
+ bits <2> n1;
+ let Inst{9-8} = n1{1-0};
+}
+class Enc_10612292 : OpcodeHexagon {
+ bits <5> Vu32;
+ let Inst{12-8} = Vu32{4-0};
+ bits <5> Rt32;
+ let Inst{20-16} = Rt32{4-0};
+ bits <2> Qx4;
+ let Inst{1-0} = Qx4{1-0};
+}
+class Enc_5178985 : OpcodeHexagon {
+ bits <5> Rss32;
+ let Inst{20-16} = Rss32{4-0};
+ bits <5> Rtt32;
+ let Inst{12-8} = Rtt32{4-0};
+ bits <2> Pu4;
+ let Inst{6-5} = Pu4{1-0};
+ bits <5> Rdd32;
+ let Inst{4-0} = Rdd32{4-0};
+}
+class Enc_3967902 : OpcodeHexagon {
+ bits <8> Ii;
+ let Inst{12-7} = Ii{7-2};
+ bits <6> II;
+ let Inst{13-13} = II{5-5};
+ let Inst{4-0} = II{4-0};
+ bits <2> Pv4;
+ let Inst{6-5} = Pv4{1-0};
+ bits <5> Rs32;
+ let Inst{20-16} = Rs32{4-0};
+}
+class Enc_2462143 : OpcodeHexagon {
+ bits <8> Ii;
+ let Inst{12-5} = Ii{7-0};
+ bits <5> Rs32;
+ let Inst{20-16} = Rs32{4-0};
+ bits <5> Rdd32;
+ let Inst{4-0} = Rdd32{4-0};
+}
+class Enc_9849208 : OpcodeHexagon {
+ bits <8> Ii;
+ let Inst{12-7} = Ii{7-2};
+ bits <5> Rs32;
+ let Inst{20-16} = Rs32{4-0};
+ bits <5> Rt32;
+ let Inst{4-0} = Rt32{4-0};
+}
+class Enc_12618352 : OpcodeHexagon {
+ bits <5> Rtt32;
+ let Inst{20-16} = Rtt32{4-0};
+ bits <5> Vx32;
+ let Inst{7-3} = Vx32{4-0};
+}
+class Enc_7303598 : OpcodeHexagon {
+ bits <2> Ii;
+ let Inst{13-13} = Ii{1-1};
+ let Inst{7-7} = Ii{0-0};
+ bits <6> II;
+ let Inst{11-8} = II{5-2};
+ let Inst{6-5} = II{1-0};
+ bits <5> Rt32;
+ let Inst{20-16} = Rt32{4-0};
+ bits <5> Ryy32;
+ let Inst{4-0} = Ryy32{4-0};
+}
+class Enc_13823098 : OpcodeHexagon {
+ bits <5> Gss32;
+ let Inst{20-16} = Gss32{4-0};
+ bits <5> Rdd32;
+ let Inst{4-0} = Rdd32{4-0};
+}
+class Enc_16388420 : OpcodeHexagon {
+ bits <2> Qs4;
+ let Inst{6-5} = Qs4{1-0};
+ bits <5> Rt32;
+ let Inst{20-16} = Rt32{4-0};
+ bits <1> Mu2;
+ let Inst{13-13} = Mu2{0-0};
+ bits <5> Vvv32;
+ let Inst{12-8} = Vvv32{4-0};
+ bits <5> Vw32;
+ let Inst{4-0} = Vw32{4-0};
+}
+class Enc_8328140 : OpcodeHexagon {
+ bits <16> Ii;
+ let Inst{21-21} = Ii{15-15};
+ let Inst{13-8} = Ii{14-9};
+ let Inst{2-0} = Ii{8-6};
+ bits <5> Vdd32;
+ let Inst{7-3} = Vdd32{4-0};
+ bits <5> Rx32;
+ let Inst{20-16} = Rx32{4-0};
+}
+class Enc_1793896 : OpcodeHexagon {
+ bits <2> Ii;
+ let Inst{13-13} = Ii{1-1};
+ let Inst{7-7} = Ii{0-0};
+ bits <2> Pv4;
+ let Inst{6-5} = Pv4{1-0};
+ bits <5> Rs32;
+ let Inst{20-16} = Rs32{4-0};
+ bits <5> Rt32;
+ let Inst{12-8} = Rt32{4-0};
+ bits <5> Rd32;
+ let Inst{4-0} = Rd32{4-0};
+}
+class Enc_4944558 : OpcodeHexagon {
+ bits <2> Qu4;
+ let Inst{9-8} = Qu4{1-0};
+ bits <5> Rt32;
+ let Inst{20-16} = Rt32{4-0};
+ bits <5> Vx32;
+ let Inst{4-0} = Vx32{4-0};
+}
+class Enc_13211717 : OpcodeHexagon {
+ bits <5> Vuu32;
+ let Inst{12-8} = Vuu32{4-0};
+ bits <5> Vvv32;
+ let Inst{20-16} = Vvv32{4-0};
+ bits <5> Vdd32;
+ let Inst{4-0} = Vdd32{4-0};
+}
+class Enc_8170340 : OpcodeHexagon {
+ bits <5> Rt32;
+ let Inst{20-16} = Rt32{4-0};
+ bits <5> Vx32;
+ let Inst{7-3} = Vx32{4-0};
+ bits <3> Qdd8;
+ let Inst{2-0} = Qdd8{2-0};
+}
+class Enc_14071773 : OpcodeHexagon {
+ bits <5> Rs32;
+ let Inst{20-16} = Rs32{4-0};
+ bits <5> Rt32;
+ let Inst{12-8} = Rt32{4-0};
+ bits <5> Rd32;
+ let Inst{4-0} = Rd32{4-0};
+}
+class Enc_8605375 : OpcodeHexagon {
+ bits <5> Rt32;
+ let Inst{12-8} = Rt32{4-0};
+ bits <5> Rs32;
+ let Inst{20-16} = Rs32{4-0};
+ bits <5> Rd32;
+ let Inst{4-0} = Rd32{4-0};
+}
+class Enc_12711252 : OpcodeHexagon {
+ bits <2> Pv4;
+ let Inst{9-8} = Pv4{1-0};
+}
+class Enc_8202458 : OpcodeHexagon {
+ bits <2> Pu4;
+ let Inst{6-5} = Pu4{1-0};
+ bits <5> Rs32;
+ let Inst{20-16} = Rs32{4-0};
+ bits <5> Rt32;
+ let Inst{12-8} = Rt32{4-0};
+ bits <5> Rdd32;
+ let Inst{4-0} = Rdd32{4-0};
+}
+class Enc_8577055 : OpcodeHexagon {
+ bits <11> Ii;
+ let Inst{21-20} = Ii{10-9};
+ let Inst{7-1} = Ii{8-2};
+ bits <4> Rs16;
+ let Inst{19-16} = Rs16{3-0};
+ bits <5> n1;
+ let Inst{28-28} = n1{4-4};
+ let Inst{25-23} = n1{3-1};
+ let Inst{8-8} = n1{0-0};
+}
+class Enc_1409050 : OpcodeHexagon {
+ bits <5> Rs32;
+ let Inst{20-16} = Rs32{4-0};
+ bits <5> Rt32;
+ let Inst{12-8} = Rt32{4-0};
+ bits <5> Rxx32;
+ let Inst{4-0} = Rxx32{4-0};
+}
+class Enc_7466005 : OpcodeHexagon {
+ bits <5> Gs32;
+ let Inst{20-16} = Gs32{4-0};
+ bits <5> Rd32;
+ let Inst{4-0} = Rd32{4-0};
+}
+class Enc_2380082 : OpcodeHexagon {
+ bits <5> Ii;
+ let Inst{12-8} = Ii{4-0};
+ bits <5> Rss32;
+ let Inst{20-16} = Rss32{4-0};
+ bits <5> Rd32;
+ let Inst{4-0} = Rd32{4-0};
+}
+class Enc_10067774 : OpcodeHexagon {
+ bits <1> Mu2;
+ let Inst{13-13} = Mu2{0-0};
+ bits <3> Nt8;
+ let Inst{10-8} = Nt8{2-0};
+ bits <5> Rx32;
+ let Inst{20-16} = Rx32{4-0};
+}
+class Enc_11000933 : OpcodeHexagon {
+ bits <2> Ii;
+ let Inst{13-13} = Ii{1-1};
+ let Inst{7-7} = Ii{0-0};
+ bits <2> Pv4;
+ let Inst{6-5} = Pv4{1-0};
+ bits <5> Rs32;
+ let Inst{20-16} = Rs32{4-0};
+ bits <5> Ru32;
+ let Inst{12-8} = Ru32{4-0};
+ bits <3> Nt8;
+ let Inst{2-0} = Nt8{2-0};
+}
+class Enc_13201267 : OpcodeHexagon {
+ bits <5> Ii;
+ let Inst{12-8} = Ii{4-0};
+ bits <5> Rss32;
+ let Inst{20-16} = Rss32{4-0};
+ bits <5> Rdd32;
+ let Inst{4-0} = Rdd32{4-0};
+}
+class Enc_1989309 : OpcodeHexagon {
+ bits <5> Rt32;
+ let Inst{20-16} = Rt32{4-0};
+ bits <1> Mu2;
+ let Inst{13-13} = Mu2{0-0};
+ bits <5> Vvv32;
+ let Inst{4-0} = Vvv32{4-0};
+}
+class Enc_9082775 : OpcodeHexagon {
+ bits <10> Ii;
+ let Inst{21-21} = Ii{9-9};
+ let Inst{13-5} = Ii{8-0};
+ bits <5> Rd32;
+ let Inst{4-0} = Rd32{4-0};
+}
+class Enc_8065534 : OpcodeHexagon {
+ bits <4> Ii;
+ let Inst{6-3} = Ii{3-0};
+ bits <2> Pv4;
+ let Inst{1-0} = Pv4{1-0};
+ bits <5> Rt32;
+ let Inst{12-8} = Rt32{4-0};
+ bits <5> Rx32;
+ let Inst{20-16} = Rx32{4-0};
+}
+class Enc_4631106 : OpcodeHexagon {
+ bits <2> Ps4;
+ let Inst{17-16} = Ps4{1-0};
+ bits <2> Pt4;
+ let Inst{9-8} = Pt4{1-0};
+ bits <2> Pu4;
+ let Inst{7-6} = Pu4{1-0};
+ bits <2> Pd4;
+ let Inst{1-0} = Pd4{1-0};
+}
+class Enc_11065510 : OpcodeHexagon {
+ bits <5> Ii;
+ let Inst{6-3} = Ii{4-1};
+ bits <2> Pv4;
+ let Inst{1-0} = Pv4{1-0};
+ bits <5> Rt32;
+ let Inst{12-8} = Rt32{4-0};
+ bits <5> Rx32;
+ let Inst{20-16} = Rx32{4-0};
+}
+class Enc_6673186 : OpcodeHexagon {
+ bits <13> Ii;
+ let Inst{26-25} = Ii{12-11};
+ let Inst{13-13} = Ii{10-10};
+ let Inst{7-0} = Ii{9-2};
+ bits <5> Rs32;
+ let Inst{20-16} = Rs32{4-0};
+ bits <5> Rt32;
+ let Inst{12-8} = Rt32{4-0};
+}
+class Enc_8498433 : OpcodeHexagon {
+ bits <2> Pv4;
+ let Inst{12-11} = Pv4{1-0};
+ bits <1> Mu2;
+ let Inst{13-13} = Mu2{0-0};
+ bits <3> Os8;
+ let Inst{2-0} = Os8{2-0};
+ bits <5> Rx32;
+ let Inst{20-16} = Rx32{4-0};
+}
+class Enc_4395009 : OpcodeHexagon {
+ bits <7> Ii;
+ bits <2> Pv4;
+ let Inst{12-11} = Pv4{1-0};
+ bits <5> Vs32;
+ let Inst{4-0} = Vs32{4-0};
+ bits <5> Rx32;
+ let Inst{20-16} = Rx32{4-0};
+}
+class Enc_10926598 : OpcodeHexagon {
+ bits <5> Vuu32;
+ let Inst{12-8} = Vuu32{4-0};
+ bits <5> Rt32;
+ let Inst{20-16} = Rt32{4-0};
+ bits <5> Vxx32;
+ let Inst{7-3} = Vxx32{4-0};
+}
+class Enc_7606379 : OpcodeHexagon {
+ bits <2> Pu4;
+ let Inst{6-5} = Pu4{1-0};
+ bits <5> Rss32;
+ let Inst{20-16} = Rss32{4-0};
+ bits <5> Rtt32;
+ let Inst{12-8} = Rtt32{4-0};
+ bits <5> Rdd32;
+ let Inst{4-0} = Rdd32{4-0};
+}
+class Enc_8131399 : OpcodeHexagon {
+ bits <6> II;
+ let Inst{5-0} = II{5-0};
+ bits <5> Rtt32;
+ let Inst{12-8} = Rtt32{4-0};
+ bits <5> Re32;
+ let Inst{20-16} = Re32{4-0};
+}
+class Enc_11522288 : OpcodeHexagon {
+ bits <8> Ii;
+ let Inst{12-5} = Ii{7-0};
+ bits <5> Rs32;
+ let Inst{20-16} = Rs32{4-0};
+ bits <5> Rx32;
+ let Inst{4-0} = Rx32{4-0};
+}
+class Enc_114098 : OpcodeHexagon {
+ bits <2> Ii;
+ let Inst{13-13} = Ii{1-1};
+ let Inst{5-5} = Ii{0-0};
+ bits <5> Rss32;
+ let Inst{20-16} = Rss32{4-0};
+ bits <5> Rt32;
+ let Inst{12-8} = Rt32{4-0};
+ bits <5> Rdd32;
+ let Inst{4-0} = Rdd32{4-0};
+}
+class Enc_5654851 : OpcodeHexagon {
+ bits <5> Ii;
+ let Inst{12-8} = Ii{4-0};
+ bits <5> Rs32;
+ let Inst{20-16} = Rs32{4-0};
+ bits <5> Rdd32;
+ let Inst{4-0} = Rdd32{4-0};
+}
+class Enc_12023037 : OpcodeHexagon {
+ bits <2> Ps4;
+ let Inst{6-5} = Ps4{1-0};
+ bits <5> Vu32;
+ let Inst{12-8} = Vu32{4-0};
+ bits <5> Vd32;
+ let Inst{4-0} = Vd32{4-0};
+}
+class Enc_176263 : OpcodeHexagon {
+ bits <8> Ii;
+ let Inst{9-4} = Ii{7-2};
+ bits <4> Rd16;
+ let Inst{3-0} = Rd16{3-0};
+}
+class Enc_6130414 : OpcodeHexagon {
+ bits <16> Ii;
+ let Inst{23-22} = Ii{15-14};
+ let Inst{13-0} = Ii{13-0};
+ bits <5> Rx32;
+ let Inst{20-16} = Rx32{4-0};
+}
+class Enc_631197 : OpcodeHexagon {
+ bits <6> Ii;
+ let Inst{13-8} = Ii{5-0};
+ bits <6> II;
+ let Inst{23-21} = II{5-3};
+ let Inst{7-5} = II{2-0};
+ bits <5> Rss32;
+ let Inst{20-16} = Rss32{4-0};
+ bits <5> Rxx32;
+ let Inst{4-0} = Rxx32{4-0};
+}
+class Enc_16214129 : OpcodeHexagon {
+ bits <5> Vu32;
+ let Inst{12-8} = Vu32{4-0};
+ bits <5> Rt32;
+ let Inst{20-16} = Rt32{4-0};
+ bits <5> Vd32;
+ let Inst{4-0} = Vd32{4-0};
+}
+class Enc_8333157 : OpcodeHexagon {
+ bits <5> Rss32;
+ let Inst{20-16} = Rss32{4-0};
+ bits <5> Rtt32;
+ let Inst{12-8} = Rtt32{4-0};
+ bits <5> Rdd32;
+ let Inst{4-0} = Rdd32{4-0};
+}
+class Enc_4834775 : OpcodeHexagon {
+ bits <6> II;
+ let Inst{13-8} = II{5-0};
+ bits <11> Ii;
+ let Inst{21-20} = Ii{10-9};
+ let Inst{7-1} = Ii{8-2};
+ bits <4> Rd16;
+ let Inst{19-16} = Rd16{3-0};
+}
+class Enc_16601956 : OpcodeHexagon {
+ bits <5> Vu32;
+ let Inst{12-8} = Vu32{4-0};
+ bits <5> Rs32;
+ let Inst{20-16} = Rs32{4-0};
+ bits <5> Rd32;
+ let Inst{4-0} = Rd32{4-0};
+}
+class Enc_15946706 : OpcodeHexagon {
+ bits <2> Ii;
+ let Inst{6-5} = Ii{1-0};
+ bits <3> Rdd8;
+ let Inst{2-0} = Rdd8{2-0};
+}
+class Enc_6923828 : OpcodeHexagon {
+ bits <4> Ii;
+ let Inst{13-13} = Ii{3-3};
+ let Inst{10-8} = Ii{2-0};
+ bits <5> Rt32;
+ let Inst{20-16} = Rt32{4-0};
+ bits <5> Vs32;
+ let Inst{4-0} = Vs32{4-0};
+}
+class Enc_1332717 : OpcodeHexagon {
+ bits <2> Pu4;
+ let Inst{6-5} = Pu4{1-0};
+ bits <5> Rt32;
+ let Inst{12-8} = Rt32{4-0};
+ bits <5> Rs32;
+ let Inst{20-16} = Rs32{4-0};
+ bits <5> Rd32;
+ let Inst{4-0} = Rd32{4-0};
+}
+class Enc_1786883 : OpcodeHexagon {
+ bits <5> Rss32;
+ let Inst{20-16} = Rss32{4-0};
+ bits <6> Sdd64;
+ let Inst{5-0} = Sdd64{5-0};
+}
+class Enc_14303394 : OpcodeHexagon {
+ bits <6> Ii;
+ let Inst{8-5} = Ii{5-2};
+ bits <1> Mu2;
+ let Inst{13-13} = Mu2{0-0};
+ bits <5> Rd32;
+ let Inst{4-0} = Rd32{4-0};
+ bits <5> Rx32;
+ let Inst{20-16} = Rx32{4-0};
+}
+class Enc_9282127 : OpcodeHexagon {
+ bits <8> Ii;
+ let Inst{12-7} = Ii{7-2};
+ bits <8> II;
+ let Inst{13-13} = II{7-7};
+ let Inst{6-0} = II{6-0};
+ bits <5> Rs32;
+ let Inst{20-16} = Rs32{4-0};
+}
+class Enc_2813446 : OpcodeHexagon {
+ bits <4> Ii;
+ let Inst{6-3} = Ii{3-0};
+ bits <2> Pv4;
+ let Inst{1-0} = Pv4{1-0};
+ bits <3> Nt8;
+ let Inst{10-8} = Nt8{2-0};
+ bits <5> Rx32;
+ let Inst{20-16} = Rx32{4-0};
+}
+class Enc_364753 : OpcodeHexagon {
+ bits <11> Ii;
+ let Inst{21-20} = Ii{10-9};
+ let Inst{7-1} = Ii{8-2};
+ bits <3> Ns8;
+ let Inst{18-16} = Ns8{2-0};
+ bits <4> n1;
+ let Inst{29-29} = n1{3-3};
+ let Inst{26-25} = n1{2-1};
+ let Inst{23-23} = n1{0-0};
+}
+class Enc_12477789 : OpcodeHexagon {
+ bits <15> Ii;
+ let Inst{21-21} = Ii{14-14};
+ let Inst{13-13} = Ii{13-13};
+ let Inst{11-1} = Ii{12-2};
+ bits <5> Rs32;
+ let Inst{20-16} = Rs32{4-0};
+}
+class Enc_44555 : OpcodeHexagon {
+ bits <5> Rt32;
+ let Inst{20-16} = Rt32{4-0};
+ bits <5> Vd32;
+ let Inst{7-3} = Vd32{4-0};
+}
+class Enc_8497723 : OpcodeHexagon {
+ bits <6> Ii;
+ let Inst{13-8} = Ii{5-0};
+ bits <5> Rss32;
+ let Inst{20-16} = Rss32{4-0};
+ bits <5> Rxx32;
+ let Inst{4-0} = Rxx32{4-0};
+}
+class Enc_4359901 : OpcodeHexagon {
+ bits <11> Ii;
+ let Inst{21-20} = Ii{10-9};
+ let Inst{7-1} = Ii{8-2};
+ bits <3> Ns8;
+ let Inst{18-16} = Ns8{2-0};
+ bits <4> n1;
+ let Inst{29-29} = n1{3-3};
+ let Inst{26-25} = n1{2-1};
+ let Inst{22-22} = n1{0-0};
+}
+class Enc_11271630 : OpcodeHexagon {
+ bits <7> Ii;
+ let Inst{6-3} = Ii{6-3};
+ bits <5> Rtt32;
+ let Inst{12-8} = Rtt32{4-0};
+ bits <5> Rx32;
+ let Inst{20-16} = Rx32{4-0};
+}
+class Enc_10501894 : OpcodeHexagon {
+ bits <4> Rs16;
+ let Inst{7-4} = Rs16{3-0};
+ bits <3> Rdd8;
+ let Inst{2-0} = Rdd8{2-0};
+}
+class Enc_9768377 : OpcodeHexagon {
+ bits <5> Rt32;
+ let Inst{20-16} = Rt32{4-0};
+ bits <5> Vd32;
+ let Inst{4-0} = Vd32{4-0};
+}
+class Enc_16268019 : OpcodeHexagon {
+ bits <5> Vuu32;
+ let Inst{20-16} = Vuu32{4-0};
+ bits <5> Vvv32;
+ let Inst{12-8} = Vvv32{4-0};
+ bits <5> Vdd32;
+ let Inst{7-3} = Vdd32{4-0};
+}
+class Enc_8814718 : OpcodeHexagon {
+ bits <18> Ii;
+ let Inst{26-25} = Ii{17-16};
+ let Inst{20-16} = Ii{15-11};
+ let Inst{13-5} = Ii{10-2};
+ bits <5> Rd32;
+ let Inst{4-0} = Rd32{4-0};
+}
+class Enc_6212930 : OpcodeHexagon {
+ bits <6> Ii;
+ let Inst{8-5} = Ii{5-2};
+ bits <2> Pt4;
+ let Inst{10-9} = Pt4{1-0};
+ bits <5> Rd32;
+ let Inst{4-0} = Rd32{4-0};
+ bits <5> Rx32;
+ let Inst{20-16} = Rx32{4-0};
+}
+class Enc_5462762 : OpcodeHexagon {
+ bits <5> Rt32;
+ let Inst{20-16} = Rt32{4-0};
+ bits <1> Mu2;
+ let Inst{13-13} = Mu2{0-0};
+ bits <5> Vv32;
+ let Inst{12-8} = Vv32{4-0};
+ bits <5> Vw32;
+ let Inst{4-0} = Vw32{4-0};
+}
+class Enc_6154421 : OpcodeHexagon {
+ bits <7> Ii;
+ let Inst{13-13} = Ii{6-6};
+ let Inst{7-3} = Ii{5-1};
+ bits <2> Pv4;
+ let Inst{1-0} = Pv4{1-0};
+ bits <5> Rs32;
+ let Inst{20-16} = Rs32{4-0};
+ bits <3> Nt8;
+ let Inst{10-8} = Nt8{2-0};
+}
+class Enc_8940892 : OpcodeHexagon {
+ bits <5> Rss32;
+ let Inst{20-16} = Rss32{4-0};
+ bits <5> Rt32;
+ let Inst{12-8} = Rt32{4-0};
+ bits <5> Rdd32;
+ let Inst{4-0} = Rdd32{4-0};
+}
+class Enc_3531000 : OpcodeHexagon {
+ bits <7> Ii;
+ let Inst{11-5} = Ii{6-0};
+ bits <5> Rs32;
+ let Inst{20-16} = Rs32{4-0};
+ bits <2> Pd4;
+ let Inst{1-0} = Pd4{1-0};
+}
+class Enc_14311138 : OpcodeHexagon {
+ bits <5> Vuu32;
+ let Inst{20-16} = Vuu32{4-0};
+ bits <5> Vd32;
+ let Inst{7-3} = Vd32{4-0};
+}
+class Enc_2216485 : OpcodeHexagon {
+ bits <6> Ii;
+ let Inst{22-21} = Ii{5-4};
+ let Inst{13-13} = Ii{3-3};
+ let Inst{7-5} = Ii{2-0};
+ bits <5> Rs32;
+ let Inst{20-16} = Rs32{4-0};
+ bits <5> Rt32;
+ let Inst{12-8} = Rt32{4-0};
+ bits <5> Rd32;
+ let Inst{4-0} = Rd32{4-0};
+}
+class Enc_12395768 : OpcodeHexagon {
+ bits <16> Ii;
+ let Inst{26-25} = Ii{15-14};
+ let Inst{20-16} = Ii{13-9};
+ let Inst{13-13} = Ii{8-8};
+ let Inst{7-0} = Ii{7-0};
+ bits <5> Rt32;
+ let Inst{12-8} = Rt32{4-0};
+}
+class Enc_11047413 : OpcodeHexagon {
+ bits <6> II;
+ let Inst{11-8} = II{5-2};
+ let Inst{6-5} = II{1-0};
+ bits <5> Ryy32;
+ let Inst{4-0} = Ryy32{4-0};
+ bits <5> Re32;
+ let Inst{20-16} = Re32{4-0};
+}
+class Enc_1256611 : OpcodeHexagon {
+ bits <5> Vu32;
+ let Inst{12-8} = Vu32{4-0};
+ bits <5> Rs32;
+ let Inst{20-16} = Rs32{4-0};
+ bits <5> Rdd32;
+ let Inst{4-0} = Rdd32{4-0};
+}
+class Enc_7884306 : OpcodeHexagon {
+ bits <8> Ii;
+ let Inst{8-4} = Ii{7-3};
+}
+class Enc_11244923 : OpcodeHexagon {
+ bits <3> Ii;
+ let Inst{10-8} = Ii{2-0};
+ bits <3> Os8;
+ let Inst{2-0} = Os8{2-0};
+ bits <5> Rx32;
+ let Inst{20-16} = Rx32{4-0};
+}
+class Enc_8612939 : OpcodeHexagon {
+ bits <11> Ii;
+ let Inst{21-20} = Ii{10-9};
+ let Inst{7-1} = Ii{8-2};
+ bits <3> Ns8;
+ let Inst{18-16} = Ns8{2-0};
+ bits <5> n1;
+ let Inst{29-29} = n1{4-4};
+ let Inst{26-25} = n1{3-2};
+ let Inst{22-22} = n1{1-1};
+ let Inst{13-13} = n1{0-0};
+}
+class Enc_16355964 : OpcodeHexagon {
+ bits <8> Ii;
+ let Inst{12-5} = Ii{7-0};
+ bits <5> Rs32;
+ let Inst{20-16} = Rs32{4-0};
+ bits <5> Rd32;
+ let Inst{4-0} = Rd32{4-0};
+}
+class Enc_12616482 : OpcodeHexagon {
+ bits <6> II;
+ let Inst{11-8} = II{5-2};
+ let Inst{6-5} = II{1-0};
+ bits <5> Rd32;
+ let Inst{4-0} = Rd32{4-0};
+ bits <5> Re32;
+ let Inst{20-16} = Re32{4-0};
+}
+class Enc_5915771 : OpcodeHexagon {
+ bits <11> Ii;
+ let Inst{21-20} = Ii{10-9};
+ let Inst{7-1} = Ii{8-2};
+ bits <4> Rs16;
+ let Inst{19-16} = Rs16{3-0};
+ bits <5> n1;
+ let Inst{28-28} = n1{4-4};
+ let Inst{24-22} = n1{3-1};
+ let Inst{8-8} = n1{0-0};
+}
+class Enc_14459927 : OpcodeHexagon {
+ bits <3> Ii;
+ let Inst{10-8} = Ii{2-0};
+ bits <2> Pv4;
+ let Inst{12-11} = Pv4{1-0};
+ bits <5> Vs32;
+ let Inst{4-0} = Vs32{4-0};
+ bits <5> Rx32;
+ let Inst{20-16} = Rx32{4-0};
+}
+class Enc_7504828 : OpcodeHexagon {
+ bits <10> Ii;
+ let Inst{21-21} = Ii{9-9};
+ let Inst{13-5} = Ii{8-0};
+ bits <5> Ru32;
+ let Inst{4-0} = Ru32{4-0};
+ bits <5> Rx32;
+ let Inst{20-16} = Rx32{4-0};
+}
+class Enc_14209223 : OpcodeHexagon {
+ bits <5> Vu32;
+ let Inst{20-16} = Vu32{4-0};
+ bits <5> Vdd32;
+ let Inst{7-3} = Vdd32{4-0};
+}
+class Enc_3931661 : OpcodeHexagon {
+ bits <6> Ii;
+ let Inst{8-5} = Ii{5-2};
+ bits <1> Mu2;
+ let Inst{13-13} = Mu2{0-0};
+ bits <5> Rdd32;
+ let Inst{4-0} = Rdd32{4-0};
+ bits <5> Rx32;
+ let Inst{20-16} = Rx32{4-0};
+}
+class Enc_13606251 : OpcodeHexagon {
+ bits <6> Ii;
+ let Inst{11-8} = Ii{5-2};
+ bits <4> Rs16;
+ let Inst{7-4} = Rs16{3-0};
+ bits <4> Rd16;
+ let Inst{3-0} = Rd16{3-0};
+}
+class Enc_11475992 : OpcodeHexagon {
+ bits <5> Vu32;
+ let Inst{12-8} = Vu32{4-0};
+ bits <5> Rt32;
+ let Inst{20-16} = Rt32{4-0};
+ bits <5> Vdd32;
+ let Inst{7-3} = Vdd32{4-0};
+}
+class Enc_13133231 : OpcodeHexagon {
+ bits <5> Rss32;
+ let Inst{20-16} = Rss32{4-0};
+ bits <5> Rdd32;
+ let Inst{4-0} = Rdd32{4-0};
+}
+class Enc_9959498 : OpcodeHexagon {
+ bits <8> Ii;
+ let Inst{22-21} = Ii{7-6};
+ let Inst{13-13} = Ii{5-5};
+ let Inst{7-5} = Ii{4-2};
+ bits <5> Ru32;
+ let Inst{4-0} = Ru32{4-0};
+ bits <5> Rs32;
+ let Inst{20-16} = Rs32{4-0};
+ bits <5> Rd32;
+ let Inst{12-8} = Rd32{4-0};
+}
+class Enc_8919369 : OpcodeHexagon {
+ bits <11> Ii;
+ let Inst{21-20} = Ii{10-9};
+ let Inst{7-1} = Ii{8-2};
+ bits <4> Rs16;
+ let Inst{19-16} = Rs16{3-0};
+ bits <5> n1;
+ let Inst{28-28} = n1{4-4};
+ let Inst{24-23} = n1{3-2};
+ let Inst{13-13} = n1{1-1};
+ let Inst{8-8} = n1{0-0};
+}
+class Enc_2968094 : OpcodeHexagon {
+ bits <7> Ii;
+ let Inst{11-5} = Ii{6-0};
+ bits <5> Rss32;
+ let Inst{20-16} = Rss32{4-0};
+ bits <2> Pd4;
+ let Inst{1-0} = Pd4{1-0};
+}
+class Enc_4813442 : OpcodeHexagon {
+ bits <6> Ii;
+ let Inst{6-3} = Ii{5-2};
+ bits <2> Pv4;
+ let Inst{1-0} = Pv4{1-0};
+ bits <3> Nt8;
+ let Inst{10-8} = Nt8{2-0};
+ bits <5> Rx32;
+ let Inst{20-16} = Rx32{4-0};
+}
+class Enc_4684887 : OpcodeHexagon {
+ bits <11> Ii;
+ let Inst{21-20} = Ii{10-9};
+ let Inst{7-1} = Ii{8-2};
+ bits <4> Rs16;
+ let Inst{19-16} = Rs16{3-0};
+ bits <4> n1;
+ let Inst{28-28} = n1{3-3};
+ let Inst{25-23} = n1{2-0};
+}
+class Enc_15606259 : OpcodeHexagon {
+ bits <4> Ii;
+ let Inst{11-8} = Ii{3-0};
+ bits <4> Rs16;
+ let Inst{7-4} = Rs16{3-0};
+ bits <4> Rd16;
+ let Inst{3-0} = Rd16{3-0};
+}
+class Enc_2268028 : OpcodeHexagon {
+ bits <3> Qtt8;
+ let Inst{10-8} = Qtt8{2-0};
+ bits <3> Qdd8;
+ let Inst{5-3} = Qdd8{2-0};
+}
+class Enc_13430430 : OpcodeHexagon {
+ bits <5> Vu32;
+ let Inst{12-8} = Vu32{4-0};
+ bits <5> Rt32;
+ let Inst{20-16} = Rt32{4-0};
+ bits <5> Vd32;
+ let Inst{7-3} = Vd32{4-0};
+ bits <3> Qxx8;
+ let Inst{2-0} = Qxx8{2-0};
+}
+class Enc_13336212 : OpcodeHexagon {
+ bits <4> Rd16;
+ let Inst{3-0} = Rd16{3-0};
+ bits <1> n1;
+ let Inst{9-9} = n1{0-0};
+}
+class Enc_15008287 : OpcodeHexagon {
+ bits <5> Vu32;
+ let Inst{20-16} = Vu32{4-0};
+ bits <3> Rt8;
+ let Inst{2-0} = Rt8{2-0};
+ bits <5> Vx32;
+ let Inst{7-3} = Vx32{4-0};
+ bits <5> Vy32;
+ let Inst{12-8} = Vy32{4-0};
+}
+class Enc_4897205 : OpcodeHexagon {
+ bits <2> Qs4;
+ let Inst{9-8} = Qs4{1-0};
+ bits <2> Qd4;
+ let Inst{1-0} = Qd4{1-0};
+}
+class Enc_8038806 : OpcodeHexagon {
+ bits <4> Ii;
+ let Inst{11-8} = Ii{3-0};
+ bits <5> Rss32;
+ let Inst{20-16} = Rss32{4-0};
+ bits <5> Rd32;
+ let Inst{4-0} = Rd32{4-0};
+}
+class Enc_12669374 : OpcodeHexagon {
+ bits <5> Vu32;
+ let Inst{12-8} = Vu32{4-0};
+ bits <5> Vxx32;
+ let Inst{4-0} = Vxx32{4-0};
+}
+class Enc_971347 : OpcodeHexagon {
+ bits <4> Ii;
+ let Inst{8-5} = Ii{3-0};
+ bits <1> Mu2;
+ let Inst{13-13} = Mu2{0-0};
+ bits <5> Ryy32;
+ let Inst{4-0} = Ryy32{4-0};
+ bits <5> Rx32;
+ let Inst{20-16} = Rx32{4-0};
+}
+class Enc_1997594 : OpcodeHexagon {
+ bits <5> Rs32;
+ let Inst{20-16} = Rs32{4-0};
+ bits <5> Rt32;
+ let Inst{12-8} = Rt32{4-0};
+ bits <5> Rdd32;
+ let Inst{4-0} = Rdd32{4-0};
+}
+class Enc_11940513 : OpcodeHexagon {
+ bits <2> Ii;
+ let Inst{13-13} = Ii{1-1};
+ let Inst{7-7} = Ii{0-0};
+ bits <2> Pv4;
+ let Inst{6-5} = Pv4{1-0};
+ bits <5> Rs32;
+ let Inst{20-16} = Rs32{4-0};
+ bits <5> Ru32;
+ let Inst{12-8} = Ru32{4-0};
+ bits <5> Rt32;
+ let Inst{4-0} = Rt32{4-0};
+}
+class Enc_2735552 : OpcodeHexagon {
+ bits <3> Ii;
+ let Inst{10-8} = Ii{2-0};
+ bits <2> Pv4;
+ let Inst{12-11} = Pv4{1-0};
+ bits <3> Os8;
+ let Inst{2-0} = Os8{2-0};
+ bits <5> Rx32;
+ let Inst{20-16} = Rx32{4-0};
+}
+class Enc_16410950 : OpcodeHexagon {
+ bits <1> Mu2;
+ let Inst{13-13} = Mu2{0-0};
+ bits <5> Rt32;
+ let Inst{12-8} = Rt32{4-0};
+ bits <5> Vs32;
+ let Inst{7-3} = Vs32{4-0};
+ bits <5> Rx32;
+ let Inst{20-16} = Rx32{4-0};
+}
+class Enc_6226085 : OpcodeHexagon {
+ bits <5> Ii;
+ let Inst{12-8} = Ii{4-0};
+ bits <5> II;
+ let Inst{22-21} = II{4-3};
+ let Inst{7-5} = II{2-0};
+ bits <5> Rd32;
+ let Inst{4-0} = Rd32{4-0};
+}
+class Enc_14193700 : OpcodeHexagon {
+ bits <6> II;
+ let Inst{5-0} = II{5-0};
+ bits <3> Nt8;
+ let Inst{10-8} = Nt8{2-0};
+ bits <5> Re32;
+ let Inst{20-16} = Re32{4-0};
+}
+class Enc_15763937 : OpcodeHexagon {
+ bits <11> Ii;
+ let Inst{21-20} = Ii{10-9};
+ let Inst{7-1} = Ii{8-2};
+ bits <3> Ns8;
+ let Inst{18-16} = Ns8{2-0};
+ bits <6> n1;
+ let Inst{29-29} = n1{5-5};
+ let Inst{26-25} = n1{4-3};
+ let Inst{23-22} = n1{2-1};
+ let Inst{13-13} = n1{0-0};
+}
+class Enc_2492727 : OpcodeHexagon {
+ bits <5> Rss32;
+ let Inst{20-16} = Rss32{4-0};
+ bits <5> Rt32;
+ let Inst{12-8} = Rt32{4-0};
+ bits <2> Pd4;
+ let Inst{1-0} = Pd4{1-0};
+}
+class Enc_13425035 : OpcodeHexagon {
+ bits <2> Qv4;
+ let Inst{12-11} = Qv4{1-0};
+ bits <1> Mu2;
+ let Inst{13-13} = Mu2{0-0};
+ bits <5> Vs32;
+ let Inst{4-0} = Vs32{4-0};
+ bits <5> Rx32;
+ let Inst{20-16} = Rx32{4-0};
+}
+class Enc_4135257 : OpcodeHexagon {
+ bits <4> Ii;
+ let Inst{10-8} = Ii{3-1};
+ bits <4> Rs16;
+ let Inst{7-4} = Rs16{3-0};
+ bits <4> Rd16;
+ let Inst{3-0} = Rd16{3-0};
+}
+class Enc_14631806 : OpcodeHexagon {
+ bits <5> Vu32;
+ let Inst{12-8} = Vu32{4-0};
+ bits <5> Vdd32;
+ let Inst{4-0} = Vdd32{4-0};
+}
+class Enc_12397062 : OpcodeHexagon {
+ bits <3> Ii;
+ let Inst{10-8} = Ii{2-0};
+ bits <2> Qv4;
+ let Inst{12-11} = Qv4{1-0};
+ bits <5> Vs32;
+ let Inst{4-0} = Vs32{4-0};
+ bits <5> Rx32;
+ let Inst{20-16} = Rx32{4-0};
+}
+class Enc_11959851 : OpcodeHexagon {
+ bits <7> Ii;
+ let Inst{6-3} = Ii{6-3};
+ bits <2> Pv4;
+ let Inst{1-0} = Pv4{1-0};
+ bits <5> Rtt32;
+ let Inst{12-8} = Rtt32{4-0};
+ bits <5> Rx32;
+ let Inst{20-16} = Rx32{4-0};
+}
diff --git a/lib/Target/Hexagon/HexagonDepInstrInfo.td b/lib/Target/Hexagon/HexagonDepInstrInfo.td
new file mode 100644
index 000000000000..2bfde9acaea9
--- /dev/null
+++ b/lib/Target/Hexagon/HexagonDepInstrInfo.td
@@ -0,0 +1,45573 @@
+//===--- HexagonDepInstrInfo.td -------------------------------------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
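+// Every record in this file follows the same shape:
+//   def NAME : HInst<(outs ...), (ins ...), "assembly syntax",
+//                    <itinerary>, <instruction type>>, Enc_* {
+//     let Inst{...} = 0b...;   // fixed opcode bits
+//     let <flag> = ...;        // predication, new-value, extender info
+//   }
+// The Enc_* mixin contributes the operand bit positions; the `let Inst`
+// assignments in the body contribute the fixed opcode bits.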
+def A2_abs : HInst<
+(outs IntRegs:$Rd32),
+(ins IntRegs:$Rs32),
+"$Rd32 = abs($Rs32)",
+S_2op_tc_2_SLOT23, TypeS_2op>, Enc_4075554 {
+let Inst{13-5} = 0b000000100;
+let Inst{31-21} = 0b10001100100;
+let hasNewValue = 1;
+let opNewValue = 0;
+}
+def A2_absp : HInst<
+(outs DoubleRegs:$Rdd32),
+(ins DoubleRegs:$Rss32),
+"$Rdd32 = abs($Rss32)",
+S_2op_tc_1_SLOT23, TypeS_2op>, Enc_13133231 {
+let Inst{13-5} = 0b000000110;
+let Inst{31-21} = 0b10000000100;
+}
+def A2_abssat : HInst<
+(outs IntRegs:$Rd32),
+(ins IntRegs:$Rs32),
+"$Rd32 = abs($Rs32):sat",
+S_2op_tc_2_SLOT23, TypeS_2op>, Enc_4075554 {
+let Inst{13-5} = 0b000000101;
+let Inst{31-21} = 0b10001100100;
+let hasNewValue = 1;
+let opNewValue = 0;
+let Defs = [USR_OVF];
+}
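+// Saturating instructions such as A2_abssat carry `Defs = [USR_OVF]`:
+// when the result saturates they set the sticky overflow bit in the user
+// status register, so that bit is modeled as an implicit def.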
+def A2_add : HInst<
+(outs IntRegs:$Rd32),
+(ins IntRegs:$Rs32, IntRegs:$Rt32),
+"$Rd32 = add($Rs32,$Rt32)",
+ALU32_3op_tc_1_SLOT0123, TypeALU32_3op>, Enc_14071773, PredNewRel, ImmRegRel {
+let Inst{7-5} = 0b000;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11110011000;
+let hasNewValue = 1;
+let opNewValue = 0;
+let CextOpcode = "A2_add";
+let InputType = "reg";
+let BaseOpcode = "A2_add";
+let isCommutable = 1;
+let isPredicable = 1;
+}
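+// A2_add also participates in two TableGen relation maps: ImmRegRel links
+// it to the immediate form A2_addi through the shared CextOpcode and
+// InputType fields, and PredNewRel links the base instruction to its
+// predicated and predicate-new variants through BaseOpcode.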
+def A2_addh_h16_hh : HInst<
+(outs IntRegs:$Rd32),
+(ins IntRegs:$Rt32, IntRegs:$Rs32),
+"$Rd32 = add($Rt32.h,$Rs32.h):<<16",
+ALU64_tc_1_SLOT23, TypeALU64>, Enc_8605375 {
+let Inst{7-5} = 0b011;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11010101010;
+let hasNewValue = 1;
+let opNewValue = 0;
+}
+def A2_addh_h16_hl : HInst<
+(outs IntRegs:$Rd32),
+(ins IntRegs:$Rt32, IntRegs:$Rs32),
+"$Rd32 = add($Rt32.h,$Rs32.l):<<16",
+ALU64_tc_1_SLOT23, TypeALU64>, Enc_8605375 {
+let Inst{7-5} = 0b010;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11010101010;
+let hasNewValue = 1;
+let opNewValue = 0;
+}
+def A2_addh_h16_lh : HInst<
+(outs IntRegs:$Rd32),
+(ins IntRegs:$Rt32, IntRegs:$Rs32),
+"$Rd32 = add($Rt32.l,$Rs32.h):<<16",
+ALU64_tc_1_SLOT23, TypeALU64>, Enc_8605375 {
+let Inst{7-5} = 0b001;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11010101010;
+let hasNewValue = 1;
+let opNewValue = 0;
+}
+def A2_addh_h16_ll : HInst<
+(outs IntRegs:$Rd32),
+(ins IntRegs:$Rt32, IntRegs:$Rs32),
+"$Rd32 = add($Rt32.l,$Rs32.l):<<16",
+ALU64_tc_1_SLOT23, TypeALU64>, Enc_8605375 {
+let Inst{7-5} = 0b000;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11010101010;
+let hasNewValue = 1;
+let opNewValue = 0;
+}
+def A2_addh_h16_sat_hh : HInst<
+(outs IntRegs:$Rd32),
+(ins IntRegs:$Rt32, IntRegs:$Rs32),
+"$Rd32 = add($Rt32.h,$Rs32.h):sat:<<16",
+ALU64_tc_2_SLOT23, TypeALU64>, Enc_8605375 {
+let Inst{7-5} = 0b111;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11010101010;
+let hasNewValue = 1;
+let opNewValue = 0;
+let Defs = [USR_OVF];
+}
+def A2_addh_h16_sat_hl : HInst<
+(outs IntRegs:$Rd32),
+(ins IntRegs:$Rt32, IntRegs:$Rs32),
+"$Rd32 = add($Rt32.h,$Rs32.l):sat:<<16",
+ALU64_tc_2_SLOT23, TypeALU64>, Enc_8605375 {
+let Inst{7-5} = 0b110;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11010101010;
+let hasNewValue = 1;
+let opNewValue = 0;
+let Defs = [USR_OVF];
+}
+def A2_addh_h16_sat_lh : HInst<
+(outs IntRegs:$Rd32),
+(ins IntRegs:$Rt32, IntRegs:$Rs32),
+"$Rd32 = add($Rt32.l,$Rs32.h):sat:<<16",
+ALU64_tc_2_SLOT23, TypeALU64>, Enc_8605375 {
+let Inst{7-5} = 0b101;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11010101010;
+let hasNewValue = 1;
+let opNewValue = 0;
+let Defs = [USR_OVF];
+}
+def A2_addh_h16_sat_ll : HInst<
+(outs IntRegs:$Rd32),
+(ins IntRegs:$Rt32, IntRegs:$Rs32),
+"$Rd32 = add($Rt32.l,$Rs32.l):sat:<<16",
+ALU64_tc_2_SLOT23, TypeALU64>, Enc_8605375 {
+let Inst{7-5} = 0b100;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11010101010;
+let hasNewValue = 1;
+let opNewValue = 0;
+let Defs = [USR_OVF];
+}
+def A2_addh_l16_hl : HInst<
+(outs IntRegs:$Rd32),
+(ins IntRegs:$Rt32, IntRegs:$Rs32),
+"$Rd32 = add($Rt32.l,$Rs32.h)",
+ALU64_tc_1_SLOT23, TypeALU64>, Enc_8605375 {
+let Inst{7-5} = 0b010;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11010101000;
+let hasNewValue = 1;
+let opNewValue = 0;
+}
+def A2_addh_l16_ll : HInst<
+(outs IntRegs:$Rd32),
+(ins IntRegs:$Rt32, IntRegs:$Rs32),
+"$Rd32 = add($Rt32.l,$Rs32.l)",
+ALU64_tc_1_SLOT23, TypeALU64>, Enc_8605375 {
+let Inst{7-5} = 0b000;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11010101000;
+let hasNewValue = 1;
+let opNewValue = 0;
+}
+def A2_addh_l16_sat_hl : HInst<
+(outs IntRegs:$Rd32),
+(ins IntRegs:$Rt32, IntRegs:$Rs32),
+"$Rd32 = add($Rt32.l,$Rs32.h):sat",
+ALU64_tc_2_SLOT23, TypeALU64>, Enc_8605375 {
+let Inst{7-5} = 0b110;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11010101000;
+let hasNewValue = 1;
+let opNewValue = 0;
+let Defs = [USR_OVF];
+}
+def A2_addh_l16_sat_ll : HInst<
+(outs IntRegs:$Rd32),
+(ins IntRegs:$Rt32, IntRegs:$Rs32),
+"$Rd32 = add($Rt32.l,$Rs32.l):sat",
+ALU64_tc_2_SLOT23, TypeALU64>, Enc_8605375 {
+let Inst{7-5} = 0b100;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11010101000;
+let hasNewValue = 1;
+let opNewValue = 0;
+let Defs = [USR_OVF];
+}
+def A2_addi : HInst<
+(outs IntRegs:$Rd32),
+(ins IntRegs:$Rs32, s32_0Imm:$Ii),
+"$Rd32 = add($Rs32,#$Ii)",
+ALU32_ADDI_tc_1_SLOT0123, TypeALU32_ADDI>, Enc_11542684, PredNewRel, ImmRegRel {
+let Inst{31-28} = 0b1011;
+let hasNewValue = 1;
+let opNewValue = 0;
+let CextOpcode = "A2_add";
+let InputType = "imm";
+let BaseOpcode = "A2_addi";
+let isPredicable = 1;
+let isAdd = 1;
+let isExtendable = 1;
+let opExtendable = 2;
+let isExtentSigned = 1;
+let opExtentBits = 16;
+let opExtentAlign = 0;
+}
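+// The isExtendable/opExtendable/opExtentBits fields describe Hexagon's
+// constant-extender mechanism: operand 2 of A2_addi (the immediate)
+// encodes 16 signed bits natively and can be widened to a full 32-bit
+// value by pairing the instruction with an extender word in the same
+// packet.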
+def A2_addp : HInst<
+(outs DoubleRegs:$Rdd32),
+(ins DoubleRegs:$Rss32, DoubleRegs:$Rtt32),
+"$Rdd32 = add($Rss32,$Rtt32)",
+ALU64_tc_1_SLOT23, TypeALU64>, Enc_8333157 {
+let Inst{7-5} = 0b111;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11010011000;
+let isCommutable = 1;
+let isAdd = 1;
+}
+def A2_addpsat : HInst<
+(outs DoubleRegs:$Rdd32),
+(ins DoubleRegs:$Rss32, DoubleRegs:$Rtt32),
+"$Rdd32 = add($Rss32,$Rtt32):sat",
+ALU64_tc_2_SLOT23, TypeALU64>, Enc_8333157 {
+let Inst{7-5} = 0b101;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11010011011;
+let Defs = [USR_OVF];
+let isCommutable = 1;
+}
+def A2_addsat : HInst<
+(outs IntRegs:$Rd32),
+(ins IntRegs:$Rs32, IntRegs:$Rt32),
+"$Rd32 = add($Rs32,$Rt32):sat",
+ALU32_3op_tc_2_SLOT0123, TypeALU32_3op>, Enc_14071773 {
+let Inst{7-5} = 0b000;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11110110010;
+let hasNewValue = 1;
+let opNewValue = 0;
+let Defs = [USR_OVF];
+let InputType = "reg";
+let isCommutable = 1;
+}
+def A2_addsp : HInst<
+(outs DoubleRegs:$Rdd32),
+(ins IntRegs:$Rs32, DoubleRegs:$Rtt32),
+"$Rdd32 = add($Rs32,$Rtt32)",
+ALU64_tc_1_SLOT23, TypeALU64> {
+let isPseudo = 1;
+}
+def A2_addsph : HInst<
+(outs DoubleRegs:$Rdd32),
+(ins DoubleRegs:$Rss32, DoubleRegs:$Rtt32),
+"$Rdd32 = add($Rss32,$Rtt32):raw:hi",
+ALU64_tc_1_SLOT23, TypeALU64>, Enc_8333157 {
+let Inst{7-5} = 0b111;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11010011011;
+}
+def A2_addspl : HInst<
+(outs DoubleRegs:$Rdd32),
+(ins DoubleRegs:$Rss32, DoubleRegs:$Rtt32),
+"$Rdd32 = add($Rss32,$Rtt32):raw:lo",
+ALU64_tc_1_SLOT23, TypeALU64>, Enc_8333157 {
+let Inst{7-5} = 0b110;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11010011011;
+}
+def A2_and : HInst<
+(outs IntRegs:$Rd32),
+(ins IntRegs:$Rs32, IntRegs:$Rt32),
+"$Rd32 = and($Rs32,$Rt32)",
+ALU32_3op_tc_1_SLOT0123, TypeALU32_3op>, Enc_14071773, PredNewRel, ImmRegRel {
+let Inst{7-5} = 0b000;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11110001000;
+let hasNewValue = 1;
+let opNewValue = 0;
+let CextOpcode = "A2_and";
+let InputType = "reg";
+let BaseOpcode = "A2_and";
+let isCommutable = 1;
+let isPredicable = 1;
+}
+def A2_andir : HInst<
+(outs IntRegs:$Rd32),
+(ins IntRegs:$Rs32, s32_0Imm:$Ii),
+"$Rd32 = and($Rs32,#$Ii)",
+ALU32_2op_tc_1_SLOT0123, TypeALU32_2op>, Enc_13472494, ImmRegRel {
+let Inst{31-22} = 0b0111011000;
+let hasNewValue = 1;
+let opNewValue = 0;
+let CextOpcode = "A2_and";
+let InputType = "imm";
+let isExtendable = 1;
+let opExtendable = 2;
+let isExtentSigned = 1;
+let opExtentBits = 10;
+let opExtentAlign = 0;
+}
+def A2_andp : HInst<
+(outs DoubleRegs:$Rdd32),
+(ins DoubleRegs:$Rss32, DoubleRegs:$Rtt32),
+"$Rdd32 = and($Rss32,$Rtt32)",
+ALU64_tc_1_SLOT23, TypeALU64>, Enc_8333157 {
+let Inst{7-5} = 0b000;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11010011111;
+let isCommutable = 1;
+}
+def A2_aslh : HInst<
+(outs IntRegs:$Rd32),
+(ins IntRegs:$Rs32),
+"$Rd32 = aslh($Rs32)",
+ALU32_2op_tc_1_SLOT0123, TypeALU32_2op>, Enc_4075554, PredNewRel {
+let Inst{13-5} = 0b000000000;
+let Inst{31-21} = 0b01110000000;
+let hasNewValue = 1;
+let opNewValue = 0;
+let BaseOpcode = "A2_aslh";
+let isPredicable = 1;
+}
+def A2_asrh : HInst<
+(outs IntRegs:$Rd32),
+(ins IntRegs:$Rs32),
+"$Rd32 = asrh($Rs32)",
+ALU32_2op_tc_1_SLOT0123, TypeALU32_2op>, Enc_4075554, PredNewRel {
+let Inst{13-5} = 0b000000000;
+let Inst{31-21} = 0b01110000001;
+let hasNewValue = 1;
+let opNewValue = 0;
+let BaseOpcode = "A2_asrh";
+let isPredicable = 1;
+}
+def A2_combine_hh : HInst<
+(outs IntRegs:$Rd32),
+(ins IntRegs:$Rt32, IntRegs:$Rs32),
+"$Rd32 = combine($Rt32.h,$Rs32.h)",
+ALU32_3op_tc_1_SLOT0123, TypeALU32_3op>, Enc_8605375 {
+let Inst{7-5} = 0b000;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11110011100;
+let hasNewValue = 1;
+let opNewValue = 0;
+let InputType = "reg";
+}
+def A2_combine_hl : HInst<
+(outs IntRegs:$Rd32),
+(ins IntRegs:$Rt32, IntRegs:$Rs32),
+"$Rd32 = combine($Rt32.h,$Rs32.l)",
+ALU32_3op_tc_1_SLOT0123, TypeALU32_3op>, Enc_8605375 {
+let Inst{7-5} = 0b000;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11110011101;
+let hasNewValue = 1;
+let opNewValue = 0;
+let InputType = "reg";
+}
+def A2_combine_lh : HInst<
+(outs IntRegs:$Rd32),
+(ins IntRegs:$Rt32, IntRegs:$Rs32),
+"$Rd32 = combine($Rt32.l,$Rs32.h)",
+ALU32_3op_tc_1_SLOT0123, TypeALU32_3op>, Enc_8605375 {
+let Inst{7-5} = 0b000;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11110011110;
+let hasNewValue = 1;
+let opNewValue = 0;
+let InputType = "reg";
+}
+def A2_combine_ll : HInst<
+(outs IntRegs:$Rd32),
+(ins IntRegs:$Rt32, IntRegs:$Rs32),
+"$Rd32 = combine($Rt32.l,$Rs32.l)",
+ALU32_3op_tc_1_SLOT0123, TypeALU32_3op>, Enc_8605375 {
+let Inst{7-5} = 0b000;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11110011111;
+let hasNewValue = 1;
+let opNewValue = 0;
+let InputType = "reg";
+}
+def A2_combineii : HInst<
+(outs DoubleRegs:$Rdd32),
+(ins s32_0Imm:$Ii, s8_0Imm:$II),
+"$Rdd32 = combine(#$Ii,#$II)",
+ALU32_2op_tc_1_SLOT0123, TypeALU32_2op>, Enc_14007201 {
+let Inst{31-23} = 0b011111000;
+let isReMaterializable = 1;
+let isAsCheapAsAMove = 1;
+let isMoveImm = 1;
+let isExtendable = 1;
+let opExtendable = 1;
+let isExtentSigned = 1;
+let opExtentBits = 8;
+let opExtentAlign = 0;
+}
+def A2_combinew : HInst<
+(outs DoubleRegs:$Rdd32),
+(ins IntRegs:$Rs32, IntRegs:$Rt32),
+"$Rdd32 = combine($Rs32,$Rt32)",
+ALU32_3op_tc_1_SLOT0123, TypeALU32_3op>, Enc_1997594, PredNewRel {
+let Inst{7-5} = 0b000;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11110101000;
+let InputType = "reg";
+let BaseOpcode = "A2_combinew";
+let isPredicable = 1;
+}
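+// Worked example: A2_combinew uses Enc_1997594 above (Rs32 -> Inst{20-16},
+// Rt32 -> Inst{12-8}, Rdd32 -> Inst{4-0}), so "r1:0 = combine(r2,r3)"
+// assembles as
+//   31-21: 11110101000   20-16: 00010 (r2)    13: 0
+//   12-8:  00011 (r3)    7-5:   000           4-0: 00000 (r1:0)
+// (register pairs are encoded by their even member; bits 15-14 are the
+// packet parse bits and are filled in at packet-assembly time).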
+def A2_max : HInst<
+(outs IntRegs:$Rd32),
+(ins IntRegs:$Rs32, IntRegs:$Rt32),
+"$Rd32 = max($Rs32,$Rt32)",
+ALU64_tc_2_SLOT23, TypeALU64>, Enc_14071773 {
+let Inst{7-5} = 0b000;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11010101110;
+let hasNewValue = 1;
+let opNewValue = 0;
+}
+def A2_maxp : HInst<
+(outs DoubleRegs:$Rdd32),
+(ins DoubleRegs:$Rss32, DoubleRegs:$Rtt32),
+"$Rdd32 = max($Rss32,$Rtt32)",
+ALU64_tc_2_SLOT23, TypeALU64>, Enc_8333157 {
+let Inst{7-5} = 0b100;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11010011110;
+}
+def A2_maxu : HInst<
+(outs IntRegs:$Rd32),
+(ins IntRegs:$Rs32, IntRegs:$Rt32),
+"$Rd32 = maxu($Rs32,$Rt32)",
+ALU64_tc_2_SLOT23, TypeALU64>, Enc_14071773 {
+let Inst{7-5} = 0b100;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11010101110;
+let hasNewValue = 1;
+let opNewValue = 0;
+}
+def A2_maxup : HInst<
+(outs DoubleRegs:$Rdd32),
+(ins DoubleRegs:$Rss32, DoubleRegs:$Rtt32),
+"$Rdd32 = maxu($Rss32,$Rtt32)",
+ALU64_tc_2_SLOT23, TypeALU64>, Enc_8333157 {
+let Inst{7-5} = 0b101;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11010011110;
+}
+def A2_min : HInst<
+(outs IntRegs:$Rd32),
+(ins IntRegs:$Rt32, IntRegs:$Rs32),
+"$Rd32 = min($Rt32,$Rs32)",
+ALU64_tc_2_SLOT23, TypeALU64>, Enc_8605375 {
+let Inst{7-5} = 0b000;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11010101101;
+let hasNewValue = 1;
+let opNewValue = 0;
+}
+def A2_minp : HInst<
+(outs DoubleRegs:$Rdd32),
+(ins DoubleRegs:$Rtt32, DoubleRegs:$Rss32),
+"$Rdd32 = min($Rtt32,$Rss32)",
+ALU64_tc_2_SLOT23, TypeALU64>, Enc_11687333 {
+let Inst{7-5} = 0b110;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11010011101;
+}
+def A2_minu : HInst<
+(outs IntRegs:$Rd32),
+(ins IntRegs:$Rt32, IntRegs:$Rs32),
+"$Rd32 = minu($Rt32,$Rs32)",
+ALU64_tc_2_SLOT23, TypeALU64>, Enc_8605375 {
+let Inst{7-5} = 0b100;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11010101101;
+let hasNewValue = 1;
+let opNewValue = 0;
+}
+def A2_minup : HInst<
+(outs DoubleRegs:$Rdd32),
+(ins DoubleRegs:$Rtt32, DoubleRegs:$Rss32),
+"$Rdd32 = minu($Rtt32,$Rss32)",
+ALU64_tc_2_SLOT23, TypeALU64>, Enc_11687333 {
+let Inst{7-5} = 0b111;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11010011101;
+}
+def A2_neg : HInst<
+(outs IntRegs:$Rd32),
+(ins IntRegs:$Rs32),
+"$Rd32 = neg($Rs32)",
+PSEUDO, TypeALU32_2op> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+}
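+// A2_neg has no Enc_* mixin and no fixed Inst bits: isPseudo and
+// isCodeGenOnly mark it as a codegen-only alias that is expanded before
+// emission (the architecture manual defines neg(Rs) as sub(#0,Rs)).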
+def A2_negp : HInst<
+(outs DoubleRegs:$Rdd32),
+(ins DoubleRegs:$Rss32),
+"$Rdd32 = neg($Rss32)",
+S_2op_tc_1_SLOT23, TypeS_2op>, Enc_13133231 {
+let Inst{13-5} = 0b000000101;
+let Inst{31-21} = 0b10000000100;
+}
+def A2_negsat : HInst<
+(outs IntRegs:$Rd32),
+(ins IntRegs:$Rs32),
+"$Rd32 = neg($Rs32):sat",
+S_2op_tc_2_SLOT23, TypeS_2op>, Enc_4075554 {
+let Inst{13-5} = 0b000000110;
+let Inst{31-21} = 0b10001100100;
+let hasNewValue = 1;
+let opNewValue = 0;
+let Defs = [USR_OVF];
+}
+def A2_nop : HInst<
+(outs),
+(ins),
+"nop",
+ALU32_2op_tc_1_SLOT0123, TypeALU32_2op>, Enc_0 {
+let Inst{13-0} = 0b00000000000000;
+let Inst{31-16} = 0b0111111100000000;
+}
+def A2_not : HInst<
+(outs IntRegs:$Rd32),
+(ins IntRegs:$Rs32),
+"$Rd32 = not($Rs32)",
+ALU32_2op_tc_1_SLOT0123, TypeALU32_2op> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+}
+def A2_notp : HInst<
+(outs DoubleRegs:$Rdd32),
+(ins DoubleRegs:$Rss32),
+"$Rdd32 = not($Rss32)",
+S_2op_tc_1_SLOT23, TypeS_2op>, Enc_13133231 {
+let Inst{13-5} = 0b000000100;
+let Inst{31-21} = 0b10000000100;
+}
+def A2_or : HInst<
+(outs IntRegs:$Rd32),
+(ins IntRegs:$Rs32, IntRegs:$Rt32),
+"$Rd32 = or($Rs32,$Rt32)",
+ALU32_3op_tc_1_SLOT0123, TypeALU32_3op>, Enc_14071773, PredNewRel, ImmRegRel {
+let Inst{7-5} = 0b000;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11110001001;
+let hasNewValue = 1;
+let opNewValue = 0;
+let CextOpcode = "A2_or";
+let InputType = "reg";
+let BaseOpcode = "A2_or";
+let isCommutable = 1;
+let isPredicable = 1;
+}
+def A2_orir : HInst<
+(outs IntRegs:$Rd32),
+(ins IntRegs:$Rs32, s32_0Imm:$Ii),
+"$Rd32 = or($Rs32,#$Ii)",
+ALU32_2op_tc_1_SLOT0123, TypeALU32_2op>, Enc_13472494, ImmRegRel {
+let Inst{31-22} = 0b0111011010;
+let hasNewValue = 1;
+let opNewValue = 0;
+let CextOpcode = "A2_or";
+let InputType = "imm";
+let isExtendable = 1;
+let opExtendable = 2;
+let isExtentSigned = 1;
+let opExtentBits = 10;
+let opExtentAlign = 0;
+}
+def A2_orp : HInst<
+(outs DoubleRegs:$Rdd32),
+(ins DoubleRegs:$Rss32, DoubleRegs:$Rtt32),
+"$Rdd32 = or($Rss32,$Rtt32)",
+ALU64_tc_1_SLOT23, TypeALU64>, Enc_8333157 {
+let Inst{7-5} = 0b010;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11010011111;
+let isCommutable = 1;
+}
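+// The A2_p*t/A2_p*f records below are the predicated forms of the plain
+// ALU32 instructions above.  The suffix gives the predicate sense and
+// timing: t executes when $Pu4 is true (Inst{7-7} = 0 in the register
+// forms), f when it is false (Inst{7-7} = 1), and the *new variants
+// (Inst{13-13} = 1) test a predicate produced earlier in the same packet
+// ($Pu4.new).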
+def A2_paddf : HInst<
+(outs IntRegs:$Rd32),
+(ins PredRegs:$Pu4, IntRegs:$Rs32, IntRegs:$Rt32),
+"if (!$Pu4) $Rd32 = add($Rs32,$Rt32)",
+ALU32_3op_tc_1_SLOT0123, TypeALU32_3op>, Enc_9626139, PredNewRel, ImmRegRel {
+let Inst{7-7} = 0b1;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11111011000;
+let isPredicated = 1;
+let isPredicatedFalse = 1;
+let hasNewValue = 1;
+let opNewValue = 0;
+let CextOpcode = "A2_add";
+let InputType = "reg";
+let BaseOpcode = "A2_add";
+}
+def A2_paddfnew : HInst<
+(outs IntRegs:$Rd32),
+(ins PredRegs:$Pu4, IntRegs:$Rs32, IntRegs:$Rt32),
+"if (!$Pu4.new) $Rd32 = add($Rs32,$Rt32)",
+ALU32_3op_tc_1_SLOT0123, TypeALU32_3op>, Enc_9626139, PredNewRel, ImmRegRel {
+let Inst{7-7} = 0b1;
+let Inst{13-13} = 0b1;
+let Inst{31-21} = 0b11111011000;
+let isPredicated = 1;
+let isPredicatedFalse = 1;
+let hasNewValue = 1;
+let opNewValue = 0;
+let isPredicatedNew = 1;
+let CextOpcode = "A2_add";
+let InputType = "reg";
+let BaseOpcode = "A2_add";
+}
+def A2_paddif : HInst<
+(outs IntRegs:$Rd32),
+(ins PredRegs:$Pu4, IntRegs:$Rs32, s32_0Imm:$Ii),
+"if (!$Pu4) $Rd32 = add($Rs32,#$Ii)",
+ALU32_2op_tc_1_SLOT0123, TypeALU32_2op>, Enc_10568534, PredNewRel, ImmRegRel {
+let Inst{13-13} = 0b0;
+let Inst{31-23} = 0b011101001;
+let isPredicated = 1;
+let isPredicatedFalse = 1;
+let hasNewValue = 1;
+let opNewValue = 0;
+let CextOpcode = "A2_add";
+let InputType = "imm";
+let BaseOpcode = "A2_addi";
+let isExtendable = 1;
+let opExtendable = 3;
+let isExtentSigned = 1;
+let opExtentBits = 8;
+let opExtentAlign = 0;
+}
+def A2_paddifnew : HInst<
+(outs IntRegs:$Rd32),
+(ins PredRegs:$Pu4, IntRegs:$Rs32, s32_0Imm:$Ii),
+"if (!$Pu4.new) $Rd32 = add($Rs32,#$Ii)",
+ALU32_2op_tc_1_SLOT0123, TypeALU32_2op>, Enc_10568534, PredNewRel, ImmRegRel {
+let Inst{13-13} = 0b1;
+let Inst{31-23} = 0b011101001;
+let isPredicated = 1;
+let isPredicatedFalse = 1;
+let hasNewValue = 1;
+let opNewValue = 0;
+let isPredicatedNew = 1;
+let CextOpcode = "A2_add";
+let InputType = "imm";
+let BaseOpcode = "A2_addi";
+let isExtendable = 1;
+let opExtendable = 3;
+let isExtentSigned = 1;
+let opExtentBits = 8;
+let opExtentAlign = 0;
+}
+def A2_paddit : HInst<
+(outs IntRegs:$Rd32),
+(ins PredRegs:$Pu4, IntRegs:$Rs32, s32_0Imm:$Ii),
+"if ($Pu4) $Rd32 = add($Rs32,#$Ii)",
+ALU32_2op_tc_1_SLOT0123, TypeALU32_2op>, Enc_10568534, PredNewRel, ImmRegRel {
+let Inst{13-13} = 0b0;
+let Inst{31-23} = 0b011101000;
+let isPredicated = 1;
+let hasNewValue = 1;
+let opNewValue = 0;
+let CextOpcode = "A2_add";
+let InputType = "imm";
+let BaseOpcode = "A2_addi";
+let isExtendable = 1;
+let opExtendable = 3;
+let isExtentSigned = 1;
+let opExtentBits = 8;
+let opExtentAlign = 0;
+}
+def A2_padditnew : HInst<
+(outs IntRegs:$Rd32),
+(ins PredRegs:$Pu4, IntRegs:$Rs32, s32_0Imm:$Ii),
+"if ($Pu4.new) $Rd32 = add($Rs32,#$Ii)",
+ALU32_2op_tc_1_SLOT0123, TypeALU32_2op>, Enc_10568534, PredNewRel, ImmRegRel {
+let Inst{13-13} = 0b1;
+let Inst{31-23} = 0b011101000;
+let isPredicated = 1;
+let hasNewValue = 1;
+let opNewValue = 0;
+let isPredicatedNew = 1;
+let CextOpcode = "A2_add";
+let InputType = "imm";
+let BaseOpcode = "A2_addi";
+let isExtendable = 1;
+let opExtendable = 3;
+let isExtentSigned = 1;
+let opExtentBits = 8;
+let opExtentAlign = 0;
+}
+def A2_paddt : HInst<
+(outs IntRegs:$Rd32),
+(ins PredRegs:$Pu4, IntRegs:$Rs32, IntRegs:$Rt32),
+"if ($Pu4) $Rd32 = add($Rs32,$Rt32)",
+ALU32_3op_tc_1_SLOT0123, TypeALU32_3op>, Enc_9626139, PredNewRel, ImmRegRel {
+let Inst{7-7} = 0b0;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11111011000;
+let isPredicated = 1;
+let hasNewValue = 1;
+let opNewValue = 0;
+let CextOpcode = "A2_add";
+let InputType = "reg";
+let BaseOpcode = "A2_add";
+}
+def A2_paddtnew : HInst<
+(outs IntRegs:$Rd32),
+(ins PredRegs:$Pu4, IntRegs:$Rs32, IntRegs:$Rt32),
+"if ($Pu4.new) $Rd32 = add($Rs32,$Rt32)",
+ALU32_3op_tc_1_SLOT0123, TypeALU32_3op>, Enc_9626139, PredNewRel, ImmRegRel {
+let Inst{7-7} = 0b0;
+let Inst{13-13} = 0b1;
+let Inst{31-21} = 0b11111011000;
+let isPredicated = 1;
+let hasNewValue = 1;
+let opNewValue = 0;
+let isPredicatedNew = 1;
+let CextOpcode = "A2_add";
+let InputType = "reg";
+let BaseOpcode = "A2_add";
+}
+def A2_pandf : HInst<
+(outs IntRegs:$Rd32),
+(ins PredRegs:$Pu4, IntRegs:$Rs32, IntRegs:$Rt32),
+"if (!$Pu4) $Rd32 = and($Rs32,$Rt32)",
+ALU32_3op_tc_1_SLOT0123, TypeALU32_3op>, Enc_9626139, PredNewRel {
+let Inst{7-7} = 0b1;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11111001000;
+let isPredicated = 1;
+let isPredicatedFalse = 1;
+let hasNewValue = 1;
+let opNewValue = 0;
+let BaseOpcode = "A2_and";
+}
+def A2_pandfnew : HInst<
+(outs IntRegs:$Rd32),
+(ins PredRegs:$Pu4, IntRegs:$Rs32, IntRegs:$Rt32),
+"if (!$Pu4.new) $Rd32 = and($Rs32,$Rt32)",
+ALU32_3op_tc_1_SLOT0123, TypeALU32_3op>, Enc_9626139, PredNewRel {
+let Inst{7-7} = 0b1;
+let Inst{13-13} = 0b1;
+let Inst{31-21} = 0b11111001000;
+let isPredicated = 1;
+let isPredicatedFalse = 1;
+let hasNewValue = 1;
+let opNewValue = 0;
+let isPredicatedNew = 1;
+let BaseOpcode = "A2_and";
+}
+def A2_pandt : HInst<
+(outs IntRegs:$Rd32),
+(ins PredRegs:$Pu4, IntRegs:$Rs32, IntRegs:$Rt32),
+"if ($Pu4) $Rd32 = and($Rs32,$Rt32)",
+ALU32_3op_tc_1_SLOT0123, TypeALU32_3op>, Enc_9626139, PredNewRel {
+let Inst{7-7} = 0b0;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11111001000;
+let isPredicated = 1;
+let hasNewValue = 1;
+let opNewValue = 0;
+let BaseOpcode = "A2_and";
+}
+def A2_pandtnew : HInst<
+(outs IntRegs:$Rd32),
+(ins PredRegs:$Pu4, IntRegs:$Rs32, IntRegs:$Rt32),
+"if ($Pu4.new) $Rd32 = and($Rs32,$Rt32)",
+ALU32_3op_tc_1_SLOT0123, TypeALU32_3op>, Enc_9626139, PredNewRel {
+let Inst{7-7} = 0b0;
+let Inst{13-13} = 0b1;
+let Inst{31-21} = 0b11111001000;
+let isPredicated = 1;
+let hasNewValue = 1;
+let opNewValue = 0;
+let isPredicatedNew = 1;
+let BaseOpcode = "A2_and";
+}
+def A2_porf : HInst<
+(outs IntRegs:$Rd32),
+(ins PredRegs:$Pu4, IntRegs:$Rs32, IntRegs:$Rt32),
+"if (!$Pu4) $Rd32 = or($Rs32,$Rt32)",
+ALU32_3op_tc_1_SLOT0123, TypeALU32_3op>, Enc_9626139, PredNewRel {
+let Inst{7-7} = 0b1;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11111001001;
+let isPredicated = 1;
+let isPredicatedFalse = 1;
+let hasNewValue = 1;
+let opNewValue = 0;
+let BaseOpcode = "A2_or";
+}
+def A2_porfnew : HInst<
+(outs IntRegs:$Rd32),
+(ins PredRegs:$Pu4, IntRegs:$Rs32, IntRegs:$Rt32),
+"if (!$Pu4.new) $Rd32 = or($Rs32,$Rt32)",
+ALU32_3op_tc_1_SLOT0123, TypeALU32_3op>, Enc_9626139, PredNewRel {
+let Inst{7-7} = 0b1;
+let Inst{13-13} = 0b1;
+let Inst{31-21} = 0b11111001001;
+let isPredicated = 1;
+let isPredicatedFalse = 1;
+let hasNewValue = 1;
+let opNewValue = 0;
+let isPredicatedNew = 1;
+let BaseOpcode = "A2_or";
+}
+def A2_port : HInst<
+(outs IntRegs:$Rd32),
+(ins PredRegs:$Pu4, IntRegs:$Rs32, IntRegs:$Rt32),
+"if ($Pu4) $Rd32 = or($Rs32,$Rt32)",
+ALU32_3op_tc_1_SLOT0123, TypeALU32_3op>, Enc_9626139, PredNewRel {
+let Inst{7-7} = 0b0;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11111001001;
+let isPredicated = 1;
+let hasNewValue = 1;
+let opNewValue = 0;
+let BaseOpcode = "A2_or";
+}
+def A2_portnew : HInst<
+(outs IntRegs:$Rd32),
+(ins PredRegs:$Pu4, IntRegs:$Rs32, IntRegs:$Rt32),
+"if ($Pu4.new) $Rd32 = or($Rs32,$Rt32)",
+ALU32_3op_tc_1_SLOT0123, TypeALU32_3op>, Enc_9626139, PredNewRel {
+let Inst{7-7} = 0b0;
+let Inst{13-13} = 0b1;
+let Inst{31-21} = 0b11111001001;
+let isPredicated = 1;
+let hasNewValue = 1;
+let opNewValue = 0;
+let isPredicatedNew = 1;
+let BaseOpcode = "A2_or";
+}
+def A2_psubf : HInst<
+(outs IntRegs:$Rd32),
+(ins PredRegs:$Pu4, IntRegs:$Rt32, IntRegs:$Rs32),
+"if (!$Pu4) $Rd32 = sub($Rt32,$Rs32)",
+ALU32_3op_tc_1_SLOT0123, TypeALU32_3op>, Enc_1332717, PredNewRel {
+let Inst{7-7} = 0b1;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11111011001;
+let isPredicated = 1;
+let isPredicatedFalse = 1;
+let hasNewValue = 1;
+let opNewValue = 0;
+let BaseOpcode = "A2_sub";
+}
+def A2_psubfnew : HInst<
+(outs IntRegs:$Rd32),
+(ins PredRegs:$Pu4, IntRegs:$Rt32, IntRegs:$Rs32),
+"if (!$Pu4.new) $Rd32 = sub($Rt32,$Rs32)",
+ALU32_3op_tc_1_SLOT0123, TypeALU32_3op>, Enc_1332717, PredNewRel {
+let Inst{7-7} = 0b1;
+let Inst{13-13} = 0b1;
+let Inst{31-21} = 0b11111011001;
+let isPredicated = 1;
+let isPredicatedFalse = 1;
+let hasNewValue = 1;
+let opNewValue = 0;
+let isPredicatedNew = 1;
+let BaseOpcode = "A2_sub";
+}
+def A2_psubt : HInst<
+(outs IntRegs:$Rd32),
+(ins PredRegs:$Pu4, IntRegs:$Rt32, IntRegs:$Rs32),
+"if ($Pu4) $Rd32 = sub($Rt32,$Rs32)",
+ALU32_3op_tc_1_SLOT0123, TypeALU32_3op>, Enc_1332717, PredNewRel {
+let Inst{7-7} = 0b0;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11111011001;
+let isPredicated = 1;
+let hasNewValue = 1;
+let opNewValue = 0;
+let BaseOpcode = "A2_sub";
+}
+def A2_psubtnew : HInst<
+(outs IntRegs:$Rd32),
+(ins PredRegs:$Pu4, IntRegs:$Rt32, IntRegs:$Rs32),
+"if ($Pu4.new) $Rd32 = sub($Rt32,$Rs32)",
+ALU32_3op_tc_1_SLOT0123, TypeALU32_3op>, Enc_1332717, PredNewRel {
+let Inst{7-7} = 0b0;
+let Inst{13-13} = 0b1;
+let Inst{31-21} = 0b11111011001;
+let isPredicated = 1;
+let hasNewValue = 1;
+let opNewValue = 0;
+let isPredicatedNew = 1;
+let BaseOpcode = "A2_sub";
+}
+def A2_pxorf : HInst<
+(outs IntRegs:$Rd32),
+(ins PredRegs:$Pu4, IntRegs:$Rs32, IntRegs:$Rt32),
+"if (!$Pu4) $Rd32 = xor($Rs32,$Rt32)",
+ALU32_3op_tc_1_SLOT0123, TypeALU32_3op>, Enc_9626139, PredNewRel {
+let Inst{7-7} = 0b1;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11111001011;
+let isPredicated = 1;
+let isPredicatedFalse = 1;
+let hasNewValue = 1;
+let opNewValue = 0;
+let BaseOpcode = "A2_xor";
+}
+def A2_pxorfnew : HInst<
+(outs IntRegs:$Rd32),
+(ins PredRegs:$Pu4, IntRegs:$Rs32, IntRegs:$Rt32),
+"if (!$Pu4.new) $Rd32 = xor($Rs32,$Rt32)",
+ALU32_3op_tc_1_SLOT0123, TypeALU32_3op>, Enc_9626139, PredNewRel {
+let Inst{7-7} = 0b1;
+let Inst{13-13} = 0b1;
+let Inst{31-21} = 0b11111001011;
+let isPredicated = 1;
+let isPredicatedFalse = 1;
+let hasNewValue = 1;
+let opNewValue = 0;
+let isPredicatedNew = 1;
+let BaseOpcode = "A2_xor";
+}
+def A2_pxort : HInst<
+(outs IntRegs:$Rd32),
+(ins PredRegs:$Pu4, IntRegs:$Rs32, IntRegs:$Rt32),
+"if ($Pu4) $Rd32 = xor($Rs32,$Rt32)",
+ALU32_3op_tc_1_SLOT0123, TypeALU32_3op>, Enc_9626139, PredNewRel {
+let Inst{7-7} = 0b0;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11111001011;
+let isPredicated = 1;
+let hasNewValue = 1;
+let opNewValue = 0;
+let BaseOpcode = "A2_xor";
+}
+def A2_pxortnew : HInst<
+(outs IntRegs:$Rd32),
+(ins PredRegs:$Pu4, IntRegs:$Rs32, IntRegs:$Rt32),
+"if ($Pu4.new) $Rd32 = xor($Rs32,$Rt32)",
+ALU32_3op_tc_1_SLOT0123, TypeALU32_3op>, Enc_9626139, PredNewRel {
+let Inst{7-7} = 0b0;
+let Inst{13-13} = 0b1;
+let Inst{31-21} = 0b11111001011;
+let isPredicated = 1;
+let hasNewValue = 1;
+let opNewValue = 0;
+let isPredicatedNew = 1;
+let BaseOpcode = "A2_xor";
+}
+def A2_roundsat : HInst<
+(outs IntRegs:$Rd32),
+(ins DoubleRegs:$Rss32),
+"$Rd32 = round($Rss32):sat",
+S_2op_tc_1_SLOT23, TypeS_2op>, Enc_3742184, Requires<[HasV5T]> {
+let Inst{13-5} = 0b000000001;
+let Inst{31-21} = 0b10001000110;
+let hasNewValue = 1;
+let opNewValue = 0;
+let Defs = [USR_OVF];
+}
+def A2_sat : HInst<
+(outs IntRegs:$Rd32),
+(ins DoubleRegs:$Rss32),
+"$Rd32 = sat($Rss32)",
+S_2op_tc_1_SLOT23, TypeS_2op>, Enc_3742184 {
+let Inst{13-5} = 0b000000000;
+let Inst{31-21} = 0b10001000110;
+let hasNewValue = 1;
+let opNewValue = 0;
+let Defs = [USR_OVF];
+}
+def A2_satb : HInst<
+(outs IntRegs:$Rd32),
+(ins IntRegs:$Rs32),
+"$Rd32 = satb($Rs32)",
+S_2op_tc_1_SLOT23, TypeS_2op>, Enc_4075554 {
+let Inst{13-5} = 0b000000111;
+let Inst{31-21} = 0b10001100110;
+let hasNewValue = 1;
+let opNewValue = 0;
+let Defs = [USR_OVF];
+}
+def A2_sath : HInst<
+(outs IntRegs:$Rd32),
+(ins IntRegs:$Rs32),
+"$Rd32 = sath($Rs32)",
+S_2op_tc_1_SLOT23, TypeS_2op>, Enc_4075554 {
+let Inst{13-5} = 0b000000100;
+let Inst{31-21} = 0b10001100110;
+let hasNewValue = 1;
+let opNewValue = 0;
+let Defs = [USR_OVF];
+}
+def A2_satub : HInst<
+(outs IntRegs:$Rd32),
+(ins IntRegs:$Rs32),
+"$Rd32 = satub($Rs32)",
+S_2op_tc_1_SLOT23, TypeS_2op>, Enc_4075554 {
+let Inst{13-5} = 0b000000110;
+let Inst{31-21} = 0b10001100110;
+let hasNewValue = 1;
+let opNewValue = 0;
+let Defs = [USR_OVF];
+}
+def A2_satuh : HInst<
+(outs IntRegs:$Rd32),
+(ins IntRegs:$Rs32),
+"$Rd32 = satuh($Rs32)",
+S_2op_tc_1_SLOT23, TypeS_2op>, Enc_4075554 {
+let Inst{13-5} = 0b000000101;
+let Inst{31-21} = 0b10001100110;
+let hasNewValue = 1;
+let opNewValue = 0;
+let Defs = [USR_OVF];
+}
+def A2_sub : HInst<
+(outs IntRegs:$Rd32),
+(ins IntRegs:$Rt32, IntRegs:$Rs32),
+"$Rd32 = sub($Rt32,$Rs32)",
+ALU32_3op_tc_1_SLOT0123, TypeALU32_3op>, Enc_8605375, PredNewRel, ImmRegRel {
+let Inst{7-5} = 0b000;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11110011001;
+let hasNewValue = 1;
+let opNewValue = 0;
+let CextOpcode = "A2_sub";
+let InputType = "reg";
+let BaseOpcode = "A2_sub";
+let isPredicable = 1;
+}
+def A2_subh_h16_hh : HInst<
+(outs IntRegs:$Rd32),
+(ins IntRegs:$Rt32, IntRegs:$Rs32),
+"$Rd32 = sub($Rt32.h,$Rs32.h):<<16",
+ALU64_tc_1_SLOT23, TypeALU64>, Enc_8605375 {
+let Inst{7-5} = 0b011;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11010101011;
+let hasNewValue = 1;
+let opNewValue = 0;
+}
+def A2_subh_h16_hl : HInst<
+(outs IntRegs:$Rd32),
+(ins IntRegs:$Rt32, IntRegs:$Rs32),
+"$Rd32 = sub($Rt32.h,$Rs32.l):<<16",
+ALU64_tc_1_SLOT23, TypeALU64>, Enc_8605375 {
+let Inst{7-5} = 0b010;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11010101011;
+let hasNewValue = 1;
+let opNewValue = 0;
+}
+def A2_subh_h16_lh : HInst<
+(outs IntRegs:$Rd32),
+(ins IntRegs:$Rt32, IntRegs:$Rs32),
+"$Rd32 = sub($Rt32.l,$Rs32.h):<<16",
+ALU64_tc_1_SLOT23, TypeALU64>, Enc_8605375 {
+let Inst{7-5} = 0b001;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11010101011;
+let hasNewValue = 1;
+let opNewValue = 0;
+}
+def A2_subh_h16_ll : HInst<
+(outs IntRegs:$Rd32),
+(ins IntRegs:$Rt32, IntRegs:$Rs32),
+"$Rd32 = sub($Rt32.l,$Rs32.l):<<16",
+ALU64_tc_1_SLOT23, TypeALU64>, Enc_8605375 {
+let Inst{7-5} = 0b000;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11010101011;
+let hasNewValue = 1;
+let opNewValue = 0;
+}
+def A2_subh_h16_sat_hh : HInst<
+(outs IntRegs:$Rd32),
+(ins IntRegs:$Rt32, IntRegs:$Rs32),
+"$Rd32 = sub($Rt32.h,$Rs32.h):sat:<<16",
+ALU64_tc_2_SLOT23, TypeALU64>, Enc_8605375 {
+let Inst{7-5} = 0b111;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11010101011;
+let hasNewValue = 1;
+let opNewValue = 0;
+let Defs = [USR_OVF];
+}
+def A2_subh_h16_sat_hl : HInst<
+(outs IntRegs:$Rd32),
+(ins IntRegs:$Rt32, IntRegs:$Rs32),
+"$Rd32 = sub($Rt32.h,$Rs32.l):sat:<<16",
+ALU64_tc_2_SLOT23, TypeALU64>, Enc_8605375 {
+let Inst{7-5} = 0b110;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11010101011;
+let hasNewValue = 1;
+let opNewValue = 0;
+let Defs = [USR_OVF];
+}
+def A2_subh_h16_sat_lh : HInst<
+(outs IntRegs:$Rd32),
+(ins IntRegs:$Rt32, IntRegs:$Rs32),
+"$Rd32 = sub($Rt32.l,$Rs32.h):sat:<<16",
+ALU64_tc_2_SLOT23, TypeALU64>, Enc_8605375 {
+let Inst{7-5} = 0b101;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11010101011;
+let hasNewValue = 1;
+let opNewValue = 0;
+let Defs = [USR_OVF];
+}
+def A2_subh_h16_sat_ll : HInst<
+(outs IntRegs:$Rd32),
+(ins IntRegs:$Rt32, IntRegs:$Rs32),
+"$Rd32 = sub($Rt32.l,$Rs32.l):sat:<<16",
+ALU64_tc_2_SLOT23, TypeALU64>, Enc_8605375 {
+let Inst{7-5} = 0b100;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11010101011;
+let hasNewValue = 1;
+let opNewValue = 0;
+let Defs = [USR_OVF];
+}
+def A2_subh_l16_hl : HInst<
+(outs IntRegs:$Rd32),
+(ins IntRegs:$Rt32, IntRegs:$Rs32),
+"$Rd32 = sub($Rt32.l,$Rs32.h)",
+ALU64_tc_1_SLOT23, TypeALU64>, Enc_8605375 {
+let Inst{7-5} = 0b010;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11010101001;
+let hasNewValue = 1;
+let opNewValue = 0;
+}
+def A2_subh_l16_ll : HInst<
+(outs IntRegs:$Rd32),
+(ins IntRegs:$Rt32, IntRegs:$Rs32),
+"$Rd32 = sub($Rt32.l,$Rs32.l)",
+ALU64_tc_1_SLOT23, TypeALU64>, Enc_8605375 {
+let Inst{7-5} = 0b000;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11010101001;
+let hasNewValue = 1;
+let opNewValue = 0;
+}
+def A2_subh_l16_sat_hl : HInst<
+(outs IntRegs:$Rd32),
+(ins IntRegs:$Rt32, IntRegs:$Rs32),
+"$Rd32 = sub($Rt32.l,$Rs32.h):sat",
+ALU64_tc_2_SLOT23, TypeALU64>, Enc_8605375 {
+let Inst{7-5} = 0b110;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11010101001;
+let hasNewValue = 1;
+let opNewValue = 0;
+let Defs = [USR_OVF];
+}
+def A2_subh_l16_sat_ll : HInst<
+(outs IntRegs:$Rd32),
+(ins IntRegs:$Rt32, IntRegs:$Rs32),
+"$Rd32 = sub($Rt32.l,$Rs32.l):sat",
+ALU64_tc_2_SLOT23, TypeALU64>, Enc_8605375 {
+let Inst{7-5} = 0b100;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11010101001;
+let hasNewValue = 1;
+let opNewValue = 0;
+let Defs = [USR_OVF];
+}
+def A2_subp : HInst<
+(outs DoubleRegs:$Rdd32),
+(ins DoubleRegs:$Rtt32, DoubleRegs:$Rss32),
+"$Rdd32 = sub($Rtt32,$Rss32)",
+ALU64_tc_1_SLOT23, TypeALU64>, Enc_11687333 {
+let Inst{7-5} = 0b111;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11010011001;
+}
+def A2_subri : HInst<
+(outs IntRegs:$Rd32),
+(ins s32_0Imm:$Ii, IntRegs:$Rs32),
+"$Rd32 = sub(#$Ii,$Rs32)",
+ALU32_2op_tc_1_SLOT0123, TypeALU32_2op>, Enc_13472494, PredNewRel, ImmRegRel {
+let Inst{31-22} = 0b0111011001;
+let hasNewValue = 1;
+let opNewValue = 0;
+let CextOpcode = "A2_sub";
+let InputType = "imm";
+let isExtendable = 1;
+let opExtendable = 1;
+let isExtentSigned = 1;
+let opExtentBits = 10;
+let opExtentAlign = 0;
+}
+def A2_subsat : HInst<
+(outs IntRegs:$Rd32),
+(ins IntRegs:$Rt32, IntRegs:$Rs32),
+"$Rd32 = sub($Rt32,$Rs32):sat",
+ALU32_3op_tc_2_SLOT0123, TypeALU32_3op>, Enc_8605375 {
+let Inst{7-5} = 0b000;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11110110110;
+let hasNewValue = 1;
+let opNewValue = 0;
+let Defs = [USR_OVF];
+let InputType = "reg";
+}
+def A2_svaddh : HInst<
+(outs IntRegs:$Rd32),
+(ins IntRegs:$Rs32, IntRegs:$Rt32),
+"$Rd32 = vaddh($Rs32,$Rt32)",
+ALU32_3op_tc_1_SLOT0123, TypeALU32_3op>, Enc_14071773 {
+let Inst{7-5} = 0b000;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11110110000;
+let hasNewValue = 1;
+let opNewValue = 0;
+let InputType = "reg";
+let isCommutable = 1;
+}
+def A2_svaddhs : HInst<
+(outs IntRegs:$Rd32),
+(ins IntRegs:$Rs32, IntRegs:$Rt32),
+"$Rd32 = vaddh($Rs32,$Rt32):sat",
+ALU32_3op_tc_2_SLOT0123, TypeALU32_3op>, Enc_14071773 {
+let Inst{7-5} = 0b000;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11110110001;
+let hasNewValue = 1;
+let opNewValue = 0;
+let Defs = [USR_OVF];
+let InputType = "reg";
+let isCommutable = 1;
+}
+def A2_svadduhs : HInst<
+(outs IntRegs:$Rd32),
+(ins IntRegs:$Rs32, IntRegs:$Rt32),
+"$Rd32 = vadduh($Rs32,$Rt32):sat",
+ALU32_3op_tc_2_SLOT0123, TypeALU32_3op>, Enc_14071773 {
+let Inst{7-5} = 0b000;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11110110011;
+let hasNewValue = 1;
+let opNewValue = 0;
+let Defs = [USR_OVF];
+let InputType = "reg";
+let isCommutable = 1;
+}
+def A2_svavgh : HInst<
+(outs IntRegs:$Rd32),
+(ins IntRegs:$Rs32, IntRegs:$Rt32),
+"$Rd32 = vavgh($Rs32,$Rt32)",
+ALU32_3op_tc_1_SLOT0123, TypeALU32_3op>, Enc_14071773 {
+let Inst{7-5} = 0b000;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11110111000;
+let hasNewValue = 1;
+let opNewValue = 0;
+let InputType = "reg";
+let isCommutable = 1;
+}
+def A2_svavghs : HInst<
+(outs IntRegs:$Rd32),
+(ins IntRegs:$Rs32, IntRegs:$Rt32),
+"$Rd32 = vavgh($Rs32,$Rt32):rnd",
+ALU32_3op_tc_2_SLOT0123, TypeALU32_3op>, Enc_14071773 {
+let Inst{7-5} = 0b000;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11110111001;
+let hasNewValue = 1;
+let opNewValue = 0;
+let InputType = "reg";
+let isCommutable = 1;
+}
+def A2_svnavgh : HInst<
+(outs IntRegs:$Rd32),
+(ins IntRegs:$Rt32, IntRegs:$Rs32),
+"$Rd32 = vnavgh($Rt32,$Rs32)",
+ALU32_3op_tc_1_SLOT0123, TypeALU32_3op>, Enc_8605375 {
+let Inst{7-5} = 0b000;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11110111011;
+let hasNewValue = 1;
+let opNewValue = 0;
+let InputType = "reg";
+}
+def A2_svsubh : HInst<
+(outs IntRegs:$Rd32),
+(ins IntRegs:$Rt32, IntRegs:$Rs32),
+"$Rd32 = vsubh($Rt32,$Rs32)",
+ALU32_3op_tc_1_SLOT0123, TypeALU32_3op>, Enc_8605375 {
+let Inst{7-5} = 0b000;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11110110100;
+let hasNewValue = 1;
+let opNewValue = 0;
+let InputType = "reg";
+}
+def A2_svsubhs : HInst<
+(outs IntRegs:$Rd32),
+(ins IntRegs:$Rt32, IntRegs:$Rs32),
+"$Rd32 = vsubh($Rt32,$Rs32):sat",
+ALU32_3op_tc_2_SLOT0123, TypeALU32_3op>, Enc_8605375 {
+let Inst{7-5} = 0b000;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11110110101;
+let hasNewValue = 1;
+let opNewValue = 0;
+let Defs = [USR_OVF];
+let InputType = "reg";
+}
+def A2_svsubuhs : HInst<
+(outs IntRegs:$Rd32),
+(ins IntRegs:$Rt32, IntRegs:$Rs32),
+"$Rd32 = vsubuh($Rt32,$Rs32):sat",
+ALU32_3op_tc_2_SLOT0123, TypeALU32_3op>, Enc_8605375 {
+let Inst{7-5} = 0b000;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11110110111;
+let hasNewValue = 1;
+let opNewValue = 0;
+let Defs = [USR_OVF];
+let InputType = "reg";
+}
+def A2_swiz : HInst<
+(outs IntRegs:$Rd32),
+(ins IntRegs:$Rs32),
+"$Rd32 = swiz($Rs32)",
+S_2op_tc_1_SLOT23, TypeS_2op>, Enc_4075554 {
+let Inst{13-5} = 0b000000111;
+let Inst{31-21} = 0b10001100100;
+let hasNewValue = 1;
+let opNewValue = 0;
+}
+def A2_sxtb : HInst<
+(outs IntRegs:$Rd32),
+(ins IntRegs:$Rs32),
+"$Rd32 = sxtb($Rs32)",
+ALU32_2op_tc_1_SLOT0123, TypeALU32_2op>, Enc_4075554, PredNewRel {
+let Inst{13-5} = 0b000000000;
+let Inst{31-21} = 0b01110000101;
+let hasNewValue = 1;
+let opNewValue = 0;
+let BaseOpcode = "A2_sxtb";
+let isPredicable = 1;
+}
+def A2_sxth : HInst<
+(outs IntRegs:$Rd32),
+(ins IntRegs:$Rs32),
+"$Rd32 = sxth($Rs32)",
+ALU32_2op_tc_1_SLOT0123, TypeALU32_2op>, Enc_4075554, PredNewRel {
+let Inst{13-5} = 0b000000000;
+let Inst{31-21} = 0b01110000111;
+let hasNewValue = 1;
+let opNewValue = 0;
+let BaseOpcode = "A2_sxth";
+let isPredicable = 1;
+}
+def A2_sxtw : HInst<
+(outs DoubleRegs:$Rdd32),
+(ins IntRegs:$Rs32),
+"$Rdd32 = sxtw($Rs32)",
+S_2op_tc_1_SLOT23, TypeS_2op>, Enc_4030179 {
+let Inst{13-5} = 0b000000000;
+let Inst{31-21} = 0b10000100010;
+}
+def A2_tfr : HInst<
+(outs IntRegs:$Rd32),
+(ins IntRegs:$Rs32),
+"$Rd32 = $Rs32",
+ALU32_2op_tc_1_SLOT0123, TypeALU32_2op>, Enc_4075554, PredNewRel {
+let Inst{13-5} = 0b000000000;
+let Inst{31-21} = 0b01110000011;
+let hasNewValue = 1;
+let opNewValue = 0;
+let InputType = "reg";
+let BaseOpcode = "A2_tfr";
+let isPredicable = 1;
+}
+def A2_tfrcrr : HInst<
+(outs IntRegs:$Rd32),
+(ins CtrRegs:$Cs32),
+"$Rd32 = $Cs32",
+CR_tc_3x_SLOT3, TypeCR>, Enc_1539665 {
+let Inst{13-5} = 0b000000000;
+let Inst{31-21} = 0b01101010000;
+let hasNewValue = 1;
+let opNewValue = 0;
+}
+def A2_tfrf : HInst<
+(outs IntRegs:$Rd32),
+(ins PredRegs:$Pu4, IntRegs:$Rs32),
+"if (!$Pu4) $Rd32 = $Rs32",
+ALU32_2op_tc_1_SLOT0123, TypeALU32_2op>, PredNewRel, ImmRegRel {
+let isPredicated = 1;
+let isPredicatedFalse = 1;
+let hasNewValue = 1;
+let opNewValue = 0;
+let CextOpcode = "A2_tfr";
+let InputType = "reg";
+let BaseOpcode = "A2_tfr";
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+}
+def A2_tfrfnew : HInst<
+(outs IntRegs:$Rd32),
+(ins PredRegs:$Pu4, IntRegs:$Rs32),
+"if (!$Pu4.new) $Rd32 = $Rs32",
+ALU32_2op_tc_1_SLOT0123, TypeALU32_2op>, PredNewRel, ImmRegRel {
+let isPredicated = 1;
+let isPredicatedFalse = 1;
+let hasNewValue = 1;
+let opNewValue = 0;
+let isPredicatedNew = 1;
+let CextOpcode = "A2_tfr";
+let InputType = "reg";
+let BaseOpcode = "A2_tfr";
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+}
+def A2_tfrih : HInst<
+(outs IntRegs:$Rx32),
+(ins IntRegs:$Rx32in, u16_0Imm:$Ii),
+"$Rx32.h = #$Ii",
+ALU32_2op_tc_1_SLOT0123, TypeALU32_2op>, Enc_6130414 {
+let Inst{21-21} = 0b1;
+let Inst{31-24} = 0b01110010;
+let hasNewValue = 1;
+let opNewValue = 0;
+let Constraints = "$Rx32 = $Rx32in";
+}
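+// `Constraints = "$Rx32 = $Rx32in"` ties the output to the input operand:
+// A2_tfrih writes only the high halfword of $Rx32, so the register
+// allocator must place both operands in the same register.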
+def A2_tfril : HInst<
+(outs IntRegs:$Rx32),
+(ins IntRegs:$Rx32in, u16_0Imm:$Ii),
+"$Rx32.l = #$Ii",
+ALU32_2op_tc_1_SLOT0123, TypeALU32_2op>, Enc_6130414 {
+let Inst{21-21} = 0b1;
+let Inst{31-24} = 0b01110001;
+let hasNewValue = 1;
+let opNewValue = 0;
+let Constraints = "$Rx32 = $Rx32in";
+}
+def A2_tfrp : HInst<
+(outs DoubleRegs:$Rdd32),
+(ins DoubleRegs:$Rss32),
+"$Rdd32 = $Rss32",
+ALU32_2op_tc_1_SLOT0123, TypeALU32_2op>, PredNewRel {
+let BaseOpcode = "A2_tfrp";
+let isPredicable = 1;
+let isPseudo = 1;
+}
+def A2_tfrpf : HInst<
+(outs DoubleRegs:$Rdd32),
+(ins PredRegs:$Pu4, DoubleRegs:$Rss32),
+"if (!$Pu4) $Rdd32 = $Rss32",
+ALU32_2op_tc_1_SLOT0123, TypeALU32_2op>, PredNewRel {
+let isPredicated = 1;
+let isPredicatedFalse = 1;
+let BaseOpcode = "A2_tfrp";
+let isPseudo = 1;
+}
+def A2_tfrpfnew : HInst<
+(outs DoubleRegs:$Rdd32),
+(ins PredRegs:$Pu4, DoubleRegs:$Rss32),
+"if (!$Pu4.new) $Rdd32 = $Rss32",
+ALU32_2op_tc_1_SLOT0123, TypeALU32_2op>, PredNewRel {
+let isPredicated = 1;
+let isPredicatedFalse = 1;
+let isPredicatedNew = 1;
+let BaseOpcode = "A2_tfrp";
+let isPseudo = 1;
+}
+def A2_tfrpi : HInst<
+(outs DoubleRegs:$Rdd32),
+(ins s8_0Imm:$Ii),
+"$Rdd32 = #$Ii",
+ALU64_tc_1_SLOT23, TypeALU64> {
+let isReMaterializable = 1;
+let isAsCheapAsAMove = 1;
+let isMoveImm = 1;
+let isPseudo = 1;
+}
+def A2_tfrpt : HInst<
+(outs DoubleRegs:$Rdd32),
+(ins PredRegs:$Pu4, DoubleRegs:$Rss32),
+"if ($Pu4) $Rdd32 = $Rss32",
+ALU32_2op_tc_1_SLOT0123, TypeALU32_2op>, PredNewRel {
+let isPredicated = 1;
+let BaseOpcode = "A2_tfrp";
+let isPseudo = 1;
+}
+def A2_tfrptnew : HInst<
+(outs DoubleRegs:$Rdd32),
+(ins PredRegs:$Pu4, DoubleRegs:$Rss32),
+"if ($Pu4.new) $Rdd32 = $Rss32",
+ALU32_2op_tc_1_SLOT0123, TypeALU32_2op>, PredNewRel {
+let isPredicated = 1;
+let isPredicatedNew = 1;
+let BaseOpcode = "A2_tfrp";
+let isPseudo = 1;
+}
+def A2_tfrrcr : HInst<
+(outs CtrRegs:$Cd32),
+(ins IntRegs:$Rs32),
+"$Cd32 = $Rs32",
+CR_tc_3x_SLOT3, TypeCR>, Enc_9018141 {
+let Inst{13-5} = 0b000000000;
+let Inst{31-21} = 0b01100010001;
+let hasNewValue = 1;
+let opNewValue = 0;
+}
+def A2_tfrsi : HInst<
+(outs IntRegs:$Rd32),
+(ins s32_0Imm:$Ii),
+"$Rd32 = #$Ii",
+ALU32_2op_tc_1_SLOT0123, TypeALU32_2op>, Enc_7971062, PredNewRel, ImmRegRel {
+let Inst{21-21} = 0b0;
+let Inst{31-24} = 0b01111000;
+let hasNewValue = 1;
+let opNewValue = 0;
+let CextOpcode = "A2_tfr";
+let InputType = "imm";
+let BaseOpcode = "A2_tfrsi";
+let isPredicable = 1;
+let isReMaterializable = 1;
+let isAsCheapAsAMove = 1;
+let isMoveImm = 1;
+let isExtendable = 1;
+let opExtendable = 1;
+let isExtentSigned = 1;
+let opExtentBits = 16;
+let opExtentAlign = 0;
+}
+def A2_tfrt : HInst<
+(outs IntRegs:$Rd32),
+(ins PredRegs:$Pu4, IntRegs:$Rs32),
+"if ($Pu4) $Rd32 = $Rs32",
+ALU32_2op_tc_1_SLOT0123, TypeALU32_2op>, PredNewRel, ImmRegRel {
+let isPredicated = 1;
+let hasNewValue = 1;
+let opNewValue = 0;
+let CextOpcode = "A2_tfr";
+let InputType = "reg";
+let BaseOpcode = "A2_tfr";
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+}
+def A2_tfrtnew : HInst<
+(outs IntRegs:$Rd32),
+(ins PredRegs:$Pu4, IntRegs:$Rs32),
+"if ($Pu4.new) $Rd32 = $Rs32",
+ALU32_2op_tc_1_SLOT0123, TypeALU32_2op>, PredNewRel, ImmRegRel {
+let isPredicated = 1;
+let hasNewValue = 1;
+let opNewValue = 0;
+let isPredicatedNew = 1;
+let CextOpcode = "A2_tfr";
+let InputType = "reg";
+let BaseOpcode = "A2_tfr";
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+}
+def A2_vabsh : HInst<
+(outs DoubleRegs:$Rdd32),
+(ins DoubleRegs:$Rss32),
+"$Rdd32 = vabsh($Rss32)",
+S_2op_tc_1_SLOT23, TypeS_2op>, Enc_13133231 {
+let Inst{13-5} = 0b000000100;
+let Inst{31-21} = 0b10000000010;
+}
+def A2_vabshsat : HInst<
+(outs DoubleRegs:$Rdd32),
+(ins DoubleRegs:$Rss32),
+"$Rdd32 = vabsh($Rss32):sat",
+S_2op_tc_1_SLOT23, TypeS_2op>, Enc_13133231 {
+let Inst{13-5} = 0b000000101;
+let Inst{31-21} = 0b10000000010;
+let Defs = [USR_OVF];
+}
+def A2_vabsw : HInst<
+(outs DoubleRegs:$Rdd32),
+(ins DoubleRegs:$Rss32),
+"$Rdd32 = vabsw($Rss32)",
+S_2op_tc_1_SLOT23, TypeS_2op>, Enc_13133231 {
+let Inst{13-5} = 0b000000110;
+let Inst{31-21} = 0b10000000010;
+}
+def A2_vabswsat : HInst<
+(outs DoubleRegs:$Rdd32),
+(ins DoubleRegs:$Rss32),
+"$Rdd32 = vabsw($Rss32):sat",
+S_2op_tc_1_SLOT23, TypeS_2op>, Enc_13133231 {
+let Inst{13-5} = 0b000000111;
+let Inst{31-21} = 0b10000000010;
+let Defs = [USR_OVF];
+}
+def A2_vaddb_map : HInst<
+(outs DoubleRegs:$Rdd32),
+(ins DoubleRegs:$Rss32, DoubleRegs:$Rtt32),
+"$Rdd32 = vaddb($Rss32,$Rtt32)",
+PSEUDO, TypeMAPPING> {
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+}
+def A2_vaddh : HInst<
+(outs DoubleRegs:$Rdd32),
+(ins DoubleRegs:$Rss32, DoubleRegs:$Rtt32),
+"$Rdd32 = vaddh($Rss32,$Rtt32)",
+ALU64_tc_1_SLOT23, TypeALU64>, Enc_8333157 {
+let Inst{7-5} = 0b010;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11010011000;
+}
+def A2_vaddhs : HInst<
+(outs DoubleRegs:$Rdd32),
+(ins DoubleRegs:$Rss32, DoubleRegs:$Rtt32),
+"$Rdd32 = vaddh($Rss32,$Rtt32):sat",
+ALU64_tc_2_SLOT23, TypeALU64>, Enc_8333157 {
+let Inst{7-5} = 0b011;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11010011000;
+let Defs = [USR_OVF];
+}
+def A2_vaddub : HInst<
+(outs DoubleRegs:$Rdd32),
+(ins DoubleRegs:$Rss32, DoubleRegs:$Rtt32),
+"$Rdd32 = vaddub($Rss32,$Rtt32)",
+ALU64_tc_2_SLOT23, TypeALU64>, Enc_8333157 {
+let Inst{7-5} = 0b000;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11010011000;
+}
+def A2_vaddubs : HInst<
+(outs DoubleRegs:$Rdd32),
+(ins DoubleRegs:$Rss32, DoubleRegs:$Rtt32),
+"$Rdd32 = vaddub($Rss32,$Rtt32):sat",
+ALU64_tc_2_SLOT23, TypeALU64>, Enc_8333157 {
+let Inst{7-5} = 0b001;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11010011000;
+let Defs = [USR_OVF];
+}
+def A2_vadduhs : HInst<
+(outs DoubleRegs:$Rdd32),
+(ins DoubleRegs:$Rss32, DoubleRegs:$Rtt32),
+"$Rdd32 = vadduh($Rss32,$Rtt32):sat",
+ALU64_tc_2_SLOT23, TypeALU64>, Enc_8333157 {
+let Inst{7-5} = 0b100;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11010011000;
+let Defs = [USR_OVF];
+}
+def A2_vaddw : HInst<
+(outs DoubleRegs:$Rdd32),
+(ins DoubleRegs:$Rss32, DoubleRegs:$Rtt32),
+"$Rdd32 = vaddw($Rss32,$Rtt32)",
+ALU64_tc_1_SLOT23, TypeALU64>, Enc_8333157 {
+let Inst{7-5} = 0b101;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11010011000;
+}
+def A2_vaddws : HInst<
+(outs DoubleRegs:$Rdd32),
+(ins DoubleRegs:$Rss32, DoubleRegs:$Rtt32),
+"$Rdd32 = vaddw($Rss32,$Rtt32):sat",
+ALU64_tc_2_SLOT23, TypeALU64>, Enc_8333157 {
+let Inst{7-5} = 0b110;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11010011000;
+let Defs = [USR_OVF];
+}
+def A2_vavgh : HInst<
+(outs DoubleRegs:$Rdd32),
+(ins DoubleRegs:$Rss32, DoubleRegs:$Rtt32),
+"$Rdd32 = vavgh($Rss32,$Rtt32)",
+ALU64_tc_1_SLOT23, TypeALU64>, Enc_8333157 {
+let Inst{7-5} = 0b010;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11010011010;
+}
+def A2_vavghcr : HInst<
+(outs DoubleRegs:$Rdd32),
+(ins DoubleRegs:$Rss32, DoubleRegs:$Rtt32),
+"$Rdd32 = vavgh($Rss32,$Rtt32):crnd",
+ALU64_tc_2_SLOT23, TypeALU64>, Enc_8333157 {
+let Inst{7-5} = 0b100;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11010011010;
+let prefersSlot3 = 1;
+}
+def A2_vavghr : HInst<
+(outs DoubleRegs:$Rdd32),
+(ins DoubleRegs:$Rss32, DoubleRegs:$Rtt32),
+"$Rdd32 = vavgh($Rss32,$Rtt32):rnd",
+ALU64_tc_2_SLOT23, TypeALU64>, Enc_8333157 {
+let Inst{7-5} = 0b011;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11010011010;
+}
+def A2_vavgub : HInst<
+(outs DoubleRegs:$Rdd32),
+(ins DoubleRegs:$Rss32, DoubleRegs:$Rtt32),
+"$Rdd32 = vavgub($Rss32,$Rtt32)",
+ALU64_tc_1_SLOT23, TypeALU64>, Enc_8333157 {
+let Inst{7-5} = 0b000;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11010011010;
+}
+def A2_vavgubr : HInst<
+(outs DoubleRegs:$Rdd32),
+(ins DoubleRegs:$Rss32, DoubleRegs:$Rtt32),
+"$Rdd32 = vavgub($Rss32,$Rtt32):rnd",
+ALU64_tc_2_SLOT23, TypeALU64>, Enc_8333157 {
+let Inst{7-5} = 0b001;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11010011010;
+}
+def A2_vavguh : HInst<
+(outs DoubleRegs:$Rdd32),
+(ins DoubleRegs:$Rss32, DoubleRegs:$Rtt32),
+"$Rdd32 = vavguh($Rss32,$Rtt32)",
+ALU64_tc_1_SLOT23, TypeALU64>, Enc_8333157 {
+let Inst{7-5} = 0b101;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11010011010;
+}
+def A2_vavguhr : HInst<
+(outs DoubleRegs:$Rdd32),
+(ins DoubleRegs:$Rss32, DoubleRegs:$Rtt32),
+"$Rdd32 = vavguh($Rss32,$Rtt32):rnd",
+ALU64_tc_2_SLOT23, TypeALU64>, Enc_8333157 {
+let Inst{7-5} = 0b110;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11010011010;
+}
+def A2_vavguw : HInst<
+(outs DoubleRegs:$Rdd32),
+(ins DoubleRegs:$Rss32, DoubleRegs:$Rtt32),
+"$Rdd32 = vavguw($Rss32,$Rtt32)",
+ALU64_tc_1_SLOT23, TypeALU64>, Enc_8333157 {
+let Inst{7-5} = 0b011;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11010011011;
+}
+def A2_vavguwr : HInst<
+(outs DoubleRegs:$Rdd32),
+(ins DoubleRegs:$Rss32, DoubleRegs:$Rtt32),
+"$Rdd32 = vavguw($Rss32,$Rtt32):rnd",
+ALU64_tc_2_SLOT23, TypeALU64>, Enc_8333157 {
+let Inst{7-5} = 0b100;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11010011011;
+}
+def A2_vavgw : HInst<
+(outs DoubleRegs:$Rdd32),
+(ins DoubleRegs:$Rss32, DoubleRegs:$Rtt32),
+"$Rdd32 = vavgw($Rss32,$Rtt32)",
+ALU64_tc_1_SLOT23, TypeALU64>, Enc_8333157 {
+let Inst{7-5} = 0b000;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11010011011;
+}
+def A2_vavgwcr : HInst<
+(outs DoubleRegs:$Rdd32),
+(ins DoubleRegs:$Rss32, DoubleRegs:$Rtt32),
+"$Rdd32 = vavgw($Rss32,$Rtt32):crnd",
+ALU64_tc_2_SLOT23, TypeALU64>, Enc_8333157 {
+let Inst{7-5} = 0b010;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11010011011;
+let prefersSlot3 = 1;
+}
+def A2_vavgwr : HInst<
+(outs DoubleRegs:$Rdd32),
+(ins DoubleRegs:$Rss32, DoubleRegs:$Rtt32),
+"$Rdd32 = vavgw($Rss32,$Rtt32):rnd",
+ALU64_tc_2_SLOT23, TypeALU64>, Enc_8333157 {
+let Inst{7-5} = 0b001;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11010011011;
+}
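+// The A2_vcmp* records compare the elements of two 64-bit register pairs
+// (bytes, halfwords, or words, per the .b/.h/.w mnemonic suffix) and
+// write the per-element results into predicate register $Pd4.  Inst{7-2}
+// selects the element size and comparison; Inst{31-21} is shared across
+// the family.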
+def A2_vcmpbeq : HInst<
+(outs PredRegs:$Pd4),
+(ins DoubleRegs:$Rss32, DoubleRegs:$Rtt32),
+"$Pd4 = vcmpb.eq($Rss32,$Rtt32)",
+ALU64_tc_2early_SLOT23, TypeALU64>, Enc_3831744 {
+let Inst{7-2} = 0b110000;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11010010000;
+}
+def A2_vcmpbgtu : HInst<
+(outs PredRegs:$Pd4),
+(ins DoubleRegs:$Rss32, DoubleRegs:$Rtt32),
+"$Pd4 = vcmpb.gtu($Rss32,$Rtt32)",
+ALU64_tc_2early_SLOT23, TypeALU64>, Enc_3831744 {
+let Inst{7-2} = 0b111000;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11010010000;
+}
+def A2_vcmpheq : HInst<
+(outs PredRegs:$Pd4),
+(ins DoubleRegs:$Rss32, DoubleRegs:$Rtt32),
+"$Pd4 = vcmph.eq($Rss32,$Rtt32)",
+ALU64_tc_2early_SLOT23, TypeALU64>, Enc_3831744 {
+let Inst{7-2} = 0b011000;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11010010000;
+}
+def A2_vcmphgt : HInst<
+(outs PredRegs:$Pd4),
+(ins DoubleRegs:$Rss32, DoubleRegs:$Rtt32),
+"$Pd4 = vcmph.gt($Rss32,$Rtt32)",
+ALU64_tc_2early_SLOT23, TypeALU64>, Enc_3831744 {
+let Inst{7-2} = 0b100000;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11010010000;
+}
+def A2_vcmphgtu : HInst<
+(outs PredRegs:$Pd4),
+(ins DoubleRegs:$Rss32, DoubleRegs:$Rtt32),
+"$Pd4 = vcmph.gtu($Rss32,$Rtt32)",
+ALU64_tc_2early_SLOT23, TypeALU64>, Enc_3831744 {
+let Inst{7-2} = 0b101000;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11010010000;
+}
+def A2_vcmpweq : HInst<
+(outs PredRegs:$Pd4),
+(ins DoubleRegs:$Rss32, DoubleRegs:$Rtt32),
+"$Pd4 = vcmpw.eq($Rss32,$Rtt32)",
+ALU64_tc_2early_SLOT23, TypeALU64>, Enc_3831744 {
+let Inst{7-2} = 0b000000;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11010010000;
+}
+def A2_vcmpwgt : HInst<
+(outs PredRegs:$Pd4),
+(ins DoubleRegs:$Rss32, DoubleRegs:$Rtt32),
+"$Pd4 = vcmpw.gt($Rss32,$Rtt32)",
+ALU64_tc_2early_SLOT23, TypeALU64>, Enc_3831744 {
+let Inst{7-2} = 0b001000;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11010010000;
+}
+def A2_vcmpwgtu : HInst<
+(outs PredRegs:$Pd4),
+(ins DoubleRegs:$Rss32, DoubleRegs:$Rtt32),
+"$Pd4 = vcmpw.gtu($Rss32,$Rtt32)",
+ALU64_tc_2early_SLOT23, TypeALU64>, Enc_3831744 {
+let Inst{7-2} = 0b010000;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11010010000;
+}
+def A2_vconj : HInst<
+(outs DoubleRegs:$Rdd32),
+(ins DoubleRegs:$Rss32),
+"$Rdd32 = vconj($Rss32):sat",
+S_2op_tc_1_SLOT23, TypeS_2op>, Enc_13133231 {
+let Inst{13-5} = 0b000000111;
+let Inst{31-21} = 0b10000000100;
+let Defs = [USR_OVF];
+}
+def A2_vmaxb : HInst<
+(outs DoubleRegs:$Rdd32),
+(ins DoubleRegs:$Rtt32, DoubleRegs:$Rss32),
+"$Rdd32 = vmaxb($Rtt32,$Rss32)",
+ALU64_tc_2_SLOT23, TypeALU64>, Enc_11687333 {
+let Inst{7-5} = 0b110;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11010011110;
+}
+def A2_vmaxh : HInst<
+(outs DoubleRegs:$Rdd32),
+(ins DoubleRegs:$Rtt32, DoubleRegs:$Rss32),
+"$Rdd32 = vmaxh($Rtt32,$Rss32)",
+ALU64_tc_2_SLOT23, TypeALU64>, Enc_11687333 {
+let Inst{7-5} = 0b001;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11010011110;
+}
+def A2_vmaxub : HInst<
+(outs DoubleRegs:$Rdd32),
+(ins DoubleRegs:$Rtt32, DoubleRegs:$Rss32),
+"$Rdd32 = vmaxub($Rtt32,$Rss32)",
+ALU64_tc_2_SLOT23, TypeALU64>, Enc_11687333 {
+let Inst{7-5} = 0b000;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11010011110;
+}
+def A2_vmaxuh : HInst<
+(outs DoubleRegs:$Rdd32),
+(ins DoubleRegs:$Rtt32, DoubleRegs:$Rss32),
+"$Rdd32 = vmaxuh($Rtt32,$Rss32)",
+ALU64_tc_2_SLOT23, TypeALU64>, Enc_11687333 {
+let Inst{7-5} = 0b010;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11010011110;
+}
+def A2_vmaxuw : HInst<
+(outs DoubleRegs:$Rdd32),
+(ins DoubleRegs:$Rtt32, DoubleRegs:$Rss32),
+"$Rdd32 = vmaxuw($Rtt32,$Rss32)",
+ALU64_tc_2_SLOT23, TypeALU64>, Enc_11687333 {
+let Inst{7-5} = 0b101;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11010011101;
+}
+def A2_vmaxw : HInst<
+(outs DoubleRegs:$Rdd32),
+(ins DoubleRegs:$Rtt32, DoubleRegs:$Rss32),
+"$Rdd32 = vmaxw($Rtt32,$Rss32)",
+ALU64_tc_2_SLOT23, TypeALU64>, Enc_11687333 {
+let Inst{7-5} = 0b011;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11010011110;
+}
+def A2_vminb : HInst<
+(outs DoubleRegs:$Rdd32),
+(ins DoubleRegs:$Rtt32, DoubleRegs:$Rss32),
+"$Rdd32 = vminb($Rtt32,$Rss32)",
+ALU64_tc_2_SLOT23, TypeALU64>, Enc_11687333 {
+let Inst{7-5} = 0b111;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11010011110;
+}
+def A2_vminh : HInst<
+(outs DoubleRegs:$Rdd32),
+(ins DoubleRegs:$Rtt32, DoubleRegs:$Rss32),
+"$Rdd32 = vminh($Rtt32,$Rss32)",
+ALU64_tc_2_SLOT23, TypeALU64>, Enc_11687333 {
+let Inst{7-5} = 0b001;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11010011101;
+}
+def A2_vminub : HInst<
+(outs DoubleRegs:$Rdd32),
+(ins DoubleRegs:$Rtt32, DoubleRegs:$Rss32),
+"$Rdd32 = vminub($Rtt32,$Rss32)",
+ALU64_tc_2_SLOT23, TypeALU64>, Enc_11687333 {
+let Inst{7-5} = 0b000;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11010011101;
+}
+def A2_vminuh : HInst<
+(outs DoubleRegs:$Rdd32),
+(ins DoubleRegs:$Rtt32, DoubleRegs:$Rss32),
+"$Rdd32 = vminuh($Rtt32,$Rss32)",
+ALU64_tc_2_SLOT23, TypeALU64>, Enc_11687333 {
+let Inst{7-5} = 0b010;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11010011101;
+}
+def A2_vminuw : HInst<
+(outs DoubleRegs:$Rdd32),
+(ins DoubleRegs:$Rtt32, DoubleRegs:$Rss32),
+"$Rdd32 = vminuw($Rtt32,$Rss32)",
+ALU64_tc_2_SLOT23, TypeALU64>, Enc_11687333 {
+let Inst{7-5} = 0b100;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11010011101;
+}
+def A2_vminw : HInst<
+(outs DoubleRegs:$Rdd32),
+(ins DoubleRegs:$Rtt32, DoubleRegs:$Rss32),
+"$Rdd32 = vminw($Rtt32,$Rss32)",
+ALU64_tc_2_SLOT23, TypeALU64>, Enc_11687333 {
+let Inst{7-5} = 0b011;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11010011101;
+}
+def A2_vnavgh : HInst<
+(outs DoubleRegs:$Rdd32),
+(ins DoubleRegs:$Rtt32, DoubleRegs:$Rss32),
+"$Rdd32 = vnavgh($Rtt32,$Rss32)",
+ALU64_tc_1_SLOT23, TypeALU64>, Enc_11687333 {
+let Inst{7-5} = 0b000;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11010011100;
+}
+def A2_vnavghcr : HInst<
+(outs DoubleRegs:$Rdd32),
+(ins DoubleRegs:$Rtt32, DoubleRegs:$Rss32),
+"$Rdd32 = vnavgh($Rtt32,$Rss32):crnd:sat",
+ALU64_tc_2_SLOT23, TypeALU64>, Enc_11687333 {
+let Inst{7-5} = 0b010;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11010011100;
+let prefersSlot3 = 1;
+let Defs = [USR_OVF];
+}
+def A2_vnavghr : HInst<
+(outs DoubleRegs:$Rdd32),
+(ins DoubleRegs:$Rtt32, DoubleRegs:$Rss32),
+"$Rdd32 = vnavgh($Rtt32,$Rss32):rnd:sat",
+ALU64_tc_2_SLOT23, TypeALU64>, Enc_11687333 {
+let Inst{7-5} = 0b001;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11010011100;
+let prefersSlot3 = 1;
+let Defs = [USR_OVF];
+}
+def A2_vnavgw : HInst<
+(outs DoubleRegs:$Rdd32),
+(ins DoubleRegs:$Rtt32, DoubleRegs:$Rss32),
+"$Rdd32 = vnavgw($Rtt32,$Rss32)",
+ALU64_tc_1_SLOT23, TypeALU64>, Enc_11687333 {
+let Inst{7-5} = 0b011;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11010011100;
+}
+def A2_vnavgwcr : HInst<
+(outs DoubleRegs:$Rdd32),
+(ins DoubleRegs:$Rtt32, DoubleRegs:$Rss32),
+"$Rdd32 = vnavgw($Rtt32,$Rss32):crnd:sat",
+ALU64_tc_2_SLOT23, TypeALU64>, Enc_11687333 {
+let Inst{7-5} = 0b110;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11010011100;
+let prefersSlot3 = 1;
+let Defs = [USR_OVF];
+}
+def A2_vnavgwr : HInst<
+(outs DoubleRegs:$Rdd32),
+(ins DoubleRegs:$Rtt32, DoubleRegs:$Rss32),
+"$Rdd32 = vnavgw($Rtt32,$Rss32):rnd:sat",
+ALU64_tc_2_SLOT23, TypeALU64>, Enc_11687333 {
+let Inst{7-5} = 0b100;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11010011100;
+let prefersSlot3 = 1;
+let Defs = [USR_OVF];
+}
+def A2_vraddub : HInst<
+(outs DoubleRegs:$Rdd32),
+(ins DoubleRegs:$Rss32, DoubleRegs:$Rtt32),
+"$Rdd32 = vraddub($Rss32,$Rtt32)",
+M_tc_3x_SLOT23, TypeM>, Enc_8333157 {
+let Inst{7-5} = 0b001;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11101000010;
+let prefersSlot3 = 1;
+}
+def A2_vraddub_acc : HInst<
+(outs DoubleRegs:$Rxx32),
+(ins DoubleRegs:$Rxx32in, DoubleRegs:$Rss32, DoubleRegs:$Rtt32),
+"$Rxx32 += vraddub($Rss32,$Rtt32)",
+M_tc_3x_acc_SLOT23, TypeM>, Enc_12702821 {
+let Inst{7-5} = 0b001;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11101010010;
+let prefersSlot3 = 1;
+let Constraints = "$Rxx32 = $Rxx32in";
+}
+def A2_vrsadub : HInst<
+(outs DoubleRegs:$Rdd32),
+(ins DoubleRegs:$Rss32, DoubleRegs:$Rtt32),
+"$Rdd32 = vrsadub($Rss32,$Rtt32)",
+M_tc_3x_SLOT23, TypeM>, Enc_8333157 {
+let Inst{7-5} = 0b010;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11101000010;
+let prefersSlot3 = 1;
+}
+def A2_vrsadub_acc : HInst<
+(outs DoubleRegs:$Rxx32),
+(ins DoubleRegs:$Rxx32in, DoubleRegs:$Rss32, DoubleRegs:$Rtt32),
+"$Rxx32 += vrsadub($Rss32,$Rtt32)",
+M_tc_3x_acc_SLOT23, TypeM>, Enc_12702821 {
+let Inst{7-5} = 0b010;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11101010010;
+let prefersSlot3 = 1;
+let Constraints = "$Rxx32 = $Rxx32in";
+}
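+// A2_vsub*: element-wise subtract on bytes/halfwords/words. A2_vsubb_map
+// is a codegen-only mapping pseudo; the :sat forms saturate and define
+// USR_OVF.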
+def A2_vsubb_map : HInst<
+(outs DoubleRegs:$Rdd32),
+(ins DoubleRegs:$Rss32, DoubleRegs:$Rtt32),
+"$Rdd32 = vsubb($Rss32,$Rtt32)",
+PSEUDO, TypeMAPPING> {
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+}
+def A2_vsubh : HInst<
+(outs DoubleRegs:$Rdd32),
+(ins DoubleRegs:$Rtt32, DoubleRegs:$Rss32),
+"$Rdd32 = vsubh($Rtt32,$Rss32)",
+ALU64_tc_1_SLOT23, TypeALU64>, Enc_11687333 {
+let Inst{7-5} = 0b010;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11010011001;
+}
+def A2_vsubhs : HInst<
+(outs DoubleRegs:$Rdd32),
+(ins DoubleRegs:$Rtt32, DoubleRegs:$Rss32),
+"$Rdd32 = vsubh($Rtt32,$Rss32):sat",
+ALU64_tc_2_SLOT23, TypeALU64>, Enc_11687333 {
+let Inst{7-5} = 0b011;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11010011001;
+let Defs = [USR_OVF];
+}
+def A2_vsubub : HInst<
+(outs DoubleRegs:$Rdd32),
+(ins DoubleRegs:$Rtt32, DoubleRegs:$Rss32),
+"$Rdd32 = vsubub($Rtt32,$Rss32)",
+ALU64_tc_1_SLOT23, TypeALU64>, Enc_11687333 {
+let Inst{7-5} = 0b000;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11010011001;
+}
+def A2_vsububs : HInst<
+(outs DoubleRegs:$Rdd32),
+(ins DoubleRegs:$Rtt32, DoubleRegs:$Rss32),
+"$Rdd32 = vsubub($Rtt32,$Rss32):sat",
+ALU64_tc_2_SLOT23, TypeALU64>, Enc_11687333 {
+let Inst{7-5} = 0b001;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11010011001;
+let Defs = [USR_OVF];
+}
+def A2_vsubuhs : HInst<
+(outs DoubleRegs:$Rdd32),
+(ins DoubleRegs:$Rtt32, DoubleRegs:$Rss32),
+"$Rdd32 = vsubuh($Rtt32,$Rss32):sat",
+ALU64_tc_2_SLOT23, TypeALU64>, Enc_11687333 {
+let Inst{7-5} = 0b100;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11010011001;
+let Defs = [USR_OVF];
+}
+def A2_vsubw : HInst<
+(outs DoubleRegs:$Rdd32),
+(ins DoubleRegs:$Rtt32, DoubleRegs:$Rss32),
+"$Rdd32 = vsubw($Rtt32,$Rss32)",
+ALU64_tc_1_SLOT23, TypeALU64>, Enc_11687333 {
+let Inst{7-5} = 0b101;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11010011001;
+}
+def A2_vsubws : HInst<
+(outs DoubleRegs:$Rdd32),
+(ins DoubleRegs:$Rtt32, DoubleRegs:$Rss32),
+"$Rdd32 = vsubw($Rtt32,$Rss32):sat",
+ALU64_tc_2_SLOT23, TypeALU64>, Enc_11687333 {
+let Inst{7-5} = 0b110;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11010011001;
+let Defs = [USR_OVF];
+}
+def A2_xor : HInst<
+(outs IntRegs:$Rd32),
+(ins IntRegs:$Rs32, IntRegs:$Rt32),
+"$Rd32 = xor($Rs32,$Rt32)",
+ALU32_3op_tc_1_SLOT0123, TypeALU32_3op>, Enc_14071773, PredNewRel {
+let Inst{7-5} = 0b000;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11110001011;
+let hasNewValue = 1;
+let opNewValue = 0;
+let InputType = "reg";
+let BaseOpcode = "A2_xor";
+let isCommutable = 1;
+let isPredicable = 1;
+}
+def A2_xorp : HInst<
+(outs DoubleRegs:$Rdd32),
+(ins DoubleRegs:$Rss32, DoubleRegs:$Rtt32),
+"$Rdd32 = xor($Rss32,$Rtt32)",
+ALU64_tc_1_SLOT23, TypeALU64>, Enc_8333157 {
+let Inst{7-5} = 0b100;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11010011111;
+let isCommutable = 1;
+}
+def A2_zxtb : HInst<
+(outs IntRegs:$Rd32),
+(ins IntRegs:$Rs32),
+"$Rd32 = zxtb($Rs32)",
+ALU32_2op_tc_1_SLOT0123, TypeALU32_2op>, PredNewRel {
+let hasNewValue = 1;
+let opNewValue = 0;
+let BaseOpcode = "A2_zxtb";
+let isPredicable = 1;
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+}
+def A2_zxth : HInst<
+(outs IntRegs:$Rd32),
+(ins IntRegs:$Rs32),
+"$Rd32 = zxth($Rs32)",
+ALU32_2op_tc_1_SLOT0123, TypeALU32_2op>, Enc_4075554, PredNewRel {
+let Inst{13-5} = 0b000000000;
+let Inst{31-21} = 0b01110000110;
+let hasNewValue = 1;
+let opNewValue = 0;
+let BaseOpcode = "A2_zxth";
+let isPredicable = 1;
+}
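+// A4_addp_c (and A4_subp_c below): 64-bit add/subtract with carry chained
+// through predicate $Px4. The predicate is written late (isPredicateLate)
+// and tied to its input.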
+def A4_addp_c : HInst<
+(outs DoubleRegs:$Rdd32, PredRegs:$Px4),
+(ins DoubleRegs:$Rss32, DoubleRegs:$Rtt32, PredRegs:$Px4in),
+"$Rdd32 = add($Rss32,$Rtt32,$Px4):carry",
+S_3op_tc_1_SLOT23, TypeS_3op>, Enc_151014 {
+let Inst{7-7} = 0b0;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11000010110;
+let isPredicateLate = 1;
+let Constraints = "$Px4 = $Px4in";
+}
+def A4_andn : HInst<
+(outs IntRegs:$Rd32),
+(ins IntRegs:$Rt32, IntRegs:$Rs32),
+"$Rd32 = and($Rt32,~$Rs32)",
+ALU32_3op_tc_1_SLOT0123, TypeALU32_3op>, Enc_8605375 {
+let Inst{7-5} = 0b000;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11110001100;
+let hasNewValue = 1;
+let opNewValue = 0;
+let InputType = "reg";
+}
+def A4_andnp : HInst<
+(outs DoubleRegs:$Rdd32),
+(ins DoubleRegs:$Rtt32, DoubleRegs:$Rss32),
+"$Rdd32 = and($Rtt32,~$Rss32)",
+ALU64_tc_1_SLOT23, TypeALU64>, Enc_11687333 {
+let Inst{7-5} = 0b001;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11010011111;
+}
+def A4_bitsplit : HInst<
+(outs DoubleRegs:$Rdd32),
+(ins IntRegs:$Rs32, IntRegs:$Rt32),
+"$Rdd32 = bitsplit($Rs32,$Rt32)",
+ALU64_tc_1_SLOT23, TypeALU64>, Enc_1997594 {
+let Inst{7-5} = 0b000;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11010100001;
+}
+def A4_bitspliti : HInst<
+(outs DoubleRegs:$Rdd32),
+(ins IntRegs:$Rs32, u5_0Imm:$Ii),
+"$Rdd32 = bitsplit($Rs32,#$Ii)",
+S_2op_tc_2_SLOT23, TypeS_2op>, Enc_5654851 {
+let Inst{7-5} = 0b100;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b10001000110;
+}
+def A4_boundscheck : HInst<
+(outs PredRegs:$Pd4),
+(ins IntRegs:$Rs32, DoubleRegs:$Rtt32),
+"$Pd4 = boundscheck($Rs32,$Rtt32)",
+M_tc_3x_SLOT23, TypeALU64> {
+let isPseudo = 1;
+}
+def A4_boundscheck_hi : HInst<
+(outs PredRegs:$Pd4),
+(ins DoubleRegs:$Rss32, DoubleRegs:$Rtt32),
+"$Pd4 = boundscheck($Rss32,$Rtt32):raw:hi",
+ALU64_tc_2_SLOT23, TypeALU64>, Enc_3831744 {
+let Inst{7-2} = 0b101000;
+let Inst{13-13} = 0b1;
+let Inst{31-21} = 0b11010010000;
+}
+def A4_boundscheck_lo : HInst<
+(outs PredRegs:$Pd4),
+(ins DoubleRegs:$Rss32, DoubleRegs:$Rtt32),
+"$Pd4 = boundscheck($Rss32,$Rtt32):raw:lo",
+ALU64_tc_2_SLOT23, TypeALU64>, Enc_3831744 {
+let Inst{7-2} = 0b100000;
+let Inst{13-13} = 0b1;
+let Inst{31-21} = 0b11010010000;
+}
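+// A4_cmp{b,h}*: compares on the low byte/halfword of the operands. The
+// immediate forms carry constant-extender bookkeeping (isExtendable,
+// opExtendable, opExtentBits) and pair with their register forms via
+// ImmRegRel/CextOpcode.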
+def A4_cmpbeq : HInst<
+(outs PredRegs:$Pd4),
+(ins IntRegs:$Rs32, IntRegs:$Rt32),
+"$Pd4 = cmpb.eq($Rs32,$Rt32)",
+S_3op_tc_2early_SLOT23, TypeS_3op>, Enc_10157519, ImmRegRel {
+let Inst{7-2} = 0b110000;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11000111110;
+let CextOpcode = "A4_cmpbeq";
+let InputType = "reg";
+let isCommutable = 1;
+let isCompare = 1;
+}
+def A4_cmpbeqi : HInst<
+(outs PredRegs:$Pd4),
+(ins IntRegs:$Rs32, u8_0Imm:$Ii),
+"$Pd4 = cmpb.eq($Rs32,#$Ii)",
+ALU64_tc_2early_SLOT23, TypeALU64>, Enc_6736678, ImmRegRel {
+let Inst{4-2} = 0b000;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11011101000;
+let CextOpcode = "A4_cmpbeq";
+let InputType = "imm";
+let isCommutable = 1;
+let isCompare = 1;
+}
+def A4_cmpbgt : HInst<
+(outs PredRegs:$Pd4),
+(ins IntRegs:$Rs32, IntRegs:$Rt32),
+"$Pd4 = cmpb.gt($Rs32,$Rt32)",
+S_3op_tc_2early_SLOT23, TypeS_3op>, Enc_10157519, ImmRegRel {
+let Inst{7-2} = 0b010000;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11000111110;
+let CextOpcode = "A4_cmpbgt";
+let InputType = "reg";
+let isCompare = 1;
+}
+def A4_cmpbgti : HInst<
+(outs PredRegs:$Pd4),
+(ins IntRegs:$Rs32, s8_0Imm:$Ii),
+"$Pd4 = cmpb.gt($Rs32,#$Ii)",
+ALU64_tc_2early_SLOT23, TypeALU64>, Enc_6736678, ImmRegRel {
+let Inst{4-2} = 0b000;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11011101001;
+let CextOpcode = "A4_cmpbgt";
+let InputType = "imm";
+let isCompare = 1;
+}
+def A4_cmpbgtu : HInst<
+(outs PredRegs:$Pd4),
+(ins IntRegs:$Rs32, IntRegs:$Rt32),
+"$Pd4 = cmpb.gtu($Rs32,$Rt32)",
+S_3op_tc_2early_SLOT23, TypeS_3op>, Enc_10157519, ImmRegRel {
+let Inst{7-2} = 0b111000;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11000111110;
+let CextOpcode = "A4_cmpbgtu";
+let InputType = "reg";
+let isCompare = 1;
+}
+def A4_cmpbgtui : HInst<
+(outs PredRegs:$Pd4),
+(ins IntRegs:$Rs32, u32_0Imm:$Ii),
+"$Pd4 = cmpb.gtu($Rs32,#$Ii)",
+ALU64_tc_2early_SLOT23, TypeALU64>, Enc_3531000, ImmRegRel {
+let Inst{4-2} = 0b000;
+let Inst{13-12} = 0b00;
+let Inst{31-21} = 0b11011101010;
+let CextOpcode = "A4_cmpbgtu";
+let InputType = "imm";
+let isCompare = 1;
+let isExtendable = 1;
+let opExtendable = 2;
+let isExtentSigned = 0;
+let opExtentBits = 7;
+let opExtentAlign = 0;
+}
+def A4_cmpheq : HInst<
+(outs PredRegs:$Pd4),
+(ins IntRegs:$Rs32, IntRegs:$Rt32),
+"$Pd4 = cmph.eq($Rs32,$Rt32)",
+S_3op_tc_2early_SLOT23, TypeS_3op>, Enc_10157519, ImmRegRel {
+let Inst{7-2} = 0b011000;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11000111110;
+let CextOpcode = "A4_cmpheq";
+let InputType = "reg";
+let isCommutable = 1;
+let isCompare = 1;
+}
+def A4_cmpheqi : HInst<
+(outs PredRegs:$Pd4),
+(ins IntRegs:$Rs32, s32_0Imm:$Ii),
+"$Pd4 = cmph.eq($Rs32,#$Ii)",
+ALU64_tc_2early_SLOT23, TypeALU64>, Enc_6736678, ImmRegRel {
+let Inst{4-2} = 0b010;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11011101000;
+let CextOpcode = "A4_cmpheq";
+let InputType = "imm";
+let isCommutable = 1;
+let isCompare = 1;
+let isExtendable = 1;
+let opExtendable = 2;
+let isExtentSigned = 1;
+let opExtentBits = 8;
+let opExtentAlign = 0;
+}
+def A4_cmphgt : HInst<
+(outs PredRegs:$Pd4),
+(ins IntRegs:$Rs32, IntRegs:$Rt32),
+"$Pd4 = cmph.gt($Rs32,$Rt32)",
+S_3op_tc_2early_SLOT23, TypeS_3op>, Enc_10157519, ImmRegRel {
+let Inst{7-2} = 0b100000;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11000111110;
+let CextOpcode = "A4_cmphgt";
+let InputType = "reg";
+let isCompare = 1;
+}
+def A4_cmphgti : HInst<
+(outs PredRegs:$Pd4),
+(ins IntRegs:$Rs32, s32_0Imm:$Ii),
+"$Pd4 = cmph.gt($Rs32,#$Ii)",
+ALU64_tc_2early_SLOT23, TypeALU64>, Enc_6736678, ImmRegRel {
+let Inst{4-2} = 0b010;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11011101001;
+let CextOpcode = "A4_cmphgt";
+let InputType = "imm";
+let isCompare = 1;
+let isExtendable = 1;
+let opExtendable = 2;
+let isExtentSigned = 1;
+let opExtentBits = 8;
+let opExtentAlign = 0;
+}
+def A4_cmphgtu : HInst<
+(outs PredRegs:$Pd4),
+(ins IntRegs:$Rs32, IntRegs:$Rt32),
+"$Pd4 = cmph.gtu($Rs32,$Rt32)",
+S_3op_tc_2early_SLOT23, TypeS_3op>, Enc_10157519, ImmRegRel {
+let Inst{7-2} = 0b101000;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11000111110;
+let CextOpcode = "A4_cmphgtu";
+let InputType = "reg";
+let isCompare = 1;
+}
+def A4_cmphgtui : HInst<
+(outs PredRegs:$Pd4),
+(ins IntRegs:$Rs32, u32_0Imm:$Ii),
+"$Pd4 = cmph.gtu($Rs32,#$Ii)",
+ALU64_tc_2early_SLOT23, TypeALU64>, Enc_3531000, ImmRegRel {
+let Inst{4-2} = 0b010;
+let Inst{13-12} = 0b00;
+let Inst{31-21} = 0b11011101010;
+let CextOpcode = "A4_cmphgtu";
+let InputType = "imm";
+let isCompare = 1;
+let isExtendable = 1;
+let opExtendable = 2;
+let isExtentSigned = 0;
+let opExtentBits = 7;
+let opExtentAlign = 0;
+}
+def A4_combineii : HInst<
+(outs DoubleRegs:$Rdd32),
+(ins s8_0Imm:$Ii, u32_0Imm:$II),
+"$Rdd32 = combine(#$Ii,#$II)",
+ALU32_2op_tc_1_SLOT0123, TypeALU32_2op>, Enc_9864697 {
+let Inst{31-21} = 0b01111100100;
+let isExtendable = 1;
+let opExtendable = 2;
+let isExtentSigned = 0;
+let opExtentBits = 6;
+let opExtentAlign = 0;
+}
+def A4_combineir : HInst<
+(outs DoubleRegs:$Rdd32),
+(ins s32_0Imm:$Ii, IntRegs:$Rs32),
+"$Rdd32 = combine(#$Ii,$Rs32)",
+ALU32_2op_tc_1_SLOT0123, TypeALU32_2op>, Enc_2462143 {
+let Inst{13-13} = 0b1;
+let Inst{31-21} = 0b01110011001;
+let isExtendable = 1;
+let opExtendable = 1;
+let isExtentSigned = 1;
+let opExtentBits = 8;
+let opExtentAlign = 0;
+}
+def A4_combineri : HInst<
+(outs DoubleRegs:$Rdd32),
+(ins IntRegs:$Rs32, s32_0Imm:$Ii),
+"$Rdd32 = combine($Rs32,#$Ii)",
+ALU32_2op_tc_1_SLOT0123, TypeALU32_2op>, Enc_2462143 {
+let Inst{13-13} = 0b1;
+let Inst{31-21} = 0b01110011000;
+let isExtendable = 1;
+let opExtendable = 2;
+let isExtentSigned = 1;
+let opExtentBits = 8;
+let opExtentAlign = 0;
+}
+def A4_cround_ri : HInst<
+(outs IntRegs:$Rd32),
+(ins IntRegs:$Rs32, u5_0Imm:$Ii),
+"$Rd32 = cround($Rs32,#$Ii)",
+S_2op_tc_2_SLOT23, TypeS_2op>, Enc_2771456 {
+let Inst{7-5} = 0b000;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b10001100111;
+let hasNewValue = 1;
+let opNewValue = 0;
+let prefersSlot3 = 1;
+}
+def A4_cround_rr : HInst<
+(outs IntRegs:$Rd32),
+(ins IntRegs:$Rs32, IntRegs:$Rt32),
+"$Rd32 = cround($Rs32,$Rt32)",
+S_3op_tc_2_SLOT23, TypeS_3op>, Enc_14071773 {
+let Inst{7-5} = 0b000;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11000110110;
+let hasNewValue = 1;
+let opNewValue = 0;
+let prefersSlot3 = 1;
+}
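+// A4_ext is the immext constant-extender word: its u26_6Imm operand
+// provides the upper 26 bits of a 32-bit extended immediate for the next
+// extendable instruction in the packet.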
+def A4_ext : HInst<
+(outs),
+(ins u26_6Imm:$Ii),
+"immext(#$Ii)",
+EXTENDER_tc_1_SLOT0123, TypeEXTENDER>, Enc_2082956 {
+let Inst{31-28} = 0b0000;
+}
+def A4_modwrapu : HInst<
+(outs IntRegs:$Rd32),
+(ins IntRegs:$Rs32, IntRegs:$Rt32),
+"$Rd32 = modwrap($Rs32,$Rt32)",
+ALU64_tc_2_SLOT23, TypeALU64>, Enc_14071773 {
+let Inst{7-5} = 0b111;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11010011111;
+let hasNewValue = 1;
+let opNewValue = 0;
+let prefersSlot3 = 1;
+}
+def A4_orn : HInst<
+(outs IntRegs:$Rd32),
+(ins IntRegs:$Rt32, IntRegs:$Rs32),
+"$Rd32 = or($Rt32,~$Rs32)",
+ALU32_3op_tc_1_SLOT0123, TypeALU32_3op>, Enc_8605375 {
+let Inst{7-5} = 0b000;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11110001101;
+let hasNewValue = 1;
+let opNewValue = 0;
+let InputType = "reg";
+}
+def A4_ornp : HInst<
+(outs DoubleRegs:$Rdd32),
+(ins DoubleRegs:$Rtt32, DoubleRegs:$Rss32),
+"$Rdd32 = or($Rtt32,~$Rss32)",
+ALU64_tc_1_SLOT23, TypeALU64>, Enc_11687333 {
+let Inst{7-5} = 0b011;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11010011111;
+}
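+// A4_p<op>{t,f}[new]: predicated forms of the ALU32 2-op transforms
+// (aslh/asrh/sxtb/sxth/zxtb/zxth). t/f is the predicate sense; the "new"
+// forms test a .new predicate produced in the same packet. PredNewRel and
+// BaseOpcode relate each form to its unpredicated base instruction.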
+def A4_paslhf : HInst<
+(outs IntRegs:$Rd32),
+(ins PredRegs:$Pu4, IntRegs:$Rs32),
+"if (!$Pu4) $Rd32 = aslh($Rs32)",
+ALU32_2op_tc_1_SLOT0123, TypeALU32_2op>, Enc_9422954, PredNewRel {
+let Inst{7-5} = 0b000;
+let Inst{13-10} = 0b1010;
+let Inst{31-21} = 0b01110000000;
+let isPredicated = 1;
+let isPredicatedFalse = 1;
+let hasNewValue = 1;
+let opNewValue = 0;
+let BaseOpcode = "A2_aslh";
+}
+def A4_paslhfnew : HInst<
+(outs IntRegs:$Rd32),
+(ins PredRegs:$Pu4, IntRegs:$Rs32),
+"if (!$Pu4.new) $Rd32 = aslh($Rs32)",
+ALU32_2op_tc_1_SLOT0123, TypeALU32_2op>, Enc_9422954, PredNewRel {
+let Inst{7-5} = 0b000;
+let Inst{13-10} = 0b1011;
+let Inst{31-21} = 0b01110000000;
+let isPredicated = 1;
+let isPredicatedFalse = 1;
+let hasNewValue = 1;
+let opNewValue = 0;
+let isPredicatedNew = 1;
+let BaseOpcode = "A2_aslh";
+}
+def A4_paslht : HInst<
+(outs IntRegs:$Rd32),
+(ins PredRegs:$Pu4, IntRegs:$Rs32),
+"if ($Pu4) $Rd32 = aslh($Rs32)",
+ALU32_2op_tc_1_SLOT0123, TypeALU32_2op>, Enc_9422954, PredNewRel {
+let Inst{7-5} = 0b000;
+let Inst{13-10} = 0b1000;
+let Inst{31-21} = 0b01110000000;
+let isPredicated = 1;
+let hasNewValue = 1;
+let opNewValue = 0;
+let BaseOpcode = "A2_aslh";
+}
+def A4_paslhtnew : HInst<
+(outs IntRegs:$Rd32),
+(ins PredRegs:$Pu4, IntRegs:$Rs32),
+"if ($Pu4.new) $Rd32 = aslh($Rs32)",
+ALU32_2op_tc_1_SLOT0123, TypeALU32_2op>, Enc_9422954, PredNewRel {
+let Inst{7-5} = 0b000;
+let Inst{13-10} = 0b1001;
+let Inst{31-21} = 0b01110000000;
+let isPredicated = 1;
+let hasNewValue = 1;
+let opNewValue = 0;
+let isPredicatedNew = 1;
+let BaseOpcode = "A2_aslh";
+}
+def A4_pasrhf : HInst<
+(outs IntRegs:$Rd32),
+(ins PredRegs:$Pu4, IntRegs:$Rs32),
+"if (!$Pu4) $Rd32 = asrh($Rs32)",
+ALU32_2op_tc_1_SLOT0123, TypeALU32_2op>, Enc_9422954, PredNewRel {
+let Inst{7-5} = 0b000;
+let Inst{13-10} = 0b1010;
+let Inst{31-21} = 0b01110000001;
+let isPredicated = 1;
+let isPredicatedFalse = 1;
+let hasNewValue = 1;
+let opNewValue = 0;
+let BaseOpcode = "A2_asrh";
+}
+def A4_pasrhfnew : HInst<
+(outs IntRegs:$Rd32),
+(ins PredRegs:$Pu4, IntRegs:$Rs32),
+"if (!$Pu4.new) $Rd32 = asrh($Rs32)",
+ALU32_2op_tc_1_SLOT0123, TypeALU32_2op>, Enc_9422954, PredNewRel {
+let Inst{7-5} = 0b000;
+let Inst{13-10} = 0b1011;
+let Inst{31-21} = 0b01110000001;
+let isPredicated = 1;
+let isPredicatedFalse = 1;
+let hasNewValue = 1;
+let opNewValue = 0;
+let isPredicatedNew = 1;
+let BaseOpcode = "A2_asrh";
+}
+def A4_pasrht : HInst<
+(outs IntRegs:$Rd32),
+(ins PredRegs:$Pu4, IntRegs:$Rs32),
+"if ($Pu4) $Rd32 = asrh($Rs32)",
+ALU32_2op_tc_1_SLOT0123, TypeALU32_2op>, Enc_9422954, PredNewRel {
+let Inst{7-5} = 0b000;
+let Inst{13-10} = 0b1000;
+let Inst{31-21} = 0b01110000001;
+let isPredicated = 1;
+let hasNewValue = 1;
+let opNewValue = 0;
+let BaseOpcode = "A2_asrh";
+}
+def A4_pasrhtnew : HInst<
+(outs IntRegs:$Rd32),
+(ins PredRegs:$Pu4, IntRegs:$Rs32),
+"if ($Pu4.new) $Rd32 = asrh($Rs32)",
+ALU32_2op_tc_1_SLOT0123, TypeALU32_2op>, Enc_9422954, PredNewRel {
+let Inst{7-5} = 0b000;
+let Inst{13-10} = 0b1001;
+let Inst{31-21} = 0b01110000001;
+let isPredicated = 1;
+let hasNewValue = 1;
+let opNewValue = 0;
+let isPredicatedNew = 1;
+let BaseOpcode = "A2_asrh";
+}
+def A4_psxtbf : HInst<
+(outs IntRegs:$Rd32),
+(ins PredRegs:$Pu4, IntRegs:$Rs32),
+"if (!$Pu4) $Rd32 = sxtb($Rs32)",
+ALU32_2op_tc_1_SLOT0123, TypeALU32_2op>, Enc_9422954, PredNewRel {
+let Inst{7-5} = 0b000;
+let Inst{13-10} = 0b1010;
+let Inst{31-21} = 0b01110000101;
+let isPredicated = 1;
+let isPredicatedFalse = 1;
+let hasNewValue = 1;
+let opNewValue = 0;
+let BaseOpcode = "A2_sxtb";
+}
+def A4_psxtbfnew : HInst<
+(outs IntRegs:$Rd32),
+(ins PredRegs:$Pu4, IntRegs:$Rs32),
+"if (!$Pu4.new) $Rd32 = sxtb($Rs32)",
+ALU32_2op_tc_1_SLOT0123, TypeALU32_2op>, Enc_9422954, PredNewRel {
+let Inst{7-5} = 0b000;
+let Inst{13-10} = 0b1011;
+let Inst{31-21} = 0b01110000101;
+let isPredicated = 1;
+let isPredicatedFalse = 1;
+let hasNewValue = 1;
+let opNewValue = 0;
+let isPredicatedNew = 1;
+let BaseOpcode = "A2_sxtb";
+}
+def A4_psxtbt : HInst<
+(outs IntRegs:$Rd32),
+(ins PredRegs:$Pu4, IntRegs:$Rs32),
+"if ($Pu4) $Rd32 = sxtb($Rs32)",
+ALU32_2op_tc_1_SLOT0123, TypeALU32_2op>, Enc_9422954, PredNewRel {
+let Inst{7-5} = 0b000;
+let Inst{13-10} = 0b1000;
+let Inst{31-21} = 0b01110000101;
+let isPredicated = 1;
+let hasNewValue = 1;
+let opNewValue = 0;
+let BaseOpcode = "A2_sxtb";
+}
+def A4_psxtbtnew : HInst<
+(outs IntRegs:$Rd32),
+(ins PredRegs:$Pu4, IntRegs:$Rs32),
+"if ($Pu4.new) $Rd32 = sxtb($Rs32)",
+ALU32_2op_tc_1_SLOT0123, TypeALU32_2op>, Enc_9422954, PredNewRel {
+let Inst{7-5} = 0b000;
+let Inst{13-10} = 0b1001;
+let Inst{31-21} = 0b01110000101;
+let isPredicated = 1;
+let hasNewValue = 1;
+let opNewValue = 0;
+let isPredicatedNew = 1;
+let BaseOpcode = "A2_sxtb";
+}
+def A4_psxthf : HInst<
+(outs IntRegs:$Rd32),
+(ins PredRegs:$Pu4, IntRegs:$Rs32),
+"if (!$Pu4) $Rd32 = sxth($Rs32)",
+ALU32_2op_tc_1_SLOT0123, TypeALU32_2op>, Enc_9422954, PredNewRel {
+let Inst{7-5} = 0b000;
+let Inst{13-10} = 0b1010;
+let Inst{31-21} = 0b01110000111;
+let isPredicated = 1;
+let isPredicatedFalse = 1;
+let hasNewValue = 1;
+let opNewValue = 0;
+let BaseOpcode = "A2_sxth";
+}
+def A4_psxthfnew : HInst<
+(outs IntRegs:$Rd32),
+(ins PredRegs:$Pu4, IntRegs:$Rs32),
+"if (!$Pu4.new) $Rd32 = sxth($Rs32)",
+ALU32_2op_tc_1_SLOT0123, TypeALU32_2op>, Enc_9422954, PredNewRel {
+let Inst{7-5} = 0b000;
+let Inst{13-10} = 0b1011;
+let Inst{31-21} = 0b01110000111;
+let isPredicated = 1;
+let isPredicatedFalse = 1;
+let hasNewValue = 1;
+let opNewValue = 0;
+let isPredicatedNew = 1;
+let BaseOpcode = "A2_sxth";
+}
+def A4_psxtht : HInst<
+(outs IntRegs:$Rd32),
+(ins PredRegs:$Pu4, IntRegs:$Rs32),
+"if ($Pu4) $Rd32 = sxth($Rs32)",
+ALU32_2op_tc_1_SLOT0123, TypeALU32_2op>, Enc_9422954, PredNewRel {
+let Inst{7-5} = 0b000;
+let Inst{13-10} = 0b1000;
+let Inst{31-21} = 0b01110000111;
+let isPredicated = 1;
+let hasNewValue = 1;
+let opNewValue = 0;
+let BaseOpcode = "A2_sxth";
+}
+def A4_psxthtnew : HInst<
+(outs IntRegs:$Rd32),
+(ins PredRegs:$Pu4, IntRegs:$Rs32),
+"if ($Pu4.new) $Rd32 = sxth($Rs32)",
+ALU32_2op_tc_1_SLOT0123, TypeALU32_2op>, Enc_9422954, PredNewRel {
+let Inst{7-5} = 0b000;
+let Inst{13-10} = 0b1001;
+let Inst{31-21} = 0b01110000111;
+let isPredicated = 1;
+let hasNewValue = 1;
+let opNewValue = 0;
+let isPredicatedNew = 1;
+let BaseOpcode = "A2_sxth";
+}
+def A4_pzxtbf : HInst<
+(outs IntRegs:$Rd32),
+(ins PredRegs:$Pu4, IntRegs:$Rs32),
+"if (!$Pu4) $Rd32 = zxtb($Rs32)",
+ALU32_2op_tc_1_SLOT0123, TypeALU32_2op>, Enc_9422954, PredNewRel {
+let Inst{7-5} = 0b000;
+let Inst{13-10} = 0b1010;
+let Inst{31-21} = 0b01110000100;
+let isPredicated = 1;
+let isPredicatedFalse = 1;
+let hasNewValue = 1;
+let opNewValue = 0;
+let BaseOpcode = "A2_zxtb";
+}
+def A4_pzxtbfnew : HInst<
+(outs IntRegs:$Rd32),
+(ins PredRegs:$Pu4, IntRegs:$Rs32),
+"if (!$Pu4.new) $Rd32 = zxtb($Rs32)",
+ALU32_2op_tc_1_SLOT0123, TypeALU32_2op>, Enc_9422954, PredNewRel {
+let Inst{7-5} = 0b000;
+let Inst{13-10} = 0b1011;
+let Inst{31-21} = 0b01110000100;
+let isPredicated = 1;
+let isPredicatedFalse = 1;
+let hasNewValue = 1;
+let opNewValue = 0;
+let isPredicatedNew = 1;
+let BaseOpcode = "A2_zxtb";
+}
+def A4_pzxtbt : HInst<
+(outs IntRegs:$Rd32),
+(ins PredRegs:$Pu4, IntRegs:$Rs32),
+"if ($Pu4) $Rd32 = zxtb($Rs32)",
+ALU32_2op_tc_1_SLOT0123, TypeALU32_2op>, Enc_9422954, PredNewRel {
+let Inst{7-5} = 0b000;
+let Inst{13-10} = 0b1000;
+let Inst{31-21} = 0b01110000100;
+let isPredicated = 1;
+let hasNewValue = 1;
+let opNewValue = 0;
+let BaseOpcode = "A2_zxtb";
+}
+def A4_pzxtbtnew : HInst<
+(outs IntRegs:$Rd32),
+(ins PredRegs:$Pu4, IntRegs:$Rs32),
+"if ($Pu4.new) $Rd32 = zxtb($Rs32)",
+ALU32_2op_tc_1_SLOT0123, TypeALU32_2op>, Enc_9422954, PredNewRel {
+let Inst{7-5} = 0b000;
+let Inst{13-10} = 0b1001;
+let Inst{31-21} = 0b01110000100;
+let isPredicated = 1;
+let hasNewValue = 1;
+let opNewValue = 0;
+let isPredicatedNew = 1;
+let BaseOpcode = "A2_zxtb";
+}
+def A4_pzxthf : HInst<
+(outs IntRegs:$Rd32),
+(ins PredRegs:$Pu4, IntRegs:$Rs32),
+"if (!$Pu4) $Rd32 = zxth($Rs32)",
+ALU32_2op_tc_1_SLOT0123, TypeALU32_2op>, Enc_9422954, PredNewRel {
+let Inst{7-5} = 0b000;
+let Inst{13-10} = 0b1010;
+let Inst{31-21} = 0b01110000110;
+let isPredicated = 1;
+let isPredicatedFalse = 1;
+let hasNewValue = 1;
+let opNewValue = 0;
+let BaseOpcode = "A2_zxth";
+}
+def A4_pzxthfnew : HInst<
+(outs IntRegs:$Rd32),
+(ins PredRegs:$Pu4, IntRegs:$Rs32),
+"if (!$Pu4.new) $Rd32 = zxth($Rs32)",
+ALU32_2op_tc_1_SLOT0123, TypeALU32_2op>, Enc_9422954, PredNewRel {
+let Inst{7-5} = 0b000;
+let Inst{13-10} = 0b1011;
+let Inst{31-21} = 0b01110000110;
+let isPredicated = 1;
+let isPredicatedFalse = 1;
+let hasNewValue = 1;
+let opNewValue = 0;
+let isPredicatedNew = 1;
+let BaseOpcode = "A2_zxth";
+}
+def A4_pzxtht : HInst<
+(outs IntRegs:$Rd32),
+(ins PredRegs:$Pu4, IntRegs:$Rs32),
+"if ($Pu4) $Rd32 = zxth($Rs32)",
+ALU32_2op_tc_1_SLOT0123, TypeALU32_2op>, Enc_9422954, PredNewRel {
+let Inst{7-5} = 0b000;
+let Inst{13-10} = 0b1000;
+let Inst{31-21} = 0b01110000110;
+let isPredicated = 1;
+let hasNewValue = 1;
+let opNewValue = 0;
+let BaseOpcode = "A2_zxth";
+}
+def A4_pzxthtnew : HInst<
+(outs IntRegs:$Rd32),
+(ins PredRegs:$Pu4, IntRegs:$Rs32),
+"if ($Pu4.new) $Rd32 = zxth($Rs32)",
+ALU32_2op_tc_1_SLOT0123, TypeALU32_2op>, Enc_9422954, PredNewRel {
+let Inst{7-5} = 0b000;
+let Inst{13-10} = 0b1001;
+let Inst{31-21} = 0b01110000110;
+let isPredicated = 1;
+let hasNewValue = 1;
+let opNewValue = 0;
+let isPredicatedNew = 1;
+let BaseOpcode = "A2_zxth";
+}
+def A4_rcmpeq : HInst<
+(outs IntRegs:$Rd32),
+(ins IntRegs:$Rs32, IntRegs:$Rt32),
+"$Rd32 = cmp.eq($Rs32,$Rt32)",
+ALU32_3op_tc_1_SLOT0123, TypeALU32_3op>, Enc_14071773, ImmRegRel {
+let Inst{7-5} = 0b000;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11110011010;
+let hasNewValue = 1;
+let opNewValue = 0;
+let CextOpcode = "A4_rcmpeq";
+let InputType = "reg";
+let isCommutable = 1;
+}
+def A4_rcmpeqi : HInst<
+(outs IntRegs:$Rd32),
+(ins IntRegs:$Rs32, s32_0Imm:$Ii),
+"$Rd32 = cmp.eq($Rs32,#$Ii)",
+ALU32_2op_tc_1_SLOT0123, TypeALU32_2op>, Enc_16355964, ImmRegRel {
+let Inst{13-13} = 0b1;
+let Inst{31-21} = 0b01110011010;
+let hasNewValue = 1;
+let opNewValue = 0;
+let CextOpcode = "A4_rcmpeqi";
+let InputType = "imm";
+let isExtendable = 1;
+let opExtendable = 2;
+let isExtentSigned = 1;
+let opExtentBits = 8;
+let opExtentAlign = 0;
+}
+def A4_rcmpneq : HInst<
+(outs IntRegs:$Rd32),
+(ins IntRegs:$Rs32, IntRegs:$Rt32),
+"$Rd32 = !cmp.eq($Rs32,$Rt32)",
+ALU32_3op_tc_1_SLOT0123, TypeALU32_3op>, Enc_14071773, ImmRegRel {
+let Inst{7-5} = 0b000;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11110011011;
+let hasNewValue = 1;
+let opNewValue = 0;
+let CextOpcode = "A4_rcmpneq";
+let InputType = "reg";
+let isCommutable = 1;
+}
+def A4_rcmpneqi : HInst<
+(outs IntRegs:$Rd32),
+(ins IntRegs:$Rs32, s32_0Imm:$Ii),
+"$Rd32 = !cmp.eq($Rs32,#$Ii)",
+ALU32_2op_tc_1_SLOT0123, TypeALU32_2op>, Enc_16355964, ImmRegRel {
+let Inst{13-13} = 0b1;
+let Inst{31-21} = 0b01110011011;
+let hasNewValue = 1;
+let opNewValue = 0;
+let CextOpcode = "A4_rcmpeqi";
+let InputType = "imm";
+let isExtendable = 1;
+let opExtendable = 2;
+let isExtentSigned = 1;
+let opExtentBits = 8;
+let opExtentAlign = 0;
+}
+def A4_round_ri : HInst<
+(outs IntRegs:$Rd32),
+(ins IntRegs:$Rs32, u5_0Imm:$Ii),
+"$Rd32 = round($Rs32,#$Ii)",
+S_2op_tc_2_SLOT23, TypeS_2op>, Enc_2771456 {
+let Inst{7-5} = 0b100;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b10001100111;
+let hasNewValue = 1;
+let opNewValue = 0;
+let prefersSlot3 = 1;
+}
+def A4_round_ri_sat : HInst<
+(outs IntRegs:$Rd32),
+(ins IntRegs:$Rs32, u5_0Imm:$Ii),
+"$Rd32 = round($Rs32,#$Ii):sat",
+S_2op_tc_2_SLOT23, TypeS_2op>, Enc_2771456 {
+let Inst{7-5} = 0b110;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b10001100111;
+let hasNewValue = 1;
+let opNewValue = 0;
+let prefersSlot3 = 1;
+let Defs = [USR_OVF];
+}
+def A4_round_rr : HInst<
+(outs IntRegs:$Rd32),
+(ins IntRegs:$Rs32, IntRegs:$Rt32),
+"$Rd32 = round($Rs32,$Rt32)",
+S_3op_tc_2_SLOT23, TypeS_3op>, Enc_14071773 {
+let Inst{7-5} = 0b100;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11000110110;
+let hasNewValue = 1;
+let opNewValue = 0;
+let prefersSlot3 = 1;
+}
+def A4_round_rr_sat : HInst<
+(outs IntRegs:$Rd32),
+(ins IntRegs:$Rs32, IntRegs:$Rt32),
+"$Rd32 = round($Rs32,$Rt32):sat",
+S_3op_tc_2_SLOT23, TypeS_3op>, Enc_14071773 {
+let Inst{7-5} = 0b110;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11000110110;
+let hasNewValue = 1;
+let opNewValue = 0;
+let prefersSlot3 = 1;
+let Defs = [USR_OVF];
+}
+def A4_subp_c : HInst<
+(outs DoubleRegs:$Rdd32, PredRegs:$Px4),
+(ins DoubleRegs:$Rss32, DoubleRegs:$Rtt32, PredRegs:$Px4in),
+"$Rdd32 = sub($Rss32,$Rtt32,$Px4):carry",
+S_3op_tc_1_SLOT23, TypeS_3op>, Enc_151014 {
+let Inst{7-7} = 0b0;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11000010111;
+let isPredicateLate = 1;
+let Constraints = "$Px4 = $Px4in";
+}
+def A4_tfrcpp : HInst<
+(outs DoubleRegs:$Rdd32),
+(ins CtrRegs64:$Css32),
+"$Rdd32 = $Css32",
+CR_tc_3x_SLOT3, TypeCR>, Enc_13094118 {
+let Inst{13-5} = 0b000000000;
+let Inst{31-21} = 0b01101000000;
+}
+def A4_tfrpcp : HInst<
+(outs CtrRegs64:$Cdd32),
+(ins DoubleRegs:$Rss32),
+"$Cdd32 = $Rss32",
+CR_tc_3x_SLOT3, TypeCR>, Enc_1329520 {
+let Inst{13-5} = 0b000000000;
+let Inst{31-21} = 0b01100011001;
+}
+def A4_tlbmatch : HInst<
+(outs PredRegs:$Pd4),
+(ins DoubleRegs:$Rss32, IntRegs:$Rt32),
+"$Pd4 = tlbmatch($Rss32,$Rt32)",
+ALU64_tc_2early_SLOT23, TypeALU64>, Enc_2492727 {
+let Inst{7-2} = 0b011000;
+let Inst{13-13} = 0b1;
+let Inst{31-21} = 0b11010010000;
+let isPredicateLate = 1;
+}
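+// A4_vcmp*i: element-wise compares of a register pair against an
+// immediate; A4_vcmpbeq_any reduces the byte compare with any8().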
+def A4_vcmpbeq_any : HInst<
+(outs PredRegs:$Pd4),
+(ins DoubleRegs:$Rss32, DoubleRegs:$Rtt32),
+"$Pd4 = any8(vcmpb.eq($Rss32,$Rtt32))",
+ALU64_tc_2early_SLOT23, TypeALU64>, Enc_3831744 {
+let Inst{7-2} = 0b000000;
+let Inst{13-13} = 0b1;
+let Inst{31-21} = 0b11010010000;
+}
+def A4_vcmpbeqi : HInst<
+(outs PredRegs:$Pd4),
+(ins DoubleRegs:$Rss32, u8_0Imm:$Ii),
+"$Pd4 = vcmpb.eq($Rss32,#$Ii)",
+ALU64_tc_2early_SLOT23, TypeALU64>, Enc_13455308 {
+let Inst{4-2} = 0b000;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11011100000;
+}
+def A4_vcmpbgt : HInst<
+(outs PredRegs:$Pd4),
+(ins DoubleRegs:$Rss32, DoubleRegs:$Rtt32),
+"$Pd4 = vcmpb.gt($Rss32,$Rtt32)",
+ALU64_tc_2early_SLOT23, TypeALU64>, Enc_3831744 {
+let Inst{7-2} = 0b010000;
+let Inst{13-13} = 0b1;
+let Inst{31-21} = 0b11010010000;
+}
+def A4_vcmpbgti : HInst<
+(outs PredRegs:$Pd4),
+(ins DoubleRegs:$Rss32, s8_0Imm:$Ii),
+"$Pd4 = vcmpb.gt($Rss32,#$Ii)",
+ALU64_tc_2early_SLOT23, TypeALU64>, Enc_13455308 {
+let Inst{4-2} = 0b000;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11011100001;
+}
+def A4_vcmpbgtui : HInst<
+(outs PredRegs:$Pd4),
+(ins DoubleRegs:$Rss32, u7_0Imm:$Ii),
+"$Pd4 = vcmpb.gtu($Rss32,#$Ii)",
+ALU64_tc_2early_SLOT23, TypeALU64>, Enc_2968094 {
+let Inst{4-2} = 0b000;
+let Inst{13-12} = 0b00;
+let Inst{31-21} = 0b11011100010;
+}
+def A4_vcmpheqi : HInst<
+(outs PredRegs:$Pd4),
+(ins DoubleRegs:$Rss32, s8_0Imm:$Ii),
+"$Pd4 = vcmph.eq($Rss32,#$Ii)",
+ALU64_tc_2early_SLOT23, TypeALU64>, Enc_13455308 {
+let Inst{4-2} = 0b010;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11011100000;
+}
+def A4_vcmphgti : HInst<
+(outs PredRegs:$Pd4),
+(ins DoubleRegs:$Rss32, s8_0Imm:$Ii),
+"$Pd4 = vcmph.gt($Rss32,#$Ii)",
+ALU64_tc_2early_SLOT23, TypeALU64>, Enc_13455308 {
+let Inst{4-2} = 0b010;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11011100001;
+}
+def A4_vcmphgtui : HInst<
+(outs PredRegs:$Pd4),
+(ins DoubleRegs:$Rss32, u7_0Imm:$Ii),
+"$Pd4 = vcmph.gtu($Rss32,#$Ii)",
+ALU64_tc_2early_SLOT23, TypeALU64>, Enc_2968094 {
+let Inst{4-2} = 0b010;
+let Inst{13-12} = 0b00;
+let Inst{31-21} = 0b11011100010;
+}
+def A4_vcmpweqi : HInst<
+(outs PredRegs:$Pd4),
+(ins DoubleRegs:$Rss32, s8_0Imm:$Ii),
+"$Pd4 = vcmpw.eq($Rss32,#$Ii)",
+ALU64_tc_2early_SLOT23, TypeALU64>, Enc_13455308 {
+let Inst{4-2} = 0b100;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11011100000;
+}
+def A4_vcmpwgti : HInst<
+(outs PredRegs:$Pd4),
+(ins DoubleRegs:$Rss32, s8_0Imm:$Ii),
+"$Pd4 = vcmpw.gt($Rss32,#$Ii)",
+ALU64_tc_2early_SLOT23, TypeALU64>, Enc_13455308 {
+let Inst{4-2} = 0b100;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11011100001;
+}
+def A4_vcmpwgtui : HInst<
+(outs PredRegs:$Pd4),
+(ins DoubleRegs:$Rss32, u7_0Imm:$Ii),
+"$Pd4 = vcmpw.gtu($Rss32,#$Ii)",
+ALU64_tc_2early_SLOT23, TypeALU64>, Enc_2968094 {
+let Inst{4-2} = 0b100;
+let Inst{13-12} = 0b00;
+let Inst{31-21} = 0b11011100010;
+}
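+// A4_vrmax*/A4_vrmin*: vector reduce max/min into the accumulator pair
+// $Rxx32 (tied to $Rxx32in); Inst{13} selects the unsigned (u) variants.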
+def A4_vrmaxh : HInst<
+(outs DoubleRegs:$Rxx32),
+(ins DoubleRegs:$Rxx32in, DoubleRegs:$Rss32, IntRegs:$Ru32),
+"$Rxx32 = vrmaxh($Rss32,$Ru32)",
+S_3op_tc_3_SLOT23, TypeS_3op>, Enc_9773189 {
+let Inst{7-5} = 0b001;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11001011001;
+let prefersSlot3 = 1;
+let Constraints = "$Rxx32 = $Rxx32in";
+}
+def A4_vrmaxuh : HInst<
+(outs DoubleRegs:$Rxx32),
+(ins DoubleRegs:$Rxx32in, DoubleRegs:$Rss32, IntRegs:$Ru32),
+"$Rxx32 = vrmaxuh($Rss32,$Ru32)",
+S_3op_tc_3_SLOT23, TypeS_3op>, Enc_9773189 {
+let Inst{7-5} = 0b001;
+let Inst{13-13} = 0b1;
+let Inst{31-21} = 0b11001011001;
+let prefersSlot3 = 1;
+let Constraints = "$Rxx32 = $Rxx32in";
+}
+def A4_vrmaxuw : HInst<
+(outs DoubleRegs:$Rxx32),
+(ins DoubleRegs:$Rxx32in, DoubleRegs:$Rss32, IntRegs:$Ru32),
+"$Rxx32 = vrmaxuw($Rss32,$Ru32)",
+S_3op_tc_3_SLOT23, TypeS_3op>, Enc_9773189 {
+let Inst{7-5} = 0b010;
+let Inst{13-13} = 0b1;
+let Inst{31-21} = 0b11001011001;
+let prefersSlot3 = 1;
+let Constraints = "$Rxx32 = $Rxx32in";
+}
+def A4_vrmaxw : HInst<
+(outs DoubleRegs:$Rxx32),
+(ins DoubleRegs:$Rxx32in, DoubleRegs:$Rss32, IntRegs:$Ru32),
+"$Rxx32 = vrmaxw($Rss32,$Ru32)",
+S_3op_tc_3_SLOT23, TypeS_3op>, Enc_9773189 {
+let Inst{7-5} = 0b010;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11001011001;
+let prefersSlot3 = 1;
+let Constraints = "$Rxx32 = $Rxx32in";
+}
+def A4_vrminh : HInst<
+(outs DoubleRegs:$Rxx32),
+(ins DoubleRegs:$Rxx32in, DoubleRegs:$Rss32, IntRegs:$Ru32),
+"$Rxx32 = vrminh($Rss32,$Ru32)",
+S_3op_tc_3_SLOT23, TypeS_3op>, Enc_9773189 {
+let Inst{7-5} = 0b101;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11001011001;
+let prefersSlot3 = 1;
+let Constraints = "$Rxx32 = $Rxx32in";
+}
+def A4_vrminuh : HInst<
+(outs DoubleRegs:$Rxx32),
+(ins DoubleRegs:$Rxx32in, DoubleRegs:$Rss32, IntRegs:$Ru32),
+"$Rxx32 = vrminuh($Rss32,$Ru32)",
+S_3op_tc_3_SLOT23, TypeS_3op>, Enc_9773189 {
+let Inst{7-5} = 0b101;
+let Inst{13-13} = 0b1;
+let Inst{31-21} = 0b11001011001;
+let prefersSlot3 = 1;
+let Constraints = "$Rxx32 = $Rxx32in";
+}
+def A4_vrminuw : HInst<
+(outs DoubleRegs:$Rxx32),
+(ins DoubleRegs:$Rxx32in, DoubleRegs:$Rss32, IntRegs:$Ru32),
+"$Rxx32 = vrminuw($Rss32,$Ru32)",
+S_3op_tc_3_SLOT23, TypeS_3op>, Enc_9773189 {
+let Inst{7-5} = 0b110;
+let Inst{13-13} = 0b1;
+let Inst{31-21} = 0b11001011001;
+let prefersSlot3 = 1;
+let Constraints = "$Rxx32 = $Rxx32in";
+}
+def A4_vrminw : HInst<
+(outs DoubleRegs:$Rxx32),
+(ins DoubleRegs:$Rxx32in, DoubleRegs:$Rss32, IntRegs:$Ru32),
+"$Rxx32 = vrminw($Rss32,$Ru32)",
+S_3op_tc_3_SLOT23, TypeS_3op>, Enc_9773189 {
+let Inst{7-5} = 0b110;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11001011001;
+let prefersSlot3 = 1;
+let Constraints = "$Rxx32 = $Rxx32in";
+}
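+// A5_ACS: vacsh, a Viterbi-style add-compare-select (V55+). Writes both a
+// register pair and a late predicate, and saturates (defines USR_OVF).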
+def A5_ACS : HInst<
+(outs DoubleRegs:$Rxx32, PredRegs:$Pe4),
+(ins DoubleRegs:$Rxx32in, DoubleRegs:$Rss32, DoubleRegs:$Rtt32),
+"$Rxx32,$Pe4 = vacsh($Rss32,$Rtt32)",
+M_tc_3stall_SLOT23, TypeM>, Enc_12822813, Requires<[HasV55T]> {
+let Inst{7-7} = 0b0;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11101010101;
+let isPredicateLate = 1;
+let prefersSlot3 = 1;
+let Defs = [USR_OVF];
+let Constraints = "$Rxx32 = $Rxx32in";
+}
+def A5_vaddhubs : HInst<
+(outs IntRegs:$Rd32),
+(ins DoubleRegs:$Rss32, DoubleRegs:$Rtt32),
+"$Rd32 = vaddhub($Rss32,$Rtt32):sat",
+S_3op_tc_2_SLOT23, TypeS_3op>, Enc_9277990, Requires<[HasV5T]> {
+let Inst{7-5} = 0b001;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11000001010;
+let hasNewValue = 1;
+let opNewValue = 0;
+let prefersSlot3 = 1;
+let Defs = [USR_OVF];
+}
+def A6_vminub_RdP : HInst<
+(outs DoubleRegs:$Rdd32, PredRegs:$Pe4),
+(ins DoubleRegs:$Rtt32, DoubleRegs:$Rss32),
+"$Rdd32,$Pe4 = vminub($Rtt32,$Rss32)",
+M_tc_2_SLOT23, TypeM>, Enc_766909, Requires<[HasV62T]> {
+let Inst{7-7} = 0b0;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11101010111;
+let isPredicateLate = 1;
+let prefersSlot3 = 1;
+}
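+// C2_*: predicate-register operations: all8/any8 reductions, boolean
+// and/or/xor/not, bit tests (bitsclr/bitsset), and transfers between
+// predicate and general registers.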
+def C2_all8 : HInst<
+(outs PredRegs:$Pd4),
+(ins PredRegs:$Ps4),
+"$Pd4 = all8($Ps4)",
+CR_tc_2early_SLOT23, TypeCR>, Enc_6975103 {
+let Inst{13-2} = 0b000000000000;
+let Inst{31-18} = 0b01101011101000;
+}
+def C2_and : HInst<
+(outs PredRegs:$Pd4),
+(ins PredRegs:$Pt4, PredRegs:$Ps4),
+"$Pd4 = and($Pt4,$Ps4)",
+CR_tc_2early_SLOT23, TypeCR>, Enc_8891794 {
+let Inst{7-2} = 0b000000;
+let Inst{13-10} = 0b0000;
+let Inst{31-18} = 0b01101011000000;
+}
+def C2_andn : HInst<
+(outs PredRegs:$Pd4),
+(ins PredRegs:$Pt4, PredRegs:$Ps4),
+"$Pd4 = and($Pt4,!$Ps4)",
+CR_tc_2early_SLOT23, TypeCR>, Enc_8891794 {
+let Inst{7-2} = 0b000000;
+let Inst{13-10} = 0b0000;
+let Inst{31-18} = 0b01101011011000;
+}
+def C2_any8 : HInst<
+(outs PredRegs:$Pd4),
+(ins PredRegs:$Ps4),
+"$Pd4 = any8($Ps4)",
+CR_tc_2early_SLOT23, TypeCR>, Enc_6975103 {
+let Inst{13-2} = 0b000000000000;
+let Inst{31-18} = 0b01101011100000;
+}
+def C2_bitsclr : HInst<
+(outs PredRegs:$Pd4),
+(ins IntRegs:$Rs32, IntRegs:$Rt32),
+"$Pd4 = bitsclr($Rs32,$Rt32)",
+S_3op_tc_2early_SLOT23, TypeS_3op>, Enc_10157519 {
+let Inst{7-2} = 0b000000;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11000111100;
+}
+def C2_bitsclri : HInst<
+(outs PredRegs:$Pd4),
+(ins IntRegs:$Rs32, u6_0Imm:$Ii),
+"$Pd4 = bitsclr($Rs32,#$Ii)",
+S_2op_tc_2early_SLOT23, TypeS_2op>, Enc_14574598 {
+let Inst{7-2} = 0b000000;
+let Inst{31-21} = 0b10000101100;
+}
+def C2_bitsset : HInst<
+(outs PredRegs:$Pd4),
+(ins IntRegs:$Rs32, IntRegs:$Rt32),
+"$Pd4 = bitsset($Rs32,$Rt32)",
+S_3op_tc_2early_SLOT23, TypeS_3op>, Enc_10157519 {
+let Inst{7-2} = 0b000000;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11000111010;
+}
+def C2_ccombinewf : HInst<
+(outs DoubleRegs:$Rdd32),
+(ins PredRegs:$Pu4, IntRegs:$Rs32, IntRegs:$Rt32),
+"if (!$Pu4) $Rdd32 = combine($Rs32,$Rt32)",
+ALU32_3op_tc_1_SLOT0123, TypeALU32_3op>, Enc_8202458, PredNewRel {
+let Inst{7-7} = 0b1;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11111101000;
+let isPredicated = 1;
+let isPredicatedFalse = 1;
+let BaseOpcode = "A2_combinew";
+}
+def C2_ccombinewnewf : HInst<
+(outs DoubleRegs:$Rdd32),
+(ins PredRegs:$Pu4, IntRegs:$Rs32, IntRegs:$Rt32),
+"if (!$Pu4.new) $Rdd32 = combine($Rs32,$Rt32)",
+ALU32_3op_tc_1_SLOT0123, TypeALU32_3op>, Enc_8202458, PredNewRel {
+let Inst{7-7} = 0b1;
+let Inst{13-13} = 0b1;
+let Inst{31-21} = 0b11111101000;
+let isPredicated = 1;
+let isPredicatedFalse = 1;
+let isPredicatedNew = 1;
+let BaseOpcode = "A2_combinew";
+}
+def C2_ccombinewnewt : HInst<
+(outs DoubleRegs:$Rdd32),
+(ins PredRegs:$Pu4, IntRegs:$Rs32, IntRegs:$Rt32),
+"if ($Pu4.new) $Rdd32 = combine($Rs32,$Rt32)",
+ALU32_3op_tc_1_SLOT0123, TypeALU32_3op>, Enc_8202458, PredNewRel {
+let Inst{7-7} = 0b0;
+let Inst{13-13} = 0b1;
+let Inst{31-21} = 0b11111101000;
+let isPredicated = 1;
+let isPredicatedNew = 1;
+let BaseOpcode = "A2_combinew";
+}
+def C2_ccombinewt : HInst<
+(outs DoubleRegs:$Rdd32),
+(ins PredRegs:$Pu4, IntRegs:$Rs32, IntRegs:$Rt32),
+"if ($Pu4) $Rdd32 = combine($Rs32,$Rt32)",
+ALU32_3op_tc_1_SLOT0123, TypeALU32_3op>, Enc_8202458, PredNewRel {
+let Inst{7-7} = 0b0;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11111101000;
+let isPredicated = 1;
+let BaseOpcode = "A2_combinew";
+}
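+// C2_cmove{it,if} / C2_cmovenew{it,if}: conditional immediate transfers in
+// true/false and .new-predicate flavors; the 12-bit payload is extendable
+// to a full 32-bit immediate.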
+def C2_cmoveif : HInst<
+(outs IntRegs:$Rd32),
+(ins PredRegs:$Pu4, s32_0Imm:$Ii),
+"if (!$Pu4) $Rd32 = #$Ii",
+ALU32_2op_tc_1_SLOT0123, TypeALU32_2op>, Enc_9487067, PredNewRel, ImmRegRel {
+let Inst{13-13} = 0b0;
+let Inst{20-20} = 0b0;
+let Inst{31-23} = 0b011111101;
+let isPredicated = 1;
+let isPredicatedFalse = 1;
+let hasNewValue = 1;
+let opNewValue = 0;
+let CextOpcode = "A2_tfr";
+let InputType = "imm";
+let BaseOpcode = "A2_tfrsi";
+let isMoveImm = 1;
+let isExtendable = 1;
+let opExtendable = 2;
+let isExtentSigned = 1;
+let opExtentBits = 12;
+let opExtentAlign = 0;
+}
+def C2_cmoveit : HInst<
+(outs IntRegs:$Rd32),
+(ins PredRegs:$Pu4, s32_0Imm:$Ii),
+"if ($Pu4) $Rd32 = #$Ii",
+ALU32_2op_tc_1_SLOT0123, TypeALU32_2op>, Enc_9487067, PredNewRel, ImmRegRel {
+let Inst{13-13} = 0b0;
+let Inst{20-20} = 0b0;
+let Inst{31-23} = 0b011111100;
+let isPredicated = 1;
+let hasNewValue = 1;
+let opNewValue = 0;
+let CextOpcode = "A2_tfr";
+let InputType = "imm";
+let BaseOpcode = "A2_tfrsi";
+let isMoveImm = 1;
+let isExtendable = 1;
+let opExtendable = 2;
+let isExtentSigned = 1;
+let opExtentBits = 12;
+let opExtentAlign = 0;
+}
+def C2_cmovenewif : HInst<
+(outs IntRegs:$Rd32),
+(ins PredRegs:$Pu4, s32_0Imm:$Ii),
+"if (!$Pu4.new) $Rd32 = #$Ii",
+ALU32_2op_tc_1_SLOT0123, TypeALU32_2op>, Enc_9487067, PredNewRel, ImmRegRel {
+let Inst{13-13} = 0b1;
+let Inst{20-20} = 0b0;
+let Inst{31-23} = 0b011111101;
+let isPredicated = 1;
+let isPredicatedFalse = 1;
+let hasNewValue = 1;
+let opNewValue = 0;
+let isPredicatedNew = 1;
+let CextOpcode = "A2_tfr";
+let InputType = "imm";
+let BaseOpcode = "A2_tfrsi";
+let isMoveImm = 1;
+let isExtendable = 1;
+let opExtendable = 2;
+let isExtentSigned = 1;
+let opExtentBits = 12;
+let opExtentAlign = 0;
+}
+def C2_cmovenewit : HInst<
+(outs IntRegs:$Rd32),
+(ins PredRegs:$Pu4, s32_0Imm:$Ii),
+"if ($Pu4.new) $Rd32 = #$Ii",
+ALU32_2op_tc_1_SLOT0123, TypeALU32_2op>, Enc_9487067, PredNewRel, ImmRegRel {
+let Inst{13-13} = 0b1;
+let Inst{20-20} = 0b0;
+let Inst{31-23} = 0b011111100;
+let isPredicated = 1;
+let hasNewValue = 1;
+let opNewValue = 0;
+let isPredicatedNew = 1;
+let CextOpcode = "A2_tfr";
+let InputType = "imm";
+let BaseOpcode = "A2_tfrsi";
+let isMoveImm = 1;
+let isExtendable = 1;
+let opExtendable = 2;
+let isExtentSigned = 1;
+let opExtentBits = 12;
+let opExtentAlign = 0;
+}
+def C2_cmpeq : HInst<
+(outs PredRegs:$Pd4),
+(ins IntRegs:$Rs32, IntRegs:$Rt32),
+"$Pd4 = cmp.eq($Rs32,$Rt32)",
+ALU32_3op_tc_2early_SLOT0123, TypeALU32_3op>, Enc_10157519, ImmRegRel {
+let Inst{7-2} = 0b000000;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11110010000;
+let CextOpcode = "C2_cmpeq";
+let InputType = "reg";
+let isCommutable = 1;
+let isCompare = 1;
+}
+def C2_cmpeqi : HInst<
+(outs PredRegs:$Pd4),
+(ins IntRegs:$Rs32, s32_0Imm:$Ii),
+"$Pd4 = cmp.eq($Rs32,#$Ii)",
+ALU32_2op_tc_2early_SLOT0123, TypeALU32_2op>, Enc_16014536, ImmRegRel {
+let Inst{4-2} = 0b000;
+let Inst{31-22} = 0b0111010100;
+let CextOpcode = "C2_cmpeq";
+let InputType = "imm";
+let isCompare = 1;
+let isExtendable = 1;
+let opExtendable = 2;
+let isExtentSigned = 1;
+let opExtentBits = 10;
+let opExtentAlign = 0;
+}
+def C2_cmpeqp : HInst<
+(outs PredRegs:$Pd4),
+(ins DoubleRegs:$Rss32, DoubleRegs:$Rtt32),
+"$Pd4 = cmp.eq($Rss32,$Rtt32)",
+ALU64_tc_2early_SLOT23, TypeALU64>, Enc_3831744 {
+let Inst{7-2} = 0b000000;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11010010100;
+let isCommutable = 1;
+let isCompare = 1;
+}
+def C2_cmpgei : HInst<
+(outs PredRegs:$Pd4),
+(ins IntRegs:$Rs32, s8_0Imm:$Ii),
+"$Pd4 = cmp.ge($Rs32,#$Ii)",
+ALU32_2op_tc_1_SLOT0123, TypeALU32_2op> {
+let isCompare = 1;
+let isPseudo = 1;
+}
+def C2_cmpgeui : HInst<
+(outs PredRegs:$Pd4),
+(ins IntRegs:$Rs32, u8_0Imm:$Ii),
+"$Pd4 = cmp.geu($Rs32,#$Ii)",
+ALU32_2op_tc_1_SLOT0123, TypeALU32_2op> {
+let isCompare = 1;
+let isPseudo = 1;
+}
+def C2_cmpgt : HInst<
+(outs PredRegs:$Pd4),
+(ins IntRegs:$Rs32, IntRegs:$Rt32),
+"$Pd4 = cmp.gt($Rs32,$Rt32)",
+ALU32_3op_tc_2early_SLOT0123, TypeALU32_3op>, Enc_10157519, ImmRegRel {
+let Inst{7-2} = 0b000000;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11110010010;
+let CextOpcode = "C2_cmpgt";
+let InputType = "reg";
+let isCompare = 1;
+}
+def C2_cmpgti : HInst<
+(outs PredRegs:$Pd4),
+(ins IntRegs:$Rs32, s32_0Imm:$Ii),
+"$Pd4 = cmp.gt($Rs32,#$Ii)",
+ALU32_2op_tc_2early_SLOT0123, TypeALU32_2op>, Enc_16014536, ImmRegRel {
+let Inst{4-2} = 0b000;
+let Inst{31-22} = 0b0111010101;
+let CextOpcode = "C2_cmpgt";
+let InputType = "imm";
+let isCompare = 1;
+let isExtendable = 1;
+let opExtendable = 2;
+let isExtentSigned = 1;
+let opExtentBits = 10;
+let opExtentAlign = 0;
+}
+def C2_cmpgtp : HInst<
+(outs PredRegs:$Pd4),
+(ins DoubleRegs:$Rss32, DoubleRegs:$Rtt32),
+"$Pd4 = cmp.gt($Rss32,$Rtt32)",
+ALU64_tc_2early_SLOT23, TypeALU64>, Enc_3831744 {
+let Inst{7-2} = 0b010000;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11010010100;
+let isCompare = 1;
+}
+def C2_cmpgtu : HInst<
+(outs PredRegs:$Pd4),
+(ins IntRegs:$Rs32, IntRegs:$Rt32),
+"$Pd4 = cmp.gtu($Rs32,$Rt32)",
+ALU32_3op_tc_2early_SLOT0123, TypeALU32_3op>, Enc_10157519, ImmRegRel {
+let Inst{7-2} = 0b000000;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11110010011;
+let CextOpcode = "C2_cmpgtu";
+let InputType = "reg";
+let isCompare = 1;
+}
+def C2_cmpgtui : HInst<
+(outs PredRegs:$Pd4),
+(ins IntRegs:$Rs32, u32_0Imm:$Ii),
+"$Pd4 = cmp.gtu($Rs32,#$Ii)",
+ALU32_2op_tc_2early_SLOT0123, TypeALU32_2op>, Enc_13249928, ImmRegRel {
+let Inst{4-2} = 0b000;
+let Inst{31-21} = 0b01110101100;
+let CextOpcode = "C2_cmpgtu";
+let InputType = "imm";
+let isCompare = 1;
+let isExtendable = 1;
+let opExtendable = 2;
+let isExtentSigned = 0;
+let opExtentBits = 9;
+let opExtentAlign = 0;
+}
+def C2_cmpgtup : HInst<
+(outs PredRegs:$Pd4),
+(ins DoubleRegs:$Rss32, DoubleRegs:$Rtt32),
+"$Pd4 = cmp.gtu($Rss32,$Rtt32)",
+ALU64_tc_2early_SLOT23, TypeALU64>, Enc_3831744 {
+let Inst{7-2} = 0b100000;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11010010100;
+let isCompare = 1;
+}
+def C2_cmplt : HInst<
+(outs PredRegs:$Pd4),
+(ins IntRegs:$Rs32, IntRegs:$Rt32),
+"$Pd4 = cmp.lt($Rs32,$Rt32)",
+PSEUDO, TypeALU32_3op> {
+let isCompare = 1;
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+}
+def C2_cmpltu : HInst<
+(outs PredRegs:$Pd4),
+(ins IntRegs:$Rs32, IntRegs:$Rt32),
+"$Pd4 = cmp.ltu($Rs32,$Rt32)",
+PSEUDO, TypeALU32_3op> {
+let isCompare = 1;
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+}
+def C2_mask : HInst<
+(outs DoubleRegs:$Rdd32),
+(ins PredRegs:$Pt4),
+"$Rdd32 = mask($Pt4)",
+S_2op_tc_1_SLOT23, TypeS_2op>, Enc_10328975 {
+let Inst{7-5} = 0b000;
+let Inst{13-10} = 0b0000;
+let Inst{31-16} = 0b1000011000000000;
+}
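+// C2_mux*: predicate-controlled selects over register/immediate operands;
+// C2_vmux selects byte-wise under the eight predicate bits.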
+def C2_mux : HInst<
+(outs IntRegs:$Rd32),
+(ins PredRegs:$Pu4, IntRegs:$Rs32, IntRegs:$Rt32),
+"$Rd32 = mux($Pu4,$Rs32,$Rt32)",
+ALU32_3op_tc_1_SLOT0123, TypeALU32_3op>, Enc_9626139 {
+let Inst{7-7} = 0b0;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11110100000;
+let hasNewValue = 1;
+let opNewValue = 0;
+let InputType = "reg";
+}
+def C2_muxii : HInst<
+(outs IntRegs:$Rd32),
+(ins PredRegs:$Pu4, s32_0Imm:$Ii, s8_0Imm:$II),
+"$Rd32 = mux($Pu4,#$Ii,#$II)",
+ALU32_2op_tc_1_SLOT0123, TypeALU32_2op>, Enc_9093094 {
+let Inst{31-25} = 0b0111101;
+let hasNewValue = 1;
+let opNewValue = 0;
+let isExtendable = 1;
+let opExtendable = 2;
+let isExtentSigned = 1;
+let opExtentBits = 8;
+let opExtentAlign = 0;
+}
+def C2_muxir : HInst<
+(outs IntRegs:$Rd32),
+(ins PredRegs:$Pu4, IntRegs:$Rs32, s32_0Imm:$Ii),
+"$Rd32 = mux($Pu4,$Rs32,#$Ii)",
+ALU32_2op_tc_1_SLOT0123, TypeALU32_2op>, Enc_10568534 {
+let Inst{13-13} = 0b0;
+let Inst{31-23} = 0b011100110;
+let hasNewValue = 1;
+let opNewValue = 0;
+let InputType = "imm";
+let isExtendable = 1;
+let opExtendable = 3;
+let isExtentSigned = 1;
+let opExtentBits = 8;
+let opExtentAlign = 0;
+}
+def C2_muxri : HInst<
+(outs IntRegs:$Rd32),
+(ins PredRegs:$Pu4, s32_0Imm:$Ii, IntRegs:$Rs32),
+"$Rd32 = mux($Pu4,#$Ii,$Rs32)",
+ALU32_2op_tc_1_SLOT0123, TypeALU32_2op>, Enc_10568534 {
+let Inst{13-13} = 0b0;
+let Inst{31-23} = 0b011100111;
+let hasNewValue = 1;
+let opNewValue = 0;
+let InputType = "imm";
+let isExtendable = 1;
+let opExtendable = 2;
+let isExtentSigned = 1;
+let opExtentBits = 8;
+let opExtentAlign = 0;
+}
+def C2_not : HInst<
+(outs PredRegs:$Pd4),
+(ins PredRegs:$Ps4),
+"$Pd4 = not($Ps4)",
+CR_tc_2early_SLOT23, TypeCR>, Enc_6975103 {
+let Inst{13-2} = 0b000000000000;
+let Inst{31-18} = 0b01101011110000;
+}
+def C2_or : HInst<
+(outs PredRegs:$Pd4),
+(ins PredRegs:$Pt4, PredRegs:$Ps4),
+"$Pd4 = or($Pt4,$Ps4)",
+CR_tc_2early_SLOT23, TypeCR>, Enc_8891794 {
+let Inst{7-2} = 0b000000;
+let Inst{13-10} = 0b0000;
+let Inst{31-18} = 0b01101011001000;
+}
+def C2_orn : HInst<
+(outs PredRegs:$Pd4),
+(ins PredRegs:$Pt4, PredRegs:$Ps4),
+"$Pd4 = or($Pt4,!$Ps4)",
+CR_tc_2early_SLOT23, TypeCR>, Enc_8891794 {
+let Inst{7-2} = 0b000000;
+let Inst{13-10} = 0b0000;
+let Inst{31-18} = 0b01101011111000;
+}
+def C2_pxfer_map : HInst<
+(outs PredRegs:$Pd4),
+(ins PredRegs:$Ps4),
+"$Pd4 = $Ps4",
+S_2op_tc_1_SLOT23, TypeMAPPING> {
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+}
+def C2_tfrpr : HInst<
+(outs IntRegs:$Rd32),
+(ins PredRegs:$Ps4),
+"$Rd32 = $Ps4",
+S_2op_tc_1_SLOT23, TypeS_2op>, Enc_11139981 {
+let Inst{13-5} = 0b000000000;
+let Inst{31-18} = 0b10001001010000;
+let hasNewValue = 1;
+let opNewValue = 0;
+}
+def C2_tfrrp : HInst<
+(outs PredRegs:$Pd4),
+(ins IntRegs:$Rs32),
+"$Pd4 = $Rs32",
+S_2op_tc_2early_SLOT23, TypeS_2op>, Enc_4527648 {
+let Inst{13-2} = 0b000000000000;
+let Inst{31-21} = 0b10000101010;
+}
+def C2_vitpack : HInst<
+(outs IntRegs:$Rd32),
+(ins PredRegs:$Ps4, PredRegs:$Pt4),
+"$Rd32 = vitpack($Ps4,$Pt4)",
+S_2op_tc_1_SLOT23, TypeS_2op>, Enc_6735062 {
+let Inst{7-5} = 0b000;
+let Inst{13-10} = 0b0000;
+let Inst{31-18} = 0b10001001000000;
+let hasNewValue = 1;
+let opNewValue = 0;
+}
+def C2_vmux : HInst<
+(outs DoubleRegs:$Rdd32),
+(ins PredRegs:$Pu4, DoubleRegs:$Rss32, DoubleRegs:$Rtt32),
+"$Rdd32 = vmux($Pu4,$Rss32,$Rtt32)",
+ALU64_tc_1_SLOT23, TypeALU64>, Enc_7606379 {
+let Inst{7-7} = 0b0;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11010001000;
+}
+def C2_xor : HInst<
+(outs PredRegs:$Pd4),
+(ins PredRegs:$Ps4, PredRegs:$Pt4),
+"$Pd4 = xor($Ps4,$Pt4)",
+CR_tc_2early_SLOT23, TypeCR>, Enc_8324216 {
+let Inst{7-2} = 0b000000;
+let Inst{13-10} = 0b0000;
+let Inst{31-18} = 0b01101011010000;
+}
+def C4_addipc : HInst<
+(outs IntRegs:$Rd32),
+(ins u32_0Imm:$Ii),
+"$Rd32 = add(pc,#$Ii)",
+CR_tc_2_SLOT3, TypeCR>, Enc_9554661 {
+let Inst{6-5} = 0b00;
+let Inst{13-13} = 0b0;
+let Inst{31-16} = 0b0110101001001001;
+let hasNewValue = 1;
+let opNewValue = 0;
+let isExtendable = 1;
+let opExtendable = 1;
+let isExtentSigned = 0;
+let opExtentBits = 6;
+let opExtentAlign = 0;
+}
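+// C4_{and,or}_{and,or,andn,orn}: fused two-level boolean functions of
+// three predicate registers in a single CR-class instruction.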
+def C4_and_and : HInst<
+(outs PredRegs:$Pd4),
+(ins PredRegs:$Ps4, PredRegs:$Pt4, PredRegs:$Pu4),
+"$Pd4 = and($Ps4,and($Pt4,$Pu4))",
+CR_tc_2early_SLOT23, TypeCR>, Enc_4631106 {
+let Inst{5-2} = 0b0000;
+let Inst{13-10} = 0b0000;
+let Inst{31-18} = 0b01101011000100;
+}
+def C4_and_andn : HInst<
+(outs PredRegs:$Pd4),
+(ins PredRegs:$Ps4, PredRegs:$Pt4, PredRegs:$Pu4),
+"$Pd4 = and($Ps4,and($Pt4,!$Pu4))",
+CR_tc_2early_SLOT23, TypeCR>, Enc_4631106 {
+let Inst{5-2} = 0b0000;
+let Inst{13-10} = 0b0000;
+let Inst{31-18} = 0b01101011100100;
+}
+def C4_and_or : HInst<
+(outs PredRegs:$Pd4),
+(ins PredRegs:$Ps4, PredRegs:$Pt4, PredRegs:$Pu4),
+"$Pd4 = and($Ps4,or($Pt4,$Pu4))",
+CR_tc_2early_SLOT23, TypeCR>, Enc_4631106 {
+let Inst{5-2} = 0b0000;
+let Inst{13-10} = 0b0000;
+let Inst{31-18} = 0b01101011001100;
+}
+def C4_and_orn : HInst<
+(outs PredRegs:$Pd4),
+(ins PredRegs:$Ps4, PredRegs:$Pt4, PredRegs:$Pu4),
+"$Pd4 = and($Ps4,or($Pt4,!$Pu4))",
+CR_tc_2early_SLOT23, TypeCR>, Enc_4631106 {
+let Inst{5-2} = 0b0000;
+let Inst{13-10} = 0b0000;
+let Inst{31-18} = 0b01101011101100;
+}
+def C4_cmplte : HInst<
+(outs PredRegs:$Pd4),
+(ins IntRegs:$Rs32, IntRegs:$Rt32),
+"$Pd4 = !cmp.gt($Rs32,$Rt32)",
+ALU32_3op_tc_1_SLOT0123, TypeALU32_3op>, Enc_10157519, ImmRegRel {
+let Inst{7-2} = 0b000100;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11110010010;
+let CextOpcode = "C4_cmplte";
+let InputType = "reg";
+let isCompare = 1;
+}
+def C4_cmpltei : HInst<
+(outs PredRegs:$Pd4),
+(ins IntRegs:$Rs32, s32_0Imm:$Ii),
+"$Pd4 = !cmp.gt($Rs32,#$Ii)",
+ALU32_2op_tc_2early_SLOT0123, TypeALU32_2op>, Enc_16014536, ImmRegRel {
+let Inst{4-2} = 0b100;
+let Inst{31-22} = 0b0111010101;
+let CextOpcode = "C4_cmplte";
+let InputType = "imm";
+let isCompare = 1;
+let isExtendable = 1;
+let opExtendable = 2;
+let isExtentSigned = 1;
+let opExtentBits = 10;
+let opExtentAlign = 0;
+}
+def C4_cmplteu : HInst<
+(outs PredRegs:$Pd4),
+(ins IntRegs:$Rs32, IntRegs:$Rt32),
+"$Pd4 = !cmp.gtu($Rs32,$Rt32)",
+ALU32_3op_tc_1_SLOT0123, TypeALU32_3op>, Enc_10157519, ImmRegRel {
+let Inst{7-2} = 0b000100;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11110010011;
+let CextOpcode = "C4_cmplteu";
+let InputType = "reg";
+let isCompare = 1;
+}
+def C4_cmplteui : HInst<
+(outs PredRegs:$Pd4),
+(ins IntRegs:$Rs32, u32_0Imm:$Ii),
+"$Pd4 = !cmp.gtu($Rs32,#$Ii)",
+ALU32_2op_tc_2early_SLOT0123, TypeALU32_2op>, Enc_13249928, ImmRegRel {
+let Inst{4-2} = 0b100;
+let Inst{31-21} = 0b01110101100;
+let CextOpcode = "C4_cmplteu";
+let InputType = "imm";
+let isCompare = 1;
+let isExtendable = 1;
+let opExtendable = 2;
+let isExtentSigned = 0;
+let opExtentBits = 9;
+let opExtentAlign = 0;
+}
+def C4_cmpneq : HInst<
+(outs PredRegs:$Pd4),
+(ins IntRegs:$Rs32, IntRegs:$Rt32),
+"$Pd4 = !cmp.eq($Rs32,$Rt32)",
+ALU32_3op_tc_1_SLOT0123, TypeALU32_3op>, Enc_10157519, ImmRegRel {
+let Inst{7-2} = 0b000100;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11110010000;
+let CextOpcode = "C4_cmpneq";
+let InputType = "reg";
+let isCommutable = 1;
+let isCompare = 1;
+}
+def C4_cmpneqi : HInst<
+(outs PredRegs:$Pd4),
+(ins IntRegs:$Rs32, s32_0Imm:$Ii),
+"$Pd4 = !cmp.eq($Rs32,#$Ii)",
+ALU32_2op_tc_2early_SLOT0123, TypeALU32_2op>, Enc_16014536, ImmRegRel {
+let Inst{4-2} = 0b100;
+let Inst{31-22} = 0b0111010100;
+let CextOpcode = "C4_cmpneq";
+let InputType = "imm";
+let isCompare = 1;
+let isExtendable = 1;
+let opExtendable = 2;
+let isExtentSigned = 1;
+let opExtentBits = 10;
+let opExtentAlign = 0;
+}
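+// C4_fastcorner9[_not]: predicate helper for the FAST corner-detection
+// kernel, combining the two predicate registers circularly.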
+def C4_fastcorner9 : HInst<
+(outs PredRegs:$Pd4),
+(ins PredRegs:$Ps4, PredRegs:$Pt4),
+"$Pd4 = fastcorner9($Ps4,$Pt4)",
+CR_tc_2early_SLOT23, TypeCR>, Enc_8324216 {
+let Inst{7-2} = 0b100100;
+let Inst{13-10} = 0b1000;
+let Inst{31-18} = 0b01101011000000;
+}
+def C4_fastcorner9_not : HInst<
+(outs PredRegs:$Pd4),
+(ins PredRegs:$Ps4, PredRegs:$Pt4),
+"$Pd4 = !fastcorner9($Ps4,$Pt4)",
+CR_tc_2early_SLOT23, TypeCR>, Enc_8324216 {
+let Inst{7-2} = 0b100100;
+let Inst{13-10} = 0b1000;
+let Inst{31-18} = 0b01101011000100;
+}
+def C4_nbitsclr : HInst<
+(outs PredRegs:$Pd4),
+(ins IntRegs:$Rs32, IntRegs:$Rt32),
+"$Pd4 = !bitsclr($Rs32,$Rt32)",
+S_3op_tc_2early_SLOT23, TypeS_3op>, Enc_10157519 {
+let Inst{7-2} = 0b000000;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11000111101;
+}
+def C4_nbitsclri : HInst<
+(outs PredRegs:$Pd4),
+(ins IntRegs:$Rs32, u6_0Imm:$Ii),
+"$Pd4 = !bitsclr($Rs32,#$Ii)",
+S_2op_tc_2early_SLOT23, TypeS_2op>, Enc_14574598 {
+let Inst{7-2} = 0b000000;
+let Inst{31-21} = 0b10000101101;
+}
+def C4_nbitsset : HInst<
+(outs PredRegs:$Pd4),
+(ins IntRegs:$Rs32, IntRegs:$Rt32),
+"$Pd4 = !bitsset($Rs32,$Rt32)",
+S_3op_tc_2early_SLOT23, TypeS_3op>, Enc_10157519 {
+let Inst{7-2} = 0b000000;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11000111011;
+}
+def C4_or_and : HInst<
+(outs PredRegs:$Pd4),
+(ins PredRegs:$Ps4, PredRegs:$Pt4, PredRegs:$Pu4),
+"$Pd4 = or($Ps4,and($Pt4,$Pu4))",
+CR_tc_2early_SLOT23, TypeCR>, Enc_4631106 {
+let Inst{5-2} = 0b0000;
+let Inst{13-10} = 0b0000;
+let Inst{31-18} = 0b01101011010100;
+}
+def C4_or_andn : HInst<
+(outs PredRegs:$Pd4),
+(ins PredRegs:$Ps4, PredRegs:$Pt4, PredRegs:$Pu4),
+"$Pd4 = or($Ps4,and($Pt4,!$Pu4))",
+CR_tc_2early_SLOT23, TypeCR>, Enc_4631106 {
+let Inst{5-2} = 0b0000;
+let Inst{13-10} = 0b0000;
+let Inst{31-18} = 0b01101011110100;
+}
+def C4_or_or : HInst<
+(outs PredRegs:$Pd4),
+(ins PredRegs:$Ps4, PredRegs:$Pt4, PredRegs:$Pu4),
+"$Pd4 = or($Ps4,or($Pt4,$Pu4))",
+CR_tc_2early_SLOT23, TypeCR>, Enc_4631106 {
+let Inst{5-2} = 0b0000;
+let Inst{13-10} = 0b0000;
+let Inst{31-18} = 0b01101011011100;
+}
+def C4_or_orn : HInst<
+(outs PredRegs:$Pd4),
+(ins PredRegs:$Ps4, PredRegs:$Pt4, PredRegs:$Pu4),
+"$Pd4 = or($Ps4,or($Pt4,!$Pu4))",
+CR_tc_2early_SLOT23, TypeCR>, Enc_4631106 {
+let Inst{5-2} = 0b0000;
+let Inst{13-10} = 0b0000;
+let Inst{31-18} = 0b01101011111100;
+}
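+// F2_conv_*: scalar FP conversions (Requires<[HasV5T]>). All read USR for
+// the IEEE rounding mode; the :chop forms truncate toward zero instead.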
+def F2_conv_d2df : HInst<
+(outs DoubleRegs:$Rdd32),
+(ins DoubleRegs:$Rss32),
+"$Rdd32 = convert_d2df($Rss32)",
+S_2op_tc_3or4x_SLOT23, TypeS_2op>, Enc_13133231, Requires<[HasV5T]> {
+let Inst{13-5} = 0b000000011;
+let Inst{31-21} = 0b10000000111;
+let isFP = 1;
+let prefersSlot3 = 1;
+let Uses = [USR];
+}
+def F2_conv_d2sf : HInst<
+(outs IntRegs:$Rd32),
+(ins DoubleRegs:$Rss32),
+"$Rd32 = convert_d2sf($Rss32)",
+S_2op_tc_3or4x_SLOT23, TypeS_2op>, Enc_3742184, Requires<[HasV5T]> {
+let Inst{13-5} = 0b000000001;
+let Inst{31-21} = 0b10001000010;
+let hasNewValue = 1;
+let opNewValue = 0;
+let isFP = 1;
+let prefersSlot3 = 1;
+let Uses = [USR];
+}
+def F2_conv_df2d : HInst<
+(outs DoubleRegs:$Rdd32),
+(ins DoubleRegs:$Rss32),
+"$Rdd32 = convert_df2d($Rss32)",
+S_2op_tc_3or4x_SLOT23, TypeS_2op>, Enc_13133231, Requires<[HasV5T]> {
+let Inst{13-5} = 0b000000000;
+let Inst{31-21} = 0b10000000111;
+let isFP = 1;
+let prefersSlot3 = 1;
+let Uses = [USR];
+}
+def F2_conv_df2d_chop : HInst<
+(outs DoubleRegs:$Rdd32),
+(ins DoubleRegs:$Rss32),
+"$Rdd32 = convert_df2d($Rss32):chop",
+S_2op_tc_3or4x_SLOT23, TypeS_2op>, Enc_13133231, Requires<[HasV5T]> {
+let Inst{13-5} = 0b000000110;
+let Inst{31-21} = 0b10000000111;
+let isFP = 1;
+let prefersSlot3 = 1;
+let Uses = [USR];
+}
+def F2_conv_df2sf : HInst<
+(outs IntRegs:$Rd32),
+(ins DoubleRegs:$Rss32),
+"$Rd32 = convert_df2sf($Rss32)",
+S_2op_tc_3or4x_SLOT23, TypeS_2op>, Enc_3742184, Requires<[HasV5T]> {
+let Inst{13-5} = 0b000000001;
+let Inst{31-21} = 0b10001000000;
+let hasNewValue = 1;
+let opNewValue = 0;
+let isFP = 1;
+let prefersSlot3 = 1;
+let Uses = [USR];
+}
+def F2_conv_df2ud : HInst<
+(outs DoubleRegs:$Rdd32),
+(ins DoubleRegs:$Rss32),
+"$Rdd32 = convert_df2ud($Rss32)",
+S_2op_tc_3or4x_SLOT23, TypeS_2op>, Enc_13133231, Requires<[HasV5T]> {
+let Inst{13-5} = 0b000000001;
+let Inst{31-21} = 0b10000000111;
+let isFP = 1;
+let prefersSlot3 = 1;
+let Uses = [USR];
+}
+def F2_conv_df2ud_chop : HInst<
+(outs DoubleRegs:$Rdd32),
+(ins DoubleRegs:$Rss32),
+"$Rdd32 = convert_df2ud($Rss32):chop",
+S_2op_tc_3or4x_SLOT23, TypeS_2op>, Enc_13133231, Requires<[HasV5T]> {
+let Inst{13-5} = 0b000000111;
+let Inst{31-21} = 0b10000000111;
+let isFP = 1;
+let prefersSlot3 = 1;
+let Uses = [USR];
+}
+def F2_conv_df2uw : HInst<
+(outs IntRegs:$Rd32),
+(ins DoubleRegs:$Rss32),
+"$Rd32 = convert_df2uw($Rss32)",
+S_2op_tc_3or4x_SLOT23, TypeS_2op>, Enc_3742184, Requires<[HasV5T]> {
+let Inst{13-5} = 0b000000001;
+let Inst{31-21} = 0b10001000011;
+let hasNewValue = 1;
+let opNewValue = 0;
+let isFP = 1;
+let prefersSlot3 = 1;
+let Uses = [USR];
+}
+def F2_conv_df2uw_chop : HInst<
+(outs IntRegs:$Rd32),
+(ins DoubleRegs:$Rss32),
+"$Rd32 = convert_df2uw($Rss32):chop",
+S_2op_tc_3or4x_SLOT23, TypeS_2op>, Enc_3742184, Requires<[HasV5T]> {
+let Inst{13-5} = 0b000000001;
+let Inst{31-21} = 0b10001000101;
+let hasNewValue = 1;
+let opNewValue = 0;
+let isFP = 1;
+let prefersSlot3 = 1;
+let Uses = [USR];
+}
+def F2_conv_df2w : HInst<
+(outs IntRegs:$Rd32),
+(ins DoubleRegs:$Rss32),
+"$Rd32 = convert_df2w($Rss32)",
+S_2op_tc_3or4x_SLOT23, TypeS_2op>, Enc_3742184, Requires<[HasV5T]> {
+let Inst{13-5} = 0b000000001;
+let Inst{31-21} = 0b10001000100;
+let hasNewValue = 1;
+let opNewValue = 0;
+let isFP = 1;
+let prefersSlot3 = 1;
+let Uses = [USR];
+}
+def F2_conv_df2w_chop : HInst<
+(outs IntRegs:$Rd32),
+(ins DoubleRegs:$Rss32),
+"$Rd32 = convert_df2w($Rss32):chop",
+S_2op_tc_3or4x_SLOT23, TypeS_2op>, Enc_3742184, Requires<[HasV5T]> {
+let Inst{13-5} = 0b000000001;
+let Inst{31-21} = 0b10001000111;
+let hasNewValue = 1;
+let opNewValue = 0;
+let isFP = 1;
+let prefersSlot3 = 1;
+let Uses = [USR];
+}
+def F2_conv_sf2d : HInst<
+(outs DoubleRegs:$Rdd32),
+(ins IntRegs:$Rs32),
+"$Rdd32 = convert_sf2d($Rs32)",
+S_2op_tc_3or4x_SLOT23, TypeS_2op>, Enc_4030179, Requires<[HasV5T]> {
+let Inst{13-5} = 0b000000100;
+let Inst{31-21} = 0b10000100100;
+let isFP = 1;
+let prefersSlot3 = 1;
+let Uses = [USR];
+}
+def F2_conv_sf2d_chop : HInst<
+(outs DoubleRegs:$Rdd32),
+(ins IntRegs:$Rs32),
+"$Rdd32 = convert_sf2d($Rs32):chop",
+S_2op_tc_3or4x_SLOT23, TypeS_2op>, Enc_4030179, Requires<[HasV5T]> {
+let Inst{13-5} = 0b000000110;
+let Inst{31-21} = 0b10000100100;
+let isFP = 1;
+let prefersSlot3 = 1;
+let Uses = [USR];
+}
+def F2_conv_sf2df : HInst<
+(outs DoubleRegs:$Rdd32),
+(ins IntRegs:$Rs32),
+"$Rdd32 = convert_sf2df($Rs32)",
+S_2op_tc_3or4x_SLOT23, TypeS_2op>, Enc_4030179, Requires<[HasV5T]> {
+let Inst{13-5} = 0b000000000;
+let Inst{31-21} = 0b10000100100;
+let isFP = 1;
+let prefersSlot3 = 1;
+let Uses = [USR];
+}
+def F2_conv_sf2ud : HInst<
+(outs DoubleRegs:$Rdd32),
+(ins IntRegs:$Rs32),
+"$Rdd32 = convert_sf2ud($Rs32)",
+S_2op_tc_3or4x_SLOT23, TypeS_2op>, Enc_4030179, Requires<[HasV5T]> {
+let Inst{13-5} = 0b000000011;
+let Inst{31-21} = 0b10000100100;
+let isFP = 1;
+let prefersSlot3 = 1;
+let Uses = [USR];
+}
+def F2_conv_sf2ud_chop : HInst<
+(outs DoubleRegs:$Rdd32),
+(ins IntRegs:$Rs32),
+"$Rdd32 = convert_sf2ud($Rs32):chop",
+S_2op_tc_3or4x_SLOT23, TypeS_2op>, Enc_4030179, Requires<[HasV5T]> {
+let Inst{13-5} = 0b000000101;
+let Inst{31-21} = 0b10000100100;
+let isFP = 1;
+let prefersSlot3 = 1;
+let Uses = [USR];
+}
+def F2_conv_sf2uw : HInst<
+(outs IntRegs:$Rd32),
+(ins IntRegs:$Rs32),
+"$Rd32 = convert_sf2uw($Rs32)",
+S_2op_tc_3or4x_SLOT23, TypeS_2op>, Enc_4075554, Requires<[HasV5T]> {
+let Inst{13-5} = 0b000000000;
+let Inst{31-21} = 0b10001011011;
+let hasNewValue = 1;
+let opNewValue = 0;
+let isFP = 1;
+let prefersSlot3 = 1;
+let Uses = [USR];
+}
+def F2_conv_sf2uw_chop : HInst<
+(outs IntRegs:$Rd32),
+(ins IntRegs:$Rs32),
+"$Rd32 = convert_sf2uw($Rs32):chop",
+S_2op_tc_3or4x_SLOT23, TypeS_2op>, Enc_4075554, Requires<[HasV5T]> {
+let Inst{13-5} = 0b000000001;
+let Inst{31-21} = 0b10001011011;
+let hasNewValue = 1;
+let opNewValue = 0;
+let isFP = 1;
+let prefersSlot3 = 1;
+let Uses = [USR];
+}
+def F2_conv_sf2w : HInst<
+(outs IntRegs:$Rd32),
+(ins IntRegs:$Rs32),
+"$Rd32 = convert_sf2w($Rs32)",
+S_2op_tc_3or4x_SLOT23, TypeS_2op>, Enc_4075554, Requires<[HasV5T]> {
+let Inst{13-5} = 0b000000000;
+let Inst{31-21} = 0b10001011100;
+let hasNewValue = 1;
+let opNewValue = 0;
+let isFP = 1;
+let prefersSlot3 = 1;
+let Uses = [USR];
+}
+def F2_conv_sf2w_chop : HInst<
+(outs IntRegs:$Rd32),
+(ins IntRegs:$Rs32),
+"$Rd32 = convert_sf2w($Rs32):chop",
+S_2op_tc_3or4x_SLOT23, TypeS_2op>, Enc_4075554, Requires<[HasV5T]> {
+let Inst{13-5} = 0b000000001;
+let Inst{31-21} = 0b10001011100;
+let hasNewValue = 1;
+let opNewValue = 0;
+let isFP = 1;
+let prefersSlot3 = 1;
+let Uses = [USR];
+}
+def F2_conv_ud2df : HInst<
+(outs DoubleRegs:$Rdd32),
+(ins DoubleRegs:$Rss32),
+"$Rdd32 = convert_ud2df($Rss32)",
+S_2op_tc_3or4x_SLOT23, TypeS_2op>, Enc_13133231, Requires<[HasV5T]> {
+let Inst{13-5} = 0b000000010;
+let Inst{31-21} = 0b10000000111;
+let isFP = 1;
+let prefersSlot3 = 1;
+let Uses = [USR];
+}
+def F2_conv_ud2sf : HInst<
+(outs IntRegs:$Rd32),
+(ins DoubleRegs:$Rss32),
+"$Rd32 = convert_ud2sf($Rss32)",
+S_2op_tc_3or4x_SLOT23, TypeS_2op>, Enc_3742184, Requires<[HasV5T]> {
+let Inst{13-5} = 0b000000001;
+let Inst{31-21} = 0b10001000001;
+let hasNewValue = 1;
+let opNewValue = 0;
+let isFP = 1;
+let prefersSlot3 = 1;
+let Uses = [USR];
+}
+def F2_conv_uw2df : HInst<
+(outs DoubleRegs:$Rdd32),
+(ins IntRegs:$Rs32),
+"$Rdd32 = convert_uw2df($Rs32)",
+S_2op_tc_3or4x_SLOT23, TypeS_2op>, Enc_4030179, Requires<[HasV5T]> {
+let Inst{13-5} = 0b000000001;
+let Inst{31-21} = 0b10000100100;
+let isFP = 1;
+let prefersSlot3 = 1;
+let Uses = [USR];
+}
+def F2_conv_uw2sf : HInst<
+(outs IntRegs:$Rd32),
+(ins IntRegs:$Rs32),
+"$Rd32 = convert_uw2sf($Rs32)",
+S_2op_tc_3or4x_SLOT23, TypeS_2op>, Enc_4075554, Requires<[HasV5T]> {
+let Inst{13-5} = 0b000000000;
+let Inst{31-21} = 0b10001011001;
+let hasNewValue = 1;
+let opNewValue = 0;
+let isFP = 1;
+let prefersSlot3 = 1;
+let Uses = [USR];
+}
+def F2_conv_w2df : HInst<
+(outs DoubleRegs:$Rdd32),
+(ins IntRegs:$Rs32),
+"$Rdd32 = convert_w2df($Rs32)",
+S_2op_tc_3or4x_SLOT23, TypeS_2op>, Enc_4030179, Requires<[HasV5T]> {
+let Inst{13-5} = 0b000000010;
+let Inst{31-21} = 0b10000100100;
+let isFP = 1;
+let prefersSlot3 = 1;
+let Uses = [USR];
+}
+def F2_conv_w2sf : HInst<
+(outs IntRegs:$Rd32),
+(ins IntRegs:$Rs32),
+"$Rd32 = convert_w2sf($Rs32)",
+S_2op_tc_3or4x_SLOT23, TypeS_2op>, Enc_4075554, Requires<[HasV5T]> {
+let Inst{13-5} = 0b000000000;
+let Inst{31-21} = 0b10001011010;
+let hasNewValue = 1;
+let opNewValue = 0;
+let isFP = 1;
+let prefersSlot3 = 1;
+let Uses = [USR];
+}
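+// F2 floating-point classify (dfclass/sfclass), compares, dfmake/sfmake immediates,
+// and scalar arithmetic: add/sub, min/max, mpy, and fused multiply-accumulate (sffma/sffms).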
+def F2_dfclass : HInst<
+(outs PredRegs:$Pd4),
+(ins DoubleRegs:$Rss32, u5_0Imm:$Ii),
+"$Pd4 = dfclass($Rss32,#$Ii)",
+ALU64_tc_2early_SLOT23, TypeALU64>, Enc_14400220, Requires<[HasV5T]> {
+let Inst{4-2} = 0b100;
+let Inst{13-10} = 0b0000;
+let Inst{31-21} = 0b11011100100;
+let isFP = 1;
+let Uses = [USR];
+}
+def F2_dfcmpeq : HInst<
+(outs PredRegs:$Pd4),
+(ins DoubleRegs:$Rss32, DoubleRegs:$Rtt32),
+"$Pd4 = dfcmp.eq($Rss32,$Rtt32)",
+ALU64_tc_2early_SLOT23, TypeALU64>, Enc_3831744, Requires<[HasV5T]> {
+let Inst{7-2} = 0b000000;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11010010111;
+let isFP = 1;
+let Uses = [USR];
+let isCompare = 1;
+}
+def F2_dfcmpge : HInst<
+(outs PredRegs:$Pd4),
+(ins DoubleRegs:$Rss32, DoubleRegs:$Rtt32),
+"$Pd4 = dfcmp.ge($Rss32,$Rtt32)",
+ALU64_tc_2early_SLOT23, TypeALU64>, Enc_3831744, Requires<[HasV5T]> {
+let Inst{7-2} = 0b010000;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11010010111;
+let isFP = 1;
+let Uses = [USR];
+let isCompare = 1;
+}
+def F2_dfcmpgt : HInst<
+(outs PredRegs:$Pd4),
+(ins DoubleRegs:$Rss32, DoubleRegs:$Rtt32),
+"$Pd4 = dfcmp.gt($Rss32,$Rtt32)",
+ALU64_tc_2early_SLOT23, TypeALU64>, Enc_3831744, Requires<[HasV5T]> {
+let Inst{7-2} = 0b001000;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11010010111;
+let isFP = 1;
+let Uses = [USR];
+let isCompare = 1;
+}
+def F2_dfcmpuo : HInst<
+(outs PredRegs:$Pd4),
+(ins DoubleRegs:$Rss32, DoubleRegs:$Rtt32),
+"$Pd4 = dfcmp.uo($Rss32,$Rtt32)",
+ALU64_tc_2early_SLOT23, TypeALU64>, Enc_3831744, Requires<[HasV5T]> {
+let Inst{7-2} = 0b011000;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11010010111;
+let isFP = 1;
+let Uses = [USR];
+let isCompare = 1;
+}
+def F2_dfimm_n : HInst<
+(outs DoubleRegs:$Rdd32),
+(ins u10_0Imm:$Ii),
+"$Rdd32 = dfmake(#$Ii):neg",
+ALU64_tc_2_SLOT23, TypeALU64>, Enc_2702036, Requires<[HasV5T]> {
+let Inst{20-16} = 0b00000;
+let Inst{31-22} = 0b1101100101;
+let prefersSlot3 = 1;
+}
+def F2_dfimm_p : HInst<
+(outs DoubleRegs:$Rdd32),
+(ins u10_0Imm:$Ii),
+"$Rdd32 = dfmake(#$Ii):pos",
+ALU64_tc_2_SLOT23, TypeALU64>, Enc_2702036, Requires<[HasV5T]> {
+let Inst{20-16} = 0b00000;
+let Inst{31-22} = 0b1101100100;
+let prefersSlot3 = 1;
+}
+def F2_sfadd : HInst<
+(outs IntRegs:$Rd32),
+(ins IntRegs:$Rs32, IntRegs:$Rt32),
+"$Rd32 = sfadd($Rs32,$Rt32)",
+M_tc_3or4x_SLOT23, TypeM>, Enc_14071773, Requires<[HasV5T]> {
+let Inst{7-5} = 0b000;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11101011000;
+let hasNewValue = 1;
+let opNewValue = 0;
+let isFP = 1;
+let prefersSlot3 = 1;
+let Uses = [USR];
+let isCommutable = 1;
+}
+def F2_sfclass : HInst<
+(outs PredRegs:$Pd4),
+(ins IntRegs:$Rs32, u5_0Imm:$Ii),
+"$Pd4 = sfclass($Rs32,#$Ii)",
+S_2op_tc_2early_SLOT23, TypeS_2op>, Enc_2103742, Requires<[HasV5T]> {
+let Inst{7-2} = 0b000000;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b10000101111;
+let isFP = 1;
+let Uses = [USR];
+}
+def F2_sfcmpeq : HInst<
+(outs PredRegs:$Pd4),
+(ins IntRegs:$Rs32, IntRegs:$Rt32),
+"$Pd4 = sfcmp.eq($Rs32,$Rt32)",
+ALU64_tc_2early_SLOT23, TypeS_3op>, Enc_10157519, Requires<[HasV5T]> {
+let Inst{7-2} = 0b011000;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11000111111;
+let isFP = 1;
+let Uses = [USR];
+let isCompare = 1;
+}
+def F2_sfcmpge : HInst<
+(outs PredRegs:$Pd4),
+(ins IntRegs:$Rs32, IntRegs:$Rt32),
+"$Pd4 = sfcmp.ge($Rs32,$Rt32)",
+ALU64_tc_2early_SLOT23, TypeS_3op>, Enc_10157519, Requires<[HasV5T]> {
+let Inst{7-2} = 0b000000;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11000111111;
+let isFP = 1;
+let Uses = [USR];
+let isCompare = 1;
+}
+def F2_sfcmpgt : HInst<
+(outs PredRegs:$Pd4),
+(ins IntRegs:$Rs32, IntRegs:$Rt32),
+"$Pd4 = sfcmp.gt($Rs32,$Rt32)",
+ALU64_tc_2early_SLOT23, TypeS_3op>, Enc_10157519, Requires<[HasV5T]> {
+let Inst{7-2} = 0b100000;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11000111111;
+let isFP = 1;
+let Uses = [USR];
+let isCompare = 1;
+}
+def F2_sfcmpuo : HInst<
+(outs PredRegs:$Pd4),
+(ins IntRegs:$Rs32, IntRegs:$Rt32),
+"$Pd4 = sfcmp.uo($Rs32,$Rt32)",
+ALU64_tc_2early_SLOT23, TypeS_3op>, Enc_10157519, Requires<[HasV5T]> {
+let Inst{7-2} = 0b001000;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11000111111;
+let isFP = 1;
+let Uses = [USR];
+let isCompare = 1;
+}
+def F2_sffixupd : HInst<
+(outs IntRegs:$Rd32),
+(ins IntRegs:$Rs32, IntRegs:$Rt32),
+"$Rd32 = sffixupd($Rs32,$Rt32)",
+M_tc_3or4x_SLOT23, TypeM>, Enc_14071773, Requires<[HasV5T]> {
+let Inst{7-5} = 0b001;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11101011110;
+let hasNewValue = 1;
+let opNewValue = 0;
+let isFP = 1;
+let prefersSlot3 = 1;
+}
+def F2_sffixupn : HInst<
+(outs IntRegs:$Rd32),
+(ins IntRegs:$Rs32, IntRegs:$Rt32),
+"$Rd32 = sffixupn($Rs32,$Rt32)",
+M_tc_3or4x_SLOT23, TypeM>, Enc_14071773, Requires<[HasV5T]> {
+let Inst{7-5} = 0b000;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11101011110;
+let hasNewValue = 1;
+let opNewValue = 0;
+let isFP = 1;
+let prefersSlot3 = 1;
+}
+def F2_sffixupr : HInst<
+(outs IntRegs:$Rd32),
+(ins IntRegs:$Rs32),
+"$Rd32 = sffixupr($Rs32)",
+S_2op_tc_3or4x_SLOT23, TypeS_2op>, Enc_4075554, Requires<[HasV5T]> {
+let Inst{13-5} = 0b000000000;
+let Inst{31-21} = 0b10001011101;
+let hasNewValue = 1;
+let opNewValue = 0;
+let isFP = 1;
+let prefersSlot3 = 1;
+}
+def F2_sffma : HInst<
+(outs IntRegs:$Rx32),
+(ins IntRegs:$Rx32in, IntRegs:$Rs32, IntRegs:$Rt32),
+"$Rx32 += sfmpy($Rs32,$Rt32)",
+M_tc_3or4x_acc_SLOT23, TypeM>, Enc_9223889, Requires<[HasV5T]> {
+let Inst{7-5} = 0b100;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11101111000;
+let hasNewValue = 1;
+let opNewValue = 0;
+let isFP = 1;
+let prefersSlot3 = 1;
+let Uses = [USR];
+let Constraints = "$Rx32 = $Rx32in";
+}
+def F2_sffma_lib : HInst<
+(outs IntRegs:$Rx32),
+(ins IntRegs:$Rx32in, IntRegs:$Rs32, IntRegs:$Rt32),
+"$Rx32 += sfmpy($Rs32,$Rt32):lib",
+M_tc_3or4x_acc_SLOT23, TypeM>, Enc_9223889, Requires<[HasV5T]> {
+let Inst{7-5} = 0b110;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11101111000;
+let hasNewValue = 1;
+let opNewValue = 0;
+let isFP = 1;
+let prefersSlot3 = 1;
+let Uses = [USR];
+let Constraints = "$Rx32 = $Rx32in";
+}
+def F2_sffma_sc : HInst<
+(outs IntRegs:$Rx32),
+(ins IntRegs:$Rx32in, IntRegs:$Rs32, IntRegs:$Rt32, PredRegs:$Pu4),
+"$Rx32 += sfmpy($Rs32,$Rt32,$Pu4):scale",
+M_tc_3or4x_acc_SLOT23, TypeM>, Enc_15194851, Requires<[HasV5T]> {
+let Inst{7-7} = 0b1;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11101111011;
+let hasNewValue = 1;
+let opNewValue = 0;
+let isFP = 1;
+let prefersSlot3 = 1;
+let Uses = [USR];
+let Constraints = "$Rx32 = $Rx32in";
+}
+def F2_sffms : HInst<
+(outs IntRegs:$Rx32),
+(ins IntRegs:$Rx32in, IntRegs:$Rs32, IntRegs:$Rt32),
+"$Rx32 -= sfmpy($Rs32,$Rt32)",
+M_tc_3or4x_acc_SLOT23, TypeM>, Enc_9223889, Requires<[HasV5T]> {
+let Inst{7-5} = 0b101;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11101111000;
+let hasNewValue = 1;
+let opNewValue = 0;
+let isFP = 1;
+let prefersSlot3 = 1;
+let Uses = [USR];
+let Constraints = "$Rx32 = $Rx32in";
+}
+def F2_sffms_lib : HInst<
+(outs IntRegs:$Rx32),
+(ins IntRegs:$Rx32in, IntRegs:$Rs32, IntRegs:$Rt32),
+"$Rx32 -= sfmpy($Rs32,$Rt32):lib",
+M_tc_3or4x_acc_SLOT23, TypeM>, Enc_9223889, Requires<[HasV5T]> {
+let Inst{7-5} = 0b111;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11101111000;
+let hasNewValue = 1;
+let opNewValue = 0;
+let isFP = 1;
+let prefersSlot3 = 1;
+let Uses = [USR];
+let Constraints = "$Rx32 = $Rx32in";
+}
+def F2_sfimm_n : HInst<
+(outs IntRegs:$Rd32),
+(ins u10_0Imm:$Ii),
+"$Rd32 = sfmake(#$Ii):neg",
+ALU64_tc_2_SLOT23, TypeALU64>, Enc_9082775, Requires<[HasV5T]> {
+let Inst{20-16} = 0b00000;
+let Inst{31-22} = 0b1101011001;
+let hasNewValue = 1;
+let opNewValue = 0;
+let prefersSlot3 = 1;
+}
+def F2_sfimm_p : HInst<
+(outs IntRegs:$Rd32),
+(ins u10_0Imm:$Ii),
+"$Rd32 = sfmake(#$Ii):pos",
+ALU64_tc_2_SLOT23, TypeALU64>, Enc_9082775, Requires<[HasV5T]> {
+let Inst{20-16} = 0b00000;
+let Inst{31-22} = 0b1101011000;
+let hasNewValue = 1;
+let opNewValue = 0;
+let prefersSlot3 = 1;
+}
+def F2_sfinvsqrta : HInst<
+(outs IntRegs:$Rd32, PredRegs:$Pe4),
+(ins IntRegs:$Rs32),
+"$Rd32,$Pe4 = sfinvsqrta($Rs32)",
+S_2op_tc_3or4x_SLOT23, TypeS_2op>, Enc_5718302, Requires<[HasV5T]> {
+let Inst{13-7} = 0b0000000;
+let Inst{31-21} = 0b10001011111;
+let hasNewValue = 1;
+let opNewValue = 0;
+let isFP = 1;
+let isPredicateLate = 1;
+let prefersSlot3 = 1;
+}
+def F2_sfmax : HInst<
+(outs IntRegs:$Rd32),
+(ins IntRegs:$Rs32, IntRegs:$Rt32),
+"$Rd32 = sfmax($Rs32,$Rt32)",
+M_tc_2_SLOT23, TypeM>, Enc_14071773, Requires<[HasV5T]> {
+let Inst{7-5} = 0b000;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11101011100;
+let hasNewValue = 1;
+let opNewValue = 0;
+let isFP = 1;
+let prefersSlot3 = 1;
+let Uses = [USR];
+}
+def F2_sfmin : HInst<
+(outs IntRegs:$Rd32),
+(ins IntRegs:$Rs32, IntRegs:$Rt32),
+"$Rd32 = sfmin($Rs32,$Rt32)",
+M_tc_2_SLOT23, TypeM>, Enc_14071773, Requires<[HasV5T]> {
+let Inst{7-5} = 0b001;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11101011100;
+let hasNewValue = 1;
+let opNewValue = 0;
+let isFP = 1;
+let prefersSlot3 = 1;
+let Uses = [USR];
+}
+def F2_sfmpy : HInst<
+(outs IntRegs:$Rd32),
+(ins IntRegs:$Rs32, IntRegs:$Rt32),
+"$Rd32 = sfmpy($Rs32,$Rt32)",
+M_tc_3or4x_SLOT23, TypeM>, Enc_14071773, Requires<[HasV5T]> {
+let Inst{7-5} = 0b000;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11101011010;
+let hasNewValue = 1;
+let opNewValue = 0;
+let isFP = 1;
+let prefersSlot3 = 1;
+let Uses = [USR];
+let isCommutable = 1;
+}
+def F2_sfrecipa : HInst<
+(outs IntRegs:$Rd32, PredRegs:$Pe4),
+(ins IntRegs:$Rs32, IntRegs:$Rt32),
+"$Rd32,$Pe4 = sfrecipa($Rs32,$Rt32)",
+M_tc_3or4x_SLOT23, TypeM>, Enc_5853469, Requires<[HasV5T]> {
+let Inst{7-7} = 0b1;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11101011111;
+let hasNewValue = 1;
+let opNewValue = 0;
+let isFP = 1;
+let isPredicateLate = 1;
+let prefersSlot3 = 1;
+}
+def F2_sfsub : HInst<
+(outs IntRegs:$Rd32),
+(ins IntRegs:$Rs32, IntRegs:$Rt32),
+"$Rd32 = sfsub($Rs32,$Rt32)",
+M_tc_3or4x_SLOT23, TypeM>, Enc_14071773, Requires<[HasV5T]> {
+let Inst{7-5} = 0b001;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11101011000;
+let hasNewValue = 1;
+let opNewValue = 0;
+let isFP = 1;
+let prefersSlot3 = 1;
+let Uses = [USR];
+}
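+// J2 calls and jumps: direct (call/jump $Ii) and register-indirect (callr/jumpr $Rs32),
+// unconditional or predicated, with .new predicate and :t/:nt taken-hint variants.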
+def J2_call : HInst<
+(outs),
+(ins a30_2Imm:$Ii),
+"call $Ii",
+J_tc_2early_SLOT23, TypeJ>, Enc_13453446, PredRel {
+let Inst{0-0} = 0b0;
+let Inst{31-25} = 0b0101101;
+let isCall = 1;
+let prefersSlot3 = 1;
+let Uses = [R29];
+let Defs = [PC, R31];
+let BaseOpcode = "J2_call";
+let isPredicable = 1;
+let hasSideEffects = 1;
+let isExtendable = 1;
+let opExtendable = 0;
+let isExtentSigned = 1;
+let opExtentBits = 24;
+let opExtentAlign = 2;
+}
+def J2_callf : HInst<
+(outs),
+(ins PredRegs:$Pu4, a30_2Imm:$Ii),
+"if (!$Pu4) call $Ii",
+J_tc_2early_SLOT23, TypeJ>, Enc_14868535, PredRel {
+let Inst{0-0} = 0b0;
+let Inst{12-10} = 0b000;
+let Inst{21-21} = 0b1;
+let Inst{31-24} = 0b01011101;
+let isPredicated = 1;
+let isPredicatedFalse = 1;
+let isCall = 1;
+let prefersSlot3 = 1;
+let Uses = [R29];
+let Defs = [PC, R31];
+let BaseOpcode = "J2_call";
+let hasSideEffects = 1;
+let isTaken = Inst{12};
+let isExtendable = 1;
+let opExtendable = 1;
+let isExtentSigned = 1;
+let opExtentBits = 17;
+let opExtentAlign = 2;
+}
+def J2_callr : HInst<
+(outs),
+(ins IntRegs:$Rs32),
+"callr $Rs32",
+J_tc_2early_SLOT2, TypeJ>, Enc_11704059 {
+let Inst{13-0} = 0b00000000000000;
+let Inst{31-21} = 0b01010000101;
+let cofMax1 = 1;
+let isCall = 1;
+let prefersSlot3 = 1;
+let Uses = [R29];
+let Defs = [PC, R31];
+let hasSideEffects = 1;
+}
+def J2_callrf : HInst<
+(outs),
+(ins PredRegs:$Pu4, IntRegs:$Rs32),
+"if (!$Pu4) callr $Rs32",
+J_tc_2early_SLOT2, TypeJ>, Enc_1928953 {
+let Inst{7-0} = 0b00000000;
+let Inst{13-10} = 0b0000;
+let Inst{31-21} = 0b01010001001;
+let isPredicated = 1;
+let isPredicatedFalse = 1;
+let cofMax1 = 1;
+let isCall = 1;
+let prefersSlot3 = 1;
+let Uses = [R29];
+let Defs = [PC, R31];
+let hasSideEffects = 1;
+let isTaken = Inst{12};
+}
+def J2_callrt : HInst<
+(outs),
+(ins PredRegs:$Pu4, IntRegs:$Rs32),
+"if ($Pu4) callr $Rs32",
+J_tc_2early_SLOT2, TypeJ>, Enc_1928953 {
+let Inst{7-0} = 0b00000000;
+let Inst{13-10} = 0b0000;
+let Inst{31-21} = 0b01010001000;
+let isPredicated = 1;
+let cofMax1 = 1;
+let isCall = 1;
+let prefersSlot3 = 1;
+let Uses = [R29];
+let Defs = [PC, R31];
+let hasSideEffects = 1;
+let isTaken = Inst{12};
+}
+def J2_callt : HInst<
+(outs),
+(ins PredRegs:$Pu4, a30_2Imm:$Ii),
+"if ($Pu4) call $Ii",
+J_tc_2early_SLOT23, TypeJ>, Enc_14868535, PredRel {
+let Inst{0-0} = 0b0;
+let Inst{12-10} = 0b000;
+let Inst{21-21} = 0b0;
+let Inst{31-24} = 0b01011101;
+let isPredicated = 1;
+let isCall = 1;
+let prefersSlot3 = 1;
+let Uses = [R29];
+let Defs = [PC, R31];
+let BaseOpcode = "J2_call";
+let hasSideEffects = 1;
+let isTaken = Inst{12};
+let isExtendable = 1;
+let opExtendable = 1;
+let isExtentSigned = 1;
+let opExtentBits = 17;
+let opExtentAlign = 2;
+}
+def J2_endloop0 : HInst<
+(outs),
+(ins),
+"endloop0",
+PSEUDO, TypeJ> {
+let Uses = [LC0, SA0];
+let Defs = [LC0, P3, PC, USR];
+let isPseudo = 1;
+}
+def J2_endloop01 : HInst<
+(outs),
+(ins),
+"endloop01",
+PSEUDO, TypeJ> {
+let Uses = [LC0, LC1, SA0, SA1];
+let Defs = [LC0, LC1, P3, PC, USR];
+let isPseudo = 1;
+}
+def J2_endloop1 : HInst<
+(outs),
+(ins),
+"endloop1",
+PSEUDO, TypeJ> {
+let Uses = [LC1, SA1];
+let Defs = [LC1, PC];
+let isPseudo = 1;
+}
+def J2_jump : HInst<
+(outs),
+(ins b30_2Imm:$Ii),
+"jump $Ii",
+J_tc_2early_CJUMP_UCJUMP_ARCHDEPSLOT, TypeJ>, Enc_13453446, PredNewRel {
+let Inst{0-0} = 0b0;
+let Inst{31-25} = 0b0101100;
+let isTerminator = 1;
+let isBranch = 1;
+let Defs = [PC];
+let InputType = "imm";
+let BaseOpcode = "J2_jump";
+let isBarrier = 1;
+let isPredicable = 1;
+let isExtendable = 1;
+let opExtendable = 0;
+let isExtentSigned = 1;
+let opExtentBits = 24;
+let opExtentAlign = 2;
+}
+def J2_jumpf : HInst<
+(outs),
+(ins PredRegs:$Pu4, b30_2Imm:$Ii),
+"if (!$Pu4) jump:nt $Ii",
+J_tc_2early_CJUMP_UCJUMP_ARCHDEPSLOT, TypeJ>, Enc_14868535, PredNewRel {
+let Inst{0-0} = 0b0;
+let Inst{12-10} = 0b000;
+let Inst{21-21} = 0b1;
+let Inst{31-24} = 0b01011100;
+let isPredicated = 1;
+let isPredicatedFalse = 1;
+let isTerminator = 1;
+let isBranch = 1;
+let Defs = [PC];
+let InputType = "imm";
+let BaseOpcode = "J2_jump";
+let isTaken = Inst{12};
+let isExtendable = 1;
+let opExtendable = 1;
+let isExtentSigned = 1;
+let opExtentBits = 17;
+let opExtentAlign = 2;
+}
+def J2_jumpf_nopred_map : HInst<
+(outs),
+(ins PredRegs:$Pu4, b15_2Imm:$Ii),
+"if (!$Pu4) jump $Ii",
+PSEUDO, TypeMAPPING>, Requires<[HasV60T]> {
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+}
+def J2_jumpfnew : HInst<
+(outs),
+(ins PredRegs:$Pu4, b30_2Imm:$Ii),
+"if (!$Pu4.new) jump:nt $Ii",
+J_tc_2early_CJUMP_UCJUMP_ARCHDEPSLOT, TypeJ>, Enc_14868535, PredNewRel {
+let Inst{0-0} = 0b0;
+let Inst{12-10} = 0b010;
+let Inst{21-21} = 0b1;
+let Inst{31-24} = 0b01011100;
+let isPredicated = 1;
+let isPredicatedFalse = 1;
+let isTerminator = 1;
+let isBranch = 1;
+let isPredicatedNew = 1;
+let Defs = [PC];
+let InputType = "imm";
+let BaseOpcode = "J2_jump";
+let isTaken = Inst{12};
+let isExtendable = 1;
+let opExtendable = 1;
+let isExtentSigned = 1;
+let opExtentBits = 17;
+let opExtentAlign = 2;
+}
+def J2_jumpfnewpt : HInst<
+(outs),
+(ins PredRegs:$Pu4, b30_2Imm:$Ii),
+"if (!$Pu4.new) jump:t $Ii",
+J_tc_2early_CJUMP_UCJUMP_ARCHDEPSLOT, TypeJ>, Enc_14868535, PredNewRel {
+let Inst{0-0} = 0b0;
+let Inst{12-10} = 0b110;
+let Inst{21-21} = 0b1;
+let Inst{31-24} = 0b01011100;
+let isPredicated = 1;
+let isPredicatedFalse = 1;
+let isTerminator = 1;
+let isBranch = 1;
+let isPredicatedNew = 1;
+let Defs = [PC];
+let InputType = "imm";
+let BaseOpcode = "J2_jump";
+let isTaken = Inst{12};
+let isExtendable = 1;
+let opExtendable = 1;
+let isExtentSigned = 1;
+let opExtentBits = 17;
+let opExtentAlign = 2;
+}
+def J2_jumpfpt : HInst<
+(outs),
+(ins PredRegs:$Pu4, b30_2Imm:$Ii),
+"if (!$Pu4) jump:t $Ii",
+J_tc_2early_CJUMP_UCJUMP_ARCHDEPSLOT, TypeJ>, Enc_14868535, Requires<[HasV60T]>, PredNewRel {
+let Inst{0-0} = 0b0;
+let Inst{12-10} = 0b100;
+let Inst{21-21} = 0b1;
+let Inst{31-24} = 0b01011100;
+let isPredicated = 1;
+let isPredicatedFalse = 1;
+let isTerminator = 1;
+let isBranch = 1;
+let Defs = [PC];
+let InputType = "imm";
+let BaseOpcode = "J2_jump";
+let isTaken = Inst{12};
+let isExtendable = 1;
+let opExtendable = 1;
+let isExtentSigned = 1;
+let opExtentBits = 17;
+let opExtentAlign = 2;
+}
+def J2_jumpr : HInst<
+(outs),
+(ins IntRegs:$Rs32),
+"jumpr $Rs32",
+J_tc_2early_SLOT2, TypeJ>, Enc_11704059, PredNewRel {
+let Inst{13-0} = 0b00000000000000;
+let Inst{31-21} = 0b01010010100;
+let isTerminator = 1;
+let isIndirectBranch = 1;
+let isBranch = 1;
+let cofMax1 = 1;
+let Defs = [PC];
+let InputType = "reg";
+let BaseOpcode = "J2_jumpr";
+let isBarrier = 1;
+let isPredicable = 1;
+}
+def J2_jumprf : HInst<
+(outs),
+(ins PredRegs:$Pu4, IntRegs:$Rs32),
+"if (!$Pu4) jumpr:nt $Rs32",
+J_tc_2early_SLOT2, TypeJ>, Enc_1928953, PredNewRel {
+let Inst{7-0} = 0b00000000;
+let Inst{13-10} = 0b0000;
+let Inst{31-21} = 0b01010011011;
+let isPredicated = 1;
+let isPredicatedFalse = 1;
+let isTerminator = 1;
+let isIndirectBranch = 1;
+let isBranch = 1;
+let cofMax1 = 1;
+let Defs = [PC];
+let InputType = "reg";
+let BaseOpcode = "J2_jumpr";
+let isTaken = Inst{12};
+}
+def J2_jumprf_nopred_map : HInst<
+(outs),
+(ins PredRegs:$Pu4, IntRegs:$Rs32),
+"if (!$Pu4) jumpr $Rs32",
+PSEUDO, TypeMAPPING>, Requires<[HasV60T]> {
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+}
+def J2_jumprfnew : HInst<
+(outs),
+(ins PredRegs:$Pu4, IntRegs:$Rs32),
+"if (!$Pu4.new) jumpr:nt $Rs32",
+J_tc_2early_SLOT2, TypeJ>, Enc_1928953, PredNewRel {
+let Inst{7-0} = 0b00000000;
+let Inst{13-10} = 0b0010;
+let Inst{31-21} = 0b01010011011;
+let isPredicated = 1;
+let isPredicatedFalse = 1;
+let isTerminator = 1;
+let isIndirectBranch = 1;
+let isBranch = 1;
+let cofMax1 = 1;
+let isPredicatedNew = 1;
+let Defs = [PC];
+let InputType = "reg";
+let BaseOpcode = "J2_jumpr";
+let isTaken = Inst{12};
+}
+def J2_jumprfnewpt : HInst<
+(outs),
+(ins PredRegs:$Pu4, IntRegs:$Rs32),
+"if (!$Pu4.new) jumpr:t $Rs32",
+J_tc_2early_SLOT2, TypeJ>, Enc_1928953, PredNewRel {
+let Inst{7-0} = 0b00000000;
+let Inst{13-10} = 0b0110;
+let Inst{31-21} = 0b01010011011;
+let isPredicated = 1;
+let isPredicatedFalse = 1;
+let isTerminator = 1;
+let isIndirectBranch = 1;
+let isBranch = 1;
+let cofMax1 = 1;
+let isPredicatedNew = 1;
+let Defs = [PC];
+let InputType = "reg";
+let BaseOpcode = "J2_jumpr";
+let isTaken = Inst{12};
+}
+def J2_jumprfpt : HInst<
+(outs),
+(ins PredRegs:$Pu4, IntRegs:$Rs32),
+"if (!$Pu4) jumpr:t $Rs32",
+J_tc_2early_SLOT2, TypeJ>, Enc_1928953, Requires<[HasV60T]>, PredNewRel {
+let Inst{7-0} = 0b00000000;
+let Inst{13-10} = 0b0100;
+let Inst{31-21} = 0b01010011011;
+let isPredicated = 1;
+let isPredicatedFalse = 1;
+let isTerminator = 1;
+let isIndirectBranch = 1;
+let isBranch = 1;
+let cofMax1 = 1;
+let Defs = [PC];
+let InputType = "reg";
+let BaseOpcode = "J2_jumpr";
+let isTaken = Inst{12};
+}
+def J2_jumprgtez : HInst<
+(outs),
+(ins IntRegs:$Rs32, b13_2Imm:$Ii),
+"if ($Rs32>=#0) jump:nt $Ii",
+CR_tc_2early_SLOT3, TypeCR>, Enc_12477789 {
+let Inst{0-0} = 0b0;
+let Inst{12-12} = 0b0;
+let Inst{31-22} = 0b0110000101;
+let isPredicated = 1;
+let isTerminator = 1;
+let isBranch = 1;
+let isPredicatedNew = 1;
+let Defs = [PC];
+let isTaken = Inst{12};
+}
+def J2_jumprgtezpt : HInst<
+(outs),
+(ins IntRegs:$Rs32, b13_2Imm:$Ii),
+"if ($Rs32>=#0) jump:t $Ii",
+CR_tc_2early_SLOT3, TypeCR>, Enc_12477789 {
+let Inst{0-0} = 0b0;
+let Inst{12-12} = 0b1;
+let Inst{31-22} = 0b0110000101;
+let isPredicated = 1;
+let isTerminator = 1;
+let isBranch = 1;
+let isPredicatedNew = 1;
+let Defs = [PC];
+let isTaken = Inst{12};
+}
+def J2_jumprltez : HInst<
+(outs),
+(ins IntRegs:$Rs32, b13_2Imm:$Ii),
+"if ($Rs32<=#0) jump:nt $Ii",
+CR_tc_2early_SLOT3, TypeCR>, Enc_12477789 {
+let Inst{0-0} = 0b0;
+let Inst{12-12} = 0b0;
+let Inst{31-22} = 0b0110000111;
+let isPredicated = 1;
+let isTerminator = 1;
+let isBranch = 1;
+let isPredicatedNew = 1;
+let Defs = [PC];
+let isTaken = Inst{12};
+}
+def J2_jumprltezpt : HInst<
+(outs),
+(ins IntRegs:$Rs32, b13_2Imm:$Ii),
+"if ($Rs32<=#0) jump:t $Ii",
+CR_tc_2early_SLOT3, TypeCR>, Enc_12477789 {
+let Inst{0-0} = 0b0;
+let Inst{12-12} = 0b1;
+let Inst{31-22} = 0b0110000111;
+let isPredicated = 1;
+let isTerminator = 1;
+let isBranch = 1;
+let isPredicatedNew = 1;
+let Defs = [PC];
+let isTaken = Inst{12};
+}
+def J2_jumprnz : HInst<
+(outs),
+(ins IntRegs:$Rs32, b13_2Imm:$Ii),
+"if ($Rs32==#0) jump:nt $Ii",
+CR_tc_2early_SLOT3, TypeCR>, Enc_12477789 {
+let Inst{0-0} = 0b0;
+let Inst{12-12} = 0b0;
+let Inst{31-22} = 0b0110000110;
+let isPredicated = 1;
+let isTerminator = 1;
+let isBranch = 1;
+let isPredicatedNew = 1;
+let Defs = [PC];
+let isTaken = Inst{12};
+}
+def J2_jumprnzpt : HInst<
+(outs),
+(ins IntRegs:$Rs32, b13_2Imm:$Ii),
+"if ($Rs32==#0) jump:t $Ii",
+CR_tc_2early_SLOT3, TypeCR>, Enc_12477789 {
+let Inst{0-0} = 0b0;
+let Inst{12-12} = 0b1;
+let Inst{31-22} = 0b0110000110;
+let isPredicated = 1;
+let isTerminator = 1;
+let isBranch = 1;
+let isPredicatedNew = 1;
+let Defs = [PC];
+let isTaken = Inst{12};
+}
+def J2_jumprt : HInst<
+(outs),
+(ins PredRegs:$Pu4, IntRegs:$Rs32),
+"if ($Pu4) jumpr:nt $Rs32",
+J_tc_2early_SLOT2, TypeJ>, Enc_1928953, PredNewRel {
+let Inst{7-0} = 0b00000000;
+let Inst{13-10} = 0b0000;
+let Inst{31-21} = 0b01010011010;
+let isPredicated = 1;
+let isTerminator = 1;
+let isIndirectBranch = 1;
+let isBranch = 1;
+let cofMax1 = 1;
+let Defs = [PC];
+let InputType = "reg";
+let BaseOpcode = "J2_jumpr";
+let isTaken = Inst{12};
+}
+def J2_jumprt_nopred_map : HInst<
+(outs),
+(ins PredRegs:$Pu4, IntRegs:$Rs32),
+"if ($Pu4) jumpr $Rs32",
+PSEUDO, TypeMAPPING>, Requires<[HasV60T]> {
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+}
+def J2_jumprtnew : HInst<
+(outs),
+(ins PredRegs:$Pu4, IntRegs:$Rs32),
+"if ($Pu4.new) jumpr:nt $Rs32",
+J_tc_2early_SLOT2, TypeJ>, Enc_1928953, PredNewRel {
+let Inst{7-0} = 0b00000000;
+let Inst{13-10} = 0b0010;
+let Inst{31-21} = 0b01010011010;
+let isPredicated = 1;
+let isTerminator = 1;
+let isIndirectBranch = 1;
+let isBranch = 1;
+let cofMax1 = 1;
+let isPredicatedNew = 1;
+let Defs = [PC];
+let InputType = "reg";
+let BaseOpcode = "J2_jumpr";
+let isTaken = Inst{12};
+}
+def J2_jumprtnewpt : HInst<
+(outs),
+(ins PredRegs:$Pu4, IntRegs:$Rs32),
+"if ($Pu4.new) jumpr:t $Rs32",
+J_tc_2early_SLOT2, TypeJ>, Enc_1928953, PredNewRel {
+let Inst{7-0} = 0b00000000;
+let Inst{13-10} = 0b0110;
+let Inst{31-21} = 0b01010011010;
+let isPredicated = 1;
+let isTerminator = 1;
+let isIndirectBranch = 1;
+let isBranch = 1;
+let cofMax1 = 1;
+let isPredicatedNew = 1;
+let Defs = [PC];
+let InputType = "reg";
+let BaseOpcode = "J2_jumpr";
+let isTaken = Inst{12};
+}
+def J2_jumprtpt : HInst<
+(outs),
+(ins PredRegs:$Pu4, IntRegs:$Rs32),
+"if ($Pu4) jumpr:t $Rs32",
+J_tc_2early_SLOT2, TypeJ>, Enc_1928953, Requires<[HasV60T]>, PredNewRel {
+let Inst{7-0} = 0b00000000;
+let Inst{13-10} = 0b0100;
+let Inst{31-21} = 0b01010011010;
+let isPredicated = 1;
+let isTerminator = 1;
+let isIndirectBranch = 1;
+let isBranch = 1;
+let cofMax1 = 1;
+let Defs = [PC];
+let InputType = "reg";
+let BaseOpcode = "J2_jumpr";
+let isTaken = Inst{12};
+}
+def J2_jumprz : HInst<
+(outs),
+(ins IntRegs:$Rs32, b13_2Imm:$Ii),
+"if ($Rs32!=#0) jump:nt $Ii",
+CR_tc_2early_SLOT3, TypeCR>, Enc_12477789 {
+let Inst{0-0} = 0b0;
+let Inst{12-12} = 0b0;
+let Inst{31-22} = 0b0110000100;
+let isPredicated = 1;
+let isTerminator = 1;
+let isBranch = 1;
+let isPredicatedNew = 1;
+let Defs = [PC];
+let isTaken = Inst{12};
+}
+def J2_jumprzpt : HInst<
+(outs),
+(ins IntRegs:$Rs32, b13_2Imm:$Ii),
+"if ($Rs32!=#0) jump:t $Ii",
+CR_tc_2early_SLOT3, TypeCR>, Enc_12477789 {
+let Inst{0-0} = 0b0;
+let Inst{12-12} = 0b1;
+let Inst{31-22} = 0b0110000100;
+let isPredicated = 1;
+let isTerminator = 1;
+let isBranch = 1;
+let isPredicatedNew = 1;
+let Defs = [PC];
+let isTaken = Inst{12};
+}
+def J2_jumpt : HInst<
+(outs),
+(ins PredRegs:$Pu4, b30_2Imm:$Ii),
+"if ($Pu4) jump:nt $Ii",
+J_tc_2early_CJUMP_UCJUMP_ARCHDEPSLOT, TypeJ>, Enc_14868535, PredNewRel {
+let Inst{0-0} = 0b0;
+let Inst{12-10} = 0b000;
+let Inst{21-21} = 0b0;
+let Inst{31-24} = 0b01011100;
+let isPredicated = 1;
+let isTerminator = 1;
+let isBranch = 1;
+let Defs = [PC];
+let InputType = "imm";
+let BaseOpcode = "J2_jump";
+let isTaken = Inst{12};
+let isExtendable = 1;
+let opExtendable = 1;
+let isExtentSigned = 1;
+let opExtentBits = 17;
+let opExtentAlign = 2;
+}
+def J2_jumpt_nopred_map : HInst<
+(outs),
+(ins PredRegs:$Pu4, b15_2Imm:$Ii),
+"if ($Pu4) jump $Ii",
+PSEUDO, TypeMAPPING>, Requires<[HasV60T]> {
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+}
+def J2_jumptnew : HInst<
+(outs),
+(ins PredRegs:$Pu4, b30_2Imm:$Ii),
+"if ($Pu4.new) jump:nt $Ii",
+J_tc_2early_CJUMP_UCJUMP_ARCHDEPSLOT, TypeJ>, Enc_14868535, PredNewRel {
+let Inst{0-0} = 0b0;
+let Inst{12-10} = 0b010;
+let Inst{21-21} = 0b0;
+let Inst{31-24} = 0b01011100;
+let isPredicated = 1;
+let isTerminator = 1;
+let isBranch = 1;
+let isPredicatedNew = 1;
+let Defs = [PC];
+let InputType = "imm";
+let BaseOpcode = "J2_jump";
+let isTaken = Inst{12};
+let isExtendable = 1;
+let opExtendable = 1;
+let isExtentSigned = 1;
+let opExtentBits = 17;
+let opExtentAlign = 2;
+}
+def J2_jumptnewpt : HInst<
+(outs),
+(ins PredRegs:$Pu4, b30_2Imm:$Ii),
+"if ($Pu4.new) jump:t $Ii",
+J_tc_2early_CJUMP_UCJUMP_ARCHDEPSLOT, TypeJ>, Enc_14868535, PredNewRel {
+let Inst{0-0} = 0b0;
+let Inst{12-10} = 0b110;
+let Inst{21-21} = 0b0;
+let Inst{31-24} = 0b01011100;
+let isPredicated = 1;
+let isTerminator = 1;
+let isBranch = 1;
+let isPredicatedNew = 1;
+let Defs = [PC];
+let InputType = "imm";
+let BaseOpcode = "J2_jump";
+let isTaken = Inst{12};
+let isExtendable = 1;
+let opExtendable = 1;
+let isExtentSigned = 1;
+let opExtentBits = 17;
+let opExtentAlign = 2;
+}
+def J2_jumptpt : HInst<
+(outs),
+(ins PredRegs:$Pu4, b30_2Imm:$Ii),
+"if ($Pu4) jump:t $Ii",
+J_tc_2early_CJUMP_UCJUMP_ARCHDEPSLOT, TypeJ>, Enc_14868535, Requires<[HasV60T]>, PredNewRel {
+let Inst{0-0} = 0b0;
+let Inst{12-10} = 0b100;
+let Inst{21-21} = 0b0;
+let Inst{31-24} = 0b01011100;
+let isPredicated = 1;
+let isTerminator = 1;
+let isBranch = 1;
+let Defs = [PC];
+let InputType = "imm";
+let BaseOpcode = "J2_jump";
+let isTaken = Inst{12};
+let isExtendable = 1;
+let opExtendable = 1;
+let isExtentSigned = 1;
+let opExtentBits = 17;
+let opExtentAlign = 2;
+}
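+// J2 hardware loops: loop0/loop1 set SA/LC; the spNloop0 forms also produce p3 late
+// (isPredicateLate) for software-pipelined loops. pause and trap0 are solo instructions.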
+def J2_loop0i : HInst<
+(outs),
+(ins b30_2Imm:$Ii, u10_0Imm:$II),
+"loop0($Ii,#$II)",
+CR_tc_3x_SLOT3, TypeCR>, Enc_9939385 {
+let Inst{2-2} = 0b0;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b01101001000;
+let Defs = [LC0, SA0, USR];
+let isExtendable = 1;
+let opExtendable = 0;
+let isExtentSigned = 1;
+let opExtentBits = 9;
+let opExtentAlign = 2;
+}
+def J2_loop0r : HInst<
+(outs),
+(ins b30_2Imm:$Ii, IntRegs:$Rs32),
+"loop0($Ii,$Rs32)",
+CR_tc_3x_SLOT3, TypeCR>, Enc_5790679 {
+let Inst{2-0} = 0b000;
+let Inst{7-5} = 0b000;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b01100000000;
+let Defs = [LC0, SA0, USR];
+let isExtendable = 1;
+let opExtendable = 0;
+let isExtentSigned = 1;
+let opExtentBits = 9;
+let opExtentAlign = 2;
+}
+def J2_loop1i : HInst<
+(outs),
+(ins b30_2Imm:$Ii, u10_0Imm:$II),
+"loop1($Ii,#$II)",
+CR_tc_3x_SLOT3, TypeCR>, Enc_9939385 {
+let Inst{2-2} = 0b0;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b01101001001;
+let Defs = [LC1, SA1];
+let isExtendable = 1;
+let opExtendable = 0;
+let isExtentSigned = 1;
+let opExtentBits = 9;
+let opExtentAlign = 2;
+}
+def J2_loop1r : HInst<
+(outs),
+(ins b30_2Imm:$Ii, IntRegs:$Rs32),
+"loop1($Ii,$Rs32)",
+CR_tc_3x_SLOT3, TypeCR>, Enc_5790679 {
+let Inst{2-0} = 0b000;
+let Inst{7-5} = 0b000;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b01100000001;
+let Defs = [LC1, SA1];
+let isExtendable = 1;
+let opExtendable = 0;
+let isExtentSigned = 1;
+let opExtentBits = 9;
+let opExtentAlign = 2;
+}
+def J2_pause : HInst<
+(outs),
+(ins u8_0Imm:$Ii),
+"pause(#$Ii)",
+J_tc_2early_SLOT2, TypeJ>, Enc_8732960 {
+let Inst{1-0} = 0b00;
+let Inst{7-5} = 0b000;
+let Inst{13-13} = 0b0;
+let Inst{31-16} = 0b0101010001000000;
+let isSolo = 1;
+}
+def J2_ploop1si : HInst<
+(outs),
+(ins b30_2Imm:$Ii, u10_0Imm:$II),
+"p3 = sp1loop0($Ii,#$II)",
+CR_tc_2early_SLOT3, TypeCR>, Enc_9939385 {
+let Inst{2-2} = 0b0;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b01101001101;
+let isPredicateLate = 1;
+let Defs = [LC0, P3, SA0, USR];
+let isExtendable = 1;
+let opExtendable = 0;
+let isExtentSigned = 1;
+let opExtentBits = 9;
+let opExtentAlign = 2;
+}
+def J2_ploop1sr : HInst<
+(outs),
+(ins b30_2Imm:$Ii, IntRegs:$Rs32),
+"p3 = sp1loop0($Ii,$Rs32)",
+CR_tc_2early_SLOT3, TypeCR>, Enc_5790679 {
+let Inst{2-0} = 0b000;
+let Inst{7-5} = 0b000;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b01100000101;
+let isPredicateLate = 1;
+let Defs = [LC0, P3, SA0, USR];
+let isExtendable = 1;
+let opExtendable = 0;
+let isExtentSigned = 1;
+let opExtentBits = 9;
+let opExtentAlign = 2;
+}
+def J2_ploop2si : HInst<
+(outs),
+(ins b30_2Imm:$Ii, u10_0Imm:$II),
+"p3 = sp2loop0($Ii,#$II)",
+CR_tc_2early_SLOT3, TypeCR>, Enc_9939385 {
+let Inst{2-2} = 0b0;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b01101001110;
+let isPredicateLate = 1;
+let Defs = [LC0, P3, SA0, USR];
+let isExtendable = 1;
+let opExtendable = 0;
+let isExtentSigned = 1;
+let opExtentBits = 9;
+let opExtentAlign = 2;
+}
+def J2_ploop2sr : HInst<
+(outs),
+(ins b30_2Imm:$Ii, IntRegs:$Rs32),
+"p3 = sp2loop0($Ii,$Rs32)",
+CR_tc_2early_SLOT3, TypeCR>, Enc_5790679 {
+let Inst{2-0} = 0b000;
+let Inst{7-5} = 0b000;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b01100000110;
+let isPredicateLate = 1;
+let Defs = [LC0, P3, SA0, USR];
+let isExtendable = 1;
+let opExtendable = 0;
+let isExtentSigned = 1;
+let opExtentBits = 9;
+let opExtentAlign = 2;
+}
+def J2_ploop3si : HInst<
+(outs),
+(ins b30_2Imm:$Ii, u10_0Imm:$II),
+"p3 = sp3loop0($Ii,#$II)",
+CR_tc_2early_SLOT3, TypeCR>, Enc_9939385 {
+let Inst{2-2} = 0b0;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b01101001111;
+let isPredicateLate = 1;
+let Defs = [LC0, P3, SA0, USR];
+let isExtendable = 1;
+let opExtendable = 0;
+let isExtentSigned = 1;
+let opExtentBits = 9;
+let opExtentAlign = 2;
+}
+def J2_ploop3sr : HInst<
+(outs),
+(ins b30_2Imm:$Ii, IntRegs:$Rs32),
+"p3 = sp3loop0($Ii,$Rs32)",
+CR_tc_2early_SLOT3, TypeCR>, Enc_5790679 {
+let Inst{2-0} = 0b000;
+let Inst{7-5} = 0b000;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b01100000111;
+let isPredicateLate = 1;
+let Defs = [LC0, P3, SA0, USR];
+let isExtendable = 1;
+let opExtendable = 0;
+let isExtentSigned = 1;
+let opExtentBits = 9;
+let opExtentAlign = 2;
+}
+def J2_trap0 : HInst<
+(outs),
+(ins u8_0Imm:$Ii),
+"trap0(#$Ii)",
+J_tc_2early_SLOT2, TypeJ>, Enc_8732960 {
+let Inst{1-0} = 0b00;
+let Inst{7-5} = 0b000;
+let Inst{13-13} = 0b0;
+let Inst{31-16} = 0b0101010000000000;
+let isSolo = 1;
+}
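+// J4 compound compare-and-jump: "jumpnv" forms test a new-value register ($Ns8.new);
+// the p0/p1 forms pair a dot-new predicate set with the jump. Each has :nt/:t variants.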
+def J4_cmpeq_f_jumpnv_nt : HInst<
+(outs),
+(ins IntRegs:$Ns8, IntRegs:$Rt32, b30_2Imm:$Ii),
+"if (!cmp.eq($Ns8.new,$Rt32)) jump:nt $Ii",
+NCJ_tc_3or4stall_SLOT0, TypeNCJ>, Enc_15140689, PredRel {
+let Inst{0-0} = 0b0;
+let Inst{13-13} = 0b0;
+let Inst{19-19} = 0b0;
+let Inst{31-22} = 0b0010000001;
+let isPredicated = 1;
+let isPredicatedFalse = 1;
+let isTerminator = 1;
+let isBranch = 1;
+let cofMax1 = 1;
+let isNewValue = 1;
+let Defs = [PC];
+let BaseOpcode = "J4_cmpeqr";
+let isTaken = Inst{13};
+let isExtendable = 1;
+let opExtendable = 2;
+let isExtentSigned = 1;
+let opExtentBits = 11;
+let opExtentAlign = 2;
+let opNewValue = 0;
+}
+def J4_cmpeq_f_jumpnv_t : HInst<
+(outs),
+(ins IntRegs:$Ns8, IntRegs:$Rt32, b30_2Imm:$Ii),
+"if (!cmp.eq($Ns8.new,$Rt32)) jump:t $Ii",
+NCJ_tc_3or4stall_SLOT0, TypeNCJ>, Enc_15140689, PredRel {
+let Inst{0-0} = 0b0;
+let Inst{13-13} = 0b1;
+let Inst{19-19} = 0b0;
+let Inst{31-22} = 0b0010000001;
+let isPredicated = 1;
+let isPredicatedFalse = 1;
+let isTerminator = 1;
+let isBranch = 1;
+let cofMax1 = 1;
+let isNewValue = 1;
+let Defs = [PC];
+let BaseOpcode = "J4_cmpeqr";
+let isTaken = Inst{13};
+let isExtendable = 1;
+let opExtendable = 2;
+let isExtentSigned = 1;
+let opExtentBits = 11;
+let opExtentAlign = 2;
+let opNewValue = 0;
+}
+def J4_cmpeq_fp0_jump_nt : HInst<
+(outs),
+(ins GeneralSubRegs:$Rs16, GeneralSubRegs:$Rt16, b30_2Imm:$Ii),
+"p0 = cmp.eq($Rs16,$Rt16); if (!p0.new) jump:nt $Ii",
+COMPOUND_CJ_ARCHDEPSLOT, TypeCJ>, Enc_14264243, PredRel {
+let Inst{0-0} = 0b0;
+let Inst{13-12} = 0b00;
+let Inst{31-22} = 0b0001010001;
+let isPredicated = 1;
+let isPredicatedFalse = 1;
+let isTerminator = 1;
+let isBranch = 1;
+let isPredicatedNew = 1;
+let Uses = [P0];
+let Defs = [P0, PC];
+let BaseOpcode = "J4_cmpeqp0";
+let isTaken = Inst{13};
+let isExtendable = 1;
+let opExtendable = 2;
+let isExtentSigned = 1;
+let opExtentBits = 11;
+let opExtentAlign = 2;
+}
+def J4_cmpeq_fp0_jump_t : HInst<
+(outs),
+(ins GeneralSubRegs:$Rs16, GeneralSubRegs:$Rt16, b30_2Imm:$Ii),
+"p0 = cmp.eq($Rs16,$Rt16); if (!p0.new) jump:t $Ii",
+COMPOUND_CJ_ARCHDEPSLOT, TypeCJ>, Enc_14264243, PredRel {
+let Inst{0-0} = 0b0;
+let Inst{13-12} = 0b10;
+let Inst{31-22} = 0b0001010001;
+let isPredicated = 1;
+let isPredicatedFalse = 1;
+let isTerminator = 1;
+let isBranch = 1;
+let isPredicatedNew = 1;
+let Uses = [P0];
+let Defs = [P0, PC];
+let BaseOpcode = "J4_cmpeqp0";
+let isTaken = Inst{13};
+let isExtendable = 1;
+let opExtendable = 2;
+let isExtentSigned = 1;
+let opExtentBits = 11;
+let opExtentAlign = 2;
+}
+def J4_cmpeq_fp1_jump_nt : HInst<
+(outs),
+(ins GeneralSubRegs:$Rs16, GeneralSubRegs:$Rt16, b30_2Imm:$Ii),
+"p1 = cmp.eq($Rs16,$Rt16); if (!p1.new) jump:nt $Ii",
+COMPOUND_CJ_ARCHDEPSLOT, TypeCJ>, Enc_14264243, PredRel {
+let Inst{0-0} = 0b0;
+let Inst{13-12} = 0b01;
+let Inst{31-22} = 0b0001010001;
+let isPredicated = 1;
+let isPredicatedFalse = 1;
+let isTerminator = 1;
+let isBranch = 1;
+let isPredicatedNew = 1;
+let Uses = [P1];
+let Defs = [P1, PC];
+let BaseOpcode = "J4_cmpeqp1";
+let isTaken = Inst{13};
+let isExtendable = 1;
+let opExtendable = 2;
+let isExtentSigned = 1;
+let opExtentBits = 11;
+let opExtentAlign = 2;
+}
+def J4_cmpeq_fp1_jump_t : HInst<
+(outs),
+(ins GeneralSubRegs:$Rs16, GeneralSubRegs:$Rt16, b30_2Imm:$Ii),
+"p1 = cmp.eq($Rs16,$Rt16); if (!p1.new) jump:t $Ii",
+COMPOUND_CJ_ARCHDEPSLOT, TypeCJ>, Enc_14264243, PredRel {
+let Inst{0-0} = 0b0;
+let Inst{13-12} = 0b11;
+let Inst{31-22} = 0b0001010001;
+let isPredicated = 1;
+let isPredicatedFalse = 1;
+let isTerminator = 1;
+let isBranch = 1;
+let isPredicatedNew = 1;
+let Uses = [P1];
+let Defs = [P1, PC];
+let BaseOpcode = "J4_cmpeqp1";
+let isTaken = Inst{13};
+let isExtendable = 1;
+let opExtendable = 2;
+let isExtentSigned = 1;
+let opExtentBits = 11;
+let opExtentAlign = 2;
+}
+def J4_cmpeq_t_jumpnv_nt : HInst<
+(outs),
+(ins IntRegs:$Ns8, IntRegs:$Rt32, b30_2Imm:$Ii),
+"if (cmp.eq($Ns8.new,$Rt32)) jump:nt $Ii",
+NCJ_tc_3or4stall_SLOT0, TypeNCJ>, Enc_15140689, PredRel {
+let Inst{0-0} = 0b0;
+let Inst{13-13} = 0b0;
+let Inst{19-19} = 0b0;
+let Inst{31-22} = 0b0010000000;
+let isPredicated = 1;
+let isTerminator = 1;
+let isBranch = 1;
+let cofMax1 = 1;
+let isNewValue = 1;
+let Defs = [PC];
+let BaseOpcode = "J4_cmpeqr";
+let isTaken = Inst{13};
+let isExtendable = 1;
+let opExtendable = 2;
+let isExtentSigned = 1;
+let opExtentBits = 11;
+let opExtentAlign = 2;
+let opNewValue = 0;
+}
+def J4_cmpeq_t_jumpnv_t : HInst<
+(outs),
+(ins IntRegs:$Ns8, IntRegs:$Rt32, b30_2Imm:$Ii),
+"if (cmp.eq($Ns8.new,$Rt32)) jump:t $Ii",
+NCJ_tc_3or4stall_SLOT0, TypeNCJ>, Enc_15140689, PredRel {
+let Inst{0-0} = 0b0;
+let Inst{13-13} = 0b1;
+let Inst{19-19} = 0b0;
+let Inst{31-22} = 0b0010000000;
+let isPredicated = 1;
+let isTerminator = 1;
+let isBranch = 1;
+let cofMax1 = 1;
+let isNewValue = 1;
+let Defs = [PC];
+let BaseOpcode = "J4_cmpeqr";
+let isTaken = Inst{13};
+let isExtendable = 1;
+let opExtendable = 2;
+let isExtentSigned = 1;
+let opExtentBits = 11;
+let opExtentAlign = 2;
+let opNewValue = 0;
+}
+def J4_cmpeq_tp0_jump_nt : HInst<
+(outs),
+(ins GeneralSubRegs:$Rs16, GeneralSubRegs:$Rt16, b30_2Imm:$Ii),
+"p0 = cmp.eq($Rs16,$Rt16); if (p0.new) jump:nt $Ii",
+COMPOUND_CJ_ARCHDEPSLOT, TypeCJ>, Enc_14264243, PredRel {
+let Inst{0-0} = 0b0;
+let Inst{13-12} = 0b00;
+let Inst{31-22} = 0b0001010000;
+let isPredicated = 1;
+let isTerminator = 1;
+let isBranch = 1;
+let isPredicatedNew = 1;
+let Uses = [P0];
+let Defs = [P0, PC];
+let BaseOpcode = "J4_cmpeqp0";
+let isTaken = Inst{13};
+let isExtendable = 1;
+let opExtendable = 2;
+let isExtentSigned = 1;
+let opExtentBits = 11;
+let opExtentAlign = 2;
+}
+def J4_cmpeq_tp0_jump_t : HInst<
+(outs),
+(ins GeneralSubRegs:$Rs16, GeneralSubRegs:$Rt16, b30_2Imm:$Ii),
+"p0 = cmp.eq($Rs16,$Rt16); if (p0.new) jump:t $Ii",
+COMPOUND_CJ_ARCHDEPSLOT, TypeCJ>, Enc_14264243, PredRel {
+let Inst{0-0} = 0b0;
+let Inst{13-12} = 0b10;
+let Inst{31-22} = 0b0001010000;
+let isPredicated = 1;
+let isTerminator = 1;
+let isBranch = 1;
+let isPredicatedNew = 1;
+let Uses = [P0];
+let Defs = [P0, PC];
+let BaseOpcode = "J4_cmpeqp0";
+let isTaken = Inst{13};
+let isExtendable = 1;
+let opExtendable = 2;
+let isExtentSigned = 1;
+let opExtentBits = 11;
+let opExtentAlign = 2;
+}
+def J4_cmpeq_tp1_jump_nt : HInst<
+(outs),
+(ins GeneralSubRegs:$Rs16, GeneralSubRegs:$Rt16, b30_2Imm:$Ii),
+"p1 = cmp.eq($Rs16,$Rt16); if (p1.new) jump:nt $Ii",
+COMPOUND_CJ_ARCHDEPSLOT, TypeCJ>, Enc_14264243, PredRel {
+let Inst{0-0} = 0b0;
+let Inst{13-12} = 0b01;
+let Inst{31-22} = 0b0001010000;
+let isPredicated = 1;
+let isTerminator = 1;
+let isBranch = 1;
+let isPredicatedNew = 1;
+let Uses = [P1];
+let Defs = [P1, PC];
+let BaseOpcode = "J4_cmpeqp1";
+let isTaken = Inst{13};
+let isExtendable = 1;
+let opExtendable = 2;
+let isExtentSigned = 1;
+let opExtentBits = 11;
+let opExtentAlign = 2;
+}
+def J4_cmpeq_tp1_jump_t : HInst<
+(outs),
+(ins GeneralSubRegs:$Rs16, GeneralSubRegs:$Rt16, b30_2Imm:$Ii),
+"p1 = cmp.eq($Rs16,$Rt16); if (p1.new) jump:t $Ii",
+COMPOUND_CJ_ARCHDEPSLOT, TypeCJ>, Enc_14264243, PredRel {
+let Inst{0-0} = 0b0;
+let Inst{13-12} = 0b11;
+let Inst{31-22} = 0b0001010000;
+let isPredicated = 1;
+let isTerminator = 1;
+let isBranch = 1;
+let isPredicatedNew = 1;
+let Uses = [P1];
+let Defs = [P1, PC];
+let BaseOpcode = "J4_cmpeqp1";
+let isTaken = Inst{13};
+let isExtendable = 1;
+let opExtendable = 2;
+let isExtentSigned = 1;
+let opExtentBits = 11;
+let opExtentAlign = 2;
+}
+def J4_cmpeqi_f_jumpnv_nt : HInst<
+(outs),
+(ins IntRegs:$Ns8, u5_0Imm:$II, b30_2Imm:$Ii),
+"if (!cmp.eq($Ns8.new,#$II)) jump:nt $Ii",
+NCJ_tc_3or4stall_SLOT0, TypeNCJ>, Enc_4397470, PredRel {
+let Inst{0-0} = 0b0;
+let Inst{13-13} = 0b0;
+let Inst{19-19} = 0b0;
+let Inst{31-22} = 0b0010010001;
+let isPredicated = 1;
+let isPredicatedFalse = 1;
+let isTerminator = 1;
+let isBranch = 1;
+let cofMax1 = 1;
+let isNewValue = 1;
+let Defs = [PC];
+let BaseOpcode = "J4_cmpeqi";
+let isTaken = Inst{13};
+let isExtendable = 1;
+let opExtendable = 2;
+let isExtentSigned = 1;
+let opExtentBits = 11;
+let opExtentAlign = 2;
+let opNewValue = 0;
+}
+def J4_cmpeqi_f_jumpnv_t : HInst<
+(outs),
+(ins IntRegs:$Ns8, u5_0Imm:$II, b30_2Imm:$Ii),
+"if (!cmp.eq($Ns8.new,#$II)) jump:t $Ii",
+NCJ_tc_3or4stall_SLOT0, TypeNCJ>, Enc_4397470, PredRel {
+let Inst{0-0} = 0b0;
+let Inst{13-13} = 0b1;
+let Inst{19-19} = 0b0;
+let Inst{31-22} = 0b0010010001;
+let isPredicated = 1;
+let isPredicatedFalse = 1;
+let isTerminator = 1;
+let isBranch = 1;
+let cofMax1 = 1;
+let isNewValue = 1;
+let Defs = [PC];
+let BaseOpcode = "J4_cmpeqi";
+let isTaken = Inst{13};
+let isExtendable = 1;
+let opExtendable = 2;
+let isExtentSigned = 1;
+let opExtentBits = 11;
+let opExtentAlign = 2;
+let opNewValue = 0;
+}
+def J4_cmpeqi_fp0_jump_nt : HInst<
+(outs),
+(ins GeneralSubRegs:$Rs16, u5_0Imm:$II, b30_2Imm:$Ii),
+"p0 = cmp.eq($Rs16,#$II); if (!p0.new) jump:nt $Ii",
+COMPOUND_CJ_ARCHDEPSLOT, TypeCJ>, Enc_7305764, PredRel {
+let Inst{0-0} = 0b0;
+let Inst{13-13} = 0b0;
+let Inst{31-22} = 0b0001000001;
+let isPredicated = 1;
+let isPredicatedFalse = 1;
+let isTerminator = 1;
+let isBranch = 1;
+let isPredicatedNew = 1;
+let Uses = [P0];
+let Defs = [P0, PC];
+let BaseOpcode = "J4_cmpeqip0";
+let isTaken = Inst{13};
+let isExtendable = 1;
+let opExtendable = 2;
+let isExtentSigned = 1;
+let opExtentBits = 11;
+let opExtentAlign = 2;
+}
+def J4_cmpeqi_fp0_jump_t : HInst<
+(outs),
+(ins GeneralSubRegs:$Rs16, u5_0Imm:$II, b30_2Imm:$Ii),
+"p0 = cmp.eq($Rs16,#$II); if (!p0.new) jump:t $Ii",
+COMPOUND_CJ_ARCHDEPSLOT, TypeCJ>, Enc_7305764, PredRel {
+let Inst{0-0} = 0b0;
+let Inst{13-13} = 0b1;
+let Inst{31-22} = 0b0001000001;
+let isPredicated = 1;
+let isPredicatedFalse = 1;
+let isTerminator = 1;
+let isBranch = 1;
+let isPredicatedNew = 1;
+let Uses = [P0];
+let Defs = [P0, PC];
+let BaseOpcode = "J4_cmpeqip0";
+let isTaken = Inst{13};
+let isExtendable = 1;
+let opExtendable = 2;
+let isExtentSigned = 1;
+let opExtentBits = 11;
+let opExtentAlign = 2;
+}
+def J4_cmpeqi_fp1_jump_nt : HInst<
+(outs),
+(ins GeneralSubRegs:$Rs16, u5_0Imm:$II, b30_2Imm:$Ii),
+"p1 = cmp.eq($Rs16,#$II); if (!p1.new) jump:nt $Ii",
+COMPOUND_CJ_ARCHDEPSLOT, TypeCJ>, Enc_7305764, PredRel {
+let Inst{0-0} = 0b0;
+let Inst{13-13} = 0b0;
+let Inst{31-22} = 0b0001001001;
+let isPredicated = 1;
+let isPredicatedFalse = 1;
+let isTerminator = 1;
+let isBranch = 1;
+let isPredicatedNew = 1;
+let Uses = [P1];
+let Defs = [P1, PC];
+let BaseOpcode = "J4_cmpeqip1";
+let isTaken = Inst{13};
+let isExtendable = 1;
+let opExtendable = 2;
+let isExtentSigned = 1;
+let opExtentBits = 11;
+let opExtentAlign = 2;
+}
+def J4_cmpeqi_fp1_jump_t : HInst<
+(outs),
+(ins GeneralSubRegs:$Rs16, u5_0Imm:$II, b30_2Imm:$Ii),
+"p1 = cmp.eq($Rs16,#$II); if (!p1.new) jump:t $Ii",
+COMPOUND_CJ_ARCHDEPSLOT, TypeCJ>, Enc_7305764, PredRel {
+let Inst{0-0} = 0b0;
+let Inst{13-13} = 0b1;
+let Inst{31-22} = 0b0001001001;
+let isPredicated = 1;
+let isPredicatedFalse = 1;
+let isTerminator = 1;
+let isBranch = 1;
+let isPredicatedNew = 1;
+let Uses = [P1];
+let Defs = [P1, PC];
+let BaseOpcode = "J4_cmpeqip1";
+let isTaken = Inst{13};
+let isExtendable = 1;
+let opExtendable = 2;
+let isExtentSigned = 1;
+let opExtentBits = 11;
+let opExtentAlign = 2;
+}
+def J4_cmpeqi_t_jumpnv_nt : HInst<
+(outs),
+(ins IntRegs:$Ns8, u5_0Imm:$II, b30_2Imm:$Ii),
+"if (cmp.eq($Ns8.new,#$II)) jump:nt $Ii",
+NCJ_tc_3or4stall_SLOT0, TypeNCJ>, Enc_4397470, PredRel {
+let Inst{0-0} = 0b0;
+let Inst{13-13} = 0b0;
+let Inst{19-19} = 0b0;
+let Inst{31-22} = 0b0010010000;
+let isPredicated = 1;
+let isTerminator = 1;
+let isBranch = 1;
+let cofMax1 = 1;
+let isNewValue = 1;
+let Defs = [PC];
+let BaseOpcode = "J4_cmpeqi";
+let isTaken = Inst{13};
+let isExtendable = 1;
+let opExtendable = 2;
+let isExtentSigned = 1;
+let opExtentBits = 11;
+let opExtentAlign = 2;
+let opNewValue = 0;
+}
+def J4_cmpeqi_t_jumpnv_t : HInst<
+(outs),
+(ins IntRegs:$Ns8, u5_0Imm:$II, b30_2Imm:$Ii),
+"if (cmp.eq($Ns8.new,#$II)) jump:t $Ii",
+NCJ_tc_3or4stall_SLOT0, TypeNCJ>, Enc_4397470, PredRel {
+let Inst{0-0} = 0b0;
+let Inst{13-13} = 0b1;
+let Inst{19-19} = 0b0;
+let Inst{31-22} = 0b0010010000;
+let isPredicated = 1;
+let isTerminator = 1;
+let isBranch = 1;
+let cofMax1 = 1;
+let isNewValue = 1;
+let Defs = [PC];
+let BaseOpcode = "J4_cmpeqi";
+let isTaken = Inst{13};
+let isExtendable = 1;
+let opExtendable = 2;
+let isExtentSigned = 1;
+let opExtentBits = 11;
+let opExtentAlign = 2;
+let opNewValue = 0;
+}
+def J4_cmpeqi_tp0_jump_nt : HInst<
+(outs),
+(ins GeneralSubRegs:$Rs16, u5_0Imm:$II, b30_2Imm:$Ii),
+"p0 = cmp.eq($Rs16,#$II); if (p0.new) jump:nt $Ii",
+COMPOUND_CJ_ARCHDEPSLOT, TypeCJ>, Enc_7305764, PredRel {
+let Inst{0-0} = 0b0;
+let Inst{13-13} = 0b0;
+let Inst{31-22} = 0b0001000000;
+let isPredicated = 1;
+let isTerminator = 1;
+let isBranch = 1;
+let isPredicatedNew = 1;
+let Uses = [P0];
+let Defs = [P0, PC];
+let BaseOpcode = "J4_cmpeqip0";
+let isTaken = Inst{13};
+let isExtendable = 1;
+let opExtendable = 2;
+let isExtentSigned = 1;
+let opExtentBits = 11;
+let opExtentAlign = 2;
+}
+def J4_cmpeqi_tp0_jump_t : HInst<
+(outs),
+(ins GeneralSubRegs:$Rs16, u5_0Imm:$II, b30_2Imm:$Ii),
+"p0 = cmp.eq($Rs16,#$II); if (p0.new) jump:t $Ii",
+COMPOUND_CJ_ARCHDEPSLOT, TypeCJ>, Enc_7305764, PredRel {
+let Inst{0-0} = 0b0;
+let Inst{13-13} = 0b1;
+let Inst{31-22} = 0b0001000000;
+let isPredicated = 1;
+let isTerminator = 1;
+let isBranch = 1;
+let isPredicatedNew = 1;
+let Uses = [P0];
+let Defs = [P0, PC];
+let BaseOpcode = "J4_cmpeqip0";
+let isTaken = Inst{13};
+let isExtendable = 1;
+let opExtendable = 2;
+let isExtentSigned = 1;
+let opExtentBits = 11;
+let opExtentAlign = 2;
+}
+def J4_cmpeqi_tp1_jump_nt : HInst<
+(outs),
+(ins GeneralSubRegs:$Rs16, u5_0Imm:$II, b30_2Imm:$Ii),
+"p1 = cmp.eq($Rs16,#$II); if (p1.new) jump:nt $Ii",
+COMPOUND_CJ_ARCHDEPSLOT, TypeCJ>, Enc_7305764, PredRel {
+let Inst{0-0} = 0b0;
+let Inst{13-13} = 0b0;
+let Inst{31-22} = 0b0001001000;
+let isPredicated = 1;
+let isTerminator = 1;
+let isBranch = 1;
+let isPredicatedNew = 1;
+let Uses = [P1];
+let Defs = [P1, PC];
+let BaseOpcode = "J4_cmpeqip1";
+let isTaken = Inst{13};
+let isExtendable = 1;
+let opExtendable = 2;
+let isExtentSigned = 1;
+let opExtentBits = 11;
+let opExtentAlign = 2;
+}
+def J4_cmpeqi_tp1_jump_t : HInst<
+(outs),
+(ins GeneralSubRegs:$Rs16, u5_0Imm:$II, b30_2Imm:$Ii),
+"p1 = cmp.eq($Rs16,#$II); if (p1.new) jump:t $Ii",
+COMPOUND_CJ_ARCHDEPSLOT, TypeCJ>, Enc_7305764, PredRel {
+let Inst{0-0} = 0b0;
+let Inst{13-13} = 0b1;
+let Inst{31-22} = 0b0001001000;
+let isPredicated = 1;
+let isTerminator = 1;
+let isBranch = 1;
+let isPredicatedNew = 1;
+let Uses = [P1];
+let Defs = [P1, PC];
+let BaseOpcode = "J4_cmpeqip1";
+let isTaken = Inst{13};
+let isExtendable = 1;
+let opExtendable = 2;
+let isExtentSigned = 1;
+let opExtentBits = 11;
+let opExtentAlign = 2;
+}
+def J4_cmpeqn1_f_jumpnv_nt : HInst<
+(outs),
+(ins IntRegs:$Ns8, n1Const:$n1, b30_2Imm:$Ii),
+"if (!cmp.eq($Ns8.new,#$n1)) jump:nt $Ii",
+NCJ_tc_3or4stall_SLOT0, TypeNCJ>, Enc_4359901, PredRel {
+let Inst{0-0} = 0b0;
+let Inst{13-8} = 0b000000;
+let Inst{19-19} = 0b0;
+let Inst{31-22} = 0b0010011001;
+let isPredicated = 1;
+let isPredicatedFalse = 1;
+let isTerminator = 1;
+let isBranch = 1;
+let cofMax1 = 1;
+let isNewValue = 1;
+let Defs = [PC];
+let BaseOpcode = "J4_cmpeqn1r";
+let isTaken = Inst{13};
+let isExtendable = 1;
+let opExtendable = 2;
+let isExtentSigned = 1;
+let opExtentBits = 11;
+let opExtentAlign = 2;
+let opNewValue = 0;
+}
+def J4_cmpeqn1_f_jumpnv_t : HInst<
+(outs),
+(ins IntRegs:$Ns8, n1Const:$n1, b30_2Imm:$Ii),
+"if (!cmp.eq($Ns8.new,#$n1)) jump:t $Ii",
+NCJ_tc_3or4stall_SLOT0, TypeNCJ>, Enc_8612939, PredRel {
+let Inst{0-0} = 0b0;
+let Inst{13-8} = 0b100000;
+let Inst{19-19} = 0b0;
+let Inst{31-22} = 0b0010011001;
+let isPredicated = 1;
+let isPredicatedFalse = 1;
+let isTerminator = 1;
+let isBranch = 1;
+let cofMax1 = 1;
+let isNewValue = 1;
+let Defs = [PC];
+let BaseOpcode = "J4_cmpeqn1r";
+let isTaken = Inst{13};
+let isExtendable = 1;
+let opExtendable = 2;
+let isExtentSigned = 1;
+let opExtentBits = 11;
+let opExtentAlign = 2;
+let opNewValue = 0;
+}
+def J4_cmpeqn1_fp0_jump_nt : HInst<
+(outs),
+(ins GeneralSubRegs:$Rs16, n1Const:$n1, b30_2Imm:$Ii),
+"p0 = cmp.eq($Rs16,#$n1); if (!p0.new) jump:nt $Ii",
+COMPOUND_CJ_ARCHDEPSLOT, TypeCJ>, Enc_844699, PredRel {
+let Inst{0-0} = 0b0;
+let Inst{13-8} = 0b000000;
+let Inst{31-22} = 0b0001000111;
+let isPredicated = 1;
+let isPredicatedFalse = 1;
+let isTerminator = 1;
+let isBranch = 1;
+let isPredicatedNew = 1;
+let Uses = [P0];
+let Defs = [P0, PC];
+let BaseOpcode = "J4_cmpeqn1p0";
+let isTaken = Inst{13};
+let isExtendable = 1;
+let opExtendable = 2;
+let isExtentSigned = 1;
+let opExtentBits = 11;
+let opExtentAlign = 2;
+}
+def J4_cmpeqn1_fp0_jump_t : HInst<
+(outs),
+(ins GeneralSubRegs:$Rs16, n1Const:$n1, b30_2Imm:$Ii),
+"p0 = cmp.eq($Rs16,#$n1); if (!p0.new) jump:t $Ii",
+COMPOUND_CJ_ARCHDEPSLOT, TypeCJ>, Enc_5338033, PredRel {
+let Inst{0-0} = 0b0;
+let Inst{13-8} = 0b100000;
+let Inst{31-22} = 0b0001000111;
+let isPredicated = 1;
+let isPredicatedFalse = 1;
+let isTerminator = 1;
+let isBranch = 1;
+let isPredicatedNew = 1;
+let Uses = [P0];
+let Defs = [P0, PC];
+let BaseOpcode = "J4_cmpeqn1p0";
+let isTaken = Inst{13};
+let isExtendable = 1;
+let opExtendable = 2;
+let isExtentSigned = 1;
+let opExtentBits = 11;
+let opExtentAlign = 2;
+}
+def J4_cmpeqn1_fp1_jump_nt : HInst<
+(outs),
+(ins GeneralSubRegs:$Rs16, n1Const:$n1, b30_2Imm:$Ii),
+"p1 = cmp.eq($Rs16,#$n1); if (!p1.new) jump:nt $Ii",
+COMPOUND_CJ_ARCHDEPSLOT, TypeCJ>, Enc_14150875, PredRel {
+let Inst{0-0} = 0b0;
+let Inst{13-8} = 0b000000;
+let Inst{31-22} = 0b0001001111;
+let isPredicated = 1;
+let isPredicatedFalse = 1;
+let isTerminator = 1;
+let isBranch = 1;
+let isPredicatedNew = 1;
+let Uses = [P1];
+let Defs = [P1, PC];
+let BaseOpcode = "J4_cmpeqn1p1";
+let isTaken = Inst{13};
+let isExtendable = 1;
+let opExtendable = 2;
+let isExtentSigned = 1;
+let opExtentBits = 11;
+let opExtentAlign = 2;
+}
+def J4_cmpeqn1_fp1_jump_t : HInst<
+(outs),
+(ins GeneralSubRegs:$Rs16, n1Const:$n1, b30_2Imm:$Ii),
+"p1 = cmp.eq($Rs16,#$n1); if (!p1.new) jump:t $Ii",
+COMPOUND_CJ_ARCHDEPSLOT, TypeCJ>, Enc_15450971, PredRel {
+let Inst{0-0} = 0b0;
+let Inst{13-8} = 0b100000;
+let Inst{31-22} = 0b0001001111;
+let isPredicated = 1;
+let isPredicatedFalse = 1;
+let isTerminator = 1;
+let isBranch = 1;
+let isPredicatedNew = 1;
+let Uses = [P1];
+let Defs = [P1, PC];
+let BaseOpcode = "J4_cmpeqn1p1";
+let isTaken = Inst{13};
+let isExtendable = 1;
+let opExtendable = 2;
+let isExtentSigned = 1;
+let opExtentBits = 11;
+let opExtentAlign = 2;
+}
+def J4_cmpeqn1_t_jumpnv_nt : HInst<
+(outs),
+(ins IntRegs:$Ns8, n1Const:$n1, b30_2Imm:$Ii),
+"if (cmp.eq($Ns8.new,#$n1)) jump:nt $Ii",
+NCJ_tc_3or4stall_SLOT0, TypeNCJ>, Enc_14998517, PredRel {
+let Inst{0-0} = 0b0;
+let Inst{13-8} = 0b000000;
+let Inst{19-19} = 0b0;
+let Inst{31-22} = 0b0010011000;
+let isPredicated = 1;
+let isTerminator = 1;
+let isBranch = 1;
+let cofMax1 = 1;
+let isNewValue = 1;
+let Defs = [PC];
+let BaseOpcode = "J4_cmpeqn1r";
+let isTaken = Inst{13};
+let isExtendable = 1;
+let opExtendable = 2;
+let isExtentSigned = 1;
+let opExtentBits = 11;
+let opExtentAlign = 2;
+let opNewValue = 0;
+}
+def J4_cmpeqn1_t_jumpnv_t : HInst<
+(outs),
+(ins IntRegs:$Ns8, n1Const:$n1, b30_2Imm:$Ii),
+"if (cmp.eq($Ns8.new,#$n1)) jump:t $Ii",
+NCJ_tc_3or4stall_SLOT0, TypeNCJ>, Enc_11544269, PredRel {
+let Inst{0-0} = 0b0;
+let Inst{13-8} = 0b100000;
+let Inst{19-19} = 0b0;
+let Inst{31-22} = 0b0010011000;
+let isPredicated = 1;
+let isTerminator = 1;
+let isBranch = 1;
+let cofMax1 = 1;
+let isNewValue = 1;
+let Defs = [PC];
+let BaseOpcode = "J4_cmpeqn1r";
+let isTaken = Inst{13};
+let isExtendable = 1;
+let opExtendable = 2;
+let isExtentSigned = 1;
+let opExtentBits = 11;
+let opExtentAlign = 2;
+let opNewValue = 0;
+}
+def J4_cmpeqn1_tp0_jump_nt : HInst<
+(outs),
+(ins GeneralSubRegs:$Rs16, n1Const:$n1, b30_2Imm:$Ii),
+"p0 = cmp.eq($Rs16,#$n1); if (p0.new) jump:nt $Ii",
+COMPOUND_CJ_ARCHDEPSLOT, TypeCJ>, Enc_5401217, PredRel {
+let Inst{0-0} = 0b0;
+let Inst{13-8} = 0b000000;
+let Inst{31-22} = 0b0001000110;
+let isPredicated = 1;
+let isTerminator = 1;
+let isBranch = 1;
+let isPredicatedNew = 1;
+let Uses = [P0];
+let Defs = [P0, PC];
+let BaseOpcode = "J4_cmpeqn1p0";
+let isTaken = Inst{13};
+let isExtendable = 1;
+let opExtendable = 2;
+let isExtentSigned = 1;
+let opExtentBits = 11;
+let opExtentAlign = 2;
+}
+def J4_cmpeqn1_tp0_jump_t : HInst<
+(outs),
+(ins GeneralSubRegs:$Rs16, n1Const:$n1, b30_2Imm:$Ii),
+"p0 = cmp.eq($Rs16,#$n1); if (p0.new) jump:t $Ii",
+COMPOUND_CJ_ARCHDEPSLOT, TypeCJ>, Enc_12419313, PredRel {
+let Inst{0-0} = 0b0;
+let Inst{13-8} = 0b100000;
+let Inst{31-22} = 0b0001000110;
+let isPredicated = 1;
+let isTerminator = 1;
+let isBranch = 1;
+let isPredicatedNew = 1;
+let Uses = [P0];
+let Defs = [P0, PC];
+let BaseOpcode = "J4_cmpeqn1p0";
+let isTaken = Inst{13};
+let isExtendable = 1;
+let opExtendable = 2;
+let isExtentSigned = 1;
+let opExtentBits = 11;
+let opExtentAlign = 2;
+}
+def J4_cmpeqn1_tp1_jump_nt : HInst<
+(outs),
+(ins GeneralSubRegs:$Rs16, n1Const:$n1, b30_2Imm:$Ii),
+"p1 = cmp.eq($Rs16,#$n1); if (p1.new) jump:nt $Ii",
+COMPOUND_CJ_ARCHDEPSLOT, TypeCJ>, Enc_4684887, PredRel {
+let Inst{0-0} = 0b0;
+let Inst{13-8} = 0b000000;
+let Inst{31-22} = 0b0001001110;
+let isPredicated = 1;
+let isTerminator = 1;
+let isBranch = 1;
+let isPredicatedNew = 1;
+let Uses = [P1];
+let Defs = [P1, PC];
+let BaseOpcode = "J4_cmpeqn1p1";
+let isTaken = Inst{13};
+let isExtendable = 1;
+let opExtendable = 2;
+let isExtentSigned = 1;
+let opExtentBits = 11;
+let opExtentAlign = 2;
+}
+def J4_cmpeqn1_tp1_jump_t : HInst<
+(outs),
+(ins GeneralSubRegs:$Rs16, n1Const:$n1, b30_2Imm:$Ii),
+"p1 = cmp.eq($Rs16,#$n1); if (p1.new) jump:t $Ii",
+COMPOUND_CJ_ARCHDEPSLOT, TypeCJ>, Enc_220949, PredRel {
+let Inst{0-0} = 0b0;
+let Inst{13-8} = 0b100000;
+let Inst{31-22} = 0b0001001110;
+let isPredicated = 1;
+let isTerminator = 1;
+let isBranch = 1;
+let isPredicatedNew = 1;
+let Uses = [P1];
+let Defs = [P1, PC];
+let BaseOpcode = "J4_cmpeqn1p1";
+let isTaken = Inst{13};
+let isExtendable = 1;
+let opExtendable = 2;
+let isExtentSigned = 1;
+let opExtentBits = 11;
+let opExtentAlign = 2;
+}
+def J4_cmpgt_f_jumpnv_nt : HInst<
+(outs),
+(ins IntRegs:$Ns8, IntRegs:$Rt32, b30_2Imm:$Ii),
+"if (!cmp.gt($Ns8.new,$Rt32)) jump:nt $Ii",
+NCJ_tc_3or4stall_SLOT0, TypeNCJ>, Enc_15140689, PredRel {
+let Inst{0-0} = 0b0;
+let Inst{13-13} = 0b0;
+let Inst{19-19} = 0b0;
+let Inst{31-22} = 0b0010000011;
+let isPredicated = 1;
+let isPredicatedFalse = 1;
+let isTerminator = 1;
+let isBranch = 1;
+let cofMax1 = 1;
+let isNewValue = 1;
+let Defs = [PC];
+let BaseOpcode = "J4_cmpgtr";
+let isTaken = Inst{13};
+let isExtendable = 1;
+let opExtendable = 2;
+let isExtentSigned = 1;
+let opExtentBits = 11;
+let opExtentAlign = 2;
+let opNewValue = 0;
+}
+def J4_cmpgt_f_jumpnv_t : HInst<
+(outs),
+(ins IntRegs:$Ns8, IntRegs:$Rt32, b30_2Imm:$Ii),
+"if (!cmp.gt($Ns8.new,$Rt32)) jump:t $Ii",
+NCJ_tc_3or4stall_SLOT0, TypeNCJ>, Enc_15140689, PredRel {
+let Inst{0-0} = 0b0;
+let Inst{13-13} = 0b1;
+let Inst{19-19} = 0b0;
+let Inst{31-22} = 0b0010000011;
+let isPredicated = 1;
+let isPredicatedFalse = 1;
+let isTerminator = 1;
+let isBranch = 1;
+let cofMax1 = 1;
+let isNewValue = 1;
+let Defs = [PC];
+let BaseOpcode = "J4_cmpgtr";
+let isTaken = Inst{13};
+let isExtendable = 1;
+let opExtendable = 2;
+let isExtentSigned = 1;
+let opExtentBits = 11;
+let opExtentAlign = 2;
+let opNewValue = 0;
+}
+def J4_cmpgt_fp0_jump_nt : HInst<
+(outs),
+(ins GeneralSubRegs:$Rs16, GeneralSubRegs:$Rt16, b30_2Imm:$Ii),
+"p0 = cmp.gt($Rs16,$Rt16); if (!p0.new) jump:nt $Ii",
+COMPOUND_CJ_ARCHDEPSLOT, TypeCJ>, Enc_14264243, PredRel {
+let Inst{0-0} = 0b0;
+let Inst{13-12} = 0b00;
+let Inst{31-22} = 0b0001010011;
+let isPredicated = 1;
+let isPredicatedFalse = 1;
+let isTerminator = 1;
+let isBranch = 1;
+let isPredicatedNew = 1;
+let Uses = [P0];
+let Defs = [P0, PC];
+let BaseOpcode = "J4_cmpgtp0";
+let isTaken = Inst{13};
+let isExtendable = 1;
+let opExtendable = 2;
+let isExtentSigned = 1;
+let opExtentBits = 11;
+let opExtentAlign = 2;
+}
+def J4_cmpgt_fp0_jump_t : HInst<
+(outs),
+(ins GeneralSubRegs:$Rs16, GeneralSubRegs:$Rt16, b30_2Imm:$Ii),
+"p0 = cmp.gt($Rs16,$Rt16); if (!p0.new) jump:t $Ii",
+COMPOUND_CJ_ARCHDEPSLOT, TypeCJ>, Enc_14264243, PredRel {
+let Inst{0-0} = 0b0;
+let Inst{13-12} = 0b10;
+let Inst{31-22} = 0b0001010011;
+let isPredicated = 1;
+let isPredicatedFalse = 1;
+let isTerminator = 1;
+let isBranch = 1;
+let isPredicatedNew = 1;
+let Uses = [P0];
+let Defs = [P0, PC];
+let BaseOpcode = "J4_cmpgtp0";
+let isTaken = Inst{13};
+let isExtendable = 1;
+let opExtendable = 2;
+let isExtentSigned = 1;
+let opExtentBits = 11;
+let opExtentAlign = 2;
+}
+def J4_cmpgt_fp1_jump_nt : HInst<
+(outs),
+(ins GeneralSubRegs:$Rs16, GeneralSubRegs:$Rt16, b30_2Imm:$Ii),
+"p1 = cmp.gt($Rs16,$Rt16); if (!p1.new) jump:nt $Ii",
+COMPOUND_CJ_ARCHDEPSLOT, TypeCJ>, Enc_14264243, PredRel {
+let Inst{0-0} = 0b0;
+let Inst{13-12} = 0b01;
+let Inst{31-22} = 0b0001010011;
+let isPredicated = 1;
+let isPredicatedFalse = 1;
+let isTerminator = 1;
+let isBranch = 1;
+let isPredicatedNew = 1;
+let Uses = [P1];
+let Defs = [P1, PC];
+let BaseOpcode = "J4_cmpgtp1";
+let isTaken = Inst{13};
+let isExtendable = 1;
+let opExtendable = 2;
+let isExtentSigned = 1;
+let opExtentBits = 11;
+let opExtentAlign = 2;
+}
+def J4_cmpgt_fp1_jump_t : HInst<
+(outs),
+(ins GeneralSubRegs:$Rs16, GeneralSubRegs:$Rt16, b30_2Imm:$Ii),
+"p1 = cmp.gt($Rs16,$Rt16); if (!p1.new) jump:t $Ii",
+COMPOUND_CJ_ARCHDEPSLOT, TypeCJ>, Enc_14264243, PredRel {
+let Inst{0-0} = 0b0;
+let Inst{13-12} = 0b11;
+let Inst{31-22} = 0b0001010011;
+let isPredicated = 1;
+let isPredicatedFalse = 1;
+let isTerminator = 1;
+let isBranch = 1;
+let isPredicatedNew = 1;
+let Uses = [P1];
+let Defs = [P1, PC];
+let BaseOpcode = "J4_cmpgtp1";
+let isTaken = Inst{13};
+let isExtendable = 1;
+let opExtendable = 2;
+let isExtentSigned = 1;
+let opExtentBits = 11;
+let opExtentAlign = 2;
+}
+def J4_cmpgt_t_jumpnv_nt : HInst<
+(outs),
+(ins IntRegs:$Ns8, IntRegs:$Rt32, b30_2Imm:$Ii),
+"if (cmp.gt($Ns8.new,$Rt32)) jump:nt $Ii",
+NCJ_tc_3or4stall_SLOT0, TypeNCJ>, Enc_15140689, PredRel {
+let Inst{0-0} = 0b0;
+let Inst{13-13} = 0b0;
+let Inst{19-19} = 0b0;
+let Inst{31-22} = 0b0010000010;
+let isPredicated = 1;
+let isTerminator = 1;
+let isBranch = 1;
+let cofMax1 = 1;
+let isNewValue = 1;
+let Defs = [PC];
+let BaseOpcode = "J4_cmpgtr";
+let isTaken = Inst{13};
+let isExtendable = 1;
+let opExtendable = 2;
+let isExtentSigned = 1;
+let opExtentBits = 11;
+let opExtentAlign = 2;
+let opNewValue = 0;
+}
+def J4_cmpgt_t_jumpnv_t : HInst<
+(outs),
+(ins IntRegs:$Ns8, IntRegs:$Rt32, b30_2Imm:$Ii),
+"if (cmp.gt($Ns8.new,$Rt32)) jump:t $Ii",
+NCJ_tc_3or4stall_SLOT0, TypeNCJ>, Enc_15140689, PredRel {
+let Inst{0-0} = 0b0;
+let Inst{13-13} = 0b1;
+let Inst{19-19} = 0b0;
+let Inst{31-22} = 0b0010000010;
+let isPredicated = 1;
+let isTerminator = 1;
+let isBranch = 1;
+let cofMax1 = 1;
+let isNewValue = 1;
+let Defs = [PC];
+let BaseOpcode = "J4_cmpgtr";
+let isTaken = Inst{13};
+let isExtendable = 1;
+let opExtendable = 2;
+let isExtentSigned = 1;
+let opExtentBits = 11;
+let opExtentAlign = 2;
+let opNewValue = 0;
+}
+def J4_cmpgt_tp0_jump_nt : HInst<
+(outs),
+(ins GeneralSubRegs:$Rs16, GeneralSubRegs:$Rt16, b30_2Imm:$Ii),
+"p0 = cmp.gt($Rs16,$Rt16); if (p0.new) jump:nt $Ii",
+COMPOUND_CJ_ARCHDEPSLOT, TypeCJ>, Enc_14264243, PredRel {
+let Inst{0-0} = 0b0;
+let Inst{13-12} = 0b00;
+let Inst{31-22} = 0b0001010010;
+let isPredicated = 1;
+let isTerminator = 1;
+let isBranch = 1;
+let isPredicatedNew = 1;
+let Uses = [P0];
+let Defs = [P0, PC];
+let BaseOpcode = "J4_cmpgtp0";
+let isTaken = Inst{13};
+let isExtendable = 1;
+let opExtendable = 2;
+let isExtentSigned = 1;
+let opExtentBits = 11;
+let opExtentAlign = 2;
+}
+def J4_cmpgt_tp0_jump_t : HInst<
+(outs),
+(ins GeneralSubRegs:$Rs16, GeneralSubRegs:$Rt16, b30_2Imm:$Ii),
+"p0 = cmp.gt($Rs16,$Rt16); if (p0.new) jump:t $Ii",
+COMPOUND_CJ_ARCHDEPSLOT, TypeCJ>, Enc_14264243, PredRel {
+let Inst{0-0} = 0b0;
+let Inst{13-12} = 0b10;
+let Inst{31-22} = 0b0001010010;
+let isPredicated = 1;
+let isTerminator = 1;
+let isBranch = 1;
+let isPredicatedNew = 1;
+let Uses = [P0];
+let Defs = [P0, PC];
+let BaseOpcode = "J4_cmpgtp0";
+let isTaken = Inst{13};
+let isExtendable = 1;
+let opExtendable = 2;
+let isExtentSigned = 1;
+let opExtentBits = 11;
+let opExtentAlign = 2;
+}
+def J4_cmpgt_tp1_jump_nt : HInst<
+(outs),
+(ins GeneralSubRegs:$Rs16, GeneralSubRegs:$Rt16, b30_2Imm:$Ii),
+"p1 = cmp.gt($Rs16,$Rt16); if (p1.new) jump:nt $Ii",
+COMPOUND_CJ_ARCHDEPSLOT, TypeCJ>, Enc_14264243, PredRel {
+let Inst{0-0} = 0b0;
+let Inst{13-12} = 0b01;
+let Inst{31-22} = 0b0001010010;
+let isPredicated = 1;
+let isTerminator = 1;
+let isBranch = 1;
+let isPredicatedNew = 1;
+let Uses = [P1];
+let Defs = [P1, PC];
+let BaseOpcode = "J4_cmpgtp1";
+let isTaken = Inst{13};
+let isExtendable = 1;
+let opExtendable = 2;
+let isExtentSigned = 1;
+let opExtentBits = 11;
+let opExtentAlign = 2;
+}
+def J4_cmpgt_tp1_jump_t : HInst<
+(outs),
+(ins GeneralSubRegs:$Rs16, GeneralSubRegs:$Rt16, b30_2Imm:$Ii),
+"p1 = cmp.gt($Rs16,$Rt16); if (p1.new) jump:t $Ii",
+COMPOUND_CJ_ARCHDEPSLOT, TypeCJ>, Enc_14264243, PredRel {
+let Inst{0-0} = 0b0;
+let Inst{13-12} = 0b11;
+let Inst{31-22} = 0b0001010010;
+let isPredicated = 1;
+let isTerminator = 1;
+let isBranch = 1;
+let isPredicatedNew = 1;
+let Uses = [P1];
+let Defs = [P1, PC];
+let BaseOpcode = "J4_cmpgtp1";
+let isTaken = Inst{13};
+let isExtendable = 1;
+let opExtendable = 2;
+let isExtentSigned = 1;
+let opExtentBits = 11;
+let opExtentAlign = 2;
+}
+def J4_cmpgti_f_jumpnv_nt : HInst<
+(outs),
+(ins IntRegs:$Ns8, u5_0Imm:$II, b30_2Imm:$Ii),
+"if (!cmp.gt($Ns8.new,#$II)) jump:nt $Ii",
+NCJ_tc_3or4stall_SLOT0, TypeNCJ>, Enc_4397470, PredRel {
+let Inst{0-0} = 0b0;
+let Inst{13-13} = 0b0;
+let Inst{19-19} = 0b0;
+let Inst{31-22} = 0b0010010011;
+let isPredicated = 1;
+let isPredicatedFalse = 1;
+let isTerminator = 1;
+let isBranch = 1;
+let cofMax1 = 1;
+let isNewValue = 1;
+let Defs = [PC];
+let BaseOpcode = "J4_cmpgtir";
+let isTaken = Inst{13};
+let isExtendable = 1;
+let opExtendable = 2;
+let isExtentSigned = 1;
+let opExtentBits = 11;
+let opExtentAlign = 2;
+let opNewValue = 0;
+}
+def J4_cmpgti_f_jumpnv_t : HInst<
+(outs),
+(ins IntRegs:$Ns8, u5_0Imm:$II, b30_2Imm:$Ii),
+"if (!cmp.gt($Ns8.new,#$II)) jump:t $Ii",
+NCJ_tc_3or4stall_SLOT0, TypeNCJ>, Enc_4397470, PredRel {
+let Inst{0-0} = 0b0;
+let Inst{13-13} = 0b1;
+let Inst{19-19} = 0b0;
+let Inst{31-22} = 0b0010010011;
+let isPredicated = 1;
+let isPredicatedFalse = 1;
+let isTerminator = 1;
+let isBranch = 1;
+let cofMax1 = 1;
+let isNewValue = 1;
+let Defs = [PC];
+let BaseOpcode = "J4_cmpgtir";
+let isTaken = Inst{13};
+let isExtendable = 1;
+let opExtendable = 2;
+let isExtentSigned = 1;
+let opExtentBits = 11;
+let opExtentAlign = 2;
+let opNewValue = 0;
+}
+def J4_cmpgti_fp0_jump_nt : HInst<
+(outs),
+(ins GeneralSubRegs:$Rs16, u5_0Imm:$II, b30_2Imm:$Ii),
+"p0 = cmp.gt($Rs16,#$II); if (!p0.new) jump:nt $Ii",
+COMPOUND_CJ_ARCHDEPSLOT, TypeCJ>, Enc_7305764, PredRel {
+let Inst{0-0} = 0b0;
+let Inst{13-13} = 0b0;
+let Inst{31-22} = 0b0001000011;
+let isPredicated = 1;
+let isPredicatedFalse = 1;
+let isTerminator = 1;
+let isBranch = 1;
+let isPredicatedNew = 1;
+let Uses = [P0];
+let Defs = [P0, PC];
+let BaseOpcode = "J4_cmpgtip0";
+let isTaken = Inst{13};
+let isExtendable = 1;
+let opExtendable = 2;
+let isExtentSigned = 1;
+let opExtentBits = 11;
+let opExtentAlign = 2;
+}
+def J4_cmpgti_fp0_jump_t : HInst<
+(outs),
+(ins GeneralSubRegs:$Rs16, u5_0Imm:$II, b30_2Imm:$Ii),
+"p0 = cmp.gt($Rs16,#$II); if (!p0.new) jump:t $Ii",
+COMPOUND_CJ_ARCHDEPSLOT, TypeCJ>, Enc_7305764, PredRel {
+let Inst{0-0} = 0b0;
+let Inst{13-13} = 0b1;
+let Inst{31-22} = 0b0001000011;
+let isPredicated = 1;
+let isPredicatedFalse = 1;
+let isTerminator = 1;
+let isBranch = 1;
+let isPredicatedNew = 1;
+let Uses = [P0];
+let Defs = [P0, PC];
+let BaseOpcode = "J4_cmpgtip0";
+let isTaken = Inst{13};
+let isExtendable = 1;
+let opExtendable = 2;
+let isExtentSigned = 1;
+let opExtentBits = 11;
+let opExtentAlign = 2;
+}
+def J4_cmpgti_fp1_jump_nt : HInst<
+(outs),
+(ins GeneralSubRegs:$Rs16, u5_0Imm:$II, b30_2Imm:$Ii),
+"p1 = cmp.gt($Rs16,#$II); if (!p1.new) jump:nt $Ii",
+COMPOUND_CJ_ARCHDEPSLOT, TypeCJ>, Enc_7305764, PredRel {
+let Inst{0-0} = 0b0;
+let Inst{13-13} = 0b0;
+let Inst{31-22} = 0b0001001011;
+let isPredicated = 1;
+let isPredicatedFalse = 1;
+let isTerminator = 1;
+let isBranch = 1;
+let isPredicatedNew = 1;
+let Uses = [P1];
+let Defs = [P1, PC];
+let BaseOpcode = "J4_cmpgtip1";
+let isTaken = Inst{13};
+let isExtendable = 1;
+let opExtendable = 2;
+let isExtentSigned = 1;
+let opExtentBits = 11;
+let opExtentAlign = 2;
+}
+def J4_cmpgti_fp1_jump_t : HInst<
+(outs),
+(ins GeneralSubRegs:$Rs16, u5_0Imm:$II, b30_2Imm:$Ii),
+"p1 = cmp.gt($Rs16,#$II); if (!p1.new) jump:t $Ii",
+COMPOUND_CJ_ARCHDEPSLOT, TypeCJ>, Enc_7305764, PredRel {
+let Inst{0-0} = 0b0;
+let Inst{13-13} = 0b1;
+let Inst{31-22} = 0b0001001011;
+let isPredicated = 1;
+let isPredicatedFalse = 1;
+let isTerminator = 1;
+let isBranch = 1;
+let isPredicatedNew = 1;
+let Uses = [P1];
+let Defs = [P1, PC];
+let BaseOpcode = "J4_cmpgtip1";
+let isTaken = Inst{13};
+let isExtendable = 1;
+let opExtendable = 2;
+let isExtentSigned = 1;
+let opExtentBits = 11;
+let opExtentAlign = 2;
+}
+def J4_cmpgti_t_jumpnv_nt : HInst<
+(outs),
+(ins IntRegs:$Ns8, u5_0Imm:$II, b30_2Imm:$Ii),
+"if (cmp.gt($Ns8.new,#$II)) jump:nt $Ii",
+NCJ_tc_3or4stall_SLOT0, TypeNCJ>, Enc_4397470, PredRel {
+let Inst{0-0} = 0b0;
+let Inst{13-13} = 0b0;
+let Inst{19-19} = 0b0;
+let Inst{31-22} = 0b0010010010;
+let isPredicated = 1;
+let isTerminator = 1;
+let isBranch = 1;
+let cofMax1 = 1;
+let isNewValue = 1;
+let Defs = [PC];
+let BaseOpcode = "J4_cmpgtir";
+let isTaken = Inst{13};
+let isExtendable = 1;
+let opExtendable = 2;
+let isExtentSigned = 1;
+let opExtentBits = 11;
+let opExtentAlign = 2;
+let opNewValue = 0;
+}
+def J4_cmpgti_t_jumpnv_t : HInst<
+(outs),
+(ins IntRegs:$Ns8, u5_0Imm:$II, b30_2Imm:$Ii),
+"if (cmp.gt($Ns8.new,#$II)) jump:t $Ii",
+NCJ_tc_3or4stall_SLOT0, TypeNCJ>, Enc_4397470, PredRel {
+let Inst{0-0} = 0b0;
+let Inst{13-13} = 0b1;
+let Inst{19-19} = 0b0;
+let Inst{31-22} = 0b0010010010;
+let isPredicated = 1;
+let isTerminator = 1;
+let isBranch = 1;
+let cofMax1 = 1;
+let isNewValue = 1;
+let Defs = [PC];
+let BaseOpcode = "J4_cmpgtir";
+let isTaken = Inst{13};
+let isExtendable = 1;
+let opExtendable = 2;
+let isExtentSigned = 1;
+let opExtentBits = 11;
+let opExtentAlign = 2;
+let opNewValue = 0;
+}
+def J4_cmpgti_tp0_jump_nt : HInst<
+(outs),
+(ins GeneralSubRegs:$Rs16, u5_0Imm:$II, b30_2Imm:$Ii),
+"p0 = cmp.gt($Rs16,#$II); if (p0.new) jump:nt $Ii",
+COMPOUND_CJ_ARCHDEPSLOT, TypeCJ>, Enc_7305764, PredRel {
+let Inst{0-0} = 0b0;
+let Inst{13-13} = 0b0;
+let Inst{31-22} = 0b0001000010;
+let isPredicated = 1;
+let isTerminator = 1;
+let isBranch = 1;
+let isPredicatedNew = 1;
+let Uses = [P0];
+let Defs = [P0, PC];
+let BaseOpcode = "J4_cmpgtip0";
+let isTaken = Inst{13};
+let isExtendable = 1;
+let opExtendable = 2;
+let isExtentSigned = 1;
+let opExtentBits = 11;
+let opExtentAlign = 2;
+}
+def J4_cmpgti_tp0_jump_t : HInst<
+(outs),
+(ins GeneralSubRegs:$Rs16, u5_0Imm:$II, b30_2Imm:$Ii),
+"p0 = cmp.gt($Rs16,#$II); if (p0.new) jump:t $Ii",
+COMPOUND_CJ_ARCHDEPSLOT, TypeCJ>, Enc_7305764, PredRel {
+let Inst{0-0} = 0b0;
+let Inst{13-13} = 0b1;
+let Inst{31-22} = 0b0001000010;
+let isPredicated = 1;
+let isTerminator = 1;
+let isBranch = 1;
+let isPredicatedNew = 1;
+let Uses = [P0];
+let Defs = [P0, PC];
+let BaseOpcode = "J4_cmpgtip0";
+let isTaken = Inst{13};
+let isExtendable = 1;
+let opExtendable = 2;
+let isExtentSigned = 1;
+let opExtentBits = 11;
+let opExtentAlign = 2;
+}
+def J4_cmpgti_tp1_jump_nt : HInst<
+(outs),
+(ins GeneralSubRegs:$Rs16, u5_0Imm:$II, b30_2Imm:$Ii),
+"p1 = cmp.gt($Rs16,#$II); if (p1.new) jump:nt $Ii",
+COMPOUND_CJ_ARCHDEPSLOT, TypeCJ>, Enc_7305764, PredRel {
+let Inst{0-0} = 0b0;
+let Inst{13-13} = 0b0;
+let Inst{31-22} = 0b0001001010;
+let isPredicated = 1;
+let isTerminator = 1;
+let isBranch = 1;
+let isPredicatedNew = 1;
+let Uses = [P1];
+let Defs = [P1, PC];
+let BaseOpcode = "J4_cmpgtip1";
+let isTaken = Inst{13};
+let isExtendable = 1;
+let opExtendable = 2;
+let isExtentSigned = 1;
+let opExtentBits = 11;
+let opExtentAlign = 2;
+}
+def J4_cmpgti_tp1_jump_t : HInst<
+(outs),
+(ins GeneralSubRegs:$Rs16, u5_0Imm:$II, b30_2Imm:$Ii),
+"p1 = cmp.gt($Rs16,#$II); if (p1.new) jump:t $Ii",
+COMPOUND_CJ_ARCHDEPSLOT, TypeCJ>, Enc_7305764, PredRel {
+let Inst{0-0} = 0b0;
+let Inst{13-13} = 0b1;
+let Inst{31-22} = 0b0001001010;
+let isPredicated = 1;
+let isTerminator = 1;
+let isBranch = 1;
+let isPredicatedNew = 1;
+let Uses = [P1];
+let Defs = [P1, PC];
+let BaseOpcode = "J4_cmpgtip1";
+let isTaken = Inst{13};
+let isExtendable = 1;
+let opExtendable = 2;
+let isExtentSigned = 1;
+let opExtentBits = 11;
+let opExtentAlign = 2;
+}
+def J4_cmpgtn1_f_jumpnv_nt : HInst<
+(outs),
+(ins IntRegs:$Ns8, n1Const:$n1, b30_2Imm:$Ii),
+"if (!cmp.gt($Ns8.new,#$n1)) jump:nt $Ii",
+NCJ_tc_3or4stall_SLOT0, TypeNCJ>, Enc_8674673, PredRel {
+let Inst{0-0} = 0b0;
+let Inst{13-8} = 0b000000;
+let Inst{19-19} = 0b0;
+let Inst{31-22} = 0b0010011011;
+let isPredicated = 1;
+let isPredicatedFalse = 1;
+let isTerminator = 1;
+let isBranch = 1;
+let cofMax1 = 1;
+let isNewValue = 1;
+let Defs = [PC];
+let BaseOpcode = "J4_cmpgtn1r";
+let isTaken = Inst{13};
+let isExtendable = 1;
+let opExtendable = 2;
+let isExtentSigned = 1;
+let opExtentBits = 11;
+let opExtentAlign = 2;
+let opNewValue = 0;
+}
+def J4_cmpgtn1_f_jumpnv_t : HInst<
+(outs),
+(ins IntRegs:$Ns8, n1Const:$n1, b30_2Imm:$Ii),
+"if (!cmp.gt($Ns8.new,#$n1)) jump:t $Ii",
+NCJ_tc_3or4stall_SLOT0, TypeNCJ>, Enc_15763937, PredRel {
+let Inst{0-0} = 0b0;
+let Inst{13-8} = 0b100000;
+let Inst{19-19} = 0b0;
+let Inst{31-22} = 0b0010011011;
+let isPredicated = 1;
+let isPredicatedFalse = 1;
+let isTerminator = 1;
+let isBranch = 1;
+let cofMax1 = 1;
+let isNewValue = 1;
+let Defs = [PC];
+let BaseOpcode = "J4_cmpgtn1r";
+let isTaken = Inst{13};
+let isExtendable = 1;
+let opExtendable = 2;
+let isExtentSigned = 1;
+let opExtentBits = 11;
+let opExtentAlign = 2;
+let opNewValue = 0;
+}
+def J4_cmpgtn1_fp0_jump_nt : HInst<
+(outs),
+(ins GeneralSubRegs:$Rs16, n1Const:$n1, b30_2Imm:$Ii),
+"p0 = cmp.gt($Rs16,#$n1); if (!p0.new) jump:nt $Ii",
+COMPOUND_CJ_ARCHDEPSLOT, TypeCJ>, Enc_5915771, PredRel {
+let Inst{0-0} = 0b0;
+let Inst{13-8} = 0b000001;
+let Inst{31-22} = 0b0001000111;
+let isPredicated = 1;
+let isPredicatedFalse = 1;
+let isTerminator = 1;
+let isBranch = 1;
+let isPredicatedNew = 1;
+let Uses = [P0];
+let Defs = [P0, PC];
+let BaseOpcode = "J4_cmpgtn1p0";
+let isTaken = Inst{13};
+let isExtendable = 1;
+let opExtendable = 2;
+let isExtentSigned = 1;
+let opExtentBits = 11;
+let opExtentAlign = 2;
+}
+def J4_cmpgtn1_fp0_jump_t : HInst<
+(outs),
+(ins GeneralSubRegs:$Rs16, n1Const:$n1, b30_2Imm:$Ii),
+"p0 = cmp.gt($Rs16,#$n1); if (!p0.new) jump:t $Ii",
+COMPOUND_CJ_ARCHDEPSLOT, TypeCJ>, Enc_7315939, PredRel {
+let Inst{0-0} = 0b0;
+let Inst{13-8} = 0b100001;
+let Inst{31-22} = 0b0001000111;
+let isPredicated = 1;
+let isPredicatedFalse = 1;
+let isTerminator = 1;
+let isBranch = 1;
+let isPredicatedNew = 1;
+let Uses = [P0];
+let Defs = [P0, PC];
+let BaseOpcode = "J4_cmpgtn1p0";
+let isTaken = Inst{13};
+let isExtendable = 1;
+let opExtendable = 2;
+let isExtentSigned = 1;
+let opExtentBits = 11;
+let opExtentAlign = 2;
+}
+def J4_cmpgtn1_fp1_jump_nt : HInst<
+(outs),
+(ins GeneralSubRegs:$Rs16, n1Const:$n1, b30_2Imm:$Ii),
+"p1 = cmp.gt($Rs16,#$n1); if (!p1.new) jump:nt $Ii",
+COMPOUND_CJ_ARCHDEPSLOT, TypeCJ>, Enc_7785569, PredRel {
+let Inst{0-0} = 0b0;
+let Inst{13-8} = 0b000001;
+let Inst{31-22} = 0b0001001111;
+let isPredicated = 1;
+let isPredicatedFalse = 1;
+let isTerminator = 1;
+let isBranch = 1;
+let isPredicatedNew = 1;
+let Uses = [P1];
+let Defs = [P1, PC];
+let BaseOpcode = "J4_cmpgtn1p1";
+let isTaken = Inst{13};
+let isExtendable = 1;
+let opExtendable = 2;
+let isExtentSigned = 1;
+let opExtentBits = 11;
+let opExtentAlign = 2;
+}
+def J4_cmpgtn1_fp1_jump_t : HInst<
+(outs),
+(ins GeneralSubRegs:$Rs16, n1Const:$n1, b30_2Imm:$Ii),
+"p1 = cmp.gt($Rs16,#$n1); if (!p1.new) jump:t $Ii",
+COMPOUND_CJ_ARCHDEPSLOT, TypeCJ>, Enc_10968391, PredRel {
+let Inst{0-0} = 0b0;
+let Inst{13-8} = 0b100001;
+let Inst{31-22} = 0b0001001111;
+let isPredicated = 1;
+let isPredicatedFalse = 1;
+let isTerminator = 1;
+let isBranch = 1;
+let isPredicatedNew = 1;
+let Uses = [P1];
+let Defs = [P1, PC];
+let BaseOpcode = "J4_cmpgtn1p1";
+let isTaken = Inst{13};
+let isExtendable = 1;
+let opExtendable = 2;
+let isExtentSigned = 1;
+let opExtentBits = 11;
+let opExtentAlign = 2;
+}
+def J4_cmpgtn1_t_jumpnv_nt : HInst<
+(outs),
+(ins IntRegs:$Ns8, n1Const:$n1, b30_2Imm:$Ii),
+"if (cmp.gt($Ns8.new,#$n1)) jump:nt $Ii",
+NCJ_tc_3or4stall_SLOT0, TypeNCJ>, Enc_364753, PredRel {
+let Inst{0-0} = 0b0;
+let Inst{13-8} = 0b000000;
+let Inst{19-19} = 0b0;
+let Inst{31-22} = 0b0010011010;
+let isPredicated = 1;
+let isTerminator = 1;
+let isBranch = 1;
+let cofMax1 = 1;
+let isNewValue = 1;
+let Defs = [PC];
+let BaseOpcode = "J4_cmpgtn1r";
+let isTaken = Inst{13};
+let isExtendable = 1;
+let opExtendable = 2;
+let isExtentSigned = 1;
+let opExtentBits = 11;
+let opExtentAlign = 2;
+let opNewValue = 0;
+}
+def J4_cmpgtn1_t_jumpnv_t : HInst<
+(outs),
+(ins IntRegs:$Ns8, n1Const:$n1, b30_2Imm:$Ii),
+"if (cmp.gt($Ns8.new,#$n1)) jump:t $Ii",
+NCJ_tc_3or4stall_SLOT0, TypeNCJ>, Enc_8479583, PredRel {
+let Inst{0-0} = 0b0;
+let Inst{13-8} = 0b100000;
+let Inst{19-19} = 0b0;
+let Inst{31-22} = 0b0010011010;
+let isPredicated = 1;
+let isTerminator = 1;
+let isBranch = 1;
+let cofMax1 = 1;
+let isNewValue = 1;
+let Defs = [PC];
+let BaseOpcode = "J4_cmpgtn1r";
+let isTaken = Inst{13};
+let isExtendable = 1;
+let opExtendable = 2;
+let isExtentSigned = 1;
+let opExtentBits = 11;
+let opExtentAlign = 2;
+let opNewValue = 0;
+}
+def J4_cmpgtn1_tp0_jump_nt : HInst<
+(outs),
+(ins GeneralSubRegs:$Rs16, n1Const:$n1, b30_2Imm:$Ii),
+"p0 = cmp.gt($Rs16,#$n1); if (p0.new) jump:nt $Ii",
+COMPOUND_CJ_ARCHDEPSLOT, TypeCJ>, Enc_2428539, PredRel {
+let Inst{0-0} = 0b0;
+let Inst{13-8} = 0b000001;
+let Inst{31-22} = 0b0001000110;
+let isPredicated = 1;
+let isTerminator = 1;
+let isBranch = 1;
+let isPredicatedNew = 1;
+let Uses = [P0];
+let Defs = [P0, PC];
+let BaseOpcode = "J4_cmpgtn1p0";
+let isTaken = Inst{13};
+let isExtendable = 1;
+let opExtendable = 2;
+let isExtentSigned = 1;
+let opExtentBits = 11;
+let opExtentAlign = 2;
+}
+def J4_cmpgtn1_tp0_jump_t : HInst<
+(outs),
+(ins GeneralSubRegs:$Rs16, n1Const:$n1, b30_2Imm:$Ii),
+"p0 = cmp.gt($Rs16,#$n1); if (p0.new) jump:t $Ii",
+COMPOUND_CJ_ARCHDEPSLOT, TypeCJ>, Enc_8919369, PredRel {
+let Inst{0-0} = 0b0;
+let Inst{13-8} = 0b100001;
+let Inst{31-22} = 0b0001000110;
+let isPredicated = 1;
+let isTerminator = 1;
+let isBranch = 1;
+let isPredicatedNew = 1;
+let Uses = [P0];
+let Defs = [P0, PC];
+let BaseOpcode = "J4_cmpgtn1p0";
+let isTaken = Inst{13};
+let isExtendable = 1;
+let opExtendable = 2;
+let isExtentSigned = 1;
+let opExtentBits = 11;
+let opExtentAlign = 2;
+}
+def J4_cmpgtn1_tp1_jump_nt : HInst<
+(outs),
+(ins GeneralSubRegs:$Rs16, n1Const:$n1, b30_2Imm:$Ii),
+"p1 = cmp.gt($Rs16,#$n1); if (p1.new) jump:nt $Ii",
+COMPOUND_CJ_ARCHDEPSLOT, TypeCJ>, Enc_8577055, PredRel {
+let Inst{0-0} = 0b0;
+let Inst{13-8} = 0b000001;
+let Inst{31-22} = 0b0001001110;
+let isPredicated = 1;
+let isTerminator = 1;
+let isBranch = 1;
+let isPredicatedNew = 1;
+let Uses = [P1];
+let Defs = [P1, PC];
+let BaseOpcode = "J4_cmpgtn1p1";
+let isTaken = Inst{13};
+let isExtendable = 1;
+let opExtendable = 2;
+let isExtentSigned = 1;
+let opExtentBits = 11;
+let opExtentAlign = 2;
+}
+def J4_cmpgtn1_tp1_jump_t : HInst<
+(outs),
+(ins GeneralSubRegs:$Rs16, n1Const:$n1, b30_2Imm:$Ii),
+"p1 = cmp.gt($Rs16,#$n1); if (p1.new) jump:t $Ii",
+COMPOUND_CJ_ARCHDEPSLOT, TypeCJ>, Enc_14530015, PredRel {
+let Inst{0-0} = 0b0;
+let Inst{13-8} = 0b100001;
+let Inst{31-22} = 0b0001001110;
+let isPredicated = 1;
+let isTerminator = 1;
+let isBranch = 1;
+let isPredicatedNew = 1;
+let Uses = [P1];
+let Defs = [P1, PC];
+let BaseOpcode = "J4_cmpgtn1p1";
+let isTaken = Inst{13};
+let isExtendable = 1;
+let opExtendable = 2;
+let isExtentSigned = 1;
+let opExtentBits = 11;
+let opExtentAlign = 2;
+}
+def J4_cmpgtu_f_jumpnv_nt : HInst<
+(outs),
+(ins IntRegs:$Ns8, IntRegs:$Rt32, b30_2Imm:$Ii),
+"if (!cmp.gtu($Ns8.new,$Rt32)) jump:nt $Ii",
+NCJ_tc_3or4stall_SLOT0, TypeNCJ>, Enc_15140689, PredRel {
+let Inst{0-0} = 0b0;
+let Inst{13-13} = 0b0;
+let Inst{19-19} = 0b0;
+let Inst{31-22} = 0b0010000101;
+let isPredicated = 1;
+let isPredicatedFalse = 1;
+let isTerminator = 1;
+let isBranch = 1;
+let cofMax1 = 1;
+let isNewValue = 1;
+let Defs = [PC];
+let BaseOpcode = "J4_cmpgtur";
+let isTaken = Inst{13};
+let isExtendable = 1;
+let opExtendable = 2;
+let isExtentSigned = 1;
+let opExtentBits = 11;
+let opExtentAlign = 2;
+let opNewValue = 0;
+}
+def J4_cmpgtu_f_jumpnv_t : HInst<
+(outs),
+(ins IntRegs:$Ns8, IntRegs:$Rt32, b30_2Imm:$Ii),
+"if (!cmp.gtu($Ns8.new,$Rt32)) jump:t $Ii",
+NCJ_tc_3or4stall_SLOT0, TypeNCJ>, Enc_15140689, PredRel {
+let Inst{0-0} = 0b0;
+let Inst{13-13} = 0b1;
+let Inst{19-19} = 0b0;
+let Inst{31-22} = 0b0010000101;
+let isPredicated = 1;
+let isPredicatedFalse = 1;
+let isTerminator = 1;
+let isBranch = 1;
+let cofMax1 = 1;
+let isNewValue = 1;
+let Defs = [PC];
+let BaseOpcode = "J4_cmpgtur";
+let isTaken = Inst{13};
+let isExtendable = 1;
+let opExtendable = 2;
+let isExtentSigned = 1;
+let opExtentBits = 11;
+let opExtentAlign = 2;
+let opNewValue = 0;
+}
+def J4_cmpgtu_fp0_jump_nt : HInst<
+(outs),
+(ins GeneralSubRegs:$Rs16, GeneralSubRegs:$Rt16, b30_2Imm:$Ii),
+"p0 = cmp.gtu($Rs16,$Rt16); if (!p0.new) jump:nt $Ii",
+COMPOUND_CJ_ARCHDEPSLOT, TypeCJ>, Enc_14264243, PredRel {
+let Inst{0-0} = 0b0;
+let Inst{13-12} = 0b00;
+let Inst{31-22} = 0b0001010101;
+let isPredicated = 1;
+let isPredicatedFalse = 1;
+let isTerminator = 1;
+let isBranch = 1;
+let isPredicatedNew = 1;
+let Uses = [P0];
+let Defs = [P0, PC];
+let BaseOpcode = "J4_cmpgtup0";
+let isTaken = Inst{13};
+let isExtendable = 1;
+let opExtendable = 2;
+let isExtentSigned = 1;
+let opExtentBits = 11;
+let opExtentAlign = 2;
+}
+def J4_cmpgtu_fp0_jump_t : HInst<
+(outs),
+(ins GeneralSubRegs:$Rs16, GeneralSubRegs:$Rt16, b30_2Imm:$Ii),
+"p0 = cmp.gtu($Rs16,$Rt16); if (!p0.new) jump:t $Ii",
+COMPOUND_CJ_ARCHDEPSLOT, TypeCJ>, Enc_14264243, PredRel {
+let Inst{0-0} = 0b0;
+let Inst{13-12} = 0b10;
+let Inst{31-22} = 0b0001010101;
+let isPredicated = 1;
+let isPredicatedFalse = 1;
+let isTerminator = 1;
+let isBranch = 1;
+let isPredicatedNew = 1;
+let Uses = [P0];
+let Defs = [P0, PC];
+let BaseOpcode = "J4_cmpgtup0";
+let isTaken = Inst{13};
+let isExtendable = 1;
+let opExtendable = 2;
+let isExtentSigned = 1;
+let opExtentBits = 11;
+let opExtentAlign = 2;
+}
+def J4_cmpgtu_fp1_jump_nt : HInst<
+(outs),
+(ins GeneralSubRegs:$Rs16, GeneralSubRegs:$Rt16, b30_2Imm:$Ii),
+"p1 = cmp.gtu($Rs16,$Rt16); if (!p1.new) jump:nt $Ii",
+COMPOUND_CJ_ARCHDEPSLOT, TypeCJ>, Enc_14264243, PredRel {
+let Inst{0-0} = 0b0;
+let Inst{13-12} = 0b01;
+let Inst{31-22} = 0b0001010101;
+let isPredicated = 1;
+let isPredicatedFalse = 1;
+let isTerminator = 1;
+let isBranch = 1;
+let isPredicatedNew = 1;
+let Uses = [P1];
+let Defs = [P1, PC];
+let BaseOpcode = "J4_cmpgtup1";
+let isTaken = Inst{13};
+let isExtendable = 1;
+let opExtendable = 2;
+let isExtentSigned = 1;
+let opExtentBits = 11;
+let opExtentAlign = 2;
+}
+def J4_cmpgtu_fp1_jump_t : HInst<
+(outs),
+(ins GeneralSubRegs:$Rs16, GeneralSubRegs:$Rt16, b30_2Imm:$Ii),
+"p1 = cmp.gtu($Rs16,$Rt16); if (!p1.new) jump:t $Ii",
+COMPOUND_CJ_ARCHDEPSLOT, TypeCJ>, Enc_14264243, PredRel {
+let Inst{0-0} = 0b0;
+let Inst{13-12} = 0b11;
+let Inst{31-22} = 0b0001010101;
+let isPredicated = 1;
+let isPredicatedFalse = 1;
+let isTerminator = 1;
+let isBranch = 1;
+let isPredicatedNew = 1;
+let Uses = [P1];
+let Defs = [P1, PC];
+let BaseOpcode = "J4_cmpgtup1";
+let isTaken = Inst{13};
+let isExtendable = 1;
+let opExtendable = 2;
+let isExtentSigned = 1;
+let opExtentBits = 11;
+let opExtentAlign = 2;
+}
+def J4_cmpgtu_t_jumpnv_nt : HInst<
+(outs),
+(ins IntRegs:$Ns8, IntRegs:$Rt32, b30_2Imm:$Ii),
+"if (cmp.gtu($Ns8.new,$Rt32)) jump:nt $Ii",
+NCJ_tc_3or4stall_SLOT0, TypeNCJ>, Enc_15140689, PredRel {
+let Inst{0-0} = 0b0;
+let Inst{13-13} = 0b0;
+let Inst{19-19} = 0b0;
+let Inst{31-22} = 0b0010000100;
+let isPredicated = 1;
+let isTerminator = 1;
+let isBranch = 1;
+let cofMax1 = 1;
+let isNewValue = 1;
+let Defs = [PC];
+let BaseOpcode = "J4_cmpgtur";
+let isTaken = Inst{13};
+let isExtendable = 1;
+let opExtendable = 2;
+let isExtentSigned = 1;
+let opExtentBits = 11;
+let opExtentAlign = 2;
+let opNewValue = 0;
+}
+def J4_cmpgtu_t_jumpnv_t : HInst<
+(outs),
+(ins IntRegs:$Ns8, IntRegs:$Rt32, b30_2Imm:$Ii),
+"if (cmp.gtu($Ns8.new,$Rt32)) jump:t $Ii",
+NCJ_tc_3or4stall_SLOT0, TypeNCJ>, Enc_15140689, PredRel {
+let Inst{0-0} = 0b0;
+let Inst{13-13} = 0b1;
+let Inst{19-19} = 0b0;
+let Inst{31-22} = 0b0010000100;
+let isPredicated = 1;
+let isTerminator = 1;
+let isBranch = 1;
+let cofMax1 = 1;
+let isNewValue = 1;
+let Defs = [PC];
+let BaseOpcode = "J4_cmpgtur";
+let isTaken = Inst{13};
+let isExtendable = 1;
+let opExtendable = 2;
+let isExtentSigned = 1;
+let opExtentBits = 11;
+let opExtentAlign = 2;
+let opNewValue = 0;
+}
+def J4_cmpgtu_tp0_jump_nt : HInst<
+(outs),
+(ins GeneralSubRegs:$Rs16, GeneralSubRegs:$Rt16, b30_2Imm:$Ii),
+"p0 = cmp.gtu($Rs16,$Rt16); if (p0.new) jump:nt $Ii",
+COMPOUND_CJ_ARCHDEPSLOT, TypeCJ>, Enc_14264243, PredRel {
+let Inst{0-0} = 0b0;
+let Inst{13-12} = 0b00;
+let Inst{31-22} = 0b0001010100;
+let isPredicated = 1;
+let isTerminator = 1;
+let isBranch = 1;
+let isPredicatedNew = 1;
+let Uses = [P0];
+let Defs = [P0, PC];
+let BaseOpcode = "J4_cmpgtup0";
+let isTaken = Inst{13};
+let isExtendable = 1;
+let opExtendable = 2;
+let isExtentSigned = 1;
+let opExtentBits = 11;
+let opExtentAlign = 2;
+}
+def J4_cmpgtu_tp0_jump_t : HInst<
+(outs),
+(ins GeneralSubRegs:$Rs16, GeneralSubRegs:$Rt16, b30_2Imm:$Ii),
+"p0 = cmp.gtu($Rs16,$Rt16); if (p0.new) jump:t $Ii",
+COMPOUND_CJ_ARCHDEPSLOT, TypeCJ>, Enc_14264243, PredRel {
+let Inst{0-0} = 0b0;
+let Inst{13-12} = 0b10;
+let Inst{31-22} = 0b0001010100;
+let isPredicated = 1;
+let isTerminator = 1;
+let isBranch = 1;
+let isPredicatedNew = 1;
+let Uses = [P0];
+let Defs = [P0, PC];
+let BaseOpcode = "J4_cmpgtup0";
+let isTaken = Inst{13};
+let isExtendable = 1;
+let opExtendable = 2;
+let isExtentSigned = 1;
+let opExtentBits = 11;
+let opExtentAlign = 2;
+}
+def J4_cmpgtu_tp1_jump_nt : HInst<
+(outs),
+(ins GeneralSubRegs:$Rs16, GeneralSubRegs:$Rt16, b30_2Imm:$Ii),
+"p1 = cmp.gtu($Rs16,$Rt16); if (p1.new) jump:nt $Ii",
+COMPOUND_CJ_ARCHDEPSLOT, TypeCJ>, Enc_14264243, PredRel {
+let Inst{0-0} = 0b0;
+let Inst{13-12} = 0b01;
+let Inst{31-22} = 0b0001010100;
+let isPredicated = 1;
+let isTerminator = 1;
+let isBranch = 1;
+let isPredicatedNew = 1;
+let Uses = [P1];
+let Defs = [P1, PC];
+let BaseOpcode = "J4_cmpgtup1";
+let isTaken = Inst{13};
+let isExtendable = 1;
+let opExtendable = 2;
+let isExtentSigned = 1;
+let opExtentBits = 11;
+let opExtentAlign = 2;
+}
+def J4_cmpgtu_tp1_jump_t : HInst<
+(outs),
+(ins GeneralSubRegs:$Rs16, GeneralSubRegs:$Rt16, b30_2Imm:$Ii),
+"p1 = cmp.gtu($Rs16,$Rt16); if (p1.new) jump:t $Ii",
+COMPOUND_CJ_ARCHDEPSLOT, TypeCJ>, Enc_14264243, PredRel {
+let Inst{0-0} = 0b0;
+let Inst{13-12} = 0b11;
+let Inst{31-22} = 0b0001010100;
+let isPredicated = 1;
+let isTerminator = 1;
+let isBranch = 1;
+let isPredicatedNew = 1;
+let Uses = [P1];
+let Defs = [P1, PC];
+let BaseOpcode = "J4_cmpgtup1";
+let isTaken = Inst{13};
+let isExtendable = 1;
+let opExtendable = 2;
+let isExtentSigned = 1;
+let opExtentBits = 11;
+let opExtentAlign = 2;
+}
+def J4_cmpgtui_f_jumpnv_nt : HInst<
+(outs),
+(ins IntRegs:$Ns8, u5_0Imm:$II, b30_2Imm:$Ii),
+"if (!cmp.gtu($Ns8.new,#$II)) jump:nt $Ii",
+NCJ_tc_3or4stall_SLOT0, TypeNCJ>, Enc_4397470, PredRel {
+let Inst{0-0} = 0b0;
+let Inst{13-13} = 0b0;
+let Inst{19-19} = 0b0;
+let Inst{31-22} = 0b0010010101;
+let isPredicated = 1;
+let isPredicatedFalse = 1;
+let isTerminator = 1;
+let isBranch = 1;
+let cofMax1 = 1;
+let isNewValue = 1;
+let Defs = [PC];
+let BaseOpcode = "J4_cmpgtuir";
+let isTaken = Inst{13};
+let isExtendable = 1;
+let opExtendable = 2;
+let isExtentSigned = 1;
+let opExtentBits = 11;
+let opExtentAlign = 2;
+let opNewValue = 0;
+}
+def J4_cmpgtui_f_jumpnv_t : HInst<
+(outs),
+(ins IntRegs:$Ns8, u5_0Imm:$II, b30_2Imm:$Ii),
+"if (!cmp.gtu($Ns8.new,#$II)) jump:t $Ii",
+NCJ_tc_3or4stall_SLOT0, TypeNCJ>, Enc_4397470, PredRel {
+let Inst{0-0} = 0b0;
+let Inst{13-13} = 0b1;
+let Inst{19-19} = 0b0;
+let Inst{31-22} = 0b0010010101;
+let isPredicated = 1;
+let isPredicatedFalse = 1;
+let isTerminator = 1;
+let isBranch = 1;
+let cofMax1 = 1;
+let isNewValue = 1;
+let Defs = [PC];
+let BaseOpcode = "J4_cmpgtuir";
+let isTaken = Inst{13};
+let isExtendable = 1;
+let opExtendable = 2;
+let isExtentSigned = 1;
+let opExtentBits = 11;
+let opExtentAlign = 2;
+let opNewValue = 0;
+}
+def J4_cmpgtui_fp0_jump_nt : HInst<
+(outs),
+(ins GeneralSubRegs:$Rs16, u5_0Imm:$II, b30_2Imm:$Ii),
+"p0 = cmp.gtu($Rs16,#$II); if (!p0.new) jump:nt $Ii",
+COMPOUND_CJ_ARCHDEPSLOT, TypeCJ>, Enc_7305764, PredRel {
+let Inst{0-0} = 0b0;
+let Inst{13-13} = 0b0;
+let Inst{31-22} = 0b0001000101;
+let isPredicated = 1;
+let isPredicatedFalse = 1;
+let isTerminator = 1;
+let isBranch = 1;
+let isPredicatedNew = 1;
+let Uses = [P0];
+let Defs = [P0, PC];
+let BaseOpcode = "J4_cmpgtuip0";
+let isTaken = Inst{13};
+let isExtendable = 1;
+let opExtendable = 2;
+let isExtentSigned = 1;
+let opExtentBits = 11;
+let opExtentAlign = 2;
+}
+def J4_cmpgtui_fp0_jump_t : HInst<
+(outs),
+(ins GeneralSubRegs:$Rs16, u5_0Imm:$II, b30_2Imm:$Ii),
+"p0 = cmp.gtu($Rs16,#$II); if (!p0.new) jump:t $Ii",
+COMPOUND_CJ_ARCHDEPSLOT, TypeCJ>, Enc_7305764, PredRel {
+let Inst{0-0} = 0b0;
+let Inst{13-13} = 0b1;
+let Inst{31-22} = 0b0001000101;
+let isPredicated = 1;
+let isPredicatedFalse = 1;
+let isTerminator = 1;
+let isBranch = 1;
+let isPredicatedNew = 1;
+let Uses = [P0];
+let Defs = [P0, PC];
+let BaseOpcode = "J4_cmpgtuip0";
+let isTaken = Inst{13};
+let isExtendable = 1;
+let opExtendable = 2;
+let isExtentSigned = 1;
+let opExtentBits = 11;
+let opExtentAlign = 2;
+}
+def J4_cmpgtui_fp1_jump_nt : HInst<
+(outs),
+(ins GeneralSubRegs:$Rs16, u5_0Imm:$II, b30_2Imm:$Ii),
+"p1 = cmp.gtu($Rs16,#$II); if (!p1.new) jump:nt $Ii",
+COMPOUND_CJ_ARCHDEPSLOT, TypeCJ>, Enc_7305764, PredRel {
+let Inst{0-0} = 0b0;
+let Inst{13-13} = 0b0;
+let Inst{31-22} = 0b0001001101;
+let isPredicated = 1;
+let isPredicatedFalse = 1;
+let isTerminator = 1;
+let isBranch = 1;
+let isPredicatedNew = 1;
+let Uses = [P1];
+let Defs = [P1, PC];
+let BaseOpcode = "J4_cmpgtuip1";
+let isTaken = Inst{13};
+let isExtendable = 1;
+let opExtendable = 2;
+let isExtentSigned = 1;
+let opExtentBits = 11;
+let opExtentAlign = 2;
+}
+def J4_cmpgtui_fp1_jump_t : HInst<
+(outs),
+(ins GeneralSubRegs:$Rs16, u5_0Imm:$II, b30_2Imm:$Ii),
+"p1 = cmp.gtu($Rs16,#$II); if (!p1.new) jump:t $Ii",
+COMPOUND_CJ_ARCHDEPSLOT, TypeCJ>, Enc_7305764, PredRel {
+let Inst{0-0} = 0b0;
+let Inst{13-13} = 0b1;
+let Inst{31-22} = 0b0001001101;
+let isPredicated = 1;
+let isPredicatedFalse = 1;
+let isTerminator = 1;
+let isBranch = 1;
+let isPredicatedNew = 1;
+let Uses = [P1];
+let Defs = [P1, PC];
+let BaseOpcode = "J4_cmpgtuip1";
+let isTaken = Inst{13};
+let isExtendable = 1;
+let opExtendable = 2;
+let isExtentSigned = 1;
+let opExtentBits = 11;
+let opExtentAlign = 2;
+}
+def J4_cmpgtui_t_jumpnv_nt : HInst<
+(outs),
+(ins IntRegs:$Ns8, u5_0Imm:$II, b30_2Imm:$Ii),
+"if (cmp.gtu($Ns8.new,#$II)) jump:nt $Ii",
+NCJ_tc_3or4stall_SLOT0, TypeNCJ>, Enc_4397470, PredRel {
+let Inst{0-0} = 0b0;
+let Inst{13-13} = 0b0;
+let Inst{19-19} = 0b0;
+let Inst{31-22} = 0b0010010100;
+let isPredicated = 1;
+let isTerminator = 1;
+let isBranch = 1;
+let cofMax1 = 1;
+let isNewValue = 1;
+let Defs = [PC];
+let BaseOpcode = "J4_cmpgtuir";
+let isTaken = Inst{13};
+let isExtendable = 1;
+let opExtendable = 2;
+let isExtentSigned = 1;
+let opExtentBits = 11;
+let opExtentAlign = 2;
+let opNewValue = 0;
+}
+def J4_cmpgtui_t_jumpnv_t : HInst<
+(outs),
+(ins IntRegs:$Ns8, u5_0Imm:$II, b30_2Imm:$Ii),
+"if (cmp.gtu($Ns8.new,#$II)) jump:t $Ii",
+NCJ_tc_3or4stall_SLOT0, TypeNCJ>, Enc_4397470, PredRel {
+let Inst{0-0} = 0b0;
+let Inst{13-13} = 0b1;
+let Inst{19-19} = 0b0;
+let Inst{31-22} = 0b0010010100;
+let isPredicated = 1;
+let isTerminator = 1;
+let isBranch = 1;
+let cofMax1 = 1;
+let isNewValue = 1;
+let Defs = [PC];
+let BaseOpcode = "J4_cmpgtuir";
+let isTaken = Inst{13};
+let isExtendable = 1;
+let opExtendable = 2;
+let isExtentSigned = 1;
+let opExtentBits = 11;
+let opExtentAlign = 2;
+let opNewValue = 0;
+}
+def J4_cmpgtui_tp0_jump_nt : HInst<
+(outs),
+(ins GeneralSubRegs:$Rs16, u5_0Imm:$II, b30_2Imm:$Ii),
+"p0 = cmp.gtu($Rs16,#$II); if (p0.new) jump:nt $Ii",
+COMPOUND_CJ_ARCHDEPSLOT, TypeCJ>, Enc_7305764, PredRel {
+let Inst{0-0} = 0b0;
+let Inst{13-13} = 0b0;
+let Inst{31-22} = 0b0001000100;
+let isPredicated = 1;
+let isTerminator = 1;
+let isBranch = 1;
+let isPredicatedNew = 1;
+let Uses = [P0];
+let Defs = [P0, PC];
+let BaseOpcode = "J4_cmpgtuip0";
+let isTaken = Inst{13};
+let isExtendable = 1;
+let opExtendable = 2;
+let isExtentSigned = 1;
+let opExtentBits = 11;
+let opExtentAlign = 2;
+}
+def J4_cmpgtui_tp0_jump_t : HInst<
+(outs),
+(ins GeneralSubRegs:$Rs16, u5_0Imm:$II, b30_2Imm:$Ii),
+"p0 = cmp.gtu($Rs16,#$II); if (p0.new) jump:t $Ii",
+COMPOUND_CJ_ARCHDEPSLOT, TypeCJ>, Enc_7305764, PredRel {
+let Inst{0-0} = 0b0;
+let Inst{13-13} = 0b1;
+let Inst{31-22} = 0b0001000100;
+let isPredicated = 1;
+let isTerminator = 1;
+let isBranch = 1;
+let isPredicatedNew = 1;
+let Uses = [P0];
+let Defs = [P0, PC];
+let BaseOpcode = "J4_cmpgtuip0";
+let isTaken = Inst{13};
+let isExtendable = 1;
+let opExtendable = 2;
+let isExtentSigned = 1;
+let opExtentBits = 11;
+let opExtentAlign = 2;
+}
+def J4_cmpgtui_tp1_jump_nt : HInst<
+(outs),
+(ins GeneralSubRegs:$Rs16, u5_0Imm:$II, b30_2Imm:$Ii),
+"p1 = cmp.gtu($Rs16,#$II); if (p1.new) jump:nt $Ii",
+COMPOUND_CJ_ARCHDEPSLOT, TypeCJ>, Enc_7305764, PredRel {
+let Inst{0-0} = 0b0;
+let Inst{13-13} = 0b0;
+let Inst{31-22} = 0b0001001100;
+let isPredicated = 1;
+let isTerminator = 1;
+let isBranch = 1;
+let isPredicatedNew = 1;
+let Uses = [P1];
+let Defs = [P1, PC];
+let BaseOpcode = "J4_cmpgtuip1";
+let isTaken = Inst{13};
+let isExtendable = 1;
+let opExtendable = 2;
+let isExtentSigned = 1;
+let opExtentBits = 11;
+let opExtentAlign = 2;
+}
+def J4_cmpgtui_tp1_jump_t : HInst<
+(outs),
+(ins GeneralSubRegs:$Rs16, u5_0Imm:$II, b30_2Imm:$Ii),
+"p1 = cmp.gtu($Rs16,#$II); if (p1.new) jump:t $Ii",
+COMPOUND_CJ_ARCHDEPSLOT, TypeCJ>, Enc_7305764, PredRel {
+let Inst{0-0} = 0b0;
+let Inst{13-13} = 0b1;
+let Inst{31-22} = 0b0001001100;
+let isPredicated = 1;
+let isTerminator = 1;
+let isBranch = 1;
+let isPredicatedNew = 1;
+let Uses = [P1];
+let Defs = [P1, PC];
+let BaseOpcode = "J4_cmpgtuip1";
+let isTaken = Inst{13};
+let isExtendable = 1;
+let opExtendable = 2;
+let isExtentSigned = 1;
+let opExtentBits = 11;
+let opExtentAlign = 2;
+}
+def J4_cmplt_f_jumpnv_nt : HInst<
+(outs),
+(ins IntRegs:$Rt32, IntRegs:$Ns8, b30_2Imm:$Ii),
+"if (!cmp.gt($Rt32,$Ns8.new)) jump:nt $Ii",
+NCJ_tc_3or4stall_SLOT0, TypeNCJ>, Enc_6730375, PredRel {
+let Inst{0-0} = 0b0;
+let Inst{13-13} = 0b0;
+let Inst{19-19} = 0b0;
+let Inst{31-22} = 0b0010000111;
+let isPredicated = 1;
+let isPredicatedFalse = 1;
+let isTerminator = 1;
+let isBranch = 1;
+let cofMax1 = 1;
+let isNewValue = 1;
+let Defs = [PC];
+let BaseOpcode = "J4_cmpltr";
+let isTaken = Inst{13};
+let isExtendable = 1;
+let opExtendable = 2;
+let isExtentSigned = 1;
+let opExtentBits = 11;
+let opExtentAlign = 2;
+let opNewValue = 1;
+}
+def J4_cmplt_f_jumpnv_t : HInst<
+(outs),
+(ins IntRegs:$Rt32, IntRegs:$Ns8, b30_2Imm:$Ii),
+"if (!cmp.gt($Rt32,$Ns8.new)) jump:t $Ii",
+NCJ_tc_3or4stall_SLOT0, TypeNCJ>, Enc_6730375, PredRel {
+let Inst{0-0} = 0b0;
+let Inst{13-13} = 0b1;
+let Inst{19-19} = 0b0;
+let Inst{31-22} = 0b0010000111;
+let isPredicated = 1;
+let isPredicatedFalse = 1;
+let isTerminator = 1;
+let isBranch = 1;
+let cofMax1 = 1;
+let isNewValue = 1;
+let Defs = [PC];
+let BaseOpcode = "J4_cmpltr";
+let isTaken = Inst{13};
+let isExtendable = 1;
+let opExtendable = 2;
+let isExtentSigned = 1;
+let opExtentBits = 11;
+let opExtentAlign = 2;
+let opNewValue = 1;
+}
+def J4_cmplt_t_jumpnv_nt : HInst<
+(outs),
+(ins IntRegs:$Rt32, IntRegs:$Ns8, b30_2Imm:$Ii),
+"if (cmp.gt($Rt32,$Ns8.new)) jump:nt $Ii",
+NCJ_tc_3or4stall_SLOT0, TypeNCJ>, Enc_6730375, PredRel {
+let Inst{0-0} = 0b0;
+let Inst{13-13} = 0b0;
+let Inst{19-19} = 0b0;
+let Inst{31-22} = 0b0010000110;
+let isPredicated = 1;
+let isTerminator = 1;
+let isBranch = 1;
+let cofMax1 = 1;
+let isNewValue = 1;
+let Defs = [PC];
+let BaseOpcode = "J4_cmpltr";
+let isTaken = Inst{13};
+let isExtendable = 1;
+let opExtendable = 2;
+let isExtentSigned = 1;
+let opExtentBits = 11;
+let opExtentAlign = 2;
+let opNewValue = 1;
+}
+def J4_cmplt_t_jumpnv_t : HInst<
+(outs),
+(ins IntRegs:$Rt32, IntRegs:$Ns8, b30_2Imm:$Ii),
+"if (cmp.gt($Rt32,$Ns8.new)) jump:t $Ii",
+NCJ_tc_3or4stall_SLOT0, TypeNCJ>, Enc_6730375, PredRel {
+let Inst{0-0} = 0b0;
+let Inst{13-13} = 0b1;
+let Inst{19-19} = 0b0;
+let Inst{31-22} = 0b0010000110;
+let isPredicated = 1;
+let isTerminator = 1;
+let isBranch = 1;
+let cofMax1 = 1;
+let isNewValue = 1;
+let Defs = [PC];
+let BaseOpcode = "J4_cmpltr";
+let isTaken = Inst{13};
+let isExtendable = 1;
+let opExtendable = 2;
+let isExtentSigned = 1;
+let opExtentBits = 11;
+let opExtentAlign = 2;
+let opNewValue = 1;
+}
+def J4_cmpltu_f_jumpnv_nt : HInst<
+(outs),
+(ins IntRegs:$Rt32, IntRegs:$Ns8, b30_2Imm:$Ii),
+"if (!cmp.gtu($Rt32,$Ns8.new)) jump:nt $Ii",
+NCJ_tc_3or4stall_SLOT0, TypeNCJ>, Enc_6730375, PredRel {
+let Inst{0-0} = 0b0;
+let Inst{13-13} = 0b0;
+let Inst{19-19} = 0b0;
+let Inst{31-22} = 0b0010001001;
+let isPredicated = 1;
+let isPredicatedFalse = 1;
+let isTerminator = 1;
+let isBranch = 1;
+let cofMax1 = 1;
+let isNewValue = 1;
+let Defs = [PC];
+let BaseOpcode = "J4_cmpltur";
+let isTaken = Inst{13};
+let isExtendable = 1;
+let opExtendable = 2;
+let isExtentSigned = 1;
+let opExtentBits = 11;
+let opExtentAlign = 2;
+let opNewValue = 1;
+}
+def J4_cmpltu_f_jumpnv_t : HInst<
+(outs),
+(ins IntRegs:$Rt32, IntRegs:$Ns8, b30_2Imm:$Ii),
+"if (!cmp.gtu($Rt32,$Ns8.new)) jump:t $Ii",
+NCJ_tc_3or4stall_SLOT0, TypeNCJ>, Enc_6730375, PredRel {
+let Inst{0-0} = 0b0;
+let Inst{13-13} = 0b1;
+let Inst{19-19} = 0b0;
+let Inst{31-22} = 0b0010001001;
+let isPredicated = 1;
+let isPredicatedFalse = 1;
+let isTerminator = 1;
+let isBranch = 1;
+let cofMax1 = 1;
+let isNewValue = 1;
+let Defs = [PC];
+let BaseOpcode = "J4_cmpltur";
+let isTaken = Inst{13};
+let isExtendable = 1;
+let opExtendable = 2;
+let isExtentSigned = 1;
+let opExtentBits = 11;
+let opExtentAlign = 2;
+let opNewValue = 1;
+}
+def J4_cmpltu_t_jumpnv_nt : HInst<
+(outs),
+(ins IntRegs:$Rt32, IntRegs:$Ns8, b30_2Imm:$Ii),
+"if (cmp.gtu($Rt32,$Ns8.new)) jump:nt $Ii",
+NCJ_tc_3or4stall_SLOT0, TypeNCJ>, Enc_6730375, PredRel {
+let Inst{0-0} = 0b0;
+let Inst{13-13} = 0b0;
+let Inst{19-19} = 0b0;
+let Inst{31-22} = 0b0010001000;
+let isPredicated = 1;
+let isTerminator = 1;
+let isBranch = 1;
+let cofMax1 = 1;
+let isNewValue = 1;
+let Defs = [PC];
+let BaseOpcode = "J4_cmpltur";
+let isTaken = Inst{13};
+let isExtendable = 1;
+let opExtendable = 2;
+let isExtentSigned = 1;
+let opExtentBits = 11;
+let opExtentAlign = 2;
+let opNewValue = 1;
+}
+def J4_cmpltu_t_jumpnv_t : HInst<
+(outs),
+(ins IntRegs:$Rt32, IntRegs:$Ns8, b30_2Imm:$Ii),
+"if (cmp.gtu($Rt32,$Ns8.new)) jump:t $Ii",
+NCJ_tc_3or4stall_SLOT0, TypeNCJ>, Enc_6730375, PredRel {
+let Inst{0-0} = 0b0;
+let Inst{13-13} = 0b1;
+let Inst{19-19} = 0b0;
+let Inst{31-22} = 0b0010001000;
+let isPredicated = 1;
+let isTerminator = 1;
+let isBranch = 1;
+let cofMax1 = 1;
+let isNewValue = 1;
+let Defs = [PC];
+let BaseOpcode = "J4_cmpltur";
+let isTaken = Inst{13};
+let isExtendable = 1;
+let opExtendable = 2;
+let isExtentSigned = 1;
+let opExtentBits = 11;
+let opExtentAlign = 2;
+let opNewValue = 1;
+}
+def J4_hintjumpr : HInst<
+(outs),
+(ins IntRegs:$Rs32),
+"hintjr($Rs32)",
+J_tc_2early_SLOT2, TypeJ>, Enc_11704059 {
+let Inst{13-0} = 0b00000000000000;
+let Inst{31-21} = 0b01010010101;
+let isTerminator = 1;
+let isIndirectBranch = 1;
+let isBranch = 1;
+let cofMax1 = 1;
+}
+def J4_jumpseti : HInst<
+(outs GeneralSubRegs:$Rd16),
+(ins u6_0Imm:$II, b30_2Imm:$Ii),
+"$Rd16 = #$II ; jump $Ii",
+COMPOUND, TypeCJ>, Enc_4834775 {
+let Inst{0-0} = 0b0;
+let Inst{31-22} = 0b0001011000;
+let hasNewValue = 1;
+let opNewValue = 0;
+let isTerminator = 1;
+let isBranch = 1;
+let Defs = [PC];
+let isExtendable = 1;
+let opExtendable = 2;
+let isExtentSigned = 1;
+let opExtentBits = 11;
+let opExtentAlign = 2;
+}
+def J4_jumpsetr : HInst<
+(outs GeneralSubRegs:$Rd16),
+(ins GeneralSubRegs:$Rs16, b30_2Imm:$Ii),
+"$Rd16 = $Rs16 ; jump $Ii",
+COMPOUND, TypeCJ>, Enc_2639299 {
+let Inst{0-0} = 0b0;
+let Inst{13-12} = 0b00;
+let Inst{31-22} = 0b0001011100;
+let hasNewValue = 1;
+let opNewValue = 0;
+let isTerminator = 1;
+let isBranch = 1;
+let Defs = [PC];
+let isExtendable = 1;
+let opExtendable = 2;
+let isExtentSigned = 1;
+let opExtentBits = 11;
+let opExtentAlign = 2;
+}
+def J4_tstbit0_f_jumpnv_nt : HInst<
+(outs),
+(ins IntRegs:$Ns8, b30_2Imm:$Ii),
+"if (!tstbit($Ns8.new,#0)) jump:nt $Ii",
+NCJ_tc_3or4stall_SLOT0, TypeNCJ>, Enc_1898420 {
+let Inst{0-0} = 0b0;
+let Inst{13-8} = 0b000000;
+let Inst{19-19} = 0b0;
+let Inst{31-22} = 0b0010010111;
+let isPredicated = 1;
+let isPredicatedFalse = 1;
+let isTerminator = 1;
+let isBranch = 1;
+let cofMax1 = 1;
+let isNewValue = 1;
+let Defs = [PC];
+let isTaken = Inst{13};
+let isExtendable = 1;
+let opExtendable = 1;
+let isExtentSigned = 1;
+let opExtentBits = 11;
+let opExtentAlign = 2;
+let opNewValue = 0;
+}
+def J4_tstbit0_f_jumpnv_t : HInst<
+(outs),
+(ins IntRegs:$Ns8, b30_2Imm:$Ii),
+"if (!tstbit($Ns8.new,#0)) jump:t $Ii",
+NCJ_tc_3or4stall_SLOT0, TypeNCJ>, Enc_1898420 {
+let Inst{0-0} = 0b0;
+let Inst{13-8} = 0b100000;
+let Inst{19-19} = 0b0;
+let Inst{31-22} = 0b0010010111;
+let isPredicated = 1;
+let isPredicatedFalse = 1;
+let isTerminator = 1;
+let isBranch = 1;
+let cofMax1 = 1;
+let isNewValue = 1;
+let Defs = [PC];
+let isTaken = Inst{13};
+let isExtendable = 1;
+let opExtendable = 1;
+let isExtentSigned = 1;
+let opExtentBits = 11;
+let opExtentAlign = 2;
+let opNewValue = 0;
+}
+def J4_tstbit0_fp0_jump_nt : HInst<
+(outs),
+(ins GeneralSubRegs:$Rs16, b30_2Imm:$Ii),
+"p0 = tstbit($Rs16,#0); if (!p0.new) jump:nt $Ii",
+COMPOUND_CJ_ARCHDEPSLOT, TypeCJ>, Enc_12829314 {
+let Inst{0-0} = 0b0;
+let Inst{13-8} = 0b000011;
+let Inst{31-22} = 0b0001000111;
+let isPredicated = 1;
+let isPredicatedFalse = 1;
+let isTerminator = 1;
+let isBranch = 1;
+let isPredicatedNew = 1;
+let Uses = [P0];
+let Defs = [P0, PC];
+let isTaken = Inst{13};
+let isExtendable = 1;
+let opExtendable = 1;
+let isExtentSigned = 1;
+let opExtentBits = 11;
+let opExtentAlign = 2;
+}
+def J4_tstbit0_fp0_jump_t : HInst<
+(outs),
+(ins GeneralSubRegs:$Rs16, b30_2Imm:$Ii),
+"p0 = tstbit($Rs16,#0); if (!p0.new) jump:t $Ii",
+COMPOUND_CJ_ARCHDEPSLOT, TypeCJ>, Enc_12829314 {
+let Inst{0-0} = 0b0;
+let Inst{13-8} = 0b100011;
+let Inst{31-22} = 0b0001000111;
+let isPredicated = 1;
+let isPredicatedFalse = 1;
+let isTerminator = 1;
+let isBranch = 1;
+let isPredicatedNew = 1;
+let Uses = [P0];
+let Defs = [P0, PC];
+let isTaken = Inst{13};
+let isExtendable = 1;
+let opExtendable = 1;
+let isExtentSigned = 1;
+let opExtentBits = 11;
+let opExtentAlign = 2;
+}
+def J4_tstbit0_fp1_jump_nt : HInst<
+(outs),
+(ins GeneralSubRegs:$Rs16, b30_2Imm:$Ii),
+"p1 = tstbit($Rs16,#0); if (!p1.new) jump:nt $Ii",
+COMPOUND_CJ_ARCHDEPSLOT, TypeCJ>, Enc_12829314 {
+let Inst{0-0} = 0b0;
+let Inst{13-8} = 0b000011;
+let Inst{31-22} = 0b0001001111;
+let isPredicated = 1;
+let isPredicatedFalse = 1;
+let isTerminator = 1;
+let isBranch = 1;
+let isPredicatedNew = 1;
+let Uses = [P1];
+let Defs = [P1, PC];
+let isTaken = Inst{13};
+let isExtendable = 1;
+let opExtendable = 1;
+let isExtentSigned = 1;
+let opExtentBits = 11;
+let opExtentAlign = 2;
+}
+def J4_tstbit0_fp1_jump_t : HInst<
+(outs),
+(ins GeneralSubRegs:$Rs16, b30_2Imm:$Ii),
+"p1 = tstbit($Rs16,#0); if (!p1.new) jump:t $Ii",
+COMPOUND_CJ_ARCHDEPSLOT, TypeCJ>, Enc_12829314 {
+let Inst{0-0} = 0b0;
+let Inst{13-8} = 0b100011;
+let Inst{31-22} = 0b0001001111;
+let isPredicated = 1;
+let isPredicatedFalse = 1;
+let isTerminator = 1;
+let isBranch = 1;
+let isPredicatedNew = 1;
+let Uses = [P1];
+let Defs = [P1, PC];
+let isTaken = Inst{13};
+let isExtendable = 1;
+let opExtendable = 1;
+let isExtentSigned = 1;
+let opExtentBits = 11;
+let opExtentAlign = 2;
+}
+def J4_tstbit0_t_jumpnv_nt : HInst<
+(outs),
+(ins IntRegs:$Ns8, b30_2Imm:$Ii),
+"if (tstbit($Ns8.new,#0)) jump:nt $Ii",
+NCJ_tc_3or4stall_SLOT0, TypeNCJ>, Enc_1898420 {
+let Inst{0-0} = 0b0;
+let Inst{13-8} = 0b000000;
+let Inst{19-19} = 0b0;
+let Inst{31-22} = 0b0010010110;
+let isPredicated = 1;
+let isTerminator = 1;
+let isBranch = 1;
+let cofMax1 = 1;
+let isNewValue = 1;
+let Defs = [PC];
+let isTaken = Inst{13};
+let isExtendable = 1;
+let opExtendable = 1;
+let isExtentSigned = 1;
+let opExtentBits = 11;
+let opExtentAlign = 2;
+let opNewValue = 0;
+}
+def J4_tstbit0_t_jumpnv_t : HInst<
+(outs),
+(ins IntRegs:$Ns8, b30_2Imm:$Ii),
+"if (tstbit($Ns8.new,#0)) jump:t $Ii",
+NCJ_tc_3or4stall_SLOT0, TypeNCJ>, Enc_1898420 {
+let Inst{0-0} = 0b0;
+let Inst{13-8} = 0b100000;
+let Inst{19-19} = 0b0;
+let Inst{31-22} = 0b0010010110;
+let isPredicated = 1;
+let isTerminator = 1;
+let isBranch = 1;
+let cofMax1 = 1;
+let isNewValue = 1;
+let Defs = [PC];
+let isTaken = Inst{13};
+let isExtendable = 1;
+let opExtendable = 1;
+let isExtentSigned = 1;
+let opExtentBits = 11;
+let opExtentAlign = 2;
+let opNewValue = 0;
+}
+def J4_tstbit0_tp0_jump_nt : HInst<
+(outs),
+(ins GeneralSubRegs:$Rs16, b30_2Imm:$Ii),
+"p0 = tstbit($Rs16,#0); if (p0.new) jump:nt $Ii",
+COMPOUND_CJ_ARCHDEPSLOT, TypeCJ>, Enc_12829314 {
+let Inst{0-0} = 0b0;
+let Inst{13-8} = 0b000011;
+let Inst{31-22} = 0b0001000110;
+let isPredicated = 1;
+let isTerminator = 1;
+let isBranch = 1;
+let isPredicatedNew = 1;
+let Uses = [P0];
+let Defs = [P0, PC];
+let isTaken = Inst{13};
+let isExtendable = 1;
+let opExtendable = 1;
+let isExtentSigned = 1;
+let opExtentBits = 11;
+let opExtentAlign = 2;
+}
+def J4_tstbit0_tp0_jump_t : HInst<
+(outs),
+(ins GeneralSubRegs:$Rs16, b30_2Imm:$Ii),
+"p0 = tstbit($Rs16,#0); if (p0.new) jump:t $Ii",
+COMPOUND_CJ_ARCHDEPSLOT, TypeCJ>, Enc_12829314 {
+let Inst{0-0} = 0b0;
+let Inst{13-8} = 0b100011;
+let Inst{31-22} = 0b0001000110;
+let isPredicated = 1;
+let isTerminator = 1;
+let isBranch = 1;
+let isPredicatedNew = 1;
+let Uses = [P0];
+let Defs = [P0, PC];
+let isTaken = Inst{13};
+let isExtendable = 1;
+let opExtendable = 1;
+let isExtentSigned = 1;
+let opExtentBits = 11;
+let opExtentAlign = 2;
+}
+def J4_tstbit0_tp1_jump_nt : HInst<
+(outs),
+(ins GeneralSubRegs:$Rs16, b30_2Imm:$Ii),
+"p1 = tstbit($Rs16,#0); if (p1.new) jump:nt $Ii",
+COMPOUND_CJ_ARCHDEPSLOT, TypeCJ>, Enc_12829314 {
+let Inst{0-0} = 0b0;
+let Inst{13-8} = 0b000011;
+let Inst{31-22} = 0b0001001110;
+let isPredicated = 1;
+let isTerminator = 1;
+let isBranch = 1;
+let isPredicatedNew = 1;
+let Uses = [P1];
+let Defs = [P1, PC];
+let isTaken = Inst{13};
+let isExtendable = 1;
+let opExtendable = 1;
+let isExtentSigned = 1;
+let opExtentBits = 11;
+let opExtentAlign = 2;
+}
+def J4_tstbit0_tp1_jump_t : HInst<
+(outs),
+(ins GeneralSubRegs:$Rs16, b30_2Imm:$Ii),
+"p1 = tstbit($Rs16,#0); if (p1.new) jump:t $Ii",
+COMPOUND_CJ_ARCHDEPSLOT, TypeCJ>, Enc_12829314 {
+let Inst{0-0} = 0b0;
+let Inst{13-8} = 0b100011;
+let Inst{31-22} = 0b0001001110;
+let isPredicated = 1;
+let isTerminator = 1;
+let isBranch = 1;
+let isPredicatedNew = 1;
+let Uses = [P1];
+let Defs = [P1, PC];
+let isTaken = Inst{13};
+let isExtendable = 1;
+let opExtendable = 1;
+let isExtentSigned = 1;
+let opExtentBits = 11;
+let opExtentAlign = 2;
+}
+def L2_deallocframe : HInst<
+(outs),
+(ins),
+"deallocframe",
+LD_tc_ld_SLOT01, TypeLD>, Enc_0 {
+let Inst{4-0} = 0b11110;
+let Inst{13-5} = 0b000000000;
+let Inst{31-21} = 0b10010000000;
+let Inst{20-16} = 0b11110;
+let accessSize = DoubleWordAccess;
+let mayLoad = 1;
+let Uses = [R30];
+let Defs = [R29, R30, R31];
+}
+def L2_loadalignb_io : HInst<
+(outs DoubleRegs:$Ryy32),
+(ins DoubleRegs:$Ryy32in, IntRegs:$Rs32, s32_0Imm:$Ii),
+"$Ryy32 = memb_fifo($Rs32+#$Ii)",
+LD_tc_ld_SLOT01, TypeLD>, Enc_449439 {
+let Inst{24-21} = 0b0100;
+let Inst{31-27} = 0b10010;
+let addrMode = BaseImmOffset;
+let accessSize = ByteAccess;
+let mayLoad = 1;
+let isExtendable = 1;
+let opExtendable = 3;
+let isExtentSigned = 1;
+let opExtentBits = 11;
+let opExtentAlign = 0;
+let Constraints = "$Ryy32 = $Ryy32in";
+}
+def L2_loadalignb_pbr : HInst<
+(outs DoubleRegs:$Ryy32, IntRegs:$Rx32),
+(ins DoubleRegs:$Ryy32in, IntRegs:$Rx32in, ModRegs:$Mu2),
+"$Ryy32 = memb_fifo($Rx32++$Mu2:brev)",
+LD_tc_ld_SLOT01, TypeLD>, Enc_12261611 {
+let Inst{12-5} = 0b00000000;
+let Inst{31-21} = 0b10011110100;
+let accessSize = ByteAccess;
+let mayLoad = 1;
+let Constraints = "$Ryy32 = $Ryy32in, $Rx32 = $Rx32in";
+}
+def L2_loadalignb_pci : HInst<
+(outs DoubleRegs:$Ryy32, IntRegs:$Rx32),
+(ins DoubleRegs:$Ryy32in, IntRegs:$Rx32in, s4_0Imm:$Ii, ModRegs:$Mu2),
+"$Ryy32 = memb_fifo($Rx32++#$Ii:circ($Mu2))",
+LD_tc_ld_SLOT01, TypeLD>, Enc_971347 {
+let Inst{12-9} = 0b0000;
+let Inst{31-21} = 0b10011000100;
+let addrMode = PostInc;
+let accessSize = ByteAccess;
+let mayLoad = 1;
+let Uses = [CS];
+let Constraints = "$Ryy32 = $Ryy32in, $Rx32 = $Rx32in";
+}
+def L2_loadalignb_pcr : HInst<
+(outs DoubleRegs:$Ryy32, IntRegs:$Rx32),
+(ins DoubleRegs:$Ryy32in, IntRegs:$Rx32in, ModRegs:$Mu2),
+"$Ryy32 = memb_fifo($Rx32++I:circ($Mu2))",
+LD_tc_ld_SLOT01, TypeLD>, Enc_12261611 {
+let Inst{12-5} = 0b00010000;
+let Inst{31-21} = 0b10011000100;
+let addrMode = PostInc;
+let accessSize = ByteAccess;
+let mayLoad = 1;
+let Uses = [CS];
+let Constraints = "$Ryy32 = $Ryy32in, $Rx32 = $Rx32in";
+}
+def L2_loadalignb_pi : HInst<
+(outs DoubleRegs:$Ryy32, IntRegs:$Rx32),
+(ins DoubleRegs:$Ryy32in, IntRegs:$Rx32in, s4_0Imm:$Ii),
+"$Ryy32 = memb_fifo($Rx32++#$Ii)",
+LD_tc_ld_pi_SLOT01, TypeLD>, Enc_6372758 {
+let Inst{13-9} = 0b00000;
+let Inst{31-21} = 0b10011010100;
+let addrMode = PostInc;
+let accessSize = ByteAccess;
+let mayLoad = 1;
+let Constraints = "$Ryy32 = $Ryy32in, $Rx32 = $Rx32in";
+}
+def L2_loadalignb_pr : HInst<
+(outs DoubleRegs:$Ryy32, IntRegs:$Rx32),
+(ins DoubleRegs:$Ryy32in, IntRegs:$Rx32in, ModRegs:$Mu2),
+"$Ryy32 = memb_fifo($Rx32++$Mu2)",
+LD_tc_ld_SLOT01, TypeLD>, Enc_12261611 {
+let Inst{12-5} = 0b00000000;
+let Inst{31-21} = 0b10011100100;
+let addrMode = PostInc;
+let accessSize = ByteAccess;
+let mayLoad = 1;
+let Constraints = "$Ryy32 = $Ryy32in, $Rx32 = $Rx32in";
+}
+def L2_loadalignb_zomap : HInst<
+(outs DoubleRegs:$Ryy32),
+(ins DoubleRegs:$Ryy32in, IntRegs:$Rs32),
+"$Ryy32 = memb_fifo($Rs32)",
+PSEUDO, TypeMAPPING> {
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let Constraints = "$Ryy32 = $Ryy32in";
+}
+def L2_loadalignh_io : HInst<
+(outs DoubleRegs:$Ryy32),
+(ins DoubleRegs:$Ryy32in, IntRegs:$Rs32, s31_1Imm:$Ii),
+"$Ryy32 = memh_fifo($Rs32+#$Ii)",
+LD_tc_ld_SLOT01, TypeLD>, Enc_11930027 {
+let Inst{24-21} = 0b0010;
+let Inst{31-27} = 0b10010;
+let addrMode = BaseImmOffset;
+let accessSize = HalfWordAccess;
+let mayLoad = 1;
+let isExtendable = 1;
+let opExtendable = 3;
+let isExtentSigned = 1;
+let opExtentBits = 12;
+let opExtentAlign = 1;
+let Constraints = "$Ryy32 = $Ryy32in";
+}
+def L2_loadalignh_pbr : HInst<
+(outs DoubleRegs:$Ryy32, IntRegs:$Rx32),
+(ins DoubleRegs:$Ryy32in, IntRegs:$Rx32in, ModRegs:$Mu2),
+"$Ryy32 = memh_fifo($Rx32++$Mu2:brev)",
+LD_tc_ld_SLOT01, TypeLD>, Enc_12261611 {
+let Inst{12-5} = 0b00000000;
+let Inst{31-21} = 0b10011110010;
+let accessSize = HalfWordAccess;
+let mayLoad = 1;
+let Constraints = "$Ryy32 = $Ryy32in, $Rx32 = $Rx32in";
+}
+def L2_loadalignh_pci : HInst<
+(outs DoubleRegs:$Ryy32, IntRegs:$Rx32),
+(ins DoubleRegs:$Ryy32in, IntRegs:$Rx32in, s4_1Imm:$Ii, ModRegs:$Mu2),
+"$Ryy32 = memh_fifo($Rx32++#$Ii:circ($Mu2))",
+LD_tc_ld_SLOT01, TypeLD>, Enc_1971351 {
+let Inst{12-9} = 0b0000;
+let Inst{31-21} = 0b10011000010;
+let addrMode = PostInc;
+let accessSize = HalfWordAccess;
+let mayLoad = 1;
+let Uses = [CS];
+let Constraints = "$Ryy32 = $Ryy32in, $Rx32 = $Rx32in";
+}
+def L2_loadalignh_pcr : HInst<
+(outs DoubleRegs:$Ryy32, IntRegs:$Rx32),
+(ins DoubleRegs:$Ryy32in, IntRegs:$Rx32in, ModRegs:$Mu2),
+"$Ryy32 = memh_fifo($Rx32++I:circ($Mu2))",
+LD_tc_ld_SLOT01, TypeLD>, Enc_12261611 {
+let Inst{12-5} = 0b00010000;
+let Inst{31-21} = 0b10011000010;
+let addrMode = PostInc;
+let accessSize = HalfWordAccess;
+let mayLoad = 1;
+let Uses = [CS];
+let Constraints = "$Ryy32 = $Ryy32in, $Rx32 = $Rx32in";
+}
+def L2_loadalignh_pi : HInst<
+(outs DoubleRegs:$Ryy32, IntRegs:$Rx32),
+(ins DoubleRegs:$Ryy32in, IntRegs:$Rx32in, s4_1Imm:$Ii),
+"$Ryy32 = memh_fifo($Rx32++#$Ii)",
+LD_tc_ld_pi_SLOT01, TypeLD>, Enc_3372766 {
+let Inst{13-9} = 0b00000;
+let Inst{31-21} = 0b10011010010;
+let addrMode = PostInc;
+let accessSize = HalfWordAccess;
+let mayLoad = 1;
+let Constraints = "$Ryy32 = $Ryy32in, $Rx32 = $Rx32in";
+}
+def L2_loadalignh_pr : HInst<
+(outs DoubleRegs:$Ryy32, IntRegs:$Rx32),
+(ins DoubleRegs:$Ryy32in, IntRegs:$Rx32in, ModRegs:$Mu2),
+"$Ryy32 = memh_fifo($Rx32++$Mu2)",
+LD_tc_ld_SLOT01, TypeLD>, Enc_12261611 {
+let Inst{12-5} = 0b00000000;
+let Inst{31-21} = 0b10011100010;
+let addrMode = PostInc;
+let accessSize = HalfWordAccess;
+let mayLoad = 1;
+let Constraints = "$Ryy32 = $Ryy32in, $Rx32 = $Rx32in";
+}
+def L2_loadalignh_zomap : HInst<
+(outs DoubleRegs:$Ryy32),
+(ins DoubleRegs:$Ryy32in, IntRegs:$Rs32),
+"$Ryy32 = memh_fifo($Rs32)",
+PSEUDO, TypeMAPPING> {
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let Constraints = "$Ryy32 = $Ryy32in";
+}
+def L2_loadbsw2_io : HInst<
+(outs IntRegs:$Rd32),
+(ins IntRegs:$Rs32, s31_1Imm:$Ii),
+"$Rd32 = membh($Rs32+#$Ii)",
+LD_tc_ld_SLOT01, TypeLD>, Enc_15275738 {
+let Inst{24-21} = 0b0001;
+let Inst{31-27} = 0b10010;
+let hasNewValue = 1;
+let opNewValue = 0;
+let addrMode = BaseImmOffset;
+let accessSize = HalfWordAccess;
+let mayLoad = 1;
+let isExtendable = 1;
+let opExtendable = 2;
+let isExtentSigned = 1;
+let opExtentBits = 12;
+let opExtentAlign = 1;
+}
+def L2_loadbsw2_pbr : HInst<
+(outs IntRegs:$Rd32, IntRegs:$Rx32),
+(ins IntRegs:$Rx32in, ModRegs:$Mu2),
+"$Rd32 = membh($Rx32++$Mu2:brev)",
+LD_tc_ld_SLOT01, TypeLD>, Enc_48594 {
+let Inst{12-5} = 0b00000000;
+let Inst{31-21} = 0b10011110001;
+let hasNewValue = 1;
+let opNewValue = 0;
+let accessSize = HalfWordAccess;
+let mayLoad = 1;
+let Constraints = "$Rx32 = $Rx32in";
+}
+def L2_loadbsw2_pci : HInst<
+(outs IntRegs:$Rd32, IntRegs:$Rx32),
+(ins IntRegs:$Rx32in, s4_1Imm:$Ii, ModRegs:$Mu2),
+"$Rd32 = membh($Rx32++#$Ii:circ($Mu2))",
+LD_tc_ld_SLOT01, TypeLD>, Enc_13303422 {
+let Inst{12-9} = 0b0000;
+let Inst{31-21} = 0b10011000001;
+let hasNewValue = 1;
+let opNewValue = 0;
+let addrMode = PostInc;
+let accessSize = HalfWordAccess;
+let mayLoad = 1;
+let Uses = [CS];
+let Constraints = "$Rx32 = $Rx32in";
+}
+def L2_loadbsw2_pcr : HInst<
+(outs IntRegs:$Rd32, IntRegs:$Rx32),
+(ins IntRegs:$Rx32in, ModRegs:$Mu2),
+"$Rd32 = membh($Rx32++I:circ($Mu2))",
+LD_tc_ld_SLOT01, TypeLD>, Enc_48594 {
+let Inst{12-5} = 0b00010000;
+let Inst{31-21} = 0b10011000001;
+let hasNewValue = 1;
+let opNewValue = 0;
+let addrMode = PostInc;
+let accessSize = HalfWordAccess;
+let mayLoad = 1;
+let Uses = [CS];
+let Constraints = "$Rx32 = $Rx32in";
+}
+def L2_loadbsw2_pi : HInst<
+(outs IntRegs:$Rd32, IntRegs:$Rx32),
+(ins IntRegs:$Rx32in, s4_1Imm:$Ii),
+"$Rd32 = membh($Rx32++#$Ii)",
+LD_tc_ld_pi_SLOT01, TypeLD>, Enc_15376009 {
+let Inst{13-9} = 0b00000;
+let Inst{31-21} = 0b10011010001;
+let hasNewValue = 1;
+let opNewValue = 0;
+let addrMode = PostInc;
+let accessSize = HalfWordAccess;
+let mayLoad = 1;
+let Constraints = "$Rx32 = $Rx32in";
+}
+def L2_loadbsw2_pr : HInst<
+(outs IntRegs:$Rd32, IntRegs:$Rx32),
+(ins IntRegs:$Rx32in, ModRegs:$Mu2),
+"$Rd32 = membh($Rx32++$Mu2)",
+LD_tc_ld_SLOT01, TypeLD>, Enc_48594 {
+let Inst{12-5} = 0b00000000;
+let Inst{31-21} = 0b10011100001;
+let hasNewValue = 1;
+let opNewValue = 0;
+let addrMode = PostInc;
+let accessSize = HalfWordAccess;
+let mayLoad = 1;
+let Constraints = "$Rx32 = $Rx32in";
+}
+def L2_loadbsw2_zomap : HInst<
+(outs IntRegs:$Rd32),
+(ins IntRegs:$Rs32),
+"$Rd32 = membh($Rs32)",
+PSEUDO, TypeMAPPING> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+}
+def L2_loadbsw4_io : HInst<
+(outs DoubleRegs:$Rdd32),
+(ins IntRegs:$Rs32, s30_2Imm:$Ii),
+"$Rdd32 = membh($Rs32+#$Ii)",
+LD_tc_ld_SLOT01, TypeLD>, Enc_9852473 {
+let Inst{24-21} = 0b0111;
+let Inst{31-27} = 0b10010;
+let addrMode = BaseImmOffset;
+let accessSize = WordAccess;
+let mayLoad = 1;
+let isExtendable = 1;
+let opExtendable = 2;
+let isExtentSigned = 1;
+let opExtentBits = 13;
+let opExtentAlign = 2;
+}
+def L2_loadbsw4_pbr : HInst<
+(outs DoubleRegs:$Rdd32, IntRegs:$Rx32),
+(ins IntRegs:$Rx32in, ModRegs:$Mu2),
+"$Rdd32 = membh($Rx32++$Mu2:brev)",
+LD_tc_ld_SLOT01, TypeLD>, Enc_2901241 {
+let Inst{12-5} = 0b00000000;
+let Inst{31-21} = 0b10011110111;
+let accessSize = WordAccess;
+let mayLoad = 1;
+let Constraints = "$Rx32 = $Rx32in";
+}
+def L2_loadbsw4_pci : HInst<
+(outs DoubleRegs:$Rdd32, IntRegs:$Rx32),
+(ins IntRegs:$Rx32in, s4_2Imm:$Ii, ModRegs:$Mu2),
+"$Rdd32 = membh($Rx32++#$Ii:circ($Mu2))",
+LD_tc_ld_SLOT01, TypeLD>, Enc_3931661 {
+let Inst{12-9} = 0b0000;
+let Inst{31-21} = 0b10011000111;
+let addrMode = PostInc;
+let accessSize = WordAccess;
+let mayLoad = 1;
+let Uses = [CS];
+let Constraints = "$Rx32 = $Rx32in";
+}
+def L2_loadbsw4_pcr : HInst<
+(outs DoubleRegs:$Rdd32, IntRegs:$Rx32),
+(ins IntRegs:$Rx32in, ModRegs:$Mu2),
+"$Rdd32 = membh($Rx32++I:circ($Mu2))",
+LD_tc_ld_SLOT01, TypeLD>, Enc_2901241 {
+let Inst{12-5} = 0b00010000;
+let Inst{31-21} = 0b10011000111;
+let addrMode = PostInc;
+let accessSize = WordAccess;
+let mayLoad = 1;
+let Uses = [CS];
+let Constraints = "$Rx32 = $Rx32in";
+}
+def L2_loadbsw4_pi : HInst<
+(outs DoubleRegs:$Rdd32, IntRegs:$Rx32),
+(ins IntRegs:$Rx32in, s4_2Imm:$Ii),
+"$Rdd32 = membh($Rx32++#$Ii)",
+LD_tc_ld_pi_SLOT01, TypeLD>, Enc_8752140 {
+let Inst{13-9} = 0b00000;
+let Inst{31-21} = 0b10011010111;
+let addrMode = PostInc;
+let accessSize = WordAccess;
+let mayLoad = 1;
+let Constraints = "$Rx32 = $Rx32in";
+}
+def L2_loadbsw4_pr : HInst<
+(outs DoubleRegs:$Rdd32, IntRegs:$Rx32),
+(ins IntRegs:$Rx32in, ModRegs:$Mu2),
+"$Rdd32 = membh($Rx32++$Mu2)",
+LD_tc_ld_SLOT01, TypeLD>, Enc_2901241 {
+let Inst{12-5} = 0b00000000;
+let Inst{31-21} = 0b10011100111;
+let addrMode = PostInc;
+let accessSize = WordAccess;
+let mayLoad = 1;
+let Constraints = "$Rx32 = $Rx32in";
+}
+def L2_loadbsw4_zomap : HInst<
+(outs DoubleRegs:$Rdd32),
+(ins IntRegs:$Rs32),
+"$Rdd32 = membh($Rs32)",
+PSEUDO, TypeMAPPING> {
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+}
+def L2_loadbzw2_io : HInst<
+(outs IntRegs:$Rd32),
+(ins IntRegs:$Rs32, s31_1Imm:$Ii),
+"$Rd32 = memubh($Rs32+#$Ii)",
+LD_tc_ld_SLOT01, TypeLD>, Enc_15275738 {
+let Inst{24-21} = 0b0011;
+let Inst{31-27} = 0b10010;
+let hasNewValue = 1;
+let opNewValue = 0;
+let addrMode = BaseImmOffset;
+let accessSize = HalfWordAccess;
+let mayLoad = 1;
+let isExtendable = 1;
+let opExtendable = 2;
+let isExtentSigned = 1;
+let opExtentBits = 12;
+let opExtentAlign = 1;
+}
+def L2_loadbzw2_pbr : HInst<
+(outs IntRegs:$Rd32, IntRegs:$Rx32),
+(ins IntRegs:$Rx32in, ModRegs:$Mu2),
+"$Rd32 = memubh($Rx32++$Mu2:brev)",
+LD_tc_ld_SLOT01, TypeLD>, Enc_48594 {
+let Inst{12-5} = 0b00000000;
+let Inst{31-21} = 0b10011110011;
+let hasNewValue = 1;
+let opNewValue = 0;
+let accessSize = HalfWordAccess;
+let mayLoad = 1;
+let Constraints = "$Rx32 = $Rx32in";
+}
+def L2_loadbzw2_pci : HInst<
+(outs IntRegs:$Rd32, IntRegs:$Rx32),
+(ins IntRegs:$Rx32in, s4_1Imm:$Ii, ModRegs:$Mu2),
+"$Rd32 = memubh($Rx32++#$Ii:circ($Mu2))",
+LD_tc_ld_SLOT01, TypeLD>, Enc_13303422 {
+let Inst{12-9} = 0b0000;
+let Inst{31-21} = 0b10011000011;
+let hasNewValue = 1;
+let opNewValue = 0;
+let addrMode = PostInc;
+let accessSize = HalfWordAccess;
+let mayLoad = 1;
+let Uses = [CS];
+let Constraints = "$Rx32 = $Rx32in";
+}
+def L2_loadbzw2_pcr : HInst<
+(outs IntRegs:$Rd32, IntRegs:$Rx32),
+(ins IntRegs:$Rx32in, ModRegs:$Mu2),
+"$Rd32 = memubh($Rx32++I:circ($Mu2))",
+LD_tc_ld_SLOT01, TypeLD>, Enc_48594 {
+let Inst{12-5} = 0b00010000;
+let Inst{31-21} = 0b10011000011;
+let hasNewValue = 1;
+let opNewValue = 0;
+let addrMode = PostInc;
+let accessSize = HalfWordAccess;
+let mayLoad = 1;
+let Uses = [CS];
+let Constraints = "$Rx32 = $Rx32in";
+}
+def L2_loadbzw2_pi : HInst<
+(outs IntRegs:$Rd32, IntRegs:$Rx32),
+(ins IntRegs:$Rx32in, s4_1Imm:$Ii),
+"$Rd32 = memubh($Rx32++#$Ii)",
+LD_tc_ld_pi_SLOT01, TypeLD>, Enc_15376009 {
+let Inst{13-9} = 0b00000;
+let Inst{31-21} = 0b10011010011;
+let hasNewValue = 1;
+let opNewValue = 0;
+let addrMode = PostInc;
+let accessSize = HalfWordAccess;
+let mayLoad = 1;
+let Constraints = "$Rx32 = $Rx32in";
+}
+def L2_loadbzw2_pr : HInst<
+(outs IntRegs:$Rd32, IntRegs:$Rx32),
+(ins IntRegs:$Rx32in, ModRegs:$Mu2),
+"$Rd32 = memubh($Rx32++$Mu2)",
+LD_tc_ld_SLOT01, TypeLD>, Enc_48594 {
+let Inst{12-5} = 0b00000000;
+let Inst{31-21} = 0b10011100011;
+let hasNewValue = 1;
+let opNewValue = 0;
+let addrMode = PostInc;
+let accessSize = HalfWordAccess;
+let mayLoad = 1;
+let Constraints = "$Rx32 = $Rx32in";
+}
+def L2_loadbzw2_zomap : HInst<
+(outs IntRegs:$Rd32),
+(ins IntRegs:$Rs32),
+"$Rd32 = memubh($Rs32)",
+PSEUDO, TypeMAPPING> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+}
+def L2_loadbzw4_io : HInst<
+(outs DoubleRegs:$Rdd32),
+(ins IntRegs:$Rs32, s30_2Imm:$Ii),
+"$Rdd32 = memubh($Rs32+#$Ii)",
+LD_tc_ld_SLOT01, TypeLD>, Enc_9852473 {
+let Inst{24-21} = 0b0101;
+let Inst{31-27} = 0b10010;
+let addrMode = BaseImmOffset;
+let accessSize = WordAccess;
+let mayLoad = 1;
+let isExtendable = 1;
+let opExtendable = 2;
+let isExtentSigned = 1;
+let opExtentBits = 13;
+let opExtentAlign = 2;
+}
+def L2_loadbzw4_pbr : HInst<
+(outs DoubleRegs:$Rdd32, IntRegs:$Rx32),
+(ins IntRegs:$Rx32in, ModRegs:$Mu2),
+"$Rdd32 = memubh($Rx32++$Mu2:brev)",
+LD_tc_ld_SLOT01, TypeLD>, Enc_2901241 {
+let Inst{12-5} = 0b00000000;
+let Inst{31-21} = 0b10011110101;
+let accessSize = WordAccess;
+let mayLoad = 1;
+let Constraints = "$Rx32 = $Rx32in";
+}
+def L2_loadbzw4_pci : HInst<
+(outs DoubleRegs:$Rdd32, IntRegs:$Rx32),
+(ins IntRegs:$Rx32in, s4_2Imm:$Ii, ModRegs:$Mu2),
+"$Rdd32 = memubh($Rx32++#$Ii:circ($Mu2))",
+LD_tc_ld_SLOT01, TypeLD>, Enc_3931661 {
+let Inst{12-9} = 0b0000;
+let Inst{31-21} = 0b10011000101;
+let addrMode = PostInc;
+let accessSize = WordAccess;
+let mayLoad = 1;
+let Uses = [CS];
+let Constraints = "$Rx32 = $Rx32in";
+}
+def L2_loadbzw4_pcr : HInst<
+(outs DoubleRegs:$Rdd32, IntRegs:$Rx32),
+(ins IntRegs:$Rx32in, ModRegs:$Mu2),
+"$Rdd32 = memubh($Rx32++I:circ($Mu2))",
+LD_tc_ld_SLOT01, TypeLD>, Enc_2901241 {
+let Inst{12-5} = 0b00010000;
+let Inst{31-21} = 0b10011000101;
+let addrMode = PostInc;
+let accessSize = WordAccess;
+let mayLoad = 1;
+let Uses = [CS];
+let Constraints = "$Rx32 = $Rx32in";
+}
+def L2_loadbzw4_pi : HInst<
+(outs DoubleRegs:$Rdd32, IntRegs:$Rx32),
+(ins IntRegs:$Rx32in, s4_2Imm:$Ii),
+"$Rdd32 = memubh($Rx32++#$Ii)",
+LD_tc_ld_pi_SLOT01, TypeLD>, Enc_8752140 {
+let Inst{13-9} = 0b00000;
+let Inst{31-21} = 0b10011010101;
+let addrMode = PostInc;
+let accessSize = WordAccess;
+let mayLoad = 1;
+let Constraints = "$Rx32 = $Rx32in";
+}
+def L2_loadbzw4_pr : HInst<
+(outs DoubleRegs:$Rdd32, IntRegs:$Rx32),
+(ins IntRegs:$Rx32in, ModRegs:$Mu2),
+"$Rdd32 = memubh($Rx32++$Mu2)",
+LD_tc_ld_SLOT01, TypeLD>, Enc_2901241 {
+let Inst{12-5} = 0b00000000;
+let Inst{31-21} = 0b10011100101;
+let addrMode = PostInc;
+let accessSize = WordAccess;
+let mayLoad = 1;
+let Constraints = "$Rx32 = $Rx32in";
+}
+def L2_loadbzw4_zomap : HInst<
+(outs DoubleRegs:$Rdd32),
+(ins IntRegs:$Rs32),
+"$Rdd32 = memubh($Rs32)",
+PSEUDO, TypeMAPPING> {
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+}
+def L2_loadrb_io : HInst<
+(outs IntRegs:$Rd32),
+(ins IntRegs:$Rs32, s32_0Imm:$Ii),
+"$Rd32 = memb($Rs32+#$Ii)",
+LD_tc_ld_SLOT01, TypeLD>, Enc_14461004, AddrModeRel {
+let Inst{24-21} = 0b1000;
+let Inst{31-27} = 0b10010;
+let hasNewValue = 1;
+let opNewValue = 0;
+let addrMode = BaseImmOffset;
+let accessSize = ByteAccess;
+let mayLoad = 1;
+let CextOpcode = "L2_loadrb";
+let BaseOpcode = "L2_loadrb_io";
+let isPredicable = 1;
+let isExtendable = 1;
+let opExtendable = 2;
+let isExtentSigned = 1;
+let opExtentBits = 11;
+let opExtentAlign = 0;
+}
+def L2_loadrb_pbr : HInst<
+(outs IntRegs:$Rd32, IntRegs:$Rx32),
+(ins IntRegs:$Rx32in, ModRegs:$Mu2),
+"$Rd32 = memb($Rx32++$Mu2:brev)",
+LD_tc_ld_SLOT01, TypeLD>, Enc_48594 {
+let Inst{12-5} = 0b00000000;
+let Inst{31-21} = 0b10011111000;
+let hasNewValue = 1;
+let opNewValue = 0;
+let accessSize = ByteAccess;
+let mayLoad = 1;
+let Constraints = "$Rx32 = $Rx32in";
+}
+def L2_loadrb_pci : HInst<
+(outs IntRegs:$Rd32, IntRegs:$Rx32),
+(ins IntRegs:$Rx32in, s4_0Imm:$Ii, ModRegs:$Mu2),
+"$Rd32 = memb($Rx32++#$Ii:circ($Mu2))",
+LD_tc_ld_SLOT01, TypeLD>, Enc_16303398 {
+let Inst{12-9} = 0b0000;
+let Inst{31-21} = 0b10011001000;
+let hasNewValue = 1;
+let opNewValue = 0;
+let addrMode = PostInc;
+let accessSize = ByteAccess;
+let mayLoad = 1;
+let Uses = [CS];
+let Constraints = "$Rx32 = $Rx32in";
+}
+def L2_loadrb_pcr : HInst<
+(outs IntRegs:$Rd32, IntRegs:$Rx32),
+(ins IntRegs:$Rx32in, ModRegs:$Mu2),
+"$Rd32 = memb($Rx32++I:circ($Mu2))",
+LD_tc_ld_SLOT01, TypeLD>, Enc_48594 {
+let Inst{12-5} = 0b00010000;
+let Inst{31-21} = 0b10011001000;
+let hasNewValue = 1;
+let opNewValue = 0;
+let addrMode = PostInc;
+let accessSize = ByteAccess;
+let mayLoad = 1;
+let Uses = [CS];
+let Constraints = "$Rx32 = $Rx32in";
+}
+def L2_loadrb_pi : HInst<
+(outs IntRegs:$Rd32, IntRegs:$Rx32),
+(ins IntRegs:$Rx32in, s4_0Imm:$Ii),
+"$Rd32 = memb($Rx32++#$Ii)",
+LD_tc_ld_pi_SLOT01, TypeLD>, Enc_5598813, PredNewRel {
+let Inst{13-9} = 0b00000;
+let Inst{31-21} = 0b10011011000;
+let hasNewValue = 1;
+let opNewValue = 0;
+let addrMode = PostInc;
+let accessSize = ByteAccess;
+let mayLoad = 1;
+let BaseOpcode = "L2_loadrb_pi";
+let isPredicable = 1;
+let Constraints = "$Rx32 = $Rx32in";
+}
+def L2_loadrb_pr : HInst<
+(outs IntRegs:$Rd32, IntRegs:$Rx32),
+(ins IntRegs:$Rx32in, ModRegs:$Mu2),
+"$Rd32 = memb($Rx32++$Mu2)",
+LD_tc_ld_SLOT01, TypeLD>, Enc_48594 {
+let Inst{12-5} = 0b00000000;
+let Inst{31-21} = 0b10011101000;
+let hasNewValue = 1;
+let opNewValue = 0;
+let addrMode = PostInc;
+let accessSize = ByteAccess;
+let mayLoad = 1;
+let Constraints = "$Rx32 = $Rx32in";
+}
+def L2_loadrb_zomap : HInst<
+(outs IntRegs:$Rd32),
+(ins IntRegs:$Rs32),
+"$Rd32 = memb($Rs32)",
+PSEUDO, TypeMAPPING> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+}
+def L2_loadrbgp : HInst<
+(outs IntRegs:$Rd32),
+(ins u32_0Imm:$Ii),
+"$Rd32 = memb(gp+#$Ii)",
+V2LDST_tc_ld_SLOT01, TypeV2LDST>, Enc_1886960, AddrModeRel {
+let Inst{24-21} = 0b1000;
+let Inst{31-27} = 0b01001;
+let hasNewValue = 1;
+let opNewValue = 0;
+let accessSize = ByteAccess;
+let mayLoad = 1;
+let Uses = [GP];
+let BaseOpcode = "L4_loadrb_abs";
+let isPredicable = 1;
+let opExtendable = 1;
+let isExtentSigned = 0;
+let opExtentBits = 16;
+let opExtentAlign = 0;
+}
+def L2_loadrd_io : HInst<
+(outs DoubleRegs:$Rdd32),
+(ins IntRegs:$Rs32, s29_3Imm:$Ii),
+"$Rdd32 = memd($Rs32+#$Ii)",
+LD_tc_ld_SLOT01, TypeLD>, Enc_163381, AddrModeRel {
+let Inst{24-21} = 0b1110;
+let Inst{31-27} = 0b10010;
+let addrMode = BaseImmOffset;
+let accessSize = DoubleWordAccess;
+let mayLoad = 1;
+let CextOpcode = "L2_loadrd";
+let BaseOpcode = "L2_loadrd_io";
+let isPredicable = 1;
+let isExtendable = 1;
+let opExtendable = 2;
+let isExtentSigned = 1;
+let opExtentBits = 14;
+let opExtentAlign = 3;
+}
+def L2_loadrd_pbr : HInst<
+(outs DoubleRegs:$Rdd32, IntRegs:$Rx32),
+(ins IntRegs:$Rx32in, ModRegs:$Mu2),
+"$Rdd32 = memd($Rx32++$Mu2:brev)",
+LD_tc_ld_SLOT01, TypeLD>, Enc_2901241 {
+let Inst{12-5} = 0b00000000;
+let Inst{31-21} = 0b10011111110;
+let accessSize = DoubleWordAccess;
+let mayLoad = 1;
+let Constraints = "$Rx32 = $Rx32in";
+}
+def L2_loadrd_pci : HInst<
+(outs DoubleRegs:$Rdd32, IntRegs:$Rx32),
+(ins IntRegs:$Rx32in, s4_3Imm:$Ii, ModRegs:$Mu2),
+"$Rdd32 = memd($Rx32++#$Ii:circ($Mu2))",
+LD_tc_ld_SLOT01, TypeLD>, Enc_931653 {
+let Inst{12-9} = 0b0000;
+let Inst{31-21} = 0b10011001110;
+let addrMode = PostInc;
+let accessSize = DoubleWordAccess;
+let mayLoad = 1;
+let Uses = [CS];
+let Constraints = "$Rx32 = $Rx32in";
+}
+def L2_loadrd_pcr : HInst<
+(outs DoubleRegs:$Rdd32, IntRegs:$Rx32),
+(ins IntRegs:$Rx32in, ModRegs:$Mu2),
+"$Rdd32 = memd($Rx32++I:circ($Mu2))",
+LD_tc_ld_SLOT01, TypeLD>, Enc_2901241 {
+let Inst{12-5} = 0b00010000;
+let Inst{31-21} = 0b10011001110;
+let addrMode = PostInc;
+let accessSize = DoubleWordAccess;
+let mayLoad = 1;
+let Uses = [CS];
+let Constraints = "$Rx32 = $Rx32in";
+}
+def L2_loadrd_pi : HInst<
+(outs DoubleRegs:$Rdd32, IntRegs:$Rx32),
+(ins IntRegs:$Rx32in, s4_3Imm:$Ii),
+"$Rdd32 = memd($Rx32++#$Ii)",
+LD_tc_ld_pi_SLOT01, TypeLD>, Enc_9752128, PredNewRel {
+let Inst{13-9} = 0b00000;
+let Inst{31-21} = 0b10011011110;
+let addrMode = PostInc;
+let accessSize = DoubleWordAccess;
+let mayLoad = 1;
+let BaseOpcode = "L2_loadrd_pi";
+let isPredicable = 1;
+let Constraints = "$Rx32 = $Rx32in";
+}
+def L2_loadrd_pr : HInst<
+(outs DoubleRegs:$Rdd32, IntRegs:$Rx32),
+(ins IntRegs:$Rx32in, ModRegs:$Mu2),
+"$Rdd32 = memd($Rx32++$Mu2)",
+LD_tc_ld_SLOT01, TypeLD>, Enc_2901241 {
+let Inst{12-5} = 0b00000000;
+let Inst{31-21} = 0b10011101110;
+let addrMode = PostInc;
+let accessSize = DoubleWordAccess;
+let mayLoad = 1;
+let Constraints = "$Rx32 = $Rx32in";
+}
+def L2_loadrd_zomap : HInst<
+(outs DoubleRegs:$Rdd32),
+(ins IntRegs:$Rs32),
+"$Rdd32 = memd($Rs32)",
+PSEUDO, TypeMAPPING> {
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+}
+def L2_loadrdgp : HInst<
+(outs DoubleRegs:$Rdd32),
+(ins u29_3Imm:$Ii),
+"$Rdd32 = memd(gp+#$Ii)",
+V2LDST_tc_ld_SLOT01, TypeV2LDST>, Enc_4975051, AddrModeRel {
+let Inst{24-21} = 0b1110;
+let Inst{31-27} = 0b01001;
+let accessSize = DoubleWordAccess;
+let mayLoad = 1;
+let Uses = [GP];
+let BaseOpcode = "L4_loadrd_abs";
+let isPredicable = 1;
+let opExtendable = 1;
+let isExtentSigned = 0;
+let opExtentBits = 19;
+let opExtentAlign = 3;
+}
+def L2_loadrh_io : HInst<
+(outs IntRegs:$Rd32),
+(ins IntRegs:$Rs32, s31_1Imm:$Ii),
+"$Rd32 = memh($Rs32+#$Ii)",
+LD_tc_ld_SLOT01, TypeLD>, Enc_15275738, AddrModeRel {
+let Inst{24-21} = 0b1010;
+let Inst{31-27} = 0b10010;
+let hasNewValue = 1;
+let opNewValue = 0;
+let addrMode = BaseImmOffset;
+let accessSize = HalfWordAccess;
+let mayLoad = 1;
+let CextOpcode = "L2_loadrh";
+let BaseOpcode = "L2_loadrh_io";
+let isPredicable = 1;
+let isExtendable = 1;
+let opExtendable = 2;
+let isExtentSigned = 1;
+let opExtentBits = 12;
+let opExtentAlign = 1;
+}
+def L2_loadrh_pbr : HInst<
+(outs IntRegs:$Rd32, IntRegs:$Rx32),
+(ins IntRegs:$Rx32in, ModRegs:$Mu2),
+"$Rd32 = memh($Rx32++$Mu2:brev)",
+LD_tc_ld_SLOT01, TypeLD>, Enc_48594 {
+let Inst{12-5} = 0b00000000;
+let Inst{31-21} = 0b10011111010;
+let hasNewValue = 1;
+let opNewValue = 0;
+let accessSize = HalfWordAccess;
+let mayLoad = 1;
+let Constraints = "$Rx32 = $Rx32in";
+}
+def L2_loadrh_pci : HInst<
+(outs IntRegs:$Rd32, IntRegs:$Rx32),
+(ins IntRegs:$Rx32in, s4_1Imm:$Ii, ModRegs:$Mu2),
+"$Rd32 = memh($Rx32++#$Ii:circ($Mu2))",
+LD_tc_ld_SLOT01, TypeLD>, Enc_13303422 {
+let Inst{12-9} = 0b0000;
+let Inst{31-21} = 0b10011001010;
+let hasNewValue = 1;
+let opNewValue = 0;
+let addrMode = PostInc;
+let accessSize = HalfWordAccess;
+let mayLoad = 1;
+let Uses = [CS];
+let Constraints = "$Rx32 = $Rx32in";
+}
+def L2_loadrh_pcr : HInst<
+(outs IntRegs:$Rd32, IntRegs:$Rx32),
+(ins IntRegs:$Rx32in, ModRegs:$Mu2),
+"$Rd32 = memh($Rx32++I:circ($Mu2))",
+LD_tc_ld_SLOT01, TypeLD>, Enc_48594 {
+let Inst{12-5} = 0b00010000;
+let Inst{31-21} = 0b10011001010;
+let hasNewValue = 1;
+let opNewValue = 0;
+let addrMode = PostInc;
+let accessSize = HalfWordAccess;
+let mayLoad = 1;
+let Uses = [CS];
+let Constraints = "$Rx32 = $Rx32in";
+}
+def L2_loadrh_pi : HInst<
+(outs IntRegs:$Rd32, IntRegs:$Rx32),
+(ins IntRegs:$Rx32in, s4_1Imm:$Ii),
+"$Rd32 = memh($Rx32++#$Ii)",
+LD_tc_ld_pi_SLOT01, TypeLD>, Enc_15376009, PredNewRel {
+let Inst{13-9} = 0b00000;
+let Inst{31-21} = 0b10011011010;
+let hasNewValue = 1;
+let opNewValue = 0;
+let addrMode = PostInc;
+let accessSize = HalfWordAccess;
+let mayLoad = 1;
+let BaseOpcode = "L2_loadrh_pi";
+let isPredicable = 1;
+let Constraints = "$Rx32 = $Rx32in";
+}
+def L2_loadrh_pr : HInst<
+(outs IntRegs:$Rd32, IntRegs:$Rx32),
+(ins IntRegs:$Rx32in, ModRegs:$Mu2),
+"$Rd32 = memh($Rx32++$Mu2)",
+LD_tc_ld_SLOT01, TypeLD>, Enc_48594 {
+let Inst{12-5} = 0b00000000;
+let Inst{31-21} = 0b10011101010;
+let hasNewValue = 1;
+let opNewValue = 0;
+let addrMode = PostInc;
+let accessSize = HalfWordAccess;
+let mayLoad = 1;
+let Constraints = "$Rx32 = $Rx32in";
+}
+def L2_loadrh_zomap : HInst<
+(outs IntRegs:$Rd32),
+(ins IntRegs:$Rs32),
+"$Rd32 = memh($Rs32)",
+PSEUDO, TypeMAPPING> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+}
+def L2_loadrhgp : HInst<
+(outs IntRegs:$Rd32),
+(ins u31_1Imm:$Ii),
+"$Rd32 = memh(gp+#$Ii)",
+V2LDST_tc_ld_SLOT01, TypeV2LDST>, Enc_12608570, AddrModeRel {
+let Inst{24-21} = 0b1010;
+let Inst{31-27} = 0b01001;
+let hasNewValue = 1;
+let opNewValue = 0;
+let accessSize = HalfWordAccess;
+let mayLoad = 1;
+let Uses = [GP];
+let BaseOpcode = "L4_loadrh_abs";
+let isPredicable = 1;
+let opExtendable = 1;
+let isExtentSigned = 0;
+let opExtentBits = 17;
+let opExtentAlign = 1;
+}
+def L2_loadri_io : HInst<
+(outs IntRegs:$Rd32),
+(ins IntRegs:$Rs32, s30_2Imm:$Ii),
+"$Rd32 = memw($Rs32+#$Ii)",
+LD_tc_ld_SLOT01, TypeLD>, Enc_8990840, AddrModeRel {
+let Inst{24-21} = 0b1100;
+let Inst{31-27} = 0b10010;
+let hasNewValue = 1;
+let opNewValue = 0;
+let addrMode = BaseImmOffset;
+let accessSize = WordAccess;
+let mayLoad = 1;
+let CextOpcode = "L2_loadri";
+let BaseOpcode = "L2_loadri_io";
+let isPredicable = 1;
+let isExtendable = 1;
+let opExtendable = 2;
+let isExtentSigned = 1;
+let opExtentBits = 13;
+let opExtentAlign = 2;
+}
+def L2_loadri_pbr : HInst<
+(outs IntRegs:$Rd32, IntRegs:$Rx32),
+(ins IntRegs:$Rx32in, ModRegs:$Mu2),
+"$Rd32 = memw($Rx32++$Mu2:brev)",
+LD_tc_ld_SLOT01, TypeLD>, Enc_48594 {
+let Inst{12-5} = 0b00000000;
+let Inst{31-21} = 0b10011111100;
+let hasNewValue = 1;
+let opNewValue = 0;
+let accessSize = WordAccess;
+let mayLoad = 1;
+let Constraints = "$Rx32 = $Rx32in";
+}
+def L2_loadri_pci : HInst<
+(outs IntRegs:$Rd32, IntRegs:$Rx32),
+(ins IntRegs:$Rx32in, s4_2Imm:$Ii, ModRegs:$Mu2),
+"$Rd32 = memw($Rx32++#$Ii:circ($Mu2))",
+LD_tc_ld_SLOT01, TypeLD>, Enc_14303394 {
+let Inst{12-9} = 0b0000;
+let Inst{31-21} = 0b10011001100;
+let hasNewValue = 1;
+let opNewValue = 0;
+let addrMode = PostInc;
+let accessSize = WordAccess;
+let mayLoad = 1;
+let Uses = [CS];
+let Constraints = "$Rx32 = $Rx32in";
+}
+def L2_loadri_pcr : HInst<
+(outs IntRegs:$Rd32, IntRegs:$Rx32),
+(ins IntRegs:$Rx32in, ModRegs:$Mu2),
+"$Rd32 = memw($Rx32++I:circ($Mu2))",
+LD_tc_ld_SLOT01, TypeLD>, Enc_48594 {
+let Inst{12-5} = 0b00010000;
+let Inst{31-21} = 0b10011001100;
+let hasNewValue = 1;
+let opNewValue = 0;
+let addrMode = PostInc;
+let accessSize = WordAccess;
+let mayLoad = 1;
+let Uses = [CS];
+let Constraints = "$Rx32 = $Rx32in";
+}
+def L2_loadri_pi : HInst<
+(outs IntRegs:$Rd32, IntRegs:$Rx32),
+(ins IntRegs:$Rx32in, s4_2Imm:$Ii),
+"$Rd32 = memw($Rx32++#$Ii)",
+LD_tc_ld_pi_SLOT01, TypeLD>, Enc_16376009, PredNewRel {
+let Inst{13-9} = 0b00000;
+let Inst{31-21} = 0b10011011100;
+let hasNewValue = 1;
+let opNewValue = 0;
+let addrMode = PostInc;
+let accessSize = WordAccess;
+let mayLoad = 1;
+let BaseOpcode = "L2_loadri_pi";
+let isPredicable = 1;
+let Constraints = "$Rx32 = $Rx32in";
+}
+def L2_loadri_pr : HInst<
+(outs IntRegs:$Rd32, IntRegs:$Rx32),
+(ins IntRegs:$Rx32in, ModRegs:$Mu2),
+"$Rd32 = memw($Rx32++$Mu2)",
+LD_tc_ld_SLOT01, TypeLD>, Enc_48594 {
+let Inst{12-5} = 0b00000000;
+let Inst{31-21} = 0b10011101100;
+let hasNewValue = 1;
+let opNewValue = 0;
+let addrMode = PostInc;
+let accessSize = WordAccess;
+let mayLoad = 1;
+let Constraints = "$Rx32 = $Rx32in";
+}
+def L2_loadri_zomap : HInst<
+(outs IntRegs:$Rd32),
+(ins IntRegs:$Rs32),
+"$Rd32 = memw($Rs32)",
+PSEUDO, TypeMAPPING> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+}
+def L2_loadrigp : HInst<
+(outs IntRegs:$Rd32),
+(ins u30_2Imm:$Ii),
+"$Rd32 = memw(gp+#$Ii)",
+V2LDST_tc_ld_SLOT01, TypeV2LDST>, Enc_8814718, AddrModeRel {
+let Inst{24-21} = 0b1100;
+let Inst{31-27} = 0b01001;
+let hasNewValue = 1;
+let opNewValue = 0;
+let accessSize = WordAccess;
+let mayLoad = 1;
+let Uses = [GP];
+let BaseOpcode = "L4_loadri_abs";
+let isPredicable = 1;
+let opExtendable = 1;
+let isExtentSigned = 0;
+let opExtentBits = 18;
+let opExtentAlign = 2;
+}
+def L2_loadrub_io : HInst<
+(outs IntRegs:$Rd32),
+(ins IntRegs:$Rs32, s32_0Imm:$Ii),
+"$Rd32 = memub($Rs32+#$Ii)",
+LD_tc_ld_SLOT01, TypeLD>, Enc_14461004, AddrModeRel {
+let Inst{24-21} = 0b1001;
+let Inst{31-27} = 0b10010;
+let hasNewValue = 1;
+let opNewValue = 0;
+let addrMode = BaseImmOffset;
+let accessSize = ByteAccess;
+let mayLoad = 1;
+let CextOpcode = "L2_loadrub";
+let BaseOpcode = "L2_loadrub_io";
+let isPredicable = 1;
+let isExtendable = 1;
+let opExtendable = 2;
+let isExtentSigned = 1;
+let opExtentBits = 11;
+let opExtentAlign = 0;
+}
+def L2_loadrub_pbr : HInst<
+(outs IntRegs:$Rd32, IntRegs:$Rx32),
+(ins IntRegs:$Rx32in, ModRegs:$Mu2),
+"$Rd32 = memub($Rx32++$Mu2:brev)",
+LD_tc_ld_SLOT01, TypeLD>, Enc_48594 {
+let Inst{12-5} = 0b00000000;
+let Inst{31-21} = 0b10011111001;
+let hasNewValue = 1;
+let opNewValue = 0;
+let accessSize = ByteAccess;
+let mayLoad = 1;
+let Constraints = "$Rx32 = $Rx32in";
+}
+def L2_loadrub_pci : HInst<
+(outs IntRegs:$Rd32, IntRegs:$Rx32),
+(ins IntRegs:$Rx32in, s4_0Imm:$Ii, ModRegs:$Mu2),
+"$Rd32 = memub($Rx32++#$Ii:circ($Mu2))",
+LD_tc_ld_SLOT01, TypeLD>, Enc_16303398 {
+let Inst{12-9} = 0b0000;
+let Inst{31-21} = 0b10011001001;
+let hasNewValue = 1;
+let opNewValue = 0;
+let addrMode = PostInc;
+let accessSize = ByteAccess;
+let mayLoad = 1;
+let Uses = [CS];
+let Constraints = "$Rx32 = $Rx32in";
+}
+def L2_loadrub_pcr : HInst<
+(outs IntRegs:$Rd32, IntRegs:$Rx32),
+(ins IntRegs:$Rx32in, ModRegs:$Mu2),
+"$Rd32 = memub($Rx32++I:circ($Mu2))",
+LD_tc_ld_SLOT01, TypeLD>, Enc_48594 {
+let Inst{12-5} = 0b00010000;
+let Inst{31-21} = 0b10011001001;
+let hasNewValue = 1;
+let opNewValue = 0;
+let addrMode = PostInc;
+let accessSize = ByteAccess;
+let mayLoad = 1;
+let Uses = [CS];
+let Constraints = "$Rx32 = $Rx32in";
+}
+def L2_loadrub_pi : HInst<
+(outs IntRegs:$Rd32, IntRegs:$Rx32),
+(ins IntRegs:$Rx32in, s4_0Imm:$Ii),
+"$Rd32 = memub($Rx32++#$Ii)",
+LD_tc_ld_pi_SLOT01, TypeLD>, Enc_5598813, PredNewRel {
+let Inst{13-9} = 0b00000;
+let Inst{31-21} = 0b10011011001;
+let hasNewValue = 1;
+let opNewValue = 0;
+let addrMode = PostInc;
+let accessSize = ByteAccess;
+let mayLoad = 1;
+let BaseOpcode = "L2_loadrub_pi";
+let isPredicable = 1;
+let Constraints = "$Rx32 = $Rx32in";
+}
+def L2_loadrub_pr : HInst<
+(outs IntRegs:$Rd32, IntRegs:$Rx32),
+(ins IntRegs:$Rx32in, ModRegs:$Mu2),
+"$Rd32 = memub($Rx32++$Mu2)",
+LD_tc_ld_SLOT01, TypeLD>, Enc_48594 {
+let Inst{12-5} = 0b00000000;
+let Inst{31-21} = 0b10011101001;
+let hasNewValue = 1;
+let opNewValue = 0;
+let addrMode = PostInc;
+let accessSize = ByteAccess;
+let mayLoad = 1;
+let Constraints = "$Rx32 = $Rx32in";
+}
+def L2_loadrub_zomap : HInst<
+(outs IntRegs:$Rd32),
+(ins IntRegs:$Rs32),
+"$Rd32 = memub($Rs32)",
+PSEUDO, TypeMAPPING> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+}
+def L2_loadrubgp : HInst<
+(outs IntRegs:$Rd32),
+(ins u32_0Imm:$Ii),
+"$Rd32 = memub(gp+#$Ii)",
+V2LDST_tc_ld_SLOT01, TypeV2LDST>, Enc_1886960, AddrModeRel {
+let Inst{24-21} = 0b1001;
+let Inst{31-27} = 0b01001;
+let hasNewValue = 1;
+let opNewValue = 0;
+let accessSize = ByteAccess;
+let mayLoad = 1;
+let Uses = [GP];
+let BaseOpcode = "L4_loadrub_abs";
+let isPredicable = 1;
+let opExtendable = 1;
+let isExtentSigned = 0;
+let opExtentBits = 16;
+let opExtentAlign = 0;
+}
+def L2_loadruh_io : HInst<
+(outs IntRegs:$Rd32),
+(ins IntRegs:$Rs32, s31_1Imm:$Ii),
+"$Rd32 = memuh($Rs32+#$Ii)",
+LD_tc_ld_SLOT01, TypeLD>, Enc_15275738, AddrModeRel {
+let Inst{24-21} = 0b1011;
+let Inst{31-27} = 0b10010;
+let hasNewValue = 1;
+let opNewValue = 0;
+let addrMode = BaseImmOffset;
+let accessSize = HalfWordAccess;
+let mayLoad = 1;
+let CextOpcode = "L2_loadruh";
+let BaseOpcode = "L2_loadruh_io";
+let isPredicable = 1;
+let isExtendable = 1;
+let opExtendable = 2;
+let isExtentSigned = 1;
+let opExtentBits = 12;
+let opExtentAlign = 1;
+}
+def L2_loadruh_pbr : HInst<
+(outs IntRegs:$Rd32, IntRegs:$Rx32),
+(ins IntRegs:$Rx32in, ModRegs:$Mu2),
+"$Rd32 = memuh($Rx32++$Mu2:brev)",
+LD_tc_ld_SLOT01, TypeLD>, Enc_48594 {
+let Inst{12-5} = 0b00000000;
+let Inst{31-21} = 0b10011111011;
+let hasNewValue = 1;
+let opNewValue = 0;
+let accessSize = HalfWordAccess;
+let mayLoad = 1;
+let Constraints = "$Rx32 = $Rx32in";
+}
+def L2_loadruh_pci : HInst<
+(outs IntRegs:$Rd32, IntRegs:$Rx32),
+(ins IntRegs:$Rx32in, s4_1Imm:$Ii, ModRegs:$Mu2),
+"$Rd32 = memuh($Rx32++#$Ii:circ($Mu2))",
+LD_tc_ld_SLOT01, TypeLD>, Enc_13303422 {
+let Inst{12-9} = 0b0000;
+let Inst{31-21} = 0b10011001011;
+let hasNewValue = 1;
+let opNewValue = 0;
+let addrMode = PostInc;
+let accessSize = HalfWordAccess;
+let mayLoad = 1;
+let Uses = [CS];
+let Constraints = "$Rx32 = $Rx32in";
+}
+def L2_loadruh_pcr : HInst<
+(outs IntRegs:$Rd32, IntRegs:$Rx32),
+(ins IntRegs:$Rx32in, ModRegs:$Mu2),
+"$Rd32 = memuh($Rx32++I:circ($Mu2))",
+LD_tc_ld_SLOT01, TypeLD>, Enc_48594 {
+let Inst{12-5} = 0b00010000;
+let Inst{31-21} = 0b10011001011;
+let hasNewValue = 1;
+let opNewValue = 0;
+let addrMode = PostInc;
+let accessSize = HalfWordAccess;
+let mayLoad = 1;
+let Uses = [CS];
+let Constraints = "$Rx32 = $Rx32in";
+}
+def L2_loadruh_pi : HInst<
+(outs IntRegs:$Rd32, IntRegs:$Rx32),
+(ins IntRegs:$Rx32in, s4_1Imm:$Ii),
+"$Rd32 = memuh($Rx32++#$Ii)",
+LD_tc_ld_pi_SLOT01, TypeLD>, Enc_15376009, PredNewRel {
+let Inst{13-9} = 0b00000;
+let Inst{31-21} = 0b10011011011;
+let hasNewValue = 1;
+let opNewValue = 0;
+let addrMode = PostInc;
+let accessSize = HalfWordAccess;
+let mayLoad = 1;
+let BaseOpcode = "L2_loadruh_pi";
+let isPredicable = 1;
+let Constraints = "$Rx32 = $Rx32in";
+}
+def L2_loadruh_pr : HInst<
+(outs IntRegs:$Rd32, IntRegs:$Rx32),
+(ins IntRegs:$Rx32in, ModRegs:$Mu2),
+"$Rd32 = memuh($Rx32++$Mu2)",
+LD_tc_ld_SLOT01, TypeLD>, Enc_48594 {
+let Inst{12-5} = 0b00000000;
+let Inst{31-21} = 0b10011101011;
+let hasNewValue = 1;
+let opNewValue = 0;
+let addrMode = PostInc;
+let accessSize = HalfWordAccess;
+let mayLoad = 1;
+let Constraints = "$Rx32 = $Rx32in";
+}
+def L2_loadruh_zomap : HInst<
+(outs IntRegs:$Rd32),
+(ins IntRegs:$Rs32),
+"$Rd32 = memuh($Rs32)",
+PSEUDO, TypeMAPPING> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+}
+def L2_loadruhgp : HInst<
+(outs IntRegs:$Rd32),
+(ins u31_1Imm:$Ii),
+"$Rd32 = memuh(gp+#$Ii)",
+V2LDST_tc_ld_SLOT01, TypeV2LDST>, Enc_12608570, AddrModeRel {
+let Inst{24-21} = 0b1011;
+let Inst{31-27} = 0b01001;
+let hasNewValue = 1;
+let opNewValue = 0;
+let accessSize = HalfWordAccess;
+let mayLoad = 1;
+let Uses = [GP];
+let BaseOpcode = "L4_loadruh_abs";
+let isPredicable = 1;
+let opExtendable = 1;
+let isExtentSigned = 0;
+let opExtentBits = 17;
+let opExtentAlign = 1;
+}
+def L2_loadw_locked : HInst<
+(outs IntRegs:$Rd32),
+(ins IntRegs:$Rs32),
+"$Rd32 = memw_locked($Rs32)",
+LD_tc_ld_SLOT0, TypeLD>, Enc_4075554 {
+let Inst{13-5} = 0b000000000;
+let Inst{31-21} = 0b10010010000;
+let hasNewValue = 1;
+let opNewValue = 0;
+let accessSize = WordAccess;
+let isSoloAX = 1;
+let mayLoad = 1;
+}
+def L2_ploadrbf_io : HInst<
+(outs IntRegs:$Rd32),
+(ins PredRegs:$Pt4, IntRegs:$Rs32, u32_0Imm:$Ii),
+"if (!$Pt4) $Rd32 = memb($Rs32+#$Ii)",
+V2LDST_tc_ld_SLOT01, TypeV2LDST>, Enc_4835423, AddrModeRel {
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b01000101000;
+let isPredicated = 1;
+let isPredicatedFalse = 1;
+let hasNewValue = 1;
+let opNewValue = 0;
+let addrMode = BaseImmOffset;
+let accessSize = ByteAccess;
+let mayLoad = 1;
+let CextOpcode = "L2_loadrb";
+let BaseOpcode = "L2_loadrb_io";
+let isExtendable = 1;
+let opExtendable = 3;
+let isExtentSigned = 0;
+let opExtentBits = 6;
+let opExtentAlign = 0;
+}
+def L2_ploadrbf_pi : HInst<
+(outs IntRegs:$Rd32, IntRegs:$Rx32),
+(ins PredRegs:$Pt4, IntRegs:$Rx32in, s4_0Imm:$Ii),
+"if (!$Pt4) $Rd32 = memb($Rx32++#$Ii)",
+LD_tc_ld_pi_SLOT01, TypeLD>, Enc_12212978, PredNewRel {
+let Inst{13-11} = 0b101;
+let Inst{31-21} = 0b10011011000;
+let isPredicated = 1;
+let isPredicatedFalse = 1;
+let hasNewValue = 1;
+let opNewValue = 0;
+let addrMode = PostInc;
+let accessSize = ByteAccess;
+let mayLoad = 1;
+let BaseOpcode = "L2_loadrb_pi";
+let Constraints = "$Rx32 = $Rx32in";
+}
+def L2_ploadrbf_zomap : HInst<
+(outs IntRegs:$Rd32),
+(ins PredRegs:$Pt4, IntRegs:$Rs32),
+"if (!$Pt4) $Rd32 = memb($Rs32)",
+PSEUDO, TypeMAPPING> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+}
+def L2_ploadrbfnew_io : HInst<
+(outs IntRegs:$Rd32),
+(ins PredRegs:$Pt4, IntRegs:$Rs32, u32_0Imm:$Ii),
+"if (!$Pt4.new) $Rd32 = memb($Rs32+#$Ii)",
+V2LDST_tc_ld_SLOT01, TypeV2LDST>, Enc_4835423, AddrModeRel {
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b01000111000;
+let isPredicated = 1;
+let isPredicatedFalse = 1;
+let hasNewValue = 1;
+let opNewValue = 0;
+let addrMode = BaseImmOffset;
+let accessSize = ByteAccess;
+let isPredicatedNew = 1;
+let mayLoad = 1;
+let CextOpcode = "L2_loadrb";
+let BaseOpcode = "L2_loadrb_io";
+let isExtendable = 1;
+let opExtendable = 3;
+let isExtentSigned = 0;
+let opExtentBits = 6;
+let opExtentAlign = 0;
+}
+def L2_ploadrbfnew_pi : HInst<
+(outs IntRegs:$Rd32, IntRegs:$Rx32),
+(ins PredRegs:$Pt4, IntRegs:$Rx32in, s4_0Imm:$Ii),
+"if (!$Pt4.new) $Rd32 = memb($Rx32++#$Ii)",
+LD_tc_ld_pi_SLOT01, TypeLD>, Enc_12212978, PredNewRel {
+let Inst{13-11} = 0b111;
+let Inst{31-21} = 0b10011011000;
+let isPredicated = 1;
+let isPredicatedFalse = 1;
+let hasNewValue = 1;
+let opNewValue = 0;
+let addrMode = PostInc;
+let accessSize = ByteAccess;
+let isPredicatedNew = 1;
+let mayLoad = 1;
+let BaseOpcode = "L2_loadrb_pi";
+let Constraints = "$Rx32 = $Rx32in";
+}
+def L2_ploadrbfnew_zomap : HInst<
+(outs IntRegs:$Rd32),
+(ins PredRegs:$Pt4, IntRegs:$Rs32),
+"if (!$Pt4.new) $Rd32 = memb($Rs32)",
+PSEUDO, TypeMAPPING> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+}
+def L2_ploadrbt_io : HInst<
+(outs IntRegs:$Rd32),
+(ins PredRegs:$Pt4, IntRegs:$Rs32, u32_0Imm:$Ii),
+"if ($Pt4) $Rd32 = memb($Rs32+#$Ii)",
+V2LDST_tc_ld_SLOT01, TypeV2LDST>, Enc_4835423, AddrModeRel {
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b01000001000;
+let isPredicated = 1;
+let hasNewValue = 1;
+let opNewValue = 0;
+let addrMode = BaseImmOffset;
+let accessSize = ByteAccess;
+let mayLoad = 1;
+let CextOpcode = "L2_loadrb";
+let BaseOpcode = "L2_loadrb_io";
+let isExtendable = 1;
+let opExtendable = 3;
+let isExtentSigned = 0;
+let opExtentBits = 6;
+let opExtentAlign = 0;
+}
+def L2_ploadrbt_pi : HInst<
+(outs IntRegs:$Rd32, IntRegs:$Rx32),
+(ins PredRegs:$Pt4, IntRegs:$Rx32in, s4_0Imm:$Ii),
+"if ($Pt4) $Rd32 = memb($Rx32++#$Ii)",
+LD_tc_ld_pi_SLOT01, TypeLD>, Enc_12212978, PredNewRel {
+let Inst{13-11} = 0b100;
+let Inst{31-21} = 0b10011011000;
+let isPredicated = 1;
+let hasNewValue = 1;
+let opNewValue = 0;
+let addrMode = PostInc;
+let accessSize = ByteAccess;
+let mayLoad = 1;
+let BaseOpcode = "L2_loadrb_pi";
+let Constraints = "$Rx32 = $Rx32in";
+}
+def L2_ploadrbt_zomap : HInst<
+(outs IntRegs:$Rd32),
+(ins PredRegs:$Pt4, IntRegs:$Rs32),
+"if ($Pt4) $Rd32 = memb($Rs32)",
+PSEUDO, TypeMAPPING> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+}
+def L2_ploadrbtnew_io : HInst<
+(outs IntRegs:$Rd32),
+(ins PredRegs:$Pt4, IntRegs:$Rs32, u32_0Imm:$Ii),
+"if ($Pt4.new) $Rd32 = memb($Rs32+#$Ii)",
+V2LDST_tc_ld_SLOT01, TypeV2LDST>, Enc_4835423, AddrModeRel {
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b01000011000;
+let isPredicated = 1;
+let hasNewValue = 1;
+let opNewValue = 0;
+let addrMode = BaseImmOffset;
+let accessSize = ByteAccess;
+let isPredicatedNew = 1;
+let mayLoad = 1;
+let CextOpcode = "L2_loadrb";
+let BaseOpcode = "L2_loadrb_io";
+let isExtendable = 1;
+let opExtendable = 3;
+let isExtentSigned = 0;
+let opExtentBits = 6;
+let opExtentAlign = 0;
+}
+def L2_ploadrbtnew_pi : HInst<
+(outs IntRegs:$Rd32, IntRegs:$Rx32),
+(ins PredRegs:$Pt4, IntRegs:$Rx32in, s4_0Imm:$Ii),
+"if ($Pt4.new) $Rd32 = memb($Rx32++#$Ii)",
+LD_tc_ld_pi_SLOT01, TypeLD>, Enc_12212978, PredNewRel {
+let Inst{13-11} = 0b110;
+let Inst{31-21} = 0b10011011000;
+let isPredicated = 1;
+let hasNewValue = 1;
+let opNewValue = 0;
+let addrMode = PostInc;
+let accessSize = ByteAccess;
+let isPredicatedNew = 1;
+let mayLoad = 1;
+let BaseOpcode = "L2_loadrb_pi";
+let Constraints = "$Rx32 = $Rx32in";
+}
+def L2_ploadrbtnew_zomap : HInst<
+(outs IntRegs:$Rd32),
+(ins PredRegs:$Pt4, IntRegs:$Rs32),
+"if ($Pt4.new) $Rd32 = memb($Rs32)",
+PSEUDO, TypeMAPPING> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+}
+def L2_ploadrdf_io : HInst<
+(outs DoubleRegs:$Rdd32),
+(ins PredRegs:$Pt4, IntRegs:$Rs32, u29_3Imm:$Ii),
+"if (!$Pt4) $Rdd32 = memd($Rs32+#$Ii)",
+V2LDST_tc_ld_SLOT01, TypeV2LDST>, Enc_677558, AddrModeRel {
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b01000101110;
+let isPredicated = 1;
+let isPredicatedFalse = 1;
+let addrMode = BaseImmOffset;
+let accessSize = DoubleWordAccess;
+let mayLoad = 1;
+let CextOpcode = "L2_loadrd";
+let BaseOpcode = "L2_loadrd_io";
+let isExtendable = 1;
+let opExtendable = 3;
+let isExtentSigned = 0;
+let opExtentBits = 9;
+let opExtentAlign = 3;
+}
+def L2_ploadrdf_pi : HInst<
+(outs DoubleRegs:$Rdd32, IntRegs:$Rx32),
+(ins PredRegs:$Pt4, IntRegs:$Rx32in, s4_3Imm:$Ii),
+"if (!$Pt4) $Rdd32 = memd($Rx32++#$Ii)",
+LD_tc_ld_pi_SLOT01, TypeLD>, Enc_5611087, PredNewRel {
+let Inst{13-11} = 0b101;
+let Inst{31-21} = 0b10011011110;
+let isPredicated = 1;
+let isPredicatedFalse = 1;
+let addrMode = PostInc;
+let accessSize = DoubleWordAccess;
+let mayLoad = 1;
+let BaseOpcode = "L2_loadrd_pi";
+let Constraints = "$Rx32 = $Rx32in";
+}
+def L2_ploadrdf_zomap : HInst<
+(outs DoubleRegs:$Rdd32),
+(ins PredRegs:$Pt4, IntRegs:$Rs32),
+"if (!$Pt4) $Rdd32 = memd($Rs32)",
+PSEUDO, TypeMAPPING> {
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+}
+def L2_ploadrdfnew_io : HInst<
+(outs DoubleRegs:$Rdd32),
+(ins PredRegs:$Pt4, IntRegs:$Rs32, u29_3Imm:$Ii),
+"if (!$Pt4.new) $Rdd32 = memd($Rs32+#$Ii)",
+V2LDST_tc_ld_SLOT01, TypeV2LDST>, Enc_677558, AddrModeRel {
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b01000111110;
+let isPredicated = 1;
+let isPredicatedFalse = 1;
+let addrMode = BaseImmOffset;
+let accessSize = DoubleWordAccess;
+let isPredicatedNew = 1;
+let mayLoad = 1;
+let CextOpcode = "L2_loadrd";
+let BaseOpcode = "L2_loadrd_io";
+let isExtendable = 1;
+let opExtendable = 3;
+let isExtentSigned = 0;
+let opExtentBits = 9;
+let opExtentAlign = 3;
+}
+def L2_ploadrdfnew_pi : HInst<
+(outs DoubleRegs:$Rdd32, IntRegs:$Rx32),
+(ins PredRegs:$Pt4, IntRegs:$Rx32in, s4_3Imm:$Ii),
+"if (!$Pt4.new) $Rdd32 = memd($Rx32++#$Ii)",
+LD_tc_ld_pi_SLOT01, TypeLD>, Enc_5611087, PredNewRel {
+let Inst{13-11} = 0b111;
+let Inst{31-21} = 0b10011011110;
+let isPredicated = 1;
+let isPredicatedFalse = 1;
+let addrMode = PostInc;
+let accessSize = DoubleWordAccess;
+let isPredicatedNew = 1;
+let mayLoad = 1;
+let BaseOpcode = "L2_loadrd_pi";
+let Constraints = "$Rx32 = $Rx32in";
+}
+def L2_ploadrdfnew_zomap : HInst<
+(outs DoubleRegs:$Rdd32),
+(ins PredRegs:$Pt4, IntRegs:$Rs32),
+"if (!$Pt4.new) $Rdd32 = memd($Rs32)",
+PSEUDO, TypeMAPPING> {
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+}
+def L2_ploadrdt_io : HInst<
+(outs DoubleRegs:$Rdd32),
+(ins PredRegs:$Pt4, IntRegs:$Rs32, u29_3Imm:$Ii),
+"if ($Pt4) $Rdd32 = memd($Rs32+#$Ii)",
+V2LDST_tc_ld_SLOT01, TypeV2LDST>, Enc_677558, AddrModeRel {
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b01000001110;
+let isPredicated = 1;
+let addrMode = BaseImmOffset;
+let accessSize = DoubleWordAccess;
+let mayLoad = 1;
+let CextOpcode = "L2_loadrd";
+let BaseOpcode = "L2_loadrd_io";
+let isExtendable = 1;
+let opExtendable = 3;
+let isExtentSigned = 0;
+let opExtentBits = 9;
+let opExtentAlign = 3;
+}
+def L2_ploadrdt_pi : HInst<
+(outs DoubleRegs:$Rdd32, IntRegs:$Rx32),
+(ins PredRegs:$Pt4, IntRegs:$Rx32in, s4_3Imm:$Ii),
+"if ($Pt4) $Rdd32 = memd($Rx32++#$Ii)",
+LD_tc_ld_pi_SLOT01, TypeLD>, Enc_5611087, PredNewRel {
+let Inst{13-11} = 0b100;
+let Inst{31-21} = 0b10011011110;
+let isPredicated = 1;
+let addrMode = PostInc;
+let accessSize = DoubleWordAccess;
+let mayLoad = 1;
+let BaseOpcode = "L2_loadrd_pi";
+let Constraints = "$Rx32 = $Rx32in";
+}
+def L2_ploadrdt_zomap : HInst<
+(outs DoubleRegs:$Rdd32),
+(ins PredRegs:$Pt4, IntRegs:$Rs32),
+"if ($Pt4) $Rdd32 = memd($Rs32)",
+PSEUDO, TypeMAPPING> {
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+}
+def L2_ploadrdtnew_io : HInst<
+(outs DoubleRegs:$Rdd32),
+(ins PredRegs:$Pt4, IntRegs:$Rs32, u29_3Imm:$Ii),
+"if ($Pt4.new) $Rdd32 = memd($Rs32+#$Ii)",
+V2LDST_tc_ld_SLOT01, TypeV2LDST>, Enc_677558, AddrModeRel {
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b01000011110;
+let isPredicated = 1;
+let addrMode = BaseImmOffset;
+let accessSize = DoubleWordAccess;
+let isPredicatedNew = 1;
+let mayLoad = 1;
+let CextOpcode = "L2_loadrd";
+let BaseOpcode = "L2_loadrd_io";
+let isExtendable = 1;
+let opExtendable = 3;
+let isExtentSigned = 0;
+let opExtentBits = 9;
+let opExtentAlign = 3;
+}
+def L2_ploadrdtnew_pi : HInst<
+(outs DoubleRegs:$Rdd32, IntRegs:$Rx32),
+(ins PredRegs:$Pt4, IntRegs:$Rx32in, s4_3Imm:$Ii),
+"if ($Pt4.new) $Rdd32 = memd($Rx32++#$Ii)",
+LD_tc_ld_pi_SLOT01, TypeLD>, Enc_5611087, PredNewRel {
+let Inst{13-11} = 0b110;
+let Inst{31-21} = 0b10011011110;
+let isPredicated = 1;
+let addrMode = PostInc;
+let accessSize = DoubleWordAccess;
+let isPredicatedNew = 1;
+let mayLoad = 1;
+let BaseOpcode = "L2_loadrd_pi";
+let Constraints = "$Rx32 = $Rx32in";
+}
+def L2_ploadrdtnew_zomap : HInst<
+(outs DoubleRegs:$Rdd32),
+(ins PredRegs:$Pt4, IntRegs:$Rs32),
+"if ($Pt4.new) $Rdd32 = memd($Rs32)",
+PSEUDO, TypeMAPPING> {
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+}
+def L2_ploadrhf_io : HInst<
+(outs IntRegs:$Rd32),
+(ins PredRegs:$Pt4, IntRegs:$Rs32, u31_1Imm:$Ii),
+"if (!$Pt4) $Rd32 = memh($Rs32+#$Ii)",
+V2LDST_tc_ld_SLOT01, TypeV2LDST>, Enc_1835415, AddrModeRel {
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b01000101010;
+let isPredicated = 1;
+let isPredicatedFalse = 1;
+let hasNewValue = 1;
+let opNewValue = 0;
+let addrMode = BaseImmOffset;
+let accessSize = HalfWordAccess;
+let mayLoad = 1;
+let CextOpcode = "L2_loadrh";
+let BaseOpcode = "L2_loadrh_io";
+let isExtendable = 1;
+let opExtendable = 3;
+let isExtentSigned = 0;
+let opExtentBits = 7;
+let opExtentAlign = 1;
+}
+def L2_ploadrhf_pi : HInst<
+(outs IntRegs:$Rd32, IntRegs:$Rx32),
+(ins PredRegs:$Pt4, IntRegs:$Rx32in, s4_1Imm:$Ii),
+"if (!$Pt4) $Rd32 = memh($Rx32++#$Ii)",
+LD_tc_ld_pi_SLOT01, TypeLD>, Enc_7212930, PredNewRel {
+let Inst{13-11} = 0b101;
+let Inst{31-21} = 0b10011011010;
+let isPredicated = 1;
+let isPredicatedFalse = 1;
+let hasNewValue = 1;
+let opNewValue = 0;
+let addrMode = PostInc;
+let accessSize = HalfWordAccess;
+let mayLoad = 1;
+let BaseOpcode = "L2_loadrh_pi";
+let Constraints = "$Rx32 = $Rx32in";
+}
+def L2_ploadrhf_zomap : HInst<
+(outs IntRegs:$Rd32),
+(ins PredRegs:$Pt4, IntRegs:$Rs32),
+"if (!$Pt4) $Rd32 = memh($Rs32)",
+PSEUDO, TypeMAPPING> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+}
+def L2_ploadrhfnew_io : HInst<
+(outs IntRegs:$Rd32),
+(ins PredRegs:$Pt4, IntRegs:$Rs32, u31_1Imm:$Ii),
+"if (!$Pt4.new) $Rd32 = memh($Rs32+#$Ii)",
+V2LDST_tc_ld_SLOT01, TypeV2LDST>, Enc_1835415, AddrModeRel {
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b01000111010;
+let isPredicated = 1;
+let isPredicatedFalse = 1;
+let hasNewValue = 1;
+let opNewValue = 0;
+let addrMode = BaseImmOffset;
+let accessSize = HalfWordAccess;
+let isPredicatedNew = 1;
+let mayLoad = 1;
+let CextOpcode = "L2_loadrh";
+let BaseOpcode = "L2_loadrh_io";
+let isExtendable = 1;
+let opExtendable = 3;
+let isExtentSigned = 0;
+let opExtentBits = 7;
+let opExtentAlign = 1;
+}
+def L2_ploadrhfnew_pi : HInst<
+(outs IntRegs:$Rd32, IntRegs:$Rx32),
+(ins PredRegs:$Pt4, IntRegs:$Rx32in, s4_1Imm:$Ii),
+"if (!$Pt4.new) $Rd32 = memh($Rx32++#$Ii)",
+LD_tc_ld_pi_SLOT01, TypeLD>, Enc_7212930, PredNewRel {
+let Inst{13-11} = 0b111;
+let Inst{31-21} = 0b10011011010;
+let isPredicated = 1;
+let isPredicatedFalse = 1;
+let hasNewValue = 1;
+let opNewValue = 0;
+let addrMode = PostInc;
+let accessSize = HalfWordAccess;
+let isPredicatedNew = 1;
+let mayLoad = 1;
+let BaseOpcode = "L2_loadrh_pi";
+let Constraints = "$Rx32 = $Rx32in";
+}
+def L2_ploadrhfnew_zomap : HInst<
+(outs IntRegs:$Rd32),
+(ins PredRegs:$Pt4, IntRegs:$Rs32),
+"if (!$Pt4.new) $Rd32 = memh($Rs32)",
+PSEUDO, TypeMAPPING> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+}
+def L2_ploadrht_io : HInst<
+(outs IntRegs:$Rd32),
+(ins PredRegs:$Pt4, IntRegs:$Rs32, u31_1Imm:$Ii),
+"if ($Pt4) $Rd32 = memh($Rs32+#$Ii)",
+V2LDST_tc_ld_SLOT01, TypeV2LDST>, Enc_1835415, AddrModeRel {
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b01000001010;
+let isPredicated = 1;
+let hasNewValue = 1;
+let opNewValue = 0;
+let addrMode = BaseImmOffset;
+let accessSize = HalfWordAccess;
+let mayLoad = 1;
+let CextOpcode = "L2_loadrh";
+let BaseOpcode = "L2_loadrh_io";
+let isExtendable = 1;
+let opExtendable = 3;
+let isExtentSigned = 0;
+let opExtentBits = 7;
+let opExtentAlign = 1;
+}
+def L2_ploadrht_pi : HInst<
+(outs IntRegs:$Rd32, IntRegs:$Rx32),
+(ins PredRegs:$Pt4, IntRegs:$Rx32in, s4_1Imm:$Ii),
+"if ($Pt4) $Rd32 = memh($Rx32++#$Ii)",
+LD_tc_ld_pi_SLOT01, TypeLD>, Enc_7212930, PredNewRel {
+let Inst{13-11} = 0b100;
+let Inst{31-21} = 0b10011011010;
+let isPredicated = 1;
+let hasNewValue = 1;
+let opNewValue = 0;
+let addrMode = PostInc;
+let accessSize = HalfWordAccess;
+let mayLoad = 1;
+let BaseOpcode = "L2_loadrh_pi";
+let Constraints = "$Rx32 = $Rx32in";
+}
+def L2_ploadrht_zomap : HInst<
+(outs IntRegs:$Rd32),
+(ins PredRegs:$Pt4, IntRegs:$Rs32),
+"if ($Pt4) $Rd32 = memh($Rs32)",
+PSEUDO, TypeMAPPING> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+}
+def L2_ploadrhtnew_io : HInst<
+(outs IntRegs:$Rd32),
+(ins PredRegs:$Pt4, IntRegs:$Rs32, u31_1Imm:$Ii),
+"if ($Pt4.new) $Rd32 = memh($Rs32+#$Ii)",
+V2LDST_tc_ld_SLOT01, TypeV2LDST>, Enc_1835415, AddrModeRel {
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b01000011010;
+let isPredicated = 1;
+let hasNewValue = 1;
+let opNewValue = 0;
+let addrMode = BaseImmOffset;
+let accessSize = HalfWordAccess;
+let isPredicatedNew = 1;
+let mayLoad = 1;
+let CextOpcode = "L2_loadrh";
+let BaseOpcode = "L2_loadrh_io";
+let isExtendable = 1;
+let opExtendable = 3;
+let isExtentSigned = 0;
+let opExtentBits = 7;
+let opExtentAlign = 1;
+}
+def L2_ploadrhtnew_pi : HInst<
+(outs IntRegs:$Rd32, IntRegs:$Rx32),
+(ins PredRegs:$Pt4, IntRegs:$Rx32in, s4_1Imm:$Ii),
+"if ($Pt4.new) $Rd32 = memh($Rx32++#$Ii)",
+LD_tc_ld_pi_SLOT01, TypeLD>, Enc_7212930, PredNewRel {
+let Inst{13-11} = 0b110;
+let Inst{31-21} = 0b10011011010;
+let isPredicated = 1;
+let hasNewValue = 1;
+let opNewValue = 0;
+let addrMode = PostInc;
+let accessSize = HalfWordAccess;
+let isPredicatedNew = 1;
+let mayLoad = 1;
+let BaseOpcode = "L2_loadrh_pi";
+let Constraints = "$Rx32 = $Rx32in";
+}
+def L2_ploadrhtnew_zomap : HInst<
+(outs IntRegs:$Rd32),
+(ins PredRegs:$Pt4, IntRegs:$Rs32),
+"if ($Pt4.new) $Rd32 = memh($Rs32)",
+PSEUDO, TypeMAPPING> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+}
+def L2_ploadrif_io : HInst<
+(outs IntRegs:$Rd32),
+(ins PredRegs:$Pt4, IntRegs:$Rs32, u30_2Imm:$Ii),
+"if (!$Pt4) $Rd32 = memw($Rs32+#$Ii)",
+V2LDST_tc_ld_SLOT01, TypeV2LDST>, Enc_2835415, AddrModeRel {
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b01000101100;
+let isPredicated = 1;
+let isPredicatedFalse = 1;
+let hasNewValue = 1;
+let opNewValue = 0;
+let addrMode = BaseImmOffset;
+let accessSize = WordAccess;
+let mayLoad = 1;
+let CextOpcode = "L2_loadri";
+let BaseOpcode = "L2_loadri_io";
+let isExtendable = 1;
+let opExtendable = 3;
+let isExtentSigned = 0;
+let opExtentBits = 8;
+let opExtentAlign = 2;
+}
+def L2_ploadrif_pi : HInst<
+(outs IntRegs:$Rd32, IntRegs:$Rx32),
+(ins PredRegs:$Pt4, IntRegs:$Rx32in, s4_2Imm:$Ii),
+"if (!$Pt4) $Rd32 = memw($Rx32++#$Ii)",
+LD_tc_ld_pi_SLOT01, TypeLD>, Enc_6212930, PredNewRel {
+let Inst{13-11} = 0b101;
+let Inst{31-21} = 0b10011011100;
+let isPredicated = 1;
+let isPredicatedFalse = 1;
+let hasNewValue = 1;
+let opNewValue = 0;
+let addrMode = PostInc;
+let accessSize = WordAccess;
+let mayLoad = 1;
+let BaseOpcode = "L2_loadri_pi";
+let Constraints = "$Rx32 = $Rx32in";
+}
+def L2_ploadrif_zomap : HInst<
+(outs IntRegs:$Rd32),
+(ins PredRegs:$Pt4, IntRegs:$Rs32),
+"if (!$Pt4) $Rd32 = memw($Rs32)",
+PSEUDO, TypeMAPPING> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+}
+def L2_ploadrifnew_io : HInst<
+(outs IntRegs:$Rd32),
+(ins PredRegs:$Pt4, IntRegs:$Rs32, u30_2Imm:$Ii),
+"if (!$Pt4.new) $Rd32 = memw($Rs32+#$Ii)",
+V2LDST_tc_ld_SLOT01, TypeV2LDST>, Enc_2835415, AddrModeRel {
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b01000111100;
+let isPredicated = 1;
+let isPredicatedFalse = 1;
+let hasNewValue = 1;
+let opNewValue = 0;
+let addrMode = BaseImmOffset;
+let accessSize = WordAccess;
+let isPredicatedNew = 1;
+let mayLoad = 1;
+let CextOpcode = "L2_loadri";
+let BaseOpcode = "L2_loadri_io";
+let isExtendable = 1;
+let opExtendable = 3;
+let isExtentSigned = 0;
+let opExtentBits = 8;
+let opExtentAlign = 2;
+}
+def L2_ploadrifnew_pi : HInst<
+(outs IntRegs:$Rd32, IntRegs:$Rx32),
+(ins PredRegs:$Pt4, IntRegs:$Rx32in, s4_2Imm:$Ii),
+"if (!$Pt4.new) $Rd32 = memw($Rx32++#$Ii)",
+LD_tc_ld_pi_SLOT01, TypeLD>, Enc_6212930, PredNewRel {
+let Inst{13-11} = 0b111;
+let Inst{31-21} = 0b10011011100;
+let isPredicated = 1;
+let isPredicatedFalse = 1;
+let hasNewValue = 1;
+let opNewValue = 0;
+let addrMode = PostInc;
+let accessSize = WordAccess;
+let isPredicatedNew = 1;
+let mayLoad = 1;
+let BaseOpcode = "L2_loadri_pi";
+let Constraints = "$Rx32 = $Rx32in";
+}
+def L2_ploadrifnew_zomap : HInst<
+(outs IntRegs:$Rd32),
+(ins PredRegs:$Pt4, IntRegs:$Rs32),
+"if (!$Pt4.new) $Rd32 = memw($Rs32)",
+PSEUDO, TypeMAPPING> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+}
+def L2_ploadrit_io : HInst<
+(outs IntRegs:$Rd32),
+(ins PredRegs:$Pt4, IntRegs:$Rs32, u30_2Imm:$Ii),
+"if ($Pt4) $Rd32 = memw($Rs32+#$Ii)",
+V2LDST_tc_ld_SLOT01, TypeV2LDST>, Enc_2835415, AddrModeRel {
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b01000001100;
+let isPredicated = 1;
+let hasNewValue = 1;
+let opNewValue = 0;
+let addrMode = BaseImmOffset;
+let accessSize = WordAccess;
+let mayLoad = 1;
+let CextOpcode = "L2_loadri";
+let BaseOpcode = "L2_loadri_io";
+let isExtendable = 1;
+let opExtendable = 3;
+let isExtentSigned = 0;
+let opExtentBits = 8;
+let opExtentAlign = 2;
+}
+def L2_ploadrit_pi : HInst<
+(outs IntRegs:$Rd32, IntRegs:$Rx32),
+(ins PredRegs:$Pt4, IntRegs:$Rx32in, s4_2Imm:$Ii),
+"if ($Pt4) $Rd32 = memw($Rx32++#$Ii)",
+LD_tc_ld_pi_SLOT01, TypeLD>, Enc_6212930, PredNewRel {
+let Inst{13-11} = 0b100;
+let Inst{31-21} = 0b10011011100;
+let isPredicated = 1;
+let hasNewValue = 1;
+let opNewValue = 0;
+let addrMode = PostInc;
+let accessSize = WordAccess;
+let mayLoad = 1;
+let BaseOpcode = "L2_loadri_pi";
+let Constraints = "$Rx32 = $Rx32in";
+}
+def L2_ploadrit_zomap : HInst<
+(outs IntRegs:$Rd32),
+(ins PredRegs:$Pt4, IntRegs:$Rs32),
+"if ($Pt4) $Rd32 = memw($Rs32)",
+PSEUDO, TypeMAPPING> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+}
+def L2_ploadritnew_io : HInst<
+(outs IntRegs:$Rd32),
+(ins PredRegs:$Pt4, IntRegs:$Rs32, u30_2Imm:$Ii),
+"if ($Pt4.new) $Rd32 = memw($Rs32+#$Ii)",
+V2LDST_tc_ld_SLOT01, TypeV2LDST>, Enc_2835415, AddrModeRel {
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b01000011100;
+let isPredicated = 1;
+let hasNewValue = 1;
+let opNewValue = 0;
+let addrMode = BaseImmOffset;
+let accessSize = WordAccess;
+let isPredicatedNew = 1;
+let mayLoad = 1;
+let CextOpcode = "L2_loadri";
+let BaseOpcode = "L2_loadri_io";
+let isExtendable = 1;
+let opExtendable = 3;
+let isExtentSigned = 0;
+let opExtentBits = 8;
+let opExtentAlign = 2;
+}
+def L2_ploadritnew_pi : HInst<
+(outs IntRegs:$Rd32, IntRegs:$Rx32),
+(ins PredRegs:$Pt4, IntRegs:$Rx32in, s4_2Imm:$Ii),
+"if ($Pt4.new) $Rd32 = memw($Rx32++#$Ii)",
+LD_tc_ld_pi_SLOT01, TypeLD>, Enc_6212930, PredNewRel {
+let Inst{13-11} = 0b110;
+let Inst{31-21} = 0b10011011100;
+let isPredicated = 1;
+let hasNewValue = 1;
+let opNewValue = 0;
+let addrMode = PostInc;
+let accessSize = WordAccess;
+let isPredicatedNew = 1;
+let mayLoad = 1;
+let BaseOpcode = "L2_loadri_pi";
+let Constraints = "$Rx32 = $Rx32in";
+}
+def L2_ploadritnew_zomap : HInst<
+(outs IntRegs:$Rd32),
+(ins PredRegs:$Pt4, IntRegs:$Rs32),
+"if ($Pt4.new) $Rd32 = memw($Rs32)",
+PSEUDO, TypeMAPPING> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+}
+def L2_ploadrubf_io : HInst<
+(outs IntRegs:$Rd32),
+(ins PredRegs:$Pt4, IntRegs:$Rs32, u32_0Imm:$Ii),
+"if (!$Pt4) $Rd32 = memub($Rs32+#$Ii)",
+V2LDST_tc_ld_SLOT01, TypeV2LDST>, Enc_4835423, AddrModeRel {
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b01000101001;
+let isPredicated = 1;
+let isPredicatedFalse = 1;
+let hasNewValue = 1;
+let opNewValue = 0;
+let addrMode = BaseImmOffset;
+let accessSize = ByteAccess;
+let mayLoad = 1;
+let CextOpcode = "L2_loadrub";
+let BaseOpcode = "L2_loadrub_io";
+let isExtendable = 1;
+let opExtendable = 3;
+let isExtentSigned = 0;
+let opExtentBits = 6;
+let opExtentAlign = 0;
+}
+def L2_ploadrubf_pi : HInst<
+(outs IntRegs:$Rd32, IntRegs:$Rx32),
+(ins PredRegs:$Pt4, IntRegs:$Rx32in, s4_0Imm:$Ii),
+"if (!$Pt4) $Rd32 = memub($Rx32++#$Ii)",
+LD_tc_ld_pi_SLOT01, TypeLD>, Enc_12212978, PredNewRel {
+let Inst{13-11} = 0b101;
+let Inst{31-21} = 0b10011011001;
+let isPredicated = 1;
+let isPredicatedFalse = 1;
+let hasNewValue = 1;
+let opNewValue = 0;
+let addrMode = PostInc;
+let accessSize = ByteAccess;
+let mayLoad = 1;
+let BaseOpcode = "L2_loadrub_pi";
+let Constraints = "$Rx32 = $Rx32in";
+}
+def L2_ploadrubf_zomap : HInst<
+(outs IntRegs:$Rd32),
+(ins PredRegs:$Pt4, IntRegs:$Rs32),
+"if (!$Pt4) $Rd32 = memub($Rs32)",
+PSEUDO, TypeMAPPING> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+}
+def L2_ploadrubfnew_io : HInst<
+(outs IntRegs:$Rd32),
+(ins PredRegs:$Pt4, IntRegs:$Rs32, u32_0Imm:$Ii),
+"if (!$Pt4.new) $Rd32 = memub($Rs32+#$Ii)",
+V2LDST_tc_ld_SLOT01, TypeV2LDST>, Enc_4835423, AddrModeRel {
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b01000111001;
+let isPredicated = 1;
+let isPredicatedFalse = 1;
+let hasNewValue = 1;
+let opNewValue = 0;
+let addrMode = BaseImmOffset;
+let accessSize = ByteAccess;
+let isPredicatedNew = 1;
+let mayLoad = 1;
+let CextOpcode = "L2_loadrub";
+let BaseOpcode = "L2_loadrub_io";
+let isExtendable = 1;
+let opExtendable = 3;
+let isExtentSigned = 0;
+let opExtentBits = 6;
+let opExtentAlign = 0;
+}
+def L2_ploadrubfnew_pi : HInst<
+(outs IntRegs:$Rd32, IntRegs:$Rx32),
+(ins PredRegs:$Pt4, IntRegs:$Rx32in, s4_0Imm:$Ii),
+"if (!$Pt4.new) $Rd32 = memub($Rx32++#$Ii)",
+LD_tc_ld_pi_SLOT01, TypeLD>, Enc_12212978, PredNewRel {
+let Inst{13-11} = 0b111;
+let Inst{31-21} = 0b10011011001;
+let isPredicated = 1;
+let isPredicatedFalse = 1;
+let hasNewValue = 1;
+let opNewValue = 0;
+let addrMode = PostInc;
+let accessSize = ByteAccess;
+let isPredicatedNew = 1;
+let mayLoad = 1;
+let BaseOpcode = "L2_loadrub_pi";
+let Constraints = "$Rx32 = $Rx32in";
+}
+def L2_ploadrubfnew_zomap : HInst<
+(outs IntRegs:$Rd32),
+(ins PredRegs:$Pt4, IntRegs:$Rs32),
+"if (!$Pt4.new) $Rd32 = memub($Rs32)",
+PSEUDO, TypeMAPPING> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+}
+def L2_ploadrubt_io : HInst<
+(outs IntRegs:$Rd32),
+(ins PredRegs:$Pt4, IntRegs:$Rs32, u32_0Imm:$Ii),
+"if ($Pt4) $Rd32 = memub($Rs32+#$Ii)",
+V2LDST_tc_ld_SLOT01, TypeV2LDST>, Enc_4835423, AddrModeRel {
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b01000001001;
+let isPredicated = 1;
+let hasNewValue = 1;
+let opNewValue = 0;
+let addrMode = BaseImmOffset;
+let accessSize = ByteAccess;
+let mayLoad = 1;
+let CextOpcode = "L2_loadrub";
+let BaseOpcode = "L2_loadrub_io";
+let isExtendable = 1;
+let opExtendable = 3;
+let isExtentSigned = 0;
+let opExtentBits = 6;
+let opExtentAlign = 0;
+}
+def L2_ploadrubt_pi : HInst<
+(outs IntRegs:$Rd32, IntRegs:$Rx32),
+(ins PredRegs:$Pt4, IntRegs:$Rx32in, s4_0Imm:$Ii),
+"if ($Pt4) $Rd32 = memub($Rx32++#$Ii)",
+LD_tc_ld_pi_SLOT01, TypeLD>, Enc_12212978, PredNewRel {
+let Inst{13-11} = 0b100;
+let Inst{31-21} = 0b10011011001;
+let isPredicated = 1;
+let hasNewValue = 1;
+let opNewValue = 0;
+let addrMode = PostInc;
+let accessSize = ByteAccess;
+let mayLoad = 1;
+let BaseOpcode = "L2_loadrub_pi";
+let Constraints = "$Rx32 = $Rx32in";
+}
+def L2_ploadrubt_zomap : HInst<
+(outs IntRegs:$Rd32),
+(ins PredRegs:$Pt4, IntRegs:$Rs32),
+"if ($Pt4) $Rd32 = memub($Rs32)",
+PSEUDO, TypeMAPPING> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+}
+def L2_ploadrubtnew_io : HInst<
+(outs IntRegs:$Rd32),
+(ins PredRegs:$Pt4, IntRegs:$Rs32, u32_0Imm:$Ii),
+"if ($Pt4.new) $Rd32 = memub($Rs32+#$Ii)",
+V2LDST_tc_ld_SLOT01, TypeV2LDST>, Enc_4835423, AddrModeRel {
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b01000011001;
+let isPredicated = 1;
+let hasNewValue = 1;
+let opNewValue = 0;
+let addrMode = BaseImmOffset;
+let accessSize = ByteAccess;
+let isPredicatedNew = 1;
+let mayLoad = 1;
+let CextOpcode = "L2_loadrub";
+let BaseOpcode = "L2_loadrub_io";
+let isExtendable = 1;
+let opExtendable = 3;
+let isExtentSigned = 0;
+let opExtentBits = 6;
+let opExtentAlign = 0;
+}
+def L2_ploadrubtnew_pi : HInst<
+(outs IntRegs:$Rd32, IntRegs:$Rx32),
+(ins PredRegs:$Pt4, IntRegs:$Rx32in, s4_0Imm:$Ii),
+"if ($Pt4.new) $Rd32 = memub($Rx32++#$Ii)",
+LD_tc_ld_pi_SLOT01, TypeLD>, Enc_12212978, PredNewRel {
+let Inst{13-11} = 0b110;
+let Inst{31-21} = 0b10011011001;
+let isPredicated = 1;
+let hasNewValue = 1;
+let opNewValue = 0;
+let addrMode = PostInc;
+let accessSize = ByteAccess;
+let isPredicatedNew = 1;
+let mayLoad = 1;
+let BaseOpcode = "L2_loadrub_pi";
+let Constraints = "$Rx32 = $Rx32in";
+}
+def L2_ploadrubtnew_zomap : HInst<
+(outs IntRegs:$Rd32),
+(ins PredRegs:$Pt4, IntRegs:$Rs32),
+"if ($Pt4.new) $Rd32 = memub($Rs32)",
+PSEUDO, TypeMAPPING> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+}
+def L2_ploadruhf_io : HInst<
+(outs IntRegs:$Rd32),
+(ins PredRegs:$Pt4, IntRegs:$Rs32, u31_1Imm:$Ii),
+"if (!$Pt4) $Rd32 = memuh($Rs32+#$Ii)",
+V2LDST_tc_ld_SLOT01, TypeV2LDST>, Enc_1835415, AddrModeRel {
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b01000101011;
+let isPredicated = 1;
+let isPredicatedFalse = 1;
+let hasNewValue = 1;
+let opNewValue = 0;
+let addrMode = BaseImmOffset;
+let accessSize = HalfWordAccess;
+let mayLoad = 1;
+let CextOpcode = "L2_loadruh";
+let BaseOpcode = "L2_loadruh_io";
+let isExtendable = 1;
+let opExtendable = 3;
+let isExtentSigned = 0;
+let opExtentBits = 7;
+let opExtentAlign = 1;
+}
+def L2_ploadruhf_pi : HInst<
+(outs IntRegs:$Rd32, IntRegs:$Rx32),
+(ins PredRegs:$Pt4, IntRegs:$Rx32in, s4_1Imm:$Ii),
+"if (!$Pt4) $Rd32 = memuh($Rx32++#$Ii)",
+LD_tc_ld_pi_SLOT01, TypeLD>, Enc_7212930, PredNewRel {
+let Inst{13-11} = 0b101;
+let Inst{31-21} = 0b10011011011;
+let isPredicated = 1;
+let isPredicatedFalse = 1;
+let hasNewValue = 1;
+let opNewValue = 0;
+let addrMode = PostInc;
+let accessSize = HalfWordAccess;
+let mayLoad = 1;
+let BaseOpcode = "L2_loadruh_pi";
+let Constraints = "$Rx32 = $Rx32in";
+}
+def L2_ploadruhf_zomap : HInst<
+(outs IntRegs:$Rd32),
+(ins PredRegs:$Pt4, IntRegs:$Rs32),
+"if (!$Pt4) $Rd32 = memuh($Rs32)",
+PSEUDO, TypeMAPPING> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+}
+def L2_ploadruhfnew_io : HInst<
+(outs IntRegs:$Rd32),
+(ins PredRegs:$Pt4, IntRegs:$Rs32, u31_1Imm:$Ii),
+"if (!$Pt4.new) $Rd32 = memuh($Rs32+#$Ii)",
+V2LDST_tc_ld_SLOT01, TypeV2LDST>, Enc_1835415, AddrModeRel {
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b01000111011;
+let isPredicated = 1;
+let isPredicatedFalse = 1;
+let hasNewValue = 1;
+let opNewValue = 0;
+let addrMode = BaseImmOffset;
+let accessSize = HalfWordAccess;
+let isPredicatedNew = 1;
+let mayLoad = 1;
+let CextOpcode = "L2_loadruh";
+let BaseOpcode = "L2_loadruh_io";
+let isExtendable = 1;
+let opExtendable = 3;
+let isExtentSigned = 0;
+let opExtentBits = 7;
+let opExtentAlign = 1;
+}
+def L2_ploadruhfnew_pi : HInst<
+(outs IntRegs:$Rd32, IntRegs:$Rx32),
+(ins PredRegs:$Pt4, IntRegs:$Rx32in, s4_1Imm:$Ii),
+"if (!$Pt4.new) $Rd32 = memuh($Rx32++#$Ii)",
+LD_tc_ld_pi_SLOT01, TypeLD>, Enc_7212930, PredNewRel {
+let Inst{13-11} = 0b111;
+let Inst{31-21} = 0b10011011011;
+let isPredicated = 1;
+let isPredicatedFalse = 1;
+let hasNewValue = 1;
+let opNewValue = 0;
+let addrMode = PostInc;
+let accessSize = HalfWordAccess;
+let isPredicatedNew = 1;
+let mayLoad = 1;
+let BaseOpcode = "L2_loadruh_pi";
+let Constraints = "$Rx32 = $Rx32in";
+}
+def L2_ploadruhfnew_zomap : HInst<
+(outs IntRegs:$Rd32),
+(ins PredRegs:$Pt4, IntRegs:$Rs32),
+"if (!$Pt4.new) $Rd32 = memuh($Rs32)",
+PSEUDO, TypeMAPPING> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+}
+def L2_ploadruht_io : HInst<
+(outs IntRegs:$Rd32),
+(ins PredRegs:$Pt4, IntRegs:$Rs32, u31_1Imm:$Ii),
+"if ($Pt4) $Rd32 = memuh($Rs32+#$Ii)",
+V2LDST_tc_ld_SLOT01, TypeV2LDST>, Enc_1835415, AddrModeRel {
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b01000001011;
+let isPredicated = 1;
+let hasNewValue = 1;
+let opNewValue = 0;
+let addrMode = BaseImmOffset;
+let accessSize = HalfWordAccess;
+let mayLoad = 1;
+let CextOpcode = "L2_loadruh";
+let BaseOpcode = "L2_loadruh_io";
+let isExtendable = 1;
+let opExtendable = 3;
+let isExtentSigned = 0;
+let opExtentBits = 7;
+let opExtentAlign = 1;
+}
+def L2_ploadruht_pi : HInst<
+(outs IntRegs:$Rd32, IntRegs:$Rx32),
+(ins PredRegs:$Pt4, IntRegs:$Rx32in, s4_1Imm:$Ii),
+"if ($Pt4) $Rd32 = memuh($Rx32++#$Ii)",
+LD_tc_ld_pi_SLOT01, TypeLD>, Enc_7212930, PredNewRel {
+let Inst{13-11} = 0b100;
+let Inst{31-21} = 0b10011011011;
+let isPredicated = 1;
+let hasNewValue = 1;
+let opNewValue = 0;
+let addrMode = PostInc;
+let accessSize = HalfWordAccess;
+let mayLoad = 1;
+let BaseOpcode = "L2_loadruh_pi";
+let Constraints = "$Rx32 = $Rx32in";
+}
+def L2_ploadruht_zomap : HInst<
+(outs IntRegs:$Rd32),
+(ins PredRegs:$Pt4, IntRegs:$Rs32),
+"if ($Pt4) $Rd32 = memuh($Rs32)",
+PSEUDO, TypeMAPPING> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+}
+def L2_ploadruhtnew_io : HInst<
+(outs IntRegs:$Rd32),
+(ins PredRegs:$Pt4, IntRegs:$Rs32, u31_1Imm:$Ii),
+"if ($Pt4.new) $Rd32 = memuh($Rs32+#$Ii)",
+V2LDST_tc_ld_SLOT01, TypeV2LDST>, Enc_1835415, AddrModeRel {
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b01000011011;
+let isPredicated = 1;
+let hasNewValue = 1;
+let opNewValue = 0;
+let addrMode = BaseImmOffset;
+let accessSize = HalfWordAccess;
+let isPredicatedNew = 1;
+let mayLoad = 1;
+let CextOpcode = "L2_loadruh";
+let BaseOpcode = "L2_loadruh_io";
+let isExtendable = 1;
+let opExtendable = 3;
+let isExtentSigned = 0;
+let opExtentBits = 7;
+let opExtentAlign = 1;
+}
+def L2_ploadruhtnew_pi : HInst<
+(outs IntRegs:$Rd32, IntRegs:$Rx32),
+(ins PredRegs:$Pt4, IntRegs:$Rx32in, s4_1Imm:$Ii),
+"if ($Pt4.new) $Rd32 = memuh($Rx32++#$Ii)",
+LD_tc_ld_pi_SLOT01, TypeLD>, Enc_7212930, PredNewRel {
+let Inst{13-11} = 0b110;
+let Inst{31-21} = 0b10011011011;
+let isPredicated = 1;
+let hasNewValue = 1;
+let opNewValue = 0;
+let addrMode = PostInc;
+let accessSize = HalfWordAccess;
+let isPredicatedNew = 1;
+let mayLoad = 1;
+let BaseOpcode = "L2_loadruh_pi";
+let Constraints = "$Rx32 = $Rx32in";
+}
+def L2_ploadruhtnew_zomap : HInst<
+(outs IntRegs:$Rd32),
+(ins PredRegs:$Pt4, IntRegs:$Rs32),
+"if ($Pt4.new) $Rd32 = memuh($Rs32)",
+PSEUDO, TypeMAPPING> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+}
+def L4_add_memopb_io : HInst<
+(outs),
+(ins IntRegs:$Rs32, u32_0Imm:$Ii, IntRegs:$Rt32),
+"memb($Rs32+#$Ii) += $Rt32",
+V4LDST_tc_st_SLOT0, TypeV4LDST>, Enc_11849200 {
+let Inst{6-5} = 0b00;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b00111110000;
+let addrMode = BaseImmOffset;
+let accessSize = ByteAccess;
+let mayStore = 1;
+let mayLoad = 1;
+let isExtendable = 1;
+let opExtendable = 1;
+let isExtentSigned = 0;
+let opExtentBits = 6;
+let opExtentAlign = 0;
+}
+def L4_add_memopb_zomap : HInst<
+(outs),
+(ins IntRegs:$Rs32, IntRegs:$Rt32),
+"memb($Rs32) += $Rt32",
+PSEUDO, TypeMAPPING> {
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+}
+def L4_add_memoph_io : HInst<
+(outs),
+(ins IntRegs:$Rs32, u31_1Imm:$Ii, IntRegs:$Rt32),
+"memh($Rs32+#$Ii) += $Rt32",
+V4LDST_tc_st_SLOT0, TypeV4LDST>, Enc_8849208 {
+let Inst{6-5} = 0b00;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b00111110001;
+let addrMode = BaseImmOffset;
+let accessSize = HalfWordAccess;
+let mayStore = 1;
+let mayLoad = 1;
+let isExtendable = 1;
+let opExtendable = 1;
+let isExtentSigned = 0;
+let opExtentBits = 7;
+let opExtentAlign = 1;
+}
+def L4_add_memoph_zomap : HInst<
+(outs),
+(ins IntRegs:$Rs32, IntRegs:$Rt32),
+"memh($Rs32) += $Rt32",
+PSEUDO, TypeMAPPING> {
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+}
+def L4_add_memopw_io : HInst<
+(outs),
+(ins IntRegs:$Rs32, u30_2Imm:$Ii, IntRegs:$Rt32),
+"memw($Rs32+#$Ii) += $Rt32",
+V4LDST_tc_st_SLOT0, TypeV4LDST>, Enc_9849208 {
+let Inst{6-5} = 0b00;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b00111110010;
+let addrMode = BaseImmOffset;
+let accessSize = WordAccess;
+let mayStore = 1;
+let mayLoad = 1;
+let isExtendable = 1;
+let opExtendable = 1;
+let isExtentSigned = 0;
+let opExtentBits = 8;
+let opExtentAlign = 2;
+}
+def L4_add_memopw_zomap : HInst<
+(outs),
+(ins IntRegs:$Rs32, IntRegs:$Rt32),
+"memw($Rs32) += $Rt32",
+PSEUDO, TypeMAPPING> {
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+}
+def L4_and_memopb_io : HInst<
+(outs),
+(ins IntRegs:$Rs32, u32_0Imm:$Ii, IntRegs:$Rt32),
+"memb($Rs32+#$Ii) &= $Rt32",
+V4LDST_tc_st_SLOT0, TypeV4LDST>, Enc_11849200 {
+let Inst{6-5} = 0b10;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b00111110000;
+let addrMode = BaseImmOffset;
+let accessSize = ByteAccess;
+let mayStore = 1;
+let mayLoad = 1;
+let isExtendable = 1;
+let opExtendable = 1;
+let isExtentSigned = 0;
+let opExtentBits = 6;
+let opExtentAlign = 0;
+}
+def L4_and_memopb_zomap : HInst<
+(outs),
+(ins IntRegs:$Rs32, IntRegs:$Rt32),
+"memb($Rs32) &= $Rt32",
+PSEUDO, TypeMAPPING> {
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+}
+def L4_and_memoph_io : HInst<
+(outs),
+(ins IntRegs:$Rs32, u31_1Imm:$Ii, IntRegs:$Rt32),
+"memh($Rs32+#$Ii) &= $Rt32",
+V4LDST_tc_st_SLOT0, TypeV4LDST>, Enc_8849208 {
+let Inst{6-5} = 0b10;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b00111110001;
+let addrMode = BaseImmOffset;
+let accessSize = HalfWordAccess;
+let mayStore = 1;
+let mayLoad = 1;
+let isExtendable = 1;
+let opExtendable = 1;
+let isExtentSigned = 0;
+let opExtentBits = 7;
+let opExtentAlign = 1;
+}
+def L4_and_memoph_zomap : HInst<
+(outs),
+(ins IntRegs:$Rs32, IntRegs:$Rt32),
+"memh($Rs32) &= $Rt32",
+PSEUDO, TypeMAPPING> {
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+}
+def L4_and_memopw_io : HInst<
+(outs),
+(ins IntRegs:$Rs32, u30_2Imm:$Ii, IntRegs:$Rt32),
+"memw($Rs32+#$Ii) &= $Rt32",
+V4LDST_tc_st_SLOT0, TypeV4LDST>, Enc_9849208 {
+let Inst{6-5} = 0b10;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b00111110010;
+let addrMode = BaseImmOffset;
+let accessSize = WordAccess;
+let mayStore = 1;
+let mayLoad = 1;
+let isExtendable = 1;
+let opExtendable = 1;
+let isExtentSigned = 0;
+let opExtentBits = 8;
+let opExtentAlign = 2;
+}
+def L4_and_memopw_zomap : HInst<
+(outs),
+(ins IntRegs:$Rs32, IntRegs:$Rt32),
+"memw($Rs32) &= $Rt32",
+PSEUDO, TypeMAPPING> {
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+}
+def L4_iadd_memopb_io : HInst<
+(outs),
+(ins IntRegs:$Rs32, u32_0Imm:$Ii, u5_0Imm:$II),
+"memb($Rs32+#$Ii) += #$II",
+V4LDST_tc_st_SLOT0, TypeV4LDST>, Enc_6773159 {
+let Inst{6-5} = 0b00;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b00111111000;
+let addrMode = BaseImmOffset;
+let accessSize = ByteAccess;
+let mayStore = 1;
+let mayLoad = 1;
+let isExtendable = 1;
+let opExtendable = 1;
+let isExtentSigned = 0;
+let opExtentBits = 6;
+let opExtentAlign = 0;
+}
+def L4_iadd_memopb_zomap : HInst<
+(outs),
+(ins IntRegs:$Rs32, u5_0Imm:$II),
+"memb($Rs32) += #$II",
+PSEUDO, TypeMAPPING> {
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+}
+def L4_iadd_memoph_io : HInst<
+(outs),
+(ins IntRegs:$Rs32, u31_1Imm:$Ii, u5_0Imm:$II),
+"memh($Rs32+#$Ii) += #$II",
+V4LDST_tc_st_SLOT0, TypeV4LDST>, Enc_9773167 {
+let Inst{6-5} = 0b00;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b00111111001;
+let addrMode = BaseImmOffset;
+let accessSize = HalfWordAccess;
+let mayStore = 1;
+let mayLoad = 1;
+let isExtendable = 1;
+let opExtendable = 1;
+let isExtentSigned = 0;
+let opExtentBits = 7;
+let opExtentAlign = 1;
+}
+def L4_iadd_memoph_zomap : HInst<
+(outs),
+(ins IntRegs:$Rs32, u5_0Imm:$II),
+"memh($Rs32) += #$II",
+PSEUDO, TypeMAPPING> {
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+}
+def L4_iadd_memopw_io : HInst<
+(outs),
+(ins IntRegs:$Rs32, u30_2Imm:$Ii, u5_0Imm:$II),
+"memw($Rs32+#$Ii) += #$II",
+V4LDST_tc_st_SLOT0, TypeV4LDST>, Enc_8773155 {
+let Inst{6-5} = 0b00;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b00111111010;
+let addrMode = BaseImmOffset;
+let accessSize = WordAccess;
+let mayStore = 1;
+let mayLoad = 1;
+let isExtendable = 1;
+let opExtendable = 1;
+let isExtentSigned = 0;
+let opExtentBits = 8;
+let opExtentAlign = 2;
+}
+def L4_iadd_memopw_zomap : HInst<
+(outs),
+(ins IntRegs:$Rs32, u5_0Imm:$II),
+"memw($Rs32) += #$II",
+PSEUDO, TypeMAPPING> {
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+}
+def L4_iand_memopb_io : HInst<
+(outs),
+(ins IntRegs:$Rs32, u32_0Imm:$Ii, u5_0Imm:$II),
+"memb($Rs32+#$Ii) = clrbit(#$II)",
+V4LDST_tc_st_SLOT0, TypeV4LDST>, Enc_6773159 {
+let Inst{6-5} = 0b10;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b00111111000;
+let addrMode = BaseImmOffset;
+let accessSize = ByteAccess;
+let mayStore = 1;
+let mayLoad = 1;
+let isExtendable = 1;
+let opExtendable = 1;
+let isExtentSigned = 0;
+let opExtentBits = 6;
+let opExtentAlign = 0;
+}
+def L4_iand_memopb_zomap : HInst<
+(outs),
+(ins IntRegs:$Rs32, u5_0Imm:$II),
+"memb($Rs32) = clrbit(#$II)",
+PSEUDO, TypeMAPPING> {
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+}
+def L4_iand_memoph_io : HInst<
+(outs),
+(ins IntRegs:$Rs32, u31_1Imm:$Ii, u5_0Imm:$II),
+"memh($Rs32+#$Ii) = clrbit(#$II)",
+V4LDST_tc_st_SLOT0, TypeV4LDST>, Enc_9773167 {
+let Inst{6-5} = 0b10;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b00111111001;
+let addrMode = BaseImmOffset;
+let accessSize = HalfWordAccess;
+let mayStore = 1;
+let mayLoad = 1;
+let isExtendable = 1;
+let opExtendable = 1;
+let isExtentSigned = 0;
+let opExtentBits = 7;
+let opExtentAlign = 1;
+}
+def L4_iand_memoph_zomap : HInst<
+(outs),
+(ins IntRegs:$Rs32, u5_0Imm:$II),
+"memh($Rs32) = clrbit(#$II)",
+PSEUDO, TypeMAPPING> {
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+}
+def L4_iand_memopw_io : HInst<
+(outs),
+(ins IntRegs:$Rs32, u30_2Imm:$Ii, u5_0Imm:$II),
+"memw($Rs32+#$Ii) = clrbit(#$II)",
+V4LDST_tc_st_SLOT0, TypeV4LDST>, Enc_8773155 {
+let Inst{6-5} = 0b10;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b00111111010;
+let addrMode = BaseImmOffset;
+let accessSize = WordAccess;
+let mayStore = 1;
+let mayLoad = 1;
+let isExtendable = 1;
+let opExtendable = 1;
+let isExtentSigned = 0;
+let opExtentBits = 8;
+let opExtentAlign = 2;
+}
+def L4_iand_memopw_zomap : HInst<
+(outs),
+(ins IntRegs:$Rs32, u5_0Imm:$II),
+"memw($Rs32) = clrbit(#$II)",
+PSEUDO, TypeMAPPING> {
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+}
+def L4_ior_memopb_io : HInst<
+(outs),
+(ins IntRegs:$Rs32, u32_0Imm:$Ii, u5_0Imm:$II),
+"memb($Rs32+#$Ii) = setbit(#$II)",
+V4LDST_tc_st_SLOT0, TypeV4LDST>, Enc_6773159 {
+let Inst{6-5} = 0b11;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b00111111000;
+let addrMode = BaseImmOffset;
+let accessSize = ByteAccess;
+let mayStore = 1;
+let mayLoad = 1;
+let isExtendable = 1;
+let opExtendable = 1;
+let isExtentSigned = 0;
+let opExtentBits = 6;
+let opExtentAlign = 0;
+}
+def L4_ior_memopb_zomap : HInst<
+(outs),
+(ins IntRegs:$Rs32, u5_0Imm:$II),
+"memb($Rs32) = setbit(#$II)",
+PSEUDO, TypeMAPPING> {
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+}
+def L4_ior_memoph_io : HInst<
+(outs),
+(ins IntRegs:$Rs32, u31_1Imm:$Ii, u5_0Imm:$II),
+"memh($Rs32+#$Ii) = setbit(#$II)",
+V4LDST_tc_st_SLOT0, TypeV4LDST>, Enc_9773167 {
+let Inst{6-5} = 0b11;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b00111111001;
+let addrMode = BaseImmOffset;
+let accessSize = HalfWordAccess;
+let mayStore = 1;
+let mayLoad = 1;
+let isExtendable = 1;
+let opExtendable = 1;
+let isExtentSigned = 0;
+let opExtentBits = 7;
+let opExtentAlign = 1;
+}
+def L4_ior_memoph_zomap : HInst<
+(outs),
+(ins IntRegs:$Rs32, u5_0Imm:$II),
+"memh($Rs32) = setbit(#$II)",
+PSEUDO, TypeMAPPING> {
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+}
+def L4_ior_memopw_io : HInst<
+(outs),
+(ins IntRegs:$Rs32, u30_2Imm:$Ii, u5_0Imm:$II),
+"memw($Rs32+#$Ii) = setbit(#$II)",
+V4LDST_tc_st_SLOT0, TypeV4LDST>, Enc_8773155 {
+let Inst{6-5} = 0b11;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b00111111010;
+let addrMode = BaseImmOffset;
+let accessSize = WordAccess;
+let mayStore = 1;
+let mayLoad = 1;
+let isExtendable = 1;
+let opExtendable = 1;
+let isExtentSigned = 0;
+let opExtentBits = 8;
+let opExtentAlign = 2;
+}
+def L4_ior_memopw_zomap : HInst<
+(outs),
+(ins IntRegs:$Rs32, u5_0Imm:$II),
+"memw($Rs32) = setbit(#$II)",
+PSEUDO, TypeMAPPING> {
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+}
+def L4_isub_memopb_io : HInst<
+(outs),
+(ins IntRegs:$Rs32, u32_0Imm:$Ii, u5_0Imm:$II),
+"memb($Rs32+#$Ii) -= #$II",
+V4LDST_tc_st_SLOT0, TypeV4LDST>, Enc_6773159 {
+let Inst{6-5} = 0b01;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b00111111000;
+let addrMode = BaseImmOffset;
+let accessSize = ByteAccess;
+let mayStore = 1;
+let mayLoad = 1;
+let isExtendable = 1;
+let opExtendable = 1;
+let isExtentSigned = 0;
+let opExtentBits = 6;
+let opExtentAlign = 0;
+}
+def L4_isub_memopb_zomap : HInst<
+(outs),
+(ins IntRegs:$Rs32, u5_0Imm:$II),
+"memb($Rs32) -= #$II",
+PSEUDO, TypeMAPPING> {
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+}
+def L4_isub_memoph_io : HInst<
+(outs),
+(ins IntRegs:$Rs32, u31_1Imm:$Ii, u5_0Imm:$II),
+"memh($Rs32+#$Ii) -= #$II",
+V4LDST_tc_st_SLOT0, TypeV4LDST>, Enc_9773167 {
+let Inst{6-5} = 0b01;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b00111111001;
+let addrMode = BaseImmOffset;
+let accessSize = HalfWordAccess;
+let mayStore = 1;
+let mayLoad = 1;
+let isExtendable = 1;
+let opExtendable = 1;
+let isExtentSigned = 0;
+let opExtentBits = 7;
+let opExtentAlign = 1;
+}
+def L4_isub_memoph_zomap : HInst<
+(outs),
+(ins IntRegs:$Rs32, u5_0Imm:$II),
+"memh($Rs32) -= #$II",
+PSEUDO, TypeMAPPING> {
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+}
+def L4_isub_memopw_io : HInst<
+(outs),
+(ins IntRegs:$Rs32, u30_2Imm:$Ii, u5_0Imm:$II),
+"memw($Rs32+#$Ii) -= #$II",
+V4LDST_tc_st_SLOT0, TypeV4LDST>, Enc_8773155 {
+let Inst{6-5} = 0b01;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b00111111010;
+let addrMode = BaseImmOffset;
+let accessSize = WordAccess;
+let mayStore = 1;
+let mayLoad = 1;
+let isExtendable = 1;
+let opExtendable = 1;
+let isExtentSigned = 0;
+let opExtentBits = 8;
+let opExtentAlign = 2;
+}
+def L4_isub_memopw_zomap : HInst<
+(outs),
+(ins IntRegs:$Rs32, u5_0Imm:$II),
+"memw($Rs32) -= #$II",
+PSEUDO, TypeMAPPING> {
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+}
+def L4_loadalignb_ap : HInst<
+(outs DoubleRegs:$Ryy32, IntRegs:$Re32),
+(ins DoubleRegs:$Ryy32in, u32_0Imm:$II),
+"$Ryy32 = memb_fifo($Re32=#$II)",
+LD_tc_ld_SLOT01, TypeLD>, Enc_11047413 {
+let Inst{7-7} = 0b0;
+let Inst{13-12} = 0b01;
+let Inst{31-21} = 0b10011010100;
+let hasNewValue = 1;
+let opNewValue = 1;
+let addrMode = AbsoluteSet;
+let accessSize = ByteAccess;
+let isExtended = 1;
+let mayLoad = 1;
+let DecoderNamespace = "MustExtend";
+let isExtendable = 1;
+let opExtendable = 3;
+let isExtentSigned = 0;
+let opExtentBits = 6;
+let opExtentAlign = 0;
+let Constraints = "$Ryy32 = $Ryy32in";
+}
+def L4_loadalignb_ur : HInst<
+(outs DoubleRegs:$Ryy32),
+(ins DoubleRegs:$Ryy32in, IntRegs:$Rt32, u2_0Imm:$Ii, u32_0Imm:$II),
+"$Ryy32 = memb_fifo($Rt32<<#$Ii+#$II)",
+LD_tc_ld_SLOT01, TypeLD>, Enc_7303598 {
+let Inst{12-12} = 0b1;
+let Inst{31-21} = 0b10011100100;
+let addrMode = BaseLongOffset;
+let accessSize = ByteAccess;
+let isExtended = 1;
+let mayLoad = 1;
+let InputType = "imm";
+let DecoderNamespace = "MustExtend";
+let isExtendable = 1;
+let opExtendable = 4;
+let isExtentSigned = 0;
+let opExtentBits = 6;
+let opExtentAlign = 0;
+let Constraints = "$Ryy32 = $Ryy32in";
+}
+def L4_loadalignh_ap : HInst<
+(outs DoubleRegs:$Ryy32, IntRegs:$Re32),
+(ins DoubleRegs:$Ryy32in, u32_0Imm:$II),
+"$Ryy32 = memh_fifo($Re32=#$II)",
+LD_tc_ld_SLOT01, TypeLD>, Enc_11047413 {
+let Inst{7-7} = 0b0;
+let Inst{13-12} = 0b01;
+let Inst{31-21} = 0b10011010010;
+let hasNewValue = 1;
+let opNewValue = 1;
+let addrMode = AbsoluteSet;
+let accessSize = HalfWordAccess;
+let isExtended = 1;
+let mayLoad = 1;
+let DecoderNamespace = "MustExtend";
+let isExtendable = 1;
+let opExtendable = 3;
+let isExtentSigned = 0;
+let opExtentBits = 6;
+let opExtentAlign = 0;
+let Constraints = "$Ryy32 = $Ryy32in";
+}
+def L4_loadalignh_ur : HInst<
+(outs DoubleRegs:$Ryy32),
+(ins DoubleRegs:$Ryy32in, IntRegs:$Rt32, u2_0Imm:$Ii, u32_0Imm:$II),
+"$Ryy32 = memh_fifo($Rt32<<#$Ii+#$II)",
+LD_tc_ld_SLOT01, TypeLD>, Enc_7303598 {
+let Inst{12-12} = 0b1;
+let Inst{31-21} = 0b10011100010;
+let addrMode = BaseLongOffset;
+let accessSize = HalfWordAccess;
+let isExtended = 1;
+let mayLoad = 1;
+let InputType = "imm";
+let DecoderNamespace = "MustExtend";
+let isExtendable = 1;
+let opExtendable = 4;
+let isExtentSigned = 0;
+let opExtentBits = 6;
+let opExtentAlign = 0;
+let Constraints = "$Ryy32 = $Ryy32in";
+}
+def L4_loadbsw2_ap : HInst<
+(outs IntRegs:$Rd32, IntRegs:$Re32),
+(ins u32_0Imm:$II),
+"$Rd32 = membh($Re32=#$II)",
+LD_tc_ld_SLOT01, TypeLD>, Enc_12616482 {
+let Inst{7-7} = 0b0;
+let Inst{13-12} = 0b01;
+let Inst{31-21} = 0b10011010001;
+let hasNewValue = 1;
+let opNewValue = 0;
+let hasNewValue2 = 1;
+let opNewValue2 = 1;
+let addrMode = AbsoluteSet;
+let accessSize = HalfWordAccess;
+let isExtended = 1;
+let mayLoad = 1;
+let DecoderNamespace = "MustExtend";
+let isExtendable = 1;
+let opExtendable = 2;
+let isExtentSigned = 0;
+let opExtentBits = 6;
+let opExtentAlign = 0;
+}
+def L4_loadbsw2_ur : HInst<
+(outs IntRegs:$Rd32),
+(ins IntRegs:$Rt32, u2_0Imm:$Ii, u32_0Imm:$II),
+"$Rd32 = membh($Rt32<<#$Ii+#$II)",
+LD_tc_ld_SLOT01, TypeLD>, Enc_486163 {
+let Inst{12-12} = 0b1;
+let Inst{31-21} = 0b10011100001;
+let hasNewValue = 1;
+let opNewValue = 0;
+let addrMode = BaseLongOffset;
+let accessSize = HalfWordAccess;
+let isExtended = 1;
+let mayLoad = 1;
+let InputType = "imm";
+let DecoderNamespace = "MustExtend";
+let isExtendable = 1;
+let opExtendable = 3;
+let isExtentSigned = 0;
+let opExtentBits = 6;
+let opExtentAlign = 0;
+}
+def L4_loadbsw4_ap : HInst<
+(outs DoubleRegs:$Rdd32, IntRegs:$Re32),
+(ins u32_0Imm:$II),
+"$Rdd32 = membh($Re32=#$II)",
+LD_tc_ld_SLOT01, TypeLD>, Enc_877823 {
+let Inst{7-7} = 0b0;
+let Inst{13-12} = 0b01;
+let Inst{31-21} = 0b10011010111;
+let hasNewValue = 1;
+let opNewValue = 1;
+let addrMode = AbsoluteSet;
+let accessSize = WordAccess;
+let isExtended = 1;
+let mayLoad = 1;
+let DecoderNamespace = "MustExtend";
+let isExtendable = 1;
+let opExtendable = 2;
+let isExtentSigned = 0;
+let opExtentBits = 6;
+let opExtentAlign = 0;
+}
+def L4_loadbsw4_ur : HInst<
+(outs DoubleRegs:$Rdd32),
+(ins IntRegs:$Rt32, u2_0Imm:$Ii, u32_0Imm:$II),
+"$Rdd32 = membh($Rt32<<#$Ii+#$II)",
+LD_tc_ld_SLOT01, TypeLD>, Enc_5582416 {
+let Inst{12-12} = 0b1;
+let Inst{31-21} = 0b10011100111;
+let addrMode = BaseLongOffset;
+let accessSize = WordAccess;
+let isExtended = 1;
+let mayLoad = 1;
+let InputType = "imm";
+let DecoderNamespace = "MustExtend";
+let isExtendable = 1;
+let opExtendable = 3;
+let isExtentSigned = 0;
+let opExtentBits = 6;
+let opExtentAlign = 0;
+}
+def L4_loadbzw2_ap : HInst<
+(outs IntRegs:$Rd32, IntRegs:$Re32),
+(ins u32_0Imm:$II),
+"$Rd32 = memubh($Re32=#$II)",
+LD_tc_ld_SLOT01, TypeLD>, Enc_12616482 {
+let Inst{7-7} = 0b0;
+let Inst{13-12} = 0b01;
+let Inst{31-21} = 0b10011010011;
+let hasNewValue = 1;
+let opNewValue = 0;
+let hasNewValue2 = 1;
+let opNewValue2 = 1;
+let addrMode = AbsoluteSet;
+let accessSize = HalfWordAccess;
+let isExtended = 1;
+let mayLoad = 1;
+let DecoderNamespace = "MustExtend";
+let isExtendable = 1;
+let opExtendable = 2;
+let isExtentSigned = 0;
+let opExtentBits = 6;
+let opExtentAlign = 0;
+}
+def L4_loadbzw2_ur : HInst<
+(outs IntRegs:$Rd32),
+(ins IntRegs:$Rt32, u2_0Imm:$Ii, u32_0Imm:$II),
+"$Rd32 = memubh($Rt32<<#$Ii+#$II)",
+LD_tc_ld_SLOT01, TypeLD>, Enc_486163 {
+let Inst{12-12} = 0b1;
+let Inst{31-21} = 0b10011100011;
+let hasNewValue = 1;
+let opNewValue = 0;
+let addrMode = BaseLongOffset;
+let accessSize = HalfWordAccess;
+let isExtended = 1;
+let mayLoad = 1;
+let InputType = "imm";
+let DecoderNamespace = "MustExtend";
+let isExtendable = 1;
+let opExtendable = 3;
+let isExtentSigned = 0;
+let opExtentBits = 6;
+let opExtentAlign = 0;
+}
+def L4_loadbzw4_ap : HInst<
+(outs DoubleRegs:$Rdd32, IntRegs:$Re32),
+(ins u32_0Imm:$II),
+"$Rdd32 = memubh($Re32=#$II)",
+LD_tc_ld_SLOT01, TypeLD>, Enc_877823 {
+let Inst{7-7} = 0b0;
+let Inst{13-12} = 0b01;
+let Inst{31-21} = 0b10011010101;
+let hasNewValue = 1;
+let opNewValue = 1;
+let addrMode = AbsoluteSet;
+let accessSize = WordAccess;
+let isExtended = 1;
+let mayLoad = 1;
+let DecoderNamespace = "MustExtend";
+let isExtendable = 1;
+let opExtendable = 2;
+let isExtentSigned = 0;
+let opExtentBits = 6;
+let opExtentAlign = 0;
+}
+def L4_loadbzw4_ur : HInst<
+(outs DoubleRegs:$Rdd32),
+(ins IntRegs:$Rt32, u2_0Imm:$Ii, u32_0Imm:$II),
+"$Rdd32 = memubh($Rt32<<#$Ii+#$II)",
+LD_tc_ld_SLOT01, TypeLD>, Enc_5582416 {
+let Inst{12-12} = 0b1;
+let Inst{31-21} = 0b10011100101;
+let addrMode = BaseLongOffset;
+let accessSize = WordAccess;
+let isExtended = 1;
+let mayLoad = 1;
+let InputType = "imm";
+let DecoderNamespace = "MustExtend";
+let isExtendable = 1;
+let opExtendable = 3;
+let isExtentSigned = 0;
+let opExtentBits = 6;
+let opExtentAlign = 0;
+}
+def L4_loadd_locked : HInst<
+(outs DoubleRegs:$Rdd32),
+(ins IntRegs:$Rs32),
+"$Rdd32 = memd_locked($Rs32)",
+LD_tc_ld_SLOT0, TypeLD>, Enc_4030179 {
+let Inst{13-5} = 0b010000000;
+let Inst{31-21} = 0b10010010000;
+let accessSize = DoubleWordAccess;
+let isSoloAX = 1;
+let mayLoad = 1;
+}
+def L4_loadrb_ap : HInst<
+(outs IntRegs:$Rd32, IntRegs:$Re32),
+(ins u32_0Imm:$II),
+"$Rd32 = memb($Re32=#$II)",
+LD_tc_ld_SLOT01, TypeLD>, Enc_12616482 {
+let Inst{7-7} = 0b0;
+let Inst{13-12} = 0b01;
+let Inst{31-21} = 0b10011011000;
+let hasNewValue = 1;
+let opNewValue = 0;
+let hasNewValue2 = 1;
+let opNewValue2 = 1;
+let addrMode = AbsoluteSet;
+let accessSize = ByteAccess;
+let isExtended = 1;
+let mayLoad = 1;
+let DecoderNamespace = "MustExtend";
+let isExtendable = 1;
+let opExtendable = 2;
+let isExtentSigned = 0;
+let opExtentBits = 6;
+let opExtentAlign = 0;
+}
+def L4_loadrb_rr : HInst<
+(outs IntRegs:$Rd32),
+(ins IntRegs:$Rs32, IntRegs:$Rt32, u2_0Imm:$Ii),
+"$Rd32 = memb($Rs32+$Rt32<<#$Ii)",
+V4LDST_tc_ld_SLOT01, TypeLD>, Enc_10721363, AddrModeRel, ImmRegShl {
+let Inst{6-5} = 0b00;
+let Inst{31-21} = 0b00111010000;
+let hasNewValue = 1;
+let opNewValue = 0;
+let addrMode = BaseRegOffset;
+let accessSize = ByteAccess;
+let mayLoad = 1;
+let CextOpcode = "L2_loadrb";
+let InputType = "reg";
+let BaseOpcode = "L4_loadrb_rr";
+let isPredicable = 1;
+}
+def L4_loadrb_ur : HInst<
+(outs IntRegs:$Rd32),
+(ins IntRegs:$Rt32, u2_0Imm:$Ii, u32_0Imm:$II),
+"$Rd32 = memb($Rt32<<#$Ii+#$II)",
+LD_tc_ld_SLOT01, TypeLD>, Enc_486163, AddrModeRel, ImmRegShl {
+let Inst{12-12} = 0b1;
+let Inst{31-21} = 0b10011101000;
+let hasNewValue = 1;
+let opNewValue = 0;
+let addrMode = BaseLongOffset;
+let accessSize = ByteAccess;
+let isExtended = 1;
+let mayLoad = 1;
+let CextOpcode = "L2_loadrb";
+let InputType = "imm";
+let DecoderNamespace = "MustExtend";
+let isExtendable = 1;
+let opExtendable = 3;
+let isExtentSigned = 0;
+let opExtentBits = 6;
+let opExtentAlign = 0;
+}
+def L4_loadrd_ap : HInst<
+(outs DoubleRegs:$Rdd32, IntRegs:$Re32),
+(ins u32_0Imm:$II),
+"$Rdd32 = memd($Re32=#$II)",
+LD_tc_ld_SLOT01, TypeLD>, Enc_877823 {
+let Inst{7-7} = 0b0;
+let Inst{13-12} = 0b01;
+let Inst{31-21} = 0b10011011110;
+let hasNewValue = 1;
+let opNewValue = 1;
+let addrMode = AbsoluteSet;
+let accessSize = DoubleWordAccess;
+let isExtended = 1;
+let mayLoad = 1;
+let DecoderNamespace = "MustExtend";
+let isExtendable = 1;
+let opExtendable = 2;
+let isExtentSigned = 0;
+let opExtentBits = 6;
+let opExtentAlign = 0;
+}
+def L4_loadrd_rr : HInst<
+(outs DoubleRegs:$Rdd32),
+(ins IntRegs:$Rs32, IntRegs:$Rt32, u2_0Imm:$Ii),
+"$Rdd32 = memd($Rs32+$Rt32<<#$Ii)",
+V4LDST_tc_ld_SLOT01, TypeLD>, Enc_7581852, AddrModeRel, ImmRegShl {
+let Inst{6-5} = 0b00;
+let Inst{31-21} = 0b00111010110;
+let addrMode = BaseRegOffset;
+let accessSize = DoubleWordAccess;
+let mayLoad = 1;
+let CextOpcode = "L2_loadrd";
+let InputType = "reg";
+let BaseOpcode = "L4_loadrd_rr";
+let isPredicable = 1;
+}
+def L4_loadrd_ur : HInst<
+(outs DoubleRegs:$Rdd32),
+(ins IntRegs:$Rt32, u2_0Imm:$Ii, u32_0Imm:$II),
+"$Rdd32 = memd($Rt32<<#$Ii+#$II)",
+LD_tc_ld_SLOT01, TypeLD>, Enc_5582416, AddrModeRel, ImmRegShl {
+let Inst{12-12} = 0b1;
+let Inst{31-21} = 0b10011101110;
+let addrMode = BaseLongOffset;
+let accessSize = DoubleWordAccess;
+let isExtended = 1;
+let mayLoad = 1;
+let CextOpcode = "L2_loadrd";
+let InputType = "imm";
+let DecoderNamespace = "MustExtend";
+let isExtendable = 1;
+let opExtendable = 3;
+let isExtentSigned = 0;
+let opExtentBits = 6;
+let opExtentAlign = 0;
+}
+def L4_loadrh_ap : HInst<
+(outs IntRegs:$Rd32, IntRegs:$Re32),
+(ins u32_0Imm:$II),
+"$Rd32 = memh($Re32=#$II)",
+LD_tc_ld_SLOT01, TypeLD>, Enc_12616482 {
+let Inst{7-7} = 0b0;
+let Inst{13-12} = 0b01;
+let Inst{31-21} = 0b10011011010;
+let hasNewValue = 1;
+let opNewValue = 0;
+let hasNewValue2 = 1;
+let opNewValue2 = 1;
+let addrMode = AbsoluteSet;
+let accessSize = HalfWordAccess;
+let isExtended = 1;
+let mayLoad = 1;
+let DecoderNamespace = "MustExtend";
+let isExtendable = 1;
+let opExtendable = 2;
+let isExtentSigned = 0;
+let opExtentBits = 6;
+let opExtentAlign = 0;
+}
+def L4_loadrh_rr : HInst<
+(outs IntRegs:$Rd32),
+(ins IntRegs:$Rs32, IntRegs:$Rt32, u2_0Imm:$Ii),
+"$Rd32 = memh($Rs32+$Rt32<<#$Ii)",
+V4LDST_tc_ld_SLOT01, TypeLD>, Enc_10721363, AddrModeRel, ImmRegShl {
+let Inst{6-5} = 0b00;
+let Inst{31-21} = 0b00111010010;
+let hasNewValue = 1;
+let opNewValue = 0;
+let addrMode = BaseRegOffset;
+let accessSize = HalfWordAccess;
+let mayLoad = 1;
+let CextOpcode = "L2_loadrh";
+let InputType = "reg";
+let BaseOpcode = "L4_loadrh_rr";
+let isPredicable = 1;
+}
+def L4_loadrh_ur : HInst<
+(outs IntRegs:$Rd32),
+(ins IntRegs:$Rt32, u2_0Imm:$Ii, u32_0Imm:$II),
+"$Rd32 = memh($Rt32<<#$Ii+#$II)",
+LD_tc_ld_SLOT01, TypeLD>, Enc_486163, AddrModeRel, ImmRegShl {
+let Inst{12-12} = 0b1;
+let Inst{31-21} = 0b10011101010;
+let hasNewValue = 1;
+let opNewValue = 0;
+let addrMode = BaseLongOffset;
+let accessSize = HalfWordAccess;
+let isExtended = 1;
+let mayLoad = 1;
+let CextOpcode = "L2_loadrh";
+let InputType = "imm";
+let DecoderNamespace = "MustExtend";
+let isExtendable = 1;
+let opExtendable = 3;
+let isExtentSigned = 0;
+let opExtentBits = 6;
+let opExtentAlign = 0;
+}
+def L4_loadri_ap : HInst<
+(outs IntRegs:$Rd32, IntRegs:$Re32),
+(ins u32_0Imm:$II),
+"$Rd32 = memw($Re32=#$II)",
+LD_tc_ld_SLOT01, TypeLD>, Enc_12616482 {
+let Inst{7-7} = 0b0;
+let Inst{13-12} = 0b01;
+let Inst{31-21} = 0b10011011100;
+let hasNewValue = 1;
+let opNewValue = 0;
+let hasNewValue2 = 1;
+let opNewValue2 = 1;
+let addrMode = AbsoluteSet;
+let accessSize = WordAccess;
+let isExtended = 1;
+let mayLoad = 1;
+let DecoderNamespace = "MustExtend";
+let isExtendable = 1;
+let opExtendable = 2;
+let isExtentSigned = 0;
+let opExtentBits = 6;
+let opExtentAlign = 0;
+}
+def L4_loadri_rr : HInst<
+(outs IntRegs:$Rd32),
+(ins IntRegs:$Rs32, IntRegs:$Rt32, u2_0Imm:$Ii),
+"$Rd32 = memw($Rs32+$Rt32<<#$Ii)",
+V4LDST_tc_ld_SLOT01, TypeLD>, Enc_10721363, AddrModeRel, ImmRegShl {
+let Inst{6-5} = 0b00;
+let Inst{31-21} = 0b00111010100;
+let hasNewValue = 1;
+let opNewValue = 0;
+let addrMode = BaseRegOffset;
+let accessSize = WordAccess;
+let mayLoad = 1;
+let CextOpcode = "L2_loadri";
+let InputType = "reg";
+let BaseOpcode = "L4_loadri_rr";
+let isPredicable = 1;
+}
+def L4_loadri_ur : HInst<
+(outs IntRegs:$Rd32),
+(ins IntRegs:$Rt32, u2_0Imm:$Ii, u32_0Imm:$II),
+"$Rd32 = memw($Rt32<<#$Ii+#$II)",
+LD_tc_ld_SLOT01, TypeLD>, Enc_486163, AddrModeRel, ImmRegShl {
+let Inst{12-12} = 0b1;
+let Inst{31-21} = 0b10011101100;
+let hasNewValue = 1;
+let opNewValue = 0;
+let addrMode = BaseLongOffset;
+let accessSize = WordAccess;
+let isExtended = 1;
+let mayLoad = 1;
+let CextOpcode = "L2_loadri";
+let InputType = "imm";
+let DecoderNamespace = "MustExtend";
+let isExtendable = 1;
+let opExtendable = 3;
+let isExtentSigned = 0;
+let opExtentBits = 6;
+let opExtentAlign = 0;
+}
+def L4_loadrub_ap : HInst<
+(outs IntRegs:$Rd32, IntRegs:$Re32),
+(ins u32_0Imm:$II),
+"$Rd32 = memub($Re32=#$II)",
+LD_tc_ld_SLOT01, TypeLD>, Enc_12616482 {
+let Inst{7-7} = 0b0;
+let Inst{13-12} = 0b01;
+let Inst{31-21} = 0b10011011001;
+let hasNewValue = 1;
+let opNewValue = 0;
+let hasNewValue2 = 1;
+let opNewValue2 = 1;
+let addrMode = AbsoluteSet;
+let accessSize = ByteAccess;
+let isExtended = 1;
+let mayLoad = 1;
+let DecoderNamespace = "MustExtend";
+let isExtendable = 1;
+let opExtendable = 2;
+let isExtentSigned = 0;
+let opExtentBits = 6;
+let opExtentAlign = 0;
+}
+def L4_loadrub_rr : HInst<
+(outs IntRegs:$Rd32),
+(ins IntRegs:$Rs32, IntRegs:$Rt32, u2_0Imm:$Ii),
+"$Rd32 = memub($Rs32+$Rt32<<#$Ii)",
+V4LDST_tc_ld_SLOT01, TypeLD>, Enc_10721363, AddrModeRel, ImmRegShl {
+let Inst{6-5} = 0b00;
+let Inst{31-21} = 0b00111010001;
+let hasNewValue = 1;
+let opNewValue = 0;
+let addrMode = BaseRegOffset;
+let accessSize = ByteAccess;
+let mayLoad = 1;
+let CextOpcode = "L2_loadrub";
+let InputType = "reg";
+let BaseOpcode = "L4_loadrub_rr";
+let isPredicable = 1;
+}
+def L4_loadrub_ur : HInst<
+(outs IntRegs:$Rd32),
+(ins IntRegs:$Rt32, u2_0Imm:$Ii, u32_0Imm:$II),
+"$Rd32 = memub($Rt32<<#$Ii+#$II)",
+LD_tc_ld_SLOT01, TypeLD>, Enc_486163, AddrModeRel, ImmRegShl {
+let Inst{12-12} = 0b1;
+let Inst{31-21} = 0b10011101001;
+let hasNewValue = 1;
+let opNewValue = 0;
+let addrMode = BaseLongOffset;
+let accessSize = ByteAccess;
+let isExtended = 1;
+let mayLoad = 1;
+let CextOpcode = "L2_loadrub";
+let InputType = "imm";
+let DecoderNamespace = "MustExtend";
+let isExtendable = 1;
+let opExtendable = 3;
+let isExtentSigned = 0;
+let opExtentBits = 6;
+let opExtentAlign = 0;
+}
+def L4_loadruh_ap : HInst<
+(outs IntRegs:$Rd32, IntRegs:$Re32),
+(ins u32_0Imm:$II),
+"$Rd32 = memuh($Re32=#$II)",
+LD_tc_ld_SLOT01, TypeLD>, Enc_12616482 {
+let Inst{7-7} = 0b0;
+let Inst{13-12} = 0b01;
+let Inst{31-21} = 0b10011011011;
+let hasNewValue = 1;
+let opNewValue = 0;
+let hasNewValue2 = 1;
+let opNewValue2 = 1;
+let addrMode = AbsoluteSet;
+let accessSize = HalfWordAccess;
+let isExtended = 1;
+let mayLoad = 1;
+let DecoderNamespace = "MustExtend";
+let isExtendable = 1;
+let opExtendable = 2;
+let isExtentSigned = 0;
+let opExtentBits = 6;
+let opExtentAlign = 0;
+}
+def L4_loadruh_rr : HInst<
+(outs IntRegs:$Rd32),
+(ins IntRegs:$Rs32, IntRegs:$Rt32, u2_0Imm:$Ii),
+"$Rd32 = memuh($Rs32+$Rt32<<#$Ii)",
+V4LDST_tc_ld_SLOT01, TypeLD>, Enc_10721363, AddrModeRel, ImmRegShl {
+let Inst{6-5} = 0b00;
+let Inst{31-21} = 0b00111010011;
+let hasNewValue = 1;
+let opNewValue = 0;
+let addrMode = BaseRegOffset;
+let accessSize = HalfWordAccess;
+let mayLoad = 1;
+let CextOpcode = "L2_loadruh";
+let InputType = "reg";
+let BaseOpcode = "L4_loadruh_rr";
+let isPredicable = 1;
+}
+def L4_loadruh_ur : HInst<
+(outs IntRegs:$Rd32),
+(ins IntRegs:$Rt32, u2_0Imm:$Ii, u32_0Imm:$II),
+"$Rd32 = memuh($Rt32<<#$Ii+#$II)",
+LD_tc_ld_SLOT01, TypeLD>, Enc_486163, AddrModeRel, ImmRegShl {
+let Inst{12-12} = 0b1;
+let Inst{31-21} = 0b10011101011;
+let hasNewValue = 1;
+let opNewValue = 0;
+let addrMode = BaseLongOffset;
+let accessSize = HalfWordAccess;
+let isExtended = 1;
+let mayLoad = 1;
+let CextOpcode = "L2_loadruh";
+let InputType = "imm";
+let DecoderNamespace = "MustExtend";
+let isExtendable = 1;
+let opExtendable = 3;
+let isExtentSigned = 0;
+let opExtentBits = 6;
+let opExtentAlign = 0;
+}
+def L4_or_memopb_io : HInst<
+(outs),
+(ins IntRegs:$Rs32, u32_0Imm:$Ii, IntRegs:$Rt32),
+"memb($Rs32+#$Ii) |= $Rt32",
+V4LDST_tc_st_SLOT0, TypeV4LDST>, Enc_11849200 {
+let Inst{6-5} = 0b11;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b00111110000;
+let addrMode = BaseImmOffset;
+let accessSize = ByteAccess;
+let mayStore = 1;
+let mayLoad = 1;
+let isExtendable = 1;
+let opExtendable = 1;
+let isExtentSigned = 0;
+let opExtentBits = 6;
+let opExtentAlign = 0;
+}
+def L4_or_memopb_zomap : HInst<
+(outs),
+(ins IntRegs:$Rs32, IntRegs:$Rt32),
+"memb($Rs32) |= $Rt32",
+PSEUDO, TypeMAPPING> {
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+}
+def L4_or_memoph_io : HInst<
+(outs),
+(ins IntRegs:$Rs32, u31_1Imm:$Ii, IntRegs:$Rt32),
+"memh($Rs32+#$Ii) |= $Rt32",
+V4LDST_tc_st_SLOT0, TypeV4LDST>, Enc_8849208 {
+let Inst{6-5} = 0b11;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b00111110001;
+let addrMode = BaseImmOffset;
+let accessSize = HalfWordAccess;
+let mayStore = 1;
+let mayLoad = 1;
+let isExtendable = 1;
+let opExtendable = 1;
+let isExtentSigned = 0;
+let opExtentBits = 7;
+let opExtentAlign = 1;
+}
+def L4_or_memoph_zomap : HInst<
+(outs),
+(ins IntRegs:$Rs32, IntRegs:$Rt32),
+"memh($Rs32) |= $Rt32",
+PSEUDO, TypeMAPPING> {
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+}
+def L4_or_memopw_io : HInst<
+(outs),
+(ins IntRegs:$Rs32, u30_2Imm:$Ii, IntRegs:$Rt32),
+"memw($Rs32+#$Ii) |= $Rt32",
+V4LDST_tc_st_SLOT0, TypeV4LDST>, Enc_9849208 {
+let Inst{6-5} = 0b11;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b00111110010;
+let addrMode = BaseImmOffset;
+let accessSize = WordAccess;
+let mayStore = 1;
+let mayLoad = 1;
+let isExtendable = 1;
+let opExtendable = 1;
+let isExtentSigned = 0;
+let opExtentBits = 8;
+let opExtentAlign = 2;
+}
+def L4_or_memopw_zomap : HInst<
+(outs),
+(ins IntRegs:$Rs32, IntRegs:$Rt32),
+"memw($Rs32) |= $Rt32",
+PSEUDO, TypeMAPPING> {
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+}
+def L4_ploadrbf_abs : HInst<
+(outs IntRegs:$Rd32),
+(ins PredRegs:$Pt4, u32_0Imm:$Ii),
+"if (!$Pt4) $Rd32 = memb(#$Ii)",
+LD_tc_ld_SLOT01, TypeLD>, Enc_13344657, AddrModeRel {
+let Inst{7-5} = 0b100;
+let Inst{13-11} = 0b101;
+let Inst{31-21} = 0b10011111000;
+let isPredicated = 1;
+let isPredicatedFalse = 1;
+let hasNewValue = 1;
+let opNewValue = 0;
+let addrMode = Absolute;
+let accessSize = ByteAccess;
+let isExtended = 1;
+let mayLoad = 1;
+let CextOpcode = "L2_loadrb";
+let BaseOpcode = "L4_loadrb_abs";
+let DecoderNamespace = "MustExtend";
+let isExtendable = 1;
+let opExtendable = 2;
+let isExtentSigned = 0;
+let opExtentBits = 6;
+let opExtentAlign = 0;
+}
+def L4_ploadrbf_rr : HInst<
+(outs IntRegs:$Rd32),
+(ins PredRegs:$Pv4, IntRegs:$Rs32, IntRegs:$Rt32, u2_0Imm:$Ii),
+"if (!$Pv4) $Rd32 = memb($Rs32+$Rt32<<#$Ii)",
+V4LDST_tc_ld_SLOT01, TypeLD>, Enc_1793896, AddrModeRel {
+let Inst{31-21} = 0b00110001000;
+let isPredicated = 1;
+let isPredicatedFalse = 1;
+let hasNewValue = 1;
+let opNewValue = 0;
+let addrMode = BaseRegOffset;
+let accessSize = ByteAccess;
+let mayLoad = 1;
+let CextOpcode = "L2_loadrb";
+let InputType = "reg";
+let BaseOpcode = "L4_loadrb_rr";
+}
+def L4_ploadrbfnew_abs : HInst<
+(outs IntRegs:$Rd32),
+(ins PredRegs:$Pt4, u32_0Imm:$Ii),
+"if (!$Pt4.new) $Rd32 = memb(#$Ii)",
+LD_tc_ld_SLOT01, TypeLD>, Enc_13344657, AddrModeRel {
+let Inst{7-5} = 0b100;
+let Inst{13-11} = 0b111;
+let Inst{31-21} = 0b10011111000;
+let isPredicated = 1;
+let isPredicatedFalse = 1;
+let hasNewValue = 1;
+let opNewValue = 0;
+let addrMode = Absolute;
+let accessSize = ByteAccess;
+let isExtended = 1;
+let isPredicatedNew = 1;
+let mayLoad = 1;
+let CextOpcode = "L2_loadrb";
+let BaseOpcode = "L4_loadrb_abs";
+let DecoderNamespace = "MustExtend";
+let isExtendable = 1;
+let opExtendable = 2;
+let isExtentSigned = 0;
+let opExtentBits = 6;
+let opExtentAlign = 0;
+}
+def L4_ploadrbfnew_rr : HInst<
+(outs IntRegs:$Rd32),
+(ins PredRegs:$Pv4, IntRegs:$Rs32, IntRegs:$Rt32, u2_0Imm:$Ii),
+"if (!$Pv4.new) $Rd32 = memb($Rs32+$Rt32<<#$Ii)",
+V4LDST_tc_ld_SLOT01, TypeLD>, Enc_1793896, AddrModeRel {
+let Inst{31-21} = 0b00110011000;
+let isPredicated = 1;
+let isPredicatedFalse = 1;
+let hasNewValue = 1;
+let opNewValue = 0;
+let addrMode = BaseRegOffset;
+let accessSize = ByteAccess;
+let isPredicatedNew = 1;
+let mayLoad = 1;
+let CextOpcode = "L2_loadrb";
+let InputType = "reg";
+let BaseOpcode = "L4_loadrb_rr";
+}
+def L4_ploadrbt_abs : HInst<
+(outs IntRegs:$Rd32),
+(ins PredRegs:$Pt4, u32_0Imm:$Ii),
+"if ($Pt4) $Rd32 = memb(#$Ii)",
+LD_tc_ld_SLOT01, TypeLD>, Enc_13344657, AddrModeRel {
+let Inst{7-5} = 0b100;
+let Inst{13-11} = 0b100;
+let Inst{31-21} = 0b10011111000;
+let isPredicated = 1;
+let hasNewValue = 1;
+let opNewValue = 0;
+let addrMode = Absolute;
+let accessSize = ByteAccess;
+let isExtended = 1;
+let mayLoad = 1;
+let CextOpcode = "L2_loadrb";
+let BaseOpcode = "L4_loadrb_abs";
+let DecoderNamespace = "MustExtend";
+let isExtendable = 1;
+let opExtendable = 2;
+let isExtentSigned = 0;
+let opExtentBits = 6;
+let opExtentAlign = 0;
+}
+def L4_ploadrbt_rr : HInst<
+(outs IntRegs:$Rd32),
+(ins PredRegs:$Pv4, IntRegs:$Rs32, IntRegs:$Rt32, u2_0Imm:$Ii),
+"if ($Pv4) $Rd32 = memb($Rs32+$Rt32<<#$Ii)",
+V4LDST_tc_ld_SLOT01, TypeLD>, Enc_1793896, AddrModeRel {
+let Inst{31-21} = 0b00110000000;
+let isPredicated = 1;
+let hasNewValue = 1;
+let opNewValue = 0;
+let addrMode = BaseRegOffset;
+let accessSize = ByteAccess;
+let mayLoad = 1;
+let CextOpcode = "L2_loadrb";
+let InputType = "reg";
+let BaseOpcode = "L4_loadrb_rr";
+}
+def L4_ploadrbtnew_abs : HInst<
+(outs IntRegs:$Rd32),
+(ins PredRegs:$Pt4, u32_0Imm:$Ii),
+"if ($Pt4.new) $Rd32 = memb(#$Ii)",
+LD_tc_ld_SLOT01, TypeLD>, Enc_13344657, AddrModeRel {
+let Inst{7-5} = 0b100;
+let Inst{13-11} = 0b110;
+let Inst{31-21} = 0b10011111000;
+let isPredicated = 1;
+let hasNewValue = 1;
+let opNewValue = 0;
+let addrMode = Absolute;
+let accessSize = ByteAccess;
+let isExtended = 1;
+let isPredicatedNew = 1;
+let mayLoad = 1;
+let CextOpcode = "L2_loadrb";
+let BaseOpcode = "L4_loadrb_abs";
+let DecoderNamespace = "MustExtend";
+let isExtendable = 1;
+let opExtendable = 2;
+let isExtentSigned = 0;
+let opExtentBits = 6;
+let opExtentAlign = 0;
+}
+def L4_ploadrbtnew_rr : HInst<
+(outs IntRegs:$Rd32),
+(ins PredRegs:$Pv4, IntRegs:$Rs32, IntRegs:$Rt32, u2_0Imm:$Ii),
+"if ($Pv4.new) $Rd32 = memb($Rs32+$Rt32<<#$Ii)",
+V4LDST_tc_ld_SLOT01, TypeLD>, Enc_1793896, AddrModeRel {
+let Inst{31-21} = 0b00110010000;
+let isPredicated = 1;
+let hasNewValue = 1;
+let opNewValue = 0;
+let addrMode = BaseRegOffset;
+let accessSize = ByteAccess;
+let isPredicatedNew = 1;
+let mayLoad = 1;
+let CextOpcode = "L2_loadrb";
+let InputType = "reg";
+let BaseOpcode = "L4_loadrb_rr";
+}
+def L4_ploadrdf_abs : HInst<
+(outs DoubleRegs:$Rdd32),
+(ins PredRegs:$Pt4, u32_0Imm:$Ii),
+"if (!$Pt4) $Rdd32 = memd(#$Ii)",
+LD_tc_ld_SLOT01, TypeLD>, Enc_15182416, AddrModeRel {
+let Inst{7-5} = 0b100;
+let Inst{13-11} = 0b101;
+let Inst{31-21} = 0b10011111110;
+let isPredicated = 1;
+let isPredicatedFalse = 1;
+let addrMode = Absolute;
+let accessSize = DoubleWordAccess;
+let isExtended = 1;
+let mayLoad = 1;
+let CextOpcode = "L2_loadrd";
+let BaseOpcode = "L4_loadrd_abs";
+let DecoderNamespace = "MustExtend";
+let isExtendable = 1;
+let opExtendable = 2;
+let isExtentSigned = 0;
+let opExtentBits = 6;
+let opExtentAlign = 0;
+}
+def L4_ploadrdf_rr : HInst<
+(outs DoubleRegs:$Rdd32),
+(ins PredRegs:$Pv4, IntRegs:$Rs32, IntRegs:$Rt32, u2_0Imm:$Ii),
+"if (!$Pv4) $Rdd32 = memd($Rs32+$Rt32<<#$Ii)",
+V4LDST_tc_ld_SLOT01, TypeLD>, Enc_7254313, AddrModeRel {
+let Inst{31-21} = 0b00110001110;
+let isPredicated = 1;
+let isPredicatedFalse = 1;
+let addrMode = BaseRegOffset;
+let accessSize = DoubleWordAccess;
+let mayLoad = 1;
+let CextOpcode = "L2_loadrd";
+let InputType = "reg";
+let BaseOpcode = "L4_loadrd_rr";
+}
+def L4_ploadrdfnew_abs : HInst<
+(outs DoubleRegs:$Rdd32),
+(ins PredRegs:$Pt4, u32_0Imm:$Ii),
+"if (!$Pt4.new) $Rdd32 = memd(#$Ii)",
+LD_tc_ld_SLOT01, TypeLD>, Enc_15182416, AddrModeRel {
+let Inst{7-5} = 0b100;
+let Inst{13-11} = 0b111;
+let Inst{31-21} = 0b10011111110;
+let isPredicated = 1;
+let isPredicatedFalse = 1;
+let addrMode = Absolute;
+let accessSize = DoubleWordAccess;
+let isExtended = 1;
+let isPredicatedNew = 1;
+let mayLoad = 1;
+let CextOpcode = "L2_loadrd";
+let BaseOpcode = "L4_loadrd_abs";
+let DecoderNamespace = "MustExtend";
+let isExtendable = 1;
+let opExtendable = 2;
+let isExtentSigned = 0;
+let opExtentBits = 6;
+let opExtentAlign = 0;
+}
+def L4_ploadrdfnew_rr : HInst<
+(outs DoubleRegs:$Rdd32),
+(ins PredRegs:$Pv4, IntRegs:$Rs32, IntRegs:$Rt32, u2_0Imm:$Ii),
+"if (!$Pv4.new) $Rdd32 = memd($Rs32+$Rt32<<#$Ii)",
+V4LDST_tc_ld_SLOT01, TypeLD>, Enc_7254313, AddrModeRel {
+let Inst{31-21} = 0b00110011110;
+let isPredicated = 1;
+let isPredicatedFalse = 1;
+let addrMode = BaseRegOffset;
+let accessSize = DoubleWordAccess;
+let isPredicatedNew = 1;
+let mayLoad = 1;
+let CextOpcode = "L2_loadrd";
+let InputType = "reg";
+let BaseOpcode = "L4_loadrd_rr";
+}
+def L4_ploadrdt_abs : HInst<
+(outs DoubleRegs:$Rdd32),
+(ins PredRegs:$Pt4, u32_0Imm:$Ii),
+"if ($Pt4) $Rdd32 = memd(#$Ii)",
+LD_tc_ld_SLOT01, TypeLD>, Enc_15182416, AddrModeRel {
+let Inst{7-5} = 0b100;
+let Inst{13-11} = 0b100;
+let Inst{31-21} = 0b10011111110;
+let isPredicated = 1;
+let addrMode = Absolute;
+let accessSize = DoubleWordAccess;
+let isExtended = 1;
+let mayLoad = 1;
+let CextOpcode = "L2_loadrd";
+let BaseOpcode = "L4_loadrd_abs";
+let DecoderNamespace = "MustExtend";
+let isExtendable = 1;
+let opExtendable = 2;
+let isExtentSigned = 0;
+let opExtentBits = 6;
+let opExtentAlign = 0;
+}
+def L4_ploadrdt_rr : HInst<
+(outs DoubleRegs:$Rdd32),
+(ins PredRegs:$Pv4, IntRegs:$Rs32, IntRegs:$Rt32, u2_0Imm:$Ii),
+"if ($Pv4) $Rdd32 = memd($Rs32+$Rt32<<#$Ii)",
+V4LDST_tc_ld_SLOT01, TypeLD>, Enc_7254313, AddrModeRel {
+let Inst{31-21} = 0b00110000110;
+let isPredicated = 1;
+let addrMode = BaseRegOffset;
+let accessSize = DoubleWordAccess;
+let mayLoad = 1;
+let CextOpcode = "L2_loadrd";
+let InputType = "reg";
+let BaseOpcode = "L4_loadrd_rr";
+}
+def L4_ploadrdtnew_abs : HInst<
+(outs DoubleRegs:$Rdd32),
+(ins PredRegs:$Pt4, u32_0Imm:$Ii),
+"if ($Pt4.new) $Rdd32 = memd(#$Ii)",
+LD_tc_ld_SLOT01, TypeLD>, Enc_15182416, AddrModeRel {
+let Inst{7-5} = 0b100;
+let Inst{13-11} = 0b110;
+let Inst{31-21} = 0b10011111110;
+let isPredicated = 1;
+let addrMode = Absolute;
+let accessSize = DoubleWordAccess;
+let isExtended = 1;
+let isPredicatedNew = 1;
+let mayLoad = 1;
+let CextOpcode = "L2_loadrd";
+let BaseOpcode = "L4_loadrd_abs";
+let DecoderNamespace = "MustExtend";
+let isExtendable = 1;
+let opExtendable = 2;
+let isExtentSigned = 0;
+let opExtentBits = 6;
+let opExtentAlign = 0;
+}
+def L4_ploadrdtnew_rr : HInst<
+(outs DoubleRegs:$Rdd32),
+(ins PredRegs:$Pv4, IntRegs:$Rs32, IntRegs:$Rt32, u2_0Imm:$Ii),
+"if ($Pv4.new) $Rdd32 = memd($Rs32+$Rt32<<#$Ii)",
+V4LDST_tc_ld_SLOT01, TypeLD>, Enc_7254313, AddrModeRel {
+let Inst{31-21} = 0b00110010110;
+let isPredicated = 1;
+let addrMode = BaseRegOffset;
+let accessSize = DoubleWordAccess;
+let isPredicatedNew = 1;
+let mayLoad = 1;
+let CextOpcode = "L2_loadrd";
+let InputType = "reg";
+let BaseOpcode = "L4_loadrd_rr";
+}
+def L4_ploadrhf_abs : HInst<
+(outs IntRegs:$Rd32),
+(ins PredRegs:$Pt4, u32_0Imm:$Ii),
+"if (!$Pt4) $Rd32 = memh(#$Ii)",
+LD_tc_ld_SLOT01, TypeLD>, Enc_13344657, AddrModeRel {
+let Inst{7-5} = 0b100;
+let Inst{13-11} = 0b101;
+let Inst{31-21} = 0b10011111010;
+let isPredicated = 1;
+let isPredicatedFalse = 1;
+let hasNewValue = 1;
+let opNewValue = 0;
+let addrMode = Absolute;
+let accessSize = HalfWordAccess;
+let isExtended = 1;
+let mayLoad = 1;
+let CextOpcode = "L2_loadrh";
+let BaseOpcode = "L4_loadrh_abs";
+let DecoderNamespace = "MustExtend";
+let isExtendable = 1;
+let opExtendable = 2;
+let isExtentSigned = 0;
+let opExtentBits = 6;
+let opExtentAlign = 0;
+}
+def L4_ploadrhf_rr : HInst<
+(outs IntRegs:$Rd32),
+(ins PredRegs:$Pv4, IntRegs:$Rs32, IntRegs:$Rt32, u2_0Imm:$Ii),
+"if (!$Pv4) $Rd32 = memh($Rs32+$Rt32<<#$Ii)",
+V4LDST_tc_ld_SLOT01, TypeLD>, Enc_1793896, AddrModeRel {
+let Inst{31-21} = 0b00110001010;
+let isPredicated = 1;
+let isPredicatedFalse = 1;
+let hasNewValue = 1;
+let opNewValue = 0;
+let addrMode = BaseRegOffset;
+let accessSize = HalfWordAccess;
+let mayLoad = 1;
+let CextOpcode = "L2_loadrh";
+let InputType = "reg";
+let BaseOpcode = "L4_loadrh_rr";
+}
+def L4_ploadrhfnew_abs : HInst<
+(outs IntRegs:$Rd32),
+(ins PredRegs:$Pt4, u32_0Imm:$Ii),
+"if (!$Pt4.new) $Rd32 = memh(#$Ii)",
+LD_tc_ld_SLOT01, TypeLD>, Enc_13344657, AddrModeRel {
+let Inst{7-5} = 0b100;
+let Inst{13-11} = 0b111;
+let Inst{31-21} = 0b10011111010;
+let isPredicated = 1;
+let isPredicatedFalse = 1;
+let hasNewValue = 1;
+let opNewValue = 0;
+let addrMode = Absolute;
+let accessSize = HalfWordAccess;
+let isExtended = 1;
+let isPredicatedNew = 1;
+let mayLoad = 1;
+let CextOpcode = "L2_loadrh";
+let BaseOpcode = "L4_loadrh_abs";
+let DecoderNamespace = "MustExtend";
+let isExtendable = 1;
+let opExtendable = 2;
+let isExtentSigned = 0;
+let opExtentBits = 6;
+let opExtentAlign = 0;
+}
+def L4_ploadrhfnew_rr : HInst<
+(outs IntRegs:$Rd32),
+(ins PredRegs:$Pv4, IntRegs:$Rs32, IntRegs:$Rt32, u2_0Imm:$Ii),
+"if (!$Pv4.new) $Rd32 = memh($Rs32+$Rt32<<#$Ii)",
+V4LDST_tc_ld_SLOT01, TypeLD>, Enc_1793896, AddrModeRel {
+let Inst{31-21} = 0b00110011010;
+let isPredicated = 1;
+let isPredicatedFalse = 1;
+let hasNewValue = 1;
+let opNewValue = 0;
+let addrMode = BaseRegOffset;
+let accessSize = HalfWordAccess;
+let isPredicatedNew = 1;
+let mayLoad = 1;
+let CextOpcode = "L2_loadrh";
+let InputType = "reg";
+let BaseOpcode = "L4_loadrh_rr";
+}
+def L4_ploadrht_abs : HInst<
+(outs IntRegs:$Rd32),
+(ins PredRegs:$Pt4, u32_0Imm:$Ii),
+"if ($Pt4) $Rd32 = memh(#$Ii)",
+LD_tc_ld_SLOT01, TypeLD>, Enc_13344657, AddrModeRel {
+let Inst{7-5} = 0b100;
+let Inst{13-11} = 0b100;
+let Inst{31-21} = 0b10011111010;
+let isPredicated = 1;
+let hasNewValue = 1;
+let opNewValue = 0;
+let addrMode = Absolute;
+let accessSize = HalfWordAccess;
+let isExtended = 1;
+let mayLoad = 1;
+let CextOpcode = "L2_loadrh";
+let BaseOpcode = "L4_loadrh_abs";
+let DecoderNamespace = "MustExtend";
+let isExtendable = 1;
+let opExtendable = 2;
+let isExtentSigned = 0;
+let opExtentBits = 6;
+let opExtentAlign = 0;
+}
+def L4_ploadrht_rr : HInst<
+(outs IntRegs:$Rd32),
+(ins PredRegs:$Pv4, IntRegs:$Rs32, IntRegs:$Rt32, u2_0Imm:$Ii),
+"if ($Pv4) $Rd32 = memh($Rs32+$Rt32<<#$Ii)",
+V4LDST_tc_ld_SLOT01, TypeLD>, Enc_1793896, AddrModeRel {
+let Inst{31-21} = 0b00110000010;
+let isPredicated = 1;
+let hasNewValue = 1;
+let opNewValue = 0;
+let addrMode = BaseRegOffset;
+let accessSize = HalfWordAccess;
+let mayLoad = 1;
+let CextOpcode = "L2_loadrh";
+let InputType = "reg";
+let BaseOpcode = "L4_loadrh_rr";
+}
+def L4_ploadrhtnew_abs : HInst<
+(outs IntRegs:$Rd32),
+(ins PredRegs:$Pt4, u32_0Imm:$Ii),
+"if ($Pt4.new) $Rd32 = memh(#$Ii)",
+LD_tc_ld_SLOT01, TypeLD>, Enc_13344657, AddrModeRel {
+let Inst{7-5} = 0b100;
+let Inst{13-11} = 0b110;
+let Inst{31-21} = 0b10011111010;
+let isPredicated = 1;
+let hasNewValue = 1;
+let opNewValue = 0;
+let addrMode = Absolute;
+let accessSize = HalfWordAccess;
+let isExtended = 1;
+let isPredicatedNew = 1;
+let mayLoad = 1;
+let CextOpcode = "L2_loadrh";
+let BaseOpcode = "L4_loadrh_abs";
+let DecoderNamespace = "MustExtend";
+let isExtendable = 1;
+let opExtendable = 2;
+let isExtentSigned = 0;
+let opExtentBits = 6;
+let opExtentAlign = 0;
+}
+def L4_ploadrhtnew_rr : HInst<
+(outs IntRegs:$Rd32),
+(ins PredRegs:$Pv4, IntRegs:$Rs32, IntRegs:$Rt32, u2_0Imm:$Ii),
+"if ($Pv4.new) $Rd32 = memh($Rs32+$Rt32<<#$Ii)",
+V4LDST_tc_ld_SLOT01, TypeLD>, Enc_1793896, AddrModeRel {
+let Inst{31-21} = 0b00110010010;
+let isPredicated = 1;
+let hasNewValue = 1;
+let opNewValue = 0;
+let addrMode = BaseRegOffset;
+let accessSize = HalfWordAccess;
+let isPredicatedNew = 1;
+let mayLoad = 1;
+let CextOpcode = "L2_loadrh";
+let InputType = "reg";
+let BaseOpcode = "L4_loadrh_rr";
+}
+def L4_ploadrif_abs : HInst<
+(outs IntRegs:$Rd32),
+(ins PredRegs:$Pt4, u32_0Imm:$Ii),
+"if (!$Pt4) $Rd32 = memw(#$Ii)",
+LD_tc_ld_SLOT01, TypeLD>, Enc_13344657, AddrModeRel {
+let Inst{7-5} = 0b100;
+let Inst{13-11} = 0b101;
+let Inst{31-21} = 0b10011111100;
+let isPredicated = 1;
+let isPredicatedFalse = 1;
+let hasNewValue = 1;
+let opNewValue = 0;
+let addrMode = Absolute;
+let accessSize = WordAccess;
+let isExtended = 1;
+let mayLoad = 1;
+let CextOpcode = "L2_loadri";
+let BaseOpcode = "L4_loadri_abs";
+let DecoderNamespace = "MustExtend";
+let isExtendable = 1;
+let opExtendable = 2;
+let isExtentSigned = 0;
+let opExtentBits = 6;
+let opExtentAlign = 0;
+}
+def L4_ploadrif_rr : HInst<
+(outs IntRegs:$Rd32),
+(ins PredRegs:$Pv4, IntRegs:$Rs32, IntRegs:$Rt32, u2_0Imm:$Ii),
+"if (!$Pv4) $Rd32 = memw($Rs32+$Rt32<<#$Ii)",
+V4LDST_tc_ld_SLOT01, TypeLD>, Enc_1793896, AddrModeRel {
+let Inst{31-21} = 0b00110001100;
+let isPredicated = 1;
+let isPredicatedFalse = 1;
+let hasNewValue = 1;
+let opNewValue = 0;
+let addrMode = BaseRegOffset;
+let accessSize = WordAccess;
+let mayLoad = 1;
+let CextOpcode = "L2_loadri";
+let InputType = "reg";
+let BaseOpcode = "L4_loadri_rr";
+}
+def L4_ploadrifnew_abs : HInst<
+(outs IntRegs:$Rd32),
+(ins PredRegs:$Pt4, u32_0Imm:$Ii),
+"if (!$Pt4.new) $Rd32 = memw(#$Ii)",
+LD_tc_ld_SLOT01, TypeLD>, Enc_13344657, AddrModeRel {
+let Inst{7-5} = 0b100;
+let Inst{13-11} = 0b111;
+let Inst{31-21} = 0b10011111100;
+let isPredicated = 1;
+let isPredicatedFalse = 1;
+let hasNewValue = 1;
+let opNewValue = 0;
+let addrMode = Absolute;
+let accessSize = WordAccess;
+let isExtended = 1;
+let isPredicatedNew = 1;
+let mayLoad = 1;
+let CextOpcode = "L2_loadri";
+let BaseOpcode = "L4_loadri_abs";
+let DecoderNamespace = "MustExtend";
+let isExtendable = 1;
+let opExtendable = 2;
+let isExtentSigned = 0;
+let opExtentBits = 6;
+let opExtentAlign = 0;
+}
+def L4_ploadrifnew_rr : HInst<
+(outs IntRegs:$Rd32),
+(ins PredRegs:$Pv4, IntRegs:$Rs32, IntRegs:$Rt32, u2_0Imm:$Ii),
+"if (!$Pv4.new) $Rd32 = memw($Rs32+$Rt32<<#$Ii)",
+V4LDST_tc_ld_SLOT01, TypeLD>, Enc_1793896, AddrModeRel {
+let Inst{31-21} = 0b00110011100;
+let isPredicated = 1;
+let isPredicatedFalse = 1;
+let hasNewValue = 1;
+let opNewValue = 0;
+let addrMode = BaseRegOffset;
+let accessSize = WordAccess;
+let isPredicatedNew = 1;
+let mayLoad = 1;
+let CextOpcode = "L2_loadri";
+let InputType = "reg";
+let BaseOpcode = "L4_loadri_rr";
+}
+def L4_ploadrit_abs : HInst<
+(outs IntRegs:$Rd32),
+(ins PredRegs:$Pt4, u32_0Imm:$Ii),
+"if ($Pt4) $Rd32 = memw(#$Ii)",
+LD_tc_ld_SLOT01, TypeLD>, Enc_13344657, AddrModeRel {
+let Inst{7-5} = 0b100;
+let Inst{13-11} = 0b100;
+let Inst{31-21} = 0b10011111100;
+let isPredicated = 1;
+let hasNewValue = 1;
+let opNewValue = 0;
+let addrMode = Absolute;
+let accessSize = WordAccess;
+let isExtended = 1;
+let mayLoad = 1;
+let CextOpcode = "L2_loadri";
+let BaseOpcode = "L4_loadri_abs";
+let DecoderNamespace = "MustExtend";
+let isExtendable = 1;
+let opExtendable = 2;
+let isExtentSigned = 0;
+let opExtentBits = 6;
+let opExtentAlign = 0;
+}
+def L4_ploadrit_rr : HInst<
+(outs IntRegs:$Rd32),
+(ins PredRegs:$Pv4, IntRegs:$Rs32, IntRegs:$Rt32, u2_0Imm:$Ii),
+"if ($Pv4) $Rd32 = memw($Rs32+$Rt32<<#$Ii)",
+V4LDST_tc_ld_SLOT01, TypeLD>, Enc_1793896, AddrModeRel {
+let Inst{31-21} = 0b00110000100;
+let isPredicated = 1;
+let hasNewValue = 1;
+let opNewValue = 0;
+let addrMode = BaseRegOffset;
+let accessSize = WordAccess;
+let mayLoad = 1;
+let CextOpcode = "L2_loadri";
+let InputType = "reg";
+let BaseOpcode = "L4_loadri_rr";
+}
+def L4_ploadritnew_abs : HInst<
+(outs IntRegs:$Rd32),
+(ins PredRegs:$Pt4, u32_0Imm:$Ii),
+"if ($Pt4.new) $Rd32 = memw(#$Ii)",
+LD_tc_ld_SLOT01, TypeLD>, Enc_13344657, AddrModeRel {
+let Inst{7-5} = 0b100;
+let Inst{13-11} = 0b110;
+let Inst{31-21} = 0b10011111100;
+let isPredicated = 1;
+let hasNewValue = 1;
+let opNewValue = 0;
+let addrMode = Absolute;
+let accessSize = WordAccess;
+let isExtended = 1;
+let isPredicatedNew = 1;
+let mayLoad = 1;
+let CextOpcode = "L2_loadri";
+let BaseOpcode = "L4_loadri_abs";
+let DecoderNamespace = "MustExtend";
+let isExtendable = 1;
+let opExtendable = 2;
+let isExtentSigned = 0;
+let opExtentBits = 6;
+let opExtentAlign = 0;
+}
+def L4_ploadritnew_rr : HInst<
+(outs IntRegs:$Rd32),
+(ins PredRegs:$Pv4, IntRegs:$Rs32, IntRegs:$Rt32, u2_0Imm:$Ii),
+"if ($Pv4.new) $Rd32 = memw($Rs32+$Rt32<<#$Ii)",
+V4LDST_tc_ld_SLOT01, TypeLD>, Enc_1793896, AddrModeRel {
+let Inst{31-21} = 0b00110010100;
+let isPredicated = 1;
+let hasNewValue = 1;
+let opNewValue = 0;
+let addrMode = BaseRegOffset;
+let accessSize = WordAccess;
+let isPredicatedNew = 1;
+let mayLoad = 1;
+let CextOpcode = "L2_loadri";
+let InputType = "reg";
+let BaseOpcode = "L4_loadri_rr";
+}
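+// The same eight predicated variants repeat below for unsigned byte loads (memub).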
+def L4_ploadrubf_abs : HInst<
+(outs IntRegs:$Rd32),
+(ins PredRegs:$Pt4, u32_0Imm:$Ii),
+"if (!$Pt4) $Rd32 = memub(#$Ii)",
+LD_tc_ld_SLOT01, TypeLD>, Enc_13344657, AddrModeRel {
+let Inst{7-5} = 0b100;
+let Inst{13-11} = 0b101;
+let Inst{31-21} = 0b10011111001;
+let isPredicated = 1;
+let isPredicatedFalse = 1;
+let hasNewValue = 1;
+let opNewValue = 0;
+let addrMode = Absolute;
+let accessSize = ByteAccess;
+let isExtended = 1;
+let mayLoad = 1;
+let CextOpcode = "L2_loadrub";
+let BaseOpcode = "L4_loadrub_abs";
+let DecoderNamespace = "MustExtend";
+let isExtendable = 1;
+let opExtendable = 2;
+let isExtentSigned = 0;
+let opExtentBits = 6;
+let opExtentAlign = 0;
+}
+def L4_ploadrubf_rr : HInst<
+(outs IntRegs:$Rd32),
+(ins PredRegs:$Pv4, IntRegs:$Rs32, IntRegs:$Rt32, u2_0Imm:$Ii),
+"if (!$Pv4) $Rd32 = memub($Rs32+$Rt32<<#$Ii)",
+V4LDST_tc_ld_SLOT01, TypeLD>, Enc_1793896, AddrModeRel {
+let Inst{31-21} = 0b00110001001;
+let isPredicated = 1;
+let isPredicatedFalse = 1;
+let hasNewValue = 1;
+let opNewValue = 0;
+let addrMode = BaseRegOffset;
+let accessSize = ByteAccess;
+let mayLoad = 1;
+let CextOpcode = "L2_loadrub";
+let InputType = "reg";
+let BaseOpcode = "L4_loadrub_rr";
+}
+def L4_ploadrubfnew_abs : HInst<
+(outs IntRegs:$Rd32),
+(ins PredRegs:$Pt4, u32_0Imm:$Ii),
+"if (!$Pt4.new) $Rd32 = memub(#$Ii)",
+LD_tc_ld_SLOT01, TypeLD>, Enc_13344657, AddrModeRel {
+let Inst{7-5} = 0b100;
+let Inst{13-11} = 0b111;
+let Inst{31-21} = 0b10011111001;
+let isPredicated = 1;
+let isPredicatedFalse = 1;
+let hasNewValue = 1;
+let opNewValue = 0;
+let addrMode = Absolute;
+let accessSize = ByteAccess;
+let isExtended = 1;
+let isPredicatedNew = 1;
+let mayLoad = 1;
+let CextOpcode = "L2_loadrub";
+let BaseOpcode = "L4_loadrub_abs";
+let DecoderNamespace = "MustExtend";
+let isExtendable = 1;
+let opExtendable = 2;
+let isExtentSigned = 0;
+let opExtentBits = 6;
+let opExtentAlign = 0;
+}
+def L4_ploadrubfnew_rr : HInst<
+(outs IntRegs:$Rd32),
+(ins PredRegs:$Pv4, IntRegs:$Rs32, IntRegs:$Rt32, u2_0Imm:$Ii),
+"if (!$Pv4.new) $Rd32 = memub($Rs32+$Rt32<<#$Ii)",
+V4LDST_tc_ld_SLOT01, TypeLD>, Enc_1793896, AddrModeRel {
+let Inst{31-21} = 0b00110011001;
+let isPredicated = 1;
+let isPredicatedFalse = 1;
+let hasNewValue = 1;
+let opNewValue = 0;
+let addrMode = BaseRegOffset;
+let accessSize = ByteAccess;
+let isPredicatedNew = 1;
+let mayLoad = 1;
+let CextOpcode = "L2_loadrub";
+let InputType = "reg";
+let BaseOpcode = "L4_loadrub_rr";
+}
+def L4_ploadrubt_abs : HInst<
+(outs IntRegs:$Rd32),
+(ins PredRegs:$Pt4, u32_0Imm:$Ii),
+"if ($Pt4) $Rd32 = memub(#$Ii)",
+LD_tc_ld_SLOT01, TypeLD>, Enc_13344657, AddrModeRel {
+let Inst{7-5} = 0b100;
+let Inst{13-11} = 0b100;
+let Inst{31-21} = 0b10011111001;
+let isPredicated = 1;
+let hasNewValue = 1;
+let opNewValue = 0;
+let addrMode = Absolute;
+let accessSize = ByteAccess;
+let isExtended = 1;
+let mayLoad = 1;
+let CextOpcode = "L2_loadrub";
+let BaseOpcode = "L4_loadrub_abs";
+let DecoderNamespace = "MustExtend";
+let isExtendable = 1;
+let opExtendable = 2;
+let isExtentSigned = 0;
+let opExtentBits = 6;
+let opExtentAlign = 0;
+}
+def L4_ploadrubt_rr : HInst<
+(outs IntRegs:$Rd32),
+(ins PredRegs:$Pv4, IntRegs:$Rs32, IntRegs:$Rt32, u2_0Imm:$Ii),
+"if ($Pv4) $Rd32 = memub($Rs32+$Rt32<<#$Ii)",
+V4LDST_tc_ld_SLOT01, TypeLD>, Enc_1793896, AddrModeRel {
+let Inst{31-21} = 0b00110000001;
+let isPredicated = 1;
+let hasNewValue = 1;
+let opNewValue = 0;
+let addrMode = BaseRegOffset;
+let accessSize = ByteAccess;
+let mayLoad = 1;
+let CextOpcode = "L2_loadrub";
+let InputType = "reg";
+let BaseOpcode = "L4_loadrub_rr";
+}
+def L4_ploadrubtnew_abs : HInst<
+(outs IntRegs:$Rd32),
+(ins PredRegs:$Pt4, u32_0Imm:$Ii),
+"if ($Pt4.new) $Rd32 = memub(#$Ii)",
+LD_tc_ld_SLOT01, TypeLD>, Enc_13344657, AddrModeRel {
+let Inst{7-5} = 0b100;
+let Inst{13-11} = 0b110;
+let Inst{31-21} = 0b10011111001;
+let isPredicated = 1;
+let hasNewValue = 1;
+let opNewValue = 0;
+let addrMode = Absolute;
+let accessSize = ByteAccess;
+let isExtended = 1;
+let isPredicatedNew = 1;
+let mayLoad = 1;
+let CextOpcode = "L2_loadrub";
+let BaseOpcode = "L4_loadrub_abs";
+let DecoderNamespace = "MustExtend";
+let isExtendable = 1;
+let opExtendable = 2;
+let isExtentSigned = 0;
+let opExtentBits = 6;
+let opExtentAlign = 0;
+}
+def L4_ploadrubtnew_rr : HInst<
+(outs IntRegs:$Rd32),
+(ins PredRegs:$Pv4, IntRegs:$Rs32, IntRegs:$Rt32, u2_0Imm:$Ii),
+"if ($Pv4.new) $Rd32 = memub($Rs32+$Rt32<<#$Ii)",
+V4LDST_tc_ld_SLOT01, TypeLD>, Enc_1793896, AddrModeRel {
+let Inst{31-21} = 0b00110010001;
+let isPredicated = 1;
+let hasNewValue = 1;
+let opNewValue = 0;
+let addrMode = BaseRegOffset;
+let accessSize = ByteAccess;
+let isPredicatedNew = 1;
+let mayLoad = 1;
+let CextOpcode = "L2_loadrub";
+let InputType = "reg";
+let BaseOpcode = "L4_loadrub_rr";
+}
+def L4_ploadruhf_abs : HInst<
+(outs IntRegs:$Rd32),
+(ins PredRegs:$Pt4, u32_0Imm:$Ii),
+"if (!$Pt4) $Rd32 = memuh(#$Ii)",
+LD_tc_ld_SLOT01, TypeLD>, Enc_13344657, AddrModeRel {
+let Inst{7-5} = 0b100;
+let Inst{13-11} = 0b101;
+let Inst{31-21} = 0b10011111011;
+let isPredicated = 1;
+let isPredicatedFalse = 1;
+let hasNewValue = 1;
+let opNewValue = 0;
+let addrMode = Absolute;
+let accessSize = HalfWordAccess;
+let isExtended = 1;
+let mayLoad = 1;
+let CextOpcode = "L2_loadruh";
+let BaseOpcode = "L4_loadruh_abs";
+let DecoderNamespace = "MustExtend";
+let isExtendable = 1;
+let opExtendable = 2;
+let isExtentSigned = 0;
+let opExtentBits = 6;
+let opExtentAlign = 0;
+}
+def L4_ploadruhf_rr : HInst<
+(outs IntRegs:$Rd32),
+(ins PredRegs:$Pv4, IntRegs:$Rs32, IntRegs:$Rt32, u2_0Imm:$Ii),
+"if (!$Pv4) $Rd32 = memuh($Rs32+$Rt32<<#$Ii)",
+V4LDST_tc_ld_SLOT01, TypeLD>, Enc_1793896, AddrModeRel {
+let Inst{31-21} = 0b00110001011;
+let isPredicated = 1;
+let isPredicatedFalse = 1;
+let hasNewValue = 1;
+let opNewValue = 0;
+let addrMode = BaseRegOffset;
+let accessSize = HalfWordAccess;
+let mayLoad = 1;
+let CextOpcode = "L2_loadruh";
+let InputType = "reg";
+let BaseOpcode = "L4_loadruh_rr";
+}
+def L4_ploadruhfnew_abs : HInst<
+(outs IntRegs:$Rd32),
+(ins PredRegs:$Pt4, u32_0Imm:$Ii),
+"if (!$Pt4.new) $Rd32 = memuh(#$Ii)",
+LD_tc_ld_SLOT01, TypeLD>, Enc_13344657, AddrModeRel {
+let Inst{7-5} = 0b100;
+let Inst{13-11} = 0b111;
+let Inst{31-21} = 0b10011111011;
+let isPredicated = 1;
+let isPredicatedFalse = 1;
+let hasNewValue = 1;
+let opNewValue = 0;
+let addrMode = Absolute;
+let accessSize = HalfWordAccess;
+let isExtended = 1;
+let isPredicatedNew = 1;
+let mayLoad = 1;
+let CextOpcode = "L2_loadruh";
+let BaseOpcode = "L4_loadruh_abs";
+let DecoderNamespace = "MustExtend";
+let isExtendable = 1;
+let opExtendable = 2;
+let isExtentSigned = 0;
+let opExtentBits = 6;
+let opExtentAlign = 0;
+}
+def L4_ploadruhfnew_rr : HInst<
+(outs IntRegs:$Rd32),
+(ins PredRegs:$Pv4, IntRegs:$Rs32, IntRegs:$Rt32, u2_0Imm:$Ii),
+"if (!$Pv4.new) $Rd32 = memuh($Rs32+$Rt32<<#$Ii)",
+V4LDST_tc_ld_SLOT01, TypeLD>, Enc_1793896, AddrModeRel {
+let Inst{31-21} = 0b00110011011;
+let isPredicated = 1;
+let isPredicatedFalse = 1;
+let hasNewValue = 1;
+let opNewValue = 0;
+let addrMode = BaseRegOffset;
+let accessSize = HalfWordAccess;
+let isPredicatedNew = 1;
+let mayLoad = 1;
+let CextOpcode = "L2_loadruh";
+let InputType = "reg";
+let BaseOpcode = "L4_loadruh_rr";
+}
+def L4_ploadruht_abs : HInst<
+(outs IntRegs:$Rd32),
+(ins PredRegs:$Pt4, u32_0Imm:$Ii),
+"if ($Pt4) $Rd32 = memuh(#$Ii)",
+LD_tc_ld_SLOT01, TypeLD>, Enc_13344657, AddrModeRel {
+let Inst{7-5} = 0b100;
+let Inst{13-11} = 0b100;
+let Inst{31-21} = 0b10011111011;
+let isPredicated = 1;
+let hasNewValue = 1;
+let opNewValue = 0;
+let addrMode = Absolute;
+let accessSize = HalfWordAccess;
+let isExtended = 1;
+let mayLoad = 1;
+let CextOpcode = "L2_loadruh";
+let BaseOpcode = "L4_loadruh_abs";
+let DecoderNamespace = "MustExtend";
+let isExtendable = 1;
+let opExtendable = 2;
+let isExtentSigned = 0;
+let opExtentBits = 6;
+let opExtentAlign = 0;
+}
+def L4_ploadruht_rr : HInst<
+(outs IntRegs:$Rd32),
+(ins PredRegs:$Pv4, IntRegs:$Rs32, IntRegs:$Rt32, u2_0Imm:$Ii),
+"if ($Pv4) $Rd32 = memuh($Rs32+$Rt32<<#$Ii)",
+V4LDST_tc_ld_SLOT01, TypeLD>, Enc_1793896, AddrModeRel {
+let Inst{31-21} = 0b00110000011;
+let isPredicated = 1;
+let hasNewValue = 1;
+let opNewValue = 0;
+let addrMode = BaseRegOffset;
+let accessSize = HalfWordAccess;
+let mayLoad = 1;
+let CextOpcode = "L2_loadruh";
+let InputType = "reg";
+let BaseOpcode = "L4_loadruh_rr";
+}
+def L4_ploadruhtnew_abs : HInst<
+(outs IntRegs:$Rd32),
+(ins PredRegs:$Pt4, u32_0Imm:$Ii),
+"if ($Pt4.new) $Rd32 = memuh(#$Ii)",
+LD_tc_ld_SLOT01, TypeLD>, Enc_13344657, AddrModeRel {
+let Inst{7-5} = 0b100;
+let Inst{13-11} = 0b110;
+let Inst{31-21} = 0b10011111011;
+let isPredicated = 1;
+let hasNewValue = 1;
+let opNewValue = 0;
+let addrMode = Absolute;
+let accessSize = HalfWordAccess;
+let isExtended = 1;
+let isPredicatedNew = 1;
+let mayLoad = 1;
+let CextOpcode = "L2_loadruh";
+let BaseOpcode = "L4_loadruh_abs";
+let DecoderNamespace = "MustExtend";
+let isExtendable = 1;
+let opExtendable = 2;
+let isExtentSigned = 0;
+let opExtentBits = 6;
+let opExtentAlign = 0;
+}
+def L4_ploadruhtnew_rr : HInst<
+(outs IntRegs:$Rd32),
+(ins PredRegs:$Pv4, IntRegs:$Rs32, IntRegs:$Rt32, u2_0Imm:$Ii),
+"if ($Pv4.new) $Rd32 = memuh($Rs32+$Rt32<<#$Ii)",
+V4LDST_tc_ld_SLOT01, TypeLD>, Enc_1793896, AddrModeRel {
+let Inst{31-21} = 0b00110010011;
+let isPredicated = 1;
+let hasNewValue = 1;
+let opNewValue = 0;
+let addrMode = BaseRegOffset;
+let accessSize = HalfWordAccess;
+let isPredicatedNew = 1;
+let mayLoad = 1;
+let CextOpcode = "L2_loadruh";
+let InputType = "reg";
+let BaseOpcode = "L4_loadruh_rr";
+}
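+// dealloc_return family: loads the saved frame pair (Uses R30, DoubleWordAccess),
+// restores R29/R30/R31, and returns through PC; the :t/:nt assembly suffixes are
+// the taken/not-taken hints reflected in isTaken.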
+def L4_return : HInst<
+(outs),
+(ins),
+"dealloc_return",
+LD_tc_3or4stall_SLOT0, TypeLD>, Enc_0, PredNewRel {
+let Inst{4-0} = 0b11110;
+let Inst{13-5} = 0b000000000;
+let Inst{31-21} = 0b10010110000;
+let Inst{20-16} = 0b11110;
+let isTerminator = 1;
+let isIndirectBranch = 1;
+let accessSize = DoubleWordAccess;
+let cofMax1 = 1;
+let isReturn = 1;
+let mayLoad = 1;
+let Uses = [R30];
+let Defs = [PC, R29, R30, R31];
+let BaseOpcode = "L4_return";
+let isBarrier = 1;
+let isPredicable = 1;
+let isTaken = 1;
+}
+def L4_return_f : HInst<
+(outs),
+(ins PredRegs:$Pv4),
+"if (!$Pv4) dealloc_return",
+LD_tc_3or4stall_SLOT0, TypeLD>, Enc_12711252, PredNewRel {
+let Inst{4-0} = 0b11110;
+let Inst{7-5} = 0b000;
+let Inst{13-10} = 0b1100;
+let Inst{31-21} = 0b10010110000;
+let Inst{20-16} = 0b11110;
+let isPredicated = 1;
+let isPredicatedFalse = 1;
+let isTerminator = 1;
+let isIndirectBranch = 1;
+let accessSize = DoubleWordAccess;
+let cofMax1 = 1;
+let isReturn = 1;
+let mayLoad = 1;
+let Uses = [R30];
+let Defs = [PC, R29, R30, R31];
+let BaseOpcode = "L4_return";
+let isTaken = Inst{12};
+}
+def L4_return_fnew_pnt : HInst<
+(outs),
+(ins PredRegs:$Pv4),
+"if (!$Pv4.new) dealloc_return:nt",
+LD_tc_3or4stall_SLOT0, TypeLD>, Enc_12711252, PredNewRel {
+let Inst{4-0} = 0b11110;
+let Inst{7-5} = 0b000;
+let Inst{13-10} = 0b1010;
+let Inst{31-21} = 0b10010110000;
+let Inst{20-16} = 0b11110;
+let isPredicated = 1;
+let isPredicatedFalse = 1;
+let isTerminator = 1;
+let isIndirectBranch = 1;
+let accessSize = DoubleWordAccess;
+let cofMax1 = 1;
+let isReturn = 1;
+let isPredicatedNew = 1;
+let mayLoad = 1;
+let Uses = [R30];
+let Defs = [PC, R29, R30, R31];
+let BaseOpcode = "L4_return";
+let isTaken = Inst{12};
+}
+def L4_return_fnew_pt : HInst<
+(outs),
+(ins PredRegs:$Pv4),
+"if (!$Pv4.new) dealloc_return:t",
+LD_tc_3or4stall_SLOT0, TypeLD>, Enc_12711252, PredNewRel {
+let Inst{4-0} = 0b11110;
+let Inst{7-5} = 0b000;
+let Inst{13-10} = 0b1110;
+let Inst{31-21} = 0b10010110000;
+let Inst{20-16} = 0b11110;
+let isPredicated = 1;
+let isPredicatedFalse = 1;
+let isTerminator = 1;
+let isIndirectBranch = 1;
+let accessSize = DoubleWordAccess;
+let cofMax1 = 1;
+let isReturn = 1;
+let isPredicatedNew = 1;
+let mayLoad = 1;
+let Uses = [R30];
+let Defs = [PC, R29, R30, R31];
+let BaseOpcode = "L4_return";
+let isTaken = Inst{12};
+}
+def L4_return_t : HInst<
+(outs),
+(ins PredRegs:$Pv4),
+"if ($Pv4) dealloc_return",
+LD_tc_3or4stall_SLOT0, TypeLD>, Enc_12711252, PredNewRel {
+let Inst{4-0} = 0b11110;
+let Inst{7-5} = 0b000;
+let Inst{13-10} = 0b0100;
+let Inst{31-21} = 0b10010110000;
+let Inst{20-16} = 0b11110;
+let isPredicated = 1;
+let isTerminator = 1;
+let isIndirectBranch = 1;
+let accessSize = DoubleWordAccess;
+let cofMax1 = 1;
+let isReturn = 1;
+let mayLoad = 1;
+let Uses = [R30];
+let Defs = [PC, R29, R30, R31];
+let BaseOpcode = "L4_return";
+let isTaken = Inst{12};
+}
+def L4_return_tnew_pnt : HInst<
+(outs),
+(ins PredRegs:$Pv4),
+"if ($Pv4.new) dealloc_return:nt",
+LD_tc_3or4stall_SLOT0, TypeLD>, Enc_12711252, PredNewRel {
+let Inst{4-0} = 0b11110;
+let Inst{7-5} = 0b000;
+let Inst{13-10} = 0b0010;
+let Inst{31-21} = 0b10010110000;
+let Inst{20-16} = 0b11110;
+let isPredicated = 1;
+let isTerminator = 1;
+let isIndirectBranch = 1;
+let accessSize = DoubleWordAccess;
+let cofMax1 = 1;
+let isReturn = 1;
+let isPredicatedNew = 1;
+let mayLoad = 1;
+let Uses = [R30];
+let Defs = [PC, R29, R30, R31];
+let BaseOpcode = "L4_return";
+let isTaken = Inst{12};
+}
+def L4_return_tnew_pt : HInst<
+(outs),
+(ins PredRegs:$Pv4),
+"if ($Pv4.new) dealloc_return:t",
+LD_tc_3or4stall_SLOT0, TypeLD>, Enc_12711252, PredNewRel {
+let Inst{4-0} = 0b11110;
+let Inst{7-5} = 0b000;
+let Inst{13-10} = 0b0110;
+let Inst{31-21} = 0b10010110000;
+let Inst{20-16} = 0b11110;
+let isPredicated = 1;
+let isTerminator = 1;
+let isIndirectBranch = 1;
+let accessSize = DoubleWordAccess;
+let cofMax1 = 1;
+let isReturn = 1;
+let isPredicatedNew = 1;
+let mayLoad = 1;
+let Uses = [R30];
+let Defs = [PC, R29, R30, R31];
+let BaseOpcode = "L4_return";
+let isTaken = Inst{12};
+}
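+// Memop subtracts: read-modify-write memory ops, so both mayLoad and mayStore are
+// set; the _zomap pseudos (TypeMAPPING, isCodeGenOnly) carry the zero-offset
+// syntax that maps onto the immediate-offset encodings.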
+def L4_sub_memopb_io : HInst<
+(outs),
+(ins IntRegs:$Rs32, u32_0Imm:$Ii, IntRegs:$Rt32),
+"memb($Rs32+#$Ii) -= $Rt32",
+V4LDST_tc_st_SLOT0, TypeV4LDST>, Enc_11849200 {
+let Inst{6-5} = 0b01;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b00111110000;
+let addrMode = BaseImmOffset;
+let accessSize = ByteAccess;
+let mayStore = 1;
+let mayLoad = 1;
+let isExtendable = 1;
+let opExtendable = 1;
+let isExtentSigned = 0;
+let opExtentBits = 6;
+let opExtentAlign = 0;
+}
+def L4_sub_memopb_zomap : HInst<
+(outs),
+(ins IntRegs:$Rs32, IntRegs:$Rt32),
+"memb($Rs32) -= $Rt32",
+PSEUDO, TypeMAPPING> {
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+}
+def L4_sub_memoph_io : HInst<
+(outs),
+(ins IntRegs:$Rs32, u31_1Imm:$Ii, IntRegs:$Rt32),
+"memh($Rs32+#$Ii) -= $Rt32",
+V4LDST_tc_st_SLOT0, TypeV4LDST>, Enc_8849208 {
+let Inst{6-5} = 0b01;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b00111110001;
+let addrMode = BaseImmOffset;
+let accessSize = HalfWordAccess;
+let mayStore = 1;
+let mayLoad = 1;
+let isExtendable = 1;
+let opExtendable = 1;
+let isExtentSigned = 0;
+let opExtentBits = 7;
+let opExtentAlign = 1;
+}
+def L4_sub_memoph_zomap : HInst<
+(outs),
+(ins IntRegs:$Rs32, IntRegs:$Rt32),
+"memh($Rs32) -= $Rt32",
+PSEUDO, TypeMAPPING> {
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+}
+def L4_sub_memopw_io : HInst<
+(outs),
+(ins IntRegs:$Rs32, u30_2Imm:$Ii, IntRegs:$Rt32),
+"memw($Rs32+#$Ii) -= $Rt32",
+V4LDST_tc_st_SLOT0, TypeV4LDST>, Enc_9849208 {
+let Inst{6-5} = 0b01;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b00111110010;
+let addrMode = BaseImmOffset;
+let accessSize = WordAccess;
+let mayStore = 1;
+let mayLoad = 1;
+let isExtendable = 1;
+let opExtendable = 1;
+let isExtentSigned = 0;
+let opExtentBits = 8;
+let opExtentAlign = 2;
+}
+def L4_sub_memopw_zomap : HInst<
+(outs),
+(ins IntRegs:$Rs32, IntRegs:$Rt32),
+"memw($Rs32) -= $Rt32",
+PSEUDO, TypeMAPPING> {
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+}
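+// Accumulating adds: Rx += add(Rs,Rt) / add(Rs,#s); the register and immediate
+// forms are paired through CextOpcode "M2_acci" and ImmRegRel, and both tie the
+// accumulator via "$Rx32 = $Rx32in".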
+def M2_acci : HInst<
+(outs IntRegs:$Rx32),
+(ins IntRegs:$Rx32in, IntRegs:$Rs32, IntRegs:$Rt32),
+"$Rx32 += add($Rs32,$Rt32)",
+M_tc_2_acc_SLOT23, TypeM>, Enc_9223889, ImmRegRel {
+let Inst{7-5} = 0b001;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11101111000;
+let hasNewValue = 1;
+let opNewValue = 0;
+let prefersSlot3 = 1;
+let CextOpcode = "M2_acci";
+let InputType = "reg";
+let Constraints = "$Rx32 = $Rx32in";
+}
+def M2_accii : HInst<
+(outs IntRegs:$Rx32),
+(ins IntRegs:$Rx32in, IntRegs:$Rs32, s32_0Imm:$Ii),
+"$Rx32 += add($Rs32,#$Ii)",
+M_tc_2_acc_SLOT23, TypeM>, Enc_11522288, ImmRegRel {
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11100010000;
+let hasNewValue = 1;
+let opNewValue = 0;
+let prefersSlot3 = 1;
+let CextOpcode = "M2_acci";
+let InputType = "imm";
+let isExtendable = 1;
+let opExtendable = 3;
+let isExtentSigned = 1;
+let opExtentBits = 8;
+let opExtentAlign = 0;
+let Constraints = "$Rx32 = $Rx32in";
+}
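+// Complex-multiply family: cmpy produces the full product (the i/r suffixes
+// selecting its imaginary/real component), "*" marks a conjugated Rt operand,
+// cmac/cnac accumulate into or subtract from Rxx, and the :sat/_s1 variants
+// saturate (defining USR_OVF) and pre-shift by :<<1.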
+def M2_cmaci_s0 : HInst<
+(outs DoubleRegs:$Rxx32),
+(ins DoubleRegs:$Rxx32in, IntRegs:$Rs32, IntRegs:$Rt32),
+"$Rxx32 += cmpyi($Rs32,$Rt32)",
+M_tc_3x_acc_SLOT23, TypeM>, Enc_1409050 {
+let Inst{7-5} = 0b001;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11100111000;
+let prefersSlot3 = 1;
+let Constraints = "$Rxx32 = $Rxx32in";
+}
+def M2_cmacr_s0 : HInst<
+(outs DoubleRegs:$Rxx32),
+(ins DoubleRegs:$Rxx32in, IntRegs:$Rs32, IntRegs:$Rt32),
+"$Rxx32 += cmpyr($Rs32,$Rt32)",
+M_tc_3x_acc_SLOT23, TypeM>, Enc_1409050 {
+let Inst{7-5} = 0b010;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11100111000;
+let prefersSlot3 = 1;
+let Constraints = "$Rxx32 = $Rxx32in";
+}
+def M2_cmacs_s0 : HInst<
+(outs DoubleRegs:$Rxx32),
+(ins DoubleRegs:$Rxx32in, IntRegs:$Rs32, IntRegs:$Rt32),
+"$Rxx32 += cmpy($Rs32,$Rt32):sat",
+M_tc_3x_acc_SLOT23, TypeM>, Enc_1409050 {
+let Inst{7-5} = 0b110;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11100111000;
+let prefersSlot3 = 1;
+let Defs = [USR_OVF];
+let Constraints = "$Rxx32 = $Rxx32in";
+}
+def M2_cmacs_s1 : HInst<
+(outs DoubleRegs:$Rxx32),
+(ins DoubleRegs:$Rxx32in, IntRegs:$Rs32, IntRegs:$Rt32),
+"$Rxx32 += cmpy($Rs32,$Rt32):<<1:sat",
+M_tc_3x_acc_SLOT23, TypeM>, Enc_1409050 {
+let Inst{7-5} = 0b110;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11100111100;
+let prefersSlot3 = 1;
+let Defs = [USR_OVF];
+let Constraints = "$Rxx32 = $Rxx32in";
+}
+def M2_cmacsc_s0 : HInst<
+(outs DoubleRegs:$Rxx32),
+(ins DoubleRegs:$Rxx32in, IntRegs:$Rs32, IntRegs:$Rt32),
+"$Rxx32 += cmpy($Rs32,$Rt32*):sat",
+M_tc_3x_acc_SLOT23, TypeM>, Enc_1409050 {
+let Inst{7-5} = 0b110;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11100111010;
+let prefersSlot3 = 1;
+let Defs = [USR_OVF];
+let Constraints = "$Rxx32 = $Rxx32in";
+}
+def M2_cmacsc_s1 : HInst<
+(outs DoubleRegs:$Rxx32),
+(ins DoubleRegs:$Rxx32in, IntRegs:$Rs32, IntRegs:$Rt32),
+"$Rxx32 += cmpy($Rs32,$Rt32*):<<1:sat",
+M_tc_3x_acc_SLOT23, TypeM>, Enc_1409050 {
+let Inst{7-5} = 0b110;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11100111110;
+let prefersSlot3 = 1;
+let Defs = [USR_OVF];
+let Constraints = "$Rxx32 = $Rxx32in";
+}
+def M2_cmpyi_s0 : HInst<
+(outs DoubleRegs:$Rdd32),
+(ins IntRegs:$Rs32, IntRegs:$Rt32),
+"$Rdd32 = cmpyi($Rs32,$Rt32)",
+M_tc_3x_SLOT23, TypeM>, Enc_1997594 {
+let Inst{7-5} = 0b001;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11100101000;
+let prefersSlot3 = 1;
+}
+def M2_cmpyr_s0 : HInst<
+(outs DoubleRegs:$Rdd32),
+(ins IntRegs:$Rs32, IntRegs:$Rt32),
+"$Rdd32 = cmpyr($Rs32,$Rt32)",
+M_tc_3x_SLOT23, TypeM>, Enc_1997594 {
+let Inst{7-5} = 0b010;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11100101000;
+let prefersSlot3 = 1;
+}
+def M2_cmpyrs_s0 : HInst<
+(outs IntRegs:$Rd32),
+(ins IntRegs:$Rs32, IntRegs:$Rt32),
+"$Rd32 = cmpy($Rs32,$Rt32):rnd:sat",
+M_tc_3x_SLOT23, TypeM>, Enc_14071773 {
+let Inst{7-5} = 0b110;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11101101001;
+let hasNewValue = 1;
+let opNewValue = 0;
+let prefersSlot3 = 1;
+let Defs = [USR_OVF];
+}
+def M2_cmpyrs_s1 : HInst<
+(outs IntRegs:$Rd32),
+(ins IntRegs:$Rs32, IntRegs:$Rt32),
+"$Rd32 = cmpy($Rs32,$Rt32):<<1:rnd:sat",
+M_tc_3x_SLOT23, TypeM>, Enc_14071773 {
+let Inst{7-5} = 0b110;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11101101101;
+let hasNewValue = 1;
+let opNewValue = 0;
+let prefersSlot3 = 1;
+let Defs = [USR_OVF];
+}
+def M2_cmpyrsc_s0 : HInst<
+(outs IntRegs:$Rd32),
+(ins IntRegs:$Rs32, IntRegs:$Rt32),
+"$Rd32 = cmpy($Rs32,$Rt32*):rnd:sat",
+M_tc_3x_SLOT23, TypeM>, Enc_14071773 {
+let Inst{7-5} = 0b110;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11101101011;
+let hasNewValue = 1;
+let opNewValue = 0;
+let prefersSlot3 = 1;
+let Defs = [USR_OVF];
+}
+def M2_cmpyrsc_s1 : HInst<
+(outs IntRegs:$Rd32),
+(ins IntRegs:$Rs32, IntRegs:$Rt32),
+"$Rd32 = cmpy($Rs32,$Rt32*):<<1:rnd:sat",
+M_tc_3x_SLOT23, TypeM>, Enc_14071773 {
+let Inst{7-5} = 0b110;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11101101111;
+let hasNewValue = 1;
+let opNewValue = 0;
+let prefersSlot3 = 1;
+let Defs = [USR_OVF];
+}
+def M2_cmpys_s0 : HInst<
+(outs DoubleRegs:$Rdd32),
+(ins IntRegs:$Rs32, IntRegs:$Rt32),
+"$Rdd32 = cmpy($Rs32,$Rt32):sat",
+M_tc_3x_SLOT23, TypeM>, Enc_1997594 {
+let Inst{7-5} = 0b110;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11100101000;
+let prefersSlot3 = 1;
+let Defs = [USR_OVF];
+}
+def M2_cmpys_s1 : HInst<
+(outs DoubleRegs:$Rdd32),
+(ins IntRegs:$Rs32, IntRegs:$Rt32),
+"$Rdd32 = cmpy($Rs32,$Rt32):<<1:sat",
+M_tc_3x_SLOT23, TypeM>, Enc_1997594 {
+let Inst{7-5} = 0b110;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11100101100;
+let prefersSlot3 = 1;
+let Defs = [USR_OVF];
+}
+def M2_cmpysc_s0 : HInst<
+(outs DoubleRegs:$Rdd32),
+(ins IntRegs:$Rs32, IntRegs:$Rt32),
+"$Rdd32 = cmpy($Rs32,$Rt32*):sat",
+M_tc_3x_SLOT23, TypeM>, Enc_1997594 {
+let Inst{7-5} = 0b110;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11100101010;
+let prefersSlot3 = 1;
+let Defs = [USR_OVF];
+}
+def M2_cmpysc_s1 : HInst<
+(outs DoubleRegs:$Rdd32),
+(ins IntRegs:$Rs32, IntRegs:$Rt32),
+"$Rdd32 = cmpy($Rs32,$Rt32*):<<1:sat",
+M_tc_3x_SLOT23, TypeM>, Enc_1997594 {
+let Inst{7-5} = 0b110;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11100101110;
+let prefersSlot3 = 1;
+let Defs = [USR_OVF];
+}
+def M2_cnacs_s0 : HInst<
+(outs DoubleRegs:$Rxx32),
+(ins DoubleRegs:$Rxx32in, IntRegs:$Rs32, IntRegs:$Rt32),
+"$Rxx32 -= cmpy($Rs32,$Rt32):sat",
+M_tc_3x_acc_SLOT23, TypeM>, Enc_1409050 {
+let Inst{7-5} = 0b111;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11100111000;
+let prefersSlot3 = 1;
+let Defs = [USR_OVF];
+let Constraints = "$Rxx32 = $Rxx32in";
+}
+def M2_cnacs_s1 : HInst<
+(outs DoubleRegs:$Rxx32),
+(ins DoubleRegs:$Rxx32in, IntRegs:$Rs32, IntRegs:$Rt32),
+"$Rxx32 -= cmpy($Rs32,$Rt32):<<1:sat",
+M_tc_3x_acc_SLOT23, TypeM>, Enc_1409050 {
+let Inst{7-5} = 0b111;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11100111100;
+let prefersSlot3 = 1;
+let Defs = [USR_OVF];
+let Constraints = "$Rxx32 = $Rxx32in";
+}
+def M2_cnacsc_s0 : HInst<
+(outs DoubleRegs:$Rxx32),
+(ins DoubleRegs:$Rxx32in, IntRegs:$Rs32, IntRegs:$Rt32),
+"$Rxx32 -= cmpy($Rs32,$Rt32*):sat",
+M_tc_3x_acc_SLOT23, TypeM>, Enc_1409050 {
+let Inst{7-5} = 0b111;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11100111010;
+let prefersSlot3 = 1;
+let Defs = [USR_OVF];
+let Constraints = "$Rxx32 = $Rxx32in";
+}
+def M2_cnacsc_s1 : HInst<
+(outs DoubleRegs:$Rxx32),
+(ins DoubleRegs:$Rxx32in, IntRegs:$Rs32, IntRegs:$Rt32),
+"$Rxx32 -= cmpy($Rs32,$Rt32*):<<1:sat",
+M_tc_3x_acc_SLOT23, TypeM>, Enc_1409050 {
+let Inst{7-5} = 0b111;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11100111110;
+let prefersSlot3 = 1;
+let Defs = [USR_OVF];
+let Constraints = "$Rxx32 = $Rxx32in";
+}
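+// Full 32x32->64-bit multiplies: dpmpyss (signed) and dpmpyuu (unsigned), each in
+// plain, accumulate (+=) and negate-accumulate (-=) forms; dpmpyss_rnd_s0 adds
+// :rnd rounding into a 32-bit destination.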
+def M2_dpmpyss_acc_s0 : HInst<
+(outs DoubleRegs:$Rxx32),
+(ins DoubleRegs:$Rxx32in, IntRegs:$Rs32, IntRegs:$Rt32),
+"$Rxx32 += mpy($Rs32,$Rt32)",
+M_tc_3x_acc_SLOT23, TypeM>, Enc_1409050 {
+let Inst{7-5} = 0b000;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11100111000;
+let prefersSlot3 = 1;
+let Constraints = "$Rxx32 = $Rxx32in";
+}
+def M2_dpmpyss_nac_s0 : HInst<
+(outs DoubleRegs:$Rxx32),
+(ins DoubleRegs:$Rxx32in, IntRegs:$Rs32, IntRegs:$Rt32),
+"$Rxx32 -= mpy($Rs32,$Rt32)",
+M_tc_3x_acc_SLOT23, TypeM>, Enc_1409050 {
+let Inst{7-5} = 0b000;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11100111001;
+let prefersSlot3 = 1;
+let Constraints = "$Rxx32 = $Rxx32in";
+}
+def M2_dpmpyss_rnd_s0 : HInst<
+(outs IntRegs:$Rd32),
+(ins IntRegs:$Rs32, IntRegs:$Rt32),
+"$Rd32 = mpy($Rs32,$Rt32):rnd",
+M_tc_3x_SLOT23, TypeM>, Enc_14071773 {
+let Inst{7-5} = 0b001;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11101101001;
+let hasNewValue = 1;
+let opNewValue = 0;
+let prefersSlot3 = 1;
+}
+def M2_dpmpyss_s0 : HInst<
+(outs DoubleRegs:$Rdd32),
+(ins IntRegs:$Rs32, IntRegs:$Rt32),
+"$Rdd32 = mpy($Rs32,$Rt32)",
+M_tc_3x_SLOT23, TypeM>, Enc_1997594 {
+let Inst{7-5} = 0b000;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11100101000;
+let prefersSlot3 = 1;
+}
+def M2_dpmpyuu_acc_s0 : HInst<
+(outs DoubleRegs:$Rxx32),
+(ins DoubleRegs:$Rxx32in, IntRegs:$Rs32, IntRegs:$Rt32),
+"$Rxx32 += mpyu($Rs32,$Rt32)",
+M_tc_3x_acc_SLOT23, TypeM>, Enc_1409050 {
+let Inst{7-5} = 0b000;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11100111010;
+let prefersSlot3 = 1;
+let Constraints = "$Rxx32 = $Rxx32in";
+}
+def M2_dpmpyuu_nac_s0 : HInst<
+(outs DoubleRegs:$Rxx32),
+(ins DoubleRegs:$Rxx32in, IntRegs:$Rs32, IntRegs:$Rt32),
+"$Rxx32 -= mpyu($Rs32,$Rt32)",
+M_tc_3x_acc_SLOT23, TypeM>, Enc_1409050 {
+let Inst{7-5} = 0b000;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11100111011;
+let prefersSlot3 = 1;
+let Constraints = "$Rxx32 = $Rxx32in";
+}
+def M2_dpmpyuu_s0 : HInst<
+(outs DoubleRegs:$Rdd32),
+(ins IntRegs:$Rs32, IntRegs:$Rt32),
+"$Rdd32 = mpyu($Rs32,$Rt32)",
+M_tc_3x_SLOT23, TypeM>, Enc_1997594 {
+let Inst{7-5} = 0b000;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11100101010;
+let prefersSlot3 = 1;
+}
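+// hmmpy: 32 x 16 multiplies against the high or low halfword of Rt, always
+// :<<1:sat, with :rnd added in the _rs1 variants; all define USR_OVF.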
+def M2_hmmpyh_rs1 : HInst<
+(outs IntRegs:$Rd32),
+(ins IntRegs:$Rs32, IntRegs:$Rt32),
+"$Rd32 = mpy($Rs32,$Rt32.h):<<1:rnd:sat",
+M_tc_3x_SLOT23, TypeM>, Enc_14071773 {
+let Inst{7-5} = 0b100;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11101101101;
+let hasNewValue = 1;
+let opNewValue = 0;
+let prefersSlot3 = 1;
+let Defs = [USR_OVF];
+}
+def M2_hmmpyh_s1 : HInst<
+(outs IntRegs:$Rd32),
+(ins IntRegs:$Rs32, IntRegs:$Rt32),
+"$Rd32 = mpy($Rs32,$Rt32.h):<<1:sat",
+M_tc_3x_SLOT23, TypeM>, Enc_14071773 {
+let Inst{7-5} = 0b000;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11101101101;
+let hasNewValue = 1;
+let opNewValue = 0;
+let prefersSlot3 = 1;
+let Defs = [USR_OVF];
+}
+def M2_hmmpyl_rs1 : HInst<
+(outs IntRegs:$Rd32),
+(ins IntRegs:$Rs32, IntRegs:$Rt32),
+"$Rd32 = mpy($Rs32,$Rt32.l):<<1:rnd:sat",
+M_tc_3x_SLOT23, TypeM>, Enc_14071773 {
+let Inst{7-5} = 0b100;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11101101111;
+let hasNewValue = 1;
+let opNewValue = 0;
+let prefersSlot3 = 1;
+let Defs = [USR_OVF];
+}
+def M2_hmmpyl_s1 : HInst<
+(outs IntRegs:$Rd32),
+(ins IntRegs:$Rs32, IntRegs:$Rt32),
+"$Rd32 = mpy($Rs32,$Rt32.l):<<1:sat",
+M_tc_3x_SLOT23, TypeM>, Enc_14071773 {
+let Inst{7-5} = 0b001;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11101101101;
+let hasNewValue = 1;
+let opNewValue = 0;
+let prefersSlot3 = 1;
+let Defs = [USR_OVF];
+}
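+// mpyi accumulation: maci (+= Rt), macsip (+= #u), macsin (-= #u); the immediate
+// forms are constant-extendable, and macsip pairs with maci via CextOpcode.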
+def M2_maci : HInst<
+(outs IntRegs:$Rx32),
+(ins IntRegs:$Rx32in, IntRegs:$Rs32, IntRegs:$Rt32),
+"$Rx32 += mpyi($Rs32,$Rt32)",
+M_tc_3x_acc_SLOT23, TypeM>, Enc_9223889, ImmRegRel {
+let Inst{7-5} = 0b000;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11101111000;
+let hasNewValue = 1;
+let opNewValue = 0;
+let prefersSlot3 = 1;
+let CextOpcode = "M2_maci";
+let InputType = "reg";
+let Constraints = "$Rx32 = $Rx32in";
+}
+def M2_macsin : HInst<
+(outs IntRegs:$Rx32),
+(ins IntRegs:$Rx32in, IntRegs:$Rs32, u32_0Imm:$Ii),
+"$Rx32 -= mpyi($Rs32,#$Ii)",
+M_tc_3x_acc_SLOT23, TypeM>, Enc_11522288 {
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11100001100;
+let hasNewValue = 1;
+let opNewValue = 0;
+let prefersSlot3 = 1;
+let InputType = "imm";
+let isExtendable = 1;
+let opExtendable = 3;
+let isExtentSigned = 0;
+let opExtentBits = 8;
+let opExtentAlign = 0;
+let Constraints = "$Rx32 = $Rx32in";
+}
+def M2_macsip : HInst<
+(outs IntRegs:$Rx32),
+(ins IntRegs:$Rx32in, IntRegs:$Rs32, u32_0Imm:$Ii),
+"$Rx32 += mpyi($Rs32,#$Ii)",
+M_tc_3x_acc_SLOT23, TypeM>, Enc_11522288, ImmRegRel {
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11100001000;
+let hasNewValue = 1;
+let opNewValue = 0;
+let prefersSlot3 = 1;
+let CextOpcode = "M2_maci";
+let InputType = "imm";
+let isExtendable = 1;
+let opExtendable = 3;
+let isExtentSigned = 0;
+let opExtentBits = 8;
+let opExtentAlign = 0;
+let Constraints = "$Rx32 = $Rx32in";
+}
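+// Vector word-by-halfword multiply-accumulates: vmpyweh/vmpywoh take the even/odd
+// halfwords, the *uh forms their unsigned counterparts; rs* variants round (:rnd),
+// *1 variants pre-shift (:<<1), and all saturate, defining USR_OVF.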
+def M2_mmachs_rs0 : HInst<
+(outs DoubleRegs:$Rxx32),
+(ins DoubleRegs:$Rxx32in, DoubleRegs:$Rss32, DoubleRegs:$Rtt32),
+"$Rxx32 += vmpywoh($Rss32,$Rtt32):rnd:sat",
+M_tc_3x_acc_SLOT23, TypeM>, Enc_12702821 {
+let Inst{7-5} = 0b111;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11101010001;
+let prefersSlot3 = 1;
+let Defs = [USR_OVF];
+let Constraints = "$Rxx32 = $Rxx32in";
+}
+def M2_mmachs_rs1 : HInst<
+(outs DoubleRegs:$Rxx32),
+(ins DoubleRegs:$Rxx32in, DoubleRegs:$Rss32, DoubleRegs:$Rtt32),
+"$Rxx32 += vmpywoh($Rss32,$Rtt32):<<1:rnd:sat",
+M_tc_3x_acc_SLOT23, TypeM>, Enc_12702821 {
+let Inst{7-5} = 0b111;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11101010101;
+let prefersSlot3 = 1;
+let Defs = [USR_OVF];
+let Constraints = "$Rxx32 = $Rxx32in";
+}
+def M2_mmachs_s0 : HInst<
+(outs DoubleRegs:$Rxx32),
+(ins DoubleRegs:$Rxx32in, DoubleRegs:$Rss32, DoubleRegs:$Rtt32),
+"$Rxx32 += vmpywoh($Rss32,$Rtt32):sat",
+M_tc_3x_acc_SLOT23, TypeM>, Enc_12702821 {
+let Inst{7-5} = 0b111;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11101010000;
+let prefersSlot3 = 1;
+let Defs = [USR_OVF];
+let Constraints = "$Rxx32 = $Rxx32in";
+}
+def M2_mmachs_s1 : HInst<
+(outs DoubleRegs:$Rxx32),
+(ins DoubleRegs:$Rxx32in, DoubleRegs:$Rss32, DoubleRegs:$Rtt32),
+"$Rxx32 += vmpywoh($Rss32,$Rtt32):<<1:sat",
+M_tc_3x_acc_SLOT23, TypeM>, Enc_12702821 {
+let Inst{7-5} = 0b111;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11101010100;
+let prefersSlot3 = 1;
+let Defs = [USR_OVF];
+let Constraints = "$Rxx32 = $Rxx32in";
+}
+def M2_mmacls_rs0 : HInst<
+(outs DoubleRegs:$Rxx32),
+(ins DoubleRegs:$Rxx32in, DoubleRegs:$Rss32, DoubleRegs:$Rtt32),
+"$Rxx32 += vmpyweh($Rss32,$Rtt32):rnd:sat",
+M_tc_3x_acc_SLOT23, TypeM>, Enc_12702821 {
+let Inst{7-5} = 0b101;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11101010001;
+let prefersSlot3 = 1;
+let Defs = [USR_OVF];
+let Constraints = "$Rxx32 = $Rxx32in";
+}
+def M2_mmacls_rs1 : HInst<
+(outs DoubleRegs:$Rxx32),
+(ins DoubleRegs:$Rxx32in, DoubleRegs:$Rss32, DoubleRegs:$Rtt32),
+"$Rxx32 += vmpyweh($Rss32,$Rtt32):<<1:rnd:sat",
+M_tc_3x_acc_SLOT23, TypeM>, Enc_12702821 {
+let Inst{7-5} = 0b101;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11101010101;
+let prefersSlot3 = 1;
+let Defs = [USR_OVF];
+let Constraints = "$Rxx32 = $Rxx32in";
+}
+def M2_mmacls_s0 : HInst<
+(outs DoubleRegs:$Rxx32),
+(ins DoubleRegs:$Rxx32in, DoubleRegs:$Rss32, DoubleRegs:$Rtt32),
+"$Rxx32 += vmpyweh($Rss32,$Rtt32):sat",
+M_tc_3x_acc_SLOT23, TypeM>, Enc_12702821 {
+let Inst{7-5} = 0b101;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11101010000;
+let prefersSlot3 = 1;
+let Defs = [USR_OVF];
+let Constraints = "$Rxx32 = $Rxx32in";
+}
+def M2_mmacls_s1 : HInst<
+(outs DoubleRegs:$Rxx32),
+(ins DoubleRegs:$Rxx32in, DoubleRegs:$Rss32, DoubleRegs:$Rtt32),
+"$Rxx32 += vmpyweh($Rss32,$Rtt32):<<1:sat",
+M_tc_3x_acc_SLOT23, TypeM>, Enc_12702821 {
+let Inst{7-5} = 0b101;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11101010100;
+let prefersSlot3 = 1;
+let Defs = [USR_OVF];
+let Constraints = "$Rxx32 = $Rxx32in";
+}
+def M2_mmacuhs_rs0 : HInst<
+(outs DoubleRegs:$Rxx32),
+(ins DoubleRegs:$Rxx32in, DoubleRegs:$Rss32, DoubleRegs:$Rtt32),
+"$Rxx32 += vmpywouh($Rss32,$Rtt32):rnd:sat",
+M_tc_3x_acc_SLOT23, TypeM>, Enc_12702821 {
+let Inst{7-5} = 0b111;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11101010011;
+let prefersSlot3 = 1;
+let Defs = [USR_OVF];
+let Constraints = "$Rxx32 = $Rxx32in";
+}
+def M2_mmacuhs_rs1 : HInst<
+(outs DoubleRegs:$Rxx32),
+(ins DoubleRegs:$Rxx32in, DoubleRegs:$Rss32, DoubleRegs:$Rtt32),
+"$Rxx32 += vmpywouh($Rss32,$Rtt32):<<1:rnd:sat",
+M_tc_3x_acc_SLOT23, TypeM>, Enc_12702821 {
+let Inst{7-5} = 0b111;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11101010111;
+let prefersSlot3 = 1;
+let Defs = [USR_OVF];
+let Constraints = "$Rxx32 = $Rxx32in";
+}
+def M2_mmacuhs_s0 : HInst<
+(outs DoubleRegs:$Rxx32),
+(ins DoubleRegs:$Rxx32in, DoubleRegs:$Rss32, DoubleRegs:$Rtt32),
+"$Rxx32 += vmpywouh($Rss32,$Rtt32):sat",
+M_tc_3x_acc_SLOT23, TypeM>, Enc_12702821 {
+let Inst{7-5} = 0b111;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11101010010;
+let prefersSlot3 = 1;
+let Defs = [USR_OVF];
+let Constraints = "$Rxx32 = $Rxx32in";
+}
+def M2_mmacuhs_s1 : HInst<
+(outs DoubleRegs:$Rxx32),
+(ins DoubleRegs:$Rxx32in, DoubleRegs:$Rss32, DoubleRegs:$Rtt32),
+"$Rxx32 += vmpywouh($Rss32,$Rtt32):<<1:sat",
+M_tc_3x_acc_SLOT23, TypeM>, Enc_12702821 {
+let Inst{7-5} = 0b111;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11101010110;
+let prefersSlot3 = 1;
+let Defs = [USR_OVF];
+let Constraints = "$Rxx32 = $Rxx32in";
+}
+def M2_mmaculs_rs0 : HInst<
+(outs DoubleRegs:$Rxx32),
+(ins DoubleRegs:$Rxx32in, DoubleRegs:$Rss32, DoubleRegs:$Rtt32),
+"$Rxx32 += vmpyweuh($Rss32,$Rtt32):rnd:sat",
+M_tc_3x_acc_SLOT23, TypeM>, Enc_12702821 {
+let Inst{7-5} = 0b101;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11101010011;
+let prefersSlot3 = 1;
+let Defs = [USR_OVF];
+let Constraints = "$Rxx32 = $Rxx32in";
+}
+def M2_mmaculs_rs1 : HInst<
+(outs DoubleRegs:$Rxx32),
+(ins DoubleRegs:$Rxx32in, DoubleRegs:$Rss32, DoubleRegs:$Rtt32),
+"$Rxx32 += vmpyweuh($Rss32,$Rtt32):<<1:rnd:sat",
+M_tc_3x_acc_SLOT23, TypeM>, Enc_12702821 {
+let Inst{7-5} = 0b101;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11101010111;
+let prefersSlot3 = 1;
+let Defs = [USR_OVF];
+let Constraints = "$Rxx32 = $Rxx32in";
+}
+def M2_mmaculs_s0 : HInst<
+(outs DoubleRegs:$Rxx32),
+(ins DoubleRegs:$Rxx32in, DoubleRegs:$Rss32, DoubleRegs:$Rtt32),
+"$Rxx32 += vmpyweuh($Rss32,$Rtt32):sat",
+M_tc_3x_acc_SLOT23, TypeM>, Enc_12702821 {
+let Inst{7-5} = 0b101;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11101010010;
+let prefersSlot3 = 1;
+let Defs = [USR_OVF];
+let Constraints = "$Rxx32 = $Rxx32in";
+}
+def M2_mmaculs_s1 : HInst<
+(outs DoubleRegs:$Rxx32),
+(ins DoubleRegs:$Rxx32in, DoubleRegs:$Rss32, DoubleRegs:$Rtt32),
+"$Rxx32 += vmpyweuh($Rss32,$Rtt32):<<1:sat",
+M_tc_3x_acc_SLOT23, TypeM>, Enc_12702821 {
+let Inst{7-5} = 0b101;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11101010110;
+let prefersSlot3 = 1;
+let Defs = [USR_OVF];
+let Constraints = "$Rxx32 = $Rxx32in";
+}
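+// Non-accumulating versions of the vector word-by-halfword multiplies above.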
+def M2_mmpyh_rs0 : HInst<
+(outs DoubleRegs:$Rdd32),
+(ins DoubleRegs:$Rss32, DoubleRegs:$Rtt32),
+"$Rdd32 = vmpywoh($Rss32,$Rtt32):rnd:sat",
+M_tc_3x_SLOT23, TypeM>, Enc_8333157 {
+let Inst{7-5} = 0b111;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11101000001;
+let prefersSlot3 = 1;
+let Defs = [USR_OVF];
+}
+def M2_mmpyh_rs1 : HInst<
+(outs DoubleRegs:$Rdd32),
+(ins DoubleRegs:$Rss32, DoubleRegs:$Rtt32),
+"$Rdd32 = vmpywoh($Rss32,$Rtt32):<<1:rnd:sat",
+M_tc_3x_SLOT23, TypeM>, Enc_8333157 {
+let Inst{7-5} = 0b111;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11101000101;
+let prefersSlot3 = 1;
+let Defs = [USR_OVF];
+}
+def M2_mmpyh_s0 : HInst<
+(outs DoubleRegs:$Rdd32),
+(ins DoubleRegs:$Rss32, DoubleRegs:$Rtt32),
+"$Rdd32 = vmpywoh($Rss32,$Rtt32):sat",
+M_tc_3x_SLOT23, TypeM>, Enc_8333157 {
+let Inst{7-5} = 0b111;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11101000000;
+let prefersSlot3 = 1;
+let Defs = [USR_OVF];
+}
+def M2_mmpyh_s1 : HInst<
+(outs DoubleRegs:$Rdd32),
+(ins DoubleRegs:$Rss32, DoubleRegs:$Rtt32),
+"$Rdd32 = vmpywoh($Rss32,$Rtt32):<<1:sat",
+M_tc_3x_SLOT23, TypeM>, Enc_8333157 {
+let Inst{7-5} = 0b111;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11101000100;
+let prefersSlot3 = 1;
+let Defs = [USR_OVF];
+}
+def M2_mmpyl_rs0 : HInst<
+(outs DoubleRegs:$Rdd32),
+(ins DoubleRegs:$Rss32, DoubleRegs:$Rtt32),
+"$Rdd32 = vmpyweh($Rss32,$Rtt32):rnd:sat",
+M_tc_3x_SLOT23, TypeM>, Enc_8333157 {
+let Inst{7-5} = 0b101;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11101000001;
+let prefersSlot3 = 1;
+let Defs = [USR_OVF];
+}
+def M2_mmpyl_rs1 : HInst<
+(outs DoubleRegs:$Rdd32),
+(ins DoubleRegs:$Rss32, DoubleRegs:$Rtt32),
+"$Rdd32 = vmpyweh($Rss32,$Rtt32):<<1:rnd:sat",
+M_tc_3x_SLOT23, TypeM>, Enc_8333157 {
+let Inst{7-5} = 0b101;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11101000101;
+let prefersSlot3 = 1;
+let Defs = [USR_OVF];
+}
+def M2_mmpyl_s0 : HInst<
+(outs DoubleRegs:$Rdd32),
+(ins DoubleRegs:$Rss32, DoubleRegs:$Rtt32),
+"$Rdd32 = vmpyweh($Rss32,$Rtt32):sat",
+M_tc_3x_SLOT23, TypeM>, Enc_8333157 {
+let Inst{7-5} = 0b101;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11101000000;
+let prefersSlot3 = 1;
+let Defs = [USR_OVF];
+}
+def M2_mmpyl_s1 : HInst<
+(outs DoubleRegs:$Rdd32),
+(ins DoubleRegs:$Rss32, DoubleRegs:$Rtt32),
+"$Rdd32 = vmpyweh($Rss32,$Rtt32):<<1:sat",
+M_tc_3x_SLOT23, TypeM>, Enc_8333157 {
+let Inst{7-5} = 0b101;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11101000100;
+let prefersSlot3 = 1;
+let Defs = [USR_OVF];
+}
+def M2_mmpyuh_rs0 : HInst<
+(outs DoubleRegs:$Rdd32),
+(ins DoubleRegs:$Rss32, DoubleRegs:$Rtt32),
+"$Rdd32 = vmpywouh($Rss32,$Rtt32):rnd:sat",
+M_tc_3x_SLOT23, TypeM>, Enc_8333157 {
+let Inst{7-5} = 0b111;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11101000011;
+let prefersSlot3 = 1;
+let Defs = [USR_OVF];
+}
+def M2_mmpyuh_rs1 : HInst<
+(outs DoubleRegs:$Rdd32),
+(ins DoubleRegs:$Rss32, DoubleRegs:$Rtt32),
+"$Rdd32 = vmpywouh($Rss32,$Rtt32):<<1:rnd:sat",
+M_tc_3x_SLOT23, TypeM>, Enc_8333157 {
+let Inst{7-5} = 0b111;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11101000111;
+let prefersSlot3 = 1;
+let Defs = [USR_OVF];
+}
+def M2_mmpyuh_s0 : HInst<
+(outs DoubleRegs:$Rdd32),
+(ins DoubleRegs:$Rss32, DoubleRegs:$Rtt32),
+"$Rdd32 = vmpywouh($Rss32,$Rtt32):sat",
+M_tc_3x_SLOT23, TypeM>, Enc_8333157 {
+let Inst{7-5} = 0b111;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11101000010;
+let prefersSlot3 = 1;
+let Defs = [USR_OVF];
+}
+def M2_mmpyuh_s1 : HInst<
+(outs DoubleRegs:$Rdd32),
+(ins DoubleRegs:$Rss32, DoubleRegs:$Rtt32),
+"$Rdd32 = vmpywouh($Rss32,$Rtt32):<<1:sat",
+M_tc_3x_SLOT23, TypeM>, Enc_8333157 {
+let Inst{7-5} = 0b111;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11101000110;
+let prefersSlot3 = 1;
+let Defs = [USR_OVF];
+}
+def M2_mmpyul_rs0 : HInst<
+(outs DoubleRegs:$Rdd32),
+(ins DoubleRegs:$Rss32, DoubleRegs:$Rtt32),
+"$Rdd32 = vmpyweuh($Rss32,$Rtt32):rnd:sat",
+M_tc_3x_SLOT23, TypeM>, Enc_8333157 {
+let Inst{7-5} = 0b101;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11101000011;
+let prefersSlot3 = 1;
+let Defs = [USR_OVF];
+}
+def M2_mmpyul_rs1 : HInst<
+(outs DoubleRegs:$Rdd32),
+(ins DoubleRegs:$Rss32, DoubleRegs:$Rtt32),
+"$Rdd32 = vmpyweuh($Rss32,$Rtt32):<<1:rnd:sat",
+M_tc_3x_SLOT23, TypeM>, Enc_8333157 {
+let Inst{7-5} = 0b101;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11101000111;
+let prefersSlot3 = 1;
+let Defs = [USR_OVF];
+}
+def M2_mmpyul_s0 : HInst<
+(outs DoubleRegs:$Rdd32),
+(ins DoubleRegs:$Rss32, DoubleRegs:$Rtt32),
+"$Rdd32 = vmpyweuh($Rss32,$Rtt32):sat",
+M_tc_3x_SLOT23, TypeM>, Enc_8333157 {
+let Inst{7-5} = 0b101;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11101000010;
+let prefersSlot3 = 1;
+let Defs = [USR_OVF];
+}
+def M2_mmpyul_s1 : HInst<
+(outs DoubleRegs:$Rdd32),
+(ins DoubleRegs:$Rss32, DoubleRegs:$Rtt32),
+"$Rdd32 = vmpyweuh($Rss32,$Rtt32):<<1:sat",
+M_tc_3x_SLOT23, TypeM>, Enc_8333157 {
+let Inst{7-5} = 0b101;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11101000110;
+let prefersSlot3 = 1;
+let Defs = [USR_OVF];
+}
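+// 16x16 multiply-accumulate matrix: {hh,hl,lh,ll} select the source halfwords,
+// _s1 adds :<<1, and the sat_ rows saturate (defining USR_OVF).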
+def M2_mpy_acc_hh_s0 : HInst<
+(outs IntRegs:$Rx32),
+(ins IntRegs:$Rx32in, IntRegs:$Rs32, IntRegs:$Rt32),
+"$Rx32 += mpy($Rs32.h,$Rt32.h)",
+M_tc_3x_acc_SLOT23, TypeM>, Enc_9223889 {
+let Inst{7-5} = 0b011;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11101110000;
+let hasNewValue = 1;
+let opNewValue = 0;
+let prefersSlot3 = 1;
+let Constraints = "$Rx32 = $Rx32in";
+}
+def M2_mpy_acc_hh_s1 : HInst<
+(outs IntRegs:$Rx32),
+(ins IntRegs:$Rx32in, IntRegs:$Rs32, IntRegs:$Rt32),
+"$Rx32 += mpy($Rs32.h,$Rt32.h):<<1",
+M_tc_3x_acc_SLOT23, TypeM>, Enc_9223889 {
+let Inst{7-5} = 0b011;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11101110100;
+let hasNewValue = 1;
+let opNewValue = 0;
+let prefersSlot3 = 1;
+let Constraints = "$Rx32 = $Rx32in";
+}
+def M2_mpy_acc_hl_s0 : HInst<
+(outs IntRegs:$Rx32),
+(ins IntRegs:$Rx32in, IntRegs:$Rs32, IntRegs:$Rt32),
+"$Rx32 += mpy($Rs32.h,$Rt32.l)",
+M_tc_3x_acc_SLOT23, TypeM>, Enc_9223889 {
+let Inst{7-5} = 0b010;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11101110000;
+let hasNewValue = 1;
+let opNewValue = 0;
+let prefersSlot3 = 1;
+let Constraints = "$Rx32 = $Rx32in";
+}
+def M2_mpy_acc_hl_s1 : HInst<
+(outs IntRegs:$Rx32),
+(ins IntRegs:$Rx32in, IntRegs:$Rs32, IntRegs:$Rt32),
+"$Rx32 += mpy($Rs32.h,$Rt32.l):<<1",
+M_tc_3x_acc_SLOT23, TypeM>, Enc_9223889 {
+let Inst{7-5} = 0b010;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11101110100;
+let hasNewValue = 1;
+let opNewValue = 0;
+let prefersSlot3 = 1;
+let Constraints = "$Rx32 = $Rx32in";
+}
+def M2_mpy_acc_lh_s0 : HInst<
+(outs IntRegs:$Rx32),
+(ins IntRegs:$Rx32in, IntRegs:$Rs32, IntRegs:$Rt32),
+"$Rx32 += mpy($Rs32.l,$Rt32.h)",
+M_tc_3x_acc_SLOT23, TypeM>, Enc_9223889 {
+let Inst{7-5} = 0b001;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11101110000;
+let hasNewValue = 1;
+let opNewValue = 0;
+let prefersSlot3 = 1;
+let Constraints = "$Rx32 = $Rx32in";
+}
+def M2_mpy_acc_lh_s1 : HInst<
+(outs IntRegs:$Rx32),
+(ins IntRegs:$Rx32in, IntRegs:$Rs32, IntRegs:$Rt32),
+"$Rx32 += mpy($Rs32.l,$Rt32.h):<<1",
+M_tc_3x_acc_SLOT23, TypeM>, Enc_9223889 {
+let Inst{7-5} = 0b001;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11101110100;
+let hasNewValue = 1;
+let opNewValue = 0;
+let prefersSlot3 = 1;
+let Constraints = "$Rx32 = $Rx32in";
+}
+def M2_mpy_acc_ll_s0 : HInst<
+(outs IntRegs:$Rx32),
+(ins IntRegs:$Rx32in, IntRegs:$Rs32, IntRegs:$Rt32),
+"$Rx32 += mpy($Rs32.l,$Rt32.l)",
+M_tc_3x_acc_SLOT23, TypeM>, Enc_9223889 {
+let Inst{7-5} = 0b000;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11101110000;
+let hasNewValue = 1;
+let opNewValue = 0;
+let prefersSlot3 = 1;
+let Constraints = "$Rx32 = $Rx32in";
+}
+def M2_mpy_acc_ll_s1 : HInst<
+(outs IntRegs:$Rx32),
+(ins IntRegs:$Rx32in, IntRegs:$Rs32, IntRegs:$Rt32),
+"$Rx32 += mpy($Rs32.l,$Rt32.l):<<1",
+M_tc_3x_acc_SLOT23, TypeM>, Enc_9223889 {
+let Inst{7-5} = 0b000;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11101110100;
+let hasNewValue = 1;
+let opNewValue = 0;
+let prefersSlot3 = 1;
+let Constraints = "$Rx32 = $Rx32in";
+}
+def M2_mpy_acc_sat_hh_s0 : HInst<
+(outs IntRegs:$Rx32),
+(ins IntRegs:$Rx32in, IntRegs:$Rs32, IntRegs:$Rt32),
+"$Rx32 += mpy($Rs32.h,$Rt32.h):sat",
+M_tc_3x_acc_SLOT23, TypeM>, Enc_9223889 {
+let Inst{7-5} = 0b111;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11101110000;
+let hasNewValue = 1;
+let opNewValue = 0;
+let prefersSlot3 = 1;
+let Defs = [USR_OVF];
+let Constraints = "$Rx32 = $Rx32in";
+}
+def M2_mpy_acc_sat_hh_s1 : HInst<
+(outs IntRegs:$Rx32),
+(ins IntRegs:$Rx32in, IntRegs:$Rs32, IntRegs:$Rt32),
+"$Rx32 += mpy($Rs32.h,$Rt32.h):<<1:sat",
+M_tc_3x_acc_SLOT23, TypeM>, Enc_9223889 {
+let Inst{7-5} = 0b111;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11101110100;
+let hasNewValue = 1;
+let opNewValue = 0;
+let prefersSlot3 = 1;
+let Defs = [USR_OVF];
+let Constraints = "$Rx32 = $Rx32in";
+}
+def M2_mpy_acc_sat_hl_s0 : HInst<
+(outs IntRegs:$Rx32),
+(ins IntRegs:$Rx32in, IntRegs:$Rs32, IntRegs:$Rt32),
+"$Rx32 += mpy($Rs32.h,$Rt32.l):sat",
+M_tc_3x_acc_SLOT23, TypeM>, Enc_9223889 {
+let Inst{7-5} = 0b110;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11101110000;
+let hasNewValue = 1;
+let opNewValue = 0;
+let prefersSlot3 = 1;
+let Defs = [USR_OVF];
+let Constraints = "$Rx32 = $Rx32in";
+}
+def M2_mpy_acc_sat_hl_s1 : HInst<
+(outs IntRegs:$Rx32),
+(ins IntRegs:$Rx32in, IntRegs:$Rs32, IntRegs:$Rt32),
+"$Rx32 += mpy($Rs32.h,$Rt32.l):<<1:sat",
+M_tc_3x_acc_SLOT23, TypeM>, Enc_9223889 {
+let Inst{7-5} = 0b110;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11101110100;
+let hasNewValue = 1;
+let opNewValue = 0;
+let prefersSlot3 = 1;
+let Defs = [USR_OVF];
+let Constraints = "$Rx32 = $Rx32in";
+}
+def M2_mpy_acc_sat_lh_s0 : HInst<
+(outs IntRegs:$Rx32),
+(ins IntRegs:$Rx32in, IntRegs:$Rs32, IntRegs:$Rt32),
+"$Rx32 += mpy($Rs32.l,$Rt32.h):sat",
+M_tc_3x_acc_SLOT23, TypeM>, Enc_9223889 {
+let Inst{7-5} = 0b101;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11101110000;
+let hasNewValue = 1;
+let opNewValue = 0;
+let prefersSlot3 = 1;
+let Defs = [USR_OVF];
+let Constraints = "$Rx32 = $Rx32in";
+}
+def M2_mpy_acc_sat_lh_s1 : HInst<
+(outs IntRegs:$Rx32),
+(ins IntRegs:$Rx32in, IntRegs:$Rs32, IntRegs:$Rt32),
+"$Rx32 += mpy($Rs32.l,$Rt32.h):<<1:sat",
+M_tc_3x_acc_SLOT23, TypeM>, Enc_9223889 {
+let Inst{7-5} = 0b101;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11101110100;
+let hasNewValue = 1;
+let opNewValue = 0;
+let prefersSlot3 = 1;
+let Defs = [USR_OVF];
+let Constraints = "$Rx32 = $Rx32in";
+}
+def M2_mpy_acc_sat_ll_s0 : HInst<
+(outs IntRegs:$Rx32),
+(ins IntRegs:$Rx32in, IntRegs:$Rs32, IntRegs:$Rt32),
+"$Rx32 += mpy($Rs32.l,$Rt32.l):sat",
+M_tc_3x_acc_SLOT23, TypeM>, Enc_9223889 {
+let Inst{7-5} = 0b100;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11101110000;
+let hasNewValue = 1;
+let opNewValue = 0;
+let prefersSlot3 = 1;
+let Defs = [USR_OVF];
+let Constraints = "$Rx32 = $Rx32in";
+}
+def M2_mpy_acc_sat_ll_s1 : HInst<
+(outs IntRegs:$Rx32),
+(ins IntRegs:$Rx32in, IntRegs:$Rs32, IntRegs:$Rt32),
+"$Rx32 += mpy($Rs32.l,$Rt32.l):<<1:sat",
+M_tc_3x_acc_SLOT23, TypeM>, Enc_9223889 {
+let Inst{7-5} = 0b100;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11101110100;
+let hasNewValue = 1;
+let opNewValue = 0;
+let prefersSlot3 = 1;
+let Defs = [USR_OVF];
+let Constraints = "$Rx32 = $Rx32in";
+}
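+// Plain 16x16 multiplies over the same halfword/shift matrix.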
+def M2_mpy_hh_s0 : HInst<
+(outs IntRegs:$Rd32),
+(ins IntRegs:$Rs32, IntRegs:$Rt32),
+"$Rd32 = mpy($Rs32.h,$Rt32.h)",
+M_tc_3x_SLOT23, TypeM>, Enc_14071773 {
+let Inst{7-5} = 0b011;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11101100000;
+let hasNewValue = 1;
+let opNewValue = 0;
+let prefersSlot3 = 1;
+}
+def M2_mpy_hh_s1 : HInst<
+(outs IntRegs:$Rd32),
+(ins IntRegs:$Rs32, IntRegs:$Rt32),
+"$Rd32 = mpy($Rs32.h,$Rt32.h):<<1",
+M_tc_3x_SLOT23, TypeM>, Enc_14071773 {
+let Inst{7-5} = 0b011;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11101100100;
+let hasNewValue = 1;
+let opNewValue = 0;
+let prefersSlot3 = 1;
+}
+def M2_mpy_hl_s0 : HInst<
+(outs IntRegs:$Rd32),
+(ins IntRegs:$Rs32, IntRegs:$Rt32),
+"$Rd32 = mpy($Rs32.h,$Rt32.l)",
+M_tc_3x_SLOT23, TypeM>, Enc_14071773 {
+let Inst{7-5} = 0b010;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11101100000;
+let hasNewValue = 1;
+let opNewValue = 0;
+let prefersSlot3 = 1;
+}
+def M2_mpy_hl_s1 : HInst<
+(outs IntRegs:$Rd32),
+(ins IntRegs:$Rs32, IntRegs:$Rt32),
+"$Rd32 = mpy($Rs32.h,$Rt32.l):<<1",
+M_tc_3x_SLOT23, TypeM>, Enc_14071773 {
+let Inst{7-5} = 0b010;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11101100100;
+let hasNewValue = 1;
+let opNewValue = 0;
+let prefersSlot3 = 1;
+}
+def M2_mpy_lh_s0 : HInst<
+(outs IntRegs:$Rd32),
+(ins IntRegs:$Rs32, IntRegs:$Rt32),
+"$Rd32 = mpy($Rs32.l,$Rt32.h)",
+M_tc_3x_SLOT23, TypeM>, Enc_14071773 {
+let Inst{7-5} = 0b001;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11101100000;
+let hasNewValue = 1;
+let opNewValue = 0;
+let prefersSlot3 = 1;
+}
+def M2_mpy_lh_s1 : HInst<
+(outs IntRegs:$Rd32),
+(ins IntRegs:$Rs32, IntRegs:$Rt32),
+"$Rd32 = mpy($Rs32.l,$Rt32.h):<<1",
+M_tc_3x_SLOT23, TypeM>, Enc_14071773 {
+let Inst{7-5} = 0b001;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11101100100;
+let hasNewValue = 1;
+let opNewValue = 0;
+let prefersSlot3 = 1;
+}
+def M2_mpy_ll_s0 : HInst<
+(outs IntRegs:$Rd32),
+(ins IntRegs:$Rs32, IntRegs:$Rt32),
+"$Rd32 = mpy($Rs32.l,$Rt32.l)",
+M_tc_3x_SLOT23, TypeM>, Enc_14071773 {
+let Inst{7-5} = 0b000;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11101100000;
+let hasNewValue = 1;
+let opNewValue = 0;
+let prefersSlot3 = 1;
+}
+def M2_mpy_ll_s1 : HInst<
+(outs IntRegs:$Rd32),
+(ins IntRegs:$Rs32, IntRegs:$Rt32),
+"$Rd32 = mpy($Rs32.l,$Rt32.l):<<1",
+M_tc_3x_SLOT23, TypeM>, Enc_14071773 {
+let Inst{7-5} = 0b000;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11101100100;
+let hasNewValue = 1;
+let opNewValue = 0;
+let prefersSlot3 = 1;
+}
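+// nac rows: the same 16x16 products subtracted from the accumulator (Rx -= mpy(...)).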
+def M2_mpy_nac_hh_s0 : HInst<
+(outs IntRegs:$Rx32),
+(ins IntRegs:$Rx32in, IntRegs:$Rs32, IntRegs:$Rt32),
+"$Rx32 -= mpy($Rs32.h,$Rt32.h)",
+M_tc_3x_acc_SLOT23, TypeM>, Enc_9223889 {
+let Inst{7-5} = 0b011;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11101110001;
+let hasNewValue = 1;
+let opNewValue = 0;
+let prefersSlot3 = 1;
+let Constraints = "$Rx32 = $Rx32in";
+}
+def M2_mpy_nac_hh_s1 : HInst<
+(outs IntRegs:$Rx32),
+(ins IntRegs:$Rx32in, IntRegs:$Rs32, IntRegs:$Rt32),
+"$Rx32 -= mpy($Rs32.h,$Rt32.h):<<1",
+M_tc_3x_acc_SLOT23, TypeM>, Enc_9223889 {
+let Inst{7-5} = 0b011;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11101110101;
+let hasNewValue = 1;
+let opNewValue = 0;
+let prefersSlot3 = 1;
+let Constraints = "$Rx32 = $Rx32in";
+}
+def M2_mpy_nac_hl_s0 : HInst<
+(outs IntRegs:$Rx32),
+(ins IntRegs:$Rx32in, IntRegs:$Rs32, IntRegs:$Rt32),
+"$Rx32 -= mpy($Rs32.h,$Rt32.l)",
+M_tc_3x_acc_SLOT23, TypeM>, Enc_9223889 {
+let Inst{7-5} = 0b010;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11101110001;
+let hasNewValue = 1;
+let opNewValue = 0;
+let prefersSlot3 = 1;
+let Constraints = "$Rx32 = $Rx32in";
+}
+def M2_mpy_nac_hl_s1 : HInst<
+(outs IntRegs:$Rx32),
+(ins IntRegs:$Rx32in, IntRegs:$Rs32, IntRegs:$Rt32),
+"$Rx32 -= mpy($Rs32.h,$Rt32.l):<<1",
+M_tc_3x_acc_SLOT23, TypeM>, Enc_9223889 {
+let Inst{7-5} = 0b010;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11101110101;
+let hasNewValue = 1;
+let opNewValue = 0;
+let prefersSlot3 = 1;
+let Constraints = "$Rx32 = $Rx32in";
+}
+def M2_mpy_nac_lh_s0 : HInst<
+(outs IntRegs:$Rx32),
+(ins IntRegs:$Rx32in, IntRegs:$Rs32, IntRegs:$Rt32),
+"$Rx32 -= mpy($Rs32.l,$Rt32.h)",
+M_tc_3x_acc_SLOT23, TypeM>, Enc_9223889 {
+let Inst{7-5} = 0b001;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11101110001;
+let hasNewValue = 1;
+let opNewValue = 0;
+let prefersSlot3 = 1;
+let Constraints = "$Rx32 = $Rx32in";
+}
+def M2_mpy_nac_lh_s1 : HInst<
+(outs IntRegs:$Rx32),
+(ins IntRegs:$Rx32in, IntRegs:$Rs32, IntRegs:$Rt32),
+"$Rx32 -= mpy($Rs32.l,$Rt32.h):<<1",
+M_tc_3x_acc_SLOT23, TypeM>, Enc_9223889 {
+let Inst{7-5} = 0b001;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11101110101;
+let hasNewValue = 1;
+let opNewValue = 0;
+let prefersSlot3 = 1;
+let Constraints = "$Rx32 = $Rx32in";
+}
+def M2_mpy_nac_ll_s0 : HInst<
+(outs IntRegs:$Rx32),
+(ins IntRegs:$Rx32in, IntRegs:$Rs32, IntRegs:$Rt32),
+"$Rx32 -= mpy($Rs32.l,$Rt32.l)",
+M_tc_3x_acc_SLOT23, TypeM>, Enc_9223889 {
+let Inst{7-5} = 0b000;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11101110001;
+let hasNewValue = 1;
+let opNewValue = 0;
+let prefersSlot3 = 1;
+let Constraints = "$Rx32 = $Rx32in";
+}
+def M2_mpy_nac_ll_s1 : HInst<
+(outs IntRegs:$Rx32),
+(ins IntRegs:$Rx32in, IntRegs:$Rs32, IntRegs:$Rt32),
+"$Rx32 -= mpy($Rs32.l,$Rt32.l):<<1",
+M_tc_3x_acc_SLOT23, TypeM>, Enc_9223889 {
+let Inst{7-5} = 0b000;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11101110101;
+let hasNewValue = 1;
+let opNewValue = 0;
+let prefersSlot3 = 1;
+let Constraints = "$Rx32 = $Rx32in";
+}
+def M2_mpy_nac_sat_hh_s0 : HInst<
+(outs IntRegs:$Rx32),
+(ins IntRegs:$Rx32in, IntRegs:$Rs32, IntRegs:$Rt32),
+"$Rx32 -= mpy($Rs32.h,$Rt32.h):sat",
+M_tc_3x_acc_SLOT23, TypeM>, Enc_9223889 {
+let Inst{7-5} = 0b111;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11101110001;
+let hasNewValue = 1;
+let opNewValue = 0;
+let prefersSlot3 = 1;
+let Defs = [USR_OVF];
+let Constraints = "$Rx32 = $Rx32in";
+}
+def M2_mpy_nac_sat_hh_s1 : HInst<
+(outs IntRegs:$Rx32),
+(ins IntRegs:$Rx32in, IntRegs:$Rs32, IntRegs:$Rt32),
+"$Rx32 -= mpy($Rs32.h,$Rt32.h):<<1:sat",
+M_tc_3x_acc_SLOT23, TypeM>, Enc_9223889 {
+let Inst{7-5} = 0b111;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11101110101;
+let hasNewValue = 1;
+let opNewValue = 0;
+let prefersSlot3 = 1;
+let Defs = [USR_OVF];
+let Constraints = "$Rx32 = $Rx32in";
+}
+def M2_mpy_nac_sat_hl_s0 : HInst<
+(outs IntRegs:$Rx32),
+(ins IntRegs:$Rx32in, IntRegs:$Rs32, IntRegs:$Rt32),
+"$Rx32 -= mpy($Rs32.h,$Rt32.l):sat",
+M_tc_3x_acc_SLOT23, TypeM>, Enc_9223889 {
+let Inst{7-5} = 0b110;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11101110001;
+let hasNewValue = 1;
+let opNewValue = 0;
+let prefersSlot3 = 1;
+let Defs = [USR_OVF];
+let Constraints = "$Rx32 = $Rx32in";
+}
+def M2_mpy_nac_sat_hl_s1 : HInst<
+(outs IntRegs:$Rx32),
+(ins IntRegs:$Rx32in, IntRegs:$Rs32, IntRegs:$Rt32),
+"$Rx32 -= mpy($Rs32.h,$Rt32.l):<<1:sat",
+M_tc_3x_acc_SLOT23, TypeM>, Enc_9223889 {
+let Inst{7-5} = 0b110;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11101110101;
+let hasNewValue = 1;
+let opNewValue = 0;
+let prefersSlot3 = 1;
+let Defs = [USR_OVF];
+let Constraints = "$Rx32 = $Rx32in";
+}
+def M2_mpy_nac_sat_lh_s0 : HInst<
+(outs IntRegs:$Rx32),
+(ins IntRegs:$Rx32in, IntRegs:$Rs32, IntRegs:$Rt32),
+"$Rx32 -= mpy($Rs32.l,$Rt32.h):sat",
+M_tc_3x_acc_SLOT23, TypeM>, Enc_9223889 {
+let Inst{7-5} = 0b101;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11101110001;
+let hasNewValue = 1;
+let opNewValue = 0;
+let prefersSlot3 = 1;
+let Defs = [USR_OVF];
+let Constraints = "$Rx32 = $Rx32in";
+}
+def M2_mpy_nac_sat_lh_s1 : HInst<
+(outs IntRegs:$Rx32),
+(ins IntRegs:$Rx32in, IntRegs:$Rs32, IntRegs:$Rt32),
+"$Rx32 -= mpy($Rs32.l,$Rt32.h):<<1:sat",
+M_tc_3x_acc_SLOT23, TypeM>, Enc_9223889 {
+let Inst{7-5} = 0b101;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11101110101;
+let hasNewValue = 1;
+let opNewValue = 0;
+let prefersSlot3 = 1;
+let Defs = [USR_OVF];
+let Constraints = "$Rx32 = $Rx32in";
+}
+def M2_mpy_nac_sat_ll_s0 : HInst<
+(outs IntRegs:$Rx32),
+(ins IntRegs:$Rx32in, IntRegs:$Rs32, IntRegs:$Rt32),
+"$Rx32 -= mpy($Rs32.l,$Rt32.l):sat",
+M_tc_3x_acc_SLOT23, TypeM>, Enc_9223889 {
+let Inst{7-5} = 0b100;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11101110001;
+let hasNewValue = 1;
+let opNewValue = 0;
+let prefersSlot3 = 1;
+let Defs = [USR_OVF];
+let Constraints = "$Rx32 = $Rx32in";
+}
+def M2_mpy_nac_sat_ll_s1 : HInst<
+(outs IntRegs:$Rx32),
+(ins IntRegs:$Rx32in, IntRegs:$Rs32, IntRegs:$Rt32),
+"$Rx32 -= mpy($Rs32.l,$Rt32.l):<<1:sat",
+M_tc_3x_acc_SLOT23, TypeM>, Enc_9223889 {
+let Inst{7-5} = 0b100;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11101110101;
+let hasNewValue = 1;
+let opNewValue = 0;
+let prefersSlot3 = 1;
+let Defs = [USR_OVF];
+let Constraints = "$Rx32 = $Rx32in";
+}
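+// rnd rows: rounded 16x16 products.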
+def M2_mpy_rnd_hh_s0 : HInst<
+(outs IntRegs:$Rd32),
+(ins IntRegs:$Rs32, IntRegs:$Rt32),
+"$Rd32 = mpy($Rs32.h,$Rt32.h):rnd",
+M_tc_3x_SLOT23, TypeM>, Enc_14071773 {
+let Inst{7-5} = 0b011;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11101100001;
+let hasNewValue = 1;
+let opNewValue = 0;
+let prefersSlot3 = 1;
+}
+def M2_mpy_rnd_hh_s1 : HInst<
+(outs IntRegs:$Rd32),
+(ins IntRegs:$Rs32, IntRegs:$Rt32),
+"$Rd32 = mpy($Rs32.h,$Rt32.h):<<1:rnd",
+M_tc_3x_SLOT23, TypeM>, Enc_14071773 {
+let Inst{7-5} = 0b011;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11101100101;
+let hasNewValue = 1;
+let opNewValue = 0;
+let prefersSlot3 = 1;
+}
+def M2_mpy_rnd_hl_s0 : HInst<
+(outs IntRegs:$Rd32),
+(ins IntRegs:$Rs32, IntRegs:$Rt32),
+"$Rd32 = mpy($Rs32.h,$Rt32.l):rnd",
+M_tc_3x_SLOT23, TypeM>, Enc_14071773 {
+let Inst{7-5} = 0b010;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11101100001;
+let hasNewValue = 1;
+let opNewValue = 0;
+let prefersSlot3 = 1;
+}
+def M2_mpy_rnd_hl_s1 : HInst<
+(outs IntRegs:$Rd32),
+(ins IntRegs:$Rs32, IntRegs:$Rt32),
+"$Rd32 = mpy($Rs32.h,$Rt32.l):<<1:rnd",
+M_tc_3x_SLOT23, TypeM>, Enc_14071773 {
+let Inst{7-5} = 0b010;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11101100101;
+let hasNewValue = 1;
+let opNewValue = 0;
+let prefersSlot3 = 1;
+}
+def M2_mpy_rnd_lh_s0 : HInst<
+(outs IntRegs:$Rd32),
+(ins IntRegs:$Rs32, IntRegs:$Rt32),
+"$Rd32 = mpy($Rs32.l,$Rt32.h):rnd",
+M_tc_3x_SLOT23, TypeM>, Enc_14071773 {
+let Inst{7-5} = 0b001;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11101100001;
+let hasNewValue = 1;
+let opNewValue = 0;
+let prefersSlot3 = 1;
+}
+def M2_mpy_rnd_lh_s1 : HInst<
+(outs IntRegs:$Rd32),
+(ins IntRegs:$Rs32, IntRegs:$Rt32),
+"$Rd32 = mpy($Rs32.l,$Rt32.h):<<1:rnd",
+M_tc_3x_SLOT23, TypeM>, Enc_14071773 {
+let Inst{7-5} = 0b001;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11101100101;
+let hasNewValue = 1;
+let opNewValue = 0;
+let prefersSlot3 = 1;
+}
+def M2_mpy_rnd_ll_s0 : HInst<
+(outs IntRegs:$Rd32),
+(ins IntRegs:$Rs32, IntRegs:$Rt32),
+"$Rd32 = mpy($Rs32.l,$Rt32.l):rnd",
+M_tc_3x_SLOT23, TypeM>, Enc_14071773 {
+let Inst{7-5} = 0b000;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11101100001;
+let hasNewValue = 1;
+let opNewValue = 0;
+let prefersSlot3 = 1;
+}
+def M2_mpy_rnd_ll_s1 : HInst<
+(outs IntRegs:$Rd32),
+(ins IntRegs:$Rs32, IntRegs:$Rt32),
+"$Rd32 = mpy($Rs32.l,$Rt32.l):<<1:rnd",
+M_tc_3x_SLOT23, TypeM>, Enc_14071773 {
+let Inst{7-5} = 0b000;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11101100101;
+let hasNewValue = 1;
+let opNewValue = 0;
+let prefersSlot3 = 1;
+}
+def M2_mpy_sat_hh_s0 : HInst<
+(outs IntRegs:$Rd32),
+(ins IntRegs:$Rs32, IntRegs:$Rt32),
+"$Rd32 = mpy($Rs32.h,$Rt32.h):sat",
+M_tc_3x_SLOT23, TypeM>, Enc_14071773 {
+let Inst{7-5} = 0b111;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11101100000;
+let hasNewValue = 1;
+let opNewValue = 0;
+let prefersSlot3 = 1;
+let Defs = [USR_OVF];
+}
+def M2_mpy_sat_hh_s1 : HInst<
+(outs IntRegs:$Rd32),
+(ins IntRegs:$Rs32, IntRegs:$Rt32),
+"$Rd32 = mpy($Rs32.h,$Rt32.h):<<1:sat",
+M_tc_3x_SLOT23, TypeM>, Enc_14071773 {
+let Inst{7-5} = 0b111;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11101100100;
+let hasNewValue = 1;
+let opNewValue = 0;
+let prefersSlot3 = 1;
+let Defs = [USR_OVF];
+}
+def M2_mpy_sat_hl_s0 : HInst<
+(outs IntRegs:$Rd32),
+(ins IntRegs:$Rs32, IntRegs:$Rt32),
+"$Rd32 = mpy($Rs32.h,$Rt32.l):sat",
+M_tc_3x_SLOT23, TypeM>, Enc_14071773 {
+let Inst{7-5} = 0b110;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11101100000;
+let hasNewValue = 1;
+let opNewValue = 0;
+let prefersSlot3 = 1;
+let Defs = [USR_OVF];
+}
+def M2_mpy_sat_hl_s1 : HInst<
+(outs IntRegs:$Rd32),
+(ins IntRegs:$Rs32, IntRegs:$Rt32),
+"$Rd32 = mpy($Rs32.h,$Rt32.l):<<1:sat",
+M_tc_3x_SLOT23, TypeM>, Enc_14071773 {
+let Inst{7-5} = 0b110;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11101100100;
+let hasNewValue = 1;
+let opNewValue = 0;
+let prefersSlot3 = 1;
+let Defs = [USR_OVF];
+}
+def M2_mpy_sat_lh_s0 : HInst<
+(outs IntRegs:$Rd32),
+(ins IntRegs:$Rs32, IntRegs:$Rt32),
+"$Rd32 = mpy($Rs32.l,$Rt32.h):sat",
+M_tc_3x_SLOT23, TypeM>, Enc_14071773 {
+let Inst{7-5} = 0b101;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11101100000;
+let hasNewValue = 1;
+let opNewValue = 0;
+let prefersSlot3 = 1;
+let Defs = [USR_OVF];
+}
+def M2_mpy_sat_lh_s1 : HInst<
+(outs IntRegs:$Rd32),
+(ins IntRegs:$Rs32, IntRegs:$Rt32),
+"$Rd32 = mpy($Rs32.l,$Rt32.h):<<1:sat",
+M_tc_3x_SLOT23, TypeM>, Enc_14071773 {
+let Inst{7-5} = 0b101;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11101100100;
+let hasNewValue = 1;
+let opNewValue = 0;
+let prefersSlot3 = 1;
+let Defs = [USR_OVF];
+}
+def M2_mpy_sat_ll_s0 : HInst<
+(outs IntRegs:$Rd32),
+(ins IntRegs:$Rs32, IntRegs:$Rt32),
+"$Rd32 = mpy($Rs32.l,$Rt32.l):sat",
+M_tc_3x_SLOT23, TypeM>, Enc_14071773 {
+let Inst{7-5} = 0b100;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11101100000;
+let hasNewValue = 1;
+let opNewValue = 0;
+let prefersSlot3 = 1;
+let Defs = [USR_OVF];
+}
+def M2_mpy_sat_ll_s1 : HInst<
+(outs IntRegs:$Rd32),
+(ins IntRegs:$Rs32, IntRegs:$Rt32),
+"$Rd32 = mpy($Rs32.l,$Rt32.l):<<1:sat",
+M_tc_3x_SLOT23, TypeM>, Enc_14071773 {
+let Inst{7-5} = 0b100;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11101100100;
+let hasNewValue = 1;
+let opNewValue = 0;
+let prefersSlot3 = 1;
+let Defs = [USR_OVF];
+}
+def M2_mpy_sat_rnd_hh_s0 : HInst<
+(outs IntRegs:$Rd32),
+(ins IntRegs:$Rs32, IntRegs:$Rt32),
+"$Rd32 = mpy($Rs32.h,$Rt32.h):rnd:sat",
+M_tc_3x_SLOT23, TypeM>, Enc_14071773 {
+let Inst{7-5} = 0b111;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11101100001;
+let hasNewValue = 1;
+let opNewValue = 0;
+let prefersSlot3 = 1;
+let Defs = [USR_OVF];
+}
+def M2_mpy_sat_rnd_hh_s1 : HInst<
+(outs IntRegs:$Rd32),
+(ins IntRegs:$Rs32, IntRegs:$Rt32),
+"$Rd32 = mpy($Rs32.h,$Rt32.h):<<1:rnd:sat",
+M_tc_3x_SLOT23, TypeM>, Enc_14071773 {
+let Inst{7-5} = 0b111;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11101100101;
+let hasNewValue = 1;
+let opNewValue = 0;
+let prefersSlot3 = 1;
+let Defs = [USR_OVF];
+}
+def M2_mpy_sat_rnd_hl_s0 : HInst<
+(outs IntRegs:$Rd32),
+(ins IntRegs:$Rs32, IntRegs:$Rt32),
+"$Rd32 = mpy($Rs32.h,$Rt32.l):rnd:sat",
+M_tc_3x_SLOT23, TypeM>, Enc_14071773 {
+let Inst{7-5} = 0b110;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11101100001;
+let hasNewValue = 1;
+let opNewValue = 0;
+let prefersSlot3 = 1;
+let Defs = [USR_OVF];
+}
+def M2_mpy_sat_rnd_hl_s1 : HInst<
+(outs IntRegs:$Rd32),
+(ins IntRegs:$Rs32, IntRegs:$Rt32),
+"$Rd32 = mpy($Rs32.h,$Rt32.l):<<1:rnd:sat",
+M_tc_3x_SLOT23, TypeM>, Enc_14071773 {
+let Inst{7-5} = 0b110;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11101100101;
+let hasNewValue = 1;
+let opNewValue = 0;
+let prefersSlot3 = 1;
+let Defs = [USR_OVF];
+}
+def M2_mpy_sat_rnd_lh_s0 : HInst<
+(outs IntRegs:$Rd32),
+(ins IntRegs:$Rs32, IntRegs:$Rt32),
+"$Rd32 = mpy($Rs32.l,$Rt32.h):rnd:sat",
+M_tc_3x_SLOT23, TypeM>, Enc_14071773 {
+let Inst{7-5} = 0b101;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11101100001;
+let hasNewValue = 1;
+let opNewValue = 0;
+let prefersSlot3 = 1;
+let Defs = [USR_OVF];
+}
+def M2_mpy_sat_rnd_lh_s1 : HInst<
+(outs IntRegs:$Rd32),
+(ins IntRegs:$Rs32, IntRegs:$Rt32),
+"$Rd32 = mpy($Rs32.l,$Rt32.h):<<1:rnd:sat",
+M_tc_3x_SLOT23, TypeM>, Enc_14071773 {
+let Inst{7-5} = 0b101;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11101100101;
+let hasNewValue = 1;
+let opNewValue = 0;
+let prefersSlot3 = 1;
+let Defs = [USR_OVF];
+}
+def M2_mpy_sat_rnd_ll_s0 : HInst<
+(outs IntRegs:$Rd32),
+(ins IntRegs:$Rs32, IntRegs:$Rt32),
+"$Rd32 = mpy($Rs32.l,$Rt32.l):rnd:sat",
+M_tc_3x_SLOT23, TypeM>, Enc_14071773 {
+let Inst{7-5} = 0b100;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11101100001;
+let hasNewValue = 1;
+let opNewValue = 0;
+let prefersSlot3 = 1;
+let Defs = [USR_OVF];
+}
+def M2_mpy_sat_rnd_ll_s1 : HInst<
+(outs IntRegs:$Rd32),
+(ins IntRegs:$Rs32, IntRegs:$Rt32),
+"$Rd32 = mpy($Rs32.l,$Rt32.l):<<1:rnd:sat",
+M_tc_3x_SLOT23, TypeM>, Enc_14071773 {
+let Inst{7-5} = 0b100;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11101100101;
+let hasNewValue = 1;
+let opNewValue = 0;
+let prefersSlot3 = 1;
+let Defs = [USR_OVF];
+}
+def M2_mpy_up : HInst<
+(outs IntRegs:$Rd32),
+(ins IntRegs:$Rs32, IntRegs:$Rt32),
+"$Rd32 = mpy($Rs32,$Rt32)",
+M_tc_3x_SLOT23, TypeM>, Enc_14071773 {
+let Inst{7-5} = 0b001;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11101101000;
+let hasNewValue = 1;
+let opNewValue = 0;
+let prefersSlot3 = 1;
+}
+def M2_mpy_up_s1 : HInst<
+(outs IntRegs:$Rd32),
+(ins IntRegs:$Rs32, IntRegs:$Rt32),
+"$Rd32 = mpy($Rs32,$Rt32):<<1",
+M_tc_3x_SLOT23, TypeM>, Enc_14071773 {
+let Inst{7-5} = 0b010;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11101101101;
+let hasNewValue = 1;
+let opNewValue = 0;
+let prefersSlot3 = 1;
+}
+def M2_mpy_up_s1_sat : HInst<
+(outs IntRegs:$Rd32),
+(ins IntRegs:$Rs32, IntRegs:$Rt32),
+"$Rd32 = mpy($Rs32,$Rt32):<<1:sat",
+M_tc_3x_SLOT23, TypeM>, Enc_14071773 {
+let Inst{7-5} = 0b000;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11101101111;
+let hasNewValue = 1;
+let opNewValue = 0;
+let prefersSlot3 = 1;
+let Defs = [USR_OVF];
+}
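+// The mpy_up forms have no halfword suffix: they return the upper 32 bits of
+// the full 64-bit signed product (optionally shifted left by one and
+// saturated), which is why a 32x32 multiply can still fit a single IntRegs
+// destination.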
+def M2_mpyd_acc_hh_s0 : HInst<
+(outs DoubleRegs:$Rxx32),
+(ins DoubleRegs:$Rxx32in, IntRegs:$Rs32, IntRegs:$Rt32),
+"$Rxx32 += mpy($Rs32.h,$Rt32.h)",
+M_tc_3x_acc_SLOT23, TypeM>, Enc_1409050 {
+let Inst{7-5} = 0b011;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11100110000;
+let prefersSlot3 = 1;
+let Constraints = "$Rxx32 = $Rxx32in";
+}
+def M2_mpyd_acc_hh_s1 : HInst<
+(outs DoubleRegs:$Rxx32),
+(ins DoubleRegs:$Rxx32in, IntRegs:$Rs32, IntRegs:$Rt32),
+"$Rxx32 += mpy($Rs32.h,$Rt32.h):<<1",
+M_tc_3x_acc_SLOT23, TypeM>, Enc_1409050 {
+let Inst{7-5} = 0b011;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11100110100;
+let prefersSlot3 = 1;
+let Constraints = "$Rxx32 = $Rxx32in";
+}
+def M2_mpyd_acc_hl_s0 : HInst<
+(outs DoubleRegs:$Rxx32),
+(ins DoubleRegs:$Rxx32in, IntRegs:$Rs32, IntRegs:$Rt32),
+"$Rxx32 += mpy($Rs32.h,$Rt32.l)",
+M_tc_3x_acc_SLOT23, TypeM>, Enc_1409050 {
+let Inst{7-5} = 0b010;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11100110000;
+let prefersSlot3 = 1;
+let Constraints = "$Rxx32 = $Rxx32in";
+}
+def M2_mpyd_acc_hl_s1 : HInst<
+(outs DoubleRegs:$Rxx32),
+(ins DoubleRegs:$Rxx32in, IntRegs:$Rs32, IntRegs:$Rt32),
+"$Rxx32 += mpy($Rs32.h,$Rt32.l):<<1",
+M_tc_3x_acc_SLOT23, TypeM>, Enc_1409050 {
+let Inst{7-5} = 0b010;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11100110100;
+let prefersSlot3 = 1;
+let Constraints = "$Rxx32 = $Rxx32in";
+}
+def M2_mpyd_acc_lh_s0 : HInst<
+(outs DoubleRegs:$Rxx32),
+(ins DoubleRegs:$Rxx32in, IntRegs:$Rs32, IntRegs:$Rt32),
+"$Rxx32 += mpy($Rs32.l,$Rt32.h)",
+M_tc_3x_acc_SLOT23, TypeM>, Enc_1409050 {
+let Inst{7-5} = 0b001;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11100110000;
+let prefersSlot3 = 1;
+let Constraints = "$Rxx32 = $Rxx32in";
+}
+def M2_mpyd_acc_lh_s1 : HInst<
+(outs DoubleRegs:$Rxx32),
+(ins DoubleRegs:$Rxx32in, IntRegs:$Rs32, IntRegs:$Rt32),
+"$Rxx32 += mpy($Rs32.l,$Rt32.h):<<1",
+M_tc_3x_acc_SLOT23, TypeM>, Enc_1409050 {
+let Inst{7-5} = 0b001;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11100110100;
+let prefersSlot3 = 1;
+let Constraints = "$Rxx32 = $Rxx32in";
+}
+def M2_mpyd_acc_ll_s0 : HInst<
+(outs DoubleRegs:$Rxx32),
+(ins DoubleRegs:$Rxx32in, IntRegs:$Rs32, IntRegs:$Rt32),
+"$Rxx32 += mpy($Rs32.l,$Rt32.l)",
+M_tc_3x_acc_SLOT23, TypeM>, Enc_1409050 {
+let Inst{7-5} = 0b000;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11100110000;
+let prefersSlot3 = 1;
+let Constraints = "$Rxx32 = $Rxx32in";
+}
+def M2_mpyd_acc_ll_s1 : HInst<
+(outs DoubleRegs:$Rxx32),
+(ins DoubleRegs:$Rxx32in, IntRegs:$Rs32, IntRegs:$Rt32),
+"$Rxx32 += mpy($Rs32.l,$Rt32.l):<<1",
+M_tc_3x_acc_SLOT23, TypeM>, Enc_1409050 {
+let Inst{7-5} = 0b000;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11100110100;
+let prefersSlot3 = 1;
+let Constraints = "$Rxx32 = $Rxx32in";
+}
+def M2_mpyd_hh_s0 : HInst<
+(outs DoubleRegs:$Rdd32),
+(ins IntRegs:$Rs32, IntRegs:$Rt32),
+"$Rdd32 = mpy($Rs32.h,$Rt32.h)",
+M_tc_3x_SLOT23, TypeM>, Enc_1997594 {
+let Inst{7-5} = 0b011;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11100100000;
+let prefersSlot3 = 1;
+}
+def M2_mpyd_hh_s1 : HInst<
+(outs DoubleRegs:$Rdd32),
+(ins IntRegs:$Rs32, IntRegs:$Rt32),
+"$Rdd32 = mpy($Rs32.h,$Rt32.h):<<1",
+M_tc_3x_SLOT23, TypeM>, Enc_1997594 {
+let Inst{7-5} = 0b011;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11100100100;
+let prefersSlot3 = 1;
+}
+def M2_mpyd_hl_s0 : HInst<
+(outs DoubleRegs:$Rdd32),
+(ins IntRegs:$Rs32, IntRegs:$Rt32),
+"$Rdd32 = mpy($Rs32.h,$Rt32.l)",
+M_tc_3x_SLOT23, TypeM>, Enc_1997594 {
+let Inst{7-5} = 0b010;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11100100000;
+let prefersSlot3 = 1;
+}
+def M2_mpyd_hl_s1 : HInst<
+(outs DoubleRegs:$Rdd32),
+(ins IntRegs:$Rs32, IntRegs:$Rt32),
+"$Rdd32 = mpy($Rs32.h,$Rt32.l):<<1",
+M_tc_3x_SLOT23, TypeM>, Enc_1997594 {
+let Inst{7-5} = 0b010;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11100100100;
+let prefersSlot3 = 1;
+}
+def M2_mpyd_lh_s0 : HInst<
+(outs DoubleRegs:$Rdd32),
+(ins IntRegs:$Rs32, IntRegs:$Rt32),
+"$Rdd32 = mpy($Rs32.l,$Rt32.h)",
+M_tc_3x_SLOT23, TypeM>, Enc_1997594 {
+let Inst{7-5} = 0b001;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11100100000;
+let prefersSlot3 = 1;
+}
+def M2_mpyd_lh_s1 : HInst<
+(outs DoubleRegs:$Rdd32),
+(ins IntRegs:$Rs32, IntRegs:$Rt32),
+"$Rdd32 = mpy($Rs32.l,$Rt32.h):<<1",
+M_tc_3x_SLOT23, TypeM>, Enc_1997594 {
+let Inst{7-5} = 0b001;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11100100100;
+let prefersSlot3 = 1;
+}
+def M2_mpyd_ll_s0 : HInst<
+(outs DoubleRegs:$Rdd32),
+(ins IntRegs:$Rs32, IntRegs:$Rt32),
+"$Rdd32 = mpy($Rs32.l,$Rt32.l)",
+M_tc_3x_SLOT23, TypeM>, Enc_1997594 {
+let Inst{7-5} = 0b000;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11100100000;
+let prefersSlot3 = 1;
+}
+def M2_mpyd_ll_s1 : HInst<
+(outs DoubleRegs:$Rdd32),
+(ins IntRegs:$Rs32, IntRegs:$Rt32),
+"$Rdd32 = mpy($Rs32.l,$Rt32.l):<<1",
+M_tc_3x_SLOT23, TypeM>, Enc_1997594 {
+let Inst{7-5} = 0b000;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11100100100;
+let prefersSlot3 = 1;
+}
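+// The mpyd forms keep the whole 64-bit product in a DoubleRegs destination;
+// accordingly they drop hasNewValue/opNewValue, since only a 32-bit GPR
+// result can feed a .new operand.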
+def M2_mpyd_nac_hh_s0 : HInst<
+(outs DoubleRegs:$Rxx32),
+(ins DoubleRegs:$Rxx32in, IntRegs:$Rs32, IntRegs:$Rt32),
+"$Rxx32 -= mpy($Rs32.h,$Rt32.h)",
+M_tc_3x_acc_SLOT23, TypeM>, Enc_1409050 {
+let Inst{7-5} = 0b011;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11100110001;
+let prefersSlot3 = 1;
+let Constraints = "$Rxx32 = $Rxx32in";
+}
+def M2_mpyd_nac_hh_s1 : HInst<
+(outs DoubleRegs:$Rxx32),
+(ins DoubleRegs:$Rxx32in, IntRegs:$Rs32, IntRegs:$Rt32),
+"$Rxx32 -= mpy($Rs32.h,$Rt32.h):<<1",
+M_tc_3x_acc_SLOT23, TypeM>, Enc_1409050 {
+let Inst{7-5} = 0b011;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11100110101;
+let prefersSlot3 = 1;
+let Constraints = "$Rxx32 = $Rxx32in";
+}
+def M2_mpyd_nac_hl_s0 : HInst<
+(outs DoubleRegs:$Rxx32),
+(ins DoubleRegs:$Rxx32in, IntRegs:$Rs32, IntRegs:$Rt32),
+"$Rxx32 -= mpy($Rs32.h,$Rt32.l)",
+M_tc_3x_acc_SLOT23, TypeM>, Enc_1409050 {
+let Inst{7-5} = 0b010;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11100110001;
+let prefersSlot3 = 1;
+let Constraints = "$Rxx32 = $Rxx32in";
+}
+def M2_mpyd_nac_hl_s1 : HInst<
+(outs DoubleRegs:$Rxx32),
+(ins DoubleRegs:$Rxx32in, IntRegs:$Rs32, IntRegs:$Rt32),
+"$Rxx32 -= mpy($Rs32.h,$Rt32.l):<<1",
+M_tc_3x_acc_SLOT23, TypeM>, Enc_1409050 {
+let Inst{7-5} = 0b010;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11100110101;
+let prefersSlot3 = 1;
+let Constraints = "$Rxx32 = $Rxx32in";
+}
+def M2_mpyd_nac_lh_s0 : HInst<
+(outs DoubleRegs:$Rxx32),
+(ins DoubleRegs:$Rxx32in, IntRegs:$Rs32, IntRegs:$Rt32),
+"$Rxx32 -= mpy($Rs32.l,$Rt32.h)",
+M_tc_3x_acc_SLOT23, TypeM>, Enc_1409050 {
+let Inst{7-5} = 0b001;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11100110001;
+let prefersSlot3 = 1;
+let Constraints = "$Rxx32 = $Rxx32in";
+}
+def M2_mpyd_nac_lh_s1 : HInst<
+(outs DoubleRegs:$Rxx32),
+(ins DoubleRegs:$Rxx32in, IntRegs:$Rs32, IntRegs:$Rt32),
+"$Rxx32 -= mpy($Rs32.l,$Rt32.h):<<1",
+M_tc_3x_acc_SLOT23, TypeM>, Enc_1409050 {
+let Inst{7-5} = 0b001;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11100110101;
+let prefersSlot3 = 1;
+let Constraints = "$Rxx32 = $Rxx32in";
+}
+def M2_mpyd_nac_ll_s0 : HInst<
+(outs DoubleRegs:$Rxx32),
+(ins DoubleRegs:$Rxx32in, IntRegs:$Rs32, IntRegs:$Rt32),
+"$Rxx32 -= mpy($Rs32.l,$Rt32.l)",
+M_tc_3x_acc_SLOT23, TypeM>, Enc_1409050 {
+let Inst{7-5} = 0b000;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11100110001;
+let prefersSlot3 = 1;
+let Constraints = "$Rxx32 = $Rxx32in";
+}
+def M2_mpyd_nac_ll_s1 : HInst<
+(outs DoubleRegs:$Rxx32),
+(ins DoubleRegs:$Rxx32in, IntRegs:$Rs32, IntRegs:$Rt32),
+"$Rxx32 -= mpy($Rs32.l,$Rt32.l):<<1",
+M_tc_3x_acc_SLOT23, TypeM>, Enc_1409050 {
+let Inst{7-5} = 0b000;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11100110101;
+let prefersSlot3 = 1;
+let Constraints = "$Rxx32 = $Rxx32in";
+}
+def M2_mpyd_rnd_hh_s0 : HInst<
+(outs DoubleRegs:$Rdd32),
+(ins IntRegs:$Rs32, IntRegs:$Rt32),
+"$Rdd32 = mpy($Rs32.h,$Rt32.h):rnd",
+M_tc_3x_SLOT23, TypeM>, Enc_1997594 {
+let Inst{7-5} = 0b011;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11100100001;
+let prefersSlot3 = 1;
+}
+def M2_mpyd_rnd_hh_s1 : HInst<
+(outs DoubleRegs:$Rdd32),
+(ins IntRegs:$Rs32, IntRegs:$Rt32),
+"$Rdd32 = mpy($Rs32.h,$Rt32.h):<<1:rnd",
+M_tc_3x_SLOT23, TypeM>, Enc_1997594 {
+let Inst{7-5} = 0b011;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11100100101;
+let prefersSlot3 = 1;
+}
+def M2_mpyd_rnd_hl_s0 : HInst<
+(outs DoubleRegs:$Rdd32),
+(ins IntRegs:$Rs32, IntRegs:$Rt32),
+"$Rdd32 = mpy($Rs32.h,$Rt32.l):rnd",
+M_tc_3x_SLOT23, TypeM>, Enc_1997594 {
+let Inst{7-5} = 0b010;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11100100001;
+let prefersSlot3 = 1;
+}
+def M2_mpyd_rnd_hl_s1 : HInst<
+(outs DoubleRegs:$Rdd32),
+(ins IntRegs:$Rs32, IntRegs:$Rt32),
+"$Rdd32 = mpy($Rs32.h,$Rt32.l):<<1:rnd",
+M_tc_3x_SLOT23, TypeM>, Enc_1997594 {
+let Inst{7-5} = 0b010;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11100100101;
+let prefersSlot3 = 1;
+}
+def M2_mpyd_rnd_lh_s0 : HInst<
+(outs DoubleRegs:$Rdd32),
+(ins IntRegs:$Rs32, IntRegs:$Rt32),
+"$Rdd32 = mpy($Rs32.l,$Rt32.h):rnd",
+M_tc_3x_SLOT23, TypeM>, Enc_1997594 {
+let Inst{7-5} = 0b001;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11100100001;
+let prefersSlot3 = 1;
+}
+def M2_mpyd_rnd_lh_s1 : HInst<
+(outs DoubleRegs:$Rdd32),
+(ins IntRegs:$Rs32, IntRegs:$Rt32),
+"$Rdd32 = mpy($Rs32.l,$Rt32.h):<<1:rnd",
+M_tc_3x_SLOT23, TypeM>, Enc_1997594 {
+let Inst{7-5} = 0b001;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11100100101;
+let prefersSlot3 = 1;
+}
+def M2_mpyd_rnd_ll_s0 : HInst<
+(outs DoubleRegs:$Rdd32),
+(ins IntRegs:$Rs32, IntRegs:$Rt32),
+"$Rdd32 = mpy($Rs32.l,$Rt32.l):rnd",
+M_tc_3x_SLOT23, TypeM>, Enc_1997594 {
+let Inst{7-5} = 0b000;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11100100001;
+let prefersSlot3 = 1;
+}
+def M2_mpyd_rnd_ll_s1 : HInst<
+(outs DoubleRegs:$Rdd32),
+(ins IntRegs:$Rs32, IntRegs:$Rt32),
+"$Rdd32 = mpy($Rs32.l,$Rt32.l):<<1:rnd",
+M_tc_3x_SLOT23, TypeM>, Enc_1997594 {
+let Inst{7-5} = 0b000;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11100100101;
+let prefersSlot3 = 1;
+}
+def M2_mpyi : HInst<
+(outs IntRegs:$Rd32),
+(ins IntRegs:$Rs32, IntRegs:$Rt32),
+"$Rd32 = mpyi($Rs32,$Rt32)",
+M_tc_3x_SLOT23, TypeM>, Enc_14071773, ImmRegRel {
+let Inst{7-5} = 0b000;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11101101000;
+let hasNewValue = 1;
+let opNewValue = 0;
+let prefersSlot3 = 1;
+let CextOpcode = "M2_mpyi";
+let InputType = "reg";
+}
+def M2_mpysin : HInst<
+(outs IntRegs:$Rd32),
+(ins IntRegs:$Rs32, u8_0Imm:$Ii),
+"$Rd32 = -mpyi($Rs32,#$Ii)",
+M_tc_3x_SLOT23, TypeM>, Enc_16355964 {
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11100000100;
+let hasNewValue = 1;
+let opNewValue = 0;
+let prefersSlot3 = 1;
+}
+def M2_mpysip : HInst<
+(outs IntRegs:$Rd32),
+(ins IntRegs:$Rs32, u32_0Imm:$Ii),
+"$Rd32 = +mpyi($Rs32,#$Ii)",
+M_tc_3x_SLOT23, TypeM>, Enc_16355964 {
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11100000000;
+let hasNewValue = 1;
+let opNewValue = 0;
+let prefersSlot3 = 1;
+let isExtendable = 1;
+let opExtendable = 2;
+let isExtentSigned = 0;
+let opExtentBits = 8;
+let opExtentAlign = 0;
+}
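+// The isExtendable/opExtendable/opExtentBits fields describe the constant
+// extender: operand 2 (#$Ii) natively encodes 8 unsigned bits but can be
+// widened to a full 32-bit immediate by an immext word in the same packet.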
+def M2_mpysmi : HInst<
+(outs IntRegs:$Rd32),
+(ins IntRegs:$Rs32, m32_0Imm:$Ii),
+"$Rd32 = mpyi($Rs32,#$Ii)",
+M_tc_3x_SLOT23, TypeM>, ImmRegRel {
+let hasNewValue = 1;
+let opNewValue = 0;
+let CextOpcode = "M2_mpyi";
+let InputType = "imm";
+let isPseudo = 1;
+let isExtendable = 1;
+let opExtendable = 2;
+let isExtentSigned = 1;
+let opExtentBits = 9;
+let opExtentAlign = 0;
+}
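+// M2_mpysmi is a pseudo (isPseudo = 1, no Enc_* class): its m32_0Imm operand
+// is signed, and the instruction is presumably resolved to M2_mpysip or
+// M2_mpysin according to the sign of #$Ii.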
+def M2_mpysu_up : HInst<
+(outs IntRegs:$Rd32),
+(ins IntRegs:$Rs32, IntRegs:$Rt32),
+"$Rd32 = mpysu($Rs32,$Rt32)",
+M_tc_3x_SLOT23, TypeM>, Enc_14071773 {
+let Inst{7-5} = 0b001;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11101101011;
+let hasNewValue = 1;
+let opNewValue = 0;
+let prefersSlot3 = 1;
+}
+def M2_mpyu_acc_hh_s0 : HInst<
+(outs IntRegs:$Rx32),
+(ins IntRegs:$Rx32in, IntRegs:$Rs32, IntRegs:$Rt32),
+"$Rx32 += mpyu($Rs32.h,$Rt32.h)",
+M_tc_3x_acc_SLOT23, TypeM>, Enc_9223889 {
+let Inst{7-5} = 0b011;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11101110010;
+let hasNewValue = 1;
+let opNewValue = 0;
+let prefersSlot3 = 1;
+let Constraints = "$Rx32 = $Rx32in";
+}
+def M2_mpyu_acc_hh_s1 : HInst<
+(outs IntRegs:$Rx32),
+(ins IntRegs:$Rx32in, IntRegs:$Rs32, IntRegs:$Rt32),
+"$Rx32 += mpyu($Rs32.h,$Rt32.h):<<1",
+M_tc_3x_acc_SLOT23, TypeM>, Enc_9223889 {
+let Inst{7-5} = 0b011;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11101110110;
+let hasNewValue = 1;
+let opNewValue = 0;
+let prefersSlot3 = 1;
+let Constraints = "$Rx32 = $Rx32in";
+}
+def M2_mpyu_acc_hl_s0 : HInst<
+(outs IntRegs:$Rx32),
+(ins IntRegs:$Rx32in, IntRegs:$Rs32, IntRegs:$Rt32),
+"$Rx32 += mpyu($Rs32.h,$Rt32.l)",
+M_tc_3x_acc_SLOT23, TypeM>, Enc_9223889 {
+let Inst{7-5} = 0b010;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11101110010;
+let hasNewValue = 1;
+let opNewValue = 0;
+let prefersSlot3 = 1;
+let Constraints = "$Rx32 = $Rx32in";
+}
+def M2_mpyu_acc_hl_s1 : HInst<
+(outs IntRegs:$Rx32),
+(ins IntRegs:$Rx32in, IntRegs:$Rs32, IntRegs:$Rt32),
+"$Rx32 += mpyu($Rs32.h,$Rt32.l):<<1",
+M_tc_3x_acc_SLOT23, TypeM>, Enc_9223889 {
+let Inst{7-5} = 0b010;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11101110110;
+let hasNewValue = 1;
+let opNewValue = 0;
+let prefersSlot3 = 1;
+let Constraints = "$Rx32 = $Rx32in";
+}
+def M2_mpyu_acc_lh_s0 : HInst<
+(outs IntRegs:$Rx32),
+(ins IntRegs:$Rx32in, IntRegs:$Rs32, IntRegs:$Rt32),
+"$Rx32 += mpyu($Rs32.l,$Rt32.h)",
+M_tc_3x_acc_SLOT23, TypeM>, Enc_9223889 {
+let Inst{7-5} = 0b001;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11101110010;
+let hasNewValue = 1;
+let opNewValue = 0;
+let prefersSlot3 = 1;
+let Constraints = "$Rx32 = $Rx32in";
+}
+def M2_mpyu_acc_lh_s1 : HInst<
+(outs IntRegs:$Rx32),
+(ins IntRegs:$Rx32in, IntRegs:$Rs32, IntRegs:$Rt32),
+"$Rx32 += mpyu($Rs32.l,$Rt32.h):<<1",
+M_tc_3x_acc_SLOT23, TypeM>, Enc_9223889 {
+let Inst{7-5} = 0b001;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11101110110;
+let hasNewValue = 1;
+let opNewValue = 0;
+let prefersSlot3 = 1;
+let Constraints = "$Rx32 = $Rx32in";
+}
+def M2_mpyu_acc_ll_s0 : HInst<
+(outs IntRegs:$Rx32),
+(ins IntRegs:$Rx32in, IntRegs:$Rs32, IntRegs:$Rt32),
+"$Rx32 += mpyu($Rs32.l,$Rt32.l)",
+M_tc_3x_acc_SLOT23, TypeM>, Enc_9223889 {
+let Inst{7-5} = 0b000;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11101110010;
+let hasNewValue = 1;
+let opNewValue = 0;
+let prefersSlot3 = 1;
+let Constraints = "$Rx32 = $Rx32in";
+}
+def M2_mpyu_acc_ll_s1 : HInst<
+(outs IntRegs:$Rx32),
+(ins IntRegs:$Rx32in, IntRegs:$Rs32, IntRegs:$Rt32),
+"$Rx32 += mpyu($Rs32.l,$Rt32.l):<<1",
+M_tc_3x_acc_SLOT23, TypeM>, Enc_9223889 {
+let Inst{7-5} = 0b000;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11101110110;
+let hasNewValue = 1;
+let opNewValue = 0;
+let prefersSlot3 = 1;
+let Constraints = "$Rx32 = $Rx32in";
+}
+def M2_mpyu_hh_s0 : HInst<
+(outs IntRegs:$Rd32),
+(ins IntRegs:$Rs32, IntRegs:$Rt32),
+"$Rd32 = mpyu($Rs32.h,$Rt32.h)",
+M_tc_3x_SLOT23, TypeM>, Enc_14071773 {
+let Inst{7-5} = 0b011;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11101100010;
+let hasNewValue = 1;
+let opNewValue = 0;
+let prefersSlot3 = 1;
+}
+def M2_mpyu_hh_s1 : HInst<
+(outs IntRegs:$Rd32),
+(ins IntRegs:$Rs32, IntRegs:$Rt32),
+"$Rd32 = mpyu($Rs32.h,$Rt32.h):<<1",
+M_tc_3x_SLOT23, TypeM>, Enc_14071773 {
+let Inst{7-5} = 0b011;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11101100110;
+let hasNewValue = 1;
+let opNewValue = 0;
+let prefersSlot3 = 1;
+}
+def M2_mpyu_hl_s0 : HInst<
+(outs IntRegs:$Rd32),
+(ins IntRegs:$Rs32, IntRegs:$Rt32),
+"$Rd32 = mpyu($Rs32.h,$Rt32.l)",
+M_tc_3x_SLOT23, TypeM>, Enc_14071773 {
+let Inst{7-5} = 0b010;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11101100010;
+let hasNewValue = 1;
+let opNewValue = 0;
+let prefersSlot3 = 1;
+}
+def M2_mpyu_hl_s1 : HInst<
+(outs IntRegs:$Rd32),
+(ins IntRegs:$Rs32, IntRegs:$Rt32),
+"$Rd32 = mpyu($Rs32.h,$Rt32.l):<<1",
+M_tc_3x_SLOT23, TypeM>, Enc_14071773 {
+let Inst{7-5} = 0b010;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11101100110;
+let hasNewValue = 1;
+let opNewValue = 0;
+let prefersSlot3 = 1;
+}
+def M2_mpyu_lh_s0 : HInst<
+(outs IntRegs:$Rd32),
+(ins IntRegs:$Rs32, IntRegs:$Rt32),
+"$Rd32 = mpyu($Rs32.l,$Rt32.h)",
+M_tc_3x_SLOT23, TypeM>, Enc_14071773 {
+let Inst{7-5} = 0b001;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11101100010;
+let hasNewValue = 1;
+let opNewValue = 0;
+let prefersSlot3 = 1;
+}
+def M2_mpyu_lh_s1 : HInst<
+(outs IntRegs:$Rd32),
+(ins IntRegs:$Rs32, IntRegs:$Rt32),
+"$Rd32 = mpyu($Rs32.l,$Rt32.h):<<1",
+M_tc_3x_SLOT23, TypeM>, Enc_14071773 {
+let Inst{7-5} = 0b001;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11101100110;
+let hasNewValue = 1;
+let opNewValue = 0;
+let prefersSlot3 = 1;
+}
+def M2_mpyu_ll_s0 : HInst<
+(outs IntRegs:$Rd32),
+(ins IntRegs:$Rs32, IntRegs:$Rt32),
+"$Rd32 = mpyu($Rs32.l,$Rt32.l)",
+M_tc_3x_SLOT23, TypeM>, Enc_14071773 {
+let Inst{7-5} = 0b000;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11101100010;
+let hasNewValue = 1;
+let opNewValue = 0;
+let prefersSlot3 = 1;
+}
+def M2_mpyu_ll_s1 : HInst<
+(outs IntRegs:$Rd32),
+(ins IntRegs:$Rs32, IntRegs:$Rt32),
+"$Rd32 = mpyu($Rs32.l,$Rt32.l):<<1",
+M_tc_3x_SLOT23, TypeM>, Enc_14071773 {
+let Inst{7-5} = 0b000;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11101100110;
+let hasNewValue = 1;
+let opNewValue = 0;
+let prefersSlot3 = 1;
+}
+def M2_mpyu_nac_hh_s0 : HInst<
+(outs IntRegs:$Rx32),
+(ins IntRegs:$Rx32in, IntRegs:$Rs32, IntRegs:$Rt32),
+"$Rx32 -= mpyu($Rs32.h,$Rt32.h)",
+M_tc_3x_acc_SLOT23, TypeM>, Enc_9223889 {
+let Inst{7-5} = 0b011;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11101110011;
+let hasNewValue = 1;
+let opNewValue = 0;
+let prefersSlot3 = 1;
+let Constraints = "$Rx32 = $Rx32in";
+}
+def M2_mpyu_nac_hh_s1 : HInst<
+(outs IntRegs:$Rx32),
+(ins IntRegs:$Rx32in, IntRegs:$Rs32, IntRegs:$Rt32),
+"$Rx32 -= mpyu($Rs32.h,$Rt32.h):<<1",
+M_tc_3x_acc_SLOT23, TypeM>, Enc_9223889 {
+let Inst{7-5} = 0b011;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11101110111;
+let hasNewValue = 1;
+let opNewValue = 0;
+let prefersSlot3 = 1;
+let Constraints = "$Rx32 = $Rx32in";
+}
+def M2_mpyu_nac_hl_s0 : HInst<
+(outs IntRegs:$Rx32),
+(ins IntRegs:$Rx32in, IntRegs:$Rs32, IntRegs:$Rt32),
+"$Rx32 -= mpyu($Rs32.h,$Rt32.l)",
+M_tc_3x_acc_SLOT23, TypeM>, Enc_9223889 {
+let Inst{7-5} = 0b010;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11101110011;
+let hasNewValue = 1;
+let opNewValue = 0;
+let prefersSlot3 = 1;
+let Constraints = "$Rx32 = $Rx32in";
+}
+def M2_mpyu_nac_hl_s1 : HInst<
+(outs IntRegs:$Rx32),
+(ins IntRegs:$Rx32in, IntRegs:$Rs32, IntRegs:$Rt32),
+"$Rx32 -= mpyu($Rs32.h,$Rt32.l):<<1",
+M_tc_3x_acc_SLOT23, TypeM>, Enc_9223889 {
+let Inst{7-5} = 0b010;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11101110111;
+let hasNewValue = 1;
+let opNewValue = 0;
+let prefersSlot3 = 1;
+let Constraints = "$Rx32 = $Rx32in";
+}
+def M2_mpyu_nac_lh_s0 : HInst<
+(outs IntRegs:$Rx32),
+(ins IntRegs:$Rx32in, IntRegs:$Rs32, IntRegs:$Rt32),
+"$Rx32 -= mpyu($Rs32.l,$Rt32.h)",
+M_tc_3x_acc_SLOT23, TypeM>, Enc_9223889 {
+let Inst{7-5} = 0b001;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11101110011;
+let hasNewValue = 1;
+let opNewValue = 0;
+let prefersSlot3 = 1;
+let Constraints = "$Rx32 = $Rx32in";
+}
+def M2_mpyu_nac_lh_s1 : HInst<
+(outs IntRegs:$Rx32),
+(ins IntRegs:$Rx32in, IntRegs:$Rs32, IntRegs:$Rt32),
+"$Rx32 -= mpyu($Rs32.l,$Rt32.h):<<1",
+M_tc_3x_acc_SLOT23, TypeM>, Enc_9223889 {
+let Inst{7-5} = 0b001;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11101110111;
+let hasNewValue = 1;
+let opNewValue = 0;
+let prefersSlot3 = 1;
+let Constraints = "$Rx32 = $Rx32in";
+}
+def M2_mpyu_nac_ll_s0 : HInst<
+(outs IntRegs:$Rx32),
+(ins IntRegs:$Rx32in, IntRegs:$Rs32, IntRegs:$Rt32),
+"$Rx32 -= mpyu($Rs32.l,$Rt32.l)",
+M_tc_3x_acc_SLOT23, TypeM>, Enc_9223889 {
+let Inst{7-5} = 0b000;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11101110011;
+let hasNewValue = 1;
+let opNewValue = 0;
+let prefersSlot3 = 1;
+let Constraints = "$Rx32 = $Rx32in";
+}
+def M2_mpyu_nac_ll_s1 : HInst<
+(outs IntRegs:$Rx32),
+(ins IntRegs:$Rx32in, IntRegs:$Rs32, IntRegs:$Rt32),
+"$Rx32 -= mpyu($Rs32.l,$Rt32.l):<<1",
+M_tc_3x_acc_SLOT23, TypeM>, Enc_9223889 {
+let Inst{7-5} = 0b000;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11101110111;
+let hasNewValue = 1;
+let opNewValue = 0;
+let prefersSlot3 = 1;
+let Constraints = "$Rx32 = $Rx32in";
+}
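+// No :rnd or :sat variants appear for the unsigned mpyu multiplies, so none
+// of them define USR_OVF.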
+def M2_mpyu_up : HInst<
+(outs IntRegs:$Rd32),
+(ins IntRegs:$Rs32, IntRegs:$Rt32),
+"$Rd32 = mpyu($Rs32,$Rt32)",
+M_tc_3x_SLOT23, TypeM>, Enc_14071773 {
+let Inst{7-5} = 0b001;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11101101010;
+let hasNewValue = 1;
+let opNewValue = 0;
+let prefersSlot3 = 1;
+}
+def M2_mpyud_acc_hh_s0 : HInst<
+(outs DoubleRegs:$Rxx32),
+(ins DoubleRegs:$Rxx32in, IntRegs:$Rs32, IntRegs:$Rt32),
+"$Rxx32 += mpyu($Rs32.h,$Rt32.h)",
+M_tc_3x_acc_SLOT23, TypeM>, Enc_1409050 {
+let Inst{7-5} = 0b011;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11100110010;
+let prefersSlot3 = 1;
+let Constraints = "$Rxx32 = $Rxx32in";
+}
+def M2_mpyud_acc_hh_s1 : HInst<
+(outs DoubleRegs:$Rxx32),
+(ins DoubleRegs:$Rxx32in, IntRegs:$Rs32, IntRegs:$Rt32),
+"$Rxx32 += mpyu($Rs32.h,$Rt32.h):<<1",
+M_tc_3x_acc_SLOT23, TypeM>, Enc_1409050 {
+let Inst{7-5} = 0b011;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11100110110;
+let prefersSlot3 = 1;
+let Constraints = "$Rxx32 = $Rxx32in";
+}
+def M2_mpyud_acc_hl_s0 : HInst<
+(outs DoubleRegs:$Rxx32),
+(ins DoubleRegs:$Rxx32in, IntRegs:$Rs32, IntRegs:$Rt32),
+"$Rxx32 += mpyu($Rs32.h,$Rt32.l)",
+M_tc_3x_acc_SLOT23, TypeM>, Enc_1409050 {
+let Inst{7-5} = 0b010;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11100110010;
+let prefersSlot3 = 1;
+let Constraints = "$Rxx32 = $Rxx32in";
+}
+def M2_mpyud_acc_hl_s1 : HInst<
+(outs DoubleRegs:$Rxx32),
+(ins DoubleRegs:$Rxx32in, IntRegs:$Rs32, IntRegs:$Rt32),
+"$Rxx32 += mpyu($Rs32.h,$Rt32.l):<<1",
+M_tc_3x_acc_SLOT23, TypeM>, Enc_1409050 {
+let Inst{7-5} = 0b010;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11100110110;
+let prefersSlot3 = 1;
+let Constraints = "$Rxx32 = $Rxx32in";
+}
+def M2_mpyud_acc_lh_s0 : HInst<
+(outs DoubleRegs:$Rxx32),
+(ins DoubleRegs:$Rxx32in, IntRegs:$Rs32, IntRegs:$Rt32),
+"$Rxx32 += mpyu($Rs32.l,$Rt32.h)",
+M_tc_3x_acc_SLOT23, TypeM>, Enc_1409050 {
+let Inst{7-5} = 0b001;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11100110010;
+let prefersSlot3 = 1;
+let Constraints = "$Rxx32 = $Rxx32in";
+}
+def M2_mpyud_acc_lh_s1 : HInst<
+(outs DoubleRegs:$Rxx32),
+(ins DoubleRegs:$Rxx32in, IntRegs:$Rs32, IntRegs:$Rt32),
+"$Rxx32 += mpyu($Rs32.l,$Rt32.h):<<1",
+M_tc_3x_acc_SLOT23, TypeM>, Enc_1409050 {
+let Inst{7-5} = 0b001;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11100110110;
+let prefersSlot3 = 1;
+let Constraints = "$Rxx32 = $Rxx32in";
+}
+def M2_mpyud_acc_ll_s0 : HInst<
+(outs DoubleRegs:$Rxx32),
+(ins DoubleRegs:$Rxx32in, IntRegs:$Rs32, IntRegs:$Rt32),
+"$Rxx32 += mpyu($Rs32.l,$Rt32.l)",
+M_tc_3x_acc_SLOT23, TypeM>, Enc_1409050 {
+let Inst{7-5} = 0b000;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11100110010;
+let prefersSlot3 = 1;
+let Constraints = "$Rxx32 = $Rxx32in";
+}
+def M2_mpyud_acc_ll_s1 : HInst<
+(outs DoubleRegs:$Rxx32),
+(ins DoubleRegs:$Rxx32in, IntRegs:$Rs32, IntRegs:$Rt32),
+"$Rxx32 += mpyu($Rs32.l,$Rt32.l):<<1",
+M_tc_3x_acc_SLOT23, TypeM>, Enc_1409050 {
+let Inst{7-5} = 0b000;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11100110110;
+let prefersSlot3 = 1;
+let Constraints = "$Rxx32 = $Rxx32in";
+}
+def M2_mpyud_hh_s0 : HInst<
+(outs DoubleRegs:$Rdd32),
+(ins IntRegs:$Rs32, IntRegs:$Rt32),
+"$Rdd32 = mpyu($Rs32.h,$Rt32.h)",
+M_tc_3x_SLOT23, TypeM>, Enc_1997594 {
+let Inst{7-5} = 0b011;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11100100010;
+let prefersSlot3 = 1;
+}
+def M2_mpyud_hh_s1 : HInst<
+(outs DoubleRegs:$Rdd32),
+(ins IntRegs:$Rs32, IntRegs:$Rt32),
+"$Rdd32 = mpyu($Rs32.h,$Rt32.h):<<1",
+M_tc_3x_SLOT23, TypeM>, Enc_1997594 {
+let Inst{7-5} = 0b011;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11100100110;
+let prefersSlot3 = 1;
+}
+def M2_mpyud_hl_s0 : HInst<
+(outs DoubleRegs:$Rdd32),
+(ins IntRegs:$Rs32, IntRegs:$Rt32),
+"$Rdd32 = mpyu($Rs32.h,$Rt32.l)",
+M_tc_3x_SLOT23, TypeM>, Enc_1997594 {
+let Inst{7-5} = 0b010;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11100100010;
+let prefersSlot3 = 1;
+}
+def M2_mpyud_hl_s1 : HInst<
+(outs DoubleRegs:$Rdd32),
+(ins IntRegs:$Rs32, IntRegs:$Rt32),
+"$Rdd32 = mpyu($Rs32.h,$Rt32.l):<<1",
+M_tc_3x_SLOT23, TypeM>, Enc_1997594 {
+let Inst{7-5} = 0b010;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11100100110;
+let prefersSlot3 = 1;
+}
+def M2_mpyud_lh_s0 : HInst<
+(outs DoubleRegs:$Rdd32),
+(ins IntRegs:$Rs32, IntRegs:$Rt32),
+"$Rdd32 = mpyu($Rs32.l,$Rt32.h)",
+M_tc_3x_SLOT23, TypeM>, Enc_1997594 {
+let Inst{7-5} = 0b001;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11100100010;
+let prefersSlot3 = 1;
+}
+def M2_mpyud_lh_s1 : HInst<
+(outs DoubleRegs:$Rdd32),
+(ins IntRegs:$Rs32, IntRegs:$Rt32),
+"$Rdd32 = mpyu($Rs32.l,$Rt32.h):<<1",
+M_tc_3x_SLOT23, TypeM>, Enc_1997594 {
+let Inst{7-5} = 0b001;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11100100110;
+let prefersSlot3 = 1;
+}
+def M2_mpyud_ll_s0 : HInst<
+(outs DoubleRegs:$Rdd32),
+(ins IntRegs:$Rs32, IntRegs:$Rt32),
+"$Rdd32 = mpyu($Rs32.l,$Rt32.l)",
+M_tc_3x_SLOT23, TypeM>, Enc_1997594 {
+let Inst{7-5} = 0b000;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11100100010;
+let prefersSlot3 = 1;
+}
+def M2_mpyud_ll_s1 : HInst<
+(outs DoubleRegs:$Rdd32),
+(ins IntRegs:$Rs32, IntRegs:$Rt32),
+"$Rdd32 = mpyu($Rs32.l,$Rt32.l):<<1",
+M_tc_3x_SLOT23, TypeM>, Enc_1997594 {
+let Inst{7-5} = 0b000;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11100100110;
+let prefersSlot3 = 1;
+}
+def M2_mpyud_nac_hh_s0 : HInst<
+(outs DoubleRegs:$Rxx32),
+(ins DoubleRegs:$Rxx32in, IntRegs:$Rs32, IntRegs:$Rt32),
+"$Rxx32 -= mpyu($Rs32.h,$Rt32.h)",
+M_tc_3x_acc_SLOT23, TypeM>, Enc_1409050 {
+let Inst{7-5} = 0b011;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11100110011;
+let prefersSlot3 = 1;
+let Constraints = "$Rxx32 = $Rxx32in";
+}
+def M2_mpyud_nac_hh_s1 : HInst<
+(outs DoubleRegs:$Rxx32),
+(ins DoubleRegs:$Rxx32in, IntRegs:$Rs32, IntRegs:$Rt32),
+"$Rxx32 -= mpyu($Rs32.h,$Rt32.h):<<1",
+M_tc_3x_acc_SLOT23, TypeM>, Enc_1409050 {
+let Inst{7-5} = 0b011;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11100110111;
+let prefersSlot3 = 1;
+let Constraints = "$Rxx32 = $Rxx32in";
+}
+def M2_mpyud_nac_hl_s0 : HInst<
+(outs DoubleRegs:$Rxx32),
+(ins DoubleRegs:$Rxx32in, IntRegs:$Rs32, IntRegs:$Rt32),
+"$Rxx32 -= mpyu($Rs32.h,$Rt32.l)",
+M_tc_3x_acc_SLOT23, TypeM>, Enc_1409050 {
+let Inst{7-5} = 0b010;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11100110011;
+let prefersSlot3 = 1;
+let Constraints = "$Rxx32 = $Rxx32in";
+}
+def M2_mpyud_nac_hl_s1 : HInst<
+(outs DoubleRegs:$Rxx32),
+(ins DoubleRegs:$Rxx32in, IntRegs:$Rs32, IntRegs:$Rt32),
+"$Rxx32 -= mpyu($Rs32.h,$Rt32.l):<<1",
+M_tc_3x_acc_SLOT23, TypeM>, Enc_1409050 {
+let Inst{7-5} = 0b010;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11100110111;
+let prefersSlot3 = 1;
+let Constraints = "$Rxx32 = $Rxx32in";
+}
+def M2_mpyud_nac_lh_s0 : HInst<
+(outs DoubleRegs:$Rxx32),
+(ins DoubleRegs:$Rxx32in, IntRegs:$Rs32, IntRegs:$Rt32),
+"$Rxx32 -= mpyu($Rs32.l,$Rt32.h)",
+M_tc_3x_acc_SLOT23, TypeM>, Enc_1409050 {
+let Inst{7-5} = 0b001;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11100110011;
+let prefersSlot3 = 1;
+let Constraints = "$Rxx32 = $Rxx32in";
+}
+def M2_mpyud_nac_lh_s1 : HInst<
+(outs DoubleRegs:$Rxx32),
+(ins DoubleRegs:$Rxx32in, IntRegs:$Rs32, IntRegs:$Rt32),
+"$Rxx32 -= mpyu($Rs32.l,$Rt32.h):<<1",
+M_tc_3x_acc_SLOT23, TypeM>, Enc_1409050 {
+let Inst{7-5} = 0b001;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11100110111;
+let prefersSlot3 = 1;
+let Constraints = "$Rxx32 = $Rxx32in";
+}
+def M2_mpyud_nac_ll_s0 : HInst<
+(outs DoubleRegs:$Rxx32),
+(ins DoubleRegs:$Rxx32in, IntRegs:$Rs32, IntRegs:$Rt32),
+"$Rxx32 -= mpyu($Rs32.l,$Rt32.l)",
+M_tc_3x_acc_SLOT23, TypeM>, Enc_1409050 {
+let Inst{7-5} = 0b000;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11100110011;
+let prefersSlot3 = 1;
+let Constraints = "$Rxx32 = $Rxx32in";
+}
+def M2_mpyud_nac_ll_s1 : HInst<
+(outs DoubleRegs:$Rxx32),
+(ins DoubleRegs:$Rxx32in, IntRegs:$Rs32, IntRegs:$Rt32),
+"$Rxx32 -= mpyu($Rs32.l,$Rt32.l):<<1",
+M_tc_3x_acc_SLOT23, TypeM>, Enc_1409050 {
+let Inst{7-5} = 0b000;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11100110111;
+let prefersSlot3 = 1;
+let Constraints = "$Rxx32 = $Rxx32in";
+}
+def M2_mpyui : HInst<
+(outs IntRegs:$Rd32),
+(ins IntRegs:$Rs32, IntRegs:$Rt32),
+"$Rd32 = mpyui($Rs32,$Rt32)",
+M_tc_3x_SLOT23, TypeM> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+}
+def M2_nacci : HInst<
+(outs IntRegs:$Rx32),
+(ins IntRegs:$Rx32in, IntRegs:$Rs32, IntRegs:$Rt32),
+"$Rx32 -= add($Rs32,$Rt32)",
+M_tc_2_acc_SLOT23, TypeM>, Enc_9223889 {
+let Inst{7-5} = 0b001;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11101111100;
+let hasNewValue = 1;
+let opNewValue = 0;
+let prefersSlot3 = 1;
+let InputType = "reg";
+let Constraints = "$Rx32 = $Rx32in";
+}
+def M2_naccii : HInst<
+(outs IntRegs:$Rx32),
+(ins IntRegs:$Rx32in, IntRegs:$Rs32, s32_0Imm:$Ii),
+"$Rx32 -= add($Rs32,#$Ii)",
+M_tc_2_acc_SLOT23, TypeM>, Enc_11522288 {
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11100010100;
+let hasNewValue = 1;
+let opNewValue = 0;
+let prefersSlot3 = 1;
+let InputType = "imm";
+let isExtendable = 1;
+let opExtendable = 3;
+let isExtentSigned = 1;
+let opExtentBits = 8;
+let opExtentAlign = 0;
+let Constraints = "$Rx32 = $Rx32in";
+}
+def M2_subacc : HInst<
+(outs IntRegs:$Rx32),
+(ins IntRegs:$Rx32in, IntRegs:$Rt32, IntRegs:$Rs32),
+"$Rx32 += sub($Rt32,$Rs32)",
+M_tc_2_acc_SLOT23, TypeM>, Enc_7692963 {
+let Inst{7-5} = 0b011;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11101111000;
+let hasNewValue = 1;
+let opNewValue = 0;
+let prefersSlot3 = 1;
+let InputType = "reg";
+let Constraints = "$Rx32 = $Rx32in";
+}
+def M2_vabsdiffh : HInst<
+(outs DoubleRegs:$Rdd32),
+(ins DoubleRegs:$Rtt32, DoubleRegs:$Rss32),
+"$Rdd32 = vabsdiffh($Rtt32,$Rss32)",
+M_tc_2_SLOT23, TypeM>, Enc_11687333 {
+let Inst{7-5} = 0b000;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11101000011;
+let prefersSlot3 = 1;
+}
+def M2_vabsdiffw : HInst<
+(outs DoubleRegs:$Rdd32),
+(ins DoubleRegs:$Rtt32, DoubleRegs:$Rss32),
+"$Rdd32 = vabsdiffw($Rtt32,$Rss32)",
+M_tc_2_SLOT23, TypeM>, Enc_11687333 {
+let Inst{7-5} = 0b000;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11101000001;
+let prefersSlot3 = 1;
+}
+def M2_vcmac_s0_sat_i : HInst<
+(outs DoubleRegs:$Rxx32),
+(ins DoubleRegs:$Rxx32in, DoubleRegs:$Rss32, DoubleRegs:$Rtt32),
+"$Rxx32 += vcmpyi($Rss32,$Rtt32):sat",
+M_tc_3x_acc_SLOT23, TypeM>, Enc_12702821 {
+let Inst{7-5} = 0b100;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11101010010;
+let prefersSlot3 = 1;
+let Defs = [USR_OVF];
+let Constraints = "$Rxx32 = $Rxx32in";
+}
+def M2_vcmac_s0_sat_r : HInst<
+(outs DoubleRegs:$Rxx32),
+(ins DoubleRegs:$Rxx32in, DoubleRegs:$Rss32, DoubleRegs:$Rtt32),
+"$Rxx32 += vcmpyr($Rss32,$Rtt32):sat",
+M_tc_3x_acc_SLOT23, TypeM>, Enc_12702821 {
+let Inst{7-5} = 0b100;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11101010001;
+let prefersSlot3 = 1;
+let Defs = [USR_OVF];
+let Constraints = "$Rxx32 = $Rxx32in";
+}
+def M2_vcmpy_s0_sat_i : HInst<
+(outs DoubleRegs:$Rdd32),
+(ins DoubleRegs:$Rss32, DoubleRegs:$Rtt32),
+"$Rdd32 = vcmpyi($Rss32,$Rtt32):sat",
+M_tc_3x_acc_SLOT23, TypeM>, Enc_8333157 {
+let Inst{7-5} = 0b110;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11101000010;
+let prefersSlot3 = 1;
+let Defs = [USR_OVF];
+}
+def M2_vcmpy_s0_sat_r : HInst<
+(outs DoubleRegs:$Rdd32),
+(ins DoubleRegs:$Rss32, DoubleRegs:$Rtt32),
+"$Rdd32 = vcmpyr($Rss32,$Rtt32):sat",
+M_tc_3x_acc_SLOT23, TypeM>, Enc_8333157 {
+let Inst{7-5} = 0b110;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11101000001;
+let prefersSlot3 = 1;
+let Defs = [USR_OVF];
+}
+def M2_vcmpy_s1_sat_i : HInst<
+(outs DoubleRegs:$Rdd32),
+(ins DoubleRegs:$Rss32, DoubleRegs:$Rtt32),
+"$Rdd32 = vcmpyi($Rss32,$Rtt32):<<1:sat",
+M_tc_3x_acc_SLOT23, TypeM>, Enc_8333157 {
+let Inst{7-5} = 0b110;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11101000110;
+let prefersSlot3 = 1;
+let Defs = [USR_OVF];
+}
+def M2_vcmpy_s1_sat_r : HInst<
+(outs DoubleRegs:$Rdd32),
+(ins DoubleRegs:$Rss32, DoubleRegs:$Rtt32),
+"$Rdd32 = vcmpyr($Rss32,$Rtt32):<<1:sat",
+M_tc_3x_acc_SLOT23, TypeM>, Enc_8333157 {
+let Inst{7-5} = 0b110;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11101000101;
+let prefersSlot3 = 1;
+let Defs = [USR_OVF];
+}
+def M2_vdmacs_s0 : HInst<
+(outs DoubleRegs:$Rxx32),
+(ins DoubleRegs:$Rxx32in, DoubleRegs:$Rss32, DoubleRegs:$Rtt32),
+"$Rxx32 += vdmpy($Rss32,$Rtt32):sat",
+M_tc_3x_acc_SLOT23, TypeM>, Enc_12702821 {
+let Inst{7-5} = 0b100;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11101010000;
+let prefersSlot3 = 1;
+let Defs = [USR_OVF];
+let Constraints = "$Rxx32 = $Rxx32in";
+}
+def M2_vdmacs_s1 : HInst<
+(outs DoubleRegs:$Rxx32),
+(ins DoubleRegs:$Rxx32in, DoubleRegs:$Rss32, DoubleRegs:$Rtt32),
+"$Rxx32 += vdmpy($Rss32,$Rtt32):<<1:sat",
+M_tc_3x_acc_SLOT23, TypeM>, Enc_12702821 {
+let Inst{7-5} = 0b100;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11101010100;
+let prefersSlot3 = 1;
+let Defs = [USR_OVF];
+let Constraints = "$Rxx32 = $Rxx32in";
+}
+def M2_vdmpyrs_s0 : HInst<
+(outs IntRegs:$Rd32),
+(ins DoubleRegs:$Rss32, DoubleRegs:$Rtt32),
+"$Rd32 = vdmpy($Rss32,$Rtt32):rnd:sat",
+M_tc_3x_SLOT23, TypeM>, Enc_9277990 {
+let Inst{7-5} = 0b000;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11101001000;
+let hasNewValue = 1;
+let opNewValue = 0;
+let prefersSlot3 = 1;
+let Defs = [USR_OVF];
+}
+def M2_vdmpyrs_s1 : HInst<
+(outs IntRegs:$Rd32),
+(ins DoubleRegs:$Rss32, DoubleRegs:$Rtt32),
+"$Rd32 = vdmpy($Rss32,$Rtt32):<<1:rnd:sat",
+M_tc_3x_SLOT23, TypeM>, Enc_9277990 {
+let Inst{7-5} = 0b000;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11101001100;
+let hasNewValue = 1;
+let opNewValue = 0;
+let prefersSlot3 = 1;
+let Defs = [USR_OVF];
+}
+def M2_vdmpys_s0 : HInst<
+(outs DoubleRegs:$Rdd32),
+(ins DoubleRegs:$Rss32, DoubleRegs:$Rtt32),
+"$Rdd32 = vdmpy($Rss32,$Rtt32):sat",
+M_tc_3x_SLOT23, TypeM>, Enc_8333157 {
+let Inst{7-5} = 0b100;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11101000000;
+let prefersSlot3 = 1;
+let Defs = [USR_OVF];
+}
+def M2_vdmpys_s1 : HInst<
+(outs DoubleRegs:$Rdd32),
+(ins DoubleRegs:$Rss32, DoubleRegs:$Rtt32),
+"$Rdd32 = vdmpy($Rss32,$Rtt32):<<1:sat",
+M_tc_3x_SLOT23, TypeM>, Enc_8333157 {
+let Inst{7-5} = 0b100;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11101000100;
+let prefersSlot3 = 1;
+let Defs = [USR_OVF];
+}
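+// vdmpy comes in two result widths: the :rnd:sat forms (M2_vdmpyrs_*) pack
+// the rounded result into a single Rd32, while the plain :sat forms
+// (M2_vdmpys_*) return the full 64-bit pair.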
+def M2_vmac2 : HInst<
+(outs DoubleRegs:$Rxx32),
+(ins DoubleRegs:$Rxx32in, IntRegs:$Rs32, IntRegs:$Rt32),
+"$Rxx32 += vmpyh($Rs32,$Rt32)",
+M_tc_3x_acc_SLOT23, TypeM>, Enc_1409050 {
+let Inst{7-5} = 0b001;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11100111001;
+let prefersSlot3 = 1;
+let Constraints = "$Rxx32 = $Rxx32in";
+}
+def M2_vmac2es : HInst<
+(outs DoubleRegs:$Rxx32),
+(ins DoubleRegs:$Rxx32in, DoubleRegs:$Rss32, DoubleRegs:$Rtt32),
+"$Rxx32 += vmpyeh($Rss32,$Rtt32)",
+M_tc_3x_acc_SLOT23, TypeM>, Enc_12702821 {
+let Inst{7-5} = 0b010;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11101010001;
+let prefersSlot3 = 1;
+let Constraints = "$Rxx32 = $Rxx32in";
+}
+def M2_vmac2es_s0 : HInst<
+(outs DoubleRegs:$Rxx32),
+(ins DoubleRegs:$Rxx32in, DoubleRegs:$Rss32, DoubleRegs:$Rtt32),
+"$Rxx32 += vmpyeh($Rss32,$Rtt32):sat",
+M_tc_3x_acc_SLOT23, TypeM>, Enc_12702821 {
+let Inst{7-5} = 0b110;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11101010000;
+let prefersSlot3 = 1;
+let Defs = [USR_OVF];
+let Constraints = "$Rxx32 = $Rxx32in";
+}
+def M2_vmac2es_s1 : HInst<
+(outs DoubleRegs:$Rxx32),
+(ins DoubleRegs:$Rxx32in, DoubleRegs:$Rss32, DoubleRegs:$Rtt32),
+"$Rxx32 += vmpyeh($Rss32,$Rtt32):<<1:sat",
+M_tc_3x_acc_SLOT23, TypeM>, Enc_12702821 {
+let Inst{7-5} = 0b110;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11101010100;
+let prefersSlot3 = 1;
+let Defs = [USR_OVF];
+let Constraints = "$Rxx32 = $Rxx32in";
+}
+def M2_vmac2s_s0 : HInst<
+(outs DoubleRegs:$Rxx32),
+(ins DoubleRegs:$Rxx32in, IntRegs:$Rs32, IntRegs:$Rt32),
+"$Rxx32 += vmpyh($Rs32,$Rt32):sat",
+M_tc_3x_acc_SLOT23, TypeM>, Enc_1409050 {
+let Inst{7-5} = 0b101;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11100111000;
+let prefersSlot3 = 1;
+let Defs = [USR_OVF];
+let Constraints = "$Rxx32 = $Rxx32in";
+}
+def M2_vmac2s_s1 : HInst<
+(outs DoubleRegs:$Rxx32),
+(ins DoubleRegs:$Rxx32in, IntRegs:$Rs32, IntRegs:$Rt32),
+"$Rxx32 += vmpyh($Rs32,$Rt32):<<1:sat",
+M_tc_3x_acc_SLOT23, TypeM>, Enc_1409050 {
+let Inst{7-5} = 0b101;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11100111100;
+let prefersSlot3 = 1;
+let Defs = [USR_OVF];
+let Constraints = "$Rxx32 = $Rxx32in";
+}
+def M2_vmac2su_s0 : HInst<
+(outs DoubleRegs:$Rxx32),
+(ins DoubleRegs:$Rxx32in, IntRegs:$Rs32, IntRegs:$Rt32),
+"$Rxx32 += vmpyhsu($Rs32,$Rt32):sat",
+M_tc_3x_acc_SLOT23, TypeM>, Enc_1409050 {
+let Inst{7-5} = 0b101;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11100111011;
+let prefersSlot3 = 1;
+let Defs = [USR_OVF];
+let Constraints = "$Rxx32 = $Rxx32in";
+}
+def M2_vmac2su_s1 : HInst<
+(outs DoubleRegs:$Rxx32),
+(ins DoubleRegs:$Rxx32in, IntRegs:$Rs32, IntRegs:$Rt32),
+"$Rxx32 += vmpyhsu($Rs32,$Rt32):<<1:sat",
+M_tc_3x_acc_SLOT23, TypeM>, Enc_1409050 {
+let Inst{7-5} = 0b101;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11100111111;
+let prefersSlot3 = 1;
+let Defs = [USR_OVF];
+let Constraints = "$Rxx32 = $Rxx32in";
+}
+def M2_vmpy2es_s0 : HInst<
+(outs DoubleRegs:$Rdd32),
+(ins DoubleRegs:$Rss32, DoubleRegs:$Rtt32),
+"$Rdd32 = vmpyeh($Rss32,$Rtt32):sat",
+M_tc_3x_SLOT23, TypeM>, Enc_8333157 {
+let Inst{7-5} = 0b110;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11101000000;
+let prefersSlot3 = 1;
+let Defs = [USR_OVF];
+}
+def M2_vmpy2es_s1 : HInst<
+(outs DoubleRegs:$Rdd32),
+(ins DoubleRegs:$Rss32, DoubleRegs:$Rtt32),
+"$Rdd32 = vmpyeh($Rss32,$Rtt32):<<1:sat",
+M_tc_3x_SLOT23, TypeM>, Enc_8333157 {
+let Inst{7-5} = 0b110;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11101000100;
+let prefersSlot3 = 1;
+let Defs = [USR_OVF];
+}
+def M2_vmpy2s_s0 : HInst<
+(outs DoubleRegs:$Rdd32),
+(ins IntRegs:$Rs32, IntRegs:$Rt32),
+"$Rdd32 = vmpyh($Rs32,$Rt32):sat",
+M_tc_3x_SLOT23, TypeM>, Enc_1997594 {
+let Inst{7-5} = 0b101;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11100101000;
+let prefersSlot3 = 1;
+let Defs = [USR_OVF];
+}
+def M2_vmpy2s_s0pack : HInst<
+(outs IntRegs:$Rd32),
+(ins IntRegs:$Rs32, IntRegs:$Rt32),
+"$Rd32 = vmpyh($Rs32,$Rt32):rnd:sat",
+M_tc_3x_SLOT23, TypeM>, Enc_14071773 {
+let Inst{7-5} = 0b111;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11101101001;
+let hasNewValue = 1;
+let opNewValue = 0;
+let prefersSlot3 = 1;
+let Defs = [USR_OVF];
+}
+def M2_vmpy2s_s1 : HInst<
+(outs DoubleRegs:$Rdd32),
+(ins IntRegs:$Rs32, IntRegs:$Rt32),
+"$Rdd32 = vmpyh($Rs32,$Rt32):<<1:sat",
+M_tc_3x_SLOT23, TypeM>, Enc_1997594 {
+let Inst{7-5} = 0b101;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11100101100;
+let prefersSlot3 = 1;
+let Defs = [USR_OVF];
+}
+def M2_vmpy2s_s1pack : HInst<
+(outs IntRegs:$Rd32),
+(ins IntRegs:$Rs32, IntRegs:$Rt32),
+"$Rd32 = vmpyh($Rs32,$Rt32):<<1:rnd:sat",
+M_tc_3x_SLOT23, TypeM>, Enc_14071773 {
+let Inst{7-5} = 0b111;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11101101101;
+let hasNewValue = 1;
+let opNewValue = 0;
+let prefersSlot3 = 1;
+let Defs = [USR_OVF];
+}
+def M2_vmpy2su_s0 : HInst<
+(outs DoubleRegs:$Rdd32),
+(ins IntRegs:$Rs32, IntRegs:$Rt32),
+"$Rdd32 = vmpyhsu($Rs32,$Rt32):sat",
+M_tc_3x_SLOT23, TypeM>, Enc_1997594 {
+let Inst{7-5} = 0b111;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11100101000;
+let prefersSlot3 = 1;
+let Defs = [USR_OVF];
+}
+def M2_vmpy2su_s1 : HInst<
+(outs DoubleRegs:$Rdd32),
+(ins IntRegs:$Rs32, IntRegs:$Rt32),
+"$Rdd32 = vmpyhsu($Rs32,$Rt32):<<1:sat",
+M_tc_3x_SLOT23, TypeM>, Enc_1997594 {
+let Inst{7-5} = 0b111;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11100101100;
+let prefersSlot3 = 1;
+let Defs = [USR_OVF];
+}
+def M2_vraddh : HInst<
+(outs IntRegs:$Rd32),
+(ins DoubleRegs:$Rss32, DoubleRegs:$Rtt32),
+"$Rd32 = vraddh($Rss32,$Rtt32)",
+M_tc_3x_SLOT23, TypeM>, Enc_9277990 {
+let Inst{7-5} = 0b111;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11101001001;
+let hasNewValue = 1;
+let opNewValue = 0;
+let prefersSlot3 = 1;
+}
+def M2_vradduh : HInst<
+(outs IntRegs:$Rd32),
+(ins DoubleRegs:$Rss32, DoubleRegs:$Rtt32),
+"$Rd32 = vradduh($Rss32,$Rtt32)",
+M_tc_3x_SLOT23, TypeM>, Enc_9277990 {
+let Inst{7-5} = 0b001;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11101001000;
+let hasNewValue = 1;
+let opNewValue = 0;
+let prefersSlot3 = 1;
+}
+def M2_vrcmaci_s0 : HInst<
+(outs DoubleRegs:$Rxx32),
+(ins DoubleRegs:$Rxx32in, DoubleRegs:$Rss32, DoubleRegs:$Rtt32),
+"$Rxx32 += vrcmpyi($Rss32,$Rtt32)",
+M_tc_3x_acc_SLOT23, TypeM>, Enc_12702821 {
+let Inst{7-5} = 0b000;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11101010000;
+let prefersSlot3 = 1;
+let Constraints = "$Rxx32 = $Rxx32in";
+}
+def M2_vrcmaci_s0c : HInst<
+(outs DoubleRegs:$Rxx32),
+(ins DoubleRegs:$Rxx32in, DoubleRegs:$Rss32, DoubleRegs:$Rtt32),
+"$Rxx32 += vrcmpyi($Rss32,$Rtt32*)",
+M_tc_3x_acc_SLOT23, TypeM>, Enc_12702821 {
+let Inst{7-5} = 0b000;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11101010010;
+let prefersSlot3 = 1;
+let Constraints = "$Rxx32 = $Rxx32in";
+}
+def M2_vrcmacr_s0 : HInst<
+(outs DoubleRegs:$Rxx32),
+(ins DoubleRegs:$Rxx32in, DoubleRegs:$Rss32, DoubleRegs:$Rtt32),
+"$Rxx32 += vrcmpyr($Rss32,$Rtt32)",
+M_tc_3x_acc_SLOT23, TypeM>, Enc_12702821 {
+let Inst{7-5} = 0b001;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11101010000;
+let prefersSlot3 = 1;
+let Constraints = "$Rxx32 = $Rxx32in";
+}
+def M2_vrcmacr_s0c : HInst<
+(outs DoubleRegs:$Rxx32),
+(ins DoubleRegs:$Rxx32in, DoubleRegs:$Rss32, DoubleRegs:$Rtt32),
+"$Rxx32 += vrcmpyr($Rss32,$Rtt32*)",
+M_tc_3x_acc_SLOT23, TypeM>, Enc_12702821 {
+let Inst{7-5} = 0b001;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11101010011;
+let prefersSlot3 = 1;
+let Constraints = "$Rxx32 = $Rxx32in";
+}
+def M2_vrcmpyi_s0 : HInst<
+(outs DoubleRegs:$Rdd32),
+(ins DoubleRegs:$Rss32, DoubleRegs:$Rtt32),
+"$Rdd32 = vrcmpyi($Rss32,$Rtt32)",
+M_tc_3x_SLOT23, TypeM>, Enc_8333157 {
+let Inst{7-5} = 0b000;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11101000000;
+let prefersSlot3 = 1;
+}
+def M2_vrcmpyi_s0c : HInst<
+(outs DoubleRegs:$Rdd32),
+(ins DoubleRegs:$Rss32, DoubleRegs:$Rtt32),
+"$Rdd32 = vrcmpyi($Rss32,$Rtt32*)",
+M_tc_3x_SLOT23, TypeM>, Enc_8333157 {
+let Inst{7-5} = 0b000;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11101000010;
+let prefersSlot3 = 1;
+}
+def M2_vrcmpyr_s0 : HInst<
+(outs DoubleRegs:$Rdd32),
+(ins DoubleRegs:$Rss32, DoubleRegs:$Rtt32),
+"$Rdd32 = vrcmpyr($Rss32,$Rtt32)",
+M_tc_3x_SLOT23, TypeM>, Enc_8333157 {
+let Inst{7-5} = 0b001;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11101000000;
+let prefersSlot3 = 1;
+}
+def M2_vrcmpyr_s0c : HInst<
+(outs DoubleRegs:$Rdd32),
+(ins DoubleRegs:$Rss32, DoubleRegs:$Rtt32),
+"$Rdd32 = vrcmpyr($Rss32,$Rtt32*)",
+M_tc_3x_SLOT23, TypeM>, Enc_8333157 {
+let Inst{7-5} = 0b001;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11101000011;
+let prefersSlot3 = 1;
+}
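+// In the vector complex-multiply mnemonics a trailing '*' on $Rtt32 marks the
+// conjugate (_s0c) forms, which negate the imaginary halfwords of the second
+// operand before the reduction.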
+def M2_vrcmpys_acc_s1 : HInst<
+(outs DoubleRegs:$Rxx32),
+(ins DoubleRegs:$Rxx32in, DoubleRegs:$Rss32, IntRegs:$Rt32),
+"$Rxx32 += vrcmpys($Rss32,$Rt32):<<1:sat",
+M_tc_3x_SLOT23, TypeM> {
+let isPseudo = 1;
+let Constraints = "$Rxx32 = $Rxx32in";
+}
+def M2_vrcmpys_acc_s1_h : HInst<
+(outs DoubleRegs:$Rxx32),
+(ins DoubleRegs:$Rxx32in, DoubleRegs:$Rss32, DoubleRegs:$Rtt32),
+"$Rxx32 += vrcmpys($Rss32,$Rtt32):<<1:sat:raw:hi",
+M_tc_3x_SLOT23, TypeM>, Enc_12702821 {
+let Inst{7-5} = 0b100;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11101010101;
+let prefersSlot3 = 1;
+let Defs = [USR_OVF];
+let Constraints = "$Rxx32 = $Rxx32in";
+}
+def M2_vrcmpys_acc_s1_l : HInst<
+(outs DoubleRegs:$Rxx32),
+(ins DoubleRegs:$Rxx32in, DoubleRegs:$Rss32, DoubleRegs:$Rtt32),
+"$Rxx32 += vrcmpys($Rss32,$Rtt32):<<1:sat:raw:lo",
+M_tc_3x_SLOT23, TypeM>, Enc_12702821 {
+let Inst{7-5} = 0b100;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11101010111;
+let prefersSlot3 = 1;
+let Defs = [USR_OVF];
+let Constraints = "$Rxx32 = $Rxx32in";
+}
+def M2_vrcmpys_s1 : HInst<
+(outs DoubleRegs:$Rdd32),
+(ins DoubleRegs:$Rss32, IntRegs:$Rt32),
+"$Rdd32 = vrcmpys($Rss32,$Rt32):<<1:sat",
+M_tc_3x_SLOT23, TypeM> {
+let isPseudo = 1;
+}
+def M2_vrcmpys_s1_h : HInst<
+(outs DoubleRegs:$Rdd32),
+(ins DoubleRegs:$Rss32, DoubleRegs:$Rtt32),
+"$Rdd32 = vrcmpys($Rss32,$Rtt32):<<1:sat:raw:hi",
+M_tc_3x_SLOT23, TypeM>, Enc_8333157 {
+let Inst{7-5} = 0b100;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11101000101;
+let prefersSlot3 = 1;
+let Defs = [USR_OVF];
+}
+def M2_vrcmpys_s1_l : HInst<
+(outs DoubleRegs:$Rdd32),
+(ins DoubleRegs:$Rss32, DoubleRegs:$Rtt32),
+"$Rdd32 = vrcmpys($Rss32,$Rtt32):<<1:sat:raw:lo",
+M_tc_3x_SLOT23, TypeM>, Enc_8333157 {
+let Inst{7-5} = 0b100;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11101000111;
+let prefersSlot3 = 1;
+let Defs = [USR_OVF];
+}
+def M2_vrcmpys_s1rp : HInst<
+(outs IntRegs:$Rd32),
+(ins DoubleRegs:$Rss32, IntRegs:$Rt32),
+"$Rd32 = vrcmpys($Rss32,$Rt32):<<1:rnd:sat",
+M_tc_3x_SLOT23, TypeM> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isPseudo = 1;
+}
+def M2_vrcmpys_s1rp_h : HInst<
+(outs IntRegs:$Rd32),
+(ins DoubleRegs:$Rss32, DoubleRegs:$Rtt32),
+"$Rd32 = vrcmpys($Rss32,$Rtt32):<<1:rnd:sat:raw:hi",
+M_tc_3x_SLOT23, TypeM>, Enc_9277990 {
+let Inst{7-5} = 0b110;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11101001101;
+let hasNewValue = 1;
+let opNewValue = 0;
+let prefersSlot3 = 1;
+let Defs = [USR_OVF];
+}
+def M2_vrcmpys_s1rp_l : HInst<
+(outs IntRegs:$Rd32),
+(ins DoubleRegs:$Rss32, DoubleRegs:$Rtt32),
+"$Rd32 = vrcmpys($Rss32,$Rtt32):<<1:rnd:sat:raw:lo",
+M_tc_3x_SLOT23, TypeM>, Enc_9277990 {
+let Inst{7-5} = 0b111;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11101001101;
+let hasNewValue = 1;
+let opNewValue = 0;
+let prefersSlot3 = 1;
+let Defs = [USR_OVF];
+}
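+// The vrcmpys pseudos above (isPseudo = 1, no Enc_* class) take a single
+// IntRegs operand; they are likely resolved to the corresponding :raw:hi or
+// :raw:lo encoding according to whether $Rt32 names the high or low half of
+// a register pair.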
+def M2_vrmac_s0 : HInst<
+(outs DoubleRegs:$Rxx32),
+(ins DoubleRegs:$Rxx32in, DoubleRegs:$Rss32, DoubleRegs:$Rtt32),
+"$Rxx32 += vrmpyh($Rss32,$Rtt32)",
+M_tc_3x_acc_SLOT23, TypeM>, Enc_12702821 {
+let Inst{7-5} = 0b010;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11101010000;
+let prefersSlot3 = 1;
+let Constraints = "$Rxx32 = $Rxx32in";
+}
+def M2_vrmpy_s0 : HInst<
+(outs DoubleRegs:$Rdd32),
+(ins DoubleRegs:$Rss32, DoubleRegs:$Rtt32),
+"$Rdd32 = vrmpyh($Rss32,$Rtt32)",
+M_tc_3x_SLOT23, TypeM>, Enc_8333157 {
+let Inst{7-5} = 0b010;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11101000000;
+let prefersSlot3 = 1;
+}
+def M2_xor_xacc : HInst<
+(outs IntRegs:$Rx32),
+(ins IntRegs:$Rx32in, IntRegs:$Rs32, IntRegs:$Rt32),
+"$Rx32 ^= xor($Rs32,$Rt32)",
+M_tc_2_acc_SLOT23, TypeM>, Enc_9223889 {
+let Inst{7-5} = 0b011;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11101111100;
+let hasNewValue = 1;
+let opNewValue = 0;
+let prefersSlot3 = 1;
+let InputType = "reg";
+let Constraints = "$Rx32 = $Rx32in";
+}
+def M4_and_and : HInst<
+(outs IntRegs:$Rx32),
+(ins IntRegs:$Rx32in, IntRegs:$Rs32, IntRegs:$Rt32),
+"$Rx32 &= and($Rs32,$Rt32)",
+M_tc_2_acc_SLOT23, TypeM>, Enc_9223889 {
+let Inst{7-5} = 0b000;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11101111010;
+let hasNewValue = 1;
+let opNewValue = 0;
+let prefersSlot3 = 1;
+let InputType = "reg";
+let Constraints = "$Rx32 = $Rx32in";
+}
+def M4_and_andn : HInst<
+(outs IntRegs:$Rx32),
+(ins IntRegs:$Rx32in, IntRegs:$Rs32, IntRegs:$Rt32),
+"$Rx32 &= and($Rs32,~$Rt32)",
+M_tc_2_acc_SLOT23, TypeM>, Enc_9223889 {
+let Inst{7-5} = 0b001;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11101111001;
+let hasNewValue = 1;
+let opNewValue = 0;
+let prefersSlot3 = 1;
+let InputType = "reg";
+let Constraints = "$Rx32 = $Rx32in";
+}
+def M4_and_or : HInst<
+(outs IntRegs:$Rx32),
+(ins IntRegs:$Rx32in, IntRegs:$Rs32, IntRegs:$Rt32),
+"$Rx32 &= or($Rs32,$Rt32)",
+M_tc_2_acc_SLOT23, TypeM>, Enc_9223889 {
+let Inst{7-5} = 0b001;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11101111010;
+let hasNewValue = 1;
+let opNewValue = 0;
+let prefersSlot3 = 1;
+let InputType = "reg";
+let Constraints = "$Rx32 = $Rx32in";
+}
+def M4_and_xor : HInst<
+(outs IntRegs:$Rx32),
+(ins IntRegs:$Rx32in, IntRegs:$Rs32, IntRegs:$Rt32),
+"$Rx32 &= xor($Rs32,$Rt32)",
+M_tc_2_acc_SLOT23, TypeM>, Enc_9223889 {
+let Inst{7-5} = 0b010;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11101111010;
+let hasNewValue = 1;
+let opNewValue = 0;
+let prefersSlot3 = 1;
+let InputType = "reg";
+let Constraints = "$Rx32 = $Rx32in";
+}
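+// Complex word-by-halfword multiplies; the *_whc forms conjugate Rt
+// (the "$Rt32*" in the syntax) and require V5.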
+def M4_cmpyi_wh : HInst<
+(outs IntRegs:$Rd32),
+(ins DoubleRegs:$Rss32, IntRegs:$Rt32),
+"$Rd32 = cmpyiwh($Rss32,$Rt32):<<1:rnd:sat",
+S_3op_tc_3x_SLOT23, TypeS_3op>, Enc_14287645 {
+let Inst{7-5} = 0b100;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11000101000;
+let hasNewValue = 1;
+let opNewValue = 0;
+let prefersSlot3 = 1;
+let Defs = [USR_OVF];
+}
+def M4_cmpyi_whc : HInst<
+(outs IntRegs:$Rd32),
+(ins DoubleRegs:$Rss32, IntRegs:$Rt32),
+"$Rd32 = cmpyiwh($Rss32,$Rt32*):<<1:rnd:sat",
+S_3op_tc_3x_SLOT23, TypeS_3op>, Enc_14287645, Requires<[HasV5T]> {
+let Inst{7-5} = 0b101;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11000101000;
+let hasNewValue = 1;
+let opNewValue = 0;
+let prefersSlot3 = 1;
+let Defs = [USR_OVF];
+}
+def M4_cmpyr_wh : HInst<
+(outs IntRegs:$Rd32),
+(ins DoubleRegs:$Rss32, IntRegs:$Rt32),
+"$Rd32 = cmpyrwh($Rss32,$Rt32):<<1:rnd:sat",
+S_3op_tc_3x_SLOT23, TypeS_3op>, Enc_14287645 {
+let Inst{7-5} = 0b110;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11000101000;
+let hasNewValue = 1;
+let opNewValue = 0;
+let prefersSlot3 = 1;
+let Defs = [USR_OVF];
+}
+def M4_cmpyr_whc : HInst<
+(outs IntRegs:$Rd32),
+(ins DoubleRegs:$Rss32, IntRegs:$Rt32),
+"$Rd32 = cmpyrwh($Rss32,$Rt32*):<<1:rnd:sat",
+S_3op_tc_3x_SLOT23, TypeS_3op>, Enc_14287645, Requires<[HasV5T]> {
+let Inst{7-5} = 0b111;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11000101000;
+let hasNewValue = 1;
+let opNewValue = 0;
+let prefersSlot3 = 1;
+let Defs = [USR_OVF];
+}
+def M4_mac_up_s1_sat : HInst<
+(outs IntRegs:$Rx32),
+(ins IntRegs:$Rx32in, IntRegs:$Rs32, IntRegs:$Rt32),
+"$Rx32 += mpy($Rs32,$Rt32):<<1:sat",
+M_tc_3x_acc_SLOT23, TypeM>, Enc_9223889 {
+let Inst{7-5} = 0b000;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11101111011;
+let hasNewValue = 1;
+let opNewValue = 0;
+let prefersSlot3 = 1;
+let Defs = [USR_OVF];
+let InputType = "reg";
+let Constraints = "$Rx32 = $Rx32in";
+}
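+// Multiply-add with immediates. ImmRegRel ties each immediate form to its
+// register counterpart via CextOpcode; the #Ii operand is constant-extendable
+// (isExtendable/opExtentBits).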
+def M4_mpyri_addi : HInst<
+(outs IntRegs:$Rd32),
+(ins u32_0Imm:$Ii, IntRegs:$Rs32, u6_0Imm:$II),
+"$Rd32 = add(#$Ii,mpyi($Rs32,#$II))",
+ALU64_tc_3x_SLOT23, TypeALU64>, Enc_971574, ImmRegRel {
+let Inst{31-24} = 0b11011000;
+let hasNewValue = 1;
+let opNewValue = 0;
+let prefersSlot3 = 1;
+let CextOpcode = "M4_mpyri_addr";
+let isExtendable = 1;
+let opExtendable = 1;
+let isExtentSigned = 0;
+let opExtentBits = 6;
+let opExtentAlign = 0;
+}
+def M4_mpyri_addr : HInst<
+(outs IntRegs:$Rd32),
+(ins IntRegs:$Ru32, IntRegs:$Rs32, u32_0Imm:$Ii),
+"$Rd32 = add($Ru32,mpyi($Rs32,#$Ii))",
+ALU64_tc_3x_SLOT23, TypeALU64>, Enc_236434, ImmRegRel {
+let Inst{31-23} = 0b110111111;
+let hasNewValue = 1;
+let opNewValue = 0;
+let prefersSlot3 = 1;
+let CextOpcode = "M4_mpyri_addr";
+let InputType = "imm";
+let isExtendable = 1;
+let opExtendable = 3;
+let isExtentSigned = 0;
+let opExtentBits = 6;
+let opExtentAlign = 0;
+}
+def M4_mpyri_addr_u2 : HInst<
+(outs IntRegs:$Rd32),
+(ins IntRegs:$Ru32, u6_2Imm:$Ii, IntRegs:$Rs32),
+"$Rd32 = add($Ru32,mpyi(#$Ii,$Rs32))",
+ALU64_tc_3x_SLOT23, TypeALU64>, Enc_9959498 {
+let Inst{31-23} = 0b110111110;
+let hasNewValue = 1;
+let opNewValue = 0;
+let prefersSlot3 = 1;
+}
+def M4_mpyrr_addi : HInst<
+(outs IntRegs:$Rd32),
+(ins u32_0Imm:$Ii, IntRegs:$Rs32, IntRegs:$Rt32),
+"$Rd32 = add(#$Ii,mpyi($Rs32,$Rt32))",
+ALU64_tc_3x_SLOT23, TypeALU64>, Enc_2216485, ImmRegRel {
+let Inst{31-23} = 0b110101110;
+let hasNewValue = 1;
+let opNewValue = 0;
+let prefersSlot3 = 1;
+let CextOpcode = "M4_mpyrr_addr";
+let InputType = "imm";
+let isExtendable = 1;
+let opExtendable = 1;
+let isExtentSigned = 0;
+let opExtentBits = 6;
+let opExtentAlign = 0;
+}
+def M4_mpyrr_addr : HInst<
+(outs IntRegs:$Ry32),
+(ins IntRegs:$Ru32, IntRegs:$Ry32in, IntRegs:$Rs32),
+"$Ry32 = add($Ru32,mpyi($Ry32in,$Rs32))",
+M_tc_3x_SLOT23, TypeM>, Enc_13770697, ImmRegRel {
+let Inst{7-5} = 0b000;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11100011000;
+let hasNewValue = 1;
+let opNewValue = 0;
+let prefersSlot3 = 1;
+let CextOpcode = "M4_mpyrr_addr";
+let InputType = "reg";
+let Constraints = "$Ry32 = $Ry32in";
+}
+def M4_nac_up_s1_sat : HInst<
+(outs IntRegs:$Rx32),
+(ins IntRegs:$Rx32in, IntRegs:$Rs32, IntRegs:$Rt32),
+"$Rx32 -= mpy($Rs32,$Rt32):<<1:sat",
+M_tc_3x_acc_SLOT23, TypeM>, Enc_9223889 {
+let Inst{7-5} = 0b001;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11101111011;
+let hasNewValue = 1;
+let opNewValue = 0;
+let prefersSlot3 = 1;
+let Defs = [USR_OVF];
+let InputType = "reg";
+let Constraints = "$Rx32 = $Rx32in";
+}
+def M4_or_and : HInst<
+(outs IntRegs:$Rx32),
+(ins IntRegs:$Rx32in, IntRegs:$Rs32, IntRegs:$Rt32),
+"$Rx32 |= and($Rs32,$Rt32)",
+M_tc_2_acc_SLOT23, TypeM>, Enc_9223889 {
+let Inst{7-5} = 0b011;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11101111010;
+let hasNewValue = 1;
+let opNewValue = 0;
+let prefersSlot3 = 1;
+let InputType = "reg";
+let Constraints = "$Rx32 = $Rx32in";
+}
+def M4_or_andn : HInst<
+(outs IntRegs:$Rx32),
+(ins IntRegs:$Rx32in, IntRegs:$Rs32, IntRegs:$Rt32),
+"$Rx32 |= and($Rs32,~$Rt32)",
+M_tc_2_acc_SLOT23, TypeM>, Enc_9223889 {
+let Inst{7-5} = 0b000;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11101111001;
+let hasNewValue = 1;
+let opNewValue = 0;
+let prefersSlot3 = 1;
+let InputType = "reg";
+let Constraints = "$Rx32 = $Rx32in";
+}
+def M4_or_or : HInst<
+(outs IntRegs:$Rx32),
+(ins IntRegs:$Rx32in, IntRegs:$Rs32, IntRegs:$Rt32),
+"$Rx32 |= or($Rs32,$Rt32)",
+M_tc_2_acc_SLOT23, TypeM>, Enc_9223889 {
+let Inst{7-5} = 0b000;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11101111110;
+let hasNewValue = 1;
+let opNewValue = 0;
+let prefersSlot3 = 1;
+let InputType = "reg";
+let Constraints = "$Rx32 = $Rx32in";
+}
+def M4_or_xor : HInst<
+(outs IntRegs:$Rx32),
+(ins IntRegs:$Rx32in, IntRegs:$Rs32, IntRegs:$Rt32),
+"$Rx32 |= xor($Rs32,$Rt32)",
+M_tc_2_acc_SLOT23, TypeM>, Enc_9223889 {
+let Inst{7-5} = 0b001;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11101111110;
+let hasNewValue = 1;
+let opNewValue = 0;
+let prefersSlot3 = 1;
+let InputType = "reg";
+let Constraints = "$Rx32 = $Rx32in";
+}
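+// Polynomial (carry-less) multiplies: pmpyw on words, vpmpyh on halfword
+// vectors; the _acc forms XOR the product into Rxx.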
+def M4_pmpyw : HInst<
+(outs DoubleRegs:$Rdd32),
+(ins IntRegs:$Rs32, IntRegs:$Rt32),
+"$Rdd32 = pmpyw($Rs32,$Rt32)",
+M_tc_3x_SLOT23, TypeM>, Enc_1997594 {
+let Inst{7-5} = 0b111;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11100101010;
+let prefersSlot3 = 1;
+}
+def M4_pmpyw_acc : HInst<
+(outs DoubleRegs:$Rxx32),
+(ins DoubleRegs:$Rxx32in, IntRegs:$Rs32, IntRegs:$Rt32),
+"$Rxx32 ^= pmpyw($Rs32,$Rt32)",
+M_tc_3x_SLOT23, TypeM>, Enc_1409050 {
+let Inst{7-5} = 0b111;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11100111001;
+let prefersSlot3 = 1;
+let Constraints = "$Rxx32 = $Rxx32in";
+}
+def M4_vpmpyh : HInst<
+(outs DoubleRegs:$Rdd32),
+(ins IntRegs:$Rs32, IntRegs:$Rt32),
+"$Rdd32 = vpmpyh($Rs32,$Rt32)",
+M_tc_3x_SLOT23, TypeM>, Enc_1997594 {
+let Inst{7-5} = 0b111;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11100101110;
+let prefersSlot3 = 1;
+}
+def M4_vpmpyh_acc : HInst<
+(outs DoubleRegs:$Rxx32),
+(ins DoubleRegs:$Rxx32in, IntRegs:$Rs32, IntRegs:$Rt32),
+"$Rxx32 ^= vpmpyh($Rs32,$Rt32)",
+M_tc_3x_acc_SLOT23, TypeM>, Enc_1409050 {
+let Inst{7-5} = 0b111;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11100111101;
+let prefersSlot3 = 1;
+let Constraints = "$Rxx32 = $Rxx32in";
+}
+def M4_vrmpyeh_acc_s0 : HInst<
+(outs DoubleRegs:$Rxx32),
+(ins DoubleRegs:$Rxx32in, DoubleRegs:$Rss32, DoubleRegs:$Rtt32),
+"$Rxx32 += vrmpyweh($Rss32,$Rtt32)",
+M_tc_3x_acc_SLOT23, TypeM>, Enc_12702821 {
+let Inst{7-5} = 0b110;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11101010001;
+let prefersSlot3 = 1;
+let Constraints = "$Rxx32 = $Rxx32in";
+}
+def M4_vrmpyeh_acc_s1 : HInst<
+(outs DoubleRegs:$Rxx32),
+(ins DoubleRegs:$Rxx32in, DoubleRegs:$Rss32, DoubleRegs:$Rtt32),
+"$Rxx32 += vrmpyweh($Rss32,$Rtt32):<<1",
+M_tc_3x_acc_SLOT23, TypeM>, Enc_12702821 {
+let Inst{7-5} = 0b110;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11101010101;
+let prefersSlot3 = 1;
+let Constraints = "$Rxx32 = $Rxx32in";
+}
+def M4_vrmpyeh_s0 : HInst<
+(outs DoubleRegs:$Rdd32),
+(ins DoubleRegs:$Rss32, DoubleRegs:$Rtt32),
+"$Rdd32 = vrmpyweh($Rss32,$Rtt32)",
+M_tc_3x_SLOT23, TypeM>, Enc_8333157 {
+let Inst{7-5} = 0b100;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11101000010;
+let prefersSlot3 = 1;
+}
+def M4_vrmpyeh_s1 : HInst<
+(outs DoubleRegs:$Rdd32),
+(ins DoubleRegs:$Rss32, DoubleRegs:$Rtt32),
+"$Rdd32 = vrmpyweh($Rss32,$Rtt32):<<1",
+M_tc_3x_SLOT23, TypeM>, Enc_8333157 {
+let Inst{7-5} = 0b100;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11101000110;
+let prefersSlot3 = 1;
+}
+def M4_vrmpyoh_acc_s0 : HInst<
+(outs DoubleRegs:$Rxx32),
+(ins DoubleRegs:$Rxx32in, DoubleRegs:$Rss32, DoubleRegs:$Rtt32),
+"$Rxx32 += vrmpywoh($Rss32,$Rtt32)",
+M_tc_3x_SLOT23, TypeM>, Enc_12702821 {
+let Inst{7-5} = 0b110;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11101010011;
+let prefersSlot3 = 1;
+let Constraints = "$Rxx32 = $Rxx32in";
+}
+def M4_vrmpyoh_acc_s1 : HInst<
+(outs DoubleRegs:$Rxx32),
+(ins DoubleRegs:$Rxx32in, DoubleRegs:$Rss32, DoubleRegs:$Rtt32),
+"$Rxx32 += vrmpywoh($Rss32,$Rtt32):<<1",
+M_tc_3x_SLOT23, TypeM>, Enc_12702821 {
+let Inst{7-5} = 0b110;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11101010111;
+let prefersSlot3 = 1;
+let Constraints = "$Rxx32 = $Rxx32in";
+}
+def M4_vrmpyoh_s0 : HInst<
+(outs DoubleRegs:$Rdd32),
+(ins DoubleRegs:$Rss32, DoubleRegs:$Rtt32),
+"$Rdd32 = vrmpywoh($Rss32,$Rtt32)",
+M_tc_3x_SLOT23, TypeM>, Enc_8333157 {
+let Inst{7-5} = 0b010;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11101000001;
+let prefersSlot3 = 1;
+}
+def M4_vrmpyoh_s1 : HInst<
+(outs DoubleRegs:$Rdd32),
+(ins DoubleRegs:$Rss32, DoubleRegs:$Rtt32),
+"$Rdd32 = vrmpywoh($Rss32,$Rtt32):<<1",
+M_tc_3x_SLOT23, TypeM>, Enc_8333157 {
+let Inst{7-5} = 0b010;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11101000101;
+let prefersSlot3 = 1;
+}
+def M4_xor_and : HInst<
+(outs IntRegs:$Rx32),
+(ins IntRegs:$Rx32in, IntRegs:$Rs32, IntRegs:$Rt32),
+"$Rx32 ^= and($Rs32,$Rt32)",
+M_tc_2_acc_SLOT23, TypeM>, Enc_9223889 {
+let Inst{7-5} = 0b010;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11101111110;
+let hasNewValue = 1;
+let opNewValue = 0;
+let prefersSlot3 = 1;
+let InputType = "reg";
+let Constraints = "$Rx32 = $Rx32in";
+}
+def M4_xor_andn : HInst<
+(outs IntRegs:$Rx32),
+(ins IntRegs:$Rx32in, IntRegs:$Rs32, IntRegs:$Rt32),
+"$Rx32 ^= and($Rs32,~$Rt32)",
+M_tc_2_acc_SLOT23, TypeM>, Enc_9223889 {
+let Inst{7-5} = 0b010;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11101111001;
+let hasNewValue = 1;
+let opNewValue = 0;
+let prefersSlot3 = 1;
+let InputType = "reg";
+let Constraints = "$Rx32 = $Rx32in";
+}
+def M4_xor_or : HInst<
+(outs IntRegs:$Rx32),
+(ins IntRegs:$Rx32in, IntRegs:$Rs32, IntRegs:$Rt32),
+"$Rx32 ^= or($Rs32,$Rt32)",
+M_tc_2_acc_SLOT23, TypeM>, Enc_9223889 {
+let Inst{7-5} = 0b011;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11101111110;
+let hasNewValue = 1;
+let opNewValue = 0;
+let prefersSlot3 = 1;
+let InputType = "reg";
+let Constraints = "$Rx32 = $Rx32in";
+}
+def M4_xor_xacc : HInst<
+(outs DoubleRegs:$Rxx32),
+(ins DoubleRegs:$Rxx32in, DoubleRegs:$Rss32, DoubleRegs:$Rtt32),
+"$Rxx32 ^= xor($Rss32,$Rtt32)",
+S_3op_tc_1_SLOT23, TypeS_3op>, Enc_12702821 {
+let Inst{7-5} = 0b000;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11001010100;
+let prefersSlot3 = 1;
+let Constraints = "$Rxx32 = $Rxx32in";
+}
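+// M5: byte-vector multiplies, signed-by-unsigned (bsu) and
+// unsigned-by-unsigned (buu), in plain, accumulating and reducing forms.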
+def M5_vdmacbsu : HInst<
+(outs DoubleRegs:$Rxx32),
+(ins DoubleRegs:$Rxx32in, DoubleRegs:$Rss32, DoubleRegs:$Rtt32),
+"$Rxx32 += vdmpybsu($Rss32,$Rtt32):sat",
+M_tc_3x_acc_SLOT23, TypeM>, Enc_12702821, Requires<[HasV5T]> {
+let Inst{7-5} = 0b001;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11101010001;
+let prefersSlot3 = 1;
+let Defs = [USR_OVF];
+let Constraints = "$Rxx32 = $Rxx32in";
+}
+def M5_vdmpybsu : HInst<
+(outs DoubleRegs:$Rdd32),
+(ins DoubleRegs:$Rss32, DoubleRegs:$Rtt32),
+"$Rdd32 = vdmpybsu($Rss32,$Rtt32):sat",
+M_tc_3x_SLOT23, TypeM>, Enc_8333157, Requires<[HasV5T]> {
+let Inst{7-5} = 0b001;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11101000101;
+let prefersSlot3 = 1;
+let Defs = [USR_OVF];
+}
+def M5_vmacbsu : HInst<
+(outs DoubleRegs:$Rxx32),
+(ins DoubleRegs:$Rxx32in, IntRegs:$Rs32, IntRegs:$Rt32),
+"$Rxx32 += vmpybsu($Rs32,$Rt32)",
+M_tc_3x_acc_SLOT23, TypeM>, Enc_1409050 {
+let Inst{7-5} = 0b001;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11100111110;
+let prefersSlot3 = 1;
+let Constraints = "$Rxx32 = $Rxx32in";
+}
+def M5_vmacbuu : HInst<
+(outs DoubleRegs:$Rxx32),
+(ins DoubleRegs:$Rxx32in, IntRegs:$Rs32, IntRegs:$Rt32),
+"$Rxx32 += vmpybu($Rs32,$Rt32)",
+M_tc_3x_acc_SLOT23, TypeM>, Enc_1409050 {
+let Inst{7-5} = 0b001;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11100111100;
+let prefersSlot3 = 1;
+let Constraints = "$Rxx32 = $Rxx32in";
+}
+def M5_vmpybsu : HInst<
+(outs DoubleRegs:$Rdd32),
+(ins IntRegs:$Rs32, IntRegs:$Rt32),
+"$Rdd32 = vmpybsu($Rs32,$Rt32)",
+M_tc_3x_SLOT23, TypeM>, Enc_1997594 {
+let Inst{7-5} = 0b001;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11100101010;
+let prefersSlot3 = 1;
+}
+def M5_vmpybuu : HInst<
+(outs DoubleRegs:$Rdd32),
+(ins IntRegs:$Rs32, IntRegs:$Rt32),
+"$Rdd32 = vmpybu($Rs32,$Rt32)",
+M_tc_3x_SLOT23, TypeM>, Enc_1997594 {
+let Inst{7-5} = 0b001;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11100101100;
+let prefersSlot3 = 1;
+}
+def M5_vrmacbsu : HInst<
+(outs DoubleRegs:$Rxx32),
+(ins DoubleRegs:$Rxx32in, DoubleRegs:$Rss32, DoubleRegs:$Rtt32),
+"$Rxx32 += vrmpybsu($Rss32,$Rtt32)",
+M_tc_3x_acc_SLOT23, TypeM>, Enc_12702821 {
+let Inst{7-5} = 0b001;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11101010110;
+let prefersSlot3 = 1;
+let Constraints = "$Rxx32 = $Rxx32in";
+}
+def M5_vrmacbuu : HInst<
+(outs DoubleRegs:$Rxx32),
+(ins DoubleRegs:$Rxx32in, DoubleRegs:$Rss32, DoubleRegs:$Rtt32),
+"$Rxx32 += vrmpybu($Rss32,$Rtt32)",
+M_tc_3x_acc_SLOT23, TypeM>, Enc_12702821 {
+let Inst{7-5} = 0b001;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11101010100;
+let prefersSlot3 = 1;
+let Constraints = "$Rxx32 = $Rxx32in";
+}
+def M5_vrmpybsu : HInst<
+(outs DoubleRegs:$Rdd32),
+(ins DoubleRegs:$Rss32, DoubleRegs:$Rtt32),
+"$Rdd32 = vrmpybsu($Rss32,$Rtt32)",
+M_tc_3x_SLOT23, TypeM>, Enc_8333157 {
+let Inst{7-5} = 0b001;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11101000110;
+let prefersSlot3 = 1;
+}
+def M5_vrmpybuu : HInst<
+(outs DoubleRegs:$Rdd32),
+(ins DoubleRegs:$Rss32, DoubleRegs:$Rtt32),
+"$Rdd32 = vrmpybu($Rss32,$Rtt32)",
+M_tc_3x_SLOT23, TypeM>, Enc_8333157 {
+let Inst{7-5} = 0b001;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11101000100;
+let prefersSlot3 = 1;
+}
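+// M6 (V62 only): vector absolute difference on bytes.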
+def M6_vabsdiffb : HInst<
+(outs DoubleRegs:$Rdd32),
+(ins DoubleRegs:$Rtt32, DoubleRegs:$Rss32),
+"$Rdd32 = vabsdiffb($Rtt32,$Rss32)",
+M_tc_2_SLOT23, TypeM>, Enc_11687333, Requires<[HasV62T]> {
+let Inst{7-5} = 0b000;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11101000111;
+let prefersSlot3 = 1;
+}
+def M6_vabsdiffub : HInst<
+(outs DoubleRegs:$Rdd32),
+(ins DoubleRegs:$Rtt32, DoubleRegs:$Rss32),
+"$Rdd32 = vabsdiffub($Rtt32,$Rss32)",
+M_tc_2_SLOT23, TypeM>, Enc_11687333, Requires<[HasV62T]> {
+let Inst{7-5} = 0b000;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11101000101;
+let prefersSlot3 = 1;
+}
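+// PS_*abs: absolute-addressed load pseudos tied to the L2_/L4_ opcodes via
+// CextOpcode/BaseOpcode. The #Ii operand must be constant-extended
+// (DecoderNamespace "MustExtend"); opExtentBits/opExtentAlign match the
+// access size.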
+def PS_loadrbabs : HInst<
+(outs IntRegs:$Rd32),
+(ins u32_0Imm:$Ii),
+"$Rd32 = memb(#$Ii)",
+V2LDST_tc_ld_SLOT01, TypeV2LDST>, Enc_1886960, AddrModeRel {
+let Inst{24-21} = 0b1000;
+let Inst{31-27} = 0b01001;
+let hasNewValue = 1;
+let opNewValue = 0;
+let addrMode = Absolute;
+let accessSize = ByteAccess;
+let isExtended = 1;
+let mayLoad = 1;
+let CextOpcode = "L2_loadrb";
+let BaseOpcode = "L4_loadrb_abs";
+let isPredicable = 1;
+let DecoderNamespace = "MustExtend";
+let opExtendable = 1;
+let isExtentSigned = 0;
+let opExtentBits = 16;
+let opExtentAlign = 0;
+}
+def PS_loadrdabs : HInst<
+(outs DoubleRegs:$Rdd32),
+(ins u29_3Imm:$Ii),
+"$Rdd32 = memd(#$Ii)",
+V2LDST_tc_ld_SLOT01, TypeV2LDST>, Enc_4975051, AddrModeRel {
+let Inst{24-21} = 0b1110;
+let Inst{31-27} = 0b01001;
+let addrMode = Absolute;
+let accessSize = DoubleWordAccess;
+let isExtended = 1;
+let mayLoad = 1;
+let CextOpcode = "L2_loadrd";
+let BaseOpcode = "L4_loadrd_abs";
+let isPredicable = 1;
+let DecoderNamespace = "MustExtend";
+let opExtendable = 1;
+let isExtentSigned = 0;
+let opExtentBits = 19;
+let opExtentAlign = 3;
+}
+def PS_loadrhabs : HInst<
+(outs IntRegs:$Rd32),
+(ins u31_1Imm:$Ii),
+"$Rd32 = memh(#$Ii)",
+V2LDST_tc_ld_SLOT01, TypeV2LDST>, Enc_12608570, AddrModeRel {
+let Inst{24-21} = 0b1010;
+let Inst{31-27} = 0b01001;
+let hasNewValue = 1;
+let opNewValue = 0;
+let addrMode = Absolute;
+let accessSize = HalfWordAccess;
+let isExtended = 1;
+let mayLoad = 1;
+let CextOpcode = "L2_loadrh";
+let BaseOpcode = "L4_loadrh_abs";
+let isPredicable = 1;
+let DecoderNamespace = "MustExtend";
+let opExtendable = 1;
+let isExtentSigned = 0;
+let opExtentBits = 17;
+let opExtentAlign = 1;
+}
+def PS_loadriabs : HInst<
+(outs IntRegs:$Rd32),
+(ins u30_2Imm:$Ii),
+"$Rd32 = memw(#$Ii)",
+V2LDST_tc_ld_SLOT01, TypeV2LDST>, Enc_8814718, AddrModeRel {
+let Inst{24-21} = 0b1100;
+let Inst{31-27} = 0b01001;
+let hasNewValue = 1;
+let opNewValue = 0;
+let addrMode = Absolute;
+let accessSize = WordAccess;
+let isExtended = 1;
+let mayLoad = 1;
+let CextOpcode = "L2_loadri";
+let BaseOpcode = "L4_loadri_abs";
+let isPredicable = 1;
+let DecoderNamespace = "MustExtend";
+let opExtendable = 1;
+let isExtentSigned = 0;
+let opExtentBits = 18;
+let opExtentAlign = 2;
+}
+def PS_loadrubabs : HInst<
+(outs IntRegs:$Rd32),
+(ins u32_0Imm:$Ii),
+"$Rd32 = memub(#$Ii)",
+V2LDST_tc_ld_SLOT01, TypeV2LDST>, Enc_1886960, AddrModeRel {
+let Inst{24-21} = 0b1001;
+let Inst{31-27} = 0b01001;
+let hasNewValue = 1;
+let opNewValue = 0;
+let addrMode = Absolute;
+let accessSize = ByteAccess;
+let isExtended = 1;
+let mayLoad = 1;
+let CextOpcode = "L2_loadrub";
+let BaseOpcode = "L4_loadrub_abs";
+let isPredicable = 1;
+let DecoderNamespace = "MustExtend";
+let opExtendable = 1;
+let isExtentSigned = 0;
+let opExtentBits = 16;
+let opExtentAlign = 0;
+}
+def PS_loadruhabs : HInst<
+(outs IntRegs:$Rd32),
+(ins u31_1Imm:$Ii),
+"$Rd32 = memuh(#$Ii)",
+V2LDST_tc_ld_SLOT01, TypeV2LDST>, Enc_12608570, AddrModeRel {
+let Inst{24-21} = 0b1011;
+let Inst{31-27} = 0b01001;
+let hasNewValue = 1;
+let opNewValue = 0;
+let addrMode = Absolute;
+let accessSize = HalfWordAccess;
+let isExtended = 1;
+let mayLoad = 1;
+let CextOpcode = "L2_loadruh";
+let BaseOpcode = "L4_loadruh_abs";
+let isPredicable = 1;
+let DecoderNamespace = "MustExtend";
+let opExtendable = 1;
+let isExtentSigned = 0;
+let opExtentBits = 17;
+let opExtentAlign = 1;
+}
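+// Absolute-addressed store pseudos, including .new (new-value) forms that
+// are restricted to SLOT0 and marked isNVStore/isNewValue.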
+def PS_storerbabs : HInst<
+(outs),
+(ins u32_0Imm:$Ii, IntRegs:$Rt32),
+"memb(#$Ii) = $Rt32",
+ST_tc_st_SLOT01, TypeV2LDST>, Enc_12395768, AddrModeRel {
+let Inst{24-21} = 0b0000;
+let Inst{31-27} = 0b01001;
+let addrMode = Absolute;
+let accessSize = ByteAccess;
+let isExtended = 1;
+let mayStore = 1;
+let CextOpcode = "S2_storerb";
+let BaseOpcode = "S2_storerbabs";
+let isPredicable = 1;
+let isNVStorable = 1;
+let DecoderNamespace = "MustExtend";
+let opExtendable = 0;
+let isExtentSigned = 0;
+let opExtentBits = 16;
+let opExtentAlign = 0;
+}
+def PS_storerbnewabs : HInst<
+(outs),
+(ins u32_0Imm:$Ii, IntRegs:$Nt8),
+"memb(#$Ii) = $Nt8.new",
+NCJ_tc_3or4stall_SLOT0, TypeV2LDST>, Enc_4050532, AddrModeRel {
+let Inst{12-11} = 0b00;
+let Inst{24-21} = 0b0101;
+let Inst{31-27} = 0b01001;
+let addrMode = Absolute;
+let accessSize = ByteAccess;
+let isNVStore = 1;
+let isExtended = 1;
+let mayStore = 1;
+let isNewValue = 1;
+let CextOpcode = "S2_storerb";
+let BaseOpcode = "S2_storerbabs";
+let isPredicable = 1;
+let DecoderNamespace = "MustExtend";
+let opExtendable = 0;
+let isExtentSigned = 0;
+let opExtentBits = 16;
+let opExtentAlign = 0;
+let opNewValue = 1;
+}
+def PS_storerdabs : HInst<
+(outs),
+(ins u29_3Imm:$Ii, DoubleRegs:$Rtt32),
+"memd(#$Ii) = $Rtt32",
+ST_tc_st_SLOT01, TypeV2LDST>, Enc_11682941, AddrModeRel {
+let Inst{24-21} = 0b0110;
+let Inst{31-27} = 0b01001;
+let addrMode = Absolute;
+let accessSize = DoubleWordAccess;
+let isExtended = 1;
+let mayStore = 1;
+let CextOpcode = "S2_storerd";
+let BaseOpcode = "S2_storerdabs";
+let isPredicable = 1;
+let DecoderNamespace = "MustExtend";
+let opExtendable = 0;
+let isExtentSigned = 0;
+let opExtentBits = 19;
+let opExtentAlign = 3;
+}
+def PS_storerfabs : HInst<
+(outs),
+(ins u31_1Imm:$Ii, IntRegs:$Rt32),
+"memh(#$Ii) = $Rt32.h",
+ST_tc_st_SLOT01, TypeV2LDST>, Enc_1186018, AddrModeRel {
+let Inst{24-21} = 0b0011;
+let Inst{31-27} = 0b01001;
+let addrMode = Absolute;
+let accessSize = HalfWordAccess;
+let isExtended = 1;
+let mayStore = 1;
+let CextOpcode = "S2_storerf";
+let BaseOpcode = "S2_storerfabs";
+let isPredicable = 1;
+let DecoderNamespace = "MustExtend";
+let opExtendable = 0;
+let isExtentSigned = 0;
+let opExtentBits = 17;
+let opExtentAlign = 1;
+}
+def PS_storerhabs : HInst<
+(outs),
+(ins u31_1Imm:$Ii, IntRegs:$Rt32),
+"memh(#$Ii) = $Rt32",
+ST_tc_st_SLOT01, TypeV2LDST>, Enc_1186018, AddrModeRel {
+let Inst{24-21} = 0b0010;
+let Inst{31-27} = 0b01001;
+let addrMode = Absolute;
+let accessSize = HalfWordAccess;
+let isExtended = 1;
+let mayStore = 1;
+let CextOpcode = "S2_storerh";
+let BaseOpcode = "S2_storerhabs";
+let isPredicable = 1;
+let isNVStorable = 1;
+let DecoderNamespace = "MustExtend";
+let opExtendable = 0;
+let isExtentSigned = 0;
+let opExtentBits = 17;
+let opExtentAlign = 1;
+}
+def PS_storerhnewabs : HInst<
+(outs),
+(ins u31_1Imm:$Ii, IntRegs:$Nt8),
+"memh(#$Ii) = $Nt8.new",
+NCJ_tc_3or4stall_SLOT0, TypeV2LDST>, Enc_13618890, AddrModeRel {
+let Inst{12-11} = 0b01;
+let Inst{24-21} = 0b0101;
+let Inst{31-27} = 0b01001;
+let addrMode = Absolute;
+let accessSize = HalfWordAccess;
+let isNVStore = 1;
+let isExtended = 1;
+let mayStore = 1;
+let isNewValue = 1;
+let CextOpcode = "S2_storerh";
+let BaseOpcode = "S2_storerhabs";
+let isPredicable = 1;
+let DecoderNamespace = "MustExtend";
+let opExtendable = 0;
+let isExtentSigned = 0;
+let opExtentBits = 17;
+let opExtentAlign = 1;
+let opNewValue = 1;
+}
+def PS_storeriabs : HInst<
+(outs),
+(ins u30_2Imm:$Ii, IntRegs:$Rt32),
+"memw(#$Ii) = $Rt32",
+ST_tc_st_SLOT01, TypeV2LDST>, Enc_15999208, AddrModeRel {
+let Inst{24-21} = 0b0100;
+let Inst{31-27} = 0b01001;
+let addrMode = Absolute;
+let accessSize = WordAccess;
+let isExtended = 1;
+let mayStore = 1;
+let CextOpcode = "S2_storeri";
+let BaseOpcode = "S2_storeriabs";
+let isPredicable = 1;
+let isNVStorable = 1;
+let DecoderNamespace = "MustExtend";
+let opExtendable = 0;
+let isExtentSigned = 0;
+let opExtentBits = 18;
+let opExtentAlign = 2;
+}
+def PS_storerinewabs : HInst<
+(outs),
+(ins u30_2Imm:$Ii, IntRegs:$Nt8),
+"memw(#$Ii) = $Nt8.new",
+NCJ_tc_3or4stall_SLOT0, TypeV2LDST>, Enc_12297800, AddrModeRel {
+let Inst{12-11} = 0b10;
+let Inst{24-21} = 0b0101;
+let Inst{31-27} = 0b01001;
+let addrMode = Absolute;
+let accessSize = WordAccess;
+let isNVStore = 1;
+let isExtended = 1;
+let mayStore = 1;
+let isNewValue = 1;
+let CextOpcode = "S2_storeri";
+let BaseOpcode = "S2_storeriabs";
+let isPredicable = 1;
+let DecoderNamespace = "MustExtend";
+let opExtendable = 0;
+let isExtentSigned = 0;
+let opExtentBits = 18;
+let opExtentAlign = 2;
+let opNewValue = 1;
+}
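+// S2 shift, bit-field and bit-manipulation instructions.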
+def S2_addasl_rrri : HInst<
+(outs IntRegs:$Rd32),
+(ins IntRegs:$Rt32, IntRegs:$Rs32, u3_0Imm:$Ii),
+"$Rd32 = addasl($Rt32,$Rs32,#$Ii)",
+S_3op_tc_2_SLOT23, TypeS_3op>, Enc_3494181 {
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11000100000;
+let hasNewValue = 1;
+let opNewValue = 0;
+let prefersSlot3 = 1;
+}
+def S2_allocframe : HInst<
+(outs),
+(ins u11_3Imm:$Ii),
+"allocframe(#$Ii)",
+ST_tc_ld_SLOT0, TypeST>, Enc_15830826 {
+let Inst{13-11} = 0b000;
+let Inst{31-21} = 0b10100000100;
+let Inst{20-16} = 0b11101;
+let addrMode = BaseImmOffset;
+let accessSize = DoubleWordAccess;
+let mayStore = 1;
+let Uses = [R29, R30, R31];
+let Defs = [R29, R30];
+}
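+// Naming: asl/asr = arithmetic shift, lsl/lsr = logical; _i_/_r_ = immediate
+// or register shift amount; _r/_p = 32-bit register or 64-bit pair; trailing
+// _acc/_nac/_and/_or/_xacc/_xor fold +=, -=, &=, |=, ^= into the destination.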
+def S2_asl_i_p : HInst<
+(outs DoubleRegs:$Rdd32),
+(ins DoubleRegs:$Rss32, u6_0Imm:$Ii),
+"$Rdd32 = asl($Rss32,#$Ii)",
+S_2op_tc_1_SLOT23, TypeS_2op>, Enc_4231995 {
+let Inst{7-5} = 0b010;
+let Inst{31-21} = 0b10000000000;
+}
+def S2_asl_i_p_acc : HInst<
+(outs DoubleRegs:$Rxx32),
+(ins DoubleRegs:$Rxx32in, DoubleRegs:$Rss32, u6_0Imm:$Ii),
+"$Rxx32 += asl($Rss32,#$Ii)",
+S_2op_tc_2_SLOT23, TypeS_2op>, Enc_8497723 {
+let Inst{7-5} = 0b110;
+let Inst{31-21} = 0b10000010000;
+let prefersSlot3 = 1;
+let Constraints = "$Rxx32 = $Rxx32in";
+}
+def S2_asl_i_p_and : HInst<
+(outs DoubleRegs:$Rxx32),
+(ins DoubleRegs:$Rxx32in, DoubleRegs:$Rss32, u6_0Imm:$Ii),
+"$Rxx32 &= asl($Rss32,#$Ii)",
+S_2op_tc_2_SLOT23, TypeS_2op>, Enc_8497723 {
+let Inst{7-5} = 0b010;
+let Inst{31-21} = 0b10000010010;
+let prefersSlot3 = 1;
+let Constraints = "$Rxx32 = $Rxx32in";
+}
+def S2_asl_i_p_nac : HInst<
+(outs DoubleRegs:$Rxx32),
+(ins DoubleRegs:$Rxx32in, DoubleRegs:$Rss32, u6_0Imm:$Ii),
+"$Rxx32 -= asl($Rss32,#$Ii)",
+S_2op_tc_2_SLOT23, TypeS_2op>, Enc_8497723 {
+let Inst{7-5} = 0b010;
+let Inst{31-21} = 0b10000010000;
+let prefersSlot3 = 1;
+let Constraints = "$Rxx32 = $Rxx32in";
+}
+def S2_asl_i_p_or : HInst<
+(outs DoubleRegs:$Rxx32),
+(ins DoubleRegs:$Rxx32in, DoubleRegs:$Rss32, u6_0Imm:$Ii),
+"$Rxx32 |= asl($Rss32,#$Ii)",
+S_2op_tc_2_SLOT23, TypeS_2op>, Enc_8497723 {
+let Inst{7-5} = 0b110;
+let Inst{31-21} = 0b10000010010;
+let prefersSlot3 = 1;
+let Constraints = "$Rxx32 = $Rxx32in";
+}
+def S2_asl_i_p_xacc : HInst<
+(outs DoubleRegs:$Rxx32),
+(ins DoubleRegs:$Rxx32in, DoubleRegs:$Rss32, u6_0Imm:$Ii),
+"$Rxx32 ^= asl($Rss32,#$Ii)",
+S_2op_tc_2_SLOT23, TypeS_2op>, Enc_8497723 {
+let Inst{7-5} = 0b010;
+let Inst{31-21} = 0b10000010100;
+let prefersSlot3 = 1;
+let Constraints = "$Rxx32 = $Rxx32in";
+}
+def S2_asl_i_r : HInst<
+(outs IntRegs:$Rd32),
+(ins IntRegs:$Rs32, u5_0Imm:$Ii),
+"$Rd32 = asl($Rs32,#$Ii)",
+S_2op_tc_1_SLOT23, TypeS_2op>, Enc_2771456 {
+let Inst{7-5} = 0b010;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b10001100000;
+let hasNewValue = 1;
+let opNewValue = 0;
+}
+def S2_asl_i_r_acc : HInst<
+(outs IntRegs:$Rx32),
+(ins IntRegs:$Rx32in, IntRegs:$Rs32, u5_0Imm:$Ii),
+"$Rx32 += asl($Rs32,#$Ii)",
+S_2op_tc_2_SLOT23, TypeS_2op>, Enc_2410156 {
+let Inst{7-5} = 0b110;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b10001110000;
+let hasNewValue = 1;
+let opNewValue = 0;
+let prefersSlot3 = 1;
+let Constraints = "$Rx32 = $Rx32in";
+}
+def S2_asl_i_r_and : HInst<
+(outs IntRegs:$Rx32),
+(ins IntRegs:$Rx32in, IntRegs:$Rs32, u5_0Imm:$Ii),
+"$Rx32 &= asl($Rs32,#$Ii)",
+S_2op_tc_2_SLOT23, TypeS_2op>, Enc_2410156 {
+let Inst{7-5} = 0b010;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b10001110010;
+let hasNewValue = 1;
+let opNewValue = 0;
+let prefersSlot3 = 1;
+let Constraints = "$Rx32 = $Rx32in";
+}
+def S2_asl_i_r_nac : HInst<
+(outs IntRegs:$Rx32),
+(ins IntRegs:$Rx32in, IntRegs:$Rs32, u5_0Imm:$Ii),
+"$Rx32 -= asl($Rs32,#$Ii)",
+S_2op_tc_2_SLOT23, TypeS_2op>, Enc_2410156 {
+let Inst{7-5} = 0b010;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b10001110000;
+let hasNewValue = 1;
+let opNewValue = 0;
+let prefersSlot3 = 1;
+let Constraints = "$Rx32 = $Rx32in";
+}
+def S2_asl_i_r_or : HInst<
+(outs IntRegs:$Rx32),
+(ins IntRegs:$Rx32in, IntRegs:$Rs32, u5_0Imm:$Ii),
+"$Rx32 |= asl($Rs32,#$Ii)",
+S_2op_tc_2_SLOT23, TypeS_2op>, Enc_2410156 {
+let Inst{7-5} = 0b110;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b10001110010;
+let hasNewValue = 1;
+let opNewValue = 0;
+let prefersSlot3 = 1;
+let Constraints = "$Rx32 = $Rx32in";
+}
+def S2_asl_i_r_sat : HInst<
+(outs IntRegs:$Rd32),
+(ins IntRegs:$Rs32, u5_0Imm:$Ii),
+"$Rd32 = asl($Rs32,#$Ii):sat",
+S_2op_tc_2_SLOT23, TypeS_2op>, Enc_2771456 {
+let Inst{7-5} = 0b010;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b10001100010;
+let hasNewValue = 1;
+let opNewValue = 0;
+let Defs = [USR_OVF];
+}
+def S2_asl_i_r_xacc : HInst<
+(outs IntRegs:$Rx32),
+(ins IntRegs:$Rx32in, IntRegs:$Rs32, u5_0Imm:$Ii),
+"$Rx32 ^= asl($Rs32,#$Ii)",
+S_2op_tc_2_SLOT23, TypeS_2op>, Enc_2410156 {
+let Inst{7-5} = 0b010;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b10001110100;
+let hasNewValue = 1;
+let opNewValue = 0;
+let prefersSlot3 = 1;
+let Constraints = "$Rx32 = $Rx32in";
+}
+def S2_asl_i_vh : HInst<
+(outs DoubleRegs:$Rdd32),
+(ins DoubleRegs:$Rss32, u4_0Imm:$Ii),
+"$Rdd32 = vaslh($Rss32,#$Ii)",
+S_2op_tc_1_SLOT23, TypeS_2op>, Enc_2082775 {
+let Inst{7-5} = 0b010;
+let Inst{13-12} = 0b00;
+let Inst{31-21} = 0b10000000100;
+}
+def S2_asl_i_vw : HInst<
+(outs DoubleRegs:$Rdd32),
+(ins DoubleRegs:$Rss32, u5_0Imm:$Ii),
+"$Rdd32 = vaslw($Rss32,#$Ii)",
+S_2op_tc_1_SLOT23, TypeS_2op>, Enc_13201267 {
+let Inst{7-5} = 0b010;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b10000000010;
+}
+def S2_asl_r_p : HInst<
+(outs DoubleRegs:$Rdd32),
+(ins DoubleRegs:$Rss32, IntRegs:$Rt32),
+"$Rdd32 = asl($Rss32,$Rt32)",
+S_3op_tc_1_SLOT23, TypeS_3op>, Enc_8940892 {
+let Inst{7-5} = 0b100;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11000011100;
+}
+def S2_asl_r_p_acc : HInst<
+(outs DoubleRegs:$Rxx32),
+(ins DoubleRegs:$Rxx32in, DoubleRegs:$Rss32, IntRegs:$Rt32),
+"$Rxx32 += asl($Rss32,$Rt32)",
+S_3op_tc_2_SLOT23, TypeS_3op>, Enc_7912540 {
+let Inst{7-5} = 0b100;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11001011110;
+let prefersSlot3 = 1;
+let Constraints = "$Rxx32 = $Rxx32in";
+}
+def S2_asl_r_p_and : HInst<
+(outs DoubleRegs:$Rxx32),
+(ins DoubleRegs:$Rxx32in, DoubleRegs:$Rss32, IntRegs:$Rt32),
+"$Rxx32 &= asl($Rss32,$Rt32)",
+S_3op_tc_2_SLOT23, TypeS_3op>, Enc_7912540 {
+let Inst{7-5} = 0b100;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11001011010;
+let prefersSlot3 = 1;
+let Constraints = "$Rxx32 = $Rxx32in";
+}
+def S2_asl_r_p_nac : HInst<
+(outs DoubleRegs:$Rxx32),
+(ins DoubleRegs:$Rxx32in, DoubleRegs:$Rss32, IntRegs:$Rt32),
+"$Rxx32 -= asl($Rss32,$Rt32)",
+S_3op_tc_2_SLOT23, TypeS_3op>, Enc_7912540 {
+let Inst{7-5} = 0b100;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11001011100;
+let prefersSlot3 = 1;
+let Constraints = "$Rxx32 = $Rxx32in";
+}
+def S2_asl_r_p_or : HInst<
+(outs DoubleRegs:$Rxx32),
+(ins DoubleRegs:$Rxx32in, DoubleRegs:$Rss32, IntRegs:$Rt32),
+"$Rxx32 |= asl($Rss32,$Rt32)",
+S_3op_tc_2_SLOT23, TypeS_3op>, Enc_7912540 {
+let Inst{7-5} = 0b100;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11001011000;
+let prefersSlot3 = 1;
+let Constraints = "$Rxx32 = $Rxx32in";
+}
+def S2_asl_r_p_xor : HInst<
+(outs DoubleRegs:$Rxx32),
+(ins DoubleRegs:$Rxx32in, DoubleRegs:$Rss32, IntRegs:$Rt32),
+"$Rxx32 ^= asl($Rss32,$Rt32)",
+S_3op_tc_2_SLOT23, TypeS_3op>, Enc_7912540 {
+let Inst{7-5} = 0b100;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11001011011;
+let prefersSlot3 = 1;
+let Constraints = "$Rxx32 = $Rxx32in";
+}
+def S2_asl_r_r : HInst<
+(outs IntRegs:$Rd32),
+(ins IntRegs:$Rs32, IntRegs:$Rt32),
+"$Rd32 = asl($Rs32,$Rt32)",
+S_3op_tc_1_SLOT23, TypeS_3op>, Enc_14071773 {
+let Inst{7-5} = 0b100;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11000110010;
+let hasNewValue = 1;
+let opNewValue = 0;
+}
+def S2_asl_r_r_acc : HInst<
+(outs IntRegs:$Rx32),
+(ins IntRegs:$Rx32in, IntRegs:$Rs32, IntRegs:$Rt32),
+"$Rx32 += asl($Rs32,$Rt32)",
+S_3op_tc_2_SLOT23, TypeS_3op>, Enc_9223889 {
+let Inst{7-5} = 0b100;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11001100110;
+let hasNewValue = 1;
+let opNewValue = 0;
+let prefersSlot3 = 1;
+let Constraints = "$Rx32 = $Rx32in";
+}
+def S2_asl_r_r_and : HInst<
+(outs IntRegs:$Rx32),
+(ins IntRegs:$Rx32in, IntRegs:$Rs32, IntRegs:$Rt32),
+"$Rx32 &= asl($Rs32,$Rt32)",
+S_3op_tc_2_SLOT23, TypeS_3op>, Enc_9223889 {
+let Inst{7-5} = 0b100;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11001100010;
+let hasNewValue = 1;
+let opNewValue = 0;
+let prefersSlot3 = 1;
+let Constraints = "$Rx32 = $Rx32in";
+}
+def S2_asl_r_r_nac : HInst<
+(outs IntRegs:$Rx32),
+(ins IntRegs:$Rx32in, IntRegs:$Rs32, IntRegs:$Rt32),
+"$Rx32 -= asl($Rs32,$Rt32)",
+S_3op_tc_2_SLOT23, TypeS_3op>, Enc_9223889 {
+let Inst{7-5} = 0b100;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11001100100;
+let hasNewValue = 1;
+let opNewValue = 0;
+let prefersSlot3 = 1;
+let Constraints = "$Rx32 = $Rx32in";
+}
+def S2_asl_r_r_or : HInst<
+(outs IntRegs:$Rx32),
+(ins IntRegs:$Rx32in, IntRegs:$Rs32, IntRegs:$Rt32),
+"$Rx32 |= asl($Rs32,$Rt32)",
+S_3op_tc_2_SLOT23, TypeS_3op>, Enc_9223889 {
+let Inst{7-5} = 0b100;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11001100000;
+let hasNewValue = 1;
+let opNewValue = 0;
+let prefersSlot3 = 1;
+let Constraints = "$Rx32 = $Rx32in";
+}
+def S2_asl_r_r_sat : HInst<
+(outs IntRegs:$Rd32),
+(ins IntRegs:$Rs32, IntRegs:$Rt32),
+"$Rd32 = asl($Rs32,$Rt32):sat",
+S_3op_tc_2_SLOT23, TypeS_3op>, Enc_14071773 {
+let Inst{7-5} = 0b100;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11000110000;
+let hasNewValue = 1;
+let opNewValue = 0;
+let Defs = [USR_OVF];
+}
+def S2_asl_r_vh : HInst<
+(outs DoubleRegs:$Rdd32),
+(ins DoubleRegs:$Rss32, IntRegs:$Rt32),
+"$Rdd32 = vaslh($Rss32,$Rt32)",
+S_3op_tc_1_SLOT23, TypeS_3op>, Enc_8940892 {
+let Inst{7-5} = 0b100;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11000011010;
+}
+def S2_asl_r_vw : HInst<
+(outs DoubleRegs:$Rdd32),
+(ins DoubleRegs:$Rss32, IntRegs:$Rt32),
+"$Rdd32 = vaslw($Rss32,$Rt32)",
+S_3op_tc_1_SLOT23, TypeS_3op>, Enc_8940892 {
+let Inst{7-5} = 0b100;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11000011000;
+}
+def S2_asr_i_p : HInst<
+(outs DoubleRegs:$Rdd32),
+(ins DoubleRegs:$Rss32, u6_0Imm:$Ii),
+"$Rdd32 = asr($Rss32,#$Ii)",
+S_2op_tc_1_SLOT23, TypeS_2op>, Enc_4231995 {
+let Inst{7-5} = 0b000;
+let Inst{31-21} = 0b10000000000;
+}
+def S2_asr_i_p_acc : HInst<
+(outs DoubleRegs:$Rxx32),
+(ins DoubleRegs:$Rxx32in, DoubleRegs:$Rss32, u6_0Imm:$Ii),
+"$Rxx32 += asr($Rss32,#$Ii)",
+S_2op_tc_2_SLOT23, TypeS_2op>, Enc_8497723 {
+let Inst{7-5} = 0b100;
+let Inst{31-21} = 0b10000010000;
+let prefersSlot3 = 1;
+let Constraints = "$Rxx32 = $Rxx32in";
+}
+def S2_asr_i_p_and : HInst<
+(outs DoubleRegs:$Rxx32),
+(ins DoubleRegs:$Rxx32in, DoubleRegs:$Rss32, u6_0Imm:$Ii),
+"$Rxx32 &= asr($Rss32,#$Ii)",
+S_2op_tc_2_SLOT23, TypeS_2op>, Enc_8497723 {
+let Inst{7-5} = 0b000;
+let Inst{31-21} = 0b10000010010;
+let prefersSlot3 = 1;
+let Constraints = "$Rxx32 = $Rxx32in";
+}
+def S2_asr_i_p_nac : HInst<
+(outs DoubleRegs:$Rxx32),
+(ins DoubleRegs:$Rxx32in, DoubleRegs:$Rss32, u6_0Imm:$Ii),
+"$Rxx32 -= asr($Rss32,#$Ii)",
+S_2op_tc_2_SLOT23, TypeS_2op>, Enc_8497723 {
+let Inst{7-5} = 0b000;
+let Inst{31-21} = 0b10000010000;
+let prefersSlot3 = 1;
+let Constraints = "$Rxx32 = $Rxx32in";
+}
+def S2_asr_i_p_or : HInst<
+(outs DoubleRegs:$Rxx32),
+(ins DoubleRegs:$Rxx32in, DoubleRegs:$Rss32, u6_0Imm:$Ii),
+"$Rxx32 |= asr($Rss32,#$Ii)",
+S_2op_tc_2_SLOT23, TypeS_2op>, Enc_8497723 {
+let Inst{7-5} = 0b100;
+let Inst{31-21} = 0b10000010010;
+let prefersSlot3 = 1;
+let Constraints = "$Rxx32 = $Rxx32in";
+}
+def S2_asr_i_p_rnd : HInst<
+(outs DoubleRegs:$Rdd32),
+(ins DoubleRegs:$Rss32, u6_0Imm:$Ii),
+"$Rdd32 = asr($Rss32,#$Ii):rnd",
+S_2op_tc_1_SLOT23, TypeS_2op>, Enc_4231995, Requires<[HasV5T]> {
+let Inst{7-5} = 0b111;
+let Inst{31-21} = 0b10000000110;
+let prefersSlot3 = 1;
+}
+def S2_asr_i_p_rnd_goodsyntax : HInst<
+(outs DoubleRegs:$Rdd32),
+(ins DoubleRegs:$Rss32, u6_0Imm:$Ii),
+"$Rdd32 = asrrnd($Rss32,#$Ii)",
+S_2op_tc_1_SLOT23, TypeS_2op>, Requires<[HasV5T]> {
+let isPseudo = 1;
+}
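+// The *_goodsyntax defs are assembler-level pseudos (isPseudo = 1) providing
+// the asrrnd(...) spelling; they presumably lower to the :rnd forms above.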
+def S2_asr_i_r : HInst<
+(outs IntRegs:$Rd32),
+(ins IntRegs:$Rs32, u5_0Imm:$Ii),
+"$Rd32 = asr($Rs32,#$Ii)",
+S_2op_tc_1_SLOT23, TypeS_2op>, Enc_2771456 {
+let Inst{7-5} = 0b000;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b10001100000;
+let hasNewValue = 1;
+let opNewValue = 0;
+}
+def S2_asr_i_r_acc : HInst<
+(outs IntRegs:$Rx32),
+(ins IntRegs:$Rx32in, IntRegs:$Rs32, u5_0Imm:$Ii),
+"$Rx32 += asr($Rs32,#$Ii)",
+S_2op_tc_2_SLOT23, TypeS_2op>, Enc_2410156 {
+let Inst{7-5} = 0b100;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b10001110000;
+let hasNewValue = 1;
+let opNewValue = 0;
+let prefersSlot3 = 1;
+let Constraints = "$Rx32 = $Rx32in";
+}
+def S2_asr_i_r_and : HInst<
+(outs IntRegs:$Rx32),
+(ins IntRegs:$Rx32in, IntRegs:$Rs32, u5_0Imm:$Ii),
+"$Rx32 &= asr($Rs32,#$Ii)",
+S_2op_tc_2_SLOT23, TypeS_2op>, Enc_2410156 {
+let Inst{7-5} = 0b000;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b10001110010;
+let hasNewValue = 1;
+let opNewValue = 0;
+let prefersSlot3 = 1;
+let Constraints = "$Rx32 = $Rx32in";
+}
+def S2_asr_i_r_nac : HInst<
+(outs IntRegs:$Rx32),
+(ins IntRegs:$Rx32in, IntRegs:$Rs32, u5_0Imm:$Ii),
+"$Rx32 -= asr($Rs32,#$Ii)",
+S_2op_tc_2_SLOT23, TypeS_2op>, Enc_2410156 {
+let Inst{7-5} = 0b000;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b10001110000;
+let hasNewValue = 1;
+let opNewValue = 0;
+let prefersSlot3 = 1;
+let Constraints = "$Rx32 = $Rx32in";
+}
+def S2_asr_i_r_or : HInst<
+(outs IntRegs:$Rx32),
+(ins IntRegs:$Rx32in, IntRegs:$Rs32, u5_0Imm:$Ii),
+"$Rx32 |= asr($Rs32,#$Ii)",
+S_2op_tc_2_SLOT23, TypeS_2op>, Enc_2410156 {
+let Inst{7-5} = 0b100;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b10001110010;
+let hasNewValue = 1;
+let opNewValue = 0;
+let prefersSlot3 = 1;
+let Constraints = "$Rx32 = $Rx32in";
+}
+def S2_asr_i_r_rnd : HInst<
+(outs IntRegs:$Rd32),
+(ins IntRegs:$Rs32, u5_0Imm:$Ii),
+"$Rd32 = asr($Rs32,#$Ii):rnd",
+S_2op_tc_2_SLOT23, TypeS_2op>, Enc_2771456 {
+let Inst{7-5} = 0b000;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b10001100010;
+let hasNewValue = 1;
+let opNewValue = 0;
+let prefersSlot3 = 1;
+}
+def S2_asr_i_r_rnd_goodsyntax : HInst<
+(outs IntRegs:$Rd32),
+(ins IntRegs:$Rs32, u5_0Imm:$Ii),
+"$Rd32 = asrrnd($Rs32,#$Ii)",
+S_2op_tc_2_SLOT23, TypeS_2op> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isPseudo = 1;
+}
+def S2_asr_i_svw_trun : HInst<
+(outs IntRegs:$Rd32),
+(ins DoubleRegs:$Rss32, u5_0Imm:$Ii),
+"$Rd32 = vasrw($Rss32,#$Ii)",
+S_2op_tc_2_SLOT23, TypeS_2op>, Enc_2380082 {
+let Inst{7-5} = 0b010;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b10001000110;
+let hasNewValue = 1;
+let opNewValue = 0;
+}
+def S2_asr_i_vh : HInst<
+(outs DoubleRegs:$Rdd32),
+(ins DoubleRegs:$Rss32, u4_0Imm:$Ii),
+"$Rdd32 = vasrh($Rss32,#$Ii)",
+S_2op_tc_1_SLOT23, TypeS_2op>, Enc_2082775 {
+let Inst{7-5} = 0b000;
+let Inst{13-12} = 0b00;
+let Inst{31-21} = 0b10000000100;
+}
+def S2_asr_i_vw : HInst<
+(outs DoubleRegs:$Rdd32),
+(ins DoubleRegs:$Rss32, u5_0Imm:$Ii),
+"$Rdd32 = vasrw($Rss32,#$Ii)",
+S_2op_tc_1_SLOT23, TypeS_2op>, Enc_13201267 {
+let Inst{7-5} = 0b000;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b10000000010;
+}
+def S2_asr_r_p : HInst<
+(outs DoubleRegs:$Rdd32),
+(ins DoubleRegs:$Rss32, IntRegs:$Rt32),
+"$Rdd32 = asr($Rss32,$Rt32)",
+S_3op_tc_1_SLOT23, TypeS_3op>, Enc_8940892 {
+let Inst{7-5} = 0b000;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11000011100;
+}
+def S2_asr_r_p_acc : HInst<
+(outs DoubleRegs:$Rxx32),
+(ins DoubleRegs:$Rxx32in, DoubleRegs:$Rss32, IntRegs:$Rt32),
+"$Rxx32 += asr($Rss32,$Rt32)",
+S_3op_tc_2_SLOT23, TypeS_3op>, Enc_7912540 {
+let Inst{7-5} = 0b000;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11001011110;
+let prefersSlot3 = 1;
+let Constraints = "$Rxx32 = $Rxx32in";
+}
+def S2_asr_r_p_and : HInst<
+(outs DoubleRegs:$Rxx32),
+(ins DoubleRegs:$Rxx32in, DoubleRegs:$Rss32, IntRegs:$Rt32),
+"$Rxx32 &= asr($Rss32,$Rt32)",
+S_3op_tc_2_SLOT23, TypeS_3op>, Enc_7912540 {
+let Inst{7-5} = 0b000;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11001011010;
+let prefersSlot3 = 1;
+let Constraints = "$Rxx32 = $Rxx32in";
+}
+def S2_asr_r_p_nac : HInst<
+(outs DoubleRegs:$Rxx32),
+(ins DoubleRegs:$Rxx32in, DoubleRegs:$Rss32, IntRegs:$Rt32),
+"$Rxx32 -= asr($Rss32,$Rt32)",
+S_3op_tc_2_SLOT23, TypeS_3op>, Enc_7912540 {
+let Inst{7-5} = 0b000;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11001011100;
+let prefersSlot3 = 1;
+let Constraints = "$Rxx32 = $Rxx32in";
+}
+def S2_asr_r_p_or : HInst<
+(outs DoubleRegs:$Rxx32),
+(ins DoubleRegs:$Rxx32in, DoubleRegs:$Rss32, IntRegs:$Rt32),
+"$Rxx32 |= asr($Rss32,$Rt32)",
+S_3op_tc_2_SLOT23, TypeS_3op>, Enc_7912540 {
+let Inst{7-5} = 0b000;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11001011000;
+let prefersSlot3 = 1;
+let Constraints = "$Rxx32 = $Rxx32in";
+}
+def S2_asr_r_p_xor : HInst<
+(outs DoubleRegs:$Rxx32),
+(ins DoubleRegs:$Rxx32in, DoubleRegs:$Rss32, IntRegs:$Rt32),
+"$Rxx32 ^= asr($Rss32,$Rt32)",
+S_3op_tc_2_SLOT23, TypeS_3op>, Enc_7912540 {
+let Inst{7-5} = 0b000;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11001011011;
+let prefersSlot3 = 1;
+let Constraints = "$Rxx32 = $Rxx32in";
+}
+def S2_asr_r_r : HInst<
+(outs IntRegs:$Rd32),
+(ins IntRegs:$Rs32, IntRegs:$Rt32),
+"$Rd32 = asr($Rs32,$Rt32)",
+S_3op_tc_1_SLOT23, TypeS_3op>, Enc_14071773 {
+let Inst{7-5} = 0b000;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11000110010;
+let hasNewValue = 1;
+let opNewValue = 0;
+}
+def S2_asr_r_r_acc : HInst<
+(outs IntRegs:$Rx32),
+(ins IntRegs:$Rx32in, IntRegs:$Rs32, IntRegs:$Rt32),
+"$Rx32 += asr($Rs32,$Rt32)",
+S_3op_tc_2_SLOT23, TypeS_3op>, Enc_9223889 {
+let Inst{7-5} = 0b000;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11001100110;
+let hasNewValue = 1;
+let opNewValue = 0;
+let prefersSlot3 = 1;
+let Constraints = "$Rx32 = $Rx32in";
+}
+def S2_asr_r_r_and : HInst<
+(outs IntRegs:$Rx32),
+(ins IntRegs:$Rx32in, IntRegs:$Rs32, IntRegs:$Rt32),
+"$Rx32 &= asr($Rs32,$Rt32)",
+S_3op_tc_2_SLOT23, TypeS_3op>, Enc_9223889 {
+let Inst{7-5} = 0b000;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11001100010;
+let hasNewValue = 1;
+let opNewValue = 0;
+let prefersSlot3 = 1;
+let Constraints = "$Rx32 = $Rx32in";
+}
+def S2_asr_r_r_nac : HInst<
+(outs IntRegs:$Rx32),
+(ins IntRegs:$Rx32in, IntRegs:$Rs32, IntRegs:$Rt32),
+"$Rx32 -= asr($Rs32,$Rt32)",
+S_3op_tc_2_SLOT23, TypeS_3op>, Enc_9223889 {
+let Inst{7-5} = 0b000;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11001100100;
+let hasNewValue = 1;
+let opNewValue = 0;
+let prefersSlot3 = 1;
+let Constraints = "$Rx32 = $Rx32in";
+}
+def S2_asr_r_r_or : HInst<
+(outs IntRegs:$Rx32),
+(ins IntRegs:$Rx32in, IntRegs:$Rs32, IntRegs:$Rt32),
+"$Rx32 |= asr($Rs32,$Rt32)",
+S_3op_tc_2_SLOT23, TypeS_3op>, Enc_9223889 {
+let Inst{7-5} = 0b000;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11001100000;
+let hasNewValue = 1;
+let opNewValue = 0;
+let prefersSlot3 = 1;
+let Constraints = "$Rx32 = $Rx32in";
+}
+def S2_asr_r_r_sat : HInst<
+(outs IntRegs:$Rd32),
+(ins IntRegs:$Rs32, IntRegs:$Rt32),
+"$Rd32 = asr($Rs32,$Rt32):sat",
+S_3op_tc_2_SLOT23, TypeS_3op>, Enc_14071773 {
+let Inst{7-5} = 0b000;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11000110000;
+let hasNewValue = 1;
+let opNewValue = 0;
+let Defs = [USR_OVF];
+}
+def S2_asr_r_svw_trun : HInst<
+(outs IntRegs:$Rd32),
+(ins DoubleRegs:$Rss32, IntRegs:$Rt32),
+"$Rd32 = vasrw($Rss32,$Rt32)",
+S_3op_tc_1_SLOT23, TypeS_3op>, Enc_14287645 {
+let Inst{7-5} = 0b010;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11000101000;
+let hasNewValue = 1;
+let opNewValue = 0;
+}
+def S2_asr_r_vh : HInst<
+(outs DoubleRegs:$Rdd32),
+(ins DoubleRegs:$Rss32, IntRegs:$Rt32),
+"$Rdd32 = vasrh($Rss32,$Rt32)",
+S_3op_tc_1_SLOT23, TypeS_3op>, Enc_8940892 {
+let Inst{7-5} = 0b000;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11000011010;
+}
+def S2_asr_r_vw : HInst<
+(outs DoubleRegs:$Rdd32),
+(ins DoubleRegs:$Rss32, IntRegs:$Rt32),
+"$Rdd32 = vasrw($Rss32,$Rt32)",
+S_3op_tc_1_SLOT23, TypeS_3op>, Enc_8940892 {
+let Inst{7-5} = 0b000;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11000011000;
+}
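+// Unary bit ops: brev (bit reverse), cl0/cl1/clb (count leading
+// zeros/ones/bits), ct0/ct1 (count trailing zeros/ones); the ...p variants
+// operate on 64-bit pairs.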
+def S2_brev : HInst<
+(outs IntRegs:$Rd32),
+(ins IntRegs:$Rs32),
+"$Rd32 = brev($Rs32)",
+S_2op_tc_2_SLOT23, TypeS_2op>, Enc_4075554 {
+let Inst{13-5} = 0b000000110;
+let Inst{31-21} = 0b10001100010;
+let hasNewValue = 1;
+let opNewValue = 0;
+}
+def S2_brevp : HInst<
+(outs DoubleRegs:$Rdd32),
+(ins DoubleRegs:$Rss32),
+"$Rdd32 = brev($Rss32)",
+S_2op_tc_1_SLOT23, TypeS_2op>, Enc_13133231 {
+let Inst{13-5} = 0b000000110;
+let Inst{31-21} = 0b10000000110;
+}
+def S2_cabacdecbin : HInst<
+(outs DoubleRegs:$Rdd32),
+(ins DoubleRegs:$Rss32, DoubleRegs:$Rtt32),
+"$Rdd32 = decbin($Rss32,$Rtt32)",
+S_3op_tc_1_SLOT23, TypeS_3op>, Enc_8333157 {
+let Inst{7-5} = 0b110;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11000001110;
+let isPredicateLate = 1;
+let prefersSlot3 = 1;
+let Defs = [P0];
+}
+def S2_cl0 : HInst<
+(outs IntRegs:$Rd32),
+(ins IntRegs:$Rs32),
+"$Rd32 = cl0($Rs32)",
+S_2op_tc_1_SLOT23, TypeS_2op>, Enc_4075554 {
+let Inst{13-5} = 0b000000101;
+let Inst{31-21} = 0b10001100000;
+let hasNewValue = 1;
+let opNewValue = 0;
+}
+def S2_cl0p : HInst<
+(outs IntRegs:$Rd32),
+(ins DoubleRegs:$Rss32),
+"$Rd32 = cl0($Rss32)",
+S_2op_tc_1_SLOT23, TypeS_2op>, Enc_3742184 {
+let Inst{13-5} = 0b000000010;
+let Inst{31-21} = 0b10001000010;
+let hasNewValue = 1;
+let opNewValue = 0;
+}
+def S2_cl1 : HInst<
+(outs IntRegs:$Rd32),
+(ins IntRegs:$Rs32),
+"$Rd32 = cl1($Rs32)",
+S_2op_tc_1_SLOT23, TypeS_2op>, Enc_4075554 {
+let Inst{13-5} = 0b000000110;
+let Inst{31-21} = 0b10001100000;
+let hasNewValue = 1;
+let opNewValue = 0;
+}
+def S2_cl1p : HInst<
+(outs IntRegs:$Rd32),
+(ins DoubleRegs:$Rss32),
+"$Rd32 = cl1($Rss32)",
+S_2op_tc_1_SLOT23, TypeS_2op>, Enc_3742184 {
+let Inst{13-5} = 0b000000100;
+let Inst{31-21} = 0b10001000010;
+let hasNewValue = 1;
+let opNewValue = 0;
+}
+def S2_clb : HInst<
+(outs IntRegs:$Rd32),
+(ins IntRegs:$Rs32),
+"$Rd32 = clb($Rs32)",
+S_2op_tc_1_SLOT23, TypeS_2op>, Enc_4075554 {
+let Inst{13-5} = 0b000000100;
+let Inst{31-21} = 0b10001100000;
+let hasNewValue = 1;
+let opNewValue = 0;
+}
+def S2_clbnorm : HInst<
+(outs IntRegs:$Rd32),
+(ins IntRegs:$Rs32),
+"$Rd32 = normamt($Rs32)",
+S_2op_tc_1_SLOT23, TypeS_2op>, Enc_4075554 {
+let Inst{13-5} = 0b000000111;
+let Inst{31-21} = 0b10001100000;
+let hasNewValue = 1;
+let opNewValue = 0;
+}
+def S2_clbp : HInst<
+(outs IntRegs:$Rd32),
+(ins DoubleRegs:$Rss32),
+"$Rd32 = clb($Rss32)",
+S_2op_tc_1_SLOT23, TypeS_2op>, Enc_3742184 {
+let Inst{13-5} = 0b000000000;
+let Inst{31-21} = 0b10001000010;
+let hasNewValue = 1;
+let opNewValue = 0;
+}
+def S2_clrbit_i : HInst<
+(outs IntRegs:$Rd32),
+(ins IntRegs:$Rs32, u5_0Imm:$Ii),
+"$Rd32 = clrbit($Rs32,#$Ii)",
+S_2op_tc_1_SLOT23, TypeS_2op>, Enc_2771456 {
+let Inst{7-5} = 0b001;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b10001100110;
+let hasNewValue = 1;
+let opNewValue = 0;
+}
+def S2_clrbit_r : HInst<
+(outs IntRegs:$Rd32),
+(ins IntRegs:$Rs32, IntRegs:$Rt32),
+"$Rd32 = clrbit($Rs32,$Rt32)",
+S_3op_tc_1_SLOT23, TypeS_3op>, Enc_14071773 {
+let Inst{7-5} = 0b010;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11000110100;
+let hasNewValue = 1;
+let opNewValue = 0;
+}
+def S2_ct0 : HInst<
+(outs IntRegs:$Rd32),
+(ins IntRegs:$Rs32),
+"$Rd32 = ct0($Rs32)",
+S_2op_tc_1_SLOT23, TypeS_2op>, Enc_4075554 {
+let Inst{13-5} = 0b000000100;
+let Inst{31-21} = 0b10001100010;
+let hasNewValue = 1;
+let opNewValue = 0;
+}
+def S2_ct0p : HInst<
+(outs IntRegs:$Rd32),
+(ins DoubleRegs:$Rss32),
+"$Rd32 = ct0($Rss32)",
+S_2op_tc_1_SLOT23, TypeS_2op>, Enc_3742184 {
+let Inst{13-5} = 0b000000010;
+let Inst{31-21} = 0b10001000111;
+let hasNewValue = 1;
+let opNewValue = 0;
+}
+def S2_ct1 : HInst<
+(outs IntRegs:$Rd32),
+(ins IntRegs:$Rs32),
+"$Rd32 = ct1($Rs32)",
+S_2op_tc_1_SLOT23, TypeS_2op>, Enc_4075554 {
+let Inst{13-5} = 0b000000101;
+let Inst{31-21} = 0b10001100010;
+let hasNewValue = 1;
+let opNewValue = 0;
+}
+def S2_ct1p : HInst<
+(outs IntRegs:$Rd32),
+(ins DoubleRegs:$Rss32),
+"$Rd32 = ct1($Rss32)",
+S_2op_tc_1_SLOT23, TypeS_2op>, Enc_3742184 {
+let Inst{13-5} = 0b000000100;
+let Inst{31-21} = 0b10001000111;
+let hasNewValue = 1;
+let opNewValue = 0;
+}
+def S2_deinterleave : HInst<
+(outs DoubleRegs:$Rdd32),
+(ins DoubleRegs:$Rss32),
+"$Rdd32 = deinterleave($Rss32)",
+S_2op_tc_1_SLOT23, TypeS_2op>, Enc_13133231 {
+let Inst{13-5} = 0b000000100;
+let Inst{31-21} = 0b10000000110;
+}
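+// Bit-field extract/insert: the immediate forms encode the field with two
+// immediates (#$Ii,#$II); the _rp forms take it from a register pair Rtt.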
+def S2_extractu : HInst<
+(outs IntRegs:$Rd32),
+(ins IntRegs:$Rs32, u5_0Imm:$Ii, u5_0Imm:$II),
+"$Rd32 = extractu($Rs32,#$Ii,#$II)",
+S_2op_tc_2_SLOT23, TypeS_2op>, Enc_11930928 {
+let Inst{13-13} = 0b0;
+let Inst{31-23} = 0b100011010;
+let hasNewValue = 1;
+let opNewValue = 0;
+let prefersSlot3 = 1;
+}
+def S2_extractu_rp : HInst<
+(outs IntRegs:$Rd32),
+(ins IntRegs:$Rs32, DoubleRegs:$Rtt32),
+"$Rd32 = extractu($Rs32,$Rtt32)",
+S_3op_tc_2_SLOT23, TypeS_3op>, Enc_15472748 {
+let Inst{7-5} = 0b000;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11001001000;
+let hasNewValue = 1;
+let opNewValue = 0;
+let prefersSlot3 = 1;
+}
+def S2_extractup : HInst<
+(outs DoubleRegs:$Rdd32),
+(ins DoubleRegs:$Rss32, u6_0Imm:$Ii, u6_0Imm:$II),
+"$Rdd32 = extractu($Rss32,#$Ii,#$II)",
+S_2op_tc_2_SLOT23, TypeS_2op>, Enc_9894557 {
+let Inst{31-24} = 0b10000001;
+let prefersSlot3 = 1;
+}
+def S2_extractup_rp : HInst<
+(outs DoubleRegs:$Rdd32),
+(ins DoubleRegs:$Rss32, DoubleRegs:$Rtt32),
+"$Rdd32 = extractu($Rss32,$Rtt32)",
+S_3op_tc_2_SLOT23, TypeS_3op>, Enc_8333157 {
+let Inst{7-5} = 0b000;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11000001000;
+let prefersSlot3 = 1;
+}
+def S2_insert : HInst<
+(outs IntRegs:$Rx32),
+(ins IntRegs:$Rx32in, IntRegs:$Rs32, u5_0Imm:$Ii, u5_0Imm:$II),
+"$Rx32 = insert($Rs32,#$Ii,#$II)",
+S_2op_tc_2_SLOT23, TypeS_2op>, Enc_2880796 {
+let Inst{13-13} = 0b0;
+let Inst{31-23} = 0b100011110;
+let hasNewValue = 1;
+let opNewValue = 0;
+let Constraints = "$Rx32 = $Rx32in";
+}
+def S2_insert_rp : HInst<
+(outs IntRegs:$Rx32),
+(ins IntRegs:$Rx32in, IntRegs:$Rs32, DoubleRegs:$Rtt32),
+"$Rx32 = insert($Rs32,$Rtt32)",
+S_3op_tc_1_SLOT23, TypeS_3op>, Enc_16311032 {
+let Inst{7-5} = 0b000;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11001000000;
+let hasNewValue = 1;
+let opNewValue = 0;
+let Constraints = "$Rx32 = $Rx32in";
+}
+def S2_insertp : HInst<
+(outs DoubleRegs:$Rxx32),
+(ins DoubleRegs:$Rxx32in, DoubleRegs:$Rss32, u6_0Imm:$Ii, u6_0Imm:$II),
+"$Rxx32 = insert($Rss32,#$Ii,#$II)",
+S_2op_tc_2_SLOT23, TypeS_2op>, Enc_631197 {
+let Inst{31-24} = 0b10000011;
+let Constraints = "$Rxx32 = $Rxx32in";
+}
+def S2_insertp_rp : HInst<
+(outs DoubleRegs:$Rxx32),
+(ins DoubleRegs:$Rxx32in, DoubleRegs:$Rss32, DoubleRegs:$Rtt32),
+"$Rxx32 = insert($Rss32,$Rtt32)",
+S_3op_tc_1_SLOT23, TypeS_3op>, Enc_12702821 {
+let Inst{7-5} = 0b000;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11001010000;
+let Constraints = "$Rxx32 = $Rxx32in";
+}
+def S2_interleave : HInst<
+(outs DoubleRegs:$Rdd32),
+(ins DoubleRegs:$Rss32),
+"$Rdd32 = interleave($Rss32)",
+S_2op_tc_1_SLOT23, TypeS_2op>, Enc_13133231 {
+let Inst{13-5} = 0b000000101;
+let Inst{31-21} = 0b10000000110;
+}
+def S2_lfsp : HInst<
+(outs DoubleRegs:$Rdd32),
+(ins DoubleRegs:$Rss32, DoubleRegs:$Rtt32),
+"$Rdd32 = lfs($Rss32,$Rtt32)",
+S_3op_tc_2_SLOT23, TypeS_3op>, Enc_8333157 {
+let Inst{7-5} = 0b110;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11000001100;
+let prefersSlot3 = 1;
+}
+def S2_lsl_r_p : HInst<
+(outs DoubleRegs:$Rdd32),
+(ins DoubleRegs:$Rss32, IntRegs:$Rt32),
+"$Rdd32 = lsl($Rss32,$Rt32)",
+S_3op_tc_1_SLOT23, TypeS_3op>, Enc_8940892 {
+let Inst{7-5} = 0b110;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11000011100;
+}
+def S2_lsl_r_p_acc : HInst<
+(outs DoubleRegs:$Rxx32),
+(ins DoubleRegs:$Rxx32in, DoubleRegs:$Rss32, IntRegs:$Rt32),
+"$Rxx32 += lsl($Rss32,$Rt32)",
+S_3op_tc_2_SLOT23, TypeS_3op>, Enc_7912540 {
+let Inst{7-5} = 0b110;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11001011110;
+let prefersSlot3 = 1;
+let Constraints = "$Rxx32 = $Rxx32in";
+}
+def S2_lsl_r_p_and : HInst<
+(outs DoubleRegs:$Rxx32),
+(ins DoubleRegs:$Rxx32in, DoubleRegs:$Rss32, IntRegs:$Rt32),
+"$Rxx32 &= lsl($Rss32,$Rt32)",
+S_3op_tc_2_SLOT23, TypeS_3op>, Enc_7912540 {
+let Inst{7-5} = 0b110;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11001011010;
+let prefersSlot3 = 1;
+let Constraints = "$Rxx32 = $Rxx32in";
+}
+def S2_lsl_r_p_nac : HInst<
+(outs DoubleRegs:$Rxx32),
+(ins DoubleRegs:$Rxx32in, DoubleRegs:$Rss32, IntRegs:$Rt32),
+"$Rxx32 -= lsl($Rss32,$Rt32)",
+S_3op_tc_2_SLOT23, TypeS_3op>, Enc_7912540 {
+let Inst{7-5} = 0b110;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11001011100;
+let prefersSlot3 = 1;
+let Constraints = "$Rxx32 = $Rxx32in";
+}
+def S2_lsl_r_p_or : HInst<
+(outs DoubleRegs:$Rxx32),
+(ins DoubleRegs:$Rxx32in, DoubleRegs:$Rss32, IntRegs:$Rt32),
+"$Rxx32 |= lsl($Rss32,$Rt32)",
+S_3op_tc_2_SLOT23, TypeS_3op>, Enc_7912540 {
+let Inst{7-5} = 0b110;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11001011000;
+let prefersSlot3 = 1;
+let Constraints = "$Rxx32 = $Rxx32in";
+}
+def S2_lsl_r_p_xor : HInst<
+(outs DoubleRegs:$Rxx32),
+(ins DoubleRegs:$Rxx32in, DoubleRegs:$Rss32, IntRegs:$Rt32),
+"$Rxx32 ^= lsl($Rss32,$Rt32)",
+S_3op_tc_2_SLOT23, TypeS_3op>, Enc_7912540 {
+let Inst{7-5} = 0b110;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11001011011;
+let prefersSlot3 = 1;
+let Constraints = "$Rxx32 = $Rxx32in";
+}
+def S2_lsl_r_r : HInst<
+(outs IntRegs:$Rd32),
+(ins IntRegs:$Rs32, IntRegs:$Rt32),
+"$Rd32 = lsl($Rs32,$Rt32)",
+S_3op_tc_1_SLOT23, TypeS_3op>, Enc_14071773 {
+let Inst{7-5} = 0b110;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11000110010;
+let hasNewValue = 1;
+let opNewValue = 0;
+}
+def S2_lsl_r_r_acc : HInst<
+(outs IntRegs:$Rx32),
+(ins IntRegs:$Rx32in, IntRegs:$Rs32, IntRegs:$Rt32),
+"$Rx32 += lsl($Rs32,$Rt32)",
+S_3op_tc_2_SLOT23, TypeS_3op>, Enc_9223889 {
+let Inst{7-5} = 0b110;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11001100110;
+let hasNewValue = 1;
+let opNewValue = 0;
+let prefersSlot3 = 1;
+let Constraints = "$Rx32 = $Rx32in";
+}
+def S2_lsl_r_r_and : HInst<
+(outs IntRegs:$Rx32),
+(ins IntRegs:$Rx32in, IntRegs:$Rs32, IntRegs:$Rt32),
+"$Rx32 &= lsl($Rs32,$Rt32)",
+S_3op_tc_2_SLOT23, TypeS_3op>, Enc_9223889 {
+let Inst{7-5} = 0b110;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11001100010;
+let hasNewValue = 1;
+let opNewValue = 0;
+let prefersSlot3 = 1;
+let Constraints = "$Rx32 = $Rx32in";
+}
+def S2_lsl_r_r_nac : HInst<
+(outs IntRegs:$Rx32),
+(ins IntRegs:$Rx32in, IntRegs:$Rs32, IntRegs:$Rt32),
+"$Rx32 -= lsl($Rs32,$Rt32)",
+S_3op_tc_2_SLOT23, TypeS_3op>, Enc_9223889 {
+let Inst{7-5} = 0b110;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11001100100;
+let hasNewValue = 1;
+let opNewValue = 0;
+let prefersSlot3 = 1;
+let Constraints = "$Rx32 = $Rx32in";
+}
+def S2_lsl_r_r_or : HInst<
+(outs IntRegs:$Rx32),
+(ins IntRegs:$Rx32in, IntRegs:$Rs32, IntRegs:$Rt32),
+"$Rx32 |= lsl($Rs32,$Rt32)",
+S_3op_tc_2_SLOT23, TypeS_3op>, Enc_9223889 {
+let Inst{7-5} = 0b110;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11001100000;
+let hasNewValue = 1;
+let opNewValue = 0;
+let prefersSlot3 = 1;
+let Constraints = "$Rx32 = $Rx32in";
+}
+def S2_lsl_r_vh : HInst<
+(outs DoubleRegs:$Rdd32),
+(ins DoubleRegs:$Rss32, IntRegs:$Rt32),
+"$Rdd32 = vlslh($Rss32,$Rt32)",
+S_3op_tc_1_SLOT23, TypeS_3op>, Enc_8940892 {
+let Inst{7-5} = 0b110;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11000011010;
+}
+def S2_lsl_r_vw : HInst<
+(outs DoubleRegs:$Rdd32),
+(ins DoubleRegs:$Rss32, IntRegs:$Rt32),
+"$Rdd32 = vlslw($Rss32,$Rt32)",
+S_3op_tc_1_SLOT23, TypeS_3op>, Enc_8940892 {
+let Inst{7-5} = 0b110;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11000011000;
+}
+def S2_lsr_i_p : HInst<
+(outs DoubleRegs:$Rdd32),
+(ins DoubleRegs:$Rss32, u6_0Imm:$Ii),
+"$Rdd32 = lsr($Rss32,#$Ii)",
+S_2op_tc_1_SLOT23, TypeS_2op>, Enc_4231995 {
+let Inst{7-5} = 0b001;
+let Inst{31-21} = 0b10000000000;
+}
+def S2_lsr_i_p_acc : HInst<
+(outs DoubleRegs:$Rxx32),
+(ins DoubleRegs:$Rxx32in, DoubleRegs:$Rss32, u6_0Imm:$Ii),
+"$Rxx32 += lsr($Rss32,#$Ii)",
+S_2op_tc_2_SLOT23, TypeS_2op>, Enc_8497723 {
+let Inst{7-5} = 0b101;
+let Inst{31-21} = 0b10000010000;
+let prefersSlot3 = 1;
+let Constraints = "$Rxx32 = $Rxx32in";
+}
+def S2_lsr_i_p_and : HInst<
+(outs DoubleRegs:$Rxx32),
+(ins DoubleRegs:$Rxx32in, DoubleRegs:$Rss32, u6_0Imm:$Ii),
+"$Rxx32 &= lsr($Rss32,#$Ii)",
+S_2op_tc_2_SLOT23, TypeS_2op>, Enc_8497723 {
+let Inst{7-5} = 0b001;
+let Inst{31-21} = 0b10000010010;
+let prefersSlot3 = 1;
+let Constraints = "$Rxx32 = $Rxx32in";
+}
+def S2_lsr_i_p_nac : HInst<
+(outs DoubleRegs:$Rxx32),
+(ins DoubleRegs:$Rxx32in, DoubleRegs:$Rss32, u6_0Imm:$Ii),
+"$Rxx32 -= lsr($Rss32,#$Ii)",
+S_2op_tc_2_SLOT23, TypeS_2op>, Enc_8497723 {
+let Inst{7-5} = 0b001;
+let Inst{31-21} = 0b10000010000;
+let prefersSlot3 = 1;
+let Constraints = "$Rxx32 = $Rxx32in";
+}
+def S2_lsr_i_p_or : HInst<
+(outs DoubleRegs:$Rxx32),
+(ins DoubleRegs:$Rxx32in, DoubleRegs:$Rss32, u6_0Imm:$Ii),
+"$Rxx32 |= lsr($Rss32,#$Ii)",
+S_2op_tc_2_SLOT23, TypeS_2op>, Enc_8497723 {
+let Inst{7-5} = 0b101;
+let Inst{31-21} = 0b10000010010;
+let prefersSlot3 = 1;
+let Constraints = "$Rxx32 = $Rxx32in";
+}
+def S2_lsr_i_p_xacc : HInst<
+(outs DoubleRegs:$Rxx32),
+(ins DoubleRegs:$Rxx32in, DoubleRegs:$Rss32, u6_0Imm:$Ii),
+"$Rxx32 ^= lsr($Rss32,#$Ii)",
+S_2op_tc_2_SLOT23, TypeS_2op>, Enc_8497723 {
+let Inst{7-5} = 0b001;
+let Inst{31-21} = 0b10000010100;
+let prefersSlot3 = 1;
+let Constraints = "$Rxx32 = $Rxx32in";
+}
+def S2_lsr_i_r : HInst<
+(outs IntRegs:$Rd32),
+(ins IntRegs:$Rs32, u5_0Imm:$Ii),
+"$Rd32 = lsr($Rs32,#$Ii)",
+S_2op_tc_1_SLOT23, TypeS_2op>, Enc_2771456 {
+let Inst{7-5} = 0b001;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b10001100000;
+let hasNewValue = 1;
+let opNewValue = 0;
+}
+def S2_lsr_i_r_acc : HInst<
+(outs IntRegs:$Rx32),
+(ins IntRegs:$Rx32in, IntRegs:$Rs32, u5_0Imm:$Ii),
+"$Rx32 += lsr($Rs32,#$Ii)",
+S_2op_tc_2_SLOT23, TypeS_2op>, Enc_2410156 {
+let Inst{7-5} = 0b101;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b10001110000;
+let hasNewValue = 1;
+let opNewValue = 0;
+let prefersSlot3 = 1;
+let Constraints = "$Rx32 = $Rx32in";
+}
+def S2_lsr_i_r_and : HInst<
+(outs IntRegs:$Rx32),
+(ins IntRegs:$Rx32in, IntRegs:$Rs32, u5_0Imm:$Ii),
+"$Rx32 &= lsr($Rs32,#$Ii)",
+S_2op_tc_2_SLOT23, TypeS_2op>, Enc_2410156 {
+let Inst{7-5} = 0b001;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b10001110010;
+let hasNewValue = 1;
+let opNewValue = 0;
+let prefersSlot3 = 1;
+let Constraints = "$Rx32 = $Rx32in";
+}
+def S2_lsr_i_r_nac : HInst<
+(outs IntRegs:$Rx32),
+(ins IntRegs:$Rx32in, IntRegs:$Rs32, u5_0Imm:$Ii),
+"$Rx32 -= lsr($Rs32,#$Ii)",
+S_2op_tc_2_SLOT23, TypeS_2op>, Enc_2410156 {
+let Inst{7-5} = 0b001;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b10001110000;
+let hasNewValue = 1;
+let opNewValue = 0;
+let prefersSlot3 = 1;
+let Constraints = "$Rx32 = $Rx32in";
+}
+def S2_lsr_i_r_or : HInst<
+(outs IntRegs:$Rx32),
+(ins IntRegs:$Rx32in, IntRegs:$Rs32, u5_0Imm:$Ii),
+"$Rx32 |= lsr($Rs32,#$Ii)",
+S_2op_tc_2_SLOT23, TypeS_2op>, Enc_2410156 {
+let Inst{7-5} = 0b101;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b10001110010;
+let hasNewValue = 1;
+let opNewValue = 0;
+let prefersSlot3 = 1;
+let Constraints = "$Rx32 = $Rx32in";
+}
+def S2_lsr_i_r_xacc : HInst<
+(outs IntRegs:$Rx32),
+(ins IntRegs:$Rx32in, IntRegs:$Rs32, u5_0Imm:$Ii),
+"$Rx32 ^= lsr($Rs32,#$Ii)",
+S_2op_tc_2_SLOT23, TypeS_2op>, Enc_2410156 {
+let Inst{7-5} = 0b001;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b10001110100;
+let hasNewValue = 1;
+let opNewValue = 0;
+let prefersSlot3 = 1;
+let Constraints = "$Rx32 = $Rx32in";
+}
+def S2_lsr_i_vh : HInst<
+(outs DoubleRegs:$Rdd32),
+(ins DoubleRegs:$Rss32, u4_0Imm:$Ii),
+"$Rdd32 = vlsrh($Rss32,#$Ii)",
+S_2op_tc_1_SLOT23, TypeS_2op>, Enc_2082775 {
+let Inst{7-5} = 0b001;
+let Inst{13-12} = 0b00;
+let Inst{31-21} = 0b10000000100;
+}
+def S2_lsr_i_vw : HInst<
+(outs DoubleRegs:$Rdd32),
+(ins DoubleRegs:$Rss32, u5_0Imm:$Ii),
+"$Rdd32 = vlsrw($Rss32,#$Ii)",
+S_2op_tc_1_SLOT23, TypeS_2op>, Enc_13201267 {
+let Inst{7-5} = 0b001;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b10000000010;
+}
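+// S2_lsr_r_*: the same lsr family with the shift amount taken from register
+// $Rt32 rather than an immediate, again with accumulating and logical
+// read-modify-write variants tied back by "$Rx32 = $Rx32in".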
+def S2_lsr_r_p : HInst<
+(outs DoubleRegs:$Rdd32),
+(ins DoubleRegs:$Rss32, IntRegs:$Rt32),
+"$Rdd32 = lsr($Rss32,$Rt32)",
+S_3op_tc_1_SLOT23, TypeS_3op>, Enc_8940892 {
+let Inst{7-5} = 0b010;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11000011100;
+}
+def S2_lsr_r_p_acc : HInst<
+(outs DoubleRegs:$Rxx32),
+(ins DoubleRegs:$Rxx32in, DoubleRegs:$Rss32, IntRegs:$Rt32),
+"$Rxx32 += lsr($Rss32,$Rt32)",
+S_3op_tc_2_SLOT23, TypeS_3op>, Enc_7912540 {
+let Inst{7-5} = 0b010;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11001011110;
+let prefersSlot3 = 1;
+let Constraints = "$Rxx32 = $Rxx32in";
+}
+def S2_lsr_r_p_and : HInst<
+(outs DoubleRegs:$Rxx32),
+(ins DoubleRegs:$Rxx32in, DoubleRegs:$Rss32, IntRegs:$Rt32),
+"$Rxx32 &= lsr($Rss32,$Rt32)",
+S_3op_tc_2_SLOT23, TypeS_3op>, Enc_7912540 {
+let Inst{7-5} = 0b010;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11001011010;
+let prefersSlot3 = 1;
+let Constraints = "$Rxx32 = $Rxx32in";
+}
+def S2_lsr_r_p_nac : HInst<
+(outs DoubleRegs:$Rxx32),
+(ins DoubleRegs:$Rxx32in, DoubleRegs:$Rss32, IntRegs:$Rt32),
+"$Rxx32 -= lsr($Rss32,$Rt32)",
+S_3op_tc_2_SLOT23, TypeS_3op>, Enc_7912540 {
+let Inst{7-5} = 0b010;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11001011100;
+let prefersSlot3 = 1;
+let Constraints = "$Rxx32 = $Rxx32in";
+}
+def S2_lsr_r_p_or : HInst<
+(outs DoubleRegs:$Rxx32),
+(ins DoubleRegs:$Rxx32in, DoubleRegs:$Rss32, IntRegs:$Rt32),
+"$Rxx32 |= lsr($Rss32,$Rt32)",
+S_3op_tc_2_SLOT23, TypeS_3op>, Enc_7912540 {
+let Inst{7-5} = 0b010;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11001011000;
+let prefersSlot3 = 1;
+let Constraints = "$Rxx32 = $Rxx32in";
+}
+def S2_lsr_r_p_xor : HInst<
+(outs DoubleRegs:$Rxx32),
+(ins DoubleRegs:$Rxx32in, DoubleRegs:$Rss32, IntRegs:$Rt32),
+"$Rxx32 ^= lsr($Rss32,$Rt32)",
+S_3op_tc_2_SLOT23, TypeS_3op>, Enc_7912540 {
+let Inst{7-5} = 0b010;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11001011011;
+let prefersSlot3 = 1;
+let Constraints = "$Rxx32 = $Rxx32in";
+}
+def S2_lsr_r_r : HInst<
+(outs IntRegs:$Rd32),
+(ins IntRegs:$Rs32, IntRegs:$Rt32),
+"$Rd32 = lsr($Rs32,$Rt32)",
+S_3op_tc_1_SLOT23, TypeS_3op>, Enc_14071773 {
+let Inst{7-5} = 0b010;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11000110010;
+let hasNewValue = 1;
+let opNewValue = 0;
+}
+def S2_lsr_r_r_acc : HInst<
+(outs IntRegs:$Rx32),
+(ins IntRegs:$Rx32in, IntRegs:$Rs32, IntRegs:$Rt32),
+"$Rx32 += lsr($Rs32,$Rt32)",
+S_3op_tc_2_SLOT23, TypeS_3op>, Enc_9223889 {
+let Inst{7-5} = 0b010;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11001100110;
+let hasNewValue = 1;
+let opNewValue = 0;
+let prefersSlot3 = 1;
+let Constraints = "$Rx32 = $Rx32in";
+}
+def S2_lsr_r_r_and : HInst<
+(outs IntRegs:$Rx32),
+(ins IntRegs:$Rx32in, IntRegs:$Rs32, IntRegs:$Rt32),
+"$Rx32 &= lsr($Rs32,$Rt32)",
+S_3op_tc_2_SLOT23, TypeS_3op>, Enc_9223889 {
+let Inst{7-5} = 0b010;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11001100010;
+let hasNewValue = 1;
+let opNewValue = 0;
+let prefersSlot3 = 1;
+let Constraints = "$Rx32 = $Rx32in";
+}
+def S2_lsr_r_r_nac : HInst<
+(outs IntRegs:$Rx32),
+(ins IntRegs:$Rx32in, IntRegs:$Rs32, IntRegs:$Rt32),
+"$Rx32 -= lsr($Rs32,$Rt32)",
+S_3op_tc_2_SLOT23, TypeS_3op>, Enc_9223889 {
+let Inst{7-5} = 0b010;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11001100100;
+let hasNewValue = 1;
+let opNewValue = 0;
+let prefersSlot3 = 1;
+let Constraints = "$Rx32 = $Rx32in";
+}
+def S2_lsr_r_r_or : HInst<
+(outs IntRegs:$Rx32),
+(ins IntRegs:$Rx32in, IntRegs:$Rs32, IntRegs:$Rt32),
+"$Rx32 |= lsr($Rs32,$Rt32)",
+S_3op_tc_2_SLOT23, TypeS_3op>, Enc_9223889 {
+let Inst{7-5} = 0b010;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11001100000;
+let hasNewValue = 1;
+let opNewValue = 0;
+let prefersSlot3 = 1;
+let Constraints = "$Rx32 = $Rx32in";
+}
+def S2_lsr_r_vh : HInst<
+(outs DoubleRegs:$Rdd32),
+(ins DoubleRegs:$Rss32, IntRegs:$Rt32),
+"$Rdd32 = vlsrh($Rss32,$Rt32)",
+S_3op_tc_1_SLOT23, TypeS_3op>, Enc_8940892 {
+let Inst{7-5} = 0b010;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11000011010;
+}
+def S2_lsr_r_vw : HInst<
+(outs DoubleRegs:$Rdd32),
+(ins DoubleRegs:$Rss32, IntRegs:$Rt32),
+"$Rdd32 = vlsrw($Rss32,$Rt32)",
+S_3op_tc_1_SLOT23, TypeS_3op>, Enc_8940892 {
+let Inst{7-5} = 0b010;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11000011000;
+}
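+// S2_packhl packs halfwords of $Rs32 and $Rt32 into the 64-bit pair $Rdd32;
+// S2_parityp reduces its two 64-bit sources to a parity bit in $Rd32.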
+def S2_packhl : HInst<
+(outs DoubleRegs:$Rdd32),
+(ins IntRegs:$Rs32, IntRegs:$Rt32),
+"$Rdd32 = packhl($Rs32,$Rt32)",
+ALU32_3op_tc_1_SLOT0123, TypeALU32_3op>, Enc_1997594 {
+let Inst{7-5} = 0b000;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11110101100;
+let InputType = "reg";
+}
+def S2_parityp : HInst<
+(outs IntRegs:$Rd32),
+(ins DoubleRegs:$Rss32, DoubleRegs:$Rtt32),
+"$Rd32 = parity($Rss32,$Rtt32)",
+ALU64_tc_2_SLOT23, TypeALU64>, Enc_9277990 {
+let Inst{7-5} = 0b000;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11010000000;
+let hasNewValue = 1;
+let opNewValue = 0;
+let prefersSlot3 = 1;
+}
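+// Predicated stores. The names encode the element size (b/d/h/i = byte,
+// doubleword, halfword, word; f = the high halfword, $Rt32.h), the predicate
+// sense (t/f on $Pv4), and the addressing mode (_io = base+offset, _pi =
+// post-increment). A "new" after the size letter marks a new-value store of
+// $Nt8.new; a "new" before the mode suffix marks a .new predicate. The
+// _zomap defs are isPseudo/isCodeGenOnly mappings for the zero-offset form.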
+def S2_pstorerbf_io : HInst<
+(outs),
+(ins PredRegs:$Pv4, IntRegs:$Rs32, u32_0Imm:$Ii, IntRegs:$Rt32),
+"if (!$Pv4) memb($Rs32+#$Ii) = $Rt32",
+V2LDST_tc_st_SLOT01, TypeV2LDST>, Enc_14044877, AddrModeRel {
+let Inst{2-2} = 0b0;
+let Inst{31-21} = 0b01000100000;
+let isPredicated = 1;
+let isPredicatedFalse = 1;
+let addrMode = BaseImmOffset;
+let accessSize = ByteAccess;
+let mayStore = 1;
+let CextOpcode = "S2_storerb";
+let InputType = "imm";
+let BaseOpcode = "S2_storerb_io";
+let isNVStorable = 1;
+let isExtendable = 1;
+let opExtendable = 2;
+let isExtentSigned = 0;
+let opExtentBits = 6;
+let opExtentAlign = 0;
+}
+def S2_pstorerbf_pi : HInst<
+(outs IntRegs:$Rx32),
+(ins PredRegs:$Pv4, IntRegs:$Rx32in, s4_0Imm:$Ii, IntRegs:$Rt32),
+"if (!$Pv4) memb($Rx32++#$Ii) = $Rt32",
+ST_tc_st_pi_SLOT01, TypeST>, Enc_8065534, AddrModeRel {
+let Inst{2-2} = 0b1;
+let Inst{7-7} = 0b0;
+let Inst{13-13} = 0b1;
+let Inst{31-21} = 0b10101011000;
+let isPredicated = 1;
+let isPredicatedFalse = 1;
+let addrMode = PostInc;
+let accessSize = ByteAccess;
+let mayStore = 1;
+let BaseOpcode = "S2_storerb_pi";
+let isNVStorable = 1;
+let Constraints = "$Rx32 = $Rx32in";
+}
+def S2_pstorerbf_zomap : HInst<
+(outs),
+(ins PredRegs:$Pv4, IntRegs:$Rs32, IntRegs:$Rt32),
+"if (!$Pv4) memb($Rs32) = $Rt32",
+PSEUDO, TypeMAPPING> {
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+}
+def S2_pstorerbfnew_pi : HInst<
+(outs IntRegs:$Rx32),
+(ins PredRegs:$Pv4, IntRegs:$Rx32in, s4_0Imm:$Ii, IntRegs:$Rt32),
+"if (!$Pv4.new) memb($Rx32++#$Ii) = $Rt32",
+ST_tc_st_pi_SLOT01, TypeST>, Enc_8065534, AddrModeRel {
+let Inst{2-2} = 0b1;
+let Inst{7-7} = 0b1;
+let Inst{13-13} = 0b1;
+let Inst{31-21} = 0b10101011000;
+let isPredicated = 1;
+let isPredicatedFalse = 1;
+let addrMode = PostInc;
+let accessSize = ByteAccess;
+let isPredicatedNew = 1;
+let mayStore = 1;
+let BaseOpcode = "S2_storerb_pi";
+let isNVStorable = 1;
+let Constraints = "$Rx32 = $Rx32in";
+}
+def S2_pstorerbnewf_io : HInst<
+(outs),
+(ins PredRegs:$Pv4, IntRegs:$Rs32, u32_0Imm:$Ii, IntRegs:$Nt8),
+"if (!$Pv4) memb($Rs32+#$Ii) = $Nt8.new",
+V2LDST_tc_st_SLOT0, TypeV2LDST>, Enc_1737833, AddrModeRel {
+let Inst{2-2} = 0b0;
+let Inst{12-11} = 0b00;
+let Inst{31-21} = 0b01000100101;
+let isPredicated = 1;
+let isPredicatedFalse = 1;
+let addrMode = BaseImmOffset;
+let accessSize = ByteAccess;
+let isNVStore = 1;
+let mayStore = 1;
+let isNewValue = 1;
+let CextOpcode = "S2_storerb";
+let InputType = "imm";
+let BaseOpcode = "S2_storerb_io";
+let isExtendable = 1;
+let opExtendable = 2;
+let isExtentSigned = 0;
+let opExtentBits = 6;
+let opExtentAlign = 0;
+let opNewValue = 3;
+}
+def S2_pstorerbnewf_pi : HInst<
+(outs IntRegs:$Rx32),
+(ins PredRegs:$Pv4, IntRegs:$Rx32in, s4_0Imm:$Ii, IntRegs:$Nt8),
+"if (!$Pv4) memb($Rx32++#$Ii) = $Nt8.new",
+ST_tc_st_pi_SLOT0, TypeST>, Enc_2813446, AddrModeRel {
+let Inst{2-2} = 0b1;
+let Inst{7-7} = 0b0;
+let Inst{13-11} = 0b100;
+let Inst{31-21} = 0b10101011101;
+let isPredicated = 1;
+let isPredicatedFalse = 1;
+let addrMode = PostInc;
+let accessSize = ByteAccess;
+let isNVStore = 1;
+let mayStore = 1;
+let isNewValue = 1;
+let CextOpcode = "S2_storerb";
+let BaseOpcode = "S2_storerb_pi";
+let opNewValue = 4;
+let Constraints = "$Rx32 = $Rx32in";
+}
+def S2_pstorerbnewf_zomap : HInst<
+(outs),
+(ins PredRegs:$Pv4, IntRegs:$Rs32, IntRegs:$Nt8),
+"if (!$Pv4) memb($Rs32) = $Nt8.new",
+PSEUDO, TypeMAPPING> {
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let opNewValue = 2;
+}
+def S2_pstorerbnewfnew_pi : HInst<
+(outs IntRegs:$Rx32),
+(ins PredRegs:$Pv4, IntRegs:$Rx32in, s4_0Imm:$Ii, IntRegs:$Nt8),
+"if (!$Pv4.new) memb($Rx32++#$Ii) = $Nt8.new",
+ST_tc_st_pi_SLOT0, TypeST>, Enc_2813446, AddrModeRel {
+let Inst{2-2} = 0b1;
+let Inst{7-7} = 0b1;
+let Inst{13-11} = 0b100;
+let Inst{31-21} = 0b10101011101;
+let isPredicated = 1;
+let isPredicatedFalse = 1;
+let addrMode = PostInc;
+let accessSize = ByteAccess;
+let isNVStore = 1;
+let isPredicatedNew = 1;
+let mayStore = 1;
+let isNewValue = 1;
+let CextOpcode = "S2_storerb";
+let BaseOpcode = "S2_storerb_pi";
+let opNewValue = 4;
+let Constraints = "$Rx32 = $Rx32in";
+}
+def S2_pstorerbnewt_io : HInst<
+(outs),
+(ins PredRegs:$Pv4, IntRegs:$Rs32, u32_0Imm:$Ii, IntRegs:$Nt8),
+"if ($Pv4) memb($Rs32+#$Ii) = $Nt8.new",
+V2LDST_tc_st_SLOT0, TypeV2LDST>, Enc_1737833, AddrModeRel {
+let Inst{2-2} = 0b0;
+let Inst{12-11} = 0b00;
+let Inst{31-21} = 0b01000000101;
+let isPredicated = 1;
+let addrMode = BaseImmOffset;
+let accessSize = ByteAccess;
+let isNVStore = 1;
+let mayStore = 1;
+let isNewValue = 1;
+let CextOpcode = "S2_storerb";
+let InputType = "imm";
+let BaseOpcode = "S2_storerb_io";
+let isExtendable = 1;
+let opExtendable = 2;
+let isExtentSigned = 0;
+let opExtentBits = 6;
+let opExtentAlign = 0;
+let opNewValue = 3;
+}
+def S2_pstorerbnewt_pi : HInst<
+(outs IntRegs:$Rx32),
+(ins PredRegs:$Pv4, IntRegs:$Rx32in, s4_0Imm:$Ii, IntRegs:$Nt8),
+"if ($Pv4) memb($Rx32++#$Ii) = $Nt8.new",
+ST_tc_st_pi_SLOT0, TypeST>, Enc_2813446, AddrModeRel {
+let Inst{2-2} = 0b0;
+let Inst{7-7} = 0b0;
+let Inst{13-11} = 0b100;
+let Inst{31-21} = 0b10101011101;
+let isPredicated = 1;
+let addrMode = PostInc;
+let accessSize = ByteAccess;
+let isNVStore = 1;
+let mayStore = 1;
+let isNewValue = 1;
+let CextOpcode = "S2_storerb";
+let BaseOpcode = "S2_storerb_pi";
+let opNewValue = 4;
+let Constraints = "$Rx32 = $Rx32in";
+}
+def S2_pstorerbnewt_zomap : HInst<
+(outs),
+(ins PredRegs:$Pv4, IntRegs:$Rs32, IntRegs:$Nt8),
+"if ($Pv4) memb($Rs32) = $Nt8.new",
+PSEUDO, TypeMAPPING> {
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let opNewValue = 2;
+}
+def S2_pstorerbnewtnew_pi : HInst<
+(outs IntRegs:$Rx32),
+(ins PredRegs:$Pv4, IntRegs:$Rx32in, s4_0Imm:$Ii, IntRegs:$Nt8),
+"if ($Pv4.new) memb($Rx32++#$Ii) = $Nt8.new",
+ST_tc_st_pi_SLOT0, TypeST>, Enc_2813446, AddrModeRel {
+let Inst{2-2} = 0b0;
+let Inst{7-7} = 0b1;
+let Inst{13-11} = 0b100;
+let Inst{31-21} = 0b10101011101;
+let isPredicated = 1;
+let addrMode = PostInc;
+let accessSize = ByteAccess;
+let isNVStore = 1;
+let isPredicatedNew = 1;
+let mayStore = 1;
+let isNewValue = 1;
+let CextOpcode = "S2_storerb";
+let BaseOpcode = "S2_storerb_pi";
+let opNewValue = 4;
+let Constraints = "$Rx32 = $Rx32in";
+}
+def S2_pstorerbt_io : HInst<
+(outs),
+(ins PredRegs:$Pv4, IntRegs:$Rs32, u32_0Imm:$Ii, IntRegs:$Rt32),
+"if ($Pv4) memb($Rs32+#$Ii) = $Rt32",
+V2LDST_tc_st_SLOT01, TypeV2LDST>, Enc_14044877, AddrModeRel {
+let Inst{2-2} = 0b0;
+let Inst{31-21} = 0b01000000000;
+let isPredicated = 1;
+let addrMode = BaseImmOffset;
+let accessSize = ByteAccess;
+let mayStore = 1;
+let CextOpcode = "S2_storerb";
+let InputType = "imm";
+let BaseOpcode = "S2_storerb_io";
+let isNVStorable = 1;
+let isExtendable = 1;
+let opExtendable = 2;
+let isExtentSigned = 0;
+let opExtentBits = 6;
+let opExtentAlign = 0;
+}
+def S2_pstorerbt_pi : HInst<
+(outs IntRegs:$Rx32),
+(ins PredRegs:$Pv4, IntRegs:$Rx32in, s4_0Imm:$Ii, IntRegs:$Rt32),
+"if ($Pv4) memb($Rx32++#$Ii) = $Rt32",
+ST_tc_st_pi_SLOT01, TypeST>, Enc_8065534, AddrModeRel {
+let Inst{2-2} = 0b0;
+let Inst{7-7} = 0b0;
+let Inst{13-13} = 0b1;
+let Inst{31-21} = 0b10101011000;
+let isPredicated = 1;
+let addrMode = PostInc;
+let accessSize = ByteAccess;
+let mayStore = 1;
+let BaseOpcode = "S2_storerb_pi";
+let isNVStorable = 1;
+let Constraints = "$Rx32 = $Rx32in";
+}
+def S2_pstorerbt_zomap : HInst<
+(outs),
+(ins PredRegs:$Pv4, IntRegs:$Rs32, IntRegs:$Rt32),
+"if ($Pv4) memb($Rs32) = $Rt32",
+PSEUDO, TypeMAPPING> {
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+}
+def S2_pstorerbtnew_pi : HInst<
+(outs IntRegs:$Rx32),
+(ins PredRegs:$Pv4, IntRegs:$Rx32in, s4_0Imm:$Ii, IntRegs:$Rt32),
+"if ($Pv4.new) memb($Rx32++#$Ii) = $Rt32",
+ST_tc_st_pi_SLOT01, TypeST>, Enc_8065534, AddrModeRel {
+let Inst{2-2} = 0b0;
+let Inst{7-7} = 0b1;
+let Inst{13-13} = 0b1;
+let Inst{31-21} = 0b10101011000;
+let isPredicated = 1;
+let addrMode = PostInc;
+let accessSize = ByteAccess;
+let isPredicatedNew = 1;
+let mayStore = 1;
+let BaseOpcode = "S2_storerb_pi";
+let isNVStorable = 1;
+let Constraints = "$Rx32 = $Rx32in";
+}
+def S2_pstorerdf_io : HInst<
+(outs),
+(ins PredRegs:$Pv4, IntRegs:$Rs32, u29_3Imm:$Ii, DoubleRegs:$Rtt32),
+"if (!$Pv4) memd($Rs32+#$Ii) = $Rtt32",
+V2LDST_tc_st_SLOT01, TypeV2LDST>, Enc_11049656, AddrModeRel {
+let Inst{2-2} = 0b0;
+let Inst{31-21} = 0b01000100110;
+let isPredicated = 1;
+let isPredicatedFalse = 1;
+let addrMode = BaseImmOffset;
+let accessSize = DoubleWordAccess;
+let mayStore = 1;
+let CextOpcode = "S2_storerd";
+let InputType = "imm";
+let BaseOpcode = "S2_storerd_io";
+let isExtendable = 1;
+let opExtendable = 2;
+let isExtentSigned = 0;
+let opExtentBits = 9;
+let opExtentAlign = 3;
+}
+def S2_pstorerdf_pi : HInst<
+(outs IntRegs:$Rx32),
+(ins PredRegs:$Pv4, IntRegs:$Rx32in, s4_3Imm:$Ii, DoubleRegs:$Rtt32),
+"if (!$Pv4) memd($Rx32++#$Ii) = $Rtt32",
+ST_tc_st_pi_SLOT01, TypeST>, Enc_11959851, AddrModeRel {
+let Inst{2-2} = 0b1;
+let Inst{7-7} = 0b0;
+let Inst{13-13} = 0b1;
+let Inst{31-21} = 0b10101011110;
+let isPredicated = 1;
+let isPredicatedFalse = 1;
+let addrMode = PostInc;
+let accessSize = DoubleWordAccess;
+let mayStore = 1;
+let CextOpcode = "S2_storerd";
+let BaseOpcode = "S2_storerd_pi";
+let Constraints = "$Rx32 = $Rx32in";
+}
+def S2_pstorerdf_zomap : HInst<
+(outs),
+(ins PredRegs:$Pv4, IntRegs:$Rs32, DoubleRegs:$Rtt32),
+"if (!$Pv4) memd($Rs32) = $Rtt32",
+PSEUDO, TypeMAPPING> {
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+}
+def S2_pstorerdfnew_pi : HInst<
+(outs IntRegs:$Rx32),
+(ins PredRegs:$Pv4, IntRegs:$Rx32in, s4_3Imm:$Ii, DoubleRegs:$Rtt32),
+"if (!$Pv4.new) memd($Rx32++#$Ii) = $Rtt32",
+ST_tc_st_pi_SLOT01, TypeST>, Enc_11959851, AddrModeRel {
+let Inst{2-2} = 0b1;
+let Inst{7-7} = 0b1;
+let Inst{13-13} = 0b1;
+let Inst{31-21} = 0b10101011110;
+let isPredicated = 1;
+let isPredicatedFalse = 1;
+let addrMode = PostInc;
+let accessSize = DoubleWordAccess;
+let isPredicatedNew = 1;
+let mayStore = 1;
+let CextOpcode = "S2_storerd";
+let BaseOpcode = "S2_storerd_pi";
+let Constraints = "$Rx32 = $Rx32in";
+}
+def S2_pstorerdt_io : HInst<
+(outs),
+(ins PredRegs:$Pv4, IntRegs:$Rs32, u29_3Imm:$Ii, DoubleRegs:$Rtt32),
+"if ($Pv4) memd($Rs32+#$Ii) = $Rtt32",
+V2LDST_tc_st_SLOT01, TypeV2LDST>, Enc_11049656, AddrModeRel {
+let Inst{2-2} = 0b0;
+let Inst{31-21} = 0b01000000110;
+let isPredicated = 1;
+let addrMode = BaseImmOffset;
+let accessSize = DoubleWordAccess;
+let mayStore = 1;
+let CextOpcode = "S2_storerd";
+let InputType = "imm";
+let BaseOpcode = "S2_storerd_io";
+let isExtendable = 1;
+let opExtendable = 2;
+let isExtentSigned = 0;
+let opExtentBits = 9;
+let opExtentAlign = 3;
+}
+def S2_pstorerdt_pi : HInst<
+(outs IntRegs:$Rx32),
+(ins PredRegs:$Pv4, IntRegs:$Rx32in, s4_3Imm:$Ii, DoubleRegs:$Rtt32),
+"if ($Pv4) memd($Rx32++#$Ii) = $Rtt32",
+ST_tc_st_pi_SLOT01, TypeST>, Enc_11959851, AddrModeRel {
+let Inst{2-2} = 0b0;
+let Inst{7-7} = 0b0;
+let Inst{13-13} = 0b1;
+let Inst{31-21} = 0b10101011110;
+let isPredicated = 1;
+let addrMode = PostInc;
+let accessSize = DoubleWordAccess;
+let mayStore = 1;
+let CextOpcode = "S2_storerd";
+let BaseOpcode = "S2_storerd_pi";
+let Constraints = "$Rx32 = $Rx32in";
+}
+def S2_pstorerdt_zomap : HInst<
+(outs),
+(ins PredRegs:$Pv4, IntRegs:$Rs32, DoubleRegs:$Rtt32),
+"if ($Pv4) memd($Rs32) = $Rtt32",
+PSEUDO, TypeMAPPING> {
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+}
+def S2_pstorerdtnew_pi : HInst<
+(outs IntRegs:$Rx32),
+(ins PredRegs:$Pv4, IntRegs:$Rx32in, s4_3Imm:$Ii, DoubleRegs:$Rtt32),
+"if ($Pv4.new) memd($Rx32++#$Ii) = $Rtt32",
+ST_tc_st_pi_SLOT01, TypeST>, Enc_11959851, AddrModeRel {
+let Inst{2-2} = 0b0;
+let Inst{7-7} = 0b1;
+let Inst{13-13} = 0b1;
+let Inst{31-21} = 0b10101011110;
+let isPredicated = 1;
+let addrMode = PostInc;
+let accessSize = DoubleWordAccess;
+let isPredicatedNew = 1;
+let mayStore = 1;
+let CextOpcode = "S2_storerd";
+let BaseOpcode = "S2_storerd_pi";
+let Constraints = "$Rx32 = $Rx32in";
+}
+def S2_pstorerff_io : HInst<
+(outs),
+(ins PredRegs:$Pv4, IntRegs:$Rs32, u31_1Imm:$Ii, IntRegs:$Rt32),
+"if (!$Pv4) memh($Rs32+#$Ii) = $Rt32.h",
+V2LDST_tc_st_SLOT01, TypeV2LDST>, Enc_10979813, AddrModeRel {
+let Inst{2-2} = 0b0;
+let Inst{31-21} = 0b01000100011;
+let isPredicated = 1;
+let isPredicatedFalse = 1;
+let addrMode = BaseImmOffset;
+let accessSize = HalfWordAccess;
+let mayStore = 1;
+let CextOpcode = "S2_storerf";
+let InputType = "imm";
+let BaseOpcode = "S2_storerf_io";
+let isExtendable = 1;
+let opExtendable = 2;
+let isExtentSigned = 0;
+let opExtentBits = 7;
+let opExtentAlign = 1;
+}
+def S2_pstorerff_pi : HInst<
+(outs IntRegs:$Rx32),
+(ins PredRegs:$Pv4, IntRegs:$Rx32in, s4_1Imm:$Ii, IntRegs:$Rt32),
+"if (!$Pv4) memh($Rx32++#$Ii) = $Rt32.h",
+ST_tc_st_pi_SLOT01, TypeST>, Enc_11065510, AddrModeRel {
+let Inst{2-2} = 0b1;
+let Inst{7-7} = 0b0;
+let Inst{13-13} = 0b1;
+let Inst{31-21} = 0b10101011011;
+let isPredicated = 1;
+let isPredicatedFalse = 1;
+let addrMode = PostInc;
+let accessSize = HalfWordAccess;
+let mayStore = 1;
+let CextOpcode = "S2_storerf";
+let BaseOpcode = "S2_storerf_pi";
+let Constraints = "$Rx32 = $Rx32in";
+}
+def S2_pstorerff_zomap : HInst<
+(outs),
+(ins PredRegs:$Pv4, IntRegs:$Rs32, IntRegs:$Rt32),
+"if (!$Pv4) memh($Rs32) = $Rt32.h",
+PSEUDO, TypeMAPPING> {
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+}
+def S2_pstorerffnew_pi : HInst<
+(outs IntRegs:$Rx32),
+(ins PredRegs:$Pv4, IntRegs:$Rx32in, s4_1Imm:$Ii, IntRegs:$Rt32),
+"if (!$Pv4.new) memh($Rx32++#$Ii) = $Rt32.h",
+ST_tc_st_pi_SLOT01, TypeST>, Enc_11065510, AddrModeRel {
+let Inst{2-2} = 0b1;
+let Inst{7-7} = 0b1;
+let Inst{13-13} = 0b1;
+let Inst{31-21} = 0b10101011011;
+let isPredicated = 1;
+let isPredicatedFalse = 1;
+let addrMode = PostInc;
+let accessSize = HalfWordAccess;
+let isPredicatedNew = 1;
+let mayStore = 1;
+let CextOpcode = "S2_storerf";
+let BaseOpcode = "S2_storerf_pi";
+let Constraints = "$Rx32 = $Rx32in";
+}
+def S2_pstorerft_io : HInst<
+(outs),
+(ins PredRegs:$Pv4, IntRegs:$Rs32, u31_1Imm:$Ii, IntRegs:$Rt32),
+"if ($Pv4) memh($Rs32+#$Ii) = $Rt32.h",
+V2LDST_tc_st_SLOT01, TypeV2LDST>, Enc_10979813, AddrModeRel {
+let Inst{2-2} = 0b0;
+let Inst{31-21} = 0b01000000011;
+let isPredicated = 1;
+let addrMode = BaseImmOffset;
+let accessSize = HalfWordAccess;
+let mayStore = 1;
+let CextOpcode = "S2_storerf";
+let InputType = "imm";
+let BaseOpcode = "S2_storerf_io";
+let isExtendable = 1;
+let opExtendable = 2;
+let isExtentSigned = 0;
+let opExtentBits = 7;
+let opExtentAlign = 1;
+}
+def S2_pstorerft_pi : HInst<
+(outs IntRegs:$Rx32),
+(ins PredRegs:$Pv4, IntRegs:$Rx32in, s4_1Imm:$Ii, IntRegs:$Rt32),
+"if ($Pv4) memh($Rx32++#$Ii) = $Rt32.h",
+ST_tc_st_pi_SLOT01, TypeST>, Enc_11065510, AddrModeRel {
+let Inst{2-2} = 0b0;
+let Inst{7-7} = 0b0;
+let Inst{13-13} = 0b1;
+let Inst{31-21} = 0b10101011011;
+let isPredicated = 1;
+let addrMode = PostInc;
+let accessSize = HalfWordAccess;
+let mayStore = 1;
+let CextOpcode = "S2_storerf";
+let BaseOpcode = "S2_storerf_pi";
+let Constraints = "$Rx32 = $Rx32in";
+}
+def S2_pstorerft_zomap : HInst<
+(outs),
+(ins PredRegs:$Pv4, IntRegs:$Rs32, IntRegs:$Rt32),
+"if ($Pv4) memh($Rs32) = $Rt32.h",
+PSEUDO, TypeMAPPING> {
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+}
+def S2_pstorerftnew_pi : HInst<
+(outs IntRegs:$Rx32),
+(ins PredRegs:$Pv4, IntRegs:$Rx32in, s4_1Imm:$Ii, IntRegs:$Rt32),
+"if ($Pv4.new) memh($Rx32++#$Ii) = $Rt32.h",
+ST_tc_st_pi_SLOT01, TypeST>, Enc_11065510, AddrModeRel {
+let Inst{2-2} = 0b0;
+let Inst{7-7} = 0b1;
+let Inst{13-13} = 0b1;
+let Inst{31-21} = 0b10101011011;
+let isPredicated = 1;
+let addrMode = PostInc;
+let accessSize = HalfWordAccess;
+let isPredicatedNew = 1;
+let mayStore = 1;
+let CextOpcode = "S2_storerf";
+let BaseOpcode = "S2_storerf_pi";
+let Constraints = "$Rx32 = $Rx32in";
+}
+def S2_pstorerhf_io : HInst<
+(outs),
+(ins PredRegs:$Pv4, IntRegs:$Rs32, u31_1Imm:$Ii, IntRegs:$Rt32),
+"if (!$Pv4) memh($Rs32+#$Ii) = $Rt32",
+V2LDST_tc_st_SLOT01, TypeV2LDST>, Enc_10979813, AddrModeRel {
+let Inst{2-2} = 0b0;
+let Inst{31-21} = 0b01000100010;
+let isPredicated = 1;
+let isPredicatedFalse = 1;
+let addrMode = BaseImmOffset;
+let accessSize = HalfWordAccess;
+let mayStore = 1;
+let CextOpcode = "S2_storerh";
+let InputType = "imm";
+let BaseOpcode = "S2_storerh_io";
+let isNVStorable = 1;
+let isExtendable = 1;
+let opExtendable = 2;
+let isExtentSigned = 0;
+let opExtentBits = 7;
+let opExtentAlign = 1;
+}
+def S2_pstorerhf_pi : HInst<
+(outs IntRegs:$Rx32),
+(ins PredRegs:$Pv4, IntRegs:$Rx32in, s4_1Imm:$Ii, IntRegs:$Rt32),
+"if (!$Pv4) memh($Rx32++#$Ii) = $Rt32",
+ST_tc_st_pi_SLOT01, TypeST>, Enc_11065510, AddrModeRel {
+let Inst{2-2} = 0b1;
+let Inst{7-7} = 0b0;
+let Inst{13-13} = 0b1;
+let Inst{31-21} = 0b10101011010;
+let isPredicated = 1;
+let isPredicatedFalse = 1;
+let addrMode = PostInc;
+let accessSize = HalfWordAccess;
+let mayStore = 1;
+let BaseOpcode = "S2_storerh_pi";
+let isNVStorable = 1;
+let Constraints = "$Rx32 = $Rx32in";
+}
+def S2_pstorerhf_zomap : HInst<
+(outs),
+(ins PredRegs:$Pv4, IntRegs:$Rs32, IntRegs:$Rt32),
+"if (!$Pv4) memh($Rs32) = $Rt32",
+PSEUDO, TypeMAPPING> {
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+}
+def S2_pstorerhfnew_pi : HInst<
+(outs IntRegs:$Rx32),
+(ins PredRegs:$Pv4, IntRegs:$Rx32in, s4_1Imm:$Ii, IntRegs:$Rt32),
+"if (!$Pv4.new) memh($Rx32++#$Ii) = $Rt32",
+ST_tc_st_pi_SLOT01, TypeST>, Enc_11065510, AddrModeRel {
+let Inst{2-2} = 0b1;
+let Inst{7-7} = 0b1;
+let Inst{13-13} = 0b1;
+let Inst{31-21} = 0b10101011010;
+let isPredicated = 1;
+let isPredicatedFalse = 1;
+let addrMode = PostInc;
+let accessSize = HalfWordAccess;
+let isPredicatedNew = 1;
+let mayStore = 1;
+let BaseOpcode = "S2_storerh_pi";
+let isNVStorable = 1;
+let Constraints = "$Rx32 = $Rx32in";
+}
+def S2_pstorerhnewf_io : HInst<
+(outs),
+(ins PredRegs:$Pv4, IntRegs:$Rs32, u31_1Imm:$Ii, IntRegs:$Nt8),
+"if (!$Pv4) memh($Rs32+#$Ii) = $Nt8.new",
+V2LDST_tc_st_SLOT0, TypeV2LDST>, Enc_6154421, AddrModeRel {
+let Inst{2-2} = 0b0;
+let Inst{12-11} = 0b01;
+let Inst{31-21} = 0b01000100101;
+let isPredicated = 1;
+let isPredicatedFalse = 1;
+let addrMode = BaseImmOffset;
+let accessSize = HalfWordAccess;
+let isNVStore = 1;
+let mayStore = 1;
+let isNewValue = 1;
+let CextOpcode = "S2_storerh";
+let InputType = "imm";
+let BaseOpcode = "S2_storerh_io";
+let isExtendable = 1;
+let opExtendable = 2;
+let isExtentSigned = 0;
+let opExtentBits = 7;
+let opExtentAlign = 1;
+let opNewValue = 3;
+}
+def S2_pstorerhnewf_pi : HInst<
+(outs IntRegs:$Rx32),
+(ins PredRegs:$Pv4, IntRegs:$Rx32in, s4_1Imm:$Ii, IntRegs:$Nt8),
+"if (!$Pv4) memh($Rx32++#$Ii) = $Nt8.new",
+ST_tc_st_pi_SLOT0, TypeST>, Enc_3813442, AddrModeRel {
+let Inst{2-2} = 0b1;
+let Inst{7-7} = 0b0;
+let Inst{13-11} = 0b101;
+let Inst{31-21} = 0b10101011101;
+let isPredicated = 1;
+let isPredicatedFalse = 1;
+let addrMode = PostInc;
+let accessSize = HalfWordAccess;
+let isNVStore = 1;
+let mayStore = 1;
+let isNewValue = 1;
+let CextOpcode = "S2_storerh";
+let BaseOpcode = "S2_storerh_pi";
+let opNewValue = 4;
+let Constraints = "$Rx32 = $Rx32in";
+}
+def S2_pstorerhnewf_zomap : HInst<
+(outs),
+(ins PredRegs:$Pv4, IntRegs:$Rs32, IntRegs:$Nt8),
+"if (!$Pv4) memh($Rs32) = $Nt8.new",
+PSEUDO, TypeMAPPING> {
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let opNewValue = 2;
+}
+def S2_pstorerhnewfnew_pi : HInst<
+(outs IntRegs:$Rx32),
+(ins PredRegs:$Pv4, IntRegs:$Rx32in, s4_1Imm:$Ii, IntRegs:$Nt8),
+"if (!$Pv4.new) memh($Rx32++#$Ii) = $Nt8.new",
+ST_tc_st_pi_SLOT0, TypeST>, Enc_3813442, AddrModeRel {
+let Inst{2-2} = 0b1;
+let Inst{7-7} = 0b1;
+let Inst{13-11} = 0b101;
+let Inst{31-21} = 0b10101011101;
+let isPredicated = 1;
+let isPredicatedFalse = 1;
+let addrMode = PostInc;
+let accessSize = HalfWordAccess;
+let isNVStore = 1;
+let isPredicatedNew = 1;
+let mayStore = 1;
+let isNewValue = 1;
+let CextOpcode = "S2_storerh";
+let BaseOpcode = "S2_storerh_pi";
+let opNewValue = 4;
+let Constraints = "$Rx32 = $Rx32in";
+}
+def S2_pstorerhnewt_io : HInst<
+(outs),
+(ins PredRegs:$Pv4, IntRegs:$Rs32, u31_1Imm:$Ii, IntRegs:$Nt8),
+"if ($Pv4) memh($Rs32+#$Ii) = $Nt8.new",
+V2LDST_tc_st_SLOT0, TypeV2LDST>, Enc_6154421, AddrModeRel {
+let Inst{2-2} = 0b0;
+let Inst{12-11} = 0b01;
+let Inst{31-21} = 0b01000000101;
+let isPredicated = 1;
+let addrMode = BaseImmOffset;
+let accessSize = HalfWordAccess;
+let isNVStore = 1;
+let mayStore = 1;
+let isNewValue = 1;
+let CextOpcode = "S2_storerh";
+let InputType = "imm";
+let BaseOpcode = "S2_storerh_io";
+let isExtendable = 1;
+let opExtendable = 2;
+let isExtentSigned = 0;
+let opExtentBits = 7;
+let opExtentAlign = 1;
+let opNewValue = 3;
+}
+def S2_pstorerhnewt_pi : HInst<
+(outs IntRegs:$Rx32),
+(ins PredRegs:$Pv4, IntRegs:$Rx32in, s4_1Imm:$Ii, IntRegs:$Nt8),
+"if ($Pv4) memh($Rx32++#$Ii) = $Nt8.new",
+ST_tc_st_pi_SLOT0, TypeST>, Enc_3813442, AddrModeRel {
+let Inst{2-2} = 0b0;
+let Inst{7-7} = 0b0;
+let Inst{13-11} = 0b101;
+let Inst{31-21} = 0b10101011101;
+let isPredicated = 1;
+let addrMode = PostInc;
+let accessSize = HalfWordAccess;
+let isNVStore = 1;
+let mayStore = 1;
+let isNewValue = 1;
+let CextOpcode = "S2_storerh";
+let BaseOpcode = "S2_storerh_pi";
+let opNewValue = 4;
+let Constraints = "$Rx32 = $Rx32in";
+}
+def S2_pstorerhnewt_zomap : HInst<
+(outs),
+(ins PredRegs:$Pv4, IntRegs:$Rs32, IntRegs:$Nt8),
+"if ($Pv4) memh($Rs32) = $Nt8.new",
+PSEUDO, TypeMAPPING> {
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let opNewValue = 2;
+}
+def S2_pstorerhnewtnew_pi : HInst<
+(outs IntRegs:$Rx32),
+(ins PredRegs:$Pv4, IntRegs:$Rx32in, s4_1Imm:$Ii, IntRegs:$Nt8),
+"if ($Pv4.new) memh($Rx32++#$Ii) = $Nt8.new",
+ST_tc_st_pi_SLOT0, TypeST>, Enc_3813442, AddrModeRel {
+let Inst{2-2} = 0b0;
+let Inst{7-7} = 0b1;
+let Inst{13-11} = 0b101;
+let Inst{31-21} = 0b10101011101;
+let isPredicated = 1;
+let addrMode = PostInc;
+let accessSize = HalfWordAccess;
+let isNVStore = 1;
+let isPredicatedNew = 1;
+let mayStore = 1;
+let isNewValue = 1;
+let CextOpcode = "S2_storerh";
+let BaseOpcode = "S2_storerh_pi";
+let opNewValue = 4;
+let Constraints = "$Rx32 = $Rx32in";
+}
+def S2_pstorerht_io : HInst<
+(outs),
+(ins PredRegs:$Pv4, IntRegs:$Rs32, u31_1Imm:$Ii, IntRegs:$Rt32),
+"if ($Pv4) memh($Rs32+#$Ii) = $Rt32",
+V2LDST_tc_st_SLOT01, TypeV2LDST>, Enc_10979813, AddrModeRel {
+let Inst{2-2} = 0b0;
+let Inst{31-21} = 0b01000000010;
+let isPredicated = 1;
+let addrMode = BaseImmOffset;
+let accessSize = HalfWordAccess;
+let mayStore = 1;
+let CextOpcode = "S2_storerh";
+let InputType = "imm";
+let BaseOpcode = "S2_storerh_io";
+let isNVStorable = 1;
+let isExtendable = 1;
+let opExtendable = 2;
+let isExtentSigned = 0;
+let opExtentBits = 7;
+let opExtentAlign = 1;
+}
+def S2_pstorerht_pi : HInst<
+(outs IntRegs:$Rx32),
+(ins PredRegs:$Pv4, IntRegs:$Rx32in, s4_1Imm:$Ii, IntRegs:$Rt32),
+"if ($Pv4) memh($Rx32++#$Ii) = $Rt32",
+ST_tc_st_pi_SLOT01, TypeST>, Enc_11065510, AddrModeRel {
+let Inst{2-2} = 0b0;
+let Inst{7-7} = 0b0;
+let Inst{13-13} = 0b1;
+let Inst{31-21} = 0b10101011010;
+let isPredicated = 1;
+let addrMode = PostInc;
+let accessSize = HalfWordAccess;
+let mayStore = 1;
+let BaseOpcode = "S2_storerh_pi";
+let isNVStorable = 1;
+let Constraints = "$Rx32 = $Rx32in";
+}
+def S2_pstorerht_zomap : HInst<
+(outs),
+(ins PredRegs:$Pv4, IntRegs:$Rs32, IntRegs:$Rt32),
+"if ($Pv4) memh($Rs32) = $Rt32",
+PSEUDO, TypeMAPPING> {
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+}
+def S2_pstorerhtnew_pi : HInst<
+(outs IntRegs:$Rx32),
+(ins PredRegs:$Pv4, IntRegs:$Rx32in, s4_1Imm:$Ii, IntRegs:$Rt32),
+"if ($Pv4.new) memh($Rx32++#$Ii) = $Rt32",
+ST_tc_st_pi_SLOT01, TypeST>, Enc_11065510, AddrModeRel {
+let Inst{2-2} = 0b0;
+let Inst{7-7} = 0b1;
+let Inst{13-13} = 0b1;
+let Inst{31-21} = 0b10101011010;
+let isPredicated = 1;
+let addrMode = PostInc;
+let accessSize = HalfWordAccess;
+let isPredicatedNew = 1;
+let mayStore = 1;
+let BaseOpcode = "S2_storerh_pi";
+let isNVStorable = 1;
+let Constraints = "$Rx32 = $Rx32in";
+}
+def S2_pstorerif_io : HInst<
+(outs),
+(ins PredRegs:$Pv4, IntRegs:$Rs32, u30_2Imm:$Ii, IntRegs:$Rt32),
+"if (!$Pv4) memw($Rs32+#$Ii) = $Rt32",
+V2LDST_tc_st_SLOT01, TypeV2LDST>, Enc_8225953, AddrModeRel {
+let Inst{2-2} = 0b0;
+let Inst{31-21} = 0b01000100100;
+let isPredicated = 1;
+let isPredicatedFalse = 1;
+let addrMode = BaseImmOffset;
+let accessSize = WordAccess;
+let mayStore = 1;
+let CextOpcode = "S2_storeri";
+let InputType = "imm";
+let BaseOpcode = "S2_storeri_io";
+let isNVStorable = 1;
+let isExtendable = 1;
+let opExtendable = 2;
+let isExtentSigned = 0;
+let opExtentBits = 8;
+let opExtentAlign = 2;
+}
+def S2_pstorerif_pi : HInst<
+(outs IntRegs:$Rx32),
+(ins PredRegs:$Pv4, IntRegs:$Rx32in, s4_2Imm:$Ii, IntRegs:$Rt32),
+"if (!$Pv4) memw($Rx32++#$Ii) = $Rt32",
+ST_tc_st_pi_SLOT01, TypeST>, Enc_10065510, AddrModeRel {
+let Inst{2-2} = 0b1;
+let Inst{7-7} = 0b0;
+let Inst{13-13} = 0b1;
+let Inst{31-21} = 0b10101011100;
+let isPredicated = 1;
+let isPredicatedFalse = 1;
+let addrMode = PostInc;
+let accessSize = WordAccess;
+let mayStore = 1;
+let BaseOpcode = "S2_storeri_pi";
+let isNVStorable = 1;
+let Constraints = "$Rx32 = $Rx32in";
+}
+def S2_pstorerif_zomap : HInst<
+(outs),
+(ins PredRegs:$Pv4, IntRegs:$Rs32, IntRegs:$Rt32),
+"if (!$Pv4) memw($Rs32) = $Rt32",
+PSEUDO, TypeMAPPING> {
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+}
+def S2_pstorerifnew_pi : HInst<
+(outs IntRegs:$Rx32),
+(ins PredRegs:$Pv4, IntRegs:$Rx32in, s4_2Imm:$Ii, IntRegs:$Rt32),
+"if (!$Pv4.new) memw($Rx32++#$Ii) = $Rt32",
+ST_tc_st_pi_SLOT01, TypeST>, Enc_10065510, AddrModeRel {
+let Inst{2-2} = 0b1;
+let Inst{7-7} = 0b1;
+let Inst{13-13} = 0b1;
+let Inst{31-21} = 0b10101011100;
+let isPredicated = 1;
+let isPredicatedFalse = 1;
+let addrMode = PostInc;
+let accessSize = WordAccess;
+let isPredicatedNew = 1;
+let mayStore = 1;
+let CextOpcode = "S2_storeri";
+let BaseOpcode = "S2_storeri_pi";
+let isNVStorable = 1;
+let Constraints = "$Rx32 = $Rx32in";
+}
+def S2_pstorerinewf_io : HInst<
+(outs),
+(ins PredRegs:$Pv4, IntRegs:$Rs32, u30_2Imm:$Ii, IntRegs:$Nt8),
+"if (!$Pv4) memw($Rs32+#$Ii) = $Nt8.new",
+V2LDST_tc_st_SLOT0, TypeV2LDST>, Enc_11224149, AddrModeRel {
+let Inst{2-2} = 0b0;
+let Inst{12-11} = 0b10;
+let Inst{31-21} = 0b01000100101;
+let isPredicated = 1;
+let isPredicatedFalse = 1;
+let addrMode = BaseImmOffset;
+let accessSize = WordAccess;
+let isNVStore = 1;
+let mayStore = 1;
+let isNewValue = 1;
+let CextOpcode = "S2_storeri";
+let InputType = "imm";
+let BaseOpcode = "S2_storeri_io";
+let isExtendable = 1;
+let opExtendable = 2;
+let isExtentSigned = 0;
+let opExtentBits = 8;
+let opExtentAlign = 2;
+let opNewValue = 3;
+}
+def S2_pstorerinewf_pi : HInst<
+(outs IntRegs:$Rx32),
+(ins PredRegs:$Pv4, IntRegs:$Rx32in, s4_2Imm:$Ii, IntRegs:$Nt8),
+"if (!$Pv4) memw($Rx32++#$Ii) = $Nt8.new",
+ST_tc_st_pi_SLOT0, TypeST>, Enc_4813442, AddrModeRel {
+let Inst{2-2} = 0b1;
+let Inst{7-7} = 0b0;
+let Inst{13-11} = 0b110;
+let Inst{31-21} = 0b10101011101;
+let isPredicated = 1;
+let isPredicatedFalse = 1;
+let addrMode = PostInc;
+let accessSize = WordAccess;
+let isNVStore = 1;
+let mayStore = 1;
+let isNewValue = 1;
+let CextOpcode = "S2_storeri";
+let BaseOpcode = "S2_storeri_pi";
+let opNewValue = 4;
+let Constraints = "$Rx32 = $Rx32in";
+}
+def S2_pstorerinewf_zomap : HInst<
+(outs),
+(ins PredRegs:$Pv4, IntRegs:$Rs32, IntRegs:$Nt8),
+"if (!$Pv4) memw($Rs32) = $Nt8.new",
+PSEUDO, TypeMAPPING> {
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let opNewValue = 2;
+}
+def S2_pstorerinewfnew_pi : HInst<
+(outs IntRegs:$Rx32),
+(ins PredRegs:$Pv4, IntRegs:$Rx32in, s4_2Imm:$Ii, IntRegs:$Nt8),
+"if (!$Pv4.new) memw($Rx32++#$Ii) = $Nt8.new",
+ST_tc_st_pi_SLOT0, TypeST>, Enc_4813442, AddrModeRel {
+let Inst{2-2} = 0b1;
+let Inst{7-7} = 0b1;
+let Inst{13-11} = 0b110;
+let Inst{31-21} = 0b10101011101;
+let isPredicated = 1;
+let isPredicatedFalse = 1;
+let addrMode = PostInc;
+let accessSize = WordAccess;
+let isNVStore = 1;
+let isPredicatedNew = 1;
+let mayStore = 1;
+let isNewValue = 1;
+let CextOpcode = "S2_storeri";
+let BaseOpcode = "S2_storeri_pi";
+let opNewValue = 4;
+let Constraints = "$Rx32 = $Rx32in";
+}
+def S2_pstorerinewt_io : HInst<
+(outs),
+(ins PredRegs:$Pv4, IntRegs:$Rs32, u30_2Imm:$Ii, IntRegs:$Nt8),
+"if ($Pv4) memw($Rs32+#$Ii) = $Nt8.new",
+V2LDST_tc_st_SLOT0, TypeV2LDST>, Enc_11224149, AddrModeRel {
+let Inst{2-2} = 0b0;
+let Inst{12-11} = 0b10;
+let Inst{31-21} = 0b01000000101;
+let isPredicated = 1;
+let addrMode = BaseImmOffset;
+let accessSize = WordAccess;
+let isNVStore = 1;
+let mayStore = 1;
+let isNewValue = 1;
+let CextOpcode = "S2_storeri";
+let InputType = "imm";
+let BaseOpcode = "S2_storeri_io";
+let isExtendable = 1;
+let opExtendable = 2;
+let isExtentSigned = 0;
+let opExtentBits = 8;
+let opExtentAlign = 2;
+let opNewValue = 3;
+}
+def S2_pstorerinewt_pi : HInst<
+(outs IntRegs:$Rx32),
+(ins PredRegs:$Pv4, IntRegs:$Rx32in, s4_2Imm:$Ii, IntRegs:$Nt8),
+"if ($Pv4) memw($Rx32++#$Ii) = $Nt8.new",
+ST_tc_st_pi_SLOT0, TypeST>, Enc_4813442, AddrModeRel {
+let Inst{2-2} = 0b0;
+let Inst{7-7} = 0b0;
+let Inst{13-11} = 0b110;
+let Inst{31-21} = 0b10101011101;
+let isPredicated = 1;
+let addrMode = PostInc;
+let accessSize = WordAccess;
+let isNVStore = 1;
+let mayStore = 1;
+let isNewValue = 1;
+let CextOpcode = "S2_storeri";
+let BaseOpcode = "S2_storeri_pi";
+let opNewValue = 4;
+let Constraints = "$Rx32 = $Rx32in";
+}
+def S2_pstorerinewt_zomap : HInst<
+(outs),
+(ins PredRegs:$Pv4, IntRegs:$Rs32, IntRegs:$Nt8),
+"if ($Pv4) memw($Rs32) = $Nt8.new",
+PSEUDO, TypeMAPPING> {
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let opNewValue = 2;
+}
+def S2_pstorerinewtnew_pi : HInst<
+(outs IntRegs:$Rx32),
+(ins PredRegs:$Pv4, IntRegs:$Rx32in, s4_2Imm:$Ii, IntRegs:$Nt8),
+"if ($Pv4.new) memw($Rx32++#$Ii) = $Nt8.new",
+ST_tc_st_pi_SLOT0, TypeST>, Enc_4813442, AddrModeRel {
+let Inst{2-2} = 0b0;
+let Inst{7-7} = 0b1;
+let Inst{13-11} = 0b110;
+let Inst{31-21} = 0b10101011101;
+let isPredicated = 1;
+let addrMode = PostInc;
+let accessSize = WordAccess;
+let isNVStore = 1;
+let isPredicatedNew = 1;
+let mayStore = 1;
+let isNewValue = 1;
+let CextOpcode = "S2_storeri";
+let BaseOpcode = "S2_storeri_pi";
+let opNewValue = 4;
+let Constraints = "$Rx32 = $Rx32in";
+}
+def S2_pstorerit_io : HInst<
+(outs),
+(ins PredRegs:$Pv4, IntRegs:$Rs32, u30_2Imm:$Ii, IntRegs:$Rt32),
+"if ($Pv4) memw($Rs32+#$Ii) = $Rt32",
+V2LDST_tc_st_SLOT01, TypeV2LDST>, Enc_8225953, AddrModeRel {
+let Inst{2-2} = 0b0;
+let Inst{31-21} = 0b01000000100;
+let isPredicated = 1;
+let addrMode = BaseImmOffset;
+let accessSize = WordAccess;
+let mayStore = 1;
+let CextOpcode = "S2_storeri";
+let InputType = "imm";
+let BaseOpcode = "S2_storeri_io";
+let isNVStorable = 1;
+let isExtendable = 1;
+let opExtendable = 2;
+let isExtentSigned = 0;
+let opExtentBits = 8;
+let opExtentAlign = 2;
+}
+def S2_pstorerit_pi : HInst<
+(outs IntRegs:$Rx32),
+(ins PredRegs:$Pv4, IntRegs:$Rx32in, s4_2Imm:$Ii, IntRegs:$Rt32),
+"if ($Pv4) memw($Rx32++#$Ii) = $Rt32",
+ST_tc_st_pi_SLOT01, TypeST>, Enc_10065510, AddrModeRel {
+let Inst{2-2} = 0b0;
+let Inst{7-7} = 0b0;
+let Inst{13-13} = 0b1;
+let Inst{31-21} = 0b10101011100;
+let isPredicated = 1;
+let addrMode = PostInc;
+let accessSize = WordAccess;
+let mayStore = 1;
+let BaseOpcode = "S2_storeri_pi";
+let isNVStorable = 1;
+let Constraints = "$Rx32 = $Rx32in";
+}
+def S2_pstorerit_zomap : HInst<
+(outs),
+(ins PredRegs:$Pv4, IntRegs:$Rs32, IntRegs:$Rt32),
+"if ($Pv4) memw($Rs32) = $Rt32",
+PSEUDO, TypeMAPPING> {
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+}
+def S2_pstoreritnew_pi : HInst<
+(outs IntRegs:$Rx32),
+(ins PredRegs:$Pv4, IntRegs:$Rx32in, s4_2Imm:$Ii, IntRegs:$Rt32),
+"if ($Pv4.new) memw($Rx32++#$Ii) = $Rt32",
+ST_tc_st_pi_SLOT01, TypeST>, Enc_10065510, AddrModeRel {
+let Inst{2-2} = 0b0;
+let Inst{7-7} = 0b1;
+let Inst{13-13} = 0b1;
+let Inst{31-21} = 0b10101011100;
+let isPredicated = 1;
+let addrMode = PostInc;
+let accessSize = WordAccess;
+let isPredicatedNew = 1;
+let mayStore = 1;
+let BaseOpcode = "S2_storeri_pi";
+let isNVStorable = 1;
+let Constraints = "$Rx32 = $Rx32in";
+}
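+// S2_setbit_{i,r} set a single bit of $Rs32 selected by an immediate or a
+// register; the S2_shuff{e,o}{b,h} defs interleave byte or halfword lanes
+// of the two 64-bit source pairs (even and odd lanes, per the e/o infix).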
+def S2_setbit_i : HInst<
+(outs IntRegs:$Rd32),
+(ins IntRegs:$Rs32, u5_0Imm:$Ii),
+"$Rd32 = setbit($Rs32,#$Ii)",
+S_2op_tc_1_SLOT23, TypeS_2op>, Enc_2771456 {
+let Inst{7-5} = 0b000;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b10001100110;
+let hasNewValue = 1;
+let opNewValue = 0;
+}
+def S2_setbit_r : HInst<
+(outs IntRegs:$Rd32),
+(ins IntRegs:$Rs32, IntRegs:$Rt32),
+"$Rd32 = setbit($Rs32,$Rt32)",
+S_3op_tc_1_SLOT23, TypeS_3op>, Enc_14071773 {
+let Inst{7-5} = 0b000;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11000110100;
+let hasNewValue = 1;
+let opNewValue = 0;
+}
+def S2_shuffeb : HInst<
+(outs DoubleRegs:$Rdd32),
+(ins DoubleRegs:$Rss32, DoubleRegs:$Rtt32),
+"$Rdd32 = shuffeb($Rss32,$Rtt32)",
+S_3op_tc_1_SLOT23, TypeS_3op>, Enc_8333157 {
+let Inst{7-5} = 0b010;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11000001000;
+}
+def S2_shuffeh : HInst<
+(outs DoubleRegs:$Rdd32),
+(ins DoubleRegs:$Rss32, DoubleRegs:$Rtt32),
+"$Rdd32 = shuffeh($Rss32,$Rtt32)",
+S_3op_tc_1_SLOT23, TypeS_3op>, Enc_8333157 {
+let Inst{7-5} = 0b110;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11000001000;
+}
+def S2_shuffob : HInst<
+(outs DoubleRegs:$Rdd32),
+(ins DoubleRegs:$Rtt32, DoubleRegs:$Rss32),
+"$Rdd32 = shuffob($Rtt32,$Rss32)",
+S_3op_tc_1_SLOT23, TypeS_3op>, Enc_11687333 {
+let Inst{7-5} = 0b100;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11000001000;
+}
+def S2_shuffoh : HInst<
+(outs DoubleRegs:$Rdd32),
+(ins DoubleRegs:$Rtt32, DoubleRegs:$Rss32),
+"$Rdd32 = shuffoh($Rtt32,$Rss32)",
+S_3op_tc_1_SLOT23, TypeS_3op>, Enc_11687333 {
+let Inst{7-5} = 0b000;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11000001100;
+}
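+// Unpredicated stores, one cluster per element size. Mode suffixes: _io
+// base+offset, _pi post-increment by immediate, _pr post-increment by the
+// modifier register $Mu2, _pbr bit-reversed, _pci/_pcr circular addressing
+// (by immediate / by I) which use CS, *gp gp-relative using GP, and _zomap
+// the zero-offset pseudo. The *new_* defs are new-value stores pinned to
+// slot 0 by their itinerary classes.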
+def S2_storerb_io : HInst<
+(outs),
+(ins IntRegs:$Rs32, s32_0Imm:$Ii, IntRegs:$Rt32),
+"memb($Rs32+#$Ii) = $Rt32",
+ST_tc_st_SLOT01, TypeST>, Enc_13150110, AddrModeRel {
+let Inst{24-21} = 0b1000;
+let Inst{31-27} = 0b10100;
+let addrMode = BaseImmOffset;
+let accessSize = ByteAccess;
+let mayStore = 1;
+let CextOpcode = "S2_storerb";
+let InputType = "imm";
+let BaseOpcode = "S2_storerb_io";
+let isPredicable = 1;
+let isNVStorable = 1;
+let isExtendable = 1;
+let opExtendable = 1;
+let isExtentSigned = 1;
+let opExtentBits = 11;
+let opExtentAlign = 0;
+}
+def S2_storerb_pbr : HInst<
+(outs IntRegs:$Rx32),
+(ins IntRegs:$Rx32in, ModRegs:$Mu2, IntRegs:$Rt32),
+"memb($Rx32++$Mu2:brev) = $Rt32",
+ST_tc_st_SLOT01, TypeST>, Enc_7255914, AddrModeRel {
+let Inst{7-0} = 0b00000000;
+let Inst{31-21} = 0b10101111000;
+let accessSize = ByteAccess;
+let mayStore = 1;
+let BaseOpcode = "S2_storerb_pbr";
+let isNVStorable = 1;
+let Constraints = "$Rx32 = $Rx32in";
+}
+def S2_storerb_pci : HInst<
+(outs IntRegs:$Rx32),
+(ins IntRegs:$Rx32in, s4_0Imm:$Ii, ModRegs:$Mu2, IntRegs:$Rt32),
+"memb($Rx32++#$Ii:circ($Mu2)) = $Rt32",
+ST_tc_st_SLOT01, TypeST>, Enc_3915770 {
+let Inst{2-0} = 0b000;
+let Inst{7-7} = 0b0;
+let Inst{31-21} = 0b10101001000;
+let addrMode = PostInc;
+let accessSize = ByteAccess;
+let mayStore = 1;
+let Uses = [CS];
+let isNVStorable = 1;
+let Constraints = "$Rx32 = $Rx32in";
+}
+def S2_storerb_pcr : HInst<
+(outs IntRegs:$Rx32),
+(ins IntRegs:$Rx32in, ModRegs:$Mu2, IntRegs:$Rt32),
+"memb($Rx32++I:circ($Mu2)) = $Rt32",
+ST_tc_st_SLOT01, TypeST>, Enc_7255914 {
+let Inst{7-0} = 0b00000010;
+let Inst{31-21} = 0b10101001000;
+let addrMode = PostInc;
+let accessSize = ByteAccess;
+let mayStore = 1;
+let Uses = [CS];
+let isNVStorable = 1;
+let Constraints = "$Rx32 = $Rx32in";
+}
+def S2_storerb_pi : HInst<
+(outs IntRegs:$Rx32),
+(ins IntRegs:$Rx32in, s4_0Imm:$Ii, IntRegs:$Rt32),
+"memb($Rx32++#$Ii) = $Rt32",
+ST_tc_st_pi_SLOT01, TypeST>, Enc_12492533, AddrModeRel {
+let Inst{2-0} = 0b000;
+let Inst{7-7} = 0b0;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b10101011000;
+let addrMode = PostInc;
+let accessSize = ByteAccess;
+let mayStore = 1;
+let BaseOpcode = "S2_storerb_pi";
+let isPredicable = 1;
+let isNVStorable = 1;
+let Constraints = "$Rx32 = $Rx32in";
+}
+def S2_storerb_pr : HInst<
+(outs IntRegs:$Rx32),
+(ins IntRegs:$Rx32in, ModRegs:$Mu2, IntRegs:$Rt32),
+"memb($Rx32++$Mu2) = $Rt32",
+ST_tc_st_SLOT01, TypeST>, Enc_7255914 {
+let Inst{7-0} = 0b00000000;
+let Inst{31-21} = 0b10101101000;
+let addrMode = PostInc;
+let accessSize = ByteAccess;
+let mayStore = 1;
+let isNVStorable = 1;
+let Constraints = "$Rx32 = $Rx32in";
+}
+def S2_storerb_zomap : HInst<
+(outs),
+(ins IntRegs:$Rs32, IntRegs:$Rt32),
+"memb($Rs32) = $Rt32",
+PSEUDO, TypeMAPPING> {
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+}
+def S2_storerbgp : HInst<
+(outs),
+(ins u32_0Imm:$Ii, IntRegs:$Rt32),
+"memb(gp+#$Ii) = $Rt32",
+V2LDST_tc_st_SLOT01, TypeV2LDST>, Enc_12395768, AddrModeRel {
+let Inst{24-21} = 0b0000;
+let Inst{31-27} = 0b01001;
+let accessSize = ByteAccess;
+let mayStore = 1;
+let Uses = [GP];
+let BaseOpcode = "S2_storerbabs";
+let isPredicable = 1;
+let isNVStorable = 1;
+let opExtendable = 0;
+let isExtentSigned = 0;
+let opExtentBits = 16;
+let opExtentAlign = 0;
+}
+def S2_storerbnew_io : HInst<
+(outs),
+(ins IntRegs:$Rs32, s32_0Imm:$Ii, IntRegs:$Nt8),
+"memb($Rs32+#$Ii) = $Nt8.new",
+ST_tc_st_SLOT0, TypeST>, Enc_10002182, AddrModeRel {
+let Inst{12-11} = 0b00;
+let Inst{24-21} = 0b1101;
+let Inst{31-27} = 0b10100;
+let addrMode = BaseImmOffset;
+let accessSize = ByteAccess;
+let isNVStore = 1;
+let mayStore = 1;
+let isNewValue = 1;
+let CextOpcode = "S2_storerb";
+let InputType = "imm";
+let BaseOpcode = "S2_storerb_io";
+let isPredicable = 1;
+let isExtendable = 1;
+let opExtendable = 1;
+let isExtentSigned = 1;
+let opExtentBits = 11;
+let opExtentAlign = 0;
+let opNewValue = 2;
+}
+def S2_storerbnew_pbr : HInst<
+(outs IntRegs:$Rx32),
+(ins IntRegs:$Rx32in, ModRegs:$Mu2, IntRegs:$Nt8),
+"memb($Rx32++$Mu2:brev) = $Nt8.new",
+NCJ_tc_3or4stall_SLOT0, TypeST>, Enc_10067774, AddrModeRel {
+let Inst{7-0} = 0b00000000;
+let Inst{12-11} = 0b00;
+let Inst{31-21} = 0b10101111101;
+let accessSize = ByteAccess;
+let isNVStore = 1;
+let mayStore = 1;
+let isNewValue = 1;
+let BaseOpcode = "S2_storerb_pbr";
+let opNewValue = 3;
+let Constraints = "$Rx32 = $Rx32in";
+}
+def S2_storerbnew_pci : HInst<
+(outs IntRegs:$Rx32),
+(ins IntRegs:$Rx32in, s4_0Imm:$Ii, ModRegs:$Mu2, IntRegs:$Nt8),
+"memb($Rx32++#$Ii:circ($Mu2)) = $Nt8.new",
+NCJ_tc_3or4stall_SLOT0, TypeST>, Enc_5326450 {
+let Inst{2-0} = 0b000;
+let Inst{7-7} = 0b0;
+let Inst{12-11} = 0b00;
+let Inst{31-21} = 0b10101001101;
+let addrMode = PostInc;
+let accessSize = ByteAccess;
+let isNVStore = 1;
+let mayStore = 1;
+let isNewValue = 1;
+let Uses = [CS];
+let opNewValue = 4;
+let Constraints = "$Rx32 = $Rx32in";
+}
+def S2_storerbnew_pcr : HInst<
+(outs IntRegs:$Rx32),
+(ins IntRegs:$Rx32in, ModRegs:$Mu2, IntRegs:$Nt8),
+"memb($Rx32++I:circ($Mu2)) = $Nt8.new",
+NCJ_tc_3or4stall_SLOT0, TypeST>, Enc_10067774 {
+let Inst{7-0} = 0b00000010;
+let Inst{12-11} = 0b00;
+let Inst{31-21} = 0b10101001101;
+let addrMode = PostInc;
+let accessSize = ByteAccess;
+let isNVStore = 1;
+let mayStore = 1;
+let isNewValue = 1;
+let Uses = [CS];
+let opNewValue = 3;
+let Constraints = "$Rx32 = $Rx32in";
+}
+def S2_storerbnew_pi : HInst<
+(outs IntRegs:$Rx32),
+(ins IntRegs:$Rx32in, s4_0Imm:$Ii, IntRegs:$Nt8),
+"memb($Rx32++#$Ii) = $Nt8.new",
+ST_tc_st_pi_SLOT0, TypeST>, Enc_5900401, AddrModeRel {
+let Inst{2-0} = 0b000;
+let Inst{7-7} = 0b0;
+let Inst{13-11} = 0b000;
+let Inst{31-21} = 0b10101011101;
+let addrMode = PostInc;
+let accessSize = ByteAccess;
+let isNVStore = 1;
+let mayStore = 1;
+let isNewValue = 1;
+let BaseOpcode = "S2_storerb_pi";
+let isPredicable = 1;
+let isNVStorable = 1;
+let opNewValue = 3;
+let Constraints = "$Rx32 = $Rx32in";
+}
+def S2_storerbnew_pr : HInst<
+(outs IntRegs:$Rx32),
+(ins IntRegs:$Rx32in, ModRegs:$Mu2, IntRegs:$Nt8),
+"memb($Rx32++$Mu2) = $Nt8.new",
+ST_tc_st_SLOT0, TypeST>, Enc_10067774 {
+let Inst{7-0} = 0b00000000;
+let Inst{12-11} = 0b00;
+let Inst{31-21} = 0b10101101101;
+let addrMode = PostInc;
+let accessSize = ByteAccess;
+let isNVStore = 1;
+let mayStore = 1;
+let isNewValue = 1;
+let opNewValue = 3;
+let Constraints = "$Rx32 = $Rx32in";
+}
+def S2_storerbnew_zomap : HInst<
+(outs),
+(ins IntRegs:$Rs32, IntRegs:$Nt8),
+"memb($Rs32) = $Nt8.new",
+PSEUDO, TypeMAPPING> {
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let opNewValue = 1;
+}
+def S2_storerbnewgp : HInst<
+(outs),
+(ins u32_0Imm:$Ii, IntRegs:$Nt8),
+"memb(gp+#$Ii) = $Nt8.new",
+V2LDST_tc_st_SLOT0, TypeV2LDST>, Enc_4050532, AddrModeRel {
+let Inst{12-11} = 0b00;
+let Inst{24-21} = 0b0101;
+let Inst{31-27} = 0b01001;
+let accessSize = ByteAccess;
+let isNVStore = 1;
+let mayStore = 1;
+let isNewValue = 1;
+let Uses = [GP];
+let BaseOpcode = "S2_storerbabs";
+let isPredicable = 1;
+let opExtendable = 0;
+let isExtentSigned = 0;
+let opExtentBits = 16;
+let opExtentAlign = 0;
+let opNewValue = 1;
+}
+def S2_storerd_io : HInst<
+(outs),
+(ins IntRegs:$Rs32, s29_3Imm:$Ii, DoubleRegs:$Rtt32),
+"memd($Rs32+#$Ii) = $Rtt32",
+ST_tc_st_SLOT01, TypeST>, Enc_16319737, AddrModeRel {
+let Inst{24-21} = 0b1110;
+let Inst{31-27} = 0b10100;
+let addrMode = BaseImmOffset;
+let accessSize = DoubleWordAccess;
+let mayStore = 1;
+let CextOpcode = "S2_storerd";
+let InputType = "imm";
+let BaseOpcode = "S2_storerd_io";
+let isPredicable = 1;
+let isExtendable = 1;
+let opExtendable = 1;
+let isExtentSigned = 1;
+let opExtentBits = 14;
+let opExtentAlign = 3;
+}
+def S2_storerd_pbr : HInst<
+(outs IntRegs:$Rx32),
+(ins IntRegs:$Rx32in, ModRegs:$Mu2, DoubleRegs:$Rtt32),
+"memd($Rx32++$Mu2:brev) = $Rtt32",
+ST_tc_st_SLOT01, TypeST>, Enc_15816255 {
+let Inst{7-0} = 0b00000000;
+let Inst{31-21} = 0b10101111110;
+let accessSize = DoubleWordAccess;
+let mayStore = 1;
+let Constraints = "$Rx32 = $Rx32in";
+}
+def S2_storerd_pci : HInst<
+(outs IntRegs:$Rx32),
+(ins IntRegs:$Rx32in, s4_3Imm:$Ii, ModRegs:$Mu2, DoubleRegs:$Rtt32),
+"memd($Rx32++#$Ii:circ($Mu2)) = $Rtt32",
+ST_tc_st_SLOT01, TypeST>, Enc_4501395 {
+let Inst{2-0} = 0b000;
+let Inst{7-7} = 0b0;
+let Inst{31-21} = 0b10101001110;
+let addrMode = PostInc;
+let accessSize = DoubleWordAccess;
+let mayStore = 1;
+let Uses = [CS];
+let Constraints = "$Rx32 = $Rx32in";
+}
+def S2_storerd_pcr : HInst<
+(outs IntRegs:$Rx32),
+(ins IntRegs:$Rx32in, ModRegs:$Mu2, DoubleRegs:$Rtt32),
+"memd($Rx32++I:circ($Mu2)) = $Rtt32",
+ST_tc_st_SLOT01, TypeST>, Enc_15816255 {
+let Inst{7-0} = 0b00000010;
+let Inst{31-21} = 0b10101001110;
+let addrMode = PostInc;
+let accessSize = DoubleWordAccess;
+let mayStore = 1;
+let Uses = [CS];
+let Constraints = "$Rx32 = $Rx32in";
+}
+def S2_storerd_pi : HInst<
+(outs IntRegs:$Rx32),
+(ins IntRegs:$Rx32in, s4_3Imm:$Ii, DoubleRegs:$Rtt32),
+"memd($Rx32++#$Ii) = $Rtt32",
+ST_tc_st_pi_SLOT01, TypeST>, Enc_11271630, AddrModeRel {
+let Inst{2-0} = 0b000;
+let Inst{7-7} = 0b0;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b10101011110;
+let addrMode = PostInc;
+let accessSize = DoubleWordAccess;
+let mayStore = 1;
+let CextOpcode = "S2_storerd";
+let BaseOpcode = "S2_storerd_pi";
+let isPredicable = 1;
+let Constraints = "$Rx32 = $Rx32in";
+}
+def S2_storerd_pr : HInst<
+(outs IntRegs:$Rx32),
+(ins IntRegs:$Rx32in, ModRegs:$Mu2, DoubleRegs:$Rtt32),
+"memd($Rx32++$Mu2) = $Rtt32",
+ST_tc_st_SLOT01, TypeST>, Enc_15816255 {
+let Inst{7-0} = 0b00000000;
+let Inst{31-21} = 0b10101101110;
+let addrMode = PostInc;
+let accessSize = DoubleWordAccess;
+let mayStore = 1;
+let Constraints = "$Rx32 = $Rx32in";
+}
+def S2_storerd_zomap : HInst<
+(outs),
+(ins IntRegs:$Rs32, DoubleRegs:$Rtt32),
+"memd($Rs32) = $Rtt32",
+PSEUDO, TypeMAPPING> {
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+}
+def S2_storerdgp : HInst<
+(outs),
+(ins u29_3Imm:$Ii, DoubleRegs:$Rtt32),
+"memd(gp+#$Ii) = $Rtt32",
+V2LDST_tc_st_SLOT01, TypeV2LDST>, Enc_11682941, AddrModeRel {
+let Inst{24-21} = 0b0110;
+let Inst{31-27} = 0b01001;
+let accessSize = DoubleWordAccess;
+let mayStore = 1;
+let Uses = [GP];
+let BaseOpcode = "S2_storerdabs";
+let isPredicable = 1;
+let opExtendable = 0;
+let isExtentSigned = 0;
+let opExtentBits = 19;
+let opExtentAlign = 3;
+}
+def S2_storerf_io : HInst<
+(outs),
+(ins IntRegs:$Rs32, s31_1Imm:$Ii, IntRegs:$Rt32),
+"memh($Rs32+#$Ii) = $Rt32.h",
+ST_tc_st_SLOT01, TypeST>, Enc_7736768, AddrModeRel {
+let Inst{24-21} = 0b1011;
+let Inst{31-27} = 0b10100;
+let addrMode = BaseImmOffset;
+let accessSize = HalfWordAccess;
+let mayStore = 1;
+let CextOpcode = "S2_storerf";
+let InputType = "imm";
+let BaseOpcode = "S2_storerf_io";
+let isPredicable = 1;
+let isExtendable = 1;
+let opExtendable = 1;
+let isExtentSigned = 1;
+let opExtentBits = 12;
+let opExtentAlign = 1;
+}
+def S2_storerf_pbr : HInst<
+(outs IntRegs:$Rx32),
+(ins IntRegs:$Rx32in, ModRegs:$Mu2, IntRegs:$Rt32),
+"memh($Rx32++$Mu2:brev) = $Rt32.h",
+ST_tc_st_SLOT01, TypeST>, Enc_7255914 {
+let Inst{7-0} = 0b00000000;
+let Inst{31-21} = 0b10101111011;
+let accessSize = HalfWordAccess;
+let mayStore = 1;
+let Constraints = "$Rx32 = $Rx32in";
+}
+def S2_storerf_pci : HInst<
+(outs IntRegs:$Rx32),
+(ins IntRegs:$Rx32in, s4_1Imm:$Ii, ModRegs:$Mu2, IntRegs:$Rt32),
+"memh($Rx32++#$Ii:circ($Mu2)) = $Rt32.h",
+ST_tc_st_SLOT01, TypeST>, Enc_10915758 {
+let Inst{2-0} = 0b000;
+let Inst{7-7} = 0b0;
+let Inst{31-21} = 0b10101001011;
+let addrMode = PostInc;
+let accessSize = HalfWordAccess;
+let mayStore = 1;
+let Uses = [CS];
+let Constraints = "$Rx32 = $Rx32in";
+}
+def S2_storerf_pcr : HInst<
+(outs IntRegs:$Rx32),
+(ins IntRegs:$Rx32in, ModRegs:$Mu2, IntRegs:$Rt32),
+"memh($Rx32++I:circ($Mu2)) = $Rt32.h",
+ST_tc_st_SLOT01, TypeST>, Enc_7255914 {
+let Inst{7-0} = 0b00000010;
+let Inst{31-21} = 0b10101001011;
+let addrMode = PostInc;
+let accessSize = HalfWordAccess;
+let mayStore = 1;
+let Uses = [CS];
+let Constraints = "$Rx32 = $Rx32in";
+}
+def S2_storerf_pi : HInst<
+(outs IntRegs:$Rx32),
+(ins IntRegs:$Rx32in, s4_1Imm:$Ii, IntRegs:$Rt32),
+"memh($Rx32++#$Ii) = $Rt32.h",
+ST_tc_st_pi_SLOT01, TypeST>, Enc_11492529, AddrModeRel {
+let Inst{2-0} = 0b000;
+let Inst{7-7} = 0b0;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b10101011011;
+let addrMode = PostInc;
+let accessSize = HalfWordAccess;
+let mayStore = 1;
+let CextOpcode = "S2_storerf";
+let BaseOpcode = "S2_storerf_pi";
+let isPredicable = 1;
+let Constraints = "$Rx32 = $Rx32in";
+}
+def S2_storerf_pr : HInst<
+(outs IntRegs:$Rx32),
+(ins IntRegs:$Rx32in, ModRegs:$Mu2, IntRegs:$Rt32),
+"memh($Rx32++$Mu2) = $Rt32.h",
+ST_tc_st_SLOT01, TypeST>, Enc_7255914 {
+let Inst{7-0} = 0b00000000;
+let Inst{31-21} = 0b10101101011;
+let addrMode = PostInc;
+let accessSize = HalfWordAccess;
+let mayStore = 1;
+let Constraints = "$Rx32 = $Rx32in";
+}
+def S2_storerf_zomap : HInst<
+(outs),
+(ins IntRegs:$Rs32, IntRegs:$Rt32),
+"memh($Rs32) = $Rt32.h",
+PSEUDO, TypeMAPPING> {
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+}
+def S2_storerfgp : HInst<
+(outs),
+(ins u31_1Imm:$Ii, IntRegs:$Rt32),
+"memh(gp+#$Ii) = $Rt32.h",
+V2LDST_tc_st_SLOT01, TypeV2LDST>, Enc_1186018, AddrModeRel {
+let Inst{24-21} = 0b0011;
+let Inst{31-27} = 0b01001;
+let accessSize = HalfWordAccess;
+let mayStore = 1;
+let Uses = [GP];
+let BaseOpcode = "S2_storerfabs";
+let isPredicable = 1;
+let opExtendable = 0;
+let isExtentSigned = 0;
+let opExtentBits = 17;
+let opExtentAlign = 1;
+}
+def S2_storerh_io : HInst<
+(outs),
+(ins IntRegs:$Rs32, s31_1Imm:$Ii, IntRegs:$Rt32),
+"memh($Rs32+#$Ii) = $Rt32",
+ST_tc_st_SLOT01, TypeST>, Enc_7736768, AddrModeRel {
+let Inst{24-21} = 0b1010;
+let Inst{31-27} = 0b10100;
+let addrMode = BaseImmOffset;
+let accessSize = HalfWordAccess;
+let mayStore = 1;
+let CextOpcode = "S2_storerh";
+let InputType = "imm";
+let BaseOpcode = "S2_storerh_io";
+let isPredicable = 1;
+let isNVStorable = 1;
+let isExtendable = 1;
+let opExtendable = 1;
+let isExtentSigned = 1;
+let opExtentBits = 12;
+let opExtentAlign = 1;
+}
+def S2_storerh_pbr : HInst<
+(outs IntRegs:$Rx32),
+(ins IntRegs:$Rx32in, ModRegs:$Mu2, IntRegs:$Rt32),
+"memh($Rx32++$Mu2:brev) = $Rt32",
+ST_tc_st_SLOT01, TypeST>, Enc_7255914, AddrModeRel {
+let Inst{7-0} = 0b00000000;
+let Inst{31-21} = 0b10101111010;
+let accessSize = HalfWordAccess;
+let mayStore = 1;
+let BaseOpcode = "S2_storerh_pbr";
+let isNVStorable = 1;
+let Constraints = "$Rx32 = $Rx32in";
+}
+def S2_storerh_pci : HInst<
+(outs IntRegs:$Rx32),
+(ins IntRegs:$Rx32in, s4_1Imm:$Ii, ModRegs:$Mu2, IntRegs:$Rt32),
+"memh($Rx32++#$Ii:circ($Mu2)) = $Rt32",
+ST_tc_st_SLOT01, TypeST>, Enc_10915758 {
+let Inst{2-0} = 0b000;
+let Inst{7-7} = 0b0;
+let Inst{31-21} = 0b10101001010;
+let addrMode = PostInc;
+let accessSize = HalfWordAccess;
+let mayStore = 1;
+let Uses = [CS];
+let isNVStorable = 1;
+let Constraints = "$Rx32 = $Rx32in";
+}
+def S2_storerh_pcr : HInst<
+(outs IntRegs:$Rx32),
+(ins IntRegs:$Rx32in, ModRegs:$Mu2, IntRegs:$Rt32),
+"memh($Rx32++I:circ($Mu2)) = $Rt32",
+ST_tc_st_SLOT01, TypeST>, Enc_7255914 {
+let Inst{7-0} = 0b00000010;
+let Inst{31-21} = 0b10101001010;
+let addrMode = PostInc;
+let accessSize = HalfWordAccess;
+let mayStore = 1;
+let Uses = [CS];
+let isNVStorable = 1;
+let Constraints = "$Rx32 = $Rx32in";
+}
+def S2_storerh_pi : HInst<
+(outs IntRegs:$Rx32),
+(ins IntRegs:$Rx32in, s4_1Imm:$Ii, IntRegs:$Rt32),
+"memh($Rx32++#$Ii) = $Rt32",
+ST_tc_st_pi_SLOT01, TypeST>, Enc_11492529, AddrModeRel {
+let Inst{2-0} = 0b000;
+let Inst{7-7} = 0b0;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b10101011010;
+let addrMode = PostInc;
+let accessSize = HalfWordAccess;
+let mayStore = 1;
+let BaseOpcode = "S2_storerh_pi";
+let isPredicable = 1;
+let isNVStorable = 1;
+let Constraints = "$Rx32 = $Rx32in";
+}
+def S2_storerh_pr : HInst<
+(outs IntRegs:$Rx32),
+(ins IntRegs:$Rx32in, ModRegs:$Mu2, IntRegs:$Rt32),
+"memh($Rx32++$Mu2) = $Rt32",
+ST_tc_st_SLOT01, TypeST>, Enc_7255914 {
+let Inst{7-0} = 0b00000000;
+let Inst{31-21} = 0b10101101010;
+let addrMode = PostInc;
+let accessSize = HalfWordAccess;
+let mayStore = 1;
+let isNVStorable = 1;
+let Constraints = "$Rx32 = $Rx32in";
+}
+def S2_storerh_zomap : HInst<
+(outs),
+(ins IntRegs:$Rs32, IntRegs:$Rt32),
+"memh($Rs32) = $Rt32",
+PSEUDO, TypeMAPPING> {
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+}
+def S2_storerhgp : HInst<
+(outs),
+(ins u31_1Imm:$Ii, IntRegs:$Rt32),
+"memh(gp+#$Ii) = $Rt32",
+V2LDST_tc_st_SLOT01, TypeV2LDST>, Enc_1186018, AddrModeRel {
+let Inst{24-21} = 0b0010;
+let Inst{31-27} = 0b01001;
+let accessSize = HalfWordAccess;
+let mayStore = 1;
+let Uses = [GP];
+let BaseOpcode = "S2_storerhabs";
+let isPredicable = 1;
+let isNVStorable = 1;
+let opExtendable = 0;
+let isExtentSigned = 0;
+let opExtentBits = 17;
+let opExtentAlign = 1;
+}
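+// S2_storerhnew_*: new-value halfword stores. $Nt8.new forwards a value
+// produced by another instruction in the same packet; opNewValue records
+// which operand carries it. New-value stores issue only in slot 0, hence
+// the ..._SLOT0 itineraries.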
+def S2_storerhnew_io : HInst<
+(outs),
+(ins IntRegs:$Rs32, s31_1Imm:$Ii, IntRegs:$Nt8),
+"memh($Rs32+#$Ii) = $Nt8.new",
+ST_tc_st_SLOT0, TypeST>, Enc_748676, AddrModeRel {
+let Inst{12-11} = 0b01;
+let Inst{24-21} = 0b1101;
+let Inst{31-27} = 0b10100;
+let addrMode = BaseImmOffset;
+let accessSize = HalfWordAccess;
+let isNVStore = 1;
+let mayStore = 1;
+let isNewValue = 1;
+let CextOpcode = "S2_storerh";
+let InputType = "imm";
+let BaseOpcode = "S2_storerh_io";
+let isPredicable = 1;
+let isExtendable = 1;
+let opExtendable = 1;
+let isExtentSigned = 1;
+let opExtentBits = 12;
+let opExtentAlign = 1;
+let opNewValue = 2;
+}
+def S2_storerhnew_pbr : HInst<
+(outs IntRegs:$Rx32),
+(ins IntRegs:$Rx32in, ModRegs:$Mu2, IntRegs:$Nt8),
+"memh($Rx32++$Mu2:brev) = $Nt8.new",
+NCJ_tc_3or4stall_SLOT0, TypeST>, Enc_10067774, AddrModeRel {
+let Inst{7-0} = 0b00000000;
+let Inst{12-11} = 0b01;
+let Inst{31-21} = 0b10101111101;
+let accessSize = HalfWordAccess;
+let isNVStore = 1;
+let mayStore = 1;
+let isNewValue = 1;
+let BaseOpcode = "S2_storerh_pbr";
+let opNewValue = 3;
+let Constraints = "$Rx32 = $Rx32in";
+}
+def S2_storerhnew_pci : HInst<
+(outs IntRegs:$Rx32),
+(ins IntRegs:$Rx32in, s4_1Imm:$Ii, ModRegs:$Mu2, IntRegs:$Nt8),
+"memh($Rx32++#$Ii:circ($Mu2)) = $Nt8.new",
+NCJ_tc_3or4stall_SLOT0, TypeST>, Enc_10326434 {
+let Inst{2-0} = 0b000;
+let Inst{7-7} = 0b0;
+let Inst{12-11} = 0b01;
+let Inst{31-21} = 0b10101001101;
+let addrMode = PostInc;
+let accessSize = HalfWordAccess;
+let isNVStore = 1;
+let mayStore = 1;
+let isNewValue = 1;
+let Uses = [CS];
+let opNewValue = 4;
+let Constraints = "$Rx32 = $Rx32in";
+}
+def S2_storerhnew_pcr : HInst<
+(outs IntRegs:$Rx32),
+(ins IntRegs:$Rx32in, ModRegs:$Mu2, IntRegs:$Nt8),
+"memh($Rx32++I:circ($Mu2)) = $Nt8.new",
+NCJ_tc_3or4stall_SLOT0, TypeST>, Enc_10067774 {
+let Inst{7-0} = 0b00000010;
+let Inst{12-11} = 0b01;
+let Inst{31-21} = 0b10101001101;
+let addrMode = PostInc;
+let accessSize = HalfWordAccess;
+let isNVStore = 1;
+let mayStore = 1;
+let isNewValue = 1;
+let Uses = [CS];
+let opNewValue = 3;
+let Constraints = "$Rx32 = $Rx32in";
+}
+def S2_storerhnew_pi : HInst<
+(outs IntRegs:$Rx32),
+(ins IntRegs:$Rx32in, s4_1Imm:$Ii, IntRegs:$Nt8),
+"memh($Rx32++#$Ii) = $Nt8.new",
+ST_tc_st_pi_SLOT0, TypeST>, Enc_6900405, AddrModeRel {
+let Inst{2-0} = 0b000;
+let Inst{7-7} = 0b0;
+let Inst{13-11} = 0b001;
+let Inst{31-21} = 0b10101011101;
+let addrMode = PostInc;
+let accessSize = HalfWordAccess;
+let isNVStore = 1;
+let mayStore = 1;
+let isNewValue = 1;
+let BaseOpcode = "S2_storerh_pi";
+let isNVStorable = 1;
+let isPredicable = 1;
+let opNewValue = 3;
+let Constraints = "$Rx32 = $Rx32in";
+}
+def S2_storerhnew_pr : HInst<
+(outs IntRegs:$Rx32),
+(ins IntRegs:$Rx32in, ModRegs:$Mu2, IntRegs:$Nt8),
+"memh($Rx32++$Mu2) = $Nt8.new",
+ST_tc_st_SLOT0, TypeST>, Enc_10067774 {
+let Inst{7-0} = 0b00000000;
+let Inst{12-11} = 0b01;
+let Inst{31-21} = 0b10101101101;
+let addrMode = PostInc;
+let accessSize = HalfWordAccess;
+let isNVStore = 1;
+let mayStore = 1;
+let isNewValue = 1;
+let opNewValue = 3;
+let Constraints = "$Rx32 = $Rx32in";
+}
+def S2_storerhnew_zomap : HInst<
+(outs),
+(ins IntRegs:$Rs32, IntRegs:$Nt8),
+"memh($Rs32) = $Nt8.new",
+PSEUDO, TypeMAPPING> {
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let opNewValue = 1;
+}
+def S2_storerhnewgp : HInst<
+(outs),
+(ins u31_1Imm:$Ii, IntRegs:$Nt8),
+"memh(gp+#$Ii) = $Nt8.new",
+V2LDST_tc_st_SLOT0, TypeV2LDST>, Enc_13618890, AddrModeRel {
+let Inst{12-11} = 0b01;
+let Inst{24-21} = 0b0101;
+let Inst{31-27} = 0b01001;
+let accessSize = HalfWordAccess;
+let isNVStore = 1;
+let mayStore = 1;
+let isNewValue = 1;
+let Uses = [GP];
+let BaseOpcode = "S2_storerhabs";
+let isPredicable = 1;
+let opExtendable = 0;
+let isExtentSigned = 0;
+let opExtentBits = 17;
+let opExtentAlign = 1;
+let opNewValue = 1;
+}
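+// S2_storeri_*: word stores, mirroring the halfword family above with
+// WordAccess and 4-byte-scaled offsets (s30_2Imm, s4_2Imm, u30_2Imm).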
+def S2_storeri_io : HInst<
+(outs),
+(ins IntRegs:$Rs32, s30_2Imm:$Ii, IntRegs:$Rt32),
+"memw($Rs32+#$Ii) = $Rt32",
+ST_tc_st_SLOT01, TypeST>, Enc_6673186, AddrModeRel {
+let Inst{24-21} = 0b1100;
+let Inst{31-27} = 0b10100;
+let addrMode = BaseImmOffset;
+let accessSize = WordAccess;
+let mayStore = 1;
+let CextOpcode = "S2_storeri";
+let InputType = "imm";
+let BaseOpcode = "S2_storeri_io";
+let isPredicable = 1;
+let isNVStorable = 1;
+let isExtendable = 1;
+let opExtendable = 1;
+let isExtentSigned = 1;
+let opExtentBits = 13;
+let opExtentAlign = 2;
+}
+def S2_storeri_pbr : HInst<
+(outs IntRegs:$Rx32),
+(ins IntRegs:$Rx32in, ModRegs:$Mu2, IntRegs:$Rt32),
+"memw($Rx32++$Mu2:brev) = $Rt32",
+ST_tc_st_SLOT01, TypeST>, Enc_7255914, AddrModeRel {
+let Inst{7-0} = 0b00000000;
+let Inst{31-21} = 0b10101111100;
+let accessSize = WordAccess;
+let mayStore = 1;
+let BaseOpcode = "S2_storeri_pbr";
+let isNVStorable = 1;
+let Constraints = "$Rx32 = $Rx32in";
+}
+def S2_storeri_pci : HInst<
+(outs IntRegs:$Rx32),
+(ins IntRegs:$Rx32in, s4_2Imm:$Ii, ModRegs:$Mu2, IntRegs:$Rt32),
+"memw($Rx32++#$Ii:circ($Mu2)) = $Rt32",
+ST_tc_st_SLOT01, TypeST>, Enc_9915754 {
+let Inst{2-0} = 0b000;
+let Inst{7-7} = 0b0;
+let Inst{31-21} = 0b10101001100;
+let addrMode = PostInc;
+let accessSize = WordAccess;
+let mayStore = 1;
+let Uses = [CS];
+let isNVStorable = 1;
+let Constraints = "$Rx32 = $Rx32in";
+}
+def S2_storeri_pcr : HInst<
+(outs IntRegs:$Rx32),
+(ins IntRegs:$Rx32in, ModRegs:$Mu2, IntRegs:$Rt32),
+"memw($Rx32++I:circ($Mu2)) = $Rt32",
+ST_tc_st_SLOT01, TypeST>, Enc_7255914 {
+let Inst{7-0} = 0b00000010;
+let Inst{31-21} = 0b10101001100;
+let addrMode = PostInc;
+let accessSize = WordAccess;
+let mayStore = 1;
+let Uses = [CS];
+let isNVStorable = 1;
+let Constraints = "$Rx32 = $Rx32in";
+}
+def S2_storeri_pi : HInst<
+(outs IntRegs:$Rx32),
+(ins IntRegs:$Rx32in, s4_2Imm:$Ii, IntRegs:$Rt32),
+"memw($Rx32++#$Ii) = $Rt32",
+ST_tc_st_pi_SLOT01, TypeST>, Enc_10492541, AddrModeRel {
+let Inst{2-0} = 0b000;
+let Inst{7-7} = 0b0;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b10101011100;
+let addrMode = PostInc;
+let accessSize = WordAccess;
+let mayStore = 1;
+let BaseOpcode = "S2_storeri_pi";
+let isPredicable = 1;
+let isNVStorable = 1;
+let Constraints = "$Rx32 = $Rx32in";
+}
+def S2_storeri_pr : HInst<
+(outs IntRegs:$Rx32),
+(ins IntRegs:$Rx32in, ModRegs:$Mu2, IntRegs:$Rt32),
+"memw($Rx32++$Mu2) = $Rt32",
+ST_tc_st_SLOT01, TypeST>, Enc_7255914 {
+let Inst{7-0} = 0b00000000;
+let Inst{31-21} = 0b10101101100;
+let addrMode = PostInc;
+let accessSize = WordAccess;
+let mayStore = 1;
+let isNVStorable = 1;
+let Constraints = "$Rx32 = $Rx32in";
+}
+def S2_storeri_zomap : HInst<
+(outs),
+(ins IntRegs:$Rs32, IntRegs:$Rt32),
+"memw($Rs32) = $Rt32",
+PSEUDO, TypeMAPPING> {
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+}
+def S2_storerigp : HInst<
+(outs),
+(ins u30_2Imm:$Ii, IntRegs:$Rt32),
+"memw(gp+#$Ii) = $Rt32",
+V2LDST_tc_st_SLOT01, TypeV2LDST>, Enc_15999208, AddrModeRel {
+let Inst{24-21} = 0b0100;
+let Inst{31-27} = 0b01001;
+let accessSize = WordAccess;
+let mayStore = 1;
+let Uses = [GP];
+let BaseOpcode = "S2_storeriabs";
+let isPredicable = 1;
+let isNVStorable = 1;
+let opExtendable = 0;
+let isExtentSigned = 0;
+let opExtentBits = 18;
+let opExtentAlign = 2;
+}
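+// S2_storerinew_*: new-value word stores; slot-0 only, like the halfword
+// new-value forms.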
+def S2_storerinew_io : HInst<
+(outs),
+(ins IntRegs:$Rs32, s30_2Imm:$Ii, IntRegs:$Nt8),
+"memw($Rs32+#$Ii) = $Nt8.new",
+ST_tc_st_SLOT0, TypeST>, Enc_8409782, AddrModeRel {
+let Inst{12-11} = 0b10;
+let Inst{24-21} = 0b1101;
+let Inst{31-27} = 0b10100;
+let addrMode = BaseImmOffset;
+let accessSize = WordAccess;
+let isNVStore = 1;
+let mayStore = 1;
+let isNewValue = 1;
+let CextOpcode = "S2_storeri";
+let InputType = "imm";
+let BaseOpcode = "S2_storeri_io";
+let isPredicable = 1;
+let isExtendable = 1;
+let opExtendable = 1;
+let isExtentSigned = 1;
+let opExtentBits = 13;
+let opExtentAlign = 2;
+let opNewValue = 2;
+}
+def S2_storerinew_pbr : HInst<
+(outs IntRegs:$Rx32),
+(ins IntRegs:$Rx32in, ModRegs:$Mu2, IntRegs:$Nt8),
+"memw($Rx32++$Mu2:brev) = $Nt8.new",
+NCJ_tc_3or4stall_SLOT0, TypeST>, Enc_10067774, AddrModeRel {
+let Inst{7-0} = 0b00000000;
+let Inst{12-11} = 0b10;
+let Inst{31-21} = 0b10101111101;
+let accessSize = WordAccess;
+let isNVStore = 1;
+let mayStore = 1;
+let isNewValue = 1;
+let BaseOpcode = "S2_storeri_pbr";
+let opNewValue = 3;
+let Constraints = "$Rx32 = $Rx32in";
+}
+def S2_storerinew_pci : HInst<
+(outs IntRegs:$Rx32),
+(ins IntRegs:$Rx32in, s4_2Imm:$Ii, ModRegs:$Mu2, IntRegs:$Nt8),
+"memw($Rx32++#$Ii:circ($Mu2)) = $Nt8.new",
+NCJ_tc_3or4stall_SLOT0, TypeST>, Enc_11326438 {
+let Inst{2-0} = 0b000;
+let Inst{7-7} = 0b0;
+let Inst{12-11} = 0b10;
+let Inst{31-21} = 0b10101001101;
+let addrMode = PostInc;
+let accessSize = WordAccess;
+let isNVStore = 1;
+let mayStore = 1;
+let isNewValue = 1;
+let Uses = [CS];
+let opNewValue = 4;
+let Constraints = "$Rx32 = $Rx32in";
+}
+def S2_storerinew_pcr : HInst<
+(outs IntRegs:$Rx32),
+(ins IntRegs:$Rx32in, ModRegs:$Mu2, IntRegs:$Nt8),
+"memw($Rx32++I:circ($Mu2)) = $Nt8.new",
+NCJ_tc_3or4stall_SLOT0, TypeST>, Enc_10067774 {
+let Inst{7-0} = 0b00000010;
+let Inst{12-11} = 0b10;
+let Inst{31-21} = 0b10101001101;
+let addrMode = PostInc;
+let accessSize = WordAccess;
+let isNVStore = 1;
+let mayStore = 1;
+let isNewValue = 1;
+let Uses = [CS];
+let opNewValue = 3;
+let Constraints = "$Rx32 = $Rx32in";
+}
+def S2_storerinew_pi : HInst<
+(outs IntRegs:$Rx32),
+(ins IntRegs:$Rx32in, s4_2Imm:$Ii, IntRegs:$Nt8),
+"memw($Rx32++#$Ii) = $Nt8.new",
+ST_tc_st_pi_SLOT0, TypeST>, Enc_7900405, AddrModeRel {
+let Inst{2-0} = 0b000;
+let Inst{7-7} = 0b0;
+let Inst{13-11} = 0b010;
+let Inst{31-21} = 0b10101011101;
+let addrMode = PostInc;
+let accessSize = WordAccess;
+let isNVStore = 1;
+let mayStore = 1;
+let isNewValue = 1;
+let BaseOpcode = "S2_storeri_pi";
+let isPredicable = 1;
+let opNewValue = 3;
+let Constraints = "$Rx32 = $Rx32in";
+}
+def S2_storerinew_pr : HInst<
+(outs IntRegs:$Rx32),
+(ins IntRegs:$Rx32in, ModRegs:$Mu2, IntRegs:$Nt8),
+"memw($Rx32++$Mu2) = $Nt8.new",
+ST_tc_st_SLOT0, TypeST>, Enc_10067774 {
+let Inst{7-0} = 0b00000000;
+let Inst{12-11} = 0b10;
+let Inst{31-21} = 0b10101101101;
+let addrMode = PostInc;
+let accessSize = WordAccess;
+let isNVStore = 1;
+let mayStore = 1;
+let isNewValue = 1;
+let opNewValue = 3;
+let Constraints = "$Rx32 = $Rx32in";
+}
+def S2_storerinew_zomap : HInst<
+(outs),
+(ins IntRegs:$Rs32, IntRegs:$Nt8),
+"memw($Rs32) = $Nt8.new",
+PSEUDO, TypeMAPPING> {
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let opNewValue = 1;
+}
+def S2_storerinewgp : HInst<
+(outs),
+(ins u30_2Imm:$Ii, IntRegs:$Nt8),
+"memw(gp+#$Ii) = $Nt8.new",
+V2LDST_tc_st_SLOT0, TypeV2LDST>, Enc_12297800, AddrModeRel {
+let Inst{12-11} = 0b10;
+let Inst{24-21} = 0b0101;
+let Inst{31-27} = 0b01001;
+let accessSize = WordAccess;
+let isNVStore = 1;
+let mayStore = 1;
+let isNewValue = 1;
+let Uses = [GP];
+let BaseOpcode = "S2_storeriabs";
+let isPredicable = 1;
+let opExtendable = 0;
+let isExtentSigned = 0;
+let opExtentBits = 18;
+let opExtentAlign = 2;
+let opNewValue = 1;
+}
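+// S2_storew_locked: store-conditional word (the store half of Hexagon's
+// LL/SC pair). $Pd4 reports success and is produced late in the cycle,
+// hence isPredicateLate.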
+def S2_storew_locked : HInst<
+(outs PredRegs:$Pd4),
+(ins IntRegs:$Rs32, IntRegs:$Rt32),
+"memw_locked($Rs32,$Pd4) = $Rt32",
+ST_tc_ld_SLOT0, TypeST>, Enc_10157519 {
+let Inst{7-2} = 0b000000;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b10100000101;
+let accessSize = WordAccess;
+let isSoloAX = 1;
+let mayStore = 1;
+let isPredicateLate = 1;
+}
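+// Saturating pack/extend group: these set the sticky overflow bit on
+// saturation, hence Defs = [USR_OVF]. The _nopack forms saturate each
+// lane but keep the full 64-bit layout instead of narrowing.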
+def S2_svsathb : HInst<
+(outs IntRegs:$Rd32),
+(ins IntRegs:$Rs32),
+"$Rd32 = vsathb($Rs32)",
+S_2op_tc_1_SLOT23, TypeS_2op>, Enc_4075554 {
+let Inst{13-5} = 0b000000000;
+let Inst{31-21} = 0b10001100100;
+let hasNewValue = 1;
+let opNewValue = 0;
+let Defs = [USR_OVF];
+}
+def S2_svsathub : HInst<
+(outs IntRegs:$Rd32),
+(ins IntRegs:$Rs32),
+"$Rd32 = vsathub($Rs32)",
+S_2op_tc_1_SLOT23, TypeS_2op>, Enc_4075554 {
+let Inst{13-5} = 0b000000010;
+let Inst{31-21} = 0b10001100100;
+let hasNewValue = 1;
+let opNewValue = 0;
+let Defs = [USR_OVF];
+}
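+// S2_tableidx*: table-index bit-field insert. The :raw forms encode the
+// signed offset field directly; the _goodsyntax pseudos take the
+// user-facing immediate and are converted to the raw form later.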
+def S2_tableidxb : HInst<
+(outs IntRegs:$Rx32),
+(ins IntRegs:$Rx32in, IntRegs:$Rs32, u4_0Imm:$Ii, s6_0Imm:$II),
+"$Rx32 = tableidxb($Rs32,#$Ii,#$II):raw",
+S_2op_tc_1_SLOT23, TypeS_2op>, Enc_8838398 {
+let Inst{31-22} = 0b1000011100;
+let hasNewValue = 1;
+let opNewValue = 0;
+let prefersSlot3 = 1;
+let Constraints = "$Rx32 = $Rx32in";
+}
+def S2_tableidxb_goodsyntax : HInst<
+(outs IntRegs:$Rx32),
+(ins IntRegs:$Rx32in, IntRegs:$Rs32, u4_0Imm:$Ii, u5_0Imm:$II),
+"$Rx32 = tableidxb($Rs32,#$Ii,#$II)",
+S_2op_tc_1_SLOT23, TypeS_2op> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let Constraints = "$Rx32 = $Rx32in";
+}
+def S2_tableidxd : HInst<
+(outs IntRegs:$Rx32),
+(ins IntRegs:$Rx32in, IntRegs:$Rs32, u4_0Imm:$Ii, s6_0Imm:$II),
+"$Rx32 = tableidxd($Rs32,#$Ii,#$II):raw",
+S_2op_tc_1_SLOT23, TypeS_2op>, Enc_8838398 {
+let Inst{31-22} = 0b1000011111;
+let hasNewValue = 1;
+let opNewValue = 0;
+let prefersSlot3 = 1;
+let Constraints = "$Rx32 = $Rx32in";
+}
+def S2_tableidxd_goodsyntax : HInst<
+(outs IntRegs:$Rx32),
+(ins IntRegs:$Rx32in, IntRegs:$Rs32, u4_0Imm:$Ii, u5_0Imm:$II),
+"$Rx32 = tableidxd($Rs32,#$Ii,#$II)",
+S_2op_tc_1_SLOT23, TypeS_2op> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isPseudo = 1;
+let Constraints = "$Rx32 = $Rx32in";
+}
+def S2_tableidxh : HInst<
+(outs IntRegs:$Rx32),
+(ins IntRegs:$Rx32in, IntRegs:$Rs32, u4_0Imm:$Ii, s6_0Imm:$II),
+"$Rx32 = tableidxh($Rs32,#$Ii,#$II):raw",
+S_2op_tc_1_SLOT23, TypeS_2op>, Enc_8838398 {
+let Inst{31-22} = 0b1000011101;
+let hasNewValue = 1;
+let opNewValue = 0;
+let prefersSlot3 = 1;
+let Constraints = "$Rx32 = $Rx32in";
+}
+def S2_tableidxh_goodsyntax : HInst<
+(outs IntRegs:$Rx32),
+(ins IntRegs:$Rx32in, IntRegs:$Rs32, u4_0Imm:$Ii, u5_0Imm:$II),
+"$Rx32 = tableidxh($Rs32,#$Ii,#$II)",
+S_2op_tc_1_SLOT23, TypeS_2op> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isPseudo = 1;
+let Constraints = "$Rx32 = $Rx32in";
+}
+def S2_tableidxw : HInst<
+(outs IntRegs:$Rx32),
+(ins IntRegs:$Rx32in, IntRegs:$Rs32, u4_0Imm:$Ii, s6_0Imm:$II),
+"$Rx32 = tableidxw($Rs32,#$Ii,#$II):raw",
+S_2op_tc_1_SLOT23, TypeS_2op>, Enc_8838398 {
+let Inst{31-22} = 0b1000011110;
+let hasNewValue = 1;
+let opNewValue = 0;
+let prefersSlot3 = 1;
+let Constraints = "$Rx32 = $Rx32in";
+}
+def S2_tableidxw_goodsyntax : HInst<
+(outs IntRegs:$Rx32),
+(ins IntRegs:$Rx32in, IntRegs:$Rs32, u4_0Imm:$Ii, u5_0Imm:$II),
+"$Rx32 = tableidxw($Rs32,#$Ii,#$II)",
+S_2op_tc_1_SLOT23, TypeS_2op> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isPseudo = 1;
+let Constraints = "$Rx32 = $Rx32in";
+}
+def S2_togglebit_i : HInst<
+(outs IntRegs:$Rd32),
+(ins IntRegs:$Rs32, u5_0Imm:$Ii),
+"$Rd32 = togglebit($Rs32,#$Ii)",
+S_2op_tc_1_SLOT23, TypeS_2op>, Enc_2771456 {
+let Inst{7-5} = 0b010;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b10001100110;
+let hasNewValue = 1;
+let opNewValue = 0;
+}
+def S2_togglebit_r : HInst<
+(outs IntRegs:$Rd32),
+(ins IntRegs:$Rs32, IntRegs:$Rt32),
+"$Rd32 = togglebit($Rs32,$Rt32)",
+S_3op_tc_1_SLOT23, TypeS_3op>, Enc_14071773 {
+let Inst{7-5} = 0b100;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11000110100;
+let hasNewValue = 1;
+let opNewValue = 0;
+}
+def S2_tstbit_i : HInst<
+(outs PredRegs:$Pd4),
+(ins IntRegs:$Rs32, u5_0Imm:$Ii),
+"$Pd4 = tstbit($Rs32,#$Ii)",
+S_2op_tc_2early_SLOT23, TypeS_2op>, Enc_2103742 {
+let Inst{7-2} = 0b000000;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b10000101000;
+}
+def S2_tstbit_r : HInst<
+(outs PredRegs:$Pd4),
+(ins IntRegs:$Rs32, IntRegs:$Rt32),
+"$Pd4 = tstbit($Rs32,$Rt32)",
+S_3op_tc_2early_SLOT23, TypeS_3op>, Enc_10157519 {
+let Inst{7-2} = 0b000000;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11000111000;
+}
+def S2_valignib : HInst<
+(outs DoubleRegs:$Rdd32),
+(ins DoubleRegs:$Rtt32, DoubleRegs:$Rss32, u3_0Imm:$Ii),
+"$Rdd32 = valignb($Rtt32,$Rss32,#$Ii)",
+S_3op_tc_1_SLOT23, TypeS_3op>, Enc_11971407 {
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11000000000;
+}
+def S2_valignrb : HInst<
+(outs DoubleRegs:$Rdd32),
+(ins DoubleRegs:$Rtt32, DoubleRegs:$Rss32, PredRegs:$Pu4),
+"$Rdd32 = valignb($Rtt32,$Rss32,$Pu4)",
+S_3op_tc_1_SLOT23, TypeS_3op>, Enc_11552785 {
+let Inst{7-7} = 0b0;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11000010000;
+}
+def S2_vcnegh : HInst<
+(outs DoubleRegs:$Rdd32),
+(ins DoubleRegs:$Rss32, IntRegs:$Rt32),
+"$Rdd32 = vcnegh($Rss32,$Rt32)",
+S_3op_tc_2_SLOT23, TypeS_3op>, Enc_8940892 {
+let Inst{7-5} = 0b010;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11000011110;
+let prefersSlot3 = 1;
+let Defs = [USR_OVF];
+}
+def S2_vcrotate : HInst<
+(outs DoubleRegs:$Rdd32),
+(ins DoubleRegs:$Rss32, IntRegs:$Rt32),
+"$Rdd32 = vcrotate($Rss32,$Rt32)",
+S_3op_tc_2_SLOT23, TypeS_3op>, Enc_8940892 {
+let Inst{7-5} = 0b000;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11000011110;
+let prefersSlot3 = 1;
+let Defs = [USR_OVF];
+}
+def S2_vrcnegh : HInst<
+(outs DoubleRegs:$Rxx32),
+(ins DoubleRegs:$Rxx32in, DoubleRegs:$Rss32, IntRegs:$Rt32),
+"$Rxx32 += vrcnegh($Rss32,$Rt32)",
+S_3op_tc_3x_SLOT23, TypeS_3op>, Enc_7912540 {
+let Inst{7-5} = 0b111;
+let Inst{13-13} = 0b1;
+let Inst{31-21} = 0b11001011001;
+let prefersSlot3 = 1;
+let Constraints = "$Rxx32 = $Rxx32in";
+}
+def S2_vrndpackwh : HInst<
+(outs IntRegs:$Rd32),
+(ins DoubleRegs:$Rss32),
+"$Rd32 = vrndwh($Rss32)",
+S_2op_tc_2_SLOT23, TypeS_2op>, Enc_3742184 {
+let Inst{13-5} = 0b000000100;
+let Inst{31-21} = 0b10001000100;
+let hasNewValue = 1;
+let opNewValue = 0;
+}
+def S2_vrndpackwhs : HInst<
+(outs IntRegs:$Rd32),
+(ins DoubleRegs:$Rss32),
+"$Rd32 = vrndwh($Rss32):sat",
+S_2op_tc_2_SLOT23, TypeS_2op>, Enc_3742184 {
+let Inst{13-5} = 0b000000110;
+let Inst{31-21} = 0b10001000100;
+let hasNewValue = 1;
+let opNewValue = 0;
+let Defs = [USR_OVF];
+}
+def S2_vsathb : HInst<
+(outs IntRegs:$Rd32),
+(ins DoubleRegs:$Rss32),
+"$Rd32 = vsathb($Rss32)",
+S_2op_tc_1_SLOT23, TypeS_2op>, Enc_3742184 {
+let Inst{13-5} = 0b000000110;
+let Inst{31-21} = 0b10001000000;
+let hasNewValue = 1;
+let opNewValue = 0;
+let Defs = [USR_OVF];
+}
+def S2_vsathb_nopack : HInst<
+(outs DoubleRegs:$Rdd32),
+(ins DoubleRegs:$Rss32),
+"$Rdd32 = vsathb($Rss32)",
+S_2op_tc_1_SLOT23, TypeS_2op>, Enc_13133231 {
+let Inst{13-5} = 0b000000111;
+let Inst{31-21} = 0b10000000000;
+let Defs = [USR_OVF];
+}
+def S2_vsathub : HInst<
+(outs IntRegs:$Rd32),
+(ins DoubleRegs:$Rss32),
+"$Rd32 = vsathub($Rss32)",
+S_2op_tc_1_SLOT23, TypeS_2op>, Enc_3742184 {
+let Inst{13-5} = 0b000000000;
+let Inst{31-21} = 0b10001000000;
+let hasNewValue = 1;
+let opNewValue = 0;
+let Defs = [USR_OVF];
+}
+def S2_vsathub_nopack : HInst<
+(outs DoubleRegs:$Rdd32),
+(ins DoubleRegs:$Rss32),
+"$Rdd32 = vsathub($Rss32)",
+S_2op_tc_1_SLOT23, TypeS_2op>, Enc_13133231 {
+let Inst{13-5} = 0b000000100;
+let Inst{31-21} = 0b10000000000;
+let Defs = [USR_OVF];
+}
+def S2_vsatwh : HInst<
+(outs IntRegs:$Rd32),
+(ins DoubleRegs:$Rss32),
+"$Rd32 = vsatwh($Rss32)",
+S_2op_tc_1_SLOT23, TypeS_2op>, Enc_3742184 {
+let Inst{13-5} = 0b000000010;
+let Inst{31-21} = 0b10001000000;
+let hasNewValue = 1;
+let opNewValue = 0;
+let Defs = [USR_OVF];
+}
+def S2_vsatwh_nopack : HInst<
+(outs DoubleRegs:$Rdd32),
+(ins DoubleRegs:$Rss32),
+"$Rdd32 = vsatwh($Rss32)",
+S_2op_tc_1_SLOT23, TypeS_2op>, Enc_13133231 {
+let Inst{13-5} = 0b000000110;
+let Inst{31-21} = 0b10000000000;
+let Defs = [USR_OVF];
+}
+def S2_vsatwuh : HInst<
+(outs IntRegs:$Rd32),
+(ins DoubleRegs:$Rss32),
+"$Rd32 = vsatwuh($Rss32)",
+S_2op_tc_1_SLOT23, TypeS_2op>, Enc_3742184 {
+let Inst{13-5} = 0b000000100;
+let Inst{31-21} = 0b10001000000;
+let hasNewValue = 1;
+let opNewValue = 0;
+let Defs = [USR_OVF];
+}
+def S2_vsatwuh_nopack : HInst<
+(outs DoubleRegs:$Rdd32),
+(ins DoubleRegs:$Rss32),
+"$Rdd32 = vsatwuh($Rss32)",
+S_2op_tc_1_SLOT23, TypeS_2op>, Enc_13133231 {
+let Inst{13-5} = 0b000000101;
+let Inst{31-21} = 0b10000000000;
+let Defs = [USR_OVF];
+}
+def S2_vsplatrb : HInst<
+(outs IntRegs:$Rd32),
+(ins IntRegs:$Rs32),
+"$Rd32 = vsplatb($Rs32)",
+S_2op_tc_1_SLOT23, TypeS_2op>, Enc_4075554 {
+let Inst{13-5} = 0b000000111;
+let Inst{31-21} = 0b10001100010;
+let hasNewValue = 1;
+let opNewValue = 0;
+let isReMaterializable = 1;
+let isAsCheapAsAMove = 1;
+}
+def S2_vsplatrh : HInst<
+(outs DoubleRegs:$Rdd32),
+(ins IntRegs:$Rs32),
+"$Rdd32 = vsplath($Rs32)",
+S_2op_tc_1_SLOT23, TypeS_2op>, Enc_4030179 {
+let Inst{13-5} = 0b000000010;
+let Inst{31-21} = 0b10000100010;
+let isReMaterializable = 1;
+let isAsCheapAsAMove = 1;
+}
+def S2_vspliceib : HInst<
+(outs DoubleRegs:$Rdd32),
+(ins DoubleRegs:$Rss32, DoubleRegs:$Rtt32, u3_0Imm:$Ii),
+"$Rdd32 = vspliceb($Rss32,$Rtt32,#$Ii)",
+S_3op_tc_1_SLOT23, TypeS_3op>, Enc_16730127 {
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11000000100;
+}
+def S2_vsplicerb : HInst<
+(outs DoubleRegs:$Rdd32),
+(ins DoubleRegs:$Rss32, DoubleRegs:$Rtt32, PredRegs:$Pu4),
+"$Rdd32 = vspliceb($Rss32,$Rtt32,$Pu4)",
+S_3op_tc_1_SLOT23, TypeS_3op>, Enc_5178985 {
+let Inst{7-7} = 0b0;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11000010100;
+}
+def S2_vsxtbh : HInst<
+(outs DoubleRegs:$Rdd32),
+(ins IntRegs:$Rs32),
+"$Rdd32 = vsxtbh($Rs32)",
+S_2op_tc_1_SLOT23, TypeS_2op>, Enc_4030179 {
+let Inst{13-5} = 0b000000000;
+let Inst{31-21} = 0b10000100000;
+let isReMaterializable = 1;
+let isAsCheapAsAMove = 1;
+}
+def S2_vsxthw : HInst<
+(outs DoubleRegs:$Rdd32),
+(ins IntRegs:$Rs32),
+"$Rdd32 = vsxthw($Rs32)",
+S_2op_tc_1_SLOT23, TypeS_2op>, Enc_4030179 {
+let Inst{13-5} = 0b000000100;
+let Inst{31-21} = 0b10000100000;
+let isReMaterializable = 1;
+let isAsCheapAsAMove = 1;
+}
+def S2_vtrunehb : HInst<
+(outs IntRegs:$Rd32),
+(ins DoubleRegs:$Rss32),
+"$Rd32 = vtrunehb($Rss32)",
+S_2op_tc_1_SLOT23, TypeS_2op>, Enc_3742184 {
+let Inst{13-5} = 0b000000010;
+let Inst{31-21} = 0b10001000100;
+let hasNewValue = 1;
+let opNewValue = 0;
+}
+def S2_vtrunewh : HInst<
+(outs DoubleRegs:$Rdd32),
+(ins DoubleRegs:$Rss32, DoubleRegs:$Rtt32),
+"$Rdd32 = vtrunewh($Rss32,$Rtt32)",
+S_3op_tc_1_SLOT23, TypeS_3op>, Enc_8333157 {
+let Inst{7-5} = 0b010;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11000001100;
+}
+def S2_vtrunohb : HInst<
+(outs IntRegs:$Rd32),
+(ins DoubleRegs:$Rss32),
+"$Rd32 = vtrunohb($Rss32)",
+S_2op_tc_1_SLOT23, TypeS_2op>, Enc_3742184 {
+let Inst{13-5} = 0b000000000;
+let Inst{31-21} = 0b10001000100;
+let hasNewValue = 1;
+let opNewValue = 0;
+}
+def S2_vtrunowh : HInst<
+(outs DoubleRegs:$Rdd32),
+(ins DoubleRegs:$Rss32, DoubleRegs:$Rtt32),
+"$Rdd32 = vtrunowh($Rss32,$Rtt32)",
+S_3op_tc_1_SLOT23, TypeS_3op>, Enc_8333157 {
+let Inst{7-5} = 0b100;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11000001100;
+}
+def S2_vzxtbh : HInst<
+(outs DoubleRegs:$Rdd32),
+(ins IntRegs:$Rs32),
+"$Rdd32 = vzxtbh($Rs32)",
+S_2op_tc_1_SLOT23, TypeS_2op>, Enc_4030179 {
+let Inst{13-5} = 0b000000010;
+let Inst{31-21} = 0b10000100000;
+let isReMaterializable = 1;
+let isAsCheapAsAMove = 1;
+}
+def S2_vzxthw : HInst<
+(outs DoubleRegs:$Rdd32),
+(ins IntRegs:$Rs32),
+"$Rdd32 = vzxthw($Rs32)",
+S_2op_tc_1_SLOT23, TypeS_2op>, Enc_4030179 {
+let Inst{13-5} = 0b000000110;
+let Inst{31-21} = 0b10000100000;
+let isReMaterializable = 1;
+let isAsCheapAsAMove = 1;
+}
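+// S4_* compound-ALU forms: operands flagged isExtendable may exceed
+// opExtentBits bits by consuming a constant-extender word in the packet;
+// opExtendable is the index of the extendable operand.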
+def S4_addaddi : HInst<
+(outs IntRegs:$Rd32),
+(ins IntRegs:$Rs32, IntRegs:$Ru32, s32_0Imm:$Ii),
+"$Rd32 = add($Rs32,add($Ru32,#$Ii))",
+ALU64_tc_2_SLOT23, TypeALU64>, Enc_6495334 {
+let Inst{31-23} = 0b110110110;
+let hasNewValue = 1;
+let opNewValue = 0;
+let prefersSlot3 = 1;
+let isExtendable = 1;
+let opExtendable = 3;
+let isExtentSigned = 1;
+let opExtentBits = 6;
+let opExtentAlign = 0;
+}
+def S4_addi_asl_ri : HInst<
+(outs IntRegs:$Rx32),
+(ins u32_0Imm:$Ii, IntRegs:$Rx32in, u5_0Imm:$II),
+"$Rx32 = add(#$Ii,asl($Rx32in,#$II))",
+ALU64_tc_2_SLOT23, TypeALU64>, Enc_117962 {
+let Inst{2-0} = 0b100;
+let Inst{4-4} = 0b0;
+let Inst{31-24} = 0b11011110;
+let hasNewValue = 1;
+let opNewValue = 0;
+let prefersSlot3 = 1;
+let isExtendable = 1;
+let opExtendable = 1;
+let isExtentSigned = 0;
+let opExtentBits = 8;
+let opExtentAlign = 0;
+let Constraints = "$Rx32 = $Rx32in";
+}
+def S4_addi_lsr_ri : HInst<
+(outs IntRegs:$Rx32),
+(ins u32_0Imm:$Ii, IntRegs:$Rx32in, u5_0Imm:$II),
+"$Rx32 = add(#$Ii,lsr($Rx32in,#$II))",
+ALU64_tc_2_SLOT23, TypeALU64>, Enc_117962 {
+let Inst{2-0} = 0b100;
+let Inst{4-4} = 0b1;
+let Inst{31-24} = 0b11011110;
+let hasNewValue = 1;
+let opNewValue = 0;
+let prefersSlot3 = 1;
+let isExtendable = 1;
+let opExtendable = 1;
+let isExtentSigned = 0;
+let opExtentBits = 8;
+let opExtentAlign = 0;
+let Constraints = "$Rx32 = $Rx32in";
+}
+def S4_andi_asl_ri : HInst<
+(outs IntRegs:$Rx32),
+(ins u32_0Imm:$Ii, IntRegs:$Rx32in, u5_0Imm:$II),
+"$Rx32 = and(#$Ii,asl($Rx32in,#$II))",
+ALU64_tc_2_SLOT23, TypeALU64>, Enc_117962 {
+let Inst{2-0} = 0b000;
+let Inst{4-4} = 0b0;
+let Inst{31-24} = 0b11011110;
+let hasNewValue = 1;
+let opNewValue = 0;
+let prefersSlot3 = 1;
+let isExtendable = 1;
+let opExtendable = 1;
+let isExtentSigned = 0;
+let opExtentBits = 8;
+let opExtentAlign = 0;
+let Constraints = "$Rx32 = $Rx32in";
+}
+def S4_andi_lsr_ri : HInst<
+(outs IntRegs:$Rx32),
+(ins u32_0Imm:$Ii, IntRegs:$Rx32in, u5_0Imm:$II),
+"$Rx32 = and(#$Ii,lsr($Rx32in,#$II))",
+ALU64_tc_2_SLOT23, TypeALU64>, Enc_117962 {
+let Inst{2-0} = 0b000;
+let Inst{4-4} = 0b1;
+let Inst{31-24} = 0b11011110;
+let hasNewValue = 1;
+let opNewValue = 0;
+let prefersSlot3 = 1;
+let isExtendable = 1;
+let opExtendable = 1;
+let isExtentSigned = 0;
+let opExtentBits = 8;
+let opExtentAlign = 0;
+let Constraints = "$Rx32 = $Rx32in";
+}
+def S4_clbaddi : HInst<
+(outs IntRegs:$Rd32),
+(ins IntRegs:$Rs32, s6_0Imm:$Ii),
+"$Rd32 = add(clb($Rs32),#$Ii)",
+S_2op_tc_2_SLOT23, TypeS_2op>, Enc_5523416 {
+let Inst{7-5} = 0b000;
+let Inst{31-21} = 0b10001100001;
+let hasNewValue = 1;
+let opNewValue = 0;
+let prefersSlot3 = 1;
+}
+def S4_clbpaddi : HInst<
+(outs IntRegs:$Rd32),
+(ins DoubleRegs:$Rss32, s6_0Imm:$Ii),
+"$Rd32 = add(clb($Rss32),#$Ii)",
+S_2op_tc_2_SLOT23, TypeS_2op>, Enc_10188026 {
+let Inst{7-5} = 0b010;
+let Inst{31-21} = 0b10001000011;
+let hasNewValue = 1;
+let opNewValue = 0;
+let prefersSlot3 = 1;
+}
+def S4_clbpnorm : HInst<
+(outs IntRegs:$Rd32),
+(ins DoubleRegs:$Rss32),
+"$Rd32 = normamt($Rss32)",
+S_2op_tc_1_SLOT23, TypeS_2op>, Enc_3742184 {
+let Inst{13-5} = 0b000000000;
+let Inst{31-21} = 0b10001000011;
+let hasNewValue = 1;
+let opNewValue = 0;
+}
+def S4_extract : HInst<
+(outs IntRegs:$Rd32),
+(ins IntRegs:$Rs32, u5_0Imm:$Ii, u5_0Imm:$II),
+"$Rd32 = extract($Rs32,#$Ii,#$II)",
+S_2op_tc_2_SLOT23, TypeS_2op>, Enc_11930928 {
+let Inst{13-13} = 0b0;
+let Inst{31-23} = 0b100011011;
+let hasNewValue = 1;
+let opNewValue = 0;
+let prefersSlot3 = 1;
+}
+def S4_extract_rp : HInst<
+(outs IntRegs:$Rd32),
+(ins IntRegs:$Rs32, DoubleRegs:$Rtt32),
+"$Rd32 = extract($Rs32,$Rtt32)",
+S_3op_tc_2_SLOT23, TypeS_3op>, Enc_15472748 {
+let Inst{7-5} = 0b010;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11001001000;
+let hasNewValue = 1;
+let opNewValue = 0;
+let prefersSlot3 = 1;
+}
+def S4_extractp : HInst<
+(outs DoubleRegs:$Rdd32),
+(ins DoubleRegs:$Rss32, u6_0Imm:$Ii, u6_0Imm:$II),
+"$Rdd32 = extract($Rss32,#$Ii,#$II)",
+S_2op_tc_2_SLOT23, TypeS_2op>, Enc_9894557 {
+let Inst{31-24} = 0b10001010;
+let prefersSlot3 = 1;
+}
+def S4_extractp_rp : HInst<
+(outs DoubleRegs:$Rdd32),
+(ins DoubleRegs:$Rss32, DoubleRegs:$Rtt32),
+"$Rdd32 = extract($Rss32,$Rtt32)",
+S_3op_tc_2_SLOT23, TypeS_3op>, Enc_8333157 {
+let Inst{7-5} = 0b100;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11000001110;
+let prefersSlot3 = 1;
+}
+def S4_lsli : HInst<
+(outs IntRegs:$Rd32),
+(ins s6_0Imm:$Ii, IntRegs:$Rt32),
+"$Rd32 = lsl(#$Ii,$Rt32)",
+S_3op_tc_1_SLOT23, TypeS_3op>, Enc_518319 {
+let Inst{7-6} = 0b11;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11000110100;
+let hasNewValue = 1;
+let opNewValue = 0;
+}
+def S4_ntstbit_i : HInst<
+(outs PredRegs:$Pd4),
+(ins IntRegs:$Rs32, u5_0Imm:$Ii),
+"$Pd4 = !tstbit($Rs32,#$Ii)",
+S_2op_tc_2early_SLOT23, TypeS_2op>, Enc_2103742 {
+let Inst{7-2} = 0b000000;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b10000101001;
+}
+def S4_ntstbit_r : HInst<
+(outs PredRegs:$Pd4),
+(ins IntRegs:$Rs32, IntRegs:$Rt32),
+"$Pd4 = !tstbit($Rs32,$Rt32)",
+S_3op_tc_2early_SLOT23, TypeS_3op>, Enc_10157519 {
+let Inst{7-2} = 0b000000;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11000111001;
+}
+def S4_or_andi : HInst<
+(outs IntRegs:$Rx32),
+(ins IntRegs:$Rx32in, IntRegs:$Rs32, s32_0Imm:$Ii),
+"$Rx32 |= and($Rs32,#$Ii)",
+ALU64_tc_2_SLOT23, TypeALU64>, Enc_6356866 {
+let Inst{31-22} = 0b1101101000;
+let hasNewValue = 1;
+let opNewValue = 0;
+let prefersSlot3 = 1;
+let InputType = "imm";
+let isExtendable = 1;
+let opExtendable = 3;
+let isExtentSigned = 1;
+let opExtentBits = 10;
+let opExtentAlign = 0;
+let Constraints = "$Rx32 = $Rx32in";
+}
+def S4_or_andix : HInst<
+(outs IntRegs:$Rx32),
+(ins IntRegs:$Ru32, IntRegs:$Rx32in, s32_0Imm:$Ii),
+"$Rx32 = or($Ru32,and($Rx32in,#$Ii))",
+ALU64_tc_2_SLOT23, TypeALU64>, Enc_7504828 {
+let Inst{31-22} = 0b1101101001;
+let hasNewValue = 1;
+let opNewValue = 0;
+let prefersSlot3 = 1;
+let isExtendable = 1;
+let opExtendable = 3;
+let isExtentSigned = 1;
+let opExtentBits = 10;
+let opExtentAlign = 0;
+let Constraints = "$Rx32 = $Rx32in";
+}
+def S4_or_ori : HInst<
+(outs IntRegs:$Rx32),
+(ins IntRegs:$Rx32in, IntRegs:$Rs32, s32_0Imm:$Ii),
+"$Rx32 |= or($Rs32,#$Ii)",
+ALU64_tc_2_SLOT23, TypeALU64>, Enc_6356866 {
+let Inst{31-22} = 0b1101101010;
+let hasNewValue = 1;
+let opNewValue = 0;
+let prefersSlot3 = 1;
+let InputType = "imm";
+let isExtendable = 1;
+let opExtendable = 3;
+let isExtentSigned = 1;
+let opExtentBits = 10;
+let opExtentAlign = 0;
+let Constraints = "$Rx32 = $Rx32in";
+}
+def S4_ori_asl_ri : HInst<
+(outs IntRegs:$Rx32),
+(ins u32_0Imm:$Ii, IntRegs:$Rx32in, u5_0Imm:$II),
+"$Rx32 = or(#$Ii,asl($Rx32in,#$II))",
+ALU64_tc_1_SLOT23, TypeALU64>, Enc_117962 {
+let Inst{2-0} = 0b010;
+let Inst{4-4} = 0b0;
+let Inst{31-24} = 0b11011110;
+let hasNewValue = 1;
+let opNewValue = 0;
+let prefersSlot3 = 1;
+let isExtendable = 1;
+let opExtendable = 1;
+let isExtentSigned = 0;
+let opExtentBits = 8;
+let opExtentAlign = 0;
+let Constraints = "$Rx32 = $Rx32in";
+}
+def S4_ori_lsr_ri : HInst<
+(outs IntRegs:$Rx32),
+(ins u32_0Imm:$Ii, IntRegs:$Rx32in, u5_0Imm:$II),
+"$Rx32 = or(#$Ii,lsr($Rx32in,#$II))",
+ALU64_tc_1_SLOT23, TypeALU64>, Enc_117962 {
+let Inst{2-0} = 0b010;
+let Inst{4-4} = 0b1;
+let Inst{31-24} = 0b11011110;
+let hasNewValue = 1;
+let opNewValue = 0;
+let prefersSlot3 = 1;
+let isExtendable = 1;
+let opExtendable = 1;
+let isExtentSigned = 0;
+let opExtentBits = 8;
+let opExtentAlign = 0;
+let Constraints = "$Rx32 = $Rx32in";
+}
+def S4_parity : HInst<
+(outs IntRegs:$Rd32),
+(ins IntRegs:$Rs32, IntRegs:$Rt32),
+"$Rd32 = parity($Rs32,$Rt32)",
+ALU64_tc_2_SLOT23, TypeALU64>, Enc_14071773 {
+let Inst{7-5} = 0b000;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11010101111;
+let hasNewValue = 1;
+let opNewValue = 0;
+let prefersSlot3 = 1;
+}
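+// S4_pstorer*: predicated stores, named S4_pstorer<sz>[new]<t|f>[new]_<am>:
+// "new" before t/f means new-value data ($Nt8.new); t/f is the predicate
+// sense; a trailing "new" means a .new predicate (isPredicatedNew).
+// Address modes: _abs (absolute, must-extend), _io (base+imm),
+// _rr (base+reg<<#imm), _zomap (zero-offset pseudo).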
+def S4_pstorerbf_abs : HInst<
+(outs),
+(ins PredRegs:$Pv4, u32_0Imm:$Ii, IntRegs:$Rt32),
+"if (!$Pv4) memb(#$Ii) = $Rt32",
+ST_tc_st_SLOT01, TypeST>, Enc_16657398, AddrModeRel {
+let Inst{2-2} = 0b1;
+let Inst{7-7} = 0b1;
+let Inst{13-13} = 0b0;
+let Inst{31-18} = 0b10101111000000;
+let isPredicated = 1;
+let isPredicatedFalse = 1;
+let addrMode = Absolute;
+let accessSize = ByteAccess;
+let isExtended = 1;
+let mayStore = 1;
+let CextOpcode = "S2_storerb";
+let BaseOpcode = "S2_storerbabs";
+let isNVStorable = 1;
+let DecoderNamespace = "MustExtend";
+let isExtendable = 1;
+let opExtendable = 1;
+let isExtentSigned = 0;
+let opExtentBits = 6;
+let opExtentAlign = 0;
+}
+def S4_pstorerbf_rr : HInst<
+(outs),
+(ins PredRegs:$Pv4, IntRegs:$Rs32, IntRegs:$Ru32, u2_0Imm:$Ii, IntRegs:$Rt32),
+"if (!$Pv4) memb($Rs32+$Ru32<<#$Ii) = $Rt32",
+V4LDST_tc_st_SLOT01, TypeST>, Enc_11940513, AddrModeRel {
+let Inst{31-21} = 0b00110101000;
+let isPredicated = 1;
+let isPredicatedFalse = 1;
+let addrMode = BaseRegOffset;
+let accessSize = ByteAccess;
+let mayStore = 1;
+let CextOpcode = "S2_storerb";
+let InputType = "reg";
+let BaseOpcode = "S4_storerb_rr";
+let isNVStorable = 1;
+}
+def S4_pstorerbfnew_abs : HInst<
+(outs),
+(ins PredRegs:$Pv4, u32_0Imm:$Ii, IntRegs:$Rt32),
+"if (!$Pv4.new) memb(#$Ii) = $Rt32",
+ST_tc_st_SLOT01, TypeST>, Enc_16657398, AddrModeRel {
+let Inst{2-2} = 0b1;
+let Inst{7-7} = 0b1;
+let Inst{13-13} = 0b1;
+let Inst{31-18} = 0b10101111000000;
+let isPredicated = 1;
+let isPredicatedFalse = 1;
+let addrMode = Absolute;
+let accessSize = ByteAccess;
+let isExtended = 1;
+let isPredicatedNew = 1;
+let mayStore = 1;
+let CextOpcode = "S2_storerb";
+let BaseOpcode = "S2_storerbabs";
+let isNVStorable = 1;
+let DecoderNamespace = "MustExtend";
+let isExtendable = 1;
+let opExtendable = 1;
+let isExtentSigned = 0;
+let opExtentBits = 6;
+let opExtentAlign = 0;
+}
+def S4_pstorerbfnew_io : HInst<
+(outs),
+(ins PredRegs:$Pv4, IntRegs:$Rs32, u32_0Imm:$Ii, IntRegs:$Rt32),
+"if (!$Pv4.new) memb($Rs32+#$Ii) = $Rt32",
+V2LDST_tc_st_SLOT01, TypeV2LDST>, Enc_14044877, AddrModeRel {
+let Inst{2-2} = 0b0;
+let Inst{31-21} = 0b01000110000;
+let isPredicated = 1;
+let isPredicatedFalse = 1;
+let addrMode = BaseImmOffset;
+let accessSize = ByteAccess;
+let isPredicatedNew = 1;
+let mayStore = 1;
+let CextOpcode = "S2_storerb";
+let InputType = "imm";
+let BaseOpcode = "S2_storerb_io";
+let isNVStorable = 1;
+let isExtendable = 1;
+let opExtendable = 2;
+let isExtentSigned = 0;
+let opExtentBits = 6;
+let opExtentAlign = 0;
+}
+def S4_pstorerbfnew_rr : HInst<
+(outs),
+(ins PredRegs:$Pv4, IntRegs:$Rs32, IntRegs:$Ru32, u2_0Imm:$Ii, IntRegs:$Rt32),
+"if (!$Pv4.new) memb($Rs32+$Ru32<<#$Ii) = $Rt32",
+V4LDST_tc_st_SLOT01, TypeST>, Enc_11940513, AddrModeRel {
+let Inst{31-21} = 0b00110111000;
+let isPredicated = 1;
+let isPredicatedFalse = 1;
+let addrMode = BaseRegOffset;
+let accessSize = ByteAccess;
+let isPredicatedNew = 1;
+let mayStore = 1;
+let CextOpcode = "S2_storerb";
+let InputType = "reg";
+let BaseOpcode = "S4_storerb_rr";
+let isNVStorable = 1;
+}
+def S4_pstorerbfnew_zomap : HInst<
+(outs),
+(ins PredRegs:$Pv4, IntRegs:$Rs32, IntRegs:$Rt32),
+"if (!$Pv4.new) memb($Rs32) = $Rt32",
+PSEUDO, TypeMAPPING> {
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+}
+def S4_pstorerbnewf_abs : HInst<
+(outs),
+(ins PredRegs:$Pv4, u32_0Imm:$Ii, IntRegs:$Nt8),
+"if (!$Pv4) memb(#$Ii) = $Nt8.new",
+NCJ_tc_3or4stall_SLOT0, TypeST>, Enc_1774350, AddrModeRel {
+let Inst{2-2} = 0b1;
+let Inst{7-7} = 0b1;
+let Inst{13-11} = 0b000;
+let Inst{31-18} = 0b10101111101000;
+let isPredicated = 1;
+let isPredicatedFalse = 1;
+let addrMode = Absolute;
+let accessSize = ByteAccess;
+let isNVStore = 1;
+let isExtended = 1;
+let mayStore = 1;
+let isNewValue = 1;
+let CextOpcode = "S2_storerb";
+let BaseOpcode = "S2_storerbabs";
+let DecoderNamespace = "MustExtend";
+let isExtendable = 1;
+let opExtendable = 1;
+let isExtentSigned = 0;
+let opExtentBits = 6;
+let opExtentAlign = 0;
+let opNewValue = 2;
+}
+def S4_pstorerbnewf_rr : HInst<
+(outs),
+(ins PredRegs:$Pv4, IntRegs:$Rs32, IntRegs:$Ru32, u2_0Imm:$Ii, IntRegs:$Nt8),
+"if (!$Pv4) memb($Rs32+$Ru32<<#$Ii) = $Nt8.new",
+V4LDST_tc_st_SLOT0, TypeST>, Enc_11000933, AddrModeRel {
+let Inst{4-3} = 0b00;
+let Inst{31-21} = 0b00110101101;
+let isPredicated = 1;
+let isPredicatedFalse = 1;
+let addrMode = BaseRegOffset;
+let accessSize = ByteAccess;
+let isNVStore = 1;
+let mayStore = 1;
+let isNewValue = 1;
+let CextOpcode = "S2_storerb";
+let InputType = "reg";
+let BaseOpcode = "S4_storerb_rr";
+let opNewValue = 4;
+}
+def S4_pstorerbnewfnew_abs : HInst<
+(outs),
+(ins PredRegs:$Pv4, u32_0Imm:$Ii, IntRegs:$Nt8),
+"if (!$Pv4.new) memb(#$Ii) = $Nt8.new",
+NCJ_tc_3or4stall_SLOT0, TypeST>, Enc_1774350, AddrModeRel {
+let Inst{2-2} = 0b1;
+let Inst{7-7} = 0b1;
+let Inst{13-11} = 0b100;
+let Inst{31-18} = 0b10101111101000;
+let isPredicated = 1;
+let isPredicatedFalse = 1;
+let addrMode = Absolute;
+let accessSize = ByteAccess;
+let isNVStore = 1;
+let isExtended = 1;
+let isPredicatedNew = 1;
+let mayStore = 1;
+let isNewValue = 1;
+let CextOpcode = "S2_storerb";
+let BaseOpcode = "S2_storerbabs";
+let DecoderNamespace = "MustExtend";
+let isExtendable = 1;
+let opExtendable = 1;
+let isExtentSigned = 0;
+let opExtentBits = 6;
+let opExtentAlign = 0;
+let opNewValue = 2;
+}
+def S4_pstorerbnewfnew_io : HInst<
+(outs),
+(ins PredRegs:$Pv4, IntRegs:$Rs32, u32_0Imm:$Ii, IntRegs:$Nt8),
+"if (!$Pv4.new) memb($Rs32+#$Ii) = $Nt8.new",
+V2LDST_tc_st_SLOT0, TypeV2LDST>, Enc_1737833, AddrModeRel {
+let Inst{2-2} = 0b0;
+let Inst{12-11} = 0b00;
+let Inst{31-21} = 0b01000110101;
+let isPredicated = 1;
+let isPredicatedFalse = 1;
+let addrMode = BaseImmOffset;
+let accessSize = ByteAccess;
+let isNVStore = 1;
+let isPredicatedNew = 1;
+let mayStore = 1;
+let isNewValue = 1;
+let CextOpcode = "S2_storerb";
+let InputType = "imm";
+let BaseOpcode = "S2_storerb_io";
+let isExtendable = 1;
+let opExtendable = 2;
+let isExtentSigned = 0;
+let opExtentBits = 6;
+let opExtentAlign = 0;
+let opNewValue = 3;
+}
+def S4_pstorerbnewfnew_rr : HInst<
+(outs),
+(ins PredRegs:$Pv4, IntRegs:$Rs32, IntRegs:$Ru32, u2_0Imm:$Ii, IntRegs:$Nt8),
+"if (!$Pv4.new) memb($Rs32+$Ru32<<#$Ii) = $Nt8.new",
+V4LDST_tc_st_SLOT0, TypeST>, Enc_11000933, AddrModeRel {
+let Inst{4-3} = 0b00;
+let Inst{31-21} = 0b00110111101;
+let isPredicated = 1;
+let isPredicatedFalse = 1;
+let addrMode = BaseRegOffset;
+let accessSize = ByteAccess;
+let isNVStore = 1;
+let isPredicatedNew = 1;
+let mayStore = 1;
+let isNewValue = 1;
+let CextOpcode = "S2_storerb";
+let InputType = "reg";
+let BaseOpcode = "S4_storerb_rr";
+let opNewValue = 4;
+}
+def S4_pstorerbnewfnew_zomap : HInst<
+(outs),
+(ins PredRegs:$Pv4, IntRegs:$Rs32, IntRegs:$Nt8),
+"if (!$Pv4.new) memb($Rs32) = $Nt8.new",
+PSEUDO, TypeMAPPING> {
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let opNewValue = 2;
+}
+def S4_pstorerbnewt_abs : HInst<
+(outs),
+(ins PredRegs:$Pv4, u32_0Imm:$Ii, IntRegs:$Nt8),
+"if ($Pv4) memb(#$Ii) = $Nt8.new",
+NCJ_tc_3or4stall_SLOT0, TypeST>, Enc_1774350, AddrModeRel {
+let Inst{2-2} = 0b0;
+let Inst{7-7} = 0b1;
+let Inst{13-11} = 0b000;
+let Inst{31-18} = 0b10101111101000;
+let isPredicated = 1;
+let addrMode = Absolute;
+let accessSize = ByteAccess;
+let isNVStore = 1;
+let isExtended = 1;
+let mayStore = 1;
+let isNewValue = 1;
+let CextOpcode = "S2_storerb";
+let BaseOpcode = "S2_storerbabs";
+let DecoderNamespace = "MustExtend";
+let isExtendable = 1;
+let opExtendable = 1;
+let isExtentSigned = 0;
+let opExtentBits = 6;
+let opExtentAlign = 0;
+let opNewValue = 2;
+}
+def S4_pstorerbnewt_rr : HInst<
+(outs),
+(ins PredRegs:$Pv4, IntRegs:$Rs32, IntRegs:$Ru32, u2_0Imm:$Ii, IntRegs:$Nt8),
+"if ($Pv4) memb($Rs32+$Ru32<<#$Ii) = $Nt8.new",
+V4LDST_tc_st_SLOT0, TypeST>, Enc_11000933, AddrModeRel {
+let Inst{4-3} = 0b00;
+let Inst{31-21} = 0b00110100101;
+let isPredicated = 1;
+let addrMode = BaseRegOffset;
+let accessSize = ByteAccess;
+let isNVStore = 1;
+let mayStore = 1;
+let isNewValue = 1;
+let CextOpcode = "S2_storerb";
+let InputType = "reg";
+let BaseOpcode = "S4_storerb_rr";
+let opNewValue = 4;
+}
+def S4_pstorerbnewtnew_abs : HInst<
+(outs),
+(ins PredRegs:$Pv4, u32_0Imm:$Ii, IntRegs:$Nt8),
+"if ($Pv4.new) memb(#$Ii) = $Nt8.new",
+NCJ_tc_3or4stall_SLOT0, TypeST>, Enc_1774350, AddrModeRel {
+let Inst{2-2} = 0b0;
+let Inst{7-7} = 0b1;
+let Inst{13-11} = 0b100;
+let Inst{31-18} = 0b10101111101000;
+let isPredicated = 1;
+let addrMode = Absolute;
+let accessSize = ByteAccess;
+let isNVStore = 1;
+let isExtended = 1;
+let isPredicatedNew = 1;
+let mayStore = 1;
+let isNewValue = 1;
+let CextOpcode = "S2_storerb";
+let BaseOpcode = "S2_storerbabs";
+let DecoderNamespace = "MustExtend";
+let isExtendable = 1;
+let opExtendable = 1;
+let isExtentSigned = 0;
+let opExtentBits = 6;
+let opExtentAlign = 0;
+let opNewValue = 2;
+}
+def S4_pstorerbnewtnew_io : HInst<
+(outs),
+(ins PredRegs:$Pv4, IntRegs:$Rs32, u32_0Imm:$Ii, IntRegs:$Nt8),
+"if ($Pv4.new) memb($Rs32+#$Ii) = $Nt8.new",
+V2LDST_tc_st_SLOT0, TypeV2LDST>, Enc_1737833, AddrModeRel {
+let Inst{2-2} = 0b0;
+let Inst{12-11} = 0b00;
+let Inst{31-21} = 0b01000010101;
+let isPredicated = 1;
+let addrMode = BaseImmOffset;
+let accessSize = ByteAccess;
+let isNVStore = 1;
+let isPredicatedNew = 1;
+let mayStore = 1;
+let isNewValue = 1;
+let CextOpcode = "S2_storerb";
+let InputType = "imm";
+let BaseOpcode = "S2_storerb_io";
+let isExtendable = 1;
+let opExtendable = 2;
+let isExtentSigned = 0;
+let opExtentBits = 6;
+let opExtentAlign = 0;
+let opNewValue = 3;
+}
+def S4_pstorerbnewtnew_rr : HInst<
+(outs),
+(ins PredRegs:$Pv4, IntRegs:$Rs32, IntRegs:$Ru32, u2_0Imm:$Ii, IntRegs:$Nt8),
+"if ($Pv4.new) memb($Rs32+$Ru32<<#$Ii) = $Nt8.new",
+V4LDST_tc_st_SLOT0, TypeST>, Enc_11000933, AddrModeRel {
+let Inst{4-3} = 0b00;
+let Inst{31-21} = 0b00110110101;
+let isPredicated = 1;
+let addrMode = BaseRegOffset;
+let accessSize = ByteAccess;
+let isNVStore = 1;
+let isPredicatedNew = 1;
+let mayStore = 1;
+let isNewValue = 1;
+let CextOpcode = "S2_storerb";
+let InputType = "reg";
+let BaseOpcode = "S4_storerb_rr";
+let opNewValue = 4;
+}
+def S4_pstorerbnewtnew_zomap : HInst<
+(outs),
+(ins PredRegs:$Pv4, IntRegs:$Rs32, IntRegs:$Nt8),
+"if ($Pv4.new) memb($Rs32) = $Nt8.new",
+PSEUDO, TypeMAPPING> {
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let opNewValue = 2;
+}
+def S4_pstorerbt_abs : HInst<
+(outs),
+(ins PredRegs:$Pv4, u32_0Imm:$Ii, IntRegs:$Rt32),
+"if ($Pv4) memb(#$Ii) = $Rt32",
+ST_tc_st_SLOT01, TypeST>, Enc_16657398, AddrModeRel {
+let Inst{2-2} = 0b0;
+let Inst{7-7} = 0b1;
+let Inst{13-13} = 0b0;
+let Inst{31-18} = 0b10101111000000;
+let isPredicated = 1;
+let addrMode = Absolute;
+let accessSize = ByteAccess;
+let isExtended = 1;
+let mayStore = 1;
+let CextOpcode = "S2_storerb";
+let BaseOpcode = "S2_storerbabs";
+let isNVStorable = 1;
+let DecoderNamespace = "MustExtend";
+let isExtendable = 1;
+let opExtendable = 1;
+let isExtentSigned = 0;
+let opExtentBits = 6;
+let opExtentAlign = 0;
+}
+def S4_pstorerbt_rr : HInst<
+(outs),
+(ins PredRegs:$Pv4, IntRegs:$Rs32, IntRegs:$Ru32, u2_0Imm:$Ii, IntRegs:$Rt32),
+"if ($Pv4) memb($Rs32+$Ru32<<#$Ii) = $Rt32",
+V4LDST_tc_st_SLOT01, TypeST>, Enc_11940513, AddrModeRel {
+let Inst{31-21} = 0b00110100000;
+let isPredicated = 1;
+let addrMode = BaseRegOffset;
+let accessSize = ByteAccess;
+let mayStore = 1;
+let CextOpcode = "S2_storerb";
+let InputType = "reg";
+let BaseOpcode = "S4_storerb_rr";
+let isNVStorable = 1;
+}
+def S4_pstorerbtnew_abs : HInst<
+(outs),
+(ins PredRegs:$Pv4, u32_0Imm:$Ii, IntRegs:$Rt32),
+"if ($Pv4.new) memb(#$Ii) = $Rt32",
+ST_tc_st_SLOT01, TypeST>, Enc_16657398, AddrModeRel {
+let Inst{2-2} = 0b0;
+let Inst{7-7} = 0b1;
+let Inst{13-13} = 0b1;
+let Inst{31-18} = 0b10101111000000;
+let isPredicated = 1;
+let addrMode = Absolute;
+let accessSize = ByteAccess;
+let isExtended = 1;
+let isPredicatedNew = 1;
+let mayStore = 1;
+let CextOpcode = "S2_storerb";
+let BaseOpcode = "S2_storerbabs";
+let isNVStorable = 1;
+let DecoderNamespace = "MustExtend";
+let isExtendable = 1;
+let opExtendable = 1;
+let isExtentSigned = 0;
+let opExtentBits = 6;
+let opExtentAlign = 0;
+}
+def S4_pstorerbtnew_io : HInst<
+(outs),
+(ins PredRegs:$Pv4, IntRegs:$Rs32, u32_0Imm:$Ii, IntRegs:$Rt32),
+"if ($Pv4.new) memb($Rs32+#$Ii) = $Rt32",
+V2LDST_tc_st_SLOT01, TypeV2LDST>, Enc_14044877, AddrModeRel {
+let Inst{2-2} = 0b0;
+let Inst{31-21} = 0b01000010000;
+let isPredicated = 1;
+let addrMode = BaseImmOffset;
+let accessSize = ByteAccess;
+let isPredicatedNew = 1;
+let mayStore = 1;
+let CextOpcode = "S2_storerb";
+let InputType = "imm";
+let BaseOpcode = "S2_storerb_io";
+let isNVStorable = 1;
+let isExtendable = 1;
+let opExtendable = 2;
+let isExtentSigned = 0;
+let opExtentBits = 6;
+let opExtentAlign = 0;
+}
+def S4_pstorerbtnew_rr : HInst<
+(outs),
+(ins PredRegs:$Pv4, IntRegs:$Rs32, IntRegs:$Ru32, u2_0Imm:$Ii, IntRegs:$Rt32),
+"if ($Pv4.new) memb($Rs32+$Ru32<<#$Ii) = $Rt32",
+V4LDST_tc_st_SLOT01, TypeST>, Enc_11940513, AddrModeRel {
+let Inst{31-21} = 0b00110110000;
+let isPredicated = 1;
+let addrMode = BaseRegOffset;
+let accessSize = ByteAccess;
+let isPredicatedNew = 1;
+let mayStore = 1;
+let CextOpcode = "S2_storerb";
+let InputType = "reg";
+let BaseOpcode = "S4_storerb_rr";
+let isNVStorable = 1;
+}
+def S4_pstorerbtnew_zomap : HInst<
+(outs),
+(ins PredRegs:$Pv4, IntRegs:$Rs32, IntRegs:$Rt32),
+"if ($Pv4.new) memb($Rs32) = $Rt32",
+PSEUDO, TypeMAPPING> {
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+}
+def S4_pstorerdf_abs : HInst<
+(outs),
+(ins PredRegs:$Pv4, u32_0Imm:$Ii, DoubleRegs:$Rtt32),
+"if (!$Pv4) memd(#$Ii) = $Rtt32",
+ST_tc_st_SLOT01, TypeST>, Enc_13715847, AddrModeRel {
+let Inst{2-2} = 0b1;
+let Inst{7-7} = 0b1;
+let Inst{13-13} = 0b0;
+let Inst{31-18} = 0b10101111110000;
+let isPredicated = 1;
+let isPredicatedFalse = 1;
+let addrMode = Absolute;
+let accessSize = DoubleWordAccess;
+let isExtended = 1;
+let mayStore = 1;
+let CextOpcode = "S2_storerd";
+let BaseOpcode = "S2_storerdabs";
+let DecoderNamespace = "MustExtend";
+let isExtendable = 1;
+let opExtendable = 1;
+let isExtentSigned = 0;
+let opExtentBits = 6;
+let opExtentAlign = 0;
+}
+def S4_pstorerdf_rr : HInst<
+(outs),
+(ins PredRegs:$Pv4, IntRegs:$Rs32, IntRegs:$Ru32, u2_0Imm:$Ii, DoubleRegs:$Rtt32),
+"if (!$Pv4) memd($Rs32+$Ru32<<#$Ii) = $Rtt32",
+V4LDST_tc_st_SLOT01, TypeST>, Enc_9920336, AddrModeRel {
+let Inst{31-21} = 0b00110101110;
+let isPredicated = 1;
+let isPredicatedFalse = 1;
+let addrMode = BaseRegOffset;
+let accessSize = DoubleWordAccess;
+let mayStore = 1;
+let CextOpcode = "S2_storerd";
+let InputType = "reg";
+let BaseOpcode = "S2_storerd_rr";
+}
+def S4_pstorerdfnew_abs : HInst<
+(outs),
+(ins PredRegs:$Pv4, u32_0Imm:$Ii, DoubleRegs:$Rtt32),
+"if (!$Pv4.new) memd(#$Ii) = $Rtt32",
+ST_tc_st_SLOT01, TypeST>, Enc_13715847, AddrModeRel {
+let Inst{2-2} = 0b1;
+let Inst{7-7} = 0b1;
+let Inst{13-13} = 0b1;
+let Inst{31-18} = 0b10101111110000;
+let isPredicated = 1;
+let isPredicatedFalse = 1;
+let addrMode = Absolute;
+let accessSize = DoubleWordAccess;
+let isExtended = 1;
+let isPredicatedNew = 1;
+let mayStore = 1;
+let CextOpcode = "S2_storerd";
+let BaseOpcode = "S2_storerdabs";
+let DecoderNamespace = "MustExtend";
+let isExtendable = 1;
+let opExtendable = 1;
+let isExtentSigned = 0;
+let opExtentBits = 6;
+let opExtentAlign = 0;
+}
+def S4_pstorerdfnew_io : HInst<
+(outs),
+(ins PredRegs:$Pv4, IntRegs:$Rs32, u29_3Imm:$Ii, DoubleRegs:$Rtt32),
+"if (!$Pv4.new) memd($Rs32+#$Ii) = $Rtt32",
+V2LDST_tc_st_SLOT01, TypeV2LDST>, Enc_11049656, AddrModeRel {
+let Inst{2-2} = 0b0;
+let Inst{31-21} = 0b01000110110;
+let isPredicated = 1;
+let isPredicatedFalse = 1;
+let addrMode = BaseImmOffset;
+let accessSize = DoubleWordAccess;
+let isPredicatedNew = 1;
+let mayStore = 1;
+let CextOpcode = "S2_storerd";
+let InputType = "imm";
+let BaseOpcode = "S2_storerd_io";
+let isExtendable = 1;
+let opExtendable = 2;
+let isExtentSigned = 0;
+let opExtentBits = 9;
+let opExtentAlign = 3;
+}
+def S4_pstorerdfnew_rr : HInst<
+(outs),
+(ins PredRegs:$Pv4, IntRegs:$Rs32, IntRegs:$Ru32, u2_0Imm:$Ii, DoubleRegs:$Rtt32),
+"if (!$Pv4.new) memd($Rs32+$Ru32<<#$Ii) = $Rtt32",
+V4LDST_tc_st_SLOT01, TypeST>, Enc_9920336, AddrModeRel {
+let Inst{31-21} = 0b00110111110;
+let isPredicated = 1;
+let isPredicatedFalse = 1;
+let addrMode = BaseRegOffset;
+let accessSize = DoubleWordAccess;
+let isPredicatedNew = 1;
+let mayStore = 1;
+let CextOpcode = "S2_storerd";
+let InputType = "reg";
+let BaseOpcode = "S2_storerd_rr";
+}
+def S4_pstorerdfnew_zomap : HInst<
+(outs),
+(ins PredRegs:$Pv4, IntRegs:$Rs32, DoubleRegs:$Rtt32),
+"if (!$Pv4.new) memd($Rs32) = $Rtt32",
+PSEUDO, TypeMAPPING> {
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+}
+def S4_pstorerdt_abs : HInst<
+(outs),
+(ins PredRegs:$Pv4, u32_0Imm:$Ii, DoubleRegs:$Rtt32),
+"if ($Pv4) memd(#$Ii) = $Rtt32",
+ST_tc_st_SLOT01, TypeST>, Enc_13715847, AddrModeRel {
+let Inst{2-2} = 0b0;
+let Inst{7-7} = 0b1;
+let Inst{13-13} = 0b0;
+let Inst{31-18} = 0b10101111110000;
+let isPredicated = 1;
+let addrMode = Absolute;
+let accessSize = DoubleWordAccess;
+let isExtended = 1;
+let mayStore = 1;
+let CextOpcode = "S2_storerd";
+let BaseOpcode = "S2_storerdabs";
+let DecoderNamespace = "MustExtend";
+let isExtendable = 1;
+let opExtendable = 1;
+let isExtentSigned = 0;
+let opExtentBits = 6;
+let opExtentAlign = 0;
+}
+def S4_pstorerdt_rr : HInst<
+(outs),
+(ins PredRegs:$Pv4, IntRegs:$Rs32, IntRegs:$Ru32, u2_0Imm:$Ii, DoubleRegs:$Rtt32),
+"if ($Pv4) memd($Rs32+$Ru32<<#$Ii) = $Rtt32",
+V4LDST_tc_st_SLOT01, TypeST>, Enc_9920336, AddrModeRel {
+let Inst{31-21} = 0b00110100110;
+let isPredicated = 1;
+let addrMode = BaseRegOffset;
+let accessSize = DoubleWordAccess;
+let mayStore = 1;
+let CextOpcode = "S2_storerd";
+let InputType = "reg";
+let BaseOpcode = "S2_storerd_rr";
+}
+def S4_pstorerdtnew_abs : HInst<
+(outs),
+(ins PredRegs:$Pv4, u32_0Imm:$Ii, DoubleRegs:$Rtt32),
+"if ($Pv4.new) memd(#$Ii) = $Rtt32",
+ST_tc_st_SLOT01, TypeST>, Enc_13715847, AddrModeRel {
+let Inst{2-2} = 0b0;
+let Inst{7-7} = 0b1;
+let Inst{13-13} = 0b1;
+let Inst{31-18} = 0b10101111110000;
+let isPredicated = 1;
+let addrMode = Absolute;
+let accessSize = DoubleWordAccess;
+let isExtended = 1;
+let isPredicatedNew = 1;
+let mayStore = 1;
+let CextOpcode = "S2_storerd";
+let BaseOpcode = "S2_storerdabs";
+let DecoderNamespace = "MustExtend";
+let isExtendable = 1;
+let opExtendable = 1;
+let isExtentSigned = 0;
+let opExtentBits = 6;
+let opExtentAlign = 0;
+}
+def S4_pstorerdtnew_io : HInst<
+(outs),
+(ins PredRegs:$Pv4, IntRegs:$Rs32, u29_3Imm:$Ii, DoubleRegs:$Rtt32),
+"if ($Pv4.new) memd($Rs32+#$Ii) = $Rtt32",
+V2LDST_tc_st_SLOT01, TypeV2LDST>, Enc_11049656, AddrModeRel {
+let Inst{2-2} = 0b0;
+let Inst{31-21} = 0b01000010110;
+let isPredicated = 1;
+let addrMode = BaseImmOffset;
+let accessSize = DoubleWordAccess;
+let isPredicatedNew = 1;
+let mayStore = 1;
+let CextOpcode = "S2_storerd";
+let InputType = "imm";
+let BaseOpcode = "S2_storerd_io";
+let isExtendable = 1;
+let opExtendable = 2;
+let isExtentSigned = 0;
+let opExtentBits = 9;
+let opExtentAlign = 3;
+}
+def S4_pstorerdtnew_rr : HInst<
+(outs),
+(ins PredRegs:$Pv4, IntRegs:$Rs32, IntRegs:$Ru32, u2_0Imm:$Ii, DoubleRegs:$Rtt32),
+"if ($Pv4.new) memd($Rs32+$Ru32<<#$Ii) = $Rtt32",
+V4LDST_tc_st_SLOT01, TypeST>, Enc_9920336, AddrModeRel {
+let Inst{31-21} = 0b00110110110;
+let isPredicated = 1;
+let addrMode = BaseRegOffset;
+let accessSize = DoubleWordAccess;
+let isPredicatedNew = 1;
+let mayStore = 1;
+let CextOpcode = "S2_storerd";
+let InputType = "reg";
+let BaseOpcode = "S2_storerd_rr";
+}
+def S4_pstorerdtnew_zomap : HInst<
+(outs),
+(ins PredRegs:$Pv4, IntRegs:$Rs32, DoubleRegs:$Rtt32),
+"if ($Pv4.new) memd($Rs32) = $Rtt32",
+PSEUDO, TypeMAPPING> {
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+}
+def S4_pstorerff_abs : HInst<
+(outs),
+(ins PredRegs:$Pv4, u32_0Imm:$Ii, IntRegs:$Rt32),
+"if (!$Pv4) memh(#$Ii) = $Rt32.h",
+ST_tc_st_SLOT01, TypeST>, Enc_16657398, AddrModeRel {
+let Inst{2-2} = 0b1;
+let Inst{7-7} = 0b1;
+let Inst{13-13} = 0b0;
+let Inst{31-18} = 0b10101111011000;
+let isPredicated = 1;
+let isPredicatedFalse = 1;
+let addrMode = Absolute;
+let accessSize = HalfWordAccess;
+let isExtended = 1;
+let mayStore = 1;
+let CextOpcode = "S2_storerf";
+let BaseOpcode = "S2_storerfabs";
+let DecoderNamespace = "MustExtend";
+let isExtendable = 1;
+let opExtendable = 1;
+let isExtentSigned = 0;
+let opExtentBits = 6;
+let opExtentAlign = 0;
+}
+def S4_pstorerff_rr : HInst<
+(outs),
+(ins PredRegs:$Pv4, IntRegs:$Rs32, IntRegs:$Ru32, u2_0Imm:$Ii, IntRegs:$Rt32),
+"if (!$Pv4) memh($Rs32+$Ru32<<#$Ii) = $Rt32.h",
+V4LDST_tc_st_SLOT01, TypeST>, Enc_11940513, AddrModeRel {
+let Inst{31-21} = 0b00110101011;
+let isPredicated = 1;
+let isPredicatedFalse = 1;
+let addrMode = BaseRegOffset;
+let accessSize = HalfWordAccess;
+let mayStore = 1;
+let CextOpcode = "S2_storerf";
+let InputType = "reg";
+let BaseOpcode = "S4_storerf_rr";
+}
+def S4_pstorerffnew_abs : HInst<
+(outs),
+(ins PredRegs:$Pv4, u32_0Imm:$Ii, IntRegs:$Rt32),
+"if (!$Pv4.new) memh(#$Ii) = $Rt32.h",
+ST_tc_st_SLOT01, TypeST>, Enc_16657398, AddrModeRel {
+let Inst{2-2} = 0b1;
+let Inst{7-7} = 0b1;
+let Inst{13-13} = 0b1;
+let Inst{31-18} = 0b10101111011000;
+let isPredicated = 1;
+let isPredicatedFalse = 1;
+let addrMode = Absolute;
+let accessSize = HalfWordAccess;
+let isExtended = 1;
+let isPredicatedNew = 1;
+let mayStore = 1;
+let CextOpcode = "S2_storerf";
+let BaseOpcode = "S2_storerfabs";
+let DecoderNamespace = "MustExtend";
+let isExtendable = 1;
+let opExtendable = 1;
+let isExtentSigned = 0;
+let opExtentBits = 6;
+let opExtentAlign = 0;
+}
+def S4_pstorerffnew_io : HInst<
+(outs),
+(ins PredRegs:$Pv4, IntRegs:$Rs32, u31_1Imm:$Ii, IntRegs:$Rt32),
+"if (!$Pv4.new) memh($Rs32+#$Ii) = $Rt32.h",
+V2LDST_tc_st_SLOT01, TypeV2LDST>, Enc_10979813, AddrModeRel {
+let Inst{2-2} = 0b0;
+let Inst{31-21} = 0b01000110011;
+let isPredicated = 1;
+let isPredicatedFalse = 1;
+let addrMode = BaseImmOffset;
+let accessSize = HalfWordAccess;
+let isPredicatedNew = 1;
+let mayStore = 1;
+let CextOpcode = "S2_storerf";
+let InputType = "imm";
+let BaseOpcode = "S2_storerf_io";
+let isExtendable = 1;
+let opExtendable = 2;
+let isExtentSigned = 0;
+let opExtentBits = 7;
+let opExtentAlign = 1;
+}
+def S4_pstorerffnew_rr : HInst<
+(outs),
+(ins PredRegs:$Pv4, IntRegs:$Rs32, IntRegs:$Ru32, u2_0Imm:$Ii, IntRegs:$Rt32),
+"if (!$Pv4.new) memh($Rs32+$Ru32<<#$Ii) = $Rt32.h",
+V4LDST_tc_st_SLOT01, TypeST>, Enc_11940513, AddrModeRel {
+let Inst{31-21} = 0b00110111011;
+let isPredicated = 1;
+let isPredicatedFalse = 1;
+let addrMode = BaseRegOffset;
+let accessSize = HalfWordAccess;
+let isPredicatedNew = 1;
+let mayStore = 1;
+let CextOpcode = "S2_storerf";
+let InputType = "reg";
+let BaseOpcode = "S4_storerf_rr";
+}
+def S4_pstorerffnew_zomap : HInst<
+(outs),
+(ins PredRegs:$Pv4, IntRegs:$Rs32, IntRegs:$Rt32),
+"if (!$Pv4.new) memh($Rs32) = $Rt32.h",
+PSEUDO, TypeMAPPING> {
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+}
+def S4_pstorerft_abs : HInst<
+(outs),
+(ins PredRegs:$Pv4, u32_0Imm:$Ii, IntRegs:$Rt32),
+"if ($Pv4) memh(#$Ii) = $Rt32.h",
+ST_tc_st_SLOT01, TypeST>, Enc_16657398, AddrModeRel {
+let Inst{2-2} = 0b0;
+let Inst{7-7} = 0b1;
+let Inst{13-13} = 0b0;
+let Inst{31-18} = 0b10101111011000;
+let isPredicated = 1;
+let addrMode = Absolute;
+let accessSize = HalfWordAccess;
+let isExtended = 1;
+let mayStore = 1;
+let CextOpcode = "S2_storerf";
+let BaseOpcode = "S2_storerfabs";
+let DecoderNamespace = "MustExtend";
+let isExtendable = 1;
+let opExtendable = 1;
+let isExtentSigned = 0;
+let opExtentBits = 6;
+let opExtentAlign = 0;
+}
+def S4_pstorerft_rr : HInst<
+(outs),
+(ins PredRegs:$Pv4, IntRegs:$Rs32, IntRegs:$Ru32, u2_0Imm:$Ii, IntRegs:$Rt32),
+"if ($Pv4) memh($Rs32+$Ru32<<#$Ii) = $Rt32.h",
+V4LDST_tc_st_SLOT01, TypeST>, Enc_11940513, AddrModeRel {
+let Inst{31-21} = 0b00110100011;
+let isPredicated = 1;
+let addrMode = BaseRegOffset;
+let accessSize = HalfWordAccess;
+let mayStore = 1;
+let CextOpcode = "S2_storerf";
+let InputType = "reg";
+let BaseOpcode = "S4_storerf_rr";
+}
+def S4_pstorerftnew_abs : HInst<
+(outs),
+(ins PredRegs:$Pv4, u32_0Imm:$Ii, IntRegs:$Rt32),
+"if ($Pv4.new) memh(#$Ii) = $Rt32.h",
+ST_tc_st_SLOT01, TypeST>, Enc_16657398, AddrModeRel {
+let Inst{2-2} = 0b0;
+let Inst{7-7} = 0b1;
+let Inst{13-13} = 0b1;
+let Inst{31-18} = 0b10101111011000;
+let isPredicated = 1;
+let addrMode = Absolute;
+let accessSize = HalfWordAccess;
+let isExtended = 1;
+let isPredicatedNew = 1;
+let mayStore = 1;
+let CextOpcode = "S2_storerf";
+let BaseOpcode = "S2_storerfabs";
+let DecoderNamespace = "MustExtend";
+let isExtendable = 1;
+let opExtendable = 1;
+let isExtentSigned = 0;
+let opExtentBits = 6;
+let opExtentAlign = 0;
+}
+def S4_pstorerftnew_io : HInst<
+(outs),
+(ins PredRegs:$Pv4, IntRegs:$Rs32, u31_1Imm:$Ii, IntRegs:$Rt32),
+"if ($Pv4.new) memh($Rs32+#$Ii) = $Rt32.h",
+V2LDST_tc_st_SLOT01, TypeV2LDST>, Enc_10979813, AddrModeRel {
+let Inst{2-2} = 0b0;
+let Inst{31-21} = 0b01000010011;
+let isPredicated = 1;
+let addrMode = BaseImmOffset;
+let accessSize = HalfWordAccess;
+let isPredicatedNew = 1;
+let mayStore = 1;
+let CextOpcode = "S2_storerf";
+let InputType = "imm";
+let BaseOpcode = "S2_storerf_io";
+let isExtendable = 1;
+let opExtendable = 2;
+let isExtentSigned = 0;
+let opExtentBits = 7;
+let opExtentAlign = 1;
+}
+def S4_pstorerftnew_rr : HInst<
+(outs),
+(ins PredRegs:$Pv4, IntRegs:$Rs32, IntRegs:$Ru32, u2_0Imm:$Ii, IntRegs:$Rt32),
+"if ($Pv4.new) memh($Rs32+$Ru32<<#$Ii) = $Rt32.h",
+V4LDST_tc_st_SLOT01, TypeST>, Enc_11940513, AddrModeRel {
+let Inst{31-21} = 0b00110110011;
+let isPredicated = 1;
+let addrMode = BaseRegOffset;
+let accessSize = HalfWordAccess;
+let isPredicatedNew = 1;
+let mayStore = 1;
+let CextOpcode = "S2_storerf";
+let InputType = "reg";
+let BaseOpcode = "S4_storerf_rr";
+}
+def S4_pstorerftnew_zomap : HInst<
+(outs),
+(ins PredRegs:$Pv4, IntRegs:$Rs32, IntRegs:$Rt32),
+"if ($Pv4.new) memh($Rs32) = $Rt32.h",
+PSEUDO, TypeMAPPING> {
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+}
+def S4_pstorerhf_abs : HInst<
+(outs),
+(ins PredRegs:$Pv4, u32_0Imm:$Ii, IntRegs:$Rt32),
+"if (!$Pv4) memh(#$Ii) = $Rt32",
+ST_tc_st_SLOT01, TypeST>, Enc_16657398, AddrModeRel {
+let Inst{2-2} = 0b1;
+let Inst{7-7} = 0b1;
+let Inst{13-13} = 0b0;
+let Inst{31-18} = 0b10101111010000;
+let isPredicated = 1;
+let isPredicatedFalse = 1;
+let addrMode = Absolute;
+let accessSize = HalfWordAccess;
+let isExtended = 1;
+let mayStore = 1;
+let CextOpcode = "S2_storerh";
+let BaseOpcode = "S2_storerhabs";
+let isNVStorable = 1;
+let DecoderNamespace = "MustExtend";
+let isExtendable = 1;
+let opExtendable = 1;
+let isExtentSigned = 0;
+let opExtentBits = 6;
+let opExtentAlign = 0;
+}
+def S4_pstorerhf_rr : HInst<
+(outs),
+(ins PredRegs:$Pv4, IntRegs:$Rs32, IntRegs:$Ru32, u2_0Imm:$Ii, IntRegs:$Rt32),
+"if (!$Pv4) memh($Rs32+$Ru32<<#$Ii) = $Rt32",
+V4LDST_tc_st_SLOT01, TypeST>, Enc_11940513, AddrModeRel {
+let Inst{31-21} = 0b00110101010;
+let isPredicated = 1;
+let isPredicatedFalse = 1;
+let addrMode = BaseRegOffset;
+let accessSize = HalfWordAccess;
+let mayStore = 1;
+let CextOpcode = "S2_storerh";
+let InputType = "reg";
+let BaseOpcode = "S2_storerh_rr";
+let isNVStorable = 1;
+}
+def S4_pstorerhfnew_abs : HInst<
+(outs),
+(ins PredRegs:$Pv4, u32_0Imm:$Ii, IntRegs:$Rt32),
+"if (!$Pv4.new) memh(#$Ii) = $Rt32",
+ST_tc_st_SLOT01, TypeST>, Enc_16657398, AddrModeRel {
+let Inst{2-2} = 0b1;
+let Inst{7-7} = 0b1;
+let Inst{13-13} = 0b1;
+let Inst{31-18} = 0b10101111010000;
+let isPredicated = 1;
+let isPredicatedFalse = 1;
+let addrMode = Absolute;
+let accessSize = HalfWordAccess;
+let isExtended = 1;
+let isPredicatedNew = 1;
+let mayStore = 1;
+let CextOpcode = "S2_storerh";
+let BaseOpcode = "S2_storerhabs";
+let isNVStorable = 1;
+let DecoderNamespace = "MustExtend";
+let isExtendable = 1;
+let opExtendable = 1;
+let isExtentSigned = 0;
+let opExtentBits = 6;
+let opExtentAlign = 0;
+}
+def S4_pstorerhfnew_io : HInst<
+(outs),
+(ins PredRegs:$Pv4, IntRegs:$Rs32, u31_1Imm:$Ii, IntRegs:$Rt32),
+"if (!$Pv4.new) memh($Rs32+#$Ii) = $Rt32",
+V2LDST_tc_st_SLOT01, TypeV2LDST>, Enc_10979813, AddrModeRel {
+let Inst{2-2} = 0b0;
+let Inst{31-21} = 0b01000110010;
+let isPredicated = 1;
+let isPredicatedFalse = 1;
+let addrMode = BaseImmOffset;
+let accessSize = HalfWordAccess;
+let isPredicatedNew = 1;
+let mayStore = 1;
+let CextOpcode = "S2_storerh";
+let InputType = "imm";
+let BaseOpcode = "S2_storerh_io";
+let isNVStorable = 1;
+let isExtendable = 1;
+let opExtendable = 2;
+let isExtentSigned = 0;
+let opExtentBits = 7;
+let opExtentAlign = 1;
+}
+def S4_pstorerhfnew_rr : HInst<
+(outs),
+(ins PredRegs:$Pv4, IntRegs:$Rs32, IntRegs:$Ru32, u2_0Imm:$Ii, IntRegs:$Rt32),
+"if (!$Pv4.new) memh($Rs32+$Ru32<<#$Ii) = $Rt32",
+V4LDST_tc_st_SLOT01, TypeST>, Enc_11940513, AddrModeRel {
+let Inst{31-21} = 0b00110111010;
+let isPredicated = 1;
+let isPredicatedFalse = 1;
+let addrMode = BaseRegOffset;
+let accessSize = HalfWordAccess;
+let isPredicatedNew = 1;
+let mayStore = 1;
+let CextOpcode = "S2_storerh";
+let InputType = "reg";
+let BaseOpcode = "S2_storerh_rr";
+let isNVStorable = 1;
+}
+def S4_pstorerhfnew_zomap : HInst<
+(outs),
+(ins PredRegs:$Pv4, IntRegs:$Rs32, IntRegs:$Rt32),
+"if (!$Pv4.new) memh($Rs32) = $Rt32",
+PSEUDO, TypeMAPPING> {
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+}
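+// A "new" directly after the mnemonic core (e.g. storerhnew) denotes a
+// new-value store: the $Nt8 operand is a register produced by another
+// instruction in the same packet (isNVStore/isNewValue, with opNewValue
+// pointing at that operand), and the store is restricted to slot 0.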
+def S4_pstorerhnewf_abs : HInst<
+(outs),
+(ins PredRegs:$Pv4, u32_0Imm:$Ii, IntRegs:$Nt8),
+"if (!$Pv4) memh(#$Ii) = $Nt8.new",
+NCJ_tc_3or4stall_SLOT0, TypeST>, Enc_1774350, AddrModeRel {
+let Inst{2-2} = 0b1;
+let Inst{7-7} = 0b1;
+let Inst{13-11} = 0b001;
+let Inst{31-18} = 0b10101111101000;
+let isPredicated = 1;
+let isPredicatedFalse = 1;
+let addrMode = Absolute;
+let accessSize = HalfWordAccess;
+let isNVStore = 1;
+let isExtended = 1;
+let mayStore = 1;
+let isNewValue = 1;
+let CextOpcode = "S2_storerh";
+let BaseOpcode = "S2_storerhabs";
+let DecoderNamespace = "MustExtend";
+let isExtendable = 1;
+let opExtendable = 1;
+let isExtentSigned = 0;
+let opExtentBits = 6;
+let opExtentAlign = 0;
+let opNewValue = 2;
+}
+def S4_pstorerhnewf_rr : HInst<
+(outs),
+(ins PredRegs:$Pv4, IntRegs:$Rs32, IntRegs:$Ru32, u2_0Imm:$Ii, IntRegs:$Nt8),
+"if (!$Pv4) memh($Rs32+$Ru32<<#$Ii) = $Nt8.new",
+V4LDST_tc_st_SLOT0, TypeST>, Enc_11000933, AddrModeRel {
+let Inst{4-3} = 0b01;
+let Inst{31-21} = 0b00110101101;
+let isPredicated = 1;
+let isPredicatedFalse = 1;
+let addrMode = BaseRegOffset;
+let accessSize = HalfWordAccess;
+let isNVStore = 1;
+let mayStore = 1;
+let isNewValue = 1;
+let CextOpcode = "S2_storerh";
+let InputType = "reg";
+let BaseOpcode = "S2_storerh_rr";
+let opNewValue = 4;
+}
+def S4_pstorerhnewfnew_abs : HInst<
+(outs),
+(ins PredRegs:$Pv4, u32_0Imm:$Ii, IntRegs:$Nt8),
+"if (!$Pv4.new) memh(#$Ii) = $Nt8.new",
+NCJ_tc_3or4stall_SLOT0, TypeST>, Enc_1774350, AddrModeRel {
+let Inst{2-2} = 0b1;
+let Inst{7-7} = 0b1;
+let Inst{13-11} = 0b101;
+let Inst{31-18} = 0b10101111101000;
+let isPredicated = 1;
+let isPredicatedFalse = 1;
+let addrMode = Absolute;
+let accessSize = HalfWordAccess;
+let isNVStore = 1;
+let isExtended = 1;
+let isPredicatedNew = 1;
+let mayStore = 1;
+let isNewValue = 1;
+let CextOpcode = "S2_storerh";
+let BaseOpcode = "S2_storerhabs";
+let DecoderNamespace = "MustExtend";
+let isExtendable = 1;
+let opExtendable = 1;
+let isExtentSigned = 0;
+let opExtentBits = 6;
+let opExtentAlign = 0;
+let opNewValue = 2;
+}
+def S4_pstorerhnewfnew_io : HInst<
+(outs),
+(ins PredRegs:$Pv4, IntRegs:$Rs32, u31_1Imm:$Ii, IntRegs:$Nt8),
+"if (!$Pv4.new) memh($Rs32+#$Ii) = $Nt8.new",
+V2LDST_tc_st_SLOT0, TypeV2LDST>, Enc_6154421, AddrModeRel {
+let Inst{2-2} = 0b0;
+let Inst{12-11} = 0b01;
+let Inst{31-21} = 0b01000110101;
+let isPredicated = 1;
+let isPredicatedFalse = 1;
+let addrMode = BaseImmOffset;
+let accessSize = HalfWordAccess;
+let isNVStore = 1;
+let isPredicatedNew = 1;
+let mayStore = 1;
+let isNewValue = 1;
+let CextOpcode = "S2_storerh";
+let InputType = "imm";
+let BaseOpcode = "S2_storerh_io";
+let isExtendable = 1;
+let opExtendable = 2;
+let isExtentSigned = 0;
+let opExtentBits = 7;
+let opExtentAlign = 1;
+let opNewValue = 3;
+}
+def S4_pstorerhnewfnew_rr : HInst<
+(outs),
+(ins PredRegs:$Pv4, IntRegs:$Rs32, IntRegs:$Ru32, u2_0Imm:$Ii, IntRegs:$Nt8),
+"if (!$Pv4.new) memh($Rs32+$Ru32<<#$Ii) = $Nt8.new",
+V4LDST_tc_st_SLOT0, TypeST>, Enc_11000933, AddrModeRel {
+let Inst{4-3} = 0b01;
+let Inst{31-21} = 0b00110111101;
+let isPredicated = 1;
+let isPredicatedFalse = 1;
+let addrMode = BaseRegOffset;
+let accessSize = HalfWordAccess;
+let isNVStore = 1;
+let isPredicatedNew = 1;
+let mayStore = 1;
+let isNewValue = 1;
+let CextOpcode = "S2_storerh";
+let InputType = "reg";
+let BaseOpcode = "S2_storerh_rr";
+let opNewValue = 4;
+}
+def S4_pstorerhnewfnew_zomap : HInst<
+(outs),
+(ins PredRegs:$Pv4, IntRegs:$Rs32, IntRegs:$Nt8),
+"if (!$Pv4.new) memh($Rs32) = $Nt8.new",
+PSEUDO, TypeMAPPING> {
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let opNewValue = 2;
+}
+def S4_pstorerhnewt_abs : HInst<
+(outs),
+(ins PredRegs:$Pv4, u32_0Imm:$Ii, IntRegs:$Nt8),
+"if ($Pv4) memh(#$Ii) = $Nt8.new",
+NCJ_tc_3or4stall_SLOT0, TypeST>, Enc_1774350, AddrModeRel {
+let Inst{2-2} = 0b0;
+let Inst{7-7} = 0b1;
+let Inst{13-11} = 0b001;
+let Inst{31-18} = 0b10101111101000;
+let isPredicated = 1;
+let addrMode = Absolute;
+let accessSize = HalfWordAccess;
+let isNVStore = 1;
+let isExtended = 1;
+let mayStore = 1;
+let isNewValue = 1;
+let CextOpcode = "S2_storerh";
+let BaseOpcode = "S2_storerhabs";
+let DecoderNamespace = "MustExtend";
+let isExtendable = 1;
+let opExtendable = 1;
+let isExtentSigned = 0;
+let opExtentBits = 6;
+let opExtentAlign = 0;
+let opNewValue = 2;
+}
+def S4_pstorerhnewt_rr : HInst<
+(outs),
+(ins PredRegs:$Pv4, IntRegs:$Rs32, IntRegs:$Ru32, u2_0Imm:$Ii, IntRegs:$Nt8),
+"if ($Pv4) memh($Rs32+$Ru32<<#$Ii) = $Nt8.new",
+V4LDST_tc_st_SLOT0, TypeST>, Enc_11000933, AddrModeRel {
+let Inst{4-3} = 0b01;
+let Inst{31-21} = 0b00110100101;
+let isPredicated = 1;
+let addrMode = BaseRegOffset;
+let accessSize = HalfWordAccess;
+let isNVStore = 1;
+let mayStore = 1;
+let isNewValue = 1;
+let CextOpcode = "S2_storerh";
+let InputType = "reg";
+let BaseOpcode = "S2_storerh_rr";
+let opNewValue = 4;
+}
+def S4_pstorerhnewtnew_abs : HInst<
+(outs),
+(ins PredRegs:$Pv4, u32_0Imm:$Ii, IntRegs:$Nt8),
+"if ($Pv4.new) memh(#$Ii) = $Nt8.new",
+NCJ_tc_3or4stall_SLOT0, TypeST>, Enc_1774350, AddrModeRel {
+let Inst{2-2} = 0b0;
+let Inst{7-7} = 0b1;
+let Inst{13-11} = 0b101;
+let Inst{31-18} = 0b10101111101000;
+let isPredicated = 1;
+let addrMode = Absolute;
+let accessSize = HalfWordAccess;
+let isNVStore = 1;
+let isExtended = 1;
+let isPredicatedNew = 1;
+let mayStore = 1;
+let isNewValue = 1;
+let CextOpcode = "S2_storerh";
+let BaseOpcode = "S2_storerhabs";
+let DecoderNamespace = "MustExtend";
+let isExtendable = 1;
+let opExtendable = 1;
+let isExtentSigned = 0;
+let opExtentBits = 6;
+let opExtentAlign = 0;
+let opNewValue = 2;
+}
+def S4_pstorerhnewtnew_io : HInst<
+(outs),
+(ins PredRegs:$Pv4, IntRegs:$Rs32, u31_1Imm:$Ii, IntRegs:$Nt8),
+"if ($Pv4.new) memh($Rs32+#$Ii) = $Nt8.new",
+V2LDST_tc_st_SLOT0, TypeV2LDST>, Enc_6154421, AddrModeRel {
+let Inst{2-2} = 0b0;
+let Inst{12-11} = 0b01;
+let Inst{31-21} = 0b01000010101;
+let isPredicated = 1;
+let addrMode = BaseImmOffset;
+let accessSize = HalfWordAccess;
+let isNVStore = 1;
+let isPredicatedNew = 1;
+let mayStore = 1;
+let isNewValue = 1;
+let CextOpcode = "S2_storerh";
+let InputType = "imm";
+let BaseOpcode = "S2_storerh_io";
+let isExtendable = 1;
+let opExtendable = 2;
+let isExtentSigned = 0;
+let opExtentBits = 7;
+let opExtentAlign = 1;
+let opNewValue = 3;
+}
+def S4_pstorerhnewtnew_rr : HInst<
+(outs),
+(ins PredRegs:$Pv4, IntRegs:$Rs32, IntRegs:$Ru32, u2_0Imm:$Ii, IntRegs:$Nt8),
+"if ($Pv4.new) memh($Rs32+$Ru32<<#$Ii) = $Nt8.new",
+V4LDST_tc_st_SLOT0, TypeST>, Enc_11000933, AddrModeRel {
+let Inst{4-3} = 0b01;
+let Inst{31-21} = 0b00110110101;
+let isPredicated = 1;
+let addrMode = BaseRegOffset;
+let accessSize = HalfWordAccess;
+let isNVStore = 1;
+let isPredicatedNew = 1;
+let mayStore = 1;
+let isNewValue = 1;
+let CextOpcode = "S2_storerh";
+let InputType = "reg";
+let BaseOpcode = "S2_storerh_rr";
+let opNewValue = 4;
+}
+def S4_pstorerhnewtnew_zomap : HInst<
+(outs),
+(ins PredRegs:$Pv4, IntRegs:$Rs32, IntRegs:$Nt8),
+"if ($Pv4.new) memh($Rs32) = $Nt8.new",
+PSEUDO, TypeMAPPING> {
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let opNewValue = 2;
+}
+def S4_pstorerht_abs : HInst<
+(outs),
+(ins PredRegs:$Pv4, u32_0Imm:$Ii, IntRegs:$Rt32),
+"if ($Pv4) memh(#$Ii) = $Rt32",
+ST_tc_st_SLOT01, TypeST>, Enc_16657398, AddrModeRel {
+let Inst{2-2} = 0b0;
+let Inst{7-7} = 0b1;
+let Inst{13-13} = 0b0;
+let Inst{31-18} = 0b10101111010000;
+let isPredicated = 1;
+let addrMode = Absolute;
+let accessSize = HalfWordAccess;
+let isExtended = 1;
+let mayStore = 1;
+let CextOpcode = "S2_storerh";
+let BaseOpcode = "S2_storerhabs";
+let isNVStorable = 1;
+let DecoderNamespace = "MustExtend";
+let isExtendable = 1;
+let opExtendable = 1;
+let isExtentSigned = 0;
+let opExtentBits = 6;
+let opExtentAlign = 0;
+}
+def S4_pstorerht_rr : HInst<
+(outs),
+(ins PredRegs:$Pv4, IntRegs:$Rs32, IntRegs:$Ru32, u2_0Imm:$Ii, IntRegs:$Rt32),
+"if ($Pv4) memh($Rs32+$Ru32<<#$Ii) = $Rt32",
+V4LDST_tc_st_SLOT01, TypeST>, Enc_11940513, AddrModeRel {
+let Inst{31-21} = 0b00110100010;
+let isPredicated = 1;
+let addrMode = BaseRegOffset;
+let accessSize = HalfWordAccess;
+let mayStore = 1;
+let CextOpcode = "S2_storerh";
+let InputType = "reg";
+let BaseOpcode = "S2_storerh_rr";
+let isNVStorable = 1;
+}
+def S4_pstorerhtnew_abs : HInst<
+(outs),
+(ins PredRegs:$Pv4, u32_0Imm:$Ii, IntRegs:$Rt32),
+"if ($Pv4.new) memh(#$Ii) = $Rt32",
+ST_tc_st_SLOT01, TypeST>, Enc_16657398, AddrModeRel {
+let Inst{2-2} = 0b0;
+let Inst{7-7} = 0b1;
+let Inst{13-13} = 0b1;
+let Inst{31-18} = 0b10101111010000;
+let isPredicated = 1;
+let addrMode = Absolute;
+let accessSize = HalfWordAccess;
+let isExtended = 1;
+let isPredicatedNew = 1;
+let mayStore = 1;
+let CextOpcode = "S2_storerh";
+let BaseOpcode = "S2_storerhabs";
+let isNVStorable = 1;
+let DecoderNamespace = "MustExtend";
+let isExtendable = 1;
+let opExtendable = 1;
+let isExtentSigned = 0;
+let opExtentBits = 6;
+let opExtentAlign = 0;
+}
+def S4_pstorerhtnew_io : HInst<
+(outs),
+(ins PredRegs:$Pv4, IntRegs:$Rs32, u31_1Imm:$Ii, IntRegs:$Rt32),
+"if ($Pv4.new) memh($Rs32+#$Ii) = $Rt32",
+V2LDST_tc_st_SLOT01, TypeV2LDST>, Enc_10979813, AddrModeRel {
+let Inst{2-2} = 0b0;
+let Inst{31-21} = 0b01000010010;
+let isPredicated = 1;
+let addrMode = BaseImmOffset;
+let accessSize = HalfWordAccess;
+let isPredicatedNew = 1;
+let mayStore = 1;
+let CextOpcode = "S2_storerh";
+let InputType = "imm";
+let BaseOpcode = "S2_storerh_io";
+let isNVStorable = 1;
+let isExtendable = 1;
+let opExtendable = 2;
+let isExtentSigned = 0;
+let opExtentBits = 7;
+let opExtentAlign = 1;
+}
+def S4_pstorerhtnew_rr : HInst<
+(outs),
+(ins PredRegs:$Pv4, IntRegs:$Rs32, IntRegs:$Ru32, u2_0Imm:$Ii, IntRegs:$Rt32),
+"if ($Pv4.new) memh($Rs32+$Ru32<<#$Ii) = $Rt32",
+V4LDST_tc_st_SLOT01, TypeST>, Enc_11940513, AddrModeRel {
+let Inst{31-21} = 0b00110110010;
+let isPredicated = 1;
+let addrMode = BaseRegOffset;
+let accessSize = HalfWordAccess;
+let isPredicatedNew = 1;
+let mayStore = 1;
+let CextOpcode = "S2_storerh";
+let InputType = "reg";
+let BaseOpcode = "S2_storerh_rr";
+let isNVStorable = 1;
+}
+def S4_pstorerhtnew_zomap : HInst<
+(outs),
+(ins PredRegs:$Pv4, IntRegs:$Rs32, IntRegs:$Rt32),
+"if ($Pv4.new) memh($Rs32) = $Rt32",
+PSEUDO, TypeMAPPING> {
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+}
+def S4_pstorerif_abs : HInst<
+(outs),
+(ins PredRegs:$Pv4, u32_0Imm:$Ii, IntRegs:$Rt32),
+"if (!$Pv4) memw(#$Ii) = $Rt32",
+ST_tc_st_SLOT01, TypeST>, Enc_16657398, AddrModeRel {
+let Inst{2-2} = 0b1;
+let Inst{7-7} = 0b1;
+let Inst{13-13} = 0b0;
+let Inst{31-18} = 0b10101111100000;
+let isPredicated = 1;
+let isPredicatedFalse = 1;
+let addrMode = Absolute;
+let accessSize = WordAccess;
+let isExtended = 1;
+let mayStore = 1;
+let CextOpcode = "S2_storeri";
+let BaseOpcode = "S2_storeriabs";
+let isNVStorable = 1;
+let DecoderNamespace = "MustExtend";
+let isExtendable = 1;
+let opExtendable = 1;
+let isExtentSigned = 0;
+let opExtentBits = 6;
+let opExtentAlign = 0;
+}
+def S4_pstorerif_rr : HInst<
+(outs),
+(ins PredRegs:$Pv4, IntRegs:$Rs32, IntRegs:$Ru32, u2_0Imm:$Ii, IntRegs:$Rt32),
+"if (!$Pv4) memw($Rs32+$Ru32<<#$Ii) = $Rt32",
+V4LDST_tc_st_SLOT01, TypeST>, Enc_11940513, AddrModeRel {
+let Inst{31-21} = 0b00110101100;
+let isPredicated = 1;
+let isPredicatedFalse = 1;
+let addrMode = BaseRegOffset;
+let accessSize = WordAccess;
+let mayStore = 1;
+let CextOpcode = "S2_storeri";
+let InputType = "reg";
+let BaseOpcode = "S2_storeri_rr";
+let isNVStorable = 1;
+}
+def S4_pstorerifnew_abs : HInst<
+(outs),
+(ins PredRegs:$Pv4, u32_0Imm:$Ii, IntRegs:$Rt32),
+"if (!$Pv4.new) memw(#$Ii) = $Rt32",
+ST_tc_st_SLOT01, TypeST>, Enc_16657398, AddrModeRel {
+let Inst{2-2} = 0b1;
+let Inst{7-7} = 0b1;
+let Inst{13-13} = 0b1;
+let Inst{31-18} = 0b10101111100000;
+let isPredicated = 1;
+let isPredicatedFalse = 1;
+let addrMode = Absolute;
+let accessSize = WordAccess;
+let isExtended = 1;
+let isPredicatedNew = 1;
+let mayStore = 1;
+let CextOpcode = "S2_storeri";
+let BaseOpcode = "S2_storeriabs";
+let isNVStorable = 1;
+let DecoderNamespace = "MustExtend";
+let isExtendable = 1;
+let opExtendable = 1;
+let isExtentSigned = 0;
+let opExtentBits = 6;
+let opExtentAlign = 0;
+}
+def S4_pstorerifnew_io : HInst<
+(outs),
+(ins PredRegs:$Pv4, IntRegs:$Rs32, u30_2Imm:$Ii, IntRegs:$Rt32),
+"if (!$Pv4.new) memw($Rs32+#$Ii) = $Rt32",
+V2LDST_tc_st_SLOT01, TypeV2LDST>, Enc_8225953, AddrModeRel {
+let Inst{2-2} = 0b0;
+let Inst{31-21} = 0b01000110100;
+let isPredicated = 1;
+let isPredicatedFalse = 1;
+let addrMode = BaseImmOffset;
+let accessSize = WordAccess;
+let isPredicatedNew = 1;
+let mayStore = 1;
+let CextOpcode = "S2_storeri";
+let InputType = "imm";
+let BaseOpcode = "S2_storeri_io";
+let isNVStorable = 1;
+let isExtendable = 1;
+let opExtendable = 2;
+let isExtentSigned = 0;
+let opExtentBits = 8;
+let opExtentAlign = 2;
+}
+def S4_pstorerifnew_rr : HInst<
+(outs),
+(ins PredRegs:$Pv4, IntRegs:$Rs32, IntRegs:$Ru32, u2_0Imm:$Ii, IntRegs:$Rt32),
+"if (!$Pv4.new) memw($Rs32+$Ru32<<#$Ii) = $Rt32",
+V4LDST_tc_st_SLOT01, TypeST>, Enc_11940513, AddrModeRel {
+let Inst{31-21} = 0b00110111100;
+let isPredicated = 1;
+let isPredicatedFalse = 1;
+let addrMode = BaseRegOffset;
+let accessSize = WordAccess;
+let isPredicatedNew = 1;
+let mayStore = 1;
+let CextOpcode = "S2_storeri";
+let InputType = "reg";
+let BaseOpcode = "S2_storeri_rr";
+let isNVStorable = 1;
+}
+def S4_pstorerifnew_zomap : HInst<
+(outs),
+(ins PredRegs:$Pv4, IntRegs:$Rs32, IntRegs:$Rt32),
+"if (!$Pv4.new) memw($Rs32) = $Rt32",
+PSEUDO, TypeMAPPING> {
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+}
+def S4_pstorerinewf_abs : HInst<
+(outs),
+(ins PredRegs:$Pv4, u32_0Imm:$Ii, IntRegs:$Nt8),
+"if (!$Pv4) memw(#$Ii) = $Nt8.new",
+NCJ_tc_3or4stall_SLOT0, TypeST>, Enc_1774350, AddrModeRel {
+let Inst{2-2} = 0b1;
+let Inst{7-7} = 0b1;
+let Inst{13-11} = 0b010;
+let Inst{31-18} = 0b10101111101000;
+let isPredicated = 1;
+let isPredicatedFalse = 1;
+let addrMode = Absolute;
+let accessSize = WordAccess;
+let isNVStore = 1;
+let isExtended = 1;
+let mayStore = 1;
+let isNewValue = 1;
+let CextOpcode = "S2_storeri";
+let BaseOpcode = "S2_storeriabs";
+let DecoderNamespace = "MustExtend";
+let isExtendable = 1;
+let opExtendable = 1;
+let isExtentSigned = 0;
+let opExtentBits = 6;
+let opExtentAlign = 0;
+let opNewValue = 2;
+}
+def S4_pstorerinewf_rr : HInst<
+(outs),
+(ins PredRegs:$Pv4, IntRegs:$Rs32, IntRegs:$Ru32, u2_0Imm:$Ii, IntRegs:$Nt8),
+"if (!$Pv4) memw($Rs32+$Ru32<<#$Ii) = $Nt8.new",
+V4LDST_tc_st_SLOT0, TypeST>, Enc_11000933, AddrModeRel {
+let Inst{4-3} = 0b10;
+let Inst{31-21} = 0b00110101101;
+let isPredicated = 1;
+let isPredicatedFalse = 1;
+let addrMode = BaseRegOffset;
+let accessSize = WordAccess;
+let isNVStore = 1;
+let mayStore = 1;
+let isNewValue = 1;
+let CextOpcode = "S2_storeri";
+let InputType = "reg";
+let BaseOpcode = "S2_storeri_rr";
+let opNewValue = 4;
+}
+def S4_pstorerinewfnew_abs : HInst<
+(outs),
+(ins PredRegs:$Pv4, u32_0Imm:$Ii, IntRegs:$Nt8),
+"if (!$Pv4.new) memw(#$Ii) = $Nt8.new",
+NCJ_tc_3or4stall_SLOT0, TypeST>, Enc_1774350, AddrModeRel {
+let Inst{2-2} = 0b1;
+let Inst{7-7} = 0b1;
+let Inst{13-11} = 0b110;
+let Inst{31-18} = 0b10101111101000;
+let isPredicated = 1;
+let isPredicatedFalse = 1;
+let addrMode = Absolute;
+let accessSize = WordAccess;
+let isNVStore = 1;
+let isExtended = 1;
+let isPredicatedNew = 1;
+let mayStore = 1;
+let isNewValue = 1;
+let CextOpcode = "S2_storeri";
+let BaseOpcode = "S2_storeriabs";
+let DecoderNamespace = "MustExtend";
+let isExtendable = 1;
+let opExtendable = 1;
+let isExtentSigned = 0;
+let opExtentBits = 6;
+let opExtentAlign = 0;
+let opNewValue = 2;
+}
+def S4_pstorerinewfnew_io : HInst<
+(outs),
+(ins PredRegs:$Pv4, IntRegs:$Rs32, u30_2Imm:$Ii, IntRegs:$Nt8),
+"if (!$Pv4.new) memw($Rs32+#$Ii) = $Nt8.new",
+V2LDST_tc_st_SLOT0, TypeV2LDST>, Enc_11224149, AddrModeRel {
+let Inst{2-2} = 0b0;
+let Inst{12-11} = 0b10;
+let Inst{31-21} = 0b01000110101;
+let isPredicated = 1;
+let isPredicatedFalse = 1;
+let addrMode = BaseImmOffset;
+let accessSize = WordAccess;
+let isNVStore = 1;
+let isPredicatedNew = 1;
+let mayStore = 1;
+let isNewValue = 1;
+let CextOpcode = "S2_storeri";
+let InputType = "imm";
+let BaseOpcode = "S2_storeri_io";
+let isExtendable = 1;
+let opExtendable = 2;
+let isExtentSigned = 0;
+let opExtentBits = 8;
+let opExtentAlign = 2;
+let opNewValue = 3;
+}
+def S4_pstorerinewfnew_rr : HInst<
+(outs),
+(ins PredRegs:$Pv4, IntRegs:$Rs32, IntRegs:$Ru32, u2_0Imm:$Ii, IntRegs:$Nt8),
+"if (!$Pv4.new) memw($Rs32+$Ru32<<#$Ii) = $Nt8.new",
+V4LDST_tc_st_SLOT0, TypeST>, Enc_11000933, AddrModeRel {
+let Inst{4-3} = 0b10;
+let Inst{31-21} = 0b00110111101;
+let isPredicated = 1;
+let isPredicatedFalse = 1;
+let addrMode = BaseRegOffset;
+let accessSize = WordAccess;
+let isNVStore = 1;
+let isPredicatedNew = 1;
+let mayStore = 1;
+let isNewValue = 1;
+let CextOpcode = "S2_storeri";
+let InputType = "reg";
+let BaseOpcode = "S2_storeri_rr";
+let opNewValue = 4;
+}
+def S4_pstorerinewfnew_zomap : HInst<
+(outs),
+(ins PredRegs:$Pv4, IntRegs:$Rs32, IntRegs:$Nt8),
+"if (!$Pv4.new) memw($Rs32) = $Nt8.new",
+PSEUDO, TypeMAPPING> {
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let opNewValue = 2;
+}
+def S4_pstorerinewt_abs : HInst<
+(outs),
+(ins PredRegs:$Pv4, u32_0Imm:$Ii, IntRegs:$Nt8),
+"if ($Pv4) memw(#$Ii) = $Nt8.new",
+NCJ_tc_3or4stall_SLOT0, TypeST>, Enc_1774350, AddrModeRel {
+let Inst{2-2} = 0b0;
+let Inst{7-7} = 0b1;
+let Inst{13-11} = 0b010;
+let Inst{31-18} = 0b10101111101000;
+let isPredicated = 1;
+let addrMode = Absolute;
+let accessSize = WordAccess;
+let isNVStore = 1;
+let isExtended = 1;
+let mayStore = 1;
+let isNewValue = 1;
+let CextOpcode = "S2_storeri";
+let BaseOpcode = "S2_storeriabs";
+let DecoderNamespace = "MustExtend";
+let isExtendable = 1;
+let opExtendable = 1;
+let isExtentSigned = 0;
+let opExtentBits = 6;
+let opExtentAlign = 0;
+let opNewValue = 2;
+}
+def S4_pstorerinewt_rr : HInst<
+(outs),
+(ins PredRegs:$Pv4, IntRegs:$Rs32, IntRegs:$Ru32, u2_0Imm:$Ii, IntRegs:$Nt8),
+"if ($Pv4) memw($Rs32+$Ru32<<#$Ii) = $Nt8.new",
+V4LDST_tc_st_SLOT0, TypeST>, Enc_11000933, AddrModeRel {
+let Inst{4-3} = 0b10;
+let Inst{31-21} = 0b00110100101;
+let isPredicated = 1;
+let addrMode = BaseRegOffset;
+let accessSize = WordAccess;
+let isNVStore = 1;
+let mayStore = 1;
+let isNewValue = 1;
+let CextOpcode = "S2_storeri";
+let InputType = "reg";
+let BaseOpcode = "S2_storeri_rr";
+let opNewValue = 4;
+}
+def S4_pstorerinewtnew_abs : HInst<
+(outs),
+(ins PredRegs:$Pv4, u32_0Imm:$Ii, IntRegs:$Nt8),
+"if ($Pv4.new) memw(#$Ii) = $Nt8.new",
+NCJ_tc_3or4stall_SLOT0, TypeST>, Enc_1774350, AddrModeRel {
+let Inst{2-2} = 0b0;
+let Inst{7-7} = 0b1;
+let Inst{13-11} = 0b110;
+let Inst{31-18} = 0b10101111101000;
+let isPredicated = 1;
+let addrMode = Absolute;
+let accessSize = WordAccess;
+let isNVStore = 1;
+let isExtended = 1;
+let isPredicatedNew = 1;
+let mayStore = 1;
+let isNewValue = 1;
+let CextOpcode = "S2_storeri";
+let BaseOpcode = "S2_storeriabs";
+let DecoderNamespace = "MustExtend";
+let isExtendable = 1;
+let opExtendable = 1;
+let isExtentSigned = 0;
+let opExtentBits = 6;
+let opExtentAlign = 0;
+let opNewValue = 2;
+}
+def S4_pstorerinewtnew_io : HInst<
+(outs),
+(ins PredRegs:$Pv4, IntRegs:$Rs32, u30_2Imm:$Ii, IntRegs:$Nt8),
+"if ($Pv4.new) memw($Rs32+#$Ii) = $Nt8.new",
+V2LDST_tc_st_SLOT0, TypeV2LDST>, Enc_11224149, AddrModeRel {
+let Inst{2-2} = 0b0;
+let Inst{12-11} = 0b10;
+let Inst{31-21} = 0b01000010101;
+let isPredicated = 1;
+let addrMode = BaseImmOffset;
+let accessSize = WordAccess;
+let isNVStore = 1;
+let isPredicatedNew = 1;
+let mayStore = 1;
+let isNewValue = 1;
+let CextOpcode = "S2_storeri";
+let InputType = "imm";
+let BaseOpcode = "S2_storeri_io";
+let isExtendable = 1;
+let opExtendable = 2;
+let isExtentSigned = 0;
+let opExtentBits = 8;
+let opExtentAlign = 2;
+let opNewValue = 3;
+}
+def S4_pstorerinewtnew_rr : HInst<
+(outs),
+(ins PredRegs:$Pv4, IntRegs:$Rs32, IntRegs:$Ru32, u2_0Imm:$Ii, IntRegs:$Nt8),
+"if ($Pv4.new) memw($Rs32+$Ru32<<#$Ii) = $Nt8.new",
+V4LDST_tc_st_SLOT0, TypeST>, Enc_11000933, AddrModeRel {
+let Inst{4-3} = 0b10;
+let Inst{31-21} = 0b00110110101;
+let isPredicated = 1;
+let addrMode = BaseRegOffset;
+let accessSize = WordAccess;
+let isNVStore = 1;
+let isPredicatedNew = 1;
+let mayStore = 1;
+let isNewValue = 1;
+let CextOpcode = "S2_storeri";
+let InputType = "reg";
+let BaseOpcode = "S2_storeri_rr";
+let opNewValue = 4;
+}
+def S4_pstorerinewtnew_zomap : HInst<
+(outs),
+(ins PredRegs:$Pv4, IntRegs:$Rs32, IntRegs:$Nt8),
+"if ($Pv4.new) memw($Rs32) = $Nt8.new",
+PSEUDO, TypeMAPPING> {
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let opNewValue = 2;
+}
+def S4_pstorerit_abs : HInst<
+(outs),
+(ins PredRegs:$Pv4, u32_0Imm:$Ii, IntRegs:$Rt32),
+"if ($Pv4) memw(#$Ii) = $Rt32",
+ST_tc_st_SLOT01, TypeST>, Enc_16657398, AddrModeRel {
+let Inst{2-2} = 0b0;
+let Inst{7-7} = 0b1;
+let Inst{13-13} = 0b0;
+let Inst{31-18} = 0b10101111100000;
+let isPredicated = 1;
+let addrMode = Absolute;
+let accessSize = WordAccess;
+let isExtended = 1;
+let mayStore = 1;
+let CextOpcode = "S2_storeri";
+let BaseOpcode = "S2_storeriabs";
+let isNVStorable = 1;
+let DecoderNamespace = "MustExtend";
+let isExtendable = 1;
+let opExtendable = 1;
+let isExtentSigned = 0;
+let opExtentBits = 6;
+let opExtentAlign = 0;
+}
+def S4_pstorerit_rr : HInst<
+(outs),
+(ins PredRegs:$Pv4, IntRegs:$Rs32, IntRegs:$Ru32, u2_0Imm:$Ii, IntRegs:$Rt32),
+"if ($Pv4) memw($Rs32+$Ru32<<#$Ii) = $Rt32",
+V4LDST_tc_st_SLOT01, TypeST>, Enc_11940513, AddrModeRel {
+let Inst{31-21} = 0b00110100100;
+let isPredicated = 1;
+let addrMode = BaseRegOffset;
+let accessSize = WordAccess;
+let mayStore = 1;
+let CextOpcode = "S2_storeri";
+let InputType = "reg";
+let BaseOpcode = "S2_storeri_rr";
+let isNVStorable = 1;
+}
+def S4_pstoreritnew_abs : HInst<
+(outs),
+(ins PredRegs:$Pv4, u32_0Imm:$Ii, IntRegs:$Rt32),
+"if ($Pv4.new) memw(#$Ii) = $Rt32",
+ST_tc_st_SLOT01, TypeST>, Enc_16657398, AddrModeRel {
+let Inst{2-2} = 0b0;
+let Inst{7-7} = 0b1;
+let Inst{13-13} = 0b1;
+let Inst{31-18} = 0b10101111100000;
+let isPredicated = 1;
+let addrMode = Absolute;
+let accessSize = WordAccess;
+let isExtended = 1;
+let isPredicatedNew = 1;
+let mayStore = 1;
+let CextOpcode = "S2_storeri";
+let BaseOpcode = "S2_storeriabs";
+let isNVStorable = 1;
+let DecoderNamespace = "MustExtend";
+let isExtendable = 1;
+let opExtendable = 1;
+let isExtentSigned = 0;
+let opExtentBits = 6;
+let opExtentAlign = 0;
+}
+def S4_pstoreritnew_io : HInst<
+(outs),
+(ins PredRegs:$Pv4, IntRegs:$Rs32, u30_2Imm:$Ii, IntRegs:$Rt32),
+"if ($Pv4.new) memw($Rs32+#$Ii) = $Rt32",
+V2LDST_tc_st_SLOT01, TypeV2LDST>, Enc_8225953, AddrModeRel {
+let Inst{2-2} = 0b0;
+let Inst{31-21} = 0b01000010100;
+let isPredicated = 1;
+let addrMode = BaseImmOffset;
+let accessSize = WordAccess;
+let isPredicatedNew = 1;
+let mayStore = 1;
+let CextOpcode = "S2_storeri";
+let InputType = "imm";
+let BaseOpcode = "S2_storeri_io";
+let isNVStorable = 1;
+let isExtendable = 1;
+let opExtendable = 2;
+let isExtentSigned = 0;
+let opExtentBits = 8;
+let opExtentAlign = 2;
+}
+def S4_pstoreritnew_rr : HInst<
+(outs),
+(ins PredRegs:$Pv4, IntRegs:$Rs32, IntRegs:$Ru32, u2_0Imm:$Ii, IntRegs:$Rt32),
+"if ($Pv4.new) memw($Rs32+$Ru32<<#$Ii) = $Rt32",
+V4LDST_tc_st_SLOT01, TypeST>, Enc_11940513, AddrModeRel {
+let Inst{31-21} = 0b00110110100;
+let isPredicated = 1;
+let addrMode = BaseRegOffset;
+let accessSize = WordAccess;
+let isPredicatedNew = 1;
+let mayStore = 1;
+let CextOpcode = "S2_storeri";
+let InputType = "reg";
+let BaseOpcode = "S2_storeri_rr";
+let isNVStorable = 1;
+}
+def S4_pstoreritnew_zomap : HInst<
+(outs),
+(ins PredRegs:$Pv4, IntRegs:$Rs32, IntRegs:$Rt32),
+"if ($Pv4.new) memw($Rs32) = $Rt32",
+PSEUDO, TypeMAPPING> {
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+}
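+// Store-conditional: pairs with the memd_locked load to form an LL/SC
+// sequence. $Pd4 is set when the store succeeds (the reservation is still
+// held); isPredicateLate marks the predicate result as available late.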
+def S4_stored_locked : HInst<
+(outs PredRegs:$Pd4),
+(ins IntRegs:$Rs32, DoubleRegs:$Rtt32),
+"memd_locked($Rs32,$Pd4) = $Rtt32",
+ST_tc_ld_SLOT0, TypeST>, Enc_2921694 {
+let Inst{7-2} = 0b000000;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b10100000111;
+let accessSize = DoubleWordAccess;
+let isSoloAX = 1;
+let mayStore = 1;
+let isPredicateLate = 1;
+}
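+// Store-immediate forms: memb/memh/memw($Rs32+#u6) = #imm. The
+// unpredicated base forms take a signed 8-bit immediate (extendable to
+// #s32); the predicated t/f and dot-new variants narrow the extent to
+// 6 bits.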
+def S4_storeirb_io : HInst<
+(outs),
+(ins IntRegs:$Rs32, u6_0Imm:$Ii, s32_0Imm:$II),
+"memb($Rs32+#$Ii) = #$II",
+V4LDST_tc_st_SLOT01, TypeST>, Enc_11282123, PredNewRel {
+let Inst{31-21} = 0b00111100000;
+let addrMode = BaseImmOffset;
+let accessSize = ByteAccess;
+let mayStore = 1;
+let CextOpcode = "S2_storerb";
+let InputType = "imm";
+let BaseOpcode = "S4_storeirb_io";
+let isPredicable = 1;
+let isExtendable = 1;
+let opExtendable = 2;
+let isExtentSigned = 1;
+let opExtentBits = 8;
+let opExtentAlign = 0;
+}
+def S4_storeirb_zomap : HInst<
+(outs),
+(ins IntRegs:$Rs32, s8_0Imm:$II),
+"memb($Rs32) = #$II",
+PSEUDO, TypeMAPPING> {
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+}
+def S4_storeirbf_io : HInst<
+(outs),
+(ins PredRegs:$Pv4, IntRegs:$Rs32, u6_0Imm:$Ii, s32_0Imm:$II),
+"if (!$Pv4) memb($Rs32+#$Ii) = #$II",
+V4LDST_tc_st_SLOT01, TypeST>, Enc_5967898, PredNewRel {
+let Inst{31-21} = 0b00111000100;
+let isPredicated = 1;
+let isPredicatedFalse = 1;
+let addrMode = BaseImmOffset;
+let accessSize = ByteAccess;
+let mayStore = 1;
+let CextOpcode = "S2_storerb";
+let InputType = "imm";
+let BaseOpcode = "S4_storeirb_io";
+let isExtendable = 1;
+let opExtendable = 3;
+let isExtentSigned = 1;
+let opExtentBits = 6;
+let opExtentAlign = 0;
+}
+def S4_storeirbf_zomap : HInst<
+(outs),
+(ins PredRegs:$Pv4, IntRegs:$Rs32, s6_0Imm:$II),
+"if (!$Pv4) memb($Rs32) = #$II",
+PSEUDO, TypeMAPPING> {
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+}
+def S4_storeirbfnew_io : HInst<
+(outs),
+(ins PredRegs:$Pv4, IntRegs:$Rs32, u6_0Imm:$Ii, s32_0Imm:$II),
+"if (!$Pv4.new) memb($Rs32+#$Ii) = #$II",
+V4LDST_tc_st_SLOT01, TypeST>, Enc_5967898, PredNewRel {
+let Inst{31-21} = 0b00111001100;
+let isPredicated = 1;
+let isPredicatedFalse = 1;
+let addrMode = BaseImmOffset;
+let accessSize = ByteAccess;
+let isPredicatedNew = 1;
+let mayStore = 1;
+let CextOpcode = "S2_storerb";
+let InputType = "imm";
+let BaseOpcode = "S4_storeirb_io";
+let isExtendable = 1;
+let opExtendable = 3;
+let isExtentSigned = 1;
+let opExtentBits = 6;
+let opExtentAlign = 0;
+}
+def S4_storeirbfnew_zomap : HInst<
+(outs),
+(ins PredRegs:$Pv4, IntRegs:$Rs32, s6_0Imm:$II),
+"if (!$Pv4.new) memb($Rs32) = #$II",
+PSEUDO, TypeMAPPING> {
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+}
+def S4_storeirbt_io : HInst<
+(outs),
+(ins PredRegs:$Pv4, IntRegs:$Rs32, u6_0Imm:$Ii, s32_0Imm:$II),
+"if ($Pv4) memb($Rs32+#$Ii) = #$II",
+V4LDST_tc_st_SLOT01, TypeST>, Enc_5967898, PredNewRel {
+let Inst{31-21} = 0b00111000000;
+let isPredicated = 1;
+let addrMode = BaseImmOffset;
+let accessSize = ByteAccess;
+let mayStore = 1;
+let CextOpcode = "S2_storerb";
+let InputType = "imm";
+let BaseOpcode = "S4_storeirb_io";
+let isExtendable = 1;
+let opExtendable = 3;
+let isExtentSigned = 1;
+let opExtentBits = 6;
+let opExtentAlign = 0;
+}
+def S4_storeirbt_zomap : HInst<
+(outs),
+(ins PredRegs:$Pv4, IntRegs:$Rs32, s6_0Imm:$II),
+"if ($Pv4) memb($Rs32) = #$II",
+PSEUDO, TypeMAPPING> {
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+}
+def S4_storeirbtnew_io : HInst<
+(outs),
+(ins PredRegs:$Pv4, IntRegs:$Rs32, u6_0Imm:$Ii, s32_0Imm:$II),
+"if ($Pv4.new) memb($Rs32+#$Ii) = #$II",
+V4LDST_tc_st_SLOT01, TypeST>, Enc_5967898, PredNewRel {
+let Inst{31-21} = 0b00111001000;
+let isPredicated = 1;
+let addrMode = BaseImmOffset;
+let accessSize = ByteAccess;
+let isPredicatedNew = 1;
+let mayStore = 1;
+let CextOpcode = "S2_storerb";
+let InputType = "imm";
+let BaseOpcode = "S4_storeirb_io";
+let isExtendable = 1;
+let opExtendable = 3;
+let isExtentSigned = 1;
+let opExtentBits = 6;
+let opExtentAlign = 0;
+}
+def S4_storeirbtnew_zomap : HInst<
+(outs),
+(ins PredRegs:$Pv4, IntRegs:$Rs32, s6_0Imm:$II),
+"if ($Pv4.new) memb($Rs32) = #$II",
+PSEUDO, TypeMAPPING> {
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+}
+def S4_storeirh_io : HInst<
+(outs),
+(ins IntRegs:$Rs32, u6_1Imm:$Ii, s32_0Imm:$II),
+"memh($Rs32+#$Ii) = #$II",
+V4LDST_tc_st_SLOT01, TypeST>, Enc_10282127, PredNewRel {
+let Inst{31-21} = 0b00111100001;
+let addrMode = BaseImmOffset;
+let accessSize = HalfWordAccess;
+let mayStore = 1;
+let CextOpcode = "S2_storerh";
+let InputType = "imm";
+let BaseOpcode = "S4_storeirh_io";
+let isPredicable = 1;
+let isExtendable = 1;
+let opExtendable = 2;
+let isExtentSigned = 1;
+let opExtentBits = 8;
+let opExtentAlign = 0;
+}
+def S4_storeirh_zomap : HInst<
+(outs),
+(ins IntRegs:$Rs32, s8_0Imm:$II),
+"memh($Rs32) = #$II",
+PSEUDO, TypeMAPPING> {
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+}
+def S4_storeirhf_io : HInst<
+(outs),
+(ins PredRegs:$Pv4, IntRegs:$Rs32, u6_1Imm:$Ii, s32_0Imm:$II),
+"if (!$Pv4) memh($Rs32+#$Ii) = #$II",
+V4LDST_tc_st_SLOT01, TypeST>, Enc_4967902, PredNewRel {
+let Inst{31-21} = 0b00111000101;
+let isPredicated = 1;
+let isPredicatedFalse = 1;
+let addrMode = BaseImmOffset;
+let accessSize = HalfWordAccess;
+let mayStore = 1;
+let CextOpcode = "S2_storerh";
+let InputType = "imm";
+let BaseOpcode = "S4_storeirh_io";
+let isExtendable = 1;
+let opExtendable = 3;
+let isExtentSigned = 1;
+let opExtentBits = 6;
+let opExtentAlign = 0;
+}
+def S4_storeirhf_zomap : HInst<
+(outs),
+(ins PredRegs:$Pv4, IntRegs:$Rs32, s6_0Imm:$II),
+"if (!$Pv4) memh($Rs32) = #$II",
+PSEUDO, TypeMAPPING> {
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+}
+def S4_storeirhfnew_io : HInst<
+(outs),
+(ins PredRegs:$Pv4, IntRegs:$Rs32, u6_1Imm:$Ii, s32_0Imm:$II),
+"if (!$Pv4.new) memh($Rs32+#$Ii) = #$II",
+V4LDST_tc_st_SLOT01, TypeST>, Enc_4967902, PredNewRel {
+let Inst{31-21} = 0b00111001101;
+let isPredicated = 1;
+let isPredicatedFalse = 1;
+let addrMode = BaseImmOffset;
+let accessSize = HalfWordAccess;
+let isPredicatedNew = 1;
+let mayStore = 1;
+let CextOpcode = "S2_storerh";
+let InputType = "imm";
+let BaseOpcode = "S4_storeirh_io";
+let isExtendable = 1;
+let opExtendable = 3;
+let isExtentSigned = 1;
+let opExtentBits = 6;
+let opExtentAlign = 0;
+}
+def S4_storeirhfnew_zomap : HInst<
+(outs),
+(ins PredRegs:$Pv4, IntRegs:$Rs32, s6_0Imm:$II),
+"if (!$Pv4.new) memh($Rs32) = #$II",
+PSEUDO, TypeMAPPING> {
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+}
+def S4_storeirht_io : HInst<
+(outs),
+(ins PredRegs:$Pv4, IntRegs:$Rs32, u6_1Imm:$Ii, s32_0Imm:$II),
+"if ($Pv4) memh($Rs32+#$Ii) = #$II",
+V4LDST_tc_st_SLOT01, TypeST>, Enc_4967902, PredNewRel {
+let Inst{31-21} = 0b00111000001;
+let isPredicated = 1;
+let addrMode = BaseImmOffset;
+let accessSize = HalfWordAccess;
+let mayStore = 1;
+let CextOpcode = "S2_storerh";
+let InputType = "imm";
+let BaseOpcode = "S4_storeirh_io";
+let isExtendable = 1;
+let opExtendable = 3;
+let isExtentSigned = 1;
+let opExtentBits = 6;
+let opExtentAlign = 0;
+}
+def S4_storeirht_zomap : HInst<
+(outs),
+(ins PredRegs:$Pv4, IntRegs:$Rs32, s6_0Imm:$II),
+"if ($Pv4) memh($Rs32) = #$II",
+PSEUDO, TypeMAPPING> {
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+}
+def S4_storeirhtnew_io : HInst<
+(outs),
+(ins PredRegs:$Pv4, IntRegs:$Rs32, u6_1Imm:$Ii, s32_0Imm:$II),
+"if ($Pv4.new) memh($Rs32+#$Ii) = #$II",
+V4LDST_tc_st_SLOT01, TypeST>, Enc_4967902, PredNewRel {
+let Inst{31-21} = 0b00111001001;
+let isPredicated = 1;
+let addrMode = BaseImmOffset;
+let accessSize = HalfWordAccess;
+let isPredicatedNew = 1;
+let mayStore = 1;
+let CextOpcode = "S2_storerh";
+let InputType = "imm";
+let BaseOpcode = "S4_storeirh_io";
+let isExtendable = 1;
+let opExtendable = 3;
+let isExtentSigned = 1;
+let opExtentBits = 6;
+let opExtentAlign = 0;
+}
+def S4_storeirhtnew_zomap : HInst<
+(outs),
+(ins PredRegs:$Pv4, IntRegs:$Rs32, s6_0Imm:$II),
+"if ($Pv4.new) memh($Rs32) = #$II",
+PSEUDO, TypeMAPPING> {
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+}
+def S4_storeiri_io : HInst<
+(outs),
+(ins IntRegs:$Rs32, u6_2Imm:$Ii, s32_0Imm:$II),
+"memw($Rs32+#$Ii) = #$II",
+V4LDST_tc_st_SLOT01, TypeST>, Enc_9282127, PredNewRel {
+let Inst{31-21} = 0b00111100010;
+let addrMode = BaseImmOffset;
+let accessSize = WordAccess;
+let mayStore = 1;
+let CextOpcode = "S2_storeri";
+let InputType = "imm";
+let BaseOpcode = "S4_storeiri_io";
+let isPredicable = 1;
+let isExtendable = 1;
+let opExtendable = 2;
+let isExtentSigned = 1;
+let opExtentBits = 8;
+let opExtentAlign = 0;
+}
+def S4_storeiri_zomap : HInst<
+(outs),
+(ins IntRegs:$Rs32, s8_0Imm:$II),
+"memw($Rs32) = #$II",
+PSEUDO, TypeMAPPING> {
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+}
+def S4_storeirif_io : HInst<
+(outs),
+(ins PredRegs:$Pv4, IntRegs:$Rs32, u6_2Imm:$Ii, s32_0Imm:$II),
+"if (!$Pv4) memw($Rs32+#$Ii) = #$II",
+V4LDST_tc_st_SLOT01, TypeST>, Enc_3967902, PredNewRel {
+let Inst{31-21} = 0b00111000110;
+let isPredicated = 1;
+let isPredicatedFalse = 1;
+let addrMode = BaseImmOffset;
+let accessSize = WordAccess;
+let mayStore = 1;
+let CextOpcode = "S2_storeri";
+let InputType = "imm";
+let BaseOpcode = "S4_storeiri_io";
+let isExtendable = 1;
+let opExtendable = 3;
+let isExtentSigned = 1;
+let opExtentBits = 6;
+let opExtentAlign = 0;
+}
+def S4_storeirif_zomap : HInst<
+(outs),
+(ins PredRegs:$Pv4, IntRegs:$Rs32, s6_0Imm:$II),
+"if (!$Pv4) memw($Rs32) = #$II",
+PSEUDO, TypeMAPPING> {
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+}
+def S4_storeirifnew_io : HInst<
+(outs),
+(ins PredRegs:$Pv4, IntRegs:$Rs32, u6_2Imm:$Ii, s32_0Imm:$II),
+"if (!$Pv4.new) memw($Rs32+#$Ii) = #$II",
+V4LDST_tc_st_SLOT01, TypeST>, Enc_3967902, PredNewRel {
+let Inst{31-21} = 0b00111001110;
+let isPredicated = 1;
+let isPredicatedFalse = 1;
+let addrMode = BaseImmOffset;
+let accessSize = WordAccess;
+let isPredicatedNew = 1;
+let mayStore = 1;
+let CextOpcode = "S2_storeri";
+let InputType = "imm";
+let BaseOpcode = "S4_storeiri_io";
+let isExtendable = 1;
+let opExtendable = 3;
+let isExtentSigned = 1;
+let opExtentBits = 6;
+let opExtentAlign = 0;
+}
+def S4_storeirifnew_zomap : HInst<
+(outs),
+(ins PredRegs:$Pv4, IntRegs:$Rs32, s6_0Imm:$II),
+"if (!$Pv4.new) memw($Rs32) = #$II",
+PSEUDO, TypeMAPPING> {
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+}
+def S4_storeirit_io : HInst<
+(outs),
+(ins PredRegs:$Pv4, IntRegs:$Rs32, u6_2Imm:$Ii, s32_0Imm:$II),
+"if ($Pv4) memw($Rs32+#$Ii) = #$II",
+V4LDST_tc_st_SLOT01, TypeST>, Enc_3967902, PredNewRel {
+let Inst{31-21} = 0b00111000010;
+let isPredicated = 1;
+let addrMode = BaseImmOffset;
+let accessSize = WordAccess;
+let mayStore = 1;
+let CextOpcode = "S2_storeri";
+let InputType = "imm";
+let BaseOpcode = "S4_storeiri_io";
+let isExtendable = 1;
+let opExtendable = 3;
+let isExtentSigned = 1;
+let opExtentBits = 6;
+let opExtentAlign = 0;
+}
+def S4_storeirit_zomap : HInst<
+(outs),
+(ins PredRegs:$Pv4, IntRegs:$Rs32, s6_0Imm:$II),
+"if ($Pv4) memw($Rs32) = #$II",
+PSEUDO, TypeMAPPING> {
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+}
+def S4_storeiritnew_io : HInst<
+(outs),
+(ins PredRegs:$Pv4, IntRegs:$Rs32, u6_2Imm:$Ii, s32_0Imm:$II),
+"if ($Pv4.new) memw($Rs32+#$Ii) = #$II",
+V4LDST_tc_st_SLOT01, TypeST>, Enc_3967902, PredNewRel {
+let Inst{31-21} = 0b00111001010;
+let isPredicated = 1;
+let addrMode = BaseImmOffset;
+let accessSize = WordAccess;
+let isPredicatedNew = 1;
+let mayStore = 1;
+let CextOpcode = "S2_storeri";
+let InputType = "imm";
+let BaseOpcode = "S4_storeiri_io";
+let isExtendable = 1;
+let opExtendable = 3;
+let isExtentSigned = 1;
+let opExtentBits = 6;
+let opExtentAlign = 0;
+}
+def S4_storeiritnew_zomap : HInst<
+(outs),
+(ins PredRegs:$Pv4, IntRegs:$Rs32, s6_0Imm:$II),
+"if ($Pv4.new) memw($Rs32) = #$II",
+PSEUDO, TypeMAPPING> {
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+}
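+// Unpredicated store variants by addressing mode: _ap is absolute-set
+// (memX($Re32=#$II) = ..., which also writes the effective address back
+// into $Re32), _rr is base + $Ru32<<#u2, and _ur is $Ru32<<#u2 plus a
+// constant-extended global offset (#$II).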
+def S4_storerb_ap : HInst<
+(outs IntRegs:$Re32),
+(ins u32_0Imm:$II, IntRegs:$Rt32),
+"memb($Re32=#$II) = $Rt32",
+ST_tc_st_SLOT01, TypeST>, Enc_11477246, AddrModeRel {
+let Inst{7-6} = 0b10;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b10101011000;
+let hasNewValue = 1;
+let opNewValue = 0;
+let addrMode = AbsoluteSet;
+let accessSize = ByteAccess;
+let isExtended = 1;
+let mayStore = 1;
+let BaseOpcode = "S2_storerb_ap";
+let isNVStorable = 1;
+let DecoderNamespace = "MustExtend";
+let isExtendable = 1;
+let opExtendable = 1;
+let isExtentSigned = 0;
+let opExtentBits = 6;
+let opExtentAlign = 0;
+}
+def S4_storerb_rr : HInst<
+(outs),
+(ins IntRegs:$Rs32, IntRegs:$Ru32, u2_0Imm:$Ii, IntRegs:$Rt32),
+"memb($Rs32+$Ru32<<#$Ii) = $Rt32",
+V4LDST_tc_st_SLOT01, TypeST>, Enc_14046916, AddrModeRel, ImmRegShl {
+let Inst{6-5} = 0b00;
+let Inst{31-21} = 0b00111011000;
+let addrMode = BaseRegOffset;
+let accessSize = ByteAccess;
+let mayStore = 1;
+let CextOpcode = "S2_storerb";
+let InputType = "reg";
+let BaseOpcode = "S4_storerb_rr";
+let isNVStorable = 1;
+let isPredicable = 1;
+}
+def S4_storerb_ur : HInst<
+(outs),
+(ins IntRegs:$Ru32, u2_0Imm:$Ii, u32_0Imm:$II, IntRegs:$Rt32),
+"memb($Ru32<<#$Ii+#$II) = $Rt32",
+ST_tc_st_SLOT01, TypeST>, Enc_14689096, AddrModeRel, ImmRegShl {
+let Inst{7-7} = 0b1;
+let Inst{31-21} = 0b10101101000;
+let addrMode = BaseLongOffset;
+let accessSize = ByteAccess;
+let isExtended = 1;
+let mayStore = 1;
+let CextOpcode = "S2_storerb";
+let InputType = "imm";
+let BaseOpcode = "S4_storerb_ur";
+let isNVStorable = 1;
+let DecoderNamespace = "MustExtend";
+let isExtendable = 1;
+let opExtendable = 2;
+let isExtentSigned = 0;
+let opExtentBits = 6;
+let opExtentAlign = 0;
+}
+def S4_storerbnew_ap : HInst<
+(outs IntRegs:$Re32),
+(ins u32_0Imm:$II, IntRegs:$Nt8),
+"memb($Re32=#$II) = $Nt8.new",
+NCJ_tc_3or4stall_SLOT0, TypeST>, Enc_14193700, AddrModeRel {
+let Inst{7-6} = 0b10;
+let Inst{13-11} = 0b000;
+let Inst{31-21} = 0b10101011101;
+let hasNewValue = 1;
+let opNewValue = 0;
+let addrMode = AbsoluteSet;
+let accessSize = ByteAccess;
+let isNVStore = 1;
+let isExtended = 1;
+let mayStore = 1;
+let isNewValue = 1;
+let BaseOpcode = "S2_storerb_ap";
+let DecoderNamespace = "MustExtend";
+let isExtendable = 1;
+let opExtendable = 1;
+let isExtentSigned = 0;
+let opExtentBits = 6;
+let opExtentAlign = 0;
+let opNewValue = 2;
+}
+def S4_storerbnew_rr : HInst<
+(outs),
+(ins IntRegs:$Rs32, IntRegs:$Ru32, u2_0Imm:$Ii, IntRegs:$Nt8),
+"memb($Rs32+$Ru32<<#$Ii) = $Nt8.new",
+V4LDST_tc_st_SLOT0, TypeST>, Enc_5486172, AddrModeRel {
+let Inst{6-3} = 0b0000;
+let Inst{31-21} = 0b00111011101;
+let addrMode = BaseRegOffset;
+let accessSize = ByteAccess;
+let isNVStore = 1;
+let mayStore = 1;
+let isNewValue = 1;
+let CextOpcode = "S2_storerb";
+let InputType = "reg";
+let BaseOpcode = "S4_storerb_rr";
+let isPredicable = 1;
+let opNewValue = 3;
+}
+def S4_storerbnew_ur : HInst<
+(outs),
+(ins IntRegs:$Ru32, u2_0Imm:$Ii, u32_0Imm:$II, IntRegs:$Nt8),
+"memb($Ru32<<#$Ii+#$II) = $Nt8.new",
+NCJ_tc_3or4stall_SLOT0, TypeST>, Enc_10076500, AddrModeRel {
+let Inst{7-7} = 0b1;
+let Inst{12-11} = 0b00;
+let Inst{31-21} = 0b10101101101;
+let addrMode = BaseLongOffset;
+let accessSize = ByteAccess;
+let isNVStore = 1;
+let isExtended = 1;
+let mayStore = 1;
+let isNewValue = 1;
+let CextOpcode = "S2_storerb";
+let BaseOpcode = "S4_storerb_ur";
+let DecoderNamespace = "MustExtend";
+let isExtendable = 1;
+let opExtendable = 2;
+let isExtentSigned = 0;
+let opExtentBits = 6;
+let opExtentAlign = 0;
+let opNewValue = 3;
+}
+def S4_storerd_ap : HInst<
+(outs IntRegs:$Re32),
+(ins u32_0Imm:$II, DoubleRegs:$Rtt32),
+"memd($Re32=#$II) = $Rtt32",
+ST_tc_st_SLOT01, TypeST>, Enc_8131399 {
+let Inst{7-6} = 0b10;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b10101011110;
+let hasNewValue = 1;
+let opNewValue = 0;
+let addrMode = AbsoluteSet;
+let accessSize = DoubleWordAccess;
+let isExtended = 1;
+let mayStore = 1;
+let BaseOpcode = "S4_storerd_ap";
+let DecoderNamespace = "MustExtend";
+let isExtendable = 1;
+let opExtendable = 1;
+let isExtentSigned = 0;
+let opExtentBits = 6;
+let opExtentAlign = 0;
+}
+def S4_storerd_rr : HInst<
+(outs),
+(ins IntRegs:$Rs32, IntRegs:$Ru32, u2_0Imm:$Ii, DoubleRegs:$Rtt32),
+"memd($Rs32+$Ru32<<#$Ii) = $Rtt32",
+V4LDST_tc_st_SLOT01, TypeST>, Enc_9772987, AddrModeRel, ImmRegShl {
+let Inst{6-5} = 0b00;
+let Inst{31-21} = 0b00111011110;
+let addrMode = BaseRegOffset;
+let accessSize = DoubleWordAccess;
+let mayStore = 1;
+let CextOpcode = "S2_storerd";
+let InputType = "reg";
+let BaseOpcode = "S2_storerd_rr";
+let isPredicable = 1;
+}
+def S4_storerd_ur : HInst<
+(outs),
+(ins IntRegs:$Ru32, u2_0Imm:$Ii, u32_0Imm:$II, DoubleRegs:$Rtt32),
+"memd($Ru32<<#$Ii+#$II) = $Rtt32",
+ST_tc_st_SLOT01, TypeST>, Enc_12848507, AddrModeRel, ImmRegShl {
+let Inst{7-7} = 0b1;
+let Inst{31-21} = 0b10101101110;
+let addrMode = BaseLongOffset;
+let accessSize = DoubleWordAccess;
+let isExtended = 1;
+let mayStore = 1;
+let CextOpcode = "S2_storerd";
+let InputType = "imm";
+let BaseOpcode = "S2_storerd_ur";
+let DecoderNamespace = "MustExtend";
+let isExtendable = 1;
+let opExtendable = 2;
+let isExtentSigned = 0;
+let opExtentBits = 6;
+let opExtentAlign = 0;
+}
+def S4_storerf_ap : HInst<
+(outs IntRegs:$Re32),
+(ins u32_0Imm:$II, IntRegs:$Rt32),
+"memh($Re32=#$II) = $Rt32.h",
+ST_tc_st_SLOT01, TypeST>, Enc_11477246 {
+let Inst{7-6} = 0b10;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b10101011011;
+let hasNewValue = 1;
+let opNewValue = 0;
+let addrMode = AbsoluteSet;
+let accessSize = HalfWordAccess;
+let isExtended = 1;
+let mayStore = 1;
+let BaseOpcode = "S4_storerf_ap";
+let DecoderNamespace = "MustExtend";
+let isExtendable = 1;
+let opExtendable = 1;
+let isExtentSigned = 0;
+let opExtentBits = 6;
+let opExtentAlign = 0;
+}
+def S4_storerf_rr : HInst<
+(outs),
+(ins IntRegs:$Rs32, IntRegs:$Ru32, u2_0Imm:$Ii, IntRegs:$Rt32),
+"memh($Rs32+$Ru32<<#$Ii) = $Rt32.h",
+V4LDST_tc_st_SLOT01, TypeST>, Enc_14046916, AddrModeRel, ImmRegShl {
+let Inst{6-5} = 0b00;
+let Inst{31-21} = 0b00111011011;
+let addrMode = BaseRegOffset;
+let accessSize = HalfWordAccess;
+let mayStore = 1;
+let CextOpcode = "S2_storerf";
+let InputType = "reg";
+let BaseOpcode = "S4_storerf_rr";
+let isPredicable = 1;
+}
+def S4_storerf_ur : HInst<
+(outs),
+(ins IntRegs:$Ru32, u2_0Imm:$Ii, u32_0Imm:$II, IntRegs:$Rt32),
+"memh($Ru32<<#$Ii+#$II) = $Rt32.h",
+ST_tc_st_SLOT01, TypeST>, Enc_14689096, AddrModeRel, ImmRegShl {
+let Inst{7-7} = 0b1;
+let Inst{31-21} = 0b10101101011;
+let addrMode = BaseLongOffset;
+let accessSize = HalfWordAccess;
+let isExtended = 1;
+let mayStore = 1;
+let CextOpcode = "S2_storerf";
+let InputType = "imm";
+let BaseOpcode = "S4_storerf_rr";
+let DecoderNamespace = "MustExtend";
+let isExtendable = 1;
+let opExtendable = 2;
+let isExtentSigned = 0;
+let opExtentBits = 6;
+let opExtentAlign = 0;
+}
+def S4_storerh_ap : HInst<
+(outs IntRegs:$Re32),
+(ins u32_0Imm:$II, IntRegs:$Rt32),
+"memh($Re32=#$II) = $Rt32",
+ST_tc_st_SLOT01, TypeST>, Enc_11477246, AddrModeRel {
+let Inst{7-6} = 0b10;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b10101011010;
+let hasNewValue = 1;
+let opNewValue = 0;
+let addrMode = AbsoluteSet;
+let accessSize = HalfWordAccess;
+let isExtended = 1;
+let mayStore = 1;
+let BaseOpcode = "S2_storerh_ap";
+let isNVStorable = 1;
+let DecoderNamespace = "MustExtend";
+let isExtendable = 1;
+let opExtendable = 1;
+let isExtentSigned = 0;
+let opExtentBits = 6;
+let opExtentAlign = 0;
+}
+def S4_storerh_rr : HInst<
+(outs),
+(ins IntRegs:$Rs32, IntRegs:$Ru32, u2_0Imm:$Ii, IntRegs:$Rt32),
+"memh($Rs32+$Ru32<<#$Ii) = $Rt32",
+V4LDST_tc_st_SLOT01, TypeST>, Enc_14046916, AddrModeRel, ImmRegShl {
+let Inst{6-5} = 0b00;
+let Inst{31-21} = 0b00111011010;
+let addrMode = BaseRegOffset;
+let accessSize = HalfWordAccess;
+let mayStore = 1;
+let CextOpcode = "S2_storerh";
+let InputType = "reg";
+let BaseOpcode = "S2_storerh_rr";
+let isNVStorable = 1;
+let isPredicable = 1;
+}
+def S4_storerh_ur : HInst<
+(outs),
+(ins IntRegs:$Ru32, u2_0Imm:$Ii, u32_0Imm:$II, IntRegs:$Rt32),
+"memh($Ru32<<#$Ii+#$II) = $Rt32",
+ST_tc_st_SLOT01, TypeST>, Enc_14689096, AddrModeRel, ImmRegShl {
+let Inst{7-7} = 0b1;
+let Inst{31-21} = 0b10101101010;
+let addrMode = BaseLongOffset;
+let accessSize = HalfWordAccess;
+let isExtended = 1;
+let mayStore = 1;
+let CextOpcode = "S2_storerh";
+let InputType = "imm";
+let BaseOpcode = "S2_storerh_ur";
+let isNVStorable = 1;
+let DecoderNamespace = "MustExtend";
+let isExtendable = 1;
+let opExtendable = 2;
+let isExtentSigned = 0;
+let opExtentBits = 6;
+let opExtentAlign = 0;
+}
+def S4_storerhnew_ap : HInst<
+(outs IntRegs:$Re32),
+(ins u32_0Imm:$II, IntRegs:$Nt8),
+"memh($Re32=#$II) = $Nt8.new",
+NCJ_tc_3or4stall_SLOT0, TypeST>, Enc_14193700, AddrModeRel {
+let Inst{7-6} = 0b10;
+let Inst{13-11} = 0b001;
+let Inst{31-21} = 0b10101011101;
+let hasNewValue = 1;
+let opNewValue = 0;
+let addrMode = AbsoluteSet;
+let accessSize = HalfWordAccess;
+let isNVStore = 1;
+let isExtended = 1;
+let mayStore = 1;
+let isNewValue = 1;
+let BaseOpcode = "S2_storerh_ap";
+let DecoderNamespace = "MustExtend";
+let isExtendable = 1;
+let opExtendable = 1;
+let isExtentSigned = 0;
+let opExtentBits = 6;
+let opExtentAlign = 0;
+let opNewValue = 2;
+}
+def S4_storerhnew_rr : HInst<
+(outs),
+(ins IntRegs:$Rs32, IntRegs:$Ru32, u2_0Imm:$Ii, IntRegs:$Nt8),
+"memh($Rs32+$Ru32<<#$Ii) = $Nt8.new",
+V4LDST_tc_st_SLOT0, TypeST>, Enc_5486172, AddrModeRel {
+let Inst{6-3} = 0b0001;
+let Inst{31-21} = 0b00111011101;
+let addrMode = BaseRegOffset;
+let accessSize = HalfWordAccess;
+let isNVStore = 1;
+let mayStore = 1;
+let isNewValue = 1;
+let CextOpcode = "S2_storerh";
+let InputType = "reg";
+let BaseOpcode = "S2_storerh_rr";
+let isPredicable = 1;
+let opNewValue = 3;
+}
+def S4_storerhnew_ur : HInst<
+(outs),
+(ins IntRegs:$Ru32, u2_0Imm:$Ii, u32_0Imm:$II, IntRegs:$Nt8),
+"memh($Ru32<<#$Ii+#$II) = $Nt8.new",
+NCJ_tc_3or4stall_SLOT0, TypeST>, Enc_10076500, AddrModeRel {
+let Inst{7-7} = 0b1;
+let Inst{12-11} = 0b01;
+let Inst{31-21} = 0b10101101101;
+let addrMode = BaseLongOffset;
+let accessSize = HalfWordAccess;
+let isNVStore = 1;
+let isExtended = 1;
+let mayStore = 1;
+let isNewValue = 1;
+let CextOpcode = "S2_storerh";
+let BaseOpcode = "S2_storerh_ur";
+let DecoderNamespace = "MustExtend";
+let isExtendable = 1;
+let opExtendable = 2;
+let isExtentSigned = 0;
+let opExtentBits = 6;
+let opExtentAlign = 0;
+let opNewValue = 3;
+}
+def S4_storeri_ap : HInst<
+(outs IntRegs:$Re32),
+(ins u32_0Imm:$II, IntRegs:$Rt32),
+"memw($Re32=#$II) = $Rt32",
+ST_tc_st_SLOT01, TypeST>, Enc_11477246, AddrModeRel {
+let Inst{7-6} = 0b10;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b10101011100;
+let hasNewValue = 1;
+let opNewValue = 0;
+let addrMode = AbsoluteSet;
+let accessSize = WordAccess;
+let isExtended = 1;
+let mayStore = 1;
+let BaseOpcode = "S2_storeri_ap";
+let isNVStorable = 1;
+let DecoderNamespace = "MustExtend";
+let isExtendable = 1;
+let opExtendable = 1;
+let isExtentSigned = 0;
+let opExtentBits = 6;
+let opExtentAlign = 0;
+}
+def S4_storeri_rr : HInst<
+(outs),
+(ins IntRegs:$Rs32, IntRegs:$Ru32, u2_0Imm:$Ii, IntRegs:$Rt32),
+"memw($Rs32+$Ru32<<#$Ii) = $Rt32",
+V4LDST_tc_st_SLOT01, TypeST>, Enc_14046916, AddrModeRel, ImmRegShl {
+let Inst{6-5} = 0b00;
+let Inst{31-21} = 0b00111011100;
+let addrMode = BaseRegOffset;
+let accessSize = WordAccess;
+let mayStore = 1;
+let CextOpcode = "S2_storeri";
+let InputType = "reg";
+let BaseOpcode = "S2_storeri_rr";
+let isNVStorable = 1;
+let isPredicable = 1;
+}
+def S4_storeri_ur : HInst<
+(outs),
+(ins IntRegs:$Ru32, u2_0Imm:$Ii, u32_0Imm:$II, IntRegs:$Rt32),
+"memw($Ru32<<#$Ii+#$II) = $Rt32",
+ST_tc_st_SLOT01, TypeST>, Enc_14689096, AddrModeRel, ImmRegShl {
+let Inst{7-7} = 0b1;
+let Inst{31-21} = 0b10101101100;
+let addrMode = BaseLongOffset;
+let accessSize = WordAccess;
+let isExtended = 1;
+let mayStore = 1;
+let CextOpcode = "S2_storeri";
+let InputType = "imm";
+let BaseOpcode = "S2_storeri_ur";
+let isNVStorable = 1;
+let DecoderNamespace = "MustExtend";
+let isExtendable = 1;
+let opExtendable = 2;
+let isExtentSigned = 0;
+let opExtentBits = 6;
+let opExtentAlign = 0;
+}
+def S4_storerinew_ap : HInst<
+(outs IntRegs:$Re32),
+(ins u32_0Imm:$II, IntRegs:$Nt8),
+"memw($Re32=#$II) = $Nt8.new",
+NCJ_tc_3or4stall_SLOT0, TypeST>, Enc_14193700, AddrModeRel {
+let Inst{7-6} = 0b10;
+let Inst{13-11} = 0b010;
+let Inst{31-21} = 0b10101011101;
+let hasNewValue = 1;
+let opNewValue = 0;
+let addrMode = AbsoluteSet;
+let accessSize = WordAccess;
+let isNVStore = 1;
+let isExtended = 1;
+let mayStore = 1;
+let isNewValue = 1;
+let BaseOpcode = "S2_storeri_ap";
+let DecoderNamespace = "MustExtend";
+let isExtendable = 1;
+let opExtendable = 1;
+let isExtentSigned = 0;
+let opExtentBits = 6;
+let opExtentAlign = 0;
+let opNewValue = 2;
+}
+def S4_storerinew_rr : HInst<
+(outs),
+(ins IntRegs:$Rs32, IntRegs:$Ru32, u2_0Imm:$Ii, IntRegs:$Nt8),
+"memw($Rs32+$Ru32<<#$Ii) = $Nt8.new",
+V4LDST_tc_st_SLOT0, TypeST>, Enc_5486172, AddrModeRel {
+let Inst{6-3} = 0b0010;
+let Inst{31-21} = 0b00111011101;
+let addrMode = BaseRegOffset;
+let accessSize = WordAccess;
+let isNVStore = 1;
+let mayStore = 1;
+let isNewValue = 1;
+let CextOpcode = "S2_storeri";
+let InputType = "reg";
+let BaseOpcode = "S2_storeri_rr";
+let isPredicable = 1;
+let opNewValue = 3;
+}
+def S4_storerinew_ur : HInst<
+(outs),
+(ins IntRegs:$Ru32, u2_0Imm:$Ii, u32_0Imm:$II, IntRegs:$Nt8),
+"memw($Ru32<<#$Ii+#$II) = $Nt8.new",
+NCJ_tc_3or4stall_SLOT0, TypeST>, Enc_10076500, AddrModeRel {
+let Inst{7-7} = 0b1;
+let Inst{12-11} = 0b10;
+let Inst{31-21} = 0b10101101101;
+let addrMode = BaseLongOffset;
+let accessSize = WordAccess;
+let isNVStore = 1;
+let isExtended = 1;
+let mayStore = 1;
+let isNewValue = 1;
+let CextOpcode = "S2_storeri";
+let BaseOpcode = "S2_storeri_ur";
+let DecoderNamespace = "MustExtend";
+let isExtendable = 1;
+let opExtendable = 2;
+let isExtentSigned = 0;
+let opExtentBits = 6;
+let opExtentAlign = 0;
+let opNewValue = 3;
+}
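+// Mixed add/sub ALU64 forms: S4_subaddi computes $Rs32 + #$Ii - $Ru32;
+// the subi_asl/lsr forms fold a shift of the accumulator into the
+// subtraction, with "$Rx32 = $Rx32in" tying the input and output register.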
+def S4_subaddi : HInst<
+(outs IntRegs:$Rd32),
+(ins IntRegs:$Rs32, s32_0Imm:$Ii, IntRegs:$Ru32),
+"$Rd32 = add($Rs32,sub(#$Ii,$Ru32))",
+ALU64_tc_2_SLOT23, TypeALU64>, Enc_6495334 {
+let Inst{31-23} = 0b110110111;
+let hasNewValue = 1;
+let opNewValue = 0;
+let prefersSlot3 = 1;
+let isExtendable = 1;
+let opExtendable = 2;
+let isExtentSigned = 1;
+let opExtentBits = 6;
+let opExtentAlign = 0;
+}
+def S4_subi_asl_ri : HInst<
+(outs IntRegs:$Rx32),
+(ins u32_0Imm:$Ii, IntRegs:$Rx32in, u5_0Imm:$II),
+"$Rx32 = sub(#$Ii,asl($Rx32in,#$II))",
+ALU64_tc_1_SLOT23, TypeALU64>, Enc_117962 {
+let Inst{2-0} = 0b110;
+let Inst{4-4} = 0b0;
+let Inst{31-24} = 0b11011110;
+let hasNewValue = 1;
+let opNewValue = 0;
+let prefersSlot3 = 1;
+let isExtendable = 1;
+let opExtendable = 1;
+let isExtentSigned = 0;
+let opExtentBits = 8;
+let opExtentAlign = 0;
+let Constraints = "$Rx32 = $Rx32in";
+}
+def S4_subi_lsr_ri : HInst<
+(outs IntRegs:$Rx32),
+(ins u32_0Imm:$Ii, IntRegs:$Rx32in, u5_0Imm:$II),
+"$Rx32 = sub(#$Ii,lsr($Rx32in,#$II))",
+ALU64_tc_1_SLOT23, TypeALU64>, Enc_117962 {
+let Inst{2-0} = 0b110;
+let Inst{4-4} = 0b1;
+let Inst{31-24} = 0b11011110;
+let hasNewValue = 1;
+let opNewValue = 0;
+let prefersSlot3 = 1;
+let isExtendable = 1;
+let opExtendable = 1;
+let isExtentSigned = 0;
+let opExtentBits = 8;
+let opExtentAlign = 0;
+let Constraints = "$Rx32 = $Rx32in";
+}
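+// Complex/vector arithmetic: vrcrotate is the vector reduce complex
+// rotate; the vxaddsub/vxsubadd pairs do cross-wise add/subtract on
+// half/word lanes. The :sat forms may set the USR overflow bit
+// (Defs = [USR_OVF]), and :rnd:>>1 variants round and halve the result.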
+def S4_vrcrotate : HInst<
+(outs DoubleRegs:$Rdd32),
+(ins DoubleRegs:$Rss32, IntRegs:$Rt32, u2_0Imm:$Ii),
+"$Rdd32 = vrcrotate($Rss32,$Rt32,#$Ii)",
+S_3op_tc_3x_SLOT23, TypeS_3op>, Enc_114098 {
+let Inst{7-6} = 0b11;
+let Inst{31-21} = 0b11000011110;
+let prefersSlot3 = 1;
+}
+def S4_vrcrotate_acc : HInst<
+(outs DoubleRegs:$Rxx32),
+(ins DoubleRegs:$Rxx32in, DoubleRegs:$Rss32, IntRegs:$Rt32, u2_0Imm:$Ii),
+"$Rxx32 += vrcrotate($Rss32,$Rt32,#$Ii)",
+S_3op_tc_3x_SLOT23, TypeS_3op>, Enc_13114546 {
+let Inst{7-6} = 0b00;
+let Inst{31-21} = 0b11001011101;
+let prefersSlot3 = 1;
+let Constraints = "$Rxx32 = $Rxx32in";
+}
+def S4_vxaddsubh : HInst<
+(outs DoubleRegs:$Rdd32),
+(ins DoubleRegs:$Rss32, DoubleRegs:$Rtt32),
+"$Rdd32 = vxaddsubh($Rss32,$Rtt32):sat",
+S_3op_tc_2_SLOT23, TypeS_3op>, Enc_8333157 {
+let Inst{7-5} = 0b100;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11000001010;
+let Defs = [USR_OVF];
+}
+def S4_vxaddsubhr : HInst<
+(outs DoubleRegs:$Rdd32),
+(ins DoubleRegs:$Rss32, DoubleRegs:$Rtt32),
+"$Rdd32 = vxaddsubh($Rss32,$Rtt32):rnd:>>1:sat",
+S_3op_tc_2_SLOT23, TypeS_3op>, Enc_8333157 {
+let Inst{7-5} = 0b000;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11000001110;
+let prefersSlot3 = 1;
+let Defs = [USR_OVF];
+}
+def S4_vxaddsubw : HInst<
+(outs DoubleRegs:$Rdd32),
+(ins DoubleRegs:$Rss32, DoubleRegs:$Rtt32),
+"$Rdd32 = vxaddsubw($Rss32,$Rtt32):sat",
+S_3op_tc_2_SLOT23, TypeS_3op>, Enc_8333157 {
+let Inst{7-5} = 0b000;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11000001010;
+let Defs = [USR_OVF];
+}
+def S4_vxsubaddh : HInst<
+(outs DoubleRegs:$Rdd32),
+(ins DoubleRegs:$Rss32, DoubleRegs:$Rtt32),
+"$Rdd32 = vxsubaddh($Rss32,$Rtt32):sat",
+S_3op_tc_2_SLOT23, TypeS_3op>, Enc_8333157 {
+let Inst{7-5} = 0b110;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11000001010;
+let Defs = [USR_OVF];
+}
+def S4_vxsubaddhr : HInst<
+(outs DoubleRegs:$Rdd32),
+(ins DoubleRegs:$Rss32, DoubleRegs:$Rtt32),
+"$Rdd32 = vxsubaddh($Rss32,$Rtt32):rnd:>>1:sat",
+S_3op_tc_2_SLOT23, TypeS_3op>, Enc_8333157 {
+let Inst{7-5} = 0b010;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11000001110;
+let prefersSlot3 = 1;
+let Defs = [USR_OVF];
+}
+def S4_vxsubaddw : HInst<
+(outs DoubleRegs:$Rdd32),
+(ins DoubleRegs:$Rss32, DoubleRegs:$Rtt32),
+"$Rdd32 = vxsubaddw($Rss32,$Rtt32):sat",
+S_3op_tc_2_SLOT23, TypeS_3op>, Enc_8333157 {
+let Inst{7-5} = 0b010;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11000001010;
+let Defs = [USR_OVF];
+}
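+// S5_*: instructions that require the V5 architecture (Requires<[HasV5T]>).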
+def S5_asrhub_rnd_sat : HInst<
+(outs IntRegs:$Rd32),
+(ins DoubleRegs:$Rss32, u4_0Imm:$Ii),
+"$Rd32 = vasrhub($Rss32,#$Ii):raw",
+S_2op_tc_2_SLOT23, TypeS_2op>, Enc_8038806, Requires<[HasV5T]> {
+let Inst{7-5} = 0b100;
+let Inst{13-12} = 0b00;
+let Inst{31-21} = 0b10001000011;
+let hasNewValue = 1;
+let opNewValue = 0;
+let prefersSlot3 = 1;
+let Defs = [USR_OVF];
+}
+def S5_asrhub_rnd_sat_goodsyntax : HInst<
+(outs IntRegs:$Rd32),
+(ins DoubleRegs:$Rss32, u4_0Imm:$Ii),
+"$Rd32 = vasrhub($Rss32,#$Ii):rnd:sat",
+S_2op_tc_2_SLOT23, TypeS_2op>, Requires<[HasV5T]> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isPseudo = 1;
+}
+def S5_asrhub_sat : HInst<
+(outs IntRegs:$Rd32),
+(ins DoubleRegs:$Rss32, u4_0Imm:$Ii),
+"$Rd32 = vasrhub($Rss32,#$Ii):sat",
+S_2op_tc_2_SLOT23, TypeS_2op>, Enc_8038806, Requires<[HasV5T]> {
+let Inst{7-5} = 0b101;
+let Inst{13-12} = 0b00;
+let Inst{31-21} = 0b10001000011;
+let hasNewValue = 1;
+let opNewValue = 0;
+let prefersSlot3 = 1;
+let Defs = [USR_OVF];
+}
+def S5_popcountp : HInst<
+(outs IntRegs:$Rd32),
+(ins DoubleRegs:$Rss32),
+"$Rd32 = popcount($Rss32)",
+S_2op_tc_2_SLOT23, TypeS_2op>, Enc_3742184, Requires<[HasV5T]> {
+let Inst{13-5} = 0b000000011;
+let Inst{31-21} = 0b10001000011;
+let hasNewValue = 1;
+let opNewValue = 0;
+let prefersSlot3 = 1;
+}
+def S5_vasrhrnd : HInst<
+(outs DoubleRegs:$Rdd32),
+(ins DoubleRegs:$Rss32, u4_0Imm:$Ii),
+"$Rdd32 = vasrh($Rss32,#$Ii):raw",
+S_2op_tc_1_SLOT23, TypeS_2op>, Enc_2082775, Requires<[HasV5T]> {
+let Inst{7-5} = 0b000;
+let Inst{13-12} = 0b00;
+let Inst{31-21} = 0b10000000001;
+let prefersSlot3 = 1;
+}
+def S5_vasrhrnd_goodsyntax : HInst<
+(outs DoubleRegs:$Rdd32),
+(ins DoubleRegs:$Rss32, u4_0Imm:$Ii),
+"$Rdd32 = vasrh($Rss32,#$Ii):rnd",
+S_2op_tc_1_SLOT23, TypeS_2op>, Requires<[HasV5T]> {
+let isPseudo = 1;
+}
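+// S6_rol_i_*: rotate-left-by-immediate and its accumulating forms, V60 onward (Requires<[HasV60T]>).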
+def S6_rol_i_p : HInst<
+(outs DoubleRegs:$Rdd32),
+(ins DoubleRegs:$Rss32, u6_0Imm:$Ii),
+"$Rdd32 = rol($Rss32,#$Ii)",
+S_2op_tc_1_SLOT23, TypeS_2op>, Enc_4231995, Requires<[HasV60T]> {
+let Inst{7-5} = 0b011;
+let Inst{31-21} = 0b10000000000;
+}
+def S6_rol_i_p_acc : HInst<
+(outs DoubleRegs:$Rxx32),
+(ins DoubleRegs:$Rxx32in, DoubleRegs:$Rss32, u6_0Imm:$Ii),
+"$Rxx32 += rol($Rss32,#$Ii)",
+S_2op_tc_1_SLOT23, TypeS_2op>, Enc_8497723, Requires<[HasV60T]> {
+let Inst{7-5} = 0b111;
+let Inst{31-21} = 0b10000010000;
+let prefersSlot3 = 1;
+let Constraints = "$Rxx32 = $Rxx32in";
+}
+def S6_rol_i_p_and : HInst<
+(outs DoubleRegs:$Rxx32),
+(ins DoubleRegs:$Rxx32in, DoubleRegs:$Rss32, u6_0Imm:$Ii),
+"$Rxx32 &= rol($Rss32,#$Ii)",
+S_2op_tc_1_SLOT23, TypeS_2op>, Enc_8497723, Requires<[HasV60T]> {
+let Inst{7-5} = 0b011;
+let Inst{31-21} = 0b10000010010;
+let prefersSlot3 = 1;
+let Constraints = "$Rxx32 = $Rxx32in";
+}
+def S6_rol_i_p_nac : HInst<
+(outs DoubleRegs:$Rxx32),
+(ins DoubleRegs:$Rxx32in, DoubleRegs:$Rss32, u6_0Imm:$Ii),
+"$Rxx32 -= rol($Rss32,#$Ii)",
+S_2op_tc_1_SLOT23, TypeS_2op>, Enc_8497723, Requires<[HasV60T]> {
+let Inst{7-5} = 0b011;
+let Inst{31-21} = 0b10000010000;
+let prefersSlot3 = 1;
+let Constraints = "$Rxx32 = $Rxx32in";
+}
+def S6_rol_i_p_or : HInst<
+(outs DoubleRegs:$Rxx32),
+(ins DoubleRegs:$Rxx32in, DoubleRegs:$Rss32, u6_0Imm:$Ii),
+"$Rxx32 |= rol($Rss32,#$Ii)",
+S_2op_tc_1_SLOT23, TypeS_2op>, Enc_8497723, Requires<[HasV60T]> {
+let Inst{7-5} = 0b111;
+let Inst{31-21} = 0b10000010010;
+let prefersSlot3 = 1;
+let Constraints = "$Rxx32 = $Rxx32in";
+}
+def S6_rol_i_p_xacc : HInst<
+(outs DoubleRegs:$Rxx32),
+(ins DoubleRegs:$Rxx32in, DoubleRegs:$Rss32, u6_0Imm:$Ii),
+"$Rxx32 ^= rol($Rss32,#$Ii)",
+S_2op_tc_1_SLOT23, TypeS_2op>, Enc_8497723, Requires<[HasV60T]> {
+let Inst{7-5} = 0b011;
+let Inst{31-21} = 0b10000010100;
+let prefersSlot3 = 1;
+let Constraints = "$Rxx32 = $Rxx32in";
+}
+def S6_rol_i_r : HInst<
+(outs IntRegs:$Rd32),
+(ins IntRegs:$Rs32, u5_0Imm:$Ii),
+"$Rd32 = rol($Rs32,#$Ii)",
+S_2op_tc_1_SLOT23, TypeS_2op>, Enc_2771456, Requires<[HasV60T]> {
+let Inst{7-5} = 0b011;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b10001100000;
+let hasNewValue = 1;
+let opNewValue = 0;
+}
+def S6_rol_i_r_acc : HInst<
+(outs IntRegs:$Rx32),
+(ins IntRegs:$Rx32in, IntRegs:$Rs32, u5_0Imm:$Ii),
+"$Rx32 += rol($Rs32,#$Ii)",
+S_2op_tc_1_SLOT23, TypeS_2op>, Enc_2410156, Requires<[HasV60T]> {
+let Inst{7-5} = 0b111;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b10001110000;
+let hasNewValue = 1;
+let opNewValue = 0;
+let prefersSlot3 = 1;
+let Constraints = "$Rx32 = $Rx32in";
+}
+def S6_rol_i_r_and : HInst<
+(outs IntRegs:$Rx32),
+(ins IntRegs:$Rx32in, IntRegs:$Rs32, u5_0Imm:$Ii),
+"$Rx32 &= rol($Rs32,#$Ii)",
+S_2op_tc_1_SLOT23, TypeS_2op>, Enc_2410156, Requires<[HasV60T]> {
+let Inst{7-5} = 0b011;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b10001110010;
+let hasNewValue = 1;
+let opNewValue = 0;
+let prefersSlot3 = 1;
+let Constraints = "$Rx32 = $Rx32in";
+}
+def S6_rol_i_r_nac : HInst<
+(outs IntRegs:$Rx32),
+(ins IntRegs:$Rx32in, IntRegs:$Rs32, u5_0Imm:$Ii),
+"$Rx32 -= rol($Rs32,#$Ii)",
+S_2op_tc_1_SLOT23, TypeS_2op>, Enc_2410156, Requires<[HasV60T]> {
+let Inst{7-5} = 0b011;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b10001110000;
+let hasNewValue = 1;
+let opNewValue = 0;
+let prefersSlot3 = 1;
+let Constraints = "$Rx32 = $Rx32in";
+}
+def S6_rol_i_r_or : HInst<
+(outs IntRegs:$Rx32),
+(ins IntRegs:$Rx32in, IntRegs:$Rs32, u5_0Imm:$Ii),
+"$Rx32 |= rol($Rs32,#$Ii)",
+S_2op_tc_1_SLOT23, TypeS_2op>, Enc_2410156, Requires<[HasV60T]> {
+let Inst{7-5} = 0b111;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b10001110010;
+let hasNewValue = 1;
+let opNewValue = 0;
+let prefersSlot3 = 1;
+let Constraints = "$Rx32 = $Rx32in";
+}
+def S6_rol_i_r_xacc : HInst<
+(outs IntRegs:$Rx32),
+(ins IntRegs:$Rx32in, IntRegs:$Rs32, u5_0Imm:$Ii),
+"$Rx32 ^= rol($Rs32,#$Ii)",
+S_2op_tc_1_SLOT23, TypeS_2op>, Enc_2410156, Requires<[HasV60T]> {
+let Inst{7-5} = 0b011;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b10001110100;
+let hasNewValue = 1;
+let opNewValue = 0;
+let prefersSlot3 = 1;
+let Constraints = "$Rx32 = $Rx32in";
+}
+def S6_vsplatrbp : HInst<
+(outs DoubleRegs:$Rdd32),
+(ins IntRegs:$Rs32),
+"$Rdd32 = vsplatb($Rs32)",
+S_2op_tc_1_SLOT23, TypeS_2op>, Enc_4030179, Requires<[HasV62T]> {
+let Inst{13-5} = 0b000000100;
+let Inst{31-21} = 0b10000100010;
+}
+def S6_vtrunehb_ppp : HInst<
+(outs DoubleRegs:$Rdd32),
+(ins DoubleRegs:$Rss32, DoubleRegs:$Rtt32),
+"$Rdd32 = vtrunehb($Rss32,$Rtt32)",
+S_3op_tc_1_SLOT23, TypeS_3op>, Enc_8333157, Requires<[HasV62T]> {
+let Inst{7-5} = 0b011;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11000001100;
+}
+def S6_vtrunohb_ppp : HInst<
+(outs DoubleRegs:$Rdd32),
+(ins DoubleRegs:$Rss32, DoubleRegs:$Rtt32),
+"$Rdd32 = vtrunohb($Rss32,$Rtt32)",
+S_3op_tc_1_SLOT23, TypeS_3op>, Enc_8333157, Requires<[HasV62T]> {
+let Inst{7-5} = 0b101;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11000001100;
+}
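+// SA1_*: 16-bit arithmetic sub-instructions (TypeSUBINSN, DecoderNamespace "SUBINSN_A"); sub-instructions are packed in pairs to form 32-bit duplexes.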
+def SA1_addi : HInst<
+(outs GeneralSubRegs:$Rx16),
+(ins IntRegs:$Rx16in, s32_0Imm:$Ii),
+"$Rx16 = add($Rx16in,#$Ii)",
+PSEUDO, TypeSUBINSN>, Enc_3974695 {
+let Inst{12-11} = 0b00;
+let hasNewValue = 1;
+let opNewValue = 0;
+let AsmVariantName = "NonParsable";
+let DecoderNamespace = "SUBINSN_A";
+let isExtendable = 1;
+let opExtendable = 2;
+let isExtentSigned = 1;
+let opExtentBits = 7;
+let opExtentAlign = 0;
+let Constraints = "$Rx16 = $Rx16in";
+}
+def SA1_addrx : HInst<
+(outs GeneralSubRegs:$Rx16),
+(ins IntRegs:$Rx16in, GeneralSubRegs:$Rs16),
+"$Rx16 = add($Rx16in,$Rs16)",
+PSEUDO, TypeSUBINSN>, Enc_6135183 {
+let Inst{12-8} = 0b11000;
+let hasNewValue = 1;
+let opNewValue = 0;
+let AsmVariantName = "NonParsable";
+let DecoderNamespace = "SUBINSN_A";
+let Constraints = "$Rx16 = $Rx16in";
+}
+def SA1_addsp : HInst<
+(outs GeneralSubRegs:$Rd16),
+(ins u6_2Imm:$Ii),
+"$Rd16 = add(r29,#$Ii)",
+PSEUDO, TypeSUBINSN>, Enc_176263 {
+let Inst{12-10} = 0b011;
+let hasNewValue = 1;
+let opNewValue = 0;
+let AsmVariantName = "NonParsable";
+let Uses = [R29];
+let DecoderNamespace = "SUBINSN_A";
+}
+def SA1_and1 : HInst<
+(outs GeneralSubRegs:$Rd16),
+(ins GeneralSubRegs:$Rs16),
+"$Rd16 = and($Rs16,#1)",
+PSEUDO, TypeSUBINSN>, Enc_14939491 {
+let Inst{12-8} = 0b10010;
+let hasNewValue = 1;
+let opNewValue = 0;
+let AsmVariantName = "NonParsable";
+let DecoderNamespace = "SUBINSN_A";
+}
+def SA1_clrf : HInst<
+(outs GeneralSubRegs:$Rd16),
+(ins),
+"if (!p0) $Rd16 = #0",
+PSEUDO, TypeSUBINSN>, Enc_1451363 {
+let Inst{12-4} = 0b110100111;
+let isPredicated = 1;
+let isPredicatedFalse = 1;
+let hasNewValue = 1;
+let opNewValue = 0;
+let AsmVariantName = "NonParsable";
+let Uses = [P0];
+let DecoderNamespace = "SUBINSN_A";
+}
+def SA1_clrfnew : HInst<
+(outs GeneralSubRegs:$Rd16),
+(ins),
+"if (!p0.new) $Rd16 = #0",
+PSEUDO, TypeSUBINSN>, Enc_1451363 {
+let Inst{12-4} = 0b110100101;
+let isPredicated = 1;
+let isPredicatedFalse = 1;
+let hasNewValue = 1;
+let opNewValue = 0;
+let AsmVariantName = "NonParsable";
+let isPredicatedNew = 1;
+let Uses = [P0];
+let DecoderNamespace = "SUBINSN_A";
+}
+def SA1_clrt : HInst<
+(outs GeneralSubRegs:$Rd16),
+(ins),
+"if (p0) $Rd16 = #0",
+PSEUDO, TypeSUBINSN>, Enc_1451363 {
+let Inst{12-4} = 0b110100110;
+let isPredicated = 1;
+let hasNewValue = 1;
+let opNewValue = 0;
+let AsmVariantName = "NonParsable";
+let Uses = [P0];
+let DecoderNamespace = "SUBINSN_A";
+}
+def SA1_clrtnew : HInst<
+(outs GeneralSubRegs:$Rd16),
+(ins),
+"if (p0.new) $Rd16 = #0",
+PSEUDO, TypeSUBINSN>, Enc_1451363 {
+let Inst{12-4} = 0b110100100;
+let isPredicated = 1;
+let hasNewValue = 1;
+let opNewValue = 0;
+let AsmVariantName = "NonParsable";
+let isPredicatedNew = 1;
+let Uses = [P0];
+let DecoderNamespace = "SUBINSN_A";
+}
+def SA1_cmpeqi : HInst<
+(outs),
+(ins GeneralSubRegs:$Rs16, u2_0Imm:$Ii),
+"p0 = cmp.eq($Rs16,#$Ii)",
+PSEUDO, TypeSUBINSN>, Enc_2079016 {
+let Inst{3-2} = 0b00;
+let Inst{12-8} = 0b11001;
+let AsmVariantName = "NonParsable";
+let Defs = [P0];
+let DecoderNamespace = "SUBINSN_A";
+}
+def SA1_combine0i : HInst<
+(outs GeneralDoubleLow8Regs:$Rdd8),
+(ins u2_0Imm:$Ii),
+"$Rdd8 = combine(#0,#$Ii)",
+PSEUDO, TypeSUBINSN>, Enc_15946706 {
+let Inst{4-3} = 0b00;
+let Inst{12-7} = 0b111000;
+let hasNewValue = 1;
+let opNewValue = 0;
+let AsmVariantName = "NonParsable";
+let DecoderNamespace = "SUBINSN_A";
+}
+def SA1_combine1i : HInst<
+(outs GeneralDoubleLow8Regs:$Rdd8),
+(ins u2_0Imm:$Ii),
+"$Rdd8 = combine(#1,#$Ii)",
+PSEUDO, TypeSUBINSN>, Enc_15946706 {
+let Inst{4-3} = 0b01;
+let Inst{12-7} = 0b111000;
+let hasNewValue = 1;
+let opNewValue = 0;
+let AsmVariantName = "NonParsable";
+let DecoderNamespace = "SUBINSN_A";
+}
+def SA1_combine2i : HInst<
+(outs GeneralDoubleLow8Regs:$Rdd8),
+(ins u2_0Imm:$Ii),
+"$Rdd8 = combine(#2,#$Ii)",
+PSEUDO, TypeSUBINSN>, Enc_15946706 {
+let Inst{4-3} = 0b10;
+let Inst{12-7} = 0b111000;
+let hasNewValue = 1;
+let opNewValue = 0;
+let AsmVariantName = "NonParsable";
+let DecoderNamespace = "SUBINSN_A";
+}
+def SA1_combine3i : HInst<
+(outs GeneralDoubleLow8Regs:$Rdd8),
+(ins u2_0Imm:$Ii),
+"$Rdd8 = combine(#3,#$Ii)",
+PSEUDO, TypeSUBINSN>, Enc_15946706 {
+let Inst{4-3} = 0b11;
+let Inst{12-7} = 0b111000;
+let hasNewValue = 1;
+let opNewValue = 0;
+let AsmVariantName = "NonParsable";
+let DecoderNamespace = "SUBINSN_A";
+}
+def SA1_combinerz : HInst<
+(outs GeneralDoubleLow8Regs:$Rdd8),
+(ins GeneralSubRegs:$Rs16),
+"$Rdd8 = combine($Rs16,#0)",
+PSEUDO, TypeSUBINSN>, Enc_10501894 {
+let Inst{3-3} = 0b1;
+let Inst{12-8} = 0b11101;
+let hasNewValue = 1;
+let opNewValue = 0;
+let AsmVariantName = "NonParsable";
+let DecoderNamespace = "SUBINSN_A";
+}
+def SA1_combinezr : HInst<
+(outs GeneralDoubleLow8Regs:$Rdd8),
+(ins GeneralSubRegs:$Rs16),
+"$Rdd8 = combine(#0,$Rs16)",
+PSEUDO, TypeSUBINSN>, Enc_10501894 {
+let Inst{3-3} = 0b0;
+let Inst{12-8} = 0b11101;
+let hasNewValue = 1;
+let opNewValue = 0;
+let AsmVariantName = "NonParsable";
+let DecoderNamespace = "SUBINSN_A";
+}
+def SA1_dec : HInst<
+(outs GeneralSubRegs:$Rd16),
+(ins GeneralSubRegs:$Rs16, n1Const:$n1),
+"$Rd16 = add($Rs16,#$n1)",
+PSEUDO, TypeSUBINSN>, Enc_10597934 {
+let Inst{12-8} = 0b10011;
+let hasNewValue = 1;
+let opNewValue = 0;
+let AsmVariantName = "NonParsable";
+let DecoderNamespace = "SUBINSN_A";
+}
+def SA1_inc : HInst<
+(outs GeneralSubRegs:$Rd16),
+(ins GeneralSubRegs:$Rs16),
+"$Rd16 = add($Rs16,#1)",
+PSEUDO, TypeSUBINSN>, Enc_14939491 {
+let Inst{12-8} = 0b10001;
+let hasNewValue = 1;
+let opNewValue = 0;
+let AsmVariantName = "NonParsable";
+let DecoderNamespace = "SUBINSN_A";
+}
+def SA1_seti : HInst<
+(outs GeneralSubRegs:$Rd16),
+(ins u32_0Imm:$Ii),
+"$Rd16 = #$Ii",
+PSEUDO, TypeSUBINSN>, Enc_2176383 {
+let Inst{12-10} = 0b010;
+let hasNewValue = 1;
+let opNewValue = 0;
+let AsmVariantName = "NonParsable";
+let DecoderNamespace = "SUBINSN_A";
+let isExtendable = 1;
+let opExtendable = 1;
+let isExtentSigned = 0;
+let opExtentBits = 6;
+let opExtentAlign = 0;
+}
+def SA1_setin1 : HInst<
+(outs GeneralSubRegs:$Rd16),
+(ins n1Const:$n1),
+"$Rd16 = #$n1",
+PSEUDO, TypeSUBINSN>, Enc_13336212 {
+let Inst{12-4} = 0b110100000;
+let hasNewValue = 1;
+let opNewValue = 0;
+let AsmVariantName = "NonParsable";
+let DecoderNamespace = "SUBINSN_A";
+}
+def SA1_sxtb : HInst<
+(outs GeneralSubRegs:$Rd16),
+(ins GeneralSubRegs:$Rs16),
+"$Rd16 = sxtb($Rs16)",
+PSEUDO, TypeSUBINSN>, Enc_14939491 {
+let Inst{12-8} = 0b10101;
+let hasNewValue = 1;
+let opNewValue = 0;
+let AsmVariantName = "NonParsable";
+let DecoderNamespace = "SUBINSN_A";
+}
+def SA1_sxth : HInst<
+(outs GeneralSubRegs:$Rd16),
+(ins GeneralSubRegs:$Rs16),
+"$Rd16 = sxth($Rs16)",
+PSEUDO, TypeSUBINSN>, Enc_14939491 {
+let Inst{12-8} = 0b10100;
+let hasNewValue = 1;
+let opNewValue = 0;
+let AsmVariantName = "NonParsable";
+let DecoderNamespace = "SUBINSN_A";
+}
+def SA1_tfr : HInst<
+(outs GeneralSubRegs:$Rd16),
+(ins GeneralSubRegs:$Rs16),
+"$Rd16 = $Rs16",
+PSEUDO, TypeSUBINSN>, Enc_14939491 {
+let Inst{12-8} = 0b10000;
+let hasNewValue = 1;
+let opNewValue = 0;
+let AsmVariantName = "NonParsable";
+let DecoderNamespace = "SUBINSN_A";
+}
+def SA1_zxtb : HInst<
+(outs GeneralSubRegs:$Rd16),
+(ins GeneralSubRegs:$Rs16),
+"$Rd16 = and($Rs16,#255)",
+PSEUDO, TypeSUBINSN>, Enc_14939491 {
+let Inst{12-8} = 0b10111;
+let hasNewValue = 1;
+let opNewValue = 0;
+let AsmVariantName = "NonParsable";
+let DecoderNamespace = "SUBINSN_A";
+}
+def SA1_zxth : HInst<
+(outs GeneralSubRegs:$Rd16),
+(ins GeneralSubRegs:$Rs16),
+"$Rd16 = zxth($Rs16)",
+PSEUDO, TypeSUBINSN>, Enc_14939491 {
+let Inst{12-8} = 0b10110;
+let hasNewValue = 1;
+let opNewValue = 0;
+let AsmVariantName = "NonParsable";
+let DecoderNamespace = "SUBINSN_A";
+}
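+// SL1_*/SL2_*: load-class sub-instructions, including deallocframe and dealloc_return.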
+def SL1_loadri_io : HInst<
+(outs GeneralSubRegs:$Rd16),
+(ins GeneralSubRegs:$Rs16, u4_2Imm:$Ii),
+"$Rd16 = memw($Rs16+#$Ii)",
+PSEUDO, TypeSUBINSN>, Enc_13606251 {
+let Inst{12-12} = 0b0;
+let hasNewValue = 1;
+let opNewValue = 0;
+let addrMode = BaseImmOffset;
+let accessSize = WordAccess;
+let AsmVariantName = "NonParsable";
+let mayLoad = 1;
+let DecoderNamespace = "SUBINSN_L1";
+}
+def SL1_loadrub_io : HInst<
+(outs GeneralSubRegs:$Rd16),
+(ins GeneralSubRegs:$Rs16, u4_0Imm:$Ii),
+"$Rd16 = memub($Rs16+#$Ii)",
+PSEUDO, TypeSUBINSN>, Enc_15606259 {
+let Inst{12-12} = 0b1;
+let hasNewValue = 1;
+let opNewValue = 0;
+let addrMode = BaseImmOffset;
+let accessSize = ByteAccess;
+let AsmVariantName = "NonParsable";
+let mayLoad = 1;
+let DecoderNamespace = "SUBINSN_L1";
+}
+def SL2_deallocframe : HInst<
+(outs),
+(ins),
+"deallocframe",
+PSEUDO, TypeSUBINSN>, Enc_0 {
+let Inst{12-0} = 0b1111100000000;
+let accessSize = DoubleWordAccess;
+let AsmVariantName = "NonParsable";
+let mayLoad = 1;
+let Uses = [R30];
+let Defs = [R30, R29, R31];
+let DecoderNamespace = "SUBINSN_L2";
+}
+def SL2_jumpr31 : HInst<
+(outs),
+(ins),
+"jumpr r31",
+PSEUDO, TypeSUBINSN>, Enc_0 {
+let Inst{12-0} = 0b1111111000000;
+let isTerminator = 1;
+let isIndirectBranch = 1;
+let cofMax1 = 1;
+let AsmVariantName = "NonParsable";
+let isReturn = 1;
+let Uses = [R31];
+let Defs = [PC];
+let DecoderNamespace = "SUBINSN_L2";
+}
+def SL2_jumpr31_f : HInst<
+(outs),
+(ins),
+"if (!p0) jumpr r31",
+PSEUDO, TypeSUBINSN>, Enc_0 {
+let Inst{12-0} = 0b1111111000101;
+let isPredicated = 1;
+let isPredicatedFalse = 1;
+let isTerminator = 1;
+let isIndirectBranch = 1;
+let cofMax1 = 1;
+let AsmVariantName = "NonParsable";
+let isReturn = 1;
+let Uses = [P0, R31];
+let Defs = [PC];
+let isTaken = Inst{4};
+let DecoderNamespace = "SUBINSN_L2";
+}
+def SL2_jumpr31_fnew : HInst<
+(outs),
+(ins),
+"if (!p0.new) jumpr:nt r31",
+PSEUDO, TypeSUBINSN>, Enc_0 {
+let Inst{12-0} = 0b1111111000111;
+let isPredicated = 1;
+let isPredicatedFalse = 1;
+let isTerminator = 1;
+let isIndirectBranch = 1;
+let cofMax1 = 1;
+let AsmVariantName = "NonParsable";
+let isReturn = 1;
+let isPredicatedNew = 1;
+let Uses = [P0, R31];
+let Defs = [PC];
+let isTaken = Inst{4};
+let DecoderNamespace = "SUBINSN_L2";
+}
+def SL2_jumpr31_t : HInst<
+(outs),
+(ins),
+"if (p0) jumpr r31",
+PSEUDO, TypeSUBINSN>, Enc_0 {
+let Inst{12-0} = 0b1111111000100;
+let isPredicated = 1;
+let isTerminator = 1;
+let isIndirectBranch = 1;
+let cofMax1 = 1;
+let AsmVariantName = "NonParsable";
+let isReturn = 1;
+let Uses = [P0, R31];
+let Defs = [PC];
+let isTaken = Inst{4};
+let DecoderNamespace = "SUBINSN_L2";
+}
+def SL2_jumpr31_tnew : HInst<
+(outs),
+(ins),
+"if (p0.new) jumpr:nt r31",
+PSEUDO, TypeSUBINSN>, Enc_0 {
+let Inst{12-0} = 0b1111111000110;
+let isPredicated = 1;
+let isTerminator = 1;
+let isIndirectBranch = 1;
+let cofMax1 = 1;
+let AsmVariantName = "NonParsable";
+let isReturn = 1;
+let isPredicatedNew = 1;
+let Uses = [P0, R31];
+let Defs = [PC];
+let isTaken = Inst{4};
+let DecoderNamespace = "SUBINSN_L2";
+}
+def SL2_loadrb_io : HInst<
+(outs GeneralSubRegs:$Rd16),
+(ins GeneralSubRegs:$Rs16, u3_0Imm:$Ii),
+"$Rd16 = memb($Rs16+#$Ii)",
+PSEUDO, TypeSUBINSN>, Enc_3135259 {
+let Inst{12-11} = 0b10;
+let hasNewValue = 1;
+let opNewValue = 0;
+let addrMode = BaseImmOffset;
+let accessSize = ByteAccess;
+let AsmVariantName = "NonParsable";
+let mayLoad = 1;
+let DecoderNamespace = "SUBINSN_L2";
+}
+def SL2_loadrd_sp : HInst<
+(outs GeneralDoubleLow8Regs:$Rdd8),
+(ins u5_3Imm:$Ii),
+"$Rdd8 = memd(r29+#$Ii)",
+PSEUDO, TypeSUBINSN>, Enc_16479122 {
+let Inst{12-8} = 0b11110;
+let hasNewValue = 1;
+let opNewValue = 0;
+let addrMode = BaseImmOffset;
+let accessSize = DoubleWordAccess;
+let AsmVariantName = "NonParsable";
+let mayLoad = 1;
+let Uses = [R29];
+let DecoderNamespace = "SUBINSN_L2";
+}
+def SL2_loadrh_io : HInst<
+(outs GeneralSubRegs:$Rd16),
+(ins GeneralSubRegs:$Rs16, u3_1Imm:$Ii),
+"$Rd16 = memh($Rs16+#$Ii)",
+PSEUDO, TypeSUBINSN>, Enc_4135257 {
+let Inst{12-11} = 0b00;
+let hasNewValue = 1;
+let opNewValue = 0;
+let addrMode = BaseImmOffset;
+let accessSize = HalfWordAccess;
+let AsmVariantName = "NonParsable";
+let mayLoad = 1;
+let DecoderNamespace = "SUBINSN_L2";
+}
+def SL2_loadri_sp : HInst<
+(outs GeneralSubRegs:$Rd16),
+(ins u5_2Imm:$Ii),
+"$Rd16 = memw(r29+#$Ii)",
+PSEUDO, TypeSUBINSN>, Enc_64199 {
+let Inst{12-9} = 0b1110;
+let hasNewValue = 1;
+let opNewValue = 0;
+let addrMode = BaseImmOffset;
+let accessSize = WordAccess;
+let AsmVariantName = "NonParsable";
+let mayLoad = 1;
+let Uses = [R29];
+let DecoderNamespace = "SUBINSN_L2";
+}
+def SL2_loadruh_io : HInst<
+(outs GeneralSubRegs:$Rd16),
+(ins GeneralSubRegs:$Rs16, u3_1Imm:$Ii),
+"$Rd16 = memuh($Rs16+#$Ii)",
+PSEUDO, TypeSUBINSN>, Enc_4135257 {
+let Inst{12-11} = 0b01;
+let hasNewValue = 1;
+let opNewValue = 0;
+let addrMode = BaseImmOffset;
+let accessSize = HalfWordAccess;
+let AsmVariantName = "NonParsable";
+let mayLoad = 1;
+let DecoderNamespace = "SUBINSN_L2";
+}
+def SL2_return : HInst<
+(outs),
+(ins),
+"dealloc_return",
+PSEUDO, TypeSUBINSN>, Enc_0 {
+let Inst{12-0} = 0b1111101000000;
+let isTerminator = 1;
+let isIndirectBranch = 1;
+let accessSize = DoubleWordAccess;
+let cofMax1 = 1;
+let AsmVariantName = "NonParsable";
+let isReturn = 1;
+let mayLoad = 1;
+let Uses = [R30];
+let Defs = [PC, R30, R29, R31];
+let DecoderNamespace = "SUBINSN_L2";
+}
+def SL2_return_f : HInst<
+(outs),
+(ins),
+"if (!p0) dealloc_return",
+PSEUDO, TypeSUBINSN>, Enc_0 {
+let Inst{12-0} = 0b1111101000101;
+let isPredicated = 1;
+let isPredicatedFalse = 1;
+let isTerminator = 1;
+let isIndirectBranch = 1;
+let accessSize = DoubleWordAccess;
+let cofMax1 = 1;
+let AsmVariantName = "NonParsable";
+let isReturn = 1;
+let mayLoad = 1;
+let Uses = [P0, R30];
+let Defs = [PC, R30, R29, R31];
+let isTaken = Inst{4};
+let DecoderNamespace = "SUBINSN_L2";
+}
+def SL2_return_fnew : HInst<
+(outs),
+(ins),
+"if (!p0.new) dealloc_return:nt",
+PSEUDO, TypeSUBINSN>, Enc_0 {
+let Inst{12-0} = 0b1111101000111;
+let isPredicated = 1;
+let isPredicatedFalse = 1;
+let isTerminator = 1;
+let isIndirectBranch = 1;
+let accessSize = DoubleWordAccess;
+let cofMax1 = 1;
+let AsmVariantName = "NonParsable";
+let isReturn = 1;
+let isPredicatedNew = 1;
+let mayLoad = 1;
+let Uses = [P0, R30];
+let Defs = [PC, R30, R29, R31];
+let isTaken = Inst{4};
+let DecoderNamespace = "SUBINSN_L2";
+}
+def SL2_return_t : HInst<
+(outs),
+(ins),
+"if (p0) dealloc_return",
+PSEUDO, TypeSUBINSN>, Enc_0 {
+let Inst{12-0} = 0b1111101000100;
+let isPredicated = 1;
+let isTerminator = 1;
+let isIndirectBranch = 1;
+let accessSize = DoubleWordAccess;
+let cofMax1 = 1;
+let AsmVariantName = "NonParsable";
+let isReturn = 1;
+let mayLoad = 1;
+let Uses = [P0, R30];
+let Defs = [PC, R30, R29, R31];
+let isTaken = Inst{4};
+let DecoderNamespace = "SUBINSN_L2";
+}
+def SL2_return_tnew : HInst<
+(outs),
+(ins),
+"if (p0.new) dealloc_return:nt",
+PSEUDO, TypeSUBINSN>, Enc_0 {
+let Inst{12-0} = 0b1111101000110;
+let isPredicated = 1;
+let isTerminator = 1;
+let isIndirectBranch = 1;
+let accessSize = DoubleWordAccess;
+let cofMax1 = 1;
+let AsmVariantName = "NonParsable";
+let isReturn = 1;
+let isPredicatedNew = 1;
+let mayLoad = 1;
+let Uses = [P0, R30];
+let Defs = [PC, R30, R29, R31];
+let isTaken = Inst{4};
+let DecoderNamespace = "SUBINSN_L2";
+}
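+// SS1_*/SS2_*: store-class sub-instructions, including allocframe and the store-immediate forms.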
+def SS1_storeb_io : HInst<
+(outs),
+(ins GeneralSubRegs:$Rs16, u4_0Imm:$Ii, GeneralSubRegs:$Rt16),
+"memb($Rs16+#$Ii) = $Rt16",
+PSEUDO, TypeSUBINSN>, Enc_13204995 {
+let Inst{12-12} = 0b1;
+let addrMode = BaseImmOffset;
+let accessSize = ByteAccess;
+let AsmVariantName = "NonParsable";
+let mayStore = 1;
+let DecoderNamespace = "SUBINSN_S1";
+}
+def SS1_storew_io : HInst<
+(outs),
+(ins GeneralSubRegs:$Rs16, u4_2Imm:$Ii, GeneralSubRegs:$Rt16),
+"memw($Rs16+#$Ii) = $Rt16",
+PSEUDO, TypeSUBINSN>, Enc_11205051 {
+let Inst{12-12} = 0b0;
+let addrMode = BaseImmOffset;
+let accessSize = WordAccess;
+let AsmVariantName = "NonParsable";
+let mayStore = 1;
+let DecoderNamespace = "SUBINSN_S1";
+}
+def SS2_allocframe : HInst<
+(outs),
+(ins u5_3Imm:$Ii),
+"allocframe(#$Ii)",
+PSEUDO, TypeSUBINSN>, Enc_7884306 {
+let Inst{3-0} = 0b0000;
+let Inst{12-9} = 0b1110;
+let addrMode = BaseImmOffset;
+let accessSize = DoubleWordAccess;
+let AsmVariantName = "NonParsable";
+let mayStore = 1;
+let Uses = [R30, R29, R31];
+let Defs = [R30, R29];
+let DecoderNamespace = "SUBINSN_S2";
+}
+def SS2_storebi0 : HInst<
+(outs),
+(ins GeneralSubRegs:$Rs16, u4_0Imm:$Ii),
+"memb($Rs16+#$Ii) = #0",
+PSEUDO, TypeSUBINSN>, Enc_13536408 {
+let Inst{12-8} = 0b10010;
+let addrMode = BaseImmOffset;
+let accessSize = ByteAccess;
+let AsmVariantName = "NonParsable";
+let mayStore = 1;
+let DecoderNamespace = "SUBINSN_S2";
+}
+def SS2_storebi1 : HInst<
+(outs),
+(ins GeneralSubRegs:$Rs16, u4_0Imm:$Ii),
+"memb($Rs16+#$Ii) = #1",
+PSEUDO, TypeSUBINSN>, Enc_13536408 {
+let Inst{12-8} = 0b10011;
+let addrMode = BaseImmOffset;
+let accessSize = ByteAccess;
+let AsmVariantName = "NonParsable";
+let mayStore = 1;
+let DecoderNamespace = "SUBINSN_S2";
+}
+def SS2_stored_sp : HInst<
+(outs),
+(ins s6_3Imm:$Ii, GeneralDoubleLow8Regs:$Rtt8),
+"memd(r29+#$Ii) = $Rtt8",
+PSEUDO, TypeSUBINSN>, Enc_9165078 {
+let Inst{12-9} = 0b0101;
+let addrMode = BaseImmOffset;
+let accessSize = DoubleWordAccess;
+let AsmVariantName = "NonParsable";
+let mayStore = 1;
+let Uses = [R29];
+let DecoderNamespace = "SUBINSN_S2";
+}
+def SS2_storeh_io : HInst<
+(outs),
+(ins GeneralSubRegs:$Rs16, u3_1Imm:$Ii, GeneralSubRegs:$Rt16),
+"memh($Rs16+#$Ii) = $Rt16",
+PSEUDO, TypeSUBINSN>, Enc_1734121 {
+let Inst{12-11} = 0b00;
+let addrMode = BaseImmOffset;
+let accessSize = HalfWordAccess;
+let AsmVariantName = "NonParsable";
+let mayStore = 1;
+let DecoderNamespace = "SUBINSN_S2";
+}
+def SS2_storew_sp : HInst<
+(outs),
+(ins u5_2Imm:$Ii, GeneralSubRegs:$Rt16),
+"memw(r29+#$Ii) = $Rt16",
+PSEUDO, TypeSUBINSN>, Enc_6690615 {
+let Inst{12-9} = 0b0100;
+let addrMode = BaseImmOffset;
+let accessSize = WordAccess;
+let AsmVariantName = "NonParsable";
+let mayStore = 1;
+let Uses = [R29];
+let DecoderNamespace = "SUBINSN_S2";
+}
+def SS2_storewi0 : HInst<
+(outs),
+(ins GeneralSubRegs:$Rs16, u4_2Imm:$Ii),
+"memw($Rs16+#$Ii) = #0",
+PSEUDO, TypeSUBINSN>, Enc_15536400 {
+let Inst{12-8} = 0b10000;
+let addrMode = BaseImmOffset;
+let accessSize = WordAccess;
+let AsmVariantName = "NonParsable";
+let mayStore = 1;
+let DecoderNamespace = "SUBINSN_S2";
+}
+def SS2_storewi1 : HInst<
+(outs),
+(ins GeneralSubRegs:$Rs16, u4_2Imm:$Ii),
+"memw($Rs16+#$Ii) = #1",
+PSEUDO, TypeSUBINSN>, Enc_15536400 {
+let Inst{12-8} = 0b10001;
+let addrMode = BaseImmOffset;
+let accessSize = WordAccess;
+let AsmVariantName = "NonParsable";
+let mayStore = 1;
+let DecoderNamespace = "SUBINSN_S2";
+}
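+// V6_MAP_*: codegen-only HVX pseudos (TypeMAPPING) for the unsigned-element vcmp.eq forms; each has a _128B variant for the 128-byte vector register classes.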
+def V6_MAP_equb : HInst<
+(outs VecPredRegs:$Qd4),
+(ins VectorRegs:$Vu32, VectorRegs:$Vv32),
+"$Qd4 = vcmp.eq($Vu32.ub,$Vv32.ub)",
+PSEUDO, TypeMAPPING>, Requires<[HasV60T,UseHVX]> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_MAP_equb_128B : HInst<
+(outs VecPredRegs128B:$Qd4),
+(ins VectorRegs128B:$Vu32, VectorRegs128B:$Vv32),
+"$Qd4 = vcmp.eq($Vu32.ub,$Vv32.ub)",
+PSEUDO, TypeMAPPING>, Requires<[HasV60T,UseHVX]> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_MAP_equb_and : HInst<
+(outs VecPredRegs:$Qx4),
+(ins VecPredRegs:$Qx4in, VectorRegs:$Vu32, VectorRegs:$Vv32),
+"$Qx4 &= vcmp.eq($Vu32.ub,$Vv32.ub)",
+PSEUDO, TypeMAPPING>, Requires<[HasV60T,UseHVX]> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+let Constraints = "$Qx4 = $Qx4in";
+}
+def V6_MAP_equb_and_128B : HInst<
+(outs VecPredRegs128B:$Qx4),
+(ins VecPredRegs128B:$Qx4in, VectorRegs128B:$Vu32, VectorRegs128B:$Vv32),
+"$Qx4 &= vcmp.eq($Vu32.ub,$Vv32.ub)",
+PSEUDO, TypeMAPPING>, Requires<[HasV60T,UseHVX]> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+let Constraints = "$Qx4 = $Qx4in";
+}
+def V6_MAP_equb_ior : HInst<
+(outs VecPredRegs:$Qx4),
+(ins VecPredRegs:$Qx4in, VectorRegs:$Vu32, VectorRegs:$Vv32),
+"$Qx4 |= vcmp.eq($Vu32.ub,$Vv32.ub)",
+PSEUDO, TypeMAPPING>, Requires<[HasV60T,UseHVX]> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isAccumulator = 1;
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+let Constraints = "$Qx4 = $Qx4in";
+}
+def V6_MAP_equb_ior_128B : HInst<
+(outs VecPredRegs128B:$Qx4),
+(ins VecPredRegs128B:$Qx4in, VectorRegs128B:$Vu32, VectorRegs128B:$Vv32),
+"$Qx4 |= vcmp.eq($Vu32.ub,$Vv32.ub)",
+PSEUDO, TypeMAPPING>, Requires<[HasV60T,UseHVX]> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isAccumulator = 1;
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+let Constraints = "$Qx4 = $Qx4in";
+}
+def V6_MAP_equb_xor : HInst<
+(outs VecPredRegs:$Qx4),
+(ins VecPredRegs:$Qx4in, VectorRegs:$Vu32, VectorRegs:$Vv32),
+"$Qx4 ^= vcmp.eq($Vu32.ub,$Vv32.ub)",
+PSEUDO, TypeMAPPING>, Requires<[HasV60T,UseHVX]> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+let Constraints = "$Qx4 = $Qx4in";
+}
+def V6_MAP_equb_xor_128B : HInst<
+(outs VecPredRegs128B:$Qx4),
+(ins VecPredRegs128B:$Qx4in, VectorRegs128B:$Vu32, VectorRegs128B:$Vv32),
+"$Qx4 ^= vcmp.eq($Vu32.ub,$Vv32.ub)",
+PSEUDO, TypeMAPPING>, Requires<[HasV60T,UseHVX]> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+let Constraints = "$Qx4 = $Qx4in";
+}
+def V6_MAP_equh : HInst<
+(outs VecPredRegs:$Qd4),
+(ins VectorRegs:$Vu32, VectorRegs:$Vv32),
+"$Qd4 = vcmp.eq($Vu32.uh,$Vv32.uh)",
+PSEUDO, TypeMAPPING>, Requires<[HasV60T,UseHVX]> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_MAP_equh_128B : HInst<
+(outs VecPredRegs128B:$Qd4),
+(ins VectorRegs128B:$Vu32, VectorRegs128B:$Vv32),
+"$Qd4 = vcmp.eq($Vu32.uh,$Vv32.uh)",
+PSEUDO, TypeMAPPING>, Requires<[HasV60T,UseHVX]> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_MAP_equh_and : HInst<
+(outs VecPredRegs:$Qx4),
+(ins VecPredRegs:$Qx4in, VectorRegs:$Vu32, VectorRegs:$Vv32),
+"$Qx4 &= vcmp.eq($Vu32.uh,$Vv32.uh)",
+PSEUDO, TypeMAPPING>, Requires<[HasV60T,UseHVX]> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+let Constraints = "$Qx4 = $Qx4in";
+}
+def V6_MAP_equh_and_128B : HInst<
+(outs VecPredRegs128B:$Qx4),
+(ins VecPredRegs128B:$Qx4in, VectorRegs128B:$Vu32, VectorRegs128B:$Vv32),
+"$Qx4 &= vcmp.eq($Vu32.uh,$Vv32.uh)",
+PSEUDO, TypeMAPPING>, Requires<[HasV60T,UseHVX]> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+let Constraints = "$Qx4 = $Qx4in";
+}
+def V6_MAP_equh_ior : HInst<
+(outs VecPredRegs:$Qx4),
+(ins VecPredRegs:$Qx4in, VectorRegs:$Vu32, VectorRegs:$Vv32),
+"$Qx4 |= vcmp.eq($Vu32.uh,$Vv32.uh)",
+PSEUDO, TypeMAPPING>, Requires<[HasV60T,UseHVX]> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isAccumulator = 1;
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+let Constraints = "$Qx4 = $Qx4in";
+}
+def V6_MAP_equh_ior_128B : HInst<
+(outs VecPredRegs128B:$Qx4),
+(ins VecPredRegs128B:$Qx4in, VectorRegs128B:$Vu32, VectorRegs128B:$Vv32),
+"$Qx4 |= vcmp.eq($Vu32.uh,$Vv32.uh)",
+PSEUDO, TypeMAPPING>, Requires<[HasV60T,UseHVX]> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isAccumulator = 1;
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+let Constraints = "$Qx4 = $Qx4in";
+}
+def V6_MAP_equh_xor : HInst<
+(outs VecPredRegs:$Qx4),
+(ins VecPredRegs:$Qx4in, VectorRegs:$Vu32, VectorRegs:$Vv32),
+"$Qx4 ^= vcmp.eq($Vu32.uh,$Vv32.uh)",
+PSEUDO, TypeMAPPING>, Requires<[HasV60T,UseHVX]> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+let Constraints = "$Qx4 = $Qx4in";
+}
+def V6_MAP_equh_xor_128B : HInst<
+(outs VecPredRegs128B:$Qx4),
+(ins VecPredRegs128B:$Qx4in, VectorRegs128B:$Vu32, VectorRegs128B:$Vv32),
+"$Qx4 ^= vcmp.eq($Vu32.uh,$Vv32.uh)",
+PSEUDO, TypeMAPPING>, Requires<[HasV60T,UseHVX]> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+let Constraints = "$Qx4 = $Qx4in";
+}
+def V6_MAP_equw : HInst<
+(outs VecPredRegs:$Qd4),
+(ins VectorRegs:$Vu32, VectorRegs:$Vv32),
+"$Qd4 = vcmp.eq($Vu32.uw,$Vv32.uw)",
+PSEUDO, TypeMAPPING>, Requires<[HasV60T,UseHVX]> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_MAP_equw_128B : HInst<
+(outs VecPredRegs128B:$Qd4),
+(ins VectorRegs128B:$Vu32, VectorRegs128B:$Vv32),
+"$Qd4 = vcmp.eq($Vu32.uw,$Vv32.uw)",
+PSEUDO, TypeMAPPING>, Requires<[HasV60T,UseHVX]> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_MAP_equw_and : HInst<
+(outs VecPredRegs:$Qx4),
+(ins VecPredRegs:$Qx4in, VectorRegs:$Vu32, VectorRegs:$Vv32),
+"$Qx4 &= vcmp.eq($Vu32.uw,$Vv32.uw)",
+PSEUDO, TypeMAPPING>, Requires<[HasV60T,UseHVX]> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+let Constraints = "$Qx4 = $Qx4in";
+}
+def V6_MAP_equw_and_128B : HInst<
+(outs VecPredRegs128B:$Qx4),
+(ins VecPredRegs128B:$Qx4in, VectorRegs128B:$Vu32, VectorRegs128B:$Vv32),
+"$Qx4 &= vcmp.eq($Vu32.uw,$Vv32.uw)",
+PSEUDO, TypeMAPPING>, Requires<[HasV60T,UseHVX]> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+let Constraints = "$Qx4 = $Qx4in";
+}
+def V6_MAP_equw_ior : HInst<
+(outs VecPredRegs:$Qx4),
+(ins VecPredRegs:$Qx4in, VectorRegs:$Vu32, VectorRegs:$Vv32),
+"$Qx4 |= vcmp.eq($Vu32.uw,$Vv32.uw)",
+PSEUDO, TypeMAPPING>, Requires<[HasV60T,UseHVX]> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isAccumulator = 1;
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+let Constraints = "$Qx4 = $Qx4in";
+}
+def V6_MAP_equw_ior_128B : HInst<
+(outs VecPredRegs128B:$Qx4),
+(ins VecPredRegs128B:$Qx4in, VectorRegs128B:$Vu32, VectorRegs128B:$Vv32),
+"$Qx4 |= vcmp.eq($Vu32.uw,$Vv32.uw)",
+PSEUDO, TypeMAPPING>, Requires<[HasV60T,UseHVX]> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isAccumulator = 1;
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+let Constraints = "$Qx4 = $Qx4in";
+}
+def V6_MAP_equw_xor : HInst<
+(outs VecPredRegs:$Qx4),
+(ins VecPredRegs:$Qx4in, VectorRegs:$Vu32, VectorRegs:$Vv32),
+"$Qx4 ^= vcmp.eq($Vu32.uw,$Vv32.uw)",
+PSEUDO, TypeMAPPING>, Requires<[HasV60T,UseHVX]> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+let Constraints = "$Qx4 = $Qx4in";
+}
+def V6_MAP_equw_xor_128B : HInst<
+(outs VecPredRegs128B:$Qx4),
+(ins VecPredRegs128B:$Qx4in, VectorRegs128B:$Vu32, VectorRegs128B:$Vv32),
+"$Qx4 ^= vcmp.eq($Vu32.uw,$Vv32.uw)",
+PSEUDO, TypeMAPPING>, Requires<[HasV60T,UseHVX]> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+let Constraints = "$Qx4 = $Qx4in";
+}
+def V6_extractw : HInst<
+(outs IntRegs:$Rd32),
+(ins VectorRegs:$Vu32, IntRegs:$Rs32),
+"$Rd32 = vextract($Vu32,$Rs32)",
+LD_tc_ld_SLOT0, TypeLD>, Enc_16601956, Requires<[HasV60T,UseHVX]> {
+let Inst{7-5} = 0b001;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b10010010000;
+let hasNewValue = 1;
+let opNewValue = 0;
+let isSolo = 1;
+let mayLoad = 1;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_extractw_128B : HInst<
+(outs IntRegs:$Rd32),
+(ins VectorRegs128B:$Vu32, IntRegs:$Rs32),
+"$Rd32 = vextract($Vu32,$Rs32)",
+LD_tc_ld_SLOT0, TypeLD>, Enc_16601956, Requires<[HasV60T,UseHVX]> {
+let Inst{7-5} = 0b001;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b10010010000;
+let hasNewValue = 1;
+let opNewValue = 0;
+let isSolo = 1;
+let mayLoad = 1;
+let DecoderNamespace = "EXT_mmvec";
+let isCodeGenOnly = 1;
+}
+def V6_extractw_alt : HInst<
+(outs IntRegs:$Rd32),
+(ins VectorRegs:$Vu32, IntRegs:$Rs32),
+"$Rd32.w = vextract($Vu32,$Rs32)",
+PSEUDO, TypeMAPPING>, Requires<[HasV60T,UseHVX]> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_extractw_alt_128B : HInst<
+(outs IntRegs:$Rd32),
+(ins VectorRegs128B:$Vu32, IntRegs:$Rs32),
+"$Rd32.w = vextract($Vu32,$Rs32)",
+PSEUDO, TypeMAPPING>, Requires<[HasV60T,UseHVX]> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+}
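+// V6_hi/V6_lo: pseudos that select the high or low vector register of a vector pair.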
+def V6_hi : HInst<
+(outs VectorRegs:$Vd32),
+(ins VecDblRegs:$Vss32),
+"$Vd32 = hi($Vss32)",
+CVI_VA, TypeCVI_VA>, Requires<[HasV60T,UseHVX]> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isPseudo = 1;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_hi_128B : HInst<
+(outs VectorRegs128B:$Vd32),
+(ins VecDblRegs128B:$Vss32),
+"$Vd32 = hi($Vss32)",
+CVI_VA, TypeCVI_VA>, Requires<[HasV60T,UseHVX]> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isPseudo = 1;
+let DecoderNamespace = "EXT_mmvec";
+let isCodeGenOnly = 1;
+}
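+// V6_ld0/V6_ldnt0/V6_ldu0: codegen-only pseudo vmem/vmemu loads addressed by a single register, with no offset.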
+def V6_ld0 : HInst<
+(outs VectorRegs:$Vd32),
+(ins IntRegs:$Rt32),
+"$Vd32 = vmem($Rt32)",
+PSEUDO, TypeCVI_VM_LD>, Requires<[HasV60T,UseHVX]> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_ld0_128B : HInst<
+(outs VectorRegs128B:$Vd32),
+(ins IntRegs:$Rt32),
+"$Vd32 = vmem($Rt32)",
+PSEUDO, TypeCVI_VM_LD>, Requires<[HasV60T,UseHVX]> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_ldnt0 : HInst<
+(outs VectorRegs:$Vd32),
+(ins IntRegs:$Rt32),
+"$Vd32 = vmem($Rt32):nt",
+PSEUDO, TypeCVI_VM_LD>, Requires<[HasV60T,UseHVX]> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_ldnt0_128B : HInst<
+(outs VectorRegs128B:$Vd32),
+(ins IntRegs:$Rt32),
+"$Vd32 = vmem($Rt32):nt",
+PSEUDO, TypeCVI_VM_LD>, Requires<[HasV60T,UseHVX]> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_ldu0 : HInst<
+(outs VectorRegs:$Vd32),
+(ins IntRegs:$Rt32),
+"$Vd32 = vmemu($Rt32)",
+PSEUDO, TypeCVI_VM_LD>, Requires<[HasV60T,UseHVX]> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_ldu0_128B : HInst<
+(outs VectorRegs128B:$Vd32),
+(ins IntRegs:$Rt32),
+"$Vd32 = vmemu($Rt32)",
+PSEUDO, TypeCVI_VM_LD>, Requires<[HasV60T,UseHVX]> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_lo : HInst<
+(outs VectorRegs:$Vd32),
+(ins VecDblRegs:$Vss32),
+"$Vd32 = lo($Vss32)",
+CVI_VA, TypeCVI_VA>, Requires<[HasV60T,UseHVX]> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isPseudo = 1;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_lo_128B : HInst<
+(outs VectorRegs128B:$Vd32),
+(ins VecDblRegs128B:$Vss32),
+"$Vd32 = lo($Vss32)",
+CVI_VA, TypeCVI_VA>, Requires<[HasV60T,UseHVX]> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isPseudo = 1;
+let DecoderNamespace = "EXT_mmvec";
+let isCodeGenOnly = 1;
+}
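+// V6_lvsplat*: broadcast a scalar register value into every lane of an HVX vector.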
+def V6_lvsplatb : HInst<
+(outs VectorRegs:$Vd32),
+(ins IntRegs:$Rt32),
+"$Vd32.b = vsplat($Rt32)",
+CVI_VX, TypeCVI_VX>, Enc_9768377, Requires<[HasV62T,UseHVX]> {
+let Inst{13-5} = 0b000000010;
+let Inst{31-21} = 0b00011001110;
+let hasNewValue = 1;
+let opNewValue = 0;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_lvsplatb_128B : HInst<
+(outs VectorRegs128B:$Vd32),
+(ins IntRegs:$Rt32),
+"$Vd32.b = vsplat($Rt32)",
+CVI_VX, TypeCVI_VX>, Enc_9768377, Requires<[HasV62T,UseHVX]> {
+let Inst{13-5} = 0b000000010;
+let Inst{31-21} = 0b00011001110;
+let hasNewValue = 1;
+let opNewValue = 0;
+let DecoderNamespace = "EXT_mmvec";
+let isCodeGenOnly = 1;
+}
+def V6_lvsplath : HInst<
+(outs VectorRegs:$Vd32),
+(ins IntRegs:$Rt32),
+"$Vd32.h = vsplat($Rt32)",
+CVI_VX, TypeCVI_VX>, Enc_9768377, Requires<[HasV62T,UseHVX]> {
+let Inst{13-5} = 0b000000001;
+let Inst{31-21} = 0b00011001110;
+let hasNewValue = 1;
+let opNewValue = 0;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_lvsplath_128B : HInst<
+(outs VectorRegs128B:$Vd32),
+(ins IntRegs:$Rt32),
+"$Vd32.h = vsplat($Rt32)",
+CVI_VX, TypeCVI_VX>, Enc_9768377, Requires<[HasV62T,UseHVX]> {
+let Inst{13-5} = 0b000000001;
+let Inst{31-21} = 0b00011001110;
+let hasNewValue = 1;
+let opNewValue = 0;
+let DecoderNamespace = "EXT_mmvec";
+let isCodeGenOnly = 1;
+}
+def V6_lvsplatw : HInst<
+(outs VectorRegs:$Vd32),
+(ins IntRegs:$Rt32),
+"$Vd32 = vsplat($Rt32)",
+CVI_VX_LATE, TypeCVI_VX>, Enc_9768377, Requires<[HasV60T,UseHVX]> {
+let Inst{13-5} = 0b000000001;
+let Inst{31-21} = 0b00011001101;
+let hasNewValue = 1;
+let opNewValue = 0;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_lvsplatw_128B : HInst<
+(outs VectorRegs128B:$Vd32),
+(ins IntRegs:$Rt32),
+"$Vd32 = vsplat($Rt32)",
+CVI_VX_LATE, TypeCVI_VX>, Enc_9768377, Requires<[HasV60T,UseHVX]> {
+let Inst{13-5} = 0b000000001;
+let Inst{31-21} = 0b00011001101;
+let hasNewValue = 1;
+let opNewValue = 0;
+let DecoderNamespace = "EXT_mmvec";
+let isCodeGenOnly = 1;
+}
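+// V6_pred_*: logical operations on HVX predicate (Q) registers.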
+def V6_pred_and : HInst<
+(outs VecPredRegs:$Qd4),
+(ins VecPredRegs:$Qs4, VecPredRegs:$Qt4),
+"$Qd4 = and($Qs4,$Qt4)",
+CVI_VA_DV, TypeCVI_VA_DV>, Enc_6091631, Requires<[HasV60T,UseHVX]> {
+let Inst{7-2} = 0b000000;
+let Inst{13-10} = 0b0000;
+let Inst{21-16} = 0b000011;
+let Inst{31-24} = 0b00011110;
+let hasNewValue = 1;
+let opNewValue = 0;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_pred_and_128B : HInst<
+(outs VecPredRegs128B:$Qd4),
+(ins VecPredRegs128B:$Qs4, VecPredRegs128B:$Qt4),
+"$Qd4 = and($Qs4,$Qt4)",
+CVI_VA_DV, TypeCVI_VA_DV>, Enc_6091631, Requires<[HasV60T,UseHVX]> {
+let Inst{7-2} = 0b000000;
+let Inst{13-10} = 0b0000;
+let Inst{21-16} = 0b000011;
+let Inst{31-24} = 0b00011110;
+let hasNewValue = 1;
+let opNewValue = 0;
+let DecoderNamespace = "EXT_mmvec";
+let isCodeGenOnly = 1;
+}
+def V6_pred_and_n : HInst<
+(outs VecPredRegs:$Qd4),
+(ins VecPredRegs:$Qs4, VecPredRegs:$Qt4),
+"$Qd4 = and($Qs4,!$Qt4)",
+CVI_VA_DV, TypeCVI_VA_DV>, Enc_6091631, Requires<[HasV60T,UseHVX]> {
+let Inst{7-2} = 0b000101;
+let Inst{13-10} = 0b0000;
+let Inst{21-16} = 0b000011;
+let Inst{31-24} = 0b00011110;
+let hasNewValue = 1;
+let opNewValue = 0;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_pred_and_n_128B : HInst<
+(outs VecPredRegs128B:$Qd4),
+(ins VecPredRegs128B:$Qs4, VecPredRegs128B:$Qt4),
+"$Qd4 = and($Qs4,!$Qt4)",
+CVI_VA_DV, TypeCVI_VA_DV>, Enc_6091631, Requires<[HasV60T,UseHVX]> {
+let Inst{7-2} = 0b000101;
+let Inst{13-10} = 0b0000;
+let Inst{21-16} = 0b000011;
+let Inst{31-24} = 0b00011110;
+let hasNewValue = 1;
+let opNewValue = 0;
+let DecoderNamespace = "EXT_mmvec";
+let isCodeGenOnly = 1;
+}
+def V6_pred_not : HInst<
+(outs VecPredRegs:$Qd4),
+(ins VecPredRegs:$Qs4),
+"$Qd4 = not($Qs4)",
+CVI_VA, TypeCVI_VA>, Enc_4897205, Requires<[HasV60T,UseHVX]> {
+let Inst{7-2} = 0b000010;
+let Inst{13-10} = 0b0000;
+let Inst{31-16} = 0b0001111000000011;
+let hasNewValue = 1;
+let opNewValue = 0;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_pred_not_128B : HInst<
+(outs VecPredRegs128B:$Qd4),
+(ins VecPredRegs128B:$Qs4),
+"$Qd4 = not($Qs4)",
+CVI_VA, TypeCVI_VA>, Enc_4897205, Requires<[HasV60T,UseHVX]> {
+let Inst{7-2} = 0b000010;
+let Inst{13-10} = 0b0000;
+let Inst{31-16} = 0b0001111000000011;
+let hasNewValue = 1;
+let opNewValue = 0;
+let DecoderNamespace = "EXT_mmvec";
+let isCodeGenOnly = 1;
+}
+def V6_pred_or : HInst<
+(outs VecPredRegs:$Qd4),
+(ins VecPredRegs:$Qs4, VecPredRegs:$Qt4),
+"$Qd4 = or($Qs4,$Qt4)",
+CVI_VA_DV, TypeCVI_VA_DV>, Enc_6091631, Requires<[HasV60T,UseHVX]> {
+let Inst{7-2} = 0b000001;
+let Inst{13-10} = 0b0000;
+let Inst{21-16} = 0b000011;
+let Inst{31-24} = 0b00011110;
+let hasNewValue = 1;
+let opNewValue = 0;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_pred_or_128B : HInst<
+(outs VecPredRegs128B:$Qd4),
+(ins VecPredRegs128B:$Qs4, VecPredRegs128B:$Qt4),
+"$Qd4 = or($Qs4,$Qt4)",
+CVI_VA_DV, TypeCVI_VA_DV>, Enc_6091631, Requires<[HasV60T,UseHVX]> {
+let Inst{7-2} = 0b000001;
+let Inst{13-10} = 0b0000;
+let Inst{21-16} = 0b000011;
+let Inst{31-24} = 0b00011110;
+let hasNewValue = 1;
+let opNewValue = 0;
+let DecoderNamespace = "EXT_mmvec";
+let isCodeGenOnly = 1;
+}
+def V6_pred_or_n : HInst<
+(outs VecPredRegs:$Qd4),
+(ins VecPredRegs:$Qs4, VecPredRegs:$Qt4),
+"$Qd4 = or($Qs4,!$Qt4)",
+CVI_VA_DV, TypeCVI_VA_DV>, Enc_6091631, Requires<[HasV60T,UseHVX]> {
+let Inst{7-2} = 0b000100;
+let Inst{13-10} = 0b0000;
+let Inst{21-16} = 0b000011;
+let Inst{31-24} = 0b00011110;
+let hasNewValue = 1;
+let opNewValue = 0;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_pred_or_n_128B : HInst<
+(outs VecPredRegs128B:$Qd4),
+(ins VecPredRegs128B:$Qs4, VecPredRegs128B:$Qt4),
+"$Qd4 = or($Qs4,!$Qt4)",
+CVI_VA_DV, TypeCVI_VA_DV>, Enc_6091631, Requires<[HasV60T,UseHVX]> {
+let Inst{7-2} = 0b000100;
+let Inst{13-10} = 0b0000;
+let Inst{21-16} = 0b000011;
+let Inst{31-24} = 0b00011110;
+let hasNewValue = 1;
+let opNewValue = 0;
+let DecoderNamespace = "EXT_mmvec";
+let isCodeGenOnly = 1;
+}
+def V6_pred_scalar2 : HInst<
+(outs VecPredRegs:$Qd4),
+(ins IntRegs:$Rt32),
+"$Qd4 = vsetq($Rt32)",
+CVI_VP_LONG, TypeCVI_VP>, Enc_12781442, Requires<[HasV60T,UseHVX]> {
+let Inst{13-2} = 0b000000010001;
+let Inst{31-21} = 0b00011001101;
+let hasNewValue = 1;
+let opNewValue = 0;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_pred_scalar2_128B : HInst<
+(outs VecPredRegs128B:$Qd4),
+(ins IntRegs:$Rt32),
+"$Qd4 = vsetq($Rt32)",
+CVI_VP_LONG, TypeCVI_VP>, Enc_12781442, Requires<[HasV60T,UseHVX]> {
+let Inst{13-2} = 0b000000010001;
+let Inst{31-21} = 0b00011001101;
+let hasNewValue = 1;
+let opNewValue = 0;
+let DecoderNamespace = "EXT_mmvec";
+let isCodeGenOnly = 1;
+}
+def V6_pred_scalar2v2 : HInst<
+(outs VecPredRegs:$Qd4),
+(ins IntRegs:$Rt32),
+"$Qd4 = vsetq2($Rt32)",
+CVI_VP_LONG, TypeCVI_VP>, Enc_12781442, Requires<[HasV62T,UseHVX]> {
+let Inst{13-2} = 0b000000010011;
+let Inst{31-21} = 0b00011001101;
+let hasNewValue = 1;
+let opNewValue = 0;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_pred_scalar2v2_128B : HInst<
+(outs VecPredRegs128B:$Qd4),
+(ins IntRegs:$Rt32),
+"$Qd4 = vsetq2($Rt32)",
+CVI_VP_LONG, TypeCVI_VP>, Enc_12781442, Requires<[HasV62T,UseHVX]> {
+let Inst{13-2} = 0b000000010011;
+let Inst{31-21} = 0b00011001101;
+let hasNewValue = 1;
+let opNewValue = 0;
+let DecoderNamespace = "EXT_mmvec";
+let isCodeGenOnly = 1;
+}
+def V6_pred_xor : HInst<
+(outs VecPredRegs:$Qd4),
+(ins VecPredRegs:$Qs4, VecPredRegs:$Qt4),
+"$Qd4 = xor($Qs4,$Qt4)",
+CVI_VA_DV, TypeCVI_VA_DV>, Enc_6091631, Requires<[HasV60T,UseHVX]> {
+let Inst{7-2} = 0b000011;
+let Inst{13-10} = 0b0000;
+let Inst{21-16} = 0b000011;
+let Inst{31-24} = 0b00011110;
+let hasNewValue = 1;
+let opNewValue = 0;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_pred_xor_128B : HInst<
+(outs VecPredRegs128B:$Qd4),
+(ins VecPredRegs128B:$Qs4, VecPredRegs128B:$Qt4),
+"$Qd4 = xor($Qs4,$Qt4)",
+CVI_VA_DV, TypeCVI_VA_DV>, Enc_6091631, Requires<[HasV60T,UseHVX]> {
+let Inst{7-2} = 0b000011;
+let Inst{13-10} = 0b0000;
+let Inst{21-16} = 0b000011;
+let Inst{31-24} = 0b00011110;
+let hasNewValue = 1;
+let opNewValue = 0;
+let DecoderNamespace = "EXT_mmvec";
+let isCodeGenOnly = 1;
+}
+def V6_shuffeqh : HInst<
+(outs VecPredRegs:$Qd4),
+(ins VecPredRegs:$Qs4, VecPredRegs:$Qt4),
+"$Qd4.b = vshuffe($Qs4.h,$Qt4.h)",
+CVI_VA_DV, TypeCVI_VA_DV>, Enc_6091631, Requires<[HasV62T,UseHVX]> {
+let Inst{7-2} = 0b000110;
+let Inst{13-10} = 0b0000;
+let Inst{21-16} = 0b000011;
+let Inst{31-24} = 0b00011110;
+let hasNewValue = 1;
+let opNewValue = 0;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_shuffeqh_128B : HInst<
+(outs VecPredRegs128B:$Qd4),
+(ins VecPredRegs128B:$Qs4, VecPredRegs128B:$Qt4),
+"$Qd4.b = vshuffe($Qs4.h,$Qt4.h)",
+CVI_VA_DV, TypeCVI_VA_DV>, Enc_6091631, Requires<[HasV62T,UseHVX]> {
+let Inst{7-2} = 0b000110;
+let Inst{13-10} = 0b0000;
+let Inst{21-16} = 0b000011;
+let Inst{31-24} = 0b00011110;
+let hasNewValue = 1;
+let opNewValue = 0;
+let DecoderNamespace = "EXT_mmvec";
+let isCodeGenOnly = 1;
+}
+def V6_shuffeqw : HInst<
+(outs VecPredRegs:$Qd4),
+(ins VecPredRegs:$Qs4, VecPredRegs:$Qt4),
+"$Qd4.h = vshuffe($Qs4.w,$Qt4.w)",
+CVI_VA_DV, TypeCVI_VA_DV>, Enc_6091631, Requires<[HasV62T,UseHVX]> {
+let Inst{7-2} = 0b000111;
+let Inst{13-10} = 0b0000;
+let Inst{21-16} = 0b000011;
+let Inst{31-24} = 0b00011110;
+let hasNewValue = 1;
+let opNewValue = 0;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_shuffeqw_128B : HInst<
+(outs VecPredRegs128B:$Qd4),
+(ins VecPredRegs128B:$Qs4, VecPredRegs128B:$Qt4),
+"$Qd4.h = vshuffe($Qs4.w,$Qt4.w)",
+CVI_VA_DV, TypeCVI_VA_DV>, Enc_6091631, Requires<[HasV62T,UseHVX]> {
+let Inst{7-2} = 0b000111;
+let Inst{13-10} = 0b0000;
+let Inst{21-16} = 0b000011;
+let Inst{31-24} = 0b00011110;
+let hasNewValue = 1;
+let opNewValue = 0;
+let DecoderNamespace = "EXT_mmvec";
+let isCodeGenOnly = 1;
+}
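+// V6_st*: codegen-only pseudo vector stores: new-value (.new), predicated ($Pv4/$Qv4), non-temporal (:nt), and unaligned (vmemu) forms.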
+def V6_st0 : HInst<
+(outs),
+(ins IntRegs:$Rt32, VectorRegs:$Vs32),
+"vmem($Rt32) = $Vs32",
+PSEUDO, TypeCVI_VM_ST>, Requires<[HasV60T,UseHVX]> {
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_st0_128B : HInst<
+(outs),
+(ins IntRegs:$Rt32, VectorRegs128B:$Vs32),
+"vmem($Rt32) = $Vs32",
+PSEUDO, TypeCVI_VM_ST>, Requires<[HasV60T,UseHVX]> {
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_stn0 : HInst<
+(outs),
+(ins IntRegs:$Rt32, VectorRegs:$Os8),
+"vmem($Rt32) = $Os8.new",
+PSEUDO, TypeCVI_VM_ST>, Requires<[HasV60T,UseHVX]> {
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+let opNewValue = 1;
+}
+def V6_stn0_128B : HInst<
+(outs),
+(ins IntRegs:$Rt32, VectorRegs128B:$Os8),
+"vmem($Rt32) = $Os8.new",
+PSEUDO, TypeCVI_VM_ST>, Requires<[HasV60T,UseHVX]> {
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+let opNewValue = 1;
+}
+def V6_stnnt0 : HInst<
+(outs),
+(ins IntRegs:$Rt32, VectorRegs:$Os8),
+"vmem($Rt32):nt = $Os8.new",
+PSEUDO, TypeCVI_VM_ST>, Requires<[HasV60T,UseHVX]> {
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+let opNewValue = 1;
+}
+def V6_stnnt0_128B : HInst<
+(outs),
+(ins IntRegs:$Rt32, VectorRegs128B:$Os8),
+"vmem($Rt32):nt = $Os8.new",
+PSEUDO, TypeCVI_VM_ST>, Requires<[HasV60T,UseHVX]> {
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+let opNewValue = 1;
+}
+def V6_stnp0 : HInst<
+(outs),
+(ins PredRegs:$Pv4, IntRegs:$Rt32, VectorRegs:$Vs32),
+"if (!$Pv4) vmem($Rt32) = $Vs32",
+PSEUDO, TypeCVI_VM_ST>, Requires<[HasV60T,UseHVX]> {
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_stnp0_128B : HInst<
+(outs),
+(ins PredRegs:$Pv4, IntRegs:$Rt32, VectorRegs128B:$Vs32),
+"if (!$Pv4) vmem($Rt32) = $Vs32",
+PSEUDO, TypeCVI_VM_ST>, Requires<[HasV60T,UseHVX]> {
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_stnpnt0 : HInst<
+(outs),
+(ins PredRegs:$Pv4, IntRegs:$Rt32, VectorRegs:$Vs32),
+"if (!$Pv4) vmem($Rt32):nt = $Vs32",
+PSEUDO, TypeCVI_VM_ST>, Requires<[HasV60T,UseHVX]> {
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_stnpnt0_128B : HInst<
+(outs),
+(ins PredRegs:$Pv4, IntRegs:$Rt32, VectorRegs128B:$Vs32),
+"if (!$Pv4) vmem($Rt32):nt = $Vs32",
+PSEUDO, TypeCVI_VM_ST>, Requires<[HasV60T,UseHVX]> {
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_stnq0 : HInst<
+(outs),
+(ins VecPredRegs:$Qv4, IntRegs:$Rt32, VectorRegs:$Vs32),
+"if (!$Qv4) vmem($Rt32) = $Vs32",
+PSEUDO, TypeCVI_VM_ST>, Requires<[HasV60T,UseHVX]> {
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_stnq0_128B : HInst<
+(outs),
+(ins VecPredRegs128B:$Qv4, IntRegs:$Rt32, VectorRegs128B:$Vs32),
+"if (!$Qv4) vmem($Rt32) = $Vs32",
+PSEUDO, TypeCVI_VM_ST>, Requires<[HasV60T,UseHVX]> {
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_stnqnt0 : HInst<
+(outs),
+(ins VecPredRegs:$Qv4, IntRegs:$Rt32, VectorRegs:$Vs32),
+"if (!$Qv4) vmem($Rt32):nt = $Vs32",
+PSEUDO, TypeCVI_VM_ST>, Requires<[HasV60T,UseHVX]> {
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_stnqnt0_128B : HInst<
+(outs),
+(ins VecPredRegs128B:$Qv4, IntRegs:$Rt32, VectorRegs128B:$Vs32),
+"if (!$Qv4) vmem($Rt32):nt = $Vs32",
+PSEUDO, TypeCVI_VM_ST>, Requires<[HasV60T,UseHVX]> {
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_stnt0 : HInst<
+(outs),
+(ins IntRegs:$Rt32, VectorRegs:$Vs32),
+"vmem($Rt32):nt = $Vs32",
+PSEUDO, TypeCVI_VM_ST>, Requires<[HasV60T,UseHVX]> {
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_stnt0_128B : HInst<
+(outs),
+(ins IntRegs:$Rt32, VectorRegs128B:$Vs32),
+"vmem($Rt32):nt = $Vs32",
+PSEUDO, TypeCVI_VM_ST>, Requires<[HasV60T,UseHVX]> {
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_stp0 : HInst<
+(outs),
+(ins PredRegs:$Pv4, IntRegs:$Rt32, VectorRegs:$Vs32),
+"if ($Pv4) vmem($Rt32) = $Vs32",
+PSEUDO, TypeCVI_VM_ST>, Requires<[HasV60T,UseHVX]> {
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_stp0_128B : HInst<
+(outs),
+(ins PredRegs:$Pv4, IntRegs:$Rt32, VectorRegs128B:$Vs32),
+"if ($Pv4) vmem($Rt32) = $Vs32",
+PSEUDO, TypeCVI_VM_ST>, Requires<[HasV60T,UseHVX]> {
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_stpnt0 : HInst<
+(outs),
+(ins PredRegs:$Pv4, IntRegs:$Rt32, VectorRegs:$Vs32),
+"if ($Pv4) vmem($Rt32):nt = $Vs32",
+PSEUDO, TypeCVI_VM_ST>, Requires<[HasV60T,UseHVX]> {
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_stpnt0_128B : HInst<
+(outs),
+(ins PredRegs:$Pv4, IntRegs:$Rt32, VectorRegs128B:$Vs32),
+"if ($Pv4) vmem($Rt32):nt = $Vs32",
+PSEUDO, TypeCVI_VM_ST>, Requires<[HasV60T,UseHVX]> {
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_stq0 : HInst<
+(outs),
+(ins VecPredRegs:$Qv4, IntRegs:$Rt32, VectorRegs:$Vs32),
+"if ($Qv4) vmem($Rt32) = $Vs32",
+PSEUDO, TypeCVI_VM_ST>, Requires<[HasV60T,UseHVX]> {
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_stq0_128B : HInst<
+(outs),
+(ins VecPredRegs128B:$Qv4, IntRegs:$Rt32, VectorRegs128B:$Vs32),
+"if ($Qv4) vmem($Rt32) = $Vs32",
+PSEUDO, TypeCVI_VM_ST>, Requires<[HasV60T,UseHVX]> {
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_stqnt0 : HInst<
+(outs),
+(ins VecPredRegs:$Qv4, IntRegs:$Rt32, VectorRegs:$Vs32),
+"if ($Qv4) vmem($Rt32):nt = $Vs32",
+PSEUDO, TypeCVI_VM_ST>, Requires<[HasV60T,UseHVX]> {
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_stqnt0_128B : HInst<
+(outs),
+(ins VecPredRegs128B:$Qv4, IntRegs:$Rt32, VectorRegs128B:$Vs32),
+"if ($Qv4) vmem($Rt32):nt = $Vs32",
+PSEUDO, TypeCVI_VM_ST>, Requires<[HasV60T,UseHVX]> {
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_stu0 : HInst<
+(outs),
+(ins IntRegs:$Rt32, VectorRegs:$Vs32),
+"vmemu($Rt32) = $Vs32",
+PSEUDO, TypeCVI_VM_ST>, Requires<[HasV60T,UseHVX]> {
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_stu0_128B : HInst<
+(outs),
+(ins IntRegs:$Rt32, VectorRegs128B:$Vs32),
+"vmemu($Rt32) = $Vs32",
+PSEUDO, TypeCVI_VM_ST>, Requires<[HasV60T,UseHVX]> {
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_stunp0 : HInst<
+(outs),
+(ins PredRegs:$Pv4, IntRegs:$Rt32, VectorRegs:$Vs32),
+"if (!$Pv4) vmemu($Rt32) = $Vs32",
+PSEUDO, TypeCVI_VM_ST>, Requires<[HasV60T,UseHVX]> {
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_stunp0_128B : HInst<
+(outs),
+(ins PredRegs:$Pv4, IntRegs:$Rt32, VectorRegs128B:$Vs32),
+"if (!$Pv4) vmemu($Rt32) = $Vs32",
+PSEUDO, TypeCVI_VM_ST>, Requires<[HasV60T,UseHVX]> {
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_stup0 : HInst<
+(outs),
+(ins PredRegs:$Pv4, IntRegs:$Rt32, VectorRegs:$Vs32),
+"if ($Pv4) vmemu($Rt32) = $Vs32",
+PSEUDO, TypeCVI_VM_ST>, Requires<[HasV60T,UseHVX]> {
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_stup0_128B : HInst<
+(outs),
+(ins PredRegs:$Pv4, IntRegs:$Rt32, VectorRegs128B:$Vs32),
+"if ($Pv4) vmemu($Rt32) = $Vs32",
+PSEUDO, TypeCVI_VM_ST>, Requires<[HasV60T,UseHVX]> {
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vL32Ub_ai : HInst<
+(outs VectorRegs:$Vd32),
+(ins IntRegs:$Rt32, s4_0Imm:$Ii),
+"$Vd32 = vmemu($Rt32+#$Ii)",
+CVI_VM_VP_LDU, TypeCVI_VM_VP_LDU>, Enc_1244745, Requires<[HasV60T,UseHVX]> {
+let Inst{7-5} = 0b111;
+let Inst{12-11} = 0b00;
+let Inst{31-21} = 0b00101000000;
+let hasNewValue = 1;
+let opNewValue = 0;
+let addrMode = BaseImmOffset;
+let accessSize = Vector64Access;
+let isCVLoad = 1;
+let mayLoad = 1;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vL32Ub_ai_128B : HInst<
+(outs VectorRegs128B:$Vd32),
+(ins IntRegs:$Rt32, s4_0Imm:$Ii),
+"$Vd32 = vmemu($Rt32+#$Ii)",
+CVI_VM_VP_LDU, TypeCVI_VM_VP_LDU>, Enc_8437395, Requires<[HasV60T,UseHVX]> {
+let Inst{7-5} = 0b111;
+let Inst{12-11} = 0b00;
+let Inst{31-21} = 0b00101000000;
+let hasNewValue = 1;
+let opNewValue = 0;
+let addrMode = BaseImmOffset;
+let accessSize = Vector128Access;
+let isCVLoad = 1;
+let mayLoad = 1;
+let DecoderNamespace = "EXT_mmvec";
+let isCodeGenOnly = 1;
+}
+def V6_vL32Ub_pi : HInst<
+(outs VectorRegs:$Vd32, IntRegs:$Rx32),
+(ins IntRegs:$Rx32in, s3_0Imm:$Ii),
+"$Vd32 = vmemu($Rx32++#$Ii)",
+CVI_VM_VP_LDU, TypeCVI_VM_VP_LDU>, Enc_10039393, Requires<[HasV60T,UseHVX]> {
+let Inst{7-5} = 0b111;
+let Inst{13-11} = 0b000;
+let Inst{31-21} = 0b00101001000;
+let hasNewValue = 1;
+let opNewValue = 0;
+let addrMode = PostInc;
+let accessSize = Vector64Access;
+let isCVLoad = 1;
+let mayLoad = 1;
+let DecoderNamespace = "EXT_mmvec";
+let Constraints = "$Rx32 = $Rx32in";
+}
+def V6_vL32Ub_pi_128B : HInst<
+(outs VectorRegs128B:$Vd32, IntRegs:$Rx32),
+(ins IntRegs:$Rx32in, s3_0Imm:$Ii),
+"$Vd32 = vmemu($Rx32++#$Ii)",
+CVI_VM_VP_LDU, TypeCVI_VM_VP_LDU>, Enc_11039423, Requires<[HasV60T,UseHVX]> {
+let Inst{7-5} = 0b111;
+let Inst{13-11} = 0b000;
+let Inst{31-21} = 0b00101001000;
+let hasNewValue = 1;
+let opNewValue = 0;
+let addrMode = PostInc;
+let accessSize = Vector128Access;
+let isCVLoad = 1;
+let mayLoad = 1;
+let DecoderNamespace = "EXT_mmvec";
+let isCodeGenOnly = 1;
+let Constraints = "$Rx32 = $Rx32in";
+}
+def V6_vL32Ub_ppu : HInst<
+(outs VectorRegs:$Vd32, IntRegs:$Rx32),
+(ins IntRegs:$Rx32in, ModRegs:$Mu2),
+"$Vd32 = vmemu($Rx32++$Mu2)",
+CVI_VM_VP_LDU, TypeCVI_VM_VP_LDU>, Enc_15949334, Requires<[HasV60T,UseHVX]> {
+let Inst{12-5} = 0b00000111;
+let Inst{31-21} = 0b00101011000;
+let hasNewValue = 1;
+let opNewValue = 0;
+let addrMode = PostInc;
+let accessSize = Vector64Access;
+let isCVLoad = 1;
+let mayLoad = 1;
+let DecoderNamespace = "EXT_mmvec";
+let Constraints = "$Rx32 = $Rx32in";
+}
+def V6_vL32Ub_ppu_128B : HInst<
+(outs VectorRegs128B:$Vd32, IntRegs:$Rx32),
+(ins IntRegs:$Rx32in, ModRegs:$Mu2),
+"$Vd32 = vmemu($Rx32++$Mu2)",
+CVI_VM_VP_LDU, TypeCVI_VM_VP_LDU>, Enc_15949334, Requires<[HasV60T,UseHVX]> {
+let Inst{12-5} = 0b00000111;
+let Inst{31-21} = 0b00101011000;
+let hasNewValue = 1;
+let opNewValue = 0;
+let addrMode = PostInc;
+let accessSize = Vector128Access;
+let isCVLoad = 1;
+let mayLoad = 1;
+let DecoderNamespace = "EXT_mmvec";
+let isCodeGenOnly = 1;
+let Constraints = "$Rx32 = $Rx32in";
+}
+def V6_vL32b_ai : HInst<
+(outs VectorRegs:$Vd32),
+(ins IntRegs:$Rt32, s4_0Imm:$Ii),
+"$Vd32 = vmem($Rt32+#$Ii)",
+CVI_VM_LD, TypeCVI_VM_LD>, Enc_1244745, Requires<[HasV60T,UseHVX]> {
+let Inst{7-5} = 0b000;
+let Inst{12-11} = 0b00;
+let Inst{31-21} = 0b00101000000;
+let hasNewValue = 1;
+let opNewValue = 0;
+let addrMode = BaseImmOffset;
+let accessSize = Vector64Access;
+let isCVLoad = 1;
+let mayLoad = 1;
+let isCVLoadable = 1;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vL32b_ai_128B : HInst<
+(outs VectorRegs128B:$Vd32),
+(ins IntRegs:$Rt32, s4_0Imm:$Ii),
+"$Vd32 = vmem($Rt32+#$Ii)",
+CVI_VM_LD, TypeCVI_VM_LD>, Enc_8437395, Requires<[HasV60T,UseHVX]> {
+let Inst{7-5} = 0b000;
+let Inst{12-11} = 0b00;
+let Inst{31-21} = 0b00101000000;
+let hasNewValue = 1;
+let opNewValue = 0;
+let addrMode = BaseImmOffset;
+let accessSize = Vector128Access;
+let isCVLoad = 1;
+let mayLoad = 1;
+let isCVLoadable = 1;
+let DecoderNamespace = "EXT_mmvec";
+let isCodeGenOnly = 1;
+}
+def V6_vL32b_cur_ai : HInst<
+(outs VectorRegs:$Vd32),
+(ins IntRegs:$Rt32, s4_0Imm:$Ii),
+"$Vd32.cur = vmem($Rt32+#$Ii)",
+CVI_VM_LD, TypeCVI_VM_LD>, Enc_1244745, Requires<[HasV60T,UseHVX]> {
+let Inst{7-5} = 0b001;
+let Inst{12-11} = 0b00;
+let Inst{31-21} = 0b00101000000;
+let hasNewValue = 1;
+let opNewValue = 0;
+let addrMode = BaseImmOffset;
+let accessSize = Vector64Access;
+let isCVLoad = 1;
+let mayLoad = 1;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vL32b_cur_ai_128B : HInst<
+(outs VectorRegs128B:$Vd32),
+(ins IntRegs:$Rt32, s4_0Imm:$Ii),
+"$Vd32.cur = vmem($Rt32+#$Ii)",
+CVI_VM_LD, TypeCVI_VM_LD>, Enc_8437395, Requires<[HasV60T,UseHVX]> {
+let Inst{7-5} = 0b001;
+let Inst{12-11} = 0b00;
+let Inst{31-21} = 0b00101000000;
+let hasNewValue = 1;
+let opNewValue = 0;
+let addrMode = BaseImmOffset;
+let accessSize = Vector128Access;
+let isCVLoad = 1;
+let mayLoad = 1;
+let DecoderNamespace = "EXT_mmvec";
+let isCodeGenOnly = 1;
+}
+def V6_vL32b_cur_npred_ai : HInst<
+(outs VectorRegs:$Vd32),
+(ins PredRegs:$Pv4, IntRegs:$Rt32, s4_0Imm:$Ii),
+"if (!$Pv4) $Vd32.cur = vmem($Rt32+#$Ii)",
+CVI_VM_CUR_LD, TypeCVI_VM_CUR_LD>, Enc_13338314, Requires<[HasV62T,UseHVX]> {
+let Inst{7-5} = 0b101;
+let Inst{31-21} = 0b00101000100;
+let isPredicated = 1;
+let isPredicatedFalse = 1;
+let hasNewValue = 1;
+let opNewValue = 0;
+let addrMode = BaseImmOffset;
+let accessSize = Vector64Access;
+let isCVLoad = 1;
+let mayLoad = 1;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vL32b_cur_npred_ai_128B : HInst<
+(outs VectorRegs128B:$Vd32),
+(ins PredRegs:$Pv4, IntRegs:$Rt32, s4_0Imm:$Ii),
+"if (!$Pv4) $Vd32.cur = vmem($Rt32+#$Ii)",
+CVI_VM_CUR_LD, TypeCVI_VM_CUR_LD>, Enc_738356, Requires<[HasV62T,UseHVX]> {
+let Inst{7-5} = 0b101;
+let Inst{31-21} = 0b00101000100;
+let isPredicated = 1;
+let isPredicatedFalse = 1;
+let hasNewValue = 1;
+let opNewValue = 0;
+let addrMode = BaseImmOffset;
+let accessSize = Vector128Access;
+let isCVLoad = 1;
+let mayLoad = 1;
+let DecoderNamespace = "EXT_mmvec";
+let isCodeGenOnly = 1;
+}
+def V6_vL32b_cur_npred_pi : HInst<
+(outs VectorRegs:$Vd32, IntRegs:$Rx32),
+(ins PredRegs:$Pv4, IntRegs:$Rx32in, s3_0Imm:$Ii),
+"if (!$Pv4) $Vd32.cur = vmem($Rx32++#$Ii)",
+CVI_VM_CUR_LD, TypeCVI_VM_CUR_LD>, Enc_14560494, Requires<[HasV62T,UseHVX]> {
+let Inst{7-5} = 0b101;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b00101001100;
+let isPredicated = 1;
+let isPredicatedFalse = 1;
+let hasNewValue = 1;
+let opNewValue = 0;
+let addrMode = PostInc;
+let accessSize = Vector64Access;
+let isCVLoad = 1;
+let mayLoad = 1;
+let DecoderNamespace = "EXT_mmvec";
+let Constraints = "$Rx32 = $Rx32in";
+}
+def V6_vL32b_cur_npred_pi_128B : HInst<
+(outs VectorRegs128B:$Vd32, IntRegs:$Rx32),
+(ins PredRegs:$Pv4, IntRegs:$Rx32in, s3_0Imm:$Ii),
+"if (!$Pv4) $Vd32.cur = vmem($Rx32++#$Ii)",
+CVI_VM_CUR_LD, TypeCVI_VM_CUR_LD>, Enc_15560488, Requires<[HasV62T,UseHVX]> {
+let Inst{7-5} = 0b101;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b00101001100;
+let isPredicated = 1;
+let isPredicatedFalse = 1;
+let hasNewValue = 1;
+let opNewValue = 0;
+let addrMode = PostInc;
+let accessSize = Vector128Access;
+let isCVLoad = 1;
+let mayLoad = 1;
+let DecoderNamespace = "EXT_mmvec";
+let isCodeGenOnly = 1;
+let Constraints = "$Rx32 = $Rx32in";
+}
+def V6_vL32b_cur_npred_ppu : HInst<
+(outs VectorRegs:$Vd32, IntRegs:$Rx32),
+(ins PredRegs:$Pv4, IntRegs:$Rx32in, ModRegs:$Mu2),
+"if (!$Pv4) $Vd32.cur = vmem($Rx32++$Mu2)",
+CVI_VM_CUR_LD, TypeCVI_VM_CUR_LD>, Enc_3158657, Requires<[HasV62T,UseHVX]> {
+let Inst{10-5} = 0b000101;
+let Inst{31-21} = 0b00101011100;
+let isPredicated = 1;
+let isPredicatedFalse = 1;
+let hasNewValue = 1;
+let opNewValue = 0;
+let addrMode = PostInc;
+let accessSize = Vector64Access;
+let isCVLoad = 1;
+let mayLoad = 1;
+let DecoderNamespace = "EXT_mmvec";
+let Constraints = "$Rx32 = $Rx32in";
+}
+def V6_vL32b_cur_npred_ppu_128B : HInst<
+(outs VectorRegs128B:$Vd32, IntRegs:$Rx32),
+(ins PredRegs:$Pv4, IntRegs:$Rx32in, ModRegs:$Mu2),
+"if (!$Pv4) $Vd32.cur = vmem($Rx32++$Mu2)",
+CVI_VM_CUR_LD, TypeCVI_VM_CUR_LD>, Enc_3158657, Requires<[HasV62T,UseHVX]> {
+let Inst{10-5} = 0b000101;
+let Inst{31-21} = 0b00101011100;
+let isPredicated = 1;
+let isPredicatedFalse = 1;
+let hasNewValue = 1;
+let opNewValue = 0;
+let addrMode = PostInc;
+let accessSize = Vector128Access;
+let isCVLoad = 1;
+let mayLoad = 1;
+let DecoderNamespace = "EXT_mmvec";
+let isCodeGenOnly = 1;
+let Constraints = "$Rx32 = $Rx32in";
+}
+def V6_vL32b_cur_pi : HInst<
+(outs VectorRegs:$Vd32, IntRegs:$Rx32),
+(ins IntRegs:$Rx32in, s3_0Imm:$Ii),
+"$Vd32.cur = vmem($Rx32++#$Ii)",
+CVI_VM_LD, TypeCVI_VM_LD>, Enc_10039393, Requires<[HasV60T,UseHVX]> {
+let Inst{7-5} = 0b001;
+let Inst{13-11} = 0b000;
+let Inst{31-21} = 0b00101001000;
+let hasNewValue = 1;
+let opNewValue = 0;
+let addrMode = PostInc;
+let accessSize = Vector64Access;
+let isCVLoad = 1;
+let mayLoad = 1;
+let DecoderNamespace = "EXT_mmvec";
+let Constraints = "$Rx32 = $Rx32in";
+}
+def V6_vL32b_cur_pi_128B : HInst<
+(outs VectorRegs128B:$Vd32, IntRegs:$Rx32),
+(ins IntRegs:$Rx32in, s3_0Imm:$Ii),
+"$Vd32.cur = vmem($Rx32++#$Ii)",
+CVI_VM_LD, TypeCVI_VM_LD>, Enc_11039423, Requires<[HasV60T,UseHVX]> {
+let Inst{7-5} = 0b001;
+let Inst{13-11} = 0b000;
+let Inst{31-21} = 0b00101001000;
+let hasNewValue = 1;
+let opNewValue = 0;
+let addrMode = PostInc;
+let accessSize = Vector128Access;
+let isCVLoad = 1;
+let mayLoad = 1;
+let DecoderNamespace = "EXT_mmvec";
+let isCodeGenOnly = 1;
+let Constraints = "$Rx32 = $Rx32in";
+}
+def V6_vL32b_cur_ppu : HInst<
+(outs VectorRegs:$Vd32, IntRegs:$Rx32),
+(ins IntRegs:$Rx32in, ModRegs:$Mu2),
+"$Vd32.cur = vmem($Rx32++$Mu2)",
+CVI_VM_CUR_LD, TypeCVI_VM_CUR_LD>, Enc_15949334, Requires<[HasV60T,UseHVX]> {
+let Inst{12-5} = 0b00000001;
+let Inst{31-21} = 0b00101011000;
+let hasNewValue = 1;
+let opNewValue = 0;
+let addrMode = PostInc;
+let accessSize = Vector64Access;
+let isCVLoad = 1;
+let mayLoad = 1;
+let DecoderNamespace = "EXT_mmvec";
+let Constraints = "$Rx32 = $Rx32in";
+}
+def V6_vL32b_cur_ppu_128B : HInst<
+(outs VectorRegs128B:$Vd32, IntRegs:$Rx32),
+(ins IntRegs:$Rx32in, ModRegs:$Mu2),
+"$Vd32.cur = vmem($Rx32++$Mu2)",
+CVI_VM_CUR_LD, TypeCVI_VM_CUR_LD>, Enc_15949334, Requires<[HasV60T,UseHVX]> {
+let Inst{12-5} = 0b00000001;
+let Inst{31-21} = 0b00101011000;
+let hasNewValue = 1;
+let opNewValue = 0;
+let addrMode = PostInc;
+let accessSize = Vector128Access;
+let isCVLoad = 1;
+let mayLoad = 1;
+let DecoderNamespace = "EXT_mmvec";
+let isCodeGenOnly = 1;
+let Constraints = "$Rx32 = $Rx32in";
+}
+def V6_vL32b_cur_pred_ai : HInst<
+(outs VectorRegs:$Vd32),
+(ins PredRegs:$Pv4, IntRegs:$Rt32, s4_0Imm:$Ii),
+"if ($Pv4) $Vd32.cur = vmem($Rt32+#$Ii)",
+CVI_VM_CUR_LD, TypeCVI_VM_CUR_LD>, Enc_13338314, Requires<[HasV62T,UseHVX]> {
+let Inst{7-5} = 0b100;
+let Inst{31-21} = 0b00101000100;
+let isPredicated = 1;
+let hasNewValue = 1;
+let opNewValue = 0;
+let addrMode = BaseImmOffset;
+let accessSize = Vector64Access;
+let isCVLoad = 1;
+let mayLoad = 1;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vL32b_cur_pred_ai_128B : HInst<
+(outs VectorRegs128B:$Vd32),
+(ins PredRegs:$Pv4, IntRegs:$Rt32, s4_0Imm:$Ii),
+"if ($Pv4) $Vd32.cur = vmem($Rt32+#$Ii)",
+CVI_VM_CUR_LD, TypeCVI_VM_CUR_LD>, Enc_738356, Requires<[HasV62T,UseHVX]> {
+let Inst{7-5} = 0b100;
+let Inst{31-21} = 0b00101000100;
+let isPredicated = 1;
+let hasNewValue = 1;
+let opNewValue = 0;
+let addrMode = BaseImmOffset;
+let accessSize = Vector128Access;
+let isCVLoad = 1;
+let mayLoad = 1;
+let DecoderNamespace = "EXT_mmvec";
+let isCodeGenOnly = 1;
+}
+def V6_vL32b_cur_pred_pi : HInst<
+(outs VectorRegs:$Vd32, IntRegs:$Rx32),
+(ins PredRegs:$Pv4, IntRegs:$Rx32in, s3_0Imm:$Ii),
+"if ($Pv4) $Vd32.cur = vmem($Rx32++#$Ii)",
+CVI_VM_CUR_LD, TypeCVI_VM_CUR_LD>, Enc_14560494, Requires<[HasV62T,UseHVX]> {
+let Inst{7-5} = 0b100;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b00101001100;
+let isPredicated = 1;
+let hasNewValue = 1;
+let opNewValue = 0;
+let addrMode = PostInc;
+let accessSize = Vector64Access;
+let isCVLoad = 1;
+let mayLoad = 1;
+let DecoderNamespace = "EXT_mmvec";
+let Constraints = "$Rx32 = $Rx32in";
+}
+def V6_vL32b_cur_pred_pi_128B : HInst<
+(outs VectorRegs128B:$Vd32, IntRegs:$Rx32),
+(ins PredRegs:$Pv4, IntRegs:$Rx32in, s3_0Imm:$Ii),
+"if ($Pv4) $Vd32.cur = vmem($Rx32++#$Ii)",
+CVI_VM_CUR_LD, TypeCVI_VM_CUR_LD>, Enc_15560488, Requires<[HasV62T,UseHVX]> {
+let Inst{7-5} = 0b100;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b00101001100;
+let isPredicated = 1;
+let hasNewValue = 1;
+let opNewValue = 0;
+let addrMode = PostInc;
+let accessSize = Vector128Access;
+let isCVLoad = 1;
+let mayLoad = 1;
+let DecoderNamespace = "EXT_mmvec";
+let isCodeGenOnly = 1;
+let Constraints = "$Rx32 = $Rx32in";
+}
+def V6_vL32b_cur_pred_ppu : HInst<
+(outs VectorRegs:$Vd32, IntRegs:$Rx32),
+(ins PredRegs:$Pv4, IntRegs:$Rx32in, ModRegs:$Mu2),
+"if ($Pv4) $Vd32.cur = vmem($Rx32++$Mu2)",
+CVI_VM_CUR_LD, TypeCVI_VM_CUR_LD>, Enc_3158657, Requires<[HasV62T,UseHVX]> {
+let Inst{10-5} = 0b000100;
+let Inst{31-21} = 0b00101011100;
+let isPredicated = 1;
+let hasNewValue = 1;
+let opNewValue = 0;
+let addrMode = PostInc;
+let accessSize = Vector64Access;
+let isCVLoad = 1;
+let mayLoad = 1;
+let DecoderNamespace = "EXT_mmvec";
+let Constraints = "$Rx32 = $Rx32in";
+}
+def V6_vL32b_cur_pred_ppu_128B : HInst<
+(outs VectorRegs128B:$Vd32, IntRegs:$Rx32),
+(ins PredRegs:$Pv4, IntRegs:$Rx32in, ModRegs:$Mu2),
+"if ($Pv4) $Vd32.cur = vmem($Rx32++$Mu2)",
+CVI_VM_CUR_LD, TypeCVI_VM_CUR_LD>, Enc_3158657, Requires<[HasV62T,UseHVX]> {
+let Inst{10-5} = 0b000100;
+let Inst{31-21} = 0b00101011100;
+let isPredicated = 1;
+let hasNewValue = 1;
+let opNewValue = 0;
+let addrMode = PostInc;
+let accessSize = Vector128Access;
+let isCVLoad = 1;
+let mayLoad = 1;
+let DecoderNamespace = "EXT_mmvec";
+let isCodeGenOnly = 1;
+let Constraints = "$Rx32 = $Rx32in";
+}
+def V6_vL32b_npred_ai : HInst<
+(outs VectorRegs:$Vd32),
+(ins PredRegs:$Pv4, IntRegs:$Rt32, s4_0Imm:$Ii),
+"if (!$Pv4) $Vd32 = vmem($Rt32+#$Ii)",
+CVI_VM_LD, TypeCVI_VM_LD>, Enc_13338314, Requires<[HasV62T,UseHVX]> {
+let Inst{7-5} = 0b011;
+let Inst{31-21} = 0b00101000100;
+let isPredicated = 1;
+let isPredicatedFalse = 1;
+let hasNewValue = 1;
+let opNewValue = 0;
+let addrMode = BaseImmOffset;
+let accessSize = Vector64Access;
+let isCVLoad = 1;
+let mayLoad = 1;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vL32b_npred_ai_128B : HInst<
+(outs VectorRegs128B:$Vd32),
+(ins PredRegs:$Pv4, IntRegs:$Rt32, s4_0Imm:$Ii),
+"if (!$Pv4) $Vd32 = vmem($Rt32+#$Ii)",
+CVI_VM_LD, TypeCVI_VM_LD>, Enc_738356, Requires<[HasV62T,UseHVX]> {
+let Inst{7-5} = 0b011;
+let Inst{31-21} = 0b00101000100;
+let isPredicated = 1;
+let isPredicatedFalse = 1;
+let hasNewValue = 1;
+let opNewValue = 0;
+let addrMode = BaseImmOffset;
+let accessSize = Vector128Access;
+let isCVLoad = 1;
+let mayLoad = 1;
+let DecoderNamespace = "EXT_mmvec";
+let isCodeGenOnly = 1;
+}
+def V6_vL32b_npred_pi : HInst<
+(outs VectorRegs:$Vd32, IntRegs:$Rx32),
+(ins PredRegs:$Pv4, IntRegs:$Rx32in, s3_0Imm:$Ii),
+"if (!$Pv4) $Vd32 = vmem($Rx32++#$Ii)",
+CVI_VM_LD, TypeCVI_VM_LD>, Enc_14560494, Requires<[HasV62T,UseHVX]> {
+let Inst{7-5} = 0b011;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b00101001100;
+let isPredicated = 1;
+let isPredicatedFalse = 1;
+let hasNewValue = 1;
+let opNewValue = 0;
+let addrMode = PostInc;
+let accessSize = Vector64Access;
+let isCVLoad = 1;
+let mayLoad = 1;
+let DecoderNamespace = "EXT_mmvec";
+let Constraints = "$Rx32 = $Rx32in";
+}
+def V6_vL32b_npred_pi_128B : HInst<
+(outs VectorRegs128B:$Vd32, IntRegs:$Rx32),
+(ins PredRegs:$Pv4, IntRegs:$Rx32in, s3_0Imm:$Ii),
+"if (!$Pv4) $Vd32 = vmem($Rx32++#$Ii)",
+CVI_VM_LD, TypeCVI_VM_LD>, Enc_15560488, Requires<[HasV62T,UseHVX]> {
+let Inst{7-5} = 0b011;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b00101001100;
+let isPredicated = 1;
+let isPredicatedFalse = 1;
+let hasNewValue = 1;
+let opNewValue = 0;
+let addrMode = PostInc;
+let accessSize = Vector128Access;
+let isCVLoad = 1;
+let mayLoad = 1;
+let DecoderNamespace = "EXT_mmvec";
+let isCodeGenOnly = 1;
+let Constraints = "$Rx32 = $Rx32in";
+}
+def V6_vL32b_npred_ppu : HInst<
+(outs VectorRegs:$Vd32, IntRegs:$Rx32),
+(ins PredRegs:$Pv4, IntRegs:$Rx32in, ModRegs:$Mu2),
+"if (!$Pv4) $Vd32 = vmem($Rx32++$Mu2)",
+CVI_VM_LD, TypeCVI_VM_LD>, Enc_3158657, Requires<[HasV62T,UseHVX]> {
+let Inst{10-5} = 0b000011;
+let Inst{31-21} = 0b00101011100;
+let isPredicated = 1;
+let isPredicatedFalse = 1;
+let hasNewValue = 1;
+let opNewValue = 0;
+let addrMode = PostInc;
+let accessSize = Vector64Access;
+let isCVLoad = 1;
+let mayLoad = 1;
+let DecoderNamespace = "EXT_mmvec";
+let Constraints = "$Rx32 = $Rx32in";
+}
+def V6_vL32b_npred_ppu_128B : HInst<
+(outs VectorRegs128B:$Vd32, IntRegs:$Rx32),
+(ins PredRegs:$Pv4, IntRegs:$Rx32in, ModRegs:$Mu2),
+"if (!$Pv4) $Vd32 = vmem($Rx32++$Mu2)",
+CVI_VM_LD, TypeCVI_VM_LD>, Enc_3158657, Requires<[HasV62T,UseHVX]> {
+let Inst{10-5} = 0b000011;
+let Inst{31-21} = 0b00101011100;
+let isPredicated = 1;
+let isPredicatedFalse = 1;
+let hasNewValue = 1;
+let opNewValue = 0;
+let addrMode = PostInc;
+let accessSize = Vector128Access;
+let isCVLoad = 1;
+let mayLoad = 1;
+let DecoderNamespace = "EXT_mmvec";
+let isCodeGenOnly = 1;
+let Constraints = "$Rx32 = $Rx32in";
+}
+def V6_vL32b_nt_ai : HInst<
+(outs VectorRegs:$Vd32),
+(ins IntRegs:$Rt32, s4_0Imm:$Ii),
+"$Vd32 = vmem($Rt32+#$Ii):nt",
+CVI_VM_LD, TypeCVI_VM_LD>, Enc_1244745, Requires<[HasV60T,UseHVX]> {
+let Inst{7-5} = 0b000;
+let Inst{12-11} = 0b00;
+let Inst{31-21} = 0b00101000010;
+let hasNewValue = 1;
+let opNewValue = 0;
+let addrMode = BaseImmOffset;
+let accessSize = Vector64Access;
+let isCVLoad = 1;
+let isNonTemporal = 1;
+let mayLoad = 1;
+let isCVLoadable = 1;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vL32b_nt_ai_128B : HInst<
+(outs VectorRegs128B:$Vd32),
+(ins IntRegs:$Rt32, s4_0Imm:$Ii),
+"$Vd32 = vmem($Rt32+#$Ii):nt",
+CVI_VM_LD, TypeCVI_VM_LD>, Enc_8437395, Requires<[HasV60T,UseHVX]> {
+let Inst{7-5} = 0b000;
+let Inst{12-11} = 0b00;
+let Inst{31-21} = 0b00101000010;
+let hasNewValue = 1;
+let opNewValue = 0;
+let addrMode = BaseImmOffset;
+let accessSize = Vector128Access;
+let isCVLoad = 1;
+let isNonTemporal = 1;
+let mayLoad = 1;
+let isCVLoadable = 1;
+let DecoderNamespace = "EXT_mmvec";
+let isCodeGenOnly = 1;
+}
+def V6_vL32b_nt_cur_ai : HInst<
+(outs VectorRegs:$Vd32),
+(ins IntRegs:$Rt32, s4_0Imm:$Ii),
+"$Vd32.cur = vmem($Rt32+#$Ii):nt",
+CVI_VM_LD, TypeCVI_VM_LD>, Enc_1244745, Requires<[HasV60T,UseHVX]> {
+let Inst{7-5} = 0b001;
+let Inst{12-11} = 0b00;
+let Inst{31-21} = 0b00101000010;
+let hasNewValue = 1;
+let opNewValue = 0;
+let addrMode = BaseImmOffset;
+let accessSize = Vector64Access;
+let isCVLoad = 1;
+let isNonTemporal = 1;
+let mayLoad = 1;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vL32b_nt_cur_ai_128B : HInst<
+(outs VectorRegs128B:$Vd32),
+(ins IntRegs:$Rt32, s4_0Imm:$Ii),
+"$Vd32.cur = vmem($Rt32+#$Ii):nt",
+CVI_VM_LD, TypeCVI_VM_LD>, Enc_8437395, Requires<[HasV60T,UseHVX]> {
+let Inst{7-5} = 0b001;
+let Inst{12-11} = 0b00;
+let Inst{31-21} = 0b00101000010;
+let hasNewValue = 1;
+let opNewValue = 0;
+let addrMode = BaseImmOffset;
+let accessSize = Vector128Access;
+let isCVLoad = 1;
+let isNonTemporal = 1;
+let mayLoad = 1;
+let DecoderNamespace = "EXT_mmvec";
+let isCodeGenOnly = 1;
+}
+def V6_vL32b_nt_cur_npred_ai : HInst<
+(outs VectorRegs:$Vd32),
+(ins PredRegs:$Pv4, IntRegs:$Rt32, s4_0Imm:$Ii),
+"if (!$Pv4) $Vd32.cur = vmem($Rt32+#$Ii):nt",
+CVI_VM_CUR_LD, TypeCVI_VM_CUR_LD>, Enc_13338314, Requires<[HasV62T,UseHVX]> {
+let Inst{7-5} = 0b101;
+let Inst{31-21} = 0b00101000110;
+let isPredicated = 1;
+let isPredicatedFalse = 1;
+let hasNewValue = 1;
+let opNewValue = 0;
+let addrMode = BaseImmOffset;
+let accessSize = Vector64Access;
+let isCVLoad = 1;
+let isNonTemporal = 1;
+let mayLoad = 1;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vL32b_nt_cur_npred_ai_128B : HInst<
+(outs VectorRegs128B:$Vd32),
+(ins PredRegs:$Pv4, IntRegs:$Rt32, s4_0Imm:$Ii),
+"if (!$Pv4) $Vd32.cur = vmem($Rt32+#$Ii):nt",
+CVI_VM_CUR_LD, TypeCVI_VM_CUR_LD>, Enc_738356, Requires<[HasV62T,UseHVX]> {
+let Inst{7-5} = 0b101;
+let Inst{31-21} = 0b00101000110;
+let isPredicated = 1;
+let isPredicatedFalse = 1;
+let hasNewValue = 1;
+let opNewValue = 0;
+let addrMode = BaseImmOffset;
+let accessSize = Vector128Access;
+let isCVLoad = 1;
+let isNonTemporal = 1;
+let mayLoad = 1;
+let DecoderNamespace = "EXT_mmvec";
+let isCodeGenOnly = 1;
+}
+def V6_vL32b_nt_cur_npred_pi : HInst<
+(outs VectorRegs:$Vd32, IntRegs:$Rx32),
+(ins PredRegs:$Pv4, IntRegs:$Rx32in, s3_0Imm:$Ii),
+"if (!$Pv4) $Vd32.cur = vmem($Rx32++#$Ii):nt",
+CVI_VM_CUR_LD, TypeCVI_VM_CUR_LD>, Enc_14560494, Requires<[HasV62T,UseHVX]> {
+let Inst{7-5} = 0b101;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b00101001110;
+let isPredicated = 1;
+let isPredicatedFalse = 1;
+let hasNewValue = 1;
+let opNewValue = 0;
+let addrMode = PostInc;
+let accessSize = Vector64Access;
+let isCVLoad = 1;
+let isNonTemporal = 1;
+let mayLoad = 1;
+let DecoderNamespace = "EXT_mmvec";
+let Constraints = "$Rx32 = $Rx32in";
+}
+def V6_vL32b_nt_cur_npred_pi_128B : HInst<
+(outs VectorRegs128B:$Vd32, IntRegs:$Rx32),
+(ins PredRegs:$Pv4, IntRegs:$Rx32in, s3_0Imm:$Ii),
+"if (!$Pv4) $Vd32.cur = vmem($Rx32++#$Ii):nt",
+CVI_VM_CUR_LD, TypeCVI_VM_CUR_LD>, Enc_15560488, Requires<[HasV62T,UseHVX]> {
+let Inst{7-5} = 0b101;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b00101001110;
+let isPredicated = 1;
+let isPredicatedFalse = 1;
+let hasNewValue = 1;
+let opNewValue = 0;
+let addrMode = PostInc;
+let accessSize = Vector128Access;
+let isCVLoad = 1;
+let isNonTemporal = 1;
+let mayLoad = 1;
+let DecoderNamespace = "EXT_mmvec";
+let isCodeGenOnly = 1;
+let Constraints = "$Rx32 = $Rx32in";
+}
+def V6_vL32b_nt_cur_npred_ppu : HInst<
+(outs VectorRegs:$Vd32, IntRegs:$Rx32),
+(ins PredRegs:$Pv4, IntRegs:$Rx32in, ModRegs:$Mu2),
+"if (!$Pv4) $Vd32.cur = vmem($Rx32++$Mu2):nt",
+CVI_VM_CUR_LD, TypeCVI_VM_CUR_LD>, Enc_3158657, Requires<[HasV62T,UseHVX]> {
+let Inst{10-5} = 0b000101;
+let Inst{31-21} = 0b00101011110;
+let isPredicated = 1;
+let isPredicatedFalse = 1;
+let hasNewValue = 1;
+let opNewValue = 0;
+let addrMode = PostInc;
+let accessSize = Vector64Access;
+let isCVLoad = 1;
+let isNonTemporal = 1;
+let mayLoad = 1;
+let DecoderNamespace = "EXT_mmvec";
+let Constraints = "$Rx32 = $Rx32in";
+}
+def V6_vL32b_nt_cur_npred_ppu_128B : HInst<
+(outs VectorRegs128B:$Vd32, IntRegs:$Rx32),
+(ins PredRegs:$Pv4, IntRegs:$Rx32in, ModRegs:$Mu2),
+"if (!$Pv4) $Vd32.cur = vmem($Rx32++$Mu2):nt",
+CVI_VM_CUR_LD, TypeCVI_VM_CUR_LD>, Enc_3158657, Requires<[HasV62T,UseHVX]> {
+let Inst{10-5} = 0b000101;
+let Inst{31-21} = 0b00101011110;
+let isPredicated = 1;
+let isPredicatedFalse = 1;
+let hasNewValue = 1;
+let opNewValue = 0;
+let addrMode = PostInc;
+let accessSize = Vector128Access;
+let isCVLoad = 1;
+let isNonTemporal = 1;
+let mayLoad = 1;
+let DecoderNamespace = "EXT_mmvec";
+let isCodeGenOnly = 1;
+let Constraints = "$Rx32 = $Rx32in";
+}
+def V6_vL32b_nt_cur_pi : HInst<
+(outs VectorRegs:$Vd32, IntRegs:$Rx32),
+(ins IntRegs:$Rx32in, s3_0Imm:$Ii),
+"$Vd32.cur = vmem($Rx32++#$Ii):nt",
+CVI_VM_LD, TypeCVI_VM_LD>, Enc_10039393, Requires<[HasV60T,UseHVX]> {
+let Inst{7-5} = 0b001;
+let Inst{13-11} = 0b000;
+let Inst{31-21} = 0b00101001010;
+let hasNewValue = 1;
+let opNewValue = 0;
+let addrMode = PostInc;
+let accessSize = Vector64Access;
+let isCVLoad = 1;
+let isNonTemporal = 1;
+let mayLoad = 1;
+let DecoderNamespace = "EXT_mmvec";
+let Constraints = "$Rx32 = $Rx32in";
+}
+def V6_vL32b_nt_cur_pi_128B : HInst<
+(outs VectorRegs128B:$Vd32, IntRegs:$Rx32),
+(ins IntRegs:$Rx32in, s3_0Imm:$Ii),
+"$Vd32.cur = vmem($Rx32++#$Ii):nt",
+CVI_VM_LD, TypeCVI_VM_LD>, Enc_11039423, Requires<[HasV60T,UseHVX]> {
+let Inst{7-5} = 0b001;
+let Inst{13-11} = 0b000;
+let Inst{31-21} = 0b00101001010;
+let hasNewValue = 1;
+let opNewValue = 0;
+let addrMode = PostInc;
+let accessSize = Vector128Access;
+let isCVLoad = 1;
+let isNonTemporal = 1;
+let mayLoad = 1;
+let DecoderNamespace = "EXT_mmvec";
+let isCodeGenOnly = 1;
+let Constraints = "$Rx32 = $Rx32in";
+}
+def V6_vL32b_nt_cur_ppu : HInst<
+(outs VectorRegs:$Vd32, IntRegs:$Rx32),
+(ins IntRegs:$Rx32in, ModRegs:$Mu2),
+"$Vd32.cur = vmem($Rx32++$Mu2):nt",
+CVI_VM_CUR_LD, TypeCVI_VM_CUR_LD>, Enc_15949334, Requires<[HasV60T,UseHVX]> {
+let Inst{12-5} = 0b00000001;
+let Inst{31-21} = 0b00101011010;
+let hasNewValue = 1;
+let opNewValue = 0;
+let addrMode = PostInc;
+let accessSize = Vector64Access;
+let isCVLoad = 1;
+let isNonTemporal = 1;
+let mayLoad = 1;
+let DecoderNamespace = "EXT_mmvec";
+let Constraints = "$Rx32 = $Rx32in";
+}
+def V6_vL32b_nt_cur_ppu_128B : HInst<
+(outs VectorRegs128B:$Vd32, IntRegs:$Rx32),
+(ins IntRegs:$Rx32in, ModRegs:$Mu2),
+"$Vd32.cur = vmem($Rx32++$Mu2):nt",
+CVI_VM_CUR_LD, TypeCVI_VM_CUR_LD>, Enc_15949334, Requires<[HasV60T,UseHVX]> {
+let Inst{12-5} = 0b00000001;
+let Inst{31-21} = 0b00101011010;
+let hasNewValue = 1;
+let opNewValue = 0;
+let addrMode = PostInc;
+let accessSize = Vector128Access;
+let isCVLoad = 1;
+let isNonTemporal = 1;
+let mayLoad = 1;
+let DecoderNamespace = "EXT_mmvec";
+let isCodeGenOnly = 1;
+let Constraints = "$Rx32 = $Rx32in";
+}
+def V6_vL32b_nt_cur_pred_ai : HInst<
+(outs VectorRegs:$Vd32),
+(ins PredRegs:$Pv4, IntRegs:$Rt32, s4_0Imm:$Ii),
+"if ($Pv4) $Vd32.cur = vmem($Rt32+#$Ii):nt",
+CVI_VM_CUR_LD, TypeCVI_VM_CUR_LD>, Enc_13338314, Requires<[HasV62T,UseHVX]> {
+let Inst{7-5} = 0b100;
+let Inst{31-21} = 0b00101000110;
+let isPredicated = 1;
+let hasNewValue = 1;
+let opNewValue = 0;
+let addrMode = BaseImmOffset;
+let accessSize = Vector64Access;
+let isCVLoad = 1;
+let isNonTemporal = 1;
+let mayLoad = 1;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vL32b_nt_cur_pred_ai_128B : HInst<
+(outs VectorRegs128B:$Vd32),
+(ins PredRegs:$Pv4, IntRegs:$Rt32, s4_0Imm:$Ii),
+"if ($Pv4) $Vd32.cur = vmem($Rt32+#$Ii):nt",
+CVI_VM_CUR_LD, TypeCVI_VM_CUR_LD>, Enc_738356, Requires<[HasV62T,UseHVX]> {
+let Inst{7-5} = 0b100;
+let Inst{31-21} = 0b00101000110;
+let isPredicated = 1;
+let hasNewValue = 1;
+let opNewValue = 0;
+let addrMode = BaseImmOffset;
+let accessSize = Vector128Access;
+let isCVLoad = 1;
+let isNonTemporal = 1;
+let mayLoad = 1;
+let DecoderNamespace = "EXT_mmvec";
+let isCodeGenOnly = 1;
+}
+def V6_vL32b_nt_cur_pred_pi : HInst<
+(outs VectorRegs:$Vd32, IntRegs:$Rx32),
+(ins PredRegs:$Pv4, IntRegs:$Rx32in, s3_0Imm:$Ii),
+"if ($Pv4) $Vd32.cur = vmem($Rx32++#$Ii):nt",
+CVI_VM_CUR_LD, TypeCVI_VM_CUR_LD>, Enc_14560494, Requires<[HasV62T,UseHVX]> {
+let Inst{7-5} = 0b100;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b00101001110;
+let isPredicated = 1;
+let hasNewValue = 1;
+let opNewValue = 0;
+let addrMode = PostInc;
+let accessSize = Vector64Access;
+let isCVLoad = 1;
+let isNonTemporal = 1;
+let mayLoad = 1;
+let DecoderNamespace = "EXT_mmvec";
+let Constraints = "$Rx32 = $Rx32in";
+}
+def V6_vL32b_nt_cur_pred_pi_128B : HInst<
+(outs VectorRegs128B:$Vd32, IntRegs:$Rx32),
+(ins PredRegs:$Pv4, IntRegs:$Rx32in, s3_0Imm:$Ii),
+"if ($Pv4) $Vd32.cur = vmem($Rx32++#$Ii):nt",
+CVI_VM_CUR_LD, TypeCVI_VM_CUR_LD>, Enc_15560488, Requires<[HasV62T,UseHVX]> {
+let Inst{7-5} = 0b100;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b00101001110;
+let isPredicated = 1;
+let hasNewValue = 1;
+let opNewValue = 0;
+let addrMode = PostInc;
+let accessSize = Vector128Access;
+let isCVLoad = 1;
+let isNonTemporal = 1;
+let mayLoad = 1;
+let DecoderNamespace = "EXT_mmvec";
+let isCodeGenOnly = 1;
+let Constraints = "$Rx32 = $Rx32in";
+}
+def V6_vL32b_nt_cur_pred_ppu : HInst<
+(outs VectorRegs:$Vd32, IntRegs:$Rx32),
+(ins PredRegs:$Pv4, IntRegs:$Rx32in, ModRegs:$Mu2),
+"if ($Pv4) $Vd32.cur = vmem($Rx32++$Mu2):nt",
+CVI_VM_CUR_LD, TypeCVI_VM_CUR_LD>, Enc_3158657, Requires<[HasV62T,UseHVX]> {
+let Inst{10-5} = 0b000100;
+let Inst{31-21} = 0b00101011110;
+let isPredicated = 1;
+let hasNewValue = 1;
+let opNewValue = 0;
+let addrMode = PostInc;
+let accessSize = Vector64Access;
+let isCVLoad = 1;
+let isNonTemporal = 1;
+let mayLoad = 1;
+let DecoderNamespace = "EXT_mmvec";
+let Constraints = "$Rx32 = $Rx32in";
+}
+def V6_vL32b_nt_cur_pred_ppu_128B : HInst<
+(outs VectorRegs128B:$Vd32, IntRegs:$Rx32),
+(ins PredRegs:$Pv4, IntRegs:$Rx32in, ModRegs:$Mu2),
+"if ($Pv4) $Vd32.cur = vmem($Rx32++$Mu2):nt",
+CVI_VM_CUR_LD, TypeCVI_VM_CUR_LD>, Enc_3158657, Requires<[HasV62T,UseHVX]> {
+let Inst{10-5} = 0b000100;
+let Inst{31-21} = 0b00101011110;
+let isPredicated = 1;
+let hasNewValue = 1;
+let opNewValue = 0;
+let addrMode = PostInc;
+let accessSize = Vector128Access;
+let isCVLoad = 1;
+let isNonTemporal = 1;
+let mayLoad = 1;
+let DecoderNamespace = "EXT_mmvec";
+let isCodeGenOnly = 1;
+let Constraints = "$Rx32 = $Rx32in";
+}
+def V6_vL32b_nt_npred_ai : HInst<
+(outs VectorRegs:$Vd32),
+(ins PredRegs:$Pv4, IntRegs:$Rt32, s4_0Imm:$Ii),
+"if (!$Pv4) $Vd32 = vmem($Rt32+#$Ii):nt",
+CVI_VM_LD, TypeCVI_VM_LD>, Enc_13338314, Requires<[HasV62T,UseHVX]> {
+let Inst{7-5} = 0b011;
+let Inst{31-21} = 0b00101000110;
+let isPredicated = 1;
+let isPredicatedFalse = 1;
+let hasNewValue = 1;
+let opNewValue = 0;
+let addrMode = BaseImmOffset;
+let accessSize = Vector64Access;
+let isCVLoad = 1;
+let isNonTemporal = 1;
+let mayLoad = 1;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vL32b_nt_npred_ai_128B : HInst<
+(outs VectorRegs128B:$Vd32),
+(ins PredRegs:$Pv4, IntRegs:$Rt32, s4_0Imm:$Ii),
+"if (!$Pv4) $Vd32 = vmem($Rt32+#$Ii):nt",
+CVI_VM_LD, TypeCVI_VM_LD>, Enc_738356, Requires<[HasV62T,UseHVX]> {
+let Inst{7-5} = 0b011;
+let Inst{31-21} = 0b00101000110;
+let isPredicated = 1;
+let isPredicatedFalse = 1;
+let hasNewValue = 1;
+let opNewValue = 0;
+let addrMode = BaseImmOffset;
+let accessSize = Vector128Access;
+let isCVLoad = 1;
+let isNonTemporal = 1;
+let mayLoad = 1;
+let DecoderNamespace = "EXT_mmvec";
+let isCodeGenOnly = 1;
+}
+def V6_vL32b_nt_npred_pi : HInst<
+(outs VectorRegs:$Vd32, IntRegs:$Rx32),
+(ins PredRegs:$Pv4, IntRegs:$Rx32in, s3_0Imm:$Ii),
+"if (!$Pv4) $Vd32 = vmem($Rx32++#$Ii):nt",
+CVI_VM_LD, TypeCVI_VM_LD>, Enc_14560494, Requires<[HasV62T,UseHVX]> {
+let Inst{7-5} = 0b011;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b00101001110;
+let isPredicated = 1;
+let isPredicatedFalse = 1;
+let hasNewValue = 1;
+let opNewValue = 0;
+let addrMode = PostInc;
+let accessSize = Vector64Access;
+let isCVLoad = 1;
+let isNonTemporal = 1;
+let mayLoad = 1;
+let DecoderNamespace = "EXT_mmvec";
+let Constraints = "$Rx32 = $Rx32in";
+}
+def V6_vL32b_nt_npred_pi_128B : HInst<
+(outs VectorRegs128B:$Vd32, IntRegs:$Rx32),
+(ins PredRegs:$Pv4, IntRegs:$Rx32in, s3_0Imm:$Ii),
+"if (!$Pv4) $Vd32 = vmem($Rx32++#$Ii):nt",
+CVI_VM_LD, TypeCVI_VM_LD>, Enc_15560488, Requires<[HasV62T,UseHVX]> {
+let Inst{7-5} = 0b011;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b00101001110;
+let isPredicated = 1;
+let isPredicatedFalse = 1;
+let hasNewValue = 1;
+let opNewValue = 0;
+let addrMode = PostInc;
+let accessSize = Vector128Access;
+let isCVLoad = 1;
+let isNonTemporal = 1;
+let mayLoad = 1;
+let DecoderNamespace = "EXT_mmvec";
+let isCodeGenOnly = 1;
+let Constraints = "$Rx32 = $Rx32in";
+}
+def V6_vL32b_nt_npred_ppu : HInst<
+(outs VectorRegs:$Vd32, IntRegs:$Rx32),
+(ins PredRegs:$Pv4, IntRegs:$Rx32in, ModRegs:$Mu2),
+"if (!$Pv4) $Vd32 = vmem($Rx32++$Mu2):nt",
+CVI_VM_LD, TypeCVI_VM_LD>, Enc_3158657, Requires<[HasV62T,UseHVX]> {
+let Inst{10-5} = 0b000011;
+let Inst{31-21} = 0b00101011110;
+let isPredicated = 1;
+let isPredicatedFalse = 1;
+let hasNewValue = 1;
+let opNewValue = 0;
+let addrMode = PostInc;
+let accessSize = Vector64Access;
+let isCVLoad = 1;
+let isNonTemporal = 1;
+let mayLoad = 1;
+let DecoderNamespace = "EXT_mmvec";
+let Constraints = "$Rx32 = $Rx32in";
+}
+def V6_vL32b_nt_npred_ppu_128B : HInst<
+(outs VectorRegs128B:$Vd32, IntRegs:$Rx32),
+(ins PredRegs:$Pv4, IntRegs:$Rx32in, ModRegs:$Mu2),
+"if (!$Pv4) $Vd32 = vmem($Rx32++$Mu2):nt",
+CVI_VM_LD, TypeCVI_VM_LD>, Enc_3158657, Requires<[HasV62T,UseHVX]> {
+let Inst{10-5} = 0b000011;
+let Inst{31-21} = 0b00101011110;
+let isPredicated = 1;
+let isPredicatedFalse = 1;
+let hasNewValue = 1;
+let opNewValue = 0;
+let addrMode = PostInc;
+let accessSize = Vector128Access;
+let isCVLoad = 1;
+let isNonTemporal = 1;
+let mayLoad = 1;
+let DecoderNamespace = "EXT_mmvec";
+let isCodeGenOnly = 1;
+let Constraints = "$Rx32 = $Rx32in";
+}
+def V6_vL32b_nt_pi : HInst<
+(outs VectorRegs:$Vd32, IntRegs:$Rx32),
+(ins IntRegs:$Rx32in, s3_0Imm:$Ii),
+"$Vd32 = vmem($Rx32++#$Ii):nt",
+CVI_VM_LD, TypeCVI_VM_LD>, Enc_10039393, Requires<[HasV60T,UseHVX]> {
+let Inst{7-5} = 0b000;
+let Inst{13-11} = 0b000;
+let Inst{31-21} = 0b00101001010;
+let hasNewValue = 1;
+let opNewValue = 0;
+let addrMode = PostInc;
+let accessSize = Vector64Access;
+let isCVLoad = 1;
+let isNonTemporal = 1;
+let mayLoad = 1;
+let isCVLoadable = 1;
+let DecoderNamespace = "EXT_mmvec";
+let Constraints = "$Rx32 = $Rx32in";
+}
+def V6_vL32b_nt_pi_128B : HInst<
+(outs VectorRegs128B:$Vd32, IntRegs:$Rx32),
+(ins IntRegs:$Rx32in, s3_0Imm:$Ii),
+"$Vd32 = vmem($Rx32++#$Ii):nt",
+CVI_VM_LD, TypeCVI_VM_LD>, Enc_11039423, Requires<[HasV60T,UseHVX]> {
+let Inst{7-5} = 0b000;
+let Inst{13-11} = 0b000;
+let Inst{31-21} = 0b00101001010;
+let hasNewValue = 1;
+let opNewValue = 0;
+let addrMode = PostInc;
+let accessSize = Vector128Access;
+let isCVLoad = 1;
+let isNonTemporal = 1;
+let mayLoad = 1;
+let isCVLoadable = 1;
+let DecoderNamespace = "EXT_mmvec";
+let isCodeGenOnly = 1;
+let Constraints = "$Rx32 = $Rx32in";
+}
+def V6_vL32b_nt_ppu : HInst<
+(outs VectorRegs:$Vd32, IntRegs:$Rx32),
+(ins IntRegs:$Rx32in, ModRegs:$Mu2),
+"$Vd32 = vmem($Rx32++$Mu2):nt",
+CVI_VM_LD, TypeCVI_VM_LD>, Enc_15949334, Requires<[HasV60T,UseHVX]> {
+let Inst{12-5} = 0b00000000;
+let Inst{31-21} = 0b00101011010;
+let hasNewValue = 1;
+let opNewValue = 0;
+let addrMode = PostInc;
+let accessSize = Vector64Access;
+let isCVLoad = 1;
+let isNonTemporal = 1;
+let mayLoad = 1;
+let isCVLoadable = 1;
+let DecoderNamespace = "EXT_mmvec";
+let Constraints = "$Rx32 = $Rx32in";
+}
+def V6_vL32b_nt_ppu_128B : HInst<
+(outs VectorRegs128B:$Vd32, IntRegs:$Rx32),
+(ins IntRegs:$Rx32in, ModRegs:$Mu2),
+"$Vd32 = vmem($Rx32++$Mu2):nt",
+CVI_VM_LD, TypeCVI_VM_LD>, Enc_15949334, Requires<[HasV60T,UseHVX]> {
+let Inst{12-5} = 0b00000000;
+let Inst{31-21} = 0b00101011010;
+let hasNewValue = 1;
+let opNewValue = 0;
+let addrMode = PostInc;
+let accessSize = Vector128Access;
+let isCVLoad = 1;
+let isNonTemporal = 1;
+let mayLoad = 1;
+let isCVLoadable = 1;
+let DecoderNamespace = "EXT_mmvec";
+let isCodeGenOnly = 1;
+let Constraints = "$Rx32 = $Rx32in";
+}
+def V6_vL32b_nt_pred_ai : HInst<
+(outs VectorRegs:$Vd32),
+(ins PredRegs:$Pv4, IntRegs:$Rt32, s4_0Imm:$Ii),
+"if ($Pv4) $Vd32 = vmem($Rt32+#$Ii):nt",
+CVI_VM_LD, TypeCVI_VM_LD>, Enc_13338314, Requires<[HasV62T,UseHVX]> {
+let Inst{7-5} = 0b010;
+let Inst{31-21} = 0b00101000110;
+let isPredicated = 1;
+let hasNewValue = 1;
+let opNewValue = 0;
+let addrMode = BaseImmOffset;
+let accessSize = Vector64Access;
+let isCVLoad = 1;
+let isNonTemporal = 1;
+let mayLoad = 1;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vL32b_nt_pred_ai_128B : HInst<
+(outs VectorRegs128B:$Vd32),
+(ins PredRegs:$Pv4, IntRegs:$Rt32, s4_0Imm:$Ii),
+"if ($Pv4) $Vd32 = vmem($Rt32+#$Ii):nt",
+CVI_VM_LD, TypeCVI_VM_LD>, Enc_738356, Requires<[HasV62T,UseHVX]> {
+let Inst{7-5} = 0b010;
+let Inst{31-21} = 0b00101000110;
+let isPredicated = 1;
+let hasNewValue = 1;
+let opNewValue = 0;
+let addrMode = BaseImmOffset;
+let accessSize = Vector128Access;
+let isCVLoad = 1;
+let isNonTemporal = 1;
+let mayLoad = 1;
+let DecoderNamespace = "EXT_mmvec";
+let isCodeGenOnly = 1;
+}
+def V6_vL32b_nt_pred_pi : HInst<
+(outs VectorRegs:$Vd32, IntRegs:$Rx32),
+(ins PredRegs:$Pv4, IntRegs:$Rx32in, s3_0Imm:$Ii),
+"if ($Pv4) $Vd32 = vmem($Rx32++#$Ii):nt",
+CVI_VM_LD, TypeCVI_VM_LD>, Enc_14560494, Requires<[HasV62T,UseHVX]> {
+let Inst{7-5} = 0b010;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b00101001110;
+let isPredicated = 1;
+let hasNewValue = 1;
+let opNewValue = 0;
+let addrMode = PostInc;
+let accessSize = Vector64Access;
+let isCVLoad = 1;
+let isNonTemporal = 1;
+let mayLoad = 1;
+let DecoderNamespace = "EXT_mmvec";
+let Constraints = "$Rx32 = $Rx32in";
+}
+def V6_vL32b_nt_pred_pi_128B : HInst<
+(outs VectorRegs128B:$Vd32, IntRegs:$Rx32),
+(ins PredRegs:$Pv4, IntRegs:$Rx32in, s3_0Imm:$Ii),
+"if ($Pv4) $Vd32 = vmem($Rx32++#$Ii):nt",
+CVI_VM_LD, TypeCVI_VM_LD>, Enc_15560488, Requires<[HasV62T,UseHVX]> {
+let Inst{7-5} = 0b010;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b00101001110;
+let isPredicated = 1;
+let hasNewValue = 1;
+let opNewValue = 0;
+let addrMode = PostInc;
+let accessSize = Vector128Access;
+let isCVLoad = 1;
+let isNonTemporal = 1;
+let mayLoad = 1;
+let DecoderNamespace = "EXT_mmvec";
+let isCodeGenOnly = 1;
+let Constraints = "$Rx32 = $Rx32in";
+}
+def V6_vL32b_nt_pred_ppu : HInst<
+(outs VectorRegs:$Vd32, IntRegs:$Rx32),
+(ins PredRegs:$Pv4, IntRegs:$Rx32in, ModRegs:$Mu2),
+"if ($Pv4) $Vd32 = vmem($Rx32++$Mu2):nt",
+CVI_VM_LD, TypeCVI_VM_LD>, Enc_3158657, Requires<[HasV62T,UseHVX]> {
+let Inst{10-5} = 0b000010;
+let Inst{31-21} = 0b00101011110;
+let isPredicated = 1;
+let hasNewValue = 1;
+let opNewValue = 0;
+let addrMode = PostInc;
+let accessSize = Vector64Access;
+let isCVLoad = 1;
+let isNonTemporal = 1;
+let mayLoad = 1;
+let DecoderNamespace = "EXT_mmvec";
+let Constraints = "$Rx32 = $Rx32in";
+}
+def V6_vL32b_nt_pred_ppu_128B : HInst<
+(outs VectorRegs128B:$Vd32, IntRegs:$Rx32),
+(ins PredRegs:$Pv4, IntRegs:$Rx32in, ModRegs:$Mu2),
+"if ($Pv4) $Vd32 = vmem($Rx32++$Mu2):nt",
+CVI_VM_LD, TypeCVI_VM_LD>, Enc_3158657, Requires<[HasV62T,UseHVX]> {
+let Inst{10-5} = 0b000010;
+let Inst{31-21} = 0b00101011110;
+let isPredicated = 1;
+let hasNewValue = 1;
+let opNewValue = 0;
+let addrMode = PostInc;
+let accessSize = Vector128Access;
+let isCVLoad = 1;
+let isNonTemporal = 1;
+let mayLoad = 1;
+let DecoderNamespace = "EXT_mmvec";
+let isCodeGenOnly = 1;
+let Constraints = "$Rx32 = $Rx32in";
+}
+def V6_vL32b_nt_tmp_ai : HInst<
+(outs VectorRegs:$Vd32),
+(ins IntRegs:$Rt32, s4_0Imm:$Ii),
+"$Vd32.tmp = vmem($Rt32+#$Ii):nt",
+CVI_VM_TMP_LD, TypeCVI_VM_TMP_LD>, Enc_1244745, Requires<[HasV60T,UseHVX]> {
+let Inst{7-5} = 0b010;
+let Inst{12-11} = 0b00;
+let Inst{31-21} = 0b00101000010;
+let hasNewValue = 1;
+let opNewValue = 0;
+let addrMode = BaseImmOffset;
+let accessSize = Vector64Access;
+let isCVLoad = 1;
+let isNonTemporal = 1;
+let mayLoad = 1;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vL32b_nt_tmp_ai_128B : HInst<
+(outs VectorRegs128B:$Vd32),
+(ins IntRegs:$Rt32, s4_0Imm:$Ii),
+"$Vd32.tmp = vmem($Rt32+#$Ii):nt",
+CVI_VM_TMP_LD, TypeCVI_VM_TMP_LD>, Enc_8437395, Requires<[HasV60T,UseHVX]> {
+let Inst{7-5} = 0b010;
+let Inst{12-11} = 0b00;
+let Inst{31-21} = 0b00101000010;
+let hasNewValue = 1;
+let opNewValue = 0;
+let addrMode = BaseImmOffset;
+let accessSize = Vector128Access;
+let isCVLoad = 1;
+let isNonTemporal = 1;
+let mayLoad = 1;
+let DecoderNamespace = "EXT_mmvec";
+let isCodeGenOnly = 1;
+}
+def V6_vL32b_nt_tmp_npred_ai : HInst<
+(outs VectorRegs:$Vd32),
+(ins PredRegs:$Pv4, IntRegs:$Rt32, s4_0Imm:$Ii),
+"if (!$Pv4) $Vd32.tmp = vmem($Rt32+#$Ii):nt",
+CVI_VM_TMP_LD, TypeCVI_VM_TMP_LD>, Enc_13338314, Requires<[HasV62T,UseHVX]> {
+let Inst{7-5} = 0b111;
+let Inst{31-21} = 0b00101000110;
+let isPredicated = 1;
+let isPredicatedFalse = 1;
+let hasNewValue = 1;
+let opNewValue = 0;
+let addrMode = BaseImmOffset;
+let accessSize = Vector64Access;
+let isCVLoad = 1;
+let isNonTemporal = 1;
+let mayLoad = 1;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vL32b_nt_tmp_npred_ai_128B : HInst<
+(outs VectorRegs128B:$Vd32),
+(ins PredRegs:$Pv4, IntRegs:$Rt32, s4_0Imm:$Ii),
+"if (!$Pv4) $Vd32.tmp = vmem($Rt32+#$Ii):nt",
+CVI_VM_TMP_LD, TypeCVI_VM_TMP_LD>, Enc_738356, Requires<[HasV62T,UseHVX]> {
+let Inst{7-5} = 0b111;
+let Inst{31-21} = 0b00101000110;
+let isPredicated = 1;
+let isPredicatedFalse = 1;
+let hasNewValue = 1;
+let opNewValue = 0;
+let addrMode = BaseImmOffset;
+let accessSize = Vector128Access;
+let isCVLoad = 1;
+let isNonTemporal = 1;
+let mayLoad = 1;
+let DecoderNamespace = "EXT_mmvec";
+let isCodeGenOnly = 1;
+}
+def V6_vL32b_nt_tmp_npred_pi : HInst<
+(outs VectorRegs:$Vd32, IntRegs:$Rx32),
+(ins PredRegs:$Pv4, IntRegs:$Rx32in, s3_0Imm:$Ii),
+"if (!$Pv4) $Vd32.tmp = vmem($Rx32++#$Ii):nt",
+CVI_VM_TMP_LD, TypeCVI_VM_TMP_LD>, Enc_14560494, Requires<[HasV62T,UseHVX]> {
+let Inst{7-5} = 0b111;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b00101001110;
+let isPredicated = 1;
+let isPredicatedFalse = 1;
+let hasNewValue = 1;
+let opNewValue = 0;
+let addrMode = PostInc;
+let accessSize = Vector64Access;
+let isCVLoad = 1;
+let isNonTemporal = 1;
+let mayLoad = 1;
+let DecoderNamespace = "EXT_mmvec";
+let Constraints = "$Rx32 = $Rx32in";
+}
+def V6_vL32b_nt_tmp_npred_pi_128B : HInst<
+(outs VectorRegs128B:$Vd32, IntRegs:$Rx32),
+(ins PredRegs:$Pv4, IntRegs:$Rx32in, s3_0Imm:$Ii),
+"if (!$Pv4) $Vd32.tmp = vmem($Rx32++#$Ii):nt",
+CVI_VM_TMP_LD, TypeCVI_VM_TMP_LD>, Enc_15560488, Requires<[HasV62T,UseHVX]> {
+let Inst{7-5} = 0b111;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b00101001110;
+let isPredicated = 1;
+let isPredicatedFalse = 1;
+let hasNewValue = 1;
+let opNewValue = 0;
+let addrMode = PostInc;
+let accessSize = Vector128Access;
+let isCVLoad = 1;
+let isNonTemporal = 1;
+let mayLoad = 1;
+let DecoderNamespace = "EXT_mmvec";
+let isCodeGenOnly = 1;
+let Constraints = "$Rx32 = $Rx32in";
+}
+def V6_vL32b_nt_tmp_npred_ppu : HInst<
+(outs VectorRegs:$Vd32, IntRegs:$Rx32),
+(ins PredRegs:$Pv4, IntRegs:$Rx32in, ModRegs:$Mu2),
+"if (!$Pv4) $Vd32.tmp = vmem($Rx32++$Mu2):nt",
+CVI_VM_TMP_LD, TypeCVI_VM_TMP_LD>, Enc_3158657, Requires<[HasV62T,UseHVX]> {
+let Inst{10-5} = 0b000111;
+let Inst{31-21} = 0b00101011110;
+let isPredicated = 1;
+let isPredicatedFalse = 1;
+let hasNewValue = 1;
+let opNewValue = 0;
+let addrMode = PostInc;
+let accessSize = Vector64Access;
+let isCVLoad = 1;
+let isNonTemporal = 1;
+let mayLoad = 1;
+let DecoderNamespace = "EXT_mmvec";
+let Constraints = "$Rx32 = $Rx32in";
+}
+def V6_vL32b_nt_tmp_npred_ppu_128B : HInst<
+(outs VectorRegs128B:$Vd32, IntRegs:$Rx32),
+(ins PredRegs:$Pv4, IntRegs:$Rx32in, ModRegs:$Mu2),
+"if (!$Pv4) $Vd32.tmp = vmem($Rx32++$Mu2):nt",
+CVI_VM_TMP_LD, TypeCVI_VM_TMP_LD>, Enc_3158657, Requires<[HasV62T,UseHVX]> {
+let Inst{10-5} = 0b000111;
+let Inst{31-21} = 0b00101011110;
+let isPredicated = 1;
+let isPredicatedFalse = 1;
+let hasNewValue = 1;
+let opNewValue = 0;
+let addrMode = PostInc;
+let accessSize = Vector128Access;
+let isCVLoad = 1;
+let isNonTemporal = 1;
+let mayLoad = 1;
+let DecoderNamespace = "EXT_mmvec";
+let isCodeGenOnly = 1;
+let Constraints = "$Rx32 = $Rx32in";
+}
+def V6_vL32b_nt_tmp_pi : HInst<
+(outs VectorRegs:$Vd32, IntRegs:$Rx32),
+(ins IntRegs:$Rx32in, s3_0Imm:$Ii),
+"$Vd32.tmp = vmem($Rx32++#$Ii):nt",
+CVI_VM_TMP_LD, TypeCVI_VM_TMP_LD>, Enc_10039393, Requires<[HasV60T,UseHVX]> {
+let Inst{7-5} = 0b010;
+let Inst{13-11} = 0b000;
+let Inst{31-21} = 0b00101001010;
+let hasNewValue = 1;
+let opNewValue = 0;
+let addrMode = PostInc;
+let accessSize = Vector64Access;
+let isCVLoad = 1;
+let isNonTemporal = 1;
+let mayLoad = 1;
+let DecoderNamespace = "EXT_mmvec";
+let Constraints = "$Rx32 = $Rx32in";
+}
+def V6_vL32b_nt_tmp_pi_128B : HInst<
+(outs VectorRegs128B:$Vd32, IntRegs:$Rx32),
+(ins IntRegs:$Rx32in, s3_0Imm:$Ii),
+"$Vd32.tmp = vmem($Rx32++#$Ii):nt",
+CVI_VM_TMP_LD, TypeCVI_VM_TMP_LD>, Enc_11039423, Requires<[HasV60T,UseHVX]> {
+let Inst{7-5} = 0b010;
+let Inst{13-11} = 0b000;
+let Inst{31-21} = 0b00101001010;
+let hasNewValue = 1;
+let opNewValue = 0;
+let addrMode = PostInc;
+let accessSize = Vector128Access;
+let isCVLoad = 1;
+let isNonTemporal = 1;
+let mayLoad = 1;
+let DecoderNamespace = "EXT_mmvec";
+let isCodeGenOnly = 1;
+let Constraints = "$Rx32 = $Rx32in";
+}
+def V6_vL32b_nt_tmp_ppu : HInst<
+(outs VectorRegs:$Vd32, IntRegs:$Rx32),
+(ins IntRegs:$Rx32in, ModRegs:$Mu2),
+"$Vd32.tmp = vmem($Rx32++$Mu2):nt",
+CVI_VM_TMP_LD, TypeCVI_VM_TMP_LD>, Enc_15949334, Requires<[HasV60T,UseHVX]> {
+let Inst{12-5} = 0b00000010;
+let Inst{31-21} = 0b00101011010;
+let hasNewValue = 1;
+let opNewValue = 0;
+let addrMode = PostInc;
+let accessSize = Vector64Access;
+let isCVLoad = 1;
+let isNonTemporal = 1;
+let mayLoad = 1;
+let DecoderNamespace = "EXT_mmvec";
+let Constraints = "$Rx32 = $Rx32in";
+}
+def V6_vL32b_nt_tmp_ppu_128B : HInst<
+(outs VectorRegs128B:$Vd32, IntRegs:$Rx32),
+(ins IntRegs:$Rx32in, ModRegs:$Mu2),
+"$Vd32.tmp = vmem($Rx32++$Mu2):nt",
+CVI_VM_TMP_LD, TypeCVI_VM_TMP_LD>, Enc_15949334, Requires<[HasV60T,UseHVX]> {
+let Inst{12-5} = 0b00000010;
+let Inst{31-21} = 0b00101011010;
+let hasNewValue = 1;
+let opNewValue = 0;
+let addrMode = PostInc;
+let accessSize = Vector128Access;
+let isCVLoad = 1;
+let isNonTemporal = 1;
+let mayLoad = 1;
+let DecoderNamespace = "EXT_mmvec";
+let isCodeGenOnly = 1;
+let Constraints = "$Rx32 = $Rx32in";
+}
+def V6_vL32b_nt_tmp_pred_ai : HInst<
+(outs VectorRegs:$Vd32),
+(ins PredRegs:$Pv4, IntRegs:$Rt32, s4_0Imm:$Ii),
+"if ($Pv4) $Vd32.tmp = vmem($Rt32+#$Ii):nt",
+CVI_VM_TMP_LD, TypeCVI_VM_TMP_LD>, Enc_13338314, Requires<[HasV62T,UseHVX]> {
+let Inst{7-5} = 0b110;
+let Inst{31-21} = 0b00101000110;
+let isPredicated = 1;
+let hasNewValue = 1;
+let opNewValue = 0;
+let addrMode = BaseImmOffset;
+let accessSize = Vector64Access;
+let isCVLoad = 1;
+let isNonTemporal = 1;
+let mayLoad = 1;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vL32b_nt_tmp_pred_ai_128B : HInst<
+(outs VectorRegs128B:$Vd32),
+(ins PredRegs:$Pv4, IntRegs:$Rt32, s4_0Imm:$Ii),
+"if ($Pv4) $Vd32.tmp = vmem($Rt32+#$Ii):nt",
+CVI_VM_TMP_LD, TypeCVI_VM_TMP_LD>, Enc_738356, Requires<[HasV62T,UseHVX]> {
+let Inst{7-5} = 0b110;
+let Inst{31-21} = 0b00101000110;
+let isPredicated = 1;
+let hasNewValue = 1;
+let opNewValue = 0;
+let addrMode = BaseImmOffset;
+let accessSize = Vector128Access;
+let isCVLoad = 1;
+let isNonTemporal = 1;
+let mayLoad = 1;
+let DecoderNamespace = "EXT_mmvec";
+let isCodeGenOnly = 1;
+}
+def V6_vL32b_nt_tmp_pred_pi : HInst<
+(outs VectorRegs:$Vd32, IntRegs:$Rx32),
+(ins PredRegs:$Pv4, IntRegs:$Rx32in, s3_0Imm:$Ii),
+"if ($Pv4) $Vd32.tmp = vmem($Rx32++#$Ii):nt",
+CVI_VM_TMP_LD, TypeCVI_VM_TMP_LD>, Enc_14560494, Requires<[HasV62T,UseHVX]> {
+let Inst{7-5} = 0b110;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b00101001110;
+let isPredicated = 1;
+let hasNewValue = 1;
+let opNewValue = 0;
+let addrMode = PostInc;
+let accessSize = Vector64Access;
+let isCVLoad = 1;
+let isNonTemporal = 1;
+let mayLoad = 1;
+let DecoderNamespace = "EXT_mmvec";
+let Constraints = "$Rx32 = $Rx32in";
+}
+def V6_vL32b_nt_tmp_pred_pi_128B : HInst<
+(outs VectorRegs128B:$Vd32, IntRegs:$Rx32),
+(ins PredRegs:$Pv4, IntRegs:$Rx32in, s3_0Imm:$Ii),
+"if ($Pv4) $Vd32.tmp = vmem($Rx32++#$Ii):nt",
+CVI_VM_TMP_LD, TypeCVI_VM_TMP_LD>, Enc_15560488, Requires<[HasV62T,UseHVX]> {
+let Inst{7-5} = 0b110;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b00101001110;
+let isPredicated = 1;
+let hasNewValue = 1;
+let opNewValue = 0;
+let addrMode = PostInc;
+let accessSize = Vector128Access;
+let isCVLoad = 1;
+let isNonTemporal = 1;
+let mayLoad = 1;
+let DecoderNamespace = "EXT_mmvec";
+let isCodeGenOnly = 1;
+let Constraints = "$Rx32 = $Rx32in";
+}
+def V6_vL32b_nt_tmp_pred_ppu : HInst<
+(outs VectorRegs:$Vd32, IntRegs:$Rx32),
+(ins PredRegs:$Pv4, IntRegs:$Rx32in, ModRegs:$Mu2),
+"if ($Pv4) $Vd32.tmp = vmem($Rx32++$Mu2):nt",
+CVI_VM_TMP_LD, TypeCVI_VM_TMP_LD>, Enc_3158657, Requires<[HasV62T,UseHVX]> {
+let Inst{10-5} = 0b000110;
+let Inst{31-21} = 0b00101011110;
+let isPredicated = 1;
+let hasNewValue = 1;
+let opNewValue = 0;
+let addrMode = PostInc;
+let accessSize = Vector64Access;
+let isCVLoad = 1;
+let isNonTemporal = 1;
+let mayLoad = 1;
+let DecoderNamespace = "EXT_mmvec";
+let Constraints = "$Rx32 = $Rx32in";
+}
+def V6_vL32b_nt_tmp_pred_ppu_128B : HInst<
+(outs VectorRegs128B:$Vd32, IntRegs:$Rx32),
+(ins PredRegs:$Pv4, IntRegs:$Rx32in, ModRegs:$Mu2),
+"if ($Pv4) $Vd32.tmp = vmem($Rx32++$Mu2):nt",
+CVI_VM_TMP_LD, TypeCVI_VM_TMP_LD>, Enc_3158657, Requires<[HasV62T,UseHVX]> {
+let Inst{10-5} = 0b000110;
+let Inst{31-21} = 0b00101011110;
+let isPredicated = 1;
+let hasNewValue = 1;
+let opNewValue = 0;
+let addrMode = PostInc;
+let accessSize = Vector128Access;
+let isCVLoad = 1;
+let isNonTemporal = 1;
+let mayLoad = 1;
+let DecoderNamespace = "EXT_mmvec";
+let isCodeGenOnly = 1;
+let Constraints = "$Rx32 = $Rx32in";
+}
+def V6_vL32b_pi : HInst<
+(outs VectorRegs:$Vd32, IntRegs:$Rx32),
+(ins IntRegs:$Rx32in, s3_0Imm:$Ii),
+"$Vd32 = vmem($Rx32++#$Ii)",
+CVI_VM_LD, TypeCVI_VM_LD>, Enc_10039393, Requires<[HasV60T,UseHVX]> {
+let Inst{7-5} = 0b000;
+let Inst{13-11} = 0b000;
+let Inst{31-21} = 0b00101001000;
+let hasNewValue = 1;
+let opNewValue = 0;
+let addrMode = PostInc;
+let accessSize = Vector64Access;
+let isCVLoad = 1;
+let mayLoad = 1;
+let isCVLoadable = 1;
+let DecoderNamespace = "EXT_mmvec";
+let Constraints = "$Rx32 = $Rx32in";
+}
+def V6_vL32b_pi_128B : HInst<
+(outs VectorRegs128B:$Vd32, IntRegs:$Rx32),
+(ins IntRegs:$Rx32in, s3_0Imm:$Ii),
+"$Vd32 = vmem($Rx32++#$Ii)",
+CVI_VM_LD, TypeCVI_VM_LD>, Enc_11039423, Requires<[HasV60T,UseHVX]> {
+let Inst{7-5} = 0b000;
+let Inst{13-11} = 0b000;
+let Inst{31-21} = 0b00101001000;
+let hasNewValue = 1;
+let opNewValue = 0;
+let addrMode = PostInc;
+let accessSize = Vector128Access;
+let isCVLoad = 1;
+let mayLoad = 1;
+let isCVLoadable = 1;
+let DecoderNamespace = "EXT_mmvec";
+let isCodeGenOnly = 1;
+let Constraints = "$Rx32 = $Rx32in";
+}
+def V6_vL32b_ppu : HInst<
+(outs VectorRegs:$Vd32, IntRegs:$Rx32),
+(ins IntRegs:$Rx32in, ModRegs:$Mu2),
+"$Vd32 = vmem($Rx32++$Mu2)",
+CVI_VM_LD, TypeCVI_VM_LD>, Enc_15949334, Requires<[HasV60T,UseHVX]> {
+let Inst{12-5} = 0b00000000;
+let Inst{31-21} = 0b00101011000;
+let hasNewValue = 1;
+let opNewValue = 0;
+let addrMode = PostInc;
+let accessSize = Vector64Access;
+let isCVLoad = 1;
+let mayLoad = 1;
+let isCVLoadable = 1;
+let DecoderNamespace = "EXT_mmvec";
+let Constraints = "$Rx32 = $Rx32in";
+}
+def V6_vL32b_ppu_128B : HInst<
+(outs VectorRegs128B:$Vd32, IntRegs:$Rx32),
+(ins IntRegs:$Rx32in, ModRegs:$Mu2),
+"$Vd32 = vmem($Rx32++$Mu2)",
+CVI_VM_LD, TypeCVI_VM_LD>, Enc_15949334, Requires<[HasV60T,UseHVX]> {
+let Inst{12-5} = 0b00000000;
+let Inst{31-21} = 0b00101011000;
+let hasNewValue = 1;
+let opNewValue = 0;
+let addrMode = PostInc;
+let accessSize = Vector128Access;
+let isCVLoad = 1;
+let mayLoad = 1;
+let isCVLoadable = 1;
+let DecoderNamespace = "EXT_mmvec";
+let isCodeGenOnly = 1;
+let Constraints = "$Rx32 = $Rx32in";
+}
+def V6_vL32b_pred_ai : HInst<
+(outs VectorRegs:$Vd32),
+(ins PredRegs:$Pv4, IntRegs:$Rt32, s4_0Imm:$Ii),
+"if ($Pv4) $Vd32 = vmem($Rt32+#$Ii)",
+CVI_VM_LD, TypeCVI_VM_LD>, Enc_13338314, Requires<[HasV62T,UseHVX]> {
+let Inst{7-5} = 0b010;
+let Inst{31-21} = 0b00101000100;
+let isPredicated = 1;
+let hasNewValue = 1;
+let opNewValue = 0;
+let addrMode = BaseImmOffset;
+let accessSize = Vector64Access;
+let isCVLoad = 1;
+let mayLoad = 1;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vL32b_pred_ai_128B : HInst<
+(outs VectorRegs128B:$Vd32),
+(ins PredRegs:$Pv4, IntRegs:$Rt32, s4_0Imm:$Ii),
+"if ($Pv4) $Vd32 = vmem($Rt32+#$Ii)",
+CVI_VM_LD, TypeCVI_VM_LD>, Enc_738356, Requires<[HasV62T,UseHVX]> {
+let Inst{7-5} = 0b010;
+let Inst{31-21} = 0b00101000100;
+let isPredicated = 1;
+let hasNewValue = 1;
+let opNewValue = 0;
+let addrMode = BaseImmOffset;
+let accessSize = Vector128Access;
+let isCVLoad = 1;
+let mayLoad = 1;
+let DecoderNamespace = "EXT_mmvec";
+let isCodeGenOnly = 1;
+}
+def V6_vL32b_pred_pi : HInst<
+(outs VectorRegs:$Vd32, IntRegs:$Rx32),
+(ins PredRegs:$Pv4, IntRegs:$Rx32in, s3_0Imm:$Ii),
+"if ($Pv4) $Vd32 = vmem($Rx32++#$Ii)",
+CVI_VM_LD, TypeCVI_VM_LD>, Enc_14560494, Requires<[HasV62T,UseHVX]> {
+let Inst{7-5} = 0b010;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b00101001100;
+let isPredicated = 1;
+let hasNewValue = 1;
+let opNewValue = 0;
+let addrMode = PostInc;
+let accessSize = Vector64Access;
+let isCVLoad = 1;
+let mayLoad = 1;
+let DecoderNamespace = "EXT_mmvec";
+let Constraints = "$Rx32 = $Rx32in";
+}
+def V6_vL32b_pred_pi_128B : HInst<
+(outs VectorRegs128B:$Vd32, IntRegs:$Rx32),
+(ins PredRegs:$Pv4, IntRegs:$Rx32in, s3_0Imm:$Ii),
+"if ($Pv4) $Vd32 = vmem($Rx32++#$Ii)",
+CVI_VM_LD, TypeCVI_VM_LD>, Enc_15560488, Requires<[HasV62T,UseHVX]> {
+let Inst{7-5} = 0b010;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b00101001100;
+let isPredicated = 1;
+let hasNewValue = 1;
+let opNewValue = 0;
+let addrMode = PostInc;
+let accessSize = Vector128Access;
+let isCVLoad = 1;
+let mayLoad = 1;
+let DecoderNamespace = "EXT_mmvec";
+let isCodeGenOnly = 1;
+let Constraints = "$Rx32 = $Rx32in";
+}
+def V6_vL32b_pred_ppu : HInst<
+(outs VectorRegs:$Vd32, IntRegs:$Rx32),
+(ins PredRegs:$Pv4, IntRegs:$Rx32in, ModRegs:$Mu2),
+"if ($Pv4) $Vd32 = vmem($Rx32++$Mu2)",
+CVI_VM_LD, TypeCVI_VM_LD>, Enc_3158657, Requires<[HasV62T,UseHVX]> {
+let Inst{10-5} = 0b000010;
+let Inst{31-21} = 0b00101011100;
+let isPredicated = 1;
+let hasNewValue = 1;
+let opNewValue = 0;
+let addrMode = PostInc;
+let accessSize = Vector64Access;
+let isCVLoad = 1;
+let mayLoad = 1;
+let DecoderNamespace = "EXT_mmvec";
+let Constraints = "$Rx32 = $Rx32in";
+}
+def V6_vL32b_pred_ppu_128B : HInst<
+(outs VectorRegs128B:$Vd32, IntRegs:$Rx32),
+(ins PredRegs:$Pv4, IntRegs:$Rx32in, ModRegs:$Mu2),
+"if ($Pv4) $Vd32 = vmem($Rx32++$Mu2)",
+CVI_VM_LD, TypeCVI_VM_LD>, Enc_3158657, Requires<[HasV62T,UseHVX]> {
+let Inst{10-5} = 0b000010;
+let Inst{31-21} = 0b00101011100;
+let isPredicated = 1;
+let hasNewValue = 1;
+let opNewValue = 0;
+let addrMode = PostInc;
+let accessSize = Vector128Access;
+let isCVLoad = 1;
+let mayLoad = 1;
+let DecoderNamespace = "EXT_mmvec";
+let isCodeGenOnly = 1;
+let Constraints = "$Rx32 = $Rx32in";
+}
+def V6_vL32b_tmp_ai : HInst<
+(outs VectorRegs:$Vd32),
+(ins IntRegs:$Rt32, s4_0Imm:$Ii),
+"$Vd32.tmp = vmem($Rt32+#$Ii)",
+CVI_VM_TMP_LD, TypeCVI_VM_TMP_LD>, Enc_1244745, Requires<[HasV60T,UseHVX]> {
+let Inst{7-5} = 0b010;
+let Inst{12-11} = 0b00;
+let Inst{31-21} = 0b00101000000;
+let hasNewValue = 1;
+let opNewValue = 0;
+let addrMode = BaseImmOffset;
+let accessSize = Vector64Access;
+let isCVLoad = 1;
+let mayLoad = 1;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vL32b_tmp_ai_128B : HInst<
+(outs VectorRegs128B:$Vd32),
+(ins IntRegs:$Rt32, s4_0Imm:$Ii),
+"$Vd32.tmp = vmem($Rt32+#$Ii)",
+CVI_VM_TMP_LD, TypeCVI_VM_TMP_LD>, Enc_8437395, Requires<[HasV60T,UseHVX]> {
+let Inst{7-5} = 0b010;
+let Inst{12-11} = 0b00;
+let Inst{31-21} = 0b00101000000;
+let hasNewValue = 1;
+let opNewValue = 0;
+let addrMode = BaseImmOffset;
+let accessSize = Vector128Access;
+let isCVLoad = 1;
+let mayLoad = 1;
+let DecoderNamespace = "EXT_mmvec";
+let isCodeGenOnly = 1;
+}
+def V6_vL32b_tmp_npred_ai : HInst<
+(outs VectorRegs:$Vd32),
+(ins PredRegs:$Pv4, IntRegs:$Rt32, s4_0Imm:$Ii),
+"if (!$Pv4) $Vd32.tmp = vmem($Rt32+#$Ii)",
+CVI_VM_TMP_LD, TypeCVI_VM_TMP_LD>, Enc_13338314, Requires<[HasV62T,UseHVX]> {
+let Inst{7-5} = 0b111;
+let Inst{31-21} = 0b00101000100;
+let isPredicated = 1;
+let isPredicatedFalse = 1;
+let hasNewValue = 1;
+let opNewValue = 0;
+let addrMode = BaseImmOffset;
+let accessSize = Vector64Access;
+let isCVLoad = 1;
+let mayLoad = 1;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vL32b_tmp_npred_ai_128B : HInst<
+(outs VectorRegs128B:$Vd32),
+(ins PredRegs:$Pv4, IntRegs:$Rt32, s4_0Imm:$Ii),
+"if (!$Pv4) $Vd32.tmp = vmem($Rt32+#$Ii)",
+CVI_VM_TMP_LD, TypeCVI_VM_TMP_LD>, Enc_738356, Requires<[HasV62T,UseHVX]> {
+let Inst{7-5} = 0b111;
+let Inst{31-21} = 0b00101000100;
+let isPredicated = 1;
+let isPredicatedFalse = 1;
+let hasNewValue = 1;
+let opNewValue = 0;
+let addrMode = BaseImmOffset;
+let accessSize = Vector128Access;
+let isCVLoad = 1;
+let mayLoad = 1;
+let DecoderNamespace = "EXT_mmvec";
+let isCodeGenOnly = 1;
+}
+def V6_vL32b_tmp_npred_pi : HInst<
+(outs VectorRegs:$Vd32, IntRegs:$Rx32),
+(ins PredRegs:$Pv4, IntRegs:$Rx32in, s3_0Imm:$Ii),
+"if (!$Pv4) $Vd32.tmp = vmem($Rx32++#$Ii)",
+CVI_VM_TMP_LD, TypeCVI_VM_TMP_LD>, Enc_14560494, Requires<[HasV62T,UseHVX]> {
+let Inst{7-5} = 0b111;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b00101001100;
+let isPredicated = 1;
+let isPredicatedFalse = 1;
+let hasNewValue = 1;
+let opNewValue = 0;
+let addrMode = PostInc;
+let accessSize = Vector64Access;
+let isCVLoad = 1;
+let mayLoad = 1;
+let DecoderNamespace = "EXT_mmvec";
+let Constraints = "$Rx32 = $Rx32in";
+}
+def V6_vL32b_tmp_npred_pi_128B : HInst<
+(outs VectorRegs128B:$Vd32, IntRegs:$Rx32),
+(ins PredRegs:$Pv4, IntRegs:$Rx32in, s3_0Imm:$Ii),
+"if (!$Pv4) $Vd32.tmp = vmem($Rx32++#$Ii)",
+CVI_VM_TMP_LD, TypeCVI_VM_TMP_LD>, Enc_15560488, Requires<[HasV62T,UseHVX]> {
+let Inst{7-5} = 0b111;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b00101001100;
+let isPredicated = 1;
+let isPredicatedFalse = 1;
+let hasNewValue = 1;
+let opNewValue = 0;
+let addrMode = PostInc;
+let accessSize = Vector128Access;
+let isCVLoad = 1;
+let mayLoad = 1;
+let DecoderNamespace = "EXT_mmvec";
+let isCodeGenOnly = 1;
+let Constraints = "$Rx32 = $Rx32in";
+}
+def V6_vL32b_tmp_npred_ppu : HInst<
+(outs VectorRegs:$Vd32, IntRegs:$Rx32),
+(ins PredRegs:$Pv4, IntRegs:$Rx32in, ModRegs:$Mu2),
+"if (!$Pv4) $Vd32.tmp = vmem($Rx32++$Mu2)",
+CVI_VM_TMP_LD, TypeCVI_VM_TMP_LD>, Enc_3158657, Requires<[HasV62T,UseHVX]> {
+let Inst{10-5} = 0b000111;
+let Inst{31-21} = 0b00101011100;
+let isPredicated = 1;
+let isPredicatedFalse = 1;
+let hasNewValue = 1;
+let opNewValue = 0;
+let addrMode = PostInc;
+let accessSize = Vector64Access;
+let isCVLoad = 1;
+let mayLoad = 1;
+let DecoderNamespace = "EXT_mmvec";
+let Constraints = "$Rx32 = $Rx32in";
+}
+def V6_vL32b_tmp_npred_ppu_128B : HInst<
+(outs VectorRegs128B:$Vd32, IntRegs:$Rx32),
+(ins PredRegs:$Pv4, IntRegs:$Rx32in, ModRegs:$Mu2),
+"if (!$Pv4) $Vd32.tmp = vmem($Rx32++$Mu2)",
+CVI_VM_TMP_LD, TypeCVI_VM_TMP_LD>, Enc_3158657, Requires<[HasV62T,UseHVX]> {
+let Inst{10-5} = 0b000111;
+let Inst{31-21} = 0b00101011100;
+let isPredicated = 1;
+let isPredicatedFalse = 1;
+let hasNewValue = 1;
+let opNewValue = 0;
+let addrMode = PostInc;
+let accessSize = Vector128Access;
+let isCVLoad = 1;
+let mayLoad = 1;
+let DecoderNamespace = "EXT_mmvec";
+let isCodeGenOnly = 1;
+let Constraints = "$Rx32 = $Rx32in";
+}
+def V6_vL32b_tmp_pi : HInst<
+(outs VectorRegs:$Vd32, IntRegs:$Rx32),
+(ins IntRegs:$Rx32in, s3_0Imm:$Ii),
+"$Vd32.tmp = vmem($Rx32++#$Ii)",
+CVI_VM_TMP_LD, TypeCVI_VM_TMP_LD>, Enc_10039393, Requires<[HasV60T,UseHVX]> {
+let Inst{7-5} = 0b010;
+let Inst{13-11} = 0b000;
+let Inst{31-21} = 0b00101001000;
+let hasNewValue = 1;
+let opNewValue = 0;
+let addrMode = PostInc;
+let accessSize = Vector64Access;
+let isCVLoad = 1;
+let mayLoad = 1;
+let DecoderNamespace = "EXT_mmvec";
+let Constraints = "$Rx32 = $Rx32in";
+}
+def V6_vL32b_tmp_pi_128B : HInst<
+(outs VectorRegs128B:$Vd32, IntRegs:$Rx32),
+(ins IntRegs:$Rx32in, s3_0Imm:$Ii),
+"$Vd32.tmp = vmem($Rx32++#$Ii)",
+CVI_VM_TMP_LD, TypeCVI_VM_TMP_LD>, Enc_11039423, Requires<[HasV60T,UseHVX]> {
+let Inst{7-5} = 0b010;
+let Inst{13-11} = 0b000;
+let Inst{31-21} = 0b00101001000;
+let hasNewValue = 1;
+let opNewValue = 0;
+let addrMode = PostInc;
+let accessSize = Vector128Access;
+let isCVLoad = 1;
+let mayLoad = 1;
+let DecoderNamespace = "EXT_mmvec";
+let isCodeGenOnly = 1;
+let Constraints = "$Rx32 = $Rx32in";
+}
+def V6_vL32b_tmp_ppu : HInst<
+(outs VectorRegs:$Vd32, IntRegs:$Rx32),
+(ins IntRegs:$Rx32in, ModRegs:$Mu2),
+"$Vd32.tmp = vmem($Rx32++$Mu2)",
+CVI_VM_TMP_LD, TypeCVI_VM_TMP_LD>, Enc_15949334, Requires<[HasV60T,UseHVX]> {
+let Inst{12-5} = 0b00000010;
+let Inst{31-21} = 0b00101011000;
+let hasNewValue = 1;
+let opNewValue = 0;
+let addrMode = PostInc;
+let accessSize = Vector64Access;
+let isCVLoad = 1;
+let mayLoad = 1;
+let DecoderNamespace = "EXT_mmvec";
+let Constraints = "$Rx32 = $Rx32in";
+}
+def V6_vL32b_tmp_ppu_128B : HInst<
+(outs VectorRegs128B:$Vd32, IntRegs:$Rx32),
+(ins IntRegs:$Rx32in, ModRegs:$Mu2),
+"$Vd32.tmp = vmem($Rx32++$Mu2)",
+CVI_VM_TMP_LD, TypeCVI_VM_TMP_LD>, Enc_15949334, Requires<[HasV60T,UseHVX]> {
+let Inst{12-5} = 0b00000010;
+let Inst{31-21} = 0b00101011000;
+let hasNewValue = 1;
+let opNewValue = 0;
+let addrMode = PostInc;
+let accessSize = Vector128Access;
+let isCVLoad = 1;
+let mayLoad = 1;
+let DecoderNamespace = "EXT_mmvec";
+let isCodeGenOnly = 1;
+let Constraints = "$Rx32 = $Rx32in";
+}
+def V6_vL32b_tmp_pred_ai : HInst<
+(outs VectorRegs:$Vd32),
+(ins PredRegs:$Pv4, IntRegs:$Rt32, s4_0Imm:$Ii),
+"if ($Pv4) $Vd32.tmp = vmem($Rt32+#$Ii)",
+CVI_VM_TMP_LD, TypeCVI_VM_TMP_LD>, Enc_13338314, Requires<[HasV62T,UseHVX]> {
+let Inst{7-5} = 0b110;
+let Inst{31-21} = 0b00101000100;
+let isPredicated = 1;
+let hasNewValue = 1;
+let opNewValue = 0;
+let addrMode = BaseImmOffset;
+let accessSize = Vector64Access;
+let isCVLoad = 1;
+let mayLoad = 1;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vL32b_tmp_pred_ai_128B : HInst<
+(outs VectorRegs128B:$Vd32),
+(ins PredRegs:$Pv4, IntRegs:$Rt32, s4_0Imm:$Ii),
+"if ($Pv4) $Vd32.tmp = vmem($Rt32+#$Ii)",
+CVI_VM_TMP_LD, TypeCVI_VM_TMP_LD>, Enc_738356, Requires<[HasV62T,UseHVX]> {
+let Inst{7-5} = 0b110;
+let Inst{31-21} = 0b00101000100;
+let isPredicated = 1;
+let hasNewValue = 1;
+let opNewValue = 0;
+let addrMode = BaseImmOffset;
+let accessSize = Vector128Access;
+let isCVLoad = 1;
+let mayLoad = 1;
+let DecoderNamespace = "EXT_mmvec";
+let isCodeGenOnly = 1;
+}
+def V6_vL32b_tmp_pred_pi : HInst<
+(outs VectorRegs:$Vd32, IntRegs:$Rx32),
+(ins PredRegs:$Pv4, IntRegs:$Rx32in, s3_0Imm:$Ii),
+"if ($Pv4) $Vd32.tmp = vmem($Rx32++#$Ii)",
+CVI_VM_TMP_LD, TypeCVI_VM_TMP_LD>, Enc_14560494, Requires<[HasV62T,UseHVX]> {
+let Inst{7-5} = 0b110;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b00101001100;
+let isPredicated = 1;
+let hasNewValue = 1;
+let opNewValue = 0;
+let addrMode = PostInc;
+let accessSize = Vector64Access;
+let isCVLoad = 1;
+let mayLoad = 1;
+let DecoderNamespace = "EXT_mmvec";
+let Constraints = "$Rx32 = $Rx32in";
+}
+def V6_vL32b_tmp_pred_pi_128B : HInst<
+(outs VectorRegs128B:$Vd32, IntRegs:$Rx32),
+(ins PredRegs:$Pv4, IntRegs:$Rx32in, s3_0Imm:$Ii),
+"if ($Pv4) $Vd32.tmp = vmem($Rx32++#$Ii)",
+CVI_VM_TMP_LD, TypeCVI_VM_TMP_LD>, Enc_15560488, Requires<[HasV62T,UseHVX]> {
+let Inst{7-5} = 0b110;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b00101001100;
+let isPredicated = 1;
+let hasNewValue = 1;
+let opNewValue = 0;
+let addrMode = PostInc;
+let accessSize = Vector128Access;
+let isCVLoad = 1;
+let mayLoad = 1;
+let DecoderNamespace = "EXT_mmvec";
+let isCodeGenOnly = 1;
+let Constraints = "$Rx32 = $Rx32in";
+}
+def V6_vL32b_tmp_pred_ppu : HInst<
+(outs VectorRegs:$Vd32, IntRegs:$Rx32),
+(ins PredRegs:$Pv4, IntRegs:$Rx32in, ModRegs:$Mu2),
+"if ($Pv4) $Vd32.tmp = vmem($Rx32++$Mu2)",
+CVI_VM_TMP_LD, TypeCVI_VM_TMP_LD>, Enc_3158657, Requires<[HasV62T,UseHVX]> {
+let Inst{10-5} = 0b000110;
+let Inst{31-21} = 0b00101011100;
+let isPredicated = 1;
+let hasNewValue = 1;
+let opNewValue = 0;
+let addrMode = PostInc;
+let accessSize = Vector64Access;
+let isCVLoad = 1;
+let mayLoad = 1;
+let DecoderNamespace = "EXT_mmvec";
+let Constraints = "$Rx32 = $Rx32in";
+}
+def V6_vL32b_tmp_pred_ppu_128B : HInst<
+(outs VectorRegs128B:$Vd32, IntRegs:$Rx32),
+(ins PredRegs:$Pv4, IntRegs:$Rx32in, ModRegs:$Mu2),
+"if ($Pv4) $Vd32.tmp = vmem($Rx32++$Mu2)",
+CVI_VM_TMP_LD, TypeCVI_VM_TMP_LD>, Enc_3158657, Requires<[HasV62T,UseHVX]> {
+let Inst{10-5} = 0b000110;
+let Inst{31-21} = 0b00101011100;
+let isPredicated = 1;
+let hasNewValue = 1;
+let opNewValue = 0;
+let addrMode = PostInc;
+let accessSize = Vector128Access;
+let isCVLoad = 1;
+let mayLoad = 1;
+let DecoderNamespace = "EXT_mmvec";
+let isCodeGenOnly = 1;
+let Constraints = "$Rx32 = $Rx32in";
+}
+def V6_vS32Ub_ai : HInst<
+(outs),
+(ins IntRegs:$Rt32, s4_0Imm:$Ii, VectorRegs:$Vs32),
+"vmemu($Rt32+#$Ii) = $Vs32",
+CVI_VM_STU, TypeCVI_VM_STU>, Enc_6923828, Requires<[HasV60T,UseHVX]>, NewValueRel {
+let Inst{7-5} = 0b111;
+let Inst{12-11} = 0b00;
+let Inst{31-21} = 0b00101000001;
+let addrMode = BaseImmOffset;
+let accessSize = Vector64Access;
+let mayStore = 1;
+let BaseOpcode = "V6_vS32Ub_ai";
+let isPredicable = 1;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vS32Ub_ai_128B : HInst<
+(outs),
+(ins IntRegs:$Rt32, s4_0Imm:$Ii, VectorRegs128B:$Vs32),
+"vmemu($Rt32+#$Ii) = $Vs32",
+CVI_VM_STU, TypeCVI_VM_STU>, Enc_5757366, Requires<[HasV60T,UseHVX]>, NewValueRel {
+let Inst{7-5} = 0b111;
+let Inst{12-11} = 0b00;
+let Inst{31-21} = 0b00101000001;
+let addrMode = BaseImmOffset;
+let accessSize = Vector128Access;
+let mayStore = 1;
+let BaseOpcode = "V6_vS32Ub_ai_128B";
+let isPredicable = 1;
+let DecoderNamespace = "EXT_mmvec";
+let isCodeGenOnly = 1;
+}
+def V6_vS32Ub_npred_ai : HInst<
+(outs),
+(ins PredRegs:$Pv4, IntRegs:$Rt32, s4_0Imm:$Ii, VectorRegs:$Vs32),
+"if (!$Pv4) vmemu($Rt32+#$Ii) = $Vs32",
+CVI_VM_STU, TypeCVI_VM_STU>, Enc_10075393, Requires<[HasV60T,UseHVX]>, NewValueRel {
+let Inst{7-5} = 0b111;
+let Inst{31-21} = 0b00101000101;
+let isPredicated = 1;
+let isPredicatedFalse = 1;
+let addrMode = BaseImmOffset;
+let accessSize = Vector64Access;
+let mayStore = 1;
+let BaseOpcode = "V6_vS32Ub_ai";
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vS32Ub_npred_ai_128B : HInst<
+(outs),
+(ins PredRegs:$Pv4, IntRegs:$Rt32, s4_0Imm:$Ii, VectorRegs128B:$Vs32),
+"if (!$Pv4) vmemu($Rt32+#$Ii) = $Vs32",
+CVI_VM_STU, TypeCVI_VM_STU>, Enc_9470751, Requires<[HasV60T,UseHVX]>, NewValueRel {
+let Inst{7-5} = 0b111;
+let Inst{31-21} = 0b00101000101;
+let isPredicated = 1;
+let isPredicatedFalse = 1;
+let addrMode = BaseImmOffset;
+let accessSize = Vector128Access;
+let mayStore = 1;
+let BaseOpcode = "V6_vS32Ub_ai_128B";
+let DecoderNamespace = "EXT_mmvec";
+let isCodeGenOnly = 1;
+}
+def V6_vS32Ub_npred_pi : HInst<
+(outs IntRegs:$Rx32),
+(ins PredRegs:$Pv4, IntRegs:$Rx32in, s3_0Imm:$Ii, VectorRegs:$Vs32),
+"if (!$Pv4) vmemu($Rx32++#$Ii) = $Vs32",
+CVI_VM_STU, TypeCVI_VM_STU>, Enc_15459921, Requires<[HasV60T,UseHVX]>, NewValueRel {
+let Inst{7-5} = 0b111;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b00101001101;
+let isPredicated = 1;
+let isPredicatedFalse = 1;
+let addrMode = PostInc;
+let accessSize = Vector64Access;
+let mayStore = 1;
+let BaseOpcode = "V6_vS32Ub_pi";
+let DecoderNamespace = "EXT_mmvec";
+let Constraints = "$Rx32 = $Rx32in";
+}
+def V6_vS32Ub_npred_pi_128B : HInst<
+(outs IntRegs:$Rx32),
+(ins PredRegs:$Pv4, IntRegs:$Rx32in, s3_0Imm:$Ii, VectorRegs128B:$Vs32),
+"if (!$Pv4) vmemu($Rx32++#$Ii) = $Vs32",
+CVI_VM_STU, TypeCVI_VM_STU>, Enc_14459927, Requires<[HasV60T,UseHVX]>, NewValueRel {
+let Inst{7-5} = 0b111;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b00101001101;
+let isPredicated = 1;
+let isPredicatedFalse = 1;
+let addrMode = PostInc;
+let accessSize = Vector128Access;
+let mayStore = 1;
+let BaseOpcode = "V6_vS32Ub_pi_128B";
+let DecoderNamespace = "EXT_mmvec";
+let isCodeGenOnly = 1;
+let Constraints = "$Rx32 = $Rx32in";
+}
+def V6_vS32Ub_npred_ppu : HInst<
+(outs IntRegs:$Rx32),
+(ins PredRegs:$Pv4, IntRegs:$Rx32in, ModRegs:$Mu2, VectorRegs:$Vs32),
+"if (!$Pv4) vmemu($Rx32++$Mu2) = $Vs32",
+CVI_VM_STU, TypeCVI_VM_STU>, Enc_15733946, Requires<[HasV60T,UseHVX]>, NewValueRel {
+let Inst{10-5} = 0b000111;
+let Inst{31-21} = 0b00101011101;
+let isPredicated = 1;
+let isPredicatedFalse = 1;
+let addrMode = PostInc;
+let accessSize = Vector64Access;
+let mayStore = 1;
+let BaseOpcode = "V6_vS32Ub_ppu";
+let DecoderNamespace = "EXT_mmvec";
+let Constraints = "$Rx32 = $Rx32in";
+}
+def V6_vS32Ub_npred_ppu_128B : HInst<
+(outs IntRegs:$Rx32),
+(ins PredRegs:$Pv4, IntRegs:$Rx32in, ModRegs:$Mu2, VectorRegs128B:$Vs32),
+"if (!$Pv4) vmemu($Rx32++$Mu2) = $Vs32",
+CVI_VM_STU, TypeCVI_VM_STU>, Enc_15733946, Requires<[HasV60T,UseHVX]>, NewValueRel {
+let Inst{10-5} = 0b000111;
+let Inst{31-21} = 0b00101011101;
+let isPredicated = 1;
+let isPredicatedFalse = 1;
+let addrMode = PostInc;
+let accessSize = Vector128Access;
+let mayStore = 1;
+let BaseOpcode = "V6_vS32Ub_ppu_128B";
+let DecoderNamespace = "EXT_mmvec";
+let isCodeGenOnly = 1;
+let Constraints = "$Rx32 = $Rx32in";
+}
+def V6_vS32Ub_pi : HInst<
+(outs IntRegs:$Rx32),
+(ins IntRegs:$Rx32in, s3_0Imm:$Ii, VectorRegs:$Vs32),
+"vmemu($Rx32++#$Ii) = $Vs32",
+CVI_VM_STU, TypeCVI_VM_STU>, Enc_3296020, Requires<[HasV60T,UseHVX]>, NewValueRel {
+let Inst{7-5} = 0b111;
+let Inst{13-11} = 0b000;
+let Inst{31-21} = 0b00101001001;
+let addrMode = PostInc;
+let accessSize = Vector64Access;
+let mayStore = 1;
+let BaseOpcode = "V6_vS32Ub_pi";
+let isPredicable = 1;
+let DecoderNamespace = "EXT_mmvec";
+let Constraints = "$Rx32 = $Rx32in";
+}
+def V6_vS32Ub_pi_128B : HInst<
+(outs IntRegs:$Rx32),
+(ins IntRegs:$Rx32in, s3_0Imm:$Ii, VectorRegs128B:$Vs32),
+"vmemu($Rx32++#$Ii) = $Vs32",
+CVI_VM_STU, TypeCVI_VM_STU>, Enc_2296022, Requires<[HasV60T,UseHVX]>, NewValueRel {
+let Inst{7-5} = 0b111;
+let Inst{13-11} = 0b000;
+let Inst{31-21} = 0b00101001001;
+let addrMode = PostInc;
+let accessSize = Vector128Access;
+let mayStore = 1;
+let BaseOpcode = "V6_vS32Ub_pi_128B";
+let isPredicable = 1;
+let DecoderNamespace = "EXT_mmvec";
+let isCodeGenOnly = 1;
+let Constraints = "$Rx32 = $Rx32in";
+}
+def V6_vS32Ub_ppu : HInst<
+(outs IntRegs:$Rx32),
+(ins IntRegs:$Rx32in, ModRegs:$Mu2, VectorRegs:$Vs32),
+"vmemu($Rx32++$Mu2) = $Vs32",
+CVI_VM_STU, TypeCVI_VM_STU>, Enc_11281763, Requires<[HasV60T,UseHVX]>, NewValueRel {
+let Inst{12-5} = 0b00000111;
+let Inst{31-21} = 0b00101011001;
+let addrMode = PostInc;
+let accessSize = Vector64Access;
+let mayStore = 1;
+let BaseOpcode = "V6_vS32Ub_ppu";
+let isPredicable = 1;
+let DecoderNamespace = "EXT_mmvec";
+let Constraints = "$Rx32 = $Rx32in";
+}
+def V6_vS32Ub_ppu_128B : HInst<
+(outs IntRegs:$Rx32),
+(ins IntRegs:$Rx32in, ModRegs:$Mu2, VectorRegs128B:$Vs32),
+"vmemu($Rx32++$Mu2) = $Vs32",
+CVI_VM_STU, TypeCVI_VM_STU>, Enc_11281763, Requires<[HasV60T,UseHVX]>, NewValueRel {
+let Inst{12-5} = 0b00000111;
+let Inst{31-21} = 0b00101011001;
+let addrMode = PostInc;
+let accessSize = Vector128Access;
+let mayStore = 1;
+let BaseOpcode = "V6_vS32Ub_ppu_128B";
+let isPredicable = 1;
+let DecoderNamespace = "EXT_mmvec";
+let isCodeGenOnly = 1;
+let Constraints = "$Rx32 = $Rx32in";
+}
+def V6_vS32Ub_pred_ai : HInst<
+(outs),
+(ins PredRegs:$Pv4, IntRegs:$Rt32, s4_0Imm:$Ii, VectorRegs:$Vs32),
+"if ($Pv4) vmemu($Rt32+#$Ii) = $Vs32",
+CVI_VM_STU, TypeCVI_VM_STU>, Enc_10075393, Requires<[HasV60T,UseHVX]>, NewValueRel {
+let Inst{7-5} = 0b110;
+let Inst{31-21} = 0b00101000101;
+let isPredicated = 1;
+let addrMode = BaseImmOffset;
+let accessSize = Vector64Access;
+let mayStore = 1;
+let BaseOpcode = "V6_vS32Ub_ai";
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vS32Ub_pred_ai_128B : HInst<
+(outs),
+(ins PredRegs:$Pv4, IntRegs:$Rt32, s4_0Imm:$Ii, VectorRegs128B:$Vs32),
+"if ($Pv4) vmemu($Rt32+#$Ii) = $Vs32",
+CVI_VM_STU, TypeCVI_VM_STU>, Enc_9470751, Requires<[HasV60T,UseHVX]>, NewValueRel {
+let Inst{7-5} = 0b110;
+let Inst{31-21} = 0b00101000101;
+let isPredicated = 1;
+let addrMode = BaseImmOffset;
+let accessSize = Vector128Access;
+let mayStore = 1;
+let BaseOpcode = "V6_vS32Ub_ai_128B";
+let DecoderNamespace = "EXT_mmvec";
+let isCodeGenOnly = 1;
+}
+def V6_vS32Ub_pred_pi : HInst<
+(outs IntRegs:$Rx32),
+(ins PredRegs:$Pv4, IntRegs:$Rx32in, s3_0Imm:$Ii, VectorRegs:$Vs32),
+"if ($Pv4) vmemu($Rx32++#$Ii) = $Vs32",
+CVI_VM_STU, TypeCVI_VM_STU>, Enc_15459921, Requires<[HasV60T,UseHVX]>, NewValueRel {
+let Inst{7-5} = 0b110;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b00101001101;
+let isPredicated = 1;
+let addrMode = PostInc;
+let accessSize = Vector64Access;
+let mayStore = 1;
+let BaseOpcode = "V6_vS32Ub_pi";
+let DecoderNamespace = "EXT_mmvec";
+let Constraints = "$Rx32 = $Rx32in";
+}
+def V6_vS32Ub_pred_pi_128B : HInst<
+(outs IntRegs:$Rx32),
+(ins PredRegs:$Pv4, IntRegs:$Rx32in, s3_0Imm:$Ii, VectorRegs128B:$Vs32),
+"if ($Pv4) vmemu($Rx32++#$Ii) = $Vs32",
+CVI_VM_STU, TypeCVI_VM_STU>, Enc_14459927, Requires<[HasV60T,UseHVX]>, NewValueRel {
+let Inst{7-5} = 0b110;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b00101001101;
+let isPredicated = 1;
+let addrMode = PostInc;
+let accessSize = Vector128Access;
+let mayStore = 1;
+let BaseOpcode = "V6_vS32Ub_pi_128B";
+let DecoderNamespace = "EXT_mmvec";
+let isCodeGenOnly = 1;
+let Constraints = "$Rx32 = $Rx32in";
+}
+def V6_vS32Ub_pred_ppu : HInst<
+(outs IntRegs:$Rx32),
+(ins PredRegs:$Pv4, IntRegs:$Rx32in, ModRegs:$Mu2, VectorRegs:$Vs32),
+"if ($Pv4) vmemu($Rx32++$Mu2) = $Vs32",
+CVI_VM_STU, TypeCVI_VM_STU>, Enc_15733946, Requires<[HasV60T,UseHVX]>, NewValueRel {
+let Inst{10-5} = 0b000110;
+let Inst{31-21} = 0b00101011101;
+let isPredicated = 1;
+let addrMode = PostInc;
+let accessSize = Vector64Access;
+let mayStore = 1;
+let BaseOpcode = "V6_vS32Ub_ppu";
+let DecoderNamespace = "EXT_mmvec";
+let Constraints = "$Rx32 = $Rx32in";
+}
+def V6_vS32Ub_pred_ppu_128B : HInst<
+(outs IntRegs:$Rx32),
+(ins PredRegs:$Pv4, IntRegs:$Rx32in, ModRegs:$Mu2, VectorRegs128B:$Vs32),
+"if ($Pv4) vmemu($Rx32++$Mu2) = $Vs32",
+CVI_VM_STU, TypeCVI_VM_STU>, Enc_15733946, Requires<[HasV60T,UseHVX]>, NewValueRel {
+let Inst{10-5} = 0b000110;
+let Inst{31-21} = 0b00101011101;
+let isPredicated = 1;
+let addrMode = PostInc;
+let accessSize = Vector128Access;
+let mayStore = 1;
+let BaseOpcode = "V6_vS32Ub_ppu_128B";
+let DecoderNamespace = "EXT_mmvec";
+let isCodeGenOnly = 1;
+let Constraints = "$Rx32 = $Rx32in";
+}
+def V6_vS32b_ai : HInst<
+(outs),
+(ins IntRegs:$Rt32, s4_0Imm:$Ii, VectorRegs:$Vs32),
+"vmem($Rt32+#$Ii) = $Vs32",
+CVI_VM_ST, TypeCVI_VM_ST>, Enc_6923828, Requires<[HasV60T,UseHVX]>, NewValueRel {
+let Inst{7-5} = 0b000;
+let Inst{12-11} = 0b00;
+let Inst{31-21} = 0b00101000001;
+let addrMode = BaseImmOffset;
+let accessSize = Vector64Access;
+let mayStore = 1;
+let BaseOpcode = "V6_vS32b_ai";
+let isNVStorable = 1;
+let isPredicable = 1;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vS32b_ai_128B : HInst<
+(outs),
+(ins IntRegs:$Rt32, s4_0Imm:$Ii, VectorRegs128B:$Vs32),
+"vmem($Rt32+#$Ii) = $Vs32",
+CVI_VM_ST, TypeCVI_VM_ST>, Enc_5757366, Requires<[HasV60T,UseHVX]>, NewValueRel {
+let Inst{7-5} = 0b000;
+let Inst{12-11} = 0b00;
+let Inst{31-21} = 0b00101000001;
+let addrMode = BaseImmOffset;
+let accessSize = Vector128Access;
+let mayStore = 1;
+let BaseOpcode = "V6_vS32b_ai_128B";
+let isNVStorable = 1;
+let isPredicable = 1;
+let DecoderNamespace = "EXT_mmvec";
+let isCodeGenOnly = 1;
+}
+def V6_vS32b_new_ai : HInst<
+(outs),
+(ins IntRegs:$Rt32, s4_0Imm:$Ii, VectorRegs:$Os8),
+"vmem($Rt32+#$Ii) = $Os8.new",
+CVI_VM_NEW_ST, TypeCVI_VM_NEW_ST>, Enc_6608821, Requires<[HasV60T,UseHVX]>, NewValueRel {
+let Inst{7-3} = 0b00100;
+let Inst{12-11} = 0b00;
+let Inst{31-21} = 0b00101000001;
+let addrMode = BaseImmOffset;
+let accessSize = Vector64Access;
+let isNVStore = 1;
+let mayStore = 1;
+let isNewValue = 1;
+let BaseOpcode = "V6_vS32b_ai";
+let isPredicable = 1;
+let DecoderNamespace = "EXT_mmvec";
+let opNewValue = 2;
+}
+def V6_vS32b_new_ai_128B : HInst<
+(outs),
+(ins IntRegs:$Rt32, s4_0Imm:$Ii, VectorRegs128B:$Os8),
+"vmem($Rt32+#$Ii) = $Os8.new",
+CVI_VM_NEW_ST, TypeCVI_VM_NEW_ST>, Enc_2152247, Requires<[HasV60T,UseHVX]>, NewValueRel {
+let Inst{7-3} = 0b00100;
+let Inst{12-11} = 0b00;
+let Inst{31-21} = 0b00101000001;
+let addrMode = BaseImmOffset;
+let accessSize = Vector128Access;
+let isNVStore = 1;
+let mayStore = 1;
+let isNewValue = 1;
+let BaseOpcode = "V6_vS32b_ai_128B";
+let isPredicable = 1;
+let DecoderNamespace = "EXT_mmvec";
+let isCodeGenOnly = 1;
+let opNewValue = 2;
+}
+def V6_vS32b_new_npred_ai : HInst<
+(outs),
+(ins PredRegs:$Pv4, IntRegs:$Rt32, s4_0Imm:$Ii, VectorRegs:$Os8),
+"if (!$Pv4) vmem($Rt32+#$Ii) = $Os8.new",
+CVI_VM_NEW_ST, TypeCVI_VM_NEW_ST>, Enc_9372046, Requires<[HasV60T,UseHVX]>, NewValueRel {
+let Inst{7-3} = 0b01101;
+let Inst{31-21} = 0b00101000101;
+let isPredicated = 1;
+let isPredicatedFalse = 1;
+let addrMode = BaseImmOffset;
+let accessSize = Vector64Access;
+let isNVStore = 1;
+let mayStore = 1;
+let isNewValue = 1;
+let BaseOpcode = "V6_vS32b_ai";
+let DecoderNamespace = "EXT_mmvec";
+let opNewValue = 3;
+}
+def V6_vS32b_new_npred_ai_128B : HInst<
+(outs),
+(ins PredRegs:$Pv4, IntRegs:$Rt32, s4_0Imm:$Ii, VectorRegs128B:$Os8),
+"if (!$Pv4) vmem($Rt32+#$Ii) = $Os8.new",
+CVI_VM_NEW_ST, TypeCVI_VM_NEW_ST>, Enc_13937564, Requires<[HasV60T,UseHVX]>, NewValueRel {
+let Inst{7-3} = 0b01101;
+let Inst{31-21} = 0b00101000101;
+let isPredicated = 1;
+let isPredicatedFalse = 1;
+let addrMode = BaseImmOffset;
+let accessSize = Vector128Access;
+let isNVStore = 1;
+let mayStore = 1;
+let isNewValue = 1;
+let BaseOpcode = "V6_vS32b_ai_128B";
+let DecoderNamespace = "EXT_mmvec";
+let isCodeGenOnly = 1;
+let opNewValue = 3;
+}
+def V6_vS32b_new_npred_pi : HInst<
+(outs IntRegs:$Rx32),
+(ins PredRegs:$Pv4, IntRegs:$Rx32in, s3_0Imm:$Ii, VectorRegs:$Os8),
+"if (!$Pv4) vmem($Rx32++#$Ii) = $Os8.new",
+CVI_VM_NEW_ST, TypeCVI_VM_NEW_ST>, Enc_3735566, Requires<[HasV60T,UseHVX]>, NewValueRel {
+let Inst{7-3} = 0b01101;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b00101001101;
+let isPredicated = 1;
+let isPredicatedFalse = 1;
+let addrMode = PostInc;
+let accessSize = Vector64Access;
+let isNVStore = 1;
+let mayStore = 1;
+let isNewValue = 1;
+let BaseOpcode = "V6_vS32b_pi";
+let DecoderNamespace = "EXT_mmvec";
+let opNewValue = 4;
+let Constraints = "$Rx32 = $Rx32in";
+}
+def V6_vS32b_new_npred_pi_128B : HInst<
+(outs IntRegs:$Rx32),
+(ins PredRegs:$Pv4, IntRegs:$Rx32in, s3_0Imm:$Ii, VectorRegs128B:$Os8),
+"if (!$Pv4) vmem($Rx32++#$Ii) = $Os8.new",
+CVI_VM_NEW_ST, TypeCVI_VM_NEW_ST>, Enc_2735552, Requires<[HasV60T,UseHVX]>, NewValueRel {
+let Inst{7-3} = 0b01101;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b00101001101;
+let isPredicated = 1;
+let isPredicatedFalse = 1;
+let addrMode = PostInc;
+let accessSize = Vector128Access;
+let isNVStore = 1;
+let mayStore = 1;
+let isNewValue = 1;
+let BaseOpcode = "V6_vS32b_pi_128B";
+let DecoderNamespace = "EXT_mmvec";
+let isCodeGenOnly = 1;
+let opNewValue = 4;
+let Constraints = "$Rx32 = $Rx32in";
+}
+def V6_vS32b_new_npred_ppu : HInst<
+(outs IntRegs:$Rx32),
+(ins PredRegs:$Pv4, IntRegs:$Rx32in, ModRegs:$Mu2, VectorRegs:$Os8),
+"if (!$Pv4) vmem($Rx32++$Mu2) = $Os8.new",
+CVI_VM_NEW_ST, TypeCVI_VM_NEW_ST>, Enc_8498433, Requires<[HasV60T,UseHVX]>, NewValueRel {
+let Inst{10-3} = 0b00001101;
+let Inst{31-21} = 0b00101011101;
+let isPredicated = 1;
+let isPredicatedFalse = 1;
+let addrMode = PostInc;
+let accessSize = Vector64Access;
+let isNVStore = 1;
+let mayStore = 1;
+let isNewValue = 1;
+let BaseOpcode = "V6_vS32b_ppu";
+let DecoderNamespace = "EXT_mmvec";
+let opNewValue = 4;
+let Constraints = "$Rx32 = $Rx32in";
+}
+def V6_vS32b_new_npred_ppu_128B : HInst<
+(outs IntRegs:$Rx32),
+(ins PredRegs:$Pv4, IntRegs:$Rx32in, ModRegs:$Mu2, VectorRegs128B:$Os8),
+"if (!$Pv4) vmem($Rx32++$Mu2) = $Os8.new",
+CVI_VM_NEW_ST, TypeCVI_VM_NEW_ST>, Enc_8498433, Requires<[HasV60T,UseHVX]>, NewValueRel {
+let Inst{10-3} = 0b00001101;
+let Inst{31-21} = 0b00101011101;
+let isPredicated = 1;
+let isPredicatedFalse = 1;
+let addrMode = PostInc;
+let accessSize = Vector128Access;
+let isNVStore = 1;
+let mayStore = 1;
+let isNewValue = 1;
+let BaseOpcode = "V6_vS32b_ppu_128B";
+let DecoderNamespace = "EXT_mmvec";
+let isCodeGenOnly = 1;
+let opNewValue = 4;
+let Constraints = "$Rx32 = $Rx32in";
+}
+def V6_vS32b_new_pi : HInst<
+(outs IntRegs:$Rx32),
+(ins IntRegs:$Rx32in, s3_0Imm:$Ii, VectorRegs:$Os8),
+"vmem($Rx32++#$Ii) = $Os8.new",
+CVI_VM_NEW_ST, TypeCVI_VM_NEW_ST>, Enc_12244921, Requires<[HasV60T,UseHVX]>, NewValueRel {
+let Inst{7-3} = 0b00100;
+let Inst{13-11} = 0b000;
+let Inst{31-21} = 0b00101001001;
+let addrMode = PostInc;
+let accessSize = Vector64Access;
+let isNVStore = 1;
+let mayStore = 1;
+let isNewValue = 1;
+let BaseOpcode = "V6_vS32b_pi";
+let isPredicable = 1;
+let DecoderNamespace = "EXT_mmvec";
+let opNewValue = 3;
+let Constraints = "$Rx32 = $Rx32in";
+}
+def V6_vS32b_new_pi_128B : HInst<
+(outs IntRegs:$Rx32),
+(ins IntRegs:$Rx32in, s3_0Imm:$Ii, VectorRegs128B:$Os8),
+"vmem($Rx32++#$Ii) = $Os8.new",
+CVI_VM_NEW_ST, TypeCVI_VM_NEW_ST>, Enc_11244923, Requires<[HasV60T,UseHVX]>, NewValueRel {
+let Inst{7-3} = 0b00100;
+let Inst{13-11} = 0b000;
+let Inst{31-21} = 0b00101001001;
+let addrMode = PostInc;
+let accessSize = Vector128Access;
+let isNVStore = 1;
+let mayStore = 1;
+let isNewValue = 1;
+let BaseOpcode = "V6_vS32b_pi_128B";
+let isPredicable = 1;
+let DecoderNamespace = "EXT_mmvec";
+let isCodeGenOnly = 1;
+let opNewValue = 3;
+let Constraints = "$Rx32 = $Rx32in";
+}
+def V6_vS32b_new_ppu : HInst<
+(outs IntRegs:$Rx32),
+(ins IntRegs:$Rx32in, ModRegs:$Mu2, VectorRegs:$Os8),
+"vmem($Rx32++$Mu2) = $Os8.new",
+CVI_VM_NEW_ST, TypeCVI_VM_NEW_ST>, Enc_1589406, Requires<[HasV60T,UseHVX]>, NewValueRel {
+let Inst{12-3} = 0b0000000100;
+let Inst{31-21} = 0b00101011001;
+let addrMode = PostInc;
+let accessSize = Vector64Access;
+let isNVStore = 1;
+let mayStore = 1;
+let isNewValue = 1;
+let BaseOpcode = "V6_vS32b_ppu";
+let isPredicable = 1;
+let DecoderNamespace = "EXT_mmvec";
+let opNewValue = 3;
+let Constraints = "$Rx32 = $Rx32in";
+}
+def V6_vS32b_new_ppu_128B : HInst<
+(outs IntRegs:$Rx32),
+(ins IntRegs:$Rx32in, ModRegs:$Mu2, VectorRegs128B:$Os8),
+"vmem($Rx32++$Mu2) = $Os8.new",
+CVI_VM_NEW_ST, TypeCVI_VM_NEW_ST>, Enc_1589406, Requires<[HasV60T,UseHVX]>, NewValueRel {
+let Inst{12-3} = 0b0000000100;
+let Inst{31-21} = 0b00101011001;
+let addrMode = PostInc;
+let accessSize = Vector128Access;
+let isNVStore = 1;
+let mayStore = 1;
+let isNewValue = 1;
+let BaseOpcode = "V6_vS32b_ppu_128B";
+let isPredicable = 1;
+let DecoderNamespace = "EXT_mmvec";
+let isCodeGenOnly = 1;
+let opNewValue = 3;
+let Constraints = "$Rx32 = $Rx32in";
+}
+def V6_vS32b_new_pred_ai : HInst<
+(outs),
+(ins PredRegs:$Pv4, IntRegs:$Rt32, s4_0Imm:$Ii, VectorRegs:$Os8),
+"if ($Pv4) vmem($Rt32+#$Ii) = $Os8.new",
+CVI_VM_NEW_ST, TypeCVI_VM_NEW_ST>, Enc_9372046, Requires<[HasV60T,UseHVX]>, NewValueRel {
+let Inst{7-3} = 0b01000;
+let Inst{31-21} = 0b00101000101;
+let isPredicated = 1;
+let addrMode = BaseImmOffset;
+let accessSize = Vector64Access;
+let isNVStore = 1;
+let mayStore = 1;
+let isNewValue = 1;
+let BaseOpcode = "V6_vS32b_ai";
+let DecoderNamespace = "EXT_mmvec";
+let opNewValue = 3;
+}
+def V6_vS32b_new_pred_ai_128B : HInst<
+(outs),
+(ins PredRegs:$Pv4, IntRegs:$Rt32, s4_0Imm:$Ii, VectorRegs128B:$Os8),
+"if ($Pv4) vmem($Rt32+#$Ii) = $Os8.new",
+CVI_VM_NEW_ST, TypeCVI_VM_NEW_ST>, Enc_13937564, Requires<[HasV60T,UseHVX]>, NewValueRel {
+let Inst{7-3} = 0b01000;
+let Inst{31-21} = 0b00101000101;
+let isPredicated = 1;
+let addrMode = BaseImmOffset;
+let accessSize = Vector128Access;
+let isNVStore = 1;
+let mayStore = 1;
+let isNewValue = 1;
+let BaseOpcode = "V6_vS32b_ai_128B";
+let DecoderNamespace = "EXT_mmvec";
+let isCodeGenOnly = 1;
+let opNewValue = 3;
+}
+def V6_vS32b_new_pred_pi : HInst<
+(outs IntRegs:$Rx32),
+(ins PredRegs:$Pv4, IntRegs:$Rx32in, s3_0Imm:$Ii, VectorRegs:$Os8),
+"if ($Pv4) vmem($Rx32++#$Ii) = $Os8.new",
+CVI_VM_NEW_ST, TypeCVI_VM_NEW_ST>, Enc_3735566, Requires<[HasV60T,UseHVX]>, NewValueRel {
+let Inst{7-3} = 0b01000;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b00101001101;
+let isPredicated = 1;
+let addrMode = PostInc;
+let accessSize = Vector64Access;
+let isNVStore = 1;
+let mayStore = 1;
+let isNewValue = 1;
+let BaseOpcode = "V6_vS32b_pi";
+let DecoderNamespace = "EXT_mmvec";
+let opNewValue = 4;
+let Constraints = "$Rx32 = $Rx32in";
+}
+def V6_vS32b_new_pred_pi_128B : HInst<
+(outs IntRegs:$Rx32),
+(ins PredRegs:$Pv4, IntRegs:$Rx32in, s3_0Imm:$Ii, VectorRegs128B:$Os8),
+"if ($Pv4) vmem($Rx32++#$Ii) = $Os8.new",
+CVI_VM_NEW_ST, TypeCVI_VM_NEW_ST>, Enc_2735552, Requires<[HasV60T,UseHVX]>, NewValueRel {
+let Inst{7-3} = 0b01000;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b00101001101;
+let isPredicated = 1;
+let addrMode = PostInc;
+let accessSize = Vector128Access;
+let isNVStore = 1;
+let mayStore = 1;
+let isNewValue = 1;
+let BaseOpcode = "V6_vS32b_pi_128B";
+let DecoderNamespace = "EXT_mmvec";
+let isCodeGenOnly = 1;
+let opNewValue = 4;
+let Constraints = "$Rx32 = $Rx32in";
+}
+def V6_vS32b_new_pred_ppu : HInst<
+(outs IntRegs:$Rx32),
+(ins PredRegs:$Pv4, IntRegs:$Rx32in, ModRegs:$Mu2, VectorRegs:$Os8),
+"if ($Pv4) vmem($Rx32++$Mu2) = $Os8.new",
+CVI_VM_NEW_ST, TypeCVI_VM_NEW_ST>, Enc_8498433, Requires<[HasV60T,UseHVX]>, NewValueRel {
+let Inst{10-3} = 0b00001000;
+let Inst{31-21} = 0b00101011101;
+let isPredicated = 1;
+let addrMode = PostInc;
+let accessSize = Vector64Access;
+let isNVStore = 1;
+let mayStore = 1;
+let isNewValue = 1;
+let BaseOpcode = "V6_vS32b_ppu";
+let DecoderNamespace = "EXT_mmvec";
+let opNewValue = 4;
+let Constraints = "$Rx32 = $Rx32in";
+}
+def V6_vS32b_new_pred_ppu_128B : HInst<
+(outs IntRegs:$Rx32),
+(ins PredRegs:$Pv4, IntRegs:$Rx32in, ModRegs:$Mu2, VectorRegs128B:$Os8),
+"if ($Pv4) vmem($Rx32++$Mu2) = $Os8.new",
+CVI_VM_NEW_ST, TypeCVI_VM_NEW_ST>, Enc_8498433, Requires<[HasV60T,UseHVX]>, NewValueRel {
+let Inst{10-3} = 0b00001000;
+let Inst{31-21} = 0b00101011101;
+let isPredicated = 1;
+let addrMode = PostInc;
+let accessSize = Vector128Access;
+let isNVStore = 1;
+let mayStore = 1;
+let isNewValue = 1;
+let BaseOpcode = "V6_vS32b_ppu_128B";
+let DecoderNamespace = "EXT_mmvec";
+let isCodeGenOnly = 1;
+let opNewValue = 4;
+let Constraints = "$Rx32 = $Rx32in";
+}
+def V6_vS32b_npred_ai : HInst<
+(outs),
+(ins PredRegs:$Pv4, IntRegs:$Rt32, s4_0Imm:$Ii, VectorRegs:$Vs32),
+"if (!$Pv4) vmem($Rt32+#$Ii) = $Vs32",
+CVI_VM_ST, TypeCVI_VM_ST>, Enc_10075393, Requires<[HasV60T,UseHVX]>, NewValueRel {
+let Inst{7-5} = 0b001;
+let Inst{31-21} = 0b00101000101;
+let isPredicated = 1;
+let isPredicatedFalse = 1;
+let addrMode = BaseImmOffset;
+let accessSize = Vector64Access;
+let mayStore = 1;
+let BaseOpcode = "V6_vS32b_ai";
+let isNVStorable = 1;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vS32b_npred_ai_128B : HInst<
+(outs),
+(ins PredRegs:$Pv4, IntRegs:$Rt32, s4_0Imm:$Ii, VectorRegs128B:$Vs32),
+"if (!$Pv4) vmem($Rt32+#$Ii) = $Vs32",
+CVI_VM_ST, TypeCVI_VM_ST>, Enc_9470751, Requires<[HasV60T,UseHVX]>, NewValueRel {
+let Inst{7-5} = 0b001;
+let Inst{31-21} = 0b00101000101;
+let isPredicated = 1;
+let isPredicatedFalse = 1;
+let addrMode = BaseImmOffset;
+let accessSize = Vector128Access;
+let mayStore = 1;
+let BaseOpcode = "V6_vS32b_ai_128B";
+let isNVStorable = 1;
+let DecoderNamespace = "EXT_mmvec";
+let isCodeGenOnly = 1;
+}
+def V6_vS32b_npred_pi : HInst<
+(outs IntRegs:$Rx32),
+(ins PredRegs:$Pv4, IntRegs:$Rx32in, s3_0Imm:$Ii, VectorRegs:$Vs32),
+"if (!$Pv4) vmem($Rx32++#$Ii) = $Vs32",
+CVI_VM_ST, TypeCVI_VM_ST>, Enc_15459921, Requires<[HasV60T,UseHVX]>, NewValueRel {
+let Inst{7-5} = 0b001;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b00101001101;
+let isPredicated = 1;
+let isPredicatedFalse = 1;
+let addrMode = PostInc;
+let accessSize = Vector64Access;
+let mayStore = 1;
+let BaseOpcode = "V6_vS32b_pi";
+let isNVStorable = 1;
+let DecoderNamespace = "EXT_mmvec";
+let Constraints = "$Rx32 = $Rx32in";
+}
+def V6_vS32b_npred_pi_128B : HInst<
+(outs IntRegs:$Rx32),
+(ins PredRegs:$Pv4, IntRegs:$Rx32in, s3_0Imm:$Ii, VectorRegs128B:$Vs32),
+"if (!$Pv4) vmem($Rx32++#$Ii) = $Vs32",
+CVI_VM_ST, TypeCVI_VM_ST>, Enc_14459927, Requires<[HasV60T,UseHVX]>, NewValueRel {
+let Inst{7-5} = 0b001;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b00101001101;
+let isPredicated = 1;
+let isPredicatedFalse = 1;
+let addrMode = PostInc;
+let accessSize = Vector128Access;
+let mayStore = 1;
+let BaseOpcode = "V6_vS32b_pi_128B";
+let isNVStorable = 1;
+let DecoderNamespace = "EXT_mmvec";
+let isCodeGenOnly = 1;
+let Constraints = "$Rx32 = $Rx32in";
+}
+def V6_vS32b_npred_ppu : HInst<
+(outs IntRegs:$Rx32),
+(ins PredRegs:$Pv4, IntRegs:$Rx32in, ModRegs:$Mu2, VectorRegs:$Vs32),
+"if (!$Pv4) vmem($Rx32++$Mu2) = $Vs32",
+CVI_VM_ST, TypeCVI_VM_ST>, Enc_15733946, Requires<[HasV60T,UseHVX]>, NewValueRel {
+let Inst{10-5} = 0b000001;
+let Inst{31-21} = 0b00101011101;
+let isPredicated = 1;
+let isPredicatedFalse = 1;
+let addrMode = PostInc;
+let accessSize = Vector64Access;
+let mayStore = 1;
+let BaseOpcode = "V6_vS32b_ppu";
+let isNVStorable = 1;
+let DecoderNamespace = "EXT_mmvec";
+let Constraints = "$Rx32 = $Rx32in";
+}
+def V6_vS32b_npred_ppu_128B : HInst<
+(outs IntRegs:$Rx32),
+(ins PredRegs:$Pv4, IntRegs:$Rx32in, ModRegs:$Mu2, VectorRegs128B:$Vs32),
+"if (!$Pv4) vmem($Rx32++$Mu2) = $Vs32",
+CVI_VM_ST, TypeCVI_VM_ST>, Enc_15733946, Requires<[HasV60T,UseHVX]>, NewValueRel {
+let Inst{10-5} = 0b000001;
+let Inst{31-21} = 0b00101011101;
+let isPredicated = 1;
+let isPredicatedFalse = 1;
+let addrMode = PostInc;
+let accessSize = Vector128Access;
+let mayStore = 1;
+let BaseOpcode = "V6_vS32b_ppu_128B";
+let isNVStorable = 1;
+let DecoderNamespace = "EXT_mmvec";
+let isCodeGenOnly = 1;
+let Constraints = "$Rx32 = $Rx32in";
+}
+def V6_vS32b_nqpred_ai : HInst<
+(outs),
+(ins VecPredRegs:$Qv4, IntRegs:$Rt32, s4_0Imm:$Ii, VectorRegs:$Vs32),
+"if (!$Qv4) vmem($Rt32+#$Ii) = $Vs32",
+CVI_VM_ST, TypeCVI_VM_ST>, Enc_16279406, Requires<[HasV60T,UseHVX]> {
+let Inst{7-5} = 0b001;
+let Inst{31-21} = 0b00101000100;
+let addrMode = BaseImmOffset;
+let accessSize = Vector64Access;
+let mayStore = 1;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vS32b_nqpred_ai_128B : HInst<
+(outs),
+(ins VecPredRegs128B:$Qv4, IntRegs:$Rt32, s4_0Imm:$Ii, VectorRegs128B:$Vs32),
+"if (!$Qv4) vmem($Rt32+#$Ii) = $Vs32",
+CVI_VM_ST, TypeCVI_VM_ST>, Enc_2703240, Requires<[HasV60T,UseHVX]> {
+let Inst{7-5} = 0b001;
+let Inst{31-21} = 0b00101000100;
+let addrMode = BaseImmOffset;
+let accessSize = Vector128Access;
+let mayStore = 1;
+let DecoderNamespace = "EXT_mmvec";
+let isCodeGenOnly = 1;
+}
+def V6_vS32b_nqpred_pi : HInst<
+(outs IntRegs:$Rx32),
+(ins VecPredRegs:$Qv4, IntRegs:$Rx32in, s3_0Imm:$Ii, VectorRegs:$Vs32),
+"if (!$Qv4) vmem($Rx32++#$Ii) = $Vs32",
+CVI_VM_ST, TypeCVI_VM_ST>, Enc_12397062, Requires<[HasV60T,UseHVX]> {
+let Inst{7-5} = 0b001;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b00101001100;
+let addrMode = PostInc;
+let accessSize = Vector64Access;
+let mayStore = 1;
+let DecoderNamespace = "EXT_mmvec";
+let Constraints = "$Rx32 = $Rx32in";
+}
+def V6_vS32b_nqpred_pi_128B : HInst<
+(outs IntRegs:$Rx32),
+(ins VecPredRegs128B:$Qv4, IntRegs:$Rx32in, s3_0Imm:$Ii, VectorRegs128B:$Vs32),
+"if (!$Qv4) vmem($Rx32++#$Ii) = $Vs32",
+CVI_VM_ST, TypeCVI_VM_ST>, Enc_13397056, Requires<[HasV60T,UseHVX]> {
+let Inst{7-5} = 0b001;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b00101001100;
+let addrMode = PostInc;
+let accessSize = Vector128Access;
+let mayStore = 1;
+let DecoderNamespace = "EXT_mmvec";
+let isCodeGenOnly = 1;
+let Constraints = "$Rx32 = $Rx32in";
+}
+def V6_vS32b_nqpred_ppu : HInst<
+(outs IntRegs:$Rx32),
+(ins VecPredRegs:$Qv4, IntRegs:$Rx32in, ModRegs:$Mu2, VectorRegs:$Vs32),
+"if (!$Qv4) vmem($Rx32++$Mu2) = $Vs32",
+CVI_VM_ST, TypeCVI_VM_ST>, Enc_13425035, Requires<[HasV60T,UseHVX]> {
+let Inst{10-5} = 0b000001;
+let Inst{31-21} = 0b00101011100;
+let addrMode = PostInc;
+let accessSize = Vector64Access;
+let mayStore = 1;
+let DecoderNamespace = "EXT_mmvec";
+let Constraints = "$Rx32 = $Rx32in";
+}
+def V6_vS32b_nqpred_ppu_128B : HInst<
+(outs IntRegs:$Rx32),
+(ins VecPredRegs128B:$Qv4, IntRegs:$Rx32in, ModRegs:$Mu2, VectorRegs128B:$Vs32),
+"if (!$Qv4) vmem($Rx32++$Mu2) = $Vs32",
+CVI_VM_ST, TypeCVI_VM_ST>, Enc_13425035, Requires<[HasV60T,UseHVX]> {
+let Inst{10-5} = 0b000001;
+let Inst{31-21} = 0b00101011100;
+let addrMode = PostInc;
+let accessSize = Vector128Access;
+let mayStore = 1;
+let DecoderNamespace = "EXT_mmvec";
+let isCodeGenOnly = 1;
+let Constraints = "$Rx32 = $Rx32in";
+}
+def V6_vS32b_nt_ai : HInst<
+(outs),
+(ins IntRegs:$Rt32, s4_0Imm:$Ii, VectorRegs:$Vs32),
+"vmem($Rt32+#$Ii):nt = $Vs32",
+CVI_VM_ST, TypeCVI_VM_ST>, Enc_6923828, Requires<[HasV60T,UseHVX]>, NewValueRel {
+let Inst{7-5} = 0b000;
+let Inst{12-11} = 0b00;
+let Inst{31-21} = 0b00101000011;
+let addrMode = BaseImmOffset;
+let accessSize = Vector64Access;
+let mayStore = 1;
+let isNonTemporal = 1;
+let BaseOpcode = "V6_vS32b_ai";
+let isNVStorable = 1;
+let isPredicable = 1;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vS32b_nt_ai_128B : HInst<
+(outs),
+(ins IntRegs:$Rt32, s4_0Imm:$Ii, VectorRegs128B:$Vs32),
+"vmem($Rt32+#$Ii):nt = $Vs32",
+CVI_VM_ST, TypeCVI_VM_ST>, Enc_5757366, Requires<[HasV60T,UseHVX]>, NewValueRel {
+let Inst{7-5} = 0b000;
+let Inst{12-11} = 0b00;
+let Inst{31-21} = 0b00101000011;
+let addrMode = BaseImmOffset;
+let accessSize = Vector128Access;
+let mayStore = 1;
+let isNonTemporal = 1;
+let BaseOpcode = "V6_vS32b_ai_128B";
+let isNVStorable = 1;
+let isPredicable = 1;
+let DecoderNamespace = "EXT_mmvec";
+let isCodeGenOnly = 1;
+}
+def V6_vS32b_nt_new_ai : HInst<
+(outs),
+(ins IntRegs:$Rt32, s4_0Imm:$Ii, VectorRegs:$Os8),
+"vmem($Rt32+#$Ii):nt = $Os8.new",
+CVI_VM_NEW_ST, TypeCVI_VM_NEW_ST>, Enc_6608821, Requires<[HasV60T,UseHVX]>, NewValueRel {
+let Inst{7-3} = 0b00100;
+let Inst{12-11} = 0b00;
+let Inst{31-21} = 0b00101000011;
+let addrMode = BaseImmOffset;
+let accessSize = Vector64Access;
+let isNVStore = 1;
+let mayStore = 1;
+let isNonTemporal = 1;
+let isNewValue = 1;
+let BaseOpcode = "V6_vS32b_ai";
+let isPredicable = 1;
+let DecoderNamespace = "EXT_mmvec";
+let opNewValue = 2;
+}
+def V6_vS32b_nt_new_ai_128B : HInst<
+(outs),
+(ins IntRegs:$Rt32, s4_0Imm:$Ii, VectorRegs128B:$Os8),
+"vmem($Rt32+#$Ii):nt = $Os8.new",
+CVI_VM_NEW_ST, TypeCVI_VM_NEW_ST>, Enc_2152247, Requires<[HasV60T,UseHVX]>, NewValueRel {
+let Inst{7-3} = 0b00100;
+let Inst{12-11} = 0b00;
+let Inst{31-21} = 0b00101000011;
+let addrMode = BaseImmOffset;
+let accessSize = Vector128Access;
+let isNVStore = 1;
+let mayStore = 1;
+let isNonTemporal = 1;
+let isNewValue = 1;
+let BaseOpcode = "V6_vS32b_ai_128B";
+let isPredicable = 1;
+let DecoderNamespace = "EXT_mmvec";
+let isCodeGenOnly = 1;
+let opNewValue = 2;
+}
+def V6_vS32b_nt_new_npred_ai : HInst<
+(outs),
+(ins PredRegs:$Pv4, IntRegs:$Rt32, s4_0Imm:$Ii, VectorRegs:$Os8),
+"if (!$Pv4) vmem($Rt32+#$Ii):nt = $Os8.new",
+CVI_VM_NEW_ST, TypeCVI_VM_NEW_ST>, Enc_9372046, Requires<[HasV60T,UseHVX]>, NewValueRel {
+let Inst{7-3} = 0b01111;
+let Inst{31-21} = 0b00101000111;
+let isPredicated = 1;
+let isPredicatedFalse = 1;
+let addrMode = BaseImmOffset;
+let accessSize = Vector64Access;
+let isNVStore = 1;
+let mayStore = 1;
+let isNonTemporal = 1;
+let isNewValue = 1;
+let BaseOpcode = "V6_vS32b_ai";
+let DecoderNamespace = "EXT_mmvec";
+let opNewValue = 3;
+}
+def V6_vS32b_nt_new_npred_ai_128B : HInst<
+(outs),
+(ins PredRegs:$Pv4, IntRegs:$Rt32, s4_0Imm:$Ii, VectorRegs128B:$Os8),
+"if (!$Pv4) vmem($Rt32+#$Ii):nt = $Os8.new",
+CVI_VM_NEW_ST, TypeCVI_VM_NEW_ST>, Enc_13937564, Requires<[HasV60T,UseHVX]>, NewValueRel {
+let Inst{7-3} = 0b01111;
+let Inst{31-21} = 0b00101000111;
+let isPredicated = 1;
+let isPredicatedFalse = 1;
+let addrMode = BaseImmOffset;
+let accessSize = Vector128Access;
+let isNVStore = 1;
+let mayStore = 1;
+let isNonTemporal = 1;
+let isNewValue = 1;
+let BaseOpcode = "V6_vS32b_ai_128B";
+let DecoderNamespace = "EXT_mmvec";
+let isCodeGenOnly = 1;
+let opNewValue = 3;
+}
+def V6_vS32b_nt_new_npred_pi : HInst<
+(outs IntRegs:$Rx32),
+(ins PredRegs:$Pv4, IntRegs:$Rx32in, s3_0Imm:$Ii, VectorRegs:$Os8),
+"if (!$Pv4) vmem($Rx32++#$Ii):nt = $Os8.new",
+CVI_VM_NEW_ST, TypeCVI_VM_NEW_ST>, Enc_3735566, Requires<[HasV60T,UseHVX]>, NewValueRel {
+let Inst{7-3} = 0b01111;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b00101001111;
+let isPredicated = 1;
+let isPredicatedFalse = 1;
+let addrMode = PostInc;
+let accessSize = Vector64Access;
+let isNVStore = 1;
+let mayStore = 1;
+let isNonTemporal = 1;
+let isNewValue = 1;
+let BaseOpcode = "V6_vS32b_pi";
+let DecoderNamespace = "EXT_mmvec";
+let opNewValue = 4;
+let Constraints = "$Rx32 = $Rx32in";
+}
+def V6_vS32b_nt_new_npred_pi_128B : HInst<
+(outs IntRegs:$Rx32),
+(ins PredRegs:$Pv4, IntRegs:$Rx32in, s3_0Imm:$Ii, VectorRegs128B:$Os8),
+"if (!$Pv4) vmem($Rx32++#$Ii):nt = $Os8.new",
+CVI_VM_NEW_ST, TypeCVI_VM_NEW_ST>, Enc_2735552, Requires<[HasV60T,UseHVX]>, NewValueRel {
+let Inst{7-3} = 0b01111;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b00101001111;
+let isPredicated = 1;
+let isPredicatedFalse = 1;
+let addrMode = PostInc;
+let accessSize = Vector128Access;
+let isNVStore = 1;
+let mayStore = 1;
+let isNonTemporal = 1;
+let isNewValue = 1;
+let BaseOpcode = "V6_vS32b_pi_128B";
+let DecoderNamespace = "EXT_mmvec";
+let isCodeGenOnly = 1;
+let opNewValue = 4;
+let Constraints = "$Rx32 = $Rx32in";
+}
+def V6_vS32b_nt_new_npred_ppu : HInst<
+(outs IntRegs:$Rx32),
+(ins PredRegs:$Pv4, IntRegs:$Rx32in, ModRegs:$Mu2, VectorRegs:$Os8),
+"if (!$Pv4) vmem($Rx32++$Mu2):nt = $Os8.new",
+CVI_VM_NEW_ST, TypeCVI_VM_NEW_ST>, Enc_8498433, Requires<[HasV60T,UseHVX]>, NewValueRel {
+let Inst{10-3} = 0b00001111;
+let Inst{31-21} = 0b00101011111;
+let isPredicated = 1;
+let isPredicatedFalse = 1;
+let addrMode = PostInc;
+let accessSize = Vector64Access;
+let isNVStore = 1;
+let mayStore = 1;
+let isNonTemporal = 1;
+let isNewValue = 1;
+let BaseOpcode = "V6_vS32b_ppu";
+let DecoderNamespace = "EXT_mmvec";
+let opNewValue = 4;
+let Constraints = "$Rx32 = $Rx32in";
+}
+def V6_vS32b_nt_new_npred_ppu_128B : HInst<
+(outs IntRegs:$Rx32),
+(ins PredRegs:$Pv4, IntRegs:$Rx32in, ModRegs:$Mu2, VectorRegs128B:$Os8),
+"if (!$Pv4) vmem($Rx32++$Mu2):nt = $Os8.new",
+CVI_VM_NEW_ST, TypeCVI_VM_NEW_ST>, Enc_8498433, Requires<[HasV60T,UseHVX]>, NewValueRel {
+let Inst{10-3} = 0b00001111;
+let Inst{31-21} = 0b00101011111;
+let isPredicated = 1;
+let isPredicatedFalse = 1;
+let addrMode = PostInc;
+let accessSize = Vector128Access;
+let isNVStore = 1;
+let mayStore = 1;
+let isNonTemporal = 1;
+let isNewValue = 1;
+let BaseOpcode = "V6_vS32b_ppu_128B";
+let DecoderNamespace = "EXT_mmvec";
+let isCodeGenOnly = 1;
+let opNewValue = 4;
+let Constraints = "$Rx32 = $Rx32in";
+}
+def V6_vS32b_nt_new_pi : HInst<
+(outs IntRegs:$Rx32),
+(ins IntRegs:$Rx32in, s3_0Imm:$Ii, VectorRegs:$Os8),
+"vmem($Rx32++#$Ii):nt = $Os8.new",
+CVI_VM_NEW_ST, TypeCVI_VM_NEW_ST>, Enc_12244921, Requires<[HasV60T,UseHVX]>, NewValueRel {
+let Inst{7-3} = 0b00100;
+let Inst{13-11} = 0b000;
+let Inst{31-21} = 0b00101001011;
+let addrMode = PostInc;
+let accessSize = Vector64Access;
+let isNVStore = 1;
+let mayStore = 1;
+let isNonTemporal = 1;
+let isNewValue = 1;
+let BaseOpcode = "V6_vS32b_pi";
+let isPredicable = 1;
+let DecoderNamespace = "EXT_mmvec";
+let opNewValue = 3;
+let Constraints = "$Rx32 = $Rx32in";
+}
+def V6_vS32b_nt_new_pi_128B : HInst<
+(outs IntRegs:$Rx32),
+(ins IntRegs:$Rx32in, s3_0Imm:$Ii, VectorRegs128B:$Os8),
+"vmem($Rx32++#$Ii):nt = $Os8.new",
+CVI_VM_NEW_ST, TypeCVI_VM_NEW_ST>, Enc_11244923, Requires<[HasV60T,UseHVX]>, NewValueRel {
+let Inst{7-3} = 0b00100;
+let Inst{13-11} = 0b000;
+let Inst{31-21} = 0b00101001011;
+let addrMode = PostInc;
+let accessSize = Vector128Access;
+let isNVStore = 1;
+let mayStore = 1;
+let isNonTemporal = 1;
+let isNewValue = 1;
+let BaseOpcode = "V6_vS32b_pi_128B";
+let isPredicable = 1;
+let DecoderNamespace = "EXT_mmvec";
+let isCodeGenOnly = 1;
+let opNewValue = 3;
+let Constraints = "$Rx32 = $Rx32in";
+}
+def V6_vS32b_nt_new_ppu : HInst<
+(outs IntRegs:$Rx32),
+(ins IntRegs:$Rx32in, ModRegs:$Mu2, VectorRegs:$Os8),
+"vmem($Rx32++$Mu2):nt = $Os8.new",
+CVI_VM_NEW_ST, TypeCVI_VM_NEW_ST>, Enc_1589406, Requires<[HasV60T,UseHVX]>, NewValueRel {
+let Inst{12-3} = 0b0000000100;
+let Inst{31-21} = 0b00101011011;
+let addrMode = PostInc;
+let accessSize = Vector64Access;
+let isNVStore = 1;
+let mayStore = 1;
+let isNonTemporal = 1;
+let isNewValue = 1;
+let BaseOpcode = "V6_vS32b_ppu";
+let isPredicable = 1;
+let DecoderNamespace = "EXT_mmvec";
+let opNewValue = 3;
+let Constraints = "$Rx32 = $Rx32in";
+}
+def V6_vS32b_nt_new_ppu_128B : HInst<
+(outs IntRegs:$Rx32),
+(ins IntRegs:$Rx32in, ModRegs:$Mu2, VectorRegs128B:$Os8),
+"vmem($Rx32++$Mu2):nt = $Os8.new",
+CVI_VM_NEW_ST, TypeCVI_VM_NEW_ST>, Enc_1589406, Requires<[HasV60T,UseHVX]>, NewValueRel {
+let Inst{12-3} = 0b0000000100;
+let Inst{31-21} = 0b00101011011;
+let addrMode = PostInc;
+let accessSize = Vector128Access;
+let isNVStore = 1;
+let mayStore = 1;
+let isNonTemporal = 1;
+let isNewValue = 1;
+let BaseOpcode = "V6_vS32b_ppu_128B";
+let isPredicable = 1;
+let DecoderNamespace = "EXT_mmvec";
+let isCodeGenOnly = 1;
+let opNewValue = 3;
+let Constraints = "$Rx32 = $Rx32in";
+}
+def V6_vS32b_nt_new_pred_ai : HInst<
+(outs),
+(ins PredRegs:$Pv4, IntRegs:$Rt32, s4_0Imm:$Ii, VectorRegs:$Os8),
+"if ($Pv4) vmem($Rt32+#$Ii):nt = $Os8.new",
+CVI_VM_NEW_ST, TypeCVI_VM_NEW_ST>, Enc_9372046, Requires<[HasV60T,UseHVX]>, NewValueRel {
+let Inst{7-3} = 0b01010;
+let Inst{31-21} = 0b00101000111;
+let isPredicated = 1;
+let addrMode = BaseImmOffset;
+let accessSize = Vector64Access;
+let isNVStore = 1;
+let mayStore = 1;
+let isNonTemporal = 1;
+let isNewValue = 1;
+let BaseOpcode = "V6_vS32b_ai";
+let DecoderNamespace = "EXT_mmvec";
+let opNewValue = 3;
+}
+def V6_vS32b_nt_new_pred_ai_128B : HInst<
+(outs),
+(ins PredRegs:$Pv4, IntRegs:$Rt32, s4_0Imm:$Ii, VectorRegs128B:$Os8),
+"if ($Pv4) vmem($Rt32+#$Ii):nt = $Os8.new",
+CVI_VM_NEW_ST, TypeCVI_VM_NEW_ST>, Enc_13937564, Requires<[HasV60T,UseHVX]>, NewValueRel {
+let Inst{7-3} = 0b01010;
+let Inst{31-21} = 0b00101000111;
+let isPredicated = 1;
+let addrMode = BaseImmOffset;
+let accessSize = Vector128Access;
+let isNVStore = 1;
+let mayStore = 1;
+let isNonTemporal = 1;
+let isNewValue = 1;
+let BaseOpcode = "V6_vS32b_ai_128B";
+let DecoderNamespace = "EXT_mmvec";
+let isCodeGenOnly = 1;
+let opNewValue = 3;
+}
+def V6_vS32b_nt_new_pred_pi : HInst<
+(outs IntRegs:$Rx32),
+(ins PredRegs:$Pv4, IntRegs:$Rx32in, s3_0Imm:$Ii, VectorRegs:$Os8),
+"if ($Pv4) vmem($Rx32++#$Ii):nt = $Os8.new",
+CVI_VM_NEW_ST, TypeCVI_VM_NEW_ST>, Enc_3735566, Requires<[HasV60T,UseHVX]>, NewValueRel {
+let Inst{7-3} = 0b01010;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b00101001111;
+let isPredicated = 1;
+let addrMode = PostInc;
+let accessSize = Vector64Access;
+let isNVStore = 1;
+let mayStore = 1;
+let isNonTemporal = 1;
+let isNewValue = 1;
+let BaseOpcode = "V6_vS32b_pi";
+let DecoderNamespace = "EXT_mmvec";
+let opNewValue = 4;
+let Constraints = "$Rx32 = $Rx32in";
+}
+def V6_vS32b_nt_new_pred_pi_128B : HInst<
+(outs IntRegs:$Rx32),
+(ins PredRegs:$Pv4, IntRegs:$Rx32in, s3_0Imm:$Ii, VectorRegs128B:$Os8),
+"if ($Pv4) vmem($Rx32++#$Ii):nt = $Os8.new",
+CVI_VM_NEW_ST, TypeCVI_VM_NEW_ST>, Enc_2735552, Requires<[HasV60T,UseHVX]>, NewValueRel {
+let Inst{7-3} = 0b01010;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b00101001111;
+let isPredicated = 1;
+let addrMode = PostInc;
+let accessSize = Vector128Access;
+let isNVStore = 1;
+let mayStore = 1;
+let isNonTemporal = 1;
+let isNewValue = 1;
+let BaseOpcode = "V6_vS32b_pi_128B";
+let DecoderNamespace = "EXT_mmvec";
+let isCodeGenOnly = 1;
+let opNewValue = 4;
+let Constraints = "$Rx32 = $Rx32in";
+}
+def V6_vS32b_nt_new_pred_ppu : HInst<
+(outs IntRegs:$Rx32),
+(ins PredRegs:$Pv4, IntRegs:$Rx32in, ModRegs:$Mu2, VectorRegs:$Os8),
+"if ($Pv4) vmem($Rx32++$Mu2):nt = $Os8.new",
+CVI_VM_NEW_ST, TypeCVI_VM_NEW_ST>, Enc_8498433, Requires<[HasV60T,UseHVX]>, NewValueRel {
+let Inst{10-3} = 0b00001010;
+let Inst{31-21} = 0b00101011111;
+let isPredicated = 1;
+let addrMode = PostInc;
+let accessSize = Vector64Access;
+let isNVStore = 1;
+let mayStore = 1;
+let isNonTemporal = 1;
+let isNewValue = 1;
+let BaseOpcode = "V6_vS32b_ppu";
+let DecoderNamespace = "EXT_mmvec";
+let opNewValue = 4;
+let Constraints = "$Rx32 = $Rx32in";
+}
+def V6_vS32b_nt_new_pred_ppu_128B : HInst<
+(outs IntRegs:$Rx32),
+(ins PredRegs:$Pv4, IntRegs:$Rx32in, ModRegs:$Mu2, VectorRegs128B:$Os8),
+"if ($Pv4) vmem($Rx32++$Mu2):nt = $Os8.new",
+CVI_VM_NEW_ST, TypeCVI_VM_NEW_ST>, Enc_8498433, Requires<[HasV60T,UseHVX]>, NewValueRel {
+let Inst{10-3} = 0b00001010;
+let Inst{31-21} = 0b00101011111;
+let isPredicated = 1;
+let addrMode = PostInc;
+let accessSize = Vector128Access;
+let isNVStore = 1;
+let mayStore = 1;
+let isNonTemporal = 1;
+let isNewValue = 1;
+let BaseOpcode = "V6_vS32b_ppu_128B";
+let DecoderNamespace = "EXT_mmvec";
+let isCodeGenOnly = 1;
+let opNewValue = 4;
+let Constraints = "$Rx32 = $Rx32in";
+}
+def V6_vS32b_nt_npred_ai : HInst<
+(outs),
+(ins PredRegs:$Pv4, IntRegs:$Rt32, s4_0Imm:$Ii, VectorRegs:$Vs32),
+"if (!$Pv4) vmem($Rt32+#$Ii):nt = $Vs32",
+CVI_VM_ST, TypeCVI_VM_ST>, Enc_10075393, Requires<[HasV60T,UseHVX]>, NewValueRel {
+let Inst{7-5} = 0b001;
+let Inst{31-21} = 0b00101000111;
+let isPredicated = 1;
+let isPredicatedFalse = 1;
+let addrMode = BaseImmOffset;
+let accessSize = Vector64Access;
+let mayStore = 1;
+let isNonTemporal = 1;
+let BaseOpcode = "V6_vS32b_ai";
+let isNVStorable = 1;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vS32b_nt_npred_ai_128B : HInst<
+(outs),
+(ins PredRegs:$Pv4, IntRegs:$Rt32, s4_0Imm:$Ii, VectorRegs128B:$Vs32),
+"if (!$Pv4) vmem($Rt32+#$Ii):nt = $Vs32",
+CVI_VM_ST, TypeCVI_VM_ST>, Enc_9470751, Requires<[HasV60T,UseHVX]>, NewValueRel {
+let Inst{7-5} = 0b001;
+let Inst{31-21} = 0b00101000111;
+let isPredicated = 1;
+let isPredicatedFalse = 1;
+let addrMode = BaseImmOffset;
+let accessSize = Vector128Access;
+let mayStore = 1;
+let isNonTemporal = 1;
+let BaseOpcode = "V6_vS32b_ai_128B";
+let isNVStorable = 1;
+let DecoderNamespace = "EXT_mmvec";
+let isCodeGenOnly = 1;
+}
+def V6_vS32b_nt_npred_pi : HInst<
+(outs IntRegs:$Rx32),
+(ins PredRegs:$Pv4, IntRegs:$Rx32in, s3_0Imm:$Ii, VectorRegs:$Vs32),
+"if (!$Pv4) vmem($Rx32++#$Ii):nt = $Vs32",
+CVI_VM_ST, TypeCVI_VM_ST>, Enc_15459921, Requires<[HasV60T,UseHVX]>, NewValueRel {
+let Inst{7-5} = 0b001;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b00101001111;
+let isPredicated = 1;
+let isPredicatedFalse = 1;
+let addrMode = PostInc;
+let accessSize = Vector64Access;
+let mayStore = 1;
+let isNonTemporal = 1;
+let BaseOpcode = "V6_vS32b_pi";
+let isNVStorable = 1;
+let DecoderNamespace = "EXT_mmvec";
+let Constraints = "$Rx32 = $Rx32in";
+}
+def V6_vS32b_nt_npred_pi_128B : HInst<
+(outs IntRegs:$Rx32),
+(ins PredRegs:$Pv4, IntRegs:$Rx32in, s3_0Imm:$Ii, VectorRegs128B:$Vs32),
+"if (!$Pv4) vmem($Rx32++#$Ii):nt = $Vs32",
+CVI_VM_ST, TypeCVI_VM_ST>, Enc_14459927, Requires<[HasV60T,UseHVX]>, NewValueRel {
+let Inst{7-5} = 0b001;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b00101001111;
+let isPredicated = 1;
+let isPredicatedFalse = 1;
+let addrMode = PostInc;
+let accessSize = Vector128Access;
+let mayStore = 1;
+let isNonTemporal = 1;
+let BaseOpcode = "V6_vS32b_pi_128B";
+let isNVStorable = 1;
+let DecoderNamespace = "EXT_mmvec";
+let isCodeGenOnly = 1;
+let Constraints = "$Rx32 = $Rx32in";
+}
+def V6_vS32b_nt_npred_ppu : HInst<
+(outs IntRegs:$Rx32),
+(ins PredRegs:$Pv4, IntRegs:$Rx32in, ModRegs:$Mu2, VectorRegs:$Vs32),
+"if (!$Pv4) vmem($Rx32++$Mu2):nt = $Vs32",
+CVI_VM_ST, TypeCVI_VM_ST>, Enc_15733946, Requires<[HasV60T,UseHVX]>, NewValueRel {
+let Inst{10-5} = 0b000001;
+let Inst{31-21} = 0b00101011111;
+let isPredicated = 1;
+let isPredicatedFalse = 1;
+let addrMode = PostInc;
+let accessSize = Vector64Access;
+let mayStore = 1;
+let isNonTemporal = 1;
+let BaseOpcode = "V6_vS32b_ppu";
+let isNVStorable = 1;
+let DecoderNamespace = "EXT_mmvec";
+let Constraints = "$Rx32 = $Rx32in";
+}
+def V6_vS32b_nt_npred_ppu_128B : HInst<
+(outs IntRegs:$Rx32),
+(ins PredRegs:$Pv4, IntRegs:$Rx32in, ModRegs:$Mu2, VectorRegs128B:$Vs32),
+"if (!$Pv4) vmem($Rx32++$Mu2):nt = $Vs32",
+CVI_VM_ST, TypeCVI_VM_ST>, Enc_15733946, Requires<[HasV60T,UseHVX]>, NewValueRel {
+let Inst{10-5} = 0b000001;
+let Inst{31-21} = 0b00101011111;
+let isPredicated = 1;
+let isPredicatedFalse = 1;
+let addrMode = PostInc;
+let accessSize = Vector128Access;
+let mayStore = 1;
+let isNonTemporal = 1;
+let BaseOpcode = "V6_vS32b_ppu_128B";
+let isNVStorable = 1;
+let DecoderNamespace = "EXT_mmvec";
+let isCodeGenOnly = 1;
+let Constraints = "$Rx32 = $Rx32in";
+}
+def V6_vS32b_nt_nqpred_ai : HInst<
+(outs),
+(ins VecPredRegs:$Qv4, IntRegs:$Rt32, s4_0Imm:$Ii, VectorRegs:$Vs32),
+"if (!$Qv4) vmem($Rt32+#$Ii):nt = $Vs32",
+CVI_VM_ST, TypeCVI_VM_ST>, Enc_16279406, Requires<[HasV60T,UseHVX]> {
+let Inst{7-5} = 0b001;
+let Inst{31-21} = 0b00101000110;
+let addrMode = BaseImmOffset;
+let accessSize = Vector64Access;
+let mayStore = 1;
+let isNonTemporal = 1;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vS32b_nt_nqpred_ai_128B : HInst<
+(outs),
+(ins VecPredRegs128B:$Qv4, IntRegs:$Rt32, s4_0Imm:$Ii, VectorRegs128B:$Vs32),
+"if (!$Qv4) vmem($Rt32+#$Ii):nt = $Vs32",
+CVI_VM_ST, TypeCVI_VM_ST>, Enc_2703240, Requires<[HasV60T,UseHVX]> {
+let Inst{7-5} = 0b001;
+let Inst{31-21} = 0b00101000110;
+let addrMode = BaseImmOffset;
+let accessSize = Vector128Access;
+let mayStore = 1;
+let isNonTemporal = 1;
+let DecoderNamespace = "EXT_mmvec";
+let isCodeGenOnly = 1;
+}
+def V6_vS32b_nt_nqpred_pi : HInst<
+(outs IntRegs:$Rx32),
+(ins VecPredRegs:$Qv4, IntRegs:$Rx32in, s3_0Imm:$Ii, VectorRegs:$Vs32),
+"if (!$Qv4) vmem($Rx32++#$Ii):nt = $Vs32",
+CVI_VM_ST, TypeCVI_VM_ST>, Enc_12397062, Requires<[HasV60T,UseHVX]> {
+let Inst{7-5} = 0b001;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b00101001110;
+let addrMode = PostInc;
+let accessSize = Vector64Access;
+let mayStore = 1;
+let isNonTemporal = 1;
+let DecoderNamespace = "EXT_mmvec";
+let Constraints = "$Rx32 = $Rx32in";
+}
+def V6_vS32b_nt_nqpred_pi_128B : HInst<
+(outs IntRegs:$Rx32),
+(ins VecPredRegs128B:$Qv4, IntRegs:$Rx32in, s3_0Imm:$Ii, VectorRegs128B:$Vs32),
+"if (!$Qv4) vmem($Rx32++#$Ii):nt = $Vs32",
+CVI_VM_ST, TypeCVI_VM_ST>, Enc_13397056, Requires<[HasV60T,UseHVX]> {
+let Inst{7-5} = 0b001;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b00101001110;
+let addrMode = PostInc;
+let accessSize = Vector128Access;
+let mayStore = 1;
+let isNonTemporal = 1;
+let DecoderNamespace = "EXT_mmvec";
+let isCodeGenOnly = 1;
+let Constraints = "$Rx32 = $Rx32in";
+}
+def V6_vS32b_nt_nqpred_ppu : HInst<
+(outs IntRegs:$Rx32),
+(ins VecPredRegs:$Qv4, IntRegs:$Rx32in, ModRegs:$Mu2, VectorRegs:$Vs32),
+"if (!$Qv4) vmem($Rx32++$Mu2):nt = $Vs32",
+CVI_VM_ST, TypeCVI_VM_ST>, Enc_13425035, Requires<[HasV60T,UseHVX]> {
+let Inst{10-5} = 0b000001;
+let Inst{31-21} = 0b00101011110;
+let addrMode = PostInc;
+let accessSize = Vector64Access;
+let mayStore = 1;
+let isNonTemporal = 1;
+let DecoderNamespace = "EXT_mmvec";
+let Constraints = "$Rx32 = $Rx32in";
+}
+def V6_vS32b_nt_nqpred_ppu_128B : HInst<
+(outs IntRegs:$Rx32),
+(ins VecPredRegs128B:$Qv4, IntRegs:$Rx32in, ModRegs:$Mu2, VectorRegs128B:$Vs32),
+"if (!$Qv4) vmem($Rx32++$Mu2):nt = $Vs32",
+CVI_VM_ST, TypeCVI_VM_ST>, Enc_13425035, Requires<[HasV60T,UseHVX]> {
+let Inst{10-5} = 0b000001;
+let Inst{31-21} = 0b00101011110;
+let addrMode = PostInc;
+let accessSize = Vector128Access;
+let mayStore = 1;
+let isNonTemporal = 1;
+let DecoderNamespace = "EXT_mmvec";
+let isCodeGenOnly = 1;
+let Constraints = "$Rx32 = $Rx32in";
+}
+def V6_vS32b_nt_pi : HInst<
+(outs IntRegs:$Rx32),
+(ins IntRegs:$Rx32in, s3_0Imm:$Ii, VectorRegs:$Vs32),
+"vmem($Rx32++#$Ii):nt = $Vs32",
+CVI_VM_ST, TypeCVI_VM_ST>, Enc_3296020, Requires<[HasV60T,UseHVX]>, NewValueRel {
+let Inst{7-5} = 0b000;
+let Inst{13-11} = 0b000;
+let Inst{31-21} = 0b00101001011;
+let addrMode = PostInc;
+let accessSize = Vector64Access;
+let mayStore = 1;
+let isNonTemporal = 1;
+let BaseOpcode = "V6_vS32b_pi";
+let isNVStorable = 1;
+let isPredicable = 1;
+let DecoderNamespace = "EXT_mmvec";
+let Constraints = "$Rx32 = $Rx32in";
+}
+def V6_vS32b_nt_pi_128B : HInst<
+(outs IntRegs:$Rx32),
+(ins IntRegs:$Rx32in, s3_0Imm:$Ii, VectorRegs128B:$Vs32),
+"vmem($Rx32++#$Ii):nt = $Vs32",
+CVI_VM_ST, TypeCVI_VM_ST>, Enc_2296022, Requires<[HasV60T,UseHVX]>, NewValueRel {
+let Inst{7-5} = 0b000;
+let Inst{13-11} = 0b000;
+let Inst{31-21} = 0b00101001011;
+let addrMode = PostInc;
+let accessSize = Vector128Access;
+let mayStore = 1;
+let isNonTemporal = 1;
+let BaseOpcode = "V6_vS32b_pi_128B";
+let isNVStorable = 1;
+let isPredicable = 1;
+let DecoderNamespace = "EXT_mmvec";
+let isCodeGenOnly = 1;
+let Constraints = "$Rx32 = $Rx32in";
+}
+def V6_vS32b_nt_ppu : HInst<
+(outs IntRegs:$Rx32),
+(ins IntRegs:$Rx32in, ModRegs:$Mu2, VectorRegs:$Vs32),
+"vmem($Rx32++$Mu2):nt = $Vs32",
+CVI_VM_ST, TypeCVI_VM_ST>, Enc_11281763, Requires<[HasV60T,UseHVX]>, NewValueRel {
+let Inst{12-5} = 0b00000000;
+let Inst{31-21} = 0b00101011011;
+let addrMode = PostInc;
+let accessSize = Vector64Access;
+let mayStore = 1;
+let isNonTemporal = 1;
+let BaseOpcode = "V6_vS32b_ppu";
+let isNVStorable = 1;
+let isPredicable = 1;
+let DecoderNamespace = "EXT_mmvec";
+let Constraints = "$Rx32 = $Rx32in";
+}
+def V6_vS32b_nt_ppu_128B : HInst<
+(outs IntRegs:$Rx32),
+(ins IntRegs:$Rx32in, ModRegs:$Mu2, VectorRegs128B:$Vs32),
+"vmem($Rx32++$Mu2):nt = $Vs32",
+CVI_VM_ST, TypeCVI_VM_ST>, Enc_11281763, Requires<[HasV60T,UseHVX]>, NewValueRel {
+let Inst{12-5} = 0b00000000;
+let Inst{31-21} = 0b00101011011;
+let addrMode = PostInc;
+let accessSize = Vector128Access;
+let mayStore = 1;
+let isNonTemporal = 1;
+let BaseOpcode = "V6_vS32b_ppu_128B";
+let isNVStorable = 1;
+let isPredicable = 1;
+let DecoderNamespace = "EXT_mmvec";
+let isCodeGenOnly = 1;
+let Constraints = "$Rx32 = $Rx32in";
+}
+def V6_vS32b_nt_pred_ai : HInst<
+(outs),
+(ins PredRegs:$Pv4, IntRegs:$Rt32, s4_0Imm:$Ii, VectorRegs:$Vs32),
+"if ($Pv4) vmem($Rt32+#$Ii):nt = $Vs32",
+CVI_VM_ST, TypeCVI_VM_ST>, Enc_10075393, Requires<[HasV60T,UseHVX]>, NewValueRel {
+let Inst{7-5} = 0b000;
+let Inst{31-21} = 0b00101000111;
+let isPredicated = 1;
+let addrMode = BaseImmOffset;
+let accessSize = Vector64Access;
+let mayStore = 1;
+let isNonTemporal = 1;
+let BaseOpcode = "V6_vS32b_ai";
+let isNVStorable = 1;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vS32b_nt_pred_ai_128B : HInst<
+(outs),
+(ins PredRegs:$Pv4, IntRegs:$Rt32, s4_0Imm:$Ii, VectorRegs128B:$Vs32),
+"if ($Pv4) vmem($Rt32+#$Ii):nt = $Vs32",
+CVI_VM_ST, TypeCVI_VM_ST>, Enc_9470751, Requires<[HasV60T,UseHVX]>, NewValueRel {
+let Inst{7-5} = 0b000;
+let Inst{31-21} = 0b00101000111;
+let isPredicated = 1;
+let addrMode = BaseImmOffset;
+let accessSize = Vector128Access;
+let mayStore = 1;
+let isNonTemporal = 1;
+let BaseOpcode = "V6_vS32b_ai_128B";
+let isNVStorable = 1;
+let DecoderNamespace = "EXT_mmvec";
+let isCodeGenOnly = 1;
+}
+def V6_vS32b_nt_pred_pi : HInst<
+(outs IntRegs:$Rx32),
+(ins PredRegs:$Pv4, IntRegs:$Rx32in, s3_0Imm:$Ii, VectorRegs:$Vs32),
+"if ($Pv4) vmem($Rx32++#$Ii):nt = $Vs32",
+CVI_VM_ST, TypeCVI_VM_ST>, Enc_15459921, Requires<[HasV60T,UseHVX]>, NewValueRel {
+let Inst{7-5} = 0b000;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b00101001111;
+let isPredicated = 1;
+let addrMode = PostInc;
+let accessSize = Vector64Access;
+let mayStore = 1;
+let isNonTemporal = 1;
+let BaseOpcode = "V6_vS32b_pi";
+let isNVStorable = 1;
+let DecoderNamespace = "EXT_mmvec";
+let Constraints = "$Rx32 = $Rx32in";
+}
+def V6_vS32b_nt_pred_pi_128B : HInst<
+(outs IntRegs:$Rx32),
+(ins PredRegs:$Pv4, IntRegs:$Rx32in, s3_0Imm:$Ii, VectorRegs128B:$Vs32),
+"if ($Pv4) vmem($Rx32++#$Ii):nt = $Vs32",
+CVI_VM_ST, TypeCVI_VM_ST>, Enc_14459927, Requires<[HasV60T,UseHVX]>, NewValueRel {
+let Inst{7-5} = 0b000;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b00101001111;
+let isPredicated = 1;
+let addrMode = PostInc;
+let accessSize = Vector128Access;
+let mayStore = 1;
+let isNonTemporal = 1;
+let BaseOpcode = "V6_vS32b_pi_128B";
+let isNVStorable = 1;
+let DecoderNamespace = "EXT_mmvec";
+let isCodeGenOnly = 1;
+let Constraints = "$Rx32 = $Rx32in";
+}
+def V6_vS32b_nt_pred_ppu : HInst<
+(outs IntRegs:$Rx32),
+(ins PredRegs:$Pv4, IntRegs:$Rx32in, ModRegs:$Mu2, VectorRegs:$Vs32),
+"if ($Pv4) vmem($Rx32++$Mu2):nt = $Vs32",
+CVI_VM_ST, TypeCVI_VM_ST>, Enc_15733946, Requires<[HasV60T,UseHVX]>, NewValueRel {
+let Inst{10-5} = 0b000000;
+let Inst{31-21} = 0b00101011111;
+let isPredicated = 1;
+let addrMode = PostInc;
+let accessSize = Vector64Access;
+let mayStore = 1;
+let isNonTemporal = 1;
+let BaseOpcode = "V6_vS32b_ppu";
+let isNVStorable = 1;
+let DecoderNamespace = "EXT_mmvec";
+let Constraints = "$Rx32 = $Rx32in";
+}
+def V6_vS32b_nt_pred_ppu_128B : HInst<
+(outs IntRegs:$Rx32),
+(ins PredRegs:$Pv4, IntRegs:$Rx32in, ModRegs:$Mu2, VectorRegs128B:$Vs32),
+"if ($Pv4) vmem($Rx32++$Mu2):nt = $Vs32",
+CVI_VM_ST, TypeCVI_VM_ST>, Enc_15733946, Requires<[HasV60T,UseHVX]>, NewValueRel {
+let Inst{10-5} = 0b000000;
+let Inst{31-21} = 0b00101011111;
+let isPredicated = 1;
+let addrMode = PostInc;
+let accessSize = Vector128Access;
+let mayStore = 1;
+let isNonTemporal = 1;
+let BaseOpcode = "V6_vS32b_ppu_128B";
+let isNVStorable = 1;
+let DecoderNamespace = "EXT_mmvec";
+let isCodeGenOnly = 1;
+let Constraints = "$Rx32 = $Rx32in";
+}
+def V6_vS32b_nt_qpred_ai : HInst<
+(outs),
+(ins VecPredRegs:$Qv4, IntRegs:$Rt32, s4_0Imm:$Ii, VectorRegs:$Vs32),
+"if ($Qv4) vmem($Rt32+#$Ii):nt = $Vs32",
+CVI_VM_ST, TypeCVI_VM_ST>, Enc_16279406, Requires<[HasV60T,UseHVX]> {
+let Inst{7-5} = 0b000;
+let Inst{31-21} = 0b00101000110;
+let addrMode = BaseImmOffset;
+let accessSize = Vector64Access;
+let mayStore = 1;
+let isNonTemporal = 1;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vS32b_nt_qpred_ai_128B : HInst<
+(outs),
+(ins VecPredRegs128B:$Qv4, IntRegs:$Rt32, s4_0Imm:$Ii, VectorRegs128B:$Vs32),
+"if ($Qv4) vmem($Rt32+#$Ii):nt = $Vs32",
+CVI_VM_ST, TypeCVI_VM_ST>, Enc_2703240, Requires<[HasV60T,UseHVX]> {
+let Inst{7-5} = 0b000;
+let Inst{31-21} = 0b00101000110;
+let addrMode = BaseImmOffset;
+let accessSize = Vector128Access;
+let mayStore = 1;
+let isNonTemporal = 1;
+let DecoderNamespace = "EXT_mmvec";
+let isCodeGenOnly = 1;
+}
+def V6_vS32b_nt_qpred_pi : HInst<
+(outs IntRegs:$Rx32),
+(ins VecPredRegs:$Qv4, IntRegs:$Rx32in, s3_0Imm:$Ii, VectorRegs:$Vs32),
+"if ($Qv4) vmem($Rx32++#$Ii):nt = $Vs32",
+CVI_VM_ST, TypeCVI_VM_ST>, Enc_12397062, Requires<[HasV60T,UseHVX]> {
+let Inst{7-5} = 0b000;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b00101001110;
+let addrMode = PostInc;
+let accessSize = Vector64Access;
+let mayStore = 1;
+let isNonTemporal = 1;
+let DecoderNamespace = "EXT_mmvec";
+let Constraints = "$Rx32 = $Rx32in";
+}
+def V6_vS32b_nt_qpred_pi_128B : HInst<
+(outs IntRegs:$Rx32),
+(ins VecPredRegs128B:$Qv4, IntRegs:$Rx32in, s3_0Imm:$Ii, VectorRegs128B:$Vs32),
+"if ($Qv4) vmem($Rx32++#$Ii):nt = $Vs32",
+CVI_VM_ST, TypeCVI_VM_ST>, Enc_13397056, Requires<[HasV60T,UseHVX]> {
+let Inst{7-5} = 0b000;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b00101001110;
+let addrMode = PostInc;
+let accessSize = Vector128Access;
+let mayStore = 1;
+let isNonTemporal = 1;
+let DecoderNamespace = "EXT_mmvec";
+let isCodeGenOnly = 1;
+let Constraints = "$Rx32 = $Rx32in";
+}
+def V6_vS32b_nt_qpred_ppu : HInst<
+(outs IntRegs:$Rx32),
+(ins VecPredRegs:$Qv4, IntRegs:$Rx32in, ModRegs:$Mu2, VectorRegs:$Vs32),
+"if ($Qv4) vmem($Rx32++$Mu2):nt = $Vs32",
+CVI_VM_ST, TypeCVI_VM_ST>, Enc_13425035, Requires<[HasV60T,UseHVX]> {
+let Inst{10-5} = 0b000000;
+let Inst{31-21} = 0b00101011110;
+let addrMode = PostInc;
+let accessSize = Vector64Access;
+let mayStore = 1;
+let isNonTemporal = 1;
+let DecoderNamespace = "EXT_mmvec";
+let Constraints = "$Rx32 = $Rx32in";
+}
+def V6_vS32b_nt_qpred_ppu_128B : HInst<
+(outs IntRegs:$Rx32),
+(ins VecPredRegs128B:$Qv4, IntRegs:$Rx32in, ModRegs:$Mu2, VectorRegs128B:$Vs32),
+"if ($Qv4) vmem($Rx32++$Mu2):nt = $Vs32",
+CVI_VM_ST, TypeCVI_VM_ST>, Enc_13425035, Requires<[HasV60T,UseHVX]> {
+let Inst{10-5} = 0b000000;
+let Inst{31-21} = 0b00101011110;
+let addrMode = PostInc;
+let accessSize = Vector128Access;
+let mayStore = 1;
+let isNonTemporal = 1;
+let DecoderNamespace = "EXT_mmvec";
+let isCodeGenOnly = 1;
+let Constraints = "$Rx32 = $Rx32in";
+}
+def V6_vS32b_pi : HInst<
+(outs IntRegs:$Rx32),
+(ins IntRegs:$Rx32in, s3_0Imm:$Ii, VectorRegs:$Vs32),
+"vmem($Rx32++#$Ii) = $Vs32",
+CVI_VM_ST, TypeCVI_VM_ST>, Enc_3296020, Requires<[HasV60T,UseHVX]>, NewValueRel {
+let Inst{7-5} = 0b000;
+let Inst{13-11} = 0b000;
+let Inst{31-21} = 0b00101001001;
+let addrMode = PostInc;
+let accessSize = Vector64Access;
+let mayStore = 1;
+let BaseOpcode = "V6_vS32b_pi";
+let isNVStorable = 1;
+let isPredicable = 1;
+let DecoderNamespace = "EXT_mmvec";
+let Constraints = "$Rx32 = $Rx32in";
+}
+def V6_vS32b_pi_128B : HInst<
+(outs IntRegs:$Rx32),
+(ins IntRegs:$Rx32in, s3_0Imm:$Ii, VectorRegs128B:$Vs32),
+"vmem($Rx32++#$Ii) = $Vs32",
+CVI_VM_ST, TypeCVI_VM_ST>, Enc_2296022, Requires<[HasV60T,UseHVX]>, NewValueRel {
+let Inst{7-5} = 0b000;
+let Inst{13-11} = 0b000;
+let Inst{31-21} = 0b00101001001;
+let addrMode = PostInc;
+let accessSize = Vector128Access;
+let mayStore = 1;
+let BaseOpcode = "V6_vS32b_pi_128B";
+let isNVStorable = 1;
+let isPredicable = 1;
+let DecoderNamespace = "EXT_mmvec";
+let isCodeGenOnly = 1;
+let Constraints = "$Rx32 = $Rx32in";
+}
+def V6_vS32b_ppu : HInst<
+(outs IntRegs:$Rx32),
+(ins IntRegs:$Rx32in, ModRegs:$Mu2, VectorRegs:$Vs32),
+"vmem($Rx32++$Mu2) = $Vs32",
+CVI_VM_ST, TypeCVI_VM_ST>, Enc_11281763, Requires<[HasV60T,UseHVX]>, NewValueRel {
+let Inst{12-5} = 0b00000000;
+let Inst{31-21} = 0b00101011001;
+let addrMode = PostInc;
+let accessSize = Vector64Access;
+let mayStore = 1;
+let isNVStorable = 1;
+let isPredicable = 1;
+let DecoderNamespace = "EXT_mmvec";
+let Constraints = "$Rx32 = $Rx32in";
+}
+def V6_vS32b_ppu_128B : HInst<
+(outs IntRegs:$Rx32),
+(ins IntRegs:$Rx32in, ModRegs:$Mu2, VectorRegs128B:$Vs32),
+"vmem($Rx32++$Mu2) = $Vs32",
+CVI_VM_ST, TypeCVI_VM_ST>, Enc_11281763, Requires<[HasV60T,UseHVX]>, NewValueRel {
+let Inst{12-5} = 0b00000000;
+let Inst{31-21} = 0b00101011001;
+let addrMode = PostInc;
+let accessSize = Vector128Access;
+let mayStore = 1;
+let isNVStorable = 1;
+let isPredicable = 1;
+let DecoderNamespace = "EXT_mmvec";
+let isCodeGenOnly = 1;
+let Constraints = "$Rx32 = $Rx32in";
+}
+def V6_vS32b_pred_ai : HInst<
+(outs),
+(ins PredRegs:$Pv4, IntRegs:$Rt32, s4_0Imm:$Ii, VectorRegs:$Vs32),
+"if ($Pv4) vmem($Rt32+#$Ii) = $Vs32",
+CVI_VM_ST, TypeCVI_VM_ST>, Enc_10075393, Requires<[HasV60T,UseHVX]>, NewValueRel {
+let Inst{7-5} = 0b000;
+let Inst{31-21} = 0b00101000101;
+let isPredicated = 1;
+let addrMode = BaseImmOffset;
+let accessSize = Vector64Access;
+let mayStore = 1;
+let BaseOpcode = "V6_vS32b_ai";
+let isNVStorable = 1;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vS32b_pred_ai_128B : HInst<
+(outs),
+(ins PredRegs:$Pv4, IntRegs:$Rt32, s4_0Imm:$Ii, VectorRegs128B:$Vs32),
+"if ($Pv4) vmem($Rt32+#$Ii) = $Vs32",
+CVI_VM_ST, TypeCVI_VM_ST>, Enc_9470751, Requires<[HasV60T,UseHVX]>, NewValueRel {
+let Inst{7-5} = 0b000;
+let Inst{31-21} = 0b00101000101;
+let isPredicated = 1;
+let addrMode = BaseImmOffset;
+let accessSize = Vector128Access;
+let mayStore = 1;
+let BaseOpcode = "V6_vS32b_ai_128B";
+let isNVStorable = 1;
+let DecoderNamespace = "EXT_mmvec";
+let isCodeGenOnly = 1;
+}
+def V6_vS32b_pred_pi : HInst<
+(outs IntRegs:$Rx32),
+(ins PredRegs:$Pv4, IntRegs:$Rx32in, s3_0Imm:$Ii, VectorRegs:$Vs32),
+"if ($Pv4) vmem($Rx32++#$Ii) = $Vs32",
+CVI_VM_ST, TypeCVI_VM_ST>, Enc_15459921, Requires<[HasV60T,UseHVX]> {
+let Inst{7-5} = 0b000;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b00101001101;
+let isPredicated = 1;
+let addrMode = PostInc;
+let accessSize = Vector64Access;
+let mayStore = 1;
+let BaseOpcode = "V6_vS32b_pi";
+let isNVStorable = 1;
+let DecoderNamespace = "EXT_mmvec";
+let Constraints = "$Rx32 = $Rx32in";
+}
+def V6_vS32b_pred_pi_128B : HInst<
+(outs IntRegs:$Rx32),
+(ins PredRegs:$Pv4, IntRegs:$Rx32in, s3_0Imm:$Ii, VectorRegs128B:$Vs32),
+"if ($Pv4) vmem($Rx32++#$Ii) = $Vs32",
+CVI_VM_ST, TypeCVI_VM_ST>, Enc_14459927, Requires<[HasV60T,UseHVX]> {
+let Inst{7-5} = 0b000;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b00101001101;
+let isPredicated = 1;
+let addrMode = PostInc;
+let accessSize = Vector128Access;
+let mayStore = 1;
+let BaseOpcode = "V6_vS32b_pi_128B";
+let isNVStorable = 1;
+let DecoderNamespace = "EXT_mmvec";
+let isCodeGenOnly = 1;
+let Constraints = "$Rx32 = $Rx32in";
+}
+def V6_vS32b_pred_ppu : HInst<
+(outs IntRegs:$Rx32),
+(ins PredRegs:$Pv4, IntRegs:$Rx32in, ModRegs:$Mu2, VectorRegs:$Vs32),
+"if ($Pv4) vmem($Rx32++$Mu2) = $Vs32",
+CVI_VM_ST, TypeCVI_VM_ST>, Enc_15733946, Requires<[HasV60T,UseHVX]> {
+let Inst{10-5} = 0b000000;
+let Inst{31-21} = 0b00101011101;
+let isPredicated = 1;
+let addrMode = PostInc;
+let accessSize = Vector64Access;
+let mayStore = 1;
+let isNVStorable = 1;
+let DecoderNamespace = "EXT_mmvec";
+let Constraints = "$Rx32 = $Rx32in";
+}
+def V6_vS32b_pred_ppu_128B : HInst<
+(outs IntRegs:$Rx32),
+(ins PredRegs:$Pv4, IntRegs:$Rx32in, ModRegs:$Mu2, VectorRegs128B:$Vs32),
+"if ($Pv4) vmem($Rx32++$Mu2) = $Vs32",
+CVI_VM_ST, TypeCVI_VM_ST>, Enc_15733946, Requires<[HasV60T,UseHVX]> {
+let Inst{10-5} = 0b000000;
+let Inst{31-21} = 0b00101011101;
+let isPredicated = 1;
+let addrMode = PostInc;
+let accessSize = Vector128Access;
+let mayStore = 1;
+let isNVStorable = 1;
+let DecoderNamespace = "EXT_mmvec";
+let isCodeGenOnly = 1;
+let Constraints = "$Rx32 = $Rx32in";
+}
+def V6_vS32b_qpred_ai : HInst<
+(outs),
+(ins VecPredRegs:$Qv4, IntRegs:$Rt32, s4_0Imm:$Ii, VectorRegs:$Vs32),
+"if ($Qv4) vmem($Rt32+#$Ii) = $Vs32",
+CVI_VM_ST, TypeCVI_VM_ST>, Enc_16279406, Requires<[HasV60T,UseHVX]> {
+let Inst{7-5} = 0b000;
+let Inst{31-21} = 0b00101000100;
+let addrMode = BaseImmOffset;
+let accessSize = Vector64Access;
+let mayStore = 1;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vS32b_qpred_ai_128B : HInst<
+(outs),
+(ins VecPredRegs128B:$Qv4, IntRegs:$Rt32, s4_0Imm:$Ii, VectorRegs128B:$Vs32),
+"if ($Qv4) vmem($Rt32+#$Ii) = $Vs32",
+CVI_VM_ST, TypeCVI_VM_ST>, Enc_2703240, Requires<[HasV60T,UseHVX]> {
+let Inst{7-5} = 0b000;
+let Inst{31-21} = 0b00101000100;
+let addrMode = BaseImmOffset;
+let accessSize = Vector128Access;
+let mayStore = 1;
+let DecoderNamespace = "EXT_mmvec";
+let isCodeGenOnly = 1;
+}
+def V6_vS32b_qpred_pi : HInst<
+(outs IntRegs:$Rx32),
+(ins VecPredRegs:$Qv4, IntRegs:$Rx32in, s3_0Imm:$Ii, VectorRegs:$Vs32),
+"if ($Qv4) vmem($Rx32++#$Ii) = $Vs32",
+CVI_VM_ST, TypeCVI_VM_ST>, Enc_12397062, Requires<[HasV60T,UseHVX]> {
+let Inst{7-5} = 0b000;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b00101001100;
+let addrMode = PostInc;
+let accessSize = Vector64Access;
+let mayStore = 1;
+let DecoderNamespace = "EXT_mmvec";
+let Constraints = "$Rx32 = $Rx32in";
+}
+def V6_vS32b_qpred_pi_128B : HInst<
+(outs IntRegs:$Rx32),
+(ins VecPredRegs128B:$Qv4, IntRegs:$Rx32in, s3_0Imm:$Ii, VectorRegs128B:$Vs32),
+"if ($Qv4) vmem($Rx32++#$Ii) = $Vs32",
+CVI_VM_ST, TypeCVI_VM_ST>, Enc_13397056, Requires<[HasV60T,UseHVX]> {
+let Inst{7-5} = 0b000;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b00101001100;
+let addrMode = PostInc;
+let accessSize = Vector128Access;
+let mayStore = 1;
+let DecoderNamespace = "EXT_mmvec";
+let isCodeGenOnly = 1;
+let Constraints = "$Rx32 = $Rx32in";
+}
+def V6_vS32b_qpred_ppu : HInst<
+(outs IntRegs:$Rx32),
+(ins VecPredRegs:$Qv4, IntRegs:$Rx32in, ModRegs:$Mu2, VectorRegs:$Vs32),
+"if ($Qv4) vmem($Rx32++$Mu2) = $Vs32",
+CVI_VM_ST, TypeCVI_VM_ST>, Enc_13425035, Requires<[HasV60T,UseHVX]> {
+let Inst{10-5} = 0b000000;
+let Inst{31-21} = 0b00101011100;
+let addrMode = PostInc;
+let accessSize = Vector64Access;
+let mayStore = 1;
+let DecoderNamespace = "EXT_mmvec";
+let Constraints = "$Rx32 = $Rx32in";
+}
+def V6_vS32b_qpred_ppu_128B : HInst<
+(outs IntRegs:$Rx32),
+(ins VecPredRegs128B:$Qv4, IntRegs:$Rx32in, ModRegs:$Mu2, VectorRegs128B:$Vs32),
+"if ($Qv4) vmem($Rx32++$Mu2) = $Vs32",
+CVI_VM_ST, TypeCVI_VM_ST>, Enc_13425035, Requires<[HasV60T,UseHVX]> {
+let Inst{10-5} = 0b000000;
+let Inst{31-21} = 0b00101011100;
+let addrMode = PostInc;
+let accessSize = Vector128Access;
+let mayStore = 1;
+let DecoderNamespace = "EXT_mmvec";
+let isCodeGenOnly = 1;
+let Constraints = "$Rx32 = $Rx32in";
+}
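+// HVX vector-ALU instructions. Each typed instruction (e.g. "$Vd32.uh =
+// vabsdiff($Vu32.h,$Vv32.h)") is paired with an "_alt" record, a
+// code-gen-only mapping pseudo (isPseudo = 1, TypeMAPPING) carrying the
+// untyped assembler syntax.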
+def V6_vabsdiffh : HInst<
+(outs VectorRegs:$Vd32),
+(ins VectorRegs:$Vu32, VectorRegs:$Vv32),
+"$Vd32.uh = vabsdiff($Vu32.h,$Vv32.h)",
+CVI_VX, TypeCVI_VX>, Enc_6223403, Requires<[HasV60T,UseHVX]> {
+let Inst{7-5} = 0b001;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b00011100110;
+let hasNewValue = 1;
+let opNewValue = 0;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vabsdiffh_128B : HInst<
+(outs VectorRegs128B:$Vd32),
+(ins VectorRegs128B:$Vu32, VectorRegs128B:$Vv32),
+"$Vd32.uh = vabsdiff($Vu32.h,$Vv32.h)",
+CVI_VX, TypeCVI_VX>, Enc_6223403, Requires<[HasV60T,UseHVX]> {
+let Inst{7-5} = 0b001;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b00011100110;
+let hasNewValue = 1;
+let opNewValue = 0;
+let DecoderNamespace = "EXT_mmvec";
+let isCodeGenOnly = 1;
+}
+def V6_vabsdiffh_alt : HInst<
+(outs VectorRegs:$Vd32),
+(ins VectorRegs:$Vu32, VectorRegs:$Vv32),
+"$Vd32 = vabsdiffh($Vu32,$Vv32)",
+PSEUDO, TypeMAPPING>, Requires<[HasV60T,UseHVX]> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vabsdiffh_alt_128B : HInst<
+(outs VectorRegs128B:$Vd32),
+(ins VectorRegs128B:$Vu32, VectorRegs128B:$Vv32),
+"$Vd32 = vabsdiffh($Vu32,$Vv32)",
+PSEUDO, TypeMAPPING>, Requires<[HasV60T,UseHVX]> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vabsdiffub : HInst<
+(outs VectorRegs:$Vd32),
+(ins VectorRegs:$Vu32, VectorRegs:$Vv32),
+"$Vd32.ub = vabsdiff($Vu32.ub,$Vv32.ub)",
+CVI_VX, TypeCVI_VX>, Enc_6223403, Requires<[HasV60T,UseHVX]> {
+let Inst{7-5} = 0b000;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b00011100110;
+let hasNewValue = 1;
+let opNewValue = 0;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vabsdiffub_128B : HInst<
+(outs VectorRegs128B:$Vd32),
+(ins VectorRegs128B:$Vu32, VectorRegs128B:$Vv32),
+"$Vd32.ub = vabsdiff($Vu32.ub,$Vv32.ub)",
+CVI_VX, TypeCVI_VX>, Enc_6223403, Requires<[HasV60T,UseHVX]> {
+let Inst{7-5} = 0b000;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b00011100110;
+let hasNewValue = 1;
+let opNewValue = 0;
+let DecoderNamespace = "EXT_mmvec";
+let isCodeGenOnly = 1;
+}
+def V6_vabsdiffub_alt : HInst<
+(outs VectorRegs:$Vd32),
+(ins VectorRegs:$Vu32, VectorRegs:$Vv32),
+"$Vd32 = vabsdiffub($Vu32,$Vv32)",
+PSEUDO, TypeMAPPING>, Requires<[HasV60T,UseHVX]> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vabsdiffub_alt_128B : HInst<
+(outs VectorRegs128B:$Vd32),
+(ins VectorRegs128B:$Vu32, VectorRegs128B:$Vv32),
+"$Vd32 = vabsdiffub($Vu32,$Vv32)",
+PSEUDO, TypeMAPPING>, Requires<[HasV60T,UseHVX]> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vabsdiffuh : HInst<
+(outs VectorRegs:$Vd32),
+(ins VectorRegs:$Vu32, VectorRegs:$Vv32),
+"$Vd32.uh = vabsdiff($Vu32.uh,$Vv32.uh)",
+CVI_VX, TypeCVI_VX>, Enc_6223403, Requires<[HasV60T,UseHVX]> {
+let Inst{7-5} = 0b010;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b00011100110;
+let hasNewValue = 1;
+let opNewValue = 0;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vabsdiffuh_128B : HInst<
+(outs VectorRegs128B:$Vd32),
+(ins VectorRegs128B:$Vu32, VectorRegs128B:$Vv32),
+"$Vd32.uh = vabsdiff($Vu32.uh,$Vv32.uh)",
+CVI_VX, TypeCVI_VX>, Enc_6223403, Requires<[HasV60T,UseHVX]> {
+let Inst{7-5} = 0b010;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b00011100110;
+let hasNewValue = 1;
+let opNewValue = 0;
+let DecoderNamespace = "EXT_mmvec";
+let isCodeGenOnly = 1;
+}
+def V6_vabsdiffuh_alt : HInst<
+(outs VectorRegs:$Vd32),
+(ins VectorRegs:$Vu32, VectorRegs:$Vv32),
+"$Vd32 = vabsdiffuh($Vu32,$Vv32)",
+PSEUDO, TypeMAPPING>, Requires<[HasV60T,UseHVX]> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vabsdiffuh_alt_128B : HInst<
+(outs VectorRegs128B:$Vd32),
+(ins VectorRegs128B:$Vu32, VectorRegs128B:$Vv32),
+"$Vd32 = vabsdiffuh($Vu32,$Vv32)",
+PSEUDO, TypeMAPPING>, Requires<[HasV60T,UseHVX]> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vabsdiffw : HInst<
+(outs VectorRegs:$Vd32),
+(ins VectorRegs:$Vu32, VectorRegs:$Vv32),
+"$Vd32.uw = vabsdiff($Vu32.w,$Vv32.w)",
+CVI_VX, TypeCVI_VX>, Enc_6223403, Requires<[HasV60T,UseHVX]> {
+let Inst{7-5} = 0b011;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b00011100110;
+let hasNewValue = 1;
+let opNewValue = 0;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vabsdiffw_128B : HInst<
+(outs VectorRegs128B:$Vd32),
+(ins VectorRegs128B:$Vu32, VectorRegs128B:$Vv32),
+"$Vd32.uw = vabsdiff($Vu32.w,$Vv32.w)",
+CVI_VX, TypeCVI_VX>, Enc_6223403, Requires<[HasV60T,UseHVX]> {
+let Inst{7-5} = 0b011;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b00011100110;
+let hasNewValue = 1;
+let opNewValue = 0;
+let DecoderNamespace = "EXT_mmvec";
+let isCodeGenOnly = 1;
+}
+def V6_vabsdiffw_alt : HInst<
+(outs VectorRegs:$Vd32),
+(ins VectorRegs:$Vu32, VectorRegs:$Vv32),
+"$Vd32 = vabsdiffw($Vu32,$Vv32)",
+PSEUDO, TypeMAPPING>, Requires<[HasV60T,UseHVX]> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vabsdiffw_alt_128B : HInst<
+(outs VectorRegs128B:$Vd32),
+(ins VectorRegs128B:$Vu32, VectorRegs128B:$Vv32),
+"$Vd32 = vabsdiffw($Vu32,$Vv32)",
+PSEUDO, TypeMAPPING>, Requires<[HasV60T,UseHVX]> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vabsh : HInst<
+(outs VectorRegs:$Vd32),
+(ins VectorRegs:$Vu32),
+"$Vd32.h = vabs($Vu32.h)",
+CVI_VA, TypeCVI_VA>, Enc_900013, Requires<[HasV60T,UseHVX]> {
+let Inst{7-5} = 0b000;
+let Inst{13-13} = 0b0;
+let Inst{31-16} = 0b0001111000000000;
+let hasNewValue = 1;
+let opNewValue = 0;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vabsh_128B : HInst<
+(outs VectorRegs128B:$Vd32),
+(ins VectorRegs128B:$Vu32),
+"$Vd32.h = vabs($Vu32.h)",
+CVI_VA, TypeCVI_VA>, Enc_900013, Requires<[HasV60T,UseHVX]> {
+let Inst{7-5} = 0b000;
+let Inst{13-13} = 0b0;
+let Inst{31-16} = 0b0001111000000000;
+let hasNewValue = 1;
+let opNewValue = 0;
+let DecoderNamespace = "EXT_mmvec";
+let isCodeGenOnly = 1;
+}
+def V6_vabsh_alt : HInst<
+(outs VectorRegs:$Vd32),
+(ins VectorRegs:$Vu32),
+"$Vd32 = vabsh($Vu32)",
+PSEUDO, TypeMAPPING>, Requires<[HasV60T,UseHVX]> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vabsh_alt_128B : HInst<
+(outs VectorRegs128B:$Vd32),
+(ins VectorRegs128B:$Vu32),
+"$Vd32 = vabsh($Vu32)",
+PSEUDO, TypeMAPPING>, Requires<[HasV60T,UseHVX]> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vabsh_sat : HInst<
+(outs VectorRegs:$Vd32),
+(ins VectorRegs:$Vu32),
+"$Vd32.h = vabs($Vu32.h):sat",
+CVI_VA, TypeCVI_VA>, Enc_900013, Requires<[HasV60T,UseHVX]> {
+let Inst{7-5} = 0b001;
+let Inst{13-13} = 0b0;
+let Inst{31-16} = 0b0001111000000000;
+let hasNewValue = 1;
+let opNewValue = 0;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vabsh_sat_128B : HInst<
+(outs VectorRegs128B:$Vd32),
+(ins VectorRegs128B:$Vu32),
+"$Vd32.h = vabs($Vu32.h):sat",
+CVI_VA, TypeCVI_VA>, Enc_900013, Requires<[HasV60T,UseHVX]> {
+let Inst{7-5} = 0b001;
+let Inst{13-13} = 0b0;
+let Inst{31-16} = 0b0001111000000000;
+let hasNewValue = 1;
+let opNewValue = 0;
+let DecoderNamespace = "EXT_mmvec";
+let isCodeGenOnly = 1;
+}
+def V6_vabsh_sat_alt : HInst<
+(outs VectorRegs:$Vd32),
+(ins VectorRegs:$Vu32),
+"$Vd32 = vabsh($Vu32):sat",
+PSEUDO, TypeMAPPING>, Requires<[HasV60T,UseHVX]> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vabsh_sat_alt_128B : HInst<
+(outs VectorRegs128B:$Vd32),
+(ins VectorRegs128B:$Vu32),
+"$Vd32 = vabsh($Vu32):sat",
+PSEUDO, TypeMAPPING>, Requires<[HasV60T,UseHVX]> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vabsw : HInst<
+(outs VectorRegs:$Vd32),
+(ins VectorRegs:$Vu32),
+"$Vd32.w = vabs($Vu32.w)",
+CVI_VA, TypeCVI_VA>, Enc_900013, Requires<[HasV60T,UseHVX]> {
+let Inst{7-5} = 0b010;
+let Inst{13-13} = 0b0;
+let Inst{31-16} = 0b0001111000000000;
+let hasNewValue = 1;
+let opNewValue = 0;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vabsw_128B : HInst<
+(outs VectorRegs128B:$Vd32),
+(ins VectorRegs128B:$Vu32),
+"$Vd32.w = vabs($Vu32.w)",
+CVI_VA, TypeCVI_VA>, Enc_900013, Requires<[HasV60T,UseHVX]> {
+let Inst{7-5} = 0b010;
+let Inst{13-13} = 0b0;
+let Inst{31-16} = 0b0001111000000000;
+let hasNewValue = 1;
+let opNewValue = 0;
+let DecoderNamespace = "EXT_mmvec";
+let isCodeGenOnly = 1;
+}
+def V6_vabsw_alt : HInst<
+(outs VectorRegs:$Vd32),
+(ins VectorRegs:$Vu32),
+"$Vd32 = vabsw($Vu32)",
+PSEUDO, TypeMAPPING>, Requires<[HasV60T,UseHVX]> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vabsw_alt_128B : HInst<
+(outs VectorRegs128B:$Vd32),
+(ins VectorRegs128B:$Vu32),
+"$Vd32 = vabsw($Vu32)",
+PSEUDO, TypeMAPPING>, Requires<[HasV60T,UseHVX]> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vabsw_sat : HInst<
+(outs VectorRegs:$Vd32),
+(ins VectorRegs:$Vu32),
+"$Vd32.w = vabs($Vu32.w):sat",
+CVI_VA, TypeCVI_VA>, Enc_900013, Requires<[HasV60T,UseHVX]> {
+let Inst{7-5} = 0b011;
+let Inst{13-13} = 0b0;
+let Inst{31-16} = 0b0001111000000000;
+let hasNewValue = 1;
+let opNewValue = 0;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vabsw_sat_128B : HInst<
+(outs VectorRegs128B:$Vd32),
+(ins VectorRegs128B:$Vu32),
+"$Vd32.w = vabs($Vu32.w):sat",
+CVI_VA, TypeCVI_VA>, Enc_900013, Requires<[HasV60T,UseHVX]> {
+let Inst{7-5} = 0b011;
+let Inst{13-13} = 0b0;
+let Inst{31-16} = 0b0001111000000000;
+let hasNewValue = 1;
+let opNewValue = 0;
+let DecoderNamespace = "EXT_mmvec";
+let isCodeGenOnly = 1;
+}
+def V6_vabsw_sat_alt : HInst<
+(outs VectorRegs:$Vd32),
+(ins VectorRegs:$Vu32),
+"$Vd32 = vabsw($Vu32):sat",
+PSEUDO, TypeMAPPING>, Requires<[HasV60T,UseHVX]> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vabsw_sat_alt_128B : HInst<
+(outs VectorRegs128B:$Vd32),
+(ins VectorRegs128B:$Vu32),
+"$Vd32 = vabsw($Vu32):sat",
+PSEUDO, TypeMAPPING>, Requires<[HasV60T,UseHVX]> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vaddb : HInst<
+(outs VectorRegs:$Vd32),
+(ins VectorRegs:$Vu32, VectorRegs:$Vv32),
+"$Vd32.b = vadd($Vu32.b,$Vv32.b)",
+CVI_VA, TypeCVI_VA>, Enc_6223403, Requires<[HasV60T,UseHVX]> {
+let Inst{7-5} = 0b110;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b00011111101;
+let hasNewValue = 1;
+let opNewValue = 0;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vaddb_128B : HInst<
+(outs VectorRegs128B:$Vd32),
+(ins VectorRegs128B:$Vu32, VectorRegs128B:$Vv32),
+"$Vd32.b = vadd($Vu32.b,$Vv32.b)",
+CVI_VA, TypeCVI_VA>, Enc_6223403, Requires<[HasV60T,UseHVX]> {
+let Inst{7-5} = 0b110;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b00011111101;
+let hasNewValue = 1;
+let opNewValue = 0;
+let DecoderNamespace = "EXT_mmvec";
+let isCodeGenOnly = 1;
+}
+def V6_vaddb_alt : HInst<
+(outs VectorRegs:$Vd32),
+(ins VectorRegs:$Vu32, VectorRegs:$Vv32),
+"$Vd32 = vaddb($Vu32,$Vv32)",
+PSEUDO, TypeMAPPING>, Requires<[HasV60T,UseHVX]> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vaddb_alt_128B : HInst<
+(outs VectorRegs128B:$Vd32),
+(ins VectorRegs128B:$Vu32, VectorRegs128B:$Vv32),
+"$Vd32 = vaddb($Vu32,$Vv32)",
+PSEUDO, TypeMAPPING>, Requires<[HasV60T,UseHVX]> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vaddb_dv : HInst<
+(outs VecDblRegs:$Vdd32),
+(ins VecDblRegs:$Vuu32, VecDblRegs:$Vvv32),
+"$Vdd32.b = vadd($Vuu32.b,$Vvv32.b)",
+CVI_VA_DV, TypeCVI_VA_DV>, Enc_13211717, Requires<[HasV60T,UseHVX]> {
+let Inst{7-5} = 0b100;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b00011100011;
+let hasNewValue = 1;
+let opNewValue = 0;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vaddb_dv_128B : HInst<
+(outs VecDblRegs128B:$Vdd32),
+(ins VecDblRegs128B:$Vuu32, VecDblRegs128B:$Vvv32),
+"$Vdd32.b = vadd($Vuu32.b,$Vvv32.b)",
+CVI_VA_DV, TypeCVI_VA_DV>, Enc_13211717, Requires<[HasV60T,UseHVX]> {
+let Inst{7-5} = 0b100;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b00011100011;
+let hasNewValue = 1;
+let opNewValue = 0;
+let DecoderNamespace = "EXT_mmvec";
+let isCodeGenOnly = 1;
+}
+def V6_vaddb_dv_alt : HInst<
+(outs VecDblRegs:$Vdd32),
+(ins VecDblRegs:$Vuu32, VecDblRegs:$Vvv32),
+"$Vdd32 = vaddb($Vuu32,$Vvv32)",
+PSEUDO, TypeMAPPING>, Requires<[HasV60T,UseHVX]> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vaddb_dv_alt_128B : HInst<
+(outs VecDblRegs128B:$Vdd32),
+(ins VecDblRegs128B:$Vuu32, VecDblRegs128B:$Vvv32),
+"$Vdd32 = vaddb($Vuu32,$Vvv32)",
+PSEUDO, TypeMAPPING>, Requires<[HasV60T,UseHVX]> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vaddbnq : HInst<
+(outs VectorRegs:$Vx32),
+(ins VecPredRegs:$Qv4, VectorRegs:$Vx32in, VectorRegs:$Vu32),
+"if (!$Qv4) $Vx32.b += $Vu32.b",
+CVI_VA, TypeCVI_VA>, Enc_12535811, Requires<[HasV60T,UseHVX]> {
+let Inst{7-5} = 0b011;
+let Inst{13-13} = 0b1;
+let Inst{21-16} = 0b000001;
+let Inst{31-24} = 0b00011110;
+let hasNewValue = 1;
+let opNewValue = 0;
+let isAccumulator = 1;
+let DecoderNamespace = "EXT_mmvec";
+let Constraints = "$Vx32 = $Vx32in";
+}
+def V6_vaddbnq_128B : HInst<
+(outs VectorRegs128B:$Vx32),
+(ins VecPredRegs128B:$Qv4, VectorRegs128B:$Vx32in, VectorRegs128B:$Vu32),
+"if (!$Qv4) $Vx32.b += $Vu32.b",
+CVI_VA, TypeCVI_VA>, Enc_12535811, Requires<[HasV60T,UseHVX]> {
+let Inst{7-5} = 0b011;
+let Inst{13-13} = 0b1;
+let Inst{21-16} = 0b000001;
+let Inst{31-24} = 0b00011110;
+let hasNewValue = 1;
+let opNewValue = 0;
+let isAccumulator = 1;
+let DecoderNamespace = "EXT_mmvec";
+let isCodeGenOnly = 1;
+let Constraints = "$Vx32 = $Vx32in";
+}
+def V6_vaddbnq_alt : HInst<
+(outs VectorRegs:$Vx32),
+(ins VecPredRegs:$Qv4, VectorRegs:$Vx32in, VectorRegs:$Vu32),
+"if (!$Qv4.b) $Vx32.b += $Vu32.b",
+PSEUDO, TypeMAPPING>, Requires<[HasV60T,UseHVX]> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isAccumulator = 1;
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+let Constraints = "$Vx32 = $Vx32in";
+}
+def V6_vaddbnq_alt_128B : HInst<
+(outs VectorRegs128B:$Vx32),
+(ins VecPredRegs128B:$Qv4, VectorRegs128B:$Vx32in, VectorRegs128B:$Vu32),
+"if (!$Qv4.b) $Vx32.b += $Vu32.b",
+PSEUDO, TypeMAPPING>, Requires<[HasV60T,UseHVX]> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isAccumulator = 1;
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+let Constraints = "$Vx32 = $Vx32in";
+}
+def V6_vaddbq : HInst<
+(outs VectorRegs:$Vx32),
+(ins VecPredRegs:$Qv4, VectorRegs:$Vx32in, VectorRegs:$Vu32),
+"if ($Qv4) $Vx32.b += $Vu32.b",
+CVI_VA, TypeCVI_VA>, Enc_12535811, Requires<[HasV60T,UseHVX]> {
+let Inst{7-5} = 0b000;
+let Inst{13-13} = 0b1;
+let Inst{21-16} = 0b000001;
+let Inst{31-24} = 0b00011110;
+let hasNewValue = 1;
+let opNewValue = 0;
+let isAccumulator = 1;
+let DecoderNamespace = "EXT_mmvec";
+let Constraints = "$Vx32 = $Vx32in";
+}
+def V6_vaddbq_128B : HInst<
+(outs VectorRegs128B:$Vx32),
+(ins VecPredRegs128B:$Qv4, VectorRegs128B:$Vx32in, VectorRegs128B:$Vu32),
+"if ($Qv4) $Vx32.b += $Vu32.b",
+CVI_VA, TypeCVI_VA>, Enc_12535811, Requires<[HasV60T,UseHVX]> {
+let Inst{7-5} = 0b000;
+let Inst{13-13} = 0b1;
+let Inst{21-16} = 0b000001;
+let Inst{31-24} = 0b00011110;
+let hasNewValue = 1;
+let opNewValue = 0;
+let isAccumulator = 1;
+let DecoderNamespace = "EXT_mmvec";
+let isCodeGenOnly = 1;
+let Constraints = "$Vx32 = $Vx32in";
+}
+def V6_vaddbq_alt : HInst<
+(outs VectorRegs:$Vx32),
+(ins VecPredRegs:$Qv4, VectorRegs:$Vx32in, VectorRegs:$Vu32),
+"if ($Qv4.b) $Vx32.b += $Vu32.b",
+PSEUDO, TypeMAPPING>, Requires<[HasV60T,UseHVX]> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isAccumulator = 1;
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+let Constraints = "$Vx32 = $Vx32in";
+}
+def V6_vaddbq_alt_128B : HInst<
+(outs VectorRegs128B:$Vx32),
+(ins VecPredRegs128B:$Qv4, VectorRegs128B:$Vx32in, VectorRegs128B:$Vu32),
+"if ($Qv4.b) $Vx32.b += $Vu32.b",
+PSEUDO, TypeMAPPING>, Requires<[HasV60T,UseHVX]> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isAccumulator = 1;
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+let Constraints = "$Vx32 = $Vx32in";
+}
+def V6_vaddbsat : HInst<
+(outs VectorRegs:$Vd32),
+(ins VectorRegs:$Vu32, VectorRegs:$Vv32),
+"$Vd32.b = vadd($Vu32.b,$Vv32.b):sat",
+CVI_VA, TypeCVI_VA>, Enc_6223403, Requires<[HasV62T,UseHVX]> {
+let Inst{7-5} = 0b000;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b00011111000;
+let hasNewValue = 1;
+let opNewValue = 0;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vaddbsat_128B : HInst<
+(outs VectorRegs128B:$Vd32),
+(ins VectorRegs128B:$Vu32, VectorRegs128B:$Vv32),
+"$Vd32.b = vadd($Vu32.b,$Vv32.b):sat",
+CVI_VA, TypeCVI_VA>, Enc_6223403, Requires<[HasV62T,UseHVX]> {
+let Inst{7-5} = 0b000;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b00011111000;
+let hasNewValue = 1;
+let opNewValue = 0;
+let DecoderNamespace = "EXT_mmvec";
+let isCodeGenOnly = 1;
+}
+def V6_vaddbsat_alt : HInst<
+(outs VectorRegs:$Vd32),
+(ins VectorRegs:$Vu32, VectorRegs:$Vv32),
+"$Vd32 = vaddb($Vu32,$Vv32):sat",
+PSEUDO, TypeMAPPING>, Requires<[HasV62T,UseHVX]> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vaddbsat_alt_128B : HInst<
+(outs VectorRegs128B:$Vd32),
+(ins VectorRegs128B:$Vu32, VectorRegs128B:$Vv32),
+"$Vd32 = vaddb($Vu32,$Vv32):sat",
+PSEUDO, TypeMAPPING>, Requires<[HasV62T,UseHVX]> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vaddbsat_dv : HInst<
+(outs VecDblRegs:$Vdd32),
+(ins VecDblRegs:$Vuu32, VecDblRegs:$Vvv32),
+"$Vdd32.b = vadd($Vuu32.b,$Vvv32.b):sat",
+CVI_VA_DV, TypeCVI_VA_DV>, Enc_13211717, Requires<[HasV62T,UseHVX]> {
+let Inst{7-5} = 0b000;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b00011110101;
+let hasNewValue = 1;
+let opNewValue = 0;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vaddbsat_dv_128B : HInst<
+(outs VecDblRegs128B:$Vdd32),
+(ins VecDblRegs128B:$Vuu32, VecDblRegs128B:$Vvv32),
+"$Vdd32.b = vadd($Vuu32.b,$Vvv32.b):sat",
+CVI_VA_DV, TypeCVI_VA_DV>, Enc_13211717, Requires<[HasV62T,UseHVX]> {
+let Inst{7-5} = 0b000;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b00011110101;
+let hasNewValue = 1;
+let opNewValue = 0;
+let DecoderNamespace = "EXT_mmvec";
+let isCodeGenOnly = 1;
+}
+def V6_vaddbsat_dv_alt : HInst<
+(outs VecDblRegs:$Vdd32),
+(ins VecDblRegs:$Vuu32, VecDblRegs:$Vvv32),
+"$Vdd32 = vaddb($Vuu32,$Vvv32):sat",
+PSEUDO, TypeMAPPING>, Requires<[HasV62T,UseHVX]> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vaddbsat_dv_alt_128B : HInst<
+(outs VecDblRegs128B:$Vdd32),
+(ins VecDblRegs128B:$Vuu32, VecDblRegs128B:$Vvv32),
+"$Vdd32 = vaddb($Vuu32,$Vvv32):sat",
+PSEUDO, TypeMAPPING>, Requires<[HasV62T,UseHVX]> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vaddcarry : HInst<
+(outs VectorRegs:$Vd32, VecPredRegs:$Qx4),
+(ins VectorRegs:$Vu32, VectorRegs:$Vv32, VecPredRegs:$Qx4in),
+"$Vd32.w = vadd($Vu32.w,$Vv32.w,$Qx4):carry",
+CVI_VA, TypeCVI_VA>, Enc_13691337, Requires<[HasV62T,UseHVX]> {
+let Inst{7-7} = 0b0;
+let Inst{13-13} = 0b1;
+let Inst{31-21} = 0b00011100101;
+let hasNewValue = 1;
+let opNewValue = 0;
+let hasNewValue2 = 1;
+let opNewValue2 = 1;
+let DecoderNamespace = "EXT_mmvec";
+let Constraints = "$Qx4 = $Qx4in";
+}
+def V6_vaddcarry_128B : HInst<
+(outs VectorRegs128B:$Vd32, VecPredRegs128B:$Qx4),
+(ins VectorRegs128B:$Vu32, VectorRegs128B:$Vv32, VecPredRegs128B:$Qx4in),
+"$Vd32.w = vadd($Vu32.w,$Vv32.w,$Qx4):carry",
+CVI_VA, TypeCVI_VA>, Enc_13691337, Requires<[HasV62T,UseHVX]> {
+let Inst{7-7} = 0b0;
+let Inst{13-13} = 0b1;
+let Inst{31-21} = 0b00011100101;
+let hasNewValue = 1;
+let opNewValue = 0;
+let hasNewValue2 = 1;
+let opNewValue2 = 1;
+let DecoderNamespace = "EXT_mmvec";
+let isCodeGenOnly = 1;
+let Constraints = "$Qx4 = $Qx4in";
+}
+def V6_vaddclbh : HInst<
+(outs VectorRegs:$Vd32),
+(ins VectorRegs:$Vu32, VectorRegs:$Vv32),
+"$Vd32.h = vadd(vclb($Vu32.h),$Vv32.h)",
+CVI_VS, TypeCVI_VS>, Enc_6223403, Requires<[HasV62T,UseHVX]> {
+let Inst{7-5} = 0b000;
+let Inst{13-13} = 0b1;
+let Inst{31-21} = 0b00011111000;
+let hasNewValue = 1;
+let opNewValue = 0;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vaddclbh_128B : HInst<
+(outs VectorRegs128B:$Vd32),
+(ins VectorRegs128B:$Vu32, VectorRegs128B:$Vv32),
+"$Vd32.h = vadd(vclb($Vu32.h),$Vv32.h)",
+CVI_VS, TypeCVI_VS>, Enc_6223403, Requires<[HasV62T,UseHVX]> {
+let Inst{7-5} = 0b000;
+let Inst{13-13} = 0b1;
+let Inst{31-21} = 0b00011111000;
+let hasNewValue = 1;
+let opNewValue = 0;
+let DecoderNamespace = "EXT_mmvec";
+let isCodeGenOnly = 1;
+}
+def V6_vaddclbw : HInst<
+(outs VectorRegs:$Vd32),
+(ins VectorRegs:$Vu32, VectorRegs:$Vv32),
+"$Vd32.w = vadd(vclb($Vu32.w),$Vv32.w)",
+CVI_VS, TypeCVI_VS>, Enc_6223403, Requires<[HasV62T,UseHVX]> {
+let Inst{7-5} = 0b001;
+let Inst{13-13} = 0b1;
+let Inst{31-21} = 0b00011111000;
+let hasNewValue = 1;
+let opNewValue = 0;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vaddclbw_128B : HInst<
+(outs VectorRegs128B:$Vd32),
+(ins VectorRegs128B:$Vu32, VectorRegs128B:$Vv32),
+"$Vd32.w = vadd(vclb($Vu32.w),$Vv32.w)",
+CVI_VS, TypeCVI_VS>, Enc_6223403, Requires<[HasV62T,UseHVX]> {
+let Inst{7-5} = 0b001;
+let Inst{13-13} = 0b1;
+let Inst{31-21} = 0b00011111000;
+let hasNewValue = 1;
+let opNewValue = 0;
+let DecoderNamespace = "EXT_mmvec";
+let isCodeGenOnly = 1;
+}
+def V6_vaddh : HInst<
+(outs VectorRegs:$Vd32),
+(ins VectorRegs:$Vu32, VectorRegs:$Vv32),
+"$Vd32.h = vadd($Vu32.h,$Vv32.h)",
+CVI_VA, TypeCVI_VA>, Enc_6223403, Requires<[HasV60T,UseHVX]> {
+let Inst{7-5} = 0b111;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b00011111101;
+let hasNewValue = 1;
+let opNewValue = 0;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vaddh_128B : HInst<
+(outs VectorRegs128B:$Vd32),
+(ins VectorRegs128B:$Vu32, VectorRegs128B:$Vv32),
+"$Vd32.h = vadd($Vu32.h,$Vv32.h)",
+CVI_VA, TypeCVI_VA>, Enc_6223403, Requires<[HasV60T,UseHVX]> {
+let Inst{7-5} = 0b111;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b00011111101;
+let hasNewValue = 1;
+let opNewValue = 0;
+let DecoderNamespace = "EXT_mmvec";
+let isCodeGenOnly = 1;
+}
+def V6_vaddh_alt : HInst<
+(outs VectorRegs:$Vd32),
+(ins VectorRegs:$Vu32, VectorRegs:$Vv32),
+"$Vd32 = vaddh($Vu32,$Vv32)",
+PSEUDO, TypeMAPPING>, Requires<[HasV60T,UseHVX]> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vaddh_alt_128B : HInst<
+(outs VectorRegs128B:$Vd32),
+(ins VectorRegs128B:$Vu32, VectorRegs128B:$Vv32),
+"$Vd32 = vaddh($Vu32,$Vv32)",
+PSEUDO, TypeMAPPING>, Requires<[HasV60T,UseHVX]> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vaddh_dv : HInst<
+(outs VecDblRegs:$Vdd32),
+(ins VecDblRegs:$Vuu32, VecDblRegs:$Vvv32),
+"$Vdd32.h = vadd($Vuu32.h,$Vvv32.h)",
+CVI_VA_DV, TypeCVI_VA_DV>, Enc_13211717, Requires<[HasV60T,UseHVX]> {
+let Inst{7-5} = 0b101;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b00011100011;
+let hasNewValue = 1;
+let opNewValue = 0;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vaddh_dv_128B : HInst<
+(outs VecDblRegs128B:$Vdd32),
+(ins VecDblRegs128B:$Vuu32, VecDblRegs128B:$Vvv32),
+"$Vdd32.h = vadd($Vuu32.h,$Vvv32.h)",
+CVI_VA_DV, TypeCVI_VA_DV>, Enc_13211717, Requires<[HasV60T,UseHVX]> {
+let Inst{7-5} = 0b101;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b00011100011;
+let hasNewValue = 1;
+let opNewValue = 0;
+let DecoderNamespace = "EXT_mmvec";
+let isCodeGenOnly = 1;
+}
+def V6_vaddh_dv_alt : HInst<
+(outs VecDblRegs:$Vdd32),
+(ins VecDblRegs:$Vuu32, VecDblRegs:$Vvv32),
+"$Vdd32 = vaddh($Vuu32,$Vvv32)",
+PSEUDO, TypeMAPPING>, Requires<[HasV60T,UseHVX]> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vaddh_dv_alt_128B : HInst<
+(outs VecDblRegs128B:$Vdd32),
+(ins VecDblRegs128B:$Vuu32, VecDblRegs128B:$Vvv32),
+"$Vdd32 = vaddh($Vuu32,$Vvv32)",
+PSEUDO, TypeMAPPING>, Requires<[HasV60T,UseHVX]> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vaddhnq : HInst<
+(outs VectorRegs:$Vx32),
+(ins VecPredRegs:$Qv4, VectorRegs:$Vx32in, VectorRegs:$Vu32),
+"if (!$Qv4) $Vx32.h += $Vu32.h",
+CVI_VA, TypeCVI_VA>, Enc_12535811, Requires<[HasV60T,UseHVX]> {
+let Inst{7-5} = 0b100;
+let Inst{13-13} = 0b1;
+let Inst{21-16} = 0b000001;
+let Inst{31-24} = 0b00011110;
+let hasNewValue = 1;
+let opNewValue = 0;
+let isAccumulator = 1;
+let DecoderNamespace = "EXT_mmvec";
+let Constraints = "$Vx32 = $Vx32in";
+}
+def V6_vaddhnq_128B : HInst<
+(outs VectorRegs128B:$Vx32),
+(ins VecPredRegs128B:$Qv4, VectorRegs128B:$Vx32in, VectorRegs128B:$Vu32),
+"if (!$Qv4) $Vx32.h += $Vu32.h",
+CVI_VA, TypeCVI_VA>, Enc_12535811, Requires<[HasV60T,UseHVX]> {
+let Inst{7-5} = 0b100;
+let Inst{13-13} = 0b1;
+let Inst{21-16} = 0b000001;
+let Inst{31-24} = 0b00011110;
+let hasNewValue = 1;
+let opNewValue = 0;
+let isAccumulator = 1;
+let DecoderNamespace = "EXT_mmvec";
+let isCodeGenOnly = 1;
+let Constraints = "$Vx32 = $Vx32in";
+}
+def V6_vaddhnq_alt : HInst<
+(outs VectorRegs:$Vx32),
+(ins VecPredRegs:$Qv4, VectorRegs:$Vx32in, VectorRegs:$Vu32),
+"if (!$Qv4.h) $Vx32.h += $Vu32.h",
+PSEUDO, TypeMAPPING>, Requires<[HasV60T,UseHVX]> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isAccumulator = 1;
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+let Constraints = "$Vx32 = $Vx32in";
+}
+def V6_vaddhnq_alt_128B : HInst<
+(outs VectorRegs128B:$Vx32),
+(ins VecPredRegs128B:$Qv4, VectorRegs128B:$Vx32in, VectorRegs128B:$Vu32),
+"if (!$Qv4.h) $Vx32.h += $Vu32.h",
+PSEUDO, TypeMAPPING>, Requires<[HasV60T,UseHVX]> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isAccumulator = 1;
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+let Constraints = "$Vx32 = $Vx32in";
+}
+def V6_vaddhq : HInst<
+(outs VectorRegs:$Vx32),
+(ins VecPredRegs:$Qv4, VectorRegs:$Vx32in, VectorRegs:$Vu32),
+"if ($Qv4) $Vx32.h += $Vu32.h",
+CVI_VA, TypeCVI_VA>, Enc_12535811, Requires<[HasV60T,UseHVX]> {
+let Inst{7-5} = 0b001;
+let Inst{13-13} = 0b1;
+let Inst{21-16} = 0b000001;
+let Inst{31-24} = 0b00011110;
+let hasNewValue = 1;
+let opNewValue = 0;
+let isAccumulator = 1;
+let DecoderNamespace = "EXT_mmvec";
+let Constraints = "$Vx32 = $Vx32in";
+}
+def V6_vaddhq_128B : HInst<
+(outs VectorRegs128B:$Vx32),
+(ins VecPredRegs128B:$Qv4, VectorRegs128B:$Vx32in, VectorRegs128B:$Vu32),
+"if ($Qv4) $Vx32.h += $Vu32.h",
+CVI_VA, TypeCVI_VA>, Enc_12535811, Requires<[HasV60T,UseHVX]> {
+let Inst{7-5} = 0b001;
+let Inst{13-13} = 0b1;
+let Inst{21-16} = 0b000001;
+let Inst{31-24} = 0b00011110;
+let hasNewValue = 1;
+let opNewValue = 0;
+let isAccumulator = 1;
+let DecoderNamespace = "EXT_mmvec";
+let isCodeGenOnly = 1;
+let Constraints = "$Vx32 = $Vx32in";
+}
+def V6_vaddhq_alt : HInst<
+(outs VectorRegs:$Vx32),
+(ins VecPredRegs:$Qv4, VectorRegs:$Vx32in, VectorRegs:$Vu32),
+"if ($Qv4.h) $Vx32.h += $Vu32.h",
+PSEUDO, TypeMAPPING>, Requires<[HasV60T,UseHVX]> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isAccumulator = 1;
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+let Constraints = "$Vx32 = $Vx32in";
+}
+def V6_vaddhq_alt_128B : HInst<
+(outs VectorRegs128B:$Vx32),
+(ins VecPredRegs128B:$Qv4, VectorRegs128B:$Vx32in, VectorRegs128B:$Vu32),
+"if ($Qv4.h) $Vx32.h += $Vu32.h",
+PSEUDO, TypeMAPPING>, Requires<[HasV60T,UseHVX]> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isAccumulator = 1;
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+let Constraints = "$Vx32 = $Vx32in";
+}
+def V6_vaddhsat : HInst<
+(outs VectorRegs:$Vd32),
+(ins VectorRegs:$Vu32, VectorRegs:$Vv32),
+"$Vd32.h = vadd($Vu32.h,$Vv32.h):sat",
+CVI_VA, TypeCVI_VA>, Enc_6223403, Requires<[HasV60T,UseHVX]> {
+let Inst{7-5} = 0b011;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b00011100010;
+let hasNewValue = 1;
+let opNewValue = 0;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vaddhsat_128B : HInst<
+(outs VectorRegs128B:$Vd32),
+(ins VectorRegs128B:$Vu32, VectorRegs128B:$Vv32),
+"$Vd32.h = vadd($Vu32.h,$Vv32.h):sat",
+CVI_VA, TypeCVI_VA>, Enc_6223403, Requires<[HasV60T,UseHVX]> {
+let Inst{7-5} = 0b011;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b00011100010;
+let hasNewValue = 1;
+let opNewValue = 0;
+let DecoderNamespace = "EXT_mmvec";
+let isCodeGenOnly = 1;
+}
+def V6_vaddhsat_alt : HInst<
+(outs VectorRegs:$Vd32),
+(ins VectorRegs:$Vu32, VectorRegs:$Vv32),
+"$Vd32 = vaddh($Vu32,$Vv32):sat",
+PSEUDO, TypeMAPPING>, Requires<[HasV60T,UseHVX]> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vaddhsat_alt_128B : HInst<
+(outs VectorRegs128B:$Vd32),
+(ins VectorRegs128B:$Vu32, VectorRegs128B:$Vv32),
+"$Vd32 = vaddh($Vu32,$Vv32):sat",
+PSEUDO, TypeMAPPING>, Requires<[HasV60T,UseHVX]> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vaddhsat_dv : HInst<
+(outs VecDblRegs:$Vdd32),
+(ins VecDblRegs:$Vuu32, VecDblRegs:$Vvv32),
+"$Vdd32.h = vadd($Vuu32.h,$Vvv32.h):sat",
+CVI_VA_DV, TypeCVI_VA_DV>, Enc_13211717, Requires<[HasV60T,UseHVX]> {
+let Inst{7-5} = 0b001;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b00011100100;
+let hasNewValue = 1;
+let opNewValue = 0;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vaddhsat_dv_128B : HInst<
+(outs VecDblRegs128B:$Vdd32),
+(ins VecDblRegs128B:$Vuu32, VecDblRegs128B:$Vvv32),
+"$Vdd32.h = vadd($Vuu32.h,$Vvv32.h):sat",
+CVI_VA_DV, TypeCVI_VA_DV>, Enc_13211717, Requires<[HasV60T,UseHVX]> {
+let Inst{7-5} = 0b001;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b00011100100;
+let hasNewValue = 1;
+let opNewValue = 0;
+let DecoderNamespace = "EXT_mmvec";
+let isCodeGenOnly = 1;
+}
+def V6_vaddhsat_dv_alt : HInst<
+(outs VecDblRegs:$Vdd32),
+(ins VecDblRegs:$Vuu32, VecDblRegs:$Vvv32),
+"$Vdd32 = vaddh($Vuu32,$Vvv32):sat",
+PSEUDO, TypeMAPPING>, Requires<[HasV60T,UseHVX]> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vaddhsat_dv_alt_128B : HInst<
+(outs VecDblRegs128B:$Vdd32),
+(ins VecDblRegs128B:$Vuu32, VecDblRegs128B:$Vvv32),
+"$Vdd32 = vaddh($Vuu32,$Vvv32):sat",
+PSEUDO, TypeMAPPING>, Requires<[HasV60T,UseHVX]> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vaddhw : HInst<
+(outs VecDblRegs:$Vdd32),
+(ins VectorRegs:$Vu32, VectorRegs:$Vv32),
+"$Vdd32.w = vadd($Vu32.h,$Vv32.h)",
+CVI_VX_DV, TypeCVI_VX_DV>, Enc_15290236, Requires<[HasV60T,UseHVX]> {
+let Inst{7-5} = 0b100;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b00011100101;
+let hasNewValue = 1;
+let opNewValue = 0;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vaddhw_128B : HInst<
+(outs VecDblRegs128B:$Vdd32),
+(ins VectorRegs128B:$Vu32, VectorRegs128B:$Vv32),
+"$Vdd32.w = vadd($Vu32.h,$Vv32.h)",
+CVI_VX_DV, TypeCVI_VX_DV>, Enc_15290236, Requires<[HasV60T,UseHVX]> {
+let Inst{7-5} = 0b100;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b00011100101;
+let hasNewValue = 1;
+let opNewValue = 0;
+let DecoderNamespace = "EXT_mmvec";
+let isCodeGenOnly = 1;
+}
+def V6_vaddhw_acc : HInst<
+(outs VecDblRegs:$Vxx32),
+(ins VecDblRegs:$Vxx32in, VectorRegs:$Vu32, VectorRegs:$Vv32),
+"$Vxx32.w += vadd($Vu32.h,$Vv32.h)",
+CVI_VX_DV_LONG, TypeCVI_VX_DV>, Enc_5972412, Requires<[HasV62T,UseHVX]> {
+let Inst{7-5} = 0b010;
+let Inst{13-13} = 0b1;
+let Inst{31-21} = 0b00011100001;
+let hasNewValue = 1;
+let opNewValue = 0;
+let isAccumulator = 1;
+let DecoderNamespace = "EXT_mmvec";
+let Constraints = "$Vxx32 = $Vxx32in";
+}
+def V6_vaddhw_acc_128B : HInst<
+(outs VecDblRegs128B:$Vxx32),
+(ins VecDblRegs128B:$Vxx32in, VectorRegs128B:$Vu32, VectorRegs128B:$Vv32),
+"$Vxx32.w += vadd($Vu32.h,$Vv32.h)",
+CVI_VX_DV_LONG, TypeCVI_VX_DV>, Enc_5972412, Requires<[HasV62T,UseHVX]> {
+let Inst{7-5} = 0b010;
+let Inst{13-13} = 0b1;
+let Inst{31-21} = 0b00011100001;
+let hasNewValue = 1;
+let opNewValue = 0;
+let isAccumulator = 1;
+let DecoderNamespace = "EXT_mmvec";
+let isCodeGenOnly = 1;
+let Constraints = "$Vxx32 = $Vxx32in";
+}
+def V6_vaddhw_acc_alt : HInst<
+(outs VecDblRegs:$Vxx32),
+(ins VecDblRegs:$Vxx32in, VectorRegs:$Vu32, VectorRegs:$Vv32),
+"$Vxx32 += vaddh($Vu32,$Vv32)",
+PSEUDO, TypeMAPPING>, Requires<[HasV62T,UseHVX]> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isAccumulator = 1;
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+let Constraints = "$Vxx32 = $Vxx32in";
+}
+def V6_vaddhw_acc_alt_128B : HInst<
+(outs VecDblRegs128B:$Vxx32),
+(ins VecDblRegs128B:$Vxx32in, VectorRegs128B:$Vu32, VectorRegs128B:$Vv32),
+"$Vxx32 += vaddh($Vu32,$Vv32)",
+PSEUDO, TypeMAPPING>, Requires<[HasV62T,UseHVX]> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isAccumulator = 1;
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+let Constraints = "$Vxx32 = $Vxx32in";
+}
+def V6_vaddhw_alt : HInst<
+(outs VecDblRegs:$Vdd32),
+(ins VectorRegs:$Vu32, VectorRegs:$Vv32),
+"$Vdd32 = vaddh($Vu32,$Vv32)",
+PSEUDO, TypeMAPPING>, Requires<[HasV60T,UseHVX]> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vaddhw_alt_128B : HInst<
+(outs VecDblRegs128B:$Vdd32),
+(ins VectorRegs128B:$Vu32, VectorRegs128B:$Vv32),
+"$Vdd32 = vaddh($Vu32,$Vv32)",
+PSEUDO, TypeMAPPING>, Requires<[HasV60T,UseHVX]> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vaddubh : HInst<
+(outs VecDblRegs:$Vdd32),
+(ins VectorRegs:$Vu32, VectorRegs:$Vv32),
+"$Vdd32.h = vadd($Vu32.ub,$Vv32.ub)",
+CVI_VX_DV, TypeCVI_VX_DV>, Enc_15290236, Requires<[HasV60T,UseHVX]> {
+let Inst{7-5} = 0b010;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b00011100101;
+let hasNewValue = 1;
+let opNewValue = 0;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vaddubh_128B : HInst<
+(outs VecDblRegs128B:$Vdd32),
+(ins VectorRegs128B:$Vu32, VectorRegs128B:$Vv32),
+"$Vdd32.h = vadd($Vu32.ub,$Vv32.ub)",
+CVI_VX_DV, TypeCVI_VX_DV>, Enc_15290236, Requires<[HasV60T,UseHVX]> {
+let Inst{7-5} = 0b010;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b00011100101;
+let hasNewValue = 1;
+let opNewValue = 0;
+let DecoderNamespace = "EXT_mmvec";
+let isCodeGenOnly = 1;
+}
+def V6_vaddubh_acc : HInst<
+(outs VecDblRegs:$Vxx32),
+(ins VecDblRegs:$Vxx32in, VectorRegs:$Vu32, VectorRegs:$Vv32),
+"$Vxx32.h += vadd($Vu32.ub,$Vv32.ub)",
+CVI_VX_DV_LONG, TypeCVI_VX_DV>, Enc_5972412, Requires<[HasV62T,UseHVX]> {
+let Inst{7-5} = 0b101;
+let Inst{13-13} = 0b1;
+let Inst{31-21} = 0b00011100010;
+let hasNewValue = 1;
+let opNewValue = 0;
+let isAccumulator = 1;
+let DecoderNamespace = "EXT_mmvec";
+let Constraints = "$Vxx32 = $Vxx32in";
+}
+def V6_vaddubh_acc_128B : HInst<
+(outs VecDblRegs128B:$Vxx32),
+(ins VecDblRegs128B:$Vxx32in, VectorRegs128B:$Vu32, VectorRegs128B:$Vv32),
+"$Vxx32.h += vadd($Vu32.ub,$Vv32.ub)",
+CVI_VX_DV_LONG, TypeCVI_VX_DV>, Enc_5972412, Requires<[HasV62T,UseHVX]> {
+let Inst{7-5} = 0b101;
+let Inst{13-13} = 0b1;
+let Inst{31-21} = 0b00011100010;
+let hasNewValue = 1;
+let opNewValue = 0;
+let isAccumulator = 1;
+let DecoderNamespace = "EXT_mmvec";
+let isCodeGenOnly = 1;
+let Constraints = "$Vxx32 = $Vxx32in";
+}
+def V6_vaddubh_acc_alt : HInst<
+(outs VecDblRegs:$Vxx32),
+(ins VecDblRegs:$Vxx32in, VectorRegs:$Vu32, VectorRegs:$Vv32),
+"$Vxx32 += vaddub($Vu32,$Vv32)",
+PSEUDO, TypeMAPPING>, Requires<[HasV62T,UseHVX]> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isAccumulator = 1;
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+let Constraints = "$Vxx32 = $Vxx32in";
+}
+def V6_vaddubh_acc_alt_128B : HInst<
+(outs VecDblRegs128B:$Vxx32),
+(ins VecDblRegs128B:$Vxx32in, VectorRegs128B:$Vu32, VectorRegs128B:$Vv32),
+"$Vxx32 += vaddub($Vu32,$Vv32)",
+PSEUDO, TypeMAPPING>, Requires<[HasV62T,UseHVX]> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isAccumulator = 1;
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+let Constraints = "$Vxx32 = $Vxx32in";
+}
+def V6_vaddubh_alt : HInst<
+(outs VecDblRegs:$Vdd32),
+(ins VectorRegs:$Vu32, VectorRegs:$Vv32),
+"$Vdd32 = vaddub($Vu32,$Vv32)",
+PSEUDO, TypeMAPPING>, Requires<[HasV60T,UseHVX]> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vaddubh_alt_128B : HInst<
+(outs VecDblRegs128B:$Vdd32),
+(ins VectorRegs128B:$Vu32, VectorRegs128B:$Vv32),
+"$Vdd32 = vaddub($Vu32,$Vv32)",
+PSEUDO, TypeMAPPING>, Requires<[HasV60T,UseHVX]> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vaddubsat : HInst<
+(outs VectorRegs:$Vd32),
+(ins VectorRegs:$Vu32, VectorRegs:$Vv32),
+"$Vd32.ub = vadd($Vu32.ub,$Vv32.ub):sat",
+CVI_VA, TypeCVI_VA>, Enc_6223403, Requires<[HasV60T,UseHVX]> {
+let Inst{7-5} = 0b001;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b00011100010;
+let hasNewValue = 1;
+let opNewValue = 0;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vaddubsat_128B : HInst<
+(outs VectorRegs128B:$Vd32),
+(ins VectorRegs128B:$Vu32, VectorRegs128B:$Vv32),
+"$Vd32.ub = vadd($Vu32.ub,$Vv32.ub):sat",
+CVI_VA, TypeCVI_VA>, Enc_6223403, Requires<[HasV60T,UseHVX]> {
+let Inst{7-5} = 0b001;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b00011100010;
+let hasNewValue = 1;
+let opNewValue = 0;
+let DecoderNamespace = "EXT_mmvec";
+let isCodeGenOnly = 1;
+}
+def V6_vaddubsat_alt : HInst<
+(outs VectorRegs:$Vd32),
+(ins VectorRegs:$Vu32, VectorRegs:$Vv32),
+"$Vd32 = vaddub($Vu32,$Vv32):sat",
+PSEUDO, TypeMAPPING>, Requires<[HasV60T,UseHVX]> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vaddubsat_alt_128B : HInst<
+(outs VectorRegs128B:$Vd32),
+(ins VectorRegs128B:$Vu32, VectorRegs128B:$Vv32),
+"$Vd32 = vaddub($Vu32,$Vv32):sat",
+PSEUDO, TypeMAPPING>, Requires<[HasV60T,UseHVX]> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vaddubsat_dv : HInst<
+(outs VecDblRegs:$Vdd32),
+(ins VecDblRegs:$Vuu32, VecDblRegs:$Vvv32),
+"$Vdd32.ub = vadd($Vuu32.ub,$Vvv32.ub):sat",
+CVI_VA_DV, TypeCVI_VA_DV>, Enc_13211717, Requires<[HasV60T,UseHVX]> {
+let Inst{7-5} = 0b111;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b00011100011;
+let hasNewValue = 1;
+let opNewValue = 0;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vaddubsat_dv_128B : HInst<
+(outs VecDblRegs128B:$Vdd32),
+(ins VecDblRegs128B:$Vuu32, VecDblRegs128B:$Vvv32),
+"$Vdd32.ub = vadd($Vuu32.ub,$Vvv32.ub):sat",
+CVI_VA_DV, TypeCVI_VA_DV>, Enc_13211717, Requires<[HasV60T,UseHVX]> {
+let Inst{7-5} = 0b111;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b00011100011;
+let hasNewValue = 1;
+let opNewValue = 0;
+let DecoderNamespace = "EXT_mmvec";
+let isCodeGenOnly = 1;
+}
+def V6_vaddubsat_dv_alt : HInst<
+(outs VecDblRegs:$Vdd32),
+(ins VecDblRegs:$Vuu32, VecDblRegs:$Vvv32),
+"$Vdd32 = vaddub($Vuu32,$Vvv32):sat",
+PSEUDO, TypeMAPPING>, Requires<[HasV60T,UseHVX]> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vaddubsat_dv_alt_128B : HInst<
+(outs VecDblRegs128B:$Vdd32),
+(ins VecDblRegs128B:$Vuu32, VecDblRegs128B:$Vvv32),
+"$Vdd32 = vaddub($Vuu32,$Vvv32):sat",
+PSEUDO, TypeMAPPING>, Requires<[HasV60T,UseHVX]> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vaddububb_sat : HInst<
+(outs VectorRegs:$Vd32),
+(ins VectorRegs:$Vu32, VectorRegs:$Vv32),
+"$Vd32.ub = vadd($Vu32.ub,$Vv32.b):sat",
+CVI_VA, TypeCVI_VA>, Enc_6223403, Requires<[HasV62T,UseHVX]> {
+let Inst{7-5} = 0b100;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b00011110101;
+let hasNewValue = 1;
+let opNewValue = 0;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vaddububb_sat_128B : HInst<
+(outs VectorRegs128B:$Vd32),
+(ins VectorRegs128B:$Vu32, VectorRegs128B:$Vv32),
+"$Vd32.ub = vadd($Vu32.ub,$Vv32.b):sat",
+CVI_VA, TypeCVI_VA>, Enc_6223403, Requires<[HasV62T,UseHVX]> {
+let Inst{7-5} = 0b100;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b00011110101;
+let hasNewValue = 1;
+let opNewValue = 0;
+let DecoderNamespace = "EXT_mmvec";
+let isCodeGenOnly = 1;
+}
+def V6_vadduhsat : HInst<
+(outs VectorRegs:$Vd32),
+(ins VectorRegs:$Vu32, VectorRegs:$Vv32),
+"$Vd32.uh = vadd($Vu32.uh,$Vv32.uh):sat",
+CVI_VA, TypeCVI_VA>, Enc_6223403, Requires<[HasV60T,UseHVX]> {
+let Inst{7-5} = 0b010;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b00011100010;
+let hasNewValue = 1;
+let opNewValue = 0;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vadduhsat_128B : HInst<
+(outs VectorRegs128B:$Vd32),
+(ins VectorRegs128B:$Vu32, VectorRegs128B:$Vv32),
+"$Vd32.uh = vadd($Vu32.uh,$Vv32.uh):sat",
+CVI_VA, TypeCVI_VA>, Enc_6223403, Requires<[HasV60T,UseHVX]> {
+let Inst{7-5} = 0b010;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b00011100010;
+let hasNewValue = 1;
+let opNewValue = 0;
+let DecoderNamespace = "EXT_mmvec";
+let isCodeGenOnly = 1;
+}
+def V6_vadduhsat_alt : HInst<
+(outs VectorRegs:$Vd32),
+(ins VectorRegs:$Vu32, VectorRegs:$Vv32),
+"$Vd32 = vadduh($Vu32,$Vv32):sat",
+PSEUDO, TypeMAPPING>, Requires<[HasV60T,UseHVX]> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vadduhsat_alt_128B : HInst<
+(outs VectorRegs128B:$Vd32),
+(ins VectorRegs128B:$Vu32, VectorRegs128B:$Vv32),
+"$Vd32 = vadduh($Vu32,$Vv32):sat",
+PSEUDO, TypeMAPPING>, Requires<[HasV60T,UseHVX]> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vadduhsat_dv : HInst<
+(outs VecDblRegs:$Vdd32),
+(ins VecDblRegs:$Vuu32, VecDblRegs:$Vvv32),
+"$Vdd32.uh = vadd($Vuu32.uh,$Vvv32.uh):sat",
+CVI_VA_DV, TypeCVI_VA_DV>, Enc_13211717, Requires<[HasV60T,UseHVX]> {
+let Inst{7-5} = 0b000;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b00011100100;
+let hasNewValue = 1;
+let opNewValue = 0;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vadduhsat_dv_128B : HInst<
+(outs VecDblRegs128B:$Vdd32),
+(ins VecDblRegs128B:$Vuu32, VecDblRegs128B:$Vvv32),
+"$Vdd32.uh = vadd($Vuu32.uh,$Vvv32.uh):sat",
+CVI_VA_DV, TypeCVI_VA_DV>, Enc_13211717, Requires<[HasV60T,UseHVX]> {
+let Inst{7-5} = 0b000;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b00011100100;
+let hasNewValue = 1;
+let opNewValue = 0;
+let DecoderNamespace = "EXT_mmvec";
+let isCodeGenOnly = 1;
+}
+def V6_vadduhsat_dv_alt : HInst<
+(outs VecDblRegs:$Vdd32),
+(ins VecDblRegs:$Vuu32, VecDblRegs:$Vvv32),
+"$Vdd32 = vadduh($Vuu32,$Vvv32):sat",
+PSEUDO, TypeMAPPING>, Requires<[HasV60T,UseHVX]> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vadduhsat_dv_alt_128B : HInst<
+(outs VecDblRegs128B:$Vdd32),
+(ins VecDblRegs128B:$Vuu32, VecDblRegs128B:$Vvv32),
+"$Vdd32 = vadduh($Vuu32,$Vvv32):sat",
+PSEUDO, TypeMAPPING>, Requires<[HasV60T,UseHVX]> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vadduhw : HInst<
+(outs VecDblRegs:$Vdd32),
+(ins VectorRegs:$Vu32, VectorRegs:$Vv32),
+"$Vdd32.w = vadd($Vu32.uh,$Vv32.uh)",
+CVI_VX_DV, TypeCVI_VX_DV>, Enc_15290236, Requires<[HasV60T,UseHVX]> {
+let Inst{7-5} = 0b011;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b00011100101;
+let hasNewValue = 1;
+let opNewValue = 0;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vadduhw_128B : HInst<
+(outs VecDblRegs128B:$Vdd32),
+(ins VectorRegs128B:$Vu32, VectorRegs128B:$Vv32),
+"$Vdd32.w = vadd($Vu32.uh,$Vv32.uh)",
+CVI_VX_DV, TypeCVI_VX_DV>, Enc_15290236, Requires<[HasV60T,UseHVX]> {
+let Inst{7-5} = 0b011;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b00011100101;
+let hasNewValue = 1;
+let opNewValue = 0;
+let DecoderNamespace = "EXT_mmvec";
+let isCodeGenOnly = 1;
+}
+def V6_vadduhw_acc : HInst<
+(outs VecDblRegs:$Vxx32),
+(ins VecDblRegs:$Vxx32in, VectorRegs:$Vu32, VectorRegs:$Vv32),
+"$Vxx32.w += vadd($Vu32.uh,$Vv32.uh)",
+CVI_VX_DV_LONG, TypeCVI_VX_DV>, Enc_5972412, Requires<[HasV62T,UseHVX]> {
+let Inst{7-5} = 0b100;
+let Inst{13-13} = 0b1;
+let Inst{31-21} = 0b00011100010;
+let hasNewValue = 1;
+let opNewValue = 0;
+let isAccumulator = 1;
+let DecoderNamespace = "EXT_mmvec";
+let Constraints = "$Vxx32 = $Vxx32in";
+}
+def V6_vadduhw_acc_128B : HInst<
+(outs VecDblRegs128B:$Vxx32),
+(ins VecDblRegs128B:$Vxx32in, VectorRegs128B:$Vu32, VectorRegs128B:$Vv32),
+"$Vxx32.w += vadd($Vu32.uh,$Vv32.uh)",
+CVI_VX_DV_LONG, TypeCVI_VX_DV>, Enc_5972412, Requires<[HasV62T,UseHVX]> {
+let Inst{7-5} = 0b100;
+let Inst{13-13} = 0b1;
+let Inst{31-21} = 0b00011100010;
+let hasNewValue = 1;
+let opNewValue = 0;
+let isAccumulator = 1;
+let DecoderNamespace = "EXT_mmvec";
+let isCodeGenOnly = 1;
+let Constraints = "$Vxx32 = $Vxx32in";
+}
+def V6_vadduhw_acc_alt : HInst<
+(outs VecDblRegs:$Vxx32),
+(ins VecDblRegs:$Vxx32in, VectorRegs:$Vu32, VectorRegs:$Vv32),
+"$Vxx32 += vadduh($Vu32,$Vv32)",
+PSEUDO, TypeMAPPING>, Requires<[HasV62T,UseHVX]> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isAccumulator = 1;
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+let Constraints = "$Vxx32 = $Vxx32in";
+}
+def V6_vadduhw_acc_alt_128B : HInst<
+(outs VecDblRegs128B:$Vxx32),
+(ins VecDblRegs128B:$Vxx32in, VectorRegs128B:$Vu32, VectorRegs128B:$Vv32),
+"$Vxx32 += vadduh($Vu32,$Vv32)",
+PSEUDO, TypeMAPPING>, Requires<[HasV62T,UseHVX]> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isAccumulator = 1;
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+let Constraints = "$Vxx32 = $Vxx32in";
+}
+def V6_vadduhw_alt : HInst<
+(outs VecDblRegs:$Vdd32),
+(ins VectorRegs:$Vu32, VectorRegs:$Vv32),
+"$Vdd32 = vadduh($Vu32,$Vv32)",
+PSEUDO, TypeMAPPING>, Requires<[HasV60T,UseHVX]> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vadduhw_alt_128B : HInst<
+(outs VecDblRegs128B:$Vdd32),
+(ins VectorRegs128B:$Vu32, VectorRegs128B:$Vv32),
+"$Vdd32 = vadduh($Vu32,$Vv32)",
+PSEUDO, TypeMAPPING>, Requires<[HasV60T,UseHVX]> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vadduwsat : HInst<
+(outs VectorRegs:$Vd32),
+(ins VectorRegs:$Vu32, VectorRegs:$Vv32),
+"$Vd32.uw = vadd($Vu32.uw,$Vv32.uw):sat",
+CVI_VA, TypeCVI_VA>, Enc_6223403, Requires<[HasV62T,UseHVX]> {
+let Inst{7-5} = 0b001;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b00011111011;
+let hasNewValue = 1;
+let opNewValue = 0;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vadduwsat_128B : HInst<
+(outs VectorRegs128B:$Vd32),
+(ins VectorRegs128B:$Vu32, VectorRegs128B:$Vv32),
+"$Vd32.uw = vadd($Vu32.uw,$Vv32.uw):sat",
+CVI_VA, TypeCVI_VA>, Enc_6223403, Requires<[HasV62T,UseHVX]> {
+let Inst{7-5} = 0b001;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b00011111011;
+let hasNewValue = 1;
+let opNewValue = 0;
+let DecoderNamespace = "EXT_mmvec";
+let isCodeGenOnly = 1;
+}
+def V6_vadduwsat_alt : HInst<
+(outs VectorRegs:$Vd32),
+(ins VectorRegs:$Vu32, VectorRegs:$Vv32),
+"$Vd32 = vadduw($Vu32,$Vv32):sat",
+PSEUDO, TypeMAPPING>, Requires<[HasV62T,UseHVX]> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vadduwsat_alt_128B : HInst<
+(outs VectorRegs128B:$Vd32),
+(ins VectorRegs128B:$Vu32, VectorRegs128B:$Vv32),
+"$Vd32 = vadduw($Vu32,$Vv32):sat",
+PSEUDO, TypeMAPPING>, Requires<[HasV62T,UseHVX]> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vadduwsat_dv : HInst<
+(outs VecDblRegs:$Vdd32),
+(ins VecDblRegs:$Vuu32, VecDblRegs:$Vvv32),
+"$Vdd32.uw = vadd($Vuu32.uw,$Vvv32.uw):sat",
+CVI_VA_DV, TypeCVI_VA_DV>, Enc_13211717, Requires<[HasV62T,UseHVX]> {
+let Inst{7-5} = 0b010;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b00011110101;
+let hasNewValue = 1;
+let opNewValue = 0;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vadduwsat_dv_128B : HInst<
+(outs VecDblRegs128B:$Vdd32),
+(ins VecDblRegs128B:$Vuu32, VecDblRegs128B:$Vvv32),
+"$Vdd32.uw = vadd($Vuu32.uw,$Vvv32.uw):sat",
+CVI_VA_DV, TypeCVI_VA_DV>, Enc_13211717, Requires<[HasV62T,UseHVX]> {
+let Inst{7-5} = 0b010;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b00011110101;
+let hasNewValue = 1;
+let opNewValue = 0;
+let DecoderNamespace = "EXT_mmvec";
+let isCodeGenOnly = 1;
+}
+def V6_vadduwsat_dv_alt : HInst<
+(outs VecDblRegs:$Vdd32),
+(ins VecDblRegs:$Vuu32, VecDblRegs:$Vvv32),
+"$Vdd32 = vadduw($Vuu32,$Vvv32):sat",
+PSEUDO, TypeMAPPING>, Requires<[HasV62T,UseHVX]> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vadduwsat_dv_alt_128B : HInst<
+(outs VecDblRegs128B:$Vdd32),
+(ins VecDblRegs128B:$Vuu32, VecDblRegs128B:$Vvv32),
+"$Vdd32 = vadduw($Vuu32,$Vvv32):sat",
+PSEUDO, TypeMAPPING>, Requires<[HasV62T,UseHVX]> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vaddw : HInst<
+(outs VectorRegs:$Vd32),
+(ins VectorRegs:$Vu32, VectorRegs:$Vv32),
+"$Vd32.w = vadd($Vu32.w,$Vv32.w)",
+CVI_VA, TypeCVI_VA>, Enc_6223403, Requires<[HasV60T,UseHVX]> {
+let Inst{7-5} = 0b000;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b00011100010;
+let hasNewValue = 1;
+let opNewValue = 0;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vaddw_128B : HInst<
+(outs VectorRegs128B:$Vd32),
+(ins VectorRegs128B:$Vu32, VectorRegs128B:$Vv32),
+"$Vd32.w = vadd($Vu32.w,$Vv32.w)",
+CVI_VA, TypeCVI_VA>, Enc_6223403, Requires<[HasV60T,UseHVX]> {
+let Inst{7-5} = 0b000;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b00011100010;
+let hasNewValue = 1;
+let opNewValue = 0;
+let DecoderNamespace = "EXT_mmvec";
+let isCodeGenOnly = 1;
+}
+def V6_vaddw_alt : HInst<
+(outs VectorRegs:$Vd32),
+(ins VectorRegs:$Vu32, VectorRegs:$Vv32),
+"$Vd32 = vaddw($Vu32,$Vv32)",
+PSEUDO, TypeMAPPING>, Requires<[HasV60T,UseHVX]> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vaddw_alt_128B : HInst<
+(outs VectorRegs128B:$Vd32),
+(ins VectorRegs128B:$Vu32, VectorRegs128B:$Vv32),
+"$Vd32 = vaddw($Vu32,$Vv32)",
+PSEUDO, TypeMAPPING>, Requires<[HasV60T,UseHVX]> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vaddw_dv : HInst<
+(outs VecDblRegs:$Vdd32),
+(ins VecDblRegs:$Vuu32, VecDblRegs:$Vvv32),
+"$Vdd32.w = vadd($Vuu32.w,$Vvv32.w)",
+CVI_VA_DV, TypeCVI_VA_DV>, Enc_13211717, Requires<[HasV60T,UseHVX]> {
+let Inst{7-5} = 0b110;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b00011100011;
+let hasNewValue = 1;
+let opNewValue = 0;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vaddw_dv_128B : HInst<
+(outs VecDblRegs128B:$Vdd32),
+(ins VecDblRegs128B:$Vuu32, VecDblRegs128B:$Vvv32),
+"$Vdd32.w = vadd($Vuu32.w,$Vvv32.w)",
+CVI_VA_DV, TypeCVI_VA_DV>, Enc_13211717, Requires<[HasV60T,UseHVX]> {
+let Inst{7-5} = 0b110;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b00011100011;
+let hasNewValue = 1;
+let opNewValue = 0;
+let DecoderNamespace = "EXT_mmvec";
+let isCodeGenOnly = 1;
+}
+def V6_vaddw_dv_alt : HInst<
+(outs VecDblRegs:$Vdd32),
+(ins VecDblRegs:$Vuu32, VecDblRegs:$Vvv32),
+"$Vdd32 = vaddw($Vuu32,$Vvv32)",
+PSEUDO, TypeMAPPING>, Requires<[HasV60T,UseHVX]> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vaddw_dv_alt_128B : HInst<
+(outs VecDblRegs128B:$Vdd32),
+(ins VecDblRegs128B:$Vuu32, VecDblRegs128B:$Vvv32),
+"$Vdd32 = vaddw($Vuu32,$Vvv32)",
+PSEUDO, TypeMAPPING>, Requires<[HasV60T,UseHVX]> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vaddwnq : HInst<
+(outs VectorRegs:$Vx32),
+(ins VecPredRegs:$Qv4, VectorRegs:$Vx32in, VectorRegs:$Vu32),
+"if (!$Qv4) $Vx32.w += $Vu32.w",
+CVI_VA, TypeCVI_VA>, Enc_12535811, Requires<[HasV60T,UseHVX]> {
+let Inst{7-5} = 0b101;
+let Inst{13-13} = 0b1;
+let Inst{21-16} = 0b000001;
+let Inst{31-24} = 0b00011110;
+let hasNewValue = 1;
+let opNewValue = 0;
+let isAccumulator = 1;
+let DecoderNamespace = "EXT_mmvec";
+let Constraints = "$Vx32 = $Vx32in";
+}
+def V6_vaddwnq_128B : HInst<
+(outs VectorRegs128B:$Vx32),
+(ins VecPredRegs128B:$Qv4, VectorRegs128B:$Vx32in, VectorRegs128B:$Vu32),
+"if (!$Qv4) $Vx32.w += $Vu32.w",
+CVI_VA, TypeCVI_VA>, Enc_12535811, Requires<[HasV60T,UseHVX]> {
+let Inst{7-5} = 0b101;
+let Inst{13-13} = 0b1;
+let Inst{21-16} = 0b000001;
+let Inst{31-24} = 0b00011110;
+let hasNewValue = 1;
+let opNewValue = 0;
+let isAccumulator = 1;
+let DecoderNamespace = "EXT_mmvec";
+let isCodeGenOnly = 1;
+let Constraints = "$Vx32 = $Vx32in";
+}
+def V6_vaddwnq_alt : HInst<
+(outs VectorRegs:$Vx32),
+(ins VecPredRegs:$Qv4, VectorRegs:$Vx32in, VectorRegs:$Vu32),
+"if (!$Qv4.w) $Vx32.w += $Vu32.w",
+PSEUDO, TypeMAPPING>, Requires<[HasV60T,UseHVX]> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isAccumulator = 1;
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+let Constraints = "$Vx32 = $Vx32in";
+}
+def V6_vaddwnq_alt_128B : HInst<
+(outs VectorRegs128B:$Vx32),
+(ins VecPredRegs128B:$Qv4, VectorRegs128B:$Vx32in, VectorRegs128B:$Vu32),
+"if (!$Qv4.w) $Vx32.w += $Vu32.w",
+PSEUDO, TypeMAPPING>, Requires<[HasV60T,UseHVX]> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isAccumulator = 1;
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+let Constraints = "$Vx32 = $Vx32in";
+}
+def V6_vaddwq : HInst<
+(outs VectorRegs:$Vx32),
+(ins VecPredRegs:$Qv4, VectorRegs:$Vx32in, VectorRegs:$Vu32),
+"if ($Qv4) $Vx32.w += $Vu32.w",
+CVI_VA, TypeCVI_VA>, Enc_12535811, Requires<[HasV60T,UseHVX]> {
+let Inst{7-5} = 0b010;
+let Inst{13-13} = 0b1;
+let Inst{21-16} = 0b000001;
+let Inst{31-24} = 0b00011110;
+let hasNewValue = 1;
+let opNewValue = 0;
+let isAccumulator = 1;
+let DecoderNamespace = "EXT_mmvec";
+let Constraints = "$Vx32 = $Vx32in";
+}
+def V6_vaddwq_128B : HInst<
+(outs VectorRegs128B:$Vx32),
+(ins VecPredRegs128B:$Qv4, VectorRegs128B:$Vx32in, VectorRegs128B:$Vu32),
+"if ($Qv4) $Vx32.w += $Vu32.w",
+CVI_VA, TypeCVI_VA>, Enc_12535811, Requires<[HasV60T,UseHVX]> {
+let Inst{7-5} = 0b010;
+let Inst{13-13} = 0b1;
+let Inst{21-16} = 0b000001;
+let Inst{31-24} = 0b00011110;
+let hasNewValue = 1;
+let opNewValue = 0;
+let isAccumulator = 1;
+let DecoderNamespace = "EXT_mmvec";
+let isCodeGenOnly = 1;
+let Constraints = "$Vx32 = $Vx32in";
+}
+def V6_vaddwq_alt : HInst<
+(outs VectorRegs:$Vx32),
+(ins VecPredRegs:$Qv4, VectorRegs:$Vx32in, VectorRegs:$Vu32),
+"if ($Qv4.w) $Vx32.w += $Vu32.w",
+PSEUDO, TypeMAPPING>, Requires<[HasV60T,UseHVX]> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isAccumulator = 1;
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+let Constraints = "$Vx32 = $Vx32in";
+}
+def V6_vaddwq_alt_128B : HInst<
+(outs VectorRegs128B:$Vx32),
+(ins VecPredRegs128B:$Qv4, VectorRegs128B:$Vx32in, VectorRegs128B:$Vu32),
+"if ($Qv4.w) $Vx32.w += $Vu32.w",
+PSEUDO, TypeMAPPING>, Requires<[HasV60T,UseHVX]> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isAccumulator = 1;
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+let Constraints = "$Vx32 = $Vx32in";
+}
+def V6_vaddwsat : HInst<
+(outs VectorRegs:$Vd32),
+(ins VectorRegs:$Vu32, VectorRegs:$Vv32),
+"$Vd32.w = vadd($Vu32.w,$Vv32.w):sat",
+CVI_VA, TypeCVI_VA>, Enc_6223403, Requires<[HasV60T,UseHVX]> {
+let Inst{7-5} = 0b100;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b00011100010;
+let hasNewValue = 1;
+let opNewValue = 0;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vaddwsat_128B : HInst<
+(outs VectorRegs128B:$Vd32),
+(ins VectorRegs128B:$Vu32, VectorRegs128B:$Vv32),
+"$Vd32.w = vadd($Vu32.w,$Vv32.w):sat",
+CVI_VA, TypeCVI_VA>, Enc_6223403, Requires<[HasV60T,UseHVX]> {
+let Inst{7-5} = 0b100;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b00011100010;
+let hasNewValue = 1;
+let opNewValue = 0;
+let DecoderNamespace = "EXT_mmvec";
+let isCodeGenOnly = 1;
+}
+def V6_vaddwsat_alt : HInst<
+(outs VectorRegs:$Vd32),
+(ins VectorRegs:$Vu32, VectorRegs:$Vv32),
+"$Vd32 = vaddw($Vu32,$Vv32):sat",
+PSEUDO, TypeMAPPING>, Requires<[HasV60T,UseHVX]> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vaddwsat_alt_128B : HInst<
+(outs VectorRegs128B:$Vd32),
+(ins VectorRegs128B:$Vu32, VectorRegs128B:$Vv32),
+"$Vd32 = vaddw($Vu32,$Vv32):sat",
+PSEUDO, TypeMAPPING>, Requires<[HasV60T,UseHVX]> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vaddwsat_dv : HInst<
+(outs VecDblRegs:$Vdd32),
+(ins VecDblRegs:$Vuu32, VecDblRegs:$Vvv32),
+"$Vdd32.w = vadd($Vuu32.w,$Vvv32.w):sat",
+CVI_VA_DV, TypeCVI_VA_DV>, Enc_13211717, Requires<[HasV60T,UseHVX]> {
+let Inst{7-5} = 0b010;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b00011100100;
+let hasNewValue = 1;
+let opNewValue = 0;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vaddwsat_dv_128B : HInst<
+(outs VecDblRegs128B:$Vdd32),
+(ins VecDblRegs128B:$Vuu32, VecDblRegs128B:$Vvv32),
+"$Vdd32.w = vadd($Vuu32.w,$Vvv32.w):sat",
+CVI_VA_DV, TypeCVI_VA_DV>, Enc_13211717, Requires<[HasV60T,UseHVX]> {
+let Inst{7-5} = 0b010;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b00011100100;
+let hasNewValue = 1;
+let opNewValue = 0;
+let DecoderNamespace = "EXT_mmvec";
+let isCodeGenOnly = 1;
+}
+def V6_vaddwsat_dv_alt : HInst<
+(outs VecDblRegs:$Vdd32),
+(ins VecDblRegs:$Vuu32, VecDblRegs:$Vvv32),
+"$Vdd32 = vaddw($Vuu32,$Vvv32):sat",
+PSEUDO, TypeMAPPING>, Requires<[HasV60T,UseHVX]> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vaddwsat_dv_alt_128B : HInst<
+(outs VecDblRegs128B:$Vdd32),
+(ins VecDblRegs128B:$Vuu32, VecDblRegs128B:$Vvv32),
+"$Vdd32 = vaddw($Vuu32,$Vvv32):sat",
+PSEUDO, TypeMAPPING>, Requires<[HasV60T,UseHVX]> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_valignb : HInst<
+(outs VectorRegs:$Vd32),
+(ins VectorRegs:$Vu32, VectorRegs:$Vv32, IntRegsLow8:$Rt8),
+"$Vd32 = valign($Vu32,$Vv32,$Rt8)",
+CVI_VP_LONG, TypeCVI_VP>, Enc_11083408, Requires<[HasV60T,UseHVX]> {
+let Inst{7-5} = 0b000;
+let Inst{13-13} = 0b0;
+let Inst{31-24} = 0b00011011;
+let hasNewValue = 1;
+let opNewValue = 0;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_valignb_128B : HInst<
+(outs VectorRegs128B:$Vd32),
+(ins VectorRegs128B:$Vu32, VectorRegs128B:$Vv32, IntRegsLow8:$Rt8),
+"$Vd32 = valign($Vu32,$Vv32,$Rt8)",
+CVI_VP_LONG, TypeCVI_VP>, Enc_11083408, Requires<[HasV60T,UseHVX]> {
+let Inst{7-5} = 0b000;
+let Inst{13-13} = 0b0;
+let Inst{31-24} = 0b00011011;
+let hasNewValue = 1;
+let opNewValue = 0;
+let DecoderNamespace = "EXT_mmvec";
+let isCodeGenOnly = 1;
+}
+def V6_valignbi : HInst<
+(outs VectorRegs:$Vd32),
+(ins VectorRegs:$Vu32, VectorRegs:$Vv32, u3_0Imm:$Ii),
+"$Vd32 = valign($Vu32,$Vv32,#$Ii)",
+CVI_VP_LONG, TypeCVI_VP>, Enc_7171569, Requires<[HasV60T,UseHVX]> {
+let Inst{13-13} = 0b1;
+let Inst{31-21} = 0b00011110001;
+let hasNewValue = 1;
+let opNewValue = 0;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_valignbi_128B : HInst<
+(outs VectorRegs128B:$Vd32),
+(ins VectorRegs128B:$Vu32, VectorRegs128B:$Vv32, u3_0Imm:$Ii),
+"$Vd32 = valign($Vu32,$Vv32,#$Ii)",
+CVI_VP_LONG, TypeCVI_VP>, Enc_7171569, Requires<[HasV60T,UseHVX]> {
+let Inst{13-13} = 0b1;
+let Inst{31-21} = 0b00011110001;
+let hasNewValue = 1;
+let opNewValue = 0;
+let DecoderNamespace = "EXT_mmvec";
+let isCodeGenOnly = 1;
+}
+def V6_vand : HInst<
+(outs VectorRegs:$Vd32),
+(ins VectorRegs:$Vu32, VectorRegs:$Vv32),
+"$Vd32 = vand($Vu32,$Vv32)",
+CVI_VA, TypeCVI_VA>, Enc_6223403, Requires<[HasV60T,UseHVX]> {
+let Inst{7-5} = 0b101;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b00011100001;
+let hasNewValue = 1;
+let opNewValue = 0;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vand_128B : HInst<
+(outs VectorRegs128B:$Vd32),
+(ins VectorRegs128B:$Vu32, VectorRegs128B:$Vv32),
+"$Vd32 = vand($Vu32,$Vv32)",
+CVI_VA, TypeCVI_VA>, Enc_6223403, Requires<[HasV60T,UseHVX]> {
+let Inst{7-5} = 0b101;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b00011100001;
+let hasNewValue = 1;
+let opNewValue = 0;
+let DecoderNamespace = "EXT_mmvec";
+let isCodeGenOnly = 1;
+}
+def V6_vandnqrt : HInst<
+(outs VectorRegs:$Vd32),
+(ins VecPredRegs:$Qu4, IntRegs:$Rt32),
+"$Vd32 = vand(!$Qu4,$Rt32)",
+CVI_VX, TypeCVI_VX>, Enc_4711514, Requires<[HasV62T,UseHVX]> {
+let Inst{7-5} = 0b101;
+let Inst{13-10} = 0b0001;
+let Inst{31-21} = 0b00011001101;
+let hasNewValue = 1;
+let opNewValue = 0;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vandnqrt_128B : HInst<
+(outs VectorRegs128B:$Vd32),
+(ins VecPredRegs128B:$Qu4, IntRegs:$Rt32),
+"$Vd32 = vand(!$Qu4,$Rt32)",
+CVI_VX, TypeCVI_VX>, Enc_4711514, Requires<[HasV62T,UseHVX]> {
+let Inst{7-5} = 0b101;
+let Inst{13-10} = 0b0001;
+let Inst{31-21} = 0b00011001101;
+let hasNewValue = 1;
+let opNewValue = 0;
+let DecoderNamespace = "EXT_mmvec";
+let isCodeGenOnly = 1;
+}
+def V6_vandnqrt_acc : HInst<
+(outs VectorRegs:$Vx32),
+(ins VectorRegs:$Vx32in, VecPredRegs:$Qu4, IntRegs:$Rt32),
+"$Vx32 |= vand(!$Qu4,$Rt32)",
+CVI_VX, TypeCVI_VX>, Enc_4944558, Requires<[HasV62T,UseHVX]> {
+let Inst{7-5} = 0b011;
+let Inst{13-10} = 0b1001;
+let Inst{31-21} = 0b00011001011;
+let hasNewValue = 1;
+let opNewValue = 0;
+let isAccumulator = 1;
+let DecoderNamespace = "EXT_mmvec";
+let Constraints = "$Vx32 = $Vx32in";
+}
+def V6_vandnqrt_acc_128B : HInst<
+(outs VectorRegs128B:$Vx32),
+(ins VectorRegs128B:$Vx32in, VecPredRegs128B:$Qu4, IntRegs:$Rt32),
+"$Vx32 |= vand(!$Qu4,$Rt32)",
+CVI_VX, TypeCVI_VX>, Enc_4944558, Requires<[HasV62T,UseHVX]> {
+let Inst{7-5} = 0b011;
+let Inst{13-10} = 0b1001;
+let Inst{31-21} = 0b00011001011;
+let hasNewValue = 1;
+let opNewValue = 0;
+let isAccumulator = 1;
+let DecoderNamespace = "EXT_mmvec";
+let isCodeGenOnly = 1;
+let Constraints = "$Vx32 = $Vx32in";
+}
+def V6_vandnqrt_acc_alt : HInst<
+(outs VectorRegs:$Vx32),
+(ins VectorRegs:$Vx32in, VecPredRegs:$Qu4, IntRegs:$Rt32),
+"$Vx32.ub |= vand(!$Qu4.ub,$Rt32.ub)",
+PSEUDO, TypeMAPPING>, Requires<[HasV62T,UseHVX]> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isAccumulator = 1;
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+let Constraints = "$Vx32 = $Vx32in";
+}
+def V6_vandnqrt_acc_alt_128B : HInst<
+(outs VectorRegs128B:$Vx32),
+(ins VectorRegs128B:$Vx32in, VecPredRegs128B:$Qu4, IntRegs:$Rt32),
+"$Vx32.ub |= vand(!$Qu4.ub,$Rt32.ub)",
+PSEUDO, TypeMAPPING>, Requires<[HasV62T,UseHVX]> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isAccumulator = 1;
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+let Constraints = "$Vx32 = $Vx32in";
+}
+def V6_vandnqrt_alt : HInst<
+(outs VectorRegs:$Vd32),
+(ins VecPredRegs:$Qu4, IntRegs:$Rt32),
+"$Vd32.ub = vand(!$Qu4.ub,$Rt32.ub)",
+PSEUDO, TypeMAPPING>, Requires<[HasV62T,UseHVX]> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vandnqrt_alt_128B : HInst<
+(outs VectorRegs128B:$Vd32),
+(ins VecPredRegs128B:$Qu4, IntRegs:$Rt32),
+"$Vd32.ub = vand(!$Qu4.ub,$Rt32.ub)",
+PSEUDO, TypeMAPPING>, Requires<[HasV62T,UseHVX]> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vandqrt : HInst<
+(outs VectorRegs:$Vd32),
+(ins VecPredRegs:$Qu4, IntRegs:$Rt32),
+"$Vd32 = vand($Qu4,$Rt32)",
+CVI_VX_LATE, TypeCVI_VX>, Enc_4711514, Requires<[HasV60T,UseHVX]> {
+let Inst{7-5} = 0b101;
+let Inst{13-10} = 0b0000;
+let Inst{31-21} = 0b00011001101;
+let hasNewValue = 1;
+let opNewValue = 0;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vandqrt_128B : HInst<
+(outs VectorRegs128B:$Vd32),
+(ins VecPredRegs128B:$Qu4, IntRegs:$Rt32),
+"$Vd32 = vand($Qu4,$Rt32)",
+CVI_VX_LATE, TypeCVI_VX>, Enc_4711514, Requires<[HasV60T,UseHVX]> {
+let Inst{7-5} = 0b101;
+let Inst{13-10} = 0b0000;
+let Inst{31-21} = 0b00011001101;
+let hasNewValue = 1;
+let opNewValue = 0;
+let DecoderNamespace = "EXT_mmvec";
+let isCodeGenOnly = 1;
+}
+def V6_vandqrt_acc : HInst<
+(outs VectorRegs:$Vx32),
+(ins VectorRegs:$Vx32in, VecPredRegs:$Qu4, IntRegs:$Rt32),
+"$Vx32 |= vand($Qu4,$Rt32)",
+CVI_VX_LATE, TypeCVI_VX>, Enc_4944558, Requires<[HasV60T,UseHVX]> {
+let Inst{7-5} = 0b011;
+let Inst{13-10} = 0b1000;
+let Inst{31-21} = 0b00011001011;
+let hasNewValue = 1;
+let opNewValue = 0;
+let isAccumulator = 1;
+let DecoderNamespace = "EXT_mmvec";
+let Constraints = "$Vx32 = $Vx32in";
+}
+def V6_vandqrt_acc_128B : HInst<
+(outs VectorRegs128B:$Vx32),
+(ins VectorRegs128B:$Vx32in, VecPredRegs128B:$Qu4, IntRegs:$Rt32),
+"$Vx32 |= vand($Qu4,$Rt32)",
+CVI_VX_LATE, TypeCVI_VX>, Enc_4944558, Requires<[HasV60T,UseHVX]> {
+let Inst{7-5} = 0b011;
+let Inst{13-10} = 0b1000;
+let Inst{31-21} = 0b00011001011;
+let hasNewValue = 1;
+let opNewValue = 0;
+let isAccumulator = 1;
+let DecoderNamespace = "EXT_mmvec";
+let isCodeGenOnly = 1;
+let Constraints = "$Vx32 = $Vx32in";
+}
+def V6_vandqrt_acc_alt : HInst<
+(outs VectorRegs:$Vx32),
+(ins VectorRegs:$Vx32in, VecPredRegs:$Qu4, IntRegs:$Rt32),
+"$Vx32.ub |= vand($Qu4.ub,$Rt32.ub)",
+PSEUDO, TypeMAPPING>, Requires<[HasV60T,UseHVX]> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isAccumulator = 1;
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+let Constraints = "$Vx32 = $Vx32in";
+}
+def V6_vandqrt_acc_alt_128B : HInst<
+(outs VectorRegs128B:$Vx32),
+(ins VectorRegs128B:$Vx32in, VecPredRegs128B:$Qu4, IntRegs:$Rt32),
+"$Vx32.ub |= vand($Qu4.ub,$Rt32.ub)",
+PSEUDO, TypeMAPPING>, Requires<[HasV60T,UseHVX]> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isAccumulator = 1;
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+let Constraints = "$Vx32 = $Vx32in";
+}
+def V6_vandqrt_alt : HInst<
+(outs VectorRegs:$Vd32),
+(ins VecPredRegs:$Qu4, IntRegs:$Rt32),
+"$Vd32.ub = vand($Qu4.ub,$Rt32.ub)",
+PSEUDO, TypeMAPPING>, Requires<[HasV60T,UseHVX]> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vandqrt_alt_128B : HInst<
+(outs VectorRegs128B:$Vd32),
+(ins VecPredRegs128B:$Qu4, IntRegs:$Rt32),
+"$Vd32.ub = vand($Qu4.ub,$Rt32.ub)",
+PSEUDO, TypeMAPPING>, Requires<[HasV60T,UseHVX]> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vandvnqv : HInst<
+(outs VectorRegs:$Vd32),
+(ins VecPredRegs:$Qv4, VectorRegs:$Vu32),
+"$Vd32 = vand(!$Qv4,$Vu32)",
+CVI_VA, TypeCVI_VA>, Enc_1220199, Requires<[HasV62T,UseHVX]> {
+let Inst{7-5} = 0b001;
+let Inst{13-13} = 0b1;
+let Inst{21-16} = 0b000011;
+let Inst{31-24} = 0b00011110;
+let hasNewValue = 1;
+let opNewValue = 0;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vandvnqv_128B : HInst<
+(outs VectorRegs128B:$Vd32),
+(ins VecPredRegs128B:$Qv4, VectorRegs128B:$Vu32),
+"$Vd32 = vand(!$Qv4,$Vu32)",
+CVI_VA, TypeCVI_VA>, Enc_1220199, Requires<[HasV62T,UseHVX]> {
+let Inst{7-5} = 0b001;
+let Inst{13-13} = 0b1;
+let Inst{21-16} = 0b000011;
+let Inst{31-24} = 0b00011110;
+let hasNewValue = 1;
+let opNewValue = 0;
+let DecoderNamespace = "EXT_mmvec";
+let isCodeGenOnly = 1;
+}
+def V6_vandvqv : HInst<
+(outs VectorRegs:$Vd32),
+(ins VecPredRegs:$Qv4, VectorRegs:$Vu32),
+"$Vd32 = vand($Qv4,$Vu32)",
+CVI_VA, TypeCVI_VA>, Enc_1220199, Requires<[HasV62T,UseHVX]> {
+let Inst{7-5} = 0b000;
+let Inst{13-13} = 0b1;
+let Inst{21-16} = 0b000011;
+let Inst{31-24} = 0b00011110;
+let hasNewValue = 1;
+let opNewValue = 0;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vandvqv_128B : HInst<
+(outs VectorRegs128B:$Vd32),
+(ins VecPredRegs128B:$Qv4, VectorRegs128B:$Vu32),
+"$Vd32 = vand($Qv4,$Vu32)",
+CVI_VA, TypeCVI_VA>, Enc_1220199, Requires<[HasV62T,UseHVX]> {
+let Inst{7-5} = 0b000;
+let Inst{13-13} = 0b1;
+let Inst{21-16} = 0b000011;
+let Inst{31-24} = 0b00011110;
+let hasNewValue = 1;
+let opNewValue = 0;
+let DecoderNamespace = "EXT_mmvec";
+let isCodeGenOnly = 1;
+}
+def V6_vandvrt : HInst<
+(outs VecPredRegs:$Qd4),
+(ins VectorRegs:$Vu32, IntRegs:$Rt32),
+"$Qd4 = vand($Vu32,$Rt32)",
+CVI_VX_LATE, TypeCVI_VX>, Enc_11498120, Requires<[HasV60T,UseHVX]> {
+let Inst{7-2} = 0b010010;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b00011001101;
+let hasNewValue = 1;
+let opNewValue = 0;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vandvrt_128B : HInst<
+(outs VecPredRegs128B:$Qd4),
+(ins VectorRegs128B:$Vu32, IntRegs:$Rt32),
+"$Qd4 = vand($Vu32,$Rt32)",
+CVI_VX_LATE, TypeCVI_VX>, Enc_11498120, Requires<[HasV60T,UseHVX]> {
+let Inst{7-2} = 0b010010;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b00011001101;
+let hasNewValue = 1;
+let opNewValue = 0;
+let DecoderNamespace = "EXT_mmvec";
+let isCodeGenOnly = 1;
+}
+def V6_vandvrt_acc : HInst<
+(outs VecPredRegs:$Qx4),
+(ins VecPredRegs:$Qx4in, VectorRegs:$Vu32, IntRegs:$Rt32),
+"$Qx4 |= vand($Vu32,$Rt32)",
+CVI_VX_LATE, TypeCVI_VX>, Enc_10612292, Requires<[HasV60T,UseHVX]> {
+let Inst{7-2} = 0b100000;
+let Inst{13-13} = 0b1;
+let Inst{31-21} = 0b00011001011;
+let hasNewValue = 1;
+let opNewValue = 0;
+let isAccumulator = 1;
+let DecoderNamespace = "EXT_mmvec";
+let Constraints = "$Qx4 = $Qx4in";
+}
+def V6_vandvrt_acc_128B : HInst<
+(outs VecPredRegs128B:$Qx4),
+(ins VecPredRegs128B:$Qx4in, VectorRegs128B:$Vu32, IntRegs:$Rt32),
+"$Qx4 |= vand($Vu32,$Rt32)",
+CVI_VX_LATE, TypeCVI_VX>, Enc_10612292, Requires<[HasV60T,UseHVX]> {
+let Inst{7-2} = 0b100000;
+let Inst{13-13} = 0b1;
+let Inst{31-21} = 0b00011001011;
+let hasNewValue = 1;
+let opNewValue = 0;
+let isAccumulator = 1;
+let DecoderNamespace = "EXT_mmvec";
+let isCodeGenOnly = 1;
+let Constraints = "$Qx4 = $Qx4in";
+}
+def V6_vandvrt_acc_alt : HInst<
+(outs VecPredRegs:$Qx4),
+(ins VecPredRegs:$Qx4in, VectorRegs:$Vu32, IntRegs:$Rt32),
+"$Qx4.ub |= vand($Vu32.ub,$Rt32.ub)",
+PSEUDO, TypeMAPPING>, Requires<[HasV60T,UseHVX]> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isAccumulator = 1;
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+let Constraints = "$Qx4 = $Qx4in";
+}
+def V6_vandvrt_acc_alt_128B : HInst<
+(outs VecPredRegs128B:$Qx4),
+(ins VecPredRegs128B:$Qx4in, VectorRegs128B:$Vu32, IntRegs:$Rt32),
+"$Qx4.ub |= vand($Vu32.ub,$Rt32.ub)",
+PSEUDO, TypeMAPPING>, Requires<[HasV60T,UseHVX]> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isAccumulator = 1;
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+let Constraints = "$Qx4 = $Qx4in";
+}
+def V6_vandvrt_alt : HInst<
+(outs VecPredRegs:$Qd4),
+(ins VectorRegs:$Vu32, IntRegs:$Rt32),
+"$Qd4.ub = vand($Vu32.ub,$Rt32.ub)",
+PSEUDO, TypeMAPPING>, Requires<[HasV60T,UseHVX]> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vandvrt_alt_128B : HInst<
+(outs VecPredRegs128B:$Qd4),
+(ins VectorRegs128B:$Vu32, IntRegs:$Rt32),
+"$Qd4.ub = vand($Vu32.ub,$Rt32.ub)",
+PSEUDO, TypeMAPPING>, Requires<[HasV60T,UseHVX]> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vaslh : HInst<
+(outs VectorRegs:$Vd32),
+(ins VectorRegs:$Vu32, IntRegs:$Rt32),
+"$Vd32.h = vasl($Vu32.h,$Rt32)",
+CVI_VS, TypeCVI_VS>, Enc_16214129, Requires<[HasV60T,UseHVX]> {
+let Inst{7-5} = 0b000;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b00011001100;
+let hasNewValue = 1;
+let opNewValue = 0;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vaslh_128B : HInst<
+(outs VectorRegs128B:$Vd32),
+(ins VectorRegs128B:$Vu32, IntRegs:$Rt32),
+"$Vd32.h = vasl($Vu32.h,$Rt32)",
+CVI_VS, TypeCVI_VS>, Enc_16214129, Requires<[HasV60T,UseHVX]> {
+let Inst{7-5} = 0b000;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b00011001100;
+let hasNewValue = 1;
+let opNewValue = 0;
+let DecoderNamespace = "EXT_mmvec";
+let isCodeGenOnly = 1;
+}
+def V6_vaslh_alt : HInst<
+(outs VectorRegs:$Vd32),
+(ins VectorRegs:$Vu32, IntRegs:$Rt32),
+"$Vd32 = vaslh($Vu32,$Rt32)",
+PSEUDO, TypeMAPPING>, Requires<[HasV60T,UseHVX]> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vaslh_alt_128B : HInst<
+(outs VectorRegs128B:$Vd32),
+(ins VectorRegs128B:$Vu32, IntRegs:$Rt32),
+"$Vd32 = vaslh($Vu32,$Rt32)",
+PSEUDO, TypeMAPPING>, Requires<[HasV60T,UseHVX]> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vaslhv : HInst<
+(outs VectorRegs:$Vd32),
+(ins VectorRegs:$Vu32, VectorRegs:$Vv32),
+"$Vd32.h = vasl($Vu32.h,$Vv32.h)",
+CVI_VS, TypeCVI_VS>, Enc_6223403, Requires<[HasV60T,UseHVX]> {
+let Inst{7-5} = 0b101;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b00011111101;
+let hasNewValue = 1;
+let opNewValue = 0;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vaslhv_128B : HInst<
+(outs VectorRegs128B:$Vd32),
+(ins VectorRegs128B:$Vu32, VectorRegs128B:$Vv32),
+"$Vd32.h = vasl($Vu32.h,$Vv32.h)",
+CVI_VS, TypeCVI_VS>, Enc_6223403, Requires<[HasV60T,UseHVX]> {
+let Inst{7-5} = 0b101;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b00011111101;
+let hasNewValue = 1;
+let opNewValue = 0;
+let DecoderNamespace = "EXT_mmvec";
+let isCodeGenOnly = 1;
+}
+def V6_vaslhv_alt : HInst<
+(outs VectorRegs:$Vd32),
+(ins VectorRegs:$Vu32, VectorRegs:$Vv32),
+"$Vd32 = vaslh($Vu32,$Vv32)",
+PSEUDO, TypeMAPPING>, Requires<[HasV60T,UseHVX]> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vaslhv_alt_128B : HInst<
+(outs VectorRegs128B:$Vd32),
+(ins VectorRegs128B:$Vu32, VectorRegs128B:$Vv32),
+"$Vd32 = vaslh($Vu32,$Vv32)",
+PSEUDO, TypeMAPPING>, Requires<[HasV60T,UseHVX]> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vaslw : HInst<
+(outs VectorRegs:$Vd32),
+(ins VectorRegs:$Vu32, IntRegs:$Rt32),
+"$Vd32.w = vasl($Vu32.w,$Rt32)",
+CVI_VS, TypeCVI_VS>, Enc_16214129, Requires<[HasV60T,UseHVX]> {
+let Inst{7-5} = 0b111;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b00011001011;
+let hasNewValue = 1;
+let opNewValue = 0;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vaslw_128B : HInst<
+(outs VectorRegs128B:$Vd32),
+(ins VectorRegs128B:$Vu32, IntRegs:$Rt32),
+"$Vd32.w = vasl($Vu32.w,$Rt32)",
+CVI_VS, TypeCVI_VS>, Enc_16214129, Requires<[HasV60T,UseHVX]> {
+let Inst{7-5} = 0b111;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b00011001011;
+let hasNewValue = 1;
+let opNewValue = 0;
+let DecoderNamespace = "EXT_mmvec";
+let isCodeGenOnly = 1;
+}
+def V6_vaslw_acc : HInst<
+(outs VectorRegs:$Vx32),
+(ins VectorRegs:$Vx32in, VectorRegs:$Vu32, IntRegs:$Rt32),
+"$Vx32.w += vasl($Vu32.w,$Rt32)",
+CVI_VS, TypeCVI_VS>, Enc_10058269, Requires<[HasV60T,UseHVX]> {
+let Inst{7-5} = 0b010;
+let Inst{13-13} = 0b1;
+let Inst{31-21} = 0b00011001011;
+let hasNewValue = 1;
+let opNewValue = 0;
+let isAccumulator = 1;
+let DecoderNamespace = "EXT_mmvec";
+let Constraints = "$Vx32 = $Vx32in";
+}
+def V6_vaslw_acc_128B : HInst<
+(outs VectorRegs128B:$Vx32),
+(ins VectorRegs128B:$Vx32in, VectorRegs128B:$Vu32, IntRegs:$Rt32),
+"$Vx32.w += vasl($Vu32.w,$Rt32)",
+CVI_VS, TypeCVI_VS>, Enc_10058269, Requires<[HasV60T,UseHVX]> {
+let Inst{7-5} = 0b010;
+let Inst{13-13} = 0b1;
+let Inst{31-21} = 0b00011001011;
+let hasNewValue = 1;
+let opNewValue = 0;
+let isAccumulator = 1;
+let DecoderNamespace = "EXT_mmvec";
+let isCodeGenOnly = 1;
+let Constraints = "$Vx32 = $Vx32in";
+}
+def V6_vaslw_acc_alt : HInst<
+(outs VectorRegs:$Vx32),
+(ins VectorRegs:$Vx32in, VectorRegs:$Vu32, IntRegs:$Rt32),
+"$Vx32 += vaslw($Vu32,$Rt32)",
+PSEUDO, TypeMAPPING>, Requires<[HasV60T,UseHVX]> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isAccumulator = 1;
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+let Constraints = "$Vx32 = $Vx32in";
+}
+def V6_vaslw_acc_alt_128B : HInst<
+(outs VectorRegs128B:$Vx32),
+(ins VectorRegs128B:$Vx32in, VectorRegs128B:$Vu32, IntRegs:$Rt32),
+"$Vx32 += vaslw($Vu32,$Rt32)",
+PSEUDO, TypeMAPPING>, Requires<[HasV60T,UseHVX]> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isAccumulator = 1;
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+let Constraints = "$Vx32 = $Vx32in";
+}
+def V6_vaslw_alt : HInst<
+(outs VectorRegs:$Vd32),
+(ins VectorRegs:$Vu32, IntRegs:$Rt32),
+"$Vd32 = vaslw($Vu32,$Rt32)",
+PSEUDO, TypeMAPPING>, Requires<[HasV60T,UseHVX]> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vaslw_alt_128B : HInst<
+(outs VectorRegs128B:$Vd32),
+(ins VectorRegs128B:$Vu32, IntRegs:$Rt32),
+"$Vd32 = vaslw($Vu32,$Rt32)",
+PSEUDO, TypeMAPPING>, Requires<[HasV60T,UseHVX]> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vaslwv : HInst<
+(outs VectorRegs:$Vd32),
+(ins VectorRegs:$Vu32, VectorRegs:$Vv32),
+"$Vd32.w = vasl($Vu32.w,$Vv32.w)",
+CVI_VS, TypeCVI_VS>, Enc_6223403, Requires<[HasV60T,UseHVX]> {
+let Inst{7-5} = 0b100;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b00011111101;
+let hasNewValue = 1;
+let opNewValue = 0;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vaslwv_128B : HInst<
+(outs VectorRegs128B:$Vd32),
+(ins VectorRegs128B:$Vu32, VectorRegs128B:$Vv32),
+"$Vd32.w = vasl($Vu32.w,$Vv32.w)",
+CVI_VS, TypeCVI_VS>, Enc_6223403, Requires<[HasV60T,UseHVX]> {
+let Inst{7-5} = 0b100;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b00011111101;
+let hasNewValue = 1;
+let opNewValue = 0;
+let DecoderNamespace = "EXT_mmvec";
+let isCodeGenOnly = 1;
+}
+def V6_vaslwv_alt : HInst<
+(outs VectorRegs:$Vd32),
+(ins VectorRegs:$Vu32, VectorRegs:$Vv32),
+"$Vd32 = vaslw($Vu32,$Vv32)",
+PSEUDO, TypeMAPPING>, Requires<[HasV60T,UseHVX]> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vaslwv_alt_128B : HInst<
+(outs VectorRegs128B:$Vd32),
+(ins VectorRegs128B:$Vu32, VectorRegs128B:$Vv32),
+"$Vd32 = vaslw($Vu32,$Vv32)",
+PSEUDO, TypeMAPPING>, Requires<[HasV60T,UseHVX]> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vasrh : HInst<
+(outs VectorRegs:$Vd32),
+(ins VectorRegs:$Vu32, IntRegs:$Rt32),
+"$Vd32.h = vasr($Vu32.h,$Rt32)",
+CVI_VS, TypeCVI_VS>, Enc_16214129, Requires<[HasV60T,UseHVX]> {
+let Inst{7-5} = 0b110;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b00011001011;
+let hasNewValue = 1;
+let opNewValue = 0;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vasrh_128B : HInst<
+(outs VectorRegs128B:$Vd32),
+(ins VectorRegs128B:$Vu32, IntRegs:$Rt32),
+"$Vd32.h = vasr($Vu32.h,$Rt32)",
+CVI_VS, TypeCVI_VS>, Enc_16214129, Requires<[HasV60T,UseHVX]> {
+let Inst{7-5} = 0b110;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b00011001011;
+let hasNewValue = 1;
+let opNewValue = 0;
+let DecoderNamespace = "EXT_mmvec";
+let isCodeGenOnly = 1;
+}
+def V6_vasrh_alt : HInst<
+(outs VectorRegs:$Vd32),
+(ins VectorRegs:$Vu32, IntRegs:$Rt32),
+"$Vd32 = vasrh($Vu32,$Rt32)",
+PSEUDO, TypeMAPPING>, Requires<[HasV60T,UseHVX]> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vasrh_alt_128B : HInst<
+(outs VectorRegs128B:$Vd32),
+(ins VectorRegs128B:$Vu32, IntRegs:$Rt32),
+"$Vd32 = vasrh($Vu32,$Rt32)",
+PSEUDO, TypeMAPPING>, Requires<[HasV60T,UseHVX]> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vasrhbrndsat : HInst<
+(outs VectorRegs:$Vd32),
+(ins VectorRegs:$Vu32, VectorRegs:$Vv32, IntRegsLow8:$Rt8),
+"$Vd32.b = vasr($Vu32.h,$Vv32.h,$Rt8):rnd:sat",
+CVI_VS, TypeCVI_VS>, Enc_11083408, Requires<[HasV60T,UseHVX]> {
+let Inst{7-5} = 0b000;
+let Inst{13-13} = 0b1;
+let Inst{31-24} = 0b00011011;
+let hasNewValue = 1;
+let opNewValue = 0;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vasrhbrndsat_128B : HInst<
+(outs VectorRegs128B:$Vd32),
+(ins VectorRegs128B:$Vu32, VectorRegs128B:$Vv32, IntRegsLow8:$Rt8),
+"$Vd32.b = vasr($Vu32.h,$Vv32.h,$Rt8):rnd:sat",
+CVI_VS, TypeCVI_VS>, Enc_11083408, Requires<[HasV60T,UseHVX]> {
+let Inst{7-5} = 0b000;
+let Inst{13-13} = 0b1;
+let Inst{31-24} = 0b00011011;
+let hasNewValue = 1;
+let opNewValue = 0;
+let DecoderNamespace = "EXT_mmvec";
+let isCodeGenOnly = 1;
+}
+def V6_vasrhbrndsat_alt : HInst<
+(outs VectorRegs:$Vd32),
+(ins VectorRegs:$Vu32, VectorRegs:$Vv32, IntRegsLow8:$Rt8),
+"$Vd32 = vasrhb($Vu32,$Vv32,$Rt8):rnd:sat",
+PSEUDO, TypeMAPPING>, Requires<[HasV60T]> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+}
+def V6_vasrhbsat : HInst<
+(outs VectorRegs:$Vd32),
+(ins VectorRegs:$Vu32, VectorRegs:$Vv32, IntRegsLow8:$Rt8),
+"$Vd32.b = vasr($Vu32.h,$Vv32.h,$Rt8):sat",
+CVI_VS, TypeCVI_VS>, Enc_11083408, Requires<[HasV62T,UseHVX]> {
+let Inst{7-5} = 0b000;
+let Inst{13-13} = 0b0;
+let Inst{31-24} = 0b00011000;
+let hasNewValue = 1;
+let opNewValue = 0;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vasrhbsat_128B : HInst<
+(outs VectorRegs128B:$Vd32),
+(ins VectorRegs128B:$Vu32, VectorRegs128B:$Vv32, IntRegsLow8:$Rt8),
+"$Vd32.b = vasr($Vu32.h,$Vv32.h,$Rt8):sat",
+CVI_VS, TypeCVI_VS>, Enc_11083408, Requires<[HasV62T,UseHVX]> {
+let Inst{7-5} = 0b000;
+let Inst{13-13} = 0b0;
+let Inst{31-24} = 0b00011000;
+let hasNewValue = 1;
+let opNewValue = 0;
+let DecoderNamespace = "EXT_mmvec";
+let isCodeGenOnly = 1;
+}
+def V6_vasrhubrndsat : HInst<
+(outs VectorRegs:$Vd32),
+(ins VectorRegs:$Vu32, VectorRegs:$Vv32, IntRegsLow8:$Rt8),
+"$Vd32.ub = vasr($Vu32.h,$Vv32.h,$Rt8):rnd:sat",
+CVI_VS, TypeCVI_VS>, Enc_11083408, Requires<[HasV60T,UseHVX]> {
+let Inst{7-5} = 0b111;
+let Inst{13-13} = 0b0;
+let Inst{31-24} = 0b00011011;
+let hasNewValue = 1;
+let opNewValue = 0;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vasrhubrndsat_128B : HInst<
+(outs VectorRegs128B:$Vd32),
+(ins VectorRegs128B:$Vu32, VectorRegs128B:$Vv32, IntRegsLow8:$Rt8),
+"$Vd32.ub = vasr($Vu32.h,$Vv32.h,$Rt8):rnd:sat",
+CVI_VS, TypeCVI_VS>, Enc_11083408, Requires<[HasV60T,UseHVX]> {
+let Inst{7-5} = 0b111;
+let Inst{13-13} = 0b0;
+let Inst{31-24} = 0b00011011;
+let hasNewValue = 1;
+let opNewValue = 0;
+let DecoderNamespace = "EXT_mmvec";
+let isCodeGenOnly = 1;
+}
+def V6_vasrhubrndsat_alt : HInst<
+(outs VectorRegs:$Vd32),
+(ins VectorRegs:$Vu32, VectorRegs:$Vv32, IntRegsLow8:$Rt8),
+"$Vd32 = vasrhub($Vu32,$Vv32,$Rt8):rnd:sat",
+PSEUDO, TypeMAPPING>, Requires<[HasV60T]> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+}
+def V6_vasrhubsat : HInst<
+(outs VectorRegs:$Vd32),
+(ins VectorRegs:$Vu32, VectorRegs:$Vv32, IntRegsLow8:$Rt8),
+"$Vd32.ub = vasr($Vu32.h,$Vv32.h,$Rt8):sat",
+CVI_VS, TypeCVI_VS>, Enc_11083408, Requires<[HasV60T,UseHVX]> {
+let Inst{7-5} = 0b110;
+let Inst{13-13} = 0b0;
+let Inst{31-24} = 0b00011011;
+let hasNewValue = 1;
+let opNewValue = 0;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vasrhubsat_128B : HInst<
+(outs VectorRegs128B:$Vd32),
+(ins VectorRegs128B:$Vu32, VectorRegs128B:$Vv32, IntRegsLow8:$Rt8),
+"$Vd32.ub = vasr($Vu32.h,$Vv32.h,$Rt8):sat",
+CVI_VS, TypeCVI_VS>, Enc_11083408, Requires<[HasV60T,UseHVX]> {
+let Inst{7-5} = 0b110;
+let Inst{13-13} = 0b0;
+let Inst{31-24} = 0b00011011;
+let hasNewValue = 1;
+let opNewValue = 0;
+let DecoderNamespace = "EXT_mmvec";
+let isCodeGenOnly = 1;
+}
+def V6_vasrhubsat_alt : HInst<
+(outs VectorRegs:$Vd32),
+(ins VectorRegs:$Vu32, VectorRegs:$Vv32, IntRegsLow8:$Rt8),
+"$Vd32 = vasrhub($Vu32,$Vv32,$Rt8):sat",
+PSEUDO, TypeMAPPING>, Requires<[HasV60T]> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+}
+def V6_vasrhv : HInst<
+(outs VectorRegs:$Vd32),
+(ins VectorRegs:$Vu32, VectorRegs:$Vv32),
+"$Vd32.h = vasr($Vu32.h,$Vv32.h)",
+CVI_VS, TypeCVI_VS>, Enc_6223403, Requires<[HasV60T,UseHVX]> {
+let Inst{7-5} = 0b011;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b00011111101;
+let hasNewValue = 1;
+let opNewValue = 0;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vasrhv_128B : HInst<
+(outs VectorRegs128B:$Vd32),
+(ins VectorRegs128B:$Vu32, VectorRegs128B:$Vv32),
+"$Vd32.h = vasr($Vu32.h,$Vv32.h)",
+CVI_VS, TypeCVI_VS>, Enc_6223403, Requires<[HasV60T,UseHVX]> {
+let Inst{7-5} = 0b011;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b00011111101;
+let hasNewValue = 1;
+let opNewValue = 0;
+let DecoderNamespace = "EXT_mmvec";
+let isCodeGenOnly = 1;
+}
+def V6_vasrhv_alt : HInst<
+(outs VectorRegs:$Vd32),
+(ins VectorRegs:$Vu32, VectorRegs:$Vv32),
+"$Vd32 = vasrh($Vu32,$Vv32)",
+PSEUDO, TypeMAPPING>, Requires<[HasV60T,UseHVX]> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vasrhv_alt_128B : HInst<
+(outs VectorRegs128B:$Vd32),
+(ins VectorRegs128B:$Vu32, VectorRegs128B:$Vv32),
+"$Vd32 = vasrh($Vu32,$Vv32)",
+PSEUDO, TypeMAPPING>, Requires<[HasV60T,UseHVX]> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vasruwuhrndsat : HInst<
+(outs VectorRegs:$Vd32),
+(ins VectorRegs:$Vu32, VectorRegs:$Vv32, IntRegsLow8:$Rt8),
+"$Vd32.uh = vasr($Vu32.uw,$Vv32.uw,$Rt8):rnd:sat",
+CVI_VS, TypeCVI_VS>, Enc_11083408, Requires<[HasV62T,UseHVX]> {
+let Inst{7-5} = 0b001;
+let Inst{13-13} = 0b0;
+let Inst{31-24} = 0b00011000;
+let hasNewValue = 1;
+let opNewValue = 0;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vasruwuhrndsat_128B : HInst<
+(outs VectorRegs128B:$Vd32),
+(ins VectorRegs128B:$Vu32, VectorRegs128B:$Vv32, IntRegsLow8:$Rt8),
+"$Vd32.uh = vasr($Vu32.uw,$Vv32.uw,$Rt8):rnd:sat",
+CVI_VS, TypeCVI_VS>, Enc_11083408, Requires<[HasV62T,UseHVX]> {
+let Inst{7-5} = 0b001;
+let Inst{13-13} = 0b0;
+let Inst{31-24} = 0b00011000;
+let hasNewValue = 1;
+let opNewValue = 0;
+let DecoderNamespace = "EXT_mmvec";
+let isCodeGenOnly = 1;
+}
+def V6_vasrw : HInst<
+(outs VectorRegs:$Vd32),
+(ins VectorRegs:$Vu32, IntRegs:$Rt32),
+"$Vd32.w = vasr($Vu32.w,$Rt32)",
+CVI_VS, TypeCVI_VS>, Enc_16214129, Requires<[HasV60T,UseHVX]> {
+let Inst{7-5} = 0b101;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b00011001011;
+let hasNewValue = 1;
+let opNewValue = 0;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vasrw_128B : HInst<
+(outs VectorRegs128B:$Vd32),
+(ins VectorRegs128B:$Vu32, IntRegs:$Rt32),
+"$Vd32.w = vasr($Vu32.w,$Rt32)",
+CVI_VS, TypeCVI_VS>, Enc_16214129, Requires<[HasV60T,UseHVX]> {
+let Inst{7-5} = 0b101;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b00011001011;
+let hasNewValue = 1;
+let opNewValue = 0;
+let DecoderNamespace = "EXT_mmvec";
+let isCodeGenOnly = 1;
+}
+def V6_vasrw_acc : HInst<
+(outs VectorRegs:$Vx32),
+(ins VectorRegs:$Vx32in, VectorRegs:$Vu32, IntRegs:$Rt32),
+"$Vx32.w += vasr($Vu32.w,$Rt32)",
+CVI_VS, TypeCVI_VS>, Enc_10058269, Requires<[HasV60T,UseHVX]> {
+let Inst{7-5} = 0b101;
+let Inst{13-13} = 0b1;
+let Inst{31-21} = 0b00011001011;
+let hasNewValue = 1;
+let opNewValue = 0;
+let isAccumulator = 1;
+let DecoderNamespace = "EXT_mmvec";
+let Constraints = "$Vx32 = $Vx32in";
+}
+def V6_vasrw_acc_128B : HInst<
+(outs VectorRegs128B:$Vx32),
+(ins VectorRegs128B:$Vx32in, VectorRegs128B:$Vu32, IntRegs:$Rt32),
+"$Vx32.w += vasr($Vu32.w,$Rt32)",
+CVI_VS, TypeCVI_VS>, Enc_10058269, Requires<[HasV60T,UseHVX]> {
+let Inst{7-5} = 0b101;
+let Inst{13-13} = 0b1;
+let Inst{31-21} = 0b00011001011;
+let hasNewValue = 1;
+let opNewValue = 0;
+let isAccumulator = 1;
+let DecoderNamespace = "EXT_mmvec";
+let isCodeGenOnly = 1;
+let Constraints = "$Vx32 = $Vx32in";
+}
+def V6_vasrw_acc_alt : HInst<
+(outs VectorRegs:$Vx32),
+(ins VectorRegs:$Vx32in, VectorRegs:$Vu32, IntRegs:$Rt32),
+"$Vx32 += vasrw($Vu32,$Rt32)",
+PSEUDO, TypeMAPPING>, Requires<[HasV60T,UseHVX]> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isAccumulator = 1;
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+let Constraints = "$Vx32 = $Vx32in";
+}
+def V6_vasrw_acc_alt_128B : HInst<
+(outs VectorRegs128B:$Vx32),
+(ins VectorRegs128B:$Vx32in, VectorRegs128B:$Vu32, IntRegs:$Rt32),
+"$Vx32 += vasrw($Vu32,$Rt32)",
+PSEUDO, TypeMAPPING>, Requires<[HasV60T,UseHVX]> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isAccumulator = 1;
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+let Constraints = "$Vx32 = $Vx32in";
+}
+def V6_vasrw_alt : HInst<
+(outs VectorRegs:$Vd32),
+(ins VectorRegs:$Vu32, IntRegs:$Rt32),
+"$Vd32 = vasrw($Vu32,$Rt32)",
+PSEUDO, TypeMAPPING>, Requires<[HasV60T,UseHVX]> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vasrw_alt_128B : HInst<
+(outs VectorRegs128B:$Vd32),
+(ins VectorRegs128B:$Vu32, IntRegs:$Rt32),
+"$Vd32 = vasrw($Vu32,$Rt32)",
+PSEUDO, TypeMAPPING>, Requires<[HasV60T,UseHVX]> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vasrwh : HInst<
+(outs VectorRegs:$Vd32),
+(ins VectorRegs:$Vu32, VectorRegs:$Vv32, IntRegsLow8:$Rt8),
+"$Vd32.h = vasr($Vu32.w,$Vv32.w,$Rt8)",
+CVI_VS, TypeCVI_VS>, Enc_11083408, Requires<[HasV60T,UseHVX]> {
+let Inst{7-5} = 0b010;
+let Inst{13-13} = 0b0;
+let Inst{31-24} = 0b00011011;
+let hasNewValue = 1;
+let opNewValue = 0;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vasrwh_128B : HInst<
+(outs VectorRegs128B:$Vd32),
+(ins VectorRegs128B:$Vu32, VectorRegs128B:$Vv32, IntRegsLow8:$Rt8),
+"$Vd32.h = vasr($Vu32.w,$Vv32.w,$Rt8)",
+CVI_VS, TypeCVI_VS>, Enc_11083408, Requires<[HasV60T,UseHVX]> {
+let Inst{7-5} = 0b010;
+let Inst{13-13} = 0b0;
+let Inst{31-24} = 0b00011011;
+let hasNewValue = 1;
+let opNewValue = 0;
+let DecoderNamespace = "EXT_mmvec";
+let isCodeGenOnly = 1;
+}
+def V6_vasrwh_alt : HInst<
+(outs VectorRegs:$Vd32),
+(ins VectorRegs:$Vu32, VectorRegs:$Vv32, IntRegsLow8:$Rt8),
+"$Vd32 = vasrwh($Vu32,$Vv32,$Rt8)",
+PSEUDO, TypeMAPPING>, Requires<[HasV60T]> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+}
+def V6_vasrwhrndsat : HInst<
+(outs VectorRegs:$Vd32),
+(ins VectorRegs:$Vu32, VectorRegs:$Vv32, IntRegsLow8:$Rt8),
+"$Vd32.h = vasr($Vu32.w,$Vv32.w,$Rt8):rnd:sat",
+CVI_VS, TypeCVI_VS>, Enc_11083408, Requires<[HasV60T,UseHVX]> {
+let Inst{7-5} = 0b100;
+let Inst{13-13} = 0b0;
+let Inst{31-24} = 0b00011011;
+let hasNewValue = 1;
+let opNewValue = 0;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vasrwhrndsat_128B : HInst<
+(outs VectorRegs128B:$Vd32),
+(ins VectorRegs128B:$Vu32, VectorRegs128B:$Vv32, IntRegsLow8:$Rt8),
+"$Vd32.h = vasr($Vu32.w,$Vv32.w,$Rt8):rnd:sat",
+CVI_VS, TypeCVI_VS>, Enc_11083408, Requires<[HasV60T,UseHVX]> {
+let Inst{7-5} = 0b100;
+let Inst{13-13} = 0b0;
+let Inst{31-24} = 0b00011011;
+let hasNewValue = 1;
+let opNewValue = 0;
+let DecoderNamespace = "EXT_mmvec";
+let isCodeGenOnly = 1;
+}
+def V6_vasrwhrndsat_alt : HInst<
+(outs VectorRegs:$Vd32),
+(ins VectorRegs:$Vu32, VectorRegs:$Vv32, IntRegsLow8:$Rt8),
+"$Vd32 = vasrwh($Vu32,$Vv32,$Rt8):rnd:sat",
+PSEUDO, TypeMAPPING>, Requires<[HasV60T]> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+}
+def V6_vasrwhsat : HInst<
+(outs VectorRegs:$Vd32),
+(ins VectorRegs:$Vu32, VectorRegs:$Vv32, IntRegsLow8:$Rt8),
+"$Vd32.h = vasr($Vu32.w,$Vv32.w,$Rt8):sat",
+CVI_VS, TypeCVI_VS>, Enc_11083408, Requires<[HasV60T,UseHVX]> {
+let Inst{7-5} = 0b011;
+let Inst{13-13} = 0b0;
+let Inst{31-24} = 0b00011011;
+let hasNewValue = 1;
+let opNewValue = 0;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vasrwhsat_128B : HInst<
+(outs VectorRegs128B:$Vd32),
+(ins VectorRegs128B:$Vu32, VectorRegs128B:$Vv32, IntRegsLow8:$Rt8),
+"$Vd32.h = vasr($Vu32.w,$Vv32.w,$Rt8):sat",
+CVI_VS, TypeCVI_VS>, Enc_11083408, Requires<[HasV60T,UseHVX]> {
+let Inst{7-5} = 0b011;
+let Inst{13-13} = 0b0;
+let Inst{31-24} = 0b00011011;
+let hasNewValue = 1;
+let opNewValue = 0;
+let DecoderNamespace = "EXT_mmvec";
+let isCodeGenOnly = 1;
+}
+def V6_vasrwhsat_alt : HInst<
+(outs VectorRegs:$Vd32),
+(ins VectorRegs:$Vu32, VectorRegs:$Vv32, IntRegsLow8:$Rt8),
+"$Vd32 = vasrwh($Vu32,$Vv32,$Rt8):sat",
+PSEUDO, TypeMAPPING>, Requires<[HasV60T]> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+}
+def V6_vasrwuhrndsat : HInst<
+(outs VectorRegs:$Vd32),
+(ins VectorRegs:$Vu32, VectorRegs:$Vv32, IntRegsLow8:$Rt8),
+"$Vd32.uh = vasr($Vu32.w,$Vv32.w,$Rt8):rnd:sat",
+CVI_VS, TypeCVI_VS>, Enc_11083408, Requires<[HasV62T,UseHVX]> {
+let Inst{7-5} = 0b010;
+let Inst{13-13} = 0b0;
+let Inst{31-24} = 0b00011000;
+let hasNewValue = 1;
+let opNewValue = 0;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vasrwuhrndsat_128B : HInst<
+(outs VectorRegs128B:$Vd32),
+(ins VectorRegs128B:$Vu32, VectorRegs128B:$Vv32, IntRegsLow8:$Rt8),
+"$Vd32.uh = vasr($Vu32.w,$Vv32.w,$Rt8):rnd:sat",
+CVI_VS, TypeCVI_VS>, Enc_11083408, Requires<[HasV62T,UseHVX]> {
+let Inst{7-5} = 0b010;
+let Inst{13-13} = 0b0;
+let Inst{31-24} = 0b00011000;
+let hasNewValue = 1;
+let opNewValue = 0;
+let DecoderNamespace = "EXT_mmvec";
+let isCodeGenOnly = 1;
+}
+def V6_vasrwuhsat : HInst<
+(outs VectorRegs:$Vd32),
+(ins VectorRegs:$Vu32, VectorRegs:$Vv32, IntRegsLow8:$Rt8),
+"$Vd32.uh = vasr($Vu32.w,$Vv32.w,$Rt8):sat",
+CVI_VS, TypeCVI_VS>, Enc_11083408, Requires<[HasV60T,UseHVX]> {
+let Inst{7-5} = 0b101;
+let Inst{13-13} = 0b0;
+let Inst{31-24} = 0b00011011;
+let hasNewValue = 1;
+let opNewValue = 0;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vasrwuhsat_128B : HInst<
+(outs VectorRegs128B:$Vd32),
+(ins VectorRegs128B:$Vu32, VectorRegs128B:$Vv32, IntRegsLow8:$Rt8),
+"$Vd32.uh = vasr($Vu32.w,$Vv32.w,$Rt8):sat",
+CVI_VS, TypeCVI_VS>, Enc_11083408, Requires<[HasV60T,UseHVX]> {
+let Inst{7-5} = 0b101;
+let Inst{13-13} = 0b0;
+let Inst{31-24} = 0b00011011;
+let hasNewValue = 1;
+let opNewValue = 0;
+let DecoderNamespace = "EXT_mmvec";
+let isCodeGenOnly = 1;
+}
+def V6_vasrwuhsat_alt : HInst<
+(outs VectorRegs:$Vd32),
+(ins VectorRegs:$Vu32, VectorRegs:$Vv32, IntRegsLow8:$Rt8),
+"$Vd32 = vasrwuh($Vu32,$Vv32,$Rt8):sat",
+PSEUDO, TypeMAPPING>, Requires<[HasV60T,UseHVX]> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+}
+def V6_vasrwv : HInst<
+(outs VectorRegs:$Vd32),
+(ins VectorRegs:$Vu32, VectorRegs:$Vv32),
+"$Vd32.w = vasr($Vu32.w,$Vv32.w)",
+CVI_VS, TypeCVI_VS>, Enc_6223403, Requires<[HasV60T,UseHVX]> {
+let Inst{7-5} = 0b000;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b00011111101;
+let hasNewValue = 1;
+let opNewValue = 0;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vasrwv_128B : HInst<
+(outs VectorRegs128B:$Vd32),
+(ins VectorRegs128B:$Vu32, VectorRegs128B:$Vv32),
+"$Vd32.w = vasr($Vu32.w,$Vv32.w)",
+CVI_VS, TypeCVI_VS>, Enc_6223403, Requires<[HasV60T,UseHVX]> {
+let Inst{7-5} = 0b000;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b00011111101;
+let hasNewValue = 1;
+let opNewValue = 0;
+let DecoderNamespace = "EXT_mmvec";
+let isCodeGenOnly = 1;
+}
+def V6_vasrwv_alt : HInst<
+(outs VectorRegs:$Vd32),
+(ins VectorRegs:$Vu32, VectorRegs:$Vv32),
+"$Vd32 = vasrw($Vu32,$Vv32)",
+PSEUDO, TypeMAPPING>, Requires<[HasV60T,UseHVX]> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vasrwv_alt_128B : HInst<
+(outs VectorRegs128B:$Vd32),
+(ins VectorRegs128B:$Vu32, VectorRegs128B:$Vv32),
+"$Vd32 = vasrw($Vu32,$Vv32)",
+PSEUDO, TypeMAPPING>, Requires<[HasV60T,UseHVX]> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vassign : HInst<
+(outs VectorRegs:$Vd32),
+(ins VectorRegs:$Vu32),
+"$Vd32 = $Vu32",
+CVI_VA, TypeCVI_VA>, Enc_900013, Requires<[HasV60T,UseHVX]> {
+let Inst{7-5} = 0b111;
+let Inst{13-13} = 0b1;
+let Inst{31-16} = 0b0001111000000011;
+let hasNewValue = 1;
+let opNewValue = 0;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vassign_128B : HInst<
+(outs VectorRegs128B:$Vd32),
+(ins VectorRegs128B:$Vu32),
+"$Vd32 = $Vu32",
+CVI_VA, TypeCVI_VA>, Enc_900013, Requires<[HasV60T,UseHVX]> {
+let Inst{7-5} = 0b111;
+let Inst{13-13} = 0b1;
+let Inst{31-16} = 0b0001111000000011;
+let hasNewValue = 1;
+let opNewValue = 0;
+let DecoderNamespace = "EXT_mmvec";
+let isCodeGenOnly = 1;
+}
+def V6_vassignp : HInst<
+(outs VecDblRegs:$Vdd32),
+(ins VecDblRegs:$Vuu32),
+"$Vdd32 = $Vuu32",
+CVI_VA, TypeCVI_VA>, Requires<[HasV60T,UseHVX]> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isPseudo = 1;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vassignp_128B : HInst<
+(outs VecDblRegs128B:$Vdd32),
+(ins VecDblRegs128B:$Vuu32),
+"$Vdd32 = $Vuu32",
+CVI_VA, TypeCVI_VA>, Requires<[HasV60T,UseHVX]> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isPseudo = 1;
+let DecoderNamespace = "EXT_mmvec";
+let isCodeGenOnly = 1;
+}
+def V6_vavgh : HInst<
+(outs VectorRegs:$Vd32),
+(ins VectorRegs:$Vu32, VectorRegs:$Vv32),
+"$Vd32.h = vavg($Vu32.h,$Vv32.h)",
+CVI_VA, TypeCVI_VA>, Enc_6223403, Requires<[HasV60T,UseHVX]> {
+let Inst{7-5} = 0b110;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b00011100110;
+let hasNewValue = 1;
+let opNewValue = 0;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vavgh_128B : HInst<
+(outs VectorRegs128B:$Vd32),
+(ins VectorRegs128B:$Vu32, VectorRegs128B:$Vv32),
+"$Vd32.h = vavg($Vu32.h,$Vv32.h)",
+CVI_VA, TypeCVI_VA>, Enc_6223403, Requires<[HasV60T,UseHVX]> {
+let Inst{7-5} = 0b110;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b00011100110;
+let hasNewValue = 1;
+let opNewValue = 0;
+let DecoderNamespace = "EXT_mmvec";
+let isCodeGenOnly = 1;
+}
+def V6_vavgh_alt : HInst<
+(outs VectorRegs:$Vd32),
+(ins VectorRegs:$Vu32, VectorRegs:$Vv32),
+"$Vd32 = vavgh($Vu32,$Vv32)",
+PSEUDO, TypeMAPPING>, Requires<[HasV60T,UseHVX]> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vavgh_alt_128B : HInst<
+(outs VectorRegs128B:$Vd32),
+(ins VectorRegs128B:$Vu32, VectorRegs128B:$Vv32),
+"$Vd32 = vavgh($Vu32,$Vv32)",
+PSEUDO, TypeMAPPING>, Requires<[HasV60T,UseHVX]> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vavghrnd : HInst<
+(outs VectorRegs:$Vd32),
+(ins VectorRegs:$Vu32, VectorRegs:$Vv32),
+"$Vd32.h = vavg($Vu32.h,$Vv32.h):rnd",
+CVI_VA, TypeCVI_VA>, Enc_6223403, Requires<[HasV60T,UseHVX]> {
+let Inst{7-5} = 0b101;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b00011100111;
+let hasNewValue = 1;
+let opNewValue = 0;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vavghrnd_128B : HInst<
+(outs VectorRegs128B:$Vd32),
+(ins VectorRegs128B:$Vu32, VectorRegs128B:$Vv32),
+"$Vd32.h = vavg($Vu32.h,$Vv32.h):rnd",
+CVI_VA, TypeCVI_VA>, Enc_6223403, Requires<[HasV60T,UseHVX]> {
+let Inst{7-5} = 0b101;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b00011100111;
+let hasNewValue = 1;
+let opNewValue = 0;
+let DecoderNamespace = "EXT_mmvec";
+let isCodeGenOnly = 1;
+}
+def V6_vavghrnd_alt : HInst<
+(outs VectorRegs:$Vd32),
+(ins VectorRegs:$Vu32, VectorRegs:$Vv32),
+"$Vd32 = vavgh($Vu32,$Vv32):rnd",
+PSEUDO, TypeMAPPING>, Requires<[HasV60T,UseHVX]> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vavghrnd_alt_128B : HInst<
+(outs VectorRegs128B:$Vd32),
+(ins VectorRegs128B:$Vu32, VectorRegs128B:$Vv32),
+"$Vd32 = vavgh($Vu32,$Vv32):rnd",
+PSEUDO, TypeMAPPING>, Requires<[HasV60T,UseHVX]> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vavgub : HInst<
+(outs VectorRegs:$Vd32),
+(ins VectorRegs:$Vu32, VectorRegs:$Vv32),
+"$Vd32.ub = vavg($Vu32.ub,$Vv32.ub)",
+CVI_VA, TypeCVI_VA>, Enc_6223403, Requires<[HasV60T,UseHVX]> {
+let Inst{7-5} = 0b100;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b00011100110;
+let hasNewValue = 1;
+let opNewValue = 0;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vavgub_128B : HInst<
+(outs VectorRegs128B:$Vd32),
+(ins VectorRegs128B:$Vu32, VectorRegs128B:$Vv32),
+"$Vd32.ub = vavg($Vu32.ub,$Vv32.ub)",
+CVI_VA, TypeCVI_VA>, Enc_6223403, Requires<[HasV60T,UseHVX]> {
+let Inst{7-5} = 0b100;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b00011100110;
+let hasNewValue = 1;
+let opNewValue = 0;
+let DecoderNamespace = "EXT_mmvec";
+let isCodeGenOnly = 1;
+}
+def V6_vavgub_alt : HInst<
+(outs VectorRegs:$Vd32),
+(ins VectorRegs:$Vu32, VectorRegs:$Vv32),
+"$Vd32 = vavgub($Vu32,$Vv32)",
+PSEUDO, TypeMAPPING>, Requires<[HasV60T,UseHVX]> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vavgub_alt_128B : HInst<
+(outs VectorRegs128B:$Vd32),
+(ins VectorRegs128B:$Vu32, VectorRegs128B:$Vv32),
+"$Vd32 = vavgub($Vu32,$Vv32)",
+PSEUDO, TypeMAPPING>, Requires<[HasV60T,UseHVX]> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vavgubrnd : HInst<
+(outs VectorRegs:$Vd32),
+(ins VectorRegs:$Vu32, VectorRegs:$Vv32),
+"$Vd32.ub = vavg($Vu32.ub,$Vv32.ub):rnd",
+CVI_VA, TypeCVI_VA>, Enc_6223403, Requires<[HasV60T,UseHVX]> {
+let Inst{7-5} = 0b011;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b00011100111;
+let hasNewValue = 1;
+let opNewValue = 0;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vavgubrnd_128B : HInst<
+(outs VectorRegs128B:$Vd32),
+(ins VectorRegs128B:$Vu32, VectorRegs128B:$Vv32),
+"$Vd32.ub = vavg($Vu32.ub,$Vv32.ub):rnd",
+CVI_VA, TypeCVI_VA>, Enc_6223403, Requires<[HasV60T,UseHVX]> {
+let Inst{7-5} = 0b011;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b00011100111;
+let hasNewValue = 1;
+let opNewValue = 0;
+let DecoderNamespace = "EXT_mmvec";
+let isCodeGenOnly = 1;
+}
+def V6_vavgubrnd_alt : HInst<
+(outs VectorRegs:$Vd32),
+(ins VectorRegs:$Vu32, VectorRegs:$Vv32),
+"$Vd32 = vavgub($Vu32,$Vv32):rnd",
+PSEUDO, TypeMAPPING>, Requires<[HasV60T,UseHVX]> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vavgubrnd_alt_128B : HInst<
+(outs VectorRegs128B:$Vd32),
+(ins VectorRegs128B:$Vu32, VectorRegs128B:$Vv32),
+"$Vd32 = vavgub($Vu32,$Vv32):rnd",
+PSEUDO, TypeMAPPING>, Requires<[HasV60T,UseHVX]> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vavguh : HInst<
+(outs VectorRegs:$Vd32),
+(ins VectorRegs:$Vu32, VectorRegs:$Vv32),
+"$Vd32.uh = vavg($Vu32.uh,$Vv32.uh)",
+CVI_VA, TypeCVI_VA>, Enc_6223403, Requires<[HasV60T,UseHVX]> {
+let Inst{7-5} = 0b101;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b00011100110;
+let hasNewValue = 1;
+let opNewValue = 0;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vavguh_128B : HInst<
+(outs VectorRegs128B:$Vd32),
+(ins VectorRegs128B:$Vu32, VectorRegs128B:$Vv32),
+"$Vd32.uh = vavg($Vu32.uh,$Vv32.uh)",
+CVI_VA, TypeCVI_VA>, Enc_6223403, Requires<[HasV60T,UseHVX]> {
+let Inst{7-5} = 0b101;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b00011100110;
+let hasNewValue = 1;
+let opNewValue = 0;
+let DecoderNamespace = "EXT_mmvec";
+let isCodeGenOnly = 1;
+}
+def V6_vavguh_alt : HInst<
+(outs VectorRegs:$Vd32),
+(ins VectorRegs:$Vu32, VectorRegs:$Vv32),
+"$Vd32 = vavguh($Vu32,$Vv32)",
+PSEUDO, TypeMAPPING>, Requires<[HasV60T,UseHVX]> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vavguh_alt_128B : HInst<
+(outs VectorRegs128B:$Vd32),
+(ins VectorRegs128B:$Vu32, VectorRegs128B:$Vv32),
+"$Vd32 = vavguh($Vu32,$Vv32)",
+PSEUDO, TypeMAPPING>, Requires<[HasV60T,UseHVX]> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vavguhrnd : HInst<
+(outs VectorRegs:$Vd32),
+(ins VectorRegs:$Vu32, VectorRegs:$Vv32),
+"$Vd32.uh = vavg($Vu32.uh,$Vv32.uh):rnd",
+CVI_VA, TypeCVI_VA>, Enc_6223403, Requires<[HasV60T,UseHVX]> {
+let Inst{7-5} = 0b100;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b00011100111;
+let hasNewValue = 1;
+let opNewValue = 0;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vavguhrnd_128B : HInst<
+(outs VectorRegs128B:$Vd32),
+(ins VectorRegs128B:$Vu32, VectorRegs128B:$Vv32),
+"$Vd32.uh = vavg($Vu32.uh,$Vv32.uh):rnd",
+CVI_VA, TypeCVI_VA>, Enc_6223403, Requires<[HasV60T,UseHVX]> {
+let Inst{7-5} = 0b100;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b00011100111;
+let hasNewValue = 1;
+let opNewValue = 0;
+let DecoderNamespace = "EXT_mmvec";
+let isCodeGenOnly = 1;
+}
+def V6_vavguhrnd_alt : HInst<
+(outs VectorRegs:$Vd32),
+(ins VectorRegs:$Vu32, VectorRegs:$Vv32),
+"$Vd32 = vavguh($Vu32,$Vv32):rnd",
+PSEUDO, TypeMAPPING>, Requires<[HasV60T,UseHVX]> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vavguhrnd_alt_128B : HInst<
+(outs VectorRegs128B:$Vd32),
+(ins VectorRegs128B:$Vu32, VectorRegs128B:$Vv32),
+"$Vd32 = vavguh($Vu32,$Vv32):rnd",
+PSEUDO, TypeMAPPING>, Requires<[HasV60T,UseHVX]> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vavgw : HInst<
+(outs VectorRegs:$Vd32),
+(ins VectorRegs:$Vu32, VectorRegs:$Vv32),
+"$Vd32.w = vavg($Vu32.w,$Vv32.w)",
+CVI_VA, TypeCVI_VA>, Enc_6223403, Requires<[HasV60T,UseHVX]> {
+let Inst{7-5} = 0b111;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b00011100110;
+let hasNewValue = 1;
+let opNewValue = 0;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vavgw_128B : HInst<
+(outs VectorRegs128B:$Vd32),
+(ins VectorRegs128B:$Vu32, VectorRegs128B:$Vv32),
+"$Vd32.w = vavg($Vu32.w,$Vv32.w)",
+CVI_VA, TypeCVI_VA>, Enc_6223403, Requires<[HasV60T,UseHVX]> {
+let Inst{7-5} = 0b111;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b00011100110;
+let hasNewValue = 1;
+let opNewValue = 0;
+let DecoderNamespace = "EXT_mmvec";
+let isCodeGenOnly = 1;
+}
+def V6_vavgw_alt : HInst<
+(outs VectorRegs:$Vd32),
+(ins VectorRegs:$Vu32, VectorRegs:$Vv32),
+"$Vd32 = vavgw($Vu32,$Vv32)",
+PSEUDO, TypeMAPPING>, Requires<[HasV60T,UseHVX]> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vavgw_alt_128B : HInst<
+(outs VectorRegs128B:$Vd32),
+(ins VectorRegs128B:$Vu32, VectorRegs128B:$Vv32),
+"$Vd32 = vavgw($Vu32,$Vv32)",
+PSEUDO, TypeMAPPING>, Requires<[HasV60T,UseHVX]> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vavgwrnd : HInst<
+(outs VectorRegs:$Vd32),
+(ins VectorRegs:$Vu32, VectorRegs:$Vv32),
+"$Vd32.w = vavg($Vu32.w,$Vv32.w):rnd",
+CVI_VA, TypeCVI_VA>, Enc_6223403, Requires<[HasV60T,UseHVX]> {
+let Inst{7-5} = 0b110;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b00011100111;
+let hasNewValue = 1;
+let opNewValue = 0;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vavgwrnd_128B : HInst<
+(outs VectorRegs128B:$Vd32),
+(ins VectorRegs128B:$Vu32, VectorRegs128B:$Vv32),
+"$Vd32.w = vavg($Vu32.w,$Vv32.w):rnd",
+CVI_VA, TypeCVI_VA>, Enc_6223403, Requires<[HasV60T,UseHVX]> {
+let Inst{7-5} = 0b110;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b00011100111;
+let hasNewValue = 1;
+let opNewValue = 0;
+let DecoderNamespace = "EXT_mmvec";
+let isCodeGenOnly = 1;
+}
+def V6_vavgwrnd_alt : HInst<
+(outs VectorRegs:$Vd32),
+(ins VectorRegs:$Vu32, VectorRegs:$Vv32),
+"$Vd32 = vavgw($Vu32,$Vv32):rnd",
+PSEUDO, TypeMAPPING>, Requires<[HasV60T,UseHVX]> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vavgwrnd_alt_128B : HInst<
+(outs VectorRegs128B:$Vd32),
+(ins VectorRegs128B:$Vu32, VectorRegs128B:$Vv32),
+"$Vd32 = vavgw($Vu32,$Vv32):rnd",
+PSEUDO, TypeMAPPING>, Requires<[HasV60T,UseHVX]> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vccombine : HInst<
+(outs VecDblRegs:$Vdd32),
+(ins PredRegs:$Ps4, VectorRegs:$Vu32, VectorRegs:$Vv32),
+"if ($Ps4) $Vdd32 = vcombine($Vu32,$Vv32)",
+CVI_VA_DV, TypeCVI_VA_DV>, Enc_16145290, Requires<[HasV60T,UseHVX]> {
+let Inst{7-7} = 0b0;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b00011010011;
+let isPredicated = 1;
+let hasNewValue = 1;
+let opNewValue = 0;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vccombine_128B : HInst<
+(outs VecDblRegs128B:$Vdd32),
+(ins PredRegs:$Ps4, VectorRegs128B:$Vu32, VectorRegs128B:$Vv32),
+"if ($Ps4) $Vdd32 = vcombine($Vu32,$Vv32)",
+CVI_VA_DV, TypeCVI_VA_DV>, Enc_16145290, Requires<[HasV60T,UseHVX]> {
+let Inst{7-7} = 0b0;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b00011010011;
+let isPredicated = 1;
+let hasNewValue = 1;
+let opNewValue = 0;
+let DecoderNamespace = "EXT_mmvec";
+let isCodeGenOnly = 1;
+}
+def V6_vcl0h : HInst<
+(outs VectorRegs:$Vd32),
+(ins VectorRegs:$Vu32),
+"$Vd32.uh = vcl0($Vu32.uh)",
+CVI_VS, TypeCVI_VS>, Enc_900013, Requires<[HasV60T,UseHVX]> {
+let Inst{7-5} = 0b111;
+let Inst{13-13} = 0b0;
+let Inst{31-16} = 0b0001111000000010;
+let hasNewValue = 1;
+let opNewValue = 0;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vcl0h_128B : HInst<
+(outs VectorRegs128B:$Vd32),
+(ins VectorRegs128B:$Vu32),
+"$Vd32.uh = vcl0($Vu32.uh)",
+CVI_VS, TypeCVI_VS>, Enc_900013, Requires<[HasV60T,UseHVX]> {
+let Inst{7-5} = 0b111;
+let Inst{13-13} = 0b0;
+let Inst{31-16} = 0b0001111000000010;
+let hasNewValue = 1;
+let opNewValue = 0;
+let DecoderNamespace = "EXT_mmvec";
+let isCodeGenOnly = 1;
+}
+def V6_vcl0h_alt : HInst<
+(outs VectorRegs:$Vd32),
+(ins VectorRegs:$Vu32),
+"$Vd32 = vcl0h($Vu32)",
+PSEUDO, TypeMAPPING>, Requires<[HasV60T,UseHVX]> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vcl0h_alt_128B : HInst<
+(outs VectorRegs128B:$Vd32),
+(ins VectorRegs128B:$Vu32),
+"$Vd32 = vcl0h($Vu32)",
+PSEUDO, TypeMAPPING>, Requires<[HasV60T,UseHVX]> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vcl0w : HInst<
+(outs VectorRegs:$Vd32),
+(ins VectorRegs:$Vu32),
+"$Vd32.uw = vcl0($Vu32.uw)",
+CVI_VS, TypeCVI_VS>, Enc_900013, Requires<[HasV60T,UseHVX]> {
+let Inst{7-5} = 0b101;
+let Inst{13-13} = 0b0;
+let Inst{31-16} = 0b0001111000000010;
+let hasNewValue = 1;
+let opNewValue = 0;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vcl0w_128B : HInst<
+(outs VectorRegs128B:$Vd32),
+(ins VectorRegs128B:$Vu32),
+"$Vd32.uw = vcl0($Vu32.uw)",
+CVI_VS, TypeCVI_VS>, Enc_900013, Requires<[HasV60T,UseHVX]> {
+let Inst{7-5} = 0b101;
+let Inst{13-13} = 0b0;
+let Inst{31-16} = 0b0001111000000010;
+let hasNewValue = 1;
+let opNewValue = 0;
+let DecoderNamespace = "EXT_mmvec";
+let isCodeGenOnly = 1;
+}
+def V6_vcl0w_alt : HInst<
+(outs VectorRegs:$Vd32),
+(ins VectorRegs:$Vu32),
+"$Vd32 = vcl0w($Vu32)",
+PSEUDO, TypeMAPPING>, Requires<[HasV60T,UseHVX]> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vcl0w_alt_128B : HInst<
+(outs VectorRegs128B:$Vd32),
+(ins VectorRegs128B:$Vu32),
+"$Vd32 = vcl0w($Vu32)",
+PSEUDO, TypeMAPPING>, Requires<[HasV60T,UseHVX]> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vcmov : HInst<
+(outs VectorRegs:$Vd32),
+(ins PredRegs:$Ps4, VectorRegs:$Vu32),
+"if ($Ps4) $Vd32 = $Vu32",
+CVI_VA, TypeCVI_VA>, Enc_12023037, Requires<[HasV60T,UseHVX]> {
+let Inst{7-7} = 0b0;
+let Inst{13-13} = 0b0;
+let Inst{31-16} = 0b0001101000000000;
+let isPredicated = 1;
+let hasNewValue = 1;
+let opNewValue = 0;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vcmov_128B : HInst<
+(outs VectorRegs128B:$Vd32),
+(ins PredRegs:$Ps4, VectorRegs128B:$Vu32),
+"if ($Ps4) $Vd32 = $Vu32",
+CVI_VA, TypeCVI_VA>, Enc_12023037, Requires<[HasV60T,UseHVX]> {
+let Inst{7-7} = 0b0;
+let Inst{13-13} = 0b0;
+let Inst{31-16} = 0b0001101000000000;
+let isPredicated = 1;
+let hasNewValue = 1;
+let opNewValue = 0;
+let DecoderNamespace = "EXT_mmvec";
+let isCodeGenOnly = 1;
+}
+def V6_vcombine : HInst<
+(outs VecDblRegs:$Vdd32),
+(ins VectorRegs:$Vu32, VectorRegs:$Vv32),
+"$Vdd32 = vcombine($Vu32,$Vv32)",
+CVI_VA_DV, TypeCVI_VA_DV>, Enc_15290236, Requires<[HasV60T,UseHVX]> {
+let Inst{7-5} = 0b111;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b00011111010;
+let hasNewValue = 1;
+let opNewValue = 0;
+let isRegSequence = 1;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vcombine_128B : HInst<
+(outs VecDblRegs128B:$Vdd32),
+(ins VectorRegs128B:$Vu32, VectorRegs128B:$Vv32),
+"$Vdd32 = vcombine($Vu32,$Vv32)",
+CVI_VA_DV, TypeCVI_VA_DV>, Enc_15290236, Requires<[HasV60T,UseHVX]> {
+let Inst{7-5} = 0b111;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b00011111010;
+let hasNewValue = 1;
+let opNewValue = 0;
+let isRegSequence = 1;
+let DecoderNamespace = "EXT_mmvec";
+let isCodeGenOnly = 1;
+}
+def V6_vd0 : HInst<
+(outs VectorRegs:$Vd32),
+(ins),
+"$Vd32 = #0",
+CVI_VA, TypeCVI_VA>, Requires<[HasV60T,UseHVX]> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vd0_128B : HInst<
+(outs VectorRegs128B:$Vd32),
+(ins),
+"$Vd32 = #0",
+CVI_VA, TypeCVI_VA>, Requires<[HasV60T,UseHVX]> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vdeal : HInst<
+(outs VectorRegs:$Vy32, VectorRegs:$Vx32),
+(ins VectorRegs:$Vy32in, VectorRegs:$Vx32in, IntRegs:$Rt32),
+"vdeal($Vy32,$Vx32,$Rt32)",
+CVI_VP_VS_LONG_EARLY, TypeCVI_VP_VS>, Enc_11422009, Requires<[HasV60T,UseHVX]> {
+let Inst{7-5} = 0b010;
+let Inst{13-13} = 0b1;
+let Inst{31-21} = 0b00011001111;
+let hasNewValue = 1;
+let opNewValue = 0;
+let hasNewValue2 = 1;
+let opNewValue2 = 1;
+let DecoderNamespace = "EXT_mmvec";
+let Constraints = "$Vy32 = $Vy32in, $Vx32 = $Vx32in";
+}
+def V6_vdeal_128B : HInst<
+(outs VectorRegs128B:$Vy32, VectorRegs128B:$Vx32),
+(ins VectorRegs128B:$Vy32in, VectorRegs128B:$Vx32in, IntRegs:$Rt32),
+"vdeal($Vy32,$Vx32,$Rt32)",
+CVI_VP_VS_LONG_EARLY, TypeCVI_VP_VS>, Enc_11422009, Requires<[HasV60T,UseHVX]> {
+let Inst{7-5} = 0b010;
+let Inst{13-13} = 0b1;
+let Inst{31-21} = 0b00011001111;
+let hasNewValue = 1;
+let opNewValue = 0;
+let hasNewValue2 = 1;
+let opNewValue2 = 1;
+let DecoderNamespace = "EXT_mmvec";
+let isCodeGenOnly = 1;
+let Constraints = "$Vy32 = $Vy32in, $Vx32 = $Vx32in";
+}
+def V6_vdealb : HInst<
+(outs VectorRegs:$Vd32),
+(ins VectorRegs:$Vu32),
+"$Vd32.b = vdeal($Vu32.b)",
+CVI_VP, TypeCVI_VP>, Enc_900013, Requires<[HasV60T,UseHVX]> {
+let Inst{7-5} = 0b111;
+let Inst{13-13} = 0b0;
+let Inst{31-16} = 0b0001111000000000;
+let hasNewValue = 1;
+let opNewValue = 0;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vdealb4w : HInst<
+(outs VectorRegs:$Vd32),
+(ins VectorRegs:$Vu32, VectorRegs:$Vv32),
+"$Vd32.b = vdeale($Vu32.b,$Vv32.b)",
+CVI_VP, TypeCVI_VP>, Enc_6223403, Requires<[HasV60T,UseHVX]> {
+let Inst{7-5} = 0b111;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b00011111001;
+let hasNewValue = 1;
+let opNewValue = 0;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vdealb4w_128B : HInst<
+(outs VectorRegs128B:$Vd32),
+(ins VectorRegs128B:$Vu32, VectorRegs128B:$Vv32),
+"$Vd32.b = vdeale($Vu32.b,$Vv32.b)",
+CVI_VP, TypeCVI_VP>, Enc_6223403, Requires<[HasV60T,UseHVX]> {
+let Inst{7-5} = 0b111;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b00011111001;
+let hasNewValue = 1;
+let opNewValue = 0;
+let DecoderNamespace = "EXT_mmvec";
+let isCodeGenOnly = 1;
+}
+def V6_vdealb4w_alt : HInst<
+(outs VectorRegs:$Vd32),
+(ins VectorRegs:$Vu32, VectorRegs:$Vv32),
+"$Vd32 = vdealb4w($Vu32,$Vv32)",
+PSEUDO, TypeMAPPING>, Requires<[HasV60T,UseHVX]> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vdealb4w_alt_128B : HInst<
+(outs VectorRegs128B:$Vd32),
+(ins VectorRegs128B:$Vu32, VectorRegs128B:$Vv32),
+"$Vd32 = vdealb4w($Vu32,$Vv32)",
+PSEUDO, TypeMAPPING>, Requires<[HasV60T,UseHVX]> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vdealb_128B : HInst<
+(outs VectorRegs128B:$Vd32),
+(ins VectorRegs128B:$Vu32),
+"$Vd32.b = vdeal($Vu32.b)",
+CVI_VP, TypeCVI_VP>, Enc_900013, Requires<[HasV60T,UseHVX]> {
+let Inst{7-5} = 0b111;
+let Inst{13-13} = 0b0;
+let Inst{31-16} = 0b0001111000000000;
+let hasNewValue = 1;
+let opNewValue = 0;
+let DecoderNamespace = "EXT_mmvec";
+let isCodeGenOnly = 1;
+}
+def V6_vdealb_alt : HInst<
+(outs VectorRegs:$Vd32),
+(ins VectorRegs:$Vu32),
+"$Vd32 = vdealb($Vu32)",
+PSEUDO, TypeMAPPING>, Requires<[HasV60T,UseHVX]> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vdealb_alt_128B : HInst<
+(outs VectorRegs128B:$Vd32),
+(ins VectorRegs128B:$Vu32),
+"$Vd32 = vdealb($Vu32)",
+PSEUDO, TypeMAPPING>, Requires<[HasV60T,UseHVX]> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vdealh : HInst<
+(outs VectorRegs:$Vd32),
+(ins VectorRegs:$Vu32),
+"$Vd32.h = vdeal($Vu32.h)",
+CVI_VP, TypeCVI_VP>, Enc_900013, Requires<[HasV60T,UseHVX]> {
+let Inst{7-5} = 0b110;
+let Inst{13-13} = 0b0;
+let Inst{31-16} = 0b0001111000000000;
+let hasNewValue = 1;
+let opNewValue = 0;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vdealh_128B : HInst<
+(outs VectorRegs128B:$Vd32),
+(ins VectorRegs128B:$Vu32),
+"$Vd32.h = vdeal($Vu32.h)",
+CVI_VP, TypeCVI_VP>, Enc_900013, Requires<[HasV60T,UseHVX]> {
+let Inst{7-5} = 0b110;
+let Inst{13-13} = 0b0;
+let Inst{31-16} = 0b0001111000000000;
+let hasNewValue = 1;
+let opNewValue = 0;
+let DecoderNamespace = "EXT_mmvec";
+let isCodeGenOnly = 1;
+}
+def V6_vdealh_alt : HInst<
+(outs VectorRegs:$Vd32),
+(ins VectorRegs:$Vu32),
+"$Vd32 = vdealh($Vu32)",
+PSEUDO, TypeMAPPING>, Requires<[HasV60T,UseHVX]> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vdealh_alt_128B : HInst<
+(outs VectorRegs128B:$Vd32),
+(ins VectorRegs128B:$Vu32),
+"$Vd32 = vdealh($Vu32)",
+PSEUDO, TypeMAPPING>, Requires<[HasV60T,UseHVX]> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vdealvdd : HInst<
+(outs VecDblRegs:$Vdd32),
+(ins VectorRegs:$Vu32, VectorRegs:$Vv32, IntRegsLow8:$Rt8),
+"$Vdd32 = vdeal($Vu32,$Vv32,$Rt8)",
+CVI_VP_VS_LONG, TypeCVI_VP_VS>, Enc_14767681, Requires<[HasV60T,UseHVX]> {
+let Inst{7-5} = 0b100;
+let Inst{13-13} = 0b1;
+let Inst{31-24} = 0b00011011;
+let hasNewValue = 1;
+let opNewValue = 0;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vdealvdd_128B : HInst<
+(outs VecDblRegs128B:$Vdd32),
+(ins VectorRegs128B:$Vu32, VectorRegs128B:$Vv32, IntRegsLow8:$Rt8),
+"$Vdd32 = vdeal($Vu32,$Vv32,$Rt8)",
+CVI_VP_VS_LONG, TypeCVI_VP_VS>, Enc_14767681, Requires<[HasV60T,UseHVX]> {
+let Inst{7-5} = 0b100;
+let Inst{13-13} = 0b1;
+let Inst{31-24} = 0b00011011;
+let hasNewValue = 1;
+let opNewValue = 0;
+let DecoderNamespace = "EXT_mmvec";
+let isCodeGenOnly = 1;
+}
+def V6_vdelta : HInst<
+(outs VectorRegs:$Vd32),
+(ins VectorRegs:$Vu32, VectorRegs:$Vv32),
+"$Vd32 = vdelta($Vu32,$Vv32)",
+CVI_VP, TypeCVI_VP>, Enc_6223403, Requires<[HasV60T,UseHVX]> {
+let Inst{7-5} = 0b001;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b00011111001;
+let hasNewValue = 1;
+let opNewValue = 0;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vdelta_128B : HInst<
+(outs VectorRegs128B:$Vd32),
+(ins VectorRegs128B:$Vu32, VectorRegs128B:$Vv32),
+"$Vd32 = vdelta($Vu32,$Vv32)",
+CVI_VP, TypeCVI_VP>, Enc_6223403, Requires<[HasV60T,UseHVX]> {
+let Inst{7-5} = 0b001;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b00011111001;
+let hasNewValue = 1;
+let opNewValue = 0;
+let DecoderNamespace = "EXT_mmvec";
+let isCodeGenOnly = 1;
+}
+def V6_vdmpybus : HInst<
+(outs VectorRegs:$Vd32),
+(ins VectorRegs:$Vu32, IntRegs:$Rt32),
+"$Vd32.h = vdmpy($Vu32.ub,$Rt32.b)",
+CVI_VX, TypeCVI_VX>, Enc_16214129, Requires<[HasV60T,UseHVX]> {
+let Inst{7-5} = 0b110;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b00011001000;
+let hasNewValue = 1;
+let opNewValue = 0;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vdmpybus_128B : HInst<
+(outs VectorRegs128B:$Vd32),
+(ins VectorRegs128B:$Vu32, IntRegs:$Rt32),
+"$Vd32.h = vdmpy($Vu32.ub,$Rt32.b)",
+CVI_VX, TypeCVI_VX>, Enc_16214129, Requires<[HasV60T,UseHVX]> {
+let Inst{7-5} = 0b110;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b00011001000;
+let hasNewValue = 1;
+let opNewValue = 0;
+let DecoderNamespace = "EXT_mmvec";
+let isCodeGenOnly = 1;
+}
+def V6_vdmpybus_acc : HInst<
+(outs VectorRegs:$Vx32),
+(ins VectorRegs:$Vx32in, VectorRegs:$Vu32, IntRegs:$Rt32),
+"$Vx32.h += vdmpy($Vu32.ub,$Rt32.b)",
+CVI_VX, TypeCVI_VX>, Enc_10058269, Requires<[HasV60T,UseHVX]> {
+let Inst{7-5} = 0b110;
+let Inst{13-13} = 0b1;
+let Inst{31-21} = 0b00011001000;
+let hasNewValue = 1;
+let opNewValue = 0;
+let isAccumulator = 1;
+let DecoderNamespace = "EXT_mmvec";
+let Constraints = "$Vx32 = $Vx32in";
+}
+def V6_vdmpybus_acc_128B : HInst<
+(outs VectorRegs128B:$Vx32),
+(ins VectorRegs128B:$Vx32in, VectorRegs128B:$Vu32, IntRegs:$Rt32),
+"$Vx32.h += vdmpy($Vu32.ub,$Rt32.b)",
+CVI_VX, TypeCVI_VX>, Enc_10058269, Requires<[HasV60T,UseHVX]> {
+let Inst{7-5} = 0b110;
+let Inst{13-13} = 0b1;
+let Inst{31-21} = 0b00011001000;
+let hasNewValue = 1;
+let opNewValue = 0;
+let isAccumulator = 1;
+let DecoderNamespace = "EXT_mmvec";
+let isCodeGenOnly = 1;
+let Constraints = "$Vx32 = $Vx32in";
+}
+def V6_vdmpybus_acc_alt : HInst<
+(outs VectorRegs:$Vx32),
+(ins VectorRegs:$Vx32in, VectorRegs:$Vu32, IntRegs:$Rt32),
+"$Vx32 += vdmpybus($Vu32,$Rt32)",
+PSEUDO, TypeMAPPING>, Requires<[HasV60T,UseHVX]> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isAccumulator = 1;
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+let Constraints = "$Vx32 = $Vx32in";
+}
+def V6_vdmpybus_acc_alt_128B : HInst<
+(outs VectorRegs128B:$Vx32),
+(ins VectorRegs128B:$Vx32in, VectorRegs128B:$Vu32, IntRegs:$Rt32),
+"$Vx32 += vdmpybus($Vu32,$Rt32)",
+PSEUDO, TypeMAPPING>, Requires<[HasV60T,UseHVX]> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isAccumulator = 1;
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+let Constraints = "$Vx32 = $Vx32in";
+}
+def V6_vdmpybus_alt : HInst<
+(outs VectorRegs:$Vd32),
+(ins VectorRegs:$Vu32, IntRegs:$Rt32),
+"$Vd32 = vdmpybus($Vu32,$Rt32)",
+PSEUDO, TypeMAPPING>, Requires<[HasV60T,UseHVX]> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vdmpybus_alt_128B : HInst<
+(outs VectorRegs128B:$Vd32),
+(ins VectorRegs128B:$Vu32, IntRegs:$Rt32),
+"$Vd32 = vdmpybus($Vu32,$Rt32)",
+PSEUDO, TypeMAPPING>, Requires<[HasV60T,UseHVX]> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vdmpybus_dv : HInst<
+(outs VecDblRegs:$Vdd32),
+(ins VecDblRegs:$Vuu32, IntRegs:$Rt32),
+"$Vdd32.h = vdmpy($Vuu32.ub,$Rt32.b)",
+CVI_VX_DV, TypeCVI_VX_DV>, Enc_5023792, Requires<[HasV60T,UseHVX]> {
+let Inst{7-5} = 0b111;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b00011001000;
+let hasNewValue = 1;
+let opNewValue = 0;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vdmpybus_dv_128B : HInst<
+(outs VecDblRegs128B:$Vdd32),
+(ins VecDblRegs128B:$Vuu32, IntRegs:$Rt32),
+"$Vdd32.h = vdmpy($Vuu32.ub,$Rt32.b)",
+CVI_VX_DV, TypeCVI_VX_DV>, Enc_5023792, Requires<[HasV60T,UseHVX]> {
+let Inst{7-5} = 0b111;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b00011001000;
+let hasNewValue = 1;
+let opNewValue = 0;
+let DecoderNamespace = "EXT_mmvec";
+let isCodeGenOnly = 1;
+}
+def V6_vdmpybus_dv_acc : HInst<
+(outs VecDblRegs:$Vxx32),
+(ins VecDblRegs:$Vxx32in, VecDblRegs:$Vuu32, IntRegs:$Rt32),
+"$Vxx32.h += vdmpy($Vuu32.ub,$Rt32.b)",
+CVI_VX_DV, TypeCVI_VX_DV>, Enc_4327792, Requires<[HasV60T,UseHVX]> {
+let Inst{7-5} = 0b111;
+let Inst{13-13} = 0b1;
+let Inst{31-21} = 0b00011001000;
+let hasNewValue = 1;
+let opNewValue = 0;
+let isAccumulator = 1;
+let DecoderNamespace = "EXT_mmvec";
+let Constraints = "$Vxx32 = $Vxx32in";
+}
+def V6_vdmpybus_dv_acc_128B : HInst<
+(outs VecDblRegs128B:$Vxx32),
+(ins VecDblRegs128B:$Vxx32in, VecDblRegs128B:$Vuu32, IntRegs:$Rt32),
+"$Vxx32.h += vdmpy($Vuu32.ub,$Rt32.b)",
+CVI_VX_DV, TypeCVI_VX_DV>, Enc_4327792, Requires<[HasV60T,UseHVX]> {
+let Inst{7-5} = 0b111;
+let Inst{13-13} = 0b1;
+let Inst{31-21} = 0b00011001000;
+let hasNewValue = 1;
+let opNewValue = 0;
+let isAccumulator = 1;
+let DecoderNamespace = "EXT_mmvec";
+let isCodeGenOnly = 1;
+let Constraints = "$Vxx32 = $Vxx32in";
+}
+def V6_vdmpybus_dv_acc_alt : HInst<
+(outs VecDblRegs:$Vxx32),
+(ins VecDblRegs:$Vxx32in, VecDblRegs:$Vuu32, IntRegs:$Rt32),
+"$Vxx32 += vdmpybus($Vuu32,$Rt32)",
+PSEUDO, TypeMAPPING>, Requires<[HasV60T,UseHVX]> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isAccumulator = 1;
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+let Constraints = "$Vxx32 = $Vxx32in";
+}
+def V6_vdmpybus_dv_acc_alt_128B : HInst<
+(outs VecDblRegs128B:$Vxx32),
+(ins VecDblRegs128B:$Vxx32in, VecDblRegs128B:$Vuu32, IntRegs:$Rt32),
+"$Vxx32 += vdmpybus($Vuu32,$Rt32)",
+PSEUDO, TypeMAPPING>, Requires<[HasV60T,UseHVX]> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isAccumulator = 1;
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+let Constraints = "$Vxx32 = $Vxx32in";
+}
+def V6_vdmpybus_dv_alt : HInst<
+(outs VecDblRegs:$Vdd32),
+(ins VecDblRegs:$Vuu32, IntRegs:$Rt32),
+"$Vdd32 = vdmpybus($Vuu32,$Rt32)",
+PSEUDO, TypeMAPPING>, Requires<[HasV60T,UseHVX]> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vdmpybus_dv_alt_128B : HInst<
+(outs VecDblRegs128B:$Vdd32),
+(ins VecDblRegs128B:$Vuu32, IntRegs:$Rt32),
+"$Vdd32 = vdmpybus($Vuu32,$Rt32)",
+PSEUDO, TypeMAPPING>, Requires<[HasV60T,UseHVX]> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vdmpyhb : HInst<
+(outs VectorRegs:$Vd32),
+(ins VectorRegs:$Vu32, IntRegs:$Rt32),
+"$Vd32.w = vdmpy($Vu32.h,$Rt32.b)",
+CVI_VX, TypeCVI_VX>, Enc_16214129, Requires<[HasV60T,UseHVX]> {
+let Inst{7-5} = 0b010;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b00011001000;
+let hasNewValue = 1;
+let opNewValue = 0;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vdmpyhb_128B : HInst<
+(outs VectorRegs128B:$Vd32),
+(ins VectorRegs128B:$Vu32, IntRegs:$Rt32),
+"$Vd32.w = vdmpy($Vu32.h,$Rt32.b)",
+CVI_VX, TypeCVI_VX>, Enc_16214129, Requires<[HasV60T,UseHVX]> {
+let Inst{7-5} = 0b010;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b00011001000;
+let hasNewValue = 1;
+let opNewValue = 0;
+let DecoderNamespace = "EXT_mmvec";
+let isCodeGenOnly = 1;
+}
+def V6_vdmpyhb_acc : HInst<
+(outs VectorRegs:$Vx32),
+(ins VectorRegs:$Vx32in, VectorRegs:$Vu32, IntRegs:$Rt32),
+"$Vx32.w += vdmpy($Vu32.h,$Rt32.b)",
+CVI_VX, TypeCVI_VX>, Enc_10058269, Requires<[HasV60T,UseHVX]> {
+let Inst{7-5} = 0b011;
+let Inst{13-13} = 0b1;
+let Inst{31-21} = 0b00011001000;
+let hasNewValue = 1;
+let opNewValue = 0;
+let isAccumulator = 1;
+let DecoderNamespace = "EXT_mmvec";
+let Constraints = "$Vx32 = $Vx32in";
+}
+def V6_vdmpyhb_acc_128B : HInst<
+(outs VectorRegs128B:$Vx32),
+(ins VectorRegs128B:$Vx32in, VectorRegs128B:$Vu32, IntRegs:$Rt32),
+"$Vx32.w += vdmpy($Vu32.h,$Rt32.b)",
+CVI_VX, TypeCVI_VX>, Enc_10058269, Requires<[HasV60T,UseHVX]> {
+let Inst{7-5} = 0b011;
+let Inst{13-13} = 0b1;
+let Inst{31-21} = 0b00011001000;
+let hasNewValue = 1;
+let opNewValue = 0;
+let isAccumulator = 1;
+let DecoderNamespace = "EXT_mmvec";
+let isCodeGenOnly = 1;
+let Constraints = "$Vx32 = $Vx32in";
+}
+def V6_vdmpyhb_acc_alt : HInst<
+(outs VectorRegs:$Vx32),
+(ins VectorRegs:$Vx32in, VectorRegs:$Vu32, IntRegs:$Rt32),
+"$Vx32 += vdmpyhb($Vu32,$Rt32)",
+PSEUDO, TypeMAPPING>, Requires<[HasV60T,UseHVX]> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isAccumulator = 1;
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+let Constraints = "$Vx32 = $Vx32in";
+}
+def V6_vdmpyhb_acc_alt_128B : HInst<
+(outs VectorRegs128B:$Vx32),
+(ins VectorRegs128B:$Vx32in, VectorRegs128B:$Vu32, IntRegs:$Rt32),
+"$Vx32 += vdmpyhb($Vu32,$Rt32)",
+PSEUDO, TypeMAPPING>, Requires<[HasV60T,UseHVX]> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isAccumulator = 1;
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+let Constraints = "$Vx32 = $Vx32in";
+}
+def V6_vdmpyhb_alt : HInst<
+(outs VectorRegs:$Vd32),
+(ins VectorRegs:$Vu32, IntRegs:$Rt32),
+"$Vd32 = vdmpyhb($Vu32,$Rt32)",
+PSEUDO, TypeMAPPING>, Requires<[HasV60T,UseHVX]> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vdmpyhb_alt_128B : HInst<
+(outs VectorRegs128B:$Vd32),
+(ins VectorRegs128B:$Vu32, IntRegs:$Rt32),
+"$Vd32 = vdmpyhb($Vu32,$Rt32)",
+PSEUDO, TypeMAPPING>, Requires<[HasV60T,UseHVX]> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vdmpyhb_dv : HInst<
+(outs VecDblRegs:$Vdd32),
+(ins VecDblRegs:$Vuu32, IntRegs:$Rt32),
+"$Vdd32.w = vdmpy($Vuu32.h,$Rt32.b)",
+CVI_VX_DV, TypeCVI_VX_DV>, Enc_5023792, Requires<[HasV60T,UseHVX]> {
+let Inst{7-5} = 0b100;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b00011001001;
+let hasNewValue = 1;
+let opNewValue = 0;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vdmpyhb_dv_128B : HInst<
+(outs VecDblRegs128B:$Vdd32),
+(ins VecDblRegs128B:$Vuu32, IntRegs:$Rt32),
+"$Vdd32.w = vdmpy($Vuu32.h,$Rt32.b)",
+CVI_VX_DV, TypeCVI_VX_DV>, Enc_5023792, Requires<[HasV60T,UseHVX]> {
+let Inst{7-5} = 0b100;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b00011001001;
+let hasNewValue = 1;
+let opNewValue = 0;
+let DecoderNamespace = "EXT_mmvec";
+let isCodeGenOnly = 1;
+}
+def V6_vdmpyhb_dv_acc : HInst<
+(outs VecDblRegs:$Vxx32),
+(ins VecDblRegs:$Vxx32in, VecDblRegs:$Vuu32, IntRegs:$Rt32),
+"$Vxx32.w += vdmpy($Vuu32.h,$Rt32.b)",
+CVI_VX_DV, TypeCVI_VX_DV>, Enc_4327792, Requires<[HasV60T,UseHVX]> {
+let Inst{7-5} = 0b100;
+let Inst{13-13} = 0b1;
+let Inst{31-21} = 0b00011001001;
+let hasNewValue = 1;
+let opNewValue = 0;
+let isAccumulator = 1;
+let DecoderNamespace = "EXT_mmvec";
+let Constraints = "$Vxx32 = $Vxx32in";
+}
+def V6_vdmpyhb_dv_acc_128B : HInst<
+(outs VecDblRegs128B:$Vxx32),
+(ins VecDblRegs128B:$Vxx32in, VecDblRegs128B:$Vuu32, IntRegs:$Rt32),
+"$Vxx32.w += vdmpy($Vuu32.h,$Rt32.b)",
+CVI_VX_DV, TypeCVI_VX_DV>, Enc_4327792, Requires<[HasV60T,UseHVX]> {
+let Inst{7-5} = 0b100;
+let Inst{13-13} = 0b1;
+let Inst{31-21} = 0b00011001001;
+let hasNewValue = 1;
+let opNewValue = 0;
+let isAccumulator = 1;
+let DecoderNamespace = "EXT_mmvec";
+let isCodeGenOnly = 1;
+let Constraints = "$Vxx32 = $Vxx32in";
+}
+def V6_vdmpyhb_dv_acc_alt : HInst<
+(outs VecDblRegs:$Vxx32),
+(ins VecDblRegs:$Vxx32in, VecDblRegs:$Vuu32, IntRegs:$Rt32),
+"$Vxx32 += vdmpyhb($Vuu32,$Rt32)",
+PSEUDO, TypeMAPPING>, Requires<[HasV60T,UseHVX]> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isAccumulator = 1;
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+let Constraints = "$Vxx32 = $Vxx32in";
+}
+def V6_vdmpyhb_dv_acc_alt_128B : HInst<
+(outs VecDblRegs128B:$Vxx32),
+(ins VecDblRegs128B:$Vxx32in, VecDblRegs128B:$Vuu32, IntRegs:$Rt32),
+"$Vxx32 += vdmpyhb($Vuu32,$Rt32)",
+PSEUDO, TypeMAPPING>, Requires<[HasV60T,UseHVX]> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isAccumulator = 1;
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+let Constraints = "$Vxx32 = $Vxx32in";
+}
+def V6_vdmpyhb_dv_alt : HInst<
+(outs VecDblRegs:$Vdd32),
+(ins VecDblRegs:$Vuu32, IntRegs:$Rt32),
+"$Vdd32 = vdmpyhb($Vuu32,$Rt32)",
+PSEUDO, TypeMAPPING>, Requires<[HasV60T,UseHVX]> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vdmpyhb_dv_alt_128B : HInst<
+(outs VecDblRegs128B:$Vdd32),
+(ins VecDblRegs128B:$Vuu32, IntRegs:$Rt32),
+"$Vdd32 = vdmpyhb($Vuu32,$Rt32)",
+PSEUDO, TypeMAPPING>, Requires<[HasV60T,UseHVX]> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vdmpyhisat : HInst<
+(outs VectorRegs:$Vd32),
+(ins VecDblRegs:$Vuu32, IntRegs:$Rt32),
+"$Vd32.w = vdmpy($Vuu32.h,$Rt32.h):sat",
+CVI_VX_DV, TypeCVI_VX_DV>, Enc_36641, Requires<[HasV60T,UseHVX]> {
+let Inst{7-5} = 0b011;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b00011001001;
+let hasNewValue = 1;
+let opNewValue = 0;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vdmpyhisat_128B : HInst<
+(outs VectorRegs128B:$Vd32),
+(ins VecDblRegs128B:$Vuu32, IntRegs:$Rt32),
+"$Vd32.w = vdmpy($Vuu32.h,$Rt32.h):sat",
+CVI_VX_DV, TypeCVI_VX_DV>, Enc_36641, Requires<[HasV60T,UseHVX]> {
+let Inst{7-5} = 0b011;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b00011001001;
+let hasNewValue = 1;
+let opNewValue = 0;
+let DecoderNamespace = "EXT_mmvec";
+let isCodeGenOnly = 1;
+}
+def V6_vdmpyhisat_acc : HInst<
+(outs VectorRegs:$Vx32),
+(ins VectorRegs:$Vx32in, VecDblRegs:$Vuu32, IntRegs:$Rt32),
+"$Vx32.w += vdmpy($Vuu32.h,$Rt32.h):sat",
+CVI_VX_DV, TypeCVI_VX_DV>, Enc_5890213, Requires<[HasV60T,UseHVX]> {
+let Inst{7-5} = 0b010;
+let Inst{13-13} = 0b1;
+let Inst{31-21} = 0b00011001001;
+let hasNewValue = 1;
+let opNewValue = 0;
+let isAccumulator = 1;
+let DecoderNamespace = "EXT_mmvec";
+let Constraints = "$Vx32 = $Vx32in";
+}
+def V6_vdmpyhisat_acc_128B : HInst<
+(outs VectorRegs128B:$Vx32),
+(ins VectorRegs128B:$Vx32in, VecDblRegs128B:$Vuu32, IntRegs:$Rt32),
+"$Vx32.w += vdmpy($Vuu32.h,$Rt32.h):sat",
+CVI_VX_DV, TypeCVI_VX_DV>, Enc_5890213, Requires<[HasV60T,UseHVX]> {
+let Inst{7-5} = 0b010;
+let Inst{13-13} = 0b1;
+let Inst{31-21} = 0b00011001001;
+let hasNewValue = 1;
+let opNewValue = 0;
+let isAccumulator = 1;
+let DecoderNamespace = "EXT_mmvec";
+let isCodeGenOnly = 1;
+let Constraints = "$Vx32 = $Vx32in";
+}
+def V6_vdmpyhisat_acc_alt : HInst<
+(outs VectorRegs:$Vx32),
+(ins VectorRegs:$Vx32in, VecDblRegs:$Vuu32, IntRegs:$Rt32),
+"$Vx32 += vdmpyh($Vuu32,$Rt32):sat",
+PSEUDO, TypeMAPPING>, Requires<[HasV60T,UseHVX]> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isAccumulator = 1;
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+let Constraints = "$Vx32 = $Vx32in";
+}
+def V6_vdmpyhisat_acc_alt_128B : HInst<
+(outs VectorRegs128B:$Vx32),
+(ins VectorRegs128B:$Vx32in, VecDblRegs128B:$Vuu32, IntRegs:$Rt32),
+"$Vx32 += vdmpyh($Vuu32,$Rt32):sat",
+PSEUDO, TypeMAPPING>, Requires<[HasV60T,UseHVX]> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isAccumulator = 1;
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+let Constraints = "$Vx32 = $Vx32in";
+}
+def V6_vdmpyhisat_alt : HInst<
+(outs VectorRegs:$Vd32),
+(ins VecDblRegs:$Vuu32, IntRegs:$Rt32),
+"$Vd32 = vdmpyh($Vuu32,$Rt32):sat",
+PSEUDO, TypeMAPPING>, Requires<[HasV60T,UseHVX]> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vdmpyhisat_alt_128B : HInst<
+(outs VectorRegs128B:$Vd32),
+(ins VecDblRegs128B:$Vuu32, IntRegs:$Rt32),
+"$Vd32 = vdmpyh($Vuu32,$Rt32):sat",
+PSEUDO, TypeMAPPING>, Requires<[HasV60T,UseHVX]> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vdmpyhsat : HInst<
+(outs VectorRegs:$Vd32),
+(ins VectorRegs:$Vu32, IntRegs:$Rt32),
+"$Vd32.w = vdmpy($Vu32.h,$Rt32.h):sat",
+CVI_VX_DV, TypeCVI_VX_DV>, Enc_16214129, Requires<[HasV60T,UseHVX]> {
+let Inst{7-5} = 0b010;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b00011001001;
+let hasNewValue = 1;
+let opNewValue = 0;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vdmpyhsat_128B : HInst<
+(outs VectorRegs128B:$Vd32),
+(ins VectorRegs128B:$Vu32, IntRegs:$Rt32),
+"$Vd32.w = vdmpy($Vu32.h,$Rt32.h):sat",
+CVI_VX_DV, TypeCVI_VX_DV>, Enc_16214129, Requires<[HasV60T,UseHVX]> {
+let Inst{7-5} = 0b010;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b00011001001;
+let hasNewValue = 1;
+let opNewValue = 0;
+let DecoderNamespace = "EXT_mmvec";
+let isCodeGenOnly = 1;
+}
+def V6_vdmpyhsat_acc : HInst<
+(outs VectorRegs:$Vx32),
+(ins VectorRegs:$Vx32in, VectorRegs:$Vu32, IntRegs:$Rt32),
+"$Vx32.w += vdmpy($Vu32.h,$Rt32.h):sat",
+CVI_VX_DV, TypeCVI_VX_DV>, Enc_10058269, Requires<[HasV60T,UseHVX]> {
+let Inst{7-5} = 0b011;
+let Inst{13-13} = 0b1;
+let Inst{31-21} = 0b00011001001;
+let hasNewValue = 1;
+let opNewValue = 0;
+let isAccumulator = 1;
+let DecoderNamespace = "EXT_mmvec";
+let Constraints = "$Vx32 = $Vx32in";
+}
+def V6_vdmpyhsat_acc_128B : HInst<
+(outs VectorRegs128B:$Vx32),
+(ins VectorRegs128B:$Vx32in, VectorRegs128B:$Vu32, IntRegs:$Rt32),
+"$Vx32.w += vdmpy($Vu32.h,$Rt32.h):sat",
+CVI_VX_DV, TypeCVI_VX_DV>, Enc_10058269, Requires<[HasV60T,UseHVX]> {
+let Inst{7-5} = 0b011;
+let Inst{13-13} = 0b1;
+let Inst{31-21} = 0b00011001001;
+let hasNewValue = 1;
+let opNewValue = 0;
+let isAccumulator = 1;
+let DecoderNamespace = "EXT_mmvec";
+let isCodeGenOnly = 1;
+let Constraints = "$Vx32 = $Vx32in";
+}
+def V6_vdmpyhsat_acc_alt : HInst<
+(outs VectorRegs:$Vx32),
+(ins VectorRegs:$Vx32in, VectorRegs:$Vu32, IntRegs:$Rt32),
+"$Vx32 += vdmpyh($Vu32,$Rt32):sat",
+PSEUDO, TypeMAPPING>, Requires<[HasV60T,UseHVX]> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isAccumulator = 1;
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+let Constraints = "$Vx32 = $Vx32in";
+}
+def V6_vdmpyhsat_acc_alt_128B : HInst<
+(outs VectorRegs128B:$Vx32),
+(ins VectorRegs128B:$Vx32in, VectorRegs128B:$Vu32, IntRegs:$Rt32),
+"$Vx32 += vdmpyh($Vu32,$Rt32):sat",
+PSEUDO, TypeMAPPING>, Requires<[HasV60T,UseHVX]> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isAccumulator = 1;
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+let Constraints = "$Vx32 = $Vx32in";
+}
+def V6_vdmpyhsat_alt : HInst<
+(outs VectorRegs:$Vd32),
+(ins VectorRegs:$Vu32, IntRegs:$Rt32),
+"$Vd32 = vdmpyh($Vu32,$Rt32):sat",
+PSEUDO, TypeMAPPING>, Requires<[HasV60T,UseHVX]> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vdmpyhsat_alt_128B : HInst<
+(outs VectorRegs128B:$Vd32),
+(ins VectorRegs128B:$Vu32, IntRegs:$Rt32),
+"$Vd32 = vdmpyh($Vu32,$Rt32):sat",
+PSEUDO, TypeMAPPING>, Requires<[HasV60T,UseHVX]> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vdmpyhsuisat : HInst<
+(outs VectorRegs:$Vd32),
+(ins VecDblRegs:$Vuu32, IntRegs:$Rt32),
+"$Vd32.w = vdmpy($Vuu32.h,$Rt32.uh,#1):sat",
+CVI_VX_DV, TypeCVI_VX_DV>, Enc_36641, Requires<[HasV60T,UseHVX]> {
+let Inst{7-5} = 0b001;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b00011001001;
+let hasNewValue = 1;
+let opNewValue = 0;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vdmpyhsuisat_128B : HInst<
+(outs VectorRegs128B:$Vd32),
+(ins VecDblRegs128B:$Vuu32, IntRegs:$Rt32),
+"$Vd32.w = vdmpy($Vuu32.h,$Rt32.uh,#1):sat",
+CVI_VX_DV, TypeCVI_VX_DV>, Enc_36641, Requires<[HasV60T,UseHVX]> {
+let Inst{7-5} = 0b001;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b00011001001;
+let hasNewValue = 1;
+let opNewValue = 0;
+let DecoderNamespace = "EXT_mmvec";
+let isCodeGenOnly = 1;
+}
+def V6_vdmpyhsuisat_acc : HInst<
+(outs VectorRegs:$Vx32),
+(ins VectorRegs:$Vx32in, VecDblRegs:$Vuu32, IntRegs:$Rt32),
+"$Vx32.w += vdmpy($Vuu32.h,$Rt32.uh,#1):sat",
+CVI_VX_DV, TypeCVI_VX_DV>, Enc_5890213, Requires<[HasV60T,UseHVX]> {
+let Inst{7-5} = 0b001;
+let Inst{13-13} = 0b1;
+let Inst{31-21} = 0b00011001001;
+let hasNewValue = 1;
+let opNewValue = 0;
+let isAccumulator = 1;
+let DecoderNamespace = "EXT_mmvec";
+let Constraints = "$Vx32 = $Vx32in";
+}
+def V6_vdmpyhsuisat_acc_128B : HInst<
+(outs VectorRegs128B:$Vx32),
+(ins VectorRegs128B:$Vx32in, VecDblRegs128B:$Vuu32, IntRegs:$Rt32),
+"$Vx32.w += vdmpy($Vuu32.h,$Rt32.uh,#1):sat",
+CVI_VX_DV, TypeCVI_VX_DV>, Enc_5890213, Requires<[HasV60T,UseHVX]> {
+let Inst{7-5} = 0b001;
+let Inst{13-13} = 0b1;
+let Inst{31-21} = 0b00011001001;
+let hasNewValue = 1;
+let opNewValue = 0;
+let isAccumulator = 1;
+let DecoderNamespace = "EXT_mmvec";
+let isCodeGenOnly = 1;
+let Constraints = "$Vx32 = $Vx32in";
+}
+def V6_vdmpyhsuisat_acc_alt : HInst<
+(outs VectorRegs:$Vx32),
+(ins VectorRegs:$Vx32in, VecDblRegs:$Vuu32, IntRegs:$Rt32),
+"$Vx32 += vdmpyhsu($Vuu32,$Rt32,#1):sat",
+PSEUDO, TypeMAPPING>, Requires<[HasV60T,UseHVX]> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isAccumulator = 1;
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+let Constraints = "$Vx32 = $Vx32in";
+}
+def V6_vdmpyhsuisat_acc_alt_128B : HInst<
+(outs VectorRegs128B:$Vx32),
+(ins VectorRegs128B:$Vx32in, VecDblRegs128B:$Vuu32, IntRegs:$Rt32),
+"$Vx32 += vdmpyhsu($Vuu32,$Rt32,#1):sat",
+PSEUDO, TypeMAPPING>, Requires<[HasV60T,UseHVX]> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isAccumulator = 1;
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+let Constraints = "$Vx32 = $Vx32in";
+}
+def V6_vdmpyhsuisat_alt : HInst<
+(outs VectorRegs:$Vd32),
+(ins VecDblRegs:$Vuu32, IntRegs:$Rt32),
+"$Vd32 = vdmpyhsu($Vuu32,$Rt32,#1):sat",
+PSEUDO, TypeMAPPING>, Requires<[HasV60T,UseHVX]> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vdmpyhsuisat_alt_128B : HInst<
+(outs VectorRegs128B:$Vd32),
+(ins VecDblRegs128B:$Vuu32, IntRegs:$Rt32),
+"$Vd32 = vdmpyhsu($Vuu32,$Rt32,#1):sat",
+PSEUDO, TypeMAPPING>, Requires<[HasV60T,UseHVX]> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vdmpyhsusat : HInst<
+(outs VectorRegs:$Vd32),
+(ins VectorRegs:$Vu32, IntRegs:$Rt32),
+"$Vd32.w = vdmpy($Vu32.h,$Rt32.uh):sat",
+CVI_VX_DV, TypeCVI_VX_DV>, Enc_16214129, Requires<[HasV60T,UseHVX]> {
+let Inst{7-5} = 0b000;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b00011001001;
+let hasNewValue = 1;
+let opNewValue = 0;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vdmpyhsusat_128B : HInst<
+(outs VectorRegs128B:$Vd32),
+(ins VectorRegs128B:$Vu32, IntRegs:$Rt32),
+"$Vd32.w = vdmpy($Vu32.h,$Rt32.uh):sat",
+CVI_VX_DV, TypeCVI_VX_DV>, Enc_16214129, Requires<[HasV60T,UseHVX]> {
+let Inst{7-5} = 0b000;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b00011001001;
+let hasNewValue = 1;
+let opNewValue = 0;
+let DecoderNamespace = "EXT_mmvec";
+let isCodeGenOnly = 1;
+}
+def V6_vdmpyhsusat_acc : HInst<
+(outs VectorRegs:$Vx32),
+(ins VectorRegs:$Vx32in, VectorRegs:$Vu32, IntRegs:$Rt32),
+"$Vx32.w += vdmpy($Vu32.h,$Rt32.uh):sat",
+CVI_VX_DV, TypeCVI_VX_DV>, Enc_10058269, Requires<[HasV60T,UseHVX]> {
+let Inst{7-5} = 0b000;
+let Inst{13-13} = 0b1;
+let Inst{31-21} = 0b00011001001;
+let hasNewValue = 1;
+let opNewValue = 0;
+let isAccumulator = 1;
+let DecoderNamespace = "EXT_mmvec";
+let Constraints = "$Vx32 = $Vx32in";
+}
+def V6_vdmpyhsusat_acc_128B : HInst<
+(outs VectorRegs128B:$Vx32),
+(ins VectorRegs128B:$Vx32in, VectorRegs128B:$Vu32, IntRegs:$Rt32),
+"$Vx32.w += vdmpy($Vu32.h,$Rt32.uh):sat",
+CVI_VX_DV, TypeCVI_VX_DV>, Enc_10058269, Requires<[HasV60T,UseHVX]> {
+let Inst{7-5} = 0b000;
+let Inst{13-13} = 0b1;
+let Inst{31-21} = 0b00011001001;
+let hasNewValue = 1;
+let opNewValue = 0;
+let isAccumulator = 1;
+let DecoderNamespace = "EXT_mmvec";
+let isCodeGenOnly = 1;
+let Constraints = "$Vx32 = $Vx32in";
+}
+def V6_vdmpyhsusat_acc_alt : HInst<
+(outs VectorRegs:$Vx32),
+(ins VectorRegs:$Vx32in, VectorRegs:$Vu32, IntRegs:$Rt32),
+"$Vx32 += vdmpyhsu($Vu32,$Rt32):sat",
+PSEUDO, TypeMAPPING>, Requires<[HasV60T,UseHVX]> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isAccumulator = 1;
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+let Constraints = "$Vx32 = $Vx32in";
+}
+def V6_vdmpyhsusat_acc_alt_128B : HInst<
+(outs VectorRegs128B:$Vx32),
+(ins VectorRegs128B:$Vx32in, VectorRegs128B:$Vu32, IntRegs:$Rt32),
+"$Vx32 += vdmpyhsu($Vu32,$Rt32):sat",
+PSEUDO, TypeMAPPING>, Requires<[HasV60T,UseHVX]> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isAccumulator = 1;
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+let Constraints = "$Vx32 = $Vx32in";
+}
+def V6_vdmpyhsusat_alt : HInst<
+(outs VectorRegs:$Vd32),
+(ins VectorRegs:$Vu32, IntRegs:$Rt32),
+"$Vd32 = vdmpyhsu($Vu32,$Rt32):sat",
+PSEUDO, TypeMAPPING>, Requires<[HasV60T,UseHVX]> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vdmpyhsusat_alt_128B : HInst<
+(outs VectorRegs128B:$Vd32),
+(ins VectorRegs128B:$Vu32, IntRegs:$Rt32),
+"$Vd32 = vdmpyhsu($Vu32,$Rt32):sat",
+PSEUDO, TypeMAPPING>, Requires<[HasV60T,UseHVX]> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vdmpyhvsat : HInst<
+(outs VectorRegs:$Vd32),
+(ins VectorRegs:$Vu32, VectorRegs:$Vv32),
+"$Vd32.w = vdmpy($Vu32.h,$Vv32.h):sat",
+CVI_VX_DV, TypeCVI_VX_DV>, Enc_6223403, Requires<[HasV60T,UseHVX]> {
+let Inst{7-5} = 0b011;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b00011100000;
+let hasNewValue = 1;
+let opNewValue = 0;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vdmpyhvsat_128B : HInst<
+(outs VectorRegs128B:$Vd32),
+(ins VectorRegs128B:$Vu32, VectorRegs128B:$Vv32),
+"$Vd32.w = vdmpy($Vu32.h,$Vv32.h):sat",
+CVI_VX_DV, TypeCVI_VX_DV>, Enc_6223403, Requires<[HasV60T,UseHVX]> {
+let Inst{7-5} = 0b011;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b00011100000;
+let hasNewValue = 1;
+let opNewValue = 0;
+let DecoderNamespace = "EXT_mmvec";
+let isCodeGenOnly = 1;
+}
+def V6_vdmpyhvsat_acc : HInst<
+(outs VectorRegs:$Vx32),
+(ins VectorRegs:$Vx32in, VectorRegs:$Vu32, VectorRegs:$Vv32),
+"$Vx32.w += vdmpy($Vu32.h,$Vv32.h):sat",
+CVI_VX_DV, TypeCVI_VX_DV>, Enc_2328527, Requires<[HasV60T,UseHVX]> {
+let Inst{7-5} = 0b011;
+let Inst{13-13} = 0b1;
+let Inst{31-21} = 0b00011100000;
+let hasNewValue = 1;
+let opNewValue = 0;
+let isAccumulator = 1;
+let DecoderNamespace = "EXT_mmvec";
+let Constraints = "$Vx32 = $Vx32in";
+}
+def V6_vdmpyhvsat_acc_128B : HInst<
+(outs VectorRegs128B:$Vx32),
+(ins VectorRegs128B:$Vx32in, VectorRegs128B:$Vu32, VectorRegs128B:$Vv32),
+"$Vx32.w += vdmpy($Vu32.h,$Vv32.h):sat",
+CVI_VX_DV, TypeCVI_VX_DV>, Enc_2328527, Requires<[HasV60T,UseHVX]> {
+let Inst{7-5} = 0b011;
+let Inst{13-13} = 0b1;
+let Inst{31-21} = 0b00011100000;
+let hasNewValue = 1;
+let opNewValue = 0;
+let isAccumulator = 1;
+let DecoderNamespace = "EXT_mmvec";
+let isCodeGenOnly = 1;
+let Constraints = "$Vx32 = $Vx32in";
+}
+def V6_vdmpyhvsat_acc_alt : HInst<
+(outs VectorRegs:$Vx32),
+(ins VectorRegs:$Vx32in, VectorRegs:$Vu32, VectorRegs:$Vv32),
+"$Vx32 += vdmpyh($Vu32,$Vv32):sat",
+PSEUDO, TypeMAPPING>, Requires<[HasV60T,UseHVX]> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isAccumulator = 1;
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+let Constraints = "$Vx32 = $Vx32in";
+}
+def V6_vdmpyhvsat_acc_alt_128B : HInst<
+(outs VectorRegs128B:$Vx32),
+(ins VectorRegs128B:$Vx32in, VectorRegs128B:$Vu32, VectorRegs128B:$Vv32),
+"$Vx32 += vdmpyh($Vu32,$Vv32):sat",
+PSEUDO, TypeMAPPING>, Requires<[HasV60T,UseHVX]> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isAccumulator = 1;
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+let Constraints = "$Vx32 = $Vx32in";
+}
+def V6_vdmpyhvsat_alt : HInst<
+(outs VectorRegs:$Vd32),
+(ins VectorRegs:$Vu32, VectorRegs:$Vv32),
+"$Vd32 = vdmpyh($Vu32,$Vv32):sat",
+PSEUDO, TypeMAPPING>, Requires<[HasV60T,UseHVX]> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vdmpyhvsat_alt_128B : HInst<
+(outs VectorRegs128B:$Vd32),
+(ins VectorRegs128B:$Vu32, VectorRegs128B:$Vv32),
+"$Vd32 = vdmpyh($Vu32,$Vv32):sat",
+PSEUDO, TypeMAPPING>, Requires<[HasV60T,UseHVX]> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vdsaduh : HInst<
+(outs VecDblRegs:$Vdd32),
+(ins VecDblRegs:$Vuu32, IntRegs:$Rt32),
+"$Vdd32.uw = vdsad($Vuu32.uh,$Rt32.uh)",
+CVI_VX_DV, TypeCVI_VX_DV>, Enc_5023792, Requires<[HasV60T,UseHVX]> {
+let Inst{7-5} = 0b101;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b00011001000;
+let hasNewValue = 1;
+let opNewValue = 0;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vdsaduh_128B : HInst<
+(outs VecDblRegs128B:$Vdd32),
+(ins VecDblRegs128B:$Vuu32, IntRegs:$Rt32),
+"$Vdd32.uw = vdsad($Vuu32.uh,$Rt32.uh)",
+CVI_VX_DV, TypeCVI_VX_DV>, Enc_5023792, Requires<[HasV60T,UseHVX]> {
+let Inst{7-5} = 0b101;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b00011001000;
+let hasNewValue = 1;
+let opNewValue = 0;
+let DecoderNamespace = "EXT_mmvec";
+let isCodeGenOnly = 1;
+}
+def V6_vdsaduh_acc : HInst<
+(outs VecDblRegs:$Vxx32),
+(ins VecDblRegs:$Vxx32in, VecDblRegs:$Vuu32, IntRegs:$Rt32),
+"$Vxx32.uw += vdsad($Vuu32.uh,$Rt32.uh)",
+CVI_VX_DV, TypeCVI_VX_DV>, Enc_4327792, Requires<[HasV60T,UseHVX]> {
+let Inst{7-5} = 0b000;
+let Inst{13-13} = 0b1;
+let Inst{31-21} = 0b00011001011;
+let hasNewValue = 1;
+let opNewValue = 0;
+let isAccumulator = 1;
+let DecoderNamespace = "EXT_mmvec";
+let Constraints = "$Vxx32 = $Vxx32in";
+}
+def V6_vdsaduh_acc_128B : HInst<
+(outs VecDblRegs128B:$Vxx32),
+(ins VecDblRegs128B:$Vxx32in, VecDblRegs128B:$Vuu32, IntRegs:$Rt32),
+"$Vxx32.uw += vdsad($Vuu32.uh,$Rt32.uh)",
+CVI_VX_DV, TypeCVI_VX_DV>, Enc_4327792, Requires<[HasV60T,UseHVX]> {
+let Inst{7-5} = 0b000;
+let Inst{13-13} = 0b1;
+let Inst{31-21} = 0b00011001011;
+let hasNewValue = 1;
+let opNewValue = 0;
+let isAccumulator = 1;
+let DecoderNamespace = "EXT_mmvec";
+let isCodeGenOnly = 1;
+let Constraints = "$Vxx32 = $Vxx32in";
+}
+def V6_vdsaduh_acc_alt : HInst<
+(outs VecDblRegs:$Vxx32),
+(ins VecDblRegs:$Vxx32in, VecDblRegs:$Vuu32, IntRegs:$Rt32),
+"$Vxx32 += vdsaduh($Vuu32,$Rt32)",
+PSEUDO, TypeMAPPING>, Requires<[HasV60T,UseHVX]> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isAccumulator = 1;
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+let Constraints = "$Vxx32 = $Vxx32in";
+}
+def V6_vdsaduh_acc_alt_128B : HInst<
+(outs VecDblRegs128B:$Vxx32),
+(ins VecDblRegs128B:$Vxx32in, VecDblRegs128B:$Vuu32, IntRegs:$Rt32),
+"$Vxx32 += vdsaduh($Vuu32,$Rt32)",
+PSEUDO, TypeMAPPING>, Requires<[HasV60T,UseHVX]> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isAccumulator = 1;
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+let Constraints = "$Vxx32 = $Vxx32in";
+}
+def V6_vdsaduh_alt : HInst<
+(outs VecDblRegs:$Vdd32),
+(ins VecDblRegs:$Vuu32, IntRegs:$Rt32),
+"$Vdd32 = vdsaduh($Vuu32,$Rt32)",
+PSEUDO, TypeMAPPING>, Requires<[HasV60T,UseHVX]> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vdsaduh_alt_128B : HInst<
+(outs VecDblRegs128B:$Vdd32),
+(ins VecDblRegs128B:$Vuu32, IntRegs:$Rt32),
+"$Vdd32 = vdsaduh($Vuu32,$Rt32)",
+PSEUDO, TypeMAPPING>, Requires<[HasV60T,UseHVX]> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+}
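+// HVX vector-predicate compares: vcmp.eq and vcmp.gt over b/h/w and ub/uh/uw,
+// each with and/or/xor forms accumulating into $Qx4.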
+def V6_veqb : HInst<
+(outs VecPredRegs:$Qd4),
+(ins VectorRegs:$Vu32, VectorRegs:$Vv32),
+"$Qd4 = vcmp.eq($Vu32.b,$Vv32.b)",
+CVI_VA, TypeCVI_VA>, Enc_13983714, Requires<[HasV60T,UseHVX]> {
+let Inst{7-2} = 0b000000;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b00011111100;
+let hasNewValue = 1;
+let opNewValue = 0;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_veqb_128B : HInst<
+(outs VecPredRegs128B:$Qd4),
+(ins VectorRegs128B:$Vu32, VectorRegs128B:$Vv32),
+"$Qd4 = vcmp.eq($Vu32.b,$Vv32.b)",
+CVI_VA, TypeCVI_VA>, Enc_13983714, Requires<[HasV60T,UseHVX]> {
+let Inst{7-2} = 0b000000;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b00011111100;
+let hasNewValue = 1;
+let opNewValue = 0;
+let DecoderNamespace = "EXT_mmvec";
+let isCodeGenOnly = 1;
+}
+def V6_veqb_and : HInst<
+(outs VecPredRegs:$Qx4),
+(ins VecPredRegs:$Qx4in, VectorRegs:$Vu32, VectorRegs:$Vv32),
+"$Qx4 &= vcmp.eq($Vu32.b,$Vv32.b)",
+CVI_VA, TypeCVI_VA>, Enc_7470998, Requires<[HasV60T,UseHVX]> {
+let Inst{7-2} = 0b000000;
+let Inst{13-13} = 0b1;
+let Inst{31-21} = 0b00011100100;
+let hasNewValue = 1;
+let opNewValue = 0;
+let DecoderNamespace = "EXT_mmvec";
+let Constraints = "$Qx4 = $Qx4in";
+}
+def V6_veqb_and_128B : HInst<
+(outs VecPredRegs128B:$Qx4),
+(ins VecPredRegs128B:$Qx4in, VectorRegs128B:$Vu32, VectorRegs128B:$Vv32),
+"$Qx4 &= vcmp.eq($Vu32.b,$Vv32.b)",
+CVI_VA, TypeCVI_VA>, Enc_7470998, Requires<[HasV60T,UseHVX]> {
+let Inst{7-2} = 0b000000;
+let Inst{13-13} = 0b1;
+let Inst{31-21} = 0b00011100100;
+let hasNewValue = 1;
+let opNewValue = 0;
+let DecoderNamespace = "EXT_mmvec";
+let isCodeGenOnly = 1;
+let Constraints = "$Qx4 = $Qx4in";
+}
+def V6_veqb_or : HInst<
+(outs VecPredRegs:$Qx4),
+(ins VecPredRegs:$Qx4in, VectorRegs:$Vu32, VectorRegs:$Vv32),
+"$Qx4 |= vcmp.eq($Vu32.b,$Vv32.b)",
+CVI_VA, TypeCVI_VA>, Enc_7470998, Requires<[HasV60T,UseHVX]> {
+let Inst{7-2} = 0b010000;
+let Inst{13-13} = 0b1;
+let Inst{31-21} = 0b00011100100;
+let hasNewValue = 1;
+let opNewValue = 0;
+let isAccumulator = 1;
+let DecoderNamespace = "EXT_mmvec";
+let Constraints = "$Qx4 = $Qx4in";
+}
+def V6_veqb_or_128B : HInst<
+(outs VecPredRegs128B:$Qx4),
+(ins VecPredRegs128B:$Qx4in, VectorRegs128B:$Vu32, VectorRegs128B:$Vv32),
+"$Qx4 |= vcmp.eq($Vu32.b,$Vv32.b)",
+CVI_VA, TypeCVI_VA>, Enc_7470998, Requires<[HasV60T,UseHVX]> {
+let Inst{7-2} = 0b010000;
+let Inst{13-13} = 0b1;
+let Inst{31-21} = 0b00011100100;
+let hasNewValue = 1;
+let opNewValue = 0;
+let isAccumulator = 1;
+let DecoderNamespace = "EXT_mmvec";
+let isCodeGenOnly = 1;
+let Constraints = "$Qx4 = $Qx4in";
+}
+def V6_veqb_xor : HInst<
+(outs VecPredRegs:$Qx4),
+(ins VecPredRegs:$Qx4in, VectorRegs:$Vu32, VectorRegs:$Vv32),
+"$Qx4 ^= vcmp.eq($Vu32.b,$Vv32.b)",
+CVI_VA, TypeCVI_VA>, Enc_7470998, Requires<[HasV60T,UseHVX]> {
+let Inst{7-2} = 0b100000;
+let Inst{13-13} = 0b1;
+let Inst{31-21} = 0b00011100100;
+let hasNewValue = 1;
+let opNewValue = 0;
+let DecoderNamespace = "EXT_mmvec";
+let Constraints = "$Qx4 = $Qx4in";
+}
+def V6_veqb_xor_128B : HInst<
+(outs VecPredRegs128B:$Qx4),
+(ins VecPredRegs128B:$Qx4in, VectorRegs128B:$Vu32, VectorRegs128B:$Vv32),
+"$Qx4 ^= vcmp.eq($Vu32.b,$Vv32.b)",
+CVI_VA, TypeCVI_VA>, Enc_7470998, Requires<[HasV60T,UseHVX]> {
+let Inst{7-2} = 0b100000;
+let Inst{13-13} = 0b1;
+let Inst{31-21} = 0b00011100100;
+let hasNewValue = 1;
+let opNewValue = 0;
+let DecoderNamespace = "EXT_mmvec";
+let isCodeGenOnly = 1;
+let Constraints = "$Qx4 = $Qx4in";
+}
+def V6_veqh : HInst<
+(outs VecPredRegs:$Qd4),
+(ins VectorRegs:$Vu32, VectorRegs:$Vv32),
+"$Qd4 = vcmp.eq($Vu32.h,$Vv32.h)",
+CVI_VA, TypeCVI_VA>, Enc_13983714, Requires<[HasV60T,UseHVX]> {
+let Inst{7-2} = 0b000001;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b00011111100;
+let hasNewValue = 1;
+let opNewValue = 0;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_veqh_128B : HInst<
+(outs VecPredRegs128B:$Qd4),
+(ins VectorRegs128B:$Vu32, VectorRegs128B:$Vv32),
+"$Qd4 = vcmp.eq($Vu32.h,$Vv32.h)",
+CVI_VA, TypeCVI_VA>, Enc_13983714, Requires<[HasV60T,UseHVX]> {
+let Inst{7-2} = 0b000001;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b00011111100;
+let hasNewValue = 1;
+let opNewValue = 0;
+let DecoderNamespace = "EXT_mmvec";
+let isCodeGenOnly = 1;
+}
+def V6_veqh_and : HInst<
+(outs VecPredRegs:$Qx4),
+(ins VecPredRegs:$Qx4in, VectorRegs:$Vu32, VectorRegs:$Vv32),
+"$Qx4 &= vcmp.eq($Vu32.h,$Vv32.h)",
+CVI_VA, TypeCVI_VA>, Enc_7470998, Requires<[HasV60T,UseHVX]> {
+let Inst{7-2} = 0b000001;
+let Inst{13-13} = 0b1;
+let Inst{31-21} = 0b00011100100;
+let hasNewValue = 1;
+let opNewValue = 0;
+let DecoderNamespace = "EXT_mmvec";
+let Constraints = "$Qx4 = $Qx4in";
+}
+def V6_veqh_and_128B : HInst<
+(outs VecPredRegs128B:$Qx4),
+(ins VecPredRegs128B:$Qx4in, VectorRegs128B:$Vu32, VectorRegs128B:$Vv32),
+"$Qx4 &= vcmp.eq($Vu32.h,$Vv32.h)",
+CVI_VA, TypeCVI_VA>, Enc_7470998, Requires<[HasV60T,UseHVX]> {
+let Inst{7-2} = 0b000001;
+let Inst{13-13} = 0b1;
+let Inst{31-21} = 0b00011100100;
+let hasNewValue = 1;
+let opNewValue = 0;
+let DecoderNamespace = "EXT_mmvec";
+let isCodeGenOnly = 1;
+let Constraints = "$Qx4 = $Qx4in";
+}
+def V6_veqh_or : HInst<
+(outs VecPredRegs:$Qx4),
+(ins VecPredRegs:$Qx4in, VectorRegs:$Vu32, VectorRegs:$Vv32),
+"$Qx4 |= vcmp.eq($Vu32.h,$Vv32.h)",
+CVI_VA, TypeCVI_VA>, Enc_7470998, Requires<[HasV60T,UseHVX]> {
+let Inst{7-2} = 0b010001;
+let Inst{13-13} = 0b1;
+let Inst{31-21} = 0b00011100100;
+let hasNewValue = 1;
+let opNewValue = 0;
+let isAccumulator = 1;
+let DecoderNamespace = "EXT_mmvec";
+let Constraints = "$Qx4 = $Qx4in";
+}
+def V6_veqh_or_128B : HInst<
+(outs VecPredRegs128B:$Qx4),
+(ins VecPredRegs128B:$Qx4in, VectorRegs128B:$Vu32, VectorRegs128B:$Vv32),
+"$Qx4 |= vcmp.eq($Vu32.h,$Vv32.h)",
+CVI_VA, TypeCVI_VA>, Enc_7470998, Requires<[HasV60T,UseHVX]> {
+let Inst{7-2} = 0b010001;
+let Inst{13-13} = 0b1;
+let Inst{31-21} = 0b00011100100;
+let hasNewValue = 1;
+let opNewValue = 0;
+let isAccumulator = 1;
+let DecoderNamespace = "EXT_mmvec";
+let isCodeGenOnly = 1;
+let Constraints = "$Qx4 = $Qx4in";
+}
+def V6_veqh_xor : HInst<
+(outs VecPredRegs:$Qx4),
+(ins VecPredRegs:$Qx4in, VectorRegs:$Vu32, VectorRegs:$Vv32),
+"$Qx4 ^= vcmp.eq($Vu32.h,$Vv32.h)",
+CVI_VA, TypeCVI_VA>, Enc_7470998, Requires<[HasV60T,UseHVX]> {
+let Inst{7-2} = 0b100001;
+let Inst{13-13} = 0b1;
+let Inst{31-21} = 0b00011100100;
+let hasNewValue = 1;
+let opNewValue = 0;
+let DecoderNamespace = "EXT_mmvec";
+let Constraints = "$Qx4 = $Qx4in";
+}
+def V6_veqh_xor_128B : HInst<
+(outs VecPredRegs128B:$Qx4),
+(ins VecPredRegs128B:$Qx4in, VectorRegs128B:$Vu32, VectorRegs128B:$Vv32),
+"$Qx4 ^= vcmp.eq($Vu32.h,$Vv32.h)",
+CVI_VA, TypeCVI_VA>, Enc_7470998, Requires<[HasV60T,UseHVX]> {
+let Inst{7-2} = 0b100001;
+let Inst{13-13} = 0b1;
+let Inst{31-21} = 0b00011100100;
+let hasNewValue = 1;
+let opNewValue = 0;
+let DecoderNamespace = "EXT_mmvec";
+let isCodeGenOnly = 1;
+let Constraints = "$Qx4 = $Qx4in";
+}
+def V6_veqw : HInst<
+(outs VecPredRegs:$Qd4),
+(ins VectorRegs:$Vu32, VectorRegs:$Vv32),
+"$Qd4 = vcmp.eq($Vu32.w,$Vv32.w)",
+CVI_VA, TypeCVI_VA>, Enc_13983714, Requires<[HasV60T,UseHVX]> {
+let Inst{7-2} = 0b000010;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b00011111100;
+let hasNewValue = 1;
+let opNewValue = 0;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_veqw_128B : HInst<
+(outs VecPredRegs128B:$Qd4),
+(ins VectorRegs128B:$Vu32, VectorRegs128B:$Vv32),
+"$Qd4 = vcmp.eq($Vu32.w,$Vv32.w)",
+CVI_VA, TypeCVI_VA>, Enc_13983714, Requires<[HasV60T,UseHVX]> {
+let Inst{7-2} = 0b000010;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b00011111100;
+let hasNewValue = 1;
+let opNewValue = 0;
+let DecoderNamespace = "EXT_mmvec";
+let isCodeGenOnly = 1;
+}
+def V6_veqw_and : HInst<
+(outs VecPredRegs:$Qx4),
+(ins VecPredRegs:$Qx4in, VectorRegs:$Vu32, VectorRegs:$Vv32),
+"$Qx4 &= vcmp.eq($Vu32.w,$Vv32.w)",
+CVI_VA, TypeCVI_VA>, Enc_7470998, Requires<[HasV60T,UseHVX]> {
+let Inst{7-2} = 0b000010;
+let Inst{13-13} = 0b1;
+let Inst{31-21} = 0b00011100100;
+let hasNewValue = 1;
+let opNewValue = 0;
+let DecoderNamespace = "EXT_mmvec";
+let Constraints = "$Qx4 = $Qx4in";
+}
+def V6_veqw_and_128B : HInst<
+(outs VecPredRegs128B:$Qx4),
+(ins VecPredRegs128B:$Qx4in, VectorRegs128B:$Vu32, VectorRegs128B:$Vv32),
+"$Qx4 &= vcmp.eq($Vu32.w,$Vv32.w)",
+CVI_VA, TypeCVI_VA>, Enc_7470998, Requires<[HasV60T,UseHVX]> {
+let Inst{7-2} = 0b000010;
+let Inst{13-13} = 0b1;
+let Inst{31-21} = 0b00011100100;
+let hasNewValue = 1;
+let opNewValue = 0;
+let DecoderNamespace = "EXT_mmvec";
+let isCodeGenOnly = 1;
+let Constraints = "$Qx4 = $Qx4in";
+}
+def V6_veqw_or : HInst<
+(outs VecPredRegs:$Qx4),
+(ins VecPredRegs:$Qx4in, VectorRegs:$Vu32, VectorRegs:$Vv32),
+"$Qx4 |= vcmp.eq($Vu32.w,$Vv32.w)",
+CVI_VA, TypeCVI_VA>, Enc_7470998, Requires<[HasV60T,UseHVX]> {
+let Inst{7-2} = 0b010010;
+let Inst{13-13} = 0b1;
+let Inst{31-21} = 0b00011100100;
+let hasNewValue = 1;
+let opNewValue = 0;
+let isAccumulator = 1;
+let DecoderNamespace = "EXT_mmvec";
+let Constraints = "$Qx4 = $Qx4in";
+}
+def V6_veqw_or_128B : HInst<
+(outs VecPredRegs128B:$Qx4),
+(ins VecPredRegs128B:$Qx4in, VectorRegs128B:$Vu32, VectorRegs128B:$Vv32),
+"$Qx4 |= vcmp.eq($Vu32.w,$Vv32.w)",
+CVI_VA, TypeCVI_VA>, Enc_7470998, Requires<[HasV60T,UseHVX]> {
+let Inst{7-2} = 0b010010;
+let Inst{13-13} = 0b1;
+let Inst{31-21} = 0b00011100100;
+let hasNewValue = 1;
+let opNewValue = 0;
+let isAccumulator = 1;
+let DecoderNamespace = "EXT_mmvec";
+let isCodeGenOnly = 1;
+let Constraints = "$Qx4 = $Qx4in";
+}
+def V6_veqw_xor : HInst<
+(outs VecPredRegs:$Qx4),
+(ins VecPredRegs:$Qx4in, VectorRegs:$Vu32, VectorRegs:$Vv32),
+"$Qx4 ^= vcmp.eq($Vu32.w,$Vv32.w)",
+CVI_VA, TypeCVI_VA>, Enc_7470998, Requires<[HasV60T,UseHVX]> {
+let Inst{7-2} = 0b100010;
+let Inst{13-13} = 0b1;
+let Inst{31-21} = 0b00011100100;
+let hasNewValue = 1;
+let opNewValue = 0;
+let DecoderNamespace = "EXT_mmvec";
+let Constraints = "$Qx4 = $Qx4in";
+}
+def V6_veqw_xor_128B : HInst<
+(outs VecPredRegs128B:$Qx4),
+(ins VecPredRegs128B:$Qx4in, VectorRegs128B:$Vu32, VectorRegs128B:$Vv32),
+"$Qx4 ^= vcmp.eq($Vu32.w,$Vv32.w)",
+CVI_VA, TypeCVI_VA>, Enc_7470998, Requires<[HasV60T,UseHVX]> {
+let Inst{7-2} = 0b100010;
+let Inst{13-13} = 0b1;
+let Inst{31-21} = 0b00011100100;
+let hasNewValue = 1;
+let opNewValue = 0;
+let DecoderNamespace = "EXT_mmvec";
+let isCodeGenOnly = 1;
+let Constraints = "$Qx4 = $Qx4in";
+}
+def V6_vgtb : HInst<
+(outs VecPredRegs:$Qd4),
+(ins VectorRegs:$Vu32, VectorRegs:$Vv32),
+"$Qd4 = vcmp.gt($Vu32.b,$Vv32.b)",
+CVI_VA, TypeCVI_VA>, Enc_13983714, Requires<[HasV60T,UseHVX]> {
+let Inst{7-2} = 0b000100;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b00011111100;
+let hasNewValue = 1;
+let opNewValue = 0;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vgtb_128B : HInst<
+(outs VecPredRegs128B:$Qd4),
+(ins VectorRegs128B:$Vu32, VectorRegs128B:$Vv32),
+"$Qd4 = vcmp.gt($Vu32.b,$Vv32.b)",
+CVI_VA, TypeCVI_VA>, Enc_13983714, Requires<[HasV60T,UseHVX]> {
+let Inst{7-2} = 0b000100;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b00011111100;
+let hasNewValue = 1;
+let opNewValue = 0;
+let DecoderNamespace = "EXT_mmvec";
+let isCodeGenOnly = 1;
+}
+def V6_vgtb_and : HInst<
+(outs VecPredRegs:$Qx4),
+(ins VecPredRegs:$Qx4in, VectorRegs:$Vu32, VectorRegs:$Vv32),
+"$Qx4 &= vcmp.gt($Vu32.b,$Vv32.b)",
+CVI_VA, TypeCVI_VA>, Enc_7470998, Requires<[HasV60T,UseHVX]> {
+let Inst{7-2} = 0b000100;
+let Inst{13-13} = 0b1;
+let Inst{31-21} = 0b00011100100;
+let hasNewValue = 1;
+let opNewValue = 0;
+let DecoderNamespace = "EXT_mmvec";
+let Constraints = "$Qx4 = $Qx4in";
+}
+def V6_vgtb_and_128B : HInst<
+(outs VecPredRegs128B:$Qx4),
+(ins VecPredRegs128B:$Qx4in, VectorRegs128B:$Vu32, VectorRegs128B:$Vv32),
+"$Qx4 &= vcmp.gt($Vu32.b,$Vv32.b)",
+CVI_VA, TypeCVI_VA>, Enc_7470998, Requires<[HasV60T,UseHVX]> {
+let Inst{7-2} = 0b000100;
+let Inst{13-13} = 0b1;
+let Inst{31-21} = 0b00011100100;
+let hasNewValue = 1;
+let opNewValue = 0;
+let DecoderNamespace = "EXT_mmvec";
+let isCodeGenOnly = 1;
+let Constraints = "$Qx4 = $Qx4in";
+}
+def V6_vgtb_or : HInst<
+(outs VecPredRegs:$Qx4),
+(ins VecPredRegs:$Qx4in, VectorRegs:$Vu32, VectorRegs:$Vv32),
+"$Qx4 |= vcmp.gt($Vu32.b,$Vv32.b)",
+CVI_VA, TypeCVI_VA>, Enc_7470998, Requires<[HasV60T,UseHVX]> {
+let Inst{7-2} = 0b010100;
+let Inst{13-13} = 0b1;
+let Inst{31-21} = 0b00011100100;
+let hasNewValue = 1;
+let opNewValue = 0;
+let isAccumulator = 1;
+let DecoderNamespace = "EXT_mmvec";
+let Constraints = "$Qx4 = $Qx4in";
+}
+def V6_vgtb_or_128B : HInst<
+(outs VecPredRegs128B:$Qx4),
+(ins VecPredRegs128B:$Qx4in, VectorRegs128B:$Vu32, VectorRegs128B:$Vv32),
+"$Qx4 |= vcmp.gt($Vu32.b,$Vv32.b)",
+CVI_VA, TypeCVI_VA>, Enc_7470998, Requires<[HasV60T,UseHVX]> {
+let Inst{7-2} = 0b010100;
+let Inst{13-13} = 0b1;
+let Inst{31-21} = 0b00011100100;
+let hasNewValue = 1;
+let opNewValue = 0;
+let isAccumulator = 1;
+let DecoderNamespace = "EXT_mmvec";
+let isCodeGenOnly = 1;
+let Constraints = "$Qx4 = $Qx4in";
+}
+def V6_vgtb_xor : HInst<
+(outs VecPredRegs:$Qx4),
+(ins VecPredRegs:$Qx4in, VectorRegs:$Vu32, VectorRegs:$Vv32),
+"$Qx4 ^= vcmp.gt($Vu32.b,$Vv32.b)",
+CVI_VA, TypeCVI_VA>, Enc_7470998, Requires<[HasV60T,UseHVX]> {
+let Inst{7-2} = 0b100100;
+let Inst{13-13} = 0b1;
+let Inst{31-21} = 0b00011100100;
+let hasNewValue = 1;
+let opNewValue = 0;
+let DecoderNamespace = "EXT_mmvec";
+let Constraints = "$Qx4 = $Qx4in";
+}
+def V6_vgtb_xor_128B : HInst<
+(outs VecPredRegs128B:$Qx4),
+(ins VecPredRegs128B:$Qx4in, VectorRegs128B:$Vu32, VectorRegs128B:$Vv32),
+"$Qx4 ^= vcmp.gt($Vu32.b,$Vv32.b)",
+CVI_VA, TypeCVI_VA>, Enc_7470998, Requires<[HasV60T,UseHVX]> {
+let Inst{7-2} = 0b100100;
+let Inst{13-13} = 0b1;
+let Inst{31-21} = 0b00011100100;
+let hasNewValue = 1;
+let opNewValue = 0;
+let DecoderNamespace = "EXT_mmvec";
+let isCodeGenOnly = 1;
+let Constraints = "$Qx4 = $Qx4in";
+}
+def V6_vgth : HInst<
+(outs VecPredRegs:$Qd4),
+(ins VectorRegs:$Vu32, VectorRegs:$Vv32),
+"$Qd4 = vcmp.gt($Vu32.h,$Vv32.h)",
+CVI_VA, TypeCVI_VA>, Enc_13983714, Requires<[HasV60T,UseHVX]> {
+let Inst{7-2} = 0b000101;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b00011111100;
+let hasNewValue = 1;
+let opNewValue = 0;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vgth_128B : HInst<
+(outs VecPredRegs128B:$Qd4),
+(ins VectorRegs128B:$Vu32, VectorRegs128B:$Vv32),
+"$Qd4 = vcmp.gt($Vu32.h,$Vv32.h)",
+CVI_VA, TypeCVI_VA>, Enc_13983714, Requires<[HasV60T,UseHVX]> {
+let Inst{7-2} = 0b000101;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b00011111100;
+let hasNewValue = 1;
+let opNewValue = 0;
+let DecoderNamespace = "EXT_mmvec";
+let isCodeGenOnly = 1;
+}
+def V6_vgth_and : HInst<
+(outs VecPredRegs:$Qx4),
+(ins VecPredRegs:$Qx4in, VectorRegs:$Vu32, VectorRegs:$Vv32),
+"$Qx4 &= vcmp.gt($Vu32.h,$Vv32.h)",
+CVI_VA, TypeCVI_VA>, Enc_7470998, Requires<[HasV60T,UseHVX]> {
+let Inst{7-2} = 0b000101;
+let Inst{13-13} = 0b1;
+let Inst{31-21} = 0b00011100100;
+let hasNewValue = 1;
+let opNewValue = 0;
+let DecoderNamespace = "EXT_mmvec";
+let Constraints = "$Qx4 = $Qx4in";
+}
+def V6_vgth_and_128B : HInst<
+(outs VecPredRegs128B:$Qx4),
+(ins VecPredRegs128B:$Qx4in, VectorRegs128B:$Vu32, VectorRegs128B:$Vv32),
+"$Qx4 &= vcmp.gt($Vu32.h,$Vv32.h)",
+CVI_VA, TypeCVI_VA>, Enc_7470998, Requires<[HasV60T,UseHVX]> {
+let Inst{7-2} = 0b000101;
+let Inst{13-13} = 0b1;
+let Inst{31-21} = 0b00011100100;
+let hasNewValue = 1;
+let opNewValue = 0;
+let DecoderNamespace = "EXT_mmvec";
+let isCodeGenOnly = 1;
+let Constraints = "$Qx4 = $Qx4in";
+}
+def V6_vgth_or : HInst<
+(outs VecPredRegs:$Qx4),
+(ins VecPredRegs:$Qx4in, VectorRegs:$Vu32, VectorRegs:$Vv32),
+"$Qx4 |= vcmp.gt($Vu32.h,$Vv32.h)",
+CVI_VA, TypeCVI_VA>, Enc_7470998, Requires<[HasV60T,UseHVX]> {
+let Inst{7-2} = 0b010101;
+let Inst{13-13} = 0b1;
+let Inst{31-21} = 0b00011100100;
+let hasNewValue = 1;
+let opNewValue = 0;
+let isAccumulator = 1;
+let DecoderNamespace = "EXT_mmvec";
+let Constraints = "$Qx4 = $Qx4in";
+}
+def V6_vgth_or_128B : HInst<
+(outs VecPredRegs128B:$Qx4),
+(ins VecPredRegs128B:$Qx4in, VectorRegs128B:$Vu32, VectorRegs128B:$Vv32),
+"$Qx4 |= vcmp.gt($Vu32.h,$Vv32.h)",
+CVI_VA, TypeCVI_VA>, Enc_7470998, Requires<[HasV60T,UseHVX]> {
+let Inst{7-2} = 0b010101;
+let Inst{13-13} = 0b1;
+let Inst{31-21} = 0b00011100100;
+let hasNewValue = 1;
+let opNewValue = 0;
+let isAccumulator = 1;
+let DecoderNamespace = "EXT_mmvec";
+let isCodeGenOnly = 1;
+let Constraints = "$Qx4 = $Qx4in";
+}
+def V6_vgth_xor : HInst<
+(outs VecPredRegs:$Qx4),
+(ins VecPredRegs:$Qx4in, VectorRegs:$Vu32, VectorRegs:$Vv32),
+"$Qx4 ^= vcmp.gt($Vu32.h,$Vv32.h)",
+CVI_VA, TypeCVI_VA>, Enc_7470998, Requires<[HasV60T,UseHVX]> {
+let Inst{7-2} = 0b100101;
+let Inst{13-13} = 0b1;
+let Inst{31-21} = 0b00011100100;
+let hasNewValue = 1;
+let opNewValue = 0;
+let DecoderNamespace = "EXT_mmvec";
+let Constraints = "$Qx4 = $Qx4in";
+}
+def V6_vgth_xor_128B : HInst<
+(outs VecPredRegs128B:$Qx4),
+(ins VecPredRegs128B:$Qx4in, VectorRegs128B:$Vu32, VectorRegs128B:$Vv32),
+"$Qx4 ^= vcmp.gt($Vu32.h,$Vv32.h)",
+CVI_VA, TypeCVI_VA>, Enc_7470998, Requires<[HasV60T,UseHVX]> {
+let Inst{7-2} = 0b100101;
+let Inst{13-13} = 0b1;
+let Inst{31-21} = 0b00011100100;
+let hasNewValue = 1;
+let opNewValue = 0;
+let DecoderNamespace = "EXT_mmvec";
+let isCodeGenOnly = 1;
+let Constraints = "$Qx4 = $Qx4in";
+}
+def V6_vgtub : HInst<
+(outs VecPredRegs:$Qd4),
+(ins VectorRegs:$Vu32, VectorRegs:$Vv32),
+"$Qd4 = vcmp.gt($Vu32.ub,$Vv32.ub)",
+CVI_VA, TypeCVI_VA>, Enc_13983714, Requires<[HasV60T,UseHVX]> {
+let Inst{7-2} = 0b001000;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b00011111100;
+let hasNewValue = 1;
+let opNewValue = 0;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vgtub_128B : HInst<
+(outs VecPredRegs128B:$Qd4),
+(ins VectorRegs128B:$Vu32, VectorRegs128B:$Vv32),
+"$Qd4 = vcmp.gt($Vu32.ub,$Vv32.ub)",
+CVI_VA, TypeCVI_VA>, Enc_13983714, Requires<[HasV60T,UseHVX]> {
+let Inst{7-2} = 0b001000;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b00011111100;
+let hasNewValue = 1;
+let opNewValue = 0;
+let DecoderNamespace = "EXT_mmvec";
+let isCodeGenOnly = 1;
+}
+def V6_vgtub_and : HInst<
+(outs VecPredRegs:$Qx4),
+(ins VecPredRegs:$Qx4in, VectorRegs:$Vu32, VectorRegs:$Vv32),
+"$Qx4 &= vcmp.gt($Vu32.ub,$Vv32.ub)",
+CVI_VA, TypeCVI_VA>, Enc_7470998, Requires<[HasV60T,UseHVX]> {
+let Inst{7-2} = 0b001000;
+let Inst{13-13} = 0b1;
+let Inst{31-21} = 0b00011100100;
+let hasNewValue = 1;
+let opNewValue = 0;
+let DecoderNamespace = "EXT_mmvec";
+let Constraints = "$Qx4 = $Qx4in";
+}
+def V6_vgtub_and_128B : HInst<
+(outs VecPredRegs128B:$Qx4),
+(ins VecPredRegs128B:$Qx4in, VectorRegs128B:$Vu32, VectorRegs128B:$Vv32),
+"$Qx4 &= vcmp.gt($Vu32.ub,$Vv32.ub)",
+CVI_VA, TypeCVI_VA>, Enc_7470998, Requires<[HasV60T,UseHVX]> {
+let Inst{7-2} = 0b001000;
+let Inst{13-13} = 0b1;
+let Inst{31-21} = 0b00011100100;
+let hasNewValue = 1;
+let opNewValue = 0;
+let DecoderNamespace = "EXT_mmvec";
+let isCodeGenOnly = 1;
+let Constraints = "$Qx4 = $Qx4in";
+}
+def V6_vgtub_or : HInst<
+(outs VecPredRegs:$Qx4),
+(ins VecPredRegs:$Qx4in, VectorRegs:$Vu32, VectorRegs:$Vv32),
+"$Qx4 |= vcmp.gt($Vu32.ub,$Vv32.ub)",
+CVI_VA, TypeCVI_VA>, Enc_7470998, Requires<[HasV60T,UseHVX]> {
+let Inst{7-2} = 0b011000;
+let Inst{13-13} = 0b1;
+let Inst{31-21} = 0b00011100100;
+let hasNewValue = 1;
+let opNewValue = 0;
+let isAccumulator = 1;
+let DecoderNamespace = "EXT_mmvec";
+let Constraints = "$Qx4 = $Qx4in";
+}
+def V6_vgtub_or_128B : HInst<
+(outs VecPredRegs128B:$Qx4),
+(ins VecPredRegs128B:$Qx4in, VectorRegs128B:$Vu32, VectorRegs128B:$Vv32),
+"$Qx4 |= vcmp.gt($Vu32.ub,$Vv32.ub)",
+CVI_VA, TypeCVI_VA>, Enc_7470998, Requires<[HasV60T,UseHVX]> {
+let Inst{7-2} = 0b011000;
+let Inst{13-13} = 0b1;
+let Inst{31-21} = 0b00011100100;
+let hasNewValue = 1;
+let opNewValue = 0;
+let isAccumulator = 1;
+let DecoderNamespace = "EXT_mmvec";
+let isCodeGenOnly = 1;
+let Constraints = "$Qx4 = $Qx4in";
+}
+def V6_vgtub_xor : HInst<
+(outs VecPredRegs:$Qx4),
+(ins VecPredRegs:$Qx4in, VectorRegs:$Vu32, VectorRegs:$Vv32),
+"$Qx4 ^= vcmp.gt($Vu32.ub,$Vv32.ub)",
+CVI_VA, TypeCVI_VA>, Enc_7470998, Requires<[HasV60T,UseHVX]> {
+let Inst{7-2} = 0b101000;
+let Inst{13-13} = 0b1;
+let Inst{31-21} = 0b00011100100;
+let hasNewValue = 1;
+let opNewValue = 0;
+let DecoderNamespace = "EXT_mmvec";
+let Constraints = "$Qx4 = $Qx4in";
+}
+def V6_vgtub_xor_128B : HInst<
+(outs VecPredRegs128B:$Qx4),
+(ins VecPredRegs128B:$Qx4in, VectorRegs128B:$Vu32, VectorRegs128B:$Vv32),
+"$Qx4 ^= vcmp.gt($Vu32.ub,$Vv32.ub)",
+CVI_VA, TypeCVI_VA>, Enc_7470998, Requires<[HasV60T,UseHVX]> {
+let Inst{7-2} = 0b101000;
+let Inst{13-13} = 0b1;
+let Inst{31-21} = 0b00011100100;
+let hasNewValue = 1;
+let opNewValue = 0;
+let DecoderNamespace = "EXT_mmvec";
+let isCodeGenOnly = 1;
+let Constraints = "$Qx4 = $Qx4in";
+}
+def V6_vgtuh : HInst<
+(outs VecPredRegs:$Qd4),
+(ins VectorRegs:$Vu32, VectorRegs:$Vv32),
+"$Qd4 = vcmp.gt($Vu32.uh,$Vv32.uh)",
+CVI_VA, TypeCVI_VA>, Enc_13983714, Requires<[HasV60T,UseHVX]> {
+let Inst{7-2} = 0b001001;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b00011111100;
+let hasNewValue = 1;
+let opNewValue = 0;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vgtuh_128B : HInst<
+(outs VecPredRegs128B:$Qd4),
+(ins VectorRegs128B:$Vu32, VectorRegs128B:$Vv32),
+"$Qd4 = vcmp.gt($Vu32.uh,$Vv32.uh)",
+CVI_VA, TypeCVI_VA>, Enc_13983714, Requires<[HasV60T,UseHVX]> {
+let Inst{7-2} = 0b001001;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b00011111100;
+let hasNewValue = 1;
+let opNewValue = 0;
+let DecoderNamespace = "EXT_mmvec";
+let isCodeGenOnly = 1;
+}
+def V6_vgtuh_and : HInst<
+(outs VecPredRegs:$Qx4),
+(ins VecPredRegs:$Qx4in, VectorRegs:$Vu32, VectorRegs:$Vv32),
+"$Qx4 &= vcmp.gt($Vu32.uh,$Vv32.uh)",
+CVI_VA, TypeCVI_VA>, Enc_7470998, Requires<[HasV60T,UseHVX]> {
+let Inst{7-2} = 0b001001;
+let Inst{13-13} = 0b1;
+let Inst{31-21} = 0b00011100100;
+let hasNewValue = 1;
+let opNewValue = 0;
+let DecoderNamespace = "EXT_mmvec";
+let Constraints = "$Qx4 = $Qx4in";
+}
+def V6_vgtuh_and_128B : HInst<
+(outs VecPredRegs128B:$Qx4),
+(ins VecPredRegs128B:$Qx4in, VectorRegs128B:$Vu32, VectorRegs128B:$Vv32),
+"$Qx4 &= vcmp.gt($Vu32.uh,$Vv32.uh)",
+CVI_VA, TypeCVI_VA>, Enc_7470998, Requires<[HasV60T,UseHVX]> {
+let Inst{7-2} = 0b001001;
+let Inst{13-13} = 0b1;
+let Inst{31-21} = 0b00011100100;
+let hasNewValue = 1;
+let opNewValue = 0;
+let DecoderNamespace = "EXT_mmvec";
+let isCodeGenOnly = 1;
+let Constraints = "$Qx4 = $Qx4in";
+}
+def V6_vgtuh_or : HInst<
+(outs VecPredRegs:$Qx4),
+(ins VecPredRegs:$Qx4in, VectorRegs:$Vu32, VectorRegs:$Vv32),
+"$Qx4 |= vcmp.gt($Vu32.uh,$Vv32.uh)",
+CVI_VA, TypeCVI_VA>, Enc_7470998, Requires<[HasV60T,UseHVX]> {
+let Inst{7-2} = 0b011001;
+let Inst{13-13} = 0b1;
+let Inst{31-21} = 0b00011100100;
+let hasNewValue = 1;
+let opNewValue = 0;
+let isAccumulator = 1;
+let DecoderNamespace = "EXT_mmvec";
+let Constraints = "$Qx4 = $Qx4in";
+}
+def V6_vgtuh_or_128B : HInst<
+(outs VecPredRegs128B:$Qx4),
+(ins VecPredRegs128B:$Qx4in, VectorRegs128B:$Vu32, VectorRegs128B:$Vv32),
+"$Qx4 |= vcmp.gt($Vu32.uh,$Vv32.uh)",
+CVI_VA, TypeCVI_VA>, Enc_7470998, Requires<[HasV60T,UseHVX]> {
+let Inst{7-2} = 0b011001;
+let Inst{13-13} = 0b1;
+let Inst{31-21} = 0b00011100100;
+let hasNewValue = 1;
+let opNewValue = 0;
+let isAccumulator = 1;
+let DecoderNamespace = "EXT_mmvec";
+let isCodeGenOnly = 1;
+let Constraints = "$Qx4 = $Qx4in";
+}
+def V6_vgtuh_xor : HInst<
+(outs VecPredRegs:$Qx4),
+(ins VecPredRegs:$Qx4in, VectorRegs:$Vu32, VectorRegs:$Vv32),
+"$Qx4 ^= vcmp.gt($Vu32.uh,$Vv32.uh)",
+CVI_VA, TypeCVI_VA>, Enc_7470998, Requires<[HasV60T,UseHVX]> {
+let Inst{7-2} = 0b101001;
+let Inst{13-13} = 0b1;
+let Inst{31-21} = 0b00011100100;
+let hasNewValue = 1;
+let opNewValue = 0;
+let DecoderNamespace = "EXT_mmvec";
+let Constraints = "$Qx4 = $Qx4in";
+}
+def V6_vgtuh_xor_128B : HInst<
+(outs VecPredRegs128B:$Qx4),
+(ins VecPredRegs128B:$Qx4in, VectorRegs128B:$Vu32, VectorRegs128B:$Vv32),
+"$Qx4 ^= vcmp.gt($Vu32.uh,$Vv32.uh)",
+CVI_VA, TypeCVI_VA>, Enc_7470998, Requires<[HasV60T,UseHVX]> {
+let Inst{7-2} = 0b101001;
+let Inst{13-13} = 0b1;
+let Inst{31-21} = 0b00011100100;
+let hasNewValue = 1;
+let opNewValue = 0;
+let DecoderNamespace = "EXT_mmvec";
+let isCodeGenOnly = 1;
+let Constraints = "$Qx4 = $Qx4in";
+}
+def V6_vgtuw : HInst<
+(outs VecPredRegs:$Qd4),
+(ins VectorRegs:$Vu32, VectorRegs:$Vv32),
+"$Qd4 = vcmp.gt($Vu32.uw,$Vv32.uw)",
+CVI_VA, TypeCVI_VA>, Enc_13983714, Requires<[HasV60T,UseHVX]> {
+let Inst{7-2} = 0b001010;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b00011111100;
+let hasNewValue = 1;
+let opNewValue = 0;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vgtuw_128B : HInst<
+(outs VecPredRegs128B:$Qd4),
+(ins VectorRegs128B:$Vu32, VectorRegs128B:$Vv32),
+"$Qd4 = vcmp.gt($Vu32.uw,$Vv32.uw)",
+CVI_VA, TypeCVI_VA>, Enc_13983714, Requires<[HasV60T,UseHVX]> {
+let Inst{7-2} = 0b001010;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b00011111100;
+let hasNewValue = 1;
+let opNewValue = 0;
+let DecoderNamespace = "EXT_mmvec";
+let isCodeGenOnly = 1;
+}
+def V6_vgtuw_and : HInst<
+(outs VecPredRegs:$Qx4),
+(ins VecPredRegs:$Qx4in, VectorRegs:$Vu32, VectorRegs:$Vv32),
+"$Qx4 &= vcmp.gt($Vu32.uw,$Vv32.uw)",
+CVI_VA, TypeCVI_VA>, Enc_7470998, Requires<[HasV60T,UseHVX]> {
+let Inst{7-2} = 0b001010;
+let Inst{13-13} = 0b1;
+let Inst{31-21} = 0b00011100100;
+let hasNewValue = 1;
+let opNewValue = 0;
+let DecoderNamespace = "EXT_mmvec";
+let Constraints = "$Qx4 = $Qx4in";
+}
+def V6_vgtuw_and_128B : HInst<
+(outs VecPredRegs128B:$Qx4),
+(ins VecPredRegs128B:$Qx4in, VectorRegs128B:$Vu32, VectorRegs128B:$Vv32),
+"$Qx4 &= vcmp.gt($Vu32.uw,$Vv32.uw)",
+CVI_VA, TypeCVI_VA>, Enc_7470998, Requires<[HasV60T,UseHVX]> {
+let Inst{7-2} = 0b001010;
+let Inst{13-13} = 0b1;
+let Inst{31-21} = 0b00011100100;
+let hasNewValue = 1;
+let opNewValue = 0;
+let DecoderNamespace = "EXT_mmvec";
+let isCodeGenOnly = 1;
+let Constraints = "$Qx4 = $Qx4in";
+}
+def V6_vgtuw_or : HInst<
+(outs VecPredRegs:$Qx4),
+(ins VecPredRegs:$Qx4in, VectorRegs:$Vu32, VectorRegs:$Vv32),
+"$Qx4 |= vcmp.gt($Vu32.uw,$Vv32.uw)",
+CVI_VA, TypeCVI_VA>, Enc_7470998, Requires<[HasV60T,UseHVX]> {
+let Inst{7-2} = 0b011010;
+let Inst{13-13} = 0b1;
+let Inst{31-21} = 0b00011100100;
+let hasNewValue = 1;
+let opNewValue = 0;
+let isAccumulator = 1;
+let DecoderNamespace = "EXT_mmvec";
+let Constraints = "$Qx4 = $Qx4in";
+}
+def V6_vgtuw_or_128B : HInst<
+(outs VecPredRegs128B:$Qx4),
+(ins VecPredRegs128B:$Qx4in, VectorRegs128B:$Vu32, VectorRegs128B:$Vv32),
+"$Qx4 |= vcmp.gt($Vu32.uw,$Vv32.uw)",
+CVI_VA, TypeCVI_VA>, Enc_7470998, Requires<[HasV60T,UseHVX]> {
+let Inst{7-2} = 0b011010;
+let Inst{13-13} = 0b1;
+let Inst{31-21} = 0b00011100100;
+let hasNewValue = 1;
+let opNewValue = 0;
+let isAccumulator = 1;
+let DecoderNamespace = "EXT_mmvec";
+let isCodeGenOnly = 1;
+let Constraints = "$Qx4 = $Qx4in";
+}
+def V6_vgtuw_xor : HInst<
+(outs VecPredRegs:$Qx4),
+(ins VecPredRegs:$Qx4in, VectorRegs:$Vu32, VectorRegs:$Vv32),
+"$Qx4 ^= vcmp.gt($Vu32.uw,$Vv32.uw)",
+CVI_VA, TypeCVI_VA>, Enc_7470998, Requires<[HasV60T,UseHVX]> {
+let Inst{7-2} = 0b101010;
+let Inst{13-13} = 0b1;
+let Inst{31-21} = 0b00011100100;
+let hasNewValue = 1;
+let opNewValue = 0;
+let DecoderNamespace = "EXT_mmvec";
+let Constraints = "$Qx4 = $Qx4in";
+}
+def V6_vgtuw_xor_128B : HInst<
+(outs VecPredRegs128B:$Qx4),
+(ins VecPredRegs128B:$Qx4in, VectorRegs128B:$Vu32, VectorRegs128B:$Vv32),
+"$Qx4 ^= vcmp.gt($Vu32.uw,$Vv32.uw)",
+CVI_VA, TypeCVI_VA>, Enc_7470998, Requires<[HasV60T,UseHVX]> {
+let Inst{7-2} = 0b101010;
+let Inst{13-13} = 0b1;
+let Inst{31-21} = 0b00011100100;
+let hasNewValue = 1;
+let opNewValue = 0;
+let DecoderNamespace = "EXT_mmvec";
+let isCodeGenOnly = 1;
+let Constraints = "$Qx4 = $Qx4in";
+}
+def V6_vgtw : HInst<
+(outs VecPredRegs:$Qd4),
+(ins VectorRegs:$Vu32, VectorRegs:$Vv32),
+"$Qd4 = vcmp.gt($Vu32.w,$Vv32.w)",
+CVI_VA, TypeCVI_VA>, Enc_13983714, Requires<[HasV60T,UseHVX]> {
+let Inst{7-2} = 0b000110;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b00011111100;
+let hasNewValue = 1;
+let opNewValue = 0;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vgtw_128B : HInst<
+(outs VecPredRegs128B:$Qd4),
+(ins VectorRegs128B:$Vu32, VectorRegs128B:$Vv32),
+"$Qd4 = vcmp.gt($Vu32.w,$Vv32.w)",
+CVI_VA, TypeCVI_VA>, Enc_13983714, Requires<[HasV60T,UseHVX]> {
+let Inst{7-2} = 0b000110;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b00011111100;
+let hasNewValue = 1;
+let opNewValue = 0;
+let DecoderNamespace = "EXT_mmvec";
+let isCodeGenOnly = 1;
+}
+def V6_vgtw_and : HInst<
+(outs VecPredRegs:$Qx4),
+(ins VecPredRegs:$Qx4in, VectorRegs:$Vu32, VectorRegs:$Vv32),
+"$Qx4 &= vcmp.gt($Vu32.w,$Vv32.w)",
+CVI_VA, TypeCVI_VA>, Enc_7470998, Requires<[HasV60T,UseHVX]> {
+let Inst{7-2} = 0b000110;
+let Inst{13-13} = 0b1;
+let Inst{31-21} = 0b00011100100;
+let hasNewValue = 1;
+let opNewValue = 0;
+let DecoderNamespace = "EXT_mmvec";
+let Constraints = "$Qx4 = $Qx4in";
+}
+def V6_vgtw_and_128B : HInst<
+(outs VecPredRegs128B:$Qx4),
+(ins VecPredRegs128B:$Qx4in, VectorRegs128B:$Vu32, VectorRegs128B:$Vv32),
+"$Qx4 &= vcmp.gt($Vu32.w,$Vv32.w)",
+CVI_VA, TypeCVI_VA>, Enc_7470998, Requires<[HasV60T,UseHVX]> {
+let Inst{7-2} = 0b000110;
+let Inst{13-13} = 0b1;
+let Inst{31-21} = 0b00011100100;
+let hasNewValue = 1;
+let opNewValue = 0;
+let DecoderNamespace = "EXT_mmvec";
+let isCodeGenOnly = 1;
+let Constraints = "$Qx4 = $Qx4in";
+}
+def V6_vgtw_or : HInst<
+(outs VecPredRegs:$Qx4),
+(ins VecPredRegs:$Qx4in, VectorRegs:$Vu32, VectorRegs:$Vv32),
+"$Qx4 |= vcmp.gt($Vu32.w,$Vv32.w)",
+CVI_VA, TypeCVI_VA>, Enc_7470998, Requires<[HasV60T,UseHVX]> {
+let Inst{7-2} = 0b010110;
+let Inst{13-13} = 0b1;
+let Inst{31-21} = 0b00011100100;
+let hasNewValue = 1;
+let opNewValue = 0;
+let isAccumulator = 1;
+let DecoderNamespace = "EXT_mmvec";
+let Constraints = "$Qx4 = $Qx4in";
+}
+def V6_vgtw_or_128B : HInst<
+(outs VecPredRegs128B:$Qx4),
+(ins VecPredRegs128B:$Qx4in, VectorRegs128B:$Vu32, VectorRegs128B:$Vv32),
+"$Qx4 |= vcmp.gt($Vu32.w,$Vv32.w)",
+CVI_VA, TypeCVI_VA>, Enc_7470998, Requires<[HasV60T,UseHVX]> {
+let Inst{7-2} = 0b010110;
+let Inst{13-13} = 0b1;
+let Inst{31-21} = 0b00011100100;
+let hasNewValue = 1;
+let opNewValue = 0;
+let isAccumulator = 1;
+let DecoderNamespace = "EXT_mmvec";
+let isCodeGenOnly = 1;
+let Constraints = "$Qx4 = $Qx4in";
+}
+def V6_vgtw_xor : HInst<
+(outs VecPredRegs:$Qx4),
+(ins VecPredRegs:$Qx4in, VectorRegs:$Vu32, VectorRegs:$Vv32),
+"$Qx4 ^= vcmp.gt($Vu32.w,$Vv32.w)",
+CVI_VA, TypeCVI_VA>, Enc_7470998, Requires<[HasV60T,UseHVX]> {
+let Inst{7-2} = 0b100110;
+let Inst{13-13} = 0b1;
+let Inst{31-21} = 0b00011100100;
+let hasNewValue = 1;
+let opNewValue = 0;
+let DecoderNamespace = "EXT_mmvec";
+let Constraints = "$Qx4 = $Qx4in";
+}
+def V6_vgtw_xor_128B : HInst<
+(outs VecPredRegs128B:$Qx4),
+(ins VecPredRegs128B:$Qx4in, VectorRegs128B:$Vu32, VectorRegs128B:$Vv32),
+"$Qx4 ^= vcmp.gt($Vu32.w,$Vv32.w)",
+CVI_VA, TypeCVI_VA>, Enc_7470998, Requires<[HasV60T,UseHVX]> {
+let Inst{7-2} = 0b100110;
+let Inst{13-13} = 0b1;
+let Inst{31-21} = 0b00011100100;
+let hasNewValue = 1;
+let opNewValue = 0;
+let DecoderNamespace = "EXT_mmvec";
+let isCodeGenOnly = 1;
+let Constraints = "$Qx4 = $Qx4in";
+}
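+// Vector histogram instructions; vhistq is the vector-predicate-qualified form.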
+def V6_vhist : HInst<
+(outs),
+(ins),
+"vhist",
+CVI_HIST, TypeCVI_HIST>, Enc_0, Requires<[HasV60T,UseHVX]> {
+let Inst{13-0} = 0b10000010000000;
+let Inst{31-16} = 0b0001111000000000;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vhist_128B : HInst<
+(outs),
+(ins),
+"vhist",
+CVI_HIST, TypeCVI_HIST>, Enc_0, Requires<[HasV60T,UseHVX]> {
+let Inst{13-0} = 0b10000010000000;
+let Inst{31-16} = 0b0001111000000000;
+let DecoderNamespace = "EXT_mmvec";
+let isCodeGenOnly = 1;
+}
+def V6_vhistq : HInst<
+(outs),
+(ins VecPredRegs:$Qv4),
+"vhist($Qv4)",
+CVI_HIST, TypeCVI_HIST>, Enc_4109168, Requires<[HasV60T,UseHVX]> {
+let Inst{13-0} = 0b10000010000000;
+let Inst{21-16} = 0b000010;
+let Inst{31-24} = 0b00011110;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vhistq_128B : HInst<
+(outs),
+(ins VecPredRegs128B:$Qv4),
+"vhist($Qv4)",
+CVI_HIST, TypeCVI_HIST>, Enc_4109168, Requires<[HasV60T,UseHVX]> {
+let Inst{13-0} = 0b10000010000000;
+let Inst{21-16} = 0b000010;
+let Inst{31-24} = 0b00011110;
+let DecoderNamespace = "EXT_mmvec";
+let isCodeGenOnly = 1;
+}
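+// Word insert (vinsert), lane alignment (vlalign), and logical shift right (vlsr).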
+def V6_vinsertwr : HInst<
+(outs VectorRegs:$Vx32),
+(ins VectorRegs:$Vx32in, IntRegs:$Rt32),
+"$Vx32.w = vinsert($Rt32)",
+CVI_VX_LATE, TypeCVI_VX>, Enc_313333, Requires<[HasV60T,UseHVX]> {
+let Inst{13-5} = 0b100000001;
+let Inst{31-21} = 0b00011001101;
+let hasNewValue = 1;
+let opNewValue = 0;
+let DecoderNamespace = "EXT_mmvec";
+let Constraints = "$Vx32 = $Vx32in";
+}
+def V6_vinsertwr_128B : HInst<
+(outs VectorRegs128B:$Vx32),
+(ins VectorRegs128B:$Vx32in, IntRegs:$Rt32),
+"$Vx32.w = vinsert($Rt32)",
+CVI_VX_LATE, TypeCVI_VX>, Enc_313333, Requires<[HasV60T,UseHVX]> {
+let Inst{13-5} = 0b100000001;
+let Inst{31-21} = 0b00011001101;
+let hasNewValue = 1;
+let opNewValue = 0;
+let DecoderNamespace = "EXT_mmvec";
+let isCodeGenOnly = 1;
+let Constraints = "$Vx32 = $Vx32in";
+}
+def V6_vlalignb : HInst<
+(outs VectorRegs:$Vd32),
+(ins VectorRegs:$Vu32, VectorRegs:$Vv32, IntRegsLow8:$Rt8),
+"$Vd32 = vlalign($Vu32,$Vv32,$Rt8)",
+CVI_VP_LONG, TypeCVI_VP>, Enc_11083408, Requires<[HasV60T,UseHVX]> {
+let Inst{7-5} = 0b001;
+let Inst{13-13} = 0b0;
+let Inst{31-24} = 0b00011011;
+let hasNewValue = 1;
+let opNewValue = 0;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vlalignb_128B : HInst<
+(outs VectorRegs128B:$Vd32),
+(ins VectorRegs128B:$Vu32, VectorRegs128B:$Vv32, IntRegsLow8:$Rt8),
+"$Vd32 = vlalign($Vu32,$Vv32,$Rt8)",
+CVI_VP_LONG, TypeCVI_VP>, Enc_11083408, Requires<[HasV60T,UseHVX]> {
+let Inst{7-5} = 0b001;
+let Inst{13-13} = 0b0;
+let Inst{31-24} = 0b00011011;
+let hasNewValue = 1;
+let opNewValue = 0;
+let DecoderNamespace = "EXT_mmvec";
+let isCodeGenOnly = 1;
+}
+def V6_vlalignbi : HInst<
+(outs VectorRegs:$Vd32),
+(ins VectorRegs:$Vu32, VectorRegs:$Vv32, u3_0Imm:$Ii),
+"$Vd32 = vlalign($Vu32,$Vv32,#$Ii)",
+CVI_VP_LONG, TypeCVI_VP>, Enc_7171569, Requires<[HasV60T,UseHVX]> {
+let Inst{13-13} = 0b1;
+let Inst{31-21} = 0b00011110011;
+let hasNewValue = 1;
+let opNewValue = 0;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vlalignbi_128B : HInst<
+(outs VectorRegs128B:$Vd32),
+(ins VectorRegs128B:$Vu32, VectorRegs128B:$Vv32, u3_0Imm:$Ii),
+"$Vd32 = vlalign($Vu32,$Vv32,#$Ii)",
+CVI_VP_LONG, TypeCVI_VP>, Enc_7171569, Requires<[HasV60T,UseHVX]> {
+let Inst{13-13} = 0b1;
+let Inst{31-21} = 0b00011110011;
+let hasNewValue = 1;
+let opNewValue = 0;
+let DecoderNamespace = "EXT_mmvec";
+let isCodeGenOnly = 1;
+}
+def V6_vlsrb : HInst<
+(outs VectorRegs:$Vd32),
+(ins VectorRegs:$Vu32, IntRegs:$Rt32),
+"$Vd32.ub = vlsr($Vu32.ub,$Rt32)",
+CVI_VS, TypeCVI_VS>, Enc_16214129, Requires<[HasV62T,UseHVX]> {
+let Inst{7-5} = 0b011;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b00011001100;
+let hasNewValue = 1;
+let opNewValue = 0;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vlsrb_128B : HInst<
+(outs VectorRegs128B:$Vd32),
+(ins VectorRegs128B:$Vu32, IntRegs:$Rt32),
+"$Vd32.ub = vlsr($Vu32.ub,$Rt32)",
+CVI_VS, TypeCVI_VS>, Enc_16214129, Requires<[HasV62T,UseHVX]> {
+let Inst{7-5} = 0b011;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b00011001100;
+let hasNewValue = 1;
+let opNewValue = 0;
+let DecoderNamespace = "EXT_mmvec";
+let isCodeGenOnly = 1;
+}
+def V6_vlsrh : HInst<
+(outs VectorRegs:$Vd32),
+(ins VectorRegs:$Vu32, IntRegs:$Rt32),
+"$Vd32.uh = vlsr($Vu32.uh,$Rt32)",
+CVI_VS, TypeCVI_VS>, Enc_16214129, Requires<[HasV60T,UseHVX]> {
+let Inst{7-5} = 0b010;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b00011001100;
+let hasNewValue = 1;
+let opNewValue = 0;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vlsrh_128B : HInst<
+(outs VectorRegs128B:$Vd32),
+(ins VectorRegs128B:$Vu32, IntRegs:$Rt32),
+"$Vd32.uh = vlsr($Vu32.uh,$Rt32)",
+CVI_VS, TypeCVI_VS>, Enc_16214129, Requires<[HasV60T,UseHVX]> {
+let Inst{7-5} = 0b010;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b00011001100;
+let hasNewValue = 1;
+let opNewValue = 0;
+let DecoderNamespace = "EXT_mmvec";
+let isCodeGenOnly = 1;
+}
+def V6_vlsrh_alt : HInst<
+(outs VectorRegs:$Vd32),
+(ins VectorRegs:$Vu32, IntRegs:$Rt32),
+"$Vd32 = vlsrh($Vu32,$Rt32)",
+PSEUDO, TypeMAPPING>, Requires<[HasV60T,UseHVX]> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vlsrh_alt_128B : HInst<
+(outs VectorRegs128B:$Vd32),
+(ins VectorRegs128B:$Vu32, IntRegs:$Rt32),
+"$Vd32 = vlsrh($Vu32,$Rt32)",
+PSEUDO, TypeMAPPING>, Requires<[HasV60T,UseHVX]> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vlsrhv : HInst<
+(outs VectorRegs:$Vd32),
+(ins VectorRegs:$Vu32, VectorRegs:$Vv32),
+"$Vd32.h = vlsr($Vu32.h,$Vv32.h)",
+CVI_VS, TypeCVI_VS>, Enc_6223403, Requires<[HasV60T,UseHVX]> {
+let Inst{7-5} = 0b010;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b00011111101;
+let hasNewValue = 1;
+let opNewValue = 0;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vlsrhv_128B : HInst<
+(outs VectorRegs128B:$Vd32),
+(ins VectorRegs128B:$Vu32, VectorRegs128B:$Vv32),
+"$Vd32.h = vlsr($Vu32.h,$Vv32.h)",
+CVI_VS, TypeCVI_VS>, Enc_6223403, Requires<[HasV60T,UseHVX]> {
+let Inst{7-5} = 0b010;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b00011111101;
+let hasNewValue = 1;
+let opNewValue = 0;
+let DecoderNamespace = "EXT_mmvec";
+let isCodeGenOnly = 1;
+}
+def V6_vlsrhv_alt : HInst<
+(outs VectorRegs:$Vd32),
+(ins VectorRegs:$Vu32, VectorRegs:$Vv32),
+"$Vd32 = vlsrh($Vu32,$Vv32)",
+PSEUDO, TypeMAPPING>, Requires<[HasV60T,UseHVX]> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vlsrhv_alt_128B : HInst<
+(outs VectorRegs128B:$Vd32),
+(ins VectorRegs128B:$Vu32, VectorRegs128B:$Vv32),
+"$Vd32 = vlsrh($Vu32,$Vv32)",
+PSEUDO, TypeMAPPING>, Requires<[HasV60T,UseHVX]> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vlsrw : HInst<
+(outs VectorRegs:$Vd32),
+(ins VectorRegs:$Vu32, IntRegs:$Rt32),
+"$Vd32.uw = vlsr($Vu32.uw,$Rt32)",
+CVI_VS, TypeCVI_VS>, Enc_16214129, Requires<[HasV60T,UseHVX]> {
+let Inst{7-5} = 0b001;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b00011001100;
+let hasNewValue = 1;
+let opNewValue = 0;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vlsrw_128B : HInst<
+(outs VectorRegs128B:$Vd32),
+(ins VectorRegs128B:$Vu32, IntRegs:$Rt32),
+"$Vd32.uw = vlsr($Vu32.uw,$Rt32)",
+CVI_VS, TypeCVI_VS>, Enc_16214129, Requires<[HasV60T,UseHVX]> {
+let Inst{7-5} = 0b001;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b00011001100;
+let hasNewValue = 1;
+let opNewValue = 0;
+let DecoderNamespace = "EXT_mmvec";
+let isCodeGenOnly = 1;
+}
+def V6_vlsrw_alt : HInst<
+(outs VectorRegs:$Vd32),
+(ins VectorRegs:$Vu32, IntRegs:$Rt32),
+"$Vd32 = vlsrw($Vu32,$Rt32)",
+PSEUDO, TypeMAPPING>, Requires<[HasV60T,UseHVX]> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vlsrw_alt_128B : HInst<
+(outs VectorRegs128B:$Vd32),
+(ins VectorRegs128B:$Vu32, IntRegs:$Rt32),
+"$Vd32 = vlsrw($Vu32,$Rt32)",
+PSEUDO, TypeMAPPING>, Requires<[HasV60T,UseHVX]> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vlsrwv : HInst<
+(outs VectorRegs:$Vd32),
+(ins VectorRegs:$Vu32, VectorRegs:$Vv32),
+"$Vd32.w = vlsr($Vu32.w,$Vv32.w)",
+CVI_VS, TypeCVI_VS>, Enc_6223403, Requires<[HasV60T,UseHVX]> {
+let Inst{7-5} = 0b001;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b00011111101;
+let hasNewValue = 1;
+let opNewValue = 0;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vlsrwv_128B : HInst<
+(outs VectorRegs128B:$Vd32),
+(ins VectorRegs128B:$Vu32, VectorRegs128B:$Vv32),
+"$Vd32.w = vlsr($Vu32.w,$Vv32.w)",
+CVI_VS, TypeCVI_VS>, Enc_6223403, Requires<[HasV60T,UseHVX]> {
+let Inst{7-5} = 0b001;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b00011111101;
+let hasNewValue = 1;
+let opNewValue = 0;
+let DecoderNamespace = "EXT_mmvec";
+let isCodeGenOnly = 1;
+}
+def V6_vlsrwv_alt : HInst<
+(outs VectorRegs:$Vd32),
+(ins VectorRegs:$Vu32, VectorRegs:$Vv32),
+"$Vd32 = vlsrw($Vu32,$Vv32)",
+PSEUDO, TypeMAPPING>, Requires<[HasV60T,UseHVX]> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vlsrwv_alt_128B : HInst<
+(outs VectorRegs128B:$Vd32),
+(ins VectorRegs128B:$Vu32, VectorRegs128B:$Vv32),
+"$Vd32 = vlsrw($Vu32,$Vv32)",
+PSEUDO, TypeMAPPING>, Requires<[HasV60T,UseHVX]> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+}
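+// Vector lookup-table instructions: vlut32 (byte) and vlut16 (byte index into a
+// halfword table), with or-accumulating (oracc, oracci) and :nomatch variants.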
+def V6_vlutvvb : HInst<
+(outs VectorRegs:$Vd32),
+(ins VectorRegs:$Vu32, VectorRegs:$Vv32, IntRegsLow8:$Rt8),
+"$Vd32.b = vlut32($Vu32.b,$Vv32.b,$Rt8)",
+CVI_VP_LONG, TypeCVI_VP>, Enc_11083408, Requires<[HasV60T,UseHVX]> {
+let Inst{7-5} = 0b001;
+let Inst{13-13} = 0b1;
+let Inst{31-24} = 0b00011011;
+let hasNewValue = 1;
+let opNewValue = 0;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vlutvvb_128B : HInst<
+(outs VectorRegs128B:$Vd32),
+(ins VectorRegs128B:$Vu32, VectorRegs128B:$Vv32, IntRegsLow8:$Rt8),
+"$Vd32.b = vlut32($Vu32.b,$Vv32.b,$Rt8)",
+CVI_VP_LONG, TypeCVI_VP>, Enc_11083408, Requires<[HasV60T,UseHVX]> {
+let Inst{7-5} = 0b001;
+let Inst{13-13} = 0b1;
+let Inst{31-24} = 0b00011011;
+let hasNewValue = 1;
+let opNewValue = 0;
+let DecoderNamespace = "EXT_mmvec";
+let isCodeGenOnly = 1;
+}
+def V6_vlutvvb_nm : HInst<
+(outs VectorRegs:$Vd32),
+(ins VectorRegs:$Vu32, VectorRegs:$Vv32, IntRegsLow8:$Rt8),
+"$Vd32.b = vlut32($Vu32.b,$Vv32.b,$Rt8):nomatch",
+CVI_VP_LONG, TypeCVI_VP>, Enc_11083408, Requires<[HasV62T,UseHVX]> {
+let Inst{7-5} = 0b011;
+let Inst{13-13} = 0b0;
+let Inst{31-24} = 0b00011000;
+let hasNewValue = 1;
+let opNewValue = 0;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vlutvvb_nm_128B : HInst<
+(outs VectorRegs128B:$Vd32),
+(ins VectorRegs128B:$Vu32, VectorRegs128B:$Vv32, IntRegsLow8:$Rt8),
+"$Vd32.b = vlut32($Vu32.b,$Vv32.b,$Rt8):nomatch",
+CVI_VP_LONG, TypeCVI_VP>, Enc_11083408, Requires<[HasV62T,UseHVX]> {
+let Inst{7-5} = 0b011;
+let Inst{13-13} = 0b0;
+let Inst{31-24} = 0b00011000;
+let hasNewValue = 1;
+let opNewValue = 0;
+let DecoderNamespace = "EXT_mmvec";
+let isCodeGenOnly = 1;
+}
+def V6_vlutvvb_oracc : HInst<
+(outs VectorRegs:$Vx32),
+(ins VectorRegs:$Vx32in, VectorRegs:$Vu32, VectorRegs:$Vv32, IntRegsLow8:$Rt8),
+"$Vx32.b |= vlut32($Vu32.b,$Vv32.b,$Rt8)",
+CVI_VP_VS_LONG, TypeCVI_VP_VS>, Enc_8877260, Requires<[HasV60T,UseHVX]> {
+let Inst{7-5} = 0b101;
+let Inst{13-13} = 0b1;
+let Inst{31-24} = 0b00011011;
+let hasNewValue = 1;
+let opNewValue = 0;
+let isAccumulator = 1;
+let DecoderNamespace = "EXT_mmvec";
+let Constraints = "$Vx32 = $Vx32in";
+}
+def V6_vlutvvb_oracc_128B : HInst<
+(outs VectorRegs128B:$Vx32),
+(ins VectorRegs128B:$Vx32in, VectorRegs128B:$Vu32, VectorRegs128B:$Vv32, IntRegsLow8:$Rt8),
+"$Vx32.b |= vlut32($Vu32.b,$Vv32.b,$Rt8)",
+CVI_VP_VS_LONG, TypeCVI_VP_VS>, Enc_8877260, Requires<[HasV60T,UseHVX]> {
+let Inst{7-5} = 0b101;
+let Inst{13-13} = 0b1;
+let Inst{31-24} = 0b00011011;
+let hasNewValue = 1;
+let opNewValue = 0;
+let isAccumulator = 1;
+let DecoderNamespace = "EXT_mmvec";
+let isCodeGenOnly = 1;
+let Constraints = "$Vx32 = $Vx32in";
+}
+def V6_vlutvvb_oracci : HInst<
+(outs VectorRegs:$Vx32),
+(ins VectorRegs:$Vx32in, VectorRegs:$Vu32, VectorRegs:$Vv32, u3_0Imm:$Ii),
+"$Vx32.b |= vlut32($Vu32.b,$Vv32.b,#$Ii)",
+CVI_VP_VS_LONG, TypeCVI_VP_VS>, Enc_8280533, Requires<[HasV62T,UseHVX]> {
+let Inst{13-13} = 0b1;
+let Inst{31-21} = 0b00011100110;
+let hasNewValue = 1;
+let opNewValue = 0;
+let isAccumulator = 1;
+let DecoderNamespace = "EXT_mmvec";
+let Constraints = "$Vx32 = $Vx32in";
+}
+def V6_vlutvvb_oracci_128B : HInst<
+(outs VectorRegs128B:$Vx32),
+(ins VectorRegs128B:$Vx32in, VectorRegs128B:$Vu32, VectorRegs128B:$Vv32, u3_0Imm:$Ii),
+"$Vx32.b |= vlut32($Vu32.b,$Vv32.b,#$Ii)",
+CVI_VP_VS_LONG, TypeCVI_VP_VS>, Enc_8280533, Requires<[HasV62T,UseHVX]> {
+let Inst{13-13} = 0b1;
+let Inst{31-21} = 0b00011100110;
+let hasNewValue = 1;
+let opNewValue = 0;
+let isAccumulator = 1;
+let DecoderNamespace = "EXT_mmvec";
+let isCodeGenOnly = 1;
+let Constraints = "$Vx32 = $Vx32in";
+}
+def V6_vlutvvbi : HInst<
+(outs VectorRegs:$Vd32),
+(ins VectorRegs:$Vu32, VectorRegs:$Vv32, u3_0Imm:$Ii),
+"$Vd32.b = vlut32($Vu32.b,$Vv32.b,#$Ii)",
+CVI_VP_LONG, TypeCVI_VP>, Enc_7171569, Requires<[HasV62T,UseHVX]> {
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b00011110001;
+let hasNewValue = 1;
+let opNewValue = 0;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vlutvvbi_128B : HInst<
+(outs VectorRegs128B:$Vd32),
+(ins VectorRegs128B:$Vu32, VectorRegs128B:$Vv32, u3_0Imm:$Ii),
+"$Vd32.b = vlut32($Vu32.b,$Vv32.b,#$Ii)",
+CVI_VP_LONG, TypeCVI_VP>, Enc_7171569, Requires<[HasV62T,UseHVX]> {
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b00011110001;
+let hasNewValue = 1;
+let opNewValue = 0;
+let DecoderNamespace = "EXT_mmvec";
+let isCodeGenOnly = 1;
+}
+def V6_vlutvwh : HInst<
+(outs VecDblRegs:$Vdd32),
+(ins VectorRegs:$Vu32, VectorRegs:$Vv32, IntRegsLow8:$Rt8),
+"$Vdd32.h = vlut16($Vu32.b,$Vv32.h,$Rt8)",
+CVI_VP_VS_LONG, TypeCVI_VP_VS>, Enc_14767681, Requires<[HasV60T,UseHVX]> {
+let Inst{7-5} = 0b110;
+let Inst{13-13} = 0b1;
+let Inst{31-24} = 0b00011011;
+let hasNewValue = 1;
+let opNewValue = 0;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vlutvwh_128B : HInst<
+(outs VecDblRegs128B:$Vdd32),
+(ins VectorRegs128B:$Vu32, VectorRegs128B:$Vv32, IntRegsLow8:$Rt8),
+"$Vdd32.h = vlut16($Vu32.b,$Vv32.h,$Rt8)",
+CVI_VP_VS_LONG, TypeCVI_VP_VS>, Enc_14767681, Requires<[HasV60T,UseHVX]> {
+let Inst{7-5} = 0b110;
+let Inst{13-13} = 0b1;
+let Inst{31-24} = 0b00011011;
+let hasNewValue = 1;
+let opNewValue = 0;
+let DecoderNamespace = "EXT_mmvec";
+let isCodeGenOnly = 1;
+}
+def V6_vlutvwh_nm : HInst<
+(outs VecDblRegs:$Vdd32),
+(ins VectorRegs:$Vu32, VectorRegs:$Vv32, IntRegsLow8:$Rt8),
+"$Vdd32.h = vlut16($Vu32.b,$Vv32.h,$Rt8):nomatch",
+CVI_VP_VS_LONG, TypeCVI_VP_VS>, Enc_14767681, Requires<[HasV62T,UseHVX]> {
+let Inst{7-5} = 0b100;
+let Inst{13-13} = 0b0;
+let Inst{31-24} = 0b00011000;
+let hasNewValue = 1;
+let opNewValue = 0;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vlutvwh_nm_128B : HInst<
+(outs VecDblRegs128B:$Vdd32),
+(ins VectorRegs128B:$Vu32, VectorRegs128B:$Vv32, IntRegsLow8:$Rt8),
+"$Vdd32.h = vlut16($Vu32.b,$Vv32.h,$Rt8):nomatch",
+CVI_VP_VS_LONG, TypeCVI_VP_VS>, Enc_14767681, Requires<[HasV62T,UseHVX]> {
+let Inst{7-5} = 0b100;
+let Inst{13-13} = 0b0;
+let Inst{31-24} = 0b00011000;
+let hasNewValue = 1;
+let opNewValue = 0;
+let DecoderNamespace = "EXT_mmvec";
+let isCodeGenOnly = 1;
+}
+def V6_vlutvwh_oracc : HInst<
+(outs VecDblRegs:$Vxx32),
+(ins VecDblRegs:$Vxx32in, VectorRegs:$Vu32, VectorRegs:$Vv32, IntRegsLow8:$Rt8),
+"$Vxx32.h |= vlut16($Vu32.b,$Vv32.h,$Rt8)",
+CVI_VP_VS_LONG, TypeCVI_VP_VS>, Enc_16213761, Requires<[HasV60T,UseHVX]> {
+let Inst{7-5} = 0b111;
+let Inst{13-13} = 0b1;
+let Inst{31-24} = 0b00011011;
+let hasNewValue = 1;
+let opNewValue = 0;
+let isAccumulator = 1;
+let DecoderNamespace = "EXT_mmvec";
+let Constraints = "$Vxx32 = $Vxx32in";
+}
+def V6_vlutvwh_oracc_128B : HInst<
+(outs VecDblRegs128B:$Vxx32),
+(ins VecDblRegs128B:$Vxx32in, VectorRegs128B:$Vu32, VectorRegs128B:$Vv32, IntRegsLow8:$Rt8),
+"$Vxx32.h |= vlut16($Vu32.b,$Vv32.h,$Rt8)",
+CVI_VP_VS_LONG, TypeCVI_VP_VS>, Enc_16213761, Requires<[HasV60T,UseHVX]> {
+let Inst{7-5} = 0b111;
+let Inst{13-13} = 0b1;
+let Inst{31-24} = 0b00011011;
+let hasNewValue = 1;
+let opNewValue = 0;
+let isAccumulator = 1;
+let DecoderNamespace = "EXT_mmvec";
+let isCodeGenOnly = 1;
+let Constraints = "$Vxx32 = $Vxx32in";
+}
+def V6_vlutvwh_oracci : HInst<
+(outs VecDblRegs:$Vxx32),
+(ins VecDblRegs:$Vxx32in, VectorRegs:$Vu32, VectorRegs:$Vv32, u3_0Imm:$Ii),
+"$Vxx32.h |= vlut16($Vu32.b,$Vv32.h,#$Ii)",
+CVI_VP_VS_LONG, TypeCVI_VP_VS>, Enc_3457570, Requires<[HasV62T,UseHVX]> {
+let Inst{13-13} = 0b1;
+let Inst{31-21} = 0b00011100111;
+let hasNewValue = 1;
+let opNewValue = 0;
+let isAccumulator = 1;
+let DecoderNamespace = "EXT_mmvec";
+let Constraints = "$Vxx32 = $Vxx32in";
+}
+def V6_vlutvwh_oracci_128B : HInst<
+(outs VecDblRegs128B:$Vxx32),
+(ins VecDblRegs128B:$Vxx32in, VectorRegs128B:$Vu32, VectorRegs128B:$Vv32, u3_0Imm:$Ii),
+"$Vxx32.h |= vlut16($Vu32.b,$Vv32.h,#$Ii)",
+CVI_VP_VS_LONG, TypeCVI_VP_VS>, Enc_3457570, Requires<[HasV62T,UseHVX]> {
+let Inst{13-13} = 0b1;
+let Inst{31-21} = 0b00011100111;
+let hasNewValue = 1;
+let opNewValue = 0;
+let isAccumulator = 1;
+let DecoderNamespace = "EXT_mmvec";
+let isCodeGenOnly = 1;
+let Constraints = "$Vxx32 = $Vxx32in";
+}
+def V6_vlutvwhi : HInst<
+(outs VecDblRegs:$Vdd32),
+(ins VectorRegs:$Vu32, VectorRegs:$Vv32, u3_0Imm:$Ii),
+"$Vdd32.h = vlut16($Vu32.b,$Vv32.h,#$Ii)",
+CVI_VP_VS_LONG, TypeCVI_VP_VS>, Enc_13261538, Requires<[HasV62T,UseHVX]> {
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b00011110011;
+let hasNewValue = 1;
+let opNewValue = 0;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vlutvwhi_128B : HInst<
+(outs VecDblRegs128B:$Vdd32),
+(ins VectorRegs128B:$Vu32, VectorRegs128B:$Vv32, u3_0Imm:$Ii),
+"$Vdd32.h = vlut16($Vu32.b,$Vv32.h,#$Ii)",
+CVI_VP_VS_LONG, TypeCVI_VP_VS>, Enc_13261538, Requires<[HasV62T,UseHVX]> {
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b00011110011;
+let hasNewValue = 1;
+let opNewValue = 0;
+let DecoderNamespace = "EXT_mmvec";
+let isCodeGenOnly = 1;
+}
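+// Element-wise vector max/min instructions.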
+def V6_vmaxb : HInst<
+(outs VectorRegs:$Vd32),
+(ins VectorRegs:$Vu32, VectorRegs:$Vv32),
+"$Vd32.b = vmax($Vu32.b,$Vv32.b)",
+CVI_VA, TypeCVI_VA>, Enc_6223403, Requires<[HasV62T,UseHVX]> {
+let Inst{7-5} = 0b101;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b00011111001;
+let hasNewValue = 1;
+let opNewValue = 0;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vmaxb_128B : HInst<
+(outs VectorRegs128B:$Vd32),
+(ins VectorRegs128B:$Vu32, VectorRegs128B:$Vv32),
+"$Vd32.b = vmax($Vu32.b,$Vv32.b)",
+CVI_VA, TypeCVI_VA>, Enc_6223403, Requires<[HasV62T,UseHVX]> {
+let Inst{7-5} = 0b101;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b00011111001;
+let hasNewValue = 1;
+let opNewValue = 0;
+let DecoderNamespace = "EXT_mmvec";
+let isCodeGenOnly = 1;
+}
+def V6_vmaxb_alt : HInst<
+(outs VectorRegs:$Vd32),
+(ins VectorRegs:$Vu32, VectorRegs:$Vv32),
+"$Vd32 = vmaxb($Vu32,$Vv32)",
+PSEUDO, TypeMAPPING>, Requires<[HasV62T,UseHVX]> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vmaxb_alt_128B : HInst<
+(outs VectorRegs128B:$Vd32),
+(ins VectorRegs128B:$Vu32, VectorRegs128B:$Vv32),
+"$Vd32 = vmaxb($Vu32,$Vv32)",
+PSEUDO, TypeMAPPING>, Requires<[HasV62T,UseHVX]> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vmaxh : HInst<
+(outs VectorRegs:$Vd32),
+(ins VectorRegs:$Vu32, VectorRegs:$Vv32),
+"$Vd32.h = vmax($Vu32.h,$Vv32.h)",
+CVI_VA, TypeCVI_VA>, Enc_6223403, Requires<[HasV60T,UseHVX]> {
+let Inst{7-5} = 0b111;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b00011111000;
+let hasNewValue = 1;
+let opNewValue = 0;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vmaxh_128B : HInst<
+(outs VectorRegs128B:$Vd32),
+(ins VectorRegs128B:$Vu32, VectorRegs128B:$Vv32),
+"$Vd32.h = vmax($Vu32.h,$Vv32.h)",
+CVI_VA, TypeCVI_VA>, Enc_6223403, Requires<[HasV60T,UseHVX]> {
+let Inst{7-5} = 0b111;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b00011111000;
+let hasNewValue = 1;
+let opNewValue = 0;
+let DecoderNamespace = "EXT_mmvec";
+let isCodeGenOnly = 1;
+}
+def V6_vmaxh_alt : HInst<
+(outs VectorRegs:$Vd32),
+(ins VectorRegs:$Vu32, VectorRegs:$Vv32),
+"$Vd32 = vmaxh($Vu32,$Vv32)",
+PSEUDO, TypeMAPPING>, Requires<[HasV60T,UseHVX]> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vmaxh_alt_128B : HInst<
+(outs VectorRegs128B:$Vd32),
+(ins VectorRegs128B:$Vu32, VectorRegs128B:$Vv32),
+"$Vd32 = vmaxh($Vu32,$Vv32)",
+PSEUDO, TypeMAPPING>, Requires<[HasV60T,UseHVX]> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vmaxub : HInst<
+(outs VectorRegs:$Vd32),
+(ins VectorRegs:$Vu32, VectorRegs:$Vv32),
+"$Vd32.ub = vmax($Vu32.ub,$Vv32.ub)",
+CVI_VA, TypeCVI_VA>, Enc_6223403, Requires<[HasV60T,UseHVX]> {
+let Inst{7-5} = 0b101;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b00011111000;
+let hasNewValue = 1;
+let opNewValue = 0;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vmaxub_128B : HInst<
+(outs VectorRegs128B:$Vd32),
+(ins VectorRegs128B:$Vu32, VectorRegs128B:$Vv32),
+"$Vd32.ub = vmax($Vu32.ub,$Vv32.ub)",
+CVI_VA, TypeCVI_VA>, Enc_6223403, Requires<[HasV60T,UseHVX]> {
+let Inst{7-5} = 0b101;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b00011111000;
+let hasNewValue = 1;
+let opNewValue = 0;
+let DecoderNamespace = "EXT_mmvec";
+let isCodeGenOnly = 1;
+}
+def V6_vmaxub_alt : HInst<
+(outs VectorRegs:$Vd32),
+(ins VectorRegs:$Vu32, VectorRegs:$Vv32),
+"$Vd32 = vmaxub($Vu32,$Vv32)",
+PSEUDO, TypeMAPPING>, Requires<[HasV60T,UseHVX]> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vmaxub_alt_128B : HInst<
+(outs VectorRegs128B:$Vd32),
+(ins VectorRegs128B:$Vu32, VectorRegs128B:$Vv32),
+"$Vd32 = vmaxub($Vu32,$Vv32)",
+PSEUDO, TypeMAPPING>, Requires<[HasV60T,UseHVX]> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vmaxuh : HInst<
+(outs VectorRegs:$Vd32),
+(ins VectorRegs:$Vu32, VectorRegs:$Vv32),
+"$Vd32.uh = vmax($Vu32.uh,$Vv32.uh)",
+CVI_VA, TypeCVI_VA>, Enc_6223403, Requires<[HasV60T,UseHVX]> {
+let Inst{7-5} = 0b110;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b00011111000;
+let hasNewValue = 1;
+let opNewValue = 0;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vmaxuh_128B : HInst<
+(outs VectorRegs128B:$Vd32),
+(ins VectorRegs128B:$Vu32, VectorRegs128B:$Vv32),
+"$Vd32.uh = vmax($Vu32.uh,$Vv32.uh)",
+CVI_VA, TypeCVI_VA>, Enc_6223403, Requires<[HasV60T,UseHVX]> {
+let Inst{7-5} = 0b110;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b00011111000;
+let hasNewValue = 1;
+let opNewValue = 0;
+let DecoderNamespace = "EXT_mmvec";
+let isCodeGenOnly = 1;
+}
+def V6_vmaxuh_alt : HInst<
+(outs VectorRegs:$Vd32),
+(ins VectorRegs:$Vu32, VectorRegs:$Vv32),
+"$Vd32 = vmaxuh($Vu32,$Vv32)",
+PSEUDO, TypeMAPPING>, Requires<[HasV60T,UseHVX]> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vmaxuh_alt_128B : HInst<
+(outs VectorRegs128B:$Vd32),
+(ins VectorRegs128B:$Vu32, VectorRegs128B:$Vv32),
+"$Vd32 = vmaxuh($Vu32,$Vv32)",
+PSEUDO, TypeMAPPING>, Requires<[HasV60T,UseHVX]> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vmaxw : HInst<
+(outs VectorRegs:$Vd32),
+(ins VectorRegs:$Vu32, VectorRegs:$Vv32),
+"$Vd32.w = vmax($Vu32.w,$Vv32.w)",
+CVI_VA, TypeCVI_VA>, Enc_6223403, Requires<[HasV60T,UseHVX]> {
+let Inst{7-5} = 0b000;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b00011111001;
+let hasNewValue = 1;
+let opNewValue = 0;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vmaxw_128B : HInst<
+(outs VectorRegs128B:$Vd32),
+(ins VectorRegs128B:$Vu32, VectorRegs128B:$Vv32),
+"$Vd32.w = vmax($Vu32.w,$Vv32.w)",
+CVI_VA, TypeCVI_VA>, Enc_6223403, Requires<[HasV60T,UseHVX]> {
+let Inst{7-5} = 0b000;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b00011111001;
+let hasNewValue = 1;
+let opNewValue = 0;
+let DecoderNamespace = "EXT_mmvec";
+let isCodeGenOnly = 1;
+}
+def V6_vmaxw_alt : HInst<
+(outs VectorRegs:$Vd32),
+(ins VectorRegs:$Vu32, VectorRegs:$Vv32),
+"$Vd32 = vmaxw($Vu32,$Vv32)",
+PSEUDO, TypeMAPPING>, Requires<[HasV60T,UseHVX]> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vmaxw_alt_128B : HInst<
+(outs VectorRegs128B:$Vd32),
+(ins VectorRegs128B:$Vu32, VectorRegs128B:$Vv32),
+"$Vd32 = vmaxw($Vu32,$Vv32)",
+PSEUDO, TypeMAPPING>, Requires<[HasV60T,UseHVX]> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vminb : HInst<
+(outs VectorRegs:$Vd32),
+(ins VectorRegs:$Vu32, VectorRegs:$Vv32),
+"$Vd32.b = vmin($Vu32.b,$Vv32.b)",
+CVI_VA, TypeCVI_VA>, Enc_6223403, Requires<[HasV62T,UseHVX]> {
+let Inst{7-5} = 0b100;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b00011111001;
+let hasNewValue = 1;
+let opNewValue = 0;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vminb_128B : HInst<
+(outs VectorRegs128B:$Vd32),
+(ins VectorRegs128B:$Vu32, VectorRegs128B:$Vv32),
+"$Vd32.b = vmin($Vu32.b,$Vv32.b)",
+CVI_VA, TypeCVI_VA>, Enc_6223403, Requires<[HasV62T,UseHVX]> {
+let Inst{7-5} = 0b100;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b00011111001;
+let hasNewValue = 1;
+let opNewValue = 0;
+let DecoderNamespace = "EXT_mmvec";
+let isCodeGenOnly = 1;
+}
+def V6_vminb_alt : HInst<
+(outs VectorRegs:$Vd32),
+(ins VectorRegs:$Vu32, VectorRegs:$Vv32),
+"$Vd32 = vminb($Vu32,$Vv32)",
+PSEUDO, TypeMAPPING>, Requires<[HasV62T,UseHVX]> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vminb_alt_128B : HInst<
+(outs VectorRegs128B:$Vd32),
+(ins VectorRegs128B:$Vu32, VectorRegs128B:$Vv32),
+"$Vd32 = vminb($Vu32,$Vv32)",
+PSEUDO, TypeMAPPING>, Requires<[HasV62T,UseHVX]> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vminh : HInst<
+(outs VectorRegs:$Vd32),
+(ins VectorRegs:$Vu32, VectorRegs:$Vv32),
+"$Vd32.h = vmin($Vu32.h,$Vv32.h)",
+CVI_VA, TypeCVI_VA>, Enc_6223403, Requires<[HasV60T,UseHVX]> {
+let Inst{7-5} = 0b011;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b00011111000;
+let hasNewValue = 1;
+let opNewValue = 0;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vminh_128B : HInst<
+(outs VectorRegs128B:$Vd32),
+(ins VectorRegs128B:$Vu32, VectorRegs128B:$Vv32),
+"$Vd32.h = vmin($Vu32.h,$Vv32.h)",
+CVI_VA, TypeCVI_VA>, Enc_6223403, Requires<[HasV60T,UseHVX]> {
+let Inst{7-5} = 0b011;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b00011111000;
+let hasNewValue = 1;
+let opNewValue = 0;
+let DecoderNamespace = "EXT_mmvec";
+let isCodeGenOnly = 1;
+}
+def V6_vminh_alt : HInst<
+(outs VectorRegs:$Vd32),
+(ins VectorRegs:$Vu32, VectorRegs:$Vv32),
+"$Vd32 = vminh($Vu32,$Vv32)",
+PSEUDO, TypeMAPPING>, Requires<[HasV60T,UseHVX]> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vminh_alt_128B : HInst<
+(outs VectorRegs128B:$Vd32),
+(ins VectorRegs128B:$Vu32, VectorRegs128B:$Vv32),
+"$Vd32 = vminh($Vu32,$Vv32)",
+PSEUDO, TypeMAPPING>, Requires<[HasV60T,UseHVX]> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vminub : HInst<
+(outs VectorRegs:$Vd32),
+(ins VectorRegs:$Vu32, VectorRegs:$Vv32),
+"$Vd32.ub = vmin($Vu32.ub,$Vv32.ub)",
+CVI_VA, TypeCVI_VA>, Enc_6223403, Requires<[HasV60T,UseHVX]> {
+let Inst{7-5} = 0b001;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b00011111000;
+let hasNewValue = 1;
+let opNewValue = 0;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vminub_128B : HInst<
+(outs VectorRegs128B:$Vd32),
+(ins VectorRegs128B:$Vu32, VectorRegs128B:$Vv32),
+"$Vd32.ub = vmin($Vu32.ub,$Vv32.ub)",
+CVI_VA, TypeCVI_VA>, Enc_6223403, Requires<[HasV60T,UseHVX]> {
+let Inst{7-5} = 0b001;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b00011111000;
+let hasNewValue = 1;
+let opNewValue = 0;
+let DecoderNamespace = "EXT_mmvec";
+let isCodeGenOnly = 1;
+}
+def V6_vminub_alt : HInst<
+(outs VectorRegs:$Vd32),
+(ins VectorRegs:$Vu32, VectorRegs:$Vv32),
+"$Vd32 = vminub($Vu32,$Vv32)",
+PSEUDO, TypeMAPPING>, Requires<[HasV60T,UseHVX]> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vminub_alt_128B : HInst<
+(outs VectorRegs128B:$Vd32),
+(ins VectorRegs128B:$Vu32, VectorRegs128B:$Vv32),
+"$Vd32 = vminub($Vu32,$Vv32)",
+PSEUDO, TypeMAPPING>, Requires<[HasV60T,UseHVX]> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vminuh : HInst<
+(outs VectorRegs:$Vd32),
+(ins VectorRegs:$Vu32, VectorRegs:$Vv32),
+"$Vd32.uh = vmin($Vu32.uh,$Vv32.uh)",
+CVI_VA, TypeCVI_VA>, Enc_6223403, Requires<[HasV60T,UseHVX]> {
+let Inst{7-5} = 0b010;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b00011111000;
+let hasNewValue = 1;
+let opNewValue = 0;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vminuh_128B : HInst<
+(outs VectorRegs128B:$Vd32),
+(ins VectorRegs128B:$Vu32, VectorRegs128B:$Vv32),
+"$Vd32.uh = vmin($Vu32.uh,$Vv32.uh)",
+CVI_VA, TypeCVI_VA>, Enc_6223403, Requires<[HasV60T,UseHVX]> {
+let Inst{7-5} = 0b010;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b00011111000;
+let hasNewValue = 1;
+let opNewValue = 0;
+let DecoderNamespace = "EXT_mmvec";
+let isCodeGenOnly = 1;
+}
+def V6_vminuh_alt : HInst<
+(outs VectorRegs:$Vd32),
+(ins VectorRegs:$Vu32, VectorRegs:$Vv32),
+"$Vd32 = vminuh($Vu32,$Vv32)",
+PSEUDO, TypeMAPPING>, Requires<[HasV60T,UseHVX]> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vminuh_alt_128B : HInst<
+(outs VectorRegs128B:$Vd32),
+(ins VectorRegs128B:$Vu32, VectorRegs128B:$Vv32),
+"$Vd32 = vminuh($Vu32,$Vv32)",
+PSEUDO, TypeMAPPING>, Requires<[HasV60T,UseHVX]> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vminw : HInst<
+(outs VectorRegs:$Vd32),
+(ins VectorRegs:$Vu32, VectorRegs:$Vv32),
+"$Vd32.w = vmin($Vu32.w,$Vv32.w)",
+CVI_VA, TypeCVI_VA>, Enc_6223403, Requires<[HasV60T,UseHVX]> {
+let Inst{7-5} = 0b100;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b00011111000;
+let hasNewValue = 1;
+let opNewValue = 0;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vminw_128B : HInst<
+(outs VectorRegs128B:$Vd32),
+(ins VectorRegs128B:$Vu32, VectorRegs128B:$Vv32),
+"$Vd32.w = vmin($Vu32.w,$Vv32.w)",
+CVI_VA, TypeCVI_VA>, Enc_6223403, Requires<[HasV60T,UseHVX]> {
+let Inst{7-5} = 0b100;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b00011111000;
+let hasNewValue = 1;
+let opNewValue = 0;
+let DecoderNamespace = "EXT_mmvec";
+let isCodeGenOnly = 1;
+}
+def V6_vminw_alt : HInst<
+(outs VectorRegs:$Vd32),
+(ins VectorRegs:$Vu32, VectorRegs:$Vv32),
+"$Vd32 = vminw($Vu32,$Vv32)",
+PSEUDO, TypeMAPPING>, Requires<[HasV60T,UseHVX]> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vminw_alt_128B : HInst<
+(outs VectorRegs128B:$Vd32),
+(ins VectorRegs128B:$Vu32, VectorRegs128B:$Vv32),
+"$Vd32 = vminw($Vu32,$Vv32)",
+PSEUDO, TypeMAPPING>, Requires<[HasV60T,UseHVX]> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+}
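+// The vmpa* defs below operate on register pairs (VecDblRegs); their "_acc"
+// forms ("+=", isAccumulator = 1) read-modify-write the destination pair,
+// hence the tied-operand constraint "$Vxx32 = $Vxx32in".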
+def V6_vmpabus : HInst<
+(outs VecDblRegs:$Vdd32),
+(ins VecDblRegs:$Vuu32, IntRegs:$Rt32),
+"$Vdd32.h = vmpa($Vuu32.ub,$Rt32.b)",
+CVI_VX_DV, TypeCVI_VX_DV>, Enc_5023792, Requires<[HasV60T,UseHVX]> {
+let Inst{7-5} = 0b110;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b00011001001;
+let hasNewValue = 1;
+let opNewValue = 0;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vmpabus_128B : HInst<
+(outs VecDblRegs128B:$Vdd32),
+(ins VecDblRegs128B:$Vuu32, IntRegs:$Rt32),
+"$Vdd32.h = vmpa($Vuu32.ub,$Rt32.b)",
+CVI_VX_DV, TypeCVI_VX_DV>, Enc_5023792, Requires<[HasV60T,UseHVX]> {
+let Inst{7-5} = 0b110;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b00011001001;
+let hasNewValue = 1;
+let opNewValue = 0;
+let DecoderNamespace = "EXT_mmvec";
+let isCodeGenOnly = 1;
+}
+def V6_vmpabus_acc : HInst<
+(outs VecDblRegs:$Vxx32),
+(ins VecDblRegs:$Vxx32in, VecDblRegs:$Vuu32, IntRegs:$Rt32),
+"$Vxx32.h += vmpa($Vuu32.ub,$Rt32.b)",
+CVI_VX_DV, TypeCVI_VX_DV>, Enc_4327792, Requires<[HasV60T,UseHVX]> {
+let Inst{7-5} = 0b110;
+let Inst{13-13} = 0b1;
+let Inst{31-21} = 0b00011001001;
+let hasNewValue = 1;
+let opNewValue = 0;
+let isAccumulator = 1;
+let DecoderNamespace = "EXT_mmvec";
+let Constraints = "$Vxx32 = $Vxx32in";
+}
+def V6_vmpabus_acc_128B : HInst<
+(outs VecDblRegs128B:$Vxx32),
+(ins VecDblRegs128B:$Vxx32in, VecDblRegs128B:$Vuu32, IntRegs:$Rt32),
+"$Vxx32.h += vmpa($Vuu32.ub,$Rt32.b)",
+CVI_VX_DV, TypeCVI_VX_DV>, Enc_4327792, Requires<[HasV60T,UseHVX]> {
+let Inst{7-5} = 0b110;
+let Inst{13-13} = 0b1;
+let Inst{31-21} = 0b00011001001;
+let hasNewValue = 1;
+let opNewValue = 0;
+let isAccumulator = 1;
+let DecoderNamespace = "EXT_mmvec";
+let isCodeGenOnly = 1;
+let Constraints = "$Vxx32 = $Vxx32in";
+}
+def V6_vmpabus_acc_alt : HInst<
+(outs VecDblRegs:$Vxx32),
+(ins VecDblRegs:$Vxx32in, VecDblRegs:$Vuu32, IntRegs:$Rt32),
+"$Vxx32 += vmpabus($Vuu32,$Rt32)",
+PSEUDO, TypeMAPPING>, Requires<[HasV60T,UseHVX]> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isAccumulator = 1;
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+let Constraints = "$Vxx32 = $Vxx32in";
+}
+def V6_vmpabus_acc_alt_128B : HInst<
+(outs VecDblRegs128B:$Vxx32),
+(ins VecDblRegs128B:$Vxx32in, VecDblRegs128B:$Vuu32, IntRegs:$Rt32),
+"$Vxx32 += vmpabus($Vuu32,$Rt32)",
+PSEUDO, TypeMAPPING>, Requires<[HasV60T,UseHVX]> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isAccumulator = 1;
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+let Constraints = "$Vxx32 = $Vxx32in";
+}
+def V6_vmpabus_alt : HInst<
+(outs VecDblRegs:$Vdd32),
+(ins VecDblRegs:$Vuu32, IntRegs:$Rt32),
+"$Vdd32 = vmpabus($Vuu32,$Rt32)",
+PSEUDO, TypeMAPPING>, Requires<[HasV60T,UseHVX]> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vmpabus_alt_128B : HInst<
+(outs VecDblRegs128B:$Vdd32),
+(ins VecDblRegs128B:$Vuu32, IntRegs:$Rt32),
+"$Vdd32 = vmpabus($Vuu32,$Rt32)",
+PSEUDO, TypeMAPPING>, Requires<[HasV60T,UseHVX]> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vmpabusv : HInst<
+(outs VecDblRegs:$Vdd32),
+(ins VecDblRegs:$Vuu32, VecDblRegs:$Vvv32),
+"$Vdd32.h = vmpa($Vuu32.ub,$Vvv32.b)",
+CVI_VX_DV_LONG, TypeCVI_VX_DV>, Enc_13211717, Requires<[HasV60T,UseHVX]> {
+let Inst{7-5} = 0b011;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b00011100001;
+let hasNewValue = 1;
+let opNewValue = 0;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vmpabusv_128B : HInst<
+(outs VecDblRegs128B:$Vdd32),
+(ins VecDblRegs128B:$Vuu32, VecDblRegs128B:$Vvv32),
+"$Vdd32.h = vmpa($Vuu32.ub,$Vvv32.b)",
+CVI_VX_DV_LONG, TypeCVI_VX_DV>, Enc_13211717, Requires<[HasV60T,UseHVX]> {
+let Inst{7-5} = 0b011;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b00011100001;
+let hasNewValue = 1;
+let opNewValue = 0;
+let DecoderNamespace = "EXT_mmvec";
+let isCodeGenOnly = 1;
+}
+def V6_vmpabusv_alt : HInst<
+(outs VecDblRegs:$Vdd32),
+(ins VecDblRegs:$Vuu32, VecDblRegs:$Vvv32),
+"$Vdd32 = vmpabus($Vuu32,$Vvv32)",
+PSEUDO, TypeMAPPING>, Requires<[HasV60T,UseHVX]> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vmpabusv_alt_128B : HInst<
+(outs VecDblRegs128B:$Vdd32),
+(ins VecDblRegs128B:$Vuu32, VecDblRegs128B:$Vvv32),
+"$Vdd32 = vmpabus($Vuu32,$Vvv32)",
+PSEUDO, TypeMAPPING>, Requires<[HasV60T,UseHVX]> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vmpabuuv : HInst<
+(outs VecDblRegs:$Vdd32),
+(ins VecDblRegs:$Vuu32, VecDblRegs:$Vvv32),
+"$Vdd32.h = vmpa($Vuu32.ub,$Vvv32.ub)",
+CVI_VX_DV_LONG, TypeCVI_VX_DV>, Enc_13211717, Requires<[HasV60T,UseHVX]> {
+let Inst{7-5} = 0b111;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b00011100111;
+let hasNewValue = 1;
+let opNewValue = 0;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vmpabuuv_128B : HInst<
+(outs VecDblRegs128B:$Vdd32),
+(ins VecDblRegs128B:$Vuu32, VecDblRegs128B:$Vvv32),
+"$Vdd32.h = vmpa($Vuu32.ub,$Vvv32.ub)",
+CVI_VX_DV_LONG, TypeCVI_VX_DV>, Enc_13211717, Requires<[HasV60T,UseHVX]> {
+let Inst{7-5} = 0b111;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b00011100111;
+let hasNewValue = 1;
+let opNewValue = 0;
+let DecoderNamespace = "EXT_mmvec";
+let isCodeGenOnly = 1;
+}
+def V6_vmpabuuv_alt : HInst<
+(outs VecDblRegs:$Vdd32),
+(ins VecDblRegs:$Vuu32, VecDblRegs:$Vvv32),
+"$Vdd32 = vmpabuu($Vuu32,$Vvv32)",
+PSEUDO, TypeMAPPING>, Requires<[HasV60T,UseHVX]> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vmpabuuv_alt_128B : HInst<
+(outs VecDblRegs128B:$Vdd32),
+(ins VecDblRegs128B:$Vuu32, VecDblRegs128B:$Vvv32),
+"$Vdd32 = vmpabuu($Vuu32,$Vvv32)",
+PSEUDO, TypeMAPPING>, Requires<[HasV60T,UseHVX]> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vmpahb : HInst<
+(outs VecDblRegs:$Vdd32),
+(ins VecDblRegs:$Vuu32, IntRegs:$Rt32),
+"$Vdd32.w = vmpa($Vuu32.h,$Rt32.b)",
+CVI_VX_DV, TypeCVI_VX_DV>, Enc_5023792, Requires<[HasV60T,UseHVX]> {
+let Inst{7-5} = 0b111;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b00011001001;
+let hasNewValue = 1;
+let opNewValue = 0;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vmpahb_128B : HInst<
+(outs VecDblRegs128B:$Vdd32),
+(ins VecDblRegs128B:$Vuu32, IntRegs:$Rt32),
+"$Vdd32.w = vmpa($Vuu32.h,$Rt32.b)",
+CVI_VX_DV, TypeCVI_VX_DV>, Enc_5023792, Requires<[HasV60T,UseHVX]> {
+let Inst{7-5} = 0b111;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b00011001001;
+let hasNewValue = 1;
+let opNewValue = 0;
+let DecoderNamespace = "EXT_mmvec";
+let isCodeGenOnly = 1;
+}
+def V6_vmpahb_acc : HInst<
+(outs VecDblRegs:$Vxx32),
+(ins VecDblRegs:$Vxx32in, VecDblRegs:$Vuu32, IntRegs:$Rt32),
+"$Vxx32.w += vmpa($Vuu32.h,$Rt32.b)",
+CVI_VX_DV, TypeCVI_VX_DV>, Enc_4327792, Requires<[HasV60T,UseHVX]> {
+let Inst{7-5} = 0b111;
+let Inst{13-13} = 0b1;
+let Inst{31-21} = 0b00011001001;
+let hasNewValue = 1;
+let opNewValue = 0;
+let isAccumulator = 1;
+let DecoderNamespace = "EXT_mmvec";
+let Constraints = "$Vxx32 = $Vxx32in";
+}
+def V6_vmpahb_acc_128B : HInst<
+(outs VecDblRegs128B:$Vxx32),
+(ins VecDblRegs128B:$Vxx32in, VecDblRegs128B:$Vuu32, IntRegs:$Rt32),
+"$Vxx32.w += vmpa($Vuu32.h,$Rt32.b)",
+CVI_VX_DV, TypeCVI_VX_DV>, Enc_4327792, Requires<[HasV60T,UseHVX]> {
+let Inst{7-5} = 0b111;
+let Inst{13-13} = 0b1;
+let Inst{31-21} = 0b00011001001;
+let hasNewValue = 1;
+let opNewValue = 0;
+let isAccumulator = 1;
+let DecoderNamespace = "EXT_mmvec";
+let isCodeGenOnly = 1;
+let Constraints = "$Vxx32 = $Vxx32in";
+}
+def V6_vmpahb_acc_alt : HInst<
+(outs VecDblRegs:$Vxx32),
+(ins VecDblRegs:$Vxx32in, VecDblRegs:$Vuu32, IntRegs:$Rt32),
+"$Vxx32 += vmpahb($Vuu32,$Rt32)",
+PSEUDO, TypeMAPPING>, Requires<[HasV60T,UseHVX]> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isAccumulator = 1;
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+let Constraints = "$Vxx32 = $Vxx32in";
+}
+def V6_vmpahb_acc_alt_128B : HInst<
+(outs VecDblRegs128B:$Vxx32),
+(ins VecDblRegs128B:$Vxx32in, VecDblRegs128B:$Vuu32, IntRegs:$Rt32),
+"$Vxx32 += vmpahb($Vuu32,$Rt32)",
+PSEUDO, TypeMAPPING>, Requires<[HasV60T,UseHVX]> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isAccumulator = 1;
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+let Constraints = "$Vxx32 = $Vxx32in";
+}
+def V6_vmpahb_alt : HInst<
+(outs VecDblRegs:$Vdd32),
+(ins VecDblRegs:$Vuu32, IntRegs:$Rt32),
+"$Vdd32 = vmpahb($Vuu32,$Rt32)",
+PSEUDO, TypeMAPPING>, Requires<[HasV60T,UseHVX]> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vmpahb_alt_128B : HInst<
+(outs VecDblRegs128B:$Vdd32),
+(ins VecDblRegs128B:$Vuu32, IntRegs:$Rt32),
+"$Vdd32 = vmpahb($Vuu32,$Rt32)",
+PSEUDO, TypeMAPPING>, Requires<[HasV60T,UseHVX]> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vmpauhb : HInst<
+(outs VecDblRegs:$Vdd32),
+(ins VecDblRegs:$Vuu32, IntRegs:$Rt32),
+"$Vdd32.w = vmpa($Vuu32.uh,$Rt32.b)",
+CVI_VX_DV_LONG, TypeCVI_VX_DV>, Enc_5023792, Requires<[HasV62T,UseHVX]> {
+let Inst{7-5} = 0b101;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b00011001100;
+let hasNewValue = 1;
+let opNewValue = 0;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vmpauhb_128B : HInst<
+(outs VecDblRegs128B:$Vdd32),
+(ins VecDblRegs128B:$Vuu32, IntRegs:$Rt32),
+"$Vdd32.w = vmpa($Vuu32.uh,$Rt32.b)",
+CVI_VX_DV_LONG, TypeCVI_VX_DV>, Enc_5023792, Requires<[HasV62T,UseHVX]> {
+let Inst{7-5} = 0b101;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b00011001100;
+let hasNewValue = 1;
+let opNewValue = 0;
+let DecoderNamespace = "EXT_mmvec";
+let isCodeGenOnly = 1;
+}
+def V6_vmpauhb_acc : HInst<
+(outs VecDblRegs:$Vxx32),
+(ins VecDblRegs:$Vxx32in, VecDblRegs:$Vuu32, IntRegs:$Rt32),
+"$Vxx32.w += vmpa($Vuu32.uh,$Rt32.b)",
+CVI_VX_DV_LONG, TypeCVI_VX_DV>, Enc_4327792, Requires<[HasV62T,UseHVX]> {
+let Inst{7-5} = 0b010;
+let Inst{13-13} = 0b1;
+let Inst{31-21} = 0b00011001100;
+let hasNewValue = 1;
+let opNewValue = 0;
+let isAccumulator = 1;
+let DecoderNamespace = "EXT_mmvec";
+let Constraints = "$Vxx32 = $Vxx32in";
+}
+def V6_vmpauhb_acc_128B : HInst<
+(outs VecDblRegs128B:$Vxx32),
+(ins VecDblRegs128B:$Vxx32in, VecDblRegs128B:$Vuu32, IntRegs:$Rt32),
+"$Vxx32.w += vmpa($Vuu32.uh,$Rt32.b)",
+CVI_VX_DV_LONG, TypeCVI_VX_DV>, Enc_4327792, Requires<[HasV62T,UseHVX]> {
+let Inst{7-5} = 0b010;
+let Inst{13-13} = 0b1;
+let Inst{31-21} = 0b00011001100;
+let hasNewValue = 1;
+let opNewValue = 0;
+let isAccumulator = 1;
+let DecoderNamespace = "EXT_mmvec";
+let isCodeGenOnly = 1;
+let Constraints = "$Vxx32 = $Vxx32in";
+}
+def V6_vmpauhb_acc_alt : HInst<
+(outs VecDblRegs:$Vxx32),
+(ins VecDblRegs:$Vxx32in, VecDblRegs:$Vuu32, IntRegs:$Rt32),
+"$Vxx32 += vmpauhb($Vuu32,$Rt32)",
+PSEUDO, TypeMAPPING>, Requires<[HasV62T,UseHVX]> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isAccumulator = 1;
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+let Constraints = "$Vxx32 = $Vxx32in";
+}
+def V6_vmpauhb_acc_alt_128B : HInst<
+(outs VecDblRegs128B:$Vxx32),
+(ins VecDblRegs128B:$Vxx32in, VecDblRegs128B:$Vuu32, IntRegs:$Rt32),
+"$Vxx32 += vmpauhb($Vuu32,$Rt32)",
+PSEUDO, TypeMAPPING>, Requires<[HasV62T,UseHVX]> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isAccumulator = 1;
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+let Constraints = "$Vxx32 = $Vxx32in";
+}
+def V6_vmpauhb_alt : HInst<
+(outs VecDblRegs:$Vdd32),
+(ins VecDblRegs:$Vuu32, IntRegs:$Rt32),
+"$Vdd32 = vmpauhb($Vuu32,$Rt32)",
+PSEUDO, TypeMAPPING>, Requires<[HasV62T,UseHVX]> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vmpauhb_alt_128B : HInst<
+(outs VecDblRegs128B:$Vdd32),
+(ins VecDblRegs128B:$Vuu32, IntRegs:$Rt32),
+"$Vdd32 = vmpauhb($Vuu32,$Rt32)",
+PSEUDO, TypeMAPPING>, Requires<[HasV62T,UseHVX]> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+}
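+// The vmpybus*/vmpybv* defs below appear to be widening multiplies, judging
+// from the operand types: single-vector inputs, a VecDblRegs pair out, with
+// elements widening (e.g. .ub x .b -> .h).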
+def V6_vmpybus : HInst<
+(outs VecDblRegs:$Vdd32),
+(ins VectorRegs:$Vu32, IntRegs:$Rt32),
+"$Vdd32.h = vmpy($Vu32.ub,$Rt32.b)",
+CVI_VX_DV, TypeCVI_VX_DV>, Enc_11471622, Requires<[HasV60T,UseHVX]> {
+let Inst{7-5} = 0b101;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b00011001001;
+let hasNewValue = 1;
+let opNewValue = 0;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vmpybus_128B : HInst<
+(outs VecDblRegs128B:$Vdd32),
+(ins VectorRegs128B:$Vu32, IntRegs:$Rt32),
+"$Vdd32.h = vmpy($Vu32.ub,$Rt32.b)",
+CVI_VX_DV, TypeCVI_VX_DV>, Enc_11471622, Requires<[HasV60T,UseHVX]> {
+let Inst{7-5} = 0b101;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b00011001001;
+let hasNewValue = 1;
+let opNewValue = 0;
+let DecoderNamespace = "EXT_mmvec";
+let isCodeGenOnly = 1;
+}
+def V6_vmpybus_acc : HInst<
+(outs VecDblRegs:$Vxx32),
+(ins VecDblRegs:$Vxx32in, VectorRegs:$Vu32, IntRegs:$Rt32),
+"$Vxx32.h += vmpy($Vu32.ub,$Rt32.b)",
+CVI_VX_DV, TypeCVI_VX_DV>, Enc_2153798, Requires<[HasV60T,UseHVX]> {
+let Inst{7-5} = 0b101;
+let Inst{13-13} = 0b1;
+let Inst{31-21} = 0b00011001001;
+let hasNewValue = 1;
+let opNewValue = 0;
+let isAccumulator = 1;
+let DecoderNamespace = "EXT_mmvec";
+let Constraints = "$Vxx32 = $Vxx32in";
+}
+def V6_vmpybus_acc_128B : HInst<
+(outs VecDblRegs128B:$Vxx32),
+(ins VecDblRegs128B:$Vxx32in, VectorRegs128B:$Vu32, IntRegs:$Rt32),
+"$Vxx32.h += vmpy($Vu32.ub,$Rt32.b)",
+CVI_VX_DV, TypeCVI_VX_DV>, Enc_2153798, Requires<[HasV60T,UseHVX]> {
+let Inst{7-5} = 0b101;
+let Inst{13-13} = 0b1;
+let Inst{31-21} = 0b00011001001;
+let hasNewValue = 1;
+let opNewValue = 0;
+let isAccumulator = 1;
+let DecoderNamespace = "EXT_mmvec";
+let isCodeGenOnly = 1;
+let Constraints = "$Vxx32 = $Vxx32in";
+}
+def V6_vmpybus_acc_alt : HInst<
+(outs VecDblRegs:$Vxx32),
+(ins VecDblRegs:$Vxx32in, VectorRegs:$Vu32, IntRegs:$Rt32),
+"$Vxx32 += vmpybus($Vu32,$Rt32)",
+PSEUDO, TypeMAPPING>, Requires<[HasV60T,UseHVX]> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isAccumulator = 1;
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+let Constraints = "$Vxx32 = $Vxx32in";
+}
+def V6_vmpybus_acc_alt_128B : HInst<
+(outs VecDblRegs128B:$Vxx32),
+(ins VecDblRegs128B:$Vxx32in, VectorRegs128B:$Vu32, IntRegs:$Rt32),
+"$Vxx32 += vmpybus($Vu32,$Rt32)",
+PSEUDO, TypeMAPPING>, Requires<[HasV60T,UseHVX]> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isAccumulator = 1;
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+let Constraints = "$Vxx32 = $Vxx32in";
+}
+def V6_vmpybus_alt : HInst<
+(outs VecDblRegs:$Vdd32),
+(ins VectorRegs:$Vu32, IntRegs:$Rt32),
+"$Vdd32 = vmpybus($Vu32,$Rt32)",
+PSEUDO, TypeMAPPING>, Requires<[HasV60T,UseHVX]> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vmpybus_alt_128B : HInst<
+(outs VecDblRegs128B:$Vdd32),
+(ins VectorRegs128B:$Vu32, IntRegs:$Rt32),
+"$Vdd32 = vmpybus($Vu32,$Rt32)",
+PSEUDO, TypeMAPPING>, Requires<[HasV60T,UseHVX]> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vmpybusv : HInst<
+(outs VecDblRegs:$Vdd32),
+(ins VectorRegs:$Vu32, VectorRegs:$Vv32),
+"$Vdd32.h = vmpy($Vu32.ub,$Vv32.b)",
+CVI_VX_DV, TypeCVI_VX_DV>, Enc_15290236, Requires<[HasV60T,UseHVX]> {
+let Inst{7-5} = 0b110;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b00011100000;
+let hasNewValue = 1;
+let opNewValue = 0;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vmpybusv_128B : HInst<
+(outs VecDblRegs128B:$Vdd32),
+(ins VectorRegs128B:$Vu32, VectorRegs128B:$Vv32),
+"$Vdd32.h = vmpy($Vu32.ub,$Vv32.b)",
+CVI_VX_DV, TypeCVI_VX_DV>, Enc_15290236, Requires<[HasV60T,UseHVX]> {
+let Inst{7-5} = 0b110;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b00011100000;
+let hasNewValue = 1;
+let opNewValue = 0;
+let DecoderNamespace = "EXT_mmvec";
+let isCodeGenOnly = 1;
+}
+def V6_vmpybusv_acc : HInst<
+(outs VecDblRegs:$Vxx32),
+(ins VecDblRegs:$Vxx32in, VectorRegs:$Vu32, VectorRegs:$Vv32),
+"$Vxx32.h += vmpy($Vu32.ub,$Vv32.b)",
+CVI_VX_DV, TypeCVI_VX_DV>, Enc_5972412, Requires<[HasV60T,UseHVX]> {
+let Inst{7-5} = 0b110;
+let Inst{13-13} = 0b1;
+let Inst{31-21} = 0b00011100000;
+let hasNewValue = 1;
+let opNewValue = 0;
+let isAccumulator = 1;
+let DecoderNamespace = "EXT_mmvec";
+let Constraints = "$Vxx32 = $Vxx32in";
+}
+def V6_vmpybusv_acc_128B : HInst<
+(outs VecDblRegs128B:$Vxx32),
+(ins VecDblRegs128B:$Vxx32in, VectorRegs128B:$Vu32, VectorRegs128B:$Vv32),
+"$Vxx32.h += vmpy($Vu32.ub,$Vv32.b)",
+CVI_VX_DV, TypeCVI_VX_DV>, Enc_5972412, Requires<[HasV60T,UseHVX]> {
+let Inst{7-5} = 0b110;
+let Inst{13-13} = 0b1;
+let Inst{31-21} = 0b00011100000;
+let hasNewValue = 1;
+let opNewValue = 0;
+let isAccumulator = 1;
+let DecoderNamespace = "EXT_mmvec";
+let isCodeGenOnly = 1;
+let Constraints = "$Vxx32 = $Vxx32in";
+}
+def V6_vmpybusv_acc_alt : HInst<
+(outs VecDblRegs:$Vxx32),
+(ins VecDblRegs:$Vxx32in, VectorRegs:$Vu32, VectorRegs:$Vv32),
+"$Vxx32 += vmpybus($Vu32,$Vv32)",
+PSEUDO, TypeMAPPING>, Requires<[HasV60T,UseHVX]> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isAccumulator = 1;
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+let Constraints = "$Vxx32 = $Vxx32in";
+}
+def V6_vmpybusv_acc_alt_128B : HInst<
+(outs VecDblRegs128B:$Vxx32),
+(ins VecDblRegs128B:$Vxx32in, VectorRegs128B:$Vu32, VectorRegs128B:$Vv32),
+"$Vxx32 += vmpybus($Vu32,$Vv32)",
+PSEUDO, TypeMAPPING>, Requires<[HasV60T,UseHVX]> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isAccumulator = 1;
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+let Constraints = "$Vxx32 = $Vxx32in";
+}
+def V6_vmpybusv_alt : HInst<
+(outs VecDblRegs:$Vdd32),
+(ins VectorRegs:$Vu32, VectorRegs:$Vv32),
+"$Vdd32 = vmpybus($Vu32,$Vv32)",
+PSEUDO, TypeMAPPING>, Requires<[HasV60T,UseHVX]> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vmpybusv_alt_128B : HInst<
+(outs VecDblRegs128B:$Vdd32),
+(ins VectorRegs128B:$Vu32, VectorRegs128B:$Vv32),
+"$Vdd32 = vmpybus($Vu32,$Vv32)",
+PSEUDO, TypeMAPPING>, Requires<[HasV60T,UseHVX]> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vmpybv : HInst<
+(outs VecDblRegs:$Vdd32),
+(ins VectorRegs:$Vu32, VectorRegs:$Vv32),
+"$Vdd32.h = vmpy($Vu32.b,$Vv32.b)",
+CVI_VX_DV, TypeCVI_VX_DV>, Enc_15290236, Requires<[HasV60T,UseHVX]> {
+let Inst{7-5} = 0b100;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b00011100000;
+let hasNewValue = 1;
+let opNewValue = 0;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vmpybv_128B : HInst<
+(outs VecDblRegs128B:$Vdd32),
+(ins VectorRegs128B:$Vu32, VectorRegs128B:$Vv32),
+"$Vdd32.h = vmpy($Vu32.b,$Vv32.b)",
+CVI_VX_DV, TypeCVI_VX_DV>, Enc_15290236, Requires<[HasV60T,UseHVX]> {
+let Inst{7-5} = 0b100;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b00011100000;
+let hasNewValue = 1;
+let opNewValue = 0;
+let DecoderNamespace = "EXT_mmvec";
+let isCodeGenOnly = 1;
+}
+def V6_vmpybv_acc : HInst<
+(outs VecDblRegs:$Vxx32),
+(ins VecDblRegs:$Vxx32in, VectorRegs:$Vu32, VectorRegs:$Vv32),
+"$Vxx32.h += vmpy($Vu32.b,$Vv32.b)",
+CVI_VX_DV, TypeCVI_VX_DV>, Enc_5972412, Requires<[HasV60T,UseHVX]> {
+let Inst{7-5} = 0b100;
+let Inst{13-13} = 0b1;
+let Inst{31-21} = 0b00011100000;
+let hasNewValue = 1;
+let opNewValue = 0;
+let isAccumulator = 1;
+let DecoderNamespace = "EXT_mmvec";
+let Constraints = "$Vxx32 = $Vxx32in";
+}
+def V6_vmpybv_acc_128B : HInst<
+(outs VecDblRegs128B:$Vxx32),
+(ins VecDblRegs128B:$Vxx32in, VectorRegs128B:$Vu32, VectorRegs128B:$Vv32),
+"$Vxx32.h += vmpy($Vu32.b,$Vv32.b)",
+CVI_VX_DV, TypeCVI_VX_DV>, Enc_5972412, Requires<[HasV60T,UseHVX]> {
+let Inst{7-5} = 0b100;
+let Inst{13-13} = 0b1;
+let Inst{31-21} = 0b00011100000;
+let hasNewValue = 1;
+let opNewValue = 0;
+let isAccumulator = 1;
+let DecoderNamespace = "EXT_mmvec";
+let isCodeGenOnly = 1;
+let Constraints = "$Vxx32 = $Vxx32in";
+}
+def V6_vmpybv_acc_alt : HInst<
+(outs VecDblRegs:$Vxx32),
+(ins VecDblRegs:$Vxx32in, VectorRegs:$Vu32, VectorRegs:$Vv32),
+"$Vxx32 += vmpyb($Vu32,$Vv32)",
+PSEUDO, TypeMAPPING>, Requires<[HasV60T,UseHVX]> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isAccumulator = 1;
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+let Constraints = "$Vxx32 = $Vxx32in";
+}
+def V6_vmpybv_acc_alt_128B : HInst<
+(outs VecDblRegs128B:$Vxx32),
+(ins VecDblRegs128B:$Vxx32in, VectorRegs128B:$Vu32, VectorRegs128B:$Vv32),
+"$Vxx32 += vmpyb($Vu32,$Vv32)",
+PSEUDO, TypeMAPPING>, Requires<[HasV60T,UseHVX]> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isAccumulator = 1;
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+let Constraints = "$Vxx32 = $Vxx32in";
+}
+def V6_vmpybv_alt : HInst<
+(outs VecDblRegs:$Vdd32),
+(ins VectorRegs:$Vu32, VectorRegs:$Vv32),
+"$Vdd32 = vmpyb($Vu32,$Vv32)",
+PSEUDO, TypeMAPPING>, Requires<[HasV60T,UseHVX]> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vmpybv_alt_128B : HInst<
+(outs VecDblRegs128B:$Vdd32),
+(ins VectorRegs128B:$Vu32, VectorRegs128B:$Vv32),
+"$Vdd32 = vmpyb($Vu32,$Vv32)",
+PSEUDO, TypeMAPPING>, Requires<[HasV60T,UseHVX]> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+}
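+// vmpyewuh keeps a single-vector .w result; its "_64" form (V62 only, note
+// HasV62T) writes a full register pair instead.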
+def V6_vmpyewuh : HInst<
+(outs VectorRegs:$Vd32),
+(ins VectorRegs:$Vu32, VectorRegs:$Vv32),
+"$Vd32.w = vmpye($Vu32.w,$Vv32.uh)",
+CVI_VX_DV, TypeCVI_VX_DV>, Enc_6223403, Requires<[HasV60T,UseHVX]> {
+let Inst{7-5} = 0b101;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b00011111111;
+let hasNewValue = 1;
+let opNewValue = 0;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vmpyewuh_128B : HInst<
+(outs VectorRegs128B:$Vd32),
+(ins VectorRegs128B:$Vu32, VectorRegs128B:$Vv32),
+"$Vd32.w = vmpye($Vu32.w,$Vv32.uh)",
+CVI_VX_DV, TypeCVI_VX_DV>, Enc_6223403, Requires<[HasV60T,UseHVX]> {
+let Inst{7-5} = 0b101;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b00011111111;
+let hasNewValue = 1;
+let opNewValue = 0;
+let DecoderNamespace = "EXT_mmvec";
+let isCodeGenOnly = 1;
+}
+def V6_vmpyewuh_64 : HInst<
+(outs VecDblRegs:$Vdd32),
+(ins VectorRegs:$Vu32, VectorRegs:$Vv32),
+"$Vdd32 = vmpye($Vu32.w,$Vv32.uh)",
+CVI_VX_DV_LONG, TypeCVI_VX_DV>, Enc_15290236, Requires<[HasV62T,UseHVX]> {
+let Inst{7-5} = 0b110;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b00011110101;
+let hasNewValue = 1;
+let opNewValue = 0;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vmpyewuh_64_128B : HInst<
+(outs VecDblRegs128B:$Vdd32),
+(ins VectorRegs128B:$Vu32, VectorRegs128B:$Vv32),
+"$Vdd32 = vmpye($Vu32.w,$Vv32.uh)",
+CVI_VX_DV_LONG, TypeCVI_VX_DV>, Enc_15290236, Requires<[HasV62T,UseHVX]> {
+let Inst{7-5} = 0b110;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b00011110101;
+let hasNewValue = 1;
+let opNewValue = 0;
+let DecoderNamespace = "EXT_mmvec";
+let isCodeGenOnly = 1;
+}
+def V6_vmpyewuh_alt : HInst<
+(outs VectorRegs:$Vd32),
+(ins VectorRegs:$Vu32, VectorRegs:$Vv32),
+"$Vd32 = vmpyewuh($Vu32,$Vv32)",
+PSEUDO, TypeMAPPING>, Requires<[HasV60T,UseHVX]> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vmpyewuh_alt_128B : HInst<
+(outs VectorRegs128B:$Vd32),
+(ins VectorRegs128B:$Vu32, VectorRegs128B:$Vv32),
+"$Vd32 = vmpyewuh($Vu32,$Vv32)",
+PSEUDO, TypeMAPPING>, Requires<[HasV60T,UseHVX]> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+}
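+// vmpyh widens h x h -> w into a pair; vmpyhss and vmpyhsrs instead keep a
+// single-vector .h result with ":<<1:sat" and ":<<1:rnd:sat" result
+// processing respectively, per their assembly strings.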
+def V6_vmpyh : HInst<
+(outs VecDblRegs:$Vdd32),
+(ins VectorRegs:$Vu32, IntRegs:$Rt32),
+"$Vdd32.w = vmpy($Vu32.h,$Rt32.h)",
+CVI_VX_DV, TypeCVI_VX_DV>, Enc_11471622, Requires<[HasV60T,UseHVX]> {
+let Inst{7-5} = 0b000;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b00011001010;
+let hasNewValue = 1;
+let opNewValue = 0;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vmpyh_128B : HInst<
+(outs VecDblRegs128B:$Vdd32),
+(ins VectorRegs128B:$Vu32, IntRegs:$Rt32),
+"$Vdd32.w = vmpy($Vu32.h,$Rt32.h)",
+CVI_VX_DV, TypeCVI_VX_DV>, Enc_11471622, Requires<[HasV60T,UseHVX]> {
+let Inst{7-5} = 0b000;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b00011001010;
+let hasNewValue = 1;
+let opNewValue = 0;
+let DecoderNamespace = "EXT_mmvec";
+let isCodeGenOnly = 1;
+}
+def V6_vmpyh_alt : HInst<
+(outs VecDblRegs:$Vdd32),
+(ins VectorRegs:$Vu32, IntRegs:$Rt32),
+"$Vdd32 = vmpyh($Vu32,$Rt32)",
+PSEUDO, TypeMAPPING>, Requires<[HasV60T,UseHVX]> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vmpyh_alt_128B : HInst<
+(outs VecDblRegs128B:$Vdd32),
+(ins VectorRegs128B:$Vu32, IntRegs:$Rt32),
+"$Vdd32 = vmpyh($Vu32,$Rt32)",
+PSEUDO, TypeMAPPING>, Requires<[HasV60T,UseHVX]> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vmpyhsat_acc : HInst<
+(outs VecDblRegs:$Vxx32),
+(ins VecDblRegs:$Vxx32in, VectorRegs:$Vu32, IntRegs:$Rt32),
+"$Vxx32.w += vmpy($Vu32.h,$Rt32.h):sat",
+CVI_VX_DV, TypeCVI_VX_DV>, Enc_2153798, Requires<[HasV60T,UseHVX]> {
+let Inst{7-5} = 0b000;
+let Inst{13-13} = 0b1;
+let Inst{31-21} = 0b00011001010;
+let hasNewValue = 1;
+let opNewValue = 0;
+let isAccumulator = 1;
+let DecoderNamespace = "EXT_mmvec";
+let Constraints = "$Vxx32 = $Vxx32in";
+}
+def V6_vmpyhsat_acc_128B : HInst<
+(outs VecDblRegs128B:$Vxx32),
+(ins VecDblRegs128B:$Vxx32in, VectorRegs128B:$Vu32, IntRegs:$Rt32),
+"$Vxx32.w += vmpy($Vu32.h,$Rt32.h):sat",
+CVI_VX_DV, TypeCVI_VX_DV>, Enc_2153798, Requires<[HasV60T,UseHVX]> {
+let Inst{7-5} = 0b000;
+let Inst{13-13} = 0b1;
+let Inst{31-21} = 0b00011001010;
+let hasNewValue = 1;
+let opNewValue = 0;
+let isAccumulator = 1;
+let DecoderNamespace = "EXT_mmvec";
+let isCodeGenOnly = 1;
+let Constraints = "$Vxx32 = $Vxx32in";
+}
+def V6_vmpyhsat_acc_alt : HInst<
+(outs VecDblRegs:$Vxx32),
+(ins VecDblRegs:$Vxx32in, VectorRegs:$Vu32, IntRegs:$Rt32),
+"$Vxx32 += vmpyh($Vu32,$Rt32):sat",
+PSEUDO, TypeMAPPING>, Requires<[HasV60T,UseHVX]> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isAccumulator = 1;
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+let Constraints = "$Vxx32 = $Vxx32in";
+}
+def V6_vmpyhsat_acc_alt_128B : HInst<
+(outs VecDblRegs128B:$Vxx32),
+(ins VecDblRegs128B:$Vxx32in, VectorRegs128B:$Vu32, IntRegs:$Rt32),
+"$Vxx32 += vmpyh($Vu32,$Rt32):sat",
+PSEUDO, TypeMAPPING>, Requires<[HasV60T,UseHVX]> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isAccumulator = 1;
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+let Constraints = "$Vxx32 = $Vxx32in";
+}
+def V6_vmpyhsrs : HInst<
+(outs VectorRegs:$Vd32),
+(ins VectorRegs:$Vu32, IntRegs:$Rt32),
+"$Vd32.h = vmpy($Vu32.h,$Rt32.h):<<1:rnd:sat",
+CVI_VX_DV, TypeCVI_VX_DV>, Enc_16214129, Requires<[HasV60T,UseHVX]> {
+let Inst{7-5} = 0b010;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b00011001010;
+let hasNewValue = 1;
+let opNewValue = 0;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vmpyhsrs_128B : HInst<
+(outs VectorRegs128B:$Vd32),
+(ins VectorRegs128B:$Vu32, IntRegs:$Rt32),
+"$Vd32.h = vmpy($Vu32.h,$Rt32.h):<<1:rnd:sat",
+CVI_VX_DV, TypeCVI_VX_DV>, Enc_16214129, Requires<[HasV60T,UseHVX]> {
+let Inst{7-5} = 0b010;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b00011001010;
+let hasNewValue = 1;
+let opNewValue = 0;
+let DecoderNamespace = "EXT_mmvec";
+let isCodeGenOnly = 1;
+}
+def V6_vmpyhsrs_alt : HInst<
+(outs VectorRegs:$Vd32),
+(ins VectorRegs:$Vu32, IntRegs:$Rt32),
+"$Vd32 = vmpyh($Vu32,$Rt32):<<1:rnd:sat",
+PSEUDO, TypeMAPPING>, Requires<[HasV60T,UseHVX]> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vmpyhsrs_alt_128B : HInst<
+(outs VectorRegs128B:$Vd32),
+(ins VectorRegs128B:$Vu32, IntRegs:$Rt32),
+"$Vd32 = vmpyh($Vu32,$Rt32):<<1:rnd:sat",
+PSEUDO, TypeMAPPING>, Requires<[HasV60T,UseHVX]> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vmpyhss : HInst<
+(outs VectorRegs:$Vd32),
+(ins VectorRegs:$Vu32, IntRegs:$Rt32),
+"$Vd32.h = vmpy($Vu32.h,$Rt32.h):<<1:sat",
+CVI_VX_DV, TypeCVI_VX_DV>, Enc_16214129, Requires<[HasV60T,UseHVX]> {
+let Inst{7-5} = 0b001;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b00011001010;
+let hasNewValue = 1;
+let opNewValue = 0;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vmpyhss_128B : HInst<
+(outs VectorRegs128B:$Vd32),
+(ins VectorRegs128B:$Vu32, IntRegs:$Rt32),
+"$Vd32.h = vmpy($Vu32.h,$Rt32.h):<<1:sat",
+CVI_VX_DV, TypeCVI_VX_DV>, Enc_16214129, Requires<[HasV60T,UseHVX]> {
+let Inst{7-5} = 0b001;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b00011001010;
+let hasNewValue = 1;
+let opNewValue = 0;
+let DecoderNamespace = "EXT_mmvec";
+let isCodeGenOnly = 1;
+}
+def V6_vmpyhss_alt : HInst<
+(outs VectorRegs:$Vd32),
+(ins VectorRegs:$Vu32, IntRegs:$Rt32),
+"$Vd32 = vmpyh($Vu32,$Rt32):<<1:sat",
+PSEUDO, TypeMAPPING>, Requires<[HasV60T,UseHVX]> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vmpyhss_alt_128B : HInst<
+(outs VectorRegs128B:$Vd32),
+(ins VectorRegs128B:$Vu32, IntRegs:$Rt32),
+"$Vd32 = vmpyh($Vu32,$Rt32):<<1:sat",
+PSEUDO, TypeMAPPING>, Requires<[HasV60T,UseHVX]> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vmpyhus : HInst<
+(outs VecDblRegs:$Vdd32),
+(ins VectorRegs:$Vu32, VectorRegs:$Vv32),
+"$Vdd32.w = vmpy($Vu32.h,$Vv32.uh)",
+CVI_VX_DV, TypeCVI_VX_DV>, Enc_15290236, Requires<[HasV60T,UseHVX]> {
+let Inst{7-5} = 0b010;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b00011100001;
+let hasNewValue = 1;
+let opNewValue = 0;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vmpyhus_128B : HInst<
+(outs VecDblRegs128B:$Vdd32),
+(ins VectorRegs128B:$Vu32, VectorRegs128B:$Vv32),
+"$Vdd32.w = vmpy($Vu32.h,$Vv32.uh)",
+CVI_VX_DV, TypeCVI_VX_DV>, Enc_15290236, Requires<[HasV60T,UseHVX]> {
+let Inst{7-5} = 0b010;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b00011100001;
+let hasNewValue = 1;
+let opNewValue = 0;
+let DecoderNamespace = "EXT_mmvec";
+let isCodeGenOnly = 1;
+}
+def V6_vmpyhus_acc : HInst<
+(outs VecDblRegs:$Vxx32),
+(ins VecDblRegs:$Vxx32in, VectorRegs:$Vu32, VectorRegs:$Vv32),
+"$Vxx32.w += vmpy($Vu32.h,$Vv32.uh)",
+CVI_VX_DV, TypeCVI_VX_DV>, Enc_5972412, Requires<[HasV60T,UseHVX]> {
+let Inst{7-5} = 0b001;
+let Inst{13-13} = 0b1;
+let Inst{31-21} = 0b00011100001;
+let hasNewValue = 1;
+let opNewValue = 0;
+let isAccumulator = 1;
+let DecoderNamespace = "EXT_mmvec";
+let Constraints = "$Vxx32 = $Vxx32in";
+}
+def V6_vmpyhus_acc_128B : HInst<
+(outs VecDblRegs128B:$Vxx32),
+(ins VecDblRegs128B:$Vxx32in, VectorRegs128B:$Vu32, VectorRegs128B:$Vv32),
+"$Vxx32.w += vmpy($Vu32.h,$Vv32.uh)",
+CVI_VX_DV, TypeCVI_VX_DV>, Enc_5972412, Requires<[HasV60T,UseHVX]> {
+let Inst{7-5} = 0b001;
+let Inst{13-13} = 0b1;
+let Inst{31-21} = 0b00011100001;
+let hasNewValue = 1;
+let opNewValue = 0;
+let isAccumulator = 1;
+let DecoderNamespace = "EXT_mmvec";
+let isCodeGenOnly = 1;
+let Constraints = "$Vxx32 = $Vxx32in";
+}
+def V6_vmpyhus_acc_alt : HInst<
+(outs VecDblRegs:$Vxx32),
+(ins VecDblRegs:$Vxx32in, VectorRegs:$Vu32, VectorRegs:$Vv32),
+"$Vxx32 += vmpyhus($Vu32,$Vv32)",
+PSEUDO, TypeMAPPING>, Requires<[HasV60T,UseHVX]> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isAccumulator = 1;
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+let Constraints = "$Vxx32 = $Vxx32in";
+}
+def V6_vmpyhus_acc_alt_128B : HInst<
+(outs VecDblRegs128B:$Vxx32),
+(ins VecDblRegs128B:$Vxx32in, VectorRegs128B:$Vu32, VectorRegs128B:$Vv32),
+"$Vxx32 += vmpyhus($Vu32,$Vv32)",
+PSEUDO, TypeMAPPING>, Requires<[HasV60T,UseHVX]> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isAccumulator = 1;
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+let Constraints = "$Vxx32 = $Vxx32in";
+}
+def V6_vmpyhus_alt : HInst<
+(outs VecDblRegs:$Vdd32),
+(ins VectorRegs:$Vu32, VectorRegs:$Vv32),
+"$Vdd32 = vmpyhus($Vu32,$Vv32)",
+PSEUDO, TypeMAPPING>, Requires<[HasV60T,UseHVX]> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vmpyhus_alt_128B : HInst<
+(outs VecDblRegs128B:$Vdd32),
+(ins VectorRegs128B:$Vu32, VectorRegs128B:$Vv32),
+"$Vdd32 = vmpyhus($Vu32,$Vv32)",
+PSEUDO, TypeMAPPING>, Requires<[HasV60T,UseHVX]> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vmpyhv : HInst<
+(outs VecDblRegs:$Vdd32),
+(ins VectorRegs:$Vu32, VectorRegs:$Vv32),
+"$Vdd32.w = vmpy($Vu32.h,$Vv32.h)",
+CVI_VX_DV, TypeCVI_VX_DV>, Enc_15290236, Requires<[HasV60T,UseHVX]> {
+let Inst{7-5} = 0b111;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b00011100000;
+let hasNewValue = 1;
+let opNewValue = 0;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vmpyhv_128B : HInst<
+(outs VecDblRegs128B:$Vdd32),
+(ins VectorRegs128B:$Vu32, VectorRegs128B:$Vv32),
+"$Vdd32.w = vmpy($Vu32.h,$Vv32.h)",
+CVI_VX_DV, TypeCVI_VX_DV>, Enc_15290236, Requires<[HasV60T,UseHVX]> {
+let Inst{7-5} = 0b111;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b00011100000;
+let hasNewValue = 1;
+let opNewValue = 0;
+let DecoderNamespace = "EXT_mmvec";
+let isCodeGenOnly = 1;
+}
+def V6_vmpyhv_acc : HInst<
+(outs VecDblRegs:$Vxx32),
+(ins VecDblRegs:$Vxx32in, VectorRegs:$Vu32, VectorRegs:$Vv32),
+"$Vxx32.w += vmpy($Vu32.h,$Vv32.h)",
+CVI_VX_DV, TypeCVI_VX_DV>, Enc_5972412, Requires<[HasV60T,UseHVX]> {
+let Inst{7-5} = 0b111;
+let Inst{13-13} = 0b1;
+let Inst{31-21} = 0b00011100000;
+let hasNewValue = 1;
+let opNewValue = 0;
+let isAccumulator = 1;
+let DecoderNamespace = "EXT_mmvec";
+let Constraints = "$Vxx32 = $Vxx32in";
+}
+def V6_vmpyhv_acc_128B : HInst<
+(outs VecDblRegs128B:$Vxx32),
+(ins VecDblRegs128B:$Vxx32in, VectorRegs128B:$Vu32, VectorRegs128B:$Vv32),
+"$Vxx32.w += vmpy($Vu32.h,$Vv32.h)",
+CVI_VX_DV, TypeCVI_VX_DV>, Enc_5972412, Requires<[HasV60T,UseHVX]> {
+let Inst{7-5} = 0b111;
+let Inst{13-13} = 0b1;
+let Inst{31-21} = 0b00011100000;
+let hasNewValue = 1;
+let opNewValue = 0;
+let isAccumulator = 1;
+let DecoderNamespace = "EXT_mmvec";
+let isCodeGenOnly = 1;
+let Constraints = "$Vxx32 = $Vxx32in";
+}
+def V6_vmpyhv_acc_alt : HInst<
+(outs VecDblRegs:$Vxx32),
+(ins VecDblRegs:$Vxx32in, VectorRegs:$Vu32, VectorRegs:$Vv32),
+"$Vxx32 += vmpyh($Vu32,$Vv32)",
+PSEUDO, TypeMAPPING>, Requires<[HasV60T,UseHVX]> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isAccumulator = 1;
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+let Constraints = "$Vxx32 = $Vxx32in";
+}
+def V6_vmpyhv_acc_alt_128B : HInst<
+(outs VecDblRegs128B:$Vxx32),
+(ins VecDblRegs128B:$Vxx32in, VectorRegs128B:$Vu32, VectorRegs128B:$Vv32),
+"$Vxx32 += vmpyh($Vu32,$Vv32)",
+PSEUDO, TypeMAPPING>, Requires<[HasV60T,UseHVX]> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isAccumulator = 1;
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+let Constraints = "$Vxx32 = $Vxx32in";
+}
+def V6_vmpyhv_alt : HInst<
+(outs VecDblRegs:$Vdd32),
+(ins VectorRegs:$Vu32, VectorRegs:$Vv32),
+"$Vdd32 = vmpyh($Vu32,$Vv32)",
+PSEUDO, TypeMAPPING>, Requires<[HasV60T,UseHVX]> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vmpyhv_alt_128B : HInst<
+(outs VecDblRegs128B:$Vdd32),
+(ins VectorRegs128B:$Vu32, VectorRegs128B:$Vv32),
+"$Vdd32 = vmpyh($Vu32,$Vv32)",
+PSEUDO, TypeMAPPING>, Requires<[HasV60T,UseHVX]> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vmpyhvsrs : HInst<
+(outs VectorRegs:$Vd32),
+(ins VectorRegs:$Vu32, VectorRegs:$Vv32),
+"$Vd32.h = vmpy($Vu32.h,$Vv32.h):<<1:rnd:sat",
+CVI_VX_DV, TypeCVI_VX_DV>, Enc_6223403, Requires<[HasV60T,UseHVX]> {
+let Inst{7-5} = 0b001;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b00011100001;
+let hasNewValue = 1;
+let opNewValue = 0;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vmpyhvsrs_128B : HInst<
+(outs VectorRegs128B:$Vd32),
+(ins VectorRegs128B:$Vu32, VectorRegs128B:$Vv32),
+"$Vd32.h = vmpy($Vu32.h,$Vv32.h):<<1:rnd:sat",
+CVI_VX_DV, TypeCVI_VX_DV>, Enc_6223403, Requires<[HasV60T,UseHVX]> {
+let Inst{7-5} = 0b001;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b00011100001;
+let hasNewValue = 1;
+let opNewValue = 0;
+let DecoderNamespace = "EXT_mmvec";
+let isCodeGenOnly = 1;
+}
+def V6_vmpyhvsrs_alt : HInst<
+(outs VectorRegs:$Vd32),
+(ins VectorRegs:$Vu32, VectorRegs:$Vv32),
+"$Vd32 = vmpyh($Vu32,$Vv32):<<1:rnd:sat",
+PSEUDO, TypeMAPPING>, Requires<[HasV60T,UseHVX]> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vmpyhvsrs_alt_128B : HInst<
+(outs VectorRegs128B:$Vd32),
+(ins VectorRegs128B:$Vu32, VectorRegs128B:$Vv32),
+"$Vd32 = vmpyh($Vu32,$Vv32):<<1:rnd:sat",
+PSEUDO, TypeMAPPING>, Requires<[HasV60T,UseHVX]> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+}
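+// The vmpyi*/vmpyie* defs below produce single-vector results (VectorRegs in
+// and out) rather than register pairs, unlike the widening vmpy* forms above.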
+def V6_vmpyieoh : HInst<
+(outs VectorRegs:$Vd32),
+(ins VectorRegs:$Vu32, VectorRegs:$Vv32),
+"$Vd32.w = vmpyieo($Vu32.h,$Vv32.h)",
+CVI_VX, TypeCVI_VX>, Enc_6223403, Requires<[HasV60T,UseHVX]> {
+let Inst{7-5} = 0b000;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b00011111011;
+let hasNewValue = 1;
+let opNewValue = 0;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vmpyieoh_128B : HInst<
+(outs VectorRegs128B:$Vd32),
+(ins VectorRegs128B:$Vu32, VectorRegs128B:$Vv32),
+"$Vd32.w = vmpyieo($Vu32.h,$Vv32.h)",
+CVI_VX, TypeCVI_VX>, Enc_6223403, Requires<[HasV60T,UseHVX]> {
+let Inst{7-5} = 0b000;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b00011111011;
+let hasNewValue = 1;
+let opNewValue = 0;
+let DecoderNamespace = "EXT_mmvec";
+let isCodeGenOnly = 1;
+}
+def V6_vmpyiewh_acc : HInst<
+(outs VectorRegs:$Vx32),
+(ins VectorRegs:$Vx32in, VectorRegs:$Vu32, VectorRegs:$Vv32),
+"$Vx32.w += vmpyie($Vu32.w,$Vv32.h)",
+CVI_VX_DV, TypeCVI_VX_DV>, Enc_2328527, Requires<[HasV60T,UseHVX]> {
+let Inst{7-5} = 0b000;
+let Inst{13-13} = 0b1;
+let Inst{31-21} = 0b00011100010;
+let hasNewValue = 1;
+let opNewValue = 0;
+let isAccumulator = 1;
+let DecoderNamespace = "EXT_mmvec";
+let Constraints = "$Vx32 = $Vx32in";
+}
+def V6_vmpyiewh_acc_128B : HInst<
+(outs VectorRegs128B:$Vx32),
+(ins VectorRegs128B:$Vx32in, VectorRegs128B:$Vu32, VectorRegs128B:$Vv32),
+"$Vx32.w += vmpyie($Vu32.w,$Vv32.h)",
+CVI_VX_DV, TypeCVI_VX_DV>, Enc_2328527, Requires<[HasV60T,UseHVX]> {
+let Inst{7-5} = 0b000;
+let Inst{13-13} = 0b1;
+let Inst{31-21} = 0b00011100010;
+let hasNewValue = 1;
+let opNewValue = 0;
+let isAccumulator = 1;
+let DecoderNamespace = "EXT_mmvec";
+let isCodeGenOnly = 1;
+let Constraints = "$Vx32 = $Vx32in";
+}
+def V6_vmpyiewh_acc_alt : HInst<
+(outs VectorRegs:$Vx32),
+(ins VectorRegs:$Vx32in, VectorRegs:$Vu32, VectorRegs:$Vv32),
+"$Vx32 += vmpyiewh($Vu32,$Vv32)",
+PSEUDO, TypeMAPPING>, Requires<[HasV60T,UseHVX]> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isAccumulator = 1;
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+let Constraints = "$Vx32 = $Vx32in";
+}
+def V6_vmpyiewh_acc_alt_128B : HInst<
+(outs VectorRegs128B:$Vx32),
+(ins VectorRegs128B:$Vx32in, VectorRegs128B:$Vu32, VectorRegs128B:$Vv32),
+"$Vx32 += vmpyiewh($Vu32,$Vv32)",
+PSEUDO, TypeMAPPING>, Requires<[HasV60T,UseHVX]> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isAccumulator = 1;
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+let Constraints = "$Vx32 = $Vx32in";
+}
+def V6_vmpyiewuh : HInst<
+(outs VectorRegs:$Vd32),
+(ins VectorRegs:$Vu32, VectorRegs:$Vv32),
+"$Vd32.w = vmpyie($Vu32.w,$Vv32.uh)",
+CVI_VX_DV, TypeCVI_VX_DV>, Enc_6223403, Requires<[HasV60T,UseHVX]> {
+let Inst{7-5} = 0b000;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b00011111110;
+let hasNewValue = 1;
+let opNewValue = 0;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vmpyiewuh_128B : HInst<
+(outs VectorRegs128B:$Vd32),
+(ins VectorRegs128B:$Vu32, VectorRegs128B:$Vv32),
+"$Vd32.w = vmpyie($Vu32.w,$Vv32.uh)",
+CVI_VX_DV, TypeCVI_VX_DV>, Enc_6223403, Requires<[HasV60T,UseHVX]> {
+let Inst{7-5} = 0b000;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b00011111110;
+let hasNewValue = 1;
+let opNewValue = 0;
+let DecoderNamespace = "EXT_mmvec";
+let isCodeGenOnly = 1;
+}
+def V6_vmpyiewuh_acc : HInst<
+(outs VectorRegs:$Vx32),
+(ins VectorRegs:$Vx32in, VectorRegs:$Vu32, VectorRegs:$Vv32),
+"$Vx32.w += vmpyie($Vu32.w,$Vv32.uh)",
+CVI_VX_DV, TypeCVI_VX_DV>, Enc_2328527, Requires<[HasV60T,UseHVX]> {
+let Inst{7-5} = 0b101;
+let Inst{13-13} = 0b1;
+let Inst{31-21} = 0b00011100001;
+let hasNewValue = 1;
+let opNewValue = 0;
+let isAccumulator = 1;
+let DecoderNamespace = "EXT_mmvec";
+let Constraints = "$Vx32 = $Vx32in";
+}
+def V6_vmpyiewuh_acc_128B : HInst<
+(outs VectorRegs128B:$Vx32),
+(ins VectorRegs128B:$Vx32in, VectorRegs128B:$Vu32, VectorRegs128B:$Vv32),
+"$Vx32.w += vmpyie($Vu32.w,$Vv32.uh)",
+CVI_VX_DV, TypeCVI_VX_DV>, Enc_2328527, Requires<[HasV60T,UseHVX]> {
+let Inst{7-5} = 0b101;
+let Inst{13-13} = 0b1;
+let Inst{31-21} = 0b00011100001;
+let hasNewValue = 1;
+let opNewValue = 0;
+let isAccumulator = 1;
+let DecoderNamespace = "EXT_mmvec";
+let isCodeGenOnly = 1;
+let Constraints = "$Vx32 = $Vx32in";
+}
+def V6_vmpyiewuh_acc_alt : HInst<
+(outs VectorRegs:$Vx32),
+(ins VectorRegs:$Vx32in, VectorRegs:$Vu32, VectorRegs:$Vv32),
+"$Vx32 += vmpyiewuh($Vu32,$Vv32)",
+PSEUDO, TypeMAPPING>, Requires<[HasV60T,UseHVX]> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isAccumulator = 1;
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+let Constraints = "$Vx32 = $Vx32in";
+}
+def V6_vmpyiewuh_acc_alt_128B : HInst<
+(outs VectorRegs128B:$Vx32),
+(ins VectorRegs128B:$Vx32in, VectorRegs128B:$Vu32, VectorRegs128B:$Vv32),
+"$Vx32 += vmpyiewuh($Vu32,$Vv32)",
+PSEUDO, TypeMAPPING>, Requires<[HasV60T,UseHVX]> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isAccumulator = 1;
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+let Constraints = "$Vx32 = $Vx32in";
+}
+def V6_vmpyiewuh_alt : HInst<
+(outs VectorRegs:$Vd32),
+(ins VectorRegs:$Vu32, VectorRegs:$Vv32),
+"$Vd32 = vmpyiewuh($Vu32,$Vv32)",
+PSEUDO, TypeMAPPING>, Requires<[HasV60T,UseHVX]> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vmpyiewuh_alt_128B : HInst<
+(outs VectorRegs128B:$Vd32),
+(ins VectorRegs128B:$Vu32, VectorRegs128B:$Vv32),
+"$Vd32 = vmpyiewuh($Vu32,$Vv32)",
+PSEUDO, TypeMAPPING>, Requires<[HasV60T,UseHVX]> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vmpyih : HInst<
+(outs VectorRegs:$Vd32),
+(ins VectorRegs:$Vu32, VectorRegs:$Vv32),
+"$Vd32.h = vmpyi($Vu32.h,$Vv32.h)",
+CVI_VX_DV, TypeCVI_VX_DV>, Enc_6223403, Requires<[HasV60T,UseHVX]> {
+let Inst{7-5} = 0b100;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b00011100001;
+let hasNewValue = 1;
+let opNewValue = 0;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vmpyih_128B : HInst<
+(outs VectorRegs128B:$Vd32),
+(ins VectorRegs128B:$Vu32, VectorRegs128B:$Vv32),
+"$Vd32.h = vmpyi($Vu32.h,$Vv32.h)",
+CVI_VX_DV, TypeCVI_VX_DV>, Enc_6223403, Requires<[HasV60T,UseHVX]> {
+let Inst{7-5} = 0b100;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b00011100001;
+let hasNewValue = 1;
+let opNewValue = 0;
+let DecoderNamespace = "EXT_mmvec";
+let isCodeGenOnly = 1;
+}
+def V6_vmpyih_acc : HInst<
+(outs VectorRegs:$Vx32),
+(ins VectorRegs:$Vx32in, VectorRegs:$Vu32, VectorRegs:$Vv32),
+"$Vx32.h += vmpyi($Vu32.h,$Vv32.h)",
+CVI_VX_DV, TypeCVI_VX_DV>, Enc_2328527, Requires<[HasV60T,UseHVX]> {
+let Inst{7-5} = 0b100;
+let Inst{13-13} = 0b1;
+let Inst{31-21} = 0b00011100001;
+let hasNewValue = 1;
+let opNewValue = 0;
+let isAccumulator = 1;
+let DecoderNamespace = "EXT_mmvec";
+let Constraints = "$Vx32 = $Vx32in";
+}
+def V6_vmpyih_acc_128B : HInst<
+(outs VectorRegs128B:$Vx32),
+(ins VectorRegs128B:$Vx32in, VectorRegs128B:$Vu32, VectorRegs128B:$Vv32),
+"$Vx32.h += vmpyi($Vu32.h,$Vv32.h)",
+CVI_VX_DV, TypeCVI_VX_DV>, Enc_2328527, Requires<[HasV60T,UseHVX]> {
+let Inst{7-5} = 0b100;
+let Inst{13-13} = 0b1;
+let Inst{31-21} = 0b00011100001;
+let hasNewValue = 1;
+let opNewValue = 0;
+let isAccumulator = 1;
+let DecoderNamespace = "EXT_mmvec";
+let isCodeGenOnly = 1;
+let Constraints = "$Vx32 = $Vx32in";
+}
+def V6_vmpyih_acc_alt : HInst<
+(outs VectorRegs:$Vx32),
+(ins VectorRegs:$Vx32in, VectorRegs:$Vu32, VectorRegs:$Vv32),
+"$Vx32 += vmpyih($Vu32,$Vv32)",
+PSEUDO, TypeMAPPING>, Requires<[HasV60T,UseHVX]> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isAccumulator = 1;
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+let Constraints = "$Vx32 = $Vx32in";
+}
+def V6_vmpyih_acc_alt_128B : HInst<
+(outs VectorRegs128B:$Vx32),
+(ins VectorRegs128B:$Vx32in, VectorRegs128B:$Vu32, VectorRegs128B:$Vv32),
+"$Vx32 += vmpyih($Vu32,$Vv32)",
+PSEUDO, TypeMAPPING>, Requires<[HasV60T,UseHVX]> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isAccumulator = 1;
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+let Constraints = "$Vx32 = $Vx32in";
+}
+def V6_vmpyih_alt : HInst<
+(outs VectorRegs:$Vd32),
+(ins VectorRegs:$Vu32, VectorRegs:$Vv32),
+"$Vd32 = vmpyih($Vu32,$Vv32)",
+PSEUDO, TypeMAPPING>, Requires<[HasV60T,UseHVX]> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vmpyih_alt_128B : HInst<
+(outs VectorRegs128B:$Vd32),
+(ins VectorRegs128B:$Vu32, VectorRegs128B:$Vv32),
+"$Vd32 = vmpyih($Vu32,$Vv32)",
+PSEUDO, TypeMAPPING>, Requires<[HasV60T,UseHVX]> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+}
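+// V6_vmpyihb: halfword lanes of $Vu32 multiplied by a scalar byte taken
+// from $Rt32, in plain and accumulating forms.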
+def V6_vmpyihb : HInst<
+(outs VectorRegs:$Vd32),
+(ins VectorRegs:$Vu32, IntRegs:$Rt32),
+"$Vd32.h = vmpyi($Vu32.h,$Rt32.b)",
+CVI_VX_LONG, TypeCVI_VX>, Enc_16214129, Requires<[HasV60T,UseHVX]> {
+let Inst{7-5} = 0b000;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b00011001011;
+let hasNewValue = 1;
+let opNewValue = 0;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vmpyihb_128B : HInst<
+(outs VectorRegs128B:$Vd32),
+(ins VectorRegs128B:$Vu32, IntRegs:$Rt32),
+"$Vd32.h = vmpyi($Vu32.h,$Rt32.b)",
+CVI_VX_LONG, TypeCVI_VX>, Enc_16214129, Requires<[HasV60T,UseHVX]> {
+let Inst{7-5} = 0b000;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b00011001011;
+let hasNewValue = 1;
+let opNewValue = 0;
+let DecoderNamespace = "EXT_mmvec";
+let isCodeGenOnly = 1;
+}
+def V6_vmpyihb_acc : HInst<
+(outs VectorRegs:$Vx32),
+(ins VectorRegs:$Vx32in, VectorRegs:$Vu32, IntRegs:$Rt32),
+"$Vx32.h += vmpyi($Vu32.h,$Rt32.b)",
+CVI_VX, TypeCVI_VX>, Enc_10058269, Requires<[HasV60T,UseHVX]> {
+let Inst{7-5} = 0b001;
+let Inst{13-13} = 0b1;
+let Inst{31-21} = 0b00011001011;
+let hasNewValue = 1;
+let opNewValue = 0;
+let isAccumulator = 1;
+let DecoderNamespace = "EXT_mmvec";
+let Constraints = "$Vx32 = $Vx32in";
+}
+def V6_vmpyihb_acc_128B : HInst<
+(outs VectorRegs128B:$Vx32),
+(ins VectorRegs128B:$Vx32in, VectorRegs128B:$Vu32, IntRegs:$Rt32),
+"$Vx32.h += vmpyi($Vu32.h,$Rt32.b)",
+CVI_VX, TypeCVI_VX>, Enc_10058269, Requires<[HasV60T,UseHVX]> {
+let Inst{7-5} = 0b001;
+let Inst{13-13} = 0b1;
+let Inst{31-21} = 0b00011001011;
+let hasNewValue = 1;
+let opNewValue = 0;
+let isAccumulator = 1;
+let DecoderNamespace = "EXT_mmvec";
+let isCodeGenOnly = 1;
+let Constraints = "$Vx32 = $Vx32in";
+}
+def V6_vmpyihb_acc_alt : HInst<
+(outs VectorRegs:$Vx32),
+(ins VectorRegs:$Vx32in, VectorRegs:$Vu32, IntRegs:$Rt32),
+"$Vx32 += vmpyihb($Vu32,$Rt32)",
+PSEUDO, TypeMAPPING>, Requires<[HasV60T,UseHVX]> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isAccumulator = 1;
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+let Constraints = "$Vx32 = $Vx32in";
+}
+def V6_vmpyihb_acc_alt_128B : HInst<
+(outs VectorRegs128B:$Vx32),
+(ins VectorRegs128B:$Vx32in, VectorRegs128B:$Vu32, IntRegs:$Rt32),
+"$Vx32 += vmpyihb($Vu32,$Rt32)",
+PSEUDO, TypeMAPPING>, Requires<[HasV60T,UseHVX]> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isAccumulator = 1;
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+let Constraints = "$Vx32 = $Vx32in";
+}
+def V6_vmpyihb_alt : HInst<
+(outs VectorRegs:$Vd32),
+(ins VectorRegs:$Vu32, IntRegs:$Rt32),
+"$Vd32 = vmpyihb($Vu32,$Rt32)",
+PSEUDO, TypeMAPPING>, Requires<[HasV60T,UseHVX]> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vmpyihb_alt_128B : HInst<
+(outs VectorRegs128B:$Vd32),
+(ins VectorRegs128B:$Vu32, IntRegs:$Rt32),
+"$Vd32 = vmpyihb($Vu32,$Rt32)",
+PSEUDO, TypeMAPPING>, Requires<[HasV60T,UseHVX]> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+}
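+// V6_vmpyiowh: word lanes of $Vu32 multiplied by the odd halfwords of the
+// corresponding $Vv32 words (the even-halfword counterpart, vmpyiewuh,
+// appears above).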
+def V6_vmpyiowh : HInst<
+(outs VectorRegs:$Vd32),
+(ins VectorRegs:$Vu32, VectorRegs:$Vv32),
+"$Vd32.w = vmpyio($Vu32.w,$Vv32.h)",
+CVI_VX_DV, TypeCVI_VX_DV>, Enc_6223403, Requires<[HasV60T,UseHVX]> {
+let Inst{7-5} = 0b001;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b00011111110;
+let hasNewValue = 1;
+let opNewValue = 0;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vmpyiowh_128B : HInst<
+(outs VectorRegs128B:$Vd32),
+(ins VectorRegs128B:$Vu32, VectorRegs128B:$Vv32),
+"$Vd32.w = vmpyio($Vu32.w,$Vv32.h)",
+CVI_VX_DV, TypeCVI_VX_DV>, Enc_6223403, Requires<[HasV60T,UseHVX]> {
+let Inst{7-5} = 0b001;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b00011111110;
+let hasNewValue = 1;
+let opNewValue = 0;
+let DecoderNamespace = "EXT_mmvec";
+let isCodeGenOnly = 1;
+}
+def V6_vmpyiowh_alt : HInst<
+(outs VectorRegs:$Vd32),
+(ins VectorRegs:$Vu32, VectorRegs:$Vv32),
+"$Vd32 = vmpyiowh($Vu32,$Vv32)",
+PSEUDO, TypeMAPPING>, Requires<[HasV60T,UseHVX]> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vmpyiowh_alt_128B : HInst<
+(outs VectorRegs128B:$Vd32),
+(ins VectorRegs128B:$Vu32, VectorRegs128B:$Vv32),
+"$Vd32 = vmpyiowh($Vu32,$Vv32)",
+PSEUDO, TypeMAPPING>, Requires<[HasV60T,UseHVX]> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+}
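+// V6_vmpyiwb / V6_vmpyiwh: word lanes multiplied by a scalar byte or
+// halfword from $Rt32.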
+def V6_vmpyiwb : HInst<
+(outs VectorRegs:$Vd32),
+(ins VectorRegs:$Vu32, IntRegs:$Rt32),
+"$Vd32.w = vmpyi($Vu32.w,$Rt32.b)",
+CVI_VX, TypeCVI_VX>, Enc_16214129, Requires<[HasV60T,UseHVX]> {
+let Inst{7-5} = 0b000;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b00011001101;
+let hasNewValue = 1;
+let opNewValue = 0;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vmpyiwb_128B : HInst<
+(outs VectorRegs128B:$Vd32),
+(ins VectorRegs128B:$Vu32, IntRegs:$Rt32),
+"$Vd32.w = vmpyi($Vu32.w,$Rt32.b)",
+CVI_VX, TypeCVI_VX>, Enc_16214129, Requires<[HasV60T,UseHVX]> {
+let Inst{7-5} = 0b000;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b00011001101;
+let hasNewValue = 1;
+let opNewValue = 0;
+let DecoderNamespace = "EXT_mmvec";
+let isCodeGenOnly = 1;
+}
+def V6_vmpyiwb_acc : HInst<
+(outs VectorRegs:$Vx32),
+(ins VectorRegs:$Vx32in, VectorRegs:$Vu32, IntRegs:$Rt32),
+"$Vx32.w += vmpyi($Vu32.w,$Rt32.b)",
+CVI_VX, TypeCVI_VX>, Enc_10058269, Requires<[HasV60T,UseHVX]> {
+let Inst{7-5} = 0b010;
+let Inst{13-13} = 0b1;
+let Inst{31-21} = 0b00011001010;
+let hasNewValue = 1;
+let opNewValue = 0;
+let isAccumulator = 1;
+let DecoderNamespace = "EXT_mmvec";
+let Constraints = "$Vx32 = $Vx32in";
+}
+def V6_vmpyiwb_acc_128B : HInst<
+(outs VectorRegs128B:$Vx32),
+(ins VectorRegs128B:$Vx32in, VectorRegs128B:$Vu32, IntRegs:$Rt32),
+"$Vx32.w += vmpyi($Vu32.w,$Rt32.b)",
+CVI_VX, TypeCVI_VX>, Enc_10058269, Requires<[HasV60T,UseHVX]> {
+let Inst{7-5} = 0b010;
+let Inst{13-13} = 0b1;
+let Inst{31-21} = 0b00011001010;
+let hasNewValue = 1;
+let opNewValue = 0;
+let isAccumulator = 1;
+let DecoderNamespace = "EXT_mmvec";
+let isCodeGenOnly = 1;
+let Constraints = "$Vx32 = $Vx32in";
+}
+def V6_vmpyiwb_acc_alt : HInst<
+(outs VectorRegs:$Vx32),
+(ins VectorRegs:$Vx32in, VectorRegs:$Vu32, IntRegs:$Rt32),
+"$Vx32 += vmpyiwb($Vu32,$Rt32)",
+PSEUDO, TypeMAPPING>, Requires<[HasV60T,UseHVX]> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isAccumulator = 1;
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+let Constraints = "$Vx32 = $Vx32in";
+}
+def V6_vmpyiwb_acc_alt_128B : HInst<
+(outs VectorRegs128B:$Vx32),
+(ins VectorRegs128B:$Vx32in, VectorRegs128B:$Vu32, IntRegs:$Rt32),
+"$Vx32 += vmpyiwb($Vu32,$Rt32)",
+PSEUDO, TypeMAPPING>, Requires<[HasV60T,UseHVX]> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isAccumulator = 1;
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+let Constraints = "$Vx32 = $Vx32in";
+}
+def V6_vmpyiwb_alt : HInst<
+(outs VectorRegs:$Vd32),
+(ins VectorRegs:$Vu32, IntRegs:$Rt32),
+"$Vd32 = vmpyiwb($Vu32,$Rt32)",
+PSEUDO, TypeMAPPING>, Requires<[HasV60T,UseHVX]> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vmpyiwb_alt_128B : HInst<
+(outs VectorRegs128B:$Vd32),
+(ins VectorRegs128B:$Vu32, IntRegs:$Rt32),
+"$Vd32 = vmpyiwb($Vu32,$Rt32)",
+PSEUDO, TypeMAPPING>, Requires<[HasV60T,UseHVX]> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vmpyiwh : HInst<
+(outs VectorRegs:$Vd32),
+(ins VectorRegs:$Vu32, IntRegs:$Rt32),
+"$Vd32.w = vmpyi($Vu32.w,$Rt32.h)",
+CVI_VX_DV, TypeCVI_VX_DV>, Enc_16214129, Requires<[HasV60T,UseHVX]> {
+let Inst{7-5} = 0b111;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b00011001100;
+let hasNewValue = 1;
+let opNewValue = 0;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vmpyiwh_128B : HInst<
+(outs VectorRegs128B:$Vd32),
+(ins VectorRegs128B:$Vu32, IntRegs:$Rt32),
+"$Vd32.w = vmpyi($Vu32.w,$Rt32.h)",
+CVI_VX_DV, TypeCVI_VX_DV>, Enc_16214129, Requires<[HasV60T,UseHVX]> {
+let Inst{7-5} = 0b111;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b00011001100;
+let hasNewValue = 1;
+let opNewValue = 0;
+let DecoderNamespace = "EXT_mmvec";
+let isCodeGenOnly = 1;
+}
+def V6_vmpyiwh_acc : HInst<
+(outs VectorRegs:$Vx32),
+(ins VectorRegs:$Vx32in, VectorRegs:$Vu32, IntRegs:$Rt32),
+"$Vx32.w += vmpyi($Vu32.w,$Rt32.h)",
+CVI_VX_DV, TypeCVI_VX_DV>, Enc_10058269, Requires<[HasV60T,UseHVX]> {
+let Inst{7-5} = 0b011;
+let Inst{13-13} = 0b1;
+let Inst{31-21} = 0b00011001010;
+let hasNewValue = 1;
+let opNewValue = 0;
+let isAccumulator = 1;
+let DecoderNamespace = "EXT_mmvec";
+let Constraints = "$Vx32 = $Vx32in";
+}
+def V6_vmpyiwh_acc_128B : HInst<
+(outs VectorRegs128B:$Vx32),
+(ins VectorRegs128B:$Vx32in, VectorRegs128B:$Vu32, IntRegs:$Rt32),
+"$Vx32.w += vmpyi($Vu32.w,$Rt32.h)",
+CVI_VX_DV, TypeCVI_VX_DV>, Enc_10058269, Requires<[HasV60T,UseHVX]> {
+let Inst{7-5} = 0b011;
+let Inst{13-13} = 0b1;
+let Inst{31-21} = 0b00011001010;
+let hasNewValue = 1;
+let opNewValue = 0;
+let isAccumulator = 1;
+let DecoderNamespace = "EXT_mmvec";
+let isCodeGenOnly = 1;
+let Constraints = "$Vx32 = $Vx32in";
+}
+def V6_vmpyiwh_acc_alt : HInst<
+(outs VectorRegs:$Vx32),
+(ins VectorRegs:$Vx32in, VectorRegs:$Vu32, IntRegs:$Rt32),
+"$Vx32 += vmpyiwh($Vu32,$Rt32)",
+PSEUDO, TypeMAPPING>, Requires<[HasV60T,UseHVX]> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isAccumulator = 1;
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+let Constraints = "$Vx32 = $Vx32in";
+}
+def V6_vmpyiwh_acc_alt_128B : HInst<
+(outs VectorRegs128B:$Vx32),
+(ins VectorRegs128B:$Vx32in, VectorRegs128B:$Vu32, IntRegs:$Rt32),
+"$Vx32 += vmpyiwh($Vu32,$Rt32)",
+PSEUDO, TypeMAPPING>, Requires<[HasV60T,UseHVX]> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isAccumulator = 1;
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+let Constraints = "$Vx32 = $Vx32in";
+}
+def V6_vmpyiwh_alt : HInst<
+(outs VectorRegs:$Vd32),
+(ins VectorRegs:$Vu32, IntRegs:$Rt32),
+"$Vd32 = vmpyiwh($Vu32,$Rt32)",
+PSEUDO, TypeMAPPING>, Requires<[HasV60T,UseHVX]> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vmpyiwh_alt_128B : HInst<
+(outs VectorRegs128B:$Vd32),
+(ins VectorRegs128B:$Vu32, IntRegs:$Rt32),
+"$Vd32 = vmpyiwh($Vu32,$Rt32)",
+PSEUDO, TypeMAPPING>, Requires<[HasV60T,UseHVX]> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+}
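+// V6_vmpyiwub: word lanes multiplied by a scalar unsigned byte. Unlike the
+// neighbouring records, this group requires HasV62T.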
+def V6_vmpyiwub : HInst<
+(outs VectorRegs:$Vd32),
+(ins VectorRegs:$Vu32, IntRegs:$Rt32),
+"$Vd32.w = vmpyi($Vu32.w,$Rt32.ub)",
+CVI_VX_LONG, TypeCVI_VX>, Enc_16214129, Requires<[HasV62T,UseHVX]> {
+let Inst{7-5} = 0b110;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b00011001100;
+let hasNewValue = 1;
+let opNewValue = 0;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vmpyiwub_128B : HInst<
+(outs VectorRegs128B:$Vd32),
+(ins VectorRegs128B:$Vu32, IntRegs:$Rt32),
+"$Vd32.w = vmpyi($Vu32.w,$Rt32.ub)",
+CVI_VX_LONG, TypeCVI_VX>, Enc_16214129, Requires<[HasV62T,UseHVX]> {
+let Inst{7-5} = 0b110;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b00011001100;
+let hasNewValue = 1;
+let opNewValue = 0;
+let DecoderNamespace = "EXT_mmvec";
+let isCodeGenOnly = 1;
+}
+def V6_vmpyiwub_acc : HInst<
+(outs VectorRegs:$Vx32),
+(ins VectorRegs:$Vx32in, VectorRegs:$Vu32, IntRegs:$Rt32),
+"$Vx32.w += vmpyi($Vu32.w,$Rt32.ub)",
+CVI_VX_LONG, TypeCVI_VX>, Enc_10058269, Requires<[HasV62T,UseHVX]> {
+let Inst{7-5} = 0b001;
+let Inst{13-13} = 0b1;
+let Inst{31-21} = 0b00011001100;
+let hasNewValue = 1;
+let opNewValue = 0;
+let isAccumulator = 1;
+let DecoderNamespace = "EXT_mmvec";
+let Constraints = "$Vx32 = $Vx32in";
+}
+def V6_vmpyiwub_acc_128B : HInst<
+(outs VectorRegs128B:$Vx32),
+(ins VectorRegs128B:$Vx32in, VectorRegs128B:$Vu32, IntRegs:$Rt32),
+"$Vx32.w += vmpyi($Vu32.w,$Rt32.ub)",
+CVI_VX_LONG, TypeCVI_VX>, Enc_10058269, Requires<[HasV62T,UseHVX]> {
+let Inst{7-5} = 0b001;
+let Inst{13-13} = 0b1;
+let Inst{31-21} = 0b00011001100;
+let hasNewValue = 1;
+let opNewValue = 0;
+let isAccumulator = 1;
+let DecoderNamespace = "EXT_mmvec";
+let isCodeGenOnly = 1;
+let Constraints = "$Vx32 = $Vx32in";
+}
+def V6_vmpyiwub_acc_alt : HInst<
+(outs VectorRegs:$Vx32),
+(ins VectorRegs:$Vx32in, VectorRegs:$Vu32, IntRegs:$Rt32),
+"$Vx32 += vmpyiwub($Vu32,$Rt32)",
+PSEUDO, TypeMAPPING>, Requires<[HasV62T,UseHVX]> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isAccumulator = 1;
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+let Constraints = "$Vx32 = $Vx32in";
+}
+def V6_vmpyiwub_acc_alt_128B : HInst<
+(outs VectorRegs128B:$Vx32),
+(ins VectorRegs128B:$Vx32in, VectorRegs128B:$Vu32, IntRegs:$Rt32),
+"$Vx32 += vmpyiwub($Vu32,$Rt32)",
+PSEUDO, TypeMAPPING>, Requires<[HasV62T,UseHVX]> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isAccumulator = 1;
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+let Constraints = "$Vx32 = $Vx32in";
+}
+def V6_vmpyiwub_alt : HInst<
+(outs VectorRegs:$Vd32),
+(ins VectorRegs:$Vu32, IntRegs:$Rt32),
+"$Vd32 = vmpyiwub($Vu32,$Rt32)",
+PSEUDO, TypeMAPPING>, Requires<[HasV62T,UseHVX]> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vmpyiwub_alt_128B : HInst<
+(outs VectorRegs128B:$Vd32),
+(ins VectorRegs128B:$Vu32, IntRegs:$Rt32),
+"$Vd32 = vmpyiwub($Vu32,$Rt32)",
+PSEUDO, TypeMAPPING>, Requires<[HasV62T,UseHVX]> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+}
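+// V6_vmpyowh and variants: word-by-odd-halfword multiply with <<1 scaling
+// and saturation; _rnd adds rounding, _sacc accumulates with a :shift
+// modifier, and _64_acc accumulates into a double vector register pair.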
+def V6_vmpyowh : HInst<
+(outs VectorRegs:$Vd32),
+(ins VectorRegs:$Vu32, VectorRegs:$Vv32),
+"$Vd32.w = vmpyo($Vu32.w,$Vv32.h):<<1:sat",
+CVI_VX_DV, TypeCVI_VX_DV>, Enc_6223403, Requires<[HasV60T,UseHVX]> {
+let Inst{7-5} = 0b111;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b00011111111;
+let hasNewValue = 1;
+let opNewValue = 0;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vmpyowh_128B : HInst<
+(outs VectorRegs128B:$Vd32),
+(ins VectorRegs128B:$Vu32, VectorRegs128B:$Vv32),
+"$Vd32.w = vmpyo($Vu32.w,$Vv32.h):<<1:sat",
+CVI_VX_DV, TypeCVI_VX_DV>, Enc_6223403, Requires<[HasV60T,UseHVX]> {
+let Inst{7-5} = 0b111;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b00011111111;
+let hasNewValue = 1;
+let opNewValue = 0;
+let DecoderNamespace = "EXT_mmvec";
+let isCodeGenOnly = 1;
+}
+def V6_vmpyowh_64_acc : HInst<
+(outs VecDblRegs:$Vxx32),
+(ins VecDblRegs:$Vxx32in, VectorRegs:$Vu32, VectorRegs:$Vv32),
+"$Vxx32 += vmpyo($Vu32.w,$Vv32.h)",
+CVI_VX_DV_LONG, TypeCVI_VX_DV>, Enc_5972412, Requires<[HasV62T,UseHVX]> {
+let Inst{7-5} = 0b011;
+let Inst{13-13} = 0b1;
+let Inst{31-21} = 0b00011100001;
+let hasNewValue = 1;
+let opNewValue = 0;
+let isAccumulator = 1;
+let DecoderNamespace = "EXT_mmvec";
+let Constraints = "$Vxx32 = $Vxx32in";
+}
+def V6_vmpyowh_64_acc_128B : HInst<
+(outs VecDblRegs128B:$Vxx32),
+(ins VecDblRegs128B:$Vxx32in, VectorRegs128B:$Vu32, VectorRegs128B:$Vv32),
+"$Vxx32 += vmpyo($Vu32.w,$Vv32.h)",
+CVI_VX_DV_LONG, TypeCVI_VX_DV>, Enc_5972412, Requires<[HasV62T,UseHVX]> {
+let Inst{7-5} = 0b011;
+let Inst{13-13} = 0b1;
+let Inst{31-21} = 0b00011100001;
+let hasNewValue = 1;
+let opNewValue = 0;
+let isAccumulator = 1;
+let DecoderNamespace = "EXT_mmvec";
+let isCodeGenOnly = 1;
+let Constraints = "$Vxx32 = $Vxx32in";
+}
+def V6_vmpyowh_alt : HInst<
+(outs VectorRegs:$Vd32),
+(ins VectorRegs:$Vu32, VectorRegs:$Vv32),
+"$Vd32 = vmpyowh($Vu32,$Vv32):<<1:sat",
+PSEUDO, TypeMAPPING>, Requires<[HasV60T,UseHVX]> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vmpyowh_alt_128B : HInst<
+(outs VectorRegs128B:$Vd32),
+(ins VectorRegs128B:$Vu32, VectorRegs128B:$Vv32),
+"$Vd32 = vmpyowh($Vu32,$Vv32):<<1:sat",
+PSEUDO, TypeMAPPING>, Requires<[HasV60T,UseHVX]> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vmpyowh_rnd : HInst<
+(outs VectorRegs:$Vd32),
+(ins VectorRegs:$Vu32, VectorRegs:$Vv32),
+"$Vd32.w = vmpyo($Vu32.w,$Vv32.h):<<1:rnd:sat",
+CVI_VX_DV, TypeCVI_VX_DV>, Enc_6223403, Requires<[HasV60T,UseHVX]> {
+let Inst{7-5} = 0b000;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b00011111010;
+let hasNewValue = 1;
+let opNewValue = 0;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vmpyowh_rnd_128B : HInst<
+(outs VectorRegs128B:$Vd32),
+(ins VectorRegs128B:$Vu32, VectorRegs128B:$Vv32),
+"$Vd32.w = vmpyo($Vu32.w,$Vv32.h):<<1:rnd:sat",
+CVI_VX_DV, TypeCVI_VX_DV>, Enc_6223403, Requires<[HasV60T,UseHVX]> {
+let Inst{7-5} = 0b000;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b00011111010;
+let hasNewValue = 1;
+let opNewValue = 0;
+let DecoderNamespace = "EXT_mmvec";
+let isCodeGenOnly = 1;
+}
+def V6_vmpyowh_rnd_alt : HInst<
+(outs VectorRegs:$Vd32),
+(ins VectorRegs:$Vu32, VectorRegs:$Vv32),
+"$Vd32 = vmpyowh($Vu32,$Vv32):<<1:rnd:sat",
+PSEUDO, TypeMAPPING>, Requires<[HasV60T,UseHVX]> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vmpyowh_rnd_alt_128B : HInst<
+(outs VectorRegs128B:$Vd32),
+(ins VectorRegs128B:$Vu32, VectorRegs128B:$Vv32),
+"$Vd32 = vmpyowh($Vu32,$Vv32):<<1:rnd:sat",
+PSEUDO, TypeMAPPING>, Requires<[HasV60T,UseHVX]> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vmpyowh_rnd_sacc : HInst<
+(outs VectorRegs:$Vx32),
+(ins VectorRegs:$Vx32in, VectorRegs:$Vu32, VectorRegs:$Vv32),
+"$Vx32.w += vmpyo($Vu32.w,$Vv32.h):<<1:rnd:sat:shift",
+CVI_VX_DV, TypeCVI_VX_DV>, Enc_2328527, Requires<[HasV60T,UseHVX]> {
+let Inst{7-5} = 0b111;
+let Inst{13-13} = 0b1;
+let Inst{31-21} = 0b00011100001;
+let hasNewValue = 1;
+let opNewValue = 0;
+let isAccumulator = 1;
+let DecoderNamespace = "EXT_mmvec";
+let Constraints = "$Vx32 = $Vx32in";
+}
+def V6_vmpyowh_rnd_sacc_128B : HInst<
+(outs VectorRegs128B:$Vx32),
+(ins VectorRegs128B:$Vx32in, VectorRegs128B:$Vu32, VectorRegs128B:$Vv32),
+"$Vx32.w += vmpyo($Vu32.w,$Vv32.h):<<1:rnd:sat:shift",
+CVI_VX_DV, TypeCVI_VX_DV>, Enc_2328527, Requires<[HasV60T,UseHVX]> {
+let Inst{7-5} = 0b111;
+let Inst{13-13} = 0b1;
+let Inst{31-21} = 0b00011100001;
+let hasNewValue = 1;
+let opNewValue = 0;
+let isAccumulator = 1;
+let DecoderNamespace = "EXT_mmvec";
+let isCodeGenOnly = 1;
+let Constraints = "$Vx32 = $Vx32in";
+}
+def V6_vmpyowh_rnd_sacc_alt : HInst<
+(outs VectorRegs:$Vx32),
+(ins VectorRegs:$Vx32in, VectorRegs:$Vu32, VectorRegs:$Vv32),
+"$Vx32 += vmpyowh($Vu32,$Vv32):<<1:rnd:sat:shift",
+PSEUDO, TypeMAPPING>, Requires<[HasV60T,UseHVX]> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isAccumulator = 1;
+let isPseudo = 1;
+let DecoderNamespace = "EXT_mmvec";
+let Constraints = "$Vx32 = $Vx32in";
+}
+def V6_vmpyowh_rnd_sacc_alt_128B : HInst<
+(outs VectorRegs128B:$Vx32),
+(ins VectorRegs128B:$Vx32in, VectorRegs128B:$Vu32, VectorRegs128B:$Vv32),
+"$Vx32 += vmpyowh($Vu32,$Vv32):<<1:rnd:sat:shift",
+PSEUDO, TypeMAPPING>, Requires<[HasV60T,UseHVX]> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isAccumulator = 1;
+let isPseudo = 1;
+let DecoderNamespace = "EXT_mmvec";
+let isCodeGenOnly = 1;
+let Constraints = "$Vx32 = $Vx32in";
+}
+def V6_vmpyowh_sacc : HInst<
+(outs VectorRegs:$Vx32),
+(ins VectorRegs:$Vx32in, VectorRegs:$Vu32, VectorRegs:$Vv32),
+"$Vx32.w += vmpyo($Vu32.w,$Vv32.h):<<1:sat:shift",
+CVI_VX_DV, TypeCVI_VX_DV>, Enc_2328527, Requires<[HasV60T,UseHVX]> {
+let Inst{7-5} = 0b110;
+let Inst{13-13} = 0b1;
+let Inst{31-21} = 0b00011100001;
+let hasNewValue = 1;
+let opNewValue = 0;
+let isAccumulator = 1;
+let DecoderNamespace = "EXT_mmvec";
+let Constraints = "$Vx32 = $Vx32in";
+}
+def V6_vmpyowh_sacc_128B : HInst<
+(outs VectorRegs128B:$Vx32),
+(ins VectorRegs128B:$Vx32in, VectorRegs128B:$Vu32, VectorRegs128B:$Vv32),
+"$Vx32.w += vmpyo($Vu32.w,$Vv32.h):<<1:sat:shift",
+CVI_VX_DV, TypeCVI_VX_DV>, Enc_2328527, Requires<[HasV60T,UseHVX]> {
+let Inst{7-5} = 0b110;
+let Inst{13-13} = 0b1;
+let Inst{31-21} = 0b00011100001;
+let hasNewValue = 1;
+let opNewValue = 0;
+let isAccumulator = 1;
+let DecoderNamespace = "EXT_mmvec";
+let isCodeGenOnly = 1;
+let Constraints = "$Vx32 = $Vx32in";
+}
+def V6_vmpyowh_sacc_alt : HInst<
+(outs VectorRegs:$Vx32),
+(ins VectorRegs:$Vx32in, VectorRegs:$Vu32, VectorRegs:$Vv32),
+"$Vx32 += vmpyowh($Vu32,$Vv32):<<1:sat:shift",
+PSEUDO, TypeMAPPING>, Requires<[HasV60T,UseHVX]> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isAccumulator = 1;
+let isPseudo = 1;
+let DecoderNamespace = "EXT_mmvec";
+let Constraints = "$Vx32 = $Vx32in";
+}
+def V6_vmpyowh_sacc_alt_128B : HInst<
+(outs VectorRegs128B:$Vx32),
+(ins VectorRegs128B:$Vx32in, VectorRegs128B:$Vu32, VectorRegs128B:$Vv32),
+"$Vx32 += vmpyowh($Vu32,$Vv32):<<1:sat:shift",
+PSEUDO, TypeMAPPING>, Requires<[HasV60T,UseHVX]> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isAccumulator = 1;
+let isPseudo = 1;
+let DecoderNamespace = "EXT_mmvec";
+let isCodeGenOnly = 1;
+let Constraints = "$Vx32 = $Vx32in";
+}
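+// Widening unsigned multiplies: vmpyub/vmpyuh (vector by scalar) and
+// vmpyubv/vmpyuhv (vector by vector) double the lane width, so the
+// destination is a vector register pair (VecDblRegs).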
+def V6_vmpyub : HInst<
+(outs VecDblRegs:$Vdd32),
+(ins VectorRegs:$Vu32, IntRegs:$Rt32),
+"$Vdd32.uh = vmpy($Vu32.ub,$Rt32.ub)",
+CVI_VX_DV, TypeCVI_VX_DV>, Enc_11471622, Requires<[HasV60T,UseHVX]> {
+let Inst{7-5} = 0b000;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b00011001110;
+let hasNewValue = 1;
+let opNewValue = 0;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vmpyub_128B : HInst<
+(outs VecDblRegs128B:$Vdd32),
+(ins VectorRegs128B:$Vu32, IntRegs:$Rt32),
+"$Vdd32.uh = vmpy($Vu32.ub,$Rt32.ub)",
+CVI_VX_DV, TypeCVI_VX_DV>, Enc_11471622, Requires<[HasV60T,UseHVX]> {
+let Inst{7-5} = 0b000;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b00011001110;
+let hasNewValue = 1;
+let opNewValue = 0;
+let DecoderNamespace = "EXT_mmvec";
+let isCodeGenOnly = 1;
+}
+def V6_vmpyub_acc : HInst<
+(outs VecDblRegs:$Vxx32),
+(ins VecDblRegs:$Vxx32in, VectorRegs:$Vu32, IntRegs:$Rt32),
+"$Vxx32.uh += vmpy($Vu32.ub,$Rt32.ub)",
+CVI_VX_DV, TypeCVI_VX_DV>, Enc_2153798, Requires<[HasV60T,UseHVX]> {
+let Inst{7-5} = 0b000;
+let Inst{13-13} = 0b1;
+let Inst{31-21} = 0b00011001100;
+let hasNewValue = 1;
+let opNewValue = 0;
+let isAccumulator = 1;
+let DecoderNamespace = "EXT_mmvec";
+let Constraints = "$Vxx32 = $Vxx32in";
+}
+def V6_vmpyub_acc_128B : HInst<
+(outs VecDblRegs128B:$Vxx32),
+(ins VecDblRegs128B:$Vxx32in, VectorRegs128B:$Vu32, IntRegs:$Rt32),
+"$Vxx32.uh += vmpy($Vu32.ub,$Rt32.ub)",
+CVI_VX_DV, TypeCVI_VX_DV>, Enc_2153798, Requires<[HasV60T,UseHVX]> {
+let Inst{7-5} = 0b000;
+let Inst{13-13} = 0b1;
+let Inst{31-21} = 0b00011001100;
+let hasNewValue = 1;
+let opNewValue = 0;
+let isAccumulator = 1;
+let DecoderNamespace = "EXT_mmvec";
+let isCodeGenOnly = 1;
+let Constraints = "$Vxx32 = $Vxx32in";
+}
+def V6_vmpyub_acc_alt : HInst<
+(outs VecDblRegs:$Vxx32),
+(ins VecDblRegs:$Vxx32in, VectorRegs:$Vu32, IntRegs:$Rt32),
+"$Vxx32 += vmpyub($Vu32,$Rt32)",
+PSEUDO, TypeMAPPING>, Requires<[HasV60T,UseHVX]> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isAccumulator = 1;
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+let Constraints = "$Vxx32 = $Vxx32in";
+}
+def V6_vmpyub_acc_alt_128B : HInst<
+(outs VecDblRegs128B:$Vxx32),
+(ins VecDblRegs128B:$Vxx32in, VectorRegs128B:$Vu32, IntRegs:$Rt32),
+"$Vxx32 += vmpyub($Vu32,$Rt32)",
+PSEUDO, TypeMAPPING>, Requires<[HasV60T,UseHVX]> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isAccumulator = 1;
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+let Constraints = "$Vxx32 = $Vxx32in";
+}
+def V6_vmpyub_alt : HInst<
+(outs VecDblRegs:$Vdd32),
+(ins VectorRegs:$Vu32, IntRegs:$Rt32),
+"$Vdd32 = vmpyub($Vu32,$Rt32)",
+PSEUDO, TypeMAPPING>, Requires<[HasV60T,UseHVX]> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vmpyub_alt_128B : HInst<
+(outs VecDblRegs128B:$Vdd32),
+(ins VectorRegs128B:$Vu32, IntRegs:$Rt32),
+"$Vdd32 = vmpyub($Vu32,$Rt32)",
+PSEUDO, TypeMAPPING>, Requires<[HasV60T,UseHVX]> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vmpyubv : HInst<
+(outs VecDblRegs:$Vdd32),
+(ins VectorRegs:$Vu32, VectorRegs:$Vv32),
+"$Vdd32.uh = vmpy($Vu32.ub,$Vv32.ub)",
+CVI_VX_DV, TypeCVI_VX_DV>, Enc_15290236, Requires<[HasV60T,UseHVX]> {
+let Inst{7-5} = 0b101;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b00011100000;
+let hasNewValue = 1;
+let opNewValue = 0;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vmpyubv_128B : HInst<
+(outs VecDblRegs128B:$Vdd32),
+(ins VectorRegs128B:$Vu32, VectorRegs128B:$Vv32),
+"$Vdd32.uh = vmpy($Vu32.ub,$Vv32.ub)",
+CVI_VX_DV, TypeCVI_VX_DV>, Enc_15290236, Requires<[HasV60T,UseHVX]> {
+let Inst{7-5} = 0b101;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b00011100000;
+let hasNewValue = 1;
+let opNewValue = 0;
+let DecoderNamespace = "EXT_mmvec";
+let isCodeGenOnly = 1;
+}
+def V6_vmpyubv_acc : HInst<
+(outs VecDblRegs:$Vxx32),
+(ins VecDblRegs:$Vxx32in, VectorRegs:$Vu32, VectorRegs:$Vv32),
+"$Vxx32.uh += vmpy($Vu32.ub,$Vv32.ub)",
+CVI_VX_DV, TypeCVI_VX_DV>, Enc_5972412, Requires<[HasV60T,UseHVX]> {
+let Inst{7-5} = 0b101;
+let Inst{13-13} = 0b1;
+let Inst{31-21} = 0b00011100000;
+let hasNewValue = 1;
+let opNewValue = 0;
+let isAccumulator = 1;
+let DecoderNamespace = "EXT_mmvec";
+let Constraints = "$Vxx32 = $Vxx32in";
+}
+def V6_vmpyubv_acc_128B : HInst<
+(outs VecDblRegs128B:$Vxx32),
+(ins VecDblRegs128B:$Vxx32in, VectorRegs128B:$Vu32, VectorRegs128B:$Vv32),
+"$Vxx32.uh += vmpy($Vu32.ub,$Vv32.ub)",
+CVI_VX_DV, TypeCVI_VX_DV>, Enc_5972412, Requires<[HasV60T,UseHVX]> {
+let Inst{7-5} = 0b101;
+let Inst{13-13} = 0b1;
+let Inst{31-21} = 0b00011100000;
+let hasNewValue = 1;
+let opNewValue = 0;
+let isAccumulator = 1;
+let DecoderNamespace = "EXT_mmvec";
+let isCodeGenOnly = 1;
+let Constraints = "$Vxx32 = $Vxx32in";
+}
+def V6_vmpyubv_acc_alt : HInst<
+(outs VecDblRegs:$Vxx32),
+(ins VecDblRegs:$Vxx32in, VectorRegs:$Vu32, VectorRegs:$Vv32),
+"$Vxx32 += vmpyub($Vu32,$Vv32)",
+PSEUDO, TypeMAPPING>, Requires<[HasV60T,UseHVX]> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isAccumulator = 1;
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+let Constraints = "$Vxx32 = $Vxx32in";
+}
+def V6_vmpyubv_acc_alt_128B : HInst<
+(outs VecDblRegs128B:$Vxx32),
+(ins VecDblRegs128B:$Vxx32in, VectorRegs128B:$Vu32, VectorRegs128B:$Vv32),
+"$Vxx32 += vmpyub($Vu32,$Vv32)",
+PSEUDO, TypeMAPPING>, Requires<[HasV60T,UseHVX]> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isAccumulator = 1;
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+let Constraints = "$Vxx32 = $Vxx32in";
+}
+def V6_vmpyubv_alt : HInst<
+(outs VecDblRegs:$Vdd32),
+(ins VectorRegs:$Vu32, VectorRegs:$Vv32),
+"$Vdd32 = vmpyub($Vu32,$Vv32)",
+PSEUDO, TypeMAPPING>, Requires<[HasV60T,UseHVX]> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vmpyubv_alt_128B : HInst<
+(outs VecDblRegs128B:$Vdd32),
+(ins VectorRegs128B:$Vu32, VectorRegs128B:$Vv32),
+"$Vdd32 = vmpyub($Vu32,$Vv32)",
+PSEUDO, TypeMAPPING>, Requires<[HasV60T,UseHVX]> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vmpyuh : HInst<
+(outs VecDblRegs:$Vdd32),
+(ins VectorRegs:$Vu32, IntRegs:$Rt32),
+"$Vdd32.uw = vmpy($Vu32.uh,$Rt32.uh)",
+CVI_VX_DV, TypeCVI_VX_DV>, Enc_11471622, Requires<[HasV60T,UseHVX]> {
+let Inst{7-5} = 0b011;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b00011001010;
+let hasNewValue = 1;
+let opNewValue = 0;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vmpyuh_128B : HInst<
+(outs VecDblRegs128B:$Vdd32),
+(ins VectorRegs128B:$Vu32, IntRegs:$Rt32),
+"$Vdd32.uw = vmpy($Vu32.uh,$Rt32.uh)",
+CVI_VX_DV, TypeCVI_VX_DV>, Enc_11471622, Requires<[HasV60T,UseHVX]> {
+let Inst{7-5} = 0b011;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b00011001010;
+let hasNewValue = 1;
+let opNewValue = 0;
+let DecoderNamespace = "EXT_mmvec";
+let isCodeGenOnly = 1;
+}
+def V6_vmpyuh_acc : HInst<
+(outs VecDblRegs:$Vxx32),
+(ins VecDblRegs:$Vxx32in, VectorRegs:$Vu32, IntRegs:$Rt32),
+"$Vxx32.uw += vmpy($Vu32.uh,$Rt32.uh)",
+CVI_VX_DV, TypeCVI_VX_DV>, Enc_2153798, Requires<[HasV60T,UseHVX]> {
+let Inst{7-5} = 0b001;
+let Inst{13-13} = 0b1;
+let Inst{31-21} = 0b00011001010;
+let hasNewValue = 1;
+let opNewValue = 0;
+let isAccumulator = 1;
+let DecoderNamespace = "EXT_mmvec";
+let Constraints = "$Vxx32 = $Vxx32in";
+}
+def V6_vmpyuh_acc_128B : HInst<
+(outs VecDblRegs128B:$Vxx32),
+(ins VecDblRegs128B:$Vxx32in, VectorRegs128B:$Vu32, IntRegs:$Rt32),
+"$Vxx32.uw += vmpy($Vu32.uh,$Rt32.uh)",
+CVI_VX_DV, TypeCVI_VX_DV>, Enc_2153798, Requires<[HasV60T,UseHVX]> {
+let Inst{7-5} = 0b001;
+let Inst{13-13} = 0b1;
+let Inst{31-21} = 0b00011001010;
+let hasNewValue = 1;
+let opNewValue = 0;
+let isAccumulator = 1;
+let DecoderNamespace = "EXT_mmvec";
+let isCodeGenOnly = 1;
+let Constraints = "$Vxx32 = $Vxx32in";
+}
+def V6_vmpyuh_acc_alt : HInst<
+(outs VecDblRegs:$Vxx32),
+(ins VecDblRegs:$Vxx32in, VectorRegs:$Vu32, IntRegs:$Rt32),
+"$Vxx32 += vmpyuh($Vu32,$Rt32)",
+PSEUDO, TypeMAPPING>, Requires<[HasV60T,UseHVX]> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isAccumulator = 1;
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+let Constraints = "$Vxx32 = $Vxx32in";
+}
+def V6_vmpyuh_acc_alt_128B : HInst<
+(outs VecDblRegs128B:$Vxx32),
+(ins VecDblRegs128B:$Vxx32in, VectorRegs128B:$Vu32, IntRegs:$Rt32),
+"$Vxx32 += vmpyuh($Vu32,$Rt32)",
+PSEUDO, TypeMAPPING>, Requires<[HasV60T,UseHVX]> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isAccumulator = 1;
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+let Constraints = "$Vxx32 = $Vxx32in";
+}
+def V6_vmpyuh_alt : HInst<
+(outs VecDblRegs:$Vdd32),
+(ins VectorRegs:$Vu32, IntRegs:$Rt32),
+"$Vdd32 = vmpyuh($Vu32,$Rt32)",
+PSEUDO, TypeMAPPING>, Requires<[HasV60T,UseHVX]> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vmpyuh_alt_128B : HInst<
+(outs VecDblRegs128B:$Vdd32),
+(ins VectorRegs128B:$Vu32, IntRegs:$Rt32),
+"$Vdd32 = vmpyuh($Vu32,$Rt32)",
+PSEUDO, TypeMAPPING>, Requires<[HasV60T,UseHVX]> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vmpyuhv : HInst<
+(outs VecDblRegs:$Vdd32),
+(ins VectorRegs:$Vu32, VectorRegs:$Vv32),
+"$Vdd32.uw = vmpy($Vu32.uh,$Vv32.uh)",
+CVI_VX_DV, TypeCVI_VX_DV>, Enc_15290236, Requires<[HasV60T,UseHVX]> {
+let Inst{7-5} = 0b000;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b00011100001;
+let hasNewValue = 1;
+let opNewValue = 0;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vmpyuhv_128B : HInst<
+(outs VecDblRegs128B:$Vdd32),
+(ins VectorRegs128B:$Vu32, VectorRegs128B:$Vv32),
+"$Vdd32.uw = vmpy($Vu32.uh,$Vv32.uh)",
+CVI_VX_DV, TypeCVI_VX_DV>, Enc_15290236, Requires<[HasV60T,UseHVX]> {
+let Inst{7-5} = 0b000;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b00011100001;
+let hasNewValue = 1;
+let opNewValue = 0;
+let DecoderNamespace = "EXT_mmvec";
+let isCodeGenOnly = 1;
+}
+def V6_vmpyuhv_acc : HInst<
+(outs VecDblRegs:$Vxx32),
+(ins VecDblRegs:$Vxx32in, VectorRegs:$Vu32, VectorRegs:$Vv32),
+"$Vxx32.uw += vmpy($Vu32.uh,$Vv32.uh)",
+CVI_VX_DV, TypeCVI_VX_DV>, Enc_5972412, Requires<[HasV60T,UseHVX]> {
+let Inst{7-5} = 0b000;
+let Inst{13-13} = 0b1;
+let Inst{31-21} = 0b00011100001;
+let hasNewValue = 1;
+let opNewValue = 0;
+let isAccumulator = 1;
+let DecoderNamespace = "EXT_mmvec";
+let Constraints = "$Vxx32 = $Vxx32in";
+}
+def V6_vmpyuhv_acc_128B : HInst<
+(outs VecDblRegs128B:$Vxx32),
+(ins VecDblRegs128B:$Vxx32in, VectorRegs128B:$Vu32, VectorRegs128B:$Vv32),
+"$Vxx32.uw += vmpy($Vu32.uh,$Vv32.uh)",
+CVI_VX_DV, TypeCVI_VX_DV>, Enc_5972412, Requires<[HasV60T,UseHVX]> {
+let Inst{7-5} = 0b000;
+let Inst{13-13} = 0b1;
+let Inst{31-21} = 0b00011100001;
+let hasNewValue = 1;
+let opNewValue = 0;
+let isAccumulator = 1;
+let DecoderNamespace = "EXT_mmvec";
+let isCodeGenOnly = 1;
+let Constraints = "$Vxx32 = $Vxx32in";
+}
+def V6_vmpyuhv_acc_alt : HInst<
+(outs VecDblRegs:$Vxx32),
+(ins VecDblRegs:$Vxx32in, VectorRegs:$Vu32, VectorRegs:$Vv32),
+"$Vxx32 += vmpyuh($Vu32,$Vv32)",
+PSEUDO, TypeMAPPING>, Requires<[HasV60T,UseHVX]> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isAccumulator = 1;
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+let Constraints = "$Vxx32 = $Vxx32in";
+}
+def V6_vmpyuhv_acc_alt_128B : HInst<
+(outs VecDblRegs128B:$Vxx32),
+(ins VecDblRegs128B:$Vxx32in, VectorRegs128B:$Vu32, VectorRegs128B:$Vv32),
+"$Vxx32 += vmpyuh($Vu32,$Vv32)",
+PSEUDO, TypeMAPPING>, Requires<[HasV60T,UseHVX]> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isAccumulator = 1;
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+let Constraints = "$Vxx32 = $Vxx32in";
+}
+def V6_vmpyuhv_alt : HInst<
+(outs VecDblRegs:$Vdd32),
+(ins VectorRegs:$Vu32, VectorRegs:$Vv32),
+"$Vdd32 = vmpyuh($Vu32,$Vv32)",
+PSEUDO, TypeMAPPING>, Requires<[HasV60T,UseHVX]> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vmpyuhv_alt_128B : HInst<
+(outs VecDblRegs128B:$Vdd32),
+(ins VectorRegs128B:$Vu32, VectorRegs128B:$Vv32),
+"$Vdd32 = vmpyuh($Vu32,$Vv32)",
+PSEUDO, TypeMAPPING>, Requires<[HasV60T,UseHVX]> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+}
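+// V6_vmux: per-byte select between $Vu32 and $Vv32 under the vector
+// predicate $Qt4.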
+def V6_vmux : HInst<
+(outs VectorRegs:$Vd32),
+(ins VecPredRegs:$Qt4, VectorRegs:$Vu32, VectorRegs:$Vv32),
+"$Vd32 = vmux($Qt4,$Vu32,$Vv32)",
+CVI_VA, TypeCVI_VA>, Enc_1572239, Requires<[HasV60T,UseHVX]> {
+let Inst{7-7} = 0b0;
+let Inst{13-13} = 0b1;
+let Inst{31-21} = 0b00011110111;
+let hasNewValue = 1;
+let opNewValue = 0;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vmux_128B : HInst<
+(outs VectorRegs128B:$Vd32),
+(ins VecPredRegs128B:$Qt4, VectorRegs128B:$Vu32, VectorRegs128B:$Vv32),
+"$Vd32 = vmux($Qt4,$Vu32,$Vv32)",
+CVI_VA, TypeCVI_VA>, Enc_1572239, Requires<[HasV60T,UseHVX]> {
+let Inst{7-7} = 0b0;
+let Inst{13-13} = 0b1;
+let Inst{31-21} = 0b00011110111;
+let hasNewValue = 1;
+let opNewValue = 0;
+let DecoderNamespace = "EXT_mmvec";
+let isCodeGenOnly = 1;
+}
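+// V6_vnavg*: per-lane halved difference ("negative average") of the two
+// source vectors.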
+def V6_vnavgh : HInst<
+(outs VectorRegs:$Vd32),
+(ins VectorRegs:$Vu32, VectorRegs:$Vv32),
+"$Vd32.h = vnavg($Vu32.h,$Vv32.h)",
+CVI_VA, TypeCVI_VA>, Enc_6223403, Requires<[HasV60T,UseHVX]> {
+let Inst{7-5} = 0b001;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b00011100111;
+let hasNewValue = 1;
+let opNewValue = 0;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vnavgh_128B : HInst<
+(outs VectorRegs128B:$Vd32),
+(ins VectorRegs128B:$Vu32, VectorRegs128B:$Vv32),
+"$Vd32.h = vnavg($Vu32.h,$Vv32.h)",
+CVI_VA, TypeCVI_VA>, Enc_6223403, Requires<[HasV60T,UseHVX]> {
+let Inst{7-5} = 0b001;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b00011100111;
+let hasNewValue = 1;
+let opNewValue = 0;
+let DecoderNamespace = "EXT_mmvec";
+let isCodeGenOnly = 1;
+}
+def V6_vnavgh_alt : HInst<
+(outs VectorRegs:$Vd32),
+(ins VectorRegs:$Vu32, VectorRegs:$Vv32),
+"$Vd32 = vnavgh($Vu32,$Vv32)",
+PSEUDO, TypeMAPPING>, Requires<[HasV60T,UseHVX]> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vnavgh_alt_128B : HInst<
+(outs VectorRegs128B:$Vd32),
+(ins VectorRegs128B:$Vu32, VectorRegs128B:$Vv32),
+"$Vd32 = vnavgh($Vu32,$Vv32)",
+PSEUDO, TypeMAPPING>, Requires<[HasV60T,UseHVX]> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vnavgub : HInst<
+(outs VectorRegs:$Vd32),
+(ins VectorRegs:$Vu32, VectorRegs:$Vv32),
+"$Vd32.b = vnavg($Vu32.ub,$Vv32.ub)",
+CVI_VA, TypeCVI_VA>, Enc_6223403, Requires<[HasV60T,UseHVX]> {
+let Inst{7-5} = 0b000;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b00011100111;
+let hasNewValue = 1;
+let opNewValue = 0;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vnavgub_128B : HInst<
+(outs VectorRegs128B:$Vd32),
+(ins VectorRegs128B:$Vu32, VectorRegs128B:$Vv32),
+"$Vd32.b = vnavg($Vu32.ub,$Vv32.ub)",
+CVI_VA, TypeCVI_VA>, Enc_6223403, Requires<[HasV60T,UseHVX]> {
+let Inst{7-5} = 0b000;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b00011100111;
+let hasNewValue = 1;
+let opNewValue = 0;
+let DecoderNamespace = "EXT_mmvec";
+let isCodeGenOnly = 1;
+}
+def V6_vnavgub_alt : HInst<
+(outs VectorRegs:$Vd32),
+(ins VectorRegs:$Vu32, VectorRegs:$Vv32),
+"$Vd32 = vnavgub($Vu32,$Vv32)",
+PSEUDO, TypeMAPPING>, Requires<[HasV60T,UseHVX]> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vnavgub_alt_128B : HInst<
+(outs VectorRegs128B:$Vd32),
+(ins VectorRegs128B:$Vu32, VectorRegs128B:$Vv32),
+"$Vd32 = vnavgub($Vu32,$Vv32)",
+PSEUDO, TypeMAPPING>, Requires<[HasV60T,UseHVX]> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vnavgw : HInst<
+(outs VectorRegs:$Vd32),
+(ins VectorRegs:$Vu32, VectorRegs:$Vv32),
+"$Vd32.w = vnavg($Vu32.w,$Vv32.w)",
+CVI_VA, TypeCVI_VA>, Enc_6223403, Requires<[HasV60T,UseHVX]> {
+let Inst{7-5} = 0b010;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b00011100111;
+let hasNewValue = 1;
+let opNewValue = 0;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vnavgw_128B : HInst<
+(outs VectorRegs128B:$Vd32),
+(ins VectorRegs128B:$Vu32, VectorRegs128B:$Vv32),
+"$Vd32.w = vnavg($Vu32.w,$Vv32.w)",
+CVI_VA, TypeCVI_VA>, Enc_6223403, Requires<[HasV60T,UseHVX]> {
+let Inst{7-5} = 0b010;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b00011100111;
+let hasNewValue = 1;
+let opNewValue = 0;
+let DecoderNamespace = "EXT_mmvec";
+let isCodeGenOnly = 1;
+}
+def V6_vnavgw_alt : HInst<
+(outs VectorRegs:$Vd32),
+(ins VectorRegs:$Vu32, VectorRegs:$Vv32),
+"$Vd32 = vnavgw($Vu32,$Vv32)",
+PSEUDO, TypeMAPPING>, Requires<[HasV60T,UseHVX]> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vnavgw_alt_128B : HInst<
+(outs VectorRegs128B:$Vd32),
+(ins VectorRegs128B:$Vu32, VectorRegs128B:$Vv32),
+"$Vd32 = vnavgw($Vu32,$Vv32)",
+PSEUDO, TypeMAPPING>, Requires<[HasV60T,UseHVX]> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+}
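+// V6_vnccombine / V6_vncmov: vector combine and move executed only when the
+// scalar predicate $Ps4 is false (isPredicated with isPredicatedFalse).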
+def V6_vnccombine : HInst<
+(outs VecDblRegs:$Vdd32),
+(ins PredRegs:$Ps4, VectorRegs:$Vu32, VectorRegs:$Vv32),
+"if (!$Ps4) $Vdd32 = vcombine($Vu32,$Vv32)",
+CVI_VA_DV, TypeCVI_VA_DV>, Enc_16145290, Requires<[HasV60T,UseHVX]> {
+let Inst{7-7} = 0b0;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b00011010010;
+let isPredicated = 1;
+let isPredicatedFalse = 1;
+let hasNewValue = 1;
+let opNewValue = 0;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vnccombine_128B : HInst<
+(outs VecDblRegs128B:$Vdd32),
+(ins PredRegs:$Ps4, VectorRegs128B:$Vu32, VectorRegs128B:$Vv32),
+"if (!$Ps4) $Vdd32 = vcombine($Vu32,$Vv32)",
+CVI_VA_DV, TypeCVI_VA_DV>, Enc_16145290, Requires<[HasV60T,UseHVX]> {
+let Inst{7-7} = 0b0;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b00011010010;
+let isPredicated = 1;
+let isPredicatedFalse = 1;
+let hasNewValue = 1;
+let opNewValue = 0;
+let DecoderNamespace = "EXT_mmvec";
+let isCodeGenOnly = 1;
+}
+def V6_vncmov : HInst<
+(outs VectorRegs:$Vd32),
+(ins PredRegs:$Ps4, VectorRegs:$Vu32),
+"if (!$Ps4) $Vd32 = $Vu32",
+CVI_VA, TypeCVI_VA>, Enc_12023037, Requires<[HasV60T,UseHVX]> {
+let Inst{7-7} = 0b0;
+let Inst{13-13} = 0b0;
+let Inst{31-16} = 0b0001101000100000;
+let isPredicated = 1;
+let isPredicatedFalse = 1;
+let hasNewValue = 1;
+let opNewValue = 0;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vncmov_128B : HInst<
+(outs VectorRegs128B:$Vd32),
+(ins PredRegs:$Ps4, VectorRegs128B:$Vu32),
+"if (!$Ps4) $Vd32 = $Vu32",
+CVI_VA, TypeCVI_VA>, Enc_12023037, Requires<[HasV60T,UseHVX]> {
+let Inst{7-7} = 0b0;
+let Inst{13-13} = 0b0;
+let Inst{31-16} = 0b0001101000100000;
+let isPredicated = 1;
+let isPredicatedFalse = 1;
+let hasNewValue = 1;
+let opNewValue = 0;
+let DecoderNamespace = "EXT_mmvec";
+let isCodeGenOnly = 1;
+}
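+// V6_vnormamt*: per-lane normalization amount, i.e. the shift count needed
+// to normalize each element.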
+def V6_vnormamth : HInst<
+(outs VectorRegs:$Vd32),
+(ins VectorRegs:$Vu32),
+"$Vd32.h = vnormamt($Vu32.h)",
+CVI_VS, TypeCVI_VS>, Enc_900013, Requires<[HasV60T,UseHVX]> {
+let Inst{7-5} = 0b101;
+let Inst{13-13} = 0b0;
+let Inst{31-16} = 0b0001111000000011;
+let hasNewValue = 1;
+let opNewValue = 0;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vnormamth_128B : HInst<
+(outs VectorRegs128B:$Vd32),
+(ins VectorRegs128B:$Vu32),
+"$Vd32.h = vnormamt($Vu32.h)",
+CVI_VS, TypeCVI_VS>, Enc_900013, Requires<[HasV60T,UseHVX]> {
+let Inst{7-5} = 0b101;
+let Inst{13-13} = 0b0;
+let Inst{31-16} = 0b0001111000000011;
+let hasNewValue = 1;
+let opNewValue = 0;
+let DecoderNamespace = "EXT_mmvec";
+let isCodeGenOnly = 1;
+}
+def V6_vnormamth_alt : HInst<
+(outs VectorRegs:$Vd32),
+(ins VectorRegs:$Vu32),
+"$Vd32 = vnormamth($Vu32)",
+PSEUDO, TypeMAPPING>, Requires<[HasV60T,UseHVX]> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vnormamth_alt_128B : HInst<
+(outs VectorRegs128B:$Vd32),
+(ins VectorRegs128B:$Vu32),
+"$Vd32 = vnormamth($Vu32)",
+PSEUDO, TypeMAPPING>, Requires<[HasV60T,UseHVX]> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vnormamtw : HInst<
+(outs VectorRegs:$Vd32),
+(ins VectorRegs:$Vu32),
+"$Vd32.w = vnormamt($Vu32.w)",
+CVI_VS, TypeCVI_VS>, Enc_900013, Requires<[HasV60T,UseHVX]> {
+let Inst{7-5} = 0b100;
+let Inst{13-13} = 0b0;
+let Inst{31-16} = 0b0001111000000011;
+let hasNewValue = 1;
+let opNewValue = 0;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vnormamtw_128B : HInst<
+(outs VectorRegs128B:$Vd32),
+(ins VectorRegs128B:$Vu32),
+"$Vd32.w = vnormamt($Vu32.w)",
+CVI_VS, TypeCVI_VS>, Enc_900013, Requires<[HasV60T,UseHVX]> {
+let Inst{7-5} = 0b100;
+let Inst{13-13} = 0b0;
+let Inst{31-16} = 0b0001111000000011;
+let hasNewValue = 1;
+let opNewValue = 0;
+let DecoderNamespace = "EXT_mmvec";
+let isCodeGenOnly = 1;
+}
+def V6_vnormamtw_alt : HInst<
+(outs VectorRegs:$Vd32),
+(ins VectorRegs:$Vu32),
+"$Vd32 = vnormamtw($Vu32)",
+PSEUDO, TypeMAPPING>, Requires<[HasV60T,UseHVX]> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vnormamtw_alt_128B : HInst<
+(outs VectorRegs128B:$Vd32),
+(ins VectorRegs128B:$Vu32),
+"$Vd32 = vnormamtw($Vu32)",
+PSEUDO, TypeMAPPING>, Requires<[HasV60T,UseHVX]> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vnot : HInst<
+(outs VectorRegs:$Vd32),
+(ins VectorRegs:$Vu32),
+"$Vd32 = vnot($Vu32)",
+CVI_VA, TypeCVI_VA>, Enc_900013, Requires<[HasV60T,UseHVX]> {
+let Inst{7-5} = 0b100;
+let Inst{13-13} = 0b0;
+let Inst{31-16} = 0b0001111000000000;
+let hasNewValue = 1;
+let opNewValue = 0;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vnot_128B : HInst<
+(outs VectorRegs128B:$Vd32),
+(ins VectorRegs128B:$Vu32),
+"$Vd32 = vnot($Vu32)",
+CVI_VA, TypeCVI_VA>, Enc_900013, Requires<[HasV60T,UseHVX]> {
+let Inst{7-5} = 0b100;
+let Inst{13-13} = 0b0;
+let Inst{31-16} = 0b0001111000000000;
+let hasNewValue = 1;
+let opNewValue = 0;
+let DecoderNamespace = "EXT_mmvec";
+let isCodeGenOnly = 1;
+}
+def V6_vor : HInst<
+(outs VectorRegs:$Vd32),
+(ins VectorRegs:$Vu32, VectorRegs:$Vv32),
+"$Vd32 = vor($Vu32,$Vv32)",
+CVI_VA, TypeCVI_VA>, Enc_6223403, Requires<[HasV60T,UseHVX]> {
+let Inst{7-5} = 0b110;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b00011100001;
+let hasNewValue = 1;
+let opNewValue = 0;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vor_128B : HInst<
+(outs VectorRegs128B:$Vd32),
+(ins VectorRegs128B:$Vu32, VectorRegs128B:$Vv32),
+"$Vd32 = vor($Vu32,$Vv32)",
+CVI_VA, TypeCVI_VA>, Enc_6223403, Requires<[HasV60T,UseHVX]> {
+let Inst{7-5} = 0b110;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b00011100001;
+let hasNewValue = 1;
+let opNewValue = 0;
+let DecoderNamespace = "EXT_mmvec";
+let isCodeGenOnly = 1;
+}
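+// Pack family: vpacke keeps the even sub-elements of the two sources,
+// vpacko the odd ones, and the vpack...:sat records narrow with saturation.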
+def V6_vpackeb : HInst<
+(outs VectorRegs:$Vd32),
+(ins VectorRegs:$Vu32, VectorRegs:$Vv32),
+"$Vd32.b = vpacke($Vu32.h,$Vv32.h)",
+CVI_VP, TypeCVI_VP>, Enc_6223403, Requires<[HasV60T,UseHVX]> {
+let Inst{7-5} = 0b010;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b00011111110;
+let hasNewValue = 1;
+let opNewValue = 0;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vpackeb_128B : HInst<
+(outs VectorRegs128B:$Vd32),
+(ins VectorRegs128B:$Vu32, VectorRegs128B:$Vv32),
+"$Vd32.b = vpacke($Vu32.h,$Vv32.h)",
+CVI_VP, TypeCVI_VP>, Enc_6223403, Requires<[HasV60T,UseHVX]> {
+let Inst{7-5} = 0b010;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b00011111110;
+let hasNewValue = 1;
+let opNewValue = 0;
+let DecoderNamespace = "EXT_mmvec";
+let isCodeGenOnly = 1;
+}
+def V6_vpackeb_alt : HInst<
+(outs VectorRegs:$Vd32),
+(ins VectorRegs:$Vu32, VectorRegs:$Vv32),
+"$Vd32 = vpackeb($Vu32,$Vv32)",
+PSEUDO, TypeMAPPING>, Requires<[HasV60T,UseHVX]> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vpackeb_alt_128B : HInst<
+(outs VectorRegs128B:$Vd32),
+(ins VectorRegs128B:$Vu32, VectorRegs128B:$Vv32),
+"$Vd32 = vpackeb($Vu32,$Vv32)",
+PSEUDO, TypeMAPPING>, Requires<[HasV60T,UseHVX]> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vpackeh : HInst<
+(outs VectorRegs:$Vd32),
+(ins VectorRegs:$Vu32, VectorRegs:$Vv32),
+"$Vd32.h = vpacke($Vu32.w,$Vv32.w)",
+CVI_VP, TypeCVI_VP>, Enc_6223403, Requires<[HasV60T,UseHVX]> {
+let Inst{7-5} = 0b011;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b00011111110;
+let hasNewValue = 1;
+let opNewValue = 0;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vpackeh_128B : HInst<
+(outs VectorRegs128B:$Vd32),
+(ins VectorRegs128B:$Vu32, VectorRegs128B:$Vv32),
+"$Vd32.h = vpacke($Vu32.w,$Vv32.w)",
+CVI_VP, TypeCVI_VP>, Enc_6223403, Requires<[HasV60T,UseHVX]> {
+let Inst{7-5} = 0b011;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b00011111110;
+let hasNewValue = 1;
+let opNewValue = 0;
+let DecoderNamespace = "EXT_mmvec";
+let isCodeGenOnly = 1;
+}
+def V6_vpackeh_alt : HInst<
+(outs VectorRegs:$Vd32),
+(ins VectorRegs:$Vu32, VectorRegs:$Vv32),
+"$Vd32 = vpackeh($Vu32,$Vv32)",
+PSEUDO, TypeMAPPING>, Requires<[HasV60T,UseHVX]> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vpackeh_alt_128B : HInst<
+(outs VectorRegs128B:$Vd32),
+(ins VectorRegs128B:$Vu32, VectorRegs128B:$Vv32),
+"$Vd32 = vpackeh($Vu32,$Vv32)",
+PSEUDO, TypeMAPPING>, Requires<[HasV60T,UseHVX]> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vpackhb_sat : HInst<
+(outs VectorRegs:$Vd32),
+(ins VectorRegs:$Vu32, VectorRegs:$Vv32),
+"$Vd32.b = vpack($Vu32.h,$Vv32.h):sat",
+CVI_VP, TypeCVI_VP>, Enc_6223403, Requires<[HasV60T,UseHVX]> {
+let Inst{7-5} = 0b110;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b00011111110;
+let hasNewValue = 1;
+let opNewValue = 0;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vpackhb_sat_128B : HInst<
+(outs VectorRegs128B:$Vd32),
+(ins VectorRegs128B:$Vu32, VectorRegs128B:$Vv32),
+"$Vd32.b = vpack($Vu32.h,$Vv32.h):sat",
+CVI_VP, TypeCVI_VP>, Enc_6223403, Requires<[HasV60T,UseHVX]> {
+let Inst{7-5} = 0b110;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b00011111110;
+let hasNewValue = 1;
+let opNewValue = 0;
+let DecoderNamespace = "EXT_mmvec";
+let isCodeGenOnly = 1;
+}
+def V6_vpackhb_sat_alt : HInst<
+(outs VectorRegs:$Vd32),
+(ins VectorRegs:$Vu32, VectorRegs:$Vv32),
+"$Vd32 = vpackhb($Vu32,$Vv32):sat",
+PSEUDO, TypeMAPPING>, Requires<[HasV60T,UseHVX]> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vpackhb_sat_alt_128B : HInst<
+(outs VectorRegs128B:$Vd32),
+(ins VectorRegs128B:$Vu32, VectorRegs128B:$Vv32),
+"$Vd32 = vpackhb($Vu32,$Vv32):sat",
+PSEUDO, TypeMAPPING>, Requires<[HasV60T,UseHVX]> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vpackhub_sat : HInst<
+(outs VectorRegs:$Vd32),
+(ins VectorRegs:$Vu32, VectorRegs:$Vv32),
+"$Vd32.ub = vpack($Vu32.h,$Vv32.h):sat",
+CVI_VP, TypeCVI_VP>, Enc_6223403, Requires<[HasV60T,UseHVX]> {
+let Inst{7-5} = 0b101;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b00011111110;
+let hasNewValue = 1;
+let opNewValue = 0;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vpackhub_sat_128B : HInst<
+(outs VectorRegs128B:$Vd32),
+(ins VectorRegs128B:$Vu32, VectorRegs128B:$Vv32),
+"$Vd32.ub = vpack($Vu32.h,$Vv32.h):sat",
+CVI_VP, TypeCVI_VP>, Enc_6223403, Requires<[HasV60T,UseHVX]> {
+let Inst{7-5} = 0b101;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b00011111110;
+let hasNewValue = 1;
+let opNewValue = 0;
+let DecoderNamespace = "EXT_mmvec";
+let isCodeGenOnly = 1;
+}
+def V6_vpackhub_sat_alt : HInst<
+(outs VectorRegs:$Vd32),
+(ins VectorRegs:$Vu32, VectorRegs:$Vv32),
+"$Vd32 = vpackhub($Vu32,$Vv32):sat",
+PSEUDO, TypeMAPPING>, Requires<[HasV60T,UseHVX]> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vpackhub_sat_alt_128B : HInst<
+(outs VectorRegs128B:$Vd32),
+(ins VectorRegs128B:$Vu32, VectorRegs128B:$Vv32),
+"$Vd32 = vpackhub($Vu32,$Vv32):sat",
+PSEUDO, TypeMAPPING>, Requires<[HasV60T,UseHVX]> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vpackob : HInst<
+(outs VectorRegs:$Vd32),
+(ins VectorRegs:$Vu32, VectorRegs:$Vv32),
+"$Vd32.b = vpacko($Vu32.h,$Vv32.h)",
+CVI_VP, TypeCVI_VP>, Enc_6223403, Requires<[HasV60T,UseHVX]> {
+let Inst{7-5} = 0b001;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b00011111111;
+let hasNewValue = 1;
+let opNewValue = 0;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vpackob_128B : HInst<
+(outs VectorRegs128B:$Vd32),
+(ins VectorRegs128B:$Vu32, VectorRegs128B:$Vv32),
+"$Vd32.b = vpacko($Vu32.h,$Vv32.h)",
+CVI_VP, TypeCVI_VP>, Enc_6223403, Requires<[HasV60T,UseHVX]> {
+let Inst{7-5} = 0b001;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b00011111111;
+let hasNewValue = 1;
+let opNewValue = 0;
+let DecoderNamespace = "EXT_mmvec";
+let isCodeGenOnly = 1;
+}
+def V6_vpackob_alt : HInst<
+(outs VectorRegs:$Vd32),
+(ins VectorRegs:$Vu32, VectorRegs:$Vv32),
+"$Vd32 = vpackob($Vu32,$Vv32)",
+PSEUDO, TypeMAPPING>, Requires<[HasV60T,UseHVX]> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vpackob_alt_128B : HInst<
+(outs VectorRegs128B:$Vd32),
+(ins VectorRegs128B:$Vu32, VectorRegs128B:$Vv32),
+"$Vd32 = vpackob($Vu32,$Vv32)",
+PSEUDO, TypeMAPPING>, Requires<[HasV60T,UseHVX]> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vpackoh : HInst<
+(outs VectorRegs:$Vd32),
+(ins VectorRegs:$Vu32, VectorRegs:$Vv32),
+"$Vd32.h = vpacko($Vu32.w,$Vv32.w)",
+CVI_VP, TypeCVI_VP>, Enc_6223403, Requires<[HasV60T,UseHVX]> {
+let Inst{7-5} = 0b010;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b00011111111;
+let hasNewValue = 1;
+let opNewValue = 0;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vpackoh_128B : HInst<
+(outs VectorRegs128B:$Vd32),
+(ins VectorRegs128B:$Vu32, VectorRegs128B:$Vv32),
+"$Vd32.h = vpacko($Vu32.w,$Vv32.w)",
+CVI_VP, TypeCVI_VP>, Enc_6223403, Requires<[HasV60T,UseHVX]> {
+let Inst{7-5} = 0b010;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b00011111111;
+let hasNewValue = 1;
+let opNewValue = 0;
+let DecoderNamespace = "EXT_mmvec";
+let isCodeGenOnly = 1;
+}
+def V6_vpackoh_alt : HInst<
+(outs VectorRegs:$Vd32),
+(ins VectorRegs:$Vu32, VectorRegs:$Vv32),
+"$Vd32 = vpackoh($Vu32,$Vv32)",
+PSEUDO, TypeMAPPING>, Requires<[HasV60T,UseHVX]> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vpackoh_alt_128B : HInst<
+(outs VectorRegs128B:$Vd32),
+(ins VectorRegs128B:$Vu32, VectorRegs128B:$Vv32),
+"$Vd32 = vpackoh($Vu32,$Vv32)",
+PSEUDO, TypeMAPPING>, Requires<[HasV60T,UseHVX]> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vpackwh_sat : HInst<
+(outs VectorRegs:$Vd32),
+(ins VectorRegs:$Vu32, VectorRegs:$Vv32),
+"$Vd32.h = vpack($Vu32.w,$Vv32.w):sat",
+CVI_VP, TypeCVI_VP>, Enc_6223403, Requires<[HasV60T,UseHVX]> {
+let Inst{7-5} = 0b000;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b00011111111;
+let hasNewValue = 1;
+let opNewValue = 0;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vpackwh_sat_128B : HInst<
+(outs VectorRegs128B:$Vd32),
+(ins VectorRegs128B:$Vu32, VectorRegs128B:$Vv32),
+"$Vd32.h = vpack($Vu32.w,$Vv32.w):sat",
+CVI_VP, TypeCVI_VP>, Enc_6223403, Requires<[HasV60T,UseHVX]> {
+let Inst{7-5} = 0b000;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b00011111111;
+let hasNewValue = 1;
+let opNewValue = 0;
+let DecoderNamespace = "EXT_mmvec";
+let isCodeGenOnly = 1;
+}
+def V6_vpackwh_sat_alt : HInst<
+(outs VectorRegs:$Vd32),
+(ins VectorRegs:$Vu32, VectorRegs:$Vv32),
+"$Vd32 = vpackwh($Vu32,$Vv32):sat",
+PSEUDO, TypeMAPPING>, Requires<[HasV60T,UseHVX]> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vpackwh_sat_alt_128B : HInst<
+(outs VectorRegs128B:$Vd32),
+(ins VectorRegs128B:$Vu32, VectorRegs128B:$Vv32),
+"$Vd32 = vpackwh($Vu32,$Vv32):sat",
+PSEUDO, TypeMAPPING>, Requires<[HasV60T,UseHVX]> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vpackwuh_sat : HInst<
+(outs VectorRegs:$Vd32),
+(ins VectorRegs:$Vu32, VectorRegs:$Vv32),
+"$Vd32.uh = vpack($Vu32.w,$Vv32.w):sat",
+CVI_VP, TypeCVI_VP>, Enc_6223403, Requires<[HasV60T,UseHVX]> {
+let Inst{7-5} = 0b111;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b00011111110;
+let hasNewValue = 1;
+let opNewValue = 0;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vpackwuh_sat_128B : HInst<
+(outs VectorRegs128B:$Vd32),
+(ins VectorRegs128B:$Vu32, VectorRegs128B:$Vv32),
+"$Vd32.uh = vpack($Vu32.w,$Vv32.w):sat",
+CVI_VP, TypeCVI_VP>, Enc_6223403, Requires<[HasV60T,UseHVX]> {
+let Inst{7-5} = 0b111;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b00011111110;
+let hasNewValue = 1;
+let opNewValue = 0;
+let DecoderNamespace = "EXT_mmvec";
+let isCodeGenOnly = 1;
+}
+def V6_vpackwuh_sat_alt : HInst<
+(outs VectorRegs:$Vd32),
+(ins VectorRegs:$Vu32, VectorRegs:$Vv32),
+"$Vd32 = vpackwuh($Vu32,$Vv32):sat",
+PSEUDO, TypeMAPPING>, Requires<[HasV60T,UseHVX]> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vpackwuh_sat_alt_128B : HInst<
+(outs VectorRegs128B:$Vd32),
+(ins VectorRegs128B:$Vu32, VectorRegs128B:$Vv32),
+"$Vd32 = vpackwuh($Vu32,$Vv32):sat",
+PSEUDO, TypeMAPPING>, Requires<[HasV60T,UseHVX]> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vpopcounth : HInst<
+(outs VectorRegs:$Vd32),
+(ins VectorRegs:$Vu32),
+"$Vd32.h = vpopcount($Vu32.h)",
+CVI_VS, TypeCVI_VS>, Enc_900013, Requires<[HasV60T,UseHVX]> {
+let Inst{7-5} = 0b110;
+let Inst{13-13} = 0b0;
+let Inst{31-16} = 0b0001111000000010;
+let hasNewValue = 1;
+let opNewValue = 0;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vpopcounth_128B : HInst<
+(outs VectorRegs128B:$Vd32),
+(ins VectorRegs128B:$Vu32),
+"$Vd32.h = vpopcount($Vu32.h)",
+CVI_VS, TypeCVI_VS>, Enc_900013, Requires<[HasV60T,UseHVX]> {
+let Inst{7-5} = 0b110;
+let Inst{13-13} = 0b0;
+let Inst{31-16} = 0b0001111000000010;
+let hasNewValue = 1;
+let opNewValue = 0;
+let DecoderNamespace = "EXT_mmvec";
+let isCodeGenOnly = 1;
+}
+def V6_vpopcounth_alt : HInst<
+(outs VectorRegs:$Vd32),
+(ins VectorRegs:$Vu32),
+"$Vd32 = vpopcounth($Vu32)",
+PSEUDO, TypeMAPPING>, Requires<[HasV60T,UseHVX]> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vpopcounth_alt_128B : HInst<
+(outs VectorRegs128B:$Vd32),
+(ins VectorRegs128B:$Vu32),
+"$Vd32 = vpopcounth($Vu32)",
+PSEUDO, TypeMAPPING>, Requires<[HasV60T,UseHVX]> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vrdelta : HInst<
+(outs VectorRegs:$Vd32),
+(ins VectorRegs:$Vu32, VectorRegs:$Vv32),
+"$Vd32 = vrdelta($Vu32,$Vv32)",
+CVI_VP, TypeCVI_VP>, Enc_6223403, Requires<[HasV60T,UseHVX]> {
+let Inst{7-5} = 0b011;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b00011111001;
+let hasNewValue = 1;
+let opNewValue = 0;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vrdelta_128B : HInst<
+(outs VectorRegs128B:$Vd32),
+(ins VectorRegs128B:$Vu32, VectorRegs128B:$Vv32),
+"$Vd32 = vrdelta($Vu32,$Vv32)",
+CVI_VP, TypeCVI_VP>, Enc_6223403, Requires<[HasV60T,UseHVX]> {
+let Inst{7-5} = 0b011;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b00011111001;
+let hasNewValue = 1;
+let opNewValue = 0;
+let DecoderNamespace = "EXT_mmvec";
+let isCodeGenOnly = 1;
+}
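+// vrmpy* are reduce-multiply ops. The "_acc" variants accumulate into the
+// destination: note isAccumulator, the "+=" syntax, the extra $Vx32in
+// input, and Constraints = "$Vx32 = $Vx32in" tying it to the output
+// register. In the encodings below, Inst{13} appears to select plain (0)
+// versus accumulating (1) forms within the same Inst{31-21} opcode class.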
+def V6_vrmpybus : HInst<
+(outs VectorRegs:$Vd32),
+(ins VectorRegs:$Vu32, IntRegs:$Rt32),
+"$Vd32.w = vrmpy($Vu32.ub,$Rt32.b)",
+CVI_VX, TypeCVI_VX>, Enc_16214129, Requires<[HasV60T,UseHVX]> {
+let Inst{7-5} = 0b100;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b00011001000;
+let hasNewValue = 1;
+let opNewValue = 0;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vrmpybus_128B : HInst<
+(outs VectorRegs128B:$Vd32),
+(ins VectorRegs128B:$Vu32, IntRegs:$Rt32),
+"$Vd32.w = vrmpy($Vu32.ub,$Rt32.b)",
+CVI_VX, TypeCVI_VX>, Enc_16214129, Requires<[HasV60T,UseHVX]> {
+let Inst{7-5} = 0b100;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b00011001000;
+let hasNewValue = 1;
+let opNewValue = 0;
+let DecoderNamespace = "EXT_mmvec";
+let isCodeGenOnly = 1;
+}
+def V6_vrmpybus_acc : HInst<
+(outs VectorRegs:$Vx32),
+(ins VectorRegs:$Vx32in, VectorRegs:$Vu32, IntRegs:$Rt32),
+"$Vx32.w += vrmpy($Vu32.ub,$Rt32.b)",
+CVI_VX, TypeCVI_VX>, Enc_10058269, Requires<[HasV60T,UseHVX]> {
+let Inst{7-5} = 0b101;
+let Inst{13-13} = 0b1;
+let Inst{31-21} = 0b00011001000;
+let hasNewValue = 1;
+let opNewValue = 0;
+let isAccumulator = 1;
+let DecoderNamespace = "EXT_mmvec";
+let Constraints = "$Vx32 = $Vx32in";
+}
+def V6_vrmpybus_acc_128B : HInst<
+(outs VectorRegs128B:$Vx32),
+(ins VectorRegs128B:$Vx32in, VectorRegs128B:$Vu32, IntRegs:$Rt32),
+"$Vx32.w += vrmpy($Vu32.ub,$Rt32.b)",
+CVI_VX, TypeCVI_VX>, Enc_10058269, Requires<[HasV60T,UseHVX]> {
+let Inst{7-5} = 0b101;
+let Inst{13-13} = 0b1;
+let Inst{31-21} = 0b00011001000;
+let hasNewValue = 1;
+let opNewValue = 0;
+let isAccumulator = 1;
+let DecoderNamespace = "EXT_mmvec";
+let isCodeGenOnly = 1;
+let Constraints = "$Vx32 = $Vx32in";
+}
+def V6_vrmpybus_acc_alt : HInst<
+(outs VectorRegs:$Vx32),
+(ins VectorRegs:$Vx32in, VectorRegs:$Vu32, IntRegs:$Rt32),
+"$Vx32 += vrmpybus($Vu32,$Rt32)",
+PSEUDO, TypeMAPPING>, Requires<[HasV60T,UseHVX]> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isAccumulator = 1;
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+let Constraints = "$Vx32 = $Vx32in";
+}
+def V6_vrmpybus_acc_alt_128B : HInst<
+(outs VectorRegs128B:$Vx32),
+(ins VectorRegs128B:$Vx32in, VectorRegs128B:$Vu32, IntRegs:$Rt32),
+"$Vx32 += vrmpybus($Vu32,$Rt32)",
+PSEUDO, TypeMAPPING>, Requires<[HasV60T,UseHVX]> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isAccumulator = 1;
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+let Constraints = "$Vx32 = $Vx32in";
+}
+def V6_vrmpybus_alt : HInst<
+(outs VectorRegs:$Vd32),
+(ins VectorRegs:$Vu32, IntRegs:$Rt32),
+"$Vd32 = vrmpybus($Vu32,$Rt32)",
+PSEUDO, TypeMAPPING>, Requires<[HasV60T,UseHVX]> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vrmpybus_alt_128B : HInst<
+(outs VectorRegs128B:$Vd32),
+(ins VectorRegs128B:$Vu32, IntRegs:$Rt32),
+"$Vd32 = vrmpybus($Vu32,$Rt32)",
+PSEUDO, TypeMAPPING>, Requires<[HasV60T,UseHVX]> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vrmpybusi : HInst<
+(outs VecDblRegs:$Vdd32),
+(ins VecDblRegs:$Vuu32, IntRegs:$Rt32, u1_0Imm:$Ii),
+"$Vdd32.w = vrmpy($Vuu32.ub,$Rt32.b,#$Ii)",
+CVI_VX_DV_LONG, TypeCVI_VX_DV>, Enc_14172170, Requires<[HasV60T,UseHVX]> {
+let Inst{7-6} = 0b10;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b00011001010;
+let hasNewValue = 1;
+let opNewValue = 0;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vrmpybusi_128B : HInst<
+(outs VecDblRegs128B:$Vdd32),
+(ins VecDblRegs128B:$Vuu32, IntRegs:$Rt32, u1_0Imm:$Ii),
+"$Vdd32.w = vrmpy($Vuu32.ub,$Rt32.b,#$Ii)",
+CVI_VX_DV_LONG, TypeCVI_VX_DV>, Enc_14172170, Requires<[HasV60T,UseHVX]> {
+let Inst{7-6} = 0b10;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b00011001010;
+let hasNewValue = 1;
+let opNewValue = 0;
+let DecoderNamespace = "EXT_mmvec";
+let isCodeGenOnly = 1;
+}
+def V6_vrmpybusi_acc : HInst<
+(outs VecDblRegs:$Vxx32),
+(ins VecDblRegs:$Vxx32in, VecDblRegs:$Vuu32, IntRegs:$Rt32, u1_0Imm:$Ii),
+"$Vxx32.w += vrmpy($Vuu32.ub,$Rt32.b,#$Ii)",
+CVI_VX_DV_LONG, TypeCVI_VX_DV>, Enc_13189194, Requires<[HasV60T,UseHVX]> {
+let Inst{7-6} = 0b10;
+let Inst{13-13} = 0b1;
+let Inst{31-21} = 0b00011001010;
+let hasNewValue = 1;
+let opNewValue = 0;
+let isAccumulator = 1;
+let DecoderNamespace = "EXT_mmvec";
+let Constraints = "$Vxx32 = $Vxx32in";
+}
+def V6_vrmpybusi_acc_128B : HInst<
+(outs VecDblRegs128B:$Vxx32),
+(ins VecDblRegs128B:$Vxx32in, VecDblRegs128B:$Vuu32, IntRegs:$Rt32, u1_0Imm:$Ii),
+"$Vxx32.w += vrmpy($Vuu32.ub,$Rt32.b,#$Ii)",
+CVI_VX_DV_LONG, TypeCVI_VX_DV>, Enc_13189194, Requires<[HasV60T,UseHVX]> {
+let Inst{7-6} = 0b10;
+let Inst{13-13} = 0b1;
+let Inst{31-21} = 0b00011001010;
+let hasNewValue = 1;
+let opNewValue = 0;
+let isAccumulator = 1;
+let DecoderNamespace = "EXT_mmvec";
+let isCodeGenOnly = 1;
+let Constraints = "$Vxx32 = $Vxx32in";
+}
+def V6_vrmpybusi_acc_alt : HInst<
+(outs VecDblRegs:$Vxx32),
+(ins VecDblRegs:$Vxx32in, VecDblRegs:$Vuu32, IntRegs:$Rt32, u1_0Imm:$Ii),
+"$Vxx32 += vrmpybus($Vuu32,$Rt32,#$Ii)",
+PSEUDO, TypeMAPPING>, Requires<[HasV60T,UseHVX]> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isAccumulator = 1;
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+let Constraints = "$Vxx32 = $Vxx32in";
+}
+def V6_vrmpybusi_acc_alt_128B : HInst<
+(outs VecDblRegs128B:$Vxx32),
+(ins VecDblRegs128B:$Vxx32in, VecDblRegs128B:$Vuu32, IntRegs:$Rt32, u1_0Imm:$Ii),
+"$Vxx32 += vrmpybus($Vuu32,$Rt32,#$Ii)",
+PSEUDO, TypeMAPPING>, Requires<[HasV60T,UseHVX]> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isAccumulator = 1;
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+let Constraints = "$Vxx32 = $Vxx32in";
+}
+def V6_vrmpybusi_alt : HInst<
+(outs VecDblRegs:$Vdd32),
+(ins VecDblRegs:$Vuu32, IntRegs:$Rt32, u1_0Imm:$Ii),
+"$Vdd32 = vrmpybus($Vuu32,$Rt32,#$Ii)",
+PSEUDO, TypeMAPPING>, Requires<[HasV60T,UseHVX]> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vrmpybusi_alt_128B : HInst<
+(outs VecDblRegs128B:$Vdd32),
+(ins VecDblRegs128B:$Vuu32, IntRegs:$Rt32, u1_0Imm:$Ii),
+"$Vdd32 = vrmpybus($Vuu32,$Rt32,#$Ii)",
+PSEUDO, TypeMAPPING>, Requires<[HasV60T,UseHVX]> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vrmpybusv : HInst<
+(outs VectorRegs:$Vd32),
+(ins VectorRegs:$Vu32, VectorRegs:$Vv32),
+"$Vd32.w = vrmpy($Vu32.ub,$Vv32.b)",
+CVI_VX, TypeCVI_VX>, Enc_6223403, Requires<[HasV60T,UseHVX]> {
+let Inst{7-5} = 0b010;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b00011100000;
+let hasNewValue = 1;
+let opNewValue = 0;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vrmpybusv_128B : HInst<
+(outs VectorRegs128B:$Vd32),
+(ins VectorRegs128B:$Vu32, VectorRegs128B:$Vv32),
+"$Vd32.w = vrmpy($Vu32.ub,$Vv32.b)",
+CVI_VX, TypeCVI_VX>, Enc_6223403, Requires<[HasV60T,UseHVX]> {
+let Inst{7-5} = 0b010;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b00011100000;
+let hasNewValue = 1;
+let opNewValue = 0;
+let DecoderNamespace = "EXT_mmvec";
+let isCodeGenOnly = 1;
+}
+def V6_vrmpybusv_acc : HInst<
+(outs VectorRegs:$Vx32),
+(ins VectorRegs:$Vx32in, VectorRegs:$Vu32, VectorRegs:$Vv32),
+"$Vx32.w += vrmpy($Vu32.ub,$Vv32.b)",
+CVI_VX_DV, TypeCVI_VX_DV>, Enc_2328527, Requires<[HasV60T,UseHVX]> {
+let Inst{7-5} = 0b010;
+let Inst{13-13} = 0b1;
+let Inst{31-21} = 0b00011100000;
+let hasNewValue = 1;
+let opNewValue = 0;
+let isAccumulator = 1;
+let DecoderNamespace = "EXT_mmvec";
+let Constraints = "$Vx32 = $Vx32in";
+}
+def V6_vrmpybusv_acc_128B : HInst<
+(outs VectorRegs128B:$Vx32),
+(ins VectorRegs128B:$Vx32in, VectorRegs128B:$Vu32, VectorRegs128B:$Vv32),
+"$Vx32.w += vrmpy($Vu32.ub,$Vv32.b)",
+CVI_VX_DV, TypeCVI_VX_DV>, Enc_2328527, Requires<[HasV60T,UseHVX]> {
+let Inst{7-5} = 0b010;
+let Inst{13-13} = 0b1;
+let Inst{31-21} = 0b00011100000;
+let hasNewValue = 1;
+let opNewValue = 0;
+let isAccumulator = 1;
+let DecoderNamespace = "EXT_mmvec";
+let isCodeGenOnly = 1;
+let Constraints = "$Vx32 = $Vx32in";
+}
+def V6_vrmpybusv_acc_alt : HInst<
+(outs VectorRegs:$Vx32),
+(ins VectorRegs:$Vx32in, VectorRegs:$Vu32, VectorRegs:$Vv32),
+"$Vx32 += vrmpybus($Vu32,$Vv32)",
+PSEUDO, TypeMAPPING>, Requires<[HasV60T,UseHVX]> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isAccumulator = 1;
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+let Constraints = "$Vx32 = $Vx32in";
+}
+def V6_vrmpybusv_acc_alt_128B : HInst<
+(outs VectorRegs128B:$Vx32),
+(ins VectorRegs128B:$Vx32in, VectorRegs128B:$Vu32, VectorRegs128B:$Vv32),
+"$Vx32 += vrmpybus($Vu32,$Vv32)",
+PSEUDO, TypeMAPPING>, Requires<[HasV60T,UseHVX]> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isAccumulator = 1;
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+let Constraints = "$Vx32 = $Vx32in";
+}
+def V6_vrmpybusv_alt : HInst<
+(outs VectorRegs:$Vd32),
+(ins VectorRegs:$Vu32, VectorRegs:$Vv32),
+"$Vd32 = vrmpybus($Vu32,$Vv32)",
+PSEUDO, TypeMAPPING>, Requires<[HasV60T,UseHVX]> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vrmpybusv_alt_128B : HInst<
+(outs VectorRegs128B:$Vd32),
+(ins VectorRegs128B:$Vu32, VectorRegs128B:$Vv32),
+"$Vd32 = vrmpybus($Vu32,$Vv32)",
+PSEUDO, TypeMAPPING>, Requires<[HasV60T,UseHVX]> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vrmpybv : HInst<
+(outs VectorRegs:$Vd32),
+(ins VectorRegs:$Vu32, VectorRegs:$Vv32),
+"$Vd32.w = vrmpy($Vu32.b,$Vv32.b)",
+CVI_VX, TypeCVI_VX>, Enc_6223403, Requires<[HasV60T,UseHVX]> {
+let Inst{7-5} = 0b001;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b00011100000;
+let hasNewValue = 1;
+let opNewValue = 0;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vrmpybv_128B : HInst<
+(outs VectorRegs128B:$Vd32),
+(ins VectorRegs128B:$Vu32, VectorRegs128B:$Vv32),
+"$Vd32.w = vrmpy($Vu32.b,$Vv32.b)",
+CVI_VX, TypeCVI_VX>, Enc_6223403, Requires<[HasV60T,UseHVX]> {
+let Inst{7-5} = 0b001;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b00011100000;
+let hasNewValue = 1;
+let opNewValue = 0;
+let DecoderNamespace = "EXT_mmvec";
+let isCodeGenOnly = 1;
+}
+def V6_vrmpybv_acc : HInst<
+(outs VectorRegs:$Vx32),
+(ins VectorRegs:$Vx32in, VectorRegs:$Vu32, VectorRegs:$Vv32),
+"$Vx32.w += vrmpy($Vu32.b,$Vv32.b)",
+CVI_VX_DV, TypeCVI_VX_DV>, Enc_2328527, Requires<[HasV60T,UseHVX]> {
+let Inst{7-5} = 0b001;
+let Inst{13-13} = 0b1;
+let Inst{31-21} = 0b00011100000;
+let hasNewValue = 1;
+let opNewValue = 0;
+let isAccumulator = 1;
+let DecoderNamespace = "EXT_mmvec";
+let Constraints = "$Vx32 = $Vx32in";
+}
+def V6_vrmpybv_acc_128B : HInst<
+(outs VectorRegs128B:$Vx32),
+(ins VectorRegs128B:$Vx32in, VectorRegs128B:$Vu32, VectorRegs128B:$Vv32),
+"$Vx32.w += vrmpy($Vu32.b,$Vv32.b)",
+CVI_VX_DV, TypeCVI_VX_DV>, Enc_2328527, Requires<[HasV60T,UseHVX]> {
+let Inst{7-5} = 0b001;
+let Inst{13-13} = 0b1;
+let Inst{31-21} = 0b00011100000;
+let hasNewValue = 1;
+let opNewValue = 0;
+let isAccumulator = 1;
+let DecoderNamespace = "EXT_mmvec";
+let isCodeGenOnly = 1;
+let Constraints = "$Vx32 = $Vx32in";
+}
+def V6_vrmpybv_acc_alt : HInst<
+(outs VectorRegs:$Vx32),
+(ins VectorRegs:$Vx32in, VectorRegs:$Vu32, VectorRegs:$Vv32),
+"$Vx32 += vrmpyb($Vu32,$Vv32)",
+PSEUDO, TypeMAPPING>, Requires<[HasV60T,UseHVX]> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isAccumulator = 1;
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+let Constraints = "$Vx32 = $Vx32in";
+}
+def V6_vrmpybv_acc_alt_128B : HInst<
+(outs VectorRegs128B:$Vx32),
+(ins VectorRegs128B:$Vx32in, VectorRegs128B:$Vu32, VectorRegs128B:$Vv32),
+"$Vx32 += vrmpyb($Vu32,$Vv32)",
+PSEUDO, TypeMAPPING>, Requires<[HasV60T,UseHVX]> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isAccumulator = 1;
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+let Constraints = "$Vx32 = $Vx32in";
+}
+def V6_vrmpybv_alt : HInst<
+(outs VectorRegs:$Vd32),
+(ins VectorRegs:$Vu32, VectorRegs:$Vv32),
+"$Vd32 = vrmpyb($Vu32,$Vv32)",
+PSEUDO, TypeMAPPING>, Requires<[HasV60T,UseHVX]> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vrmpybv_alt_128B : HInst<
+(outs VectorRegs128B:$Vd32),
+(ins VectorRegs128B:$Vu32, VectorRegs128B:$Vv32),
+"$Vd32 = vrmpyb($Vu32,$Vv32)",
+PSEUDO, TypeMAPPING>, Requires<[HasV60T,UseHVX]> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vrmpyub : HInst<
+(outs VectorRegs:$Vd32),
+(ins VectorRegs:$Vu32, IntRegs:$Rt32),
+"$Vd32.uw = vrmpy($Vu32.ub,$Rt32.ub)",
+CVI_VX, TypeCVI_VX>, Enc_16214129, Requires<[HasV60T,UseHVX]> {
+let Inst{7-5} = 0b011;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b00011001000;
+let hasNewValue = 1;
+let opNewValue = 0;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vrmpyub_128B : HInst<
+(outs VectorRegs128B:$Vd32),
+(ins VectorRegs128B:$Vu32, IntRegs:$Rt32),
+"$Vd32.uw = vrmpy($Vu32.ub,$Rt32.ub)",
+CVI_VX, TypeCVI_VX>, Enc_16214129, Requires<[HasV60T,UseHVX]> {
+let Inst{7-5} = 0b011;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b00011001000;
+let hasNewValue = 1;
+let opNewValue = 0;
+let DecoderNamespace = "EXT_mmvec";
+let isCodeGenOnly = 1;
+}
+def V6_vrmpyub_acc : HInst<
+(outs VectorRegs:$Vx32),
+(ins VectorRegs:$Vx32in, VectorRegs:$Vu32, IntRegs:$Rt32),
+"$Vx32.uw += vrmpy($Vu32.ub,$Rt32.ub)",
+CVI_VX, TypeCVI_VX>, Enc_10058269, Requires<[HasV60T,UseHVX]> {
+let Inst{7-5} = 0b100;
+let Inst{13-13} = 0b1;
+let Inst{31-21} = 0b00011001000;
+let hasNewValue = 1;
+let opNewValue = 0;
+let isAccumulator = 1;
+let DecoderNamespace = "EXT_mmvec";
+let Constraints = "$Vx32 = $Vx32in";
+}
+def V6_vrmpyub_acc_128B : HInst<
+(outs VectorRegs128B:$Vx32),
+(ins VectorRegs128B:$Vx32in, VectorRegs128B:$Vu32, IntRegs:$Rt32),
+"$Vx32.uw += vrmpy($Vu32.ub,$Rt32.ub)",
+CVI_VX, TypeCVI_VX>, Enc_10058269, Requires<[HasV60T,UseHVX]> {
+let Inst{7-5} = 0b100;
+let Inst{13-13} = 0b1;
+let Inst{31-21} = 0b00011001000;
+let hasNewValue = 1;
+let opNewValue = 0;
+let isAccumulator = 1;
+let DecoderNamespace = "EXT_mmvec";
+let isCodeGenOnly = 1;
+let Constraints = "$Vx32 = $Vx32in";
+}
+def V6_vrmpyub_acc_alt : HInst<
+(outs VectorRegs:$Vx32),
+(ins VectorRegs:$Vx32in, VectorRegs:$Vu32, IntRegs:$Rt32),
+"$Vx32 += vrmpyub($Vu32,$Rt32)",
+PSEUDO, TypeMAPPING>, Requires<[HasV60T,UseHVX]> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isAccumulator = 1;
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+let Constraints = "$Vx32 = $Vx32in";
+}
+def V6_vrmpyub_acc_alt_128B : HInst<
+(outs VectorRegs128B:$Vx32),
+(ins VectorRegs128B:$Vx32in, VectorRegs128B:$Vu32, IntRegs:$Rt32),
+"$Vx32 += vrmpyub($Vu32,$Rt32)",
+PSEUDO, TypeMAPPING>, Requires<[HasV60T,UseHVX]> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isAccumulator = 1;
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+let Constraints = "$Vx32 = $Vx32in";
+}
+def V6_vrmpyub_alt : HInst<
+(outs VectorRegs:$Vd32),
+(ins VectorRegs:$Vu32, IntRegs:$Rt32),
+"$Vd32 = vrmpyub($Vu32,$Rt32)",
+PSEUDO, TypeMAPPING>, Requires<[HasV60T,UseHVX]> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vrmpyub_alt_128B : HInst<
+(outs VectorRegs128B:$Vd32),
+(ins VectorRegs128B:$Vu32, IntRegs:$Rt32),
+"$Vd32 = vrmpyub($Vu32,$Rt32)",
+PSEUDO, TypeMAPPING>, Requires<[HasV60T,UseHVX]> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vrmpyubi : HInst<
+(outs VecDblRegs:$Vdd32),
+(ins VecDblRegs:$Vuu32, IntRegs:$Rt32, u1_0Imm:$Ii),
+"$Vdd32.uw = vrmpy($Vuu32.ub,$Rt32.ub,#$Ii)",
+CVI_VX_DV_LONG, TypeCVI_VX_DV>, Enc_14172170, Requires<[HasV60T,UseHVX]> {
+let Inst{7-6} = 0b11;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b00011001101;
+let hasNewValue = 1;
+let opNewValue = 0;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vrmpyubi_128B : HInst<
+(outs VecDblRegs128B:$Vdd32),
+(ins VecDblRegs128B:$Vuu32, IntRegs:$Rt32, u1_0Imm:$Ii),
+"$Vdd32.uw = vrmpy($Vuu32.ub,$Rt32.ub,#$Ii)",
+CVI_VX_DV_LONG, TypeCVI_VX_DV>, Enc_14172170, Requires<[HasV60T,UseHVX]> {
+let Inst{7-6} = 0b11;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b00011001101;
+let hasNewValue = 1;
+let opNewValue = 0;
+let DecoderNamespace = "EXT_mmvec";
+let isCodeGenOnly = 1;
+}
+def V6_vrmpyubi_acc : HInst<
+(outs VecDblRegs:$Vxx32),
+(ins VecDblRegs:$Vxx32in, VecDblRegs:$Vuu32, IntRegs:$Rt32, u1_0Imm:$Ii),
+"$Vxx32.uw += vrmpy($Vuu32.ub,$Rt32.ub,#$Ii)",
+CVI_VX_DV_LONG, TypeCVI_VX_DV>, Enc_13189194, Requires<[HasV60T,UseHVX]> {
+let Inst{7-6} = 0b11;
+let Inst{13-13} = 0b1;
+let Inst{31-21} = 0b00011001011;
+let hasNewValue = 1;
+let opNewValue = 0;
+let isAccumulator = 1;
+let DecoderNamespace = "EXT_mmvec";
+let Constraints = "$Vxx32 = $Vxx32in";
+}
+def V6_vrmpyubi_acc_128B : HInst<
+(outs VecDblRegs128B:$Vxx32),
+(ins VecDblRegs128B:$Vxx32in, VecDblRegs128B:$Vuu32, IntRegs:$Rt32, u1_0Imm:$Ii),
+"$Vxx32.uw += vrmpy($Vuu32.ub,$Rt32.ub,#$Ii)",
+CVI_VX_DV_LONG, TypeCVI_VX_DV>, Enc_13189194, Requires<[HasV60T,UseHVX]> {
+let Inst{7-6} = 0b11;
+let Inst{13-13} = 0b1;
+let Inst{31-21} = 0b00011001011;
+let hasNewValue = 1;
+let opNewValue = 0;
+let isAccumulator = 1;
+let DecoderNamespace = "EXT_mmvec";
+let isCodeGenOnly = 1;
+let Constraints = "$Vxx32 = $Vxx32in";
+}
+def V6_vrmpyubi_acc_alt : HInst<
+(outs VecDblRegs:$Vxx32),
+(ins VecDblRegs:$Vxx32in, VecDblRegs:$Vuu32, IntRegs:$Rt32, u1_0Imm:$Ii),
+"$Vxx32 += vrmpyub($Vuu32,$Rt32,#$Ii)",
+PSEUDO, TypeMAPPING>, Requires<[HasV60T,UseHVX]> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isAccumulator = 1;
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+let Constraints = "$Vxx32 = $Vxx32in";
+}
+def V6_vrmpyubi_acc_alt_128B : HInst<
+(outs VecDblRegs128B:$Vxx32),
+(ins VecDblRegs128B:$Vxx32in, VecDblRegs128B:$Vuu32, IntRegs:$Rt32, u1_0Imm:$Ii),
+"$Vxx32 += vrmpyub($Vuu32,$Rt32,#$Ii)",
+PSEUDO, TypeMAPPING>, Requires<[HasV60T,UseHVX]> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isAccumulator = 1;
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+let Constraints = "$Vxx32 = $Vxx32in";
+}
+def V6_vrmpyubi_alt : HInst<
+(outs VecDblRegs:$Vdd32),
+(ins VecDblRegs:$Vuu32, IntRegs:$Rt32, u1_0Imm:$Ii),
+"$Vdd32 = vrmpyub($Vuu32,$Rt32,#$Ii)",
+PSEUDO, TypeMAPPING>, Requires<[HasV60T,UseHVX]> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vrmpyubi_alt_128B : HInst<
+(outs VecDblRegs128B:$Vdd32),
+(ins VecDblRegs128B:$Vuu32, IntRegs:$Rt32, u1_0Imm:$Ii),
+"$Vdd32 = vrmpyub($Vuu32,$Rt32,#$Ii)",
+PSEUDO, TypeMAPPING>, Requires<[HasV60T,UseHVX]> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vrmpyubv : HInst<
+(outs VectorRegs:$Vd32),
+(ins VectorRegs:$Vu32, VectorRegs:$Vv32),
+"$Vd32.uw = vrmpy($Vu32.ub,$Vv32.ub)",
+CVI_VX, TypeCVI_VX>, Enc_6223403, Requires<[HasV60T,UseHVX]> {
+let Inst{7-5} = 0b000;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b00011100000;
+let hasNewValue = 1;
+let opNewValue = 0;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vrmpyubv_128B : HInst<
+(outs VectorRegs128B:$Vd32),
+(ins VectorRegs128B:$Vu32, VectorRegs128B:$Vv32),
+"$Vd32.uw = vrmpy($Vu32.ub,$Vv32.ub)",
+CVI_VX, TypeCVI_VX>, Enc_6223403, Requires<[HasV60T,UseHVX]> {
+let Inst{7-5} = 0b000;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b00011100000;
+let hasNewValue = 1;
+let opNewValue = 0;
+let DecoderNamespace = "EXT_mmvec";
+let isCodeGenOnly = 1;
+}
+def V6_vrmpyubv_acc : HInst<
+(outs VectorRegs:$Vx32),
+(ins VectorRegs:$Vx32in, VectorRegs:$Vu32, VectorRegs:$Vv32),
+"$Vx32.uw += vrmpy($Vu32.ub,$Vv32.ub)",
+CVI_VX_DV, TypeCVI_VX_DV>, Enc_2328527, Requires<[HasV60T,UseHVX]> {
+let Inst{7-5} = 0b000;
+let Inst{13-13} = 0b1;
+let Inst{31-21} = 0b00011100000;
+let hasNewValue = 1;
+let opNewValue = 0;
+let isAccumulator = 1;
+let DecoderNamespace = "EXT_mmvec";
+let Constraints = "$Vx32 = $Vx32in";
+}
+def V6_vrmpyubv_acc_128B : HInst<
+(outs VectorRegs128B:$Vx32),
+(ins VectorRegs128B:$Vx32in, VectorRegs128B:$Vu32, VectorRegs128B:$Vv32),
+"$Vx32.uw += vrmpy($Vu32.ub,$Vv32.ub)",
+CVI_VX_DV, TypeCVI_VX_DV>, Enc_2328527, Requires<[HasV60T,UseHVX]> {
+let Inst{7-5} = 0b000;
+let Inst{13-13} = 0b1;
+let Inst{31-21} = 0b00011100000;
+let hasNewValue = 1;
+let opNewValue = 0;
+let isAccumulator = 1;
+let DecoderNamespace = "EXT_mmvec";
+let isCodeGenOnly = 1;
+let Constraints = "$Vx32 = $Vx32in";
+}
+def V6_vrmpyubv_acc_alt : HInst<
+(outs VectorRegs:$Vx32),
+(ins VectorRegs:$Vx32in, VectorRegs:$Vu32, VectorRegs:$Vv32),
+"$Vx32 += vrmpyub($Vu32,$Vv32)",
+PSEUDO, TypeMAPPING>, Requires<[HasV60T,UseHVX]> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isAccumulator = 1;
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+let Constraints = "$Vx32 = $Vx32in";
+}
+def V6_vrmpyubv_acc_alt_128B : HInst<
+(outs VectorRegs128B:$Vx32),
+(ins VectorRegs128B:$Vx32in, VectorRegs128B:$Vu32, VectorRegs128B:$Vv32),
+"$Vx32 += vrmpyub($Vu32,$Vv32)",
+PSEUDO, TypeMAPPING>, Requires<[HasV60T,UseHVX]> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isAccumulator = 1;
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+let Constraints = "$Vx32 = $Vx32in";
+}
+def V6_vrmpyubv_alt : HInst<
+(outs VectorRegs:$Vd32),
+(ins VectorRegs:$Vu32, VectorRegs:$Vv32),
+"$Vd32 = vrmpyub($Vu32,$Vv32)",
+PSEUDO, TypeMAPPING>, Requires<[HasV60T,UseHVX]> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vrmpyubv_alt_128B : HInst<
+(outs VectorRegs128B:$Vd32),
+(ins VectorRegs128B:$Vu32, VectorRegs128B:$Vv32),
+"$Vd32 = vrmpyub($Vu32,$Vv32)",
+PSEUDO, TypeMAPPING>, Requires<[HasV60T,UseHVX]> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vror : HInst<
+(outs VectorRegs:$Vd32),
+(ins VectorRegs:$Vu32, IntRegs:$Rt32),
+"$Vd32 = vror($Vu32,$Rt32)",
+CVI_VP, TypeCVI_VP>, Enc_16214129, Requires<[HasV60T,UseHVX]> {
+let Inst{7-5} = 0b001;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b00011001011;
+let hasNewValue = 1;
+let opNewValue = 0;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vror_128B : HInst<
+(outs VectorRegs128B:$Vd32),
+(ins VectorRegs128B:$Vu32, IntRegs:$Rt32),
+"$Vd32 = vror($Vu32,$Rt32)",
+CVI_VP, TypeCVI_VP>, Enc_16214129, Requires<[HasV60T,UseHVX]> {
+let Inst{7-5} = 0b001;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b00011001011;
+let hasNewValue = 1;
+let opNewValue = 0;
+let DecoderNamespace = "EXT_mmvec";
+let isCodeGenOnly = 1;
+}
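+// vround* round-and-pack with saturation (":sat"). The unsigned-input
+// forms (vrounduhub, vrounduwuh) require HasV62T rather than HasV60T;
+// within the shared Inst{31-21} class, Inst{7-5} distinguishes the
+// variants.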
+def V6_vroundhb : HInst<
+(outs VectorRegs:$Vd32),
+(ins VectorRegs:$Vu32, VectorRegs:$Vv32),
+"$Vd32.b = vround($Vu32.h,$Vv32.h):sat",
+CVI_VS, TypeCVI_VS>, Enc_6223403, Requires<[HasV60T,UseHVX]> {
+let Inst{7-5} = 0b110;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b00011111011;
+let hasNewValue = 1;
+let opNewValue = 0;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vroundhb_128B : HInst<
+(outs VectorRegs128B:$Vd32),
+(ins VectorRegs128B:$Vu32, VectorRegs128B:$Vv32),
+"$Vd32.b = vround($Vu32.h,$Vv32.h):sat",
+CVI_VS, TypeCVI_VS>, Enc_6223403, Requires<[HasV60T,UseHVX]> {
+let Inst{7-5} = 0b110;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b00011111011;
+let hasNewValue = 1;
+let opNewValue = 0;
+let DecoderNamespace = "EXT_mmvec";
+let isCodeGenOnly = 1;
+}
+def V6_vroundhb_alt : HInst<
+(outs VectorRegs:$Vd32),
+(ins VectorRegs:$Vu32, VectorRegs:$Vv32),
+"$Vd32 = vroundhb($Vu32,$Vv32):sat",
+PSEUDO, TypeMAPPING>, Requires<[HasV60T,UseHVX]> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vroundhb_alt_128B : HInst<
+(outs VectorRegs128B:$Vd32),
+(ins VectorRegs128B:$Vu32, VectorRegs128B:$Vv32),
+"$Vd32 = vroundhb($Vu32,$Vv32):sat",
+PSEUDO, TypeMAPPING>, Requires<[HasV60T,UseHVX]> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vroundhub : HInst<
+(outs VectorRegs:$Vd32),
+(ins VectorRegs:$Vu32, VectorRegs:$Vv32),
+"$Vd32.ub = vround($Vu32.h,$Vv32.h):sat",
+CVI_VS, TypeCVI_VS>, Enc_6223403, Requires<[HasV60T,UseHVX]> {
+let Inst{7-5} = 0b111;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b00011111011;
+let hasNewValue = 1;
+let opNewValue = 0;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vroundhub_128B : HInst<
+(outs VectorRegs128B:$Vd32),
+(ins VectorRegs128B:$Vu32, VectorRegs128B:$Vv32),
+"$Vd32.ub = vround($Vu32.h,$Vv32.h):sat",
+CVI_VS, TypeCVI_VS>, Enc_6223403, Requires<[HasV60T,UseHVX]> {
+let Inst{7-5} = 0b111;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b00011111011;
+let hasNewValue = 1;
+let opNewValue = 0;
+let DecoderNamespace = "EXT_mmvec";
+let isCodeGenOnly = 1;
+}
+def V6_vroundhub_alt : HInst<
+(outs VectorRegs:$Vd32),
+(ins VectorRegs:$Vu32, VectorRegs:$Vv32),
+"$Vd32 = vroundhub($Vu32,$Vv32):sat",
+PSEUDO, TypeMAPPING>, Requires<[HasV60T,UseHVX]> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vroundhub_alt_128B : HInst<
+(outs VectorRegs128B:$Vd32),
+(ins VectorRegs128B:$Vu32, VectorRegs128B:$Vv32),
+"$Vd32 = vroundhub($Vu32,$Vv32):sat",
+PSEUDO, TypeMAPPING>, Requires<[HasV60T,UseHVX]> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vrounduhub : HInst<
+(outs VectorRegs:$Vd32),
+(ins VectorRegs:$Vu32, VectorRegs:$Vv32),
+"$Vd32.ub = vround($Vu32.uh,$Vv32.uh):sat",
+CVI_VS, TypeCVI_VS>, Enc_6223403, Requires<[HasV62T,UseHVX]> {
+let Inst{7-5} = 0b011;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b00011111111;
+let hasNewValue = 1;
+let opNewValue = 0;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vrounduhub_128B : HInst<
+(outs VectorRegs128B:$Vd32),
+(ins VectorRegs128B:$Vu32, VectorRegs128B:$Vv32),
+"$Vd32.ub = vround($Vu32.uh,$Vv32.uh):sat",
+CVI_VS, TypeCVI_VS>, Enc_6223403, Requires<[HasV62T,UseHVX]> {
+let Inst{7-5} = 0b011;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b00011111111;
+let hasNewValue = 1;
+let opNewValue = 0;
+let DecoderNamespace = "EXT_mmvec";
+let isCodeGenOnly = 1;
+}
+def V6_vrounduhub_alt : HInst<
+(outs VectorRegs:$Vd32),
+(ins VectorRegs:$Vu32, VectorRegs:$Vv32),
+"$Vd32 = vrounduhub($Vu32,$Vv32):sat",
+PSEUDO, TypeMAPPING>, Requires<[HasV62T,UseHVX]> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vrounduhub_alt_128B : HInst<
+(outs VectorRegs128B:$Vd32),
+(ins VectorRegs128B:$Vu32, VectorRegs128B:$Vv32),
+"$Vd32 = vrounduhub($Vu32,$Vv32):sat",
+PSEUDO, TypeMAPPING>, Requires<[HasV62T,UseHVX]> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vrounduwuh : HInst<
+(outs VectorRegs:$Vd32),
+(ins VectorRegs:$Vu32, VectorRegs:$Vv32),
+"$Vd32.uh = vround($Vu32.uw,$Vv32.uw):sat",
+CVI_VS, TypeCVI_VS>, Enc_6223403, Requires<[HasV62T,UseHVX]> {
+let Inst{7-5} = 0b100;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b00011111111;
+let hasNewValue = 1;
+let opNewValue = 0;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vrounduwuh_128B : HInst<
+(outs VectorRegs128B:$Vd32),
+(ins VectorRegs128B:$Vu32, VectorRegs128B:$Vv32),
+"$Vd32.uh = vround($Vu32.uw,$Vv32.uw):sat",
+CVI_VS, TypeCVI_VS>, Enc_6223403, Requires<[HasV62T,UseHVX]> {
+let Inst{7-5} = 0b100;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b00011111111;
+let hasNewValue = 1;
+let opNewValue = 0;
+let DecoderNamespace = "EXT_mmvec";
+let isCodeGenOnly = 1;
+}
+def V6_vrounduwuh_alt : HInst<
+(outs VectorRegs:$Vd32),
+(ins VectorRegs:$Vu32, VectorRegs:$Vv32),
+"$Vd32 = vrounduwuh($Vu32,$Vv32):sat",
+PSEUDO, TypeMAPPING>, Requires<[HasV62T,UseHVX]> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vrounduwuh_alt_128B : HInst<
+(outs VectorRegs128B:$Vd32),
+(ins VectorRegs128B:$Vu32, VectorRegs128B:$Vv32),
+"$Vd32 = vrounduwuh($Vu32,$Vv32):sat",
+PSEUDO, TypeMAPPING>, Requires<[HasV62T,UseHVX]> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vroundwh : HInst<
+(outs VectorRegs:$Vd32),
+(ins VectorRegs:$Vu32, VectorRegs:$Vv32),
+"$Vd32.h = vround($Vu32.w,$Vv32.w):sat",
+CVI_VS, TypeCVI_VS>, Enc_6223403, Requires<[HasV60T,UseHVX]> {
+let Inst{7-5} = 0b100;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b00011111011;
+let hasNewValue = 1;
+let opNewValue = 0;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vroundwh_128B : HInst<
+(outs VectorRegs128B:$Vd32),
+(ins VectorRegs128B:$Vu32, VectorRegs128B:$Vv32),
+"$Vd32.h = vround($Vu32.w,$Vv32.w):sat",
+CVI_VS, TypeCVI_VS>, Enc_6223403, Requires<[HasV60T,UseHVX]> {
+let Inst{7-5} = 0b100;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b00011111011;
+let hasNewValue = 1;
+let opNewValue = 0;
+let DecoderNamespace = "EXT_mmvec";
+let isCodeGenOnly = 1;
+}
+def V6_vroundwh_alt : HInst<
+(outs VectorRegs:$Vd32),
+(ins VectorRegs:$Vu32, VectorRegs:$Vv32),
+"$Vd32 = vroundwh($Vu32,$Vv32):sat",
+PSEUDO, TypeMAPPING>, Requires<[HasV60T,UseHVX]> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vroundwh_alt_128B : HInst<
+(outs VectorRegs128B:$Vd32),
+(ins VectorRegs128B:$Vu32, VectorRegs128B:$Vv32),
+"$Vd32 = vroundwh($Vu32,$Vv32):sat",
+PSEUDO, TypeMAPPING>, Requires<[HasV60T,UseHVX]> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vroundwuh : HInst<
+(outs VectorRegs:$Vd32),
+(ins VectorRegs:$Vu32, VectorRegs:$Vv32),
+"$Vd32.uh = vround($Vu32.w,$Vv32.w):sat",
+CVI_VS, TypeCVI_VS>, Enc_6223403, Requires<[HasV60T,UseHVX]> {
+let Inst{7-5} = 0b101;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b00011111011;
+let hasNewValue = 1;
+let opNewValue = 0;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vroundwuh_128B : HInst<
+(outs VectorRegs128B:$Vd32),
+(ins VectorRegs128B:$Vu32, VectorRegs128B:$Vv32),
+"$Vd32.uh = vround($Vu32.w,$Vv32.w):sat",
+CVI_VS, TypeCVI_VS>, Enc_6223403, Requires<[HasV60T,UseHVX]> {
+let Inst{7-5} = 0b101;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b00011111011;
+let hasNewValue = 1;
+let opNewValue = 0;
+let DecoderNamespace = "EXT_mmvec";
+let isCodeGenOnly = 1;
+}
+def V6_vroundwuh_alt : HInst<
+(outs VectorRegs:$Vd32),
+(ins VectorRegs:$Vu32, VectorRegs:$Vv32),
+"$Vd32 = vroundwuh($Vu32,$Vv32):sat",
+PSEUDO, TypeMAPPING>, Requires<[HasV60T,UseHVX]> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vroundwuh_alt_128B : HInst<
+(outs VectorRegs128B:$Vd32),
+(ins VectorRegs128B:$Vu32, VectorRegs128B:$Vv32),
+"$Vd32 = vroundwuh($Vu32,$Vv32):sat",
+PSEUDO, TypeMAPPING>, Requires<[HasV60T,UseHVX]> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vrsadubi : HInst<
+(outs VecDblRegs:$Vdd32),
+(ins VecDblRegs:$Vuu32, IntRegs:$Rt32, u1_0Imm:$Ii),
+"$Vdd32.uw = vrsad($Vuu32.ub,$Rt32.ub,#$Ii)",
+CVI_VX_DV_LONG, TypeCVI_VX_DV>, Enc_14172170, Requires<[HasV60T,UseHVX]> {
+let Inst{7-6} = 0b11;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b00011001010;
+let hasNewValue = 1;
+let opNewValue = 0;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vrsadubi_128B : HInst<
+(outs VecDblRegs128B:$Vdd32),
+(ins VecDblRegs128B:$Vuu32, IntRegs:$Rt32, u1_0Imm:$Ii),
+"$Vdd32.uw = vrsad($Vuu32.ub,$Rt32.ub,#$Ii)",
+CVI_VX_DV_LONG, TypeCVI_VX_DV>, Enc_14172170, Requires<[HasV60T,UseHVX]> {
+let Inst{7-6} = 0b11;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b00011001010;
+let hasNewValue = 1;
+let opNewValue = 0;
+let DecoderNamespace = "EXT_mmvec";
+let isCodeGenOnly = 1;
+}
+def V6_vrsadubi_acc : HInst<
+(outs VecDblRegs:$Vxx32),
+(ins VecDblRegs:$Vxx32in, VecDblRegs:$Vuu32, IntRegs:$Rt32, u1_0Imm:$Ii),
+"$Vxx32.uw += vrsad($Vuu32.ub,$Rt32.ub,#$Ii)",
+CVI_VX_DV_LONG, TypeCVI_VX_DV>, Enc_13189194, Requires<[HasV60T,UseHVX]> {
+let Inst{7-6} = 0b11;
+let Inst{13-13} = 0b1;
+let Inst{31-21} = 0b00011001010;
+let hasNewValue = 1;
+let opNewValue = 0;
+let isAccumulator = 1;
+let DecoderNamespace = "EXT_mmvec";
+let Constraints = "$Vxx32 = $Vxx32in";
+}
+def V6_vrsadubi_acc_128B : HInst<
+(outs VecDblRegs128B:$Vxx32),
+(ins VecDblRegs128B:$Vxx32in, VecDblRegs128B:$Vuu32, IntRegs:$Rt32, u1_0Imm:$Ii),
+"$Vxx32.uw += vrsad($Vuu32.ub,$Rt32.ub,#$Ii)",
+CVI_VX_DV_LONG, TypeCVI_VX_DV>, Enc_13189194, Requires<[HasV60T,UseHVX]> {
+let Inst{7-6} = 0b11;
+let Inst{13-13} = 0b1;
+let Inst{31-21} = 0b00011001010;
+let hasNewValue = 1;
+let opNewValue = 0;
+let isAccumulator = 1;
+let DecoderNamespace = "EXT_mmvec";
+let isCodeGenOnly = 1;
+let Constraints = "$Vxx32 = $Vxx32in";
+}
+def V6_vrsadubi_acc_alt : HInst<
+(outs VecDblRegs:$Vxx32),
+(ins VecDblRegs:$Vxx32in, VecDblRegs:$Vuu32, IntRegs:$Rt32, u1_0Imm:$Ii),
+"$Vxx32 += vrsadub($Vuu32,$Rt32,#$Ii)",
+PSEUDO, TypeMAPPING>, Requires<[HasV60T,UseHVX]> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isAccumulator = 1;
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+let Constraints = "$Vxx32 = $Vxx32in";
+}
+def V6_vrsadubi_acc_alt_128B : HInst<
+(outs VecDblRegs128B:$Vxx32),
+(ins VecDblRegs128B:$Vxx32in, VecDblRegs128B:$Vuu32, IntRegs:$Rt32, u1_0Imm:$Ii),
+"$Vxx32 += vrsadub($Vuu32,$Rt32,#$Ii)",
+PSEUDO, TypeMAPPING>, Requires<[HasV60T,UseHVX]> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isAccumulator = 1;
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+let Constraints = "$Vxx32 = $Vxx32in";
+}
+def V6_vrsadubi_alt : HInst<
+(outs VecDblRegs:$Vdd32),
+(ins VecDblRegs:$Vuu32, IntRegs:$Rt32, u1_0Imm:$Ii),
+"$Vdd32 = vrsadub($Vuu32,$Rt32,#$Ii)",
+PSEUDO, TypeMAPPING>, Requires<[HasV60T,UseHVX]> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vrsadubi_alt_128B : HInst<
+(outs VecDblRegs128B:$Vdd32),
+(ins VecDblRegs128B:$Vuu32, IntRegs:$Rt32, u1_0Imm:$Ii),
+"$Vdd32 = vrsadub($Vuu32,$Rt32,#$Ii)",
+PSEUDO, TypeMAPPING>, Requires<[HasV60T,UseHVX]> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+}
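+// vsat* pack with per-lane saturation; the V60 forms use the
+// CVI_VINLANESAT itinerary, while the V62-only vsatuwuh is plain CVI_VA.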
+def V6_vsathub : HInst<
+(outs VectorRegs:$Vd32),
+(ins VectorRegs:$Vu32, VectorRegs:$Vv32),
+"$Vd32.ub = vsat($Vu32.h,$Vv32.h)",
+CVI_VINLANESAT, TypeCVI_VINLANESAT>, Enc_6223403, Requires<[HasV60T,UseHVX]> {
+let Inst{7-5} = 0b010;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b00011111011;
+let hasNewValue = 1;
+let opNewValue = 0;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vsathub_128B : HInst<
+(outs VectorRegs128B:$Vd32),
+(ins VectorRegs128B:$Vu32, VectorRegs128B:$Vv32),
+"$Vd32.ub = vsat($Vu32.h,$Vv32.h)",
+CVI_VINLANESAT, TypeCVI_VINLANESAT>, Enc_6223403, Requires<[HasV60T,UseHVX]> {
+let Inst{7-5} = 0b010;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b00011111011;
+let hasNewValue = 1;
+let opNewValue = 0;
+let DecoderNamespace = "EXT_mmvec";
+let isCodeGenOnly = 1;
+}
+def V6_vsathub_alt : HInst<
+(outs VectorRegs:$Vd32),
+(ins VectorRegs:$Vu32, VectorRegs:$Vv32),
+"$Vd32 = vsathub($Vu32,$Vv32)",
+PSEUDO, TypeMAPPING>, Requires<[HasV60T,UseHVX]> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vsathub_alt_128B : HInst<
+(outs VectorRegs128B:$Vd32),
+(ins VectorRegs128B:$Vu32, VectorRegs128B:$Vv32),
+"$Vd32 = vsathub($Vu32,$Vv32)",
+PSEUDO, TypeMAPPING>, Requires<[HasV60T,UseHVX]> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vsatuwuh : HInst<
+(outs VectorRegs:$Vd32),
+(ins VectorRegs:$Vu32, VectorRegs:$Vv32),
+"$Vd32.uh = vsat($Vu32.uw,$Vv32.uw)",
+CVI_VA, TypeCVI_VA>, Enc_6223403, Requires<[HasV62T,UseHVX]> {
+let Inst{7-5} = 0b110;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b00011111001;
+let hasNewValue = 1;
+let opNewValue = 0;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vsatuwuh_128B : HInst<
+(outs VectorRegs128B:$Vd32),
+(ins VectorRegs128B:$Vu32, VectorRegs128B:$Vv32),
+"$Vd32.uh = vsat($Vu32.uw,$Vv32.uw)",
+CVI_VA, TypeCVI_VA>, Enc_6223403, Requires<[HasV62T,UseHVX]> {
+let Inst{7-5} = 0b110;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b00011111001;
+let hasNewValue = 1;
+let opNewValue = 0;
+let DecoderNamespace = "EXT_mmvec";
+let isCodeGenOnly = 1;
+}
+def V6_vsatuwuh_alt : HInst<
+(outs VectorRegs:$Vd32),
+(ins VectorRegs:$Vu32, VectorRegs:$Vv32),
+"$Vd32 = vsatuwuh($Vu32,$Vv32)",
+PSEUDO, TypeMAPPING>, Requires<[HasV62T,UseHVX]> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vsatuwuh_alt_128B : HInst<
+(outs VectorRegs128B:$Vd32),
+(ins VectorRegs128B:$Vu32, VectorRegs128B:$Vv32),
+"$Vd32 = vsatuwuh($Vu32,$Vv32)",
+PSEUDO, TypeMAPPING>, Requires<[HasV62T,UseHVX]> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vsatwh : HInst<
+(outs VectorRegs:$Vd32),
+(ins VectorRegs:$Vu32, VectorRegs:$Vv32),
+"$Vd32.h = vsat($Vu32.w,$Vv32.w)",
+CVI_VINLANESAT, TypeCVI_VINLANESAT>, Enc_6223403, Requires<[HasV60T,UseHVX]> {
+let Inst{7-5} = 0b011;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b00011111011;
+let hasNewValue = 1;
+let opNewValue = 0;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vsatwh_128B : HInst<
+(outs VectorRegs128B:$Vd32),
+(ins VectorRegs128B:$Vu32, VectorRegs128B:$Vv32),
+"$Vd32.h = vsat($Vu32.w,$Vv32.w)",
+CVI_VINLANESAT, TypeCVI_VINLANESAT>, Enc_6223403, Requires<[HasV60T,UseHVX]> {
+let Inst{7-5} = 0b011;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b00011111011;
+let hasNewValue = 1;
+let opNewValue = 0;
+let DecoderNamespace = "EXT_mmvec";
+let isCodeGenOnly = 1;
+}
+def V6_vsatwh_alt : HInst<
+(outs VectorRegs:$Vd32),
+(ins VectorRegs:$Vu32, VectorRegs:$Vv32),
+"$Vd32 = vsatwh($Vu32,$Vv32)",
+PSEUDO, TypeMAPPING>, Requires<[HasV60T,UseHVX]> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vsatwh_alt_128B : HInst<
+(outs VectorRegs128B:$Vd32),
+(ins VectorRegs128B:$Vu32, VectorRegs128B:$Vv32),
+"$Vd32 = vsatwh($Vu32,$Vv32)",
+PSEUDO, TypeMAPPING>, Requires<[HasV60T,UseHVX]> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vsb : HInst<
+(outs VecDblRegs:$Vdd32),
+(ins VectorRegs:$Vu32),
+"$Vdd32.h = vsxt($Vu32.b)",
+CVI_VA_DV, TypeCVI_VA_DV>, Enc_14631806, Requires<[HasV60T,UseHVX]> {
+let Inst{7-5} = 0b011;
+let Inst{13-13} = 0b0;
+let Inst{31-16} = 0b0001111000000010;
+let hasNewValue = 1;
+let opNewValue = 0;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vsb_128B : HInst<
+(outs VecDblRegs128B:$Vdd32),
+(ins VectorRegs128B:$Vu32),
+"$Vdd32.h = vsxt($Vu32.b)",
+CVI_VA_DV, TypeCVI_VA_DV>, Enc_14631806, Requires<[HasV60T,UseHVX]> {
+let Inst{7-5} = 0b011;
+let Inst{13-13} = 0b0;
+let Inst{31-16} = 0b0001111000000010;
+let hasNewValue = 1;
+let opNewValue = 0;
+let DecoderNamespace = "EXT_mmvec";
+let isCodeGenOnly = 1;
+}
+def V6_vsb_alt : HInst<
+(outs VecDblRegs:$Vdd32),
+(ins VectorRegs:$Vu32),
+"$Vdd32 = vsxtb($Vu32)",
+PSEUDO, TypeMAPPING>, Requires<[HasV60T,UseHVX]> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vsb_alt_128B : HInst<
+(outs VecDblRegs128B:$Vdd32),
+(ins VectorRegs128B:$Vu32),
+"$Vdd32 = vsxtb($Vu32)",
+PSEUDO, TypeMAPPING>, Requires<[HasV60T,UseHVX]> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vsh : HInst<
+(outs VecDblRegs:$Vdd32),
+(ins VectorRegs:$Vu32),
+"$Vdd32.w = vsxt($Vu32.h)",
+CVI_VA_DV, TypeCVI_VA_DV>, Enc_14631806, Requires<[HasV60T,UseHVX]> {
+let Inst{7-5} = 0b100;
+let Inst{13-13} = 0b0;
+let Inst{31-16} = 0b0001111000000010;
+let hasNewValue = 1;
+let opNewValue = 0;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vsh_128B : HInst<
+(outs VecDblRegs128B:$Vdd32),
+(ins VectorRegs128B:$Vu32),
+"$Vdd32.w = vsxt($Vu32.h)",
+CVI_VA_DV, TypeCVI_VA_DV>, Enc_14631806, Requires<[HasV60T,UseHVX]> {
+let Inst{7-5} = 0b100;
+let Inst{13-13} = 0b0;
+let Inst{31-16} = 0b0001111000000010;
+let hasNewValue = 1;
+let opNewValue = 0;
+let DecoderNamespace = "EXT_mmvec";
+let isCodeGenOnly = 1;
+}
+def V6_vsh_alt : HInst<
+(outs VecDblRegs:$Vdd32),
+(ins VectorRegs:$Vu32),
+"$Vdd32 = vsxth($Vu32)",
+PSEUDO, TypeMAPPING>, Requires<[HasV60T,UseHVX]> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vsh_alt_128B : HInst<
+(outs VecDblRegs128B:$Vdd32),
+(ins VectorRegs128B:$Vu32),
+"$Vdd32 = vsxth($Vu32)",
+PSEUDO, TypeMAPPING>, Requires<[HasV60T,UseHVX]> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vshufeh : HInst<
+(outs VectorRegs:$Vd32),
+(ins VectorRegs:$Vu32, VectorRegs:$Vv32),
+"$Vd32.h = vshuffe($Vu32.h,$Vv32.h)",
+CVI_VA, TypeCVI_VA>, Enc_6223403, Requires<[HasV60T,UseHVX]> {
+let Inst{7-5} = 0b011;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b00011111010;
+let hasNewValue = 1;
+let opNewValue = 0;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vshufeh_128B : HInst<
+(outs VectorRegs128B:$Vd32),
+(ins VectorRegs128B:$Vu32, VectorRegs128B:$Vv32),
+"$Vd32.h = vshuffe($Vu32.h,$Vv32.h)",
+CVI_VA, TypeCVI_VA>, Enc_6223403, Requires<[HasV60T,UseHVX]> {
+let Inst{7-5} = 0b011;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b00011111010;
+let hasNewValue = 1;
+let opNewValue = 0;
+let DecoderNamespace = "EXT_mmvec";
+let isCodeGenOnly = 1;
+}
+def V6_vshufeh_alt : HInst<
+(outs VectorRegs:$Vd32),
+(ins VectorRegs:$Vu32, VectorRegs:$Vv32),
+"$Vd32 = vshuffeh($Vu32,$Vv32)",
+PSEUDO, TypeMAPPING>, Requires<[HasV60T,UseHVX]> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vshufeh_alt_128B : HInst<
+(outs VectorRegs128B:$Vd32),
+(ins VectorRegs128B:$Vu32, VectorRegs128B:$Vv32),
+"$Vd32 = vshuffeh($Vu32,$Vv32)",
+PSEUDO, TypeMAPPING>, Requires<[HasV60T,UseHVX]> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vshuff : HInst<
+(outs VectorRegs:$Vy32, VectorRegs:$Vx32),
+(ins VectorRegs:$Vy32in, VectorRegs:$Vx32in, IntRegs:$Rt32),
+"vshuff($Vy32,$Vx32,$Rt32)",
+CVI_VP_VS_LONG_EARLY, TypeCVI_VP_VS>, Enc_11422009, Requires<[HasV60T,UseHVX]> {
+let Inst{7-5} = 0b001;
+let Inst{13-13} = 0b1;
+let Inst{31-21} = 0b00011001111;
+let hasNewValue = 1;
+let opNewValue = 0;
+let hasNewValue2 = 1;
+let opNewValue2 = 1;
+let DecoderNamespace = "EXT_mmvec";
+let Constraints = "$Vy32 = $Vy32in, $Vx32 = $Vx32in";
+}
+def V6_vshuff_128B : HInst<
+(outs VectorRegs128B:$Vy32, VectorRegs128B:$Vx32),
+(ins VectorRegs128B:$Vy32in, VectorRegs128B:$Vx32in, IntRegs:$Rt32),
+"vshuff($Vy32,$Vx32,$Rt32)",
+CVI_VP_VS_LONG_EARLY, TypeCVI_VP_VS>, Enc_11422009, Requires<[HasV60T,UseHVX]> {
+let Inst{7-5} = 0b001;
+let Inst{13-13} = 0b1;
+let Inst{31-21} = 0b00011001111;
+let hasNewValue = 1;
+let opNewValue = 0;
+let hasNewValue2 = 1;
+let opNewValue2 = 1;
+let DecoderNamespace = "EXT_mmvec";
+let isCodeGenOnly = 1;
+let Constraints = "$Vy32 = $Vy32in, $Vx32 = $Vx32in";
+}
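+// V6_vshuff is an in-place two-register shuffle: both $Vy32 and $Vx32 are
+// read and written (note the double Constraints tie and the second
+// hasNewValue2/opNewValue2 pair marking both results).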
+def V6_vshuffb : HInst<
+(outs VectorRegs:$Vd32),
+(ins VectorRegs:$Vu32),
+"$Vd32.b = vshuff($Vu32.b)",
+CVI_VP, TypeCVI_VP>, Enc_900013, Requires<[HasV60T,UseHVX]> {
+let Inst{7-5} = 0b000;
+let Inst{13-13} = 0b0;
+let Inst{31-16} = 0b0001111000000010;
+let hasNewValue = 1;
+let opNewValue = 0;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vshuffb_128B : HInst<
+(outs VectorRegs128B:$Vd32),
+(ins VectorRegs128B:$Vu32),
+"$Vd32.b = vshuff($Vu32.b)",
+CVI_VP, TypeCVI_VP>, Enc_900013, Requires<[HasV60T,UseHVX]> {
+let Inst{7-5} = 0b000;
+let Inst{13-13} = 0b0;
+let Inst{31-16} = 0b0001111000000010;
+let hasNewValue = 1;
+let opNewValue = 0;
+let DecoderNamespace = "EXT_mmvec";
+let isCodeGenOnly = 1;
+}
+def V6_vshuffb_alt : HInst<
+(outs VectorRegs:$Vd32),
+(ins VectorRegs:$Vu32),
+"$Vd32 = vshuffb($Vu32)",
+PSEUDO, TypeMAPPING>, Requires<[HasV60T,UseHVX]> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vshuffb_alt_128B : HInst<
+(outs VectorRegs128B:$Vd32),
+(ins VectorRegs128B:$Vu32),
+"$Vd32 = vshuffb($Vu32)",
+PSEUDO, TypeMAPPING>, Requires<[HasV60T,UseHVX]> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vshuffeb : HInst<
+(outs VectorRegs:$Vd32),
+(ins VectorRegs:$Vu32, VectorRegs:$Vv32),
+"$Vd32.b = vshuffe($Vu32.b,$Vv32.b)",
+CVI_VA, TypeCVI_VA>, Enc_6223403, Requires<[HasV60T,UseHVX]> {
+let Inst{7-5} = 0b001;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b00011111010;
+let hasNewValue = 1;
+let opNewValue = 0;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vshuffeb_128B : HInst<
+(outs VectorRegs128B:$Vd32),
+(ins VectorRegs128B:$Vu32, VectorRegs128B:$Vv32),
+"$Vd32.b = vshuffe($Vu32.b,$Vv32.b)",
+CVI_VA, TypeCVI_VA>, Enc_6223403, Requires<[HasV60T,UseHVX]> {
+let Inst{7-5} = 0b001;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b00011111010;
+let hasNewValue = 1;
+let opNewValue = 0;
+let DecoderNamespace = "EXT_mmvec";
+let isCodeGenOnly = 1;
+}
+def V6_vshuffeb_alt : HInst<
+(outs VectorRegs:$Vd32),
+(ins VectorRegs:$Vu32, VectorRegs:$Vv32),
+"$Vd32 = vshuffeb($Vu32,$Vv32)",
+PSEUDO, TypeMAPPING>, Requires<[HasV60T,UseHVX]> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vshuffeb_alt_128B : HInst<
+(outs VectorRegs128B:$Vd32),
+(ins VectorRegs128B:$Vu32, VectorRegs128B:$Vv32),
+"$Vd32 = vshuffeb($Vu32,$Vv32)",
+PSEUDO, TypeMAPPING>, Requires<[HasV60T,UseHVX]> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vshuffh : HInst<
+(outs VectorRegs:$Vd32),
+(ins VectorRegs:$Vu32),
+"$Vd32.h = vshuff($Vu32.h)",
+CVI_VP, TypeCVI_VP>, Enc_900013, Requires<[HasV60T,UseHVX]> {
+let Inst{7-5} = 0b111;
+let Inst{13-13} = 0b0;
+let Inst{31-16} = 0b0001111000000001;
+let hasNewValue = 1;
+let opNewValue = 0;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vshuffh_128B : HInst<
+(outs VectorRegs128B:$Vd32),
+(ins VectorRegs128B:$Vu32),
+"$Vd32.h = vshuff($Vu32.h)",
+CVI_VP, TypeCVI_VP>, Enc_900013, Requires<[HasV60T,UseHVX]> {
+let Inst{7-5} = 0b111;
+let Inst{13-13} = 0b0;
+let Inst{31-16} = 0b0001111000000001;
+let hasNewValue = 1;
+let opNewValue = 0;
+let DecoderNamespace = "EXT_mmvec";
+let isCodeGenOnly = 1;
+}
+def V6_vshuffh_alt : HInst<
+(outs VectorRegs:$Vd32),
+(ins VectorRegs:$Vu32),
+"$Vd32 = vshuffh($Vu32)",
+PSEUDO, TypeMAPPING>, Requires<[HasV60T,UseHVX]> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vshuffh_alt_128B : HInst<
+(outs VectorRegs128B:$Vd32),
+(ins VectorRegs128B:$Vu32),
+"$Vd32 = vshuffh($Vu32)",
+PSEUDO, TypeMAPPING>, Requires<[HasV60T,UseHVX]> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vshuffob : HInst<
+(outs VectorRegs:$Vd32),
+(ins VectorRegs:$Vu32, VectorRegs:$Vv32),
+"$Vd32.b = vshuffo($Vu32.b,$Vv32.b)",
+CVI_VA, TypeCVI_VA>, Enc_6223403, Requires<[HasV60T,UseHVX]> {
+let Inst{7-5} = 0b010;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b00011111010;
+let hasNewValue = 1;
+let opNewValue = 0;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vshuffob_128B : HInst<
+(outs VectorRegs128B:$Vd32),
+(ins VectorRegs128B:$Vu32, VectorRegs128B:$Vv32),
+"$Vd32.b = vshuffo($Vu32.b,$Vv32.b)",
+CVI_VA, TypeCVI_VA>, Enc_6223403, Requires<[HasV60T,UseHVX]> {
+let Inst{7-5} = 0b010;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b00011111010;
+let hasNewValue = 1;
+let opNewValue = 0;
+let DecoderNamespace = "EXT_mmvec";
+let isCodeGenOnly = 1;
+}
+def V6_vshuffob_alt : HInst<
+(outs VectorRegs:$Vd32),
+(ins VectorRegs:$Vu32, VectorRegs:$Vv32),
+"$Vd32 = vshuffob($Vu32,$Vv32)",
+PSEUDO, TypeMAPPING>, Requires<[HasV60T,UseHVX]> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vshuffob_alt_128B : HInst<
+(outs VectorRegs128B:$Vd32),
+(ins VectorRegs128B:$Vu32, VectorRegs128B:$Vv32),
+"$Vd32 = vshuffob($Vu32,$Vv32)",
+PSEUDO, TypeMAPPING>, Requires<[HasV60T,UseHVX]> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vshuffvdd : HInst<
+(outs VecDblRegs:$Vdd32),
+(ins VectorRegs:$Vu32, VectorRegs:$Vv32, IntRegsLow8:$Rt8),
+"$Vdd32 = vshuff($Vu32,$Vv32,$Rt8)",
+CVI_VP_VS_LONG, TypeCVI_VP_VS>, Enc_14767681, Requires<[HasV60T,UseHVX]> {
+let Inst{7-5} = 0b011;
+let Inst{13-13} = 0b1;
+let Inst{31-24} = 0b00011011;
+let hasNewValue = 1;
+let opNewValue = 0;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vshuffvdd_128B : HInst<
+(outs VecDblRegs128B:$Vdd32),
+(ins VectorRegs128B:$Vu32, VectorRegs128B:$Vv32, IntRegsLow8:$Rt8),
+"$Vdd32 = vshuff($Vu32,$Vv32,$Rt8)",
+CVI_VP_VS_LONG, TypeCVI_VP_VS>, Enc_14767681, Requires<[HasV60T,UseHVX]> {
+let Inst{7-5} = 0b011;
+let Inst{13-13} = 0b1;
+let Inst{31-24} = 0b00011011;
+let hasNewValue = 1;
+let opNewValue = 0;
+let DecoderNamespace = "EXT_mmvec";
+let isCodeGenOnly = 1;
+}
+def V6_vshufoeb : HInst<
+(outs VecDblRegs:$Vdd32),
+(ins VectorRegs:$Vu32, VectorRegs:$Vv32),
+"$Vdd32.b = vshuffoe($Vu32.b,$Vv32.b)",
+CVI_VA_DV, TypeCVI_VA_DV>, Enc_15290236, Requires<[HasV60T,UseHVX]> {
+let Inst{7-5} = 0b110;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b00011111010;
+let hasNewValue = 1;
+let opNewValue = 0;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vshufoeb_128B : HInst<
+(outs VecDblRegs128B:$Vdd32),
+(ins VectorRegs128B:$Vu32, VectorRegs128B:$Vv32),
+"$Vdd32.b = vshuffoe($Vu32.b,$Vv32.b)",
+CVI_VA_DV, TypeCVI_VA_DV>, Enc_15290236, Requires<[HasV60T,UseHVX]> {
+let Inst{7-5} = 0b110;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b00011111010;
+let hasNewValue = 1;
+let opNewValue = 0;
+let DecoderNamespace = "EXT_mmvec";
+let isCodeGenOnly = 1;
+}
+def V6_vshufoeb_alt : HInst<
+(outs VecDblRegs:$Vdd32),
+(ins VectorRegs:$Vu32, VectorRegs:$Vv32),
+"$Vdd32 = vshuffoeb($Vu32,$Vv32)",
+PSEUDO, TypeMAPPING>, Requires<[HasV60T,UseHVX]> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vshufoeb_alt_128B : HInst<
+(outs VecDblRegs128B:$Vdd32),
+(ins VectorRegs128B:$Vu32, VectorRegs128B:$Vv32),
+"$Vdd32 = vshuffoeb($Vu32,$Vv32)",
+PSEUDO, TypeMAPPING>, Requires<[HasV60T,UseHVX]> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vshufoeh : HInst<
+(outs VecDblRegs:$Vdd32),
+(ins VectorRegs:$Vu32, VectorRegs:$Vv32),
+"$Vdd32.h = vshuffoe($Vu32.h,$Vv32.h)",
+CVI_VA_DV, TypeCVI_VA_DV>, Enc_15290236, Requires<[HasV60T,UseHVX]> {
+let Inst{7-5} = 0b101;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b00011111010;
+let hasNewValue = 1;
+let opNewValue = 0;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vshufoeh_128B : HInst<
+(outs VecDblRegs128B:$Vdd32),
+(ins VectorRegs128B:$Vu32, VectorRegs128B:$Vv32),
+"$Vdd32.h = vshuffoe($Vu32.h,$Vv32.h)",
+CVI_VA_DV, TypeCVI_VA_DV>, Enc_15290236, Requires<[HasV60T,UseHVX]> {
+let Inst{7-5} = 0b101;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b00011111010;
+let hasNewValue = 1;
+let opNewValue = 0;
+let DecoderNamespace = "EXT_mmvec";
+let isCodeGenOnly = 1;
+}
+def V6_vshufoeh_alt : HInst<
+(outs VecDblRegs:$Vdd32),
+(ins VectorRegs:$Vu32, VectorRegs:$Vv32),
+"$Vdd32 = vshuffoeh($Vu32,$Vv32)",
+PSEUDO, TypeMAPPING>, Requires<[HasV60T,UseHVX]> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vshufoeh_alt_128B : HInst<
+(outs VecDblRegs128B:$Vdd32),
+(ins VectorRegs128B:$Vu32, VectorRegs128B:$Vv32),
+"$Vdd32 = vshuffoeh($Vu32,$Vv32)",
+PSEUDO, TypeMAPPING>, Requires<[HasV60T,UseHVX]> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vshufoh : HInst<
+(outs VectorRegs:$Vd32),
+(ins VectorRegs:$Vu32, VectorRegs:$Vv32),
+"$Vd32.h = vshuffo($Vu32.h,$Vv32.h)",
+CVI_VA, TypeCVI_VA>, Enc_6223403, Requires<[HasV60T,UseHVX]> {
+let Inst{7-5} = 0b100;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b00011111010;
+let hasNewValue = 1;
+let opNewValue = 0;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vshufoh_128B : HInst<
+(outs VectorRegs128B:$Vd32),
+(ins VectorRegs128B:$Vu32, VectorRegs128B:$Vv32),
+"$Vd32.h = vshuffo($Vu32.h,$Vv32.h)",
+CVI_VA, TypeCVI_VA>, Enc_6223403, Requires<[HasV60T,UseHVX]> {
+let Inst{7-5} = 0b100;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b00011111010;
+let hasNewValue = 1;
+let opNewValue = 0;
+let DecoderNamespace = "EXT_mmvec";
+let isCodeGenOnly = 1;
+}
+def V6_vshufoh_alt : HInst<
+(outs VectorRegs:$Vd32),
+(ins VectorRegs:$Vu32, VectorRegs:$Vv32),
+"$Vd32 = vshuffoh($Vu32,$Vv32)",
+PSEUDO, TypeMAPPING>, Requires<[HasV60T,UseHVX]> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vshufoh_alt_128B : HInst<
+(outs VectorRegs128B:$Vd32),
+(ins VectorRegs128B:$Vu32, VectorRegs128B:$Vv32),
+"$Vd32 = vshuffoh($Vu32,$Vv32)",
+PSEUDO, TypeMAPPING>, Requires<[HasV60T,UseHVX]> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vsubb : HInst<
+(outs VectorRegs:$Vd32),
+(ins VectorRegs:$Vu32, VectorRegs:$Vv32),
+"$Vd32.b = vsub($Vu32.b,$Vv32.b)",
+CVI_VA, TypeCVI_VA>, Enc_6223403, Requires<[HasV60T,UseHVX]> {
+let Inst{7-5} = 0b101;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b00011100010;
+let hasNewValue = 1;
+let opNewValue = 0;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vsubb_128B : HInst<
+(outs VectorRegs128B:$Vd32),
+(ins VectorRegs128B:$Vu32, VectorRegs128B:$Vv32),
+"$Vd32.b = vsub($Vu32.b,$Vv32.b)",
+CVI_VA, TypeCVI_VA>, Enc_6223403, Requires<[HasV60T,UseHVX]> {
+let Inst{7-5} = 0b101;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b00011100010;
+let hasNewValue = 1;
+let opNewValue = 0;
+let DecoderNamespace = "EXT_mmvec";
+let isCodeGenOnly = 1;
+}
+def V6_vsubb_alt : HInst<
+(outs VectorRegs:$Vd32),
+(ins VectorRegs:$Vu32, VectorRegs:$Vv32),
+"$Vd32 = vsubb($Vu32,$Vv32)",
+PSEUDO, TypeMAPPING>, Requires<[HasV60T,UseHVX]> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vsubb_alt_128B : HInst<
+(outs VectorRegs128B:$Vd32),
+(ins VectorRegs128B:$Vu32, VectorRegs128B:$Vv32),
+"$Vd32 = vsubb($Vu32,$Vv32)",
+PSEUDO, TypeMAPPING>, Requires<[HasV60T,UseHVX]> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vsubb_dv : HInst<
+(outs VecDblRegs:$Vdd32),
+(ins VecDblRegs:$Vuu32, VecDblRegs:$Vvv32),
+"$Vdd32.b = vsub($Vuu32.b,$Vvv32.b)",
+CVI_VA_DV, TypeCVI_VA_DV>, Enc_13211717, Requires<[HasV60T,UseHVX]> {
+let Inst{7-5} = 0b011;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b00011100100;
+let hasNewValue = 1;
+let opNewValue = 0;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vsubb_dv_128B : HInst<
+(outs VecDblRegs128B:$Vdd32),
+(ins VecDblRegs128B:$Vuu32, VecDblRegs128B:$Vvv32),
+"$Vdd32.b = vsub($Vuu32.b,$Vvv32.b)",
+CVI_VA_DV, TypeCVI_VA_DV>, Enc_13211717, Requires<[HasV60T,UseHVX]> {
+let Inst{7-5} = 0b011;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b00011100100;
+let hasNewValue = 1;
+let opNewValue = 0;
+let DecoderNamespace = "EXT_mmvec";
+let isCodeGenOnly = 1;
+}
+def V6_vsubb_dv_alt : HInst<
+(outs VecDblRegs:$Vdd32),
+(ins VecDblRegs:$Vuu32, VecDblRegs:$Vvv32),
+"$Vdd32 = vsubb($Vuu32,$Vvv32)",
+PSEUDO, TypeMAPPING>, Requires<[HasV60T,UseHVX]> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vsubb_dv_alt_128B : HInst<
+(outs VecDblRegs128B:$Vdd32),
+(ins VecDblRegs128B:$Vuu32, VecDblRegs128B:$Vvv32),
+"$Vdd32 = vsubb($Vuu32,$Vvv32)",
+PSEUDO, TypeMAPPING>, Requires<[HasV60T,UseHVX]> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vsubbnq : HInst<
+(outs VectorRegs:$Vx32),
+(ins VecPredRegs:$Qv4, VectorRegs:$Vx32in, VectorRegs:$Vu32),
+"if (!$Qv4) $Vx32.b -= $Vu32.b",
+CVI_VA, TypeCVI_VA>, Enc_12535811, Requires<[HasV60T,UseHVX]> {
+let Inst{7-5} = 0b001;
+let Inst{13-13} = 0b1;
+let Inst{21-16} = 0b000010;
+let Inst{31-24} = 0b00011110;
+let hasNewValue = 1;
+let opNewValue = 0;
+let DecoderNamespace = "EXT_mmvec";
+let Constraints = "$Vx32 = $Vx32in";
+}
+def V6_vsubbnq_128B : HInst<
+(outs VectorRegs128B:$Vx32),
+(ins VecPredRegs128B:$Qv4, VectorRegs128B:$Vx32in, VectorRegs128B:$Vu32),
+"if (!$Qv4) $Vx32.b -= $Vu32.b",
+CVI_VA, TypeCVI_VA>, Enc_12535811, Requires<[HasV60T,UseHVX]> {
+let Inst{7-5} = 0b001;
+let Inst{13-13} = 0b1;
+let Inst{21-16} = 0b000010;
+let Inst{31-24} = 0b00011110;
+let hasNewValue = 1;
+let opNewValue = 0;
+let DecoderNamespace = "EXT_mmvec";
+let isCodeGenOnly = 1;
+let Constraints = "$Vx32 = $Vx32in";
+}
+def V6_vsubbnq_alt : HInst<
+(outs VectorRegs:$Vx32),
+(ins VecPredRegs:$Qv4, VectorRegs:$Vx32in, VectorRegs:$Vu32),
+"if (!$Qv4.b) $Vx32.b -= $Vu32.b",
+PSEUDO, TypeMAPPING>, Requires<[HasV60T,UseHVX]> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+let Constraints = "$Vx32 = $Vx32in";
+}
+def V6_vsubbnq_alt_128B : HInst<
+(outs VectorRegs128B:$Vx32),
+(ins VecPredRegs128B:$Qv4, VectorRegs128B:$Vx32in, VectorRegs128B:$Vu32),
+"if (!$Qv4.b) $Vx32.b -= $Vu32.b",
+PSEUDO, TypeMAPPING>, Requires<[HasV60T,UseHVX]> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+let Constraints = "$Vx32 = $Vx32in";
+}
+def V6_vsubbq : HInst<
+(outs VectorRegs:$Vx32),
+(ins VecPredRegs:$Qv4, VectorRegs:$Vx32in, VectorRegs:$Vu32),
+"if ($Qv4) $Vx32.b -= $Vu32.b",
+CVI_VA, TypeCVI_VA>, Enc_12535811, Requires<[HasV60T,UseHVX]> {
+let Inst{7-5} = 0b110;
+let Inst{13-13} = 0b1;
+let Inst{21-16} = 0b000001;
+let Inst{31-24} = 0b00011110;
+let hasNewValue = 1;
+let opNewValue = 0;
+let DecoderNamespace = "EXT_mmvec";
+let Constraints = "$Vx32 = $Vx32in";
+}
+def V6_vsubbq_128B : HInst<
+(outs VectorRegs128B:$Vx32),
+(ins VecPredRegs128B:$Qv4, VectorRegs128B:$Vx32in, VectorRegs128B:$Vu32),
+"if ($Qv4) $Vx32.b -= $Vu32.b",
+CVI_VA, TypeCVI_VA>, Enc_12535811, Requires<[HasV60T,UseHVX]> {
+let Inst{7-5} = 0b110;
+let Inst{13-13} = 0b1;
+let Inst{21-16} = 0b000001;
+let Inst{31-24} = 0b00011110;
+let hasNewValue = 1;
+let opNewValue = 0;
+let DecoderNamespace = "EXT_mmvec";
+let isCodeGenOnly = 1;
+let Constraints = "$Vx32 = $Vx32in";
+}
+def V6_vsubbq_alt : HInst<
+(outs VectorRegs:$Vx32),
+(ins VecPredRegs:$Qv4, VectorRegs:$Vx32in, VectorRegs:$Vu32),
+"if ($Qv4.b) $Vx32.b -= $Vu32.b",
+PSEUDO, TypeMAPPING>, Requires<[HasV60T,UseHVX]> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+let Constraints = "$Vx32 = $Vx32in";
+}
+def V6_vsubbq_alt_128B : HInst<
+(outs VectorRegs128B:$Vx32),
+(ins VecPredRegs128B:$Qv4, VectorRegs128B:$Vx32in, VectorRegs128B:$Vu32),
+"if ($Qv4.b) $Vx32.b -= $Vu32.b",
+PSEUDO, TypeMAPPING>, Requires<[HasV60T,UseHVX]> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+let Constraints = "$Vx32 = $Vx32in";
+}
+def V6_vsubbsat : HInst<
+(outs VectorRegs:$Vd32),
+(ins VectorRegs:$Vu32, VectorRegs:$Vv32),
+"$Vd32.b = vsub($Vu32.b,$Vv32.b):sat",
+CVI_VA, TypeCVI_VA>, Enc_6223403, Requires<[HasV62T,UseHVX]> {
+let Inst{7-5} = 0b010;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b00011111001;
+let hasNewValue = 1;
+let opNewValue = 0;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vsubbsat_128B : HInst<
+(outs VectorRegs128B:$Vd32),
+(ins VectorRegs128B:$Vu32, VectorRegs128B:$Vv32),
+"$Vd32.b = vsub($Vu32.b,$Vv32.b):sat",
+CVI_VA, TypeCVI_VA>, Enc_6223403, Requires<[HasV62T,UseHVX]> {
+let Inst{7-5} = 0b010;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b00011111001;
+let hasNewValue = 1;
+let opNewValue = 0;
+let DecoderNamespace = "EXT_mmvec";
+let isCodeGenOnly = 1;
+}
+def V6_vsubbsat_alt : HInst<
+(outs VectorRegs:$Vd32),
+(ins VectorRegs:$Vu32, VectorRegs:$Vv32),
+"$Vd32 = vsubb($Vu32,$Vv32):sat",
+PSEUDO, TypeMAPPING>, Requires<[HasV62T,UseHVX]> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vsubbsat_alt_128B : HInst<
+(outs VectorRegs128B:$Vd32),
+(ins VectorRegs128B:$Vu32, VectorRegs128B:$Vv32),
+"$Vd32 = vsubb($Vu32,$Vv32):sat",
+PSEUDO, TypeMAPPING>, Requires<[HasV62T,UseHVX]> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vsubbsat_dv : HInst<
+(outs VecDblRegs:$Vdd32),
+(ins VecDblRegs:$Vuu32, VecDblRegs:$Vvv32),
+"$Vdd32.b = vsub($Vuu32.b,$Vvv32.b):sat",
+CVI_VA_DV, TypeCVI_VA_DV>, Enc_13211717, Requires<[HasV62T,UseHVX]> {
+let Inst{7-5} = 0b001;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b00011110101;
+let hasNewValue = 1;
+let opNewValue = 0;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vsubbsat_dv_128B : HInst<
+(outs VecDblRegs128B:$Vdd32),
+(ins VecDblRegs128B:$Vuu32, VecDblRegs128B:$Vvv32),
+"$Vdd32.b = vsub($Vuu32.b,$Vvv32.b):sat",
+CVI_VA_DV, TypeCVI_VA_DV>, Enc_13211717, Requires<[HasV62T,UseHVX]> {
+let Inst{7-5} = 0b001;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b00011110101;
+let hasNewValue = 1;
+let opNewValue = 0;
+let DecoderNamespace = "EXT_mmvec";
+let isCodeGenOnly = 1;
+}
+def V6_vsubbsat_dv_alt : HInst<
+(outs VecDblRegs:$Vdd32),
+(ins VecDblRegs:$Vuu32, VecDblRegs:$Vvv32),
+"$Vdd32 = vsubb($Vuu32,$Vvv32):sat",
+PSEUDO, TypeMAPPING>, Requires<[HasV62T,UseHVX]> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vsubbsat_dv_alt_128B : HInst<
+(outs VecDblRegs128B:$Vdd32),
+(ins VecDblRegs128B:$Vuu32, VecDblRegs128B:$Vvv32),
+"$Vdd32 = vsubb($Vuu32,$Vvv32):sat",
+PSEUDO, TypeMAPPING>, Requires<[HasV62T,UseHVX]> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vsubcarry : HInst<
+(outs VectorRegs:$Vd32, VecPredRegs:$Qx4),
+(ins VectorRegs:$Vu32, VectorRegs:$Vv32, VecPredRegs:$Qx4in),
+"$Vd32.w = vsub($Vu32.w,$Vv32.w,$Qx4):carry",
+CVI_VA, TypeCVI_VA>, Enc_13691337, Requires<[HasV62T,UseHVX]> {
+let Inst{7-7} = 0b1;
+let Inst{13-13} = 0b1;
+let Inst{31-21} = 0b00011100101;
+let hasNewValue = 1;
+let opNewValue = 0;
+let hasNewValue2 = 1;
+let opNewValue2 = 1;
+let DecoderNamespace = "EXT_mmvec";
+let Constraints = "$Qx4 = $Qx4in";
+}
+def V6_vsubcarry_128B : HInst<
+(outs VectorRegs128B:$Vd32, VecPredRegs128B:$Qx4),
+(ins VectorRegs128B:$Vu32, VectorRegs128B:$Vv32, VecPredRegs128B:$Qx4in),
+"$Vd32.w = vsub($Vu32.w,$Vv32.w,$Qx4):carry",
+CVI_VA, TypeCVI_VA>, Enc_13691337, Requires<[HasV62T,UseHVX]> {
+let Inst{7-7} = 0b1;
+let Inst{13-13} = 0b1;
+let Inst{31-21} = 0b00011100101;
+let hasNewValue = 1;
+let opNewValue = 0;
+let hasNewValue2 = 1;
+let opNewValue2 = 1;
+let DecoderNamespace = "EXT_mmvec";
+let isCodeGenOnly = 1;
+let Constraints = "$Qx4 = $Qx4in";
+}
+def V6_vsubh : HInst<
+(outs VectorRegs:$Vd32),
+(ins VectorRegs:$Vu32, VectorRegs:$Vv32),
+"$Vd32.h = vsub($Vu32.h,$Vv32.h)",
+CVI_VA, TypeCVI_VA>, Enc_6223403, Requires<[HasV60T,UseHVX]> {
+let Inst{7-5} = 0b110;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b00011100010;
+let hasNewValue = 1;
+let opNewValue = 0;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vsubh_128B : HInst<
+(outs VectorRegs128B:$Vd32),
+(ins VectorRegs128B:$Vu32, VectorRegs128B:$Vv32),
+"$Vd32.h = vsub($Vu32.h,$Vv32.h)",
+CVI_VA, TypeCVI_VA>, Enc_6223403, Requires<[HasV60T,UseHVX]> {
+let Inst{7-5} = 0b110;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b00011100010;
+let hasNewValue = 1;
+let opNewValue = 0;
+let DecoderNamespace = "EXT_mmvec";
+let isCodeGenOnly = 1;
+}
+def V6_vsubh_alt : HInst<
+(outs VectorRegs:$Vd32),
+(ins VectorRegs:$Vu32, VectorRegs:$Vv32),
+"$Vd32 = vsubh($Vu32,$Vv32)",
+PSEUDO, TypeMAPPING>, Requires<[HasV60T,UseHVX]> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vsubh_alt_128B : HInst<
+(outs VectorRegs128B:$Vd32),
+(ins VectorRegs128B:$Vu32, VectorRegs128B:$Vv32),
+"$Vd32 = vsubh($Vu32,$Vv32)",
+PSEUDO, TypeMAPPING>, Requires<[HasV60T,UseHVX]> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vsubh_dv : HInst<
+(outs VecDblRegs:$Vdd32),
+(ins VecDblRegs:$Vuu32, VecDblRegs:$Vvv32),
+"$Vdd32.h = vsub($Vuu32.h,$Vvv32.h)",
+CVI_VA_DV, TypeCVI_VA_DV>, Enc_13211717, Requires<[HasV60T,UseHVX]> {
+let Inst{7-5} = 0b100;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b00011100100;
+let hasNewValue = 1;
+let opNewValue = 0;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vsubh_dv_128B : HInst<
+(outs VecDblRegs128B:$Vdd32),
+(ins VecDblRegs128B:$Vuu32, VecDblRegs128B:$Vvv32),
+"$Vdd32.h = vsub($Vuu32.h,$Vvv32.h)",
+CVI_VA_DV, TypeCVI_VA_DV>, Enc_13211717, Requires<[HasV60T,UseHVX]> {
+let Inst{7-5} = 0b100;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b00011100100;
+let hasNewValue = 1;
+let opNewValue = 0;
+let DecoderNamespace = "EXT_mmvec";
+let isCodeGenOnly = 1;
+}
+def V6_vsubh_dv_alt : HInst<
+(outs VecDblRegs:$Vdd32),
+(ins VecDblRegs:$Vuu32, VecDblRegs:$Vvv32),
+"$Vdd32 = vsubh($Vuu32,$Vvv32)",
+PSEUDO, TypeMAPPING>, Requires<[HasV60T,UseHVX]> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vsubh_dv_alt_128B : HInst<
+(outs VecDblRegs128B:$Vdd32),
+(ins VecDblRegs128B:$Vuu32, VecDblRegs128B:$Vvv32),
+"$Vdd32 = vsubh($Vuu32,$Vvv32)",
+PSEUDO, TypeMAPPING>, Requires<[HasV60T,UseHVX]> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vsubhnq : HInst<
+(outs VectorRegs:$Vx32),
+(ins VecPredRegs:$Qv4, VectorRegs:$Vx32in, VectorRegs:$Vu32),
+"if (!$Qv4) $Vx32.h -= $Vu32.h",
+CVI_VA, TypeCVI_VA>, Enc_12535811, Requires<[HasV60T,UseHVX]> {
+let Inst{7-5} = 0b010;
+let Inst{13-13} = 0b1;
+let Inst{21-16} = 0b000010;
+let Inst{31-24} = 0b00011110;
+let hasNewValue = 1;
+let opNewValue = 0;
+let DecoderNamespace = "EXT_mmvec";
+let Constraints = "$Vx32 = $Vx32in";
+}
+def V6_vsubhnq_128B : HInst<
+(outs VectorRegs128B:$Vx32),
+(ins VecPredRegs128B:$Qv4, VectorRegs128B:$Vx32in, VectorRegs128B:$Vu32),
+"if (!$Qv4) $Vx32.h -= $Vu32.h",
+CVI_VA, TypeCVI_VA>, Enc_12535811, Requires<[HasV60T,UseHVX]> {
+let Inst{7-5} = 0b010;
+let Inst{13-13} = 0b1;
+let Inst{21-16} = 0b000010;
+let Inst{31-24} = 0b00011110;
+let hasNewValue = 1;
+let opNewValue = 0;
+let DecoderNamespace = "EXT_mmvec";
+let isCodeGenOnly = 1;
+let Constraints = "$Vx32 = $Vx32in";
+}
+def V6_vsubhnq_alt : HInst<
+(outs VectorRegs:$Vx32),
+(ins VecPredRegs:$Qv4, VectorRegs:$Vx32in, VectorRegs:$Vu32),
+"if (!$Qv4.h) $Vx32.h -= $Vu32.h",
+PSEUDO, TypeMAPPING>, Requires<[HasV60T,UseHVX]> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+let Constraints = "$Vx32 = $Vx32in";
+}
+def V6_vsubhnq_alt_128B : HInst<
+(outs VectorRegs128B:$Vx32),
+(ins VecPredRegs128B:$Qv4, VectorRegs128B:$Vx32in, VectorRegs128B:$Vu32),
+"if (!$Qv4.h) $Vx32.h -= $Vu32.h",
+PSEUDO, TypeMAPPING>, Requires<[HasV60T,UseHVX]> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+let Constraints = "$Vx32 = $Vx32in";
+}
+def V6_vsubhq : HInst<
+(outs VectorRegs:$Vx32),
+(ins VecPredRegs:$Qv4, VectorRegs:$Vx32in, VectorRegs:$Vu32),
+"if ($Qv4) $Vx32.h -= $Vu32.h",
+CVI_VA, TypeCVI_VA>, Enc_12535811, Requires<[HasV60T,UseHVX]> {
+let Inst{7-5} = 0b111;
+let Inst{13-13} = 0b1;
+let Inst{21-16} = 0b000001;
+let Inst{31-24} = 0b00011110;
+let hasNewValue = 1;
+let opNewValue = 0;
+let DecoderNamespace = "EXT_mmvec";
+let Constraints = "$Vx32 = $Vx32in";
+}
+def V6_vsubhq_128B : HInst<
+(outs VectorRegs128B:$Vx32),
+(ins VecPredRegs128B:$Qv4, VectorRegs128B:$Vx32in, VectorRegs128B:$Vu32),
+"if ($Qv4) $Vx32.h -= $Vu32.h",
+CVI_VA, TypeCVI_VA>, Enc_12535811, Requires<[HasV60T,UseHVX]> {
+let Inst{7-5} = 0b111;
+let Inst{13-13} = 0b1;
+let Inst{21-16} = 0b000001;
+let Inst{31-24} = 0b00011110;
+let hasNewValue = 1;
+let opNewValue = 0;
+let DecoderNamespace = "EXT_mmvec";
+let isCodeGenOnly = 1;
+let Constraints = "$Vx32 = $Vx32in";
+}
+def V6_vsubhq_alt : HInst<
+(outs VectorRegs:$Vx32),
+(ins VecPredRegs:$Qv4, VectorRegs:$Vx32in, VectorRegs:$Vu32),
+"if ($Qv4.h) $Vx32.h -= $Vu32.h",
+PSEUDO, TypeMAPPING>, Requires<[HasV60T,UseHVX]> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+let Constraints = "$Vx32 = $Vx32in";
+}
+def V6_vsubhq_alt_128B : HInst<
+(outs VectorRegs128B:$Vx32),
+(ins VecPredRegs128B:$Qv4, VectorRegs128B:$Vx32in, VectorRegs128B:$Vu32),
+"if ($Qv4.h) $Vx32.h -= $Vu32.h",
+PSEUDO, TypeMAPPING>, Requires<[HasV60T,UseHVX]> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+let Constraints = "$Vx32 = $Vx32in";
+}
+def V6_vsubhsat : HInst<
+(outs VectorRegs:$Vd32),
+(ins VectorRegs:$Vu32, VectorRegs:$Vv32),
+"$Vd32.h = vsub($Vu32.h,$Vv32.h):sat",
+CVI_VA, TypeCVI_VA>, Enc_6223403, Requires<[HasV60T,UseHVX]> {
+let Inst{7-5} = 0b010;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b00011100011;
+let hasNewValue = 1;
+let opNewValue = 0;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vsubhsat_128B : HInst<
+(outs VectorRegs128B:$Vd32),
+(ins VectorRegs128B:$Vu32, VectorRegs128B:$Vv32),
+"$Vd32.h = vsub($Vu32.h,$Vv32.h):sat",
+CVI_VA, TypeCVI_VA>, Enc_6223403, Requires<[HasV60T,UseHVX]> {
+let Inst{7-5} = 0b010;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b00011100011;
+let hasNewValue = 1;
+let opNewValue = 0;
+let DecoderNamespace = "EXT_mmvec";
+let isCodeGenOnly = 1;
+}
+def V6_vsubhsat_alt : HInst<
+(outs VectorRegs:$Vd32),
+(ins VectorRegs:$Vu32, VectorRegs:$Vv32),
+"$Vd32 = vsubh($Vu32,$Vv32):sat",
+PSEUDO, TypeMAPPING>, Requires<[HasV60T,UseHVX]> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vsubhsat_alt_128B : HInst<
+(outs VectorRegs128B:$Vd32),
+(ins VectorRegs128B:$Vu32, VectorRegs128B:$Vv32),
+"$Vd32 = vsubh($Vu32,$Vv32):sat",
+PSEUDO, TypeMAPPING>, Requires<[HasV60T,UseHVX]> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vsubhsat_dv : HInst<
+(outs VecDblRegs:$Vdd32),
+(ins VecDblRegs:$Vuu32, VecDblRegs:$Vvv32),
+"$Vdd32.h = vsub($Vuu32.h,$Vvv32.h):sat",
+CVI_VA_DV, TypeCVI_VA_DV>, Enc_13211717, Requires<[HasV60T,UseHVX]> {
+let Inst{7-5} = 0b000;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b00011100101;
+let hasNewValue = 1;
+let opNewValue = 0;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vsubhsat_dv_128B : HInst<
+(outs VecDblRegs128B:$Vdd32),
+(ins VecDblRegs128B:$Vuu32, VecDblRegs128B:$Vvv32),
+"$Vdd32.h = vsub($Vuu32.h,$Vvv32.h):sat",
+CVI_VA_DV, TypeCVI_VA_DV>, Enc_13211717, Requires<[HasV60T,UseHVX]> {
+let Inst{7-5} = 0b000;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b00011100101;
+let hasNewValue = 1;
+let opNewValue = 0;
+let DecoderNamespace = "EXT_mmvec";
+let isCodeGenOnly = 1;
+}
+def V6_vsubhsat_dv_alt : HInst<
+(outs VecDblRegs:$Vdd32),
+(ins VecDblRegs:$Vuu32, VecDblRegs:$Vvv32),
+"$Vdd32 = vsubh($Vuu32,$Vvv32):sat",
+PSEUDO, TypeMAPPING>, Requires<[HasV60T,UseHVX]> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vsubhsat_dv_alt_128B : HInst<
+(outs VecDblRegs128B:$Vdd32),
+(ins VecDblRegs128B:$Vuu32, VecDblRegs128B:$Vvv32),
+"$Vdd32 = vsubh($Vuu32,$Vvv32):sat",
+PSEUDO, TypeMAPPING>, Requires<[HasV60T,UseHVX]> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vsubhw : HInst<
+(outs VecDblRegs:$Vdd32),
+(ins VectorRegs:$Vu32, VectorRegs:$Vv32),
+"$Vdd32.w = vsub($Vu32.h,$Vv32.h)",
+CVI_VX_DV, TypeCVI_VX_DV>, Enc_15290236, Requires<[HasV60T,UseHVX]> {
+let Inst{7-5} = 0b111;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b00011100101;
+let hasNewValue = 1;
+let opNewValue = 0;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vsubhw_128B : HInst<
+(outs VecDblRegs128B:$Vdd32),
+(ins VectorRegs128B:$Vu32, VectorRegs128B:$Vv32),
+"$Vdd32.w = vsub($Vu32.h,$Vv32.h)",
+CVI_VX_DV, TypeCVI_VX_DV>, Enc_15290236, Requires<[HasV60T,UseHVX]> {
+let Inst{7-5} = 0b111;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b00011100101;
+let hasNewValue = 1;
+let opNewValue = 0;
+let DecoderNamespace = "EXT_mmvec";
+let isCodeGenOnly = 1;
+}
+def V6_vsubhw_alt : HInst<
+(outs VecDblRegs:$Vdd32),
+(ins VectorRegs:$Vu32, VectorRegs:$Vv32),
+"$Vdd32 = vsubh($Vu32,$Vv32)",
+PSEUDO, TypeMAPPING>, Requires<[HasV60T,UseHVX]> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vsubhw_alt_128B : HInst<
+(outs VecDblRegs128B:$Vdd32),
+(ins VectorRegs128B:$Vu32, VectorRegs128B:$Vv32),
+"$Vdd32 = vsubh($Vu32,$Vv32)",
+PSEUDO, TypeMAPPING>, Requires<[HasV60T,UseHVX]> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vsububh : HInst<
+(outs VecDblRegs:$Vdd32),
+(ins VectorRegs:$Vu32, VectorRegs:$Vv32),
+"$Vdd32.h = vsub($Vu32.ub,$Vv32.ub)",
+CVI_VX_DV, TypeCVI_VX_DV>, Enc_15290236, Requires<[HasV60T,UseHVX]> {
+let Inst{7-5} = 0b101;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b00011100101;
+let hasNewValue = 1;
+let opNewValue = 0;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vsububh_128B : HInst<
+(outs VecDblRegs128B:$Vdd32),
+(ins VectorRegs128B:$Vu32, VectorRegs128B:$Vv32),
+"$Vdd32.h = vsub($Vu32.ub,$Vv32.ub)",
+CVI_VX_DV, TypeCVI_VX_DV>, Enc_15290236, Requires<[HasV60T,UseHVX]> {
+let Inst{7-5} = 0b101;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b00011100101;
+let hasNewValue = 1;
+let opNewValue = 0;
+let DecoderNamespace = "EXT_mmvec";
+let isCodeGenOnly = 1;
+}
+def V6_vsububh_alt : HInst<
+(outs VecDblRegs:$Vdd32),
+(ins VectorRegs:$Vu32, VectorRegs:$Vv32),
+"$Vdd32 = vsubub($Vu32,$Vv32)",
+PSEUDO, TypeMAPPING>, Requires<[HasV60T,UseHVX]> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vsububh_alt_128B : HInst<
+(outs VecDblRegs128B:$Vdd32),
+(ins VectorRegs128B:$Vu32, VectorRegs128B:$Vv32),
+"$Vdd32 = vsubub($Vu32,$Vv32)",
+PSEUDO, TypeMAPPING>, Requires<[HasV60T,UseHVX]> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vsububsat : HInst<
+(outs VectorRegs:$Vd32),
+(ins VectorRegs:$Vu32, VectorRegs:$Vv32),
+"$Vd32.ub = vsub($Vu32.ub,$Vv32.ub):sat",
+CVI_VA, TypeCVI_VA>, Enc_6223403, Requires<[HasV60T,UseHVX]> {
+let Inst{7-5} = 0b000;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b00011100011;
+let hasNewValue = 1;
+let opNewValue = 0;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vsububsat_128B : HInst<
+(outs VectorRegs128B:$Vd32),
+(ins VectorRegs128B:$Vu32, VectorRegs128B:$Vv32),
+"$Vd32.ub = vsub($Vu32.ub,$Vv32.ub):sat",
+CVI_VA, TypeCVI_VA>, Enc_6223403, Requires<[HasV60T,UseHVX]> {
+let Inst{7-5} = 0b000;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b00011100011;
+let hasNewValue = 1;
+let opNewValue = 0;
+let DecoderNamespace = "EXT_mmvec";
+let isCodeGenOnly = 1;
+}
+def V6_vsububsat_alt : HInst<
+(outs VectorRegs:$Vd32),
+(ins VectorRegs:$Vu32, VectorRegs:$Vv32),
+"$Vd32 = vsubub($Vu32,$Vv32):sat",
+PSEUDO, TypeMAPPING>, Requires<[HasV60T,UseHVX]> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vsububsat_alt_128B : HInst<
+(outs VectorRegs128B:$Vd32),
+(ins VectorRegs128B:$Vu32, VectorRegs128B:$Vv32),
+"$Vd32 = vsubub($Vu32,$Vv32):sat",
+PSEUDO, TypeMAPPING>, Requires<[HasV60T,UseHVX]> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vsububsat_dv : HInst<
+(outs VecDblRegs:$Vdd32),
+(ins VecDblRegs:$Vuu32, VecDblRegs:$Vvv32),
+"$Vdd32.ub = vsub($Vuu32.ub,$Vvv32.ub):sat",
+CVI_VA_DV, TypeCVI_VA_DV>, Enc_13211717, Requires<[HasV60T,UseHVX]> {
+let Inst{7-5} = 0b110;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b00011100100;
+let hasNewValue = 1;
+let opNewValue = 0;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vsububsat_dv_128B : HInst<
+(outs VecDblRegs128B:$Vdd32),
+(ins VecDblRegs128B:$Vuu32, VecDblRegs128B:$Vvv32),
+"$Vdd32.ub = vsub($Vuu32.ub,$Vvv32.ub):sat",
+CVI_VA_DV, TypeCVI_VA_DV>, Enc_13211717, Requires<[HasV60T,UseHVX]> {
+let Inst{7-5} = 0b110;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b00011100100;
+let hasNewValue = 1;
+let opNewValue = 0;
+let DecoderNamespace = "EXT_mmvec";
+let isCodeGenOnly = 1;
+}
+def V6_vsububsat_dv_alt : HInst<
+(outs VecDblRegs:$Vdd32),
+(ins VecDblRegs:$Vuu32, VecDblRegs:$Vvv32),
+"$Vdd32 = vsubub($Vuu32,$Vvv32):sat",
+PSEUDO, TypeMAPPING>, Requires<[HasV60T,UseHVX]> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vsububsat_dv_alt_128B : HInst<
+(outs VecDblRegs128B:$Vdd32),
+(ins VecDblRegs128B:$Vuu32, VecDblRegs128B:$Vvv32),
+"$Vdd32 = vsubub($Vuu32,$Vvv32):sat",
+PSEUDO, TypeMAPPING>, Requires<[HasV60T,UseHVX]> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vsubububb_sat : HInst<
+(outs VectorRegs:$Vd32),
+(ins VectorRegs:$Vu32, VectorRegs:$Vv32),
+"$Vd32.ub = vsub($Vu32.ub,$Vv32.b):sat",
+CVI_VA, TypeCVI_VA>, Enc_6223403, Requires<[HasV62T,UseHVX]> {
+let Inst{7-5} = 0b101;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b00011110101;
+let hasNewValue = 1;
+let opNewValue = 0;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vsubububb_sat_128B : HInst<
+(outs VectorRegs128B:$Vd32),
+(ins VectorRegs128B:$Vu32, VectorRegs128B:$Vv32),
+"$Vd32.ub = vsub($Vu32.ub,$Vv32.b):sat",
+CVI_VA, TypeCVI_VA>, Enc_6223403, Requires<[HasV62T,UseHVX]> {
+let Inst{7-5} = 0b101;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b00011110101;
+let hasNewValue = 1;
+let opNewValue = 0;
+let DecoderNamespace = "EXT_mmvec";
+let isCodeGenOnly = 1;
+}
+def V6_vsubuhsat : HInst<
+(outs VectorRegs:$Vd32),
+(ins VectorRegs:$Vu32, VectorRegs:$Vv32),
+"$Vd32.uh = vsub($Vu32.uh,$Vv32.uh):sat",
+CVI_VA, TypeCVI_VA>, Enc_6223403, Requires<[HasV60T,UseHVX]> {
+let Inst{7-5} = 0b001;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b00011100011;
+let hasNewValue = 1;
+let opNewValue = 0;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vsubuhsat_128B : HInst<
+(outs VectorRegs128B:$Vd32),
+(ins VectorRegs128B:$Vu32, VectorRegs128B:$Vv32),
+"$Vd32.uh = vsub($Vu32.uh,$Vv32.uh):sat",
+CVI_VA, TypeCVI_VA>, Enc_6223403, Requires<[HasV60T,UseHVX]> {
+let Inst{7-5} = 0b001;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b00011100011;
+let hasNewValue = 1;
+let opNewValue = 0;
+let DecoderNamespace = "EXT_mmvec";
+let isCodeGenOnly = 1;
+}
+def V6_vsubuhsat_alt : HInst<
+(outs VectorRegs:$Vd32),
+(ins VectorRegs:$Vu32, VectorRegs:$Vv32),
+"$Vd32 = vsubuh($Vu32,$Vv32):sat",
+PSEUDO, TypeMAPPING>, Requires<[HasV60T,UseHVX]> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vsubuhsat_alt_128B : HInst<
+(outs VectorRegs128B:$Vd32),
+(ins VectorRegs128B:$Vu32, VectorRegs128B:$Vv32),
+"$Vd32 = vsubuh($Vu32,$Vv32):sat",
+PSEUDO, TypeMAPPING>, Requires<[HasV60T,UseHVX]> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vsubuhsat_dv : HInst<
+(outs VecDblRegs:$Vdd32),
+(ins VecDblRegs:$Vuu32, VecDblRegs:$Vvv32),
+"$Vdd32.uh = vsub($Vuu32.uh,$Vvv32.uh):sat",
+CVI_VA_DV, TypeCVI_VA_DV>, Enc_13211717, Requires<[HasV60T,UseHVX]> {
+let Inst{7-5} = 0b111;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b00011100100;
+let hasNewValue = 1;
+let opNewValue = 0;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vsubuhsat_dv_128B : HInst<
+(outs VecDblRegs128B:$Vdd32),
+(ins VecDblRegs128B:$Vuu32, VecDblRegs128B:$Vvv32),
+"$Vdd32.uh = vsub($Vuu32.uh,$Vvv32.uh):sat",
+CVI_VA_DV, TypeCVI_VA_DV>, Enc_13211717, Requires<[HasV60T,UseHVX]> {
+let Inst{7-5} = 0b111;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b00011100100;
+let hasNewValue = 1;
+let opNewValue = 0;
+let DecoderNamespace = "EXT_mmvec";
+let isCodeGenOnly = 1;
+}
+def V6_vsubuhsat_dv_alt : HInst<
+(outs VecDblRegs:$Vdd32),
+(ins VecDblRegs:$Vuu32, VecDblRegs:$Vvv32),
+"$Vdd32 = vsubuh($Vuu32,$Vvv32):sat",
+PSEUDO, TypeMAPPING>, Requires<[HasV60T,UseHVX]> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vsubuhsat_dv_alt_128B : HInst<
+(outs VecDblRegs128B:$Vdd32),
+(ins VecDblRegs128B:$Vuu32, VecDblRegs128B:$Vvv32),
+"$Vdd32 = vsubuh($Vuu32,$Vvv32):sat",
+PSEUDO, TypeMAPPING>, Requires<[HasV60T,UseHVX]> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vsubuhw : HInst<
+(outs VecDblRegs:$Vdd32),
+(ins VectorRegs:$Vu32, VectorRegs:$Vv32),
+"$Vdd32.w = vsub($Vu32.uh,$Vv32.uh)",
+CVI_VX_DV, TypeCVI_VX_DV>, Enc_15290236, Requires<[HasV60T,UseHVX]> {
+let Inst{7-5} = 0b110;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b00011100101;
+let hasNewValue = 1;
+let opNewValue = 0;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vsubuhw_128B : HInst<
+(outs VecDblRegs128B:$Vdd32),
+(ins VectorRegs128B:$Vu32, VectorRegs128B:$Vv32),
+"$Vdd32.w = vsub($Vu32.uh,$Vv32.uh)",
+CVI_VX_DV, TypeCVI_VX_DV>, Enc_15290236, Requires<[HasV60T,UseHVX]> {
+let Inst{7-5} = 0b110;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b00011100101;
+let hasNewValue = 1;
+let opNewValue = 0;
+let DecoderNamespace = "EXT_mmvec";
+let isCodeGenOnly = 1;
+}
+def V6_vsubuhw_alt : HInst<
+(outs VecDblRegs:$Vdd32),
+(ins VectorRegs:$Vu32, VectorRegs:$Vv32),
+"$Vdd32 = vsubuh($Vu32,$Vv32)",
+PSEUDO, TypeMAPPING>, Requires<[HasV60T,UseHVX]> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vsubuhw_alt_128B : HInst<
+(outs VecDblRegs128B:$Vdd32),
+(ins VectorRegs128B:$Vu32, VectorRegs128B:$Vv32),
+"$Vdd32 = vsubuh($Vu32,$Vv32)",
+PSEUDO, TypeMAPPING>, Requires<[HasV60T,UseHVX]> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vsubuwsat : HInst<
+(outs VectorRegs:$Vd32),
+(ins VectorRegs:$Vu32, VectorRegs:$Vv32),
+"$Vd32.uw = vsub($Vu32.uw,$Vv32.uw):sat",
+CVI_VA, TypeCVI_VA>, Enc_6223403, Requires<[HasV62T,UseHVX]> {
+let Inst{7-5} = 0b100;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b00011111110;
+let hasNewValue = 1;
+let opNewValue = 0;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vsubuwsat_128B : HInst<
+(outs VectorRegs128B:$Vd32),
+(ins VectorRegs128B:$Vu32, VectorRegs128B:$Vv32),
+"$Vd32.uw = vsub($Vu32.uw,$Vv32.uw):sat",
+CVI_VA, TypeCVI_VA>, Enc_6223403, Requires<[HasV62T,UseHVX]> {
+let Inst{7-5} = 0b100;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b00011111110;
+let hasNewValue = 1;
+let opNewValue = 0;
+let DecoderNamespace = "EXT_mmvec";
+let isCodeGenOnly = 1;
+}
+def V6_vsubuwsat_alt : HInst<
+(outs VectorRegs:$Vd32),
+(ins VectorRegs:$Vu32, VectorRegs:$Vv32),
+"$Vd32 = vsubuw($Vu32,$Vv32):sat",
+PSEUDO, TypeMAPPING>, Requires<[HasV62T,UseHVX]> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vsubuwsat_alt_128B : HInst<
+(outs VectorRegs128B:$Vd32),
+(ins VectorRegs128B:$Vu32, VectorRegs128B:$Vv32),
+"$Vd32 = vsubuw($Vu32,$Vv32):sat",
+PSEUDO, TypeMAPPING>, Requires<[HasV62T,UseHVX]> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vsubuwsat_dv : HInst<
+(outs VecDblRegs:$Vdd32),
+(ins VecDblRegs:$Vuu32, VecDblRegs:$Vvv32),
+"$Vdd32.uw = vsub($Vuu32.uw,$Vvv32.uw):sat",
+CVI_VA_DV, TypeCVI_VA_DV>, Enc_13211717, Requires<[HasV62T,UseHVX]> {
+let Inst{7-5} = 0b011;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b00011110101;
+let hasNewValue = 1;
+let opNewValue = 0;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vsubuwsat_dv_128B : HInst<
+(outs VecDblRegs128B:$Vdd32),
+(ins VecDblRegs128B:$Vuu32, VecDblRegs128B:$Vvv32),
+"$Vdd32.uw = vsub($Vuu32.uw,$Vvv32.uw):sat",
+CVI_VA_DV, TypeCVI_VA_DV>, Enc_13211717, Requires<[HasV62T,UseHVX]> {
+let Inst{7-5} = 0b011;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b00011110101;
+let hasNewValue = 1;
+let opNewValue = 0;
+let DecoderNamespace = "EXT_mmvec";
+let isCodeGenOnly = 1;
+}
+def V6_vsubuwsat_dv_alt : HInst<
+(outs VecDblRegs:$Vdd32),
+(ins VecDblRegs:$Vuu32, VecDblRegs:$Vvv32),
+"$Vdd32 = vsubuw($Vuu32,$Vvv32):sat",
+PSEUDO, TypeMAPPING>, Requires<[HasV62T,UseHVX]> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vsubuwsat_dv_alt_128B : HInst<
+(outs VecDblRegs128B:$Vdd32),
+(ins VecDblRegs128B:$Vuu32, VecDblRegs128B:$Vvv32),
+"$Vdd32 = vsubuw($Vuu32,$Vvv32):sat",
+PSEUDO, TypeMAPPING>, Requires<[HasV62T,UseHVX]> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vsubw : HInst<
+(outs VectorRegs:$Vd32),
+(ins VectorRegs:$Vu32, VectorRegs:$Vv32),
+"$Vd32.w = vsub($Vu32.w,$Vv32.w)",
+CVI_VA, TypeCVI_VA>, Enc_6223403, Requires<[HasV60T,UseHVX]> {
+let Inst{7-5} = 0b111;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b00011100010;
+let hasNewValue = 1;
+let opNewValue = 0;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vsubw_128B : HInst<
+(outs VectorRegs128B:$Vd32),
+(ins VectorRegs128B:$Vu32, VectorRegs128B:$Vv32),
+"$Vd32.w = vsub($Vu32.w,$Vv32.w)",
+CVI_VA, TypeCVI_VA>, Enc_6223403, Requires<[HasV60T,UseHVX]> {
+let Inst{7-5} = 0b111;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b00011100010;
+let hasNewValue = 1;
+let opNewValue = 0;
+let DecoderNamespace = "EXT_mmvec";
+let isCodeGenOnly = 1;
+}
+def V6_vsubw_alt : HInst<
+(outs VectorRegs:$Vd32),
+(ins VectorRegs:$Vu32, VectorRegs:$Vv32),
+"$Vd32 = vsubw($Vu32,$Vv32)",
+PSEUDO, TypeMAPPING>, Requires<[HasV60T,UseHVX]> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vsubw_alt_128B : HInst<
+(outs VectorRegs128B:$Vd32),
+(ins VectorRegs128B:$Vu32, VectorRegs128B:$Vv32),
+"$Vd32 = vsubw($Vu32,$Vv32)",
+PSEUDO, TypeMAPPING>, Requires<[HasV60T,UseHVX]> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vsubw_dv : HInst<
+(outs VecDblRegs:$Vdd32),
+(ins VecDblRegs:$Vuu32, VecDblRegs:$Vvv32),
+"$Vdd32.w = vsub($Vuu32.w,$Vvv32.w)",
+CVI_VA_DV, TypeCVI_VA_DV>, Enc_13211717, Requires<[HasV60T,UseHVX]> {
+let Inst{7-5} = 0b101;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b00011100100;
+let hasNewValue = 1;
+let opNewValue = 0;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vsubw_dv_128B : HInst<
+(outs VecDblRegs128B:$Vdd32),
+(ins VecDblRegs128B:$Vuu32, VecDblRegs128B:$Vvv32),
+"$Vdd32.w = vsub($Vuu32.w,$Vvv32.w)",
+CVI_VA_DV, TypeCVI_VA_DV>, Enc_13211717, Requires<[HasV60T,UseHVX]> {
+let Inst{7-5} = 0b101;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b00011100100;
+let hasNewValue = 1;
+let opNewValue = 0;
+let DecoderNamespace = "EXT_mmvec";
+let isCodeGenOnly = 1;
+}
+def V6_vsubw_dv_alt : HInst<
+(outs VecDblRegs:$Vdd32),
+(ins VecDblRegs:$Vuu32, VecDblRegs:$Vvv32),
+"$Vdd32 = vsubw($Vuu32,$Vvv32)",
+PSEUDO, TypeMAPPING>, Requires<[HasV60T,UseHVX]> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vsubw_dv_alt_128B : HInst<
+(outs VecDblRegs128B:$Vdd32),
+(ins VecDblRegs128B:$Vuu32, VecDblRegs128B:$Vvv32),
+"$Vdd32 = vsubw($Vuu32,$Vvv32)",
+PSEUDO, TypeMAPPING>, Requires<[HasV60T,UseHVX]> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vsubwnq : HInst<
+(outs VectorRegs:$Vx32),
+(ins VecPredRegs:$Qv4, VectorRegs:$Vx32in, VectorRegs:$Vu32),
+"if (!$Qv4) $Vx32.w -= $Vu32.w",
+CVI_VA, TypeCVI_VA>, Enc_12535811, Requires<[HasV60T,UseHVX]> {
+let Inst{7-5} = 0b011;
+let Inst{13-13} = 0b1;
+let Inst{21-16} = 0b000010;
+let Inst{31-24} = 0b00011110;
+let hasNewValue = 1;
+let opNewValue = 0;
+let DecoderNamespace = "EXT_mmvec";
+let Constraints = "$Vx32 = $Vx32in";
+}
+def V6_vsubwnq_128B : HInst<
+(outs VectorRegs128B:$Vx32),
+(ins VecPredRegs128B:$Qv4, VectorRegs128B:$Vx32in, VectorRegs128B:$Vu32),
+"if (!$Qv4) $Vx32.w -= $Vu32.w",
+CVI_VA, TypeCVI_VA>, Enc_12535811, Requires<[HasV60T,UseHVX]> {
+let Inst{7-5} = 0b011;
+let Inst{13-13} = 0b1;
+let Inst{21-16} = 0b000010;
+let Inst{31-24} = 0b00011110;
+let hasNewValue = 1;
+let opNewValue = 0;
+let DecoderNamespace = "EXT_mmvec";
+let isCodeGenOnly = 1;
+let Constraints = "$Vx32 = $Vx32in";
+}
+def V6_vsubwnq_alt : HInst<
+(outs VectorRegs:$Vx32),
+(ins VecPredRegs:$Qv4, VectorRegs:$Vx32in, VectorRegs:$Vu32),
+"if (!$Qv4.w) $Vx32.w -= $Vu32.w",
+PSEUDO, TypeMAPPING>, Requires<[HasV60T,UseHVX]> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+let Constraints = "$Vx32 = $Vx32in";
+}
+def V6_vsubwnq_alt_128B : HInst<
+(outs VectorRegs128B:$Vx32),
+(ins VecPredRegs128B:$Qv4, VectorRegs128B:$Vx32in, VectorRegs128B:$Vu32),
+"if (!$Qv4.w) $Vx32.w -= $Vu32.w",
+PSEUDO, TypeMAPPING>, Requires<[HasV60T,UseHVX]> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+let Constraints = "$Vx32 = $Vx32in";
+}
+def V6_vsubwq : HInst<
+(outs VectorRegs:$Vx32),
+(ins VecPredRegs:$Qv4, VectorRegs:$Vx32in, VectorRegs:$Vu32),
+"if ($Qv4) $Vx32.w -= $Vu32.w",
+CVI_VA, TypeCVI_VA>, Enc_12535811, Requires<[HasV60T,UseHVX]> {
+let Inst{7-5} = 0b000;
+let Inst{13-13} = 0b1;
+let Inst{21-16} = 0b000010;
+let Inst{31-24} = 0b00011110;
+let hasNewValue = 1;
+let opNewValue = 0;
+let DecoderNamespace = "EXT_mmvec";
+let Constraints = "$Vx32 = $Vx32in";
+}
+def V6_vsubwq_128B : HInst<
+(outs VectorRegs128B:$Vx32),
+(ins VecPredRegs128B:$Qv4, VectorRegs128B:$Vx32in, VectorRegs128B:$Vu32),
+"if ($Qv4) $Vx32.w -= $Vu32.w",
+CVI_VA, TypeCVI_VA>, Enc_12535811, Requires<[HasV60T,UseHVX]> {
+let Inst{7-5} = 0b000;
+let Inst{13-13} = 0b1;
+let Inst{21-16} = 0b000010;
+let Inst{31-24} = 0b00011110;
+let hasNewValue = 1;
+let opNewValue = 0;
+let DecoderNamespace = "EXT_mmvec";
+let isCodeGenOnly = 1;
+let Constraints = "$Vx32 = $Vx32in";
+}
+def V6_vsubwq_alt : HInst<
+(outs VectorRegs:$Vx32),
+(ins VecPredRegs:$Qv4, VectorRegs:$Vx32in, VectorRegs:$Vu32),
+"if ($Qv4.w) $Vx32.w -= $Vu32.w",
+PSEUDO, TypeMAPPING>, Requires<[HasV60T,UseHVX]> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+let Constraints = "$Vx32 = $Vx32in";
+}
+def V6_vsubwq_alt_128B : HInst<
+(outs VectorRegs128B:$Vx32),
+(ins VecPredRegs128B:$Qv4, VectorRegs128B:$Vx32in, VectorRegs128B:$Vu32),
+"if ($Qv4.w) $Vx32.w -= $Vu32.w",
+PSEUDO, TypeMAPPING>, Requires<[HasV60T,UseHVX]> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+let Constraints = "$Vx32 = $Vx32in";
+}
+def V6_vsubwsat : HInst<
+(outs VectorRegs:$Vd32),
+(ins VectorRegs:$Vu32, VectorRegs:$Vv32),
+"$Vd32.w = vsub($Vu32.w,$Vv32.w):sat",
+CVI_VA, TypeCVI_VA>, Enc_6223403, Requires<[HasV60T,UseHVX]> {
+let Inst{7-5} = 0b011;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b00011100011;
+let hasNewValue = 1;
+let opNewValue = 0;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vsubwsat_128B : HInst<
+(outs VectorRegs128B:$Vd32),
+(ins VectorRegs128B:$Vu32, VectorRegs128B:$Vv32),
+"$Vd32.w = vsub($Vu32.w,$Vv32.w):sat",
+CVI_VA, TypeCVI_VA>, Enc_6223403, Requires<[HasV60T,UseHVX]> {
+let Inst{7-5} = 0b011;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b00011100011;
+let hasNewValue = 1;
+let opNewValue = 0;
+let DecoderNamespace = "EXT_mmvec";
+let isCodeGenOnly = 1;
+}
+def V6_vsubwsat_alt : HInst<
+(outs VectorRegs:$Vd32),
+(ins VectorRegs:$Vu32, VectorRegs:$Vv32),
+"$Vd32 = vsubw($Vu32,$Vv32):sat",
+PSEUDO, TypeMAPPING>, Requires<[HasV60T,UseHVX]> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vsubwsat_alt_128B : HInst<
+(outs VectorRegs128B:$Vd32),
+(ins VectorRegs128B:$Vu32, VectorRegs128B:$Vv32),
+"$Vd32 = vsubw($Vu32,$Vv32):sat",
+PSEUDO, TypeMAPPING>, Requires<[HasV60T,UseHVX]> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vsubwsat_dv : HInst<
+(outs VecDblRegs:$Vdd32),
+(ins VecDblRegs:$Vuu32, VecDblRegs:$Vvv32),
+"$Vdd32.w = vsub($Vuu32.w,$Vvv32.w):sat",
+CVI_VA_DV, TypeCVI_VA_DV>, Enc_13211717, Requires<[HasV60T,UseHVX]> {
+let Inst{7-5} = 0b001;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b00011100101;
+let hasNewValue = 1;
+let opNewValue = 0;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vsubwsat_dv_128B : HInst<
+(outs VecDblRegs128B:$Vdd32),
+(ins VecDblRegs128B:$Vuu32, VecDblRegs128B:$Vvv32),
+"$Vdd32.w = vsub($Vuu32.w,$Vvv32.w):sat",
+CVI_VA_DV, TypeCVI_VA_DV>, Enc_13211717, Requires<[HasV60T,UseHVX]> {
+let Inst{7-5} = 0b001;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b00011100101;
+let hasNewValue = 1;
+let opNewValue = 0;
+let DecoderNamespace = "EXT_mmvec";
+let isCodeGenOnly = 1;
+}
+def V6_vsubwsat_dv_alt : HInst<
+(outs VecDblRegs:$Vdd32),
+(ins VecDblRegs:$Vuu32, VecDblRegs:$Vvv32),
+"$Vdd32 = vsubw($Vuu32,$Vvv32):sat",
+PSEUDO, TypeMAPPING>, Requires<[HasV60T,UseHVX]> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vsubwsat_dv_alt_128B : HInst<
+(outs VecDblRegs128B:$Vdd32),
+(ins VecDblRegs128B:$Vuu32, VecDblRegs128B:$Vvv32),
+"$Vdd32 = vsubw($Vuu32,$Vvv32):sat",
+PSEUDO, TypeMAPPING>, Requires<[HasV60T,UseHVX]> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vswap : HInst<
+(outs VecDblRegs:$Vdd32),
+(ins VecPredRegs:$Qt4, VectorRegs:$Vu32, VectorRegs:$Vv32),
+"$Vdd32 = vswap($Qt4,$Vu32,$Vv32)",
+CVI_VA_DV, TypeCVI_VA_DV>, Enc_11424254, Requires<[HasV60T,UseHVX]> {
+let Inst{7-7} = 0b0;
+let Inst{13-13} = 0b1;
+let Inst{31-21} = 0b00011110101;
+let hasNewValue = 1;
+let opNewValue = 0;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vswap_128B : HInst<
+(outs VecDblRegs128B:$Vdd32),
+(ins VecPredRegs128B:$Qt4, VectorRegs128B:$Vu32, VectorRegs128B:$Vv32),
+"$Vdd32 = vswap($Qt4,$Vu32,$Vv32)",
+CVI_VA_DV, TypeCVI_VA_DV>, Enc_11424254, Requires<[HasV60T,UseHVX]> {
+let Inst{7-7} = 0b0;
+let Inst{13-13} = 0b1;
+let Inst{31-21} = 0b00011110101;
+let hasNewValue = 1;
+let opNewValue = 0;
+let DecoderNamespace = "EXT_mmvec";
+let isCodeGenOnly = 1;
+}
+def V6_vtmpyb : HInst<
+(outs VecDblRegs:$Vdd32),
+(ins VecDblRegs:$Vuu32, IntRegs:$Rt32),
+"$Vdd32.h = vtmpy($Vuu32.b,$Rt32.b)",
+CVI_VX_DV, TypeCVI_VX_DV>, Enc_5023792, Requires<[HasV60T,UseHVX]> {
+let Inst{7-5} = 0b000;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b00011001000;
+let hasNewValue = 1;
+let opNewValue = 0;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vtmpyb_128B : HInst<
+(outs VecDblRegs128B:$Vdd32),
+(ins VecDblRegs128B:$Vuu32, IntRegs:$Rt32),
+"$Vdd32.h = vtmpy($Vuu32.b,$Rt32.b)",
+CVI_VX_DV, TypeCVI_VX_DV>, Enc_5023792, Requires<[HasV60T,UseHVX]> {
+let Inst{7-5} = 0b000;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b00011001000;
+let hasNewValue = 1;
+let opNewValue = 0;
+let DecoderNamespace = "EXT_mmvec";
+let isCodeGenOnly = 1;
+}
+def V6_vtmpyb_acc : HInst<
+(outs VecDblRegs:$Vxx32),
+(ins VecDblRegs:$Vxx32in, VecDblRegs:$Vuu32, IntRegs:$Rt32),
+"$Vxx32.h += vtmpy($Vuu32.b,$Rt32.b)",
+CVI_VX_DV, TypeCVI_VX_DV>, Enc_4327792, Requires<[HasV60T,UseHVX]> {
+let Inst{7-5} = 0b000;
+let Inst{13-13} = 0b1;
+let Inst{31-21} = 0b00011001000;
+let hasNewValue = 1;
+let opNewValue = 0;
+let isAccumulator = 1;
+let DecoderNamespace = "EXT_mmvec";
+let Constraints = "$Vxx32 = $Vxx32in";
+}
+def V6_vtmpyb_acc_128B : HInst<
+(outs VecDblRegs128B:$Vxx32),
+(ins VecDblRegs128B:$Vxx32in, VecDblRegs128B:$Vuu32, IntRegs:$Rt32),
+"$Vxx32.h += vtmpy($Vuu32.b,$Rt32.b)",
+CVI_VX_DV, TypeCVI_VX_DV>, Enc_4327792, Requires<[HasV60T,UseHVX]> {
+let Inst{7-5} = 0b000;
+let Inst{13-13} = 0b1;
+let Inst{31-21} = 0b00011001000;
+let hasNewValue = 1;
+let opNewValue = 0;
+let isAccumulator = 1;
+let DecoderNamespace = "EXT_mmvec";
+let isCodeGenOnly = 1;
+let Constraints = "$Vxx32 = $Vxx32in";
+}
+def V6_vtmpyb_acc_alt : HInst<
+(outs VecDblRegs:$Vxx32),
+(ins VecDblRegs:$Vxx32in, VecDblRegs:$Vuu32, IntRegs:$Rt32),
+"$Vxx32 += vtmpyb($Vuu32,$Rt32)",
+PSEUDO, TypeMAPPING>, Requires<[HasV60T,UseHVX]> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isAccumulator = 1;
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+let Constraints = "$Vxx32 = $Vxx32in";
+}
+def V6_vtmpyb_acc_alt_128B : HInst<
+(outs VecDblRegs128B:$Vxx32),
+(ins VecDblRegs128B:$Vxx32in, VecDblRegs128B:$Vuu32, IntRegs:$Rt32),
+"$Vxx32 += vtmpyb($Vuu32,$Rt32)",
+PSEUDO, TypeMAPPING>, Requires<[HasV60T,UseHVX]> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isAccumulator = 1;
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+let Constraints = "$Vxx32 = $Vxx32in";
+}
+def V6_vtmpyb_alt : HInst<
+(outs VecDblRegs:$Vdd32),
+(ins VecDblRegs:$Vuu32, IntRegs:$Rt32),
+"$Vdd32 = vtmpyb($Vuu32,$Rt32)",
+PSEUDO, TypeMAPPING>, Requires<[HasV60T,UseHVX]> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vtmpyb_alt_128B : HInst<
+(outs VecDblRegs128B:$Vdd32),
+(ins VecDblRegs128B:$Vuu32, IntRegs:$Rt32),
+"$Vdd32 = vtmpyb($Vuu32,$Rt32)",
+PSEUDO, TypeMAPPING>, Requires<[HasV60T,UseHVX]> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vtmpybus : HInst<
+(outs VecDblRegs:$Vdd32),
+(ins VecDblRegs:$Vuu32, IntRegs:$Rt32),
+"$Vdd32.h = vtmpy($Vuu32.ub,$Rt32.b)",
+CVI_VX_DV, TypeCVI_VX_DV>, Enc_5023792, Requires<[HasV60T,UseHVX]> {
+let Inst{7-5} = 0b001;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b00011001000;
+let hasNewValue = 1;
+let opNewValue = 0;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vtmpybus_128B : HInst<
+(outs VecDblRegs128B:$Vdd32),
+(ins VecDblRegs128B:$Vuu32, IntRegs:$Rt32),
+"$Vdd32.h = vtmpy($Vuu32.ub,$Rt32.b)",
+CVI_VX_DV, TypeCVI_VX_DV>, Enc_5023792, Requires<[HasV60T,UseHVX]> {
+let Inst{7-5} = 0b001;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b00011001000;
+let hasNewValue = 1;
+let opNewValue = 0;
+let DecoderNamespace = "EXT_mmvec";
+let isCodeGenOnly = 1;
+}
+def V6_vtmpybus_acc : HInst<
+(outs VecDblRegs:$Vxx32),
+(ins VecDblRegs:$Vxx32in, VecDblRegs:$Vuu32, IntRegs:$Rt32),
+"$Vxx32.h += vtmpy($Vuu32.ub,$Rt32.b)",
+CVI_VX_DV, TypeCVI_VX_DV>, Enc_4327792, Requires<[HasV60T,UseHVX]> {
+let Inst{7-5} = 0b001;
+let Inst{13-13} = 0b1;
+let Inst{31-21} = 0b00011001000;
+let hasNewValue = 1;
+let opNewValue = 0;
+let isAccumulator = 1;
+let DecoderNamespace = "EXT_mmvec";
+let Constraints = "$Vxx32 = $Vxx32in";
+}
+def V6_vtmpybus_acc_128B : HInst<
+(outs VecDblRegs128B:$Vxx32),
+(ins VecDblRegs128B:$Vxx32in, VecDblRegs128B:$Vuu32, IntRegs:$Rt32),
+"$Vxx32.h += vtmpy($Vuu32.ub,$Rt32.b)",
+CVI_VX_DV, TypeCVI_VX_DV>, Enc_4327792, Requires<[HasV60T,UseHVX]> {
+let Inst{7-5} = 0b001;
+let Inst{13-13} = 0b1;
+let Inst{31-21} = 0b00011001000;
+let hasNewValue = 1;
+let opNewValue = 0;
+let isAccumulator = 1;
+let DecoderNamespace = "EXT_mmvec";
+let isCodeGenOnly = 1;
+let Constraints = "$Vxx32 = $Vxx32in";
+}
+def V6_vtmpybus_acc_alt : HInst<
+(outs VecDblRegs:$Vxx32),
+(ins VecDblRegs:$Vxx32in, VecDblRegs:$Vuu32, IntRegs:$Rt32),
+"$Vxx32 += vtmpybus($Vuu32,$Rt32)",
+PSEUDO, TypeMAPPING>, Requires<[HasV60T,UseHVX]> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isAccumulator = 1;
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+let Constraints = "$Vxx32 = $Vxx32in";
+}
+def V6_vtmpybus_acc_alt_128B : HInst<
+(outs VecDblRegs128B:$Vxx32),
+(ins VecDblRegs128B:$Vxx32in, VecDblRegs128B:$Vuu32, IntRegs:$Rt32),
+"$Vxx32 += vtmpybus($Vuu32,$Rt32)",
+PSEUDO, TypeMAPPING>, Requires<[HasV60T,UseHVX]> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isAccumulator = 1;
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+let Constraints = "$Vxx32 = $Vxx32in";
+}
+def V6_vtmpybus_alt : HInst<
+(outs VecDblRegs:$Vdd32),
+(ins VecDblRegs:$Vuu32, IntRegs:$Rt32),
+"$Vdd32 = vtmpybus($Vuu32,$Rt32)",
+PSEUDO, TypeMAPPING>, Requires<[HasV60T,UseHVX]> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vtmpybus_alt_128B : HInst<
+(outs VecDblRegs128B:$Vdd32),
+(ins VecDblRegs128B:$Vuu32, IntRegs:$Rt32),
+"$Vdd32 = vtmpybus($Vuu32,$Rt32)",
+PSEUDO, TypeMAPPING>, Requires<[HasV60T,UseHVX]> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vtmpyhb : HInst<
+(outs VecDblRegs:$Vdd32),
+(ins VecDblRegs:$Vuu32, IntRegs:$Rt32),
+"$Vdd32.w = vtmpy($Vuu32.h,$Rt32.b)",
+CVI_VX_DV, TypeCVI_VX_DV>, Enc_5023792, Requires<[HasV60T,UseHVX]> {
+let Inst{7-5} = 0b100;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b00011001101;
+let hasNewValue = 1;
+let opNewValue = 0;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vtmpyhb_128B : HInst<
+(outs VecDblRegs128B:$Vdd32),
+(ins VecDblRegs128B:$Vuu32, IntRegs:$Rt32),
+"$Vdd32.w = vtmpy($Vuu32.h,$Rt32.b)",
+CVI_VX_DV, TypeCVI_VX_DV>, Enc_5023792, Requires<[HasV60T,UseHVX]> {
+let Inst{7-5} = 0b100;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b00011001101;
+let hasNewValue = 1;
+let opNewValue = 0;
+let DecoderNamespace = "EXT_mmvec";
+let isCodeGenOnly = 1;
+}
+def V6_vtmpyhb_acc : HInst<
+(outs VecDblRegs:$Vxx32),
+(ins VecDblRegs:$Vxx32in, VecDblRegs:$Vuu32, IntRegs:$Rt32),
+"$Vxx32.w += vtmpy($Vuu32.h,$Rt32.b)",
+CVI_VX_DV, TypeCVI_VX_DV>, Enc_4327792, Requires<[HasV60T,UseHVX]> {
+let Inst{7-5} = 0b010;
+let Inst{13-13} = 0b1;
+let Inst{31-21} = 0b00011001000;
+let hasNewValue = 1;
+let opNewValue = 0;
+let isAccumulator = 1;
+let DecoderNamespace = "EXT_mmvec";
+let Constraints = "$Vxx32 = $Vxx32in";
+}
+def V6_vtmpyhb_acc_128B : HInst<
+(outs VecDblRegs128B:$Vxx32),
+(ins VecDblRegs128B:$Vxx32in, VecDblRegs128B:$Vuu32, IntRegs:$Rt32),
+"$Vxx32.w += vtmpy($Vuu32.h,$Rt32.b)",
+CVI_VX_DV, TypeCVI_VX_DV>, Enc_4327792, Requires<[HasV60T,UseHVX]> {
+let Inst{7-5} = 0b010;
+let Inst{13-13} = 0b1;
+let Inst{31-21} = 0b00011001000;
+let hasNewValue = 1;
+let opNewValue = 0;
+let isAccumulator = 1;
+let DecoderNamespace = "EXT_mmvec";
+let isCodeGenOnly = 1;
+let Constraints = "$Vxx32 = $Vxx32in";
+}
+def V6_vtmpyhb_acc_alt : HInst<
+(outs VecDblRegs:$Vxx32),
+(ins VecDblRegs:$Vxx32in, VecDblRegs:$Vuu32, IntRegs:$Rt32),
+"$Vxx32 += vtmpyhb($Vuu32,$Rt32)",
+PSEUDO, TypeMAPPING>, Requires<[HasV60T,UseHVX]> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isAccumulator = 1;
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+let Constraints = "$Vxx32 = $Vxx32in";
+}
+def V6_vtmpyhb_acc_alt_128B : HInst<
+(outs VecDblRegs128B:$Vxx32),
+(ins VecDblRegs128B:$Vxx32in, VecDblRegs128B:$Vuu32, IntRegs:$Rt32),
+"$Vxx32 += vtmpyhb($Vuu32,$Rt32)",
+PSEUDO, TypeMAPPING>, Requires<[HasV60T,UseHVX]> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isAccumulator = 1;
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+let Constraints = "$Vxx32 = $Vxx32in";
+}
+def V6_vtmpyhb_alt : HInst<
+(outs VecDblRegs:$Vdd32),
+(ins VecDblRegs:$Vuu32, IntRegs:$Rt32),
+"$Vdd32 = vtmpyhb($Vuu32,$Rt32)",
+PSEUDO, TypeMAPPING>, Requires<[HasV60T,UseHVX]> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vtmpyhb_alt_128B : HInst<
+(outs VecDblRegs128B:$Vdd32),
+(ins VecDblRegs128B:$Vuu32, IntRegs:$Rt32),
+"$Vdd32 = vtmpyhb($Vuu32,$Rt32)",
+PSEUDO, TypeMAPPING>, Requires<[HasV60T,UseHVX]> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vtran2x2_map : HInst<
+(outs VectorRegs:$Vy32, VectorRegs:$Vx32),
+(ins VectorRegs:$Vy32in, VectorRegs:$Vx32in, IntRegs:$Rt32),
+"vtrans2x2($Vy32,$Vx32,$Rt32)",
+PSEUDO, TypeMAPPING>, Requires<[HasV60T,UseHVX]> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let hasNewValue2 = 1;
+let opNewValue2 = 1;
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+let Constraints = "$Vy32 = $Vy32in, $Vx32 = $Vx32in";
+}
+def V6_vtran2x2_map_128B : HInst<
+(outs VectorRegs128B:$Vy32, VectorRegs128B:$Vx32),
+(ins VectorRegs128B:$Vy32in, VectorRegs128B:$Vx32in, IntRegs:$Rt32),
+"vtrans2x2($Vy32,$Vx32,$Rt32)",
+PSEUDO, TypeMAPPING>, Requires<[HasV60T,UseHVX]> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let hasNewValue2 = 1;
+let opNewValue2 = 1;
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+let Constraints = "$Vy32 = $Vy32in, $Vx32 = $Vx32in";
+}
+def V6_vunpackb : HInst<
+(outs VecDblRegs:$Vdd32),
+(ins VectorRegs:$Vu32),
+"$Vdd32.h = vunpack($Vu32.b)",
+CVI_VP_VS, TypeCVI_VP_VS>, Enc_14631806, Requires<[HasV60T,UseHVX]> {
+let Inst{7-5} = 0b010;
+let Inst{13-13} = 0b0;
+let Inst{31-16} = 0b0001111000000001;
+let hasNewValue = 1;
+let opNewValue = 0;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vunpackb_128B : HInst<
+(outs VecDblRegs128B:$Vdd32),
+(ins VectorRegs128B:$Vu32),
+"$Vdd32.h = vunpack($Vu32.b)",
+CVI_VP_VS, TypeCVI_VP_VS>, Enc_14631806, Requires<[HasV60T,UseHVX]> {
+let Inst{7-5} = 0b010;
+let Inst{13-13} = 0b0;
+let Inst{31-16} = 0b0001111000000001;
+let hasNewValue = 1;
+let opNewValue = 0;
+let DecoderNamespace = "EXT_mmvec";
+let isCodeGenOnly = 1;
+}
+def V6_vunpackb_alt : HInst<
+(outs VecDblRegs:$Vdd32),
+(ins VectorRegs:$Vu32),
+"$Vdd32 = vunpackb($Vu32)",
+PSEUDO, TypeMAPPING>, Requires<[HasV60T,UseHVX]> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vunpackb_alt_128B : HInst<
+(outs VecDblRegs128B:$Vdd32),
+(ins VectorRegs128B:$Vu32),
+"$Vdd32 = vunpackb($Vu32)",
+PSEUDO, TypeMAPPING>, Requires<[HasV60T,UseHVX]> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vunpackh : HInst<
+(outs VecDblRegs:$Vdd32),
+(ins VectorRegs:$Vu32),
+"$Vdd32.w = vunpack($Vu32.h)",
+CVI_VP_VS, TypeCVI_VP_VS>, Enc_14631806, Requires<[HasV60T,UseHVX]> {
+let Inst{7-5} = 0b011;
+let Inst{13-13} = 0b0;
+let Inst{31-16} = 0b0001111000000001;
+let hasNewValue = 1;
+let opNewValue = 0;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vunpackh_128B : HInst<
+(outs VecDblRegs128B:$Vdd32),
+(ins VectorRegs128B:$Vu32),
+"$Vdd32.w = vunpack($Vu32.h)",
+CVI_VP_VS, TypeCVI_VP_VS>, Enc_14631806, Requires<[HasV60T,UseHVX]> {
+let Inst{7-5} = 0b011;
+let Inst{13-13} = 0b0;
+let Inst{31-16} = 0b0001111000000001;
+let hasNewValue = 1;
+let opNewValue = 0;
+let DecoderNamespace = "EXT_mmvec";
+let isCodeGenOnly = 1;
+}
+def V6_vunpackh_alt : HInst<
+(outs VecDblRegs:$Vdd32),
+(ins VectorRegs:$Vu32),
+"$Vdd32 = vunpackh($Vu32)",
+PSEUDO, TypeMAPPING>, Requires<[HasV60T,UseHVX]> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vunpackh_alt_128B : HInst<
+(outs VecDblRegs128B:$Vdd32),
+(ins VectorRegs128B:$Vu32),
+"$Vdd32 = vunpackh($Vu32)",
+PSEUDO, TypeMAPPING>, Requires<[HasV60T,UseHVX]> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vunpackob : HInst<
+(outs VecDblRegs:$Vxx32),
+(ins VecDblRegs:$Vxx32in, VectorRegs:$Vu32),
+"$Vxx32.h |= vunpacko($Vu32.b)",
+CVI_VP_VS_LONG, TypeCVI_VP_VS>, Enc_12669374, Requires<[HasV60T,UseHVX]> {
+let Inst{7-5} = 0b000;
+let Inst{13-13} = 0b1;
+let Inst{31-16} = 0b0001111000000000;
+let hasNewValue = 1;
+let opNewValue = 0;
+let isAccumulator = 1;
+let DecoderNamespace = "EXT_mmvec";
+let Constraints = "$Vxx32 = $Vxx32in";
+}
+def V6_vunpackob_128B : HInst<
+(outs VecDblRegs128B:$Vxx32),
+(ins VecDblRegs128B:$Vxx32in, VectorRegs128B:$Vu32),
+"$Vxx32.h |= vunpacko($Vu32.b)",
+CVI_VP_VS_LONG, TypeCVI_VP_VS>, Enc_12669374, Requires<[HasV60T,UseHVX]> {
+let Inst{7-5} = 0b000;
+let Inst{13-13} = 0b1;
+let Inst{31-16} = 0b0001111000000000;
+let hasNewValue = 1;
+let opNewValue = 0;
+let isAccumulator = 1;
+let DecoderNamespace = "EXT_mmvec";
+let isCodeGenOnly = 1;
+let Constraints = "$Vxx32 = $Vxx32in";
+}
+def V6_vunpackob_alt : HInst<
+(outs VecDblRegs:$Vxx32),
+(ins VecDblRegs:$Vxx32in, VectorRegs:$Vu32),
+"$Vxx32 |= vunpackob($Vu32)",
+PSEUDO, TypeMAPPING>, Requires<[HasV60T,UseHVX]> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isAccumulator = 1;
+let isPseudo = 1;
+let DecoderNamespace = "EXT_mmvec";
+let Constraints = "$Vxx32 = $Vxx32in";
+}
+def V6_vunpackob_alt_128B : HInst<
+(outs VecDblRegs128B:$Vxx32),
+(ins VecDblRegs128B:$Vxx32in, VectorRegs128B:$Vu32),
+"$Vxx32 |= vunpackob($Vu32)",
+PSEUDO, TypeMAPPING>, Requires<[HasV60T,UseHVX]> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isAccumulator = 1;
+let isPseudo = 1;
+let DecoderNamespace = "EXT_mmvec";
+let isCodeGenOnly = 1;
+let Constraints = "$Vxx32 = $Vxx32in";
+}
+def V6_vunpackoh : HInst<
+(outs VecDblRegs:$Vxx32),
+(ins VecDblRegs:$Vxx32in, VectorRegs:$Vu32),
+"$Vxx32.w |= vunpacko($Vu32.h)",
+CVI_VP_VS_LONG, TypeCVI_VP_VS>, Enc_12669374, Requires<[HasV60T,UseHVX]> {
+let Inst{7-5} = 0b001;
+let Inst{13-13} = 0b1;
+let Inst{31-16} = 0b0001111000000000;
+let hasNewValue = 1;
+let opNewValue = 0;
+let isAccumulator = 1;
+let DecoderNamespace = "EXT_mmvec";
+let Constraints = "$Vxx32 = $Vxx32in";
+}
+def V6_vunpackoh_128B : HInst<
+(outs VecDblRegs128B:$Vxx32),
+(ins VecDblRegs128B:$Vxx32in, VectorRegs128B:$Vu32),
+"$Vxx32.w |= vunpacko($Vu32.h)",
+CVI_VP_VS_LONG, TypeCVI_VP_VS>, Enc_12669374, Requires<[HasV60T,UseHVX]> {
+let Inst{7-5} = 0b001;
+let Inst{13-13} = 0b1;
+let Inst{31-16} = 0b0001111000000000;
+let hasNewValue = 1;
+let opNewValue = 0;
+let isAccumulator = 1;
+let DecoderNamespace = "EXT_mmvec";
+let isCodeGenOnly = 1;
+let Constraints = "$Vxx32 = $Vxx32in";
+}
+def V6_vunpackoh_alt : HInst<
+(outs VecDblRegs:$Vxx32),
+(ins VecDblRegs:$Vxx32in, VectorRegs:$Vu32),
+"$Vxx32 |= vunpackoh($Vu32)",
+PSEUDO, TypeMAPPING>, Requires<[HasV60T,UseHVX]> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isAccumulator = 1;
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+let Constraints = "$Vxx32 = $Vxx32in";
+}
+def V6_vunpackoh_alt_128B : HInst<
+(outs VecDblRegs128B:$Vxx32),
+(ins VecDblRegs128B:$Vxx32in, VectorRegs128B:$Vu32),
+"$Vxx32 |= vunpackoh($Vu32)",
+PSEUDO, TypeMAPPING>, Requires<[HasV60T,UseHVX]> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isAccumulator = 1;
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+let Constraints = "$Vxx32 = $Vxx32in";
+}
+def V6_vunpackub : HInst<
+(outs VecDblRegs:$Vdd32),
+(ins VectorRegs:$Vu32),
+"$Vdd32.uh = vunpack($Vu32.ub)",
+CVI_VP_VS, TypeCVI_VP_VS>, Enc_14631806, Requires<[HasV60T,UseHVX]> {
+let Inst{7-5} = 0b000;
+let Inst{13-13} = 0b0;
+let Inst{31-16} = 0b0001111000000001;
+let hasNewValue = 1;
+let opNewValue = 0;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vunpackub_128B : HInst<
+(outs VecDblRegs128B:$Vdd32),
+(ins VectorRegs128B:$Vu32),
+"$Vdd32.uh = vunpack($Vu32.ub)",
+CVI_VP_VS, TypeCVI_VP_VS>, Enc_14631806, Requires<[HasV60T,UseHVX]> {
+let Inst{7-5} = 0b000;
+let Inst{13-13} = 0b0;
+let Inst{31-16} = 0b0001111000000001;
+let hasNewValue = 1;
+let opNewValue = 0;
+let DecoderNamespace = "EXT_mmvec";
+let isCodeGenOnly = 1;
+}
+def V6_vunpackub_alt : HInst<
+(outs VecDblRegs:$Vdd32),
+(ins VectorRegs:$Vu32),
+"$Vdd32 = vunpackub($Vu32)",
+PSEUDO, TypeMAPPING>, Requires<[HasV60T,UseHVX]> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vunpackub_alt_128B : HInst<
+(outs VecDblRegs128B:$Vdd32),
+(ins VectorRegs128B:$Vu32),
+"$Vdd32 = vunpackub($Vu32)",
+PSEUDO, TypeMAPPING>, Requires<[HasV60T,UseHVX]> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vunpackuh : HInst<
+(outs VecDblRegs:$Vdd32),
+(ins VectorRegs:$Vu32),
+"$Vdd32.uw = vunpack($Vu32.uh)",
+CVI_VP_VS, TypeCVI_VP_VS>, Enc_14631806, Requires<[HasV60T,UseHVX]> {
+let Inst{7-5} = 0b001;
+let Inst{13-13} = 0b0;
+let Inst{31-16} = 0b0001111000000001;
+let hasNewValue = 1;
+let opNewValue = 0;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vunpackuh_128B : HInst<
+(outs VecDblRegs128B:$Vdd32),
+(ins VectorRegs128B:$Vu32),
+"$Vdd32.uw = vunpack($Vu32.uh)",
+CVI_VP_VS, TypeCVI_VP_VS>, Enc_14631806, Requires<[HasV60T,UseHVX]> {
+let Inst{7-5} = 0b001;
+let Inst{13-13} = 0b0;
+let Inst{31-16} = 0b0001111000000001;
+let hasNewValue = 1;
+let opNewValue = 0;
+let DecoderNamespace = "EXT_mmvec";
+let isCodeGenOnly = 1;
+}
+def V6_vunpackuh_alt : HInst<
+(outs VecDblRegs:$Vdd32),
+(ins VectorRegs:$Vu32),
+"$Vdd32 = vunpackuh($Vu32)",
+PSEUDO, TypeMAPPING>, Requires<[HasV60T,UseHVX]> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vunpackuh_alt_128B : HInst<
+(outs VecDblRegs128B:$Vdd32),
+(ins VectorRegs128B:$Vu32),
+"$Vdd32 = vunpackuh($Vu32)",
+PSEUDO, TypeMAPPING>, Requires<[HasV60T,UseHVX]> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vwhist128 : HInst<
+(outs),
+(ins),
+"vwhist128",
+CVI_HIST, TypeCVI_HIST>, Enc_0, Requires<[HasV62T,UseHVX]> {
+let Inst{13-0} = 0b10010010000000;
+let Inst{31-16} = 0b0001111000000000;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vwhist128_128B : HInst<
+(outs),
+(ins),
+"vwhist128",
+CVI_HIST, TypeCVI_HIST>, Enc_0, Requires<[HasV62T,UseHVX]> {
+let Inst{13-0} = 0b10010010000000;
+let Inst{31-16} = 0b0001111000000000;
+let DecoderNamespace = "EXT_mmvec";
+let isCodeGenOnly = 1;
+}
+def V6_vwhist128m : HInst<
+(outs),
+(ins u1_0Imm:$Ii),
+"vwhist128(#$Ii)",
+CVI_HIST, TypeCVI_HIST>, Enc_1291652, Requires<[HasV62T,UseHVX]> {
+let Inst{7-0} = 0b10000000;
+let Inst{13-9} = 0b10011;
+let Inst{31-16} = 0b0001111000000000;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vwhist128m_128B : HInst<
+(outs),
+(ins u1_0Imm:$Ii),
+"vwhist128(#$Ii)",
+CVI_HIST, TypeCVI_HIST>, Enc_1291652, Requires<[HasV62T,UseHVX]> {
+let Inst{7-0} = 0b10000000;
+let Inst{13-9} = 0b10011;
+let Inst{31-16} = 0b0001111000000000;
+let DecoderNamespace = "EXT_mmvec";
+let isCodeGenOnly = 1;
+}
+def V6_vwhist128q : HInst<
+(outs),
+(ins VecPredRegs:$Qv4),
+"vwhist128($Qv4)",
+CVI_HIST, TypeCVI_HIST>, Enc_4109168, Requires<[HasV62T,UseHVX]> {
+let Inst{13-0} = 0b10010010000000;
+let Inst{21-16} = 0b000010;
+let Inst{31-24} = 0b00011110;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vwhist128q_128B : HInst<
+(outs),
+(ins VecPredRegs128B:$Qv4),
+"vwhist128($Qv4)",
+CVI_HIST, TypeCVI_HIST>, Enc_4109168, Requires<[HasV62T,UseHVX]> {
+let Inst{13-0} = 0b10010010000000;
+let Inst{21-16} = 0b000010;
+let Inst{31-24} = 0b00011110;
+let DecoderNamespace = "EXT_mmvec";
+let isCodeGenOnly = 1;
+}
+def V6_vwhist128qm : HInst<
+(outs),
+(ins VecPredRegs:$Qv4, u1_0Imm:$Ii),
+"vwhist128($Qv4,#$Ii)",
+CVI_HIST, TypeCVI_HIST>, Enc_7978128, Requires<[HasV62T,UseHVX]> {
+let Inst{7-0} = 0b10000000;
+let Inst{13-9} = 0b10011;
+let Inst{21-16} = 0b000010;
+let Inst{31-24} = 0b00011110;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vwhist128qm_128B : HInst<
+(outs),
+(ins VecPredRegs128B:$Qv4, u1_0Imm:$Ii),
+"vwhist128($Qv4,#$Ii)",
+CVI_HIST, TypeCVI_HIST>, Enc_7978128, Requires<[HasV62T,UseHVX]> {
+let Inst{7-0} = 0b10000000;
+let Inst{13-9} = 0b10011;
+let Inst{21-16} = 0b000010;
+let Inst{31-24} = 0b00011110;
+let DecoderNamespace = "EXT_mmvec";
+let isCodeGenOnly = 1;
+}
+def V6_vwhist256 : HInst<
+(outs),
+(ins),
+"vwhist256",
+CVI_HIST, TypeCVI_HIST>, Enc_0, Requires<[HasV62T,UseHVX]> {
+let Inst{13-0} = 0b10001010000000;
+let Inst{31-16} = 0b0001111000000000;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vwhist256_128B : HInst<
+(outs),
+(ins),
+"vwhist256",
+CVI_HIST, TypeCVI_HIST>, Enc_0, Requires<[HasV62T,UseHVX]> {
+let Inst{13-0} = 0b10001010000000;
+let Inst{31-16} = 0b0001111000000000;
+let DecoderNamespace = "EXT_mmvec";
+let isCodeGenOnly = 1;
+}
+def V6_vwhist256_sat : HInst<
+(outs),
+(ins),
+"vwhist256:sat",
+CVI_HIST, TypeCVI_HIST>, Enc_0, Requires<[HasV62T,UseHVX]> {
+let Inst{13-0} = 0b10001110000000;
+let Inst{31-16} = 0b0001111000000000;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vwhist256_sat_128B : HInst<
+(outs),
+(ins),
+"vwhist256:sat",
+CVI_HIST, TypeCVI_HIST>, Enc_0, Requires<[HasV62T,UseHVX]> {
+let Inst{13-0} = 0b10001110000000;
+let Inst{31-16} = 0b0001111000000000;
+let DecoderNamespace = "EXT_mmvec";
+let isCodeGenOnly = 1;
+}
+def V6_vwhist256q : HInst<
+(outs),
+(ins VecPredRegs:$Qv4),
+"vwhist256($Qv4)",
+CVI_HIST, TypeCVI_HIST>, Enc_4109168, Requires<[HasV62T,UseHVX]> {
+let Inst{13-0} = 0b10001010000000;
+let Inst{21-16} = 0b000010;
+let Inst{31-24} = 0b00011110;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vwhist256q_128B : HInst<
+(outs),
+(ins VecPredRegs128B:$Qv4),
+"vwhist256($Qv4)",
+CVI_HIST, TypeCVI_HIST>, Enc_4109168, Requires<[HasV62T,UseHVX]> {
+let Inst{13-0} = 0b10001010000000;
+let Inst{21-16} = 0b000010;
+let Inst{31-24} = 0b00011110;
+let DecoderNamespace = "EXT_mmvec";
+let isCodeGenOnly = 1;
+}
+def V6_vwhist256q_sat : HInst<
+(outs),
+(ins VecPredRegs:$Qv4),
+"vwhist256($Qv4):sat",
+CVI_HIST, TypeCVI_HIST>, Enc_4109168, Requires<[HasV62T,UseHVX]> {
+let Inst{13-0} = 0b10001110000000;
+let Inst{21-16} = 0b000010;
+let Inst{31-24} = 0b00011110;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vwhist256q_sat_128B : HInst<
+(outs),
+(ins VecPredRegs128B:$Qv4),
+"vwhist256($Qv4):sat",
+CVI_HIST, TypeCVI_HIST>, Enc_4109168, Requires<[HasV62T,UseHVX]> {
+let Inst{13-0} = 0b10001110000000;
+let Inst{21-16} = 0b000010;
+let Inst{31-24} = 0b00011110;
+let DecoderNamespace = "EXT_mmvec";
+let isCodeGenOnly = 1;
+}
+def V6_vxor : HInst<
+(outs VectorRegs:$Vd32),
+(ins VectorRegs:$Vu32, VectorRegs:$Vv32),
+"$Vd32 = vxor($Vu32,$Vv32)",
+CVI_VA, TypeCVI_VA>, Enc_6223403, Requires<[HasV60T,UseHVX]> {
+let Inst{7-5} = 0b111;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b00011100001;
+let hasNewValue = 1;
+let opNewValue = 0;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vxor_128B : HInst<
+(outs VectorRegs128B:$Vd32),
+(ins VectorRegs128B:$Vu32, VectorRegs128B:$Vv32),
+"$Vd32 = vxor($Vu32,$Vv32)",
+CVI_VA, TypeCVI_VA>, Enc_6223403, Requires<[HasV60T,UseHVX]> {
+let Inst{7-5} = 0b111;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b00011100001;
+let hasNewValue = 1;
+let opNewValue = 0;
+let DecoderNamespace = "EXT_mmvec";
+let isCodeGenOnly = 1;
+}
+def V6_vzb : HInst<
+(outs VecDblRegs:$Vdd32),
+(ins VectorRegs:$Vu32),
+"$Vdd32.uh = vzxt($Vu32.ub)",
+CVI_VA_DV, TypeCVI_VA_DV>, Enc_14631806, Requires<[HasV60T,UseHVX]> {
+let Inst{7-5} = 0b001;
+let Inst{13-13} = 0b0;
+let Inst{31-16} = 0b0001111000000010;
+let hasNewValue = 1;
+let opNewValue = 0;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vzb_128B : HInst<
+(outs VecDblRegs128B:$Vdd32),
+(ins VectorRegs128B:$Vu32),
+"$Vdd32.uh = vzxt($Vu32.ub)",
+CVI_VA_DV, TypeCVI_VA_DV>, Enc_14631806, Requires<[HasV60T,UseHVX]> {
+let Inst{7-5} = 0b001;
+let Inst{13-13} = 0b0;
+let Inst{31-16} = 0b0001111000000010;
+let hasNewValue = 1;
+let opNewValue = 0;
+let DecoderNamespace = "EXT_mmvec";
+let isCodeGenOnly = 1;
+}
+def V6_vzb_alt : HInst<
+(outs VecDblRegs:$Vdd32),
+(ins VectorRegs:$Vu32),
+"$Vdd32 = vzxtb($Vu32)",
+PSEUDO, TypeMAPPING>, Requires<[HasV60T,UseHVX]> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vzb_alt_128B : HInst<
+(outs VecDblRegs128B:$Vdd32),
+(ins VectorRegs128B:$Vu32),
+"$Vdd32 = vzxtb($Vu32)",
+PSEUDO, TypeMAPPING>, Requires<[HasV60T,UseHVX]> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vzh : HInst<
+(outs VecDblRegs:$Vdd32),
+(ins VectorRegs:$Vu32),
+"$Vdd32.uw = vzxt($Vu32.uh)",
+CVI_VA_DV, TypeCVI_VA_DV>, Enc_14631806, Requires<[HasV60T,UseHVX]> {
+let Inst{7-5} = 0b010;
+let Inst{13-13} = 0b0;
+let Inst{31-16} = 0b0001111000000010;
+let hasNewValue = 1;
+let opNewValue = 0;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vzh_128B : HInst<
+(outs VecDblRegs128B:$Vdd32),
+(ins VectorRegs128B:$Vu32),
+"$Vdd32.uw = vzxt($Vu32.uh)",
+CVI_VA_DV, TypeCVI_VA_DV>, Enc_14631806, Requires<[HasV60T,UseHVX]> {
+let Inst{7-5} = 0b010;
+let Inst{13-13} = 0b0;
+let Inst{31-16} = 0b0001111000000010;
+let hasNewValue = 1;
+let opNewValue = 0;
+let DecoderNamespace = "EXT_mmvec";
+let isCodeGenOnly = 1;
+}
+def V6_vzh_alt : HInst<
+(outs VecDblRegs:$Vdd32),
+(ins VectorRegs:$Vu32),
+"$Vdd32 = vzxth($Vu32)",
+PSEUDO, TypeMAPPING>, Requires<[HasV60T,UseHVX]> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vzh_alt_128B : HInst<
+(outs VecDblRegs128B:$Vdd32),
+(ins VectorRegs128B:$Vu32),
+"$Vdd32 = vzxth($Vu32)",
+PSEUDO, TypeMAPPING>, Requires<[HasV60T,UseHVX]> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+}
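+// System, cache-maintenance, and prefetch instructions. The isSolo,
+// isSoloAX, and isSoloAin1 flags restrict how these instructions may be
+// packetized with others.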
+def Y2_barrier : HInst<
+(outs),
+(ins),
+"barrier",
+ST_tc_3stall_SLOT0, TypeST>, Enc_0 {
+let Inst{13-0} = 0b00000000000000;
+let Inst{31-16} = 0b1010100000000000;
+let isSoloAX = 1;
+let hasSideEffects = 1;
+}
+def Y2_break : HInst<
+(outs),
+(ins),
+"brkpt",
+CR_tc_3x_SLOT3, TypeCR>, Enc_0 {
+let Inst{13-0} = 0b00000000000000;
+let Inst{31-16} = 0b0110110000100000;
+let isSolo = 1;
+}
+def Y2_dccleana : HInst<
+(outs),
+(ins IntRegs:$Rs32),
+"dccleana($Rs32)",
+ST_tc_ld_SLOT0, TypeST>, Enc_11704059 {
+let Inst{13-0} = 0b00000000000000;
+let Inst{31-21} = 0b10100000000;
+let isSoloAin1 = 1;
+}
+def Y2_dccleaninva : HInst<
+(outs),
+(ins IntRegs:$Rs32),
+"dccleaninva($Rs32)",
+ST_tc_ld_SLOT0, TypeST>, Enc_11704059 {
+let Inst{13-0} = 0b00000000000000;
+let Inst{31-21} = 0b10100000010;
+let isSoloAin1 = 1;
+}
+def Y2_dcfetch : HInst<
+(outs),
+(ins IntRegs:$Rs32),
+"dcfetch($Rs32)",
+PSEUDO, TypeMAPPING> {
+let hasSideEffects = 1;
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+}
+def Y2_dcfetchbo : HInst<
+(outs),
+(ins IntRegs:$Rs32, u11_3Imm:$Ii),
+"dcfetch($Rs32+#$Ii)",
+LD_tc_ld_SLOT0, TypeLD>, Enc_4983213 {
+let Inst{13-11} = 0b000;
+let Inst{31-21} = 0b10010100000;
+let addrMode = BaseImmOffset;
+let hasSideEffects = 1;
+}
+def Y2_dcinva : HInst<
+(outs),
+(ins IntRegs:$Rs32),
+"dcinva($Rs32)",
+ST_tc_ld_SLOT0, TypeST>, Enc_11704059 {
+let Inst{13-0} = 0b00000000000000;
+let Inst{31-21} = 0b10100000001;
+let isSoloAin1 = 1;
+}
+def Y2_dczeroa : HInst<
+(outs),
+(ins IntRegs:$Rs32),
+"dczeroa($Rs32)",
+ST_tc_ld_SLOT0, TypeST>, Enc_11704059 {
+let Inst{13-0} = 0b00000000000000;
+let Inst{31-21} = 0b10100000110;
+let mayStore = 1;
+let isSoloAin1 = 1;
+}
+def Y2_icinva : HInst<
+(outs),
+(ins IntRegs:$Rs32),
+"icinva($Rs32)",
+J_tc_2early_SLOT2, TypeJ>, Enc_11704059 {
+let Inst{13-0} = 0b00000000000000;
+let Inst{31-21} = 0b01010110110;
+let isSolo = 1;
+}
+def Y2_isync : HInst<
+(outs),
+(ins),
+"isync",
+J_tc_2early_SLOT2, TypeJ>, Enc_0 {
+let Inst{13-0} = 0b00000000000010;
+let Inst{31-16} = 0b0101011111000000;
+let isSolo = 1;
+}
+def Y2_syncht : HInst<
+(outs),
+(ins),
+"syncht",
+ST_tc_ld_SLOT0, TypeST>, Enc_0 {
+let Inst{13-0} = 0b00000000000000;
+let Inst{31-16} = 0b1010100001000000;
+let isSolo = 1;
+}
+def Y4_l2fetch : HInst<
+(outs),
+(ins IntRegs:$Rs32, IntRegs:$Rt32),
+"l2fetch($Rs32,$Rt32)",
+ST_tc_3stall_SLOT0, TypeST>, Enc_14620934 {
+let Inst{7-0} = 0b00000000;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b10100110000;
+let isSoloAX = 1;
+let mayStore = 1;
+let hasSideEffects = 1;
+}
+def Y4_trace : HInst<
+(outs),
+(ins IntRegs:$Rs32),
+"trace($Rs32)",
+CR_tc_2early_SLOT3, TypeCR>, Enc_11704059 {
+let Inst{13-0} = 0b00000000000000;
+let Inst{31-21} = 0b01100010010;
+let isSoloAX = 1;
+}
+def Y5_l2fetch : HInst<
+(outs),
+(ins IntRegs:$Rs32, DoubleRegs:$Rtt32),
+"l2fetch($Rs32,$Rtt32)",
+ST_tc_3stall_SLOT0, TypeST>, Enc_8943121, Requires<[HasV5T]> {
+let Inst{7-0} = 0b00000000;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b10100110100;
+let isSoloAX = 1;
+let mayStore = 1;
+let hasSideEffects = 1;
+}
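+// Deprecated forms, retained so the assembler still accepts the
+// ":deprecated" syntax; the saturating variants define the USR_OVF
+// overflow bit.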
+def dep_A2_addsat : HInst<
+(outs IntRegs:$Rd32),
+(ins IntRegs:$Rs32, IntRegs:$Rt32),
+"$Rd32 = add($Rs32,$Rt32):sat:deprecated",
+ALU64_tc_2_SLOT23, TypeALU64>, Enc_14071773 {
+let Inst{7-5} = 0b000;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11010101100;
+let hasNewValue = 1;
+let opNewValue = 0;
+let Defs = [USR_OVF];
+}
+def dep_A2_subsat : HInst<
+(outs IntRegs:$Rd32),
+(ins IntRegs:$Rt32, IntRegs:$Rs32),
+"$Rd32 = sub($Rt32,$Rs32):sat:deprecated",
+ALU64_tc_2_SLOT23, TypeALU64>, Enc_8605375 {
+let Inst{7-5} = 0b100;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11010101100;
+let hasNewValue = 1;
+let opNewValue = 0;
+let Defs = [USR_OVF];
+}
+def dep_S2_packhl : HInst<
+(outs DoubleRegs:$Rdd32),
+(ins IntRegs:$Rs32, IntRegs:$Rt32),
+"$Rdd32 = packhl($Rs32,$Rt32):deprecated",
+ALU64_tc_1_SLOT23, TypeALU64>, Enc_1997594 {
+let Inst{7-5} = 0b000;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11010100000;
+}
diff --git a/lib/Target/Hexagon/HexagonDepMappings.td b/lib/Target/Hexagon/HexagonDepMappings.td
new file mode 100644
index 000000000000..77a56a9adf10
--- /dev/null
+++ b/lib/Target/Hexagon/HexagonDepMappings.td
@@ -0,0 +1,659 @@
+//===--- HexagonDepMappings.td --------------------------------------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
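+// These InstAlias records map alternate or shorthand assembler syntax onto
+// canonical instruction definitions; for example, the "_zomap" aliases
+// expand zero-offset memory forms such as "memb($Rs32)" into the
+// corresponding base+immediate "_io" instructions with an offset of 0.
+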
+def A2_negAlias : InstAlias<"$Rd32=neg($Rs32)", (A2_subri IntRegs:$Rd32, 0, IntRegs:$Rs32)>;
+def A2_notAlias : InstAlias<"$Rd32=not($Rs32)", (A2_subri IntRegs:$Rd32, -1, IntRegs:$Rs32)>;
+def A2_tfrfAlias : InstAlias<"if (!$Pu4) $Rd32=$Rs32", (A2_paddif IntRegs:$Rd32, PredRegs:$Pu4, IntRegs:$Rs32, 0)>;
+def A2_tfrfnewAlias : InstAlias<"if (!$Pu4.new) $Rd32=$Rs32", (A2_paddifnew IntRegs:$Rd32, PredRegs:$Pu4, IntRegs:$Rs32, 0)>;
+def A2_tfrtAlias : InstAlias<"if ($Pu4) $Rd32=$Rs32", (A2_paddit IntRegs:$Rd32, PredRegs:$Pu4, IntRegs:$Rs32, 0)>;
+def A2_tfrtnewAlias : InstAlias<"if ($Pu4.new) $Rd32=$Rs32", (A2_padditnew IntRegs:$Rd32, PredRegs:$Pu4, IntRegs:$Rs32, 0)>;
+def A2_vaddb_mapAlias : InstAlias<"$Rdd32=vaddb($Rss32,$Rtt32)", (A2_vaddub DoubleRegs:$Rdd32, DoubleRegs:$Rss32, DoubleRegs:$Rtt32)>;
+def A2_vsubb_mapAlias : InstAlias<"$Rdd32=vsubb($Rss32,$Rtt32)", (A2_vsubub DoubleRegs:$Rdd32, DoubleRegs:$Rss32, DoubleRegs:$Rtt32)>;
+def A2_zxtbAlias : InstAlias<"$Rd32=zxtb($Rs32)", (A2_andir IntRegs:$Rd32, IntRegs:$Rs32, 255)>;
+def C2_cmpltAlias : InstAlias<"$Pd4=cmp.lt($Rs32,$Rt32)", (C2_cmpgt PredRegs:$Pd4, IntRegs:$Rt32, IntRegs:$Rs32)>;
+def C2_cmpltuAlias : InstAlias<"$Pd4=cmp.ltu($Rs32,$Rt32)", (C2_cmpgtu PredRegs:$Pd4, IntRegs:$Rt32, IntRegs:$Rs32)>;
+def C2_pxfer_mapAlias : InstAlias<"$Pd4=$Ps4", (C2_or PredRegs:$Pd4, PredRegs:$Ps4, PredRegs:$Ps4)>;
+def J2_jumpf_nopred_mapAlias : InstAlias<"if (!$Pu4) jump $Ii", (J2_jumpf PredRegs:$Pu4, b30_2Imm:$Ii)>;
+def J2_jumprf_nopred_mapAlias : InstAlias<"if (!$Pu4) jumpr $Rs32", (J2_jumprf PredRegs:$Pu4, IntRegs:$Rs32)>;
+def J2_jumprt_nopred_mapAlias : InstAlias<"if ($Pu4) jumpr $Rs32", (J2_jumprt PredRegs:$Pu4, IntRegs:$Rs32)>;
+def J2_jumpt_nopred_mapAlias : InstAlias<"if ($Pu4) jump $Ii", (J2_jumpt PredRegs:$Pu4, b30_2Imm:$Ii)>;
+def L2_loadalignb_zomapAlias : InstAlias<"$Ryy32=memb_fifo($Rs32)", (L2_loadalignb_io DoubleRegs:$Ryy32, IntRegs:$Rs32, 0)>;
+def L2_loadalignh_zomapAlias : InstAlias<"$Ryy32=memh_fifo($Rs32)", (L2_loadalignh_io DoubleRegs:$Ryy32, IntRegs:$Rs32, 0)>;
+def L2_loadbsw2_zomapAlias : InstAlias<"$Rd32=membh($Rs32)", (L2_loadbsw2_io IntRegs:$Rd32, IntRegs:$Rs32, 0)>;
+def L2_loadbsw4_zomapAlias : InstAlias<"$Rdd32=membh($Rs32)", (L2_loadbsw4_io DoubleRegs:$Rdd32, IntRegs:$Rs32, 0)>;
+def L2_loadbzw2_zomapAlias : InstAlias<"$Rd32=memubh($Rs32)", (L2_loadbzw2_io IntRegs:$Rd32, IntRegs:$Rs32, 0)>;
+def L2_loadbzw4_zomapAlias : InstAlias<"$Rdd32=memubh($Rs32)", (L2_loadbzw4_io DoubleRegs:$Rdd32, IntRegs:$Rs32, 0)>;
+def L2_loadrb_zomapAlias : InstAlias<"$Rd32=memb($Rs32)", (L2_loadrb_io IntRegs:$Rd32, IntRegs:$Rs32, 0)>;
+def L2_loadrd_zomapAlias : InstAlias<"$Rdd32=memd($Rs32)", (L2_loadrd_io DoubleRegs:$Rdd32, IntRegs:$Rs32, 0)>;
+def L2_loadrh_zomapAlias : InstAlias<"$Rd32=memh($Rs32)", (L2_loadrh_io IntRegs:$Rd32, IntRegs:$Rs32, 0)>;
+def L2_loadri_zomapAlias : InstAlias<"$Rd32=memw($Rs32)", (L2_loadri_io IntRegs:$Rd32, IntRegs:$Rs32, 0)>;
+def L2_loadrub_zomapAlias : InstAlias<"$Rd32=memub($Rs32)", (L2_loadrub_io IntRegs:$Rd32, IntRegs:$Rs32, 0)>;
+def L2_loadruh_zomapAlias : InstAlias<"$Rd32=memuh($Rs32)", (L2_loadruh_io IntRegs:$Rd32, IntRegs:$Rs32, 0)>;
+def L2_ploadrbf_zomapAlias : InstAlias<"if (!$Pt4) $Rd32=memb($Rs32)", (L2_ploadrbf_io IntRegs:$Rd32, PredRegs:$Pt4, IntRegs:$Rs32, 0)>;
+def L2_ploadrbfnew_zomapAlias : InstAlias<"if (!$Pt4.new) $Rd32=memb($Rs32)", (L2_ploadrbfnew_io IntRegs:$Rd32, PredRegs:$Pt4, IntRegs:$Rs32, 0)>;
+def L2_ploadrbt_zomapAlias : InstAlias<"if ($Pt4) $Rd32=memb($Rs32)", (L2_ploadrbt_io IntRegs:$Rd32, PredRegs:$Pt4, IntRegs:$Rs32, 0)>;
+def L2_ploadrbtnew_zomapAlias : InstAlias<"if ($Pt4.new) $Rd32=memb($Rs32)", (L2_ploadrbtnew_io IntRegs:$Rd32, PredRegs:$Pt4, IntRegs:$Rs32, 0)>;
+def L2_ploadrdf_zomapAlias : InstAlias<"if (!$Pt4) $Rdd32=memd($Rs32)", (L2_ploadrdf_io DoubleRegs:$Rdd32, PredRegs:$Pt4, IntRegs:$Rs32, 0)>;
+def L2_ploadrdfnew_zomapAlias : InstAlias<"if (!$Pt4.new) $Rdd32=memd($Rs32)", (L2_ploadrdfnew_io DoubleRegs:$Rdd32, PredRegs:$Pt4, IntRegs:$Rs32, 0)>;
+def L2_ploadrdt_zomapAlias : InstAlias<"if ($Pt4) $Rdd32=memd($Rs32)", (L2_ploadrdt_io DoubleRegs:$Rdd32, PredRegs:$Pt4, IntRegs:$Rs32, 0)>;
+def L2_ploadrdtnew_zomapAlias : InstAlias<"if ($Pt4.new) $Rdd32=memd($Rs32)", (L2_ploadrdtnew_io DoubleRegs:$Rdd32, PredRegs:$Pt4, IntRegs:$Rs32, 0)>;
+def L2_ploadrhf_zomapAlias : InstAlias<"if (!$Pt4) $Rd32=memh($Rs32)", (L2_ploadrhf_io IntRegs:$Rd32, PredRegs:$Pt4, IntRegs:$Rs32, 0)>;
+def L2_ploadrhfnew_zomapAlias : InstAlias<"if (!$Pt4.new) $Rd32=memh($Rs32)", (L2_ploadrhfnew_io IntRegs:$Rd32, PredRegs:$Pt4, IntRegs:$Rs32, 0)>;
+def L2_ploadrht_zomapAlias : InstAlias<"if ($Pt4) $Rd32=memh($Rs32)", (L2_ploadrht_io IntRegs:$Rd32, PredRegs:$Pt4, IntRegs:$Rs32, 0)>;
+def L2_ploadrhtnew_zomapAlias : InstAlias<"if ($Pt4.new) $Rd32=memh($Rs32)", (L2_ploadrhtnew_io IntRegs:$Rd32, PredRegs:$Pt4, IntRegs:$Rs32, 0)>;
+def L2_ploadrif_zomapAlias : InstAlias<"if (!$Pt4) $Rd32=memw($Rs32)", (L2_ploadrif_io IntRegs:$Rd32, PredRegs:$Pt4, IntRegs:$Rs32, 0)>;
+def L2_ploadrifnew_zomapAlias : InstAlias<"if (!$Pt4.new) $Rd32=memw($Rs32)", (L2_ploadrifnew_io IntRegs:$Rd32, PredRegs:$Pt4, IntRegs:$Rs32, 0)>;
+def L2_ploadrit_zomapAlias : InstAlias<"if ($Pt4) $Rd32=memw($Rs32)", (L2_ploadrit_io IntRegs:$Rd32, PredRegs:$Pt4, IntRegs:$Rs32, 0)>;
+def L2_ploadritnew_zomapAlias : InstAlias<"if ($Pt4.new) $Rd32=memw($Rs32)", (L2_ploadritnew_io IntRegs:$Rd32, PredRegs:$Pt4, IntRegs:$Rs32, 0)>;
+def L2_ploadrubf_zomapAlias : InstAlias<"if (!$Pt4) $Rd32=memub($Rs32)", (L2_ploadrubf_io IntRegs:$Rd32, PredRegs:$Pt4, IntRegs:$Rs32, 0)>;
+def L2_ploadrubfnew_zomapAlias : InstAlias<"if (!$Pt4.new) $Rd32=memub($Rs32)", (L2_ploadrubfnew_io IntRegs:$Rd32, PredRegs:$Pt4, IntRegs:$Rs32, 0)>;
+def L2_ploadrubt_zomapAlias : InstAlias<"if ($Pt4) $Rd32=memub($Rs32)", (L2_ploadrubt_io IntRegs:$Rd32, PredRegs:$Pt4, IntRegs:$Rs32, 0)>;
+def L2_ploadrubtnew_zomapAlias : InstAlias<"if ($Pt4.new) $Rd32=memub($Rs32)", (L2_ploadrubtnew_io IntRegs:$Rd32, PredRegs:$Pt4, IntRegs:$Rs32, 0)>;
+def L2_ploadruhf_zomapAlias : InstAlias<"if (!$Pt4) $Rd32=memuh($Rs32)", (L2_ploadruhf_io IntRegs:$Rd32, PredRegs:$Pt4, IntRegs:$Rs32, 0)>;
+def L2_ploadruhfnew_zomapAlias : InstAlias<"if (!$Pt4.new) $Rd32=memuh($Rs32)", (L2_ploadruhfnew_io IntRegs:$Rd32, PredRegs:$Pt4, IntRegs:$Rs32, 0)>;
+def L2_ploadruht_zomapAlias : InstAlias<"if ($Pt4) $Rd32=memuh($Rs32)", (L2_ploadruht_io IntRegs:$Rd32, PredRegs:$Pt4, IntRegs:$Rs32, 0)>;
+def L2_ploadruhtnew_zomapAlias : InstAlias<"if ($Pt4.new) $Rd32=memuh($Rs32)", (L2_ploadruhtnew_io IntRegs:$Rd32, PredRegs:$Pt4, IntRegs:$Rs32, 0)>;
+def L4_add_memopb_zomapAlias : InstAlias<"memb($Rs32)+=$Rt32", (L4_add_memopb_io IntRegs:$Rs32, 0, IntRegs:$Rt32)>;
+def L4_add_memoph_zomapAlias : InstAlias<"memh($Rs32)+=$Rt32", (L4_add_memoph_io IntRegs:$Rs32, 0, IntRegs:$Rt32)>;
+def L4_add_memopw_zomapAlias : InstAlias<"memw($Rs32)+=$Rt32", (L4_add_memopw_io IntRegs:$Rs32, 0, IntRegs:$Rt32)>;
+def L4_and_memopb_zomapAlias : InstAlias<"memb($Rs32)&=$Rt32", (L4_and_memopb_io IntRegs:$Rs32, 0, IntRegs:$Rt32)>;
+def L4_and_memoph_zomapAlias : InstAlias<"memh($Rs32)&=$Rt32", (L4_and_memoph_io IntRegs:$Rs32, 0, IntRegs:$Rt32)>;
+def L4_and_memopw_zomapAlias : InstAlias<"memw($Rs32)&=$Rt32", (L4_and_memopw_io IntRegs:$Rs32, 0, IntRegs:$Rt32)>;
+def L4_iadd_memopb_zomapAlias : InstAlias<"memb($Rs32)+=#$II", (L4_iadd_memopb_io IntRegs:$Rs32, 0, u5_0Imm:$II)>;
+def L4_iadd_memoph_zomapAlias : InstAlias<"memh($Rs32)+=#$II", (L4_iadd_memoph_io IntRegs:$Rs32, 0, u5_0Imm:$II)>;
+def L4_iadd_memopw_zomapAlias : InstAlias<"memw($Rs32)+=#$II", (L4_iadd_memopw_io IntRegs:$Rs32, 0, u5_0Imm:$II)>;
+def L4_iand_memopb_zomapAlias : InstAlias<"memb($Rs32)=clrbit(#$II)", (L4_iand_memopb_io IntRegs:$Rs32, 0, u5_0Imm:$II)>;
+def L4_iand_memoph_zomapAlias : InstAlias<"memh($Rs32)=clrbit(#$II)", (L4_iand_memoph_io IntRegs:$Rs32, 0, u5_0Imm:$II)>;
+def L4_iand_memopw_zomapAlias : InstAlias<"memw($Rs32)=clrbit(#$II)", (L4_iand_memopw_io IntRegs:$Rs32, 0, u5_0Imm:$II)>;
+def L4_ior_memopb_zomapAlias : InstAlias<"memb($Rs32)=setbit(#$II)", (L4_ior_memopb_io IntRegs:$Rs32, 0, u5_0Imm:$II)>;
+def L4_ior_memoph_zomapAlias : InstAlias<"memh($Rs32)=setbit(#$II)", (L4_ior_memoph_io IntRegs:$Rs32, 0, u5_0Imm:$II)>;
+def L4_ior_memopw_zomapAlias : InstAlias<"memw($Rs32)=setbit(#$II)", (L4_ior_memopw_io IntRegs:$Rs32, 0, u5_0Imm:$II)>;
+def L4_isub_memopb_zomapAlias : InstAlias<"memb($Rs32)-=#$II", (L4_isub_memopb_io IntRegs:$Rs32, 0, u5_0Imm:$II)>;
+def L4_isub_memoph_zomapAlias : InstAlias<"memh($Rs32)-=#$II", (L4_isub_memoph_io IntRegs:$Rs32, 0, u5_0Imm:$II)>;
+def L4_isub_memopw_zomapAlias : InstAlias<"memw($Rs32)-=#$II", (L4_isub_memopw_io IntRegs:$Rs32, 0, u5_0Imm:$II)>;
+def L4_or_memopb_zomapAlias : InstAlias<"memb($Rs32)|=$Rt32", (L4_or_memopb_io IntRegs:$Rs32, 0, IntRegs:$Rt32)>;
+def L4_or_memoph_zomapAlias : InstAlias<"memh($Rs32)|=$Rt32", (L4_or_memoph_io IntRegs:$Rs32, 0, IntRegs:$Rt32)>;
+def L4_or_memopw_zomapAlias : InstAlias<"memw($Rs32)|=$Rt32", (L4_or_memopw_io IntRegs:$Rs32, 0, IntRegs:$Rt32)>;
+def L4_sub_memopb_zomapAlias : InstAlias<"memb($Rs32)-=$Rt32", (L4_sub_memopb_io IntRegs:$Rs32, 0, IntRegs:$Rt32)>;
+def L4_sub_memoph_zomapAlias : InstAlias<"memh($Rs32)-=$Rt32", (L4_sub_memoph_io IntRegs:$Rs32, 0, IntRegs:$Rt32)>;
+def L4_sub_memopw_zomapAlias : InstAlias<"memw($Rs32)-=$Rt32", (L4_sub_memopw_io IntRegs:$Rs32, 0, IntRegs:$Rt32)>;
+def M2_mpyuiAlias : InstAlias<"$Rd32=mpyui($Rs32,$Rt32)", (M2_mpyi IntRegs:$Rd32, IntRegs:$Rs32, IntRegs:$Rt32)>;
+def S2_pstorerbf_zomapAlias : InstAlias<"if (!$Pv4) memb($Rs32)=$Rt32", (S2_pstorerbf_io PredRegs:$Pv4, IntRegs:$Rs32, 0, IntRegs:$Rt32)>;
+def S2_pstorerbnewf_zomapAlias : InstAlias<"if (!$Pv4) memb($Rs32)=$Nt8.new", (S2_pstorerbnewf_io PredRegs:$Pv4, IntRegs:$Rs32, 0, IntRegs:$Nt8)>;
+def S2_pstorerbnewt_zomapAlias : InstAlias<"if ($Pv4) memb($Rs32)=$Nt8.new", (S2_pstorerbnewt_io PredRegs:$Pv4, IntRegs:$Rs32, 0, IntRegs:$Nt8)>;
+def S2_pstorerbt_zomapAlias : InstAlias<"if ($Pv4) memb($Rs32)=$Rt32", (S2_pstorerbt_io PredRegs:$Pv4, IntRegs:$Rs32, 0, IntRegs:$Rt32)>;
+def S2_pstorerdf_zomapAlias : InstAlias<"if (!$Pv4) memd($Rs32)=$Rtt32", (S2_pstorerdf_io PredRegs:$Pv4, IntRegs:$Rs32, 0, DoubleRegs:$Rtt32)>;
+def S2_pstorerdt_zomapAlias : InstAlias<"if ($Pv4) memd($Rs32)=$Rtt32", (S2_pstorerdt_io PredRegs:$Pv4, IntRegs:$Rs32, 0, DoubleRegs:$Rtt32)>;
+def S2_pstorerff_zomapAlias : InstAlias<"if (!$Pv4) memh($Rs32)=$Rt32.h", (S2_pstorerff_io PredRegs:$Pv4, IntRegs:$Rs32, 0, IntRegs:$Rt32)>;
+def S2_pstorerft_zomapAlias : InstAlias<"if ($Pv4) memh($Rs32)=$Rt32.h", (S2_pstorerft_io PredRegs:$Pv4, IntRegs:$Rs32, 0, IntRegs:$Rt32)>;
+def S2_pstorerhf_zomapAlias : InstAlias<"if (!$Pv4) memh($Rs32)=$Rt32", (S2_pstorerhf_io PredRegs:$Pv4, IntRegs:$Rs32, 0, IntRegs:$Rt32)>;
+def S2_pstorerhnewf_zomapAlias : InstAlias<"if (!$Pv4) memh($Rs32)=$Nt8.new", (S2_pstorerhnewf_io PredRegs:$Pv4, IntRegs:$Rs32, 0, IntRegs:$Nt8)>;
+def S2_pstorerhnewt_zomapAlias : InstAlias<"if ($Pv4) memh($Rs32)=$Nt8.new", (S2_pstorerhnewt_io PredRegs:$Pv4, IntRegs:$Rs32, 0, IntRegs:$Nt8)>;
+def S2_pstorerht_zomapAlias : InstAlias<"if ($Pv4) memh($Rs32)=$Rt32", (S2_pstorerht_io PredRegs:$Pv4, IntRegs:$Rs32, 0, IntRegs:$Rt32)>;
+def S2_pstorerif_zomapAlias : InstAlias<"if (!$Pv4) memw($Rs32)=$Rt32", (S2_pstorerif_io PredRegs:$Pv4, IntRegs:$Rs32, 0, IntRegs:$Rt32)>;
+def S2_pstorerinewf_zomapAlias : InstAlias<"if (!$Pv4) memw($Rs32)=$Nt8.new", (S2_pstorerinewf_io PredRegs:$Pv4, IntRegs:$Rs32, 0, IntRegs:$Nt8)>;
+def S2_pstorerinewt_zomapAlias : InstAlias<"if ($Pv4) memw($Rs32)=$Nt8.new", (S2_pstorerinewt_io PredRegs:$Pv4, IntRegs:$Rs32, 0, IntRegs:$Nt8)>;
+def S2_pstorerit_zomapAlias : InstAlias<"if ($Pv4) memw($Rs32)=$Rt32", (S2_pstorerit_io PredRegs:$Pv4, IntRegs:$Rs32, 0, IntRegs:$Rt32)>;
+def S2_storerb_zomapAlias : InstAlias<"memb($Rs32)=$Rt32", (S2_storerb_io IntRegs:$Rs32, 0, IntRegs:$Rt32)>;
+def S2_storerbnew_zomapAlias : InstAlias<"memb($Rs32)=$Nt8.new", (S2_storerbnew_io IntRegs:$Rs32, 0, IntRegs:$Nt8)>;
+def S2_storerd_zomapAlias : InstAlias<"memd($Rs32)=$Rtt32", (S2_storerd_io IntRegs:$Rs32, 0, DoubleRegs:$Rtt32)>;
+def S2_storerf_zomapAlias : InstAlias<"memh($Rs32)=$Rt32.h", (S2_storerf_io IntRegs:$Rs32, 0, IntRegs:$Rt32)>;
+def S2_storerh_zomapAlias : InstAlias<"memh($Rs32)=$Rt32", (S2_storerh_io IntRegs:$Rs32, 0, IntRegs:$Rt32)>;
+def S2_storerhnew_zomapAlias : InstAlias<"memh($Rs32)=$Nt8.new", (S2_storerhnew_io IntRegs:$Rs32, 0, IntRegs:$Nt8)>;
+def S2_storeri_zomapAlias : InstAlias<"memw($Rs32)=$Rt32", (S2_storeri_io IntRegs:$Rs32, 0, IntRegs:$Rt32)>;
+def S2_storerinew_zomapAlias : InstAlias<"memw($Rs32)=$Nt8.new", (S2_storerinew_io IntRegs:$Rs32, 0, IntRegs:$Nt8)>;
+def S2_tableidxb_goodsyntaxAlias : InstAlias<"$Rx32=tableidxb($Rs32,#$Ii,#$II)", (S2_tableidxb IntRegs:$Rx32, IntRegs:$Rs32, u4_0Imm:$Ii, u5_0Imm:$II)>;
+def S4_pstorerbfnew_zomapAlias : InstAlias<"if (!$Pv4.new) memb($Rs32)=$Rt32", (S4_pstorerbfnew_io PredRegs:$Pv4, IntRegs:$Rs32, 0, IntRegs:$Rt32)>;
+def S4_pstorerbnewfnew_zomapAlias : InstAlias<"if (!$Pv4.new) memb($Rs32)=$Nt8.new", (S4_pstorerbnewfnew_io PredRegs:$Pv4, IntRegs:$Rs32, 0, IntRegs:$Nt8)>;
+def S4_pstorerbnewtnew_zomapAlias : InstAlias<"if ($Pv4.new) memb($Rs32)=$Nt8.new", (S4_pstorerbnewtnew_io PredRegs:$Pv4, IntRegs:$Rs32, 0, IntRegs:$Nt8)>;
+def S4_pstorerbtnew_zomapAlias : InstAlias<"if ($Pv4.new) memb($Rs32)=$Rt32", (S4_pstorerbtnew_io PredRegs:$Pv4, IntRegs:$Rs32, 0, IntRegs:$Rt32)>;
+def S4_pstorerdfnew_zomapAlias : InstAlias<"if (!$Pv4.new) memd($Rs32)=$Rtt32", (S4_pstorerdfnew_io PredRegs:$Pv4, IntRegs:$Rs32, 0, DoubleRegs:$Rtt32)>;
+def S4_pstorerdtnew_zomapAlias : InstAlias<"if ($Pv4.new) memd($Rs32)=$Rtt32", (S4_pstorerdtnew_io PredRegs:$Pv4, IntRegs:$Rs32, 0, DoubleRegs:$Rtt32)>;
+def S4_pstorerffnew_zomapAlias : InstAlias<"if (!$Pv4.new) memh($Rs32)=$Rt32.h", (S4_pstorerffnew_io PredRegs:$Pv4, IntRegs:$Rs32, 0, IntRegs:$Rt32)>;
+def S4_pstorerftnew_zomapAlias : InstAlias<"if ($Pv4.new) memh($Rs32)=$Rt32.h", (S4_pstorerftnew_io PredRegs:$Pv4, IntRegs:$Rs32, 0, IntRegs:$Rt32)>;
+def S4_pstorerhfnew_zomapAlias : InstAlias<"if (!$Pv4.new) memh($Rs32)=$Rt32", (S4_pstorerhfnew_io PredRegs:$Pv4, IntRegs:$Rs32, 0, IntRegs:$Rt32)>;
+def S4_pstorerhnewfnew_zomapAlias : InstAlias<"if (!$Pv4.new) memh($Rs32)=$Nt8.new", (S4_pstorerhnewfnew_io PredRegs:$Pv4, IntRegs:$Rs32, 0, IntRegs:$Nt8)>;
+def S4_pstorerhnewtnew_zomapAlias : InstAlias<"if ($Pv4.new) memh($Rs32)=$Nt8.new", (S4_pstorerhnewtnew_io PredRegs:$Pv4, IntRegs:$Rs32, 0, IntRegs:$Nt8)>;
+def S4_pstorerhtnew_zomapAlias : InstAlias<"if ($Pv4.new) memh($Rs32)=$Rt32", (S4_pstorerhtnew_io PredRegs:$Pv4, IntRegs:$Rs32, 0, IntRegs:$Rt32)>;
+def S4_pstorerifnew_zomapAlias : InstAlias<"if (!$Pv4.new) memw($Rs32)=$Rt32", (S4_pstorerifnew_io PredRegs:$Pv4, IntRegs:$Rs32, 0, IntRegs:$Rt32)>;
+def S4_pstorerinewfnew_zomapAlias : InstAlias<"if (!$Pv4.new) memw($Rs32)=$Nt8.new", (S4_pstorerinewfnew_io PredRegs:$Pv4, IntRegs:$Rs32, 0, IntRegs:$Nt8)>;
+def S4_pstorerinewtnew_zomapAlias : InstAlias<"if ($Pv4.new) memw($Rs32)=$Nt8.new", (S4_pstorerinewtnew_io PredRegs:$Pv4, IntRegs:$Rs32, 0, IntRegs:$Nt8)>;
+def S4_pstoreritnew_zomapAlias : InstAlias<"if ($Pv4.new) memw($Rs32)=$Rt32", (S4_pstoreritnew_io PredRegs:$Pv4, IntRegs:$Rs32, 0, IntRegs:$Rt32)>;
+def S4_storeirb_zomapAlias : InstAlias<"memb($Rs32)=#$II", (S4_storeirb_io IntRegs:$Rs32, 0, s32_0Imm:$II)>;
+def S4_storeirbf_zomapAlias : InstAlias<"if (!$Pv4) memb($Rs32)=#$II", (S4_storeirbf_io PredRegs:$Pv4, IntRegs:$Rs32, 0, s32_0Imm:$II)>;
+def S4_storeirbfnew_zomapAlias : InstAlias<"if (!$Pv4.new) memb($Rs32)=#$II", (S4_storeirbfnew_io PredRegs:$Pv4, IntRegs:$Rs32, 0, s32_0Imm:$II)>;
+def S4_storeirbt_zomapAlias : InstAlias<"if ($Pv4) memb($Rs32)=#$II", (S4_storeirbt_io PredRegs:$Pv4, IntRegs:$Rs32, 0, s32_0Imm:$II)>;
+def S4_storeirbtnew_zomapAlias : InstAlias<"if ($Pv4.new) memb($Rs32)=#$II", (S4_storeirbtnew_io PredRegs:$Pv4, IntRegs:$Rs32, 0, s32_0Imm:$II)>;
+def S4_storeirh_zomapAlias : InstAlias<"memh($Rs32)=#$II", (S4_storeirh_io IntRegs:$Rs32, 0, s32_0Imm:$II)>;
+def S4_storeirhf_zomapAlias : InstAlias<"if (!$Pv4) memh($Rs32)=#$II", (S4_storeirhf_io PredRegs:$Pv4, IntRegs:$Rs32, 0, s32_0Imm:$II)>;
+def S4_storeirhfnew_zomapAlias : InstAlias<"if (!$Pv4.new) memh($Rs32)=#$II", (S4_storeirhfnew_io PredRegs:$Pv4, IntRegs:$Rs32, 0, s32_0Imm:$II)>;
+def S4_storeirht_zomapAlias : InstAlias<"if ($Pv4) memh($Rs32)=#$II", (S4_storeirht_io PredRegs:$Pv4, IntRegs:$Rs32, 0, s32_0Imm:$II)>;
+def S4_storeirhtnew_zomapAlias : InstAlias<"if ($Pv4.new) memh($Rs32)=#$II", (S4_storeirhtnew_io PredRegs:$Pv4, IntRegs:$Rs32, 0, s32_0Imm:$II)>;
+def S4_storeiri_zomapAlias : InstAlias<"memw($Rs32)=#$II", (S4_storeiri_io IntRegs:$Rs32, 0, s32_0Imm:$II)>;
+def S4_storeirif_zomapAlias : InstAlias<"if (!$Pv4) memw($Rs32)=#$II", (S4_storeirif_io PredRegs:$Pv4, IntRegs:$Rs32, 0, s32_0Imm:$II)>;
+def S4_storeirifnew_zomapAlias : InstAlias<"if (!$Pv4.new) memw($Rs32)=#$II", (S4_storeirifnew_io PredRegs:$Pv4, IntRegs:$Rs32, 0, s32_0Imm:$II)>;
+def S4_storeirit_zomapAlias : InstAlias<"if ($Pv4) memw($Rs32)=#$II", (S4_storeirit_io PredRegs:$Pv4, IntRegs:$Rs32, 0, s32_0Imm:$II)>;
+def S4_storeiritnew_zomapAlias : InstAlias<"if ($Pv4.new) memw($Rs32)=#$II", (S4_storeiritnew_io PredRegs:$Pv4, IntRegs:$Rs32, 0, s32_0Imm:$II)>;
+def V6_MAP_equbAlias : InstAlias<"$Qd4=vcmp.eq($Vu32.ub,$Vv32.ub)", (V6_veqb VecPredRegs:$Qd4, VectorRegs:$Vu32, VectorRegs:$Vv32)>, Requires<[UseHVX]>;
+def V6_MAP_equb_128BAlias : InstAlias<"$Qd4=vcmp.eq($Vu32.ub,$Vv32.ub)", (V6_veqb VecPredRegs:$Qd4, VectorRegs:$Vu32, VectorRegs:$Vv32)>, Requires<[UseHVX]>;
+def V6_MAP_equb_andAlias : InstAlias<"$Qx4&=vcmp.eq($Vu32.ub,$Vv32.ub)", (V6_veqb_and VecPredRegs:$Qx4, VectorRegs:$Vu32, VectorRegs:$Vv32)>, Requires<[UseHVX]>;
+def V6_MAP_equb_and_128BAlias : InstAlias<"$Qx4&=vcmp.eq($Vu32.ub,$Vv32.ub)", (V6_veqb_and VecPredRegs:$Qx4, VectorRegs:$Vu32, VectorRegs:$Vv32)>, Requires<[UseHVX]>;
+def V6_MAP_equb_iorAlias : InstAlias<"$Qx4|=vcmp.eq($Vu32.ub,$Vv32.ub)", (V6_veqb_or VecPredRegs:$Qx4, VectorRegs:$Vu32, VectorRegs:$Vv32)>, Requires<[UseHVX]>;
+def V6_MAP_equb_ior_128BAlias : InstAlias<"$Qx4|=vcmp.eq($Vu32.ub,$Vv32.ub)", (V6_veqb_or VecPredRegs:$Qx4, VectorRegs:$Vu32, VectorRegs:$Vv32)>, Requires<[UseHVX]>;
+def V6_MAP_equb_xorAlias : InstAlias<"$Qx4^=vcmp.eq($Vu32.ub,$Vv32.ub)", (V6_veqb_xor VecPredRegs:$Qx4, VectorRegs:$Vu32, VectorRegs:$Vv32)>, Requires<[UseHVX]>;
+def V6_MAP_equb_xor_128BAlias : InstAlias<"$Qx4^=vcmp.eq($Vu32.ub,$Vv32.ub)", (V6_veqb_xor VecPredRegs:$Qx4, VectorRegs:$Vu32, VectorRegs:$Vv32)>, Requires<[UseHVX]>;
+def V6_MAP_equhAlias : InstAlias<"$Qd4=vcmp.eq($Vu32.uh,$Vv32.uh)", (V6_veqh VecPredRegs:$Qd4, VectorRegs:$Vu32, VectorRegs:$Vv32)>, Requires<[UseHVX]>;
+def V6_MAP_equh_128BAlias : InstAlias<"$Qd4=vcmp.eq($Vu32.uh,$Vv32.uh)", (V6_veqh VecPredRegs:$Qd4, VectorRegs:$Vu32, VectorRegs:$Vv32)>, Requires<[UseHVX]>;
+def V6_MAP_equh_andAlias : InstAlias<"$Qx4&=vcmp.eq($Vu32.uh,$Vv32.uh)", (V6_veqh_and VecPredRegs:$Qx4, VectorRegs:$Vu32, VectorRegs:$Vv32)>, Requires<[UseHVX]>;
+def V6_MAP_equh_and_128BAlias : InstAlias<"$Qx4&=vcmp.eq($Vu32.uh,$Vv32.uh)", (V6_veqh_and VecPredRegs:$Qx4, VectorRegs:$Vu32, VectorRegs:$Vv32)>, Requires<[UseHVX]>;
+def V6_MAP_equh_iorAlias : InstAlias<"$Qx4|=vcmp.eq($Vu32.uh,$Vv32.uh)", (V6_veqh_or VecPredRegs:$Qx4, VectorRegs:$Vu32, VectorRegs:$Vv32)>, Requires<[UseHVX]>;
+def V6_MAP_equh_ior_128BAlias : InstAlias<"$Qx4|=vcmp.eq($Vu32.uh,$Vv32.uh)", (V6_veqh_or VecPredRegs:$Qx4, VectorRegs:$Vu32, VectorRegs:$Vv32)>, Requires<[UseHVX]>;
+def V6_MAP_equh_xorAlias : InstAlias<"$Qx4^=vcmp.eq($Vu32.uh,$Vv32.uh)", (V6_veqh_xor VecPredRegs:$Qx4, VectorRegs:$Vu32, VectorRegs:$Vv32)>, Requires<[UseHVX]>;
+def V6_MAP_equh_xor_128BAlias : InstAlias<"$Qx4^=vcmp.eq($Vu32.uh,$Vv32.uh)", (V6_veqh_xor VecPredRegs:$Qx4, VectorRegs:$Vu32, VectorRegs:$Vv32)>, Requires<[UseHVX]>;
+def V6_MAP_equwAlias : InstAlias<"$Qd4=vcmp.eq($Vu32.uw,$Vv32.uw)", (V6_veqw VecPredRegs:$Qd4, VectorRegs:$Vu32, VectorRegs:$Vv32)>, Requires<[UseHVX]>;
+def V6_MAP_equw_128BAlias : InstAlias<"$Qd4=vcmp.eq($Vu32.uw,$Vv32.uw)", (V6_veqw VecPredRegs:$Qd4, VectorRegs:$Vu32, VectorRegs:$Vv32)>, Requires<[UseHVX]>;
+def V6_MAP_equw_andAlias : InstAlias<"$Qx4&=vcmp.eq($Vu32.uw,$Vv32.uw)", (V6_veqw_and VecPredRegs:$Qx4, VectorRegs:$Vu32, VectorRegs:$Vv32)>, Requires<[UseHVX]>;
+def V6_MAP_equw_and_128BAlias : InstAlias<"$Qx4&=vcmp.eq($Vu32.uw,$Vv32.uw)", (V6_veqw_and VecPredRegs:$Qx4, VectorRegs:$Vu32, VectorRegs:$Vv32)>, Requires<[UseHVX]>;
+def V6_MAP_equw_iorAlias : InstAlias<"$Qx4|=vcmp.eq($Vu32.uw,$Vv32.uw)", (V6_veqw_or VecPredRegs:$Qx4, VectorRegs:$Vu32, VectorRegs:$Vv32)>, Requires<[UseHVX]>;
+def V6_MAP_equw_ior_128BAlias : InstAlias<"$Qx4|=vcmp.eq($Vu32.uw,$Vv32.uw)", (V6_veqw_or VecPredRegs:$Qx4, VectorRegs:$Vu32, VectorRegs:$Vv32)>, Requires<[UseHVX]>;
+def V6_MAP_equw_xorAlias : InstAlias<"$Qx4^=vcmp.eq($Vu32.uw,$Vv32.uw)", (V6_veqw_xor VecPredRegs:$Qx4, VectorRegs:$Vu32, VectorRegs:$Vv32)>, Requires<[UseHVX]>;
+def V6_MAP_equw_xor_128BAlias : InstAlias<"$Qx4^=vcmp.eq($Vu32.uw,$Vv32.uw)", (V6_veqw_xor VecPredRegs:$Qx4, VectorRegs:$Vu32, VectorRegs:$Vv32)>, Requires<[UseHVX]>;
+def V6_extractw_altAlias : InstAlias<"$Rd32.w=vextract($Vu32,$Rs32)", (V6_extractw IntRegs:$Rd32, VectorRegs:$Vu32, IntRegs:$Rs32)>, Requires<[UseHVX]>;
+def V6_extractw_alt_128BAlias : InstAlias<"$Rd32.w=vextract($Vu32,$Rs32)", (V6_extractw IntRegs:$Rd32, VectorRegs:$Vu32, IntRegs:$Rs32)>, Requires<[UseHVX]>;
+def V6_ld0Alias : InstAlias<"$Vd32=vmem($Rt32)", (V6_vL32b_ai VectorRegs:$Vd32, IntRegs:$Rt32, 0)>, Requires<[UseHVX]>;
+def V6_ld0_128BAlias : InstAlias<"$Vd32=vmem($Rt32)", (V6_vL32b_ai VectorRegs:$Vd32, IntRegs:$Rt32, 0)>, Requires<[UseHVX]>;
+def V6_ldnt0Alias : InstAlias<"$Vd32=vmem($Rt32):nt", (V6_vL32b_nt_ai VectorRegs:$Vd32, IntRegs:$Rt32, 0)>, Requires<[UseHVX]>;
+def V6_ldnt0_128BAlias : InstAlias<"$Vd32=vmem($Rt32):nt", (V6_vL32b_nt_ai VectorRegs:$Vd32, IntRegs:$Rt32, 0)>, Requires<[UseHVX]>;
+def V6_ldu0Alias : InstAlias<"$Vd32=vmemu($Rt32)", (V6_vL32Ub_ai VectorRegs:$Vd32, IntRegs:$Rt32, 0)>, Requires<[UseHVX]>;
+def V6_ldu0_128BAlias : InstAlias<"$Vd32=vmemu($Rt32)", (V6_vL32Ub_ai VectorRegs:$Vd32, IntRegs:$Rt32, 0)>, Requires<[UseHVX]>;
+def V6_st0Alias : InstAlias<"vmem($Rt32)=$Vs32", (V6_vS32b_ai IntRegs:$Rt32, 0, VectorRegs:$Vs32)>, Requires<[UseHVX]>;
+def V6_st0_128BAlias : InstAlias<"vmem($Rt32)=$Vs32", (V6_vS32b_ai IntRegs:$Rt32, 0, VectorRegs:$Vs32)>, Requires<[UseHVX]>;
+def V6_stn0Alias : InstAlias<"vmem($Rt32)=$Os8.new", (V6_vS32b_new_ai IntRegs:$Rt32, 0, VectorRegs:$Os8)>, Requires<[UseHVX]>;
+def V6_stn0_128BAlias : InstAlias<"vmem($Rt32)=$Os8.new", (V6_vS32b_new_ai IntRegs:$Rt32, 0, VectorRegs:$Os8)>, Requires<[UseHVX]>;
+def V6_stnnt0Alias : InstAlias<"vmem($Rt32):nt=$Os8.new", (V6_vS32b_nt_new_ai IntRegs:$Rt32, 0, VectorRegs:$Os8)>, Requires<[UseHVX]>;
+def V6_stnnt0_128BAlias : InstAlias<"vmem($Rt32):nt=$Os8.new", (V6_vS32b_nt_new_ai IntRegs:$Rt32, 0, VectorRegs:$Os8)>, Requires<[UseHVX]>;
+def V6_stnp0Alias : InstAlias<"if (!$Pv4) vmem($Rt32)=$Vs32", (V6_vS32b_npred_ai PredRegs:$Pv4, IntRegs:$Rt32, 0, VectorRegs:$Vs32)>, Requires<[UseHVX]>;
+def V6_stnp0_128BAlias : InstAlias<"if (!$Pv4) vmem($Rt32)=$Vs32", (V6_vS32b_npred_ai PredRegs:$Pv4, IntRegs:$Rt32, 0, VectorRegs:$Vs32)>, Requires<[UseHVX]>;
+def V6_stnpnt0Alias : InstAlias<"if (!$Pv4) vmem($Rt32):nt=$Vs32", (V6_vS32b_nt_npred_ai PredRegs:$Pv4, IntRegs:$Rt32, 0, VectorRegs:$Vs32)>, Requires<[UseHVX]>;
+def V6_stnpnt0_128BAlias : InstAlias<"if (!$Pv4) vmem($Rt32):nt=$Vs32", (V6_vS32b_nt_npred_ai PredRegs:$Pv4, IntRegs:$Rt32, 0, VectorRegs:$Vs32)>, Requires<[UseHVX]>;
+def V6_stnq0Alias : InstAlias<"if (!$Qv4) vmem($Rt32)=$Vs32", (V6_vS32b_nqpred_ai VecPredRegs:$Qv4, IntRegs:$Rt32, 0, VectorRegs:$Vs32)>, Requires<[UseHVX]>;
+def V6_stnq0_128BAlias : InstAlias<"if (!$Qv4) vmem($Rt32)=$Vs32", (V6_vS32b_nqpred_ai VecPredRegs:$Qv4, IntRegs:$Rt32, 0, VectorRegs:$Vs32)>, Requires<[UseHVX]>;
+def V6_stnqnt0Alias : InstAlias<"if (!$Qv4) vmem($Rt32):nt=$Vs32", (V6_vS32b_nt_nqpred_ai VecPredRegs:$Qv4, IntRegs:$Rt32, 0, VectorRegs:$Vs32)>, Requires<[UseHVX]>;
+def V6_stnqnt0_128BAlias : InstAlias<"if (!$Qv4) vmem($Rt32):nt=$Vs32", (V6_vS32b_nt_nqpred_ai VecPredRegs:$Qv4, IntRegs:$Rt32, 0, VectorRegs:$Vs32)>, Requires<[UseHVX]>;
+def V6_stnt0Alias : InstAlias<"vmem($Rt32):nt=$Vs32", (V6_vS32b_nt_ai IntRegs:$Rt32, 0, VectorRegs:$Vs32)>, Requires<[UseHVX]>;
+def V6_stnt0_128BAlias : InstAlias<"vmem($Rt32):nt=$Vs32", (V6_vS32b_nt_ai IntRegs:$Rt32, 0, VectorRegs:$Vs32)>, Requires<[UseHVX]>;
+def V6_stp0Alias : InstAlias<"if ($Pv4) vmem($Rt32)=$Vs32", (V6_vS32b_pred_ai PredRegs:$Pv4, IntRegs:$Rt32, 0, VectorRegs:$Vs32)>, Requires<[UseHVX]>;
+def V6_stp0_128BAlias : InstAlias<"if ($Pv4) vmem($Rt32)=$Vs32", (V6_vS32b_pred_ai PredRegs:$Pv4, IntRegs:$Rt32, 0, VectorRegs:$Vs32)>, Requires<[UseHVX]>;
+def V6_stpnt0Alias : InstAlias<"if ($Pv4) vmem($Rt32):nt=$Vs32", (V6_vS32b_nt_pred_ai PredRegs:$Pv4, IntRegs:$Rt32, 0, VectorRegs:$Vs32)>, Requires<[UseHVX]>;
+def V6_stpnt0_128BAlias : InstAlias<"if ($Pv4) vmem($Rt32):nt=$Vs32", (V6_vS32b_nt_pred_ai PredRegs:$Pv4, IntRegs:$Rt32, 0, VectorRegs:$Vs32)>, Requires<[UseHVX]>;
+def V6_stq0Alias : InstAlias<"if ($Qv4) vmem($Rt32)=$Vs32", (V6_vS32b_qpred_ai VecPredRegs:$Qv4, IntRegs:$Rt32, 0, VectorRegs:$Vs32)>, Requires<[UseHVX]>;
+def V6_stq0_128BAlias : InstAlias<"if ($Qv4) vmem($Rt32)=$Vs32", (V6_vS32b_qpred_ai VecPredRegs:$Qv4, IntRegs:$Rt32, 0, VectorRegs:$Vs32)>, Requires<[UseHVX]>;
+def V6_stqnt0Alias : InstAlias<"if ($Qv4) vmem($Rt32):nt=$Vs32", (V6_vS32b_nt_qpred_ai VecPredRegs:$Qv4, IntRegs:$Rt32, 0, VectorRegs:$Vs32)>, Requires<[UseHVX]>;
+def V6_stqnt0_128BAlias : InstAlias<"if ($Qv4) vmem($Rt32):nt=$Vs32", (V6_vS32b_nt_qpred_ai VecPredRegs:$Qv4, IntRegs:$Rt32, 0, VectorRegs:$Vs32)>, Requires<[UseHVX]>;
+def V6_stu0Alias : InstAlias<"vmemu($Rt32)=$Vs32", (V6_vS32Ub_ai IntRegs:$Rt32, 0, VectorRegs:$Vs32)>, Requires<[UseHVX]>;
+def V6_stu0_128BAlias : InstAlias<"vmemu($Rt32)=$Vs32", (V6_vS32Ub_ai IntRegs:$Rt32, 0, VectorRegs:$Vs32)>, Requires<[UseHVX]>;
+def V6_stunp0Alias : InstAlias<"if (!$Pv4) vmemu($Rt32)=$Vs32", (V6_vS32Ub_npred_ai PredRegs:$Pv4, IntRegs:$Rt32, 0, VectorRegs:$Vs32)>, Requires<[UseHVX]>;
+def V6_stunp0_128BAlias : InstAlias<"if (!$Pv4) vmemu($Rt32)=$Vs32", (V6_vS32Ub_npred_ai PredRegs:$Pv4, IntRegs:$Rt32, 0, VectorRegs:$Vs32)>, Requires<[UseHVX]>;
+def V6_stup0Alias : InstAlias<"if ($Pv4) vmemu($Rt32)=$Vs32", (V6_vS32Ub_pred_ai PredRegs:$Pv4, IntRegs:$Rt32, 0, VectorRegs:$Vs32)>, Requires<[UseHVX]>;
+def V6_stup0_128BAlias : InstAlias<"if ($Pv4) vmemu($Rt32)=$Vs32", (V6_vS32Ub_pred_ai PredRegs:$Pv4, IntRegs:$Rt32, 0, VectorRegs:$Vs32)>, Requires<[UseHVX]>;
+def V6_vabsdiffh_altAlias : InstAlias<"$Vd32=vabsdiffh($Vu32,$Vv32)", (V6_vabsdiffh VectorRegs:$Vd32, VectorRegs:$Vu32, VectorRegs:$Vv32)>, Requires<[UseHVX]>;
+def V6_vabsdiffh_alt_128BAlias : InstAlias<"$Vd32=vabsdiffh($Vu32,$Vv32)", (V6_vabsdiffh VectorRegs:$Vd32, VectorRegs:$Vu32, VectorRegs:$Vv32)>, Requires<[UseHVX]>;
+def V6_vabsdiffub_altAlias : InstAlias<"$Vd32=vabsdiffub($Vu32,$Vv32)", (V6_vabsdiffub VectorRegs:$Vd32, VectorRegs:$Vu32, VectorRegs:$Vv32)>, Requires<[UseHVX]>;
+def V6_vabsdiffub_alt_128BAlias : InstAlias<"$Vd32=vabsdiffub($Vu32,$Vv32)", (V6_vabsdiffub VectorRegs:$Vd32, VectorRegs:$Vu32, VectorRegs:$Vv32)>, Requires<[UseHVX]>;
+def V6_vabsdiffuh_altAlias : InstAlias<"$Vd32=vabsdiffuh($Vu32,$Vv32)", (V6_vabsdiffuh VectorRegs:$Vd32, VectorRegs:$Vu32, VectorRegs:$Vv32)>, Requires<[UseHVX]>;
+def V6_vabsdiffuh_alt_128BAlias : InstAlias<"$Vd32=vabsdiffuh($Vu32,$Vv32)", (V6_vabsdiffuh VectorRegs:$Vd32, VectorRegs:$Vu32, VectorRegs:$Vv32)>, Requires<[UseHVX]>;
+def V6_vabsdiffw_altAlias : InstAlias<"$Vd32=vabsdiffw($Vu32,$Vv32)", (V6_vabsdiffw VectorRegs:$Vd32, VectorRegs:$Vu32, VectorRegs:$Vv32)>, Requires<[UseHVX]>;
+def V6_vabsdiffw_alt_128BAlias : InstAlias<"$Vd32=vabsdiffw($Vu32,$Vv32)", (V6_vabsdiffw VectorRegs:$Vd32, VectorRegs:$Vu32, VectorRegs:$Vv32)>, Requires<[UseHVX]>;
+def V6_vabsh_altAlias : InstAlias<"$Vd32=vabsh($Vu32)", (V6_vabsh VectorRegs:$Vd32, VectorRegs:$Vu32)>, Requires<[UseHVX]>;
+def V6_vabsh_alt_128BAlias : InstAlias<"$Vd32=vabsh($Vu32)", (V6_vabsh VectorRegs:$Vd32, VectorRegs:$Vu32)>, Requires<[UseHVX]>;
+def V6_vabsh_sat_altAlias : InstAlias<"$Vd32=vabsh($Vu32):sat", (V6_vabsh_sat VectorRegs:$Vd32, VectorRegs:$Vu32)>, Requires<[UseHVX]>;
+def V6_vabsh_sat_alt_128BAlias : InstAlias<"$Vd32=vabsh($Vu32):sat", (V6_vabsh_sat VectorRegs:$Vd32, VectorRegs:$Vu32)>, Requires<[UseHVX]>;
+def V6_vabsuh_altAlias : InstAlias<"$Vd32.uh=vabs($Vu32.h)", (V6_vabsh VectorRegs:$Vd32, VectorRegs:$Vu32)>, Requires<[UseHVX]>;
+def V6_vabsuh_alt_128BAlias : InstAlias<"$Vd32.uh=vabs($Vu32.h)", (V6_vabsh VectorRegs:$Vd32, VectorRegs:$Vu32)>, Requires<[UseHVX]>;
+def V6_vabsuw_altAlias : InstAlias<"$Vd32.uw=vabs($Vu32.w)", (V6_vabsw VectorRegs:$Vd32, VectorRegs:$Vu32)>, Requires<[UseHVX]>;
+def V6_vabsuw_alt_128BAlias : InstAlias<"$Vd32.uw=vabs($Vu32.w)", (V6_vabsw VectorRegs:$Vd32, VectorRegs:$Vu32)>, Requires<[UseHVX]>;
+def V6_vabsw_altAlias : InstAlias<"$Vd32=vabsw($Vu32)", (V6_vabsw VectorRegs:$Vd32, VectorRegs:$Vu32)>, Requires<[UseHVX]>;
+def V6_vabsw_alt_128BAlias : InstAlias<"$Vd32=vabsw($Vu32)", (V6_vabsw VectorRegs:$Vd32, VectorRegs:$Vu32)>, Requires<[UseHVX]>;
+def V6_vabsw_sat_altAlias : InstAlias<"$Vd32=vabsw($Vu32):sat", (V6_vabsw_sat VectorRegs:$Vd32, VectorRegs:$Vu32)>, Requires<[UseHVX]>;
+def V6_vabsw_sat_alt_128BAlias : InstAlias<"$Vd32=vabsw($Vu32):sat", (V6_vabsw_sat VectorRegs:$Vd32, VectorRegs:$Vu32)>, Requires<[UseHVX]>;
+def V6_vaddb_altAlias : InstAlias<"$Vd32=vaddb($Vu32,$Vv32)", (V6_vaddb VectorRegs:$Vd32, VectorRegs:$Vu32, VectorRegs:$Vv32)>, Requires<[UseHVX]>;
+def V6_vaddb_alt_128BAlias : InstAlias<"$Vd32=vaddb($Vu32,$Vv32)", (V6_vaddb VectorRegs:$Vd32, VectorRegs:$Vu32, VectorRegs:$Vv32)>, Requires<[UseHVX]>;
+def V6_vaddb_dv_altAlias : InstAlias<"$Vdd32=vaddb($Vuu32,$Vvv32)", (V6_vaddb_dv VecDblRegs:$Vdd32, VecDblRegs:$Vuu32, VecDblRegs:$Vvv32)>, Requires<[UseHVX]>;
+def V6_vaddb_dv_alt_128BAlias : InstAlias<"$Vdd32=vaddb($Vuu32,$Vvv32)", (V6_vaddb_dv VecDblRegs:$Vdd32, VecDblRegs:$Vuu32, VecDblRegs:$Vvv32)>, Requires<[UseHVX]>;
+def V6_vaddbnq_altAlias : InstAlias<"if (!$Qv4.b) $Vx32.b+=$Vu32.b", (V6_vaddbnq VectorRegs:$Vx32, VecPredRegs:$Qv4, VectorRegs:$Vu32)>, Requires<[UseHVX]>;
+def V6_vaddbnq_alt_128BAlias : InstAlias<"if (!$Qv4.b) $Vx32.b+=$Vu32.b", (V6_vaddbnq VectorRegs:$Vx32, VecPredRegs:$Qv4, VectorRegs:$Vu32)>, Requires<[UseHVX]>;
+def V6_vaddbq_altAlias : InstAlias<"if ($Qv4.b) $Vx32.b+=$Vu32.b", (V6_vaddbq VectorRegs:$Vx32, VecPredRegs:$Qv4, VectorRegs:$Vu32)>, Requires<[UseHVX]>;
+def V6_vaddbq_alt_128BAlias : InstAlias<"if ($Qv4.b) $Vx32.b+=$Vu32.b", (V6_vaddbq VectorRegs:$Vx32, VecPredRegs:$Qv4, VectorRegs:$Vu32)>, Requires<[UseHVX]>;
+def V6_vaddh_altAlias : InstAlias<"$Vd32=vaddh($Vu32,$Vv32)", (V6_vaddh VectorRegs:$Vd32, VectorRegs:$Vu32, VectorRegs:$Vv32)>, Requires<[UseHVX]>;
+def V6_vaddh_alt_128BAlias : InstAlias<"$Vd32=vaddh($Vu32,$Vv32)", (V6_vaddh VectorRegs:$Vd32, VectorRegs:$Vu32, VectorRegs:$Vv32)>, Requires<[UseHVX]>;
+def V6_vaddh_dv_altAlias : InstAlias<"$Vdd32=vaddh($Vuu32,$Vvv32)", (V6_vaddh_dv VecDblRegs:$Vdd32, VecDblRegs:$Vuu32, VecDblRegs:$Vvv32)>, Requires<[UseHVX]>;
+def V6_vaddh_dv_alt_128BAlias : InstAlias<"$Vdd32=vaddh($Vuu32,$Vvv32)", (V6_vaddh_dv VecDblRegs:$Vdd32, VecDblRegs:$Vuu32, VecDblRegs:$Vvv32)>, Requires<[UseHVX]>;
+def V6_vaddhnq_altAlias : InstAlias<"if (!$Qv4.h) $Vx32.h+=$Vu32.h", (V6_vaddhnq VectorRegs:$Vx32, VecPredRegs:$Qv4, VectorRegs:$Vu32)>, Requires<[UseHVX]>;
+def V6_vaddhnq_alt_128BAlias : InstAlias<"if (!$Qv4.h) $Vx32.h+=$Vu32.h", (V6_vaddhnq VectorRegs:$Vx32, VecPredRegs:$Qv4, VectorRegs:$Vu32)>, Requires<[UseHVX]>;
+def V6_vaddhq_altAlias : InstAlias<"if ($Qv4.h) $Vx32.h+=$Vu32.h", (V6_vaddhq VectorRegs:$Vx32, VecPredRegs:$Qv4, VectorRegs:$Vu32)>, Requires<[UseHVX]>;
+def V6_vaddhq_alt_128BAlias : InstAlias<"if ($Qv4.h) $Vx32.h+=$Vu32.h", (V6_vaddhq VectorRegs:$Vx32, VecPredRegs:$Qv4, VectorRegs:$Vu32)>, Requires<[UseHVX]>;
+def V6_vaddhsat_altAlias : InstAlias<"$Vd32=vaddh($Vu32,$Vv32):sat", (V6_vaddhsat VectorRegs:$Vd32, VectorRegs:$Vu32, VectorRegs:$Vv32)>, Requires<[UseHVX]>;
+def V6_vaddhsat_alt_128BAlias : InstAlias<"$Vd32=vaddh($Vu32,$Vv32):sat", (V6_vaddhsat VectorRegs:$Vd32, VectorRegs:$Vu32, VectorRegs:$Vv32)>, Requires<[UseHVX]>;
+def V6_vaddhsat_dv_altAlias : InstAlias<"$Vdd32=vaddh($Vuu32,$Vvv32):sat", (V6_vaddhsat_dv VecDblRegs:$Vdd32, VecDblRegs:$Vuu32, VecDblRegs:$Vvv32)>, Requires<[UseHVX]>;
+def V6_vaddhsat_dv_alt_128BAlias : InstAlias<"$Vdd32=vaddh($Vuu32,$Vvv32):sat", (V6_vaddhsat_dv VecDblRegs:$Vdd32, VecDblRegs:$Vuu32, VecDblRegs:$Vvv32)>, Requires<[UseHVX]>;
+def V6_vaddhw_altAlias : InstAlias<"$Vdd32=vaddh($Vu32,$Vv32)", (V6_vaddhw VecDblRegs:$Vdd32, VectorRegs:$Vu32, VectorRegs:$Vv32)>, Requires<[UseHVX]>;
+def V6_vaddhw_alt_128BAlias : InstAlias<"$Vdd32=vaddh($Vu32,$Vv32)", (V6_vaddhw VecDblRegs:$Vdd32, VectorRegs:$Vu32, VectorRegs:$Vv32)>, Requires<[UseHVX]>;
+def V6_vaddubh_altAlias : InstAlias<"$Vdd32=vaddub($Vu32,$Vv32)", (V6_vaddubh VecDblRegs:$Vdd32, VectorRegs:$Vu32, VectorRegs:$Vv32)>, Requires<[UseHVX]>;
+def V6_vaddubh_alt_128BAlias : InstAlias<"$Vdd32=vaddub($Vu32,$Vv32)", (V6_vaddubh VecDblRegs:$Vdd32, VectorRegs:$Vu32, VectorRegs:$Vv32)>, Requires<[UseHVX]>;
+def V6_vaddubsat_altAlias : InstAlias<"$Vd32=vaddub($Vu32,$Vv32):sat", (V6_vaddubsat VectorRegs:$Vd32, VectorRegs:$Vu32, VectorRegs:$Vv32)>, Requires<[UseHVX]>;
+def V6_vaddubsat_alt_128BAlias : InstAlias<"$Vd32=vaddub($Vu32,$Vv32):sat", (V6_vaddubsat VectorRegs:$Vd32, VectorRegs:$Vu32, VectorRegs:$Vv32)>, Requires<[UseHVX]>;
+def V6_vaddubsat_dv_altAlias : InstAlias<"$Vdd32=vaddub($Vuu32,$Vvv32):sat", (V6_vaddubsat_dv VecDblRegs:$Vdd32, VecDblRegs:$Vuu32, VecDblRegs:$Vvv32)>, Requires<[UseHVX]>;
+def V6_vaddubsat_dv_alt_128BAlias : InstAlias<"$Vdd32=vaddub($Vuu32,$Vvv32):sat", (V6_vaddubsat_dv VecDblRegs:$Vdd32, VecDblRegs:$Vuu32, VecDblRegs:$Vvv32)>, Requires<[UseHVX]>;
+def V6_vadduhsat_altAlias : InstAlias<"$Vd32=vadduh($Vu32,$Vv32):sat", (V6_vadduhsat VectorRegs:$Vd32, VectorRegs:$Vu32, VectorRegs:$Vv32)>, Requires<[UseHVX]>;
+def V6_vadduhsat_alt_128BAlias : InstAlias<"$Vd32=vadduh($Vu32,$Vv32):sat", (V6_vadduhsat VectorRegs:$Vd32, VectorRegs:$Vu32, VectorRegs:$Vv32)>, Requires<[UseHVX]>;
+def V6_vadduhsat_dv_altAlias : InstAlias<"$Vdd32=vadduh($Vuu32,$Vvv32):sat", (V6_vadduhsat_dv VecDblRegs:$Vdd32, VecDblRegs:$Vuu32, VecDblRegs:$Vvv32)>, Requires<[UseHVX]>;
+def V6_vadduhsat_dv_alt_128BAlias : InstAlias<"$Vdd32=vadduh($Vuu32,$Vvv32):sat", (V6_vadduhsat_dv VecDblRegs:$Vdd32, VecDblRegs:$Vuu32, VecDblRegs:$Vvv32)>, Requires<[UseHVX]>;
+def V6_vadduhw_altAlias : InstAlias<"$Vdd32=vadduh($Vu32,$Vv32)", (V6_vadduhw VecDblRegs:$Vdd32, VectorRegs:$Vu32, VectorRegs:$Vv32)>, Requires<[UseHVX]>;
+def V6_vadduhw_alt_128BAlias : InstAlias<"$Vdd32=vadduh($Vu32,$Vv32)", (V6_vadduhw VecDblRegs:$Vdd32, VectorRegs:$Vu32, VectorRegs:$Vv32)>, Requires<[UseHVX]>;
+def V6_vaddw_altAlias : InstAlias<"$Vd32=vaddw($Vu32,$Vv32)", (V6_vaddw VectorRegs:$Vd32, VectorRegs:$Vu32, VectorRegs:$Vv32)>, Requires<[UseHVX]>;
+def V6_vaddw_alt_128BAlias : InstAlias<"$Vd32=vaddw($Vu32,$Vv32)", (V6_vaddw VectorRegs:$Vd32, VectorRegs:$Vu32, VectorRegs:$Vv32)>, Requires<[UseHVX]>;
+def V6_vaddw_dv_altAlias : InstAlias<"$Vdd32=vaddw($Vuu32,$Vvv32)", (V6_vaddw_dv VecDblRegs:$Vdd32, VecDblRegs:$Vuu32, VecDblRegs:$Vvv32)>, Requires<[UseHVX]>;
+def V6_vaddw_dv_alt_128BAlias : InstAlias<"$Vdd32=vaddw($Vuu32,$Vvv32)", (V6_vaddw_dv VecDblRegs:$Vdd32, VecDblRegs:$Vuu32, VecDblRegs:$Vvv32)>, Requires<[UseHVX]>;
+def V6_vaddwnq_altAlias : InstAlias<"if (!$Qv4.w) $Vx32.w+=$Vu32.w", (V6_vaddwnq VectorRegs:$Vx32, VecPredRegs:$Qv4, VectorRegs:$Vu32)>, Requires<[UseHVX]>;
+def V6_vaddwnq_alt_128BAlias : InstAlias<"if (!$Qv4.w) $Vx32.w+=$Vu32.w", (V6_vaddwnq VectorRegs:$Vx32, VecPredRegs:$Qv4, VectorRegs:$Vu32)>, Requires<[UseHVX]>;
+def V6_vaddwq_altAlias : InstAlias<"if ($Qv4.w) $Vx32.w+=$Vu32.w", (V6_vaddwq VectorRegs:$Vx32, VecPredRegs:$Qv4, VectorRegs:$Vu32)>, Requires<[UseHVX]>;
+def V6_vaddwq_alt_128BAlias : InstAlias<"if ($Qv4.w) $Vx32.w+=$Vu32.w", (V6_vaddwq VectorRegs:$Vx32, VecPredRegs:$Qv4, VectorRegs:$Vu32)>, Requires<[UseHVX]>;
+def V6_vaddwsat_altAlias : InstAlias<"$Vd32=vaddw($Vu32,$Vv32):sat", (V6_vaddwsat VectorRegs:$Vd32, VectorRegs:$Vu32, VectorRegs:$Vv32)>, Requires<[UseHVX]>;
+def V6_vaddwsat_alt_128BAlias : InstAlias<"$Vd32=vaddw($Vu32,$Vv32):sat", (V6_vaddwsat VectorRegs:$Vd32, VectorRegs:$Vu32, VectorRegs:$Vv32)>, Requires<[UseHVX]>;
+def V6_vaddwsat_dv_altAlias : InstAlias<"$Vdd32=vaddw($Vuu32,$Vvv32):sat", (V6_vaddwsat_dv VecDblRegs:$Vdd32, VecDblRegs:$Vuu32, VecDblRegs:$Vvv32)>, Requires<[UseHVX]>;
+def V6_vaddwsat_dv_alt_128BAlias : InstAlias<"$Vdd32=vaddw($Vuu32,$Vvv32):sat", (V6_vaddwsat_dv VecDblRegs:$Vdd32, VecDblRegs:$Vuu32, VecDblRegs:$Vvv32)>, Requires<[UseHVX]>;
+def V6_vandqrt_acc_altAlias : InstAlias<"$Vx32.ub|=vand($Qu4.ub,$Rt32.ub)", (V6_vandqrt_acc VectorRegs:$Vx32, VecPredRegs:$Qu4, IntRegs:$Rt32)>, Requires<[UseHVX]>;
+def V6_vandqrt_acc_alt_128BAlias : InstAlias<"$Vx32.ub|=vand($Qu4.ub,$Rt32.ub)", (V6_vandqrt_acc VectorRegs:$Vx32, VecPredRegs:$Qu4, IntRegs:$Rt32)>, Requires<[UseHVX]>;
+def V6_vandqrt_altAlias : InstAlias<"$Vd32.ub=vand($Qu4.ub,$Rt32.ub)", (V6_vandqrt VectorRegs:$Vd32, VecPredRegs:$Qu4, IntRegs:$Rt32)>, Requires<[UseHVX]>;
+def V6_vandqrt_alt_128BAlias : InstAlias<"$Vd32.ub=vand($Qu4.ub,$Rt32.ub)", (V6_vandqrt VectorRegs:$Vd32, VecPredRegs:$Qu4, IntRegs:$Rt32)>, Requires<[UseHVX]>;
+def V6_vandvrt_acc_altAlias : InstAlias<"$Qx4.ub|=vand($Vu32.ub,$Rt32.ub)", (V6_vandvrt_acc VecPredRegs:$Qx4, VectorRegs:$Vu32, IntRegs:$Rt32)>, Requires<[UseHVX]>;
+def V6_vandvrt_acc_alt_128BAlias : InstAlias<"$Qx4.ub|=vand($Vu32.ub,$Rt32.ub)", (V6_vandvrt_acc VecPredRegs:$Qx4, VectorRegs:$Vu32, IntRegs:$Rt32)>, Requires<[UseHVX]>;
+def V6_vandvrt_altAlias : InstAlias<"$Qd4.ub=vand($Vu32.ub,$Rt32.ub)", (V6_vandvrt VecPredRegs:$Qd4, VectorRegs:$Vu32, IntRegs:$Rt32)>, Requires<[UseHVX]>;
+def V6_vandvrt_alt_128BAlias : InstAlias<"$Qd4.ub=vand($Vu32.ub,$Rt32.ub)", (V6_vandvrt VecPredRegs:$Qd4, VectorRegs:$Vu32, IntRegs:$Rt32)>, Requires<[UseHVX]>;
+def V6_vaslh_altAlias : InstAlias<"$Vd32=vaslh($Vu32,$Rt32)", (V6_vaslh VectorRegs:$Vd32, VectorRegs:$Vu32, IntRegs:$Rt32)>, Requires<[UseHVX]>;
+def V6_vaslh_alt_128BAlias : InstAlias<"$Vd32=vaslh($Vu32,$Rt32)", (V6_vaslh VectorRegs:$Vd32, VectorRegs:$Vu32, IntRegs:$Rt32)>, Requires<[UseHVX]>;
+def V6_vaslhv_altAlias : InstAlias<"$Vd32=vaslh($Vu32,$Vv32)", (V6_vaslhv VectorRegs:$Vd32, VectorRegs:$Vu32, VectorRegs:$Vv32)>, Requires<[UseHVX]>;
+def V6_vaslhv_alt_128BAlias : InstAlias<"$Vd32=vaslh($Vu32,$Vv32)", (V6_vaslhv VectorRegs:$Vd32, VectorRegs:$Vu32, VectorRegs:$Vv32)>, Requires<[UseHVX]>;
+def V6_vaslw_acc_altAlias : InstAlias<"$Vx32+=vaslw($Vu32,$Rt32)", (V6_vaslw_acc VectorRegs:$Vx32, VectorRegs:$Vu32, IntRegs:$Rt32)>, Requires<[UseHVX]>;
+def V6_vaslw_acc_alt_128BAlias : InstAlias<"$Vx32+=vaslw($Vu32,$Rt32)", (V6_vaslw_acc VectorRegs:$Vx32, VectorRegs:$Vu32, IntRegs:$Rt32)>, Requires<[UseHVX]>;
+def V6_vaslw_altAlias : InstAlias<"$Vd32=vaslw($Vu32,$Rt32)", (V6_vaslw VectorRegs:$Vd32, VectorRegs:$Vu32, IntRegs:$Rt32)>, Requires<[UseHVX]>;
+def V6_vaslw_alt_128BAlias : InstAlias<"$Vd32=vaslw($Vu32,$Rt32)", (V6_vaslw VectorRegs:$Vd32, VectorRegs:$Vu32, IntRegs:$Rt32)>, Requires<[UseHVX]>;
+def V6_vaslwv_altAlias : InstAlias<"$Vd32=vaslw($Vu32,$Vv32)", (V6_vaslwv VectorRegs:$Vd32, VectorRegs:$Vu32, VectorRegs:$Vv32)>, Requires<[UseHVX]>;
+def V6_vaslwv_alt_128BAlias : InstAlias<"$Vd32=vaslw($Vu32,$Vv32)", (V6_vaslwv VectorRegs:$Vd32, VectorRegs:$Vu32, VectorRegs:$Vv32)>, Requires<[UseHVX]>;
+def V6_vasrh_altAlias : InstAlias<"$Vd32=vasrh($Vu32,$Rt32)", (V6_vasrh VectorRegs:$Vd32, VectorRegs:$Vu32, IntRegs:$Rt32)>, Requires<[UseHVX]>;
+def V6_vasrh_alt_128BAlias : InstAlias<"$Vd32=vasrh($Vu32,$Rt32)", (V6_vasrh VectorRegs:$Vd32, VectorRegs:$Vu32, IntRegs:$Rt32)>, Requires<[UseHVX]>;
+def V6_vasrhbrndsat_altAlias : InstAlias<"$Vd32=vasrhb($Vu32,$Vv32,$Rt8):rnd:sat", (V6_vasrhbrndsat VectorRegs:$Vd32, VectorRegs:$Vu32, VectorRegs:$Vv32, IntRegsLow8:$Rt8)>;
+def V6_vasrhubrndsat_altAlias : InstAlias<"$Vd32=vasrhub($Vu32,$Vv32,$Rt8):rnd:sat", (V6_vasrhubrndsat VectorRegs:$Vd32, VectorRegs:$Vu32, VectorRegs:$Vv32, IntRegsLow8:$Rt8)>;
+def V6_vasrhubsat_altAlias : InstAlias<"$Vd32=vasrhub($Vu32,$Vv32,$Rt8):sat", (V6_vasrhubsat VectorRegs:$Vd32, VectorRegs:$Vu32, VectorRegs:$Vv32, IntRegsLow8:$Rt8)>;
+def V6_vasrhv_altAlias : InstAlias<"$Vd32=vasrh($Vu32,$Vv32)", (V6_vasrhv VectorRegs:$Vd32, VectorRegs:$Vu32, VectorRegs:$Vv32)>, Requires<[UseHVX]>;
+def V6_vasrhv_alt_128BAlias : InstAlias<"$Vd32=vasrh($Vu32,$Vv32)", (V6_vasrhv VectorRegs:$Vd32, VectorRegs:$Vu32, VectorRegs:$Vv32)>, Requires<[UseHVX]>;
+def V6_vasrw_acc_altAlias : InstAlias<"$Vx32+=vasrw($Vu32,$Rt32)", (V6_vasrw_acc VectorRegs:$Vx32, VectorRegs:$Vu32, IntRegs:$Rt32)>, Requires<[UseHVX]>;
+def V6_vasrw_acc_alt_128BAlias : InstAlias<"$Vx32+=vasrw($Vu32,$Rt32)", (V6_vasrw_acc VectorRegs:$Vx32, VectorRegs:$Vu32, IntRegs:$Rt32)>, Requires<[UseHVX]>;
+def V6_vasrw_altAlias : InstAlias<"$Vd32=vasrw($Vu32,$Rt32)", (V6_vasrw VectorRegs:$Vd32, VectorRegs:$Vu32, IntRegs:$Rt32)>, Requires<[UseHVX]>;
+def V6_vasrw_alt_128BAlias : InstAlias<"$Vd32=vasrw($Vu32,$Rt32)", (V6_vasrw VectorRegs:$Vd32, VectorRegs:$Vu32, IntRegs:$Rt32)>, Requires<[UseHVX]>;
+def V6_vasrwh_altAlias : InstAlias<"$Vd32=vasrwh($Vu32,$Vv32,$Rt8)", (V6_vasrwhsat VectorRegs:$Vd32, VectorRegs:$Vu32, VectorRegs:$Vv32, IntRegsLow8:$Rt8)>;
+def V6_vasrwhrndsat_altAlias : InstAlias<"$Vd32=vasrwh($Vu32,$Vv32,$Rt8):rnd:sat", (V6_vasrwhrndsat VectorRegs:$Vd32, VectorRegs:$Vu32, VectorRegs:$Vv32, IntRegsLow8:$Rt8)>;
+def V6_vasrwhsat_altAlias : InstAlias<"$Vd32=vasrwh($Vu32,$Vv32,$Rt8):sat", (V6_vasrwhsat VectorRegs:$Vd32, VectorRegs:$Vu32, VectorRegs:$Vv32, IntRegsLow8:$Rt8)>;
+def V6_vasrwuhsat_altAlias : InstAlias<"$Vd32=vasrwuh($Vu32,$Vv32,$Rt8):sat", (V6_vasrwuhsat VectorRegs:$Vd32, VectorRegs:$Vu32, VectorRegs:$Vv32, IntRegsLow8:$Rt8)>;
+def V6_vasrwv_altAlias : InstAlias<"$Vd32=vasrw($Vu32,$Vv32)", (V6_vasrwv VectorRegs:$Vd32, VectorRegs:$Vu32, VectorRegs:$Vv32)>, Requires<[UseHVX]>;
+def V6_vasrwv_alt_128BAlias : InstAlias<"$Vd32=vasrw($Vu32,$Vv32)", (V6_vasrwv VectorRegs:$Vd32, VectorRegs:$Vu32, VectorRegs:$Vv32)>, Requires<[UseHVX]>;
+def V6_vavgh_altAlias : InstAlias<"$Vd32=vavgh($Vu32,$Vv32)", (V6_vavgh VectorRegs:$Vd32, VectorRegs:$Vu32, VectorRegs:$Vv32)>, Requires<[UseHVX]>;
+def V6_vavgh_alt_128BAlias : InstAlias<"$Vd32=vavgh($Vu32,$Vv32)", (V6_vavgh VectorRegs:$Vd32, VectorRegs:$Vu32, VectorRegs:$Vv32)>, Requires<[UseHVX]>;
+def V6_vavghrnd_altAlias : InstAlias<"$Vd32=vavgh($Vu32,$Vv32):rnd", (V6_vavghrnd VectorRegs:$Vd32, VectorRegs:$Vu32, VectorRegs:$Vv32)>, Requires<[UseHVX]>;
+def V6_vavghrnd_alt_128BAlias : InstAlias<"$Vd32=vavgh($Vu32,$Vv32):rnd", (V6_vavghrnd VectorRegs:$Vd32, VectorRegs:$Vu32, VectorRegs:$Vv32)>, Requires<[UseHVX]>;
+def V6_vavgub_altAlias : InstAlias<"$Vd32=vavgub($Vu32,$Vv32)", (V6_vavgub VectorRegs:$Vd32, VectorRegs:$Vu32, VectorRegs:$Vv32)>, Requires<[UseHVX]>;
+def V6_vavgub_alt_128BAlias : InstAlias<"$Vd32=vavgub($Vu32,$Vv32)", (V6_vavgub VectorRegs:$Vd32, VectorRegs:$Vu32, VectorRegs:$Vv32)>, Requires<[UseHVX]>;
+def V6_vavgubrnd_altAlias : InstAlias<"$Vd32=vavgub($Vu32,$Vv32):rnd", (V6_vavgubrnd VectorRegs:$Vd32, VectorRegs:$Vu32, VectorRegs:$Vv32)>, Requires<[UseHVX]>;
+def V6_vavgubrnd_alt_128BAlias : InstAlias<"$Vd32=vavgub($Vu32,$Vv32):rnd", (V6_vavgubrnd VectorRegs:$Vd32, VectorRegs:$Vu32, VectorRegs:$Vv32)>, Requires<[UseHVX]>;
+def V6_vavguh_altAlias : InstAlias<"$Vd32=vavguh($Vu32,$Vv32)", (V6_vavguh VectorRegs:$Vd32, VectorRegs:$Vu32, VectorRegs:$Vv32)>, Requires<[UseHVX]>;
+def V6_vavguh_alt_128BAlias : InstAlias<"$Vd32=vavguh($Vu32,$Vv32)", (V6_vavguh VectorRegs:$Vd32, VectorRegs:$Vu32, VectorRegs:$Vv32)>, Requires<[UseHVX]>;
+def V6_vavguhrnd_altAlias : InstAlias<"$Vd32=vavguh($Vu32,$Vv32):rnd", (V6_vavguhrnd VectorRegs:$Vd32, VectorRegs:$Vu32, VectorRegs:$Vv32)>, Requires<[UseHVX]>;
+def V6_vavguhrnd_alt_128BAlias : InstAlias<"$Vd32=vavguh($Vu32,$Vv32):rnd", (V6_vavguhrnd VectorRegs:$Vd32, VectorRegs:$Vu32, VectorRegs:$Vv32)>, Requires<[UseHVX]>;
+def V6_vavgw_altAlias : InstAlias<"$Vd32=vavgw($Vu32,$Vv32)", (V6_vavgw VectorRegs:$Vd32, VectorRegs:$Vu32, VectorRegs:$Vv32)>, Requires<[UseHVX]>;
+def V6_vavgw_alt_128BAlias : InstAlias<"$Vd32=vavgw($Vu32,$Vv32)", (V6_vavgw VectorRegs:$Vd32, VectorRegs:$Vu32, VectorRegs:$Vv32)>, Requires<[UseHVX]>;
+def V6_vavgwrnd_altAlias : InstAlias<"$Vd32=vavgw($Vu32,$Vv32):rnd", (V6_vavgwrnd VectorRegs:$Vd32, VectorRegs:$Vu32, VectorRegs:$Vv32)>, Requires<[UseHVX]>;
+def V6_vavgwrnd_alt_128BAlias : InstAlias<"$Vd32=vavgw($Vu32,$Vv32):rnd", (V6_vavgwrnd VectorRegs:$Vd32, VectorRegs:$Vu32, VectorRegs:$Vv32)>, Requires<[UseHVX]>;
+def V6_vcl0h_altAlias : InstAlias<"$Vd32=vcl0h($Vu32)", (V6_vcl0h VectorRegs:$Vd32, VectorRegs:$Vu32)>, Requires<[UseHVX]>;
+def V6_vcl0h_alt_128BAlias : InstAlias<"$Vd32=vcl0h($Vu32)", (V6_vcl0h VectorRegs:$Vd32, VectorRegs:$Vu32)>, Requires<[UseHVX]>;
+def V6_vcl0w_altAlias : InstAlias<"$Vd32=vcl0w($Vu32)", (V6_vcl0w VectorRegs:$Vd32, VectorRegs:$Vu32)>, Requires<[UseHVX]>;
+def V6_vcl0w_alt_128BAlias : InstAlias<"$Vd32=vcl0w($Vu32)", (V6_vcl0w VectorRegs:$Vd32, VectorRegs:$Vu32)>, Requires<[UseHVX]>;
+def V6_vd0Alias : InstAlias<"$Vd32=#0", (V6_vxor VectorRegs:$Vd32, VectorRegs:$Vd32, VectorRegs:$Vd32)>, Requires<[UseHVX]>;
+def V6_vd0_128BAlias : InstAlias<"$Vd32=#0", (V6_vxor VectorRegs:$Vd32, VectorRegs:$Vd32, VectorRegs:$Vd32)>, Requires<[UseHVX]>;
+def V6_vdd0Alias : InstAlias<"$Vdd32=#0", (V6_vsubw_dv VecDblRegs:$Vdd32, W15, W15)>, Requires<[UseHVX]>;
+def V6_vdd0_128BAlias : InstAlias<"$Vdd32=#0", (V6_vsubw_dv VecDblRegs:$Vdd32, W15, W15)>, Requires<[UseHVX]>;
+def V6_vdealb4w_altAlias : InstAlias<"$Vd32=vdealb4w($Vu32,$Vv32)", (V6_vdealb4w VectorRegs:$Vd32, VectorRegs:$Vu32, VectorRegs:$Vv32)>, Requires<[UseHVX]>;
+def V6_vdealb4w_alt_128BAlias : InstAlias<"$Vd32=vdealb4w($Vu32,$Vv32)", (V6_vdealb4w VectorRegs:$Vd32, VectorRegs:$Vu32, VectorRegs:$Vv32)>, Requires<[UseHVX]>;
+def V6_vdealb_altAlias : InstAlias<"$Vd32=vdealb($Vu32)", (V6_vdealb VectorRegs:$Vd32, VectorRegs:$Vu32)>, Requires<[UseHVX]>;
+def V6_vdealb_alt_128BAlias : InstAlias<"$Vd32=vdealb($Vu32)", (V6_vdealb VectorRegs:$Vd32, VectorRegs:$Vu32)>, Requires<[UseHVX]>;
+def V6_vdealh_altAlias : InstAlias<"$Vd32=vdealh($Vu32)", (V6_vdealh VectorRegs:$Vd32, VectorRegs:$Vu32)>, Requires<[UseHVX]>;
+def V6_vdealh_alt_128BAlias : InstAlias<"$Vd32=vdealh($Vu32)", (V6_vdealh VectorRegs:$Vd32, VectorRegs:$Vu32)>, Requires<[UseHVX]>;
+def V6_vdmpybus_acc_altAlias : InstAlias<"$Vx32+=vdmpybus($Vu32,$Rt32)", (V6_vdmpybus_acc VectorRegs:$Vx32, VectorRegs:$Vu32, IntRegs:$Rt32)>, Requires<[UseHVX]>;
+def V6_vdmpybus_acc_alt_128BAlias : InstAlias<"$Vx32+=vdmpybus($Vu32,$Rt32)", (V6_vdmpybus_acc VectorRegs:$Vx32, VectorRegs:$Vu32, IntRegs:$Rt32)>, Requires<[UseHVX]>;
+def V6_vdmpybus_altAlias : InstAlias<"$Vd32=vdmpybus($Vu32,$Rt32)", (V6_vdmpybus VectorRegs:$Vd32, VectorRegs:$Vu32, IntRegs:$Rt32)>, Requires<[UseHVX]>;
+def V6_vdmpybus_alt_128BAlias : InstAlias<"$Vd32=vdmpybus($Vu32,$Rt32)", (V6_vdmpybus VectorRegs:$Vd32, VectorRegs:$Vu32, IntRegs:$Rt32)>, Requires<[UseHVX]>;
+def V6_vdmpybus_dv_acc_altAlias : InstAlias<"$Vxx32+=vdmpybus($Vuu32,$Rt32)", (V6_vdmpybus_dv_acc VecDblRegs:$Vxx32, VecDblRegs:$Vuu32, IntRegs:$Rt32)>, Requires<[UseHVX]>;
+def V6_vdmpybus_dv_acc_alt_128BAlias : InstAlias<"$Vxx32+=vdmpybus($Vuu32,$Rt32)", (V6_vdmpybus_dv_acc VecDblRegs:$Vxx32, VecDblRegs:$Vuu32, IntRegs:$Rt32)>, Requires<[UseHVX]>;
+def V6_vdmpybus_dv_altAlias : InstAlias<"$Vdd32=vdmpybus($Vuu32,$Rt32)", (V6_vdmpybus_dv VecDblRegs:$Vdd32, VecDblRegs:$Vuu32, IntRegs:$Rt32)>, Requires<[UseHVX]>;
+def V6_vdmpybus_dv_alt_128BAlias : InstAlias<"$Vdd32=vdmpybus($Vuu32,$Rt32)", (V6_vdmpybus_dv VecDblRegs:$Vdd32, VecDblRegs:$Vuu32, IntRegs:$Rt32)>, Requires<[UseHVX]>;
+def V6_vdmpyhb_acc_altAlias : InstAlias<"$Vx32+=vdmpyhb($Vu32,$Rt32)", (V6_vdmpyhb_acc VectorRegs:$Vx32, VectorRegs:$Vu32, IntRegs:$Rt32)>, Requires<[UseHVX]>;
+def V6_vdmpyhb_acc_alt_128BAlias : InstAlias<"$Vx32+=vdmpyhb($Vu32,$Rt32)", (V6_vdmpyhb_acc VectorRegs:$Vx32, VectorRegs:$Vu32, IntRegs:$Rt32)>, Requires<[UseHVX]>;
+def V6_vdmpyhb_altAlias : InstAlias<"$Vd32=vdmpyhb($Vu32,$Rt32)", (V6_vdmpyhb VectorRegs:$Vd32, VectorRegs:$Vu32, IntRegs:$Rt32)>, Requires<[UseHVX]>;
+def V6_vdmpyhb_alt_128BAlias : InstAlias<"$Vd32=vdmpyhb($Vu32,$Rt32)", (V6_vdmpyhb VectorRegs:$Vd32, VectorRegs:$Vu32, IntRegs:$Rt32)>, Requires<[UseHVX]>;
+def V6_vdmpyhb_dv_acc_altAlias : InstAlias<"$Vxx32+=vdmpyhb($Vuu32,$Rt32)", (V6_vdmpyhb_dv_acc VecDblRegs:$Vxx32, VecDblRegs:$Vuu32, IntRegs:$Rt32)>, Requires<[UseHVX]>;
+def V6_vdmpyhb_dv_acc_alt_128BAlias : InstAlias<"$Vxx32+=vdmpyhb($Vuu32,$Rt32)", (V6_vdmpyhb_dv_acc VecDblRegs:$Vxx32, VecDblRegs:$Vuu32, IntRegs:$Rt32)>, Requires<[UseHVX]>;
+def V6_vdmpyhb_dv_altAlias : InstAlias<"$Vdd32=vdmpyhb($Vuu32,$Rt32)", (V6_vdmpyhb_dv VecDblRegs:$Vdd32, VecDblRegs:$Vuu32, IntRegs:$Rt32)>, Requires<[UseHVX]>;
+def V6_vdmpyhb_dv_alt_128BAlias : InstAlias<"$Vdd32=vdmpyhb($Vuu32,$Rt32)", (V6_vdmpyhb_dv VecDblRegs:$Vdd32, VecDblRegs:$Vuu32, IntRegs:$Rt32)>, Requires<[UseHVX]>;
+def V6_vdmpyhisat_acc_altAlias : InstAlias<"$Vx32+=vdmpyh($Vuu32,$Rt32):sat", (V6_vdmpyhisat_acc VectorRegs:$Vx32, VecDblRegs:$Vuu32, IntRegs:$Rt32)>, Requires<[UseHVX]>;
+def V6_vdmpyhisat_acc_alt_128BAlias : InstAlias<"$Vx32+=vdmpyh($Vuu32,$Rt32):sat", (V6_vdmpyhisat_acc VectorRegs:$Vx32, VecDblRegs:$Vuu32, IntRegs:$Rt32)>, Requires<[UseHVX]>;
+def V6_vdmpyhisat_altAlias : InstAlias<"$Vd32=vdmpyh($Vuu32,$Rt32):sat", (V6_vdmpyhisat VectorRegs:$Vd32, VecDblRegs:$Vuu32, IntRegs:$Rt32)>, Requires<[UseHVX]>;
+def V6_vdmpyhisat_alt_128BAlias : InstAlias<"$Vd32=vdmpyh($Vuu32,$Rt32):sat", (V6_vdmpyhisat VectorRegs:$Vd32, VecDblRegs:$Vuu32, IntRegs:$Rt32)>, Requires<[UseHVX]>;
+def V6_vdmpyhsat_acc_altAlias : InstAlias<"$Vx32+=vdmpyh($Vu32,$Rt32):sat", (V6_vdmpyhsat_acc VectorRegs:$Vx32, VectorRegs:$Vu32, IntRegs:$Rt32)>, Requires<[UseHVX]>;
+def V6_vdmpyhsat_acc_alt_128BAlias : InstAlias<"$Vx32+=vdmpyh($Vu32,$Rt32):sat", (V6_vdmpyhsat_acc VectorRegs:$Vx32, VectorRegs:$Vu32, IntRegs:$Rt32)>, Requires<[UseHVX]>;
+def V6_vdmpyhsat_altAlias : InstAlias<"$Vd32=vdmpyh($Vu32,$Rt32):sat", (V6_vdmpyhsat VectorRegs:$Vd32, VectorRegs:$Vu32, IntRegs:$Rt32)>, Requires<[UseHVX]>;
+def V6_vdmpyhsat_alt_128BAlias : InstAlias<"$Vd32=vdmpyh($Vu32,$Rt32):sat", (V6_vdmpyhsat VectorRegs:$Vd32, VectorRegs:$Vu32, IntRegs:$Rt32)>, Requires<[UseHVX]>;
+def V6_vdmpyhsuisat_acc_altAlias : InstAlias<"$Vx32+=vdmpyhsu($Vuu32,$Rt32,#1):sat", (V6_vdmpyhsuisat_acc VectorRegs:$Vx32, VecDblRegs:$Vuu32, IntRegs:$Rt32)>, Requires<[UseHVX]>;
+def V6_vdmpyhsuisat_acc_alt_128BAlias : InstAlias<"$Vx32+=vdmpyhsu($Vuu32,$Rt32,#1):sat", (V6_vdmpyhsuisat_acc VectorRegs:$Vx32, VecDblRegs:$Vuu32, IntRegs:$Rt32)>, Requires<[UseHVX]>;
+def V6_vdmpyhsuisat_altAlias : InstAlias<"$Vd32=vdmpyhsu($Vuu32,$Rt32,#1):sat", (V6_vdmpyhsuisat VectorRegs:$Vd32, VecDblRegs:$Vuu32, IntRegs:$Rt32)>, Requires<[UseHVX]>;
+def V6_vdmpyhsuisat_alt_128BAlias : InstAlias<"$Vd32=vdmpyhsu($Vuu32,$Rt32,#1):sat", (V6_vdmpyhsuisat VectorRegs:$Vd32, VecDblRegs:$Vuu32, IntRegs:$Rt32)>, Requires<[UseHVX]>;
+def V6_vdmpyhsusat_acc_altAlias : InstAlias<"$Vx32+=vdmpyhsu($Vu32,$Rt32):sat", (V6_vdmpyhsusat_acc VectorRegs:$Vx32, VectorRegs:$Vu32, IntRegs:$Rt32)>, Requires<[UseHVX]>;
+def V6_vdmpyhsusat_acc_alt_128BAlias : InstAlias<"$Vx32+=vdmpyhsu($Vu32,$Rt32):sat", (V6_vdmpyhsusat_acc VectorRegs:$Vx32, VectorRegs:$Vu32, IntRegs:$Rt32)>, Requires<[UseHVX]>;
+def V6_vdmpyhsusat_altAlias : InstAlias<"$Vd32=vdmpyhsu($Vu32,$Rt32):sat", (V6_vdmpyhsusat VectorRegs:$Vd32, VectorRegs:$Vu32, IntRegs:$Rt32)>, Requires<[UseHVX]>;
+def V6_vdmpyhsusat_alt_128BAlias : InstAlias<"$Vd32=vdmpyhsu($Vu32,$Rt32):sat", (V6_vdmpyhsusat VectorRegs:$Vd32, VectorRegs:$Vu32, IntRegs:$Rt32)>, Requires<[UseHVX]>;
+def V6_vdmpyhvsat_acc_altAlias : InstAlias<"$Vx32+=vdmpyh($Vu32,$Vv32):sat", (V6_vdmpyhvsat_acc VectorRegs:$Vx32, VectorRegs:$Vu32, VectorRegs:$Vv32)>, Requires<[UseHVX]>;
+def V6_vdmpyhvsat_acc_alt_128BAlias : InstAlias<"$Vx32+=vdmpyh($Vu32,$Vv32):sat", (V6_vdmpyhvsat_acc VectorRegs:$Vx32, VectorRegs:$Vu32, VectorRegs:$Vv32)>, Requires<[UseHVX]>;
+def V6_vdmpyhvsat_altAlias : InstAlias<"$Vd32=vdmpyh($Vu32,$Vv32):sat", (V6_vdmpyhvsat VectorRegs:$Vd32, VectorRegs:$Vu32, VectorRegs:$Vv32)>, Requires<[UseHVX]>;
+def V6_vdmpyhvsat_alt_128BAlias : InstAlias<"$Vd32=vdmpyh($Vu32,$Vv32):sat", (V6_vdmpyhvsat VectorRegs:$Vd32, VectorRegs:$Vu32, VectorRegs:$Vv32)>, Requires<[UseHVX]>;
+def V6_vdsaduh_acc_altAlias : InstAlias<"$Vxx32+=vdsaduh($Vuu32,$Rt32)", (V6_vdsaduh_acc VecDblRegs:$Vxx32, VecDblRegs:$Vuu32, IntRegs:$Rt32)>, Requires<[UseHVX]>;
+def V6_vdsaduh_acc_alt_128BAlias : InstAlias<"$Vxx32+=vdsaduh($Vuu32,$Rt32)", (V6_vdsaduh_acc VecDblRegs:$Vxx32, VecDblRegs:$Vuu32, IntRegs:$Rt32)>, Requires<[UseHVX]>;
+def V6_vdsaduh_altAlias : InstAlias<"$Vdd32=vdsaduh($Vuu32,$Rt32)", (V6_vdsaduh VecDblRegs:$Vdd32, VecDblRegs:$Vuu32, IntRegs:$Rt32)>, Requires<[UseHVX]>;
+def V6_vdsaduh_alt_128BAlias : InstAlias<"$Vdd32=vdsaduh($Vuu32,$Rt32)", (V6_vdsaduh VecDblRegs:$Vdd32, VecDblRegs:$Vuu32, IntRegs:$Rt32)>, Requires<[UseHVX]>;
+def V6_vlsrh_altAlias : InstAlias<"$Vd32=vlsrh($Vu32,$Rt32)", (V6_vlsrh VectorRegs:$Vd32, VectorRegs:$Vu32, IntRegs:$Rt32)>, Requires<[UseHVX]>;
+def V6_vlsrh_alt_128BAlias : InstAlias<"$Vd32=vlsrh($Vu32,$Rt32)", (V6_vlsrh VectorRegs:$Vd32, VectorRegs:$Vu32, IntRegs:$Rt32)>, Requires<[UseHVX]>;
+def V6_vlsrhv_altAlias : InstAlias<"$Vd32=vlsrh($Vu32,$Vv32)", (V6_vlsrhv VectorRegs:$Vd32, VectorRegs:$Vu32, VectorRegs:$Vv32)>, Requires<[UseHVX]>;
+def V6_vlsrhv_alt_128BAlias : InstAlias<"$Vd32=vlsrh($Vu32,$Vv32)", (V6_vlsrhv VectorRegs:$Vd32, VectorRegs:$Vu32, VectorRegs:$Vv32)>, Requires<[UseHVX]>;
+def V6_vlsrw_altAlias : InstAlias<"$Vd32=vlsrw($Vu32,$Rt32)", (V6_vlsrw VectorRegs:$Vd32, VectorRegs:$Vu32, IntRegs:$Rt32)>, Requires<[UseHVX]>;
+def V6_vlsrw_alt_128BAlias : InstAlias<"$Vd32=vlsrw($Vu32,$Rt32)", (V6_vlsrw VectorRegs:$Vd32, VectorRegs:$Vu32, IntRegs:$Rt32)>, Requires<[UseHVX]>;
+def V6_vlsrwv_altAlias : InstAlias<"$Vd32=vlsrw($Vu32,$Vv32)", (V6_vlsrwv VectorRegs:$Vd32, VectorRegs:$Vu32, VectorRegs:$Vv32)>, Requires<[UseHVX]>;
+def V6_vlsrwv_alt_128BAlias : InstAlias<"$Vd32=vlsrw($Vu32,$Vv32)", (V6_vlsrwv VectorRegs:$Vd32, VectorRegs:$Vu32, VectorRegs:$Vv32)>, Requires<[UseHVX]>;
+def V6_vmaxh_altAlias : InstAlias<"$Vd32=vmaxh($Vu32,$Vv32)", (V6_vmaxh VectorRegs:$Vd32, VectorRegs:$Vu32, VectorRegs:$Vv32)>, Requires<[UseHVX]>;
+def V6_vmaxh_alt_128BAlias : InstAlias<"$Vd32=vmaxh($Vu32,$Vv32)", (V6_vmaxh VectorRegs:$Vd32, VectorRegs:$Vu32, VectorRegs:$Vv32)>, Requires<[UseHVX]>;
+def V6_vmaxub_altAlias : InstAlias<"$Vd32=vmaxub($Vu32,$Vv32)", (V6_vmaxub VectorRegs:$Vd32, VectorRegs:$Vu32, VectorRegs:$Vv32)>, Requires<[UseHVX]>;
+def V6_vmaxub_alt_128BAlias : InstAlias<"$Vd32=vmaxub($Vu32,$Vv32)", (V6_vmaxub VectorRegs:$Vd32, VectorRegs:$Vu32, VectorRegs:$Vv32)>, Requires<[UseHVX]>;
+def V6_vmaxuh_altAlias : InstAlias<"$Vd32=vmaxuh($Vu32,$Vv32)", (V6_vmaxuh VectorRegs:$Vd32, VectorRegs:$Vu32, VectorRegs:$Vv32)>, Requires<[UseHVX]>;
+def V6_vmaxuh_alt_128BAlias : InstAlias<"$Vd32=vmaxuh($Vu32,$Vv32)", (V6_vmaxuh VectorRegs:$Vd32, VectorRegs:$Vu32, VectorRegs:$Vv32)>, Requires<[UseHVX]>;
+def V6_vmaxw_altAlias : InstAlias<"$Vd32=vmaxw($Vu32,$Vv32)", (V6_vmaxw VectorRegs:$Vd32, VectorRegs:$Vu32, VectorRegs:$Vv32)>, Requires<[UseHVX]>;
+def V6_vmaxw_alt_128BAlias : InstAlias<"$Vd32=vmaxw($Vu32,$Vv32)", (V6_vmaxw VectorRegs:$Vd32, VectorRegs:$Vu32, VectorRegs:$Vv32)>, Requires<[UseHVX]>;
+def V6_vminh_altAlias : InstAlias<"$Vd32=vminh($Vu32,$Vv32)", (V6_vminh VectorRegs:$Vd32, VectorRegs:$Vu32, VectorRegs:$Vv32)>, Requires<[UseHVX]>;
+def V6_vminh_alt_128BAlias : InstAlias<"$Vd32=vminh($Vu32,$Vv32)", (V6_vminh VectorRegs:$Vd32, VectorRegs:$Vu32, VectorRegs:$Vv32)>, Requires<[UseHVX]>;
+def V6_vminub_altAlias : InstAlias<"$Vd32=vminub($Vu32,$Vv32)", (V6_vminub VectorRegs:$Vd32, VectorRegs:$Vu32, VectorRegs:$Vv32)>, Requires<[UseHVX]>;
+def V6_vminub_alt_128BAlias : InstAlias<"$Vd32=vminub($Vu32,$Vv32)", (V6_vminub VectorRegs:$Vd32, VectorRegs:$Vu32, VectorRegs:$Vv32)>, Requires<[UseHVX]>;
+def V6_vminuh_altAlias : InstAlias<"$Vd32=vminuh($Vu32,$Vv32)", (V6_vminuh VectorRegs:$Vd32, VectorRegs:$Vu32, VectorRegs:$Vv32)>, Requires<[UseHVX]>;
+def V6_vminuh_alt_128BAlias : InstAlias<"$Vd32=vminuh($Vu32,$Vv32)", (V6_vminuh VectorRegs:$Vd32, VectorRegs:$Vu32, VectorRegs:$Vv32)>, Requires<[UseHVX]>;
+def V6_vminw_altAlias : InstAlias<"$Vd32=vminw($Vu32,$Vv32)", (V6_vminw VectorRegs:$Vd32, VectorRegs:$Vu32, VectorRegs:$Vv32)>, Requires<[UseHVX]>;
+def V6_vminw_alt_128BAlias : InstAlias<"$Vd32=vminw($Vu32,$Vv32)", (V6_vminw VectorRegs:$Vd32, VectorRegs:$Vu32, VectorRegs:$Vv32)>, Requires<[UseHVX]>;
+def V6_vmpabus_acc_altAlias : InstAlias<"$Vxx32+=vmpabus($Vuu32,$Rt32)", (V6_vmpabus_acc VecDblRegs:$Vxx32, VecDblRegs:$Vuu32, IntRegs:$Rt32)>, Requires<[UseHVX]>;
+def V6_vmpabus_acc_alt_128BAlias : InstAlias<"$Vxx32+=vmpabus($Vuu32,$Rt32)", (V6_vmpabus_acc VecDblRegs:$Vxx32, VecDblRegs:$Vuu32, IntRegs:$Rt32)>, Requires<[UseHVX]>;
+def V6_vmpabus_altAlias : InstAlias<"$Vdd32=vmpabus($Vuu32,$Rt32)", (V6_vmpabus VecDblRegs:$Vdd32, VecDblRegs:$Vuu32, IntRegs:$Rt32)>, Requires<[UseHVX]>;
+def V6_vmpabus_alt_128BAlias : InstAlias<"$Vdd32=vmpabus($Vuu32,$Rt32)", (V6_vmpabus VecDblRegs:$Vdd32, VecDblRegs:$Vuu32, IntRegs:$Rt32)>, Requires<[UseHVX]>;
+def V6_vmpabusv_altAlias : InstAlias<"$Vdd32=vmpabus($Vuu32,$Vvv32)", (V6_vmpabusv VecDblRegs:$Vdd32, VecDblRegs:$Vuu32, VecDblRegs:$Vvv32)>, Requires<[UseHVX]>;
+def V6_vmpabusv_alt_128BAlias : InstAlias<"$Vdd32=vmpabus($Vuu32,$Vvv32)", (V6_vmpabusv VecDblRegs:$Vdd32, VecDblRegs:$Vuu32, VecDblRegs:$Vvv32)>, Requires<[UseHVX]>;
+def V6_vmpabuuv_altAlias : InstAlias<"$Vdd32=vmpabuu($Vuu32,$Vvv32)", (V6_vmpabuuv VecDblRegs:$Vdd32, VecDblRegs:$Vuu32, VecDblRegs:$Vvv32)>, Requires<[UseHVX]>;
+def V6_vmpabuuv_alt_128BAlias : InstAlias<"$Vdd32=vmpabuu($Vuu32,$Vvv32)", (V6_vmpabuuv VecDblRegs:$Vdd32, VecDblRegs:$Vuu32, VecDblRegs:$Vvv32)>, Requires<[UseHVX]>;
+def V6_vmpahb_acc_altAlias : InstAlias<"$Vxx32+=vmpahb($Vuu32,$Rt32)", (V6_vmpahb_acc VecDblRegs:$Vxx32, VecDblRegs:$Vuu32, IntRegs:$Rt32)>, Requires<[UseHVX]>;
+def V6_vmpahb_acc_alt_128BAlias : InstAlias<"$Vxx32+=vmpahb($Vuu32,$Rt32)", (V6_vmpahb_acc VecDblRegs:$Vxx32, VecDblRegs:$Vuu32, IntRegs:$Rt32)>, Requires<[UseHVX]>;
+def V6_vmpahb_altAlias : InstAlias<"$Vdd32=vmpahb($Vuu32,$Rt32)", (V6_vmpahb VecDblRegs:$Vdd32, VecDblRegs:$Vuu32, IntRegs:$Rt32)>, Requires<[UseHVX]>;
+def V6_vmpahb_alt_128BAlias : InstAlias<"$Vdd32=vmpahb($Vuu32,$Rt32)", (V6_vmpahb VecDblRegs:$Vdd32, VecDblRegs:$Vuu32, IntRegs:$Rt32)>, Requires<[UseHVX]>;
+def V6_vmpybus_acc_altAlias : InstAlias<"$Vxx32+=vmpybus($Vu32,$Rt32)", (V6_vmpybus_acc VecDblRegs:$Vxx32, VectorRegs:$Vu32, IntRegs:$Rt32)>, Requires<[UseHVX]>;
+def V6_vmpybus_acc_alt_128BAlias : InstAlias<"$Vxx32+=vmpybus($Vu32,$Rt32)", (V6_vmpybus_acc VecDblRegs:$Vxx32, VectorRegs:$Vu32, IntRegs:$Rt32)>, Requires<[UseHVX]>;
+def V6_vmpybus_altAlias : InstAlias<"$Vdd32=vmpybus($Vu32,$Rt32)", (V6_vmpybus VecDblRegs:$Vdd32, VectorRegs:$Vu32, IntRegs:$Rt32)>, Requires<[UseHVX]>;
+def V6_vmpybus_alt_128BAlias : InstAlias<"$Vdd32=vmpybus($Vu32,$Rt32)", (V6_vmpybus VecDblRegs:$Vdd32, VectorRegs:$Vu32, IntRegs:$Rt32)>, Requires<[UseHVX]>;
+def V6_vmpybusv_acc_altAlias : InstAlias<"$Vxx32+=vmpybus($Vu32,$Vv32)", (V6_vmpybusv_acc VecDblRegs:$Vxx32, VectorRegs:$Vu32, VectorRegs:$Vv32)>, Requires<[UseHVX]>;
+def V6_vmpybusv_acc_alt_128BAlias : InstAlias<"$Vxx32+=vmpybus($Vu32,$Vv32)", (V6_vmpybusv_acc VecDblRegs:$Vxx32, VectorRegs:$Vu32, VectorRegs:$Vv32)>, Requires<[UseHVX]>;
+def V6_vmpybusv_altAlias : InstAlias<"$Vdd32=vmpybus($Vu32,$Vv32)", (V6_vmpybusv VecDblRegs:$Vdd32, VectorRegs:$Vu32, VectorRegs:$Vv32)>, Requires<[UseHVX]>;
+def V6_vmpybusv_alt_128BAlias : InstAlias<"$Vdd32=vmpybus($Vu32,$Vv32)", (V6_vmpybusv VecDblRegs:$Vdd32, VectorRegs:$Vu32, VectorRegs:$Vv32)>, Requires<[UseHVX]>;
+def V6_vmpybv_acc_altAlias : InstAlias<"$Vxx32+=vmpyb($Vu32,$Vv32)", (V6_vmpybv_acc VecDblRegs:$Vxx32, VectorRegs:$Vu32, VectorRegs:$Vv32)>, Requires<[UseHVX]>;
+def V6_vmpybv_acc_alt_128BAlias : InstAlias<"$Vxx32+=vmpyb($Vu32,$Vv32)", (V6_vmpybv_acc VecDblRegs:$Vxx32, VectorRegs:$Vu32, VectorRegs:$Vv32)>, Requires<[UseHVX]>;
+def V6_vmpybv_altAlias : InstAlias<"$Vdd32=vmpyb($Vu32,$Vv32)", (V6_vmpybv VecDblRegs:$Vdd32, VectorRegs:$Vu32, VectorRegs:$Vv32)>, Requires<[UseHVX]>;
+def V6_vmpybv_alt_128BAlias : InstAlias<"$Vdd32=vmpyb($Vu32,$Vv32)", (V6_vmpybv VecDblRegs:$Vdd32, VectorRegs:$Vu32, VectorRegs:$Vv32)>, Requires<[UseHVX]>;
+def V6_vmpyewuh_altAlias : InstAlias<"$Vd32=vmpyewuh($Vu32,$Vv32)", (V6_vmpyewuh VectorRegs:$Vd32, VectorRegs:$Vu32, VectorRegs:$Vv32)>, Requires<[UseHVX]>;
+def V6_vmpyewuh_alt_128BAlias : InstAlias<"$Vd32=vmpyewuh($Vu32,$Vv32)", (V6_vmpyewuh VectorRegs:$Vd32, VectorRegs:$Vu32, VectorRegs:$Vv32)>, Requires<[UseHVX]>;
+def V6_vmpyh_altAlias : InstAlias<"$Vdd32=vmpyh($Vu32,$Rt32)", (V6_vmpyh VecDblRegs:$Vdd32, VectorRegs:$Vu32, IntRegs:$Rt32)>, Requires<[UseHVX]>;
+def V6_vmpyh_alt_128BAlias : InstAlias<"$Vdd32=vmpyh($Vu32,$Rt32)", (V6_vmpyh VecDblRegs:$Vdd32, VectorRegs:$Vu32, IntRegs:$Rt32)>, Requires<[UseHVX]>;
+def V6_vmpyhsat_acc_altAlias : InstAlias<"$Vxx32+=vmpyh($Vu32,$Rt32):sat", (V6_vmpyhsat_acc VecDblRegs:$Vxx32, VectorRegs:$Vu32, IntRegs:$Rt32)>, Requires<[UseHVX]>;
+def V6_vmpyhsat_acc_alt_128BAlias : InstAlias<"$Vxx32+=vmpyh($Vu32,$Rt32):sat", (V6_vmpyhsat_acc VecDblRegs:$Vxx32, VectorRegs:$Vu32, IntRegs:$Rt32)>, Requires<[UseHVX]>;
+def V6_vmpyhsrs_altAlias : InstAlias<"$Vd32=vmpyh($Vu32,$Rt32):<<1:rnd:sat", (V6_vmpyhsrs VectorRegs:$Vd32, VectorRegs:$Vu32, IntRegs:$Rt32)>, Requires<[UseHVX]>;
+def V6_vmpyhsrs_alt_128BAlias : InstAlias<"$Vd32=vmpyh($Vu32,$Rt32):<<1:rnd:sat", (V6_vmpyhsrs VectorRegs:$Vd32, VectorRegs:$Vu32, IntRegs:$Rt32)>, Requires<[UseHVX]>;
+def V6_vmpyhss_altAlias : InstAlias<"$Vd32=vmpyh($Vu32,$Rt32):<<1:sat", (V6_vmpyhss VectorRegs:$Vd32, VectorRegs:$Vu32, IntRegs:$Rt32)>, Requires<[UseHVX]>;
+def V6_vmpyhss_alt_128BAlias : InstAlias<"$Vd32=vmpyh($Vu32,$Rt32):<<1:sat", (V6_vmpyhss VectorRegs:$Vd32, VectorRegs:$Vu32, IntRegs:$Rt32)>, Requires<[UseHVX]>;
+def V6_vmpyhus_acc_altAlias : InstAlias<"$Vxx32+=vmpyhus($Vu32,$Vv32)", (V6_vmpyhus_acc VecDblRegs:$Vxx32, VectorRegs:$Vu32, VectorRegs:$Vv32)>, Requires<[UseHVX]>;
+def V6_vmpyhus_acc_alt_128BAlias : InstAlias<"$Vxx32+=vmpyhus($Vu32,$Vv32)", (V6_vmpyhus_acc VecDblRegs:$Vxx32, VectorRegs:$Vu32, VectorRegs:$Vv32)>, Requires<[UseHVX]>;
+def V6_vmpyhus_altAlias : InstAlias<"$Vdd32=vmpyhus($Vu32,$Vv32)", (V6_vmpyhus VecDblRegs:$Vdd32, VectorRegs:$Vu32, VectorRegs:$Vv32)>, Requires<[UseHVX]>;
+def V6_vmpyhus_alt_128BAlias : InstAlias<"$Vdd32=vmpyhus($Vu32,$Vv32)", (V6_vmpyhus VecDblRegs:$Vdd32, VectorRegs:$Vu32, VectorRegs:$Vv32)>, Requires<[UseHVX]>;
+def V6_vmpyhv_acc_altAlias : InstAlias<"$Vxx32+=vmpyh($Vu32,$Vv32)", (V6_vmpyhv_acc VecDblRegs:$Vxx32, VectorRegs:$Vu32, VectorRegs:$Vv32)>, Requires<[UseHVX]>;
+def V6_vmpyhv_acc_alt_128BAlias : InstAlias<"$Vxx32+=vmpyh($Vu32,$Vv32)", (V6_vmpyhv_acc VecDblRegs:$Vxx32, VectorRegs:$Vu32, VectorRegs:$Vv32)>, Requires<[UseHVX]>;
+def V6_vmpyhv_altAlias : InstAlias<"$Vdd32=vmpyh($Vu32,$Vv32)", (V6_vmpyhv VecDblRegs:$Vdd32, VectorRegs:$Vu32, VectorRegs:$Vv32)>, Requires<[UseHVX]>;
+def V6_vmpyhv_alt_128BAlias : InstAlias<"$Vdd32=vmpyh($Vu32,$Vv32)", (V6_vmpyhv VecDblRegs:$Vdd32, VectorRegs:$Vu32, VectorRegs:$Vv32)>, Requires<[UseHVX]>;
+def V6_vmpyhvsrs_altAlias : InstAlias<"$Vd32=vmpyh($Vu32,$Vv32):<<1:rnd:sat", (V6_vmpyhvsrs VectorRegs:$Vd32, VectorRegs:$Vu32, VectorRegs:$Vv32)>, Requires<[UseHVX]>;
+def V6_vmpyhvsrs_alt_128BAlias : InstAlias<"$Vd32=vmpyh($Vu32,$Vv32):<<1:rnd:sat", (V6_vmpyhvsrs VectorRegs:$Vd32, VectorRegs:$Vu32, VectorRegs:$Vv32)>, Requires<[UseHVX]>;
+def V6_vmpyiewh_acc_altAlias : InstAlias<"$Vx32+=vmpyiewh($Vu32,$Vv32)", (V6_vmpyiewh_acc VectorRegs:$Vx32, VectorRegs:$Vu32, VectorRegs:$Vv32)>, Requires<[UseHVX]>;
+def V6_vmpyiewh_acc_alt_128BAlias : InstAlias<"$Vx32+=vmpyiewh($Vu32,$Vv32)", (V6_vmpyiewh_acc VectorRegs:$Vx32, VectorRegs:$Vu32, VectorRegs:$Vv32)>, Requires<[UseHVX]>;
+def V6_vmpyiewuh_acc_altAlias : InstAlias<"$Vx32+=vmpyiewuh($Vu32,$Vv32)", (V6_vmpyiewuh_acc VectorRegs:$Vx32, VectorRegs:$Vu32, VectorRegs:$Vv32)>, Requires<[UseHVX]>;
+def V6_vmpyiewuh_acc_alt_128BAlias : InstAlias<"$Vx32+=vmpyiewuh($Vu32,$Vv32)", (V6_vmpyiewuh_acc VectorRegs:$Vx32, VectorRegs:$Vu32, VectorRegs:$Vv32)>, Requires<[UseHVX]>;
+def V6_vmpyiewuh_altAlias : InstAlias<"$Vd32=vmpyiewuh($Vu32,$Vv32)", (V6_vmpyiewuh VectorRegs:$Vd32, VectorRegs:$Vu32, VectorRegs:$Vv32)>, Requires<[UseHVX]>;
+def V6_vmpyiewuh_alt_128BAlias : InstAlias<"$Vd32=vmpyiewuh($Vu32,$Vv32)", (V6_vmpyiewuh VectorRegs:$Vd32, VectorRegs:$Vu32, VectorRegs:$Vv32)>, Requires<[UseHVX]>;
+def V6_vmpyih_acc_altAlias : InstAlias<"$Vx32+=vmpyih($Vu32,$Vv32)", (V6_vmpyih_acc VectorRegs:$Vx32, VectorRegs:$Vu32, VectorRegs:$Vv32)>, Requires<[UseHVX]>;
+def V6_vmpyih_acc_alt_128BAlias : InstAlias<"$Vx32+=vmpyih($Vu32,$Vv32)", (V6_vmpyih_acc VectorRegs:$Vx32, VectorRegs:$Vu32, VectorRegs:$Vv32)>, Requires<[UseHVX]>;
+def V6_vmpyih_altAlias : InstAlias<"$Vd32=vmpyih($Vu32,$Vv32)", (V6_vmpyih VectorRegs:$Vd32, VectorRegs:$Vu32, VectorRegs:$Vv32)>, Requires<[UseHVX]>;
+def V6_vmpyih_alt_128BAlias : InstAlias<"$Vd32=vmpyih($Vu32,$Vv32)", (V6_vmpyih VectorRegs:$Vd32, VectorRegs:$Vu32, VectorRegs:$Vv32)>, Requires<[UseHVX]>;
+def V6_vmpyihb_acc_altAlias : InstAlias<"$Vx32+=vmpyihb($Vu32,$Rt32)", (V6_vmpyihb_acc VectorRegs:$Vx32, VectorRegs:$Vu32, IntRegs:$Rt32)>, Requires<[UseHVX]>;
+def V6_vmpyihb_acc_alt_128BAlias : InstAlias<"$Vx32+=vmpyihb($Vu32,$Rt32)", (V6_vmpyihb_acc VectorRegs:$Vx32, VectorRegs:$Vu32, IntRegs:$Rt32)>, Requires<[UseHVX]>;
+def V6_vmpyihb_altAlias : InstAlias<"$Vd32=vmpyihb($Vu32,$Rt32)", (V6_vmpyihb VectorRegs:$Vd32, VectorRegs:$Vu32, IntRegs:$Rt32)>, Requires<[UseHVX]>;
+def V6_vmpyihb_alt_128BAlias : InstAlias<"$Vd32=vmpyihb($Vu32,$Rt32)", (V6_vmpyihb VectorRegs:$Vd32, VectorRegs:$Vu32, IntRegs:$Rt32)>, Requires<[UseHVX]>;
+def V6_vmpyiowh_altAlias : InstAlias<"$Vd32=vmpyiowh($Vu32,$Vv32)", (V6_vmpyiowh VectorRegs:$Vd32, VectorRegs:$Vu32, VectorRegs:$Vv32)>, Requires<[UseHVX]>;
+def V6_vmpyiowh_alt_128BAlias : InstAlias<"$Vd32=vmpyiowh($Vu32,$Vv32)", (V6_vmpyiowh VectorRegs:$Vd32, VectorRegs:$Vu32, VectorRegs:$Vv32)>, Requires<[UseHVX]>;
+def V6_vmpyiwb_acc_altAlias : InstAlias<"$Vx32+=vmpyiwb($Vu32,$Rt32)", (V6_vmpyiwb_acc VectorRegs:$Vx32, VectorRegs:$Vu32, IntRegs:$Rt32)>, Requires<[UseHVX]>;
+def V6_vmpyiwb_acc_alt_128BAlias : InstAlias<"$Vx32+=vmpyiwb($Vu32,$Rt32)", (V6_vmpyiwb_acc VectorRegs:$Vx32, VectorRegs:$Vu32, IntRegs:$Rt32)>, Requires<[UseHVX]>;
+def V6_vmpyiwb_altAlias : InstAlias<"$Vd32=vmpyiwb($Vu32,$Rt32)", (V6_vmpyiwb VectorRegs:$Vd32, VectorRegs:$Vu32, IntRegs:$Rt32)>, Requires<[UseHVX]>;
+def V6_vmpyiwb_alt_128BAlias : InstAlias<"$Vd32=vmpyiwb($Vu32,$Rt32)", (V6_vmpyiwb VectorRegs:$Vd32, VectorRegs:$Vu32, IntRegs:$Rt32)>, Requires<[UseHVX]>;
+def V6_vmpyiwh_acc_altAlias : InstAlias<"$Vx32+=vmpyiwh($Vu32,$Rt32)", (V6_vmpyiwh_acc VectorRegs:$Vx32, VectorRegs:$Vu32, IntRegs:$Rt32)>, Requires<[UseHVX]>;
+def V6_vmpyiwh_acc_alt_128BAlias : InstAlias<"$Vx32+=vmpyiwh($Vu32,$Rt32)", (V6_vmpyiwh_acc VectorRegs:$Vx32, VectorRegs:$Vu32, IntRegs:$Rt32)>, Requires<[UseHVX]>;
+def V6_vmpyiwh_altAlias : InstAlias<"$Vd32=vmpyiwh($Vu32,$Rt32)", (V6_vmpyiwh VectorRegs:$Vd32, VectorRegs:$Vu32, IntRegs:$Rt32)>, Requires<[UseHVX]>;
+def V6_vmpyiwh_alt_128BAlias : InstAlias<"$Vd32=vmpyiwh($Vu32,$Rt32)", (V6_vmpyiwh VectorRegs:$Vd32, VectorRegs:$Vu32, IntRegs:$Rt32)>, Requires<[UseHVX]>;
+def V6_vmpyowh_altAlias : InstAlias<"$Vd32=vmpyowh($Vu32,$Vv32):<<1:sat", (V6_vmpyowh VectorRegs:$Vd32, VectorRegs:$Vu32, VectorRegs:$Vv32)>, Requires<[UseHVX]>;
+def V6_vmpyowh_alt_128BAlias : InstAlias<"$Vd32=vmpyowh($Vu32,$Vv32):<<1:sat", (V6_vmpyowh VectorRegs:$Vd32, VectorRegs:$Vu32, VectorRegs:$Vv32)>, Requires<[UseHVX]>;
+def V6_vmpyowh_rnd_altAlias : InstAlias<"$Vd32=vmpyowh($Vu32,$Vv32):<<1:rnd:sat", (V6_vmpyowh_rnd VectorRegs:$Vd32, VectorRegs:$Vu32, VectorRegs:$Vv32)>, Requires<[UseHVX]>;
+def V6_vmpyowh_rnd_alt_128BAlias : InstAlias<"$Vd32=vmpyowh($Vu32,$Vv32):<<1:rnd:sat", (V6_vmpyowh_rnd VectorRegs:$Vd32, VectorRegs:$Vu32, VectorRegs:$Vv32)>, Requires<[UseHVX]>;
+def V6_vmpyub_acc_altAlias : InstAlias<"$Vxx32+=vmpyub($Vu32,$Rt32)", (V6_vmpyub_acc VecDblRegs:$Vxx32, VectorRegs:$Vu32, IntRegs:$Rt32)>, Requires<[UseHVX]>;
+def V6_vmpyub_acc_alt_128BAlias : InstAlias<"$Vxx32+=vmpyub($Vu32,$Rt32)", (V6_vmpyub_acc VecDblRegs:$Vxx32, VectorRegs:$Vu32, IntRegs:$Rt32)>, Requires<[UseHVX]>;
+def V6_vmpyub_altAlias : InstAlias<"$Vdd32=vmpyub($Vu32,$Rt32)", (V6_vmpyub VecDblRegs:$Vdd32, VectorRegs:$Vu32, IntRegs:$Rt32)>, Requires<[UseHVX]>;
+def V6_vmpyub_alt_128BAlias : InstAlias<"$Vdd32=vmpyub($Vu32,$Rt32)", (V6_vmpyub VecDblRegs:$Vdd32, VectorRegs:$Vu32, IntRegs:$Rt32)>, Requires<[UseHVX]>;
+def V6_vmpyubv_acc_altAlias : InstAlias<"$Vxx32+=vmpyub($Vu32,$Vv32)", (V6_vmpyubv_acc VecDblRegs:$Vxx32, VectorRegs:$Vu32, VectorRegs:$Vv32)>, Requires<[UseHVX]>;
+def V6_vmpyubv_acc_alt_128BAlias : InstAlias<"$Vxx32+=vmpyub($Vu32,$Vv32)", (V6_vmpyubv_acc VecDblRegs:$Vxx32, VectorRegs:$Vu32, VectorRegs:$Vv32)>, Requires<[UseHVX]>;
+def V6_vmpyubv_altAlias : InstAlias<"$Vdd32=vmpyub($Vu32,$Vv32)", (V6_vmpyubv VecDblRegs:$Vdd32, VectorRegs:$Vu32, VectorRegs:$Vv32)>, Requires<[UseHVX]>;
+def V6_vmpyubv_alt_128BAlias : InstAlias<"$Vdd32=vmpyub($Vu32,$Vv32)", (V6_vmpyubv VecDblRegs:$Vdd32, VectorRegs:$Vu32, VectorRegs:$Vv32)>, Requires<[UseHVX]>;
+def V6_vmpyuh_acc_altAlias : InstAlias<"$Vxx32+=vmpyuh($Vu32,$Rt32)", (V6_vmpyuh_acc VecDblRegs:$Vxx32, VectorRegs:$Vu32, IntRegs:$Rt32)>, Requires<[UseHVX]>;
+def V6_vmpyuh_acc_alt_128BAlias : InstAlias<"$Vxx32+=vmpyuh($Vu32,$Rt32)", (V6_vmpyuh_acc VecDblRegs:$Vxx32, VectorRegs:$Vu32, IntRegs:$Rt32)>, Requires<[UseHVX]>;
+def V6_vmpyuh_altAlias : InstAlias<"$Vdd32=vmpyuh($Vu32,$Rt32)", (V6_vmpyuh VecDblRegs:$Vdd32, VectorRegs:$Vu32, IntRegs:$Rt32)>, Requires<[UseHVX]>;
+def V6_vmpyuh_alt_128BAlias : InstAlias<"$Vdd32=vmpyuh($Vu32,$Rt32)", (V6_vmpyuh VecDblRegs:$Vdd32, VectorRegs:$Vu32, IntRegs:$Rt32)>, Requires<[UseHVX]>;
+def V6_vmpyuhv_acc_altAlias : InstAlias<"$Vxx32+=vmpyuh($Vu32,$Vv32)", (V6_vmpyuhv_acc VecDblRegs:$Vxx32, VectorRegs:$Vu32, VectorRegs:$Vv32)>, Requires<[UseHVX]>;
+def V6_vmpyuhv_acc_alt_128BAlias : InstAlias<"$Vxx32+=vmpyuh($Vu32,$Vv32)", (V6_vmpyuhv_acc VecDblRegs:$Vxx32, VectorRegs:$Vu32, VectorRegs:$Vv32)>, Requires<[UseHVX]>;
+def V6_vmpyuhv_altAlias : InstAlias<"$Vdd32=vmpyuh($Vu32,$Vv32)", (V6_vmpyuhv VecDblRegs:$Vdd32, VectorRegs:$Vu32, VectorRegs:$Vv32)>, Requires<[UseHVX]>;
+def V6_vmpyuhv_alt_128BAlias : InstAlias<"$Vdd32=vmpyuh($Vu32,$Vv32)", (V6_vmpyuhv VecDblRegs:$Vdd32, VectorRegs:$Vu32, VectorRegs:$Vv32)>, Requires<[UseHVX]>;
+def V6_vnavgh_altAlias : InstAlias<"$Vd32=vnavgh($Vu32,$Vv32)", (V6_vnavgh VectorRegs:$Vd32, VectorRegs:$Vu32, VectorRegs:$Vv32)>, Requires<[UseHVX]>;
+def V6_vnavgh_alt_128BAlias : InstAlias<"$Vd32=vnavgh($Vu32,$Vv32)", (V6_vnavgh VectorRegs:$Vd32, VectorRegs:$Vu32, VectorRegs:$Vv32)>, Requires<[UseHVX]>;
+def V6_vnavgub_altAlias : InstAlias<"$Vd32=vnavgub($Vu32,$Vv32)", (V6_vnavgub VectorRegs:$Vd32, VectorRegs:$Vu32, VectorRegs:$Vv32)>, Requires<[UseHVX]>;
+def V6_vnavgub_alt_128BAlias : InstAlias<"$Vd32=vnavgub($Vu32,$Vv32)", (V6_vnavgub VectorRegs:$Vd32, VectorRegs:$Vu32, VectorRegs:$Vv32)>, Requires<[UseHVX]>;
+def V6_vnavgw_altAlias : InstAlias<"$Vd32=vnavgw($Vu32,$Vv32)", (V6_vnavgw VectorRegs:$Vd32, VectorRegs:$Vu32, VectorRegs:$Vv32)>, Requires<[UseHVX]>;
+def V6_vnavgw_alt_128BAlias : InstAlias<"$Vd32=vnavgw($Vu32,$Vv32)", (V6_vnavgw VectorRegs:$Vd32, VectorRegs:$Vu32, VectorRegs:$Vv32)>, Requires<[UseHVX]>;
+def V6_vnormamth_altAlias : InstAlias<"$Vd32=vnormamth($Vu32)", (V6_vnormamth VectorRegs:$Vd32, VectorRegs:$Vu32)>, Requires<[UseHVX]>;
+def V6_vnormamth_alt_128BAlias : InstAlias<"$Vd32=vnormamth($Vu32)", (V6_vnormamth VectorRegs:$Vd32, VectorRegs:$Vu32)>, Requires<[UseHVX]>;
+def V6_vnormamtw_altAlias : InstAlias<"$Vd32=vnormamtw($Vu32)", (V6_vnormamtw VectorRegs:$Vd32, VectorRegs:$Vu32)>, Requires<[UseHVX]>;
+def V6_vnormamtw_alt_128BAlias : InstAlias<"$Vd32=vnormamtw($Vu32)", (V6_vnormamtw VectorRegs:$Vd32, VectorRegs:$Vu32)>, Requires<[UseHVX]>;
+def V6_vpackeb_altAlias : InstAlias<"$Vd32=vpackeb($Vu32,$Vv32)", (V6_vpackeb VectorRegs:$Vd32, VectorRegs:$Vu32, VectorRegs:$Vv32)>, Requires<[UseHVX]>;
+def V6_vpackeb_alt_128BAlias : InstAlias<"$Vd32=vpackeb($Vu32,$Vv32)", (V6_vpackeb VectorRegs:$Vd32, VectorRegs:$Vu32, VectorRegs:$Vv32)>, Requires<[UseHVX]>;
+def V6_vpackeh_altAlias : InstAlias<"$Vd32=vpackeh($Vu32,$Vv32)", (V6_vpackeh VectorRegs:$Vd32, VectorRegs:$Vu32, VectorRegs:$Vv32)>, Requires<[UseHVX]>;
+def V6_vpackeh_alt_128BAlias : InstAlias<"$Vd32=vpackeh($Vu32,$Vv32)", (V6_vpackeh VectorRegs:$Vd32, VectorRegs:$Vu32, VectorRegs:$Vv32)>, Requires<[UseHVX]>;
+def V6_vpackhb_sat_altAlias : InstAlias<"$Vd32=vpackhb($Vu32,$Vv32):sat", (V6_vpackhb_sat VectorRegs:$Vd32, VectorRegs:$Vu32, VectorRegs:$Vv32)>, Requires<[UseHVX]>;
+def V6_vpackhb_sat_alt_128BAlias : InstAlias<"$Vd32=vpackhb($Vu32,$Vv32):sat", (V6_vpackhb_sat VectorRegs:$Vd32, VectorRegs:$Vu32, VectorRegs:$Vv32)>, Requires<[UseHVX]>;
+def V6_vpackhub_sat_altAlias : InstAlias<"$Vd32=vpackhub($Vu32,$Vv32):sat", (V6_vpackhub_sat VectorRegs:$Vd32, VectorRegs:$Vu32, VectorRegs:$Vv32)>, Requires<[UseHVX]>;
+def V6_vpackhub_sat_alt_128BAlias : InstAlias<"$Vd32=vpackhub($Vu32,$Vv32):sat", (V6_vpackhub_sat VectorRegs:$Vd32, VectorRegs:$Vu32, VectorRegs:$Vv32)>, Requires<[UseHVX]>;
+def V6_vpackob_altAlias : InstAlias<"$Vd32=vpackob($Vu32,$Vv32)", (V6_vpackob VectorRegs:$Vd32, VectorRegs:$Vu32, VectorRegs:$Vv32)>, Requires<[UseHVX]>;
+def V6_vpackob_alt_128BAlias : InstAlias<"$Vd32=vpackob($Vu32,$Vv32)", (V6_vpackob VectorRegs:$Vd32, VectorRegs:$Vu32, VectorRegs:$Vv32)>, Requires<[UseHVX]>;
+def V6_vpackoh_altAlias : InstAlias<"$Vd32=vpackoh($Vu32,$Vv32)", (V6_vpackoh VectorRegs:$Vd32, VectorRegs:$Vu32, VectorRegs:$Vv32)>, Requires<[UseHVX]>;
+def V6_vpackoh_alt_128BAlias : InstAlias<"$Vd32=vpackoh($Vu32,$Vv32)", (V6_vpackoh VectorRegs:$Vd32, VectorRegs:$Vu32, VectorRegs:$Vv32)>, Requires<[UseHVX]>;
+def V6_vpackwh_sat_altAlias : InstAlias<"$Vd32=vpackwh($Vu32,$Vv32):sat", (V6_vpackwh_sat VectorRegs:$Vd32, VectorRegs:$Vu32, VectorRegs:$Vv32)>, Requires<[UseHVX]>;
+def V6_vpackwh_sat_alt_128BAlias : InstAlias<"$Vd32=vpackwh($Vu32,$Vv32):sat", (V6_vpackwh_sat VectorRegs:$Vd32, VectorRegs:$Vu32, VectorRegs:$Vv32)>, Requires<[UseHVX]>;
+def V6_vpackwuh_sat_altAlias : InstAlias<"$Vd32=vpackwuh($Vu32,$Vv32):sat", (V6_vpackwuh_sat VectorRegs:$Vd32, VectorRegs:$Vu32, VectorRegs:$Vv32)>, Requires<[UseHVX]>;
+def V6_vpackwuh_sat_alt_128BAlias : InstAlias<"$Vd32=vpackwuh($Vu32,$Vv32):sat", (V6_vpackwuh_sat VectorRegs:$Vd32, VectorRegs:$Vu32, VectorRegs:$Vv32)>, Requires<[UseHVX]>;
+def V6_vpopcounth_altAlias : InstAlias<"$Vd32=vpopcounth($Vu32)", (V6_vpopcounth VectorRegs:$Vd32, VectorRegs:$Vu32)>, Requires<[UseHVX]>;
+def V6_vpopcounth_alt_128BAlias : InstAlias<"$Vd32=vpopcounth($Vu32)", (V6_vpopcounth VectorRegs:$Vd32, VectorRegs:$Vu32)>, Requires<[UseHVX]>;
+def V6_vrmpybus_acc_altAlias : InstAlias<"$Vx32+=vrmpybus($Vu32,$Rt32)", (V6_vrmpybus_acc VectorRegs:$Vx32, VectorRegs:$Vu32, IntRegs:$Rt32)>, Requires<[UseHVX]>;
+def V6_vrmpybus_acc_alt_128BAlias : InstAlias<"$Vx32+=vrmpybus($Vu32,$Rt32)", (V6_vrmpybus_acc VectorRegs:$Vx32, VectorRegs:$Vu32, IntRegs:$Rt32)>, Requires<[UseHVX]>;
+def V6_vrmpybus_altAlias : InstAlias<"$Vd32=vrmpybus($Vu32,$Rt32)", (V6_vrmpybus VectorRegs:$Vd32, VectorRegs:$Vu32, IntRegs:$Rt32)>, Requires<[UseHVX]>;
+def V6_vrmpybus_alt_128BAlias : InstAlias<"$Vd32=vrmpybus($Vu32,$Rt32)", (V6_vrmpybus VectorRegs:$Vd32, VectorRegs:$Vu32, IntRegs:$Rt32)>, Requires<[UseHVX]>;
+def V6_vrmpybusi_acc_altAlias : InstAlias<"$Vxx32+=vrmpybus($Vuu32,$Rt32,#$Ii)", (V6_vrmpybusi_acc VecDblRegs:$Vxx32, VecDblRegs:$Vuu32, IntRegs:$Rt32, u1_0Imm:$Ii)>, Requires<[UseHVX]>;
+def V6_vrmpybusi_acc_alt_128BAlias : InstAlias<"$Vxx32+=vrmpybus($Vuu32,$Rt32,#$Ii)", (V6_vrmpybusi_acc VecDblRegs:$Vxx32, VecDblRegs:$Vuu32, IntRegs:$Rt32, u1_0Imm:$Ii)>, Requires<[UseHVX]>;
+def V6_vrmpybusi_altAlias : InstAlias<"$Vdd32=vrmpybus($Vuu32,$Rt32,#$Ii)", (V6_vrmpybusi VecDblRegs:$Vdd32, VecDblRegs:$Vuu32, IntRegs:$Rt32, u1_0Imm:$Ii)>, Requires<[UseHVX]>;
+def V6_vrmpybusi_alt_128BAlias : InstAlias<"$Vdd32=vrmpybus($Vuu32,$Rt32,#$Ii)", (V6_vrmpybusi VecDblRegs:$Vdd32, VecDblRegs:$Vuu32, IntRegs:$Rt32, u1_0Imm:$Ii)>, Requires<[UseHVX]>;
+def V6_vrmpybusv_acc_altAlias : InstAlias<"$Vx32+=vrmpybus($Vu32,$Vv32)", (V6_vrmpybusv_acc VectorRegs:$Vx32, VectorRegs:$Vu32, VectorRegs:$Vv32)>, Requires<[UseHVX]>;
+def V6_vrmpybusv_acc_alt_128BAlias : InstAlias<"$Vx32+=vrmpybus($Vu32,$Vv32)", (V6_vrmpybusv_acc VectorRegs:$Vx32, VectorRegs:$Vu32, VectorRegs:$Vv32)>, Requires<[UseHVX]>;
+def V6_vrmpybusv_altAlias : InstAlias<"$Vd32=vrmpybus($Vu32,$Vv32)", (V6_vrmpybusv VectorRegs:$Vd32, VectorRegs:$Vu32, VectorRegs:$Vv32)>, Requires<[UseHVX]>;
+def V6_vrmpybusv_alt_128BAlias : InstAlias<"$Vd32=vrmpybus($Vu32,$Vv32)", (V6_vrmpybusv VectorRegs:$Vd32, VectorRegs:$Vu32, VectorRegs:$Vv32)>, Requires<[UseHVX]>;
+def V6_vrmpybv_acc_altAlias : InstAlias<"$Vx32+=vrmpyb($Vu32,$Vv32)", (V6_vrmpybv_acc VectorRegs:$Vx32, VectorRegs:$Vu32, VectorRegs:$Vv32)>, Requires<[UseHVX]>;
+def V6_vrmpybv_acc_alt_128BAlias : InstAlias<"$Vx32+=vrmpyb($Vu32,$Vv32)", (V6_vrmpybv_acc VectorRegs:$Vx32, VectorRegs:$Vu32, VectorRegs:$Vv32)>, Requires<[UseHVX]>;
+def V6_vrmpybv_altAlias : InstAlias<"$Vd32=vrmpyb($Vu32,$Vv32)", (V6_vrmpybv VectorRegs:$Vd32, VectorRegs:$Vu32, VectorRegs:$Vv32)>, Requires<[UseHVX]>;
+def V6_vrmpybv_alt_128BAlias : InstAlias<"$Vd32=vrmpyb($Vu32,$Vv32)", (V6_vrmpybv VectorRegs:$Vd32, VectorRegs:$Vu32, VectorRegs:$Vv32)>, Requires<[UseHVX]>;
+def V6_vrmpyub_acc_altAlias : InstAlias<"$Vx32+=vrmpyub($Vu32,$Rt32)", (V6_vrmpyub_acc VectorRegs:$Vx32, VectorRegs:$Vu32, IntRegs:$Rt32)>, Requires<[UseHVX]>;
+def V6_vrmpyub_acc_alt_128BAlias : InstAlias<"$Vx32+=vrmpyub($Vu32,$Rt32)", (V6_vrmpyub_acc VectorRegs:$Vx32, VectorRegs:$Vu32, IntRegs:$Rt32)>, Requires<[UseHVX]>;
+def V6_vrmpyub_altAlias : InstAlias<"$Vd32=vrmpyub($Vu32,$Rt32)", (V6_vrmpyub VectorRegs:$Vd32, VectorRegs:$Vu32, IntRegs:$Rt32)>, Requires<[UseHVX]>;
+def V6_vrmpyub_alt_128BAlias : InstAlias<"$Vd32=vrmpyub($Vu32,$Rt32)", (V6_vrmpyub VectorRegs:$Vd32, VectorRegs:$Vu32, IntRegs:$Rt32)>, Requires<[UseHVX]>;
+def V6_vrmpyubi_acc_altAlias : InstAlias<"$Vxx32+=vrmpyub($Vuu32,$Rt32,#$Ii)", (V6_vrmpyubi_acc VecDblRegs:$Vxx32, VecDblRegs:$Vuu32, IntRegs:$Rt32, u1_0Imm:$Ii)>, Requires<[UseHVX]>;
+def V6_vrmpyubi_acc_alt_128BAlias : InstAlias<"$Vxx32+=vrmpyub($Vuu32,$Rt32,#$Ii)", (V6_vrmpyubi_acc VecDblRegs:$Vxx32, VecDblRegs:$Vuu32, IntRegs:$Rt32, u1_0Imm:$Ii)>, Requires<[UseHVX]>;
+def V6_vrmpyubi_altAlias : InstAlias<"$Vdd32=vrmpyub($Vuu32,$Rt32,#$Ii)", (V6_vrmpyubi VecDblRegs:$Vdd32, VecDblRegs:$Vuu32, IntRegs:$Rt32, u1_0Imm:$Ii)>, Requires<[UseHVX]>;
+def V6_vrmpyubi_alt_128BAlias : InstAlias<"$Vdd32=vrmpyub($Vuu32,$Rt32,#$Ii)", (V6_vrmpyubi VecDblRegs:$Vdd32, VecDblRegs:$Vuu32, IntRegs:$Rt32, u1_0Imm:$Ii)>, Requires<[UseHVX]>;
+def V6_vrmpyubv_acc_altAlias : InstAlias<"$Vx32+=vrmpyub($Vu32,$Vv32)", (V6_vrmpyubv_acc VectorRegs:$Vx32, VectorRegs:$Vu32, VectorRegs:$Vv32)>, Requires<[UseHVX]>;
+def V6_vrmpyubv_acc_alt_128BAlias : InstAlias<"$Vx32+=vrmpyub($Vu32,$Vv32)", (V6_vrmpyubv_acc VectorRegs:$Vx32, VectorRegs:$Vu32, VectorRegs:$Vv32)>, Requires<[UseHVX]>;
+def V6_vrmpyubv_altAlias : InstAlias<"$Vd32=vrmpyub($Vu32,$Vv32)", (V6_vrmpyubv VectorRegs:$Vd32, VectorRegs:$Vu32, VectorRegs:$Vv32)>, Requires<[UseHVX]>;
+def V6_vrmpyubv_alt_128BAlias : InstAlias<"$Vd32=vrmpyub($Vu32,$Vv32)", (V6_vrmpyubv VectorRegs:$Vd32, VectorRegs:$Vu32, VectorRegs:$Vv32)>, Requires<[UseHVX]>;
+def V6_vroundhb_altAlias : InstAlias<"$Vd32=vroundhb($Vu32,$Vv32):sat", (V6_vroundhb VectorRegs:$Vd32, VectorRegs:$Vu32, VectorRegs:$Vv32)>, Requires<[UseHVX]>;
+def V6_vroundhb_alt_128BAlias : InstAlias<"$Vd32=vroundhb($Vu32,$Vv32):sat", (V6_vroundhb VectorRegs:$Vd32, VectorRegs:$Vu32, VectorRegs:$Vv32)>, Requires<[UseHVX]>;
+def V6_vroundhub_altAlias : InstAlias<"$Vd32=vroundhub($Vu32,$Vv32):sat", (V6_vroundhub VectorRegs:$Vd32, VectorRegs:$Vu32, VectorRegs:$Vv32)>, Requires<[UseHVX]>;
+def V6_vroundhub_alt_128BAlias : InstAlias<"$Vd32=vroundhub($Vu32,$Vv32):sat", (V6_vroundhub VectorRegs:$Vd32, VectorRegs:$Vu32, VectorRegs:$Vv32)>, Requires<[UseHVX]>;
+def V6_vroundwh_altAlias : InstAlias<"$Vd32=vroundwh($Vu32,$Vv32):sat", (V6_vroundwh VectorRegs:$Vd32, VectorRegs:$Vu32, VectorRegs:$Vv32)>, Requires<[UseHVX]>;
+def V6_vroundwh_alt_128BAlias : InstAlias<"$Vd32=vroundwh($Vu32,$Vv32):sat", (V6_vroundwh VectorRegs:$Vd32, VectorRegs:$Vu32, VectorRegs:$Vv32)>, Requires<[UseHVX]>;
+def V6_vroundwuh_altAlias : InstAlias<"$Vd32=vroundwuh($Vu32,$Vv32):sat", (V6_vroundwuh VectorRegs:$Vd32, VectorRegs:$Vu32, VectorRegs:$Vv32)>, Requires<[UseHVX]>;
+def V6_vroundwuh_alt_128BAlias : InstAlias<"$Vd32=vroundwuh($Vu32,$Vv32):sat", (V6_vroundwuh VectorRegs:$Vd32, VectorRegs:$Vu32, VectorRegs:$Vv32)>, Requires<[UseHVX]>;
+def V6_vrsadubi_acc_altAlias : InstAlias<"$Vxx32+=vrsadub($Vuu32,$Rt32,#$Ii)", (V6_vrsadubi_acc VecDblRegs:$Vxx32, VecDblRegs:$Vuu32, IntRegs:$Rt32, u1_0Imm:$Ii)>, Requires<[UseHVX]>;
+def V6_vrsadubi_acc_alt_128BAlias : InstAlias<"$Vxx32+=vrsadub($Vuu32,$Rt32,#$Ii)", (V6_vrsadubi_acc VecDblRegs:$Vxx32, VecDblRegs:$Vuu32, IntRegs:$Rt32, u1_0Imm:$Ii)>, Requires<[UseHVX]>;
+def V6_vrsadubi_altAlias : InstAlias<"$Vdd32=vrsadub($Vuu32,$Rt32,#$Ii)", (V6_vrsadubi VecDblRegs:$Vdd32, VecDblRegs:$Vuu32, IntRegs:$Rt32, u1_0Imm:$Ii)>, Requires<[UseHVX]>;
+def V6_vrsadubi_alt_128BAlias : InstAlias<"$Vdd32=vrsadub($Vuu32,$Rt32,#$Ii)", (V6_vrsadubi VecDblRegs:$Vdd32, VecDblRegs:$Vuu32, IntRegs:$Rt32, u1_0Imm:$Ii)>, Requires<[UseHVX]>;
+def V6_vsathub_altAlias : InstAlias<"$Vd32=vsathub($Vu32,$Vv32)", (V6_vsathub VectorRegs:$Vd32, VectorRegs:$Vu32, VectorRegs:$Vv32)>, Requires<[UseHVX]>;
+def V6_vsathub_alt_128BAlias : InstAlias<"$Vd32=vsathub($Vu32,$Vv32)", (V6_vsathub VectorRegs:$Vd32, VectorRegs:$Vu32, VectorRegs:$Vv32)>, Requires<[UseHVX]>;
+def V6_vsatwh_altAlias : InstAlias<"$Vd32=vsatwh($Vu32,$Vv32)", (V6_vsatwh VectorRegs:$Vd32, VectorRegs:$Vu32, VectorRegs:$Vv32)>, Requires<[UseHVX]>;
+def V6_vsatwh_alt_128BAlias : InstAlias<"$Vd32=vsatwh($Vu32,$Vv32)", (V6_vsatwh VectorRegs:$Vd32, VectorRegs:$Vu32, VectorRegs:$Vv32)>, Requires<[UseHVX]>;
+def V6_vsb_altAlias : InstAlias<"$Vdd32=vsxtb($Vu32)", (V6_vsb VecDblRegs:$Vdd32, VectorRegs:$Vu32)>, Requires<[UseHVX]>;
+def V6_vsb_alt_128BAlias : InstAlias<"$Vdd32=vsxtb($Vu32)", (V6_vsb VecDblRegs:$Vdd32, VectorRegs:$Vu32)>, Requires<[UseHVX]>;
+def V6_vsh_altAlias : InstAlias<"$Vdd32=vsxth($Vu32)", (V6_vsh VecDblRegs:$Vdd32, VectorRegs:$Vu32)>, Requires<[UseHVX]>;
+def V6_vsh_alt_128BAlias : InstAlias<"$Vdd32=vsxth($Vu32)", (V6_vsh VecDblRegs:$Vdd32, VectorRegs:$Vu32)>, Requires<[UseHVX]>;
+def V6_vshufeh_altAlias : InstAlias<"$Vd32=vshuffeh($Vu32,$Vv32)", (V6_vshufeh VectorRegs:$Vd32, VectorRegs:$Vu32, VectorRegs:$Vv32)>, Requires<[UseHVX]>;
+def V6_vshufeh_alt_128BAlias : InstAlias<"$Vd32=vshuffeh($Vu32,$Vv32)", (V6_vshufeh VectorRegs:$Vd32, VectorRegs:$Vu32, VectorRegs:$Vv32)>, Requires<[UseHVX]>;
+def V6_vshuffb_altAlias : InstAlias<"$Vd32=vshuffb($Vu32)", (V6_vshuffb VectorRegs:$Vd32, VectorRegs:$Vu32)>, Requires<[UseHVX]>;
+def V6_vshuffb_alt_128BAlias : InstAlias<"$Vd32=vshuffb($Vu32)", (V6_vshuffb VectorRegs:$Vd32, VectorRegs:$Vu32)>, Requires<[UseHVX]>;
+def V6_vshuffeb_altAlias : InstAlias<"$Vd32=vshuffeb($Vu32,$Vv32)", (V6_vshuffeb VectorRegs:$Vd32, VectorRegs:$Vu32, VectorRegs:$Vv32)>, Requires<[UseHVX]>;
+def V6_vshuffeb_alt_128BAlias : InstAlias<"$Vd32=vshuffeb($Vu32,$Vv32)", (V6_vshuffeb VectorRegs:$Vd32, VectorRegs:$Vu32, VectorRegs:$Vv32)>, Requires<[UseHVX]>;
+def V6_vshuffh_altAlias : InstAlias<"$Vd32=vshuffh($Vu32)", (V6_vshuffh VectorRegs:$Vd32, VectorRegs:$Vu32)>, Requires<[UseHVX]>;
+def V6_vshuffh_alt_128BAlias : InstAlias<"$Vd32=vshuffh($Vu32)", (V6_vshuffh VectorRegs:$Vd32, VectorRegs:$Vu32)>, Requires<[UseHVX]>;
+def V6_vshuffob_altAlias : InstAlias<"$Vd32=vshuffob($Vu32,$Vv32)", (V6_vshuffob VectorRegs:$Vd32, VectorRegs:$Vu32, VectorRegs:$Vv32)>, Requires<[UseHVX]>;
+def V6_vshuffob_alt_128BAlias : InstAlias<"$Vd32=vshuffob($Vu32,$Vv32)", (V6_vshuffob VectorRegs:$Vd32, VectorRegs:$Vu32, VectorRegs:$Vv32)>, Requires<[UseHVX]>;
+def V6_vshufoeb_altAlias : InstAlias<"$Vdd32=vshuffoeb($Vu32,$Vv32)", (V6_vshufoeb VecDblRegs:$Vdd32, VectorRegs:$Vu32, VectorRegs:$Vv32)>, Requires<[UseHVX]>;
+def V6_vshufoeb_alt_128BAlias : InstAlias<"$Vdd32=vshuffoeb($Vu32,$Vv32)", (V6_vshufoeb VecDblRegs:$Vdd32, VectorRegs:$Vu32, VectorRegs:$Vv32)>, Requires<[UseHVX]>;
+def V6_vshufoeh_altAlias : InstAlias<"$Vdd32=vshuffoeh($Vu32,$Vv32)", (V6_vshufoeh VecDblRegs:$Vdd32, VectorRegs:$Vu32, VectorRegs:$Vv32)>, Requires<[UseHVX]>;
+def V6_vshufoeh_alt_128BAlias : InstAlias<"$Vdd32=vshuffoeh($Vu32,$Vv32)", (V6_vshufoeh VecDblRegs:$Vdd32, VectorRegs:$Vu32, VectorRegs:$Vv32)>, Requires<[UseHVX]>;
+def V6_vshufoh_altAlias : InstAlias<"$Vd32=vshuffoh($Vu32,$Vv32)", (V6_vshufoh VectorRegs:$Vd32, VectorRegs:$Vu32, VectorRegs:$Vv32)>, Requires<[UseHVX]>;
+def V6_vshufoh_alt_128BAlias : InstAlias<"$Vd32=vshuffoh($Vu32,$Vv32)", (V6_vshufoh VectorRegs:$Vd32, VectorRegs:$Vu32, VectorRegs:$Vv32)>, Requires<[UseHVX]>;
+def V6_vsubb_altAlias : InstAlias<"$Vd32=vsubb($Vu32,$Vv32)", (V6_vsubb VectorRegs:$Vd32, VectorRegs:$Vu32, VectorRegs:$Vv32)>, Requires<[UseHVX]>;
+def V6_vsubb_alt_128BAlias : InstAlias<"$Vd32=vsubb($Vu32,$Vv32)", (V6_vsubb VectorRegs:$Vd32, VectorRegs:$Vu32, VectorRegs:$Vv32)>, Requires<[UseHVX]>;
+def V6_vsubb_dv_altAlias : InstAlias<"$Vdd32=vsubb($Vuu32,$Vvv32)", (V6_vsubb_dv VecDblRegs:$Vdd32, VecDblRegs:$Vuu32, VecDblRegs:$Vvv32)>, Requires<[UseHVX]>;
+def V6_vsubb_dv_alt_128BAlias : InstAlias<"$Vdd32=vsubb($Vuu32,$Vvv32)", (V6_vsubb_dv VecDblRegs:$Vdd32, VecDblRegs:$Vuu32, VecDblRegs:$Vvv32)>, Requires<[UseHVX]>;
+def V6_vsubbnq_altAlias : InstAlias<"if (!$Qv4.b) $Vx32.b-=$Vu32.b", (V6_vsubbnq VectorRegs:$Vx32, VecPredRegs:$Qv4, VectorRegs:$Vu32)>, Requires<[UseHVX]>;
+def V6_vsubbnq_alt_128BAlias : InstAlias<"if (!$Qv4.b) $Vx32.b-=$Vu32.b", (V6_vsubbnq VectorRegs:$Vx32, VecPredRegs:$Qv4, VectorRegs:$Vu32)>, Requires<[UseHVX]>;
+def V6_vsubbq_altAlias : InstAlias<"if ($Qv4.b) $Vx32.b-=$Vu32.b", (V6_vsubbq VectorRegs:$Vx32, VecPredRegs:$Qv4, VectorRegs:$Vu32)>, Requires<[UseHVX]>;
+def V6_vsubbq_alt_128BAlias : InstAlias<"if ($Qv4.b) $Vx32.b-=$Vu32.b", (V6_vsubbq VectorRegs:$Vx32, VecPredRegs:$Qv4, VectorRegs:$Vu32)>, Requires<[UseHVX]>;
+def V6_vsubh_altAlias : InstAlias<"$Vd32=vsubh($Vu32,$Vv32)", (V6_vsubh VectorRegs:$Vd32, VectorRegs:$Vu32, VectorRegs:$Vv32)>, Requires<[UseHVX]>;
+def V6_vsubh_alt_128BAlias : InstAlias<"$Vd32=vsubh($Vu32,$Vv32)", (V6_vsubh VectorRegs:$Vd32, VectorRegs:$Vu32, VectorRegs:$Vv32)>, Requires<[UseHVX]>;
+def V6_vsubh_dv_altAlias : InstAlias<"$Vdd32=vsubh($Vuu32,$Vvv32)", (V6_vsubh_dv VecDblRegs:$Vdd32, VecDblRegs:$Vuu32, VecDblRegs:$Vvv32)>, Requires<[UseHVX]>;
+def V6_vsubh_dv_alt_128BAlias : InstAlias<"$Vdd32=vsubh($Vuu32,$Vvv32)", (V6_vsubh_dv VecDblRegs:$Vdd32, VecDblRegs:$Vuu32, VecDblRegs:$Vvv32)>, Requires<[UseHVX]>;
+def V6_vsubhnq_altAlias : InstAlias<"if (!$Qv4.h) $Vx32.h-=$Vu32.h", (V6_vsubhnq VectorRegs:$Vx32, VecPredRegs:$Qv4, VectorRegs:$Vu32)>, Requires<[UseHVX]>;
+def V6_vsubhnq_alt_128BAlias : InstAlias<"if (!$Qv4.h) $Vx32.h-=$Vu32.h", (V6_vsubhnq VectorRegs:$Vx32, VecPredRegs:$Qv4, VectorRegs:$Vu32)>, Requires<[UseHVX]>;
+def V6_vsubhq_altAlias : InstAlias<"if ($Qv4.h) $Vx32.h-=$Vu32.h", (V6_vsubhq VectorRegs:$Vx32, VecPredRegs:$Qv4, VectorRegs:$Vu32)>, Requires<[UseHVX]>;
+def V6_vsubhq_alt_128BAlias : InstAlias<"if ($Qv4.h) $Vx32.h-=$Vu32.h", (V6_vsubhq VectorRegs:$Vx32, VecPredRegs:$Qv4, VectorRegs:$Vu32)>, Requires<[UseHVX]>;
+def V6_vsubhsat_altAlias : InstAlias<"$Vd32=vsubh($Vu32,$Vv32):sat", (V6_vsubhsat VectorRegs:$Vd32, VectorRegs:$Vu32, VectorRegs:$Vv32)>, Requires<[UseHVX]>;
+def V6_vsubhsat_alt_128BAlias : InstAlias<"$Vd32=vsubh($Vu32,$Vv32):sat", (V6_vsubhsat VectorRegs:$Vd32, VectorRegs:$Vu32, VectorRegs:$Vv32)>, Requires<[UseHVX]>;
+def V6_vsubhsat_dv_altAlias : InstAlias<"$Vdd32=vsubh($Vuu32,$Vvv32):sat", (V6_vsubhsat_dv VecDblRegs:$Vdd32, VecDblRegs:$Vuu32, VecDblRegs:$Vvv32)>, Requires<[UseHVX]>;
+def V6_vsubhsat_dv_alt_128BAlias : InstAlias<"$Vdd32=vsubh($Vuu32,$Vvv32):sat", (V6_vsubhsat_dv VecDblRegs:$Vdd32, VecDblRegs:$Vuu32, VecDblRegs:$Vvv32)>, Requires<[UseHVX]>;
+def V6_vsubhw_altAlias : InstAlias<"$Vdd32=vsubh($Vu32,$Vv32)", (V6_vsubhw VecDblRegs:$Vdd32, VectorRegs:$Vu32, VectorRegs:$Vv32)>, Requires<[UseHVX]>;
+def V6_vsubhw_alt_128BAlias : InstAlias<"$Vdd32=vsubh($Vu32,$Vv32)", (V6_vsubhw VecDblRegs:$Vdd32, VectorRegs:$Vu32, VectorRegs:$Vv32)>, Requires<[UseHVX]>;
+def V6_vsububh_altAlias : InstAlias<"$Vdd32=vsubub($Vu32,$Vv32)", (V6_vsububh VecDblRegs:$Vdd32, VectorRegs:$Vu32, VectorRegs:$Vv32)>, Requires<[UseHVX]>;
+def V6_vsububh_alt_128BAlias : InstAlias<"$Vdd32=vsubub($Vu32,$Vv32)", (V6_vsububh VecDblRegs:$Vdd32, VectorRegs:$Vu32, VectorRegs:$Vv32)>, Requires<[UseHVX]>;
+def V6_vsububsat_altAlias : InstAlias<"$Vd32=vsubub($Vu32,$Vv32):sat", (V6_vsububsat VectorRegs:$Vd32, VectorRegs:$Vu32, VectorRegs:$Vv32)>, Requires<[UseHVX]>;
+def V6_vsububsat_alt_128BAlias : InstAlias<"$Vd32=vsubub($Vu32,$Vv32):sat", (V6_vsububsat VectorRegs:$Vd32, VectorRegs:$Vu32, VectorRegs:$Vv32)>, Requires<[UseHVX]>;
+def V6_vsububsat_dv_altAlias : InstAlias<"$Vdd32=vsubub($Vuu32,$Vvv32):sat", (V6_vsububsat_dv VecDblRegs:$Vdd32, VecDblRegs:$Vuu32, VecDblRegs:$Vvv32)>, Requires<[UseHVX]>;
+def V6_vsububsat_dv_alt_128BAlias : InstAlias<"$Vdd32=vsubub($Vuu32,$Vvv32):sat", (V6_vsububsat_dv VecDblRegs:$Vdd32, VecDblRegs:$Vuu32, VecDblRegs:$Vvv32)>, Requires<[UseHVX]>;
+def V6_vsubuhsat_altAlias : InstAlias<"$Vd32=vsubuh($Vu32,$Vv32):sat", (V6_vsubuhsat VectorRegs:$Vd32, VectorRegs:$Vu32, VectorRegs:$Vv32)>, Requires<[UseHVX]>;
+def V6_vsubuhsat_alt_128BAlias : InstAlias<"$Vd32=vsubuh($Vu32,$Vv32):sat", (V6_vsubuhsat VectorRegs:$Vd32, VectorRegs:$Vu32, VectorRegs:$Vv32)>, Requires<[UseHVX]>;
+def V6_vsubuhsat_dv_altAlias : InstAlias<"$Vdd32=vsubuh($Vuu32,$Vvv32):sat", (V6_vsubuhsat_dv VecDblRegs:$Vdd32, VecDblRegs:$Vuu32, VecDblRegs:$Vvv32)>, Requires<[UseHVX]>;
+def V6_vsubuhsat_dv_alt_128BAlias : InstAlias<"$Vdd32=vsubuh($Vuu32,$Vvv32):sat", (V6_vsubuhsat_dv VecDblRegs:$Vdd32, VecDblRegs:$Vuu32, VecDblRegs:$Vvv32)>, Requires<[UseHVX]>;
+def V6_vsubuhw_altAlias : InstAlias<"$Vdd32=vsubuh($Vu32,$Vv32)", (V6_vsubuhw VecDblRegs:$Vdd32, VectorRegs:$Vu32, VectorRegs:$Vv32)>, Requires<[UseHVX]>;
+def V6_vsubuhw_alt_128BAlias : InstAlias<"$Vdd32=vsubuh($Vu32,$Vv32)", (V6_vsubuhw VecDblRegs:$Vdd32, VectorRegs:$Vu32, VectorRegs:$Vv32)>, Requires<[UseHVX]>;
+def V6_vsubw_altAlias : InstAlias<"$Vd32=vsubw($Vu32,$Vv32)", (V6_vsubw VectorRegs:$Vd32, VectorRegs:$Vu32, VectorRegs:$Vv32)>, Requires<[UseHVX]>;
+def V6_vsubw_alt_128BAlias : InstAlias<"$Vd32=vsubw($Vu32,$Vv32)", (V6_vsubw VectorRegs:$Vd32, VectorRegs:$Vu32, VectorRegs:$Vv32)>, Requires<[UseHVX]>;
+def V6_vsubw_dv_altAlias : InstAlias<"$Vdd32=vsubw($Vuu32,$Vvv32)", (V6_vsubw_dv VecDblRegs:$Vdd32, VecDblRegs:$Vuu32, VecDblRegs:$Vvv32)>, Requires<[UseHVX]>;
+def V6_vsubw_dv_alt_128BAlias : InstAlias<"$Vdd32=vsubw($Vuu32,$Vvv32)", (V6_vsubw_dv VecDblRegs:$Vdd32, VecDblRegs:$Vuu32, VecDblRegs:$Vvv32)>, Requires<[UseHVX]>;
+def V6_vsubwnq_altAlias : InstAlias<"if (!$Qv4.w) $Vx32.w-=$Vu32.w", (V6_vsubwnq VectorRegs:$Vx32, VecPredRegs:$Qv4, VectorRegs:$Vu32)>, Requires<[UseHVX]>;
+def V6_vsubwnq_alt_128BAlias : InstAlias<"if (!$Qv4.w) $Vx32.w-=$Vu32.w", (V6_vsubwnq VectorRegs:$Vx32, VecPredRegs:$Qv4, VectorRegs:$Vu32)>, Requires<[UseHVX]>;
+def V6_vsubwq_altAlias : InstAlias<"if ($Qv4.w) $Vx32.w-=$Vu32.w", (V6_vsubwq VectorRegs:$Vx32, VecPredRegs:$Qv4, VectorRegs:$Vu32)>, Requires<[UseHVX]>;
+def V6_vsubwq_alt_128BAlias : InstAlias<"if ($Qv4.w) $Vx32.w-=$Vu32.w", (V6_vsubwq VectorRegs:$Vx32, VecPredRegs:$Qv4, VectorRegs:$Vu32)>, Requires<[UseHVX]>;
+def V6_vsubwsat_altAlias : InstAlias<"$Vd32=vsubw($Vu32,$Vv32):sat", (V6_vsubwsat VectorRegs:$Vd32, VectorRegs:$Vu32, VectorRegs:$Vv32)>, Requires<[UseHVX]>;
+def V6_vsubwsat_alt_128BAlias : InstAlias<"$Vd32=vsubw($Vu32,$Vv32):sat", (V6_vsubwsat VectorRegs:$Vd32, VectorRegs:$Vu32, VectorRegs:$Vv32)>, Requires<[UseHVX]>;
+def V6_vsubwsat_dv_altAlias : InstAlias<"$Vdd32=vsubw($Vuu32,$Vvv32):sat", (V6_vsubwsat_dv VecDblRegs:$Vdd32, VecDblRegs:$Vuu32, VecDblRegs:$Vvv32)>, Requires<[UseHVX]>;
+def V6_vsubwsat_dv_alt_128BAlias : InstAlias<"$Vdd32=vsubw($Vuu32,$Vvv32):sat", (V6_vsubwsat_dv VecDblRegs:$Vdd32, VecDblRegs:$Vuu32, VecDblRegs:$Vvv32)>, Requires<[UseHVX]>;
+def V6_vtmpyb_acc_altAlias : InstAlias<"$Vxx32+=vtmpyb($Vuu32,$Rt32)", (V6_vtmpyb_acc VecDblRegs:$Vxx32, VecDblRegs:$Vuu32, IntRegs:$Rt32)>, Requires<[UseHVX]>;
+def V6_vtmpyb_acc_alt_128BAlias : InstAlias<"$Vxx32+=vtmpyb($Vuu32,$Rt32)", (V6_vtmpyb_acc VecDblRegs:$Vxx32, VecDblRegs:$Vuu32, IntRegs:$Rt32)>, Requires<[UseHVX]>;
+def V6_vtmpyb_altAlias : InstAlias<"$Vdd32=vtmpyb($Vuu32,$Rt32)", (V6_vtmpyb VecDblRegs:$Vdd32, VecDblRegs:$Vuu32, IntRegs:$Rt32)>, Requires<[UseHVX]>;
+def V6_vtmpyb_alt_128BAlias : InstAlias<"$Vdd32=vtmpyb($Vuu32,$Rt32)", (V6_vtmpyb VecDblRegs:$Vdd32, VecDblRegs:$Vuu32, IntRegs:$Rt32)>, Requires<[UseHVX]>;
+def V6_vtmpybus_acc_altAlias : InstAlias<"$Vxx32+=vtmpybus($Vuu32,$Rt32)", (V6_vtmpybus_acc VecDblRegs:$Vxx32, VecDblRegs:$Vuu32, IntRegs:$Rt32)>, Requires<[UseHVX]>;
+def V6_vtmpybus_acc_alt_128BAlias : InstAlias<"$Vxx32+=vtmpybus($Vuu32,$Rt32)", (V6_vtmpybus_acc VecDblRegs:$Vxx32, VecDblRegs:$Vuu32, IntRegs:$Rt32)>, Requires<[UseHVX]>;
+def V6_vtmpybus_altAlias : InstAlias<"$Vdd32=vtmpybus($Vuu32,$Rt32)", (V6_vtmpybus VecDblRegs:$Vdd32, VecDblRegs:$Vuu32, IntRegs:$Rt32)>, Requires<[UseHVX]>;
+def V6_vtmpybus_alt_128BAlias : InstAlias<"$Vdd32=vtmpybus($Vuu32,$Rt32)", (V6_vtmpybus VecDblRegs:$Vdd32, VecDblRegs:$Vuu32, IntRegs:$Rt32)>, Requires<[UseHVX]>;
+def V6_vtmpyhb_acc_altAlias : InstAlias<"$Vxx32+=vtmpyhb($Vuu32,$Rt32)", (V6_vtmpyhb_acc VecDblRegs:$Vxx32, VecDblRegs:$Vuu32, IntRegs:$Rt32)>, Requires<[UseHVX]>;
+def V6_vtmpyhb_acc_alt_128BAlias : InstAlias<"$Vxx32+=vtmpyhb($Vuu32,$Rt32)", (V6_vtmpyhb_acc VecDblRegs:$Vxx32, VecDblRegs:$Vuu32, IntRegs:$Rt32)>, Requires<[UseHVX]>;
+def V6_vtmpyhb_altAlias : InstAlias<"$Vdd32=vtmpyhb($Vuu32,$Rt32)", (V6_vtmpyhb VecDblRegs:$Vdd32, VecDblRegs:$Vuu32, IntRegs:$Rt32)>, Requires<[UseHVX]>;
+def V6_vtmpyhb_alt_128BAlias : InstAlias<"$Vdd32=vtmpyhb($Vuu32,$Rt32)", (V6_vtmpyhb VecDblRegs:$Vdd32, VecDblRegs:$Vuu32, IntRegs:$Rt32)>, Requires<[UseHVX]>;
+def V6_vtran2x2_mapAlias : InstAlias<"vtrans2x2($Vy32,$Vx32,$Rt32)", (V6_vshuff VectorRegs:$Vy32, VectorRegs:$Vx32, IntRegs:$Rt32)>, Requires<[UseHVX]>;
+def V6_vtran2x2_map_128BAlias : InstAlias<"vtrans2x2($Vy32,$Vx32,$Rt32)", (V6_vshuff VectorRegs:$Vy32, VectorRegs:$Vx32, IntRegs:$Rt32)>, Requires<[UseHVX]>;
+def V6_vunpackb_altAlias : InstAlias<"$Vdd32=vunpackb($Vu32)", (V6_vunpackb VecDblRegs:$Vdd32, VectorRegs:$Vu32)>, Requires<[UseHVX]>;
+def V6_vunpackb_alt_128BAlias : InstAlias<"$Vdd32=vunpackb($Vu32)", (V6_vunpackb VecDblRegs:$Vdd32, VectorRegs:$Vu32)>, Requires<[UseHVX]>;
+def V6_vunpackh_altAlias : InstAlias<"$Vdd32=vunpackh($Vu32)", (V6_vunpackh VecDblRegs:$Vdd32, VectorRegs:$Vu32)>, Requires<[UseHVX]>;
+def V6_vunpackh_alt_128BAlias : InstAlias<"$Vdd32=vunpackh($Vu32)", (V6_vunpackh VecDblRegs:$Vdd32, VectorRegs:$Vu32)>, Requires<[UseHVX]>;
+def V6_vunpackoh_altAlias : InstAlias<"$Vxx32|=vunpackoh($Vu32)", (V6_vunpackoh VecDblRegs:$Vxx32, VectorRegs:$Vu32)>, Requires<[UseHVX]>;
+def V6_vunpackoh_alt_128BAlias : InstAlias<"$Vxx32|=vunpackoh($Vu32)", (V6_vunpackoh VecDblRegs:$Vxx32, VectorRegs:$Vu32)>, Requires<[UseHVX]>;
+def V6_vunpackub_altAlias : InstAlias<"$Vdd32=vunpackub($Vu32)", (V6_vunpackub VecDblRegs:$Vdd32, VectorRegs:$Vu32)>, Requires<[UseHVX]>;
+def V6_vunpackub_alt_128BAlias : InstAlias<"$Vdd32=vunpackub($Vu32)", (V6_vunpackub VecDblRegs:$Vdd32, VectorRegs:$Vu32)>, Requires<[UseHVX]>;
+def V6_vunpackuh_altAlias : InstAlias<"$Vdd32=vunpackuh($Vu32)", (V6_vunpackuh VecDblRegs:$Vdd32, VectorRegs:$Vu32)>, Requires<[UseHVX]>;
+def V6_vunpackuh_alt_128BAlias : InstAlias<"$Vdd32=vunpackuh($Vu32)", (V6_vunpackuh VecDblRegs:$Vdd32, VectorRegs:$Vu32)>, Requires<[UseHVX]>;
+def V6_vzb_altAlias : InstAlias<"$Vdd32=vzxtb($Vu32)", (V6_vzb VecDblRegs:$Vdd32, VectorRegs:$Vu32)>, Requires<[UseHVX]>;
+def V6_vzb_alt_128BAlias : InstAlias<"$Vdd32=vzxtb($Vu32)", (V6_vzb VecDblRegs:$Vdd32, VectorRegs:$Vu32)>, Requires<[UseHVX]>;
+def V6_vzh_altAlias : InstAlias<"$Vdd32=vzxth($Vu32)", (V6_vzh VecDblRegs:$Vdd32, VectorRegs:$Vu32)>, Requires<[UseHVX]>;
+def V6_vzh_alt_128BAlias : InstAlias<"$Vdd32=vzxth($Vu32)", (V6_vzh VecDblRegs:$Vdd32, VectorRegs:$Vu32)>, Requires<[UseHVX]>;
+def Y2_dcfetchAlias : InstAlias<"dcfetch($Rs32)", (Y2_dcfetchbo IntRegs:$Rs32, 0)>;
diff --git a/lib/Target/Hexagon/HexagonDepOperands.td b/lib/Target/Hexagon/HexagonDepOperands.td
new file mode 100644
index 000000000000..0e83b2678732
--- /dev/null
+++ b/lib/Target/Hexagon/HexagonDepOperands.td
@@ -0,0 +1,132 @@
+//===--- HexagonDepOperands.td --------------------------------------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+def s3_0ImmOperand : AsmOperandClass { let Name = "s3_0Imm"; let RenderMethod = "addSignedImmOperands"; }
+def s3_0Imm : Operand<i32> { let ParserMatchClass = s3_0ImmOperand; let DecoderMethod = "s3_0ImmDecoder"; }
+def s3_0ImmPred : PatLeaf<(i32 imm), [{ return isShiftedInt<3, 0>(N->getSExtValue());}]>;
+def s4_0ImmOperand : AsmOperandClass { let Name = "s4_0Imm"; let RenderMethod = "addSignedImmOperands"; }
+def s4_0Imm : Operand<i32> { let ParserMatchClass = s4_0ImmOperand; let DecoderMethod = "s4_0ImmDecoder"; }
+def s4_0ImmPred : PatLeaf<(i32 imm), [{ return isShiftedInt<4, 0>(N->getSExtValue());}]>;
+def s29_3ImmOperand : AsmOperandClass { let Name = "s29_3Imm"; let RenderMethod = "addSignedImmOperands"; }
+def s29_3Imm : Operand<i32> { let ParserMatchClass = s29_3ImmOperand; let DecoderMethod = "s29_3ImmDecoder"; }
+def s29_3ImmPred : PatLeaf<(i32 imm), [{ return isShiftedInt<32, 3>(N->getSExtValue());}]>;
+def s10_6ImmOperand : AsmOperandClass { let Name = "s10_6Imm"; let RenderMethod = "addSignedImmOperands"; }
+def s10_6Imm : Operand<i32> { let ParserMatchClass = s10_6ImmOperand; let DecoderMethod = "s10_6ImmDecoder"; }
+def s10_6ImmPred : PatLeaf<(i32 imm), [{ return isShiftedInt<10, 6>(N->getSExtValue());}]>;
+def u6_0ImmOperand : AsmOperandClass { let Name = "u6_0Imm"; let RenderMethod = "addImmOperands"; }
+def u6_0Imm : Operand<i32> { let ParserMatchClass = u6_0ImmOperand; let DecoderMethod = "unsignedImmDecoder"; }
+def u6_0ImmPred : PatLeaf<(i32 imm), [{ return isShiftedUInt<6, 0>(N->getSExtValue());}]>;
+def a30_2ImmOperand : AsmOperandClass { let Name = "a30_2Imm"; let RenderMethod = "addSignedImmOperands"; }
+def a30_2Imm : Operand<i32> { let ParserMatchClass = a30_2ImmOperand; let DecoderMethod = "brtargetDecoder"; let PrintMethod = "printBrtarget"; }
+def a30_2ImmPred : PatLeaf<(i32 imm), [{ return isShiftedInt<32, 2>(N->getSExtValue());}]>;
+def u29_3ImmOperand : AsmOperandClass { let Name = "u29_3Imm"; let RenderMethod = "addImmOperands"; }
+def u29_3Imm : Operand<i32> { let ParserMatchClass = u29_3ImmOperand; let DecoderMethod = "unsignedImmDecoder"; }
+def u29_3ImmPred : PatLeaf<(i32 imm), [{ return isShiftedUInt<32, 3>(N->getSExtValue());}]>;
+def s8_0ImmOperand : AsmOperandClass { let Name = "s8_0Imm"; let RenderMethod = "addSignedImmOperands"; }
+def s8_0Imm : Operand<i32> { let ParserMatchClass = s8_0ImmOperand; let DecoderMethod = "s8_0ImmDecoder"; }
+def s8_0ImmPred : PatLeaf<(i32 imm), [{ return isShiftedInt<8, 0>(N->getSExtValue());}]>;
+def u32_0ImmOperand : AsmOperandClass { let Name = "u32_0Imm"; let RenderMethod = "addImmOperands"; }
+def u32_0Imm : Operand<i32> { let ParserMatchClass = u32_0ImmOperand; let DecoderMethod = "unsignedImmDecoder"; }
+def u32_0ImmPred : PatLeaf<(i32 imm), [{ return isShiftedUInt<32, 0>(N->getSExtValue());}]>;
+def u4_2ImmOperand : AsmOperandClass { let Name = "u4_2Imm"; let RenderMethod = "addImmOperands"; }
+def u4_2Imm : Operand<i32> { let ParserMatchClass = u4_2ImmOperand; let DecoderMethod = "unsignedImmDecoder"; }
+def u4_2ImmPred : PatLeaf<(i32 imm), [{ return isShiftedUInt<4, 2>(N->getSExtValue());}]>;
+def u3_0ImmOperand : AsmOperandClass { let Name = "u3_0Imm"; let RenderMethod = "addImmOperands"; }
+def u3_0Imm : Operand<i32> { let ParserMatchClass = u3_0ImmOperand; let DecoderMethod = "unsignedImmDecoder"; }
+def u3_0ImmPred : PatLeaf<(i32 imm), [{ return isShiftedUInt<3, 0>(N->getSExtValue());}]>;
+def b15_2ImmOperand : AsmOperandClass { let Name = "b15_2Imm"; let RenderMethod = "addSignedImmOperands"; }
+def b15_2Imm : Operand<OtherVT> { let ParserMatchClass = b15_2ImmOperand; let DecoderMethod = "brtargetDecoder"; let PrintMethod = "printBrtarget"; }
+def b15_2ImmPred : PatLeaf<(i32 imm), [{ return isShiftedInt<15, 2>(N->getSExtValue());}]>;
+def u11_3ImmOperand : AsmOperandClass { let Name = "u11_3Imm"; let RenderMethod = "addImmOperands"; }
+def u11_3Imm : Operand<i32> { let ParserMatchClass = u11_3ImmOperand; let DecoderMethod = "unsignedImmDecoder"; }
+def u11_3ImmPred : PatLeaf<(i32 imm), [{ return isShiftedUInt<11, 3>(N->getSExtValue());}]>;
+def s4_3ImmOperand : AsmOperandClass { let Name = "s4_3Imm"; let RenderMethod = "addSignedImmOperands"; }
+def s4_3Imm : Operand<i32> { let ParserMatchClass = s4_3ImmOperand; let DecoderMethod = "s4_3ImmDecoder"; }
+def s4_3ImmPred : PatLeaf<(i32 imm), [{ return isShiftedInt<4, 3>(N->getSExtValue());}]>;
+def m32_0ImmOperand : AsmOperandClass { let Name = "m32_0Imm"; let RenderMethod = "addImmOperands"; }
+def m32_0Imm : Operand<i32> { let ParserMatchClass = m32_0ImmOperand; let DecoderMethod = "unsignedImmDecoder"; }
+def m32_0ImmPred : PatLeaf<(i32 imm), [{ return isShiftedInt<32, 0>(N->getSExtValue());}]>;
+def u3_1ImmOperand : AsmOperandClass { let Name = "u3_1Imm"; let RenderMethod = "addImmOperands"; }
+def u3_1Imm : Operand<i32> { let ParserMatchClass = u3_1ImmOperand; let DecoderMethod = "unsignedImmDecoder"; }
+def u3_1ImmPred : PatLeaf<(i32 imm), [{ return isShiftedUInt<3, 1>(N->getSExtValue());}]>;
+def u1_0ImmOperand : AsmOperandClass { let Name = "u1_0Imm"; let RenderMethod = "addImmOperands"; }
+def u1_0Imm : Operand<i32> { let ParserMatchClass = u1_0ImmOperand; let DecoderMethod = "unsignedImmDecoder"; }
+def u1_0ImmPred : PatLeaf<(i32 imm), [{ return isShiftedUInt<1, 0>(N->getSExtValue());}]>;
+def s31_1ImmOperand : AsmOperandClass { let Name = "s31_1Imm"; let RenderMethod = "addSignedImmOperands"; }
+def s31_1Imm : Operand<i32> { let ParserMatchClass = s31_1ImmOperand; let DecoderMethod = "s31_1ImmDecoder"; }
+def s31_1ImmPred : PatLeaf<(i32 imm), [{ return isShiftedInt<32, 1>(N->getSExtValue());}]>;
+def s30_2ImmOperand : AsmOperandClass { let Name = "s30_2Imm"; let RenderMethod = "addSignedImmOperands"; }
+def s30_2Imm : Operand<i32> { let ParserMatchClass = s30_2ImmOperand; let DecoderMethod = "s30_2ImmDecoder"; }
+def s30_2ImmPred : PatLeaf<(i32 imm), [{ return isShiftedInt<32, 2>(N->getSExtValue());}]>;
+def u4_0ImmOperand : AsmOperandClass { let Name = "u4_0Imm"; let RenderMethod = "addImmOperands"; }
+def u4_0Imm : Operand<i32> { let ParserMatchClass = u4_0ImmOperand; let DecoderMethod = "unsignedImmDecoder"; }
+def u4_0ImmPred : PatLeaf<(i32 imm), [{ return isShiftedUInt<4, 0>(N->getSExtValue());}]>;
+def s6_0ImmOperand : AsmOperandClass { let Name = "s6_0Imm"; let RenderMethod = "addSignedImmOperands"; }
+def s6_0Imm : Operand<i32> { let ParserMatchClass = s6_0ImmOperand; let DecoderMethod = "s6_0ImmDecoder"; }
+def s6_0ImmPred : PatLeaf<(i32 imm), [{ return isShiftedInt<6, 0>(N->getSExtValue());}]>;
+def u5_3ImmOperand : AsmOperandClass { let Name = "u5_3Imm"; let RenderMethod = "addImmOperands"; }
+def u5_3Imm : Operand<i32> { let ParserMatchClass = u5_3ImmOperand; let DecoderMethod = "unsignedImmDecoder"; }
+def u5_3ImmPred : PatLeaf<(i32 imm), [{ return isShiftedUInt<5, 3>(N->getSExtValue());}]>;
+def s32_0ImmOperand : AsmOperandClass { let Name = "s32_0Imm"; let RenderMethod = "addSignedImmOperands"; }
+def s32_0Imm : Operand<i32> { let ParserMatchClass = s32_0ImmOperand; let DecoderMethod = "s32_0ImmDecoder"; }
+def s32_0ImmPred : PatLeaf<(i32 imm), [{ return isShiftedInt<32, 0>(N->getSExtValue());}]>;
+def s6_3ImmOperand : AsmOperandClass { let Name = "s6_3Imm"; let RenderMethod = "addSignedImmOperands"; }
+def s6_3Imm : Operand<i32> { let ParserMatchClass = s6_3ImmOperand; let DecoderMethod = "s6_3ImmDecoder"; }
+def s6_3ImmPred : PatLeaf<(i32 imm), [{ return isShiftedInt<6, 3>(N->getSExtValue());}]>;
+def u10_0ImmOperand : AsmOperandClass { let Name = "u10_0Imm"; let RenderMethod = "addImmOperands"; }
+def u10_0Imm : Operand<i32> { let ParserMatchClass = u10_0ImmOperand; let DecoderMethod = "unsignedImmDecoder"; }
+def u10_0ImmPred : PatLeaf<(i32 imm), [{ return isShiftedUInt<10, 0>(N->getSExtValue());}]>;
+def u31_1ImmOperand : AsmOperandClass { let Name = "u31_1Imm"; let RenderMethod = "addImmOperands"; }
+def u31_1Imm : Operand<i32> { let ParserMatchClass = u31_1ImmOperand; let DecoderMethod = "unsignedImmDecoder"; }
+def u31_1ImmPred : PatLeaf<(i32 imm), [{ return isShiftedUInt<32, 1>(N->getSExtValue());}]>;
+def s4_1ImmOperand : AsmOperandClass { let Name = "s4_1Imm"; let RenderMethod = "addSignedImmOperands"; }
+def s4_1Imm : Operand<i32> { let ParserMatchClass = s4_1ImmOperand; let DecoderMethod = "s4_1ImmDecoder"; }
+def s4_1ImmPred : PatLeaf<(i32 imm), [{ return isShiftedInt<4, 1>(N->getSExtValue());}]>;
+def u16_0ImmOperand : AsmOperandClass { let Name = "u16_0Imm"; let RenderMethod = "addImmOperands"; }
+def u16_0Imm : Operand<i32> { let ParserMatchClass = u16_0ImmOperand; let DecoderMethod = "unsignedImmDecoder"; }
+def u16_0ImmPred : PatLeaf<(i32 imm), [{ return isShiftedUInt<16, 0>(N->getSExtValue());}]>;
+def u6_1ImmOperand : AsmOperandClass { let Name = "u6_1Imm"; let RenderMethod = "addImmOperands"; }
+def u6_1Imm : Operand<i32> { let ParserMatchClass = u6_1ImmOperand; let DecoderMethod = "unsignedImmDecoder"; }
+def u6_1ImmPred : PatLeaf<(i32 imm), [{ return isShiftedUInt<6, 1>(N->getSExtValue());}]>;
+def u5_2ImmOperand : AsmOperandClass { let Name = "u5_2Imm"; let RenderMethod = "addImmOperands"; }
+def u5_2Imm : Operand<i32> { let ParserMatchClass = u5_2ImmOperand; let DecoderMethod = "unsignedImmDecoder"; }
+def u5_2ImmPred : PatLeaf<(i32 imm), [{ return isShiftedUInt<5, 2>(N->getSExtValue());}]>;
+def u26_6ImmOperand : AsmOperandClass { let Name = "u26_6Imm"; let RenderMethod = "addImmOperands"; }
+def u26_6Imm : Operand<i32> { let ParserMatchClass = u26_6ImmOperand; let DecoderMethod = "unsignedImmDecoder"; }
+def u26_6ImmPred : PatLeaf<(i32 imm), [{ return isShiftedUInt<26, 6>(N->getSExtValue());}]>;
+def u6_2ImmOperand : AsmOperandClass { let Name = "u6_2Imm"; let RenderMethod = "addImmOperands"; }
+def u6_2Imm : Operand<i32> { let ParserMatchClass = u6_2ImmOperand; let DecoderMethod = "unsignedImmDecoder"; }
+def u6_2ImmPred : PatLeaf<(i32 imm), [{ return isShiftedUInt<6, 2>(N->getSExtValue());}]>;
+def u7_0ImmOperand : AsmOperandClass { let Name = "u7_0Imm"; let RenderMethod = "addImmOperands"; }
+def u7_0Imm : Operand<i32> { let ParserMatchClass = u7_0ImmOperand; let DecoderMethod = "unsignedImmDecoder"; }
+def u7_0ImmPred : PatLeaf<(i32 imm), [{ return isShiftedUInt<7, 0>(N->getSExtValue());}]>;
+def b13_2ImmOperand : AsmOperandClass { let Name = "b13_2Imm"; let RenderMethod = "addSignedImmOperands"; }
+def b13_2Imm : Operand<OtherVT> { let ParserMatchClass = b13_2ImmOperand; let DecoderMethod = "brtargetDecoder"; let PrintMethod = "printBrtarget"; }
+def b13_2ImmPred : PatLeaf<(i32 imm), [{ return isShiftedInt<13, 2>(N->getSExtValue());}]>;
+def u5_0ImmOperand : AsmOperandClass { let Name = "u5_0Imm"; let RenderMethod = "addImmOperands"; }
+def u5_0Imm : Operand<i32> { let ParserMatchClass = u5_0ImmOperand; let DecoderMethod = "unsignedImmDecoder"; }
+def u5_0ImmPred : PatLeaf<(i32 imm), [{ return isShiftedUInt<5, 0>(N->getSExtValue());}]>;
+def u2_0ImmOperand : AsmOperandClass { let Name = "u2_0Imm"; let RenderMethod = "addImmOperands"; }
+def u2_0Imm : Operand<i32> { let ParserMatchClass = u2_0ImmOperand; let DecoderMethod = "unsignedImmDecoder"; }
+def u2_0ImmPred : PatLeaf<(i32 imm), [{ return isShiftedUInt<2, 0>(N->getSExtValue());}]>;
+def s4_2ImmOperand : AsmOperandClass { let Name = "s4_2Imm"; let RenderMethod = "addSignedImmOperands"; }
+def s4_2Imm : Operand<i32> { let ParserMatchClass = s4_2ImmOperand; let DecoderMethod = "s4_2ImmDecoder"; }
+def s4_2ImmPred : PatLeaf<(i32 imm), [{ return isShiftedInt<4, 2>(N->getSExtValue());}]>;
+def b30_2ImmOperand : AsmOperandClass { let Name = "b30_2Imm"; let RenderMethod = "addSignedImmOperands"; }
+def b30_2Imm : Operand<OtherVT> { let ParserMatchClass = b30_2ImmOperand; let DecoderMethod = "brtargetDecoder"; let PrintMethod = "printBrtarget"; }
+def b30_2ImmPred : PatLeaf<(i32 imm), [{ return isShiftedInt<32, 2>(N->getSExtValue());}]>;
+def u8_0ImmOperand : AsmOperandClass { let Name = "u8_0Imm"; let RenderMethod = "addImmOperands"; }
+def u8_0Imm : Operand<i32> { let ParserMatchClass = u8_0ImmOperand; let DecoderMethod = "unsignedImmDecoder"; }
+def u8_0ImmPred : PatLeaf<(i32 imm), [{ return isShiftedUInt<8, 0>(N->getSExtValue());}]>;
+def u30_2ImmOperand : AsmOperandClass { let Name = "u30_2Imm"; let RenderMethod = "addImmOperands"; }
+def u30_2Imm : Operand<i32> { let ParserMatchClass = u30_2ImmOperand; let DecoderMethod = "unsignedImmDecoder"; }
+def u30_2ImmPred : PatLeaf<(i32 imm), [{ return isShiftedUInt<32, 2>(N->getSExtValue());}]>;
+def s10_0ImmOperand : AsmOperandClass { let Name = "s10_0Imm"; let RenderMethod = "addSignedImmOperands"; }
+def s10_0Imm : Operand<i32> { let ParserMatchClass = s10_0ImmOperand; let DecoderMethod = "s10_0ImmDecoder"; }
+def s10_0ImmPred : PatLeaf<(i32 imm), [{ return isShiftedInt<10, 0>(N->getSExtValue());}]>;
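A note on the naming convention above, for readers of this generated file: an
sN_SImm operand accepts signed values that are multiples of 2^S whose quotient
fits in N bits, and uN_S is the unsigned analogue; this is what the
isShiftedInt/isShiftedUInt predicates check (a few extendable operands, e.g.
s29_3Imm, deliberately use the wider <32, S> form). A minimal sketch of that
check, re-implemented purely for illustration; the real helpers live in
llvm/Support/MathExtras.h:

    #include <cassert>
    #include <cstdint>

    // Illustrative stand-in for llvm::isShiftedInt<N, S>: true iff x is a
    // multiple of 2^S whose payload (x >> S) fits in an N-bit signed field.
    template <unsigned N, unsigned S>
    bool isShiftedIntSketch(int64_t x) {
      if (x & ((int64_t(1) << S) - 1))  // the low S bits must be zero
        return false;
      int64_t Q = x >> S;
      return Q >= -(int64_t(1) << (N - 1)) && Q < (int64_t(1) << (N - 1));
    }

    int main() {
      assert(isShiftedIntSketch<4, 2>(28));   // s4_2Imm: 28 == 7 << 2
      assert(!isShiftedIntSketch<4, 2>(30));  // not a multiple of 4
      assert(!isShiftedIntSketch<4, 2>(32));  // 8 << 2, but 8 exceeds s4
      assert(isShiftedIntSketch<4, 2>(-32));  // -8 << 2, -8 fits in s4
    }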
diff --git a/lib/Target/Hexagon/HexagonEarlyIfConv.cpp b/lib/Target/Hexagon/HexagonEarlyIfConv.cpp
index a5351cd08da5..67af947e089d 100644
--- a/lib/Target/Hexagon/HexagonEarlyIfConv.cpp
+++ b/lib/Target/Hexagon/HexagonEarlyIfConv.cpp
@@ -105,6 +105,8 @@ namespace {
cl::init(false), cl::desc("Enable branch probability info"));
cl::opt<unsigned> SizeLimit("eif-limit", cl::init(6), cl::Hidden,
cl::desc("Size limit in Hexagon early if-conversion"));
+ cl::opt<bool> SkipExitBranches("eif-no-loop-exit", cl::init(false),
+ cl::Hidden, cl::desc("Do not convert branches that may exit the loop"));
struct PrintMB {
PrintMB(const MachineBasicBlock *B) : MB(B) {}
@@ -142,8 +144,8 @@ namespace {
raw_ostream &operator<<(raw_ostream &OS, const PrintFP &P) {
OS << "{ SplitB:" << PrintMB(P.FP.SplitB)
<< ", PredR:" << PrintReg(P.FP.PredR, &P.TRI)
- << ", TrueB:" << PrintMB(P.FP.TrueB) << ", FalseB:"
- << PrintMB(P.FP.FalseB)
+ << ", TrueB:" << PrintMB(P.FP.TrueB)
+ << ", FalseB:" << PrintMB(P.FP.FalseB)
<< ", JoinB:" << PrintMB(P.FP.JoinB) << " }";
return OS;
}
@@ -187,7 +189,8 @@ namespace {
bool usesUndefVReg(const MachineInstr *MI) const;
bool isValid(const FlowPattern &FP) const;
unsigned countPredicateDefs(const MachineBasicBlock *B) const;
- unsigned computePhiCost(MachineBasicBlock *B) const;
+ unsigned computePhiCost(const MachineBasicBlock *B,
+ const FlowPattern &FP) const;
bool isProfitable(const FlowPattern &FP) const;
bool isPredicableStore(const MachineInstr *MI) const;
bool isSafeToSpeculate(const MachineInstr *MI) const;
@@ -199,6 +202,9 @@ namespace {
MachineBasicBlock::iterator At, MachineBasicBlock *FromB,
unsigned PredR, bool IfTrue);
+ unsigned buildMux(MachineBasicBlock *B, MachineBasicBlock::iterator At,
+ const TargetRegisterClass *DRC, unsigned PredR, unsigned TR,
+ unsigned TSR, unsigned FR, unsigned FSR);
void updatePhiNodes(MachineBasicBlock *WhereB, const FlowPattern &FP);
void convert(const FlowPattern &FP);
@@ -230,7 +236,7 @@ bool HexagonEarlyIfConversion::isPreheader(const MachineBasicBlock *B) const {
return false;
MachineBasicBlock *SB = *B->succ_begin();
MachineLoop *L = MLI->getLoopFor(SB);
- return L && SB == L->getHeader();
+ return L && SB == L->getHeader() && MDT->dominates(B, SB);
}
bool HexagonEarlyIfConversion::matchFlowPattern(MachineBasicBlock *B,
@@ -264,9 +270,6 @@ bool HexagonEarlyIfConversion::matchFlowPattern(MachineBasicBlock *B,
// mark as diamond with both sides equal?
return false;
}
- // Loop could be null for both.
- if (MLI->getLoopFor(T1B) != L || MLI->getLoopFor(T2B) != L)
- return false;
// Record the true/false blocks in such a way that "true" means "if (PredR)",
// and "false" means "if (!PredR)".
@@ -289,8 +292,14 @@ bool HexagonEarlyIfConversion::matchFlowPattern(MachineBasicBlock *B,
// it has a single successor. In fact, the block has to end either with
// an unconditional branch (which can be predicated), or with a fall-
// through.
- bool TOk = (TNP == 1) && (TNS == 1);
- bool FOk = (FNP == 1) && (FNS == 1);
+ // Also, skip blocks that do not belong to the same loop.
+ bool TOk = (TNP == 1 && TNS == 1 && MLI->getLoopFor(TB) == L);
+ bool FOk = (FNP == 1 && FNS == 1 && MLI->getLoopFor(FB) == L);
+
+ // If requested (via an option), do not consider branches where the
+ // true and false targets do not belong to the same loop.
+ if (SkipExitBranches && MLI->getLoopFor(TB) != MLI->getLoopFor(FB))
+ return false;
// If neither is predicable, there is nothing interesting.
if (!TOk && !FOk)
@@ -307,17 +316,15 @@ bool HexagonEarlyIfConversion::matchFlowPattern(MachineBasicBlock *B,
// Diamond: "if (P) then TB; else FB;".
} else {
// TOk && !FOk
- if (TSB == FB) {
+ if (TSB == FB)
JB = FB;
- FB = nullptr;
- }
+ FB = nullptr;
}
} else {
// !TOk && FOk (at least one must be true by now).
- if (FSB == TB) {
+ if (FSB == TB)
JB = TB;
- TB = nullptr;
- }
+ TB = nullptr;
}
// Don't try to predicate loop preheaders.
if ((TB && isPreheader(TB)) || (FB && isPreheader(FB))) {
@@ -383,8 +390,14 @@ bool HexagonEarlyIfConversion::isValidCandidate(const MachineBasicBlock *B)
unsigned R = MO.getReg();
if (!TargetRegisterInfo::isVirtualRegister(R))
continue;
- if (MRI->getRegClass(R) != &Hexagon::PredRegsRegClass)
- continue;
+ switch (MRI->getRegClass(R)->getID()) {
+ case Hexagon::PredRegsRegClassID:
+ case Hexagon::VecPredRegsRegClassID:
+ case Hexagon::VecPredRegs128BRegClassID:
+ break;
+ default:
+ continue;
+ }
for (auto U = MRI->use_begin(R); U != MRI->use_end(); ++U)
if (U->getParent()->isPHI())
return false;
@@ -442,24 +455,39 @@ bool HexagonEarlyIfConversion::isValid(const FlowPattern &FP) const {
return true;
}
-unsigned HexagonEarlyIfConversion::computePhiCost(MachineBasicBlock *B) const {
- assert(B->pred_size() <= 2);
+unsigned HexagonEarlyIfConversion::computePhiCost(const MachineBasicBlock *B,
+ const FlowPattern &FP) const {
if (B->pred_size() < 2)
return 0;
unsigned Cost = 0;
- MachineBasicBlock::const_iterator I, E = B->getFirstNonPHI();
- for (I = B->begin(); I != E; ++I) {
- const MachineOperand &RO1 = I->getOperand(1);
- const MachineOperand &RO3 = I->getOperand(3);
- assert(RO1.isReg() && RO3.isReg());
+ for (const MachineInstr &MI : *B) {
+ if (!MI.isPHI())
+ break;
+ // If both incoming blocks are one of the TrueB/FalseB/SplitB, then
+ // a MUX may be needed. Otherwise the PHI will need to be updated at
+ // no extra cost.
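+    // For example, a PHI whose two incoming values come from TrueB and
+    // FalseB will need a mux, while a PHI with at most one incoming value
+    // from the pattern's blocks is merely rewired and costs nothing.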
+ // Find the interesting PHI operands for further checks.
+ SmallVector<unsigned,2> Inc;
+ for (unsigned i = 1, e = MI.getNumOperands(); i != e; i += 2) {
+ const MachineBasicBlock *BB = MI.getOperand(i+1).getMBB();
+ if (BB == FP.SplitB || BB == FP.TrueB || BB == FP.FalseB)
+ Inc.push_back(i);
+ }
+ assert(Inc.size() <= 2);
+ if (Inc.size() < 2)
+ continue;
+
+ const MachineOperand &RA = MI.getOperand(1);
+ const MachineOperand &RB = MI.getOperand(3);
+ assert(RA.isReg() && RB.isReg());
// Must have a MUX if the phi uses a subregister.
- if (RO1.getSubReg() != 0 || RO3.getSubReg() != 0) {
+ if (RA.getSubReg() != 0 || RB.getSubReg() != 0) {
Cost++;
continue;
}
- MachineInstr *Def1 = MRI->getVRegDef(RO1.getReg());
- MachineInstr *Def3 = MRI->getVRegDef(RO3.getReg());
+ const MachineInstr *Def1 = MRI->getVRegDef(RA.getReg());
+ const MachineInstr *Def3 = MRI->getVRegDef(RB.getReg());
if (!HII->isPredicable(*Def1) || !HII->isPredicable(*Def3))
Cost++;
}
@@ -485,7 +513,6 @@ unsigned HexagonEarlyIfConversion::countPredicateDefs(
bool HexagonEarlyIfConversion::isProfitable(const FlowPattern &FP) const {
if (FP.TrueB && FP.FalseB) {
-
  // Do not if-convert if the branch is one sided.
if (MBPI) {
BranchProbability Prob(9, 10);
@@ -510,18 +537,16 @@ bool HexagonEarlyIfConversion::isProfitable(const FlowPattern &FP) const {
// the code size. If the predicated blocks are smaller than a packet size,
// approximate the spare room in the packet that could be filled with the
// predicated/speculated instructions.
- unsigned TS = 0, FS = 0, Spare = 0;
- if (FP.TrueB) {
- TS = std::distance(FP.TrueB->begin(), FP.TrueB->getFirstTerminator());
- if (TS < HEXAGON_PACKET_SIZE)
- Spare += HEXAGON_PACKET_SIZE-TS;
- }
- if (FP.FalseB) {
- FS = std::distance(FP.FalseB->begin(), FP.FalseB->getFirstTerminator());
- if (FS < HEXAGON_PACKET_SIZE)
- Spare += HEXAGON_PACKET_SIZE-TS;
- }
- unsigned TotalIn = TS+FS;
+ auto TotalCount = [] (const MachineBasicBlock *B, unsigned &Spare) {
+ if (!B)
+ return 0u;
+ unsigned T = std::distance(B->begin(), B->getFirstTerminator());
+ if (T < HEXAGON_PACKET_SIZE)
+ Spare += HEXAGON_PACKET_SIZE-T;
+ return T;
+ };
+ unsigned Spare = 0;
+ unsigned TotalIn = TotalCount(FP.TrueB, Spare) + TotalCount(FP.FalseB, Spare);
DEBUG(dbgs() << "Total number of instructions to be predicated/speculated: "
<< TotalIn << ", spare room: " << Spare << "\n");
if (TotalIn >= SizeLimit+Spare)
@@ -536,17 +561,17 @@ bool HexagonEarlyIfConversion::isProfitable(const FlowPattern &FP) const {
unsigned TotalPh = 0;
unsigned PredDefs = countPredicateDefs(FP.SplitB);
if (FP.JoinB) {
- TotalPh = computePhiCost(FP.JoinB);
+ TotalPh = computePhiCost(FP.JoinB, FP);
PredDefs += countPredicateDefs(FP.JoinB);
} else {
if (FP.TrueB && FP.TrueB->succ_size() > 0) {
MachineBasicBlock *SB = *FP.TrueB->succ_begin();
- TotalPh += computePhiCost(SB);
+ TotalPh += computePhiCost(SB, FP);
PredDefs += countPredicateDefs(SB);
}
if (FP.FalseB && FP.FalseB->succ_size() > 0) {
MachineBasicBlock *SB = *FP.FalseB->succ_begin();
- TotalPh += computePhiCost(SB);
+ TotalPh += computePhiCost(SB, FP);
PredDefs += countPredicateDefs(SB);
}
}
@@ -680,12 +705,12 @@ void HexagonEarlyIfConversion::predicateInstr(MachineBasicBlock *ToB,
MachineInstrBuilder MIB = BuildMI(*ToB, At, DL, HII->get(COpc));
MachineInstr::mop_iterator MOI = MI->operands_begin();
if (HII->isPostIncrement(*MI)) {
- MIB.addOperand(*MOI);
+ MIB.add(*MOI);
++MOI;
}
MIB.addReg(PredR);
for (const MachineOperand &MO : make_range(MOI, MI->operands_end()))
- MIB.addOperand(MO);
+ MIB.add(MO);
// Set memory references.
MachineInstr::mmo_iterator MMOBegin = MI->memoperands_begin();
@@ -733,6 +758,43 @@ void HexagonEarlyIfConversion::predicateBlockNB(MachineBasicBlock *ToB,
}
}
+unsigned HexagonEarlyIfConversion::buildMux(MachineBasicBlock *B,
+ MachineBasicBlock::iterator At, const TargetRegisterClass *DRC,
+ unsigned PredR, unsigned TR, unsigned TSR, unsigned FR, unsigned FSR) {
+ unsigned Opc = 0;
+ switch (DRC->getID()) {
+ case Hexagon::IntRegsRegClassID:
+ Opc = Hexagon::C2_mux;
+ break;
+ case Hexagon::DoubleRegsRegClassID:
+ Opc = Hexagon::PS_pselect;
+ break;
+ case Hexagon::VectorRegsRegClassID:
+ Opc = Hexagon::PS_vselect;
+ break;
+ case Hexagon::VecDblRegsRegClassID:
+ Opc = Hexagon::PS_wselect;
+ break;
+ case Hexagon::VectorRegs128BRegClassID:
+ Opc = Hexagon::PS_vselect_128B;
+ break;
+ case Hexagon::VecDblRegs128BRegClassID:
+ Opc = Hexagon::PS_wselect_128B;
+ break;
+ default:
+ llvm_unreachable("unexpected register type");
+ }
+ const MCInstrDesc &D = HII->get(Opc);
+
+ DebugLoc DL = B->findBranchDebugLoc();
+ unsigned MuxR = MRI->createVirtualRegister(DRC);
+ BuildMI(*B, At, DL, D, MuxR)
+ .addReg(PredR)
+ .addReg(TR, 0, TSR)
+ .addReg(FR, 0, FSR);
+ return MuxR;
+}
+
void HexagonEarlyIfConversion::updatePhiNodes(MachineBasicBlock *WhereB,
const FlowPattern &FP) {
// Visit all PHI nodes in the WhereB block and generate MUX instructions
@@ -759,40 +821,25 @@ void HexagonEarlyIfConversion::updatePhiNodes(MachineBasicBlock *WhereB,
TR = SR, TSR = SSR;
else if (FR == 0)
FR = SR, FSR = SSR;
- assert(TR && FR);
-
- using namespace Hexagon;
-
- unsigned DR = PN->getOperand(0).getReg();
- const TargetRegisterClass *RC = MRI->getRegClass(DR);
- unsigned Opc = 0;
- if (RC == &IntRegsRegClass)
- Opc = C2_mux;
- else if (RC == &DoubleRegsRegClass)
- Opc = PS_pselect;
- else if (RC == &VectorRegsRegClass)
- Opc = PS_vselect;
- else if (RC == &VecDblRegsRegClass)
- Opc = PS_wselect;
- else if (RC == &VectorRegs128BRegClass)
- Opc = PS_vselect_128B;
- else if (RC == &VecDblRegs128BRegClass)
- Opc = PS_wselect_128B;
- else
- llvm_unreachable("unexpected register type");
- const MCInstrDesc &D = HII->get(Opc);
-
- MachineBasicBlock::iterator MuxAt = FP.SplitB->getFirstTerminator();
- DebugLoc DL;
- if (MuxAt != FP.SplitB->end())
- DL = MuxAt->getDebugLoc();
- unsigned MuxR = MRI->createVirtualRegister(RC);
- BuildMI(*FP.SplitB, MuxAt, DL, D, MuxR)
- .addReg(FP.PredR)
- .addReg(TR, 0, TSR)
- .addReg(FR, 0, FSR);
-
- PN->addOperand(MachineOperand::CreateReg(MuxR, false));
+
+ assert(TR || FR);
+ unsigned MuxR = 0, MuxSR = 0;
+
+ if (TR && FR) {
+ unsigned DR = PN->getOperand(0).getReg();
+ const TargetRegisterClass *RC = MRI->getRegClass(DR);
+ MuxR = buildMux(FP.SplitB, FP.SplitB->getFirstTerminator(), RC,
+ FP.PredR, TR, TSR, FR, FSR);
+ } else if (TR) {
+ MuxR = TR;
+ MuxSR = TSR;
+ } else {
+ MuxR = FR;
+ MuxSR = FSR;
+ }
+
+ PN->addOperand(MachineOperand::CreateReg(MuxR, false, false, false, false,
+ false, false, MuxSR));
PN->addOperand(MachineOperand::CreateMBB(FP.SplitB));
}
}
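To make the flow-pattern terminology above concrete, here is a source-level
sketch (not part of the patch, names hypothetical) of what the pass does to a
diamond; the join-block PHI becomes the mux emitted by buildMux (C2_mux for
32-bit scalars, PS_pselect for 64-bit pairs, PS_vselect/PS_wselect for HVX):

    #include <cstdint>

    // Before conversion: a diamond with a PHI in the join block.
    uint32_t diamond(bool P, uint32_t A, uint32_t B) {
      uint32_t V;
      if (P)
        V = A + 1;  // TrueB
      else
        V = B - 1;  // FalseB
      return V;     // JoinB: V = PHI([A+1, TrueB], [B-1, FalseB])
    }

    // After conversion: both sides are speculated into SplitB and the PHI
    // is replaced by a mux on the branch predicate.
    uint32_t diamondConverted(bool P, uint32_t A, uint32_t B) {
      uint32_t T = A + 1;  // speculated TrueB
      uint32_t F = B - 1;  // speculated FalseB
      return P ? T : F;    // mux(P, T, F)
    }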
diff --git a/lib/Target/Hexagon/HexagonExpandCondsets.cpp b/lib/Target/Hexagon/HexagonExpandCondsets.cpp
index 8f070d842b8c..d8ba5dcd35ad 100644
--- a/lib/Target/Hexagon/HexagonExpandCondsets.cpp
+++ b/lib/Target/Hexagon/HexagonExpandCondsets.cpp
@@ -362,14 +362,16 @@ void HexagonExpandCondsets::updateDeadsInRange(unsigned Reg, LaneBitmask LM,
if (Range.empty())
return;
- auto IsRegDef = [this,Reg,LM] (MachineOperand &Op) -> bool {
+  // Return two booleans: { def-modifies-reg, def-covers-reg }.
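+  // (The second flag is true when every lane written by the def lies within
+  // the queried lane mask LM.)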
+ auto IsRegDef = [this,Reg,LM] (MachineOperand &Op) -> std::pair<bool,bool> {
if (!Op.isReg() || !Op.isDef())
- return false;
+ return { false, false };
unsigned DR = Op.getReg(), DSR = Op.getSubReg();
if (!TargetRegisterInfo::isVirtualRegister(DR) || DR != Reg)
- return false;
+ return { false, false };
LaneBitmask SLM = getLaneMask(DR, DSR);
- return (SLM & LM).any();
+ LaneBitmask A = SLM & LM;
+ return { A.any(), A == SLM };
};
// The splitting step will create pairs of predicated definitions without
@@ -453,20 +455,27 @@ void HexagonExpandCondsets::updateDeadsInRange(unsigned Reg, LaneBitmask LM,
// Remove <dead> flags from all defs that are not dead after live range
// extension, and collect all def operands. They will be used to generate
// the necessary implicit uses.
+ // At the same time, add <dead> flag to all defs that are actually dead.
+ // This can happen, for example, when a mux with identical inputs is
+ // replaced with a COPY: the use of the predicate register disappears and
+  // its def can become dead.
std::set<RegisterRef> DefRegs;
for (auto &Seg : Range) {
if (!Seg.start.isRegister())
continue;
MachineInstr *DefI = LIS->getInstructionFromIndex(Seg.start);
for (auto &Op : DefI->operands()) {
- if (Seg.start.isDead() || !IsRegDef(Op))
- continue;
- DefRegs.insert(Op);
- Op.setIsDead(false);
+ auto P = IsRegDef(Op);
+ if (P.second && Seg.end.isDead()) {
+ Op.setIsDead(true);
+ } else if (P.first) {
+ DefRegs.insert(Op);
+ Op.setIsDead(false);
+ }
}
}
- // Finally, add implicit uses to each predicated def that is reached
+ // Now, add implicit uses to each predicated def that is reached
// by other defs.
for (auto &Seg : Range) {
if (!Seg.start.isRegister() || !Range.liveAt(Seg.start.getPrevSlot()))
@@ -486,6 +495,7 @@ void HexagonExpandCondsets::updateDeadsInRange(unsigned Reg, LaneBitmask LM,
for (RegisterRef R : ImpUses)
MachineInstrBuilder(MF, DefI).addReg(R.Reg, RegState::Implicit, R.Sub);
}
}
void HexagonExpandCondsets::updateDeadFlags(unsigned Reg) {
@@ -595,9 +605,9 @@ MachineInstr *HexagonExpandCondsets::genCondTfrFor(MachineOperand &SrcOp,
.addReg(SrcOp.getReg(), SrcState, SrcOp.getSubReg());
} else {
MIB = BuildMI(B, At, DL, HII->get(Opc))
- .addReg(DstR, DstState, DstSR)
- .addReg(PredOp.getReg(), PredState, PredOp.getSubReg())
- .addOperand(SrcOp);
+ .addReg(DstR, DstState, DstSR)
+ .addReg(PredOp.getReg(), PredState, PredOp.getSubReg())
+ .add(SrcOp);
}
DEBUG(dbgs() << "created an initial copy: " << *MIB);
@@ -622,6 +632,12 @@ bool HexagonExpandCondsets::split(MachineInstr &MI,
bool ReadUndef = MD.isUndef();
MachineBasicBlock::iterator At = MI;
+ auto updateRegs = [&UpdRegs] (const MachineInstr &MI) -> void {
+ for (auto &Op : MI.operands())
+ if (Op.isReg())
+ UpdRegs.insert(Op.getReg());
+ };
+
// If this is a mux of the same register, just replace it with COPY.
// Ideally, this would happen earlier, so that register coalescing would
// see it.
@@ -630,6 +646,8 @@ bool HexagonExpandCondsets::split(MachineInstr &MI,
if (ST.isReg() && SF.isReg()) {
RegisterRef RT(ST);
if (RT == RegisterRef(SF)) {
+ // Copy regs to update first.
+ updateRegs(MI);
MI.setDesc(HII->get(TargetOpcode::COPY));
unsigned S = getRegState(ST);
while (MI.getNumOperands() > 1)
@@ -651,9 +669,7 @@ bool HexagonExpandCondsets::split(MachineInstr &MI,
LIS->InsertMachineInstrInMaps(*TfrF);
// Will need to recalculate live intervals for all registers in MI.
- for (auto &Op : MI.operands())
- if (Op.isReg())
- UpdRegs.insert(Op.getReg());
+ updateRegs(MI);
removeInstr(MI);
return true;
@@ -828,7 +844,7 @@ void HexagonExpandCondsets::predicateAt(const MachineOperand &DefOp,
while (Ox < NP) {
MachineOperand &MO = MI.getOperand(Ox);
if (!MO.isReg() || !MO.isImplicit())
- MB.addOperand(MO);
+ MB.add(MO);
Ox++;
}
diff --git a/lib/Target/Hexagon/HexagonFixupHwLoops.cpp b/lib/Target/Hexagon/HexagonFixupHwLoops.cpp
index dfd1f1d4f886..015d3b840e6f 100644
--- a/lib/Target/Hexagon/HexagonFixupHwLoops.cpp
+++ b/lib/Target/Hexagon/HexagonFixupHwLoops.cpp
@@ -190,5 +190,5 @@ void HexagonFixupHwLoops::useExtLoopInstr(MachineFunction &MF,
MIB = BuildMI(*MBB, MII, DL, TII->get(newOp));
for (unsigned i = 0; i < MII->getNumOperands(); ++i)
- MIB.addOperand(MII->getOperand(i));
+ MIB.add(MII->getOperand(i));
}
diff --git a/lib/Target/Hexagon/HexagonFrameLowering.cpp b/lib/Target/Hexagon/HexagonFrameLowering.cpp
index a3f6273f9f67..0e2380f4316a 100644
--- a/lib/Target/Hexagon/HexagonFrameLowering.cpp
+++ b/lib/Target/Hexagon/HexagonFrameLowering.cpp
@@ -301,16 +301,30 @@ static bool needsStackFrame(const MachineBasicBlock &MBB, const BitVector &CSR,
// the frame creation/destruction instructions.
if (MO.isFI())
return true;
- if (!MO.isReg())
- continue;
- unsigned R = MO.getReg();
- // Virtual registers will need scavenging, which then may require
- // a stack slot.
- if (TargetRegisterInfo::isVirtualRegister(R))
- return true;
- for (MCSubRegIterator S(R, &HRI, true); S.isValid(); ++S)
- if (CSR[*S])
+ if (MO.isReg()) {
+ unsigned R = MO.getReg();
+ // Virtual registers will need scavenging, which then may require
+ // a stack slot.
+ if (TargetRegisterInfo::isVirtualRegister(R))
return true;
+ for (MCSubRegIterator S(R, &HRI, true); S.isValid(); ++S)
+ if (CSR[*S])
+ return true;
+ continue;
+ }
+ if (MO.isRegMask()) {
+ // A regmask would normally have all callee-saved registers marked
+      // as preserved, so this check should be unnecessary; but in case
+      // other regmasks (for other calling conventions) ever appear, make
+      // sure they are processed correctly.
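+      // In a regmask, a set bit means "preserved across the call": register
+      // R is preserved iff bit R%32 of word R/32 is set.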
+ const uint32_t *BM = MO.getRegMask();
+ for (int x = CSR.find_first(); x >= 0; x = CSR.find_next(x)) {
+ unsigned R = x;
+ // If this regmask does not preserve a CSR, a frame will be needed.
+ if (!(BM[R/32] & (1u << (R%32))))
+ return true;
+ }
+ }
}
}
return false;
@@ -1473,8 +1487,7 @@ bool HexagonFrameLowering::expandCopy(MachineBasicBlock &B,
return false;
unsigned TmpR = MRI.createVirtualRegister(&Hexagon::IntRegsRegClass);
- BuildMI(B, It, DL, HII.get(TargetOpcode::COPY), TmpR)
- .addOperand(MI->getOperand(1));
+ BuildMI(B, It, DL, HII.get(TargetOpcode::COPY), TmpR).add(MI->getOperand(1));
BuildMI(B, It, DL, HII.get(TargetOpcode::COPY), DstR)
.addReg(TmpR, RegState::Kill);
@@ -1646,8 +1659,15 @@ bool HexagonFrameLowering::expandStoreVec2(MachineBasicBlock &B,
LivePhysRegs LPR(&HRI);
LPR.addLiveIns(B);
SmallVector<std::pair<unsigned, const MachineOperand*>,2> Clobbers;
- for (auto R = B.begin(); R != It; ++R)
+ for (auto R = B.begin(); R != It; ++R) {
+ Clobbers.clear();
LPR.stepForward(*R, Clobbers);
+ // Dead defs are recorded in Clobbers, but are not automatically removed
+ // from the live set.
+ for (auto &C : Clobbers)
+ if (C.second->isReg() && C.second->isDead())
+ LPR.removeReg(C.first);
+ }
DebugLoc DL = MI->getDebugLoc();
unsigned SrcR = MI->getOperand(2).getReg();
@@ -1985,9 +2005,9 @@ void HexagonFrameLowering::optimizeSpillSlots(MachineFunction &MF,
// class HaveRC and a new class NewRC. Return nullptr if a common class
// cannot be found, otherwise return the resulting class. If HaveRC is
// nullptr, assume that it is still unset.
- auto getCommonRC = [&HRI] (const TargetRegisterClass *HaveRC,
- const TargetRegisterClass *NewRC)
- -> const TargetRegisterClass* {
+ auto getCommonRC =
+ [](const TargetRegisterClass *HaveRC,
+ const TargetRegisterClass *NewRC) -> const TargetRegisterClass * {
if (HaveRC == nullptr || HaveRC == NewRC)
return NewRC;
// Different classes, both non-null. Pick the more general one.
@@ -2221,7 +2241,7 @@ void HexagonFrameLowering::optimizeSpillSlots(MachineFunction &MF,
if (SrcRR.Reg != FoundR || SrcRR.Sub != 0) {
const DebugLoc &DL = SI.getDebugLoc();
CopyIn = BuildMI(B, StartIt, DL, HII.get(TargetOpcode::COPY), FoundR)
- .addOperand(SrcOp);
+ .add(SrcOp);
}
++StartIt;
diff --git a/lib/Target/Hexagon/HexagonGenExtract.cpp b/lib/Target/Hexagon/HexagonGenExtract.cpp
index bb5e379ce014..c99ad5130aef 100644
--- a/lib/Target/Hexagon/HexagonGenExtract.cpp
+++ b/lib/Target/Hexagon/HexagonGenExtract.cpp
@@ -197,13 +197,13 @@ bool HexagonGenExtract::convert(Instruction *In) {
// It is still ok to generate extract, but only if the mask eliminates
// those bits (i.e. M does not have any bits set beyond U).
APInt C = APInt::getHighBitsSet(BW, BW-U);
- if (M.intersects(C) || !APIntOps::isMask(W, M))
+ if (M.intersects(C) || !M.isMask(W))
return false;
} else {
// Check if M starts with a contiguous sequence of W times 1 bits. Get
// the low U bits of M (which eliminates the 0 bits shifted in on the
// left), and check if the result is APInt's "mask":
- if (!APIntOps::isMask(W, M.getLoBits(U)))
+ if (!M.getLoBits(U).isMask(W))
return false;
}
@@ -221,11 +221,8 @@ bool HexagonGenExtract::convert(Instruction *In) {
bool HexagonGenExtract::visitBlock(BasicBlock *B) {
// Depth-first, bottom-up traversal.
- DomTreeNode *DTN = DT->getNode(B);
- typedef GraphTraits<DomTreeNode*> GTN;
- typedef GTN::ChildIteratorType Iter;
- for (Iter I = GTN::child_begin(DTN), E = GTN::child_end(DTN); I != E; ++I)
- visitBlock((*I)->getBlock());
+ for (auto *DTN : children<DomTreeNode*>(DT->getNode(B)))
+ visitBlock(DTN->getBlock());
// Allow limiting the number of generated extracts for debugging purposes.
bool HasCutoff = ExtractCutoff.getPosition();
diff --git a/lib/Target/Hexagon/HexagonGenInsert.cpp b/lib/Target/Hexagon/HexagonGenInsert.cpp
index 5a8e392d1275..54d99d399f88 100644
--- a/lib/Target/Hexagon/HexagonGenInsert.cpp
+++ b/lib/Target/Hexagon/HexagonGenInsert.cpp
@@ -947,11 +947,8 @@ void HexagonGenInsert::collectInBlock(MachineBasicBlock *B,
BlockDefs.insert(InsDefs);
}
- MachineDomTreeNode *N = MDT->getNode(B);
- typedef GraphTraits<MachineDomTreeNode*> GTN;
- typedef GTN::ChildIteratorType ChildIter;
- for (ChildIter I = GTN::child_begin(N), E = GTN::child_end(N); I != E; ++I) {
- MachineBasicBlock *SB = (*I)->getBlock();
+ for (auto *DTN : children<MachineDomTreeNode*>(MDT->getNode(B))) {
+ MachineBasicBlock *SB = DTN->getBlock();
collectInBlock(SB, AVs);
}
@@ -1422,9 +1419,9 @@ bool HexagonGenInsert::generateInserts() {
bool HexagonGenInsert::removeDeadCode(MachineDomTreeNode *N) {
bool Changed = false;
- typedef GraphTraits<MachineDomTreeNode*> GTN;
- for (auto I = GTN::child_begin(N), E = GTN::child_end(N); I != E; ++I)
- Changed |= removeDeadCode(*I);
+
+ for (auto *DTN : children<MachineDomTreeNode*>(N))
+ Changed |= removeDeadCode(DTN);
MachineBasicBlock *B = N->getBlock();
std::vector<MachineInstr*> Instrs;
diff --git a/lib/Target/Hexagon/HexagonGenMux.cpp b/lib/Target/Hexagon/HexagonGenMux.cpp
index a718df9c70ab..85222944c77c 100644
--- a/lib/Target/Hexagon/HexagonGenMux.cpp
+++ b/lib/Target/Hexagon/HexagonGenMux.cpp
@@ -324,9 +324,9 @@ bool HexagonGenMux::genMuxInBlock(MachineBasicBlock &B) {
if (!MxOpc)
continue;
BuildMI(B, MX.At, DL, HII->get(MxOpc), MX.DefR)
- .addReg(MX.PredR)
- .addOperand(*MX.SrcT)
- .addOperand(*MX.SrcF);
+ .addReg(MX.PredR)
+ .add(*MX.SrcT)
+ .add(*MX.SrcF);
B.erase(MX.Def1);
B.erase(MX.Def2);
Changed = true;
diff --git a/lib/Target/Hexagon/HexagonHardwareLoops.cpp b/lib/Target/Hexagon/HexagonHardwareLoops.cpp
index e477dcc0f64a..86a8089401c2 100644
--- a/lib/Target/Hexagon/HexagonHardwareLoops.cpp
+++ b/lib/Target/Hexagon/HexagonHardwareLoops.cpp
@@ -100,6 +100,7 @@ namespace {
MachineRegisterInfo *MRI;
MachineDominatorTree *MDT;
const HexagonInstrInfo *TII;
+ const HexagonRegisterInfo *TRI;
#ifndef NDEBUG
static int Counter;
#endif
@@ -381,7 +382,9 @@ bool HexagonHardwareLoops::runOnMachineFunction(MachineFunction &MF) {
MLI = &getAnalysis<MachineLoopInfo>();
MRI = &MF.getRegInfo();
MDT = &getAnalysis<MachineDominatorTree>();
- TII = MF.getSubtarget<HexagonSubtarget>().getInstrInfo();
+ const HexagonSubtarget &HST = MF.getSubtarget<HexagonSubtarget>();
+ TII = HST.getInstrInfo();
+ TRI = HST.getRegisterInfo();
for (auto &L : *MLI)
if (!L->getParentLoop()) {
@@ -960,24 +963,21 @@ CountValue *HexagonHardwareLoops::computeCount(MachineLoop *Loop,
/// \brief Return true if the operation is invalid within hardware loop.
bool HexagonHardwareLoops::isInvalidLoopOperation(const MachineInstr *MI,
bool IsInnerHWLoop) const {
-
// Call is not allowed because the callee may use a hardware loop except for
// the case when the call never returns.
if (MI->getDesc().isCall())
return !TII->doesNotReturn(*MI);
// Check if the instruction defines a hardware loop register.
- for (unsigned i = 0, e = MI->getNumOperands(); i != e; ++i) {
- const MachineOperand &MO = MI->getOperand(i);
- if (!MO.isReg() || !MO.isDef())
- continue;
- unsigned R = MO.getReg();
- if (IsInnerHWLoop && (R == Hexagon::LC0 || R == Hexagon::SA0 ||
- R == Hexagon::LC1 || R == Hexagon::SA1))
- return true;
- if (!IsInnerHWLoop && (R == Hexagon::LC1 || R == Hexagon::SA1))
+ using namespace Hexagon;
+ static const unsigned Regs01[] = { LC0, SA0, LC1, SA1 };
+ static const unsigned Regs1[] = { LC1, SA1 };
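+  // LC0/SA0 are the loop-count and start-address registers of the inner
+  // hardware loop; LC1/SA1 belong to the outer one.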
+ auto CheckRegs = IsInnerHWLoop ? makeArrayRef(Regs01, array_lengthof(Regs01))
+ : makeArrayRef(Regs1, array_lengthof(Regs1));
+ for (unsigned R : CheckRegs)
+ if (MI->modifiesRegister(R, TRI))
return true;
- }
+
return false;
}
@@ -1511,7 +1511,7 @@ bool HexagonHardwareLoops::checkForImmediate(const MachineOperand &MO,
int64_t V1, V2;
if (!checkForImmediate(S1, V1) || !checkForImmediate(S2, V2))
return false;
- TV = V2 | (V1 << 32);
+ TV = V2 | (static_cast<uint64_t>(V1) << 32);
break;
}
case TargetOpcode::REG_SEQUENCE: {
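An aside on the checkForImmediate change above: the added static_cast matters
because left-shifting a negative signed value is undefined behavior before
C++20. A small sketch with hypothetical halves of a 64-bit immediate:

    #include <cassert>
    #include <cstdint>

    int main() {
      int64_t V1 = -1;  // high half, sign-extended
      int64_t V2 = 5;   // low half
      // UB before C++20: int64_t Bad = V2 | (V1 << 32);
      // Shifting an unsigned copy of V1 is well defined:
      int64_t TV = V2 | (static_cast<uint64_t>(V1) << 32);
      assert(static_cast<uint64_t>(TV) == 0xffffffff00000005ull);
    }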
diff --git a/lib/Target/Hexagon/HexagonIICHVX.td b/lib/Target/Hexagon/HexagonIICHVX.td
new file mode 100644
index 000000000000..4081a225832b
--- /dev/null
+++ b/lib/Target/Hexagon/HexagonIICHVX.td
@@ -0,0 +1,102 @@
+//===--- HexagonIICHVX.td -------------------------------------------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+//
+// Though all these itinerary classes exist for V60 onwards, they are being
+// listed here as 'HVXV62Itin' because the itinerary class descriptions prior
+// to V62 do not include operand cycle info. In the future, they are expected
+// to be merged into a single 'HVXItin'.
+//
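+// Each InstrItinData entry below pairs an itinerary class with the pipeline
+// stages it occupies (issue slots and, for most classes, a CVI resource
+// unit) and a per-operand cycle list, with the result operand's latency
+// listed first.
+//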
+class HVXV62Itin {
+ list<InstrItinData> HVXV62Itin_list = [
+ InstrItinData<COPROC_VMEM_vtc_long_SLOT01,
+ [InstrStage<1, [SLOT0, SLOT1]>],
+ [3, 1, 1, 1]>,
+ InstrItinData<COPROC_VX_vtc_long_SLOT23,
+ [InstrStage<1, [SLOT2, SLOT3]>],
+ [3, 1, 1, 1]>,
+ InstrItinData<COPROC_VX_vtc_SLOT23,
+ [InstrStage<1, [SLOT2, SLOT3]>],
+ [3, 1, 1, 1]>,
+ InstrItinData<CVI_VA, [InstrStage<1, [SLOT0,SLOT1,SLOT2,SLOT3], 0>,
+ InstrStage<1, [CVI_XLANE,CVI_SHIFT,
+ CVI_MPY0, CVI_MPY1]>],
+ [1, 1, 1, 1]>,
+ InstrItinData<CVI_VA_DV, [InstrStage<1, [SLOT0,SLOT1,SLOT2,SLOT3], 0>,
+ InstrStage<1, [CVI_XLSHF, CVI_MPY01]>],
+ [1, 1, 1, 1]>,
+ InstrItinData<CVI_VX_LONG, [InstrStage<1, [SLOT2, SLOT3], 0>,
+ InstrStage<1, [CVI_MPY0, CVI_MPY1]>],
+ [1, 1, 1, 1]>,
+ InstrItinData<CVI_VX_LATE, [InstrStage<1, [SLOT2, SLOT3], 0>,
+ InstrStage<1, [CVI_MPY0, CVI_MPY1]>],
+ [1, 1, 1, 1]>,
+ InstrItinData<CVI_VX, [InstrStage<1, [SLOT2, SLOT3], 0>,
+ InstrStage<1, [CVI_MPY0, CVI_MPY1]>],
+ [1, 1, 1, 1]>,
+ InstrItinData<CVI_VX_DV_LONG, [InstrStage<1, [SLOT2, SLOT3], 0>,
+ InstrStage<1, [CVI_MPY01]>], [1, 1, 1, 1]>,
+ InstrItinData<CVI_VX_DV, [InstrStage<1, [SLOT2, SLOT3], 0>,
+ InstrStage<1, [CVI_MPY01]>], [1, 1, 1, 1]>,
+ InstrItinData<CVI_VX_DV_SLOT2, [InstrStage<1, [SLOT2], 0>,
+ InstrStage<1, [CVI_MPY01]>], [1, 1, 1, 1]>,
+ InstrItinData<CVI_VX_DV_SLOT2_LONG_EARLY,
+ [InstrStage<1, [SLOT2], 0>,
+ InstrStage<1, [CVI_MPY01]>], [1, 1, 1, 1]>,
+ InstrItinData<CVI_VP, [InstrStage<1, [SLOT0,SLOT1,SLOT2,SLOT3], 0>,
+ InstrStage<1, [CVI_XLANE]>], [1, 1, 1, 1]>,
+ InstrItinData<CVI_VP_LONG, [InstrStage<1, [SLOT0,SLOT1,SLOT2,SLOT3], 0>,
+ InstrStage<1, [CVI_XLANE]>], [1, 1, 1, 1]>,
+ InstrItinData<CVI_VP_VS_EARLY, [InstrStage<1, [SLOT0,SLOT1,SLOT2,SLOT3], 0>,
+ InstrStage<1, [CVI_XLSHF]>], [1, 1, 1, 1]>,
+ InstrItinData<CVI_VP_VS_LONG, [InstrStage<1, [SLOT0,SLOT1,SLOT2,SLOT3], 0>,
+ InstrStage<1, [CVI_XLSHF]>], [1, 1, 1, 1]>,
+ InstrItinData<CVI_VP_VS, [InstrStage<1, [SLOT0,SLOT1,SLOT2,SLOT3], 0>,
+ InstrStage<1, [CVI_XLSHF]>], [1, 1, 1, 1]>,
+ InstrItinData<CVI_VP_VS_LONG_EARLY,
+ [InstrStage<1, [SLOT0,SLOT1,SLOT2,SLOT3], 0>,
+ InstrStage<1, [CVI_XLSHF]>], [1, 1, 1, 1]>,
+ InstrItinData<CVI_VP_DV, [InstrStage<1, [SLOT0,SLOT1,SLOT2,SLOT3], 0>,
+ InstrStage<1, [CVI_XLSHF]>], [1, 1, 1, 1]>,
+ InstrItinData<CVI_VS, [InstrStage<1, [SLOT0,SLOT1,SLOT2,SLOT3], 0>,
+ InstrStage<1, [CVI_SHIFT]>], [1, 1, 1, 1]>,
+ InstrItinData<CVI_VINLANESAT, [InstrStage<1, [SLOT0,SLOT1,SLOT2,SLOT3], 0>,
+ InstrStage<1, [CVI_XLANE, CVI_SHIFT,
+ CVI_MPY0, CVI_MPY1]>],
+ [1, 1, 1, 1]>,
+ InstrItinData<CVI_VM_LD, [InstrStage<1, [SLOT0, SLOT1], 0>,
+ InstrStage<1, [CVI_LD], 0>,
+ InstrStage<1, [CVI_XLANE, CVI_SHIFT,
+ CVI_MPY0, CVI_MPY1]>],
+ [1, 1, 1, 1]>,
+ InstrItinData<CVI_VM_TMP_LD, [InstrStage<1,[SLOT0, SLOT1], 0>,
+ InstrStage<1, [CVI_LD]>],[1, 1, 1, 1, 10]>,
+ InstrItinData<CVI_VM_CUR_LD, [InstrStage<1,[SLOT0, SLOT1], 0>,
+ InstrStage<1, [CVI_LD], 0>,
+ InstrStage<1, [CVI_XLANE, CVI_SHIFT,
+ CVI_MPY0, CVI_MPY1]>],
+ [1, 1, 1, 1]>,
+ InstrItinData<CVI_VM_VP_LDU, [InstrStage<1,[SLOT0], 0>,
+ InstrStage<1, [SLOT1], 0>,
+ InstrStage<1, [CVI_LD], 0>,
+ InstrStage<1, [CVI_XLANE]>], [1, 1, 1, 1]>,
+ InstrItinData<CVI_VM_ST, [InstrStage<1, [SLOT0], 0>,
+ InstrStage<1, [CVI_ST], 0>,
+ InstrStage<1, [CVI_XLANE, CVI_SHIFT,
+ CVI_MPY0, CVI_MPY1]>],
+ [1, 1, 1, 1]>,
+ InstrItinData<CVI_VM_NEW_ST, [InstrStage<1,[SLOT0], 0>,
+ InstrStage<1, [CVI_ST]>], [1, 1, 1, 1]>,
+ InstrItinData<CVI_VM_STU, [InstrStage<1, [SLOT0], 0>,
+ InstrStage<1, [SLOT1], 0>,
+ InstrStage<1, [CVI_ST], 0>,
+ InstrStage<1, [CVI_XLANE]>], [1, 1, 1, 1]>,
+ InstrItinData<CVI_HIST, [InstrStage<1, [SLOT0,SLOT1,SLOT2,SLOT3], 0>,
+ InstrStage<1, [CVI_ALL]>], [1, 1, 1, 1]>];
+}
diff --git a/lib/Target/Hexagon/HexagonIICScalar.td b/lib/Target/Hexagon/HexagonIICScalar.td
new file mode 100644
index 000000000000..e69cfbdad688
--- /dev/null
+++ b/lib/Target/Hexagon/HexagonIICScalar.td
@@ -0,0 +1,164 @@
+//===--- HexagonIICScalar.td ----------------------------------------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+// These itinerary class descriptions are based on the instruction timing
+// classes as per V62. Currently, they are just extracted from
+// HexagonScheduleV62.td but will soon be auto-generated by HexagonGen.py.
+
+class ScalarItin {
+ list<InstrItinData> ScalarItin_list = [
+ InstrItinData<ALU32_2op_tc_1_SLOT0123 ,
+ [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3]>], [1, 1, 1]>,
+ InstrItinData<ALU32_2op_tc_2early_SLOT0123,
+ [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3]>], [2, 1, 1]>,
+ InstrItinData<ALU32_3op_tc_1_SLOT0123 ,
+ [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3]>], [1, 1, 1]>,
+ InstrItinData<ALU32_3op_tc_2_SLOT0123 ,
+ [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3]>], [2, 1, 1]>,
+ InstrItinData<ALU32_3op_tc_2early_SLOT0123,
+ [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3]>], [2, 1, 1]>,
+ InstrItinData<ALU32_ADDI_tc_1_SLOT0123 ,
+ [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3]>], [1, 1, 1]>,
+
+ // ALU64
+ InstrItinData<ALU64_tc_1_SLOT23 , [InstrStage<1, [SLOT2, SLOT3]>],
+ [1, 1, 1]>,
+ InstrItinData<ALU64_tc_2_SLOT23 , [InstrStage<1, [SLOT2, SLOT3]>],
+ [2, 1, 1]>,
+ InstrItinData<ALU64_tc_2early_SLOT23, [InstrStage<1, [SLOT2, SLOT3]>],
+ [2, 1, 1]>,
+ InstrItinData<ALU64_tc_3x_SLOT23 , [InstrStage<1, [SLOT2, SLOT3]>],
+ [3, 1, 1]>,
+
+ // CR -> System
+ InstrItinData<CR_tc_2_SLOT3 , [InstrStage<1, [SLOT3]>], [2, 1, 1]>,
+ InstrItinData<CR_tc_2early_SLOT3 , [InstrStage<1, [SLOT3]>], [2, 1, 1]>,
+ InstrItinData<CR_tc_3x_SLOT3 , [InstrStage<1, [SLOT3]>], [3, 1, 1]>,
+
+ // Jump (conditional/unconditional/return etc)
+ InstrItinData<CR_tc_2early_SLOT23, [InstrStage<1, [SLOT2, SLOT3]>],
+ [2, 1, 1, 1]>,
+ InstrItinData<CR_tc_3x_SLOT23 , [InstrStage<1, [SLOT2, SLOT3]>],
+ [3, 1, 1, 1]>,
+ InstrItinData<CJ_tc_1_SLOT23 , [InstrStage<1, [SLOT2, SLOT3]>],
+ [1, 1, 1, 1]>,
+ InstrItinData<CJ_tc_2early_SLOT23, [InstrStage<1, [SLOT2, SLOT3]>],
+ [2, 1, 1, 1]>,
+ InstrItinData<J_tc_2early_SLOT23 , [InstrStage<1, [SLOT2, SLOT3]>],
+ [2, 1, 1, 1]>,
+ InstrItinData<J_tc_2early_CJUMP_UCJUMP_ARCHDEPSLOT,
+ [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3]>], [2, 1, 1, 1]>,
+
+ // JR
+ InstrItinData<J_tc_2early_SLOT2 , [InstrStage<1, [SLOT2]>], [2, 1, 1]>,
+ InstrItinData<J_tc_3stall_SLOT2 , [InstrStage<1, [SLOT2]>], [3, 1, 1]>,
+
+ // Extender
+ InstrItinData<EXTENDER_tc_1_SLOT0123, [InstrStage<1,
+ [SLOT0, SLOT1, SLOT2, SLOT3]>], [2, 1, 1, 1]>,
+
+ // Load
+ InstrItinData<LD_tc_ld_SLOT01 , [InstrStage<1, [SLOT0, SLOT1]>],
+ [3, 1]>,
+ InstrItinData<LD_tc_ld_pi_SLOT01 , [InstrStage<1, [SLOT0, SLOT1]>],
+ [3, 1]>,
+ InstrItinData<LD_tc_3or4stall_SLOT0, [InstrStage<1, [SLOT0]>], [4, 1]>,
+ InstrItinData<LD_tc_ld_SLOT0 , [InstrStage<1, [SLOT0]>], [3, 1]>,
+
+ // M
+ InstrItinData<M_tc_1_SLOT23 , [InstrStage<1, [SLOT2, SLOT3]>],
+ [1, 1, 1]>,
+ InstrItinData<M_tc_2_SLOT23 , [InstrStage<1, [SLOT2, SLOT3]>],
+ [2, 1, 1]>,
+ InstrItinData<M_tc_2_acc_SLOT23 , [InstrStage<1, [SLOT2, SLOT3]>],
+ [2, 1, 1]>,
+ InstrItinData<M_tc_3_SLOT23 , [InstrStage<1, [SLOT2, SLOT3]>],
+ [3, 1, 1]>,
+ InstrItinData<M_tc_3x_SLOT23 , [InstrStage<1, [SLOT2, SLOT3]>],
+ [3, 1, 1]>,
+ InstrItinData<M_tc_3x_acc_SLOT23, [InstrStage<1, [SLOT2, SLOT3]>],
+ [3, 1, 1, 1]>,
+ InstrItinData<M_tc_3or4x_SLOT23 , [InstrStage<1, [SLOT2, SLOT3]>],
+ [4, 1, 1]>,
+ InstrItinData<M_tc_3or4x_acc_SLOT23 , [InstrStage<1, [SLOT2, SLOT3]>],
+ [4, 1, 1]>,
+ InstrItinData<M_tc_3stall_SLOT23, [InstrStage<1, [SLOT2, SLOT3]>],
+ [3, 1, 1]>,
+
+ // Store
+ InstrItinData<ST_tc_st_SLOT01 , [InstrStage<1, [SLOT0, SLOT1]>],
+ [1, 1, 1]>,
+ InstrItinData<ST_tc_st_pi_SLOT01, [InstrStage<1, [SLOT0, SLOT1]>],
+ [1, 1, 1]>,
+ InstrItinData<ST_tc_3stall_SLOT0, [InstrStage<1, [SLOT0]>], [3, 1, 1]>,
+ InstrItinData<ST_tc_ld_SLOT0 , [InstrStage<1, [SLOT0]>], [3, 1, 1]>,
+ InstrItinData<ST_tc_st_SLOT0 , [InstrStage<1, [SLOT0]>], [1, 1, 1]>,
+ InstrItinData<ST_tc_st_pi_SLOT0 , [InstrStage<1, [SLOT0]>], [1, 1, 1]>,
+
+ // S
+ InstrItinData<S_2op_tc_1_SLOT23 , [InstrStage<1, [SLOT2, SLOT3]>],
+ [1, 1, 1]>,
+ InstrItinData<S_2op_tc_2_SLOT23 , [InstrStage<1, [SLOT2, SLOT3]>],
+ [2, 1, 1]>,
+ InstrItinData<S_2op_tc_2early_SLOT23, [InstrStage<1, [SLOT2, SLOT3]>],
+ [2, 1, 1]>,
+ // The S_2op_tc_3x_SLOT23 slots are 4 cycles on v60.
+ InstrItinData<S_2op_tc_3or4x_SLOT23 , [InstrStage<1, [SLOT2, SLOT3]>],
+ [4, 1, 1]>,
+ InstrItinData<S_3op_tc_1_SLOT23 , [InstrStage<1, [SLOT2, SLOT3]>],
+ [1, 1, 1]>,
+ InstrItinData<S_3op_tc_2_SLOT23 , [InstrStage<1, [SLOT2, SLOT3]>],
+ [2, 1, 1]>,
+ InstrItinData<S_3op_tc_2early_SLOT23, [InstrStage<1, [SLOT2, SLOT3]>],
+ [2, 1, 1]>,
+ InstrItinData<S_3op_tc_3_SLOT23 , [InstrStage<1, [SLOT2, SLOT3]>],
+ [3, 1, 1]>,
+ InstrItinData<S_3op_tc_3stall_SLOT23, [InstrStage<1, [SLOT2, SLOT3]>],
+ [3, 1, 1]>,
+ InstrItinData<S_3op_tc_3x_SLOT23 , [InstrStage<1, [SLOT2, SLOT3]>],
+ [3, 1, 1]>,
+
+ // New Value Compare Jump
+ InstrItinData<NCJ_tc_3or4stall_SLOT0, [InstrStage<1, [SLOT0]>],
+ [3, 1, 1, 1]>,
+
+ // Mem ops
+ InstrItinData<V2LDST_tc_st_SLOT0 , [InstrStage<1, [SLOT0]>],
+ [1, 1, 1, 1]>,
+ InstrItinData<V2LDST_tc_ld_SLOT01 , [InstrStage<1, [SLOT0, SLOT1]>],
+ [2, 1, 1, 1]>,
+ InstrItinData<V2LDST_tc_st_SLOT01 , [InstrStage<1, [SLOT0, SLOT1]>],
+ [1, 1, 1, 1]>,
+ InstrItinData<V4LDST_tc_st_SLOT0 , [InstrStage<1, [SLOT0]>],
+ [1, 1, 1, 1]>,
+ InstrItinData<V4LDST_tc_ld_SLOT01 , [InstrStage<1, [SLOT0, SLOT1]>],
+ [3, 1, 1, 1]>,
+ InstrItinData<V4LDST_tc_st_SLOT01 , [InstrStage<1, [SLOT0, SLOT1]>],
+ [1, 1, 1, 1]>,
+
+ // Endloop
+ InstrItinData<J_tc_2early_SLOT0123, [InstrStage<1, [SLOT_ENDLOOP]>],
+ [2]>,
+ InstrItinData<MAPPING_tc_1_SLOT0123 ,
+ [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3]>],
+ [1, 1, 1, 1]>,
+
+ // Duplex and Compound
+ InstrItinData<DUPLEX , [InstrStage<1, [SLOT0]>], [1, 1, 1]>,
+ InstrItinData<COMPOUND_CJ_ARCHDEPSLOT,
+ [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3]>], [1, 1, 1]>,
+ InstrItinData<COMPOUND , [InstrStage<1, [SLOT2, SLOT3]>], [1, 1, 1]>,
+ // Misc
+ InstrItinData<PREFIX , [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3]>],
+ [1, 1, 1]>,
+ InstrItinData<PSEUDO , [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3]>],
+ [1, 1, 1]>,
+ InstrItinData<PSEUDOM , [InstrStage<1, [SLOT2, SLOT3], 0>,
+ InstrStage<1, [SLOT2, SLOT3]>], [1, 1, 1]>];
+}
diff --git a/lib/Target/Hexagon/HexagonISelDAGToDAG.cpp b/lib/Target/Hexagon/HexagonISelDAGToDAG.cpp
index f6012d29d422..8e10c521a77d 100644
--- a/lib/Target/Hexagon/HexagonISelDAGToDAG.cpp
+++ b/lib/Target/Hexagon/HexagonISelDAGToDAG.cpp
@@ -123,6 +123,12 @@ private:
bool isAlignedMemNode(const MemSDNode *N) const;
bool isPositiveHalfWord(const SDNode *N) const;
+ // DAG preprocessing functions.
+ void ppSimplifyOrSelect0(std::vector<SDNode*> &&Nodes);
+ void ppAddrReorderAddShl(std::vector<SDNode*> &&Nodes);
+ void ppAddrRewriteAndSrl(std::vector<SDNode*> &&Nodes);
+ void ppHoistZextI1(std::vector<SDNode*> &&Nodes);
+
SmallDenseMap<SDNode *,int> RootWeights;
SmallDenseMap<SDNode *,int> RootHeights;
SmallDenseMap<const Value *,int> GAUsesInFunction;
@@ -932,55 +938,21 @@ void HexagonDAGToDAGISel::SelectBitcast(SDNode *N) {
void HexagonDAGToDAGISel::Select(SDNode *N) {
- if (N->isMachineOpcode()) {
- N->setNodeId(-1);
- return; // Already selected.
- }
+ if (N->isMachineOpcode())
+ return N->setNodeId(-1); // Already selected.
switch (N->getOpcode()) {
- case ISD::Constant:
- SelectConstant(N);
- return;
-
- case ISD::ConstantFP:
- SelectConstantFP(N);
- return;
-
- case ISD::FrameIndex:
- SelectFrameIndex(N);
- return;
-
- case ISD::BITCAST:
- SelectBitcast(N);
- return;
-
- case ISD::SHL:
- SelectSHL(N);
- return;
-
- case ISD::LOAD:
- SelectLoad(N);
- return;
-
- case ISD::STORE:
- SelectStore(N);
- return;
-
- case ISD::MUL:
- SelectMul(N);
- return;
-
- case ISD::ZERO_EXTEND:
- SelectZeroExtend(N);
- return;
-
- case ISD::INTRINSIC_W_CHAIN:
- SelectIntrinsicWChain(N);
- return;
-
- case ISD::INTRINSIC_WO_CHAIN:
- SelectIntrinsicWOChain(N);
- return;
+ case ISD::Constant: return SelectConstant(N);
+ case ISD::ConstantFP: return SelectConstantFP(N);
+ case ISD::FrameIndex: return SelectFrameIndex(N);
+ case ISD::BITCAST: return SelectBitcast(N);
+ case ISD::SHL: return SelectSHL(N);
+ case ISD::LOAD: return SelectLoad(N);
+ case ISD::STORE: return SelectStore(N);
+ case ISD::MUL: return SelectMul(N);
+ case ISD::ZERO_EXTEND: return SelectZeroExtend(N);
+ case ISD::INTRINSIC_W_CHAIN: return SelectIntrinsicWChain(N);
+ case ISD::INTRINSIC_WO_CHAIN: return SelectIntrinsicWOChain(N);
}
SelectCode(N);
@@ -1010,15 +982,52 @@ SelectInlineAsmMemoryOperand(const SDValue &Op, unsigned ConstraintID,
}
-void HexagonDAGToDAGISel::PreprocessISelDAG() {
+static bool isMemOPCandidate(SDNode *I, SDNode *U) {
+ // I is an operand of U. Check if U is an arithmetic (binary) operation
+ // usable in a memop, where the other operand is a loaded value, and the
+ // result of U is stored in the same location.
+
+ if (!U->hasOneUse())
+ return false;
+ unsigned Opc = U->getOpcode();
+ switch (Opc) {
+ case ISD::ADD:
+ case ISD::SUB:
+ case ISD::AND:
+ case ISD::OR:
+ break;
+ default:
+ return false;
+ }
+
+ SDValue S0 = U->getOperand(0);
+ SDValue S1 = U->getOperand(1);
+ SDValue SY = (S0.getNode() == I) ? S1 : S0;
+
+ SDNode *UUse = *U->use_begin();
+ if (UUse->getNumValues() != 1)
+ return false;
+
+ // Check if one of the inputs to U is a load instruction and the output
+ // is used by a store instruction. If so and they also have the same
+ // base pointer, then don't preprocess this node sequence as it
+ // can be matched to a memop.
+ SDNode *SYNode = SY.getNode();
+ if (UUse->getOpcode() == ISD::STORE && SYNode->getOpcode() == ISD::LOAD) {
+ SDValue LDBasePtr = cast<MemSDNode>(SYNode)->getBasePtr();
+ SDValue STBasePtr = cast<MemSDNode>(UUse)->getBasePtr();
+ if (LDBasePtr == STBasePtr)
+ return true;
+ }
+ return false;
+}
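For context, isMemOPCandidate is screening for the read-modify-write shape below, which Hexagon can encode as a single memop; this C++ sketch of the pattern is illustrative only and not part of the patch.

// Load, binary op, store back through the same base pointer. On Hexagon
// this can become one instruction (e.g. "memw(r0) += r1"), so the
// preprocessing functions deliberately leave this shape alone.
void rmw_add(int *p, int v) {
  *p += v;   // (store (add (load p) v) p) -- same base pointer
}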
+
+
+// Transform: (or (select c x 0) z) -> (select c (or x z) z)
+// (or (select c 0 y) z) -> (select c z (or y z))
+void HexagonDAGToDAGISel::ppSimplifyOrSelect0(std::vector<SDNode*> &&Nodes) {
SelectionDAG &DAG = *CurDAG;
- std::vector<SDNode*> Nodes;
- for (SDNode &Node : DAG.allnodes())
- Nodes.push_back(&Node);
- // Simplify: (or (select c x 0) z) -> (select c (or x z) z)
- // (or (select c 0 y) z) -> (select c z (or y z))
- // This may not be the right thing for all targets, so do it here.
for (auto I : Nodes) {
if (I->getOpcode() != ISD::OR)
continue;
@@ -1056,18 +1065,22 @@ void HexagonDAGToDAGISel::PreprocessISelDAG() {
}
}
}
+}
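A minimal standalone check of the algebraic identity this rewrite relies on (the value sets are arbitrary):

#include <cassert>
#include <cstdint>
#include <initializer_list>

int main() {
  // (c ? x : 0) | z == c ? (x | z) : z, and symmetrically for the
  // (c ? 0 : y) form, for any bit patterns x and z.
  for (int c = 0; c <= 1; ++c)
    for (uint32_t x : {0u, 5u, 0xFFFFFFFFu})
      for (uint32_t z : {0u, 9u, 0xF0F0F0F0u}) {
        assert(((c ? x : 0u) | z) == (c ? (x | z) : z));
        assert(((c ? 0u : x) | z) == (c ? z : (x | z)));
      }
  return 0;
}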
+
+// Transform: (store ch val (add x (add (shl y c) e)))
+// to: (store ch val (add x (shl (add y d) c))),
+// where e = (shl d c) for some integer d.
+// The purpose of this is to enable generation of loads/stores with
+// shifted addressing mode, i.e. mem(x+y<<#c). For that, the shift
+// value c must be 0, 1 or 2.
+void HexagonDAGToDAGISel::ppAddrReorderAddShl(std::vector<SDNode*> &&Nodes) {
+ SelectionDAG &DAG = *CurDAG;
- // Transform: (store ch addr (add x (add (shl y c) e)))
- // to: (store ch addr (add x (shl (add y d) c))),
- // where e = (shl d c) for some integer d.
- // The purpose of this is to enable generation of loads/stores with
- // shifted addressing mode, i.e. mem(x+y<<#c). For that, the shift
- // value c must be 0, 1 or 2.
for (auto I : Nodes) {
if (I->getOpcode() != ISD::STORE)
continue;
- // I matched: (store ch addr Off)
+ // I matched: (store ch val Off)
SDValue Off = I->getOperand(2);
// Off needs to match: (add x (add (shl y c) (shl d c))))
if (Off.getOpcode() != ISD::ADD)
@@ -1109,15 +1122,192 @@ void HexagonDAGToDAGISel::PreprocessISelDAG() {
SDValue NewShl = DAG.getNode(ISD::SHL, DL, VT, NewAdd, C);
ReplaceNode(T0.getNode(), NewShl.getNode());
}
+}
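The reassociation is justified by shifts distributing over addition in two's-complement arithmetic; a quick standalone C++ check:

#include <cassert>
#include <cstdint>
#include <initializer_list>

int main() {
  // (y << c) + (d << c) == (y + d) << c, so the address
  // (add x (add (shl y c) e)) with e = (shl d c) can be rewritten as
  // (add x (shl (add y d) c)) and matched as mem(x+y<<#c).
  for (uint32_t c : {0u, 1u, 2u})
    for (uint32_t y : {0u, 7u, 0x12345678u})
      for (uint32_t d : {1u, 3u, 0x80u})
        assert(0x1000 + ((y << c) + (d << c)) == 0x1000 + ((y + d) << c));
  return 0;
}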
+
+// Transform: (load ch (add x (and (srl y c) Mask)))
+// to: (load ch (add x (shl (srl y d) d-c)))
+// where
+// Mask = 00..0 111..1 0.0
+// | | +-- d-c 0s, and d-c is 0, 1 or 2.
+// | +-------- 1s
+// +-------------- at most c 0s
+// Motivating example:
+// DAG combiner optimizes (add x (shl (srl y 5) 2))
+// to (add x (and (srl y 3) 1FFFFFFC))
+// which results in a constant-extended and(##...,lsr). This transformation
+// undoes this simplification for cases where the shl can be folded into
+// an addressing mode.
+void HexagonDAGToDAGISel::ppAddrRewriteAndSrl(std::vector<SDNode*> &&Nodes) {
+ SelectionDAG &DAG = *CurDAG;
+
+ for (SDNode *N : Nodes) {
+ unsigned Opc = N->getOpcode();
+ if (Opc != ISD::LOAD && Opc != ISD::STORE)
+ continue;
+ SDValue Addr = Opc == ISD::LOAD ? N->getOperand(1) : N->getOperand(2);
+ // Addr must match: (add x T0)
+ if (Addr.getOpcode() != ISD::ADD)
+ continue;
+ SDValue T0 = Addr.getOperand(1);
+ // T0 must match: (and T1 Mask)
+ if (T0.getOpcode() != ISD::AND)
+ continue;
+
+ // We have an AND.
+ //
+ // Check the first operand. It must be: (srl y c).
+ SDValue S = T0.getOperand(0);
+ if (S.getOpcode() != ISD::SRL)
+ continue;
+ ConstantSDNode *SN = dyn_cast<ConstantSDNode>(S.getOperand(1).getNode());
+ if (SN == nullptr)
+ continue;
+ if (SN->getAPIntValue().getBitWidth() != 32)
+ continue;
+ uint32_t CV = SN->getZExtValue();
+
+ // Check the second operand: the supposed mask.
+ ConstantSDNode *MN = dyn_cast<ConstantSDNode>(T0.getOperand(1).getNode());
+ if (MN == nullptr)
+ continue;
+ if (MN->getAPIntValue().getBitWidth() != 32)
+ continue;
+ uint32_t Mask = MN->getZExtValue();
+ // Examine the mask.
+ uint32_t TZ = countTrailingZeros(Mask);
+ uint32_t M1 = countTrailingOnes(Mask >> TZ);
+ uint32_t LZ = countLeadingZeros(Mask);
+ // Trailing zeros + middle ones + leading zeros must equal the width.
+ if (TZ + M1 + LZ != 32)
+ continue;
+ // The number of trailing zeros will be encoded in the addressing mode.
+ if (TZ > 2)
+ continue;
+ // The number of leading zeros must be at most c.
+ if (LZ > CV)
+ continue;
+
+ // All looks good.
+ SDValue Y = S.getOperand(0);
+ EVT VT = Addr.getValueType();
+ SDLoc dl(S);
+ // TZ = D-C, so D = TZ+C.
+ SDValue D = DAG.getConstant(TZ+CV, dl, VT);
+ SDValue DC = DAG.getConstant(TZ, dl, VT);
+ SDValue NewSrl = DAG.getNode(ISD::SRL, dl, VT, Y, D);
+ SDValue NewShl = DAG.getNode(ISD::SHL, dl, VT, NewSrl, DC);
+ ReplaceNode(T0.getNode(), NewShl.getNode());
+ }
+}
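The motivating example from the comment can be verified directly; here c = 3, d = 5, so the mask has d-c = 2 trailing zeros:

#include <cassert>
#include <cstdint>
#include <initializer_list>

int main() {
  // DAG combiner turns (add x (shl (srl y 5) 2)) into
  // (add x (and (srl y 3) 0x1FFFFFFC)); this pass undoes that so the
  // shl can be folded back into the addressing mode.
  for (uint32_t y : {0u, 0x20u, 0xDEADBEEFu, 0xFFFFFFFFu})
    assert(((y >> 3) & 0x1FFFFFFCu) == ((y >> 5) << 2));
  return 0;
}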
+
+// Transform: (op ... (zext i1 c) ...) -> (select c (op ... 1 ...)
+//                                                  (op ... 0 ...))
+void HexagonDAGToDAGISel::ppHoistZextI1(std::vector<SDNode*> &&Nodes) {
+ SelectionDAG &DAG = *CurDAG;
+
+ for (SDNode *N : Nodes) {
+ unsigned Opc = N->getOpcode();
+ if (Opc != ISD::ZERO_EXTEND)
+ continue;
+ SDValue OpI1 = N->getOperand(0);
+ EVT OpVT = OpI1.getValueType();
+ if (!OpVT.isSimple() || OpVT.getSimpleVT() != MVT::i1)
+ continue;
+ for (auto I = N->use_begin(), E = N->use_end(); I != E; ++I) {
+ SDNode *U = *I;
+ if (U->getNumValues() != 1)
+ continue;
+ EVT UVT = U->getValueType(0);
+ if (!UVT.isSimple() || !UVT.isInteger() || UVT.getSimpleVT() == MVT::i1)
+ continue;
+ if (isMemOPCandidate(N, U))
+ continue;
+
+ // Potentially simplifiable operation.
+ unsigned I1N = I.getOperandNo();
+ SmallVector<SDValue,2> Ops(U->getNumOperands());
+ for (unsigned i = 0, n = U->getNumOperands(); i != n; ++i)
+ Ops[i] = U->getOperand(i);
+ EVT BVT = Ops[I1N].getValueType();
+
+ SDLoc dl(U);
+ SDValue C0 = DAG.getConstant(0, dl, BVT);
+ SDValue C1 = DAG.getConstant(1, dl, BVT);
+ SDValue If0, If1;
+
+ if (isa<MachineSDNode>(U)) {
+ unsigned UseOpc = U->getMachineOpcode();
+ Ops[I1N] = C0;
+ If0 = SDValue(DAG.getMachineNode(UseOpc, dl, UVT, Ops), 0);
+ Ops[I1N] = C1;
+ If1 = SDValue(DAG.getMachineNode(UseOpc, dl, UVT, Ops), 0);
+ } else {
+ unsigned UseOpc = U->getOpcode();
+ Ops[I1N] = C0;
+ If0 = DAG.getNode(UseOpc, dl, UVT, Ops);
+ Ops[I1N] = C1;
+ If1 = DAG.getNode(UseOpc, dl, UVT, Ops);
+ }
+ SDValue Sel = DAG.getNode(ISD::SELECT, dl, UVT, OpI1, If1, If0);
+ DAG.ReplaceAllUsesWith(U, Sel.getNode());
+ }
+ }
+}
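The hoist is sound because a zero-extended i1 contributes exactly 0 or 1 to the consuming operation; a tiny standalone demonstration with add standing in for the generic op:

#include <cassert>
#include <cstdint>
#include <initializer_list>

int main() {
  // (add a (zext i1 c)) == (select c (add a 1) (add a 0))
  for (bool c : {false, true})
    for (uint32_t a : {0u, 41u, 0xFFFFFFFFu})
      assert(a + (c ? 1u : 0u) == (c ? a + 1u : a + 0u));
  return 0;
}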
+
+void HexagonDAGToDAGISel::PreprocessISelDAG() {
+ // Repack all nodes before calling each preprocessing function,
+ // because each of them can modify the set of nodes.
+ auto getNodes = [this] () -> std::vector<SDNode*> {
+ std::vector<SDNode*> T;
+ T.reserve(CurDAG->allnodes_size());
+ for (SDNode &N : CurDAG->allnodes())
+ T.push_back(&N);
+ return T;
+ };
+
+ // Transform: (or (select c x 0) z) -> (select c (or x z) z)
+ // (or (select c 0 y) z) -> (select c z (or y z))
+ ppSimplifyOrSelect0(getNodes());
+
+ // Transform: (store ch val (add x (add (shl y c) e)))
+ // to: (store ch val (add x (shl (add y d) c))),
+ // where e = (shl d c) for some integer d.
+ // The purpose of this is to enable generation of loads/stores with
+ // shifted addressing mode, i.e. mem(x+y<<#c). For that, the shift
+ // value c must be 0, 1 or 2.
+ ppAddrReorderAddShl(getNodes());
+
+ // Transform: (load ch (add x (and (srl y c) Mask)))
+ // to: (load ch (add x (shl (srl y d) d-c)))
+ // where
+ // Mask = 00..0 111..1 00..0
+ // | | +-- d-c 0s, and d-c is 0, 1 or 2.
+ // | +-------- 1s
+ // +-------------- at most c 0s
+ // Motivating example:
+ // DAG combiner optimizes (add x (shl (srl y 5) 2))
+ // to (add x (and (srl y 3) 1FFFFFFC))
+ // which results in a constant-extended and(##...,lsr). This transformation
+ // undoes this simplification for cases where the shl can be folded into
+ // an addressing mode.
+ ppAddrRewriteAndSrl(getNodes());
+
+  // Transform: (op ... (zext i1 c) ...) -> (select c (op ... 1 ...)
+  //                                                  (op ... 0 ...))
+ ppHoistZextI1(getNodes());
+
+ DEBUG_WITH_TYPE("isel", {
+ dbgs() << "Preprocessed (Hexagon) selection DAG:";
+ CurDAG->dump();
+ });
if (EnableAddressRebalancing) {
rebalanceAddressTrees();
- DEBUG(
- dbgs() << "************* SelectionDAG after preprocessing: ***********\n";
+ DEBUG_WITH_TYPE("isel", {
+ dbgs() << "Address tree balanced selection DAG:";
CurDAG->dump();
- dbgs() << "************* End SelectionDAG after preprocessing ********\n";
- );
+ });
}
}
diff --git a/lib/Target/Hexagon/HexagonISelLowering.cpp b/lib/Target/Hexagon/HexagonISelLowering.cpp
index e87e1e6a7e0f..418dd71aeb4b 100644
--- a/lib/Target/Hexagon/HexagonISelLowering.cpp
+++ b/lib/Target/Hexagon/HexagonISelLowering.cpp
@@ -256,7 +256,9 @@ static bool CC_Hexagon (unsigned ValNo, MVT ValVT, MVT LocVT,
return false;
}
- if (LocVT == MVT::i1 || LocVT == MVT::i8 || LocVT == MVT::i16) {
+ if (LocVT == MVT::i1) {
+ LocVT = MVT::i32;
+ } else if (LocVT == MVT::i8 || LocVT == MVT::i16) {
LocVT = MVT::i32;
ValVT = MVT::i32;
if (ArgFlags.isSExt())
@@ -483,9 +485,7 @@ static bool RetCC_Hexagon32(unsigned ValNo, MVT ValVT,
}
}
- unsigned Offset = State.AllocateStack(4, 4);
- State.addLoc(CCValAssign::getMem(ValNo, ValVT, Offset, LocVT, LocInfo));
- return false;
+ return true;
}
static bool RetCC_Hexagon64(unsigned ValNo, MVT ValVT,
@@ -498,9 +498,7 @@ static bool RetCC_Hexagon64(unsigned ValNo, MVT ValVT,
}
}
- unsigned Offset = State.AllocateStack(8, 8);
- State.addLoc(CCValAssign::getMem(ValNo, ValVT, Offset, LocVT, LocInfo));
- return false;
+ return true;
}
static bool RetCC_HexagonVector(unsigned ValNo, MVT ValVT,
@@ -511,7 +509,6 @@ static bool RetCC_HexagonVector(unsigned ValNo, MVT ValVT,
bool UseHVX = HST.useHVXOps();
bool UseHVXDbl = HST.useHVXDblOps();
- unsigned OffSiz = 64;
if (LocVT == MVT::v16i32) {
if (unsigned Reg = State.AllocateReg(Hexagon::V0)) {
State.addLoc(CCValAssign::getReg(ValNo, ValVT, Reg, LocVT, LocInfo));
@@ -523,18 +520,14 @@ static bool RetCC_HexagonVector(unsigned ValNo, MVT ValVT,
State.addLoc(CCValAssign::getReg(ValNo, ValVT, Reg, LocVT, LocInfo));
return false;
}
- OffSiz = 128;
} else if (LocVT == MVT::v64i32) {
if (unsigned Reg = State.AllocateReg(Hexagon::W0)) {
State.addLoc(CCValAssign::getReg(ValNo, ValVT, Reg, LocVT, LocInfo));
return false;
}
- OffSiz = 256;
}
- unsigned Offset = State.AllocateStack(OffSiz, OffSiz);
- State.addLoc(CCValAssign::getMem(ValNo, ValVT, Offset, LocVT, LocInfo));
- return false;
+ return true;
}
void HexagonTargetLowering::promoteLdStType(MVT VT, MVT PromotedLdStVT) {
@@ -590,6 +583,16 @@ static bool isHvxVectorType(MVT Ty) {
}
}
+bool
+HexagonTargetLowering::CanLowerReturn(
+ CallingConv::ID CallConv, MachineFunction &MF, bool isVarArg,
+ const SmallVectorImpl<ISD::OutputArg> &Outs,
+ LLVMContext &Context) const {
+ SmallVector<CCValAssign, 16> RVLocs;
+ CCState CCInfo(CallConv, isVarArg, MF, RVLocs, Context);
+ return CCInfo.CheckReturn(Outs, RetCC_Hexagon);
+}
+
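With RetCC_Hexagon32/64 now reporting failure instead of silently spilling to a stack slot, this override lets the target-independent code demote oversized return values to a hidden result pointer, matching the LowerReturn comment below. A hedged illustration of the kind of return that exercises this path (type and function names invented; whether a given aggregate reaches this point also depends on the front end's ABI lowering):

// A by-value return larger than 8 bytes cannot travel in R1:0, so the
// prototype is effectively rewritten to take a hidden result pointer.
struct BigRet { int a[8]; };
struct BigRet make_big(void) {
  struct BigRet r = {{0, 1, 2, 3, 4, 5, 6, 7}};
  return r;
}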
// LowerReturn - Lower ISD::RET. If a struct is larger than 8 bytes and is
// passed by value, the function prototype is modified to return void and
// the value is stored in memory pointed to by a pointer passed by the caller.
@@ -644,11 +647,11 @@ bool HexagonTargetLowering::mayBeEmittedAsTailCall(CallInst *CI) const {
/// LowerCallResult - Lower the result values of an ISD::CALL into the
/// appropriate copies out of appropriate physical registers. This assumes that
-/// Chain/InFlag are the input chain/flag to use, and that TheCall is the call
+/// Chain/Glue are the input chain/glue to use, and that TheCall is the call
/// being lowered. Returns a SDNode with the same number of values as the
/// ISD::CALL.
SDValue HexagonTargetLowering::LowerCallResult(
- SDValue Chain, SDValue InFlag, CallingConv::ID CallConv, bool isVarArg,
+ SDValue Chain, SDValue Glue, CallingConv::ID CallConv, bool isVarArg,
const SmallVectorImpl<ISD::InputArg> &Ins, const SDLoc &dl,
SelectionDAG &DAG, SmallVectorImpl<SDValue> &InVals,
const SmallVectorImpl<SDValue> &OutVals, SDValue Callee) const {
@@ -671,21 +674,24 @@ SDValue HexagonTargetLowering::LowerCallResult(
// predicate register as the call result.
auto &MRI = DAG.getMachineFunction().getRegInfo();
SDValue FR0 = DAG.getCopyFromReg(Chain, dl, RVLocs[i].getLocReg(),
- MVT::i32, InFlag);
+ MVT::i32, Glue);
// FR0 = (Value, Chain, Glue)
unsigned PredR = MRI.createVirtualRegister(&Hexagon::PredRegsRegClass);
SDValue TPR = DAG.getCopyToReg(FR0.getValue(1), dl, PredR,
FR0.getValue(0), FR0.getValue(2));
// TPR = (Chain, Glue)
- RetVal = DAG.getCopyFromReg(TPR.getValue(0), dl, PredR, MVT::i1,
- TPR.getValue(1));
+ // Don't glue this CopyFromReg, because it copies from a virtual
+ // register. If it is glued to the call, InstrEmitter will add it
+ // as an implicit def to the call (EmitMachineNode).
+ RetVal = DAG.getCopyFromReg(TPR.getValue(0), dl, PredR, MVT::i1);
+ Glue = TPR.getValue(1);
} else {
RetVal = DAG.getCopyFromReg(Chain, dl, RVLocs[i].getLocReg(),
- RVLocs[i].getValVT(), InFlag);
+ RVLocs[i].getValVT(), Glue);
+ Glue = RetVal.getValue(2);
}
InVals.push_back(RetVal.getValue(0));
Chain = RetVal.getValue(1);
- InFlag = RetVal.getValue(2);
}
return Chain;
@@ -840,16 +846,17 @@ HexagonTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
if (!MemOpChains.empty())
Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, MemOpChains);
+ SDValue Glue;
if (!IsTailCall) {
SDValue C = DAG.getConstant(NumBytes, dl, PtrVT, true);
Chain = DAG.getCALLSEQ_START(Chain, C, dl);
+ Glue = Chain.getValue(1);
}
// Build a sequence of copy-to-reg nodes chained together with token
// chain and flag operands which copy the outgoing args into registers.
// The Glue is necessary since all emitted instructions must be
// stuck together.
- SDValue Glue;
if (!IsTailCall) {
for (unsigned i = 0, e = RegsToPass.size(); i != e; ++i) {
Chain = DAG.getCopyToReg(Chain, dl, RegsToPass[i].first,
@@ -902,6 +909,10 @@ HexagonTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
RegsToPass[i].second.getValueType()));
}
+ const uint32_t *Mask = HRI.getCallPreservedMask(MF, CallConv);
+ assert(Mask && "Missing call preserved mask for calling convention");
+ Ops.push_back(DAG.getRegisterMask(Mask));
+
if (Glue.getNode())
Ops.push_back(Glue);
@@ -1054,6 +1065,18 @@ SDValue HexagonTargetLowering::LowerPREFETCH(SDValue Op,
return DAG.getNode(HexagonISD::DCFETCH, DL, MVT::Other, Chain, Addr, Zero);
}
+// Custom-handle ISD::READCYCLECOUNTER because the target-independent SDNode
+// is marked as having side-effects, while the register read on Hexagon does
+// not have any. TableGen refuses to accept the direct pattern from that node
+// to the A4_tfrcpp.
+SDValue HexagonTargetLowering::LowerREADCYCLECOUNTER(SDValue Op,
+ SelectionDAG &DAG) const {
+ SDValue Chain = Op.getOperand(0);
+ SDLoc dl(Op);
+ SDVTList VTs = DAG.getVTList(MVT::i32, MVT::Other);
+ return DAG.getNode(HexagonISD::READCYCLE, dl, VTs, Chain);
+}
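On the source side, ISD::READCYCLECOUNTER is normally reached through Clang's __builtin_readcyclecounter(); a minimal trigger, assuming a Hexagon-targeted Clang:

#include <stdint.h>

// Reads the 64-bit cycle counter. With this patch the node lowers to
// HexagonISD::READCYCLE and from there to a plain register-pair read,
// instead of the pattern being rejected by TableGen.
uint64_t cycles(void) {
  return __builtin_readcyclecounter();
}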
+
SDValue HexagonTargetLowering::LowerINTRINSIC_VOID(SDValue Op,
SelectionDAG &DAG) const {
SDValue Chain = Op.getOperand(0);
@@ -1140,10 +1163,25 @@ SDValue HexagonTargetLowering::LowerFormalArguments(
EVT RegVT = VA.getLocVT();
if (RegVT == MVT::i8 || RegVT == MVT::i16 ||
RegVT == MVT::i32 || RegVT == MVT::f32) {
- unsigned VReg =
+ unsigned VReg =
RegInfo.createVirtualRegister(&Hexagon::IntRegsRegClass);
RegInfo.addLiveIn(VA.getLocReg(), VReg);
- InVals.push_back(DAG.getCopyFromReg(Chain, dl, VReg, RegVT));
+ SDValue Copy = DAG.getCopyFromReg(Chain, dl, VReg, RegVT);
+ // Treat values of type MVT::i1 specially: they are passed in
+ // registers of type i32, but they need to remain as values of
+ // type i1 for consistency of the argument lowering.
+ if (VA.getValVT() == MVT::i1) {
+ // Generate a copy into a predicate register and use the value
+ // of the register as the "InVal".
+ unsigned PReg =
+ RegInfo.createVirtualRegister(&Hexagon::PredRegsRegClass);
+ SDNode *T = DAG.getMachineNode(Hexagon::C2_tfrrp, dl, MVT::i1,
+ Copy.getValue(0));
+ Copy = DAG.getCopyToReg(Copy.getValue(1), dl, PReg, SDValue(T, 0));
+ Copy = DAG.getCopyFromReg(Copy, dl, PReg, MVT::i1);
+ }
+ InVals.push_back(Copy);
+ Chain = Copy.getValue(1);
} else if (RegVT == MVT::i64 || RegVT == MVT::f64) {
unsigned VReg =
RegInfo.createVirtualRegister(&Hexagon::DoubleRegsRegClass);
@@ -1217,7 +1255,7 @@ SDValue HexagonTargetLowering::LowerFormalArguments(
InVals.push_back(FIN);
} else {
InVals.push_back(
- DAG.getLoad(VA.getLocVT(), dl, Chain, FIN, MachinePointerInfo()));
+ DAG.getLoad(VA.getValVT(), dl, Chain, FIN, MachinePointerInfo()));
}
}
}
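The i1 special case above keeps an incoming bool usable as a predicate: it arrives in a 32-bit register per the calling convention, is transferred to a predicate register (C2_tfrrp), and only then flows into the function body. Illustrative source that produces such an argument (names invented):

// 'c' is passed in r0 but consumed as an i1 predicate, hence the extra
// predicate-register copy on entry.
int pick(bool c, int a, int b) {
  return c ? a : b;
}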
@@ -1272,17 +1310,6 @@ static bool isSExtFree(SDValue N) {
return false;
}
-SDValue HexagonTargetLowering::LowerCTPOP(SDValue Op, SelectionDAG &DAG) const {
- SDLoc dl(Op);
- SDValue InpVal = Op.getOperand(0);
- if (isa<ConstantSDNode>(InpVal)) {
- uint64_t V = cast<ConstantSDNode>(InpVal)->getZExtValue();
- return DAG.getTargetConstant(countPopulation(V), dl, MVT::i64);
- }
- SDValue PopOut = DAG.getNode(HexagonISD::POPCOUNT, dl, MVT::i32, InpVal);
- return DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i64, PopOut);
-}
-
SDValue HexagonTargetLowering::LowerSETCC(SDValue Op, SelectionDAG &DAG) const {
SDLoc dl(Op);
@@ -1571,9 +1598,10 @@ HexagonTargetLowering::LowerGLOBAL_OFFSET_TABLE(SDValue Op, SelectionDAG &DAG)
SDValue
HexagonTargetLowering::GetDynamicTLSAddr(SelectionDAG &DAG, SDValue Chain,
- GlobalAddressSDNode *GA, SDValue *InFlag, EVT PtrVT, unsigned ReturnReg,
+ GlobalAddressSDNode *GA, SDValue Glue, EVT PtrVT, unsigned ReturnReg,
unsigned char OperandFlags) const {
- MachineFrameInfo &MFI = DAG.getMachineFunction().getFrameInfo();
+ MachineFunction &MF = DAG.getMachineFunction();
+ MachineFrameInfo &MFI = MF.getFrameInfo();
SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Glue);
SDLoc dl(GA);
SDValue TGA = DAG.getTargetGlobalAddress(GA->getGlobal(), dl,
@@ -1585,23 +1613,21 @@ HexagonTargetLowering::GetDynamicTLSAddr(SelectionDAG &DAG, SDValue Chain,
// 2. Callee which in this case is the Global address value.
// 3. Registers live into the call. In this case it's R0, as we
// have just one argument to be passed.
- // 4. InFlag if there is any.
+ // 4. Glue.
// Note: The order is important.
- if (InFlag) {
- SDValue Ops[] = { Chain, TGA,
- DAG.getRegister(Hexagon::R0, PtrVT), *InFlag };
- Chain = DAG.getNode(HexagonISD::CALL, dl, NodeTys, Ops);
- } else {
- SDValue Ops[] = { Chain, TGA, DAG.getRegister(Hexagon::R0, PtrVT)};
- Chain = DAG.getNode(HexagonISD::CALL, dl, NodeTys, Ops);
- }
+ const auto &HRI = *Subtarget.getRegisterInfo();
+ const uint32_t *Mask = HRI.getCallPreservedMask(MF, CallingConv::C);
+ assert(Mask && "Missing call preserved mask for calling convention");
+ SDValue Ops[] = { Chain, TGA, DAG.getRegister(Hexagon::R0, PtrVT),
+ DAG.getRegisterMask(Mask), Glue };
+ Chain = DAG.getNode(HexagonISD::CALL, dl, NodeTys, Ops);
// Inform MFI that function has calls.
MFI.setAdjustsStack(true);
- SDValue Flag = Chain.getValue(1);
- return DAG.getCopyFromReg(Chain, dl, ReturnReg, PtrVT, Flag);
+ Glue = Chain.getValue(1);
+ return DAG.getCopyFromReg(Chain, dl, ReturnReg, PtrVT, Glue);
}
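For reference, this general-dynamic path is what a TLS access in position-independent code compiles through; a minimal C trigger (variable name invented):

// General-dynamic TLS: the access becomes a call to the resolver with
// the GOT address in r0; the call node now also carries the
// call-preserved register mask like any other call.
__thread int tls_counter;

int bump(void) {
  return ++tls_counter;
}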
//
@@ -1694,7 +1720,7 @@ HexagonTargetLowering::LowerToTLSGeneralDynamicModel(GlobalAddressSDNode *GA,
Chain = DAG.getCopyToReg(DAG.getEntryNode(), dl, Hexagon::R0, Chain, InFlag);
InFlag = Chain.getValue(1);
- return GetDynamicTLSAddr(DAG, Chain, GA, &InFlag, PtrVT,
+ return GetDynamicTLSAddr(DAG, Chain, GA, InFlag, PtrVT,
Hexagon::R0, HexagonII::MO_GDPLT);
}
@@ -1821,6 +1847,7 @@ HexagonTargetLowering::HexagonTargetLowering(const TargetMachine &TM,
setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i1, Expand);
setOperationAction(ISD::INLINEASM, MVT::Other, Custom);
setOperationAction(ISD::PREFETCH, MVT::Other, Custom);
+ setOperationAction(ISD::READCYCLECOUNTER, MVT::i64, Custom);
setOperationAction(ISD::INTRINSIC_VOID, MVT::Other, Custom);
setOperationAction(ISD::EH_RETURN, MVT::Other, Custom);
setOperationAction(ISD::GLOBAL_OFFSET_TABLE, MVT::i32, Custom);
@@ -1891,7 +1918,12 @@ HexagonTargetLowering::HexagonTargetLowering(const TargetMachine &TM,
setOperationAction(ISD::CTPOP, MVT::i8, Promote);
setOperationAction(ISD::CTPOP, MVT::i16, Promote);
setOperationAction(ISD::CTPOP, MVT::i32, Promote);
- setOperationAction(ISD::CTPOP, MVT::i64, Custom);
+ setOperationAction(ISD::CTPOP, MVT::i64, Legal);
+
+ setOperationAction(ISD::BITREVERSE, MVT::i32, Legal);
+ setOperationAction(ISD::BITREVERSE, MVT::i64, Legal);
+ setOperationAction(ISD::BSWAP, MVT::i32, Legal);
+ setOperationAction(ISD::BSWAP, MVT::i64, Legal);
// We custom lower i64 to i64 mul, so that it is not considered as a legal
// operation. There is a pattern that will match i64 mul and transform it
@@ -1901,7 +1933,7 @@ HexagonTargetLowering::HexagonTargetLowering(const TargetMachine &TM,
for (unsigned IntExpOp :
{ ISD::SDIV, ISD::UDIV, ISD::SREM, ISD::UREM,
ISD::SDIVREM, ISD::UDIVREM, ISD::ROTL, ISD::ROTR,
- ISD::BSWAP, ISD::SHL_PARTS, ISD::SRA_PARTS, ISD::SRL_PARTS,
+ ISD::SHL_PARTS, ISD::SRA_PARTS, ISD::SRL_PARTS,
ISD::SMUL_LOHI, ISD::UMUL_LOHI }) {
setOperationAction(IntExpOp, MVT::i32, Expand);
setOperationAction(IntExpOp, MVT::i64, Expand);
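With CTPOP i64 now Legal (and BITREVERSE/BSWAP marked Legal above), the corresponding builtins should select directly to single instructions rather than going through custom lowering or expansion; a quick C probe (function names invented):

#include <stdint.h>

// Each of these should now map onto one Hexagon instruction instead of
// a libcall or a custom-lowered sequence.
int pop64(uint64_t x)       { return __builtin_popcountll(x); }
uint32_t swap32(uint32_t x) { return __builtin_bswap32(x); }
uint64_t swap64(uint64_t x) { return __builtin_bswap64(x); }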
@@ -2268,7 +2300,6 @@ const char* HexagonTargetLowering::getTargetNodeName(unsigned Opcode) const {
case HexagonISD::INSERTRP: return "HexagonISD::INSERTRP";
case HexagonISD::JT: return "HexagonISD::JT";
case HexagonISD::PACKHL: return "HexagonISD::PACKHL";
- case HexagonISD::POPCOUNT: return "HexagonISD::POPCOUNT";
case HexagonISD::RET_FLAG: return "HexagonISD::RET_FLAG";
case HexagonISD::SHUFFEB: return "HexagonISD::SHUFFEB";
case HexagonISD::SHUFFEH: return "HexagonISD::SHUFFEH";
@@ -2296,6 +2327,7 @@ const char* HexagonTargetLowering::getTargetNodeName(unsigned Opcode) const {
case HexagonISD::VSRLW: return "HexagonISD::VSRLW";
case HexagonISD::VSXTBH: return "HexagonISD::VSXTBH";
case HexagonISD::VSXTBW: return "HexagonISD::VSXTBW";
+ case HexagonISD::READCYCLE: return "HexagonISD::READCYCLE";
case HexagonISD::OP_END: break;
}
return nullptr;
@@ -2968,11 +3000,11 @@ HexagonTargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const {
case ISD::DYNAMIC_STACKALLOC: return LowerDYNAMIC_STACKALLOC(Op, DAG);
case ISD::SETCC: return LowerSETCC(Op, DAG);
case ISD::VSELECT: return LowerVSELECT(Op, DAG);
- case ISD::CTPOP: return LowerCTPOP(Op, DAG);
case ISD::INTRINSIC_WO_CHAIN: return LowerINTRINSIC_WO_CHAIN(Op, DAG);
case ISD::INTRINSIC_VOID: return LowerINTRINSIC_VOID(Op, DAG);
case ISD::INLINEASM: return LowerINLINEASM(Op, DAG);
case ISD::PREFETCH: return LowerPREFETCH(Op, DAG);
+ case ISD::READCYCLECOUNTER: return LowerREADCYCLECOUNTER(Op, DAG);
}
}
@@ -3026,37 +3058,25 @@ HexagonTargetLowering::getRegForInlineAsmConstraint(
return std::make_pair(0U, &Hexagon::DoubleRegsRegClass);
}
case 'q': // q0-q3
- switch (VT.SimpleTy) {
+ switch (VT.getSizeInBits()) {
default:
- llvm_unreachable("getRegForInlineAsmConstraint Unhandled data type");
- case MVT::v1024i1:
- case MVT::v512i1:
- case MVT::v32i16:
- case MVT::v16i32:
- case MVT::v64i8:
- case MVT::v8i64:
+ llvm_unreachable("getRegForInlineAsmConstraint Unhandled vector size");
+ case 512:
return std::make_pair(0U, &Hexagon::VecPredRegsRegClass);
+ case 1024:
+ return std::make_pair(0U, &Hexagon::VecPredRegs128BRegClass);
}
case 'v': // V0-V31
- switch (VT.SimpleTy) {
+ switch (VT.getSizeInBits()) {
default:
- llvm_unreachable("getRegForInlineAsmConstraint Unhandled data type");
- case MVT::v16i32:
- case MVT::v32i16:
- case MVT::v64i8:
- case MVT::v8i64:
+ llvm_unreachable("getRegForInlineAsmConstraint Unhandled vector size");
+ case 512:
return std::make_pair(0U, &Hexagon::VectorRegsRegClass);
- case MVT::v32i32:
- case MVT::v64i16:
- case MVT::v16i64:
- case MVT::v128i8:
+ case 1024:
if (Subtarget.hasV60TOps() && UseHVX && UseHVXDbl)
return std::make_pair(0U, &Hexagon::VectorRegs128BRegClass);
return std::make_pair(0U, &Hexagon::VecDblRegsRegClass);
- case MVT::v256i8:
- case MVT::v128i16:
- case MVT::v64i32:
- case MVT::v32i64:
+ case 2048:
return std::make_pair(0U, &Hexagon::VecDblRegs128BRegClass);
}
diff --git a/lib/Target/Hexagon/HexagonISelLowering.h b/lib/Target/Hexagon/HexagonISelLowering.h
index a8ed29e585d4..fb8f0ba6b057 100644
--- a/lib/Target/Hexagon/HexagonISelLowering.h
+++ b/lib/Target/Hexagon/HexagonISelLowering.h
@@ -50,7 +50,6 @@ namespace HexagonISD {
JT, // Jump table.
CP, // Constant pool.
- POPCOUNT,
COMBINE,
PACKHL,
VSPLATB,
@@ -86,6 +85,7 @@ namespace HexagonISD {
TC_RETURN,
EH_RETURN,
DCFETCH,
+ READCYCLE,
OP_END
};
@@ -146,6 +146,7 @@ namespace HexagonISD {
SDValue LowerDYNAMIC_STACKALLOC(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerINLINEASM(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerPREFETCH(SDValue Op, SelectionDAG &DAG) const;
+ SDValue LowerREADCYCLECOUNTER(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerEH_LABEL(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerEH_RETURN(SDValue Op, SelectionDAG &DAG) const;
SDValue
@@ -163,7 +164,7 @@ namespace HexagonISD {
SDValue LowerToTLSLocalExecModel(GlobalAddressSDNode *GA,
SelectionDAG &DAG) const;
SDValue GetDynamicTLSAddr(SelectionDAG &DAG, SDValue Chain,
- GlobalAddressSDNode *GA, SDValue *InFlag, EVT PtrVT,
+ GlobalAddressSDNode *GA, SDValue InFlag, EVT PtrVT,
unsigned ReturnReg, unsigned char OperandFlags) const;
SDValue LowerGLOBAL_OFFSET_TABLE(SDValue Op, SelectionDAG &DAG) const;
@@ -179,12 +180,16 @@ namespace HexagonISD {
SDValue LowerSETCC(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerVSELECT(SDValue Op, SelectionDAG &DAG) const;
- SDValue LowerCTPOP(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerFRAMEADDR(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerATOMIC_FENCE(SDValue Op, SelectionDAG& DAG) const;
SDValue LowerRETURNADDR(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerLOAD(SDValue Op, SelectionDAG &DAG) const;
+ bool CanLowerReturn(CallingConv::ID CallConv,
+ MachineFunction &MF, bool isVarArg,
+ const SmallVectorImpl<ISD::OutputArg> &Outs,
+ LLVMContext &Context) const override;
+
SDValue LowerReturn(SDValue Chain, CallingConv::ID CallConv, bool isVarArg,
const SmallVectorImpl<ISD::OutputArg> &Outs,
const SmallVectorImpl<SDValue> &OutVals,
diff --git a/lib/Target/Hexagon/HexagonInstrAlias.td b/lib/Target/Hexagon/HexagonInstrAlias.td
deleted file mode 100644
index 7283d94ee759..000000000000
--- a/lib/Target/Hexagon/HexagonInstrAlias.td
+++ /dev/null
@@ -1,652 +0,0 @@
-//==- HexagonInstrAlias.td - Hexagon Instruction Aliases ---*- tablegen -*--==//
-//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
-//
-//===----------------------------------------------------------------------===//
-// Hexagon Instruction Mappings
-//===----------------------------------------------------------------------===//
-
-
-def : InstAlias<"memb({GP}+#$addr) = $Nt.new",
- (S2_storerbnewgp u16_0Imm:$addr, IntRegs:$Nt)>;
-def : InstAlias<"memh({GP}+#$addr) = $Nt.new",
- (S2_storerhnewgp u16_1Imm:$addr, IntRegs:$Nt)>;
-def : InstAlias<"memw({GP}+#$addr) = $Nt.new",
- (S2_storerinewgp u16_2Imm:$addr, IntRegs:$Nt)>;
-def : InstAlias<"memb({GP}+#$addr) = $Nt",
- (S2_storerbgp u16_0Imm:$addr, IntRegs:$Nt)>;
-def : InstAlias<"memh({GP}+#$addr) = $Nt",
- (S2_storerhgp u16_1Imm:$addr, IntRegs:$Nt)>;
-def : InstAlias<"memh({GP}+#$addr) = $Nt.h",
- (S2_storerfgp u16_1Imm:$addr, IntRegs:$Nt)>;
-def : InstAlias<"memw({GP}+#$addr) = $Nt",
- (S2_storerigp u16_2Imm:$addr, IntRegs:$Nt)>;
-def : InstAlias<"memd({GP}+#$addr) = $Nt",
- (S2_storerdgp u16_3Imm:$addr, DoubleRegs:$Nt)>;
-
-def : InstAlias<"$Nt = memb({GP}+#$addr)",
- (L2_loadrbgp IntRegs:$Nt, u16_0Imm:$addr)>;
-def : InstAlias<"$Nt = memub({GP}+#$addr)",
- (L2_loadrubgp IntRegs:$Nt, u16_0Imm:$addr)>;
-def : InstAlias<"$Nt = memh({GP}+#$addr)",
- (L2_loadrhgp IntRegs:$Nt, u16_1Imm:$addr)>;
-def : InstAlias<"$Nt = memuh({GP}+#$addr)",
- (L2_loadruhgp IntRegs:$Nt, u16_1Imm:$addr)>;
-def : InstAlias<"$Nt = memw({GP}+#$addr)",
- (L2_loadrigp IntRegs:$Nt, u16_2Imm:$addr)>;
-def : InstAlias<"$Nt = memd({GP}+#$addr)",
- (L2_loadrdgp DoubleRegs:$Nt, u16_3Imm:$addr)>;
-
-// Alias of: memXX($Rs+#XX) = $Rt to memXX($Rs) = $Rt
-def : InstAlias<"memb($Rs) = $Rt",
- (S2_storerb_io IntRegs:$Rs, 0, IntRegs:$Rt), 0>;
-
-def : InstAlias<"memh($Rs) = $Rt",
- (S2_storerh_io IntRegs:$Rs, 0, IntRegs:$Rt), 0>;
-
-def : InstAlias<"memh($Rs) = $Rt.h",
- (S2_storerf_io IntRegs:$Rs, 0, IntRegs:$Rt), 0>;
-
-def : InstAlias<"memw($Rs) = $Rt",
- (S2_storeri_io IntRegs:$Rs, 0, IntRegs:$Rt), 0>;
-
-def : InstAlias<"memb($Rs) = $Rt.new",
- (S2_storerbnew_io IntRegs:$Rs, 0, IntRegs:$Rt), 0>;
-
-def : InstAlias<"memh($Rs) = $Rt.new",
- (S2_storerhnew_io IntRegs:$Rs, 0, IntRegs:$Rt), 0>;
-
-def : InstAlias<"memw($Rs) = $Rt.new",
- (S2_storerinew_io IntRegs:$Rs, 0, IntRegs:$Rt), 0>;
-
-def : InstAlias<"memb($Rs) = #$S8",
- (S4_storeirb_io IntRegs:$Rs, 0, s8_0Ext:$S8), 0>;
-
-def : InstAlias<"memh($Rs) = #$S8",
- (S4_storeirh_io IntRegs:$Rs, 0, s8_0Ext:$S8), 0>;
-
-def : InstAlias<"memw($Rs) = #$S8",
- (S4_storeiri_io IntRegs:$Rs, 0, s8_0Ext:$S8), 0>;
-
-def : InstAlias<"memd($Rs) = $Rtt",
- (S2_storerd_io IntRegs:$Rs, 0, DoubleRegs:$Rtt), 0>;
-
-def : InstAlias<"memb($Rs) = setbit(#$U5)",
- (L4_ior_memopb_io IntRegs:$Rs, 0, u5_0Imm:$U5), 0>;
-
-def : InstAlias<"memh($Rs) = setbit(#$U5)",
- (L4_ior_memoph_io IntRegs:$Rs, 0, u5_0Imm:$U5), 0>;
-
-def : InstAlias<"memw($Rs) = setbit(#$U5)",
- (L4_ior_memopw_io IntRegs:$Rs, 0, u5_0Imm:$U5), 0>;
-
-def : InstAlias<"memb($Rs) = clrbit(#$U5)",
- (L4_iand_memopb_io IntRegs:$Rs, 0, u5_0Imm:$U5), 0>;
-
-def : InstAlias<"memh($Rs) = clrbit(#$U5)",
- (L4_iand_memoph_io IntRegs:$Rs, 0, u5_0Imm:$U5), 0>;
-
-def : InstAlias<"memw($Rs) = clrbit(#$U5)",
- (L4_iand_memopw_io IntRegs:$Rs, 0, u5_0Imm:$U5), 0>;
-
-// Alias of: $Rd = memXX($Rs+#XX) to $Rd = memXX($Rs)
-def : InstAlias<"$Rd = memb($Rs)",
- (L2_loadrb_io IntRegs:$Rd, IntRegs:$Rs, 0), 0>;
-
-def : InstAlias<"$Rd = memub($Rs)",
- (L2_loadrub_io IntRegs:$Rd, IntRegs:$Rs, 0), 0>;
-
-def : InstAlias<"$Rd = memh($Rs)",
- (L2_loadrh_io IntRegs:$Rd, IntRegs:$Rs, 0), 0>;
-
-def : InstAlias<"$Rd = memuh($Rs)",
- (L2_loadruh_io IntRegs:$Rd, IntRegs:$Rs, 0), 0>;
-
-def : InstAlias<"$Rd = memw($Rs)",
- (L2_loadri_io IntRegs:$Rd, IntRegs:$Rs, 0), 0>;
-
-def : InstAlias<"$Rdd = memd($Rs)",
- (L2_loadrd_io DoubleRegs:$Rdd, IntRegs:$Rs, 0), 0>;
-
-def : InstAlias<"$Rd = memubh($Rs)",
- (L2_loadbzw2_io IntRegs:$Rd, IntRegs:$Rs, 0), 0>;
-
-def : InstAlias<"$Rdd = memubh($Rs)",
- (L2_loadbzw4_io DoubleRegs:$Rdd, IntRegs:$Rs, 0), 0>;
-
-def : InstAlias<"$Rd = membh($Rs)",
- (L2_loadbsw2_io IntRegs:$Rd, IntRegs:$Rs, 0), 0>;
-
-def : InstAlias<"$Rdd = membh($Rs)",
- (L2_loadbsw4_io DoubleRegs:$Rdd, IntRegs:$Rs, 0), 0>;
-
-def : InstAlias<"$Rdd = memb_fifo($Rs)",
- (L2_loadalignb_io DoubleRegs:$Rdd, IntRegs:$Rs, 0), 0>;
-
-def : InstAlias<"$Rdd = memh_fifo($Rs)",
- (L2_loadalignh_io DoubleRegs:$Rdd, IntRegs:$Rs, 0), 0>;
-
-// Alias of: if ($Pt) $Rd = memXX($Rs + #$u6_X)
-// to: if ($Pt) $Rd = memXX($Rs)
-def : InstAlias<"if ($Pt) $Rd = memb($Rs)",
- (L2_ploadrbt_io IntRegs:$Rd, PredRegs:$Pt, IntRegs:$Rs, 0), 0>;
-
-def : InstAlias<"if ($Pt) $Rd = memub($Rs)",
- (L2_ploadrubt_io IntRegs:$Rd, PredRegs:$Pt, IntRegs:$Rs, 0), 0>;
-
-def : InstAlias<"if ($Pt) $Rd = memh($Rs)",
- (L2_ploadrht_io IntRegs:$Rd, PredRegs:$Pt, IntRegs:$Rs, 0), 0>;
-
-def : InstAlias<"if ($Pt) $Rd = memuh($Rs)",
- (L2_ploadruht_io IntRegs:$Rd, PredRegs:$Pt, IntRegs:$Rs, 0), 0>;
-
-def : InstAlias<"if ($Pt) $Rd = memw($Rs)",
- (L2_ploadrit_io IntRegs:$Rd, PredRegs:$Pt, IntRegs:$Rs, 0), 0>;
-
-def : InstAlias<"if ($Pt) $Rdd = memd($Rs)",
- (L2_ploadrdt_io DoubleRegs:$Rdd, PredRegs:$Pt, IntRegs:$Rs, 0), 0>;
-
-// Alias of: if ($Pt) memXX($Rs + #$u6_X) = $Rt
-// to: if ($Pt) memXX($Rs) = $Rt
-def : InstAlias<"if ($Pt) memb($Rs) = $Rt",
- (S2_pstorerbt_io PredRegs:$Pt, IntRegs:$Rs, 0, IntRegs:$Rt), 0>;
-
-def : InstAlias<"if ($Pt) memh($Rs) = $Rt",
- (S2_pstorerht_io PredRegs:$Pt, IntRegs:$Rs, 0, IntRegs:$Rt), 0>;
-
-def : InstAlias<"if ($Pt) memh($Rs) = $Rt.h",
- (S2_pstorerft_io PredRegs:$Pt, IntRegs:$Rs, 0, IntRegs:$Rt), 0>;
-
-def : InstAlias<"if ($Pt) memw($Rs) = $Rt",
- (S2_pstorerit_io PredRegs:$Pt, IntRegs:$Rs, 0, IntRegs:$Rt), 0>;
-
-def : InstAlias<"if ($Pt) memd($Rs) = $Rtt",
- (S2_pstorerdt_io PredRegs:$Pt, IntRegs:$Rs, 0, DoubleRegs:$Rtt), 0>;
-
-def : InstAlias<"if ($Pt) memb($Rs) = $Rt.new",
- (S2_pstorerbnewt_io PredRegs:$Pt, IntRegs:$Rs, 0, IntRegs:$Rt), 0>;
-
-def : InstAlias<"if ($Pt) memh($Rs) = $Rt.new",
- (S2_pstorerhnewt_io PredRegs:$Pt, IntRegs:$Rs, 0, IntRegs:$Rt), 0>;
-
-def : InstAlias<"if ($Pt) memw($Rs) = $Rt.new",
- (S2_pstorerinewt_io PredRegs:$Pt, IntRegs:$Rs, 0, IntRegs:$Rt), 0>;
-
-def : InstAlias<"if ($Pt.new) memb($Rs) = $Rt.new",
- (S4_pstorerbnewtnew_io PredRegs:$Pt, IntRegs:$Rs, 0, IntRegs:$Rt), 0>;
-
-def : InstAlias<"if ($Pt.new) memh($Rs) = $Rt.new",
- (S4_pstorerhnewtnew_io PredRegs:$Pt, IntRegs:$Rs, 0, IntRegs:$Rt), 0>;
-
-def : InstAlias<"if ($Pt.new) memw($Rs) = $Rt.new",
- (S4_pstorerinewtnew_io PredRegs:$Pt, IntRegs:$Rs, 0, IntRegs:$Rt), 0>;
-
-
-// Alias of: if (!$Pt) $Rd = memXX($Rs + #$u6_X)
-// to: if (!$Pt) $Rd = memXX($Rs)
-def : InstAlias<"if (!$Pt) $Rd = memb($Rs)",
- (L2_ploadrbf_io IntRegs:$Rd, PredRegs:$Pt, IntRegs:$Rs, 0), 0>;
-
-def : InstAlias<"if (!$Pt) $Rd = memub($Rs)",
- (L2_ploadrubf_io IntRegs:$Rd, PredRegs:$Pt, IntRegs:$Rs, 0), 0>;
-
-def : InstAlias<"if (!$Pt) $Rd = memh($Rs)",
- (L2_ploadrhf_io IntRegs:$Rd, PredRegs:$Pt, IntRegs:$Rs, 0), 0>;
-
-def : InstAlias<"if (!$Pt) $Rd = memuh($Rs)",
- (L2_ploadruhf_io IntRegs:$Rd, PredRegs:$Pt, IntRegs:$Rs, 0), 0>;
-
-def : InstAlias<"if (!$Pt) $Rd = memw($Rs)",
- (L2_ploadrif_io IntRegs:$Rd, PredRegs:$Pt, IntRegs:$Rs, 0), 0>;
-
-def : InstAlias<"if (!$Pt) $Rdd = memd($Rs)",
- (L2_ploadrdf_io DoubleRegs:$Rdd, PredRegs:$Pt, IntRegs:$Rs, 0), 0>;
-
-// Alias of: if (!$Pt) memXX($Rs + #$u6_X) = $Rt
-// to: if (!$Pt) memXX($Rs) = $Rt
-def : InstAlias<"if (!$Pt) memb($Rs) = $Rt",
- (S2_pstorerbf_io PredRegs:$Pt, IntRegs:$Rs, 0, IntRegs:$Rt), 0>;
-
-def : InstAlias<"if (!$Pt) memh($Rs) = $Rt",
- (S2_pstorerhf_io PredRegs:$Pt, IntRegs:$Rs, 0, IntRegs:$Rt), 0>;
-
-def : InstAlias<"if (!$Pt) memh($Rs) = $Rt.h",
- (S2_pstorerff_io PredRegs:$Pt, IntRegs:$Rs, 0, IntRegs:$Rt), 0>;
-
-def : InstAlias<"if (!$Pt) memw($Rs) = $Rt",
- (S2_pstorerif_io PredRegs:$Pt, IntRegs:$Rs, 0, IntRegs:$Rt), 0>;
-
-def : InstAlias<"if (!$Pt) memd($Rs) = $Rtt",
- (S2_pstorerdf_io PredRegs:$Pt, IntRegs:$Rs, 0, DoubleRegs:$Rtt), 0>;
-
-def : InstAlias<"if (!$Pt) memb($Rs) = $Rt.new",
- (S2_pstorerbnewf_io PredRegs:$Pt, IntRegs:$Rs, 0, IntRegs:$Rt), 0>;
-
-def : InstAlias<"if (!$Pt) memh($Rs) = $Rt.new",
- (S2_pstorerhnewf_io PredRegs:$Pt, IntRegs:$Rs, 0, IntRegs:$Rt), 0>;
-
-def : InstAlias<"if (!$Pt) memw($Rs) = $Rt.new",
- (S2_pstorerinewf_io PredRegs:$Pt, IntRegs:$Rs, 0, IntRegs:$Rt), 0>;
-
-def : InstAlias<"if (!$Pt.new) memb($Rs) = $Rt.new",
- (S4_pstorerbnewfnew_io PredRegs:$Pt, IntRegs:$Rs, 0, IntRegs:$Rt), 0>;
-
-def : InstAlias<"if (!$Pt.new) memh($Rs) = $Rt.new",
- (S4_pstorerhnewfnew_io PredRegs:$Pt, IntRegs:$Rs, 0, IntRegs:$Rt), 0>;
-
-def : InstAlias<"if (!$Pt.new) memw($Rs) = $Rt.new",
- (S4_pstorerinewfnew_io PredRegs:$Pt, IntRegs:$Rs, 0, IntRegs:$Rt), 0>;
-
-def : InstAlias<"if ($Pt) memb($Rs) = #$S6",
- (S4_storeirbt_io PredRegs:$Pt, IntRegs:$Rs, 0, s6_0Ext:$S6), 0>;
-
-def : InstAlias<"if ($Pt) memh($Rs) = #$S6",
- (S4_storeirht_io PredRegs:$Pt, IntRegs:$Rs, 0, s6_0Ext:$S6), 0>;
-
-def : InstAlias<"if ($Pt) memw($Rs) = #$S6",
- (S4_storeirit_io PredRegs:$Pt, IntRegs:$Rs, 0, s6_0Ext:$S6), 0>;
-
-def : InstAlias<"if ($Pt.new) memb($Rs) = #$S6",
- (S4_storeirbtnew_io PredRegs:$Pt, IntRegs:$Rs, 0, s6_0Ext:$S6), 0>;
-
-def : InstAlias<"if ($Pt.new) memh($Rs) = #$S6",
- (S4_storeirhtnew_io PredRegs:$Pt, IntRegs:$Rs, 0, s6_0Ext:$S6), 0>;
-
-def : InstAlias<"if ($Pt.new) memw($Rs) = #$S6",
- (S4_storeiritnew_io PredRegs:$Pt, IntRegs:$Rs, 0, s6_0Ext:$S6), 0>;
-
-def : InstAlias<"if (!$Pt) memb($Rs) = #$S6",
- (S4_storeirbf_io PredRegs:$Pt, IntRegs:$Rs, 0, s6_0Ext:$S6), 0>;
-
-def : InstAlias<"if (!$Pt) memh($Rs) = #$S6",
- (S4_storeirhf_io PredRegs:$Pt, IntRegs:$Rs, 0, s6_0Ext:$S6), 0>;
-
-def : InstAlias<"if (!$Pt) memw($Rs) = #$S6",
- (S4_storeirif_io PredRegs:$Pt, IntRegs:$Rs, 0, s6_0Ext:$S6), 0>;
-
-def : InstAlias<"if (!$Pt.new) memb($Rs) = #$S6",
- (S4_storeirbfnew_io PredRegs:$Pt, IntRegs:$Rs, 0, s6_0Ext:$S6), 0>;
-
-def : InstAlias<"if (!$Pt.new) memh($Rs) = #$S6",
- (S4_storeirhfnew_io PredRegs:$Pt, IntRegs:$Rs, 0, s6_0Ext:$S6), 0>;
-
-def : InstAlias<"if (!$Pt.new) memw($Rs) = #$S6",
- (S4_storeirifnew_io PredRegs:$Pt, IntRegs:$Rs, 0, s6_0Ext:$S6), 0>;
-
-// Alias of: memXX($Rs + $u6_X) |= $Rt, also &=, +=, -=
-// to: memXX($Rs) |= $Rt
-def : InstAlias<"memb($Rs) &= $Rt",
- (L4_and_memopb_io IntRegs:$Rs, 0, IntRegs:$Rt), 0>,
- Requires<[UseMEMOP]>;
-
-def : InstAlias<"memb($Rs) |= $Rt",
- (L4_or_memopb_io IntRegs:$Rs, 0, IntRegs:$Rt), 0>,
- Requires<[UseMEMOP]>;
-
-def : InstAlias<"memb($Rs) += $Rt",
- (L4_add_memopb_io IntRegs:$Rs, 0, IntRegs:$Rt), 0>,
- Requires<[UseMEMOP]>;
-
-def : InstAlias<"memb($Rs) -= $Rt",
- (L4_sub_memopb_io IntRegs:$Rs, 0, IntRegs:$Rt), 0>,
- Requires<[UseMEMOP]>;
-
-def : InstAlias<"memb($Rs) += #$U5",
- (L4_iadd_memopb_io IntRegs:$Rs, 0, u5_0Imm:$U5), 0>,
- Requires<[UseMEMOP]>;
-
-def : InstAlias<"memb($Rs) -= #$U5",
- (L4_isub_memopb_io IntRegs:$Rs, 0, u5_0Imm:$U5), 0>,
- Requires<[UseMEMOP]>;
-
-def : InstAlias<"memh($Rs) &= $Rt",
- (L4_and_memoph_io IntRegs:$Rs, 0, IntRegs:$Rt), 0>,
- Requires<[UseMEMOP]>;
-
-def : InstAlias<"memh($Rs) |= $Rt",
- (L4_or_memoph_io IntRegs:$Rs, 0, IntRegs:$Rt), 0>,
- Requires<[UseMEMOP]>;
-
-def : InstAlias<"memh($Rs) += $Rt",
- (L4_add_memoph_io IntRegs:$Rs, 0, IntRegs:$Rt), 0>,
- Requires<[UseMEMOP]>;
-
-def : InstAlias<"memh($Rs) -= $Rt",
- (L4_sub_memoph_io IntRegs:$Rs, 0, IntRegs:$Rt), 0>,
- Requires<[UseMEMOP]>;
-
-def : InstAlias<"memh($Rs) += #$U5",
- (L4_iadd_memoph_io IntRegs:$Rs, 0, u5_0Imm:$U5), 0>,
- Requires<[UseMEMOP]>;
-
-def : InstAlias<"memh($Rs) -= #$U5",
- (L4_isub_memoph_io IntRegs:$Rs, 0, u5_0Imm:$U5), 0>,
- Requires<[UseMEMOP]>;
-
-def : InstAlias<"memw($Rs) &= $Rt",
- (L4_and_memopw_io IntRegs:$Rs, 0, IntRegs:$Rt), 0>,
- Requires<[UseMEMOP]>;
-
-def : InstAlias<"memw($Rs) |= $Rt",
- (L4_or_memopw_io IntRegs:$Rs, 0, IntRegs:$Rt), 0>,
- Requires<[UseMEMOP]>;
-
-def : InstAlias<"memw($Rs) += $Rt",
- (L4_add_memopw_io IntRegs:$Rs, 0, IntRegs:$Rt), 0>,
- Requires<[UseMEMOP]>;
-
-def : InstAlias<"memw($Rs) -= $Rt",
- (L4_sub_memopw_io IntRegs:$Rs, 0, IntRegs:$Rt), 0>,
- Requires<[UseMEMOP]>;
-
-def : InstAlias<"memw($Rs) += #$U5",
- (L4_iadd_memopw_io IntRegs:$Rs, 0, u5_0Imm:$U5), 0>,
- Requires<[UseMEMOP]>;
-
-def : InstAlias<"memw($Rs) -= #$U5",
- (L4_isub_memopw_io IntRegs:$Rs, 0, u5_0Imm:$U5), 0>,
- Requires<[UseMEMOP]>;
-
-//
-// Alias of: if ($Pv.new) memX($Rs) = $Rt
-// to: if (p3.new) memX(r17 + #0) = $Rt
-def : InstAlias<"if ($Pv.new) memb($Rs) = $Rt",
- (S4_pstorerbtnew_io PredRegs:$Pv, IntRegs:$Rs, 0, IntRegs:$Rt), 0>;
-
-def : InstAlias<"if ($Pv.new) memh($Rs) = $Rt",
- (S4_pstorerhtnew_io PredRegs:$Pv, IntRegs:$Rs, 0, IntRegs:$Rt), 0>;
-
-def : InstAlias<"if ($Pv.new) memh($Rs) = $Rt.h",
- (S4_pstorerftnew_io PredRegs:$Pv, IntRegs:$Rs, 0, IntRegs:$Rt), 0>;
-
-def : InstAlias<"if ($Pv.new) memw($Rs) = $Rt",
- (S4_pstoreritnew_io PredRegs:$Pv, IntRegs:$Rs, 0, IntRegs:$Rt), 0>;
-
-def : InstAlias<"if ($Pv.new) memd($Rs) = $Rtt",
- (S4_pstorerdtnew_io
- PredRegs:$Pv, IntRegs:$Rs, 0, DoubleRegs:$Rtt), 0>;
-
-def : InstAlias<"if (!$Pv.new) memb($Rs) = $Rt",
- (S4_pstorerbfnew_io PredRegs:$Pv, IntRegs:$Rs, 0, IntRegs:$Rt), 0>;
-
-def : InstAlias<"if (!$Pv.new) memh($Rs) = $Rt",
- (S4_pstorerhfnew_io PredRegs:$Pv, IntRegs:$Rs, 0, IntRegs:$Rt), 0>;
-
-def : InstAlias<"if (!$Pv.new) memh($Rs) = $Rt.h",
- (S4_pstorerffnew_io PredRegs:$Pv, IntRegs:$Rs, 0, IntRegs:$Rt), 0>;
-
-def : InstAlias<"if (!$Pv.new) memw($Rs) = $Rt",
- (S4_pstorerifnew_io PredRegs:$Pv, IntRegs:$Rs, 0, IntRegs:$Rt), 0>;
-
-def : InstAlias<"if (!$Pv.new) memd($Rs) = $Rtt",
- (S4_pstorerdfnew_io
- PredRegs:$Pv, IntRegs:$Rs, 0, DoubleRegs:$Rtt), 0>;
-
-//
-// Alias of: if ($Pt.new) $Rd = memub($Rs) -- And if (!$Pt.new) ...
-// to: if ($Pt.new) $Rd = memub($Rs + #$u6_0)
-def : InstAlias<"if ($Pt.new) $Rd = memub($Rs)",
- (L2_ploadrubtnew_io IntRegs:$Rd, PredRegs:$Pt, IntRegs:$Rs, 0), 0>;
-
-def : InstAlias<"if ($Pt.new) $Rd = memb($Rs)",
- (L2_ploadrbtnew_io IntRegs:$Rd, PredRegs:$Pt, IntRegs:$Rs, 0), 0>;
-
-def : InstAlias<"if ($Pt.new) $Rd = memh($Rs)",
- (L2_ploadrhtnew_io IntRegs:$Rd, PredRegs:$Pt, IntRegs:$Rs, 0), 0>;
-
-def : InstAlias<"if ($Pt.new) $Rd = memuh($Rs)",
- (L2_ploadruhtnew_io IntRegs:$Rd, PredRegs:$Pt, IntRegs:$Rs, 0), 0>;
-
-def : InstAlias<"if ($Pt.new) $Rd = memw($Rs)",
- (L2_ploadritnew_io IntRegs:$Rd, PredRegs:$Pt, IntRegs:$Rs, 0), 0>;
-
-def : InstAlias<"if ($Pt.new) $Rdd = memd($Rs)",
- (L2_ploadrdtnew_io DoubleRegs:$Rdd, PredRegs:$Pt, IntRegs:$Rs, 0), 0>;
-
-def : InstAlias<"if (!$Pt.new) $Rd = memub($Rs)",
- (L2_ploadrubfnew_io IntRegs:$Rd, PredRegs:$Pt, IntRegs:$Rs, 0), 0>;
-
-def : InstAlias<"if (!$Pt.new) $Rd = memb($Rs)",
- (L2_ploadrbfnew_io IntRegs:$Rd, PredRegs:$Pt, IntRegs:$Rs, 0), 0>;
-
-def : InstAlias<"if (!$Pt.new) $Rd = memh($Rs)",
- (L2_ploadrhfnew_io IntRegs:$Rd, PredRegs:$Pt, IntRegs:$Rs, 0), 0>;
-
-def : InstAlias<"if (!$Pt.new) $Rd = memuh($Rs)",
- (L2_ploadruhfnew_io IntRegs:$Rd, PredRegs:$Pt, IntRegs:$Rs, 0), 0>;
-
-def : InstAlias<"if (!$Pt.new) $Rd = memw($Rs)",
- (L2_ploadrifnew_io IntRegs:$Rd, PredRegs:$Pt, IntRegs:$Rs, 0), 0>;
-
-def : InstAlias<"if (!$Pt.new) $Rdd = memd($Rs)",
- (L2_ploadrdfnew_io DoubleRegs:$Rdd, PredRegs:$Pt, IntRegs:$Rs, 0), 0>;
-
-def : InstAlias<"dcfetch($Rs)",
- (Y2_dcfetchbo IntRegs:$Rs, 0), 0>;
-
-// Alias of some insn mappings, others must be handled by the parser
-def : InstAlias<"$Pd=cmp.lt($Rs, $Rt)",
- (C2_cmpgt PredRegs:$Pd, IntRegs:$Rt, IntRegs:$Rs), 0>;
-def : InstAlias<"$Pd=cmp.ltu($Rs, $Rt)",
- (C2_cmpgtu PredRegs:$Pd, IntRegs:$Rt, IntRegs:$Rs), 0>;
-
-// Rd=neg(Rs) is aliased to Rd=sub(#0,Rs)
-def : InstAlias<"$Rd = neg($Rs)",
- (A2_subri IntRegs:$Rd, 0, IntRegs:$Rs), 0>;
-
-def : InstAlias<"m0 = $Rs", (A2_tfrrcr C6, IntRegs:$Rs)>;
-def : InstAlias<"$Rd = m0", (A2_tfrcrr IntRegs:$Rd, C6)>;
-def : InstAlias<"m1 = $Rs", (A2_tfrrcr C7, IntRegs:$Rs)>;
-def : InstAlias<"$Rd = m1", (A2_tfrcrr IntRegs:$Rd, C7)>;
-
-def : InstAlias<"$Pd = $Ps",
- (C2_or PredRegs:$Pd, PredRegs:$Ps, PredRegs:$Ps), 0>;
-
-def : InstAlias<"$Rdd = vaddb($Rss, $Rtt)",
- (A2_vaddub DoubleRegs:$Rdd, DoubleRegs:$Rss, DoubleRegs:$Rtt), 1>;
-
-def : InstAlias<"$Rdd = vsubb($Rss,$Rtt)",
- (A2_vsubub DoubleRegs:$Rdd, DoubleRegs:$Rss, DoubleRegs:$Rtt), 0>;
-
-def : InstAlias<"$Rd = mpyui($Rs,$Rt)",
- (M2_mpyi IntRegs:$Rd, IntRegs:$Rs, IntRegs:$Rt), 0>;
-
-// Assembler mapped insns: cmp.lt(a,b) -> cmp.gt(b,a)
-def : InstAlias<"$Pd=cmp.lt($Rs, $Rt)",
- (C2_cmpgt PredRegs:$Pd, IntRegs:$Rt, IntRegs:$Rs), 0>;
-def : InstAlias<"$Pd=cmp.ltu($Rs, $Rt)",
- (C2_cmpgtu PredRegs:$Pd, IntRegs:$Rt, IntRegs:$Rs), 0>;
-
-// maps if (!Pu) jumpr Rs -> if (!Pu) jumpr:nt Rs
-def : InstAlias<"if (!$Pu) jumpr $Rs",
- (J2_jumprf PredRegs:$Pu, IntRegs:$Rs)>,
- Requires<[HasV60T]>;
-
-// maps if (Pu) jumpr Rs -> if (Pu) jumpr:nt Rs
-def : InstAlias<"if ($Pu) jumpr $Rs",
- (J2_jumprt PredRegs:$Pu, IntRegs:$Rs)>,
- Requires<[HasV60T]>;
-
-// maps if (!Pu) jump $r15_2 -> if (!Pu) jump:nt $r15_2
-def : InstAlias<"if (!$Pu) jump $r15_2",
- (J2_jumpf PredRegs:$Pu, brtarget:$r15_2)>,
- Requires<[HasV60T]>;
-
-// maps if (Pu) jump $r15_2 -> if (Pu) jump:nt $r15_2
-def : InstAlias<"if ($Pu) jump $r15_2",
- (J2_jumpt PredRegs:$Pu, brtarget:$r15_2)>,
- Requires<[HasV60T]>;
-
-def : InstAlias<"if ($src) jump $r15_2",
- (J2_jumpt PredRegs:$src, brtarget:$r15_2), 0>;
-
-def : InstAlias<"if (!$src) jump $r15_2",
- (J2_jumpf PredRegs:$src, brtarget:$r15_2), 0>;
-
-def : InstAlias<"if ($src1) jumpr $src2",
- (J2_jumprt PredRegs:$src1, IntRegs:$src2), 0>;
-
-def : InstAlias<"if (!$src1) jumpr $src2",
- (J2_jumprf PredRegs:$src1, IntRegs:$src2), 0>;
-
-// maps Vdd = Vss to Vdd = V6_vassignp(Vss)
-def : InstAlias<"$Vdd = $Vss",
- (V6_vassignp VecDblRegs:$Vdd, VecDblRegs:$Vss)>,
- Requires<[HasV60T]>;
-
-// maps Vd = #0 to Vd = vxor(Vd, Vd)
-def : InstAlias<"$Vd = #0",
- (V6_vxor VectorRegs:$Vd, VectorRegs:$Vd, VectorRegs:$Vd)>,
- Requires<[HasV60T]>;
-
-// maps Vdd = #0 to Vdd = vsub(Vdd, Vdd)
-def : InstAlias<"$Vdd = #0",
- (V6_vsubw_dv VecDblRegs:$Vdd, VecDblRegs:$Vdd, VecDblRegs:$Vdd)>,
- Requires<[HasV60T]>;
-
-// maps "$Qd = vcmp.eq($Vu.uh, $Vv.uh)" -> "$Qd = vcmp.eq($Vu.h, $Vv.h)"
-def : InstAlias<"$Qd = vcmp.eq($Vu.uh, $Vv.uh)",
- (V6_veqh VecPredRegs:$Qd, VectorRegs:$Vu, VectorRegs:$Vv)>,
- Requires<[HasV60T]>;
-
-// maps "$Qd &= vcmp.eq($Vu.uh, $Vv.uh)" -> "$Qd &= vcmp.eq($Vu.h, $Vv.h)"
-def : InstAlias<"$Qd &= vcmp.eq($Vu.uh, $Vv.uh)",
- (V6_veqh_and VecPredRegs:$Qd, VectorRegs:$Vu, VectorRegs:$Vv)>,
- Requires<[HasV60T]>;
-
-// maps "$Qd |= vcmp.eq($Vu.uh, $Vv.uh)" -> "$Qd |= vcmp.eq($Vu.h, $Vv.h)"
-def : InstAlias<"$Qd |= vcmp.eq($Vu.uh, $Vv.uh)",
- (V6_veqh_or VecPredRegs:$Qd, VectorRegs:$Vu, VectorRegs:$Vv)>,
- Requires<[HasV60T]>;
-
-// maps "$Qd ^= vcmp.eq($Vu.uh, $Vv.uh)" -> "$Qd ^= vcmp.eq($Vu.h, $Vv.h)"
-def : InstAlias<"$Qd ^= vcmp.eq($Vu.uh, $Vv.uh)",
- (V6_veqh_xor VecPredRegs:$Qd, VectorRegs:$Vu, VectorRegs:$Vv)>,
- Requires<[HasV60T]>;
-
-// maps "$Qd = vcmp.eq($Vu.uw, $Vv.uw)" -> "$Qd = vcmp.eq($Vu.w, $Vv.w)"
-def : InstAlias<"$Qd = vcmp.eq($Vu.uw, $Vv.uw)",
- (V6_veqw VecPredRegs:$Qd, VectorRegs:$Vu, VectorRegs:$Vv)>,
- Requires<[HasV60T]>;
-
-// maps "$Qd &= vcmp.eq($Vu.uw, $Vv.uw)" -> "$Qd &= vcmp.eq($Vu.w, $Vv.w)"
-def : InstAlias<"$Qd &= vcmp.eq($Vu.uw, $Vv.uw)",
- (V6_veqw_and VecPredRegs:$Qd, VectorRegs:$Vu, VectorRegs:$Vv)>,
- Requires<[HasV60T]>;
-
-// maps "$Qd |= vcmp.eq($Vu.uw, $Vv.uw)" -> "$Qd |= vcmp.eq($Vu.w, $Vv.w)"
-def : InstAlias<"$Qd |= vcmp.eq($Vu.uw, $Vv.uw)",
- (V6_veqh_or VecPredRegs:$Qd, VectorRegs:$Vu, VectorRegs:$Vv)>,
- Requires<[HasV60T]>;
-
-// maps "$Qd ^= vcmp.eq($Vu.uw, $Vv.uw)" -> "$Qd ^= vcmp.eq($Vu.w, $Vv.w)"
-def : InstAlias<"$Qd ^= vcmp.eq($Vu.uw, $Vv.uw)",
- (V6_veqw_xor VecPredRegs:$Qd, VectorRegs:$Vu, VectorRegs:$Vv)>,
- Requires<[HasV60T]>;
-
-// maps "$Qd = vcmp.eq($Vu.ub, $Vv.ub)" -> "$Qd = vcmp.eq($Vu.b, $Vv.b)"
-def : InstAlias<"$Qd = vcmp.eq($Vu.ub, $Vv.ub)",
- (V6_veqb VecPredRegs:$Qd, VectorRegs:$Vu, VectorRegs:$Vv)>,
- Requires<[HasV60T]>;
-
-// maps "$Qd &= vcmp.eq($Vu.ub, $Vv.ub)" -> "$Qd &= vcmp.eq($Vu.b, $Vv.b)"
-def : InstAlias<"$Qd &= vcmp.eq($Vu.ub, $Vv.ub)",
- (V6_veqb_and VecPredRegs:$Qd, VectorRegs:$Vu, VectorRegs:$Vv)>,
- Requires<[HasV60T]>;
-
-// maps "$Qd |= vcmp.eq($Vu.ub, $Vv.ub)" -> "$Qd |= vcmp.eq($Vu.b, $Vv.b)"
-def : InstAlias<"$Qd |= vcmp.eq($Vu.ub, $Vv.ub)",
- (V6_veqb_or VecPredRegs:$Qd, VectorRegs:$Vu, VectorRegs:$Vv)>,
- Requires<[HasV60T]>;
-
-// maps "$Qd ^= vcmp.eq($Vu.ub, $Vv.ub)" -> "$Qd ^= vcmp.eq($Vu.b, $Vv.b)"
-def : InstAlias<"$Qd ^= vcmp.eq($Vu.ub, $Vv.ub)",
- (V6_veqb_xor VecPredRegs:$Qd, VectorRegs:$Vu, VectorRegs:$Vv)>,
- Requires<[HasV60T]>;
-
-// maps "$Rd.w = vextract($Vu, $Rs)" -> "$Rd = vextract($Vu, $Rs)"
-def : InstAlias<"$Rd.w = vextract($Vu, $Rs)",
- (V6_extractw IntRegs:$Rd, VectorRegs:$Vu, IntRegs:$Rs)>,
- Requires<[HasV60T]>;
-
-// Mapping from vtrans2x2(Vy32,Vx32,Rt32) to vshuff(Vy32,Vx32,Rt32)
-def : InstAlias<"vtrans2x2($Vy, $Vx, $Rt)",
- (V6_vshuff VectorRegs:$Vy, VectorRegs:$Vx, IntRegs:$Rt)>,
- Requires<[HasV60T]>;
-
-def : InstAlias<"$Vt=vmem($Rs)",
- (V6_vL32b_ai VectorRegs:$Vt, IntRegs:$Rs, 0)>,
- Requires<[HasV60T]>;
-
-def : InstAlias<"$Vt=vmem($Rs):nt",
- (V6_vL32b_nt_ai VectorRegs:$Vt, IntRegs:$Rs, 0)>,
- Requires<[HasV60T]>;
-
-def : InstAlias<"vmem($Rs)=$Vt",
- (V6_vS32b_ai IntRegs:$Rs, 0, VectorRegs:$Vt)>,
- Requires<[HasV60T]>;
-
-def : InstAlias<"vmem($Rs):nt=$Vt",
- (V6_vS32b_nt_ai IntRegs:$Rs, 0, VectorRegs:$Vt)>,
- Requires<[HasV60T]>;
-
-def : InstAlias<"vmem($Rs)=$Vt.new",
- (V6_vS32b_new_ai IntRegs:$Rs, 0, VectorRegs:$Vt)>,
- Requires<[HasV60T]>;
-
-def : InstAlias<"vmem($Rs):nt=$Vt.new",
- (V6_vS32b_nt_new_ai IntRegs:$Rs, 0, VectorRegs:$Vt)>,
- Requires<[HasV60T]>;
-
-def : InstAlias<"if ($Qv) vmem($Rs)=$Vt",
- (V6_vS32b_qpred_ai VecPredRegs:$Qv, IntRegs:$Rs, 0, VectorRegs:$Vt)>,
- Requires<[HasV60T]>;
-
-def : InstAlias<"if (!$Qv) vmem($Rs)=$Vt",
- (V6_vS32b_nqpred_ai VecPredRegs:$Qv, IntRegs:$Rs, 0, VectorRegs:$Vt)>,
- Requires<[HasV60T]>;
-
-def : InstAlias<"if ($Qv) vmem($Rs):nt=$Vt",
- (V6_vS32b_nt_qpred_ai VecPredRegs:$Qv, IntRegs:$Rs, 0, VectorRegs:$Vt)>,
- Requires<[HasV60T]>;
-
-def : InstAlias<"if (!$Qv) vmem($Rs):nt=$Vt",
- (V6_vS32b_nt_nqpred_ai VecPredRegs:$Qv, IntRegs:$Rs, 0, VectorRegs:$Vt)>,
- Requires<[HasV60T]>;
-
-def : InstAlias<"if ($Pv) vmem($Rs)=$Vt",
- (V6_vS32b_pred_ai PredRegs:$Pv, IntRegs:$Rs, 0, VectorRegs:$Vt)>,
- Requires<[HasV60T]>;
-
-def : InstAlias<"if (!$Pv) vmem($Rs)=$Vt",
- (V6_vS32b_npred_ai PredRegs:$Pv, IntRegs:$Rs, 0, VectorRegs:$Vt)>,
- Requires<[HasV60T]>;
-
-def : InstAlias<"if ($Pv) vmem($Rs):nt=$Vt",
- (V6_vS32b_nt_pred_ai PredRegs:$Pv, IntRegs:$Rs, 0, VectorRegs:$Vt)>,
- Requires<[HasV60T]>;
-
-def : InstAlias<"if (!$Pv) vmem($Rs):nt=$Vt",
- (V6_vS32b_nt_npred_ai PredRegs:$Pv, IntRegs:$Rs, 0, VectorRegs:$Vt)>,
- Requires<[HasV60T]>;
-
-def : InstAlias<"$Vt=vmemu($Rs)",
- (V6_vL32Ub_ai VectorRegs:$Vt, IntRegs:$Rs, 0)>,
- Requires<[HasV60T]>;
-
-def : InstAlias<"vmemu($Rs)=$Vt",
- (V6_vS32Ub_ai IntRegs:$Rs, 0, VectorRegs:$Vt)>,
- Requires<[HasV60T]>;
-
-def : InstAlias<"if ($Pv) vmemu($Rs)=$Vt",
- (V6_vS32Ub_pred_ai PredRegs:$Pv, IntRegs:$Rs, 0, VectorRegs:$Vt)>,
- Requires<[HasV60T]>;
-
-def : InstAlias<"if (!$Pv) vmemu($Rs)=$Vt",
- (V6_vS32Ub_npred_ai PredRegs:$Pv, IntRegs:$Rs, 0, VectorRegs:$Vt)>,
- Requires<[HasV60T]>;
-
-
diff --git a/lib/Target/Hexagon/HexagonInstrEnc.td b/lib/Target/Hexagon/HexagonInstrEnc.td
deleted file mode 100644
index 280832fd167f..000000000000
--- a/lib/Target/Hexagon/HexagonInstrEnc.td
+++ /dev/null
@@ -1,1019 +0,0 @@
-class Enc_COPROC_VX_3op_v<bits<15> opc> : OpcodeHexagon {
- bits<5> dst;
- bits<5> src1;
- bits<5> src2;
-
- let Inst{31-16} = { opc{14-4}, src2};
- let Inst{13-0} = { opc{3}, src1, opc{2-0}, dst};
-}
-
-class V6_vtmpyb_enc : Enc_COPROC_VX_3op_v<0b000110010000000>;
-class V6_vtmpybus_enc : Enc_COPROC_VX_3op_v<0b000110010000001>;
-class V6_vdmpyhb_enc : Enc_COPROC_VX_3op_v<0b000110010000010>;
-class V6_vrmpyub_enc : Enc_COPROC_VX_3op_v<0b000110010000011>;
-class V6_vrmpybus_enc : Enc_COPROC_VX_3op_v<0b000110010000100>;
-class V6_vdsaduh_enc : Enc_COPROC_VX_3op_v<0b000110010000101>;
-class V6_vdmpybus_enc : Enc_COPROC_VX_3op_v<0b000110010000110>;
-class V6_vdmpybus_dv_enc : Enc_COPROC_VX_3op_v<0b000110010000111>;
-class V6_vtmpyb_acc_enc : Enc_COPROC_VX_3op_v<0b000110010001000>;
-class V6_vtmpybus_acc_enc : Enc_COPROC_VX_3op_v<0b000110010001001>;
-class V6_vtmpyhb_acc_enc : Enc_COPROC_VX_3op_v<0b000110010001010>;
-class V6_vdmpyhb_acc_enc : Enc_COPROC_VX_3op_v<0b000110010001011>;
-class V6_vrmpyub_acc_enc : Enc_COPROC_VX_3op_v<0b000110010001100>;
-class V6_vrmpybus_acc_enc : Enc_COPROC_VX_3op_v<0b000110010001101>;
-class V6_vdmpybus_acc_enc : Enc_COPROC_VX_3op_v<0b000110010001110>;
-class V6_vdmpybus_dv_acc_enc : Enc_COPROC_VX_3op_v<0b000110010001111>;
-class V6_vdmpyhsusat_enc : Enc_COPROC_VX_3op_v<0b000110010010000>;
-class V6_vdmpyhsuisat_enc : Enc_COPROC_VX_3op_v<0b000110010010001>;
-class V6_vdmpyhsat_enc : Enc_COPROC_VX_3op_v<0b000110010010010>;
-class V6_vdmpyhisat_enc : Enc_COPROC_VX_3op_v<0b000110010010011>;
-class V6_vdmpyhb_dv_enc : Enc_COPROC_VX_3op_v<0b000110010010100>;
-class V6_vmpybus_enc : Enc_COPROC_VX_3op_v<0b000110010010101>;
-class V6_vmpabus_enc : Enc_COPROC_VX_3op_v<0b000110010010110>;
-class V6_vmpahb_enc : Enc_COPROC_VX_3op_v<0b000110010010111>;
-class V6_vdmpyhsusat_acc_enc : Enc_COPROC_VX_3op_v<0b000110010011000>;
-class V6_vdmpyhsuisat_acc_enc : Enc_COPROC_VX_3op_v<0b000110010011001>;
-class V6_vdmpyhisat_acc_enc : Enc_COPROC_VX_3op_v<0b000110010011010>;
-class V6_vdmpyhsat_acc_enc : Enc_COPROC_VX_3op_v<0b000110010011011>;
-class V6_vdmpyhb_dv_acc_enc : Enc_COPROC_VX_3op_v<0b000110010011100>;
-class V6_vmpybus_acc_enc : Enc_COPROC_VX_3op_v<0b000110010011101>;
-class V6_vmpabus_acc_enc : Enc_COPROC_VX_3op_v<0b000110010011110>;
-class V6_vmpahb_acc_enc : Enc_COPROC_VX_3op_v<0b000110010011111>;
-class V6_vmpyh_enc : Enc_COPROC_VX_3op_v<0b000110010100000>;
-class V6_vmpyhss_enc : Enc_COPROC_VX_3op_v<0b000110010100001>;
-class V6_vmpyhsrs_enc : Enc_COPROC_VX_3op_v<0b000110010100010>;
-class V6_vmpyuh_enc : Enc_COPROC_VX_3op_v<0b000110010100011>;
-class V6_vmpyhsat_acc_enc : Enc_COPROC_VX_3op_v<0b000110010101000>;
-class V6_vmpyuh_acc_enc : Enc_COPROC_VX_3op_v<0b000110010101001>;
-class V6_vmpyiwb_acc_enc : Enc_COPROC_VX_3op_v<0b000110010101010>;
-class V6_vmpyiwh_acc_enc : Enc_COPROC_VX_3op_v<0b000110010101011>;
-class V6_vmpyihb_enc : Enc_COPROC_VX_3op_v<0b000110010110000>;
-class V6_vror_enc : Enc_COPROC_VX_3op_v<0b000110010110001>;
-class V6_vasrw_enc : Enc_COPROC_VX_3op_v<0b000110010110101>;
-class V6_vasrh_enc : Enc_COPROC_VX_3op_v<0b000110010110110>;
-class V6_vaslw_enc : Enc_COPROC_VX_3op_v<0b000110010110111>;
-class V6_vdsaduh_acc_enc : Enc_COPROC_VX_3op_v<0b000110010111000>;
-class V6_vmpyihb_acc_enc : Enc_COPROC_VX_3op_v<0b000110010111001>;
-class V6_vaslw_acc_enc : Enc_COPROC_VX_3op_v<0b000110010111010>;
-class V6_vasrw_acc_enc : Enc_COPROC_VX_3op_v<0b000110010111101>;
-class V6_vaslh_enc : Enc_COPROC_VX_3op_v<0b000110011000000>;
-class V6_vlsrw_enc : Enc_COPROC_VX_3op_v<0b000110011000001>;
-class V6_vlsrh_enc : Enc_COPROC_VX_3op_v<0b000110011000010>;
-class V6_vmpyiwh_enc : Enc_COPROC_VX_3op_v<0b000110011000111>;
-class V6_vmpyub_acc_enc : Enc_COPROC_VX_3op_v<0b000110011001000>;
-class V6_vmpyiwb_enc : Enc_COPROC_VX_3op_v<0b000110011010000>;
-class V6_vtmpyhb_enc : Enc_COPROC_VX_3op_v<0b000110011010100>;
-class V6_vmpyub_enc : Enc_COPROC_VX_3op_v<0b000110011100000>;
-class V6_vrmpyubv_enc : Enc_COPROC_VX_3op_v<0b000111000000000>;
-class V6_vrmpybv_enc : Enc_COPROC_VX_3op_v<0b000111000000001>;
-class V6_vrmpybusv_enc : Enc_COPROC_VX_3op_v<0b000111000000010>;
-class V6_vdmpyhvsat_enc : Enc_COPROC_VX_3op_v<0b000111000000011>;
-class V6_vmpybv_enc : Enc_COPROC_VX_3op_v<0b000111000000100>;
-class V6_vmpyubv_enc : Enc_COPROC_VX_3op_v<0b000111000000101>;
-class V6_vmpybusv_enc : Enc_COPROC_VX_3op_v<0b000111000000110>;
-class V6_vmpyhv_enc : Enc_COPROC_VX_3op_v<0b000111000000111>;
-class V6_vrmpyubv_acc_enc : Enc_COPROC_VX_3op_v<0b000111000001000>;
-class V6_vrmpybv_acc_enc : Enc_COPROC_VX_3op_v<0b000111000001001>;
-class V6_vrmpybusv_acc_enc : Enc_COPROC_VX_3op_v<0b000111000001010>;
-class V6_vdmpyhvsat_acc_enc : Enc_COPROC_VX_3op_v<0b000111000001011>;
-class V6_vmpybv_acc_enc : Enc_COPROC_VX_3op_v<0b000111000001100>;
-class V6_vmpyubv_acc_enc : Enc_COPROC_VX_3op_v<0b000111000001101>;
-class V6_vmpybusv_acc_enc : Enc_COPROC_VX_3op_v<0b000111000001110>;
-class V6_vmpyhv_acc_enc : Enc_COPROC_VX_3op_v<0b000111000001111>;
-class V6_vmpyuhv_enc : Enc_COPROC_VX_3op_v<0b000111000010000>;
-class V6_vmpyhvsrs_enc : Enc_COPROC_VX_3op_v<0b000111000010001>;
-class V6_vmpyhus_enc : Enc_COPROC_VX_3op_v<0b000111000010010>;
-class V6_vmpabusv_enc : Enc_COPROC_VX_3op_v<0b000111000010011>;
-class V6_vmpyih_enc : Enc_COPROC_VX_3op_v<0b000111000010100>;
-class V6_vand_enc : Enc_COPROC_VX_3op_v<0b000111000010101>;
-class V6_vor_enc : Enc_COPROC_VX_3op_v<0b000111000010110>;
-class V6_vxor_enc : Enc_COPROC_VX_3op_v<0b000111000010111>;
-class V6_vmpyuhv_acc_enc : Enc_COPROC_VX_3op_v<0b000111000011000>;
-class V6_vmpyhus_acc_enc : Enc_COPROC_VX_3op_v<0b000111000011001>;
-class V6_vmpyih_acc_enc : Enc_COPROC_VX_3op_v<0b000111000011100>;
-class V6_vmpyiewuh_acc_enc : Enc_COPROC_VX_3op_v<0b000111000011101>;
-class V6_vmpyowh_sacc_enc : Enc_COPROC_VX_3op_v<0b000111000011110>;
-class V6_vmpyowh_rnd_sacc_enc : Enc_COPROC_VX_3op_v<0b000111000011111>;
-class V6_vaddw_enc : Enc_COPROC_VX_3op_v<0b000111000100000>;
-class V6_vaddubsat_enc : Enc_COPROC_VX_3op_v<0b000111000100001>;
-class V6_vadduhsat_enc : Enc_COPROC_VX_3op_v<0b000111000100010>;
-class V6_vaddhsat_enc : Enc_COPROC_VX_3op_v<0b000111000100011>;
-class V6_vaddwsat_enc : Enc_COPROC_VX_3op_v<0b000111000100100>;
-class V6_vsubb_enc : Enc_COPROC_VX_3op_v<0b000111000100101>;
-class V6_vsubh_enc : Enc_COPROC_VX_3op_v<0b000111000100110>;
-class V6_vsubw_enc : Enc_COPROC_VX_3op_v<0b000111000100111>;
-class V6_vmpyiewh_acc_enc : Enc_COPROC_VX_3op_v<0b000111000101000>;
-class V6_vsububsat_enc : Enc_COPROC_VX_3op_v<0b000111000110000>;
-class V6_vsubuhsat_enc : Enc_COPROC_VX_3op_v<0b000111000110001>;
-class V6_vsubhsat_enc : Enc_COPROC_VX_3op_v<0b000111000110010>;
-class V6_vsubwsat_enc : Enc_COPROC_VX_3op_v<0b000111000110011>;
-class V6_vaddb_dv_enc : Enc_COPROC_VX_3op_v<0b000111000110100>;
-class V6_vaddh_dv_enc : Enc_COPROC_VX_3op_v<0b000111000110101>;
-class V6_vaddw_dv_enc : Enc_COPROC_VX_3op_v<0b000111000110110>;
-class V6_vaddubsat_dv_enc : Enc_COPROC_VX_3op_v<0b000111000110111>;
-class V6_vadduhsat_dv_enc : Enc_COPROC_VX_3op_v<0b000111001000000>;
-class V6_vaddhsat_dv_enc : Enc_COPROC_VX_3op_v<0b000111001000001>;
-class V6_vaddwsat_dv_enc : Enc_COPROC_VX_3op_v<0b000111001000010>;
-class V6_vsubb_dv_enc : Enc_COPROC_VX_3op_v<0b000111001000011>;
-class V6_vsubh_dv_enc : Enc_COPROC_VX_3op_v<0b000111001000100>;
-class V6_vsubw_dv_enc : Enc_COPROC_VX_3op_v<0b000111001000101>;
-class V6_vsububsat_dv_enc : Enc_COPROC_VX_3op_v<0b000111001000110>;
-class V6_vsubuhsat_dv_enc : Enc_COPROC_VX_3op_v<0b000111001000111>;
-class V6_vsubhsat_dv_enc : Enc_COPROC_VX_3op_v<0b000111001010000>;
-class V6_vsubwsat_dv_enc : Enc_COPROC_VX_3op_v<0b000111001010001>;
-class V6_vaddubh_enc : Enc_COPROC_VX_3op_v<0b000111001010010>;
-class V6_vadduhw_enc : Enc_COPROC_VX_3op_v<0b000111001010011>;
-class V6_vaddhw_enc : Enc_COPROC_VX_3op_v<0b000111001010100>;
-class V6_vsububh_enc : Enc_COPROC_VX_3op_v<0b000111001010101>;
-class V6_vsubuhw_enc : Enc_COPROC_VX_3op_v<0b000111001010110>;
-class V6_vsubhw_enc : Enc_COPROC_VX_3op_v<0b000111001010111>;
-class V6_vabsdiffub_enc : Enc_COPROC_VX_3op_v<0b000111001100000>;
-class V6_vabsdiffh_enc : Enc_COPROC_VX_3op_v<0b000111001100001>;
-class V6_vabsdiffuh_enc : Enc_COPROC_VX_3op_v<0b000111001100010>;
-class V6_vabsdiffw_enc : Enc_COPROC_VX_3op_v<0b000111001100011>;
-class V6_vavgub_enc : Enc_COPROC_VX_3op_v<0b000111001100100>;
-class V6_vavguh_enc : Enc_COPROC_VX_3op_v<0b000111001100101>;
-class V6_vavgh_enc : Enc_COPROC_VX_3op_v<0b000111001100110>;
-class V6_vavgw_enc : Enc_COPROC_VX_3op_v<0b000111001100111>;
-class V6_vnavgub_enc : Enc_COPROC_VX_3op_v<0b000111001110000>;
-class V6_vnavgh_enc : Enc_COPROC_VX_3op_v<0b000111001110001>;
-class V6_vnavgw_enc : Enc_COPROC_VX_3op_v<0b000111001110010>;
-class V6_vavgubrnd_enc : Enc_COPROC_VX_3op_v<0b000111001110011>;
-class V6_vavguhrnd_enc : Enc_COPROC_VX_3op_v<0b000111001110100>;
-class V6_vavghrnd_enc : Enc_COPROC_VX_3op_v<0b000111001110101>;
-class V6_vavgwrnd_enc : Enc_COPROC_VX_3op_v<0b000111001110110>;
-class V6_vmpabuuv_enc : Enc_COPROC_VX_3op_v<0b000111001110111>;
-class V6_vminub_enc : Enc_COPROC_VX_3op_v<0b000111110000001>;
-class V6_vminuh_enc : Enc_COPROC_VX_3op_v<0b000111110000010>;
-class V6_vminh_enc : Enc_COPROC_VX_3op_v<0b000111110000011>;
-class V6_vminw_enc : Enc_COPROC_VX_3op_v<0b000111110000100>;
-class V6_vmaxub_enc : Enc_COPROC_VX_3op_v<0b000111110000101>;
-class V6_vmaxuh_enc : Enc_COPROC_VX_3op_v<0b000111110000110>;
-class V6_vmaxh_enc : Enc_COPROC_VX_3op_v<0b000111110000111>;
-class V6_vmaxw_enc : Enc_COPROC_VX_3op_v<0b000111110010000>;
-class V6_vdelta_enc : Enc_COPROC_VX_3op_v<0b000111110010001>;
-class V6_vrdelta_enc : Enc_COPROC_VX_3op_v<0b000111110010011>;
-class V6_vdealb4w_enc : Enc_COPROC_VX_3op_v<0b000111110010111>;
-class V6_vmpyowh_rnd_enc : Enc_COPROC_VX_3op_v<0b000111110100000>;
-class V6_vshuffeb_enc : Enc_COPROC_VX_3op_v<0b000111110100001>;
-class V6_vshuffob_enc : Enc_COPROC_VX_3op_v<0b000111110100010>;
-class V6_vshufeh_enc : Enc_COPROC_VX_3op_v<0b000111110100011>;
-class V6_vshufoh_enc : Enc_COPROC_VX_3op_v<0b000111110100100>;
-class V6_vshufoeh_enc : Enc_COPROC_VX_3op_v<0b000111110100101>;
-class V6_vshufoeb_enc : Enc_COPROC_VX_3op_v<0b000111110100110>;
-class V6_vcombine_enc : Enc_COPROC_VX_3op_v<0b000111110100111>;
-class V6_vmpyieoh_enc : Enc_COPROC_VX_3op_v<0b000111110110000>;
-class V6_vsathub_enc : Enc_COPROC_VX_3op_v<0b000111110110010>;
-class V6_vsatwh_enc : Enc_COPROC_VX_3op_v<0b000111110110011>;
-class V6_vroundwh_enc : Enc_COPROC_VX_3op_v<0b000111110110100>;
-class V6_vroundwuh_enc : Enc_COPROC_VX_3op_v<0b000111110110101>;
-class V6_vroundhb_enc : Enc_COPROC_VX_3op_v<0b000111110110110>;
-class V6_vroundhub_enc : Enc_COPROC_VX_3op_v<0b000111110110111>;
-class V6_vasrwv_enc : Enc_COPROC_VX_3op_v<0b000111111010000>;
-class V6_vlsrwv_enc : Enc_COPROC_VX_3op_v<0b000111111010001>;
-class V6_vlsrhv_enc : Enc_COPROC_VX_3op_v<0b000111111010010>;
-class V6_vasrhv_enc : Enc_COPROC_VX_3op_v<0b000111111010011>;
-class V6_vaslwv_enc : Enc_COPROC_VX_3op_v<0b000111111010100>;
-class V6_vaslhv_enc : Enc_COPROC_VX_3op_v<0b000111111010101>;
-class V6_vaddb_enc : Enc_COPROC_VX_3op_v<0b000111111010110>;
-class V6_vaddh_enc : Enc_COPROC_VX_3op_v<0b000111111010111>;
-class V6_vmpyiewuh_enc : Enc_COPROC_VX_3op_v<0b000111111100000>;
-class V6_vmpyiowh_enc : Enc_COPROC_VX_3op_v<0b000111111100001>;
-class V6_vpackeb_enc : Enc_COPROC_VX_3op_v<0b000111111100010>;
-class V6_vpackeh_enc : Enc_COPROC_VX_3op_v<0b000111111100011>;
-class V6_vpackhub_sat_enc : Enc_COPROC_VX_3op_v<0b000111111100101>;
-class V6_vpackhb_sat_enc : Enc_COPROC_VX_3op_v<0b000111111100110>;
-class V6_vpackwuh_sat_enc : Enc_COPROC_VX_3op_v<0b000111111100111>;
-class V6_vpackwh_sat_enc : Enc_COPROC_VX_3op_v<0b000111111110000>;
-class V6_vpackob_enc : Enc_COPROC_VX_3op_v<0b000111111110001>;
-class V6_vpackoh_enc : Enc_COPROC_VX_3op_v<0b000111111110010>;
-class V6_vmpyewuh_enc : Enc_COPROC_VX_3op_v<0b000111111110101>;
-class V6_vmpyowh_enc : Enc_COPROC_VX_3op_v<0b000111111110111>;
-class V6_extractw_enc : Enc_COPROC_VX_3op_v<0b100100100000001>;
-class M6_vabsdiffub_enc : Enc_COPROC_VX_3op_v<0b111010001010000>;
-class M6_vabsdiffb_enc : Enc_COPROC_VX_3op_v<0b111010001110000>;
-
-class Enc_COPROC_VX_cmp<bits<13> opc> : OpcodeHexagon {
- bits<2> dst;
- bits<5> src1;
- bits<5> src2;
-
- let Inst{31-16} = { 0b00011, opc{12-7}, src2{4-0} };
- let Inst{13-0} = { opc{6}, src1{4-0}, opc{5-0}, dst{1-0} };
-}
-
-class V6_vandvrt_acc_enc : Enc_COPROC_VX_cmp<0b0010111100000>;
-class V6_vandvrt_enc : Enc_COPROC_VX_cmp<0b0011010010010>;
-class V6_veqb_and_enc : Enc_COPROC_VX_cmp<0b1001001000000>;
-class V6_veqh_and_enc : Enc_COPROC_VX_cmp<0b1001001000001>;
-class V6_veqw_and_enc : Enc_COPROC_VX_cmp<0b1001001000010>;
-class V6_vgtb_and_enc : Enc_COPROC_VX_cmp<0b1001001000100>;
-class V6_vgth_and_enc : Enc_COPROC_VX_cmp<0b1001001000101>;
-class V6_vgtw_and_enc : Enc_COPROC_VX_cmp<0b1001001000110>;
-class V6_vgtub_and_enc : Enc_COPROC_VX_cmp<0b1001001001000>;
-class V6_vgtuh_and_enc : Enc_COPROC_VX_cmp<0b1001001001001>;
-class V6_vgtuw_and_enc : Enc_COPROC_VX_cmp<0b1001001001010>;
-class V6_veqb_or_enc : Enc_COPROC_VX_cmp<0b1001001010000>;
-class V6_veqh_or_enc : Enc_COPROC_VX_cmp<0b1001001010001>;
-class V6_veqw_or_enc : Enc_COPROC_VX_cmp<0b1001001010010>;
-class V6_vgtb_or_enc : Enc_COPROC_VX_cmp<0b1001001010100>;
-class V6_vgth_or_enc : Enc_COPROC_VX_cmp<0b1001001010101>;
-class V6_vgtw_or_enc : Enc_COPROC_VX_cmp<0b1001001010110>;
-class V6_vgtub_or_enc : Enc_COPROC_VX_cmp<0b1001001011000>;
-class V6_vgtuh_or_enc : Enc_COPROC_VX_cmp<0b1001001011001>;
-class V6_vgtuw_or_enc : Enc_COPROC_VX_cmp<0b1001001011010>;
-class V6_veqb_xor_enc : Enc_COPROC_VX_cmp<0b1001001100000>;
-class V6_veqh_xor_enc : Enc_COPROC_VX_cmp<0b1001001100001>;
-class V6_veqw_xor_enc : Enc_COPROC_VX_cmp<0b1001001100010>;
-class V6_vgtb_xor_enc : Enc_COPROC_VX_cmp<0b1001001100100>;
-class V6_vgth_xor_enc : Enc_COPROC_VX_cmp<0b1001001100101>;
-class V6_vgtw_xor_enc : Enc_COPROC_VX_cmp<0b1001001100110>;
-class V6_vgtub_xor_enc : Enc_COPROC_VX_cmp<0b1001001101000>;
-class V6_vgtuh_xor_enc : Enc_COPROC_VX_cmp<0b1001001101001>;
-class V6_vgtuw_xor_enc : Enc_COPROC_VX_cmp<0b1001001101010>;
-class V6_veqb_enc : Enc_COPROC_VX_cmp<0b1111000000000>;
-class V6_veqh_enc : Enc_COPROC_VX_cmp<0b1111000000001>;
-class V6_veqw_enc : Enc_COPROC_VX_cmp<0b1111000000010>;
-class V6_vgtb_enc : Enc_COPROC_VX_cmp<0b1111000000100>;
-class V6_vgth_enc : Enc_COPROC_VX_cmp<0b1111000000101>;
-class V6_vgtw_enc : Enc_COPROC_VX_cmp<0b1111000000110>;
-class V6_vgtub_enc : Enc_COPROC_VX_cmp<0b1111000001000>;
-class V6_vgtuh_enc : Enc_COPROC_VX_cmp<0b1111000001001>;
-class V6_vgtuw_enc : Enc_COPROC_VX_cmp<0b1111000001010>;
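The Enc_COPROC_VX_cmp class above splices a 13-bit opcode and three operand fields into the two instruction half-words. A hypothetical C++ sketch of that packing (the helper name and signature are illustrative only, not part of this patch or of LLVM's MC emitter):

```cpp
#include <cstdint>

// Hypothetical helper mirroring Enc_COPROC_VX_cmp's bit layout: 13 opcode
// bits plus dst/src1/src2 packed into the 32-bit instruction word.
uint32_t encodeVXCmp(uint16_t Opc, uint8_t Dst, uint8_t Src1, uint8_t Src2) {
  uint32_t Hi = (0b00011u << 11)            // Inst{31-27}
              | (((Opc >> 7) & 0x3Fu) << 5) // Inst{26-21} = opc{12-7}
              | (Src2 & 0x1Fu);             // Inst{20-16} = src2{4-0}
  uint32_t Lo = (((Opc >> 6) & 0x1u) << 13) // Inst{13}    = opc{6}
              | ((Src1 & 0x1Fu) << 8)       // Inst{12-8}  = src1{4-0}
              | ((Opc & 0x3Fu) << 2)        // Inst{7-2}   = opc{5-0}
              | (Dst & 0x3u);               // Inst{1-0}   = dst{1-0}
  return (Hi << 16) | Lo; // Inst{15-14} are left for the packet/parse bits
}
```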
-
-class Enc_COPROC_VX_p2op<bits<5> opc> : OpcodeHexagon {
- bits<2> src1;
- bits<5> dst;
- bits<5> src2;
-
- let Inst{31-16} = { 0b00011110, src1{1-0}, 0b0000, opc{4-3} };
- let Inst{13-0} = { 1, src2{4-0}, opc{2-0}, dst{4-0} };
-}
-
-class V6_vaddbq_enc : Enc_COPROC_VX_p2op<0b01000>;
-class V6_vaddhq_enc : Enc_COPROC_VX_p2op<0b01001>;
-class V6_vaddwq_enc : Enc_COPROC_VX_p2op<0b01010>;
-class V6_vaddbnq_enc : Enc_COPROC_VX_p2op<0b01011>;
-class V6_vaddhnq_enc : Enc_COPROC_VX_p2op<0b01100>;
-class V6_vaddwnq_enc : Enc_COPROC_VX_p2op<0b01101>;
-class V6_vsubbq_enc : Enc_COPROC_VX_p2op<0b01110>;
-class V6_vsubhq_enc : Enc_COPROC_VX_p2op<0b01111>;
-class V6_vsubwq_enc : Enc_COPROC_VX_p2op<0b10000>;
-class V6_vsubbnq_enc : Enc_COPROC_VX_p2op<0b10001>;
-class V6_vsubhnq_enc : Enc_COPROC_VX_p2op<0b10010>;
-class V6_vsubwnq_enc : Enc_COPROC_VX_p2op<0b10011>;
-
-class Enc_COPROC_VX_2op<bits<6> opc> : OpcodeHexagon {
- bits<5> dst;
- bits<5> src1;
-
- let Inst{31-16} = { 0b00011110000000, opc{5-4} };
- let Inst{13-0} = { opc{3}, src1{4-0}, opc{2-0}, dst{4-0} };
-}
-
-class V6_vabsh_enc : Enc_COPROC_VX_2op<0b000000>;
-class V6_vabsh_sat_enc : Enc_COPROC_VX_2op<0b000001>;
-class V6_vabsw_enc : Enc_COPROC_VX_2op<0b000010>;
-class V6_vabsw_sat_enc : Enc_COPROC_VX_2op<0b000011>;
-class V6_vnot_enc : Enc_COPROC_VX_2op<0b000100>;
-class V6_vdealh_enc : Enc_COPROC_VX_2op<0b000110>;
-class V6_vdealb_enc : Enc_COPROC_VX_2op<0b000111>;
-class V6_vunpackob_enc : Enc_COPROC_VX_2op<0b001000>;
-class V6_vunpackoh_enc : Enc_COPROC_VX_2op<0b001001>;
-class V6_vunpackub_enc : Enc_COPROC_VX_2op<0b010000>;
-class V6_vunpackuh_enc : Enc_COPROC_VX_2op<0b010001>;
-class V6_vunpackb_enc : Enc_COPROC_VX_2op<0b010010>;
-class V6_vunpackh_enc : Enc_COPROC_VX_2op<0b010011>;
-class V6_vshuffh_enc : Enc_COPROC_VX_2op<0b010111>;
-class V6_vshuffb_enc : Enc_COPROC_VX_2op<0b100000>;
-class V6_vzb_enc : Enc_COPROC_VX_2op<0b100001>;
-class V6_vzh_enc : Enc_COPROC_VX_2op<0b100010>;
-class V6_vsb_enc : Enc_COPROC_VX_2op<0b100011>;
-class V6_vsh_enc : Enc_COPROC_VX_2op<0b100100>;
-class V6_vcl0w_enc : Enc_COPROC_VX_2op<0b100101>;
-class V6_vpopcounth_enc : Enc_COPROC_VX_2op<0b100110>;
-class V6_vcl0h_enc : Enc_COPROC_VX_2op<0b100111>;
-class V6_vnormamtw_enc : Enc_COPROC_VX_2op<0b110100>;
-class V6_vnormamth_enc : Enc_COPROC_VX_2op<0b110101>;
-class V6_vassign_enc : Enc_COPROC_VX_2op<0b111111>;
-
-class Enc_COPROC_VMEM_vL32_b_ai<bits<4> opc> : OpcodeHexagon {
- bits<5> dst;
- bits<5> src1;
- bits<10> src2;
- bits<4> src2_vector;
-
- let src2_vector = src2{9-6};
- let Inst{31-16} = { 0b001010000, opc{3}, 0, src1{4-0} };
- let Inst{13-0} = { src2_vector{3}, 0b00, src2_vector{2-0}, opc{2-0}, dst{4-0} };
-}
-
-class V6_vL32b_ai_enc : Enc_COPROC_VMEM_vL32_b_ai<0b0000>;
-class V6_vL32b_cur_ai_enc : Enc_COPROC_VMEM_vL32_b_ai<0b0001>;
-class V6_vL32b_tmp_ai_enc : Enc_COPROC_VMEM_vL32_b_ai<0b0010>;
-class V6_vL32Ub_ai_enc : Enc_COPROC_VMEM_vL32_b_ai<0b0111>;
-class V6_vL32b_nt_ai_enc : Enc_COPROC_VMEM_vL32_b_ai<0b1000>;
-class V6_vL32b_nt_cur_ai_enc : Enc_COPROC_VMEM_vL32_b_ai<0b1001>;
-class V6_vL32b_nt_tmp_ai_enc : Enc_COPROC_VMEM_vL32_b_ai<0b1010>;
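In the vL32b encodings above, only src2{9-6} of the 10-bit byte offset is emitted: 64-byte vector accesses are vector-aligned, so the low six bits are implied zero and the 4-bit src2_vector field holds the offset in vector units (the 128B variants below use src2{10-7} for the same reason). A small sketch of that scaling, assuming the field is a signed 4-bit vector count (the helper is hypothetical):

```cpp
#include <cassert>
#include <cstdint>

// Hypothetical helper: convert a byte offset into the 4-bit src2_vector
// field used by the 64-byte vL32b addressing mode above.
uint8_t encodeVecOffset64B(int32_t ByteOff) {
  assert(ByteOff % 64 == 0 && "offset must be a multiple of the vector size");
  int32_t VecOff = ByteOff / 64;
  assert(VecOff >= -8 && VecOff <= 7 && "offset out of assumed s4 range");
  return static_cast<uint8_t>(VecOff) & 0xFu; // two's-complement 4-bit field
}
```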
-
-class Enc_COPROC_VMEM_vL32_b_ai_128B<bits<4> opc> : OpcodeHexagon {
- bits<5> dst;
- bits<5> src1;
- bits<11> src2;
- bits<4> src2_vector;
-
- let src2_vector = src2{10-7};
- let Inst{31-16} = { 0b001010000, opc{3}, 0, src1{4-0} };
- let Inst{13-0} = { src2_vector{3}, 0b00, src2_vector{2-0}, opc{2-0}, dst{4-0} };
-}
-
-class V6_vL32b_ai_128B_enc : Enc_COPROC_VMEM_vL32_b_ai_128B<0b0000>;
-class V6_vL32b_cur_ai_128B_enc : Enc_COPROC_VMEM_vL32_b_ai_128B<0b0001>;
-class V6_vL32b_tmp_ai_128B_enc : Enc_COPROC_VMEM_vL32_b_ai_128B<0b0010>;
-class V6_vL32Ub_ai_128B_enc : Enc_COPROC_VMEM_vL32_b_ai_128B<0b0111>;
-class V6_vL32b_nt_ai_128B_enc : Enc_COPROC_VMEM_vL32_b_ai_128B<0b1000>;
-class V6_vL32b_nt_cur_ai_128B_enc : Enc_COPROC_VMEM_vL32_b_ai_128B<0b1001>;
-class V6_vL32b_nt_tmp_ai_128B_enc : Enc_COPROC_VMEM_vL32_b_ai_128B<0b1010>;
-
-class Enc_COPROC_VMEM_vS32_b_ai_64B<bits<4> opc> : OpcodeHexagon {
- bits<5> src1;
- bits<10> src2;
- bits<4> src2_vector;
- bits<5> src3;
-
- let src2_vector = src2{9-6};
- let Inst{31-16} = { 0b001010000, opc{3}, 1, src1{4-0} };
- let Inst{13-0} = { src2_vector{3}, 0b00, src2_vector{2-0}, opc{2-0}, src3{4-0} };
-}
-
-class Enc_COPROC_VMEM_vS32_b_ai_128B<bits<4> opc> : OpcodeHexagon {
- bits<5> src1;
- bits<11> src2;
- bits<4> src2_vector;
- bits<5> src3;
-
- let src2_vector = src2{10-7};
- let Inst{31-16} = { 0b001010000, opc{3}, 1, src1{4-0} };
- let Inst{13-0} = { src2_vector{3}, 0b00, src2_vector{2-0}, opc{2-0}, src3{4-0} };
-}
-
-class V6_vS32b_ai_enc : Enc_COPROC_VMEM_vS32_b_ai_64B<0b0000>;
-class V6_vS32Ub_ai_enc : Enc_COPROC_VMEM_vS32_b_ai_64B<0b0111>;
-class V6_vS32b_nt_ai_enc : Enc_COPROC_VMEM_vS32_b_ai_64B<0b1000>;
-
-class V6_vS32b_ai_128B_enc : Enc_COPROC_VMEM_vS32_b_ai_128B<0b0000>;
-class V6_vS32Ub_ai_128B_enc : Enc_COPROC_VMEM_vS32_b_ai_128B<0b0111>;
-class V6_vS32b_nt_ai_128B_enc : Enc_COPROC_VMEM_vS32_b_ai_128B<0b1000>;
-
-class Enc_COPROC_VMEM_vS32b_n_ew_ai_64B<bits<1> opc> : OpcodeHexagon {
- bits<5> src1;
- bits<10> src2;
- bits<4> src2_vector;
- bits<3> src3;
-
- let src2_vector = src2{9-6};
- let Inst{31-16} = { 0b001010000, opc{0}, 1, src1{4-0} };
- let Inst{13-0} = { src2_vector{3}, 0b00, src2_vector{2-0}, 0b00100, src3{2-0} };
-}
-
-class V6_vS32b_new_ai_enc : Enc_COPROC_VMEM_vS32b_n_ew_ai_64B<0>;
-class V6_vS32b_nt_new_ai_enc : Enc_COPROC_VMEM_vS32b_n_ew_ai_64B<1>;
-
-class Enc_COPROC_VMEM_vS32b_n_ew_ai_128B<bits<1> opc> : OpcodeHexagon {
- bits<5> src1;
- bits<11> src2;
- bits<4> src2_vector;
- bits<3> src3;
-
- let src2_vector = src2{10-7};
- let Inst{31-16} = { 0b001010000, opc{0}, 1, src1{4-0} };
- let Inst{13-0} = { src2_vector{3}, 0b00, src2_vector{2-0}, 0b00100, src3{2-0} };
-}
-
-class V6_vS32b_new_ai_128B_enc : Enc_COPROC_VMEM_vS32b_n_ew_ai_128B<0>;
-class V6_vS32b_nt_new_ai_128B_enc : Enc_COPROC_VMEM_vS32b_n_ew_ai_128B<1>;
-
-class Enc_COPROC_VMEM_vS32_b_pred_ai<bits<5> opc> : OpcodeHexagon {
- bits<2> src1;
- bits<5> src2;
- bits<10> src3;
- bits<4> src3_vector;
- bits<5> src4;
-
- let src3_vector = src3{9-6};
- let Inst{31-16} = { 0b001010001, opc{4-3}, src2{4-0} };
- let Inst{13-0} = { src3_vector{3}, src1{1-0}, src3_vector{2-0}, opc{2-0}, src4{4-0} };
-}
-
-class Enc_COPROC_VMEM_vS32_b_pred_ai_128B<bits<5> opc> : OpcodeHexagon {
- bits<2> src1;
- bits<5> src2;
- bits<11> src3;
- bits<4> src3_vector;
- bits<5> src4;
-
- let src3_vector = src3{10-7};
- let Inst{31-16} = { 0b001010001, opc{4-3}, src2{4-0} };
- let Inst{13-0} = { src3_vector{3}, src1{1-0}, src3_vector{2-0}, opc{2-0}, src4{4-0} };
-}
-
-class V6_vS32b_qpred_ai_enc : Enc_COPROC_VMEM_vS32_b_pred_ai<0b00000>;
-class V6_vS32b_nqpred_ai_enc : Enc_COPROC_VMEM_vS32_b_pred_ai<0b00001>;
-class V6_vS32b_pred_ai_enc : Enc_COPROC_VMEM_vS32_b_pred_ai<0b01000>;
-class V6_vS32b_npred_ai_enc : Enc_COPROC_VMEM_vS32_b_pred_ai<0b01001>;
-class V6_vS32Ub_pred_ai_enc : Enc_COPROC_VMEM_vS32_b_pred_ai<0b01110>;
-class V6_vS32Ub_npred_ai_enc : Enc_COPROC_VMEM_vS32_b_pred_ai<0b01111>;
-class V6_vS32b_nt_qpred_ai_enc : Enc_COPROC_VMEM_vS32_b_pred_ai<0b10000>;
-class V6_vS32b_nt_nqpred_ai_enc : Enc_COPROC_VMEM_vS32_b_pred_ai<0b10001>;
-class V6_vS32b_nt_pred_ai_enc : Enc_COPROC_VMEM_vS32_b_pred_ai<0b11000>;
-class V6_vS32b_nt_npred_ai_enc : Enc_COPROC_VMEM_vS32_b_pred_ai<0b11001>;
-
-class V6_vS32b_qpred_ai_128B_enc : Enc_COPROC_VMEM_vS32_b_pred_ai_128B<0b00000>;
-class V6_vS32b_nqpred_ai_128B_enc : Enc_COPROC_VMEM_vS32_b_pred_ai_128B<0b00001>;
-class V6_vS32b_pred_ai_128B_enc : Enc_COPROC_VMEM_vS32_b_pred_ai_128B<0b01000>;
-class V6_vS32b_npred_ai_128B_enc : Enc_COPROC_VMEM_vS32_b_pred_ai_128B<0b01001>;
-class V6_vS32Ub_pred_ai_128B_enc : Enc_COPROC_VMEM_vS32_b_pred_ai_128B<0b01110>;
-class V6_vS32Ub_npred_ai_128B_enc : Enc_COPROC_VMEM_vS32_b_pred_ai_128B<0b01111>;
-class V6_vS32b_nt_qpred_ai_128B_enc : Enc_COPROC_VMEM_vS32_b_pred_ai_128B<0b10000>;
-class V6_vS32b_nt_nqpred_ai_128B_enc : Enc_COPROC_VMEM_vS32_b_pred_ai_128B<0b10001>;
-class V6_vS32b_nt_pred_ai_128B_enc : Enc_COPROC_VMEM_vS32_b_pred_ai_128B<0b11000>;
-class V6_vS32b_nt_npred_ai_128B_enc : Enc_COPROC_VMEM_vS32_b_pred_ai_128B<0b11001>;
-
-class Enc_COPROC_VMEM_vS32b_n_ew_pred_ai<bits<4> opc> : OpcodeHexagon {
- bits<2> src1;
- bits<5> src2;
- bits<10> src3;
- bits<4> src3_vector;
- bits<3> src4;
-
- let src3_vector = src3{9-6};
- let Inst{31-16} = { 0b001010001, opc{3}, 1, src2{4-0} };
- let Inst{13-0} = { src3_vector{3}, src1{1-0}, src3_vector{2-0}, 0b01, opc{2-0}, src4{2-0} };
-}
-
-class V6_vS32b_new_pred_ai_enc : Enc_COPROC_VMEM_vS32b_n_ew_pred_ai<0b0000>;
-class V6_vS32b_new_npred_ai_enc : Enc_COPROC_VMEM_vS32b_n_ew_pred_ai<0b0101>;
-class V6_vS32b_nt_new_pred_ai_enc : Enc_COPROC_VMEM_vS32b_n_ew_pred_ai<0b1010>;
-class V6_vS32b_nt_new_npred_ai_enc : Enc_COPROC_VMEM_vS32b_n_ew_pred_ai<0b1111>;
-
-class Enc_COPROC_VMEM_vS32b_n_ew_pred_ai_128B<bits<4> opc> : OpcodeHexagon {
- bits<2> src1;
- bits<5> src2;
- bits<11> src3;
- bits<4> src3_vector;
- bits<3> src4;
-
- let src3_vector = src3{10-7};
- let Inst{31-16} = { 0b001010001, opc{3}, 1, src2{4-0} };
- let Inst{13-0} = { src3_vector{3}, src1{1-0}, src3_vector{2-0}, 0b01, opc{2-0}, src4{2-0} };
-}
-
-class V6_vS32b_new_pred_ai_128B_enc : Enc_COPROC_VMEM_vS32b_n_ew_pred_ai_128B<0b0000>;
-class V6_vS32b_new_npred_ai_128B_enc : Enc_COPROC_VMEM_vS32b_n_ew_pred_ai_128B<0b0101>;
-class V6_vS32b_nt_new_pred_ai_128B_enc : Enc_COPROC_VMEM_vS32b_n_ew_pred_ai_128B<0b1010>;
-class V6_vS32b_nt_new_npred_ai_128B_enc : Enc_COPROC_VMEM_vS32b_n_ew_pred_ai_128B<0b1111>;
-
-// TODO: Change script to generate dst, src1, src2 instead of
-// dst, dst2, src1.
-class Enc_COPROC_VMEM_vL32_b_pi<bits<4> opc> : OpcodeHexagon {
- bits<5> dst;
- bits<5> src1;
- bits<9> src2;
- bits<3> src2_vector;
-
- let src2_vector = src2{8-6};
- let Inst{31-16} = { 0b001010010, opc{3}, 0, src1{4-0} };
- let Inst{13-0} = { 0b000, src2_vector{2-0}, opc{2-0}, dst{4-0} };
-}
-
-class V6_vL32b_pi_enc : Enc_COPROC_VMEM_vL32_b_pi<0b0000>;
-class V6_vL32b_cur_pi_enc : Enc_COPROC_VMEM_vL32_b_pi<0b0001>;
-class V6_vL32b_tmp_pi_enc : Enc_COPROC_VMEM_vL32_b_pi<0b0010>;
-class V6_vL32Ub_pi_enc : Enc_COPROC_VMEM_vL32_b_pi<0b0111>;
-class V6_vL32b_nt_pi_enc : Enc_COPROC_VMEM_vL32_b_pi<0b1000>;
-class V6_vL32b_nt_cur_pi_enc : Enc_COPROC_VMEM_vL32_b_pi<0b1001>;
-class V6_vL32b_nt_tmp_pi_enc : Enc_COPROC_VMEM_vL32_b_pi<0b1010>;
-
-class Enc_COPROC_VMEM_vL32_b_pi_128B<bits<4> opc> : OpcodeHexagon {
- bits<5> dst;
- bits<5> src1;
- bits<10> src2;
- bits<3> src2_vector;
-
- let src2_vector = src2{9-7};
- let Inst{31-16} = { 0b001010010, opc{3}, 0, src1{4-0} };
- let Inst{13-0} = { 0b000, src2_vector{2-0}, opc{2-0}, dst{4-0} };
-}
-
-class V6_vL32b_pi_128B_enc : Enc_COPROC_VMEM_vL32_b_pi_128B<0b0000>;
-class V6_vL32b_cur_pi_128B_enc : Enc_COPROC_VMEM_vL32_b_pi_128B<0b0001>;
-class V6_vL32b_tmp_pi_128B_enc : Enc_COPROC_VMEM_vL32_b_pi_128B<0b0010>;
-class V6_vL32Ub_pi_128B_enc : Enc_COPROC_VMEM_vL32_b_pi_128B<0b0111>;
-class V6_vL32b_nt_pi_128B_enc : Enc_COPROC_VMEM_vL32_b_pi_128B<0b1000>;
-class V6_vL32b_nt_cur_pi_128B_enc : Enc_COPROC_VMEM_vL32_b_pi_128B<0b1001>;
-class V6_vL32b_nt_tmp_pi_128B_enc : Enc_COPROC_VMEM_vL32_b_pi_128B<0b1010>;
-
-
-// TODO: Change script to generate src1, src2 and src3 instead of
-// dst, src1, src2.
-class Enc_COPROC_VMEM_vS32_b_pi<bits<4> opc> : OpcodeHexagon {
- bits<5> src1;
- bits<9> src2;
- bits<3> src2_vector;
- bits<5> src3;
-
- let src2_vector = src2{8-6};
- let Inst{31-16} = { 0b001010010, opc{3}, 1, src1{4-0} };
- let Inst{10-0} = {src2_vector{2-0}, opc{2-0}, src3{4-0} };
-}
-
-class V6_vS32b_pi_enc : Enc_COPROC_VMEM_vS32_b_pi<0b0000>;
-class V6_vS32Ub_pi_enc : Enc_COPROC_VMEM_vS32_b_pi<0b0111>;
-class V6_vS32b_nt_pi_enc : Enc_COPROC_VMEM_vS32_b_pi<0b1000>;
-
-class Enc_COPROC_VMEM_vS32_b_pi_128B<bits<4> opc> : OpcodeHexagon {
- bits<5> src1;
- bits<10> src2;
- bits<3> src2_vector;
- bits<5> src3;
-
- let src2_vector = src2{9-7};
- let Inst{31-16} = { 0b001010010, opc{3}, 1, src1{4-0} };
- let Inst{10-0} = {src2_vector{2-0}, opc{2-0}, src3{4-0} };
-}
-
-class V6_vS32b_pi_128B_enc : Enc_COPROC_VMEM_vS32_b_pi_128B<0b0000>;
-class V6_vS32Ub_pi_128B_enc : Enc_COPROC_VMEM_vS32_b_pi_128B<0b0111>;
-class V6_vS32b_nt_pi_128B_enc : Enc_COPROC_VMEM_vS32_b_pi_128B<0b1000>;
-
-// TODO: Change script to generate src1, src2 and src3 instead of
-// dst, src1, src2.
-class Enc_COPROC_VMEM_vS32b_n_ew_pi<bits<1> opc> : OpcodeHexagon {
- bits<5> src1;
- bits<9> src2;
- bits<3> src2_vector;
- bits<3> src3;
-
- let src2_vector = src2{8-6};
- let Inst{31-16} = { 0b001010010, opc{0}, 1, src1{4-0} };
- let Inst{13-0} = { 0b000, src2_vector{2-0}, 0b00100, src3{2-0} };
-}
-
-class V6_vS32b_new_pi_enc : Enc_COPROC_VMEM_vS32b_n_ew_pi<0>;
-class V6_vS32b_nt_new_pi_enc : Enc_COPROC_VMEM_vS32b_n_ew_pi<1>;
-
-class Enc_COPROC_VMEM_vS32b_n_ew_pi_128B<bits<1> opc> : OpcodeHexagon {
- bits<5> src1;
- bits<10> src2;
- bits<3> src2_vector;
- bits<3> src3;
-
- let src2_vector = src2{9-7};
- let Inst{31-16} = { 0b001010010, opc{0}, 1, src1{4-0} };
- let Inst{13-0} = { 0b000, src2_vector{2-0}, 0b00100, src3{2-0} };
-}
-
-class V6_vS32b_new_pi_128B_enc : Enc_COPROC_VMEM_vS32b_n_ew_pi_128B<0>;
-class V6_vS32b_nt_new_pi_128B_enc : Enc_COPROC_VMEM_vS32b_n_ew_pi_128B<1>;
-
-// TODO: Change script to generate src1, src2,src3 and src4 instead of
-// dst, src1, src2, src3.
-class Enc_COPROC_VMEM_vS32_b_pred_pi<bits<5> opc> : OpcodeHexagon {
- bits<2> src1;
- bits<5> src2;
- bits<9> src3;
- bits<3> src3_vector;
- bits<5> src4;
-
- let src3_vector = src3{8-6};
- let Inst{31-16} = { 0b001010011, opc{4-3}, src2{4-0} };
- let Inst{13-0} = { 0, src1{1-0}, src3_vector{2-0}, opc{2-0}, src4{4-0} };
-}
-
-class V6_vS32b_qpred_pi_enc : Enc_COPROC_VMEM_vS32_b_pred_pi<0b00000>;
-class V6_vS32b_nqpred_pi_enc : Enc_COPROC_VMEM_vS32_b_pred_pi<0b00001>;
-class V6_vS32b_pred_pi_enc : Enc_COPROC_VMEM_vS32_b_pred_pi<0b01000>;
-class V6_vS32b_npred_pi_enc : Enc_COPROC_VMEM_vS32_b_pred_pi<0b01001>;
-class V6_vS32Ub_pred_pi_enc : Enc_COPROC_VMEM_vS32_b_pred_pi<0b01110>;
-class V6_vS32Ub_npred_pi_enc : Enc_COPROC_VMEM_vS32_b_pred_pi<0b01111>;
-class V6_vS32b_nt_qpred_pi_enc : Enc_COPROC_VMEM_vS32_b_pred_pi<0b10000>;
-class V6_vS32b_nt_nqpred_pi_enc : Enc_COPROC_VMEM_vS32_b_pred_pi<0b10001>;
-class V6_vS32b_nt_pred_pi_enc : Enc_COPROC_VMEM_vS32_b_pred_pi<0b11000>;
-class V6_vS32b_nt_npred_pi_enc : Enc_COPROC_VMEM_vS32_b_pred_pi<0b11001>;
-
-// TODO: Change script to generate src1, src2,src3 and src4 instead of
-// dst, src1, src2, src3.
-class Enc_COPROC_VMEM_vS32_b_pred_pi_128B<bits<5> opc> : OpcodeHexagon {
- bits<2> src1;
- bits<5> src2;
- bits<10> src3;
- bits<3> src3_vector;
- bits<5> src4;
-
- let src3_vector = src3{9-7};
- let Inst{31-16} = { 0b001010011, opc{4-3}, src2{4-0} };
- let Inst{13-0} = { 0, src1{1-0}, src3_vector{2-0}, opc{2-0}, src4{4-0} };
-}
-
-class V6_vS32b_qpred_pi_128B_enc : Enc_COPROC_VMEM_vS32_b_pred_pi_128B<0b00000>;
-class V6_vS32b_nqpred_pi_128B_enc : Enc_COPROC_VMEM_vS32_b_pred_pi_128B<0b00001>;
-class V6_vS32b_pred_pi_128B_enc : Enc_COPROC_VMEM_vS32_b_pred_pi_128B<0b01000>;
-class V6_vS32b_npred_pi_128B_enc : Enc_COPROC_VMEM_vS32_b_pred_pi_128B<0b01001>;
-class V6_vS32Ub_pred_pi_128B_enc : Enc_COPROC_VMEM_vS32_b_pred_pi_128B<0b01110>;
-class V6_vS32Ub_npred_pi_128B_enc : Enc_COPROC_VMEM_vS32_b_pred_pi_128B<0b01111>;
-class V6_vS32b_nt_qpred_pi_128B_enc : Enc_COPROC_VMEM_vS32_b_pred_pi_128B<0b10000>;
-class V6_vS32b_nt_nqpred_pi_128B_enc : Enc_COPROC_VMEM_vS32_b_pred_pi_128B<0b10001>;
-class V6_vS32b_nt_pred_pi_128B_enc : Enc_COPROC_VMEM_vS32_b_pred_pi_128B<0b11000>;
-class V6_vS32b_nt_npred_pi_128B_enc : Enc_COPROC_VMEM_vS32_b_pred_pi_128B<0b11001>;
-
-class Enc_COPROC_VMEM_vS32b_n_ew_pred_pi<bits<4> opc> : OpcodeHexagon {
- bits<2> src1;
- bits<5> src2;
- bits<9> src3;
- bits<3> src3_vector;
- bits<3> src4;
-
- let src3_vector = src3{8-6};
- let Inst{31-16} = { 0b001010011, opc{3}, 1, src2{4-0} };
- let Inst{13-0} = { 0, src1{1-0}, src3_vector{2-0}, 0b01, opc{2-0}, src4{2-0} };
-}
-
-class V6_vS32b_new_pred_pi_enc : Enc_COPROC_VMEM_vS32b_n_ew_pred_pi<0b0000>;
-class V6_vS32b_new_npred_pi_enc : Enc_COPROC_VMEM_vS32b_n_ew_pred_pi<0b0101>;
-class V6_vS32b_nt_new_pred_pi_enc : Enc_COPROC_VMEM_vS32b_n_ew_pred_pi<0b1010>;
-class V6_vS32b_nt_new_npred_pi_enc : Enc_COPROC_VMEM_vS32b_n_ew_pred_pi<0b1111>;
-
-class Enc_COPROC_VMEM_vS32b_n_ew_pred_pi_128B<bits<4> opc> : OpcodeHexagon {
- bits<2> src1;
- bits<5> src2;
- bits<10> src3;
- bits<3> src3_vector;
- bits<3> src4;
-
- let src3_vector = src3{9-7};
- let Inst{31-16} = { 0b001010011, opc{3}, 1, src2{4-0} };
- let Inst{13-0} = { 0, src1{1-0}, src3_vector{2-0}, 0b01, opc{2-0}, src4{2-0} };
-}
-
-class V6_vS32b_new_pred_pi_128B_enc : Enc_COPROC_VMEM_vS32b_n_ew_pred_pi_128B<0b0000>;
-class V6_vS32b_new_npred_pi_128B_enc : Enc_COPROC_VMEM_vS32b_n_ew_pred_pi_128B<0b0101>;
-class V6_vS32b_nt_new_pred_pi_128B_enc : Enc_COPROC_VMEM_vS32b_n_ew_pred_pi_128B<0b1010>;
-class V6_vS32b_nt_new_npred_pi_128B_enc : Enc_COPROC_VMEM_vS32b_n_ew_pred_pi_128B<0b1111>;
-
-class Enc_LD_load_m<bits<13> opc> : OpcodeHexagon {
- bits<5> dst;
- bits<5> src1;
- bits<1> src2;
-
- let Inst{31-16} = { opc{12}, 0, opc{11-10}, 1, opc{9-4}, src1{4-0} };
- let Inst{13-0} = { src2{0}, 0b000, opc{3}, 0, opc{2-0}, dst{4-0} };
-}
-
-class V6_vL32b_ppu_enc : Enc_LD_load_m<0b0100110000000>;
-class V6_vL32b_cur_ppu_enc : Enc_LD_load_m<0b0100110000001>;
-class V6_vL32b_tmp_ppu_enc : Enc_LD_load_m<0b0100110000010>;
-class V6_vL32Ub_ppu_enc : Enc_LD_load_m<0b0100110000111>;
-class V6_vL32b_nt_ppu_enc : Enc_LD_load_m<0b0100110100000>;
-class V6_vL32b_nt_cur_ppu_enc : Enc_LD_load_m<0b0100110100001>;
-class V6_vL32b_nt_tmp_ppu_enc : Enc_LD_load_m<0b0100110100010>;
-
-class Enc_COPROC_VMEM_vS32_b_ppu<bits<4> opc> : OpcodeHexagon {
- bits<5> src1;
- bits<1> src2;
- bits<5> src3;
-
- let Inst{31-16} = { 0b001010110, opc{3}, 1, src1{4-0} };
- let Inst{13-0} = { src2{0}, 0b00000, opc{2-0}, src3{4-0} };
-}
-
-class V6_vS32b_ppu_enc : Enc_COPROC_VMEM_vS32_b_ppu<0b0000>;
-class V6_vS32Ub_ppu_enc : Enc_COPROC_VMEM_vS32_b_ppu<0b0111>;
-class V6_vS32b_nt_ppu_enc : Enc_COPROC_VMEM_vS32_b_ppu<0b1000>;
-
-class Enc_COPROC_VMEM_vS32b_new_ppu<bits<1> opc> : OpcodeHexagon {
- bits<5> src1;
- bits<1> src2;
- bits<3> src3;
-
- let Inst{31-16} = { 0b001010110, opc{0}, 1, src1{4-0} };
- let Inst{13-0} = { src2{0}, 0b0000000100, src3{2-0} };
-}
-
-class V6_vS32b_new_ppu_enc : Enc_COPROC_VMEM_vS32b_new_ppu<0>;
-class V6_vS32b_nt_new_ppu_enc : Enc_COPROC_VMEM_vS32b_new_ppu<1>;
-
-class Enc_COPROC_VMEM_vS32_b_pred_ppu<bits<5> opc> : OpcodeHexagon {
- bits<2> src1;
- bits<5> src2;
- bits<1> src3;
- bits<5> src4;
-
- let Inst{31-16} = { 0b001010111, opc{4-3}, src2{4-0} };
- let Inst{13-0} = { src3{0}, src1{1-0}, 0b000, opc{2-0}, src4{4-0} };
-}
-
-class V6_vS32b_qpred_ppu_enc : Enc_COPROC_VMEM_vS32_b_pred_ppu<0b00000>;
-class V6_vS32b_nqpred_ppu_enc : Enc_COPROC_VMEM_vS32_b_pred_ppu<0b00001>;
-class V6_vS32b_pred_ppu_enc : Enc_COPROC_VMEM_vS32_b_pred_ppu<0b01000>;
-class V6_vS32b_npred_ppu_enc : Enc_COPROC_VMEM_vS32_b_pred_ppu<0b01001>;
-class V6_vS32Ub_pred_ppu_enc : Enc_COPROC_VMEM_vS32_b_pred_ppu<0b01110>;
-class V6_vS32Ub_npred_ppu_enc : Enc_COPROC_VMEM_vS32_b_pred_ppu<0b01111>;
-class V6_vS32b_nt_qpred_ppu_enc : Enc_COPROC_VMEM_vS32_b_pred_ppu<0b10000>;
-class V6_vS32b_nt_nqpred_ppu_enc : Enc_COPROC_VMEM_vS32_b_pred_ppu<0b10001>;
-class V6_vS32b_nt_pred_ppu_enc : Enc_COPROC_VMEM_vS32_b_pred_ppu<0b11000>;
-class V6_vS32b_nt_npred_ppu_enc : Enc_COPROC_VMEM_vS32_b_pred_ppu<0b11001>;
-
-class Enc_COPROC_VMEM_vS32b_n_ew_pred_ppu<bits<4> opc> : OpcodeHexagon {
- bits<2> src1;
- bits<5> src2;
- bits<1> src3;
- bits<3> src4;
-
- let Inst{31-16} = { 0b001010111, opc{3}, 1, src2{4-0} };
- let Inst{13-0} = { src3{0}, src1{1-0}, 0b00001, opc{2-0}, src4{2-0} };
-}
-
-class V6_vS32b_new_pred_ppu_enc : Enc_COPROC_VMEM_vS32b_n_ew_pred_ppu<0b0000>;
-class V6_vS32b_new_npred_ppu_enc : Enc_COPROC_VMEM_vS32b_n_ew_pred_ppu<0b0101>;
-class V6_vS32b_nt_new_pred_ppu_enc : Enc_COPROC_VMEM_vS32b_n_ew_pred_ppu<0b1010>;
-class V6_vS32b_nt_new_npred_ppu_enc : Enc_COPROC_VMEM_vS32b_n_ew_pred_ppu<0b1111>;
-
-
-class Enc_COPROC_VX_4op_i<bits<5> opc> : OpcodeHexagon {
- bits<5> dst;
- bits<5> src1;
- bits<5> src2;
- bits<1> src3;
-
- let Inst{31-16} = { 0b00011001, opc{4-2}, src2{4-0} };
- let Inst{13-0} = { opc{1}, src1{4-0}, 1, opc{0}, src3{0}, dst{4-0} };
-}
-
-class V6_vrmpybusi_enc : Enc_COPROC_VX_4op_i<0b01000>;
-class V6_vrsadubi_enc : Enc_COPROC_VX_4op_i<0b01001>;
-class V6_vrmpybusi_acc_enc : Enc_COPROC_VX_4op_i<0b01010>;
-class V6_vrsadubi_acc_enc : Enc_COPROC_VX_4op_i<0b01011>;
-class V6_vrmpyubi_acc_enc : Enc_COPROC_VX_4op_i<0b01111>;
-class V6_vrmpyubi_enc : Enc_COPROC_VX_4op_i<0b10101>;
-
-class Enc_COPROC_VX_vandqrt<bits<5> opc> : OpcodeHexagon {
- bits<5> dst;
- bits<2> src1;
- bits<5> src2;
-
- let Inst{31-16} = { 0b00011001, opc{4-3}, 1, src2{4-0} };
- let Inst{13-0} = { opc{2}, 0b000, src1{1-0}, opc{1-0}, 1, dst{4-0} };
-}
-
-class V6_vandqrt_acc_enc : Enc_COPROC_VX_vandqrt<0b01101>;
-class V6_vandqrt_enc : Enc_COPROC_VX_vandqrt<0b10010>;
-
-class Enc_COPROC_VX_cards<bits<2> opc> : OpcodeHexagon {
- bits<5> src1;
- bits<5> src2;
- bits<5> src3;
-
- let Inst{31-16} = { 0b00011001111, src3{4-0} };
- let Inst{13-0} = { 1, src1{4-0}, 0, opc{1-0}, src2{4-0} };
-}
-
-class V6_vshuff_enc : Enc_COPROC_VX_cards<0b01>;
-class V6_vdeal_enc : Enc_COPROC_VX_cards<0b10>;
-
-
-class Enc_COPROC_VX_v_cmov<bits<1> opc> : OpcodeHexagon {
- bits<2> src1;
- bits<5> dst;
- bits<5> src2;
-
- let Inst{31-16} = { 0b0001101000, opc{0}, 0b00000 };
- let Inst{13-0} = { 0, src2{4-0}, 0, src1{1-0}, dst{4-0} };
-}
-
-class V6_vcmov_enc : Enc_COPROC_VX_v_cmov<0>;
-class V6_vncmov_enc : Enc_COPROC_VX_v_cmov<1>;
-
-class Enc_X_p3op<bits<8> opc> : OpcodeHexagon {
- bits<2> src1;
- bits<5> dst;
- bits<5> src2;
- bits<5> src3;
-
- let Inst{31-16} = { opc{7-5}, 0b1101, opc{4}, 0, opc{3-2}, src3{4-0} };
- let Inst{13-0} = { opc{1}, src2{4-0}, opc{0}, src1{1-0}, dst{4-0} };
-}
-
-class V6_vnccombine_enc : Enc_X_p3op<0b00001000>;
-class V6_vccombine_enc : Enc_X_p3op<0b00001100>;
-
-class Enc_COPROC_VX_4op_r<bits<4> opc> : OpcodeHexagon {
- bits<5> dst;
- bits<5> src1;
- bits<5> src2;
- bits<3> src3;
-
- let Inst{31-16} = { 0b00011011, src2{4-0}, src3{2-0} };
- let Inst{13-0} = { opc{3}, src1{4-0}, opc{2-0}, dst{4-0} };
-}
-
-class V6_valignb_enc : Enc_COPROC_VX_4op_r<0b0000>;
-class V6_vlalignb_enc : Enc_COPROC_VX_4op_r<0b0001>;
-class V6_vasrwh_enc : Enc_COPROC_VX_4op_r<0b0010>;
-class V6_vasrwhsat_enc : Enc_COPROC_VX_4op_r<0b0011>;
-class V6_vasrwhrndsat_enc : Enc_COPROC_VX_4op_r<0b0100>;
-class V6_vasrwuhsat_enc : Enc_COPROC_VX_4op_r<0b0101>;
-class V6_vasrhubsat_enc : Enc_COPROC_VX_4op_r<0b0110>;
-class V6_vasrhubrndsat_enc : Enc_COPROC_VX_4op_r<0b0111>;
-class V6_vasrhbrndsat_enc : Enc_COPROC_VX_4op_r<0b1000>;
-class V6_vlutvvb_enc : Enc_COPROC_VX_4op_r<0b1001>;
-class V6_vshuffvdd_enc : Enc_COPROC_VX_4op_r<0b1011>;
-class V6_vdealvdd_enc : Enc_COPROC_VX_4op_r<0b1100>;
-class V6_vlutvvb_oracc_enc : Enc_COPROC_VX_4op_r<0b1101>;
-class V6_vlutvwh_enc : Enc_COPROC_VX_4op_r<0b1110>;
-class V6_vlutvwh_oracc_enc : Enc_COPROC_VX_4op_r<0b1111>;
-
-class Enc_S_3op_valign_i<bits<9> opc> : OpcodeHexagon {
- bits<5> dst;
- bits<5> src1;
- bits<5> src2;
- bits<3> src3;
-
- let Inst{31-16} = { opc{8-7}, 0, opc{6-3}, 0b00, opc{2-1}, src2{4-0} };
- let Inst{13-0} = { opc{0}, src1{4-0}, src3{2-0}, dst{4-0} };
-}
-
-class V6_vlutb_enc : Enc_S_3op_valign_i<0b001100000>;
-class V6_vlutb_dv_enc : Enc_S_3op_valign_i<0b001100010>;
-class V6_vlutb_acc_enc : Enc_S_3op_valign_i<0b001100100>;
-class V6_vlutb_dv_acc_enc : Enc_S_3op_valign_i<0b001100110>;
-class V6_valignbi_enc : Enc_S_3op_valign_i<0b001111011>;
-class V6_vlalignbi_enc : Enc_S_3op_valign_i<0b001111111>;
-class S2_valignib_enc : Enc_S_3op_valign_i<0b110000000>;
-class S2_addasl_rrri_enc : Enc_S_3op_valign_i<0b110010000>;
-
-class Enc_COPROC_VX_3op_q<bits<3> opc> : OpcodeHexagon {
- bits<2> dst;
- bits<2> src1;
- bits<2> src2;
-
- let Inst{31-16} = { 0b00011110, src2{1-0}, 0b000011 };
- let Inst{13-0} = { 0b0000, src1{1-0}, 0b000, opc{2-0}, dst{1-0} };
-}
-
-class V6_pred_and_enc : Enc_COPROC_VX_3op_q<0b000>;
-class V6_pred_or_enc : Enc_COPROC_VX_3op_q<0b001>;
-class V6_pred_xor_enc : Enc_COPROC_VX_3op_q<0b011>;
-class V6_pred_or_n_enc : Enc_COPROC_VX_3op_q<0b100>;
-class V6_pred_and_n_enc : Enc_COPROC_VX_3op_q<0b101>;
-
-class V6_pred_not_enc : OpcodeHexagon {
- bits<2> dst;
- bits<2> src1;
-
- let Inst{31-16} = { 0b0001111000000011 };
- let Inst{13-0} = { 0b0000, src1{1-0}, 0b000010, dst{1-0} };
-}
-
-class Enc_COPROC_VX_4op_q<bits<1> opc> : OpcodeHexagon {
- bits<5> dst;
- bits<2> src1;
- bits<5> src2;
- bits<5> src3;
-
- let Inst{31-16} = { 0b000111101, opc{0}, 1, src3{4-0} };
- let Inst{13-0} = { 1, src2{4-0}, 0, src1{1-0}, dst{4-0} };
-}
-
-class V6_vswap_enc : Enc_COPROC_VX_4op_q<0>;
-class V6_vmux_enc : Enc_COPROC_VX_4op_q<1>;
-
-class Enc_X_2op<bits<16> opc> : OpcodeHexagon {
- bits<5> dst;
- bits<5> src1;
-
- let Inst{31-16} = { opc{15-5}, src1{4-0} };
- let Inst{13-0} = { opc{4-3}, 0b0000, opc{2-0}, dst{4-0} };
-}
-
-class V6_lvsplatw_enc : Enc_X_2op<0b0001100110100001>;
-class V6_vinsertwr_enc : Enc_X_2op<0b0001100110110001>;
-class S6_vsplatrbp_enc : Enc_X_2op<0b1000010001000100>;
-
-
-class Enc_CR_2op_r<bits<12> opc> : OpcodeHexagon {
- bits<2> dst;
- bits<5> src1;
-
- let Inst{31-16} = { opc{11}, 0, opc{10-7}, 0, opc{6-3}, src1{4-0} };
- let Inst{13-0} = { opc{2}, 0b000000, opc{1}, 0b000, opc{0}, dst{1-0} };
-}
-
-class V6_pred_scalar2_enc : Enc_CR_2op_r<0b001101101011>;
-class Y5_l2locka_enc : Enc_CR_2op_r<0b110000111100>;
-
-class Enc_S_3op_i6<bits<9> opc> : OpcodeHexagon {
- bits<5> dst;
- bits<5> src1;
- bits<6> src2;
-
- let Inst{31-16} = { 0b1000, opc{8-6}, 0, opc{5-3}, src1{4-0} };
- let Inst{13-0} = { src2{5-0}, opc{2-0}, dst{4-0} };
-}
-
-class S6_rol_i_p_enc : Enc_S_3op_i6<0b000000011>;
-class S6_rol_i_p_nac_enc : Enc_S_3op_i6<0b001000011>;
-class S6_rol_i_p_acc_enc : Enc_S_3op_i6<0b001000111>;
-class S6_rol_i_p_and_enc : Enc_S_3op_i6<0b001010011>;
-class S6_rol_i_p_or_enc : Enc_S_3op_i6<0b001010111>;
-class S6_rol_i_p_xacc_enc : Enc_S_3op_i6<0b001100011>;
-
-class Enc_X_3op_r<bits<15> opc> : OpcodeHexagon {
- bits<5> dst;
- bits<5> src1;
- bits<5> src2;
-
- let Inst{31-16} = { opc{14-4}, src1{4-0} };
- let Inst{13-0} = { opc{3}, src2{4-0}, opc{2-0}, dst{4-0} };
-}
-
-class S6_rol_i_r_enc : Enc_X_3op_r<0b100011000000011>;
-class S6_rol_i_r_nac_enc : Enc_X_3op_r<0b100011100000011>;
-class S6_rol_i_r_acc_enc : Enc_X_3op_r<0b100011100000111>;
-class S6_rol_i_r_and_enc : Enc_X_3op_r<0b100011100100011>;
-class S6_rol_i_r_or_enc : Enc_X_3op_r<0b100011100100111>;
-class S6_rol_i_r_xacc_enc : Enc_X_3op_r<0b100011101000011>;
-class S6_vtrunehb_ppp_enc : Enc_X_3op_r<0b110000011000011>;
-class S6_vtrunohb_ppp_enc : Enc_X_3op_r<0b110000011000101>;
-
-class Enc_no_operands<bits<25> opc> : OpcodeHexagon {
-
- let Inst{31-16} = { opc{24-10}, 0 };
- let Inst{13-0} = { opc{9-7}, 0b000, opc{6-0}, 0 };
-}
-
-class Y5_l2gunlock_enc : Enc_no_operands<0b1010100000100000010000000>;
-class Y5_l2gclean_enc : Enc_no_operands<0b1010100000100000100000000>;
-class Y5_l2gcleaninv_enc : Enc_no_operands<0b1010100000100000110000000>;
-class V6_vhist_enc : Enc_no_operands<0b0001111000000001001000000>;
-
-class Enc_J_jumpr<bits<13> opc> : OpcodeHexagon {
- bits<5> src1;
-
- let Inst{31-16} = { opc{12-6}, 0, opc{5-3}, src1{4-0} };
- let Inst{13-0} = { 0b00, opc{2}, 0b0000, opc{1-0}, 0b00000 };
-}
-
-class Y5_l2unlocka_enc : Enc_J_jumpr<0b1010011011000>;
-class Y2_l2cleaninvidx_enc : Enc_J_jumpr<0b1010100011000>;
-
-class Enc_ST_l2gclean_pa<bits<2> opc> : OpcodeHexagon {
- bits<5> src1;
-
- let Inst{31-16} = { 0b101001101, opc{1-0}, 0b00000 };
- let Inst{13-0} = { 0, src1{4-0}, 0b00000000 };
-}
-
-class Y6_l2gcleanpa_enc : Enc_ST_l2gclean_pa<0b01>;
-class Y6_l2gcleaninvpa_enc : Enc_ST_l2gclean_pa<0b10>;
-
-class A5_ACS_enc : OpcodeHexagon {
- bits<5> dst1;
- bits<2> dst2;
- bits<5> src1;
- bits<5> src2;
-
- let Inst{31-16} = { 0b11101010101, src1{4-0} };
- let Inst{13-0} = { 0, src2{4-0}, 0, dst2{1-0}, dst1{4-0} };
-}
-
-class Enc_X_4op_r<bits<8> opc> : OpcodeHexagon {
- bits<5> dst;
- bits<5> src1;
- bits<5> src2;
- bits<2> src3;
-
- let Inst{31-16} = { 0b11, opc{7}, 0, opc{6-5}, 1, opc{4-1}, src1{4-0} };
- let Inst{13-0} = { 0, src2{4-0}, opc{0}, src3{1-0}, dst{4-0} };
-}
-
-class S2_vsplicerb_enc : Enc_X_4op_r<0b00001000>;
-class S2_cabacencbin_enc : Enc_X_4op_r<0b00001010>;
-class F2_sffma_sc_enc : Enc_X_4op_r<0b11110111>;
-
-class V6_vhistq_enc : OpcodeHexagon {
- bits<2> src1;
-
- let Inst{31-16} = { 0b00011110, src1{1-0}, 0b000010 };
- let Inst{13-0} = { 0b10000010000000 };
-}
-
-// TODO: Change script to generate dst1 instead of dst.
-class A6_vminub_RdP_enc : OpcodeHexagon {
- bits<5> dst1;
- bits<2> dst2;
- bits<5> src1;
- bits<5> src2;
-
- let Inst{31-16} = { 0b11101010111, src2{4-0} };
- let Inst{13-0} = { 0, src1{4-0}, 0, dst2{1-0}, dst1{4-0} };
-}
diff --git a/lib/Target/Hexagon/HexagonInstrFormats.td b/lib/Target/Hexagon/HexagonInstrFormats.td
index fa3cccbd0879..39c2a6e4f5a5 100644
--- a/lib/Target/Hexagon/HexagonInstrFormats.td
+++ b/lib/Target/Hexagon/HexagonInstrFormats.td
@@ -7,26 +7,6 @@
//
//===----------------------------------------------------------------------===//
-//===----------------------------------------------------------------------===//
-// Hexagon Instruction Flags +
-//
-// *** Must match HexagonBaseInfo.h ***
-//===----------------------------------------------------------------------===//
-
-class IType<bits<5> t> {
- bits<5> Value = t;
-}
-def TypePSEUDO : IType<0>;
-def TypeALU32 : IType<1>;
-def TypeCR : IType<2>;
-def TypeJR : IType<3>;
-def TypeJ : IType<4>;
-def TypeLD : IType<5>;
-def TypeST : IType<6>;
-def TypeSYSTEM : IType<7>;
-def TypeXTYPE : IType<8>;
-def TypeENDLOOP: IType<31>;
-
// Maintain list of valid subtargets for each instruction.
class SubTarget<bits<6> value> {
bits<6> Value = value;
@@ -54,6 +34,7 @@ class MemAccessSize<bits<4> value> {
bits<4> Value = value;
}
+// MemAccessSize is represented as 1+log2(N) where N is the access size in bytes.
def NoMemAccess : MemAccessSize<0>;// Not a memory access instruction.
def ByteAccess : MemAccessSize<1>;// Byte access instruction (memb).
def HalfWordAccess : MemAccessSize<2>;// Half word access instruction (memh).
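Since the stored value is 1+log2(N), it is cheap to invert. A sketch of the reverse mapping (assumed helper, not the actual HexagonBaseInfo.h interface):

```cpp
#include <cassert>

// Assumed helper: invert the 1+log2(N-bytes) encoding used by MemAccessSize.
// Value 1 (ByteAccess) -> 1 byte, 3 (WordAccess) -> 4 bytes,
// 8 (Vector128Access) -> 128 bytes; 0 means "not a memory access".
unsigned memAccessSizeInBytes(unsigned Value) {
  assert(Value != 0 && "not a memory access instruction");
  return 1u << (Value - 1);
}
```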
@@ -70,10 +51,9 @@ def Vector128Access : MemAccessSize<8>;// Vector access instruction (memv)
class OpcodeHexagon {
field bits<32> Inst = ?; // Default to an invalid insn.
bits<4> IClass = 0; // ICLASS
+ bits<1> zero = 0;
let Inst{31-28} = IClass;
-
- bits<1> zero = 0;
}
class InstHexagon<dag outs, dag ins, string asmstr, list<dag> pattern,
@@ -99,85 +79,88 @@ class InstHexagon<dag outs, dag ins, string asmstr, list<dag> pattern,
// Instruction type according to the ISA.
IType Type = type;
- let TSFlags{4-0} = Type.Value;
+ let TSFlags{5-0} = Type.Value;
// Solo instructions, i.e., those that cannot be in a packet with others.
bits<1> isSolo = 0;
- let TSFlags{5} = isSolo;
+ let TSFlags{6} = isSolo;
// Packed only with A or X-type instructions.
bits<1> isSoloAX = 0;
- let TSFlags{6} = isSoloAX;
+ let TSFlags{7} = isSoloAX;
// Only A-type instruction in first slot or nothing.
bits<1> isSoloAin1 = 0;
- let TSFlags{7} = isSoloAin1;
+ let TSFlags{8} = isSoloAin1;
// Predicated instructions.
bits<1> isPredicated = 0;
- let TSFlags{8} = isPredicated;
+ let TSFlags{9} = isPredicated;
bits<1> isPredicatedFalse = 0;
- let TSFlags{9} = isPredicatedFalse;
+ let TSFlags{10} = isPredicatedFalse;
bits<1> isPredicatedNew = 0;
- let TSFlags{10} = isPredicatedNew;
+ let TSFlags{11} = isPredicatedNew;
bits<1> isPredicateLate = 0;
- let TSFlags{11} = isPredicateLate; // Late predicate producer insn.
+ let TSFlags{12} = isPredicateLate; // Late predicate producer insn.
// New-value insn helper fields.
bits<1> isNewValue = 0;
- let TSFlags{12} = isNewValue; // New-value consumer insn.
+ let TSFlags{13} = isNewValue; // New-value consumer insn.
bits<1> hasNewValue = 0;
- let TSFlags{13} = hasNewValue; // New-value producer insn.
+ let TSFlags{14} = hasNewValue; // New-value producer insn.
bits<3> opNewValue = 0;
- let TSFlags{16-14} = opNewValue; // New-value produced operand.
+ let TSFlags{17-15} = opNewValue; // New-value produced operand.
bits<1> isNVStorable = 0;
- let TSFlags{17} = isNVStorable; // Store that can become new-value store.
+ let TSFlags{18} = isNVStorable; // Store that can become new-value store.
bits<1> isNVStore = 0;
- let TSFlags{18} = isNVStore; // New-value store insn.
+ let TSFlags{19} = isNVStore; // New-value store insn.
bits<1> isCVLoadable = 0;
- let TSFlags{19} = isCVLoadable; // Load that can become cur-value load.
+ let TSFlags{20} = isCVLoadable; // Load that can become cur-value load.
bits<1> isCVLoad = 0;
- let TSFlags{20} = isCVLoad; // Cur-value load insn.
+ let TSFlags{21} = isCVLoad; // Cur-value load insn.
// Immediate extender helper fields.
bits<1> isExtendable = 0;
- let TSFlags{21} = isExtendable; // Insn may be extended.
+ let TSFlags{22} = isExtendable; // Insn may be extended.
bits<1> isExtended = 0;
- let TSFlags{22} = isExtended; // Insn must be extended.
+ let TSFlags{23} = isExtended; // Insn must be extended.
bits<3> opExtendable = 0;
- let TSFlags{25-23} = opExtendable; // Which operand may be extended.
+ let TSFlags{26-24} = opExtendable; // Which operand may be extended.
bits<1> isExtentSigned = 0;
- let TSFlags{26} = isExtentSigned; // Signed or unsigned range.
+ let TSFlags{27} = isExtentSigned; // Signed or unsigned range.
bits<5> opExtentBits = 0;
- let TSFlags{31-27} = opExtentBits; //Number of bits of range before extending.
+ let TSFlags{32-28} = opExtentBits; //Number of bits of range before extending.
bits<2> opExtentAlign = 0;
- let TSFlags{33-32} = opExtentAlign; // Alignment exponent before extending.
+ let TSFlags{34-33} = opExtentAlign; // Alignment exponent before extending.
// If an instruction is valid on a subtarget, set the corresponding
// bit from validSubTargets.
// By default, instruction is valid on all subtargets.
SubTarget validSubTargets = HasAnySubT;
- let TSFlags{39-34} = validSubTargets.Value;
+ let TSFlags{40-35} = validSubTargets.Value;
// Addressing mode for load/store instructions.
AddrModeType addrMode = NoAddrMode;
- let TSFlags{42-40} = addrMode.Value;
+ let TSFlags{43-41} = addrMode.Value;
// Memory access size for mem access instructions (load/store)
MemAccessSize accessSize = NoMemAccess;
- let TSFlags{46-43} = accessSize.Value;
+ let TSFlags{47-44} = accessSize.Value;
bits<1> isTaken = 0;
- let TSFlags {47} = isTaken; // Branch prediction.
+ let TSFlags {48} = isTaken; // Branch prediction.
bits<1> isFP = 0;
- let TSFlags {48} = isFP; // Floating-point.
+ let TSFlags {49} = isFP; // Floating-point.
bits<1> hasNewValue2 = 0;
- let TSFlags{50} = hasNewValue2; // Second New-value producer insn.
+ let TSFlags{51} = hasNewValue2; // Second New-value producer insn.
bits<3> opNewValue2 = 0;
- let TSFlags{53-51} = opNewValue2; // Second New-value produced operand.
+ let TSFlags{54-52} = opNewValue2; // Second New-value produced operand.
bits<1> isAccumulator = 0;
- let TSFlags{54} = isAccumulator;
+ let TSFlags{55} = isAccumulator;
+
+ bits<1> prefersSlot3 = 0;
+ let TSFlags{56} = prefersSlot3; // Complex XU
bit cofMax1 = 0;
let TSFlags{60} = cofMax1;
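The one-bit shift throughout this hunk follows from widening Type to TSFlags{5-0}: the old five-bit IType enumeration was already full (TypeENDLOOP sat at 31), and the new per-class types need a sixth bit, so every later field moves up. Any decoder must use the matching positions; hypothetical accessors for the new layout (the real masks belong in HexagonBaseInfo.h, hence the "must match" comment below):

```cpp
#include <cstdint>

// Hypothetical accessors matching the renumbered TSFlags layout above.
unsigned getType(uint64_t TSFlags)      { return TSFlags & 0x3F; }         // {5-0}
bool     isSolo(uint64_t TSFlags)       { return (TSFlags >> 6) & 0x1; }   // {6}
unsigned getOpExtBits(uint64_t TSFlags) { return (TSFlags >> 28) & 0x1F; } // {32-28}
```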
@@ -200,9 +183,13 @@ class InstHexagon<dag outs, dag ins, string asmstr, list<dag> pattern,
let NValueST = !if(isNVStore, "true", "false");
let isNT = !if(isNonTemporal, "true", "false");
+ let hasSideEffects = 0;
// *** Must match MCTargetDesc/HexagonBaseInfo.h ***
}
+class HInst<dag outs, dag ins, string asmstr, InstrItinClass itin, IType type> :
+ InstHexagon<outs, ins, asmstr, [], "", itin, type>;
+
//===----------------------------------------------------------------------===//
// Instruction Classes Definitions +
//===----------------------------------------------------------------------===//
@@ -214,14 +201,13 @@ class LDInst<dag outs, dag ins, string asmstr, list<dag> pattern = [],
string cstr = "", InstrItinClass itin = LD_tc_ld_SLOT01>
: InstHexagon<outs, ins, asmstr, pattern, cstr, itin, TypeLD>, OpcodeHexagon;
-let mayLoad = 1 in
-class LDInst2<dag outs, dag ins, string asmstr, list<dag> pattern = [],
- string cstr = "">
- : LDInst<outs, ins, asmstr, pattern, cstr>;
+class PseudoLDInst<dag outs, dag ins, string asmstr, list<dag> pattern = [],
+ string cstr = "", InstrItinClass itin = LD_tc_ld_SLOT01>
+ : InstHexagon<outs, ins, asmstr, pattern, cstr, itin, TypeLD>, OpcodeHexagon;
class CONSTLDInst<dag outs, dag ins, string asmstr, list<dag> pattern = [],
string cstr = "">
- : LDInst<outs, ins, asmstr, pattern, cstr>;
+ : PseudoLDInst<outs, ins, asmstr, pattern, cstr>;
// LD Instruction Class in V2/V3/V4.
// Definition of the instruction class NOT CHANGED.
@@ -247,6 +233,11 @@ class STInst<dag outs, dag ins, string asmstr, list<dag> pattern = [],
string cstr = "", InstrItinClass itin = ST_tc_st_SLOT01>
: InstHexagon<outs, ins, asmstr, pattern, cstr, itin, TypeST>, OpcodeHexagon;
+let mayStore = 1 in
+class STInst_NoOpcode<dag outs, dag ins, string asmstr, list<dag> pattern = [],
+ string cstr = "", InstrItinClass itin = ST_tc_st_SLOT01>
+ : InstHexagon<outs, ins, asmstr, pattern, cstr, itin, TypeST>;
+
class STInst2<dag outs, dag ins, string asmstr, list<dag> pattern = [],
string cstr = "">
: STInst<outs, ins, asmstr, pattern, cstr>;
@@ -269,28 +260,24 @@ class STInstPost<dag outs, dag ins, string asmstr, list<dag> pattern = [],
string cstr = "", InstrItinClass itin = ST_tc_st_SLOT01>
: STInst<outs, ins, asmstr, pattern, cstr, itin>;
-// SYSTEM Instruction Class in V4 can take SLOT0 only
-// In V2/V3 we used ST for this but in v4 ST can take SLOT0 or SLOT1.
-class SYSInst<dag outs, dag ins, string asmstr, list<dag> pattern = [],
- string cstr = "", InstrItinClass itin = ST_tc_3stall_SLOT0>
- : InstHexagon<outs, ins, asmstr, pattern, cstr, itin, TypeSYSTEM>,
- OpcodeHexagon;
-
-// ALU32 Instruction Class in V2/V3/V4.
-// Definition of the instruction class NOT CHANGED.
-class ALU32Inst<dag outs, dag ins, string asmstr, list<dag> pattern = [],
- string cstr = "", InstrItinClass itin = ALU32_2op_tc_1_SLOT0123>
- : InstHexagon<outs, ins, asmstr, pattern, cstr, itin, TypeALU32>, OpcodeHexagon;
-
// ALU64 Instruction Class in V2/V3.
// XTYPE Instruction Class in V4.
// Definition of the instruction class NOT CHANGED.
// Name of the Instruction Class changed from ALU64 to XTYPE from V2/V3 to V4.
class ALU64Inst<dag outs, dag ins, string asmstr, list<dag> pattern = [],
string cstr = "", InstrItinClass itin = ALU64_tc_2_SLOT23>
- : InstHexagon<outs, ins, asmstr, pattern, cstr, itin, TypeXTYPE>,
+ : InstHexagon<outs, ins, asmstr, pattern, cstr, itin, TypeALU64>,
OpcodeHexagon;
+// ALU64 Instruction Class in V2/V3.
+// XTYPE Instruction Class in V4.
+// Definition of the instruction class NOT CHANGED.
+// Name of the Instruction Class changed from ALU64 to XTYPE from V2/V3 to V4.
+class ALU64Inst_NoOpcode<dag outs, dag ins, string asmstr, list<dag> pattern = [],
+ string cstr = "", InstrItinClass itin = ALU64_tc_2_SLOT23>
+ : InstHexagon<outs, ins, asmstr, pattern, cstr, itin, TypeALU64>;
+
+
class ALU64_acc<dag outs, dag ins, string asmstr, list<dag> pattern = [],
string cstr = "", InstrItinClass itin = ALU64_tc_2_SLOT23>
: ALU64Inst<outs, ins, asmstr, pattern, cstr, itin>;
@@ -302,13 +289,13 @@ class ALU64_acc<dag outs, dag ins, string asmstr, list<dag> pattern = [],
// Name of the Instruction Class changed from M to XTYPE from V2/V3 to V4.
class MInst<dag outs, dag ins, string asmstr, list<dag> pattern = [],
string cstr = "", InstrItinClass itin = M_tc_3x_SLOT23>
- : InstHexagon<outs, ins, asmstr, pattern, cstr, itin, TypeXTYPE>,
+ : InstHexagon<outs, ins, asmstr, pattern, cstr, itin, TypeM>,
OpcodeHexagon;
// Same as above but doesn't derive from OpcodeHexagon
class MInst2<dag outs, dag ins, string asmstr, list<dag> pattern = [],
string cstr = "", InstrItinClass itin = M_tc_3x_SLOT23>
- : InstHexagon<outs, ins, asmstr, pattern, cstr, itin, TypeXTYPE>;
+ : InstHexagon<outs, ins, asmstr, pattern, cstr, itin, TypeM>;
// M Instruction Class in V2/V3.
// XTYPE Instruction Class in V4.
@@ -324,12 +311,16 @@ class MInst_acc<dag outs, dag ins, string asmstr, list<dag> pattern = [],
// Name of the Instruction Class changed from S to XTYPE from V2/V3 to V4.
class SInst<dag outs, dag ins, string asmstr, list<dag> pattern = [],
string cstr = "", InstrItinClass itin = S_2op_tc_1_SLOT23>
- : InstHexagon<outs, ins, asmstr, pattern, cstr, itin, TypeXTYPE>,
+ : InstHexagon<outs, ins, asmstr, pattern, cstr, itin, TypeS_2op>,
OpcodeHexagon;
+class SInst_NoOpcode<dag outs, dag ins, string asmstr, list<dag> pattern = [],
+ string cstr = "", InstrItinClass itin = S_2op_tc_1_SLOT23>
+ : InstHexagon<outs, ins, asmstr, pattern, cstr, itin, TypeS_2op>;
+
class SInst2<dag outs, dag ins, string asmstr, list<dag> pattern = [],
string cstr = "", InstrItinClass itin = S_2op_tc_1_SLOT23>
- : InstHexagon<outs, ins, asmstr, pattern, cstr, itin, TypeXTYPE>;
+ : InstHexagon<outs, ins, asmstr, pattern, cstr, itin, TypeS_2op>;
// S Instruction Class in V2/V3.
// XTYPE Instruction Class in V4.
@@ -337,7 +328,9 @@ class SInst2<dag outs, dag ins, string asmstr, list<dag> pattern = [],
// Name of the Instruction Class changed from S to XTYPE from V2/V3 to V4.
class SInst_acc<dag outs, dag ins, string asmstr, list<dag> pattern = [],
string cstr = "", InstrItinClass itin = S_3op_tc_1_SLOT23>
- : SInst<outs, ins, asmstr, pattern, cstr, itin>;
+ : SInst<outs, ins, asmstr, pattern, cstr, itin> {
+ let Type = TypeS_3op;
+}
// J Instruction Class in V2/V3/V4.
// Definition of the instruction class NOT CHANGED.
@@ -349,12 +342,6 @@ class JInst_CJUMP_UCJUMP<dag outs, dag ins, string asmstr, list<dag> pattern = [
string cstr = "", InstrItinClass itin = J_tc_2early_CJUMP_UCJUMP_ARCHDEPSLOT>
: InstHexagon<outs, ins, asmstr, pattern, cstr, itin, TypeJ>, OpcodeHexagon;
-// JR Instruction Class in V2/V3/V4.
-// Definition of the instruction class NOT CHANGED.
-class JRInst<dag outs, dag ins, string asmstr, list<dag> pattern = [],
- string cstr = "", InstrItinClass itin = J_tc_2early_SLOT2>
- : InstHexagon<outs, ins, asmstr, pattern, cstr, itin, TypeJR>, OpcodeHexagon;
-
// CR Instruction Class in V2/V3/V4.
// Definition of the instruction class NOT CHANGED.
class CRInst<dag outs, dag ins, string asmstr, list<dag> pattern = [],
@@ -383,26 +370,6 @@ class PseudoM<dag outs, dag ins, string asmstr, list<dag> pattern = [],
// Instruction Classes Definitions -
//===----------------------------------------------------------------------===//
-
-//
-// ALU32 patterns
-//.
-class ALU32_rr<dag outs, dag ins, string asmstr, list<dag> pattern = [],
- string cstr = "", InstrItinClass itin = ALU32_2op_tc_1_SLOT0123>
- : ALU32Inst<outs, ins, asmstr, pattern, cstr, itin>;
-
-class ALU32_ir<dag outs, dag ins, string asmstr, list<dag> pattern = [],
- string cstr = "", InstrItinClass itin = ALU32_2op_tc_1_SLOT0123>
- : ALU32Inst<outs, ins, asmstr, pattern, cstr, itin>;
-
-class ALU32_ri<dag outs, dag ins, string asmstr, list<dag> pattern = [],
- string cstr = "", InstrItinClass itin = ALU32_2op_tc_1_SLOT0123>
- : ALU32Inst<outs, ins, asmstr, pattern, cstr, itin>;
-
-class ALU32_ii<dag outs, dag ins, string asmstr, list<dag> pattern = [],
- string cstr = "", InstrItinClass itin = ALU32_2op_tc_1_SLOT0123>
- : ALU32Inst<outs, ins, asmstr, pattern, cstr, itin>;
-
//
// ALU64 patterns.
//
diff --git a/lib/Target/Hexagon/HexagonInstrFormatsV4.td b/lib/Target/Hexagon/HexagonInstrFormatsV4.td
index 493d04703da9..1fdf930c62fd 100644
--- a/lib/Target/Hexagon/HexagonInstrFormatsV4.td
+++ b/lib/Target/Hexagon/HexagonInstrFormatsV4.td
@@ -11,18 +11,6 @@
//
//===----------------------------------------------------------------------===//
-//----------------------------------------------------------------------------//
-// Hexagon Instruction Flags
-//
-// *** Must match BaseInfo.h ***
-//----------------------------------------------------------------------------//
-
-def TypeV4LDST : IType<9>;
-def TypeNV : IType<10>;
-def TypeDUPLEX : IType<11>;
-def TypeCOMPOUND : IType<12>;
-def TypePREFIX : IType<30>;
-
// Duplex Instruction Class Declaration
//===----------------------------------------------------------------------===//
@@ -61,7 +49,7 @@ class InstDuplex<bits<4> iClass, list<dag> pattern = [],
// *** Must match MCTargetDesc/HexagonBaseInfo.h ***
- let TSFlags{4-0} = Type.Value;
+ let TSFlags{5-0} = Type.Value;
// Predicated instructions.
bits<1> isPredicated = 0;
@@ -107,7 +95,7 @@ class InstDuplex<bits<4> iClass, list<dag> pattern = [],
//
class NVInst<dag outs, dag ins, string asmstr, list<dag> pattern = [],
string cstr = "", InstrItinClass itin = NCJ_tc_3or4stall_SLOT0>
- : InstHexagon<outs, ins, asmstr, pattern, cstr, itin, TypeNV>, OpcodeHexagon;
+ : InstHexagon<outs, ins, asmstr, pattern, cstr, itin, TypeNCJ>, OpcodeHexagon;
class NVInst_V4<dag outs, dag ins, string asmstr, list<dag> pattern = [],
string cstr = "", InstrItinClass itin = NCJ_tc_3or4stall_SLOT0>
@@ -141,7 +129,7 @@ class MEMInst_V4<dag outs, dag ins, string asmstr, list<dag> pattern = [],
class EXTENDERInst<dag outs, dag ins, string asmstr, list<dag> pattern = []>
: InstHexagon<outs, ins, asmstr, pattern, "", EXTENDER_tc_1_SLOT0123,
- TypePREFIX>, OpcodeHexagon;
+ TypeEXTENDER>, OpcodeHexagon;
class SUBInst<dag outs, dag ins, string asmstr, list<dag> pattern = [],
string cstr = "">
@@ -150,11 +138,11 @@ class SUBInst<dag outs, dag ins, string asmstr, list<dag> pattern = [],
class CJInst<dag outs, dag ins, string asmstr, list<dag> pattern = [],
string cstr = "">
- : InstHexagon<outs, ins, asmstr, pattern, cstr, COMPOUND_CJ_ARCHDEPSLOT, TypeCOMPOUND>,
+ : InstHexagon<outs, ins, asmstr, pattern, cstr, COMPOUND_CJ_ARCHDEPSLOT, TypeCJ>,
OpcodeHexagon;
class CJInst_JMPSET<dag outs, dag ins, string asmstr, list<dag> pattern = [],
string cstr = "">
- : InstHexagon<outs, ins, asmstr, pattern, cstr, COMPOUND, TypeCOMPOUND>,
+ : InstHexagon<outs, ins, asmstr, pattern, cstr, COMPOUND, TypeCJ>,
OpcodeHexagon;
diff --git a/lib/Target/Hexagon/HexagonInstrFormatsV60.td b/lib/Target/Hexagon/HexagonInstrFormatsV60.td
index b9f4373a0b79..c8a7faea5ed5 100644
--- a/lib/Target/Hexagon/HexagonInstrFormatsV60.td
+++ b/lib/Target/Hexagon/HexagonInstrFormatsV60.td
@@ -12,28 +12,6 @@
//===----------------------------------------------------------------------===//
//----------------------------------------------------------------------------//
-// Hexagon Instruction Flags +
-//
-// *** Must match BaseInfo.h ***
-//----------------------------------------------------------------------------//
-
-def TypeCVI_VA : IType<13>;
-def TypeCVI_VA_DV : IType<14>;
-def TypeCVI_VX : IType<15>;
-def TypeCVI_VX_DV : IType<16>;
-def TypeCVI_VP : IType<17>;
-def TypeCVI_VP_VS : IType<18>;
-def TypeCVI_VS : IType<19>;
-def TypeCVI_VINLANESAT : IType<20>;
-def TypeCVI_VM_LD : IType<21>;
-def TypeCVI_VM_TMP_LD : IType<22>;
-def TypeCVI_VM_CUR_LD : IType<23>;
-def TypeCVI_VM_VP_LDU : IType<24>;
-def TypeCVI_VM_ST : IType<25>;
-def TypeCVI_VM_NEW_ST : IType<26>;
-def TypeCVI_VM_STU : IType<27>;
-def TypeCVI_HIST : IType<28>;
-//----------------------------------------------------------------------------//
// Instruction Classes Definitions +
//----------------------------------------------------------------------------//
diff --git a/lib/Target/Hexagon/HexagonInstrInfo.cpp b/lib/Target/Hexagon/HexagonInstrInfo.cpp
index 0a7dc6b49d00..b265a883da5c 100644
--- a/lib/Target/Hexagon/HexagonInstrInfo.cpp
+++ b/lib/Target/Hexagon/HexagonInstrInfo.cpp
@@ -152,10 +152,11 @@ static unsigned nonDbgMICount(MachineBasicBlock::const_instr_iterator MIB,
/// On Hexagon, we have two instructions used to set up the hardware loop
/// (LOOP0, LOOP1) with corresponding endloop (ENDLOOP0, ENDLOOP1) instructions
/// to indicate the end of a loop.
-static MachineInstr *findLoopInstr(MachineBasicBlock *BB, int EndLoopOp,
+static MachineInstr *findLoopInstr(MachineBasicBlock *BB, unsigned EndLoopOp,
+ MachineBasicBlock *TargetBB,
SmallPtrSet<MachineBasicBlock *, 8> &Visited) {
- int LOOPi;
- int LOOPr;
+ unsigned LOOPi;
+ unsigned LOOPr;
if (EndLoopOp == Hexagon::ENDLOOP0) {
LOOPi = Hexagon::J2_loop0i;
LOOPr = Hexagon::J2_loop0r;
@@ -165,26 +166,24 @@ static MachineInstr *findLoopInstr(MachineBasicBlock *BB, int EndLoopOp,
}
// The loop set-up instruction will be in a predecessor block
- for (MachineBasicBlock::pred_iterator PB = BB->pred_begin(),
- PE = BB->pred_end(); PB != PE; ++PB) {
+ for (MachineBasicBlock *PB : BB->predecessors()) {
// If this has already been visited, skip it.
- if (!Visited.insert(*PB).second)
+ if (!Visited.insert(PB).second)
continue;
- if (*PB == BB)
+ if (PB == BB)
continue;
- for (MachineBasicBlock::reverse_instr_iterator I = (*PB)->instr_rbegin(),
- E = (*PB)->instr_rend(); I != E; ++I) {
- int Opc = I->getOpcode();
+ for (auto I = PB->instr_rbegin(), E = PB->instr_rend(); I != E; ++I) {
+ unsigned Opc = I->getOpcode();
if (Opc == LOOPi || Opc == LOOPr)
return &*I;
- // We've reached a different loop, which means the loop0 has been removed.
- if (Opc == EndLoopOp)
+ // We've reached a different loop, which means the loop0/1 has been
+ // removed.
+ if (Opc == EndLoopOp && I->getOperand(0).getMBB() != TargetBB)
return nullptr;
}
// Check the predecessors for the LOOP instruction.
- MachineInstr *loop = findLoopInstr(*PB, EndLoopOp, Visited);
- if (loop)
- return loop;
+ if (MachineInstr *Loop = findLoopInstr(PB, EndLoopOp, TargetBB, Visited))
+ return Loop;
}
return nullptr;
}
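
For readers following the traversal: the Visited set is what lets this recursive predecessor walk terminate on cyclic CFGs. A minimal self-contained sketch of the same pattern, with a hypothetical Node type standing in for MachineBasicBlock:

#include <set>
#include <vector>

struct Node {
  std::vector<Node *> Preds;
  bool HasLoopSetup = false; // stand-in for "contains J2_loop0i/J2_loop0r"
};

// Walk predecessors depth-first; the insert().second test skips any
// block we have already visited, so cycles cannot recurse forever.
static Node *findSetup(Node *BB, std::set<Node *> &Visited) {
  for (Node *PB : BB->Preds) {
    if (!Visited.insert(PB).second)
      continue;
    if (PB->HasLoopSetup)
      return PB;
    if (Node *N = findSetup(PB, Visited))
      return N;
  }
  return nullptr;
}
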
@@ -597,7 +596,8 @@ unsigned HexagonInstrInfo::insertBranch(MachineBasicBlock &MBB,
// Since we're adding an ENDLOOP, there better be a LOOP instruction.
// Check for it, and change the BB target if needed.
SmallPtrSet<MachineBasicBlock *, 8> VisitedBBs;
- MachineInstr *Loop = findLoopInstr(TBB, EndLoopOp, VisitedBBs);
+ MachineInstr *Loop = findLoopInstr(TBB, EndLoopOp, Cond[1].getMBB(),
+ VisitedBBs);
assert(Loop != 0 && "Inserting an ENDLOOP without a LOOP");
Loop->getOperand(0).setMBB(TBB);
// Add the ENDLOOP after finding the LOOP0.
@@ -637,7 +637,8 @@ unsigned HexagonInstrInfo::insertBranch(MachineBasicBlock &MBB,
// Since we're adding an ENDLOOP, there better be a LOOP instruction.
// Check for it, and change the BB target if needed.
SmallPtrSet<MachineBasicBlock *, 8> VisitedBBs;
- MachineInstr *Loop = findLoopInstr(TBB, EndLoopOp, VisitedBBs);
+ MachineInstr *Loop = findLoopInstr(TBB, EndLoopOp, Cond[1].getMBB(),
+ VisitedBBs);
assert(Loop != 0 && "Inserting an ENDLOOP without a LOOP");
Loop->getOperand(0).setMBB(TBB);
// Add the ENDLOOP after finding the LOOP0.
@@ -687,7 +688,8 @@ unsigned HexagonInstrInfo::reduceLoopCount(MachineBasicBlock &MBB,
MachineFunction *MF = MBB.getParent();
DebugLoc DL = Cmp.getDebugLoc();
SmallPtrSet<MachineBasicBlock *, 8> VisitedBBs;
- MachineInstr *Loop = findLoopInstr(&MBB, Cmp.getOpcode(), VisitedBBs);
+ MachineInstr *Loop = findLoopInstr(&MBB, Cmp.getOpcode(),
+ Cmp.getOperand(0).getMBB(), VisitedBBs);
if (!Loop)
return 0;
// If the loop trip count is a compile-time value, then just change the
@@ -1074,13 +1076,13 @@ bool HexagonInstrInfo::expandPostRAPseudo(MachineInstr &MI) const {
unsigned Offset = Is128B ? VecOffset << 7 : VecOffset << 6;
MachineInstr *MI1New =
BuildMI(MBB, MI, DL, get(NewOpc))
- .addOperand(MI.getOperand(0))
+ .add(MI.getOperand(0))
.addImm(MI.getOperand(1).getImm())
.addReg(SrcSubLo)
.setMemRefs(MI.memoperands_begin(), MI.memoperands_end());
MI1New->getOperand(0).setIsKill(false);
BuildMI(MBB, MI, DL, get(NewOpc))
- .addOperand(MI.getOperand(0))
+ .add(MI.getOperand(0))
// The Vectors are indexed in multiples of vector size.
.addImm(MI.getOperand(1).getImm() + Offset)
.addReg(SrcSubHi)
@@ -1106,15 +1108,13 @@ bool HexagonInstrInfo::expandPostRAPseudo(MachineInstr &MI) const {
unsigned DstReg = MI.getOperand(0).getReg();
unsigned Offset = Is128B ? VecOffset << 7 : VecOffset << 6;
- MachineInstr *MI1New =
- BuildMI(MBB, MI, DL, get(NewOpc),
- HRI.getSubReg(DstReg, Hexagon::vsub_lo))
- .addOperand(MI.getOperand(1))
- .addImm(MI.getOperand(2).getImm());
+ MachineInstr *MI1New = BuildMI(MBB, MI, DL, get(NewOpc),
+ HRI.getSubReg(DstReg, Hexagon::vsub_lo))
+ .add(MI.getOperand(1))
+ .addImm(MI.getOperand(2).getImm());
MI1New->getOperand(1).setIsKill(false);
- BuildMI(MBB, MI, DL, get(NewOpc),
- HRI.getSubReg(DstReg, Hexagon::vsub_hi))
- .addOperand(MI.getOperand(1))
+ BuildMI(MBB, MI, DL, get(NewOpc), HRI.getSubReg(DstReg, Hexagon::vsub_hi))
+ .add(MI.getOperand(1))
// The Vectors are indexed in multiples of vector size.
.addImm(MI.getOperand(2).getImm() + Offset)
.setMemRefs(MI.memoperands_begin(), MI.memoperands_end());
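
The many .addOperand(...) to .add(...) rewrites in this file track the MachineInstrBuilder rename, where add() copies an existing MachineOperand, flags included, onto the instruction being built. A rough usage sketch, assuming MBB, MI, DL, TII and NewOpc are in scope:

// Clone every operand of an existing instruction onto a replacement;
// add() preserves kill/dead/implicit flags on the copied operand.
llvm::MachineInstrBuilder MIB = BuildMI(MBB, MI, DL, TII->get(NewOpc));
for (const llvm::MachineOperand &MO : MI.operands())
  MIB.add(MO);
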
@@ -1227,18 +1227,18 @@ bool HexagonInstrInfo::expandPostRAPseudo(MachineInstr &MI) const {
bool IsDestLive = !LiveAtMI.available(MRI, Op0.getReg());
if (Op0.getReg() != Op2.getReg()) {
auto T = BuildMI(MBB, MI, DL, get(Hexagon::V6_vcmov))
- .addOperand(Op0)
- .addOperand(Op1)
- .addOperand(Op2);
+ .add(Op0)
+ .add(Op1)
+ .add(Op2);
if (IsDestLive)
T.addReg(Op0.getReg(), RegState::Implicit);
IsDestLive = true;
}
if (Op0.getReg() != Op3.getReg()) {
auto T = BuildMI(MBB, MI, DL, get(Hexagon::V6_vncmov))
- .addOperand(Op0)
- .addOperand(Op1)
- .addOperand(Op3);
+ .add(Op0)
+ .add(Op1)
+ .add(Op3);
if (IsDestLive)
T.addReg(Op0.getReg(), RegState::Implicit);
}
@@ -1259,10 +1259,10 @@ bool HexagonInstrInfo::expandPostRAPseudo(MachineInstr &MI) const {
unsigned SrcLo = HRI.getSubReg(Op2.getReg(), Hexagon::vsub_lo);
unsigned SrcHi = HRI.getSubReg(Op2.getReg(), Hexagon::vsub_hi);
auto T = BuildMI(MBB, MI, DL, get(Hexagon::V6_vccombine))
- .addOperand(Op0)
- .addOperand(Op1)
- .addReg(SrcHi)
- .addReg(SrcLo);
+ .add(Op0)
+ .add(Op1)
+ .addReg(SrcHi)
+ .addReg(SrcLo);
if (IsDestLive)
T.addReg(Op0.getReg(), RegState::Implicit);
IsDestLive = true;
@@ -1271,10 +1271,10 @@ bool HexagonInstrInfo::expandPostRAPseudo(MachineInstr &MI) const {
unsigned SrcLo = HRI.getSubReg(Op3.getReg(), Hexagon::vsub_lo);
unsigned SrcHi = HRI.getSubReg(Op3.getReg(), Hexagon::vsub_hi);
auto T = BuildMI(MBB, MI, DL, get(Hexagon::V6_vnccombine))
- .addOperand(Op0)
- .addOperand(Op1)
- .addReg(SrcHi)
- .addReg(SrcLo);
+ .add(Op0)
+ .add(Op1)
+ .addReg(SrcHi)
+ .addReg(SrcLo);
if (IsDestLive)
T.addReg(Op0.getReg(), RegState::Implicit);
}
@@ -1376,7 +1376,7 @@ bool HexagonInstrInfo::PredicateInstruction(
MachineOperand &Op = MI.getOperand(NOp);
if (!Op.isReg() || !Op.isDef() || Op.isImplicit())
break;
- T.addOperand(Op);
+ T.add(Op);
NOp++;
}
@@ -1386,7 +1386,7 @@ bool HexagonInstrInfo::PredicateInstruction(
assert(GotPredReg);
T.addReg(PredReg, PredRegFlags);
while (NOp < NumOps)
- T.addOperand(MI.getOperand(NOp++));
+ T.add(MI.getOperand(NOp++));
MI.setDesc(get(PredOpc));
while (unsigned n = MI.getNumOperands())
@@ -1413,18 +1413,28 @@ bool HexagonInstrInfo::DefinesPredicate(
auto &HRI = getRegisterInfo();
for (unsigned oper = 0; oper < MI.getNumOperands(); ++oper) {
MachineOperand MO = MI.getOperand(oper);
- if (MO.isReg() && MO.isDef()) {
+ if (MO.isReg()) {
+ if (!MO.isDef())
+ continue;
const TargetRegisterClass* RC = HRI.getMinimalPhysRegClass(MO.getReg());
if (RC == &Hexagon::PredRegsRegClass) {
Pred.push_back(MO);
return true;
}
+ continue;
+ } else if (MO.isRegMask()) {
+ for (unsigned PR : Hexagon::PredRegsRegClass) {
+ if (!MI.modifiesRegister(PR, &HRI))
+ continue;
+ Pred.push_back(MO);
+ return true;
+ }
}
}
return false;
}
-bool HexagonInstrInfo::isPredicable(MachineInstr &MI) const {
+bool HexagonInstrInfo::isPredicable(const MachineInstr &MI) const {
return MI.getDesc().isPredicable();
}
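
Context for the new regmask branch in DefinesPredicate above: a call instruction carries a regmask operand listing the registers it clobbers, and MachineInstr::modifiesRegister() takes such masks into account. A minimal sketch of the same query (the helper name and Regs contents are illustrative):

#include "llvm/ADT/ArrayRef.h"
#include "llvm/CodeGen/MachineInstr.h"
#include "llvm/Target/TargetRegisterInfo.h"

// True if MI, e.g. via a call's regmask, clobbers any of the given
// physical registers; modifiesRegister covers defs and regmask hits.
static bool clobbersAny(const llvm::MachineInstr &MI,
                        const llvm::TargetRegisterInfo &TRI,
                        llvm::ArrayRef<unsigned> Regs) {
  for (unsigned R : Regs)
    if (MI.modifiesRegister(R, &TRI))
      return true;
  return false;
}
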
@@ -1715,7 +1725,7 @@ bool HexagonInstrInfo::isComplex(const MachineInstr &MI) const {
// Return true if the instruction is a compound branch instruction.
bool HexagonInstrInfo::isCompoundBranchInstr(const MachineInstr &MI) const {
- return (getType(MI) == HexagonII::TypeCOMPOUND && MI.isBranch());
+ return getType(MI) == HexagonII::TypeCJ && MI.isBranch();
}
bool HexagonInstrInfo::isCondInst(const MachineInstr &MI) const {
@@ -3009,10 +3019,12 @@ bool HexagonInstrInfo::producesStall(const MachineInstr &MI,
bool HexagonInstrInfo::predCanBeUsedAsDotNew(const MachineInstr &MI,
unsigned PredReg) const {
- for (unsigned opNum = 0; opNum < MI.getNumOperands(); opNum++) {
- const MachineOperand &MO = MI.getOperand(opNum);
+ for (const MachineOperand &MO : MI.operands()) {
+ // Predicate register must be explicitly defined.
+ if (MO.isRegMask() && MO.clobbersPhysReg(PredReg))
+ return false;
if (MO.isReg() && MO.isDef() && MO.isImplicit() && (MO.getReg() == PredReg))
- return false; // Predicate register must be explicitly defined.
+ return false;
}
// Hexagon Programmer's Reference says that decbin, memw_locked, and
@@ -3415,7 +3427,9 @@ int HexagonInstrInfo::getDotNewOp(const MachineInstr &MI) const {
return NVOpcode;
switch (MI.getOpcode()) {
- default: llvm_unreachable("Unknown .new type");
+ default:
+ llvm::report_fatal_error(std::string("Unknown .new type: ") +
+ std::to_string(MI.getOpcode()).c_str());
case Hexagon::S4_storerb_ur:
return Hexagon::S4_storerbnew_ur;
@@ -3456,20 +3470,75 @@ int HexagonInstrInfo::getDotNewOp(const MachineInstr &MI) const {
int HexagonInstrInfo::getDotNewPredJumpOp(const MachineInstr &MI,
const MachineBranchProbabilityInfo *MBPI) const {
// We assume that a block can have at most two successors.
- bool taken = false;
const MachineBasicBlock *Src = MI.getParent();
const MachineOperand &BrTarget = MI.getOperand(1);
- const MachineBasicBlock *Dst = BrTarget.getMBB();
+ bool Taken = false;
+ const BranchProbability OneHalf(1, 2);
- const BranchProbability Prediction = MBPI->getEdgeProbability(Src, Dst);
- if (Prediction >= BranchProbability(1,2))
- taken = true;
+ if (BrTarget.isMBB()) {
+ const MachineBasicBlock *Dst = BrTarget.getMBB();
+ Taken = MBPI->getEdgeProbability(Src, Dst) >= OneHalf;
+ } else {
+ // The branch target is not a basic block (most likely a function).
+ // Since BPI only gives probabilities for targets that are basic blocks,
+ // try to identify another target of this branch (potentially a
+ // fall-through) and check the probability of that target.
+ //
+ // The only handled branch combinations are:
+ // - one conditional branch,
+ // - one conditional branch followed by one unconditional branch.
+ // Otherwise, assume not-taken.
+ assert(MI.isConditionalBranch());
+ const MachineBasicBlock &B = *MI.getParent();
+ bool SawCond = false, Bad = false;
+ for (const MachineInstr &I : B) {
+ if (!I.isBranch())
+ continue;
+ if (I.isConditionalBranch()) {
+ SawCond = true;
+ if (&I != &MI) {
+ Bad = true;
+ break;
+ }
+ }
+ if (I.isUnconditionalBranch() && !SawCond) {
+ Bad = true;
+ break;
+ }
+ }
+ if (!Bad) {
+ MachineBasicBlock::const_instr_iterator It(MI);
+ MachineBasicBlock::const_instr_iterator NextIt = std::next(It);
+ if (NextIt == B.instr_end()) {
+ // If this branch is the last, look for the fall-through block.
+ for (const MachineBasicBlock *SB : B.successors()) {
+ if (!B.isLayoutSuccessor(SB))
+ continue;
+ Taken = MBPI->getEdgeProbability(Src, SB) < OneHalf;
+ break;
+ }
+ } else {
+ assert(NextIt->isUnconditionalBranch());
+ // Find the first MBB operand and assume it's the target.
+ const MachineBasicBlock *BT = nullptr;
+ for (const MachineOperand &Op : NextIt->operands()) {
+ if (!Op.isMBB())
+ continue;
+ BT = Op.getMBB();
+ break;
+ }
+ Taken = BT && MBPI->getEdgeProbability(Src, BT) < OneHalf;
+ }
+ } // if (!Bad)
+ }
+
+ // The Taken flag should be set to something reasonable by this point.
switch (MI.getOpcode()) {
case Hexagon::J2_jumpt:
- return taken ? Hexagon::J2_jumptnewpt : Hexagon::J2_jumptnew;
+ return Taken ? Hexagon::J2_jumptnewpt : Hexagon::J2_jumptnew;
case Hexagon::J2_jumpf:
- return taken ? Hexagon::J2_jumpfnewpt : Hexagon::J2_jumpfnew;
+ return Taken ? Hexagon::J2_jumpfnewpt : Hexagon::J2_jumpfnew;
default:
llvm_unreachable("Unexpected jump instruction.");
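
Once the alternative target is identified, the heuristic above reduces to a single probability comparison. A condensed sketch as a standalone helper (an assumption for illustration, not part of the patch):

#include "llvm/CodeGen/MachineBranchProbabilityInfo.h"
#include "llvm/Support/BranchProbability.h"

// Taken-ness for a conditional branch whose own target is not an MBB:
// if the other target (fall-through or trailing unconditional jump) is
// the likely successor, predict the conditional branch as not-taken.
static bool isProbablyTaken(const llvm::MachineBasicBlock *Src,
                            const llvm::MachineBasicBlock *AltTarget,
                            const llvm::MachineBranchProbabilityInfo &MBPI) {
  const llvm::BranchProbability OneHalf(1, 2);
  return AltTarget && MBPI.getEdgeProbability(Src, AltTarget) < OneHalf;
}
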
@@ -3479,26 +3548,46 @@ int HexagonInstrInfo::getDotNewPredJumpOp(const MachineInstr &MI,
// Return .new predicate version for an instruction.
int HexagonInstrInfo::getDotNewPredOp(const MachineInstr &MI,
const MachineBranchProbabilityInfo *MBPI) const {
- int NewOpcode = Hexagon::getPredNewOpcode(MI.getOpcode());
- if (NewOpcode >= 0) // Valid predicate new instruction
- return NewOpcode;
-
switch (MI.getOpcode()) {
// Conditional Jumps
case Hexagon::J2_jumpt:
case Hexagon::J2_jumpf:
return getDotNewPredJumpOp(MI, MBPI);
-
- default:
- assert(0 && "Unknown .new type");
}
- return 0;
+
+ int NewOpcode = Hexagon::getPredNewOpcode(MI.getOpcode());
+ if (NewOpcode >= 0)
+ return NewOpcode;
+
+ dbgs() << "Cannot convert to .new: " << getName(MI.getOpcode()) << '\n';
+ llvm_unreachable(nullptr);
}
-int HexagonInstrInfo::getDotOldOp(const int opc) const {
- int NewOp = opc;
+int HexagonInstrInfo::getDotOldOp(const MachineInstr &MI) const {
+ int NewOp = MI.getOpcode();
if (isPredicated(NewOp) && isPredicatedNew(NewOp)) { // Get predicate old form
NewOp = Hexagon::getPredOldOpcode(NewOp);
+ const MachineFunction &MF = *MI.getParent()->getParent();
+ const HexagonSubtarget &HST = MF.getSubtarget<HexagonSubtarget>();
+ // All Hexagon architectures have prediction bits on dot-new branches,
+ // but only Hexagon V60+ has prediction bits on dot-old ones. Make sure
+ // to pick the right opcode when converting back to dot-old.
+ if (!HST.getFeatureBits()[Hexagon::ArchV60]) {
+ switch (NewOp) {
+ case Hexagon::J2_jumptpt:
+ NewOp = Hexagon::J2_jumpt;
+ break;
+ case Hexagon::J2_jumpfpt:
+ NewOp = Hexagon::J2_jumpf;
+ break;
+ case Hexagon::J2_jumprtpt:
+ NewOp = Hexagon::J2_jumprt;
+ break;
+ case Hexagon::J2_jumprfpt:
+ NewOp = Hexagon::J2_jumprf;
+ break;
+ }
+ }
assert(NewOp >= 0 &&
"Couldn't change predicate new instruction to its old form.");
}
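
A worked example of the downgrade path, assuming getPredOldOpcode maps hinted dot-new jumps to their hinted dot-old forms (that mapping is an assumption; only the switch above is from the patch):

// Assumed: J2_jumptnewpt --getPredOldOpcode--> J2_jumptpt. On a
// pre-V60 subtarget the switch above then strips the taken hint,
// so the final dot-old opcode is plain J2_jumpt.
int Old = Hexagon::getPredOldOpcode(Hexagon::J2_jumptnewpt);
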
diff --git a/lib/Target/Hexagon/HexagonInstrInfo.h b/lib/Target/Hexagon/HexagonInstrInfo.h
index 2358d4b7e4c0..b268c7a28171 100644
--- a/lib/Target/Hexagon/HexagonInstrInfo.h
+++ b/lib/Target/Hexagon/HexagonInstrInfo.h
@@ -235,7 +235,7 @@ public:
/// Return true if the specified instruction can be predicated.
/// By default, this returns true for every instruction with a
/// PredicateOperand.
- bool isPredicable(MachineInstr &MI) const override;
+ bool isPredicable(const MachineInstr &MI) const override;
/// Test if the given instruction should be considered a scheduling boundary.
/// This primarily includes labels and terminators.
@@ -404,7 +404,7 @@ public:
const MachineBranchProbabilityInfo *MBPI) const;
int getDotNewPredOp(const MachineInstr &MI,
const MachineBranchProbabilityInfo *MBPI) const;
- int getDotOldOp(const int opc) const;
+ int getDotOldOp(const MachineInstr &MI) const;
HexagonII::SubInstructionGroup getDuplexCandidateGroup(const MachineInstr &MI)
const;
short getEquivalentHWInstr(const MachineInstr &MI) const;
diff --git a/lib/Target/Hexagon/HexagonInstrInfo.td b/lib/Target/Hexagon/HexagonInstrInfo.td
deleted file mode 100644
index c5719ad5b6d8..000000000000
--- a/lib/Target/Hexagon/HexagonInstrInfo.td
+++ /dev/null
@@ -1,4799 +0,0 @@
-//==- HexagonInstrInfo.td - Target Description for Hexagon -*- tablegen -*-===//
-//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
-//
-//===----------------------------------------------------------------------===//
-//
-// This file describes the Hexagon instructions in TableGen format.
-//
-//===----------------------------------------------------------------------===//
-
-include "HexagonInstrFormats.td"
-include "HexagonOperands.td"
-include "HexagonInstrEnc.td"
-
-//===----------------------------------------------------------------------===//
-// Compare
-//===----------------------------------------------------------------------===//
-let hasSideEffects = 0, isCompare = 1, InputType = "imm", isExtendable = 1,
- opExtendable = 2 in
-class T_CMP <string mnemonic, bits<2> MajOp, bit isNot, Operand ImmOp>
- : ALU32Inst <(outs PredRegs:$dst),
- (ins IntRegs:$src1, ImmOp:$src2),
- "$dst = "#!if(isNot, "!","")#mnemonic#"($src1, #$src2)",
- [], "",ALU32_2op_tc_2early_SLOT0123 >, ImmRegRel {
- bits<2> dst;
- bits<5> src1;
- bits<10> src2;
- let CextOpcode = mnemonic;
- let opExtentBits = !if(!eq(mnemonic, "cmp.gtu"), 9, 10);
- let isExtentSigned = !if(!eq(mnemonic, "cmp.gtu"), 0, 1);
-
- let IClass = 0b0111;
-
- let Inst{27-24} = 0b0101;
- let Inst{23-22} = MajOp;
- let Inst{21} = !if(!eq(mnemonic, "cmp.gtu"), 0, src2{9});
- let Inst{20-16} = src1;
- let Inst{13-5} = src2{8-0};
- let Inst{4} = isNot;
- let Inst{3-2} = 0b00;
- let Inst{1-0} = dst;
- }
-
-def C2_cmpeqi : T_CMP <"cmp.eq", 0b00, 0, s10_0Ext>;
-def C2_cmpgti : T_CMP <"cmp.gt", 0b01, 0, s10_0Ext>;
-def C2_cmpgtui : T_CMP <"cmp.gtu", 0b10, 0, u9_0Ext>;
-
-//===----------------------------------------------------------------------===//
-// ALU32/ALU +
-//===----------------------------------------------------------------------===//
-// Add.
-
-let hasSideEffects = 0, hasNewValue = 1, InputType = "reg" in
-class T_ALU32_3op<string mnemonic, bits<3> MajOp, bits<3> MinOp, bit OpsRev,
- bit IsComm>
- : ALU32_rr<(outs IntRegs:$Rd), (ins IntRegs:$Rs, IntRegs:$Rt),
- "$Rd = "#mnemonic#"($Rs, $Rt)",
- [], "", ALU32_3op_tc_1_SLOT0123>, ImmRegRel, PredRel {
- let isCommutable = IsComm;
- let BaseOpcode = mnemonic#_rr;
- let CextOpcode = mnemonic;
-
- bits<5> Rs;
- bits<5> Rt;
- bits<5> Rd;
-
- let IClass = 0b1111;
- let Inst{27} = 0b0;
- let Inst{26-24} = MajOp;
- let Inst{23-21} = MinOp;
- let Inst{20-16} = !if(OpsRev,Rt,Rs);
- let Inst{12-8} = !if(OpsRev,Rs,Rt);
- let Inst{4-0} = Rd;
-}
-
-let hasSideEffects = 0, hasNewValue = 1 in
-class T_ALU32_3op_pred<string mnemonic, bits<3> MajOp, bits<3> MinOp,
- bit OpsRev, bit PredNot, bit PredNew>
- : ALU32_rr<(outs IntRegs:$Rd), (ins PredRegs:$Pu, IntRegs:$Rs, IntRegs:$Rt),
- "if ("#!if(PredNot,"!","")#"$Pu"#!if(PredNew,".new","")#") "#
- "$Rd = "#mnemonic#"($Rs, $Rt)",
- [], "", ALU32_3op_tc_1_SLOT0123>, ImmRegRel, PredNewRel {
- let isPredicated = 1;
- let isPredicatedFalse = PredNot;
- let isPredicatedNew = PredNew;
- let BaseOpcode = mnemonic#_rr;
- let CextOpcode = mnemonic;
-
- bits<2> Pu;
- bits<5> Rs;
- bits<5> Rt;
- bits<5> Rd;
-
- let IClass = 0b1111;
- let Inst{27} = 0b1;
- let Inst{26-24} = MajOp;
- let Inst{23-21} = MinOp;
- let Inst{20-16} = !if(OpsRev,Rt,Rs);
- let Inst{13} = PredNew;
- let Inst{12-8} = !if(OpsRev,Rs,Rt);
- let Inst{7} = PredNot;
- let Inst{6-5} = Pu;
- let Inst{4-0} = Rd;
-}
-
-class T_ALU32_combineh<string Op1, string Op2, bits<3> MajOp, bits<3> MinOp,
- bit OpsRev>
- : T_ALU32_3op<"", MajOp, MinOp, OpsRev, 0> {
- let AsmString = "$Rd = combine($Rs"#Op1#", $Rt"#Op2#")";
-}
-
-def A2_combine_hh : T_ALU32_combineh<".h", ".h", 0b011, 0b100, 1>;
-def A2_combine_hl : T_ALU32_combineh<".h", ".l", 0b011, 0b101, 1>;
-def A2_combine_lh : T_ALU32_combineh<".l", ".h", 0b011, 0b110, 1>;
-def A2_combine_ll : T_ALU32_combineh<".l", ".l", 0b011, 0b111, 1>;
-
-class T_ALU32_3op_sfx<string mnemonic, string suffix, bits<3> MajOp,
- bits<3> MinOp, bit OpsRev, bit IsComm>
- : T_ALU32_3op<"", MajOp, MinOp, OpsRev, IsComm> {
- let AsmString = "$Rd = "#mnemonic#"($Rs, $Rt)"#suffix;
-}
-
-def A2_svaddh : T_ALU32_3op<"vaddh", 0b110, 0b000, 0, 1>;
-def A2_svsubh : T_ALU32_3op<"vsubh", 0b110, 0b100, 1, 0>;
-
-let Defs = [USR_OVF], Itinerary = ALU32_3op_tc_2_SLOT0123 in {
- def A2_svaddhs : T_ALU32_3op_sfx<"vaddh", ":sat", 0b110, 0b001, 0, 1>;
- def A2_addsat : T_ALU32_3op_sfx<"add", ":sat", 0b110, 0b010, 0, 1>;
- def A2_svadduhs : T_ALU32_3op_sfx<"vadduh", ":sat", 0b110, 0b011, 0, 1>;
- def A2_svsubhs : T_ALU32_3op_sfx<"vsubh", ":sat", 0b110, 0b101, 1, 0>;
- def A2_subsat : T_ALU32_3op_sfx<"sub", ":sat", 0b110, 0b110, 1, 0>;
- def A2_svsubuhs : T_ALU32_3op_sfx<"vsubuh", ":sat", 0b110, 0b111, 1, 0>;
-}
-
-let Itinerary = ALU32_3op_tc_2_SLOT0123 in
-def A2_svavghs : T_ALU32_3op_sfx<"vavgh", ":rnd", 0b111, 0b001, 0, 1>;
-
-def A2_svavgh : T_ALU32_3op<"vavgh", 0b111, 0b000, 0, 1>;
-def A2_svnavgh : T_ALU32_3op<"vnavgh", 0b111, 0b011, 1, 0>;
-
-multiclass T_ALU32_3op_p<string mnemonic, bits<3> MajOp, bits<3> MinOp,
- bit OpsRev> {
- def t : T_ALU32_3op_pred<mnemonic, MajOp, MinOp, OpsRev, 0, 0>;
- def f : T_ALU32_3op_pred<mnemonic, MajOp, MinOp, OpsRev, 1, 0>;
- def tnew : T_ALU32_3op_pred<mnemonic, MajOp, MinOp, OpsRev, 0, 1>;
- def fnew : T_ALU32_3op_pred<mnemonic, MajOp, MinOp, OpsRev, 1, 1>;
-}
-
-multiclass T_ALU32_3op_A2<string mnemonic, bits<3> MajOp, bits<3> MinOp,
- bit OpsRev, bit IsComm> {
- let isPredicable = 1 in
- def A2_#NAME : T_ALU32_3op <mnemonic, MajOp, MinOp, OpsRev, IsComm>;
- defm A2_p#NAME : T_ALU32_3op_p<mnemonic, MajOp, MinOp, OpsRev>;
-}
-
-defm add : T_ALU32_3op_A2<"add", 0b011, 0b000, 0, 1>;
-defm and : T_ALU32_3op_A2<"and", 0b001, 0b000, 0, 1>;
-defm or : T_ALU32_3op_A2<"or", 0b001, 0b001, 0, 1>;
-defm sub : T_ALU32_3op_A2<"sub", 0b011, 0b001, 1, 0>;
-defm xor : T_ALU32_3op_A2<"xor", 0b001, 0b011, 0, 1>;
-
-// A few special cases producing register pairs:
-let OutOperandList = (outs DoubleRegs:$Rd), hasNewValue = 0 in {
- def S2_packhl : T_ALU32_3op <"packhl", 0b101, 0b100, 0, 0>;
-
- let isPredicable = 1 in
- def A2_combinew : T_ALU32_3op <"combine", 0b101, 0b000, 0, 0>;
-
- // Conditional combinew uses "newt/f" instead of "t/fnew".
- def C2_ccombinewt : T_ALU32_3op_pred<"combine", 0b101, 0b000, 0, 0, 0>;
- def C2_ccombinewf : T_ALU32_3op_pred<"combine", 0b101, 0b000, 0, 1, 0>;
- def C2_ccombinewnewt : T_ALU32_3op_pred<"combine", 0b101, 0b000, 0, 0, 1>;
- def C2_ccombinewnewf : T_ALU32_3op_pred<"combine", 0b101, 0b000, 0, 1, 1>;
-}
-
-let hasSideEffects = 0, hasNewValue = 1, isCompare = 1, InputType = "reg" in
-class T_ALU32_3op_cmp<string mnemonic, bits<2> MinOp, bit IsNeg, bit IsComm>
- : ALU32_rr<(outs PredRegs:$Pd), (ins IntRegs:$Rs, IntRegs:$Rt),
- "$Pd = "#mnemonic#"($Rs, $Rt)",
- [], "", ALU32_3op_tc_1_SLOT0123>, ImmRegRel {
- let CextOpcode = mnemonic;
- let isCommutable = IsComm;
- bits<5> Rs;
- bits<5> Rt;
- bits<2> Pd;
-
- let IClass = 0b1111;
- let Inst{27-24} = 0b0010;
- let Inst{22-21} = MinOp;
- let Inst{20-16} = Rs;
- let Inst{12-8} = Rt;
- let Inst{4} = IsNeg;
- let Inst{3-2} = 0b00;
- let Inst{1-0} = Pd;
-}
-
-let Itinerary = ALU32_3op_tc_2early_SLOT0123 in {
- def C2_cmpeq : T_ALU32_3op_cmp< "cmp.eq", 0b00, 0, 1>;
- def C2_cmpgt : T_ALU32_3op_cmp< "cmp.gt", 0b10, 0, 0>;
- def C2_cmpgtu : T_ALU32_3op_cmp< "cmp.gtu", 0b11, 0, 0>;
-}
-
-let CextOpcode = "MUX", InputType = "reg", hasNewValue = 1 in
-def C2_mux: ALU32_rr<(outs IntRegs:$Rd),
- (ins PredRegs:$Pu, IntRegs:$Rs, IntRegs:$Rt),
- "$Rd = mux($Pu, $Rs, $Rt)", [], "", ALU32_3op_tc_1_SLOT0123>, ImmRegRel {
- bits<5> Rd;
- bits<2> Pu;
- bits<5> Rs;
- bits<5> Rt;
-
- let CextOpcode = "mux";
- let InputType = "reg";
- let hasSideEffects = 0;
- let IClass = 0b1111;
-
- let Inst{27-24} = 0b0100;
- let Inst{20-16} = Rs;
- let Inst{12-8} = Rt;
- let Inst{6-5} = Pu;
- let Inst{4-0} = Rd;
-}
-
-// Combines the two immediates into a double register.
-// Increase complexity to make it greater than any complexity of a combine
-// that involves a register.
-
-let isReMaterializable = 1, isMoveImm = 1, isAsCheapAsAMove = 1,
- isExtentSigned = 1, isExtendable = 1, opExtentBits = 8, opExtendable = 1,
- AddedComplexity = 75 in
-def A2_combineii: ALU32Inst <(outs DoubleRegs:$Rdd), (ins s8_0Ext:$s8, s8_0Imm:$S8),
- "$Rdd = combine(#$s8, #$S8)",
- []> {
- bits<5> Rdd;
- bits<8> s8;
- bits<8> S8;
-
- let IClass = 0b0111;
- let Inst{27-23} = 0b11000;
- let Inst{22-16} = S8{7-1};
- let Inst{13} = S8{0};
- let Inst{12-5} = s8;
- let Inst{4-0} = Rdd;
- }
-
-//===----------------------------------------------------------------------===//
-// Template class for predicated ADD of a reg and an Immediate value.
-//===----------------------------------------------------------------------===//
-let hasNewValue = 1, hasSideEffects = 0 in
-class T_Addri_Pred <bit PredNot, bit PredNew>
- : ALU32_ri <(outs IntRegs:$Rd),
- (ins PredRegs:$Pu, IntRegs:$Rs, s8_0Ext:$s8),
- !if(PredNot, "if (!$Pu", "if ($Pu")#!if(PredNew,".new) $Rd = ",
- ") $Rd = ")#"add($Rs, #$s8)"> {
- bits<5> Rd;
- bits<2> Pu;
- bits<5> Rs;
- bits<8> s8;
-
- let isPredicatedNew = PredNew;
- let IClass = 0b0111;
-
- let Inst{27-24} = 0b0100;
- let Inst{23} = PredNot;
- let Inst{22-21} = Pu;
- let Inst{20-16} = Rs;
- let Inst{13} = PredNew;
- let Inst{12-5} = s8;
- let Inst{4-0} = Rd;
- }
-
-//===----------------------------------------------------------------------===//
-// A2_addi: Add a signed immediate to a register.
-//===----------------------------------------------------------------------===//
-let hasNewValue = 1, hasSideEffects = 0 in
-class T_Addri <Operand immOp>
- : ALU32_ri <(outs IntRegs:$Rd),
- (ins IntRegs:$Rs, immOp:$s16),
- "$Rd = add($Rs, #$s16)", [], "", ALU32_ADDI_tc_1_SLOT0123> {
- bits<5> Rd;
- bits<5> Rs;
- bits<16> s16;
-
- let IClass = 0b1011;
-
- let Inst{27-21} = s16{15-9};
- let Inst{20-16} = Rs;
- let Inst{13-5} = s16{8-0};
- let Inst{4-0} = Rd;
- }
-
-//===----------------------------------------------------------------------===//
-// Multiclass for ADD of a register and an immediate value.
-//===----------------------------------------------------------------------===//
-multiclass Addri_Pred<string mnemonic, bit PredNot> {
- let isPredicatedFalse = PredNot in {
- def NAME : T_Addri_Pred<PredNot, 0>;
- // Predicate new
- def NAME#new : T_Addri_Pred<PredNot, 1>;
- }
-}
-
-let isExtendable = 1, isExtentSigned = 1, InputType = "imm" in
-multiclass Addri_base<string mnemonic, SDNode OpNode> {
- let CextOpcode = mnemonic, BaseOpcode = mnemonic#_ri in {
- let opExtendable = 2, opExtentBits = 16, isPredicable = 1, isAdd = 1 in
- def A2_#NAME : T_Addri<s16_0Ext>;
-
- let opExtendable = 3, opExtentBits = 8, isPredicated = 1 in {
- defm A2_p#NAME#t : Addri_Pred<mnemonic, 0>;
- defm A2_p#NAME#f : Addri_Pred<mnemonic, 1>;
- }
- }
-}
-
-defm addi : Addri_base<"add", add>, ImmRegRel, PredNewRel;
-
-let hasNewValue = 1, hasSideEffects = 0, isPseudo = 1 in
-def A2_iconst
- : ALU32_ri <(outs IntRegs:$Rd),
- (ins s23_2Imm:$s23_2),
- "$Rd = iconst(#$s23_2)"> {}
-
-//===----------------------------------------------------------------------===//
-// Template class used for the following ALU32 instructions.
-// Rd=and(Rs,#s10)
-// Rd=or(Rs,#s10)
-//===----------------------------------------------------------------------===//
-let isExtendable = 1, opExtendable = 2, isExtentSigned = 1, opExtentBits = 10,
-InputType = "imm", hasNewValue = 1 in
-class T_ALU32ri_logical <string mnemonic, SDNode OpNode, bits<2> MinOp>
- : ALU32_ri <(outs IntRegs:$Rd),
- (ins IntRegs:$Rs, s10_0Ext:$s10),
- "$Rd = "#mnemonic#"($Rs, #$s10)" ,
- []> {
- bits<5> Rd;
- bits<5> Rs;
- bits<10> s10;
- let CextOpcode = mnemonic;
-
- let IClass = 0b0111;
-
- let Inst{27-24} = 0b0110;
- let Inst{23-22} = MinOp;
- let Inst{21} = s10{9};
- let Inst{20-16} = Rs;
- let Inst{13-5} = s10{8-0};
- let Inst{4-0} = Rd;
- }
-
-def A2_orir : T_ALU32ri_logical<"or", or, 0b10>, ImmRegRel;
-def A2_andir : T_ALU32ri_logical<"and", and, 0b00>, ImmRegRel;
-
-// Subtract register from immediate
-// Rd32=sub(#s10,Rs32)
-let isExtendable = 1, CextOpcode = "sub", opExtendable = 1, isExtentSigned = 1,
- opExtentBits = 10, InputType = "imm", hasNewValue = 1, hasSideEffects = 0 in
-def A2_subri: ALU32_ri <(outs IntRegs:$Rd), (ins s10_0Ext:$s10, IntRegs:$Rs),
- "$Rd = sub(#$s10, $Rs)", []>, ImmRegRel {
- bits<5> Rd;
- bits<10> s10;
- bits<5> Rs;
-
- let IClass = 0b0111;
-
- let Inst{27-22} = 0b011001;
- let Inst{21} = s10{9};
- let Inst{20-16} = Rs;
- let Inst{13-5} = s10{8-0};
- let Inst{4-0} = Rd;
- }
-
-// Nop.
-let hasSideEffects = 0 in
-def A2_nop: ALU32Inst <(outs), (ins), "nop" > {
- let IClass = 0b0111;
- let Inst{27-24} = 0b1111;
-}
-
-let hasSideEffects = 0, hasNewValue = 1 in
-class T_tfr16<bit isHi>
- : ALU32Inst <(outs IntRegs:$Rx), (ins IntRegs:$src1, u16_0Imm:$u16),
- "$Rx"#!if(isHi, ".h", ".l")#" = #$u16",
- [], "$src1 = $Rx" > {
- bits<5> Rx;
- bits<16> u16;
-
- let IClass = 0b0111;
- let Inst{27-26} = 0b00;
- let Inst{25-24} = !if(isHi, 0b10, 0b01);
- let Inst{23-22} = u16{15-14};
- let Inst{21} = 0b1;
- let Inst{20-16} = Rx;
- let Inst{13-0} = u16{13-0};
- }
-
-def A2_tfril: T_tfr16<0>;
-def A2_tfrih: T_tfr16<1>;
-
-// Conditional transfer is an alias to conditional "Rd = add(Rs, #0)".
-let isPredicated = 1, hasNewValue = 1, opNewValue = 0 in
-class T_tfr_pred<bit isPredNot, bit isPredNew>
- : ALU32Inst<(outs IntRegs:$dst),
- (ins PredRegs:$src1, IntRegs:$src2),
- "if ("#!if(isPredNot, "!", "")#
- "$src1"#!if(isPredNew, ".new", "")#
- ") $dst = $src2"> {
- bits<5> dst;
- bits<2> src1;
- bits<5> src2;
-
- let isPredicatedFalse = isPredNot;
- let isPredicatedNew = isPredNew;
- let IClass = 0b0111;
-
- let Inst{27-24} = 0b0100;
- let Inst{23} = isPredNot;
- let Inst{13} = isPredNew;
- let Inst{12-5} = 0;
- let Inst{4-0} = dst;
- let Inst{22-21} = src1;
- let Inst{20-16} = src2;
- }
-
-let isPredicable = 1 in
-class T_tfr : ALU32Inst<(outs IntRegs:$dst), (ins IntRegs:$src),
- "$dst = $src"> {
- bits<5> dst;
- bits<5> src;
-
- let IClass = 0b0111;
-
- let Inst{27-21} = 0b0000011;
- let Inst{20-16} = src;
- let Inst{13} = 0b0;
- let Inst{4-0} = dst;
- }
-
-let InputType = "reg", hasNewValue = 1, hasSideEffects = 0 in
-multiclass tfr_base<string CextOp> {
- let CextOpcode = CextOp, BaseOpcode = CextOp in {
- def NAME : T_tfr;
-
- // Predicate
- def t : T_tfr_pred<0, 0>;
- def f : T_tfr_pred<1, 0>;
- // Predicate new
- def tnew : T_tfr_pred<0, 1>;
- def fnew : T_tfr_pred<1, 1>;
- }
-}
-
-// Assembler mapped to C2_ccombinew[t|f|newt|newf].
-// Please don't add bits to this instruction as it'll be converted into
-// 'combine' before object code emission.
-let isPredicated = 1 in
-class T_tfrp_pred<bit PredNot, bit PredNew>
- : ALU32_rr <(outs DoubleRegs:$dst),
- (ins PredRegs:$src1, DoubleRegs:$src2),
- "if ("#!if(PredNot, "!", "")#"$src1"
- #!if(PredNew, ".new", "")#") $dst = $src2" > {
- let isPredicatedFalse = PredNot;
- let isPredicatedNew = PredNew;
- }
-
-// Assembler mapped to A2_combinew.
-// Please don't add bits to this instruction as it'll be converted into
-// 'combine' before object code emission.
-class T_tfrp : ALU32Inst <(outs DoubleRegs:$dst),
- (ins DoubleRegs:$src),
- "$dst = $src">;
-
-let hasSideEffects = 0 in
-multiclass TFR64_base<string BaseName> {
- let BaseOpcode = BaseName in {
- let isPredicable = 1 in
- def NAME : T_tfrp;
- // Predicate
- def t : T_tfrp_pred <0, 0>;
- def f : T_tfrp_pred <1, 0>;
- // Predicate new
- def tnew : T_tfrp_pred <0, 1>;
- def fnew : T_tfrp_pred <1, 1>;
- }
-}
-
-let InputType = "imm", isExtendable = 1, isExtentSigned = 1, opExtentBits = 12,
- isMoveImm = 1, opExtendable = 2, BaseOpcode = "TFRI", CextOpcode = "TFR",
- hasSideEffects = 0, isPredicated = 1, hasNewValue = 1 in
-class T_TFRI_Pred<bit PredNot, bit PredNew>
- : ALU32_ri<(outs IntRegs:$Rd), (ins PredRegs:$Pu, s12_0Ext:$s12),
- "if ("#!if(PredNot,"!","")#"$Pu"#!if(PredNew,".new","")#") $Rd = #$s12",
- [], "", ALU32_2op_tc_1_SLOT0123>, ImmRegRel, PredNewRel {
- let isPredicatedFalse = PredNot;
- let isPredicatedNew = PredNew;
-
- bits<5> Rd;
- bits<2> Pu;
- bits<12> s12;
-
- let IClass = 0b0111;
- let Inst{27-24} = 0b1110;
- let Inst{23} = PredNot;
- let Inst{22-21} = Pu;
- let Inst{20} = 0b0;
- let Inst{19-16,12-5} = s12;
- let Inst{13} = PredNew;
- let Inst{4-0} = Rd;
-}
-
-def C2_cmoveit : T_TFRI_Pred<0, 0>;
-def C2_cmoveif : T_TFRI_Pred<1, 0>;
-def C2_cmovenewit : T_TFRI_Pred<0, 1>;
-def C2_cmovenewif : T_TFRI_Pred<1, 1>;
-
-let InputType = "imm", isExtendable = 1, isExtentSigned = 1,
- CextOpcode = "TFR", BaseOpcode = "TFRI", hasNewValue = 1, opNewValue = 0,
- isAsCheapAsAMove = 1 , opExtendable = 1, opExtentBits = 16, isMoveImm = 1,
- isPredicated = 0, isPredicable = 1, isReMaterializable = 1 in
-def A2_tfrsi : ALU32Inst<(outs IntRegs:$Rd), (ins s16_0Ext:$s16), "$Rd = #$s16",
- [], "", ALU32_2op_tc_1_SLOT0123>,
- ImmRegRel, PredRel {
- bits<5> Rd;
- bits<16> s16;
-
- let IClass = 0b0111;
- let Inst{27-24} = 0b1000;
- let Inst{23-22,20-16,13-5} = s16;
- let Inst{4-0} = Rd;
-}
-
-defm A2_tfr : tfr_base<"TFR">, ImmRegRel, PredNewRel;
-let isAsmParserOnly = 1 in
-defm A2_tfrp : TFR64_base<"TFR64">, PredNewRel;
-
-// Assembler mapped
-let isReMaterializable = 1, isMoveImm = 1, isAsCheapAsAMove = 1,
- isAsmParserOnly = 1 in
-def A2_tfrpi : ALU64_rr<(outs DoubleRegs:$dst), (ins s8_0Imm64:$src1),
- "$dst = #$src1",
- []>;
-
-// TODO: see if this instruction can be deleted.
-let isExtendable = 1, opExtendable = 1, opExtentBits = 6,
- isAsmParserOnly = 1 in {
-def TFRI64_V4 : ALU64_rr<(outs DoubleRegs:$dst), (ins u64_0Imm:$src1),
- "$dst = #$src1">;
-def TFRI64_V2_ext : ALU64_rr<(outs DoubleRegs:$dst),
- (ins s8_0Ext:$src1, s8_0Imm:$src2),
- "$dst = combine(##$src1, #$src2)">;
-}
-
-//===----------------------------------------------------------------------===//
-// ALU32/ALU -
-//===----------------------------------------------------------------------===//
-
-
-//===----------------------------------------------------------------------===//
-// ALU32/PERM +
-//===----------------------------------------------------------------------===//
-// Scalar mux register immediate.
-let hasSideEffects = 0, isExtentSigned = 1, CextOpcode = "MUX",
- InputType = "imm", hasNewValue = 1, isExtendable = 1, opExtentBits = 8 in
-class T_MUX1 <bit MajOp, dag ins, string AsmStr>
- : ALU32Inst <(outs IntRegs:$Rd), ins, AsmStr>, ImmRegRel {
- bits<5> Rd;
- bits<2> Pu;
- bits<8> s8;
- bits<5> Rs;
-
- let IClass = 0b0111;
- let Inst{27-24} = 0b0011;
- let Inst{23} = MajOp;
- let Inst{22-21} = Pu;
- let Inst{20-16} = Rs;
- let Inst{13} = 0b0;
- let Inst{12-5} = s8;
- let Inst{4-0} = Rd;
-}
-
-let opExtendable = 2 in
-def C2_muxri : T_MUX1<0b1, (ins PredRegs:$Pu, s8_0Ext:$s8, IntRegs:$Rs),
- "$Rd = mux($Pu, #$s8, $Rs)">;
-
-let opExtendable = 3 in
-def C2_muxir : T_MUX1<0b0, (ins PredRegs:$Pu, IntRegs:$Rs, s8_0Ext:$s8),
- "$Rd = mux($Pu, $Rs, #$s8)">;
-
-// C2_muxii: Scalar mux immediates.
-let isExtentSigned = 1, hasNewValue = 1, isExtendable = 1,
- opExtentBits = 8, opExtendable = 2 in
-def C2_muxii: ALU32Inst <(outs IntRegs:$Rd),
- (ins PredRegs:$Pu, s8_0Ext:$s8, s8_0Imm:$S8),
- "$Rd = mux($Pu, #$s8, #$S8)" ,
- []> {
- bits<5> Rd;
- bits<2> Pu;
- bits<8> s8;
- bits<8> S8;
-
- let IClass = 0b0111;
-
- let Inst{27-25} = 0b101;
- let Inst{24-23} = Pu;
- let Inst{22-16} = S8{7-1};
- let Inst{13} = S8{0};
- let Inst{12-5} = s8;
- let Inst{4-0} = Rd;
- }
-
-let isCodeGenOnly = 1, isPseudo = 1 in
-def PS_pselect : ALU64_rr<(outs DoubleRegs:$Rd),
- (ins PredRegs:$Pu, DoubleRegs:$Rs, DoubleRegs:$Rt),
- ".error \"should not emit\" ", []>;
-
-
-//===----------------------------------------------------------------------===//
-// template class for non-predicated alu32_2op instructions
-// - aslh, asrh, sxtb, sxth, zxth
-//===----------------------------------------------------------------------===//
-let hasNewValue = 1, opNewValue = 0 in
-class T_ALU32_2op <string mnemonic, bits<3> minOp> :
- ALU32Inst <(outs IntRegs:$Rd), (ins IntRegs:$Rs),
- "$Rd = "#mnemonic#"($Rs)", [] > {
- bits<5> Rd;
- bits<5> Rs;
-
- let IClass = 0b0111;
-
- let Inst{27-24} = 0b0000;
- let Inst{23-21} = minOp;
- let Inst{13} = 0b0;
- let Inst{4-0} = Rd;
- let Inst{20-16} = Rs;
-}
-
-//===----------------------------------------------------------------------===//
-// template class for predicated alu32_2op instructions
-// - aslh, asrh, sxtb, sxth, zxtb, zxth
-//===----------------------------------------------------------------------===//
-let hasSideEffects = 0, hasNewValue = 1, opNewValue = 0 in
-class T_ALU32_2op_Pred <string mnemonic, bits<3> minOp, bit isPredNot,
- bit isPredNew > :
- ALU32Inst <(outs IntRegs:$Rd), (ins PredRegs:$Pu, IntRegs:$Rs),
- !if(isPredNot, "if (!$Pu", "if ($Pu")
- #!if(isPredNew, ".new) ",") ")#"$Rd = "#mnemonic#"($Rs)"> {
- bits<5> Rd;
- bits<2> Pu;
- bits<5> Rs;
-
- let IClass = 0b0111;
-
- let Inst{27-24} = 0b0000;
- let Inst{23-21} = minOp;
- let Inst{13} = 0b1;
- let Inst{11} = isPredNot;
- let Inst{10} = isPredNew;
- let Inst{4-0} = Rd;
- let Inst{9-8} = Pu;
- let Inst{20-16} = Rs;
-}
-
-multiclass ALU32_2op_Pred<string mnemonic, bits<3> minOp, bit PredNot> {
- let isPredicatedFalse = PredNot in {
- def NAME : T_ALU32_2op_Pred<mnemonic, minOp, PredNot, 0>;
-
- // Predicate new
- let isPredicatedNew = 1 in
- def NAME#new : T_ALU32_2op_Pred<mnemonic, minOp, PredNot, 1>;
- }
-}
-
-multiclass ALU32_2op_base<string mnemonic, bits<3> minOp> {
- let BaseOpcode = mnemonic in {
- let isPredicable = 1, hasSideEffects = 0 in
- def A2_#NAME : T_ALU32_2op<mnemonic, minOp>;
-
- let isPredicated = 1, hasSideEffects = 0 in {
- defm A4_p#NAME#t : ALU32_2op_Pred<mnemonic, minOp, 0>;
- defm A4_p#NAME#f : ALU32_2op_Pred<mnemonic, minOp, 1>;
- }
- }
-}
-
-defm aslh : ALU32_2op_base<"aslh", 0b000>, PredNewRel;
-defm asrh : ALU32_2op_base<"asrh", 0b001>, PredNewRel;
-defm sxtb : ALU32_2op_base<"sxtb", 0b101>, PredNewRel;
-defm sxth : ALU32_2op_base<"sxth", 0b111>, PredNewRel;
-defm zxth : ALU32_2op_base<"zxth", 0b110>, PredNewRel;
-
-// Rd=zxtb(Rs): assembler mapped to Rd=and(Rs,#255).
-// The compiler would want to generate 'zxtb' instead of 'and' because 'zxtb'
-// has predicated forms while 'and' doesn't. Since the integrated assembler
-// can't handle 'mapped' instructions, we need to encode 'zxtb' the same as
-// 'and' with the immediate operand set to 255.
-
-let hasNewValue = 1, opNewValue = 0 in
-class T_ZXTB: ALU32Inst < (outs IntRegs:$Rd), (ins IntRegs:$Rs),
- "$Rd = zxtb($Rs)", [] > { // Rd = and(Rs,255)
- bits<5> Rd;
- bits<5> Rs;
- bits<10> s10 = 255;
-
- let IClass = 0b0111;
-
- let Inst{27-22} = 0b011000;
- let Inst{4-0} = Rd;
- let Inst{20-16} = Rs;
- let Inst{21} = s10{9};
- let Inst{13-5} = s10{8-0};
-}
-
-// Rd=zxtb(Rs): assembler mapped to "Rd=and(Rs,#255)".
-multiclass ZXTB_base <string mnemonic, bits<3> minOp> {
- let BaseOpcode = mnemonic in {
- let isPredicable = 1, hasSideEffects = 0 in
- def A2_#NAME : T_ZXTB;
-
- let isPredicated = 1, hasSideEffects = 0 in {
- defm A4_p#NAME#t : ALU32_2op_Pred<mnemonic, minOp, 0>;
- defm A4_p#NAME#f : ALU32_2op_Pred<mnemonic, minOp, 1>;
- }
- }
-}
-
-defm zxtb : ZXTB_base<"zxtb",0b100>, PredNewRel;
-
-//===----------------------------------------------------------------------===//
-// Template class for vector add and avg
-//===----------------------------------------------------------------------===//
-
-class T_VectALU_64 <string opc, bits<3> majOp, bits<3> minOp,
- bit isSat, bit isRnd, bit isCrnd, bit SwapOps >
- : ALU64_rr < (outs DoubleRegs:$Rdd),
- (ins DoubleRegs:$Rss, DoubleRegs:$Rtt),
- "$Rdd = "#opc#"($Rss, $Rtt)"#!if(isRnd, ":rnd", "")
- #!if(isCrnd,":crnd","")
- #!if(isSat, ":sat", ""),
- [], "", ALU64_tc_2_SLOT23 > {
- bits<5> Rdd;
- bits<5> Rss;
- bits<5> Rtt;
-
- let IClass = 0b1101;
-
- let Inst{27-24} = 0b0011;
- let Inst{23-21} = majOp;
- let Inst{20-16} = !if (SwapOps, Rtt, Rss);
- let Inst{12-8} = !if (SwapOps, Rss, Rtt);
- let Inst{7-5} = minOp;
- let Inst{4-0} = Rdd;
- }
-
-// ALU64 - Vector add
-// Rdd=vadd[u][bhw](Rss,Rtt)
-let Itinerary = ALU64_tc_1_SLOT23 in {
- def A2_vaddub : T_VectALU_64 < "vaddub", 0b000, 0b000, 0, 0, 0, 0>;
- def A2_vaddh : T_VectALU_64 < "vaddh", 0b000, 0b010, 0, 0, 0, 0>;
- def A2_vaddw : T_VectALU_64 < "vaddw", 0b000, 0b101, 0, 0, 0, 0>;
-}
-
-// Rdd=vadd[u][bhw](Rss,Rtt):sat
-let Defs = [USR_OVF] in {
- def A2_vaddubs : T_VectALU_64 < "vaddub", 0b000, 0b001, 1, 0, 0, 0>;
- def A2_vaddhs : T_VectALU_64 < "vaddh", 0b000, 0b011, 1, 0, 0, 0>;
- def A2_vadduhs : T_VectALU_64 < "vadduh", 0b000, 0b100, 1, 0, 0, 0>;
- def A2_vaddws : T_VectALU_64 < "vaddw", 0b000, 0b110, 1, 0, 0, 0>;
-}
-
-// ALU64 - Vector average
-// Rdd=vavg[u][bhw](Rss,Rtt)
-let Itinerary = ALU64_tc_1_SLOT23 in {
- def A2_vavgub : T_VectALU_64 < "vavgub", 0b010, 0b000, 0, 0, 0, 0>;
- def A2_vavgh : T_VectALU_64 < "vavgh", 0b010, 0b010, 0, 0, 0, 0>;
- def A2_vavguh : T_VectALU_64 < "vavguh", 0b010, 0b101, 0, 0, 0, 0>;
- def A2_vavgw : T_VectALU_64 < "vavgw", 0b011, 0b000, 0, 0, 0, 0>;
- def A2_vavguw : T_VectALU_64 < "vavguw", 0b011, 0b011, 0, 0, 0, 0>;
-}
-
-// Rdd=vavg[u][bhw](Rss,Rtt)[:rnd|:crnd]
-def A2_vavgubr : T_VectALU_64 < "vavgub", 0b010, 0b001, 0, 1, 0, 0>;
-def A2_vavghr : T_VectALU_64 < "vavgh", 0b010, 0b011, 0, 1, 0, 0>;
-def A2_vavghcr : T_VectALU_64 < "vavgh", 0b010, 0b100, 0, 0, 1, 0>;
-def A2_vavguhr : T_VectALU_64 < "vavguh", 0b010, 0b110, 0, 1, 0, 0>;
-
-def A2_vavgwr : T_VectALU_64 < "vavgw", 0b011, 0b001, 0, 1, 0, 0>;
-def A2_vavgwcr : T_VectALU_64 < "vavgw", 0b011, 0b010, 0, 0, 1, 0>;
-def A2_vavguwr : T_VectALU_64 < "vavguw", 0b011, 0b100, 0, 1, 0, 0>;
-
-// Rdd=vnavg[bh](Rss,Rtt)
-let Itinerary = ALU64_tc_1_SLOT23 in {
- def A2_vnavgh : T_VectALU_64 < "vnavgh", 0b100, 0b000, 0, 0, 0, 1>;
- def A2_vnavgw : T_VectALU_64 < "vnavgw", 0b100, 0b011, 0, 0, 0, 1>;
-}
-
-// Rdd=vnavg[bh](Rss,Rtt)[:rnd|:crnd]:sat
-let Defs = [USR_OVF] in {
- def A2_vnavghr : T_VectALU_64 < "vnavgh", 0b100, 0b001, 1, 1, 0, 1>;
- def A2_vnavghcr : T_VectALU_64 < "vnavgh", 0b100, 0b010, 1, 0, 1, 1>;
- def A2_vnavgwr : T_VectALU_64 < "vnavgw", 0b100, 0b100, 1, 1, 0, 1>;
- def A2_vnavgwcr : T_VectALU_64 < "vnavgw", 0b100, 0b110, 1, 0, 1, 1>;
-}
-
-// Rdd=vsub[u][bh](Rss,Rtt)
-let Itinerary = ALU64_tc_1_SLOT23 in {
- def A2_vsubub : T_VectALU_64 < "vsubub", 0b001, 0b000, 0, 0, 0, 1>;
- def A2_vsubh : T_VectALU_64 < "vsubh", 0b001, 0b010, 0, 0, 0, 1>;
- def A2_vsubw : T_VectALU_64 < "vsubw", 0b001, 0b101, 0, 0, 0, 1>;
-}
-
-// Rdd=vsub[u][bh](Rss,Rtt):sat
-let Defs = [USR_OVF] in {
- def A2_vsububs : T_VectALU_64 < "vsubub", 0b001, 0b001, 1, 0, 0, 1>;
- def A2_vsubhs : T_VectALU_64 < "vsubh", 0b001, 0b011, 1, 0, 0, 1>;
- def A2_vsubuhs : T_VectALU_64 < "vsubuh", 0b001, 0b100, 1, 0, 0, 1>;
- def A2_vsubws : T_VectALU_64 < "vsubw", 0b001, 0b110, 1, 0, 0, 1>;
-}
-
-// Rdd=vmax[u][bhw](Rss,Rtt)
-def A2_vmaxb : T_VectALU_64 < "vmaxb", 0b110, 0b110, 0, 0, 0, 1>;
-def A2_vmaxub : T_VectALU_64 < "vmaxub", 0b110, 0b000, 0, 0, 0, 1>;
-def A2_vmaxh : T_VectALU_64 < "vmaxh", 0b110, 0b001, 0, 0, 0, 1>;
-def A2_vmaxuh : T_VectALU_64 < "vmaxuh", 0b110, 0b010, 0, 0, 0, 1>;
-def A2_vmaxw : T_VectALU_64 < "vmaxw", 0b110, 0b011, 0, 0, 0, 1>;
-def A2_vmaxuw : T_VectALU_64 < "vmaxuw", 0b101, 0b101, 0, 0, 0, 1>;
-
-// Rdd=vmin[u][bhw](Rss,Rtt)
-def A2_vminb : T_VectALU_64 < "vminb", 0b110, 0b111, 0, 0, 0, 1>;
-def A2_vminub : T_VectALU_64 < "vminub", 0b101, 0b000, 0, 0, 0, 1>;
-def A2_vminh : T_VectALU_64 < "vminh", 0b101, 0b001, 0, 0, 0, 1>;
-def A2_vminuh : T_VectALU_64 < "vminuh", 0b101, 0b010, 0, 0, 0, 1>;
-def A2_vminw : T_VectALU_64 < "vminw", 0b101, 0b011, 0, 0, 0, 1>;
-def A2_vminuw : T_VectALU_64 < "vminuw", 0b101, 0b100, 0, 0, 0, 1>;
-
-//===----------------------------------------------------------------------===//
-// Template class for vector compare
-//===----------------------------------------------------------------------===//
-let hasSideEffects = 0 in
-class T_vcmp <string Str, bits<4> minOp>
- : ALU64_rr <(outs PredRegs:$Pd),
- (ins DoubleRegs:$Rss, DoubleRegs:$Rtt),
- "$Pd = "#Str#"($Rss, $Rtt)", [],
- "", ALU64_tc_2early_SLOT23> {
- bits<2> Pd;
- bits<5> Rss;
- bits<5> Rtt;
-
- let IClass = 0b1101;
-
- let Inst{27-23} = 0b00100;
- let Inst{13} = minOp{3};
- let Inst{7-5} = minOp{2-0};
- let Inst{1-0} = Pd;
- let Inst{20-16} = Rss;
- let Inst{12-8} = Rtt;
- }
-
-// Vector compare bytes
-def A2_vcmpbeq : T_vcmp <"vcmpb.eq", 0b0110>;
-def A2_vcmpbgtu : T_vcmp <"vcmpb.gtu", 0b0111>;
-
-// Vector compare halfwords
-def A2_vcmpheq : T_vcmp <"vcmph.eq", 0b0011>;
-def A2_vcmphgt : T_vcmp <"vcmph.gt", 0b0100>;
-def A2_vcmphgtu : T_vcmp <"vcmph.gtu", 0b0101>;
-
-// Vector compare words
-def A2_vcmpweq : T_vcmp <"vcmpw.eq", 0b0000>;
-def A2_vcmpwgt : T_vcmp <"vcmpw.gt", 0b0001>;
-def A2_vcmpwgtu : T_vcmp <"vcmpw.gtu", 0b0010>;
-
-//===----------------------------------------------------------------------===//
-// ALU32/PERM -
-//===----------------------------------------------------------------------===//
-
-
-//===----------------------------------------------------------------------===//
-// ALU32/PRED +
-//===----------------------------------------------------------------------===//
-// No bits needed. If cmp.ge is found the assembler parser will
-// transform it to cmp.gt subtracting 1 from the immediate.
-let isPseudo = 1 in {
-def C2_cmpgei: ALU32Inst <
- (outs PredRegs:$Pd), (ins IntRegs:$Rs, s8_0Ext:$s8),
- "$Pd = cmp.ge($Rs, #$s8)">;
-def C2_cmpgeui: ALU32Inst <
- (outs PredRegs:$Pd), (ins IntRegs:$Rs, u8_0Ext:$s8),
- "$Pd = cmp.geu($Rs, #$s8)">;
-}
-
-
-//===----------------------------------------------------------------------===//
-// ALU32/PRED -
-//===----------------------------------------------------------------------===//
-
-
-//===----------------------------------------------------------------------===//
-// ALU64/ALU +
-//===----------------------------------------------------------------------===//
-// Add.
-//===----------------------------------------------------------------------===//
-// Template Class
-// Add/Subtract halfword
-// Rd=add(Rt.L,Rs.[HL])[:sat]
-// Rd=sub(Rt.L,Rs.[HL])[:sat]
-// Rd=add(Rt.[LH],Rs.[HL])[:sat][:<16]
-// Rd=sub(Rt.[LH],Rs.[HL])[:sat][:<16]
-//===----------------------------------------------------------------------===//
-
-let hasNewValue = 1, opNewValue = 0 in
-class T_XTYPE_ADD_SUB <bits<2> LHbits, bit isSat, bit hasShift, bit isSub>
- : ALU64Inst <(outs IntRegs:$Rd), (ins IntRegs:$Rt, IntRegs:$Rs),
- "$Rd = "#!if(isSub,"sub","add")#"($Rt."
- #!if(hasShift, !if(LHbits{1},"h","l"),"l") #", $Rs."
- #!if(hasShift, !if(LHbits{0},"h)","l)"), !if(LHbits{1},"h)","l)"))
- #!if(isSat,":sat","")
- #!if(hasShift,":<<16",""), [], "", ALU64_tc_1_SLOT23> {
- bits<5> Rd;
- bits<5> Rt;
- bits<5> Rs;
- let IClass = 0b1101;
-
- let Inst{27-23} = 0b01010;
- let Inst{22} = hasShift;
- let Inst{21} = isSub;
- let Inst{7} = isSat;
- let Inst{6-5} = LHbits;
- let Inst{4-0} = Rd;
- let Inst{12-8} = Rt;
- let Inst{20-16} = Rs;
- }
-
-//Rd=sub(Rt.L,Rs.[LH])
-def A2_subh_l16_ll : T_XTYPE_ADD_SUB <0b00, 0, 0, 1>;
-def A2_subh_l16_hl : T_XTYPE_ADD_SUB <0b10, 0, 0, 1>;
-
-//Rd=add(Rt.L,Rs.[LH])
-def A2_addh_l16_ll : T_XTYPE_ADD_SUB <0b00, 0, 0, 0>;
-def A2_addh_l16_hl : T_XTYPE_ADD_SUB <0b10, 0, 0, 0>;
-
-let Itinerary = ALU64_tc_2_SLOT23, Defs = [USR_OVF] in {
- //Rd=sub(Rt.L,Rs.[LH]):sat
- def A2_subh_l16_sat_ll : T_XTYPE_ADD_SUB <0b00, 1, 0, 1>;
- def A2_subh_l16_sat_hl : T_XTYPE_ADD_SUB <0b10, 1, 0, 1>;
-
- //Rd=add(Rt.L,Rs.[LH]):sat
- def A2_addh_l16_sat_ll : T_XTYPE_ADD_SUB <0b00, 1, 0, 0>;
- def A2_addh_l16_sat_hl : T_XTYPE_ADD_SUB <0b10, 1, 0, 0>;
-}
-
-//Rd=sub(Rt.[LH],Rs.[LH]):<<16
-def A2_subh_h16_ll : T_XTYPE_ADD_SUB <0b00, 0, 1, 1>;
-def A2_subh_h16_lh : T_XTYPE_ADD_SUB <0b01, 0, 1, 1>;
-def A2_subh_h16_hl : T_XTYPE_ADD_SUB <0b10, 0, 1, 1>;
-def A2_subh_h16_hh : T_XTYPE_ADD_SUB <0b11, 0, 1, 1>;
-
-//Rd=add(Rt.[LH],Rs.[LH]):<<16
-def A2_addh_h16_ll : T_XTYPE_ADD_SUB <0b00, 0, 1, 0>;
-def A2_addh_h16_lh : T_XTYPE_ADD_SUB <0b01, 0, 1, 0>;
-def A2_addh_h16_hl : T_XTYPE_ADD_SUB <0b10, 0, 1, 0>;
-def A2_addh_h16_hh : T_XTYPE_ADD_SUB <0b11, 0, 1, 0>;
-
-let Itinerary = ALU64_tc_2_SLOT23, Defs = [USR_OVF] in {
- //Rd=sub(Rt.[LH],Rs.[LH]):sat:<<16
- def A2_subh_h16_sat_ll : T_XTYPE_ADD_SUB <0b00, 1, 1, 1>;
- def A2_subh_h16_sat_lh : T_XTYPE_ADD_SUB <0b01, 1, 1, 1>;
- def A2_subh_h16_sat_hl : T_XTYPE_ADD_SUB <0b10, 1, 1, 1>;
- def A2_subh_h16_sat_hh : T_XTYPE_ADD_SUB <0b11, 1, 1, 1>;
-
- //Rd=add(Rt.[LH],Rs.[LH]):sat:<<16
- def A2_addh_h16_sat_ll : T_XTYPE_ADD_SUB <0b00, 1, 1, 0>;
- def A2_addh_h16_sat_lh : T_XTYPE_ADD_SUB <0b01, 1, 1, 0>;
- def A2_addh_h16_sat_hl : T_XTYPE_ADD_SUB <0b10, 1, 1, 0>;
- def A2_addh_h16_sat_hh : T_XTYPE_ADD_SUB <0b11, 1, 1, 0>;
-}
-
-let hasSideEffects = 0, hasNewValue = 1 in
-def S2_parityp: ALU64Inst<(outs IntRegs:$Rd),
- (ins DoubleRegs:$Rs, DoubleRegs:$Rt),
- "$Rd = parity($Rs, $Rt)", [], "", ALU64_tc_2_SLOT23> {
- bits<5> Rd;
- bits<5> Rs;
- bits<5> Rt;
-
- let IClass = 0b1101;
- let Inst{27-24} = 0b0000;
- let Inst{20-16} = Rs;
- let Inst{12-8} = Rt;
- let Inst{4-0} = Rd;
-}
-
-let hasNewValue = 1, opNewValue = 0, hasSideEffects = 0 in
-class T_XTYPE_MIN_MAX < bit isMax, bit isUnsigned >
- : ALU64Inst < (outs IntRegs:$Rd), (ins IntRegs:$Rt, IntRegs:$Rs),
- "$Rd = "#!if(isMax,"max","min")#!if(isUnsigned,"u","")
- #"($Rt, $Rs)", [], "", ALU64_tc_2_SLOT23> {
- bits<5> Rd;
- bits<5> Rt;
- bits<5> Rs;
-
- let IClass = 0b1101;
-
- let Inst{27-23} = 0b01011;
- let Inst{22-21} = !if(isMax, 0b10, 0b01);
- let Inst{7} = isUnsigned;
- let Inst{4-0} = Rd;
- let Inst{12-8} = !if(isMax, Rs, Rt);
- let Inst{20-16} = !if(isMax, Rt, Rs);
- }
-
-def A2_min : T_XTYPE_MIN_MAX < 0, 0 >;
-def A2_minu : T_XTYPE_MIN_MAX < 0, 1 >;
-def A2_max : T_XTYPE_MIN_MAX < 1, 0 >;
-def A2_maxu : T_XTYPE_MIN_MAX < 1, 1 >;
-
-class T_cmp64_rr<string mnemonic, bits<3> MinOp, bit IsComm>
- : ALU64_rr<(outs PredRegs:$Pd), (ins DoubleRegs:$Rs, DoubleRegs:$Rt),
- "$Pd = "#mnemonic#"($Rs, $Rt)", [], "", ALU64_tc_2early_SLOT23> {
- let isCompare = 1;
- let isCommutable = IsComm;
- let hasSideEffects = 0;
-
- bits<2> Pd;
- bits<5> Rs;
- bits<5> Rt;
-
- let IClass = 0b1101;
- let Inst{27-21} = 0b0010100;
- let Inst{20-16} = Rs;
- let Inst{12-8} = Rt;
- let Inst{7-5} = MinOp;
- let Inst{1-0} = Pd;
-}
-
-def C2_cmpeqp : T_cmp64_rr<"cmp.eq", 0b000, 1>;
-def C2_cmpgtp : T_cmp64_rr<"cmp.gt", 0b010, 0>;
-def C2_cmpgtup : T_cmp64_rr<"cmp.gtu", 0b100, 0>;
-
-def C2_vmux : ALU64_rr<(outs DoubleRegs:$Rd),
- (ins PredRegs:$Pu, DoubleRegs:$Rs, DoubleRegs:$Rt),
- "$Rd = vmux($Pu, $Rs, $Rt)", [], "", ALU64_tc_1_SLOT23> {
- let hasSideEffects = 0;
-
- bits<5> Rd;
- bits<2> Pu;
- bits<5> Rs;
- bits<5> Rt;
-
- let IClass = 0b1101;
- let Inst{27-24} = 0b0001;
- let Inst{20-16} = Rs;
- let Inst{12-8} = Rt;
- let Inst{6-5} = Pu;
- let Inst{4-0} = Rd;
-}
-
-class T_ALU64_rr<string mnemonic, string suffix, bits<4> RegType,
- bits<3> MajOp, bits<3> MinOp, bit OpsRev, bit IsComm,
- string Op2Pfx>
- : ALU64_rr<(outs DoubleRegs:$Rd), (ins DoubleRegs:$Rs, DoubleRegs:$Rt),
- "$Rd = " #mnemonic# "($Rs, " #Op2Pfx# "$Rt)" #suffix, [],
- "", ALU64_tc_1_SLOT23> {
- let hasSideEffects = 0;
- let isCommutable = IsComm;
-
- bits<5> Rs;
- bits<5> Rt;
- bits<5> Rd;
-
- let IClass = 0b1101;
- let Inst{27-24} = RegType;
- let Inst{23-21} = MajOp;
- let Inst{20-16} = !if (OpsRev,Rt,Rs);
- let Inst{12-8} = !if (OpsRev,Rs,Rt);
- let Inst{7-5} = MinOp;
- let Inst{4-0} = Rd;
-}
-
-class T_ALU64_arith<string mnemonic, bits<3> MajOp, bits<3> MinOp, bit IsSat,
- bit OpsRev, bit IsComm>
- : T_ALU64_rr<mnemonic, !if(IsSat,":sat",""), 0b0011, MajOp, MinOp, OpsRev,
- IsComm, "">;
-
-let isAdd = 1 in
-def A2_addp : T_ALU64_arith<"add", 0b000, 0b111, 0, 0, 1>;
-def A2_subp : T_ALU64_arith<"sub", 0b001, 0b111, 0, 1, 0>;
-
-class T_ALU64_logical<string mnemonic, bits<3> MinOp, bit OpsRev, bit IsComm,
- bit IsNeg>
- : T_ALU64_rr<mnemonic, "", 0b0011, 0b111, MinOp, OpsRev, IsComm,
- !if(IsNeg,"~","")>;
-
-def A2_andp : T_ALU64_logical<"and", 0b000, 0, 1, 0>;
-def A2_orp : T_ALU64_logical<"or", 0b010, 0, 1, 0>;
-def A2_xorp : T_ALU64_logical<"xor", 0b100, 0, 1, 0>;
-
-//===----------------------------------------------------------------------===//
-// ALU64/ALU -
-//===----------------------------------------------------------------------===//
-
-//===----------------------------------------------------------------------===//
-// ALU64/BIT +
-//===----------------------------------------------------------------------===//
-//
-//===----------------------------------------------------------------------===//
-// ALU64/BIT -
-//===----------------------------------------------------------------------===//
-
-//===----------------------------------------------------------------------===//
-// ALU64/PERM +
-//===----------------------------------------------------------------------===//
-//
-//===----------------------------------------------------------------------===//
-// ALU64/PERM -
-//===----------------------------------------------------------------------===//
-
-//===----------------------------------------------------------------------===//
-// CR +
-//===----------------------------------------------------------------------===//
-// Logical reductions on predicates.
-
-// Looping instructions.
-
-// Pipelined looping instructions.
-
-// Logical operations on predicates.
-let hasSideEffects = 0 in
-class T_LOGICAL_1OP<string MnOp, bits<2> OpBits>
- : CRInst<(outs PredRegs:$Pd), (ins PredRegs:$Ps),
- "$Pd = " # MnOp # "($Ps)", [], "", CR_tc_2early_SLOT23> {
- bits<2> Pd;
- bits<2> Ps;
-
- let IClass = 0b0110;
- let Inst{27-23} = 0b10111;
- let Inst{22-21} = OpBits;
- let Inst{20} = 0b0;
- let Inst{17-16} = Ps;
- let Inst{13} = 0b0;
- let Inst{1-0} = Pd;
-}
-
-def C2_any8 : T_LOGICAL_1OP<"any8", 0b00>;
-def C2_all8 : T_LOGICAL_1OP<"all8", 0b01>;
-def C2_not : T_LOGICAL_1OP<"not", 0b10>;
-
-let hasSideEffects = 0 in
-class T_LOGICAL_2OP<string MnOp, bits<3> OpBits, bit IsNeg, bit Rev>
- : CRInst<(outs PredRegs:$Pd), (ins PredRegs:$Ps, PredRegs:$Pt),
- "$Pd = " # MnOp # "($Ps, " # !if (IsNeg,"!","") # "$Pt)",
- [], "", CR_tc_2early_SLOT23> {
- bits<2> Pd;
- bits<2> Ps;
- bits<2> Pt;
-
- let IClass = 0b0110;
- let Inst{27-24} = 0b1011;
- let Inst{23-21} = OpBits;
- let Inst{20} = 0b0;
- let Inst{17-16} = !if(Rev,Pt,Ps); // Rs and Rt are reversed for some
- let Inst{13} = 0b0; // instructions.
- let Inst{9-8} = !if(Rev,Ps,Pt);
- let Inst{1-0} = Pd;
-}
-
-def C2_and : T_LOGICAL_2OP<"and", 0b000, 0, 1>;
-def C2_or : T_LOGICAL_2OP<"or", 0b001, 0, 1>;
-def C2_xor : T_LOGICAL_2OP<"xor", 0b010, 0, 0>;
-def C2_andn : T_LOGICAL_2OP<"and", 0b011, 1, 1>;
-def C2_orn : T_LOGICAL_2OP<"or", 0b111, 1, 1>;
-
-let hasSideEffects = 0, hasNewValue = 1 in
-def C2_vitpack : SInst<(outs IntRegs:$Rd), (ins PredRegs:$Ps, PredRegs:$Pt),
- "$Rd = vitpack($Ps, $Pt)", [], "", S_2op_tc_1_SLOT23> {
- bits<5> Rd;
- bits<2> Ps;
- bits<2> Pt;
-
- let IClass = 0b1000;
- let Inst{27-24} = 0b1001;
- let Inst{22-21} = 0b00;
- let Inst{17-16} = Ps;
- let Inst{9-8} = Pt;
- let Inst{4-0} = Rd;
-}
-
-let hasSideEffects = 0 in
-def C2_mask : SInst<(outs DoubleRegs:$Rd), (ins PredRegs:$Pt),
- "$Rd = mask($Pt)", [], "", S_2op_tc_1_SLOT23> {
- bits<5> Rd;
- bits<2> Pt;
-
- let IClass = 0b1000;
- let Inst{27-24} = 0b0110;
- let Inst{9-8} = Pt;
- let Inst{4-0} = Rd;
-}
-
-// User control register transfer.
-//===----------------------------------------------------------------------===//
-// CR -
-//===----------------------------------------------------------------------===//
-
-//===----------------------------------------------------------------------===//
-// JR +
-//===----------------------------------------------------------------------===//
-
-class CondStr<string CReg, bit True, bit New> {
- string S = "if (" # !if(True,"","!") # CReg # !if(New,".new","") # ") ";
-}
-class JumpOpcStr<string Mnemonic, bit New, bit Taken> {
- string S = Mnemonic # !if(Taken, ":t", ":nt");
-}
-
-let isBranch = 1, isBarrier = 1, Defs = [PC], hasSideEffects = 0,
- isPredicable = 1,
- isExtendable = 1, opExtendable = 0, isExtentSigned = 1,
- opExtentBits = 24, opExtentAlign = 2, InputType = "imm" in
-class T_JMP<string ExtStr>
- : JInst_CJUMP_UCJUMP<(outs), (ins brtarget:$dst),
- "jump " # ExtStr # "$dst",
- [], "", J_tc_2early_CJUMP_UCJUMP_ARCHDEPSLOT> {
- bits<24> dst;
- let IClass = 0b0101;
-
- let Inst{27-25} = 0b100;
- let Inst{24-16} = dst{23-15};
- let Inst{13-1} = dst{14-2};
-}
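-
-// Note that only dst{23-2} is encoded: dst{1-0} are implied zeros, since
-// jump targets are 4-byte aligned (opExtentAlign = 2 above).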
-
-let isBranch = 1, Defs = [PC], hasSideEffects = 0, isPredicated = 1,
- isExtendable = 1, opExtendable = 1, isExtentSigned = 1,
- opExtentBits = 17, opExtentAlign = 2, InputType = "imm" in
-class T_JMP_c<bit PredNot, bit isPredNew, bit isTak, string ExtStr>
- : JInst_CJUMP_UCJUMP<(outs), (ins PredRegs:$src, brtarget:$dst),
- CondStr<"$src", !if(PredNot,0,1), isPredNew>.S #
- JumpOpcStr<"jump", isPredNew, isTak>.S # " " #
- ExtStr # "$dst",
- [], "", J_tc_2early_CJUMP_UCJUMP_ARCHDEPSLOT>, ImmRegRel {
- let isTaken = isTak;
- let isPredicatedFalse = PredNot;
- let isPredicatedNew = isPredNew;
- bits<2> src;
- bits<17> dst;
-
- let IClass = 0b0101;
-
- let Inst{27-24} = 0b1100;
- let Inst{21} = PredNot;
- let Inst{12} = isTak;
- let Inst{11} = isPredNew;
- let Inst{9-8} = src;
- let Inst{23-22} = dst{16-15};
- let Inst{20-16} = dst{14-10};
- let Inst{13} = dst{9};
- let Inst{7-1} = dst{8-2};
- }
-
-multiclass JMP_Pred<bit PredNot, string ExtStr> {
- def NAME : T_JMP_c<PredNot, 0, 0, ExtStr>; // not taken
- // Predicate new
- def NAME#newpt : T_JMP_c<PredNot, 1, 1, ExtStr>; // taken
- def NAME#new : T_JMP_c<PredNot, 1, 0, ExtStr>; // not taken
-}
-
-multiclass JMP_base<string BaseOp, string ExtStr> {
- let BaseOpcode = BaseOp in {
- def NAME : T_JMP<ExtStr>;
- defm t : JMP_Pred<0, ExtStr>;
- defm f : JMP_Pred<1, ExtStr>;
- }
-}
-
-// Jumps to address stored in a register, JUMPR_MISC
-// if ([[!]P[.new]]) jumpr[:t/nt] Rs
-let isBranch = 1, isIndirectBranch = 1, isBarrier = 1, Defs = [PC],
- isPredicable = 1, hasSideEffects = 0, InputType = "reg" in
-class T_JMPr
- : JRInst<(outs), (ins IntRegs:$dst),
- "jumpr $dst", [], "", J_tc_2early_SLOT2> {
- bits<5> dst;
-
- let IClass = 0b0101;
- let Inst{27-21} = 0b0010100;
- let Inst{20-16} = dst;
-}
-
-let isBranch = 1, isIndirectBranch = 1, Defs = [PC], isPredicated = 1,
- hasSideEffects = 0, InputType = "reg" in
-class T_JMPr_c <bit PredNot, bit isPredNew, bit isTak>
- : JRInst <(outs), (ins PredRegs:$src, IntRegs:$dst),
- CondStr<"$src", !if(PredNot,0,1), isPredNew>.S #
- JumpOpcStr<"jumpr", isPredNew, isTak>.S # " $dst", [],
- "", J_tc_2early_SLOT2> {
-
- let isTaken = isTak;
- let isPredicatedFalse = PredNot;
- let isPredicatedNew = isPredNew;
- bits<2> src;
- bits<5> dst;
-
- let IClass = 0b0101;
-
- let Inst{27-22} = 0b001101;
- let Inst{21} = PredNot;
- let Inst{20-16} = dst;
- let Inst{12} = isTak;
- let Inst{11} = isPredNew;
- let Inst{9-8} = src;
-}
-
-multiclass JMPR_Pred<bit PredNot> {
- def NAME : T_JMPr_c<PredNot, 0, 0>; // not taken
- // Predicate new
- def NAME#newpt : T_JMPr_c<PredNot, 1, 1>; // taken
- def NAME#new : T_JMPr_c<PredNot, 1, 0>; // not taken
-}
-
-multiclass JMPR_base<string BaseOp> {
- let BaseOpcode = BaseOp in {
- def NAME : T_JMPr;
- defm t : JMPR_Pred<0>;
- defm f : JMPR_Pred<1>;
- }
-}
-
-let isCall = 1, hasSideEffects = 1 in
-class JUMPR_MISC_CALLR<bit isPred, bit isPredNot,
- dag InputDag = (ins IntRegs:$Rs)>
- : JRInst<(outs), InputDag,
- !if(isPred, !if(isPredNot, "if (!$Pu) callr $Rs",
- "if ($Pu) callr $Rs"),
- "callr $Rs"),
- [], "", J_tc_2early_SLOT2> {
- bits<5> Rs;
- bits<2> Pu;
- let isPredicated = isPred;
- let isPredicatedFalse = isPredNot;
-
- let IClass = 0b0101;
- let Inst{27-25} = 0b000;
- let Inst{24-23} = !if (isPred, 0b10, 0b01);
- let Inst{22} = 0;
- let Inst{21} = isPredNot;
- let Inst{9-8} = !if (isPred, Pu, 0b00);
- let Inst{20-16} = Rs;
-
- }
-
-let Defs = VolatileV3.Regs in {
- def J2_callrt : JUMPR_MISC_CALLR<1, 0, (ins PredRegs:$Pu, IntRegs:$Rs)>;
- def J2_callrf : JUMPR_MISC_CALLR<1, 1, (ins PredRegs:$Pu, IntRegs:$Rs)>;
-}
-
-let isTerminator = 1, hasSideEffects = 0 in {
- defm J2_jump : JMP_base<"JMP", "">, PredNewRel;
-
- defm J2_jumpr : JMPR_base<"JMPr">, PredNewRel;
-
- let isReturn = 1, isPseudo = 1, isCodeGenOnly = 1 in
- defm PS_jmpret : JMPR_base<"JMPret">, PredNewRel;
-}
-
-let validSubTargets = HasV60SubT in
-multiclass JMPpt_base<string BaseOp> {
- let BaseOpcode = BaseOp in {
- def tpt : T_JMP_c <0, 0, 1, "">; // Predicate true - taken
- def fpt : T_JMP_c <1, 0, 1, "">; // Predicate false - taken
- }
-}
-
-let validSubTargets = HasV60SubT in
-multiclass JMPRpt_base<string BaseOp> {
- let BaseOpcode = BaseOp in {
- def tpt : T_JMPr_c<0, 0, 1>; // predicate true - taken
- def fpt : T_JMPr_c<1, 0, 1>; // predicate false - taken
- }
-}
-
-defm J2_jumpr : JMPRpt_base<"JMPr">;
-defm J2_jump : JMPpt_base<"JMP">;
-
-// A return through builtin_eh_return.
-let isReturn = 1, isTerminator = 1, isBarrier = 1, hasSideEffects = 0,
- isCodeGenOnly = 1, Defs = [PC], Uses = [R28], isPredicable = 0 in
-def EH_RETURN_JMPR : T_JMPr;
-
-//===----------------------------------------------------------------------===//
-// JR -
-//===----------------------------------------------------------------------===//
-
-//===----------------------------------------------------------------------===//
-// LD +
-//===----------------------------------------------------------------------===//
-
-// Load - Base with Immediate offset addressing mode
-let isExtendable = 1, opExtendable = 2, isExtentSigned = 1, AddedComplexity = 20 in
-class T_load_io <string mnemonic, RegisterClass RC, bits<4> MajOp,
- Operand ImmOp>
- : LDInst<(outs RC:$dst), (ins IntRegs:$src1, ImmOp:$offset),
- "$dst = "#mnemonic#"($src1 + #$offset)", []>, AddrModeRel {
- bits<4> name;
- bits<5> dst;
- bits<5> src1;
- bits<14> offset;
- bits<11> offsetBits;
-
- string ImmOpStr = !cast<string>(ImmOp);
- let offsetBits = !if (!eq(ImmOpStr, "s11_3Ext"), offset{13-3},
- !if (!eq(ImmOpStr, "s11_2Ext"), offset{12-2},
- !if (!eq(ImmOpStr, "s11_1Ext"), offset{11-1},
- /* s11_0Ext */ offset{10-0})));
- let opExtentBits = !if (!eq(ImmOpStr, "s11_3Ext"), 14,
- !if (!eq(ImmOpStr, "s11_2Ext"), 13,
- !if (!eq(ImmOpStr, "s11_1Ext"), 12,
- /* s11_0Ext */ 11)));
- let hasNewValue = !if (!eq(!cast<string>(RC), "DoubleRegs"), 0, 1);
-
- let IClass = 0b1001;
-
- let Inst{27} = 0b0;
- let Inst{26-25} = offsetBits{10-9};
- let Inst{24-21} = MajOp;
- let Inst{20-16} = src1;
- let Inst{13-5} = offsetBits{8-0};
- let Inst{4-0} = dst;
- }
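-
-// For example, with ImmOp = s11_2Ext (memw), a byte offset of #40 is encoded
-// as offsetBits = offset{12-2} = 10; the two low bits are dropped because
-// word accesses are 4-byte aligned.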
-
-let opExtendable = 3, isExtentSigned = 0, isPredicated = 1 in
-class T_pload_io <string mnemonic, RegisterClass RC, bits<4>MajOp,
- Operand ImmOp, bit isNot, bit isPredNew>
- : LDInst<(outs RC:$dst),
- (ins PredRegs:$src1, IntRegs:$src2, ImmOp:$offset),
- "if ("#!if(isNot, "!$src1", "$src1")
- #!if(isPredNew, ".new", "")
- #") $dst = "#mnemonic#"($src2 + #$offset)",
- [],"", V2LDST_tc_ld_SLOT01> , AddrModeRel {
- bits<5> dst;
- bits<2> src1;
- bits<5> src2;
- bits<9> offset;
- bits<6> offsetBits;
- string ImmOpStr = !cast<string>(ImmOp);
-
- let offsetBits = !if (!eq(ImmOpStr, "u6_3Ext"), offset{8-3},
- !if (!eq(ImmOpStr, "u6_2Ext"), offset{7-2},
- !if (!eq(ImmOpStr, "u6_1Ext"), offset{6-1},
- /* u6_0Ext */ offset{5-0})));
- let opExtentBits = !if (!eq(ImmOpStr, "u6_3Ext"), 9,
- !if (!eq(ImmOpStr, "u6_2Ext"), 8,
- !if (!eq(ImmOpStr, "u6_1Ext"), 7,
- /* u6_0Ext */ 6)));
- let hasNewValue = !if (!eq(ImmOpStr, "u6_3Ext"), 0, 1);
- let isPredicatedNew = isPredNew;
- let isPredicatedFalse = isNot;
-
- let IClass = 0b0100;
-
- let Inst{27} = 0b0;
- let Inst{26} = isNot;
- let Inst{25} = isPredNew;
- let Inst{24-21} = MajOp;
- let Inst{20-16} = src2;
- let Inst{13} = 0b0;
- let Inst{12-11} = src1;
- let Inst{10-5} = offsetBits;
- let Inst{4-0} = dst;
- }
-
-let isExtendable = 1, hasSideEffects = 0, addrMode = BaseImmOffset in
-multiclass LD_Idxd<string mnemonic, string CextOp, RegisterClass RC,
- Operand ImmOp, Operand predImmOp, bits<4>MajOp> {
- let CextOpcode = CextOp, BaseOpcode = CextOp#_indexed in {
- let isPredicable = 1 in
- def L2_#NAME#_io : T_load_io <mnemonic, RC, MajOp, ImmOp>;
-
- // Predicated
- def L2_p#NAME#t_io : T_pload_io <mnemonic, RC, MajOp, predImmOp, 0, 0>;
- def L2_p#NAME#f_io : T_pload_io <mnemonic, RC, MajOp, predImmOp, 1, 0>;
-
- // Predicated new
- def L2_p#NAME#tnew_io : T_pload_io <mnemonic, RC, MajOp, predImmOp, 0, 1>;
- def L2_p#NAME#fnew_io : T_pload_io <mnemonic, RC, MajOp, predImmOp, 1, 1>;
- }
-}
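-
-// For example, "defm loadrb : LD_Idxd<...>" below expands to L2_loadrb_io
-// plus the predicated forms L2_ploadrbt_io, L2_ploadrbf_io,
-// L2_ploadrbtnew_io and L2_ploadrbfnew_io.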
-
-let accessSize = ByteAccess in {
- defm loadrb: LD_Idxd <"memb", "LDrib", IntRegs, s11_0Ext, u6_0Ext, 0b1000>;
- defm loadrub: LD_Idxd <"memub", "LDriub", IntRegs, s11_0Ext, u6_0Ext, 0b1001>;
-}
-
-let accessSize = HalfWordAccess, opExtentAlign = 1 in {
- defm loadrh: LD_Idxd <"memh", "LDrih", IntRegs, s11_1Ext, u6_1Ext, 0b1010>;
- defm loadruh: LD_Idxd <"memuh", "LDriuh", IntRegs, s11_1Ext, u6_1Ext, 0b1011>;
-}
-
-let accessSize = WordAccess, opExtentAlign = 2 in
-defm loadri: LD_Idxd <"memw", "LDriw", IntRegs, s11_2Ext, u6_2Ext, 0b1100>;
-
-let accessSize = DoubleWordAccess, opExtentAlign = 3 in
-defm loadrd: LD_Idxd <"memd", "LDrid", DoubleRegs, s11_3Ext, u6_3Ext, 0b1110>;
-
-let accessSize = HalfWordAccess, opExtentAlign = 1 in {
- def L2_loadbsw2_io: T_load_io<"membh", IntRegs, 0b0001, s11_1Ext>;
- def L2_loadbzw2_io: T_load_io<"memubh", IntRegs, 0b0011, s11_1Ext>;
-}
-
-let accessSize = WordAccess, opExtentAlign = 2 in {
- def L2_loadbzw4_io: T_load_io<"memubh", DoubleRegs, 0b0101, s11_2Ext>;
- def L2_loadbsw4_io: T_load_io<"membh", DoubleRegs, 0b0111, s11_2Ext>;
-}
-
-let addrMode = BaseImmOffset, isExtendable = 1, hasSideEffects = 0,
- opExtendable = 3, isExtentSigned = 1 in
-class T_loadalign_io <string str, bits<4> MajOp, Operand ImmOp>
- : LDInst<(outs DoubleRegs:$dst),
- (ins DoubleRegs:$src1, IntRegs:$src2, ImmOp:$offset),
- "$dst = "#str#"($src2 + #$offset)", [],
- "$src1 = $dst">, AddrModeRel {
- bits<4> name;
- bits<5> dst;
- bits<5> src2;
- bits<12> offset;
- bits<11> offsetBits;
-
- let offsetBits = !if (!eq(!cast<string>(ImmOp), "s11_1Ext"), offset{11-1},
- /* s11_0Ext */ offset{10-0});
- let IClass = 0b1001;
-
- let Inst{27} = 0b0;
- let Inst{26-25} = offsetBits{10-9};
- let Inst{24-21} = MajOp;
- let Inst{20-16} = src2;
- let Inst{13-5} = offsetBits{8-0};
- let Inst{4-0} = dst;
- }
-
-let accessSize = HalfWordAccess, opExtentBits = 12, opExtentAlign = 1 in
-def L2_loadalignh_io: T_loadalign_io <"memh_fifo", 0b0010, s11_1Ext>;
-
-let accessSize = ByteAccess, opExtentBits = 11 in
-def L2_loadalignb_io: T_loadalign_io <"memb_fifo", 0b0100, s11_0Ext>;
-
-//===----------------------------------------------------------------------===//
-// Post increment load
-//===----------------------------------------------------------------------===//
-//===----------------------------------------------------------------------===//
-// Template class for non-predicated post increment loads with immediate offset.
-//===----------------------------------------------------------------------===//
-let hasSideEffects = 0, addrMode = PostInc in
-class T_load_pi <string mnemonic, RegisterClass RC, Operand ImmOp,
- bits<4> MajOp >
- : LDInstPI <(outs RC:$dst, IntRegs:$dst2),
- (ins IntRegs:$src1, ImmOp:$offset),
- "$dst = "#mnemonic#"($src1++#$offset)" ,
- [],
- "$src1 = $dst2" > ,
- PredNewRel {
- bits<5> dst;
- bits<5> src1;
- bits<7> offset;
- bits<4> offsetBits;
-
- string ImmOpStr = !cast<string>(ImmOp);
- let offsetBits = !if (!eq(ImmOpStr, "s4_3Imm"), offset{6-3},
- !if (!eq(ImmOpStr, "s4_2Imm"), offset{5-2},
- !if (!eq(ImmOpStr, "s4_1Imm"), offset{4-1},
- /* s4_0Imm */ offset{3-0})));
- let hasNewValue = !if (!eq(ImmOpStr, "s4_3Imm"), 0, 1);
-
- let IClass = 0b1001;
-
- let Inst{27-25} = 0b101;
- let Inst{24-21} = MajOp;
- let Inst{20-16} = src1;
- let Inst{13-12} = 0b00;
- let Inst{8-5} = offsetBits;
- let Inst{4-0} = dst;
- }
-
-//===----------------------------------------------------------------------===//
-// Template class for predicated post increment loads with immediate offset.
-//===----------------------------------------------------------------------===//
-let isPredicated = 1, hasSideEffects = 0, addrMode = PostInc in
-class T_pload_pi <string mnemonic, RegisterClass RC, Operand ImmOp,
- bits<4> MajOp, bit isPredNot, bit isPredNew >
- : LDInst <(outs RC:$dst, IntRegs:$dst2),
- (ins PredRegs:$src1, IntRegs:$src2, ImmOp:$offset),
- !if(isPredNot, "if (!$src1", "if ($src1")#!if(isPredNew, ".new) ",
- ") ")#"$dst = "#mnemonic#"($src2++#$offset)",
- [] ,
- "$src2 = $dst2" > ,
- PredNewRel {
- bits<5> dst;
- bits<2> src1;
- bits<5> src2;
- bits<7> offset;
- bits<4> offsetBits;
-
- let isPredicatedNew = isPredNew;
- let isPredicatedFalse = isPredNot;
-
- string ImmOpStr = !cast<string>(ImmOp);
- let offsetBits = !if (!eq(ImmOpStr, "s4_3Imm"), offset{6-3},
- !if (!eq(ImmOpStr, "s4_2Imm"), offset{5-2},
- !if (!eq(ImmOpStr, "s4_1Imm"), offset{4-1},
- /* s4_0Imm */ offset{3-0})));
- let hasNewValue = !if (!eq(ImmOpStr, "s4_3Imm"), 0, 1);
-
- let IClass = 0b1001;
-
- let Inst{27-25} = 0b101;
- let Inst{24-21} = MajOp;
- let Inst{20-16} = src2;
- let Inst{13} = 0b1;
- let Inst{12} = isPredNew;
- let Inst{11} = isPredNot;
- let Inst{10-9} = src1;
- let Inst{8-5} = offsetBits;
- let Inst{4-0} = dst;
- }
-
-//===----------------------------------------------------------------------===//
-// Multiclass for post increment loads with immediate offset.
-//===----------------------------------------------------------------------===//
-
-multiclass LD_PostInc <string mnemonic, string BaseOp, RegisterClass RC,
- Operand ImmOp, bits<4> MajOp> {
- let BaseOpcode = "POST_"#BaseOp in {
- let isPredicable = 1 in
- def L2_#NAME#_pi : T_load_pi < mnemonic, RC, ImmOp, MajOp>;
-
- // Predicated
- def L2_p#NAME#t_pi : T_pload_pi < mnemonic, RC, ImmOp, MajOp, 0, 0>;
- def L2_p#NAME#f_pi : T_pload_pi < mnemonic, RC, ImmOp, MajOp, 1, 0>;
-
- // Predicated new
- def L2_p#NAME#tnew_pi : T_pload_pi < mnemonic, RC, ImmOp, MajOp, 0, 1>;
- def L2_p#NAME#fnew_pi : T_pload_pi < mnemonic, RC, ImmOp, MajOp, 1, 1>;
- }
-}
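-
-// For example, "defm loadrb : LD_PostInc<...>" below yields L2_loadrb_pi,
-// "$dst = memb($src1++#$offset)", where the incremented base is written
-// back via the "$src1 = $dst2" constraint.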
-
-// post increment byte loads with immediate offset
-let accessSize = ByteAccess in {
- defm loadrb : LD_PostInc <"memb", "LDrib", IntRegs, s4_0Imm, 0b1000>;
- defm loadrub : LD_PostInc <"memub", "LDriub", IntRegs, s4_0Imm, 0b1001>;
-}
-
-// post increment halfword loads with immediate offset
-let accessSize = HalfWordAccess, opExtentAlign = 1 in {
- defm loadrh : LD_PostInc <"memh", "LDrih", IntRegs, s4_1Imm, 0b1010>;
- defm loadruh : LD_PostInc <"memuh", "LDriuh", IntRegs, s4_1Imm, 0b1011>;
-}
-
-// post increment word loads with immediate offset
-let accessSize = WordAccess, opExtentAlign = 2 in
-defm loadri : LD_PostInc <"memw", "LDriw", IntRegs, s4_2Imm, 0b1100>;
-
-// post increment doubleword loads with immediate offset
-let accessSize = DoubleWordAccess, opExtentAlign = 3 in
-defm loadrd : LD_PostInc <"memd", "LDrid", DoubleRegs, s4_3Imm, 0b1110>;
-
-// Rd=memb[u]h(Rx++#s4:1)
-// Rdd=memb[u]h(Rx++#s4:2)
-let accessSize = HalfWordAccess, opExtentAlign = 1 in {
- def L2_loadbsw2_pi : T_load_pi <"membh", IntRegs, s4_1Imm, 0b0001>;
- def L2_loadbzw2_pi : T_load_pi <"memubh", IntRegs, s4_1Imm, 0b0011>;
-}
-let accessSize = WordAccess, opExtentAlign = 2, hasNewValue = 0 in {
- def L2_loadbsw4_pi : T_load_pi <"membh", DoubleRegs, s4_2Imm, 0b0111>;
- def L2_loadbzw4_pi : T_load_pi <"memubh", DoubleRegs, s4_2Imm, 0b0101>;
-}
-
-//===----------------------------------------------------------------------===//
-// Template class for post increment fifo loads with immediate offset.
-//===----------------------------------------------------------------------===//
-let hasSideEffects = 0, addrMode = PostInc in
-class T_loadalign_pi <string mnemonic, Operand ImmOp, bits<4> MajOp >
- : LDInstPI <(outs DoubleRegs:$dst, IntRegs:$dst2),
- (ins DoubleRegs:$src1, IntRegs:$src2, ImmOp:$offset),
- "$dst = "#mnemonic#"($src2++#$offset)" ,
- [], "$src2 = $dst2, $src1 = $dst" > ,
- PredNewRel {
- bits<5> dst;
- bits<5> src2;
- bits<5> offset;
- bits<4> offsetBits;
-
- let offsetBits = !if (!eq(!cast<string>(ImmOp), "s4_1Imm"), offset{4-1},
- /* s4_0Imm */ offset{3-0});
- let IClass = 0b1001;
-
- let Inst{27-25} = 0b101;
- let Inst{24-21} = MajOp;
- let Inst{20-16} = src2;
- let Inst{13-12} = 0b00;
- let Inst{8-5} = offsetBits;
- let Inst{4-0} = dst;
- }
-
-// Ryy=memh_fifo(Rx++#s4:1)
-// Ryy=memb_fifo(Rx++#s4:0)
-let accessSize = ByteAccess in
-def L2_loadalignb_pi : T_loadalign_pi <"memb_fifo", s4_0Imm, 0b0100>;
-
-let accessSize = HalfWordAccess, opExtentAlign = 1 in
-def L2_loadalignh_pi : T_loadalign_pi <"memh_fifo", s4_1Imm, 0b0010>;
-
-//===----------------------------------------------------------------------===//
-// Template class for post increment loads with register offset.
-//===----------------------------------------------------------------------===//
-let hasSideEffects = 0, addrMode = PostInc in
-class T_load_pr <string mnemonic, RegisterClass RC, bits<4> MajOp,
- MemAccessSize AccessSz>
- : LDInstPI <(outs RC:$dst, IntRegs:$_dst_),
- (ins IntRegs:$src1, ModRegs:$src2),
- "$dst = "#mnemonic#"($src1++$src2)" ,
- [], "$src1 = $_dst_" > {
- bits<5> dst;
- bits<5> src1;
- bits<1> src2;
-
- let accessSize = AccessSz;
- let IClass = 0b1001;
-
- let Inst{27-25} = 0b110;
- let Inst{24-21} = MajOp;
- let Inst{20-16} = src1;
- let Inst{13} = src2;
- let Inst{12} = 0b0;
- let Inst{7} = 0b0;
- let Inst{4-0} = dst;
- }
-
-let hasNewValue = 1 in {
- def L2_loadrb_pr : T_load_pr <"memb", IntRegs, 0b1000, ByteAccess>;
- def L2_loadrub_pr : T_load_pr <"memub", IntRegs, 0b1001, ByteAccess>;
- def L2_loadrh_pr : T_load_pr <"memh", IntRegs, 0b1010, HalfWordAccess>;
- def L2_loadruh_pr : T_load_pr <"memuh", IntRegs, 0b1011, HalfWordAccess>;
- def L2_loadri_pr : T_load_pr <"memw", IntRegs, 0b1100, WordAccess>;
-
- def L2_loadbzw2_pr : T_load_pr <"memubh", IntRegs, 0b0011, HalfWordAccess>;
-}
-
-def L2_loadrd_pr : T_load_pr <"memd", DoubleRegs, 0b1110, DoubleWordAccess>;
-def L2_loadbzw4_pr : T_load_pr <"memubh", DoubleRegs, 0b0101, WordAccess>;
-
-// Load predicate.
-let isExtendable = 1, opExtendable = 2, isExtentSigned = 1, opExtentBits = 13,
- isCodeGenOnly = 1, isPseudo = 1, hasSideEffects = 0 in
-def LDriw_pred : LDInst<(outs PredRegs:$dst),
- (ins IntRegs:$addr, s11_2Ext:$off),
- ".error \"should not emit\"", []>;
-// Load modifier.
-let isExtendable = 1, opExtendable = 2, isExtentSigned = 1, opExtentBits = 13,
- isCodeGenOnly = 1, isPseudo = 1, hasSideEffects = 0 in
-def LDriw_mod : LDInst<(outs ModRegs:$dst),
- (ins IntRegs:$addr, s11_2Ext:$off),
- ".error \"should not emit\"", []>;
-
-let Defs = [R29, R30, R31], Uses = [R30], hasSideEffects = 0 in
- def L2_deallocframe : LDInst<(outs), (ins),
- "deallocframe",
- []> {
- let IClass = 0b1001;
-
- let Inst{27-16} = 0b000000011110;
- let Inst{13} = 0b0;
- let Inst{4-0} = 0b11110;
-}
-
-// Load / Post increment circular addressing mode.
-let Uses = [CS], hasSideEffects = 0, addrMode = PostInc in
-class T_load_pcr<string mnemonic, RegisterClass RC, bits<4> MajOp>
- : LDInst <(outs RC:$dst, IntRegs:$_dst_),
- (ins IntRegs:$Rz, ModRegs:$Mu),
- "$dst = "#mnemonic#"($Rz ++ I:circ($Mu))", [],
- "$Rz = $_dst_" > {
- bits<5> dst;
- bits<5> Rz;
- bit Mu;
-
- let hasNewValue = !if (!eq(!cast<string>(RC), "DoubleRegs"), 0, 1);
- let IClass = 0b1001;
-
- let Inst{27-25} = 0b100;
- let Inst{24-21} = MajOp;
- let Inst{20-16} = Rz;
- let Inst{13} = Mu;
- let Inst{12} = 0b0;
- let Inst{9} = 0b1;
- let Inst{7} = 0b0;
- let Inst{4-0} = dst;
- }
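-
-// For example, L2_loadrb_pcr below assembles as
-// "$dst = memb($Rz ++ I:circ($Mu))", with the post-incremented base written
-// back through the "$Rz = $_dst_" constraint.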
-
-let accessSize = ByteAccess in {
- def L2_loadrb_pcr : T_load_pcr <"memb", IntRegs, 0b1000>;
- def L2_loadrub_pcr : T_load_pcr <"memub", IntRegs, 0b1001>;
-}
-
-let accessSize = HalfWordAccess in {
- def L2_loadrh_pcr : T_load_pcr <"memh", IntRegs, 0b1010>;
- def L2_loadruh_pcr : T_load_pcr <"memuh", IntRegs, 0b1011>;
- def L2_loadbsw2_pcr : T_load_pcr <"membh", IntRegs, 0b0001>;
- def L2_loadbzw2_pcr : T_load_pcr <"memubh", IntRegs, 0b0011>;
-}
-
-let accessSize = WordAccess in {
- def L2_loadri_pcr : T_load_pcr <"memw", IntRegs, 0b1100>;
- let hasNewValue = 0 in {
- def L2_loadbzw4_pcr : T_load_pcr <"memubh", DoubleRegs, 0b0101>;
- def L2_loadbsw4_pcr : T_load_pcr <"membh", DoubleRegs, 0b0111>;
- }
-}
-
-let accessSize = DoubleWordAccess in
-def L2_loadrd_pcr : T_load_pcr <"memd", DoubleRegs, 0b1110>;
-
-// Load-align / Post increment circular addressing mode.
-let Uses = [CS], hasSideEffects = 0, addrMode = PostInc in
-class T_loadalign_pcr<string mnemonic, bits<4> MajOp, MemAccessSize AccessSz >
- : LDInst <(outs DoubleRegs:$dst, IntRegs:$_dst_),
- (ins DoubleRegs:$_src_, IntRegs:$Rz, ModRegs:$Mu),
- "$dst = "#mnemonic#"($Rz ++ I:circ($Mu))", [],
- "$Rz = $_dst_, $dst = $_src_" > {
- bits<5> dst;
- bits<5> Rz;
- bit Mu;
-
- let accessSize = AccessSz;
- let IClass = 0b1001;
-
- let Inst{27-25} = 0b100;
- let Inst{24-21} = MajOp;
- let Inst{20-16} = Rz;
- let Inst{13} = Mu;
- let Inst{12} = 0b0;
- let Inst{9} = 0b1;
- let Inst{7} = 0b0;
- let Inst{4-0} = dst;
- }
-
-def L2_loadalignb_pcr : T_loadalign_pcr <"memb_fifo", 0b0100, ByteAccess>;
-def L2_loadalignh_pcr : T_loadalign_pcr <"memh_fifo", 0b0010, HalfWordAccess>;
-
-//===----------------------------------------------------------------------===//
-// Circular loads with immediate offset.
-//===----------------------------------------------------------------------===//
-let Uses = [CS], mayLoad = 1, hasSideEffects = 0, addrMode = PostInc in
-class T_load_pci <string mnemonic, RegisterClass RC,
- Operand ImmOp, bits<4> MajOp>
- : LDInstPI<(outs RC:$dst, IntRegs:$_dst_),
- (ins IntRegs:$Rz, ImmOp:$offset, ModRegs:$Mu),
- "$dst = "#mnemonic#"($Rz ++ #$offset:circ($Mu))", [],
- "$Rz = $_dst_"> {
- bits<5> dst;
- bits<5> Rz;
- bits<1> Mu;
- bits<7> offset;
- bits<4> offsetBits;
-
- string ImmOpStr = !cast<string>(ImmOp);
- let hasNewValue = !if (!eq(!cast<string>(RC), "DoubleRegs"), 0, 1);
- let offsetBits = !if (!eq(ImmOpStr, "s4_3Imm"), offset{6-3},
- !if (!eq(ImmOpStr, "s4_2Imm"), offset{5-2},
- !if (!eq(ImmOpStr, "s4_1Imm"), offset{4-1},
- /* s4_0Imm */ offset{3-0})));
- let IClass = 0b1001;
- let Inst{27-25} = 0b100;
- let Inst{24-21} = MajOp;
- let Inst{20-16} = Rz;
- let Inst{13} = Mu;
- let Inst{12} = 0b0;
- let Inst{9} = 0b0;
- let Inst{8-5} = offsetBits;
- let Inst{4-0} = dst;
- }
-
-// Byte variants of circ load
-let accessSize = ByteAccess in {
- def L2_loadrb_pci : T_load_pci <"memb", IntRegs, s4_0Imm, 0b1000>;
- def L2_loadrub_pci : T_load_pci <"memub", IntRegs, s4_0Imm, 0b1001>;
-}
-
-// Half word variants of circ load
-let accessSize = HalfWordAccess in {
- def L2_loadrh_pci : T_load_pci <"memh", IntRegs, s4_1Imm, 0b1010>;
- def L2_loadruh_pci : T_load_pci <"memuh", IntRegs, s4_1Imm, 0b1011>;
- def L2_loadbzw2_pci : T_load_pci <"memubh", IntRegs, s4_1Imm, 0b0011>;
- def L2_loadbsw2_pci : T_load_pci <"membh", IntRegs, s4_1Imm, 0b0001>;
-}
-
-// Word variants of circ load
-let accessSize = WordAccess in
-def L2_loadri_pci : T_load_pci <"memw", IntRegs, s4_2Imm, 0b1100>;
-
-let accessSize = WordAccess, hasNewValue = 0 in {
- def L2_loadbzw4_pci : T_load_pci <"memubh", DoubleRegs, s4_2Imm, 0b0101>;
- def L2_loadbsw4_pci : T_load_pci <"membh", DoubleRegs, s4_2Imm, 0b0111>;
-}
-
-let accessSize = DoubleWordAccess, hasNewValue = 0 in
-def L2_loadrd_pci : T_load_pci <"memd", DoubleRegs, s4_3Imm, 0b1110>;
-
-
-// TODO: memb_fifo and memh_fifo must take destination register as input.
-// One-off circ loads - not enough in common to break into a class.
-let accessSize = ByteAccess in
-def L2_loadalignb_pci : T_load_pci <"memb_fifo", DoubleRegs, s4_0Imm, 0b0100>;
-
-let accessSize = HalfWordAccess, opExtentAlign = 1 in
-def L2_loadalignh_pci : T_load_pci <"memh_fifo", DoubleRegs, s4_1Imm, 0b0010>;
-
-// L[24]_load[wd]_locked: Load word/double with lock.
-let isSoloAX = 1 in
-class T_load_locked <string mnemonic, RegisterClass RC>
- : LD0Inst <(outs RC:$dst),
- (ins IntRegs:$src),
- "$dst = "#mnemonic#"($src)"> {
- bits<5> dst;
- bits<5> src;
- let IClass = 0b1001;
- let Inst{27-21} = 0b0010000;
- let Inst{20-16} = src;
- let Inst{13-12} = !if (!eq(mnemonic, "memd_locked"), 0b01, 0b00);
- let Inst{5} = 0;
- let Inst{4-0} = dst;
-}
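-
-// For example, L4_loadd_locked below assembles as "$dst = memd_locked($src)"
-// and sets Inst{13-12} = 0b01 via the mnemonic test above.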
-let hasNewValue = 1, accessSize = WordAccess, opNewValue = 0 in
- def L2_loadw_locked : T_load_locked <"memw_locked", IntRegs>;
-let accessSize = DoubleWordAccess in
- def L4_loadd_locked : T_load_locked <"memd_locked", DoubleRegs>;
-
-// S[24]_store[wd]_locked: Store word/double conditionally.
-let isSoloAX = 1, isPredicateLate = 1 in
-class T_store_locked <string mnemonic, RegisterClass RC>
- : ST0Inst <(outs PredRegs:$Pd), (ins IntRegs:$Rs, RC:$Rt),
- mnemonic#"($Rs, $Pd) = $Rt"> {
- bits<2> Pd;
- bits<5> Rs;
- bits<5> Rt;
-
- let IClass = 0b1010;
- let Inst{27-23} = 0b00001;
- let Inst{22} = !if (!eq(mnemonic, "memw_locked"), 0b0, 0b1);
- let Inst{21} = 0b1;
- let Inst{20-16} = Rs;
- let Inst{12-8} = Rt;
- let Inst{1-0} = Pd;
-}
-
-let accessSize = WordAccess in
-def S2_storew_locked : T_store_locked <"memw_locked", IntRegs>;
-
-let accessSize = DoubleWordAccess in
-def S4_stored_locked : T_store_locked <"memd_locked", DoubleRegs>;
-
-//===----------------------------------------------------------------------===//
-// Bit-reversed loads with auto-increment register
-//===----------------------------------------------------------------------===//
-let hasSideEffects = 0, addrMode = PostInc in
-class T_load_pbr<string mnemonic, RegisterClass RC,
- MemAccessSize addrSize, bits<4> majOp>
- : LDInst
- <(outs RC:$dst, IntRegs:$_dst_),
- (ins IntRegs:$Rz, ModRegs:$Mu),
- "$dst = "#mnemonic#"($Rz ++ $Mu:brev)" ,
- [] , "$Rz = $_dst_" > {
-
- let accessSize = addrSize;
-
- bits<5> dst;
- bits<5> Rz;
- bits<1> Mu;
-
- let IClass = 0b1001;
-
- let Inst{27-25} = 0b111;
- let Inst{24-21} = majOp;
- let Inst{20-16} = Rz;
- let Inst{13} = Mu;
- let Inst{12} = 0b0;
- let Inst{7} = 0b0;
- let Inst{4-0} = dst;
- }
-
-let hasNewValue = 1, opNewValue = 0 in {
- def L2_loadrb_pbr : T_load_pbr <"memb", IntRegs, ByteAccess, 0b1000>;
- def L2_loadrub_pbr : T_load_pbr <"memub", IntRegs, ByteAccess, 0b1001>;
- def L2_loadrh_pbr : T_load_pbr <"memh", IntRegs, HalfWordAccess, 0b1010>;
- def L2_loadruh_pbr : T_load_pbr <"memuh", IntRegs, HalfWordAccess, 0b1011>;
- def L2_loadbsw2_pbr : T_load_pbr <"membh", IntRegs, HalfWordAccess, 0b0001>;
- def L2_loadbzw2_pbr : T_load_pbr <"memubh", IntRegs, HalfWordAccess, 0b0011>;
- def L2_loadri_pbr : T_load_pbr <"memw", IntRegs, WordAccess, 0b1100>;
-}
-
-def L2_loadbzw4_pbr : T_load_pbr <"memubh", DoubleRegs, WordAccess, 0b0101>;
-def L2_loadbsw4_pbr : T_load_pbr <"membh", DoubleRegs, WordAccess, 0b0111>;
-def L2_loadrd_pbr : T_load_pbr <"memd", DoubleRegs, DoubleWordAccess, 0b1110>;
-
-def L2_loadalignb_pbr :T_load_pbr <"memb_fifo", DoubleRegs, ByteAccess, 0b0100>;
-def L2_loadalignh_pbr :T_load_pbr <"memh_fifo", DoubleRegs,
- HalfWordAccess, 0b0010>;
-
-//===----------------------------------------------------------------------===//
-// LD -
-//===----------------------------------------------------------------------===//
-
-//===----------------------------------------------------------------------===//
-// MTYPE/ALU +
-//===----------------------------------------------------------------------===//
-//===----------------------------------------------------------------------===//
-// MTYPE/ALU -
-//===----------------------------------------------------------------------===//
-
-//===----------------------------------------------------------------------===//
-// MTYPE/COMPLEX +
-//===----------------------------------------------------------------------===//
-//===----------------------------------------------------------------------===//
-// MTYPE/COMPLEX -
-//===----------------------------------------------------------------------===//
-
-//===----------------------------------------------------------------------===//
-// MTYPE/MPYH +
-//===----------------------------------------------------------------------===//
-
-//===----------------------------------------------------------------------===//
-// Template Class
-// MPYS / Multiply signed/unsigned halfwords
-//Rd=mpy[u](Rs.[H|L],Rt.[H|L])[:<<1][:rnd][:sat]
-//===----------------------------------------------------------------------===//
-
-let hasNewValue = 1, opNewValue = 0 in
-class T_M2_mpy < bits<2> LHbits, bit isSat, bit isRnd,
- bit hasShift, bit isUnsigned>
- : MInst < (outs IntRegs:$Rd), (ins IntRegs:$Rs, IntRegs:$Rt),
- "$Rd = "#!if(isUnsigned,"mpyu","mpy")#"($Rs."#!if(LHbits{1},"h","l")
- #", $Rt."#!if(LHbits{0},"h)","l)")
- #!if(hasShift,":<<1","")
- #!if(isRnd,":rnd","")
- #!if(isSat,":sat",""),
- [], "", M_tc_3x_SLOT23 > {
- bits<5> Rd;
- bits<5> Rs;
- bits<5> Rt;
-
- let IClass = 0b1110;
-
- let Inst{27-24} = 0b1100;
- let Inst{23} = hasShift;
- let Inst{22} = isUnsigned;
- let Inst{21} = isRnd;
- let Inst{7} = isSat;
- let Inst{6-5} = LHbits;
- let Inst{4-0} = Rd;
- let Inst{20-16} = Rs;
- let Inst{12-8} = Rt;
- }
-
-//Rd=mpy(Rs.[H|L],Rt.[H|L])[:<<1]
-def M2_mpy_ll_s1: T_M2_mpy<0b00, 0, 0, 1, 0>;
-def M2_mpy_ll_s0: T_M2_mpy<0b00, 0, 0, 0, 0>;
-def M2_mpy_lh_s1: T_M2_mpy<0b01, 0, 0, 1, 0>;
-def M2_mpy_lh_s0: T_M2_mpy<0b01, 0, 0, 0, 0>;
-def M2_mpy_hl_s1: T_M2_mpy<0b10, 0, 0, 1, 0>;
-def M2_mpy_hl_s0: T_M2_mpy<0b10, 0, 0, 0, 0>;
-def M2_mpy_hh_s1: T_M2_mpy<0b11, 0, 0, 1, 0>;
-def M2_mpy_hh_s0: T_M2_mpy<0b11, 0, 0, 0, 0>;
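-
-// For example, M2_mpy_lh_s1 above assembles as "$Rd = mpy($Rs.l, $Rt.h):<<1":
-// LHbits = 0b01 selects Rs.l and Rt.h, and hasShift adds the :<<1 suffix.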
-
-//Rd=mpyu(Rs.[H|L],Rt.[H|L])[:<<1]
-def M2_mpyu_ll_s1: T_M2_mpy<0b00, 0, 0, 1, 1>;
-def M2_mpyu_ll_s0: T_M2_mpy<0b00, 0, 0, 0, 1>;
-def M2_mpyu_lh_s1: T_M2_mpy<0b01, 0, 0, 1, 1>;
-def M2_mpyu_lh_s0: T_M2_mpy<0b01, 0, 0, 0, 1>;
-def M2_mpyu_hl_s1: T_M2_mpy<0b10, 0, 0, 1, 1>;
-def M2_mpyu_hl_s0: T_M2_mpy<0b10, 0, 0, 0, 1>;
-def M2_mpyu_hh_s1: T_M2_mpy<0b11, 0, 0, 1, 1>;
-def M2_mpyu_hh_s0: T_M2_mpy<0b11, 0, 0, 0, 1>;
-
-//Rd=mpy(Rs.[H|L],Rt.[H|L])[:<<1]:rnd
-def M2_mpy_rnd_ll_s1: T_M2_mpy <0b00, 0, 1, 1, 0>;
-def M2_mpy_rnd_ll_s0: T_M2_mpy <0b00, 0, 1, 0, 0>;
-def M2_mpy_rnd_lh_s1: T_M2_mpy <0b01, 0, 1, 1, 0>;
-def M2_mpy_rnd_lh_s0: T_M2_mpy <0b01, 0, 1, 0, 0>;
-def M2_mpy_rnd_hl_s1: T_M2_mpy <0b10, 0, 1, 1, 0>;
-def M2_mpy_rnd_hl_s0: T_M2_mpy <0b10, 0, 1, 0, 0>;
-def M2_mpy_rnd_hh_s1: T_M2_mpy <0b11, 0, 1, 1, 0>;
-def M2_mpy_rnd_hh_s0: T_M2_mpy <0b11, 0, 1, 0, 0>;
-
-//Rd=mpy(Rs.[H|L],Rt.[H|L])[:<<1][:sat]
-//Rd=mpy(Rs.[H|L],Rt.[H|L])[:<<1][:rnd][:sat]
-let Defs = [USR_OVF] in {
- def M2_mpy_sat_ll_s1: T_M2_mpy <0b00, 1, 0, 1, 0>;
- def M2_mpy_sat_ll_s0: T_M2_mpy <0b00, 1, 0, 0, 0>;
- def M2_mpy_sat_lh_s1: T_M2_mpy <0b01, 1, 0, 1, 0>;
- def M2_mpy_sat_lh_s0: T_M2_mpy <0b01, 1, 0, 0, 0>;
- def M2_mpy_sat_hl_s1: T_M2_mpy <0b10, 1, 0, 1, 0>;
- def M2_mpy_sat_hl_s0: T_M2_mpy <0b10, 1, 0, 0, 0>;
- def M2_mpy_sat_hh_s1: T_M2_mpy <0b11, 1, 0, 1, 0>;
- def M2_mpy_sat_hh_s0: T_M2_mpy <0b11, 1, 0, 0, 0>;
-
- def M2_mpy_sat_rnd_ll_s1: T_M2_mpy <0b00, 1, 1, 1, 0>;
- def M2_mpy_sat_rnd_ll_s0: T_M2_mpy <0b00, 1, 1, 0, 0>;
- def M2_mpy_sat_rnd_lh_s1: T_M2_mpy <0b01, 1, 1, 1, 0>;
- def M2_mpy_sat_rnd_lh_s0: T_M2_mpy <0b01, 1, 1, 0, 0>;
- def M2_mpy_sat_rnd_hl_s1: T_M2_mpy <0b10, 1, 1, 1, 0>;
- def M2_mpy_sat_rnd_hl_s0: T_M2_mpy <0b10, 1, 1, 0, 0>;
- def M2_mpy_sat_rnd_hh_s1: T_M2_mpy <0b11, 1, 1, 1, 0>;
- def M2_mpy_sat_rnd_hh_s0: T_M2_mpy <0b11, 1, 1, 0, 0>;
-}
-
-//===----------------------------------------------------------------------===//
-// Template Class
-// MPYS / Multiply signed/unsigned halfwords and add the result to, or
-// subtract it from, the accumulator.
-//Rx [-+]= mpy[u](Rs.[H|L],Rt.[H|L])[:<<1][:sat]
-//===----------------------------------------------------------------------===//
-
-let hasNewValue = 1, opNewValue = 0 in
-class T_M2_mpy_acc < bits<2> LHbits, bit isSat, bit isNac,
- bit hasShift, bit isUnsigned >
- : MInst_acc<(outs IntRegs:$Rx), (ins IntRegs:$dst2, IntRegs:$Rs, IntRegs:$Rt),
- "$Rx "#!if(isNac,"-= ","+= ")#!if(isUnsigned,"mpyu","mpy")
- #"($Rs."#!if(LHbits{1},"h","l")
- #", $Rt."#!if(LHbits{0},"h)","l)")
- #!if(hasShift,":<<1","")
- #!if(isSat,":sat",""),
- [], "$dst2 = $Rx", M_tc_3x_SLOT23 > {
- bits<5> Rx;
- bits<5> Rs;
- bits<5> Rt;
-
- let IClass = 0b1110;
- let Inst{27-24} = 0b1110;
- let Inst{23} = hasShift;
- let Inst{22} = isUnsigned;
- let Inst{21} = isNac;
- let Inst{7} = isSat;
- let Inst{6-5} = LHbits;
- let Inst{4-0} = Rx;
- let Inst{20-16} = Rs;
- let Inst{12-8} = Rt;
- }
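-
-// For example, M2_mpy_nac_hl_s0 below assembles as
-// "$Rx -= mpy($Rs.h, $Rt.l)": isNac selects "-=" and LHbits = 0b10 selects
-// Rs.h and Rt.l.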
-
-//Rx += mpy(Rs.[H|L],Rt.[H|L])[:<<1]
-def M2_mpy_acc_ll_s1: T_M2_mpy_acc <0b00, 0, 0, 1, 0>;
-def M2_mpy_acc_ll_s0: T_M2_mpy_acc <0b00, 0, 0, 0, 0>;
-def M2_mpy_acc_lh_s1: T_M2_mpy_acc <0b01, 0, 0, 1, 0>;
-def M2_mpy_acc_lh_s0: T_M2_mpy_acc <0b01, 0, 0, 0, 0>;
-def M2_mpy_acc_hl_s1: T_M2_mpy_acc <0b10, 0, 0, 1, 0>;
-def M2_mpy_acc_hl_s0: T_M2_mpy_acc <0b10, 0, 0, 0, 0>;
-def M2_mpy_acc_hh_s1: T_M2_mpy_acc <0b11, 0, 0, 1, 0>;
-def M2_mpy_acc_hh_s0: T_M2_mpy_acc <0b11, 0, 0, 0, 0>;
-
-//Rx += mpyu(Rs.[H|L],Rt.[H|L])[:<<1]
-def M2_mpyu_acc_ll_s1: T_M2_mpy_acc <0b00, 0, 0, 1, 1>;
-def M2_mpyu_acc_ll_s0: T_M2_mpy_acc <0b00, 0, 0, 0, 1>;
-def M2_mpyu_acc_lh_s1: T_M2_mpy_acc <0b01, 0, 0, 1, 1>;
-def M2_mpyu_acc_lh_s0: T_M2_mpy_acc <0b01, 0, 0, 0, 1>;
-def M2_mpyu_acc_hl_s1: T_M2_mpy_acc <0b10, 0, 0, 1, 1>;
-def M2_mpyu_acc_hl_s0: T_M2_mpy_acc <0b10, 0, 0, 0, 1>;
-def M2_mpyu_acc_hh_s1: T_M2_mpy_acc <0b11, 0, 0, 1, 1>;
-def M2_mpyu_acc_hh_s0: T_M2_mpy_acc <0b11, 0, 0, 0, 1>;
-
-//Rx -= mpy(Rs.[H|L],Rt.[H|L])[:<<1]
-def M2_mpy_nac_ll_s1: T_M2_mpy_acc <0b00, 0, 1, 1, 0>;
-def M2_mpy_nac_ll_s0: T_M2_mpy_acc <0b00, 0, 1, 0, 0>;
-def M2_mpy_nac_lh_s1: T_M2_mpy_acc <0b01, 0, 1, 1, 0>;
-def M2_mpy_nac_lh_s0: T_M2_mpy_acc <0b01, 0, 1, 0, 0>;
-def M2_mpy_nac_hl_s1: T_M2_mpy_acc <0b10, 0, 1, 1, 0>;
-def M2_mpy_nac_hl_s0: T_M2_mpy_acc <0b10, 0, 1, 0, 0>;
-def M2_mpy_nac_hh_s1: T_M2_mpy_acc <0b11, 0, 1, 1, 0>;
-def M2_mpy_nac_hh_s0: T_M2_mpy_acc <0b11, 0, 1, 0, 0>;
-
-//Rx -= mpyu(Rs.[H|L],Rt.[H|L])[:<<1]
-def M2_mpyu_nac_ll_s1: T_M2_mpy_acc <0b00, 0, 1, 1, 1>;
-def M2_mpyu_nac_ll_s0: T_M2_mpy_acc <0b00, 0, 1, 0, 1>;
-def M2_mpyu_nac_lh_s1: T_M2_mpy_acc <0b01, 0, 1, 1, 1>;
-def M2_mpyu_nac_lh_s0: T_M2_mpy_acc <0b01, 0, 1, 0, 1>;
-def M2_mpyu_nac_hl_s1: T_M2_mpy_acc <0b10, 0, 1, 1, 1>;
-def M2_mpyu_nac_hl_s0: T_M2_mpy_acc <0b10, 0, 1, 0, 1>;
-def M2_mpyu_nac_hh_s1: T_M2_mpy_acc <0b11, 0, 1, 1, 1>;
-def M2_mpyu_nac_hh_s0: T_M2_mpy_acc <0b11, 0, 1, 0, 1>;
-
-//Rx += mpy(Rs.[H|L],Rt.[H|L])[:<<1]:sat
-def M2_mpy_acc_sat_ll_s1: T_M2_mpy_acc <0b00, 1, 0, 1, 0>;
-def M2_mpy_acc_sat_ll_s0: T_M2_mpy_acc <0b00, 1, 0, 0, 0>;
-def M2_mpy_acc_sat_lh_s1: T_M2_mpy_acc <0b01, 1, 0, 1, 0>;
-def M2_mpy_acc_sat_lh_s0: T_M2_mpy_acc <0b01, 1, 0, 0, 0>;
-def M2_mpy_acc_sat_hl_s1: T_M2_mpy_acc <0b10, 1, 0, 1, 0>;
-def M2_mpy_acc_sat_hl_s0: T_M2_mpy_acc <0b10, 1, 0, 0, 0>;
-def M2_mpy_acc_sat_hh_s1: T_M2_mpy_acc <0b11, 1, 0, 1, 0>;
-def M2_mpy_acc_sat_hh_s0: T_M2_mpy_acc <0b11, 1, 0, 0, 0>;
-
-//Rx -= mpy(Rs.[H|L],Rt.[H|L])[:<<1]:sat
-def M2_mpy_nac_sat_ll_s1: T_M2_mpy_acc <0b00, 1, 1, 1, 0>;
-def M2_mpy_nac_sat_ll_s0: T_M2_mpy_acc <0b00, 1, 1, 0, 0>;
-def M2_mpy_nac_sat_lh_s1: T_M2_mpy_acc <0b01, 1, 1, 1, 0>;
-def M2_mpy_nac_sat_lh_s0: T_M2_mpy_acc <0b01, 1, 1, 0, 0>;
-def M2_mpy_nac_sat_hl_s1: T_M2_mpy_acc <0b10, 1, 1, 1, 0>;
-def M2_mpy_nac_sat_hl_s0: T_M2_mpy_acc <0b10, 1, 1, 0, 0>;
-def M2_mpy_nac_sat_hh_s1: T_M2_mpy_acc <0b11, 1, 1, 1, 0>;
-def M2_mpy_nac_sat_hh_s0: T_M2_mpy_acc <0b11, 1, 1, 0, 0>;
-
-//===----------------------------------------------------------------------===//
-// Template Class
-// MPYS / Multiply signed/unsigned halfwords and add the result to, or
-// subtract it from, the 64-bit destination register.
-//Rxx [-+]= mpy[u](Rs.[H|L],Rt.[H|L])[:<<1][:sat]
-//===----------------------------------------------------------------------===//
-
-class T_M2_mpyd_acc < bits<2> LHbits, bit isNac, bit hasShift, bit isUnsigned>
- : MInst_acc<(outs DoubleRegs:$Rxx),
- (ins DoubleRegs:$dst2, IntRegs:$Rs, IntRegs:$Rt),
- "$Rxx "#!if(isNac,"-= ","+= ")#!if(isUnsigned,"mpyu","mpy")
- #"($Rs."#!if(LHbits{1},"h","l")
- #", $Rt."#!if(LHbits{0},"h)","l)")
- #!if(hasShift,":<<1",""),
- [], "$dst2 = $Rxx", M_tc_3x_SLOT23 > {
- bits<5> Rxx;
- bits<5> Rs;
- bits<5> Rt;
-
- let IClass = 0b1110;
-
- let Inst{27-24} = 0b0110;
- let Inst{23} = hasShift;
- let Inst{22} = isUnsigned;
- let Inst{21} = isNac;
- let Inst{7} = 0;
- let Inst{6-5} = LHbits;
- let Inst{4-0} = Rxx;
- let Inst{20-16} = Rs;
- let Inst{12-8} = Rt;
- }
-
-def M2_mpyd_acc_hh_s0: T_M2_mpyd_acc <0b11, 0, 0, 0>;
-def M2_mpyd_acc_hl_s0: T_M2_mpyd_acc <0b10, 0, 0, 0>;
-def M2_mpyd_acc_lh_s0: T_M2_mpyd_acc <0b01, 0, 0, 0>;
-def M2_mpyd_acc_ll_s0: T_M2_mpyd_acc <0b00, 0, 0, 0>;
-
-def M2_mpyd_acc_hh_s1: T_M2_mpyd_acc <0b11, 0, 1, 0>;
-def M2_mpyd_acc_hl_s1: T_M2_mpyd_acc <0b10, 0, 1, 0>;
-def M2_mpyd_acc_lh_s1: T_M2_mpyd_acc <0b01, 0, 1, 0>;
-def M2_mpyd_acc_ll_s1: T_M2_mpyd_acc <0b00, 0, 1, 0>;
-
-def M2_mpyd_nac_hh_s0: T_M2_mpyd_acc <0b11, 1, 0, 0>;
-def M2_mpyd_nac_hl_s0: T_M2_mpyd_acc <0b10, 1, 0, 0>;
-def M2_mpyd_nac_lh_s0: T_M2_mpyd_acc <0b01, 1, 0, 0>;
-def M2_mpyd_nac_ll_s0: T_M2_mpyd_acc <0b00, 1, 0, 0>;
-
-def M2_mpyd_nac_hh_s1: T_M2_mpyd_acc <0b11, 1, 1, 0>;
-def M2_mpyd_nac_hl_s1: T_M2_mpyd_acc <0b10, 1, 1, 0>;
-def M2_mpyd_nac_lh_s1: T_M2_mpyd_acc <0b01, 1, 1, 0>;
-def M2_mpyd_nac_ll_s1: T_M2_mpyd_acc <0b00, 1, 1, 0>;
-
-def M2_mpyud_acc_hh_s0: T_M2_mpyd_acc <0b11, 0, 0, 1>;
-def M2_mpyud_acc_hl_s0: T_M2_mpyd_acc <0b10, 0, 0, 1>;
-def M2_mpyud_acc_lh_s0: T_M2_mpyd_acc <0b01, 0, 0, 1>;
-def M2_mpyud_acc_ll_s0: T_M2_mpyd_acc <0b00, 0, 0, 1>;
-
-def M2_mpyud_acc_hh_s1: T_M2_mpyd_acc <0b11, 0, 1, 1>;
-def M2_mpyud_acc_hl_s1: T_M2_mpyd_acc <0b10, 0, 1, 1>;
-def M2_mpyud_acc_lh_s1: T_M2_mpyd_acc <0b01, 0, 1, 1>;
-def M2_mpyud_acc_ll_s1: T_M2_mpyd_acc <0b00, 0, 1, 1>;
-
-def M2_mpyud_nac_hh_s0: T_M2_mpyd_acc <0b11, 1, 0, 1>;
-def M2_mpyud_nac_hl_s0: T_M2_mpyd_acc <0b10, 1, 0, 1>;
-def M2_mpyud_nac_lh_s0: T_M2_mpyd_acc <0b01, 1, 0, 1>;
-def M2_mpyud_nac_ll_s0: T_M2_mpyd_acc <0b00, 1, 0, 1>;
-
-def M2_mpyud_nac_hh_s1: T_M2_mpyd_acc <0b11, 1, 1, 1>;
-def M2_mpyud_nac_hl_s1: T_M2_mpyd_acc <0b10, 1, 1, 1>;
-def M2_mpyud_nac_lh_s1: T_M2_mpyd_acc <0b01, 1, 1, 1>;
-def M2_mpyud_nac_ll_s1: T_M2_mpyd_acc <0b00, 1, 1, 1>;
-
-//===----------------------------------------------------------------------===//
-// Template Class -- Vector Multiply
-// Used for complex multiply (real or imaginary), dual multiply and even
-// halfwords.
-//===----------------------------------------------------------------------===//
-class T_M2_vmpy < string opc, bits<3> MajOp, bits<3> MinOp, bit hasShift,
- bit isRnd, bit isSat >
- : MInst <(outs DoubleRegs:$Rdd), (ins DoubleRegs:$Rss, DoubleRegs:$Rtt),
- "$Rdd = "#opc#"($Rss, $Rtt)"#!if(hasShift,":<<1","")
- #!if(isRnd,":rnd","")
- #!if(isSat,":sat",""),
- [] > {
- bits<5> Rdd;
- bits<5> Rss;
- bits<5> Rtt;
-
- let IClass = 0b1110;
-
- let Inst{27-24} = 0b1000;
- let Inst{23-21} = MajOp;
- let Inst{7-5} = MinOp;
- let Inst{4-0} = Rdd;
- let Inst{20-16} = Rss;
- let Inst{12-8} = Rtt;
- }
-
-// Vector complex multiply imaginary: Rdd=vcmpyi(Rss,Rtt)[:<<1]:sat
-let Defs = [USR_OVF] in {
-def M2_vcmpy_s1_sat_i: T_M2_vmpy <"vcmpyi", 0b110, 0b110, 1, 0, 1>;
-def M2_vcmpy_s0_sat_i: T_M2_vmpy <"vcmpyi", 0b010, 0b110, 0, 0, 1>;
-
-// Vector complex multiply real: Rdd=vcmpyr(Rss,Rtt)[:<<1]:sat
-def M2_vcmpy_s1_sat_r: T_M2_vmpy <"vcmpyr", 0b101, 0b110, 1, 0, 1>;
-def M2_vcmpy_s0_sat_r: T_M2_vmpy <"vcmpyr", 0b001, 0b110, 0, 0, 1>;
-
-// Vector dual multiply: Rdd=vdmpy(Rss,Rtt)[:<<1]:sat
-def M2_vdmpys_s1: T_M2_vmpy <"vdmpy", 0b100, 0b100, 1, 0, 1>;
-def M2_vdmpys_s0: T_M2_vmpy <"vdmpy", 0b000, 0b100, 0, 0, 1>;
-
-// Vector multiply even halfwords: Rdd=vmpyeh(Rss,Rtt)[:<<1]:sat
-def M2_vmpy2es_s1: T_M2_vmpy <"vmpyeh", 0b100, 0b110, 1, 0, 1>;
-def M2_vmpy2es_s0: T_M2_vmpy <"vmpyeh", 0b000, 0b110, 0, 0, 1>;
-
-//Rdd=vmpywoh(Rss,Rtt)[:<<1][:rnd]:sat
-def M2_mmpyh_s0: T_M2_vmpy <"vmpywoh", 0b000, 0b111, 0, 0, 1>;
-def M2_mmpyh_s1: T_M2_vmpy <"vmpywoh", 0b100, 0b111, 1, 0, 1>;
-def M2_mmpyh_rs0: T_M2_vmpy <"vmpywoh", 0b001, 0b111, 0, 1, 1>;
-def M2_mmpyh_rs1: T_M2_vmpy <"vmpywoh", 0b101, 0b111, 1, 1, 1>;
-
-//Rdd=vmpyweh(Rss,Rtt)[:<<1][:rnd]:sat
-def M2_mmpyl_s0: T_M2_vmpy <"vmpyweh", 0b000, 0b101, 0, 0, 1>;
-def M2_mmpyl_s1: T_M2_vmpy <"vmpyweh", 0b100, 0b101, 1, 0, 1>;
-def M2_mmpyl_rs0: T_M2_vmpy <"vmpyweh", 0b001, 0b101, 0, 1, 1>;
-def M2_mmpyl_rs1: T_M2_vmpy <"vmpyweh", 0b101, 0b101, 1, 1, 1>;
-
-//Rdd=vmpywouh(Rss,Rtt)[:<<1][:rnd]:sat
-def M2_mmpyuh_s0: T_M2_vmpy <"vmpywouh", 0b010, 0b111, 0, 0, 1>;
-def M2_mmpyuh_s1: T_M2_vmpy <"vmpywouh", 0b110, 0b111, 1, 0, 1>;
-def M2_mmpyuh_rs0: T_M2_vmpy <"vmpywouh", 0b011, 0b111, 0, 1, 1>;
-def M2_mmpyuh_rs1: T_M2_vmpy <"vmpywouh", 0b111, 0b111, 1, 1, 1>;
-
-//Rdd=vmpyweuh(Rss,Rtt)[:<<1][:rnd]:sat
-def M2_mmpyul_s0: T_M2_vmpy <"vmpyweuh", 0b010, 0b101, 0, 0, 1>;
-def M2_mmpyul_s1: T_M2_vmpy <"vmpyweuh", 0b110, 0b101, 1, 0, 1>;
-def M2_mmpyul_rs0: T_M2_vmpy <"vmpyweuh", 0b011, 0b101, 0, 1, 1>;
-def M2_mmpyul_rs1: T_M2_vmpy <"vmpyweuh", 0b111, 0b101, 1, 1, 1>;
-}
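-
-// For example, M2_vdmpys_s1 above assembles as
-// "$Rdd = vdmpy($Rss, $Rtt):<<1:sat".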
-
-let hasNewValue = 1, opNewValue = 0 in
-class T_MType_mpy <string mnemonic, bits<4> RegTyBits, RegisterClass RC,
- bits<3> MajOp, bits<3> MinOp, bit isSat = 0, bit isRnd = 0,
- string op2Suffix = "", bit isRaw = 0, bit isHi = 0 >
- : MInst <(outs IntRegs:$dst), (ins RC:$src1, RC:$src2),
- "$dst = "#mnemonic
- #"($src1, $src2"#op2Suffix#")"
- #!if(MajOp{2}, ":<<1", "")
- #!if(isRnd, ":rnd", "")
- #!if(isSat, ":sat", "")
- #!if(isRaw, !if(isHi, ":raw:hi", ":raw:lo"), ""), [] > {
- bits<5> dst;
- bits<5> src1;
- bits<5> src2;
-
- let IClass = 0b1110;
-
- let Inst{27-24} = RegTyBits;
- let Inst{23-21} = MajOp;
- let Inst{20-16} = src1;
- let Inst{13} = 0b0;
- let Inst{12-8} = src2;
- let Inst{7-5} = MinOp;
- let Inst{4-0} = dst;
- }
-
-class T_MType_vrcmpy <string mnemonic, bits<3> MajOp, bits<3> MinOp, bit isHi>
- : T_MType_mpy <mnemonic, 0b1001, DoubleRegs, MajOp, MinOp, 1, 1, "", 1, isHi>;
-
-class T_MType_dd <string mnemonic, bits<3> MajOp, bits<3> MinOp,
- bit isSat = 0, bit isRnd = 0 >
- : T_MType_mpy <mnemonic, 0b1001, DoubleRegs, MajOp, MinOp, isSat, isRnd>;
-
-class T_MType_rr1 <string mnemonic, bits<3> MajOp, bits<3> MinOp,
- bit isSat = 0, bit isRnd = 0 >
- : T_MType_mpy<mnemonic, 0b1101, IntRegs, MajOp, MinOp, isSat, isRnd>;
-
-class T_MType_rr2 <string mnemonic, bits<3> MajOp, bits<3> MinOp,
- bit isSat = 0, bit isRnd = 0, string op2str = "" >
- : T_MType_mpy<mnemonic, 0b1101, IntRegs, MajOp, MinOp, isSat, isRnd, op2str>;
-
-def M2_vradduh : T_MType_dd <"vradduh", 0b000, 0b001, 0, 0>;
-def M2_vdmpyrs_s0 : T_MType_dd <"vdmpy", 0b000, 0b000, 1, 1>;
-def M2_vdmpyrs_s1 : T_MType_dd <"vdmpy", 0b100, 0b000, 1, 1>;
-
-let CextOpcode = "mpyi", InputType = "reg" in
-def M2_mpyi : T_MType_rr1 <"mpyi", 0b000, 0b000>, ImmRegRel;
-
-def M2_mpy_up : T_MType_rr1 <"mpy", 0b000, 0b001>;
-def M2_mpyu_up : T_MType_rr1 <"mpyu", 0b010, 0b001>;
-
-def M2_dpmpyss_rnd_s0 : T_MType_rr1 <"mpy", 0b001, 0b001, 0, 1>;
-
-def M2_vmpy2s_s0pack : T_MType_rr1 <"vmpyh", 0b001, 0b111, 1, 1>;
-def M2_vmpy2s_s1pack : T_MType_rr1 <"vmpyh", 0b101, 0b111, 1, 1>;
-
-def M2_hmmpyh_rs1 : T_MType_rr2 <"mpy", 0b101, 0b100, 1, 1, ".h">;
-def M2_hmmpyl_rs1 : T_MType_rr2 <"mpy", 0b111, 0b100, 1, 1, ".l">;
-
-def M2_cmpyrs_s0 : T_MType_rr2 <"cmpy", 0b001, 0b110, 1, 1>;
-def M2_cmpyrs_s1 : T_MType_rr2 <"cmpy", 0b101, 0b110, 1, 1>;
-def M2_cmpyrsc_s0 : T_MType_rr2 <"cmpy", 0b011, 0b110, 1, 1, "*">;
-def M2_cmpyrsc_s1 : T_MType_rr2 <"cmpy", 0b111, 0b110, 1, 1, "*">;
-
-// V4 Instructions
-def M2_vraddh : T_MType_dd <"vraddh", 0b001, 0b111, 0>;
-def M2_mpysu_up : T_MType_rr1 <"mpysu", 0b011, 0b001, 0>;
-def M2_mpy_up_s1 : T_MType_rr1 <"mpy", 0b101, 0b010, 0>;
-def M2_mpy_up_s1_sat : T_MType_rr1 <"mpy", 0b111, 0b000, 1>;
-
-def M2_hmmpyh_s1 : T_MType_rr2 <"mpy", 0b101, 0b000, 1, 0, ".h">;
-def M2_hmmpyl_s1 : T_MType_rr2 <"mpy", 0b101, 0b001, 1, 0, ".l">;
-
-let hasNewValue = 1, opNewValue = 0 in
-class T_MType_mpy_ri <bit isNeg, Operand ImmOp, list<dag> pattern>
- : MInst < (outs IntRegs:$Rd), (ins IntRegs:$Rs, ImmOp:$u8),
- "$Rd ="#!if(isNeg, "- ", "+ ")#"mpyi($Rs, #$u8)" ,
- pattern, "", M_tc_3x_SLOT23> {
- bits<5> Rd;
- bits<5> Rs;
- bits<8> u8;
-
- let IClass = 0b1110;
-
- let Inst{27-24} = 0b0000;
- let Inst{23} = isNeg;
- let Inst{13} = 0b0;
- let Inst{4-0} = Rd;
- let Inst{20-16} = Rs;
- let Inst{12-5} = u8;
- }
-
-let isExtendable = 1, opExtentBits = 8, opExtendable = 2 in
-def M2_mpysip : T_MType_mpy_ri <0, u8_0Ext, []>;
-
-def M2_mpysin : T_MType_mpy_ri <1, u8_0Imm, []>;
-
-// Assembler mapped to M2_mpyi
-let isAsmParserOnly = 1 in
-def M2_mpyui : MInst<(outs IntRegs:$dst),
- (ins IntRegs:$src1, IntRegs:$src2),
- "$dst = mpyui($src1, $src2)">;
-
-// Rd=mpyi(Rs,#m9)
-// s9 is NOT the same as m9, but it works so far.
-// Assembler maps to either Rd=+mpyi(Rs,#u8) or Rd=-mpyi(Rs,#u8)
-// depending on the value of m9. See Arch Spec.
-let isExtendable = 1, opExtendable = 2, isExtentSigned = 1, opExtentBits = 9,
- CextOpcode = "mpyi", InputType = "imm", hasNewValue = 1,
- isAsmParserOnly = 1 in
-def M2_mpysmi : MInst<(outs IntRegs:$dst), (ins IntRegs:$src1, s9_0Ext:$src2),
- "$dst = mpyi($src1, #$src2)", []>, ImmRegRel;
-
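-// For example, a negative m9 such as #-5 maps to the M2_mpysin form above
-// ("$Rd =- mpyi($Rs, #5)"); a non-negative m9 maps to M2_mpysip.
-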
-let hasNewValue = 1, isExtendable = 1, opExtentBits = 8, opExtendable = 3,
- InputType = "imm" in
-class T_MType_acc_ri <string mnemonic, bits<3> MajOp, Operand ImmOp,
- list<dag> pattern = []>
- : MInst < (outs IntRegs:$dst), (ins IntRegs:$src1, IntRegs:$src2, ImmOp:$src3),
- "$dst "#mnemonic#"($src2, #$src3)",
- pattern, "$src1 = $dst", M_tc_2_SLOT23> {
- bits<5> dst;
- bits<5> src2;
- bits<8> src3;
-
- let IClass = 0b1110;
-
- let Inst{27-26} = 0b00;
- let Inst{25-23} = MajOp;
- let Inst{20-16} = src2;
- let Inst{13} = 0b0;
- let Inst{12-5} = src3;
- let Inst{4-0} = dst;
- }
-
-let InputType = "reg", hasNewValue = 1 in
-class T_MType_acc_rr <string mnemonic, bits<3> MajOp, bits<3> MinOp,
- bit isSwap = 0, list<dag> pattern = [], bit hasNot = 0,
- bit isSat = 0, bit isShift = 0>
- : MInst < (outs IntRegs:$dst),
- (ins IntRegs:$src1, IntRegs:$src2, IntRegs:$src3),
- "$dst "#mnemonic#"($src2, "#!if(hasNot, "~$src3)","$src3)")
- #!if(isShift, ":<<1", "")
- #!if(isSat, ":sat", ""),
- pattern, "$src1 = $dst", M_tc_2_SLOT23 > {
- bits<5> dst;
- bits<5> src2;
- bits<5> src3;
-
- let IClass = 0b1110;
-
- let Inst{27-24} = 0b1111;
- let Inst{23-21} = MajOp;
- let Inst{20-16} = !if(isSwap, src3, src2);
- let Inst{13} = 0b0;
- let Inst{12-8} = !if(isSwap, src2, src3);
- let Inst{7-5} = MinOp;
- let Inst{4-0} = dst;
- }
-
-let CextOpcode = "MPYI_acc", Itinerary = M_tc_3x_SLOT23 in {
- def M2_macsip : T_MType_acc_ri <"+= mpyi", 0b010, u8_0Ext, []>, ImmRegRel;
-
- def M2_maci : T_MType_acc_rr <"+= mpyi", 0b000, 0b000, 0, []>, ImmRegRel;
-}
-
-let CextOpcode = "ADD_acc" in {
- let isExtentSigned = 1 in
- def M2_accii : T_MType_acc_ri <"+= add", 0b100, s8_0Ext, []>, ImmRegRel;
-
- def M2_acci : T_MType_acc_rr <"+= add", 0b000, 0b001, 0, []>, ImmRegRel;
-}
-
-let CextOpcode = "SUB_acc" in {
- let isExtentSigned = 1 in
- def M2_naccii : T_MType_acc_ri <"-= add", 0b101, s8_0Ext>, ImmRegRel;
-
- def M2_nacci : T_MType_acc_rr <"-= add", 0b100, 0b001, 0>, ImmRegRel;
-}
-
-let Itinerary = M_tc_3x_SLOT23 in
-def M2_macsin : T_MType_acc_ri <"-= mpyi", 0b011, u8_0Ext>;
-
-def M2_xor_xacc : T_MType_acc_rr < "^= xor", 0b100, 0b011, 0>;
-def M2_subacc : T_MType_acc_rr <"+= sub", 0b000, 0b011, 1>;
-
-//===----------------------------------------------------------------------===//
-// Template Class -- XType Vector Instructions
-//===----------------------------------------------------------------------===//
-class T_XTYPE_Vect < string opc, bits<3> MajOp, bits<3> MinOp, bit isConj >
- : MInst <(outs DoubleRegs:$Rdd), (ins DoubleRegs:$Rss, DoubleRegs:$Rtt),
- "$Rdd = "#opc#"($Rss, $Rtt"#!if(isConj,"*)",")"),
- [] > {
- bits<5> Rdd;
- bits<5> Rss;
- bits<5> Rtt;
-
- let IClass = 0b1110;
-
- let Inst{27-24} = 0b1000;
- let Inst{23-21} = MajOp;
- let Inst{7-5} = MinOp;
- let Inst{4-0} = Rdd;
- let Inst{20-16} = Rss;
- let Inst{12-8} = Rtt;
- }
-
-class T_XTYPE_Vect_acc < string opc, bits<3> MajOp, bits<3> MinOp, bit isConj >
- : MInst <(outs DoubleRegs:$Rdd),
- (ins DoubleRegs:$dst2, DoubleRegs:$Rss, DoubleRegs:$Rtt),
- "$Rdd += "#opc#"($Rss, $Rtt"#!if(isConj,"*)",")"),
- [], "$dst2 = $Rdd",M_tc_3x_SLOT23 > {
- bits<5> Rdd;
- bits<5> Rss;
- bits<5> Rtt;
-
- let IClass = 0b1110;
-
- let Inst{27-24} = 0b1010;
- let Inst{23-21} = MajOp;
- let Inst{7-5} = MinOp;
- let Inst{4-0} = Rdd;
- let Inst{20-16} = Rss;
- let Inst{12-8} = Rtt;
- }
-
-class T_XTYPE_Vect_diff < bits<3> MajOp, string opc >
- : MInst <(outs DoubleRegs:$Rdd), (ins DoubleRegs:$Rtt, DoubleRegs:$Rss),
- "$Rdd = "#opc#"($Rtt, $Rss)",
- [], "",M_tc_2_SLOT23 > {
- bits<5> Rdd;
- bits<5> Rss;
- bits<5> Rtt;
-
- let IClass = 0b1110;
-
- let Inst{27-24} = 0b1000;
- let Inst{23-21} = MajOp;
- let Inst{7-5} = 0b000;
- let Inst{4-0} = Rdd;
- let Inst{20-16} = Rss;
- let Inst{12-8} = Rtt;
- }
-
-// Vector reduce add unsigned bytes: Rdd[+]=vraddub(Rss,Rtt)
-def A2_vraddub: T_XTYPE_Vect <"vraddub", 0b010, 0b001, 0>;
-def A2_vraddub_acc: T_XTYPE_Vect_acc <"vraddub", 0b010, 0b001, 0>;
-
-// Vector sum of absolute differences unsigned bytes: Rdd[+]=vrsadub(Rss,Rtt)
-def A2_vrsadub: T_XTYPE_Vect <"vrsadub", 0b010, 0b010, 0>;
-def A2_vrsadub_acc: T_XTYPE_Vect_acc <"vrsadub", 0b010, 0b010, 0>;
-
-// Vector absolute difference: Rdd=vabsdiffh(Rtt,Rss)
-def M2_vabsdiffh: T_XTYPE_Vect_diff<0b011, "vabsdiffh">;
-
-// Vector absolute difference words: Rdd=vabsdiffw(Rtt,Rss)
-def M2_vabsdiffw: T_XTYPE_Vect_diff<0b001, "vabsdiffw">;
-
-// Vector reduce complex multiply real or imaginary:
-// Rdd[+]=vrcmpy[ir](Rss,Rtt[*])
-def M2_vrcmpyi_s0: T_XTYPE_Vect <"vrcmpyi", 0b000, 0b000, 0>;
-def M2_vrcmpyi_s0c: T_XTYPE_Vect <"vrcmpyi", 0b010, 0b000, 1>;
-def M2_vrcmaci_s0: T_XTYPE_Vect_acc <"vrcmpyi", 0b000, 0b000, 0>;
-def M2_vrcmaci_s0c: T_XTYPE_Vect_acc <"vrcmpyi", 0b010, 0b000, 1>;
-
-def M2_vrcmpyr_s0: T_XTYPE_Vect <"vrcmpyr", 0b000, 0b001, 0>;
-def M2_vrcmpyr_s0c: T_XTYPE_Vect <"vrcmpyr", 0b011, 0b001, 1>;
-def M2_vrcmacr_s0: T_XTYPE_Vect_acc <"vrcmpyr", 0b000, 0b001, 0>;
-def M2_vrcmacr_s0c: T_XTYPE_Vect_acc <"vrcmpyr", 0b011, 0b001, 1>;
-
-// Vector reduce halfwords:
-// Rdd[+]=vrmpyh(Rss,Rtt)
-def M2_vrmpy_s0: T_XTYPE_Vect <"vrmpyh", 0b000, 0b010, 0>;
-def M2_vrmac_s0: T_XTYPE_Vect_acc <"vrmpyh", 0b000, 0b010, 0>;
-
-//===----------------------------------------------------------------------===//
-// Template Class -- Vector Multiply with accumulation.
-// Used for complex multiply (real or imaginary), dual multiply and even
-// halfwords.
-//===----------------------------------------------------------------------===//
-let Defs = [USR_OVF] in
-class T_M2_vmpy_acc_sat < string opc, bits<3> MajOp, bits<3> MinOp,
- bit hasShift, bit isRnd >
- : MInst <(outs DoubleRegs:$Rxx),
- (ins DoubleRegs:$dst2, DoubleRegs:$Rss, DoubleRegs:$Rtt),
- "$Rxx += "#opc#"($Rss, $Rtt)"#!if(hasShift,":<<1","")
- #!if(isRnd,":rnd","")#":sat",
- [], "$dst2 = $Rxx",M_tc_3x_SLOT23 > {
- bits<5> Rxx;
- bits<5> Rss;
- bits<5> Rtt;
-
- let IClass = 0b1110;
-
- let Inst{27-24} = 0b1010;
- let Inst{23-21} = MajOp;
- let Inst{7-5} = MinOp;
- let Inst{4-0} = Rxx;
- let Inst{20-16} = Rss;
- let Inst{12-8} = Rtt;
- }
-
-class T_M2_vmpy_acc < string opc, bits<3> MajOp, bits<3> MinOp,
- bit hasShift, bit isRnd >
- : MInst <(outs DoubleRegs:$Rxx),
- (ins DoubleRegs:$dst2, DoubleRegs:$Rss, DoubleRegs:$Rtt),
- "$Rxx += "#opc#"($Rss, $Rtt)"#!if(hasShift,":<<1","")
- #!if(isRnd,":rnd",""),
- [], "$dst2 = $Rxx",M_tc_3x_SLOT23 > {
- bits<5> Rxx;
- bits<5> Rss;
- bits<5> Rtt;
-
- let IClass = 0b1110;
-
- let Inst{27-24} = 0b1010;
- let Inst{23-21} = MajOp;
- let Inst{7-5} = MinOp;
- let Inst{4-0} = Rxx;
- let Inst{20-16} = Rss;
- let Inst{12-8} = Rtt;
- }
-
-// Vector multiply word by signed half with accumulation
-// Rxx+=vmpyw[eo]h(Rss,Rtt)[:<<1][:rnd]:sat
-def M2_mmacls_s1: T_M2_vmpy_acc_sat <"vmpyweh", 0b100, 0b101, 1, 0>;
-def M2_mmacls_s0: T_M2_vmpy_acc_sat <"vmpyweh", 0b000, 0b101, 0, 0>;
-def M2_mmacls_rs1: T_M2_vmpy_acc_sat <"vmpyweh", 0b101, 0b101, 1, 1>;
-def M2_mmacls_rs0: T_M2_vmpy_acc_sat <"vmpyweh", 0b001, 0b101, 0, 1>;
-
-def M2_mmachs_s1: T_M2_vmpy_acc_sat <"vmpywoh", 0b100, 0b111, 1, 0>;
-def M2_mmachs_s0: T_M2_vmpy_acc_sat <"vmpywoh", 0b000, 0b111, 0, 0>;
-def M2_mmachs_rs1: T_M2_vmpy_acc_sat <"vmpywoh", 0b101, 0b111, 1, 1>;
-def M2_mmachs_rs0: T_M2_vmpy_acc_sat <"vmpywoh", 0b001, 0b111, 0, 1>;
-
-// Vector multiply word by unsigned half with accumulation
-// Rxx+=vmpyw[eo]uh(Rss,Rtt)[:<<1][:rnd]:sat
-def M2_mmaculs_s1: T_M2_vmpy_acc_sat <"vmpyweuh", 0b110, 0b101, 1, 0>;
-def M2_mmaculs_s0: T_M2_vmpy_acc_sat <"vmpyweuh", 0b010, 0b101, 0, 0>;
-def M2_mmaculs_rs1: T_M2_vmpy_acc_sat <"vmpyweuh", 0b111, 0b101, 1, 1>;
-def M2_mmaculs_rs0: T_M2_vmpy_acc_sat <"vmpyweuh", 0b011, 0b101, 0, 1>;
-
-def M2_mmacuhs_s1: T_M2_vmpy_acc_sat <"vmpywouh", 0b110, 0b111, 1, 0>;
-def M2_mmacuhs_s0: T_M2_vmpy_acc_sat <"vmpywouh", 0b010, 0b111, 0, 0>;
-def M2_mmacuhs_rs1: T_M2_vmpy_acc_sat <"vmpywouh", 0b111, 0b111, 1, 1>;
-def M2_mmacuhs_rs0: T_M2_vmpy_acc_sat <"vmpywouh", 0b011, 0b111, 0, 1>;
-
-// Vector multiply even halfwords with accumulation
-// Rxx+=vmpyeh(Rss,Rtt)[:<<1][:sat]
-def M2_vmac2es: T_M2_vmpy_acc <"vmpyeh", 0b001, 0b010, 0, 0>;
-def M2_vmac2es_s1: T_M2_vmpy_acc_sat <"vmpyeh", 0b100, 0b110, 1, 0>;
-def M2_vmac2es_s0: T_M2_vmpy_acc_sat <"vmpyeh", 0b000, 0b110, 0, 0>;
-
-// Vector dual multiply with accumulation
-// Rxx+=vdmpy(Rss,Rtt)[:sat]
-def M2_vdmacs_s1: T_M2_vmpy_acc_sat <"vdmpy", 0b100, 0b100, 1, 0>;
-def M2_vdmacs_s0: T_M2_vmpy_acc_sat <"vdmpy", 0b000, 0b100, 0, 0>;
-
-// Vector complex multiply real or imaginary with accumulation
-// Rxx+=vcmpy[ir](Rss,Rtt):sat
-def M2_vcmac_s0_sat_r: T_M2_vmpy_acc_sat <"vcmpyr", 0b001, 0b100, 0, 0>;
-def M2_vcmac_s0_sat_i: T_M2_vmpy_acc_sat <"vcmpyi", 0b010, 0b100, 0, 0>;
-
-//===----------------------------------------------------------------------===//
-// Template Class -- Multiply signed/unsigned halfwords into a 64-bit
-// result, with and without rounding
-//===----------------------------------------------------------------------===//
-class T_M2_mpyd < bits<2> LHbits, bit isRnd, bit hasShift, bit isUnsigned >
- : MInst < (outs DoubleRegs:$Rdd), (ins IntRegs:$Rs, IntRegs:$Rt),
- "$Rdd = "#!if(isUnsigned,"mpyu","mpy")#"($Rs."#!if(LHbits{1},"h","l")
- #", $Rt."#!if(LHbits{0},"h)","l)")
- #!if(hasShift,":<<1","")
- #!if(isRnd,":rnd",""),
- [] > {
- bits<5> Rdd;
- bits<5> Rs;
- bits<5> Rt;
-
- let IClass = 0b1110;
-
- let Inst{27-24} = 0b0100;
- let Inst{23} = hasShift;
- let Inst{22} = isUnsigned;
- let Inst{21} = isRnd;
- let Inst{6-5} = LHbits;
- let Inst{4-0} = Rdd;
- let Inst{20-16} = Rs;
- let Inst{12-8} = Rt;
-}
-
-def M2_mpyd_hh_s0: T_M2_mpyd<0b11, 0, 0, 0>;
-def M2_mpyd_hl_s0: T_M2_mpyd<0b10, 0, 0, 0>;
-def M2_mpyd_lh_s0: T_M2_mpyd<0b01, 0, 0, 0>;
-def M2_mpyd_ll_s0: T_M2_mpyd<0b00, 0, 0, 0>;
-
-def M2_mpyd_hh_s1: T_M2_mpyd<0b11, 0, 1, 0>;
-def M2_mpyd_hl_s1: T_M2_mpyd<0b10, 0, 1, 0>;
-def M2_mpyd_lh_s1: T_M2_mpyd<0b01, 0, 1, 0>;
-def M2_mpyd_ll_s1: T_M2_mpyd<0b00, 0, 1, 0>;
-
-def M2_mpyd_rnd_hh_s0: T_M2_mpyd<0b11, 1, 0, 0>;
-def M2_mpyd_rnd_hl_s0: T_M2_mpyd<0b10, 1, 0, 0>;
-def M2_mpyd_rnd_lh_s0: T_M2_mpyd<0b01, 1, 0, 0>;
-def M2_mpyd_rnd_ll_s0: T_M2_mpyd<0b00, 1, 0, 0>;
-
-def M2_mpyd_rnd_hh_s1: T_M2_mpyd<0b11, 1, 1, 0>;
-def M2_mpyd_rnd_hl_s1: T_M2_mpyd<0b10, 1, 1, 0>;
-def M2_mpyd_rnd_lh_s1: T_M2_mpyd<0b01, 1, 1, 0>;
-def M2_mpyd_rnd_ll_s1: T_M2_mpyd<0b00, 1, 1, 0>;
-
-//Rdd=mpyu(Rs.[H|L],Rt.[H|L])[:<<1]
-def M2_mpyud_hh_s0: T_M2_mpyd<0b11, 0, 0, 1>;
-def M2_mpyud_hl_s0: T_M2_mpyd<0b10, 0, 0, 1>;
-def M2_mpyud_lh_s0: T_M2_mpyd<0b01, 0, 0, 1>;
-def M2_mpyud_ll_s0: T_M2_mpyd<0b00, 0, 0, 1>;
-
-def M2_mpyud_hh_s1: T_M2_mpyd<0b11, 0, 1, 1>;
-def M2_mpyud_hl_s1: T_M2_mpyd<0b10, 0, 1, 1>;
-def M2_mpyud_lh_s1: T_M2_mpyd<0b01, 0, 1, 1>;
-def M2_mpyud_ll_s1: T_M2_mpyd<0b00, 0, 1, 1>;
-
-//===----------------------------------------------------------------------===//
-// Template Class for xtype mpy:
-// Vector multiply
-// Complex multiply
-// Multiply 32x32 and use full result
-//===----------------------------------------------------------------------===//
-let hasSideEffects = 0 in
-class T_XTYPE_mpy64 <string mnemonic, bits<3> MajOp, bits<3> MinOp,
- bit isSat, bit hasShift, bit isConj>
- : MInst <(outs DoubleRegs:$Rdd),
- (ins IntRegs:$Rs, IntRegs:$Rt),
- "$Rdd = "#mnemonic#"($Rs, $Rt"#!if(isConj,"*)",")")
- #!if(hasShift,":<<1","")
- #!if(isSat,":sat",""),
- [] > {
- bits<5> Rdd;
- bits<5> Rs;
- bits<5> Rt;
-
- let IClass = 0b1110;
-
- let Inst{27-24} = 0b0101;
- let Inst{23-21} = MajOp;
- let Inst{20-16} = Rs;
- let Inst{12-8} = Rt;
- let Inst{7-5} = MinOp;
- let Inst{4-0} = Rdd;
- }
-
-//===----------------------------------------------------------------------===//
-// Template Class for xtype mpy with accumulation into 64-bit:
-// Vector multiply
-// Complex multiply
-// Multiply 32x32 and use full result
-//===----------------------------------------------------------------------===//
-class T_XTYPE_mpy64_acc <string op1, string op2, bits<3> MajOp, bits<3> MinOp,
- bit isSat, bit hasShift, bit isConj>
- : MInst <(outs DoubleRegs:$Rxx),
- (ins DoubleRegs:$dst2, IntRegs:$Rs, IntRegs:$Rt),
- "$Rxx "#op2#"= "#op1#"($Rs, $Rt"#!if(isConj,"*)",")")
- #!if(hasShift,":<<1","")
- #!if(isSat,":sat",""),
- [], "$dst2 = $Rxx" > {
- bits<5> Rxx;
- bits<5> Rs;
- bits<5> Rt;
-
- let IClass = 0b1110;
-
- let Inst{27-24} = 0b0111;
- let Inst{23-21} = MajOp;
- let Inst{20-16} = Rs;
- let Inst{12-8} = Rt;
- let Inst{7-5} = MinOp;
- let Inst{4-0} = Rxx;
- }
-
-// MPY - Multiply and use full result
-// Rdd = mpy[u](Rs,Rt)
-def M2_dpmpyss_s0 : T_XTYPE_mpy64 < "mpy", 0b000, 0b000, 0, 0, 0>;
-def M2_dpmpyuu_s0 : T_XTYPE_mpy64 < "mpyu", 0b010, 0b000, 0, 0, 0>;
-
-// Rxx[+-]= mpy[u](Rs,Rt)
-def M2_dpmpyss_acc_s0 : T_XTYPE_mpy64_acc < "mpy", "+", 0b000, 0b000, 0, 0, 0>;
-def M2_dpmpyss_nac_s0 : T_XTYPE_mpy64_acc < "mpy", "-", 0b001, 0b000, 0, 0, 0>;
-def M2_dpmpyuu_acc_s0 : T_XTYPE_mpy64_acc < "mpyu", "+", 0b010, 0b000, 0, 0, 0>;
-def M2_dpmpyuu_nac_s0 : T_XTYPE_mpy64_acc < "mpyu", "-", 0b011, 0b000, 0, 0, 0>;
-
-// Complex multiply real or imaginary
-// Rdd=cmpy[ir](Rs,Rt)
-def M2_cmpyi_s0 : T_XTYPE_mpy64 < "cmpyi", 0b000, 0b001, 0, 0, 0>;
-def M2_cmpyr_s0 : T_XTYPE_mpy64 < "cmpyr", 0b000, 0b010, 0, 0, 0>;
-
-// Rxx+=cmpy[ir](Rs,Rt)
-def M2_cmaci_s0 : T_XTYPE_mpy64_acc < "cmpyi", "+", 0b000, 0b001, 0, 0, 0>;
-def M2_cmacr_s0 : T_XTYPE_mpy64_acc < "cmpyr", "+", 0b000, 0b010, 0, 0, 0>;
-
-// Complex multiply
-// Rdd=cmpy(Rs,Rt)[:<<1]:sat
-def M2_cmpys_s0 : T_XTYPE_mpy64 < "cmpy", 0b000, 0b110, 1, 0, 0>;
-def M2_cmpys_s1 : T_XTYPE_mpy64 < "cmpy", 0b100, 0b110, 1, 1, 0>;
-
-// Rdd=cmpy(Rs,Rt*)[:<<]:sat
-def M2_cmpysc_s0 : T_XTYPE_mpy64 < "cmpy", 0b010, 0b110, 1, 0, 1>;
-def M2_cmpysc_s1 : T_XTYPE_mpy64 < "cmpy", 0b110, 0b110, 1, 1, 1>;
-
-// Rxx[-+]=cmpy(Rs,Rt)[:<<1]:sat
-def M2_cmacs_s0 : T_XTYPE_mpy64_acc < "cmpy", "+", 0b000, 0b110, 1, 0, 0>;
-def M2_cnacs_s0 : T_XTYPE_mpy64_acc < "cmpy", "-", 0b000, 0b111, 1, 0, 0>;
-def M2_cmacs_s1 : T_XTYPE_mpy64_acc < "cmpy", "+", 0b100, 0b110, 1, 1, 0>;
-def M2_cnacs_s1 : T_XTYPE_mpy64_acc < "cmpy", "-", 0b100, 0b111, 1, 1, 0>;
-
-// Rxx[-+]=cmpy(Rs,Rt*)[:<<1]:sat
-def M2_cmacsc_s0 : T_XTYPE_mpy64_acc < "cmpy", "+", 0b010, 0b110, 1, 0, 1>;
-def M2_cnacsc_s0 : T_XTYPE_mpy64_acc < "cmpy", "-", 0b010, 0b111, 1, 0, 1>;
-def M2_cmacsc_s1 : T_XTYPE_mpy64_acc < "cmpy", "+", 0b110, 0b110, 1, 1, 1>;
-def M2_cnacsc_s1 : T_XTYPE_mpy64_acc < "cmpy", "-", 0b110, 0b111, 1, 1, 1>;
-
-// Vector multiply halfwords
-// Rdd=vmpyh(Rs,Rt)[:<<1]:sat
-//let Defs = [USR_OVF] in {
- def M2_vmpy2s_s1 : T_XTYPE_mpy64 < "vmpyh", 0b100, 0b101, 1, 1, 0>;
- def M2_vmpy2s_s0 : T_XTYPE_mpy64 < "vmpyh", 0b000, 0b101, 1, 0, 0>;
-//}
-
-// Rxx+=vmpyh(Rs,Rt)[:<<1][:sat]
-def M2_vmac2 : T_XTYPE_mpy64_acc < "vmpyh", "+", 0b001, 0b001, 0, 0, 0>;
-def M2_vmac2s_s1 : T_XTYPE_mpy64_acc < "vmpyh", "+", 0b100, 0b101, 1, 1, 0>;
-def M2_vmac2s_s0 : T_XTYPE_mpy64_acc < "vmpyh", "+", 0b000, 0b101, 1, 0, 0>;
-
-//===----------------------------------------------------------------------===//
-// MTYPE/MPYH -
-//===----------------------------------------------------------------------===//
-
-//===----------------------------------------------------------------------===//
-// MTYPE/MPYS +
-//===----------------------------------------------------------------------===//
-//===----------------------------------------------------------------------===//
-// MTYPE/MPYS -
-//===----------------------------------------------------------------------===//
-
-//===----------------------------------------------------------------------===//
-// MTYPE/VB +
-//===----------------------------------------------------------------------===//
-//===----------------------------------------------------------------------===//
-// MTYPE/VB -
-//===----------------------------------------------------------------------===//
-
-//===----------------------------------------------------------------------===//
-// MTYPE/VH +
-//===----------------------------------------------------------------------===//
-//===----------------------------------------------------------------------===//
-// MTYPE/VH -
-//===----------------------------------------------------------------------===//
-
-//===----------------------------------------------------------------------===//
-// ST +
-//===----------------------------------------------------------------------===//
-//===----------------------------------------------------------------------===//
-// Template class for non-predicated post-increment stores with immediate offset
-//===----------------------------------------------------------------------===//
-let isPredicable = 1, hasSideEffects = 0, addrMode = PostInc in
-class T_store_pi <string mnemonic, RegisterClass RC, Operand ImmOp,
- bits<4> MajOp, bit isHalf >
- : STInst <(outs IntRegs:$_dst_),
- (ins IntRegs:$src1, ImmOp:$offset, RC:$src2),
- mnemonic#"($src1++#$offset) = $src2"#!if(isHalf, ".h", ""),
- [], "$src1 = $_dst_" >,
- AddrModeRel {
- bits<5> src1;
- bits<5> src2;
- bits<7> offset;
- bits<4> offsetBits;
-
- string ImmOpStr = !cast<string>(ImmOp);
- let offsetBits = !if (!eq(ImmOpStr, "s4_3Imm"), offset{6-3},
- !if (!eq(ImmOpStr, "s4_2Imm"), offset{5-2},
- !if (!eq(ImmOpStr, "s4_1Imm"), offset{4-1},
- /* s4_0Imm */ offset{3-0})));
- // Store upper-half and store doubleword cannot be NV.
- let isNVStorable = !if (!eq(ImmOpStr, "s4_3Imm"), 0, !if(isHalf,0,1));
-
- let IClass = 0b1010;
-
- let Inst{27-25} = 0b101;
- let Inst{24-21} = MajOp;
- let Inst{20-16} = src1;
- let Inst{13} = 0b0;
- let Inst{12-8} = src2;
- let Inst{7} = 0b0;
- let Inst{6-3} = offsetBits;
- let Inst{1} = 0b0;
- }
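-
-// Worked example of the offsetBits scaling (register/offset values are
-// illustrative): for a halfword store the s4_1Imm offset must be even and
-// only offset{4-1} is encoded, so "memh(r0++#6) = r1" encodes
-// offsetBits = 6 >> 1 = 0b0011.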
-
-//===----------------------------------------------------------------------===//
-// Template class for predicated post-increment stores with immediate offset
-//===----------------------------------------------------------------------===//
-let isPredicated = 1, hasSideEffects = 0, addrMode = PostInc in
-class T_pstore_pi <string mnemonic, RegisterClass RC, Operand ImmOp,
- bits<4> MajOp, bit isHalf, bit isPredNot, bit isPredNew>
- : STInst <(outs IntRegs:$_dst_),
- (ins PredRegs:$src1, IntRegs:$src2, ImmOp:$offset, RC:$src3),
- !if(isPredNot, "if (!$src1", "if ($src1")#!if(isPredNew, ".new) ",
- ") ")#mnemonic#"($src2++#$offset) = $src3"#!if(isHalf, ".h", ""),
- [], "$src2 = $_dst_" >,
- AddrModeRel {
- bits<2> src1;
- bits<5> src2;
- bits<7> offset;
- bits<5> src3;
- bits<4> offsetBits;
-
- string ImmOpStr = !cast<string>(ImmOp);
- let offsetBits = !if (!eq(ImmOpStr, "s4_3Imm"), offset{6-3},
- !if (!eq(ImmOpStr, "s4_2Imm"), offset{5-2},
- !if (!eq(ImmOpStr, "s4_1Imm"), offset{4-1},
- /* s4_0Imm */ offset{3-0})));
-
- // Store upper-half and store doubleword cannot be NV.
- let isNVStorable = !if (!eq(ImmOpStr, "s4_3Imm"), 0, !if(isHalf,0,1));
- let isPredicatedNew = isPredNew;
- let isPredicatedFalse = isPredNot;
-
- let IClass = 0b1010;
-
- let Inst{27-25} = 0b101;
- let Inst{24-21} = MajOp;
- let Inst{20-16} = src2;
- let Inst{13} = 0b1;
- let Inst{12-8} = src3;
- let Inst{7} = isPredNew;
- let Inst{6-3} = offsetBits;
- let Inst{2} = isPredNot;
- let Inst{1-0} = src1;
- }
-
-multiclass ST_PostInc<string mnemonic, string BaseOp, RegisterClass RC,
- Operand ImmOp, bits<4> MajOp, bit isHalf = 0 > {
-
- let BaseOpcode = "POST_"#BaseOp in {
- def S2_#NAME#_pi : T_store_pi <mnemonic, RC, ImmOp, MajOp, isHalf>;
-
- // Predicated
- def S2_p#NAME#t_pi : T_pstore_pi <mnemonic, RC, ImmOp, MajOp, isHalf, 0, 0>;
- def S2_p#NAME#f_pi : T_pstore_pi <mnemonic, RC, ImmOp, MajOp, isHalf, 1, 0>;
-
- // Predicated new
- def S2_p#NAME#tnew_pi : T_pstore_pi <mnemonic, RC, ImmOp, MajOp,
- isHalf, 0, 1>;
- def S2_p#NAME#fnew_pi : T_pstore_pi <mnemonic, RC, ImmOp, MajOp,
- isHalf, 1, 1>;
- }
-}
-
-let accessSize = ByteAccess in
-defm storerb: ST_PostInc <"memb", "STrib", IntRegs, s4_0Imm, 0b1000>;
-
-let accessSize = HalfWordAccess in
-defm storerh: ST_PostInc <"memh", "STrih", IntRegs, s4_1Imm, 0b1010>;
-
-let accessSize = WordAccess in
-defm storeri: ST_PostInc <"memw", "STriw", IntRegs, s4_2Imm, 0b1100>;
-
-let accessSize = DoubleWordAccess in
-defm storerd: ST_PostInc <"memd", "STrid", DoubleRegs, s4_3Imm, 0b1110>;
-
-let accessSize = HalfWordAccess, isNVStorable = 0 in
-defm storerf: ST_PostInc <"memh", "STrih_H", IntRegs, s4_1Imm, 0b1011, 1>;
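-
-// For reference, each defm above expands to five defs via the #NAME# pastes
-// in ST_PostInc; e.g. "storerb" yields S2_storerb_pi plus the predicated
-// forms S2_pstorerbt_pi, S2_pstorerbf_pi, S2_pstorerbtnew_pi and
-// S2_pstorerbfnew_pi.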
-
-//===----------------------------------------------------------------------===//
-// Template class for post-increment stores with register offset.
-//===----------------------------------------------------------------------===//
-class T_store_pr <string mnemonic, RegisterClass RC, bits<3> MajOp,
- MemAccessSize AccessSz, bit isHalf = 0>
- : STInst <(outs IntRegs:$_dst_),
- (ins IntRegs:$src1, ModRegs:$src2, RC:$src3),
- mnemonic#"($src1++$src2) = $src3"#!if(isHalf, ".h", ""),
- [], "$src1 = $_dst_" > {
- bits<5> src1;
- bits<1> src2;
- bits<5> src3;
- let accessSize = AccessSz;
-
- // Store upper-half and store doubleword cannot be NV.
- let isNVStorable = !if(!eq(mnemonic,"memd"), 0, !if(isHalf,0,1));
-
- let IClass = 0b1010;
-
- let Inst{27-24} = 0b1101;
- let Inst{23-21} = MajOp;
- let Inst{20-16} = src1;
- let Inst{13} = src2;
- let Inst{12-8} = src3;
- let Inst{7} = 0b0;
- }
-
-def S2_storerb_pr : T_store_pr<"memb", IntRegs, 0b000, ByteAccess>;
-def S2_storerh_pr : T_store_pr<"memh", IntRegs, 0b010, HalfWordAccess>;
-def S2_storeri_pr : T_store_pr<"memw", IntRegs, 0b100, WordAccess>;
-def S2_storerd_pr : T_store_pr<"memd", DoubleRegs, 0b110, DoubleWordAccess>;
-def S2_storerf_pr : T_store_pr<"memh", IntRegs, 0b011, HalfWordAccess, 1>;
-
-let opExtendable = 1, isExtentSigned = 1, isPredicable = 1 in
-class T_store_io <string mnemonic, RegisterClass RC, Operand ImmOp,
- bits<3> MajOp, bit isH = 0>
- : STInst <(outs),
- (ins IntRegs:$src1, ImmOp:$src2, RC:$src3),
- mnemonic#"($src1+#$src2) = $src3"#!if(isH,".h","")>,
- AddrModeRel, ImmRegRel {
- bits<5> src1;
- bits<14> src2; // Actual address offset
- bits<5> src3;
- bits<11> offsetBits; // Represents offset encoding
-
- string ImmOpStr = !cast<string>(ImmOp);
-
- let opExtentBits = !if (!eq(ImmOpStr, "s11_3Ext"), 14,
- !if (!eq(ImmOpStr, "s11_2Ext"), 13,
- !if (!eq(ImmOpStr, "s11_1Ext"), 12,
- /* s11_0Ext */ 11)));
- let offsetBits = !if (!eq(ImmOpStr, "s11_3Ext"), src2{13-3},
- !if (!eq(ImmOpStr, "s11_2Ext"), src2{12-2},
- !if (!eq(ImmOpStr, "s11_1Ext"), src2{11-1},
- /* s11_0Ext */ src2{10-0})));
- // Store upper-half and store doubleword cannot be NV.
- let isNVStorable = !if (!eq(mnemonic, "memd"), 0, !if(isH,0,1));
- let IClass = 0b1010;
-
- let Inst{27} = 0b0;
- let Inst{26-25} = offsetBits{10-9};
- let Inst{24} = 0b1;
- let Inst{23-21} = MajOp;
- let Inst{20-16} = src1;
- let Inst{13} = offsetBits{8};
- let Inst{12-8} = src3;
- let Inst{7-0} = offsetBits{7-0};
- }
-
-let opExtendable = 2, isPredicated = 1 in
-class T_pstore_io <string mnemonic, RegisterClass RC, Operand ImmOp,
- bits<3>MajOp, bit PredNot, bit isPredNew, bit isH = 0>
- : STInst <(outs),
- (ins PredRegs:$src1, IntRegs:$src2, ImmOp:$src3, RC:$src4),
- !if(PredNot, "if (!$src1", "if ($src1")#!if(isPredNew, ".new) ",
- ") ")#mnemonic#"($src2+#$src3) = $src4"#!if(isH,".h",""),
- [],"",V2LDST_tc_st_SLOT01 >,
- AddrModeRel, ImmRegRel {
- bits<2> src1;
- bits<5> src2;
- bits<9> src3; // Actual address offset
- bits<5> src4;
- bits<6> offsetBits; // Represents offset encoding
-
- let isPredicatedNew = isPredNew;
- let isPredicatedFalse = PredNot;
-
- string ImmOpStr = !cast<string>(ImmOp);
- let opExtentBits = !if (!eq(ImmOpStr, "u6_3Ext"), 9,
- !if (!eq(ImmOpStr, "u6_2Ext"), 8,
- !if (!eq(ImmOpStr, "u6_1Ext"), 7,
- /* u6_0Ext */ 6)));
- let offsetBits = !if (!eq(ImmOpStr, "u6_3Ext"), src3{8-3},
- !if (!eq(ImmOpStr, "u6_2Ext"), src3{7-2},
- !if (!eq(ImmOpStr, "u6_1Ext"), src3{6-1},
- /* u6_0Ext */ src3{5-0})));
- // Store upper-half and store doubleword cannot be NV.
- let isNVStorable = !if (!eq(mnemonic, "memd"), 0, !if(isH,0,1));
-
- let IClass = 0b0100;
-
- let Inst{27} = 0b0;
- let Inst{26} = PredNot;
- let Inst{25} = isPredNew;
- let Inst{24} = 0b0;
- let Inst{23-21} = MajOp;
- let Inst{20-16} = src2;
- let Inst{13} = offsetBits{5};
- let Inst{12-8} = src4;
- let Inst{7-3} = offsetBits{4-0};
- let Inst{1-0} = src1;
- }
-
-let isExtendable = 1, hasSideEffects = 0 in
-multiclass ST_Idxd<string mnemonic, string CextOp, RegisterClass RC,
- Operand ImmOp, Operand predImmOp, bits<3> MajOp, bit isH = 0> {
- let CextOpcode = CextOp, BaseOpcode = CextOp#_indexed in {
- def S2_#NAME#_io : T_store_io <mnemonic, RC, ImmOp, MajOp, isH>;
-
- // Predicated
- def S2_p#NAME#t_io : T_pstore_io<mnemonic, RC, predImmOp, MajOp, 0, 0, isH>;
- def S2_p#NAME#f_io : T_pstore_io<mnemonic, RC, predImmOp, MajOp, 1, 0, isH>;
-
- // Predicated new
- def S4_p#NAME#tnew_io : T_pstore_io <mnemonic, RC, predImmOp,
- MajOp, 0, 1, isH>;
- def S4_p#NAME#fnew_io : T_pstore_io <mnemonic, RC, predImmOp,
- MajOp, 1, 1, isH>;
- }
-}
-
-let addrMode = BaseImmOffset, InputType = "imm" in {
- let accessSize = ByteAccess in
- defm storerb: ST_Idxd < "memb", "STrib", IntRegs, s11_0Ext, u6_0Ext, 0b000>;
-
- let accessSize = HalfWordAccess, opExtentAlign = 1 in
- defm storerh: ST_Idxd < "memh", "STrih", IntRegs, s11_1Ext, u6_1Ext, 0b010>;
-
- let accessSize = WordAccess, opExtentAlign = 2 in
- defm storeri: ST_Idxd < "memw", "STriw", IntRegs, s11_2Ext, u6_2Ext, 0b100>;
-
- let accessSize = DoubleWordAccess, isNVStorable = 0, opExtentAlign = 3 in
- defm storerd: ST_Idxd < "memd", "STrid", DoubleRegs, s11_3Ext,
- u6_3Ext, 0b110>;
-
- let accessSize = HalfWordAccess, opExtentAlign = 1 in
- defm storerf: ST_Idxd < "memh", "STrif", IntRegs, s11_1Ext,
- u6_1Ext, 0b011, 1>;
-}
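-
-// Resulting syntax, as a sketch (register numbers illustrative):
-// S2_storerb_io prints "memb(r1+#4) = r2", and the predicated-new form
-// S4_pstorerbtnew_io prints "if (p0.new) memb(r1+#4) = r2".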
-
-// Store predicate.
-let isExtendable = 1, opExtendable = 1, isExtentSigned = 1, opExtentBits = 13,
- isCodeGenOnly = 1, isPseudo = 1, hasSideEffects = 0 in
-def STriw_pred : STInst<(outs),
- (ins IntRegs:$addr, s11_2Ext:$off, PredRegs:$src1),
- ".error \"should not emit\"", []>;
-// Store modifier.
-let isExtendable = 1, opExtendable = 1, isExtentSigned = 1, opExtentBits = 13,
- isCodeGenOnly = 1, isPseudo = 1, hasSideEffects = 0 in
-def STriw_mod : STInst<(outs),
- (ins IntRegs:$addr, s11_2Ext:$off, ModRegs:$src1),
- ".error \"should not emit\"", []>;
-
-// S2_allocframe: Allocate stack frame.
-let Defs = [R29, R30], Uses = [R29, R31, R30],
- hasSideEffects = 0, accessSize = DoubleWordAccess in
-def S2_allocframe: ST0Inst <
- (outs), (ins u11_3Imm:$u11_3),
- "allocframe(#$u11_3)" > {
- bits<14> u11_3;
-
- let IClass = 0b1010;
- let Inst{27-16} = 0b000010011101;
- let Inst{13-11} = 0b000;
- let Inst{10-0} = u11_3{13-3};
- }
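-
-// Worked encoding example (frame size illustrative): the size is a multiple
-// of 8 and only u11_3{13-3} is encoded, so "allocframe(#24)" stores
-// 24 >> 3 = 3 in Inst{10-0}.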
-
-//===----------------------------------------------------------------------===//
-// Circular stores with auto-increment immediate:
-// S2_storer[bhwdf]_pci, plus the .new forms S2_storer[bhi]new_pci below.
-//===----------------------------------------------------------------------===//
-let Uses = [CS], addrMode = PostInc in
-class T_store_pci <string mnemonic, RegisterClass RC,
- Operand Imm, bits<4>MajOp,
- MemAccessSize AlignSize, string RegSrc = "Rt">
- : STInst <(outs IntRegs:$_dst_),
- (ins IntRegs:$Rz, Imm:$offset, ModRegs:$Mu, RC:$Rt),
- #mnemonic#"($Rz ++ #$offset:circ($Mu)) = $"#RegSrc#"",
- [],
- "$Rz = $_dst_" > {
- bits<5> Rz;
- bits<7> offset;
- bits<1> Mu;
- bits<5> Rt;
- let accessSize = AlignSize;
- let isNVStorable = !if(!eq(mnemonic,"memd"), 0,
- !if(!eq(RegSrc,"Rt.h"), 0, 1));
-
- let IClass = 0b1010;
- let Inst{27-25} = 0b100;
- let Inst{24-21} = MajOp;
- let Inst{20-16} = Rz;
- let Inst{13} = Mu;
- let Inst{12-8} = Rt;
- let Inst{7} = 0b0;
- let Inst{6-3} =
- !if (!eq(!cast<string>(AlignSize), "DoubleWordAccess"), offset{6-3},
- !if (!eq(!cast<string>(AlignSize), "WordAccess"), offset{5-2},
- !if (!eq(!cast<string>(AlignSize), "HalfWordAccess"), offset{4-1},
- /* ByteAccess */ offset{3-0})));
- let Inst{1} = 0b0;
- }
-
-def S2_storerb_pci : T_store_pci<"memb", IntRegs, s4_0Imm, 0b1000,
- ByteAccess>;
-def S2_storerh_pci : T_store_pci<"memh", IntRegs, s4_1Imm, 0b1010,
- HalfWordAccess>;
-def S2_storerf_pci : T_store_pci<"memh", IntRegs, s4_1Imm, 0b1011,
- HalfWordAccess, "Rt.h">;
-def S2_storeri_pci : T_store_pci<"memw", IntRegs, s4_2Imm, 0b1100,
- WordAccess>;
-def S2_storerd_pci : T_store_pci<"memd", DoubleRegs, s4_3Imm, 0b1110,
- DoubleWordAccess>;
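-
-// Resulting syntax, for illustration (registers illustrative):
-// S2_storeri_pci prints "memw(r0 ++ #8:circ(m0)) = r1"; the offset scaling
-// in Inst{6-3} mirrors the post-increment stores above.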
-
-let Uses = [CS], isNewValue = 1, mayStore = 1, isNVStore = 1, opNewValue = 4,
- addrMode = PostInc in
-class T_storenew_pci <string mnemonic, Operand Imm,
- bits<2>MajOp, MemAccessSize AlignSize>
- : NVInst < (outs IntRegs:$_dst_),
- (ins IntRegs:$Rz, Imm:$offset, ModRegs:$Mu, IntRegs:$Nt),
- #mnemonic#"($Rz ++ #$offset:circ($Mu)) = $Nt.new",
- [],
- "$Rz = $_dst_"> {
- bits<5> Rz;
- bits<6> offset;
- bits<1> Mu;
- bits<3> Nt;
-
- let accessSize = AlignSize;
-
- let IClass = 0b1010;
- let Inst{27-21} = 0b1001101;
- let Inst{20-16} = Rz;
- let Inst{13} = Mu;
- let Inst{12-11} = MajOp;
- let Inst{10-8} = Nt;
- let Inst{7} = 0b0;
- let Inst{6-3} =
- !if (!eq(!cast<string>(AlignSize), "WordAccess"), offset{5-2},
- !if (!eq(!cast<string>(AlignSize), "HalfWordAccess"), offset{4-1},
- /* ByteAccess */ offset{3-0}));
- let Inst{1} = 0b0;
- }
-
-def S2_storerbnew_pci : T_storenew_pci <"memb", s4_0Imm, 0b00, ByteAccess>;
-def S2_storerhnew_pci : T_storenew_pci <"memh", s4_1Imm, 0b01, HalfWordAccess>;
-def S2_storerinew_pci : T_storenew_pci <"memw", s4_2Imm, 0b10, WordAccess>;
-
-//===----------------------------------------------------------------------===//
-// Circular stores with auto-increment register
-//===----------------------------------------------------------------------===//
-let Uses = [CS], addrMode = PostInc in
-class T_store_pcr <string mnemonic, RegisterClass RC, bits<4>MajOp,
- MemAccessSize AlignSize, string RegSrc = "Rt">
- : STInst <(outs IntRegs:$_dst_),
- (ins IntRegs:$Rz, ModRegs:$Mu, RC:$Rt),
- #mnemonic#"($Rz ++ I:circ($Mu)) = $"#RegSrc#"",
- [],
- "$Rz = $_dst_" > {
- bits<5> Rz;
- bits<1> Mu;
- bits<5> Rt;
-
- let accessSize = AlignSize;
- let isNVStorable = !if(!eq(mnemonic,"memd"), 0,
- !if(!eq(RegSrc,"Rt.h"), 0, 1));
-
- let IClass = 0b1010;
- let Inst{27-25} = 0b100;
- let Inst{24-21} = MajOp;
- let Inst{20-16} = Rz;
- let Inst{13} = Mu;
- let Inst{12-8} = Rt;
- let Inst{7} = 0b0;
- let Inst{1} = 0b1;
- }
-
-def S2_storerb_pcr : T_store_pcr<"memb", IntRegs, 0b1000, ByteAccess>;
-def S2_storerh_pcr : T_store_pcr<"memh", IntRegs, 0b1010, HalfWordAccess>;
-def S2_storeri_pcr : T_store_pcr<"memw", IntRegs, 0b1100, WordAccess>;
-def S2_storerd_pcr : T_store_pcr<"memd", DoubleRegs, 0b1110, DoubleWordAccess>;
-def S2_storerf_pcr : T_store_pcr<"memh", IntRegs, 0b1011,
- HalfWordAccess, "Rt.h">;
-
-//===----------------------------------------------------------------------===//
-// Circular .new stores with auto-increment register
-//===----------------------------------------------------------------------===//
-let Uses = [CS], isNewValue = 1, mayStore = 1, isNVStore = 1, opNewValue = 3,
- addrMode = PostInc in
-class T_storenew_pcr <string mnemonic, bits<2>MajOp,
- MemAccessSize AlignSize>
- : NVInst <(outs IntRegs:$_dst_),
- (ins IntRegs:$Rz, ModRegs:$Mu, IntRegs:$Nt),
- #mnemonic#"($Rz ++ I:circ($Mu)) = $Nt.new",
- [],
- "$Rz = $_dst_"> {
- bits<5> Rz;
- bits<1> Mu;
- bits<3> Nt;
-
- let accessSize = AlignSize;
-
- let IClass = 0b1010;
- let Inst{27-21} = 0b1001101;
- let Inst{20-16} = Rz;
- let Inst{13} = Mu;
- let Inst{12-11} = MajOp;
- let Inst{10-8} = Nt;
- let Inst{7} = 0b0;
- let Inst{1} = 0b1;
- }
-
-def S2_storerbnew_pcr : T_storenew_pcr <"memb", 0b00, ByteAccess>;
-def S2_storerhnew_pcr : T_storenew_pcr <"memh", 0b01, HalfWordAccess>;
-def S2_storerinew_pcr : T_storenew_pcr <"memw", 0b10, WordAccess>;
-
-//===----------------------------------------------------------------------===//
-// Bit-reversed stores with auto-increment register
-//===----------------------------------------------------------------------===//
-let hasSideEffects = 0, addrMode = PostInc in
-class T_store_pbr<string mnemonic, RegisterClass RC,
- MemAccessSize addrSize, bits<3> majOp,
- bit isHalf = 0>
- : STInst
- <(outs IntRegs:$_dst_),
- (ins IntRegs:$Rz, ModRegs:$Mu, RC:$src),
- #mnemonic#"($Rz ++ $Mu:brev) = $src"#!if (!eq(isHalf, 1), ".h", ""),
- [], "$Rz = $_dst_" > {
-
- let accessSize = addrSize;
-
- bits<5> Rz;
- bits<1> Mu;
- bits<5> src;
-
- let IClass = 0b1010;
-
- let Inst{27-24} = 0b1111;
- let Inst{23-21} = majOp;
- let Inst{7} = 0b0;
- let Inst{20-16} = Rz;
- let Inst{13} = Mu;
- let Inst{12-8} = src;
- }
-
-let isNVStorable = 1 in {
- let BaseOpcode = "S2_storerb_pbr" in
- def S2_storerb_pbr : T_store_pbr<"memb", IntRegs, ByteAccess,
- 0b000>, NewValueRel;
- let BaseOpcode = "S2_storerh_pbr" in
- def S2_storerh_pbr : T_store_pbr<"memh", IntRegs, HalfWordAccess,
- 0b010>, NewValueRel;
- let BaseOpcode = "S2_storeri_pbr" in
- def S2_storeri_pbr : T_store_pbr<"memw", IntRegs, WordAccess,
- 0b100>, NewValueRel;
-}
-
-def S2_storerf_pbr : T_store_pbr<"memh", IntRegs, HalfWordAccess, 0b011, 1>;
-def S2_storerd_pbr : T_store_pbr<"memd", DoubleRegs, DoubleWordAccess, 0b110>;
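-
-// For illustration (registers illustrative): S2_storeri_pbr prints
-// "memw(r0 ++ m0:brev) = r1". Bit-reversed addressing is typically used for
-// FFT-style buffer indexing.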
-
-//===----------------------------------------------------------------------===//
-// Bit-reversed .new stores with auto-increment register
-//===----------------------------------------------------------------------===//
-let isNewValue = 1, mayStore = 1, isNVStore = 1, opNewValue = 3,
- hasSideEffects = 0, addrMode = PostInc in
-class T_storenew_pbr<string mnemonic, MemAccessSize addrSize, bits<2> majOp>
- : NVInst <(outs IntRegs:$_dst_),
- (ins IntRegs:$Rz, ModRegs:$Mu, IntRegs:$Nt),
- #mnemonic#"($Rz ++ $Mu:brev) = $Nt.new", [],
- "$Rz = $_dst_">, NewValueRel {
- let accessSize = addrSize;
- bits<5> Rz;
- bits<1> Mu;
- bits<3> Nt;
-
- let IClass = 0b1010;
-
- let Inst{27-21} = 0b1111101;
- let Inst{12-11} = majOp;
- let Inst{7} = 0b0;
- let Inst{20-16} = Rz;
- let Inst{13} = Mu;
- let Inst{10-8} = Nt;
- }
-
-let BaseOpcode = "S2_storerb_pbr" in
-def S2_storerbnew_pbr : T_storenew_pbr<"memb", ByteAccess, 0b00>;
-
-let BaseOpcode = "S2_storerh_pbr" in
-def S2_storerhnew_pbr : T_storenew_pbr<"memh", HalfWordAccess, 0b01>;
-
-let BaseOpcode = "S2_storeri_pbr" in
-def S2_storerinew_pbr : T_storenew_pbr<"memw", WordAccess, 0b10>;
-
-//===----------------------------------------------------------------------===//
-// ST -
-//===----------------------------------------------------------------------===//
-
-//===----------------------------------------------------------------------===//
-// Template class for S_2op instructions.
-//===----------------------------------------------------------------------===//
-let hasSideEffects = 0 in
-class T_S2op_1 <string mnemonic, bits<4> RegTyBits, RegisterClass RCOut,
- RegisterClass RCIn, bits<2> MajOp, bits<3> MinOp, bit isSat>
- : SInst <(outs RCOut:$dst), (ins RCIn:$src),
- "$dst = "#mnemonic#"($src)"#!if(isSat, ":sat", ""),
- [], "", S_2op_tc_1_SLOT23 > {
- bits<5> dst;
- bits<5> src;
-
- let IClass = 0b1000;
-
- let Inst{27-24} = RegTyBits;
- let Inst{23-22} = MajOp;
- let Inst{21} = 0b0;
- let Inst{20-16} = src;
- let Inst{7-5} = MinOp;
- let Inst{4-0} = dst;
- }
-
-class T_S2op_1_di <string mnemonic, bits<2> MajOp, bits<3> MinOp>
- : T_S2op_1 <mnemonic, 0b0100, DoubleRegs, IntRegs, MajOp, MinOp, 0>;
-
-let hasNewValue = 1 in
-class T_S2op_1_id <string mnemonic, bits<2> MajOp, bits<3> MinOp, bit isSat = 0>
- : T_S2op_1 <mnemonic, 0b1000, IntRegs, DoubleRegs, MajOp, MinOp, isSat>;
-
-let hasNewValue = 1 in
-class T_S2op_1_ii <string mnemonic, bits<2> MajOp, bits<3> MinOp, bit isSat = 0>
- : T_S2op_1 <mnemonic, 0b1100, IntRegs, IntRegs, MajOp, MinOp, isSat>;
-
-// Vector sign/zero extend
-let isReMaterializable = 1, isAsCheapAsAMove = 1 in {
- def S2_vsxtbh : T_S2op_1_di <"vsxtbh", 0b00, 0b000>;
- def S2_vsxthw : T_S2op_1_di <"vsxthw", 0b00, 0b100>;
- def S2_vzxtbh : T_S2op_1_di <"vzxtbh", 0b00, 0b010>;
- def S2_vzxthw : T_S2op_1_di <"vzxthw", 0b00, 0b110>;
-}
-
-// Vector splat bytes/halfwords
-let isReMaterializable = 1, isAsCheapAsAMove = 1 in {
- def S2_vsplatrb : T_S2op_1_ii <"vsplatb", 0b01, 0b111>;
- def S2_vsplatrh : T_S2op_1_di <"vsplath", 0b01, 0b010>;
-}
-
-// Sign extend word to doubleword
-def A2_sxtw : T_S2op_1_di <"sxtw", 0b01, 0b000>;
-
-// Vector saturate and pack
-let Defs = [USR_OVF] in {
- def S2_svsathb : T_S2op_1_ii <"vsathb", 0b10, 0b000>;
- def S2_svsathub : T_S2op_1_ii <"vsathub", 0b10, 0b010>;
- def S2_vsathb : T_S2op_1_id <"vsathb", 0b00, 0b110>;
- def S2_vsathub : T_S2op_1_id <"vsathub", 0b00, 0b000>;
- def S2_vsatwh : T_S2op_1_id <"vsatwh", 0b00, 0b010>;
- def S2_vsatwuh : T_S2op_1_id <"vsatwuh", 0b00, 0b100>;
-}
-
-// Vector truncate
-def S2_vtrunohb : T_S2op_1_id <"vtrunohb", 0b10, 0b000>;
-def S2_vtrunehb : T_S2op_1_id <"vtrunehb", 0b10, 0b010>;
-
-// Swizzle the bytes of a word
-def A2_swiz : T_S2op_1_ii <"swiz", 0b10, 0b111>;
-
-// Saturate
-let Defs = [USR_OVF] in {
- def A2_sat : T_S2op_1_id <"sat", 0b11, 0b000>;
- def A2_satb : T_S2op_1_ii <"satb", 0b11, 0b111>;
- def A2_satub : T_S2op_1_ii <"satub", 0b11, 0b110>;
- def A2_sath : T_S2op_1_ii <"sath", 0b11, 0b100>;
- def A2_satuh : T_S2op_1_ii <"satuh", 0b11, 0b101>;
- def A2_roundsat : T_S2op_1_id <"round", 0b11, 0b001, 0b1>;
-}
-
-let Itinerary = S_2op_tc_2_SLOT23 in {
- // Vector round and pack
- def S2_vrndpackwh : T_S2op_1_id <"vrndwh", 0b10, 0b100>;
-
- let Defs = [USR_OVF] in
- def S2_vrndpackwhs : T_S2op_1_id <"vrndwh", 0b10, 0b110, 1>;
-
- // Bit reverse
- def S2_brev : T_S2op_1_ii <"brev", 0b01, 0b110>;
-
- // Absolute value word
- def A2_abs : T_S2op_1_ii <"abs", 0b10, 0b100>;
-
- let Defs = [USR_OVF] in
- def A2_abssat : T_S2op_1_ii <"abs", 0b10, 0b101, 1>;
-
- // Negate with saturation
- let Defs = [USR_OVF] in
- def A2_negsat : T_S2op_1_ii <"neg", 0b10, 0b110, 1>;
-}
-
-class T_S2op_2 <string mnemonic, bits<4> RegTyBits, RegisterClass RCOut,
- RegisterClass RCIn, bits<3> MajOp, bits<3> MinOp,
- bit isSat, bit isRnd, list<dag> pattern = []>
- : SInst <(outs RCOut:$dst),
- (ins RCIn:$src, u5_0Imm:$u5),
- "$dst = "#mnemonic#"($src, #$u5)"#!if(isSat, ":sat", "")
- #!if(isRnd, ":rnd", ""),
- pattern, "", S_2op_tc_2_SLOT23> {
- bits<5> dst;
- bits<5> src;
- bits<5> u5;
-
- let IClass = 0b1000;
-
- let Inst{27-24} = RegTyBits;
- let Inst{23-21} = MajOp;
- let Inst{20-16} = src;
- let Inst{13} = 0b0;
- let Inst{12-8} = u5;
- let Inst{7-5} = MinOp;
- let Inst{4-0} = dst;
- }
-
-class T_S2op_2_di <string mnemonic, bits<3> MajOp, bits<3> MinOp>
- : T_S2op_2 <mnemonic, 0b1000, DoubleRegs, IntRegs, MajOp, MinOp, 0, 0>;
-
-let hasNewValue = 1 in
-class T_S2op_2_id <string mnemonic, bits<3> MajOp, bits<3> MinOp>
- : T_S2op_2 <mnemonic, 0b1000, IntRegs, DoubleRegs, MajOp, MinOp, 0, 0>;
-
-let hasNewValue = 1 in
-class T_S2op_2_ii <string mnemonic, bits<3> MajOp, bits<3> MinOp,
- bit isSat = 0, bit isRnd = 0, list<dag> pattern = []>
- : T_S2op_2 <mnemonic, 0b1100, IntRegs, IntRegs, MajOp, MinOp,
- isSat, isRnd, pattern>;
-
-class T_S2op_shift <string mnemonic, bits<3> MajOp, bits<3> MinOp, SDNode OpNd>
- : T_S2op_2_ii <mnemonic, MajOp, MinOp, 0, 0, []>;
-
-// Vector arithmetic shift right by immediate with truncate and pack
-def S2_asr_i_svw_trun : T_S2op_2_id <"vasrw", 0b110, 0b010>;
-
-// Arithmetic/logical shift right/left by immediate
-let Itinerary = S_2op_tc_1_SLOT23 in {
- def S2_asr_i_r : T_S2op_shift <"asr", 0b000, 0b000, sra>;
- def S2_lsr_i_r : T_S2op_shift <"lsr", 0b000, 0b001, srl>;
- def S2_asl_i_r : T_S2op_shift <"asl", 0b000, 0b010, shl>;
-}
-
-// Shift left by immediate with saturation
-let Defs = [USR_OVF] in
-def S2_asl_i_r_sat : T_S2op_2_ii <"asl", 0b010, 0b010, 1>;
-
-// Shift right with round
-def S2_asr_i_r_rnd : T_S2op_2_ii <"asr", 0b010, 0b000, 0, 1>;
-
-let isAsmParserOnly = 1 in
-def S2_asr_i_r_rnd_goodsyntax
- : SInst <(outs IntRegs:$dst), (ins IntRegs:$src, u5_0Imm:$u5),
- "$dst = asrrnd($src, #$u5)",
- [], "", S_2op_tc_1_SLOT23>;
-
-let isAsmParserOnly = 1 in
-def A2_not: ALU32_rr<(outs IntRegs:$dst),(ins IntRegs:$src),
- "$dst = not($src)">;
-
-class T_S2op_3<string opc, bits<2>MajOp, bits<3>minOp, bits<1> sat = 0>
- : SInst<(outs DoubleRegs:$Rdd), (ins DoubleRegs:$Rss),
- "$Rdd = "#opc#"($Rss)"#!if(!eq(sat, 1),":sat","")> {
- bits<5> Rss;
- bits<5> Rdd;
- let IClass = 0b1000;
- let Inst{27-24} = 0;
- let Inst{23-22} = MajOp;
- let Inst{20-16} = Rss;
- let Inst{7-5} = minOp;
- let Inst{4-0} = Rdd;
-}
-
-def A2_absp : T_S2op_3 <"abs", 0b10, 0b110>;
-def A2_negp : T_S2op_3 <"neg", 0b10, 0b101>;
-def A2_notp : T_S2op_3 <"not", 0b10, 0b100>;
-
-// Interleave/deinterleave
-def S2_interleave : T_S2op_3 <"interleave", 0b11, 0b101>;
-def S2_deinterleave : T_S2op_3 <"deinterleave", 0b11, 0b100>;
-
-// Vector complex conjugate
-def A2_vconj : T_S2op_3 <"vconj", 0b10, 0b111, 1>;
-
-// Vector saturate without pack
-def S2_vsathb_nopack : T_S2op_3 <"vsathb", 0b00, 0b111>;
-def S2_vsathub_nopack : T_S2op_3 <"vsathub", 0b00, 0b100>;
-def S2_vsatwh_nopack : T_S2op_3 <"vsatwh", 0b00, 0b110>;
-def S2_vsatwuh_nopack : T_S2op_3 <"vsatwuh", 0b00, 0b101>;
-
-// Vector absolute value halfwords with and without saturation
-// Rdd64=vabsh(Rss64)[:sat]
-def A2_vabsh : T_S2op_3 <"vabsh", 0b01, 0b100>;
-def A2_vabshsat : T_S2op_3 <"vabsh", 0b01, 0b101, 1>;
-
-// Vector absolute value words with and without saturation
-def A2_vabsw : T_S2op_3 <"vabsw", 0b01, 0b110>;
-def A2_vabswsat : T_S2op_3 <"vabsw", 0b01, 0b111, 1>;
-
-//===----------------------------------------------------------------------===//
-// STYPE/BIT +
-//===----------------------------------------------------------------------===//
-// Bit count
-
-let hasSideEffects = 0, hasNewValue = 1 in
-class T_COUNT_LEADING<string MnOp, bits<3> MajOp, bits<3> MinOp, bit Is32,
- dag Out, dag Inp>
- : SInst<Out, Inp, "$Rd = "#MnOp#"($Rs)", [], "", S_2op_tc_1_SLOT23> {
- bits<5> Rs;
- bits<5> Rd;
- let IClass = 0b1000;
- let Inst{27} = 0b1;
- let Inst{26} = Is32;
- let Inst{25-24} = 0b00;
- let Inst{23-21} = MajOp;
- let Inst{20-16} = Rs;
- let Inst{7-5} = MinOp;
- let Inst{4-0} = Rd;
-}
-
-class T_COUNT_LEADING_32<string MnOp, bits<3> MajOp, bits<3> MinOp>
- : T_COUNT_LEADING<MnOp, MajOp, MinOp, 0b1,
- (outs IntRegs:$Rd), (ins IntRegs:$Rs)>;
-
-class T_COUNT_LEADING_64<string MnOp, bits<3> MajOp, bits<3> MinOp>
- : T_COUNT_LEADING<MnOp, MajOp, MinOp, 0b0,
- (outs IntRegs:$Rd), (ins DoubleRegs:$Rs)>;
-
-def S2_cl0 : T_COUNT_LEADING_32<"cl0", 0b000, 0b101>;
-def S2_cl1 : T_COUNT_LEADING_32<"cl1", 0b000, 0b110>;
-def S2_ct0 : T_COUNT_LEADING_32<"ct0", 0b010, 0b100>;
-def S2_ct1 : T_COUNT_LEADING_32<"ct1", 0b010, 0b101>;
-def S2_cl0p : T_COUNT_LEADING_64<"cl0", 0b010, 0b010>;
-def S2_cl1p : T_COUNT_LEADING_64<"cl1", 0b010, 0b100>;
-def S2_clb : T_COUNT_LEADING_32<"clb", 0b000, 0b100>;
-def S2_clbp : T_COUNT_LEADING_64<"clb", 0b010, 0b000>;
-def S2_clbnorm : T_COUNT_LEADING_32<"normamt", 0b000, 0b111>;
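-
-// Semantics sketch (per the usual Hexagon definitions): cl0/cl1 count
-// leading zeros/ones, ct0/ct1 count trailing zeros/ones, clb counts leading
-// identical bits (max of cl0 and cl1), and normamt is roughly clb minus one
-// (the normalization shift amount), with 0 for a zero input.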
-
-// The 64-bit count-leading/count-trailing instructions are defined in
-// HexagonInstrInfoV4.td.
-
-// Bit set/clear/toggle
-
-let hasSideEffects = 0, hasNewValue = 1 in
-class T_SCT_BIT_IMM<string MnOp, bits<3> MinOp>
- : SInst<(outs IntRegs:$Rd), (ins IntRegs:$Rs, u5_0Imm:$u5),
- "$Rd = "#MnOp#"($Rs, #$u5)", [], "", S_2op_tc_1_SLOT23> {
- bits<5> Rd;
- bits<5> Rs;
- bits<5> u5;
- let IClass = 0b1000;
- let Inst{27-21} = 0b1100110;
- let Inst{20-16} = Rs;
- let Inst{13} = 0b0;
- let Inst{12-8} = u5;
- let Inst{7-5} = MinOp;
- let Inst{4-0} = Rd;
-}
-
-let hasSideEffects = 0, hasNewValue = 1 in
-class T_SCT_BIT_REG<string MnOp, bits<2> MinOp>
- : SInst<(outs IntRegs:$Rd), (ins IntRegs:$Rs, IntRegs:$Rt),
- "$Rd = "#MnOp#"($Rs, $Rt)", [], "", S_3op_tc_1_SLOT23> {
- bits<5> Rd;
- bits<5> Rs;
- bits<5> Rt;
- let IClass = 0b1100;
- let Inst{27-22} = 0b011010;
- let Inst{20-16} = Rs;
- let Inst{12-8} = Rt;
- let Inst{7-6} = MinOp;
- let Inst{4-0} = Rd;
-}
-
-def S2_clrbit_i : T_SCT_BIT_IMM<"clrbit", 0b001>;
-def S2_setbit_i : T_SCT_BIT_IMM<"setbit", 0b000>;
-def S2_togglebit_i : T_SCT_BIT_IMM<"togglebit", 0b010>;
-def S2_clrbit_r : T_SCT_BIT_REG<"clrbit", 0b01>;
-def S2_setbit_r : T_SCT_BIT_REG<"setbit", 0b00>;
-def S2_togglebit_r : T_SCT_BIT_REG<"togglebit", 0b10>;
-
-// Bit test
-
-let hasSideEffects = 0 in
-class T_TEST_BIT_IMM<string MnOp, bits<3> MajOp>
- : SInst<(outs PredRegs:$Pd), (ins IntRegs:$Rs, u5_0Imm:$u5),
- "$Pd = "#MnOp#"($Rs, #$u5)",
- [], "", S_2op_tc_2early_SLOT23> {
- bits<2> Pd;
- bits<5> Rs;
- bits<5> u5;
- let IClass = 0b1000;
- let Inst{27-24} = 0b0101;
- let Inst{23-21} = MajOp;
- let Inst{20-16} = Rs;
- let Inst{13} = 0;
- let Inst{12-8} = u5;
- let Inst{1-0} = Pd;
-}
-
-let hasSideEffects = 0 in
-class T_TEST_BIT_REG<string MnOp, bit IsNeg>
- : SInst<(outs PredRegs:$Pd), (ins IntRegs:$Rs, IntRegs:$Rt),
- "$Pd = "#MnOp#"($Rs, $Rt)",
- [], "", S_3op_tc_2early_SLOT23> {
- bits<2> Pd;
- bits<5> Rs;
- bits<5> Rt;
- let IClass = 0b1100;
- let Inst{27-22} = 0b011100;
- let Inst{21} = IsNeg;
- let Inst{20-16} = Rs;
- let Inst{12-8} = Rt;
- let Inst{1-0} = Pd;
-}
-
-def S2_tstbit_i : T_TEST_BIT_IMM<"tstbit", 0b000>;
-def S2_tstbit_r : T_TEST_BIT_REG<"tstbit", 0>;
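-
-// Usage sketch (registers illustrative): "p0 = tstbit(r0, #5)" sets all bits
-// of p0 if bit 5 of r0 is set and clears them otherwise; the _r forms take
-// the bit position from a register.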
-
-let hasSideEffects = 0 in
-class T_TEST_BITS_IMM<string MnOp, bits<2> MajOp, bit IsNeg>
- : SInst<(outs PredRegs:$Pd), (ins IntRegs:$Rs, u6_0Imm:$u6),
- "$Pd = "#MnOp#"($Rs, #$u6)",
- [], "", S_2op_tc_2early_SLOT23> {
- bits<2> Pd;
- bits<5> Rs;
- bits<6> u6;
- let IClass = 0b1000;
- let Inst{27-24} = 0b0101;
- let Inst{23-22} = MajOp;
- let Inst{21} = IsNeg;
- let Inst{20-16} = Rs;
- let Inst{13-8} = u6;
- let Inst{1-0} = Pd;
-}
-
-let hasSideEffects = 0 in
-class T_TEST_BITS_REG<string MnOp, bits<2> MajOp, bit IsNeg>
- : SInst<(outs PredRegs:$Pd), (ins IntRegs:$Rs, IntRegs:$Rt),
- "$Pd = "#MnOp#"($Rs, $Rt)",
- [], "", S_3op_tc_2early_SLOT23> {
- bits<2> Pd;
- bits<5> Rs;
- bits<5> Rt;
- let IClass = 0b1100;
- let Inst{27-24} = 0b0111;
- let Inst{23-22} = MajOp;
- let Inst{21} = IsNeg;
- let Inst{20-16} = Rs;
- let Inst{12-8} = Rt;
- let Inst{1-0} = Pd;
-}
-
-def C2_bitsclri : T_TEST_BITS_IMM<"bitsclr", 0b10, 0>;
-def C2_bitsclr : T_TEST_BITS_REG<"bitsclr", 0b10, 0>;
-def C2_bitsset : T_TEST_BITS_REG<"bitsset", 0b01, 0>;
-
-//===----------------------------------------------------------------------===//
-// STYPE/BIT -
-//===----------------------------------------------------------------------===//
-
-//===----------------------------------------------------------------------===//
-// STYPE/COMPLEX +
-//===----------------------------------------------------------------------===//
-//===----------------------------------------------------------------------===//
-// STYPE/COMPLEX -
-//===----------------------------------------------------------------------===//
-
-//===----------------------------------------------------------------------===//
-// XTYPE/PERM +
-//===----------------------------------------------------------------------===//
-
-//===----------------------------------------------------------------------===//
-// XTYPE/PERM -
-//===----------------------------------------------------------------------===//
-
-//===----------------------------------------------------------------------===//
-// STYPE/PRED +
-//===----------------------------------------------------------------------===//
-
-// Predicate transfer.
-let hasSideEffects = 0, hasNewValue = 1 in
-def C2_tfrpr : SInst<(outs IntRegs:$Rd), (ins PredRegs:$Ps),
- "$Rd = $Ps", [], "", S_2op_tc_1_SLOT23> {
- bits<5> Rd;
- bits<2> Ps;
-
- let IClass = 0b1000;
- let Inst{27-24} = 0b1001;
- let Inst{22} = 0b1;
- let Inst{17-16} = Ps;
- let Inst{4-0} = Rd;
-}
-
-// Transfer general register to predicate.
-let hasSideEffects = 0 in
-def C2_tfrrp: SInst<(outs PredRegs:$Pd), (ins IntRegs:$Rs),
- "$Pd = $Rs", [], "", S_2op_tc_2early_SLOT23> {
- bits<2> Pd;
- bits<5> Rs;
-
- let IClass = 0b1000;
- let Inst{27-21} = 0b0101010;
- let Inst{20-16} = Rs;
- let Inst{1-0} = Pd;
-}
-
-let hasSideEffects = 0, isCodeGenOnly = 1 in
-def C2_pxfer_map: SInst<(outs PredRegs:$dst), (ins PredRegs:$src),
- "$dst = $src">;
-
-//===----------------------------------------------------------------------===//
-// STYPE/PRED -
-//===----------------------------------------------------------------------===//
-
-//===----------------------------------------------------------------------===//
-// STYPE/SHIFT +
-//===----------------------------------------------------------------------===//
-class S_2OpInstImm<string Mnemonic, bits<3>MajOp, bits<3>MinOp,
- Operand Imm, list<dag> pattern = [], bit isRnd = 0>
- : SInst<(outs DoubleRegs:$dst), (ins DoubleRegs:$src1, Imm:$src2),
- "$dst = "#Mnemonic#"($src1, #$src2)"#!if(isRnd, ":rnd", ""),
- pattern> {
- bits<5> src1;
- bits<5> dst;
- let IClass = 0b1000;
- let Inst{27-24} = 0;
- let Inst{23-21} = MajOp;
- let Inst{20-16} = src1;
- let Inst{7-5} = MinOp;
- let Inst{4-0} = dst;
-}
-
-class S_2OpInstImmI6<string Mnemonic, SDNode OpNode, bits<3>MinOp>
- : S_2OpInstImm<Mnemonic, 0b000, MinOp, u6_0Imm, []> {
- bits<6> src2;
- let Inst{13-8} = src2;
-}
-
-// Shift by immediate.
-def S2_asr_i_p : S_2OpInstImmI6<"asr", sra, 0b000>;
-def S2_asl_i_p : S_2OpInstImmI6<"asl", shl, 0b010>;
-def S2_lsr_i_p : S_2OpInstImmI6<"lsr", srl, 0b001>;
-
-// Shift left by small amount and add.
-let AddedComplexity = 100, hasNewValue = 1, hasSideEffects = 0 in
-def S2_addasl_rrri: SInst <(outs IntRegs:$Rd),
- (ins IntRegs:$Rt, IntRegs:$Rs, u3_0Imm:$u3),
- "$Rd = addasl($Rt, $Rs, #$u3)", [],
- "", S_3op_tc_2_SLOT23> {
- bits<5> Rd;
- bits<5> Rt;
- bits<5> Rs;
- bits<3> u3;
-
- let IClass = 0b1100;
-
- let Inst{27-21} = 0b0100000;
- let Inst{20-16} = Rs;
- let Inst{13} = 0b0;
- let Inst{12-8} = Rt;
- let Inst{7-5} = u3;
- let Inst{4-0} = Rd;
- }
-
-//===----------------------------------------------------------------------===//
-// STYPE/SHIFT -
-//===----------------------------------------------------------------------===//
-
-//===----------------------------------------------------------------------===//
-// STYPE/VH +
-//===----------------------------------------------------------------------===//
-//===----------------------------------------------------------------------===//
-// STYPE/VH -
-//===----------------------------------------------------------------------===//
-
-//===----------------------------------------------------------------------===//
-// STYPE/VW +
-//===----------------------------------------------------------------------===//
-//===----------------------------------------------------------------------===//
-// STYPE/VW -
-//===----------------------------------------------------------------------===//
-
-//===----------------------------------------------------------------------===//
-// SYSTEM/SUPER +
-//===----------------------------------------------------------------------===//
-
-//===----------------------------------------------------------------------===//
-// SYSTEM/USER +
-//===----------------------------------------------------------------------===//
-let hasSideEffects = 1, isSoloAX = 1 in
-def Y2_barrier : SYSInst<(outs), (ins), "barrier", [],"",ST_tc_st_SLOT0> {
- let Inst{31-28} = 0b1010;
- let Inst{27-21} = 0b1000000;
-}
-
-//===----------------------------------------------------------------------===//
-// SYSTEM/USER -
-//===----------------------------------------------------------------------===//
-
-// Generate frame-index addresses. The main reason for the offset operand is
-// that every instruction that is allowed to have a frame index as an operand
-// will then have that operand followed by an immediate operand (the offset).
-// This simplifies the frame-index elimination code.
-//
-let isMoveImm = 1, isAsCheapAsAMove = 1, isReMaterializable = 1,
- isPseudo = 1, isCodeGenOnly = 1, hasSideEffects = 0 in {
- def PS_fi : ALU32_ri<(outs IntRegs:$Rd),
- (ins IntRegs:$fi, s32_0Imm:$off), "">;
- def PS_fia : ALU32_ri<(outs IntRegs:$Rd),
- (ins IntRegs:$Rs, IntRegs:$fi, s32_0Imm:$off), "">;
-}
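-
-// A sketch of the intent (not the actual lowering code): frame-index
-// elimination rewrites these, roughly, to adds off the stack or frame
-// pointer, e.g. "r0 = add(r29, #n)" for the resolved offset n; keeping the
-// offset as a trailing immediate makes that rewrite a simple operand update.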
-
-//===----------------------------------------------------------------------===//
-// CRUSER - Type.
-//===----------------------------------------------------------------------===//
-// HW loop
-let isExtendable = 1, isExtentSigned = 1, opExtentBits = 9, opExtentAlign = 2,
- opExtendable = 0, hasSideEffects = 0 in
-class LOOP_iBase<string mnemonic, Operand brOp, bit mustExtend = 0>
- : CRInst<(outs), (ins brOp:$offset, u10_0Imm:$src2),
- #mnemonic#"($offset, #$src2)",
- [], "" , CR_tc_3x_SLOT3> {
- bits<9> offset;
- bits<10> src2;
-
- let IClass = 0b0110;
-
- let Inst{27-22} = 0b100100;
- let Inst{21} = !if (!eq(mnemonic, "loop0"), 0b0, 0b1);
- let Inst{20-16} = src2{9-5};
- let Inst{12-8} = offset{8-4};
- let Inst{7-5} = src2{4-2};
- let Inst{4-3} = offset{3-2};
- let Inst{1-0} = src2{1-0};
-}
-
-let isExtendable = 1, isExtentSigned = 1, opExtentBits = 9, opExtentAlign = 2,
- opExtendable = 0, hasSideEffects = 0 in
-class LOOP_rBase<string mnemonic, Operand brOp, bit mustExtend = 0>
- : CRInst<(outs), (ins brOp:$offset, IntRegs:$src2),
- #mnemonic#"($offset, $src2)",
- [], "" ,CR_tc_3x_SLOT3> {
- bits<9> offset;
- bits<5> src2;
-
- let IClass = 0b0110;
-
- let Inst{27-22} = 0b000000;
- let Inst{21} = !if (!eq(mnemonic, "loop0"), 0b0, 0b1);
- let Inst{20-16} = src2;
- let Inst{12-8} = offset{8-4};
- let Inst{4-3} = offset{3-2};
- }
-
-multiclass LOOP_ri<string mnemonic> {
- def i : LOOP_iBase<mnemonic, brtarget>;
- def r : LOOP_rBase<mnemonic, brtarget>;
-
- let isCodeGenOnly = 1, isExtended = 1, opExtendable = 0 in {
- def iext: LOOP_iBase<mnemonic, brtargetExt, 1>;
- def rext: LOOP_rBase<mnemonic, brtargetExt, 1>;
- }
-}
-
-let Defs = [SA0, LC0, USR] in
-defm J2_loop0 : LOOP_ri<"loop0">;
-
-// Interestingly, only loop0 appears to set usr.lpcfg.
-let Defs = [SA1, LC1] in
-defm J2_loop1 : LOOP_ri<"loop1">;
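-
-// Usage sketch (label and count illustrative): "loop0(start, #3)" writes the
-// loop-start address to SA0 and the trip count 3 to LC0; the matching
-// ":endloop0" packet then decrements LC0 and branches back to SA0 while LC0
-// is nonzero.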
-
-let isBranch = 1, isTerminator = 1, hasSideEffects = 0,
- Defs = [PC, LC0], Uses = [SA0, LC0] in {
-def ENDLOOP0 : Endloop<(outs), (ins brtarget:$offset),
- ":endloop0",
- []>;
-}
-
-let isBranch = 1, isTerminator = 1, hasSideEffects = 0,
- Defs = [PC, LC1], Uses = [SA1, LC1] in {
-def ENDLOOP1 : Endloop<(outs), (ins brtarget:$offset),
- ":endloop1",
- []>;
-}
-
-// Pipelined loop instructions, sp[123]loop0
-let Defs = [LC0, SA0, P3, USR], hasSideEffects = 0,
- isExtentSigned = 1, isExtendable = 1, opExtentBits = 9, opExtentAlign = 2,
- opExtendable = 0, isPredicateLate = 1 in
-class SPLOOP_iBase<string SP, bits<2> op>
- : CRInst <(outs), (ins brtarget:$r7_2, u10_0Imm:$U10),
- "p3 = sp"#SP#"loop0($r7_2, #$U10)" > {
- bits<9> r7_2;
- bits<10> U10;
-
- let IClass = 0b0110;
-
- let Inst{22-21} = op;
- let Inst{27-23} = 0b10011;
- let Inst{20-16} = U10{9-5};
- let Inst{12-8} = r7_2{8-4};
- let Inst{7-5} = U10{4-2};
- let Inst{4-3} = r7_2{3-2};
- let Inst{1-0} = U10{1-0};
- }
-
-let Defs = [LC0, SA0, P3, USR], hasSideEffects = 0,
- isExtentSigned = 1, isExtendable = 1, opExtentBits = 9, opExtentAlign = 2,
- opExtendable = 0, isPredicateLate = 1 in
-class SPLOOP_rBase<string SP, bits<2> op>
- : CRInst <(outs), (ins brtarget:$r7_2, IntRegs:$Rs),
- "p3 = sp"#SP#"loop0($r7_2, $Rs)" > {
- bits<9> r7_2;
- bits<5> Rs;
-
- let IClass = 0b0110;
-
- let Inst{22-21} = op;
- let Inst{27-23} = 0b00001;
- let Inst{20-16} = Rs;
- let Inst{12-8} = r7_2{8-4};
- let Inst{4-3} = r7_2{3-2};
- }
-
-multiclass SPLOOP_ri<string mnemonic, bits<2> op> {
- def i : SPLOOP_iBase<mnemonic, op>;
- def r : SPLOOP_rBase<mnemonic, op>;
-}
-
-defm J2_ploop1s : SPLOOP_ri<"1", 0b01>;
-defm J2_ploop2s : SPLOOP_ri<"2", 0b10>;
-defm J2_ploop3s : SPLOOP_ri<"3", 0b11>;
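-
-// Usage sketch (operands illustrative, per the usual spNloop0 semantics):
-// "p3 = sp1loop0(start, #8)" starts a pipelined loop in which p3 stays clear
-// for the first iteration and is set late thereafter (isPredicateLate), so
-// prolog/epilog stages can be predicated on p3.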
-
-// if (Rs{!=,>=,==,<=}#0) jump:[t/nt]
-let Defs = [PC], isPredicated = 1, isBranch = 1, hasSideEffects = 0 in
-class J2_jump_0_Base<string compare, bit isTak, bits<2> op>
- : CRInst <(outs), (ins IntRegs:$Rs, brtarget:$r13_2),
- "if ($Rs"#compare#"#0) jump"#!if(isTak, ":t", ":nt")#" $r13_2" > {
- bits<5> Rs;
- bits<15> r13_2;
-
- let IClass = 0b0110;
-
- let Inst{27-24} = 0b0001;
- let Inst{23-22} = op;
- let Inst{12} = isTak;
- let Inst{21} = r13_2{14};
- let Inst{20-16} = Rs;
- let Inst{11-1} = r13_2{12-2};
- let Inst{13} = r13_2{13};
- }
-
-multiclass J2_jump_compare_0<string compare, bits<2> op> {
- def NAME : J2_jump_0_Base<compare, 0, op>;
- def NAME#pt : J2_jump_0_Base<compare, 1, op>;
-}
-
-defm J2_jumprz : J2_jump_compare_0<"!=", 0b00>;
-defm J2_jumprgtez : J2_jump_compare_0<">=", 0b01>;
-defm J2_jumprnz : J2_jump_compare_0<"==", 0b10>;
-defm J2_jumprltez : J2_jump_compare_0<"<=", 0b11>;
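-
-// For illustration (register and target illustrative): J2_jumprz prints
-// "if (r0!=#0) jump:nt target", and the NAME#pt variants print ":t" instead
-// (isTak), hinting the branch as taken.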
-
-// Transfer between general-purpose registers and control registers.
-let hasSideEffects = 0 in
-class TFR_CR_RS_base<RegisterClass CTRC, RegisterClass RC, bit isDouble>
- : CRInst <(outs CTRC:$dst), (ins RC:$src),
- "$dst = $src", [], "", CR_tc_3x_SLOT3> {
- bits<5> dst;
- bits<5> src;
-
- let IClass = 0b0110;
-
- let Inst{27-25} = 0b001;
- let Inst{24} = isDouble;
- let Inst{23-21} = 0b001;
- let Inst{20-16} = src;
- let Inst{4-0} = dst;
- }
-
-def A2_tfrrcr : TFR_CR_RS_base<CtrRegs, IntRegs, 0b0>;
-def A4_tfrpcp : TFR_CR_RS_base<CtrRegs64, DoubleRegs, 0b1>;
-def : InstAlias<"m0 = $Rs", (A2_tfrrcr C6, IntRegs:$Rs)>;
-def : InstAlias<"m1 = $Rs", (A2_tfrrcr C7, IntRegs:$Rs)>;
-
-let hasSideEffects = 0 in
-class TFR_RD_CR_base<RegisterClass RC, RegisterClass CTRC, bit isSingle>
- : CRInst <(outs RC:$dst), (ins CTRC:$src),
- "$dst = $src", [], "", CR_tc_3x_SLOT3> {
- bits<5> dst;
- bits<5> src;
-
- let IClass = 0b0110;
-
- let Inst{27-26} = 0b10;
- let Inst{25} = isSingle;
- let Inst{24-21} = 0b0000;
- let Inst{20-16} = src;
- let Inst{4-0} = dst;
- }
-
-let hasNewValue = 1, opNewValue = 0 in
-def A2_tfrcrr : TFR_RD_CR_base<IntRegs, CtrRegs, 1>;
-def A4_tfrcpp : TFR_RD_CR_base<DoubleRegs, CtrRegs64, 0>;
-def : InstAlias<"$Rd = m0", (A2_tfrcrr IntRegs:$Rd, C6)>;
-def : InstAlias<"$Rd = m1", (A2_tfrcrr IntRegs:$Rd, C7)>;
-
-// Y4_trace: Send value to the ETM trace.
-let isSoloAX = 1, hasSideEffects = 0 in
-def Y4_trace: CRInst <(outs), (ins IntRegs:$Rs),
- "trace($Rs)"> {
- bits<5> Rs;
-
- let IClass = 0b0110;
- let Inst{27-21} = 0b0010010;
- let Inst{20-16} = Rs;
- }
-
-// HI/LO Instructions
-let isReMaterializable = 1, isMoveImm = 1, hasSideEffects = 0,
- hasNewValue = 1, opNewValue = 0 in
-class REG_IMMED<string RegHalf, bit Rs, bits<3> MajOp, bit MinOp>
- : ALU32_ri<(outs IntRegs:$dst),
- (ins u16_0Imm:$imm_value),
- "$dst"#RegHalf#" = $imm_value", []> {
- bits<5> dst;
- bits<32> imm_value;
- let IClass = 0b0111;
-
- let Inst{27} = Rs;
- let Inst{26-24} = MajOp;
- let Inst{21} = MinOp;
- let Inst{20-16} = dst;
- let Inst{23-22} = imm_value{15-14};
- let Inst{13-0} = imm_value{13-0};
-}
-
-let isAsmParserOnly = 1 in {
- def LO : REG_IMMED<".l", 0b0, 0b001, 0b1>;
- def HI : REG_IMMED<".h", 0b0, 0b010, 0b1>;
-}
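-
-// Usage sketch (value illustrative): "r0.l = #0x1234" writes only the low
-// halfword of r0, so a 32-bit constant can be materialized with a HI/LO
-// pair.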
-
-let isReMaterializable = 1, isMoveImm = 1, isAsmParserOnly = 1 in {
- def CONST32 : CONSTLDInst<(outs IntRegs:$Rd), (ins i32imm:$v),
- "$Rd = CONST32(#$v)", []>;
- def CONST64 : CONSTLDInst<(outs DoubleRegs:$Rd), (ins i64imm:$v),
- "$Rd = CONST64(#$v)", []>;
-}
-
-let hasSideEffects = 0, isReMaterializable = 1, isPseudo = 1,
- isCodeGenOnly = 1 in
-def PS_true : SInst<(outs PredRegs:$dst), (ins), "", []>;
-
-let hasSideEffects = 0, isReMaterializable = 1, isPseudo = 1,
- isCodeGenOnly = 1 in
-def PS_false : SInst<(outs PredRegs:$dst), (ins), "", []>;
-
-let Defs = [R29, R30], Uses = [R31, R30, R29], isPseudo = 1 in
-def ADJCALLSTACKDOWN : Pseudo<(outs), (ins i32imm:$amt),
- ".error \"should not emit\" ", []>;
-
-let Defs = [R29, R30, R31], Uses = [R29], isPseudo = 1 in
-def ADJCALLSTACKUP : Pseudo<(outs), (ins i32imm:$amt1, i32imm:$amt2),
- ".error \"should not emit\" ", []>;
-
-// Call subroutine indirectly.
-let Defs = VolatileV3.Regs in
-def J2_callr : JUMPR_MISC_CALLR<0, 1>;
-
-// Indirect tail-call.
-let isPseudo = 1, isCall = 1, isReturn = 1, isBarrier = 1, isPredicable = 0,
- isTerminator = 1, isCodeGenOnly = 1 in
-def PS_tailcall_r : T_JMPr;
-
-// Direct tail-calls.
-let isPseudo = 1, isCall = 1, isReturn = 1, isBarrier = 1, isPredicable = 0,
- isTerminator = 1, isCodeGenOnly = 1 in
-def PS_tailcall_i : JInst<(outs), (ins calltarget:$dst), "", []>;
-
-// The reason for the custom inserter is to record all ALLOCA instructions
-// in MachineFunctionInfo.
-let Defs = [R29], isCodeGenOnly = 1, isPseudo = 1, hasSideEffects = 1 in
-def PS_alloca: ALU32Inst<(outs IntRegs:$Rd),
- (ins IntRegs:$Rs, u32_0Imm:$A), "", []>;
-
-let isCodeGenOnly = 1, isPseudo = 1, Uses = [R30], hasSideEffects = 0 in
-def PS_aligna : ALU32Inst<(outs IntRegs:$Rd), (ins u32_0Imm:$A), "", []>;
-
-// XTYPE/SHIFT
-//
-//===----------------------------------------------------------------------===//
-// Template Class
-// Shift by immediate/register and accumulate/logical
-//===----------------------------------------------------------------------===//
-
-// Rx[+-&|]=asr(Rs,#u5)
-// Rx[+-&|^]=lsr(Rs,#u5)
-// Rx[+-&|^]=asl(Rs,#u5)
-
-let hasNewValue = 1, opNewValue = 0 in
-class T_shift_imm_acc_r <string opc1, string opc2, SDNode OpNode1,
- SDNode OpNode2, bits<3> majOp, bits<2> minOp>
- : SInst_acc<(outs IntRegs:$Rx),
- (ins IntRegs:$src1, IntRegs:$Rs, u5_0Imm:$u5),
- "$Rx "#opc2#opc1#"($Rs, #$u5)", [],
- "$src1 = $Rx", S_2op_tc_2_SLOT23> {
- bits<5> Rx;
- bits<5> Rs;
- bits<5> u5;
-
- let IClass = 0b1000;
-
- let Inst{27-24} = 0b1110;
- let Inst{23-22} = majOp{2-1};
- let Inst{13} = 0b0;
- let Inst{7} = majOp{0};
- let Inst{6-5} = minOp;
- let Inst{4-0} = Rx;
- let Inst{20-16} = Rs;
- let Inst{12-8} = u5;
- }
-
-// Rx[+-&|]=asr(Rs,Rt)
-// Rx[+-&|^]=lsr(Rs,Rt)
-// Rx[+-&|^]=asl(Rs,Rt)
-
-let hasNewValue = 1, opNewValue = 0 in
-class T_shift_reg_acc_r <string opc1, string opc2, SDNode OpNode1,
- SDNode OpNode2, bits<2> majOp, bits<2> minOp>
- : SInst_acc<(outs IntRegs:$Rx),
- (ins IntRegs:$src1, IntRegs:$Rs, IntRegs:$Rt),
- "$Rx "#opc2#opc1#"($Rs, $Rt)", [],
- "$src1 = $Rx", S_3op_tc_2_SLOT23 > {
- bits<5> Rx;
- bits<5> Rs;
- bits<5> Rt;
-
- let IClass = 0b1100;
-
- let Inst{27-24} = 0b1100;
- let Inst{23-22} = majOp;
- let Inst{7-6} = minOp;
- let Inst{4-0} = Rx;
- let Inst{20-16} = Rs;
- let Inst{12-8} = Rt;
- }
-
-// Rxx[+-&|]=asr(Rss,#u6)
-// Rxx[+-&|^]=lsr(Rss,#u6)
-// Rxx[+-&|^]=asl(Rss,#u6)
-
-class T_shift_imm_acc_p <string opc1, string opc2, SDNode OpNode1,
- SDNode OpNode2, bits<3> majOp, bits<2> minOp>
- : SInst_acc<(outs DoubleRegs:$Rxx),
- (ins DoubleRegs:$src1, DoubleRegs:$Rss, u6_0Imm:$u6),
- "$Rxx "#opc2#opc1#"($Rss, #$u6)", [],
- "$src1 = $Rxx", S_2op_tc_2_SLOT23> {
- bits<5> Rxx;
- bits<5> Rss;
- bits<6> u6;
-
- let IClass = 0b1000;
-
- let Inst{27-24} = 0b0010;
- let Inst{23-22} = majOp{2-1};
- let Inst{7} = majOp{0};
- let Inst{6-5} = minOp;
- let Inst{4-0} = Rxx;
- let Inst{20-16} = Rss;
- let Inst{13-8} = u6;
- }
-
-// Rxx[+-&|]=asr(Rss,Rt)
-// Rxx[+-&|^]=lsr(Rss,Rt)
-// Rxx[+-&|^]=asl(Rss,Rt)
-// Rxx[+-&|^]=lsl(Rss,Rt)
-
-class T_shift_reg_acc_p <string opc1, string opc2, SDNode OpNode1,
- SDNode OpNode2, bits<3> majOp, bits<2> minOp>
- : SInst_acc<(outs DoubleRegs:$Rxx),
- (ins DoubleRegs:$src1, DoubleRegs:$Rss, IntRegs:$Rt),
- "$Rxx "#opc2#opc1#"($Rss, $Rt)", [],
- "$src1 = $Rxx", S_3op_tc_2_SLOT23> {
- bits<5> Rxx;
- bits<5> Rss;
- bits<5> Rt;
-
- let IClass = 0b1100;
-
- let Inst{27-24} = 0b1011;
- let Inst{23-21} = majOp;
- let Inst{20-16} = Rss;
- let Inst{12-8} = Rt;
- let Inst{7-6} = minOp;
- let Inst{4-0} = Rxx;
- }
-
-//===----------------------------------------------------------------------===//
-// Multi-class for the shift instructions with logical/arithmetic operators.
-//===----------------------------------------------------------------------===//
-
-multiclass xtype_imm_base<string OpcStr1, string OpcStr2, SDNode OpNode1,
- SDNode OpNode2, bits<3> majOp, bits<2> minOp > {
- def _i_r#NAME : T_shift_imm_acc_r< OpcStr1, OpcStr2, OpNode1,
- OpNode2, majOp, minOp >;
- def _i_p#NAME : T_shift_imm_acc_p< OpcStr1, OpcStr2, OpNode1,
- OpNode2, majOp, minOp >;
-}
-
-multiclass xtype_imm_acc<string opc1, SDNode OpNode, bits<2>minOp> {
- let AddedComplexity = 100 in
- defm _acc : xtype_imm_base< opc1, "+= ", OpNode, add, 0b001, minOp>;
-
- defm _nac : xtype_imm_base< opc1, "-= ", OpNode, sub, 0b000, minOp>;
- defm _and : xtype_imm_base< opc1, "&= ", OpNode, and, 0b010, minOp>;
- defm _or : xtype_imm_base< opc1, "|= ", OpNode, or, 0b011, minOp>;
-}
-
-multiclass xtype_xor_imm_acc<string opc1, SDNode OpNode, bits<2>minOp> {
-let AddedComplexity = 100 in
- defm _xacc : xtype_imm_base< opc1, "^= ", OpNode, xor, 0b100, minOp>;
-}
-
-defm S2_asr : xtype_imm_acc<"asr", sra, 0b00>;
-
-defm S2_lsr : xtype_imm_acc<"lsr", srl, 0b01>,
- xtype_xor_imm_acc<"lsr", srl, 0b01>;
-
-defm S2_asl : xtype_imm_acc<"asl", shl, 0b10>,
- xtype_xor_imm_acc<"asl", shl, 0b10>;
-
-multiclass xtype_reg_acc_r<string opc1, SDNode OpNode, bits<2>minOp> {
- let AddedComplexity = 100 in
- def _acc : T_shift_reg_acc_r <opc1, "+= ", OpNode, add, 0b11, minOp>;
-
- def _nac : T_shift_reg_acc_r <opc1, "-= ", OpNode, sub, 0b10, minOp>;
- def _and : T_shift_reg_acc_r <opc1, "&= ", OpNode, and, 0b01, minOp>;
- def _or : T_shift_reg_acc_r <opc1, "|= ", OpNode, or, 0b00, minOp>;
-}
-
-multiclass xtype_reg_acc_p<string opc1, SDNode OpNode, bits<2>minOp> {
- let AddedComplexity = 100 in
- def _acc : T_shift_reg_acc_p <opc1, "+= ", OpNode, add, 0b110, minOp>;
-
- def _nac : T_shift_reg_acc_p <opc1, "-= ", OpNode, sub, 0b100, minOp>;
- def _and : T_shift_reg_acc_p <opc1, "&= ", OpNode, and, 0b010, minOp>;
- def _or : T_shift_reg_acc_p <opc1, "|= ", OpNode, or, 0b000, minOp>;
- def _xor : T_shift_reg_acc_p <opc1, "^= ", OpNode, xor, 0b011, minOp>;
-}
-
-multiclass xtype_reg_acc<string OpcStr, SDNode OpNode, bits<2> minOp > {
- defm _r_r : xtype_reg_acc_r <OpcStr, OpNode, minOp>;
- defm _r_p : xtype_reg_acc_p <OpcStr, OpNode, minOp>;
-}
-
-defm S2_asl : xtype_reg_acc<"asl", shl, 0b10>;
-defm S2_asr : xtype_reg_acc<"asr", sra, 0b00>;
-defm S2_lsr : xtype_reg_acc<"lsr", srl, 0b01>;
-defm S2_lsl : xtype_reg_acc<"lsl", shl, 0b11>;
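-
-// For reference, these multiclasses expand via the #NAME pastes to defs such
-// as S2_asr_i_r_acc ("$Rx += asr($Rs, #$u5)"), S2_lsr_i_p_xacc
-// ("$Rxx ^= lsr($Rss, #$u6)") and S2_lsl_r_r_nac ("$Rx -= lsl($Rs, $Rt)").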
-
-//===----------------------------------------------------------------------===//
-let hasSideEffects = 0 in
-class T_S3op_1 <string mnemonic, RegisterClass RC, bits<2> MajOp, bits<3> MinOp,
- bit SwapOps, bit isSat = 0, bit isRnd = 0, bit hasShift = 0>
- : SInst <(outs RC:$dst),
- (ins DoubleRegs:$src1, DoubleRegs:$src2),
- "$dst = "#mnemonic#"($src1, $src2)"#!if(isRnd, ":rnd", "")
- #!if(hasShift,":>>1","")
- #!if(isSat, ":sat", ""),
- [], "", S_3op_tc_2_SLOT23 > {
- bits<5> dst;
- bits<5> src1;
- bits<5> src2;
-
- let IClass = 0b1100;
-
- let Inst{27-24} = 0b0001;
- let Inst{23-22} = MajOp;
- let Inst{20-16} = !if (SwapOps, src2, src1);
- let Inst{12-8} = !if (SwapOps, src1, src2);
- let Inst{7-5} = MinOp;
- let Inst{4-0} = dst;
- }
-
-class T_S3op_64 <string mnemonic, bits<2> MajOp, bits<3> MinOp, bit SwapOps,
- bit isSat = 0, bit isRnd = 0, bit hasShift = 0 >
- : T_S3op_1 <mnemonic, DoubleRegs, MajOp, MinOp, SwapOps,
- isSat, isRnd, hasShift>;
-
-let Itinerary = S_3op_tc_1_SLOT23 in {
- def S2_shuffeb : T_S3op_64 < "shuffeb", 0b00, 0b010, 0>;
- def S2_shuffeh : T_S3op_64 < "shuffeh", 0b00, 0b110, 0>;
- def S2_shuffob : T_S3op_64 < "shuffob", 0b00, 0b100, 1>;
- def S2_shuffoh : T_S3op_64 < "shuffoh", 0b10, 0b000, 1>;
-
- def S2_vtrunewh : T_S3op_64 < "vtrunewh", 0b10, 0b010, 0>;
- def S2_vtrunowh : T_S3op_64 < "vtrunowh", 0b10, 0b100, 0>;
-}
-
-def S2_lfsp : T_S3op_64 < "lfs", 0b10, 0b110, 0>;
-
-let hasSideEffects = 0 in
-class T_S3op_2 <string mnemonic, bits<3> MajOp, bit SwapOps>
- : SInst < (outs DoubleRegs:$Rdd),
- (ins DoubleRegs:$Rss, DoubleRegs:$Rtt, PredRegs:$Pu),
- "$Rdd = "#mnemonic#"($Rss, $Rtt, $Pu)",
- [], "", S_3op_tc_1_SLOT23 > {
- bits<5> Rdd;
- bits<5> Rss;
- bits<5> Rtt;
- bits<2> Pu;
-
- let IClass = 0b1100;
-
- let Inst{27-24} = 0b0010;
- let Inst{23-21} = MajOp;
- let Inst{20-16} = !if (SwapOps, Rtt, Rss);
- let Inst{12-8} = !if (SwapOps, Rss, Rtt);
- let Inst{6-5} = Pu;
- let Inst{4-0} = Rdd;
- }
-
-def S2_valignrb : T_S3op_2 < "valignb", 0b000, 1>;
-def S2_vsplicerb : T_S3op_2 < "vspliceb", 0b100, 0>;
-
-//===----------------------------------------------------------------------===//
-// Template class used by vector shift, vector rotate, vector neg,
-// 32-bit shift, 64-bit shifts, etc.
-//===----------------------------------------------------------------------===//
-
-let hasSideEffects = 0 in
-class T_S3op_3 <string mnemonic, RegisterClass RC, bits<2> MajOp,
- bits<2> MinOp, bit isSat = 0, list<dag> pattern = [] >
- : SInst <(outs RC:$dst),
- (ins RC:$src1, IntRegs:$src2),
- "$dst = "#mnemonic#"($src1, $src2)"#!if(isSat, ":sat", ""),
- pattern, "", S_3op_tc_1_SLOT23> {
- bits<5> dst;
- bits<5> src1;
- bits<5> src2;
-
- let IClass = 0b1100;
-
- let Inst{27-24} = !if(!eq(!cast<string>(RC), "IntRegs"), 0b0110, 0b0011);
- let Inst{23-22} = MajOp;
- let Inst{20-16} = src1;
- let Inst{12-8} = src2;
- let Inst{7-6} = MinOp;
- let Inst{4-0} = dst;
- }
-
-let hasNewValue = 1 in
-class T_S3op_shift32 <string mnemonic, SDNode OpNode, bits<2> MinOp>
- : T_S3op_3 <mnemonic, IntRegs, 0b01, MinOp, 0, []>;
-
-let hasNewValue = 1, Itinerary = S_3op_tc_2_SLOT23 in
-class T_S3op_shift32_Sat <string mnemonic, bits<2> MinOp>
- : T_S3op_3 <mnemonic, IntRegs, 0b00, MinOp, 1, []>;
-
-
-class T_S3op_shift64 <string mnemonic, SDNode OpNode, bits<2> MinOp>
- : T_S3op_3 <mnemonic, DoubleRegs, 0b10, MinOp, 0, []>;
-
-
-class T_S3op_shiftVect <string mnemonic, bits<2> MajOp, bits<2> MinOp>
- : T_S3op_3 <mnemonic, DoubleRegs, MajOp, MinOp, 0, []>;
-
-
-// Shift by register
-// Rdd=[asr|lsr|asl|lsl](Rss,Rt)
-
-def S2_asr_r_p : T_S3op_shift64 < "asr", sra, 0b00>;
-def S2_lsr_r_p : T_S3op_shift64 < "lsr", srl, 0b01>;
-def S2_asl_r_p : T_S3op_shift64 < "asl", shl, 0b10>;
-def S2_lsl_r_p : T_S3op_shift64 < "lsl", shl, 0b11>;
-
-// Rd=[asr|lsr|asl|lsl](Rs,Rt)
-
-def S2_asr_r_r : T_S3op_shift32<"asr", sra, 0b00>;
-def S2_lsr_r_r : T_S3op_shift32<"lsr", srl, 0b01>;
-def S2_asl_r_r : T_S3op_shift32<"asl", shl, 0b10>;
-def S2_lsl_r_r : T_S3op_shift32<"lsl", shl, 0b11>;
-
-// Shift by register with saturation
-// Rd=asr(Rs,Rt):sat
-// Rd=asl(Rs,Rt):sat
-
-let Defs = [USR_OVF] in {
- def S2_asr_r_r_sat : T_S3op_shift32_Sat<"asr", 0b00>;
- def S2_asl_r_r_sat : T_S3op_shift32_Sat<"asl", 0b10>;
-}
-
-let hasNewValue = 1, hasSideEffects = 0 in
-class T_S3op_8 <string opc, bits<3> MinOp, bit isSat, bit isRnd, bit hasShift, bit hasSplat = 0>
- : SInst < (outs IntRegs:$Rd),
- (ins DoubleRegs:$Rss, IntRegs:$Rt),
- "$Rd = "#opc#"($Rss, $Rt"#!if(hasSplat, "*", "")#")"
- #!if(hasShift, ":<<1", "")
- #!if(isRnd, ":rnd", "")
- #!if(isSat, ":sat", ""),
- [], "", S_3op_tc_1_SLOT23 > {
- bits<5> Rd;
- bits<5> Rss;
- bits<5> Rt;
-
- let IClass = 0b1100;
-
- let Inst{27-24} = 0b0101;
- let Inst{20-16} = Rss;
- let Inst{12-8} = Rt;
- let Inst{7-5} = MinOp;
- let Inst{4-0} = Rd;
- }
-
-def S2_asr_r_svw_trun : T_S3op_8<"vasrw", 0b010, 0, 0, 0>;
-
-let Defs = [USR_OVF], Itinerary = S_3op_tc_2_SLOT23 in
-def S2_vcrotate : T_S3op_shiftVect < "vcrotate", 0b11, 0b00>;
-
-let hasSideEffects = 0 in
-class T_S3op_7 <string mnemonic, bit MajOp >
- : SInst <(outs DoubleRegs:$Rdd),
- (ins DoubleRegs:$Rss, DoubleRegs:$Rtt, u3_0Imm:$u3),
- "$Rdd = "#mnemonic#"($Rss, $Rtt, #$u3)" ,
- [], "", S_3op_tc_1_SLOT23 > {
- bits<5> Rdd;
- bits<5> Rss;
- bits<5> Rtt;
- bits<3> u3;
-
- let IClass = 0b1100;
-
- let Inst{27-24} = 0b0000;
- let Inst{23} = MajOp;
- let Inst{20-16} = !if(MajOp, Rss, Rtt);
- let Inst{12-8} = !if(MajOp, Rtt, Rss);
- let Inst{7-5} = u3;
- let Inst{4-0} = Rdd;
- }
-
-def S2_valignib : T_S3op_7 < "valignb", 0>;
-def S2_vspliceib : T_S3op_7 < "vspliceb", 1>;
-
-//===----------------------------------------------------------------------===//
-// Template class for 'insert bitfield' instructions
-//===----------------------------------------------------------------------===//
-let hasSideEffects = 0 in
-class T_S3op_insert <string mnemonic, RegisterClass RC>
- : SInst <(outs RC:$dst),
- (ins RC:$src1, RC:$src2, DoubleRegs:$src3),
- "$dst = "#mnemonic#"($src2, $src3)" ,
- [], "$src1 = $dst", S_3op_tc_1_SLOT23 > {
- bits<5> dst;
- bits<5> src2;
- bits<5> src3;
-
- let IClass = 0b1100;
-
- let Inst{27-26} = 0b10;
- let Inst{25-24} = !if(!eq(!cast<string>(RC), "IntRegs"), 0b00, 0b10);
- let Inst{23} = 0b0;
- let Inst{20-16} = src2;
- let Inst{12-8} = src3;
- let Inst{4-0} = dst;
- }
-
-let hasSideEffects = 0 in
-class T_S2op_insert <bits<4> RegTyBits, RegisterClass RC, Operand ImmOp>
- : SInst <(outs RC:$dst), (ins RC:$dst2, RC:$src1, ImmOp:$src2, ImmOp:$src3),
- "$dst = insert($src1, #$src2, #$src3)",
- [], "$dst2 = $dst", S_2op_tc_2_SLOT23> {
- bits<5> dst;
- bits<5> src1;
- bits<6> src2;
- bits<6> src3;
- bit bit23;
- bit bit13;
- string ImmOpStr = !cast<string>(ImmOp);
-
- let bit23 = !if (!eq(ImmOpStr, "u6_0Imm"), src3{5}, 0);
- let bit13 = !if (!eq(ImmOpStr, "u6_0Imm"), src2{5}, 0);
-
- let IClass = 0b1000;
-
- let Inst{27-24} = RegTyBits;
- let Inst{23} = bit23;
- let Inst{22-21} = src3{4-3};
- let Inst{20-16} = src1;
- let Inst{13} = bit13;
- let Inst{12-8} = src2{4-0};
- let Inst{7-5} = src3{2-0};
- let Inst{4-0} = dst;
- }
-
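-// Worked example (derived from the encoding above): with u5_0Imm operands
-// (S2_insert), bit23 and bit13 stay 0; #$src2 occupies Inst{12-8} and
-// #$src3 splits across Inst{22-21} and Inst{7-5}. The u6_0Imm variant
-// (S2_insertp) uses bit13 and bit23 to carry the sixth bit of each
-// immediate.
-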
-// Rx=insert(Rs,Rtt)
-// Rx=insert(Rs,#u5,#U5)
-let hasNewValue = 1 in {
- def S2_insert_rp : T_S3op_insert <"insert", IntRegs>;
- def S2_insert : T_S2op_insert <0b1111, IntRegs, u5_0Imm>;
-}
-
-// Rxx=insert(Rss,Rtt)
-// Rxx=insert(Rss,#u6,#U6)
-def S2_insertp_rp : T_S3op_insert<"insert", DoubleRegs>;
-def S2_insertp : T_S2op_insert <0b0011, DoubleRegs, u6_0Imm>;
-
-
-//===----------------------------------------------------------------------===//
-// Template class for 'extract bitfield' instructions
-//===----------------------------------------------------------------------===//
-let hasNewValue = 1, hasSideEffects = 0 in
-class T_S3op_extract <string mnemonic, bits<2> MinOp>
- : SInst <(outs IntRegs:$Rd), (ins IntRegs:$Rs, DoubleRegs:$Rtt),
- "$Rd = "#mnemonic#"($Rs, $Rtt)",
- [], "", S_3op_tc_2_SLOT23 > {
- bits<5> Rd;
- bits<5> Rs;
- bits<5> Rtt;
-
- let IClass = 0b1100;
-
- let Inst{27-22} = 0b100100;
- let Inst{20-16} = Rs;
- let Inst{12-8} = Rtt;
- let Inst{7-6} = MinOp;
- let Inst{4-0} = Rd;
- }
-
-let hasSideEffects = 0 in
-class T_S2op_extract <string mnemonic, bits<4> RegTyBits,
- RegisterClass RC, Operand ImmOp>
- : SInst <(outs RC:$dst), (ins RC:$src1, ImmOp:$src2, ImmOp:$src3),
- "$dst = "#mnemonic#"($src1, #$src2, #$src3)",
- [], "", S_2op_tc_2_SLOT23> {
- bits<5> dst;
- bits<5> src1;
- bits<6> src2;
- bits<6> src3;
- bit bit23;
- bit bit13;
- string ImmOpStr = !cast<string>(ImmOp);
-
- let bit23 = !if (!eq(ImmOpStr, "u6_0Imm"), src3{5},
- !if (!eq(mnemonic, "extractu"), 0, 1));
-
- let bit13 = !if (!eq(ImmOpStr, "u6_0Imm"), src2{5}, 0);
-
- let IClass = 0b1000;
-
- let Inst{27-24} = RegTyBits;
- let Inst{23} = bit23;
- let Inst{22-21} = src3{4-3};
- let Inst{20-16} = src1;
- let Inst{13} = bit13;
- let Inst{12-8} = src2{4-0};
- let Inst{7-5} = src3{2-0};
- let Inst{4-0} = dst;
- }
-
-// Extract bitfield
-
-// Rdd=extractu(Rss,Rtt)
-// Rdd=extractu(Rss,#u6,#U6)
-def S2_extractup_rp : T_S3op_64 < "extractu", 0b00, 0b000, 0>;
-def S2_extractup : T_S2op_extract <"extractu", 0b0001, DoubleRegs, u6_0Imm>;
-
-// Rd=extractu(Rs,Rtt)
-// Rd=extractu(Rs,#u5,#U5)
-let hasNewValue = 1 in {
- def S2_extractu_rp : T_S3op_extract<"extractu", 0b00>;
- def S2_extractu : T_S2op_extract <"extractu", 0b1101, IntRegs, u5_0Imm>;
-}
-
-//===----------------------------------------------------------------------===//
-// :raw form of tableidx[bdhw] insns
-//===----------------------------------------------------------------------===//
-
-let hasSideEffects = 0, hasNewValue = 1, opNewValue = 0 in
-class tableidxRaw<string OpStr, bits<2>MinOp>
- : SInst <(outs IntRegs:$Rx),
- (ins IntRegs:$_dst_, IntRegs:$Rs, u4_0Imm:$u4, s6_0Imm:$S6),
- "$Rx = "#OpStr#"($Rs, #$u4, #$S6):raw",
- [], "$Rx = $_dst_" > {
- bits<5> Rx;
- bits<5> Rs;
- bits<4> u4;
- bits<6> S6;
-
- let IClass = 0b1000;
-
- let Inst{27-24} = 0b0111;
- let Inst{23-22} = MinOp;
- let Inst{21} = u4{3};
- let Inst{20-16} = Rs;
- let Inst{13-8} = S6;
- let Inst{7-5} = u4{2-0};
- let Inst{4-0} = Rx;
- }
-
-def S2_tableidxb : tableidxRaw<"tableidxb", 0b00>;
-def S2_tableidxh : tableidxRaw<"tableidxh", 0b01>;
-def S2_tableidxw : tableidxRaw<"tableidxw", 0b10>;
-def S2_tableidxd : tableidxRaw<"tableidxd", 0b11>;
-
-//===----------------------------------------------------------------------===//
-// Template class for 'table index' instructions which are assembler mapped
-// to their :raw format.
-//===----------------------------------------------------------------------===//
-let isPseudo = 1 in
-class tableidx_goodsyntax <string mnemonic>
- : SInst <(outs IntRegs:$Rx),
- (ins IntRegs:$_dst_, IntRegs:$Rs, u4_0Imm:$u4, u5_0Imm:$u5),
- "$Rx = "#mnemonic#"($Rs, #$u4, #$u5)",
- [], "$Rx = $_dst_" >;
-
-def S2_tableidxb_goodsyntax : tableidx_goodsyntax<"tableidxb">;
-def S2_tableidxh_goodsyntax : tableidx_goodsyntax<"tableidxh">;
-def S2_tableidxw_goodsyntax : tableidx_goodsyntax<"tableidxw">;
-def S2_tableidxd_goodsyntax : tableidx_goodsyntax<"tableidxd">;
-
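-// Note (assumption about the assembler, not encoded here): these isPseudo
-// "good syntax" forms carry no encoding of their own and are rewritten by
-// the assembler into the corresponding tableidx*:raw instructions above,
-// with the second immediate adjusted for the element size.
-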
-//===----------------------------------------------------------------------===//
-// V3 Instructions +
-//===----------------------------------------------------------------------===//
-
-include "HexagonInstrInfoV3.td"
-
-//===----------------------------------------------------------------------===//
-// V3 Instructions -
-//===----------------------------------------------------------------------===//
-
-//===----------------------------------------------------------------------===//
-// V4 Instructions +
-//===----------------------------------------------------------------------===//
-
-include "HexagonInstrInfoV4.td"
-
-//===----------------------------------------------------------------------===//
-// V4 Instructions -
-//===----------------------------------------------------------------------===//
-
-//===----------------------------------------------------------------------===//
-// V5 Instructions +
-//===----------------------------------------------------------------------===//
-
-include "HexagonInstrInfoV5.td"
-
-//===----------------------------------------------------------------------===//
-// V5 Instructions -
-//===----------------------------------------------------------------------===//
-
-//===----------------------------------------------------------------------===//
-// V60 Instructions +
-//===----------------------------------------------------------------------===//
-
-include "HexagonInstrInfoV60.td"
-
-//===----------------------------------------------------------------------===//
-// V60 Instructions -
-//===----------------------------------------------------------------------===//
-
-//===----------------------------------------------------------------------===//
-// ALU32/64/Vector +
-//===----------------------------------------------------------------------===//
-
-include "HexagonInstrInfoVector.td"
-
-include "HexagonInstrAlias.td"
-include "HexagonSystemInst.td"
-
diff --git a/lib/Target/Hexagon/HexagonInstrInfoV3.td b/lib/Target/Hexagon/HexagonInstrInfoV3.td
deleted file mode 100644
index 225f94405076..000000000000
--- a/lib/Target/Hexagon/HexagonInstrInfoV3.td
+++ /dev/null
@@ -1,215 +0,0 @@
-//=- HexagonInstrInfoV3.td - Target Desc. for Hexagon Target -*- tablegen -*-=//
-//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
-//
-//===----------------------------------------------------------------------===//
-//
-// This file describes the Hexagon V3 instructions in TableGen format.
-//
-//===----------------------------------------------------------------------===//
-
-//===----------------------------------------------------------------------===//
-// J +
-//===----------------------------------------------------------------------===//
-// Call subroutine.
-let isCall = 1, hasSideEffects = 1, isPredicable = 1,
- isExtended = 0, isExtendable = 1, opExtendable = 0,
- isExtentSigned = 1, opExtentBits = 24, opExtentAlign = 2 in
-class T_Call<bit CSR, string ExtStr>
- : JInst<(outs), (ins calltarget:$dst),
- "call " # ExtStr # "$dst", [], "", J_tc_2early_SLOT23> {
- let BaseOpcode = "call";
- bits<24> dst;
-
- let Defs = !if (CSR, VolatileV3.Regs, []);
- let IClass = 0b0101;
- let Inst{27-25} = 0b101;
- let Inst{24-16,13-1} = dst{23-2};
- let Inst{0} = 0b0;
-}
-
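-// Illustrative note (derived from the encoding above): the call target is
-// word-aligned, so dst{1-0} are implied zero and dst{23-2} is split across
-// Inst{24-16} and Inst{13-1}; with opExtentBits = 24 that gives roughly a
-// +/-8MB direct call range before a constant extender is needed.
-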
-let isCall = 1, hasSideEffects = 1, isPredicated = 1,
- isExtended = 0, isExtendable = 1, opExtendable = 1,
- isExtentSigned = 1, opExtentBits = 17, opExtentAlign = 2 in
-class T_CallPred<bit CSR, bit IfTrue, string ExtStr>
- : JInst<(outs), (ins PredRegs:$Pu, calltarget:$dst),
- CondStr<"$Pu", IfTrue, 0>.S # "call " # ExtStr # "$dst",
- [], "", J_tc_2early_SLOT23> {
- let BaseOpcode = "call";
- let isPredicatedFalse = !if(IfTrue,0,1);
- bits<2> Pu;
- bits<17> dst;
-
- let Defs = !if (CSR, VolatileV3.Regs, []);
- let IClass = 0b0101;
- let Inst{27-24} = 0b1101;
- let Inst{23-22,20-16,13,7-1} = dst{16-2};
- let Inst{21} = !if(IfTrue,0,1);
- let Inst{11} = 0b0;
- let Inst{9-8} = Pu;
-}
-
-multiclass T_Calls<bit CSR, string ExtStr> {
- def NAME : T_Call<CSR, ExtStr>;
- def t : T_CallPred<CSR, 1, ExtStr>;
- def f : T_CallPred<CSR, 0, ExtStr>;
-}
-
-defm J2_call: T_Calls<1, "">, PredRel;
-
-let isCodeGenOnly = 1, isCall = 1, hasSideEffects = 1,
- Defs = VolatileV3.Regs in
-def PS_call_nr : T_Call<1, "">, PredRel;
-
-let isCodeGenOnly = 1, isCall = 1, hasSideEffects = 1,
- Defs = [PC, R31, R6, R7, P0] in
-def PS_call_stk : T_Call<0, "">, PredRel;
-
-//===----------------------------------------------------------------------===//
-// J -
-//===----------------------------------------------------------------------===//
-
-
-//===----------------------------------------------------------------------===//
-// JR +
-//===----------------------------------------------------------------------===//
-// Call subroutine from register.
-
-let isCodeGenOnly = 1, Defs = VolatileV3.Regs in {
- def PS_callr_nr : JUMPR_MISC_CALLR<0, 1>; // Call, no return.
-}
-
-//===----------------------------------------------------------------------===//
-// JR -
-//===----------------------------------------------------------------------===//
-
-//===----------------------------------------------------------------------===//
-// ALU64/ALU +
-//===----------------------------------------------------------------------===//
-
-let Defs = [USR_OVF], Itinerary = ALU64_tc_2_SLOT23 in
-def A2_addpsat : T_ALU64_arith<"add", 0b011, 0b101, 1, 0, 1>;
-
-class T_ALU64_addsp_hl<string suffix, bits<3> MinOp>
- : T_ALU64_rr<"add", suffix, 0b0011, 0b011, MinOp, 0, 0, "">;
-
-def A2_addspl : T_ALU64_addsp_hl<":raw:lo", 0b110>;
-def A2_addsph : T_ALU64_addsp_hl<":raw:hi", 0b111>;
-
-let hasSideEffects = 0, isAsmParserOnly = 1 in
-def A2_addsp : ALU64_rr<(outs DoubleRegs:$Rd),
- (ins IntRegs:$Rs, DoubleRegs:$Rt), "$Rd = add($Rs, $Rt)", [],
- "", ALU64_tc_1_SLOT23>;
-
-
-let hasSideEffects = 0 in
-class T_XTYPE_MIN_MAX_P<bit isMax, bit isUnsigned>
- : ALU64Inst<(outs DoubleRegs:$Rd), (ins DoubleRegs:$Rt, DoubleRegs:$Rs),
- "$Rd = "#!if(isMax,"max","min")#!if(isUnsigned,"u","")
- #"($Rt, $Rs)", [], "", ALU64_tc_2_SLOT23> {
- bits<5> Rd;
- bits<5> Rs;
- bits<5> Rt;
-
- let IClass = 0b1101;
-
- let Inst{27-23} = 0b00111;
- let Inst{22-21} = !if(isMax, 0b10, 0b01);
- let Inst{20-16} = !if(isMax, Rt, Rs);
- let Inst{12-8} = !if(isMax, Rs, Rt);
- let Inst{7} = 0b1;
- let Inst{6} = !if(isMax, 0b0, 0b1);
- let Inst{5} = isUnsigned;
- let Inst{4-0} = Rd;
-}
-
-def A2_minp : T_XTYPE_MIN_MAX_P<0, 0>;
-def A2_minup : T_XTYPE_MIN_MAX_P<0, 1>;
-def A2_maxp : T_XTYPE_MIN_MAX_P<1, 0>;
-def A2_maxup : T_XTYPE_MIN_MAX_P<1, 1>;
-
-//===----------------------------------------------------------------------===//
-// ALU64/ALU -
-//===----------------------------------------------------------------------===//
-
-//===----------------------------------------------------------------------===//
-// :raw form of vrcmpys:hi/lo insns
-//===----------------------------------------------------------------------===//
-// Vector reduce complex multiply by scalar.
-let Defs = [USR_OVF], hasSideEffects = 0 in
-class T_vrcmpRaw<string HiLo, bits<3>MajOp>:
- MInst<(outs DoubleRegs:$Rdd),
- (ins DoubleRegs:$Rss, DoubleRegs:$Rtt),
- "$Rdd = vrcmpys($Rss, $Rtt):<<1:sat:raw:"#HiLo, []> {
- bits<5> Rdd;
- bits<5> Rss;
- bits<5> Rtt;
-
- let IClass = 0b1110;
-
- let Inst{27-24} = 0b1000;
- let Inst{23-21} = MajOp;
- let Inst{20-16} = Rss;
- let Inst{12-8} = Rtt;
- let Inst{7-5} = 0b100;
- let Inst{4-0} = Rdd;
-}
-
-def M2_vrcmpys_s1_h: T_vrcmpRaw<"hi", 0b101>;
-def M2_vrcmpys_s1_l: T_vrcmpRaw<"lo", 0b111>;
-
-// Assembler mapped to M2_vrcmpys_s1_h or M2_vrcmpys_s1_l
-let hasSideEffects = 0, isAsmParserOnly = 1 in
-def M2_vrcmpys_s1
- : MInst<(outs DoubleRegs:$Rdd), (ins DoubleRegs:$Rss, IntRegs:$Rt),
- "$Rdd=vrcmpys($Rss,$Rt):<<1:sat">;
-
-// Vector reduce complex multiply by scalar with accumulation.
-let Defs = [USR_OVF], hasSideEffects = 0 in
-class T_vrcmpys_acc<string HiLo, bits<3>MajOp>:
- MInst <(outs DoubleRegs:$Rxx),
- (ins DoubleRegs:$_src_, DoubleRegs:$Rss, DoubleRegs:$Rtt),
- "$Rxx += vrcmpys($Rss, $Rtt):<<1:sat:raw:"#HiLo, [],
- "$Rxx = $_src_"> {
- bits<5> Rxx;
- bits<5> Rss;
- bits<5> Rtt;
-
- let IClass = 0b1110;
-
- let Inst{27-24} = 0b1010;
- let Inst{23-21} = MajOp;
- let Inst{20-16} = Rss;
- let Inst{12-8} = Rtt;
- let Inst{7-5} = 0b100;
- let Inst{4-0} = Rxx;
- }
-
-def M2_vrcmpys_acc_s1_h: T_vrcmpys_acc<"hi", 0b101>;
-def M2_vrcmpys_acc_s1_l: T_vrcmpys_acc<"lo", 0b111>;
-
-// Assembler mapped to M2_vrcmpys_acc_s1_h or M2_vrcmpys_acc_s1_l
-
-let isAsmParserOnly = 1 in
-def M2_vrcmpys_acc_s1
- : MInst <(outs DoubleRegs:$dst),
- (ins DoubleRegs:$dst2, DoubleRegs:$src1, IntRegs:$src2),
- "$dst += vrcmpys($src1, $src2):<<1:sat", [],
- "$dst2 = $dst">;
-
-def M2_vrcmpys_s1rp_h : T_MType_vrcmpy <"vrcmpys", 0b101, 0b110, 1>;
-def M2_vrcmpys_s1rp_l : T_MType_vrcmpy <"vrcmpys", 0b101, 0b111, 0>;
-
-// Assembler mapped to M2_vrcmpys_s1rp_h or M2_vrcmpys_s1rp_l
-let isAsmParserOnly = 1 in
-def M2_vrcmpys_s1rp
- : MInst <(outs IntRegs:$Rd), (ins DoubleRegs:$Rss, IntRegs:$Rt),
- "$Rd=vrcmpys($Rss,$Rt):<<1:rnd:sat">;
-
-
-// S2_cabacdecbin: CABAC decode bin.
-let Defs = [P0], isPredicateLate = 1, Itinerary = S_3op_tc_1_SLOT23 in
-def S2_cabacdecbin : T_S3op_64 < "decbin", 0b11, 0b110, 0>;
diff --git a/lib/Target/Hexagon/HexagonInstrInfoV4.td b/lib/Target/Hexagon/HexagonInstrInfoV4.td
deleted file mode 100644
index 18943a082d28..000000000000
--- a/lib/Target/Hexagon/HexagonInstrInfoV4.td
+++ /dev/null
@@ -1,3301 +0,0 @@
-//=- HexagonInstrInfoV4.td - Target Desc. for Hexagon Target -*- tablegen -*-=//
-//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
-//
-//===----------------------------------------------------------------------===//
-//
-// This file describes the Hexagon V4 instructions in TableGen format.
-//
-//===----------------------------------------------------------------------===//
-
-def DuplexIClass0: InstDuplex < 0 >;
-def DuplexIClass1: InstDuplex < 1 >;
-def DuplexIClass2: InstDuplex < 2 >;
-let isExtendable = 1 in {
- def DuplexIClass3: InstDuplex < 3 >;
- def DuplexIClass4: InstDuplex < 4 >;
- def DuplexIClass5: InstDuplex < 5 >;
- def DuplexIClass6: InstDuplex < 6 >;
- def DuplexIClass7: InstDuplex < 7 >;
-}
-def DuplexIClass8: InstDuplex < 8 >;
-def DuplexIClass9: InstDuplex < 9 >;
-def DuplexIClassA: InstDuplex < 0xA >;
-def DuplexIClassB: InstDuplex < 0xB >;
-def DuplexIClassC: InstDuplex < 0xC >;
-def DuplexIClassD: InstDuplex < 0xD >;
-def DuplexIClassE: InstDuplex < 0xE >;
-def DuplexIClassF: InstDuplex < 0xF >;
-
-let hasSideEffects = 0 in
-class T_Immext<Operand ImmType>
- : EXTENDERInst<(outs), (ins ImmType:$imm),
- "immext(#$imm)", []> {
- bits<32> imm;
- let IClass = 0b0000;
-
- let Inst{27-16} = imm{31-20};
- let Inst{13-0} = imm{19-6};
- }
-
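-// Illustrative note (derived from the encoding above): the extender packs
-// imm{31-20} into Inst{27-16} and imm{19-6} into Inst{13-0}, i.e. it
-// supplies the upper 26 bits of a 32-bit immediate; the instruction it
-// extends provides the remaining low 6 bits.
-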
-def A4_ext : T_Immext<u26_6Imm>;
-let isCodeGenOnly = 1 in {
- let isBranch = 1 in
- def A4_ext_b : T_Immext<brtarget>;
- let isCall = 1 in
- def A4_ext_c : T_Immext<calltarget>;
- def A4_ext_g : T_Immext<globaladdress>;
-}
-
-// The Hexagon V4 architecture spec defines the following instruction
-// classes: LD, ST, ALU32, XTYPE, J, JR, MEMOP, NV, CR and SYSTEM
-// (SYSTEM is not implemented in the compiler).
-
-// LD Instructions:
-// ========================================
-// Loads (8/16/32/64 bit)
-// Deallocframe
-
-// ST Instructions:
-// ========================================
-// Stores (8/16/32/64 bit)
-// Allocframe
-
-// ALU32 Instructions:
-// ========================================
-// Arithmetic / Logical (32 bit)
-// Vector Halfword
-
-// XTYPE Instructions (32/64 bit):
-// ========================================
-// Arithmetic, Logical, Bit Manipulation
-// Multiply (Integer, Fractional, Complex)
-// Permute / Vector Permute Operations
-// Predicate Operations
-// Shift / Shift with Add/Sub/Logical
-// Vector Byte ALU
-// Vector Halfword (ALU, Shift, Multiply)
-// Vector Word (ALU, Shift)
-
-// J Instructions:
-// ========================================
-// Jump/Call PC-relative
-
-// JR Instructions:
-// ========================================
-// Jump/Call Register
-
-// MEMOP Instructions:
-// ========================================
-// Operation on memory (8/16/32 bit)
-
-// NV Instructions:
-// ========================================
-// New-value Jumps
-// New-value Stores
-
-// CR Instructions:
-// ========================================
-// Control-Register Transfers
-// Hardware Loop Setup
-// Predicate Logicals & Reductions
-
-// SYSTEM Instructions (not implemented in the compiler):
-// ========================================
-// Prefetch
-// Cache Maintenance
-// Bus Operations
-
-
-//===----------------------------------------------------------------------===//
-// ALU32 +
-//===----------------------------------------------------------------------===//
-
-class T_ALU32_3op_not<string mnemonic, bits<3> MajOp, bits<3> MinOp,
- bit OpsRev>
- : T_ALU32_3op<mnemonic, MajOp, MinOp, OpsRev, 0> {
- let AsmString = "$Rd = "#mnemonic#"($Rs, ~$Rt)";
-}
-
-let BaseOpcode = "andn_rr", CextOpcode = "andn" in
-def A4_andn : T_ALU32_3op_not<"and", 0b001, 0b100, 1>;
-let BaseOpcode = "orn_rr", CextOpcode = "orn" in
-def A4_orn : T_ALU32_3op_not<"or", 0b001, 0b101, 1>;
-
-let CextOpcode = "rcmp.eq" in
-def A4_rcmpeq : T_ALU32_3op<"cmp.eq", 0b011, 0b010, 0, 1>;
-let CextOpcode = "!rcmp.eq" in
-def A4_rcmpneq : T_ALU32_3op<"!cmp.eq", 0b011, 0b011, 0, 1>;
-
-def C4_cmpneq : T_ALU32_3op_cmp<"!cmp.eq", 0b00, 1, 1>;
-def C4_cmplte : T_ALU32_3op_cmp<"!cmp.gt", 0b10, 1, 0>;
-def C4_cmplteu : T_ALU32_3op_cmp<"!cmp.gtu", 0b11, 1, 0>;
-
-class T_CMP_rrbh<string mnemonic, bits<3> MinOp, bit IsComm>
- : SInst<(outs PredRegs:$Pd), (ins IntRegs:$Rs, IntRegs:$Rt),
- "$Pd = "#mnemonic#"($Rs, $Rt)", [], "", S_3op_tc_2early_SLOT23>,
- ImmRegRel {
- let InputType = "reg";
- let CextOpcode = mnemonic;
- let isCompare = 1;
- let isCommutable = IsComm;
- let hasSideEffects = 0;
-
- bits<2> Pd;
- bits<5> Rs;
- bits<5> Rt;
-
- let IClass = 0b1100;
- let Inst{27-21} = 0b0111110;
- let Inst{20-16} = Rs;
- let Inst{12-8} = Rt;
- let Inst{7-5} = MinOp;
- let Inst{1-0} = Pd;
-}
-
-def A4_cmpbeq : T_CMP_rrbh<"cmpb.eq", 0b110, 1>;
-def A4_cmpbgt : T_CMP_rrbh<"cmpb.gt", 0b010, 0>;
-def A4_cmpbgtu : T_CMP_rrbh<"cmpb.gtu", 0b111, 0>;
-def A4_cmpheq : T_CMP_rrbh<"cmph.eq", 0b011, 1>;
-def A4_cmphgt : T_CMP_rrbh<"cmph.gt", 0b100, 0>;
-def A4_cmphgtu : T_CMP_rrbh<"cmph.gtu", 0b101, 0>;
-
-class T_CMP_ribh<string mnemonic, bits<2> MajOp, bit IsHalf, bit IsComm,
- Operand ImmType, bit IsImmExt, bit IsImmSigned, int ImmBits>
- : ALU64Inst<(outs PredRegs:$Pd), (ins IntRegs:$Rs, ImmType:$Imm),
- "$Pd = "#mnemonic#"($Rs, #$Imm)", [], "", ALU64_tc_2early_SLOT23>,
- ImmRegRel {
- let InputType = "imm";
- let CextOpcode = mnemonic;
- let isCompare = 1;
- let isCommutable = IsComm;
- let hasSideEffects = 0;
- let isExtendable = IsImmExt;
- let opExtendable = !if (IsImmExt, 2, 0);
- let isExtentSigned = IsImmSigned;
- let opExtentBits = ImmBits;
-
- bits<2> Pd;
- bits<5> Rs;
- bits<8> Imm;
-
- let IClass = 0b1101;
- let Inst{27-24} = 0b1101;
- let Inst{22-21} = MajOp;
- let Inst{20-16} = Rs;
- let Inst{12-5} = Imm;
- let Inst{4} = 0b0;
- let Inst{3} = IsHalf;
- let Inst{1-0} = Pd;
-}
-
-def A4_cmpbeqi : T_CMP_ribh<"cmpb.eq", 0b00, 0, 1, u8_0Imm, 0, 0, 8>;
-def A4_cmpbgti : T_CMP_ribh<"cmpb.gt", 0b01, 0, 0, s8_0Imm, 0, 1, 8>;
-def A4_cmpbgtui : T_CMP_ribh<"cmpb.gtu", 0b10, 0, 0, u7_0Ext, 1, 0, 7>;
-def A4_cmpheqi : T_CMP_ribh<"cmph.eq", 0b00, 1, 1, s8_0Ext, 1, 1, 8>;
-def A4_cmphgti : T_CMP_ribh<"cmph.gt", 0b01, 1, 0, s8_0Ext, 1, 1, 8>;
-def A4_cmphgtui : T_CMP_ribh<"cmph.gtu", 0b10, 1, 0, u7_0Ext, 1, 0, 7>;
-
-class T_RCMP_EQ_ri<string mnemonic, bit IsNeg>
- : ALU32_ri<(outs IntRegs:$Rd), (ins IntRegs:$Rs, s8_0Ext:$s8),
- "$Rd = "#mnemonic#"($Rs, #$s8)", [], "", ALU32_2op_tc_1_SLOT0123>,
- ImmRegRel {
- let InputType = "imm";
- let CextOpcode = !if (IsNeg, "!rcmp.eq", "rcmp.eq");
- let isExtendable = 1;
- let opExtendable = 2;
- let isExtentSigned = 1;
- let opExtentBits = 8;
- let hasNewValue = 1;
-
- bits<5> Rd;
- bits<5> Rs;
- bits<8> s8;
-
- let IClass = 0b0111;
- let Inst{27-24} = 0b0011;
- let Inst{22} = 0b1;
- let Inst{21} = IsNeg;
- let Inst{20-16} = Rs;
- let Inst{13} = 0b1;
- let Inst{12-5} = s8;
- let Inst{4-0} = Rd;
-}
-
-def A4_rcmpeqi : T_RCMP_EQ_ri<"cmp.eq", 0>;
-def A4_rcmpneqi : T_RCMP_EQ_ri<"!cmp.eq", 1>;
-
-//===----------------------------------------------------------------------===//
-// ALU32 -
-//===----------------------------------------------------------------------===//
-
-
-//===----------------------------------------------------------------------===//
-// ALU32/PERM +
-//===----------------------------------------------------------------------===//
-
-// Combine a word and an immediate into a register pair.
-let hasSideEffects = 0, isExtentSigned = 1, isExtendable = 1,
- opExtentBits = 8 in
-class T_Combine1 <bits<2> MajOp, dag ins, string AsmStr>
- : ALU32Inst <(outs DoubleRegs:$Rdd), ins, AsmStr> {
- bits<5> Rdd;
- bits<5> Rs;
- bits<8> s8;
-
- let IClass = 0b0111;
- let Inst{27-24} = 0b0011;
- let Inst{22-21} = MajOp;
- let Inst{20-16} = Rs;
- let Inst{13} = 0b1;
- let Inst{12-5} = s8;
- let Inst{4-0} = Rdd;
- }
-
-let opExtendable = 2 in
-def A4_combineri : T_Combine1<0b00, (ins IntRegs:$Rs, s8_0Ext:$s8),
- "$Rdd = combine($Rs, #$s8)">;
-
-let opExtendable = 1 in
-def A4_combineir : T_Combine1<0b01, (ins s8_0Ext:$s8, IntRegs:$Rs),
- "$Rdd = combine(#$s8, $Rs)">;
-
-// A4_combineii: Set two small immediates.
-let hasSideEffects = 0, isExtendable = 1, opExtentBits = 6, opExtendable = 2 in
-def A4_combineii: ALU32Inst<(outs DoubleRegs:$Rdd), (ins s8_0Imm:$s8, u6_0Ext:$U6),
- "$Rdd = combine(#$s8, #$U6)"> {
- bits<5> Rdd;
- bits<8> s8;
- bits<6> U6;
-
- let IClass = 0b0111;
- let Inst{27-23} = 0b11001;
- let Inst{20-16} = U6{5-1};
- let Inst{13} = U6{0};
- let Inst{12-5} = s8;
- let Inst{4-0} = Rdd;
- }
-
-//===----------------------------------------------------------------------===//
-// ALU32/PERM -
-//===----------------------------------------------------------------------===//
-
-//===----------------------------------------------------------------------===//
-// LD +
-//===----------------------------------------------------------------------===//
-
-//===----------------------------------------------------------------------===//
-// Template class for load instructions with Absolute set addressing mode.
-//===----------------------------------------------------------------------===//
-let isExtended = 1, opExtendable = 2, opExtentBits = 6, addrMode = AbsoluteSet,
- hasSideEffects = 0 in
-class T_LD_abs_set<string mnemonic, RegisterClass RC, bits<4>MajOp>:
- LDInst<(outs RC:$dst1, IntRegs:$dst2),
- (ins u6_0Ext:$addr),
- "$dst1 = "#mnemonic#"($dst2 = #$addr)",
- []> {
- bits<5> dst1;
- bits<5> dst2;
- bits<6> addr;
-
- let IClass = 0b1001;
- let Inst{27-25} = 0b101;
- let Inst{24-21} = MajOp;
- let Inst{13-12} = 0b01;
- let Inst{4-0} = dst1;
- let Inst{20-16} = dst2;
- let Inst{11-8} = addr{5-2};
- let Inst{6-5} = addr{1-0};
-}
-
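-// Illustrative note (derived from the class above): "$dst1 = memX($dst2 =
-// #$addr)" both loads from the absolute address and writes that address
-// back into $dst2; the 6-bit #$addr field splits as addr{5-2} ->
-// Inst{11-8} and addr{1-0} -> Inst{6-5}, and is normally constant-extended
-// (isExtended = 1).
-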
-let accessSize = ByteAccess, hasNewValue = 1 in {
- def L4_loadrb_ap : T_LD_abs_set <"memb", IntRegs, 0b1000>;
- def L4_loadrub_ap : T_LD_abs_set <"memub", IntRegs, 0b1001>;
-}
-
-let accessSize = HalfWordAccess, hasNewValue = 1 in {
- def L4_loadrh_ap : T_LD_abs_set <"memh", IntRegs, 0b1010>;
- def L4_loadruh_ap : T_LD_abs_set <"memuh", IntRegs, 0b1011>;
- def L4_loadbsw2_ap : T_LD_abs_set <"membh", IntRegs, 0b0001>;
- def L4_loadbzw2_ap : T_LD_abs_set <"memubh", IntRegs, 0b0011>;
-}
-
-let accessSize = WordAccess, hasNewValue = 1 in
- def L4_loadri_ap : T_LD_abs_set <"memw", IntRegs, 0b1100>;
-
-let accessSize = WordAccess in {
- def L4_loadbzw4_ap : T_LD_abs_set <"memubh", DoubleRegs, 0b0101>;
- def L4_loadbsw4_ap : T_LD_abs_set <"membh", DoubleRegs, 0b0111>;
-}
-
-let accessSize = DoubleWordAccess in
-def L4_loadrd_ap : T_LD_abs_set <"memd", DoubleRegs, 0b1110>;
-
-let accessSize = ByteAccess in
- def L4_loadalignb_ap : T_LD_abs_set <"memb_fifo", DoubleRegs, 0b0100>;
-
-let accessSize = HalfWordAccess in
-def L4_loadalignh_ap : T_LD_abs_set <"memh_fifo", DoubleRegs, 0b0010>;
-
-// Load - Indirect with long offset
-let InputType = "imm", addrMode = BaseLongOffset, isExtended = 1,
-    opExtentBits = 6, opExtendable = 3 in
-class T_LoadAbsReg <string mnemonic, string CextOp, RegisterClass RC,
- bits<4> MajOp>
- : LDInst <(outs RC:$dst), (ins IntRegs:$src1, u2_0Imm:$src2, u6_0Ext:$src3),
- "$dst = "#mnemonic#"($src1<<#$src2 + #$src3)",
- [] >, ImmRegShl {
- bits<5> dst;
- bits<5> src1;
- bits<2> src2;
- bits<6> src3;
- let CextOpcode = CextOp;
- let hasNewValue = !if (!eq(!cast<string>(RC), "DoubleRegs"), 0, 1);
-
- let IClass = 0b1001;
- let Inst{27-25} = 0b110;
- let Inst{24-21} = MajOp;
- let Inst{20-16} = src1;
- let Inst{13} = src2{1};
- let Inst{12} = 0b1;
- let Inst{11-8} = src3{5-2};
- let Inst{7} = src2{0};
- let Inst{6-5} = src3{1-0};
- let Inst{4-0} = dst;
- }
-
-let accessSize = ByteAccess in {
- def L4_loadrb_ur : T_LoadAbsReg<"memb", "LDrib", IntRegs, 0b1000>;
- def L4_loadrub_ur : T_LoadAbsReg<"memub", "LDriub", IntRegs, 0b1001>;
- def L4_loadalignb_ur : T_LoadAbsReg<"memb_fifo", "LDrib_fifo",
- DoubleRegs, 0b0100>;
-}
-
-let accessSize = HalfWordAccess in {
- def L4_loadrh_ur : T_LoadAbsReg<"memh", "LDrih", IntRegs, 0b1010>;
- def L4_loadruh_ur : T_LoadAbsReg<"memuh", "LDriuh", IntRegs, 0b1011>;
- def L4_loadbsw2_ur : T_LoadAbsReg<"membh", "LDribh2", IntRegs, 0b0001>;
- def L4_loadbzw2_ur : T_LoadAbsReg<"memubh", "LDriubh2", IntRegs, 0b0011>;
- def L4_loadalignh_ur : T_LoadAbsReg<"memh_fifo", "LDrih_fifo",
- DoubleRegs, 0b0010>;
-}
-
-let accessSize = WordAccess in {
- def L4_loadri_ur : T_LoadAbsReg<"memw", "LDriw", IntRegs, 0b1100>;
- def L4_loadbsw4_ur : T_LoadAbsReg<"membh", "LDribh4", DoubleRegs, 0b0111>;
- def L4_loadbzw4_ur : T_LoadAbsReg<"memubh", "LDriubh4", DoubleRegs, 0b0101>;
-}
-
-let accessSize = DoubleWordAccess in
-def L4_loadrd_ur : T_LoadAbsReg<"memd", "LDrid", DoubleRegs, 0b1110>;
-
-
-//===----------------------------------------------------------------------===//
-// Template classes for the non-predicated load instructions with
-// base + register offset addressing mode
-//===----------------------------------------------------------------------===//
-class T_load_rr <string mnemonic, RegisterClass RC, bits<3> MajOp>:
- LDInst<(outs RC:$dst), (ins IntRegs:$src1, IntRegs:$src2, u2_0Imm:$u2),
- "$dst = "#mnemonic#"($src1 + $src2<<#$u2)",
- [], "", V4LDST_tc_ld_SLOT01>, ImmRegShl, AddrModeRel {
- bits<5> dst;
- bits<5> src1;
- bits<5> src2;
- bits<2> u2;
-
- let IClass = 0b0011;
-
- let Inst{27-24} = 0b1010;
- let Inst{23-21} = MajOp;
- let Inst{20-16} = src1;
- let Inst{12-8} = src2;
- let Inst{13} = u2{1};
- let Inst{7} = u2{0};
- let Inst{4-0} = dst;
- }
-
-//===----------------------------------------------------------------------===//
-// Template classes for the predicated load instructions with
-// base + register offset addressing mode
-//===----------------------------------------------------------------------===//
-let isPredicated = 1 in
-class T_pload_rr <string mnemonic, RegisterClass RC, bits<3> MajOp,
- bit isNot, bit isPredNew>:
- LDInst <(outs RC:$dst),
- (ins PredRegs:$src1, IntRegs:$src2, IntRegs:$src3, u2_0Imm:$u2),
- !if(isNot, "if (!$src1", "if ($src1")#!if(isPredNew, ".new) ",
- ") ")#"$dst = "#mnemonic#"($src2+$src3<<#$u2)",
- [], "", V4LDST_tc_ld_SLOT01>, AddrModeRel {
- bits<5> dst;
- bits<2> src1;
- bits<5> src2;
- bits<5> src3;
- bits<2> u2;
-
- let isPredicatedFalse = isNot;
- let isPredicatedNew = isPredNew;
-
- let IClass = 0b0011;
-
- let Inst{27-26} = 0b00;
- let Inst{25} = isPredNew;
- let Inst{24} = isNot;
- let Inst{23-21} = MajOp;
- let Inst{20-16} = src2;
- let Inst{12-8} = src3;
- let Inst{13} = u2{1};
- let Inst{7} = u2{0};
- let Inst{6-5} = src1;
- let Inst{4-0} = dst;
- }
-
-//===----------------------------------------------------------------------===//
-// multiclass for load instructions with base + register offset
-// addressing mode
-//===----------------------------------------------------------------------===//
-let hasSideEffects = 0, addrMode = BaseRegOffset in
-multiclass ld_idxd_shl <string mnemonic, string CextOp, RegisterClass RC,
- bits<3> MajOp > {
- let CextOpcode = CextOp, BaseOpcode = CextOp#_indexed_shl,
- InputType = "reg" in {
- let isPredicable = 1 in
- def L4_#NAME#_rr : T_load_rr <mnemonic, RC, MajOp>;
-
- // Predicated
- def L4_p#NAME#t_rr : T_pload_rr <mnemonic, RC, MajOp, 0, 0>;
- def L4_p#NAME#f_rr : T_pload_rr <mnemonic, RC, MajOp, 1, 0>;
-
- // Predicated new
- def L4_p#NAME#tnew_rr : T_pload_rr <mnemonic, RC, MajOp, 0, 1>;
- def L4_p#NAME#fnew_rr : T_pload_rr <mnemonic, RC, MajOp, 1, 1>;
- }
-}
-
-let hasNewValue = 1, accessSize = ByteAccess in {
- defm loadrb : ld_idxd_shl<"memb", "LDrib", IntRegs, 0b000>;
- defm loadrub : ld_idxd_shl<"memub", "LDriub", IntRegs, 0b001>;
-}
-
-let hasNewValue = 1, accessSize = HalfWordAccess in {
- defm loadrh : ld_idxd_shl<"memh", "LDrih", IntRegs, 0b010>;
- defm loadruh : ld_idxd_shl<"memuh", "LDriuh", IntRegs, 0b011>;
-}
-
-let hasNewValue = 1, accessSize = WordAccess in
-defm loadri : ld_idxd_shl<"memw", "LDriw", IntRegs, 0b100>;
-
-let accessSize = DoubleWordAccess in
-defm loadrd : ld_idxd_shl<"memd", "LDrid", DoubleRegs, 0b110>;
-
-//===----------------------------------------------------------------------===//
-// LD -
-//===----------------------------------------------------------------------===//
-
-//===----------------------------------------------------------------------===//
-// ST +
-//===----------------------------------------------------------------------===//
-//===----------------------------------------------------------------------===//
-// Template class for store instructions with Absolute set addressing mode.
-//===----------------------------------------------------------------------===//
-let isExtended = 1, opExtendable = 1, opExtentBits = 6,
- addrMode = AbsoluteSet in
-class T_ST_absset <string mnemonic, string BaseOp, RegisterClass RC,
- bits<3> MajOp, MemAccessSize AccessSz, bit isHalf = 0>
- : STInst<(outs IntRegs:$dst),
- (ins u6_0Ext:$addr, RC:$src),
- mnemonic#"($dst = #$addr) = $src"#!if(isHalf, ".h","")>, NewValueRel {
- bits<5> dst;
- bits<6> addr;
- bits<5> src;
- let accessSize = AccessSz;
- let BaseOpcode = BaseOp#"_AbsSet";
-
- // Store upper-half and store doubleword cannot be NV.
- let isNVStorable = !if (!eq(mnemonic, "memd"), 0, !if(isHalf,0,1));
-
- let IClass = 0b1010;
-
- let Inst{27-24} = 0b1011;
- let Inst{23-21} = MajOp;
- let Inst{20-16} = dst;
- let Inst{13} = 0b0;
- let Inst{12-8} = src;
- let Inst{7} = 0b1;
- let Inst{5-0} = addr;
- }
-
-def S4_storerb_ap : T_ST_absset <"memb", "STrib", IntRegs, 0b000, ByteAccess>;
-def S4_storerh_ap : T_ST_absset <"memh", "STrih", IntRegs, 0b010,
- HalfWordAccess>;
-def S4_storeri_ap : T_ST_absset <"memw", "STriw", IntRegs, 0b100, WordAccess>;
-
-let isNVStorable = 0 in {
- def S4_storerf_ap : T_ST_absset <"memh", "STrif", IntRegs,
- 0b011, HalfWordAccess, 1>;
- def S4_storerd_ap : T_ST_absset <"memd", "STrid", DoubleRegs,
- 0b110, DoubleWordAccess>;
-}
-
-let opExtendable = 1, isNewValue = 1, isNVStore = 1, opNewValue = 2,
-    isExtended = 1, opExtentBits = 6 in
-class T_ST_absset_nv <string mnemonic, string BaseOp, bits<2> MajOp,
- MemAccessSize AccessSz >
- : NVInst <(outs IntRegs:$dst),
- (ins u6_0Ext:$addr, IntRegs:$src),
- mnemonic#"($dst = #$addr) = $src.new">, NewValueRel {
- bits<5> dst;
- bits<6> addr;
- bits<3> src;
- let accessSize = AccessSz;
- let BaseOpcode = BaseOp#"_AbsSet";
-
- let IClass = 0b1010;
-
- let Inst{27-21} = 0b1011101;
- let Inst{20-16} = dst;
-    let Inst{13} = 0b0;
-    let Inst{12-11} = MajOp;
- let Inst{10-8} = src;
- let Inst{7} = 0b1;
- let Inst{5-0} = addr;
- }
-
-let mayStore = 1, addrMode = AbsoluteSet in {
- def S4_storerbnew_ap : T_ST_absset_nv <"memb", "STrib", 0b00, ByteAccess>;
- def S4_storerhnew_ap : T_ST_absset_nv <"memh", "STrih", 0b01, HalfWordAccess>;
- def S4_storerinew_ap : T_ST_absset_nv <"memw", "STriw", 0b10, WordAccess>;
-}
-
-let isExtended = 1, opExtendable = 2, opExtentBits = 6, InputType = "imm",
- addrMode = BaseLongOffset, AddedComplexity = 40 in
-class T_StoreAbsReg <string mnemonic, string CextOp, RegisterClass RC,
- bits<3> MajOp, MemAccessSize AccessSz, bit isHalf = 0>
- : STInst<(outs),
- (ins IntRegs:$src1, u2_0Imm:$src2, u6_0Ext:$src3, RC:$src4),
- mnemonic#"($src1<<#$src2 + #$src3) = $src4"#!if(isHalf, ".h",""),
- []>, ImmRegShl, NewValueRel {
-
- bits<5> src1;
- bits<2> src2;
- bits<6> src3;
- bits<5> src4;
-
- let accessSize = AccessSz;
- let CextOpcode = CextOp;
- let BaseOpcode = CextOp#"_shl";
-
- // Store upper-half and store doubleword cannot be NV.
- let isNVStorable = !if (!eq(mnemonic, "memd"), 0, !if(isHalf,0,1));
-
- let IClass = 0b1010;
-
- let Inst{27-24} =0b1101;
- let Inst{23-21} = MajOp;
- let Inst{20-16} = src1;
- let Inst{13} = src2{1};
- let Inst{12-8} = src4;
- let Inst{7} = 0b1;
- let Inst{6} = src2{0};
- let Inst{5-0} = src3;
-}
-
-def S4_storerb_ur : T_StoreAbsReg <"memb", "STrib", IntRegs, 0b000, ByteAccess>;
-def S4_storerh_ur : T_StoreAbsReg <"memh", "STrih", IntRegs, 0b010,
- HalfWordAccess>;
-def S4_storerf_ur : T_StoreAbsReg <"memh", "STrif", IntRegs, 0b011,
- HalfWordAccess, 1>;
-def S4_storeri_ur : T_StoreAbsReg <"memw", "STriw", IntRegs, 0b100, WordAccess>;
-def S4_storerd_ur : T_StoreAbsReg <"memd", "STrid", DoubleRegs, 0b110,
- DoubleWordAccess>;
-
-let mayStore = 1, isNVStore = 1, isExtended = 1, addrMode = BaseLongOffset,
- opExtentBits = 6, isNewValue = 1, opNewValue = 3, opExtendable = 2 in
-class T_StoreAbsRegNV <string mnemonic, string CextOp, bits<2> MajOp,
- MemAccessSize AccessSz>
- : NVInst <(outs ),
- (ins IntRegs:$src1, u2_0Imm:$src2, u6_0Ext:$src3, IntRegs:$src4),
- mnemonic#"($src1<<#$src2 + #$src3) = $src4.new">, NewValueRel {
- bits<5> src1;
- bits<2> src2;
- bits<6> src3;
- bits<3> src4;
-
- let CextOpcode = CextOp;
- let BaseOpcode = CextOp#"_shl";
- let IClass = 0b1010;
-
- let Inst{27-21} = 0b1101101;
- let Inst{7} = 0b1;
- let Inst{20-16} = src1;
- let Inst{13} = src2{1};
- let Inst{12-11} = MajOp;
- let Inst{10-8} = src4;
- let Inst{6} = src2{0};
- let Inst{5-0} = src3;
- }
-
-def S4_storerbnew_ur : T_StoreAbsRegNV <"memb", "STrib", 0b00, ByteAccess>;
-def S4_storerhnew_ur : T_StoreAbsRegNV <"memh", "STrih", 0b01, HalfWordAccess>;
-def S4_storerinew_ur : T_StoreAbsRegNV <"memw", "STriw", 0b10, WordAccess>;
-
-//===----------------------------------------------------------------------===//
-// Template classes for the non-predicated store instructions with
-// base + register offset addressing mode
-//===----------------------------------------------------------------------===//
-let isPredicable = 1 in
-class T_store_rr <string mnemonic, RegisterClass RC, bits<3> MajOp, bit isH>
- : STInst < (outs ), (ins IntRegs:$Rs, IntRegs:$Ru, u2_0Imm:$u2, RC:$Rt),
- mnemonic#"($Rs + $Ru<<#$u2) = $Rt"#!if(isH, ".h",""),
- [],"",V4LDST_tc_st_SLOT01>, ImmRegShl, AddrModeRel {
-
- bits<5> Rs;
- bits<5> Ru;
- bits<2> u2;
- bits<5> Rt;
-
- // Store upper-half and store doubleword cannot be NV.
- let isNVStorable = !if (!eq(mnemonic, "memd"), 0, !if(isH,0,1));
-
- let IClass = 0b0011;
-
- let Inst{27-24} = 0b1011;
- let Inst{23-21} = MajOp;
- let Inst{20-16} = Rs;
- let Inst{12-8} = Ru;
- let Inst{13} = u2{1};
- let Inst{7} = u2{0};
- let Inst{4-0} = Rt;
- }
-
-//===----------------------------------------------------------------------===//
-// Template classes for the predicated store instructions with
-// base + register offset addressing mode
-//===----------------------------------------------------------------------===//
-let isPredicated = 1 in
-class T_pstore_rr <string mnemonic, RegisterClass RC, bits<3> MajOp,
- bit isNot, bit isPredNew, bit isH>
- : STInst <(outs),
- (ins PredRegs:$Pv, IntRegs:$Rs, IntRegs:$Ru, u2_0Imm:$u2, RC:$Rt),
-
- !if(isNot, "if (!$Pv", "if ($Pv")#!if(isPredNew, ".new) ",
- ") ")#mnemonic#"($Rs+$Ru<<#$u2) = $Rt"#!if(isH, ".h",""),
- [], "", V4LDST_tc_st_SLOT01> , AddrModeRel{
- bits<2> Pv;
- bits<5> Rs;
- bits<5> Ru;
- bits<2> u2;
- bits<5> Rt;
-
- let isPredicatedFalse = isNot;
- let isPredicatedNew = isPredNew;
- // Store upper-half and store doubleword cannot be NV.
- let isNVStorable = !if (!eq(mnemonic, "memd"), 0, !if(isH,0,1));
-
- let IClass = 0b0011;
-
- let Inst{27-26} = 0b01;
- let Inst{25} = isPredNew;
- let Inst{24} = isNot;
- let Inst{23-21} = MajOp;
- let Inst{20-16} = Rs;
- let Inst{12-8} = Ru;
- let Inst{13} = u2{1};
- let Inst{7} = u2{0};
- let Inst{6-5} = Pv;
- let Inst{4-0} = Rt;
- }
-
-//===----------------------------------------------------------------------===//
-// Template classes for the new-value store instructions with
-// base + register offset addressing mode
-//===----------------------------------------------------------------------===//
-let isPredicable = 1, isNewValue = 1, opNewValue = 3 in
-class T_store_new_rr <string mnemonic, bits<2> MajOp> :
- NVInst < (outs ), (ins IntRegs:$Rs, IntRegs:$Ru, u2_0Imm:$u2, IntRegs:$Nt),
- mnemonic#"($Rs + $Ru<<#$u2) = $Nt.new",
- [],"",V4LDST_tc_st_SLOT0>, ImmRegShl, AddrModeRel {
-
- bits<5> Rs;
- bits<5> Ru;
- bits<2> u2;
- bits<3> Nt;
-
- let IClass = 0b0011;
-
- let Inst{27-21} = 0b1011101;
- let Inst{20-16} = Rs;
- let Inst{12-8} = Ru;
- let Inst{13} = u2{1};
- let Inst{7} = u2{0};
- let Inst{4-3} = MajOp;
- let Inst{2-0} = Nt;
- }
-
-//===----------------------------------------------------------------------===//
-// Template classes for the predicated new-value store instructions with
-// base + register offset addressing mode
-//===----------------------------------------------------------------------===//
-let isPredicated = 1, isNewValue = 1, opNewValue = 4 in
-class T_pstore_new_rr <string mnemonic, bits<2> MajOp, bit isNot, bit isPredNew>
- : NVInst<(outs),
- (ins PredRegs:$Pv, IntRegs:$Rs, IntRegs:$Ru, u2_0Imm:$u2, IntRegs:$Nt),
- !if(isNot, "if (!$Pv", "if ($Pv")#!if(isPredNew, ".new) ",
- ") ")#mnemonic#"($Rs+$Ru<<#$u2) = $Nt.new",
- [], "", V4LDST_tc_st_SLOT0>, AddrModeRel {
- bits<2> Pv;
- bits<5> Rs;
- bits<5> Ru;
- bits<2> u2;
- bits<3> Nt;
-
- let isPredicatedFalse = isNot;
- let isPredicatedNew = isPredNew;
-
- let IClass = 0b0011;
- let Inst{27-26} = 0b01;
- let Inst{25} = isPredNew;
- let Inst{24} = isNot;
- let Inst{23-21} = 0b101;
- let Inst{20-16} = Rs;
- let Inst{12-8} = Ru;
- let Inst{13} = u2{1};
- let Inst{7} = u2{0};
- let Inst{6-5} = Pv;
- let Inst{4-3} = MajOp;
- let Inst{2-0} = Nt;
- }
-
-//===----------------------------------------------------------------------===//
-// multiclass for store instructions with base + register offset addressing
-// mode
-//===----------------------------------------------------------------------===//
-let isNVStorable = 1 in
-multiclass ST_Idxd_shl<string mnemonic, string CextOp, RegisterClass RC,
- bits<3> MajOp, bit isH = 0> {
- let CextOpcode = CextOp, BaseOpcode = CextOp#_indexed_shl in {
- def S4_#NAME#_rr : T_store_rr <mnemonic, RC, MajOp, isH>;
-
- // Predicated
- def S4_p#NAME#t_rr : T_pstore_rr <mnemonic, RC, MajOp, 0, 0, isH>;
- def S4_p#NAME#f_rr : T_pstore_rr <mnemonic, RC, MajOp, 1, 0, isH>;
-
- // Predicated new
- def S4_p#NAME#tnew_rr : T_pstore_rr <mnemonic, RC, MajOp, 0, 1, isH>;
- def S4_p#NAME#fnew_rr : T_pstore_rr <mnemonic, RC, MajOp, 1, 1, isH>;
- }
-}
-
-//===----------------------------------------------------------------------===//
-// multiclass for new-value store instructions with base + register offset
-// addressing mode.
-//===----------------------------------------------------------------------===//
-let mayStore = 1, isNVStore = 1 in
-multiclass ST_Idxd_shl_nv <string mnemonic, string CextOp, RegisterClass RC,
- bits<2> MajOp> {
- let CextOpcode = CextOp, BaseOpcode = CextOp#_indexed_shl in {
- def S4_#NAME#new_rr : T_store_new_rr<mnemonic, MajOp>;
-
- // Predicated
- def S4_p#NAME#newt_rr : T_pstore_new_rr <mnemonic, MajOp, 0, 0>;
- def S4_p#NAME#newf_rr : T_pstore_new_rr <mnemonic, MajOp, 1, 0>;
-
- // Predicated new
- def S4_p#NAME#newtnew_rr : T_pstore_new_rr <mnemonic, MajOp, 0, 1>;
- def S4_p#NAME#newfnew_rr : T_pstore_new_rr <mnemonic, MajOp, 1, 1>;
- }
-}
-
-let addrMode = BaseRegOffset, InputType = "reg", hasSideEffects = 0 in {
- let accessSize = ByteAccess in
- defm storerb: ST_Idxd_shl<"memb", "STrib", IntRegs, 0b000>,
- ST_Idxd_shl_nv<"memb", "STrib", IntRegs, 0b00>;
-
- let accessSize = HalfWordAccess in
- defm storerh: ST_Idxd_shl<"memh", "STrih", IntRegs, 0b010>,
- ST_Idxd_shl_nv<"memh", "STrih", IntRegs, 0b01>;
-
- let accessSize = WordAccess in
- defm storeri: ST_Idxd_shl<"memw", "STriw", IntRegs, 0b100>,
- ST_Idxd_shl_nv<"memw", "STriw", IntRegs, 0b10>;
-
- let isNVStorable = 0, accessSize = DoubleWordAccess in
- defm storerd: ST_Idxd_shl<"memd", "STrid", DoubleRegs, 0b110>;
-
- let isNVStorable = 0, accessSize = HalfWordAccess in
- defm storerf: ST_Idxd_shl<"memh", "STrif", IntRegs, 0b011, 1>;
-}
-
-//===----------------------------------------------------------------------===//
-// Template class
-//===----------------------------------------------------------------------===//
-let isPredicable = 1, isExtendable = 1, isExtentSigned = 1, opExtentBits = 8,
- opExtendable = 2 in
-class T_StoreImm <string mnemonic, Operand OffsetOp, bits<2> MajOp >
- : STInst <(outs ), (ins IntRegs:$Rs, OffsetOp:$offset, s8_0Ext:$S8),
- mnemonic#"($Rs+#$offset)=#$S8",
- [], "", V4LDST_tc_st_SLOT01>,
- ImmRegRel, PredNewRel {
- bits<5> Rs;
- bits<8> S8;
- bits<8> offset;
- bits<6> offsetBits;
-
- string OffsetOpStr = !cast<string>(OffsetOp);
- let offsetBits = !if (!eq(OffsetOpStr, "u6_2Imm"), offset{7-2},
- !if (!eq(OffsetOpStr, "u6_1Imm"), offset{6-1},
- /* u6_0Imm */ offset{5-0}));
-
- let IClass = 0b0011;
-
- let Inst{27-25} = 0b110;
- let Inst{22-21} = MajOp;
- let Inst{20-16} = Rs;
- let Inst{12-7} = offsetBits;
- let Inst{13} = S8{7};
- let Inst{6-0} = S8{6-0};
- }
-
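-// Worked example (derived from offsetBits above): the encoded offset is the
-// byte offset scaled by the access size, e.g. memh (u6_1Imm) encodes #10 as
-// offset{6-1} = 5 and memw (u6_2Imm) encodes #20 as offset{7-2} = 5, while
-// memb (u6_0Imm) stores the byte offset directly.
-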
-let isPredicated = 1, isExtendable = 1, isExtentSigned = 1, opExtentBits = 6,
- opExtendable = 3 in
-class T_StoreImm_pred <string mnemonic, Operand OffsetOp, bits<2> MajOp,
- bit isPredNot, bit isPredNew >
- : STInst <(outs ),
- (ins PredRegs:$Pv, IntRegs:$Rs, OffsetOp:$offset, s6_0Ext:$S6),
- !if(isPredNot, "if (!$Pv", "if ($Pv")#!if(isPredNew, ".new) ",
- ") ")#mnemonic#"($Rs+#$offset)=#$S6",
- [], "", V4LDST_tc_st_SLOT01>,
- ImmRegRel, PredNewRel {
- bits<2> Pv;
- bits<5> Rs;
- bits<6> S6;
- bits<8> offset;
- bits<6> offsetBits;
-
- string OffsetOpStr = !cast<string>(OffsetOp);
- let offsetBits = !if (!eq(OffsetOpStr, "u6_2Imm"), offset{7-2},
- !if (!eq(OffsetOpStr, "u6_1Imm"), offset{6-1},
- /* u6_0Imm */ offset{5-0}));
- let isPredicatedNew = isPredNew;
- let isPredicatedFalse = isPredNot;
-
- let IClass = 0b0011;
-
- let Inst{27-25} = 0b100;
- let Inst{24} = isPredNew;
- let Inst{23} = isPredNot;
- let Inst{22-21} = MajOp;
- let Inst{20-16} = Rs;
- let Inst{13} = S6{5};
- let Inst{12-7} = offsetBits;
- let Inst{6-5} = Pv;
- let Inst{4-0} = S6{4-0};
- }
-
-
-//===----------------------------------------------------------------------===//
-// multiclass for store instructions with base + immediate offset
-// addressing mode and immediate stored value.
-// mem[bhw](Rs+#u6:[012])=#S8
-// if ([!]Pv[.new]) mem[bhw](Rs+#u6:[012])=#S6
-//===----------------------------------------------------------------------===//
-
-multiclass ST_Imm_Pred <string mnemonic, Operand OffsetOp, bits<2> MajOp,
- bit PredNot> {
- def _io : T_StoreImm_pred <mnemonic, OffsetOp, MajOp, PredNot, 0>;
- // Predicate new
- def new_io : T_StoreImm_pred <mnemonic, OffsetOp, MajOp, PredNot, 1>;
-}
-
-multiclass ST_Imm <string mnemonic, string CextOp, Operand OffsetOp,
- bits<2> MajOp> {
- let CextOpcode = CextOp, BaseOpcode = CextOp#_imm in {
- def _io : T_StoreImm <mnemonic, OffsetOp, MajOp>;
-
- defm t : ST_Imm_Pred <mnemonic, OffsetOp, MajOp, 0>;
- defm f : ST_Imm_Pred <mnemonic, OffsetOp, MajOp, 1>;
- }
-}
-
-let hasSideEffects = 0, addrMode = BaseImmOffset,
- InputType = "imm" in {
- let accessSize = ByteAccess in
- defm S4_storeirb : ST_Imm<"memb", "STrib", u6_0Imm, 0b00>;
-
- let accessSize = HalfWordAccess in
- defm S4_storeirh : ST_Imm<"memh", "STrih", u6_1Imm, 0b01>;
-
- let accessSize = WordAccess in
- defm S4_storeiri : ST_Imm<"memw", "STriw", u6_2Imm, 0b10>;
-}
-
-//===----------------------------------------------------------------------===//
-// ST -
-//===----------------------------------------------------------------------===//
-
-
-//===----------------------------------------------------------------------===//
-// NV/ST +
-//===----------------------------------------------------------------------===//
-
-let opNewValue = 2, opExtendable = 1, isExtentSigned = 1, isPredicable = 1 in
-class T_store_io_nv <string mnemonic, RegisterClass RC,
- Operand ImmOp, bits<2>MajOp>
- : NVInst_V4 <(outs),
- (ins IntRegs:$src1, ImmOp:$src2, RC:$src3),
- mnemonic#"($src1+#$src2) = $src3.new",
- [],"",ST_tc_st_SLOT0> {
- bits<5> src1;
- bits<13> src2; // Actual address offset
- bits<3> src3;
- bits<11> offsetBits; // Represents offset encoding
-
- let opExtentBits = !if (!eq(mnemonic, "memb"), 11,
- !if (!eq(mnemonic, "memh"), 12,
- !if (!eq(mnemonic, "memw"), 13, 0)));
-
- let opExtentAlign = !if (!eq(mnemonic, "memb"), 0,
- !if (!eq(mnemonic, "memh"), 1,
- !if (!eq(mnemonic, "memw"), 2, 0)));
-
- let offsetBits = !if (!eq(mnemonic, "memb"), src2{10-0},
- !if (!eq(mnemonic, "memh"), src2{11-1},
- !if (!eq(mnemonic, "memw"), src2{12-2}, 0)));
-
- let IClass = 0b1010;
-
- let Inst{27} = 0b0;
- let Inst{26-25} = offsetBits{10-9};
- let Inst{24-21} = 0b1101;
- let Inst{20-16} = src1;
- let Inst{13} = offsetBits{8};
- let Inst{12-11} = MajOp;
- let Inst{10-8} = src3;
- let Inst{7-0} = offsetBits{7-0};
- }
-
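-// Illustrative note (derived from the !if chains above): the extendable
-// offset is 11, 12 or 13 bits for memb/memh/memw, but the encoding always
-// holds 11 bits (offsetBits) because the 0, 1 or 2 alignment bits are
-// dropped (src2{10-0}, src2{11-1}, src2{12-2}).
-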
-let opExtendable = 2, opNewValue = 3, isPredicated = 1 in
-class T_pstore_io_nv <string mnemonic, RegisterClass RC, Operand predImmOp,
- bits<2>MajOp, bit PredNot, bit isPredNew>
- : NVInst_V4 <(outs),
- (ins PredRegs:$src1, IntRegs:$src2, predImmOp:$src3, RC:$src4),
- !if(PredNot, "if (!$src1", "if ($src1")#!if(isPredNew, ".new) ",
- ") ")#mnemonic#"($src2+#$src3) = $src4.new",
- [],"",V2LDST_tc_st_SLOT0> {
- bits<2> src1;
- bits<5> src2;
- bits<9> src3;
- bits<3> src4;
- bits<6> offsetBits; // Represents offset encoding
-
- let isPredicatedNew = isPredNew;
- let isPredicatedFalse = PredNot;
- let opExtentBits = !if (!eq(mnemonic, "memb"), 6,
- !if (!eq(mnemonic, "memh"), 7,
- !if (!eq(mnemonic, "memw"), 8, 0)));
-
- let opExtentAlign = !if (!eq(mnemonic, "memb"), 0,
- !if (!eq(mnemonic, "memh"), 1,
- !if (!eq(mnemonic, "memw"), 2, 0)));
-
- let offsetBits = !if (!eq(mnemonic, "memb"), src3{5-0},
- !if (!eq(mnemonic, "memh"), src3{6-1},
- !if (!eq(mnemonic, "memw"), src3{7-2}, 0)));
-
- let IClass = 0b0100;
-
- let Inst{27} = 0b0;
- let Inst{26} = PredNot;
- let Inst{25} = isPredNew;
- let Inst{24-21} = 0b0101;
- let Inst{20-16} = src2;
- let Inst{13} = offsetBits{5};
- let Inst{12-11} = MajOp;
- let Inst{10-8} = src4;
- let Inst{7-3} = offsetBits{4-0};
- let Inst{2} = 0b0;
- let Inst{1-0} = src1;
- }
-
-// multiclass for new-value store instructions with base + immediate offset.
-//
-let mayStore = 1, isNVStore = 1, isNewValue = 1, hasSideEffects = 0,
- isExtendable = 1 in
-multiclass ST_Idxd_nv<string mnemonic, string CextOp, RegisterClass RC,
- Operand ImmOp, Operand predImmOp, bits<2> MajOp> {
-
- let CextOpcode = CextOp, BaseOpcode = CextOp#_indexed in {
- def S2_#NAME#new_io : T_store_io_nv <mnemonic, RC, ImmOp, MajOp>;
- // Predicated
- def S2_p#NAME#newt_io :T_pstore_io_nv <mnemonic, RC, predImmOp, MajOp, 0, 0>;
- def S2_p#NAME#newf_io :T_pstore_io_nv <mnemonic, RC, predImmOp, MajOp, 1, 0>;
- // Predicated new
- def S4_p#NAME#newtnew_io :T_pstore_io_nv <mnemonic, RC, predImmOp,
- MajOp, 0, 1>;
- def S4_p#NAME#newfnew_io :T_pstore_io_nv <mnemonic, RC, predImmOp,
- MajOp, 1, 1>;
- }
-}
-
-let addrMode = BaseImmOffset, InputType = "imm" in {
- let accessSize = ByteAccess in
- defm storerb: ST_Idxd_nv<"memb", "STrib", IntRegs, s11_0Ext,
- u6_0Ext, 0b00>, AddrModeRel;
-
- let accessSize = HalfWordAccess, opExtentAlign = 1 in
- defm storerh: ST_Idxd_nv<"memh", "STrih", IntRegs, s11_1Ext,
- u6_1Ext, 0b01>, AddrModeRel;
-
- let accessSize = WordAccess, opExtentAlign = 2 in
- defm storeri: ST_Idxd_nv<"memw", "STriw", IntRegs, s11_2Ext,
- u6_2Ext, 0b10>, AddrModeRel;
-}
-
-//===----------------------------------------------------------------------===//
-// Post increment loads with register offset.
-//===----------------------------------------------------------------------===//
-
-let hasNewValue = 1 in
-def L2_loadbsw2_pr : T_load_pr <"membh", IntRegs, 0b0001, HalfWordAccess>;
-
-def L2_loadbsw4_pr : T_load_pr <"membh", DoubleRegs, 0b0111, WordAccess>;
-
-let hasSideEffects = 0, addrMode = PostInc in
-class T_loadalign_pr <string mnemonic, bits<4> MajOp, MemAccessSize AccessSz>
- : LDInstPI <(outs DoubleRegs:$dst, IntRegs:$_dst_),
- (ins DoubleRegs:$src1, IntRegs:$src2, ModRegs:$src3),
- "$dst = "#mnemonic#"($src2++$src3)", [],
- "$src1 = $dst, $src2 = $_dst_"> {
- bits<5> dst;
- bits<5> src2;
- bits<1> src3;
-
- let accessSize = AccessSz;
- let IClass = 0b1001;
-
- let Inst{27-25} = 0b110;
- let Inst{24-21} = MajOp;
- let Inst{20-16} = src2;
- let Inst{13} = src3;
- let Inst{12} = 0b0;
- let Inst{7} = 0b0;
- let Inst{4-0} = dst;
- }
-
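-// Illustrative note: the "$src1 = $dst" tie makes this a read-modify-write
-// of the 64-bit destination (per the _fifo mnemonic, the fetched element is
-// shifted into the pair), and "$src2 = $_dst_" returns the post-incremented
-// base register.
-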
-def L2_loadalignb_pr : T_loadalign_pr <"memb_fifo", 0b0100, ByteAccess>;
-def L2_loadalignh_pr : T_loadalign_pr <"memh_fifo", 0b0010, HalfWordAccess>;
-
-//===----------------------------------------------------------------------===//
-// Template class for non-predicated post increment .new stores
-// mem[bhwd](Rx++#s4:[0123])=Nt.new
-//===----------------------------------------------------------------------===//
-let isPredicable = 1, hasSideEffects = 0, addrMode = PostInc, isNVStore = 1,
- isNewValue = 1, opNewValue = 3 in
-class T_StorePI_nv <string mnemonic, Operand ImmOp, bits<2> MajOp >
- : NVInstPI_V4 <(outs IntRegs:$_dst_),
- (ins IntRegs:$src1, ImmOp:$offset, IntRegs:$src2),
- mnemonic#"($src1++#$offset) = $src2.new",
- [], "$src1 = $_dst_">,
- AddrModeRel {
- bits<5> src1;
- bits<3> src2;
- bits<7> offset;
- bits<4> offsetBits;
-
- string ImmOpStr = !cast<string>(ImmOp);
- let offsetBits = !if (!eq(ImmOpStr, "s4_2Imm"), offset{5-2},
- !if (!eq(ImmOpStr, "s4_1Imm"), offset{4-1},
- /* s4_0Imm */ offset{3-0}));
- let IClass = 0b1010;
-
- let Inst{27-21} = 0b1011101;
- let Inst{20-16} = src1;
- let Inst{13} = 0b0;
- let Inst{12-11} = MajOp;
- let Inst{10-8} = src2;
- let Inst{7} = 0b0;
- let Inst{6-3} = offsetBits;
- let Inst{1} = 0b0;
- }
-
-//===----------------------------------------------------------------------===//
-// Template class for predicated post increment .new stores
-// if([!]Pv[.new]) mem[bhwd](Rx++#s4:[0123])=Nt.new
-//===----------------------------------------------------------------------===//
-let isPredicated = 1, hasSideEffects = 0, addrMode = PostInc, isNVStore = 1,
- isNewValue = 1, opNewValue = 4 in
-class T_StorePI_nv_pred <string mnemonic, Operand ImmOp,
- bits<2> MajOp, bit isPredNot, bit isPredNew >
- : NVInstPI_V4 <(outs IntRegs:$_dst_),
- (ins PredRegs:$src1, IntRegs:$src2,
- ImmOp:$offset, IntRegs:$src3),
- !if(isPredNot, "if (!$src1", "if ($src1")#!if(isPredNew, ".new) ",
- ") ")#mnemonic#"($src2++#$offset) = $src3.new",
- [], "$src2 = $_dst_">,
- AddrModeRel {
- bits<2> src1;
- bits<5> src2;
- bits<3> src3;
- bits<7> offset;
- bits<4> offsetBits;
-
- string ImmOpStr = !cast<string>(ImmOp);
- let offsetBits = !if (!eq(ImmOpStr, "s4_2Imm"), offset{5-2},
- !if (!eq(ImmOpStr, "s4_1Imm"), offset{4-1},
- /* s4_0Imm */ offset{3-0}));
- let isPredicatedNew = isPredNew;
- let isPredicatedFalse = isPredNot;
-
- let IClass = 0b1010;
-
- let Inst{27-21} = 0b1011101;
- let Inst{20-16} = src2;
- let Inst{13} = 0b1;
- let Inst{12-11} = MajOp;
- let Inst{10-8} = src3;
- let Inst{7} = isPredNew;
- let Inst{6-3} = offsetBits;
- let Inst{2} = isPredNot;
- let Inst{1-0} = src1;
- }
-
-multiclass ST_PostInc_Pred_nv<string mnemonic, Operand ImmOp,
- bits<2> MajOp, bit PredNot> {
- def _pi : T_StorePI_nv_pred <mnemonic, ImmOp, MajOp, PredNot, 0>;
-
- // Predicate new
- def new_pi : T_StorePI_nv_pred <mnemonic, ImmOp, MajOp, PredNot, 1>;
-}
-
-multiclass ST_PostInc_nv<string mnemonic, string BaseOp, Operand ImmOp,
- bits<2> MajOp> {
- let BaseOpcode = "POST_"#BaseOp in {
- def S2_#NAME#_pi : T_StorePI_nv <mnemonic, ImmOp, MajOp>;
-
- // Predicated
- defm S2_p#NAME#t : ST_PostInc_Pred_nv <mnemonic, ImmOp, MajOp, 0>;
- defm S2_p#NAME#f : ST_PostInc_Pred_nv <mnemonic, ImmOp, MajOp, 1>;
- }
-}
-
-let accessSize = ByteAccess in
-defm storerbnew: ST_PostInc_nv <"memb", "STrib", s4_0Imm, 0b00>;
-
-let accessSize = HalfWordAccess in
-defm storerhnew: ST_PostInc_nv <"memh", "STrih", s4_1Imm, 0b01>;
-
-let accessSize = WordAccess in
-defm storerinew: ST_PostInc_nv <"memw", "STriw", s4_2Imm, 0b10>;
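-
-// For reference, a sketch of what the byte variant above expands to (the
-// names follow from the multiclass definitions; illustrative only):
-//   def  S2_storerbnew_pi : T_StorePI_nv      <"memb", s4_0Imm, 0b00>;
-//   defm S2_pstorerbnewt  : ST_PostInc_Pred_nv<"memb", s4_0Imm, 0b00, 0>;
-//   defm S2_pstorerbnewf  : ST_PostInc_Pred_nv<"memb", s4_0Imm, 0b00, 1>;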
-
-//===----------------------------------------------------------------------===//
-// Template class for post increment .new stores with register offset
-//===----------------------------------------------------------------------===//
-let isNewValue = 1, mayStore = 1, isNVStore = 1, opNewValue = 3 in
-class T_StorePI_RegNV <string mnemonic, bits<2> MajOp, MemAccessSize AccessSz>
- : NVInstPI_V4 <(outs IntRegs:$_dst_),
- (ins IntRegs:$src1, ModRegs:$src2, IntRegs:$src3),
-                 mnemonic#"($src1++$src2) = $src3.new",
- [], "$src1 = $_dst_"> {
- bits<5> src1;
- bits<1> src2;
- bits<3> src3;
- let accessSize = AccessSz;
-
- let IClass = 0b1010;
-
- let Inst{27-21} = 0b1101101;
- let Inst{20-16} = src1;
- let Inst{13} = src2;
- let Inst{12-11} = MajOp;
- let Inst{10-8} = src3;
- let Inst{7} = 0b0;
- }
-
-def S2_storerbnew_pr : T_StorePI_RegNV<"memb", 0b00, ByteAccess>;
-def S2_storerhnew_pr : T_StorePI_RegNV<"memh", 0b01, HalfWordAccess>;
-def S2_storerinew_pr : T_StorePI_RegNV<"memw", 0b10, WordAccess>;
-
-// memb(Rx++#s4:0:circ(Mu))=Nt.new
-// memb(Rx++I:circ(Mu))=Nt.new
-// memb(Rx++Mu:brev)=Nt.new
-// memh(Rx++#s4:1:circ(Mu))=Nt.new
-// memh(Rx++I:circ(Mu))=Nt.new
-// memh(Rx++Mu)=Nt.new
-// memh(Rx++Mu:brev)=Nt.new
-
-// memw(Rx++#s4:2:circ(Mu))=Nt.new
-// memw(Rx++I:circ(Mu))=Nt.new
-// memw(Rx++Mu)=Nt.new
-// memw(Rx++Mu:brev)=Nt.new
-
-//===----------------------------------------------------------------------===//
-// NV/ST -
-//===----------------------------------------------------------------------===//
-
-//===----------------------------------------------------------------------===//
-// NV/J +
-//===----------------------------------------------------------------------===//
-
-//===----------------------------------------------------------------------===//
-// Template class and multiclasses for the new-value compare jumps with
-// register operands.
-//===----------------------------------------------------------------------===//
-
-let isExtendable = 1, opExtendable = 2, isExtentSigned = 1, opExtentBits = 11,
- opExtentAlign = 2 in
-class NVJrr_template<string mnemonic, bits<3> majOp, bit NvOpNum,
- bit isNegCond, bit isTak>
- : NVInst_V4<(outs),
- (ins IntRegs:$src1, IntRegs:$src2, brtarget:$offset),
- "if ("#!if(isNegCond, "!","")#mnemonic#
- "($src1"#!if(!eq(NvOpNum, 0),".new, ",", ")#
- "$src2"#!if(!eq(NvOpNum, 1),".new))","))")#" jump:"
- #!if(isTak, "t","nt")#" $offset", []> {
-
- bits<5> src1;
- bits<5> src2;
- bits<3> Ns; // New-Value Operand
- bits<5> RegOp; // Non-New-Value Operand
- bits<11> offset;
-
- let isTaken = isTak;
- let isPredicatedFalse = isNegCond;
- let opNewValue{0} = NvOpNum;
-
- let Ns = !if(!eq(NvOpNum, 0), src1{2-0}, src2{2-0});
- let RegOp = !if(!eq(NvOpNum, 0), src2, src1);
-
- let IClass = 0b0010;
- let Inst{27-26} = 0b00;
- let Inst{25-23} = majOp;
- let Inst{22} = isNegCond;
- let Inst{18-16} = Ns;
- let Inst{13} = isTak;
- let Inst{12-8} = RegOp;
- let Inst{21-20} = offset{10-9};
- let Inst{7-1} = offset{8-2};
-}
-
-
-multiclass NVJrr_cond<string mnemonic, bits<3> majOp, bit NvOpNum,
- bit isNegCond> {
- // Branch not taken:
- def _nt: NVJrr_template<mnemonic, majOp, NvOpNum, isNegCond, 0>;
- // Branch taken:
- def _t : NVJrr_template<mnemonic, majOp, NvOpNum, isNegCond, 1>;
-}
-
-// NvOpNum = 0 -> First Operand is a new-value Register
-// NvOpNum = 1 -> Second Operand is a new-value Register
-
-multiclass NVJrr_base<string mnemonic, string BaseOp, bits<3> majOp,
- bit NvOpNum> {
- let BaseOpcode = BaseOp#_NVJ in {
- defm _t_jumpnv : NVJrr_cond<mnemonic, majOp, NvOpNum, 0>; // True cond
- defm _f_jumpnv : NVJrr_cond<mnemonic, majOp, NvOpNum, 1>; // False cond
- }
-}
-
-// if ([!]cmp.eq(Ns.new,Rt)) jump:[n]t #r9:2
-// if ([!]cmp.gt(Ns.new,Rt)) jump:[n]t #r9:2
-// if ([!]cmp.gtu(Ns.new,Rt)) jump:[n]t #r9:2
-// if ([!]cmp.gt(Rt,Ns.new)) jump:[n]t #r9:2
-// if ([!]cmp.gtu(Rt,Ns.new)) jump:[n]t #r9:2
-
-let isPredicated = 1, isBranch = 1, isNewValue = 1, isTerminator = 1,
- Defs = [PC], hasSideEffects = 0 in {
- defm J4_cmpeq : NVJrr_base<"cmp.eq", "CMPEQ", 0b000, 0>, PredRel;
- defm J4_cmpgt : NVJrr_base<"cmp.gt", "CMPGT", 0b001, 0>, PredRel;
- defm J4_cmpgtu : NVJrr_base<"cmp.gtu", "CMPGTU", 0b010, 0>, PredRel;
- defm J4_cmplt : NVJrr_base<"cmp.gt", "CMPLT", 0b011, 1>, PredRel;
- defm J4_cmpltu : NVJrr_base<"cmp.gtu", "CMPLTU", 0b100, 1>, PredRel;
-}
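-
-// Note how the NvOpNum = 1 variants synthesize the missing lt/ltu compares.
-// A sketch of the resulting assembly, derived from the template string
-// (illustrative only):
-//   J4_cmplt_t_jumpnv_t: "if (cmp.gt($src1, $src2.new)) jump:t $offset"
-// i.e. Ns.new < Rt is emitted as cmp.gt with the operands swapped.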
-
-//===----------------------------------------------------------------------===//
-// Template class and multiclasses for the new-value compare jump
-// instructions with a register and an unsigned immediate (U5) operand.
-//===----------------------------------------------------------------------===//
-
-let isExtendable = 1, opExtendable = 2, isExtentSigned = 1, opExtentBits = 11,
- opExtentAlign = 2 in
-class NVJri_template<string mnemonic, bits<3> majOp, bit isNegCond,
- bit isTak>
- : NVInst_V4<(outs),
- (ins IntRegs:$src1, u5_0Imm:$src2, brtarget:$offset),
- "if ("#!if(isNegCond, "!","")#mnemonic#"($src1.new, #$src2)) jump:"
- #!if(isTak, "t","nt")#" $offset", []> {
-
-  let isTaken = isTak;
-  let isPredicatedFalse = isNegCond;
-
- bits<3> src1;
- bits<5> src2;
- bits<11> offset;
-
- let IClass = 0b0010;
- let Inst{26} = 0b1;
- let Inst{25-23} = majOp;
- let Inst{22} = isNegCond;
- let Inst{18-16} = src1;
- let Inst{13} = isTak;
- let Inst{12-8} = src2;
- let Inst{21-20} = offset{10-9};
- let Inst{7-1} = offset{8-2};
-}
-
-multiclass NVJri_cond<string mnemonic, bits<3> majOp, bit isNegCond> {
- // Branch not taken:
- def _nt: NVJri_template<mnemonic, majOp, isNegCond, 0>;
- // Branch taken:
- def _t : NVJri_template<mnemonic, majOp, isNegCond, 1>;
-}
-
-multiclass NVJri_base<string mnemonic, string BaseOp, bits<3> majOp> {
- let BaseOpcode = BaseOp#_NVJri in {
- defm _t_jumpnv : NVJri_cond<mnemonic, majOp, 0>; // True Cond
- defm _f_jumpnv : NVJri_cond<mnemonic, majOp, 1>; // False cond
- }
-}
-
-// if ([!]cmp.eq(Ns.new,#U5)) jump:[n]t #r9:2
-// if ([!]cmp.gt(Ns.new,#U5)) jump:[n]t #r9:2
-// if ([!]cmp.gtu(Ns.new,#U5)) jump:[n]t #r9:2
-
-let isPredicated = 1, isBranch = 1, isNewValue = 1, isTerminator = 1,
- Defs = [PC], hasSideEffects = 0 in {
- defm J4_cmpeqi : NVJri_base<"cmp.eq", "CMPEQ", 0b000>, PredRel;
- defm J4_cmpgti : NVJri_base<"cmp.gt", "CMPGT", 0b001>, PredRel;
- defm J4_cmpgtui : NVJri_base<"cmp.gtu", "CMPGTU", 0b010>, PredRel;
-}
-
-//===----------------------------------------------------------------------===//
-// Template class and multiclasses for the new-value compare jump
-// instructions with a register and a hardcoded 0/-1 immediate value.
-//===----------------------------------------------------------------------===//
-
-let isExtendable = 1, isExtentSigned = 1, opExtentBits = 11,
- opExtentAlign = 2 in
-class NVJ_ConstImm_template<string mnemonic, bits<3> majOp, string ImmVal,
- bit isNegCond, bit isTak>
- : NVInst_V4<(outs),
- !if(!eq(ImmVal, "{-1}"),
- (ins IntRegs:$src1, n1Const:$n1, brtarget:$offset),
- (ins IntRegs:$src1, brtarget:$offset)),
- "if ("#!if(isNegCond, "!","")#mnemonic
- #"($src1.new, #" # !if(!eq(ImmVal, "{-1}"), "$n1", ImmVal) # ")) jump:"
- #!if(isTak, "t","nt")#" $offset", []> {
-
-  let isTaken = isTak;
-  let isPredicatedFalse = isNegCond;
- let opExtendable = !if(!eq(ImmVal, "{-1}"), 2, 1);
-
- bits<3> src1;
- bits<11> offset;
- let IClass = 0b0010;
- let Inst{26} = 0b1;
- let Inst{25-23} = majOp;
- let Inst{22} = isNegCond;
- let Inst{18-16} = src1;
- let Inst{13} = isTak;
- let Inst{21-20} = offset{10-9};
- let Inst{7-1} = offset{8-2};
-}
-
-multiclass NVJ_ConstImm_cond<string mnemonic, bits<3> majOp, string ImmVal,
- bit isNegCond> {
- // Branch not taken:
- def _nt: NVJ_ConstImm_template<mnemonic, majOp, ImmVal, isNegCond, 0>;
- // Branch taken:
- def _t : NVJ_ConstImm_template<mnemonic, majOp, ImmVal, isNegCond, 1>;
-}
-
-multiclass NVJ_ConstImm_base<string mnemonic, string BaseOp, bits<3> majOp,
- string ImmVal> {
- let BaseOpcode = BaseOp#_NVJ_ConstImm in {
- defm _t_jumpnv : NVJ_ConstImm_cond<mnemonic, majOp, ImmVal, 0>; // True
- defm _f_jumpnv : NVJ_ConstImm_cond<mnemonic, majOp, ImmVal, 1>; // False
- }
-}
-
-// if ([!]tstbit(Ns.new,#0)) jump:[n]t #r9:2
-// if ([!]cmp.eq(Ns.new,#-1)) jump:[n]t #r9:2
-// if ([!]cmp.gt(Ns.new,#-1)) jump:[n]t #r9:2
-
-let isPredicated = 1, isBranch = 1, isNewValue = 1, isTerminator=1,
- Defs = [PC], hasSideEffects = 0 in {
- defm J4_tstbit0 : NVJ_ConstImm_base<"tstbit", "TSTBIT", 0b011, "0">, PredRel;
- defm J4_cmpeqn1 : NVJ_ConstImm_base<"cmp.eq", "CMPEQ", 0b100, "{-1}">, PredRel;
- defm J4_cmpgtn1 : NVJ_ConstImm_base<"cmp.gt", "CMPGT", 0b101, "{-1}">, PredRel;
-}
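-
-// A sketch of the assembly these expand to, derived from the template
-// string (illustrative only):
-//   J4_tstbit0_t_jumpnv_nt: "if (tstbit($src1.new, #0)) jump:nt $offset"
-//   J4_cmpeqn1_t_jumpnv_t:  "if (cmp.eq($src1.new, #$n1)) jump:t $offset",
-//                           where $n1 is the hardwired -1 operand.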
-
-// J4_hintjumpr: Hint indirect conditional jump.
-let isBranch = 1, isIndirectBranch = 1, hasSideEffects = 0 in
-def J4_hintjumpr: JRInst <
- (outs),
- (ins IntRegs:$Rs),
- "hintjr($Rs)"> {
- bits<5> Rs;
- let IClass = 0b0101;
- let Inst{27-21} = 0b0010101;
- let Inst{20-16} = Rs;
- }
-
-//===----------------------------------------------------------------------===//
-// NV/J -
-//===----------------------------------------------------------------------===//
-
-//===----------------------------------------------------------------------===//
-// CR +
-//===----------------------------------------------------------------------===//
-
-// PC-relative add
-let hasNewValue = 1, isExtendable = 1, opExtendable = 1,
- isExtentSigned = 0, opExtentBits = 6, hasSideEffects = 0, Uses = [PC] in
-def C4_addipc : CRInst <(outs IntRegs:$Rd), (ins u6_0Ext:$u6),
- "$Rd = add(pc, #$u6)", [], "", CR_tc_2_SLOT3 > {
- bits<5> Rd;
- bits<6> u6;
-
- let IClass = 0b0110;
- let Inst{27-16} = 0b101001001001;
- let Inst{12-7} = u6;
- let Inst{4-0} = Rd;
- }
-
-let hasSideEffects = 0 in
-class T_LOGICAL_3OP<string MnOp1, string MnOp2, bits<2> OpBits, bit IsNeg>
- : CRInst<(outs PredRegs:$Pd),
- (ins PredRegs:$Ps, PredRegs:$Pt, PredRegs:$Pu),
- "$Pd = " # MnOp1 # "($Ps, " # MnOp2 # "($Pt, " #
- !if (IsNeg,"!","") # "$Pu))",
- [], "", CR_tc_2early_SLOT23> {
- bits<2> Pd;
- bits<2> Ps;
- bits<2> Pt;
- bits<2> Pu;
-
- let IClass = 0b0110;
- let Inst{27-24} = 0b1011;
- let Inst{23} = IsNeg;
- let Inst{22-21} = OpBits;
- let Inst{20} = 0b1;
- let Inst{17-16} = Ps;
- let Inst{13} = 0b0;
- let Inst{9-8} = Pt;
- let Inst{7-6} = Pu;
- let Inst{1-0} = Pd;
-}
-
-def C4_and_and : T_LOGICAL_3OP<"and", "and", 0b00, 0>;
-def C4_and_or : T_LOGICAL_3OP<"and", "or", 0b01, 0>;
-def C4_or_and : T_LOGICAL_3OP<"or", "and", 0b10, 0>;
-def C4_or_or : T_LOGICAL_3OP<"or", "or", 0b11, 0>;
-def C4_and_andn : T_LOGICAL_3OP<"and", "and", 0b00, 1>;
-def C4_and_orn : T_LOGICAL_3OP<"and", "or", 0b01, 1>;
-def C4_or_andn : T_LOGICAL_3OP<"or", "and", 0b10, 1>;
-def C4_or_orn : T_LOGICAL_3OP<"or", "or", 0b11, 1>;
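-
-// A sketch of the assembly syntax produced by the template (illustrative
-// only): C4_and_orn yields "$Pd = and($Ps, or($Pt, !$Pu))".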
-
-//===----------------------------------------------------------------------===//
-// CR -
-//===----------------------------------------------------------------------===//
-
-//===----------------------------------------------------------------------===//
-// XTYPE/ALU +
-//===----------------------------------------------------------------------===//
-
-// Logical with-not instructions.
-def A4_andnp : T_ALU64_logical<"and", 0b001, 1, 0, 1>;
-def A4_ornp : T_ALU64_logical<"or", 0b011, 1, 0, 1>;
-
-let hasNewValue = 1, hasSideEffects = 0 in
-def S4_parity: ALU64Inst<(outs IntRegs:$Rd), (ins IntRegs:$Rs, IntRegs:$Rt),
- "$Rd = parity($Rs, $Rt)", [], "", ALU64_tc_2_SLOT23> {
- bits<5> Rd;
- bits<5> Rs;
- bits<5> Rt;
-
- let IClass = 0b1101;
- let Inst{27-21} = 0b0101111;
- let Inst{20-16} = Rs;
- let Inst{12-8} = Rt;
- let Inst{4-0} = Rd;
-}
-
-// Add and accumulate.
-// Rd=add(Rs,add(Ru,#s6))
-let isExtentSigned = 1, hasNewValue = 1, isExtendable = 1, opExtentBits = 6,
- opExtendable = 3 in
-def S4_addaddi : ALU64Inst <(outs IntRegs:$Rd),
- (ins IntRegs:$Rs, IntRegs:$Ru, s6_0Ext:$s6),
- "$Rd = add($Rs, add($Ru, #$s6))" , [],
- "", ALU64_tc_2_SLOT23> {
- bits<5> Rd;
- bits<5> Rs;
- bits<5> Ru;
- bits<6> s6;
-
- let IClass = 0b1101;
-
- let Inst{27-23} = 0b10110;
- let Inst{22-21} = s6{5-4};
- let Inst{20-16} = Rs;
- let Inst{13} = s6{3};
- let Inst{12-8} = Rd;
- let Inst{7-5} = s6{2-0};
- let Inst{4-0} = Ru;
- }
-
-let isExtentSigned = 1, hasSideEffects = 0, hasNewValue = 1, isExtendable = 1,
- opExtentBits = 6, opExtendable = 2 in
-def S4_subaddi: ALU64Inst <(outs IntRegs:$Rd),
- (ins IntRegs:$Rs, s6_0Ext:$s6, IntRegs:$Ru),
- "$Rd = add($Rs, sub(#$s6, $Ru))",
- [], "", ALU64_tc_2_SLOT23> {
- bits<5> Rd;
- bits<5> Rs;
- bits<6> s6;
- bits<5> Ru;
-
- let IClass = 0b1101;
-
- let Inst{27-23} = 0b10111;
- let Inst{22-21} = s6{5-4};
- let Inst{20-16} = Rs;
- let Inst{13} = s6{3};
- let Inst{12-8} = Rd;
- let Inst{7-5} = s6{2-0};
- let Inst{4-0} = Ru;
- }
-
-def S4_extractp_rp : T_S3op_64 < "extract", 0b11, 0b100, 0>;
-def S4_extractp : T_S2op_extract <"extract", 0b1010, DoubleRegs, u6_0Imm>;
-
-let hasNewValue = 1 in {
- def S4_extract_rp : T_S3op_extract<"extract", 0b01>;
- def S4_extract : T_S2op_extract <"extract", 0b1101, IntRegs, u5_0Imm>;
-}
-
-// Complex add/sub halfwords/words
-let Defs = [USR_OVF] in {
- def S4_vxaddsubh : T_S3op_64 < "vxaddsubh", 0b01, 0b100, 0, 1>;
- def S4_vxaddsubw : T_S3op_64 < "vxaddsubw", 0b01, 0b000, 0, 1>;
- def S4_vxsubaddh : T_S3op_64 < "vxsubaddh", 0b01, 0b110, 0, 1>;
- def S4_vxsubaddw : T_S3op_64 < "vxsubaddw", 0b01, 0b010, 0, 1>;
-}
-
-let Defs = [USR_OVF] in {
- def S4_vxaddsubhr : T_S3op_64 < "vxaddsubh", 0b11, 0b000, 0, 1, 1, 1>;
- def S4_vxsubaddhr : T_S3op_64 < "vxsubaddh", 0b11, 0b010, 0, 1, 1, 1>;
-}
-
-let Itinerary = M_tc_3x_SLOT23, Defs = [USR_OVF] in {
- def M4_mac_up_s1_sat: T_MType_acc_rr<"+= mpy", 0b011, 0b000, 0, [], 0, 1, 1>;
- def M4_nac_up_s1_sat: T_MType_acc_rr<"-= mpy", 0b011, 0b001, 0, [], 0, 1, 1>;
-}
-
-// Logical xor with xor accumulation.
-// Rxx^=xor(Rss,Rtt)
-let hasSideEffects = 0 in
-def M4_xor_xacc
- : SInst <(outs DoubleRegs:$Rxx),
- (ins DoubleRegs:$dst2, DoubleRegs:$Rss, DoubleRegs:$Rtt),
- "$Rxx ^= xor($Rss, $Rtt)", [],
- "$dst2 = $Rxx", S_3op_tc_1_SLOT23> {
- bits<5> Rxx;
- bits<5> Rss;
- bits<5> Rtt;
-
- let IClass = 0b1100;
-
- let Inst{27-22} = 0b101010;
- let Inst{20-16} = Rss;
- let Inst{12-8} = Rtt;
- let Inst{7-5} = 0b000;
- let Inst{4-0} = Rxx;
- }
-
-// Rotate and reduce bytes
-// Rdd=vrcrotate(Rss,Rt,#u2)
-let hasSideEffects = 0 in
-def S4_vrcrotate
- : SInst <(outs DoubleRegs:$Rdd),
- (ins DoubleRegs:$Rss, IntRegs:$Rt, u2_0Imm:$u2),
- "$Rdd = vrcrotate($Rss, $Rt, #$u2)",
- [], "", S_3op_tc_3x_SLOT23> {
- bits<5> Rdd;
- bits<5> Rss;
- bits<5> Rt;
- bits<2> u2;
-
- let IClass = 0b1100;
-
- let Inst{27-22} = 0b001111;
- let Inst{20-16} = Rss;
- let Inst{13} = u2{1};
- let Inst{12-8} = Rt;
- let Inst{7-6} = 0b11;
- let Inst{5} = u2{0};
- let Inst{4-0} = Rdd;
- }
-
-// Rotate and reduce bytes with accumulation
-// Rxx+=vrcrotate(Rss,Rt,#u2)
-let hasSideEffects = 0 in
-def S4_vrcrotate_acc
- : SInst <(outs DoubleRegs:$Rxx),
- (ins DoubleRegs:$dst2, DoubleRegs:$Rss, IntRegs:$Rt, u2_0Imm:$u2),
- "$Rxx += vrcrotate($Rss, $Rt, #$u2)", [],
- "$dst2 = $Rxx", S_3op_tc_3x_SLOT23> {
- bits<5> Rxx;
- bits<5> Rss;
- bits<5> Rt;
- bits<2> u2;
-
- let IClass = 0b1100;
-
- let Inst{27-21} = 0b1011101;
- let Inst{20-16} = Rss;
- let Inst{13} = u2{1};
- let Inst{12-8} = Rt;
- let Inst{5} = u2{0};
- let Inst{4-0} = Rxx;
- }
-
-// Vector reduce conditional negate halfwords
-let hasSideEffects = 0 in
-def S2_vrcnegh
- : SInst <(outs DoubleRegs:$Rxx),
- (ins DoubleRegs:$dst2, DoubleRegs:$Rss, IntRegs:$Rt),
- "$Rxx += vrcnegh($Rss, $Rt)", [],
- "$dst2 = $Rxx", S_3op_tc_3x_SLOT23> {
- bits<5> Rxx;
- bits<5> Rss;
- bits<5> Rt;
-
- let IClass = 0b1100;
-
- let Inst{27-21} = 0b1011001;
- let Inst{20-16} = Rss;
- let Inst{13} = 0b1;
- let Inst{12-8} = Rt;
- let Inst{7-5} = 0b111;
- let Inst{4-0} = Rxx;
- }
-
-// Split bitfield
-def A4_bitspliti : T_S2op_2_di <"bitsplit", 0b110, 0b100>;
-
-// Arithmetic/Convergent round
-def A4_cround_ri : T_S2op_2_ii <"cround", 0b111, 0b000>;
-
-def A4_round_ri : T_S2op_2_ii <"round", 0b111, 0b100>;
-
-let Defs = [USR_OVF] in
-def A4_round_ri_sat : T_S2op_2_ii <"round", 0b111, 0b110, 1>;
-
-// Logical-logical words.
-// Compound or-and -- Rx=or(Ru,and(Rx,#s10))
-let isExtentSigned = 1, hasNewValue = 1, isExtendable = 1, opExtentBits = 10,
- opExtendable = 3 in
-def S4_or_andix:
- ALU64Inst<(outs IntRegs:$Rx),
- (ins IntRegs:$Ru, IntRegs:$_src_, s10_0Ext:$s10),
- "$Rx = or($Ru, and($_src_, #$s10))" , [] ,
- "$_src_ = $Rx", ALU64_tc_2_SLOT23> {
- bits<5> Rx;
- bits<5> Ru;
- bits<10> s10;
-
- let IClass = 0b1101;
-
- let Inst{27-22} = 0b101001;
- let Inst{20-16} = Rx;
- let Inst{21} = s10{9};
- let Inst{13-5} = s10{8-0};
- let Inst{4-0} = Ru;
- }
-
-// Miscellaneous ALU64 instructions.
-//
-let hasNewValue = 1, hasSideEffects = 0 in
-def A4_modwrapu: ALU64Inst<(outs IntRegs:$Rd), (ins IntRegs:$Rs, IntRegs:$Rt),
- "$Rd = modwrap($Rs, $Rt)", [], "", ALU64_tc_2_SLOT23> {
- bits<5> Rd;
- bits<5> Rs;
- bits<5> Rt;
-
- let IClass = 0b1101;
- let Inst{27-21} = 0b0011111;
- let Inst{20-16} = Rs;
- let Inst{12-8} = Rt;
- let Inst{7-5} = 0b111;
- let Inst{4-0} = Rd;
-}
-
-let hasSideEffects = 0 in
-def A4_bitsplit: ALU64Inst<(outs DoubleRegs:$Rd),
- (ins IntRegs:$Rs, IntRegs:$Rt),
- "$Rd = bitsplit($Rs, $Rt)", [], "", ALU64_tc_1_SLOT23> {
- bits<5> Rd;
- bits<5> Rs;
- bits<5> Rt;
-
- let IClass = 0b1101;
- let Inst{27-24} = 0b0100;
- let Inst{21} = 0b1;
- let Inst{20-16} = Rs;
- let Inst{12-8} = Rt;
- let Inst{4-0} = Rd;
-}
-
-let hasSideEffects = 0 in
-def dep_S2_packhl: ALU64Inst<(outs DoubleRegs:$Rd),
- (ins IntRegs:$Rs, IntRegs:$Rt),
- "$Rd = packhl($Rs, $Rt):deprecated", [], "", ALU64_tc_1_SLOT23> {
- bits<5> Rd;
- bits<5> Rs;
- bits<5> Rt;
-
- let IClass = 0b1101;
- let Inst{27-24} = 0b0100;
- let Inst{21} = 0b0;
- let Inst{20-16} = Rs;
- let Inst{12-8} = Rt;
- let Inst{4-0} = Rd;
-}
-
-let hasNewValue = 1, hasSideEffects = 0 in
-def dep_A2_addsat: ALU64Inst<(outs IntRegs:$Rd),
- (ins IntRegs:$Rs, IntRegs:$Rt),
- "$Rd = add($Rs, $Rt):sat:deprecated", [], "", ALU64_tc_2_SLOT23> {
- bits<5> Rd;
- bits<5> Rs;
- bits<5> Rt;
-
- let IClass = 0b1101;
- let Inst{27-21} = 0b0101100;
- let Inst{20-16} = Rs;
- let Inst{12-8} = Rt;
- let Inst{7} = 0b0;
- let Inst{4-0} = Rd;
-}
-
-let hasNewValue = 1, hasSideEffects = 0 in
-def dep_A2_subsat: ALU64Inst<(outs IntRegs:$Rd),
- (ins IntRegs:$Rs, IntRegs:$Rt),
- "$Rd = sub($Rs, $Rt):sat:deprecated", [], "", ALU64_tc_2_SLOT23> {
- bits<5> Rd;
- bits<5> Rs;
- bits<5> Rt;
-
- let IClass = 0b1101;
- let Inst{27-21} = 0b0101100;
- let Inst{20-16} = Rt;
- let Inst{12-8} = Rs;
- let Inst{7} = 0b1;
- let Inst{4-0} = Rd;
-}
-
-// Rx[&|]=xor(Rs,Rt)
-def M4_or_xor : T_MType_acc_rr < "|= xor", 0b110, 0b001, 0>;
-def M4_and_xor : T_MType_acc_rr < "&= xor", 0b010, 0b010, 0>;
-
-// Rx[&|^]=or(Rs,Rt)
-def M4_xor_or : T_MType_acc_rr < "^= or", 0b110, 0b011, 0>;
-
-let CextOpcode = "ORr_ORr" in
-def M4_or_or : T_MType_acc_rr < "|= or", 0b110, 0b000, 0>;
-def M4_and_or : T_MType_acc_rr < "&= or", 0b010, 0b001, 0>;
-
-// Rx[&|^]=and(Rs,Rt)
-def M4_xor_and : T_MType_acc_rr < "^= and", 0b110, 0b010, 0>;
-
-let CextOpcode = "ORr_ANDr" in
-def M4_or_and : T_MType_acc_rr < "|= and", 0b010, 0b011, 0>;
-def M4_and_and : T_MType_acc_rr < "&= and", 0b010, 0b000, 0>;
-
-// Rx[&|^]=and(Rs,~Rt)
-def M4_xor_andn : T_MType_acc_rr < "^= and", 0b001, 0b010, 0, [], 1>;
-def M4_or_andn : T_MType_acc_rr < "|= and", 0b001, 0b000, 0, [], 1>;
-def M4_and_andn : T_MType_acc_rr < "&= and", 0b001, 0b001, 0, [], 1>;
-
-// Compound or-or and or-and
-let isExtentSigned = 1, InputType = "imm", hasNewValue = 1, isExtendable = 1,
- opExtentBits = 10, opExtendable = 3 in
-class T_CompOR <string mnemonic, bits<2> MajOp, SDNode OpNode>
- : MInst_acc <(outs IntRegs:$Rx),
- (ins IntRegs:$src1, IntRegs:$Rs, s10_0Ext:$s10),
- "$Rx |= "#mnemonic#"($Rs, #$s10)", [],
- "$src1 = $Rx", ALU64_tc_2_SLOT23>, ImmRegRel {
- bits<5> Rx;
- bits<5> Rs;
- bits<10> s10;
-
- let IClass = 0b1101;
-
- let Inst{27-24} = 0b1010;
- let Inst{23-22} = MajOp;
- let Inst{20-16} = Rs;
- let Inst{21} = s10{9};
- let Inst{13-5} = s10{8-0};
- let Inst{4-0} = Rx;
- }
-
-let CextOpcode = "ORr_ANDr" in
-def S4_or_andi : T_CompOR <"and", 0b00, and>;
-
-let CextOpcode = "ORr_ORr" in
-def S4_or_ori : T_CompOR <"or", 0b10, or>;
-
-// Modulo wrap
-// Rd=modwrap(Rs,Rt)
-// Round
-// Rd=cround(Rs,#u5)
-// Rd=cround(Rs,Rt)
-// Rd=round(Rs,#u5)[:sat]
-// Rd=round(Rs,Rt)[:sat]
-// Vector reduce add unsigned halfwords
-// Rd=vraddh(Rss,Rtt)
-// Vector add bytes
-// Rdd=vaddb(Rss,Rtt)
-// Vector conditional negate
-// Rdd=vcnegh(Rss,Rt)
-// Rxx+=vrcnegh(Rss,Rt)
-// Vector maximum bytes
-// Rdd=vmaxb(Rtt,Rss)
-// Vector reduce maximum halfwords
-// Rxx=vrmaxh(Rss,Ru)
-// Rxx=vrmaxuh(Rss,Ru)
-// Vector reduce maximum words
-// Rxx=vrmaxuw(Rss,Ru)
-// Rxx=vrmaxw(Rss,Ru)
-// Vector minimum bytes
-// Rdd=vminb(Rtt,Rss)
-// Vector reduce minimum halfwords
-// Rxx=vrminh(Rss,Ru)
-// Rxx=vrminuh(Rss,Ru)
-// Vector reduce minimum words
-// Rxx=vrminuw(Rss,Ru)
-// Rxx=vrminw(Rss,Ru)
-// Vector subtract bytes
-// Rdd=vsubb(Rss,Rtt)
-
-//===----------------------------------------------------------------------===//
-// XTYPE/ALU -
-//===----------------------------------------------------------------------===//
-
-//===----------------------------------------------------------------------===//
-// XTYPE/BIT +
-//===----------------------------------------------------------------------===//
-
-// Bit reverse
-def S2_brevp : T_S2op_3 <"brev", 0b11, 0b110>;
-
-// Bit count
-def S2_ct0p : T_COUNT_LEADING_64<"ct0", 0b111, 0b010>;
-def S2_ct1p : T_COUNT_LEADING_64<"ct1", 0b111, 0b100>;
-def S4_clbpnorm : T_COUNT_LEADING_64<"normamt", 0b011, 0b000>;
-
-let hasSideEffects = 0, hasNewValue = 1 in
-def S4_clbaddi : SInst<(outs IntRegs:$Rd), (ins IntRegs:$Rs, s6_0Imm:$s6),
- "$Rd = add(clb($Rs), #$s6)", [], "", S_2op_tc_2_SLOT23> {
- bits<5> Rs;
- bits<5> Rd;
- bits<6> s6;
- let IClass = 0b1000;
- let Inst{27-24} = 0b1100;
- let Inst{23-21} = 0b001;
- let Inst{20-16} = Rs;
- let Inst{13-8} = s6;
- let Inst{7-5} = 0b000;
- let Inst{4-0} = Rd;
-}
-
-let hasSideEffects = 0, hasNewValue = 1 in
-def S4_clbpaddi : SInst<(outs IntRegs:$Rd), (ins DoubleRegs:$Rs, s6_0Imm:$s6),
- "$Rd = add(clb($Rs), #$s6)", [], "", S_2op_tc_2_SLOT23> {
- bits<5> Rs;
- bits<5> Rd;
- bits<6> s6;
- let IClass = 0b1000;
- let Inst{27-24} = 0b1000;
- let Inst{23-21} = 0b011;
- let Inst{20-16} = Rs;
- let Inst{13-8} = s6;
- let Inst{7-5} = 0b010;
- let Inst{4-0} = Rd;
-}
-
-
-// Bit test/set/clear
-def S4_ntstbit_i : T_TEST_BIT_IMM<"!tstbit", 0b001>;
-def S4_ntstbit_r : T_TEST_BIT_REG<"!tstbit", 1>;
-
-def C4_nbitsset : T_TEST_BITS_REG<"!bitsset", 0b01, 1>;
-def C4_nbitsclr : T_TEST_BITS_REG<"!bitsclr", 0b10, 1>;
-def C4_nbitsclri : T_TEST_BITS_IMM<"!bitsclr", 0b10, 1>;
-
-//===----------------------------------------------------------------------===//
-// XTYPE/BIT -
-//===----------------------------------------------------------------------===//
-
-//===----------------------------------------------------------------------===//
-// XTYPE/MPY +
-//===----------------------------------------------------------------------===//
-
-// Rd=add(#u6,mpyi(Rs,#U6)) -- Multiply by immed and add immed.
-
-let hasNewValue = 1, isExtendable = 1, opExtentBits = 6, opExtendable = 1 in
-def M4_mpyri_addi : MInst<(outs IntRegs:$Rd),
- (ins u6_0Ext:$u6, IntRegs:$Rs, u6_0Imm:$U6),
- "$Rd = add(#$u6, mpyi($Rs, #$U6))" , [],"",ALU64_tc_3x_SLOT23> {
- bits<5> Rd;
- bits<6> u6;
- bits<5> Rs;
- bits<6> U6;
-
- let IClass = 0b1101;
-
- let Inst{27-24} = 0b1000;
- let Inst{23} = U6{5};
- let Inst{22-21} = u6{5-4};
- let Inst{20-16} = Rs;
- let Inst{13} = u6{3};
- let Inst{12-8} = Rd;
- let Inst{7-5} = u6{2-0};
- let Inst{4-0} = U6{4-0};
- }
-
-// Rd=add(#u6,mpyi(Rs,Rt))
-let CextOpcode = "ADD_MPY", InputType = "imm", hasNewValue = 1,
- isExtendable = 1, opExtentBits = 6, opExtendable = 1 in
-def M4_mpyrr_addi : MInst <(outs IntRegs:$Rd),
- (ins u6_0Ext:$u6, IntRegs:$Rs, IntRegs:$Rt),
- "$Rd = add(#$u6, mpyi($Rs, $Rt))" , [], "", ALU64_tc_3x_SLOT23>, ImmRegRel {
- bits<5> Rd;
- bits<6> u6;
- bits<5> Rs;
- bits<5> Rt;
-
- let IClass = 0b1101;
-
- let Inst{27-23} = 0b01110;
- let Inst{22-21} = u6{5-4};
- let Inst{20-16} = Rs;
- let Inst{13} = u6{3};
- let Inst{12-8} = Rt;
- let Inst{7-5} = u6{2-0};
- let Inst{4-0} = Rd;
- }
-
-let hasNewValue = 1 in
-class T_AddMpy <bit MajOp, PatLeaf ImmPred, dag ins>
- : ALU64Inst <(outs IntRegs:$dst), ins,
- "$dst = add($src1, mpyi("#!if(MajOp,"$src3, #$src2))",
- "#$src2, $src3))"), [],
- "", ALU64_tc_3x_SLOT23> {
- bits<5> dst;
- bits<5> src1;
- bits<8> src2;
- bits<5> src3;
-
- let IClass = 0b1101;
-
- bits<6> ImmValue = !if(MajOp, src2{5-0}, src2{7-2});
-
- let Inst{27-24} = 0b1111;
- let Inst{23} = MajOp;
- let Inst{22-21} = ImmValue{5-4};
- let Inst{20-16} = src3;
- let Inst{13} = ImmValue{3};
- let Inst{12-8} = dst;
- let Inst{7-5} = ImmValue{2-0};
- let Inst{4-0} = src1;
- }
-
-def M4_mpyri_addr_u2 : T_AddMpy<0b0, u6_2ImmPred,
- (ins IntRegs:$src1, u6_2Imm:$src2, IntRegs:$src3)>;
-
-let isExtendable = 1, opExtentBits = 6, opExtendable = 3,
- CextOpcode = "ADD_MPY", InputType = "imm" in
-def M4_mpyri_addr : T_AddMpy<0b1, u32_0ImmPred,
- (ins IntRegs:$src1, IntRegs:$src3, u6_0Ext:$src2)>, ImmRegRel;
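-
-// A worked example of the ImmValue scaling above (illustrative only): for
-// M4_mpyri_addr_u2 the immediate operand is u6_2Imm, so MajOp = 0 selects
-// src2{7-2} and an assembly immediate of #24 is encoded as 24 >> 2 = 6.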
-
-// Rx=add(Ru,mpyi(Rx,Rs))
-let CextOpcode = "ADD_MPY", InputType = "reg", hasNewValue = 1 in
-def M4_mpyrr_addr: MInst_acc <(outs IntRegs:$Rx),
- (ins IntRegs:$Ru, IntRegs:$_src_, IntRegs:$Rs),
- "$Rx = add($Ru, mpyi($_src_, $Rs))", [],
- "$_src_ = $Rx", M_tc_3x_SLOT23>, ImmRegRel {
- bits<5> Rx;
- bits<5> Ru;
- bits<5> Rs;
-
- let IClass = 0b1110;
-
- let Inst{27-21} = 0b0011000;
- let Inst{12-8} = Rx;
- let Inst{4-0} = Ru;
- let Inst{20-16} = Rs;
- }
-
-
-// Vector reduce multiply word by signed half (32x16)
-//Rdd=vrmpyweh(Rss,Rtt)[:<<1]
-def M4_vrmpyeh_s0 : T_M2_vmpy<"vrmpyweh", 0b010, 0b100, 0, 0, 0>;
-def M4_vrmpyeh_s1 : T_M2_vmpy<"vrmpyweh", 0b110, 0b100, 1, 0, 0>;
-
-//Rdd=vrmpywoh(Rss,Rtt)[:<<1]
-def M4_vrmpyoh_s0 : T_M2_vmpy<"vrmpywoh", 0b001, 0b010, 0, 0, 0>;
-def M4_vrmpyoh_s1 : T_M2_vmpy<"vrmpywoh", 0b101, 0b010, 1, 0, 0>;
-
-//Rxx+=vrmpyweh(Rss,Rtt)[:<<1]
-def M4_vrmpyeh_acc_s0: T_M2_vmpy_acc<"vrmpyweh", 0b001, 0b110, 0, 0>;
-def M4_vrmpyeh_acc_s1: T_M2_vmpy_acc<"vrmpyweh", 0b101, 0b110, 1, 0>;
-
-//Rxx+=vrmpywoh(Rss,Rtt)[:<<1]
-def M4_vrmpyoh_acc_s0: T_M2_vmpy_acc<"vrmpywoh", 0b011, 0b110, 0, 0>;
-def M4_vrmpyoh_acc_s1: T_M2_vmpy_acc<"vrmpywoh", 0b111, 0b110, 1, 0>;
-
-// Vector multiply halfwords, signed by unsigned
-// Rdd=vmpyhsu(Rs,Rt)[:<<1]:sat
-def M2_vmpy2su_s0 : T_XTYPE_mpy64 < "vmpyhsu", 0b000, 0b111, 1, 0, 0>;
-def M2_vmpy2su_s1 : T_XTYPE_mpy64 < "vmpyhsu", 0b100, 0b111, 1, 1, 0>;
-
-// Rxx+=vmpyhsu(Rs,Rt)[:<<1]:sat
-def M2_vmac2su_s0 : T_XTYPE_mpy64_acc < "vmpyhsu", "+", 0b011, 0b101, 1, 0, 0>;
-def M2_vmac2su_s1 : T_XTYPE_mpy64_acc < "vmpyhsu", "+", 0b111, 0b101, 1, 1, 0>;
-
-// Vector polynomial multiply halfwords
-// Rdd=vpmpyh(Rs,Rt)
-def M4_vpmpyh : T_XTYPE_mpy64 < "vpmpyh", 0b110, 0b111, 0, 0, 0>;
-
-// Rxx^=vpmpyh(Rs,Rt)
-def M4_vpmpyh_acc : T_XTYPE_mpy64_acc < "vpmpyh", "^", 0b101, 0b111, 0, 0, 0>;
-
-// Polynomial multiply words
-// Rdd=pmpyw(Rs,Rt)
-def M4_pmpyw : T_XTYPE_mpy64 < "pmpyw", 0b010, 0b111, 0, 0, 0>;
-
-// Rxx^=pmpyw(Rs,Rt)
-def M4_pmpyw_acc : T_XTYPE_mpy64_acc < "pmpyw", "^", 0b001, 0b111, 0, 0, 0>;
-
-//===----------------------------------------------------------------------===//
-// XTYPE/MPY -
-//===----------------------------------------------------------------------===//
-
-//===----------------------------------------------------------------------===//
-// ALU64/Vector compare
-//===----------------------------------------------------------------------===//
-//===----------------------------------------------------------------------===//
-// Template class for vector compare
-//===----------------------------------------------------------------------===//
-
-let hasSideEffects = 0 in
-class T_vcmpImm <string Str, bits<2> cmpOp, bits<2> minOp, Operand ImmOprnd>
- : ALU64_rr <(outs PredRegs:$Pd),
- (ins DoubleRegs:$Rss, ImmOprnd:$Imm),
- "$Pd = "#Str#"($Rss, #$Imm)",
- [], "", ALU64_tc_2early_SLOT23> {
- bits<2> Pd;
- bits<5> Rss;
- bits<32> Imm;
- bits<8> ImmBits;
- let ImmBits{6-0} = Imm{6-0};
- let ImmBits{7} = !if (!eq(cmpOp,0b10), 0b0, Imm{7}); // 0 for vcmp[bhw].gtu
-
- let IClass = 0b1101;
-
- let Inst{27-24} = 0b1100;
- let Inst{22-21} = cmpOp;
- let Inst{20-16} = Rss;
- let Inst{12-5} = ImmBits;
- let Inst{4-3} = minOp;
- let Inst{1-0} = Pd;
- }
-
-// Vector compare bytes
-def A4_vcmpbgt : T_vcmp <"vcmpb.gt", 0b1010>;
-
-let AsmString = "$Pd = any8(vcmpb.eq($Rss, $Rtt))" in
-def A4_vcmpbeq_any : T_vcmp <"any8(vcmpb.eq", 0b1000>;
-
-def A4_vcmpbeqi : T_vcmpImm <"vcmpb.eq", 0b00, 0b00, u8_0Imm>;
-def A4_vcmpbgti : T_vcmpImm <"vcmpb.gt", 0b01, 0b00, s8_0Imm>;
-def A4_vcmpbgtui : T_vcmpImm <"vcmpb.gtu", 0b10, 0b00, u7_0Imm>;
-
-// Vector compare halfwords
-def A4_vcmpheqi : T_vcmpImm <"vcmph.eq", 0b00, 0b01, s8_0Imm>;
-def A4_vcmphgti : T_vcmpImm <"vcmph.gt", 0b01, 0b01, s8_0Imm>;
-def A4_vcmphgtui : T_vcmpImm <"vcmph.gtu", 0b10, 0b01, u7_0Imm>;
-
-// Vector compare words
-def A4_vcmpweqi : T_vcmpImm <"vcmpw.eq", 0b00, 0b10, s8_0Imm>;
-def A4_vcmpwgti : T_vcmpImm <"vcmpw.gt", 0b01, 0b10, s8_0Imm>;
-def A4_vcmpwgtui : T_vcmpImm <"vcmpw.gtu", 0b10, 0b10, u7_0Imm>;
-
-//===----------------------------------------------------------------------===//
-// XTYPE/SHIFT +
-//===----------------------------------------------------------------------===//
-// Shift by immediate and accumulate/logical.
-// Rx=add(#u8,asl(Rx,#U5)) Rx=add(#u8,lsr(Rx,#U5))
-// Rx=sub(#u8,asl(Rx,#U5)) Rx=sub(#u8,lsr(Rx,#U5))
-// Rx=and(#u8,asl(Rx,#U5)) Rx=and(#u8,lsr(Rx,#U5))
-// Rx=or(#u8,asl(Rx,#U5)) Rx=or(#u8,lsr(Rx,#U5))
-let isExtendable = 1, opExtendable = 1, isExtentSigned = 0, opExtentBits = 8,
- hasNewValue = 1, opNewValue = 0 in
-class T_S4_ShiftOperate<string MnOp, string MnSh, bit asl_lsr,
- bits<2> MajOp, InstrItinClass Itin>
- : MInst_acc<(outs IntRegs:$Rd), (ins u8_0Ext:$u8, IntRegs:$Rx, u5_0Imm:$U5),
- "$Rd = "#MnOp#"(#$u8, "#MnSh#"($Rx, #$U5))",
- [], "$Rd = $Rx", Itin> {
-
- bits<5> Rd;
- bits<8> u8;
- bits<5> Rx;
- bits<5> U5;
-
- let IClass = 0b1101;
- let Inst{27-24} = 0b1110;
- let Inst{23-21} = u8{7-5};
- let Inst{20-16} = Rd;
- let Inst{13} = u8{4};
- let Inst{12-8} = U5;
- let Inst{7-5} = u8{3-1};
- let Inst{4} = asl_lsr;
- let Inst{3} = u8{0};
- let Inst{2-1} = MajOp;
-}
-
-multiclass T_ShiftOperate<string mnemonic, bits<2> MajOp, InstrItinClass Itin> {
- def _asl_ri : T_S4_ShiftOperate<mnemonic, "asl", 0, MajOp, Itin>;
- def _lsr_ri : T_S4_ShiftOperate<mnemonic, "lsr", 1, MajOp, Itin>;
-}
-
-defm S4_addi : T_ShiftOperate<"add", 0b10, ALU64_tc_2_SLOT23>;
-defm S4_andi : T_ShiftOperate<"and", 0b00, ALU64_tc_2_SLOT23>;
-defm S4_ori : T_ShiftOperate<"or", 0b01, ALU64_tc_1_SLOT23>;
-defm S4_subi : T_ShiftOperate<"sub", 0b11, ALU64_tc_1_SLOT23>;
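-
-// For reference, each defm above produces an _asl_ri and an _lsr_ri def;
-// e.g. S4_addi expands to (illustrative sketch):
-//   S4_addi_asl_ri: "$Rd = add(#$u8, asl($Rx, #$U5))"
-//   S4_addi_lsr_ri: "$Rd = add(#$u8, lsr($Rx, #$U5))"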
-
-// Vector conditional negate
-// Rdd=vcnegh(Rss,Rt)
-let Defs = [USR_OVF], Itinerary = S_3op_tc_2_SLOT23 in
-def S2_vcnegh : T_S3op_shiftVect < "vcnegh", 0b11, 0b01>;
-
-// Rd=[cround|round](Rs,Rt)
-let hasNewValue = 1, Itinerary = S_3op_tc_2_SLOT23 in {
- def A4_cround_rr : T_S3op_3 < "cround", IntRegs, 0b11, 0b00>;
- def A4_round_rr : T_S3op_3 < "round", IntRegs, 0b11, 0b10>;
-}
-
-// Rd=round(Rs,Rt):sat
-let hasNewValue = 1, Defs = [USR_OVF], Itinerary = S_3op_tc_2_SLOT23 in
-def A4_round_rr_sat : T_S3op_3 < "round", IntRegs, 0b11, 0b11, 1>;
-
-// Rd=[cmpyiwh|cmpyrwh](Rss,Rt):<<1:rnd:sat
-let Defs = [USR_OVF], Itinerary = S_3op_tc_3x_SLOT23 in {
- def M4_cmpyi_wh : T_S3op_8<"cmpyiwh", 0b100, 1, 1, 1>;
- def M4_cmpyr_wh : T_S3op_8<"cmpyrwh", 0b110, 1, 1, 1>;
-}
-
-// Rdd=[add|sub](Rss,Rtt,Px):carry
-let isPredicateLate = 1, hasSideEffects = 0 in
-class T_S3op_carry <string mnemonic, bits<3> MajOp>
- : SInst < (outs DoubleRegs:$Rdd, PredRegs:$Px),
- (ins DoubleRegs:$Rss, DoubleRegs:$Rtt, PredRegs:$Pu),
- "$Rdd = "#mnemonic#"($Rss, $Rtt, $Pu):carry",
- [], "$Px = $Pu", S_3op_tc_1_SLOT23 > {
- bits<5> Rdd;
- bits<5> Rss;
- bits<5> Rtt;
- bits<2> Pu;
-
- let IClass = 0b1100;
-
- let Inst{27-24} = 0b0010;
- let Inst{23-21} = MajOp;
- let Inst{20-16} = Rss;
- let Inst{12-8} = Rtt;
- let Inst{6-5} = Pu;
- let Inst{4-0} = Rdd;
- }
-
-def A4_addp_c : T_S3op_carry < "add", 0b110 >;
-def A4_subp_c : T_S3op_carry < "sub", 0b111 >;
-
-let Itinerary = S_3op_tc_3_SLOT23, hasSideEffects = 0 in
-class T_S3op_6 <string mnemonic, bits<3> MinOp, bit isUnsigned>
- : SInst <(outs DoubleRegs:$Rxx),
- (ins DoubleRegs:$dst2, DoubleRegs:$Rss, IntRegs:$Ru),
- "$Rxx = "#mnemonic#"($Rss, $Ru)" ,
- [] , "$dst2 = $Rxx"> {
- bits<5> Rxx;
- bits<5> Rss;
- bits<5> Ru;
-
- let IClass = 0b1100;
-
- let Inst{27-21} = 0b1011001;
- let Inst{20-16} = Rss;
- let Inst{13} = isUnsigned;
- let Inst{12-8} = Rxx;
- let Inst{7-5} = MinOp;
- let Inst{4-0} = Ru;
- }
-
-// Vector reduce maximum halfwords
-// Rxx=vrmax[u]h(Rss,Ru)
-def A4_vrmaxh : T_S3op_6 < "vrmaxh", 0b001, 0>;
-def A4_vrmaxuh : T_S3op_6 < "vrmaxuh", 0b001, 1>;
-
-// Vector reduce maximum words
-// Rxx=vrmax[u]w(Rss,Ru)
-def A4_vrmaxw : T_S3op_6 < "vrmaxw", 0b010, 0>;
-def A4_vrmaxuw : T_S3op_6 < "vrmaxuw", 0b010, 1>;
-
-// Vector reduce minimum halfwords
-// Rxx=vrmin[u]h(Rss,Ru)
-def A4_vrminh : T_S3op_6 < "vrminh", 0b101, 0>;
-def A4_vrminuh : T_S3op_6 < "vrminuh", 0b101, 1>;
-
-// Vector reduce minimum words
-// Rxx=vrmin[u]w(Rss,Ru)
-def A4_vrminw : T_S3op_6 < "vrminw", 0b110, 0>;
-def A4_vrminuw : T_S3op_6 < "vrminuw", 0b110, 1>;
-
-// Shift an immediate left by register amount.
-let hasNewValue = 1, hasSideEffects = 0 in
-def S4_lsli: SInst <(outs IntRegs:$Rd), (ins s6_0Imm:$s6, IntRegs:$Rt),
- "$Rd = lsl(#$s6, $Rt)" , [], "", S_3op_tc_1_SLOT23> {
- bits<5> Rd;
- bits<6> s6;
- bits<5> Rt;
-
- let IClass = 0b1100;
-
- let Inst{27-22} = 0b011010;
- let Inst{20-16} = s6{5-1};
- let Inst{12-8} = Rt;
- let Inst{7-6} = 0b11;
- let Inst{4-0} = Rd;
- let Inst{5} = s6{0};
- }
-
-//===----------------------------------------------------------------------===//
-// XTYPE/SHIFT -
-//===----------------------------------------------------------------------===//
-
-//===----------------------------------------------------------------------===//
-// MEMOP
-//===----------------------------------------------------------------------===//
-
-
-//===----------------------------------------------------------------------===//
-// Template class for MemOp instructions with the register value.
-//===----------------------------------------------------------------------===//
-class MemOp_rr_base <string opc, bits<2> opcBits, Operand ImmOp,
- string memOp, bits<2> memOpBits> :
- MEMInst_V4<(outs),
- (ins IntRegs:$base, ImmOp:$offset, IntRegs:$delta),
- opc#"($base+#$offset)"#memOp#"$delta",
- []>,
- Requires<[UseMEMOP]> {
-
- bits<5> base;
- bits<5> delta;
- bits<32> offset;
-  bits<6> offsetBits; // memb: u6:0, memh: u6:1, memw: u6:2
-
- let offsetBits = !if (!eq(opcBits, 0b00), offset{5-0},
- !if (!eq(opcBits, 0b01), offset{6-1},
- !if (!eq(opcBits, 0b10), offset{7-2},0)));
-
- let opExtentAlign = opcBits;
- let IClass = 0b0011;
- let Inst{27-24} = 0b1110;
- let Inst{22-21} = opcBits;
- let Inst{20-16} = base;
- let Inst{13} = 0b0;
- let Inst{12-7} = offsetBits;
- let Inst{6-5} = memOpBits;
- let Inst{4-0} = delta;
-}
-
-//===----------------------------------------------------------------------===//
-// Template class for MemOp instructions with the immediate value.
-//===----------------------------------------------------------------------===//
-class MemOp_ri_base <string opc, bits<2> opcBits, Operand ImmOp,
- string memOp, bits<2> memOpBits> :
- MEMInst_V4 <(outs),
- (ins IntRegs:$base, ImmOp:$offset, u5_0Imm:$delta),
- opc#"($base+#$offset)"#memOp#"#$delta"
- #!if(memOpBits{1},")", ""), // clrbit, setbit - include ')'
- []>,
- Requires<[UseMEMOP]> {
-
- bits<5> base;
- bits<5> delta;
- bits<32> offset;
-  bits<6> offsetBits; // memb: u6:0, memh: u6:1, memw: u6:2
-
- let offsetBits = !if (!eq(opcBits, 0b00), offset{5-0},
- !if (!eq(opcBits, 0b01), offset{6-1},
- !if (!eq(opcBits, 0b10), offset{7-2},0)));
-
- let opExtentAlign = opcBits;
- let IClass = 0b0011;
- let Inst{27-24} = 0b1111;
- let Inst{22-21} = opcBits;
- let Inst{20-16} = base;
- let Inst{13} = 0b0;
- let Inst{12-7} = offsetBits;
- let Inst{6-5} = memOpBits;
- let Inst{4-0} = delta;
-}
-
-// multiclass to define MemOp instructions with register operand.
-multiclass MemOp_rr<string opc, bits<2> opcBits, Operand ImmOp> {
- def L4_add#NAME : MemOp_rr_base <opc, opcBits, ImmOp, " += ", 0b00>; // add
- def L4_sub#NAME : MemOp_rr_base <opc, opcBits, ImmOp, " -= ", 0b01>; // sub
- def L4_and#NAME : MemOp_rr_base <opc, opcBits, ImmOp, " &= ", 0b10>; // and
- def L4_or#NAME : MemOp_rr_base <opc, opcBits, ImmOp, " |= ", 0b11>; // or
-}
-
-// multiclass to define MemOp instructions with immediate Operand.
-multiclass MemOp_ri<string opc, bits<2> opcBits, Operand ImmOp> {
- def L4_iadd#NAME : MemOp_ri_base <opc, opcBits, ImmOp, " += ", 0b00 >;
- def L4_isub#NAME : MemOp_ri_base <opc, opcBits, ImmOp, " -= ", 0b01 >;
- def L4_iand#NAME : MemOp_ri_base<opc, opcBits, ImmOp, " = clrbit(", 0b10>;
- def L4_ior#NAME : MemOp_ri_base<opc, opcBits, ImmOp, " = setbit(", 0b11>;
-}
-
-multiclass MemOp_base <string opc, bits<2> opcBits, Operand ImmOp> {
- defm _#NAME : MemOp_rr <opc, opcBits, ImmOp>;
- defm _#NAME : MemOp_ri <opc, opcBits, ImmOp>;
-}
-
-// Define MemOp instructions.
-let isExtendable = 1, opExtendable = 1, isExtentSigned = 0 in {
- let opExtentBits = 6, accessSize = ByteAccess in
- defm memopb_io : MemOp_base <"memb", 0b00, u6_0Ext>;
-
- let opExtentBits = 7, accessSize = HalfWordAccess in
- defm memoph_io : MemOp_base <"memh", 0b01, u6_1Ext>;
-
- let opExtentBits = 8, accessSize = WordAccess in
- defm memopw_io : MemOp_base <"memw", 0b10, u6_2Ext>;
-}
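-
-// A sketch of what the word variant expands to (the names follow from the
-// multiclasses above; illustrative only):
-//   L4_add_memopw_io:  "memw($base+#$offset) += $delta"
-//   L4_iand_memopw_io: "memw($base+#$offset) = clrbit(#$delta)"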
-
-
-//===----------------------------------------------------------------------===//
-// XTYPE/PRED +
-//===----------------------------------------------------------------------===//
-
-// Hexagon V4 only supports the EQ/GT/GTU flavors of the byte/half compare
-// instructions; GE/GEU/LT/LTU/LE/LEU have no hardware support. The compiler
-// can still synthesize the missing flavors by combining the supported
-// compares, e.g. by swapping the operands or negating the result.
-
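-// For example (illustrative only): cmp.lt(Rs, Rt) can be emitted as
-// cmp.gt(Rt, Rs) with the operands swapped, and cmp.le(Rs, Rt) as the
-// negated compare !cmp.gt(Rs, Rt), which is what C4_cmpltei below provides
-// for the immediate case.
-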
-// The following instruction is not extended, since extending it would
-// generate incorrect code for negative numbers.
-// Pd=cmpb.eq(Rs,#u8)
-
-// p=!cmp.eq(r1,#s10)
-def C4_cmpneqi : T_CMP <"cmp.eq", 0b00, 1, s10_0Ext>;
-def C4_cmpltei : T_CMP <"cmp.gt", 0b01, 1, s10_0Ext>;
-def C4_cmplteui : T_CMP <"cmp.gtu", 0b10, 1, u9_0Ext>;
-
-//===----------------------------------------------------------------------===//
-// XTYPE/PRED -
-//===----------------------------------------------------------------------===//
-
-//===----------------------------------------------------------------------===//
-// Multiclass for DeallocReturn
-//===----------------------------------------------------------------------===//
-class L4_RETURN<string mnemonic, bit isNot, bit isPredNew, bit isTak>
- : LD0Inst<(outs), (ins PredRegs:$src),
- !if(isNot, "if (!$src", "if ($src")#
- !if(isPredNew, ".new) ", ") ")#mnemonic#
-            !if(isPredNew, !if(isTak,":t", ":nt"),""),
- [], "", LD_tc_3or4stall_SLOT0> {
-
- bits<2> src;
- let BaseOpcode = "L4_RETURN";
- let isPredicatedFalse = isNot;
- let isPredicatedNew = isPredNew;
- let isTaken = isTak;
- let IClass = 0b1001;
-
- let Inst{27-16} = 0b011000011110;
-
- let Inst{13} = isNot;
- let Inst{12} = isTak;
- let Inst{11} = isPredNew;
- let Inst{10} = 0b0;
- let Inst{9-8} = src;
- let Inst{4-0} = 0b11110;
- }
-
-// Produce all predicated forms: p, !p, p.new, !p.new, :t, :nt
-multiclass L4_RETURN_PRED<string mnemonic, bit PredNot> {
- let isPredicated = 1 in {
- def _#NAME# : L4_RETURN <mnemonic, PredNot, 0, 1>;
- def _#NAME#new_pnt : L4_RETURN <mnemonic, PredNot, 1, 0>;
- def _#NAME#new_pt : L4_RETURN <mnemonic, PredNot, 1, 1>;
- }
-}
-
-multiclass LD_MISC_L4_RETURN<string mnemonic> {
- let isBarrier = 1, isPredicable = 1 in
- def NAME : LD0Inst <(outs), (ins), mnemonic, [], "",
- LD_tc_3or4stall_SLOT0> {
- let BaseOpcode = "L4_RETURN";
- let IClass = 0b1001;
- let Inst{27-16} = 0b011000011110;
- let Inst{13-10} = 0b0000;
- let Inst{4-0} = 0b11110;
- }
- defm t : L4_RETURN_PRED<mnemonic, 0 >;
- defm f : L4_RETURN_PRED<mnemonic, 1 >;
-}
-
-let isReturn = 1, isTerminator = 1,
- Defs = [R29, R30, R31, PC], Uses = [R30], hasSideEffects = 0 in
-defm L4_return: LD_MISC_L4_RETURN <"dealloc_return">, PredNewRel;
-
-// Restore registers and dealloc return function call.
-let isCall = 1, isBarrier = 1, isReturn = 1, isTerminator = 1,
- Defs = [R29, R30, R31, PC], isPredicable = 0, isAsmParserOnly = 1 in {
- def RESTORE_DEALLOC_RET_JMP_V4 : T_JMP<"">;
-
- let isExtended = 1, opExtendable = 0 in
- def RESTORE_DEALLOC_RET_JMP_V4_EXT : T_JMP<"">;
-
- let Defs = [R14, R15, R28, R29, R30, R31, PC] in {
- def RESTORE_DEALLOC_RET_JMP_V4_PIC : T_JMP<"">;
-
- let isExtended = 1, opExtendable = 0 in
- def RESTORE_DEALLOC_RET_JMP_V4_EXT_PIC : T_JMP<"">;
- }
-}
-
-// Restore registers and dealloc frame before a tail call.
-let isCall = 1, Defs = [R29, R30, R31, PC], isAsmParserOnly = 1 in {
- def RESTORE_DEALLOC_BEFORE_TAILCALL_V4 : T_Call<0, "">, PredRel;
-
- let isExtended = 1, opExtendable = 0 in
- def RESTORE_DEALLOC_BEFORE_TAILCALL_V4_EXT : T_Call<0, "">, PredRel;
-
- let Defs = [R14, R15, R28, R29, R30, R31, PC] in {
- def RESTORE_DEALLOC_BEFORE_TAILCALL_V4_PIC : T_Call<0, "">, PredRel;
-
- let isExtended = 1, opExtendable = 0 in
- def RESTORE_DEALLOC_BEFORE_TAILCALL_V4_EXT_PIC : T_Call<0, "">, PredRel;
- }
-}
-
-// Save registers function call.
-let isCall = 1, Uses = [R29, R31], isAsmParserOnly = 1 in {
- def SAVE_REGISTERS_CALL_V4 : T_Call<0, "">, PredRel;
-
- let isExtended = 1, opExtendable = 0 in
- def SAVE_REGISTERS_CALL_V4_EXT : T_Call<0, "">, PredRel;
-
- let Defs = [P0] in
- def SAVE_REGISTERS_CALL_V4STK : T_Call<0, "">, PredRel;
-
- let Defs = [P0], isExtended = 1, opExtendable = 0 in
- def SAVE_REGISTERS_CALL_V4STK_EXT : T_Call<0, "">, PredRel;
-
- let Defs = [R14, R15, R28] in
- def SAVE_REGISTERS_CALL_V4_PIC : T_Call<0, "">, PredRel;
-
- let Defs = [R14, R15, R28], isExtended = 1, opExtendable = 0 in
- def SAVE_REGISTERS_CALL_V4_EXT_PIC : T_Call<0, "">, PredRel;
-
- let Defs = [R14, R15, R28, P0] in
- def SAVE_REGISTERS_CALL_V4STK_PIC : T_Call<0, "">, PredRel;
-
- let Defs = [R14, R15, R28, P0], isExtended = 1, opExtendable = 0 in
- def SAVE_REGISTERS_CALL_V4STK_EXT_PIC : T_Call<0, "">, PredRel;
-}
-
-//===----------------------------------------------------------------------===//
-// Template class for non-predicated store instructions with
-// GP-Relative or absolute addressing.
-//===----------------------------------------------------------------------===//
-let hasSideEffects = 0, isPredicable = 1 in
-class T_StoreAbsGP <string mnemonic, RegisterClass RC, Operand ImmOp,
- bits<2>MajOp, bit isAbs, bit isHalf>
- : STInst<(outs), (ins ImmOp:$addr, RC:$src),
- mnemonic # "(#$addr) = $src"#!if(isHalf, ".h",""),
- [], "", V2LDST_tc_st_SLOT01> {
- bits<19> addr;
- bits<5> src;
- bits<16> offsetBits;
-
- string ImmOpStr = !cast<string>(ImmOp);
- let offsetBits = !if (!eq(ImmOpStr, "u16_3Imm"), addr{18-3},
- !if (!eq(ImmOpStr, "u16_2Imm"), addr{17-2},
- !if (!eq(ImmOpStr, "u16_1Imm"), addr{16-1},
- /* u16_0Imm */ addr{15-0})));
- // Store upper-half and store doubleword cannot be NV.
- let isNVStorable = !if (!eq(mnemonic, "memd"), 0, !if(isHalf,0,1));
- let Uses = !if (isAbs, [], [GP]);
-
- let IClass = 0b0100;
- let Inst{27} = 1;
- let Inst{26-25} = offsetBits{15-14};
- let Inst{24} = 0b0;
- let Inst{23-22} = MajOp;
- let Inst{21} = isHalf;
- let Inst{20-16} = offsetBits{13-9};
- let Inst{13} = offsetBits{8};
- let Inst{12-8} = src;
- let Inst{7-0} = offsetBits{7-0};
- }
-
-//===----------------------------------------------------------------------===//
-// Template class for predicated store instructions with
-// GP-Relative or absolute addressing.
-//===----------------------------------------------------------------------===//
-let hasSideEffects = 0, isPredicated = 1, opExtentBits = 6, opExtendable = 1 in
-class T_StoreAbs_Pred <string mnemonic, RegisterClass RC, bits<2> MajOp,
- bit isHalf, bit isNot, bit isNew>
- : STInst<(outs), (ins PredRegs:$src1, u32_0MustExt:$absaddr, RC: $src2),
- !if(isNot, "if (!$src1", "if ($src1")#!if(isNew, ".new) ",
- ") ")#mnemonic#"(#$absaddr) = $src2"#!if(isHalf, ".h",""),
- [], "", ST_tc_st_SLOT01>, AddrModeRel {
- bits<2> src1;
- bits<6> absaddr;
- bits<5> src2;
-
- let isPredicatedNew = isNew;
- let isPredicatedFalse = isNot;
- // Store upper-half and store doubleword cannot be NV.
- let isNVStorable = !if (!eq(mnemonic, "memd"), 0, !if(isHalf,0,1));
-
- let IClass = 0b1010;
-
- let Inst{27-24} = 0b1111;
- let Inst{23-22} = MajOp;
- let Inst{21} = isHalf;
- let Inst{17-16} = absaddr{5-4};
- let Inst{13} = isNew;
- let Inst{12-8} = src2;
- let Inst{7} = 0b1;
- let Inst{6-3} = absaddr{3-0};
- let Inst{2} = isNot;
- let Inst{1-0} = src1;
- }
-
-//===----------------------------------------------------------------------===//
-// Template class for predicated store instructions with absolute addressing.
-//===----------------------------------------------------------------------===//
-class T_StoreAbs <string mnemonic, RegisterClass RC, Operand ImmOp,
- bits<2> MajOp, bit isHalf>
- : T_StoreAbsGP <mnemonic, RC, u32_0MustExt, MajOp, 1, isHalf>,
- AddrModeRel {
- string ImmOpStr = !cast<string>(ImmOp);
- let opExtentBits = !if (!eq(ImmOpStr, "u16_3Imm"), 19,
- !if (!eq(ImmOpStr, "u16_2Imm"), 18,
- !if (!eq(ImmOpStr, "u16_1Imm"), 17,
- /* u16_0Imm */ 16)));
-
- let opExtentAlign = !if (!eq(ImmOpStr, "u16_3Imm"), 3,
- !if (!eq(ImmOpStr, "u16_2Imm"), 2,
- !if (!eq(ImmOpStr, "u16_1Imm"), 1,
- /* u16_0Imm */ 0)));
-}
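-
-// A worked example of the extent computation above (illustrative only):
-// the word store uses u16_2Imm, so a non-extended absolute address must be
-// 4-byte aligned and fit in 18 bits (opExtentBits = 18, opExtentAlign = 2);
-// the 16 encoded bits are addr{17-2}.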
-
-//===----------------------------------------------------------------------===//
-// Multiclass for store instructions with absolute addressing.
-//===----------------------------------------------------------------------===//
-let addrMode = Absolute, isExtended = 1 in
-multiclass ST_Abs<string mnemonic, string CextOp, RegisterClass RC,
- Operand ImmOp, bits<2> MajOp, bit isHalf = 0> {
- let CextOpcode = CextOp, BaseOpcode = CextOp#_abs in {
- let opExtendable = 0, isPredicable = 1 in
- def PS_#NAME#abs : T_StoreAbs <mnemonic, RC, ImmOp, MajOp, isHalf>;
-
- // Predicated
- def S4_p#NAME#t_abs : T_StoreAbs_Pred<mnemonic, RC, MajOp, isHalf, 0, 0>;
- def S4_p#NAME#f_abs : T_StoreAbs_Pred<mnemonic, RC, MajOp, isHalf, 1, 0>;
-
- // .new Predicated
- def S4_p#NAME#tnew_abs : T_StoreAbs_Pred<mnemonic, RC, MajOp, isHalf, 0, 1>;
- def S4_p#NAME#fnew_abs : T_StoreAbs_Pred<mnemonic, RC, MajOp, isHalf, 1, 1>;
- }
-}
-
-//===----------------------------------------------------------------------===//
-// Template class for non-predicated new-value store instructions with
-// GP-Relative or absolute addressing.
-//===----------------------------------------------------------------------===//
-let hasSideEffects = 0, isPredicable = 1, mayStore = 1, isNVStore = 1,
- isNewValue = 1, opNewValue = 1 in
-class T_StoreAbsGP_NV <string mnemonic, Operand ImmOp, bits<2>MajOp>
- : NVInst_V4<(outs), (ins ImmOp:$addr, IntRegs:$src),
- mnemonic #"(#$addr) = $src.new",
- [], "", V2LDST_tc_st_SLOT0> {
- bits<19> addr;
- bits<3> src;
- bits<16> offsetBits;
-
- string ImmOpStr = !cast<string>(ImmOp);
- let offsetBits = !if (!eq(ImmOpStr, "u16_3Imm"), addr{18-3},
- !if (!eq(ImmOpStr, "u16_2Imm"), addr{17-2},
- !if (!eq(ImmOpStr, "u16_1Imm"), addr{16-1},
- /* u16_0Imm */ addr{15-0})));
- let IClass = 0b0100;
-
- let Inst{27} = 1;
- let Inst{26-25} = offsetBits{15-14};
- let Inst{24-21} = 0b0101;
- let Inst{20-16} = offsetBits{13-9};
- let Inst{13} = offsetBits{8};
- let Inst{12-11} = MajOp;
- let Inst{10-8} = src;
- let Inst{7-0} = offsetBits{7-0};
- }
-
-//===----------------------------------------------------------------------===//
-// Template class for predicated new-value store instructions with
-// absolute addressing.
-//===----------------------------------------------------------------------===//
-let hasSideEffects = 0, isPredicated = 1, mayStore = 1, isNVStore = 1,
- isNewValue = 1, opNewValue = 2, opExtentBits = 6, opExtendable = 1 in
-class T_StoreAbs_NV_Pred <string mnemonic, bits<2> MajOp, bit isNot, bit isNew>
- : NVInst_V4<(outs), (ins PredRegs:$src1, u32_0MustExt:$absaddr, IntRegs:$src2),
- !if(isNot, "if (!$src1", "if ($src1")#!if(isNew, ".new) ",
- ") ")#mnemonic#"(#$absaddr) = $src2.new",
- [], "", ST_tc_st_SLOT0>, AddrModeRel {
- bits<2> src1;
- bits<6> absaddr;
- bits<3> src2;
-
- let isPredicatedNew = isNew;
- let isPredicatedFalse = isNot;
-
- let IClass = 0b1010;
-
- let Inst{27-24} = 0b1111;
- let Inst{23-21} = 0b101;
- let Inst{17-16} = absaddr{5-4};
- let Inst{13} = isNew;
- let Inst{12-11} = MajOp;
- let Inst{10-8} = src2;
- let Inst{7} = 0b1;
- let Inst{6-3} = absaddr{3-0};
- let Inst{2} = isNot;
- let Inst{1-0} = src1;
-}
-
-//===----------------------------------------------------------------------===//
-// Template class for non-predicated new-value store instructions with
-// absolute addressing.
-//===----------------------------------------------------------------------===//
-class T_StoreAbs_NV <string mnemonic, Operand ImmOp, bits<2> MajOp>
- : T_StoreAbsGP_NV <mnemonic, u32_0MustExt, MajOp>, AddrModeRel {
-
- string ImmOpStr = !cast<string>(ImmOp);
- let opExtentBits = !if (!eq(ImmOpStr, "u16_3Imm"), 19,
- !if (!eq(ImmOpStr, "u16_2Imm"), 18,
- !if (!eq(ImmOpStr, "u16_1Imm"), 17,
- /* u16_0Imm */ 16)));
-
- let opExtentAlign = !if (!eq(ImmOpStr, "u16_3Imm"), 3,
- !if (!eq(ImmOpStr, "u16_2Imm"), 2,
- !if (!eq(ImmOpStr, "u16_1Imm"), 1,
- /* u16_0Imm */ 0)));
-}
-
-//===----------------------------------------------------------------------===//
-// Multiclass for new-value store instructions with absolute addressing.
-//===----------------------------------------------------------------------===//
-let addrMode = Absolute, isExtended = 1 in
-multiclass ST_Abs_NV <string mnemonic, string CextOp, Operand ImmOp,
- bits<2> MajOp> {
- let CextOpcode = CextOp, BaseOpcode = CextOp#_abs in {
- let opExtendable = 0, isPredicable = 1 in
- def PS_#NAME#newabs : T_StoreAbs_NV <mnemonic, ImmOp, MajOp>;
-
- // Predicated
- def S4_p#NAME#newt_abs : T_StoreAbs_NV_Pred <mnemonic, MajOp, 0, 0>;
- def S4_p#NAME#newf_abs : T_StoreAbs_NV_Pred <mnemonic, MajOp, 1, 0>;
-
- // .new Predicated
- def S4_p#NAME#newtnew_abs : T_StoreAbs_NV_Pred <mnemonic, MajOp, 0, 1>;
- def S4_p#NAME#newfnew_abs : T_StoreAbs_NV_Pred <mnemonic, MajOp, 1, 1>;
- }
-}
-
-//===----------------------------------------------------------------------===//
-// Stores with absolute addressing
-//===----------------------------------------------------------------------===//
-let accessSize = ByteAccess in
-defm storerb : ST_Abs <"memb", "STrib", IntRegs, u16_0Imm, 0b00>,
- ST_Abs_NV <"memb", "STrib", u16_0Imm, 0b00>;
-
-let accessSize = HalfWordAccess in
-defm storerh : ST_Abs <"memh", "STrih", IntRegs, u16_1Imm, 0b01>,
- ST_Abs_NV <"memh", "STrih", u16_1Imm, 0b01>;
-
-let accessSize = WordAccess in
-defm storeri : ST_Abs <"memw", "STriw", IntRegs, u16_2Imm, 0b10>,
- ST_Abs_NV <"memw", "STriw", u16_2Imm, 0b10>;
-
-let isNVStorable = 0, accessSize = DoubleWordAccess in
-defm storerd : ST_Abs <"memd", "STrid", DoubleRegs, u16_3Imm, 0b11>;
-
-let isNVStorable = 0, accessSize = HalfWordAccess in
-defm storerf : ST_Abs <"memh", "STrif", IntRegs, u16_1Imm, 0b01, 1>;
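-
-// A sketch of the defs produced for the byte store above (the names follow
-// from ST_Abs/ST_Abs_NV; illustrative only):
-//   PS_storerbabs, S4_pstorerbt_abs, S4_pstorerbf_abs,
-//   S4_pstorerbtnew_abs, S4_pstorerbfnew_abs, PS_storerbnewabs,
-//   S4_pstorerbnewt_abs, ...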
-
-//===----------------------------------------------------------------------===//
-// GP-relative stores.
-// mem[bhwd](#global)=Rt
-// Once predicated, these instructions map to absolute addressing mode.
-// if ([!]Pv[.new]) mem[bhwd](##global)=Rt
-//===----------------------------------------------------------------------===//
-
-let Uses = [GP], isAsmParserOnly = 1 in
-class T_StoreGP <string mnemonic, string BaseOp, RegisterClass RC,
- Operand ImmOp, bits<2> MajOp, bit isHalf = 0>
- : T_StoreAbsGP <mnemonic, RC, ImmOp, MajOp, 0, isHalf> {
-  // Set BaseOpcode to the same value as the absolute-addressing
-  // instructions so that the non-predicated GP-relative instructions can
-  // be related to their predicated absolute-addressing counterparts.
- let BaseOpcode = BaseOp#_abs;
- }
-
-let Uses = [GP], isAsmParserOnly = 1 in
-multiclass ST_GP <string mnemonic, string BaseOp, Operand ImmOp,
- bits<2> MajOp, bit isHalf = 0> {
-  // Set BaseOpcode to the same value as the absolute-addressing
-  // instructions so that the non-predicated GP-relative instructions can
-  // be related to their predicated absolute-addressing counterparts.
- let BaseOpcode = BaseOp#_abs in {
- def NAME#gp : T_StoreAbsGP <mnemonic, IntRegs, ImmOp, MajOp,
- 0, isHalf>;
- // New-value store
- def NAME#newgp : T_StoreAbsGP_NV <mnemonic, ImmOp, MajOp> ;
- }
-}
-
-let accessSize = ByteAccess in
-defm S2_storerb : ST_GP<"memb", "STrib", u16_0Imm, 0b00>, NewValueRel;
-
-let accessSize = HalfWordAccess in
-defm S2_storerh : ST_GP<"memh", "STrih", u16_1Imm, 0b01>, NewValueRel;
-
-let accessSize = WordAccess in
-defm S2_storeri : ST_GP<"memw", "STriw", u16_2Imm, 0b10>, NewValueRel;
-
-let isNVStorable = 0, accessSize = DoubleWordAccess in
-def S2_storerdgp : T_StoreGP <"memd", "STrid", DoubleRegs,
- u16_3Imm, 0b11>, PredNewRel;
-
-let isNVStorable = 0, accessSize = HalfWordAccess in
-def S2_storerfgp : T_StoreGP <"memh", "STrif", IntRegs,
- u16_1Imm, 0b01, 1>, PredNewRel;
-
-//===----------------------------------------------------------------------===//
-// Template class for non-predicated load instructions with
-// absolute addressing mode.
-//===----------------------------------------------------------------------===//
-let isPredicable = 1, hasSideEffects = 0 in
-class T_LoadAbsGP <string mnemonic, RegisterClass RC, Operand ImmOp,
- bits<3> MajOp>
- : LDInst <(outs RC:$dst), (ins ImmOp:$addr),
- "$dst = "#mnemonic# "(#$addr)",
- [], "", V2LDST_tc_ld_SLOT01> {
- bits<5> dst;
- bits<19> addr;
- bits<16> offsetBits;
-
- string ImmOpStr = !cast<string>(ImmOp);
- let offsetBits = !if (!eq(ImmOpStr, "u16_3Imm"), addr{18-3},
- !if (!eq(ImmOpStr, "u16_2Imm"), addr{17-2},
- !if (!eq(ImmOpStr, "u16_1Imm"), addr{16-1},
- /* u16_0Imm */ addr{15-0})));
-
- let IClass = 0b0100;
-
- let Inst{27} = 0b1;
- let Inst{26-25} = offsetBits{15-14};
- let Inst{24} = 0b1;
- let Inst{23-21} = MajOp;
- let Inst{20-16} = offsetBits{13-9};
- let Inst{13-5} = offsetBits{8-0};
- let Inst{4-0} = dst;
- }
-
-class T_LoadAbs <string mnemonic, RegisterClass RC, Operand ImmOp,
- bits<3> MajOp>
- : T_LoadAbsGP <mnemonic, RC, u32_0MustExt, MajOp>, AddrModeRel {
-
- string ImmOpStr = !cast<string>(ImmOp);
- let opExtentBits = !if (!eq(ImmOpStr, "u16_3Imm"), 19,
- !if (!eq(ImmOpStr, "u16_2Imm"), 18,
- !if (!eq(ImmOpStr, "u16_1Imm"), 17,
- /* u16_0Imm */ 16)));
-
- let opExtentAlign = !if (!eq(ImmOpStr, "u16_3Imm"), 3,
- !if (!eq(ImmOpStr, "u16_2Imm"), 2,
- !if (!eq(ImmOpStr, "u16_1Imm"), 1,
- /* u16_0Imm */ 0)));
- }
-
-//===----------------------------------------------------------------------===//
-// Template class for predicated load instructions with
-// absolute addressing mode.
-//===----------------------------------------------------------------------===//
-let isPredicated = 1, hasSideEffects = 0, hasNewValue = 1, opExtentBits = 6,
- opExtendable = 2 in
-class T_LoadAbs_Pred <string mnemonic, RegisterClass RC, bits<3> MajOp,
- bit isPredNot, bit isPredNew>
- : LDInst <(outs RC:$dst), (ins PredRegs:$src1, u32_0MustExt:$absaddr),
- !if(isPredNot, "if (!$src1", "if ($src1")#!if(isPredNew, ".new) ",
- ") ")#"$dst = "#mnemonic#"(#$absaddr)">, AddrModeRel {
- bits<5> dst;
- bits<2> src1;
- bits<6> absaddr;
-
- let isPredicatedNew = isPredNew;
- let isPredicatedFalse = isPredNot;
- let hasNewValue = !if (!eq(!cast<string>(RC), "DoubleRegs"), 0, 1);
-
- let IClass = 0b1001;
-
- let Inst{27-24} = 0b1111;
- let Inst{23-21} = MajOp;
- let Inst{20-16} = absaddr{5-1};
- let Inst{13} = 0b1;
- let Inst{12} = isPredNew;
- let Inst{11} = isPredNot;
- let Inst{10-9} = src1;
- let Inst{8} = absaddr{0};
- let Inst{7} = 0b1;
- let Inst{4-0} = dst;
- }
-
-//===----------------------------------------------------------------------===//
-// Multiclass for the load instructions with absolute addressing mode.
-//===----------------------------------------------------------------------===//
-multiclass LD_Abs_Pred<string mnemonic, RegisterClass RC, bits<3> MajOp,
- bit PredNot> {
- def _abs : T_LoadAbs_Pred <mnemonic, RC, MajOp, PredNot, 0>;
- // Predicate new
- def new_abs : T_LoadAbs_Pred <mnemonic, RC, MajOp, PredNot, 1>;
-}
-
-let addrMode = Absolute, isExtended = 1 in
-multiclass LD_Abs<string mnemonic, string CextOp, RegisterClass RC,
- Operand ImmOp, bits<3> MajOp> {
- let CextOpcode = CextOp, BaseOpcode = CextOp#_abs in {
- let opExtendable = 1, isPredicable = 1 in
- def PS_#NAME#abs: T_LoadAbs <mnemonic, RC, ImmOp, MajOp>;
-
- // Predicated
- defm L4_p#NAME#t : LD_Abs_Pred<mnemonic, RC, MajOp, 0>;
- defm L4_p#NAME#f : LD_Abs_Pred<mnemonic, RC, MajOp, 1>;
- }
-}
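-
-// Expansion sketch (illustrative): "defm loadrb : LD_Abs<...>" below produces
-// the predicable PS_loadrbabs plus the predicated variants L4_ploadrbt_abs,
-// L4_ploadrbtnew_abs, L4_ploadrbf_abs and L4_ploadrbfnew_abs.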
-
-let accessSize = ByteAccess, hasNewValue = 1 in {
- defm loadrb : LD_Abs<"memb", "LDrib", IntRegs, u16_0Imm, 0b000>;
- defm loadrub : LD_Abs<"memub", "LDriub", IntRegs, u16_0Imm, 0b001>;
-}
-
-let accessSize = HalfWordAccess, hasNewValue = 1 in {
- defm loadrh : LD_Abs<"memh", "LDrih", IntRegs, u16_1Imm, 0b010>;
- defm loadruh : LD_Abs<"memuh", "LDriuh", IntRegs, u16_1Imm, 0b011>;
-}
-
-let accessSize = WordAccess, hasNewValue = 1 in
-defm loadri : LD_Abs<"memw", "LDriw", IntRegs, u16_2Imm, 0b100>;
-
-let accessSize = DoubleWordAccess in
-defm loadrd : LD_Abs<"memd", "LDrid", DoubleRegs, u16_3Imm, 0b110>;
-
-//===----------------------------------------------------------------------===//
-// Multiclass for load instructions with GP-relative addressing mode.
-//   Rx = mem[bhwd](##global)
-// When predicated, these instructions map to the absolute addressing mode:
-//   if ([!]Pv[.new]) Rx = mem[bhwd](##global)
-//===----------------------------------------------------------------------===//
-
-let isAsmParserOnly = 1, Uses = [GP] in
-class T_LoadGP <string mnemonic, string BaseOp, RegisterClass RC, Operand ImmOp,
- bits<3> MajOp>
- : T_LoadAbsGP <mnemonic, RC, ImmOp, MajOp>, PredNewRel {
- let BaseOpcode = BaseOp#_abs;
- }
-
-let accessSize = ByteAccess, hasNewValue = 1 in {
- def L2_loadrbgp : T_LoadGP<"memb", "LDrib", IntRegs, u16_0Imm, 0b000>;
- def L2_loadrubgp : T_LoadGP<"memub", "LDriub", IntRegs, u16_0Imm, 0b001>;
-}
-
-let accessSize = HalfWordAccess, hasNewValue = 1 in {
- def L2_loadrhgp : T_LoadGP<"memh", "LDrih", IntRegs, u16_1Imm, 0b010>;
- def L2_loadruhgp : T_LoadGP<"memuh", "LDriuh", IntRegs, u16_1Imm, 0b011>;
-}
-
-let accessSize = WordAccess, hasNewValue = 1 in
-def L2_loadrigp : T_LoadGP<"memw", "LDriw", IntRegs, u16_2Imm, 0b100>;
-
-let accessSize = DoubleWordAccess in
-def L2_loadrdgp : T_LoadGP<"memd", "LDrid", DoubleRegs, u16_3Imm, 0b110>;
-
-//===----------------------------------------------------------------------===//
-// :raw form of boundscheck:hi:lo insns
-//===----------------------------------------------------------------------===//
-
-// A4_boundscheck_lo: Check whether the low word of Rss lies within the bounds in Rtt.
-let hasSideEffects = 0 in
-def A4_boundscheck_lo: ALU64Inst <
- (outs PredRegs:$Pd),
- (ins DoubleRegs:$Rss, DoubleRegs:$Rtt),
- "$Pd = boundscheck($Rss, $Rtt):raw:lo"> {
- bits<2> Pd;
- bits<5> Rss;
- bits<5> Rtt;
-
- let IClass = 0b1101;
-
- let Inst{27-23} = 0b00100;
- let Inst{13} = 0b1;
- let Inst{7-5} = 0b100;
- let Inst{1-0} = Pd;
- let Inst{20-16} = Rss;
- let Inst{12-8} = Rtt;
- }
-
-// A4_boundscheck_hi: Check whether the high word of Rss lies within the bounds in Rtt.
-let hasSideEffects = 0 in
-def A4_boundscheck_hi: ALU64Inst <
- (outs PredRegs:$Pd),
- (ins DoubleRegs:$Rss, DoubleRegs:$Rtt),
- "$Pd = boundscheck($Rss, $Rtt):raw:hi"> {
- bits<2> Pd;
- bits<5> Rss;
- bits<5> Rtt;
-
- let IClass = 0b1101;
-
- let Inst{27-23} = 0b00100;
- let Inst{13} = 0b1;
- let Inst{7-5} = 0b101;
- let Inst{1-0} = Pd;
- let Inst{20-16} = Rss;
- let Inst{12-8} = Rtt;
- }
-
-let hasSideEffects = 0, isAsmParserOnly = 1 in
-def A4_boundscheck : MInst <
- (outs PredRegs:$Pd), (ins IntRegs:$Rs, DoubleRegs:$Rtt),
- "$Pd=boundscheck($Rs,$Rtt)">;
-
-// A4_tlbmatch: Detect if a VA/ASID matches a TLB entry.
-let isPredicateLate = 1, hasSideEffects = 0 in
-def A4_tlbmatch : ALU64Inst<(outs PredRegs:$Pd),
- (ins DoubleRegs:$Rs, IntRegs:$Rt),
- "$Pd = tlbmatch($Rs, $Rt)",
- [], "", ALU64_tc_2early_SLOT23> {
- bits<2> Pd;
- bits<5> Rs;
- bits<5> Rt;
-
- let IClass = 0b1101;
- let Inst{27-23} = 0b00100;
- let Inst{20-16} = Rs;
- let Inst{13} = 0b1;
- let Inst{12-8} = Rt;
- let Inst{7-5} = 0b011;
- let Inst{1-0} = Pd;
- }
-
-// Use LD0Inst for dcfetch, but set "mayLoad" to 0 because this doesn't
-// really do a load.
-let hasSideEffects = 1, mayLoad = 0 in
-def Y2_dcfetchbo : LD0Inst<(outs), (ins IntRegs:$Rs, u11_3Imm:$u11_3),
- "dcfetch($Rs + #$u11_3)",
- [], "", LD_tc_ld_SLOT0> {
- bits<5> Rs;
- bits<14> u11_3;
-
- let IClass = 0b1001;
- let Inst{27-21} = 0b0100000;
- let Inst{20-16} = Rs;
- let Inst{13} = 0b0;
- let Inst{10-0} = u11_3{13-3};
-}
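-
-// Note (illustrative): u11_3 is kept as bits<14> but only u11_3{13-3} is
-// encoded, so the prefetch offset is implicitly 8-byte aligned; the effective
-// address is Rs + #$u11_3.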
-
-
-//===----------------------------------------------------------------------===//
-// Compound instructions
-//===----------------------------------------------------------------------===//
-
-let isBranch = 1, hasSideEffects = 0, isExtentSigned = 1,
- isPredicated = 1, isPredicatedNew = 1, isExtendable = 1,
- opExtentBits = 11, opExtentAlign = 2, opExtendable = 1,
- isTerminator = 1 in
-class CJInst_tstbit_R0<string px, bit np, string tnt>
- : InstHexagon<(outs), (ins IntRegs:$Rs, brtarget:$r9_2),
- ""#px#" = tstbit($Rs, #0); if ("
- #!if(np, "!","")#""#px#".new) jump:"#tnt#" $r9_2",
- [], "", COMPOUND_CJ_ARCHDEPSLOT, TypeCOMPOUND>, OpcodeHexagon {
- bits<4> Rs;
- bits<11> r9_2;
-
- // np: !p[01]
- let isPredicatedFalse = np;
- // tnt: Taken/Not Taken
- let isBrTaken = !if (!eq(tnt, "t"), "true", "false");
- let isTaken = !if (!eq(tnt, "t"), 1, 0);
-
- let IClass = 0b0001;
- let Inst{27-26} = 0b00;
- let Inst{25} = !if (!eq(px, "!p1"), 1,
- !if (!eq(px, "p1"), 1, 0));
- let Inst{24-23} = 0b11;
- let Inst{22} = np;
- let Inst{21-20} = r9_2{10-9};
- let Inst{19-16} = Rs;
- let Inst{13} = !if (!eq(tnt, "t"), 1, 0);
- let Inst{9-8} = 0b11;
- let Inst{7-1} = r9_2{8-2};
-}
-
-let Defs = [PC, P0], Uses = [P0] in {
- def J4_tstbit0_tp0_jump_nt : CJInst_tstbit_R0<"p0", 0, "nt">;
- def J4_tstbit0_tp0_jump_t : CJInst_tstbit_R0<"p0", 0, "t">;
- def J4_tstbit0_fp0_jump_nt : CJInst_tstbit_R0<"p0", 1, "nt">;
- def J4_tstbit0_fp0_jump_t : CJInst_tstbit_R0<"p0", 1, "t">;
-}
-
-let Defs = [PC, P1], Uses = [P1] in {
- def J4_tstbit0_tp1_jump_nt : CJInst_tstbit_R0<"p1", 0, "nt">;
- def J4_tstbit0_tp1_jump_t : CJInst_tstbit_R0<"p1", 0, "t">;
- def J4_tstbit0_fp1_jump_nt : CJInst_tstbit_R0<"p1", 1, "nt">;
- def J4_tstbit0_fp1_jump_t : CJInst_tstbit_R0<"p1", 1, "t">;
-}
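-
-// Illustrative syntax (hypothetical register): J4_tstbit0_fp0_jump_t prints
-// as "p0 = tstbit(R7, #0); if (!p0.new) jump:t <target>".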
-
-
-let isBranch = 1, hasSideEffects = 0,
- isExtentSigned = 1, isPredicated = 1, isPredicatedNew = 1,
- isExtendable = 1, opExtentBits = 11, opExtentAlign = 2,
- opExtendable = 2, isTerminator = 1 in
-class CJInst_RR<string px, string op, bit np, string tnt>
- : InstHexagon<(outs), (ins IntRegs:$Rs, IntRegs:$Rt, brtarget:$r9_2),
- ""#px#" = cmp."#op#"($Rs, $Rt); if ("
- #!if(np, "!","")#""#px#".new) jump:"#tnt#" $r9_2",
- [], "", COMPOUND_CJ_ARCHDEPSLOT, TypeCOMPOUND>, OpcodeHexagon {
- bits<4> Rs;
- bits<4> Rt;
- bits<11> r9_2;
-
- // np: !p[01]
- let isPredicatedFalse = np;
- // tnt: Taken/Not Taken
- let isBrTaken = !if (!eq(tnt, "t"), "true", "false");
- let isTaken = !if (!eq(tnt, "t"), 1, 0);
-
- let IClass = 0b0001;
- let Inst{27-23} = !if (!eq(op, "eq"), 0b01000,
- !if (!eq(op, "gt"), 0b01001,
- !if (!eq(op, "gtu"), 0b01010, 0)));
- let Inst{22} = np;
- let Inst{21-20} = r9_2{10-9};
- let Inst{19-16} = Rs;
- let Inst{13} = !if (!eq(tnt, "t"), 1, 0);
- // px: Predicate reg 0/1
- let Inst{12} = !if (!eq(px, "!p1"), 1,
- !if (!eq(px, "p1"), 1, 0));
- let Inst{11-8} = Rt;
- let Inst{7-1} = r9_2{8-2};
-}
-
-// P[10] taken/not taken.
-multiclass T_tnt_CJInst_RR<string op, bit np> {
- let Defs = [PC, P0], Uses = [P0] in {
- def NAME#p0_jump_nt : CJInst_RR<"p0", op, np, "nt">;
- def NAME#p0_jump_t : CJInst_RR<"p0", op, np, "t">;
- }
- let Defs = [PC, P1], Uses = [P1] in {
- def NAME#p1_jump_nt : CJInst_RR<"p1", op, np, "nt">;
- def NAME#p1_jump_t : CJInst_RR<"p1", op, np, "t">;
- }
-}
-// Predicate / !Predicate
-multiclass T_pnp_CJInst_RR<string op>{
- defm J4_cmp#NAME#_t : T_tnt_CJInst_RR<op, 0>;
- defm J4_cmp#NAME#_f : T_tnt_CJInst_RR<op, 1>;
-}
-// TypeCJ instructions: compare register-register and jump.
-defm eq : T_pnp_CJInst_RR<"eq">;
-defm gt : T_pnp_CJInst_RR<"gt">;
-defm gtu : T_pnp_CJInst_RR<"gtu">;
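-
-// Expansion sketch (illustrative): "defm eq : T_pnp_CJInst_RR<"eq">" yields
-// J4_cmpeq_tp0_jump_nt, J4_cmpeq_tp0_jump_t, J4_cmpeq_tp1_jump_nt and
-// J4_cmpeq_tp1_jump_t, plus the matching _f (negated-predicate) forms.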
-
-let isBranch = 1, hasSideEffects = 0, isExtentSigned = 1,
- isPredicated = 1, isPredicatedNew = 1, isExtendable = 1, opExtentBits = 11,
- opExtentAlign = 2, opExtendable = 2, isTerminator = 1 in
-class CJInst_RU5<string px, string op, bit np, string tnt>
- : InstHexagon<(outs), (ins IntRegs:$Rs, u5_0Imm:$U5, brtarget:$r9_2),
- ""#px#" = cmp."#op#"($Rs, #$U5); if ("
- #!if(np, "!","")#""#px#".new) jump:"#tnt#" $r9_2",
- [], "", COMPOUND_CJ_ARCHDEPSLOT, TypeCOMPOUND>, OpcodeHexagon {
- bits<4> Rs;
- bits<5> U5;
- bits<11> r9_2;
-
- // np: !p[01]
- let isPredicatedFalse = np;
- // tnt: Taken/Not Taken
- let isBrTaken = !if (!eq(tnt, "t"), "true", "false");
- let isTaken = !if (!eq(tnt, "t"), 1, 0);
-
- let IClass = 0b0001;
- let Inst{27-26} = 0b00;
- // px: Predicate reg 0/1
- let Inst{25} = !if (!eq(px, "!p1"), 1,
- !if (!eq(px, "p1"), 1, 0));
- let Inst{24-23} = !if (!eq(op, "eq"), 0b00,
- !if (!eq(op, "gt"), 0b01,
- !if (!eq(op, "gtu"), 0b10, 0)));
- let Inst{22} = np;
- let Inst{21-20} = r9_2{10-9};
- let Inst{19-16} = Rs;
- let Inst{13} = !if (!eq(tnt, "t"), 1, 0);
- let Inst{12-8} = U5;
- let Inst{7-1} = r9_2{8-2};
-}
-// P[10] taken/not taken.
-multiclass T_tnt_CJInst_RU5<string op, bit np> {
- let Defs = [PC, P0], Uses = [P0] in {
- def NAME#p0_jump_nt : CJInst_RU5<"p0", op, np, "nt">;
- def NAME#p0_jump_t : CJInst_RU5<"p0", op, np, "t">;
- }
- let Defs = [PC, P1], Uses = [P1] in {
- def NAME#p1_jump_nt : CJInst_RU5<"p1", op, np, "nt">;
- def NAME#p1_jump_t : CJInst_RU5<"p1", op, np, "t">;
- }
-}
-// Predicate / !Predicate
-multiclass T_pnp_CJInst_RU5<string op>{
- defm J4_cmp#NAME#i_t : T_tnt_CJInst_RU5<op, 0>;
- defm J4_cmp#NAME#i_f : T_tnt_CJInst_RU5<op, 1>;
-}
-// TypeCJ instructions: compare register-immediate and jump.
-defm eq : T_pnp_CJInst_RU5<"eq">;
-defm gt : T_pnp_CJInst_RU5<"gt">;
-defm gtu : T_pnp_CJInst_RU5<"gtu">;
-
-let isBranch = 1, hasSideEffects = 0, isExtentSigned = 1,
- isPredicated = 1, isPredicatedFalse = 1, isPredicatedNew = 1,
- isExtendable = 1, opExtentBits = 11, opExtentAlign = 2, opExtendable = 2,
- isTerminator = 1 in
-class CJInst_Rn1<string px, string op, bit np, string tnt>
- : InstHexagon<(outs), (ins IntRegs:$Rs, n1Const:$n1, brtarget:$r9_2),
- ""#px#" = cmp."#op#"($Rs,#$n1); if ("
- #!if(np, "!","")#""#px#".new) jump:"#tnt#" $r9_2",
- [], "", COMPOUND_CJ_ARCHDEPSLOT, TypeCOMPOUND>, OpcodeHexagon {
- bits<4> Rs;
- bits<11> r9_2;
-
- // np: !p[01]
- let isPredicatedFalse = np;
- // tnt: Taken/Not Taken
- let isBrTaken = !if (!eq(tnt, "t"), "true", "false");
- let isTaken = !if (!eq(tnt, "t"), 1, 0);
-
- let IClass = 0b0001;
- let Inst{27-26} = 0b00;
- let Inst{25} = !if (!eq(px, "!p1"), 1,
- !if (!eq(px, "p1"), 1, 0));
-
- let Inst{24-23} = 0b11;
- let Inst{22} = np;
- let Inst{21-20} = r9_2{10-9};
- let Inst{19-16} = Rs;
- let Inst{13} = !if (!eq(tnt, "t"), 1, 0);
- let Inst{9-8} = !if (!eq(op, "eq"), 0b00,
- !if (!eq(op, "gt"), 0b01, 0));
- let Inst{7-1} = r9_2{8-2};
-}
-
-// P[10] taken/not taken.
-multiclass T_tnt_CJInst_Rn1<string op, bit np> {
- let Defs = [PC, P0], Uses = [P0] in {
- def NAME#p0_jump_nt : CJInst_Rn1<"p0", op, np, "nt">;
- def NAME#p0_jump_t : CJInst_Rn1<"p0", op, np, "t">;
- }
- let Defs = [PC, P1], Uses = [P1] in {
- def NAME#p1_jump_nt : CJInst_Rn1<"p1", op, np, "nt">;
- def NAME#p1_jump_t : CJInst_Rn1<"p1", op, np, "t">;
- }
-}
-// Predicate / !Predicate
-multiclass T_pnp_CJInst_Rn1<string op>{
- defm J4_cmp#NAME#n1_t : T_tnt_CJInst_Rn1<op, 0>;
- defm J4_cmp#NAME#n1_f : T_tnt_CJInst_Rn1<op, 1>;
-}
-// TypeCJ instructions: compare against -1 and jump.
-defm eq : T_pnp_CJInst_Rn1<"eq">;
-defm gt : T_pnp_CJInst_Rn1<"gt">;
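-
-// Illustrative syntax: these compare against the constant -1, e.g.
-// "p0 = cmp.eq(Rs,#-1); if (p0.new) jump:nt <target>".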
-
-// J4_jumpseti: Direct unconditional jump and set register to immediate.
-let Defs = [PC], isBranch = 1, hasSideEffects = 0, hasNewValue = 1,
- isExtentSigned = 1, opNewValue = 0, isExtendable = 1, opExtentBits = 11,
- opExtentAlign = 2, opExtendable = 2 in
-def J4_jumpseti: CJInst_JMPSET <
- (outs IntRegs:$Rd),
- (ins u6_0Imm:$U6, brtarget:$r9_2),
- "$Rd = #$U6 ; jump $r9_2"> {
- bits<4> Rd;
- bits<6> U6;
- bits<11> r9_2;
-
- let IClass = 0b0001;
- let Inst{27-24} = 0b0110;
- let Inst{21-20} = r9_2{10-9};
- let Inst{19-16} = Rd;
- let Inst{13-8} = U6;
- let Inst{7-1} = r9_2{8-2};
- }
-
-// J4_jumpsetr: Direct unconditional jump and transfer register.
-let Defs = [PC], isBranch = 1, hasSideEffects = 0, hasNewValue = 1,
- isExtentSigned = 1, opNewValue = 0, isExtendable = 1, opExtentBits = 11,
- opExtentAlign = 2, opExtendable = 2 in
-def J4_jumpsetr: CJInst_JMPSET <
- (outs IntRegs:$Rd),
- (ins IntRegs:$Rs, brtarget:$r9_2),
- "$Rd = $Rs ; jump $r9_2"> {
- bits<4> Rd;
- bits<4> Rs;
- bits<11> r9_2;
-
- let IClass = 0b0001;
- let Inst{27-24} = 0b0111;
- let Inst{21-20} = r9_2{10-9};
- let Inst{11-8} = Rd;
- let Inst{19-16} = Rs;
- let Inst{7-1} = r9_2{8-2};
- }
-
-//===----------------------------------------------------------------------===//
-// Duplex instructions
-//===----------------------------------------------------------------------===//
-include "HexagonIsetDx.td"
diff --git a/lib/Target/Hexagon/HexagonInstrInfoV5.td b/lib/Target/Hexagon/HexagonInstrInfoV5.td
deleted file mode 100644
index cd19b6916f21..000000000000
--- a/lib/Target/Hexagon/HexagonInstrInfoV5.td
+++ /dev/null
@@ -1,497 +0,0 @@
-//=- HexagonInstrInfoV5.td - Target Desc. for Hexagon Target -*- tablegen -*-=//
-//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
-//
-//===----------------------------------------------------------------------===//
-//
-// This file describes the Hexagon V5 instructions in TableGen format.
-//
-//===----------------------------------------------------------------------===//
-
-//===----------------------------------------------------------------------===//
-// XTYPE/MPY
-//===----------------------------------------------------------------------===//
-
- // Rdd[+] = vrmpybsu(Rss, Rtt)
-let Predicates = [HasV5T] in {
- def M5_vrmpybsu: T_XTYPE_Vect<"vrmpybsu", 0b110, 0b001, 0>;
- def M5_vrmacbsu: T_XTYPE_Vect_acc<"vrmpybsu", 0b110, 0b001, 0>;
-
- // Rdd[+] = vrmpybu(Rss, Rtt)
- def M5_vrmpybuu: T_XTYPE_Vect<"vrmpybu", 0b100, 0b001, 0>;
- def M5_vrmacbuu: T_XTYPE_Vect_acc<"vrmpybu", 0b100, 0b001, 0>;
-
- def M5_vdmpybsu: T_M2_vmpy<"vdmpybsu", 0b101, 0b001, 0, 0, 1>;
- def M5_vdmacbsu: T_M2_vmpy_acc_sat <"vdmpybsu", 0b001, 0b001, 0, 0>;
-}
-
-// Vector multiply bytes
-// Rdd=vmpyb[s]u(Rs,Rt)
-let Predicates = [HasV5T] in {
- def M5_vmpybsu: T_XTYPE_mpy64 <"vmpybsu", 0b010, 0b001, 0, 0, 0>;
- def M5_vmpybuu: T_XTYPE_mpy64 <"vmpybu", 0b100, 0b001, 0, 0, 0>;
-
- // Rxx+=vmpyb[s]u(Rs,Rt)
- def M5_vmacbsu: T_XTYPE_mpy64_acc <"vmpybsu", "+", 0b110, 0b001, 0, 0, 0>;
- def M5_vmacbuu: T_XTYPE_mpy64_acc <"vmpybu", "+", 0b100, 0b001, 0, 0, 0>;
-
- // Rd=vaddhub(Rss,Rtt):sat
- let hasNewValue = 1, opNewValue = 0 in
- def A5_vaddhubs: T_S3op_1 <"vaddhub", IntRegs, 0b01, 0b001, 0, 1>;
-}
-
-def S2_asr_i_p_rnd : S_2OpInstImm<"asr", 0b110, 0b111, u6_0Imm, [], 1>,
- Requires<[HasV5T]> {
- bits<6> src2;
- let Inst{13-8} = src2;
-}
-
-let isAsmParserOnly = 1 in
-def S2_asr_i_p_rnd_goodsyntax
- : MInst<(outs DoubleRegs:$dst), (ins DoubleRegs:$src1, u6_0Imm:$src2),
- "$dst = asrrnd($src1, #$src2)">;
-
-def C4_fastcorner9 : T_LOGICAL_2OP<"fastcorner9", 0b000, 0, 0>,
- Requires<[HasV5T]> {
- let Inst{13,7,4} = 0b111;
-}
-
-def C4_fastcorner9_not : T_LOGICAL_2OP<"!fastcorner9", 0b000, 0, 0>,
- Requires<[HasV5T]> {
- let Inst{20,13,7,4} = 0b1111;
-}
-
-let hasNewValue = 1, validSubTargets = HasV5SubT in
-def S5_popcountp : ALU64_rr<(outs IntRegs:$Rd), (ins DoubleRegs:$Rss),
- "$Rd = popcount($Rss)", [], "", S_2op_tc_2_SLOT23>,
- Requires<[HasV5T]> {
- bits<5> Rd;
- bits<5> Rss;
-
- let IClass = 0b1000;
-
- let Inst{27-21} = 0b1000011;
- let Inst{7-5} = 0b011;
- let Inst{4-0} = Rd;
- let Inst{20-16} = Rss;
- }
-
-let isFP = 1, hasNewValue = 1, opNewValue = 0 in
-class T_MInstFloat <string mnemonic, bits<3> MajOp, bits<3> MinOp>
- : MInst<(outs IntRegs:$Rd),
- (ins IntRegs:$Rs, IntRegs:$Rt),
- "$Rd = "#mnemonic#"($Rs, $Rt)", [],
- "" , M_tc_3or4x_SLOT23 > ,
- Requires<[HasV5T]> {
- bits<5> Rd;
- bits<5> Rs;
- bits<5> Rt;
-
- let IClass = 0b1110;
-
- let Inst{27-24} = 0b1011;
- let Inst{23-21} = MajOp;
- let Inst{20-16} = Rs;
- let Inst{13} = 0b0;
- let Inst{12-8} = Rt;
- let Inst{7-5} = MinOp;
- let Inst{4-0} = Rd;
- }
-
-let isCommutable = 1 in {
- def F2_sfadd : T_MInstFloat < "sfadd", 0b000, 0b000>;
- def F2_sfmpy : T_MInstFloat < "sfmpy", 0b010, 0b000>;
-}
-
-def F2_sfsub : T_MInstFloat < "sfsub", 0b000, 0b001>;
-
-let Itinerary = M_tc_3x_SLOT23 in {
- def F2_sfmax : T_MInstFloat < "sfmax", 0b100, 0b000>;
- def F2_sfmin : T_MInstFloat < "sfmin", 0b100, 0b001>;
-}
-
-let Itinerary = M_tc_3or4x_SLOT23 in {
-def F2_sffixupn : T_MInstFloat < "sffixupn", 0b110, 0b000>;
-def F2_sffixupd : T_MInstFloat < "sffixupd", 0b110, 0b001>;
-}
-
-// F2_sfrecipa: Reciprocal approximation for division.
-let Uses = [USR], isPredicateLate = 1, isFP = 1,
- hasSideEffects = 0, hasNewValue = 1, Itinerary = M_tc_3or4x_SLOT23 in
-def F2_sfrecipa: MInst <
- (outs IntRegs:$Rd, PredRegs:$Pe),
- (ins IntRegs:$Rs, IntRegs:$Rt),
- "$Rd, $Pe = sfrecipa($Rs, $Rt)">,
- Requires<[HasV5T]> {
- bits<5> Rd;
- bits<2> Pe;
- bits<5> Rs;
- bits<5> Rt;
-
- let IClass = 0b1110;
- let Inst{27-21} = 0b1011111;
- let Inst{20-16} = Rs;
- let Inst{13} = 0b0;
- let Inst{12-8} = Rt;
- let Inst{7} = 0b1;
- let Inst{6-5} = Pe;
- let Inst{4-0} = Rd;
- }
-
-// Floating-point compare template, used by the F2_dfcmp*/F2_sfcmp* defs below.
-let Uses = [USR], isCompare = 1, isFP = 1 in
-class T_fcmp <string mnemonic, RegisterClass RC, bits<3> MinOp,
- list<dag> pattern = [] >
- : ALU64Inst <(outs PredRegs:$dst), (ins RC:$src1, RC:$src2),
- "$dst = "#mnemonic#"($src1, $src2)", pattern,
- "" , ALU64_tc_2early_SLOT23 > ,
- Requires<[HasV5T]> {
- bits<2> dst;
- bits<5> src1;
- bits<5> src2;
-
- let IClass = 0b1101;
-
- let Inst{27-21} = 0b0010111;
- let Inst{20-16} = src1;
- let Inst{12-8} = src2;
- let Inst{7-5} = MinOp;
- let Inst{1-0} = dst;
- }
-
-class T_fcmp64 <string mnemonic, PatFrag OpNode, bits<3> MinOp>
- : T_fcmp <mnemonic, DoubleRegs, MinOp, []> {
- let IClass = 0b1101;
- let Inst{27-21} = 0b0010111;
-}
-
-class T_fcmp32 <string mnemonic, PatFrag OpNode, bits<3> MinOp>
- : T_fcmp <mnemonic, IntRegs, MinOp, []> {
- let IClass = 0b1100;
- let Inst{27-21} = 0b0111111;
-}
-
-def F2_dfcmpeq : T_fcmp64<"dfcmp.eq", setoeq, 0b000>;
-def F2_dfcmpgt : T_fcmp64<"dfcmp.gt", setogt, 0b001>;
-def F2_dfcmpge : T_fcmp64<"dfcmp.ge", setoge, 0b010>;
-def F2_dfcmpuo : T_fcmp64<"dfcmp.uo", setuo, 0b011>;
-
-def F2_sfcmpge : T_fcmp32<"sfcmp.ge", setoge, 0b000>;
-def F2_sfcmpuo : T_fcmp32<"sfcmp.uo", setuo, 0b001>;
-def F2_sfcmpeq : T_fcmp32<"sfcmp.eq", setoeq, 0b011>;
-def F2_sfcmpgt : T_fcmp32<"sfcmp.gt", setogt, 0b100>;
-
-// F2 convert template classes:
-let Uses = [USR], isFP = 1 in
-class F2_RDD_RSS_CONVERT<string mnemonic, bits<3> MinOp,
- string chop ="">
- : SInst <(outs DoubleRegs:$Rdd), (ins DoubleRegs:$Rss),
- "$Rdd = "#mnemonic#"($Rss)"#chop, [], "",
- S_2op_tc_3or4x_SLOT23> {
- bits<5> Rdd;
- bits<5> Rss;
-
- let IClass = 0b1000;
-
- let Inst{27-21} = 0b0000111;
- let Inst{20-16} = Rss;
- let Inst{7-5} = MinOp;
- let Inst{4-0} = Rdd;
- }
-
-let Uses = [USR], isFP = 1 in
-class F2_RDD_RS_CONVERT<string mnemonic, bits<3> MinOp,
- string chop ="">
- : SInst <(outs DoubleRegs:$Rdd), (ins IntRegs:$Rs),
- "$Rdd = "#mnemonic#"($Rs)"#chop, [], "",
- S_2op_tc_3or4x_SLOT23> {
- bits<5> Rdd;
- bits<5> Rs;
-
- let IClass = 0b1000;
-
- let Inst{27-21} = 0b0100100;
- let Inst{20-16} = Rs;
- let Inst{7-5} = MinOp;
- let Inst{4-0} = Rdd;
- }
-
-let Uses = [USR], isFP = 1, hasNewValue = 1 in
-class F2_RD_RSS_CONVERT<string mnemonic, bits<3> MinOp,
- string chop ="">
- : SInst <(outs IntRegs:$Rd), (ins DoubleRegs:$Rss),
- "$Rd = "#mnemonic#"($Rss)"#chop, [], "",
- S_2op_tc_3or4x_SLOT23> {
- bits<5> Rd;
- bits<5> Rss;
-
- let IClass = 0b1000;
-
- let Inst{27-24} = 0b1000;
- let Inst{23-21} = MinOp;
- let Inst{20-16} = Rss;
- let Inst{7-5} = 0b001;
- let Inst{4-0} = Rd;
- }
-
-let Uses = [USR], isFP = 1, hasNewValue = 1 in
-class F2_RD_RS_CONVERT<string mnemonic, bits<3> MajOp, bits<3> MinOp,
- string chop ="">
- : SInst <(outs IntRegs:$Rd), (ins IntRegs:$Rs),
- "$Rd = "#mnemonic#"($Rs)"#chop, [], "",
- S_2op_tc_3or4x_SLOT23> {
- bits<5> Rd;
- bits<5> Rs;
-
- let IClass = 0b1000;
-
- let Inst{27-24} = 0b1011;
- let Inst{23-21} = MajOp;
- let Inst{20-16} = Rs;
- let Inst{7-5} = MinOp;
- let Inst{4-0} = Rd;
- }
-
-// Convert single precision to double precision and vice-versa.
-def F2_conv_sf2df : F2_RDD_RS_CONVERT <"convert_sf2df", 0b000>;
-def F2_conv_df2sf : F2_RD_RSS_CONVERT <"convert_df2sf", 0b000>;
-
-// Convert Integer to Floating Point.
-def F2_conv_d2sf : F2_RD_RSS_CONVERT <"convert_d2sf", 0b010>;
-def F2_conv_ud2sf : F2_RD_RSS_CONVERT <"convert_ud2sf", 0b001>;
-def F2_conv_uw2sf : F2_RD_RS_CONVERT <"convert_uw2sf", 0b001, 0b000>;
-def F2_conv_w2sf : F2_RD_RS_CONVERT <"convert_w2sf", 0b010, 0b000>;
-def F2_conv_d2df : F2_RDD_RSS_CONVERT <"convert_d2df", 0b011>;
-def F2_conv_ud2df : F2_RDD_RSS_CONVERT <"convert_ud2df", 0b010>;
-def F2_conv_uw2df : F2_RDD_RS_CONVERT <"convert_uw2df", 0b001>;
-def F2_conv_w2df : F2_RDD_RS_CONVERT <"convert_w2df", 0b010>;
-
-// Convert Floating Point to Integer.
-def F2_conv_df2uw_chop : F2_RD_RSS_CONVERT <"convert_df2uw", 0b101, ":chop">;
-def F2_conv_df2w_chop : F2_RD_RSS_CONVERT <"convert_df2w", 0b111, ":chop">;
-def F2_conv_sf2uw_chop : F2_RD_RS_CONVERT <"convert_sf2uw", 0b011, 0b001,
- ":chop">;
-def F2_conv_sf2w_chop : F2_RD_RS_CONVERT <"convert_sf2w", 0b100, 0b001,
- ":chop">;
-def F2_conv_df2d_chop : F2_RDD_RSS_CONVERT <"convert_df2d", 0b110, ":chop">;
-def F2_conv_df2ud_chop : F2_RDD_RSS_CONVERT <"convert_df2ud", 0b111, ":chop">;
-def F2_conv_sf2d_chop : F2_RDD_RS_CONVERT <"convert_sf2d", 0b110, ":chop">;
-def F2_conv_sf2ud_chop : F2_RDD_RS_CONVERT <"convert_sf2ud", 0b101, ":chop">;
-
-// Convert Floating Point to Integer: non-chopped.
-let AddedComplexity = 20, Predicates = [HasV5T] in {
- def F2_conv_df2d : F2_RDD_RSS_CONVERT <"convert_df2d", 0b000>;
- def F2_conv_df2ud : F2_RDD_RSS_CONVERT <"convert_df2ud", 0b001>;
- def F2_conv_sf2ud : F2_RDD_RS_CONVERT <"convert_sf2ud", 0b011>;
- def F2_conv_sf2d : F2_RDD_RS_CONVERT <"convert_sf2d", 0b100>;
- def F2_conv_df2uw : F2_RD_RSS_CONVERT <"convert_df2uw", 0b011>;
- def F2_conv_df2w : F2_RD_RSS_CONVERT <"convert_df2w", 0b100>;
- def F2_conv_sf2uw : F2_RD_RS_CONVERT <"convert_sf2uw", 0b011, 0b000>;
- def F2_conv_sf2w : F2_RD_RS_CONVERT <"convert_sf2w", 0b100, 0b000>;
-}
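-
-// Note (assumption, following the usual Hexagon convention): the ":chop"
-// conversions truncate toward zero, while the unsuffixed forms round
-// according to the rounding mode held in USR.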
-
-// Fix up radicand.
-let Uses = [USR], isFP = 1, hasNewValue = 1 in
-def F2_sffixupr: SInst<(outs IntRegs:$Rd), (ins IntRegs:$Rs),
- "$Rd = sffixupr($Rs)",
- [], "" , S_2op_tc_3or4x_SLOT23>, Requires<[HasV5T]> {
- bits<5> Rd;
- bits<5> Rs;
-
- let IClass = 0b1000;
-
- let Inst{27-21} = 0b1011101;
- let Inst{20-16} = Rs;
- let Inst{7-5} = 0b000;
- let Inst{4-0} = Rd;
- }
-
-// Template for floating-point fused multiply add/subtract (F2_sffma etc.).
-let Uses = [USR], isFP = 1, hasNewValue = 1 in
-class T_sfmpy_acc <bit isSub, bit isLib>
- : MInst<(outs IntRegs:$Rx),
- (ins IntRegs:$dst2, IntRegs:$Rs, IntRegs:$Rt),
- "$Rx "#!if(isSub, "-=","+=")#" sfmpy($Rs, $Rt)"#!if(isLib, ":lib",""),
- [], "$dst2 = $Rx" , M_tc_3or4x_SLOT23 > ,
- Requires<[HasV5T]> {
- bits<5> Rx;
- bits<5> Rs;
- bits<5> Rt;
-
- let IClass = 0b1110;
-
- let Inst{27-21} = 0b1111000;
- let Inst{20-16} = Rs;
- let Inst{13} = 0b0;
- let Inst{12-8} = Rt;
- let Inst{7} = 0b1;
- let Inst{6} = isLib;
- let Inst{5} = isSub;
- let Inst{4-0} = Rx;
- }
-
-def F2_sffma: T_sfmpy_acc <0, 0>;
-def F2_sffms: T_sfmpy_acc <1, 0>;
-def F2_sffma_lib: T_sfmpy_acc <0, 1>;
-def F2_sffms_lib: T_sfmpy_acc <1, 1>;
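-
-// Illustrative mapping: F2_sffms prints as "Rx -= sfmpy(Rs, Rt)" and
-// F2_sffma_lib as "Rx += sfmpy(Rs, Rt):lib", per the isSub/isLib bits above.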
-
-// Floating-point fused multiply add w/ additional scaling (2**pu).
-let Uses = [USR], isFP = 1, hasNewValue = 1 in
-def F2_sffma_sc: MInst <
- (outs IntRegs:$Rx),
- (ins IntRegs:$dst2, IntRegs:$Rs, IntRegs:$Rt, PredRegs:$Pu),
- "$Rx += sfmpy($Rs, $Rt, $Pu):scale" ,
- [], "$dst2 = $Rx" , M_tc_3or4x_SLOT23 > ,
- Requires<[HasV5T]> {
- bits<5> Rx;
- bits<5> Rs;
- bits<5> Rt;
- bits<2> Pu;
-
- let IClass = 0b1110;
-
- let Inst{27-21} = 0b1111011;
- let Inst{20-16} = Rs;
- let Inst{13} = 0b0;
- let Inst{12-8} = Rt;
- let Inst{7} = 0b1;
- let Inst{6-5} = Pu;
- let Inst{4-0} = Rx;
- }
-
-//===----------------------------------------------------------------------===//
-// :raw forms of vasrh and vasrhub insns
-//===----------------------------------------------------------------------===//
-// S5_asrhub_rnd_sat: Vector arithmetic shift right by immediate with round,
-// saturate, and pack.
-let Defs = [USR_OVF], hasSideEffects = 0, hasNewValue = 1, opNewValue = 0 in
-class T_ASRHUB<bit isSat>
- : SInst <(outs IntRegs:$Rd),
- (ins DoubleRegs:$Rss, u4_0Imm:$u4),
- "$Rd = vasrhub($Rss, #$u4):"#!if(isSat, "sat", "raw"),
- [], "", S_2op_tc_2_SLOT23>,
- Requires<[HasV5T]> {
- bits<5> Rd;
- bits<5> Rss;
- bits<4> u4;
-
- let IClass = 0b1000;
-
- let Inst{27-21} = 0b1000011;
- let Inst{20-16} = Rss;
- let Inst{13-12} = 0b00;
- let Inst{11-8} = u4;
- let Inst{7-6} = 0b10;
- let Inst{5} = isSat;
- let Inst{4-0} = Rd;
- }
-
-def S5_asrhub_rnd_sat : T_ASRHUB <0>;
-def S5_asrhub_sat : T_ASRHUB <1>;
-
-let isAsmParserOnly = 1 in
-def S5_asrhub_rnd_sat_goodsyntax
- : SInst <(outs IntRegs:$Rd), (ins DoubleRegs:$Rss, u4_0Imm:$u4),
- "$Rd = vasrhub($Rss, #$u4):rnd:sat">, Requires<[HasV5T]>;
-
-// S5_vasrhrnd: Vector arithmetic shift right by immediate with round.
-let hasSideEffects = 0 in
-def S5_vasrhrnd : SInst <(outs DoubleRegs:$Rdd),
- (ins DoubleRegs:$Rss, u4_0Imm:$u4),
- "$Rdd = vasrh($Rss, #$u4):raw">,
- Requires<[HasV5T]> {
- bits<5> Rdd;
- bits<5> Rss;
- bits<4> u4;
-
- let IClass = 0b1000;
-
- let Inst{27-21} = 0b0000001;
- let Inst{20-16} = Rss;
- let Inst{13-12} = 0b00;
- let Inst{11-8} = u4;
- let Inst{7-5} = 0b000;
- let Inst{4-0} = Rdd;
- }
-
-let isAsmParserOnly = 1 in
-def S5_vasrhrnd_goodsyntax
- : SInst <(outs DoubleRegs:$Rdd), (ins DoubleRegs:$Rss, u4_0Imm:$u4),
- "$Rdd = vasrh($Rss,#$u4):rnd">, Requires<[HasV5T]>;
-
-// Floating point reciprocal square root approximation
-let Uses = [USR], isPredicateLate = 1, isFP = 1,
- hasSideEffects = 0, hasNewValue = 1, opNewValue = 0,
- validSubTargets = HasV5SubT in
-def F2_sfinvsqrta: SInst <
- (outs IntRegs:$Rd, PredRegs:$Pe),
- (ins IntRegs:$Rs),
- "$Rd, $Pe = sfinvsqrta($Rs)" > ,
- Requires<[HasV5T]> {
- bits<5> Rd;
- bits<2> Pe;
- bits<5> Rs;
-
- let IClass = 0b1000;
-
- let Inst{27-21} = 0b1011111;
- let Inst{20-16} = Rs;
- let Inst{7} = 0b0;
- let Inst{6-5} = Pe;
- let Inst{4-0} = Rd;
- }
-
-// Complex multiply 32x16
-let Defs = [USR_OVF], Itinerary = S_3op_tc_3x_SLOT23 in {
- def M4_cmpyi_whc : T_S3op_8<"cmpyiwh", 0b101, 1, 1, 1, 1>;
- def M4_cmpyr_whc : T_S3op_8<"cmpyrwh", 0b111, 1, 1, 1, 1>;
-}
-
-// Classify floating-point value
-let Uses = [USR], isFP = 1 in
-def F2_sfclass : T_TEST_BIT_IMM<"sfclass", 0b111>, Requires<[HasV5T]>;
-
-let Uses = [USR], isFP = 1 in
-def F2_dfclass: ALU64Inst<(outs PredRegs:$Pd), (ins DoubleRegs:$Rss, u5_0Imm:$u5),
- "$Pd = dfclass($Rss, #$u5)",
- [], "" , ALU64_tc_2early_SLOT23 > , Requires<[HasV5T]> {
- bits<2> Pd;
- bits<5> Rss;
- bits<5> u5;
-
- let IClass = 0b1101;
- let Inst{27-21} = 0b1100100;
- let Inst{20-16} = Rss;
- let Inst{12-10} = 0b000;
- let Inst{9-5} = u5;
- let Inst{4-3} = 0b10;
- let Inst{1-0} = Pd;
- }
-
-// Instructions to create floating-point constants.
-class T_fimm <string mnemonic, RegisterClass RC, bits<4> RegType, bit isNeg>
- : ALU64Inst<(outs RC:$dst), (ins u10_0Imm:$src),
- "$dst = "#mnemonic#"(#$src)"#!if(isNeg, ":neg", ":pos"),
- [], "", ALU64_tc_2_SLOT23>, Requires<[HasV5T]> {
- bits<5> dst;
- bits<10> src;
-
- let IClass = 0b1101;
- let Inst{27-24} = RegType;
- let Inst{23} = 0b0;
- let Inst{22} = isNeg;
- let Inst{21} = src{9};
- let Inst{13-5} = src{8-0};
- let Inst{4-0} = dst;
- }
-
-let hasNewValue = 1, opNewValue = 0 in {
- def F2_sfimm_p : T_fimm <"sfmake", IntRegs, 0b0110, 0>;
- def F2_sfimm_n : T_fimm <"sfmake", IntRegs, 0b0110, 1>;
-}
-
-def F2_dfimm_p : T_fimm <"dfmake", DoubleRegs, 0b1001, 0>;
-def F2_dfimm_n : T_fimm <"dfmake", DoubleRegs, 0b1001, 1>;
diff --git a/lib/Target/Hexagon/HexagonInstrInfoV60.td b/lib/Target/Hexagon/HexagonInstrInfoV60.td
deleted file mode 100644
index c50141b18ead..000000000000
--- a/lib/Target/Hexagon/HexagonInstrInfoV60.td
+++ /dev/null
@@ -1,2068 +0,0 @@
-//=- HexagonInstrInfoV60.td - Target Desc. for Hexagon Target -*- tablegen -*-=//
-//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
-//
-//===----------------------------------------------------------------------===//
-//
-// This file describes the Hexagon V60 instructions in TableGen format.
-//
-//===----------------------------------------------------------------------===//
-// Vector load
-let Predicates = [HasV60T, UseHVX] in
-let mayLoad = 1, validSubTargets = HasV60SubT, hasSideEffects = 0 in
- class V6_LDInst<dag outs, dag ins, string asmstr, list<dag> pattern = [],
- string cstr = "", InstrItinClass itin = CVI_VM_LD,
- IType type = TypeCVI_VM_LD>
- : InstHexagon<outs, ins, asmstr, pattern, cstr, itin, type>;
-
-// Vector store
-let Predicates = [HasV60T, UseHVX] in
-let mayStore = 1, validSubTargets = HasV60SubT, hasSideEffects = 0 in
-class V6_STInst<dag outs, dag ins, string asmstr, list<dag> pattern = [],
- string cstr = "", InstrItinClass itin = CVI_VM_ST,
- IType type = TypeCVI_VM_ST>
-: InstHexagon<outs, ins, asmstr, pattern, cstr, itin, type>;
-
-//===----------------------------------------------------------------------===//
-// Vector loads with base + immediate offset
-//===----------------------------------------------------------------------===//
-let addrMode = BaseImmOffset, accessSize = Vector64Access in
-class T_vload_ai<string asmStr>
- : V6_LDInst <(outs VectorRegs:$dst), (ins IntRegs:$src1, s4_6Imm:$src2),
- asmStr>;
-
-let isCodeGenOnly = 1, addrMode = BaseImmOffset, accessSize = Vector128Access in
-class T_vload_ai_128B<string asmStr>
- : V6_LDInst <(outs VectorRegs128B:$dst), (ins IntRegs:$src1, s4_7Imm:$src2),
- asmStr>;
-
-let isCVLoadable = 1, hasNewValue = 1 in {
- def V6_vL32b_ai : T_vload_ai <"$dst = vmem($src1+#$src2)">,
- V6_vL32b_ai_enc;
- def V6_vL32b_nt_ai : T_vload_ai <"$dst = vmem($src1+#$src2):nt">,
- V6_vL32b_nt_ai_enc;
- // 128B
- def V6_vL32b_ai_128B : T_vload_ai_128B <"$dst = vmem($src1+#$src2)">,
- V6_vL32b_ai_128B_enc;
- def V6_vL32b_nt_ai_128B : T_vload_ai_128B <"$dst = vmem($src1+#$src2):nt">,
- V6_vL32b_nt_ai_128B_enc;
-}
-
-let Itinerary = CVI_VM_VP_LDU, Type = TypeCVI_VM_VP_LDU, hasNewValue = 1 in {
- def V6_vL32Ub_ai : T_vload_ai <"$dst = vmemu($src1+#$src2)">,
- V6_vL32Ub_ai_enc;
- def V6_vL32Ub_ai_128B : T_vload_ai_128B <"$dst = vmemu($src1+#$src2)">,
- V6_vL32Ub_ai_128B_enc;
-}
-
-let Itinerary = CVI_VM_LD, Type = TypeCVI_VM_LD, isCVLoad = 1,
- hasNewValue = 1 in {
- def V6_vL32b_cur_ai : T_vload_ai <"$dst.cur = vmem($src1+#$src2)">,
- V6_vL32b_cur_ai_enc;
- def V6_vL32b_nt_cur_ai : T_vload_ai <"$dst.cur = vmem($src1+#$src2):nt">,
- V6_vL32b_nt_cur_ai_enc;
- // 128B
- def V6_vL32b_cur_ai_128B : T_vload_ai_128B
- <"$dst.cur = vmem($src1+#$src2)">,
- V6_vL32b_cur_ai_128B_enc;
- def V6_vL32b_nt_cur_ai_128B : T_vload_ai_128B
- <"$dst.cur = vmem($src1+#$src2):nt">,
- V6_vL32b_nt_cur_ai_128B_enc;
-}
-
-
-let Itinerary = CVI_VM_TMP_LD, Type = TypeCVI_VM_TMP_LD, hasNewValue = 1 in {
- def V6_vL32b_tmp_ai : T_vload_ai <"$dst.tmp = vmem($src1+#$src2)">,
- V6_vL32b_tmp_ai_enc;
- def V6_vL32b_nt_tmp_ai : T_vload_ai <"$dst.tmp = vmem($src1+#$src2):nt">,
- V6_vL32b_nt_tmp_ai_enc;
- // 128B
- def V6_vL32b_tmp_ai_128B : T_vload_ai_128B
- <"$dst.tmp = vmem($src1+#$src2)">,
- V6_vL32b_tmp_ai_128B_enc;
- def V6_vL32b_nt_tmp_ai_128B : T_vload_ai_128B
- <"$dst.tmp = vmem($src1+#$src2)">,
- V6_vL32b_nt_tmp_ai_128B_enc;
-}
-
-//===----------------------------------------------------------------------===//
-// Vector stores with base + immediate offset - unconditional
-//===----------------------------------------------------------------------===//
-let addrMode = BaseImmOffset, accessSize = Vector64Access, isPredicable = 1 in
-class T_vstore_ai <string mnemonic, string baseOp, Operand ImmOp,
- RegisterClass RC, bit isNT>
- : V6_STInst <(outs), (ins IntRegs:$src1, ImmOp:$src2, RC:$src3),
- mnemonic#"($src1+#$src2)"#!if(isNT, ":nt", "")#" = $src3">, NewValueRel {
- let BaseOpcode = baseOp;
-}
-
-let accessSize = Vector64Access in
-class T_vstore_ai_64B <string mnemonic, string baseOp, bit isNT = 0>
- : T_vstore_ai <mnemonic, baseOp, s4_6Imm, VectorRegs, isNT>;
-
-let isCodeGenOnly = 1, accessSize = Vector128Access in
-class T_vstore_ai_128B <string mnemonic, string baseOp, bit isNT = 0>
- : T_vstore_ai <mnemonic, baseOp#"128B", s4_7Imm, VectorRegs128B, isNT>;
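-
-// Note (illustrative): per the sN_MImm naming used throughout, the 64-byte
-// forms take s4_6Imm (4-bit signed offset in 64-byte units) and the 128B
-// forms take s4_7Imm, matching the doubled vector length; the 128B classes
-// are additionally isCodeGenOnly.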
-
-let isNVStorable = 1 in {
- def V6_vS32b_ai : T_vstore_ai_64B <"vmem", "vS32b_ai">,
- V6_vS32b_ai_enc;
- def V6_vS32b_ai_128B : T_vstore_ai_128B <"vmem", "vS32b_ai">,
- V6_vS32b_ai_128B_enc;
-}
-
-let isNVStorable = 1, isNonTemporal = 1 in {
- def V6_vS32b_nt_ai : T_vstore_ai_64B <"vmem", "vS32b_ai", 1>,
- V6_vS32b_nt_ai_enc;
- def V6_vS32b_nt_ai_128B : T_vstore_ai_128B <"vmem", "vS32b_ai", 1>,
- V6_vS32b_nt_ai_128B_enc;
-}
-
-let Itinerary = CVI_VM_STU, Type = TypeCVI_VM_STU in {
- def V6_vS32Ub_ai : T_vstore_ai_64B <"vmemu", "vS32Ub_ai">,
- V6_vS32Ub_ai_enc;
- def V6_vS32Ub_ai_128B : T_vstore_ai_128B <"vmemu", "vS32Ub_ai">,
- V6_vS32Ub_ai_128B_enc;
-}
-//===----------------------------------------------------------------------===//
-// Vector stores with base + immediate offset - unconditional new
-//===----------------------------------------------------------------------===//
-let addrMode = BaseImmOffset, isNewValue = 1, opNewValue = 2, isNVStore = 1,
- isPredicable = 1, Itinerary = CVI_VM_NEW_ST, Type = TypeCVI_VM_NEW_ST in
-class T_vstore_new_ai <string baseOp, Operand ImmOp, RegisterClass RC, bit isNT>
- : V6_STInst <(outs ), (ins IntRegs:$src1, ImmOp:$src2, RC:$src3),
- "vmem($src1+#$src2)"#!if(isNT, ":nt", "")#" = $src3.new">, NewValueRel {
- let BaseOpcode = baseOp;
-}
-
-let accessSize = Vector64Access in
-class T_vstore_new_ai_64B <string baseOp, bit isNT = 0>
- : T_vstore_new_ai <baseOp, s4_6Imm, VectorRegs, isNT>;
-
-let isCodeGenOnly = 1, accessSize = Vector128Access in
-class T_vstore_new_ai_128B <string baseOp, bit isNT = 0>
- : T_vstore_new_ai <baseOp#"128B", s4_7Imm, VectorRegs128B, isNT>;
-
-def V6_vS32b_new_ai : T_vstore_new_ai_64B <"vS32b_ai">, V6_vS32b_new_ai_enc;
-def V6_vS32b_new_ai_128B : T_vstore_new_ai_128B <"vS32b_ai">,
- V6_vS32b_new_ai_128B_enc;
-
-let isNonTemporal = 1 in {
- def V6_vS32b_nt_new_ai : T_vstore_new_ai_64B<"vS32b_ai", 1>,
- V6_vS32b_nt_new_ai_enc;
- def V6_vS32b_nt_new_ai_128B : T_vstore_new_ai_128B<"vS32b_ai", 1>,
- V6_vS32b_nt_new_ai_128B_enc;
-}
-
-//===----------------------------------------------------------------------===//
-// Vector stores with base + immediate offset - conditional
-//===----------------------------------------------------------------------===//
-let addrMode = BaseImmOffset, isPredicated = 1 in
-class T_vstore_pred_ai <string mnemonic, string baseOp, Operand ImmOp,
- RegisterClass RC, bit isPredNot = 0, bit isNT = 0>
- : V6_STInst <(outs),
- (ins PredRegs:$src1, IntRegs:$src2, ImmOp:$src3, RC:$src4),
- "if ("#!if(isPredNot, "!", "")#"$src1) "
- #mnemonic#"($src2+#$src3)"#!if(isNT, ":nt", "")#" = $src4">, NewValueRel {
- let isPredicatedFalse = isPredNot;
- let BaseOpcode = baseOp;
-}
-
-let accessSize = Vector64Access in
-class T_vstore_pred_ai_64B <string mnemonic, string baseOp,
- bit isPredNot = 0, bit isNT = 0>
- : T_vstore_pred_ai <mnemonic, baseOp, s4_6Imm, VectorRegs, isPredNot, isNT>;
-
-let isCodeGenOnly = 1, accessSize = Vector128Access in
-class T_vstore_pred_ai_128B <string mnemonic, string baseOp,
- bit isPredNot = 0, bit isNT = 0>
- : T_vstore_pred_ai <mnemonic, baseOp#"128B", s4_7Imm, VectorRegs128B,
- isPredNot, isNT>;
-
-let isNVStorable = 1 in {
- def V6_vS32b_pred_ai : T_vstore_pred_ai_64B <"vmem", "vS32b_ai">,
- V6_vS32b_pred_ai_enc;
- def V6_vS32b_npred_ai : T_vstore_pred_ai_64B <"vmem", "vS32b_ai", 1>,
- V6_vS32b_npred_ai_enc;
- // 128B
- def V6_vS32b_pred_ai_128B : T_vstore_pred_ai_128B <"vmem", "vS32b_ai">,
- V6_vS32b_pred_ai_128B_enc;
- def V6_vS32b_npred_ai_128B : T_vstore_pred_ai_128B <"vmem", "vS32b_ai", 1>,
- V6_vS32b_npred_ai_128B_enc;
-}
-
-
-let isNVStorable = 1, isNonTemporal = 1 in {
- def V6_vS32b_nt_pred_ai : T_vstore_pred_ai_64B <"vmem", "vS32b_ai", 0, 1>,
- V6_vS32b_nt_pred_ai_enc;
- def V6_vS32b_nt_npred_ai : T_vstore_pred_ai_64B <"vmem", "vS32b_ai", 1, 1>,
- V6_vS32b_nt_npred_ai_enc;
- // 128B
- def V6_vS32b_nt_pred_ai_128B : T_vstore_pred_ai_128B
- <"vmem", "vS32b_ai", 0, 1>,
- V6_vS32b_nt_pred_ai_128B_enc;
- def V6_vS32b_nt_npred_ai_128B : T_vstore_pred_ai_128B
- <"vmem", "vS32b_ai", 1, 1>,
- V6_vS32b_nt_npred_ai_128B_enc;
-}
-
-let Itinerary = CVI_VM_STU, Type = TypeCVI_VM_STU in {
- def V6_vS32Ub_pred_ai : T_vstore_pred_ai_64B <"vmemu", "vS32Ub_ai">,
- V6_vS32Ub_pred_ai_enc;
- def V6_vS32Ub_npred_ai : T_vstore_pred_ai_64B <"vmemu", "vS32Ub_ai", 1>,
- V6_vS32Ub_npred_ai_enc;
- // 128B
- def V6_vS32Ub_pred_ai_128B :T_vstore_pred_ai_128B <"vmemu", "vS32Ub_ai">,
- V6_vS32Ub_pred_ai_128B_enc;
- def V6_vS32Ub_npred_ai_128B :T_vstore_pred_ai_128B <"vmemu", "vS32Ub_ai", 1>,
- V6_vS32Ub_npred_ai_128B_enc;
-}
-
-//===----------------------------------------------------------------------===//
-// Vector stores with base + immediate offset - byte-enabled aligned
-//===----------------------------------------------------------------------===//
-let addrMode = BaseImmOffset in
-class T_vstore_qpred_ai <Operand ImmOp, RegisterClass RC,
- bit isPredNot = 0, bit isNT = 0>
- : V6_STInst <(outs),
- (ins VecPredRegs:$src1, IntRegs:$src2, ImmOp:$src3, RC:$src4),
- "if ("#!if(isPredNot, "!", "")#"$src1) vmem($src2+#$src3)"
- #!if(isNT, ":nt", "")#" = $src4"> {
- let isPredicatedFalse = isPredNot;
-}
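-
-// Note (assumption): the "qpred" forms take a vector predicate (VecPredRegs)
-// instead of a scalar predicate; being byte-enabled, only the lanes whose
-// predicate bits are set are written to memory.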
-
-let accessSize = Vector64Access in
-class T_vstore_qpred_ai_64B <bit isPredNot = 0, bit isNT = 0>
- : T_vstore_qpred_ai <s4_6Imm, VectorRegs, isPredNot, isNT>;
-
-let isCodeGenOnly = 1, accessSize = Vector128Access in
-class T_vstore_qpred_ai_128B <bit isPredNot = 0, bit isNT = 0>
- : T_vstore_qpred_ai <s4_7Imm, VectorRegs128B, isPredNot, isNT>;
-
-def V6_vS32b_qpred_ai : T_vstore_qpred_ai_64B, V6_vS32b_qpred_ai_enc;
-def V6_vS32b_nqpred_ai : T_vstore_qpred_ai_64B <1>,
- V6_vS32b_nqpred_ai_enc;
-def V6_vS32b_nt_qpred_ai : T_vstore_qpred_ai_64B <0, 1>,
- V6_vS32b_nt_qpred_ai_enc;
-def V6_vS32b_nt_nqpred_ai : T_vstore_qpred_ai_64B <1, 1>,
- V6_vS32b_nt_nqpred_ai_enc;
-// 128B
-def V6_vS32b_qpred_ai_128B : T_vstore_qpred_ai_128B, V6_vS32b_qpred_ai_128B_enc;
-def V6_vS32b_nqpred_ai_128B : T_vstore_qpred_ai_128B<1>,
- V6_vS32b_nqpred_ai_128B_enc;
-def V6_vS32b_nt_qpred_ai_128B : T_vstore_qpred_ai_128B<0, 1>,
- V6_vS32b_nt_qpred_ai_128B_enc;
-def V6_vS32b_nt_nqpred_ai_128B : T_vstore_qpred_ai_128B<1, 1>,
- V6_vS32b_nt_nqpred_ai_128B_enc;
-
-
-//===----------------------------------------------------------------------===//
-// Vector stores with base + immediate offset - conditional new
-//===----------------------------------------------------------------------===//
-let addrMode = BaseImmOffset, isPredicated = 1, isNewValue = 1, opNewValue = 3,
- isNVStore = 1, Type = TypeCVI_VM_NEW_ST, Itinerary = CVI_VM_NEW_ST in
-class T_vstore_new_pred_ai <string baseOp, Operand ImmOp, RegisterClass RC,
- bit isPredNot, bit isNT>
- : V6_STInst <(outs),
- (ins PredRegs:$src1, IntRegs:$src2, ImmOp:$src3, RC:$src4),
- "if("#!if(isPredNot, "!", "")#"$src1) vmem($src2+#$src3)"
- #!if(isNT, ":nt", "")#" = $src4.new">, NewValueRel {
- let isPredicatedFalse = isPredNot;
- let BaseOpcode = baseOp;
-}
-
-let accessSize = Vector64Access in
-class T_vstore_new_pred_ai_64B <string baseOp, bit isPredNot = 0, bit isNT = 0>
- : T_vstore_new_pred_ai <baseOp, s4_6Imm, VectorRegs, isPredNot, isNT>;
-
-let isCodeGenOnly = 1, accessSize = Vector128Access in
-class T_vstore_new_pred_ai_128B <string baseOp, bit isPredNot = 0, bit isNT = 0>
- : T_vstore_new_pred_ai <baseOp#"128B", s4_7Imm, VectorRegs128B,
- isPredNot, isNT>;
-
-
-def V6_vS32b_new_pred_ai : T_vstore_new_pred_ai_64B <"vS32b_ai">,
- V6_vS32b_new_pred_ai_enc;
-def V6_vS32b_new_npred_ai : T_vstore_new_pred_ai_64B <"vS32b_ai", 1>,
- V6_vS32b_new_npred_ai_enc;
-// 128B
-def V6_vS32b_new_pred_ai_128B : T_vstore_new_pred_ai_128B <"vS32b_ai">,
- V6_vS32b_new_pred_ai_128B_enc;
-def V6_vS32b_new_npred_ai_128B : T_vstore_new_pred_ai_128B <"vS32b_ai", 1>,
- V6_vS32b_new_npred_ai_128B_enc;
-let isNonTemporal = 1 in {
- def V6_vS32b_nt_new_pred_ai : T_vstore_new_pred_ai_64B <"vS32b_ai", 0, 1>,
- V6_vS32b_nt_new_pred_ai_enc;
- def V6_vS32b_nt_new_npred_ai : T_vstore_new_pred_ai_64B <"vS32b_ai", 1, 1>,
- V6_vS32b_nt_new_npred_ai_enc;
- // 128B
- def V6_vS32b_nt_new_pred_ai_128B : T_vstore_new_pred_ai_128B
- <"vS32b_ai", 0, 1>,
- V6_vS32b_nt_new_pred_ai_128B_enc;
- def V6_vS32b_nt_new_npred_ai_128B : T_vstore_new_pred_ai_128B
- <"vS32b_ai", 1, 1>,
- V6_vS32b_nt_new_npred_ai_128B_enc;
-}
-
-//===----------------------------------------------------------------------===//
-// Post increment vector loads with immediate offset.
-//===----------------------------------------------------------------------===//
-let addrMode = PostInc, hasNewValue = 1 in
-class T_vload_pi<string asmStr, Operand ImmOp, RegisterClass RC>
- : V6_LDInst <(outs RC:$dst, IntRegs:$_dst_),
- (ins IntRegs:$src1, ImmOp:$src2), asmStr, [],
- "$src1 = $_dst_">;
-
-let accessSize = Vector64Access in
-class T_vload_pi_64B <string asmStr>
- : T_vload_pi <asmStr, s3_6Imm, VectorRegs>;
-
-let isCodeGenOnly = 1, accessSize = Vector128Access in
-class T_vload_pi_128B <string asmStr>
- : T_vload_pi <asmStr, s3_7Imm, VectorRegs128B>;
-
-let isCVLoadable = 1 in {
- def V6_vL32b_pi : T_vload_pi_64B <"$dst = vmem($src1++#$src2)">,
- V6_vL32b_pi_enc;
- def V6_vL32b_nt_pi : T_vload_pi_64B <"$dst = vmem($src1++#$src2):nt">,
- V6_vL32b_nt_pi_enc;
- // 128B
- def V6_vL32b_pi_128B : T_vload_pi_128B <"$dst = vmem($src1++#$src2)">,
- V6_vL32b_pi_128B_enc;
- def V6_vL32b_nt_pi_128B : T_vload_pi_128B <"$dst = vmem($src1++#$src2):nt">,
- V6_vL32b_nt_pi_128B_enc;
-}
-
-let Itinerary = CVI_VM_VP_LDU, Type = TypeCVI_VM_VP_LDU in {
- def V6_vL32Ub_pi : T_vload_pi_64B <"$dst = vmemu($src1++#$src2)">,
- V6_vL32Ub_pi_enc;
- // 128B
- def V6_vL32Ub_pi_128B : T_vload_pi_128B <"$dst = vmemu($src1++#$src2)">,
- V6_vL32Ub_pi_128B_enc;
-}
-
-let isCVLoad = 1, Itinerary = CVI_VM_LD, Type = TypeCVI_VM_LD in {
- def V6_vL32b_cur_pi : T_vload_pi_64B <"$dst.cur = vmem($src1++#$src2)">,
- V6_vL32b_cur_pi_enc;
- def V6_vL32b_nt_cur_pi : T_vload_pi_64B <"$dst.cur = vmem($src1++#$src2):nt">,
- V6_vL32b_nt_cur_pi_enc;
- // 128B
- def V6_vL32b_cur_pi_128B : T_vload_pi_128B
- <"$dst.cur = vmem($src1++#$src2)">,
- V6_vL32b_cur_pi_128B_enc;
- def V6_vL32b_nt_cur_pi_128B : T_vload_pi_128B
- <"$dst.cur = vmem($src1++#$src2):nt">,
- V6_vL32b_nt_cur_pi_128B_enc;
-}
-
-let Itinerary = CVI_VM_TMP_LD, Type = TypeCVI_VM_TMP_LD in {
- def V6_vL32b_tmp_pi : T_vload_pi_64B <"$dst.tmp = vmem($src1++#$src2)">,
- V6_vL32b_tmp_pi_enc;
- def V6_vL32b_nt_tmp_pi : T_vload_pi_64B <"$dst.tmp = vmem($src1++#$src2):nt">,
- V6_vL32b_nt_tmp_pi_enc;
- // 128B
- def V6_vL32b_tmp_pi_128B : T_vload_pi_128B
- <"$dst.tmp = vmem($src1++#$src2)">,
- V6_vL32b_tmp_pi_128B_enc;
- def V6_vL32b_nt_tmp_pi_128B : T_vload_pi_128B
- <"$dst.tmp = vmem($src1++#$src2):nt">,
- V6_vL32b_nt_tmp_pi_128B_enc;
-}
-
-//===----------------------------------------------------------------------===//
-// Post increment vector stores with immediate offset.
-//===----------------------------------------------------------------------===//
-let addrMode = PostInc, isPredicable = 1 in
-class T_vstore_pi <string mnemonic, string baseOp, Operand ImmOp,
- RegisterClass RC, bit isNT>
- : V6_STInst <(outs IntRegs:$_dst_),
- (ins IntRegs:$src1, ImmOp:$src2, RC:$src3),
- mnemonic#"($src1++#$src2)"#!if(isNT, ":nt", "")#" = $src3", [],
- "$src1 = $_dst_">, NewValueRel {
- let BaseOpcode = baseOp;
-}
-
-let accessSize = Vector64Access in
-class T_vstore_pi_64B <string mnemonic, string baseOp, bit isNT = 0>
- : T_vstore_pi <mnemonic, baseOp, s3_6Imm, VectorRegs, isNT>;
-
-let isCodeGenOnly = 1, accessSize = Vector128Access in
-class T_vstore_pi_128B <string mnemonic, string baseOp, bit isNT = 0>
- : T_vstore_pi <mnemonic, baseOp#"128B", s3_7Imm, VectorRegs128B, isNT>;
-
-let isNVStorable = 1 in {
- def V6_vS32b_pi : T_vstore_pi_64B <"vmem", "vS32b_pi">, V6_vS32b_pi_enc;
- def V6_vS32b_pi_128B : T_vstore_pi_128B <"vmem", "vS32b_pi">,
- V6_vS32b_pi_128B_enc;
-}
-
-let isNVStorable = 1 , isNonTemporal = 1 in {
- def V6_vS32b_nt_pi : T_vstore_pi_64B <"vmem", "vS32b_pi", 1>,
- V6_vS32b_nt_pi_enc;
- def V6_vS32b_nt_pi_128B : T_vstore_pi_128B <"vmem", "vS32b_pi", 1>,
- V6_vS32b_nt_pi_128B_enc;
-}
-
-
-let Itinerary = CVI_VM_STU, Type = TypeCVI_VM_STU in {
- def V6_vS32Ub_pi : T_vstore_pi_64B <"vmemu", "vS32Ub_pi">,
- V6_vS32Ub_pi_enc;
- def V6_vS32Ub_pi_128B : T_vstore_pi_128B <"vmemu", "vS32Ub_pi">,
- V6_vS32Ub_pi_128B_enc;
-}
-
-//===----------------------------------------------------------------------===//
-// Post increment unconditional .new vector stores with immediate offset.
-//===----------------------------------------------------------------------===//
-let addrMode = PostInc, Itinerary = CVI_VM_NEW_ST, Type = TypeCVI_VM_NEW_ST,
- isNewValue = 1, isPredicable = 1, opNewValue = 3, isNVStore = 1 in
-class T_vstore_new_pi <string baseOp, Operand ImmOp, RegisterClass RC, bit isNT>
- : V6_STInst <(outs IntRegs:$_dst_),
- (ins IntRegs:$src1, ImmOp:$src2, RC:$src3),
- "vmem($src1++#$src2)"#!if(isNT, ":nt", "")#" = $src3.new", [],
- "$src1 = $_dst_">, NewValueRel {
- let BaseOpcode = baseOp;
-}
-
-let accessSize = Vector64Access in
-class T_vstore_new_pi_64B <string baseOp, bit isNT = 0>
- : T_vstore_new_pi <baseOp, s3_6Imm, VectorRegs, isNT>;
-
-let isCodeGenOnly = 1, accessSize = Vector128Access in
-class T_vstore_new_pi_128B <string baseOp, bit isNT = 0>
- : T_vstore_new_pi <baseOp#"128B", s3_7Imm, VectorRegs128B, isNT>;
-
-
-def V6_vS32b_new_pi : T_vstore_new_pi_64B <"vS32b_pi">,
- V6_vS32b_new_pi_enc;
-def V6_vS32b_new_pi_128B : T_vstore_new_pi_128B <"vS32b_pi">,
- V6_vS32b_new_pi_128B_enc;
-
-let isNonTemporal = 1 in {
- def V6_vS32b_nt_new_pi : T_vstore_new_pi_64B <"vS32b_pi", 1>,
- V6_vS32b_nt_new_pi_enc;
- def V6_vS32b_nt_new_pi_128B : T_vstore_new_pi_128B <"vS32b_pi", 1>,
- V6_vS32b_nt_new_pi_128B_enc;
-}
-
-//===----------------------------------------------------------------------===//
-// Post increment conditional vector stores with immediate offset
-//===----------------------------------------------------------------------===//
-let isPredicated = 1, addrMode = PostInc in
-class T_vstore_pred_pi <string mnemonic, string baseOp, Operand ImmOp,
- RegisterClass RC, bit isPredNot, bit isNT>
- : V6_STInst<(outs IntRegs:$_dst_),
- (ins PredRegs:$src1, IntRegs:$src2, ImmOp:$src3, RC:$src4),
- "if ("#!if(isPredNot, "!", "")#"$src1) "#mnemonic#"($src2++#$src3)"
- #!if(isNT, ":nt", "")#" = $src4", [],
- "$src2 = $_dst_">, NewValueRel {
- let isPredicatedFalse = isPredNot;
- let BaseOpcode = baseOp;
-}
-
-let accessSize = Vector64Access in
-class T_vstore_pred_pi_64B <string mnemonic, string baseOp,
- bit isPredNot = 0, bit isNT = 0>
- : T_vstore_pred_pi <mnemonic, baseOp, s3_6Imm, VectorRegs, isPredNot, isNT>;
-
-let isCodeGenOnly = 1, accessSize = Vector128Access in
-class T_vstore_pred_pi_128B <string mnemonic, string baseOp,
- bit isPredNot = 0, bit isNT = 0>
- : T_vstore_pred_pi <mnemonic, baseOp#"128B", s3_7Imm, VectorRegs128B,
- isPredNot, isNT>;
-
-let isNVStorable = 1 in {
- def V6_vS32b_pred_pi : T_vstore_pred_pi_64B <"vmem", "vS32b_pi">,
- V6_vS32b_pred_pi_enc;
- def V6_vS32b_npred_pi : T_vstore_pred_pi_64B <"vmem", "vS32b_pi", 1>,
- V6_vS32b_npred_pi_enc;
- // 128B
- def V6_vS32b_pred_pi_128B : T_vstore_pred_pi_128B <"vmem", "vS32b_pi">,
- V6_vS32b_pred_pi_128B_enc;
- def V6_vS32b_npred_pi_128B : T_vstore_pred_pi_128B <"vmem", "vS32b_pi", 1>,
- V6_vS32b_npred_pi_128B_enc;
-}
-let isNVStorable = 1, isNonTemporal = 1 in {
- def V6_vS32b_nt_pred_pi : T_vstore_pred_pi_64B <"vmem", "vS32b_pi", 0, 1>,
- V6_vS32b_nt_pred_pi_enc;
- def V6_vS32b_nt_npred_pi : T_vstore_pred_pi_64B <"vmem", "vS32b_pi", 1, 1>,
- V6_vS32b_nt_npred_pi_enc;
- // 128B
- def V6_vS32b_nt_pred_pi_128B : T_vstore_pred_pi_128B
- <"vmem", "vS32b_pi", 0, 1>,
- V6_vS32b_nt_pred_pi_128B_enc;
- def V6_vS32b_nt_npred_pi_128B : T_vstore_pred_pi_128B
- <"vmem", "vS32b_pi", 1, 1>,
- V6_vS32b_nt_npred_pi_128B_enc;
-}
-
-let Itinerary = CVI_VM_STU, Type = TypeCVI_VM_STU in {
- def V6_vS32Ub_pred_pi : T_vstore_pred_pi_64B <"vmemu", "vS32Ub_pi">,
- V6_vS32Ub_pred_pi_enc;
- def V6_vS32Ub_npred_pi : T_vstore_pred_pi_64B <"vmemu", "vS32Ub_pi", 1>,
- V6_vS32Ub_npred_pi_enc;
- // 128B
- def V6_vS32Ub_pred_pi_128B : T_vstore_pred_pi_128B <"vmemu", "vS32Ub_pi">,
- V6_vS32Ub_pred_pi_128B_enc;
- def V6_vS32Ub_npred_pi_128B : T_vstore_pred_pi_128B <"vmemu", "vS32Ub_pi", 1>,
- V6_vS32Ub_npred_pi_128B_enc;
-}
-
-//===----------------------------------------------------------------------===//
-// Post increment vector stores with immediate offset - byte-enabled aligned
-//===----------------------------------------------------------------------===//
-let addrMode = PostInc in
-class T_vstore_qpred_pi <Operand ImmOp, RegisterClass RC, bit isPredNot = 0,
- bit isNT = 0>
- : V6_STInst <(outs IntRegs:$_dst_),
- (ins VecPredRegs:$src1, IntRegs:$src2, ImmOp:$src3, RC:$src4),
- "if ("#!if(isPredNot, "!", "")#"$src1) vmem($src2++#$src3)"
- #!if(isNT, ":nt", "")#" = $src4", [],
- "$src2 = $_dst_">;
-
-let accessSize = Vector64Access in
-class T_vstore_qpred_pi_64B <bit isPredNot = 0, bit isNT = 0>
- : T_vstore_qpred_pi <s3_6Imm, VectorRegs, isPredNot, isNT>;
-
-let isCodeGenOnly = 1, accessSize = Vector128Access in
-class T_vstore_qpred_pi_128B <bit isPredNot = 0, bit isNT = 0>
- : T_vstore_qpred_pi <s3_7Imm, VectorRegs128B, isPredNot, isNT>;
-
-def V6_vS32b_qpred_pi : T_vstore_qpred_pi_64B, V6_vS32b_qpred_pi_enc;
-def V6_vS32b_nqpred_pi : T_vstore_qpred_pi_64B <1>, V6_vS32b_nqpred_pi_enc;
-// 128B
-def V6_vS32b_qpred_pi_128B : T_vstore_qpred_pi_128B,
- V6_vS32b_qpred_pi_128B_enc;
-def V6_vS32b_nqpred_pi_128B : T_vstore_qpred_pi_128B<1>,
- V6_vS32b_nqpred_pi_128B_enc;
-
-let isNonTemporal = 1 in {
- def V6_vS32b_nt_qpred_pi : T_vstore_qpred_pi_64B <0, 1>,
- V6_vS32b_nt_qpred_pi_enc;
- def V6_vS32b_nt_nqpred_pi : T_vstore_qpred_pi_64B <1, 1>,
- V6_vS32b_nt_nqpred_pi_enc;
- // 128B
- def V6_vS32b_nt_qpred_pi_128B : T_vstore_qpred_pi_128B<0, 1>,
- V6_vS32b_nt_qpred_pi_128B_enc;
- def V6_vS32b_nt_nqpred_pi_128B : T_vstore_qpred_pi_128B<1, 1>,
- V6_vS32b_nt_nqpred_pi_128B_enc;
-}
-
-//===----------------------------------------------------------------------===//
-// Post increment conditional .new vector stores with immediate offset
-//===----------------------------------------------------------------------===//
-let Itinerary = CVI_VM_NEW_ST, Type = TypeCVI_VM_NEW_ST, isPredicated = 1,
- isNewValue = 1, opNewValue = 4, addrMode = PostInc, isNVStore = 1 in
-class T_vstore_new_pred_pi <string baseOp, Operand ImmOp, RegisterClass RC,
- bit isPredNot, bit isNT>
- : V6_STInst <(outs IntRegs:$_dst_),
- (ins PredRegs:$src1, IntRegs:$src2, ImmOp:$src3, RC:$src4),
- "if("#!if(isPredNot, "!", "")#"$src1) vmem($src2++#$src3)"
- #!if(isNT, ":nt", "")#" = $src4.new", [],
- "$src2 = $_dst_"> , NewValueRel {
- let isPredicatedFalse = isPredNot;
- let BaseOpcode = baseOp;
-}
-
-let accessSize = Vector64Access in
-class T_vstore_new_pred_pi_64B <string baseOp, bit isPredNot = 0, bit isNT = 0>
- : T_vstore_new_pred_pi <baseOp, s3_6Imm, VectorRegs, isPredNot, isNT>;
-
-let isCodeGenOnly = 1, accessSize = Vector128Access in
-class T_vstore_new_pred_pi_128B <string baseOp, bit isPredNot = 0, bit isNT = 0>
- : T_vstore_new_pred_pi <baseOp#"128B", s3_7Imm, VectorRegs128B,
- isPredNot, isNT>;
-
-def V6_vS32b_new_pred_pi : T_vstore_new_pred_pi_64B <"vS32b_pi">,
- V6_vS32b_new_pred_pi_enc;
-def V6_vS32b_new_npred_pi : T_vstore_new_pred_pi_64B <"vS32b_pi", 1>,
- V6_vS32b_new_npred_pi_enc;
-// 128B
-def V6_vS32b_new_pred_pi_128B : T_vstore_new_pred_pi_128B <"vS32b_pi">,
- V6_vS32b_new_pred_pi_128B_enc;
-def V6_vS32b_new_npred_pi_128B : T_vstore_new_pred_pi_128B <"vS32b_pi", 1>,
- V6_vS32b_new_npred_pi_128B_enc;
-let isNonTemporal = 1 in {
- def V6_vS32b_nt_new_pred_pi : T_vstore_new_pred_pi_64B <"vS32b_pi", 0, 1>,
- V6_vS32b_nt_new_pred_pi_enc;
- def V6_vS32b_nt_new_npred_pi : T_vstore_new_pred_pi_64B <"vS32b_pi", 1, 1>,
- V6_vS32b_nt_new_npred_pi_enc;
- // 128B
- def V6_vS32b_nt_new_pred_pi_128B : T_vstore_new_pred_pi_128B
- <"vS32b_pi", 0, 1>,
- V6_vS32b_nt_new_pred_pi_128B_enc;
- def V6_vS32b_nt_new_npred_pi_128B : T_vstore_new_pred_pi_128B
- <"vS32b_pi", 1, 1>,
- V6_vS32b_nt_new_npred_pi_128B_enc;
-}
-
-//===----------------------------------------------------------------------===//
-// Post increment vector loads with register offset
-//===----------------------------------------------------------------------===//
-let hasNewValue = 1 in
-class T_vload_ppu<string asmStr>
- : V6_LDInst <(outs VectorRegs:$dst, IntRegs:$_dst_),
- (ins IntRegs:$src1, ModRegs:$src2), asmStr, [],
- "$src1 = $_dst_">, NewValueRel;
-
-let isCVLoadable = 1 in {
- def V6_vL32b_ppu : T_vload_ppu <"$dst = vmem($src1++$src2)">,
- V6_vL32b_ppu_enc;
- def V6_vL32b_nt_ppu : T_vload_ppu <"$dst = vmem($src1++$src2):nt">,
- V6_vL32b_nt_ppu_enc;
-}
-
-let Itinerary = CVI_VM_VP_LDU, Type = TypeCVI_VM_VP_LDU in
-def V6_vL32Ub_ppu : T_vload_ppu <"$dst = vmemu($src1++$src2)">,
- V6_vL32Ub_ppu_enc;
-
-let isCVLoad = 1, Itinerary = CVI_VM_CUR_LD, Type = TypeCVI_VM_CUR_LD in {
- def V6_vL32b_cur_ppu : T_vload_ppu <"$dst.cur = vmem($src1++$src2)">,
- V6_vL32b_cur_ppu_enc;
- def V6_vL32b_nt_cur_ppu : T_vload_ppu <"$dst.cur = vmem($src1++$src2):nt">,
- V6_vL32b_nt_cur_ppu_enc;
-}
-
-let Itinerary = CVI_VM_TMP_LD, Type = TypeCVI_VM_TMP_LD in {
- def V6_vL32b_tmp_ppu : T_vload_ppu <"$dst.tmp = vmem($src1++$src2)">,
- V6_vL32b_tmp_ppu_enc;
- def V6_vL32b_nt_tmp_ppu : T_vload_ppu <"$dst.tmp = vmem($src1++$src2):nt">,
- V6_vL32b_nt_tmp_ppu_enc;
-}
-
-//===----------------------------------------------------------------------===//
-// Post increment vector stores with register offset
-//===----------------------------------------------------------------------===//
-let isPredicable = 1 in
-class T_vstore_ppu <string mnemonic, bit isNT = 0>
- : V6_STInst <(outs IntRegs:$_dst_),
- (ins IntRegs:$src1, ModRegs:$src2, VectorRegs:$src3),
- mnemonic#"($src1++$src2)"#!if(isNT, ":nt", "")#" = $src3", [],
- "$src1 = $_dst_">, NewValueRel;
-
-let isNVStorable = 1, BaseOpcode = "vS32b_ppu" in {
- def V6_vS32b_ppu : T_vstore_ppu <"vmem">,
- V6_vS32b_ppu_enc;
- let isNonTemporal = 1, BaseOpcode = "vS32b_ppu" in
- def V6_vS32b_nt_ppu : T_vstore_ppu <"vmem", 1>,
- V6_vS32b_nt_ppu_enc;
-}
-
-let BaseOpcode = "vS32Ub_ppu", Itinerary = CVI_VM_STU, Type = TypeCVI_VM_STU in
-def V6_vS32Ub_ppu : T_vstore_ppu <"vmemu">, V6_vS32Ub_ppu_enc;
-
-//===----------------------------------------------------------------------===//
-// Post increment .new vector stores with register offset
-//===----------------------------------------------------------------------===//
-let Itinerary = CVI_VM_NEW_ST, Type = TypeCVI_VM_NEW_ST, isNewValue = 1,
- isPredicable = 1, opNewValue = 3, isNVStore = 1 in
-class T_vstore_new_ppu <bit isNT = 0>
- : V6_STInst <(outs IntRegs:$_dst_),
- (ins IntRegs:$src1, ModRegs:$src2, VectorRegs:$src3),
- "vmem($src1++$src2)"#!if(isNT, ":nt", "")#" = $src3.new", [],
- "$src1 = $_dst_">, NewValueRel;
-
-let BaseOpcode = "vS32b_ppu" in
-def V6_vS32b_new_ppu : T_vstore_new_ppu, V6_vS32b_new_ppu_enc;
-
-let BaseOpcode = "vS32b_ppu", isNonTemporal = 1 in
-def V6_vS32b_nt_new_ppu : T_vstore_new_ppu<1>, V6_vS32b_nt_new_ppu_enc;
-
-//===----------------------------------------------------------------------===//
-// Post increment conditional vector stores with register offset
-//===----------------------------------------------------------------------===//
-let isPredicated = 1 in
-class T_vstore_pred_ppu <string mnemonic, bit isPredNot = 0, bit isNT = 0>
- : V6_STInst<(outs IntRegs:$_dst_),
- (ins PredRegs:$src1, IntRegs:$src2, ModRegs:$src3, VectorRegs:$src4),
- "if ("#!if(isPredNot, "!", "")#"$src1) "#mnemonic#"($src2++$src3)"
- #!if(isNT, ":nt", "")#" = $src4", [],
- "$src2 = $_dst_">, NewValueRel {
- let isPredicatedFalse = isPredNot;
-}
-
-let isNVStorable = 1, BaseOpcode = "vS32b_ppu" in {
- def V6_vS32b_pred_ppu : T_vstore_pred_ppu<"vmem">, V6_vS32b_pred_ppu_enc;
- def V6_vS32b_npred_ppu: T_vstore_pred_ppu<"vmem", 1>, V6_vS32b_npred_ppu_enc;
-}
-
-let isNVStorable = 1, BaseOpcode = "vS32b_ppu", isNonTemporal = 1 in {
- def V6_vS32b_nt_pred_ppu : T_vstore_pred_ppu <"vmem", 0, 1>,
- V6_vS32b_nt_pred_ppu_enc;
- def V6_vS32b_nt_npred_ppu : T_vstore_pred_ppu <"vmem", 1, 1>,
- V6_vS32b_nt_npred_ppu_enc;
-}
-
-let BaseOpcode = "vS32Ub_ppu", Itinerary = CVI_VM_STU,
- Type = TypeCVI_VM_STU in {
- def V6_vS32Ub_pred_ppu : T_vstore_pred_ppu <"vmemu">,
- V6_vS32Ub_pred_ppu_enc;
- def V6_vS32Ub_npred_ppu : T_vstore_pred_ppu <"vmemu", 1>,
- V6_vS32Ub_npred_ppu_enc;
-}
-
-//===----------------------------------------------------------------------===//
-// Post increment vector stores with register offset - byte-enabled aligned
-//===----------------------------------------------------------------------===//
-class T_vstore_qpred_ppu <bit isPredNot = 0, bit isNT = 0>
- : V6_STInst <(outs IntRegs:$_dst_),
- (ins VecPredRegs:$src1, IntRegs:$src2, ModRegs:$src3, VectorRegs:$src4),
- "if ("#!if(isPredNot, "!", "")#"$src1) vmem($src2++$src3)"
- #!if(isNT, ":nt", "")#" = $src4", [],
- "$src2 = $_dst_">, NewValueRel;
-
-def V6_vS32b_qpred_ppu : T_vstore_qpred_ppu, V6_vS32b_qpred_ppu_enc;
-def V6_vS32b_nqpred_ppu : T_vstore_qpred_ppu<1>, V6_vS32b_nqpred_ppu_enc;
-def V6_vS32b_nt_qpred_ppu : T_vstore_qpred_ppu<0, 1>,
- V6_vS32b_nt_qpred_ppu_enc;
-def V6_vS32b_nt_nqpred_ppu : T_vstore_qpred_ppu<1, 1>,
- V6_vS32b_nt_nqpred_ppu_enc;
-
-//===----------------------------------------------------------------------===//
-// Post increment conditional .new vector stores with register offset
-//===----------------------------------------------------------------------===//
-let Itinerary = CVI_VM_NEW_ST, Type = TypeCVI_VM_NEW_ST, isPredicated = 1,
- isNewValue = 1, opNewValue = 4, isNVStore = 1 in
-class T_vstore_new_pred_ppu <bit isPredNot = 0, bit isNT = 0>
- : V6_STInst <(outs IntRegs:$_dst_),
- (ins PredRegs:$src1, IntRegs:$src2, ModRegs:$src3, VectorRegs:$src4),
- "if("#!if(isPredNot, "!", "")#"$src1) vmem($src2++$src3)"
- #!if(isNT, ":nt", "")#" = $src4.new", [],
- "$src2 = $_dst_">, NewValueRel {
- let isPredicatedFalse = isPredNot;
-}
-
-let BaseOpcode = "vS32b_ppu" in {
- def V6_vS32b_new_pred_ppu : T_vstore_new_pred_ppu,
- V6_vS32b_new_pred_ppu_enc;
- def V6_vS32b_new_npred_ppu : T_vstore_new_pred_ppu<1>,
- V6_vS32b_new_npred_ppu_enc;
-}
-
-let BaseOpcode = "vS32b_ppu", isNonTemporal = 1 in {
-def V6_vS32b_nt_new_pred_ppu : T_vstore_new_pred_ppu<0, 1>,
- V6_vS32b_nt_new_pred_ppu_enc;
-def V6_vS32b_nt_new_npred_ppu : T_vstore_new_pred_ppu<1, 1>,
- V6_vS32b_nt_new_npred_ppu_enc;
-}
-
-
-// Vector load/store pseudos
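-// These pseudos cover loads/stores of vector register pairs; they carry no
-// asm syntax of their own and are presumably expanded into single-vector
-// accesses later (they never reach emission as-is).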
-
-let isPseudo = 1, isCodeGenOnly = 1, validSubTargets = HasV60SubT in
-class STrivv_template<RegisterClass RC>
- : V6_STInst<(outs), (ins IntRegs:$addr, s32_0Imm:$off, RC:$src), "", []>;
-
-def PS_vstorerw_ai: STrivv_template<VecDblRegs>,
- Requires<[HasV60T,UseHVXSgl]>;
-def PS_vstorerwu_ai: STrivv_template<VecDblRegs>,
- Requires<[HasV60T,UseHVXSgl]>;
-def PS_vstorerw_ai_128B: STrivv_template<VecDblRegs128B>,
- Requires<[HasV60T,UseHVXDbl]>;
-def PS_vstorerwu_ai_128B: STrivv_template<VecDblRegs128B>,
- Requires<[HasV60T,UseHVXDbl]>;
-
-
-let isPseudo = 1, isCodeGenOnly = 1, validSubTargets = HasV60SubT in
-class LDrivv_template<RegisterClass RC>
- : V6_LDInst<(outs RC:$dst), (ins IntRegs:$addr, s32_0Imm:$off), "", []>;
-
-def PS_vloadrw_ai: LDrivv_template<VecDblRegs>,
- Requires<[HasV60T,UseHVXSgl]>;
-def PS_vloadrwu_ai: LDrivv_template<VecDblRegs>,
- Requires<[HasV60T,UseHVXSgl]>;
-def PS_vloadrw_ai_128B: LDrivv_template<VecDblRegs128B>,
- Requires<[HasV60T,UseHVXDbl]>;
-def PS_vloadrwu_ai_128B: LDrivv_template<VecDblRegs128B>,
- Requires<[HasV60T,UseHVXDbl]>;
-
-// Store vector predicate pseudo.
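-// The ".error" asm string below makes the assembler fail loudly if one of
-// these pseudos ever survives to instruction emission.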
-let isExtendable = 1, opExtendable = 1, isExtentSigned = 1, opExtentBits = 13,
- isCodeGenOnly = 1, isPseudo = 1, mayStore = 1, hasSideEffects = 0 in {
- def PS_vstorerq_ai : STInst<(outs),
- (ins IntRegs:$base, s32_0Imm:$offset, VecPredRegs:$src1),
- ".error \"should not emit\"", []>,
- Requires<[HasV60T,UseHVXSgl]>;
- def PS_vstorerq_ai_128B : STInst<(outs),
- (ins IntRegs:$base, s32_0Imm:$offset, VecPredRegs128B:$src1),
- ".error \"should not emit\"", []>,
- Requires<[HasV60T,UseHVXDbl]>;
-}
-
-// Load vector predicate pseudo.
-let isExtendable = 1, opExtendable = 2, isExtentSigned = 1, opExtentBits = 13,
- opExtentAlign = 2, isCodeGenOnly = 1, isPseudo = 1, hasSideEffects = 0 in {
- def PS_vloadrq_ai : LDInst<(outs VecPredRegs:$dst),
- (ins IntRegs:$base, s32_0Imm:$offset),
- ".error \"should not emit\"", []>,
- Requires<[HasV60T,UseHVXSgl]>;
- def PS_vloadrq_ai_128B : LDInst<(outs VecPredRegs128B:$dst),
- (ins IntRegs:$base, s32_0Imm:$offset),
- ".error \"should not emit\"", []>,
- Requires<[HasV60T,UseHVXDbl]>;
-}
-
-class VSELInst<dag outs, dag ins, string asmstr, list<dag> pattern = [],
- string cstr = "", InstrItinClass itin = CVI_VA_DV,
- IType type = TypeCVI_VA_DV>
- : InstHexagon<outs, ins, asmstr, pattern, cstr, itin, type>;
-
-let isCodeGenOnly = 1, isPseudo = 1, hasSideEffects = 0 in {
- def PS_vselect: VSELInst<(outs VectorRegs:$dst),
- (ins PredRegs:$src1, VectorRegs:$src2, VectorRegs:$src3), "", []>,
- Requires<[HasV60T,UseHVXSgl]>;
- def PS_vselect_128B: VSELInst<(outs VectorRegs128B:$dst),
- (ins PredRegs:$src1, VectorRegs128B:$src2, VectorRegs128B:$src3),
- "", []>, Requires<[HasV60T,UseHVXDbl]>;
- def PS_wselect: VSELInst<(outs VecDblRegs:$dst),
- (ins PredRegs:$src1, VecDblRegs:$src2, VecDblRegs:$src3), "", []>,
- Requires<[HasV60T,UseHVXSgl]>;
- def PS_wselect_128B: VSELInst<(outs VecDblRegs128B:$dst),
- (ins PredRegs:$src1, VecDblRegs128B:$src2, VecDblRegs128B:$src3),
- "", []>, Requires<[HasV60T,UseHVXDbl]>;
-}
-
-let hasNewValue = 1 in
-class T_vmpy <string asmString, RegisterClass RCout, RegisterClass RCin>
- : CVI_VX_DV_Resource1<(outs RCout:$dst), (ins RCin:$src1, IntRegs:$src2),
- asmString >;
-
-multiclass T_vmpy <string asmString, RegisterClass RCout,
- RegisterClass RCin> {
- def NAME : T_vmpy <asmString, RCout, RCin>;
- let isCodeGenOnly = 1 in
- def NAME#_128B : T_vmpy <asmString, !cast<RegisterClass>(RCout#"128B"),
- !cast<RegisterClass>(RCin#"128B")>;
-}
-
-multiclass T_vmpy_VV <string asmString>:
- T_vmpy <asmString, VectorRegs, VectorRegs>;
-
-multiclass T_vmpy_WW <string asmString>:
- T_vmpy <asmString, VecDblRegs, VecDblRegs>;
-
-multiclass T_vmpy_VW <string asmString>:
- T_vmpy <asmString, VectorRegs, VecDblRegs>;
-
-multiclass T_vmpy_WV <string asmString>:
- T_vmpy <asmString, VecDblRegs, VectorRegs>;
-
-defm V6_vtmpyb :T_vmpy_WW<"$dst.h = vtmpy($src1.b,$src2.b)">, V6_vtmpyb_enc;
-defm V6_vtmpybus :T_vmpy_WW<"$dst.h = vtmpy($src1.ub,$src2.b)">, V6_vtmpybus_enc;
-defm V6_vdsaduh :T_vmpy_WW<"$dst.uw = vdsad($src1.uh,$src2.uh)">, V6_vdsaduh_enc;
-defm V6_vmpybus :T_vmpy_WV<"$dst.h = vmpy($src1.ub,$src2.b)">, V6_vmpybus_enc;
-defm V6_vmpabus :T_vmpy_WW<"$dst.h = vmpa($src1.ub,$src2.b)">, V6_vmpabus_enc;
-defm V6_vmpahb :T_vmpy_WW<"$dst.w = vmpa($src1.h,$src2.b)">, V6_vmpahb_enc;
-defm V6_vmpyh :T_vmpy_WV<"$dst.w = vmpy($src1.h,$src2.h)">, V6_vmpyh_enc;
-defm V6_vmpyuh :T_vmpy_WV<"$dst.uw = vmpy($src1.uh,$src2.uh)">, V6_vmpyuh_enc;
-defm V6_vmpyiwh :T_vmpy_VV<"$dst.w = vmpyi($src1.w,$src2.h)">, V6_vmpyiwh_enc;
-defm V6_vtmpyhb :T_vmpy_WW<"$dst.w = vtmpy($src1.h,$src2.b)">, V6_vtmpyhb_enc;
-defm V6_vmpyub :T_vmpy_WV<"$dst.uh = vmpy($src1.ub,$src2.ub)">, V6_vmpyub_enc;
-
-let Itinerary = CVI_VX_LONG, Type = TypeCVI_VX in
-defm V6_vmpyihb :T_vmpy_VV<"$dst.h = vmpyi($src1.h,$src2.b)">, V6_vmpyihb_enc;
-
-defm V6_vdmpybus_dv :
- T_vmpy_WW <"$dst.h = vdmpy($src1.ub,$src2.b)">, V6_vdmpybus_dv_enc;
-defm V6_vdmpyhsusat :
- T_vmpy_VV <"$dst.w = vdmpy($src1.h,$src2.uh):sat">, V6_vdmpyhsusat_enc;
-defm V6_vdmpyhsuisat :
- T_vmpy_VW <"$dst.w = vdmpy($src1.h,$src2.uh,#1):sat">, V6_vdmpyhsuisat_enc;
-defm V6_vdmpyhsat :
- T_vmpy_VV <"$dst.w = vdmpy($src1.h,$src2.h):sat">, V6_vdmpyhsat_enc;
-defm V6_vdmpyhisat :
- T_vmpy_VW <"$dst.w = vdmpy($src1.h,$src2.h):sat">, V6_vdmpyhisat_enc;
-defm V6_vdmpyhb_dv :
- T_vmpy_WW <"$dst.w = vdmpy($src1.h,$src2.b)">, V6_vdmpyhb_dv_enc;
-defm V6_vmpyhss :
- T_vmpy_VV <"$dst.h = vmpy($src1.h,$src2.h):<<1:sat">, V6_vmpyhss_enc;
-defm V6_vmpyhsrs :
- T_vmpy_VV <"$dst.h = vmpy($src1.h,$src2.h):<<1:rnd:sat">, V6_vmpyhsrs_enc;
-
-let Itinerary = CVI_VP, Type = TypeCVI_VP in
-defm V6_vror : T_vmpy_VV <"$dst = vror($src1,$src2)">, V6_vror_enc;
-
-let Itinerary = CVI_VX, Type = TypeCVI_VX in {
-defm V6_vdmpyhb : T_vmpy_VV<"$dst.w = vdmpy($src1.h,$src2.b)">, V6_vdmpyhb_enc;
-defm V6_vrmpybus : T_vmpy_VV<"$dst.w = vrmpy($src1.ub,$src2.b)">, V6_vrmpybus_enc;
-defm V6_vdmpybus : T_vmpy_VV<"$dst.h = vdmpy($src1.ub,$src2.b)">, V6_vdmpybus_enc;
-defm V6_vmpyiwb : T_vmpy_VV<"$dst.w = vmpyi($src1.w,$src2.b)">, V6_vmpyiwb_enc;
-defm V6_vrmpyub : T_vmpy_VV<"$dst.uw = vrmpy($src1.ub,$src2.ub)">, V6_vrmpyub_enc;
-}
-
-let Itinerary = CVI_VS, Type = TypeCVI_VS in {
-defm V6_vasrw : T_vmpy_VV <"$dst.w = vasr($src1.w,$src2)">, V6_vasrw_enc;
-defm V6_vasrh : T_vmpy_VV <"$dst.h = vasr($src1.h,$src2)">, V6_vasrh_enc;
-defm V6_vaslw : T_vmpy_VV <"$dst.w = vasl($src1.w,$src2)">, V6_vaslw_enc;
-defm V6_vaslh : T_vmpy_VV <"$dst.h = vasl($src1.h,$src2)">, V6_vaslh_enc;
-defm V6_vlsrw : T_vmpy_VV <"$dst.uw = vlsr($src1.uw,$src2)">, V6_vlsrw_enc;
-defm V6_vlsrh : T_vmpy_VV <"$dst.uh = vlsr($src1.uh,$src2)">, V6_vlsrh_enc;
-}
-
-let hasNewValue = 1 in
-class T_HVX_alu <string asmString, InstrItinClass itin,
- RegisterClass RCout, RegisterClass RCin>
- : CVI_VA_Resource1 <(outs RCout:$dst), (ins RCin:$src1, RCin:$src2),
- asmString >{
- let Itinerary = itin;
- let Type = !cast<IType>("Type"#itin);
-}
-
-multiclass T_HVX_alu <string asmString, RegisterClass RCout,
- RegisterClass RCin, InstrItinClass itin> {
- def NAME : T_HVX_alu <asmString, itin, RCout, RCin>;
- let isCodeGenOnly = 1 in
- def NAME#_128B : T_HVX_alu <asmString, itin,
- !cast<RegisterClass>(RCout#"128B"),
- !cast<RegisterClass>(RCin#"128B")>;
-}
-
-multiclass T_HVX_alu_VV <string asmString>:
- T_HVX_alu <asmString, VectorRegs, VectorRegs, CVI_VA>;
-
-multiclass T_HVX_alu_WW <string asmString>:
- T_HVX_alu <asmString, VecDblRegs, VecDblRegs, CVI_VA_DV>;
-
-multiclass T_HVX_alu_WV <string asmString>:
- T_HVX_alu <asmString, VecDblRegs, VectorRegs, CVI_VX_DV>;
-
-
-let Itinerary = CVI_VX, Type = TypeCVI_VX in {
-defm V6_vrmpyubv :
- T_HVX_alu_VV <"$dst.uw = vrmpy($src1.ub,$src2.ub)">, V6_vrmpyubv_enc;
-defm V6_vrmpybv :
- T_HVX_alu_VV <"$dst.w = vrmpy($src1.b,$src2.b)">, V6_vrmpybv_enc;
-defm V6_vrmpybusv :
- T_HVX_alu_VV <"$dst.w = vrmpy($src1.ub,$src2.b)">, V6_vrmpybusv_enc;
-defm V6_vabsdiffub :
- T_HVX_alu_VV <"$dst.ub = vabsdiff($src1.ub,$src2.ub)">, V6_vabsdiffub_enc;
-defm V6_vabsdiffh :
- T_HVX_alu_VV <"$dst.uh = vabsdiff($src1.h,$src2.h)">, V6_vabsdiffh_enc;
-defm V6_vabsdiffuh :
- T_HVX_alu_VV <"$dst.uh = vabsdiff($src1.uh,$src2.uh)">, V6_vabsdiffuh_enc;
-defm V6_vabsdiffw :
- T_HVX_alu_VV <"$dst.uw = vabsdiff($src1.w,$src2.w)">, V6_vabsdiffw_enc;
-}
-
-let Itinerary = CVI_VX_DV, Type = TypeCVI_VX_DV in {
-defm V6_vdmpyhvsat :
- T_HVX_alu_VV <"$dst.w = vdmpy($src1.h,$src2.h):sat">, V6_vdmpyhvsat_enc;
-defm V6_vmpyhvsrs :
- T_HVX_alu_VV<"$dst.h = vmpy($src1.h,$src2.h):<<1:rnd:sat">, V6_vmpyhvsrs_enc;
-defm V6_vmpyih :
- T_HVX_alu_VV <"$dst.h = vmpyi($src1.h,$src2.h)">, V6_vmpyih_enc;
-}
-
-defm V6_vand :
- T_HVX_alu_VV <"$dst = vand($src1,$src2)">, V6_vand_enc;
-defm V6_vor :
- T_HVX_alu_VV <"$dst = vor($src1,$src2)">, V6_vor_enc;
-defm V6_vxor :
- T_HVX_alu_VV <"$dst = vxor($src1,$src2)">, V6_vxor_enc;
-defm V6_vaddw :
- T_HVX_alu_VV <"$dst.w = vadd($src1.w,$src2.w)">, V6_vaddw_enc;
-defm V6_vaddubsat :
- T_HVX_alu_VV <"$dst.ub = vadd($src1.ub,$src2.ub):sat">, V6_vaddubsat_enc;
-defm V6_vadduhsat :
- T_HVX_alu_VV <"$dst.uh = vadd($src1.uh,$src2.uh):sat">, V6_vadduhsat_enc;
-defm V6_vaddhsat :
- T_HVX_alu_VV <"$dst.h = vadd($src1.h,$src2.h):sat">, V6_vaddhsat_enc;
-defm V6_vaddwsat :
- T_HVX_alu_VV <"$dst.w = vadd($src1.w,$src2.w):sat">, V6_vaddwsat_enc;
-defm V6_vsubb :
- T_HVX_alu_VV <"$dst.b = vsub($src1.b,$src2.b)">, V6_vsubb_enc;
-defm V6_vsubh :
- T_HVX_alu_VV <"$dst.h = vsub($src1.h,$src2.h)">, V6_vsubh_enc;
-defm V6_vsubw :
- T_HVX_alu_VV <"$dst.w = vsub($src1.w,$src2.w)">, V6_vsubw_enc;
-defm V6_vsububsat :
- T_HVX_alu_VV <"$dst.ub = vsub($src1.ub,$src2.ub):sat">, V6_vsububsat_enc;
-defm V6_vsubuhsat :
- T_HVX_alu_VV <"$dst.uh = vsub($src1.uh,$src2.uh):sat">, V6_vsubuhsat_enc;
-defm V6_vsubhsat :
- T_HVX_alu_VV <"$dst.h = vsub($src1.h,$src2.h):sat">, V6_vsubhsat_enc;
-defm V6_vsubwsat :
- T_HVX_alu_VV <"$dst.w = vsub($src1.w,$src2.w):sat">, V6_vsubwsat_enc;
-defm V6_vavgub :
- T_HVX_alu_VV <"$dst.ub = vavg($src1.ub,$src2.ub)">, V6_vavgub_enc;
-defm V6_vavguh :
- T_HVX_alu_VV <"$dst.uh = vavg($src1.uh,$src2.uh)">, V6_vavguh_enc;
-defm V6_vavgh :
- T_HVX_alu_VV <"$dst.h = vavg($src1.h,$src2.h)">, V6_vavgh_enc;
-defm V6_vavgw :
- T_HVX_alu_VV <"$dst.w = vavg($src1.w,$src2.w)">, V6_vavgw_enc;
-defm V6_vnavgub :
- T_HVX_alu_VV <"$dst.b = vnavg($src1.ub,$src2.ub)">, V6_vnavgub_enc;
-defm V6_vnavgh :
- T_HVX_alu_VV <"$dst.h = vnavg($src1.h,$src2.h)">, V6_vnavgh_enc;
-defm V6_vnavgw :
- T_HVX_alu_VV <"$dst.w = vnavg($src1.w,$src2.w)">, V6_vnavgw_enc;
-defm V6_vavgubrnd :
- T_HVX_alu_VV <"$dst.ub = vavg($src1.ub,$src2.ub):rnd">, V6_vavgubrnd_enc;
-defm V6_vavguhrnd :
- T_HVX_alu_VV <"$dst.uh = vavg($src1.uh,$src2.uh):rnd">, V6_vavguhrnd_enc;
-defm V6_vavghrnd :
- T_HVX_alu_VV <"$dst.h = vavg($src1.h,$src2.h):rnd">, V6_vavghrnd_enc;
-defm V6_vavgwrnd :
- T_HVX_alu_VV <"$dst.w = vavg($src1.w,$src2.w):rnd">, V6_vavgwrnd_enc;
-
-defm V6_vmpybv :
- T_HVX_alu_WV <"$dst.h = vmpy($src1.b,$src2.b)">, V6_vmpybv_enc;
-defm V6_vmpyubv :
- T_HVX_alu_WV <"$dst.uh = vmpy($src1.ub,$src2.ub)">, V6_vmpyubv_enc;
-defm V6_vmpybusv :
- T_HVX_alu_WV <"$dst.h = vmpy($src1.ub,$src2.b)">, V6_vmpybusv_enc;
-defm V6_vmpyhv :
- T_HVX_alu_WV <"$dst.w = vmpy($src1.h,$src2.h)">, V6_vmpyhv_enc;
-defm V6_vmpyuhv :
- T_HVX_alu_WV <"$dst.uw = vmpy($src1.uh,$src2.uh)">, V6_vmpyuhv_enc;
-defm V6_vmpyhus :
- T_HVX_alu_WV <"$dst.w = vmpy($src1.h,$src2.uh)">, V6_vmpyhus_enc;
-defm V6_vaddubh :
- T_HVX_alu_WV <"$dst.h = vadd($src1.ub,$src2.ub)">, V6_vaddubh_enc;
-defm V6_vadduhw :
- T_HVX_alu_WV <"$dst.w = vadd($src1.uh,$src2.uh)">, V6_vadduhw_enc;
-defm V6_vaddhw :
- T_HVX_alu_WV <"$dst.w = vadd($src1.h,$src2.h)">, V6_vaddhw_enc;
-defm V6_vsububh :
- T_HVX_alu_WV <"$dst.h = vsub($src1.ub,$src2.ub)">, V6_vsububh_enc;
-defm V6_vsubuhw :
- T_HVX_alu_WV <"$dst.w = vsub($src1.uh,$src2.uh)">, V6_vsubuhw_enc;
-defm V6_vsubhw :
- T_HVX_alu_WV <"$dst.w = vsub($src1.h,$src2.h)">, V6_vsubhw_enc;
-
-defm V6_vaddb_dv :
- T_HVX_alu_WW <"$dst.b = vadd($src1.b,$src2.b)">, V6_vaddb_dv_enc;
-defm V6_vaddh_dv :
- T_HVX_alu_WW <"$dst.h = vadd($src1.h,$src2.h)">, V6_vaddh_dv_enc;
-defm V6_vaddw_dv :
- T_HVX_alu_WW <"$dst.w = vadd($src1.w,$src2.w)">, V6_vaddw_dv_enc;
-defm V6_vaddubsat_dv :
- T_HVX_alu_WW <"$dst.ub = vadd($src1.ub,$src2.ub):sat">, V6_vaddubsat_dv_enc;
-defm V6_vadduhsat_dv :
- T_HVX_alu_WW <"$dst.uh = vadd($src1.uh,$src2.uh):sat">, V6_vadduhsat_dv_enc;
-defm V6_vaddhsat_dv :
- T_HVX_alu_WW <"$dst.h = vadd($src1.h,$src2.h):sat">, V6_vaddhsat_dv_enc;
-defm V6_vaddwsat_dv :
- T_HVX_alu_WW <"$dst.w = vadd($src1.w,$src2.w):sat">, V6_vaddwsat_dv_enc;
-defm V6_vsubb_dv :
- T_HVX_alu_WW <"$dst.b = vsub($src1.b,$src2.b)">, V6_vsubb_dv_enc;
-defm V6_vsubh_dv :
- T_HVX_alu_WW <"$dst.h = vsub($src1.h,$src2.h)">, V6_vsubh_dv_enc;
-defm V6_vsubw_dv :
- T_HVX_alu_WW <"$dst.w = vsub($src1.w,$src2.w)">, V6_vsubw_dv_enc;
-defm V6_vsububsat_dv :
- T_HVX_alu_WW <"$dst.ub = vsub($src1.ub,$src2.ub):sat">, V6_vsububsat_dv_enc;
-defm V6_vsubuhsat_dv :
- T_HVX_alu_WW <"$dst.uh = vsub($src1.uh,$src2.uh):sat">, V6_vsubuhsat_dv_enc;
-defm V6_vsubhsat_dv :
- T_HVX_alu_WW <"$dst.h = vsub($src1.h,$src2.h):sat">, V6_vsubhsat_dv_enc;
-defm V6_vsubwsat_dv :
- T_HVX_alu_WW <"$dst.w = vsub($src1.w,$src2.w):sat">, V6_vsubwsat_dv_enc;
-
-let Itinerary = CVI_VX_DV_LONG, Type = TypeCVI_VX_DV in {
-defm V6_vmpabusv :
- T_HVX_alu_WW <"$dst.h = vmpa($src1.ub,$src2.b)">, V6_vmpabusv_enc;
-defm V6_vmpabuuv :
- T_HVX_alu_WW <"$dst.h = vmpa($src1.ub,$src2.ub)">, V6_vmpabuuv_enc;
-}
-
-let isAccumulator = 1, hasNewValue = 1 in
-class T_HVX_vmpyacc <string asmString, InstrItinClass itin, RegisterClass RCout,
- RegisterClass RCin1, RegisterClass RCin2>
- : CVI_VA_Resource1 <(outs RCout:$dst),
- (ins RCout:$_src_, RCin1:$src1, RCin2:$src2), asmString,
- [], "$dst = $_src_" > {
- let Itinerary = itin;
- let Type = !cast<IType>("Type"#itin);
-}
-
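-// When building the 128B variants below, only vector register classes get
-// the "128B" suffix; a scalar IntRegs operand has no 128B counterpart and
-// is left unchanged.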
-multiclass T_HVX_vmpyacc_both <string asmString, RegisterClass RCout,
- RegisterClass RCin1, RegisterClass RCin2, InstrItinClass itin > {
- def NAME : T_HVX_vmpyacc <asmString, itin, RCout, RCin1, RCin2>;
- let isCodeGenOnly = 1 in
- def NAME#_128B : T_HVX_vmpyacc <asmString, itin,
- !cast<RegisterClass>(RCout#"128B"),
- !cast<RegisterClass>(RCin1#"128B"),
- !cast<RegisterClass>(RCin2#
- !if(!eq (!cast<string>(RCin2), "IntRegs"), "", "128B"))>;
-}
-
-multiclass T_HVX_vmpyacc_VVR <string asmString>:
- T_HVX_vmpyacc_both <asmString, VectorRegs, VectorRegs, IntRegs, CVI_VX>;
-
-multiclass T_HVX_vmpyacc_VWR <string asmString>:
- T_HVX_vmpyacc_both <asmString, VectorRegs, VecDblRegs, IntRegs, CVI_VX_DV>;
-
-multiclass T_HVX_vmpyacc_WVR <string asmString>:
- T_HVX_vmpyacc_both <asmString, VecDblRegs, VectorRegs, IntRegs, CVI_VX_DV>;
-
-multiclass T_HVX_vmpyacc_WWR <string asmString>:
- T_HVX_vmpyacc_both <asmString, VecDblRegs, VecDblRegs, IntRegs, CVI_VX_DV>;
-
-multiclass T_HVX_vmpyacc_VVV <string asmString>:
- T_HVX_vmpyacc_both <asmString, VectorRegs, VectorRegs, VectorRegs, CVI_VX_DV>;
-
-multiclass T_HVX_vmpyacc_WVV <string asmString>:
- T_HVX_vmpyacc_both <asmString, VecDblRegs, VectorRegs, VectorRegs, CVI_VX_DV>;
-
-
-defm V6_vtmpyb_acc :
- T_HVX_vmpyacc_WWR <"$dst.h += vtmpy($src1.b,$src2.b)">,
- V6_vtmpyb_acc_enc;
-defm V6_vtmpybus_acc :
- T_HVX_vmpyacc_WWR <"$dst.h += vtmpy($src1.ub,$src2.b)">,
- V6_vtmpybus_acc_enc;
-defm V6_vtmpyhb_acc :
- T_HVX_vmpyacc_WWR <"$dst.w += vtmpy($src1.h,$src2.b)">,
- V6_vtmpyhb_acc_enc;
-defm V6_vdmpyhb_acc :
- T_HVX_vmpyacc_VVR <"$dst.w += vdmpy($src1.h,$src2.b)">,
- V6_vdmpyhb_acc_enc;
-defm V6_vrmpyub_acc :
- T_HVX_vmpyacc_VVR <"$dst.uw += vrmpy($src1.ub,$src2.ub)">,
- V6_vrmpyub_acc_enc;
-defm V6_vrmpybus_acc :
- T_HVX_vmpyacc_VVR <"$dst.w += vrmpy($src1.ub,$src2.b)">,
- V6_vrmpybus_acc_enc;
-defm V6_vdmpybus_acc :
- T_HVX_vmpyacc_VVR <"$dst.h += vdmpy($src1.ub,$src2.b)">,
- V6_vdmpybus_acc_enc;
-defm V6_vdmpybus_dv_acc :
- T_HVX_vmpyacc_WWR <"$dst.h += vdmpy($src1.ub,$src2.b)">,
- V6_vdmpybus_dv_acc_enc;
-defm V6_vdmpyhsuisat_acc :
- T_HVX_vmpyacc_VWR <"$dst.w += vdmpy($src1.h,$src2.uh,#1):sat">,
- V6_vdmpyhsuisat_acc_enc;
-defm V6_vdmpyhisat_acc :
- T_HVX_vmpyacc_VWR <"$dst.w += vdmpy($src1.h,$src2.h):sat">,
- V6_vdmpyhisat_acc_enc;
-defm V6_vdmpyhb_dv_acc :
- T_HVX_vmpyacc_WWR <"$dst.w += vdmpy($src1.h,$src2.b)">,
- V6_vdmpyhb_dv_acc_enc;
-defm V6_vmpybus_acc :
- T_HVX_vmpyacc_WVR <"$dst.h += vmpy($src1.ub,$src2.b)">,
- V6_vmpybus_acc_enc;
-defm V6_vmpabus_acc :
- T_HVX_vmpyacc_WWR <"$dst.h += vmpa($src1.ub,$src2.b)">,
- V6_vmpabus_acc_enc;
-defm V6_vmpahb_acc :
- T_HVX_vmpyacc_WWR <"$dst.w += vmpa($src1.h,$src2.b)">,
- V6_vmpahb_acc_enc;
-defm V6_vmpyhsat_acc :
- T_HVX_vmpyacc_WVR <"$dst.w += vmpy($src1.h,$src2.h):sat">,
- V6_vmpyhsat_acc_enc;
-defm V6_vmpyuh_acc :
- T_HVX_vmpyacc_WVR <"$dst.uw += vmpy($src1.uh,$src2.uh)">,
- V6_vmpyuh_acc_enc;
-defm V6_vmpyiwb_acc :
- T_HVX_vmpyacc_VVR <"$dst.w += vmpyi($src1.w,$src2.b)">,
- V6_vmpyiwb_acc_enc;
-defm V6_vdsaduh_acc :
- T_HVX_vmpyacc_WWR <"$dst.uw += vdsad($src1.uh,$src2.uh)">,
- V6_vdsaduh_acc_enc;
-defm V6_vmpyihb_acc :
- T_HVX_vmpyacc_VVR <"$dst.h += vmpyi($src1.h,$src2.b)">,
- V6_vmpyihb_acc_enc;
-defm V6_vmpyub_acc :
- T_HVX_vmpyacc_WVR <"$dst.uh += vmpy($src1.ub,$src2.ub)">,
- V6_vmpyub_acc_enc;
-
-let Itinerary = CVI_VX_DV, Type = TypeCVI_VX_DV in {
-defm V6_vdmpyhsusat_acc :
- T_HVX_vmpyacc_VVR <"$dst.w += vdmpy($src1.h,$src2.uh):sat">,
- V6_vdmpyhsusat_acc_enc;
-defm V6_vdmpyhsat_acc :
- T_HVX_vmpyacc_VVR <"$dst.w += vdmpy($src1.h,$src2.h):sat">,
- V6_vdmpyhsat_acc_enc;
-defm V6_vmpyiwh_acc : T_HVX_vmpyacc_VVR
- <"$dst.w += vmpyi($src1.w,$src2.h)">, V6_vmpyiwh_acc_enc;
-}
-
-let Itinerary = CVI_VS, Type = TypeCVI_VS in {
-defm V6_vaslw_acc :
- T_HVX_vmpyacc_VVR <"$dst.w += vasl($src1.w,$src2)">, V6_vaslw_acc_enc;
-defm V6_vasrw_acc :
- T_HVX_vmpyacc_VVR <"$dst.w += vasr($src1.w,$src2)">, V6_vasrw_acc_enc;
-}
-
-defm V6_vdmpyhvsat_acc :
- T_HVX_vmpyacc_VVV <"$dst.w += vdmpy($src1.h,$src2.h):sat">,
- V6_vdmpyhvsat_acc_enc;
-defm V6_vmpybusv_acc :
- T_HVX_vmpyacc_WVV <"$dst.h += vmpy($src1.ub,$src2.b)">,
- V6_vmpybusv_acc_enc;
-defm V6_vmpybv_acc :
- T_HVX_vmpyacc_WVV <"$dst.h += vmpy($src1.b,$src2.b)">, V6_vmpybv_acc_enc;
-defm V6_vmpyhus_acc :
- T_HVX_vmpyacc_WVV <"$dst.w += vmpy($src1.h,$src2.uh)">, V6_vmpyhus_acc_enc;
-defm V6_vmpyhv_acc :
- T_HVX_vmpyacc_WVV <"$dst.w += vmpy($src1.h,$src2.h)">, V6_vmpyhv_acc_enc;
-defm V6_vmpyiewh_acc :
- T_HVX_vmpyacc_VVV <"$dst.w += vmpyie($src1.w,$src2.h)">,
- V6_vmpyiewh_acc_enc;
-defm V6_vmpyiewuh_acc :
- T_HVX_vmpyacc_VVV <"$dst.w += vmpyie($src1.w,$src2.uh)">,
- V6_vmpyiewuh_acc_enc;
-defm V6_vmpyih_acc :
- T_HVX_vmpyacc_VVV <"$dst.h += vmpyi($src1.h,$src2.h)">, V6_vmpyih_acc_enc;
-defm V6_vmpyowh_rnd_sacc :
- T_HVX_vmpyacc_VVV <"$dst.w += vmpyo($src1.w,$src2.h):<<1:rnd:sat:shift">,
- V6_vmpyowh_rnd_sacc_enc;
-defm V6_vmpyowh_sacc :
- T_HVX_vmpyacc_VVV <"$dst.w += vmpyo($src1.w,$src2.h):<<1:sat:shift">,
- V6_vmpyowh_sacc_enc;
-defm V6_vmpyubv_acc :
- T_HVX_vmpyacc_WVV <"$dst.uh += vmpy($src1.ub,$src2.ub)">,
- V6_vmpyubv_acc_enc;
-defm V6_vmpyuhv_acc :
- T_HVX_vmpyacc_WVV <"$dst.uw += vmpy($src1.uh,$src2.uh)">,
- V6_vmpyuhv_acc_enc;
-defm V6_vrmpybusv_acc :
- T_HVX_vmpyacc_VVV <"$dst.w += vrmpy($src1.ub,$src2.b)">,
- V6_vrmpybusv_acc_enc;
-defm V6_vrmpybv_acc :
- T_HVX_vmpyacc_VVV <"$dst.w += vrmpy($src1.b,$src2.b)">, V6_vrmpybv_acc_enc;
-defm V6_vrmpyubv_acc :
- T_HVX_vmpyacc_VVV <"$dst.uw += vrmpy($src1.ub,$src2.ub)">,
- V6_vrmpyubv_acc_enc;
-
-
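-// Predicate compare-with-accumulate: the tied $_src_ operand carries the
-// previous predicate value, into which the compare result is and-ed,
-// or-ed, or xor-ed.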
-class T_HVX_vcmp <string asmString, RegisterClass RCout, RegisterClass RCin>
- : CVI_VA_Resource1 <(outs RCout:$dst),
- (ins RCout:$_src_, RCin:$src1, RCin:$src2), asmString,
- [], "$dst = $_src_" > {
- let Itinerary = CVI_VA;
- let Type = TypeCVI_VA;
-}
-
-multiclass T_HVX_vcmp <string asmString> {
- def NAME : T_HVX_vcmp <asmString, VecPredRegs, VectorRegs>;
- let isCodeGenOnly = 1 in
- def NAME#_128B : T_HVX_vcmp <asmString, VecPredRegs128B, VectorRegs128B>;
-}
-
-defm V6_veqb_and :
- T_HVX_vcmp <"$dst &= vcmp.eq($src1.b,$src2.b)">, V6_veqb_and_enc;
-defm V6_veqh_and :
- T_HVX_vcmp <"$dst &= vcmp.eq($src1.h,$src2.h)">, V6_veqh_and_enc;
-defm V6_veqw_and :
- T_HVX_vcmp <"$dst &= vcmp.eq($src1.w,$src2.w)">, V6_veqw_and_enc;
-defm V6_vgtb_and :
- T_HVX_vcmp <"$dst &= vcmp.gt($src1.b,$src2.b)">, V6_vgtb_and_enc;
-defm V6_vgth_and :
- T_HVX_vcmp <"$dst &= vcmp.gt($src1.h,$src2.h)">, V6_vgth_and_enc;
-defm V6_vgtw_and :
- T_HVX_vcmp <"$dst &= vcmp.gt($src1.w,$src2.w)">, V6_vgtw_and_enc;
-defm V6_vgtub_and :
- T_HVX_vcmp <"$dst &= vcmp.gt($src1.ub,$src2.ub)">, V6_vgtub_and_enc;
-defm V6_vgtuh_and :
- T_HVX_vcmp <"$dst &= vcmp.gt($src1.uh,$src2.uh)">, V6_vgtuh_and_enc;
-defm V6_vgtuw_and :
- T_HVX_vcmp <"$dst &= vcmp.gt($src1.uw,$src2.uw)">, V6_vgtuw_and_enc;
-defm V6_veqb_or :
- T_HVX_vcmp <"$dst |= vcmp.eq($src1.b,$src2.b)">, V6_veqb_or_enc;
-defm V6_veqh_or :
- T_HVX_vcmp <"$dst |= vcmp.eq($src1.h,$src2.h)">, V6_veqh_or_enc;
-defm V6_veqw_or :
- T_HVX_vcmp <"$dst |= vcmp.eq($src1.w,$src2.w)">, V6_veqw_or_enc;
-defm V6_vgtb_or :
- T_HVX_vcmp <"$dst |= vcmp.gt($src1.b,$src2.b)">, V6_vgtb_or_enc;
-defm V6_vgth_or :
- T_HVX_vcmp <"$dst |= vcmp.gt($src1.h,$src2.h)">, V6_vgth_or_enc;
-defm V6_vgtw_or :
- T_HVX_vcmp <"$dst |= vcmp.gt($src1.w,$src2.w)">, V6_vgtw_or_enc;
-defm V6_vgtub_or :
- T_HVX_vcmp <"$dst |= vcmp.gt($src1.ub,$src2.ub)">, V6_vgtub_or_enc;
-defm V6_vgtuh_or :
- T_HVX_vcmp <"$dst |= vcmp.gt($src1.uh,$src2.uh)">, V6_vgtuh_or_enc;
-defm V6_vgtuw_or :
- T_HVX_vcmp <"$dst |= vcmp.gt($src1.uw,$src2.uw)">, V6_vgtuw_or_enc;
-defm V6_veqb_xor :
- T_HVX_vcmp <"$dst ^= vcmp.eq($src1.b,$src2.b)">, V6_veqb_xor_enc;
-defm V6_veqh_xor :
- T_HVX_vcmp <"$dst ^= vcmp.eq($src1.h,$src2.h)">, V6_veqh_xor_enc;
-defm V6_veqw_xor :
- T_HVX_vcmp <"$dst ^= vcmp.eq($src1.w,$src2.w)">, V6_veqw_xor_enc;
-defm V6_vgtb_xor :
- T_HVX_vcmp <"$dst ^= vcmp.gt($src1.b,$src2.b)">, V6_vgtb_xor_enc;
-defm V6_vgth_xor :
- T_HVX_vcmp <"$dst ^= vcmp.gt($src1.h,$src2.h)">, V6_vgth_xor_enc;
-defm V6_vgtw_xor :
- T_HVX_vcmp <"$dst ^= vcmp.gt($src1.w,$src2.w)">, V6_vgtw_xor_enc;
-defm V6_vgtub_xor :
- T_HVX_vcmp <"$dst ^= vcmp.gt($src1.ub,$src2.ub)">, V6_vgtub_xor_enc;
-defm V6_vgtuh_xor :
- T_HVX_vcmp <"$dst ^= vcmp.gt($src1.uh,$src2.uh)">, V6_vgtuh_xor_enc;
-defm V6_vgtuw_xor :
- T_HVX_vcmp <"$dst ^= vcmp.gt($src1.uw,$src2.uw)">, V6_vgtuw_xor_enc;
-
-defm V6_vminub :
- T_HVX_alu_VV <"$dst.ub = vmin($src1.ub,$src2.ub)">, V6_vminub_enc;
-defm V6_vminuh :
- T_HVX_alu_VV <"$dst.uh = vmin($src1.uh,$src2.uh)">, V6_vminuh_enc;
-defm V6_vminh :
- T_HVX_alu_VV <"$dst.h = vmin($src1.h,$src2.h)">, V6_vminh_enc;
-defm V6_vminw :
- T_HVX_alu_VV <"$dst.w = vmin($src1.w,$src2.w)">, V6_vminw_enc;
-defm V6_vmaxub :
- T_HVX_alu_VV <"$dst.ub = vmax($src1.ub,$src2.ub)">, V6_vmaxub_enc;
-defm V6_vmaxuh :
- T_HVX_alu_VV <"$dst.uh = vmax($src1.uh,$src2.uh)">, V6_vmaxuh_enc;
-defm V6_vmaxh :
- T_HVX_alu_VV <"$dst.h = vmax($src1.h,$src2.h)">, V6_vmaxh_enc;
-defm V6_vmaxw :
- T_HVX_alu_VV <"$dst.w = vmax($src1.w,$src2.w)">, V6_vmaxw_enc;
-defm V6_vshuffeb :
- T_HVX_alu_VV <"$dst.b = vshuffe($src1.b,$src2.b)">, V6_vshuffeb_enc;
-defm V6_vshuffob :
- T_HVX_alu_VV <"$dst.b = vshuffo($src1.b,$src2.b)">, V6_vshuffob_enc;
-defm V6_vshufeh :
- T_HVX_alu_VV <"$dst.h = vshuffe($src1.h,$src2.h)">, V6_vshufeh_enc;
-defm V6_vshufoh :
- T_HVX_alu_VV <"$dst.h = vshuffo($src1.h,$src2.h)">, V6_vshufoh_enc;
-
-let Itinerary = CVI_VX_DV, Type = TypeCVI_VX_DV in {
-defm V6_vmpyowh_rnd :
- T_HVX_alu_VV <"$dst.w = vmpyo($src1.w,$src2.h):<<1:rnd:sat">,
- V6_vmpyowh_rnd_enc;
-defm V6_vmpyiewuh :
- T_HVX_alu_VV <"$dst.w = vmpyie($src1.w,$src2.uh)">, V6_vmpyiewuh_enc;
-defm V6_vmpyewuh :
- T_HVX_alu_VV <"$dst.w = vmpye($src1.w,$src2.uh)">, V6_vmpyewuh_enc;
-defm V6_vmpyowh :
- T_HVX_alu_VV <"$dst.w = vmpyo($src1.w,$src2.h):<<1:sat">, V6_vmpyowh_enc;
-defm V6_vmpyiowh :
- T_HVX_alu_VV <"$dst.w = vmpyio($src1.w,$src2.h)">, V6_vmpyiowh_enc;
-}
-let Itinerary = CVI_VX, Type = TypeCVI_VX in
-defm V6_vmpyieoh :
- T_HVX_alu_VV <"$dst.w = vmpyieo($src1.h,$src2.h)">, V6_vmpyieoh_enc;
-
-let Itinerary = CVI_VA_DV, Type = TypeCVI_VA_DV in {
-defm V6_vshufoeh :
- T_HVX_alu_WV <"$dst.h = vshuffoe($src1.h,$src2.h)">, V6_vshufoeh_enc;
-defm V6_vshufoeb :
- T_HVX_alu_WV <"$dst.b = vshuffoe($src1.b,$src2.b)">, V6_vshufoeb_enc;
-}
-
-let isRegSequence = 1, Itinerary = CVI_VA_DV, Type = TypeCVI_VA_DV in
-defm V6_vcombine :
- T_HVX_alu_WV <"$dst = vcombine($src1,$src2)">, V6_vcombine_enc;
-
-let Itinerary = CVI_VINLANESAT, Type = TypeCVI_VINLANESAT in {
-defm V6_vsathub :
- T_HVX_alu_VV <"$dst.ub = vsat($src1.h,$src2.h)">, V6_vsathub_enc;
-defm V6_vsatwh :
- T_HVX_alu_VV <"$dst.h = vsat($src1.w,$src2.w)">, V6_vsatwh_enc;
-}
-
-let Itinerary = CVI_VS, Type = TypeCVI_VS in {
-defm V6_vroundwh :
- T_HVX_alu_VV <"$dst.h = vround($src1.w,$src2.w):sat">, V6_vroundwh_enc;
-defm V6_vroundwuh :
- T_HVX_alu_VV <"$dst.uh = vround($src1.w,$src2.w):sat">, V6_vroundwuh_enc;
-defm V6_vroundhb :
- T_HVX_alu_VV <"$dst.b = vround($src1.h,$src2.h):sat">, V6_vroundhb_enc;
-defm V6_vroundhub :
- T_HVX_alu_VV <"$dst.ub = vround($src1.h,$src2.h):sat">, V6_vroundhub_enc;
-defm V6_vasrwv :
- T_HVX_alu_VV <"$dst.w = vasr($src1.w,$src2.w)">, V6_vasrwv_enc;
-defm V6_vlsrwv :
- T_HVX_alu_VV <"$dst.w = vlsr($src1.w,$src2.w)">, V6_vlsrwv_enc;
-defm V6_vlsrhv :
- T_HVX_alu_VV <"$dst.h = vlsr($src1.h,$src2.h)">, V6_vlsrhv_enc;
-defm V6_vasrhv :
- T_HVX_alu_VV <"$dst.h = vasr($src1.h,$src2.h)">, V6_vasrhv_enc;
-defm V6_vaslwv :
- T_HVX_alu_VV <"$dst.w = vasl($src1.w,$src2.w)">, V6_vaslwv_enc;
-defm V6_vaslhv :
- T_HVX_alu_VV <"$dst.h = vasl($src1.h,$src2.h)">, V6_vaslhv_enc;
-}
-
-defm V6_vaddb :
- T_HVX_alu_VV <"$dst.b = vadd($src1.b,$src2.b)">, V6_vaddb_enc;
-defm V6_vaddh :
- T_HVX_alu_VV <"$dst.h = vadd($src1.h,$src2.h)">, V6_vaddh_enc;
-
-let Itinerary = CVI_VP, Type = TypeCVI_VP in {
-defm V6_vdelta :
- T_HVX_alu_VV <"$dst = vdelta($src1,$src2)">, V6_vdelta_enc;
-defm V6_vrdelta :
- T_HVX_alu_VV <"$dst = vrdelta($src1,$src2)">, V6_vrdelta_enc;
-defm V6_vdealb4w :
- T_HVX_alu_VV <"$dst.b = vdeale($src1.b,$src2.b)">, V6_vdealb4w_enc;
-defm V6_vpackeb :
- T_HVX_alu_VV <"$dst.b = vpacke($src1.h,$src2.h)">, V6_vpackeb_enc;
-defm V6_vpackeh :
- T_HVX_alu_VV <"$dst.h = vpacke($src1.w,$src2.w)">, V6_vpackeh_enc;
-defm V6_vpackhub_sat :
- T_HVX_alu_VV <"$dst.ub = vpack($src1.h,$src2.h):sat">, V6_vpackhub_sat_enc;
-defm V6_vpackhb_sat :
- T_HVX_alu_VV <"$dst.b = vpack($src1.h,$src2.h):sat">, V6_vpackhb_sat_enc;
-defm V6_vpackwuh_sat :
- T_HVX_alu_VV <"$dst.uh = vpack($src1.w,$src2.w):sat">, V6_vpackwuh_sat_enc;
-defm V6_vpackwh_sat :
- T_HVX_alu_VV <"$dst.h = vpack($src1.w,$src2.w):sat">, V6_vpackwh_sat_enc;
-defm V6_vpackob :
- T_HVX_alu_VV <"$dst.b = vpacko($src1.h,$src2.h)">, V6_vpackob_enc;
-defm V6_vpackoh :
- T_HVX_alu_VV <"$dst.h = vpacko($src1.w,$src2.w)">, V6_vpackoh_enc;
-}
-
-let hasNewValue = 1, hasSideEffects = 0 in
-class T_HVX_condALU <string asmString, RegisterClass RC1, RegisterClass RC2>
- : CVI_VA_Resource1 <(outs RC2:$dst),
- (ins RC1:$src1, RC2:$_src_, RC2:$src2), asmString,
- [], "$dst = $_src_" > {
- let Itinerary = CVI_VA;
- let Type = TypeCVI_VA;
-}
-
-multiclass T_HVX_condALU <string asmString> {
- def NAME : T_HVX_condALU <asmString, VecPredRegs, VectorRegs>;
- let isCodeGenOnly = 1 in
- def NAME#_128B : T_HVX_condALU <asmString, VecPredRegs128B, VectorRegs128B>;
-}
-
-defm V6_vaddbq : T_HVX_condALU <"if ($src1) $dst.b += $src2.b">,
- V6_vaddbq_enc;
-defm V6_vaddhq : T_HVX_condALU <"if ($src1) $dst.h += $src2.h">,
- V6_vaddhq_enc;
-defm V6_vaddwq : T_HVX_condALU <"if ($src1) $dst.w += $src2.w">,
- V6_vaddwq_enc;
-defm V6_vsubbq : T_HVX_condALU <"if ($src1) $dst.b -= $src2.b">,
- V6_vsubbq_enc;
-defm V6_vsubhq : T_HVX_condALU <"if ($src1) $dst.h -= $src2.h">,
- V6_vsubhq_enc;
-defm V6_vsubwq : T_HVX_condALU <"if ($src1) $dst.w -= $src2.w">,
- V6_vsubwq_enc;
-defm V6_vaddbnq : T_HVX_condALU <"if (!$src1) $dst.b += $src2.b">,
- V6_vaddbnq_enc;
-defm V6_vaddhnq : T_HVX_condALU <"if (!$src1) $dst.h += $src2.h">,
- V6_vaddhnq_enc;
-defm V6_vaddwnq : T_HVX_condALU <"if (!$src1) $dst.w += $src2.w">,
- V6_vaddwnq_enc;
-defm V6_vsubbnq : T_HVX_condALU <"if (!$src1) $dst.b -= $src2.b">,
- V6_vsubbnq_enc;
-defm V6_vsubhnq : T_HVX_condALU <"if (!$src1) $dst.h -= $src2.h">,
- V6_vsubhnq_enc;
-defm V6_vsubwnq : T_HVX_condALU <"if (!$src1) $dst.w -= $src2.w">,
- V6_vsubwnq_enc;
-
-let hasNewValue = 1 in
-class T_HVX_alu_2op <string asmString, InstrItinClass itin,
- RegisterClass RCout, RegisterClass RCin>
- : CVI_VA_Resource1 <(outs RCout:$dst), (ins RCin:$src1),
- asmString >{
- let Itinerary = itin;
- let Type = !cast<IType>("Type"#itin);
-}
-
-multiclass T_HVX_alu_2op <string asmString, RegisterClass RCout,
- RegisterClass RCin, InstrItinClass itin> {
- def NAME : T_HVX_alu_2op <asmString, itin, RCout, RCin>;
- let isCodeGenOnly = 1 in
- def NAME#_128B : T_HVX_alu_2op <asmString, itin,
- !cast<RegisterClass>(RCout#"128B"),
- !cast<RegisterClass>(RCin#"128B")>;
-}
-
-let hasNewValue = 1 in
-multiclass T_HVX_alu_2op_VV <string asmString>:
- T_HVX_alu_2op <asmString, VectorRegs, VectorRegs, CVI_VA>;
-
-multiclass T_HVX_alu_2op_WV <string asmString>:
- T_HVX_alu_2op <asmString, VecDblRegs, VectorRegs, CVI_VA_DV>;
-
-
-defm V6_vabsh : T_HVX_alu_2op_VV <"$dst.h = vabs($src1.h)">,
- V6_vabsh_enc;
-defm V6_vabsw : T_HVX_alu_2op_VV <"$dst.w = vabs($src1.w)">,
- V6_vabsw_enc;
-defm V6_vabsh_sat : T_HVX_alu_2op_VV <"$dst.h = vabs($src1.h):sat">,
- V6_vabsh_sat_enc;
-defm V6_vabsw_sat : T_HVX_alu_2op_VV <"$dst.w = vabs($src1.w):sat">,
- V6_vabsw_sat_enc;
-defm V6_vnot : T_HVX_alu_2op_VV <"$dst = vnot($src1)">,
- V6_vnot_enc;
-defm V6_vassign : T_HVX_alu_2op_VV <"$dst = $src1">,
- V6_vassign_enc;
-
-defm V6_vzb : T_HVX_alu_2op_WV <"$dst.uh = vzxt($src1.ub)">,
- V6_vzb_enc;
-defm V6_vzh : T_HVX_alu_2op_WV <"$dst.uw = vzxt($src1.uh)">,
- V6_vzh_enc;
-defm V6_vsb : T_HVX_alu_2op_WV <"$dst.h = vsxt($src1.b)">,
- V6_vsb_enc;
-defm V6_vsh : T_HVX_alu_2op_WV <"$dst.w = vsxt($src1.h)">,
- V6_vsh_enc;
-
-let Itinerary = CVI_VP, Type = TypeCVI_VP in {
-defm V6_vdealh : T_HVX_alu_2op_VV <"$dst.h = vdeal($src1.h)">,
- V6_vdealh_enc;
-defm V6_vdealb : T_HVX_alu_2op_VV <"$dst.b = vdeal($src1.b)">,
- V6_vdealb_enc;
-defm V6_vshuffh : T_HVX_alu_2op_VV <"$dst.h = vshuff($src1.h)">,
- V6_vshuffh_enc;
-defm V6_vshuffb : T_HVX_alu_2op_VV <"$dst.b = vshuff($src1.b)">,
- V6_vshuffb_enc;
-}
-
-let Itinerary = CVI_VP_VS, Type = TypeCVI_VP_VS in {
-defm V6_vunpackub : T_HVX_alu_2op_WV <"$dst.uh = vunpack($src1.ub)">,
- V6_vunpackub_enc;
-defm V6_vunpackuh : T_HVX_alu_2op_WV <"$dst.uw = vunpack($src1.uh)">,
- V6_vunpackuh_enc;
-defm V6_vunpackb : T_HVX_alu_2op_WV <"$dst.h = vunpack($src1.b)">,
- V6_vunpackb_enc;
-defm V6_vunpackh : T_HVX_alu_2op_WV <"$dst.w = vunpack($src1.h)">,
- V6_vunpackh_enc;
-}
-
-let Itinerary = CVI_VS, Type = TypeCVI_VS in {
-defm V6_vcl0w : T_HVX_alu_2op_VV <"$dst.uw = vcl0($src1.uw)">,
- V6_vcl0w_enc;
-defm V6_vcl0h : T_HVX_alu_2op_VV <"$dst.uh = vcl0($src1.uh)">,
- V6_vcl0h_enc;
-defm V6_vnormamtw : T_HVX_alu_2op_VV <"$dst.w = vnormamt($src1.w)">,
- V6_vnormamtw_enc;
-defm V6_vnormamth : T_HVX_alu_2op_VV <"$dst.h = vnormamt($src1.h)">,
- V6_vnormamth_enc;
-defm V6_vpopcounth : T_HVX_alu_2op_VV <"$dst.h = vpopcount($src1.h)">,
- V6_vpopcounth_enc;
-}
-
-let isAccumulator = 1, hasNewValue = 1, Itinerary = CVI_VX_DV_LONG,
- Type = TypeCVI_VX_DV in
-class T_HVX_vmpyacc2 <string asmString, RegisterClass RC>
- : CVI_VA_Resource1 <(outs RC:$dst),
- (ins RC:$_src_, RC:$src1, IntRegs:$src2, u1_0Imm:$src3),
- asmString, [], "$dst = $_src_" > ;
-
-
-multiclass T_HVX_vmpyacc2 <string asmString> {
- def NAME : T_HVX_vmpyacc2 <asmString, VecDblRegs>;
-
- let isCodeGenOnly = 1 in
- def NAME#_128B : T_HVX_vmpyacc2 <asmString, VecDblRegs128B>;
-}
-
-defm V6_vrmpybusi_acc :
- T_HVX_vmpyacc2<"$dst.w += vrmpy($src1.ub,$src2.b,#$src3)">,
- V6_vrmpybusi_acc_enc;
-defm V6_vrsadubi_acc :
- T_HVX_vmpyacc2<"$dst.uw += vrsad($src1.ub,$src2.ub,#$src3)">,
- V6_vrsadubi_acc_enc;
-defm V6_vrmpyubi_acc :
- T_HVX_vmpyacc2<"$dst.uw += vrmpy($src1.ub,$src2.ub,#$src3)">,
- V6_vrmpyubi_acc_enc;
-
-
-let Itinerary = CVI_VX_DV_LONG, Type = TypeCVI_VX_DV, hasNewValue = 1 in
-class T_HVX_vmpy2 <string asmString, RegisterClass RC>
- : CVI_VA_Resource1<(outs RC:$dst), (ins RC:$src1, IntRegs:$src2, u1_0Imm:$src3),
- asmString>;
-
-
-multiclass T_HVX_vmpy2 <string asmString> {
- def NAME : T_HVX_vmpy2 <asmString, VecDblRegs>;
-
- let isCodeGenOnly = 1 in
- def NAME#_128B : T_HVX_vmpy2 <asmString, VecDblRegs128B>;
-}
-
-defm V6_vrmpybusi :
- T_HVX_vmpy2 <"$dst.w = vrmpy($src1.ub,$src2.b,#$src3)">, V6_vrmpybusi_enc;
-defm V6_vrsadubi :
- T_HVX_vmpy2 <"$dst.uw = vrsad($src1.ub,$src2.ub,#$src3)">, V6_vrsadubi_enc;
-defm V6_vrmpyubi :
- T_HVX_vmpy2 <"$dst.uw = vrmpy($src1.ub,$src2.ub,#$src3)">, V6_vrmpyubi_enc;
-
-
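-// vshuff/vdeal permute a pair of vector registers in place, so both
-// outputs are tied to the corresponding vector inputs.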
-let Itinerary = CVI_VP_VS_LONG_EARLY, Type = TypeCVI_VP_VS,
- hasSideEffects = 0, hasNewValue2 = 1, opNewValue2 = 1 in
-class T_HVX_perm <string asmString, RegisterClass RC>
- : CVI_VA_Resource1 <(outs RC:$_dst1_, RC:$_dst2_),
- (ins RC:$src1, RC:$src2, IntRegs:$src3),
- asmString, [], "$_dst1_ = $src1, $_dst2_ = $src2" >;
-
-multiclass T_HVX_perm <string asmString> {
- def NAME : T_HVX_perm <asmString, VectorRegs>;
-
- let isCodeGenOnly = 1 in
- def NAME#_128B : T_HVX_perm <asmString, VectorRegs128B>;
-}
-
-let hasNewValue = 1, opNewValue = 0, hasNewValue2 = 1, opNewValue2 = 1 in {
- defm V6_vshuff : T_HVX_perm <"vshuff($src1,$src2,$src3)">, V6_vshuff_enc;
- defm V6_vdeal : T_HVX_perm <"vdeal($src1,$src2,$src3)">, V6_vdeal_enc;
-}
-
-// Conditional vector move.
-let isPredicated = 1, hasSideEffects = 0, hasNewValue = 1, opNewValue = 0 in
-class T_HVX_cmov <bit isPredNot, RegisterClass RC>
- : CVI_VA_Resource1 <(outs RC:$dst), (ins PredRegs:$src1, RC:$src2),
- "if ("#!if(isPredNot, "!", "")#"$src1) $dst = $src2"> {
- let isPredicatedFalse = isPredNot;
-}
-
-multiclass T_HVX_cmov <bit isPredNot = 0> {
- def NAME : T_HVX_cmov <isPredNot, VectorRegs>;
-
- let isCodeGenOnly = 1 in
- def NAME#_128B : T_HVX_cmov <isPredNot, VectorRegs128B>;
-}
-
-defm V6_vcmov : T_HVX_cmov, V6_vcmov_enc;
-defm V6_vncmov : T_HVX_cmov<1>, V6_vncmov_enc;
-
-// Conditional vector combine.
-let Itinerary = CVI_VA_DV, Type = TypeCVI_VA_DV, isPredicated = 1,
- hasSideEffects = 0, hasNewValue = 1, opNewValue = 0 in
-class T_HVX_ccombine <bit isPredNot, RegisterClass RCout, RegisterClass RCin>
- : CVI_VA_Resource1 < (outs RCout:$dst),
- (ins PredRegs:$src1, RCin:$src2, RCin:$src3),
- "if ("#!if(isPredNot, "!", "")#"$src1) $dst = vcombine($src2,$src3)"> {
- let isPredicatedFalse = isPredNot;
-}
-
-multiclass T_HVX_ccombine <bit isPredNot = 0> {
- def NAME : T_HVX_ccombine <isPredNot, VecDblRegs, VectorRegs>;
-
- let isCodeGenOnly = 1 in
- def NAME#_128B : T_HVX_ccombine <isPredNot, VecDblRegs128B, VectorRegs128B>;
-}
-
-defm V6_vccombine : T_HVX_ccombine, V6_vccombine_enc;
-defm V6_vnccombine : T_HVX_ccombine<1>, V6_vnccombine_enc;
-
-let hasNewValue = 1 in
-class T_HVX_shift <string asmString, RegisterClass RCout, RegisterClass RCin>
- : CVI_VX_DV_Resource1<(outs RCout:$dst),
- (ins RCin:$src1, RCin:$src2, IntRegsLow8:$src3),
- asmString >;
-
-multiclass T_HVX_shift <string asmString, RegisterClass RCout,
- RegisterClass RCin> {
- def NAME : T_HVX_shift <asmString, RCout, RCin>;
- let isCodeGenOnly = 1 in
- def NAME#_128B : T_HVX_shift <asmString, !cast<RegisterClass>(RCout#"128B"),
- !cast<RegisterClass>(RCin#"128B")>;
-}
-
-multiclass T_HVX_shift_VV <string asmString>:
- T_HVX_shift <asmString, VectorRegs, VectorRegs>;
-
-multiclass T_HVX_shift_WV <string asmString>:
- T_HVX_shift <asmString, VecDblRegs, VectorRegs>;
-
-let Itinerary = CVI_VP_LONG, Type = TypeCVI_VP in {
-defm V6_valignb :
- T_HVX_shift_VV <"$dst = valign($src1,$src2,$src3)">, V6_valignb_enc;
-defm V6_vlalignb :
- T_HVX_shift_VV <"$dst = vlalign($src1,$src2,$src3)">, V6_vlalignb_enc;
-}
-
-let Itinerary = CVI_VS, Type = TypeCVI_VS in {
-defm V6_vasrwh :
- T_HVX_shift_VV <"$dst.h = vasr($src1.w,$src2.w,$src3)">, V6_vasrwh_enc;
-defm V6_vasrwhsat :
- T_HVX_shift_VV <"$dst.h = vasr($src1.w,$src2.w,$src3):sat">,
- V6_vasrwhsat_enc;
-defm V6_vasrwhrndsat :
- T_HVX_shift_VV <"$dst.h = vasr($src1.w,$src2.w,$src3):rnd:sat">,
- V6_vasrwhrndsat_enc;
-defm V6_vasrwuhsat :
- T_HVX_shift_VV <"$dst.uh = vasr($src1.w,$src2.w,$src3):sat">,
- V6_vasrwuhsat_enc;
-defm V6_vasrhubsat :
- T_HVX_shift_VV <"$dst.ub = vasr($src1.h,$src2.h,$src3):sat">,
- V6_vasrhubsat_enc;
-defm V6_vasrhubrndsat :
- T_HVX_shift_VV <"$dst.ub = vasr($src1.h,$src2.h,$src3):rnd:sat">,
- V6_vasrhubrndsat_enc;
-defm V6_vasrhbrndsat :
- T_HVX_shift_VV <"$dst.b = vasr($src1.h,$src2.h,$src3):rnd:sat">,
- V6_vasrhbrndsat_enc;
-}
-
-// Assembler mapped; should this be an alias?
-//defm V6_vtran2x2vdd : T_HVX_shift_VV <"">, V6_vtran2x2vdd_enc;
-let Itinerary = CVI_VP_VS_LONG, Type = TypeCVI_VP_VS in {
-defm V6_vshuffvdd :
- T_HVX_shift_WV <"$dst = vshuff($src1,$src2,$src3)">, V6_vshuffvdd_enc;
-defm V6_vdealvdd :
- T_HVX_shift_WV <"$dst = vdeal($src1,$src2,$src3)">, V6_vdealvdd_enc;
-}
-
-let hasNewValue = 1, Itinerary = CVI_VP_VS_LONG, Type = TypeCVI_VP_VS in
-class T_HVX_unpack <string asmString, RegisterClass RCout, RegisterClass RCin>
- : CVI_VX_DV_Resource1<(outs RCout:$dst), (ins RCout:$_src_, RCin:$src1),
- asmString, [], "$dst = $_src_">;
-
-multiclass T_HVX_unpack <string asmString> {
- def NAME : T_HVX_unpack <asmString, VecDblRegs, VectorRegs>;
- let isCodeGenOnly = 1 in
- def NAME#_128B : T_HVX_unpack <asmString, VecDblRegs128B, VectorRegs128B>;
-}
-
-defm V6_vunpackob : T_HVX_unpack <"$dst.h |= vunpacko($src1.b)">, V6_vunpackob_enc;
-defm V6_vunpackoh : T_HVX_unpack <"$dst.w |= vunpacko($src1.h)">, V6_vunpackoh_enc;
-
-let Itinerary = CVI_VP_LONG, Type = TypeCVI_VP, hasNewValue = 1,
- hasSideEffects = 0 in
-class T_HVX_valign <string asmString, RegisterClass RC>
- : CVI_VA_Resource1<(outs RC:$dst), (ins RC:$src1, RC:$src2, u3_0Imm:$src3),
- asmString>;
-
-multiclass T_HVX_valign <string asmString> {
- def NAME : T_HVX_valign <asmString, VectorRegs>;
-
- let isCodeGenOnly = 1 in
- def NAME#_128B : T_HVX_valign <asmString, VectorRegs128B>;
-}
-
-defm V6_valignbi :
- T_HVX_valign <"$dst = valign($src1,$src2,#$src3)">, V6_valignbi_enc;
-defm V6_vlalignbi :
- T_HVX_valign <"$dst = vlalign($src1,$src2,#$src3)">, V6_vlalignbi_enc;
-
-let Itinerary = CVI_VA_DV, Type = TypeCVI_VA_DV in
-class T_HVX_predAlu <string asmString, RegisterClass RC>
- : CVI_VA_Resource1<(outs RC:$dst), (ins RC:$src1, RC:$src2),
- asmString>;
-
-multiclass T_HVX_predAlu <string asmString> {
- def NAME : T_HVX_predAlu <asmString, VecPredRegs>;
-
- let isCodeGenOnly = 1 in
- def NAME#_128B : T_HVX_predAlu <asmString, VecPredRegs128B>;
-}
-
-defm V6_pred_and : T_HVX_predAlu <"$dst = and($src1,$src2)">, V6_pred_and_enc;
-defm V6_pred_or : T_HVX_predAlu <"$dst = or($src1,$src2)">, V6_pred_or_enc;
-defm V6_pred_xor : T_HVX_predAlu <"$dst = xor($src1,$src2)">, V6_pred_xor_enc;
-defm V6_pred_or_n : T_HVX_predAlu <"$dst = or($src1,!$src2)">, V6_pred_or_n_enc;
-defm V6_pred_and_n :
- T_HVX_predAlu <"$dst = and($src1,!$src2)">, V6_pred_and_n_enc;
-
-let Itinerary = CVI_VA, Type = TypeCVI_VA in
-class T_HVX_prednot <RegisterClass RC>
- : CVI_VA_Resource1<(outs RC:$dst), (ins RC:$src1),
- "$dst = not($src1)">, V6_pred_not_enc;
-
-def V6_pred_not : T_HVX_prednot <VecPredRegs>;
-let isCodeGenOnly = 1 in
-def V6_pred_not_128B : T_HVX_prednot <VecPredRegs128B>;
-
-let Itinerary = CVI_VA, Type = TypeCVI_VA in
-class T_HVX_vcmp2 <string asmString, RegisterClass RCout, RegisterClass RCin>
- : CVI_VA_Resource1 <(outs RCout:$dst), (ins RCin:$src1, RCin:$src2),
- asmString >;
-
-multiclass T_HVX_vcmp2 <string asmString> {
- def NAME : T_HVX_vcmp2 <asmString, VecPredRegs, VectorRegs>;
- let isCodeGenOnly = 1 in
- def NAME#_128B : T_HVX_vcmp2 <asmString, VecPredRegs128B, VectorRegs128B>;
-}
-
-defm V6_veqb : T_HVX_vcmp2 <"$dst = vcmp.eq($src1.b,$src2.b)">, V6_veqb_enc;
-defm V6_veqh : T_HVX_vcmp2 <"$dst = vcmp.eq($src1.h,$src2.h)">, V6_veqh_enc;
-defm V6_veqw : T_HVX_vcmp2 <"$dst = vcmp.eq($src1.w,$src2.w)">, V6_veqw_enc;
-defm V6_vgtb : T_HVX_vcmp2 <"$dst = vcmp.gt($src1.b,$src2.b)">, V6_vgtb_enc;
-defm V6_vgth : T_HVX_vcmp2 <"$dst = vcmp.gt($src1.h,$src2.h)">, V6_vgth_enc;
-defm V6_vgtw : T_HVX_vcmp2 <"$dst = vcmp.gt($src1.w,$src2.w)">, V6_vgtw_enc;
-defm V6_vgtub : T_HVX_vcmp2 <"$dst = vcmp.gt($src1.ub,$src2.ub)">, V6_vgtub_enc;
-defm V6_vgtuh : T_HVX_vcmp2 <"$dst = vcmp.gt($src1.uh,$src2.uh)">, V6_vgtuh_enc;
-defm V6_vgtuw : T_HVX_vcmp2 <"$dst = vcmp.gt($src1.uw,$src2.uw)">, V6_vgtuw_enc;
-
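-// The vand forms below move data between vector predicates and vectors:
-// the "qrt" classes take a Q register and a scalar and produce a vector,
-// while the "vrt" classes take a vector and a scalar and produce a vector
-// predicate.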
-let isAccumulator = 1, hasNewValue = 1, hasSideEffects = 0 in
-class T_V6_vandqrt_acc <RegisterClass RCout, RegisterClass RCin>
- : CVI_VX_Resource_late<(outs RCout:$dst),
- (ins RCout:$_src_, RCin:$src1, IntRegs:$src2),
- "$dst |= vand($src1,$src2)", [], "$dst = $_src_">, V6_vandqrt_acc_enc;
-
-def V6_vandqrt_acc : T_V6_vandqrt_acc <VectorRegs, VecPredRegs>;
-let isCodeGenOnly = 1 in
-def V6_vandqrt_acc_128B : T_V6_vandqrt_acc <VectorRegs128B, VecPredRegs128B>;
-
-let isAccumulator = 1 in
-class T_V6_vandvrt_acc <RegisterClass RCout, RegisterClass RCin>
- : CVI_VX_Resource_late<(outs RCout:$dst),
- (ins RCout:$_src_, RCin:$src1, IntRegs:$src2),
- "$dst |= vand($src1,$src2)", [], "$dst = $_src_">, V6_vandvrt_acc_enc;
-
-def V6_vandvrt_acc : T_V6_vandvrt_acc <VecPredRegs, VectorRegs>;
-let isCodeGenOnly = 1 in
-def V6_vandvrt_acc_128B : T_V6_vandvrt_acc <VecPredRegs128B, VectorRegs128B>;
-
-let hasNewValue = 1, hasSideEffects = 0 in
-class T_V6_vandqrt <RegisterClass RCout, RegisterClass RCin>
- : CVI_VX_Resource_late<(outs RCout:$dst),
- (ins RCin:$src1, IntRegs:$src2),
- "$dst = vand($src1,$src2)" >, V6_vandqrt_enc;
-
-def V6_vandqrt : T_V6_vandqrt <VectorRegs, VecPredRegs>;
-let isCodeGenOnly = 1 in
-def V6_vandqrt_128B : T_V6_vandqrt <VectorRegs128B, VecPredRegs128B>;
-
-let hasNewValue = 1, hasSideEffects = 0 in
-class T_V6_lvsplatw <RegisterClass RC>
- : CVI_VX_Resource_late<(outs RC:$dst), (ins IntRegs:$src1),
- "$dst = vsplat($src1)" >, V6_lvsplatw_enc;
-
-def V6_lvsplatw : T_V6_lvsplatw <VectorRegs>;
-let isCodeGenOnly = 1 in
-def V6_lvsplatw_128B : T_V6_lvsplatw <VectorRegs128B>;
-
-
-let hasNewValue = 1 in
-class T_V6_vinsertwr <RegisterClass RC>
- : CVI_VX_Resource_late<(outs RC:$dst), (ins RC:$_src_, IntRegs:$src1),
- "$dst.w = vinsert($src1)", [], "$dst = $_src_">,
- V6_vinsertwr_enc;
-
-def V6_vinsertwr : T_V6_vinsertwr <VectorRegs>;
-let isCodeGenOnly = 1 in
-def V6_vinsertwr_128B : T_V6_vinsertwr <VectorRegs128B>;
-
-
-let Itinerary = CVI_VP_LONG, Type = TypeCVI_VP in
-class T_V6_pred_scalar2 <RegisterClass RC>
- : CVI_VA_Resource1<(outs RC:$dst), (ins IntRegs:$src1),
- "$dst = vsetq($src1)">, V6_pred_scalar2_enc;
-
-def V6_pred_scalar2 : T_V6_pred_scalar2 <VecPredRegs>;
-let isCodeGenOnly = 1 in
-def V6_pred_scalar2_128B : T_V6_pred_scalar2 <VecPredRegs128B>;
-
-class T_V6_vandvrt <RegisterClass RCout, RegisterClass RCin>
- : CVI_VX_Resource_late<(outs RCout:$dst), (ins RCin:$src1, IntRegs:$src2),
- "$dst = vand($src1,$src2)">, V6_vandvrt_enc;
-
-def V6_vandvrt : T_V6_vandvrt <VecPredRegs, VectorRegs>;
-let isCodeGenOnly = 1 in
-def V6_vandvrt_128B : T_V6_vandvrt <VecPredRegs128B, VectorRegs128B>;
-
-let validSubTargets = HasV60SubT in
-class T_HVX_rol <string asmString, RegisterClass RC, Operand ImmOp >
- : SInst2 <(outs RC:$dst), (ins RC:$src1, ImmOp:$src2), asmString>;
-
-class T_HVX_rol_R <string asmString>
- : T_HVX_rol <asmString, IntRegs, u5_0Imm>;
-class T_HVX_rol_P <string asmString>
- : T_HVX_rol <asmString, DoubleRegs, u6_0Imm>;
-
-def S6_rol_i_p : T_HVX_rol_P <"$dst = rol($src1,#$src2)">, S6_rol_i_p_enc;
-let hasNewValue = 1, opNewValue = 0 in
-def S6_rol_i_r : T_HVX_rol_R <"$dst = rol($src1,#$src2)">, S6_rol_i_r_enc;
-
-let validSubTargets = HasV60SubT in
-class T_HVX_rol_acc <string asmString, RegisterClass RC, Operand ImmOp>
- : SInst2 <(outs RC:$dst), (ins RC:$_src_, RC:$src1, ImmOp:$src2),
- asmString, [], "$dst = $_src_" >;
-
-class T_HVX_rol_acc_P <string asmString>
- : T_HVX_rol_acc <asmString, DoubleRegs, u6_0Imm>;
-
-class T_HVX_rol_acc_R <string asmString>
- : T_HVX_rol_acc <asmString, IntRegs, u5_0Imm>;
-
-def S6_rol_i_p_nac :
- T_HVX_rol_acc_P <"$dst -= rol($src1,#$src2)">, S6_rol_i_p_nac_enc;
-def S6_rol_i_p_acc :
- T_HVX_rol_acc_P <"$dst += rol($src1,#$src2)">, S6_rol_i_p_acc_enc;
-def S6_rol_i_p_and :
- T_HVX_rol_acc_P <"$dst &= rol($src1,#$src2)">, S6_rol_i_p_and_enc;
-def S6_rol_i_p_or :
- T_HVX_rol_acc_P <"$dst |= rol($src1,#$src2)">, S6_rol_i_p_or_enc;
-def S6_rol_i_p_xacc :
- T_HVX_rol_acc_P<"$dst ^= rol($src1,#$src2)">, S6_rol_i_p_xacc_enc;
-
-let hasNewValue = 1, opNewValue = 0 in {
-def S6_rol_i_r_nac :
- T_HVX_rol_acc_R <"$dst -= rol($src1,#$src2)">, S6_rol_i_r_nac_enc;
-def S6_rol_i_r_acc :
- T_HVX_rol_acc_R <"$dst += rol($src1,#$src2)">, S6_rol_i_r_acc_enc;
-def S6_rol_i_r_and :
- T_HVX_rol_acc_R <"$dst &= rol($src1,#$src2)">, S6_rol_i_r_and_enc;
-def S6_rol_i_r_or :
- T_HVX_rol_acc_R <"$dst |= rol($src1,#$src2)">, S6_rol_i_r_or_enc;
-def S6_rol_i_r_xacc :
- T_HVX_rol_acc_R <"$dst ^= rol($src1,#$src2)">, S6_rol_i_r_xacc_enc;
-}
-
-let isSolo = 1, Itinerary = LD_tc_ld_SLOT0, Type = TypeLD in
-class T_V6_extractw <RegisterClass RC>
- : LD1Inst <(outs IntRegs:$dst), (ins RC:$src1, IntRegs:$src2),
- "$dst = vextract($src1,$src2)">, V6_extractw_enc;
-
-def V6_extractw : T_V6_extractw <VectorRegs>;
-let isCodeGenOnly = 1 in
-def V6_extractw_128B : T_V6_extractw <VectorRegs128B>;
-
-let Itinerary = ST_tc_st_SLOT0, validSubTargets = HasV55SubT in
-class T_sys0op <string asmString>
- : ST1Inst <(outs), (ins), asmString>;
-
-let isSolo = 1, validSubTargets = HasV55SubT in {
-def Y5_l2gunlock : T_sys0op <"l2gunlock">, Y5_l2gunlock_enc;
-def Y5_l2gclean : T_sys0op <"l2gclean">, Y5_l2gclean_enc;
-def Y5_l2gcleaninv : T_sys0op <"l2gcleaninv">, Y5_l2gcleaninv_enc;
-}
-
-class T_sys1op <string asmString, RegisterClass RC>
- : ST1Inst <(outs), (ins RC:$src1), asmString>;
-
-class T_sys1op_R <string asmString> : T_sys1op <asmString, IntRegs>;
-class T_sys1op_P <string asmString> : T_sys1op <asmString, DoubleRegs>;
-
-let isSoloAX = 1, validSubTargets = HasV55SubT in
-def Y5_l2unlocka : T_sys1op_R <"l2unlocka($src1)">, Y5_l2unlocka_enc;
-
-let isSolo = 1, validSubTargets = HasV60SubT in {
-def Y6_l2gcleanpa : T_sys1op_P <"l2gclean($src1)">, Y6_l2gcleanpa_enc;
-def Y6_l2gcleaninvpa : T_sys1op_P <"l2gcleaninv($src1)">, Y6_l2gcleaninvpa_enc;
-}
-
-let Itinerary = ST_tc_3stall_SLOT0, isPredicateLate = 1, isSoloAX = 1,
- validSubTargets = HasV55SubT in
-def Y5_l2locka : ST1Inst <(outs PredRegs:$dst), (ins IntRegs:$src1),
- "$dst = l2locka($src1)">, Y5_l2locka_enc;
-
-// Not defined on the etc side; why?
-// defm S2_cabacencbin : _VV <"Rdd=encbin(Rss,$src2,Pu)">, S2_cabacencbin_enc;
-
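-// A5_ACS: vacsh performs an add-compare-select over halfwords (the step
-// used in Viterbi decoding), producing both the selected sums and a
-// predicate recording which side was chosen.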
-let Defs = [USR_OVF], Itinerary = M_tc_3stall_SLOT23, isPredicateLate = 1,
-    hasSideEffects = 0, validSubTargets = HasV55SubT in
-def A5_ACS : MInst2 <(outs DoubleRegs:$dst1, PredRegs:$dst2),
- (ins DoubleRegs:$_src_, DoubleRegs:$src1, DoubleRegs:$src2),
- "$dst1,$dst2 = vacsh($src1,$src2)", [],
- "$dst1 = $_src_" >, Requires<[HasV55T]>, A5_ACS_enc;
-
-let Itinerary = CVI_VA_DV, Type = TypeCVI_VA_DV, hasNewValue = 1,
- hasSideEffects = 0 in
-class T_HVX_alu2 <string asmString, RegisterClass RCout, RegisterClass RCin1,
- RegisterClass RCin2>
- : CVI_VA_Resource1<(outs RCout:$dst),
- (ins RCin1:$src1, RCin2:$src2, RCin2:$src3), asmString>;
-
-multiclass T_HVX_alu2 <string asmString, RegisterClass RC > {
- def NAME : T_HVX_alu2 <asmString, RC, VecPredRegs, VectorRegs>;
- let isCodeGenOnly = 1 in
- def NAME#_128B : T_HVX_alu2 <asmString, !cast<RegisterClass>(RC#"128B"),
- VecPredRegs128B, VectorRegs128B>;
-}
-
-multiclass T_HVX_alu2_V <string asmString> :
- T_HVX_alu2 <asmString, VectorRegs>;
-
-multiclass T_HVX_alu2_W <string asmString> :
- T_HVX_alu2 <asmString, VecDblRegs>;
-
-defm V6_vswap : T_HVX_alu2_W <"$dst = vswap($src1,$src2,$src3)">, V6_vswap_enc;
-
-let Itinerary = CVI_VA, Type = TypeCVI_VA, hasNewValue = 1,
- hasSideEffects = 0 in
-defm V6_vmux : T_HVX_alu2_V <"$dst = vmux($src1,$src2,$src3)">, V6_vmux_enc;
-
-class T_HVX_vlutb <string asmString, RegisterClass RCout, RegisterClass RCin>
- : CVI_VA_Resource1<(outs RCout:$dst),
- (ins RCin:$src1, RCin:$src2, IntRegsLow8:$src3), asmString>;
-
-multiclass T_HVX_vlutb <string asmString, RegisterClass RCout,
- RegisterClass RCin> {
- def NAME : T_HVX_vlutb <asmString, RCout, RCin>;
- let isCodeGenOnly = 1 in
- def NAME#_128B : T_HVX_vlutb <asmString, !cast<RegisterClass>(RCout#"128B"),
- !cast<RegisterClass>(RCin#"128B")>;
-}
-
-multiclass T_HVX_vlutb_V <string asmString> :
- T_HVX_vlutb <asmString, VectorRegs, VectorRegs>;
-
-multiclass T_HVX_vlutb_W <string asmString> :
- T_HVX_vlutb <asmString, VecDblRegs, VectorRegs>;
-
-let Itinerary = CVI_VP_VS_LONG, Type = TypeCVI_VP_VS, isAccumulator = 1 in
-class T_HVX_vlutb_acc <string asmString, RegisterClass RCout,
- RegisterClass RCin>
- : CVI_VA_Resource1<(outs RCout:$dst),
- (ins RCout:$_src_, RCin:$src1, RCin:$src2, IntRegsLow8:$src3),
- asmString, [], "$dst = $_src_">;
-
-multiclass T_HVX_vlutb_acc <string asmString, RegisterClass RCout,
- RegisterClass RCin> {
- def NAME : T_HVX_vlutb_acc <asmString, RCout, RCin>;
- let isCodeGenOnly = 1 in
- def NAME#_128B : T_HVX_vlutb_acc<asmString,
- !cast<RegisterClass>(RCout#"128B"),
- !cast<RegisterClass>(RCin#"128B")>;
-}
-
-multiclass T_HVX_vlutb_acc_V <string asmString> :
- T_HVX_vlutb_acc <asmString, VectorRegs, VectorRegs>;
-
-multiclass T_HVX_vlutb_acc_W <string asmString> :
- T_HVX_vlutb_acc <asmString, VecDblRegs, VectorRegs>;
-
-
-let Itinerary = CVI_VP_LONG, Type = TypeCVI_VP, hasNewValue = 1 in
-defm V6_vlutvvb:
- T_HVX_vlutb_V <"$dst.b = vlut32($src1.b,$src2.b,$src3)">, V6_vlutvvb_enc;
-
-let Itinerary = CVI_VP_VS_LONG, Type = TypeCVI_VP_VS, hasNewValue = 1 in
-defm V6_vlutvwh:
- T_HVX_vlutb_W <"$dst.h = vlut16($src1.b,$src2.h,$src3)">, V6_vlutvwh_enc;
-
-let hasNewValue = 1 in {
- defm V6_vlutvvb_oracc:
- T_HVX_vlutb_acc_V <"$dst.b |= vlut32($src1.b,$src2.b,$src3)">,
- V6_vlutvvb_oracc_enc;
- defm V6_vlutvwh_oracc:
- T_HVX_vlutb_acc_W <"$dst.h |= vlut16($src1.b,$src2.h,$src3)">,
- V6_vlutvwh_oracc_enc;
-}
-
-// It's a fake instruction; should it be defined at all?
-def S2_cabacencbin
- : SInst2<(outs DoubleRegs:$dst),
- (ins DoubleRegs:$src1, DoubleRegs:$src2, PredRegs:$src3),
- "$dst = encbin($src1,$src2,$src3)">, S2_cabacencbin_enc;
-
-// Vhist instructions
-def V6_vhistq
- : CVI_HIST_Resource1 <(outs), (ins VecPredRegs:$src1),
- "vhist($src1)">, V6_vhistq_enc;
-
-def V6_vhist
- : CVI_HIST_Resource1 <(outs), (ins),
- "vhist" >, V6_vhist_enc;
-
-
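-// Pseudos for materializing the zero vector (vd0), copying a vector pair
-// (vassignp), and extracting the low/high single vectors of a pair (lo/hi).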
-let isPseudo = 1, isCodeGenOnly = 1, hasSideEffects = 0 in {
- def V6_vd0: CVI_VA_Resource<(outs VectorRegs:$dst), (ins), "$dst = #0", []>;
- def V6_vd0_128B: CVI_VA_Resource<(outs VectorRegs128B:$dst), (ins),
- "$dst = #0", []>;
-
- def V6_vassignp: CVI_VA_Resource<(outs VecDblRegs:$dst),
- (ins VecDblRegs:$src), "", []>;
- def V6_vassignp_128B : CVI_VA_Resource<(outs VecDblRegs128B:$dst),
- (ins VecDblRegs128B:$src), "", []>;
-
- def V6_lo: CVI_VA_Resource<(outs VectorRegs:$dst), (ins VecDblRegs:$src1),
- "", []>;
- def V6_lo_128B: CVI_VA_Resource<(outs VectorRegs128B:$dst),
- (ins VecDblRegs128B:$src1), "", []>;
-
- def V6_hi: CVI_VA_Resource<(outs VectorRegs:$dst), (ins VecDblRegs:$src1),
- "", []>;
- def V6_hi_128B: CVI_VA_Resource<(outs VectorRegs128B:$dst),
- (ins VecDblRegs128B:$src1), "", []>;
-}
diff --git a/lib/Target/Hexagon/HexagonInstrInfoVector.td b/lib/Target/Hexagon/HexagonInstrInfoVector.td
deleted file mode 100644
index e3520bd6e515..000000000000
--- a/lib/Target/Hexagon/HexagonInstrInfoVector.td
+++ /dev/null
@@ -1,69 +0,0 @@
-//===- HexagonInstrInfoVector.td - Hexagon Vector Patterns -*- tablegen -*-===//
-//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
-//
-//===----------------------------------------------------------------------===//
-//
-// This file describes the Hexagon Vector instructions in TableGen format.
-//
-//===----------------------------------------------------------------------===//
-
-// Vector shift support. Vector shifting in Hexagon differs from LLVM's
-// internal representation: LLVM assumes all vector shifts have the form
-//   <VT> = SHL/SRA/SRL <VT> by <VT>
-// while Hexagon uses
-//   <VT> = SHL/SRA/SRL <VT> by <IT/i32>
-// (a scalar shift amount). As a result, special care is needed to
-// guarantee correctness and performance.
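-// For illustration (a sketch, not a pattern defined here): a uniform word
-// shift that LLVM models as
-//   %r = shl <2 x i32> %v, <i32 3, i32 3>
-// has to be selected to the scalar-amount Hexagon form
-//   Rdd = vaslw(Rss,#3)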
-class vshift_v4i16<SDNode Op, string Str, bits<3>MajOp, bits<3>MinOp>
- : S_2OpInstImm<Str, MajOp, MinOp, u4_0Imm, []> {
- bits<4> src2;
- let Inst{11-8} = src2;
-}
-
-class vshift_v2i32<SDNode Op, string Str, bits<3>MajOp, bits<3>MinOp>
- : S_2OpInstImm<Str, MajOp, MinOp, u5_0Imm, []> {
- bits<5> src2;
- let Inst{12-8} = src2;
-}
-
-def S2_asr_i_vw : vshift_v2i32<sra, "vasrw", 0b010, 0b000>;
-def S2_lsr_i_vw : vshift_v2i32<srl, "vlsrw", 0b010, 0b001>;
-def S2_asl_i_vw : vshift_v2i32<shl, "vaslw", 0b010, 0b010>;
-
-def S2_asr_i_vh : vshift_v4i16<sra, "vasrh", 0b100, 0b000>;
-def S2_lsr_i_vh : vshift_v4i16<srl, "vlsrh", 0b100, 0b001>;
-def S2_asl_i_vh : vshift_v4i16<shl, "vaslh", 0b100, 0b010>;
-
-// Vector shift words by register
-def S2_asr_r_vw : T_S3op_shiftVect < "vasrw", 0b00, 0b00>;
-def S2_lsr_r_vw : T_S3op_shiftVect < "vlsrw", 0b00, 0b01>;
-def S2_asl_r_vw : T_S3op_shiftVect < "vaslw", 0b00, 0b10>;
-def S2_lsl_r_vw : T_S3op_shiftVect < "vlslw", 0b00, 0b11>;
-
-// Vector shift halfwords by register
-def S2_asr_r_vh : T_S3op_shiftVect < "vasrh", 0b01, 0b00>;
-def S2_lsr_r_vh : T_S3op_shiftVect < "vlsrh", 0b01, 0b01>;
-def S2_asl_r_vh : T_S3op_shiftVect < "vaslh", 0b01, 0b10>;
-def S2_lsl_r_vh : T_S3op_shiftVect < "vlslh", 0b01, 0b11>;
-
-
-// Hexagon doesn't have a vector multiply with C semantics.
-// Instead, generate a pseudo instruction that gets expanded into two
-// scalar MPYI instructions.
-// This is expanded by ExpandPostRAPseudos.
-let isPseudo = 1 in
-def PS_vmulw : PseudoM<(outs DoubleRegs:$Rd),
- (ins DoubleRegs:$Rs, DoubleRegs:$Rt), "", []>;
-
-let isPseudo = 1 in
-def PS_vmulw_acc : PseudoM<(outs DoubleRegs:$Rd),
- (ins DoubleRegs:$Rx, DoubleRegs:$Rs, DoubleRegs:$Rt), "", [],
- "$Rd = $Rx">;
-
-
-
diff --git a/lib/Target/Hexagon/HexagonIntrinsics.td b/lib/Target/Hexagon/HexagonIntrinsics.td
index d4f303bf6ff0..c611857ec26a 100644
--- a/lib/Target/Hexagon/HexagonIntrinsics.td
+++ b/lib/Target/Hexagon/HexagonIntrinsics.td
@@ -1347,6 +1347,25 @@ def: T_stc_pat<S2_storeri_pci, int_hexagon_circ_stw, s4_2ImmPred, I32>;
def: T_stc_pat<S2_storerd_pci, int_hexagon_circ_std, s4_3ImmPred, I64>;
def: T_stc_pat<S2_storerf_pci, int_hexagon_circ_sthhi, s4_1ImmPred, I32>;
+multiclass MaskedStore <InstHexagon MI, Intrinsic IntID> {
+ def : Pat<(IntID VecPredRegs:$src1, IntRegs:$src2, VectorRegs:$src3),
+ (MI VecPredRegs:$src1, IntRegs:$src2, #0, VectorRegs:$src3)>,
+ Requires<[UseHVXSgl]>;
+
+ def : Pat<(!cast<Intrinsic>(IntID#"_128B") VecPredRegs128B:$src1,
+ IntRegs:$src2,
+ VectorRegs128B:$src3),
+ (!cast<InstHexagon>(MI#"_128B") VecPredRegs128B:$src1,
+ IntRegs:$src2, #0,
+ VectorRegs128B:$src3)>,
+ Requires<[UseHVXDbl]>;
+}
+
+defm : MaskedStore <V6_vS32b_qpred_ai, int_hexagon_V6_vmaskedstoreq>;
+defm : MaskedStore <V6_vS32b_nqpred_ai, int_hexagon_V6_vmaskedstorenq>;
+defm : MaskedStore <V6_vS32b_nt_qpred_ai, int_hexagon_V6_vmaskedstorentq>;
+defm : MaskedStore <V6_vS32b_nt_nqpred_ai, int_hexagon_V6_vmaskedstorentnq>;
+
include "HexagonIntrinsicsV3.td"
include "HexagonIntrinsicsV4.td"
include "HexagonIntrinsicsV5.td"
diff --git a/lib/Target/Hexagon/HexagonIntrinsicsV60.td b/lib/Target/Hexagon/HexagonIntrinsicsV60.td
index a45e1c9d7be4..f438b3e0368f 100644
--- a/lib/Target/Hexagon/HexagonIntrinsicsV60.td
+++ b/lib/Target/Hexagon/HexagonIntrinsicsV60.td
@@ -790,7 +790,7 @@ def : T_RRI_pat <S6_rol_i_r_xacc, int_hexagon_S6_rol_i_r_xacc>;
defm : T_VR_pat <V6_extractw, int_hexagon_V6_extractw>;
defm : T_VR_pat <V6_vinsertwr, int_hexagon_V6_vinsertwr>;
-def : T_PPQ_pat <S2_cabacencbin, int_hexagon_S2_cabacencbin>;
+//def : T_PPQ_pat <S2_cabacencbin, int_hexagon_S2_cabacencbin>;
def: Pat<(v64i16 (trunc v64i32:$Vdd)),
(v64i16 (V6_vpackwh_sat_128B
diff --git a/lib/Target/Hexagon/HexagonIsetDx.td b/lib/Target/Hexagon/HexagonIsetDx.td
deleted file mode 100644
index ebedf2cbaf17..000000000000
--- a/lib/Target/Hexagon/HexagonIsetDx.td
+++ /dev/null
@@ -1,728 +0,0 @@
-//=- HexagonIsetDx.td - Target Desc. for Hexagon Target -*- tablegen -*-=//
-//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
-//
-//===----------------------------------------------------------------------===//
-//
-// This file describes the Hexagon duplex instructions.
-//
-//===----------------------------------------------------------------------===//
-
-// SA1_combine1i: Combines.
-let isCodeGenOnly = 1, hasSideEffects = 0 in
-def SA1_combine1i: SUBInst <
- (outs DoubleRegs:$Rdd),
- (ins u2_0Imm:$u2),
- "$Rdd = combine(#1, #$u2)"> {
- bits<3> Rdd;
- bits<2> u2;
-
- let Inst{12-10} = 0b111;
- let Inst{8} = 0b0;
- let Inst{4-3} = 0b01;
- let Inst{2-0} = Rdd;
- let Inst{6-5} = u2;
- }
-
-// SL2_jumpr31_f: Indirect conditional jump if false.
-// SL2_jumpr31_f -> SL2_jumpr31_fnew
-let Defs = [PC], Uses = [P0, R31], isCodeGenOnly = 1, isPredicated = 1, isPredicatedFalse = 1, isBranch = 1, isIndirectBranch = 1, hasSideEffects = 0 in
-def SL2_jumpr31_f: SUBInst <
- (outs ),
- (ins ),
- "if (!p0) jumpr r31"> {
- let Inst{12-6} = 0b1111111;
- let Inst{2-0} = 0b101;
- }
-
-// SL2_deallocframe: Deallocate stack frame.
-let Defs = [R31, R29, R30], Uses = [R30], isCodeGenOnly = 1, mayLoad = 1, accessSize = DoubleWordAccess in
-def SL2_deallocframe: SUBInst <
- (outs ),
- (ins ),
- "deallocframe"> {
- let Inst{12-6} = 0b1111100;
- let Inst{2} = 0b0;
- }
-
-// SL2_return_f: Deallocate stack frame and return.
-// SL2_return_f -> SL2_return_fnew
-let Defs = [PC, R31, R29, R30], Uses = [R30, P0], isCodeGenOnly = 1, isPredicated = 1, isPredicatedFalse = 1, mayLoad = 1, accessSize = DoubleWordAccess, isBranch = 1, isIndirectBranch = 1 in
-def SL2_return_f: SUBInst <
- (outs ),
- (ins ),
- "if (!p0) dealloc_return"> {
- let Inst{12-6} = 0b1111101;
- let Inst{2-0} = 0b101;
- }
-
-// SA1_combine3i: Combines.
-let isCodeGenOnly = 1, hasSideEffects = 0 in
-def SA1_combine3i: SUBInst <
- (outs DoubleRegs:$Rdd),
- (ins u2_0Imm:$u2),
- "$Rdd = combine(#3, #$u2)"> {
- bits<3> Rdd;
- bits<2> u2;
-
- let Inst{12-10} = 0b111;
- let Inst{8} = 0b0;
- let Inst{4-3} = 0b11;
- let Inst{2-0} = Rdd;
- let Inst{6-5} = u2;
- }
-
-// SS2_storebi0: Store byte.
-let isCodeGenOnly = 1, mayStore = 1, accessSize = ByteAccess in
-def SS2_storebi0: SUBInst <
- (outs ),
- (ins IntRegs:$Rs, u4_0Imm:$u4_0),
- "memb($Rs + #$u4_0)=#0"> {
- bits<4> Rs;
- bits<4> u4_0;
-
- let Inst{12-8} = 0b10010;
- let Inst{7-4} = Rs;
- let Inst{3-0} = u4_0;
- }
-
-// SA1_clrtnew: Clear if true.
-let Uses = [P0], isCodeGenOnly = 1, isPredicated = 1, isPredicatedNew = 1, hasSideEffects = 0, hasNewValue = 1, opNewValue = 0 in
-def SA1_clrtnew: SUBInst <
- (outs IntRegs:$Rd),
- (ins PredRegs:$Pu),
- "if ($Pu.new) $Rd = #0"> {
- bits<4> Rd;
-
- let Inst{12-9} = 0b1101;
- let Inst{6-4} = 0b100;
- let Inst{3-0} = Rd;
- }
-
-// SL2_loadruh_io: Load half.
-let isCodeGenOnly = 1, mayLoad = 1, accessSize = HalfWordAccess, hasNewValue = 1, opNewValue = 0 in
-def SL2_loadruh_io: SUBInst <
- (outs IntRegs:$Rd),
- (ins IntRegs:$Rs, u3_1Imm:$u3_1),
- "$Rd = memuh($Rs + #$u3_1)"> {
- bits<4> Rd;
- bits<4> Rs;
- bits<4> u3_1;
-
- let Inst{12-11} = 0b01;
- let Inst{3-0} = Rd;
- let Inst{7-4} = Rs;
- let Inst{10-8} = u3_1{3-1};
- }
-
-// SL2_jumpr31_tnew: Indirect conditional jump if true.
-let Defs = [PC], Uses = [P0, R31], isCodeGenOnly = 1, isPredicated = 1, isPredicatedNew = 1, isBranch = 1, isIndirectBranch = 1, hasSideEffects = 0 in
-def SL2_jumpr31_tnew: SUBInst <
- (outs ),
- (ins ),
- "if (p0.new) jumpr:nt r31"> {
- let Inst{12-6} = 0b1111111;
- let Inst{2-0} = 0b110;
- }
-
-// SA1_addi: Add.
-let isCodeGenOnly = 1, hasSideEffects = 0, hasNewValue = 1, opNewValue = 0, isExtendable = 1, isExtentSigned = 1, opExtentBits = 7, opExtendable = 2 in
-def SA1_addi: SUBInst <
- (outs IntRegs:$Rx),
- (ins IntRegs:$_src_, s7_0Ext:$s7),
- "$Rx = add($_src_, #$s7)" ,
- [] ,
- "$_src_ = $Rx"> {
- bits<4> Rx;
- bits<7> s7;
-
- let Inst{12-11} = 0b00;
- let Inst{3-0} = Rx;
- let Inst{10-4} = s7;
- }
-
-// SL1_loadrub_io: Load byte.
-let isCodeGenOnly = 1, mayLoad = 1, accessSize = ByteAccess, hasNewValue = 1, opNewValue = 0 in
-def SL1_loadrub_io: SUBInst <
- (outs IntRegs:$Rd),
- (ins IntRegs:$Rs, u4_0Imm:$u4_0),
- "$Rd = memub($Rs + #$u4_0)"> {
- bits<4> Rd;
- bits<4> Rs;
- bits<4> u4_0;
-
- let Inst{12} = 0b1;
- let Inst{3-0} = Rd;
- let Inst{7-4} = Rs;
- let Inst{11-8} = u4_0;
- }
-
-// SL1_loadri_io: Load word.
-let isCodeGenOnly = 1, mayLoad = 1, accessSize = WordAccess, hasNewValue = 1, opNewValue = 0 in
-def SL1_loadri_io: SUBInst <
- (outs IntRegs:$Rd),
- (ins IntRegs:$Rs, u4_2Imm:$u4_2),
- "$Rd = memw($Rs + #$u4_2)"> {
- bits<4> Rd;
- bits<4> Rs;
- bits<6> u4_2;
-
- let Inst{12} = 0b0;
- let Inst{3-0} = Rd;
- let Inst{7-4} = Rs;
- let Inst{11-8} = u4_2{5-2};
- }
-
-// SA1_cmpeqi: Compare immediate.
-let Defs = [P0], isCodeGenOnly = 1, hasSideEffects = 0 in
-def SA1_cmpeqi: SUBInst <
- (outs ),
- (ins IntRegs:$Rs, u2_0Imm:$u2),
- "p0 = cmp.eq($Rs, #$u2)"> {
- bits<4> Rs;
- bits<2> u2;
-
- let Inst{12-8} = 0b11001;
- let Inst{7-4} = Rs;
- let Inst{1-0} = u2;
- }
-
-// SA1_combinerz: Combines.
-let isCodeGenOnly = 1, hasSideEffects = 0 in
-def SA1_combinerz: SUBInst <
- (outs DoubleRegs:$Rdd),
- (ins IntRegs:$Rs),
- "$Rdd = combine($Rs, #0)"> {
- bits<3> Rdd;
- bits<4> Rs;
-
- let Inst{12-10} = 0b111;
- let Inst{8} = 0b1;
- let Inst{3} = 0b1;
- let Inst{2-0} = Rdd;
- let Inst{7-4} = Rs;
- }
-
-// SL2_return_t: Deallocate stack frame and return.
-// SL2_return_t -> SL2_return_tnew
-let Defs = [PC, R31, R29, R30], Uses = [R30, P0], isCodeGenOnly = 1, isPredicated = 1, mayLoad = 1, accessSize = DoubleWordAccess, isBranch = 1, isIndirectBranch = 1 in
-def SL2_return_t: SUBInst <
- (outs ),
- (ins ),
- "if (p0) dealloc_return"> {
- let Inst{12-6} = 0b1111101;
- let Inst{2-0} = 0b100;
- }
-
-// SS2_allocframe: Allocate stack frame.
-let Defs = [R29, R30], Uses = [R30, R31, R29], isCodeGenOnly = 1, mayStore = 1, accessSize = DoubleWordAccess in
-def SS2_allocframe: SUBInst <
- (outs ),
- (ins u5_3Imm:$u5_3),
- "allocframe(#$u5_3)"> {
- bits<8> u5_3;
-
- let Inst{12-9} = 0b1110;
- let Inst{8-4} = u5_3{7-3};
- }
-
-// SS2_storeh_io: Store half.
-let isCodeGenOnly = 1, mayStore = 1, accessSize = HalfWordAccess in
-def SS2_storeh_io: SUBInst <
- (outs ),
- (ins IntRegs:$Rs, u3_1Imm:$u3_1, IntRegs:$Rt),
- "memh($Rs + #$u3_1) = $Rt"> {
- bits<4> Rs;
- bits<4> u3_1;
- bits<4> Rt;
-
- let Inst{12-11} = 0b00;
- let Inst{7-4} = Rs;
- let Inst{10-8} = u3_1{3-1};
- let Inst{3-0} = Rt;
- }
-
-// SS2_storewi0: Store word.
-let isCodeGenOnly = 1, mayStore = 1, accessSize = WordAccess in
-def SS2_storewi0: SUBInst <
- (outs ),
- (ins IntRegs:$Rs, u4_2Imm:$u4_2),
- "memw($Rs + #$u4_2)=#0"> {
- bits<4> Rs;
- bits<6> u4_2;
-
- let Inst{12-8} = 0b10000;
- let Inst{7-4} = Rs;
- let Inst{3-0} = u4_2{5-2};
- }
-
-// SS2_storewi1: Store word.
-let isCodeGenOnly = 1, mayStore = 1, accessSize = WordAccess in
-def SS2_storewi1: SUBInst <
- (outs ),
- (ins IntRegs:$Rs, u4_2Imm:$u4_2),
- "memw($Rs + #$u4_2)=#1"> {
- bits<4> Rs;
- bits<6> u4_2;
-
- let Inst{12-8} = 0b10001;
- let Inst{7-4} = Rs;
- let Inst{3-0} = u4_2{5-2};
- }
-
-// SL2_jumpr31: Indirect unconditional jump.
-let Defs = [PC], Uses = [R31], isCodeGenOnly = 1, isBranch = 1, isIndirectBranch = 1, hasSideEffects = 0 in
-def SL2_jumpr31: SUBInst <
- (outs ),
- (ins ),
- "jumpr r31"> {
- let Inst{12-6} = 0b1111111;
- let Inst{2} = 0b0;
- }
-
-// SA1_combinezr: Combines.
-let isCodeGenOnly = 1, hasSideEffects = 0 in
-def SA1_combinezr: SUBInst <
- (outs DoubleRegs:$Rdd),
- (ins IntRegs:$Rs),
- "$Rdd = combine(#0, $Rs)"> {
- bits<3> Rdd;
- bits<4> Rs;
-
- let Inst{12-10} = 0b111;
- let Inst{8} = 0b1;
- let Inst{3} = 0b0;
- let Inst{2-0} = Rdd;
- let Inst{7-4} = Rs;
- }
-
-// SL2_loadrh_io: Load half.
-let isCodeGenOnly = 1, mayLoad = 1, accessSize = HalfWordAccess, hasNewValue = 1, opNewValue = 0 in
-def SL2_loadrh_io: SUBInst <
- (outs IntRegs:$Rd),
- (ins IntRegs:$Rs, u3_1Imm:$u3_1),
- "$Rd = memh($Rs + #$u3_1)"> {
- bits<4> Rd;
- bits<4> Rs;
- bits<4> u3_1;
-
- let Inst{12-11} = 0b00;
- let Inst{3-0} = Rd;
- let Inst{7-4} = Rs;
- let Inst{10-8} = u3_1{3-1};
- }
-
-// SA1_addrx: Add.
-let isCodeGenOnly = 1, hasSideEffects = 0, hasNewValue = 1, opNewValue = 0 in
-def SA1_addrx: SUBInst <
- (outs IntRegs:$Rx),
- (ins IntRegs:$_src_, IntRegs:$Rs),
- "$Rx = add($_src_, $Rs)" ,
- [] ,
- "$_src_ = $Rx"> {
- bits<4> Rx;
- bits<4> Rs;
-
- let Inst{12-8} = 0b11000;
- let Inst{3-0} = Rx;
- let Inst{7-4} = Rs;
- }
-
-// SA1_setin1: Set to -1.
-let isCodeGenOnly = 1, hasSideEffects = 0, hasNewValue = 1, opNewValue = 0 in
-def SA1_setin1: SUBInst <
- (outs IntRegs:$Rd),
- (ins ),
- "$Rd = #{-1}"> {
- bits<4> Rd;
-
- let Inst{12-9} = 0b1101;
- let Inst{6} = 0b0;
- let Inst{3-0} = Rd;
- }
-
-// SA1_sxth: Sxth.
-let isCodeGenOnly = 1, hasSideEffects = 0, hasNewValue = 1, opNewValue = 0 in
-def SA1_sxth: SUBInst <
- (outs IntRegs:$Rd),
- (ins IntRegs:$Rs),
- "$Rd = sxth($Rs)"> {
- bits<4> Rd;
- bits<4> Rs;
-
- let Inst{12-8} = 0b10100;
- let Inst{3-0} = Rd;
- let Inst{7-4} = Rs;
- }
-
-// SA1_combine0i: Combines.
-let isCodeGenOnly = 1, hasSideEffects = 0 in
-def SA1_combine0i: SUBInst <
- (outs DoubleRegs:$Rdd),
- (ins u2_0Imm:$u2),
- "$Rdd = combine(#0, #$u2)"> {
- bits<3> Rdd;
- bits<2> u2;
-
- let Inst{12-10} = 0b111;
- let Inst{8} = 0b0;
- let Inst{4-3} = 0b00;
- let Inst{2-0} = Rdd;
- let Inst{6-5} = u2;
- }
-
-// SA1_combine2i: Combines.
-let isCodeGenOnly = 1, hasSideEffects = 0 in
-def SA1_combine2i: SUBInst <
- (outs DoubleRegs:$Rdd),
- (ins u2_0Imm:$u2),
- "$Rdd = combine(#2, #$u2)"> {
- bits<3> Rdd;
- bits<2> u2;
-
- let Inst{12-10} = 0b111;
- let Inst{8} = 0b0;
- let Inst{4-3} = 0b10;
- let Inst{2-0} = Rdd;
- let Inst{6-5} = u2;
- }
-
-// SA1_sxtb: Sxtb.
-let isCodeGenOnly = 1, hasSideEffects = 0, hasNewValue = 1, opNewValue = 0 in
-def SA1_sxtb: SUBInst <
- (outs IntRegs:$Rd),
- (ins IntRegs:$Rs),
- "$Rd = sxtb($Rs)"> {
- bits<4> Rd;
- bits<4> Rs;
-
- let Inst{12-8} = 0b10101;
- let Inst{3-0} = Rd;
- let Inst{7-4} = Rs;
- }
-
-// SA1_clrf: Clear if false.
-// SA1_clrf -> SA1_clrfnew
-let Uses = [P0], isCodeGenOnly = 1, isPredicated = 1, isPredicatedFalse = 1, hasSideEffects = 0, hasNewValue = 1, opNewValue = 0 in
-def SA1_clrf: SUBInst <
- (outs IntRegs:$Rd),
- (ins PredRegs:$Pu),
- "if (!$Pu) $Rd = #0"> {
- bits<4> Rd;
-
- let Inst{12-9} = 0b1101;
- let Inst{6-4} = 0b111;
- let Inst{3-0} = Rd;
- }
-
-// SL2_loadrb_io: Load byte.
-let isCodeGenOnly = 1, mayLoad = 1, accessSize = ByteAccess, hasNewValue = 1, opNewValue = 0 in
-def SL2_loadrb_io: SUBInst <
- (outs IntRegs:$Rd),
- (ins IntRegs:$Rs, u3_0Imm:$u3_0),
- "$Rd = memb($Rs + #$u3_0)"> {
- bits<4> Rd;
- bits<4> Rs;
- bits<3> u3_0;
-
- let Inst{12-11} = 0b10;
- let Inst{3-0} = Rd;
- let Inst{7-4} = Rs;
- let Inst{10-8} = u3_0;
- }
-
-// SA1_tfr: Tfr.
-let isCodeGenOnly = 1, hasSideEffects = 0, hasNewValue = 1, opNewValue = 0 in
-def SA1_tfr: SUBInst <
- (outs IntRegs:$Rd),
- (ins IntRegs:$Rs),
- "$Rd = $Rs"> {
- bits<4> Rd;
- bits<4> Rs;
-
- let Inst{12-8} = 0b10000;
- let Inst{3-0} = Rd;
- let Inst{7-4} = Rs;
- }
-
-// SL2_loadrd_sp: Load dword.
-let Uses = [R29], isCodeGenOnly = 1, mayLoad = 1, accessSize = DoubleWordAccess in
-def SL2_loadrd_sp: SUBInst <
- (outs DoubleRegs:$Rdd),
- (ins u5_3Imm:$u5_3),
- "$Rdd = memd(r29 + #$u5_3)"> {
- bits<3> Rdd;
- bits<8> u5_3;
-
- let Inst{12-8} = 0b11110;
- let Inst{2-0} = Rdd;
- let Inst{7-3} = u5_3{7-3};
- }
-
-// SA1_and1: And #1.
-let isCodeGenOnly = 1, hasSideEffects = 0, hasNewValue = 1, opNewValue = 0 in
-def SA1_and1: SUBInst <
- (outs IntRegs:$Rd),
- (ins IntRegs:$Rs),
- "$Rd = and($Rs, #1)"> {
- bits<4> Rd;
- bits<4> Rs;
-
- let Inst{12-8} = 0b10010;
- let Inst{3-0} = Rd;
- let Inst{7-4} = Rs;
- }
-
-// SS2_storebi1: Store byte.
-let isCodeGenOnly = 1, mayStore = 1, accessSize = ByteAccess in
-def SS2_storebi1: SUBInst <
- (outs ),
- (ins IntRegs:$Rs, u4_0Imm:$u4_0),
- "memb($Rs + #$u4_0)=#1"> {
- bits<4> Rs;
- bits<4> u4_0;
-
- let Inst{12-8} = 0b10011;
- let Inst{7-4} = Rs;
- let Inst{3-0} = u4_0;
- }
-
-// SA1_inc: Inc.
-let isCodeGenOnly = 1, hasSideEffects = 0, hasNewValue = 1, opNewValue = 0 in
-def SA1_inc: SUBInst <
- (outs IntRegs:$Rd),
- (ins IntRegs:$Rs),
- "$Rd = add($Rs, #1)"> {
- bits<4> Rd;
- bits<4> Rs;
-
- let Inst{12-8} = 0b10001;
- let Inst{3-0} = Rd;
- let Inst{7-4} = Rs;
- }
-
-// SS2_stored_sp: Store dword.
-let Uses = [R29], isCodeGenOnly = 1, mayStore = 1, accessSize = DoubleWordAccess in
-def SS2_stored_sp: SUBInst <
- (outs ),
- (ins s6_3Imm:$s6_3, DoubleRegs:$Rtt),
- "memd(r29 + #$s6_3) = $Rtt"> {
- bits<9> s6_3;
- bits<3> Rtt;
-
- let Inst{12-9} = 0b0101;
- let Inst{8-3} = s6_3{8-3};
- let Inst{2-0} = Rtt;
- }
-
-// SS2_storew_sp: Store word.
-let Uses = [R29], isCodeGenOnly = 1, mayStore = 1, accessSize = WordAccess in
-def SS2_storew_sp: SUBInst <
- (outs ),
- (ins u5_2Imm:$u5_2, IntRegs:$Rt),
- "memw(r29 + #$u5_2) = $Rt"> {
- bits<7> u5_2;
- bits<4> Rt;
-
- let Inst{12-9} = 0b0100;
- let Inst{8-4} = u5_2{6-2};
- let Inst{3-0} = Rt;
- }
-
-// SL2_jumpr31_fnew: Indirect conditional jump if false.
-let Defs = [PC], Uses = [P0, R31], isCodeGenOnly = 1, isPredicated = 1, isPredicatedFalse = 1, isPredicatedNew = 1, isBranch = 1, isIndirectBranch = 1, hasSideEffects = 0 in
-def SL2_jumpr31_fnew: SUBInst <
- (outs ),
- (ins ),
- "if (!p0.new) jumpr:nt r31"> {
- let Inst{12-6} = 0b1111111;
- let Inst{2-0} = 0b111;
- }
-
-// SA1_clrt: Clear if true.
-// SA1_clrt -> SA1_clrtnew
-let Uses = [P0], isCodeGenOnly = 1, isPredicated = 1, hasSideEffects = 0, hasNewValue = 1, opNewValue = 0 in
-def SA1_clrt: SUBInst <
- (outs IntRegs:$Rd),
- (ins PredRegs:$Pu),
- "if ($Pu) $Rd = #0"> {
- bits<4> Rd;
-
- let Inst{12-9} = 0b1101;
- let Inst{6-4} = 0b110;
- let Inst{3-0} = Rd;
- }
-
-// SL2_return: Deallocate stack frame and return.
-let Defs = [PC, R31, R29, R30], Uses = [R30], isCodeGenOnly = 1, mayLoad = 1, accessSize = DoubleWordAccess, isBranch = 1, isIndirectBranch = 1 in
-def SL2_return: SUBInst <
- (outs ),
- (ins ),
- "dealloc_return"> {
- let Inst{12-6} = 0b1111101;
- let Inst{2} = 0b0;
- }
-
-// SA1_dec: Dec.
-let isCodeGenOnly = 1, hasSideEffects = 0, hasNewValue = 1, opNewValue = 0 in
-def SA1_dec: SUBInst <
- (outs IntRegs:$Rd),
- (ins IntRegs:$Rs),
- "$Rd = add($Rs,#{-1})"> {
- bits<4> Rd;
- bits<4> Rs;
-
- let Inst{12-8} = 0b10011;
- let Inst{3-0} = Rd;
- let Inst{7-4} = Rs;
- }
-
-// SA1_seti: Set immed.
-let isCodeGenOnly = 1, hasSideEffects = 0, hasNewValue = 1, opNewValue = 0, isExtendable = 1, isExtentSigned = 0, opExtentBits = 6, opExtendable = 1 in
-def SA1_seti: SUBInst <
- (outs IntRegs:$Rd),
- (ins u6_0Ext:$u6),
- "$Rd = #$u6"> {
- bits<4> Rd;
- bits<6> u6;
-
- let Inst{12-10} = 0b010;
- let Inst{3-0} = Rd;
- let Inst{9-4} = u6;
- }
-
-// SL2_jumpr31_t: Indirect conditional jump if true.
-// SL2_jumpr31_t -> SL2_jumpr31_tnew
-let Defs = [PC], Uses = [P0, R31], isCodeGenOnly = 1, isPredicated = 1, isBranch = 1, isIndirectBranch = 1, hasSideEffects = 0 in
-def SL2_jumpr31_t: SUBInst <
- (outs ),
- (ins ),
- "if (p0) jumpr r31"> {
- let Inst{12-6} = 0b1111111;
- let Inst{2-0} = 0b100;
- }
-
-// SA1_clrfnew: Clear if false.
-let Uses = [P0], isCodeGenOnly = 1, isPredicated = 1, isPredicatedFalse = 1, isPredicatedNew = 1, hasSideEffects = 0, hasNewValue = 1, opNewValue = 0 in
-def SA1_clrfnew: SUBInst <
- (outs IntRegs:$Rd),
- (ins PredRegs:$Pu),
- "if (!$Pu.new) $Rd = #0"> {
- bits<4> Rd;
-
- let Inst{12-9} = 0b1101;
- let Inst{6-4} = 0b101;
- let Inst{3-0} = Rd;
- }
-
-// SS1_storew_io: Store word.
-let isCodeGenOnly = 1, mayStore = 1, accessSize = WordAccess in
-def SS1_storew_io: SUBInst <
- (outs ),
- (ins IntRegs:$Rs, u4_2Imm:$u4_2, IntRegs:$Rt),
- "memw($Rs + #$u4_2) = $Rt"> {
- bits<4> Rs;
- bits<6> u4_2;
- bits<4> Rt;
-
- let Inst{12} = 0b0;
- let Inst{7-4} = Rs;
- let Inst{11-8} = u4_2{5-2};
- let Inst{3-0} = Rt;
- }
-
-// SA1_zxtb: Zxtb.
-let isCodeGenOnly = 1, hasSideEffects = 0, hasNewValue = 1, opNewValue = 0 in
-def SA1_zxtb: SUBInst <
- (outs IntRegs:$Rd),
- (ins IntRegs:$Rs),
- "$Rd = and($Rs, #255)"> {
- bits<4> Rd;
- bits<4> Rs;
-
- let Inst{12-8} = 0b10111;
- let Inst{3-0} = Rd;
- let Inst{7-4} = Rs;
- }
-
-// SA1_addsp: Add.
-let Uses = [R29], isCodeGenOnly = 1, hasSideEffects = 0, hasNewValue = 1, opNewValue = 0 in
-def SA1_addsp: SUBInst <
- (outs IntRegs:$Rd),
- (ins u6_2Imm:$u6_2),
- "$Rd = add(r29, #$u6_2)"> {
- bits<4> Rd;
- bits<8> u6_2;
-
- let Inst{12-10} = 0b011;
- let Inst{3-0} = Rd;
- let Inst{9-4} = u6_2{7-2};
- }
-
-// SL2_loadri_sp: Load word.
-let Uses = [R29], isCodeGenOnly = 1, mayLoad = 1, accessSize = WordAccess, hasNewValue = 1, opNewValue = 0 in
-def SL2_loadri_sp: SUBInst <
- (outs IntRegs:$Rd),
- (ins u5_2Imm:$u5_2),
- "$Rd = memw(r29 + #$u5_2)"> {
- bits<4> Rd;
- bits<7> u5_2;
-
- let Inst{12-9} = 0b1110;
- let Inst{3-0} = Rd;
- let Inst{8-4} = u5_2{6-2};
- }
-
-// SS1_storeb_io: Store byte.
-let isCodeGenOnly = 1, mayStore = 1, accessSize = ByteAccess in
-def SS1_storeb_io: SUBInst <
- (outs ),
- (ins IntRegs:$Rs, u4_0Imm:$u4_0, IntRegs:$Rt),
- "memb($Rs + #$u4_0) = $Rt"> {
- bits<4> Rs;
- bits<4> u4_0;
- bits<4> Rt;
-
- let Inst{12} = 0b1;
- let Inst{7-4} = Rs;
- let Inst{11-8} = u4_0;
- let Inst{3-0} = Rt;
- }
-
-// SL2_return_tnew: Deallocate stack frame and return.
-let Defs = [PC, R31, R29, R30], Uses = [R30, P0], isCodeGenOnly = 1, isPredicated = 1, isPredicatedNew = 1, mayLoad = 1, accessSize = DoubleWordAccess, isBranch = 1, isIndirectBranch = 1 in
-def SL2_return_tnew: SUBInst <
- (outs ),
- (ins ),
- "if (p0.new) dealloc_return:nt"> {
- let Inst{12-6} = 0b1111101;
- let Inst{2-0} = 0b110;
- }
-
-// SL2_return_fnew: Deallocate stack frame and return.
-let Defs = [PC, R31, R29, R30], Uses = [R30, P0], isCodeGenOnly = 1, isPredicated = 1, isPredicatedFalse = 1, isPredicatedNew = 1, mayLoad = 1, accessSize = DoubleWordAccess, isBranch = 1, isIndirectBranch = 1 in
-def SL2_return_fnew: SUBInst <
- (outs ),
- (ins ),
- "if (!p0.new) dealloc_return:nt"> {
- let Inst{12-6} = 0b1111101;
- let Inst{2-0} = 0b111;
- }
-
-// SA1_zxth: Zxth.
-let isCodeGenOnly = 1, hasSideEffects = 0, hasNewValue = 1, opNewValue = 0 in
-def SA1_zxth: SUBInst <
- (outs IntRegs:$Rd),
- (ins IntRegs:$Rs),
- "$Rd = zxth($Rs)"> {
- bits<4> Rd;
- bits<4> Rs;
-
- let Inst{12-8} = 0b10110;
- let Inst{3-0} = Rd;
- let Inst{7-4} = Rs;
- }
-
diff --git a/lib/Target/Hexagon/HexagonLoopIdiomRecognition.cpp b/lib/Target/Hexagon/HexagonLoopIdiomRecognition.cpp
new file mode 100644
index 000000000000..b5948475e1f7
--- /dev/null
+++ b/lib/Target/Hexagon/HexagonLoopIdiomRecognition.cpp
@@ -0,0 +1,2338 @@
+//===--- HexagonLoopIdiomRecognition.cpp ----------------------------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#define DEBUG_TYPE "hexagon-lir"
+
+#include "llvm/ADT/SetVector.h"
+#include "llvm/ADT/SmallSet.h"
+#include "llvm/Analysis/AliasAnalysis.h"
+#include "llvm/Analysis/InstructionSimplify.h"
+#include "llvm/Analysis/LoopPass.h"
+#include "llvm/Analysis/ScalarEvolution.h"
+#include "llvm/Analysis/ScalarEvolutionExpander.h"
+#include "llvm/Analysis/ScalarEvolutionExpressions.h"
+#include "llvm/Analysis/TargetLibraryInfo.h"
+#include "llvm/Analysis/ValueTracking.h"
+#include "llvm/IR/DataLayout.h"
+#include "llvm/IR/Dominators.h"
+#include "llvm/IR/IRBuilder.h"
+#include "llvm/IR/PatternMatch.h"
+#include "llvm/Transforms/Scalar.h"
+#include "llvm/Transforms/Utils/Local.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/raw_ostream.h"
+
+#include <algorithm>
+#include <array>
+#include <deque>
+#include <functional>
+#include <map>
+#include <set>
+#include <utility>
+#include <vector>
+
+using namespace llvm;
+
+static cl::opt<bool> DisableMemcpyIdiom("disable-memcpy-idiom",
+ cl::Hidden, cl::init(false),
+ cl::desc("Disable generation of memcpy in loop idiom recognition"));
+
+static cl::opt<bool> DisableMemmoveIdiom("disable-memmove-idiom",
+ cl::Hidden, cl::init(false),
+ cl::desc("Disable generation of memmove in loop idiom recognition"));
+
+static cl::opt<unsigned> RuntimeMemSizeThreshold("runtime-mem-idiom-threshold",
+ cl::Hidden, cl::init(0), cl::desc("Threshold (in bytes) for the runtime "
+ "check guarding the memmove."));
+
+static cl::opt<unsigned> CompileTimeMemSizeThreshold(
+ "compile-time-mem-idiom-threshold", cl::Hidden, cl::init(64),
+ cl::desc("Threshold (in bytes) to perform the transformation, if the "
+ "runtime loop count (mem transfer size) is known at compile-time."));
+
+static cl::opt<bool> OnlyNonNestedMemmove("only-nonnested-memmove-idiom",
+ cl::Hidden, cl::init(true),
+ cl::desc("Only enable generating memmove in non-nested loops"));
+
+cl::opt<bool> HexagonVolatileMemcpy("disable-hexagon-volatile-memcpy",
+ cl::Hidden, cl::init(false),
+ cl::desc("Enable Hexagon-specific memcpy for volatile destination."));
+
+static const char *HexagonVolatileMemcpyName
+ = "hexagon_memcpy_forward_vp4cp4n2";
+
+
+namespace llvm {
+ void initializeHexagonLoopIdiomRecognizePass(PassRegistry&);
+ Pass *createHexagonLoopIdiomPass();
+}
+
+namespace {
+ class HexagonLoopIdiomRecognize : public LoopPass {
+ public:
+ static char ID;
+ explicit HexagonLoopIdiomRecognize() : LoopPass(ID) {
+ initializeHexagonLoopIdiomRecognizePass(*PassRegistry::getPassRegistry());
+ }
+ StringRef getPassName() const override {
+ return "Recognize Hexagon-specific loop idioms";
+ }
+
+ void getAnalysisUsage(AnalysisUsage &AU) const override {
+ AU.addRequired<LoopInfoWrapperPass>();
+ AU.addRequiredID(LoopSimplifyID);
+ AU.addRequiredID(LCSSAID);
+ AU.addRequired<AAResultsWrapperPass>();
+ AU.addPreserved<AAResultsWrapperPass>();
+ AU.addRequired<ScalarEvolutionWrapperPass>();
+ AU.addRequired<DominatorTreeWrapperPass>();
+ AU.addRequired<TargetLibraryInfoWrapperPass>();
+ AU.addPreserved<TargetLibraryInfoWrapperPass>();
+ }
+
+ bool runOnLoop(Loop *L, LPPassManager &LPM) override;
+
+ private:
+ unsigned getStoreSizeInBytes(StoreInst *SI);
+ int getSCEVStride(const SCEVAddRecExpr *StoreEv);
+ bool isLegalStore(Loop *CurLoop, StoreInst *SI);
+ void collectStores(Loop *CurLoop, BasicBlock *BB,
+ SmallVectorImpl<StoreInst*> &Stores);
+ bool processCopyingStore(Loop *CurLoop, StoreInst *SI, const SCEV *BECount);
+ bool coverLoop(Loop *L, SmallVectorImpl<Instruction*> &Insts) const;
+ bool runOnLoopBlock(Loop *CurLoop, BasicBlock *BB, const SCEV *BECount,
+ SmallVectorImpl<BasicBlock*> &ExitBlocks);
+ bool runOnCountableLoop(Loop *L);
+
+ AliasAnalysis *AA;
+ const DataLayout *DL;
+ DominatorTree *DT;
+ LoopInfo *LF;
+ const TargetLibraryInfo *TLI;
+ ScalarEvolution *SE;
+ bool HasMemcpy, HasMemmove;
+ };
+}
+
+char HexagonLoopIdiomRecognize::ID = 0;
+
+INITIALIZE_PASS_BEGIN(HexagonLoopIdiomRecognize, "hexagon-loop-idiom",
+ "Recognize Hexagon-specific loop idioms", false, false)
+INITIALIZE_PASS_DEPENDENCY(LoopInfoWrapperPass)
+INITIALIZE_PASS_DEPENDENCY(LoopSimplify)
+INITIALIZE_PASS_DEPENDENCY(LCSSAWrapperPass)
+INITIALIZE_PASS_DEPENDENCY(ScalarEvolutionWrapperPass)
+INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass)
+INITIALIZE_PASS_DEPENDENCY(TargetLibraryInfoWrapperPass)
+INITIALIZE_PASS_DEPENDENCY(AAResultsWrapperPass)
+INITIALIZE_PASS_END(HexagonLoopIdiomRecognize, "hexagon-loop-idiom",
+ "Recognize Hexagon-specific loop idioms", false, false)
+
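The factory declared in the llvm namespace above is conventionally a one-liner; a minimal sketch, assuming the usual legacy-pass pattern (the real definition is expected to appear later in this file):

    // Hedged sketch of the conventional factory for the pass registered above.
    Pass *llvm::createHexagonLoopIdiomPass() {
      return new HexagonLoopIdiomRecognize();
    }
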
+
+namespace {
+ struct Simplifier {
+ typedef std::function<Value* (Instruction*, LLVMContext&)> Rule;
+
+ void addRule(const Rule &R) { Rules.push_back(R); }
+
+ private:
+ struct WorkListType {
+ WorkListType() = default;
+
+ void push_back(Value* V) {
+ // Do not push back duplicates.
+ if (!S.count(V)) { Q.push_back(V); S.insert(V); }
+ }
+ Value *pop_front_val() {
+ Value *V = Q.front(); Q.pop_front(); S.erase(V);
+ return V;
+ }
+ bool empty() const { return Q.empty(); }
+
+ private:
+ std::deque<Value*> Q;
+ std::set<Value*> S;
+ };
+
+ typedef std::set<Value*> ValueSetType;
+ std::vector<Rule> Rules;
+
+ public:
+ struct Context {
+ typedef DenseMap<Value*,Value*> ValueMapType;
+
+ Value *Root;
+ ValueSetType Used; // The set of all cloned values used by Root.
+ ValueSetType Clones; // The set of all cloned values.
+ LLVMContext &Ctx;
+
+ Context(Instruction *Exp)
+ : Ctx(Exp->getParent()->getParent()->getContext()) {
+ initialize(Exp);
+ }
+ ~Context() { cleanup(); }
+ void print(raw_ostream &OS, const Value *V) const;
+
+ Value *materialize(BasicBlock *B, BasicBlock::iterator At);
+
+ private:
+ void initialize(Instruction *Exp);
+ void cleanup();
+
+ template <typename FuncT> void traverse(Value *V, FuncT F);
+ void record(Value *V);
+ void use(Value *V);
+ void unuse(Value *V);
+
+ bool equal(const Instruction *I, const Instruction *J) const;
+ Value *find(Value *Tree, Value *Sub) const;
+ Value *subst(Value *Tree, Value *OldV, Value *NewV);
+ void replace(Value *OldV, Value *NewV);
+ void link(Instruction *I, BasicBlock *B, BasicBlock::iterator At);
+
+ friend struct Simplifier;
+ };
+
+ Value *simplify(Context &C);
+ };
+
+ struct PE {
+ PE(const Simplifier::Context &c, Value *v = nullptr) : C(c), V(v) {}
+ const Simplifier::Context &C;
+ const Value *V;
+ };
+
+ raw_ostream &operator<< (raw_ostream &OS, const PE &P) LLVM_ATTRIBUTE_USED;
+ raw_ostream &operator<< (raw_ostream &OS, const PE &P) {
+ P.C.print(OS, P.V ? P.V : P.C.Root);
+ return OS;
+ }
+}
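
A Rule is just a callable of type Value* (Instruction*, LLVMContext&) that returns a replacement value, or nullptr when it does not apply. A minimal sketch of registering one, relying on the includes already present in this file; the rule shown (folding `xor X, 0` to `X`) is illustrative only and is not one of the rules actually installed by setupSimplifier:

    // Illustrative only: fold `xor X, 0` (either operand order) to X.
    Simplifier S;
    S.addRule([](Instruction *I, LLVMContext &) -> Value* {
      using namespace llvm::PatternMatch;
      Value *X = nullptr;
      if (match(I, m_Xor(m_Value(X), m_Zero())) ||
          match(I, m_Xor(m_Zero(), m_Value(X))))
        return X;
      return nullptr;  // rule did not fire
    });
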
+
+
+template <typename FuncT>
+void Simplifier::Context::traverse(Value *V, FuncT F) {
+ WorkListType Q;
+ Q.push_back(V);
+
+ while (!Q.empty()) {
+ Instruction *U = dyn_cast<Instruction>(Q.pop_front_val());
+ if (!U || U->getParent())
+ continue;
+ if (!F(U))
+ continue;
+ for (Value *Op : U->operands())
+ Q.push_back(Op);
+ }
+}
+
+
+void Simplifier::Context::print(raw_ostream &OS, const Value *V) const {
+ const auto *U = dyn_cast<const Instruction>(V);
+ if (!U) {
+ OS << V << '(' << *V << ')';
+ return;
+ }
+
+ if (U->getParent()) {
+ OS << U << '(';
+ U->printAsOperand(OS, true);
+ OS << ')';
+ return;
+ }
+
+ unsigned N = U->getNumOperands();
+ if (N != 0)
+ OS << U << '(';
+ OS << U->getOpcodeName();
+ for (const Value *Op : U->operands()) {
+ OS << ' ';
+ print(OS, Op);
+ }
+ if (N != 0)
+ OS << ')';
+}
+
+
+void Simplifier::Context::initialize(Instruction *Exp) {
+ // Perform a deep clone of the expression, set Root to the root
+ // of the clone, and build a map from the cloned values to the
+ // original ones.
+ ValueMapType M;
+ BasicBlock *Block = Exp->getParent();
+ WorkListType Q;
+ Q.push_back(Exp);
+
+ while (!Q.empty()) {
+ Value *V = Q.pop_front_val();
+ if (M.find(V) != M.end())
+ continue;
+ if (Instruction *U = dyn_cast<Instruction>(V)) {
+ if (isa<PHINode>(U) || U->getParent() != Block)
+ continue;
+ for (Value *Op : U->operands())
+ Q.push_back(Op);
+ M.insert({U, U->clone()});
+ }
+ }
+
+ for (std::pair<Value*,Value*> P : M) {
+ Instruction *U = cast<Instruction>(P.second);
+ for (unsigned i = 0, n = U->getNumOperands(); i != n; ++i) {
+ auto F = M.find(U->getOperand(i));
+ if (F != M.end())
+ U->setOperand(i, F->second);
+ }
+ }
+
+ auto R = M.find(Exp);
+ assert(R != M.end());
+ Root = R->second;
+
+ record(Root);
+ use(Root);
+}
+
+
+void Simplifier::Context::record(Value *V) {
+ auto Record = [this](Instruction *U) -> bool {
+ Clones.insert(U);
+ return true;
+ };
+ traverse(V, Record);
+}
+
+
+void Simplifier::Context::use(Value *V) {
+ auto Use = [this](Instruction *U) -> bool {
+ Used.insert(U);
+ return true;
+ };
+ traverse(V, Use);
+}
+
+
+void Simplifier::Context::unuse(Value *V) {
+ if (!isa<Instruction>(V) || cast<Instruction>(V)->getParent() != nullptr)
+ return;
+
+ auto Unuse = [this](Instruction *U) -> bool {
+ if (!U->use_empty())
+ return false;
+ Used.erase(U);
+ return true;
+ };
+ traverse(V, Unuse);
+}
+
+
+Value *Simplifier::Context::subst(Value *Tree, Value *OldV, Value *NewV) {
+ if (Tree == OldV)
+ return NewV;
+ if (OldV == NewV)
+ return Tree;
+
+ WorkListType Q;
+ Q.push_back(Tree);
+ while (!Q.empty()) {
+ Instruction *U = dyn_cast<Instruction>(Q.pop_front_val());
+ // If U is not an instruction, or it's not a clone, skip it.
+ if (!U || U->getParent())
+ continue;
+ for (unsigned i = 0, n = U->getNumOperands(); i != n; ++i) {
+ Value *Op = U->getOperand(i);
+ if (Op == OldV) {
+ U->setOperand(i, NewV);
+ unuse(OldV);
+ } else {
+ Q.push_back(Op);
+ }
+ }
+ }
+ return Tree;
+}
+
+
+void Simplifier::Context::replace(Value *OldV, Value *NewV) {
+ if (Root == OldV) {
+ Root = NewV;
+ use(Root);
+ return;
+ }
+
+ // NewV may be a complex tree that has just been created by one of the
+ // transformation rules. We need to make sure that it is commoned with
+ // the existing Root to the maximum extent possible.
+ // Identify all subtrees of NewV (including NewV itself) that have
+ // equivalent counterparts in Root, and replace those subtrees with
+ // these counterparts.
+ WorkListType Q;
+ Q.push_back(NewV);
+ while (!Q.empty()) {
+ Value *V = Q.pop_front_val();
+ Instruction *U = dyn_cast<Instruction>(V);
+ if (!U || U->getParent())
+ continue;
+ if (Value *DupV = find(Root, V)) {
+ if (DupV != V)
+ NewV = subst(NewV, V, DupV);
+ } else {
+ for (Value *Op : U->operands())
+ Q.push_back(Op);
+ }
+ }
+
+ // Now, simply replace OldV with NewV in Root.
+ Root = subst(Root, OldV, NewV);
+ use(Root);
+}
+
+
+void Simplifier::Context::cleanup() {
+ for (Value *V : Clones) {
+ Instruction *U = cast<Instruction>(V);
+ if (!U->getParent())
+ U->dropAllReferences();
+ }
+
+ for (Value *V : Clones) {
+ Instruction *U = cast<Instruction>(V);
+ if (!U->getParent())
+ delete U;
+ }
+}
+
+
+bool Simplifier::Context::equal(const Instruction *I,
+ const Instruction *J) const {
+ if (I == J)
+ return true;
+ if (!I->isSameOperationAs(J))
+ return false;
+ if (isa<PHINode>(I))
+ return I->isIdenticalTo(J);
+
+ for (unsigned i = 0, n = I->getNumOperands(); i != n; ++i) {
+ Value *OpI = I->getOperand(i), *OpJ = J->getOperand(i);
+ if (OpI == OpJ)
+ continue;
+ auto *InI = dyn_cast<const Instruction>(OpI);
+ auto *InJ = dyn_cast<const Instruction>(OpJ);
+ if (InI && InJ) {
+ if (!equal(InI, InJ))
+ return false;
+ } else if (InI != InJ || !InI)
+ return false;
+ }
+ return true;
+}
+
+
+Value *Simplifier::Context::find(Value *Tree, Value *Sub) const {
+ Instruction *SubI = dyn_cast<Instruction>(Sub);
+ WorkListType Q;
+ Q.push_back(Tree);
+
+ while (!Q.empty()) {
+ Value *V = Q.pop_front_val();
+ if (V == Sub)
+ return V;
+ Instruction *U = dyn_cast<Instruction>(V);
+ if (!U || U->getParent())
+ continue;
+ if (SubI && equal(SubI, U))
+ return U;
+ assert(!isa<PHINode>(U));
+ for (Value *Op : U->operands())
+ Q.push_back(Op);
+ }
+ return nullptr;
+}
+
+
+void Simplifier::Context::link(Instruction *I, BasicBlock *B,
+ BasicBlock::iterator At) {
+ if (I->getParent())
+ return;
+
+ for (Value *Op : I->operands()) {
+ if (Instruction *OpI = dyn_cast<Instruction>(Op))
+ link(OpI, B, At);
+ }
+
+ B->getInstList().insert(At, I);
+}
+
+
+Value *Simplifier::Context::materialize(BasicBlock *B,
+ BasicBlock::iterator At) {
+ if (Instruction *RootI = dyn_cast<Instruction>(Root))
+ link(RootI, B, At);
+ return Root;
+}
+
+
+Value *Simplifier::simplify(Context &C) {
+ WorkListType Q;
+ Q.push_back(C.Root);
+ unsigned Count = 0;
+ const unsigned Limit = 100000;
+
+ while (!Q.empty()) {
+ if (Count++ >= Limit)
+ break;
+ Instruction *U = dyn_cast<Instruction>(Q.pop_front_val());
+ if (!U || U->getParent() || !C.Used.count(U))
+ continue;
+ bool Changed = false;
+ for (Rule &R : Rules) {
+ Value *W = R(U, C.Ctx);
+ if (!W)
+ continue;
+ Changed = true;
+ C.record(W);
+ C.replace(U, W);
+ Q.push_back(C.Root);
+ break;
+ }
+ if (!Changed) {
+ for (Value *Op : U->operands())
+ Q.push_back(Op);
+ }
+ }
+ assert(Count < Limit && "Infinite loop in HLIR/simplify?");
+ return C.Root;
+}
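
Putting the pieces together, the intended calling sequence is: clone the expression into a Context, run the rules to a fixed point, then splice the simplified clone into a block. A hedged sketch, with `S`, `RootInst`, `BB`, and `InsertPt` assumed to be in scope:

    Simplifier::Context C(RootInst);         // deep-clones the expression tree
    Value *NewRoot = S.simplify(C);          // rewrites clones until no rule fires
    Value *V = C.materialize(BB, InsertPt);  // links unparented clones into BB
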
+
+
+//===----------------------------------------------------------------------===//
+//
+// Implementation of PolynomialMultiplyRecognize
+//
+//===----------------------------------------------------------------------===//
+
+namespace {
+ class PolynomialMultiplyRecognize {
+ public:
+ explicit PolynomialMultiplyRecognize(Loop *loop, const DataLayout &dl,
+ const DominatorTree &dt, const TargetLibraryInfo &tli,
+ ScalarEvolution &se)
+ : CurLoop(loop), DL(dl), DT(dt), TLI(tli), SE(se) {}
+
+ bool recognize();
+ private:
+ typedef SetVector<Value*> ValueSeq;
+
+ IntegerType *getPmpyType() const {
+ LLVMContext &Ctx = CurLoop->getHeader()->getParent()->getContext();
+ return IntegerType::get(Ctx, 32);
+ }
+ bool isPromotableTo(Value *V, IntegerType *Ty);
+ void promoteTo(Instruction *In, IntegerType *DestTy, BasicBlock *LoopB);
+ bool promoteTypes(BasicBlock *LoopB, BasicBlock *ExitB);
+
+ Value *getCountIV(BasicBlock *BB);
+ bool findCycle(Value *Out, Value *In, ValueSeq &Cycle);
+ void classifyCycle(Instruction *DivI, ValueSeq &Cycle, ValueSeq &Early,
+ ValueSeq &Late);
+ bool classifyInst(Instruction *UseI, ValueSeq &Early, ValueSeq &Late);
+ bool commutesWithShift(Instruction *I);
+ bool highBitsAreZero(Value *V, unsigned IterCount);
+ bool keepsHighBitsZero(Value *V, unsigned IterCount);
+ bool isOperandShifted(Instruction *I, Value *Op);
+ bool convertShiftsToLeft(BasicBlock *LoopB, BasicBlock *ExitB,
+ unsigned IterCount);
+ void cleanupLoopBody(BasicBlock *LoopB);
+
+ struct ParsedValues {
+ ParsedValues() : M(nullptr), P(nullptr), Q(nullptr), R(nullptr),
+ X(nullptr), Res(nullptr), IterCount(0), Left(false), Inv(false) {}
+ Value *M, *P, *Q, *R, *X;
+ Instruction *Res;
+ unsigned IterCount;
+ bool Left, Inv;
+ };
+
+ bool matchLeftShift(SelectInst *SelI, Value *CIV, ParsedValues &PV);
+ bool matchRightShift(SelectInst *SelI, ParsedValues &PV);
+ bool scanSelect(SelectInst *SI, BasicBlock *LoopB, BasicBlock *PrehB,
+ Value *CIV, ParsedValues &PV, bool PreScan);
+ unsigned getInverseMxN(unsigned QP);
+ Value *generate(BasicBlock::iterator At, ParsedValues &PV);
+
+ void setupSimplifier();
+
+ Simplifier Simp;
+ Loop *CurLoop;
+ const DataLayout &DL;
+ const DominatorTree &DT;
+ const TargetLibraryInfo &TLI;
+ ScalarEvolution &SE;
+ };
+}
+
+
+Value *PolynomialMultiplyRecognize::getCountIV(BasicBlock *BB) {
+ pred_iterator PI = pred_begin(BB), PE = pred_end(BB);
+ if (std::distance(PI, PE) != 2)
+ return nullptr;
+ BasicBlock *PB = (*PI == BB) ? *std::next(PI) : *PI;
+
+ for (auto I = BB->begin(), E = BB->end(); I != E && isa<PHINode>(I); ++I) {
+ auto *PN = cast<PHINode>(I);
+ Value *InitV = PN->getIncomingValueForBlock(PB);
+ if (!isa<ConstantInt>(InitV) || !cast<ConstantInt>(InitV)->isZero())
+ continue;
+ Value *IterV = PN->getIncomingValueForBlock(BB);
+ if (!isa<BinaryOperator>(IterV))
+ continue;
+ auto *BO = dyn_cast<BinaryOperator>(IterV);
+ if (BO->getOpcode() != Instruction::Add)
+ continue;
+ Value *IncV = nullptr;
+ if (BO->getOperand(0) == PN)
+ IncV = BO->getOperand(1);
+ else if (BO->getOperand(1) == PN)
+ IncV = BO->getOperand(0);
+ if (IncV == nullptr)
+ continue;
+
+ if (auto *T = dyn_cast<ConstantInt>(IncV))
+ if (T->getZExtValue() == 1)
+ return PN;
+ }
+ return nullptr;
+}
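
In source terms, the only loop-counter shape getCountIV accepts corresponds to the canonical counting loop; a hand-written illustration, not code from this patch:

    // The recognized IV: a PHI entering at 0 and stepping by 1 per iteration.
    void shape(unsigned N) {
      for (unsigned i = 0; i != N; ++i) {
        // loop body containing the select pattern matched below
      }
    }
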
+
+
+static void replaceAllUsesOfWithIn(Value *I, Value *J, BasicBlock *BB) {
+ for (auto UI = I->user_begin(), UE = I->user_end(); UI != UE;) {
+ Use &TheUse = UI.getUse();
+ ++UI;
+ if (auto *II = dyn_cast<Instruction>(TheUse.getUser()))
+ if (BB == II->getParent())
+ II->replaceUsesOfWith(I, J);
+ }
+}
+
+
+bool PolynomialMultiplyRecognize::matchLeftShift(SelectInst *SelI,
+ Value *CIV, ParsedValues &PV) {
+ // Match the following:
+ // select (X & (1 << i)) != 0 ? R ^ (Q << i) : R
+ // select (X & (1 << i)) == 0 ? R : R ^ (Q << i)
+  // The condition may also check for equality with the masked value, i.e.
+ // select (X & (1 << i)) == (1 << i) ? R ^ (Q << i) : R
+ // select (X & (1 << i)) != (1 << i) ? R : R ^ (Q << i);
+
+ Value *CondV = SelI->getCondition();
+ Value *TrueV = SelI->getTrueValue();
+ Value *FalseV = SelI->getFalseValue();
+
+ using namespace PatternMatch;
+
+ CmpInst::Predicate P;
+ Value *A = nullptr, *B = nullptr, *C = nullptr;
+
+ if (!match(CondV, m_ICmp(P, m_And(m_Value(A), m_Value(B)), m_Value(C))) &&
+ !match(CondV, m_ICmp(P, m_Value(C), m_And(m_Value(A), m_Value(B)))))
+ return false;
+ if (P != CmpInst::ICMP_EQ && P != CmpInst::ICMP_NE)
+ return false;
+ // Matched: select (A & B) == C ? ... : ...
+ // select (A & B) != C ? ... : ...
+
+ Value *X = nullptr, *Sh1 = nullptr;
+ // Check (A & B) for (X & (1 << i)):
+ if (match(A, m_Shl(m_One(), m_Specific(CIV)))) {
+ Sh1 = A;
+ X = B;
+ } else if (match(B, m_Shl(m_One(), m_Specific(CIV)))) {
+ Sh1 = B;
+ X = A;
+ } else {
+    // TODO: Could also check for an induction variable containing a single
+    // bit shifted left by 1 in each iteration.
+ return false;
+ }
+
+ bool TrueIfZero;
+
+ // Check C against the possible values for comparison: 0 and (1 << i):
+ if (match(C, m_Zero()))
+ TrueIfZero = (P == CmpInst::ICMP_EQ);
+ else if (C == Sh1)
+ TrueIfZero = (P == CmpInst::ICMP_NE);
+ else
+ return false;
+
+ // So far, matched:
+ // select (X & (1 << i)) ? ... : ...
+ // including variations of the check against zero/non-zero value.
+
+ Value *ShouldSameV = nullptr, *ShouldXoredV = nullptr;
+ if (TrueIfZero) {
+ ShouldSameV = TrueV;
+ ShouldXoredV = FalseV;
+ } else {
+ ShouldSameV = FalseV;
+ ShouldXoredV = TrueV;
+ }
+
+ Value *Q = nullptr, *R = nullptr, *Y = nullptr, *Z = nullptr;
+ Value *T = nullptr;
+ if (match(ShouldXoredV, m_Xor(m_Value(Y), m_Value(Z)))) {
+ // Matched: select +++ ? ... : Y ^ Z
+ // select +++ ? Y ^ Z : ...
+ // where +++ denotes previously checked matches.
+ if (ShouldSameV == Y)
+ T = Z;
+ else if (ShouldSameV == Z)
+ T = Y;
+ else
+ return false;
+ R = ShouldSameV;
+ // Matched: select +++ ? R : R ^ T
+ // select +++ ? R ^ T : R
+ // depending on TrueIfZero.
+
+ } else if (match(ShouldSameV, m_Zero())) {
+ // Matched: select +++ ? 0 : ...
+ // select +++ ? ... : 0
+ if (!SelI->hasOneUse())
+ return false;
+ T = ShouldXoredV;
+ // Matched: select +++ ? 0 : T
+ // select +++ ? T : 0
+
+ Value *U = *SelI->user_begin();
+ if (!match(U, m_Xor(m_Specific(SelI), m_Value(R))) &&
+ !match(U, m_Xor(m_Value(R), m_Specific(SelI))))
+ return false;
+ // Matched: xor (select +++ ? 0 : T), R
+ // xor (select +++ ? T : 0), R
+ } else
+ return false;
+
+ // The xor input value T is isolated into its own match so that it could
+ // be checked against an induction variable containing a shifted bit
+ // (todo).
+ // For now, check against (Q << i).
+ if (!match(T, m_Shl(m_Value(Q), m_Specific(CIV))) &&
+ !match(T, m_Shl(m_ZExt(m_Value(Q)), m_ZExt(m_Specific(CIV)))))
+ return false;
+ // Matched: select +++ ? R : R ^ (Q << i)
+ // select +++ ? R ^ (Q << i) : R
+
+ PV.X = X;
+ PV.Q = Q;
+ PV.R = R;
+ PV.Left = true;
+ return true;
+}
+
+
+bool PolynomialMultiplyRecognize::matchRightShift(SelectInst *SelI,
+ ParsedValues &PV) {
+ // Match the following:
+ // select (X & 1) != 0 ? (R >> 1) ^ Q : (R >> 1)
+ // select (X & 1) == 0 ? (R >> 1) : (R >> 1) ^ Q
+  // The condition may also check for equality with the masked value, i.e.
+ // select (X & 1) == 1 ? (R >> 1) ^ Q : (R >> 1)
+ // select (X & 1) != 1 ? (R >> 1) : (R >> 1) ^ Q
+
+ Value *CondV = SelI->getCondition();
+ Value *TrueV = SelI->getTrueValue();
+ Value *FalseV = SelI->getFalseValue();
+
+ using namespace PatternMatch;
+
+ Value *C = nullptr;
+ CmpInst::Predicate P;
+ bool TrueIfZero;
+
+ if (match(CondV, m_ICmp(P, m_Value(C), m_Zero())) ||
+ match(CondV, m_ICmp(P, m_Zero(), m_Value(C)))) {
+ if (P != CmpInst::ICMP_EQ && P != CmpInst::ICMP_NE)
+ return false;
+ // Matched: select C == 0 ? ... : ...
+ // select C != 0 ? ... : ...
+ TrueIfZero = (P == CmpInst::ICMP_EQ);
+ } else if (match(CondV, m_ICmp(P, m_Value(C), m_One())) ||
+ match(CondV, m_ICmp(P, m_One(), m_Value(C)))) {
+ if (P != CmpInst::ICMP_EQ && P != CmpInst::ICMP_NE)
+ return false;
+ // Matched: select C == 1 ? ... : ...
+ // select C != 1 ? ... : ...
+ TrueIfZero = (P == CmpInst::ICMP_NE);
+ } else
+ return false;
+
+ Value *X = nullptr;
+ if (!match(C, m_And(m_Value(X), m_One())) &&
+ !match(C, m_And(m_One(), m_Value(X))))
+ return false;
+ // Matched: select (X & 1) == +++ ? ... : ...
+ // select (X & 1) != +++ ? ... : ...
+
+ Value *R = nullptr, *Q = nullptr;
+ if (TrueIfZero) {
+ // The select's condition is true if the tested bit is 0.
+ // TrueV must be the shift, FalseV must be the xor.
+ if (!match(TrueV, m_LShr(m_Value(R), m_One())))
+ return false;
+ // Matched: select +++ ? (R >> 1) : ...
+ if (!match(FalseV, m_Xor(m_Specific(TrueV), m_Value(Q))) &&
+ !match(FalseV, m_Xor(m_Value(Q), m_Specific(TrueV))))
+ return false;
+ // Matched: select +++ ? (R >> 1) : (R >> 1) ^ Q
+ // with commuting ^.
+ } else {
+ // The select's condition is true if the tested bit is 1.
+ // TrueV must be the xor, FalseV must be the shift.
+ if (!match(FalseV, m_LShr(m_Value(R), m_One())))
+ return false;
+ // Matched: select +++ ? ... : (R >> 1)
+ if (!match(TrueV, m_Xor(m_Specific(FalseV), m_Value(Q))) &&
+ !match(TrueV, m_Xor(m_Value(Q), m_Specific(FalseV))))
+ return false;
+ // Matched: select +++ ? (R >> 1) ^ Q : (R >> 1)
+ // with commuting ^.
+ }
+
+ PV.X = X;
+ PV.Q = Q;
+ PV.R = R;
+ PV.Left = false;
+ return true;
+}
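
For reference, one iteration of the right-shifting shape matched above can be written directly; a hand-written model, not code from this patch:

    #include <cstdint>

    // One step of the right-shifting idiom: shift the accumulator right and
    // fold Q in when the tested low bit of X is set.
    static inline uint32_t pmpyRightStep(uint32_t R, uint32_t X, uint32_t Q) {
      uint32_t Rs = R >> 1;
      return (X & 1u) ? (Rs ^ Q) : Rs;
    }
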
+
+
+bool PolynomialMultiplyRecognize::scanSelect(SelectInst *SelI,
+ BasicBlock *LoopB, BasicBlock *PrehB, Value *CIV, ParsedValues &PV,
+ bool PreScan) {
+ using namespace PatternMatch;
+ // The basic pattern for R = P.Q is:
+ // for i = 0..31
+ // R = phi (0, R')
+ // if (P & (1 << i)) ; test-bit(P, i)
+ // R' = R ^ (Q << i)
+ //
+ // Similarly, the basic pattern for R = (P/Q).Q - P
+ // for i = 0..31
+ // R = phi(P, R')
+ // if (R & (1 << i))
+ // R' = R ^ (Q << i)
+
+  // There exist idioms where, instead of Q being shifted left, P is shifted
+ // right. This produces a result that is shifted right by 32 bits (the
+ // non-shifted result is 64-bit).
+ //
+ // For R = P.Q, this would be:
+ // for i = 0..31
+ // R = phi (0, R')
+ // if ((P >> i) & 1)
+ // R' = (R >> 1) ^ Q ; R is cycled through the loop, so it must
+ // else ; be shifted by 1, not i.
+ // R' = R >> 1
+ //
+ // And for the inverse:
+ // for i = 0..31
+ // R = phi (P, R')
+ // if (R & 1)
+ // R' = (R >> 1) ^ Q
+ // else
+ // R' = R >> 1
+
+ // The left-shifting idioms share the same pattern:
+ // select (X & (1 << i)) ? R ^ (Q << i) : R
+ // Similarly for right-shifting idioms:
+  //   select (X & 1) ? (R >> 1) ^ Q : (R >> 1)
+
+ if (matchLeftShift(SelI, CIV, PV)) {
+ // If this is a pre-scan, getting this far is sufficient.
+ if (PreScan)
+ return true;
+
+ // Need to make sure that the SelI goes back into R.
+ auto *RPhi = dyn_cast<PHINode>(PV.R);
+ if (!RPhi)
+ return false;
+ if (SelI != RPhi->getIncomingValueForBlock(LoopB))
+ return false;
+ PV.Res = SelI;
+
+ // If X is loop invariant, it must be the input polynomial, and the
+ // idiom is the basic polynomial multiply.
+ if (CurLoop->isLoopInvariant(PV.X)) {
+ PV.P = PV.X;
+ PV.Inv = false;
+ } else {
+ // X is not loop invariant. If X == R, this is the inverse pmpy.
+ // Otherwise, check for an xor with an invariant value. If the
+ // variable argument to the xor is R, then this is still a valid
+ // inverse pmpy.
+ PV.Inv = true;
+ if (PV.X != PV.R) {
+ Value *Var = nullptr, *Inv = nullptr, *X1 = nullptr, *X2 = nullptr;
+ if (!match(PV.X, m_Xor(m_Value(X1), m_Value(X2))))
+ return false;
+ auto *I1 = dyn_cast<Instruction>(X1);
+ auto *I2 = dyn_cast<Instruction>(X2);
+ if (!I1 || I1->getParent() != LoopB) {
+ Var = X2;
+ Inv = X1;
+ } else if (!I2 || I2->getParent() != LoopB) {
+ Var = X1;
+ Inv = X2;
+ } else
+ return false;
+ if (Var != PV.R)
+ return false;
+ PV.M = Inv;
+ }
+ // The input polynomial P still needs to be determined. It will be
+ // the entry value of R.
+ Value *EntryP = RPhi->getIncomingValueForBlock(PrehB);
+ PV.P = EntryP;
+ }
+
+ return true;
+ }
+
+ if (matchRightShift(SelI, PV)) {
+ // If this is an inverse pattern, the Q polynomial must be known at
+ // compile time.
+ if (PV.Inv && !isa<ConstantInt>(PV.Q))
+ return false;
+ if (PreScan)
+ return true;
+ // There is no exact matching of right-shift pmpy.
+ return false;
+ }
+
+ return false;
+}
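
The basic left-shifting pattern documented at the top of scanSelect is ordinary carry-less (GF(2)) polynomial multiplication. A hand-written reference model, included only to make the idiom concrete:

    #include <cstdint>

    // R = P.Q over GF(2): for each set bit i of P, xor (Q << i) into the
    // accumulator, exactly as in the loop comments above.
    static uint64_t pmpy32(uint32_t P, uint32_t Q) {
      uint64_t R = 0;
      for (unsigned i = 0; i < 32; ++i)
        if (P & (1u << i))
          R ^= (uint64_t)Q << i;
      return R;
    }
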
+
+
+bool PolynomialMultiplyRecognize::isPromotableTo(Value *Val,
+ IntegerType *DestTy) {
+ IntegerType *T = dyn_cast<IntegerType>(Val->getType());
+ if (!T || T->getBitWidth() > DestTy->getBitWidth())
+ return false;
+ if (T->getBitWidth() == DestTy->getBitWidth())
+ return true;
+  // Non-instructions are promotable. The reason an instruction may not
+  // be promotable is that it may produce a different result if its operands
+  // and the result are promoted; for example, it may produce more non-zero
+ // bits. While it would still be possible to represent the proper result
+ // in a wider type, it may require adding additional instructions (which
+ // we don't want to do).
+ Instruction *In = dyn_cast<Instruction>(Val);
+ if (!In)
+ return true;
+ // The bitwidth of the source type is smaller than the destination.
+ // Check if the individual operation can be promoted.
+ switch (In->getOpcode()) {
+ case Instruction::PHI:
+ case Instruction::ZExt:
+ case Instruction::And:
+ case Instruction::Or:
+ case Instruction::Xor:
+ case Instruction::LShr: // Shift right is ok.
+ case Instruction::Select:
+ return true;
+    case Instruction::ICmp: {
+      CmpInst *CI = cast<CmpInst>(In);
+      return CI->isEquality() || CI->isUnsigned();
+    }
+ case Instruction::Add:
+ return In->hasNoSignedWrap() && In->hasNoUnsignedWrap();
+ }
+ return false;
+}
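
The Add restriction is the subtle case: without nuw/nsw, widening can introduce non-zero bits above the original width. A tiny hedged illustration:

    #include <cstdint>

    // In i8 arithmetic 255 + 1 wraps to 0; after widening, the same add
    // yields 256, i.e. a new non-zero bit appears above the old width.
    static_assert(static_cast<uint8_t>(255 + 1) == 0, "i8 add wraps to 0");
    static_assert(255 + 1 == 256, "widened add keeps the carry bit");
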
+
+
+void PolynomialMultiplyRecognize::promoteTo(Instruction *In,
+ IntegerType *DestTy, BasicBlock *LoopB) {
+ // Leave boolean values alone.
+ if (!In->getType()->isIntegerTy(1))
+ In->mutateType(DestTy);
+ unsigned DestBW = DestTy->getBitWidth();
+
+ // Handle PHIs.
+ if (PHINode *P = dyn_cast<PHINode>(In)) {
+ unsigned N = P->getNumIncomingValues();
+ for (unsigned i = 0; i != N; ++i) {
+ BasicBlock *InB = P->getIncomingBlock(i);
+ if (InB == LoopB)
+ continue;
+ Value *InV = P->getIncomingValue(i);
+ IntegerType *Ty = cast<IntegerType>(InV->getType());
+ // Do not promote values in PHI nodes of type i1.
+ if (Ty != P->getType()) {
+ // If the value type does not match the PHI type, the PHI type
+ // must have been promoted.
+ assert(Ty->getBitWidth() < DestBW);
+ InV = IRBuilder<>(InB->getTerminator()).CreateZExt(InV, DestTy);
+ P->setIncomingValue(i, InV);
+ }
+ }
+ } else if (ZExtInst *Z = dyn_cast<ZExtInst>(In)) {
+ Value *Op = Z->getOperand(0);
+ if (Op->getType() == Z->getType())
+ Z->replaceAllUsesWith(Op);
+ Z->eraseFromParent();
+ return;
+ }
+
+ // Promote immediates.
+ for (unsigned i = 0, n = In->getNumOperands(); i != n; ++i) {
+ if (ConstantInt *CI = dyn_cast<ConstantInt>(In->getOperand(i)))
+ if (CI->getType()->getBitWidth() < DestBW)
+ In->setOperand(i, ConstantInt::get(DestTy, CI->getZExtValue()));
+ }
+}
+
+
+bool PolynomialMultiplyRecognize::promoteTypes(BasicBlock *LoopB,
+ BasicBlock *ExitB) {
+ assert(LoopB);
+ // Skip loops where the exit block has more than one predecessor. The values
+ // coming from the loop block will be promoted to another type, and so the
+ // values coming into the exit block from other predecessors would also have
+ // to be promoted.
+ if (!ExitB || (ExitB->getSinglePredecessor() != LoopB))
+ return false;
+ IntegerType *DestTy = getPmpyType();
+ // Check if the exit values have types that are no wider than the type
+ // that we want to promote to.
+ unsigned DestBW = DestTy->getBitWidth();
+ for (Instruction &In : *ExitB) {
+ PHINode *P = dyn_cast<PHINode>(&In);
+ if (!P)
+ break;
+ if (P->getNumIncomingValues() != 1)
+ return false;
+ assert(P->getIncomingBlock(0) == LoopB);
+ IntegerType *T = dyn_cast<IntegerType>(P->getType());
+ if (!T || T->getBitWidth() > DestBW)
+ return false;
+ }
+
+ // Check all instructions in the loop.
+ for (Instruction &In : *LoopB)
+ if (!In.isTerminator() && !isPromotableTo(&In, DestTy))
+ return false;
+
+ // Perform the promotion.
+ std::vector<Instruction*> LoopIns;
+ std::transform(LoopB->begin(), LoopB->end(), std::back_inserter(LoopIns),
+ [](Instruction &In) { return &In; });
+ for (Instruction *In : LoopIns)
+ promoteTo(In, DestTy, LoopB);
+
+ // Fix up the PHI nodes in the exit block.
+ Instruction *EndI = ExitB->getFirstNonPHI();
+ BasicBlock::iterator End = EndI ? EndI->getIterator() : ExitB->end();
+ for (auto I = ExitB->begin(); I != End; ++I) {
+ PHINode *P = dyn_cast<PHINode>(I);
+ if (!P)
+ break;
+ Type *Ty0 = P->getIncomingValue(0)->getType();
+ Type *PTy = P->getType();
+ if (PTy != Ty0) {
+ assert(Ty0 == DestTy);
+ // In order to create the trunc, P must have the promoted type.
+ P->mutateType(Ty0);
+ Value *T = IRBuilder<>(ExitB, End).CreateTrunc(P, PTy);
+ // In order for the RAUW to work, the types of P and T must match.
+ P->mutateType(PTy);
+ P->replaceAllUsesWith(T);
+ // Final update of the P's type.
+ P->mutateType(Ty0);
+ cast<Instruction>(T)->setOperand(0, P);
+ }
+ }
+
+ return true;
+}
+
+
+bool PolynomialMultiplyRecognize::findCycle(Value *Out, Value *In,
+ ValueSeq &Cycle) {
+ // Out = ..., In, ...
+ if (Out == In)
+ return true;
+
+ auto *BB = cast<Instruction>(Out)->getParent();
+ bool HadPhi = false;
+
+ for (auto U : Out->users()) {
+ auto *I = dyn_cast<Instruction>(&*U);
+ if (I == nullptr || I->getParent() != BB)
+ continue;
+ // Make sure that there are no multi-iteration cycles, e.g.
+ // p1 = phi(p2)
+ // p2 = phi(p1)
+ // The cycle p1->p2->p1 would span two loop iterations.
+ // Check that there is only one phi in the cycle.
+ bool IsPhi = isa<PHINode>(I);
+ if (IsPhi && HadPhi)
+ return false;
+ HadPhi |= IsPhi;
+ if (Cycle.count(I))
+ return false;
+ Cycle.insert(I);
+ if (findCycle(I, In, Cycle))
+ break;
+ Cycle.remove(I);
+ }
+ return !Cycle.empty();
+}
+
+
+void PolynomialMultiplyRecognize::classifyCycle(Instruction *DivI,
+ ValueSeq &Cycle, ValueSeq &Early, ValueSeq &Late) {
+ // All the values in the cycle that are between the phi node and the
+  // divider instruction will be classified as "early"; all other values
+  // will be "late".
+
+ bool IsE = true;
+ unsigned I, N = Cycle.size();
+ for (I = 0; I < N; ++I) {
+ Value *V = Cycle[I];
+ if (DivI == V)
+ IsE = false;
+ else if (!isa<PHINode>(V))
+ continue;
+ // Stop if found either.
+ break;
+ }
+ // "I" is the index of either DivI or the phi node, whichever was first.
+ // "E" is "false" or "true" respectively.
+ ValueSeq &First = !IsE ? Early : Late;
+ for (unsigned J = 0; J < I; ++J)
+ First.insert(Cycle[J]);
+
+ ValueSeq &Second = IsE ? Early : Late;
+ Second.insert(Cycle[I]);
+ for (++I; I < N; ++I) {
+ Value *V = Cycle[I];
+ if (DivI == V || isa<PHINode>(V))
+ break;
+ Second.insert(V);
+ }
+
+ for (; I < N; ++I)
+ First.insert(Cycle[I]);
+}
+
+
+bool PolynomialMultiplyRecognize::classifyInst(Instruction *UseI,
+ ValueSeq &Early, ValueSeq &Late) {
+ // Select is an exception, since the condition value does not have to be
+ // classified in the same way as the true/false values. The true/false
+ // values do have to be both early or both late.
+ if (UseI->getOpcode() == Instruction::Select) {
+ Value *TV = UseI->getOperand(1), *FV = UseI->getOperand(2);
+ if (Early.count(TV) || Early.count(FV)) {
+ if (Late.count(TV) || Late.count(FV))
+ return false;
+ Early.insert(UseI);
+ } else if (Late.count(TV) || Late.count(FV)) {
+ if (Early.count(TV) || Early.count(FV))
+ return false;
+ Late.insert(UseI);
+ }
+ return true;
+ }
+
+  // It is not clear what an example of this would be, but the code below
+  // relies on each instruction having at least one operand.
+ if (UseI->getNumOperands() == 0)
+ return true;
+
+ bool AE = true, AL = true;
+ for (auto &I : UseI->operands()) {
+ if (Early.count(&*I))
+ AL = false;
+ else if (Late.count(&*I))
+ AE = false;
+ }
+ // If the operands appear "all early" and "all late" at the same time,
+ // then it means that none of them are actually classified as either.
+ // This is harmless.
+ if (AE && AL)
+ return true;
+ // Conversely, if they are neither "all early" nor "all late", then
+ // we have a mixture of early and late operands that is not a known
+ // exception.
+ if (!AE && !AL)
+ return false;
+
+ // Check that we have covered the two special cases.
+ assert(AE != AL);
+
+ if (AE)
+ Early.insert(UseI);
+ else
+ Late.insert(UseI);
+ return true;
+}
+
+
+bool PolynomialMultiplyRecognize::commutesWithShift(Instruction *I) {
+ switch (I->getOpcode()) {
+ case Instruction::And:
+ case Instruction::Or:
+ case Instruction::Xor:
+ case Instruction::LShr:
+ case Instruction::Shl:
+ case Instruction::Select:
+ case Instruction::ICmp:
+ case Instruction::PHI:
+ break;
+ default:
+ return false;
+ }
+ return true;
+}
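
The property required here is that the operation distributes over a uniform left shift of its shifted operands; for the bitwise cases this is an exact identity:

    // Bitwise ops commute with shifts: shifting the inputs left by one and
    // shifting the output left by one agree.
    static_assert(((0xA5u ^ 0x3Cu) << 1) == ((0xA5u << 1) ^ (0x3Cu << 1)),
                  "xor commutes with shl");
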
+
+
+bool PolynomialMultiplyRecognize::highBitsAreZero(Value *V,
+ unsigned IterCount) {
+ auto *T = dyn_cast<IntegerType>(V->getType());
+ if (!T)
+ return false;
+
+ unsigned BW = T->getBitWidth();
+ APInt K0(BW, 0), K1(BW, 0);
+ computeKnownBits(V, K0, K1, DL);
+ return K0.countLeadingOnes() >= IterCount;
+}
+
+
+bool PolynomialMultiplyRecognize::keepsHighBitsZero(Value *V,
+ unsigned IterCount) {
+ // Assume that all inputs to the value have the high bits zero.
+ // Check if the value itself preserves the zeros in the high bits.
+ if (auto *C = dyn_cast<ConstantInt>(V))
+ return C->getValue().countLeadingZeros() >= IterCount;
+
+ if (auto *I = dyn_cast<Instruction>(V)) {
+ switch (I->getOpcode()) {
+ case Instruction::And:
+ case Instruction::Or:
+ case Instruction::Xor:
+ case Instruction::LShr:
+ case Instruction::Select:
+ case Instruction::ICmp:
+ case Instruction::PHI:
+ case Instruction::ZExt:
+ return true;
+ }
+ }
+
+ return false;
+}
+
+
+bool PolynomialMultiplyRecognize::isOperandShifted(Instruction *I, Value *Op) {
+ unsigned Opc = I->getOpcode();
+ if (Opc == Instruction::Shl || Opc == Instruction::LShr)
+ return Op != I->getOperand(1);
+ return true;
+}
+
+
+bool PolynomialMultiplyRecognize::convertShiftsToLeft(BasicBlock *LoopB,
+ BasicBlock *ExitB, unsigned IterCount) {
+ Value *CIV = getCountIV(LoopB);
+ if (CIV == nullptr)
+ return false;
+ auto *CIVTy = dyn_cast<IntegerType>(CIV->getType());
+ if (CIVTy == nullptr)
+ return false;
+
+ ValueSeq RShifts;
+ ValueSeq Early, Late, Cycled;
+
+ // Find all value cycles that contain logical right shifts by 1.
+ for (Instruction &I : *LoopB) {
+ using namespace PatternMatch;
+ Value *V = nullptr;
+ if (!match(&I, m_LShr(m_Value(V), m_One())))
+ continue;
+ ValueSeq C;
+ if (!findCycle(&I, V, C))
+ continue;
+
+ // Found a cycle.
+ C.insert(&I);
+ classifyCycle(&I, C, Early, Late);
+ Cycled.insert(C.begin(), C.end());
+ RShifts.insert(&I);
+ }
+
+ // Find the set of all values affected by the shift cycles, i.e. all
+ // cycled values, and (recursively) all their users.
+ ValueSeq Users(Cycled.begin(), Cycled.end());
+ for (unsigned i = 0; i < Users.size(); ++i) {
+ Value *V = Users[i];
+ if (!isa<IntegerType>(V->getType()))
+ return false;
+ auto *R = cast<Instruction>(V);
+ // If the instruction does not commute with shifts, the loop cannot
+ // be unshifted.
+ if (!commutesWithShift(R))
+ return false;
+ for (auto I = R->user_begin(), E = R->user_end(); I != E; ++I) {
+ auto *T = cast<Instruction>(*I);
+ // Skip users from outside of the loop. They will be handled later.
+ // Also, skip the right-shifts and phi nodes, since they mix early
+ // and late values.
+ if (T->getParent() != LoopB || RShifts.count(T) || isa<PHINode>(T))
+ continue;
+
+ Users.insert(T);
+ if (!classifyInst(T, Early, Late))
+ return false;
+ }
+ }
+
+ if (Users.empty())
+ return false;
+
+ // Verify that high bits remain zero.
+ ValueSeq Internal(Users.begin(), Users.end());
+ ValueSeq Inputs;
+ for (unsigned i = 0; i < Internal.size(); ++i) {
+ auto *R = dyn_cast<Instruction>(Internal[i]);
+ if (!R)
+ continue;
+ for (Value *Op : R->operands()) {
+ auto *T = dyn_cast<Instruction>(Op);
+ if (T && T->getParent() != LoopB)
+ Inputs.insert(Op);
+ else
+ Internal.insert(Op);
+ }
+ }
+ for (Value *V : Inputs)
+ if (!highBitsAreZero(V, IterCount))
+ return false;
+ for (Value *V : Internal)
+ if (!keepsHighBitsZero(V, IterCount))
+ return false;
+
+ // Finally, the work can be done. Unshift each user.
+ IRBuilder<> IRB(LoopB);
+ std::map<Value*,Value*> ShiftMap;
+ typedef std::map<std::pair<Value*,Type*>,Value*> CastMapType;
+ CastMapType CastMap;
+
+ auto upcast = [] (CastMapType &CM, IRBuilder<> &IRB, Value *V,
+ IntegerType *Ty) -> Value* {
+ auto H = CM.find(std::make_pair(V, Ty));
+ if (H != CM.end())
+ return H->second;
+ Value *CV = IRB.CreateIntCast(V, Ty, false);
+ CM.insert(std::make_pair(std::make_pair(V, Ty), CV));
+ return CV;
+ };
+
+ for (auto I = LoopB->begin(), E = LoopB->end(); I != E; ++I) {
+ if (isa<PHINode>(I) || !Users.count(&*I))
+ continue;
+ using namespace PatternMatch;
+ // Match lshr x, 1.
+ Value *V = nullptr;
+ if (match(&*I, m_LShr(m_Value(V), m_One()))) {
+ replaceAllUsesOfWithIn(&*I, V, LoopB);
+ continue;
+ }
+ // For each non-cycled operand, replace it with the corresponding
+ // value shifted left.
+ for (auto &J : I->operands()) {
+ Value *Op = J.get();
+ if (!isOperandShifted(&*I, Op))
+ continue;
+ if (Users.count(Op))
+ continue;
+ // Skip shifting zeros.
+ if (isa<ConstantInt>(Op) && cast<ConstantInt>(Op)->isZero())
+ continue;
+ // Check if we have already generated a shift for this value.
+ auto F = ShiftMap.find(Op);
+ Value *W = (F != ShiftMap.end()) ? F->second : nullptr;
+ if (W == nullptr) {
+ IRB.SetInsertPoint(&*I);
+ // First, the shift amount will be CIV or CIV+1, depending on
+ // whether the value is early or late. Instead of creating CIV+1,
+ // do a single shift of the value.
+ Value *ShAmt = CIV, *ShVal = Op;
+ auto *VTy = cast<IntegerType>(ShVal->getType());
+ auto *ATy = cast<IntegerType>(ShAmt->getType());
+ if (Late.count(&*I))
+ ShVal = IRB.CreateShl(Op, ConstantInt::get(VTy, 1));
+ // Second, the types of the shifted value and the shift amount
+ // must match.
+ if (VTy != ATy) {
+ if (VTy->getBitWidth() < ATy->getBitWidth())
+ ShVal = upcast(CastMap, IRB, ShVal, ATy);
+ else
+ ShAmt = upcast(CastMap, IRB, ShAmt, VTy);
+ }
+ // Ready to generate the shift and memoize it.
+ W = IRB.CreateShl(ShVal, ShAmt);
+ ShiftMap.insert(std::make_pair(Op, W));
+ }
+ I->replaceUsesOfWith(Op, W);
+ }
+ }
+
+ // Update the users outside of the loop to account for the left shifts:
+ // the values would normally be shifted right in the loop, so shift them
+ // right after the loop exit.
+ // Take advantage of the loop-closed SSA form, which has all the post-
+ // loop values in phi nodes.
+ IRB.SetInsertPoint(ExitB, ExitB->getFirstInsertionPt());
+ for (auto P = ExitB->begin(), Q = ExitB->end(); P != Q; ++P) {
+ if (!isa<PHINode>(P))
+ break;
+ auto *PN = cast<PHINode>(P);
+ Value *U = PN->getIncomingValueForBlock(LoopB);
+ if (!Users.count(U))
+ continue;
+ Value *S = IRB.CreateLShr(PN, ConstantInt::get(PN->getType(), IterCount));
+ PN->replaceAllUsesWith(S);
+ // The above RAUW will create
+ // S = lshr S, IterCount
+ // so we need to fix it back into
+ // S = lshr PN, IterCount
+ cast<User>(S)->replaceUsesOfWith(S, PN);
+ }
+
+ return true;
+}
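+
+// Editor's sketch (illustrative, not part of the original patch): the net
+// effect of convertShiftsToLeft on a right-shifting loop is:
+//  - in-loop uses of each "lshr x, 1" are replaced with x itself,
+//  - every non-cycled, shifted operand v of an affected instruction is
+//    replaced with "shl v, CIV" (with an extra "shl v, 1" first for late
+//    users, standing in for a shift by CIV+1), and
+//  - each loop-closed phi P in the exit block is compensated with
+//    "lshr P, IterCount" to recover the original right-shifted result.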
+
+
+void PolynomialMultiplyRecognize::cleanupLoopBody(BasicBlock *LoopB) {
+ for (auto &I : *LoopB)
+ if (Value *SV = SimplifyInstruction(&I, DL, &TLI, &DT))
+ I.replaceAllUsesWith(SV);
+
+ for (auto I = LoopB->begin(), N = I; I != LoopB->end(); I = N) {
+ N = std::next(I);
+ RecursivelyDeleteTriviallyDeadInstructions(&*I, &TLI);
+ }
+}
+
+
+unsigned PolynomialMultiplyRecognize::getInverseMxN(unsigned QP) {
+ // Arrays of coefficients of Q and the inverse, C.
+ // Q[i] = coefficient at x^i.
+ std::array<char,32> Q, C;
+
+ for (unsigned i = 0; i < 32; ++i) {
+ Q[i] = QP & 1;
+ QP >>= 1;
+ }
+ assert(Q[0] == 1);
+
+ // Find C, such that
+ // (Q[n]*x^n + ... + Q[1]*x + Q[0]) * (C[n]*x^n + ... + C[1]*x + C[0]) = 1
+ //
+ // For it to have a solution, Q[0] must be 1. Since this is Z2[x], the
+ // operations * and + are & and ^ respectively.
+ //
+ // Find C[i] recursively, by comparing i-th coefficient in the product
+ // with 0 (or 1 for i=0).
+ //
+ // C[0] = 1, since C[0] = Q[0], and Q[0] = 1.
+ C[0] = 1;
+ for (unsigned i = 1; i < 32; ++i) {
+ // Solve for C[i] in:
+ // C[0]Q[i] ^ C[1]Q[i-1] ^ ... ^ C[i-1]Q[1] ^ C[i]Q[0] = 0
+ // This is equivalent to
+ // C[0]Q[i] ^ C[1]Q[i-1] ^ ... ^ C[i-1]Q[1] ^ C[i] = 0
+ // which is
+ // C[0]Q[i] ^ C[1]Q[i-1] ^ ... ^ C[i-1]Q[1] = C[i]
+ unsigned T = 0;
+ for (unsigned j = 0; j < i; ++j)
+ T = T ^ (C[j] & Q[i-j]);
+ C[i] = T;
+ }
+
+ unsigned QV = 0;
+ for (unsigned i = 0; i < 32; ++i)
+ if (C[i])
+ QV |= (1 << i);
+
+ return QV;
+}
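+
+// Editor's worked example (not part of the original patch): for QP = 0b11,
+// i.e. Q(x) = x + 1, the recurrence gives C[0] = 1 and
+// C[i] = C[i-1] & Q[1] = 1 for all i, so QV = 0xFFFFFFFF. Indeed, over
+// Z2[x] modulo x^32,
+//   (x + 1) * (x^31 + x^30 + ... + x + 1) = x^32 + 1 = 1,
+// since all middle terms cancel in characteristic 2.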
+
+
+Value *PolynomialMultiplyRecognize::generate(BasicBlock::iterator At,
+ ParsedValues &PV) {
+ IRBuilder<> B(&*At);
+ Module *M = At->getParent()->getParent()->getParent();
+ Value *PMF = Intrinsic::getDeclaration(M, Intrinsic::hexagon_M4_pmpyw);
+
+ Value *P = PV.P, *Q = PV.Q, *P0 = P;
+ unsigned IC = PV.IterCount;
+
+ if (PV.M != nullptr)
+ P0 = P = B.CreateXor(P, PV.M);
+
+ // Create a bit mask to clear the high bits beyond IterCount.
+ auto *BMI = ConstantInt::get(P->getType(), APInt::getLowBitsSet(32, IC));
+
+ if (PV.IterCount != 32)
+ P = B.CreateAnd(P, BMI);
+
+ if (PV.Inv) {
+ auto *QI = dyn_cast<ConstantInt>(PV.Q);
+ assert(QI && QI->getBitWidth() <= 32);
+
+ // Again, clearing bits beyond IterCount.
+ unsigned M = (1 << PV.IterCount) - 1;
+ unsigned Tmp = (QI->getZExtValue() | 1) & M;
+ unsigned QV = getInverseMxN(Tmp) & M;
+ auto *QVI = ConstantInt::get(QI->getType(), QV);
+ P = B.CreateCall(PMF, {P, QVI});
+ P = B.CreateTrunc(P, QI->getType());
+ if (IC != 32)
+ P = B.CreateAnd(P, BMI);
+ }
+
+ Value *R = B.CreateCall(PMF, {P, Q});
+
+ if (PV.M != nullptr)
+ R = B.CreateXor(R, B.CreateIntCast(P0, R->getType(), false));
+
+ return R;
+}
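+
+// Editor's note (hedged summary): hexagon_M4_pmpyw is used here as a 32x32
+// carry-less (polynomial) multiply. The sequence above computes
+// R = pmpy(P, Q) for the plain idiom; for the inverse idiom it first
+// multiplies P by the Z2[x] inverse of Q (from getInverseMxN, masked to
+// IterCount bits), effectively dividing by Q, before the final multiply.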
+
+
+void PolynomialMultiplyRecognize::setupSimplifier() {
+ Simp.addRule(
+ // Sink zext past bitwise operations.
+ [](Instruction *I, LLVMContext &Ctx) -> Value* {
+ if (I->getOpcode() != Instruction::ZExt)
+ return nullptr;
+ Instruction *T = dyn_cast<Instruction>(I->getOperand(0));
+ if (!T)
+ return nullptr;
+ switch (T->getOpcode()) {
+ case Instruction::And:
+ case Instruction::Or:
+ case Instruction::Xor:
+ break;
+ default:
+ return nullptr;
+ }
+ IRBuilder<> B(Ctx);
+ return B.CreateBinOp(cast<BinaryOperator>(T)->getOpcode(),
+ B.CreateZExt(T->getOperand(0), I->getType()),
+ B.CreateZExt(T->getOperand(1), I->getType()));
+ });
+ Simp.addRule(
+ // (xor (and x a) (and y a)) -> (and (xor x y) a)
+ [](Instruction *I, LLVMContext &Ctx) -> Value* {
+ if (I->getOpcode() != Instruction::Xor)
+ return nullptr;
+ Instruction *And0 = dyn_cast<Instruction>(I->getOperand(0));
+ Instruction *And1 = dyn_cast<Instruction>(I->getOperand(1));
+ if (!And0 || !And1)
+ return nullptr;
+ if (And0->getOpcode() != Instruction::And ||
+ And1->getOpcode() != Instruction::And)
+ return nullptr;
+ if (And0->getOperand(1) != And1->getOperand(1))
+ return nullptr;
+ IRBuilder<> B(Ctx);
+ return B.CreateAnd(B.CreateXor(And0->getOperand(0), And1->getOperand(0)),
+ And0->getOperand(1));
+ });
+ Simp.addRule(
+ // (Op (select c x y) z) -> (select c (Op x z) (Op y z))
+ // (Op x (select c y z)) -> (select c (Op x y) (Op x z))
+ [](Instruction *I, LLVMContext &Ctx) -> Value* {
+ BinaryOperator *BO = dyn_cast<BinaryOperator>(I);
+ if (!BO)
+ return nullptr;
+ Instruction::BinaryOps Op = BO->getOpcode();
+ if (SelectInst *Sel = dyn_cast<SelectInst>(BO->getOperand(0))) {
+ IRBuilder<> B(Ctx);
+ Value *X = Sel->getTrueValue(), *Y = Sel->getFalseValue();
+ Value *Z = BO->getOperand(1);
+ return B.CreateSelect(Sel->getCondition(),
+ B.CreateBinOp(Op, X, Z),
+ B.CreateBinOp(Op, Y, Z));
+ }
+ if (SelectInst *Sel = dyn_cast<SelectInst>(BO->getOperand(1))) {
+ IRBuilder<> B(Ctx);
+ Value *X = BO->getOperand(0);
+ Value *Y = Sel->getTrueValue(), *Z = Sel->getFalseValue();
+ return B.CreateSelect(Sel->getCondition(),
+ B.CreateBinOp(Op, X, Y),
+ B.CreateBinOp(Op, X, Z));
+ }
+ return nullptr;
+ });
+ Simp.addRule(
+ // (select c (select c x y) z) -> (select c x z)
+ // (select c x (select c y z)) -> (select c x z)
+ [](Instruction *I, LLVMContext &Ctx) -> Value* {
+ SelectInst *Sel = dyn_cast<SelectInst>(I);
+ if (!Sel)
+ return nullptr;
+ IRBuilder<> B(Ctx);
+ Value *C = Sel->getCondition();
+ if (SelectInst *Sel0 = dyn_cast<SelectInst>(Sel->getTrueValue())) {
+ if (Sel0->getCondition() == C)
+ return B.CreateSelect(C, Sel0->getTrueValue(), Sel->getFalseValue());
+ }
+ if (SelectInst *Sel1 = dyn_cast<SelectInst>(Sel->getFalseValue())) {
+ if (Sel1->getCondition() == C)
+ return B.CreateSelect(C, Sel->getTrueValue(), Sel1->getFalseValue());
+ }
+ return nullptr;
+ });
+ Simp.addRule(
+ // (or (lshr x 1) 0x800.0) -> (xor (lshr x 1) 0x800.0)
+ [](Instruction *I, LLVMContext &Ctx) -> Value* {
+ if (I->getOpcode() != Instruction::Or)
+ return nullptr;
+ Instruction *LShr = dyn_cast<Instruction>(I->getOperand(0));
+ if (!LShr || LShr->getOpcode() != Instruction::LShr)
+ return nullptr;
+ ConstantInt *One = dyn_cast<ConstantInt>(LShr->getOperand(1));
+ if (!One || One->getZExtValue() != 1)
+ return nullptr;
+ ConstantInt *Msb = dyn_cast<ConstantInt>(I->getOperand(1));
+ if (!Msb || Msb->getZExtValue() != Msb->getType()->getSignBit())
+ return nullptr;
+ return IRBuilder<>(Ctx).CreateXor(LShr, Msb);
+ });
+ Simp.addRule(
+ // (lshr (BitOp x y) c) -> (BitOp (lshr x c) (lshr y c))
+ [](Instruction *I, LLVMContext &Ctx) -> Value* {
+ if (I->getOpcode() != Instruction::LShr)
+ return nullptr;
+ BinaryOperator *BitOp = dyn_cast<BinaryOperator>(I->getOperand(0));
+ if (!BitOp)
+ return nullptr;
+ switch (BitOp->getOpcode()) {
+ case Instruction::And:
+ case Instruction::Or:
+ case Instruction::Xor:
+ break;
+ default:
+ return nullptr;
+ }
+ IRBuilder<> B(Ctx);
+ Value *S = I->getOperand(1);
+ return B.CreateBinOp(BitOp->getOpcode(),
+ B.CreateLShr(BitOp->getOperand(0), S),
+ B.CreateLShr(BitOp->getOperand(1), S));
+ });
+ Simp.addRule(
+ // (BitOp1 (BitOp2 x a) b) -> (BitOp2 x (BitOp1 a b))
+ [](Instruction *I, LLVMContext &Ctx) -> Value* {
+ auto IsBitOp = [](unsigned Op) -> bool {
+ switch (Op) {
+ case Instruction::And:
+ case Instruction::Or:
+ case Instruction::Xor:
+ return true;
+ }
+ return false;
+ };
+ BinaryOperator *BitOp1 = dyn_cast<BinaryOperator>(I);
+ if (!BitOp1 || !IsBitOp(BitOp1->getOpcode()))
+ return nullptr;
+ BinaryOperator *BitOp2 = dyn_cast<BinaryOperator>(BitOp1->getOperand(0));
+ if (!BitOp2 || !IsBitOp(BitOp2->getOpcode()))
+ return nullptr;
+ ConstantInt *CA = dyn_cast<ConstantInt>(BitOp2->getOperand(1));
+ ConstantInt *CB = dyn_cast<ConstantInt>(BitOp1->getOperand(1));
+ if (!CA || !CB)
+ return nullptr;
+ IRBuilder<> B(Ctx);
+ Value *X = BitOp2->getOperand(0);
+ return B.CreateBinOp(BitOp2->getOpcode(), X,
+ B.CreateBinOp(BitOp1->getOpcode(), CA, CB));
+ });
+}
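+
+// Editor's note (illustrative): the or->xor rule above is sound because the
+// most significant bit of "lshr x, 1" is always zero; OR-ing in the sign bit
+// sets a bit that is known clear, which is exactly what XOR with the sign
+// bit does. For example, (or (lshr x, 1), 0x80000000) and
+// (xor (lshr x, 1), 0x80000000) agree for every 32-bit x, and the xor form
+// composes with the other bitwise rules.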
+
+
+bool PolynomialMultiplyRecognize::recognize() {
+ DEBUG(dbgs() << "Starting PolynomialMultiplyRecognize on loop\n"
+ << *CurLoop << '\n');
+ // Restrictions:
+ // - The loop must consist of a single block.
+ // - The iteration count must be known at compile-time.
+ // - The loop must have an induction variable starting from 0, and
+ // incremented in each iteration of the loop.
+ BasicBlock *LoopB = CurLoop->getHeader();
+ DEBUG(dbgs() << "Loop header:\n" << *LoopB);
+
+ if (LoopB != CurLoop->getLoopLatch())
+ return false;
+ BasicBlock *ExitB = CurLoop->getExitBlock();
+ if (ExitB == nullptr)
+ return false;
+ BasicBlock *EntryB = CurLoop->getLoopPreheader();
+ if (EntryB == nullptr)
+ return false;
+
+ unsigned IterCount = 0;
+ const SCEV *CT = SE.getBackedgeTakenCount(CurLoop);
+ if (isa<SCEVCouldNotCompute>(CT))
+ return false;
+ if (auto *CV = dyn_cast<SCEVConstant>(CT))
+ IterCount = CV->getValue()->getZExtValue() + 1;
+
+ Value *CIV = getCountIV(LoopB);
+ ParsedValues PV;
+ PV.IterCount = IterCount;
+ DEBUG(dbgs() << "Loop IV: " << *CIV << "\nIterCount: " << IterCount << '\n');
+
+ setupSimplifier();
+
+ // Perform a preliminary scan of select instructions to see if any of them
+ // looks like a generator of the polynomial multiply steps. Assume that a
+ // loop can only contain a single transformable operation, so stop the
+ // traversal after the first reasonable candidate is found.
+ // XXX: Currently this approach can modify the loop before being 100% sure
+ // that the transformation can be carried out.
+ bool FoundPreScan = false;
+ for (Instruction &In : *LoopB) {
+ SelectInst *SI = dyn_cast<SelectInst>(&In);
+ if (!SI)
+ continue;
+
+ Simplifier::Context C(SI);
+ Value *T = Simp.simplify(C);
+ SelectInst *SelI = (T && isa<SelectInst>(T)) ? cast<SelectInst>(T) : SI;
+ DEBUG(dbgs() << "scanSelect(pre-scan): " << PE(C, SelI) << '\n');
+ if (scanSelect(SelI, LoopB, EntryB, CIV, PV, true)) {
+ FoundPreScan = true;
+ if (SelI != SI) {
+ Value *NewSel = C.materialize(LoopB, SI->getIterator());
+ SI->replaceAllUsesWith(NewSel);
+ RecursivelyDeleteTriviallyDeadInstructions(SI, &TLI);
+ }
+ break;
+ }
+ }
+
+ if (!FoundPreScan) {
+ DEBUG(dbgs() << "Have not found candidates for pmpy\n");
+ return false;
+ }
+
+ if (!PV.Left) {
+ // The right shift version actually only returns the higher bits of
+ // the result (each iteration discards the LSB). If we want to convert it
+ // to a left-shifting loop, the working data type must be at least as
+ // wide as the target's pmpy instruction.
+ if (!promoteTypes(LoopB, ExitB))
+ return false;
+ convertShiftsToLeft(LoopB, ExitB, IterCount);
+ cleanupLoopBody(LoopB);
+ }
+
+ // Scan the loop again, find the generating select instruction.
+ bool FoundScan = false;
+ for (Instruction &In : *LoopB) {
+ SelectInst *SelI = dyn_cast<SelectInst>(&In);
+ if (!SelI)
+ continue;
+ DEBUG(dbgs() << "scanSelect: " << *SelI << '\n');
+ FoundScan = scanSelect(SelI, LoopB, EntryB, CIV, PV, false);
+ if (FoundScan)
+ break;
+ }
+ assert(FoundScan);
+
+ DEBUG({
+ StringRef PP = (PV.M ? "(P+M)" : "P");
+ if (!PV.Inv)
+ dbgs() << "Found pmpy idiom: R = " << PP << ".Q\n";
+ else
+ dbgs() << "Found inverse pmpy idiom: R = (" << PP << "/Q).Q) + "
+ << PP << "\n";
+ dbgs() << " Res:" << *PV.Res << "\n P:" << *PV.P << "\n";
+ if (PV.M)
+ dbgs() << " M:" << *PV.M << "\n";
+ dbgs() << " Q:" << *PV.Q << "\n";
+ dbgs() << " Iteration count:" << PV.IterCount << "\n";
+ });
+
+ BasicBlock::iterator At(EntryB->getTerminator());
+ Value *PM = generate(At, PV);
+ if (PM == nullptr)
+ return false;
+
+ if (PM->getType() != PV.Res->getType())
+ PM = IRBuilder<>(&*At).CreateIntCast(PM, PV.Res->getType(), false);
+
+ PV.Res->replaceAllUsesWith(PM);
+ PV.Res->eraseFromParent();
+ return true;
+}
+
+
+unsigned HexagonLoopIdiomRecognize::getStoreSizeInBytes(StoreInst *SI) {
+ uint64_t SizeInBits = DL->getTypeSizeInBits(SI->getValueOperand()->getType());
+ assert(((SizeInBits & 7) || (SizeInBits >> 32) == 0) &&
+ "Don't overflow unsigned.");
+ return (unsigned)SizeInBits >> 3;
+}
+
+
+int HexagonLoopIdiomRecognize::getSCEVStride(const SCEVAddRecExpr *S) {
+ if (const SCEVConstant *SC = dyn_cast<SCEVConstant>(S->getOperand(1)))
+ return SC->getAPInt().getSExtValue();
+ return 0;
+}
+
+
+bool HexagonLoopIdiomRecognize::isLegalStore(Loop *CurLoop, StoreInst *SI) {
+ // Allow volatile stores if HexagonVolatileMemcpy is enabled.
+ if (!(SI->isVolatile() && HexagonVolatileMemcpy) && !SI->isSimple())
+ return false;
+
+ Value *StoredVal = SI->getValueOperand();
+ Value *StorePtr = SI->getPointerOperand();
+
+ // Reject stores that are so large that they overflow an unsigned.
+ uint64_t SizeInBits = DL->getTypeSizeInBits(StoredVal->getType());
+ if ((SizeInBits & 7) || (SizeInBits >> 32) != 0)
+ return false;
+
+ // See if the pointer expression is an AddRec like {base,+,1} on the current
+ // loop, which indicates a strided store. If we have something else, it's a
+ // random store we can't handle.
+ auto *StoreEv = dyn_cast<SCEVAddRecExpr>(SE->getSCEV(StorePtr));
+ if (!StoreEv || StoreEv->getLoop() != CurLoop || !StoreEv->isAffine())
+ return false;
+
+ // Check to see if the stride matches the size of the store. If so, then we
+ // know that every byte is touched in the loop.
+ int Stride = getSCEVStride(StoreEv);
+ if (Stride == 0)
+ return false;
+ unsigned StoreSize = getStoreSizeInBytes(SI);
+ if (StoreSize != unsigned(std::abs(Stride)))
+ return false;
+
+ // The store must be feeding a non-volatile load.
+ LoadInst *LI = dyn_cast<LoadInst>(SI->getValueOperand());
+ if (!LI || !LI->isSimple())
+ return false;
+
+ // See if the pointer expression is an AddRec like {base,+,1} on the current
+ // loop, which indicates a strided load. If we have something else, it's a
+ // random load we can't handle.
+ Value *LoadPtr = LI->getPointerOperand();
+ auto *LoadEv = dyn_cast<SCEVAddRecExpr>(SE->getSCEV(LoadPtr));
+ if (!LoadEv || LoadEv->getLoop() != CurLoop || !LoadEv->isAffine())
+ return false;
+
+ // The store and load must share the same stride.
+ if (StoreEv->getOperand(1) != LoadEv->getOperand(1))
+ return false;
+
+ // Success. This store can be converted into a memcpy.
+ return true;
+}
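+
+// Editor's example (illustrative): the canonical loop accepted above is
+//   for (i = 0; i != n; ++i) p[i] = q[i];
+// where both p[i] and q[i] are affine AddRecs such as {base,+,4} on this
+// loop (for i32 elements), the stride (4) matches the store size, and the
+// stored value is a simple load; anything else is rejected.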
+
+
+/// mayLoopAccessLocation - Return true if the specified loop might access the
+/// specified pointer location, which is a loop-strided access. The 'Access'
+/// argument specifies what the verboten forms of access are (read or write).
+static bool
+mayLoopAccessLocation(Value *Ptr, ModRefInfo Access, Loop *L,
+ const SCEV *BECount, unsigned StoreSize,
+ AliasAnalysis &AA,
+ SmallPtrSetImpl<Instruction *> &Ignored) {
+ // Get the location that may be stored across the loop. Since the access
+ // is strided positively through memory, we say that the modified location
+ // starts at the pointer and has infinite size.
+ uint64_t AccessSize = MemoryLocation::UnknownSize;
+
+ // If the loop iterates a fixed number of times, we can refine the access
+ // size to be exactly the size of the memory operation, which is
+ // (BECount+1)*StoreSize.
+ if (const SCEVConstant *BECst = dyn_cast<SCEVConstant>(BECount))
+ AccessSize = (BECst->getValue()->getZExtValue() + 1) * StoreSize;
+
+ // TODO: For this to be really effective, we have to dive into the pointer
+ // operand in the store. A store to &A[i] of 100 will always return may-alias
+ // with a store to &A[100]; we need StoreLoc to be "A" with a size of 100,
+ // which will then no-alias a store to &A[100].
+ MemoryLocation StoreLoc(Ptr, AccessSize);
+
+ for (auto *B : L->blocks())
+ for (auto &I : *B)
+ if (Ignored.count(&I) == 0 && (AA.getModRefInfo(&I, StoreLoc) & Access))
+ return true;
+
+ return false;
+}
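+
+// Editor's example (illustrative): for a loop storing 4 bytes per iteration
+// with a constant backedge-taken count of 99, the refined access size is
+// (99 + 1) * 4 = 400 bytes, so only an access aliasing the first 400 bytes
+// from Ptr can make this function return true.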
+
+
+void HexagonLoopIdiomRecognize::collectStores(Loop *CurLoop, BasicBlock *BB,
+ SmallVectorImpl<StoreInst*> &Stores) {
+ Stores.clear();
+ for (Instruction &I : *BB)
+ if (StoreInst *SI = dyn_cast<StoreInst>(&I))
+ if (isLegalStore(CurLoop, SI))
+ Stores.push_back(SI);
+}
+
+
+bool HexagonLoopIdiomRecognize::processCopyingStore(Loop *CurLoop,
+ StoreInst *SI, const SCEV *BECount) {
+ assert((SI->isSimple() || (SI->isVolatile() && HexagonVolatileMemcpy)) &&
+ "Expected only non-volatile stores, or Hexagon-specific memcpy"
+ "to volatile destination.");
+
+ Value *StorePtr = SI->getPointerOperand();
+ auto *StoreEv = cast<SCEVAddRecExpr>(SE->getSCEV(StorePtr));
+ unsigned Stride = getSCEVStride(StoreEv);
+ unsigned StoreSize = getStoreSizeInBytes(SI);
+ if (Stride != StoreSize)
+ return false;
+
+ // See if the pointer expression is an AddRec like {base,+,1} on the current
+ // loop, which indicates a strided load. If we have something else, it's a
+ // random load we can't handle.
+ LoadInst *LI = dyn_cast<LoadInst>(SI->getValueOperand());
+ auto *LoadEv = cast<SCEVAddRecExpr>(SE->getSCEV(LI->getPointerOperand()));
+
+ // The trip count of the loop and the base pointer of the addrec SCEV are
+ // guaranteed to be loop invariant, which means they should dominate the
+ // header. This allows us to insert code for them in the preheader.
+ BasicBlock *Preheader = CurLoop->getLoopPreheader();
+ Instruction *ExpPt = Preheader->getTerminator();
+ IRBuilder<> Builder(ExpPt);
+ SCEVExpander Expander(*SE, *DL, "hexagon-loop-idiom");
+
+ Type *IntPtrTy = Builder.getIntPtrTy(*DL, SI->getPointerAddressSpace());
+
+ // Okay, we have a strided store "p[i]" of a loaded value. We can turn
+ // this into a memcpy/memmove in the loop preheader now if we want. However,
+ // this would be unsafe to do if there is anything else in the loop that may
+ // read or write the memory region we're storing to. For memcpy, this
+ // includes the load that feeds the stores. Check for an alias by generating
+ // the base address and checking everything.
+ Value *StoreBasePtr = Expander.expandCodeFor(StoreEv->getStart(),
+ Builder.getInt8PtrTy(SI->getPointerAddressSpace()), ExpPt);
+ Value *LoadBasePtr = nullptr;
+
+ bool Overlap = false;
+ bool DestVolatile = SI->isVolatile();
+ Type *BECountTy = BECount->getType();
+
+ if (DestVolatile) {
+ // The trip count must fit in i32, since it is the type of the "num_words"
+ // argument to hexagon_memcpy_forward_vp4cp4n2.
+ if (StoreSize != 4 || DL->getTypeSizeInBits(BECountTy) > 32) {
+CleanupAndExit:
+ // If we generated new code for the base pointer, clean up.
+ Expander.clear();
+ if (StoreBasePtr && (LoadBasePtr != StoreBasePtr)) {
+ RecursivelyDeleteTriviallyDeadInstructions(StoreBasePtr, TLI);
+ StoreBasePtr = nullptr;
+ }
+ if (LoadBasePtr) {
+ RecursivelyDeleteTriviallyDeadInstructions(LoadBasePtr, TLI);
+ LoadBasePtr = nullptr;
+ }
+ return false;
+ }
+ }
+
+ SmallPtrSet<Instruction*, 2> Ignore1;
+ Ignore1.insert(SI);
+ if (mayLoopAccessLocation(StoreBasePtr, MRI_ModRef, CurLoop, BECount,
+ StoreSize, *AA, Ignore1)) {
+ // Check if the load is the offending instruction.
+ Ignore1.insert(LI);
+ if (mayLoopAccessLocation(StoreBasePtr, MRI_ModRef, CurLoop, BECount,
+ StoreSize, *AA, Ignore1)) {
+ // Still bad. Nothing we can do.
+ goto CleanupAndExit;
+ }
+ // It worked with the load ignored.
+ Overlap = true;
+ }
+
+ if (!Overlap) {
+ if (DisableMemcpyIdiom || !HasMemcpy)
+ goto CleanupAndExit;
+ } else {
+ // Don't generate memmove if this function will be inlined. This is
+ // because the caller will undergo this transformation after inlining.
+ Function *Func = CurLoop->getHeader()->getParent();
+ if (Func->hasFnAttribute(Attribute::AlwaysInline))
+ goto CleanupAndExit;
+
+ // In case of a memmove, the call to memmove will be executed instead
+ // of the loop, so we need to make sure that there is nothing else in
+ // the loop other than the load, the store, and the instructions that
+ // these two depend on.
+ SmallVector<Instruction*,2> Insts;
+ Insts.push_back(SI);
+ Insts.push_back(LI);
+ if (!coverLoop(CurLoop, Insts))
+ goto CleanupAndExit;
+
+ if (DisableMemmoveIdiom || !HasMemmove)
+ goto CleanupAndExit;
+ bool IsNested = CurLoop->getParentLoop() != nullptr;
+ if (IsNested && OnlyNonNestedMemmove)
+ goto CleanupAndExit;
+ }
+
+ // For a memcpy, we have to make sure that the input array is not being
+ // mutated by the loop.
+ LoadBasePtr = Expander.expandCodeFor(LoadEv->getStart(),
+ Builder.getInt8PtrTy(LI->getPointerAddressSpace()), ExpPt);
+
+ SmallPtrSet<Instruction*, 2> Ignore2;
+ Ignore2.insert(SI);
+ if (mayLoopAccessLocation(LoadBasePtr, MRI_Mod, CurLoop, BECount, StoreSize,
+ *AA, Ignore2))
+ goto CleanupAndExit;
+
+ // Check the stride.
+ bool StridePos = getSCEVStride(LoadEv) >= 0;
+
+ // Currently, the volatile memcpy only emulates traversing memory forward.
+ if (!StridePos && DestVolatile)
+ goto CleanupAndExit;
+
+ bool RuntimeCheck = (Overlap || DestVolatile);
+
+ BasicBlock *ExitB;
+ if (RuntimeCheck) {
+ // The runtime check needs a single exit block.
+ SmallVector<BasicBlock*, 8> ExitBlocks;
+ CurLoop->getUniqueExitBlocks(ExitBlocks);
+ if (ExitBlocks.size() != 1)
+ goto CleanupAndExit;
+ ExitB = ExitBlocks[0];
+ }
+
+ // The # stored bytes is (BECount+1)*Size. Expand the trip count out to
+ // pointer size if it isn't already.
+ LLVMContext &Ctx = SI->getContext();
+ BECount = SE->getTruncateOrZeroExtend(BECount, IntPtrTy);
+ unsigned Alignment = std::min(SI->getAlignment(), LI->getAlignment());
+ DebugLoc DLoc = SI->getDebugLoc();
+
+ const SCEV *NumBytesS =
+ SE->getAddExpr(BECount, SE->getOne(IntPtrTy), SCEV::FlagNUW);
+ if (StoreSize != 1)
+ NumBytesS = SE->getMulExpr(NumBytesS, SE->getConstant(IntPtrTy, StoreSize),
+ SCEV::FlagNUW);
+ Value *NumBytes = Expander.expandCodeFor(NumBytesS, IntPtrTy, ExpPt);
+ if (Instruction *In = dyn_cast<Instruction>(NumBytes))
+ if (Value *Simp = SimplifyInstruction(In, *DL, TLI, DT))
+ NumBytes = Simp;
+
+ CallInst *NewCall;
+
+ if (RuntimeCheck) {
+ unsigned Threshold = RuntimeMemSizeThreshold;
+ if (ConstantInt *CI = dyn_cast<ConstantInt>(NumBytes)) {
+ uint64_t C = CI->getZExtValue();
+ if (Threshold != 0 && C < Threshold)
+ goto CleanupAndExit;
+ if (C < CompileTimeMemSizeThreshold)
+ goto CleanupAndExit;
+ }
+
+ BasicBlock *Header = CurLoop->getHeader();
+ Function *Func = Header->getParent();
+ Loop *ParentL = LF->getLoopFor(Preheader);
+ StringRef HeaderName = Header->getName();
+
+ // Create a new (empty) preheader, and update the PHI nodes in the
+ // header to use the new preheader.
+ BasicBlock *NewPreheader = BasicBlock::Create(Ctx, HeaderName+".rtli.ph",
+ Func, Header);
+ if (ParentL)
+ ParentL->addBasicBlockToLoop(NewPreheader, *LF);
+ IRBuilder<>(NewPreheader).CreateBr(Header);
+ for (auto &In : *Header) {
+ PHINode *PN = dyn_cast<PHINode>(&In);
+ if (!PN)
+ break;
+ int bx = PN->getBasicBlockIndex(Preheader);
+ if (bx >= 0)
+ PN->setIncomingBlock(bx, NewPreheader);
+ }
+ DT->addNewBlock(NewPreheader, Preheader);
+ DT->changeImmediateDominator(Header, NewPreheader);
+
+ // Check for safe conditions to execute memmove.
+ // If stride is positive, copying things from higher to lower addresses
+ // is equivalent to memmove. For negative stride, it's the other way
+ // around. Copying forward in memory with positive stride may not be
+ // the same as memmove, since we may be copying values that we just
+ // stored in some previous iteration.
+ Value *LA = Builder.CreatePtrToInt(LoadBasePtr, IntPtrTy);
+ Value *SA = Builder.CreatePtrToInt(StoreBasePtr, IntPtrTy);
+ Value *LowA = StridePos ? SA : LA;
+ Value *HighA = StridePos ? LA : SA;
+ Value *CmpA = Builder.CreateICmpULT(LowA, HighA);
+ Value *Cond = CmpA;
+
+ // Check for distance between pointers.
+ Value *Dist = Builder.CreateSub(HighA, LowA);
+ Value *CmpD = Builder.CreateICmpSLT(NumBytes, Dist);
+ Value *CmpEither = Builder.CreateOr(Cond, CmpD);
+ Cond = CmpEither;
+
+ if (Threshold != 0) {
+ Type *Ty = NumBytes->getType();
+ Value *Thr = ConstantInt::get(Ty, Threshold);
+ Value *CmpB = Builder.CreateICmpULT(Thr, NumBytes);
+ Value *CmpBoth = Builder.CreateAnd(Cond, CmpB);
+ Cond = CmpBoth;
+ }
+ BasicBlock *MemmoveB = BasicBlock::Create(Ctx, Header->getName()+".rtli",
+ Func, NewPreheader);
+ if (ParentL)
+ ParentL->addBasicBlockToLoop(MemmoveB, *LF);
+ Instruction *OldT = Preheader->getTerminator();
+ Builder.CreateCondBr(Cond, MemmoveB, NewPreheader);
+ OldT->eraseFromParent();
+ Preheader->setName(Preheader->getName()+".old");
+ DT->addNewBlock(MemmoveB, Preheader);
+ // Find the new immediate dominator of the exit block.
+ BasicBlock *ExitD = Preheader;
+ for (auto PI = pred_begin(ExitB), PE = pred_end(ExitB); PI != PE; ++PI) {
+ BasicBlock *PB = *PI;
+ ExitD = DT->findNearestCommonDominator(ExitD, PB);
+ if (!ExitD)
+ break;
+ }
+ // If the prior immediate dominator of ExitB was dominated by the
+ // old preheader, then the old preheader becomes the new immediate
+ // dominator. Otherwise don't change anything (because the newly
+ // added blocks are dominated by the old preheader).
+ if (ExitD && DT->dominates(Preheader, ExitD)) {
+ DomTreeNode *BN = DT->getNode(ExitB);
+ DomTreeNode *DN = DT->getNode(ExitD);
+ BN->setIDom(DN);
+ }
+
+ // Add a call to memmove to the conditional block.
+ IRBuilder<> CondBuilder(MemmoveB);
+ CondBuilder.CreateBr(ExitB);
+ CondBuilder.SetInsertPoint(MemmoveB->getTerminator());
+
+ if (DestVolatile) {
+ Type *Int32Ty = Type::getInt32Ty(Ctx);
+ Type *Int32PtrTy = Type::getInt32PtrTy(Ctx);
+ Type *VoidTy = Type::getVoidTy(Ctx);
+ Module *M = Func->getParent();
+ Constant *CF = M->getOrInsertFunction(HexagonVolatileMemcpyName, VoidTy,
+ Int32PtrTy, Int32PtrTy, Int32Ty);
+ Function *Fn = cast<Function>(CF);
+ Fn->setLinkage(Function::ExternalLinkage);
+
+ const SCEV *OneS = SE->getConstant(Int32Ty, 1);
+ const SCEV *BECount32 = SE->getTruncateOrZeroExtend(BECount, Int32Ty);
+ const SCEV *NumWordsS = SE->getAddExpr(BECount32, OneS, SCEV::FlagNUW);
+ Value *NumWords = Expander.expandCodeFor(NumWordsS, Int32Ty,
+ MemmoveB->getTerminator());
+ if (Instruction *In = dyn_cast<Instruction>(NumWords))
+ if (Value *Simp = SimplifyInstruction(In, *DL, TLI, DT))
+ NumWords = Simp;
+
+ Value *Op0 = (StoreBasePtr->getType() == Int32PtrTy)
+ ? StoreBasePtr
+ : CondBuilder.CreateBitCast(StoreBasePtr, Int32PtrTy);
+ Value *Op1 = (LoadBasePtr->getType() == Int32PtrTy)
+ ? LoadBasePtr
+ : CondBuilder.CreateBitCast(LoadBasePtr, Int32PtrTy);
+ NewCall = CondBuilder.CreateCall(Fn, {Op0, Op1, NumWords});
+ } else {
+ NewCall = CondBuilder.CreateMemMove(StoreBasePtr, LoadBasePtr,
+ NumBytes, Alignment);
+ }
+ } else {
+ NewCall = Builder.CreateMemCpy(StoreBasePtr, LoadBasePtr,
+ NumBytes, Alignment);
+ // Okay, the memcpy has been formed. Zap the original store and
+ // anything that feeds into it.
+ RecursivelyDeleteTriviallyDeadInstructions(SI, TLI);
+ }
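+
+ // Editor's summary (illustrative): three outcomes are possible above: a
+ // plain memcpy when the regions provably do not overlap, a runtime-guarded
+ // memmove when they might, and a call to the Hexagon-specific volatile
+ // copy routine (which takes an i32 word count) for volatile destinations.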
+
+ NewCall->setDebugLoc(DLoc);
+
+ DEBUG(dbgs() << " Formed " << (Overlap ? "memmove: " : "memcpy: ")
+ << *NewCall << "\n"
+ << " from load ptr=" << *LoadEv << " at: " << *LI << "\n"
+ << " from store ptr=" << *StoreEv << " at: " << *SI << "\n");
+
+ return true;
+}
+
+
+// \brief Check if the instructions in Insts, together with their dependencies,
+// cover the loop in the sense that the loop could be safely eliminated once
+// the instructions in Insts are removed.
+bool HexagonLoopIdiomRecognize::coverLoop(Loop *L,
+ SmallVectorImpl<Instruction*> &Insts) const {
+ SmallSet<BasicBlock*,8> LoopBlocks;
+ for (auto *B : L->blocks())
+ LoopBlocks.insert(B);
+
+ SetVector<Instruction*> Worklist(Insts.begin(), Insts.end());
+
+ // Collect all instructions from the loop that the instructions in Insts
+ // depend on (plus their dependencies, etc.). These instructions will
+ // constitute the expression trees that feed those in Insts, but the trees
+ // will be limited only to instructions contained in the loop.
+ for (unsigned i = 0; i < Worklist.size(); ++i) {
+ Instruction *In = Worklist[i];
+ for (auto I = In->op_begin(), E = In->op_end(); I != E; ++I) {
+ Instruction *OpI = dyn_cast<Instruction>(I);
+ if (!OpI)
+ continue;
+ BasicBlock *PB = OpI->getParent();
+ if (!LoopBlocks.count(PB))
+ continue;
+ Worklist.insert(OpI);
+ }
+ }
+
+ // Scan all instructions in the loop, if any of them have a user outside
+ // of the loop, or outside of the expressions collected above, then either
+ // the loop has a side-effect visible outside of it, or there are
+ // instructions in it that are not involved in the original set Insts.
+ for (auto *B : L->blocks()) {
+ for (auto &In : *B) {
+ if (isa<BranchInst>(In) || isa<DbgInfoIntrinsic>(In))
+ continue;
+ if (!Worklist.count(&In) && In.mayHaveSideEffects())
+ return false;
+ for (const auto &K : In.users()) {
+ Instruction *UseI = dyn_cast<Instruction>(K);
+ if (!UseI)
+ continue;
+ BasicBlock *UseB = UseI->getParent();
+ if (LF->getLoopFor(UseB) != L)
+ return false;
+ }
+ }
+ }
+
+ return true;
+}
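+
+// Editor's example (illustrative): for a loop body
+//   a[i] = b[i]; ++count;
+// coverLoop(L, {store, load}) returns false, because the update of "count"
+// is an effect not covered by the copy; a loop containing only the copy and
+// its address arithmetic is fully covered and can be deleted once the copy
+// is replaced by a single call.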
+
+/// runOnLoopBlock - Process the specified block, which lives in a counted loop
+/// with the specified backedge count. This block is known to be in the current
+/// loop and not in any subloops.
+bool HexagonLoopIdiomRecognize::runOnLoopBlock(Loop *CurLoop, BasicBlock *BB,
+ const SCEV *BECount, SmallVectorImpl<BasicBlock*> &ExitBlocks) {
+ // We can only promote stores in this block if they are unconditionally
+ // executed in the loop. For a block to be unconditionally executed, it has
+ // to dominate all the exit blocks of the loop. Verify this now.
+ auto DominatedByBB = [this,BB] (BasicBlock *EB) -> bool {
+ return DT->dominates(BB, EB);
+ };
+ if (!std::all_of(ExitBlocks.begin(), ExitBlocks.end(), DominatedByBB))
+ return false;
+
+ bool MadeChange = false;
+ // Look for store instructions, which may be optimized to memset/memcpy.
+ SmallVector<StoreInst*,8> Stores;
+ collectStores(CurLoop, BB, Stores);
+
+ // Optimize the store into a memcpy, if it feeds a similarly strided load.
+ for (auto &SI : Stores)
+ MadeChange |= processCopyingStore(CurLoop, SI, BECount);
+
+ return MadeChange;
+}
+
+
+bool HexagonLoopIdiomRecognize::runOnCountableLoop(Loop *L) {
+ PolynomialMultiplyRecognize PMR(L, *DL, *DT, *TLI, *SE);
+ if (PMR.recognize())
+ return true;
+
+ if (!HasMemcpy && !HasMemmove)
+ return false;
+
+ const SCEV *BECount = SE->getBackedgeTakenCount(L);
+ assert(!isa<SCEVCouldNotCompute>(BECount) &&
+ "runOnCountableLoop() called on a loop without a predictable"
+ "backedge-taken count");
+
+ SmallVector<BasicBlock *, 8> ExitBlocks;
+ L->getUniqueExitBlocks(ExitBlocks);
+
+ bool Changed = false;
+
+ // Scan all the blocks in the loop that are not in subloops.
+ for (auto *BB : L->getBlocks()) {
+ // Ignore blocks in subloops.
+ if (LF->getLoopFor(BB) != L)
+ continue;
+ Changed |= runOnLoopBlock(L, BB, BECount, ExitBlocks);
+ }
+
+ return Changed;
+}
+
+
+bool HexagonLoopIdiomRecognize::runOnLoop(Loop *L, LPPassManager &LPM) {
+ const Module &M = *L->getHeader()->getParent()->getParent();
+ if (Triple(M.getTargetTriple()).getArch() != Triple::hexagon)
+ return false;
+
+ if (skipLoop(L))
+ return false;
+
+ // If the loop could not be converted to canonical form, it must have an
+ // indirectbr in it; just give up.
+ if (!L->getLoopPreheader())
+ return false;
+
+ // Disable loop idiom recognition if the function's name is a common idiom.
+ StringRef Name = L->getHeader()->getParent()->getName();
+ if (Name == "memset" || Name == "memcpy" || Name == "memmove")
+ return false;
+
+ AA = &getAnalysis<AAResultsWrapperPass>().getAAResults();
+ DL = &L->getHeader()->getModule()->getDataLayout();
+ DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree();
+ LF = &getAnalysis<LoopInfoWrapperPass>().getLoopInfo();
+ TLI = &getAnalysis<TargetLibraryInfoWrapperPass>().getTLI();
+ SE = &getAnalysis<ScalarEvolutionWrapperPass>().getSE();
+
+ HasMemcpy = TLI->has(LibFunc_memcpy);
+ HasMemmove = TLI->has(LibFunc_memmove);
+
+ if (SE->hasLoopInvariantBackedgeTakenCount(L))
+ return runOnCountableLoop(L);
+ return false;
+}
+
+
+Pass *llvm::createHexagonLoopIdiomPass() {
+ return new HexagonLoopIdiomRecognize();
+}
+
diff --git a/lib/Target/Hexagon/HexagonMCInstLower.cpp b/lib/Target/Hexagon/HexagonMCInstLower.cpp
index a5dc002642c8..7189b5a52c42 100644
--- a/lib/Target/Hexagon/HexagonMCInstLower.cpp
+++ b/lib/Target/Hexagon/HexagonMCInstLower.cpp
@@ -109,11 +109,14 @@ void llvm::HexagonLowerToMC(const MCInstrInfo &MCII, const MachineInstr *MI,
switch (MO.getType()) {
default:
- MI->dump();
+ MI->print(errs());
llvm_unreachable("unknown operand type");
+ case MachineOperand::MO_RegisterMask:
+ continue;
case MachineOperand::MO_Register:
// Ignore all implicit register operands.
- if (MO.isImplicit()) continue;
+ if (MO.isImplicit())
+ continue;
MCO = MCOperand::createReg(MO.getReg());
break;
case MachineOperand::MO_FPImmediate: {
diff --git a/lib/Target/Hexagon/HexagonMachineScheduler.cpp b/lib/Target/Hexagon/HexagonMachineScheduler.cpp
index 9ff9d93ea0c3..20dc9b0da1db 100644
--- a/lib/Target/Hexagon/HexagonMachineScheduler.cpp
+++ b/lib/Target/Hexagon/HexagonMachineScheduler.cpp
@@ -74,7 +74,9 @@ bool HexagonCallMutation::shouldTFRICallBind(const HexagonInstrInfo &HII,
return false;
// These instruction types (formerly TypeXTYPE) are 64-bit operations.
- if (HII.getType(*Inst2.getInstr()) == HexagonII::TypeXTYPE)
+ unsigned Type = HII.getType(*Inst2.getInstr());
+ if (Type == HexagonII::TypeS_2op || Type == HexagonII::TypeS_3op ||
+ Type == HexagonII::TypeALU64 || Type == HexagonII::TypeM)
return true;
return false;
}
diff --git a/lib/Target/Hexagon/HexagonMapAsm2IntrinV62.gen.td b/lib/Target/Hexagon/HexagonMapAsm2IntrinV62.gen.td
new file mode 100644
index 000000000000..0b4ac14c7a47
--- /dev/null
+++ b/lib/Target/Hexagon/HexagonMapAsm2IntrinV62.gen.td
@@ -0,0 +1,204 @@
+//===--- HexagonMapAsm2IntrinV62.gen.td -----------------------------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+multiclass T_VR_HVX_gen_pat <InstHexagon MI, Intrinsic IntID> {
+ def: Pat<(IntID VectorRegs:$src1, IntRegs:$src2),
+ (MI VectorRegs:$src1, IntRegs:$src2)>,
+ Requires<[UseHVXSgl]>;
+ def: Pat<(!cast<Intrinsic>(IntID#"_128B") VectorRegs128B:$src1, IntRegs:$src2),
+ (!cast<InstHexagon>(MI#"_128B") VectorRegs128B:$src1, IntRegs:$src2)>,
+ Requires<[UseHVXDbl]>;
+}
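+
+// Editor's note (illustrative): each multiclass in this file emits two
+// selection patterns per intrinsic: one for single (64-byte) HVX mode and
+// one for double (128-byte) mode, obtained by appending "_128B" to both the
+// intrinsic and the instruction name. E.g. T_VR_HVX_gen_pat<V6_vlsrb,
+// int_hexagon_V6_vlsrb> also covers int_hexagon_V6_vlsrb_128B via
+// V6_vlsrb_128B.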
+
+multiclass T_VVL_HVX_gen_pat <InstHexagon MI, Intrinsic IntID> {
+ def: Pat<(IntID VectorRegs:$src1, VectorRegs:$src2, IntRegsLow8:$src3),
+ (MI VectorRegs:$src1, VectorRegs:$src2, IntRegsLow8:$src3)>,
+ Requires<[UseHVXSgl]>;
+ def: Pat<(!cast<Intrinsic>(IntID#"_128B") VectorRegs128B:$src1, VectorRegs128B:$src2, IntRegsLow8:$src3),
+ (!cast<InstHexagon>(MI#"_128B") VectorRegs128B:$src1, VectorRegs128B:$src2, IntRegsLow8:$src3)>,
+ Requires<[UseHVXDbl]>;
+}
+
+multiclass T_VV_HVX_gen_pat <InstHexagon MI, Intrinsic IntID> {
+ def: Pat<(IntID VectorRegs:$src1, VectorRegs:$src2),
+ (MI VectorRegs:$src1, VectorRegs:$src2)>,
+ Requires<[UseHVXSgl]>;
+ def: Pat<(!cast<Intrinsic>(IntID#"_128B") VectorRegs128B:$src1, VectorRegs128B:$src2),
+ (!cast<InstHexagon>(MI#"_128B") VectorRegs128B:$src1, VectorRegs128B:$src2)>,
+ Requires<[UseHVXDbl]>;
+}
+
+multiclass T_WW_HVX_gen_pat <InstHexagon MI, Intrinsic IntID> {
+ def: Pat<(IntID VecDblRegs:$src1, VecDblRegs:$src2),
+ (MI VecDblRegs:$src1, VecDblRegs:$src2)>,
+ Requires<[UseHVXSgl]>;
+ def: Pat<(!cast<Intrinsic>(IntID#"_128B") VecDblRegs128B:$src1, VecDblRegs128B:$src2),
+ (!cast<InstHexagon>(MI#"_128B") VecDblRegs128B:$src1, VecDblRegs128B:$src2)>,
+ Requires<[UseHVXDbl]>;
+}
+
+multiclass T_WVV_HVX_gen_pat <InstHexagon MI, Intrinsic IntID> {
+ def: Pat<(IntID VecDblRegs:$src1, VectorRegs:$src2, VectorRegs:$src3),
+ (MI VecDblRegs:$src1, VectorRegs:$src2, VectorRegs:$src3)>,
+ Requires<[UseHVXSgl]>;
+ def: Pat<(!cast<Intrinsic>(IntID#"_128B") VecDblRegs128B:$src1, VectorRegs128B:$src2, VectorRegs128B:$src3),
+ (!cast<InstHexagon>(MI#"_128B") VecDblRegs128B:$src1, VectorRegs128B:$src2, VectorRegs128B:$src3)>,
+ Requires<[UseHVXDbl]>;
+}
+
+multiclass T_WR_HVX_gen_pat <InstHexagon MI, Intrinsic IntID> {
+ def: Pat<(IntID VecDblRegs:$src1, IntRegs:$src2),
+ (MI VecDblRegs:$src1, IntRegs:$src2)>,
+ Requires<[UseHVXSgl]>;
+ def: Pat<(!cast<Intrinsic>(IntID#"_128B") VecDblRegs128B:$src1, IntRegs:$src2),
+ (!cast<InstHexagon>(MI#"_128B") VecDblRegs128B:$src1, IntRegs:$src2)>,
+ Requires<[UseHVXDbl]>;
+}
+
+multiclass T_WWR_HVX_gen_pat <InstHexagon MI, Intrinsic IntID> {
+ def: Pat<(IntID VecDblRegs:$src1, VecDblRegs:$src2, IntRegs:$src3),
+ (MI VecDblRegs:$src1, VecDblRegs:$src2, IntRegs:$src3)>,
+ Requires<[UseHVXSgl]>;
+ def: Pat<(!cast<Intrinsic>(IntID#"_128B") VecDblRegs128B:$src1, VecDblRegs128B:$src2, IntRegs:$src3),
+ (!cast<InstHexagon>(MI#"_128B") VecDblRegs128B:$src1, VecDblRegs128B:$src2, IntRegs:$src3)>,
+ Requires<[UseHVXDbl]>;
+}
+
+multiclass T_VVR_HVX_gen_pat <InstHexagon MI, Intrinsic IntID> {
+ def: Pat<(IntID VectorRegs:$src1, VectorRegs:$src2, IntRegs:$src3),
+ (MI VectorRegs:$src1, VectorRegs:$src2, IntRegs:$src3)>,
+ Requires<[UseHVXSgl]>;
+ def: Pat<(!cast<Intrinsic>(IntID#"_128B") VectorRegs128B:$src1, VectorRegs128B:$src2, IntRegs:$src3),
+ (!cast<InstHexagon>(MI#"_128B") VectorRegs128B:$src1, VectorRegs128B:$src2, IntRegs:$src3)>,
+ Requires<[UseHVXDbl]>;
+}
+
+multiclass T_ZR_HVX_gen_pat <InstHexagon MI, Intrinsic IntID> {
+ def: Pat<(IntID VecPredRegs:$src1, IntRegs:$src2),
+ (MI VecPredRegs:$src1, IntRegs:$src2)>,
+ Requires<[UseHVXSgl]>;
+ def: Pat<(!cast<Intrinsic>(IntID#"_128B") VecPredRegs128B:$src1, IntRegs:$src2),
+ (!cast<InstHexagon>(MI#"_128B") VecPredRegs128B:$src1, IntRegs:$src2)>,
+ Requires<[UseHVXDbl]>;
+}
+
+multiclass T_VZR_HVX_gen_pat <InstHexagon MI, Intrinsic IntID> {
+ def: Pat<(IntID VectorRegs:$src1, VecPredRegs:$src2, IntRegs:$src3),
+ (MI VectorRegs:$src1, VecPredRegs:$src2, IntRegs:$src3)>,
+ Requires<[UseHVXSgl]>;
+ def: Pat<(!cast<Intrinsic>(IntID#"_128B") VectorRegs128B:$src1, VecPredRegs128B:$src2, IntRegs:$src3),
+ (!cast<InstHexagon>(MI#"_128B") VectorRegs128B:$src1, VecPredRegs128B:$src2, IntRegs:$src3)>,
+ Requires<[UseHVXDbl]>;
+}
+
+multiclass T_ZV_HVX_gen_pat <InstHexagon MI, Intrinsic IntID> {
+ def: Pat<(IntID VecPredRegs:$src1, VectorRegs:$src2),
+ (MI VecPredRegs:$src1, VectorRegs:$src2)>,
+ Requires<[UseHVXSgl]>;
+ def: Pat<(!cast<Intrinsic>(IntID#"_128B") VecPredRegs128B:$src1, VectorRegs128B:$src2),
+ (!cast<InstHexagon>(MI#"_128B") VecPredRegs128B:$src1, VectorRegs128B:$src2)>,
+ Requires<[UseHVXDbl]>;
+}
+
+multiclass T_R_HVX_gen_pat <InstHexagon MI, Intrinsic IntID> {
+ def: Pat<(IntID IntRegs:$src1),
+ (MI IntRegs:$src1)>,
+ Requires<[UseHVXSgl]>;
+ def: Pat<(!cast<Intrinsic>(IntID#"_128B") IntRegs:$src1),
+ (!cast<InstHexagon>(MI#"_128B") IntRegs:$src1)>,
+ Requires<[UseHVXDbl]>;
+}
+
+multiclass T_ZZ_HVX_gen_pat <InstHexagon MI, Intrinsic IntID> {
+ def: Pat<(IntID VecPredRegs:$src1, VecPredRegs:$src2),
+ (MI VecPredRegs:$src1, VecPredRegs:$src2)>,
+ Requires<[UseHVXSgl]>;
+ def: Pat<(!cast<Intrinsic>(IntID#"_128B") VecPredRegs128B:$src1, VecPredRegs128B:$src2),
+ (!cast<InstHexagon>(MI#"_128B") VecPredRegs128B:$src1, VecPredRegs128B:$src2)>,
+ Requires<[UseHVXDbl]>;
+}
+
+multiclass T_VVI_HVX_gen_pat <InstHexagon MI, Intrinsic IntID> {
+ def: Pat<(IntID VectorRegs:$src1, VectorRegs:$src2, imm:$src3),
+ (MI VectorRegs:$src1, VectorRegs:$src2, imm:$src3)>,
+ Requires<[UseHVXSgl]>;
+ def: Pat<(!cast<Intrinsic>(IntID#"_128B") VectorRegs128B:$src1, VectorRegs128B:$src2, imm:$src3),
+ (!cast<InstHexagon>(MI#"_128B") VectorRegs128B:$src1, VectorRegs128B:$src2, imm:$src3)>,
+ Requires<[UseHVXDbl]>;
+}
+
+multiclass T_VVVI_HVX_gen_pat <InstHexagon MI, Intrinsic IntID> {
+ def: Pat<(IntID VectorRegs:$src1, VectorRegs:$src2, VectorRegs:$src3, imm:$src4),
+ (MI VectorRegs:$src1, VectorRegs:$src2, VectorRegs:$src3, imm:$src4)>,
+ Requires<[UseHVXSgl]>;
+ def: Pat<(!cast<Intrinsic>(IntID#"_128B") VectorRegs128B:$src1, VectorRegs128B:$src2, VectorRegs128B:$src3, imm:$src4),
+ (!cast<InstHexagon>(MI#"_128B") VectorRegs128B:$src1, VectorRegs128B:$src2, VectorRegs128B:$src3, imm:$src4)>,
+ Requires<[UseHVXDbl]>;
+}
+
+multiclass T_WVVI_HVX_gen_pat <InstHexagon MI, Intrinsic IntID> {
+ def: Pat<(IntID VecDblRegs:$src1, VectorRegs:$src2, VectorRegs:$src3, imm:$src4),
+ (MI VecDblRegs:$src1, VectorRegs:$src2, VectorRegs:$src3, imm:$src4)>,
+ Requires<[UseHVXSgl]>;
+ def: Pat<(!cast<Intrinsic>(IntID#"_128B") VecDblRegs128B:$src1, VectorRegs128B:$src2, VectorRegs128B:$src3, imm:$src4),
+ (!cast<InstHexagon>(MI#"_128B") VecDblRegs128B:$src1, VectorRegs128B:$src2, VectorRegs128B:$src3, imm:$src4)>,
+ Requires<[UseHVXDbl]>;
+}
+
+def : T_R_pat <S6_vsplatrbp, int_hexagon_S6_vsplatrbp>;
+def : T_PP_pat <M6_vabsdiffb, int_hexagon_M6_vabsdiffb>;
+def : T_PP_pat <M6_vabsdiffub, int_hexagon_M6_vabsdiffub>;
+def : T_PP_pat <S6_vtrunehb_ppp, int_hexagon_S6_vtrunehb_ppp>;
+def : T_PP_pat <S6_vtrunohb_ppp, int_hexagon_S6_vtrunohb_ppp>;
+
+defm : T_VR_HVX_gen_pat <V6_vlsrb, int_hexagon_V6_vlsrb>;
+defm : T_VR_HVX_gen_pat <V6_vmpyiwub, int_hexagon_V6_vmpyiwub>;
+defm : T_VVL_HVX_gen_pat <V6_vasrwuhrndsat, int_hexagon_V6_vasrwuhrndsat>;
+defm : T_VVL_HVX_gen_pat <V6_vasruwuhrndsat, int_hexagon_V6_vasruwuhrndsat>;
+defm : T_VVL_HVX_gen_pat <V6_vasrhbsat, int_hexagon_V6_vasrhbsat>;
+defm : T_VVL_HVX_gen_pat <V6_vlutvvb_nm, int_hexagon_V6_vlutvvb_nm>;
+defm : T_VVL_HVX_gen_pat <V6_vlutvwh_nm, int_hexagon_V6_vlutvwh_nm>;
+defm : T_VV_HVX_gen_pat <V6_vrounduwuh, int_hexagon_V6_vrounduwuh>;
+defm : T_VV_HVX_gen_pat <V6_vrounduhub, int_hexagon_V6_vrounduhub>;
+defm : T_VV_HVX_gen_pat <V6_vadduwsat, int_hexagon_V6_vadduwsat>;
+defm : T_VV_HVX_gen_pat <V6_vsubuwsat, int_hexagon_V6_vsubuwsat>;
+defm : T_VV_HVX_gen_pat <V6_vaddbsat, int_hexagon_V6_vaddbsat>;
+defm : T_VV_HVX_gen_pat <V6_vsubbsat, int_hexagon_V6_vsubbsat>;
+defm : T_VV_HVX_gen_pat <V6_vaddububb_sat, int_hexagon_V6_vaddububb_sat>;
+defm : T_VV_HVX_gen_pat <V6_vsubububb_sat, int_hexagon_V6_vsubububb_sat>;
+defm : T_VV_HVX_gen_pat <V6_vmpyewuh_64, int_hexagon_V6_vmpyewuh_64>;
+defm : T_VV_HVX_gen_pat <V6_vmaxb, int_hexagon_V6_vmaxb>;
+defm : T_VV_HVX_gen_pat <V6_vminb, int_hexagon_V6_vminb>;
+defm : T_VV_HVX_gen_pat <V6_vsatuwuh, int_hexagon_V6_vsatuwuh>;
+defm : T_VV_HVX_gen_pat <V6_vaddclbw, int_hexagon_V6_vaddclbw>;
+defm : T_VV_HVX_gen_pat <V6_vaddclbh, int_hexagon_V6_vaddclbh>;
+defm : T_WW_HVX_gen_pat <V6_vadduwsat_dv, int_hexagon_V6_vadduwsat_dv>;
+defm : T_WW_HVX_gen_pat <V6_vsubuwsat_dv, int_hexagon_V6_vsubuwsat_dv>;
+defm : T_WW_HVX_gen_pat <V6_vaddbsat_dv, int_hexagon_V6_vaddbsat_dv>;
+defm : T_WW_HVX_gen_pat <V6_vsubbsat_dv, int_hexagon_V6_vsubbsat_dv>;
+defm : T_WVV_HVX_gen_pat <V6_vaddhw_acc, int_hexagon_V6_vaddhw_acc>;
+defm : T_WVV_HVX_gen_pat <V6_vadduhw_acc, int_hexagon_V6_vadduhw_acc>;
+defm : T_WVV_HVX_gen_pat <V6_vaddubh_acc, int_hexagon_V6_vaddubh_acc>;
+defm : T_WVV_HVX_gen_pat <V6_vmpyowh_64_acc, int_hexagon_V6_vmpyowh_64_acc>;
+defm : T_WR_HVX_gen_pat <V6_vmpauhb, int_hexagon_V6_vmpauhb>;
+defm : T_WWR_HVX_gen_pat <V6_vmpauhb_acc, int_hexagon_V6_vmpauhb_acc>;
+defm : T_VVR_HVX_gen_pat <V6_vmpyiwub_acc, int_hexagon_V6_vmpyiwub_acc>;
+defm : T_ZR_HVX_gen_pat <V6_vandnqrt, int_hexagon_V6_vandnqrt>;
+defm : T_VZR_HVX_gen_pat <V6_vandnqrt_acc, int_hexagon_V6_vandnqrt_acc>;
+defm : T_ZV_HVX_gen_pat <V6_vandvqv, int_hexagon_V6_vandvqv>;
+defm : T_ZV_HVX_gen_pat <V6_vandvnqv, int_hexagon_V6_vandvnqv>;
+defm : T_R_HVX_gen_pat <V6_pred_scalar2v2, int_hexagon_V6_pred_scalar2v2>;
+defm : T_R_HVX_gen_pat <V6_lvsplath, int_hexagon_V6_lvsplath>;
+defm : T_R_HVX_gen_pat <V6_lvsplatb, int_hexagon_V6_lvsplatb>;
+defm : T_ZZ_HVX_gen_pat <V6_shuffeqw, int_hexagon_V6_shuffeqw>;
+defm : T_ZZ_HVX_gen_pat <V6_shuffeqh, int_hexagon_V6_shuffeqh>;
+defm : T_VVI_HVX_gen_pat <V6_vlutvvbi, int_hexagon_V6_vlutvvbi>;
+defm : T_VVI_HVX_gen_pat <V6_vlutvwhi, int_hexagon_V6_vlutvwhi>;
+defm : T_VVVI_HVX_gen_pat <V6_vlutvvb_oracci, int_hexagon_V6_vlutvvb_oracci>;
+defm : T_WVVI_HVX_gen_pat <V6_vlutvwh_oracci, int_hexagon_V6_vlutvwh_oracci>;
diff --git a/lib/Target/Hexagon/HexagonNewValueJump.cpp b/lib/Target/Hexagon/HexagonNewValueJump.cpp
index 72d8011277e6..d73fc7c73185 100644
--- a/lib/Target/Hexagon/HexagonNewValueJump.cpp
+++ b/lib/Target/Hexagon/HexagonNewValueJump.cpp
@@ -130,6 +130,8 @@ static bool canBeFeederToNewValueJump(const HexagonInstrInfo *QII,
if (II->getOpcode() == TargetOpcode::KILL)
return false;
+ if (II->isImplicitDef())
+ return false;
// Make sure there is no 'def' or 'use' of any of the uses of the
// feeder insn between its definition, this MI and the jump, jmpInst
diff --git a/lib/Target/Hexagon/HexagonOperands.td b/lib/Target/Hexagon/HexagonOperands.td
index 983310571563..f87a1b8e424d 100644
--- a/lib/Target/Hexagon/HexagonOperands.td
+++ b/lib/Target/Hexagon/HexagonOperands.td
@@ -1,298 +1,33 @@
-//===- HexagonImmediates.td - Hexagon immediate processing -*- tablegen -*-===//
+//===--- HexagonOperands.td -----------------------------------------------===//
//
// The LLVM Compiler Infrastructure
//
-// This file is distributed under the University of Illnois Open Source
+// This file is distributed under the University of Illinois Open Source
// License. See LICENSE.TXT for details.
//
//===----------------------------------------------------------------------===//
-def s32_0ImmOperand : AsmOperandClass { let Name = "s32_0Imm"; }
-def s23_2ImmOperand : AsmOperandClass { let Name = "s23_2Imm"; }
-def s8_0ImmOperand : AsmOperandClass { let Name = "s8_0Imm"; }
-def s8_0Imm64Operand : AsmOperandClass { let Name = "s8_0Imm64"; }
-def s6_0ImmOperand : AsmOperandClass { let Name = "s6_0Imm"; }
-def s4_0ImmOperand : AsmOperandClass { let Name = "s4_0Imm"; }
-def s4_1ImmOperand : AsmOperandClass { let Name = "s4_1Imm"; }
-def s4_2ImmOperand : AsmOperandClass { let Name = "s4_2Imm"; }
-def s4_3ImmOperand : AsmOperandClass { let Name = "s4_3Imm"; }
-def s4_6ImmOperand : AsmOperandClass { let Name = "s4_6Imm"; }
-def s3_6ImmOperand : AsmOperandClass { let Name = "s3_6Imm"; }
-def u64_0ImmOperand : AsmOperandClass { let Name = "u64_0Imm"; }
-def u32_0ImmOperand : AsmOperandClass { let Name = "u32_0Imm"; }
-def u26_6ImmOperand : AsmOperandClass { let Name = "u26_6Imm"; }
-def u16_0ImmOperand : AsmOperandClass { let Name = "u16_0Imm"; }
-def u16_1ImmOperand : AsmOperandClass { let Name = "u16_1Imm"; }
-def u16_2ImmOperand : AsmOperandClass { let Name = "u16_2Imm"; }
-def u16_3ImmOperand : AsmOperandClass { let Name = "u16_3Imm"; }
-def u11_3ImmOperand : AsmOperandClass { let Name = "u11_3Imm"; }
-def u10_0ImmOperand : AsmOperandClass { let Name = "u10_0Imm"; }
-def u9_0ImmOperand : AsmOperandClass { let Name = "u9_0Imm"; }
-def u8_0ImmOperand : AsmOperandClass { let Name = "u8_0Imm"; }
-def u7_0ImmOperand : AsmOperandClass { let Name = "u7_0Imm"; }
-def u6_0ImmOperand : AsmOperandClass { let Name = "u6_0Imm"; }
-def u6_1ImmOperand : AsmOperandClass { let Name = "u6_1Imm"; }
-def u6_2ImmOperand : AsmOperandClass { let Name = "u6_2Imm"; }
-def u6_3ImmOperand : AsmOperandClass { let Name = "u6_3Imm"; }
-def u5_0ImmOperand : AsmOperandClass { let Name = "u5_0Imm"; }
-def u4_0ImmOperand : AsmOperandClass { let Name = "u4_0Imm"; }
-def u3_0ImmOperand : AsmOperandClass { let Name = "u3_0Imm"; }
-def u2_0ImmOperand : AsmOperandClass { let Name = "u2_0Imm"; }
-def u1_0ImmOperand : AsmOperandClass { let Name = "u1_0Imm"; }
-def n8_0ImmOperand : AsmOperandClass { let Name = "n8_0Imm"; }
-// Immediate operands.
-
-let OperandType = "OPERAND_IMMEDIATE",
- DecoderMethod = "unsignedImmDecoder" in {
- def s32_0Imm : Operand<i32> { let ParserMatchClass = s32_0ImmOperand;
- let DecoderMethod = "s32_0ImmDecoder"; }
- def s23_2Imm : Operand<i32> { let ParserMatchClass = s23_2ImmOperand; }
- def s8_0Imm : Operand<i32> { let ParserMatchClass = s8_0ImmOperand;
- let DecoderMethod = "s8_0ImmDecoder"; }
- def s8_0Imm64 : Operand<i64> { let ParserMatchClass = s8_0Imm64Operand;
- let DecoderMethod = "s8_0ImmDecoder"; }
- def s6_0Imm : Operand<i32> { let ParserMatchClass = s6_0ImmOperand;
- let DecoderMethod = "s6_0ImmDecoder"; }
- def s6_3Imm : Operand<i32>;
- def s4_0Imm : Operand<i32> { let ParserMatchClass = s4_0ImmOperand;
- let DecoderMethod = "s4_0ImmDecoder"; }
- def s4_1Imm : Operand<i32> { let ParserMatchClass = s4_1ImmOperand;
- let DecoderMethod = "s4_1ImmDecoder"; }
- def s4_2Imm : Operand<i32> { let ParserMatchClass = s4_2ImmOperand;
- let DecoderMethod = "s4_2ImmDecoder"; }
- def s4_3Imm : Operand<i32> { let ParserMatchClass = s4_3ImmOperand;
- let DecoderMethod = "s4_3ImmDecoder"; }
- def u64_0Imm : Operand<i64> { let ParserMatchClass = u64_0ImmOperand; }
- def u32_0Imm : Operand<i32> { let ParserMatchClass = u32_0ImmOperand; }
- def u26_6Imm : Operand<i32> { let ParserMatchClass = u26_6ImmOperand; }
- def u16_0Imm : Operand<i32> { let ParserMatchClass = u16_0ImmOperand; }
- def u16_1Imm : Operand<i32> { let ParserMatchClass = u16_1ImmOperand; }
- def u16_2Imm : Operand<i32> { let ParserMatchClass = u16_2ImmOperand; }
- def u16_3Imm : Operand<i32> { let ParserMatchClass = u16_3ImmOperand; }
- def u11_3Imm : Operand<i32> { let ParserMatchClass = u11_3ImmOperand; }
- def u10_0Imm : Operand<i32> { let ParserMatchClass = u10_0ImmOperand; }
- def u9_0Imm : Operand<i32> { let ParserMatchClass = u9_0ImmOperand; }
- def u8_0Imm : Operand<i32> { let ParserMatchClass = u8_0ImmOperand; }
- def u7_0Imm : Operand<i32> { let ParserMatchClass = u7_0ImmOperand; }
- def u6_0Imm : Operand<i32> { let ParserMatchClass = u6_0ImmOperand; }
- def u6_1Imm : Operand<i32> { let ParserMatchClass = u6_1ImmOperand; }
- def u6_2Imm : Operand<i32> { let ParserMatchClass = u6_2ImmOperand; }
- def u6_3Imm : Operand<i32> { let ParserMatchClass = u6_3ImmOperand; }
- def u5_0Imm : Operand<i32> { let ParserMatchClass = u5_0ImmOperand; }
- def u5_1Imm : Operand<i32>;
- def u5_2Imm : Operand<i32>;
- def u5_3Imm : Operand<i32>;
- def u4_0Imm : Operand<i32> { let ParserMatchClass = u4_0ImmOperand; }
- def u4_1Imm : Operand<i32>;
- def u4_2Imm : Operand<i32>;
- def u4_3Imm : Operand<i32>;
- def u3_0Imm : Operand<i32> { let ParserMatchClass = u3_0ImmOperand; }
- def u3_1Imm : Operand<i32>;
- def u3_2Imm : Operand<i32>;
- def u3_3Imm : Operand<i32>;
- def u2_0Imm : Operand<i32> { let ParserMatchClass = u2_0ImmOperand; }
- def u1_0Imm : Operand<i32> { let ParserMatchClass = u1_0ImmOperand; }
- def n8_0Imm : Operand<i32> { let ParserMatchClass = n8_0ImmOperand; }
-}
-
-let OperandType = "OPERAND_IMMEDIATE" in {
- def s4_6Imm : Operand<i32> { let ParserMatchClass = s4_6ImmOperand;
- let PrintMethod = "prints4_6ImmOperand";
- let DecoderMethod = "s4_6ImmDecoder";}
- def s4_7Imm : Operand<i32> { let PrintMethod = "prints4_7ImmOperand";
- let DecoderMethod = "s4_6ImmDecoder";}
- def s3_6Imm : Operand<i32> { let ParserMatchClass = s3_6ImmOperand;
- let PrintMethod = "prints3_6ImmOperand";
- let DecoderMethod = "s3_6ImmDecoder";}
- def s3_7Imm : Operand<i32> { let PrintMethod = "prints3_7ImmOperand";
- let DecoderMethod = "s3_6ImmDecoder";}
-}
-def n1ConstOperand : AsmOperandClass { let Name = "n1Const"; }
-def n1Const : Operand<i32> { let ParserMatchClass = n1ConstOperand; }
-
-//
-// Immediate predicates
-//
-def s32_0ImmPred : PatLeaf<(i32 imm), [{
+def f32ImmOperand : AsmOperandClass { let Name = "f32Imm"; }
+def f32Imm : Operand<f32> { let ParserMatchClass = f32ImmOperand; }
+def f64ImmOperand : AsmOperandClass { let Name = "f64Imm"; }
+def f64Imm : Operand<f64> { let ParserMatchClass = f64ImmOperand; }
+def s8_0Imm64Pred : PatLeaf<(i64 imm), [{ return isInt<8>(N->getSExtValue()); }]>;
+def s9_0ImmOperand : AsmOperandClass { let Name = "s9_0Imm"; }
+def s9_0Imm : Operand<i32> { let ParserMatchClass = s9_0ImmOperand; }
+def s23_2ImmOperand : AsmOperandClass { let Name = "s23_2Imm"; let RenderMethod = "addSignedImmOperands"; }
+def s23_2Imm : Operand<i32> { let ParserMatchClass = s23_2ImmOperand; }
+def r32_0ImmPred : PatLeaf<(i32 imm), [{
int64_t v = (int64_t)N->getSExtValue();
return isInt<32>(v);
}]>;
-
-def s31_1ImmPred : PatLeaf<(i32 imm), [{
- int64_t v = (int64_t)N->getSExtValue();
- return isShiftedInt<31,1>(v);
-}]>;
-
-def s30_2ImmPred : PatLeaf<(i32 imm), [{
- int64_t v = (int64_t)N->getSExtValue();
- return isShiftedInt<30,2>(v);
-}]>;
-
-def s29_3ImmPred : PatLeaf<(i32 imm), [{
- int64_t v = (int64_t)N->getSExtValue();
- return isShiftedInt<29,3>(v);
-}]>;
-
-def s10_0ImmPred : PatLeaf<(i32 imm), [{
- int64_t v = (int64_t)N->getSExtValue();
- return isInt<10>(v);
-}]>;
-
-def s8_0ImmPred : PatLeaf<(i32 imm), [{
- int64_t v = (int64_t)N->getSExtValue();
- return isInt<8>(v);
-}]>;
-
-def s8_0Imm64Pred : PatLeaf<(i64 imm), [{
- int64_t v = (int64_t)N->getSExtValue();
- return isInt<8>(v);
-}]>;
-
-def s6_0ImmPred : PatLeaf<(i32 imm), [{
- int64_t v = (int64_t)N->getSExtValue();
- return isInt<6>(v);
-}]>;
-
-def s4_0ImmPred : PatLeaf<(i32 imm), [{
- int64_t v = (int64_t)N->getSExtValue();
- return isInt<4>(v);
-}]>;
-
-def s4_1ImmPred : PatLeaf<(i32 imm), [{
- int64_t v = (int64_t)N->getSExtValue();
- return isShiftedInt<4,1>(v);
-}]>;
-
-def s4_2ImmPred : PatLeaf<(i32 imm), [{
- int64_t v = (int64_t)N->getSExtValue();
- return isShiftedInt<4,2>(v);
-}]>;
-
-def s4_3ImmPred : PatLeaf<(i32 imm), [{
- int64_t v = (int64_t)N->getSExtValue();
- return isShiftedInt<4,3>(v);
-}]>;
-
-def u32_0ImmPred : PatLeaf<(i32 imm), [{
- int64_t v = (int64_t)N->getSExtValue();
- return isUInt<32>(v);
-}]>;
-
-def u16_0ImmPred : PatLeaf<(i32 imm), [{
- int64_t v = (int64_t)N->getSExtValue();
- return isUInt<16>(v);
-}]>;
-
-def u11_3ImmPred : PatLeaf<(i32 imm), [{
- int64_t v = (int64_t)N->getSExtValue();
- return isShiftedUInt<11,3>(v);
-}]>;
-
def u9_0ImmPred : PatLeaf<(i32 imm), [{
int64_t v = (int64_t)N->getSExtValue();
return isUInt<9>(v);
}]>;
-
-def u8_0ImmPred : PatLeaf<(i32 imm), [{
- int64_t v = (int64_t)N->getSExtValue();
- return isUInt<8>(v);
-}]>;
-
-def u6_0ImmPred : PatLeaf<(i32 imm), [{
- int64_t v = (int64_t)N->getSExtValue();
- return isUInt<6>(v);
-}]>;
-
-def u6_1ImmPred : PatLeaf<(i32 imm), [{
- int64_t v = (int64_t)N->getSExtValue();
- return isShiftedUInt<6,1>(v);
-}]>;
-
-def u6_2ImmPred : PatLeaf<(i32 imm), [{
- int64_t v = (int64_t)N->getSExtValue();
- return isShiftedUInt<6,2>(v);
-}]>;
-
-def u5_0ImmPred : PatLeaf<(i32 imm), [{
- int64_t v = (int64_t)N->getSExtValue();
- return isUInt<5>(v);
-}]>;
-
-def u4_0ImmPred : PatLeaf<(i32 imm), [{
- int64_t v = (int64_t)N->getSExtValue();
- return isUInt<4>(v);
-}]>;
-
-def u3_0ImmPred : PatLeaf<(i32 imm), [{
- int64_t v = (int64_t)N->getSExtValue();
- return isUInt<3>(v);
-}]>;
-
-def u2_0ImmPred : PatLeaf<(i32 imm), [{
- int64_t v = (int64_t)N->getSExtValue();
- return isUInt<2>(v);
-}]>;
-
-// Extendable immediate operands.
-def f32ExtOperand : AsmOperandClass { let Name = "f32Ext"; }
-def s16_0ExtOperand : AsmOperandClass { let Name = "s16_0Ext"; }
-def s12_0ExtOperand : AsmOperandClass { let Name = "s12_0Ext"; }
-def s10_0ExtOperand : AsmOperandClass { let Name = "s10_0Ext"; }
-def s9_0ExtOperand : AsmOperandClass { let Name = "s9_0Ext"; }
-def s8_0ExtOperand : AsmOperandClass { let Name = "s8_0Ext"; }
-def s7_0ExtOperand : AsmOperandClass { let Name = "s7_0Ext"; }
-def s6_0ExtOperand : AsmOperandClass { let Name = "s6_0Ext"; }
-def s11_0ExtOperand : AsmOperandClass { let Name = "s11_0Ext"; }
-def s11_1ExtOperand : AsmOperandClass { let Name = "s11_1Ext"; }
-def s11_2ExtOperand : AsmOperandClass { let Name = "s11_2Ext"; }
-def s11_3ExtOperand : AsmOperandClass { let Name = "s11_3Ext"; }
-def u6_0ExtOperand : AsmOperandClass { let Name = "u6_0Ext"; }
-def u7_0ExtOperand : AsmOperandClass { let Name = "u7_0Ext"; }
-def u8_0ExtOperand : AsmOperandClass { let Name = "u8_0Ext"; }
-def u9_0ExtOperand : AsmOperandClass { let Name = "u9_0Ext"; }
-def u10_0ExtOperand : AsmOperandClass { let Name = "u10_0Ext"; }
-def u6_1ExtOperand : AsmOperandClass { let Name = "u6_1Ext"; }
-def u6_2ExtOperand : AsmOperandClass { let Name = "u6_2Ext"; }
-def u6_3ExtOperand : AsmOperandClass { let Name = "u6_3Ext"; }
-def u32_0MustExtOperand : AsmOperandClass { let Name = "u32_0MustExt"; }
-
-
-
-let OperandType = "OPERAND_IMMEDIATE", PrintMethod = "printExtOperand",
- DecoderMethod = "unsignedImmDecoder" in {
- def f32Ext : Operand<f32> { let ParserMatchClass = f32ExtOperand; }
- def s16_0Ext : Operand<i32> { let ParserMatchClass = s16_0ExtOperand;
- let DecoderMethod = "s16_0ImmDecoder"; }
- def s12_0Ext : Operand<i32> { let ParserMatchClass = s12_0ExtOperand;
- let DecoderMethod = "s12_0ImmDecoder"; }
- def s11_0Ext : Operand<i32> { let ParserMatchClass = s11_0ExtOperand;
- let DecoderMethod = "s11_0ImmDecoder"; }
- def s11_1Ext : Operand<i32> { let ParserMatchClass = s11_1ExtOperand;
- let DecoderMethod = "s11_1ImmDecoder"; }
- def s11_2Ext : Operand<i32> { let ParserMatchClass = s11_2ExtOperand;
- let DecoderMethod = "s11_2ImmDecoder"; }
- def s11_3Ext : Operand<i32> { let ParserMatchClass = s11_3ExtOperand;
- let DecoderMethod = "s11_3ImmDecoder"; }
- def s10_0Ext : Operand<i32> { let ParserMatchClass = s10_0ExtOperand;
- let DecoderMethod = "s10_0ImmDecoder"; }
- def s9_0Ext : Operand<i32> { let ParserMatchClass = s9_0ExtOperand;
- let DecoderMethod = "s9_0ImmDecoder"; }
- def s8_0Ext : Operand<i32> { let ParserMatchClass = s8_0ExtOperand;
- let DecoderMethod = "s8_0ImmDecoder"; }
- def s7_0Ext : Operand<i32> { let ParserMatchClass = s7_0ExtOperand; }
- def s6_0Ext : Operand<i32> { let ParserMatchClass = s6_0ExtOperand;
- let DecoderMethod = "s6_0ImmDecoder"; }
- def u7_0Ext : Operand<i32> { let ParserMatchClass = u7_0ExtOperand; }
- def u8_0Ext : Operand<i32> { let ParserMatchClass = u8_0ExtOperand; }
- def u9_0Ext : Operand<i32> { let ParserMatchClass = u9_0ExtOperand; }
- def u10_0Ext : Operand<i32> { let ParserMatchClass = u10_0ExtOperand; }
- def u6_0Ext : Operand<i32> { let ParserMatchClass = u6_0ExtOperand; }
- def u6_1Ext : Operand<i32> { let ParserMatchClass = u6_1ExtOperand; }
- def u6_2Ext : Operand<i32> { let ParserMatchClass = u6_2ExtOperand; }
- def u6_3Ext : Operand<i32> { let ParserMatchClass = u6_3ExtOperand; }
- def u32_0MustExt : Operand<i32> { let ParserMatchClass = u32_0MustExtOperand; }
-}
-
+def u64_0ImmOperand : AsmOperandClass { let Name = "u64_0Imm";
+                                        let RenderMethod = "addImmOperands"; }
+def u64_0Imm : Operand<i64> { let ParserMatchClass = u64_0ImmOperand; }
+def n1ConstOperand : AsmOperandClass { let Name = "n1Const"; }
+def n1Const : Operand<i32> { let ParserMatchClass = n1ConstOperand; }
// This complex pattern exists only to create a machine instruction operand
// of type "frame index". There doesn't seem to be a way to do that directly
@@ -305,28 +40,6 @@ def AddrFI : ComplexPattern<i32, 1, "SelectAddrFI", [frameindex], []>;
def AddrGA : ComplexPattern<i32, 1, "SelectAddrGA", [], []>;
def AddrGP : ComplexPattern<i32, 1, "SelectAddrGP", [], []>;
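// An illustrative use (a sketch, not part of this patch): a load from a
// frame slot can be matched through AddrFI, e.g.
//   def: Pat<(i32 (load AddrFI:$fi)), (L2_loadri_io AddrFI:$fi, 0)>;
// where SelectAddrFI supplies the frame-index machine operand.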
-// Address operands.
-
-let PrintMethod = "printGlobalOperand" in {
- def globaladdress : Operand<i32>;
- def globaladdressExt : Operand<i32>;
-}
-
-let PrintMethod = "printJumpTable" in
-def jumptablebase : Operand<i32>;
-
-def brtarget : Operand<OtherVT> {
- let DecoderMethod = "brtargetDecoder";
- let PrintMethod = "printBrtarget";
-}
-def brtargetExt : Operand<OtherVT> {
- let DecoderMethod = "brtargetDecoder";
- let PrintMethod = "printBrtarget";
-}
-def calltarget : Operand<i32> {
- let DecoderMethod = "brtargetDecoder";
- let PrintMethod = "printBrtarget";
-}
def bblabel : Operand<i32>;
def bbl : SDNode<"ISD::BasicBlock", SDTPtrLeaf, [], "BasicBlockSDNode">;
diff --git a/lib/Target/Hexagon/HexagonOptAddrMode.cpp b/lib/Target/Hexagon/HexagonOptAddrMode.cpp
index 89db46799cb3..b243de317dc5 100644
--- a/lib/Target/Hexagon/HexagonOptAddrMode.cpp
+++ b/lib/Target/Hexagon/HexagonOptAddrMode.cpp
@@ -208,7 +208,16 @@ bool HexagonOptAddrMode::allValidCandidates(NodeAddr<StmtNode *> SA,
NodeAddr<UseNode *> UN = *I;
RegisterRef UR = UN.Addr->getRegRef(*DFG);
NodeSet Visited, Defs;
- const auto &ReachingDefs = LV->getAllReachingDefsRec(UR, UN, Visited, Defs);
+ const auto &P = LV->getAllReachingDefsRec(UR, UN, Visited, Defs);
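+ // The result is now a pair: the reaching-def set plus a flag that is false
+ // when the recursive walk gave up early (e.g. a complexity limit was hit).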
+ if (!P.second) {
+ DEBUG({
+ dbgs() << "*** Unable to collect all reaching defs for use ***\n"
+ << PrintNode<UseNode*>(UN, *DFG) << '\n'
+ << "The program's complexity may exceed the limits.\n";
+ });
+ return false;
+ }
+ const auto &ReachingDefs = P.first;
if (ReachingDefs.size() > 1) {
DEBUG({
dbgs() << "*** Multiple Reaching Defs found!!! ***\n";
@@ -230,7 +239,7 @@ void HexagonOptAddrMode::getAllRealUses(NodeAddr<StmtNode *> SA,
for (NodeAddr<DefNode *> DA : SA.Addr->members_if(DFG->IsDef, *DFG)) {
DEBUG(dbgs() << "\t\t[DefNode]: " << Print<NodeAddr<DefNode *>>(DA, *DFG)
<< "\n");
- RegisterRef DR = DFG->normalizeRef(DA.Addr->getRegRef(*DFG));
+ RegisterRef DR = DFG->getPRI().normalize(DA.Addr->getRegRef(*DFG));
auto UseSet = LV->getAllReachedUses(DR, DA);
@@ -250,7 +259,7 @@ void HexagonOptAddrMode::getAllRealUses(NodeAddr<StmtNode *> SA,
<< Print<Liveness::RefMap>(phiUse, *DFG) << "\n");
if (!phiUse.empty()) {
for (auto I : phiUse) {
- if (DR.Reg != I.first)
+ if (!DFG->getPRI().alias(RegisterRef(I.first), DR))
continue;
auto phiUseSet = I.second;
for (auto phiUI : phiUseSet) {
@@ -333,17 +342,17 @@ bool HexagonOptAddrMode::changeLoad(MachineInstr *OldMI, MachineOperand ImmOp,
short NewOpCode = HII->getBaseWithLongOffset(*OldMI);
assert(NewOpCode >= 0 && "Invalid New opcode\n");
MIB = BuildMI(*BB, InsertPt, OldMI->getDebugLoc(), HII->get(NewOpCode));
- MIB.addOperand(OldMI->getOperand(0));
- MIB.addOperand(OldMI->getOperand(2));
- MIB.addOperand(OldMI->getOperand(3));
- MIB.addOperand(ImmOp);
+ MIB.add(OldMI->getOperand(0));
+ MIB.add(OldMI->getOperand(2));
+ MIB.add(OldMI->getOperand(3));
+ MIB.add(ImmOp);
OpStart = 4;
Changed = true;
} else if (HII->getAddrMode(*OldMI) == HexagonII::BaseImmOffset) {
short NewOpCode = HII->getAbsoluteForm(*OldMI);
assert(NewOpCode >= 0 && "Invalid New opcode\n");
MIB = BuildMI(*BB, InsertPt, OldMI->getDebugLoc(), HII->get(NewOpCode))
- .addOperand(OldMI->getOperand(0));
+ .add(OldMI->getOperand(0));
const GlobalValue *GV = ImmOp.getGlobal();
int64_t Offset = ImmOp.getOffset() + OldMI->getOperand(2).getImm();
@@ -359,9 +368,9 @@ bool HexagonOptAddrMode::changeLoad(MachineInstr *OldMI, MachineOperand ImmOp,
short NewOpCode = HII->xformRegToImmOffset(*OldMI);
assert(NewOpCode >= 0 && "Invalid New opcode\n");
MIB = BuildMI(*BB, InsertPt, OldMI->getDebugLoc(), HII->get(NewOpCode));
- MIB.addOperand(OldMI->getOperand(0));
- MIB.addOperand(OldMI->getOperand(1));
- MIB.addOperand(ImmOp);
+ MIB.add(OldMI->getOperand(0));
+ MIB.add(OldMI->getOperand(1));
+ MIB.add(ImmOp);
OpStart = 4;
Changed = true;
DEBUG(dbgs() << "[Changing]: " << *OldMI << "\n");
@@ -370,7 +379,7 @@ bool HexagonOptAddrMode::changeLoad(MachineInstr *OldMI, MachineOperand ImmOp,
if (Changed)
for (unsigned i = OpStart; i < OpEnd; ++i)
- MIB.addOperand(OldMI->getOperand(i));
+ MIB.add(OldMI->getOperand(i));
return Changed;
}
@@ -390,10 +399,10 @@ bool HexagonOptAddrMode::changeStore(MachineInstr *OldMI, MachineOperand ImmOp,
short NewOpCode = HII->getBaseWithLongOffset(*OldMI);
assert(NewOpCode >= 0 && "Invalid New opcode\n");
MIB = BuildMI(*BB, InsertPt, OldMI->getDebugLoc(), HII->get(NewOpCode));
- MIB.addOperand(OldMI->getOperand(1));
- MIB.addOperand(OldMI->getOperand(2));
- MIB.addOperand(ImmOp);
- MIB.addOperand(OldMI->getOperand(3));
+ MIB.add(OldMI->getOperand(1));
+ MIB.add(OldMI->getOperand(2));
+ MIB.add(ImmOp);
+ MIB.add(OldMI->getOperand(3));
OpStart = 4;
} else if (HII->getAddrMode(*OldMI) == HexagonII::BaseImmOffset) {
short NewOpCode = HII->getAbsoluteForm(*OldMI);
@@ -402,7 +411,7 @@ bool HexagonOptAddrMode::changeStore(MachineInstr *OldMI, MachineOperand ImmOp,
const GlobalValue *GV = ImmOp.getGlobal();
int64_t Offset = ImmOp.getOffset() + OldMI->getOperand(1).getImm();
MIB.addGlobalAddress(GV, Offset, ImmOp.getTargetFlags());
- MIB.addOperand(OldMI->getOperand(2));
+ MIB.add(OldMI->getOperand(2));
OpStart = 3;
}
Changed = true;
@@ -412,9 +421,9 @@ bool HexagonOptAddrMode::changeStore(MachineInstr *OldMI, MachineOperand ImmOp,
short NewOpCode = HII->xformRegToImmOffset(*OldMI);
assert(NewOpCode >= 0 && "Invalid New opcode\n");
MIB = BuildMI(*BB, InsertPt, OldMI->getDebugLoc(), HII->get(NewOpCode));
- MIB.addOperand(OldMI->getOperand(0));
- MIB.addOperand(ImmOp);
- MIB.addOperand(OldMI->getOperand(1));
+ MIB.add(OldMI->getOperand(0));
+ MIB.add(ImmOp);
+ MIB.add(OldMI->getOperand(1));
OpStart = 2;
Changed = true;
DEBUG(dbgs() << "[Changing]: " << *OldMI << "\n");
@@ -422,7 +431,7 @@ bool HexagonOptAddrMode::changeStore(MachineInstr *OldMI, MachineOperand ImmOp,
}
if (Changed)
for (unsigned i = OpStart; i < OpEnd; ++i)
- MIB.addOperand(OldMI->getOperand(i));
+ MIB.add(OldMI->getOperand(i));
return Changed;
}
@@ -473,26 +482,26 @@ bool HexagonOptAddrMode::changeAddAsl(NodeAddr<UseNode *> AddAslUN,
BuildMI(*BB, InsertPt, UseMI->getDebugLoc(), HII->get(NewOpCode));
// change mem(Rs + # ) -> mem(Rt << # + ##)
if (UseMID.mayLoad()) {
- MIB.addOperand(UseMI->getOperand(0));
- MIB.addOperand(AddAslMI->getOperand(2));
- MIB.addOperand(AddAslMI->getOperand(3));
+ MIB.add(UseMI->getOperand(0));
+ MIB.add(AddAslMI->getOperand(2));
+ MIB.add(AddAslMI->getOperand(3));
const GlobalValue *GV = ImmOp.getGlobal();
MIB.addGlobalAddress(GV, UseMI->getOperand(2).getImm(),
ImmOp.getTargetFlags());
OpStart = 3;
} else if (UseMID.mayStore()) {
- MIB.addOperand(AddAslMI->getOperand(2));
- MIB.addOperand(AddAslMI->getOperand(3));
+ MIB.add(AddAslMI->getOperand(2));
+ MIB.add(AddAslMI->getOperand(3));
const GlobalValue *GV = ImmOp.getGlobal();
MIB.addGlobalAddress(GV, UseMI->getOperand(1).getImm(),
ImmOp.getTargetFlags());
- MIB.addOperand(UseMI->getOperand(2));
+ MIB.add(UseMI->getOperand(2));
OpStart = 3;
} else
llvm_unreachable("Unhandled instruction");
for (unsigned i = OpStart; i < OpEnd; ++i)
- MIB.addOperand(UseMI->getOperand(i));
+ MIB.add(UseMI->getOperand(i));
Deleted.insert(UseMI);
}
@@ -617,7 +626,7 @@ bool HexagonOptAddrMode::constructDefMap(MachineBasicBlock *B) {
for (NodeAddr<InstrNode *> IA : BA.Addr->members(*DFG)) {
updateMap(IA);
- DFG->pushDefs(IA, DefM);
+ DFG->pushAllDefs(IA, DefM);
}
MachineDomTreeNode *N = MDT->getNode(B);
@@ -629,6 +638,9 @@ bool HexagonOptAddrMode::constructDefMap(MachineBasicBlock *B) {
}
bool HexagonOptAddrMode::runOnMachineFunction(MachineFunction &MF) {
+ if (skipFunction(*MF.getFunction()))
+ return false;
+
bool Changed = false;
auto &HST = MF.getSubtarget<HexagonSubtarget>();
auto &MRI = MF.getRegInfo();
diff --git a/lib/Target/Hexagon/HexagonPatterns.td b/lib/Target/Hexagon/HexagonPatterns.td
index ad81287007e6..b8c3bf0745ce 100644
--- a/lib/Target/Hexagon/HexagonPatterns.td
+++ b/lib/Target/Hexagon/HexagonPatterns.td
@@ -17,6 +17,16 @@ def HiReg: OutPatFrag<(ops node:$Rs), (EXTRACT_SUBREG (i64 $Rs), isub_hi)>;
def IsOrAdd: PatFrag<(ops node:$Addr, node:$off),
(or node:$Addr, node:$off), [{ return isOrEquivalentToAdd(N); }]>;
+def Iss4_6 : PatLeaf<(i32 imm), [{
+ int32_t V = N->getSExtValue();
+ return isShiftedInt<4,6>(V);
+}]>;
+
+def Iss4_7 : PatLeaf<(i32 imm), [{
+ int32_t V = N->getSExtValue();
+ return isShiftedInt<4,7>(V);
+}]>;
+
def IsPow2_32 : PatLeaf<(i32 imm), [{
uint32_t V = N->getZExtValue();
return isPowerOf2_32(V);
@@ -89,6 +99,11 @@ def LogN2_64 : SDNodeXForm<imm, [{
return CurDAG->getTargetConstant(Log2_64(NV), SDLoc(N), MVT::i32);
}]>;
+def ToZext64: OutPatFrag<(ops node:$Rs),
+ (i64 (A4_combineir 0, (i32 $Rs)))>;
+def ToSext64: OutPatFrag<(ops node:$Rs),
+ (i64 (A2_sxtw (i32 $Rs)))>;
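+// ToZext64 widens a 32-bit value by combining it with a zero high word;
+// ToSext64 sign-extends it with A2_sxtw.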
+
class T_CMP_pat <InstHexagon MI, PatFrag OpNode, PatLeaf ImmPred>
: Pat<(i1 (OpNode I32:$src1, ImmPred:$src2)),
@@ -153,8 +168,12 @@ def: Pat<(sub s32_0ImmPred:$s10, IntRegs:$Rs),
def: Pat<(not I32:$src1),
(A2_subri -1, IntRegs:$src1)>;
+def TruncI64ToI32: SDNodeXForm<imm, [{
+ return CurDAG->getTargetConstant(N->getSExtValue(), SDLoc(N), MVT::i32);
+}]>;
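+// TruncI64ToI32 reinterprets an i64 immediate as a 32-bit target constant,
+// so A2_tfrpi, whose immediate operand is 32-bit, can materialize it in the
+// pattern below.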
+
def: Pat<(s32_0ImmPred:$s16), (A2_tfrsi imm:$s16)>;
-def: Pat<(s8_0Imm64Pred:$s8), (A2_tfrpi imm:$s8)>;
+def: Pat<(s8_0Imm64Pred:$s8), (A2_tfrpi (TruncI64ToI32 $s8))>;
def : Pat<(select I1:$Pu, s32_0ImmPred:$s8, I32:$Rs),
(C2_muxri I1:$Pu, imm:$s8, I32:$Rs)>;
@@ -274,7 +293,7 @@ def retflag : SDNode<"HexagonISD::RET_FLAG", SDTNone,
[SDNPHasChain, SDNPOptInGlue, SDNPVariadic]>;
def eh_return: SDNode<"HexagonISD::EH_RETURN", SDTNone, [SDNPHasChain]>;
-def: Pat<(br bb:$dst), (J2_jump brtarget:$dst)>;
+def: Pat<(br bb:$dst), (J2_jump b30_2Imm:$dst)>;
def: Pat<(brcond I1:$src1, bb:$block), (J2_jumpt PredRegs:$src1, bb:$block)>;
def: Pat<(brind I32:$dst), (J2_jumpr IntRegs:$dst)>;
@@ -695,8 +714,8 @@ def HexagonCONST32 : SDNode<"HexagonISD::CONST32", SDTHexagonCONST32>;
def HexagonCONST32_GP : SDNode<"HexagonISD::CONST32_GP", SDTHexagonCONST32>;
// Map TLS addresses to A2_tfrsi.
-def: Pat<(HexagonCONST32 tglobaltlsaddr:$addr), (A2_tfrsi s16_0Ext:$addr)>;
-def: Pat<(HexagonCONST32 bbl:$label), (A2_tfrsi s16_0Ext:$label)>;
+def: Pat<(HexagonCONST32 tglobaltlsaddr:$addr), (A2_tfrsi s32_0Imm:$addr)>;
+def: Pat<(HexagonCONST32 bbl:$label), (A2_tfrsi s32_0Imm:$label)>;
def: Pat<(i64 imm:$v), (CONST64 imm:$v)>;
def: Pat<(i1 0), (PS_false)>;
@@ -898,26 +917,35 @@ def: Pat<(i1 (setule I64:$src1, I64:$src2)),
(C2_not (C2_cmpgtup DoubleRegs:$src1, DoubleRegs:$src2))>;
// Sign extends.
-// i1 -> i32
-def: Pat<(i32 (sext I1:$src1)),
- (C2_muxii PredRegs:$src1, -1, 0)>;
+// sext i1->i32
+def: Pat<(i32 (sext I1:$Pu)),
+ (C2_muxii I1:$Pu, -1, 0)>;
-// i1 -> i64
-def: Pat<(i64 (sext I1:$src1)),
- (A2_combinew (A2_tfrsi -1), (C2_muxii PredRegs:$src1, -1, 0))>;
+// sext i1->i64
+def: Pat<(i64 (sext I1:$Pu)),
+ (A2_combinew (C2_muxii PredRegs:$Pu, -1, 0),
+ (C2_muxii PredRegs:$Pu, -1, 0))>;
// Zero extends.
-// i1 -> i32
-def: Pat<(i32 (zext I1:$src1)),
- (C2_muxii PredRegs:$src1, 1, 0)>;
+// zext i1->i32
+def: Pat<(i32 (zext I1:$Pu)),
+ (C2_muxii PredRegs:$Pu, 1, 0)>;
+
+// zext i1->i64
+def: Pat<(i64 (zext I1:$Pu)),
+ (ToZext64 (C2_muxii PredRegs:$Pu, 1, 0))>;
+
+// zext i32->i64
+def: Pat<(Zext64 I32:$Rs),
+ (ToZext64 IntRegs:$Rs)>;
// Map from Rs = Pd to Pd = mux(Pd, #1, #0)
-def: Pat<(i32 (anyext I1:$src1)),
- (C2_muxii PredRegs:$src1, 1, 0)>;
+def: Pat<(i32 (anyext I1:$Pu)),
+ (C2_muxii PredRegs:$Pu, 1, 0)>;
-// Map from Rss = Pd to Rdd = sxtw (mux(Pd, #1, #0))
-def: Pat<(i64 (anyext I1:$src1)),
- (A2_sxtw (C2_muxii PredRegs:$src1, 1, 0))>;
+// Map from Rss = Pd to Rdd = combine(#0, (mux(Pd, #1, #0)))
+def: Pat<(i64 (anyext I1:$Pu)),
+ (ToZext64 (C2_muxii PredRegs:$Pu, 1, 0))>;
// Clear the sign bit in a 64-bit register.
def ClearSign : OutPatFrag<(ops node:$Rss),
@@ -1244,11 +1272,6 @@ def: Pat<(HexagonCOMBINE s32_0ImmPred:$s8, s8_0ImmPred:$S8),
}
-def ToZext64: OutPatFrag<(ops node:$Rs),
- (i64 (A4_combineir 0, (i32 $Rs)))>;
-def ToSext64: OutPatFrag<(ops node:$Rs),
- (i64 (A2_sxtw (i32 $Rs)))>;
-
// Patterns to generate indexed loads with different forms of the address:
// - frameindex,
// - base + offset,
@@ -1349,14 +1372,6 @@ let AddedComplexity = 20 in {
def: Loadxs_simple_pat<load, i64, L4_loadrd_rr>;
}
-// zext i1->i64
-def: Pat<(i64 (zext I1:$src1)),
- (ToZext64 (C2_muxii PredRegs:$src1, 1, 0))>;
-
-// zext i32->i64
-def: Pat<(Zext64 I32:$src1),
- (ToZext64 IntRegs:$src1)>;
-
let AddedComplexity = 40 in
multiclass T_StoreAbsReg_Pats <InstHexagon MI, RegisterClass RC, ValueType VT,
PatFrag stOp> {
@@ -1587,6 +1602,15 @@ def: Pat<(i64 (cttz I64:$Rss)), (ToZext64 (S2_ct0p I64:$Rss))>;
def: Pat<(i64 (ctlz (not I64:$Rss))), (ToZext64 (S2_cl1p I64:$Rss))>;
def: Pat<(i64 (cttz (not I64:$Rss))), (ToZext64 (S2_ct1p I64:$Rss))>;
+def: Pat<(i64 (ctpop I64:$Rss)), (ToZext64 (S5_popcountp I64:$Rss))>;
+def: Pat<(i32 (ctpop I32:$Rs)), (S5_popcountp (A4_combineir 0, I32:$Rs))>;
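+// The i32 popcount reuses the 64-bit S5_popcountp by pairing the input with
+// a zero high word, which contributes no extra set bits.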
+
+def: Pat<(bitreverse I32:$Rs), (S2_brev I32:$Rs)>;
+def: Pat<(bitreverse I64:$Rss), (S2_brevp I64:$Rss)>;
+
+def: Pat<(bswap I32:$Rs), (A2_swiz I32:$Rs)>;
+def: Pat<(bswap I64:$Rss), (A2_combinew (A2_swiz (LoReg $Rss)),
+ (A2_swiz (HiReg $Rss)))>;
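+
+// For i64, bswap also swaps the two words: A2_combinew places the swizzled
+// low word into the high half of the result.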
let AddedComplexity = 20 in { // Complexity greater than cmp reg-imm.
def: Pat<(i1 (seteq (and (shl 1, u5_0ImmPred:$u5), I32:$Rs), 0)),
@@ -2235,12 +2259,6 @@ def ftoi : SDNodeXForm<fpimm, [{
def: Pat<(sra (i64 (add (sra I64:$src1, u6_0ImmPred:$src2), 1)), (i32 1)),
(S2_asr_i_p_rnd I64:$src1, imm:$src2)>;
-def SDTHexagonI32I64: SDTypeProfile<1, 1, [SDTCisVT<0, i32>,
- SDTCisVT<1, i64>]>;
-def HexagonPOPCOUNT: SDNode<"HexagonISD::POPCOUNT", SDTHexagonI32I64>;
-
-def: Pat<(HexagonPOPCOUNT I64:$Rss), (S5_popcountp I64:$Rss)>;
-
let AddedComplexity = 20 in {
defm: Loadx_pat<load, f32, s30_2ImmPred, L2_loadri_io>;
defm: Loadx_pat<load, f64, s29_3ImmPred, L2_loadrd_io>;
@@ -2718,17 +2736,6 @@ def unalignedstore : PatFrag<(ops node:$val, node:$addr), (store $val, $addr), [
}]>;
-def s4_6ImmPred: PatLeaf<(i32 imm), [{
- int64_t V = N->getSExtValue();
- return isShiftedInt<4,6>(V);
-}]>;
-
-def s4_7ImmPred: PatLeaf<(i32 imm), [{
- int64_t V = N->getSExtValue();
- return isShiftedInt<4,7>(V);
-}]>;
-
-
multiclass vS32b_ai_pats <ValueType VTSgl, ValueType VTDbl> {
// Aligned stores
def : Pat<(alignedstore (VTSgl VectorRegs:$src1), IntRegs:$addr),
@@ -2749,25 +2756,25 @@ multiclass vS32b_ai_pats <ValueType VTSgl, ValueType VTDbl> {
// Fold Add R+OFF into vector store.
let AddedComplexity = 10 in {
def : Pat<(alignedstore (VTSgl VectorRegs:$src1),
- (add IntRegs:$src2, s4_6ImmPred:$offset)),
- (V6_vS32b_ai IntRegs:$src2, s4_6ImmPred:$offset,
+ (add IntRegs:$src2, Iss4_6:$offset)),
+ (V6_vS32b_ai IntRegs:$src2, Iss4_6:$offset,
(VTSgl VectorRegs:$src1))>,
Requires<[UseHVXSgl]>;
def : Pat<(unalignedstore (VTSgl VectorRegs:$src1),
- (add IntRegs:$src2, s4_6ImmPred:$offset)),
- (V6_vS32Ub_ai IntRegs:$src2, s4_6ImmPred:$offset,
+ (add IntRegs:$src2, Iss4_6:$offset)),
+ (V6_vS32Ub_ai IntRegs:$src2, Iss4_6:$offset,
(VTSgl VectorRegs:$src1))>,
Requires<[UseHVXSgl]>;
// Fold Add R+OFF into vector store 128B.
def : Pat<(alignedstore (VTDbl VectorRegs128B:$src1),
- (add IntRegs:$src2, s4_7ImmPred:$offset)),
- (V6_vS32b_ai_128B IntRegs:$src2, s4_7ImmPred:$offset,
+ (add IntRegs:$src2, Iss4_7:$offset)),
+ (V6_vS32b_ai_128B IntRegs:$src2, Iss4_7:$offset,
(VTDbl VectorRegs128B:$src1))>,
Requires<[UseHVXDbl]>;
def : Pat<(unalignedstore (VTDbl VectorRegs128B:$src1),
- (add IntRegs:$src2, s4_7ImmPred:$offset)),
- (V6_vS32Ub_ai_128B IntRegs:$src2, s4_7ImmPred:$offset,
+ (add IntRegs:$src2, Iss4_7:$offset)),
+ (V6_vS32Ub_ai_128B IntRegs:$src2, Iss4_7:$offset,
(VTDbl VectorRegs128B:$src1))>,
Requires<[UseHVXDbl]>;
}
@@ -2798,18 +2805,18 @@ multiclass vL32b_ai_pats <ValueType VTSgl, ValueType VTDbl> {
// Fold Add R+OFF into vector load.
let AddedComplexity = 10 in {
- def : Pat<(VTDbl (alignedload (add IntRegs:$src2, s4_7ImmPred:$offset))),
- (V6_vL32b_ai_128B IntRegs:$src2, s4_7ImmPred:$offset)>,
+ def : Pat<(VTDbl (alignedload (add IntRegs:$src2, Iss4_7:$offset))),
+ (V6_vL32b_ai_128B IntRegs:$src2, Iss4_7:$offset)>,
Requires<[UseHVXDbl]>;
- def : Pat<(VTDbl (unalignedload (add IntRegs:$src2, s4_7ImmPred:$offset))),
- (V6_vL32Ub_ai_128B IntRegs:$src2, s4_7ImmPred:$offset)>,
+ def : Pat<(VTDbl (unalignedload (add IntRegs:$src2, Iss4_7:$offset))),
+ (V6_vL32Ub_ai_128B IntRegs:$src2, Iss4_7:$offset)>,
Requires<[UseHVXDbl]>;
- def : Pat<(VTSgl (alignedload (add IntRegs:$src2, s4_6ImmPred:$offset))),
- (V6_vL32b_ai IntRegs:$src2, s4_6ImmPred:$offset)>,
+ def : Pat<(VTSgl (alignedload (add IntRegs:$src2, Iss4_6:$offset))),
+ (V6_vL32b_ai IntRegs:$src2, Iss4_6:$offset)>,
Requires<[UseHVXSgl]>;
- def : Pat<(VTSgl (unalignedload (add IntRegs:$src2, s4_6ImmPred:$offset))),
- (V6_vL32Ub_ai IntRegs:$src2, s4_6ImmPred:$offset)>,
+ def : Pat<(VTSgl (unalignedload (add IntRegs:$src2, Iss4_6:$offset))),
+ (V6_vL32Ub_ai IntRegs:$src2, Iss4_6:$offset)>,
Requires<[UseHVXSgl]>;
}
}
@@ -3253,8 +3260,8 @@ def vmpyh: OutPatFrag<(ops node:$Rs, node:$Rt),
(M2_vmpy2s_s0 (i32 $Rs), (i32 $Rt))>;
def: Pat<(v2i16 (mul V2I16:$Rs, V2I16:$Rt)),
- (LoReg (S2_vtrunewh (v2i32 (A2_combineii 0, 0)),
- (v2i32 (vmpyh V2I16:$Rs, V2I16:$Rt))))>;
+ (LoReg (S2_vtrunewh (A2_combineii 0, 0),
+ (vmpyh V2I16:$Rs, V2I16:$Rt)))>;
// Multiplies two v4i16 vectors.
def: Pat<(v4i16 (mul V4I16:$Rs, V4I16:$Rt)),
@@ -3345,3 +3352,11 @@ def: Pat<(v2i32 (zextloadv2i8 I32:$Rs)),
def: Pat<(v2i32 (sextloadv2i8 I32:$Rs)),
(S2_vsxthw (LoReg (v4i16 (S2_vsxtbh (L2_loadrh_io I32:$Rs, 0)))))>;
+
+// Read cycle counter.
+//
+def SDTInt64Leaf: SDTypeProfile<1, 0, [SDTCisVT<0, i64>]>;
+def HexagonREADCYCLE: SDNode<"HexagonISD::READCYCLE", SDTInt64Leaf,
+ [SDNPHasChain]>;
+
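+// UPCYCLE is the c15:14 (upcyclehi:upcyclelo) cycle-counter pair; A4_tfrcpp
+// copies a control-register pair into a general register pair.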
+def: Pat<(HexagonREADCYCLE), (A4_tfrcpp UPCYCLE)>;
diff --git a/lib/Target/Hexagon/HexagonPseudo.td b/lib/Target/Hexagon/HexagonPseudo.td
new file mode 100644
index 000000000000..5a720e794562
--- /dev/null
+++ b/lib/Target/Hexagon/HexagonPseudo.td
@@ -0,0 +1,537 @@
+//===--- HexagonPseudo.td -------------------------------------------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+let PrintMethod = "printGlobalOperand" in {
+ def globaladdress : Operand<i32>;
+ def globaladdressExt : Operand<i32>;
+}
+
+let isPseudo = 1 in {
+let isCodeGenOnly = 0 in
+def A2_iconst : Pseudo<(outs IntRegs:$Rd32), (ins s23_2Imm:$Ii), "${Rd32}=iconst(#${Ii})">;
+def DUPLEX_Pseudo : InstHexagon<(outs), (ins s32_0Imm:$offset), "DUPLEX", [], "", DUPLEX, TypePSEUDO>;
+}
+
+let isExtendable = 1, opExtendable = 1, opExtentBits = 6,
+ isAsmParserOnly = 1 in
+def TFRI64_V2_ext : ALU64_rr<(outs DoubleRegs:$dst),
+ (ins s32_0Imm:$src1, s8_0Imm:$src2),
+ "$dst=combine(#$src1,#$src2)">;
+
+// HI/LO Instructions
+let isReMaterializable = 1, isMoveImm = 1, hasSideEffects = 0,
+ hasNewValue = 1, opNewValue = 0 in
+class REG_IMMED<string RegHalf, bit Rs, bits<3> MajOp, bit MinOp>
+ : InstHexagon<(outs IntRegs:$dst),
+ (ins u16_0Imm:$imm_value),
+ "$dst"#RegHalf#"=#$imm_value", [], "", ALU32_2op_tc_1_SLOT0123, TypeALU32_2op>, OpcodeHexagon {
+ bits<5> dst;
+ bits<32> imm_value;
+
+ let Inst{27} = Rs;
+ let Inst{26-24} = MajOp;
+ let Inst{21} = MinOp;
+ let Inst{20-16} = dst;
+ let Inst{23-22} = imm_value{15-14};
+ let Inst{13-0} = imm_value{13-0};
+}
+
+let isAsmParserOnly = 1 in {
+ def LO : REG_IMMED<".l", 0b0, 0b001, 0b1>;
+ def HI : REG_IMMED<".h", 0b0, 0b010, 0b1>;
+}
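+// For example, "r0.l = #1" writes the low 16 bits of r0 and leaves the high
+// half unchanged; LO and HI above instantiate the ".l" and ".h" forms.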
+
+let isReMaterializable = 1, isMoveImm = 1, isAsmParserOnly = 1 in {
+ def CONST32 : CONSTLDInst<(outs IntRegs:$Rd), (ins i32imm:$v),
+ "$Rd = CONST32(#$v)", []>;
+ def CONST64 : CONSTLDInst<(outs DoubleRegs:$Rd), (ins i64imm:$v),
+ "$Rd = CONST64(#$v)", []>;
+}
+
+let hasSideEffects = 0, isReMaterializable = 1, isPseudo = 1,
+ isCodeGenOnly = 1 in
+def PS_true : SInst<(outs PredRegs:$dst), (ins), "", []>;
+
+let hasSideEffects = 0, isReMaterializable = 1, isPseudo = 1,
+ isCodeGenOnly = 1 in
+def PS_false : SInst<(outs PredRegs:$dst), (ins), "", []>;
+
+let Defs = [R29, R30], Uses = [R31, R30, R29], isPseudo = 1 in
+def ADJCALLSTACKDOWN : Pseudo<(outs), (ins i32imm:$amt),
+ ".error \"should not emit\" ", []>;
+
+let Defs = [R29, R30, R31], Uses = [R29], isPseudo = 1 in
+def ADJCALLSTACKUP : Pseudo<(outs), (ins i32imm:$amt1, i32imm:$amt2),
+ ".error \"should not emit\" ", []>;
+
+
+let isBranch = 1, isTerminator = 1, hasSideEffects = 0,
+ Defs = [PC, LC0], Uses = [SA0, LC0] in {
+def ENDLOOP0 : Endloop<(outs), (ins b30_2Imm:$offset),
+ ":endloop0",
+ []>;
+}
+
+let isBranch = 1, isTerminator = 1, hasSideEffects = 0,
+ Defs = [PC, LC1], Uses = [SA1, LC1] in {
+def ENDLOOP1 : Endloop<(outs), (ins b30_2Imm:$offset),
+ ":endloop1",
+ []>;
+}
+
+let isExtendable = 1, isExtentSigned = 1, opExtentBits = 9, opExtentAlign = 2,
+ opExtendable = 0, hasSideEffects = 0 in
+class LOOP_iBase<string mnemonic, Operand brOp, bit mustExtend = 0>
+ : CRInst<(outs), (ins brOp:$offset, u10_0Imm:$src2),
+ #mnemonic#"($offset,#$src2)",
+ [], "" , CR_tc_3x_SLOT3> {
+ bits<9> offset;
+ bits<10> src2;
+
+ let IClass = 0b0110;
+
+ let Inst{27-22} = 0b100100;
+ let Inst{21} = !if (!eq(mnemonic, "loop0"), 0b0, 0b1);
+ let Inst{20-16} = src2{9-5};
+ let Inst{12-8} = offset{8-4};
+ let Inst{7-5} = src2{4-2};
+ let Inst{4-3} = offset{3-2};
+ let Inst{1-0} = src2{1-0};
+}
+
+let isExtendable = 1, isExtentSigned = 1, opExtentBits = 9, opExtentAlign = 2,
+ opExtendable = 0, hasSideEffects = 0 in
+class LOOP_rBase<string mnemonic, Operand brOp, bit mustExtend = 0>
+ : CRInst<(outs), (ins brOp:$offset, IntRegs:$src2),
+ #mnemonic#"($offset,$src2)",
+ [], "" ,CR_tc_3x_SLOT3> {
+ bits<9> offset;
+ bits<5> src2;
+
+ let IClass = 0b0110;
+
+ let Inst{27-22} = 0b000000;
+ let Inst{21} = !if (!eq(mnemonic, "loop0"), 0b0, 0b1);
+ let Inst{20-16} = src2;
+ let Inst{12-8} = offset{8-4};
+ let Inst{4-3} = offset{3-2};
+ }
+
+multiclass LOOP_ri<string mnemonic> {
+ let isCodeGenOnly = 1, isExtended = 1, opExtendable = 0 in {
+ def iext: LOOP_iBase<mnemonic, b30_2Imm, 1>;
+ def rext: LOOP_rBase<mnemonic, b30_2Imm, 1>;
+ }
+}
+
+
+let Defs = [SA0, LC0, USR] in
+defm J2_loop0 : LOOP_ri<"loop0">;
+
+// Interestingly, only loop0 instructions appear to set usr.lpcfg.
+let Defs = [SA1, LC1] in
+defm J2_loop1 : LOOP_ri<"loop1">;
+
+let isCall = 1, hasSideEffects = 1, isPredicable = 0,
+ isExtended = 0, isExtendable = 1, opExtendable = 0,
+ isExtentSigned = 1, opExtentBits = 24, opExtentAlign = 2 in
+class T_Call<string ExtStr>
+ : JInst<(outs), (ins a30_2Imm:$dst),
+ "call " # ExtStr # "$dst", [], "", J_tc_2early_SLOT23> {
+ let BaseOpcode = "call";
+ bits<24> dst;
+
+ let IClass = 0b0101;
+ let Inst{27-25} = 0b101;
+ let Inst{24-16,13-1} = dst{23-2};
+ let Inst{0} = 0b0;
+}
+
+let isCodeGenOnly = 1, isCall = 1, hasSideEffects = 1, Defs = [R16],
+ isPredicable = 0 in
+def CALLProfile : T_Call<"">;
+
+let isCodeGenOnly = 1, isCall = 1, hasSideEffects = 1,
+ Defs = [PC, R31, R6, R7, P0] in
+def PS_call_stk : T_Call<"">;
+
+let isCall = 1, hasSideEffects = 1, cofMax1 = 1 in
+class JUMPR_MISC_CALLR<bit isPred, bit isPredNot,
+ dag InputDag = (ins IntRegs:$Rs)>
+ : JInst<(outs), InputDag,
+ !if(isPred, !if(isPredNot, "if (!$Pu) callr $Rs",
+ "if ($Pu) callr $Rs"),
+ "callr $Rs"),
+ [], "", J_tc_2early_SLOT2> {
+ bits<5> Rs;
+ bits<2> Pu;
+ let isPredicated = isPred;
+ let isPredicatedFalse = isPredNot;
+
+ let IClass = 0b0101;
+ let Inst{27-25} = 0b000;
+ let Inst{24-23} = !if (isPred, 0b10, 0b01);
+ let Inst{22} = 0;
+ let Inst{21} = isPredNot;
+ let Inst{9-8} = !if (isPred, Pu, 0b00);
+ let Inst{20-16} = Rs;
+
+ }
+
+let isCodeGenOnly = 1 in {
+ def PS_callr_nr : JUMPR_MISC_CALLR<0, 1>; // Call, no return.
+}
+
+let isCall = 1, hasSideEffects = 1,
+ isExtended = 0, isExtendable = 1, opExtendable = 0, isCodeGenOnly = 1,
+ BaseOpcode = "PS_call_nr", isExtentSigned = 1, opExtentAlign = 2,
+ Itinerary = J_tc_2early_SLOT23 in
+class Call_nr<bits<5> nbits, bit isPred, bit isFalse, dag iops>
+ : Pseudo<(outs), iops, "">, PredRel {
+ bits<2> Pu;
+ bits<17> dst;
+ let opExtentBits = nbits;
+ let isPredicable = 0; // !if(isPred, 0, 1);
+ let isPredicated = 0; // isPred;
+ let isPredicatedFalse = isFalse;
+}
+
+def PS_call_nr : Call_nr<24, 0, 0, (ins s32_0Imm:$Ii)>;
+//def PS_call_nrt: Call_nr<17, 1, 0, (ins PredRegs:$Pu, s32_0Imm:$dst)>;
+//def PS_call_nrf: Call_nr<17, 1, 1, (ins PredRegs:$Pu, s32_0Imm:$dst)>;
+
+let isBranch = 1, isIndirectBranch = 1, isBarrier = 1, Defs = [PC],
+ isPredicable = 1, hasSideEffects = 0, InputType = "reg",
+ cofMax1 = 1 in
+class T_JMPr
+ : InstHexagon<(outs), (ins IntRegs:$dst), "jumpr $dst", [],
+ "", J_tc_2early_SLOT2, TypeJ>, OpcodeHexagon {
+ bits<5> dst;
+
+ let IClass = 0b0101;
+ let Inst{27-21} = 0b0010100;
+ let Inst{20-16} = dst;
+}
+
+// A return through builtin_eh_return.
+let isReturn = 1, isTerminator = 1, isBarrier = 1, hasSideEffects = 0,
+ isCodeGenOnly = 1, Defs = [PC], Uses = [R28], isPredicable = 0 in
+def EH_RETURN_JMPR : T_JMPr;
+
+// Indirect tail-call.
+let isPseudo = 1, isCall = 1, isReturn = 1, isBarrier = 1, isPredicable = 0,
+ isTerminator = 1, isCodeGenOnly = 1 in
+def PS_tailcall_r : T_JMPr;
+
+//
+// Direct tail-calls.
+let isPseudo = 1, isCall = 1, isReturn = 1, isBarrier = 1, isPredicable = 0,
+ isTerminator = 1, isCodeGenOnly = 1 in
+def PS_tailcall_i : Pseudo<(outs), (ins a30_2Imm:$dst), "", []>;
+
+let isCodeGenOnly = 1, isPseudo = 1, Uses = [R30], hasSideEffects = 0 in
+def PS_aligna : Pseudo<(outs IntRegs:$Rd), (ins u32_0Imm:$A), "", []>;
+
+// Generate frameindex addresses. The main reason for the offset operand is
+// that every instruction that is allowed to have a frame index as an operand
+// will then have that operand followed by an immediate operand (the offset).
+// This simplifies the frame-index elimination code.
+//
+let isMoveImm = 1, isAsCheapAsAMove = 1, isReMaterializable = 1,
+ isPseudo = 1, isCodeGenOnly = 1, hasSideEffects = 0 in {
+ def PS_fi : Pseudo<(outs IntRegs:$Rd),
+ (ins IntRegs:$fi, s32_0Imm:$off), "">;
+ def PS_fia : Pseudo<(outs IntRegs:$Rd),
+ (ins IntRegs:$Rs, IntRegs:$fi, s32_0Imm:$off), "">;
+}
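+
+// Sketch of the assumed expansion: after frame-index elimination,
+// "$Rd = PS_fi $fi, #off" becomes an A2_addi of the resolved base register
+// and the adjusted offset.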
+
+class CondStr<string CReg, bit True, bit New> {
+ string S = "if (" # !if(True,"","!") # CReg # !if(New,".new","") # ") ";
+}
+class JumpOpcStr<string Mnemonic, bit New, bit Taken> {
+ string S = Mnemonic # !if(Taken, ":t", ":nt");
+}
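+// For example, T_JMPr_c<1, 1, 1> below formats as "if (!$src.new) jumpr:t $dst".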
+let isBranch = 1, isIndirectBranch = 1, Defs = [PC], isPredicated = 1,
+ hasSideEffects = 0, InputType = "reg", cofMax1 = 1 in
+class T_JMPr_c <bit PredNot, bit isPredNew, bit isTak>
+ : InstHexagon<(outs), (ins PredRegs:$src, IntRegs:$dst),
+ CondStr<"$src", !if(PredNot,0,1), isPredNew>.S #
+ JumpOpcStr<"jumpr", isPredNew, isTak>.S # " $dst",
+ [], "", J_tc_2early_SLOT2, TypeJ>, OpcodeHexagon {
+
+ let isTaken = isTak;
+ let isPredicatedFalse = PredNot;
+ let isPredicatedNew = isPredNew;
+ bits<2> src;
+ bits<5> dst;
+
+ let IClass = 0b0101;
+
+ let Inst{27-22} = 0b001101;
+ let Inst{21} = PredNot;
+ let Inst{20-16} = dst;
+ let Inst{12} = isTak;
+ let Inst{11} = isPredNew;
+ let Inst{9-8} = src;
+}
+multiclass JMPR_Pred<bit PredNot> {
+ def NAME : T_JMPr_c<PredNot, 0, 0>; // not taken
+ // Predicate new
+ def NAME#newpt : T_JMPr_c<PredNot, 1, 1>; // taken
+ def NAME#new : T_JMPr_c<PredNot, 1, 0>; // not taken
+}
+multiclass JMPR_base<string BaseOp> {
+ let BaseOpcode = BaseOp in {
+ def NAME : T_JMPr;
+ defm t : JMPR_Pred<0>;
+ defm f : JMPR_Pred<1>;
+ }
+}
+let isTerminator = 1, hasSideEffects = 0, isReturn = 1, isCodeGenOnly = 1, isBarrier = 1 in
+defm PS_jmpret : JMPR_base<"JMPret">, PredNewRel;
+
+//defm V6_vtran2x2_map : HexagonMapping<(outs VectorRegs:$Vy32, VectorRegs:$Vx32), (ins VectorRegs:$Vx32in, IntRegs:$Rt32), "vtrans2x2(${Vy32},${Vx32},${Rt32})", (V6_vshuff VectorRegs:$Vy32, VectorRegs:$Vx32, VectorRegs:$Vx32in, IntRegs:$Rt32)>;
+
+// The reason for the custom inserter is to record all ALLOCA instructions
+// in MachineFunctionInfo.
+let Defs = [R29], isCodeGenOnly = 1, isPseudo = 1, hasSideEffects = 1 in
+def PS_alloca: InstHexagon<(outs IntRegs:$Rd),
+ (ins IntRegs:$Rs, u32_0Imm:$A), "",
+ [], "", ALU32_2op_tc_1_SLOT0123, TypeALU32_2op>;
+
+// Load predicate.
+let isExtendable = 1, opExtendable = 2, isExtentSigned = 1, opExtentBits = 13,
+ isCodeGenOnly = 1, isPseudo = 1, hasSideEffects = 0 in
+def LDriw_pred : LDInst<(outs PredRegs:$dst),
+ (ins IntRegs:$addr, s32_0Imm:$off),
+ ".error \"should not emit\"", []>;
+
+// Load modifier.
+let isExtendable = 1, opExtendable = 2, isExtentSigned = 1, opExtentBits = 13,
+ isCodeGenOnly = 1, isPseudo = 1, hasSideEffects = 0 in
+def LDriw_mod : LDInst<(outs ModRegs:$dst),
+ (ins IntRegs:$addr, s32_0Imm:$off),
+ ".error \"should not emit\"", []>;
+
+// Vector load
+let Predicates = [HasV60T, UseHVX] in
+let mayLoad = 1, validSubTargets = HasV60SubT, hasSideEffects = 0 in
+ class V6_LDInst<dag outs, dag ins, string asmstr, list<dag> pattern = [],
+ string cstr = "", InstrItinClass itin = CVI_VM_LD,
+ IType type = TypeCVI_VM_LD>
+ : InstHexagon<outs, ins, asmstr, pattern, cstr, itin, type>;
+
+// Vector store
+let Predicates = [HasV60T, UseHVX] in
+let mayStore = 1, validSubTargets = HasV60SubT, hasSideEffects = 0 in
+class V6_STInst<dag outs, dag ins, string asmstr, list<dag> pattern = [],
+ string cstr = "", InstrItinClass itin = CVI_VM_ST,
+ IType type = TypeCVI_VM_ST>
+: InstHexagon<outs, ins, asmstr, pattern, cstr, itin, type>;
+
+let isCodeGenOnly = 1, isPseudo = 1 in
+def PS_pselect : ALU64_rr<(outs DoubleRegs:$Rd),
+ (ins PredRegs:$Pu, DoubleRegs:$Rs, DoubleRegs:$Rt),
+ ".error \"should not emit\" ", []>;
+
+let isBranch = 1, isBarrier = 1, Defs = [PC], hasSideEffects = 0,
+ isPredicable = 1,
+ isExtendable = 1, opExtendable = 0, isExtentSigned = 1,
+ opExtentBits = 24, opExtentAlign = 2, InputType = "imm" in
+class T_JMP<string ExtStr>
+ : JInst_CJUMP_UCJUMP<(outs), (ins b30_2Imm:$dst),
+ "jump " # ExtStr # "$dst",
+ [], "", J_tc_2early_CJUMP_UCJUMP_ARCHDEPSLOT> {
+ bits<24> dst;
+ let IClass = 0b0101;
+
+ let Inst{27-25} = 0b100;
+ let Inst{24-16} = dst{23-15};
+ let Inst{13-1} = dst{14-2};
+}
+
+// Call that restores registers and deallocates the frame, then returns.
+let isCall = 1, isBarrier = 1, isReturn = 1, isTerminator = 1,
+ Defs = [R29, R30, R31, PC], isPredicable = 0, isAsmParserOnly = 1 in {
+ def RESTORE_DEALLOC_RET_JMP_V4 : T_JMP<"">;
+
+ let isExtended = 1, opExtendable = 0 in
+ def RESTORE_DEALLOC_RET_JMP_V4_EXT : T_JMP<"">;
+
+ let Defs = [R14, R15, R28, R29, R30, R31, PC] in {
+ def RESTORE_DEALLOC_RET_JMP_V4_PIC : T_JMP<"">;
+
+ let isExtended = 1, opExtendable = 0 in
+ def RESTORE_DEALLOC_RET_JMP_V4_EXT_PIC : T_JMP<"">;
+ }
+}
+
+// Restore registers and dealloc frame before a tail call.
+let isCall = 1, Defs = [R29, R30, R31, PC], isAsmParserOnly = 1 in {
+ def RESTORE_DEALLOC_BEFORE_TAILCALL_V4 : T_Call<"">, PredRel;
+
+ let isExtended = 1, opExtendable = 0 in
+ def RESTORE_DEALLOC_BEFORE_TAILCALL_V4_EXT : T_Call<"">, PredRel;
+
+ let Defs = [R14, R15, R28, R29, R30, R31, PC] in {
+ def RESTORE_DEALLOC_BEFORE_TAILCALL_V4_PIC : T_Call<"">, PredRel;
+
+ let isExtended = 1, opExtendable = 0 in
+ def RESTORE_DEALLOC_BEFORE_TAILCALL_V4_EXT_PIC : T_Call<"">, PredRel;
+ }
+}
+
+// Function call that saves registers.
+let isCall = 1, Uses = [R29, R31], isAsmParserOnly = 1 in {
+ def SAVE_REGISTERS_CALL_V4 : T_Call<"">, PredRel;
+
+ let isExtended = 1, opExtendable = 0 in
+ def SAVE_REGISTERS_CALL_V4_EXT : T_Call<"">, PredRel;
+
+ let Defs = [P0] in
+ def SAVE_REGISTERS_CALL_V4STK : T_Call<"">, PredRel;
+
+ let Defs = [P0], isExtended = 1, opExtendable = 0 in
+ def SAVE_REGISTERS_CALL_V4STK_EXT : T_Call<"">, PredRel;
+
+ let Defs = [R14, R15, R28] in
+ def SAVE_REGISTERS_CALL_V4_PIC : T_Call<"">, PredRel;
+
+ let Defs = [R14, R15, R28], isExtended = 1, opExtendable = 0 in
+ def SAVE_REGISTERS_CALL_V4_EXT_PIC : T_Call<"">, PredRel;
+
+ let Defs = [R14, R15, R28, P0] in
+ def SAVE_REGISTERS_CALL_V4STK_PIC : T_Call<"">, PredRel;
+
+ let Defs = [R14, R15, R28, P0], isExtended = 1, opExtendable = 0 in
+ def SAVE_REGISTERS_CALL_V4STK_EXT_PIC : T_Call<"">, PredRel;
+}
+
+// Vector load/store pseudos
+
+let isPseudo = 1, isCodeGenOnly = 1, validSubTargets = HasV60SubT in
+class STrivv_template<RegisterClass RC>
+ : V6_STInst<(outs), (ins IntRegs:$addr, s32_0Imm:$off, RC:$src), "", []>;
+
+def PS_vstorerw_ai: STrivv_template<VecDblRegs>,
+ Requires<[HasV60T,UseHVXSgl]>;
+def PS_vstorerwu_ai: STrivv_template<VecDblRegs>,
+ Requires<[HasV60T,UseHVXSgl]>;
+def PS_vstorerw_ai_128B: STrivv_template<VecDblRegs128B>,
+ Requires<[HasV60T,UseHVXDbl]>;
+def PS_vstorerwu_ai_128B: STrivv_template<VecDblRegs128B>,
+ Requires<[HasV60T,UseHVXDbl]>;
+
+
+let isPseudo = 1, isCodeGenOnly = 1, validSubTargets = HasV60SubT in
+class LDrivv_template<RegisterClass RC>
+ : V6_LDInst<(outs RC:$dst), (ins IntRegs:$addr, s32_0Imm:$off), "", []>;
+
+def PS_vloadrw_ai: LDrivv_template<VecDblRegs>,
+ Requires<[HasV60T,UseHVXSgl]>;
+def PS_vloadrwu_ai: LDrivv_template<VecDblRegs>,
+ Requires<[HasV60T,UseHVXSgl]>;
+def PS_vloadrw_ai_128B: LDrivv_template<VecDblRegs128B>,
+ Requires<[HasV60T,UseHVXDbl]>;
+def PS_vloadrwu_ai_128B: LDrivv_template<VecDblRegs128B>,
+ Requires<[HasV60T,UseHVXDbl]>;
+
+// Store vector predicate pseudo.
+let isExtendable = 1, opExtendable = 1, isExtentSigned = 1, opExtentBits = 13,
+ isCodeGenOnly = 1, isPseudo = 1, mayStore = 1, hasSideEffects = 0 in {
+ def PS_vstorerq_ai : STInst<(outs),
+ (ins IntRegs:$base, s32_0Imm:$offset, VecPredRegs:$src1),
+ ".error \"should not emit\" ", []>,
+ Requires<[HasV60T,UseHVXSgl]>;
+
+ def PS_vstorerq_ai_128B : STInst<(outs),
+ (ins IntRegs:$base, s32_0Imm:$offset, VectorRegs:$src1),
+ ".error \"should not emit\" ", []>,
+ Requires<[HasV60T,UseHVXSgl]>;
+
+ def PS_vloadrq_ai : STInst<(outs),
+ (ins IntRegs:$base, s32_0Imm:$offset, VecPredRegs128B:$src1),
+ ".error \"should not emit\" ", []>,
+ Requires<[HasV60T,UseHVXDbl]>;
+
+ def PS_vloadrq_ai_128B : STInst<(outs),
+ (ins IntRegs:$base, s32_0Imm:$offset, VecPredRegs128B:$src1),
+ ".error \"should not emit\" ", []>,
+ Requires<[HasV60T,UseHVXDbl]>;
+}
+
+class VSELInst<dag outs, dag ins, string asmstr, list<dag> pattern = [],
+ string cstr = "", InstrItinClass itin = CVI_VA_DV,
+ IType type = TypeCVI_VA_DV>
+ : InstHexagon<outs, ins, asmstr, pattern, cstr, itin, type>;
+
+let isCodeGenOnly = 1, isPseudo = 1, hasSideEffects = 0 in {
+ def PS_vselect: VSELInst<(outs VectorRegs:$dst),
+ (ins PredRegs:$src1, VectorRegs:$src2, VectorRegs:$src3), "", []>,
+ Requires<[HasV60T,UseHVXSgl]>;
+ def PS_vselect_128B: VSELInst<(outs VectorRegs128B:$dst),
+ (ins PredRegs:$src1, VectorRegs128B:$src2, VectorRegs128B:$src3),
+ "", []>, Requires<[HasV60T,UseHVXDbl]>;
+ def PS_wselect: VSELInst<(outs VecDblRegs:$dst),
+ (ins PredRegs:$src1, VecDblRegs:$src2, VecDblRegs:$src3), "", []>,
+ Requires<[HasV60T,UseHVXSgl]>;
+ def PS_wselect_128B: VSELInst<(outs VecDblRegs128B:$dst),
+ (ins PredRegs:$src1, VecDblRegs128B:$src2, VecDblRegs128B:$src3),
+ "", []>, Requires<[HasV60T,UseHVXDbl]>;
+}
+
+// Store predicate.
+let isExtendable = 1, opExtendable = 1, isExtentSigned = 1, opExtentBits = 13,
+ isCodeGenOnly = 1, isPseudo = 1, hasSideEffects = 0 in
+def STriw_pred : STInst<(outs),
+ (ins IntRegs:$addr, s32_0Imm:$off, PredRegs:$src1),
+ ".error \"should not emit\"", []>;
+// Store modifier.
+let isExtendable = 1, opExtendable = 1, isExtentSigned = 1, opExtentBits = 13,
+ isCodeGenOnly = 1, isPseudo = 1, hasSideEffects = 0 in
+def STriw_mod : STInst<(outs),
+ (ins IntRegs:$addr, s32_0Imm:$off, ModRegs:$src1),
+ ".error \"should not emit\"", []>;
+
+let isExtendable = 1, opExtendable = 1, opExtentBits = 6,
+ isAsmParserOnly = 1 in
+def TFRI64_V4 : ALU64_rr<(outs DoubleRegs:$dst), (ins u64_0Imm:$src1),
+ "$dst = #$src1">;
+
+// Hexagon doesn't have a vector multiply with C semantics.
+// Instead, generate a pseudo instruction that ExpandPostRAPseudos later
+// expands into two scalar MPYI instructions.
+let isPseudo = 1 in
+def PS_vmulw : PseudoM<(outs DoubleRegs:$Rd),
+ (ins DoubleRegs:$Rs, DoubleRegs:$Rt), "", []>;
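+// Illustrative expansion (register numbers assumed): "$d0 = PS_vmulw $d1,$d2"
+// becomes one M2_mpyi per 32-bit lane, operating on the lo/hi subregisters.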
+
+let isPseudo = 1 in
+def PS_vmulw_acc : PseudoM<(outs DoubleRegs:$Rd),
+ (ins DoubleRegs:$Rx, DoubleRegs:$Rs, DoubleRegs:$Rt), "", [],
+ "$Rd = $Rx">;
+
+def DuplexIClass0: InstDuplex < 0 >;
+def DuplexIClass1: InstDuplex < 1 >;
+def DuplexIClass2: InstDuplex < 2 >;
+let isExtendable = 1 in {
+ def DuplexIClass3: InstDuplex < 3 >;
+ def DuplexIClass4: InstDuplex < 4 >;
+ def DuplexIClass5: InstDuplex < 5 >;
+ def DuplexIClass6: InstDuplex < 6 >;
+ def DuplexIClass7: InstDuplex < 7 >;
+}
+def DuplexIClass8: InstDuplex < 8 >;
+def DuplexIClass9: InstDuplex < 9 >;
+def DuplexIClassA: InstDuplex < 0xA >;
+def DuplexIClassB: InstDuplex < 0xB >;
+def DuplexIClassC: InstDuplex < 0xC >;
+def DuplexIClassD: InstDuplex < 0xD >;
+def DuplexIClassE: InstDuplex < 0xE >;
+def DuplexIClassF: InstDuplex < 0xF >;
diff --git a/lib/Target/Hexagon/HexagonRDFOpt.cpp b/lib/Target/Hexagon/HexagonRDFOpt.cpp
index 30640e19ebac..b3aba50b5625 100644
--- a/lib/Target/Hexagon/HexagonRDFOpt.cpp
+++ b/lib/Target/Hexagon/HexagonRDFOpt.cpp
@@ -94,7 +94,7 @@ struct HexagonDCE : public DeadCodeElimination {
bool HexagonCP::interpretAsCopy(const MachineInstr *MI, EqualityMap &EM) {
- auto mapRegs = [MI,&EM] (RegisterRef DstR, RegisterRef SrcR) -> void {
+ auto mapRegs = [&EM] (RegisterRef DstR, RegisterRef SrcR) -> void {
EM.insert(std::make_pair(DstR, SrcR));
};
diff --git a/lib/Target/Hexagon/HexagonRegisterInfo.cpp b/lib/Target/Hexagon/HexagonRegisterInfo.cpp
index d3f230d3f8a6..2a1bb63af789 100644
--- a/lib/Target/Hexagon/HexagonRegisterInfo.cpp
+++ b/lib/Target/Hexagon/HexagonRegisterInfo.cpp
@@ -36,6 +36,9 @@
#include "llvm/Target/TargetMachine.h"
#include "llvm/Target/TargetOptions.h"
+#define GET_REGINFO_TARGET_DESC
+#include "HexagonGenRegisterInfo.inc"
+
using namespace llvm;
HexagonRegisterInfo::HexagonRegisterInfo()
@@ -125,6 +128,7 @@ HexagonRegisterInfo::getCalleeSavedRegs(const MachineFunction *MF) const {
case HexagonSubtarget::V5:
case HexagonSubtarget::V55:
case HexagonSubtarget::V60:
+ case HexagonSubtarget::V62:
return HasEHReturn ? CalleeSavedRegsV3EHReturn : CalleeSavedRegsV3;
}
@@ -133,25 +137,47 @@ HexagonRegisterInfo::getCalleeSavedRegs(const MachineFunction *MF) const {
}
+const uint32_t *HexagonRegisterInfo::getCallPreservedMask(
+ const MachineFunction &MF, CallingConv::ID) const {
+ return HexagonCSR_RegMask;
+}
+
+
BitVector HexagonRegisterInfo::getReservedRegs(const MachineFunction &MF)
const {
BitVector Reserved(getNumRegs());
Reserved.set(Hexagon::R29);
Reserved.set(Hexagon::R30);
Reserved.set(Hexagon::R31);
- Reserved.set(Hexagon::PC);
- Reserved.set(Hexagon::D14);
- Reserved.set(Hexagon::D15);
- Reserved.set(Hexagon::LC0);
- Reserved.set(Hexagon::LC1);
- Reserved.set(Hexagon::SA0);
- Reserved.set(Hexagon::SA1);
- Reserved.set(Hexagon::UGP);
- Reserved.set(Hexagon::GP);
- Reserved.set(Hexagon::CS0);
- Reserved.set(Hexagon::CS1);
- Reserved.set(Hexagon::CS);
- Reserved.set(Hexagon::USR);
+ // Control registers.
+ Reserved.set(Hexagon::SA0); // C0
+ Reserved.set(Hexagon::LC0); // C1
+ Reserved.set(Hexagon::SA1); // C2
+ Reserved.set(Hexagon::LC1); // C3
+ Reserved.set(Hexagon::P3_0); // C4
+ Reserved.set(Hexagon::USR); // C8
+ Reserved.set(Hexagon::PC); // C9
+ Reserved.set(Hexagon::UGP); // C10
+ Reserved.set(Hexagon::GP); // C11
+ Reserved.set(Hexagon::CS0); // C12
+ Reserved.set(Hexagon::CS1); // C13
+ Reserved.set(Hexagon::UPCYCLELO); // C14
+ Reserved.set(Hexagon::UPCYCLEHI); // C15
+ Reserved.set(Hexagon::FRAMELIMIT); // C16
+ Reserved.set(Hexagon::FRAMEKEY); // C17
+ Reserved.set(Hexagon::PKTCOUNTLO); // C18
+ Reserved.set(Hexagon::PKTCOUNTHI); // C19
+ Reserved.set(Hexagon::UTIMERLO); // C30
+ Reserved.set(Hexagon::UTIMERHI); // C31
+ // Out of the control registers, only C8 is explicitly defined in
+ // HexagonRegisterInfo.td. If others are defined, make sure to add
+ // them here as well.
+ Reserved.set(Hexagon::C8);
+ Reserved.set(Hexagon::USR_OVF);
+
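+ // Reserving a register must also reserve every register that contains it
+ // as a subregister, so walk the set and mark all super-registers.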
+ for (int x = Reserved.find_first(); x >= 0; x = Reserved.find_next(x))
+ markSuperRegs(Reserved, x);
+
return Reserved;
}
@@ -267,6 +293,3 @@ unsigned HexagonRegisterInfo::getFirstCallerSavedNonParamReg() const {
return Hexagon::R6;
}
-
-#define GET_REGINFO_TARGET_DESC
-#include "HexagonGenRegisterInfo.inc"
diff --git a/lib/Target/Hexagon/HexagonRegisterInfo.h b/lib/Target/Hexagon/HexagonRegisterInfo.h
index 1fb295b5bd8c..8a3f175b8488 100644
--- a/lib/Target/Hexagon/HexagonRegisterInfo.h
+++ b/lib/Target/Hexagon/HexagonRegisterInfo.h
@@ -35,7 +35,8 @@ public:
/// Code Generation virtual methods...
const MCPhysReg *getCalleeSavedRegs(const MachineFunction *MF)
const override;
-
+ const uint32_t *getCallPreservedMask(const MachineFunction &MF,
+ CallingConv::ID) const override;
BitVector getReservedRegs(const MachineFunction &MF) const override;
diff --git a/lib/Target/Hexagon/HexagonRegisterInfo.td b/lib/Target/Hexagon/HexagonRegisterInfo.td
index a75f3514dbd2..93ab2f731207 100644
--- a/lib/Target/Hexagon/HexagonRegisterInfo.td
+++ b/lib/Target/Hexagon/HexagonRegisterInfo.td
@@ -140,41 +140,54 @@ let Namespace = "Hexagon" in {
}
// Control registers.
- def SA0 : Rc<0, "sa0", ["c0"]>, DwarfRegNum<[67]>;
- def LC0 : Rc<1, "lc0", ["c1"]>, DwarfRegNum<[68]>;
- def SA1 : Rc<2, "sa1", ["c2"]>, DwarfRegNum<[69]>;
- def LC1 : Rc<3, "lc1", ["c3"]>, DwarfRegNum<[70]>;
- def P3_0 : Rc<4, "p3:0", ["c4"], [P0, P1, P2, P3]>,
- DwarfRegNum<[71]>;
- def C5 : Rc<5, "c5", ["c5"]>, DwarfRegNum<[72]>; // future use
- def C6 : Rc<6, "c6", [], [M0]>, DwarfRegNum<[73]>;
- def C7 : Rc<7, "c7", [], [M1]>, DwarfRegNum<[74]>;
+ def SA0: Rc<0, "sa0", ["c0"]>, DwarfRegNum<[67]>;
+ def LC0: Rc<1, "lc0", ["c1"]>, DwarfRegNum<[68]>;
+ def SA1: Rc<2, "sa1", ["c2"]>, DwarfRegNum<[69]>;
+ def LC1: Rc<3, "lc1", ["c3"]>, DwarfRegNum<[70]>;
+ def P3_0: Rc<4, "p3:0", ["c4"], [P0, P1, P2, P3]>,
+ DwarfRegNum<[71]>;
+ // When defining more Cn registers, make sure to explicitly mark them
+ // as reserved in HexagonRegisterInfo.cpp.
+ def C5: Rc<5, "c5", ["c5"]>, DwarfRegNum<[72]>;
+ def C6: Rc<6, "c6", [], [M0]>, DwarfRegNum<[73]>;
+ def C7: Rc<7, "c7", [], [M1]>, DwarfRegNum<[74]>;
// Define C8 separately and make it aliased with USR.
// The problem is that USR has subregisters (e.g. overflow). If USR was
// specified as a subregister of C9_8, it would imply that subreg_overflow
// and isub_lo can be composed, which leads to all kinds of issues
// with lane masks.
- def C8 : Rc<8, "c8", [], [USR]>, DwarfRegNum<[75]>;
- def PC : Rc<9, "pc">, DwarfRegNum<[76]>;
- def UGP : Rc<10, "ugp", ["c10"]>, DwarfRegNum<[77]>;
- def GP : Rc<11, "gp", ["c11"]>, DwarfRegNum<[78]>;
- def CS0 : Rc<12, "cs0", ["c12"]>, DwarfRegNum<[79]>;
- def CS1 : Rc<13, "cs1", ["c13"]>, DwarfRegNum<[80]>;
- def UPCL : Rc<14, "upcyclelo", ["c14"]>, DwarfRegNum<[81]>;
- def UPCH : Rc<15, "upcyclehi", ["c15"]>, DwarfRegNum<[82]>;
+ def C8: Rc<8, "c8", [], [USR]>, DwarfRegNum<[75]>;
+ def PC: Rc<9, "pc">, DwarfRegNum<[76]>;
+ def UGP: Rc<10, "ugp", ["c10"]>, DwarfRegNum<[77]>;
+ def GP: Rc<11, "gp", ["c11"]>, DwarfRegNum<[78]>;
+ def CS0: Rc<12, "cs0", ["c12"]>, DwarfRegNum<[79]>;
+ def CS1: Rc<13, "cs1", ["c13"]>, DwarfRegNum<[80]>;
+ def UPCYCLELO: Rc<14, "upcyclelo", ["c14"]>, DwarfRegNum<[81]>;
+ def UPCYCLEHI: Rc<15, "upcyclehi", ["c15"]>, DwarfRegNum<[82]>;
+ def FRAMELIMIT: Rc<16, "framelimit", ["c16"]>, DwarfRegNum<[83]>;
+ def FRAMEKEY: Rc<17, "framekey", ["c17"]>, DwarfRegNum<[84]>;
+ def PKTCOUNTLO: Rc<18, "pktcountlo", ["c18"]>, DwarfRegNum<[85]>;
+ def PKTCOUNTHI: Rc<19, "pktcounthi", ["c19"]>, DwarfRegNum<[86]>;
+ def UTIMERLO: Rc<30, "utimerlo", ["c30"]>, DwarfRegNum<[97]>;
+ def UTIMERHI: Rc<31, "utimerhi", ["c31"]>, DwarfRegNum<[98]>;
}
// Control registers pairs.
let SubRegIndices = [isub_lo, isub_hi], CoveredBySubRegs = 1 in {
- def C1_0 : Rcc<0, "c1:0", [SA0, LC0], ["lc0:sa0"]>, DwarfRegNum<[67]>;
- def C3_2 : Rcc<2, "c3:2", [SA1, LC1], ["lc1:sa1"]>, DwarfRegNum<[69]>;
- def C5_4 : Rcc<4, "c5:4", [P3_0, C5]>, DwarfRegNum<[71]>;
- def C7_6 : Rcc<6, "c7:6", [C6, C7], ["m1:0"]>, DwarfRegNum<[72]>;
+ def C1_0: Rcc<0, "c1:0", [SA0, LC0], ["lc0:sa0"]>, DwarfRegNum<[67]>;
+ def C3_2: Rcc<2, "c3:2", [SA1, LC1], ["lc1:sa1"]>, DwarfRegNum<[69]>;
+ def C5_4: Rcc<4, "c5:4", [P3_0, C5]>, DwarfRegNum<[71]>;
+ def C7_6: Rcc<6, "c7:6", [C6, C7], ["m1:0"]>, DwarfRegNum<[72]>;
// Use C8 instead of USR as a subregister of C9_8.
- def C9_8 : Rcc<8, "c9:8", [C8, PC]>, DwarfRegNum<[74]>;
- def C11_10 : Rcc<10, "c11:10", [UGP, GP]>, DwarfRegNum<[76]>;
- def CS : Rcc<12, "c13:12", [CS0, CS1], ["cs1:0"]>, DwarfRegNum<[78]>;
- def UPC : Rcc<14, "c15:14", [UPCL, UPCH]>, DwarfRegNum<[80]>;
+ def C9_8: Rcc<8, "c9:8", [C8, PC]>, DwarfRegNum<[74]>;
+ def C11_10: Rcc<10, "c11:10", [UGP, GP]>, DwarfRegNum<[76]>;
+ def CS: Rcc<12, "c13:12", [CS0, CS1], ["cs1:0"]>, DwarfRegNum<[78]>;
+ def UPCYCLE: Rcc<14, "c15:14", [UPCYCLELO, UPCYCLEHI]>, DwarfRegNum<[80]>;
+ def C17_16: Rcc<16, "c17:16", [FRAMELIMIT, FRAMEKEY]>, DwarfRegNum<[83]>;
+ def PKTCOUNT: Rcc<18, "c19:18", [PKTCOUNTLO, PKTCOUNTHI], ["pktcount"]>,
+ DwarfRegNum<[85]>;
+ def UTIMER: Rcc<30, "c31:30", [UTIMERLO, UTIMERHI], ["utimer"]>,
+ DwarfRegNum<[97]>;
}
foreach i = 0-31 in {
@@ -219,6 +232,10 @@ def IntRegs : RegisterClass<"Hexagon", [i32, f32, v4i8, v2i16], 32,
}
// Registers are listed in reverse order for allocation preference reasons.
+def GeneralSubRegs : RegisterClass<"Hexagon", [i32], 32,
+ (add R23, R22, R21, R20, R19, R18, R17,
+ R16, R7, R6, R5, R4, R3, R2, R1, R0)>;
+
def IntRegsLow8 : RegisterClass<"Hexagon", [i32], 32,
(add R7, R6, R5, R4, R3, R2, R1, R0)> ;
@@ -226,6 +243,10 @@ def DoubleRegs : RegisterClass<"Hexagon", [i64, f64, v8i8, v4i16, v2i32], 64,
(add (sequence "D%u", 0, 4),
(sequence "D%u", 6, 13), D5, D14, D15)>;
+def GeneralDoubleLow8Regs : RegisterClass<"Hexagon", [i64], 64,
+ (add D11, D10, D9, D8, D3, D2, D1,
+ D0)>;
+
def VectorRegs : RegisterClass<"Hexagon", [v64i8, v32i16, v16i32, v8i64], 512,
(add (sequence "V%u", 0, 31))>;
@@ -259,28 +280,28 @@ def ModRegs : RegisterClass<"Hexagon", [i32], 32, (add M0, M1)>;
let Size = 32, isAllocatable = 0 in
def CtrRegs : RegisterClass<"Hexagon", [i32], 32,
- (add LC0, SA0, LC1, SA1,
- P3_0, C5,
- M0, M1, C6, C7, C8, CS0, CS1, UPCL, UPCH,
- USR, UGP, GP, PC)>;
+ (add LC0, SA0, LC1, SA1, P3_0, C5, C6, C7,
+ C8, PC, UGP, GP, CS0, CS1, UPCYCLELO, UPCYCLEHI,
+ FRAMELIMIT, FRAMEKEY, PKTCOUNTLO, PKTCOUNTHI, UTIMERLO, UTIMERHI,
+ M0, M1, USR)>;
let isAllocatable = 0 in
def UsrBits : RegisterClass<"Hexagon", [i1], 0, (add USR_OVF)>;
let Size = 64, isAllocatable = 0 in
def CtrRegs64 : RegisterClass<"Hexagon", [i64], 64,
- (add C1_0, C3_2, C7_6, C9_8, C11_10, CS, UPC)>;
-
-def VolatileV3 {
- list<Register> Regs = [D0, D1, D2, D3, D4, D5, D6, D7,
- R28, R31,
- P0, P1, P2, P3,
- M0, M1,
- LC0, LC1, SA0, SA1, USR, USR_OVF, CS0, CS1,
- V0, V1, V2, V3, V4, V5, V6, V7, V8, V9, V10, V11,
- V12, V13, V14, V15, V16, V17, V18, V19, V20, V21,
- V22, V23, V24, V25, V26, V27, V28, V29, V30, V31,
- W0, W1, W2, W3, W4, W5, W6, W7, W8, W9, W10, W11,
- W12, W13, W14, W15,
- Q0, Q1, Q2, Q3];
-}
+ (add C1_0, C3_2, C5_4, C7_6, C9_8, C11_10, CS, UPCYCLE, C17_16,
+ PKTCOUNT, UTIMER)>;
+
+// These registers are new for v62 and onward.
+// The function RegisterMatchesArch() uses this list for validation.
+let isAllocatable = 0 in
+def V62Regs : RegisterClass<"Hexagon", [i32], 32,
+ (add FRAMELIMIT, FRAMEKEY, C17_16,
+ PKTCOUNTLO, PKTCOUNTHI, PKTCOUNT,
+ UTIMERLO, UTIMERHI, UTIMER)>;
+
+
+def HexagonCSR
+ : CalleeSavedRegs<(add R16, R17, R18, R19, R20, R21, R22, R23,
+ R24, R25, R26, R27)>;
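+
+// TableGen emits HexagonCSR_RegMask (and a save list) from this definition;
+// it is the mask returned by getCallPreservedMask() in HexagonRegisterInfo.cpp.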
diff --git a/lib/Target/Hexagon/HexagonSchedule.td b/lib/Target/Hexagon/HexagonSchedule.td
index 6e4987b7e4e3..9b5fbea04d18 100644
--- a/lib/Target/Hexagon/HexagonSchedule.td
+++ b/lib/Target/Hexagon/HexagonSchedule.td
@@ -21,4 +21,12 @@ include "HexagonScheduleV55.td"
//===----------------------------------------------------------------------===//
include "HexagonScheduleV60.td"
+include "HexagonIICScalar.td"
+include "HexagonIICHVX.td"
+
+//===----------------------------------------------------------------------===//
+// V62 Machine Info +
+//===----------------------------------------------------------------------===//
+
+include "HexagonScheduleV62.td"
diff --git a/lib/Target/Hexagon/HexagonScheduleV4.td b/lib/Target/Hexagon/HexagonScheduleV4.td
index 7416baab392c..880cc0a02b6a 100644
--- a/lib/Target/Hexagon/HexagonScheduleV4.td
+++ b/lib/Target/Hexagon/HexagonScheduleV4.td
@@ -61,15 +61,21 @@ def J_tc_2early_SLOT23 : InstrItinClass;
def J_tc_2early_CJUMP_UCJUMP_ARCHDEPSLOT : InstrItinClass;
def J_tc_2early_SLOT2 : InstrItinClass;
def LD_tc_ld_SLOT01 : InstrItinClass;
+def LD_tc_ld_pi_SLOT01 : InstrItinClass;
def LD_tc_ld_SLOT0 : InstrItinClass;
def LD_tc_3or4stall_SLOT0 : InstrItinClass;
def M_tc_2_SLOT23 : InstrItinClass;
+def M_tc_2_acc_SLOT23 : InstrItinClass;
def M_tc_3_SLOT23 : InstrItinClass;
def M_tc_1_SLOT23 : InstrItinClass;
def M_tc_3x_SLOT23 : InstrItinClass;
+def M_tc_3x_acc_SLOT23 : InstrItinClass;
def M_tc_3or4x_SLOT23 : InstrItinClass;
+def M_tc_3or4x_acc_SLOT23 : InstrItinClass;
def ST_tc_st_SLOT01 : InstrItinClass;
+def ST_tc_st_pi_SLOT01 : InstrItinClass;
def ST_tc_st_SLOT0 : InstrItinClass;
+def ST_tc_st_pi_SLOT0 : InstrItinClass;
def ST_tc_ld_SLOT0 : InstrItinClass;
def ST_tc_3stall_SLOT0 : InstrItinClass;
def S_2op_tc_1_SLOT23 : InstrItinClass;
@@ -131,21 +137,27 @@ def HexagonItinerariesV4 :
//Load
InstrItinData<LD_tc_ld_SLOT01 , [InstrStage<1, [SLOT0, SLOT1]>]>,
+ InstrItinData<LD_tc_ld_pi_SLOT01 , [InstrStage<1, [SLOT0, SLOT1]>]>,
InstrItinData<LD_tc_ld_SLOT0 , [InstrStage<1, [SLOT0]>]>,
InstrItinData<LD_tc_3or4stall_SLOT0 , [InstrStage<1, [SLOT0]>]>,
// M
InstrItinData<M_tc_1_SLOT23 , [InstrStage<1, [SLOT2, SLOT3]>]>,
InstrItinData<M_tc_2_SLOT23 , [InstrStage<1, [SLOT2, SLOT3]>]>,
+ InstrItinData<M_tc_2_acc_SLOT23 , [InstrStage<1, [SLOT2, SLOT3]>]>,
InstrItinData<M_tc_3_SLOT23 , [InstrStage<1, [SLOT2, SLOT3]>]>,
InstrItinData<M_tc_3x_SLOT23 , [InstrStage<1, [SLOT2, SLOT3]>]>,
+ InstrItinData<M_tc_3x_acc_SLOT23 , [InstrStage<1, [SLOT2, SLOT3]>]>,
InstrItinData<M_tc_3or4x_SLOT23 , [InstrStage<1, [SLOT2, SLOT3]>]>,
+ InstrItinData<M_tc_3or4x_acc_SLOT23 , [InstrStage<1, [SLOT2, SLOT3]>]>,
// Store
// ST
InstrItinData<ST_tc_st_SLOT01 , [InstrStage<1, [SLOT0, SLOT1]>]>,
+ InstrItinData<ST_tc_st_pi_SLOT01 , [InstrStage<1, [SLOT0, SLOT1]>]>,
// ST0
InstrItinData<ST_tc_st_SLOT0 , [InstrStage<1, [SLOT0]>]>,
+ InstrItinData<ST_tc_st_pi_SLOT0 , [InstrStage<1, [SLOT0]>]>,
InstrItinData<ST_tc_ld_SLOT0 , [InstrStage<1, [SLOT0]>]>,
// S
diff --git a/lib/Target/Hexagon/HexagonScheduleV55.td b/lib/Target/Hexagon/HexagonScheduleV55.td
index b2a75f7200d7..06cbcb16abb7 100644
--- a/lib/Target/Hexagon/HexagonScheduleV55.td
+++ b/lib/Target/Hexagon/HexagonScheduleV55.td
@@ -88,6 +88,8 @@ def HexagonItinerariesV55 :
// Load
InstrItinData<LD_tc_ld_SLOT01 , [InstrStage<1, [SLOT0, SLOT1]>],
[2, 1]>,
+ InstrItinData<LD_tc_ld_pi_SLOT01 , [InstrStage<1, [SLOT0, SLOT1]>],
+ [2, 1]>,
InstrItinData<LD_tc_3or4stall_SLOT0, [InstrStage<1, [SLOT0]>], [2, 1]>,
InstrItinData<LD_tc_ld_SLOT0 , [InstrStage<1, [SLOT0]>], [2, 1]>,
@@ -96,21 +98,30 @@ def HexagonItinerariesV55 :
[1, 1, 1]>,
InstrItinData<M_tc_2_SLOT23 , [InstrStage<1, [SLOT2, SLOT3]>],
[2, 1, 1]>,
+ InstrItinData<M_tc_2_acc_SLOT23 , [InstrStage<1, [SLOT2, SLOT3]>],
+ [2, 1, 1]>,
InstrItinData<M_tc_3_SLOT23 , [InstrStage<1, [SLOT2, SLOT3]>],
[1, 1, 1]>,
InstrItinData<M_tc_3x_SLOT23 , [InstrStage<1, [SLOT2, SLOT3]>],
[3, 1, 1]>,
+ InstrItinData<M_tc_3x_acc_SLOT23, [InstrStage<1, [SLOT2, SLOT3]>],
+ [3, 1, 1, 1]>,
InstrItinData<M_tc_3or4x_SLOT23 , [InstrStage<1, [SLOT2, SLOT3]>],
[3, 1, 1]>,
+ InstrItinData<M_tc_3or4x_acc_SLOT23 , [InstrStage<1, [SLOT2, SLOT3]>],
+ [3, 1, 1]>,
InstrItinData<M_tc_3stall_SLOT23, [InstrStage<1, [SLOT2, SLOT3]>],
[3, 1, 1]>,
// Store
InstrItinData<ST_tc_st_SLOT01 , [InstrStage<1, [SLOT0, SLOT1]>],
[1, 1, 1]>,
+ InstrItinData<ST_tc_st_pi_SLOT01, [InstrStage<1, [SLOT0, SLOT1]>],
+ [1, 1, 1]>,
InstrItinData<ST_tc_3stall_SLOT0, [InstrStage<1, [SLOT0]>], [2, 1, 1]>,
InstrItinData<ST_tc_ld_SLOT0 , [InstrStage<1, [SLOT0]>], [2, 1, 1]>,
InstrItinData<ST_tc_st_SLOT0 , [InstrStage<1, [SLOT0]>], [1, 1, 1]>,
+ InstrItinData<ST_tc_st_pi_SLOT0 , [InstrStage<1, [SLOT0]>], [1, 1, 1]>,
// S
InstrItinData<S_2op_tc_1_SLOT23 , [InstrStage<1, [SLOT2, SLOT3]>],
diff --git a/lib/Target/Hexagon/HexagonScheduleV60.td b/lib/Target/Hexagon/HexagonScheduleV60.td
index dc2ce43b0579..63784710f52b 100644
--- a/lib/Target/Hexagon/HexagonScheduleV60.td
+++ b/lib/Target/Hexagon/HexagonScheduleV60.td
@@ -19,6 +19,8 @@ def CVI_LD : FuncUnit;
def CVI_XLSHF : FuncUnit;
def CVI_MPY01 : FuncUnit;
def CVI_ALL : FuncUnit;
+def CVI_XLMPY0 : FuncUnit;
+def CVI_SHFMPY1: FuncUnit;
// Combined functional unit data.
def HexagonComboFuncsV60 :
@@ -26,7 +28,9 @@ def HexagonComboFuncsV60 :
ComboFuncData<CVI_XLSHF , [CVI_XLANE, CVI_SHIFT]>,
ComboFuncData<CVI_MPY01 , [CVI_MPY0, CVI_MPY1]>,
ComboFuncData<CVI_ALL , [CVI_ST, CVI_XLANE, CVI_SHIFT,
- CVI_MPY0, CVI_MPY1, CVI_LD]>
+ CVI_MPY0, CVI_MPY1, CVI_LD]>,
+ ComboFuncData<CVI_XLMPY0 , [CVI_XLANE, CVI_MPY0]>,
+ ComboFuncData<CVI_SHFMPY1 , [CVI_SHIFT, CVI_MPY1]>
]>;
// Note: When adding additional vector scheduling classes, add the
@@ -39,6 +43,7 @@ def CVI_VX : InstrItinClass;
def CVI_VX_DV_LONG : InstrItinClass;
def CVI_VX_DV : InstrItinClass;
def CVI_VX_DV_SLOT2 : InstrItinClass;
+def CVI_VX_DV_SLOT2_LONG_EARLY : InstrItinClass;
def CVI_VP : InstrItinClass;
def CVI_VP_LONG : InstrItinClass;
def CVI_VP_VS_EARLY : InstrItinClass;
@@ -150,22 +155,28 @@ def HexagonItinerariesV60 :
// Load
InstrItinData<LD_tc_ld_SLOT01 , [InstrStage<3, [SLOT0, SLOT1]>]>,
+ InstrItinData<LD_tc_ld_pi_SLOT01 , [InstrStage<3, [SLOT0, SLOT1]>]>,
InstrItinData<LD_tc_3or4stall_SLOT0, [InstrStage<4, [SLOT0]>]>,
InstrItinData<LD_tc_ld_SLOT0 , [InstrStage<3, [SLOT0]>]>,
// M
InstrItinData<M_tc_1_SLOT23 , [InstrStage<1, [SLOT2, SLOT3]>]>,
InstrItinData<M_tc_2_SLOT23 , [InstrStage<2, [SLOT2, SLOT3]>]>,
+ InstrItinData<M_tc_2_acc_SLOT23 , [InstrStage<2, [SLOT2, SLOT3]>]>,
InstrItinData<M_tc_3_SLOT23 , [InstrStage<3, [SLOT2, SLOT3]>]>,
InstrItinData<M_tc_3x_SLOT23 , [InstrStage<3, [SLOT2, SLOT3]>]>,
+ InstrItinData<M_tc_3x_acc_SLOT23, [InstrStage<3, [SLOT2, SLOT3]>]>,
InstrItinData<M_tc_3or4x_SLOT23 , [InstrStage<4, [SLOT2, SLOT3]>]>,
+ InstrItinData<M_tc_3or4x_acc_SLOT23 , [InstrStage<4, [SLOT2, SLOT3]>]>,
InstrItinData<M_tc_3stall_SLOT23, [InstrStage<3, [SLOT2, SLOT3]>]>,
// Store
InstrItinData<ST_tc_st_SLOT01 , [InstrStage<1, [SLOT0, SLOT1]>]>,
+ InstrItinData<ST_tc_st_pi_SLOT01, [InstrStage<1, [SLOT0, SLOT1]>]>,
InstrItinData<ST_tc_3stall_SLOT0, [InstrStage<3, [SLOT0]>]>,
InstrItinData<ST_tc_ld_SLOT0 , [InstrStage<3, [SLOT0]>]>,
InstrItinData<ST_tc_st_SLOT0 , [InstrStage<1, [SLOT0]>]>,
+ InstrItinData<ST_tc_st_pi_SLOT0 , [InstrStage<1, [SLOT0]>]>,
// S
InstrItinData<S_2op_tc_1_SLOT23 , [InstrStage<1, [SLOT2, SLOT3]>]>,
diff --git a/lib/Target/Hexagon/HexagonScheduleV62.td b/lib/Target/Hexagon/HexagonScheduleV62.td
new file mode 100644
index 000000000000..0758788a600b
--- /dev/null
+++ b/lib/Target/Hexagon/HexagonScheduleV62.td
@@ -0,0 +1,129 @@
+//=-HexagonScheduleV62.td - HexagonV62 Scheduling Definitions *- tablegen -*-=//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+// V62 follows the same schedule as V60, with the following exceptions:
+// The following instructions are permissible in any slot on V62:
+// V4_J4_cmpeq_fp0_jump_nt
+// V4_J4_cmpeq_fp0_jump_t
+// V4_J4_cmpeq_fp1_jump_nt
+// V4_J4_cmpeq_fp1_jump_t
+// V4_J4_cmpeq_tp0_jump_nt
+// V4_J4_cmpeq_tp0_jump_t
+// V4_J4_cmpeq_tp1_jump_nt
+// V4_J4_cmpeq_tp1_jump_t
+// V4_J4_cmpeqi_fp0_jump_nt
+// V4_J4_cmpeqi_fp0_jump_t
+// V4_J4_cmpeqi_fp1_jump_nt
+// V4_J4_cmpeqi_fp1_jump_t
+// V4_J4_cmpeqi_tp0_jump_nt
+// V4_J4_cmpeqi_tp0_jump_t
+// V4_J4_cmpeqi_tp1_jump_nt
+// V4_J4_cmpeqi_tp1_jump_t
+// V4_J4_cmpeqn1_fp0_jump_nt
+// V4_J4_cmpeqn1_fp0_jump_t
+// V4_J4_cmpeqn1_fp1_jump_nt
+// V4_J4_cmpeqn1_fp1_jump_t
+// V4_J4_cmpeqn1_tp0_jump_nt
+// V4_J4_cmpeqn1_tp0_jump_t
+// V4_J4_cmpeqn1_tp1_jump_nt
+// V4_J4_cmpeqn1_tp1_jump_t
+// V4_J4_cmpgt_fp0_jump_nt
+// V4_J4_cmpgt_fp0_jump_t
+// V4_J4_cmpgt_fp1_jump_nt
+// V4_J4_cmpgt_fp1_jump_t
+// V4_J4_cmpgt_tp0_jump_nt
+// V4_J4_cmpgt_tp0_jump_t
+// V4_J4_cmpgt_tp1_jump_nt
+// V4_J4_cmpgt_tp1_jump_t
+// V4_J4_cmpgti_fp0_jump_nt
+// V4_J4_cmpgti_fp0_jump_t
+// V4_J4_cmpgti_fp1_jump_nt
+// V4_J4_cmpgti_fp1_jump_t
+// V4_J4_cmpgti_tp0_jump_nt
+// V4_J4_cmpgti_tp0_jump_t
+// V4_J4_cmpgti_tp1_jump_nt
+// V4_J4_cmpgti_tp1_jump_t
+// V4_J4_cmpgtn1_fp0_jump_nt
+// V4_J4_cmpgtn1_fp0_jump_t
+// V4_J4_cmpgtn1_fp1_jump_nt
+// V4_J4_cmpgtn1_fp1_jump_t
+// V4_J4_cmpgtn1_tp0_jump_nt
+// V4_J4_cmpgtn1_tp0_jump_t
+// V4_J4_cmpgtn1_tp1_jump_nt
+// V4_J4_cmpgtn1_tp1_jump_t
+// V4_J4_cmpgtu_fp0_jump_nt
+// V4_J4_cmpgtu_fp0_jump_t
+// V4_J4_cmpgtu_fp1_jump_nt
+// V4_J4_cmpgtu_fp1_jump_t
+// V4_J4_cmpgtu_tp0_jump_nt
+// V4_J4_cmpgtu_tp0_jump_t
+// V4_J4_cmpgtu_tp1_jump_nt
+// V4_J4_cmpgtu_tp1_jump_t
+// V4_J4_cmpgtui_fp0_jump_nt
+// V4_J4_cmpgtui_fp0_jump_t
+// V4_J4_cmpgtui_fp1_jump_nt
+// V4_J4_cmpgtui_fp1_jump_t
+// V4_J4_cmpgtui_tp0_jump_nt
+// V4_J4_cmpgtui_tp0_jump_t
+// V4_J4_cmpgtui_tp1_jump_nt
+// V4_J4_cmpgtui_tp1_jump_t
+// V4_J4_tstbit0_fp0_jump_nt
+// V4_J4_tstbit0_fp0_jump_t
+// V4_J4_tstbit0_fp1_jump_nt
+// V4_J4_tstbit0_fp1_jump_t
+// V4_J4_tstbit0_tp0_jump_nt
+// V4_J4_tstbit0_tp0_jump_t
+// V4_J4_tstbit0_tp1_jump_nt
+// V4_J4_tstbit0_tp1_jump_t
+// JMP
+// JMPEXT
+// JMPEXT_f
+// JMPEXT_fnew_nt
+// JMPEXT_fnew_t
+// JMPEXT_t
+// JMPEXT_tnew_nt
+// JMPEXT_tnew_t
+// JMPNOTEXT
+// JMPNOTEXT_f
+// JMPNOTEXT_fnew_nt
+// JMPNOTEXT_fnew_t
+// JMPNOTEXT_t
+// JMPNOTEXT_tnew_nt
+// JMPNOTEXT_tnew_t
+// JMP_f
+// JMP_fnew_nt
+// JMP_fnew_t
+// JMP_t
+// JMP_tnew_nt
+// JMP_tnew_t
+// RESTORE_DEALLOC_RET_JMP_V4
+// RESTORE_DEALLOC_RET_JMP_V4_EXT
+
+def HexagonV62ItinList : ScalarItin, HVXV62Itin {
+ list<InstrItinData> ItinList =
+ !listconcat(ScalarItin_list, HVXV62Itin_list);
+}
+
+def HexagonItinerariesV62 :
+ ProcessorItineraries<[SLOT0, SLOT1, SLOT2, SLOT3, SLOT_ENDLOOP,
+ CVI_ST, CVI_XLANE, CVI_SHIFT, CVI_MPY0, CVI_MPY1,
+ CVI_LD, CVI_XLSHF, CVI_MPY01, CVI_ALL],
+ [], HexagonV62ItinList.ItinList>;
+
+def HexagonModelV62 : SchedMachineModel {
+ // Max issue per cycle == bundle width.
+ let IssueWidth = 4;
+ let Itineraries = HexagonItinerariesV62;
+ let LoadLatency = 1;
+ let CompleteModel = 0;
+}
+
+//===----------------------------------------------------------------------===//
+// Hexagon V62 Resource Definitions -
+//===----------------------------------------------------------------------===//
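
One way to sanity-check the new model once the CPU name is wired through the subtarget table: resolve the v62 itineraries via the MC layer. A sketch assuming the Hexagon target has been registered; not part of this patch.

  std::string Err;
  const llvm::Target *T = llvm::TargetRegistry::lookupTarget("hexagon", Err);
  std::unique_ptr<llvm::MCSubtargetInfo> STI(
      T->createMCSubtargetInfo("hexagon", "hexagonv62", ""));
  llvm::InstrItineraryData IID = STI->getInstrItineraryForCPU("hexagonv62");
  assert(!IID.isEmpty() && "v62 itineraries should resolve");
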
diff --git a/lib/Target/Hexagon/HexagonSelectionDAGInfo.cpp b/lib/Target/Hexagon/HexagonSelectionDAGInfo.cpp
index 10730536080e..002e87fb32ce 100644
--- a/lib/Target/Hexagon/HexagonSelectionDAGInfo.cpp
+++ b/lib/Target/Hexagon/HexagonSelectionDAGInfo.cpp
@@ -51,11 +51,12 @@ SDValue HexagonSelectionDAGInfo::EmitTargetCodeForMemcpy(
TargetLowering::CallLoweringInfo CLI(DAG);
CLI.setDebugLoc(dl)
.setChain(Chain)
- .setCallee(TLI.getLibcallCallingConv(RTLIB::MEMCPY),
- Type::getVoidTy(*DAG.getContext()),
- DAG.getTargetExternalSymbol(SpecialMemcpyName,
- TLI.getPointerTy(DAG.getDataLayout()), Flags),
- std::move(Args))
+ .setLibCallee(
+ TLI.getLibcallCallingConv(RTLIB::MEMCPY),
+ Type::getVoidTy(*DAG.getContext()),
+ DAG.getTargetExternalSymbol(
+ SpecialMemcpyName, TLI.getPointerTy(DAG.getDataLayout()), Flags),
+ std::move(Args))
.setDiscardResult();
std::pair<SDValue, SDValue> CallResult = TLI.LowerCallTo(CLI);
diff --git a/lib/Target/Hexagon/HexagonSplitDouble.cpp b/lib/Target/Hexagon/HexagonSplitDouble.cpp
index 2c937216d463..471e32221b29 100644
--- a/lib/Target/Hexagon/HexagonSplitDouble.cpp
+++ b/lib/Target/Hexagon/HexagonSplitDouble.cpp
@@ -131,13 +131,15 @@ namespace {
INITIALIZE_PASS(HexagonSplitDoubleRegs, "hexagon-split-double",
"Hexagon Split Double Registers", false, false)
-void HexagonSplitDoubleRegs::dump_partition(raw_ostream &os,
+#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
+LLVM_DUMP_METHOD void HexagonSplitDoubleRegs::dump_partition(raw_ostream &os,
const USet &Part, const TargetRegisterInfo &TRI) {
dbgs() << '{';
for (auto I : Part)
dbgs() << ' ' << PrintReg(I, &TRI);
dbgs() << " }";
}
+#endif
bool HexagonSplitDoubleRegs::isInduction(unsigned Reg, LoopRegMap &IRM) const {
for (auto I : IRM) {
@@ -391,7 +393,7 @@ int32_t HexagonSplitDoubleRegs::profit(const MachineInstr *MI) const {
bool HexagonSplitDoubleRegs::isProfitable(const USet &Part, LoopRegMap &IRM)
const {
- unsigned FixedNum = 0, SplitNum = 0, LoopPhiNum = 0;
+ unsigned FixedNum = 0, LoopPhiNum = 0;
int32_t TotalP = 0;
for (unsigned DR : Part) {
@@ -428,7 +430,6 @@ bool HexagonSplitDoubleRegs::isProfitable(const USet &Part, LoopRegMap &IRM)
LoopPhiNum++;
}
// Splittable instruction.
- SplitNum++;
int32_t P = profit(UseI);
if (P == std::numeric_limits<int>::min())
return false;
diff --git a/lib/Target/Hexagon/HexagonSubtarget.cpp b/lib/Target/Hexagon/HexagonSubtarget.cpp
index 8c23a2465dd6..033b93fc910a 100644
--- a/lib/Target/Hexagon/HexagonSubtarget.cpp
+++ b/lib/Target/Hexagon/HexagonSubtarget.cpp
@@ -88,6 +88,7 @@ HexagonSubtarget::initializeSubtargetDependencies(StringRef CPU, StringRef FS) {
{ "hexagonv5", V5 },
{ "hexagonv55", V55 },
{ "hexagonv60", V60 },
+ { "hexagonv62", V62 },
};
auto foundIt = CpuTable.find(CPUString);
diff --git a/lib/Target/Hexagon/HexagonSubtarget.h b/lib/Target/Hexagon/HexagonSubtarget.h
index f2b9cdaad1ae..6a3e7f13be4c 100644
--- a/lib/Target/Hexagon/HexagonSubtarget.h
+++ b/lib/Target/Hexagon/HexagonSubtarget.h
@@ -38,9 +38,7 @@ class HexagonSubtarget : public HexagonGenSubtargetInfo {
bool ModeIEEERndNear;
public:
- enum HexagonArchEnum {
- V4, V5, V55, V60
- };
+#include "HexagonDepArch.h"
HexagonArchEnum HexagonArchVersion;
/// True if the target should use Back-Skip-Back scheduling. This is the
@@ -98,6 +96,9 @@ public:
bool hasV55TOpsOnly() const { return getHexagonArchVersion() == V55; }
bool hasV60TOps() const { return getHexagonArchVersion() >= V60; }
bool hasV60TOpsOnly() const { return getHexagonArchVersion() == V60; }
+ bool hasV62TOps() const { return getHexagonArchVersion() >= V62; }
+ bool hasV62TOpsOnly() const { return getHexagonArchVersion() == V62; }
+
bool modeIEEERndNear() const { return ModeIEEERndNear; }
bool useHVXOps() const { return UseHVXOps; }
bool useHVXDblOps() const { return UseHVXOps && UseHVXDblOps; }
diff --git a/lib/Target/Hexagon/HexagonSystemInst.td b/lib/Target/Hexagon/HexagonSystemInst.td
deleted file mode 100644
index 629a98749ee9..000000000000
--- a/lib/Target/Hexagon/HexagonSystemInst.td
+++ /dev/null
@@ -1,134 +0,0 @@
-//==- HexagonSystemInst.td - System Instructions for Hexagon -*- tablegen -*-==//
-//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
-//
-//===----------------------------------------------------------------------===//
-//
-// This file describes the Hexagon instructions in TableGen format.
-//
-//===----------------------------------------------------------------------===//
-
-//===----------------------------------------------------------------------===//
-// Cache manipulation instructions.
-//===----------------------------------------------------------------------===//
-let mayStore = 1 in
-class ST_MISC_CACHEOP<dag outs, dag ins,
- string asmstr, list<dag> pattern = [],
- bits<3> amode, bits<3> type, bits<1> un>
- : ST0Inst<outs, ins, asmstr, pattern, "", ST_tc_ld_SLOT0> {
-
- bits<5> Rs;
- bits<5> Rt;
- bits<5> Rd;
- let Inst{31-28} = 0b1010;
- let Inst{27-25} = amode;
- let Inst{24-22} = type;
- let Inst{21} = un;
- let Inst{20-16} = Rs;
- let Inst{12-8} = Rt;
- let Inst{4-0} = Rd;
-}
-
-let mayStore = 1 in
-class ST_MISC_CACHEOP_SYS<dag outs, dag ins,
- string asmstr, list<dag> pattern = [],
- bits<3> amode, bits<3> type, bits<1> un>
- : SYSInst<outs, ins, asmstr, pattern, ""> {
-
- bits<5> Rs;
- bits<5> Rt;
- bits<5> Rd;
- let Inst{31-28} = 0b1010;
- let Inst{27-25} = amode;
- let Inst{24-22} = type;
- let Inst{21} = un;
- let Inst{20-16} = Rs;
- let Inst{12-8} = Rt;
- let Inst{4-0} = Rd;
-}
-
-
-let isSolo = 1, Rs = 0, Rt = 0, Rd = 0 in {
-def Y2_syncht: ST_MISC_CACHEOP <(outs), (ins),
- "syncht" , [], 0b100, 0b001, 0b0>;
-}
-
-let Rt = 0, Rd = 0 in {
-let isSoloAin1 = 1 in {
- def Y2_dccleana: ST_MISC_CACHEOP <(outs), (ins IntRegs:$Rs),
- "dccleana($Rs)", [], 0b000, 0b000, 0b0>;
- def Y2_dcinva: ST_MISC_CACHEOP <(outs), (ins IntRegs:$Rs),
- "dcinva($Rs)", [], 0b000, 0b000, 0b1>;
- def Y2_dccleaninva: ST_MISC_CACHEOP <(outs), (ins IntRegs:$Rs),
- "dccleaninva($Rs)", [], 0b000, 0b001, 0b0>;
- }
-}
-
-let isSoloAX = 1, hasSideEffects = 1, Rd = 0 in {
- def Y4_l2fetch: ST_MISC_CACHEOP_SYS<(outs), (ins IntRegs:$Rs, IntRegs:$Rt),
- "l2fetch($Rs, $Rt)", [], 0b011, 0b000, 0b0>;
- def Y5_l2fetch: ST_MISC_CACHEOP_SYS<(outs), (ins IntRegs:$Rs, DoubleRegs:$Rt),
- "l2fetch($Rs, $Rt)", [], 0b011, 0b010, 0b0>;
-}
-
-let hasSideEffects = 0, isSolo = 1 in
-class Y2_INVALIDATE_CACHE<string mnemonic, bit MajOp>
- : JRInst <
- (outs), (ins IntRegs:$Rs),
- #mnemonic#"($Rs)" > {
- bits<5> Rs;
-
- let IClass = 0b0101;
- let Inst{27-21} = 0b0110110;
- let Inst{20-16} = Rs;
- let Inst{13-12} = 0b00;
- let Inst{11} = MajOp;
- }
-// Instruction cache invalidate
-def Y2_icinva : Y2_INVALIDATE_CACHE<"icinva", 0b0>;
-
-// Zero an aligned 32-byte cacheline.
-let isSoloAin1 = 1 in
-def Y2_dczeroa: ST0Inst <(outs), (ins IntRegs:$Rs),
- "dczeroa($Rs)"> {
- bits<5> Rs;
- let IClass = 0b1010;
- let Inst{27-21} = 0b0000110;
- let Inst{13} = 0b0;
- let Inst{20-16} = Rs;
- }
-
-// Memory synchronization.
-let hasSideEffects = 0, isSolo = 1 in
-def Y2_isync: JRInst <(outs), (ins),
- "isync"> {
- let IClass = 0b0101;
- let Inst{27-16} = 0b011111000000;
- let Inst{13} = 0b0;
- let Inst{9-0} = 0b0000000010;
- }
-
-//===----------------------------------------------------------------------===//
-// System/User instructions.
-//===----------------------------------------------------------------------===//
-// traps and pause
-let hasSideEffects = 0, isSolo = 1 in
-class J2_MISC_TRAP_PAUSE<string mnemonic, bits<2> MajOp>
- : JRInst
- <(outs), (ins u8_0Imm:$u8),
- #mnemonic#"(#$u8)"> {
- bits<8> u8;
-
- let IClass = 0b0101;
- let Inst{27-24} = 0b0100;
- let Inst{23-22} = MajOp;
- let Inst{12-8} = u8{7-3};
- let Inst{4-2} = u8{2-0};
- }
-def J2_trap0 : J2_MISC_TRAP_PAUSE<"trap0", 0b00>;
-def J2_trap1 : J2_MISC_TRAP_PAUSE<"trap1", 0b10>;
-def J2_pause : J2_MISC_TRAP_PAUSE<"pause", 0b01>;
-
diff --git a/lib/Target/Hexagon/HexagonTargetMachine.cpp b/lib/Target/Hexagon/HexagonTargetMachine.cpp
index 132d12a66d46..06fc9195fa67 100644
--- a/lib/Target/Hexagon/HexagonTargetMachine.cpp
+++ b/lib/Target/Hexagon/HexagonTargetMachine.cpp
@@ -24,6 +24,7 @@
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/TargetRegistry.h"
#include "llvm/Transforms/Scalar.h"
+#include "llvm/Transforms/IPO/PassManagerBuilder.h"
using namespace llvm;
@@ -98,11 +99,6 @@ static cl::opt<bool> EnableVectorPrint("enable-hexagon-vector-print",
extern "C" int HexagonTargetMachineModule;
int HexagonTargetMachineModule = 0;
-extern "C" void LLVMInitializeHexagonTarget() {
- // Register the target.
- RegisterTargetMachine<HexagonTargetMachine> X(getTheHexagonTarget());
-}
-
static ScheduleDAGInstrs *createVLIWMachineSched(MachineSchedContext *C) {
return new VLIWMachineScheduler(C, make_unique<ConvergingVLIWScheduler>());
}
@@ -114,6 +110,8 @@ SchedCustomRegistry("hexagon", "Run Hexagon's custom scheduler",
namespace llvm {
extern char &HexagonExpandCondsetsID;
void initializeHexagonExpandCondsetsPass(PassRegistry&);
+ void initializeHexagonLoopIdiomRecognizePass(PassRegistry&);
+ Pass *createHexagonLoopIdiomPass();
FunctionPass *createHexagonBitSimplify();
FunctionPass *createHexagonBranchRelaxation();
@@ -150,6 +148,12 @@ static Reloc::Model getEffectiveRelocModel(Optional<Reloc::Model> RM) {
return *RM;
}
+extern "C" void LLVMInitializeHexagonTarget() {
+ // Register the target.
+ RegisterTargetMachine<HexagonTargetMachine> X(getTheHexagonTarget());
+ initializeHexagonLoopIdiomRecognizePass(*PassRegistry::getPassRegistry());
+}
+
HexagonTargetMachine::HexagonTargetMachine(const Target &T, const Triple &TT,
StringRef CPU, StringRef FS,
const TargetOptions &Options,
@@ -172,11 +176,11 @@ HexagonTargetMachine::HexagonTargetMachine(const Target &T, const Triple &TT,
const HexagonSubtarget *
HexagonTargetMachine::getSubtargetImpl(const Function &F) const {
- AttributeSet FnAttrs = F.getAttributes();
+ AttributeList FnAttrs = F.getAttributes();
Attribute CPUAttr =
- FnAttrs.getAttribute(AttributeSet::FunctionIndex, "target-cpu");
+ FnAttrs.getAttribute(AttributeList::FunctionIndex, "target-cpu");
Attribute FSAttr =
- FnAttrs.getAttribute(AttributeSet::FunctionIndex, "target-features");
+ FnAttrs.getAttribute(AttributeList::FunctionIndex, "target-features");
std::string CPU = !CPUAttr.hasAttribute(Attribute::None)
? CPUAttr.getValueAsString().str()
@@ -196,6 +200,14 @@ HexagonTargetMachine::getSubtargetImpl(const Function &F) const {
return I.get();
}
+void HexagonTargetMachine::adjustPassManager(PassManagerBuilder &PMB) {
+ PMB.addExtension(
+ PassManagerBuilder::EP_LateLoopOptimizations,
+ [&](const PassManagerBuilder &, legacy::PassManagerBase &PM) {
+ PM.add(createHexagonLoopIdiomPass());
+ });
+}
+
TargetIRAnalysis HexagonTargetMachine::getTargetIRAnalysis() {
return TargetIRAnalysis([this](const Function &F) {
return TargetTransformInfo(HexagonTTIImpl(this, F));
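
The extension registered in adjustPassManager() only fires when the front end routes pipeline construction through the builder; a minimal sketch of that flow, using standard legacy pass-manager calls (not part of this patch):

  llvm::legacy::PassManager MPM;
  llvm::PassManagerBuilder PMB;
  PMB.OptLevel = 2;
  TM->adjustPassManager(PMB);          // installs the EP_LateLoopOptimizations hook
  PMB.populateModulePassManager(MPM);
  MPM.run(*M);                         // Hexagon loop idiom pass now runs late in loop opts
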
diff --git a/lib/Target/Hexagon/HexagonTargetMachine.h b/lib/Target/Hexagon/HexagonTargetMachine.h
index 70835c0d4ac5..3d01929fbfb8 100644
--- a/lib/Target/Hexagon/HexagonTargetMachine.h
+++ b/lib/Target/Hexagon/HexagonTargetMachine.h
@@ -37,6 +37,7 @@ public:
static unsigned getModuleMatchQuality(const Module &M);
+ void adjustPassManager(PassManagerBuilder &PMB) override;
TargetPassConfig *createPassConfig(PassManagerBase &PM) override;
TargetIRAnalysis getTargetIRAnalysis() override;
diff --git a/lib/Target/Hexagon/HexagonVLIWPacketizer.cpp b/lib/Target/Hexagon/HexagonVLIWPacketizer.cpp
index 7b1247d815a5..3a789a5f7e0b 100644
--- a/lib/Target/Hexagon/HexagonVLIWPacketizer.cpp
+++ b/lib/Target/Hexagon/HexagonVLIWPacketizer.cpp
@@ -440,7 +440,7 @@ bool HexagonPacketizerList::promoteToDotNew(MachineInstr &MI,
}
bool HexagonPacketizerList::demoteToDotOld(MachineInstr &MI) {
- int NewOpcode = HII->getDotOldOp(MI.getOpcode());
+ int NewOpcode = HII->getDotOldOp(MI);
MI.setDesc(HII->get(NewOpcode));
return true;
}
@@ -720,6 +720,8 @@ bool HexagonPacketizerList::canPromoteToNewValueStore(const MachineInstr &MI,
// %R9<def> = ZXTH %R12, %D6<imp-use>, %R12<imp-def>
// S2_storerh_io %R8, 2, %R12<kill>; mem:ST2[%scevgep343]
for (auto &MO : PacketMI.operands()) {
+ if (MO.isRegMask() && MO.clobbersPhysReg(DepReg))
+ return false;
if (!MO.isReg() || !MO.isDef() || !MO.isImplicit())
continue;
unsigned R = MO.getReg();
@@ -759,9 +761,12 @@ bool HexagonPacketizerList::canPromoteToNewValue(const MachineInstr &MI,
}
static bool isImplicitDependency(const MachineInstr &I, unsigned DepReg) {
- for (auto &MO : I.operands())
+ for (auto &MO : I.operands()) {
+ if (MO.isRegMask() && MO.clobbersPhysReg(DepReg))
+ return true;
if (MO.isReg() && MO.isDef() && (MO.getReg() == DepReg) && MO.isImplicit())
return true;
+ }
return false;
}
@@ -1046,7 +1051,9 @@ static bool cannotCoexistAsymm(const MachineInstr &MI, const MachineInstr &MJ,
// XTYPE instructions. Since there is no convenient way of identifying fp
// XTYPE instructions, only allow grouping with ALU32 for now.
unsigned TJ = HII.getType(MJ);
- if (TJ != HexagonII::TypeALU32)
+ if (TJ != HexagonII::TypeALU32_2op &&
+ TJ != HexagonII::TypeALU32_3op &&
+ TJ != HexagonII::TypeALU32_ADDI)
return true;
break;
}
@@ -1171,6 +1178,36 @@ bool HexagonPacketizerList::hasControlDependence(const MachineInstr &I,
(J.isBranch() || J.isCall() || J.isBarrier());
}
+bool HexagonPacketizerList::hasRegMaskDependence(const MachineInstr &I,
+ const MachineInstr &J) {
+ // Adding I to a packet that has J.
+
+ // Regmasks are not reflected in the scheduling dependency graph, so
+ // we need to check them manually. This code assumes that regmasks only
+ // occur on calls, and the problematic case is when we add an instruction
+ // defining a register R to a packet that has a call that clobbers R via
+ // a regmask. Those cannot be packetized together, because the call will
+ // be executed last. That's also a reason why it is ok to add a call
+ // clobbering R to a packet that defines R.
+
+ // Look for regmasks in J.
+ for (const MachineOperand &OpJ : J.operands()) {
+ if (!OpJ.isRegMask())
+ continue;
+ assert((J.isCall() || HII->isTailCall(J)) && "Regmask on a non-call");
+ for (const MachineOperand &OpI : I.operands()) {
+ if (OpI.isReg()) {
+ if (OpJ.clobbersPhysReg(OpI.getReg()))
+ return true;
+ } else if (OpI.isRegMask()) {
+ // Both are regmasks. Assume that they intersect.
+ return true;
+ }
+ }
+ }
+ return false;
+}
+
bool HexagonPacketizerList::hasV4SpecificDependence(const MachineInstr &I,
const MachineInstr &J) {
bool SysI = isSystemInstr(I), SysJ = isSystemInstr(J);
@@ -1217,6 +1254,14 @@ bool HexagonPacketizerList::isLegalToPacketizeTogether(SUnit *SUI, SUnit *SUJ) {
if (Dependence)
return false;
+ // Regmasks are not accounted for in the scheduling graph, so we need
+ // to explicitly check for dependencies caused by them. They should only
+ // appear on calls, so it's not too pessimistic to reject all regmask
+ // dependencies.
+ Dependence = hasRegMaskDependence(I, J);
+ if (Dependence)
+ return false;
+
// V4 allows dual stores. It does not allow second store, if the first
// store is not in SLOT0. New value store, new value jump, dealloc_return
// and memop always take SLOT0. Arch spec 3.4.4.2.
@@ -1465,13 +1510,19 @@ bool HexagonPacketizerList::isLegalToPacketizeTogether(SUnit *SUI, SUnit *SUJ) {
// R0 = ... ; SUI
// Those cannot be packetized together, since the call will observe
// the effect of the assignment to R0.
- if (DepType == SDep::Anti && J.isCall()) {
+ if ((DepType == SDep::Anti || DepType == SDep::Output) && J.isCall()) {
// Check if I defines any volatile register. We should also check
// registers that the call may read, but these happen to be a
// subset of the volatile register set.
- for (const MCPhysReg *P = J.getDesc().ImplicitDefs; P && *P; ++P) {
- if (!I.modifiesRegister(*P, HRI))
+ for (const MachineOperand &Op : I.operands()) {
+ if (Op.isReg() && Op.isDef()) {
+ unsigned R = Op.getReg();
+ if (!J.readsRegister(R, HRI) && !J.modifiesRegister(R, HRI))
+ continue;
+ } else if (!Op.isRegMask()) {
+ // If I has a regmask assume dependency.
continue;
+ }
FoundSequentialDependence = true;
break;
}
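
The regmask test used throughout these hunks reduces to a single bit check per operand; a standalone illustration follows (the helper name is assumed, not in the patch):

  // A call's regmask operand clobbers R exactly when R's bit is clear in
  // the mask; MachineOperand::clobbersPhysReg performs that bit test.
  static bool callClobbersViaRegMask(const llvm::MachineInstr &Call,
                                     unsigned R) {
    for (const llvm::MachineOperand &MO : Call.operands())
      if (MO.isRegMask() && MO.clobbersPhysReg(R))
        return true;
    return false;
  }
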
diff --git a/lib/Target/Hexagon/HexagonVLIWPacketizer.h b/lib/Target/Hexagon/HexagonVLIWPacketizer.h
index b28b926ec300..3f28dc5b79ce 100644
--- a/lib/Target/Hexagon/HexagonVLIWPacketizer.h
+++ b/lib/Target/Hexagon/HexagonVLIWPacketizer.h
@@ -7,6 +7,9 @@
#include "llvm/CodeGen/ScheduleDAGInstrs.h"
namespace llvm {
+class HexagonInstrInfo;
+class HexagonRegisterInfo;
+
class HexagonPacketizerList : public VLIWPacketizerList {
// Vector of instructions assigned to the packet that has just been created.
std::vector<MachineInstr*> OldPacketMIs;
@@ -109,6 +112,7 @@ protected:
void reserveResourcesForConstExt();
bool hasDeadDependence(const MachineInstr &I, const MachineInstr &J);
bool hasControlDependence(const MachineInstr &I, const MachineInstr &J);
+ bool hasRegMaskDependence(const MachineInstr &I, const MachineInstr &J);
bool hasV4SpecificDependence(const MachineInstr &I, const MachineInstr &J);
bool producesStall(const MachineInstr &MI);
};
diff --git a/lib/Target/Hexagon/LLVMBuild.txt b/lib/Target/Hexagon/LLVMBuild.txt
index e0077acc7af0..7a27a8c5e10f 100644
--- a/lib/Target/Hexagon/LLVMBuild.txt
+++ b/lib/Target/Hexagon/LLVMBuild.txt
@@ -36,6 +36,7 @@ required_libraries =
HexagonAsmParser
HexagonDesc
HexagonInfo
+ IPO
MC
Scalar
SelectionDAG
diff --git a/lib/Target/Hexagon/MCTargetDesc/HexagonAsmBackend.cpp b/lib/Target/Hexagon/MCTargetDesc/HexagonAsmBackend.cpp
index c140bd1d7ee2..337af294eb86 100644
--- a/lib/Target/Hexagon/MCTargetDesc/HexagonAsmBackend.cpp
+++ b/lib/Target/Hexagon/MCTargetDesc/HexagonAsmBackend.cpp
@@ -9,10 +9,10 @@
#include "Hexagon.h"
#include "HexagonFixupKinds.h"
-#include "HexagonMCTargetDesc.h"
#include "MCTargetDesc/HexagonBaseInfo.h"
#include "MCTargetDesc/HexagonMCChecker.h"
#include "MCTargetDesc/HexagonMCCodeEmitter.h"
+#include "MCTargetDesc/HexagonMCTargetDesc.h"
#include "MCTargetDesc/HexagonMCInstrInfo.h"
#include "MCTargetDesc/HexagonMCShuffler.h"
#include "llvm/MC/MCAsmBackend.h"
@@ -59,9 +59,10 @@ class HexagonAsmBackend : public MCAsmBackend {
RF.getFixups() = Fixups;
}
public:
- HexagonAsmBackend(const Target &T, uint8_t OSABI, StringRef CPU) :
- OSABI(OSABI), MCII (T.createMCInstrInfo()), RelaxTarget(new MCInst *),
- Extender(nullptr) {}
+ HexagonAsmBackend(const Target &T, const Triple &TT, uint8_t OSABI,
+ StringRef CPU) :
+ OSABI(OSABI), CPU(CPU), MCII(T.createMCInstrInfo()),
+ RelaxTarget(new MCInst *), Extender(nullptr) {}
MCObjectWriter *createObjectWriter(raw_pwrite_stream &OS) const override {
return createHexagonELFObjectWriter(OS, OSABI, CPU);
@@ -88,101 +89,101 @@ public:
// This table *must* be in same the order of fixup_* kinds in
// HexagonFixupKinds.h.
//
- // namei offset bits flags
- { "fixup_Hexagon_B22_PCREL", 0, 32, MCFixupKindInfo::FKF_IsPCRel },
- { "fixup_Hexagon_B15_PCREL", 0, 32, MCFixupKindInfo::FKF_IsPCRel },
- { "fixup_Hexagon_B7_PCREL", 0, 32, MCFixupKindInfo::FKF_IsPCRel },
- { "fixup_Hexagon_LO16", 0, 32, 0 },
- { "fixup_Hexagon_HI16", 0, 32, 0 },
- { "fixup_Hexagon_32", 0, 32, 0 },
- { "fixup_Hexagon_16", 0, 32, 0 },
- { "fixup_Hexagon_8", 0, 32, 0 },
- { "fixup_Hexagon_GPREL16_0", 0, 32, 0 },
- { "fixup_Hexagon_GPREL16_1", 0, 32, 0 },
- { "fixup_Hexagon_GPREL16_2", 0, 32, 0 },
- { "fixup_Hexagon_GPREL16_3", 0, 32, 0 },
- { "fixup_Hexagon_HL16", 0, 32, 0 },
- { "fixup_Hexagon_B13_PCREL", 0, 32, MCFixupKindInfo::FKF_IsPCRel },
- { "fixup_Hexagon_B9_PCREL", 0, 32, MCFixupKindInfo::FKF_IsPCRel },
- { "fixup_Hexagon_B32_PCREL_X", 0, 32, MCFixupKindInfo::FKF_IsPCRel },
- { "fixup_Hexagon_32_6_X", 0, 32, 0 },
- { "fixup_Hexagon_B22_PCREL_X", 0, 32, MCFixupKindInfo::FKF_IsPCRel },
- { "fixup_Hexagon_B15_PCREL_X", 0, 32, MCFixupKindInfo::FKF_IsPCRel },
- { "fixup_Hexagon_B13_PCREL_X", 0, 32, MCFixupKindInfo::FKF_IsPCRel },
- { "fixup_Hexagon_B9_PCREL_X", 0, 32, MCFixupKindInfo::FKF_IsPCRel },
- { "fixup_Hexagon_B7_PCREL_X", 0, 32, MCFixupKindInfo::FKF_IsPCRel },
- { "fixup_Hexagon_16_X", 0, 32, 0 },
- { "fixup_Hexagon_12_X", 0, 32, 0 },
- { "fixup_Hexagon_11_X", 0, 32, 0 },
- { "fixup_Hexagon_10_X", 0, 32, 0 },
- { "fixup_Hexagon_9_X", 0, 32, 0 },
- { "fixup_Hexagon_8_X", 0, 32, 0 },
- { "fixup_Hexagon_7_X", 0, 32, 0 },
- { "fixup_Hexagon_6_X", 0, 32, 0 },
- { "fixup_Hexagon_32_PCREL", 0, 32, MCFixupKindInfo::FKF_IsPCRel },
- { "fixup_Hexagon_COPY", 0, 32, 0 },
- { "fixup_Hexagon_GLOB_DAT", 0, 32, 0 },
- { "fixup_Hexagon_JMP_SLOT", 0, 32, 0 },
- { "fixup_Hexagon_RELATIVE", 0, 32, 0 },
- { "fixup_Hexagon_PLT_B22_PCREL", 0, 32, MCFixupKindInfo::FKF_IsPCRel },
- { "fixup_Hexagon_GOTREL_LO16", 0, 32, 0 },
- { "fixup_Hexagon_GOTREL_HI16", 0, 32, 0 },
- { "fixup_Hexagon_GOTREL_32", 0, 32, 0 },
- { "fixup_Hexagon_GOT_LO16", 0, 32, 0 },
- { "fixup_Hexagon_GOT_HI16", 0, 32, 0 },
- { "fixup_Hexagon_GOT_32", 0, 32, 0 },
- { "fixup_Hexagon_GOT_16", 0, 32, 0 },
- { "fixup_Hexagon_DTPMOD_32", 0, 32, 0 },
- { "fixup_Hexagon_DTPREL_LO16", 0, 32, 0 },
- { "fixup_Hexagon_DTPREL_HI16", 0, 32, 0 },
- { "fixup_Hexagon_DTPREL_32", 0, 32, 0 },
- { "fixup_Hexagon_DTPREL_16", 0, 32, 0 },
- { "fixup_Hexagon_GD_PLT_B22_PCREL", 0, 32, MCFixupKindInfo::FKF_IsPCRel },
- { "fixup_Hexagon_LD_PLT_B22_PCREL", 0, 32, MCFixupKindInfo::FKF_IsPCRel },
- { "fixup_Hexagon_GD_GOT_LO16", 0, 32, 0 },
- { "fixup_Hexagon_GD_GOT_HI16", 0, 32, 0 },
- { "fixup_Hexagon_GD_GOT_32", 0, 32, 0 },
- { "fixup_Hexagon_GD_GOT_16", 0, 32, 0 },
- { "fixup_Hexagon_LD_GOT_LO16", 0, 32, 0 },
- { "fixup_Hexagon_LD_GOT_HI16", 0, 32, 0 },
- { "fixup_Hexagon_LD_GOT_32", 0, 32, 0 },
- { "fixup_Hexagon_LD_GOT_16", 0, 32, 0 },
- { "fixup_Hexagon_IE_LO16", 0, 32, 0 },
- { "fixup_Hexagon_IE_HI16", 0, 32, 0 },
- { "fixup_Hexagon_IE_32", 0, 32, 0 },
- { "fixup_Hexagon_IE_16", 0, 32, 0 },
- { "fixup_Hexagon_IE_GOT_LO16", 0, 32, 0 },
- { "fixup_Hexagon_IE_GOT_HI16", 0, 32, 0 },
- { "fixup_Hexagon_IE_GOT_32", 0, 32, 0 },
- { "fixup_Hexagon_IE_GOT_16", 0, 32, 0 },
- { "fixup_Hexagon_TPREL_LO16", 0, 32, 0 },
- { "fixup_Hexagon_TPREL_HI16", 0, 32, 0 },
- { "fixup_Hexagon_TPREL_32", 0, 32, 0 },
- { "fixup_Hexagon_TPREL_16", 0, 32, 0 },
- { "fixup_Hexagon_6_PCREL_X", 0, 32, MCFixupKindInfo::FKF_IsPCRel },
- { "fixup_Hexagon_GOTREL_32_6_X", 0, 32, 0 },
- { "fixup_Hexagon_GOTREL_16_X", 0, 32, 0 },
- { "fixup_Hexagon_GOTREL_11_X", 0, 32, 0 },
- { "fixup_Hexagon_GOT_32_6_X", 0, 32, 0 },
- { "fixup_Hexagon_GOT_16_X", 0, 32, 0 },
- { "fixup_Hexagon_GOT_11_X", 0, 32, 0 },
- { "fixup_Hexagon_DTPREL_32_6_X", 0, 32, 0 },
- { "fixup_Hexagon_DTPREL_16_X", 0, 32, 0 },
- { "fixup_Hexagon_DTPREL_11_X", 0, 32, 0 },
- { "fixup_Hexagon_GD_GOT_32_6_X", 0, 32, 0 },
- { "fixup_Hexagon_GD_GOT_16_X", 0, 32, 0 },
- { "fixup_Hexagon_GD_GOT_11_X", 0, 32, 0 },
- { "fixup_Hexagon_LD_GOT_32_6_X", 0, 32, 0 },
- { "fixup_Hexagon_LD_GOT_16_X", 0, 32, 0 },
- { "fixup_Hexagon_LD_GOT_11_X", 0, 32, 0 },
- { "fixup_Hexagon_IE_32_6_X", 0, 32, 0 },
- { "fixup_Hexagon_IE_16_X", 0, 32, 0 },
- { "fixup_Hexagon_IE_GOT_32_6_X", 0, 32, 0 },
- { "fixup_Hexagon_IE_GOT_16_X", 0, 32, 0 },
- { "fixup_Hexagon_IE_GOT_11_X", 0, 32, 0 },
- { "fixup_Hexagon_TPREL_32_6_X", 0, 32, 0 },
- { "fixup_Hexagon_TPREL_16_X", 0, 32, 0 },
- { "fixup_Hexagon_TPREL_11_X", 0, 32, 0 }
+ // name offset bits flags
+ { "fixup_Hexagon_B22_PCREL", 0, 32, MCFixupKindInfo::FKF_IsPCRel },
+ { "fixup_Hexagon_B15_PCREL", 0, 32, MCFixupKindInfo::FKF_IsPCRel },
+ { "fixup_Hexagon_B7_PCREL", 0, 32, MCFixupKindInfo::FKF_IsPCRel },
+ { "fixup_Hexagon_LO16", 0, 32, 0 },
+ { "fixup_Hexagon_HI16", 0, 32, 0 },
+ { "fixup_Hexagon_32", 0, 32, 0 },
+ { "fixup_Hexagon_16", 0, 32, 0 },
+ { "fixup_Hexagon_8", 0, 32, 0 },
+ { "fixup_Hexagon_GPREL16_0", 0, 32, 0 },
+ { "fixup_Hexagon_GPREL16_1", 0, 32, 0 },
+ { "fixup_Hexagon_GPREL16_2", 0, 32, 0 },
+ { "fixup_Hexagon_GPREL16_3", 0, 32, 0 },
+ { "fixup_Hexagon_HL16", 0, 32, 0 },
+ { "fixup_Hexagon_B13_PCREL", 0, 32, MCFixupKindInfo::FKF_IsPCRel },
+ { "fixup_Hexagon_B9_PCREL", 0, 32, MCFixupKindInfo::FKF_IsPCRel },
+ { "fixup_Hexagon_B32_PCREL_X", 0, 32, MCFixupKindInfo::FKF_IsPCRel },
+ { "fixup_Hexagon_32_6_X", 0, 32, 0 },
+ { "fixup_Hexagon_B22_PCREL_X", 0, 32, MCFixupKindInfo::FKF_IsPCRel },
+ { "fixup_Hexagon_B15_PCREL_X", 0, 32, MCFixupKindInfo::FKF_IsPCRel },
+ { "fixup_Hexagon_B13_PCREL_X", 0, 32, MCFixupKindInfo::FKF_IsPCRel },
+ { "fixup_Hexagon_B9_PCREL_X", 0, 32, MCFixupKindInfo::FKF_IsPCRel },
+ { "fixup_Hexagon_B7_PCREL_X", 0, 32, MCFixupKindInfo::FKF_IsPCRel },
+ { "fixup_Hexagon_16_X", 0, 32, 0 },
+ { "fixup_Hexagon_12_X", 0, 32, 0 },
+ { "fixup_Hexagon_11_X", 0, 32, 0 },
+ { "fixup_Hexagon_10_X", 0, 32, 0 },
+ { "fixup_Hexagon_9_X", 0, 32, 0 },
+ { "fixup_Hexagon_8_X", 0, 32, 0 },
+ { "fixup_Hexagon_7_X", 0, 32, 0 },
+ { "fixup_Hexagon_6_X", 0, 32, 0 },
+ { "fixup_Hexagon_32_PCREL", 0, 32, MCFixupKindInfo::FKF_IsPCRel },
+ { "fixup_Hexagon_COPY", 0, 32, 0 },
+ { "fixup_Hexagon_GLOB_DAT", 0, 32, 0 },
+ { "fixup_Hexagon_JMP_SLOT", 0, 32, 0 },
+ { "fixup_Hexagon_RELATIVE", 0, 32, 0 },
+ { "fixup_Hexagon_PLT_B22_PCREL", 0, 32, MCFixupKindInfo::FKF_IsPCRel },
+ { "fixup_Hexagon_GOTREL_LO16", 0, 32, 0 },
+ { "fixup_Hexagon_GOTREL_HI16", 0, 32, 0 },
+ { "fixup_Hexagon_GOTREL_32", 0, 32, 0 },
+ { "fixup_Hexagon_GOT_LO16", 0, 32, 0 },
+ { "fixup_Hexagon_GOT_HI16", 0, 32, 0 },
+ { "fixup_Hexagon_GOT_32", 0, 32, 0 },
+ { "fixup_Hexagon_GOT_16", 0, 32, 0 },
+ { "fixup_Hexagon_DTPMOD_32", 0, 32, 0 },
+ { "fixup_Hexagon_DTPREL_LO16", 0, 32, 0 },
+ { "fixup_Hexagon_DTPREL_HI16", 0, 32, 0 },
+ { "fixup_Hexagon_DTPREL_32", 0, 32, 0 },
+ { "fixup_Hexagon_DTPREL_16", 0, 32, 0 },
+ { "fixup_Hexagon_GD_PLT_B22_PCREL",0, 32, MCFixupKindInfo::FKF_IsPCRel },
+ { "fixup_Hexagon_LD_PLT_B22_PCREL",0, 32, MCFixupKindInfo::FKF_IsPCRel },
+ { "fixup_Hexagon_GD_GOT_LO16", 0, 32, 0 },
+ { "fixup_Hexagon_GD_GOT_HI16", 0, 32, 0 },
+ { "fixup_Hexagon_GD_GOT_32", 0, 32, 0 },
+ { "fixup_Hexagon_GD_GOT_16", 0, 32, 0 },
+ { "fixup_Hexagon_LD_GOT_LO16", 0, 32, 0 },
+ { "fixup_Hexagon_LD_GOT_HI16", 0, 32, 0 },
+ { "fixup_Hexagon_LD_GOT_32", 0, 32, 0 },
+ { "fixup_Hexagon_LD_GOT_16", 0, 32, 0 },
+ { "fixup_Hexagon_IE_LO16", 0, 32, 0 },
+ { "fixup_Hexagon_IE_HI16", 0, 32, 0 },
+ { "fixup_Hexagon_IE_32", 0, 32, 0 },
+ { "fixup_Hexagon_IE_16", 0, 32, 0 },
+ { "fixup_Hexagon_IE_GOT_LO16", 0, 32, 0 },
+ { "fixup_Hexagon_IE_GOT_HI16", 0, 32, 0 },
+ { "fixup_Hexagon_IE_GOT_32", 0, 32, 0 },
+ { "fixup_Hexagon_IE_GOT_16", 0, 32, 0 },
+ { "fixup_Hexagon_TPREL_LO16", 0, 32, 0 },
+ { "fixup_Hexagon_TPREL_HI16", 0, 32, 0 },
+ { "fixup_Hexagon_TPREL_32", 0, 32, 0 },
+ { "fixup_Hexagon_TPREL_16", 0, 32, 0 },
+ { "fixup_Hexagon_6_PCREL_X", 0, 32, MCFixupKindInfo::FKF_IsPCRel },
+ { "fixup_Hexagon_GOTREL_32_6_X", 0, 32, 0 },
+ { "fixup_Hexagon_GOTREL_16_X", 0, 32, 0 },
+ { "fixup_Hexagon_GOTREL_11_X", 0, 32, 0 },
+ { "fixup_Hexagon_GOT_32_6_X", 0, 32, 0 },
+ { "fixup_Hexagon_GOT_16_X", 0, 32, 0 },
+ { "fixup_Hexagon_GOT_11_X", 0, 32, 0 },
+ { "fixup_Hexagon_DTPREL_32_6_X", 0, 32, 0 },
+ { "fixup_Hexagon_DTPREL_16_X", 0, 32, 0 },
+ { "fixup_Hexagon_DTPREL_11_X", 0, 32, 0 },
+ { "fixup_Hexagon_GD_GOT_32_6_X", 0, 32, 0 },
+ { "fixup_Hexagon_GD_GOT_16_X", 0, 32, 0 },
+ { "fixup_Hexagon_GD_GOT_11_X", 0, 32, 0 },
+ { "fixup_Hexagon_LD_GOT_32_6_X", 0, 32, 0 },
+ { "fixup_Hexagon_LD_GOT_16_X", 0, 32, 0 },
+ { "fixup_Hexagon_LD_GOT_11_X", 0, 32, 0 },
+ { "fixup_Hexagon_IE_32_6_X", 0, 32, 0 },
+ { "fixup_Hexagon_IE_16_X", 0, 32, 0 },
+ { "fixup_Hexagon_IE_GOT_32_6_X", 0, 32, 0 },
+ { "fixup_Hexagon_IE_GOT_16_X", 0, 32, 0 },
+ { "fixup_Hexagon_IE_GOT_11_X", 0, 32, 0 },
+ { "fixup_Hexagon_TPREL_32_6_X", 0, 32, 0 },
+ { "fixup_Hexagon_TPREL_16_X", 0, 32, 0 },
+ { "fixup_Hexagon_TPREL_11_X", 0, 32, 0 }
};
if (Kind < FirstTargetFixupKind)
@@ -401,7 +402,8 @@ public:
/// data fragment, at the offset specified by the fixup and following the
/// fixup kind as appropriate.
void applyFixup(const MCFixup &Fixup, char *Data, unsigned DataSize,
- uint64_t FixupValue, bool IsPCRel) const override {
+ uint64_t FixupValue, bool IsPCRel,
+ MCContext &Ctx) const override {
// When FixupValue is 0 the relocation is external and there
// is nothing for us to do.
@@ -524,10 +526,9 @@ public:
bool Relaxable = false;
// Branches and loop-setup insns are handled as necessary by relaxation.
if (llvm::HexagonMCInstrInfo::getType(*MCII, HMI) == HexagonII::TypeJ ||
- (llvm::HexagonMCInstrInfo::getType(*MCII, HMI) ==
- HexagonII::TypeCOMPOUND &&
+ (llvm::HexagonMCInstrInfo::getType(*MCII, HMI) == HexagonII::TypeCJ &&
MCID.isBranch()) ||
- (llvm::HexagonMCInstrInfo::getType(*MCII, HMI) == HexagonII::TypeNV &&
+ (llvm::HexagonMCInstrInfo::getType(*MCII, HMI) == HexagonII::TypeNCJ &&
MCID.isBranch()) ||
(llvm::HexagonMCInstrInfo::getType(*MCII, HMI) == HexagonII::TypeCR &&
HMI.getOpcode() != Hexagon::C4_addipc))
@@ -724,7 +725,8 @@ public:
Size = 0;
}
}
- bool Error = HexagonMCShuffle(*MCII, RF.getSubtargetInfo(), Inst);
+ bool Error = HexagonMCShuffle(true, *MCII, RF.getSubtargetInfo(),
+ Inst);
//assert(!Error);
(void)Error;
ReplaceInstruction(Asm.getEmitter(), RF, Inst);
@@ -739,15 +741,17 @@ public:
}
}
}
-};
-} // end anonymous namespace
+}; // class HexagonAsmBackend
-namespace llvm {
-MCAsmBackend *createHexagonAsmBackend(Target const &T,
+} // namespace
+
+// MCAsmBackend
+MCAsmBackend *llvm::createHexagonAsmBackend(Target const &T,
MCRegisterInfo const & /*MRI*/,
const Triple &TT, StringRef CPU,
const MCTargetOptions &Options) {
uint8_t OSABI = MCELFObjectTargetWriter::getOSABI(TT.getOS());
- return new HexagonAsmBackend(T, OSABI, CPU);
-}
+
+ StringRef CPUString = Hexagon_MC::selectHexagonCPU(TT, CPU);
+ return new HexagonAsmBackend(T, TT, OSABI, CPUString);
}
diff --git a/lib/Target/Hexagon/MCTargetDesc/HexagonBaseInfo.h b/lib/Target/Hexagon/MCTargetDesc/HexagonBaseInfo.h
index 4292f6b3faa4..9c80312b790d 100644
--- a/lib/Target/Hexagon/MCTargetDesc/HexagonBaseInfo.h
+++ b/lib/Target/Hexagon/MCTargetDesc/HexagonBaseInfo.h
@@ -17,6 +17,7 @@
#ifndef LLVM_LIB_TARGET_HEXAGON_MCTARGETDESC_HEXAGONBASEINFO_H
#define LLVM_LIB_TARGET_HEXAGON_MCTARGETDESC_HEXAGONBASEINFO_H
+#include "HexagonDepITypes.h"
#include "HexagonMCTargetDesc.h"
#include "llvm/Support/ErrorHandling.h"
#include <stdint.h>
@@ -27,57 +28,14 @@ namespace llvm {
/// instruction info tracks.
///
namespace HexagonII {
- // *** The code below must match HexagonInstrFormat*.td *** //
-
- // Insn types.
- // *** Must match HexagonInstrFormat*.td ***
- enum Type {
- TypePSEUDO = 0,
- TypeALU32 = 1,
- TypeCR = 2,
- TypeJR = 3,
- TypeJ = 4,
- TypeLD = 5,
- TypeST = 6,
- TypeSYSTEM = 7,
- TypeXTYPE = 8,
- TypeV4LDST = 9,
- TypeNV = 10,
- TypeDUPLEX = 11,
- TypeCOMPOUND = 12,
- TypeCVI_FIRST = 13,
- TypeCVI_VA = TypeCVI_FIRST,
- TypeCVI_VA_DV = 14,
- TypeCVI_VX = 15,
- TypeCVI_VX_DV = 16,
- TypeCVI_VP = 17,
- TypeCVI_VP_VS = 18,
- TypeCVI_VS = 19,
- TypeCVI_VINLANESAT= 20,
- TypeCVI_VM_LD = 21,
- TypeCVI_VM_TMP_LD = 22,
- TypeCVI_VM_CUR_LD = 23,
- TypeCVI_VM_VP_LDU = 24,
- TypeCVI_VM_ST = 25,
- TypeCVI_VM_NEW_ST = 26,
- TypeCVI_VM_STU = 27,
- TypeCVI_HIST = 28,
- TypeCVI_LAST = TypeCVI_HIST,
- TypePREFIX = 30, // Such as extenders.
- TypeENDLOOP = 31 // Such as end of a HW loop.
- };
+ unsigned const TypeCVI_FIRST = TypeCVI_HIST;
+ unsigned const TypeCVI_LAST = TypeCVI_VX_DV;
enum SubTarget {
- HasV2SubT = 0xf,
- HasV2SubTOnly = 0x1,
- NoV2SubT = 0x0,
- HasV3SubT = 0xe,
- HasV3SubTOnly = 0x2,
- NoV3SubT = 0x1,
- HasV4SubT = 0xc,
- NoV4SubT = 0x3,
- HasV5SubT = 0x8,
- NoV5SubT = 0x7
+ HasV4SubT = 0x3f,
+ HasV5SubT = 0x3e,
+ HasV55SubT = 0x3c,
+ HasV60SubT = 0x38,
};
enum AddrMode {
@@ -107,102 +65,101 @@ namespace HexagonII {
enum {
// This 5-bit field describes the insn type.
TypePos = 0,
- TypeMask = 0x1f,
+ TypeMask = 0x3f,
// Solo instructions.
- SoloPos = 5,
+ SoloPos = 6,
SoloMask = 0x1,
// Packed only with A or X-type instructions.
- SoloAXPos = 6,
+ SoloAXPos = 7,
SoloAXMask = 0x1,
// Only A-type instruction in first slot or nothing.
- SoloAin1Pos = 7,
+ SoloAin1Pos = 8,
SoloAin1Mask = 0x1,
// Predicated instructions.
- PredicatedPos = 8,
+ PredicatedPos = 9,
PredicatedMask = 0x1,
- PredicatedFalsePos = 9,
+ PredicatedFalsePos = 10,
PredicatedFalseMask = 0x1,
- PredicatedNewPos = 10,
+ PredicatedNewPos = 11,
PredicatedNewMask = 0x1,
- PredicateLatePos = 11,
+ PredicateLatePos = 12,
PredicateLateMask = 0x1,
// New-Value consumer instructions.
- NewValuePos = 12,
+ NewValuePos = 13,
NewValueMask = 0x1,
// New-Value producer instructions.
- hasNewValuePos = 13,
+ hasNewValuePos = 14,
hasNewValueMask = 0x1,
// Which operand consumes or produces a new value.
- NewValueOpPos = 14,
+ NewValueOpPos = 15,
NewValueOpMask = 0x7,
// Stores that can become new-value stores.
- mayNVStorePos = 17,
+ mayNVStorePos = 18,
mayNVStoreMask = 0x1,
// New-value store instructions.
- NVStorePos = 18,
+ NVStorePos = 19,
NVStoreMask = 0x1,
// Loads that can become current-value loads.
- mayCVLoadPos = 19,
+ mayCVLoadPos = 20,
mayCVLoadMask = 0x1,
// Current-value load instructions.
- CVLoadPos = 20,
+ CVLoadPos = 21,
CVLoadMask = 0x1,
// Extendable insns.
- ExtendablePos = 21,
+ ExtendablePos = 22,
ExtendableMask = 0x1,
// Insns must be extended.
- ExtendedPos = 22,
+ ExtendedPos = 23,
ExtendedMask = 0x1,
// Which operand may be extended.
- ExtendableOpPos = 23,
+ ExtendableOpPos = 24,
ExtendableOpMask = 0x7,
// Signed or unsigned range.
- ExtentSignedPos = 26,
+ ExtentSignedPos = 27,
ExtentSignedMask = 0x1,
// Number of bits of range before extending operand.
- ExtentBitsPos = 27,
+ ExtentBitsPos = 28,
ExtentBitsMask = 0x1f,
// Alignment power-of-two before extending operand.
- ExtentAlignPos = 32,
+ ExtentAlignPos = 33,
ExtentAlignMask = 0x3,
// Valid subtargets
- validSubTargetPos = 34,
- validSubTargetMask = 0xf,
+ validSubTargetPos = 35,
+ validSubTargetMask = 0x3f,
// Addressing mode for load/store instructions.
- AddrModePos = 40,
+ AddrModePos = 41,
AddrModeMask = 0x7,
// Access size for load/store instructions.
- MemAccessSizePos = 43,
+ MemAccessSizePos = 44,
MemAccesSizeMask = 0xf,
// Branch predicted taken.
- TakenPos = 47,
+ TakenPos = 48,
TakenMask = 0x1,
// Floating-point instructions.
- FPPos = 48,
+ FPPos = 49,
FPMask = 0x1,
// New-Value producer-2 instructions.
- hasNewValuePos2 = 50,
+ hasNewValuePos2 = 51,
hasNewValueMask2 = 0x1,
-
// Which operand consumes or produces a new value.
- NewValueOpPos2 = 51,
+ NewValueOpPos2 = 52,
NewValueOpMask2 = 0x7,
// Accumulator instructions.
- AccumulatorPos = 54,
+ AccumulatorPos = 55,
AccumulatorMask = 0x1,
// Complex XU, prevent xu competition by preferring slot3
- PrefersSlot3Pos = 55,
+ PrefersSlot3Pos = 56,
PrefersSlot3Mask = 0x1,
CofMax1Pos = 60,
@@ -217,8 +174,6 @@ namespace HexagonII {
// Hexagon Specific MachineOperand flags.
MO_NO_FLAG,
- HMOTF_ConstExtended = 1,
-
/// MO_PCREL - On a symbol operand, indicates a PC-relative relocation
/// Used for computing a global address for PIC compilations
MO_PCREL,
@@ -250,7 +205,13 @@ namespace HexagonII {
// MO_TPREL - indicates relocation for TLS
// local Executable method
- MO_TPREL
+ MO_TPREL,
+
+ // HMOTF_ConstExtended
+ // Addendum to the above; indicates a const-extended operand.
+ // Can be used as a mask.
+ HMOTF_ConstExtended = 0x80
+
};
// Hexagon Sub-instruction classes.
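
Widening TypeMask to six bits is what pushes every later field up by one position. All of the Pos/Mask pairs above follow one extraction pattern; a sketch of the accessors that consume them (illustrative names; the real accessors live in HexagonMCInstrInfo):

  static inline unsigned getInsnType(uint64_t TSFlags) {
    return (TSFlags >> llvm::HexagonII::TypePos) & llvm::HexagonII::TypeMask;
  }
  static inline bool isSoloInsn(uint64_t TSFlags) {
    return (TSFlags >> llvm::HexagonII::SoloPos) & llvm::HexagonII::SoloMask;
  }
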
diff --git a/lib/Target/Hexagon/MCTargetDesc/HexagonInstPrinter.cpp b/lib/Target/Hexagon/MCTargetDesc/HexagonInstPrinter.cpp
index 42fcc5a6aa89..dd790fd41257 100644
--- a/lib/Target/Hexagon/MCTargetDesc/HexagonInstPrinter.cpp
+++ b/lib/Target/Hexagon/MCTargetDesc/HexagonInstPrinter.cpp
@@ -125,46 +125,6 @@ void HexagonInstPrinter::printNOneImmOperand(MCInst const *MI, unsigned OpNo,
O << -1;
}
-void HexagonInstPrinter::prints3_6ImmOperand(MCInst const *MI, unsigned OpNo,
- raw_ostream &O) const {
- int64_t Imm;
- bool Success = MI->getOperand(OpNo).getExpr()->evaluateAsAbsolute(Imm);
- Imm = SignExtend64<9>(Imm);
- assert(Success); (void)Success;
- assert(((Imm & 0x3f) == 0) && "Lower 6 bits must be ZERO.");
- O << formatImm(Imm/64);
-}
-
-void HexagonInstPrinter::prints3_7ImmOperand(MCInst const *MI, unsigned OpNo,
- raw_ostream &O) const {
- int64_t Imm;
- bool Success = MI->getOperand(OpNo).getExpr()->evaluateAsAbsolute(Imm);
- Imm = SignExtend64<10>(Imm);
- assert(Success); (void)Success;
- assert(((Imm & 0x7f) == 0) && "Lower 7 bits must be ZERO.");
- O << formatImm(Imm/128);
-}
-
-void HexagonInstPrinter::prints4_6ImmOperand(MCInst const *MI, unsigned OpNo,
- raw_ostream &O) const {
- int64_t Imm;
- bool Success = MI->getOperand(OpNo).getExpr()->evaluateAsAbsolute(Imm);
- Imm = SignExtend64<10>(Imm);
- assert(Success); (void)Success;
- assert(((Imm & 0x3f) == 0) && "Lower 6 bits must be ZERO.");
- O << formatImm(Imm/64);
-}
-
-void HexagonInstPrinter::prints4_7ImmOperand(MCInst const *MI, unsigned OpNo,
- raw_ostream &O) const {
- int64_t Imm;
- bool Success = MI->getOperand(OpNo).getExpr()->evaluateAsAbsolute(Imm);
- Imm = SignExtend64<11>(Imm);
- assert(Success); (void)Success;
- assert(((Imm & 0x7f) == 0) && "Lower 7 bits must be ZERO.");
- O << formatImm(Imm/128);
-}
-
void HexagonInstPrinter::printGlobalOperand(MCInst const *MI, unsigned OpNo,
raw_ostream &O) const {
printOperand(MI, OpNo, O);
diff --git a/lib/Target/Hexagon/MCTargetDesc/HexagonInstPrinter.h b/lib/Target/Hexagon/MCTargetDesc/HexagonInstPrinter.h
index 5f421184b20a..ac8e391905e0 100644
--- a/lib/Target/Hexagon/MCTargetDesc/HexagonInstPrinter.h
+++ b/lib/Target/Hexagon/MCTargetDesc/HexagonInstPrinter.h
@@ -44,14 +44,6 @@ public:
raw_ostream &O) const;
void printNOneImmOperand(MCInst const *MI, unsigned OpNo,
raw_ostream &O) const;
- void prints3_6ImmOperand(MCInst const *MI, unsigned OpNo,
- raw_ostream &O) const;
- void prints3_7ImmOperand(MCInst const *MI, unsigned OpNo,
- raw_ostream &O) const;
- void prints4_6ImmOperand(MCInst const *MI, unsigned OpNo,
- raw_ostream &O) const;
- void prints4_7ImmOperand(MCInst const *MI, unsigned OpNo,
- raw_ostream &O) const;
void printBranchOperand(MCInst const *MI, unsigned OpNo,
raw_ostream &O) const;
void printCallOperand(MCInst const *MI, unsigned OpNo, raw_ostream &O) const;
diff --git a/lib/Target/Hexagon/MCTargetDesc/HexagonMCAsmInfo.cpp b/lib/Target/Hexagon/MCTargetDesc/HexagonMCAsmInfo.cpp
index c619c36164cf..446b3b2ce668 100644
--- a/lib/Target/Hexagon/MCTargetDesc/HexagonMCAsmInfo.cpp
+++ b/lib/Target/Hexagon/MCTargetDesc/HexagonMCAsmInfo.cpp
@@ -23,6 +23,7 @@ HexagonMCAsmInfo::HexagonMCAsmInfo(const Triple &TT) {
Data32bitsDirective = "\t.word\t";
Data64bitsDirective = nullptr; // .xword is only supported by V9.
CommentString = "//";
+ SupportsDebugInformation = true;
LCOMMDirectiveAlignmentType = LCOMM::ByteAlignment;
InlineAsmStart = "# InlineAsm Start";
@@ -30,8 +31,8 @@ HexagonMCAsmInfo::HexagonMCAsmInfo(const Triple &TT) {
ZeroDirective = "\t.space\t";
AscizDirective = "\t.string\t";
- SupportsDebugInformation = true;
MinInstAlignment = 4;
UsesELFSectionDirectiveForBSS = true;
ExceptionsType = ExceptionHandling::DwarfCFI;
+ UseLogicalShr = false;
}
diff --git a/lib/Target/Hexagon/MCTargetDesc/HexagonMCChecker.cpp b/lib/Target/Hexagon/MCTargetDesc/HexagonMCChecker.cpp
index 07c9ad96a0d7..62b21c419f30 100644
--- a/lib/Target/Hexagon/MCTargetDesc/HexagonMCChecker.cpp
+++ b/lib/Target/Hexagon/MCTargetDesc/HexagonMCChecker.cpp
@@ -47,12 +47,40 @@ void HexagonMCChecker::init() {
if (HexagonMCInstrInfo::isBundle(MCB))
// Unfurl a bundle.
for (auto const&I : HexagonMCInstrInfo::bundleInstructions(MCB)) {
- init(*I.getInst());
+ MCInst const &Inst = *I.getInst();
+ if (HexagonMCInstrInfo::isDuplex(MCII, Inst)) {
+ init(*Inst.getOperand(0).getInst());
+ init(*Inst.getOperand(1).getInst());
+ }
+ else
+ init(Inst);
}
else
init(MCB);
}
+void HexagonMCChecker::initReg(MCInst const &MCI, unsigned R, unsigned &PredReg,
+ bool &isTrue) {
+ if (HexagonMCInstrInfo::isPredicated(MCII, MCI) && isPredicateRegister(R)) {
+ // Note an used predicate register.
+ PredReg = R;
+ isTrue = HexagonMCInstrInfo::isPredicatedTrue(MCII, MCI);
+
+ // Note use of new predicate register.
+ if (HexagonMCInstrInfo::isPredicatedNew(MCII, MCI))
+ NewPreds.insert(PredReg);
+ }
+ else
+ // Note register use. Super-registers are not tracked directly;
+ // only their component registers are.
+ for(MCRegAliasIterator SRI(R, &RI, !MCSubRegIterator(R, &RI).isValid());
+ SRI.isValid();
+ ++SRI)
+ if (!MCSubRegIterator(*SRI, &RI).isValid())
+ // Skip super-registers used indirectly.
+ Uses.insert(*SRI);
+}
+
void HexagonMCChecker::init(MCInst const& MCI) {
const MCInstrDesc& MCID = HexagonMCInstrInfo::getDesc(MCII, MCI);
unsigned PredReg = Hexagon::NoRegister;
@@ -60,28 +88,10 @@ void HexagonMCChecker::init(MCInst const& MCI) {
// Get used registers.
for (unsigned i = MCID.getNumDefs(); i < MCID.getNumOperands(); ++i)
- if (MCI.getOperand(i).isReg()) {
- unsigned R = MCI.getOperand(i).getReg();
-
- if (HexagonMCInstrInfo::isPredicated(MCII, MCI) && isPredicateRegister(R)) {
- // Note an used predicate register.
- PredReg = R;
- isTrue = HexagonMCInstrInfo::isPredicatedTrue(MCII, MCI);
-
- // Note use of new predicate register.
- if (HexagonMCInstrInfo::isPredicatedNew(MCII, MCI))
- NewPreds.insert(PredReg);
- }
- else
- // Note register use. Super-registers are not tracked directly,
- // but their components.
- for(MCRegAliasIterator SRI(R, &RI, !MCSubRegIterator(R, &RI).isValid());
- SRI.isValid();
- ++SRI)
- if (!MCSubRegIterator(*SRI, &RI).isValid())
- // Skip super-registers used indirectly.
- Uses.insert(*SRI);
- }
+ if (MCI.getOperand(i).isReg())
+ initReg(MCI, MCI.getOperand(i).getReg(), PredReg, isTrue);
+ for (unsigned i = 0; i < MCID.getNumImplicitUses(); ++i)
+ initReg(MCI, MCID.getImplicitUses()[i], PredReg, isTrue);
// Get implicit register definitions.
if (const MCPhysReg *ImpDef = MCID.getImplicitDefs())
@@ -216,9 +226,11 @@ void HexagonMCChecker::init(MCInst const& MCI) {
if (!MCSubRegIterator(N, &RI).isValid()) {
// Super-registers cannot use new values.
if (MCID.isBranch())
- NewUses[N] = NewSense::Jmp(llvm::HexagonMCInstrInfo::getType(MCII, MCI) == HexagonII::TypeNV);
+ NewUses[N] = NewSense::Jmp(
+ llvm::HexagonMCInstrInfo::getType(MCII, MCI) == HexagonII::TypeNCJ);
else
- NewUses[N] = NewSense::Use(PredReg, HexagonMCInstrInfo::isPredicatedTrue(MCII, MCI));
+ NewUses[N] = NewSense::Use(
+ PredReg, HexagonMCInstrInfo::isPredicatedTrue(MCII, MCI));
}
}
}
@@ -230,14 +242,18 @@ HexagonMCChecker::HexagonMCChecker(MCInstrInfo const &MCII, MCSubtargetInfo cons
init();
}
-bool HexagonMCChecker::check() {
+bool HexagonMCChecker::check(bool FullCheck) {
bool chkB = checkBranches();
bool chkP = checkPredicates();
bool chkNV = checkNewValues();
bool chkR = checkRegisters();
bool chkS = checkSolo();
- bool chkSh = checkShuffle();
- bool chkSl = checkSlots();
+ bool chkSh = true;
+ if (FullCheck)
+ chkSh = checkShuffle();
+ bool chkSl = true;
+ if (FullCheck)
+ chkSl = checkSlots();
bool chk = chkB && chkP && chkNV && chkR && chkS && chkSh && chkSl;
return chk;
@@ -271,8 +287,8 @@ bool HexagonMCChecker::checkBranches() {
HexagonMCErrInfo errInfo;
if (HexagonMCInstrInfo::isBundle(MCB)) {
bool hasConditional = false;
- unsigned Branches = 0, Returns = 0, NewIndirectBranches = 0,
- NewValueBranches = 0, Conditional = HEXAGON_PRESHUFFLE_PACKET_SIZE,
+ unsigned Branches = 0,
+ Conditional = HEXAGON_PRESHUFFLE_PACKET_SIZE,
Unconditional = HEXAGON_PRESHUFFLE_PACKET_SIZE;
for (unsigned i = HexagonMCInstrInfo::bundleInstructionsOffset;
@@ -284,12 +300,6 @@ bool HexagonMCChecker::checkBranches() {
if (HexagonMCInstrInfo::getDesc(MCII, MCI).isBranch() ||
HexagonMCInstrInfo::getDesc(MCII, MCI).isCall()) {
++Branches;
- if (HexagonMCInstrInfo::getDesc(MCII, MCI).isIndirectBranch() &&
- HexagonMCInstrInfo::isPredicatedNew(MCII, MCI))
- ++NewIndirectBranches;
- if (HexagonMCInstrInfo::isNewValue(MCII, MCI))
- ++NewValueBranches;
-
if (HexagonMCInstrInfo::isPredicated(MCII, MCI) ||
HexagonMCInstrInfo::isPredicatedNew(MCII, MCI)) {
hasConditional = true;
@@ -298,9 +308,6 @@ bool HexagonMCChecker::checkBranches() {
Unconditional = i; // Record the position of the unconditional branch.
}
}
- if (HexagonMCInstrInfo::getDesc(MCII, MCI).isReturn() &&
- HexagonMCInstrInfo::getDesc(MCII, MCI).mayLoad())
- ++Returns;
}
if (Branches) // FIXME: should "Defs.count(Hexagon::PC)" be here too?
@@ -504,7 +511,7 @@ bool HexagonMCChecker::checkShuffle() {
HexagonMCErrInfo errInfo;
// Branch info is lost when duplexing. The unduplexed insns must be
// checked and only branch errors matter for this case.
- HexagonMCShuffler MCS(MCII, STI, MCB);
+ HexagonMCShuffler MCS(true, MCII, STI, MCB);
if (!MCS.check()) {
if (MCS.getError() == HexagonShuffler::SHUFFLE_ERROR_BRANCHES) {
errInfo.setError(HexagonMCErrInfo::CHECK_ERROR_SHUFFLE);
@@ -513,7 +520,7 @@ bool HexagonMCChecker::checkShuffle() {
return false;
}
}
- HexagonMCShuffler MCSDX(MCII, STI, MCBDX);
+ HexagonMCShuffler MCSDX(true, MCII, STI, MCBDX);
if (!MCSDX.check()) {
errInfo.setError(HexagonMCErrInfo::CHECK_ERROR_SHUFFLE);
errInfo.setShuffleError(MCSDX.getError());
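
The new FullCheck flag lets callers skip the comparatively expensive shuffle and slot checks; a hypothetical call site follows (variable names assumed, constructor as declared in the header below):

  HexagonMCChecker Checker(MCII, STI, Bundle, BundleDX, RI);
  bool Ok = Checker.check(/*FullCheck=*/false);  // semantic checks only
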
diff --git a/lib/Target/Hexagon/MCTargetDesc/HexagonMCChecker.h b/lib/Target/Hexagon/MCTargetDesc/HexagonMCChecker.h
index 33e22798c954..c3b3d4c14c88 100644
--- a/lib/Target/Hexagon/MCTargetDesc/HexagonMCChecker.h
+++ b/lib/Target/Hexagon/MCTargetDesc/HexagonMCChecker.h
@@ -168,6 +168,7 @@ class HexagonMCChecker {
void init();
void init(MCInst const&);
+ void initReg(MCInst const &, unsigned, unsigned &PredReg, bool &isTrue);
// Checks performed.
bool checkBranches();
@@ -177,6 +178,7 @@ class HexagonMCChecker {
bool checkSolo();
bool checkShuffle();
bool checkSlots();
+ bool checkSize();
static void compoundRegisterMap(unsigned&);
@@ -196,7 +198,7 @@ class HexagonMCChecker {
explicit HexagonMCChecker(MCInstrInfo const &MCII, MCSubtargetInfo const &STI, MCInst& mcb, MCInst &mcbdx,
const MCRegisterInfo& ri);
- bool check();
+ bool check(bool FullCheck = true);
/// add a new error/warning
void addErrInfo(HexagonMCErrInfo &err) { ErrInfoQ.push(err.s); };
diff --git a/lib/Target/Hexagon/MCTargetDesc/HexagonMCCodeEmitter.cpp b/lib/Target/Hexagon/MCTargetDesc/HexagonMCCodeEmitter.cpp
index 2645a17b9bd0..c0956520de73 100644
--- a/lib/Target/Hexagon/MCTargetDesc/HexagonMCCodeEmitter.cpp
+++ b/lib/Target/Hexagon/MCTargetDesc/HexagonMCCodeEmitter.cpp
@@ -35,38 +35,40 @@ STATISTIC(MCNumEmitted, "Number of MC instructions emitted");
HexagonMCCodeEmitter::HexagonMCCodeEmitter(MCInstrInfo const &aMII,
MCContext &aMCT)
: MCT(aMCT), MCII(aMII), Addend(new unsigned(0)),
- Extended(new bool(false)), CurrentBundle(new MCInst const *) {}
+ Extended(new bool(false)), CurrentBundle(new MCInst const *),
+ CurrentIndex(new size_t(0)) {}
-uint32_t HexagonMCCodeEmitter::parseBits(size_t Instruction, size_t Last,
+uint32_t HexagonMCCodeEmitter::parseBits(size_t Last,
MCInst const &MCB,
MCInst const &MCI) const {
bool Duplex = HexagonMCInstrInfo::isDuplex(MCII, MCI);
- if (Instruction == 0) {
+ if (*CurrentIndex == 0) {
if (HexagonMCInstrInfo::isInnerLoop(MCB)) {
assert(!Duplex);
- assert(Instruction != Last);
+ assert(*CurrentIndex != Last);
return HexagonII::INST_PARSE_LOOP_END;
}
}
- if (Instruction == 1) {
+ if (*CurrentIndex == 1) {
if (HexagonMCInstrInfo::isOuterLoop(MCB)) {
assert(!Duplex);
- assert(Instruction != Last);
+ assert(*CurrentIndex != Last);
return HexagonII::INST_PARSE_LOOP_END;
}
}
if (Duplex) {
- assert(Instruction == Last);
+ assert(*CurrentIndex == Last);
return HexagonII::INST_PARSE_DUPLEX;
}
- if(Instruction == Last)
+ if(*CurrentIndex == Last)
return HexagonII::INST_PARSE_PACKET_END;
return HexagonII::INST_PARSE_NOT_END;
}
-void HexagonMCCodeEmitter::encodeInstruction(MCInst const &MI, raw_ostream &OS,
+/// encodeInstruction - Emit the bundle.
+void HexagonMCCodeEmitter::encodeInstruction(const MCInst &MI, raw_ostream &OS,
SmallVectorImpl<MCFixup> &Fixups,
- MCSubtargetInfo const &STI) const {
+ const MCSubtargetInfo &STI) const {
MCInst &HMB = const_cast<MCInst &>(MI);
assert(HexagonMCInstrInfo::isBundle(HMB));
@@ -74,7 +76,7 @@ void HexagonMCCodeEmitter::encodeInstruction(MCInst const &MI, raw_ostream &OS,
*Addend = 0;
*Extended = false;
*CurrentBundle = &MI;
- size_t Instruction = 0;
+ *CurrentIndex = 0;
size_t Last = HexagonMCInstrInfo::bundleSize(HMB) - 1;
for (auto &I : HexagonMCInstrInfo::bundleInstructions(HMB)) {
MCInst &HMI = const_cast<MCInst &>(*I.getInst());
@@ -82,11 +84,10 @@ void HexagonMCCodeEmitter::encodeInstruction(MCInst const &MI, raw_ostream &OS,
computeAvailableFeatures(STI.getFeatureBits()));
EncodeSingleInstruction(HMI, OS, Fixups, STI,
- parseBits(Instruction, Last, HMB, HMI),
- Instruction);
+ parseBits(Last, HMB, HMI));
*Extended = HexagonMCInstrInfo::isImmext(HMI);
*Addend += HEXAGON_INSTR_SIZE;
- ++Instruction;
+ ++*CurrentIndex;
}
return;
}
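
Moving the per-bundle cursor into *CurrentIndex lets later callbacks (notably getMachineOpValue further down) know which word of the packet is being emitted. The parse bits returned by parseBits occupy bits 15:14 of each instruction word and mark packet and hardware-loop boundaries. A minimal sketch of the selection logic, using stand-in values for the HexagonII::INST_PARSE_* constants:

    #include <cassert>
    #include <cstddef>
    #include <cstdint>

    // Stand-ins for HexagonII's parse-bit constants (bits 15:14 of a word).
    enum : uint32_t {
      INST_PARSE_DUPLEX     = 0x00000000,
      INST_PARSE_NOT_END    = 0x00004000,
      INST_PARSE_LOOP_END   = 0x00008000,
      INST_PARSE_PACKET_END = 0x0000c000,
    };

    // Index: position of this word in the packet; Last: final position.
    uint32_t parseBitsFor(std::size_t Index, std::size_t Last, bool InnerLoop,
                          bool OuterLoop, bool IsDuplex) {
      if (Index == 0 && InnerLoop)
        return INST_PARSE_LOOP_END;   // loop0 end is marked on word 0
      if (Index == 1 && OuterLoop)
        return INST_PARSE_LOOP_END;   // loop1 end is marked on word 1
      if (IsDuplex) {
        assert(Index == Last && "a duplex is always the last word");
        return INST_PARSE_DUPLEX;
      }
      return Index == Last ? INST_PARSE_PACKET_END : INST_PARSE_NOT_END;
    }
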
@@ -107,165 +108,44 @@ static bool RegisterMatches(unsigned Consumer, unsigned Producer,
/// EncodeSingleInstruction - Emit a single instruction.
void HexagonMCCodeEmitter::EncodeSingleInstruction(
const MCInst &MI, raw_ostream &OS, SmallVectorImpl<MCFixup> &Fixups,
- const MCSubtargetInfo &STI, uint32_t Parse, size_t Index) const {
- MCInst HMB = MI;
- assert(!HexagonMCInstrInfo::isBundle(HMB));
+ const MCSubtargetInfo &STI, uint32_t Parse) const {
+ assert(!HexagonMCInstrInfo::isBundle(MI));
uint64_t Binary;
- // Compound instructions are limited to using registers 0-7 and 16-23
- // and here we make a map 16-23 to 8-15 so they can be correctly encoded.
- static unsigned RegMap[8] = {Hexagon::R8, Hexagon::R9, Hexagon::R10,
- Hexagon::R11, Hexagon::R12, Hexagon::R13,
- Hexagon::R14, Hexagon::R15};
-
// Pseudo instructions don't get encoded and shouldn't be here
// in the first place!
- assert(!HexagonMCInstrInfo::getDesc(MCII, HMB).isPseudo() &&
+ assert(!HexagonMCInstrInfo::getDesc(MCII, MI).isPseudo() &&
"pseudo-instruction found");
DEBUG(dbgs() << "Encoding insn"
- " `" << HexagonMCInstrInfo::getName(MCII, HMB) << "'"
+ " `" << HexagonMCInstrInfo::getName(MCII, MI) << "'"
"\n");
- if (llvm::HexagonMCInstrInfo::getType(MCII, HMB) == HexagonII::TypeCOMPOUND) {
- for (unsigned i = 0; i < HMB.getNumOperands(); ++i)
- if (HMB.getOperand(i).isReg()) {
- unsigned Reg =
- MCT.getRegisterInfo()->getEncodingValue(HMB.getOperand(i).getReg());
- if ((Reg <= 23) && (Reg >= 16))
- HMB.getOperand(i).setReg(RegMap[Reg - 16]);
- }
- }
-
- if (HexagonMCInstrInfo::isNewValue(MCII, HMB)) {
- // Calculate the new value distance to the associated producer
- MCOperand &MCO =
- HMB.getOperand(HexagonMCInstrInfo::getNewValueOp(MCII, HMB));
- unsigned SOffset = 0;
- unsigned VOffset = 0;
- unsigned Register = MCO.getReg();
- unsigned Register1;
- unsigned Register2;
- auto Instructions = HexagonMCInstrInfo::bundleInstructions(**CurrentBundle);
- auto i = Instructions.begin() + Index - 1;
- for (;; --i) {
- assert(i != Instructions.begin() - 1 && "Couldn't find producer");
- MCInst const &Inst = *i->getInst();
- if (HexagonMCInstrInfo::isImmext(Inst))
- continue;
- ++SOffset;
- if (HexagonMCInstrInfo::isVector(MCII, Inst))
- // Vector instructions don't count scalars
- ++VOffset;
- Register1 =
- HexagonMCInstrInfo::hasNewValue(MCII, Inst)
- ? HexagonMCInstrInfo::getNewValueOperand(MCII, Inst).getReg()
- : static_cast<unsigned>(Hexagon::NoRegister);
- Register2 =
- HexagonMCInstrInfo::hasNewValue2(MCII, Inst)
- ? HexagonMCInstrInfo::getNewValueOperand2(MCII, Inst).getReg()
- : static_cast<unsigned>(Hexagon::NoRegister);
- if (!RegisterMatches(Register, Register1, Register2))
- // This isn't the register we're looking for
- continue;
- if (!HexagonMCInstrInfo::isPredicated(MCII, Inst))
- // Producer is unpredicated
- break;
- assert(HexagonMCInstrInfo::isPredicated(MCII, HMB) &&
- "Unpredicated consumer depending on predicated producer");
- if (HexagonMCInstrInfo::isPredicatedTrue(MCII, Inst) ==
- HexagonMCInstrInfo::isPredicatedTrue(MCII, HMB))
- // Producer predicate sense matched ours
- break;
- }
- // Hexagon PRM 10.11 Construct Nt from distance
- unsigned Offset =
- HexagonMCInstrInfo::isVector(MCII, HMB) ? VOffset : SOffset;
- Offset <<= 1;
- Offset |=
- HexagonMCInstrInfo::SubregisterBit(Register, Register1, Register2);
- MCO.setReg(Offset + Hexagon::R0);
- }
-
- Binary = getBinaryCodeForInstr(HMB, Fixups, STI);
+ Binary = getBinaryCodeForInstr(MI, Fixups, STI);
// Check for unimplemented instructions. Immediate extenders
// are encoded as zero, so they need to be accounted for.
- if ((!Binary) &&
- ((HMB.getOpcode() != DuplexIClass0) && (HMB.getOpcode() != A4_ext) &&
- (HMB.getOpcode() != A4_ext_b) && (HMB.getOpcode() != A4_ext_c) &&
- (HMB.getOpcode() != A4_ext_g))) {
+ if (!Binary &&
+ MI.getOpcode() != DuplexIClass0 &&
+ MI.getOpcode() != A4_ext) {
DEBUG(dbgs() << "Unimplemented inst: "
- " `" << HexagonMCInstrInfo::getName(MCII, HMB) << "'"
+ " `" << HexagonMCInstrInfo::getName(MCII, MI) << "'"
"\n");
llvm_unreachable("Unimplemented Instruction");
}
Binary |= Parse;
// if we need to emit a duplexed instruction
- if (HMB.getOpcode() >= Hexagon::DuplexIClass0 &&
- HMB.getOpcode() <= Hexagon::DuplexIClassF) {
+ if (MI.getOpcode() >= Hexagon::DuplexIClass0 &&
+ MI.getOpcode() <= Hexagon::DuplexIClassF) {
assert(Parse == HexagonII::INST_PARSE_DUPLEX &&
"Emitting duplex without duplex parse bits");
- unsigned dupIClass;
- switch (HMB.getOpcode()) {
- case Hexagon::DuplexIClass0:
- dupIClass = 0;
- break;
- case Hexagon::DuplexIClass1:
- dupIClass = 1;
- break;
- case Hexagon::DuplexIClass2:
- dupIClass = 2;
- break;
- case Hexagon::DuplexIClass3:
- dupIClass = 3;
- break;
- case Hexagon::DuplexIClass4:
- dupIClass = 4;
- break;
- case Hexagon::DuplexIClass5:
- dupIClass = 5;
- break;
- case Hexagon::DuplexIClass6:
- dupIClass = 6;
- break;
- case Hexagon::DuplexIClass7:
- dupIClass = 7;
- break;
- case Hexagon::DuplexIClass8:
- dupIClass = 8;
- break;
- case Hexagon::DuplexIClass9:
- dupIClass = 9;
- break;
- case Hexagon::DuplexIClassA:
- dupIClass = 10;
- break;
- case Hexagon::DuplexIClassB:
- dupIClass = 11;
- break;
- case Hexagon::DuplexIClassC:
- dupIClass = 12;
- break;
- case Hexagon::DuplexIClassD:
- dupIClass = 13;
- break;
- case Hexagon::DuplexIClassE:
- dupIClass = 14;
- break;
- case Hexagon::DuplexIClassF:
- dupIClass = 15;
- break;
- default:
- llvm_unreachable("Unimplemented DuplexIClass");
- break;
- }
+ unsigned dupIClass = MI.getOpcode() - Hexagon::DuplexIClass0;
// 29 is the bit position.
// The 0b1110 (0xE) bits are masked off and shifted down by 1 bit.
// The last bit is moved to bit position 13.
Binary = ((dupIClass & 0xE) << (29 - 1)) | ((dupIClass & 0x1) << 13);
- const MCInst *subInst0 = HMB.getOperand(0).getInst();
- const MCInst *subInst1 = HMB.getOperand(1).getInst();
+ const MCInst *subInst0 = MI.getOperand(0).getInst();
+ const MCInst *subInst1 = MI.getOperand(1).getInst();
// get subinstruction slot 0
unsigned subInstSlot0Bits = getBinaryCodeForInstr(*subInst0, Fixups, STI);
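
The 16-way switch is folded into a single subtraction because the DuplexIClass0 through DuplexIClassF opcodes are declared contiguously. A worked sketch of the bit placement, assuming only the layout stated in the comments above (iclass bits 3:1 at word bits 31:29, iclass bit 0 at word bit 13):

    #include <cassert>
    #include <cstdint>

    // Split a 4-bit duplex instruction class across the word: the top
    // three bits land at 31:29, the low bit at 13 (15:14 hold parse bits).
    uint32_t encodeDuplexIClass(unsigned IClass) {
      assert(IClass <= 0xF && "iclass is a 4-bit value");
      return ((IClass & 0xE) << (29 - 1)) | ((IClass & 0x1) << 13);
    }
    // Example: IClass 0xA (0b1010) -> 0b101 at bits 31:29, 0 at bit 13,
    // i.e. 0xA0000000.
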
@@ -293,14 +173,13 @@ void raise_relocation_error(unsigned bits, unsigned kind) {
/// getFixupNoBits - Some insns are not extended and thus have no
/// bits. These cases require a more brute force method for determining
/// the correct relocation.
-namespace {
-Hexagon::Fixups getFixupNoBits(MCInstrInfo const &MCII, const MCInst &MI,
- const MCOperand &MO,
- const MCSymbolRefExpr::VariantKind kind) {
+Hexagon::Fixups HexagonMCCodeEmitter::getFixupNoBits(
+ MCInstrInfo const &MCII, const MCInst &MI, const MCOperand &MO,
+ const MCSymbolRefExpr::VariantKind kind) const {
const MCInstrDesc &MCID = HexagonMCInstrInfo::getDesc(MCII, MI);
unsigned insnType = llvm::HexagonMCInstrInfo::getType(MCII, MI);
- if (insnType == HexagonII::TypePREFIX) {
+ if (insnType == HexagonII::TypeEXTENDER) {
switch (kind) {
case MCSymbolRefExpr::VK_GOTREL:
return Hexagon::fixup_Hexagon_GOTREL_32_6_X;
@@ -319,11 +198,21 @@ Hexagon::Fixups getFixupNoBits(MCInstrInfo const &MCII, const MCInst &MI,
case MCSymbolRefExpr::VK_Hexagon_IE_GOT:
return Hexagon::fixup_Hexagon_IE_GOT_32_6_X;
case MCSymbolRefExpr::VK_Hexagon_PCREL:
- case MCSymbolRefExpr::VK_None:
- if (MCID.isBranch())
- return Hexagon::fixup_Hexagon_B32_PCREL_X;
- else
- return Hexagon::fixup_Hexagon_32_6_X;
+ return Hexagon::fixup_Hexagon_B32_PCREL_X;
+ case MCSymbolRefExpr::VK_None: {
+ auto Insts = HexagonMCInstrInfo::bundleInstructions(**CurrentBundle);
+ for (auto I = Insts.begin(), N = Insts.end(); I != N; ++I) {
+ if (I->getInst() == &MI) {
+ const MCInst &NextI = *(I+1)->getInst();
+ const MCInstrDesc &D = HexagonMCInstrInfo::getDesc(MCII, NextI);
+ if (D.isBranch() || D.isCall() ||
+ HexagonMCInstrInfo::getType(MCII, NextI) == HexagonII::TypeCR)
+ return Hexagon::fixup_Hexagon_B32_PCREL_X;
+ return Hexagon::fixup_Hexagon_32_6_X;
+ }
+ }
+ raise_relocation_error(0, kind);
+ }
default:
raise_relocation_error(0, kind);
}
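
With getFixupNoBits now a member, the VK_None case can consult *CurrentBundle: an extender that feeds a branch, call, or CR-type instruction is extending a PC-relative target and needs fixup_Hexagon_B32_PCREL_X; anything else gets fixup_Hexagon_32_6_X. A simplified sketch over a toy bundle model (the struct is illustrative, not an LLVM type):

    #include <cstddef>
    #include <vector>

    struct SimpleInsn { bool IsBranch, IsCall, IsCRType; };
    enum ExtenderFixup { B32_PCREL_X, FIXUP_32_6_X };

    // The extender's consumer is the word that follows it in the packet.
    ExtenderFixup fixupForExtender(const std::vector<SimpleInsn> &Bundle,
                                   std::size_t ExtenderIdx) {
      const SimpleInsn &Next = Bundle.at(ExtenderIdx + 1);
      if (Next.IsBranch || Next.IsCall || Next.IsCRType)
        return B32_PCREL_X;  // extending a PC-relative branch target
      return FIXUP_32_6_X;   // extending an ordinary immediate
    }
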
@@ -406,7 +295,6 @@ Hexagon::Fixups getFixupNoBits(MCInstrInfo const &MCII, const MCInst &MI,
}
llvm_unreachable("Relocation exit not taken");
}
-}
namespace llvm {
extern const MCInstrDesc HexagonInsts[];
@@ -450,7 +338,8 @@ unsigned HexagonMCCodeEmitter::getExprOpValue(const MCInst &MI,
int64_t Value;
if (ME->evaluateAsAbsolute(Value))
return Value;
- assert(ME->getKind() == MCExpr::SymbolRef || ME->getKind() == MCExpr::Binary);
+ assert(ME->getKind() == MCExpr::SymbolRef ||
+ ME->getKind() == MCExpr::Binary);
if (ME->getKind() == MCExpr::Binary) {
MCBinaryExpr const *Binary = cast<MCBinaryExpr>(ME);
getExprOpValue(MI, MO, Binary->getLHS(), Fixups, STI);
@@ -581,7 +470,30 @@ unsigned HexagonMCCodeEmitter::getExprOpValue(const MCInst &MI,
if (HexagonMCInstrInfo::s23_2_reloc(*MO.getExpr()))
FixupKind = Hexagon::fixup_Hexagon_23_REG;
else
- raise_relocation_error(bits, kind);
+ if (MCID.mayStore() || MCID.mayLoad()) {
+ for (const MCPhysReg *ImpUses = MCID.getImplicitUses(); *ImpUses;
+ ++ImpUses) {
+ if (*ImpUses != Hexagon::GP)
+ continue;
+ switch (HexagonMCInstrInfo::getAccessSize(MCII, MI)) {
+ case HexagonII::MemAccessSize::ByteAccess:
+ FixupKind = fixup_Hexagon_GPREL16_0;
+ break;
+ case HexagonII::MemAccessSize::HalfWordAccess:
+ FixupKind = fixup_Hexagon_GPREL16_1;
+ break;
+ case HexagonII::MemAccessSize::WordAccess:
+ FixupKind = fixup_Hexagon_GPREL16_2;
+ break;
+ case HexagonII::MemAccessSize::DoubleWordAccess:
+ FixupKind = fixup_Hexagon_GPREL16_3;
+ break;
+ default:
+ raise_relocation_error(bits, kind);
+ }
+ }
+ } else
+ raise_relocation_error(bits, kind);
break;
}
case MCSymbolRefExpr::VK_DTPREL:
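
GP-relative accesses scale their 16-bit offset by the access size, so each width needs its own relocation: GPREL16_0 for bytes up through GPREL16_3 for doublewords, with GPREL16_n encoding offset >> n for a 2^n-byte access. A sketch of the mapping (enum names mirror the patch):

    #include <stdexcept>

    enum class AccessSize { Byte = 1, Half = 2, Word = 4, Double = 8 };
    enum GpFixup { GPREL16_0, GPREL16_1, GPREL16_2, GPREL16_3 };

    // GPREL16_n stores offset >> n, matching a 2^n-byte-wide access.
    GpFixup gpFixupFor(AccessSize A) {
      switch (A) {
      case AccessSize::Byte:   return GPREL16_0;
      case AccessSize::Half:   return GPREL16_1;
      case AccessSize::Word:   return GPREL16_2;
      case AccessSize::Double: return GPREL16_3;
      }
      throw std::invalid_argument("no GP-relative fixup for this size");
    }
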
@@ -795,10 +707,71 @@ unsigned
HexagonMCCodeEmitter::getMachineOpValue(MCInst const &MI, MCOperand const &MO,
SmallVectorImpl<MCFixup> &Fixups,
MCSubtargetInfo const &STI) const {
+#ifndef NDEBUG
+ size_t OperandNumber = ~0U;
+ for (unsigned i = 0, n = MI.getNumOperands(); i < n; ++i)
+ if (&MI.getOperand(i) == &MO) {
+ OperandNumber = i;
+ break;
+ }
+ assert((OperandNumber != ~0U) && "Operand not found");
+#endif
+
+ if (HexagonMCInstrInfo::isNewValue(MCII, MI) &&
+ &MO == &MI.getOperand(HexagonMCInstrInfo::getNewValueOp(MCII, MI))) {
+ // Calculate the new value distance to the associated producer
+ MCOperand const &MCO =
+ MI.getOperand(HexagonMCInstrInfo::getNewValueOp(MCII, MI));
+ unsigned SOffset = 0;
+ unsigned VOffset = 0;
+ unsigned Register = MCO.getReg();
+ unsigned Register1;
+ unsigned Register2;
+ auto Instructions = HexagonMCInstrInfo::bundleInstructions(**CurrentBundle);
+ auto i = Instructions.begin() + *CurrentIndex - 1;
+ for (;; --i) {
+ assert(i != Instructions.begin() - 1 && "Couldn't find producer");
+ MCInst const &Inst = *i->getInst();
+ if (HexagonMCInstrInfo::isImmext(Inst))
+ continue;
+ ++SOffset;
+ if (HexagonMCInstrInfo::isVector(MCII, Inst))
+ // Vector instructions don't count scalars
+ ++VOffset;
+ Register1 =
+ HexagonMCInstrInfo::hasNewValue(MCII, Inst)
+ ? HexagonMCInstrInfo::getNewValueOperand(MCII, Inst).getReg()
+ : static_cast<unsigned>(Hexagon::NoRegister);
+ Register2 =
+ HexagonMCInstrInfo::hasNewValue2(MCII, Inst)
+ ? HexagonMCInstrInfo::getNewValueOperand2(MCII, Inst).getReg()
+ : static_cast<unsigned>(Hexagon::NoRegister);
+ if (!RegisterMatches(Register, Register1, Register2))
+ // This isn't the register we're looking for
+ continue;
+ if (!HexagonMCInstrInfo::isPredicated(MCII, Inst))
+ // Producer is unpredicated
+ break;
+ assert(HexagonMCInstrInfo::isPredicated(MCII, MI) &&
+ "Unpredicated consumer depending on predicated producer");
+ if (HexagonMCInstrInfo::isPredicatedTrue(MCII, Inst) ==
+ HexagonMCInstrInfo::isPredicatedTrue(MCII, MI))
+ // Producer predicate sense matched ours
+ break;
+ }
+ // Hexagon PRM 10.11 Construct Nt from distance
+ unsigned Offset =
+ HexagonMCInstrInfo::isVector(MCII, MI) ? VOffset : SOffset;
+ Offset <<= 1;
+ Offset |=
+ HexagonMCInstrInfo::SubregisterBit(Register, Register1, Register2);
+ return Offset;
+ }
assert(!MO.isImm());
if (MO.isReg()) {
unsigned Reg = MO.getReg();
- if (HexagonMCInstrInfo::isSubInstruction(MI))
+ if (HexagonMCInstrInfo::isSubInstruction(MI) ||
+ llvm::HexagonMCInstrInfo::getType(MCII, MI) == HexagonII::TypeCJ)
return HexagonMCInstrInfo::getDuplexRegisterNumbering(Reg);
switch(MI.getOpcode()){
case Hexagon::A2_tfrrcr:
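
The new-value rewrite moves from EncodeSingleInstruction into getMachineOpValue, where the shared *CurrentIndex identifies the consumer's position in the bundle. Per the Hexagon PRM (10.11), the consumer's Nt field encodes the distance back to its producer — skipping extenders, and counting only vector producers when the consumer is a vector insn — shifted left by one, with the subregister-select bit in bit 0. A worked sketch, assuming the producer scan has already yielded the distance:

    #include <cstdint>

    // Nt = (distance-to-producer << 1) | subregister bit; the emitter
    // then materializes it as the register R0 + Nt.
    uint32_t encodeNt(unsigned DistanceToProducer, bool UsesHighSubreg) {
      return (DistanceToProducer << 1) | (UsesHighSubreg ? 1u : 0u);
    }
    // Example: producer one slot back, low subregister -> Nt = 0b10 = 2.
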
diff --git a/lib/Target/Hexagon/MCTargetDesc/HexagonMCCodeEmitter.h b/lib/Target/Hexagon/MCTargetDesc/HexagonMCCodeEmitter.h
index 8e0667d9ac8e..c3a4beec313f 100644
--- a/lib/Target/Hexagon/MCTargetDesc/HexagonMCCodeEmitter.h
+++ b/lib/Target/Hexagon/MCTargetDesc/HexagonMCCodeEmitter.h
@@ -15,6 +15,7 @@
#ifndef HEXAGONMCCODEEMITTER_H
#define HEXAGONMCCODEEMITTER_H
+#include "MCTargetDesc/HexagonFixupKinds.h"
#include "llvm/MC/MCCodeEmitter.h"
#include "llvm/MC/MCExpr.h"
#include "llvm/MC/MCInst.h"
@@ -31,18 +32,22 @@ class HexagonMCCodeEmitter : public MCCodeEmitter {
std::unique_ptr<unsigned> Addend;
std::unique_ptr<bool> Extended;
std::unique_ptr<MCInst const *> CurrentBundle;
+ std::unique_ptr<size_t> CurrentIndex;
// helper routine for getMachineOpValue()
unsigned getExprOpValue(const MCInst &MI, const MCOperand &MO,
const MCExpr *ME, SmallVectorImpl<MCFixup> &Fixups,
const MCSubtargetInfo &STI) const;
+ Hexagon::Fixups getFixupNoBits(MCInstrInfo const &MCII, const MCInst &MI,
+ const MCOperand &MO,
+ const MCSymbolRefExpr::VariantKind kind) const;
+
public:
HexagonMCCodeEmitter(MCInstrInfo const &aMII, MCContext &aMCT);
// Return parse bits for instruction `MCI' inside bundle `MCB'
- uint32_t parseBits(size_t Instruction, size_t Last, MCInst const &MCB,
- MCInst const &MCI) const;
+ uint32_t parseBits(size_t Last, MCInst const &MCB, MCInst const &MCI) const;
void encodeInstruction(MCInst const &MI, raw_ostream &OS,
SmallVectorImpl<MCFixup> &Fixups,
@@ -51,7 +56,7 @@ public:
void EncodeSingleInstruction(const MCInst &MI, raw_ostream &OS,
SmallVectorImpl<MCFixup> &Fixups,
const MCSubtargetInfo &STI,
- uint32_t Parse, size_t Index) const;
+ uint32_t Parse) const;
// \brief TableGen'erated function for getting the
// binary encoding for an instruction.
diff --git a/lib/Target/Hexagon/MCTargetDesc/HexagonMCCompound.cpp b/lib/Target/Hexagon/MCTargetDesc/HexagonMCCompound.cpp
index 9a09a17767a6..ffa980ca6563 100644
--- a/lib/Target/Hexagon/MCTargetDesc/HexagonMCCompound.cpp
+++ b/lib/Target/Hexagon/MCTargetDesc/HexagonMCCompound.cpp
@@ -14,6 +14,7 @@
#include "Hexagon.h"
#include "MCTargetDesc/HexagonBaseInfo.h"
#include "MCTargetDesc/HexagonMCInstrInfo.h"
+#include "MCTargetDesc/HexagonMCShuffler.h"
#include "llvm/MC/MCContext.h"
#include "llvm/MC/MCInst.h"
#include "llvm/Support/Debug.h"
@@ -396,7 +397,7 @@ static bool lookForCompound(MCInstrInfo const &MCII, MCContext &Context,
/// is found, update the contents of the bundle with the compound insn.
/// If a compound instruction is found then the bundle will have one
/// additional slot.
-void HexagonMCInstrInfo::tryCompound(MCInstrInfo const &MCII,
+void HexagonMCInstrInfo::tryCompound(MCInstrInfo const &MCII, MCSubtargetInfo const &STI,
MCContext &Context, MCInst &MCI) {
assert(HexagonMCInstrInfo::isBundle(MCI) &&
"Non-Bundle where Bundle expected");
@@ -405,8 +406,23 @@ void HexagonMCInstrInfo::tryCompound(MCInstrInfo const &MCII,
if (MCI.size() < 2)
return;
+ bool StartedValid = llvm::HexagonMCShuffle(false, MCII, STI, MCI);
+
+  // Make a working copy of the bundle, keeping the order of jump instructions.
+ MCInst CheckList(MCI);
+
// Look for compounds until none are found, only update the bundle when
// a compound is found.
- while (lookForCompound(MCII, Context, MCI))
- ;
+ while (lookForCompound(MCII, Context, CheckList)) {
+ // Keep the original bundle around in case the shuffle fails.
+ MCInst OriginalBundle(MCI);
+
+ // Need to update the bundle.
+ MCI = CheckList;
+
+ if (StartedValid && !llvm::HexagonMCShuffle(false, MCII, STI, MCI)) {
+ DEBUG(dbgs() << "Found ERROR\n");
+ MCI = OriginalBundle;
+ }
+ }
}
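
tryCompound is now shuffle-aware: each compound found on the working copy is committed to the bundle only provisionally, and rolled back if the shuffler no longer accepts the result (provided the bundle shuffled cleanly to begin with). A generic sketch of that try/validate/rollback loop (State and the callbacks are illustrative):

    #include <functional>

    // Apply transforms to a working copy; commit each one, then restore
    // the previous state if validation (the shuffler, in the patch) fails.
    template <typename State>
    void applyWhileValid(State &S,
                         const std::function<bool(State &)> &TryTransform,
                         const std::function<bool(const State &)> &IsValid) {
      State Work(S);
      while (TryTransform(Work)) {
        State Backup(S);
        S = Work;       // tentatively commit the transformed bundle
        if (!IsValid(S))
          S = Backup;   // shuffle failed: keep the original packet
      }
    }
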
diff --git a/lib/Target/Hexagon/MCTargetDesc/HexagonMCDuplexInfo.cpp b/lib/Target/Hexagon/MCTargetDesc/HexagonMCDuplexInfo.cpp
index 413f052aa4bd..e8f154a1fa53 100644
--- a/lib/Target/Hexagon/MCTargetDesc/HexagonMCDuplexInfo.cpp
+++ b/lib/Target/Hexagon/MCTargetDesc/HexagonMCDuplexInfo.cpp
@@ -15,6 +15,7 @@
#include "MCTargetDesc/HexagonMCInstrInfo.h"
#include "llvm/ADT/SmallVector.h"
+#include "llvm/MC/MCSubtargetInfo.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/raw_ostream.h"
@@ -262,6 +263,7 @@ unsigned HexagonMCInstrInfo::getDuplexCandidateGroup(MCInst const &MCI) {
case Hexagon::EH_RETURN_JMPR:
case Hexagon::J2_jumpr:
+ case Hexagon::PS_jmpret:
// jumpr r31
// Actual form JMPR %PC<imp-def>, %R31<imp-use>, %R0<imp-use,internal>.
DstReg = MCI.getOperand(0).getReg();
@@ -275,6 +277,12 @@ unsigned HexagonMCInstrInfo::getDuplexCandidateGroup(MCInst const &MCI) {
case Hexagon::J2_jumprfnew:
case Hexagon::J2_jumprtnewpt:
case Hexagon::J2_jumprfnewpt:
+ case Hexagon::PS_jmprett:
+ case Hexagon::PS_jmpretf:
+ case Hexagon::PS_jmprettnew:
+ case Hexagon::PS_jmpretfnew:
+ case Hexagon::PS_jmprettnewpt:
+ case Hexagon::PS_jmpretfnewpt:
DstReg = MCI.getOperand(1).getReg();
SrcReg = MCI.getOperand(0).getReg();
// [if ([!]p0[.new])] jumpr r31
@@ -284,15 +292,10 @@ unsigned HexagonMCInstrInfo::getDuplexCandidateGroup(MCInst const &MCI) {
}
break;
case Hexagon::L4_return_t:
-
case Hexagon::L4_return_f:
-
case Hexagon::L4_return_tnew_pnt:
-
case Hexagon::L4_return_fnew_pnt:
-
case Hexagon::L4_return_tnew_pt:
-
case Hexagon::L4_return_fnew_pt:
// [if ([!]p0[.new])] dealloc_return
SrcReg = MCI.getOperand(0).getReg();
@@ -565,7 +568,8 @@ bool HexagonMCInstrInfo::subInstWouldBeExtended(MCInst const &potentialDuplex) {
bool HexagonMCInstrInfo::isOrderedDuplexPair(MCInstrInfo const &MCII,
MCInst const &MIa, bool ExtendedA,
MCInst const &MIb, bool ExtendedB,
- bool bisReversable) {
+ bool bisReversable,
+ MCSubtargetInfo const &STI) {
// Slot 1 cannot be extended in duplexes PRM 10.5
if (ExtendedA)
return false;
@@ -625,11 +629,16 @@ bool HexagonMCInstrInfo::isOrderedDuplexPair(MCInstrInfo const &MCII,
return false;
}
- // If a store appears, it must be in slot 0 (MIa) 1st, and then slot 1 (MIb);
- // therefore, not duplexable if slot 1 is a store, and slot 0 is not.
- if ((MIbG == HexagonII::HSIG_S1) || (MIbG == HexagonII::HSIG_S2)) {
- if ((MIaG != HexagonII::HSIG_S1) && (MIaG != HexagonII::HSIG_S2))
- return false;
+ if (STI.getCPU().equals_lower("hexagonv4") ||
+ STI.getCPU().equals_lower("hexagonv5") ||
+ STI.getCPU().equals_lower("hexagonv55") ||
+ STI.getCPU().equals_lower("hexagonv60")) {
+ // If a store appears, it must be in slot 0 (MIa) 1st, and then slot 1 (MIb);
+ // therefore, not duplexable if slot 1 is a store, and slot 0 is not.
+ if ((MIbG == HexagonII::HSIG_S1) || (MIbG == HexagonII::HSIG_S2)) {
+ if ((MIaG != HexagonII::HSIG_S1) && (MIaG != HexagonII::HSIG_S2))
+ return false;
+ }
}
return (isDuplexPairMatch(MIaG, MIbG));
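
The store-ordering restriction — a store sub-instruction may sit in slot 1 only if slot 0 holds one too — is now applied only for hexagonv4 through hexagonv60, so later cores can form duplexes the older rule would reject. The rule, condensed:

    // Slot 0 is MIa, slot 1 is MIb. On v60 and earlier, a lone store
    // must occupy slot 0; newer cores drop the restriction.
    bool storeOrderOk(bool SlotAIsStore, bool SlotBIsStore,
                      bool IsV60OrEarlier) {
      if (!IsV60OrEarlier)
        return true;
      return !SlotBIsStore || SlotAIsStore;
    }
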
@@ -703,6 +712,7 @@ MCInst HexagonMCInstrInfo::deriveSubInst(MCInst const &Inst) {
Result.setOpcode(Hexagon::SA1_dec);
addOps(Result, Inst, 0);
addOps(Result, Inst, 1);
+ addOps(Result, Inst, 2);
break;
} // 1,2 SUBInst $Rd = add($Rs,#-1)
else if (Inst.getOperand(1).getReg() == Hexagon::R29) {
@@ -806,20 +816,27 @@ MCInst HexagonMCInstrInfo::deriveSubInst(MCInst const &Inst) {
break; // none SUBInst deallocframe
case Hexagon::EH_RETURN_JMPR:
case Hexagon::J2_jumpr:
+ case Hexagon::PS_jmpret:
Result.setOpcode(Hexagon::SL2_jumpr31);
break; // none SUBInst jumpr r31
case Hexagon::J2_jumprf:
+ case Hexagon::PS_jmpretf:
Result.setOpcode(Hexagon::SL2_jumpr31_f);
break; // none SUBInst if (!p0) jumpr r31
case Hexagon::J2_jumprfnew:
case Hexagon::J2_jumprfnewpt:
+ case Hexagon::PS_jmpretfnewpt:
+ case Hexagon::PS_jmpretfnew:
Result.setOpcode(Hexagon::SL2_jumpr31_fnew);
break; // none SUBInst if (!p0.new) jumpr:nt r31
case Hexagon::J2_jumprt:
+ case Hexagon::PS_jmprett:
Result.setOpcode(Hexagon::SL2_jumpr31_t);
break; // none SUBInst if (p0) jumpr r31
case Hexagon::J2_jumprtnew:
case Hexagon::J2_jumprtnewpt:
+ case Hexagon::PS_jmprettnewpt:
+ case Hexagon::PS_jmprettnew:
Result.setOpcode(Hexagon::SL2_jumpr31_tnew);
break; // none SUBInst if (p0.new) jumpr:nt r31
case Hexagon::L2_loadrb_io:
@@ -966,6 +983,7 @@ MCInst HexagonMCInstrInfo::deriveSubInst(MCInst const &Inst) {
if (Absolute && Value == -1) {
Result.setOpcode(Hexagon::SA1_setin1);
addOps(Result, Inst, 0);
+ addOps(Result, Inst, 1);
break; // 2 1 SUBInst $Rd = #-1
} else {
Result.setOpcode(Hexagon::SA1_seti);
@@ -1005,6 +1023,7 @@ static bool isStoreInst(unsigned opCode) {
SmallVector<DuplexCandidate, 8>
HexagonMCInstrInfo::getDuplexPossibilties(MCInstrInfo const &MCII,
+ MCSubtargetInfo const &STI,
MCInst const &MCB) {
assert(isBundle(MCB));
SmallVector<DuplexCandidate, 8> duplexToTry;
@@ -1033,7 +1052,7 @@ HexagonMCInstrInfo::getDuplexPossibilties(MCInstrInfo const &MCII,
HexagonMCInstrInfo::hasExtenderForIndex(MCB, k - 1),
*MCB.getOperand(j).getInst(),
HexagonMCInstrInfo::hasExtenderForIndex(MCB, j - 1),
- bisReversable)) {
+ bisReversable, STI)) {
// Get iClass.
unsigned iClass = iClassOfDuplexPair(
getDuplexCandidateGroup(*MCB.getOperand(k).getInst()),
@@ -1058,7 +1077,7 @@ HexagonMCInstrInfo::getDuplexPossibilties(MCInstrInfo const &MCII,
HexagonMCInstrInfo::hasExtenderForIndex(MCB, j - 1),
*MCB.getOperand(k).getInst(),
HexagonMCInstrInfo::hasExtenderForIndex(MCB, k - 1),
- bisReversable)) {
+ bisReversable, STI)) {
// Get iClass.
unsigned iClass = iClassOfDuplexPair(
getDuplexCandidateGroup(*MCB.getOperand(j).getInst()),
diff --git a/lib/Target/Hexagon/MCTargetDesc/HexagonMCELFStreamer.cpp b/lib/Target/Hexagon/MCTargetDesc/HexagonMCELFStreamer.cpp
index 226470cfbced..9e1ff9ca35d7 100644
--- a/lib/Target/Hexagon/MCTargetDesc/HexagonMCELFStreamer.cpp
+++ b/lib/Target/Hexagon/MCTargetDesc/HexagonMCELFStreamer.cpp
@@ -37,30 +37,19 @@
using namespace llvm;
-static cl::opt<unsigned>
- GPSize("gpsize", cl::NotHidden,
- cl::desc("Global Pointer Addressing Size. The default size is 8."),
- cl::Prefix, cl::init(8));
-
-void HexagonMCELFStreamer::EmitInstruction(const MCInst &MCK,
- const MCSubtargetInfo &STI) {
- MCInst HMI = HexagonMCInstrInfo::createBundle();
- MCInst *MCB;
-
- if (MCK.getOpcode() != Hexagon::BUNDLE) {
- HMI.addOperand(MCOperand::createInst(&MCK));
- MCB = &HMI;
- } else
- MCB = const_cast<MCInst *>(&MCK);
-
- // Examines packet and pad the packet, if needed, when an
- // end-loop is in the bundle.
- HexagonMCInstrInfo::padEndloop(getContext(), *MCB);
- HexagonMCShuffle(*MCII, STI, *MCB);
-
- assert(HexagonMCInstrInfo::bundleSize(*MCB) <= HEXAGON_PACKET_SIZE);
+static cl::opt<unsigned> GPSize
+ ("gpsize", cl::NotHidden,
+ cl::desc("Global Pointer Addressing Size. The default size is 8."),
+ cl::Prefix,
+ cl::init(8));
+
+void HexagonMCELFStreamer::EmitInstruction(const MCInst &MCB,
+ const MCSubtargetInfo &STI, bool) {
+ assert(MCB.getOpcode() == Hexagon::BUNDLE);
+ assert(HexagonMCInstrInfo::bundleSize(MCB) <= HEXAGON_PACKET_SIZE);
+ assert(HexagonMCInstrInfo::bundleSize(MCB) > 0);
bool Extended = false;
- for (auto &I : HexagonMCInstrInfo::bundleInstructions(*MCB)) {
+ for (auto &I : HexagonMCInstrInfo::bundleInstructions(MCB)) {
MCInst *MCI = const_cast<MCInst *>(I.getInst());
if (Extended) {
if (HexagonMCInstrInfo::isDuplex(*MCII, *MCI)) {
@@ -77,11 +66,12 @@ void HexagonMCELFStreamer::EmitInstruction(const MCInst &MCK,
// At this point, MCB is a bundle
// Iterate through the bundle and assign addends for the instructions
- for (auto const &I : HexagonMCInstrInfo::bundleInstructions(*MCB)) {
+ for (auto const &I : HexagonMCInstrInfo::bundleInstructions(MCB)) {
MCInst *MCI = const_cast<MCInst *>(I.getInst());
EmitSymbol(*MCI);
}
- MCObjectStreamer::EmitInstruction(*MCB, STI);
+
+ MCObjectStreamer::EmitInstruction(MCB, STI);
}
void HexagonMCELFStreamer::EmitSymbol(const MCInst &Inst) {
@@ -119,9 +109,11 @@ void HexagonMCELFStreamer::HexagonMCEmitCommonSymbol(MCSymbol *Symbol,
MCSectionSubPair P = getCurrentSection();
SwitchSection(&Section);
- EmitValueToAlignment(ByteAlignment, 0, 1, 0);
- EmitLabel(Symbol);
- EmitZeros(Size);
+ if (ELFSymbol->isUndefined(false)) {
+ EmitValueToAlignment(ByteAlignment, 0, 1, 0);
+ EmitLabel(Symbol);
+ EmitZeros(Size);
+ }
// Update the maximum alignment of the section if necessary.
if (ByteAlignment > Section.getAlignment())
@@ -144,9 +136,10 @@ void HexagonMCELFStreamer::HexagonMCEmitCommonSymbol(MCSymbol *Symbol,
ELFSymbol->setSize(MCConstantExpr::create(Size, getContext()));
}
-void HexagonMCELFStreamer::HexagonMCEmitLocalCommonSymbol(
- MCSymbol *Symbol, uint64_t Size, unsigned ByteAlignment,
- unsigned AccessSize) {
+void HexagonMCELFStreamer::HexagonMCEmitLocalCommonSymbol(MCSymbol *Symbol,
+ uint64_t Size,
+ unsigned ByteAlignment,
+ unsigned AccessSize) {
getAssembler().registerSymbol(*Symbol);
auto ELFSymbol = cast<MCSymbolELF>(Symbol);
ELFSymbol->setBinding(ELF::STB_LOCAL);
@@ -154,11 +147,12 @@ void HexagonMCELFStreamer::HexagonMCEmitLocalCommonSymbol(
HexagonMCEmitCommonSymbol(Symbol, Size, ByteAlignment, AccessSize);
}
-namespace llvm {
-MCStreamer *createHexagonELFStreamer(MCContext &Context, MCAsmBackend &MAB,
- raw_pwrite_stream &OS, MCCodeEmitter *CE) {
- return new HexagonMCELFStreamer(Context, MAB, OS, CE);
-}
+namespace llvm {
+ MCStreamer *createHexagonELFStreamer(Triple const &TT, MCContext &Context,
+ MCAsmBackend &MAB,
+ raw_pwrite_stream &OS, MCCodeEmitter *CE) {
+ return new HexagonMCELFStreamer(Context, MAB, OS, CE);
+ }
} // end namespace llvm
diff --git a/lib/Target/Hexagon/MCTargetDesc/HexagonMCELFStreamer.h b/lib/Target/Hexagon/MCTargetDesc/HexagonMCELFStreamer.h
index 0ac1a68d4ef9..024dff1a2f97 100644
--- a/lib/Target/Hexagon/MCTargetDesc/HexagonMCELFStreamer.h
+++ b/lib/Target/Hexagon/MCTargetDesc/HexagonMCELFStreamer.h
@@ -27,7 +27,15 @@ public:
: MCELFStreamer(Context, TAB, OS, Emitter),
MCII(createHexagonMCInstrInfo()) {}
- void EmitInstruction(const MCInst &Inst, const MCSubtargetInfo &STI) override;
+ HexagonMCELFStreamer(MCContext &Context,
+ MCAsmBackend &TAB,
+ raw_pwrite_stream &OS, MCCodeEmitter *Emitter,
+ MCAssembler *Assembler) :
+ MCELFStreamer(Context, TAB, OS, Emitter),
+ MCII (createHexagonMCInstrInfo()) {}
+
+ void EmitInstruction(const MCInst &Inst, const MCSubtargetInfo &STI,
+ bool) override;
void EmitSymbol(const MCInst &Inst);
void HexagonMCEmitLocalCommonSymbol(MCSymbol *Symbol, uint64_t Size,
unsigned ByteAlignment,
@@ -36,8 +44,9 @@ public:
unsigned ByteAlignment, unsigned AccessSize);
};
-MCStreamer *createHexagonELFStreamer(MCContext &Context, MCAsmBackend &MAB,
- raw_pwrite_stream &OS, MCCodeEmitter *CE);
+MCStreamer *createHexagonELFStreamer(Triple const &TT, MCContext &Context,
+ MCAsmBackend &MAB, raw_pwrite_stream &OS,
+ MCCodeEmitter *CE);
} // end namespace llvm
diff --git a/lib/Target/Hexagon/MCTargetDesc/HexagonMCExpr.cpp b/lib/Target/Hexagon/MCTargetDesc/HexagonMCExpr.cpp
index e93906a0a396..14300edc7e1b 100644
--- a/lib/Target/Hexagon/MCTargetDesc/HexagonMCExpr.cpp
+++ b/lib/Target/Hexagon/MCTargetDesc/HexagonMCExpr.cpp
@@ -11,7 +11,9 @@
#include "HexagonMCExpr.h"
#include "llvm/MC/MCContext.h"
#include "llvm/MC/MCStreamer.h"
+#include "llvm/MC/MCSymbolELF.h"
#include "llvm/MC/MCValue.h"
+#include "llvm/Object/ELF.h"
#include "llvm/Support/raw_ostream.h"
using namespace llvm;
@@ -36,7 +38,47 @@ MCFragment *llvm::HexagonMCExpr::findAssociatedFragment() const {
return Expr->findAssociatedFragment();
}
-void HexagonMCExpr::fixELFSymbolsInTLSFixups(MCAssembler &Asm) const {}
+static void fixELFSymbolsInTLSFixupsImpl(const MCExpr *Expr, MCAssembler &Asm) {
+ switch (Expr->getKind()) {
+ case MCExpr::Target:
+ llvm_unreachable("Cannot handle nested target MCExpr");
+ break;
+ case MCExpr::Constant:
+ break;
+
+ case MCExpr::Binary: {
+ const MCBinaryExpr *be = cast<MCBinaryExpr>(Expr);
+ fixELFSymbolsInTLSFixupsImpl(be->getLHS(), Asm);
+ fixELFSymbolsInTLSFixupsImpl(be->getRHS(), Asm);
+ break;
+ }
+ case MCExpr::SymbolRef: {
+ const MCSymbolRefExpr &symRef = *cast<MCSymbolRefExpr>(Expr);
+ switch (symRef.getKind()) {
+ default:
+ return;
+ case MCSymbolRefExpr::VK_Hexagon_GD_GOT:
+ case MCSymbolRefExpr::VK_Hexagon_LD_GOT:
+ case MCSymbolRefExpr::VK_Hexagon_GD_PLT:
+ case MCSymbolRefExpr::VK_Hexagon_LD_PLT:
+ case MCSymbolRefExpr::VK_Hexagon_IE:
+ case MCSymbolRefExpr::VK_Hexagon_IE_GOT:
+ case MCSymbolRefExpr::VK_TPREL:
+ break;
+ }
+ cast<MCSymbolELF>(symRef.getSymbol()).setType(ELF::STT_TLS);
+ break;
+ }
+ case MCExpr::Unary:
+ fixELFSymbolsInTLSFixupsImpl(cast<MCUnaryExpr>(Expr)->getSubExpr(), Asm);
+ break;
+ }
+}
+
+void HexagonMCExpr::fixELFSymbolsInTLSFixups(MCAssembler &Asm) const {
+ auto expr = getExpr();
+ fixELFSymbolsInTLSFixupsImpl(expr, Asm);
+}
MCExpr const *HexagonMCExpr::getExpr() const { return Expr; }
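
Rather than ignoring TLS fixups, the expression wrapper now walks its wrapped MCExpr recursively and forces STT_TLS on every symbol referenced through a TLS variant kind, so TLS references buried inside unary or binary expressions are typed correctly. A toy sketch of the same recursive visit (the types are illustrative, not LLVM's):

    #include <memory>
    #include <vector>

    // Minimal expression tree: SymbolRef leaves may carry a TLS variant
    // kind and a pointer standing in for MCSymbolELF::setType(STT_TLS).
    struct Expr {
      enum Kind { Constant, SymbolRef, Unary, Binary } K = Constant;
      bool TLSVariantKind = false;
      bool *SymbolIsTLS = nullptr;
      std::vector<std::unique_ptr<Expr>> Ops; // 1 op if Unary, 2 if Binary
    };

    void markTLS(const Expr &E) {
      if (E.K == Expr::SymbolRef) {
        if (E.TLSVariantKind && E.SymbolIsTLS)
          *E.SymbolIsTLS = true; // setType(ELF::STT_TLS)
        return;
      }
      for (const auto &Op : E.Ops)
        markTLS(*Op);
    }
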
@@ -75,4 +117,4 @@ void HexagonMCExpr::setSignMismatch(bool Val) {
bool HexagonMCExpr::signMismatch() const {
return SignMismatch;
-} \ No newline at end of file
+}
diff --git a/lib/Target/Hexagon/MCTargetDesc/HexagonMCInstrInfo.cpp b/lib/Target/Hexagon/MCTargetDesc/HexagonMCInstrInfo.cpp
index e627f026c8ad..553ffba508a1 100644
--- a/lib/Target/Hexagon/MCTargetDesc/HexagonMCInstrInfo.cpp
+++ b/lib/Target/Hexagon/MCTargetDesc/HexagonMCInstrInfo.cpp
@@ -16,10 +16,9 @@
#include "Hexagon.h"
#include "HexagonBaseInfo.h"
#include "HexagonMCChecker.h"
-
#include "llvm/MC/MCContext.h"
-#include "llvm/MC/MCExpr.h"
#include "llvm/MC/MCInstrInfo.h"
+#include "llvm/MC/MCInstrItineraries.h"
#include "llvm/MC/MCSubtargetInfo.h"
namespace llvm {
@@ -59,31 +58,36 @@ bool HexagonMCInstrInfo::canonicalizePacket(MCInstrInfo const &MCII,
MCSubtargetInfo const &STI,
MCContext &Context, MCInst &MCB,
HexagonMCChecker *Check) {
- // Examine the packet and convert pairs of instructions to compound
- // instructions when possible.
- if (!HexagonDisableCompound)
- HexagonMCInstrInfo::tryCompound(MCII, Context, MCB);
// Check the bundle for errors.
- bool CheckOk = Check ? Check->check() : true;
+ bool CheckOk = Check ? Check->check(false) : true;
if (!CheckOk)
return false;
- HexagonMCShuffle(MCII, STI, MCB);
+ // Examine the packet and convert pairs of instructions to compound
+ // instructions when possible.
+ if (!HexagonDisableCompound)
+ HexagonMCInstrInfo::tryCompound(MCII, STI, Context, MCB);
+ HexagonMCShuffle(false, MCII, STI, MCB);
// Examine the packet and convert pairs of instructions to duplex
// instructions when possible.
MCInst InstBundlePreDuplex = MCInst(MCB);
if (!HexagonDisableDuplex) {
SmallVector<DuplexCandidate, 8> possibleDuplexes;
- possibleDuplexes = HexagonMCInstrInfo::getDuplexPossibilties(MCII, MCB);
+ possibleDuplexes =
+ HexagonMCInstrInfo::getDuplexPossibilties(MCII, STI, MCB);
HexagonMCShuffle(MCII, STI, Context, MCB, possibleDuplexes);
}
// Examines packet and pad the packet, if needed, when an
// end-loop is in the bundle.
- HexagonMCInstrInfo::padEndloop(Context, MCB);
+ HexagonMCInstrInfo::padEndloop(MCB, Context);
// If compounding and duplexing didn't reduce the size below
// 4 or less we have a packet that is too big.
if (HexagonMCInstrInfo::bundleSize(MCB) > HEXAGON_PACKET_SIZE)
return false;
- HexagonMCShuffle(MCII, STI, MCB);
+ // Check the bundle for errors.
+ CheckOk = Check ? Check->check(true) : true;
+ if (!CheckOk)
+ return false;
+ HexagonMCShuffle(true, MCII, STI, MCB);
return true;
}
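
The canonicalization order changes: a partial checker pass (check(false)) runs before compounding, then shuffling, duplexing, and endloop padding, followed by a full pass (check(true), which includes the new checkSize) and a final shuffle. A hedged sketch of a call site — the wrapper and its name are assumptions, not part of the patch:

    #include "MCTargetDesc/HexagonMCChecker.h"
    #include "MCTargetDesc/HexagonMCInstrInfo.h"

    // canonicalizePacket drives both checker phases internally; false
    // means a check or shuffle rejected the bundle and the checker has
    // queued the corresponding errors.
    static bool finishBundle(llvm::MCInstrInfo const &MCII,
                             llvm::MCSubtargetInfo const &STI,
                             llvm::MCContext &Ctx, llvm::MCInst &Bundle,
                             llvm::HexagonMCChecker &Checker) {
      return llvm::HexagonMCInstrInfo::canonicalizePacket(MCII, STI, Ctx,
                                                          Bundle, &Checker);
    }
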
@@ -111,32 +115,14 @@ MCInst HexagonMCInstrInfo::createBundle() {
return Result;
}
-MCInst *HexagonMCInstrInfo::deriveDuplex(MCContext &Context, unsigned iClass,
- MCInst const &inst0,
- MCInst const &inst1) {
- assert((iClass <= 0xf) && "iClass must have range of 0 to 0xf");
- MCInst *duplexInst = new (Context) MCInst;
- duplexInst->setOpcode(Hexagon::DuplexIClass0 + iClass);
-
- MCInst *SubInst0 = new (Context) MCInst(deriveSubInst(inst0));
- MCInst *SubInst1 = new (Context) MCInst(deriveSubInst(inst1));
- duplexInst->addOperand(MCOperand::createInst(SubInst0));
- duplexInst->addOperand(MCOperand::createInst(SubInst1));
- return duplexInst;
-}
-
MCInst HexagonMCInstrInfo::deriveExtender(MCInstrInfo const &MCII,
MCInst const &Inst,
MCOperand const &MO) {
assert(HexagonMCInstrInfo::isExtendable(MCII, Inst) ||
HexagonMCInstrInfo::isExtended(MCII, Inst));
- MCInstrDesc const &Desc = HexagonMCInstrInfo::getDesc(MCII, Inst);
MCInst XMI;
- XMI.setOpcode((Desc.isBranch() || Desc.isCall() ||
- HexagonMCInstrInfo::getType(MCII, Inst) == HexagonII::TypeCR)
- ? Hexagon::A4_ext_b
- : Hexagon::A4_ext);
+ XMI.setOpcode(Hexagon::A4_ext);
if (MO.isImm())
XMI.addOperand(MCOperand::createImm(MO.getImm() & (~0x3f)));
else if (MO.isExpr())
@@ -146,6 +132,20 @@ MCInst HexagonMCInstrInfo::deriveExtender(MCInstrInfo const &MCII,
return XMI;
}
+MCInst *HexagonMCInstrInfo::deriveDuplex(MCContext &Context, unsigned iClass,
+ MCInst const &inst0,
+ MCInst const &inst1) {
+ assert((iClass <= 0xf) && "iClass must have range of 0 to 0xf");
+ MCInst *duplexInst = new (Context) MCInst;
+ duplexInst->setOpcode(Hexagon::DuplexIClass0 + iClass);
+
+ MCInst *SubInst0 = new (Context) MCInst(deriveSubInst(inst0));
+ MCInst *SubInst1 = new (Context) MCInst(deriveSubInst(inst1));
+ duplexInst->addOperand(MCOperand::createInst(SubInst0));
+ duplexInst->addOperand(MCOperand::createInst(SubInst1));
+ return duplexInst;
+}
+
MCInst const *HexagonMCInstrInfo::extenderForIndex(MCInst const &MCB,
size_t Index) {
assert(Index <= bundleSize(MCB));
@@ -173,22 +173,9 @@ HexagonMCInstrInfo::getAccessSize(MCInstrInfo const &MCII, MCInst const &MCI) {
HexagonII::MemAccesSizeMask));
}
-unsigned HexagonMCInstrInfo::getBitCount(MCInstrInfo const &MCII,
- MCInst const &MCI) {
- uint64_t const F = HexagonMCInstrInfo::getDesc(MCII, MCI).TSFlags;
- return ((F >> HexagonII::ExtentBitsPos) & HexagonII::ExtentBitsMask);
-}
-
-// Return constant extended operand number.
-unsigned short HexagonMCInstrInfo::getCExtOpNum(MCInstrInfo const &MCII,
- MCInst const &MCI) {
- const uint64_t F = HexagonMCInstrInfo::getDesc(MCII, MCI).TSFlags;
- return ((F >> HexagonII::ExtendableOpPos) & HexagonII::ExtendableOpMask);
-}
-
MCInstrDesc const &HexagonMCInstrInfo::getDesc(MCInstrInfo const &MCII,
MCInst const &MCI) {
- return (MCII.get(MCI.getOpcode()));
+ return MCII.get(MCI.getOpcode());
}
unsigned HexagonMCInstrInfo::getDuplexRegisterNumbering(unsigned Reg) {
@@ -276,34 +263,32 @@ unsigned HexagonMCInstrInfo::getExtentBits(MCInstrInfo const &MCII,
return ((F >> HexagonII::ExtentBitsPos) & HexagonII::ExtentBitsMask);
}
-// Return the max value that a constant extendable operand can have
-// without being extended.
+/// Return the maximum value of an extendable operand.
int HexagonMCInstrInfo::getMaxValue(MCInstrInfo const &MCII,
MCInst const &MCI) {
- uint64_t const F = HexagonMCInstrInfo::getDesc(MCII, MCI).TSFlags;
- unsigned isSigned =
- (F >> HexagonII::ExtentSignedPos) & HexagonII::ExtentSignedMask;
- unsigned bits = (F >> HexagonII::ExtentBitsPos) & HexagonII::ExtentBitsMask;
+ const uint64_t F = HexagonMCInstrInfo::getDesc(MCII, MCI).TSFlags;
+ bool S = (F >> HexagonII::ExtentSignedPos) & HexagonII::ExtentSignedMask;
- if (isSigned) // if value is signed
- return ~(-1U << (bits - 1));
- else
- return ~(-1U << bits);
+ assert(HexagonMCInstrInfo::isExtendable(MCII, MCI) ||
+ HexagonMCInstrInfo::isExtended(MCII, MCI));
+
+ if (S) // if value is signed
+ return (1 << (HexagonMCInstrInfo::getExtentBits(MCII, MCI) - 1)) - 1;
+ return (1 << HexagonMCInstrInfo::getExtentBits(MCII, MCI)) - 1;
}
-// Return the min value that a constant extendable operand can have
-// without being extended.
+/// Return the minimum value of an extendable operand.
int HexagonMCInstrInfo::getMinValue(MCInstrInfo const &MCII,
MCInst const &MCI) {
- uint64_t const F = HexagonMCInstrInfo::getDesc(MCII, MCI).TSFlags;
- unsigned isSigned =
- (F >> HexagonII::ExtentSignedPos) & HexagonII::ExtentSignedMask;
- unsigned bits = (F >> HexagonII::ExtentBitsPos) & HexagonII::ExtentBitsMask;
+ const uint64_t F = HexagonMCInstrInfo::getDesc(MCII, MCI).TSFlags;
+ bool S = (F >> HexagonII::ExtentSignedPos) & HexagonII::ExtentSignedMask;
- if (isSigned) // if value is signed
- return -1U << (bits - 1);
- else
- return 0;
+ assert(HexagonMCInstrInfo::isExtendable(MCII, MCI) ||
+ HexagonMCInstrInfo::isExtended(MCII, MCI));
+
+ if (S) // if value is signed
+ return -(1 << (HexagonMCInstrInfo::getExtentBits(MCII, MCI) - 1));
+ return 0;
}
StringRef HexagonMCInstrInfo::getName(MCInstrInfo const &MCII,
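
The rewritten getMaxValue/getMinValue return the natural range of an N-bit extent: [-(2^(N-1)), 2^(N-1)-1] when signed, [0, 2^N-1] when not; a value outside this range forces a constant extender, which is what isConstExtended tests further below. A minimal sketch with a worked case:

    // Range of an N-bit extendable field (N < 31 for these int shifts).
    int maxValue(unsigned Bits, bool Signed) {
      return Signed ? (1 << (Bits - 1)) - 1 : (1 << Bits) - 1;
    }
    int minValue(unsigned Bits, bool Signed) {
      return Signed ? -(1 << (Bits - 1)) : 0;
    }
    // Example: an 8-bit signed extent spans [-128, 127]; an out-of-range
    // immediate must be carried by an A4_ext extender word instead.
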
@@ -319,9 +304,7 @@ unsigned short HexagonMCInstrInfo::getNewValueOp(MCInstrInfo const &MCII,
MCOperand const &HexagonMCInstrInfo::getNewValueOperand(MCInstrInfo const &MCII,
MCInst const &MCI) {
- uint64_t const F = HexagonMCInstrInfo::getDesc(MCII, MCI).TSFlags;
- unsigned const O =
- (F >> HexagonII::NewValueOpPos) & HexagonII::NewValueOpMask;
+ unsigned O = HexagonMCInstrInfo::getNewValueOp(MCII, MCI);
MCOperand const &MCO = MCI.getOperand(O);
assert((HexagonMCInstrInfo::isNewValue(MCII, MCI) ||
@@ -349,6 +332,13 @@ HexagonMCInstrInfo::getNewValueOperand2(MCInstrInfo const &MCII,
return (MCO);
}
+/// Return the Hexagon ISA class for the insn.
+unsigned HexagonMCInstrInfo::getType(MCInstrInfo const &MCII,
+ MCInst const &MCI) {
+ const uint64_t F = MCII.get(MCI.getOpcode()).TSFlags;
+ return ((F >> HexagonII::TypePos) & HexagonII::TypeMask);
+}
+
int HexagonMCInstrInfo::getSubTarget(MCInstrInfo const &MCII,
MCInst const &MCI) {
const uint64_t F = HexagonMCInstrInfo::getDesc(MCII, MCI).TSFlags;
@@ -361,33 +351,55 @@ int HexagonMCInstrInfo::getSubTarget(MCInstrInfo const &MCII,
return Hexagon::ArchV4;
case HexagonII::HasV5SubT:
return Hexagon::ArchV5;
+ case HexagonII::HasV55SubT:
+ return Hexagon::ArchV55;
+ case HexagonII::HasV60SubT:
+ return Hexagon::ArchV60;
}
}
-// Return the Hexagon ISA class for the insn.
-unsigned HexagonMCInstrInfo::getType(MCInstrInfo const &MCII,
- MCInst const &MCI) {
- const uint64_t F = HexagonMCInstrInfo::getDesc(MCII, MCI).TSFlags;
-
- return ((F >> HexagonII::TypePos) & HexagonII::TypeMask);
-}
-
+/// Return the slots this instruction can execute out of.
unsigned HexagonMCInstrInfo::getUnits(MCInstrInfo const &MCII,
MCSubtargetInfo const &STI,
MCInst const &MCI) {
-
const InstrItinerary *II = STI.getSchedModel().InstrItineraries;
int SchedClass = HexagonMCInstrInfo::getDesc(MCII, MCI).getSchedClass();
return ((II[SchedClass].FirstStage + HexagonStages)->getUnits());
}
-bool HexagonMCInstrInfo::hasImmExt(MCInst const &MCI) {
+/// Return the slots this instruction consumes in addition to
+/// the slot(s) it can execute out of.
+
+unsigned HexagonMCInstrInfo::getOtherReservedSlots(MCInstrInfo const &MCII,
+ MCSubtargetInfo const &STI,
+ MCInst const &MCI) {
+ const InstrItinerary *II = STI.getSchedModel().InstrItineraries;
+ int SchedClass = HexagonMCInstrInfo::getDesc(MCII, MCI).getSchedClass();
+ unsigned Slots = 0;
+
+ // FirstStage are slots that this instruction can execute in.
+ // FirstStage+1 are slots that are also consumed by this instruction.
+ // For example: vmemu can only execute in slot 0 but also consumes slot 1.
+ for (unsigned Stage = II[SchedClass].FirstStage + 1;
+ Stage < II[SchedClass].LastStage; ++Stage) {
+ unsigned Units = (Stage + HexagonStages)->getUnits();
+ if (Units > HexagonGetLastSlot())
+ break;
+ // fyi: getUnits() will return 0x1, 0x2, 0x4 or 0x8
+ Slots |= Units;
+ }
+
+ // if 0 is returned, then no additional slots are consumed by this inst.
+ return Slots;
+}
+
+bool HexagonMCInstrInfo::hasDuplex(MCInstrInfo const &MCII, MCInst const &MCI) {
if (!HexagonMCInstrInfo::isBundle(MCI))
return false;
for (const auto &I : HexagonMCInstrInfo::bundleInstructions(MCI)) {
auto MI = I.getInst();
- if (isImmext(*MI))
+ if (HexagonMCInstrInfo::isDuplex(MCII, *MI))
return true;
}
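
Slot masks are one-hot: 0x1, 0x2, 0x4, 0x8 for slots 0 through 3. The first itinerary stage lists where an insn can execute; getOtherReservedSlots reads the following stages for slots the insn merely occupies — e.g. vmemu executes in slot 0 yet also consumes slot 1. A sketch over a plain vector of stage masks standing in for the itinerary:

    #include <cstddef>
    #include <cstdint>
    #include <vector>

    uint32_t otherReservedSlots(const std::vector<uint32_t> &StageUnits) {
      uint32_t Slots = 0;
      // Stage 0 holds the executable slots; later stages only reserve.
      for (std::size_t I = 1; I < StageUnits.size(); ++I) {
        if (StageUnits[I] > 0x8)
          break;                // past slot 3: a non-slot resource
        Slots |= StageUnits[I]; // one-hot slot bits: 0x1, 0x2, 0x4, 0x8
      }
      return Slots;
    }
    // Example: vmemu's stages {0x1, 0x2} -> executes in slot 0, and this
    // returns 0x2 for the additionally reserved slot 1.
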
@@ -398,7 +410,20 @@ bool HexagonMCInstrInfo::hasExtenderForIndex(MCInst const &MCB, size_t Index) {
return extenderForIndex(MCB, Index) != nullptr;
}
-// Return whether the instruction is a legal new-value producer.
+bool HexagonMCInstrInfo::hasImmExt(MCInst const &MCI) {
+ if (!HexagonMCInstrInfo::isBundle(MCI))
+ return false;
+
+ for (const auto &I : HexagonMCInstrInfo::bundleInstructions(MCI)) {
+ auto MI = I.getInst();
+ if (isImmext(*MI))
+ return true;
+ }
+
+ return false;
+}
+
+/// Return whether the insn produces a new value.
bool HexagonMCInstrInfo::hasNewValue(MCInstrInfo const &MCII,
MCInst const &MCI) {
const uint64_t F = HexagonMCInstrInfo::getDesc(MCII, MCI).TSFlags;
@@ -418,46 +443,19 @@ MCInst const &HexagonMCInstrInfo::instruction(MCInst const &MCB, size_t Index) {
return *MCB.getOperand(bundleInstructionsOffset + Index).getInst();
}
+/// Return whether the instruction is an accumulator.
+bool HexagonMCInstrInfo::isAccumulator(MCInstrInfo const &MCII,
+ MCInst const &MCI) {
+ const uint64_t F = HexagonMCInstrInfo::getDesc(MCII, MCI).TSFlags;
+ return ((F >> HexagonII::AccumulatorPos) & HexagonII::AccumulatorMask);
+}
+
bool HexagonMCInstrInfo::isBundle(MCInst const &MCI) {
auto Result = Hexagon::BUNDLE == MCI.getOpcode();
assert(!Result || (MCI.size() > 0 && MCI.getOperand(0).isImm()));
return Result;
}
-// Return whether the insn is an actual insn.
-bool HexagonMCInstrInfo::isCanon(MCInstrInfo const &MCII, MCInst const &MCI) {
- return (!HexagonMCInstrInfo::getDesc(MCII, MCI).isPseudo() &&
- !HexagonMCInstrInfo::isPrefix(MCII, MCI) &&
- HexagonMCInstrInfo::getType(MCII, MCI) != HexagonII::TypeENDLOOP);
-}
-
-bool HexagonMCInstrInfo::isCofMax1(MCInstrInfo const &MCII, MCInst const &MCI) {
- const uint64_t F = HexagonMCInstrInfo::getDesc(MCII, MCI).TSFlags;
- return ((F >> HexagonII::CofMax1Pos) & HexagonII::CofMax1Mask);
-}
-
-bool HexagonMCInstrInfo::isCompound(MCInstrInfo const &MCII,
- MCInst const &MCI) {
- return (getType(MCII, MCI) == HexagonII::TypeCOMPOUND);
-}
-
-bool HexagonMCInstrInfo::isDblRegForSubInst(unsigned Reg) {
- return ((Reg >= Hexagon::D0 && Reg <= Hexagon::D3) ||
- (Reg >= Hexagon::D8 && Reg <= Hexagon::D11));
-}
-
-bool HexagonMCInstrInfo::isDuplex(MCInstrInfo const &MCII, MCInst const &MCI) {
- return HexagonII::TypeDUPLEX == HexagonMCInstrInfo::getType(MCII, MCI);
-}
-
-// Return whether the instruction needs to be constant extended.
-// 1) Always return true if the instruction has 'isExtended' flag set.
-//
-// isExtendable:
-// 2) For immediate extended operands, return true only if the value is
-// out-of-range.
-// 3) For global address, always return true.
-
bool HexagonMCInstrInfo::isConstExtended(MCInstrInfo const &MCII,
MCInst const &MCI) {
if (HexagonMCInstrInfo::isExtended(MCII, MCI))
@@ -470,9 +468,9 @@ bool HexagonMCInstrInfo::isConstExtended(MCInstrInfo const &MCII,
return true;
// Branch insns are handled as necessary by relaxation.
if ((HexagonMCInstrInfo::getType(MCII, MCI) == HexagonII::TypeJ) ||
- (HexagonMCInstrInfo::getType(MCII, MCI) == HexagonII::TypeCOMPOUND &&
+ (HexagonMCInstrInfo::getType(MCII, MCI) == HexagonII::TypeCJ &&
HexagonMCInstrInfo::getDesc(MCII, MCI).isBranch()) ||
- (HexagonMCInstrInfo::getType(MCII, MCI) == HexagonII::TypeNV &&
+ (HexagonMCInstrInfo::getType(MCII, MCI) == HexagonII::TypeNCJ &&
HexagonMCInstrInfo::getDesc(MCII, MCI).isBranch()))
return false;
// Otherwise loop instructions and other CR insts are handled by relaxation
@@ -492,6 +490,30 @@ bool HexagonMCInstrInfo::isConstExtended(MCInstrInfo const &MCII,
return (MinValue > Value || Value > MaxValue);
}
+bool HexagonMCInstrInfo::isCanon(MCInstrInfo const &MCII, MCInst const &MCI) {
+ return !HexagonMCInstrInfo::getDesc(MCII, MCI).isPseudo() &&
+ !HexagonMCInstrInfo::isPrefix(MCII, MCI);
+}
+
+bool HexagonMCInstrInfo::isCofMax1(MCInstrInfo const &MCII, MCInst const &MCI) {
+ const uint64_t F = HexagonMCInstrInfo::getDesc(MCII, MCI).TSFlags;
+ return ((F >> HexagonII::CofMax1Pos) & HexagonII::CofMax1Mask);
+}
+
+bool HexagonMCInstrInfo::isCompound(MCInstrInfo const &MCII,
+ MCInst const &MCI) {
+ return (getType(MCII, MCI) == HexagonII::TypeCJ);
+}
+
+bool HexagonMCInstrInfo::isDblRegForSubInst(unsigned Reg) {
+ return ((Reg >= Hexagon::D0 && Reg <= Hexagon::D3) ||
+ (Reg >= Hexagon::D8 && Reg <= Hexagon::D11));
+}
+
+bool HexagonMCInstrInfo::isDuplex(MCInstrInfo const &MCII, MCInst const &MCI) {
+ return HexagonII::TypeDUPLEX == HexagonMCInstrInfo::getType(MCII, MCI);
+}
+
bool HexagonMCInstrInfo::isExtendable(MCInstrInfo const &MCII,
MCInst const &MCI) {
uint64_t const F = HexagonMCInstrInfo::getDesc(MCII, MCI).TSFlags;
@@ -510,9 +532,7 @@ bool HexagonMCInstrInfo::isFloat(MCInstrInfo const &MCII, MCInst const &MCI) {
}
bool HexagonMCInstrInfo::isImmext(MCInst const &MCI) {
- auto Op = MCI.getOpcode();
- return (Op == Hexagon::A4_ext_b || Op == Hexagon::A4_ext_c ||
- Op == Hexagon::A4_ext_g || Op == Hexagon::A4_ext);
+ return MCI.getOpcode() == Hexagon::A4_ext;
}
bool HexagonMCInstrInfo::isInnerLoop(MCInst const &MCI) {
@@ -530,20 +550,17 @@ bool HexagonMCInstrInfo::isIntRegForSubInst(unsigned Reg) {
(Reg >= Hexagon::R16 && Reg <= Hexagon::R23));
}
-// Return whether the insn is a new-value consumer.
+/// Return whether the insn expects a newly produced value.
bool HexagonMCInstrInfo::isNewValue(MCInstrInfo const &MCII,
MCInst const &MCI) {
const uint64_t F = HexagonMCInstrInfo::getDesc(MCII, MCI).TSFlags;
return ((F >> HexagonII::NewValuePos) & HexagonII::NewValueMask);
}
-// Return whether the operand can be constant extended.
-bool HexagonMCInstrInfo::isOperandExtended(MCInstrInfo const &MCII,
- MCInst const &MCI,
- unsigned short OperandNum) {
- uint64_t const F = HexagonMCInstrInfo::getDesc(MCII, MCI).TSFlags;
- return ((F >> HexagonII::ExtendableOpPos) & HexagonII::ExtendableOpMask) ==
- OperandNum;
+/// Return whether the operand is extendable.
+bool HexagonMCInstrInfo::isOpExtendable(MCInstrInfo const &MCII,
+ MCInst const &MCI, unsigned short O) {
+ return (O == HexagonMCInstrInfo::getExtendableOp(MCII, MCI));
}
bool HexagonMCInstrInfo::isOuterLoop(MCInst const &MCI) {
@@ -558,6 +575,10 @@ bool HexagonMCInstrInfo::isPredicated(MCInstrInfo const &MCII,
return ((F >> HexagonII::PredicatedPos) & HexagonII::PredicatedMask);
}
+bool HexagonMCInstrInfo::isPrefix(MCInstrInfo const &MCII, MCInst const &MCI) {
+ return HexagonII::TypeEXTENDER == HexagonMCInstrInfo::getType(MCII, MCI);
+}
+
bool HexagonMCInstrInfo::isPredicateLate(MCInstrInfo const &MCII,
MCInst const &MCI) {
const uint64_t F = HexagonMCInstrInfo::getDesc(MCII, MCI).TSFlags;
@@ -582,12 +603,22 @@ bool HexagonMCInstrInfo::isPredReg(unsigned Reg) {
return (Reg >= Hexagon::P0 && Reg <= Hexagon::P3_0);
}
-bool HexagonMCInstrInfo::isPrefix(MCInstrInfo const &MCII, MCInst const &MCI) {
- return (HexagonMCInstrInfo::getType(MCII, MCI) == HexagonII::TypePREFIX);
+/// Return whether the insn can be packaged only with A and X-type insns.
+bool HexagonMCInstrInfo::isSoloAX(MCInstrInfo const &MCII, MCInst const &MCI) {
+ const uint64_t F = HexagonMCInstrInfo::getDesc(MCII, MCI).TSFlags;
+ return ((F >> HexagonII::SoloAXPos) & HexagonII::SoloAXMask);
}
-bool HexagonMCInstrInfo::isSolo(MCInstrInfo const &MCII, MCInst const &MCI) {
+/// Return whether the insn can be packaged only with an A-type insn in slot #1.
+bool HexagonMCInstrInfo::isSoloAin1(MCInstrInfo const &MCII,
+ MCInst const &MCI) {
const uint64_t F = HexagonMCInstrInfo::getDesc(MCII, MCI).TSFlags;
+ return ((F >> HexagonII::SoloAin1Pos) & HexagonII::SoloAin1Mask);
+}
+
+/// Return whether the insn is solo, i.e., cannot be in a packet.
+bool HexagonMCInstrInfo::isSolo(MCInstrInfo const &MCII, MCInst const &MCI) {
+ const uint64_t F = MCII.get(MCI.getOpcode()).TSFlags;
return ((F >> HexagonII::SoloPos) & HexagonII::SoloMask);
}
@@ -663,17 +694,6 @@ bool HexagonMCInstrInfo::isSubInstruction(MCInst const &MCI) {
}
}
-bool HexagonMCInstrInfo::isSoloAX(MCInstrInfo const &MCII, MCInst const &MCI) {
- const uint64_t F = HexagonMCInstrInfo::getDesc(MCII, MCI).TSFlags;
- return ((F >> HexagonII::SoloAXPos) & HexagonII::SoloAXMask);
-}
-
-bool HexagonMCInstrInfo::isSoloAin1(MCInstrInfo const &MCII,
- MCInst const &MCI) {
- const uint64_t F = HexagonMCInstrInfo::getDesc(MCII, MCI).TSFlags;
- return ((F >> HexagonII::SoloAin1Pos) & HexagonII::SoloAin1Mask);
-}
-
bool HexagonMCInstrInfo::isVector(MCInstrInfo const &MCII, MCInst const &MCI) {
if ((getType(MCII, MCI) <= HexagonII::TypeCVI_LAST) &&
(getType(MCII, MCI) >= HexagonII::TypeCVI_FIRST))
@@ -705,16 +725,26 @@ bool HexagonMCInstrInfo::mustExtend(MCExpr const &Expr) {
return HExpr.mustExtend();
}
void HexagonMCInstrInfo::setMustNotExtend(MCExpr const &Expr, bool Val) {
- HexagonMCExpr &HExpr =
- const_cast<HexagonMCExpr &>(cast<HexagonMCExpr>(Expr));
+ HexagonMCExpr &HExpr = const_cast<HexagonMCExpr &>(cast<HexagonMCExpr>(Expr));
HExpr.setMustNotExtend(Val);
}
bool HexagonMCInstrInfo::mustNotExtend(MCExpr const &Expr) {
HexagonMCExpr const &HExpr = cast<HexagonMCExpr>(Expr);
return HExpr.mustNotExtend();
}
+void HexagonMCInstrInfo::setS23_2_reloc(MCExpr const &Expr, bool Val) {
+ HexagonMCExpr &HExpr =
+ const_cast<HexagonMCExpr &>(*llvm::cast<HexagonMCExpr>(&Expr));
+ HExpr.setS23_2_reloc(Val);
+}
+bool HexagonMCInstrInfo::s23_2_reloc(MCExpr const &Expr) {
+ HexagonMCExpr const *HExpr = llvm::dyn_cast<HexagonMCExpr>(&Expr);
+ if (!HExpr)
+ return false;
+ return HExpr->s23_2_reloc();
+}
-void HexagonMCInstrInfo::padEndloop(MCContext &Context, MCInst &MCB) {
+void HexagonMCInstrInfo::padEndloop(MCInst &MCB, MCContext &Context) {
MCInst Nop;
Nop.setOpcode(Hexagon::A2_nop);
assert(isBundle(MCB));
@@ -727,22 +757,8 @@ void HexagonMCInstrInfo::padEndloop(MCContext &Context, MCInst &MCB) {
bool HexagonMCInstrInfo::prefersSlot3(MCInstrInfo const &MCII,
MCInst const &MCI) {
- if (HexagonMCInstrInfo::getType(MCII, MCI) == HexagonII::TypeCR)
- return false;
-
- unsigned SchedClass = HexagonMCInstrInfo::getDesc(MCII, MCI).getSchedClass();
- switch (SchedClass) {
- case Hexagon::Sched::ALU32_3op_tc_2_SLOT0123:
- case Hexagon::Sched::ALU64_tc_2_SLOT23:
- case Hexagon::Sched::ALU64_tc_3x_SLOT23:
- case Hexagon::Sched::M_tc_2_SLOT23:
- case Hexagon::Sched::M_tc_3x_SLOT23:
- case Hexagon::Sched::S_2op_tc_2_SLOT23:
- case Hexagon::Sched::S_3op_tc_2_SLOT23:
- case Hexagon::Sched::S_3op_tc_3x_SLOT23:
- return true;
- }
- return false;
+ const uint64_t F = HexagonMCInstrInfo::getDesc(MCII, MCI).TSFlags;
+ return (F >> HexagonII::PrefersSlot3Pos) & HexagonII::PrefersSlot3Mask;
}
void HexagonMCInstrInfo::replaceDuplex(MCContext &Context, MCInst &MCB,
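
prefersSlot3 now reads a dedicated TSFlags bit instead of enumerating scheduling classes, the same (F >> Pos) & Mask accessor pattern used throughout this file. A sketch of the pattern; the bit position below is an invented placeholder, not the real HexagonII value:

    #include <cstdint>

    // Each per-opcode property lives at a fixed position in the 64-bit
    // TSFlags word generated by TableGen.
    constexpr unsigned PrefersSlot3Pos = 57; // placeholder position
    constexpr uint64_t PrefersSlot3Mask = 1;

    bool prefersSlot3(uint64_t TSFlags) {
      return (TSFlags >> PrefersSlot3Pos) & PrefersSlot3Mask;
    }
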
@@ -778,15 +794,6 @@ void HexagonMCInstrInfo::setMemStoreReorderEnabled(MCInst &MCI) {
Operand.setImm(Operand.getImm() | memStoreReorderEnabledMask);
assert(isMemStoreReorderEnabled(MCI));
}
-void HexagonMCInstrInfo::setS23_2_reloc(MCExpr const &Expr, bool Val) {
- HexagonMCExpr &HExpr =
- const_cast<HexagonMCExpr &>(*llvm::cast<HexagonMCExpr>(&Expr));
- HExpr.setS23_2_reloc(Val);
-}
-bool HexagonMCInstrInfo::s23_2_reloc(MCExpr const &Expr) {
- HexagonMCExpr const &HExpr = *llvm::cast<HexagonMCExpr>(&Expr);
- return HExpr.s23_2_reloc();
-}
void HexagonMCInstrInfo::setOuterLoop(MCInst &MCI) {
assert(isBundle(MCI));
diff --git a/lib/Target/Hexagon/MCTargetDesc/HexagonMCInstrInfo.h b/lib/Target/Hexagon/MCTargetDesc/HexagonMCInstrInfo.h
index d701c3ade69e..2e989adb5ccb 100644
--- a/lib/Target/Hexagon/MCTargetDesc/HexagonMCInstrInfo.h
+++ b/lib/Target/Hexagon/MCTargetDesc/HexagonMCInstrInfo.h
@@ -19,11 +19,8 @@
namespace llvm {
class HexagonMCChecker;
-class MCContext;
class MCInstrDesc;
class MCInstrInfo;
-class MCInst;
-class MCOperand;
class MCSubtargetInfo;
namespace HexagonII {
enum class MemAccessSize;
@@ -67,16 +64,6 @@ bool canonicalizePacket(MCInstrInfo const &MCII, MCSubtargetInfo const &STI,
MCContext &Context, MCInst &MCB,
HexagonMCChecker *Checker);
-// Clamp off upper 26 bits of extendable operand for emission
-void clampExtended(MCInstrInfo const &MCII, MCContext &Context, MCInst &MCI);
-
-MCInst createBundle();
-
-// Return the extender for instruction at Index or nullptr if none
-MCInst const *extenderForIndex(MCInst const &MCB, size_t Index);
-void extendIfNeeded(MCContext &Context, MCInstrInfo const &MCII, MCInst &MCB,
- MCInst const &MCI);
-
// Create a duplex instruction given the two subinsts
MCInst *deriveDuplex(MCContext &Context, unsigned iClass, MCInst const &inst0,
MCInst const &inst1);
@@ -86,27 +73,28 @@ MCInst deriveExtender(MCInstrInfo const &MCII, MCInst const &Inst,
// Convert this instruction in to a duplex subinst
MCInst deriveSubInst(MCInst const &Inst);
+// Clamp off upper 26 bits of extendable operand for emission
+void clampExtended(MCInstrInfo const &MCII, MCContext &Context, MCInst &MCI);
+
+MCInst createBundle();
+
// Return the extender for instruction at Index or nullptr if none
MCInst const *extenderForIndex(MCInst const &MCB, size_t Index);
+void extendIfNeeded(MCContext &Context, MCInstrInfo const &MCII, MCInst &MCB,
+ MCInst const &MCI);
// Return memory access size
HexagonII::MemAccessSize getAccessSize(MCInstrInfo const &MCII,
MCInst const &MCI);
-
-// Return number of bits in the constant extended operand.
-unsigned getBitCount(MCInstrInfo const &MCII, MCInst const &MCI);
-
-// Return constant extended operand number.
-unsigned short getCExtOpNum(MCInstrInfo const &MCII, MCInst const &MCI);
-
MCInstrDesc const &getDesc(MCInstrInfo const &MCII, MCInst const &MCI);
// Return which duplex group this instruction belongs to
unsigned getDuplexCandidateGroup(MCInst const &MI);
// Return a list of all possible instruction duplex combinations
-SmallVector<DuplexCandidate, 8> getDuplexPossibilties(MCInstrInfo const &MCII,
- MCInst const &MCB);
+SmallVector<DuplexCandidate, 8>
+getDuplexPossibilties(MCInstrInfo const &MCII, MCSubtargetInfo const &STI,
+ MCInst const &MCB);
unsigned getDuplexRegisterNumbering(unsigned Reg);
MCExpr const &getExpr(MCExpr const &Expr);
@@ -143,7 +131,6 @@ MCOperand const &getNewValueOperand(MCInstrInfo const &MCII, MCInst const &MCI);
unsigned short getNewValueOp2(MCInstrInfo const &MCII, MCInst const &MCI);
MCOperand const &getNewValueOperand2(MCInstrInfo const &MCII,
MCInst const &MCI);
-
int getSubTarget(MCInstrInfo const &MCII, MCInst const &MCI);
// Return the Hexagon ISA class for the insn.
@@ -152,6 +139,9 @@ unsigned getType(MCInstrInfo const &MCII, MCInst const &MCI);
/// Return the slots used by the insn.
unsigned getUnits(MCInstrInfo const &MCII, MCSubtargetInfo const &STI,
MCInst const &MCI);
+unsigned getOtherReservedSlots(MCInstrInfo const &MCII,
+ MCSubtargetInfo const &STI, MCInst const &MCI);
+bool hasDuplex(MCInstrInfo const &MCII, MCInst const &MCI);
// Does the packet have an extender for the instruction at Index
bool hasExtenderForIndex(MCInst const &MCB, size_t Index);
@@ -161,19 +151,6 @@ bool hasImmExt(MCInst const &MCI);
// Return whether the instruction is a legal new-value producer.
bool hasNewValue(MCInstrInfo const &MCII, MCInst const &MCI);
bool hasNewValue2(MCInstrInfo const &MCII, MCInst const &MCI);
-
-// Return the instruction at Index
-MCInst const &instruction(MCInst const &MCB, size_t Index);
-
-// Returns whether this MCInst is a wellformed bundle
-bool isBundle(MCInst const &MCI);
-
-// Return whether the insn is an actual insn.
-bool isCanon(MCInstrInfo const &MCII, MCInst const &MCI);
-bool isCofMax1(MCInstrInfo const &MCII, MCInst const &MCI);
-bool isCompound(MCInstrInfo const &MCII, MCInst const &MCI);
-
-// Return the duplex iclass given the two duplex classes
unsigned iClassOfDuplexPair(unsigned Ga, unsigned Gb);
int64_t minConstant(MCInst const &MCI, size_t Index);
@@ -189,6 +166,18 @@ template <unsigned N> bool inRange(MCInst const &MCI, size_t Index) {
return isUInt<N>(minConstant(MCI, Index));
}
+// Return the instruction at Index
+MCInst const &instruction(MCInst const &MCB, size_t Index);
+bool isAccumulator(MCInstrInfo const &MCII, MCInst const &MCI);
+
+// Returns whether this MCInst is a wellformed bundle
+bool isBundle(MCInst const &MCI);
+
+// Return whether the insn is an actual insn.
+bool isCanon(MCInstrInfo const &MCII, MCInst const &MCI);
+bool isCofMax1(MCInstrInfo const &MCII, MCInst const &MCI);
+bool isCompound(MCInstrInfo const &MCII, MCInst const &MCI);
+
// Return whether the instruction needs to be constant extended.
bool isConstExtended(MCInstrInfo const &MCII, MCInst const &MCI);
@@ -229,15 +218,12 @@ bool isMemStoreReorderEnabled(MCInst const &MCI);
// Return whether the insn is a new-value consumer.
bool isNewValue(MCInstrInfo const &MCII, MCInst const &MCI);
-
-// Return true if the operand can be constant extended.
-bool isOperandExtended(MCInstrInfo const &MCII, MCInst const &MCI,
- unsigned short OperandNum);
+bool isOpExtendable(MCInstrInfo const &MCII, MCInst const &MCI, unsigned short);
// Can these two instructions be duplexed
bool isOrderedDuplexPair(MCInstrInfo const &MCII, MCInst const &MIa,
bool ExtendedA, MCInst const &MIb, bool ExtendedB,
- bool bisReversable);
+ bool bisReversable, MCSubtargetInfo const &STI);
// Returns whether this bundle is an endloop1
bool isOuterLoop(MCInst const &MCI);
@@ -270,12 +256,11 @@ bool mustExtend(MCExpr const &Expr);
bool mustNotExtend(MCExpr const &Expr);
// Pad the bundle with nops to satisfy endloop requirements
-void padEndloop(MCContext &Context, MCInst &MCI);
-
+void padEndloop(MCInst &MCI, MCContext &Context);
bool prefersSlot3(MCInstrInfo const &MCII, MCInst const &MCI);
// Replace the instructions inside MCB, represented by Candidate
-void replaceDuplex(MCContext &Context, MCInst &MCB, DuplexCandidate Candidate);
+void replaceDuplex(MCContext &Context, MCInst &MCI, DuplexCandidate Candidate);
bool s23_2_reloc(MCExpr const &Expr);
// Marks a bundle as endloop0
@@ -295,7 +280,8 @@ unsigned SubregisterBit(unsigned Consumer, unsigned Producer,
unsigned Producer2);
// Attempt to find and replace compound pairs
-void tryCompound(MCInstrInfo const &MCII, MCContext &Context, MCInst &MCI);
+void tryCompound(MCInstrInfo const &MCII, MCSubtargetInfo const &STI,
+ MCContext &Context, MCInst &MCI);
}
}
diff --git a/lib/Target/Hexagon/MCTargetDesc/HexagonMCShuffler.cpp b/lib/Target/Hexagon/MCTargetDesc/HexagonMCShuffler.cpp
index 7f8e7a4edb0c..529a5fd5ed82 100644
--- a/lib/Target/Hexagon/MCTargetDesc/HexagonMCShuffler.cpp
+++ b/lib/Target/Hexagon/MCTargetDesc/HexagonMCShuffler.cpp
@@ -33,42 +33,39 @@ void HexagonMCShuffler::init(MCInst &MCB) {
MCInst const *Extender = nullptr;
// Copy the bundle for the shuffling.
for (const auto &I : HexagonMCInstrInfo::bundleInstructions(MCB)) {
- assert(!HexagonMCInstrInfo::getDesc(MCII, *I.getInst()).isPseudo());
- MCInst *MI = const_cast<MCInst *>(I.getInst());
+ MCInst &MI = *const_cast<MCInst *>(I.getInst());
+ DEBUG(dbgs() << "Shuffling: " << MCII.getName(MI.getOpcode()) << '\n');
+ assert(!HexagonMCInstrInfo::getDesc(MCII, MI).isPseudo());
- if (!HexagonMCInstrInfo::isImmext(*MI)) {
- append(MI, Extender, HexagonMCInstrInfo::getUnits(MCII, STI, *MI),
- false);
+ if (!HexagonMCInstrInfo::isImmext(MI)) {
+ append(MI, Extender, HexagonMCInstrInfo::getUnits(MCII, STI, MI));
Extender = nullptr;
} else
- Extender = MI;
+ Extender = &MI;
}
}
BundleFlags = MCB.getOperand(0).getImm();
}
-void HexagonMCShuffler::init(MCInst &MCB, MCInst const *AddMI,
+void HexagonMCShuffler::init(MCInst &MCB, MCInst const &AddMI,
bool bInsertAtFront) {
if (HexagonMCInstrInfo::isBundle(MCB)) {
- if (bInsertAtFront && AddMI)
- append(AddMI, nullptr, HexagonMCInstrInfo::getUnits(MCII, STI, *AddMI),
- false);
+ if (bInsertAtFront)
+ append(AddMI, nullptr, HexagonMCInstrInfo::getUnits(MCII, STI, AddMI));
MCInst const *Extender = nullptr;
// Copy the bundle for the shuffling.
for (auto const &I : HexagonMCInstrInfo::bundleInstructions(MCB)) {
assert(!HexagonMCInstrInfo::getDesc(MCII, *I.getInst()).isPseudo());
- MCInst *MI = const_cast<MCInst *>(I.getInst());
- if (!HexagonMCInstrInfo::isImmext(*MI)) {
- append(MI, Extender, HexagonMCInstrInfo::getUnits(MCII, STI, *MI),
- false);
+ MCInst &MI = *const_cast<MCInst *>(I.getInst());
+ if (!HexagonMCInstrInfo::isImmext(MI)) {
+ append(MI, Extender, HexagonMCInstrInfo::getUnits(MCII, STI, MI));
Extender = nullptr;
} else
- Extender = MI;
+ Extender = &MI;
}
- if (!bInsertAtFront && AddMI)
- append(AddMI, nullptr, HexagonMCInstrInfo::getUnits(MCII, STI, *AddMI),
- false);
+ if (!bInsertAtFront)
+ append(AddMI, nullptr, HexagonMCInstrInfo::getUnits(MCII, STI, AddMI));
}
BundleFlags = MCB.getOperand(0).getImm();
@@ -80,11 +77,11 @@ void HexagonMCShuffler::copyTo(MCInst &MCB) {
// Copy the results into the bundle.
for (HexagonShuffler::iterator I = begin(); I != end(); ++I) {
- MCInst const *MI = I->getDesc();
+ MCInst const &MI = I->getDesc();
MCInst const *Extender = I->getExtender();
if (Extender)
MCB.addOperand(MCOperand::createInst(Extender));
- MCB.addOperand(MCOperand::createInst(MI));
+ MCB.addOperand(MCOperand::createInst(&MI));
}
}
@@ -98,9 +95,9 @@ bool HexagonMCShuffler::reshuffleTo(MCInst &MCB) {
return (!getError());
}
-bool llvm::HexagonMCShuffle(MCInstrInfo const &MCII, MCSubtargetInfo const &STI,
- MCInst &MCB) {
- HexagonMCShuffler MCS(MCII, STI, MCB);
+bool llvm::HexagonMCShuffle(bool Fatal, MCInstrInfo const &MCII,
+ MCSubtargetInfo const &STI, MCInst &MCB) {
+ HexagonMCShuffler MCS(true, MCII, STI, MCB);
if (DisableShuffle)
// Ignore if the user chose to disable shuffling.
@@ -124,6 +121,18 @@ bool llvm::HexagonMCShuffle(MCInstrInfo const &MCII, MCSubtargetInfo const &STI,
if (!MCS.reshuffleTo(MCB)) {
// Reshuffling failed; diagnose the error, which is unexpected at this point.
unsigned shuffleError = MCS.getError();
+
+ if (!Fatal && (shuffleError != HexagonShuffler::SHUFFLE_SUCCESS))
+ return false;
+ if (shuffleError != HexagonShuffler::SHUFFLE_SUCCESS) {
+ errs() << "\nFailing packet:\n";
+ for (const auto& I : HexagonMCInstrInfo::bundleInstructions(MCB)) {
+ MCInst *MI = const_cast<MCInst *>(I.getInst());
+ errs() << HexagonMCInstrInfo::getName(MCII, *MI) << ' '
+ << HexagonMCInstrInfo::getDesc(MCII, *MI).getOpcode() << '\n';
+ }
+ errs() << '\n';
+ }
+
switch (shuffleError) {
default:
llvm_unreachable("unknown error");
@@ -176,7 +185,7 @@ llvm::HexagonMCShuffle(MCInstrInfo const &MCII, MCSubtargetInfo const &STI,
DuplexCandidate duplexToTry = possibleDuplexes.pop_back_val();
MCInst Attempt(MCB);
HexagonMCInstrInfo::replaceDuplex(Context, Attempt, duplexToTry);
- HexagonMCShuffler MCS(MCII, STI, Attempt); // copy packet to the shuffler
+ HexagonMCShuffler MCS(true, MCII, STI, Attempt); // copy packet to the shuffler
if (MCS.size() == 1) { // case of one duplex
// copy the created duplex in the shuffler to the bundle
MCS.copyTo(MCB);
@@ -191,7 +200,7 @@ llvm::HexagonMCShuffle(MCInstrInfo const &MCII, MCSubtargetInfo const &STI,
}
if (doneShuffling == false) {
- HexagonMCShuffler MCS(MCII, STI, MCB);
+ HexagonMCShuffler MCS(true, MCII, STI, MCB);
doneShuffling = MCS.reshuffleTo(MCB); // shuffle
shuffleError = MCS.getError();
}
@@ -202,8 +211,8 @@ llvm::HexagonMCShuffle(MCInstrInfo const &MCII, MCSubtargetInfo const &STI,
}
bool llvm::HexagonMCShuffle(MCInstrInfo const &MCII, MCSubtargetInfo const &STI,
- MCInst &MCB, MCInst const *AddMI, int fixupCount) {
- if (!HexagonMCInstrInfo::isBundle(MCB) || !AddMI)
+ MCInst &MCB, MCInst const &AddMI, int fixupCount) {
+ if (!HexagonMCInstrInfo::isBundle(MCB))
return false;
// if fixups present, make sure we don't insert too many nops that would
@@ -211,8 +220,15 @@ bool llvm::HexagonMCShuffle(MCInstrInfo const &MCII, MCSubtargetInfo const &STI,
unsigned int bundleSize = HexagonMCInstrInfo::bundleSize(MCB);
if (bundleSize >= HEXAGON_PACKET_SIZE)
return false;
+ bool bhasDuplex = HexagonMCInstrInfo::hasDuplex(MCII, MCB);
if (fixupCount >= 2) {
- return false;
+ if (bhasDuplex) {
+ if (bundleSize >= HEXAGON_PACKET_SIZE - 1) {
+ return false;
+ }
+ } else {
+ return false;
+ }
} else {
if (bundleSize == HEXAGON_PACKET_SIZE - 1 && fixupCount)
return false;
@@ -221,7 +237,16 @@ bool llvm::HexagonMCShuffle(MCInstrInfo const &MCII, MCSubtargetInfo const &STI,
if (DisableShuffle)
return false;
- HexagonMCShuffler MCS(MCII, STI, MCB, AddMI);
+ // mgl: temporary workaround (the shuffler does not account for a duplex
+ // taking up two slots; for example, three nops could be put into a packet
+ // already containing a duplex, oversubscribing the slots by one).
+ unsigned maxBundleSize = (HexagonMCInstrInfo::hasImmExt(MCB))
+ ? HEXAGON_PACKET_SIZE
+ : HEXAGON_PACKET_SIZE - 1;
+ if (bhasDuplex && bundleSize >= maxBundleSize)
+ return false;
+
+ HexagonMCShuffler MCS(MCII, STI, MCB, AddMI, false);
if (!MCS.reshuffleTo(MCB)) {
unsigned shuffleError = MCS.getError();
switch (shuffleError) {
diff --git a/lib/Target/Hexagon/MCTargetDesc/HexagonMCShuffler.h b/lib/Target/Hexagon/MCTargetDesc/HexagonMCShuffler.h
index a21cce1fc240..14bbfda4c914 100644
--- a/lib/Target/Hexagon/MCTargetDesc/HexagonMCShuffler.h
+++ b/lib/Target/Hexagon/MCTargetDesc/HexagonMCShuffler.h
@@ -27,16 +27,16 @@ class HexagonMCShuffler : public HexagonShuffler {
bool duplex_present;
public:
- HexagonMCShuffler(MCInstrInfo const &MCII, MCSubtargetInfo const &STI,
- MCInst &MCB)
+ HexagonMCShuffler(bool Fatal, MCInstrInfo const &MCII,
+ MCSubtargetInfo const &STI, MCInst &MCB)
: HexagonShuffler(MCII, STI) {
init(MCB);
};
HexagonMCShuffler(MCInstrInfo const &MCII, MCSubtargetInfo const &STI,
- MCInst &MCB, const MCInst *AddMI,
- bool bInsertAtFront = false)
+ MCInst &MCB, MCInst const &AddMI,
+ bool InsertAtFront)
: HexagonShuffler(MCII, STI) {
- init(MCB, AddMI, bInsertAtFront);
+ init(MCB, AddMI, InsertAtFront);
};
// Copy reordered bundle to another.
@@ -49,14 +49,14 @@ public:
private:
void init(MCInst &MCB);
- void init(MCInst &MCB, const MCInst *AddMI, bool bInsertAtFront = false);
+ void init(MCInst &MCB, MCInst const &AddMI, bool InsertAtFront);
};
// Invocation of the shuffler.
+bool HexagonMCShuffle(bool Fatal, MCInstrInfo const &MCII,
+ MCSubtargetInfo const &STI, MCInst &);
bool HexagonMCShuffle(MCInstrInfo const &MCII, MCSubtargetInfo const &STI,
- MCInst &);
-bool HexagonMCShuffle(MCInstrInfo const &MCII, MCSubtargetInfo const &STI,
- MCInst &, const MCInst *, int);
+ MCInst &, MCInst const &, int);
unsigned HexagonMCShuffle(MCInstrInfo const &MCII, MCSubtargetInfo const &STI,
MCContext &Context, MCInst &,
SmallVector<DuplexCandidate, 8>);
diff --git a/lib/Target/Hexagon/MCTargetDesc/HexagonMCTargetDesc.cpp b/lib/Target/Hexagon/MCTargetDesc/HexagonMCTargetDesc.cpp
index 694cf582f8d9..bb98c2bbef6d 100644
--- a/lib/Target/Hexagon/MCTargetDesc/HexagonMCTargetDesc.cpp
+++ b/lib/Target/Hexagon/MCTargetDesc/HexagonMCTargetDesc.cpp
@@ -22,6 +22,7 @@
#include "llvm/MC/MCContext.h"
#include "llvm/MC/MCDwarf.h"
#include "llvm/MC/MCELFStreamer.h"
+#include "llvm/MC/MCInstrAnalysis.h"
#include "llvm/MC/MCInstrInfo.h"
#include "llvm/MC/MCRegisterInfo.h"
#include "llvm/MC/MCStreamer.h"
@@ -66,6 +67,12 @@ static cl::opt<bool> HexagonV55ArchVariant("mv55", cl::Hidden, cl::init(false),
static cl::opt<bool> HexagonV60ArchVariant("mv60", cl::Hidden, cl::init(false),
cl::desc("Build for Hexagon V60"));
+static cl::opt<bool> HexagonV62ArchVariant("mv62", cl::Hidden, cl::init(false),
+ cl::desc("Build for Hexagon V62"));
+
+static cl::opt<bool> EnableHVX("mhvx", cl::Hidden, cl::init(false),
+ cl::desc("Enable Hexagon Vector Extension (HVX)"));
+
static StringRef DefaultArch = "hexagonv60";
static StringRef HexagonGetArchVariant() {
@@ -77,6 +84,8 @@ static StringRef HexagonGetArchVariant() {
return "hexagonv55";
if (HexagonV60ArchVariant)
return "hexagonv60";
+ if (HexagonV62ArchVariant)
+ return "hexagonv62";
return "";
}
@@ -95,31 +104,16 @@ StringRef Hexagon_MC::selectHexagonCPU(const Triple &TT, StringRef CPU) {
return ArchV;
}
-MCInstrInfo *llvm::createHexagonMCInstrInfo() {
- MCInstrInfo *X = new MCInstrInfo();
- InitHexagonMCInstrInfo(X);
- return X;
-}
-
-static MCRegisterInfo *createHexagonMCRegisterInfo(const Triple &TT) {
- MCRegisterInfo *X = new MCRegisterInfo();
- InitHexagonMCRegisterInfo(X, Hexagon::R31);
- return X;
-}
-
-static MCSubtargetInfo *
-createHexagonMCSubtargetInfo(const Triple &TT, StringRef CPU, StringRef FS) {
- CPU = Hexagon_MC::selectHexagonCPU(TT, CPU);
- return createHexagonMCSubtargetInfoImpl(TT, CPU, FS);
-}
+unsigned llvm::HexagonGetLastSlot() { return HexagonItinerariesV4FU::SLOT3; }
namespace {
class HexagonTargetAsmStreamer : public HexagonTargetStreamer {
public:
HexagonTargetAsmStreamer(MCStreamer &S,
- formatted_raw_ostream &, bool,
- MCInstPrinter &)
+ formatted_raw_ostream &OS,
+ bool isVerboseAsm,
+ MCInstPrinter &IP)
: HexagonTargetStreamer(S) {}
void prettyPrintAsm(MCInstPrinter &InstPrinter, raw_ostream &OS,
@@ -156,24 +150,15 @@ public:
class HexagonTargetELFStreamer : public HexagonTargetStreamer {
public:
+ MCELFStreamer &getStreamer() {
+ return static_cast<MCELFStreamer &>(Streamer);
+ }
HexagonTargetELFStreamer(MCStreamer &S, MCSubtargetInfo const &STI)
: HexagonTargetStreamer(S) {
- auto Bits = STI.getFeatureBits();
- unsigned Flags = 0;
- if (Bits[Hexagon::ArchV60])
- Flags = ELF::EF_HEXAGON_MACH_V60;
- else if (Bits[Hexagon::ArchV55])
- Flags = ELF::EF_HEXAGON_MACH_V55;
- else if (Bits[Hexagon::ArchV5])
- Flags = ELF::EF_HEXAGON_MACH_V5;
- else if (Bits[Hexagon::ArchV4])
- Flags = ELF::EF_HEXAGON_MACH_V4;
- getStreamer().getAssembler().setELFHeaderEFlags(Flags);
+ MCAssembler &MCA = getStreamer().getAssembler();
+ MCA.setELFHeaderEFlags(Hexagon_MC::GetELFFlags(STI));
}
- MCELFStreamer &getStreamer() {
- return static_cast<MCELFStreamer &>(Streamer);
- }
void EmitCommonSymbolSorted(MCSymbol *Symbol, uint64_t Size,
unsigned ByteAlignment,
@@ -196,13 +181,26 @@ public:
} // end anonymous namespace
+llvm::MCInstrInfo *llvm::createHexagonMCInstrInfo() {
+ MCInstrInfo *X = new MCInstrInfo();
+ InitHexagonMCInstrInfo(X);
+ return X;
+}
+
+static MCRegisterInfo *createHexagonMCRegisterInfo(const Triple &TT) {
+ MCRegisterInfo *X = new MCRegisterInfo();
+ InitHexagonMCRegisterInfo(X, Hexagon::R31);
+ return X;
+}
+
static MCAsmInfo *createHexagonMCAsmInfo(const MCRegisterInfo &MRI,
const Triple &TT) {
MCAsmInfo *MAI = new HexagonMCAsmInfo(TT);
// VirtualFP = (R30 + #0).
MCCFIInstruction Inst =
- MCCFIInstruction::createDefCfa(nullptr, Hexagon::R30, 0);
+ MCCFIInstruction::createDefCfa(nullptr,
+ MRI.getDwarfRegNum(Hexagon::R30, true), 0);
MAI->addInitialFrameState(Inst);
return MAI;
@@ -212,31 +210,138 @@ static MCInstPrinter *createHexagonMCInstPrinter(const Triple &T,
unsigned SyntaxVariant,
const MCAsmInfo &MAI,
const MCInstrInfo &MII,
const MCRegisterInfo &MRI) {
if (SyntaxVariant == 0)
- return (new HexagonInstPrinter(MAI, MII, MRI));
+ return new HexagonInstPrinter(MAI, MII, MRI);
else
return nullptr;
}
-static MCTargetStreamer *createMCAsmTargetStreamer(MCStreamer &S,
- formatted_raw_ostream &OS,
- MCInstPrinter *InstPrint,
- bool IsVerboseAsm) {
- return new HexagonTargetAsmStreamer(S, OS, IsVerboseAsm, *InstPrint);
+static MCTargetStreamer *
+createMCAsmTargetStreamer(MCStreamer &S, formatted_raw_ostream &OS,
+ MCInstPrinter *IP, bool IsVerboseAsm) {
+ return new HexagonTargetAsmStreamer(S, OS, IsVerboseAsm, *IP);
}
-static MCStreamer *createMCStreamer(Triple const &T, MCContext &Context,
- MCAsmBackend &MAB, raw_pwrite_stream &OS,
- MCCodeEmitter *Emitter, bool RelaxAll) {
- return createHexagonELFStreamer(Context, MAB, OS, Emitter);
+static MCStreamer *createMCStreamer(Triple const &T,
+ MCContext &Context,
+ MCAsmBackend &MAB,
+ raw_pwrite_stream &OS,
+ MCCodeEmitter *Emitter,
+ bool RelaxAll) {
+ return createHexagonELFStreamer(T, Context, MAB, OS, Emitter);
}
static MCTargetStreamer *
-createHexagonObjectTargetStreamer(MCStreamer &S, MCSubtargetInfo const &STI) {
+createHexagonObjectTargetStreamer(MCStreamer &S, const MCSubtargetInfo &STI) {
return new HexagonTargetELFStreamer(S, STI);
}
+static void LLVM_ATTRIBUTE_UNUSED clearFeature(MCSubtargetInfo *STI,
+ uint64_t F) {
+ uint64_t FB = STI->getFeatureBits().to_ullong();
+ if (FB & (1ULL << F))
+ STI->ToggleFeature(F);
+}
+
+static bool LLVM_ATTRIBUTE_UNUSED checkFeature(MCSubtargetInfo *STI,
+ uint64_t F) {
+ uint64_t FB = STI->getFeatureBits().to_ullong();
+ return (FB & (1ULL << F)) != 0;
+}
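+// A usage sketch (an assumption, not code from this patch): the helpers above
+// flip a single subtarget feature bit, e.g. dropping double-vector HVX while
+// leaving the remaining features untouched:
+//   if (checkFeature(STI, Hexagon::ExtensionHVXDbl))
+//     clearFeature(STI, Hexagon::ExtensionHVXDbl);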
+
+StringRef Hexagon_MC::ParseHexagonTriple(const Triple &TT, StringRef CPU) {
+ StringRef CPUName = Hexagon_MC::selectHexagonCPU(TT, CPU);
+ StringRef FS = "";
+ if (EnableHVX) {
+ if (CPUName.equals_lower("hexagonv60") ||
+ CPUName.equals_lower("hexagonv62"))
+ FS = "+hvx";
+ }
+ return FS;
+}
+
+static bool isCPUValid(const std::string &CPU) {
+ std::vector<std::string> table {
+ "hexagonv4",
+ "hexagonv5",
+ "hexagonv55",
+ "hexagonv60",
+ "hexagonv62",
+ };
+
+ return std::find(table.begin(), table.end(), CPU) != table.end();
+}
+
+MCSubtargetInfo *Hexagon_MC::createHexagonMCSubtargetInfo(const Triple &TT,
+ StringRef CPU,
+ StringRef FS) {
+ StringRef ArchFS = (FS.size()) ? FS : Hexagon_MC::ParseHexagonTriple(TT, CPU);
+ StringRef CPUName = Hexagon_MC::selectHexagonCPU(TT, CPU);
+ if (!isCPUValid(CPUName.str())) {
+ errs() << "error: invalid CPU \"" << CPUName.str().c_str()
+ << "\" specified\n";
+ return nullptr;
+ }
+
+ MCSubtargetInfo *X = createHexagonMCSubtargetInfoImpl(TT, CPUName, ArchFS);
+ if (X->getFeatureBits()[Hexagon::ExtensionHVXDbl]) {
+ llvm::FeatureBitset Features = X->getFeatureBits();
+ X->setFeatureBits(Features.set(Hexagon::ExtensionHVX));
+ }
+ return X;
+}
+
+unsigned Hexagon_MC::GetELFFlags(const MCSubtargetInfo &STI) {
+ static std::map<StringRef,unsigned> ElfFlags = {
+ {"hexagonv4", ELF::EF_HEXAGON_MACH_V4},
+ {"hexagonv5", ELF::EF_HEXAGON_MACH_V5},
+ {"hexagonv55", ELF::EF_HEXAGON_MACH_V55},
+ {"hexagonv60", ELF::EF_HEXAGON_MACH_V60},
+ {"hexagonv62", ELF::EF_HEXAGON_MACH_V62},
+ };
+
+ auto F = ElfFlags.find(STI.getCPU());
+ assert(F != ElfFlags.end() && "Unrecognized Architecture");
+ return F->second;
+}
+
+namespace {
+class HexagonMCInstrAnalysis : public MCInstrAnalysis {
+public:
+ HexagonMCInstrAnalysis(MCInstrInfo const *Info) : MCInstrAnalysis(Info) {}
+
+ bool isUnconditionalBranch(MCInst const &Inst) const override {
+ //assert(!HexagonMCInstrInfo::isBundle(Inst));
+ return MCInstrAnalysis::isUnconditionalBranch(Inst);
+ }
+
+ bool isConditionalBranch(MCInst const &Inst) const override {
+ //assert(!HexagonMCInstrInfo::isBundle(Inst));
+ return MCInstrAnalysis::isConditionalBranch(Inst);
+ }
+
+ bool evaluateBranch(MCInst const &Inst, uint64_t Addr,
+ uint64_t Size, uint64_t &Target) const override {
+ //assert(!HexagonMCInstrInfo::isBundle(Inst));
+ if (!HexagonMCInstrInfo::isExtendable(*Info, Inst))
+ return false;
+ auto const &Extended(HexagonMCInstrInfo::getExtendableOperand(*Info, Inst));
+ assert(Extended.isExpr());
+ int64_t Value;
+ if (!Extended.getExpr()->evaluateAsAbsolute(Value))
+ return false;
+ Target = Value;
+ return true;
+ }
+};
+}
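+// Note: evaluateBranch above only resolves targets whose extendable operand
+// folds to an absolute value; anything still needing a fixup returns false,
+// which callers must treat as "target unknown".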
+
+static MCInstrAnalysis *createHexagonMCInstrAnalysis(const MCInstrInfo *Info) {
+ return new HexagonMCInstrAnalysis(Info);
+}
+
// Force static initialization.
extern "C" void LLVMInitializeHexagonTargetMC() {
// Register the MC asm info.
@@ -252,7 +357,7 @@ extern "C" void LLVMInitializeHexagonTargetMC() {
// Register the MC subtarget info.
TargetRegistry::RegisterMCSubtargetInfo(getTheHexagonTarget(),
- createHexagonMCSubtargetInfo);
+ Hexagon_MC::createHexagonMCSubtargetInfo);
// Register the MC Code Emitter
TargetRegistry::RegisterMCCodeEmitter(getTheHexagonTarget(),
@@ -262,8 +367,18 @@ extern "C" void LLVMInitializeHexagonTargetMC() {
TargetRegistry::RegisterMCAsmBackend(getTheHexagonTarget(),
createHexagonAsmBackend);
+
+ // Register the MC instruction analyzer.
+ TargetRegistry::RegisterMCInstrAnalysis(getTheHexagonTarget(),
+ createHexagonMCInstrAnalysis);
+
// Register the obj streamer
- TargetRegistry::RegisterELFStreamer(getTheHexagonTarget(), createMCStreamer);
+ TargetRegistry::RegisterELFStreamer(getTheHexagonTarget(),
+ createMCStreamer);
+
+ // Register the obj target streamer
+ TargetRegistry::RegisterObjectTargetStreamer(getTheHexagonTarget(),
+ createHexagonObjectTargetStreamer);
// Register the asm streamer
TargetRegistry::RegisterAsmTargetStreamer(getTheHexagonTarget(),
@@ -272,7 +387,4 @@ extern "C" void LLVMInitializeHexagonTargetMC() {
// Register the MC Inst Printer
TargetRegistry::RegisterMCInstPrinter(getTheHexagonTarget(),
createHexagonMCInstPrinter);
-
- TargetRegistry::RegisterObjectTargetStreamer(
- getTheHexagonTarget(), createHexagonObjectTargetStreamer);
}
diff --git a/lib/Target/Hexagon/MCTargetDesc/HexagonMCTargetDesc.h b/lib/Target/Hexagon/MCTargetDesc/HexagonMCTargetDesc.h
index 6e677e9d9f86..6bb69be6142e 100644
--- a/lib/Target/Hexagon/MCTargetDesc/HexagonMCTargetDesc.h
+++ b/lib/Target/Hexagon/MCTargetDesc/HexagonMCTargetDesc.h
@@ -41,6 +41,18 @@ extern cl::opt<bool> HexagonDisableDuplex;
extern const InstrStage HexagonStages[];
MCInstrInfo *createHexagonMCInstrInfo();
+MCRegisterInfo *createHexagonMCRegisterInfo(StringRef TT);
+
+namespace Hexagon_MC {
+ StringRef ParseHexagonTriple(const Triple &TT, StringRef CPU);
+ StringRef selectHexagonCPU(const Triple &TT, StringRef CPU);
+
+ /// Create a Hexagon MCSubtargetInfo instance. This is exposed so Asm parser,
+ /// etc. do not need to go through TargetRegistry.
+ MCSubtargetInfo *createHexagonMCSubtargetInfo(const Triple &TT, StringRef CPU,
+ StringRef FS);
+ unsigned GetELFFlags(const MCSubtargetInfo &STI);
+}
MCCodeEmitter *createHexagonMCCodeEmitter(const MCInstrInfo &MCII,
const MCRegisterInfo &MRI,
@@ -54,13 +66,9 @@ MCAsmBackend *createHexagonAsmBackend(const Target &T,
MCObjectWriter *createHexagonELFObjectWriter(raw_pwrite_stream &OS,
uint8_t OSABI, StringRef CPU);
-namespace Hexagon_MC {
-
- StringRef selectHexagonCPU(const Triple &TT, StringRef CPU);
-
-} // end namespace Hexagon_MC
+unsigned HexagonGetLastSlot();
-} // end namespace llvm
+} // End llvm namespace
// Define symbolic names for Hexagon registers. This defines a mapping from
// register name to register number.
diff --git a/lib/Target/Hexagon/MCTargetDesc/HexagonShuffler.cpp b/lib/Target/Hexagon/MCTargetDesc/HexagonShuffler.cpp
index 88f37d620dcf..853f76213d38 100644
--- a/lib/Target/Hexagon/MCTargetDesc/HexagonShuffler.cpp
+++ b/lib/Target/Hexagon/MCTargetDesc/HexagonShuffler.cpp
@@ -22,6 +22,7 @@
#include "MCTargetDesc/HexagonMCInstrInfo.h"
#include "HexagonShuffler.h"
#include "llvm/Support/Debug.h"
+#include "llvm/Support/Format.h"
#include "llvm/Support/MathExtras.h"
#include "llvm/Support/raw_ostream.h"
@@ -37,16 +38,16 @@ class HexagonBid {
unsigned Bid;
public:
- HexagonBid() : Bid(0){};
- HexagonBid(unsigned B) { Bid = B ? MAX / countPopulation(B) : 0; };
+ HexagonBid() : Bid(0){}
+ HexagonBid(unsigned B) { Bid = B ? MAX / countPopulation(B) : 0; }
// Check if the insn priority is overflowed.
- bool isSold() const { return (Bid >= MAX); };
+ bool isSold() const { return (Bid >= MAX); }
HexagonBid &operator+=(const HexagonBid &B) {
Bid += B.Bid;
return *this;
- };
+ }
};
// Slot shuffling allocation.
@@ -56,7 +57,7 @@ class HexagonUnitAuction {
unsigned isSold : HEXAGON_PACKET_SIZE;
public:
- HexagonUnitAuction() : isSold(0){};
+ HexagonUnitAuction(unsigned cs = 0) : isSold(cs){};
// Allocate slots.
bool bid(unsigned B) {
@@ -70,29 +71,29 @@ public:
isSold |= Scores[i].isSold() << i;
}
return true;
- ;
} else
// Error if the desired slots are already full.
return false;
- };
+ }
};
} // end anonymous namespace
unsigned HexagonResource::setWeight(unsigned s) {
const unsigned SlotWeight = 8;
const unsigned MaskWeight = SlotWeight - 1;
- bool Key = (1 << s) & getUnits();
-
- // TODO: Improve this API so that we can prevent misuse statically.
- assert(SlotWeight * s < 32 && "Argument to setWeight too large.");
+ unsigned Units = getUnits();
+ unsigned Key = ((1u << s) & Units) != 0;
// Calculate the relative weight of the insn for the given slot, weighing it
// heavier the more restrictive the insn is and the lower the slots in which
// it may execute.
- Weight =
- (Key << (SlotWeight * s)) * ((MaskWeight - countPopulation(getUnits()))
- << countTrailingZeros(getUnits()));
- return (Weight);
+ if (Key == 0 || Units == 0 || (SlotWeight*s >= 32))
+ return Weight = 0;
+
+ unsigned Ctpop = countPopulation(Units);
+ unsigned Cttz = countTrailingZeros(Units);
+ Weight = (1u << (SlotWeight * s)) * ((MaskWeight - Ctpop) << Cttz);
+ return Weight;
}
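// A worked example of the formula above: for an insn restricted to slots 2
// and 3 (Units = 0xC) weighed for s = 3, Ctpop = 2 and Cttz = 2, giving
// Weight = (1 << 24) * ((7 - 2) << 2); fewer candidate slots (smaller Ctpop)
// and higher minimum slots (larger Cttz) both increase the weight.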
void HexagonCVIResource::SetupTUL(TypeUnitsAndLanes *TUL, StringRef CPU) {
@@ -104,7 +105,10 @@ void HexagonCVIResource::SetupTUL(TypeUnitsAndLanes *TUL, StringRef CPU) {
(*TUL)[HexagonII::TypeCVI_VP] = UnitsAndLanes(CVI_XLANE, 1);
(*TUL)[HexagonII::TypeCVI_VP_VS] = UnitsAndLanes(CVI_XLANE, 2);
(*TUL)[HexagonII::TypeCVI_VS] = UnitsAndLanes(CVI_SHIFT, 1);
- (*TUL)[HexagonII::TypeCVI_VINLANESAT] = UnitsAndLanes(CVI_SHIFT, 1);
+ (*TUL)[HexagonII::TypeCVI_VINLANESAT] =
+ (CPU == "hexagonv60" || CPU == "hexagonv61" || CPU == "hexagonv61v1") ?
+ UnitsAndLanes(CVI_SHIFT, 1) :
+ UnitsAndLanes(CVI_XLANE | CVI_SHIFT | CVI_MPY0 | CVI_MPY1, 1);
(*TUL)[HexagonII::TypeCVI_VM_LD] =
UnitsAndLanes(CVI_XLANE | CVI_SHIFT | CVI_MPY0 | CVI_MPY1, 1);
(*TUL)[HexagonII::TypeCVI_VM_TMP_LD] = UnitsAndLanes(CVI_NONE, 0);
@@ -141,6 +145,40 @@ HexagonCVIResource::HexagonCVIResource(TypeUnitsAndLanes *TUL,
}
}
+struct CVIUnits {
+ unsigned Units;
+ unsigned Lanes;
+};
+typedef SmallVector<struct CVIUnits, 8> HVXInstsT;
+
+static unsigned makeAllBits(unsigned startBit, unsigned Lanes) {
+ for (unsigned i = 1 ; i < Lanes ; ++i)
+ startBit = (startBit << 1) | startBit;
+ return startBit;
+}
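+// For example, makeAllBits(0x2, 2) widens the single unit bit 0x2 into the
+// two-lane mask 0x6, and makeAllBits(0x1, 4) yields 0xF: each iteration ORs
+// the mask with a copy of itself shifted left by one, so Lanes consecutive
+// bits end up set starting at the original unit bit.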
+
+static bool checkHVXPipes(const HVXInstsT &hvxInsts, unsigned startIdx,
+ unsigned usedUnits) {
+ if (startIdx < hvxInsts.size()) {
+ if (!hvxInsts[startIdx].Units)
+ return checkHVXPipes(hvxInsts, startIdx + 1, usedUnits);
+ for (unsigned b = 0x1 ; b <= 0x8 ; b <<= 1) {
+ if ((hvxInsts[startIdx].Units & b) == 0)
+ continue;
+ unsigned allBits = makeAllBits(b, hvxInsts[startIdx].Lanes);
+ if ((allBits & usedUnits) == 0) {
+ if (checkHVXPipes(hvxInsts, startIdx + 1, usedUnits | allBits))
+ return true;
+ }
+ }
+ return false;
+ }
+ return true;
+}
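+// checkHVXPipes is a small backtracking search: for each HVX insn it tries
+// every allowed start unit among 0x1..0x8, widens it by the insn's lane
+// count, and recurses with those units marked used. For example, two 2-lane
+// insns that may each start at 0x1 or 0x4 pack as 0x3 and 0xC; a third such
+// insn finds no free pair of units and the search reports failure.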
+
HexagonShuffler::HexagonShuffler(MCInstrInfo const &MCII,
MCSubtargetInfo const &STI)
: MCII(MCII), STI(STI) {
@@ -154,21 +192,82 @@ void HexagonShuffler::reset() {
Error = SHUFFLE_SUCCESS;
}
-void HexagonShuffler::append(MCInst const *ID, MCInst const *Extender,
- unsigned S, bool X) {
- HexagonInstr PI(&TUL, MCII, ID, Extender, S, X);
+void HexagonShuffler::append(MCInst const &ID, MCInst const *Extender,
+ unsigned S) {
+ HexagonInstr PI(&TUL, MCII, &ID, Extender, S);
Packet.push_back(PI);
}
+static struct {
+ unsigned first;
+ unsigned second;
+} jumpSlots[] = { {8, 4}, {8, 2}, {8, 1}, {4, 2}, {4, 1}, {2, 1} };
+#define MAX_JUMP_SLOTS (sizeof(jumpSlots)/sizeof(jumpSlots[0]))
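+// Each jumpSlots entry is a (first, second) pair of slot masks for a packet
+// with two branches, ordered so the earlier branch always gets the higher
+// slot: {8, 4} pins the first branch to slot 3 and the second to slot 2,
+// {8, 2} to slots 3 and 1, and so on, preserving the original branch order.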
+
+namespace {
+bool isDuplexAGroup(unsigned Opcode) {
+ switch (Opcode) {
+ case Hexagon::SA1_addi:
+ case Hexagon::SA1_addrx:
+ case Hexagon::SA1_addsp:
+ case Hexagon::SA1_and1:
+ case Hexagon::SA1_clrf:
+ case Hexagon::SA1_clrfnew:
+ case Hexagon::SA1_clrt:
+ case Hexagon::SA1_clrtnew:
+ case Hexagon::SA1_cmpeqi:
+ case Hexagon::SA1_combine0i:
+ case Hexagon::SA1_combine1i:
+ case Hexagon::SA1_combine2i:
+ case Hexagon::SA1_combine3i:
+ case Hexagon::SA1_combinerz:
+ case Hexagon::SA1_combinezr:
+ case Hexagon::SA1_dec:
+ case Hexagon::SA1_inc:
+ case Hexagon::SA1_seti:
+ case Hexagon::SA1_setin1:
+ case Hexagon::SA1_sxtb:
+ case Hexagon::SA1_sxth:
+ case Hexagon::SA1_tfr:
+ case Hexagon::SA1_zxtb:
+ case Hexagon::SA1_zxth:
+ return true;
+ default:
+ return false;
+ }
+}
+
+unsigned countNeitherAnorX(MCInstrInfo const &MCII, MCInst const &ID) {
+ unsigned Result = 0;
+ unsigned Type = HexagonMCInstrInfo::getType(MCII, ID);
+ if (Type == HexagonII::TypeDUPLEX) {
+ unsigned subInst0Opcode = ID.getOperand(0).getInst()->getOpcode();
+ unsigned subInst1Opcode = ID.getOperand(1).getInst()->getOpcode();
+ Result += !isDuplexAGroup(subInst0Opcode);
+ Result += !isDuplexAGroup(subInst1Opcode);
+ } else
+ Result += Type != HexagonII::TypeALU32_2op &&
+ Type != HexagonII::TypeALU32_3op &&
+ Type != HexagonII::TypeALU32_ADDI &&
+ Type != HexagonII::TypeS_2op &&
+ Type != HexagonII::TypeS_3op &&
+ Type != HexagonII::TypeALU64 &&
+ (Type != HexagonII::TypeM ||
+ HexagonMCInstrInfo::isFloat(MCII, ID));
+ return Result;
+}
+}
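+// countNeitherAnorX scores how many "neither A- nor X-type" insns ID
+// contributes: a duplex is scored per sub-insn (SA1 sub-insns count as
+// A-type), while any other insn counts once unless it is an ALU32, S-type,
+// ALU64, or non-float M-type insn.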
+
/// Check that the packet is legal and enforce relative insn order.
bool HexagonShuffler::check() {
// Descriptive slot masks.
const unsigned slotSingleLoad = 0x1, slotSingleStore = 0x1, slotOne = 0x2,
- slotThree = 0x8, slotFirstJump = 0x8, slotLastJump = 0x4,
+ slotThree = 0x8, //slotFirstJump = 0x8,
slotFirstLoadStore = 0x2, slotLastLoadStore = 0x1;
// Highest slots for branches and stores used to keep their original order.
- unsigned slotJump = slotFirstJump;
+ //unsigned slotJump = slotFirstJump;
unsigned slotLoadStore = slotFirstLoadStore;
// Number of branches, solo branches, indirect branches.
unsigned jumps = 0, jump1 = 0;
@@ -188,36 +287,41 @@ bool HexagonShuffler::check() {
unsigned onlyNo1 = 0;
unsigned xtypeFloat = 0;
unsigned pSlot3Cnt = 0;
+ unsigned memops = 0;
+ unsigned deallocs = 0;
iterator slot3ISJ = end();
+ std::vector<iterator> foundBranches;
+ unsigned reservedSlots = 0;
// Collect information from the insns in the packet.
for (iterator ISJ = begin(); ISJ != end(); ++ISJ) {
- MCInst const *ID = ISJ->getDesc();
-
- if (HexagonMCInstrInfo::isSolo(MCII, *ID))
- solo += !ISJ->isSoloException();
- else if (HexagonMCInstrInfo::isSoloAX(MCII, *ID))
- onlyAX += !ISJ->isSoloException();
- else if (HexagonMCInstrInfo::isSoloAin1(MCII, *ID))
- onlyAin1 += !ISJ->isSoloException();
- if (HexagonMCInstrInfo::getType(MCII, *ID) != HexagonII::TypeALU32 &&
- HexagonMCInstrInfo::getType(MCII, *ID) != HexagonII::TypeXTYPE)
- ++neitherAnorX;
- if (HexagonMCInstrInfo::prefersSlot3(MCII, *ID)) {
+ MCInst const &ID = ISJ->getDesc();
+
+ if (HexagonMCInstrInfo::isSolo(MCII, ID))
+ solo++;
+ else if (HexagonMCInstrInfo::isSoloAX(MCII, ID))
+ onlyAX++;
+ else if (HexagonMCInstrInfo::isSoloAin1(MCII, ID))
+ onlyAin1++;
+ neitherAnorX += countNeitherAnorX(MCII, ID);
+ if (HexagonMCInstrInfo::prefersSlot3(MCII, ID)) {
++pSlot3Cnt;
slot3ISJ = ISJ;
}
- if (HexagonMCInstrInfo::isCofMax1(MCII, *ID))
+ reservedSlots |= HexagonMCInstrInfo::getOtherReservedSlots(MCII, STI, ID);
+ if (HexagonMCInstrInfo::isCofMax1(MCII, ID))
++jump1;
- switch (HexagonMCInstrInfo::getType(MCII, *ID)) {
- case HexagonII::TypeXTYPE:
- if (HexagonMCInstrInfo::isFloat(MCII, *ID))
+ switch (HexagonMCInstrInfo::getType(MCII, ID)) {
+ case HexagonII::TypeS_2op:
+ case HexagonII::TypeS_3op:
+ case HexagonII::TypeALU64:
+ if (HexagonMCInstrInfo::isFloat(MCII, ID))
++xtypeFloat;
break;
- case HexagonII::TypeJR:
case HexagonII::TypeJ:
++jumps;
+ foundBranches.push_back(ISJ);
break;
case HexagonII::TypeCVI_VM_VP_LDU:
++onlyNo1;
@@ -228,10 +332,14 @@ bool HexagonShuffler::check() {
case HexagonII::TypeLD:
++loads;
++memory;
- if (ISJ->Core.getUnits() == slotSingleLoad)
+ if (ISJ->Core.getUnits() == slotSingleLoad ||
+ HexagonMCInstrInfo::getType(MCII, ID) ==
+ HexagonII::TypeCVI_VM_VP_LDU)
++load0;
- if (HexagonMCInstrInfo::getDesc(MCII, *ID).isReturn())
- ++jumps, ++jump1; // DEALLOC_RETURN is of type LD.
+ if (HexagonMCInstrInfo::getDesc(MCII, ID).isReturn()) {
+ ++deallocs, ++jumps, ++jump1; // DEALLOC_RETURN is of type LD.
+ foundBranches.push_back(ISJ);
+ }
break;
case HexagonII::TypeCVI_VM_STU:
++onlyNo1;
@@ -241,27 +349,66 @@ bool HexagonShuffler::check() {
case HexagonII::TypeST:
++stores;
++memory;
- if (ISJ->Core.getUnits() == slotSingleStore)
+ if (ISJ->Core.getUnits() == slotSingleStore ||
+ HexagonMCInstrInfo::getType(MCII, ID) == HexagonII::TypeCVI_VM_STU)
++store0;
break;
case HexagonII::TypeV4LDST:
++loads;
++stores;
++store1;
+ ++memops;
++memory;
break;
- case HexagonII::TypeNV:
+ case HexagonII::TypeNCJ:
++memory; // NV insns are memory-like.
- if (HexagonMCInstrInfo::getDesc(MCII, *ID).isBranch())
+ if (HexagonMCInstrInfo::getDesc(MCII, ID).isBranch()) {
++jumps, ++jump1;
+ foundBranches.push_back(ISJ);
+ }
+ break;
+ case HexagonII::TypeV2LDST:
+ if (HexagonMCInstrInfo::getDesc(MCII, ID).mayLoad()) {
+ ++loads;
+ ++memory;
+ if (ISJ->Core.getUnits() == slotSingleLoad ||
+ HexagonMCInstrInfo::getType(MCII,ID) ==
+ HexagonII::TypeCVI_VM_VP_LDU)
+ ++load0;
+ } else {
+ assert(HexagonMCInstrInfo::getDesc(MCII, ID).mayStore());
+ ++memory;
+ ++stores;
+ }
break;
case HexagonII::TypeCR:
// Legacy conditional branch predicated on a register.
- case HexagonII::TypeSYSTEM:
- if (HexagonMCInstrInfo::getDesc(MCII, *ID).mayLoad())
- ++loads;
+ case HexagonII::TypeCJ:
+ if (HexagonMCInstrInfo::getDesc(MCII, ID).isBranch()) {
+ ++jumps;
+ foundBranches.push_back(ISJ);
+ }
+ break;
+ case HexagonII::TypeDUPLEX: {
+ ++duplex;
+ MCInst const &Inst0 = *ID.getOperand(0).getInst();
+ MCInst const &Inst1 = *ID.getOperand(1).getInst();
+ if (HexagonMCInstrInfo::isCofMax1(MCII, Inst0))
+ ++jump1;
+ if (HexagonMCInstrInfo::isCofMax1(MCII, Inst1))
+ ++jump1;
+ if (HexagonMCInstrInfo::getDesc(MCII, Inst0).isBranch()) {
+ ++jumps;
+ foundBranches.push_back(ISJ);
+ }
+ if (HexagonMCInstrInfo::getDesc(MCII, Inst1).isBranch()) {
+ ++jumps;
+ foundBranches.push_back(ISJ);
+ }
break;
}
+ }
}
// Check if the packet is legal.
@@ -277,12 +424,20 @@ bool HexagonShuffler::check() {
Error = SHUFFLE_ERROR_BRANCHES;
return false;
}
+ if (memops && stores > 1) {
+ Error = SHUFFLE_ERROR_STORE_LOAD_CONFLICT;
+ return false;
+ }
+ if (deallocs && stores) {
+ Error = SHUFFLE_ERROR_STORE_LOAD_CONFLICT;
+ return false;
+ }
// Modify packet accordingly.
// TODO: need to reserve slots #0 and #1 for duplex insns.
bool bOnlySlot3 = false;
for (iterator ISJ = begin(); ISJ != end(); ++ISJ) {
- MCInst const *ID = ISJ->getDesc();
+ MCInst const &ID = ISJ->getDesc();
if (!ISJ->Core.getUnits()) {
// Error if insn may not be executed in any slot.
@@ -291,40 +446,26 @@ bool HexagonShuffler::check() {
}
// Exclude from slot #1 any insn but A2_nop.
- if (HexagonMCInstrInfo::getDesc(MCII, *ID).getOpcode() != Hexagon::A2_nop)
+ if (HexagonMCInstrInfo::getDesc(MCII, ID).getOpcode() != Hexagon::A2_nop)
if (onlyNo1)
ISJ->Core.setUnits(ISJ->Core.getUnits() & ~slotOne);
// Exclude from slot #1 any insn but A-type.
- if (HexagonMCInstrInfo::getType(MCII, *ID) != HexagonII::TypeALU32)
+ if (HexagonMCInstrInfo::getType(MCII, ID) != HexagonII::TypeALU32_2op &&
+ HexagonMCInstrInfo::getType(MCII, ID) != HexagonII::TypeALU32_3op &&
+ HexagonMCInstrInfo::getType(MCII, ID) != HexagonII::TypeALU32_ADDI)
if (onlyAin1)
ISJ->Core.setUnits(ISJ->Core.getUnits() & ~slotOne);
- // Branches must keep the original order.
- if (HexagonMCInstrInfo::getDesc(MCII, *ID).isBranch() ||
- HexagonMCInstrInfo::getDesc(MCII, *ID).isCall())
- if (jumps > 1) {
- if (slotJump < slotLastJump) {
- // Error if indirect branch with another branch or
- // no more slots available for branches.
- Error = SHUFFLE_ERROR_BRANCHES;
- return false;
- }
- // Pin the branch to the highest slot available to it.
- ISJ->Core.setUnits(ISJ->Core.getUnits() & slotJump);
- // Update next highest slot available to branches.
- slotJump >>= 1;
- }
-
// A single load must use slot #0.
- if (HexagonMCInstrInfo::getDesc(MCII, *ID).mayLoad()) {
- if (loads == 1 && loads == memory)
+ if (HexagonMCInstrInfo::getDesc(MCII, ID).mayLoad()) {
+ if (loads == 1 && loads == memory && memops == 0)
// Pin the load to slot #0.
ISJ->Core.setUnits(ISJ->Core.getUnits() & slotSingleLoad);
}
// A single store must use slot #0.
- if (HexagonMCInstrInfo::getDesc(MCII, *ID).mayStore()) {
+ if (HexagonMCInstrInfo::getDesc(MCII, ID).mayStore()) {
if (!store0) {
if (stores == 1)
ISJ->Core.setUnits(ISJ->Core.getUnits() & slotSingleStore);
@@ -347,7 +488,7 @@ bool HexagonShuffler::check() {
}
}
- // flag if an instruction can only be executed in slot 3
+ // flag if an instruction must execute in slot 3
if (ISJ->Core.getUnits() == slotThree)
bOnlySlot3 = true;
@@ -358,14 +499,61 @@ bool HexagonShuffler::check() {
}
}
+ // preserve branch order
bool validateSlots = true;
- if (bOnlySlot3 == false && pSlot3Cnt == 1 && slot3ISJ != end()) {
+ if (jumps > 1) {
+ if (foundBranches.size() > 2) {
+ Error = SHUFFLE_ERROR_BRANCHES;
+ return false;
+ }
+
+ // try all possible choices
+ for (unsigned int i = 0 ; i < MAX_JUMP_SLOTS ; ++i) {
+ // validate first jump with this slot rule
+ if (!(jumpSlots[i].first & foundBranches[0]->Core.getUnits()))
+ continue;
+
+ // validate second jump with this slot rule
+ if (!(jumpSlots[i].second & foundBranches[1]->Core.getUnits()))
+ continue;
+
+ // both valid for this configuration, set new slot rules
+ PacketSave = Packet;
+ foundBranches[0]->Core.setUnits(jumpSlots[i].first);
+ foundBranches[1]->Core.setUnits(jumpSlots[i].second);
+
+ HexagonUnitAuction AuctionCore(reservedSlots);
+ std::sort(begin(), end(), HexagonInstr::lessCore);
+
+ // see if things are ok with the branches pinned to these slots
+ bool bFail = false;
+ for (iterator I = begin(); I != end() && !bFail; ++I)
+ if (!AuctionCore.bid(I->Core.getUnits()))
+ bFail = true;
+
+ // if so, keep the assignment; if not, restore the original slot masks
+ if (!bFail) {
+ validateSlots = false; // all good, no need to re-do auction
+ break;
+ } else
+ // restore original values
+ Packet = PacketSave;
+ }
+ if (validateSlots) {
+ Error = SHUFFLE_ERROR_NOSLOTS;
+ return false;
+ }
+ }
+
+ if (jumps <= 1 && !bOnlySlot3 && pSlot3Cnt == 1 && slot3ISJ != end()) {
+ validateSlots = true;
// save off slot mask of instruction marked with A_PREFER_SLOT3
// and then pin it to slot #3
unsigned saveUnits = slot3ISJ->Core.getUnits();
slot3ISJ->Core.setUnits(saveUnits & slotThree);
- HexagonUnitAuction AuctionCore;
+ HexagonUnitAuction AuctionCore(reservedSlots);
std::sort(begin(), end(), HexagonInstr::lessCore);
// see if things ok with that instruction being pinned to slot #3
@@ -379,16 +567,16 @@ bool HexagonShuffler::check() {
validateSlots = false; // all good, no need to re-do auction
else
for (iterator ISJ = begin(); ISJ != end(); ++ISJ) {
- MCInst const *ID = ISJ->getDesc();
- if (HexagonMCInstrInfo::prefersSlot3(MCII, *ID))
+ MCInst const &ID = ISJ->getDesc();
+ if (HexagonMCInstrInfo::prefersSlot3(MCII, ID))
ISJ->Core.setUnits(saveUnits);
}
}
- // Check if any slot, core, is over-subscribed.
+ // Check if any slot, core or CVI, is over-subscribed.
// Verify the core slot subscriptions.
if (validateSlots) {
- HexagonUnitAuction AuctionCore;
+ HexagonUnitAuction AuctionCore(reservedSlots);
std::sort(begin(), end(), HexagonInstr::lessCore);
@@ -399,17 +587,27 @@ bool HexagonShuffler::check() {
}
}
// Verify the CVI slot subscriptions.
- {
- HexagonUnitAuction AuctionCVI;
-
- std::sort(begin(), end(), HexagonInstr::lessCVI);
-
- for (iterator I = begin(); I != end(); ++I)
- for (unsigned i = 0; i < I->CVI.getLanes(); ++i) // TODO: I->CVI.isValid?
- if (!AuctionCVI.bid(I->CVI.getUnits() << i)) {
- Error = SHUFFLE_ERROR_SLOTS;
- return false;
- }
+ std::sort(begin(), end(), HexagonInstr::lessCVI);
+ // create vector of hvx instructions to check
+ HVXInstsT hvxInsts;
+ for (iterator I = begin(); I != end(); ++I) {
+ struct CVIUnits inst;
+ inst.Units = I->CVI.getUnits();
+ inst.Lanes = I->CVI.getLanes();
+ if (inst.Units == 0)
+ continue; // not an hvx inst, or an hvx inst that doesn't use any pipes
+ hvxInsts.push_back(inst);
+ }
+ // if there are any hvx instructions in this packet, check pipe usage
+ if (!hvxInsts.empty()) {
+ unsigned startIdx = 0, usedUnits = 0;
+ if (!checkHVXPipes(hvxInsts, startIdx, usedUnits)) {
+ // too many pipes used to be valid
+ Error = SHUFFLE_ERROR_SLOTS;
+ return false;
+ }
}
Error = SHUFFLE_SUCCESS;
@@ -452,10 +650,12 @@ bool HexagonShuffler::shuffle() {
}
for (iterator ISJ = begin(); ISJ != end(); ++ISJ)
- DEBUG(dbgs().write_hex(ISJ->Core.getUnits());
- dbgs() << ':'
- << HexagonMCInstrInfo::getDesc(MCII, *ISJ->getDesc())
- .getOpcode();
+ DEBUG(dbgs().write_hex(ISJ->Core.getUnits()); if (ISJ->CVI.isValid()) {
+ dbgs() << '/';
+ dbgs().write_hex(ISJ->CVI.getUnits()) << '|';
+ dbgs() << ISJ->CVI.getLanes();
+ } dbgs() << ':'
+ << HexagonMCInstrInfo::getDesc(MCII, ISJ->getDesc()).getOpcode();
dbgs() << '\n');
DEBUG(dbgs() << '\n');
diff --git a/lib/Target/Hexagon/MCTargetDesc/HexagonShuffler.h b/lib/Target/Hexagon/MCTargetDesc/HexagonShuffler.h
index a093f8545132..36e8fa19d467 100644
--- a/lib/Target/Hexagon/MCTargetDesc/HexagonShuffler.h
+++ b/lib/Target/Hexagon/MCTargetDesc/HexagonShuffler.h
@@ -35,7 +35,8 @@ public:
HexagonResource(unsigned s) { setUnits(s); };
void setUnits(unsigned s) {
- Slots = s & ~(~0U << HEXAGON_PACKET_SIZE);
+ Slots = s & ((1u << HEXAGON_PACKET_SIZE) - 1);
+ setWeight(s);
};
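// With HEXAGON_PACKET_SIZE == 4 the mask above is 0xF, so only the four slot
// bits survive; recomputing the weight here keeps the cached value used by
// lessWeight consistent whenever a slot mask is narrowed.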
unsigned setWeight(unsigned s);
@@ -44,7 +45,8 @@ public:
// Check if the resources are in ascending slot order.
static bool lessUnits(const HexagonResource &A, const HexagonResource &B) {
- return (countPopulation(A.getUnits()) < countPopulation(B.getUnits()));
+ return (countPopulation(A.getUnits()) <
+ countPopulation(B.getUnits()));
};
// Check if the resources are in ascending weight order.
static bool lessWeight(const HexagonResource &A, const HexagonResource &B) {
@@ -86,10 +88,10 @@ public:
unsigned s, MCInst const *id);
static void SetupTUL(TypeUnitsAndLanes *TUL, StringRef CPU);
- bool isValid() const { return (Valid); };
- unsigned getLanes() const { return (Lanes); };
- bool mayLoad() const { return (Load); };
- bool mayStore() const { return (Store); };
+ bool isValid() const { return Valid; };
+ unsigned getLanes() const { return Lanes; };
+ bool mayLoad() const { return Load; };
+ bool mayStore() const { return Store; };
};
// Handle to an insn used by the shuffling algorithm.
@@ -100,21 +102,17 @@ class HexagonInstr {
MCInst const *Extender;
HexagonResource Core;
HexagonCVIResource CVI;
- bool SoloException;
public:
HexagonInstr(HexagonCVIResource::TypeUnitsAndLanes *T,
MCInstrInfo const &MCII, MCInst const *id,
- MCInst const *Extender, unsigned s, bool x = false)
- : ID(id), Extender(Extender), Core(s), CVI(T, MCII, s, id),
- SoloException(x) {};
+ MCInst const *Extender, unsigned s)
+ : ID(id), Extender(Extender), Core(s), CVI(T, MCII, s, id) {}
- MCInst const *getDesc() const { return (ID); };
+ MCInst const &getDesc() const { return *ID; };
MCInst const *getExtender() const { return Extender; }
- unsigned isSoloException() const { return (SoloException); };
-
// Check if the handles are in ascending order for shuffling purposes.
bool operator<(const HexagonInstr &B) const {
return (HexagonResource::lessWeight(B.Core, Core));
@@ -136,6 +134,7 @@ class HexagonShuffler {
// Insn handles in a bundle.
HexagonPacket Packet;
+ HexagonPacket PacketSave;
// Shuffling error code.
unsigned Error;
@@ -178,8 +177,7 @@ public:
iterator end() { return (Packet.end()); };
// Add insn handle to the bundle .
- void append(MCInst const *ID, MCInst const *Extender, unsigned S,
- bool X = false);
+ void append(MCInst const &ID, MCInst const *Extender, unsigned S);
// Return the error code for the last check or shuffling of the bundle.
void setError(unsigned Err) { Error = Err; };
diff --git a/lib/Target/Hexagon/RDFCopy.cpp b/lib/Target/Hexagon/RDFCopy.cpp
index 392871628d98..57ce9fabc5e3 100644
--- a/lib/Target/Hexagon/RDFCopy.cpp
+++ b/lib/Target/Hexagon/RDFCopy.cpp
@@ -11,6 +11,7 @@
#include "RDFCopy.h"
#include "RDFGraph.h"
+#include "RDFLiveness.h"
#include "llvm/CodeGen/MachineBasicBlock.h"
#include "llvm/CodeGen/MachineDominators.h"
#include "llvm/CodeGen/MachineInstr.h"
@@ -53,47 +54,12 @@ bool CopyPropagation::interpretAsCopy(const MachineInstr *MI, EqualityMap &EM) {
void CopyPropagation::recordCopy(NodeAddr<StmtNode*> SA, EqualityMap &EM) {
CopyMap.insert(std::make_pair(SA.Id, EM));
Copies.push_back(SA.Id);
-
- for (auto I : EM) {
- auto FS = DefM.find(I.second.Reg);
- if (FS == DefM.end() || FS->second.empty())
- continue; // Undefined source
- RDefMap[I.second][SA.Id] = FS->second.top()->Id;
- // Insert DstR into the map.
- RDefMap[I.first];
- }
-}
-
-
-void CopyPropagation::updateMap(NodeAddr<InstrNode*> IA) {
- RegisterSet RRs;
- for (NodeAddr<RefNode*> RA : IA.Addr->members(DFG))
- RRs.insert(RA.Addr->getRegRef(DFG));
- bool Common = false;
- for (auto &R : RDefMap) {
- if (!RRs.count(R.first))
- continue;
- Common = true;
- break;
- }
- if (!Common)
- return;
-
- for (auto &R : RDefMap) {
- if (!RRs.count(R.first))
- continue;
- auto F = DefM.find(R.first.Reg);
- if (F == DefM.end() || F->second.empty())
- continue;
- R.second[IA.Id] = F->second.top()->Id;
- }
}
bool CopyPropagation::scanBlock(MachineBasicBlock *B) {
bool Changed = false;
auto BA = DFG.getFunc().Addr->findBlock(B, DFG);
- DFG.markBlock(BA.Id, DefM);
for (NodeAddr<InstrNode*> IA : BA.Addr->members(DFG)) {
if (DFG.IsCode<NodeAttrs::Stmt>(IA)) {
@@ -102,20 +68,30 @@ bool CopyPropagation::scanBlock(MachineBasicBlock *B) {
if (interpretAsCopy(SA.Addr->getCode(), EM))
recordCopy(SA, EM);
}
-
- updateMap(IA);
- DFG.pushDefs(IA, DefM);
}
MachineDomTreeNode *N = MDT.getNode(B);
for (auto I : *N)
Changed |= scanBlock(I->getBlock());
- DFG.releaseBlock(BA.Id, DefM);
return Changed;
}
+NodeId CopyPropagation::getLocalReachingDef(RegisterRef RefRR,
+ NodeAddr<InstrNode*> IA) {
+ NodeAddr<RefNode*> RA = L.getNearestAliasedRef(RefRR, IA);
+ if (RA.Id != 0) {
+ if (RA.Addr->getKind() == NodeAttrs::Def)
+ return RA.Id;
+ assert(RA.Addr->getKind() == NodeAttrs::Use);
+ if (NodeId RD = RA.Addr->getReachingDef())
+ return RD;
+ }
+ return 0;
+}
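+// Sketch of the intended use (see run() below): the reaching def of the copy
+// source is computed once at the copy statement and again at each candidate
+// use; a use is rewritten only when both queries return the same def, i.e.
+// the copied value cannot have changed between the two points.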
+
+
bool CopyPropagation::run() {
scanBlock(&DFG.getMF().front());
@@ -129,14 +105,6 @@ bool CopyPropagation::run() {
<< Print<RegisterRef>(J.second, DFG);
dbgs() << " }\n";
}
- dbgs() << "\nRDef map:\n";
- for (auto R : RDefMap) {
- dbgs() << Print<RegisterRef>(R.first, DFG) << " -> {";
- for (auto &M : R.second)
- dbgs() << ' ' << Print<NodeId>(M.first, DFG) << ':'
- << Print<NodeId>(M.second, DFG);
- dbgs() << " }\n";
- }
}
bool Changed = false;
@@ -176,8 +144,7 @@ bool CopyPropagation::run() {
if (DR == SR)
continue;
- auto &RDefSR = RDefMap[SR];
- NodeId RDefSR_SA = RDefSR[SA.Id];
+ NodeId AtCopy = getLocalReachingDef(SR, SA);
for (NodeId N = DA.Addr->getReachedUse(), NextN; N; N = NextN) {
auto UA = DFG.addr<UseNode*>(N);
@@ -190,7 +157,8 @@ bool CopyPropagation::run() {
NodeAddr<InstrNode*> IA = UA.Addr->getOwner(DFG);
assert(DFG.IsCode<NodeAttrs::Stmt>(IA));
- if (RDefSR[IA.Id] != RDefSR_SA)
+ NodeId AtUse = getLocalReachingDef(SR, IA);
+ if (AtCopy != AtUse)
continue;
MachineOperand &Op = UA.Addr->getOp();
@@ -206,8 +174,8 @@ bool CopyPropagation::run() {
Op.setReg(NewReg);
Op.setSubReg(0);
DFG.unlinkUse(UA, false);
- if (RDefSR_SA != 0) {
- UA.Addr->linkToDef(UA.Id, DFG.addr<DefNode*>(RDefSR_SA));
+ if (AtCopy != 0) {
+ UA.Addr->linkToDef(UA.Id, DFG.addr<DefNode*>(AtCopy));
} else {
UA.Addr->setReachingDef(0);
UA.Addr->setSibling(0);
diff --git a/lib/Target/Hexagon/RDFCopy.h b/lib/Target/Hexagon/RDFCopy.h
index 5ece11bd5ce4..bbd625c5f5f6 100644
--- a/lib/Target/Hexagon/RDFCopy.h
+++ b/lib/Target/Hexagon/RDFCopy.h
@@ -11,6 +11,9 @@
#define LLVM_LIB_TARGET_HEXAGON_RDFCOPY_H
#include "RDFGraph.h"
+#include "RDFLiveness.h"
+#include "llvm/CodeGen/MachineFunction.h"
+
#include <map>
#include <vector>
@@ -24,7 +27,7 @@ namespace rdf {
struct CopyPropagation {
CopyPropagation(DataFlowGraph &dfg) : MDT(dfg.getDT()), DFG(dfg),
- Trace(false) {}
+ L(dfg.getMF().getRegInfo(), dfg), Trace(false) {}
virtual ~CopyPropagation() = default;
@@ -39,18 +42,16 @@ namespace rdf {
private:
const MachineDominatorTree &MDT;
DataFlowGraph &DFG;
- DataFlowGraph::DefStackMap DefM;
+ Liveness L;
bool Trace;
- // map: register -> (map: stmt -> reaching def)
- std::map<RegisterRef,std::map<NodeId,NodeId>> RDefMap;
// map: statement -> (map: dst reg -> src reg)
std::map<NodeId, EqualityMap> CopyMap;
std::vector<NodeId> Copies;
void recordCopy(NodeAddr<StmtNode*> SA, EqualityMap &EM);
- void updateMap(NodeAddr<InstrNode*> IA);
bool scanBlock(MachineBasicBlock *B);
+ NodeId getLocalReachingDef(RegisterRef RefRR, NodeAddr<InstrNode*> IA);
};
} // end namespace rdf
diff --git a/lib/Target/Hexagon/RDFDeadCode.cpp b/lib/Target/Hexagon/RDFDeadCode.cpp
index 63177d51cada..9aa8ad68e07e 100644
--- a/lib/Target/Hexagon/RDFDeadCode.cpp
+++ b/lib/Target/Hexagon/RDFDeadCode.cpp
@@ -62,9 +62,19 @@ bool DeadCodeElimination::isLiveInstr(const MachineInstr *MI) const {
return true;
if (MI->isPHI())
return false;
- for (auto &Op : MI->operands())
+ for (auto &Op : MI->operands()) {
if (Op.isReg() && MRI.isReserved(Op.getReg()))
return true;
+ if (Op.isRegMask()) {
+ const uint32_t *BM = Op.getRegMask();
+ for (unsigned R = 0, RN = DFG.getTRI().getNumRegs(); R != RN; ++R) {
+ if (BM[R/32] & (1u << (R%32)))
+ continue;
+ if (MRI.isReserved(R))
+ return true;
+ }
+ }
+ }
return false;
}
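// In a register-mask operand a set bit means the register is preserved, so
// the loop above skips preserved registers and declares the instruction live
// as soon as the mask clobbers any reserved register.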
diff --git a/lib/Target/Hexagon/RDFGraph.cpp b/lib/Target/Hexagon/RDFGraph.cpp
index fa272ea1a76a..7a2895aa4e8c 100644
--- a/lib/Target/Hexagon/RDFGraph.cpp
+++ b/lib/Target/Hexagon/RDFGraph.cpp
@@ -276,7 +276,7 @@ raw_ostream &operator<< (raw_ostream &OS,
MachineBasicBlock *BB = P.Obj.Addr->getCode();
unsigned NP = BB->pred_size();
std::vector<int> Ns;
- auto PrintBBs = [&OS,&P] (std::vector<int> Ns) -> void {
+ auto PrintBBs = [&OS] (std::vector<int> Ns) -> void {
unsigned N = Ns.size();
for (int I : Ns) {
OS << "BB#" << I;
@@ -424,7 +424,7 @@ RegisterRef RefNode::getRegRef(const DataFlowGraph &G) const {
if (NodeAttrs::flags(Attrs) & NodeAttrs::PhiRef)
return G.unpack(Ref.PR);
assert(Ref.Op != nullptr);
- return G.makeRegRef(Ref.Op->getReg(), Ref.Op->getSubReg());
+ return G.makeRegRef(*Ref.Op);
}
// Set the register reference in the reference node directly (for references
@@ -617,8 +617,12 @@ bool TargetOperandInfo::isPreserving(const MachineInstr &In, unsigned OpNum)
// Check if the definition of RR produces an unspecified value.
bool TargetOperandInfo::isClobbering(const MachineInstr &In, unsigned OpNum)
const {
+ const MachineOperand &Op = In.getOperand(OpNum);
+ if (Op.isRegMask())
+ return true;
+ assert(Op.isReg());
if (In.isCall())
- if (In.getOperand(OpNum).isImplicit())
+ if (Op.isDef() && Op.isDead())
return true;
return false;
}
@@ -654,109 +658,6 @@ bool TargetOperandInfo::isFixedReg(const MachineInstr &In, unsigned OpNum)
return false;
}
-RegisterRef RegisterAggr::normalize(RegisterRef RR) const {
- RegisterId SuperReg = RR.Reg;
- while (true) {
- MCSuperRegIterator SR(SuperReg, &TRI, false);
- if (!SR.isValid())
- break;
- SuperReg = *SR;
- }
-
- const TargetRegisterClass &RC = *TRI.getMinimalPhysRegClass(RR.Reg);
- LaneBitmask Common = RR.Mask & RC.LaneMask;
- uint32_t Sub = TRI.getSubRegIndex(SuperReg, RR.Reg);
- LaneBitmask SuperMask = TRI.composeSubRegIndexLaneMask(Sub, Common);
- return RegisterRef(SuperReg, SuperMask);
-}
-
-bool RegisterAggr::hasAliasOf(RegisterRef RR) const {
- RegisterRef NR = normalize(RR);
- auto F = Masks.find(NR.Reg);
- if (F != Masks.end()) {
- if ((F->second & NR.Mask).any())
- return true;
- }
- if (CheckUnits) {
- for (MCRegUnitIterator U(RR.Reg, &TRI); U.isValid(); ++U)
- if (ExpAliasUnits.test(*U))
- return true;
- }
- return false;
-}
-
-bool RegisterAggr::hasCoverOf(RegisterRef RR) const {
- // Always have a cover for empty lane mask.
- RegisterRef NR = normalize(RR);
- if (NR.Mask.none())
- return true;
- auto F = Masks.find(NR.Reg);
- if (F == Masks.end())
- return false;
- return (NR.Mask & F->second) == NR.Mask;
-}
-
-RegisterAggr &RegisterAggr::insert(RegisterRef RR) {
- RegisterRef NR = normalize(RR);
- auto F = Masks.find(NR.Reg);
- if (F == Masks.end())
- Masks.insert({NR.Reg, NR.Mask});
- else
- F->second |= NR.Mask;
-
- // Visit all register units to see if there are any that were created
- // by explicit aliases. Add those that were to the bit vector.
- for (MCRegUnitIterator U(RR.Reg, &TRI); U.isValid(); ++U) {
- MCRegUnitRootIterator R(*U, &TRI);
- ++R;
- if (!R.isValid())
- continue;
- ExpAliasUnits.set(*U);
- CheckUnits = true;
- }
- return *this;
-}
-
-RegisterAggr &RegisterAggr::insert(const RegisterAggr &RG) {
- for (std::pair<RegisterId,LaneBitmask> P : RG.Masks)
- insert(RegisterRef(P.first, P.second));
- return *this;
-}
-
-RegisterAggr &RegisterAggr::clear(RegisterRef RR) {
- RegisterRef NR = normalize(RR);
- auto F = Masks.find(NR.Reg);
- if (F == Masks.end())
- return *this;
- LaneBitmask NewM = F->second & ~NR.Mask;
- if (NewM.none())
- Masks.erase(F);
- else
- F->second = NewM;
- return *this;
-}
-
-RegisterAggr &RegisterAggr::clear(const RegisterAggr &RG) {
- for (std::pair<RegisterId,LaneBitmask> P : RG.Masks)
- clear(RegisterRef(P.first, P.second));
- return *this;
-}
-
-RegisterRef RegisterAggr::clearIn(RegisterRef RR) const {
- RegisterAggr T(TRI);
- T.insert(RR).clear(*this);
- if (T.empty())
- return RegisterRef();
- return RegisterRef(T.begin()->first, T.begin()->second);
-}
-
-void RegisterAggr::print(raw_ostream &OS) const {
- OS << '{';
- for (auto I : Masks)
- OS << ' ' << PrintReg(I.first, &TRI) << PrintLaneMaskOpt(I.second);
- OS << " }";
-}
-
//
// The data flow graph construction.
//
@@ -764,7 +665,8 @@ void RegisterAggr::print(raw_ostream &OS) const {
DataFlowGraph::DataFlowGraph(MachineFunction &mf, const TargetInstrInfo &tii,
const TargetRegisterInfo &tri, const MachineDominatorTree &mdt,
const MachineDominanceFrontier &mdf, const TargetOperandInfo &toi)
- : MF(mf), TII(tii), TRI(tri), MDT(mdt), MDF(mdf), TOI(toi) {
+ : MF(mf), TII(tii), TRI(tri), PRI(tri, mf), MDT(mdt), MDF(mdf), TOI(toi),
+ LiveIns(PRI) {
}
// The implementation of the definition stack.
@@ -857,17 +759,6 @@ unsigned DataFlowGraph::DefStack::nextDown(unsigned P) const {
// Register information.
-// Get the list of references aliased to RR. Lane masks are ignored.
-RegisterSet DataFlowGraph::getAliasSet(RegisterId Reg) const {
- // Do not include RR in the alias set.
- RegisterSet AS;
- assert(TargetRegisterInfo::isPhysicalRegister(Reg));
-
- for (MCRegAliasIterator AI(Reg, &TRI, false); AI.isValid(); ++AI)
- AS.insert(RegisterRef(*AI));
- return AS;
-}
-
RegisterSet DataFlowGraph::getLandingPadLiveIns() const {
RegisterSet LR;
const Function &F = *MF.getFunction();
@@ -1010,11 +901,22 @@ void DataFlowGraph::build(unsigned Options) {
BlockRefsMap RefM;
buildBlockRefs(EA, RefM);
- // Add function-entry phi nodes.
+ // Collect function live-ins and entry block live-ins.
MachineRegisterInfo &MRI = MF.getRegInfo();
- for (auto I = MRI.livein_begin(), E = MRI.livein_end(); I != E; ++I) {
+ MachineBasicBlock &EntryB = *EA.Addr->getCode();
+ assert(EntryB.pred_empty() && "Function entry block has predecessors");
+ for (auto I = MRI.livein_begin(), E = MRI.livein_end(); I != E; ++I)
+ LiveIns.insert(RegisterRef(I->first));
+ if (MRI.tracksLiveness()) {
+ for (auto I : EntryB.liveins())
+ LiveIns.insert(RegisterRef(I.PhysReg, I.LaneMask));
+ }
+
+ // Add function-entry phi nodes for the live-in registers.
+ for (auto I = LiveIns.rr_begin(), E = LiveIns.rr_end(); I != E; ++I) {
+ RegisterRef RR = *I;
NodeAddr<PhiNode*> PA = newPhi(EA);
- RegisterRef RR = RegisterRef(I->first);
uint16_t PhiFlags = NodeAttrs::PhiRef | NodeAttrs::Preserving;
NodeAddr<DefNode*> DA = newDef(PA, RR, PhiFlags);
PA.Addr->addMember(DA, *this);
@@ -1071,27 +973,19 @@ void DataFlowGraph::build(unsigned Options) {
}
RegisterRef DataFlowGraph::makeRegRef(unsigned Reg, unsigned Sub) const {
- assert(TargetRegisterInfo::isPhysicalRegister(Reg));
+ assert(PhysicalRegisterInfo::isRegMaskId(Reg) ||
+ TargetRegisterInfo::isPhysicalRegister(Reg));
+ assert(Reg != 0);
if (Sub != 0)
Reg = TRI.getSubReg(Reg, Sub);
return RegisterRef(Reg);
}
-RegisterRef DataFlowGraph::normalizeRef(RegisterRef RR) const {
- // FIXME copied from RegisterAggr
- RegisterId SuperReg = RR.Reg;
- while (true) {
- MCSuperRegIterator SR(SuperReg, &TRI, false);
- if (!SR.isValid())
- break;
- SuperReg = *SR;
- }
-
- uint32_t Sub = TRI.getSubRegIndex(SuperReg, RR.Reg);
- const TargetRegisterClass &RC = *TRI.getMinimalPhysRegClass(RR.Reg);
- LaneBitmask SuperMask = RR.Mask &
- TRI.composeSubRegIndexLaneMask(Sub, RC.LaneMask);
- return RegisterRef(SuperReg, SuperMask);
+RegisterRef DataFlowGraph::makeRegRef(const MachineOperand &Op) const {
+ assert(Op.isReg() || Op.isRegMask());
+ if (Op.isReg())
+ return makeRegRef(Op.getReg(), Op.getSubReg());
+ return RegisterRef(PRI.getRegMaskId(Op.getRegMask()), LaneBitmask::getAll());
}
RegisterRef DataFlowGraph::restrictRef(RegisterRef AR, RegisterRef BR) const {
@@ -1100,13 +994,13 @@ RegisterRef DataFlowGraph::restrictRef(RegisterRef AR, RegisterRef BR) const {
return M.any() ? RegisterRef(AR.Reg, M) : RegisterRef();
}
#ifndef NDEBUG
- RegisterRef NAR = normalizeRef(AR);
- RegisterRef NBR = normalizeRef(BR);
- assert(NAR.Reg != NBR.Reg);
+// RegisterRef NAR = PRI.normalize(AR);
+// RegisterRef NBR = PRI.normalize(BR);
+// assert(NAR.Reg != NBR.Reg);
#endif
// This isn't strictly correct, because the overlap may happen in the
// part masked out.
- if (TRI.regsOverlap(AR.Reg, BR.Reg))
+ if (PRI.alias(AR, BR))
return AR;
return RegisterRef();
}
@@ -1137,11 +1031,61 @@ void DataFlowGraph::releaseBlock(NodeId B, DefStackMap &DefM) {
// Push all definitions from the instruction node IA to an appropriate
// stack in DefM.
+void DataFlowGraph::pushAllDefs(NodeAddr<InstrNode*> IA, DefStackMap &DefM) {
+ pushClobbers(IA, DefM);
+ pushDefs(IA, DefM);
+}
+
+// Push all clobbering definitions from the instruction node IA to an
+// appropriate stack in DefM.
+void DataFlowGraph::pushClobbers(NodeAddr<InstrNode*> IA, DefStackMap &DefM) {
+ NodeSet Visited;
+ std::set<RegisterId> Defined;
+
+ // The important objectives of this function are:
+ // - to be able to handle instructions both while the graph is being
+ // constructed, and after the graph has been constructed, and
+ // - maintain proper ordering of definitions on the stack for each
+ // register reference:
+ // - if there are two or more related defs in IA (i.e. coming from
+ // the same machine operand), then only push one def on the stack,
+ // - if there are multiple unrelated defs of non-overlapping
+ // subregisters of S, then the stack for S will have both (in an
+ // unspecified order), but the order does not matter from the data-
+ // -flow perspective.
+
+ for (NodeAddr<DefNode*> DA : IA.Addr->members_if(IsDef, *this)) {
+ if (Visited.count(DA.Id))
+ continue;
+ if (!(DA.Addr->getFlags() & NodeAttrs::Clobbering))
+ continue;
+
+ NodeList Rel = getRelatedRefs(IA, DA);
+ NodeAddr<DefNode*> PDA = Rel.front();
+ RegisterRef RR = PDA.Addr->getRegRef(*this);
+
+ // Push the definition on the stack for the register and all aliases.
+    // The def stack traversal in linkRefUp will check the exact aliasing.
+ DefM[RR.Reg].push(DA);
+ Defined.insert(RR.Reg);
+ for (RegisterId A : PRI.getAliasSet(RR.Reg)) {
+ // Check that we don't push the same def twice.
+ assert(A != RR.Reg);
+ if (!Defined.count(A))
+ DefM[A].push(DA);
+ }
+ // Mark all the related defs as visited.
+ for (NodeAddr<NodeBase*> T : Rel)
+ Visited.insert(T.Id);
+ }
+}
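
The stack discipline described in the comment above can be hard to see through the RDF node plumbing. Below is a minimal sketch of the same bookkeeping with plain STL containers; RegId, DefId and pushOnce are toy stand-ins, not the DefStackMap or node types from this patch:

#include <set>
#include <unordered_map>
#include <vector>

using RegId = unsigned;
using DefId = unsigned;
using ToyDefStackMap = std::unordered_map<RegId, std::vector<DefId>>;

// Push one def for a register and for each of its aliases, guarding
// against pushing the same def twice onto any one stack.
void pushOnce(ToyDefStackMap &DefM, RegId Reg, DefId D,
              const std::set<RegId> &Aliases) {
  std::set<RegId> Done;
  DefM[Reg].push_back(D);
  Done.insert(Reg);
  for (RegId A : Aliases)
    if (Done.insert(A).second)   // skip registers already pushed to
      DefM[A].push_back(D);
}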
+
+// Push all non-clobbering definitions from the instruction node IA to an
+// appropriate stack in DefM.
void DataFlowGraph::pushDefs(NodeAddr<InstrNode*> IA, DefStackMap &DefM) {
- NodeList Defs = IA.Addr->members_if(IsDef, *this);
NodeSet Visited;
#ifndef NDEBUG
- RegisterSet Defined;
+ std::set<RegisterId> Defined;
#endif
// The important objectives of this function are:
@@ -1156,9 +1100,11 @@ void DataFlowGraph::pushDefs(NodeAddr<InstrNode*> IA, DefStackMap &DefM) {
// unspecified order), but the order does not matter from the data-
// -flow perspective.
- for (NodeAddr<DefNode*> DA : Defs) {
+ for (NodeAddr<DefNode*> DA : IA.Addr->members_if(IsDef, *this)) {
if (Visited.count(DA.Id))
continue;
+ if (DA.Addr->getFlags() & NodeAttrs::Clobbering)
+ continue;
NodeList Rel = getRelatedRefs(IA, DA);
NodeAddr<DefNode*> PDA = Rel.front();
@@ -1166,7 +1112,7 @@ void DataFlowGraph::pushDefs(NodeAddr<InstrNode*> IA, DefStackMap &DefM) {
#ifndef NDEBUG
// Assert if the register is defined in two or more unrelated defs.
// This could happen if there are two or more def operands defining it.
- if (!Defined.insert(RR).second) {
+ if (!Defined.insert(RR.Reg).second) {
MachineInstr *MI = NodeAddr<StmtNode*>(IA).Addr->getCode();
dbgs() << "Multiple definitions of register: "
<< Print<RegisterRef>(RR, *this) << " in\n " << *MI
@@ -1177,10 +1123,10 @@ void DataFlowGraph::pushDefs(NodeAddr<InstrNode*> IA, DefStackMap &DefM) {
// Push the definition on the stack for the register and all aliases.
    // The def stack traversal in linkRefUp will check the exact aliasing.
DefM[RR.Reg].push(DA);
- for (RegisterRef A : getAliasSet(RR.Reg /*FIXME? use RegisterRef*/)) {
+ for (RegisterId A : PRI.getAliasSet(RR.Reg)) {
// Check that we don't push the same def twice.
- assert(A != RR);
- DefM[A.Reg].push(DA);
+ assert(A != RR.Reg);
+ DefM[A].push(DA);
}
// Mark all the related defs as visited.
for (NodeAddr<NodeBase*> T : Rel)
@@ -1203,59 +1149,6 @@ NodeList DataFlowGraph::getRelatedRefs(NodeAddr<InstrNode*> IA,
return Refs;
}
-// Return true if RA and RB overlap, false otherwise.
-bool DataFlowGraph::alias(RegisterRef RA, RegisterRef RB) const {
- assert(TargetRegisterInfo::isPhysicalRegister(RA.Reg));
- assert(TargetRegisterInfo::isPhysicalRegister(RB.Reg));
-
- MCRegUnitMaskIterator UMA(RA.Reg, &TRI);
- MCRegUnitMaskIterator UMB(RB.Reg, &TRI);
- // Reg units are returned in the numerical order.
- while (UMA.isValid() && UMB.isValid()) {
- std::pair<uint32_t,LaneBitmask> PA = *UMA;
- std::pair<uint32_t,LaneBitmask> PB = *UMB;
- if (PA.first == PB.first) {
- // Lane mask of 0 (given by the iterator) should be treated as "full".
- // This can happen when the register has only one unit, or when the
- // unit corresponds to explicit aliasing. In such cases, the lane mask
- // from RegisterRef should be ignored.
- if (PA.second.none() || PB.second.none())
- return true;
-
- // At this point the common unit corresponds to a subregister. The lane
- // masks correspond to the lane mask of that unit within the original
- // register, for example assuming register quadruple q0 = r3:0, and
- // a register pair d1 = r3:2, the lane mask of r2 in q0 may be 0b0100,
- // while the lane mask of r2 in d1 may be 0b0001.
- LaneBitmask LA = PA.second & RA.Mask;
- LaneBitmask LB = PB.second & RB.Mask;
- if (LA.any() && LB.any()) {
- unsigned Root = *MCRegUnitRootIterator(PA.first, &TRI);
- // If register units were guaranteed to only have 1 bit in any lane
- // mask, the code below would not be necessary. This is because LA
- // and LB would have at most 1 bit set each, and that bit would be
- // guaranteed to correspond to the given register unit.
- uint32_t SubA = TRI.getSubRegIndex(RA.Reg, Root);
- uint32_t SubB = TRI.getSubRegIndex(RB.Reg, Root);
- const TargetRegisterClass &RC = *TRI.getMinimalPhysRegClass(Root);
- LaneBitmask MaskA = TRI.reverseComposeSubRegIndexLaneMask(SubA, LA);
- LaneBitmask MaskB = TRI.reverseComposeSubRegIndexLaneMask(SubB, LB);
- if ((MaskA & MaskB & RC.LaneMask).any())
- return true;
- }
-
- ++UMA;
- ++UMB;
- continue;
- }
- if (PA.first < PB.first)
- ++UMA;
- else if (PB.first < PA.first)
- ++UMB;
- }
- return false;
-}
-
// Clear all information in the graph.
void DataFlowGraph::reset() {
Memory.clear();
@@ -1370,58 +1263,53 @@ void DataFlowGraph::buildStmt(NodeAddr<BlockNode*> BA, MachineInstr &In) {
if (In.isCall())
return true;
// Is tail call?
- if (In.isBranch())
+ if (In.isBranch()) {
for (const MachineOperand &Op : In.operands())
if (Op.isGlobal() || Op.isSymbol())
return true;
+      // Treat indirect branches as calls. This is done to keep their
+      // implicit operands, and it is harmless for intra-function
+      // indirect branches.
+ if (In.isIndirectBranch())
+ return true;
+ }
return false;
};
auto isDefUndef = [this] (const MachineInstr &In, RegisterRef DR) -> bool {
// This instruction defines DR. Check if there is a use operand that
// would make DR live on entry to the instruction.
- for (const MachineOperand &UseOp : In.operands()) {
- if (!UseOp.isReg() || !UseOp.isUse() || UseOp.isUndef())
+ for (const MachineOperand &Op : In.operands()) {
+ if (!Op.isReg() || Op.getReg() == 0 || !Op.isUse() || Op.isUndef())
continue;
- RegisterRef UR = makeRegRef(UseOp.getReg(), UseOp.getSubReg());
- if (alias(DR, UR))
+ RegisterRef UR = makeRegRef(Op);
+ if (PRI.alias(DR, UR))
return false;
}
return true;
};
- // Collect a set of registers that this instruction implicitly uses
- // or defines. Implicit operands from an instruction will be ignored
- // unless they are listed here.
- RegisterSet ImpUses, ImpDefs;
- if (const uint16_t *ImpD = In.getDesc().getImplicitDefs())
- while (uint16_t R = *ImpD++)
- ImpDefs.insert(RegisterRef(R));
- if (const uint16_t *ImpU = In.getDesc().getImplicitUses())
- while (uint16_t R = *ImpU++)
- ImpUses.insert(RegisterRef(R));
-
bool IsCall = isCall(In);
- bool NeedsImplicit = IsCall || In.isInlineAsm() || In.isReturn();
- bool IsPredicated = TII.isPredicated(In);
unsigned NumOps = In.getNumOperands();
// Avoid duplicate implicit defs. This will not detect cases of implicit
// defs that define registers that overlap, but it is not clear how to
// interpret that in the absence of explicit defs. Overlapping explicit
// defs are likely illegal already.
- RegisterSet DoneDefs;
+ BitVector DoneDefs(TRI.getNumRegs());
// Process explicit defs first.
for (unsigned OpN = 0; OpN < NumOps; ++OpN) {
MachineOperand &Op = In.getOperand(OpN);
if (!Op.isReg() || !Op.isDef() || Op.isImplicit())
continue;
- RegisterRef RR = makeRegRef(Op.getReg(), Op.getSubReg());
+ unsigned R = Op.getReg();
+ if (!R || !TargetRegisterInfo::isPhysicalRegister(R))
+ continue;
uint16_t Flags = NodeAttrs::None;
if (TOI.isPreserving(In, OpN)) {
Flags |= NodeAttrs::Preserving;
// If the def is preserving, check if it is also undefined.
- if (isDefUndef(In, RR))
+ if (isDefUndef(In, makeRegRef(Op)))
Flags |= NodeAttrs::Undef;
}
if (TOI.isClobbering(In, OpN))
@@ -1432,7 +1320,25 @@ void DataFlowGraph::buildStmt(NodeAddr<BlockNode*> BA, MachineInstr &In) {
Flags |= NodeAttrs::Dead;
NodeAddr<DefNode*> DA = newDef(SA, Op, Flags);
SA.Addr->addMember(DA, *this);
- DoneDefs.insert(RR);
+ assert(!DoneDefs.test(R));
+ DoneDefs.set(R);
+ }
+
+ // Process reg-masks (as clobbers).
+ BitVector DoneClobbers(TRI.getNumRegs());
+ for (unsigned OpN = 0; OpN < NumOps; ++OpN) {
+ MachineOperand &Op = In.getOperand(OpN);
+ if (!Op.isRegMask())
+ continue;
+ uint16_t Flags = NodeAttrs::Clobbering | NodeAttrs::Fixed |
+ NodeAttrs::Dead;
+ NodeAddr<DefNode*> DA = newDef(SA, Op, Flags);
+ SA.Addr->addMember(DA, *this);
+    // Record all clobbered registers in DoneClobbers.
+ const uint32_t *RM = Op.getRegMask();
+ for (unsigned i = 1, e = TRI.getNumRegs(); i != e; ++i)
+ if (!(RM[i/32] & (1u << (i%32))))
+ DoneClobbers.set(i);
}
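
For reference, the reg-mask convention relied on here (and in RDFRegisters.cpp below) is the usual LLVM one: a set bit means the register is preserved across the instruction, a clear bit means it is clobbered (the same test MachineOperand::clobbersPhysReg performs). A minimal standalone checker, as a sketch:

#include <cstdint>

// True if PhysReg is clobbered by the mask, i.e. its bit is clear.
bool isClobberedByRegMask(const uint32_t *RM, unsigned PhysReg) {
  return !(RM[PhysReg / 32] & (1u << (PhysReg % 32)));
}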
// Process implicit defs, skipping those that have already been added
@@ -1441,11 +1347,10 @@ void DataFlowGraph::buildStmt(NodeAddr<BlockNode*> BA, MachineInstr &In) {
MachineOperand &Op = In.getOperand(OpN);
if (!Op.isReg() || !Op.isDef() || !Op.isImplicit())
continue;
- RegisterRef RR = makeRegRef(Op.getReg(), Op.getSubReg());
- if (!NeedsImplicit && !ImpDefs.count(RR))
- continue;
- if (DoneDefs.count(RR))
+ unsigned R = Op.getReg();
+ if (!R || !TargetRegisterInfo::isPhysicalRegister(R) || DoneDefs.test(R))
continue;
+ RegisterRef RR = makeRegRef(Op);
uint16_t Flags = NodeAttrs::None;
if (TOI.isPreserving(In, OpN)) {
Flags |= NodeAttrs::Preserving;
@@ -1457,24 +1362,22 @@ void DataFlowGraph::buildStmt(NodeAddr<BlockNode*> BA, MachineInstr &In) {
Flags |= NodeAttrs::Clobbering;
if (TOI.isFixedReg(In, OpN))
Flags |= NodeAttrs::Fixed;
- if (IsCall && Op.isDead())
+ if (IsCall && Op.isDead()) {
+ if (DoneClobbers.test(R))
+ continue;
Flags |= NodeAttrs::Dead;
+ }
NodeAddr<DefNode*> DA = newDef(SA, Op, Flags);
SA.Addr->addMember(DA, *this);
- DoneDefs.insert(RR);
+ DoneDefs.set(R);
}
for (unsigned OpN = 0; OpN < NumOps; ++OpN) {
MachineOperand &Op = In.getOperand(OpN);
if (!Op.isReg() || !Op.isUse())
continue;
- RegisterRef RR = makeRegRef(Op.getReg(), Op.getSubReg());
- // Add implicit uses on return and call instructions, and on predicated
- // instructions regardless of whether or not they appear in the instruction
- // descriptor's list.
- bool Implicit = Op.isImplicit();
- bool TakeImplicit = NeedsImplicit || IsPredicated;
- if (Implicit && !TakeImplicit && !ImpUses.count(RR))
+ unsigned R = Op.getReg();
+ if (!R || !TargetRegisterInfo::isPhysicalRegister(R))
continue;
uint16_t Flags = NodeAttrs::None;
if (Op.isUndef())
@@ -1570,7 +1473,7 @@ void DataFlowGraph::buildPhis(BlockRefsMap &PhiM, BlockRefsMap &RefM,
auto MaxCoverIn = [this] (RegisterRef RR, RegisterSet &RRs) -> RegisterRef {
for (RegisterRef I : RRs)
- if (I != RR && RegisterAggr::isCoverOf(I, RR, TRI))
+ if (I != RR && RegisterAggr::isCoverOf(I, RR, PRI))
RR = I;
return RR;
};
@@ -1597,7 +1500,7 @@ void DataFlowGraph::buildPhis(BlockRefsMap &PhiM, BlockRefsMap &RefM,
auto Aliased = [this,&MaxRefs](RegisterRef RR,
std::vector<unsigned> &Closure) -> bool {
for (unsigned I : Closure)
- if (alias(RR, MaxRefs[I]))
+ if (PRI.alias(RR, MaxRefs[I]))
return true;
return false;
};
@@ -1708,7 +1611,7 @@ void DataFlowGraph::linkRefUp(NodeAddr<InstrNode*> IA, NodeAddr<T> TA,
NodeAddr<T> TAP;
// References from the def stack that have been examined so far.
- RegisterAggr Defs(TRI);
+ RegisterAggr Defs(PRI);
for (auto I = DS.top(), E = DS.bottom(); I != E; I.down()) {
RegisterRef QR = I->Addr->getRegRef(*this);
@@ -1744,13 +1647,15 @@ void DataFlowGraph::linkRefUp(NodeAddr<InstrNode*> IA, NodeAddr<T> TA,
}
// Create data-flow links for all reference nodes in the statement node SA.
-void DataFlowGraph::linkStmtRefs(DefStackMap &DefM, NodeAddr<StmtNode*> SA) {
+template <typename Predicate>
+void DataFlowGraph::linkStmtRefs(DefStackMap &DefM, NodeAddr<StmtNode*> SA,
+ Predicate P) {
#ifndef NDEBUG
RegisterSet Defs;
#endif
// Link all nodes (upwards in the data-flow) with their reaching defs.
- for (NodeAddr<RefNode*> RA : SA.Addr->members(*this)) {
+ for (NodeAddr<RefNode*> RA : SA.Addr->members_if(P, *this)) {
uint16_t Kind = RA.Addr->getKind();
assert(Kind == NodeAttrs::Def || Kind == NodeAttrs::Use);
RegisterRef RR = RA.Addr->getRegRef(*this);
@@ -1779,6 +1684,13 @@ void DataFlowGraph::linkBlockRefs(DefStackMap &DefM, NodeAddr<BlockNode*> BA) {
// Push block delimiters.
markBlock(BA.Id, DefM);
+ auto IsClobber = [] (NodeAddr<RefNode*> RA) -> bool {
+ return IsDef(RA) && (RA.Addr->getFlags() & NodeAttrs::Clobbering);
+ };
+ auto IsNoClobber = [] (NodeAddr<RefNode*> RA) -> bool {
+ return IsDef(RA) && !(RA.Addr->getFlags() & NodeAttrs::Clobbering);
+ };
+
assert(BA.Addr && "block node address is needed to create a data-flow link");
// For each non-phi instruction in the block, link all the defs and uses
// to their reaching defs. For any member of the block (including phis),
@@ -1786,10 +1698,17 @@ void DataFlowGraph::linkBlockRefs(DefStackMap &DefM, NodeAddr<BlockNode*> BA) {
for (NodeAddr<InstrNode*> IA : BA.Addr->members(*this)) {
// Ignore phi nodes here. They will be linked part by part from the
// predecessors.
- if (IA.Addr->getKind() == NodeAttrs::Stmt)
- linkStmtRefs(DefM, IA);
+ if (IA.Addr->getKind() == NodeAttrs::Stmt) {
+ linkStmtRefs(DefM, IA, IsUse);
+ linkStmtRefs(DefM, IA, IsClobber);
+ }
// Push the definitions on the stack.
+ pushClobbers(IA, DefM);
+
+ if (IA.Addr->getKind() == NodeAttrs::Stmt)
+ linkStmtRefs(DefM, IA, IsNoClobber);
+
pushDefs(IA, DefM);
}
diff --git a/lib/Target/Hexagon/RDFGraph.h b/lib/Target/Hexagon/RDFGraph.h
index 49d78a8b22b5..d5faca4cd6f4 100644
--- a/lib/Target/Hexagon/RDFGraph.h
+++ b/lib/Target/Hexagon/RDFGraph.h
@@ -225,6 +225,7 @@
#ifndef LLVM_LIB_TARGET_HEXAGON_RDFGRAPH_H
#define LLVM_LIB_TARGET_HEXAGON_RDFGRAPH_H
+#include "RDFRegisters.h"
#include "llvm/ADT/BitVector.h"
#include "llvm/ADT/STLExtras.h"
#include "llvm/MC/LaneBitmask.h"
@@ -260,7 +261,6 @@ namespace llvm {
namespace rdf {
typedef uint32_t NodeId;
- typedef uint32_t RegisterId;
struct DataFlowGraph;
@@ -412,25 +412,6 @@ namespace rdf {
AllocatorTy MemPool;
};
- struct RegisterRef {
- RegisterId Reg;
- LaneBitmask Mask;
-
- RegisterRef() : RegisterRef(0) {}
- explicit RegisterRef(RegisterId R, LaneBitmask M = LaneBitmask::getAll())
- : Reg(R), Mask(R != 0 ? M : LaneBitmask::getNone()) {}
-
- operator bool() const { return Reg != 0 && Mask.any(); }
- bool operator== (const RegisterRef &RR) const {
- return Reg == RR.Reg && Mask == RR.Mask;
- }
- bool operator!= (const RegisterRef &RR) const {
- return !operator==(RR);
- }
- bool operator< (const RegisterRef &RR) const {
- return Reg < RR.Reg || (Reg == RR.Reg && Mask < RR.Mask);
- }
- };
typedef std::set<RegisterRef> RegisterSet;
struct TargetOperandInfo {
@@ -450,39 +431,6 @@ namespace rdf {
uint32_t MaskId;
};
- // Template class for a map translating uint32_t into arbitrary types.
- // The map will act like an indexed set: upon insertion of a new object,
- // it will automatically assign a new index to it. Index of 0 is treated
- // as invalid and is never allocated.
- template <typename T, unsigned N = 32>
- struct IndexedSet {
- IndexedSet() : Map() { Map.reserve(N); }
-
- T get(uint32_t Idx) const {
- // Index Idx corresponds to Map[Idx-1].
- assert(Idx != 0 && !Map.empty() && Idx-1 < Map.size());
- return Map[Idx-1];
- }
-
- uint32_t insert(T Val) {
- // Linear search.
- auto F = llvm::find(Map, Val);
- if (F != Map.end())
- return F - Map.begin() + 1;
- Map.push_back(Val);
- return Map.size(); // Return actual_index + 1.
- }
-
- uint32_t find(T Val) const {
- auto F = llvm::find(Map, Val);
- assert(F != Map.end());
- return F - Map.begin();
- }
-
- private:
- std::vector<T> Map;
- };
-
struct LaneMaskIndex : private IndexedSet<LaneBitmask> {
LaneMaskIndex() = default;
@@ -497,55 +445,6 @@ namespace rdf {
assert(LM.any());
return LM.all() ? 0 : find(LM);
}
-
- PackedRegisterRef pack(RegisterRef RR) {
- return { RR.Reg, getIndexForLaneMask(RR.Mask) };
- }
- PackedRegisterRef pack(RegisterRef RR) const {
- return { RR.Reg, getIndexForLaneMask(RR.Mask) };
- }
-
- RegisterRef unpack(PackedRegisterRef PR) const {
- return RegisterRef(PR.Reg, getLaneMaskForIndex(PR.MaskId));
- }
- };
-
- struct RegisterAggr {
- RegisterAggr(const TargetRegisterInfo &tri)
- : ExpAliasUnits(tri.getNumRegUnits()), CheckUnits(false), TRI(tri) {}
- RegisterAggr(const RegisterAggr &RG) = default;
-
- bool empty() const { return Masks.empty(); }
- bool hasAliasOf(RegisterRef RR) const;
- bool hasCoverOf(RegisterRef RR) const;
- static bool isCoverOf(RegisterRef RA, RegisterRef RB,
- const TargetRegisterInfo &TRI) {
- return RegisterAggr(TRI).insert(RA).hasCoverOf(RB);
- }
-
- RegisterAggr &insert(RegisterRef RR);
- RegisterAggr &insert(const RegisterAggr &RG);
- RegisterAggr &clear(RegisterRef RR);
- RegisterAggr &clear(const RegisterAggr &RG);
-
- RegisterRef clearIn(RegisterRef RR) const;
-
- void print(raw_ostream &OS) const;
-
- private:
- typedef std::unordered_map<RegisterId, LaneBitmask> MapType;
-
- public:
- typedef MapType::const_iterator iterator;
- iterator begin() const { return Masks.begin(); }
- iterator end() const { return Masks.end(); }
- RegisterRef normalize(RegisterRef RR) const;
-
- private:
- MapType Masks;
- BitVector ExpAliasUnits; // Register units for explicit aliases.
- bool CheckUnits;
- const TargetRegisterInfo &TRI;
};
struct NodeBase {
@@ -761,8 +660,10 @@ namespace rdf {
MachineFunction &getMF() const { return MF; }
const TargetInstrInfo &getTII() const { return TII; }
const TargetRegisterInfo &getTRI() const { return TRI; }
+ const PhysicalRegisterInfo &getPRI() const { return PRI; }
const MachineDominatorTree &getDT() const { return MDT; }
const MachineDominanceFrontier &getDF() const { return MDF; }
+ const RegisterAggr &getLiveIns() const { return LiveIns; }
struct DefStack {
DefStack() = default;
@@ -828,15 +729,22 @@ namespace rdf {
typedef std::unordered_map<RegisterId,DefStack> DefStackMap;
void build(unsigned Options = BuildOptions::None);
- void pushDefs(NodeAddr<InstrNode*> IA, DefStackMap &DM);
+ void pushAllDefs(NodeAddr<InstrNode*> IA, DefStackMap &DM);
void markBlock(NodeId B, DefStackMap &DefM);
void releaseBlock(NodeId B, DefStackMap &DefM);
- PackedRegisterRef pack(RegisterRef RR) { return LMI.pack(RR); }
- PackedRegisterRef pack(RegisterRef RR) const { return LMI.pack(RR); }
- RegisterRef unpack(PackedRegisterRef PR) const { return LMI.unpack(PR); }
+ PackedRegisterRef pack(RegisterRef RR) {
+ return { RR.Reg, LMI.getIndexForLaneMask(RR.Mask) };
+ }
+ PackedRegisterRef pack(RegisterRef RR) const {
+ return { RR.Reg, LMI.getIndexForLaneMask(RR.Mask) };
+ }
+ RegisterRef unpack(PackedRegisterRef PR) const {
+ return RegisterRef(PR.Reg, LMI.getLaneMaskForIndex(PR.MaskId));
+ }
+
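
The inlined pack/unpack pair is expected to round-trip. A sketch of the invariant, assuming G is a built DataFlowGraph and R, M are any valid register id and lane mask:

PackedRegisterRef P = G.pack(RegisterRef(R, M));
assert(G.unpack(P) == RegisterRef(R, M));
// A MaskId of 0 encodes the all-lanes mask, so full references
// pack without growing the lane-mask index.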
RegisterRef makeRegRef(unsigned Reg, unsigned Sub) const;
- RegisterRef normalizeRef(RegisterRef RR) const;
+ RegisterRef makeRegRef(const MachineOperand &Op) const;
RegisterRef restrictRef(RegisterRef AR, RegisterRef BR) const;
NodeAddr<RefNode*> getNextRelated(NodeAddr<InstrNode*> IA,
@@ -853,6 +761,10 @@ namespace rdf {
NodeList getRelatedRefs(NodeAddr<InstrNode*> IA,
NodeAddr<RefNode*> RA) const;
+ NodeAddr<BlockNode*> findBlock(MachineBasicBlock *BB) const {
+ return BlockNodes.at(BB);
+ }
+
void unlinkUse(NodeAddr<UseNode*> UA, bool RemoveFromOwner) {
unlinkUseDF(UA);
if (RemoveFromOwner)
@@ -898,13 +810,9 @@ namespace rdf {
return (Flags & NodeAttrs::Preserving) && !(Flags & NodeAttrs::Undef);
}
- // Register aliasing.
- bool alias(RegisterRef RA, RegisterRef RB) const;
-
private:
void reset();
- RegisterSet getAliasSet(RegisterId Reg) const;
RegisterSet getLandingPadLiveIns() const;
NodeAddr<NodeBase*> newNode(uint16_t Attrs);
@@ -940,9 +848,12 @@ namespace rdf {
NodeAddr<BlockNode*> BA);
void removeUnusedPhis();
+ void pushClobbers(NodeAddr<InstrNode*> IA, DefStackMap &DM);
+ void pushDefs(NodeAddr<InstrNode*> IA, DefStackMap &DM);
template <typename T> void linkRefUp(NodeAddr<InstrNode*> IA,
NodeAddr<T> TA, DefStack &DS);
- void linkStmtRefs(DefStackMap &DefM, NodeAddr<StmtNode*> SA);
+ template <typename Predicate> void linkStmtRefs(DefStackMap &DefM,
+ NodeAddr<StmtNode*> SA, Predicate P);
void linkBlockRefs(DefStackMap &DefM, NodeAddr<BlockNode*> BA);
void unlinkUseDF(NodeAddr<UseNode*> UA);
@@ -953,23 +864,21 @@ namespace rdf {
IA.Addr->removeMember(RA, *this);
}
- NodeAddr<BlockNode*> findBlock(MachineBasicBlock *BB) {
- return BlockNodes[BB];
- }
+ MachineFunction &MF;
+ const TargetInstrInfo &TII;
+ const TargetRegisterInfo &TRI;
+ const PhysicalRegisterInfo PRI;
+ const MachineDominatorTree &MDT;
+ const MachineDominanceFrontier &MDF;
+ const TargetOperandInfo &TOI;
+ RegisterAggr LiveIns;
NodeAddr<FuncNode*> Func;
NodeAllocator Memory;
// Local map: MachineBasicBlock -> NodeAddr<BlockNode*>
std::map<MachineBasicBlock*,NodeAddr<BlockNode*>> BlockNodes;
// Lane mask map.
LaneMaskIndex LMI;
-
- MachineFunction &MF;
- const TargetInstrInfo &TII;
- const TargetRegisterInfo &TRI;
- const MachineDominatorTree &MDT;
- const MachineDominanceFrontier &MDF;
- const TargetOperandInfo &TOI;
}; // struct DataFlowGraph
template <typename Predicate>
@@ -1013,12 +922,6 @@ namespace rdf {
return MM;
}
- // Optionally print the lane mask, if it is not ~0.
- struct PrintLaneMaskOpt {
- PrintLaneMaskOpt(LaneBitmask M) : Mask(M) {}
- LaneBitmask Mask;
- };
- raw_ostream &operator<< (raw_ostream &OS, const PrintLaneMaskOpt &P);
template <typename T> struct Print;
template <typename T>
diff --git a/lib/Target/Hexagon/RDFLiveness.cpp b/lib/Target/Hexagon/RDFLiveness.cpp
index e74c4bfc1645..b0532f933b16 100644
--- a/lib/Target/Hexagon/RDFLiveness.cpp
+++ b/lib/Target/Hexagon/RDFLiveness.cpp
@@ -31,11 +31,15 @@
#include "llvm/CodeGen/MachineDominators.h"
#include "llvm/CodeGen/MachineFunction.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/Support/CommandLine.h"
#include "llvm/Target/TargetRegisterInfo.h"
using namespace llvm;
using namespace rdf;
+static cl::opt<unsigned> MaxRecNest("rdf-liveness-max-rec", cl::init(25),
+ cl::Hidden, cl::desc("Maximum recursion level"));
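
Being hidden, the new cl::opt is still settable from any tool that links in the Hexagon backend, e.g. "llc -march=hexagon -rdf-liveness-max-rec=50 foo.ll" (illustrative invocation only); the default of 25 bounds the phi-chasing recursion in getAllReachingDefsRecImpl below.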
+
namespace llvm {
namespace rdf {
template<>
@@ -85,7 +89,8 @@ namespace rdf {
// the data-flow.
NodeList Liveness::getAllReachingDefs(RegisterRef RefRR,
- NodeAddr<RefNode*> RefA, bool FullChain, const RegisterAggr &DefRRs) {
+ NodeAddr<RefNode*> RefA, bool TopShadows, bool FullChain,
+ const RegisterAggr &DefRRs) {
NodeList RDefs; // Return value.
SetVector<NodeId> DefQ;
SetVector<NodeId> Owners;
@@ -105,6 +110,11 @@ NodeList Liveness::getAllReachingDefs(RegisterRef RefRR,
auto SNA = DFG.addr<RefNode*>(Start);
if (NodeId RD = SNA.Addr->getReachingDef())
DefQ.insert(RD);
+ if (TopShadows) {
+ for (auto S : DFG.getRelatedRefs(RefA.Addr->getOwner(DFG), RefA))
+ if (NodeId RD = NodeAddr<RefNode*>(S).Addr->getReachingDef())
+ DefQ.insert(RD);
+ }
// Collect all the reaching defs, going up until a phi node is encountered,
// or there are no more reaching defs. From this set, the actual set of
@@ -119,7 +129,7 @@ NodeList Liveness::getAllReachingDefs(RegisterRef RefRR,
// Stop at the covering/overwriting def of the initial register reference.
RegisterRef RR = TA.Addr->getRegRef(DFG);
if (!DFG.IsPreservingDef(TA))
- if (RegisterAggr::isCoverOf(RR, RefRR, TRI))
+ if (RegisterAggr::isCoverOf(RR, RefRR, PRI))
continue;
// Get the next level of reaching defs. This will include multiple
// reaching defs for shadows.
@@ -134,7 +144,7 @@ NodeList Liveness::getAllReachingDefs(RegisterRef RefRR,
for (NodeId N : DefQ) {
auto TA = DFG.addr<DefNode*>(N);
bool IsPhi = TA.Addr->getFlags() & NodeAttrs::PhiRef;
- if (!IsPhi && !DFG.alias(RefRR, TA.Addr->getRegRef(DFG)))
+ if (!IsPhi && !PRI.alias(RefRR, TA.Addr->getRegRef(DFG)))
continue;
Defs.insert(TA.Id);
Owners.insert(TA.Addr->getOwner(DFG).Id);
@@ -241,20 +251,30 @@ NodeList Liveness::getAllReachingDefs(RegisterRef RefRR,
}
-NodeSet Liveness::getAllReachingDefsRec(RegisterRef RefRR,
- NodeAddr<RefNode*> RefA, NodeSet &Visited, const NodeSet &Defs) {
+std::pair<NodeSet,bool>
+Liveness::getAllReachingDefsRec(RegisterRef RefRR, NodeAddr<RefNode*> RefA,
+ NodeSet &Visited, const NodeSet &Defs) {
+ return getAllReachingDefsRecImpl(RefRR, RefA, Visited, Defs, 0, MaxRecNest);
+}
+
+
+std::pair<NodeSet,bool>
+Liveness::getAllReachingDefsRecImpl(RegisterRef RefRR, NodeAddr<RefNode*> RefA,
+ NodeSet &Visited, const NodeSet &Defs, unsigned Nest, unsigned MaxNest) {
+ if (Nest > MaxNest)
+ return { NodeSet(), false };
// Collect all defined registers. Do not consider phis to be defining
// anything, only collect "real" definitions.
- RegisterAggr DefRRs(TRI);
+ RegisterAggr DefRRs(PRI);
for (NodeId D : Defs) {
const auto DA = DFG.addr<const DefNode*>(D);
if (!(DA.Addr->getFlags() & NodeAttrs::PhiRef))
DefRRs.insert(DA.Addr->getRegRef(DFG));
}
- NodeList RDs = getAllReachingDefs(RefRR, RefA, true, DefRRs);
+ NodeList RDs = getAllReachingDefs(RefRR, RefA, false, true, DefRRs);
if (RDs.empty())
- return Defs;
+ return { Defs, true };
// Make a copy of the preexisting definitions and add the newly found ones.
NodeSet TmpDefs = Defs;
@@ -273,12 +293,74 @@ NodeSet Liveness::getAllReachingDefsRec(RegisterRef RefRR,
Visited.insert(PA.Id);
// Go over all phi uses and get the reaching defs for each use.
for (auto U : PA.Addr->members_if(DFG.IsRef<NodeAttrs::Use>, DFG)) {
- const auto &T = getAllReachingDefsRec(RefRR, U, Visited, TmpDefs);
- Result.insert(T.begin(), T.end());
+ const auto &T = getAllReachingDefsRecImpl(RefRR, U, Visited, TmpDefs,
+ Nest+1, MaxNest);
+ if (!T.second)
+ return { T.first, false };
+ Result.insert(T.first.begin(), T.first.end());
}
}
- return Result;
+ return { Result, true };
+}
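
A caller-side sketch of the (set, flag) protocol introduced here: the boolean distinguishes a complete result from one cut off by MaxRecNest, so an incomplete set must not be used as if it were exhaustive. LV, UR and UA below are hypothetical locals:

NodeSet Visited, Defs;
std::pair<NodeSet,bool> P =
    LV.getAllReachingDefsRec(UR, UA, Visited, Defs);
if (!P.second) {
  // Recursion limit hit: P.first is a partial set; treat the query
  // as unanswered and fall back to a conservative assumption.
}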
+
+/// Find the nearest ref node aliased to RefRR, going upwards in the data
+/// flow, starting from the instruction immediately preceding Inst.
+NodeAddr<RefNode*> Liveness::getNearestAliasedRef(RegisterRef RefRR,
+ NodeAddr<InstrNode*> IA) {
+ NodeAddr<BlockNode*> BA = IA.Addr->getOwner(DFG);
+ NodeList Ins = BA.Addr->members(DFG);
+ NodeId FindId = IA.Id;
+ auto E = Ins.rend();
+ auto B = std::find_if(Ins.rbegin(), E,
+ [FindId] (const NodeAddr<InstrNode*> T) {
+ return T.Id == FindId;
+ });
+ // Do not scan IA (which is what B would point to).
+ if (B != E)
+ ++B;
+
+ do {
+ // Process the range of instructions from B to E.
+ for (NodeAddr<InstrNode*> I : make_range(B, E)) {
+ NodeList Refs = I.Addr->members(DFG);
+ NodeAddr<RefNode*> Clob, Use;
+ // Scan all the refs in I aliased to RefRR, and return the one that
+ // is the closest to the output of I, i.e. def > clobber > use.
+ for (NodeAddr<RefNode*> R : Refs) {
+ if (!PRI.alias(R.Addr->getRegRef(DFG), RefRR))
+ continue;
+ if (DFG.IsDef(R)) {
+ // If it's a non-clobbering def, just return it.
+ if (!(R.Addr->getFlags() & NodeAttrs::Clobbering))
+ return R;
+ Clob = R;
+ } else {
+ Use = R;
+ }
+ }
+ if (Clob.Id != 0)
+ return Clob;
+ if (Use.Id != 0)
+ return Use;
+ }
+
+ // Go up to the immediate dominator, if any.
+ MachineBasicBlock *BB = BA.Addr->getCode();
+ BA = NodeAddr<BlockNode*>();
+ if (MachineDomTreeNode *N = MDT.getNode(BB)) {
+ if ((N = N->getIDom()))
+ BA = DFG.findBlock(N->getBlock());
+ }
+ if (!BA.Id)
+ break;
+
+ Ins = BA.Addr->members(DFG);
+ B = Ins.rbegin();
+ E = Ins.rend();
+ } while (true);
+
+ return NodeAddr<RefNode*>();
}
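
The block-to-block stepping above is the standard immediate-dominator walk. Reduced to its core, assuming an up-to-date MachineDominatorTree MDT and a hypothetical starting block Start:

MachineBasicBlock *BB = Start;
while (MachineDomTreeNode *N = MDT.getNode(BB)) {
  if (!(N = N->getIDom()))
    break;               // reached the entry block
  BB = N->getBlock();
  // ... scan BB bottom-up for an aliased ref ...
}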
@@ -299,7 +381,7 @@ NodeSet Liveness::getAllReachedUses(RegisterRef RefRR,
auto UA = DFG.addr<UseNode*>(U);
if (!(UA.Addr->getFlags() & NodeAttrs::Undef)) {
RegisterRef UR = UA.Addr->getRegRef(DFG);
- if (DFG.alias(RefRR, UR) && !DefRRs.hasCoverOf(UR))
+ if (PRI.alias(RefRR, UR) && !DefRRs.hasCoverOf(UR))
Uses.insert(U);
}
U = UA.Addr->getSibling();
@@ -312,7 +394,7 @@ NodeSet Liveness::getAllReachedUses(RegisterRef RefRR,
RegisterRef DR = DA.Addr->getRegRef(DFG);
// If this def is already covered, it cannot reach anything new.
// Similarly, skip it if it is not aliased to the interesting register.
- if (DefRRs.hasCoverOf(DR) || !DFG.alias(RefRR, DR))
+ if (DefRRs.hasCoverOf(DR) || !PRI.alias(RefRR, DR))
continue;
NodeSet T;
if (DFG.IsPreservingDef(DA)) {
@@ -343,6 +425,7 @@ void Liveness::computePhiInfo() {
// phi use -> (map: reaching phi -> set of registers defined in between)
std::map<NodeId,std::map<NodeId,RegisterAggr>> PhiUp;
std::vector<NodeId> PhiUQ; // Work list of phis for upward propagation.
+ std::map<NodeId,RegisterAggr> PhiDRs; // Phi -> registers defined by it.
// Go over all phis.
for (NodeAddr<PhiNode*> PhiA : Phis) {
@@ -355,12 +438,15 @@ void Liveness::computePhiInfo() {
// For each def, add to the queue all reached (non-phi) defs.
SetVector<NodeId> DefQ;
NodeSet PhiDefs;
+ RegisterAggr DRs(PRI);
for (NodeAddr<RefNode*> R : PhiRefs) {
if (!DFG.IsRef<NodeAttrs::Def>(R))
continue;
+ DRs.insert(R.Addr->getRegRef(DFG));
DefQ.insert(R.Id);
PhiDefs.insert(R.Id);
}
+ PhiDRs.insert(std::make_pair(PhiA.Id, DRs));
// Collect the super-set of all possible reached uses. This set will
// contain all uses reached from this phi, either directly from the
@@ -377,9 +463,9 @@ void Liveness::computePhiInfo() {
NodeAddr<UseNode*> A = DFG.addr<UseNode*>(UN);
uint16_t F = A.Addr->getFlags();
if ((F & (NodeAttrs::Undef | NodeAttrs::PhiRef)) == 0) {
- RegisterRef R = DFG.normalizeRef(getRestrictedRegRef(A));
+ RegisterRef R = PRI.normalize(A.Addr->getRegRef(DFG));
RealUses[R.Reg].insert({A.Id,R.Mask});
- }
+ }
UN = A.Addr->getSibling();
}
// Visit all reached defs, and add them to the queue. These defs may
@@ -424,17 +510,13 @@ void Liveness::computePhiInfo() {
auto UA = DFG.addr<UseNode*>(I->first);
// Undef flag is checked above.
assert((UA.Addr->getFlags() & NodeAttrs::Undef) == 0);
- RegisterRef R(UI->first, I->second);
+ RegisterRef R(UI->first, I->second);
NodeList RDs = getAllReachingDefs(R, UA);
- if (any_of(RDs, InPhiDefs))
- ++I;
- else
- I = Uses.erase(I);
+ // If none of the reaching defs of R are from this phi, remove this
+ // use of R.
+ I = any_of(RDs, InPhiDefs) ? std::next(I) : Uses.erase(I);
}
- if (Uses.empty())
- UI = RealUses.erase(UI);
- else
- ++UI;
+ UI = Uses.empty() ? RealUses.erase(UI) : std::next(UI);
}
// If this phi reaches some "real" uses, add it to the queue for upward
@@ -452,32 +534,29 @@ void Liveness::computePhiInfo() {
for (auto I : PhiRefs) {
if (!DFG.IsRef<NodeAttrs::Use>(I) || SeenUses.count(I.Id))
continue;
- NodeAddr<UseNode*> UA = I;
-
- // Given a phi use UA, traverse all related phi uses (including UA).
- // The related phi uses may reach different phi nodes or may reach the
- // same phi node. If multiple uses reach the same phi P, the intervening
- // defs must be accumulated for all such uses. To group all such uses
- // into one set, map their node ids to the first use id that reaches P.
- std::map<NodeId,NodeId> FirstUse; // Phi reached up -> first phi use.
-
- for (NodeAddr<UseNode*> VA : DFG.getRelatedRefs(PhiA, UA)) {
- SeenUses.insert(VA.Id);
- RegisterAggr DefRRs(TRI);
- for (NodeAddr<DefNode*> DA : getAllReachingDefs(VA)) {
- if (DA.Addr->getFlags() & NodeAttrs::PhiRef) {
- NodeId RP = DA.Addr->getOwner(DFG).Id;
- NodeId FU = FirstUse.insert({RP,VA.Id}).first->second;
- std::map<NodeId,RegisterAggr> &M = PhiUp[FU];
- auto F = M.find(RP);
- if (F == M.end())
- M.insert(std::make_pair(RP, DefRRs));
- else
- F->second.insert(DefRRs);
- }
- DefRRs.insert(DA.Addr->getRegRef(DFG));
+ NodeAddr<PhiUseNode*> PUA = I;
+ if (PUA.Addr->getReachingDef() == 0)
+ continue;
+
+ RegisterRef UR = PUA.Addr->getRegRef(DFG);
+ NodeList Ds = getAllReachingDefs(UR, PUA, true, false, NoRegs);
+ RegisterAggr DefRRs(PRI);
+
+ for (NodeAddr<DefNode*> D : Ds) {
+ if (D.Addr->getFlags() & NodeAttrs::PhiRef) {
+ NodeId RP = D.Addr->getOwner(DFG).Id;
+ std::map<NodeId,RegisterAggr> &M = PhiUp[PUA.Id];
+ auto F = M.find(RP);
+ if (F == M.end())
+ M.insert(std::make_pair(RP, DefRRs));
+ else
+ F->second.insert(DefRRs);
}
+ DefRRs.insert(D.Addr->getRegRef(DFG));
}
+
+ for (NodeAddr<PhiUseNode*> T : DFG.getRelatedRefs(PhiA, PUA))
+ SeenUses.insert(T.Id);
}
}
@@ -522,7 +601,7 @@ void Liveness::computePhiInfo() {
for (NodeAddr<UseNode*> UA : PUs) {
std::map<NodeId,RegisterAggr> &PUM = PhiUp[UA.Id];
- RegisterRef UR = DFG.normalizeRef(getRestrictedRegRef(UA));
+ RegisterRef UR = PRI.normalize(UA.Addr->getRegRef(DFG));
for (const std::pair<NodeId,RegisterAggr> &P : PUM) {
bool Changed = false;
const RegisterAggr &MidDefs = P.second;
@@ -540,14 +619,19 @@ void Liveness::computePhiInfo() {
// then add (R-MidDefs,U) to RealUseMap[P]
//
for (const std::pair<RegisterId,NodeRefSet> &T : RUM) {
- RegisterRef R = DFG.restrictRef(RegisterRef(T.first), UR);
- if (!R)
+ RegisterRef R(T.first);
+ // The current phi (PA) could be a phi for a regmask. It could
+ // reach a whole variety of uses that are not related to the
+ // specific upward phi (P.first).
+ const RegisterAggr &DRs = PhiDRs.at(P.first);
+ if (!DRs.hasAliasOf(R))
continue;
+ R = DRs.intersectWith(R);
for (std::pair<NodeId,LaneBitmask> V : T.second) {
- RegisterRef S = DFG.restrictRef(RegisterRef(R.Reg, V.second), R);
- if (!S)
+ LaneBitmask M = R.Mask & V.second;
+ if (M.none())
continue;
- if (RegisterRef SS = MidDefs.clearIn(S)) {
+ if (RegisterRef SS = MidDefs.clearIn(RegisterRef(R.Reg, M))) {
NodeRefSet &RS = RealUseMap[P.first][SS.Reg];
Changed |= RS.insert({V.first,SS.Mask}).second;
}
@@ -645,30 +729,43 @@ void Liveness::computeLiveIns() {
if (RUs.empty())
continue;
+ NodeSet SeenUses;
for (auto U : PA.Addr->members_if(DFG.IsRef<NodeAttrs::Use>, DFG)) {
+ if (!SeenUses.insert(U.Id).second)
+ continue;
NodeAddr<PhiUseNode*> PUA = U;
if (PUA.Addr->getReachingDef() == 0)
continue;
- // Mark all reached "real" uses of P as live on exit in the
- // predecessor.
- // Remap all the RUs so that they have a correct reaching def.
+ // Each phi has some set (possibly empty) of reached "real" uses,
+ // that is, uses that are part of the compiled program. Such a use
+ // may be located in some farther block, but following a chain of
+ // reaching defs will eventually lead to this phi.
+ // Any chain of reaching defs may fork at a phi node, but there
+ // will be a path upwards that will lead to this phi. Now, this
+ // chain will need to fork at this phi, since some of the reached
+ // uses may have definitions joining in from multiple predecessors.
+ // For each reached "real" use, identify the set of reaching defs
+ // coming from each predecessor P, and add them to PhiLOX[P].
+ //
auto PrA = DFG.addr<BlockNode*>(PUA.Addr->getPredecessor());
RefMap &LOX = PhiLOX[PrA.Addr->getCode()];
- RegisterRef UR = DFG.normalizeRef(getRestrictedRegRef(PUA));
- for (const std::pair<RegisterId,NodeRefSet> &T : RUs) {
- // Check if T.first aliases UR?
- LaneBitmask M;
- for (std::pair<NodeId,LaneBitmask> P : T.second)
- M |= P.second;
-
- RegisterRef S = DFG.restrictRef(RegisterRef(T.first, M), UR);
- if (!S)
- continue;
- for (NodeAddr<DefNode*> D : getAllReachingDefs(S, PUA))
- LOX[S.Reg].insert({D.Id, S.Mask});
+ for (const std::pair<RegisterId,NodeRefSet> &RS : RUs) {
+ // We need to visit each individual use.
+ for (std::pair<NodeId,LaneBitmask> P : RS.second) {
+ // Create a register ref corresponding to the use, and find
+ // all reaching defs starting from the phi use, and treating
+ // all related shadows as a single use cluster.
+ RegisterRef S(RS.first, P.second);
+ NodeList Ds = getAllReachingDefs(S, PUA, true, false, NoRegs);
+ for (NodeAddr<DefNode*> D : Ds)
+ LOX[S.Reg].insert({D.Id, S.Mask});
+ }
}
+
+ for (NodeAddr<PhiUseNode*> T : DFG.getRelatedRefs(PA, PUA))
+ SeenUses.insert(T.Id);
} // for U : phi uses
} // for P : Phis
} // for B : Blocks
@@ -684,9 +781,7 @@ void Liveness::computeLiveIns() {
traverse(&MF.front(), LiveIn);
// Add function live-ins to the live-in set of the function entry block.
- auto &EntryIn = LiveMap[&MF.front()];
- for (auto I = MRI.livein_begin(), E = MRI.livein_end(); I != E; ++I)
- EntryIn.insert(RegisterRef(I->first));
+ LiveMap[&MF.front()].insert(DFG.getLiveIns());
if (Trace) {
// Dump the liveness map
@@ -702,19 +797,9 @@ void Liveness::computeLiveIns() {
//dbgs() << "\tcomp = " << Print<RegisterAggr>(LiveMap[&B], DFG) << '\n';
LV.clear();
- for (std::pair<RegisterId,LaneBitmask> P : LiveMap[&B]) {
- MCSubRegIndexIterator S(P.first, &TRI);
- if (!S.isValid()) {
- LV.push_back(RegisterRef(P.first));
- continue;
- }
- do {
- LaneBitmask M = TRI.getSubRegIndexLaneMask(S.getSubRegIndex());
- if ((M & P.second).any())
- LV.push_back(RegisterRef(S.getSubReg()));
- ++S;
- } while (S.isValid());
- }
+ const RegisterAggr &LG = LiveMap[&B];
+ for (auto I = LG.rr_begin(), E = LG.rr_end(); I != E; ++I)
+ LV.push_back(*I);
std::sort(LV.begin(), LV.end());
dbgs() << "\tcomp = {";
for (auto I : LV)
@@ -735,9 +820,10 @@ void Liveness::resetLiveIns() {
for (auto I : T)
B.removeLiveIn(I);
// Add the newly computed live-ins.
- auto &LiveIns = LiveMap[&B];
- for (auto I : LiveIns) {
- B.addLiveIn({MCPhysReg(I.first), I.second});
+ const RegisterAggr &LiveIns = LiveMap[&B];
+ for (auto I = LiveIns.rr_begin(), E = LiveIns.rr_end(); I != E; ++I) {
+ RegisterRef R = *I;
+ B.addLiveIn({MCPhysReg(R.Reg), R.Mask});
}
}
}
@@ -791,7 +877,7 @@ void Liveness::resetKills(MachineBasicBlock *B) {
Live.reset(*SR);
}
for (auto &Op : MI->operands()) {
- if (!Op.isReg() || !Op.isUse())
+ if (!Op.isReg() || !Op.isUse() || Op.isUndef())
continue;
unsigned R = Op.getReg();
if (!TargetRegisterInfo::isPhysicalRegister(R))
@@ -803,9 +889,8 @@ void Liveness::resetKills(MachineBasicBlock *B) {
IsLive = true;
break;
}
- if (IsLive)
- continue;
- Op.setIsKill(true);
+ if (!IsLive)
+ Op.setIsKill(true);
for (MCSubRegIterator SR(R, &TRI, true); SR.isValid(); ++SR)
Live.set(*SR);
}
@@ -813,17 +898,6 @@ void Liveness::resetKills(MachineBasicBlock *B) {
}
-RegisterRef Liveness::getRestrictedRegRef(NodeAddr<RefNode*> RA) const {
- assert(DFG.IsRef<NodeAttrs::Use>(RA));
- if (RA.Addr->getFlags() & NodeAttrs::Shadow) {
- NodeId RD = RA.Addr->getReachingDef();
- assert(RD);
- RA = DFG.addr<DefNode*>(RD);
- }
- return RA.Addr->getRegRef(DFG);
-}
-
-
// Helper function to obtain the basic block containing the reaching def
// of the given use.
MachineBasicBlock *Liveness::getBlockWithRef(NodeId RN) const {
@@ -921,7 +995,7 @@ void Liveness::traverse(MachineBasicBlock *B, RefMap &LiveIn) {
// propagated upwards. This only applies to non-preserving defs,
// and to the parts of the register actually covered by those defs.
// (Note that phi defs should always be preserving.)
- RegisterAggr RRs(TRI);
+ RegisterAggr RRs(PRI);
LRef.Mask = OR.second;
if (!DFG.IsPreservingDef(DA)) {
@@ -949,10 +1023,9 @@ void Liveness::traverse(MachineBasicBlock *B, RefMap &LiveIn) {
// registers are not covering LRef. The first def from the
// upward chain will be live.
// Subtract all accumulated defs (RRs) from LRef.
- RegisterAggr L(TRI);
- L.insert(LRef).clear(RRs);
- assert(!L.empty());
- NewDefs.insert({TA.Id,L.begin()->second});
+ RegisterRef T = RRs.clearIn(LRef);
+ assert(T);
+ NewDefs.insert({TA.Id,T.Mask});
break;
}
@@ -983,7 +1056,7 @@ void Liveness::traverse(MachineBasicBlock *B, RefMap &LiveIn) {
for (NodeAddr<UseNode*> UA : IA.Addr->members_if(DFG.IsUse, DFG)) {
if (UA.Addr->getFlags() & NodeAttrs::Undef)
continue;
- RegisterRef RR = DFG.normalizeRef(UA.Addr->getRegRef(DFG));
+ RegisterRef RR = PRI.normalize(UA.Addr->getRegRef(DFG));
for (NodeAddr<DefNode*> D : getAllReachingDefs(UA))
if (getBlockWithRef(D.Id) != B)
LiveIn[RR.Reg].insert({D.Id,RR.Mask});
diff --git a/lib/Target/Hexagon/RDFLiveness.h b/lib/Target/Hexagon/RDFLiveness.h
index c88396f36bbb..6f2615b7c4f3 100644
--- a/lib/Target/Hexagon/RDFLiveness.h
+++ b/lib/Target/Hexagon/RDFLiveness.h
@@ -33,7 +33,7 @@ namespace rdf {
// This is really a std::map, except that it provides a non-trivial
// default constructor to the element accessed via [].
struct LiveMapType {
- LiveMapType(const TargetRegisterInfo &tri) : Empty(tri) {}
+ LiveMapType(const PhysicalRegisterInfo &pri) : Empty(pri) {}
RegisterAggr &operator[] (MachineBasicBlock *B) {
return Map.emplace(B, Empty).first->second;
@@ -49,26 +49,31 @@ namespace rdf {
typedef std::map<RegisterId,NodeRefSet> RefMap;
Liveness(MachineRegisterInfo &mri, const DataFlowGraph &g)
- : DFG(g), TRI(g.getTRI()), MDT(g.getDT()), MDF(g.getDF()),
- MRI(mri), LiveMap(g.getTRI()), Empty(), NoRegs(g.getTRI()),
- Trace(false) {}
+ : DFG(g), TRI(g.getTRI()), PRI(g.getPRI()), MDT(g.getDT()),
+ MDF(g.getDF()), LiveMap(g.getPRI()), Empty(),
+ NoRegs(g.getPRI()), Trace(false) {}
NodeList getAllReachingDefs(RegisterRef RefRR, NodeAddr<RefNode*> RefA,
- bool FullChain, const RegisterAggr &DefRRs);
+ bool TopShadows, bool FullChain, const RegisterAggr &DefRRs);
NodeList getAllReachingDefs(NodeAddr<RefNode*> RefA) {
- return getAllReachingDefs(RefA.Addr->getRegRef(DFG), RefA, false, NoRegs);
+ return getAllReachingDefs(RefA.Addr->getRegRef(DFG), RefA, false,
+ false, NoRegs);
}
NodeList getAllReachingDefs(RegisterRef RefRR, NodeAddr<RefNode*> RefA) {
- return getAllReachingDefs(RefRR, RefA, false, NoRegs);
+ return getAllReachingDefs(RefRR, RefA, false, false, NoRegs);
}
- NodeSet getAllReachingDefsRec(RegisterRef RefRR, NodeAddr<RefNode*> RefA,
- NodeSet &Visited, const NodeSet &Defs);
NodeSet getAllReachedUses(RegisterRef RefRR, NodeAddr<DefNode*> DefA,
const RegisterAggr &DefRRs);
NodeSet getAllReachedUses(RegisterRef RefRR, NodeAddr<DefNode*> DefA) {
return getAllReachedUses(RefRR, DefA, NoRegs);
}
+ std::pair<NodeSet,bool> getAllReachingDefsRec(RegisterRef RefRR,
+ NodeAddr<RefNode*> RefA, NodeSet &Visited, const NodeSet &Defs);
+
+ NodeAddr<RefNode*> getNearestAliasedRef(RegisterRef RefRR,
+ NodeAddr<InstrNode*> IA);
+
LiveMapType &getLiveMap() { return LiveMap; }
const LiveMapType &getLiveMap() const { return LiveMap; }
const RefMap &getRealUses(NodeId P) const {
@@ -87,9 +92,9 @@ namespace rdf {
private:
const DataFlowGraph &DFG;
const TargetRegisterInfo &TRI;
+ const PhysicalRegisterInfo &PRI;
const MachineDominatorTree &MDT;
const MachineDominanceFrontier &MDF;
- MachineRegisterInfo &MRI;
LiveMapType LiveMap;
const RefMap Empty;
const RegisterAggr NoRegs;
@@ -121,12 +126,13 @@ namespace rdf {
// the dominator tree), create a map: block -> set of uses live on exit.
std::map<MachineBasicBlock*,RefMap> PhiLOX;
- bool isRestrictedToRef(NodeAddr<InstrNode*> IA, NodeAddr<RefNode*> RA,
- RegisterRef RR) const;
- RegisterRef getRestrictedRegRef(NodeAddr<RefNode*> RA) const;
MachineBasicBlock *getBlockWithRef(NodeId RN) const;
void traverse(MachineBasicBlock *B, RefMap &LiveIn);
void emptify(RefMap &M);
+
+ std::pair<NodeSet,bool> getAllReachingDefsRecImpl(RegisterRef RefRR,
+ NodeAddr<RefNode*> RefA, NodeSet &Visited, const NodeSet &Defs,
+ unsigned Nest, unsigned MaxNest);
};
} // namespace rdf
} // namespace llvm
diff --git a/lib/Target/Hexagon/RDFRegisters.cpp b/lib/Target/Hexagon/RDFRegisters.cpp
new file mode 100644
index 000000000000..5c5496a548af
--- /dev/null
+++ b/lib/Target/Hexagon/RDFRegisters.cpp
@@ -0,0 +1,368 @@
+//===--- RDFRegisters.cpp ---------------------------------------*- C++ -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#include "RDFRegisters.h"
+#include "llvm/ADT/BitVector.h"
+#include "llvm/CodeGen/MachineFunction.h"
+
+using namespace llvm;
+using namespace rdf;
+
+PhysicalRegisterInfo::PhysicalRegisterInfo(const TargetRegisterInfo &tri,
+ const MachineFunction &mf)
+ : TRI(tri) {
+ RegInfos.resize(TRI.getNumRegs());
+
+ BitVector BadRC(TRI.getNumRegs());
+ for (const TargetRegisterClass *RC : TRI.regclasses()) {
+ for (MCPhysReg R : *RC) {
+ RegInfo &RI = RegInfos[R];
+ if (RI.RegClass != nullptr && !BadRC[R]) {
+ if (RC->LaneMask != RI.RegClass->LaneMask) {
+ BadRC.set(R);
+ RI.RegClass = nullptr;
+ }
+ } else
+ RI.RegClass = RC;
+ }
+ }
+
+ UnitInfos.resize(TRI.getNumRegUnits());
+
+ for (uint32_t U = 0, NU = TRI.getNumRegUnits(); U != NU; ++U) {
+ if (UnitInfos[U].Reg != 0)
+ continue;
+ MCRegUnitRootIterator R(U, &TRI);
+ assert(R.isValid());
+ RegisterId F = *R;
+ ++R;
+ if (R.isValid()) {
+ UnitInfos[U].Mask = LaneBitmask::getAll();
+ UnitInfos[U].Reg = F;
+ } else {
+ for (MCRegUnitMaskIterator I(F, &TRI); I.isValid(); ++I) {
+ std::pair<uint32_t,LaneBitmask> P = *I;
+ UnitInfo &UI = UnitInfos[P.first];
+ UI.Reg = F;
+ if (P.second.any()) {
+ UI.Mask = P.second;
+ } else {
+ if (const TargetRegisterClass *RC = RegInfos[F].RegClass)
+ UI.Mask = RC->LaneMask;
+ else
+ UI.Mask = LaneBitmask::getAll();
+ }
+ }
+ }
+ }
+
+ for (const uint32_t *RM : TRI.getRegMasks())
+ RegMasks.insert(RM);
+ for (const MachineBasicBlock &B : mf)
+ for (const MachineInstr &In : B)
+ for (const MachineOperand &Op : In.operands())
+ if (Op.isRegMask())
+ RegMasks.insert(Op.getRegMask());
+}
+
+RegisterRef PhysicalRegisterInfo::normalize(RegisterRef RR) const {
+ return RR;
+}
+
+std::set<RegisterId> PhysicalRegisterInfo::getAliasSet(RegisterId Reg) const {
+  // Do not include Reg itself in the alias set.
+ std::set<RegisterId> AS;
+ assert(isRegMaskId(Reg) || TargetRegisterInfo::isPhysicalRegister(Reg));
+ if (isRegMaskId(Reg)) {
+ // XXX SLOW
+ const uint32_t *MB = getRegMaskBits(Reg);
+ for (unsigned i = 1, e = TRI.getNumRegs(); i != e; ++i) {
+ if (MB[i/32] & (1u << (i%32)))
+ continue;
+ AS.insert(i);
+ }
+ for (const uint32_t *RM : RegMasks) {
+ RegisterId MI = getRegMaskId(RM);
+ if (MI != Reg && aliasMM(RegisterRef(Reg), RegisterRef(MI)))
+ AS.insert(MI);
+ }
+ return AS;
+ }
+
+ for (MCRegAliasIterator AI(Reg, &TRI, false); AI.isValid(); ++AI)
+ AS.insert(*AI);
+ for (const uint32_t *RM : RegMasks) {
+ RegisterId MI = getRegMaskId(RM);
+ if (aliasRM(RegisterRef(Reg), RegisterRef(MI)))
+ AS.insert(MI);
+ }
+ return AS;
+}
+
+bool PhysicalRegisterInfo::aliasRR(RegisterRef RA, RegisterRef RB) const {
+ assert(TargetRegisterInfo::isPhysicalRegister(RA.Reg));
+ assert(TargetRegisterInfo::isPhysicalRegister(RB.Reg));
+
+ MCRegUnitMaskIterator UMA(RA.Reg, &TRI);
+ MCRegUnitMaskIterator UMB(RB.Reg, &TRI);
+  // Reg units are returned in numerical order.
+ while (UMA.isValid() && UMB.isValid()) {
+ // Skip units that are masked off in RA.
+ std::pair<RegisterId,LaneBitmask> PA = *UMA;
+ if (PA.second.any() && (PA.second & RA.Mask).none()) {
+ ++UMA;
+ continue;
+ }
+ // Skip units that are masked off in RB.
+ std::pair<RegisterId,LaneBitmask> PB = *UMB;
+ if (PB.second.any() && (PB.second & RB.Mask).none()) {
+ ++UMB;
+ continue;
+ }
+
+ if (PA.first == PB.first)
+ return true;
+ if (PA.first < PB.first)
+ ++UMA;
+ else if (PB.first < PA.first)
+ ++UMB;
+ }
+ return false;
+}
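
Stripped of the lane-mask filtering, the loop above is a plain intersection test over two ascending sequences of register units. The generic shape, for reference (a self-contained sketch, not LLVM API):

#include <vector>

bool intersects(const std::vector<unsigned> &A,
                const std::vector<unsigned> &B) {
  size_t i = 0, j = 0;
  while (i < A.size() && j < B.size()) {
    if (A[i] == B[j]) return true;
    if (A[i] < B[j]) ++i; else ++j;   // advance the smaller element
  }
  return false;
}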
+
+bool PhysicalRegisterInfo::aliasRM(RegisterRef RR, RegisterRef RM) const {
+ assert(TargetRegisterInfo::isPhysicalRegister(RR.Reg) && isRegMaskId(RM.Reg));
+ const uint32_t *MB = getRegMaskBits(RM.Reg);
+ bool Preserved = MB[RR.Reg/32] & (1u << (RR.Reg%32));
+ // If the lane mask information is "full", e.g. when the given lane mask
+ // is a superset of the lane mask from the register class, check the regmask
+ // bit directly.
+ if (RR.Mask == LaneBitmask::getAll())
+ return !Preserved;
+ const TargetRegisterClass *RC = RegInfos[RR.Reg].RegClass;
+ if (RC != nullptr && (RR.Mask & RC->LaneMask) == RC->LaneMask)
+ return !Preserved;
+
+ // Otherwise, check all subregisters whose lane mask overlaps the given
+ // mask. For each such register, if it is preserved by the regmask, then
+  // clear the corresponding bits in the given mask. If, at the end, all
+  // bits have been cleared, the register does not alias the regmask
+  // (i.e. it is preserved by it).
+ LaneBitmask M = RR.Mask;
+ for (MCSubRegIndexIterator SI(RR.Reg, &TRI); SI.isValid(); ++SI) {
+ LaneBitmask SM = TRI.getSubRegIndexLaneMask(SI.getSubRegIndex());
+ if ((SM & RR.Mask).none())
+ continue;
+ unsigned SR = SI.getSubReg();
+ if (!(MB[SR/32] & (1u << (SR%32))))
+ continue;
+ // The subregister SR is preserved.
+ M &= ~SM;
+ if (M.none())
+ return false;
+ }
+
+ return true;
+}
+
+bool PhysicalRegisterInfo::aliasMM(RegisterRef RM, RegisterRef RN) const {
+ assert(isRegMaskId(RM.Reg) && isRegMaskId(RN.Reg));
+ unsigned NumRegs = TRI.getNumRegs();
+ const uint32_t *BM = getRegMaskBits(RM.Reg);
+ const uint32_t *BN = getRegMaskBits(RN.Reg);
+
+ for (unsigned w = 0, nw = NumRegs/32; w != nw; ++w) {
+ // Intersect the negations of both words. Disregard reg=0,
+ // i.e. 0th bit in the 0th word.
+ uint32_t C = ~BM[w] & ~BN[w];
+ if (w == 0)
+ C &= ~1;
+ if (C)
+ return true;
+ }
+
+ // Check the remaining registers in the last word.
+ unsigned TailRegs = NumRegs % 32;
+ if (TailRegs == 0)
+ return false;
+ unsigned TW = NumRegs / 32;
+ uint32_t TailMask = (1u << TailRegs) - 1;
+ if (~BM[TW] & ~BN[TW] & TailMask)
+ return true;
+
+ return false;
+}
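
A worked instance of the word arithmetic above, assuming a target with NumRegs = 70:

#include <cstdint>

unsigned NumRegs = 70;
unsigned nw = NumRegs / 32;                 // 2 full words in the loop
unsigned TailRegs = NumRegs % 32;           // 6 registers left over
uint32_t TailMask = (1u << TailRegs) - 1;   // 0x3f, bits 0..5 of word 2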
+
+
+bool RegisterAggr::hasAliasOf(RegisterRef RR) const {
+ if (PhysicalRegisterInfo::isRegMaskId(RR.Reg)) {
+ // XXX SLOW
+ const uint32_t *MB = PRI.getRegMaskBits(RR.Reg);
+ for (unsigned i = 1, e = PRI.getTRI().getNumRegs(); i != e; ++i) {
+ if (MB[i/32] & (1u << (i%32)))
+ continue;
+ if (hasAliasOf(RegisterRef(i, LaneBitmask::getAll())))
+ return true;
+ }
+ return false;
+ }
+
+ for (MCRegUnitMaskIterator U(RR.Reg, &PRI.getTRI()); U.isValid(); ++U) {
+ std::pair<uint32_t,LaneBitmask> P = *U;
+ if (P.second.none() || (P.second & RR.Mask).any())
+ if (Units.test(P.first))
+ return true;
+ }
+ return false;
+}
+
+bool RegisterAggr::hasCoverOf(RegisterRef RR) const {
+ if (PhysicalRegisterInfo::isRegMaskId(RR.Reg)) {
+ // XXX SLOW
+ const uint32_t *MB = PRI.getRegMaskBits(RR.Reg);
+ for (unsigned i = 1, e = PRI.getTRI().getNumRegs(); i != e; ++i) {
+ if (MB[i/32] & (1u << (i%32)))
+ continue;
+ if (!hasCoverOf(RegisterRef(i, LaneBitmask::getAll())))
+ return false;
+ }
+ return true;
+ }
+
+ for (MCRegUnitMaskIterator U(RR.Reg, &PRI.getTRI()); U.isValid(); ++U) {
+ std::pair<uint32_t,LaneBitmask> P = *U;
+ if (P.second.none() || (P.second & RR.Mask).any())
+ if (!Units.test(P.first))
+ return false;
+ }
+ return true;
+}
+
+RegisterAggr &RegisterAggr::insert(RegisterRef RR) {
+ if (PhysicalRegisterInfo::isRegMaskId(RR.Reg)) {
+ BitVector PU(PRI.getTRI().getNumRegUnits()); // Preserved units.
+ const uint32_t *MB = PRI.getRegMaskBits(RR.Reg);
+ for (unsigned i = 1, e = PRI.getTRI().getNumRegs(); i != e; ++i) {
+ if (!(MB[i/32] & (1u << (i%32))))
+ continue;
+ for (MCRegUnitIterator U(i, &PRI.getTRI()); U.isValid(); ++U)
+ PU.set(*U);
+ }
+ Units |= PU.flip();
+ return *this;
+ }
+
+ for (MCRegUnitMaskIterator U(RR.Reg, &PRI.getTRI()); U.isValid(); ++U) {
+ std::pair<uint32_t,LaneBitmask> P = *U;
+ if (P.second.none() || (P.second & RR.Mask).any())
+ Units.set(P.first);
+ }
+ return *this;
+}
+
+RegisterAggr &RegisterAggr::insert(const RegisterAggr &RG) {
+ Units |= RG.Units;
+ return *this;
+}
+
+RegisterAggr &RegisterAggr::intersect(RegisterRef RR) {
+ return intersect(RegisterAggr(PRI).insert(RR));
+}
+
+RegisterAggr &RegisterAggr::intersect(const RegisterAggr &RG) {
+ Units &= RG.Units;
+ return *this;
+}
+
+RegisterAggr &RegisterAggr::clear(RegisterRef RR) {
+ return clear(RegisterAggr(PRI).insert(RR));
+}
+
+RegisterAggr &RegisterAggr::clear(const RegisterAggr &RG) {
+ Units.reset(RG.Units);
+ return *this;
+}
+
+RegisterRef RegisterAggr::intersectWith(RegisterRef RR) const {
+ RegisterAggr T(PRI);
+ T.insert(RR).intersect(*this);
+ if (T.empty())
+ return RegisterRef();
+ RegisterRef NR = T.makeRegRef();
+ assert(NR);
+ return NR;
+}
+
+RegisterRef RegisterAggr::clearIn(RegisterRef RR) const {
+ return RegisterAggr(PRI).insert(RR).clear(*this).makeRegRef();
+}
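
A usage sketch for the unit-set algebra above; Whole and LoHalf are hypothetical register ids where LoHalf is a subregister of Whole:

RegisterAggr Defs(PRI);
Defs.insert(RegisterRef(LoHalf));             // only the low half defined
assert(Defs.hasAliasOf(RegisterRef(Whole)));  // overlap exists
assert(!Defs.hasCoverOf(RegisterRef(Whole))); // but Whole is not covered
RegisterRef Rest = Defs.clearIn(RegisterRef(Whole));
// Rest is the uncovered part of Whole (the high half); an empty
// result would convert to false.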
+
+RegisterRef RegisterAggr::makeRegRef() const {
+ int U = Units.find_first();
+ if (U < 0)
+ return RegisterRef();
+
+ auto AliasedRegs = [this] (uint32_t Unit, BitVector &Regs) {
+ for (MCRegUnitRootIterator R(Unit, &PRI.getTRI()); R.isValid(); ++R)
+ for (MCSuperRegIterator S(*R, &PRI.getTRI(), true); S.isValid(); ++S)
+ Regs.set(*S);
+ };
+
+ // Find the set of all registers that are aliased to all the units
+ // in this aggregate.
+
+ // Get all the registers aliased to the first unit in the bit vector.
+ BitVector Regs(PRI.getTRI().getNumRegs());
+ AliasedRegs(U, Regs);
+ U = Units.find_next(U);
+
+  // For each remaining unit, intersect the accumulated set with the set
+  // of all registers aliased to that unit.
+ while (U >= 0) {
+ BitVector AR(PRI.getTRI().getNumRegs());
+ AliasedRegs(U, AR);
+ Regs &= AR;
+ U = Units.find_next(U);
+ }
+
+ // If there is at least one register remaining, pick the first one,
+ // and consolidate the masks of all of its units contained in this
+ // aggregate.
+
+ int F = Regs.find_first();
+ if (F <= 0)
+ return RegisterRef();
+
+ LaneBitmask M;
+ for (MCRegUnitMaskIterator I(F, &PRI.getTRI()); I.isValid(); ++I) {
+ std::pair<uint32_t,LaneBitmask> P = *I;
+ if (Units.test(P.first))
+ M |= P.second.none() ? LaneBitmask::getAll() : P.second;
+ }
+ return RegisterRef(F, M);
+}
+
+void RegisterAggr::print(raw_ostream &OS) const {
+ OS << '{';
+ for (int U = Units.find_first(); U >= 0; U = Units.find_next(U))
+ OS << ' ' << PrintRegUnit(U, &PRI.getTRI());
+ OS << " }";
+}
+
+RegisterAggr::rr_iterator::rr_iterator(const RegisterAggr &RG,
+ bool End)
+ : Owner(&RG) {
+ for (int U = RG.Units.find_first(); U >= 0; U = RG.Units.find_next(U)) {
+ RegisterRef R = RG.PRI.getRefForUnit(U);
+ Masks[R.Reg] |= R.Mask;
+ }
+ Pos = End ? Masks.end() : Masks.begin();
+ Index = End ? Masks.size() : 0;
+}
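
The new rr_iterator consolidates units back into at most one RegisterRef per register, with the lane masks of all contained units OR-ed together. The intended iteration pattern, as used in RDFLiveness.cpp above (LG names an arbitrary RegisterAggr):

for (auto I = LG.rr_begin(), E = LG.rr_end(); I != E; ++I) {
  RegisterRef RR = *I;     // one consolidated (Reg, Mask) per register
  // ... use RR.Reg and RR.Mask ...
}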
+
diff --git a/lib/Target/Hexagon/RDFRegisters.h b/lib/Target/Hexagon/RDFRegisters.h
new file mode 100644
index 000000000000..4b35c85a6b62
--- /dev/null
+++ b/lib/Target/Hexagon/RDFRegisters.h
@@ -0,0 +1,209 @@
+//===--- RDFRegisters.h -----------------------------------------*- C++ -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_TARGET_HEXAGON_RDFREGISTERS_H
+#define LLVM_LIB_TARGET_HEXAGON_RDFREGISTERS_H
+
+#include "llvm/ADT/BitVector.h"
+#include "llvm/Target/TargetRegisterInfo.h"
+
+#include <set>
+#include <unordered_map>
+#include <vector>
+
+namespace llvm {
+namespace rdf {
+
+ typedef uint32_t RegisterId;
+
+ // Template class for a map translating uint32_t into arbitrary types.
+ // The map will act like an indexed set: upon insertion of a new object,
+ // it will automatically assign a new index to it. Index of 0 is treated
+ // as invalid and is never allocated.
+ template <typename T, unsigned N = 32>
+ struct IndexedSet {
+ IndexedSet() : Map() { Map.reserve(N); }
+
+ T get(uint32_t Idx) const {
+ // Index Idx corresponds to Map[Idx-1].
+ assert(Idx != 0 && !Map.empty() && Idx-1 < Map.size());
+ return Map[Idx-1];
+ }
+
+ uint32_t insert(T Val) {
+ // Linear search.
+ auto F = llvm::find(Map, Val);
+ if (F != Map.end())
+ return F - Map.begin() + 1;
+ Map.push_back(Val);
+ return Map.size(); // Return actual_index + 1.
+ }
+
+ uint32_t find(T Val) const {
+ auto F = llvm::find(Map, Val);
+ assert(F != Map.end());
+ return F - Map.begin() + 1;
+ }
+
+ typedef typename std::vector<T>::const_iterator const_iterator;
+ const_iterator begin() const { return Map.begin(); }
+ const_iterator end() const { return Map.end(); }
+
+ private:
+ std::vector<T> Map;
+ };
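
A quick sketch of the 1-based indexing contract; note that find() now also returns the 1-based index, fixing the off-by-one in the copy removed from RDFGraph.h above:

IndexedSet<int> S;
uint32_t A = S.insert(42);   // first element -> index 1
uint32_t B = S.insert(77);   // -> index 2
assert(S.insert(42) == A);   // duplicates map to the existing index
assert(S.find(77) == B);     // find() uses the same 1-based scheme
assert(S.get(B) == 77);      // get(0) would assert: 0 is invalid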
+
+ struct RegisterRef {
+ RegisterId Reg = 0;
+ LaneBitmask Mask = LaneBitmask::getNone();
+
+ RegisterRef() = default;
+ explicit RegisterRef(RegisterId R, LaneBitmask M = LaneBitmask::getAll())
+ : Reg(R), Mask(R != 0 ? M : LaneBitmask::getNone()) {}
+
+ operator bool() const {
+ return Reg != 0 && Mask.any();
+ }
+ bool operator== (const RegisterRef &RR) const {
+ return Reg == RR.Reg && Mask == RR.Mask;
+ }
+ bool operator!= (const RegisterRef &RR) const {
+ return !operator==(RR);
+ }
+ bool operator< (const RegisterRef &RR) const {
+ return Reg < RR.Reg || (Reg == RR.Reg && Mask < RR.Mask);
+ }
+ };
+
+
+ struct PhysicalRegisterInfo {
+ PhysicalRegisterInfo(const TargetRegisterInfo &tri,
+ const MachineFunction &mf);
+
+ static bool isRegMaskId(RegisterId R) {
+ return TargetRegisterInfo::isStackSlot(R);
+ }
+ RegisterId getRegMaskId(const uint32_t *RM) const {
+ return TargetRegisterInfo::index2StackSlot(RegMasks.find(RM));
+ }
+ const uint32_t *getRegMaskBits(RegisterId R) const {
+ return RegMasks.get(TargetRegisterInfo::stackSlot2Index(R));
+ }
+ RegisterRef normalize(RegisterRef RR) const;
+
+ bool alias(RegisterRef RA, RegisterRef RB) const {
+ if (!isRegMaskId(RA.Reg))
+ return !isRegMaskId(RB.Reg) ? aliasRR(RA, RB) : aliasRM(RA, RB);
+ return !isRegMaskId(RB.Reg) ? aliasRM(RB, RA) : aliasMM(RA, RB);
+ }
+ std::set<RegisterId> getAliasSet(RegisterId Reg) const;
+
+ RegisterRef getRefForUnit(uint32_t U) const {
+ return RegisterRef(UnitInfos[U].Reg, UnitInfos[U].Mask);
+ }
+
+ const TargetRegisterInfo &getTRI() const { return TRI; }
+
+ private:
+ struct RegInfo {
+ const TargetRegisterClass *RegClass = nullptr;
+ };
+ struct UnitInfo {
+ RegisterId Reg = 0;
+ LaneBitmask Mask;
+ };
+
+ const TargetRegisterInfo &TRI;
+ std::vector<RegInfo> RegInfos;
+ std::vector<UnitInfo> UnitInfos;
+ IndexedSet<const uint32_t*> RegMasks;
+
+ bool aliasRR(RegisterRef RA, RegisterRef RB) const;
+ bool aliasRM(RegisterRef RR, RegisterRef RM) const;
+ bool aliasMM(RegisterRef RM, RegisterRef RN) const;
+ };
+
+
+ struct RegisterAggr {
+ RegisterAggr(const PhysicalRegisterInfo &pri)
+ : Units(pri.getTRI().getNumRegUnits()), PRI(pri) {}
+ RegisterAggr(const RegisterAggr &RG) = default;
+
+ bool empty() const { return Units.empty(); }
+ bool hasAliasOf(RegisterRef RR) const;
+ bool hasCoverOf(RegisterRef RR) const;
+ static bool isCoverOf(RegisterRef RA, RegisterRef RB,
+ const PhysicalRegisterInfo &PRI) {
+ return RegisterAggr(PRI).insert(RA).hasCoverOf(RB);
+ }
+
+ RegisterAggr &insert(RegisterRef RR);
+ RegisterAggr &insert(const RegisterAggr &RG);
+ RegisterAggr &intersect(RegisterRef RR);
+ RegisterAggr &intersect(const RegisterAggr &RG);
+ RegisterAggr &clear(RegisterRef RR);
+ RegisterAggr &clear(const RegisterAggr &RG);
+
+ RegisterRef intersectWith(RegisterRef RR) const;
+ RegisterRef clearIn(RegisterRef RR) const;
+ RegisterRef makeRegRef() const;
+
+ void print(raw_ostream &OS) const;
+
+ struct rr_iterator {
+ typedef std::map<RegisterId,LaneBitmask> MapType;
+ private:
+ MapType Masks;
+ MapType::iterator Pos;
+ unsigned Index;
+ const RegisterAggr *Owner;
+ public:
+ rr_iterator(const RegisterAggr &RG, bool End);
+ RegisterRef operator*() const {
+ return RegisterRef(Pos->first, Pos->second);
+ }
+ rr_iterator &operator++() {
+ ++Pos;
+ ++Index;
+ return *this;
+ }
+ bool operator==(const rr_iterator &I) const {
+ assert(Owner == I.Owner);
+ return Index == I.Index;
+ }
+ bool operator!=(const rr_iterator &I) const {
+ return !(*this == I);
+ }
+ };
+
+ rr_iterator rr_begin() const {
+ return rr_iterator(*this, false);
+ }
+ rr_iterator rr_end() const {
+ return rr_iterator(*this, true);
+ }
+
+ private:
+ BitVector Units;
+ const PhysicalRegisterInfo &PRI;
+ };
+
+
+ // Optionally print the lane mask, if it is not ~0.
+ struct PrintLaneMaskOpt {
+ PrintLaneMaskOpt(LaneBitmask M) : Mask(M) {}
+ LaneBitmask Mask;
+ };
+ raw_ostream &operator<< (raw_ostream &OS, const PrintLaneMaskOpt &P);
+
+} // namespace rdf
+} // namespace llvm
+
+#endif
+
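RDFRegisters.h above defines IndexedSet with a 1-based contract: insert() hands out stable indices starting at 1, re-inserting an existing value returns its old index, and index 0 is reserved as invalid (which is what lets a RegisterId of 0 mean "no register"). A small self-contained stand-in demonstrating just that contract (simplified; the real class lives in llvm::rdf and uses llvm::find):

    #include <algorithm>
    #include <cassert>
    #include <cstdint>
    #include <string>
    #include <vector>

    // Simplified stand-in for rdf::IndexedSet (illustration only).
    template <typename T> struct SimpleIndexedSet {
      uint32_t insert(const T &Val) {
        auto F = std::find(Map.begin(), Map.end(), Val);
        if (F != Map.end())
          return uint32_t(F - Map.begin()) + 1; // existing 1-based index
        Map.push_back(Val);
        return uint32_t(Map.size());            // new 1-based index
      }
      const T &get(uint32_t Idx) const { return Map[Idx - 1]; }
    private:
      std::vector<T> Map;
    };

    int main() {
      SimpleIndexedSet<std::string> S;
      uint32_t A = S.insert("mask0"); // 1: index 0 is never handed out
      uint32_t B = S.insert("mask1"); // 2
      assert(S.insert("mask0") == A); // duplicates return the old index
      assert(S.get(B) == "mask1");
    }
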
diff --git a/lib/Target/Lanai/AsmParser/LanaiAsmParser.cpp b/lib/Target/Lanai/AsmParser/LanaiAsmParser.cpp
index 57ead973b56e..1d6c07974beb 100644
--- a/lib/Target/Lanai/AsmParser/LanaiAsmParser.cpp
+++ b/lib/Target/Lanai/AsmParser/LanaiAsmParser.cpp
@@ -1096,7 +1096,7 @@ StringRef LanaiAsmParser::splitMnemonic(StringRef Name, SMLoc NameLoc,
return Mnemonic;
}
-bool IsMemoryAssignmentError(const OperandVector &Operands) {
+static bool IsMemoryAssignmentError(const OperandVector &Operands) {
// Detects if a memory operation has an erroneous base register modification.
// Memory operations are detected by matching the types of operands.
//
diff --git a/lib/Target/Lanai/InstPrinter/CMakeLists.txt b/lib/Target/Lanai/InstPrinter/CMakeLists.txt
index 6badb1c98a6d..7f76b895e6ec 100644
--- a/lib/Target/Lanai/InstPrinter/CMakeLists.txt
+++ b/lib/Target/Lanai/InstPrinter/CMakeLists.txt
@@ -1,3 +1,3 @@
-add_llvm_library(LLVMLanaiInstPrinter
+add_llvm_library(LLVMLanaiAsmPrinter
LanaiInstPrinter.cpp
)
diff --git a/lib/Target/Lanai/InstPrinter/LLVMBuild.txt b/lib/Target/Lanai/InstPrinter/LLVMBuild.txt
index eed9a587d14a..8d9768c6017b 100644
--- a/lib/Target/Lanai/InstPrinter/LLVMBuild.txt
+++ b/lib/Target/Lanai/InstPrinter/LLVMBuild.txt
@@ -17,7 +17,7 @@
[component_0]
type = Library
-name = LanaiInstPrinter
+name = LanaiAsmPrinter
parent = Lanai
required_libraries = MC Support
add_to_library_groups = Lanai
diff --git a/lib/Target/Lanai/LLVMBuild.txt b/lib/Target/Lanai/LLVMBuild.txt
index 798fc351bd70..cb91ffb61a98 100644
--- a/lib/Target/Lanai/LLVMBuild.txt
+++ b/lib/Target/Lanai/LLVMBuild.txt
@@ -36,7 +36,7 @@ required_libraries =
LanaiAsmParser
LanaiDesc
LanaiInfo
- LanaiInstPrinter
+ LanaiAsmPrinter
MC
SelectionDAG
Support
diff --git a/lib/Target/Lanai/LanaiInstrInfo.cpp b/lib/Target/Lanai/LanaiInstrInfo.cpp
index fcd5da876b15..a7c9a7a7f280 100644
--- a/lib/Target/Lanai/LanaiInstrInfo.cpp
+++ b/lib/Target/Lanai/LanaiInstrInfo.cpp
@@ -518,7 +518,7 @@ LanaiInstrInfo::optimizeSelect(MachineInstr &MI,
const MCInstrDesc &DefDesc = DefMI->getDesc();
for (unsigned i = 1, e = DefDesc.getNumOperands();
i != e && !DefDesc.OpInfo[i].isPredicate(); ++i)
- NewMI.addOperand(DefMI->getOperand(i));
+ NewMI.add(DefMI->getOperand(i));
unsigned CondCode = MI.getOperand(3).getImm();
if (Invert)
@@ -531,7 +531,7 @@ LanaiInstrInfo::optimizeSelect(MachineInstr &MI,
// register operand tied to the first def. The tie makes the register
// allocator ensure the FalseReg is allocated the same register as operand 0.
FalseReg.setImplicit();
- NewMI.addOperand(FalseReg);
+ NewMI.add(FalseReg);
NewMI->tieOperands(0, NewMI->getNumOperands() - 1);
// Update SeenMIs set: register newly created MI and erase removed DefMI.
diff --git a/lib/Target/Lanai/LanaiMCInstLower.cpp b/lib/Target/Lanai/LanaiMCInstLower.cpp
index 39c633578d43..90ede6566acf 100644
--- a/lib/Target/Lanai/LanaiMCInstLower.cpp
+++ b/lib/Target/Lanai/LanaiMCInstLower.cpp
@@ -130,7 +130,7 @@ void LanaiMCInstLower::Lower(const MachineInstr *MI, MCInst &OutMI) const {
MCOp = LowerSymbolOperand(MO, GetConstantPoolIndexSymbol(MO));
break;
default:
- MI->dump();
+ MI->print(errs());
llvm_unreachable("unknown operand type");
}
diff --git a/lib/Target/Lanai/MCTargetDesc/LLVMBuild.txt b/lib/Target/Lanai/MCTargetDesc/LLVMBuild.txt
index 8070dbabb1a6..05e52ccc18d7 100644
--- a/lib/Target/Lanai/MCTargetDesc/LLVMBuild.txt
+++ b/lib/Target/Lanai/MCTargetDesc/LLVMBuild.txt
@@ -19,5 +19,5 @@
type = Library
name = LanaiDesc
parent = Lanai
-required_libraries = LanaiInfo LanaiInstPrinter MC MCDisassembler Support
+required_libraries = LanaiInfo LanaiAsmPrinter MC MCDisassembler Support
add_to_library_groups = Lanai
diff --git a/lib/Target/Lanai/MCTargetDesc/LanaiAsmBackend.cpp b/lib/Target/Lanai/MCTargetDesc/LanaiAsmBackend.cpp
index a04fe8112fb9..0ef1401ef531 100644
--- a/lib/Target/Lanai/MCTargetDesc/LanaiAsmBackend.cpp
+++ b/lib/Target/Lanai/MCTargetDesc/LanaiAsmBackend.cpp
@@ -50,7 +50,7 @@ public:
: MCAsmBackend(), OSType(OST) {}
void applyFixup(const MCFixup &Fixup, char *Data, unsigned DataSize,
- uint64_t Value, bool IsPCRel) const override;
+ uint64_t Value, bool IsPCRel, MCContext &Ctx) const override;
MCObjectWriter *createObjectWriter(raw_pwrite_stream &OS) const override;
@@ -90,7 +90,7 @@ bool LanaiAsmBackend::writeNopData(uint64_t Count, MCObjectWriter *OW) const {
void LanaiAsmBackend::applyFixup(const MCFixup &Fixup, char *Data,
unsigned /*DataSize*/, uint64_t Value,
- bool /*IsPCRel*/) const {
+ bool /*IsPCRel*/, MCContext & /*Ctx*/) const {
MCFixupKind Kind = Fixup.getKind();
Value = adjustFixupValue(static_cast<unsigned>(Kind), Value);
diff --git a/lib/Target/Lanai/MCTargetDesc/LanaiMCCodeEmitter.cpp b/lib/Target/Lanai/MCTargetDesc/LanaiMCCodeEmitter.cpp
index f5b5335bb989..10254677a5ad 100644
--- a/lib/Target/Lanai/MCTargetDesc/LanaiMCCodeEmitter.cpp
+++ b/lib/Target/Lanai/MCTargetDesc/LanaiMCCodeEmitter.cpp
@@ -89,7 +89,7 @@ public:
} // end anonymous namespace
-Lanai::Fixups FixupKind(const MCExpr *Expr) {
+static Lanai::Fixups FixupKind(const MCExpr *Expr) {
if (isa<MCSymbolRefExpr>(Expr))
return Lanai::FIXUP_LANAI_21;
if (const LanaiMCExpr *McExpr = dyn_cast<LanaiMCExpr>(Expr)) {
@@ -134,8 +134,8 @@ unsigned LanaiMCCodeEmitter::getMachineOpValue(
}
// Helper function to adjust P and Q bits on load and store instructions.
-unsigned adjustPqBits(const MCInst &Inst, unsigned Value, unsigned PBitShift,
- unsigned QBitShift) {
+static unsigned adjustPqBits(const MCInst &Inst, unsigned Value,
+ unsigned PBitShift, unsigned QBitShift) {
const MCOperand AluOp = Inst.getOperand(3);
unsigned AluCode = AluOp.getImm();
diff --git a/lib/Target/MSP430/MSP430BranchSelector.cpp b/lib/Target/MSP430/MSP430BranchSelector.cpp
index 5fd6b6305f68..424b5ae418f7 100644
--- a/lib/Target/MSP430/MSP430BranchSelector.cpp
+++ b/lib/Target/MSP430/MSP430BranchSelector.cpp
@@ -194,8 +194,8 @@ bool MSP430BSel::expandBranches(OffsetVector &BlockOffsets) {
// Jump over the long branch on the opposite condition
TII->reverseBranchCondition(Cond);
MI = BuildMI(*MBB, MI, dl, TII->get(MSP430::JCC))
- .addMBB(NextMBB)
- .addOperand(Cond[0]);
+ .addMBB(NextMBB)
+ .add(Cond[0]);
InstrSizeDiff += TII->getInstSizeInBytes(*MI);
++MI;
}
diff --git a/lib/Target/MSP430/MSP430CallingConv.td b/lib/Target/MSP430/MSP430CallingConv.td
index b38f5781c84a..0434f8abfbf4 100644
--- a/lib/Target/MSP430/MSP430CallingConv.td
+++ b/lib/Target/MSP430/MSP430CallingConv.td
@@ -13,11 +13,11 @@
// MSP430 Return Value Calling Convention
//===----------------------------------------------------------------------===//
def RetCC_MSP430 : CallingConv<[
- // i8 are returned in registers R15B, R14B, R13B, R12B
- CCIfType<[i8], CCAssignToReg<[R15B, R14B, R13B, R12B]>>,
+ // i8 are returned in registers R12B, R13B, R14B, R15B
+ CCIfType<[i8], CCAssignToReg<[R12B, R13B, R14B, R15B]>>,
- // i16 are returned in registers R15, R14, R13, R12
- CCIfType<[i16], CCAssignToReg<[R15, R14, R13, R12]>>
+ // i16 are returned in registers R12, R13, R14, R15
+ CCIfType<[i16], CCAssignToReg<[R12, R13, R14, R15]>>
]>;
//===----------------------------------------------------------------------===//
diff --git a/lib/Target/MSP430/MSP430ISelDAGToDAG.cpp b/lib/Target/MSP430/MSP430ISelDAGToDAG.cpp
index 6e481b68e038..cd58eda5d924 100644
--- a/lib/Target/MSP430/MSP430ISelDAGToDAG.cpp
+++ b/lib/Target/MSP430/MSP430ISelDAGToDAG.cpp
@@ -61,7 +61,8 @@ namespace {
return GV != nullptr || CP != nullptr || ES != nullptr || JT != -1;
}
- void dump() {
+#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
+ LLVM_DUMP_METHOD void dump() {
errs() << "MSP430ISelAddressMode " << this << '\n';
if (BaseType == RegBase && Base.Reg.getNode() != nullptr) {
errs() << "Base.Reg ";
@@ -83,6 +84,7 @@ namespace {
} else if (JT != -1)
errs() << " JT" << JT << " Align" << Align << '\n';
}
+#endif
};
}
diff --git a/lib/Target/MSP430/MSP430ISelLowering.cpp b/lib/Target/MSP430/MSP430ISelLowering.cpp
index 73346b9ce41d..40b1dd3cc2eb 100644
--- a/lib/Target/MSP430/MSP430ISelLowering.cpp
+++ b/lib/Target/MSP430/MSP430ISelLowering.cpp
@@ -245,13 +245,20 @@ MSP430TargetLowering::getRegForInlineAsmConstraint(
template<typename ArgT>
static void ParseFunctionArgs(const SmallVectorImpl<ArgT> &Args,
SmallVectorImpl<unsigned> &Out) {
- unsigned CurrentArgIndex = ~0U;
- for (unsigned i = 0, e = Args.size(); i != e; i++) {
- if (CurrentArgIndex == Args[i].OrigArgIndex) {
- Out.back()++;
+ unsigned CurrentArgIndex;
+
+ if (Args.empty())
+ return;
+
+ CurrentArgIndex = Args[0].OrigArgIndex;
+ Out.push_back(0);
+
+ for (auto &Arg : Args) {
+ if (CurrentArgIndex == Arg.OrigArgIndex) {
+ Out.back() += 1;
} else {
Out.push_back(1);
- CurrentArgIndex++;
+ CurrentArgIndex = Arg.OrigArgIndex;
}
}
}
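For illustration, the counting behaviour the rewritten ParseFunctionArgs() implements: consecutive lowered parts sharing an OrigArgIndex are grouped, and the current index is reseeded from the argument itself, so an empty list and non-consecutive original indices are both handled. A toy trace (the sample indices are invented):

    #include <cstdio>
    #include <vector>

    int main() {
      // One entry per lowered part; value = original IR argument index.
      std::vector<unsigned> OrigArgIndex = {0, 1, 1, 3};
      std::vector<unsigned> Out;
      if (!OrigArgIndex.empty()) {
        unsigned Cur = OrigArgIndex[0];
        Out.push_back(0);
        for (unsigned Idx : OrigArgIndex) {
          if (Idx == Cur) {
            Out.back() += 1;  // another part of the same argument
          } else {
            Out.push_back(1); // first part of the next argument
            Cur = Idx;        // reseed (the old code only did Cur++)
          }
        }
      }
      for (unsigned N : Out)
        std::printf("%u ", N); // prints: 1 2 1
    }
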
@@ -275,7 +282,7 @@ static void AnalyzeArguments(CCState &State,
SmallVectorImpl<CCValAssign> &ArgLocs,
const SmallVectorImpl<ArgT> &Args) {
static const MCPhysReg RegList[] = {
- MSP430::R15, MSP430::R14, MSP430::R13, MSP430::R12
+ MSP430::R12, MSP430::R13, MSP430::R14, MSP430::R15
};
static const unsigned NbRegs = array_lengthof(RegList);
@@ -288,7 +295,7 @@ static void AnalyzeArguments(CCState &State,
ParseFunctionArgs(Args, ArgsParts);
unsigned RegsLeft = NbRegs;
- bool UseStack = false;
+ bool UsedStack = false;
unsigned ValNo = 0;
for (unsigned i = 0, e = ArgsParts.size(); i != e; i++) {
@@ -316,20 +323,22 @@ static void AnalyzeArguments(CCState &State,
unsigned Parts = ArgsParts[i];
- if (!UseStack && Parts <= RegsLeft) {
- unsigned FirstVal = ValNo;
+ if (!UsedStack && Parts == 2 && RegsLeft == 1) {
+ // Special case for 32-bit register split, see EABI section 3.3.3
+ unsigned Reg = State.AllocateReg(RegList);
+ State.addLoc(CCValAssign::getReg(ValNo++, ArgVT, Reg, LocVT, LocInfo));
+ RegsLeft -= 1;
+
+ UsedStack = true;
+ CC_MSP430_AssignStack(ValNo++, ArgVT, LocVT, LocInfo, ArgFlags, State);
+ } else if (Parts <= RegsLeft) {
for (unsigned j = 0; j < Parts; j++) {
unsigned Reg = State.AllocateReg(RegList);
State.addLoc(CCValAssign::getReg(ValNo++, ArgVT, Reg, LocVT, LocInfo));
RegsLeft--;
}
-
- // Reverse the order of the pieces to agree with the "big endian" format
- // required in the calling convention ABI.
- SmallVectorImpl<CCValAssign>::iterator B = ArgLocs.begin() + FirstVal;
- std::reverse(B, B + Parts);
} else {
- UseStack = true;
+ UsedStack = true;
for (unsigned j = 0; j < Parts; j++)
CC_MSP430_AssignStack(ValNo++, ArgVT, LocVT, LocInfo, ArgFlags, State);
}
@@ -351,10 +360,6 @@ static void AnalyzeReturnValues(CCState &State,
SmallVectorImpl<CCValAssign> &RVLocs,
const SmallVectorImpl<ArgT> &Args) {
AnalyzeRetResult(State, Args);
-
- // Reverse splitted return values to get the "big endian" format required
- // to agree with the calling convention ABI.
- std::reverse(RVLocs.begin(), RVLocs.end());
}
SDValue MSP430TargetLowering::LowerFormalArguments(
@@ -496,9 +501,33 @@ SDValue MSP430TargetLowering::LowerCCCArguments(
}
}
+ for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) {
+ if (Ins[i].Flags.isSRet()) {
+ unsigned Reg = FuncInfo->getSRetReturnReg();
+ if (!Reg) {
+ Reg = MF.getRegInfo().createVirtualRegister(
+ getRegClassFor(MVT::i16));
+ FuncInfo->setSRetReturnReg(Reg);
+ }
+ SDValue Copy = DAG.getCopyToReg(DAG.getEntryNode(), dl, Reg, InVals[i]);
+ Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, Copy, Chain);
+ }
+ }
+
return Chain;
}
+bool
+MSP430TargetLowering::CanLowerReturn(CallingConv::ID CallConv,
+ MachineFunction &MF,
+ bool IsVarArg,
+ const SmallVectorImpl<ISD::OutputArg> &Outs,
+ LLVMContext &Context) const {
+ SmallVector<CCValAssign, 16> RVLocs;
+ CCState CCInfo(CallConv, IsVarArg, MF, RVLocs, Context);
+ return CCInfo.CheckReturn(Outs, RetCC_MSP430);
+}
+
SDValue
MSP430TargetLowering::LowerReturn(SDValue Chain, CallingConv::ID CallConv,
bool isVarArg,
@@ -506,6 +535,8 @@ MSP430TargetLowering::LowerReturn(SDValue Chain, CallingConv::ID CallConv,
const SmallVectorImpl<SDValue> &OutVals,
const SDLoc &dl, SelectionDAG &DAG) const {
+ MachineFunction &MF = DAG.getMachineFunction();
+
// CCValAssign - represent the assignment of the return value to a location
SmallVector<CCValAssign, 16> RVLocs;
@@ -537,6 +568,22 @@ MSP430TargetLowering::LowerReturn(SDValue Chain, CallingConv::ID CallConv,
RetOps.push_back(DAG.getRegister(VA.getLocReg(), VA.getLocVT()));
}
+ if (MF.getFunction()->hasStructRetAttr()) {
+ MSP430MachineFunctionInfo *FuncInfo = MF.getInfo<MSP430MachineFunctionInfo>();
+ unsigned Reg = FuncInfo->getSRetReturnReg();
+
+ if (!Reg)
+ llvm_unreachable("sret virtual register not created in entry block");
+
+ SDValue Val =
+ DAG.getCopyFromReg(Chain, dl, Reg, getPointerTy(DAG.getDataLayout()));
+ unsigned R12 = MSP430::R12;
+
+ Chain = DAG.getCopyToReg(Chain, dl, R12, Val, Flag);
+ Flag = Chain.getValue(1);
+ RetOps.push_back(DAG.getRegister(R12, getPointerTy(DAG.getDataLayout())));
+ }
+
unsigned Opc = (CallConv == CallingConv::MSP430_INTR ?
MSP430ISD::RETI_FLAG : MSP430ISD::RET_FLAG);
@@ -1219,7 +1266,7 @@ MSP430TargetLowering::EmitShiftInstr(MachineInstr &MI,
BB->end());
RemBB->transferSuccessorsAndUpdatePHIs(BB);
- // Add adges BB => LoopBB => RemBB, BB => RemBB, LoopBB => LoopBB
+ // Add edges BB => LoopBB => RemBB, BB => RemBB, LoopBB => LoopBB
BB->addSuccessor(LoopBB);
BB->addSuccessor(RemBB);
LoopBB->addSuccessor(RemBB);
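The new AnalyzeArguments() logic above is small enough to model in isolation: registers fill in R12..R15 order, and the special case from MSP430 EABI section 3.3.3 splits a two-part (32-bit) argument between the last free register and the stack rather than pushing it whole. A toy simulation of the assignment rule (the argument shapes and output format are invented for the example):

    #include <cstdio>
    #include <vector>

    int main() {
      // Number of 16-bit parts per argument, e.g. (int, int, int, long).
      std::vector<unsigned> Parts = {1, 1, 1, 2};
      const char *Regs[] = {"R12", "R13", "R14", "R15"};
      unsigned RegsLeft = 4;
      bool UsedStack = false;
      for (unsigned P : Parts) {
        if (!UsedStack && P == 2 && RegsLeft == 1) {
          // EABI 3.3.3: split the 32-bit value across register and stack.
          std::printf("part -> %s, part -> stack (split)\n",
                      Regs[4 - RegsLeft]);
          RegsLeft = 0;
          UsedStack = true;
        } else if (P <= RegsLeft) {
          for (unsigned J = 0; J < P; ++J)
            std::printf("part -> %s\n", Regs[4 - RegsLeft--]);
        } else {
          UsedStack = true;
          for (unsigned J = 0; J < P; ++J)
            std::printf("part -> stack\n");
        }
      }
    }
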
diff --git a/lib/Target/MSP430/MSP430ISelLowering.h b/lib/Target/MSP430/MSP430ISelLowering.h
index 8864807e999e..3a729623c99a 100644
--- a/lib/Target/MSP430/MSP430ISelLowering.h
+++ b/lib/Target/MSP430/MSP430ISelLowering.h
@@ -158,6 +158,12 @@ namespace llvm {
LowerCall(TargetLowering::CallLoweringInfo &CLI,
SmallVectorImpl<SDValue> &InVals) const override;
+ bool CanLowerReturn(CallingConv::ID CallConv,
+ MachineFunction &MF,
+ bool IsVarArg,
+ const SmallVectorImpl<ISD::OutputArg> &Outs,
+ LLVMContext &Context) const override;
+
SDValue LowerReturn(SDValue Chain, CallingConv::ID CallConv, bool isVarArg,
const SmallVectorImpl<ISD::OutputArg> &Outs,
const SmallVectorImpl<SDValue> &OutVals,
diff --git a/lib/Target/MSP430/MSP430MCInstLower.cpp b/lib/Target/MSP430/MSP430MCInstLower.cpp
index 47b0e270c5b3..e7716382b222 100644
--- a/lib/Target/MSP430/MSP430MCInstLower.cpp
+++ b/lib/Target/MSP430/MSP430MCInstLower.cpp
@@ -119,7 +119,7 @@ void MSP430MCInstLower::Lower(const MachineInstr *MI, MCInst &OutMI) const {
MCOperand MCOp;
switch (MO.getType()) {
default:
- MI->dump();
+ MI->print(errs());
llvm_unreachable("unknown operand type");
case MachineOperand::MO_Register:
// Ignore all implicit register operands.
diff --git a/lib/Target/MSP430/MSP430MachineFunctionInfo.h b/lib/Target/MSP430/MSP430MachineFunctionInfo.h
index 2d937318c7e5..fcaa8a1d6c72 100644
--- a/lib/Target/MSP430/MSP430MachineFunctionInfo.h
+++ b/lib/Target/MSP430/MSP430MachineFunctionInfo.h
@@ -33,15 +33,23 @@ class MSP430MachineFunctionInfo : public MachineFunctionInfo {
/// VarArgsFrameIndex - FrameIndex for start of varargs area.
int VarArgsFrameIndex;
+ /// SRetReturnReg - Some subtargets require that sret lowering includes
+ /// returning the value of the returned struct in a register. This field
+ /// holds the virtual register into which the sret argument is passed.
+ unsigned SRetReturnReg;
+
public:
MSP430MachineFunctionInfo() : CalleeSavedFrameSize(0) {}
explicit MSP430MachineFunctionInfo(MachineFunction &MF)
- : CalleeSavedFrameSize(0), ReturnAddrIndex(0) {}
+ : CalleeSavedFrameSize(0), ReturnAddrIndex(0), SRetReturnReg(0) {}
unsigned getCalleeSavedFrameSize() const { return CalleeSavedFrameSize; }
void setCalleeSavedFrameSize(unsigned bytes) { CalleeSavedFrameSize = bytes; }
+ unsigned getSRetReturnReg() const { return SRetReturnReg; }
+ void setSRetReturnReg(unsigned Reg) { SRetReturnReg = Reg; }
+
int getRAIndex() const { return ReturnAddrIndex; }
void setRAIndex(int Index) { ReturnAddrIndex = Index; }
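The SRetReturnReg field added above supports struct-return (sret) lowering: the caller passes a hidden pointer to the result buffer, and with this change the callee also hands that pointer back in R12 on return. Conceptually (a sketch in portable C++, not MSP430 code):

    #include <cstring>

    struct S { int Vals[4]; };

    // Roughly what "S makeS()" lowers to: the caller supplies the
    // buffer; the callee fills it and also returns its address, which
    // on MSP430 ends up in R12 per the change above.
    S *makeS_lowered(S *Sret) {
      static const S Init = {{1, 2, 3, 4}};
      std::memcpy(Sret, &Init, sizeof(S));
      return Sret; // address also flows back in the return register
    }
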
diff --git a/lib/Target/Mips/AsmParser/MipsAsmParser.cpp b/lib/Target/Mips/AsmParser/MipsAsmParser.cpp
index d054578deb67..d407774574be 100644
--- a/lib/Target/Mips/AsmParser/MipsAsmParser.cpp
+++ b/lib/Target/Mips/AsmParser/MipsAsmParser.cpp
@@ -7,47 +7,67 @@
//
//===----------------------------------------------------------------------===//
+#include "MCTargetDesc/MipsABIFlagsSection.h"
#include "MCTargetDesc/MipsABIInfo.h"
#include "MCTargetDesc/MipsMCExpr.h"
#include "MCTargetDesc/MipsMCTargetDesc.h"
-#include "MipsRegisterInfo.h"
-#include "MipsTargetObjectFile.h"
#include "MipsTargetStreamer.h"
#include "MCTargetDesc/MipsBaseInfo.h"
#include "llvm/ADT/SmallVector.h"
+#include "llvm/ADT/STLExtras.h"
#include "llvm/ADT/StringSwitch.h"
+#include "llvm/ADT/StringRef.h"
+#include "llvm/ADT/Triple.h"
+#include "llvm/ADT/Twine.h"
#include "llvm/MC/MCContext.h"
#include "llvm/MC/MCExpr.h"
#include "llvm/MC/MCInst.h"
-#include "llvm/MC/MCInstBuilder.h"
+#include "llvm/MC/MCInstrDesc.h"
+#include "llvm/MC/MCObjectFileInfo.h"
#include "llvm/MC/MCParser/MCAsmLexer.h"
+#include "llvm/MC/MCParser/MCAsmParser.h"
+#include "llvm/MC/MCParser/MCAsmParserExtension.h"
#include "llvm/MC/MCParser/MCParsedAsmOperand.h"
#include "llvm/MC/MCParser/MCTargetAsmParser.h"
#include "llvm/MC/MCSectionELF.h"
#include "llvm/MC/MCStreamer.h"
#include "llvm/MC/MCSubtargetInfo.h"
#include "llvm/MC/MCSymbol.h"
+#include "llvm/MC/MCSymbolELF.h"
+#include "llvm/MC/MCValue.h"
+#include "llvm/MC/SubtargetFeature.h"
+#include "llvm/Support/Casting.h"
+#include "llvm/Support/Compiler.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/ELF.h"
+#include "llvm/Support/ErrorHandling.h"
#include "llvm/Support/MathExtras.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/Support/SMLoc.h"
#include "llvm/Support/SourceMgr.h"
#include "llvm/Support/TargetRegistry.h"
-#include "llvm/Support/raw_ostream.h"
+#include <algorithm>
+#include <cassert>
+#include <cstdint>
#include <memory>
+#include <string>
+#include <utility>
using namespace llvm;
#define DEBUG_TYPE "mips-asm-parser"
namespace llvm {
+
class MCInstrInfo;
-}
+
+} // end namespace llvm
namespace {
+
class MipsAssemblerOptions {
public:
- MipsAssemblerOptions(const FeatureBitset &Features_) :
- ATReg(1), Reorder(true), Macro(true), Features(Features_) {}
+ MipsAssemblerOptions(const FeatureBitset &Features_) : Features(Features_) {}
MipsAssemblerOptions(const MipsAssemblerOptions *Opts) {
ATReg = Opts->getATRegIndex();
@@ -84,12 +104,13 @@ public:
static const FeatureBitset AllArchRelatedMask;
private:
- unsigned ATReg;
- bool Reorder;
- bool Macro;
+ unsigned ATReg = 1;
+ bool Reorder = true;
+ bool Macro = true;
FeatureBitset Features;
};
-}
+
+} // end anonymous namespace
const FeatureBitset MipsAssemblerOptions::AllArchRelatedMask = {
Mips::FeatureMips1, Mips::FeatureMips2, Mips::FeatureMips3,
@@ -103,6 +124,7 @@ const FeatureBitset MipsAssemblerOptions::AllArchRelatedMask = {
};
namespace {
+
class MipsAsmParser : public MCTargetAsmParser {
MipsTargetStreamer &getTargetStreamer() {
MCTargetStreamer &TS = *getParser().getStreamer().getTargetStreamer();
@@ -147,6 +169,8 @@ class MipsAsmParser : public MCTargetAsmParser {
bool parseBracketSuffix(StringRef Name, OperandVector &Operands);
+ bool mnemonicIsValid(StringRef Mnemonic, unsigned VariantID);
+
bool ParseInstruction(ParseInstructionInfo &Info, StringRef Name,
SMLoc NameLoc, OperandVector &Operands) override;
@@ -252,6 +276,18 @@ class MipsAsmParser : public MCTargetAsmParser {
bool expandAbs(MCInst &Inst, SMLoc IDLoc, MCStreamer &Out,
const MCSubtargetInfo *STI);
+ bool expandMulImm(MCInst &Inst, SMLoc IDLoc, MCStreamer &Out,
+ const MCSubtargetInfo *STI);
+
+ bool expandMulO(MCInst &Inst, SMLoc IDLoc, MCStreamer &Out,
+ const MCSubtargetInfo *STI);
+
+ bool expandMulOU(MCInst &Inst, SMLoc IDLoc, MCStreamer &Out,
+ const MCSubtargetInfo *STI);
+
+ bool expandDMULMacro(MCInst &Inst, SMLoc IDLoc, MCStreamer &Out,
+ const MCSubtargetInfo *STI);
+
bool expandLoadStoreDMacro(MCInst &Inst, SMLoc IDLoc, MCStreamer &Out,
const MCSubtargetInfo *STI, bool IsLoad);
@@ -339,6 +375,8 @@ class MipsAsmParser : public MCTargetAsmParser {
/// This should be used in pseudo-instruction expansions which need AT.
unsigned getATReg(SMLoc Loc);
+ bool canUseATReg();
+
bool processInstruction(MCInst &Inst, SMLoc IDLoc, MCStreamer &Out,
const MCSubtargetInfo *STI);
@@ -466,9 +504,11 @@ public:
bool isGP64bit() const {
return getSTI().getFeatureBits()[Mips::FeatureGP64Bit];
}
+
bool isFP64bit() const {
return getSTI().getFeatureBits()[Mips::FeatureFP64Bit];
}
+
const MipsABIInfo &getABI() const { return ABI; }
bool isABI_N32() const { return ABI.IsN32(); }
bool isABI_N64() const { return ABI.IsN64(); }
@@ -484,48 +524,63 @@ public:
bool inMicroMipsMode() const {
return getSTI().getFeatureBits()[Mips::FeatureMicroMips];
}
+
bool hasMips1() const {
return getSTI().getFeatureBits()[Mips::FeatureMips1];
}
+
bool hasMips2() const {
return getSTI().getFeatureBits()[Mips::FeatureMips2];
}
+
bool hasMips3() const {
return getSTI().getFeatureBits()[Mips::FeatureMips3];
}
+
bool hasMips4() const {
return getSTI().getFeatureBits()[Mips::FeatureMips4];
}
+
bool hasMips5() const {
return getSTI().getFeatureBits()[Mips::FeatureMips5];
}
+
bool hasMips32() const {
return getSTI().getFeatureBits()[Mips::FeatureMips32];
}
+
bool hasMips64() const {
return getSTI().getFeatureBits()[Mips::FeatureMips64];
}
+
bool hasMips32r2() const {
return getSTI().getFeatureBits()[Mips::FeatureMips32r2];
}
+
bool hasMips64r2() const {
return getSTI().getFeatureBits()[Mips::FeatureMips64r2];
}
+
bool hasMips32r3() const {
return (getSTI().getFeatureBits()[Mips::FeatureMips32r3]);
}
+
bool hasMips64r3() const {
return (getSTI().getFeatureBits()[Mips::FeatureMips64r3]);
}
+
bool hasMips32r5() const {
return (getSTI().getFeatureBits()[Mips::FeatureMips32r5]);
}
+
bool hasMips64r5() const {
return (getSTI().getFeatureBits()[Mips::FeatureMips64r5]);
}
+
bool hasMips32r6() const {
return getSTI().getFeatureBits()[Mips::FeatureMips32r6];
}
+
bool hasMips64r6() const {
return getSTI().getFeatureBits()[Mips::FeatureMips64r6];
}
@@ -533,15 +588,19 @@ public:
bool hasDSP() const {
return getSTI().getFeatureBits()[Mips::FeatureDSP];
}
+
bool hasDSPR2() const {
return getSTI().getFeatureBits()[Mips::FeatureDSPR2];
}
+
bool hasDSPR3() const {
return getSTI().getFeatureBits()[Mips::FeatureDSPR3];
}
+
bool hasMSA() const {
return getSTI().getFeatureBits()[Mips::FeatureMSA];
}
+
bool hasCnMips() const {
return (getSTI().getFeatureBits()[Mips::FeatureCnMips]);
}
@@ -627,9 +686,6 @@ public:
}
}
};
-}
-
-namespace {
/// MipsOperand - Instances of this class represent a parsed Mips machine
/// instruction.
@@ -671,6 +727,22 @@ public:
MipsOperand(KindTy K, MipsAsmParser &Parser)
: MCParsedAsmOperand(), Kind(K), AsmParser(Parser) {}
+ ~MipsOperand() override {
+ switch (Kind) {
+ case k_Immediate:
+ break;
+ case k_Memory:
+ delete Mem.Base;
+ break;
+ case k_RegList:
+ delete RegList.List;
+ case k_RegisterIndex:
+ case k_Token:
+ case k_RegPair:
+ break;
+ }
+ }
+
private:
/// For diagnostics, and checking the assembler temporary
MipsAsmParser &AsmParser;
@@ -716,7 +788,7 @@ private:
const MCRegisterInfo *RegInfo,
SMLoc S, SMLoc E,
MipsAsmParser &Parser) {
- auto Op = make_unique<MipsOperand>(k_RegisterIndex, Parser);
+ auto Op = llvm::make_unique<MipsOperand>(k_RegisterIndex, Parser);
Op->RegIdx.Index = Index;
Op->RegIdx.RegInfo = RegInfo;
Op->RegIdx.Kind = RegKind;
@@ -896,6 +968,16 @@ public:
/// Render the operand to an MCInst as a GPR32
/// Asserts if the wrong number of operands are requested, or the operand
/// is not a k_RegisterIndex compatible with RegKind_GPR
+ void addGPR32ZeroAsmRegOperands(MCInst &Inst, unsigned N) const {
+ assert(N == 1 && "Invalid number of operands!");
+ Inst.addOperand(MCOperand::createReg(getGPR32Reg()));
+ }
+
+ void addGPR32NonZeroAsmRegOperands(MCInst &Inst, unsigned N) const {
+ assert(N == 1 && "Invalid number of operands!");
+ Inst.addOperand(MCOperand::createReg(getGPR32Reg()));
+ }
+
void addGPR32AsmRegOperands(MCInst &Inst, unsigned N) const {
assert(N == 1 && "Invalid number of operands!");
Inst.addOperand(MCOperand::createReg(getGPR32Reg()));
@@ -1104,45 +1186,58 @@ public:
// $0/$zero here so that MCK_ZERO works correctly.
return isGPRAsmReg() && RegIdx.Index == 0;
}
+
bool isRegIdx() const { return Kind == k_RegisterIndex; }
bool isImm() const override { return Kind == k_Immediate; }
+
bool isConstantImm() const {
int64_t Res;
return isImm() && getImm()->evaluateAsAbsolute(Res);
}
+
bool isConstantImmz() const {
return isConstantImm() && getConstantImm() == 0;
}
+
template <unsigned Bits, int Offset = 0> bool isConstantUImm() const {
return isConstantImm() && isUInt<Bits>(getConstantImm() - Offset);
}
+
template <unsigned Bits> bool isSImm() const {
return isConstantImm() ? isInt<Bits>(getConstantImm()) : isImm();
}
+
template <unsigned Bits> bool isUImm() const {
return isConstantImm() ? isUInt<Bits>(getConstantImm()) : isImm();
}
+
template <unsigned Bits> bool isAnyImm() const {
return isConstantImm() ? (isInt<Bits>(getConstantImm()) ||
isUInt<Bits>(getConstantImm()))
: isImm();
}
+
template <unsigned Bits, int Offset = 0> bool isConstantSImm() const {
return isConstantImm() && isInt<Bits>(getConstantImm() - Offset);
}
+
template <unsigned Bottom, unsigned Top> bool isConstantUImmRange() const {
return isConstantImm() && getConstantImm() >= Bottom &&
getConstantImm() <= Top;
}
+
bool isToken() const override {
// Note: It's not possible to pretend that other operand kinds are tokens.
// The matcher emitter checks tokens first.
return Kind == k_Token;
}
+
bool isMem() const override { return Kind == k_Memory; }
+
bool isConstantMemOff() const {
return isMem() && isa<MCConstantExpr>(getMemOff());
}
+
// Allow relocation operators.
// FIXME: This predicate and others need to look through binary expressions
// and determine whether a Value is a constant or not.
@@ -1160,28 +1255,34 @@ public:
bool IsReloc = getMemOff()->evaluateAsRelocatable(Res, nullptr, nullptr);
return IsReloc && isShiftedInt<Bits, ShiftAmount>(Res.getConstant());
}
+
bool isMemWithGRPMM16Base() const {
return isMem() && getMemBase()->isMM16AsmReg();
}
+
template <unsigned Bits> bool isMemWithUimmOffsetSP() const {
return isMem() && isConstantMemOff() && isUInt<Bits>(getConstantMemOff())
&& getMemBase()->isRegIdx() && (getMemBase()->getGPR32Reg() == Mips::SP);
}
+
template <unsigned Bits> bool isMemWithUimmWordAlignedOffsetSP() const {
return isMem() && isConstantMemOff() && isUInt<Bits>(getConstantMemOff())
&& (getConstantMemOff() % 4 == 0) && getMemBase()->isRegIdx()
&& (getMemBase()->getGPR32Reg() == Mips::SP);
}
+
template <unsigned Bits> bool isMemWithSimmWordAlignedOffsetGP() const {
return isMem() && isConstantMemOff() && isInt<Bits>(getConstantMemOff())
&& (getConstantMemOff() % 4 == 0) && getMemBase()->isRegIdx()
&& (getMemBase()->getGPR32Reg() == Mips::GP);
}
+
template <unsigned Bits, unsigned ShiftLeftAmount>
bool isScaledUImm() const {
return isConstantImm() &&
isShiftedUInt<Bits, ShiftLeftAmount>(getConstantImm());
}
+
template <unsigned Bits, unsigned ShiftLeftAmount>
bool isScaledSImm() const {
if (isConstantImm() && isShiftedInt<Bits, ShiftLeftAmount>(getConstantImm()))
@@ -1193,6 +1294,7 @@ public:
bool Success = getImm()->evaluateAsRelocatable(Res, nullptr, nullptr);
return Success && isShiftedInt<Bits, ShiftLeftAmount>(Res.getConstant());
}
+
bool isRegList16() const {
if (!isRegList())
return false;
@@ -1217,14 +1319,18 @@ public:
return true;
}
+
bool isInvNum() const { return Kind == k_Immediate; }
+
bool isLSAImm() const {
if (!isConstantImm())
return false;
int64_t Val = getConstantImm();
return 1 <= Val && Val <= 4;
}
+
bool isRegList() const { return Kind == k_RegList; }
+
bool isMovePRegPair() const {
if (Kind != k_RegList || RegList.List->size() != 2)
return false;
@@ -1257,6 +1363,7 @@ public:
assert(Kind == k_Token && "Invalid access!");
return StringRef(Tok.Data, Tok.Length);
}
+
bool isRegPair() const {
return Kind == k_RegPair && RegIdx.Index <= 30;
}
@@ -1310,7 +1417,7 @@ public:
static std::unique_ptr<MipsOperand> CreateToken(StringRef Str, SMLoc S,
MipsAsmParser &Parser) {
- auto Op = make_unique<MipsOperand>(k_Token, Parser);
+ auto Op = llvm::make_unique<MipsOperand>(k_Token, Parser);
Op->Tok.Data = Str.data();
Op->Tok.Length = Str.size();
Op->StartLoc = S;
@@ -1385,7 +1492,7 @@ public:
static std::unique_ptr<MipsOperand>
CreateImm(const MCExpr *Val, SMLoc S, SMLoc E, MipsAsmParser &Parser) {
- auto Op = make_unique<MipsOperand>(k_Immediate, Parser);
+ auto Op = llvm::make_unique<MipsOperand>(k_Immediate, Parser);
Op->Imm.Val = Val;
Op->StartLoc = S;
Op->EndLoc = E;
@@ -1395,7 +1502,7 @@ public:
static std::unique_ptr<MipsOperand>
CreateMem(std::unique_ptr<MipsOperand> Base, const MCExpr *Off, SMLoc S,
SMLoc E, MipsAsmParser &Parser) {
- auto Op = make_unique<MipsOperand>(k_Memory, Parser);
+ auto Op = llvm::make_unique<MipsOperand>(k_Memory, Parser);
Op->Mem.Base = Base.release();
Op->Mem.Off = Off;
Op->StartLoc = S;
@@ -1406,9 +1513,9 @@ public:
static std::unique_ptr<MipsOperand>
CreateRegList(SmallVectorImpl<unsigned> &Regs, SMLoc StartLoc, SMLoc EndLoc,
MipsAsmParser &Parser) {
- assert (Regs.size() > 0 && "Empty list not allowed");
+ assert(Regs.size() > 0 && "Empty list not allowed");
- auto Op = make_unique<MipsOperand>(k_RegList, Parser);
+ auto Op = llvm::make_unique<MipsOperand>(k_RegList, Parser);
Op->RegList.List = new SmallVector<unsigned, 10>(Regs.begin(), Regs.end());
Op->StartLoc = StartLoc;
Op->EndLoc = EndLoc;
@@ -1418,7 +1525,7 @@ public:
static std::unique_ptr<MipsOperand> CreateRegPair(const MipsOperand &MOP,
SMLoc S, SMLoc E,
MipsAsmParser &Parser) {
- auto Op = make_unique<MipsOperand>(k_RegPair, Parser);
+ auto Op = llvm::make_unique<MipsOperand>(k_RegPair, Parser);
Op->RegIdx.Index = MOP.RegIdx.Index;
Op->RegIdx.RegInfo = MOP.RegIdx.RegInfo;
Op->RegIdx.Kind = MOP.RegIdx.Kind;
@@ -1427,14 +1534,25 @@ public:
return Op;
}
+ bool isGPRZeroAsmReg() const {
+ return isRegIdx() && RegIdx.Kind & RegKind_GPR && RegIdx.Index == 0;
+ }
+
+ bool isGPRNonZeroAsmReg() const {
+ return isRegIdx() && RegIdx.Kind & RegKind_GPR && RegIdx.Index > 0 &&
+ RegIdx.Index <= 31;
+ }
+
bool isGPRAsmReg() const {
return isRegIdx() && RegIdx.Kind & RegKind_GPR && RegIdx.Index <= 31;
}
+
bool isMM16AsmReg() const {
if (!(isRegIdx() && RegIdx.Kind))
return false;
return ((RegIdx.Index >= 2 && RegIdx.Index <= 7)
|| RegIdx.Index == 16 || RegIdx.Index == 17);
+
}
bool isMM16AsmRegZero() const {
if (!(isRegIdx() && RegIdx.Kind))
@@ -1443,42 +1561,53 @@ public:
(RegIdx.Index >= 2 && RegIdx.Index <= 7) ||
RegIdx.Index == 17);
}
+
bool isMM16AsmRegMoveP() const {
if (!(isRegIdx() && RegIdx.Kind))
return false;
return (RegIdx.Index == 0 || (RegIdx.Index >= 2 && RegIdx.Index <= 3) ||
(RegIdx.Index >= 16 && RegIdx.Index <= 20));
}
+
bool isFGRAsmReg() const {
// AFGR64 is $0-$15 but we handle this in getAFGR64()
return isRegIdx() && RegIdx.Kind & RegKind_FGR && RegIdx.Index <= 31;
}
+
bool isHWRegsAsmReg() const {
return isRegIdx() && RegIdx.Kind & RegKind_HWRegs && RegIdx.Index <= 31;
}
+
bool isCCRAsmReg() const {
return isRegIdx() && RegIdx.Kind & RegKind_CCR && RegIdx.Index <= 31;
}
+
bool isFCCAsmReg() const {
if (!(isRegIdx() && RegIdx.Kind & RegKind_FCC))
return false;
return RegIdx.Index <= 7;
}
+
bool isACCAsmReg() const {
return isRegIdx() && RegIdx.Kind & RegKind_ACC && RegIdx.Index <= 3;
}
+
bool isCOP0AsmReg() const {
return isRegIdx() && RegIdx.Kind & RegKind_COP0 && RegIdx.Index <= 31;
}
+
bool isCOP2AsmReg() const {
return isRegIdx() && RegIdx.Kind & RegKind_COP2 && RegIdx.Index <= 31;
}
+
bool isCOP3AsmReg() const {
return isRegIdx() && RegIdx.Kind & RegKind_COP3 && RegIdx.Index <= 31;
}
+
bool isMSA128AsmReg() const {
return isRegIdx() && RegIdx.Kind & RegKind_MSA128 && RegIdx.Index <= 31;
}
+
bool isMSACtrlAsmReg() const {
return isRegIdx() && RegIdx.Kind & RegKind_MSACtrl && RegIdx.Index <= 7;
}
@@ -1488,22 +1617,6 @@ public:
/// getEndLoc - Get the location of the last token of this operand.
SMLoc getEndLoc() const override { return EndLoc; }
- virtual ~MipsOperand() {
- switch (Kind) {
- case k_Immediate:
- break;
- case k_Memory:
- delete Mem.Base;
- break;
- case k_RegList:
- delete RegList.List;
- case k_RegisterIndex:
- case k_Token:
- case k_RegPair:
- break;
- }
- }
-
void print(raw_ostream &OS) const override {
switch (Kind) {
case k_Immediate:
@@ -1553,11 +1666,15 @@ public:
}
}
}; // class MipsOperand
-} // namespace
+
+} // end anonymous namespace
namespace llvm {
+
extern const MCInstrDesc MipsInsts[];
-}
+
+} // end namespace llvm
+
static const MCInstrDesc &getInstDesc(unsigned Opcode) {
return MipsInsts[Opcode];
}
@@ -1785,6 +1902,61 @@ bool MipsAsmParser::processInstruction(MCInst &Inst, SMLoc IDLoc,
}
}
+ // Warn on division by zero. We do this check here because all
+ // instructions are processed here, not just the macros that need
+ // expansion.
+ //
+ // The MIPS backend models most of the division instructions and macros
+ // as three-operand instructions. The pre-R6 divide instructions, however,
+ // have two operands and explicitly define HI/LO as part of the
+ // instruction, not in the operands.
+ unsigned FirstOp = 1;
+ unsigned SecondOp = 2;
+ switch (Inst.getOpcode()) {
+ default:
+ break;
+ case Mips::SDivIMacro:
+ case Mips::UDivIMacro:
+ case Mips::DSDivIMacro:
+ case Mips::DUDivIMacro:
+ if (Inst.getOperand(2).getImm() == 0) {
+ if (Inst.getOperand(1).getReg() == Mips::ZERO ||
+ Inst.getOperand(1).getReg() == Mips::ZERO_64)
+ Warning(IDLoc, "dividing zero by zero");
+ else
+ Warning(IDLoc, "division by zero");
+ }
+ break;
+ case Mips::DSDIV:
+ case Mips::SDIV:
+ case Mips::UDIV:
+ case Mips::DUDIV:
+ case Mips::UDIV_MM:
+ case Mips::SDIV_MM:
+ FirstOp = 0;
+ SecondOp = 1;
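+ // Fall through: the zero-divisor checks below apply to these forms too.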
+ case Mips::SDivMacro:
+ case Mips::DSDivMacro:
+ case Mips::UDivMacro:
+ case Mips::DUDivMacro:
+ case Mips::DIV:
+ case Mips::DIVU:
+ case Mips::DDIV:
+ case Mips::DDIVU:
+ case Mips::DIVU_MMR6:
+ case Mips::DDIVU_MM64R6:
+ case Mips::DIV_MMR6:
+ case Mips::DDIV_MM64R6:
+ if (Inst.getOperand(SecondOp).getReg() == Mips::ZERO ||
+ Inst.getOperand(SecondOp).getReg() == Mips::ZERO_64) {
+ if (Inst.getOperand(FirstOp).getReg() == Mips::ZERO ||
+ Inst.getOperand(FirstOp).getReg() == Mips::ZERO_64)
+ Warning(IDLoc, "dividing zero by zero");
+ else
+ Warning(IDLoc, "division by zero");
+ }
+ break;
+ }
+
// For PIC code convert unconditional jump to unconditional branch.
if ((Inst.getOpcode() == Mips::J || Inst.getOpcode() == Mips::J_MM) &&
inPicMode()) {
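Reduced to its essentials, the warning logic above depends only on which operand holds the divisor (operand 2 for the three-operand forms, operand 1 for the pre-R6 two-operand forms) and on whether dividend and divisor are $zero. A compact model (illustrative only; the toy register set is an assumption):

    #include <cstdio>

    enum Reg { ZERO, T0, T1 }; // toy register set

    void warnOnDivByZero(Reg Dividend, Reg Divisor) {
      if (Divisor != ZERO)
        return;
      if (Dividend == ZERO)
        std::puts("warning: dividing zero by zero"); // div $d,$zero,$zero
      else
        std::puts("warning: division by zero");      // div $d,$t0,$zero
    }

    int main() {
      warnOnDivByZero(T0, ZERO);   // division by zero
      warnOnDivByZero(ZERO, ZERO); // dividing zero by zero
    }
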
@@ -2135,6 +2307,8 @@ MipsAsmParser::tryExpandInstruction(MCInst &Inst, SMLoc IDLoc, MCStreamer &Out,
return expandJalWithRegs(Inst, IDLoc, Out, STI) ? MER_Fail : MER_Success;
case Mips::BneImm:
case Mips::BeqImm:
+ case Mips::BEQLImmMacro:
+ case Mips::BNELImmMacro:
return expandBranchImm(Inst, IDLoc, Out, STI) ? MER_Fail : MER_Success;
case Mips::BLT:
case Mips::BLE:
@@ -2170,15 +2344,19 @@ MipsAsmParser::tryExpandInstruction(MCInst &Inst, SMLoc IDLoc, MCStreamer &Out,
case Mips::BGTULImmMacro:
return expandCondBranches(Inst, IDLoc, Out, STI) ? MER_Fail : MER_Success;
case Mips::SDivMacro:
+ case Mips::SDivIMacro:
return expandDiv(Inst, IDLoc, Out, STI, false, true) ? MER_Fail
: MER_Success;
case Mips::DSDivMacro:
+ case Mips::DSDivIMacro:
return expandDiv(Inst, IDLoc, Out, STI, true, true) ? MER_Fail
: MER_Success;
case Mips::UDivMacro:
+ case Mips::UDivIMacro:
return expandDiv(Inst, IDLoc, Out, STI, false, false) ? MER_Fail
: MER_Success;
case Mips::DUDivMacro:
+ case Mips::DUDivIMacro:
return expandDiv(Inst, IDLoc, Out, STI, true, false) ? MER_Fail
: MER_Success;
case Mips::PseudoTRUNC_W_S:
@@ -2200,11 +2378,24 @@ MipsAsmParser::tryExpandInstruction(MCInst &Inst, SMLoc IDLoc, MCStreamer &Out,
case Mips::Usw:
return expandUxw(Inst, IDLoc, Out, STI) ? MER_Fail : MER_Success;
case Mips::NORImm:
+ case Mips::NORImm64:
+ return expandAliasImmediate(Inst, IDLoc, Out, STI) ? MER_Fail : MER_Success;
+ case Mips::SLTImm64:
+ if (isInt<16>(Inst.getOperand(2).getImm())) {
+ Inst.setOpcode(Mips::SLTi64);
+ return MER_NotAMacro;
+ }
+ return expandAliasImmediate(Inst, IDLoc, Out, STI) ? MER_Fail : MER_Success;
+ case Mips::SLTUImm64:
+ if (isInt<16>(Inst.getOperand(2).getImm())) {
+ Inst.setOpcode(Mips::SLTiu64);
+ return MER_NotAMacro;
+ }
return expandAliasImmediate(Inst, IDLoc, Out, STI) ? MER_Fail : MER_Success;
- case Mips::ADDi:
- case Mips::ADDiu:
- case Mips::SLTi:
- case Mips::SLTiu:
+ case Mips::ADDi: case Mips::ADDi_MM:
+ case Mips::ADDiu: case Mips::ADDiu_MM:
+ case Mips::SLTi: case Mips::SLTi_MM:
+ case Mips::SLTiu: case Mips::SLTiu_MM:
if ((Inst.getNumOperands() == 3) && Inst.getOperand(0).isReg() &&
Inst.getOperand(1).isReg() && Inst.getOperand(2).isImm()) {
int64_t ImmValue = Inst.getOperand(2).getImm();
@@ -2214,9 +2405,9 @@ MipsAsmParser::tryExpandInstruction(MCInst &Inst, SMLoc IDLoc, MCStreamer &Out,
: MER_Success;
}
return MER_NotAMacro;
- case Mips::ANDi:
- case Mips::ORi:
- case Mips::XORi:
+ case Mips::ANDi: case Mips::ANDi_MM: case Mips::ANDi64:
+ case Mips::ORi: case Mips::ORi_MM: case Mips::ORi64:
+ case Mips::XORi: case Mips::XORi_MM: case Mips::XORi64:
if ((Inst.getNumOperands() == 3) && Inst.getOperand(0).isReg() &&
Inst.getOperand(1).isReg() && Inst.getOperand(2).isImm()) {
int64_t ImmValue = Inst.getOperand(2).getImm();
@@ -2240,6 +2431,17 @@ MipsAsmParser::tryExpandInstruction(MCInst &Inst, SMLoc IDLoc, MCStreamer &Out,
return expandDRotationImm(Inst, IDLoc, Out, STI) ? MER_Fail : MER_Success;
case Mips::ABSMacro:
return expandAbs(Inst, IDLoc, Out, STI) ? MER_Fail : MER_Success;
+ case Mips::MULImmMacro:
+ case Mips::DMULImmMacro:
+ return expandMulImm(Inst, IDLoc, Out, STI) ? MER_Fail : MER_Success;
+ case Mips::MULOMacro:
+ case Mips::DMULOMacro:
+ return expandMulO(Inst, IDLoc, Out, STI) ? MER_Fail : MER_Success;
+ case Mips::MULOUMacro:
+ case Mips::DMULOUMacro:
+ return expandMulOU(Inst, IDLoc, Out, STI) ? MER_Fail : MER_Success;
+ case Mips::DMULMacro:
+ return expandDMULMacro(Inst, IDLoc, Out, STI) ? MER_Fail : MER_Success;
case Mips::LDMacro:
case Mips::SDMacro:
return expandLoadStoreDMacro(Inst, IDLoc, Out, STI,
@@ -2392,7 +2594,6 @@ bool MipsAsmParser::loadImmediate(int64_t ImmValue, unsigned DstReg,
uint16_t Bits31To16 = (ImmValue >> 16) & 0xffff;
uint16_t Bits15To0 = ImmValue & 0xffff;
-
if (!Is32BitImm && !isInt<32>(ImmValue)) {
// Traditional behaviour seems to special case this particular value. It's
// not clear why other masks are handled differently.
@@ -2618,20 +2819,24 @@ bool MipsAsmParser::loadAndAddSymbolAddress(const MCExpr *SymExpr,
// This is the 64-bit symbol address expansion.
if (ABI.ArePtrs64bit() && isGP64bit()) {
- // We always need AT for the 64-bit expansion.
- // If it is not available we exit.
- unsigned ATReg = getATReg(IDLoc);
- if (!ATReg)
- return true;
+ // We need AT for the 64-bit expansion when the optional source register
+ // is the destination register, and for the superscalar-scheduled form.
+ //
+ // If AT is not available, we can only error out when the destination is
+ // the same as the source register; otherwise we expand serially below.
const MipsMCExpr *HighestExpr =
MipsMCExpr::create(MipsMCExpr::MEK_HIGHEST, SymExpr, getContext());
const MipsMCExpr *HigherExpr =
MipsMCExpr::create(MipsMCExpr::MEK_HIGHER, SymExpr, getContext());
- if (UseSrcReg &&
- getContext().getRegisterInfo()->isSuperOrSubRegisterEq(DstReg,
- SrcReg)) {
+ bool RdRegIsRsReg =
+ getContext().getRegisterInfo()->isSuperOrSubRegisterEq(DstReg, SrcReg);
+
+ if (canUseATReg() && UseSrcReg && RdRegIsRsReg) {
+ unsigned ATReg = getATReg(IDLoc);
+
// If $rs is the same as $rd:
// (d)la $rd, sym($rd) => lui $at, %highest(sym)
// daddiu $at, $at, %higher(sym)
@@ -2653,29 +2858,65 @@ bool MipsAsmParser::loadAndAddSymbolAddress(const MCExpr *SymExpr,
TOut.emitRRR(Mips::DADDu, DstReg, ATReg, SrcReg, IDLoc, STI);
return false;
- }
+ } else if (canUseATReg() && !RdRegIsRsReg) {
+ unsigned ATReg = getATReg(IDLoc);
- // Otherwise, if the $rs is different from $rd or if $rs isn't specified:
- // (d)la $rd, sym/sym($rs) => lui $rd, %highest(sym)
- // lui $at, %hi(sym)
- // daddiu $rd, $rd, %higher(sym)
- // daddiu $at, $at, %lo(sym)
- // dsll32 $rd, $rd, 0
- // daddu $rd, $rd, $at
- // (daddu $rd, $rd, $rs)
- TOut.emitRX(Mips::LUi, DstReg, MCOperand::createExpr(HighestExpr), IDLoc,
- STI);
- TOut.emitRX(Mips::LUi, ATReg, MCOperand::createExpr(HiExpr), IDLoc, STI);
- TOut.emitRRX(Mips::DADDiu, DstReg, DstReg,
- MCOperand::createExpr(HigherExpr), IDLoc, STI);
- TOut.emitRRX(Mips::DADDiu, ATReg, ATReg, MCOperand::createExpr(LoExpr),
- IDLoc, STI);
- TOut.emitRRI(Mips::DSLL32, DstReg, DstReg, 0, IDLoc, STI);
- TOut.emitRRR(Mips::DADDu, DstReg, DstReg, ATReg, IDLoc, STI);
- if (UseSrcReg)
- TOut.emitRRR(Mips::DADDu, DstReg, DstReg, SrcReg, IDLoc, STI);
+ // If the $rs is different from $rd or if $rs isn't specified and we
+ // have $at available:
+ // (d)la $rd, sym/sym($rs) => lui $rd, %highest(sym)
+ // lui $at, %hi(sym)
+ // daddiu $rd, $rd, %higher(sym)
+ // daddiu $at, $at, %lo(sym)
+ // dsll32 $rd, $rd, 0
+ // daddu $rd, $rd, $at
+ // (daddu $rd, $rd, $rs)
+ //
+ // Which is preferred for superscalar issue.
+ TOut.emitRX(Mips::LUi, DstReg, MCOperand::createExpr(HighestExpr), IDLoc,
+ STI);
+ TOut.emitRX(Mips::LUi, ATReg, MCOperand::createExpr(HiExpr), IDLoc, STI);
+ TOut.emitRRX(Mips::DADDiu, DstReg, DstReg,
+ MCOperand::createExpr(HigherExpr), IDLoc, STI);
+ TOut.emitRRX(Mips::DADDiu, ATReg, ATReg, MCOperand::createExpr(LoExpr),
+ IDLoc, STI);
+ TOut.emitRRI(Mips::DSLL32, DstReg, DstReg, 0, IDLoc, STI);
+ TOut.emitRRR(Mips::DADDu, DstReg, DstReg, ATReg, IDLoc, STI);
+ if (UseSrcReg)
+ TOut.emitRRR(Mips::DADDu, DstReg, DstReg, SrcReg, IDLoc, STI);
- return false;
+ return false;
+ } else if (!canUseATReg() && !RdRegIsRsReg) {
+ // Otherwise, synthesize the address in the destination register
+ // serially:
+ // (d)la $rd, sym/sym($rs) => lui $rd, %highest(sym)
+ // daddiu $rd, $rd, %higher(sym)
+ // dsll $rd, $rd, 16
+ // daddiu $rd, $rd, %hi(sym)
+ // dsll $rd, $rd, 16
+ // daddiu $rd, $rd, %lo(sym)
+ TOut.emitRX(Mips::LUi, DstReg, MCOperand::createExpr(HighestExpr), IDLoc,
+ STI);
+ TOut.emitRRX(Mips::DADDiu, DstReg, DstReg,
+ MCOperand::createExpr(HigherExpr), IDLoc, STI);
+ TOut.emitRRI(Mips::DSLL, DstReg, DstReg, 16, IDLoc, STI);
+ TOut.emitRRX(Mips::DADDiu, DstReg, DstReg,
+ MCOperand::createExpr(HiExpr), IDLoc, STI);
+ TOut.emitRRI(Mips::DSLL, DstReg, DstReg, 16, IDLoc, STI);
+ TOut.emitRRX(Mips::DADDiu, DstReg, DstReg,
+ MCOperand::createExpr(LoExpr), IDLoc, STI);
+ if (UseSrcReg)
+ TOut.emitRRR(Mips::DADDu, DstReg, DstReg, SrcReg, IDLoc, STI);
+
+ return false;
+ } else {
+ // We have a case where SrcReg == DstReg and we don't have $at
+ // available. We can't expand this case, so error out appropriately.
+ assert(SrcReg == DstReg && !canUseATReg() &&
+ "Could have expanded dla but didn't?");
+ reportParseError(IDLoc,
+ "pseudo-instruction requires $at, which is not available");
+ return true;
+ }
}
// And now, the 32-bit symbol address expansion:
@@ -2769,6 +3010,8 @@ bool MipsAsmParser::expandBranchImm(MCInst &Inst, SMLoc IDLoc, MCStreamer &Out,
assert((MemOffsetOp.isImm() || MemOffsetOp.isExpr()) &&
"expected immediate or expression operand");
+ bool IsLikely = false;
+
unsigned OpCode = 0;
switch(Inst.getOpcode()) {
case Mips::BneImm:
@@ -2777,16 +3020,29 @@ bool MipsAsmParser::expandBranchImm(MCInst &Inst, SMLoc IDLoc, MCStreamer &Out,
case Mips::BeqImm:
OpCode = Mips::BEQ;
break;
+ case Mips::BEQLImmMacro:
+ OpCode = Mips::BEQL;
+ IsLikely = true;
+ break;
+ case Mips::BNELImmMacro:
+ OpCode = Mips::BNEL;
+ IsLikely = true;
+ break;
default:
llvm_unreachable("Unknown immediate branch pseudo-instruction.");
break;
}
int64_t ImmValue = ImmOp.getImm();
- if (ImmValue == 0)
- TOut.emitRRX(OpCode, DstRegOp.getReg(), Mips::ZERO, MemOffsetOp, IDLoc,
- STI);
- else {
+ if (ImmValue == 0) {
+ if (IsLikely) {
+ TOut.emitRRX(OpCode, DstRegOp.getReg(), Mips::ZERO,
+ MCOperand::createExpr(MemOffsetOp.getExpr()), IDLoc, STI);
+ TOut.emitRRI(Mips::SLL, Mips::ZERO, Mips::ZERO, 0, IDLoc, STI);
+ } else
+ TOut.emitRRX(OpCode, DstRegOp.getReg(), Mips::ZERO, MemOffsetOp, IDLoc,
+ STI);
+ } else {
warnIfNoMacro(IDLoc);
unsigned ATReg = getATReg(IDLoc);
@@ -2797,7 +3053,12 @@ bool MipsAsmParser::expandBranchImm(MCInst &Inst, SMLoc IDLoc, MCStreamer &Out,
IDLoc, Out, STI))
return true;
- TOut.emitRRX(OpCode, DstRegOp.getReg(), ATReg, MemOffsetOp, IDLoc, STI);
+ if (IsLikely) {
+ TOut.emitRRX(OpCode, DstRegOp.getReg(), ATReg,
+ MCOperand::createExpr(MemOffsetOp.getExpr()), IDLoc, STI);
+ TOut.emitRRI(Mips::SLL, Mips::ZERO, Mips::ZERO, 0, IDLoc, STI);
+ } else
+ TOut.emitRRX(OpCode, DstRegOp.getReg(), ATReg, MemOffsetOp, IDLoc, STI);
}
return false;
}
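The branch-likely immediate pseudos handled above differ from the plain BneImm/BeqImm case in two ways: the offset is re-wrapped as an expression operand, and an explicit delay-slot nop (sll $zero, $zero, 0) follows the branch, since a branch-likely annuls its delay slot on the not-taken path. A sketch of the assumed expansion, printed for a BNEL example (loadImmediate may really emit several instructions; it is shown as a single li for brevity):

    #include <cstdio>

    // Prints the assumed expansion of "bnel $rs, imm, label".
    void printBnelImmExpansion(const char *Rs, long Imm, const char *Label) {
      if (Imm == 0) {
        std::printf("bnel %s, $zero, %s\n", Rs, Label);
      } else {
        std::printf("li   $at, %ld\n", Imm);          // materialize in $at
        std::printf("bnel %s, $at, %s\n", Rs, Label);
      }
      std::printf("sll  $zero, $zero, 0\n");          // delay-slot nop
    }

    int main() { printBnelImmExpansion("$t0", 12345, "1f"); }
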
@@ -2904,9 +3165,9 @@ bool MipsAsmParser::expandLoadStoreMultiple(MCInst &Inst, SMLoc IDLoc,
unsigned Opcode = Inst.getOpcode();
unsigned NewOpcode = Opcode == Mips::SWM_MM ? Mips::SWM32_MM : Mips::LWM32_MM;
- assert (Inst.getOperand(OpNum - 1).isImm() &&
- Inst.getOperand(OpNum - 2).isReg() &&
- Inst.getOperand(OpNum - 3).isReg() && "Invalid instruction operand.");
+ assert(Inst.getOperand(OpNum - 1).isImm() &&
+ Inst.getOperand(OpNum - 2).isReg() &&
+ Inst.getOperand(OpNum - 3).isReg() && "Invalid instruction operand.");
if (OpNum < 8 && Inst.getOperand(OpNum - 1).getImm() <= 60 &&
Inst.getOperand(OpNum - 1).getImm() >= 0 &&
@@ -3185,6 +3446,14 @@ bool MipsAsmParser::expandCondBranches(MCInst &Inst, SMLoc IDLoc,
return false;
}
+// Expand an integer division macro.
+//
+// Notably we don't have to emit a warning when encountering $rt as the $zero
+// register, or 0 as an immediate. processInstruction() has already done that.
+//
+// The destination register can only be $zero when expanding (S)DivIMacro or
+// D(S)DivMacro.
+
bool MipsAsmParser::expandDiv(MCInst &Inst, SMLoc IDLoc, MCStreamer &Out,
const MCSubtargetInfo *STI, const bool IsMips64,
const bool Signed) {
@@ -3200,67 +3469,88 @@ bool MipsAsmParser::expandDiv(MCInst &Inst, SMLoc IDLoc, MCStreamer &Out,
assert(RsRegOp.isReg() && "expected register operand kind");
unsigned RsReg = RsRegOp.getReg();
- const MCOperand &RtRegOp = Inst.getOperand(2);
- assert(RtRegOp.isReg() && "expected register operand kind");
- unsigned RtReg = RtRegOp.getReg();
+ unsigned RtReg;
+ int64_t ImmValue;
+
+ const MCOperand &RtOp = Inst.getOperand(2);
+ assert((RtOp.isReg() || RtOp.isImm()) &&
+ "expected register or immediate operand kind");
+ if (RtOp.isReg())
+ RtReg = RtOp.getReg();
+ else
+ ImmValue = RtOp.getImm();
+
unsigned DivOp;
unsigned ZeroReg;
+ unsigned SubOp;
if (IsMips64) {
DivOp = Signed ? Mips::DSDIV : Mips::DUDIV;
ZeroReg = Mips::ZERO_64;
+ SubOp = Mips::DSUB;
} else {
DivOp = Signed ? Mips::SDIV : Mips::UDIV;
ZeroReg = Mips::ZERO;
+ SubOp = Mips::SUB;
}
bool UseTraps = useTraps();
- if (RsReg == Mips::ZERO || RsReg == Mips::ZERO_64) {
- if (RtReg == Mips::ZERO || RtReg == Mips::ZERO_64)
- Warning(IDLoc, "dividing zero by zero");
- if (IsMips64) {
- if (Signed && (RtReg == Mips::ZERO || RtReg == Mips::ZERO_64)) {
- if (UseTraps) {
- TOut.emitRRI(Mips::TEQ, RtReg, ZeroReg, 0x7, IDLoc, STI);
- return false;
- }
+ if (RtOp.isImm()) {
+ unsigned ATReg = getATReg(IDLoc);
+ if (!ATReg)
+ return true;
+ if (ImmValue == 0) {
+ if (UseTraps)
+ TOut.emitRRI(Mips::TEQ, ZeroReg, ZeroReg, 0x7, IDLoc, STI);
+ else
TOut.emitII(Mips::BREAK, 0x7, 0, IDLoc, STI);
- return false;
- }
+ return false;
+ }
+
+ if (ImmValue == 1) {
+ TOut.emitRRR(Mips::OR, RdReg, RsReg, Mips::ZERO, IDLoc, STI);
+ return false;
+ } else if (Signed && ImmValue == -1) {
+ TOut.emitRRR(SubOp, RdReg, ZeroReg, RsReg, IDLoc, STI);
+ return false;
} else {
- TOut.emitRR(DivOp, RsReg, RtReg, IDLoc, STI);
+ if (loadImmediate(ImmValue, ATReg, Mips::NoRegister, isInt<32>(ImmValue),
+ false, Inst.getLoc(), Out, STI))
+ return true;
+ TOut.emitRR(DivOp, RsReg, ATReg, IDLoc, STI);
+ TOut.emitR(Mips::MFLO, RdReg, IDLoc, STI);
return false;
}
+ return true;
}
+ // If the macro expansion of (d)div(u) would always trap or break, insert
+ // the trap/break and exit. This emits a different sequence to GAS, which
+ // has an inconsistency/missed optimization in that not all cases are
+ // handled equivalently. As the observed behaviour is the same, we're ok.
if (RtReg == Mips::ZERO || RtReg == Mips::ZERO_64) {
- Warning(IDLoc, "division by zero");
- if (Signed) {
- if (UseTraps) {
- TOut.emitRRI(Mips::TEQ, RtReg, ZeroReg, 0x7, IDLoc, STI);
- return false;
- }
-
- TOut.emitII(Mips::BREAK, 0x7, 0, IDLoc, STI);
+ if (UseTraps) {
+ TOut.emitRRI(Mips::TEQ, ZeroReg, ZeroReg, 0x7, IDLoc, STI);
return false;
}
+ TOut.emitII(Mips::BREAK, 0x7, 0, IDLoc, STI);
+ return false;
}
- // FIXME: The values for these two BranchTarget variables may be different in
- // micromips. These magic numbers need to be removed.
- unsigned BranchTargetNoTraps;
- unsigned BranchTarget;
+ // Temporary label for the first branch target.
+ MCContext &Context = TOut.getStreamer().getContext();
+ MCSymbol *BrTarget;
+ MCOperand LabelOp;
if (UseTraps) {
- BranchTarget = IsMips64 ? 12 : 8;
TOut.emitRRI(Mips::TEQ, RtReg, ZeroReg, 0x7, IDLoc, STI);
} else {
- BranchTarget = IsMips64 ? 20 : 16;
- BranchTargetNoTraps = 8;
// Branch to the li instruction.
- TOut.emitRRI(Mips::BNE, RtReg, ZeroReg, BranchTargetNoTraps, IDLoc, STI);
+ BrTarget = Context.createTempSymbol();
+ LabelOp = MCOperand::createExpr(MCSymbolRefExpr::create(BrTarget, Context));
+ TOut.emitRRX(Mips::BNE, RtReg, ZeroReg, LabelOp, IDLoc, STI);
}
TOut.emitRR(DivOp, RsReg, RtReg, IDLoc, STI);
@@ -3269,6 +3559,9 @@ bool MipsAsmParser::expandDiv(MCInst &Inst, SMLoc IDLoc, MCStreamer &Out,
TOut.emitII(Mips::BREAK, 0x7, 0, IDLoc, STI);
if (!Signed) {
+ if (!UseTraps)
+ TOut.getStreamer().EmitLabel(BrTarget);
+
TOut.emitR(Mips::MFLO, RdReg, IDLoc, STI);
return false;
}
@@ -3277,15 +3570,23 @@ bool MipsAsmParser::expandDiv(MCInst &Inst, SMLoc IDLoc, MCStreamer &Out,
if (!ATReg)
return true;
+ if (!UseTraps)
+ TOut.getStreamer().EmitLabel(BrTarget);
+
TOut.emitRRI(Mips::ADDiu, ATReg, ZeroReg, -1, IDLoc, STI);
+
+ // Temporary label for the second branch target.
+ MCSymbol *BrTargetEnd = Context.createTempSymbol();
+ MCOperand LabelOpEnd =
+ MCOperand::createExpr(MCSymbolRefExpr::create(BrTargetEnd, Context));
+
+ // Branch to the mflo instruction.
+ TOut.emitRRX(Mips::BNE, RtReg, ATReg, LabelOpEnd, IDLoc, STI);
+
if (IsMips64) {
- // Branch to the mflo instruction.
- TOut.emitRRI(Mips::BNE, RtReg, ATReg, BranchTarget, IDLoc, STI);
TOut.emitRRI(Mips::ADDiu, ATReg, ZeroReg, 1, IDLoc, STI);
TOut.emitRRI(Mips::DSLL32, ATReg, ATReg, 0x1f, IDLoc, STI);
} else {
- // Branch to the mflo instruction.
- TOut.emitRRI(Mips::BNE, RtReg, ATReg, BranchTarget, IDLoc, STI);
TOut.emitRI(Mips::LUi, ATReg, (uint16_t)0x8000, IDLoc, STI);
}
@@ -3293,10 +3594,12 @@ bool MipsAsmParser::expandDiv(MCInst &Inst, SMLoc IDLoc, MCStreamer &Out,
TOut.emitRRI(Mips::TEQ, RsReg, ATReg, 0x6, IDLoc, STI);
else {
// Branch to the mflo instruction.
- TOut.emitRRI(Mips::BNE, RsReg, ATReg, BranchTargetNoTraps, IDLoc, STI);
+ TOut.emitRRX(Mips::BNE, RsReg, ATReg, LabelOpEnd, IDLoc, STI);
TOut.emitRRI(Mips::SLL, ZeroReg, ZeroReg, 0, IDLoc, STI);
TOut.emitII(Mips::BREAK, 0x6, 0, IDLoc, STI);
}
+
+ TOut.getStreamer().EmitLabel(BrTargetEnd);
TOut.emitR(Mips::MFLO, RdReg, IDLoc, STI);
return false;
}
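With the temporary labels in place of the old magic byte offsets, the signed 32-bit no-traps expansion of the div macro reads as below ($L1 and $L2 stand for BrTarget and BrTargetEnd; the exact label spellings are an assumption). The listing is kept as a string constant purely for illustration:

    // Assumed expansion of "div $rd, $rs, $rt" (signed, 32-bit, traps off):
    static const char *const ExpandedDivNoTraps =
        "      bne   $rt, $zero, $L1\n" // divisor != 0: skip the break
        "      div   $rs, $rt\n"        // executes in the delay slot
        "      break 7\n"               // divide-by-zero
        "$L1:  addiu $at, $zero, -1\n"
        "      bne   $rt, $at, $L2\n"   // divisor != -1: no overflow
        "      lui   $at, 0x8000\n"     // INT_MIN check (delay slot)
        "      bne   $rs, $at, $L2\n"
        "      sll   $zero, $zero, 0\n" // delay-slot nop
        "      break 6\n"               // INT_MIN / -1 overflows
        "$L2:  mflo  $rd\n";
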
@@ -3503,10 +3806,10 @@ bool MipsAsmParser::expandAliasImmediate(MCInst &Inst, SMLoc IDLoc,
const MCSubtargetInfo *STI) {
MipsTargetStreamer &TOut = getTargetStreamer();
- assert (Inst.getNumOperands() == 3 && "Invalid operand count");
- assert (Inst.getOperand(0).isReg() &&
- Inst.getOperand(1).isReg() &&
- Inst.getOperand(2).isImm() && "Invalid instruction operand.");
+ assert(Inst.getNumOperands() == 3 && "Invalid operand count");
+ assert(Inst.getOperand(0).isReg() &&
+ Inst.getOperand(1).isReg() &&
+ Inst.getOperand(2).isImm() && "Invalid instruction operand.");
unsigned ATReg = Mips::NoRegister;
unsigned FinalDstReg = Mips::NoRegister;
@@ -3514,7 +3817,7 @@ bool MipsAsmParser::expandAliasImmediate(MCInst &Inst, SMLoc IDLoc,
unsigned SrcReg = Inst.getOperand(1).getReg();
int64_t ImmValue = Inst.getOperand(2).getImm();
- bool Is32Bit = isInt<32>(ImmValue) || isUInt<32>(ImmValue);
+ bool Is32Bit = isInt<32>(ImmValue) || (!isGP64bit() && isUInt<32>(ImmValue));
unsigned FinalOpcode = Inst.getOpcode();
@@ -3530,30 +3833,69 @@ bool MipsAsmParser::expandAliasImmediate(MCInst &Inst, SMLoc IDLoc,
switch (FinalOpcode) {
default:
llvm_unreachable("unimplemented expansion");
- case (Mips::ADDi):
+ case Mips::ADDi:
FinalOpcode = Mips::ADD;
break;
- case (Mips::ADDiu):
+ case Mips::ADDiu:
FinalOpcode = Mips::ADDu;
break;
- case (Mips::ANDi):
+ case Mips::ANDi:
FinalOpcode = Mips::AND;
break;
- case (Mips::NORImm):
+ case Mips::NORImm:
FinalOpcode = Mips::NOR;
break;
- case (Mips::ORi):
+ case Mips::ORi:
FinalOpcode = Mips::OR;
break;
- case (Mips::SLTi):
+ case Mips::SLTi:
FinalOpcode = Mips::SLT;
break;
- case (Mips::SLTiu):
+ case Mips::SLTiu:
FinalOpcode = Mips::SLTu;
break;
- case (Mips::XORi):
+ case Mips::XORi:
FinalOpcode = Mips::XOR;
break;
+ case Mips::ADDi_MM:
+ FinalOpcode = Mips::ADD_MM;
+ break;
+ case Mips::ADDiu_MM:
+ FinalOpcode = Mips::ADDu_MM;
+ break;
+ case Mips::ANDi_MM:
+ FinalOpcode = Mips::AND_MM;
+ break;
+ case Mips::ORi_MM:
+ FinalOpcode = Mips::OR_MM;
+ break;
+ case Mips::SLTi_MM:
+ FinalOpcode = Mips::SLT_MM;
+ break;
+ case Mips::SLTiu_MM:
+ FinalOpcode = Mips::SLTu_MM;
+ break;
+ case Mips::XORi_MM:
+ FinalOpcode = Mips::XOR_MM;
+ break;
+ case Mips::ANDi64:
+ FinalOpcode = Mips::AND64;
+ break;
+ case Mips::NORImm64:
+ FinalOpcode = Mips::NOR64;
+ break;
+ case Mips::ORi64:
+ FinalOpcode = Mips::OR64;
+ break;
+ case Mips::SLTImm64:
+ FinalOpcode = Mips::SLT64;
+ break;
+ case Mips::SLTUImm64:
+ FinalOpcode = Mips::SLTu64;
+ break;
+ case Mips::XORi64:
+ FinalOpcode = Mips::XOR64;
+ break;
}
if (FinalDstReg == Mips::NoRegister)
@@ -3578,7 +3920,6 @@ bool MipsAsmParser::expandRotation(MCInst &Inst, SMLoc IDLoc, MCStreamer &Out,
unsigned SecondShift = Mips::NOP;
if (hasMips32r2()) {
-
if (DReg == SReg) {
TmpReg = getATReg(Inst.getLoc());
if (!TmpReg)
@@ -3600,7 +3941,6 @@ bool MipsAsmParser::expandRotation(MCInst &Inst, SMLoc IDLoc, MCStreamer &Out,
}
if (hasMips32()) {
-
switch (Inst.getOpcode()) {
default:
llvm_unreachable("unexpected instruction opcode");
@@ -3642,7 +3982,6 @@ bool MipsAsmParser::expandRotationImm(MCInst &Inst, SMLoc IDLoc,
unsigned SecondShift = Mips::NOP;
if (hasMips32r2()) {
-
if (Inst.getOpcode() == Mips::ROLImm) {
uint64_t MaxShift = 32;
uint64_t ShiftValue = ImmValue;
@@ -3661,7 +4000,6 @@ bool MipsAsmParser::expandRotationImm(MCInst &Inst, SMLoc IDLoc,
}
if (hasMips32()) {
-
if (ImmValue == 0) {
TOut.emitRRI(Mips::SRL, DReg, SReg, 0, Inst.getLoc(), STI);
return false;
@@ -3707,7 +4045,6 @@ bool MipsAsmParser::expandDRotation(MCInst &Inst, SMLoc IDLoc, MCStreamer &Out,
unsigned SecondShift = Mips::NOP;
if (hasMips64r2()) {
-
if (TmpReg == SReg) {
TmpReg = getATReg(Inst.getLoc());
if (!TmpReg)
@@ -3729,7 +4066,6 @@ bool MipsAsmParser::expandDRotation(MCInst &Inst, SMLoc IDLoc, MCStreamer &Out,
}
if (hasMips64()) {
-
switch (Inst.getOpcode()) {
default:
llvm_unreachable("unexpected instruction opcode");
@@ -3773,7 +4109,6 @@ bool MipsAsmParser::expandDRotationImm(MCInst &Inst, SMLoc IDLoc,
MCInst TmpInst;
if (hasMips64r2()) {
-
unsigned FinalOpcode = Mips::NOP;
if (ImmValue == 0)
FinalOpcode = Mips::DROTR;
@@ -3801,7 +4136,6 @@ bool MipsAsmParser::expandDRotationImm(MCInst &Inst, SMLoc IDLoc,
}
if (hasMips64()) {
-
if (ImmValue == 0) {
TOut.emitRRI(Mips::DSRL, DReg, SReg, 0, Inst.getLoc(), STI);
return false;
@@ -3871,6 +4205,119 @@ bool MipsAsmParser::expandAbs(MCInst &Inst, SMLoc IDLoc, MCStreamer &Out,
return false;
}
+bool MipsAsmParser::expandMulImm(MCInst &Inst, SMLoc IDLoc, MCStreamer &Out,
+ const MCSubtargetInfo *STI) {
+ MipsTargetStreamer &TOut = getTargetStreamer();
+ unsigned ATReg = Mips::NoRegister;
+ unsigned DstReg = Inst.getOperand(0).getReg();
+ unsigned SrcReg = Inst.getOperand(1).getReg();
+ int32_t ImmValue = Inst.getOperand(2).getImm();
+
+ ATReg = getATReg(IDLoc);
+ if (!ATReg)
+ return true;
+
+ loadImmediate(ImmValue, ATReg, Mips::NoRegister, true, false, IDLoc, Out, STI);
+
+ TOut.emitRR(Inst.getOpcode() == Mips::MULImmMacro ? Mips::MULT : Mips::DMULT,
+ SrcReg, ATReg, IDLoc, STI);
+
+ TOut.emitR(Mips::MFLO, DstReg, IDLoc, STI);
+
+ return false;
+}
+
+bool MipsAsmParser::expandMulO(MCInst &Inst, SMLoc IDLoc, MCStreamer &Out,
+ const MCSubtargetInfo *STI) {
+ MipsTargetStreamer &TOut = getTargetStreamer();
+ unsigned ATReg = Mips::NoRegister;
+ unsigned DstReg = Inst.getOperand(0).getReg();
+ unsigned SrcReg = Inst.getOperand(1).getReg();
+ unsigned TmpReg = Inst.getOperand(2).getReg();
+
+ ATReg = getATReg(Inst.getLoc());
+ if (!ATReg)
+ return true;
+
+ TOut.emitRR(Inst.getOpcode() == Mips::MULOMacro ? Mips::MULT : Mips::DMULT,
+ SrcReg, TmpReg, IDLoc, STI);
+
+ TOut.emitR(Mips::MFLO, DstReg, IDLoc, STI);
+
+ TOut.emitRRI(Inst.getOpcode() == Mips::MULOMacro ? Mips::SRA : Mips::DSRA32,
+ DstReg, DstReg, 0x1F, IDLoc, STI);
+
+ TOut.emitR(Mips::MFHI, ATReg, IDLoc, STI);
+
+ if (useTraps()) {
+ TOut.emitRRI(Mips::TNE, DstReg, ATReg, 6, IDLoc, STI);
+ } else {
+ MCContext & Context = TOut.getStreamer().getContext();
+ MCSymbol * BrTarget = Context.createTempSymbol();
+ MCOperand LabelOp =
+ MCOperand::createExpr(MCSymbolRefExpr::create(BrTarget, Context));
+
+ TOut.emitRRX(Mips::BEQ, DstReg, ATReg, LabelOp, IDLoc, STI);
+ if (AssemblerOptions.back()->isReorder())
+ TOut.emitNop(IDLoc, STI);
+ TOut.emitII(Mips::BREAK, 6, 0, IDLoc, STI);
+
+ TOut.getStreamer().EmitLabel(BrTarget);
+ }
+ TOut.emitR(Mips::MFLO, DstReg, IDLoc, STI);
+
+ return false;
+}
+
+bool MipsAsmParser::expandMulOU(MCInst &Inst, SMLoc IDLoc, MCStreamer &Out,
+ const MCSubtargetInfo *STI) {
+ MipsTargetStreamer &TOut = getTargetStreamer();
+ unsigned ATReg = Mips::NoRegister;
+ unsigned DstReg = Inst.getOperand(0).getReg();
+ unsigned SrcReg = Inst.getOperand(1).getReg();
+ unsigned TmpReg = Inst.getOperand(2).getReg();
+
+ ATReg = getATReg(IDLoc);
+ if (!ATReg)
+ return true;
+
+ TOut.emitRR(Inst.getOpcode() == Mips::MULOUMacro ? Mips::MULTu : Mips::DMULTu,
+ SrcReg, TmpReg, IDLoc, STI);
+
+ TOut.emitR(Mips::MFHI, ATReg, IDLoc, STI);
+ TOut.emitR(Mips::MFLO, DstReg, IDLoc, STI);
+ if (useTraps()) {
+ TOut.emitRRI(Mips::TNE, ATReg, Mips::ZERO, 6, IDLoc, STI);
+ } else {
+ MCContext & Context = TOut.getStreamer().getContext();
+ MCSymbol * BrTarget = Context.createTempSymbol();
+ MCOperand LabelOp =
+ MCOperand::createExpr(MCSymbolRefExpr::create(BrTarget, Context));
+
+ TOut.emitRRX(Mips::BEQ, ATReg, Mips::ZERO, LabelOp, IDLoc, STI);
+ if (AssemblerOptions.back()->isReorder())
+ TOut.emitNop(IDLoc, STI);
+ TOut.emitII(Mips::BREAK, 6, 0, IDLoc, STI);
+
+ TOut.getStreamer().EmitLabel(BrTarget);
+ }
+
+ return false;
+}
+
+bool MipsAsmParser::expandDMULMacro(MCInst &Inst, SMLoc IDLoc, MCStreamer &Out,
+ const MCSubtargetInfo *STI) {
+ MipsTargetStreamer &TOut = getTargetStreamer();
+ unsigned DstReg = Inst.getOperand(0).getReg();
+ unsigned SrcReg = Inst.getOperand(1).getReg();
+ unsigned TmpReg = Inst.getOperand(2).getReg();
+
+ TOut.emitRR(Mips::DMULTu, SrcReg, TmpReg, IDLoc, STI);
+ TOut.emitR(Mips::MFLO, DstReg, IDLoc, STI);
+
+ return false;
+}
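
For reference, the overflow tests that mulo and mulou expand to (SRA/DSRA32 of LO compared against HI, and HI compared against $zero, respectively) are equivalent to the following C++; this is my restatement, not code from the patch, and it assumes two's-complement narrowing and arithmetic right shift:

    #include <cstdint>

    // mulo: a signed 32x32 multiply overflows iff the high word of the 64-bit
    // product differs from the sign-extension of the low word (SRA low, 31).
    static bool signedMulOverflows(int32_t A, int32_t B) {
      int64_t P = int64_t(A) * int64_t(B);
      int32_t Lo = int32_t(uint64_t(P));
      int32_t Hi = int32_t(uint64_t(P) >> 32);
      return Hi != (Lo >> 31);
    }

    // mulou: an unsigned multiply overflows iff any product bit reaches HI.
    static bool unsignedMulOverflows(uint32_t A, uint32_t B) {
      return (uint64_t(A) * uint64_t(B)) >> 32 != 0;
    }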
+
static unsigned nextReg(unsigned Reg) {
switch (Reg) {
case Mips::ZERO: return Mips::AT;
@@ -3985,7 +4432,6 @@ bool MipsAsmParser::expandSeq(MCInst &Inst, SMLoc IDLoc, MCStreamer &Out,
bool MipsAsmParser::expandSeqI(MCInst &Inst, SMLoc IDLoc, MCStreamer &Out,
const MCSubtargetInfo *STI) {
-
warnIfNoMacro(IDLoc);
MipsTargetStreamer &TOut = getTargetStreamer();
@@ -4158,17 +4604,15 @@ bool MipsAsmParser::MatchAndEmitInstruction(SMLoc IDLoc, unsigned &Opcode,
MCStreamer &Out,
uint64_t &ErrorInfo,
bool MatchingInlineAsm) {
-
MCInst Inst;
unsigned MatchResult =
MatchInstructionImpl(Operands, Inst, ErrorInfo, MatchingInlineAsm);
switch (MatchResult) {
- case Match_Success: {
+ case Match_Success:
if (processInstruction(Inst, IDLoc, Out, STI))
return true;
return false;
- }
case Match_MissingFeature:
Error(IDLoc, "instruction requires a CPU feature not currently enabled");
return true;
@@ -4441,7 +4885,6 @@ int MipsAsmParser::matchHWRegsRegisterName(StringRef Name) {
}
int MipsAsmParser::matchFPURegisterName(StringRef Name) {
-
if (Name[0] == 'f') {
StringRef NumString = Name.substr(1);
unsigned IntVal;
@@ -4455,7 +4898,6 @@ int MipsAsmParser::matchFPURegisterName(StringRef Name) {
}
int MipsAsmParser::matchFCCRegisterName(StringRef Name) {
-
if (Name.startswith("fcc")) {
StringRef NumString = Name.substr(3);
unsigned IntVal;
@@ -4469,7 +4911,6 @@ int MipsAsmParser::matchFCCRegisterName(StringRef Name) {
}
int MipsAsmParser::matchACRegisterName(StringRef Name) {
-
if (Name.startswith("ac")) {
StringRef NumString = Name.substr(2);
unsigned IntVal;
@@ -4511,6 +4952,10 @@ int MipsAsmParser::matchMSA128CtrlRegisterName(StringRef Name) {
return CC;
}
+bool MipsAsmParser::canUseATReg() {
+ return AssemblerOptions.back()->getATRegIndex() != 0;
+}
+
unsigned MipsAsmParser::getATReg(SMLoc Loc) {
unsigned ATIndex = AssemblerOptions.back()->getATRegIndex();
if (ATIndex == 0) {
@@ -4589,7 +5034,6 @@ bool MipsAsmParser::parseOperand(OperandVector &Operands, StringRef Mnemonic) {
}
bool MipsAsmParser::isEvaluated(const MCExpr *Expr) {
-
switch (Expr->getKind()) {
case MCExpr::Constant:
return true;
@@ -5522,7 +5966,7 @@ bool MipsAsmParser::parseSetPushDirective() {
// Create a copy of the current assembler options environment and push it.
AssemblerOptions.push_back(
- make_unique<MipsAssemblerOptions>(AssemblerOptions.back().get()));
+ llvm::make_unique<MipsAssemblerOptions>(AssemblerOptions.back().get()));
getTargetStreamer().emitDirectiveSetPush();
return false;
@@ -5914,6 +6358,14 @@ bool MipsAsmParser::parseDirectiveSet() {
return parseSetAtDirective();
} else if (Tok.getString() == "arch") {
return parseSetArchDirective();
+ } else if (Tok.getString() == "bopt") {
+ Warning(Tok.getLoc(), "'bopt' feature is unsupported");
+ getParser().Lex();
+ return false;
+ } else if (Tok.getString() == "nobopt") {
+ // We're already running in nobopt mode, so nothing to do.
+ getParser().Lex();
+ return false;
} else if (Tok.getString() == "fp") {
return parseSetFpDirective();
} else if (Tok.getString() == "oddspreg") {
@@ -6001,7 +6453,7 @@ bool MipsAsmParser::parseDirectiveSet() {
bool MipsAsmParser::parseDataDirective(unsigned Size, SMLoc L) {
MCAsmParser &Parser = getParser();
if (getLexer().isNot(AsmToken::EndOfStatement)) {
- for (;;) {
+ while (true) {
const MCExpr *Value;
if (getParser().parseExpression(Value))
return true;
@@ -6773,3 +7225,15 @@ extern "C" void LLVMInitializeMipsAsmParser() {
#define GET_REGISTER_MATCHER
#define GET_MATCHER_IMPLEMENTATION
#include "MipsGenAsmMatcher.inc"
+
+bool MipsAsmParser::mnemonicIsValid(StringRef Mnemonic, unsigned VariantID) {
+ // Find the appropriate table for this asm variant.
+ const MatchEntry *Start, *End;
+ switch (VariantID) {
+ default: llvm_unreachable("invalid variant!");
+ case 0: Start = std::begin(MatchTable0); End = std::end(MatchTable0); break;
+ }
+ // Search the table.
+ auto MnemonicRange = std::equal_range(Start, End, Mnemonic, LessOpcode());
+ return MnemonicRange.first != MnemonicRange.second;
+}
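
mnemonicIsValid relies on the generated MatchTable0 being sorted by mnemonic, so std::equal_range with an asymmetric comparator amounts to a binary search. The same idiom in standalone form (Entry and LessName are invented names; the real MatchEntry/LessOpcode come from MipsGenAsmMatcher.inc):

    #include <algorithm>
    #include <cstring>

    struct Entry { const char *Mnemonic; /* opcode, operand classes, ... */ };

    // equal_range needs the comparator callable in both argument orders.
    struct LessName {
      bool operator()(const Entry &E, const char *S) const {
        return std::strcmp(E.Mnemonic, S) < 0;
      }
      bool operator()(const char *S, const Entry &E) const {
        return std::strcmp(S, E.Mnemonic) < 0;
      }
    };

    static bool tableHasMnemonic(const Entry *First, const Entry *Last,
                                 const char *Mnemonic) {
      auto Range = std::equal_range(First, Last, Mnemonic, LessName());
      return Range.first != Range.second;
    }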
diff --git a/lib/Target/Mips/Disassembler/MipsDisassembler.cpp b/lib/Target/Mips/Disassembler/MipsDisassembler.cpp
index f80efb18507b..ecdf6b0de6e7 100644
--- a/lib/Target/Mips/Disassembler/MipsDisassembler.cpp
+++ b/lib/Target/Mips/Disassembler/MipsDisassembler.cpp
@@ -1,4 +1,4 @@
-//===- MipsDisassembler.cpp - Disassembler for Mips -------------*- C++ -*-===//
+//===- MipsDisassembler.cpp - Disassembler for Mips -----------------------===//
//
// The LLVM Compiler Infrastructure
//
@@ -12,15 +12,21 @@
//===----------------------------------------------------------------------===//
#include "Mips.h"
-#include "MipsRegisterInfo.h"
-#include "MipsSubtarget.h"
+#include "llvm/ADT/ArrayRef.h"
#include "llvm/MC/MCContext.h"
#include "llvm/MC/MCDisassembler/MCDisassembler.h"
#include "llvm/MC/MCFixedLenDisassembler.h"
#include "llvm/MC/MCInst.h"
#include "llvm/MC/MCSubtargetInfo.h"
+#include "llvm/MC/MCRegisterInfo.h"
+#include "llvm/Support/Compiler.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/ErrorHandling.h"
#include "llvm/Support/MathExtras.h"
+#include "llvm/Support/raw_ostream.h"
#include "llvm/Support/TargetRegistry.h"
+#include <cassert>
+#include <cstdint>
using namespace llvm;
@@ -33,6 +39,7 @@ namespace {
class MipsDisassembler : public MCDisassembler {
bool IsMicroMips;
bool IsBigEndian;
+
public:
MipsDisassembler(const MCSubtargetInfo &STI, MCContext &Ctx, bool IsBigEndian)
: MCDisassembler(STI, Ctx),
@@ -42,9 +49,11 @@ public:
bool hasMips2() const { return STI.getFeatureBits()[Mips::FeatureMips2]; }
bool hasMips3() const { return STI.getFeatureBits()[Mips::FeatureMips3]; }
bool hasMips32() const { return STI.getFeatureBits()[Mips::FeatureMips32]; }
+
bool hasMips32r6() const {
return STI.getFeatureBits()[Mips::FeatureMips32r6];
}
+
bool isFP64() const { return STI.getFeatureBits()[Mips::FeatureFP64Bit]; }
bool isGP64() const { return STI.getFeatureBits()[Mips::FeatureGP64Bit]; }
@@ -527,11 +536,13 @@ static DecodeStatus DecodeMovePRegPair(MCInst &Inst, unsigned Insn,
const void *Decoder);
namespace llvm {
+
Target &getTheMipselTarget();
Target &getTheMipsTarget();
Target &getTheMips64Target();
Target &getTheMips64elTarget();
-}
+
+} // end namespace llvm
static MCDisassembler *createMipsDisassembler(
const Target &T,
@@ -1106,6 +1117,7 @@ DecodeStatus MipsDisassembler::getInstruction(MCInst &Instr, uint64_t &Size,
raw_ostream &CStream) const {
uint32_t Insn;
DecodeStatus Result;
+ Size = 0;
if (IsMicroMips) {
Result = readInstruction16(Bytes, Address, Size, Insn, IsBigEndian);
@@ -1168,98 +1180,88 @@ DecodeStatus MipsDisassembler::getInstruction(MCInst &Instr, uint64_t &Size,
}
}
- // This is an invalid instruction. Let the disassembler move forward by the
- // minimum instruction size.
+ // This is an invalid instruction. Claim that the Size is 2 bytes. Since
+ // microMIPS instructions have a minimum alignment of 2, the next 2 bytes
+ // could form a valid instruction. The two bytes we rejected as an
+ // instruction could have actually been an inline constant pool that is

+ // unconditionally branched over.
Size = 2;
return MCDisassembler::Fail;
}
+ // Attempt to read the instruction so that we can attempt to decode it. If
+ // the buffer is not 4 bytes long, let the higher level logic figure out
+ // what to do with a size of zero and MCDisassembler::Fail.
Result = readInstruction32(Bytes, Address, Size, Insn, IsBigEndian, false);
- if (Result == MCDisassembler::Fail) {
- Size = 4;
+ if (Result == MCDisassembler::Fail)
return MCDisassembler::Fail;
- }
+
+ // The only instruction size for standard encoded MIPS.
+ Size = 4;
if (hasCOP3()) {
DEBUG(dbgs() << "Trying COP3_ table (32-bit opcodes):\n");
Result =
decodeInstruction(DecoderTableCOP3_32, Instr, Insn, Address, this, STI);
- if (Result != MCDisassembler::Fail) {
- Size = 4;
+ if (Result != MCDisassembler::Fail)
return Result;
- }
}
if (hasMips32r6() && isGP64()) {
DEBUG(dbgs() << "Trying Mips32r6_64r6 (GPR64) table (32-bit opcodes):\n");
Result = decodeInstruction(DecoderTableMips32r6_64r6_GP6432, Instr, Insn,
Address, this, STI);
- if (Result != MCDisassembler::Fail) {
- Size = 4;
+ if (Result != MCDisassembler::Fail)
return Result;
- }
}
if (hasMips32r6() && isPTR64()) {
DEBUG(dbgs() << "Trying Mips32r6_64r6 (PTR64) table (32-bit opcodes):\n");
Result = decodeInstruction(DecoderTableMips32r6_64r6_PTR6432, Instr, Insn,
Address, this, STI);
- if (Result != MCDisassembler::Fail) {
- Size = 4;
+ if (Result != MCDisassembler::Fail)
return Result;
- }
}
if (hasMips32r6()) {
DEBUG(dbgs() << "Trying Mips32r6_64r6 table (32-bit opcodes):\n");
Result = decodeInstruction(DecoderTableMips32r6_64r632, Instr, Insn,
Address, this, STI);
- if (Result != MCDisassembler::Fail) {
- Size = 4;
+ if (Result != MCDisassembler::Fail)
return Result;
- }
}
if (hasMips2() && isPTR64()) {
DEBUG(dbgs() << "Trying Mips32r6_64r6 (PTR64) table (32-bit opcodes):\n");
Result = decodeInstruction(DecoderTableMips32_64_PTR6432, Instr, Insn,
Address, this, STI);
- if (Result != MCDisassembler::Fail) {
- Size = 4;
+ if (Result != MCDisassembler::Fail)
return Result;
- }
}
if (hasCnMips()) {
DEBUG(dbgs() << "Trying CnMips table (32-bit opcodes):\n");
Result = decodeInstruction(DecoderTableCnMips32, Instr, Insn,
Address, this, STI);
- if (Result != MCDisassembler::Fail) {
- Size = 4;
+ if (Result != MCDisassembler::Fail)
return Result;
- }
}
if (isGP64()) {
DEBUG(dbgs() << "Trying Mips64 (GPR64) table (32-bit opcodes):\n");
Result = decodeInstruction(DecoderTableMips6432, Instr, Insn,
Address, this, STI);
- if (Result != MCDisassembler::Fail) {
- Size = 4;
+ if (Result != MCDisassembler::Fail)
return Result;
- }
}
DEBUG(dbgs() << "Trying Mips table (32-bit opcodes):\n");
// Calling the auto-generated decoder function.
Result =
decodeInstruction(DecoderTableMips32, Instr, Insn, Address, this, STI);
- if (Result != MCDisassembler::Fail) {
- Size = 4;
+ if (Result != MCDisassembler::Fail)
return Result;
- }
- Size = 4;
return MCDisassembler::Fail;
}
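
A caller's-eye sketch of the Size contract this rewrite establishes (assuming the MCDisassembler interface of this era, with the two raw_ostream parameters): Size is 0 only when the buffer was too short to read anything; on Fail it still reports 2 (microMIPS) or 4 bytes consumed, so a driver can step over undecodable words, such as inline constant pools, and resynchronize:

    #include "llvm/ADT/ArrayRef.h"
    #include "llvm/MC/MCDisassembler/MCDisassembler.h"
    #include "llvm/MC/MCInst.h"
    #include "llvm/Support/raw_ostream.h"
    using namespace llvm;

    static void disassembleBuffer(const MCDisassembler &DisAsm,
                                  ArrayRef<uint8_t> Bytes, uint64_t Address) {
      uint64_t Offset = 0;
      while (Offset < Bytes.size()) {
        MCInst Inst;
        uint64_t Size = 0;
        MCDisassembler::DecodeStatus S = DisAsm.getInstruction(
            Inst, Size, Bytes.slice(Offset), Address + Offset, nulls(), nulls());
        if (Size == 0)
          break;        // buffer exhausted mid-instruction
        Offset += Size; // advances past failed 2/4-byte units too
        (void)S;        // on Success, Inst would be printed or recorded here
      }
    }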
@@ -1267,16 +1269,13 @@ static DecodeStatus DecodeCPU16RegsRegisterClass(MCInst &Inst,
unsigned RegNo,
uint64_t Address,
const void *Decoder) {
-
return MCDisassembler::Fail;
-
}
static DecodeStatus DecodeGPR64RegisterClass(MCInst &Inst,
unsigned RegNo,
uint64_t Address,
const void *Decoder) {
-
if (RegNo > 31)
return MCDisassembler::Fail;
@@ -1620,7 +1619,7 @@ static DecodeStatus DecodeMSA128Mem(MCInst &Inst, unsigned Insn,
switch(Inst.getOpcode())
{
default:
- assert (0 && "Unexpected instruction");
+ assert(false && "Unexpected instruction");
return MCDisassembler::Fail;
break;
case Mips::LD_B:
@@ -1980,7 +1979,6 @@ static DecodeStatus DecodeAFGR64RegisterClass(MCInst &Inst,
if (RegNo > 30 || RegNo %2)
return MCDisassembler::Fail;
- ;
unsigned Reg = getReg(Decoder, Mips::AFGR64RegClassID, RegNo /2);
Inst.addOperand(MCOperand::createReg(Reg));
return MCDisassembler::Success;
@@ -2128,7 +2126,6 @@ static DecodeStatus DecodeJumpTarget(MCInst &Inst,
unsigned Insn,
uint64_t Address,
const void *Decoder) {
-
unsigned JumpOffset = fieldFromInstruction(Insn, 0, 26) << 2;
Inst.addOperand(MCOperand::createImm(JumpOffset));
return MCDisassembler::Success;
@@ -2267,7 +2264,14 @@ static DecodeStatus DecodeInsSize(MCInst &Inst,
const void *Decoder) {
// First we need to grab the pos(lsb) from MCInst.
int Pos = Inst.getOperand(2).getImm();
- int Size = (int) Insn - Pos + 1;
+ if (Inst.getOpcode() == Mips::DINSU)
+ Pos += 32;
+ int Size;
+ if (Inst.getOpcode() == Mips::DINSM ||
+ Inst.getOpcode() == Mips::DINSU)
+ Size = (int) Insn - Pos + 33;
+ else
+ Size = (int) Insn - Pos + 1;
Inst.addOperand(MCOperand::createImm(SignExtend32<16>(Size)));
return MCDisassembler::Success;
}
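
To make the +33 concrete (a worked example, not part of the patch): the raw field holds msb for DINS but msb-32 for DINSM and DINSU, and DINSU's position operand was likewise decoded as pos-32, hence the Pos += 32 above. The size is always msb - pos + 1:

    #include <cassert>

    // size = msb - pos + 1; DINSM/DINSU store msb-32 in the instruction field.
    static int insFieldSize(unsigned MsbField, int Pos, bool MsbBiasedBy32) {
      int Msb = int(MsbField) + (MsbBiasedBy32 ? 32 : 0);
      return Msb - Pos + 1;
    }

    int main() {
      assert(insFieldSize(10, 5, false) == 6);  // DINS:  msb 10, size 6
      assert(insFieldSize(10, 5, true) == 38);  // DINSM: msb 42, size 38
    }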
@@ -2363,7 +2367,6 @@ static DecodeStatus DecodeRegListOperand16(MCInst &Inst, unsigned Insn,
static DecodeStatus DecodeMovePRegPair(MCInst &Inst, unsigned Insn,
uint64_t Address, const void *Decoder) {
-
unsigned RegPair = fieldFromInstruction(Insn, 7, 3);
switch (RegPair) {
diff --git a/lib/Target/Mips/MCTargetDesc/MipsABIFlagsSection.cpp b/lib/Target/Mips/MCTargetDesc/MipsABIFlagsSection.cpp
index 932d38a0b9fe..4a2b75b9ae46 100644
--- a/lib/Target/Mips/MCTargetDesc/MipsABIFlagsSection.cpp
+++ b/lib/Target/Mips/MCTargetDesc/MipsABIFlagsSection.cpp
@@ -1,4 +1,4 @@
-//===-- MipsABIFlagsSection.cpp - Mips ELF ABI Flags Section ---*- C++ -*--===//
+//===- MipsABIFlagsSection.cpp - Mips ELF ABI Flags Section ---------------===//
//
// The LLVM Compiler Infrastructure
//
@@ -7,7 +7,11 @@
//
//===----------------------------------------------------------------------===//
-#include "MipsABIFlagsSection.h"
+#include "MCTargetDesc/MipsABIFlagsSection.h"
+#include "llvm/ADT/StringRef.h"
+#include "llvm/MC/MCStreamer.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/MipsABIFlags.h"
using namespace llvm;
@@ -51,6 +55,7 @@ uint8_t MipsABIFlagsSection::getCPR1SizeValue() {
}
namespace llvm {
+
MCStreamer &operator<<(MCStreamer &OS, MipsABIFlagsSection &ABIFlagsSection) {
// Write out an Elf_Internal_ABIFlags_v0 struct
OS.EmitIntValue(ABIFlagsSection.getVersionValue(), 2); // version
@@ -66,4 +71,5 @@ MCStreamer &operator<<(MCStreamer &OS, MipsABIFlagsSection &ABIFlagsSection) {
OS.EmitIntValue(ABIFlagsSection.getFlags2Value(), 4); // flags2
return OS;
}
-}
+
+} // end namespace llvm
diff --git a/lib/Target/Mips/MCTargetDesc/MipsABIFlagsSection.h b/lib/Target/Mips/MCTargetDesc/MipsABIFlagsSection.h
index 3966cae9fe33..f38541027023 100644
--- a/lib/Target/Mips/MCTargetDesc/MipsABIFlagsSection.h
+++ b/lib/Target/Mips/MCTargetDesc/MipsABIFlagsSection.h
@@ -1,4 +1,4 @@
-//===-- MipsABIFlagsSection.h - Mips ELF ABI Flags Section -----*- C++ -*--===//
+//===- MipsABIFlagsSection.h - Mips ELF ABI Flags Section -------*- C++ -*-===//
//
// The LLVM Compiler Infrastructure
//
@@ -10,9 +10,10 @@
#ifndef LLVM_LIB_TARGET_MIPS_MCTARGETDESC_MIPSABIFLAGSSECTION_H
#define LLVM_LIB_TARGET_MIPS_MCTARGETDESC_MIPSABIFLAGSSECTION_H
-#include "llvm/MC/MCStreamer.h"
+#include "llvm/ADT/StringRef.h"
#include "llvm/Support/ErrorHandling.h"
#include "llvm/Support/MipsABIFlags.h"
+#include <cstdint>
namespace llvm {
@@ -23,36 +24,32 @@ struct MipsABIFlagsSection {
enum class FpABIKind { ANY, XX, S32, S64, SOFT };
// Version of flags structure.
- uint16_t Version;
+ uint16_t Version = 0;
// The level of the ISA: 1-5, 32, 64.
- uint8_t ISALevel;
+ uint8_t ISALevel = 0;
// The revision of ISA: 0 for MIPS V and below, 1-n otherwise.
- uint8_t ISARevision;
+ uint8_t ISARevision = 0;
// The size of general purpose registers.
- Mips::AFL_REG GPRSize;
+ Mips::AFL_REG GPRSize = Mips::AFL_REG_NONE;
// The size of co-processor 1 registers.
- Mips::AFL_REG CPR1Size;
+ Mips::AFL_REG CPR1Size = Mips::AFL_REG_NONE;
// The size of co-processor 2 registers.
- Mips::AFL_REG CPR2Size;
+ Mips::AFL_REG CPR2Size = Mips::AFL_REG_NONE;
// Processor-specific extension.
- Mips::AFL_EXT ISAExtension;
+ Mips::AFL_EXT ISAExtension = Mips::AFL_EXT_NONE;
// Mask of ASEs used.
- uint32_t ASESet;
+ uint32_t ASESet = 0;
- bool OddSPReg;
+ bool OddSPReg = false;
- bool Is32BitABI;
+ bool Is32BitABI = false;
protected:
// The floating-point ABI.
- FpABIKind FpABI;
+ FpABIKind FpABI = FpABIKind::ANY;
public:
- MipsABIFlagsSection()
- : Version(0), ISALevel(0), ISARevision(0), GPRSize(Mips::AFL_REG_NONE),
- CPR1Size(Mips::AFL_REG_NONE), CPR2Size(Mips::AFL_REG_NONE),
- ISAExtension(Mips::AFL_EXT_NONE), ASESet(0), OddSPReg(false),
- Is32BitABI(false), FpABI(FpABIKind::ANY) {}
+ MipsABIFlagsSection() = default;
uint16_t getVersionValue() { return (uint16_t)Version; }
uint8_t getISALevelValue() { return (uint8_t)ISALevel; }
@@ -80,6 +77,7 @@ public:
FpABI = Value;
Is32BitABI = IsABI32Bit;
}
+
StringRef getFpABIString(FpABIKind Value);
template <class PredicateLibrary>
@@ -195,6 +193,7 @@ public:
};
MCStreamer &operator<<(MCStreamer &OS, MipsABIFlagsSection &ABIFlagsSection);
-}
-#endif
+} // end namespace llvm
+
+#endif // LLVM_LIB_TARGET_MIPS_MCTARGETDESC_MIPSABIFLAGSSECTION_H
diff --git a/lib/Target/Mips/MCTargetDesc/MipsAsmBackend.cpp b/lib/Target/Mips/MCTargetDesc/MipsAsmBackend.cpp
index 38b11f78e36d..3304449efb91 100644
--- a/lib/Target/Mips/MCTargetDesc/MipsAsmBackend.cpp
+++ b/lib/Target/Mips/MCTargetDesc/MipsAsmBackend.cpp
@@ -34,7 +34,7 @@ using namespace llvm;
// Prepare value for the target space for it
static unsigned adjustFixupValue(const MCFixup &Fixup, uint64_t Value,
- MCContext *Ctx = nullptr) {
+ MCContext &Ctx) {
unsigned Kind = Fixup.getKind();
@@ -74,8 +74,8 @@ static unsigned adjustFixupValue(const MCFixup &Fixup, uint64_t Value,
// address range. Forcing a signed division because Value can be negative.
Value = (int64_t)Value / 4;
// We now check if Value can be encoded as a 16-bit signed immediate.
- if (!isInt<16>(Value) && Ctx) {
- Ctx->reportError(Fixup.getLoc(), "out of range PC16 fixup");
+ if (!isInt<16>(Value)) {
+ Ctx.reportError(Fixup.getLoc(), "out of range PC16 fixup");
return 0;
}
break;
@@ -84,8 +84,8 @@ static unsigned adjustFixupValue(const MCFixup &Fixup, uint64_t Value,
// Forcing a signed division because Value can be negative.
Value = (int64_t)Value / 4;
// We now check if Value can be encoded as a 19-bit signed immediate.
- if (!isInt<19>(Value) && Ctx) {
- Ctx->reportError(Fixup.getLoc(), "out of range PC19 fixup");
+ if (!isInt<19>(Value)) {
+ Ctx.reportError(Fixup.getLoc(), "out of range PC19 fixup");
return 0;
}
break;
@@ -121,8 +121,8 @@ static unsigned adjustFixupValue(const MCFixup &Fixup, uint64_t Value,
// Forcing a signed division because Value can be negative.
Value = (int64_t) Value / 2;
// We now check if Value can be encoded as a 7-bit signed immediate.
- if (!isInt<7>(Value) && Ctx) {
- Ctx->reportError(Fixup.getLoc(), "out of range PC7 fixup");
+ if (!isInt<7>(Value)) {
+ Ctx.reportError(Fixup.getLoc(), "out of range PC7 fixup");
return 0;
}
break;
@@ -131,8 +131,8 @@ static unsigned adjustFixupValue(const MCFixup &Fixup, uint64_t Value,
// Forcing a signed division because Value can be negative.
Value = (int64_t) Value / 2;
// We now check if Value can be encoded as a 10-bit signed immediate.
- if (!isInt<10>(Value) && Ctx) {
- Ctx->reportError(Fixup.getLoc(), "out of range PC10 fixup");
+ if (!isInt<10>(Value)) {
+ Ctx.reportError(Fixup.getLoc(), "out of range PC10 fixup");
return 0;
}
break;
@@ -141,8 +141,8 @@ static unsigned adjustFixupValue(const MCFixup &Fixup, uint64_t Value,
// Forcing a signed division because Value can be negative.
Value = (int64_t)Value / 2;
// We now check if Value can be encoded as a 16-bit signed immediate.
- if (!isInt<16>(Value) && Ctx) {
- Ctx->reportError(Fixup.getLoc(), "out of range PC16 fixup");
+ if (!isInt<16>(Value)) {
+ Ctx.reportError(Fixup.getLoc(), "out of range PC16 fixup");
return 0;
}
break;
@@ -150,21 +150,21 @@ static unsigned adjustFixupValue(const MCFixup &Fixup, uint64_t Value,
// Forcing a signed division because Value can be negative.
Value = (int64_t)Value / 8;
// We now check if Value can be encoded as an 18-bit signed immediate.
- if (!isInt<18>(Value) && Ctx) {
- Ctx->reportError(Fixup.getLoc(), "out of range PC18 fixup");
+ if (!isInt<18>(Value)) {
+ Ctx.reportError(Fixup.getLoc(), "out of range PC18 fixup");
return 0;
}
break;
case Mips::fixup_MICROMIPS_PC18_S3:
// Check alignment.
- if ((Value & 7) && Ctx) {
- Ctx->reportError(Fixup.getLoc(), "out of range PC18 fixup");
+ if ((Value & 7)) {
+ Ctx.reportError(Fixup.getLoc(), "out of range PC18 fixup");
}
// Forcing a signed division because Value can be negative.
Value = (int64_t)Value / 8;
// We now check if Value can be encoded as an 18-bit signed immediate.
- if (!isInt<18>(Value) && Ctx) {
- Ctx->reportError(Fixup.getLoc(), "out of range PC18 fixup");
+ if (!isInt<18>(Value)) {
+ Ctx.reportError(Fixup.getLoc(), "out of range PC18 fixup");
return 0;
}
break;
@@ -172,8 +172,8 @@ static unsigned adjustFixupValue(const MCFixup &Fixup, uint64_t Value,
// Forcing a signed division because Value can be negative.
Value = (int64_t) Value / 4;
// We now check if Value can be encoded as a 21-bit signed immediate.
- if (!isInt<21>(Value) && Ctx) {
- Ctx->reportError(Fixup.getLoc(), "out of range PC21 fixup");
+ if (!isInt<21>(Value)) {
+ Ctx.reportError(Fixup.getLoc(), "out of range PC21 fixup");
return 0;
}
break;
@@ -181,8 +181,8 @@ static unsigned adjustFixupValue(const MCFixup &Fixup, uint64_t Value,
// Forcing a signed division because Value can be negative.
Value = (int64_t) Value / 4;
// We now check if Value can be encoded as a 26-bit signed immediate.
- if (!isInt<26>(Value) && Ctx) {
- Ctx->reportError(Fixup.getLoc(), "out of range PC26 fixup");
+ if (!isInt<26>(Value)) {
+ Ctx.reportError(Fixup.getLoc(), "out of range PC26 fixup");
return 0;
}
break;
@@ -190,8 +190,8 @@ static unsigned adjustFixupValue(const MCFixup &Fixup, uint64_t Value,
// Forcing a signed division because Value can be negative.
Value = (int64_t)Value / 2;
// We now check if Value can be encoded as a 26-bit signed immediate.
- if (!isInt<26>(Value) && Ctx) {
- Ctx->reportFatalError(Fixup.getLoc(), "out of range PC26 fixup");
+ if (!isInt<26>(Value)) {
+ Ctx.reportFatalError(Fixup.getLoc(), "out of range PC26 fixup");
return 0;
}
break;
@@ -199,8 +199,8 @@ static unsigned adjustFixupValue(const MCFixup &Fixup, uint64_t Value,
// Forcing a signed division because Value can be negative.
Value = (int64_t)Value / 2;
// We now check if Value can be encoded as a 21-bit signed immediate.
- if (!isInt<21>(Value) && Ctx) {
- Ctx->reportError(Fixup.getLoc(), "out of range PC21 fixup");
+ if (!isInt<21>(Value)) {
+ Ctx.reportError(Fixup.getLoc(), "out of range PC21 fixup");
return 0;
}
break;
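
Every PC-relative case above now follows the same unconditional shape: scale the value by the branch granularity, check it against the field's signed width, and report through Ctx on failure. Distilled into one illustrative template (assuming offsets already aligned to the granularity, so shifting matches the signed division):

    #include "llvm/Support/MathExtras.h"

    // Bits = signed field width; Shift = granularity (2 for /4, 1 for /2,
    // 3 for /8).
    template <unsigned Bits, unsigned Shift>
    static bool pcRelValueFits(int64_t Value) {
      return llvm::isInt<Bits>(Value >> Shift);
    }
    // pcRelValueFits<16, 2>(Off) mirrors the PC16 check; <21, 2> PC21;
    // <18, 3> PC18.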
@@ -236,10 +236,10 @@ static unsigned calculateMMLEIndex(unsigned i) {
/// data fragment, at the offset specified by the fixup and following the
/// fixup kind as appropriate.
void MipsAsmBackend::applyFixup(const MCFixup &Fixup, char *Data,
- unsigned DataSize, uint64_t Value,
- bool IsPCRel) const {
+ unsigned DataSize, uint64_t Value, bool IsPCRel,
+ MCContext &Ctx) const {
MCFixupKind Kind = Fixup.getKind();
- Value = adjustFixupValue(Fixup, Value);
+ Value = adjustFixupValue(Fixup, Value, Ctx);
if (!Value)
return; // Doesn't change encoding.
@@ -471,24 +471,6 @@ bool MipsAsmBackend::writeNopData(uint64_t Count, MCObjectWriter *OW) const {
return true;
}
-/// processFixupValue - Target hook to process the literal value of a fixup
-/// if necessary.
-void MipsAsmBackend::processFixupValue(const MCAssembler &Asm,
- const MCAsmLayout &Layout,
- const MCFixup &Fixup,
- const MCFragment *DF,
- const MCValue &Target,
- uint64_t &Value,
- bool &IsResolved) {
- // At this point we'll ignore the value returned by adjustFixupValue as
- // we are only checking if the fixup can be applied correctly. We have
- // access to MCContext from here which allows us to report a fatal error
- // with *possibly* a source code location.
- // The caller will also ignore any changes we make to Value
- // (recordRelocation() overwrites it with it's own calculation).
- (void)adjustFixupValue(Fixup, Value, &Asm.getContext());
-}
-
// MCAsmBackend
MCAsmBackend *llvm::createMipsAsmBackendEL32(const Target &T,
const MCRegisterInfo &MRI,
diff --git a/lib/Target/Mips/MCTargetDesc/MipsAsmBackend.h b/lib/Target/Mips/MCTargetDesc/MipsAsmBackend.h
index f260cfa566c9..4b3cc6e21f4c 100644
--- a/lib/Target/Mips/MCTargetDesc/MipsAsmBackend.h
+++ b/lib/Target/Mips/MCTargetDesc/MipsAsmBackend.h
@@ -39,7 +39,7 @@ public:
MCObjectWriter *createObjectWriter(raw_pwrite_stream &OS) const override;
void applyFixup(const MCFixup &Fixup, char *Data, unsigned DataSize,
- uint64_t Value, bool IsPCRel) const override;
+ uint64_t Value, bool IsPCRel, MCContext &Ctx) const override;
Optional<MCFixupKind> getFixupKind(StringRef Name) const override;
const MCFixupKindInfo &getFixupKindInfo(MCFixupKind Kind) const override;
@@ -82,11 +82,6 @@ public:
bool writeNopData(uint64_t Count, MCObjectWriter *OW) const override;
- void processFixupValue(const MCAssembler &Asm, const MCAsmLayout &Layout,
- const MCFixup &Fixup, const MCFragment *DF,
- const MCValue &Target, uint64_t &Value,
- bool &IsResolved) override;
-
}; // class MipsAsmBackend
} // namespace
diff --git a/lib/Target/Mips/MCTargetDesc/MipsELFObjectWriter.cpp b/lib/Target/Mips/MCTargetDesc/MipsELFObjectWriter.cpp
index b2efd726da53..324fd3c6fe14 100644
--- a/lib/Target/Mips/MCTargetDesc/MipsELFObjectWriter.cpp
+++ b/lib/Target/Mips/MCTargetDesc/MipsELFObjectWriter.cpp
@@ -7,33 +7,38 @@
//
//===----------------------------------------------------------------------===//
-#include <algorithm>
-#include <list>
-#include "MCTargetDesc/MipsBaseInfo.h"
#include "MCTargetDesc/MipsFixupKinds.h"
-#include "MCTargetDesc/MipsMCExpr.h"
#include "MCTargetDesc/MipsMCTargetDesc.h"
#include "llvm/ADT/STLExtras.h"
-#include "llvm/MC/MCAssembler.h"
#include "llvm/MC/MCELFObjectWriter.h"
-#include "llvm/MC/MCExpr.h"
-#include "llvm/MC/MCSection.h"
+#include "llvm/MC/MCFixup.h"
#include "llvm/MC/MCSymbolELF.h"
-#include "llvm/MC/MCValue.h"
+#include "llvm/Support/Casting.h"
+#include "llvm/Support/Compiler.h"
#include "llvm/Support/Debug.h"
+#include "llvm/Support/ELF.h"
#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/MathExtras.h"
+#include "llvm/Support/raw_ostream.h"
+#include <algorithm>
+#include <cassert>
+#include <cstdint>
+#include <iterator>
+#include <list>
+#include <utility>
#define DEBUG_TYPE "mips-elf-object-writer"
using namespace llvm;
namespace {
+
/// Holds additional information needed by the relocation ordering algorithm.
struct MipsRelocationEntry {
const ELFRelocationEntry R; ///< The relocation.
- bool Matched; ///< Is this relocation part of a match.
+ bool Matched = false; ///< Is this relocation part of a match.
- MipsRelocationEntry(const ELFRelocationEntry &R) : R(R), Matched(false) {}
+ MipsRelocationEntry(const ELFRelocationEntry &R) : R(R) {}
void print(raw_ostream &Out) const {
R.print(Out);
@@ -53,23 +58,33 @@ public:
MipsELFObjectWriter(bool _is64Bit, uint8_t OSABI, bool _isN64,
bool IsLittleEndian);
- ~MipsELFObjectWriter() override;
+ ~MipsELFObjectWriter() override = default;
unsigned getRelocType(MCContext &Ctx, const MCValue &Target,
const MCFixup &Fixup, bool IsPCRel) const override;
bool needsRelocateWithSymbol(const MCSymbol &Sym,
unsigned Type) const override;
- virtual void sortRelocs(const MCAssembler &Asm,
- std::vector<ELFRelocationEntry> &Relocs) override;
+ void sortRelocs(const MCAssembler &Asm,
+ std::vector<ELFRelocationEntry> &Relocs) override;
+};
+
+/// The possible results of the Predicate function used by find_best.
+enum FindBestPredicateResult {
+ FindBest_NoMatch = 0, ///< The current element is not a match.
+ FindBest_Match, ///< The current element is a match but better ones are
+ /// possible.
+ FindBest_PerfectMatch, ///< The current element is an unbeatable match.
};
+} // end anonymous namespace
+
/// Copy elements in the range [First, Last) to d1 when the predicate is true or
/// d2 when the predicate is false. This is essentially both std::copy_if and
/// std::remove_copy_if combined into a single pass.
template <class InputIt, class OutputIt1, class OutputIt2, class UnaryPredicate>
-std::pair<OutputIt1, OutputIt2> copy_if_else(InputIt First, InputIt Last,
- OutputIt1 d1, OutputIt2 d2,
- UnaryPredicate Predicate) {
+static std::pair<OutputIt1, OutputIt2> copy_if_else(InputIt First, InputIt Last,
+ OutputIt1 d1, OutputIt2 d2,
+ UnaryPredicate Predicate) {
for (InputIt I = First; I != Last; ++I) {
if (Predicate(*I)) {
*d1 = *I;
@@ -83,14 +98,6 @@ std::pair<OutputIt1, OutputIt2> copy_if_else(InputIt First, InputIt Last,
return std::make_pair(d1, d2);
}
-/// The possible results of the Predicate function used by find_best.
-enum FindBestPredicateResult {
- FindBest_NoMatch = 0, ///< The current element is not a match.
- FindBest_Match, ///< The current element is a match but better ones are
- /// possible.
- FindBest_PerfectMatch, ///< The current element is an unbeatable match.
-};
-
/// Find the best match in the range [First, Last).
///
/// An element matches when Predicate(X) returns FindBest_Match or
@@ -101,8 +108,8 @@ enum FindBestPredicateResult {
/// This is similar to std::find_if but finds the best of multiple possible
/// matches.
template <class InputIt, class UnaryPredicate, class Comparator>
-InputIt find_best(InputIt First, InputIt Last, UnaryPredicate Predicate,
- Comparator BetterThan) {
+static InputIt find_best(InputIt First, InputIt Last, UnaryPredicate Predicate,
+ Comparator BetterThan) {
InputIt Best = Last;
for (InputIt I = First; I != Last; ++I) {
@@ -202,16 +209,12 @@ static void dumpRelocs(const char *Prefix, const Container &Relocs) {
}
#endif
-} // end anonymous namespace
-
MipsELFObjectWriter::MipsELFObjectWriter(bool _is64Bit, uint8_t OSABI,
bool _isN64, bool IsLittleEndian)
: MCELFObjectTargetWriter(_is64Bit, OSABI, ELF::EM_MIPS,
/*HasRelocationAddend*/ _isN64,
/*IsN64*/ _isN64) {}
-MipsELFObjectWriter::~MipsELFObjectWriter() {}
-
unsigned MipsELFObjectWriter::getRelocType(MCContext &Ctx,
const MCValue &Target,
const MCFixup &Fixup,
@@ -419,7 +422,6 @@ unsigned MipsELFObjectWriter::getRelocType(MCContext &Ctx,
/// always match using the expressions from the source.
void MipsELFObjectWriter::sortRelocs(const MCAssembler &Asm,
std::vector<ELFRelocationEntry> &Relocs) {
-
// We do not need to sort the relocation table for RELA relocations which
// N32/N64 uses as the relocation addend contains the value we require,
// rather than it being split across a pair of relocations.
@@ -524,6 +526,8 @@ bool MipsELFObjectWriter::needsRelocateWithSymbol(const MCSymbol &Sym,
case ELF::R_MIPS_GOT16:
case ELF::R_MIPS16_GOT16:
case ELF::R_MICROMIPS_GOT16:
+ case ELF::R_MIPS_HIGHER:
+ case ELF::R_MIPS_HIGHEST:
case ELF::R_MIPS_HI16:
case ELF::R_MIPS16_HI16:
case ELF::R_MICROMIPS_HI16:
@@ -567,8 +571,6 @@ bool MipsELFObjectWriter::needsRelocateWithSymbol(const MCSymbol &Sym,
case ELF::R_MIPS_INSERT_A:
case ELF::R_MIPS_INSERT_B:
case ELF::R_MIPS_DELETE:
- case ELF::R_MIPS_HIGHER:
- case ELF::R_MIPS_HIGHEST:
case ELF::R_MIPS_CALL_HI16:
case ELF::R_MIPS_CALL_LO16:
case ELF::R_MIPS_SCN_DISP:
diff --git a/lib/Target/Mips/MCTargetDesc/MipsELFStreamer.cpp b/lib/Target/Mips/MCTargetDesc/MipsELFStreamer.cpp
index e7d687e89a8a..ae3278322311 100644
--- a/lib/Target/Mips/MCTargetDesc/MipsELFStreamer.cpp
+++ b/lib/Target/Mips/MCTargetDesc/MipsELFStreamer.cpp
@@ -8,15 +8,19 @@
//===----------------------------------------------------------------------===//
#include "MipsELFStreamer.h"
+#include "MipsOptionRecord.h"
#include "MipsTargetStreamer.h"
+#include "llvm/MC/MCAssembler.h"
+#include "llvm/MC/MCContext.h"
#include "llvm/MC/MCInst.h"
#include "llvm/MC/MCSymbolELF.h"
+#include "llvm/Support/Casting.h"
#include "llvm/Support/ELF.h"
using namespace llvm;
void MipsELFStreamer::EmitInstruction(const MCInst &Inst,
- const MCSubtargetInfo &STI) {
+ const MCSubtargetInfo &STI, bool) {
MCELFStreamer::EmitInstruction(Inst, STI);
MCContext &Context = getContext();
@@ -51,7 +55,7 @@ void MipsELFStreamer::createPendingLabelRelocs() {
Labels.clear();
}
-void MipsELFStreamer::EmitLabel(MCSymbol *Symbol) {
+void MipsELFStreamer::EmitLabel(MCSymbol *Symbol, SMLoc Loc) {
MCELFStreamer::EmitLabel(Symbol);
Labels.push_back(Symbol);
}
diff --git a/lib/Target/Mips/MCTargetDesc/MipsELFStreamer.h b/lib/Target/Mips/MCTargetDesc/MipsELFStreamer.h
index a241cdebdcc8..f5eda112817e 100644
--- a/lib/Target/Mips/MCTargetDesc/MipsELFStreamer.h
+++ b/lib/Target/Mips/MCTargetDesc/MipsELFStreamer.h
@@ -1,4 +1,4 @@
-//===-------- MipsELFStreamer.h - ELF Object Output -----------------------===//
+//===- MipsELFStreamer.h - ELF Object Output --------------------*- C++ -*-===//
//
// The LLVM Compiler Infrastructure
//
@@ -21,6 +21,7 @@
#include <memory>
namespace llvm {
+
class MCAsmBackend;
class MCCodeEmitter;
class MCContext;
@@ -31,12 +32,10 @@ class MipsELFStreamer : public MCELFStreamer {
MipsRegInfoRecord *RegInfoRecord;
SmallVector<MCSymbol*, 4> Labels;
-
public:
MipsELFStreamer(MCContext &Context, MCAsmBackend &MAB, raw_pwrite_stream &OS,
MCCodeEmitter *Emitter)
: MCELFStreamer(Context, MAB, OS, Emitter) {
-
RegInfoRecord = new MipsRegInfoRecord(this, Context);
MipsOptionRecords.push_back(
std::unique_ptr<MipsRegInfoRecord>(RegInfoRecord));
@@ -46,12 +45,13 @@ public:
/// \p Inst is actually emitted. For example, we can inspect the operands and
/// gather sufficient information that allows us to reason about the register
/// usage for the translation unit.
- void EmitInstruction(const MCInst &Inst, const MCSubtargetInfo &STI) override;
+ void EmitInstruction(const MCInst &Inst, const MCSubtargetInfo &STI,
+ bool = false) override;
/// Overriding this function allows us to record all labels that should be
/// marked as microMIPS. Based on this data marking is done in
/// EmitInstruction.
- void EmitLabel(MCSymbol *Symbol) override;
+ void EmitLabel(MCSymbol *Symbol, SMLoc Loc = SMLoc()) override;
/// Overriding this function allows us to dismiss all labels that are
/// candidates for marking as microMIPS when .section directive is processed.
@@ -72,5 +72,6 @@ public:
MCELFStreamer *createMipsELFStreamer(MCContext &Context, MCAsmBackend &MAB,
raw_pwrite_stream &OS,
MCCodeEmitter *Emitter, bool RelaxAll);
-} // namespace llvm.
-#endif
+} // end namespace llvm
+
+#endif // LLVM_LIB_TARGET_MIPS_MCTARGETDESC_MIPSELFSTREAMER_H
diff --git a/lib/Target/Mips/MCTargetDesc/MipsMCAsmInfo.cpp b/lib/Target/Mips/MCTargetDesc/MipsMCAsmInfo.cpp
index a44a35f49e5f..ebe3c5784888 100644
--- a/lib/Target/Mips/MCTargetDesc/MipsMCAsmInfo.cpp
+++ b/lib/Target/Mips/MCTargetDesc/MipsMCAsmInfo.cpp
@@ -19,9 +19,7 @@ using namespace llvm;
void MipsMCAsmInfo::anchor() { }
MipsMCAsmInfo::MipsMCAsmInfo(const Triple &TheTriple) {
- if ((TheTriple.getArch() == Triple::mips) ||
- (TheTriple.getArch() == Triple::mips64))
- IsLittleEndian = false;
+ IsLittleEndian = TheTriple.isLittleEndian();
if ((TheTriple.getArch() == Triple::mips64el) ||
(TheTriple.getArch() == Triple::mips64)) {
diff --git a/lib/Target/Mips/MCTargetDesc/MipsMCCodeEmitter.cpp b/lib/Target/Mips/MCTargetDesc/MipsMCCodeEmitter.cpp
index 0614316d5ac7..5685f0426e9b 100644
--- a/lib/Target/Mips/MCTargetDesc/MipsMCCodeEmitter.cpp
+++ b/lib/Target/Mips/MCTargetDesc/MipsMCCodeEmitter.cpp
@@ -10,22 +10,29 @@
// This file implements the MipsMCCodeEmitter class.
//
//===----------------------------------------------------------------------===//
-//
-#include "MipsMCCodeEmitter.h"
#include "MCTargetDesc/MipsFixupKinds.h"
#include "MCTargetDesc/MipsMCExpr.h"
#include "MCTargetDesc/MipsMCTargetDesc.h"
+#include "MipsMCCodeEmitter.h"
#include "llvm/ADT/APFloat.h"
+#include "llvm/ADT/APInt.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/MC/MCContext.h"
#include "llvm/MC/MCExpr.h"
#include "llvm/MC/MCFixup.h"
#include "llvm/MC/MCInst.h"
+#include "llvm/MC/MCInstrDesc.h"
#include "llvm/MC/MCInstrInfo.h"
#include "llvm/MC/MCRegisterInfo.h"
#include "llvm/MC/MCSubtargetInfo.h"
+#include "llvm/Support/Casting.h"
+#include "llvm/Support/ErrorHandling.h"
#include "llvm/Support/raw_ostream.h"
+#include <cassert>
+#include <cstdint>
+
+using namespace llvm;
#define DEBUG_TYPE "mccodeemitter"
@@ -34,6 +41,7 @@
#undef GET_INSTRMAP_INFO
namespace llvm {
+
MCCodeEmitter *createMipsMCCodeEmitterEB(const MCInstrInfo &MCII,
const MCRegisterInfo &MRI,
MCContext &Ctx) {
@@ -45,12 +53,12 @@ MCCodeEmitter *createMipsMCCodeEmitterEL(const MCInstrInfo &MCII,
MCContext &Ctx) {
return new MipsMCCodeEmitter(MCII, Ctx, true);
}
-} // End of namespace llvm.
+
+} // end namespace llvm
// If the D<shift> instruction has a shift amount that is greater
// than 31 (checked in calling routine), lower it to a D<shift>32 instruction
static void LowerLargeShift(MCInst& Inst) {
-
assert(Inst.getNumOperands() == 3 && "Invalid no. of operands for shift!");
assert(Inst.getOperand(2).isImm());
@@ -103,24 +111,25 @@ static void LowerDins(MCInst& InstIn) {
assert(InstIn.getOperand(3).isImm());
int64_t size = InstIn.getOperand(3).getImm();
- if (size <= 32) {
- if (pos < 32) // DINS, do nothing
- return;
+ assert((pos + size) <= 64 &&
+ "DINS cannot have position plus size over 64");
+ if (pos < 32) {
+ if ((pos + size) > 0 && (pos + size) <= 32)
+ return; // DINS, do nothing
+ else if ((pos + size) > 32) {
+ // DINSM
+ InstIn.getOperand(3).setImm(size - 32);
+ InstIn.setOpcode(Mips::DINSM);
+ }
+ } else if ((pos + size) > 32 && (pos + size) <= 64) {
// DINSU
InstIn.getOperand(2).setImm(pos - 32);
InstIn.setOpcode(Mips::DINSU);
- return;
}
- // DINSM
- assert(pos < 32 && "DINS cannot have both size and pos > 32");
- InstIn.getOperand(3).setImm(size - 32);
- InstIn.setOpcode(Mips::DINSM);
- return;
}
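
The three-way split LowerDins now performs, restated as a small decision function (names invented; the real code also rewrites the pos/size operands as shown above):

    #include <cassert>

    enum class DinsKind { DINS, DINSM, DINSU };

    // pos < 32, field ends in the low word       -> DINS
    // pos < 32, field crosses into the high word -> DINSM (encodes size-32)
    // pos >= 32, field entirely in the high word -> DINSU (encodes pos-32)
    static DinsKind classifyDins(int Pos, int Size) {
      assert(Pos >= 0 && Size > 0 && Pos + Size <= 64);
      if (Pos < 32)
        return (Pos + Size <= 32) ? DinsKind::DINS : DinsKind::DINSM;
      return DinsKind::DINSU;
    }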
// Fix a bad compact branch encoding for beqc/bnec.
void MipsMCCodeEmitter::LowerCompactBranch(MCInst& Inst) const {
-
// Encoding may be illegal !(rs < rt), but this situation is
// easily fixed.
unsigned RegOp0 = Inst.getOperand(0).getReg();
@@ -146,7 +155,6 @@ void MipsMCCodeEmitter::LowerCompactBranch(MCInst& Inst) const {
Inst.getOperand(0).setReg(RegOp1);
Inst.getOperand(1).setReg(RegOp0);
-
}
bool MipsMCCodeEmitter::isMicroMips(const MCSubtargetInfo &STI) const {
@@ -186,7 +194,6 @@ encodeInstruction(const MCInst &MI, raw_ostream &OS,
SmallVectorImpl<MCFixup> &Fixups,
const MCSubtargetInfo &STI) const
{
-
// Non-pseudo instructions that get changed for direct object
// only based on operand values.
// If this list of instructions get much longer we will move
@@ -272,7 +279,6 @@ unsigned MipsMCCodeEmitter::
getBranchTargetOpValue(const MCInst &MI, unsigned OpNo,
SmallVectorImpl<MCFixup> &Fixups,
const MCSubtargetInfo &STI) const {
-
const MCOperand &MO = MI.getOperand(OpNo);
// If the destination is an immediate, divide by 4.
@@ -295,7 +301,6 @@ unsigned MipsMCCodeEmitter::
getBranchTargetOpValue1SImm16(const MCInst &MI, unsigned OpNo,
SmallVectorImpl<MCFixup> &Fixups,
const MCSubtargetInfo &STI) const {
-
const MCOperand &MO = MI.getOperand(OpNo);
// If the destination is an immediate, divide by 2.
@@ -318,7 +323,6 @@ unsigned MipsMCCodeEmitter::
getBranchTargetOpValueMMR6(const MCInst &MI, unsigned OpNo,
SmallVectorImpl<MCFixup> &Fixups,
const MCSubtargetInfo &STI) const {
-
const MCOperand &MO = MI.getOperand(OpNo);
// If the destination is an immediate, divide by 2.
@@ -342,7 +346,6 @@ unsigned MipsMCCodeEmitter::
getBranchTargetOpValueLsl2MMR6(const MCInst &MI, unsigned OpNo,
SmallVectorImpl<MCFixup> &Fixups,
const MCSubtargetInfo &STI) const {
-
const MCOperand &MO = MI.getOperand(OpNo);
// If the destination is an immediate, divide by 4.
@@ -366,7 +369,6 @@ unsigned MipsMCCodeEmitter::
getBranchTarget7OpValueMM(const MCInst &MI, unsigned OpNo,
SmallVectorImpl<MCFixup> &Fixups,
const MCSubtargetInfo &STI) const {
-
const MCOperand &MO = MI.getOperand(OpNo);
// If the destination is an immediate, divide by 2.
@@ -388,7 +390,6 @@ unsigned MipsMCCodeEmitter::
getBranchTargetOpValueMMPC10(const MCInst &MI, unsigned OpNo,
SmallVectorImpl<MCFixup> &Fixups,
const MCSubtargetInfo &STI) const {
-
const MCOperand &MO = MI.getOperand(OpNo);
// If the destination is an immediate, divide by 2.
@@ -410,7 +411,6 @@ unsigned MipsMCCodeEmitter::
getBranchTargetOpValueMM(const MCInst &MI, unsigned OpNo,
SmallVectorImpl<MCFixup> &Fixups,
const MCSubtargetInfo &STI) const {
-
const MCOperand &MO = MI.getOperand(OpNo);
// If the destination is an immediate, divide by 2.
@@ -433,7 +433,6 @@ unsigned MipsMCCodeEmitter::
getBranchTarget21OpValue(const MCInst &MI, unsigned OpNo,
SmallVectorImpl<MCFixup> &Fixups,
const MCSubtargetInfo &STI) const {
-
const MCOperand &MO = MI.getOperand(OpNo);
// If the destination is an immediate, divide by 4.
@@ -456,7 +455,6 @@ unsigned MipsMCCodeEmitter::
getBranchTarget21OpValueMM(const MCInst &MI, unsigned OpNo,
SmallVectorImpl<MCFixup> &Fixups,
const MCSubtargetInfo &STI) const {
-
const MCOperand &MO = MI.getOperand(OpNo);
// If the destination is an immediate, divide by 4.
@@ -479,7 +477,6 @@ unsigned MipsMCCodeEmitter::
getBranchTarget26OpValue(const MCInst &MI, unsigned OpNo,
SmallVectorImpl<MCFixup> &Fixups,
const MCSubtargetInfo &STI) const {
-
const MCOperand &MO = MI.getOperand(OpNo);
// If the destination is an immediate, divide by 4.
@@ -501,7 +498,6 @@ getBranchTarget26OpValue(const MCInst &MI, unsigned OpNo,
unsigned MipsMCCodeEmitter::getBranchTarget26OpValueMM(
const MCInst &MI, unsigned OpNo, SmallVectorImpl<MCFixup> &Fixups,
const MCSubtargetInfo &STI) const {
-
const MCOperand &MO = MI.getOperand(OpNo);
// If the destination is an immediate, divide by 2.
@@ -525,7 +521,6 @@ unsigned MipsMCCodeEmitter::
getJumpOffset16OpValue(const MCInst &MI, unsigned OpNo,
SmallVectorImpl<MCFixup> &Fixups,
const MCSubtargetInfo &STI) const {
-
const MCOperand &MO = MI.getOperand(OpNo);
if (MO.isImm()) return MO.getImm();
@@ -544,7 +539,6 @@ unsigned MipsMCCodeEmitter::
getJumpTargetOpValue(const MCInst &MI, unsigned OpNo,
SmallVectorImpl<MCFixup> &Fixups,
const MCSubtargetInfo &STI) const {
-
const MCOperand &MO = MI.getOperand(OpNo);
// If the destination is an immediate, divide by 4.
if (MO.isImm()) return MO.getImm()>>2;
@@ -562,7 +556,6 @@ unsigned MipsMCCodeEmitter::
getJumpTargetOpValueMM(const MCInst &MI, unsigned OpNo,
SmallVectorImpl<MCFixup> &Fixups,
const MCSubtargetInfo &STI) const {
-
const MCOperand &MO = MI.getOperand(OpNo);
// If the destination is an immediate, divide by 2.
if (MO.isImm()) return MO.getImm() >> 1;
@@ -580,7 +573,6 @@ unsigned MipsMCCodeEmitter::
getUImm5Lsl2Encoding(const MCInst &MI, unsigned OpNo,
SmallVectorImpl<MCFixup> &Fixups,
const MCSubtargetInfo &STI) const {
-
const MCOperand &MO = MI.getOperand(OpNo);
if (MO.isImm()) {
// The immediate is encoded as 'immediate << 2'.
@@ -599,7 +591,6 @@ unsigned MipsMCCodeEmitter::
getSImm3Lsa2Value(const MCInst &MI, unsigned OpNo,
SmallVectorImpl<MCFixup> &Fixups,
const MCSubtargetInfo &STI) const {
-
const MCOperand &MO = MI.getOperand(OpNo);
if (MO.isImm()) {
int Value = MO.getImm();
@@ -613,7 +604,6 @@ unsigned MipsMCCodeEmitter::
getUImm6Lsl2Encoding(const MCInst &MI, unsigned OpNo,
SmallVectorImpl<MCFixup> &Fixups,
const MCSubtargetInfo &STI) const {
-
const MCOperand &MO = MI.getOperand(OpNo);
if (MO.isImm()) {
unsigned Value = MO.getImm();
@@ -627,7 +617,6 @@ unsigned MipsMCCodeEmitter::
getSImm9AddiuspValue(const MCInst &MI, unsigned OpNo,
SmallVectorImpl<MCFixup> &Fixups,
const MCSubtargetInfo &STI) const {
-
const MCOperand &MO = MI.getOperand(OpNo);
if (MO.isImm()) {
unsigned Binary = (MO.getImm() >> 2) & 0x0000ffff;
@@ -711,7 +700,7 @@ getExprOpValue(const MCExpr *Expr, SmallVectorImpl<MCFixup> &Fixups,
case MipsMCExpr::MEK_GPREL:
FixupKind = Mips::fixup_Mips_GPREL16;
break;
- case MipsMCExpr::MEK_LO: {
+ case MipsMCExpr::MEK_LO:
// Check for %lo(%neg(%gp_rel(X)))
if (MipsExpr->isGpOff()) {
FixupKind = Mips::fixup_Mips_GPOFF_LO;
@@ -720,7 +709,6 @@ getExprOpValue(const MCExpr *Expr, SmallVectorImpl<MCFixup> &Fixups,
FixupKind = isMicroMips(STI) ? Mips::fixup_MICROMIPS_LO16
: Mips::fixup_Mips_LO16;
break;
- }
case MipsMCExpr::MEK_HIGHEST:
FixupKind = Mips::fixup_Mips_HIGHEST;
break;
diff --git a/lib/Target/Mips/MCTargetDesc/MipsMCCodeEmitter.h b/lib/Target/Mips/MCTargetDesc/MipsMCCodeEmitter.h
index 2d041dcbf040..d12d3195521a 100644
--- a/lib/Target/Mips/MCTargetDesc/MipsMCCodeEmitter.h
+++ b/lib/Target/Mips/MCTargetDesc/MipsMCCodeEmitter.h
@@ -1,4 +1,4 @@
-//===-- MipsMCCodeEmitter.h - Convert Mips Code to Machine Code -----------===//
+//===- MipsMCCodeEmitter.h - Convert Mips Code to Machine Code --*- C++ -*-===//
//
// The LLVM Compiler Infrastructure
//
@@ -10,29 +10,25 @@
// This file defines the MipsMCCodeEmitter class.
//
//===----------------------------------------------------------------------===//
-//
#ifndef LLVM_LIB_TARGET_MIPS_MCTARGETDESC_MIPSMCCODEEMITTER_H
#define LLVM_LIB_TARGET_MIPS_MCTARGETDESC_MIPSMCCODEEMITTER_H
#include "llvm/MC/MCCodeEmitter.h"
-#include "llvm/Support/DataTypes.h"
-
-using namespace llvm;
+#include <cstdint>
namespace llvm {
+
class MCContext;
class MCExpr;
+class MCFixup;
class MCInst;
class MCInstrInfo;
-class MCFixup;
class MCOperand;
class MCSubtargetInfo;
class raw_ostream;
class MipsMCCodeEmitter : public MCCodeEmitter {
- MipsMCCodeEmitter(const MipsMCCodeEmitter &) = delete;
- void operator=(const MipsMCCodeEmitter &) = delete;
const MCInstrInfo &MCII;
MCContext &Ctx;
bool IsLittleEndian;
@@ -43,8 +39,9 @@ class MipsMCCodeEmitter : public MCCodeEmitter {
public:
MipsMCCodeEmitter(const MCInstrInfo &mcii, MCContext &Ctx_, bool IsLittle)
: MCII(mcii), Ctx(Ctx_), IsLittleEndian(IsLittle) {}
-
- ~MipsMCCodeEmitter() override {}
+ MipsMCCodeEmitter(const MipsMCCodeEmitter &) = delete;
+ MipsMCCodeEmitter &operator=(const MipsMCCodeEmitter &) = delete;
+ ~MipsMCCodeEmitter() override = default;
void EmitByte(unsigned char C, raw_ostream &OS) const;
@@ -270,9 +267,11 @@ public:
unsigned getRegisterListOpValue16(const MCInst &MI, unsigned OpNo,
SmallVectorImpl<MCFixup> &Fixups,
const MCSubtargetInfo &STI) const;
- private:
+
+private:
void LowerCompactBranch(MCInst& Inst) const;
-}; // class MipsMCCodeEmitter
-} // namespace llvm.
+};
+
+} // end namespace llvm
-#endif
+#endif // LLVM_LIB_TARGET_MIPS_MCTARGETDESC_MIPSMCCODEEMITTER_H
diff --git a/lib/Target/Mips/MCTargetDesc/MipsMCExpr.cpp b/lib/Target/Mips/MCTargetDesc/MipsMCExpr.cpp
index 082bb87fcb8a..be04480044d4 100644
--- a/lib/Target/Mips/MCTargetDesc/MipsMCExpr.cpp
+++ b/lib/Target/Mips/MCTargetDesc/MipsMCExpr.cpp
@@ -11,9 +11,15 @@
#include "llvm/MC/MCAsmInfo.h"
#include "llvm/MC/MCAssembler.h"
#include "llvm/MC/MCContext.h"
-#include "llvm/MC/MCObjectStreamer.h"
#include "llvm/MC/MCSymbolELF.h"
+#include "llvm/MC/MCStreamer.h"
+#include "llvm/MC/MCValue.h"
+#include "llvm/Support/Casting.h"
#include "llvm/Support/ELF.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/MathExtras.h"
+#include "llvm/Support/raw_ostream.h"
+#include <cstdint>
using namespace llvm;
diff --git a/lib/Target/Mips/MCTargetDesc/MipsMCExpr.h b/lib/Target/Mips/MCTargetDesc/MipsMCExpr.h
index d1a4334ec640..495d525ccff4 100644
--- a/lib/Target/Mips/MCTargetDesc/MipsMCExpr.h
+++ b/lib/Target/Mips/MCTargetDesc/MipsMCExpr.h
@@ -1,4 +1,4 @@
-//===-- MipsMCExpr.h - Mips specific MC expression classes ------*- C++ -*-===//
+//===- MipsMCExpr.h - Mips specific MC expression classes -------*- C++ -*-===//
//
// The LLVM Compiler Infrastructure
//
@@ -70,6 +70,7 @@ public:
bool evaluateAsRelocatableImpl(MCValue &Res, const MCAsmLayout *Layout,
const MCFixup *Fixup) const override;
void visitUsedExpr(MCStreamer &Streamer) const override;
+
MCFragment *findAssociatedFragment() const override {
return getSubExpr()->findAssociatedFragment();
}
@@ -86,6 +87,7 @@ public:
return isGpOff(Kind);
}
};
+
} // end namespace llvm
-#endif
+#endif // LLVM_LIB_TARGET_MIPS_MCTARGETDESC_MIPSMCEXPR_H
diff --git a/lib/Target/Mips/MCTargetDesc/MipsNaClELFStreamer.cpp b/lib/Target/Mips/MCTargetDesc/MipsNaClELFStreamer.cpp
index aef9bd3a8e2a..9266f0e216d1 100644
--- a/lib/Target/Mips/MCTargetDesc/MipsNaClELFStreamer.cpp
+++ b/lib/Target/Mips/MCTargetDesc/MipsNaClELFStreamer.cpp
@@ -20,7 +20,11 @@
#include "Mips.h"
#include "MipsELFStreamer.h"
#include "MipsMCNaCl.h"
+#include "llvm/MC/MCAssembler.h"
#include "llvm/MC/MCELFStreamer.h"
+#include "llvm/MC/MCInst.h"
+#include "llvm/Support/ErrorHandling.h"
+#include <cassert>
using namespace llvm;
@@ -38,14 +42,14 @@ class MipsNaClELFStreamer : public MipsELFStreamer {
public:
MipsNaClELFStreamer(MCContext &Context, MCAsmBackend &TAB,
raw_pwrite_stream &OS, MCCodeEmitter *Emitter)
- : MipsELFStreamer(Context, TAB, OS, Emitter), PendingCall(false) {}
+ : MipsELFStreamer(Context, TAB, OS, Emitter) {}
- ~MipsNaClELFStreamer() override {}
+ ~MipsNaClELFStreamer() override = default;
private:
// Whether we started the sandboxing sequence for calls. Calls are bundled
// with branch delays and aligned to the bundle end.
- bool PendingCall;
+ bool PendingCall = false;
bool isIndirectJump(const MCInst &MI) {
if (MI.getOpcode() == Mips::JALR) {
@@ -135,8 +139,8 @@ private:
public:
/// This function is the one used to emit instruction data into the ELF
/// streamer. We override it to mask dangerous instructions.
- void EmitInstruction(const MCInst &Inst,
- const MCSubtargetInfo &STI) override {
+ void EmitInstruction(const MCInst &Inst, const MCSubtargetInfo &STI,
+ bool) override {
// Sandbox indirect jumps.
if (isIndirectJump(Inst)) {
if (PendingCall)
@@ -265,4 +269,4 @@ MCELFStreamer *createMipsNaClELFStreamer(MCContext &Context, MCAsmBackend &TAB,
return S;
}
-}
+} // end namespace llvm
diff --git a/lib/Target/Mips/MCTargetDesc/MipsOptionRecord.cpp b/lib/Target/Mips/MCTargetDesc/MipsOptionRecord.cpp
index 24b602810d6e..74d5e4cc9841 100644
--- a/lib/Target/Mips/MCTargetDesc/MipsOptionRecord.cpp
+++ b/lib/Target/Mips/MCTargetDesc/MipsOptionRecord.cpp
@@ -1,4 +1,4 @@
-//===-- MipsOptionRecord.cpp - Abstraction for storing information --------===//
+//===- MipsOptionRecord.cpp - Abstraction for storing information ---------===//
//
// The LLVM Compiler Infrastructure
//
@@ -7,10 +7,16 @@
//
//===----------------------------------------------------------------------===//
-#include "MipsOptionRecord.h"
+#include "MipsABIInfo.h"
#include "MipsELFStreamer.h"
+#include "MipsOptionRecord.h"
#include "MipsTargetStreamer.h"
+#include "llvm/MC/MCAssembler.h"
+#include "llvm/MC/MCContext.h"
+#include "llvm/MC/MCRegisterInfo.h"
#include "llvm/MC/MCSectionELF.h"
+#include "llvm/Support/ELF.h"
+#include <cassert>
using namespace llvm;
diff --git a/lib/Target/Mips/MCTargetDesc/MipsTargetStreamer.cpp b/lib/Target/Mips/MCTargetDesc/MipsTargetStreamer.cpp
index 7f79eb400f59..2d4083b27ed1 100644
--- a/lib/Target/Mips/MCTargetDesc/MipsTargetStreamer.cpp
+++ b/lib/Target/Mips/MCTargetDesc/MipsTargetStreamer.cpp
@@ -11,6 +11,7 @@
//
//===----------------------------------------------------------------------===//
+#include "MCTargetDesc/MipsABIInfo.h"
#include "MipsTargetStreamer.h"
#include "InstPrinter/MipsInstPrinter.h"
#include "MipsELFStreamer.h"
@@ -685,6 +686,17 @@ MipsTargetELFStreamer::MipsTargetELFStreamer(MCStreamer &S,
// issues as well.
unsigned EFlags = MCA.getELFHeaderEFlags();
+ // FIXME: Fix a dependency issue by instantiating the ABI object to some
+ // default based off the triple. The triple doesn't describe the target
+ // fully, but any external user of the API that uses the MCTargetStreamer
+ // would otherwise crash on assertion failure.
+
+ ABI = MipsABIInfo(
+ STI.getTargetTriple().getArch() == Triple::ArchType::mipsel ||
+ STI.getTargetTriple().getArch() == Triple::ArchType::mips
+ ? MipsABIInfo::O32()
+ : MipsABIInfo::N64());
+
// Architecture
if (Features[Mips::FeatureMips64r6])
EFlags |= ELF::EF_MIPS_ARCH_64R6;
@@ -721,23 +733,18 @@ MipsTargetELFStreamer::MipsTargetELFStreamer(MCStreamer &S,
if (Features[Mips::FeatureNaN2008])
EFlags |= ELF::EF_MIPS_NAN2008;
- // -mabicalls and -mplt are not implemented but we should act as if they were
- // given.
- EFlags |= ELF::EF_MIPS_CPIC;
-
MCA.setELFHeaderEFlags(EFlags);
}
void MipsTargetELFStreamer::emitLabel(MCSymbol *S) {
auto *Symbol = cast<MCSymbolELF>(S);
- if (!isMicroMipsEnabled())
- return;
getStreamer().getAssembler().registerSymbol(*Symbol);
uint8_t Type = Symbol->getType();
if (Type != ELF::STT_FUNC)
return;
- Symbol->setOther(ELF::STO_MIPS_MICROMIPS);
+ if (isMicroMipsEnabled())
+ Symbol->setOther(ELF::STO_MIPS_MICROMIPS);
}
void MipsTargetELFStreamer::finish() {
@@ -795,10 +802,13 @@ void MipsTargetELFStreamer::finish() {
} else if (Features[Mips::FeatureMips64r2] || Features[Mips::FeatureMips64])
EFlags |= ELF::EF_MIPS_32BITMODE;
- // If we've set the cpic eflag and we're n64, go ahead and set the pic
- // one as well.
- if (EFlags & ELF::EF_MIPS_CPIC && getABI().IsN64())
- EFlags |= ELF::EF_MIPS_PIC;
+ // -mplt is not implemented but we should act as if it was
+ // given.
+ if (!Features[Mips::FeatureNoABICalls])
+ EFlags |= ELF::EF_MIPS_CPIC;
+
+ if (Pic)
+ EFlags |= ELF::EF_MIPS_PIC | ELF::EF_MIPS_CPIC;
MCA.setELFHeaderEFlags(EFlags);
@@ -904,10 +914,10 @@ void MipsTargetELFStreamer::emitDirectiveEnd(StringRef Name) {
const MCExpr *Size = MCBinaryExpr::createSub(
MCSymbolRefExpr::create(CurPCSym, MCSymbolRefExpr::VK_None, Context),
ExprRef, Context);
- int64_t AbsSize;
- if (!Size->evaluateAsAbsolute(AbsSize, MCA))
- llvm_unreachable("Function size must be evaluatable as absolute");
- Size = MCConstantExpr::create(AbsSize, Context);
+
+ // The ELFObjectWriter can determine the absolute size as it has access to
+ // the layout information of the assembly file, so a size expression rather
+ // than an absolute value is ok here.
static_cast<MCSymbolELF *>(Sym)->setSize(Size);
}
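Taken together, the two e_flags hunks above implement a small decision table; a sketch of that logic in isolation (helper name hypothetical, flag values from the MIPS ELF ABI):

#include <cstdint>

constexpr uint32_t EF_MIPS_PIC  = 0x00000002; // position-independent code
constexpr uint32_t EF_MIPS_CPIC = 0x00000004; // calls PIC-compatible code

// -mplt is not implemented, so CPIC is assumed unless abicalls are
// explicitly disabled; PIC additionally implies CPIC.
uint32_t applyPicFlags(uint32_t EFlags, bool NoABICalls, bool Pic) {
  if (!NoABICalls)
    EFlags |= EF_MIPS_CPIC;
  if (Pic)
    EFlags |= EF_MIPS_PIC | EF_MIPS_CPIC;
  return EFlags;
}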
diff --git a/lib/Target/Mips/MicroMips64r6InstrInfo.td b/lib/Target/Mips/MicroMips64r6InstrInfo.td
index 05aad515da46..6b7f39e9dd79 100644
--- a/lib/Target/Mips/MicroMips64r6InstrInfo.td
+++ b/lib/Target/Mips/MicroMips64r6InstrInfo.td
@@ -475,29 +475,11 @@ defm : MaterializeImms<i64, ZERO_64, DADDIU_MM64R6, LUi64, ORi64>;
//
//===----------------------------------------------------------------------===//
-def : MipsPat<(MipsLo tglobaladdr:$in),
- (DADDIU_MM64R6 ZERO_64, tglobaladdr:$in)>, ISA_MICROMIPS64R6;
-def : MipsPat<(MipsLo tblockaddress:$in),
- (DADDIU_MM64R6 ZERO_64, tblockaddress:$in)>, ISA_MICROMIPS64R6;
-def : MipsPat<(MipsLo tjumptable:$in),
- (DADDIU_MM64R6 ZERO_64, tjumptable:$in)>, ISA_MICROMIPS64R6;
-def : MipsPat<(MipsLo tconstpool:$in),
- (DADDIU_MM64R6 ZERO_64, tconstpool:$in)>, ISA_MICROMIPS64R6;
-def : MipsPat<(MipsLo tglobaltlsaddr:$in),
- (DADDIU_MM64R6 ZERO_64, tglobaltlsaddr:$in)>, ISA_MICROMIPS64R6;
-def : MipsPat<(MipsLo texternalsym:$in),
- (DADDIU_MM64R6 ZERO_64, texternalsym:$in)>, ISA_MICROMIPS64R6;
-
-def : MipsPat<(add GPR64:$hi, (MipsLo tglobaladdr:$lo)),
- (DADDIU_MM64R6 GPR64:$hi, tglobaladdr:$lo)>, ISA_MICROMIPS64R6;
-def : MipsPat<(add GPR64:$hi, (MipsLo tblockaddress:$lo)),
- (DADDIU_MM64R6 GPR64:$hi, tblockaddress:$lo)>, ISA_MICROMIPS64R6;
-def : MipsPat<(add GPR64:$hi, (MipsLo tjumptable:$lo)),
- (DADDIU_MM64R6 GPR64:$hi, tjumptable:$lo)>, ISA_MICROMIPS64R6;
-def : MipsPat<(add GPR64:$hi, (MipsLo tconstpool:$lo)),
- (DADDIU_MM64R6 GPR64:$hi, tconstpool:$lo)>, ISA_MICROMIPS64R6;
-def : MipsPat<(add GPR64:$hi, (MipsLo tglobaltlsaddr:$lo)),
- (DADDIU_MM64R6 GPR64:$hi, tglobaltlsaddr:$lo)>, ISA_MICROMIPS64R6;
+defm : MipsHiLoRelocs<LUi64, DADDIU_MM64R6, ZERO_64, GPR64Opnd>, SYM_32,
+ ISA_MICROMIPS64R6;
+
+defm : MipsHighestHigherHiLoRelocs<LUi64, DADDIU_MM64R6>, SYM_64,
+ ISA_MICROMIPS64R6;
def : MipsPat<(addc GPR64:$lhs, GPR64:$rhs),
(DADDU_MM64R6 GPR64:$lhs, GPR64:$rhs)>, ISA_MICROMIPS64R6;
diff --git a/lib/Target/Mips/MicroMipsInstrInfo.td b/lib/Target/Mips/MicroMipsInstrInfo.td
index c0de9e7390a4..ee554bc7f69a 100644
--- a/lib/Target/Mips/MicroMipsInstrInfo.td
+++ b/lib/Target/Mips/MicroMipsInstrInfo.td
@@ -1136,12 +1136,6 @@ let Predicates = [InMicroMips] in {
def : MipsInstAlias<
"sgtu $rs, $rt",
(SLTu_MM GPR32Opnd:$rs, GPR32Opnd:$rt, GPR32Opnd:$rs), 0>;
- def : MipsInstAlias<"slt $rs, $rt, $imm",
- (SLTi_MM GPR32Opnd:$rs, GPR32Opnd:$rt,
- simm32_relaxed:$imm), 0>;
- def : MipsInstAlias<"sltu $rs, $rt, $imm",
- (SLTiu_MM GPR32Opnd:$rs, GPR32Opnd:$rt,
- simm32_relaxed:$imm), 0>;
def : MipsInstAlias<"sll $rd, $rt, $rs",
(SLLV_MM GPR32Opnd:$rd, GPR32Opnd:$rt, GPR32Opnd:$rs), 0>;
def : MipsInstAlias<"sra $rd, $rt, $rs",
@@ -1163,18 +1157,21 @@ let Predicates = [InMicroMips] in {
def : MipsInstAlias<"rotr $rt, $imm",
(ROTR_MM GPR32Opnd:$rt, GPR32Opnd:$rt, uimm5:$imm), 0>;
def : MipsInstAlias<"syscall", (SYSCALL_MM 0), 1>;
- def : MipsInstAlias<"and $rs, $rt, $imm",
- (ANDi_MM GPR32Opnd:$rs, GPR32Opnd:$rt, simm16:$imm), 0>;
- def : MipsInstAlias<"and $rs, $imm",
- (ANDi_MM GPR32Opnd:$rs, GPR32Opnd:$rs, simm16:$imm), 0>;
- def : MipsInstAlias<"or $rs, $rt, $imm",
- (ORi_MM GPR32Opnd:$rs, GPR32Opnd:$rt, uimm16:$imm), 0>;
- def : MipsInstAlias<"or $rs, $imm",
- (ORi_MM GPR32Opnd:$rs, GPR32Opnd:$rs, uimm16:$imm), 0>;
- def : MipsInstAlias<"xor $rs, $rt, $imm",
- (XORi_MM GPR32Opnd:$rs, GPR32Opnd:$rt, uimm16:$imm), 0>;
- def : MipsInstAlias<"xor $rs, $imm",
- (XORi_MM GPR32Opnd:$rs, GPR32Opnd:$rs, uimm16:$imm), 0>;
+
+ defm : OneOrTwoOperandMacroImmediateAlias<"add", ADDi_MM>;
+
+ defm : OneOrTwoOperandMacroImmediateAlias<"addu", ADDiu_MM>;
+
+ defm : OneOrTwoOperandMacroImmediateAlias<"and", ANDi_MM>;
+
+ defm : OneOrTwoOperandMacroImmediateAlias<"or", ORi_MM>;
+
+ defm : OneOrTwoOperandMacroImmediateAlias<"xor", XORi_MM>;
+
+ defm : OneOrTwoOperandMacroImmediateAlias<"slt", SLTi_MM>;
+
+ defm : OneOrTwoOperandMacroImmediateAlias<"sltu", SLTiu_MM>;
+
def : MipsInstAlias<"not $rt, $rs",
(NOR_MM GPR32Opnd:$rt, GPR32Opnd:$rs, ZERO), 0>;
def : MipsInstAlias<"not $rt",
diff --git a/lib/Target/Mips/Mips.td b/lib/Target/Mips/Mips.td
index 670272d47e95..9615bc38bfce 100644
--- a/lib/Target/Mips/Mips.td
+++ b/lib/Target/Mips/Mips.td
@@ -156,6 +156,8 @@ def FeatureMips64r6 : SubtargetFeature<"mips64r6", "MipsArchVersion",
"Mips64r6 ISA Support [experimental]",
[FeatureMips32r6, FeatureMips64r5,
FeatureNaN2008]>;
+def FeatureSym32 : SubtargetFeature<"sym32", "HasSym32", "true",
+ "Symbols are 32 bit on Mips64">;
def FeatureMips16 : SubtargetFeature<"mips16", "InMips16Mode", "true",
"Mips16 mode">;
diff --git a/lib/Target/Mips/Mips16HardFloat.cpp b/lib/Target/Mips/Mips16HardFloat.cpp
index 191006d6463c..a71b161b24cc 100644
--- a/lib/Target/Mips/Mips16HardFloat.cpp
+++ b/lib/Target/Mips/Mips16HardFloat.cpp
@@ -405,7 +405,7 @@ static bool fixupFPReturnAndCall(Function &F, Module *M,
"__mips16_ret_dc"
};
const char *Name = Helper[RV];
- AttributeSet A;
+ AttributeList A;
Value *Params[] = {RVal};
Modified = true;
//
@@ -414,13 +414,13 @@ static bool fixupFPReturnAndCall(Function &F, Module *M,
// during call setup, the proper call lowering to the helper
// functions will take place.
//
- A = A.addAttribute(C, AttributeSet::FunctionIndex,
+ A = A.addAttribute(C, AttributeList::FunctionIndex,
"__Mips16RetHelper");
- A = A.addAttribute(C, AttributeSet::FunctionIndex,
+ A = A.addAttribute(C, AttributeList::FunctionIndex,
Attribute::ReadNone);
- A = A.addAttribute(C, AttributeSet::FunctionIndex,
+ A = A.addAttribute(C, AttributeList::FunctionIndex,
Attribute::NoInline);
- Value *F = (M->getOrInsertFunction(Name, A, MyVoid, T, nullptr));
+ Value *F = (M->getOrInsertFunction(Name, A, MyVoid, T));
CallInst::Create(F, Params, "", &I);
} else if (const CallInst *CI = dyn_cast<CallInst>(&I)) {
FunctionType *FT = CI->getFunctionType();
@@ -490,15 +490,15 @@ static void createFPFnStub(Function *F, Module *M, FPParamVariant PV,
// remove the use-soft-float attribute
//
static void removeUseSoftFloat(Function &F) {
- AttributeSet A;
+ AttributeList A;
DEBUG(errs() << "removing -use-soft-float\n");
- A = A.addAttribute(F.getContext(), AttributeSet::FunctionIndex,
+ A = A.addAttribute(F.getContext(), AttributeList::FunctionIndex,
"use-soft-float", "false");
- F.removeAttributes(AttributeSet::FunctionIndex, A);
+ F.removeAttributes(AttributeList::FunctionIndex, A);
if (F.hasFnAttribute("use-soft-float")) {
DEBUG(errs() << "still has -use-soft-float\n");
}
- F.addAttributes(AttributeSet::FunctionIndex, A);
+ F.addAttributes(AttributeList::FunctionIndex, A);
}
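The AttributeSet-to-AttributeList rename above is mechanical, but the resulting call shapes are easy to misread in diff form; a sketch using only the calls the hunks themselves exercise (the signatures are LLVM-version-dependent, so treat this as illustrative):

#include "llvm/IR/Attributes.h"
#include "llvm/IR/LLVMContext.h"
using namespace llvm;

// Hypothetical helper: builds the attributes the pass attaches to its
// float-return helper functions, post-rename. Note the index enum moved
// from AttributeSet::FunctionIndex to AttributeList::FunctionIndex.
static AttributeList buildRetHelperAttrs(LLVMContext &C) {
  AttributeList A;
  A = A.addAttribute(C, AttributeList::FunctionIndex, "__Mips16RetHelper");
  A = A.addAttribute(C, AttributeList::FunctionIndex, Attribute::ReadNone);
  A = A.addAttribute(C, AttributeList::FunctionIndex, Attribute::NoInline);
  return A;
}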
diff --git a/lib/Target/Mips/Mips16InstrInfo.td b/lib/Target/Mips/Mips16InstrInfo.td
index 021fb8678686..52bf690a8083 100644
--- a/lib/Target/Mips/Mips16InstrInfo.td
+++ b/lib/Target/Mips/Mips16InstrInfo.td
@@ -766,6 +766,7 @@ def JrRa16: FRR16_JALRC_RA_only_ins<0, 0, "jr", IIM16Alu> {
let hasDelaySlot = 1;
let isTerminator=1;
let isBarrier=1;
+ let isReturn=1;
}
def JrcRa16: FRR16_JALRC_RA_only_ins<1, 1, "jrc", IIM16Alu> {
@@ -773,6 +774,7 @@ def JrcRa16: FRR16_JALRC_RA_only_ins<1, 1, "jrc", IIM16Alu> {
let isIndirectBranch = 1;
let isTerminator=1;
let isBarrier=1;
+ let isReturn=1;
}
def JrcRx16: FRR16_JALRC_ins<1, 1, 0, "jrc", IIM16Alu> {
diff --git a/lib/Target/Mips/Mips32r6InstrInfo.td b/lib/Target/Mips/Mips32r6InstrInfo.td
index 1b4d73b79895..3272319ad50f 100644
--- a/lib/Target/Mips/Mips32r6InstrInfo.td
+++ b/lib/Target/Mips/Mips32r6InstrInfo.td
@@ -917,6 +917,12 @@ def : MipsInstAlias<"jrc $rs", (JIC GPR32Opnd:$rs, 0), 1>, ISA_MIPS32R6, GPR_32;
let AdditionalPredicates = [NotInMicroMips] in {
def : MipsInstAlias<"jalrc $rs", (JIALC GPR32Opnd:$rs, 0), 1>, ISA_MIPS32R6, GPR_32;
}
+
+def : MipsInstAlias<"div $rs, $rt", (DIV GPR32Opnd:$rs, GPR32Opnd:$rs,
+ GPR32Opnd:$rt)>, ISA_MIPS32R6;
+def : MipsInstAlias<"divu $rs, $rt", (DIVU GPR32Opnd:$rs, GPR32Opnd:$rs,
+ GPR32Opnd:$rt)>, ISA_MIPS32R6;
+
//===----------------------------------------------------------------------===//
//
// Patterns and Pseudo Instructions
diff --git a/lib/Target/Mips/Mips64InstrInfo.td b/lib/Target/Mips/Mips64InstrInfo.td
index 521e22fb7992..99025fe1341d 100644
--- a/lib/Target/Mips/Mips64InstrInfo.td
+++ b/lib/Target/Mips/Mips64InstrInfo.td
@@ -326,6 +326,14 @@ let AdditionalPredicates = [NotInMicroMips] in {
EXT_FM<5>, ISA_MIPS64R2;
}
+let isCodeGenOnly = 1, AdditionalPredicates = [NotInMicroMips] in {
+ def DEXT64_32 : InstSE<(outs GPR64Opnd:$rt),
+ (ins GPR32Opnd:$rs, uimm5_report_uimm6:$pos,
+ uimm5_plus1:$size),
+ "dext $rt, $rs, $pos, $size", [], II_EXT, FrmR, "dext">,
+ EXT_FM<3>, ISA_MIPS64R2;
+}
+
let isCodeGenOnly = 1, rs = 0, shamt = 0 in {
def DSLL64_32 : FR<0x00, 0x3c, (outs GPR64:$rd), (ins GPR32:$rt),
"dsll\t$rd, $rt, 32", [], II_DSLL>;
@@ -356,11 +364,11 @@ class Count1s<string opstr, RegisterOperand RO>:
let TwoOperandAliasConstraint = "$rd = $rs";
}
-class ExtsCins<string opstr, InstrItinClass itin,
- SDPatternOperator Op = null_frag>:
- InstSE<(outs GPR64Opnd:$rt), (ins GPR64Opnd:$rs, uimm5:$pos, uimm5:$lenm1),
- !strconcat(opstr, " $rt, $rs, $pos, $lenm1"),
- [(set GPR64Opnd:$rt, (Op GPR64Opnd:$rs, imm:$pos, imm:$lenm1))],
+class ExtsCins<string opstr, InstrItinClass itin, RegisterOperand RO,
+ PatFrag PosImm, SDPatternOperator Op = null_frag>:
+ InstSE<(outs RO:$rt), (ins RO:$rs, uimm5:$pos, uimm5:$lenm1),
+ !strconcat(opstr, "\t$rt, $rs, $pos, $lenm1"),
+ [(set RO:$rt, (Op RO:$rs, PosImm:$pos, imm:$lenm1))],
itin, FrmR, opstr> {
let TwoOperandAliasConstraint = "$rt = $rs";
}
@@ -424,13 +432,28 @@ def DMUL : ArithLogicR<"dmul", GPR64Opnd, 1, II_DMUL, mul>,
let Defs = [HI0, LO0, P0, P1, P2];
}
-// Extract a signed bit field /+32
-def EXTS : ExtsCins<"exts", II_EXT>, EXTS_FM<0x3a>, ASE_CNMIPS;
-def EXTS32: ExtsCins<"exts32", II_EXT>, EXTS_FM<0x3b>, ASE_CNMIPS;
-
-// Clear and insert a bit field /+32
-def CINS : ExtsCins<"cins", II_INS>, EXTS_FM<0x32>, ASE_CNMIPS;
-def CINS32: ExtsCins<"cins32", II_INS>, EXTS_FM<0x33>, ASE_CNMIPS;
+let AdditionalPredicates = [NotInMicroMips] in {
+ // Extract a signed bit field /+32
+ def EXTS : ExtsCins<"exts", II_EXT, GPR64Opnd, immZExt5>, EXTS_FM<0x3a>,
+ ASE_MIPS64_CNMIPS;
+ def EXTS32: ExtsCins<"exts32", II_EXT, GPR64Opnd, immZExt5Plus32>,
+ EXTS_FM<0x3b>, ASE_MIPS64_CNMIPS;
+
+ // Clear and insert a bit field /+32
+ def CINS : ExtsCins<"cins", II_INS, GPR64Opnd, immZExt5, MipsCIns>,
+ EXTS_FM<0x32>, ASE_MIPS64_CNMIPS;
+ def CINS32: ExtsCins<"cins32", II_INS, GPR64Opnd, immZExt5Plus32, MipsCIns>,
+ EXTS_FM<0x33>, ASE_MIPS64_CNMIPS;
+ let isCodeGenOnly = 1 in {
+ def CINS_i32 : ExtsCins<"cins", II_INS, GPR32Opnd, immZExt5, MipsCIns>,
+ EXTS_FM<0x32>, ASE_MIPS64_CNMIPS;
+ def CINS64_32 :InstSE<(outs GPR64Opnd:$rt),
+ (ins GPR32Opnd:$rs, uimm5:$pos, uimm5:$lenm1),
+ "cins\t$rt, $rs, $pos, $lenm1", [], II_INS, FrmR,
+ "cins">,
+ EXTS_FM<0x32>, ASE_MIPS64_CNMIPS;
+ }
+}
// Move to multiplier/product register
def MTM0 : MoveToLOHI<"mtm0", GPR64Opnd, [MPL0, P0, P1, P2]>, MTMR_FM<0x08>,
@@ -513,41 +536,87 @@ def : MipsPat<(i64 (extloadi16 addr:$src)), (LH64 addr:$src)>;
def : MipsPat<(i64 (extloadi32 addr:$src)), (LW64 addr:$src)>;
// hi/lo relocs
-def : MipsPat<(MipsHi tglobaladdr:$in), (LUi64 tglobaladdr:$in)>;
-def : MipsPat<(MipsHi tblockaddress:$in), (LUi64 tblockaddress:$in)>;
-def : MipsPat<(MipsHi tjumptable:$in), (LUi64 tjumptable:$in)>;
-def : MipsPat<(MipsHi tconstpool:$in), (LUi64 tconstpool:$in)>;
-def : MipsPat<(MipsHi tglobaltlsaddr:$in), (LUi64 tglobaltlsaddr:$in)>;
-def : MipsPat<(MipsHi texternalsym:$in), (LUi64 texternalsym:$in)>;
+let AdditionalPredicates = [NotInMicroMips] in
+defm : MipsHiLoRelocs<LUi64, DADDiu, ZERO_64, GPR64Opnd>, SYM_32;
+
+def : MipsPat<(MipsGotHi tglobaladdr:$in), (LUi64 tglobaladdr:$in)>;
+def : MipsPat<(MipsGotHi texternalsym:$in), (LUi64 texternalsym:$in)>;
+
+multiclass MipsHighestHigherHiLoRelocs<Instruction Lui, Instruction Daddiu> {
+ def : MipsPat<(MipsJmpLink (i64 texternalsym:$dst)),
+ (JAL texternalsym:$dst)>;
+ def : MipsPat<(MipsHighest (i64 tglobaladdr:$in)),
+ (Lui tglobaladdr:$in)>;
+ def : MipsPat<(MipsHighest (i64 tblockaddress:$in)),
+ (Lui tblockaddress:$in)>;
+ def : MipsPat<(MipsHighest (i64 tjumptable:$in)),
+ (Lui tjumptable:$in)>;
+ def : MipsPat<(MipsHighest (i64 tconstpool:$in)),
+ (Lui tconstpool:$in)>;
+ def : MipsPat<(MipsHighest (i64 tglobaltlsaddr:$in)),
+ (Lui tglobaltlsaddr:$in)>;
+ def : MipsPat<(MipsHighest (i64 texternalsym:$in)),
+ (Lui texternalsym:$in)>;
+
+ def : MipsPat<(MipsHigher (i64 tglobaladdr:$in)),
+ (Daddiu ZERO_64, tglobaladdr:$in)>;
+ def : MipsPat<(MipsHigher (i64 tblockaddress:$in)),
+ (Daddiu ZERO_64, tblockaddress:$in)>;
+ def : MipsPat<(MipsHigher (i64 tjumptable:$in)),
+ (Daddiu ZERO_64, tjumptable:$in)>;
+ def : MipsPat<(MipsHigher (i64 tconstpool:$in)),
+ (Daddiu ZERO_64, tconstpool:$in)>;
+ def : MipsPat<(MipsHigher (i64 tglobaltlsaddr:$in)),
+ (Daddiu ZERO_64, tglobaltlsaddr:$in)>;
+ def : MipsPat<(MipsHigher (i64 texternalsym:$in)),
+ (Daddiu ZERO_64, texternalsym:$in)>;
+
+ def : MipsPat<(add GPR64:$hi, (MipsHigher (i64 tglobaladdr:$lo))),
+ (Daddiu GPR64:$hi, tglobaladdr:$lo)>;
+ def : MipsPat<(add GPR64:$hi, (MipsHigher (i64 tblockaddress:$lo))),
+ (Daddiu GPR64:$hi, tblockaddress:$lo)>;
+ def : MipsPat<(add GPR64:$hi, (MipsHigher (i64 tjumptable:$lo))),
+ (Daddiu GPR64:$hi, tjumptable:$lo)>;
+ def : MipsPat<(add GPR64:$hi, (MipsHigher (i64 tconstpool:$lo))),
+ (Daddiu GPR64:$hi, tconstpool:$lo)>;
+ def : MipsPat<(add GPR64:$hi, (MipsHigher (i64 tglobaltlsaddr:$lo))),
+ (Daddiu GPR64:$hi, tglobaltlsaddr:$lo)>;
+
+ def : MipsPat<(add GPR64:$hi, (MipsHi (i64 tglobaladdr:$lo))),
+ (Daddiu GPR64:$hi, tglobaladdr:$lo)>;
+ def : MipsPat<(add GPR64:$hi, (MipsHi (i64 tblockaddress:$lo))),
+ (Daddiu GPR64:$hi, tblockaddress:$lo)>;
+ def : MipsPat<(add GPR64:$hi, (MipsHi (i64 tjumptable:$lo))),
+ (Daddiu GPR64:$hi, tjumptable:$lo)>;
+ def : MipsPat<(add GPR64:$hi, (MipsHi (i64 tconstpool:$lo))),
+ (Daddiu GPR64:$hi, tconstpool:$lo)>;
+ def : MipsPat<(add GPR64:$hi, (MipsHi (i64 tglobaltlsaddr:$lo))),
+ (Daddiu GPR64:$hi, tglobaltlsaddr:$lo)>;
+
+ def : MipsPat<(add GPR64:$hi, (MipsLo (i64 tglobaladdr:$lo))),
+ (Daddiu GPR64:$hi, tglobaladdr:$lo)>;
+ def : MipsPat<(add GPR64:$hi, (MipsLo (i64 tblockaddress:$lo))),
+ (Daddiu GPR64:$hi, tblockaddress:$lo)>;
+ def : MipsPat<(add GPR64:$hi, (MipsLo (i64 tjumptable:$lo))),
+ (Daddiu GPR64:$hi, tjumptable:$lo)>;
+ def : MipsPat<(add GPR64:$hi, (MipsLo (i64 tconstpool:$lo))),
+ (Daddiu GPR64:$hi, tconstpool:$lo)>;
+ def : MipsPat<(add GPR64:$hi, (MipsLo (i64 tglobaltlsaddr:$lo))),
+ (Daddiu GPR64:$hi, tglobaltlsaddr:$lo)>;
+
+}
+
+// highest/higher/hi/lo relocs
+let AdditionalPredicates = [NotInMicroMips] in
+defm : MipsHighestHigherHiLoRelocs<LUi64, DADDiu>, SYM_64;
+
+def : WrapperPat<tglobaladdr, DADDiu, GPR64>;
+def : WrapperPat<tconstpool, DADDiu, GPR64>;
+def : WrapperPat<texternalsym, DADDiu, GPR64>;
+def : WrapperPat<tblockaddress, DADDiu, GPR64>;
+def : WrapperPat<tjumptable, DADDiu, GPR64>;
+def : WrapperPat<tglobaltlsaddr, DADDiu, GPR64>;
-let AdditionalPredicates = [NotInMicroMips] in {
- def : MipsPat<(MipsLo tglobaladdr:$in), (DADDiu ZERO_64, tglobaladdr:$in)>;
- def : MipsPat<(MipsLo tblockaddress:$in),
- (DADDiu ZERO_64, tblockaddress:$in)>;
- def : MipsPat<(MipsLo tjumptable:$in), (DADDiu ZERO_64, tjumptable:$in)>;
- def : MipsPat<(MipsLo tconstpool:$in), (DADDiu ZERO_64, tconstpool:$in)>;
- def : MipsPat<(MipsLo tglobaltlsaddr:$in),
- (DADDiu ZERO_64, tglobaltlsaddr:$in)>;
- def : MipsPat<(MipsLo texternalsym:$in), (DADDiu ZERO_64, texternalsym:$in)>;
-
- def : MipsPat<(add GPR64:$hi, (MipsLo tglobaladdr:$lo)),
- (DADDiu GPR64:$hi, tglobaladdr:$lo)>;
- def : MipsPat<(add GPR64:$hi, (MipsLo tblockaddress:$lo)),
- (DADDiu GPR64:$hi, tblockaddress:$lo)>;
- def : MipsPat<(add GPR64:$hi, (MipsLo tjumptable:$lo)),
- (DADDiu GPR64:$hi, tjumptable:$lo)>;
- def : MipsPat<(add GPR64:$hi, (MipsLo tconstpool:$lo)),
- (DADDiu GPR64:$hi, tconstpool:$lo)>;
- def : MipsPat<(add GPR64:$hi, (MipsLo tglobaltlsaddr:$lo)),
- (DADDiu GPR64:$hi, tglobaltlsaddr:$lo)>;
-
- def : WrapperPat<tglobaladdr, DADDiu, GPR64>;
- def : WrapperPat<tconstpool, DADDiu, GPR64>;
- def : WrapperPat<texternalsym, DADDiu, GPR64>;
- def : WrapperPat<tblockaddress, DADDiu, GPR64>;
- def : WrapperPat<tjumptable, DADDiu, GPR64>;
- def : WrapperPat<tglobaltlsaddr, DADDiu, GPR64>;
-}
defm : BrcondPats<GPR64, BEQ64, BEQ, BNE64, SLT64, SLTu64, SLTi64, SLTiu64,
ZERO_64>;
@@ -600,6 +669,14 @@ def : MipsPat<(i64 (anyext GPR32:$src)),
def : MipsPat<(i64 (zext GPR32:$src)), (DSRL (DSLL64_32 GPR32:$src), 32)>;
def : MipsPat<(i64 (sext GPR32:$src)), (SLL64_32 GPR32:$src)>;
+let AdditionalPredicates = [NotInMicroMips] in {
+ def : MipsPat<(i64 (zext GPR32:$src)), (DEXT64_32 GPR32:$src, 0, 32)>,
+ ISA_MIPS64R2;
+ def : MipsPat<(i64 (zext (i32 (shl GPR32:$rt, immZExt5:$imm)))),
+ (CINS64_32 GPR32:$rt, imm:$imm, (immZExt5To31 imm:$imm))>,
+ ASE_MIPS64_CNMIPS;
+}
+
// Sign extend in register
def : MipsPat<(i64 (sext_inreg GPR64:$src, i32)),
(SLL64_64 GPR64:$src)>;
@@ -661,6 +738,15 @@ let AdditionalPredicates = [NotInMicroMips] in {
def : MipsInstAlias<"daddu $rs, $imm",
(DADDiu GPR64Opnd:$rs, GPR64Opnd:$rs, simm16_64:$imm),
0>, ISA_MIPS3;
+
+ defm : OneOrTwoOperandMacroImmediateAlias<"and", ANDi64, GPR64Opnd, imm64>,
+ GPR_64;
+
+ defm : OneOrTwoOperandMacroImmediateAlias<"or", ORi64, GPR64Opnd, imm64>,
+ GPR_64;
+
+ defm : OneOrTwoOperandMacroImmediateAlias<"xor", XORi64, GPR64Opnd, imm64>,
+ GPR_64;
}
def : MipsInstAlias<"dsll $rd, $rt, $rs",
(DSLLV GPR64Opnd:$rd, GPR64Opnd:$rt, GPR32Opnd:$rs), 0>,
@@ -741,21 +827,21 @@ def : MipsInstAlias<"bbit1 $rs, $p, $offset",
def : MipsInstAlias<"exts $rt, $rs, $pos, $lenm1",
(EXTS32 GPR64Opnd:$rt, GPR64Opnd:$rs,
uimm5_plus32_normalize:$pos, uimm5:$lenm1), 0>,
- ASE_CNMIPS;
+ ASE_MIPS64_CNMIPS;
def : MipsInstAlias<"exts $rt, $pos, $lenm1",
(EXTS32 GPR64Opnd:$rt, GPR64Opnd:$rt,
uimm5_plus32_normalize:$pos, uimm5:$lenm1), 0>,
- ASE_CNMIPS;
+ ASE_MIPS64_CNMIPS;
// cins with $pos 32-63 is converted to cins32 with $pos 0-31
def : MipsInstAlias<"cins $rt, $rs, $pos, $lenm1",
(CINS32 GPR64Opnd:$rt, GPR64Opnd:$rs,
uimm5_plus32_normalize:$pos, uimm5:$lenm1), 0>,
- ASE_CNMIPS;
+ ASE_MIPS64_CNMIPS;
def : MipsInstAlias<"cins $rt, $pos, $lenm1",
(CINS32 GPR64Opnd:$rt, GPR64Opnd:$rt,
uimm5_plus32_normalize:$pos, uimm5:$lenm1), 0>,
- ASE_CNMIPS;
+ ASE_MIPS64_CNMIPS;
//===----------------------------------------------------------------------===//
// Assembler Pseudo Instructions
@@ -770,3 +856,81 @@ def LoadAddrReg64 : MipsAsmPseudoInst<(outs GPR64Opnd:$rt), (ins mem:$addr),
"dla\t$rt, $addr">;
def LoadAddrImm64 : MipsAsmPseudoInst<(outs GPR64Opnd:$rt), (ins imm64:$imm64),
"dla\t$rt, $imm64">;
+
+def DMULImmMacro : MipsAsmPseudoInst<(outs), (ins GPR64Opnd:$rs, GPR64Opnd:$rt,
+ simm32_relaxed:$imm),
+ "dmul\t$rs, $rt, $imm">,
+ ISA_MIPS3_NOT_32R6_64R6;
+def DMULOMacro : MipsAsmPseudoInst<(outs), (ins GPR64Opnd:$rs, GPR64Opnd:$rt,
+ GPR64Opnd:$rd),
+ "dmulo\t$rs, $rt, $rd">,
+ ISA_MIPS3_NOT_32R6_64R6;
+def DMULOUMacro : MipsAsmPseudoInst<(outs), (ins GPR64Opnd:$rs, GPR64Opnd:$rt,
+ GPR64Opnd:$rd),
+ "dmulou\t$rs, $rt, $rd">,
+ ISA_MIPS3_NOT_32R6_64R6;
+
+def DMULMacro : MipsAsmPseudoInst<(outs), (ins GPR64Opnd:$rs, GPR64Opnd:$rt,
+ GPR64Opnd:$rd),
+ "dmul\t$rs, $rt, $rd"> {
+ let InsnPredicates = [HasMips3, NotMips64r6, NotCnMips];
+}
+
+let AdditionalPredicates = [NotInMicroMips] in {
+ def DSDivMacro : MipsAsmPseudoInst<(outs GPR64Opnd:$rd),
+ (ins GPR64Opnd:$rs, GPR64Opnd:$rt),
+ "ddiv\t$rd, $rs, $rt">,
+ ISA_MIPS3_NOT_32R6_64R6;
+ def DSDivIMacro : MipsAsmPseudoInst<(outs GPR64Opnd:$rd),
+ (ins GPR64Opnd:$rs, imm64:$imm),
+ "ddiv\t$rd, $rs, $imm">,
+ ISA_MIPS3_NOT_32R6_64R6;
+ def DUDivMacro : MipsAsmPseudoInst<(outs GPR64Opnd:$rd),
+ (ins GPR64Opnd:$rs, GPR64Opnd:$rt),
+ "ddivu\t$rd, $rs, $rt">,
+ ISA_MIPS3_NOT_32R6_64R6;
+ def DUDivIMacro : MipsAsmPseudoInst<(outs GPR64Opnd:$rd),
+ (ins GPR64Opnd:$rs, imm64:$imm),
+ "ddivu\t$rd, $rs, $imm">,
+ ISA_MIPS3_NOT_32R6_64R6;
+
+ // GAS expands 'div' and 'ddiv' differently when the destination
+ // register is $zero and the instruction is in the two operand
+ // form. 'ddiv' gets expanded, while 'div' is not expanded.
+
+ def : MipsInstAlias<"ddiv $rs, $rt", (DSDivMacro GPR64Opnd:$rs,
+ GPR64Opnd:$rs,
+ GPR64Opnd:$rt), 0>,
+ ISA_MIPS3_NOT_32R6_64R6;
+ def : MipsInstAlias<"ddiv $rd, $imm", (DSDivIMacro GPR64Opnd:$rd,
+ GPR64Opnd:$rd,
+ imm64:$imm), 0>,
+ ISA_MIPS3_NOT_32R6_64R6;
+
+ // GAS expands 'divu' and 'ddivu' differently when the destination
+ // register is $zero and the instruction is in the two operand
+ // form. 'ddivu' gets expanded, while 'divu' is not expanded.
+
+ def : MipsInstAlias<"ddivu $rt, $rs", (DUDivMacro GPR64Opnd:$rt,
+ GPR64Opnd:$rt,
+ GPR64Opnd:$rs), 0>,
+ ISA_MIPS3_NOT_32R6_64R6;
+ def : MipsInstAlias<"ddivu $rd, $imm", (DUDivIMacro GPR64Opnd:$rd,
+ GPR64Opnd:$rd,
+ imm64:$imm), 0>,
+ ISA_MIPS3_NOT_32R6_64R6;
+}
+
+def NORImm64 : NORIMM_DESC_BASE<GPR64Opnd, imm64>, GPR_64;
+def : MipsInstAlias<"nor\t$rs, $imm", (NORImm64 GPR64Opnd:$rs, GPR64Opnd:$rs,
+ imm64:$imm)>, GPR_64;
+def SLTImm64 : MipsAsmPseudoInst<(outs GPR64Opnd:$rs),
+ (ins GPR64Opnd:$rt, imm64:$imm),
+ "slt\t$rs, $rt, $imm">, GPR_64;
+def : MipsInstAlias<"slt\t$rs, $imm", (SLTImm64 GPR64Opnd:$rs, GPR64Opnd:$rs,
+ imm64:$imm)>, GPR_64;
+def SLTUImm64 : MipsAsmPseudoInst<(outs GPR64Opnd:$rs),
+ (ins GPR64Opnd:$rt, imm64:$imm),
+ "sltu\t$rs, $rt, $imm">, GPR_64;
+def : MipsInstAlias<"sltu\t$rs, $imm", (SLTUImm64 GPR64Opnd:$rs, GPR64Opnd:$rs,
+ imm64:$imm)>, GPR_64;
diff --git a/lib/Target/Mips/MipsAsmPrinter.cpp b/lib/Target/Mips/MipsAsmPrinter.cpp
index 04d6529a073d..2a9d96205eb9 100644
--- a/lib/Target/Mips/MipsAsmPrinter.cpp
+++ b/lib/Target/Mips/MipsAsmPrinter.cpp
@@ -39,6 +39,7 @@
#include "llvm/MC/MCELFStreamer.h"
#include "llvm/MC/MCExpr.h"
#include "llvm/MC/MCInst.h"
+#include "llvm/MC/MCInstBuilder.h"
#include "llvm/MC/MCSection.h"
#include "llvm/MC/MCSectionELF.h"
#include "llvm/MC/MCSymbolELF.h"
@@ -79,6 +80,9 @@ bool MipsAsmPrinter::runOnMachineFunction(MachineFunction &MF) {
NaClAlignIndirectJumpTargets(MF);
AsmPrinter::runOnMachineFunction(MF);
+
+ EmitXRayTable();
+
return true;
}
@@ -132,6 +136,7 @@ void MipsAsmPrinter::emitPseudoIndirectBranch(MCStreamer &OutStreamer,
void MipsAsmPrinter::EmitInstruction(const MachineInstr *MI) {
MipsTargetStreamer &TS = getTargetStreamer();
+ unsigned Opc = MI->getOpcode();
TS.forbidModuleDirective();
if (MI->isDebugValue()) {
@@ -143,20 +148,20 @@ void MipsAsmPrinter::EmitInstruction(const MachineInstr *MI) {
}
// If we just ended a constant pool, mark it as such.
- if (InConstantPool && MI->getOpcode() != Mips::CONSTPOOL_ENTRY) {
+ if (InConstantPool && Opc != Mips::CONSTPOOL_ENTRY) {
OutStreamer->EmitDataRegion(MCDR_DataRegionEnd);
InConstantPool = false;
}
- if (MI->getOpcode() == Mips::CONSTPOOL_ENTRY) {
+ if (Opc == Mips::CONSTPOOL_ENTRY) {
// CONSTPOOL_ENTRY - This instruction represents a floating-point
- //constant pool in the function. The first operand is the ID#
+ // constant pool in the function. The first operand is the ID#
// for this instruction, the second is the index into the
// MachineConstantPool that this is, the third is the size in
// bytes of this constant pool entry.
// The required alignment is specified on the basic block holding this MI.
//
unsigned LabelId = (unsigned)MI->getOperand(0).getImm();
- unsigned CPIdx = (unsigned)MI->getOperand(1).getIndex();
+ unsigned CPIdx = (unsigned)MI->getOperand(1).getIndex();
// If this is the first entry of the pool, mark it.
if (!InConstantPool) {
@@ -174,6 +179,17 @@ void MipsAsmPrinter::EmitInstruction(const MachineInstr *MI) {
return;
}
+ switch (Opc) {
+ case Mips::PATCHABLE_FUNCTION_ENTER:
+ LowerPATCHABLE_FUNCTION_ENTER(*MI);
+ return;
+ case Mips::PATCHABLE_FUNCTION_EXIT:
+ LowerPATCHABLE_FUNCTION_EXIT(*MI);
+ return;
+ case Mips::PATCHABLE_TAIL_CALL:
+ LowerPATCHABLE_TAIL_CALL(*MI);
+ return;
+ }
MachineBasicBlock::const_instr_iterator I = MI->getIterator();
MachineBasicBlock::const_instr_iterator E = MI->getParent()->instr_end();
@@ -574,6 +590,8 @@ void MipsAsmPrinter::printOperand(const MachineInstr *MI, int opNum,
case MipsII::MO_GOT: O << "%got("; break;
case MipsII::MO_ABS_HI: O << "%hi("; break;
case MipsII::MO_ABS_LO: O << "%lo("; break;
+ case MipsII::MO_HIGHER: O << "%higher("; break;
+ case MipsII::MO_HIGHEST: O << "%highest("; break;
case MipsII::MO_TLSGD: O << "%tlsgd("; break;
case MipsII::MO_GOTTPREL: O << "%gottprel("; break;
case MipsII::MO_TPREL_HI: O << "%tprel_hi("; break;
@@ -698,7 +716,7 @@ void MipsAsmPrinter::EmitStartOfAsmFile(Module &M) {
// Ideally it should test for properties of the ABI and not the ABI
// itself.
// For the moment, I'm only correcting enough to make MIPS-IV work.
- if (!isPositionIndependent() && !ABI.IsN64())
+ if (!isPositionIndependent() && STI.hasSym32())
TS.emitDirectiveOptionPic0();
}
@@ -1032,6 +1050,149 @@ void MipsAsmPrinter::EmitEndOfAsmFile(Module &M) {
OutStreamer->SwitchSection(OutContext.getObjectFileInfo()->getTextSection());
}
+void MipsAsmPrinter::EmitSled(const MachineInstr &MI, SledKind Kind) {
+ const uint8_t NoopsInSledCount = Subtarget->isGP64bit() ? 15 : 11;
+ // For mips32 we want to emit the following pattern:
+ //
+ // .Lxray_sled_N:
+ // ALIGN
+ // B .tmpN
+ // 11 NOP instructions (44 bytes)
+ // ADDIU T9, T9, 52
+ // .tmpN
+ //
+ // We need the 44 bytes (11 instructions) because at runtime, we'd
+ // be patching over the full 48 bytes (12 instructions) with the following
+ // pattern:
+ //
+ // ADDIU SP, SP, -8
+ // NOP
+ // SW RA, 4(SP)
+ // SW T9, 0(SP)
+ // LUI T9, %hi(__xray_FunctionEntry/Exit)
+ // ORI T9, T9, %lo(__xray_FunctionEntry/Exit)
+ // LUI T0, %hi(function_id)
+ // JALR T9
+ // ORI T0, T0, %lo(function_id)
+ // LW T9, 0(SP)
+ // LW RA, 4(SP)
+ // ADDIU SP, SP, 8
+ //
+ // We add 52 bytes to t9 because we want to adjust the function pointer to
+ // the actual start of the function, i.e. the address just after the nop
+ // sled. We do this because the gp displacement relocation is emitted at the
+ // start of the function, i.e. after the nop sled, and to correctly
+ // calculate the global offset table address, t9 must hold the address of
+ // the instruction containing the gp displacement relocation.
+ // FIXME: Is this correct for the static relocation model?
+ //
+ // For mips64 we want to emit the following pattern:
+ //
+ // .Lxray_sled_N:
+ // ALIGN
+ // B .tmpN
+ // 15 NOP instructions (60 bytes)
+ // .tmpN
+ //
+ // We need the 60 bytes (15 instructions) because at runtime, we'd
+ // be patching over the full 64 bytes (16 instructions) with the following
+ // pattern:
+ //
+ // DADDIU SP, SP, -16
+ // NOP
+ // SD RA, 8(SP)
+ // SD T9, 0(SP)
+ // LUI T9, %highest(__xray_FunctionEntry/Exit)
+ // ORI T9, T9, %higher(__xray_FunctionEntry/Exit)
+ // DSLL T9, T9, 16
+ // ORI T9, T9, %hi(__xray_FunctionEntry/Exit)
+ // DSLL T9, T9, 16
+ // ORI T9, T9, %lo(__xray_FunctionEntry/Exit)
+ // LUI T0, %hi(function_id)
+ // JALR T9
+ // ADDIU T0, T0, %lo(function_id)
+ // LD T9, 0(SP)
+ // LD RA, 8(SP)
+ // DADDIU SP, SP, 16
+ //
+ OutStreamer->EmitCodeAlignment(4);
+ auto CurSled = OutContext.createTempSymbol("xray_sled_", true);
+ OutStreamer->EmitLabel(CurSled);
+ auto Target = OutContext.createTempSymbol();
+
+ // Emit "B .tmpN" instruction, which jumps over the nop sled to the actual
+ // start of function
+ const MCExpr *TargetExpr = MCSymbolRefExpr::create(
+ Target, MCSymbolRefExpr::VariantKind::VK_None, OutContext);
+ EmitToStreamer(*OutStreamer, MCInstBuilder(Mips::BEQ)
+ .addReg(Mips::ZERO)
+ .addReg(Mips::ZERO)
+ .addExpr(TargetExpr));
+
+ for (int8_t I = 0; I < NoopsInSledCount; I++)
+ EmitToStreamer(*OutStreamer, MCInstBuilder(Mips::SLL)
+ .addReg(Mips::ZERO)
+ .addReg(Mips::ZERO)
+ .addImm(0));
+
+ OutStreamer->EmitLabel(Target);
+
+ if (!Subtarget->isGP64bit()) {
+ EmitToStreamer(*OutStreamer,
+ MCInstBuilder(Mips::ADDiu)
+ .addReg(Mips::T9)
+ .addReg(Mips::T9)
+ .addImm(0x34));
+ }
+
+ recordSled(CurSled, MI, Kind);
+}
+
+void MipsAsmPrinter::EmitXRayTable() {
+ if (Sleds.empty())
+ return;
+ if (Subtarget->isTargetELF()) {
+ auto PrevSection = OutStreamer->getCurrentSectionOnly();
+ auto Fn = MF->getFunction();
+ MCSection *Section;
+
+ if (Fn->hasComdat())
+ Section = OutContext.getELFSection("xray_instr_map", ELF::SHT_PROGBITS,
+ ELF::SHF_ALLOC | ELF::SHF_GROUP, 0,
+ Fn->getComdat()->getName());
+ else
+ Section =
+ OutContext.getELFSection("xray_instr_map", ELF::SHT_PROGBITS,
+ ELF::SHF_ALLOC, 0, CurrentFnSym->getName());
+
+ OutStreamer->SwitchSection(Section);
+ for (const auto &Sled : Sleds) {
+ OutStreamer->EmitSymbolValue(Sled.Sled, Subtarget->isGP64bit() ? 8 : 4);
+ OutStreamer->EmitSymbolValue(CurrentFnSym, Subtarget->isGP64bit() ? 8 : 4);
+ auto Kind = static_cast<uint8_t>(Sled.Kind);
+ OutStreamer->EmitBytes(
+ StringRef(reinterpret_cast<const char *>(&Kind), 1));
+ OutStreamer->EmitBytes(
+ StringRef(reinterpret_cast<const char *>(&Sled.AlwaysInstrument), 1));
+ OutStreamer->EmitZeros(Subtarget->isGP64bit() ? 14 : 6);
+ }
+ OutStreamer->SwitchSection(PrevSection);
+ }
+ Sleds.clear();
+}
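EmitXRayTable above writes fixed-width records: 8 + 8 + 1 + 1 + 14 = 32 bytes per entry on 64-bit subtargets, and 4 + 4 + 1 + 1 + 6 = 16 bytes on 32-bit ones. A hypothetical mirror of the 64-bit layout, useful for sanity-checking that arithmetic:

#include <cstdint>

struct XRaySledEntry64 {
  uint64_t SledAddress;      // EmitSymbolValue(Sled.Sled, 8)
  uint64_t FunctionAddress;  // EmitSymbolValue(CurrentFnSym, 8)
  uint8_t Kind;              // one byte for the sled kind
  uint8_t AlwaysInstrument;  // one byte for the always-instrument flag
  uint8_t Padding[14];       // EmitZeros(14) pads the record to 32 bytes
};
static_assert(sizeof(XRaySledEntry64) == 32, "record must stay 32 bytes");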
+
+void MipsAsmPrinter::LowerPATCHABLE_FUNCTION_ENTER(const MachineInstr &MI) {
+ EmitSled(MI, SledKind::FUNCTION_ENTER);
+}
+
+void MipsAsmPrinter::LowerPATCHABLE_FUNCTION_EXIT(const MachineInstr &MI) {
+ EmitSled(MI, SledKind::FUNCTION_EXIT);
+}
+
+void MipsAsmPrinter::LowerPATCHABLE_TAIL_CALL(const MachineInstr &MI) {
+ EmitSled(MI, SledKind::TAIL_CALL);
+}
+
void MipsAsmPrinter::PrintDebugValueComment(const MachineInstr *MI,
raw_ostream &OS) {
// TODO: implement
@@ -1039,7 +1200,7 @@ void MipsAsmPrinter::PrintDebugValueComment(const MachineInstr *MI,
// Emit .dtprelword or .dtpreldword directive
// and value for debug thread local expression.
-void MipsAsmPrinter::EmitDebugValue(const MCExpr *Value,
+void MipsAsmPrinter::EmitDebugThreadLocal(const MCExpr *Value,
unsigned Size) const {
switch (Size) {
case 4:
diff --git a/lib/Target/Mips/MipsAsmPrinter.h b/lib/Target/Mips/MipsAsmPrinter.h
index c5cf5241c236..4699e1b0bd3b 100644
--- a/lib/Target/Mips/MipsAsmPrinter.h
+++ b/lib/Target/Mips/MipsAsmPrinter.h
@@ -35,7 +35,21 @@ class LLVM_LIBRARY_VISIBILITY MipsAsmPrinter : public AsmPrinter {
void EmitInstrWithMacroNoAT(const MachineInstr *MI);
+ //===------------------------------------------------------------------===//
+ // XRay implementation
+ //===------------------------------------------------------------------===//
+public:
+ // XRay-specific lowering for Mips.
+ void LowerPATCHABLE_FUNCTION_ENTER(const MachineInstr &MI);
+ void LowerPATCHABLE_FUNCTION_EXIT(const MachineInstr &MI);
+ void LowerPATCHABLE_TAIL_CALL(const MachineInstr &MI);
+ // Helper function that emits the XRay sleds we've collected for a particular
+ // function.
+ void EmitXRayTable();
+
private:
+ void EmitSled(const MachineInstr &MI, SledKind Kind);
+
// tblgen'erated function.
bool emitPseudoExpansionLowering(MCStreamer &OutStreamer,
const MachineInstr *MI);
@@ -140,7 +154,7 @@ public:
void EmitStartOfAsmFile(Module &M) override;
void EmitEndOfAsmFile(Module &M) override;
void PrintDebugValueComment(const MachineInstr *MI, raw_ostream &OS);
- void EmitDebugValue(const MCExpr *Value, unsigned Size) const override;
+ void EmitDebugThreadLocal(const MCExpr *Value, unsigned Size) const override;
};
}
diff --git a/lib/Target/Mips/MipsConstantIslandPass.cpp b/lib/Target/Mips/MipsConstantIslandPass.cpp
index 08b8ed31ccbb..026f66a1c0e1 100644
--- a/lib/Target/Mips/MipsConstantIslandPass.cpp
+++ b/lib/Target/Mips/MipsConstantIslandPass.cpp
@@ -7,7 +7,6 @@
//
//===----------------------------------------------------------------------===//
//
-//
// This pass is used to make PC-relative loads of constants.
// For now, only Mips16 will use this.
//
@@ -19,30 +18,43 @@
// This can be particularly helpful in static relocation mode for embedded
// non-linux targets.
//
-//
+//===----------------------------------------------------------------------===//
#include "Mips.h"
-#include "MCTargetDesc/MipsBaseInfo.h"
#include "Mips16InstrInfo.h"
#include "MipsMachineFunction.h"
-#include "MipsTargetMachine.h"
+#include "MipsSubtarget.h"
+#include "llvm/ADT/SmallSet.h"
+#include "llvm/ADT/SmallVector.h"
#include "llvm/ADT/Statistic.h"
+#include "llvm/ADT/STLExtras.h"
+#include "llvm/ADT/StringRef.h"
#include "llvm/CodeGen/MachineBasicBlock.h"
#include "llvm/CodeGen/MachineConstantPool.h"
+#include "llvm/CodeGen/MachineFunction.h"
#include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/CodeGen/MachineInstr.h"
#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/MachineOperand.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/IR/Constants.h"
+#include "llvm/IR/DataLayout.h"
+#include "llvm/IR/DebugLoc.h"
#include "llvm/IR/Function.h"
-#include "llvm/IR/InstIterator.h"
+#include "llvm/IR/Type.h"
#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/Compiler.h"
#include "llvm/Support/Debug.h"
+#include "llvm/Support/ErrorHandling.h"
#include "llvm/Support/Format.h"
#include "llvm/Support/MathExtras.h"
#include "llvm/Support/raw_ostream.h"
-#include "llvm/Target/TargetInstrInfo.h"
-#include "llvm/Target/TargetMachine.h"
-#include "llvm/Target/TargetRegisterInfo.h"
#include <algorithm>
+#include <cassert>
+#include <cstdint>
+#include <iterator>
+#include <new>
+#include <vector>
using namespace llvm;
@@ -58,7 +70,6 @@ static cl::opt<bool>
AlignConstantIslands("mips-align-constant-islands", cl::Hidden, cl::init(true),
cl::desc("Align constant islands in code"));
-
// Rather than do make check tests with huge amounts of code, we force
// the test to use this amount.
//
@@ -178,7 +189,6 @@ static unsigned int branchMaxOffsets(unsigned int Opcode) {
namespace {
-
typedef MachineBasicBlock::iterator Iter;
typedef MachineBasicBlock::reverse_iterator ReverseIter;
@@ -195,7 +205,6 @@ namespace {
/// tracks a list of users.
class MipsConstantIslands : public MachineFunctionPass {
-
/// BasicBlockInfo - Information about the offset and size of a single
/// basic block.
struct BasicBlockInfo {
@@ -208,14 +217,16 @@ namespace {
///
/// Because worst case padding is used, the computed offset of an aligned
/// block may not actually be aligned.
- unsigned Offset;
+ unsigned Offset = 0;
/// Size - Size of the basic block in bytes. If the block contains
/// inline assembly, this is a worst case estimate.
///
/// The size does not include any alignment padding whether from the
/// beginning of the block, or from an aligned jump table at the end.
- unsigned Size;
+ unsigned Size = 0;
+
+ BasicBlockInfo() = default;
// FIXME: ignore LogAlign for this patch
//
@@ -223,9 +234,6 @@ namespace {
unsigned PO = Offset + Size;
return PO;
}
-
- BasicBlockInfo() : Offset(0), Size(0) {}
-
};
std::vector<BasicBlockInfo> BBInfo;
@@ -257,13 +265,16 @@ namespace {
MachineInstr *MI;
MachineInstr *CPEMI;
MachineBasicBlock *HighWaterMark;
+
private:
unsigned MaxDisp;
unsigned LongFormMaxDisp; // mips16 has 16/32 bit instructions
// with different displacements
unsigned LongFormOpcode;
+
public:
bool NegOk;
+
CPUser(MachineInstr *mi, MachineInstr *cpemi, unsigned maxdisp,
bool neg,
unsigned longformmaxdisp, unsigned longformopcode)
@@ -272,18 +283,22 @@ namespace {
NegOk(neg){
HighWaterMark = CPEMI->getParent();
}
+
/// getMaxDisp - Returns the maximum displacement supported by MI.
unsigned getMaxDisp() const {
unsigned xMaxDisp = ConstantIslandsSmallOffset?
ConstantIslandsSmallOffset: MaxDisp;
return xMaxDisp;
}
+
void setMaxDisp(unsigned val) {
MaxDisp = val;
}
+
unsigned getLongFormMaxDisp() const {
return LongFormMaxDisp;
}
+
unsigned getLongFormOpcode() const {
return LongFormOpcode;
}
@@ -300,6 +315,7 @@ namespace {
MachineInstr *CPEMI;
unsigned CPI;
unsigned RefCount;
+
CPEntry(MachineInstr *cpemi, unsigned cpi, unsigned rc = 0)
: CPEMI(cpemi), CPI(cpi), RefCount(rc) {}
};
@@ -309,7 +325,7 @@ namespace {
/// existed upon entry to this pass), it keeps a vector of entries.
/// Original elements are cloned as we go along; the clones are
/// put in the vector of the original element, but have distinct CPIs.
- std::vector<std::vector<CPEntry> > CPEntries;
+ std::vector<std::vector<CPEntry>> CPEntries;
/// ImmBranch - One per immediate branch, keeping the machine instruction
/// pointer, conditional or unconditional, the max displacement,
@@ -320,6 +336,7 @@ namespace {
unsigned MaxDisp : 31;
bool isCond : 1;
int UncondBr;
+
ImmBranch(MachineInstr *mi, unsigned maxdisp, bool cond, int ubr)
: MI(mi), MaxDisp(maxdisp), isCond(cond), UncondBr(ubr) {}
};
@@ -332,29 +349,27 @@ namespace {
/// the branch fix up pass.
bool HasFarJump;
- const MipsSubtarget *STI;
+ const MipsSubtarget *STI = nullptr;
const Mips16InstrInfo *TII;
MipsFunctionInfo *MFI;
- MachineFunction *MF;
- MachineConstantPool *MCP;
+ MachineFunction *MF = nullptr;
+ MachineConstantPool *MCP = nullptr;
unsigned PICLabelUId;
- bool PrescannedForConstants;
+ bool PrescannedForConstants = false;
void initPICLabelUId(unsigned UId) {
PICLabelUId = UId;
}
-
unsigned createPICLabelUId() {
return PICLabelUId++;
}
public:
static char ID;
- MipsConstantIslands()
- : MachineFunctionPass(ID), STI(nullptr), MF(nullptr), MCP(nullptr),
- PrescannedForConstants(false) {}
+
+ MipsConstantIslands() : MachineFunctionPass(ID) {}
StringRef getPassName() const override { return "Mips Constant Islands"; }
@@ -403,13 +418,11 @@ namespace {
bool fixupUnconditionalBr(ImmBranch &Br);
void prescanForConstants();
-
- private:
-
};
char MipsConstantIslands::ID = 0;
-} // end of anonymous namespace
+
+} // end anonymous namespace
bool MipsConstantIslands::isOffsetInRange
(unsigned UserOffset, unsigned TrialOffset,
@@ -417,20 +430,17 @@ bool MipsConstantIslands::isOffsetInRange
return isOffsetInRange(UserOffset, TrialOffset,
U.getMaxDisp(), U.NegOk);
}
+
+#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
/// print block size and offset information - debugging
-void MipsConstantIslands::dumpBBs() {
- DEBUG({
- for (unsigned J = 0, E = BBInfo.size(); J !=E; ++J) {
- const BasicBlockInfo &BBI = BBInfo[J];
- dbgs() << format("%08x BB#%u\t", BBI.Offset, J)
- << format(" size=%#x\n", BBInfo[J].Size);
- }
- });
-}
-/// Returns a pass that converts branches to long branches.
-FunctionPass *llvm::createMipsConstantIslandPass() {
- return new MipsConstantIslands();
+LLVM_DUMP_METHOD void MipsConstantIslands::dumpBBs() {
+ for (unsigned J = 0, E = BBInfo.size(); J != E; ++J) {
+ const BasicBlockInfo &BBI = BBInfo[J];
+ dbgs() << format("%08x BB#%u\t", BBI.Offset, J)
+ << format(" size=%#x\n", BBInfo[J].Size);
+ }
}
+#endif
bool MipsConstantIslands::runOnMachineFunction(MachineFunction &mf) {
// The intention is for this to be a mips16 only pass for now
@@ -527,7 +537,6 @@ MipsConstantIslands::doInitialPlacement(std::vector<MachineInstr*> &CPEMIs) {
MachineBasicBlock *BB = MF->CreateMachineBasicBlock();
MF->push_back(BB);
-
// MachineConstantPool measures alignment in bytes. We measure in log2(bytes).
unsigned MaxAlign = Log2_32(MCP->getConstantPoolAlignment());
@@ -647,7 +656,6 @@ initializeFunctionInfo(const std::vector<MachineInstr*> &CPEMIs) {
for (MachineFunction::iterator I = MF->begin(), E = MF->end(); I != E; ++I)
computeBlockSize(&*I);
-
// Compute block offsets.
adjustBBOffsetsAfter(&MF->front());
@@ -737,7 +745,6 @@ initializeFunctionInfo(const std::vector<MachineInstr*> &CPEMIs) {
if (Opc == Mips::CONSTPOOL_ENTRY)
continue;
-
// Scan the instructions for constant pool operands.
for (unsigned op = 0, e = MI.getNumOperands(); op != e; ++op)
if (MI.getOperand(op).isCPI()) {
@@ -784,12 +791,9 @@ initializeFunctionInfo(const std::vector<MachineInstr*> &CPEMIs) {
// Instructions can only use one CP entry, don't bother scanning the
// rest of the operands.
break;
-
}
-
}
}
-
}
/// computeBlockSize - Compute the size and some alignment information for MBB.
@@ -921,8 +925,6 @@ MipsConstantIslands::splitBlockBeforeInstr(MachineInstr &MI) {
return NewBB;
}
-
-
/// isOffsetInRange - Checks whether UserOffset (the location of a constant pool
/// reference) is within MaxDisp of TrialOffset (a proposed location of a
/// constant pool entry).
@@ -1337,7 +1339,6 @@ bool MipsConstantIslands::handleConstantPoolUser(unsigned CPUserIndex) {
if (result==1) return false;
else if (result==2) return true;
-
// Look for water where we can place this CPE.
MachineBasicBlock *NewIsland = MF->CreateMachineBasicBlock();
MachineBasicBlock *NewMBB;
@@ -1371,7 +1372,7 @@ bool MipsConstantIslands::handleConstantPoolUser(unsigned CPUserIndex) {
// it. Check for this so it will be removed from the WaterList.
// Also remove any entry from NewWaterList.
MachineBasicBlock *WaterBB = &*--NewMBB->getIterator();
- IP = find(WaterList, WaterBB);
+ IP = llvm::find(WaterList, WaterBB);
if (IP != WaterList.end())
NewWaterList.erase(WaterBB);
@@ -1473,9 +1474,7 @@ bool MipsConstantIslands::removeUnusedCPEntries() {
/// specific BB can fit in MI's displacement field.
bool MipsConstantIslands::isBBInRange
(MachineInstr *MI,MachineBasicBlock *DestBB, unsigned MaxDisp) {
-
-unsigned PCAdj = 4;
-
+ unsigned PCAdj = 4;
unsigned BrOffset = getOffsetOf(MI) + PCAdj;
unsigned DestOffset = BBInfo[DestBB->getNumber()].Offset;
@@ -1553,7 +1552,6 @@ MipsConstantIslands::fixupUnconditionalBr(ImmBranch &Br) {
return true;
}
-
/// fixupConditionalBr - Fix up a conditional branch whose destination is too
/// far away to fit in its displacement field. It is converted to an inverse
/// conditional branch + an unconditional branch to the destination.
@@ -1614,7 +1612,6 @@ MipsConstantIslands::fixupConditionalBr(ImmBranch &Br) {
}
}
-
if (NeedSplit) {
splitBlockBeforeInstr(*MI);
// No need for the branch to the next block. We're adding an unconditional
@@ -1654,7 +1651,6 @@ MipsConstantIslands::fixupConditionalBr(ImmBranch &Br) {
return true;
}
-
void MipsConstantIslands::prescanForConstants() {
unsigned J = 0;
(void)J;
@@ -1667,11 +1663,11 @@ void MipsConstantIslands::prescanForConstants() {
PrescannedForConstants = true;
DEBUG(dbgs() << "constant island constant " << *I << "\n");
J = I->getNumOperands();
- DEBUG(dbgs() << "num operands " << J << "\n");
+ DEBUG(dbgs() << "num operands " << J << "\n");
MachineOperand& Literal = I->getOperand(1);
if (Literal.isImm()) {
int64_t V = Literal.getImm();
- DEBUG(dbgs() << "literal " << V << "\n");
+ DEBUG(dbgs() << "literal " << V << "\n");
Type *Int32Ty =
Type::getInt32Ty(MF->getFunction()->getContext());
const Constant *C = ConstantInt::get(Int32Ty, V);
@@ -1692,3 +1688,8 @@ void MipsConstantIslands::prescanForConstants() {
}
}
}
+
+/// Returns a pass that converts branches to long branches.
+FunctionPass *llvm::createMipsConstantIslandPass() {
+ return new MipsConstantIslands();
+}
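Guarding dumpBBs with LLVM_DUMP_METHOD and the NDEBUG/LLVM_ENABLE_DUMP check follows the tree-wide idiom; a minimal sketch of that idiom, detached from this pass:

#include "llvm/Support/Compiler.h"
#include "llvm/Support/Debug.h"

struct Example {
#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
  // Only compiled into asserts or dump-enabled builds; LLVM_DUMP_METHOD
  // marks it used so the symbol survives for calls from a debugger.
  LLVM_DUMP_METHOD void dump() const { llvm::dbgs() << "Example\n"; }
#endif
};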
diff --git a/lib/Target/Mips/MipsDelaySlotFiller.cpp b/lib/Target/Mips/MipsDelaySlotFiller.cpp
index c821084f68cf..ae58c26e145a 100644
--- a/lib/Target/Mips/MipsDelaySlotFiller.cpp
+++ b/lib/Target/Mips/MipsDelaySlotFiller.cpp
@@ -14,21 +14,39 @@
#include "MCTargetDesc/MipsMCNaCl.h"
#include "Mips.h"
#include "MipsInstrInfo.h"
+#include "MipsSubtarget.h"
#include "MipsTargetMachine.h"
#include "llvm/ADT/BitVector.h"
+#include "llvm/ADT/DenseMap.h"
+#include "llvm/ADT/PointerUnion.h"
#include "llvm/ADT/SmallPtrSet.h"
+#include "llvm/ADT/SmallVector.h"
#include "llvm/ADT/Statistic.h"
+#include "llvm/ADT/StringRef.h"
#include "llvm/Analysis/AliasAnalysis.h"
#include "llvm/Analysis/ValueTracking.h"
+#include "llvm/CodeGen/MachineBasicBlock.h"
#include "llvm/CodeGen/MachineBranchProbabilityInfo.h"
+#include "llvm/CodeGen/MachineFunction.h"
#include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/CodeGen/MachineInstr.h"
#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/MachineOperand.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
#include "llvm/CodeGen/PseudoSourceValue.h"
+#include "llvm/MC/MCInstrDesc.h"
+#include "llvm/MC/MCRegisterInfo.h"
+#include "llvm/Support/Casting.h"
+#include "llvm/Support/CodeGen.h"
#include "llvm/Support/CommandLine.h"
-#include "llvm/Target/TargetInstrInfo.h"
+#include "llvm/Support/ErrorHandling.h"
#include "llvm/Target/TargetMachine.h"
#include "llvm/Target/TargetRegisterInfo.h"
+#include <algorithm>
+#include <cassert>
+#include <iterator>
+#include <memory>
+#include <utility>
using namespace llvm;
@@ -84,6 +102,7 @@ static cl::opt<CompactBranchPolicy> MipsCompactBranchPolicy(
);
namespace {
+
typedef MachineBasicBlock::iterator Iter;
typedef MachineBasicBlock::reverse_iterator ReverseIter;
typedef SmallDenseMap<MachineBasicBlock*, MachineInstr*, 2> BB2BrMap;
@@ -91,6 +110,7 @@ namespace {
class RegDefsUses {
public:
RegDefsUses(const TargetRegisterInfo &TRI);
+
void init(const MachineInstr &MI);
/// This function sets all caller-saved registers in Defs.
@@ -120,18 +140,18 @@ namespace {
/// Base class for inspecting loads and stores.
class InspectMemInstr {
public:
- InspectMemInstr(bool ForbidMemInstr_)
- : OrigSeenLoad(false), OrigSeenStore(false), SeenLoad(false),
- SeenStore(false), ForbidMemInstr(ForbidMemInstr_) {}
+ InspectMemInstr(bool ForbidMemInstr_) : ForbidMemInstr(ForbidMemInstr_) {}
+ virtual ~InspectMemInstr() = default;
/// Return true if MI cannot be moved to delay slot.
bool hasHazard(const MachineInstr &MI);
- virtual ~InspectMemInstr() {}
-
protected:
/// Flags indicating whether loads or stores have been seen.
- bool OrigSeenLoad, OrigSeenStore, SeenLoad, SeenStore;
+ bool OrigSeenLoad = false;
+ bool OrigSeenStore = false;
+ bool SeenLoad = false;
+ bool SeenStore = false;
/// Memory instructions are not allowed to move to delay slot if this flag
/// is true.
@@ -145,6 +165,7 @@ namespace {
class NoMemInstr : public InspectMemInstr {
public:
NoMemInstr() : InspectMemInstr(true) {}
+
private:
bool hasHazard_(const MachineInstr &MI) override { return true; }
};
@@ -153,6 +174,7 @@ namespace {
class LoadFromStackOrConst : public InspectMemInstr {
public:
LoadFromStackOrConst() : InspectMemInstr(false) {}
+
private:
bool hasHazard_(const MachineInstr &MI) override;
};
@@ -183,7 +205,8 @@ namespace {
/// Flags indicating whether loads or stores with no underlying objects have
/// been seen.
- bool SeenNoObjLoad, SeenNoObjStore;
+ bool SeenNoObjLoad = false;
+ bool SeenNoObjStore = false;
};
class Filler : public MachineFunctionPass {
@@ -271,8 +294,10 @@ namespace {
static char ID;
};
+
char Filler::ID = 0;
-} // end of anonymous namespace
+
+} // end anonymous namespace
static bool hasUnoccupiedSlot(const MachineInstr *MI) {
return MI->hasDelaySlot() && !MI->isBundledWithSucc();
@@ -458,8 +483,7 @@ bool LoadFromStackOrConst::hasHazard_(const MachineInstr &MI) {
}
MemDefsUses::MemDefsUses(const DataLayout &DL, const MachineFrameInfo *MFI_)
- : InspectMemInstr(false), MFI(MFI_), DL(DL), SeenNoObjLoad(false),
- SeenNoObjStore(false) {}
+ : InspectMemInstr(false), MFI(MFI_), DL(DL) {}
bool MemDefsUses::hasHazard_(const MachineInstr &MI) {
bool HasHazard = false;
@@ -646,12 +670,6 @@ bool Filler::runOnMachineBasicBlock(MachineBasicBlock &MBB) {
return Changed;
}
-/// createMipsDelaySlotFillerPass - Returns a pass that fills in delay
-/// slots in Mips MachineFunctions
-FunctionPass *llvm::createMipsDelaySlotFillerPass(MipsTargetMachine &tm) {
- return new Filler(tm);
-}
-
template<typename IterTy>
bool Filler::searchRange(MachineBasicBlock &MBB, IterTy Begin, IterTy End,
RegDefsUses &RegDU, InspectMemInstr& IM, Iter Slot,
@@ -889,3 +907,9 @@ bool Filler::terminateSearch(const MachineInstr &Candidate) const {
Candidate.isPosition() || Candidate.isInlineAsm() ||
Candidate.hasUnmodeledSideEffects());
}
+
+/// createMipsDelaySlotFillerPass - Returns a pass that fills in delay
+/// slots in Mips MachineFunctions
+FunctionPass *llvm::createMipsDelaySlotFillerPass(MipsTargetMachine &tm) {
+ return new Filler(tm);
+}
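Like MipsConstantIslandPass.cpp above, this file now defines its factory after the pass logic rather than in the middle of it; a sketch of the resulting file shape (pass name hypothetical):

#include "llvm/CodeGen/MachineFunctionPass.h"
using namespace llvm;

namespace {
class ExamplePass : public MachineFunctionPass {
public:
  static char ID;
  ExamplePass() : MachineFunctionPass(ID) {}
  bool runOnMachineFunction(MachineFunction &) override { return false; }
};

char ExamplePass::ID = 0;
} // end anonymous namespace

// The factory comes last, after the full pass definition.
FunctionPass *createExamplePass() { return new ExamplePass(); }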
diff --git a/lib/Target/Mips/MipsFastISel.cpp b/lib/Target/Mips/MipsFastISel.cpp
index a44192f57aa0..c060cf06099d 100644
--- a/lib/Target/Mips/MipsFastISel.cpp
+++ b/lib/Target/Mips/MipsFastISel.cpp
@@ -1,4 +1,4 @@
-//===-- MipsFastISel.cpp - Mips FastISel implementation --------------------===//
+//===-- MipsFastISel.cpp - Mips FastISel implementation -------------------===//
//
// The LLVM Compiler Infrastructure
//
@@ -14,24 +14,62 @@
///
//===----------------------------------------------------------------------===//
+#include "MCTargetDesc/MipsABIInfo.h"
+#include "MCTargetDesc/MipsBaseInfo.h"
#include "MipsCCState.h"
#include "MipsInstrInfo.h"
#include "MipsISelLowering.h"
#include "MipsMachineFunction.h"
-#include "MipsRegisterInfo.h"
#include "MipsSubtarget.h"
#include "MipsTargetMachine.h"
+#include "llvm/ADT/APInt.h"
+#include "llvm/ADT/ArrayRef.h"
+#include "llvm/ADT/DenseMap.h"
+#include "llvm/ADT/SmallVector.h"
#include "llvm/Analysis/TargetLibraryInfo.h"
+#include "llvm/CodeGen/CallingConvLower.h"
#include "llvm/CodeGen/FastISel.h"
#include "llvm/CodeGen/FunctionLoweringInfo.h"
+#include "llvm/CodeGen/ISDOpcodes.h"
+#include "llvm/CodeGen/MachineBasicBlock.h"
+#include "llvm/CodeGen/MachineFrameInfo.h"
#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/MachineMemOperand.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/CodeGen/MachineValueType.h"
+#include "llvm/CodeGen/ValueTypes.h"
+#include "llvm/IR/Attributes.h"
+#include "llvm/IR/CallingConv.h"
+#include "llvm/IR/Constant.h"
+#include "llvm/IR/Constants.h"
+#include "llvm/IR/DataLayout.h"
+#include "llvm/IR/Function.h"
#include "llvm/IR/GetElementPtrTypeIterator.h"
-#include "llvm/IR/GlobalAlias.h"
+#include "llvm/IR/GlobalValue.h"
#include "llvm/IR/GlobalVariable.h"
+#include "llvm/IR/InstrTypes.h"
+#include "llvm/IR/Instruction.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/IR/IntrinsicInst.h"
+#include "llvm/IR/Operator.h"
+#include "llvm/IR/Type.h"
+#include "llvm/IR/User.h"
+#include "llvm/IR/Value.h"
+#include "llvm/MC/MCInstrDesc.h"
+#include "llvm/MC/MCRegisterInfo.h"
#include "llvm/MC/MCSymbol.h"
-#include "llvm/Target/TargetInstrInfo.h"
+#include "llvm/Support/Casting.h"
+#include "llvm/Support/Compiler.h"
#include "llvm/Support/Debug.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/MathExtras.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/Target/TargetInstrInfo.h"
+#include "llvm/Target/TargetLowering.h"
+#include <algorithm>
+#include <cassert>
+#include <cstdint>
+#include <new>
#define DEBUG_TYPE "mips-fastisel"
@@ -47,35 +85,40 @@ class MipsFastISel final : public FastISel {
typedef enum { RegBase, FrameIndexBase } BaseKind;
private:
- BaseKind Kind;
+ BaseKind Kind = RegBase;
union {
unsigned Reg;
int FI;
} Base;
- int64_t Offset;
+ int64_t Offset = 0;
- const GlobalValue *GV;
+ const GlobalValue *GV = nullptr;
public:
// Innocuous defaults for our address.
- Address() : Kind(RegBase), Offset(0), GV(0) { Base.Reg = 0; }
+ Address() { Base.Reg = 0; }
+
void setKind(BaseKind K) { Kind = K; }
BaseKind getKind() const { return Kind; }
bool isRegBase() const { return Kind == RegBase; }
bool isFIBase() const { return Kind == FrameIndexBase; }
+
void setReg(unsigned Reg) {
assert(isRegBase() && "Invalid base register access!");
Base.Reg = Reg;
}
+
unsigned getReg() const {
assert(isRegBase() && "Invalid base register access!");
return Base.Reg;
}
+
void setFI(unsigned FI) {
assert(isFIBase() && "Invalid base frame index access!");
Base.FI = FI;
}
+
unsigned getFI() const {
assert(isFIBase() && "Invalid base frame index access!");
return Base.FI;
@@ -165,14 +208,17 @@ private:
MachineInstrBuilder emitInst(unsigned Opc) {
return BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(Opc));
}
+
MachineInstrBuilder emitInst(unsigned Opc, unsigned DstReg) {
return BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(Opc),
DstReg);
}
+
MachineInstrBuilder emitInstStore(unsigned Opc, unsigned SrcReg,
unsigned MemReg, int64_t MemOffset) {
return emitInst(Opc).addReg(SrcReg).addReg(MemReg).addImm(MemOffset);
}
+
MachineInstrBuilder emitInstLoad(unsigned Opc, unsigned DstReg,
unsigned MemReg, int64_t MemOffset) {
return emitInst(Opc, DstReg).addReg(MemReg).addImm(MemOffset);
@@ -198,6 +244,7 @@ private:
bool processCallArgs(CallLoweringInfo &CLI, SmallVectorImpl<MVT> &ArgVTs,
unsigned &NumBytes);
bool finishCall(CallLoweringInfo &CLI, MVT RetVT, unsigned NumBytes);
+
const MipsABIInfo &getABI() const {
return static_cast<const MipsTargetMachine &>(TM).getABI();
}
@@ -220,7 +267,8 @@ public:
#include "MipsGenFastISel.inc"
};
-} // end anonymous namespace.
+
+} // end anonymous namespace
static bool CC_Mips(unsigned ValNo, MVT ValVT, MVT LocVT,
CCValAssign::LocInfo LocInfo, ISD::ArgFlagsTy ArgFlags,
@@ -414,7 +462,6 @@ unsigned MipsFastISel::fastMaterializeConstant(const Constant *C) {
}
bool MipsFastISel::computeAddress(const Value *Obj, Address &Addr) {
-
const User *U = nullptr;
unsigned Opcode = Instruction::UserOp1;
if (const Instruction *I = dyn_cast<Instruction>(Obj)) {
@@ -432,10 +479,9 @@ bool MipsFastISel::computeAddress(const Value *Obj, Address &Addr) {
switch (Opcode) {
default:
break;
- case Instruction::BitCast: {
+ case Instruction::BitCast:
// Look through bitcasts.
return computeAddress(U->getOperand(0), Addr);
- }
case Instruction::GetElementPtr: {
Address SavedAddr = Addr;
int64_t TmpOffset = Addr.getOffset();
@@ -451,7 +497,7 @@ bool MipsFastISel::computeAddress(const Value *Obj, Address &Addr) {
TmpOffset += SL->getElementOffset(Idx);
} else {
uint64_t S = DL.getTypeAllocSize(GTI.getIndexedType());
- for (;;) {
+ while (true) {
if (const ConstantInt *CI = dyn_cast<ConstantInt>(Op)) {
// Constant-offset addressing.
TmpOffset += CI->getSExtValue() * S;
@@ -613,14 +659,12 @@ bool MipsFastISel::emitCmp(unsigned ResultReg, const CmpInst *CI) {
emitInst(Mips::SLTu, ResultReg).addReg(Mips::ZERO).addReg(TempReg);
break;
}
- case CmpInst::ICMP_UGT: {
+ case CmpInst::ICMP_UGT:
emitInst(Mips::SLTu, ResultReg).addReg(RightReg).addReg(LeftReg);
break;
- }
- case CmpInst::ICMP_ULT: {
+ case CmpInst::ICMP_ULT:
emitInst(Mips::SLTu, ResultReg).addReg(LeftReg).addReg(RightReg);
break;
- }
case CmpInst::ICMP_UGE: {
unsigned TempReg = createResultReg(&Mips::GPR32RegClass);
emitInst(Mips::SLTu, TempReg).addReg(LeftReg).addReg(RightReg);
@@ -633,14 +677,12 @@ bool MipsFastISel::emitCmp(unsigned ResultReg, const CmpInst *CI) {
emitInst(Mips::XORi, ResultReg).addReg(TempReg).addImm(1);
break;
}
- case CmpInst::ICMP_SGT: {
+ case CmpInst::ICMP_SGT:
emitInst(Mips::SLT, ResultReg).addReg(RightReg).addReg(LeftReg);
break;
- }
- case CmpInst::ICMP_SLT: {
+ case CmpInst::ICMP_SLT:
emitInst(Mips::SLT, ResultReg).addReg(LeftReg).addReg(RightReg);
break;
- }
case CmpInst::ICMP_SGE: {
unsigned TempReg = createResultReg(&Mips::GPR32RegClass);
emitInst(Mips::SLT, TempReg).addReg(LeftReg).addReg(RightReg);
@@ -709,6 +751,7 @@ bool MipsFastISel::emitCmp(unsigned ResultReg, const CmpInst *CI) {
}
return true;
}
+
bool MipsFastISel::emitLoad(MVT VT, unsigned &ResultReg, Address &Addr,
unsigned Alignment) {
//
@@ -716,35 +759,30 @@ bool MipsFastISel::emitLoad(MVT VT, unsigned &ResultReg, Address &Addr,
//
unsigned Opc;
switch (VT.SimpleTy) {
- case MVT::i32: {
+ case MVT::i32:
ResultReg = createResultReg(&Mips::GPR32RegClass);
Opc = Mips::LW;
break;
- }
- case MVT::i16: {
+ case MVT::i16:
ResultReg = createResultReg(&Mips::GPR32RegClass);
Opc = Mips::LHu;
break;
- }
- case MVT::i8: {
+ case MVT::i8:
ResultReg = createResultReg(&Mips::GPR32RegClass);
Opc = Mips::LBu;
break;
- }
- case MVT::f32: {
+ case MVT::f32:
if (UnsupportedFPMode)
return false;
ResultReg = createResultReg(&Mips::FGR32RegClass);
Opc = Mips::LWC1;
break;
- }
- case MVT::f64: {
+ case MVT::f64:
if (UnsupportedFPMode)
return false;
ResultReg = createResultReg(&Mips::AFGR64RegClass);
Opc = Mips::LDC1;
break;
- }
default:
return false;
}
@@ -1730,6 +1768,7 @@ bool MipsFastISel::selectTrunc(const Instruction *I) {
updateValueMap(I, SrcReg);
return true;
}
+
bool MipsFastISel::selectIntExt(const Instruction *I) {
Type *DestTy = I->getType();
Value *Src = I->getOperand(0);
@@ -1757,6 +1796,7 @@ bool MipsFastISel::selectIntExt(const Instruction *I) {
updateValueMap(I, ResultReg);
return true;
}
+
bool MipsFastISel::emitIntSExt32r1(MVT SrcVT, unsigned SrcReg, MVT DestVT,
unsigned DestReg) {
unsigned ShiftAmt;
@@ -2074,8 +2114,10 @@ unsigned MipsFastISel::fastEmitInst_rr(unsigned MachineInstOpcode,
}
namespace llvm {
+
FastISel *Mips::createFastISel(FunctionLoweringInfo &funcInfo,
const TargetLibraryInfo *libInfo) {
return new MipsFastISel(funcInfo, libInfo);
}
-}
+
+} // end namespace llvm
diff --git a/lib/Target/Mips/MipsHazardSchedule.cpp b/lib/Target/Mips/MipsHazardSchedule.cpp
index 31b86124bc8d..f6fcf6ec9385 100644
--- a/lib/Target/Mips/MipsHazardSchedule.cpp
+++ b/lib/Target/Mips/MipsHazardSchedule.cpp
@@ -36,7 +36,7 @@
///
/// A) A previous pass has created a compact branch directly.
/// B) Transforming a delay slot branch into compact branch. This case can be
-/// difficult to process as lookahead for hazards is insufficent, as
+/// difficult to process as lookahead for hazards is insufficient, as
/// backwards delay slot filling can also produce hazards in previously
/// processed instructions.
///
@@ -103,23 +103,24 @@ static Iter getNextMachineInstrInBB(Iter Position) {
// Find the next real instruction from the current position, looking through
// basic block boundaries.
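+// Returns the instruction found together with a flag that is true when the
+// end of the function was reached.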
-static Iter getNextMachineInstr(Iter Position, MachineBasicBlock *Parent) {
+static std::pair<Iter, bool> getNextMachineInstr(Iter Position,
+                                                 MachineBasicBlock *Parent) {
if (Position == Parent->end()) {
- MachineBasicBlock *Succ = Parent->getNextNode();
- if (Succ != nullptr && Parent->isSuccessor(Succ)) {
- Position = Succ->begin();
- Parent = Succ;
- } else {
- llvm_unreachable(
- "Should have identified the end of the function earlier!");
- }
+ do {
+ MachineBasicBlock *Succ = Parent->getNextNode();
+ if (Succ != nullptr && Parent->isSuccessor(Succ)) {
+ Position = Succ->begin();
+ Parent = Succ;
+ } else {
+ return std::make_pair(Position, true);
+ }
+ } while (Parent->empty());
}
Iter Instr = getNextMachineInstrInBB(Position);
if (Instr == Parent->end()) {
return getNextMachineInstr(Instr, Parent);
}
- return Instr;
+ return std::make_pair(Instr, false);
}
bool MipsHazardSchedule::runOnMachineFunction(MachineFunction &MF) {
@@ -145,7 +146,9 @@ bool MipsHazardSchedule::runOnMachineFunction(MachineFunction &MF) {
bool LastInstInFunction =
std::next(I) == FI->end() && std::next(FI) == MF.end();
if (!LastInstInFunction) {
- Inst = getNextMachineInstr(std::next(I), &*FI);
+ std::pair<Iter, bool> Res = getNextMachineInstr(std::next(I), &*FI);
+ LastInstInFunction |= Res.second;
+ Inst = Res.first;
}
if (LastInstInFunction || !TII->SafeInForbiddenSlot(*Inst)) {
diff --git a/lib/Target/Mips/MipsISelLowering.cpp b/lib/Target/Mips/MipsISelLowering.cpp
index 9c511bd77822..93c5f496ce97 100644
--- a/lib/Target/Mips/MipsISelLowering.cpp
+++ b/lib/Target/Mips/MipsISelLowering.cpp
@@ -112,8 +112,11 @@ const char *MipsTargetLowering::getTargetNodeName(unsigned Opcode) const {
case MipsISD::FIRST_NUMBER: break;
case MipsISD::JmpLink: return "MipsISD::JmpLink";
case MipsISD::TailCall: return "MipsISD::TailCall";
+ case MipsISD::Highest: return "MipsISD::Highest";
+ case MipsISD::Higher: return "MipsISD::Higher";
case MipsISD::Hi: return "MipsISD::Hi";
case MipsISD::Lo: return "MipsISD::Lo";
+ case MipsISD::GotHi: return "MipsISD::GotHi";
case MipsISD::GPRel: return "MipsISD::GPRel";
case MipsISD::ThreadPointer: return "MipsISD::ThreadPointer";
case MipsISD::Ret: return "MipsISD::Ret";
@@ -144,6 +147,7 @@ const char *MipsTargetLowering::getTargetNodeName(unsigned Opcode) const {
case MipsISD::Sync: return "MipsISD::Sync";
case MipsISD::Ext: return "MipsISD::Ext";
case MipsISD::Ins: return "MipsISD::Ins";
+ case MipsISD::CIns: return "MipsISD::CIns";
case MipsISD::LWL: return "MipsISD::LWL";
case MipsISD::LWR: return "MipsISD::LWR";
case MipsISD::SWL: return "MipsISD::SWL";
@@ -425,6 +429,7 @@ MipsTargetLowering::MipsTargetLowering(const MipsTargetMachine &TM,
setTargetDAGCombine(ISD::OR);
setTargetDAGCombine(ISD::ADD);
setTargetDAGCombine(ISD::AssertZext);
+ setTargetDAGCombine(ISD::SHL);
if (ABI.IsO32()) {
// These libcalls are not available in 32-bit.
@@ -699,41 +704,81 @@ static SDValue performCMovFPCombine(SDNode *N, SelectionDAG &DAG,
static SDValue performANDCombine(SDNode *N, SelectionDAG &DAG,
TargetLowering::DAGCombinerInfo &DCI,
const MipsSubtarget &Subtarget) {
- // Pattern match EXT.
- // $dst = and ((sra or srl) $src , pos), (2**size - 1)
- // => ext $dst, $src, size, pos
if (DCI.isBeforeLegalizeOps() || !Subtarget.hasExtractInsert())
return SDValue();
- SDValue ShiftRight = N->getOperand(0), Mask = N->getOperand(1);
- unsigned ShiftRightOpc = ShiftRight.getOpcode();
-
- // Op's first operand must be a shift right.
- if (ShiftRightOpc != ISD::SRA && ShiftRightOpc != ISD::SRL)
- return SDValue();
+ SDValue FirstOperand = N->getOperand(0);
+ unsigned FirstOperandOpc = FirstOperand.getOpcode();
+ SDValue Mask = N->getOperand(1);
+ EVT ValTy = N->getValueType(0);
+ SDLoc DL(N);
- // The second operand of the shift must be an immediate.
+ uint64_t Pos = 0, SMPos, SMSize;
ConstantSDNode *CN;
- if (!(CN = dyn_cast<ConstantSDNode>(ShiftRight.getOperand(1))))
- return SDValue();
-
- uint64_t Pos = CN->getZExtValue();
- uint64_t SMPos, SMSize;
+ SDValue NewOperand;
+ unsigned Opc;
// Op's second operand must be a shifted mask.
if (!(CN = dyn_cast<ConstantSDNode>(Mask)) ||
!isShiftedMask(CN->getZExtValue(), SMPos, SMSize))
return SDValue();
- // Return if the shifted mask does not start at bit 0 or the sum of its size
- // and Pos exceeds the word's size.
- EVT ValTy = N->getValueType(0);
- if (SMPos != 0 || Pos + SMSize > ValTy.getSizeInBits())
- return SDValue();
+ if (FirstOperandOpc == ISD::SRA || FirstOperandOpc == ISD::SRL) {
+ // Pattern match EXT.
+ // $dst = and ((sra or srl) $src , pos), (2**size - 1)
+ // => ext $dst, $src, pos, size
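+    //
+    // e.g. (and (srl $src, 8), 255) becomes ext $dst, $src, 8, 8
+    // (illustrative).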
+
+ // The second operand of the shift must be an immediate.
+ if (!(CN = dyn_cast<ConstantSDNode>(FirstOperand.getOperand(1))))
+ return SDValue();
+
+ Pos = CN->getZExtValue();
+
+ // Return if the shifted mask does not start at bit 0 or the sum of its size
+ // and Pos exceeds the word's size.
+ if (SMPos != 0 || Pos + SMSize > ValTy.getSizeInBits())
+ return SDValue();
+
+ Opc = MipsISD::Ext;
+ NewOperand = FirstOperand.getOperand(0);
+ } else if (FirstOperandOpc == ISD::SHL && Subtarget.hasCnMips()) {
+ // Pattern match CINS.
+ // $dst = and (shl $src , pos), mask
+ // => cins $dst, $src, pos, size
+ // mask is a shifted mask with consecutive 1's, pos = shift amount,
+ // size = population count.
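+    //
+    // e.g. (and (shl $src, 8), 0x00ffff00) has SMPos = 8 and SMSize = 16,
+    // giving cins $dst, $src, 8, 15 (illustrative).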
+
+ // The second operand of the shift must be an immediate.
+ if (!(CN = dyn_cast<ConstantSDNode>(FirstOperand.getOperand(1))))
+ return SDValue();
+
+ Pos = CN->getZExtValue();
+
+ if (SMPos != Pos || Pos >= ValTy.getSizeInBits() || SMSize >= 32 ||
+ Pos + SMSize > ValTy.getSizeInBits())
+ return SDValue();
+
+ NewOperand = FirstOperand.getOperand(0);
+    // CIns encodes its size operand as size - 1, so adjust SMSize here.
+ SMSize--;
+ Opc = MipsISD::CIns;
+ } else {
+ // Pattern match EXT.
+    // $dst = and $src, (2**size - 1), if size > 16
+    // => ext $dst, $src, pos, size, with pos = 0
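+    //
+    // e.g. (and $src, 0x3ffff) becomes ext $dst, $src, 0, 18, while a mask of
+    // 0xffff or less is left for andi (illustrative).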
- SDLoc DL(N);
- return DAG.getNode(MipsISD::Ext, DL, ValTy,
- ShiftRight.getOperand(0),
+ // If the mask is <= 0xffff, andi can be used instead.
+ if (CN->getZExtValue() <= 0xffff)
+ return SDValue();
+
+ // Return if the mask doesn't start at position 0.
+ if (SMPos)
+ return SDValue();
+
+ Opc = MipsISD::Ext;
+ NewOperand = FirstOperand;
+ }
+ return DAG.getNode(Opc, DL, ValTy, NewOperand,
DAG.getConstant(Pos, DL, MVT::i32),
DAG.getConstant(SMSize, DL, MVT::i32));
}
@@ -852,6 +897,58 @@ static SDValue performAssertZextCombine(SDNode *N, SelectionDAG &DAG,
return SDValue();
}
+
+static SDValue performSHLCombine(SDNode *N, SelectionDAG &DAG,
+ TargetLowering::DAGCombinerInfo &DCI,
+ const MipsSubtarget &Subtarget) {
+ // Pattern match CINS.
+ // $dst = shl (and $src , imm), pos
+ // => cins $dst, $src, pos, size
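+  //
+  // e.g. (shl (and $src, 0xffff), 8) has SMPos = 0 and SMSize = 16, giving
+  // cins $dst, $src, 8, 15 (illustrative).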
+
+ if (DCI.isBeforeLegalizeOps() || !Subtarget.hasCnMips())
+ return SDValue();
+
+ SDValue FirstOperand = N->getOperand(0);
+ unsigned FirstOperandOpc = FirstOperand.getOpcode();
+ SDValue SecondOperand = N->getOperand(1);
+ EVT ValTy = N->getValueType(0);
+ SDLoc DL(N);
+
+ uint64_t Pos = 0, SMPos, SMSize;
+ ConstantSDNode *CN;
+ SDValue NewOperand;
+
+ // The second operand of the shift must be an immediate.
+ if (!(CN = dyn_cast<ConstantSDNode>(SecondOperand)))
+ return SDValue();
+
+ Pos = CN->getZExtValue();
+
+ if (Pos >= ValTy.getSizeInBits())
+ return SDValue();
+
+ if (FirstOperandOpc != ISD::AND)
+ return SDValue();
+
+ // AND's second operand must be a shifted mask.
+ if (!(CN = dyn_cast<ConstantSDNode>(FirstOperand.getOperand(1))) ||
+ !isShiftedMask(CN->getZExtValue(), SMPos, SMSize))
+ return SDValue();
+
+ // Return if the shifted mask does not start at bit 0 or the sum of its size
+ // and Pos exceeds the word's size.
+ if (SMPos != 0 || SMSize > 32 || Pos + SMSize > ValTy.getSizeInBits())
+ return SDValue();
+
+ NewOperand = FirstOperand.getOperand(0);
+  // CIns encodes its size operand as size - 1, so adjust SMSize here.
+ SMSize--;
+
+ return DAG.getNode(MipsISD::CIns, DL, ValTy, NewOperand,
+ DAG.getConstant(Pos, DL, MVT::i32),
+ DAG.getConstant(SMSize, DL, MVT::i32));
+}
+
SDValue MipsTargetLowering::PerformDAGCombine(SDNode *N, DAGCombinerInfo &DCI)
const {
SelectionDAG &DAG = DCI.DAG;
@@ -875,6 +972,8 @@ SDValue MipsTargetLowering::PerformDAGCombine(SDNode *N, DAGCombinerInfo &DCI)
return performADDCombine(N, DAG, DCI, Subtarget);
case ISD::AssertZext:
return performAssertZextCombine(N, DAG, DCI, Subtarget);
+ case ISD::SHL:
+ return performSHLCombine(N, DAG, DCI, Subtarget);
}
return SDValue();
@@ -1733,7 +1832,7 @@ SDValue MipsTargetLowering::lowerGlobalAddress(SDValue Op,
GlobalAddressSDNode *N = cast<GlobalAddressSDNode>(Op);
const GlobalValue *GV = N->getGlobal();
- if (!isPositionIndependent() && !ABI.IsN64()) {
+ if (!isPositionIndependent()) {
const MipsTargetObjectFile *TLOF =
static_cast<const MipsTargetObjectFile *>(
getTargetMachine().getObjFileLowering());
@@ -1742,8 +1841,10 @@ SDValue MipsTargetLowering::lowerGlobalAddress(SDValue Op,
// %gp_rel relocation
return getAddrGPRel(N, SDLoc(N), Ty, DAG);
- // %hi/%lo relocation
- return getAddrNonPIC(N, SDLoc(N), Ty, DAG);
+ // %hi/%lo relocation
+ return Subtarget.hasSym32() ? getAddrNonPIC(N, SDLoc(N), Ty, DAG)
+ // %highest/%higher/%hi/%lo relocation
+ : getAddrNonPICSym64(N, SDLoc(N), Ty, DAG);
}
// Every other architecture would use shouldAssumeDSOLocal in here, but
@@ -1777,8 +1878,9 @@ SDValue MipsTargetLowering::lowerBlockAddress(SDValue Op,
BlockAddressSDNode *N = cast<BlockAddressSDNode>(Op);
EVT Ty = Op.getValueType();
- if (!isPositionIndependent() && !ABI.IsN64())
- return getAddrNonPIC(N, SDLoc(N), Ty, DAG);
+ if (!isPositionIndependent())
+ return Subtarget.hasSym32() ? getAddrNonPIC(N, SDLoc(N), Ty, DAG)
+ : getAddrNonPICSym64(N, SDLoc(N), Ty, DAG);
return getAddrLocal(N, SDLoc(N), Ty, DAG, ABI.IsN32() || ABI.IsN64());
}
@@ -1820,8 +1922,9 @@ lowerGlobalTLSAddress(SDValue Op, SelectionDAG &DAG) const
Args.push_back(Entry);
TargetLowering::CallLoweringInfo CLI(DAG);
- CLI.setDebugLoc(DL).setChain(DAG.getEntryNode())
- .setCallee(CallingConv::C, PtrTy, TlsGetAddr, std::move(Args));
+ CLI.setDebugLoc(DL)
+ .setChain(DAG.getEntryNode())
+ .setLibCallee(CallingConv::C, PtrTy, TlsGetAddr, std::move(Args));
std::pair<SDValue, SDValue> CallResult = LowerCallTo(CLI);
SDValue Ret = CallResult.first;
@@ -1870,8 +1973,9 @@ lowerJumpTable(SDValue Op, SelectionDAG &DAG) const
JumpTableSDNode *N = cast<JumpTableSDNode>(Op);
EVT Ty = Op.getValueType();
- if (!isPositionIndependent() && !ABI.IsN64())
- return getAddrNonPIC(N, SDLoc(N), Ty, DAG);
+ if (!isPositionIndependent())
+ return Subtarget.hasSym32() ? getAddrNonPIC(N, SDLoc(N), Ty, DAG)
+ : getAddrNonPICSym64(N, SDLoc(N), Ty, DAG);
return getAddrLocal(N, SDLoc(N), Ty, DAG, ABI.IsN32() || ABI.IsN64());
}
@@ -1882,7 +1986,7 @@ lowerConstantPool(SDValue Op, SelectionDAG &DAG) const
ConstantPoolSDNode *N = cast<ConstantPoolSDNode>(Op);
EVT Ty = Op.getValueType();
- if (!isPositionIndependent() && !ABI.IsN64()) {
+ if (!isPositionIndependent()) {
const MipsTargetObjectFile *TLOF =
static_cast<const MipsTargetObjectFile *>(
getTargetMachine().getObjFileLowering());
@@ -1892,10 +1996,11 @@ lowerConstantPool(SDValue Op, SelectionDAG &DAG) const
// %gp_rel relocation
return getAddrGPRel(N, SDLoc(N), Ty, DAG);
- return getAddrNonPIC(N, SDLoc(N), Ty, DAG);
+ return Subtarget.hasSym32() ? getAddrNonPIC(N, SDLoc(N), Ty, DAG)
+ : getAddrNonPICSym64(N, SDLoc(N), Ty, DAG);
}
- return getAddrLocal(N, SDLoc(N), Ty, DAG, ABI.IsN32() || ABI.IsN64());
+ return getAddrLocal(N, SDLoc(N), Ty, DAG, ABI.IsN32() || ABI.IsN64());
}
SDValue MipsTargetLowering::lowerVASTART(SDValue Op, SelectionDAG &DAG) const {
@@ -2796,14 +2901,13 @@ MipsTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
// If the callee is a GlobalAddress/ExternalSymbol node (quite common, every
// direct call is) turn it into a TargetGlobalAddress/TargetExternalSymbol
// node so that legalize doesn't hack it.
- bool IsPICCall = (ABI.IsN64() || IsPIC); // true if calls are translated to
- // jalr $25
+
SDValue CalleeLo;
EVT Ty = Callee.getValueType();
bool GlobalOrExternal = false, IsCallReloc = false;
if (GlobalAddressSDNode *G = dyn_cast<GlobalAddressSDNode>(Callee)) {
- if (IsPICCall) {
+ if (IsPIC) {
const GlobalValue *Val = G->getGlobal();
InternalLinkage = Val->hasInternalLinkage();
@@ -2828,7 +2932,7 @@ MipsTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
else if (ExternalSymbolSDNode *S = dyn_cast<ExternalSymbolSDNode>(Callee)) {
const char *Sym = S->getSymbol();
- if (!ABI.IsN64() && !IsPIC) // !N64 && static
+ if (!IsPIC) // static
Callee = DAG.getTargetExternalSymbol(
Sym, getPointerTy(DAG.getDataLayout()), MipsII::MO_NO_FLAG);
else if (LargeGOT) {
@@ -2836,7 +2940,7 @@ MipsTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
MipsII::MO_CALL_LO16, Chain,
FuncInfo->callPtrInfo(Sym));
IsCallReloc = true;
- } else { // N64 || PIC
+ } else { // PIC
Callee = getAddrGlobal(S, DL, Ty, DAG, MipsII::MO_GOT_CALL, Chain,
FuncInfo->callPtrInfo(Sym));
IsCallReloc = true;
@@ -2848,7 +2952,7 @@ MipsTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
SmallVector<SDValue, 8> Ops(1, Chain);
SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Glue);
- getOpndList(Ops, RegsToPass, IsPICCall, GlobalOrExternal, InternalLinkage,
+ getOpndList(Ops, RegsToPass, IsPIC, GlobalOrExternal, InternalLinkage,
IsCallReloc, CLI, Callee, Chain);
if (IsTailCall) {
@@ -3683,7 +3787,9 @@ bool MipsTargetLowering::isFPImmLegal(const APFloat &Imm, EVT VT) const {
}
unsigned MipsTargetLowering::getJumpTableEncoding() const {
- if (ABI.IsN64())
+
+ // FIXME: For space reasons this should be: EK_GPRel32BlockAddress.
+ if (ABI.IsN64() && isPositionIndependent())
return MachineJumpTableInfo::EK_GPRel64BlockAddress;
return TargetLowering::getJumpTableEncoding();
diff --git a/lib/Target/Mips/MipsISelLowering.h b/lib/Target/Mips/MipsISelLowering.h
index cddf0903ca6a..2dcafd51061a 100644
--- a/lib/Target/Mips/MipsISelLowering.h
+++ b/lib/Target/Mips/MipsISelLowering.h
@@ -37,14 +37,23 @@ namespace llvm {
// Tail call
TailCall,
- // Get the Higher 16 bits from a 32-bit immediate
+ // Get the Highest (63-48) 16 bits from a 64-bit immediate
+ Highest,
+
+ // Get the Higher (47-32) 16 bits from a 64-bit immediate
+ Higher,
+
+ // Get the High 16 bits from a 32/64-bit immediate
// No relation with Mips Hi register
Hi,
- // Get the Lower 16 bits from a 32-bit immediate
+ // Get the Lower 16 bits from a 32/64-bit immediate
// No relation with Mips Lo register
Lo,
+ // Get the High 16 bits from a 32 bit immediate for accessing the GOT.
+ GotHi,
+
// Handle gp_rel (small data/bss sections) relocation.
GPRel,
@@ -107,6 +116,7 @@ namespace llvm {
Ext,
Ins,
+ CIns,
// EXTR.W instrinsic nodes.
EXTP,
@@ -297,7 +307,7 @@ namespace llvm {
}
bool isJumpTableRelative() const override {
- return getTargetMachine().isPositionIndependent() || ABI.IsN64();
+ return getTargetMachine().isPositionIndependent();
}
protected:
@@ -344,8 +354,8 @@ namespace llvm {
SelectionDAG &DAG, unsigned HiFlag,
unsigned LoFlag, SDValue Chain,
const MachinePointerInfo &PtrInfo) const {
- SDValue Hi =
- DAG.getNode(MipsISD::Hi, DL, Ty, getTargetNode(N, Ty, DAG, HiFlag));
+ SDValue Hi = DAG.getNode(MipsISD::GotHi, DL, Ty,
+ getTargetNode(N, Ty, DAG, HiFlag));
Hi = DAG.getNode(ISD::ADD, DL, Ty, Hi, getGlobalReg(DAG, Ty));
SDValue Wrapper = DAG.getNode(MipsISD::Wrapper, DL, Ty, Hi,
getTargetNode(N, Ty, DAG, LoFlag));
@@ -356,6 +366,8 @@ namespace llvm {
// computing a symbol's address in non-PIC mode:
//
// (add %hi(sym), %lo(sym))
+ //
+ // This method covers O32, N32 and N64 in sym32 mode.
template <class NodeTy>
SDValue getAddrNonPIC(NodeTy *N, const SDLoc &DL, EVT Ty,
SelectionDAG &DAG) const {
@@ -364,7 +376,37 @@ namespace llvm {
return DAG.getNode(ISD::ADD, DL, Ty,
DAG.getNode(MipsISD::Hi, DL, Ty, Hi),
DAG.getNode(MipsISD::Lo, DL, Ty, Lo));
- }
+ }
+
+ // This method creates the following nodes, which are necessary for
+ // computing a symbol's address in non-PIC mode for N64.
+ //
+  // (add (shl (add (shl (add %highest(sym), %higher(sym)), 16), %hi(sym)),
+  //  16), %lo(sym))
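+  //
+  // This typically materializes as the sequence (illustrative):
+  //
+  //   lui    $reg, %highest(sym)
+  //   daddiu $reg, $reg, %higher(sym)
+  //   dsll   $reg, $reg, 16
+  //   daddiu $reg, $reg, %hi(sym)
+  //   dsll   $reg, $reg, 16
+  //   daddiu $reg, $reg, %lo(sym)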
+ //
+  // FIXME: This method is not efficient for (micro)MIPS64R6.
+ template <class NodeTy>
+ SDValue getAddrNonPICSym64(NodeTy *N, const SDLoc &DL, EVT Ty,
+ SelectionDAG &DAG) const {
+ SDValue Hi = getTargetNode(N, Ty, DAG, MipsII::MO_ABS_HI);
+ SDValue Lo = getTargetNode(N, Ty, DAG, MipsII::MO_ABS_LO);
+
+ SDValue Highest =
+ DAG.getNode(MipsISD::Highest, DL, Ty,
+ getTargetNode(N, Ty, DAG, MipsII::MO_HIGHEST));
+ SDValue Higher = getTargetNode(N, Ty, DAG, MipsII::MO_HIGHER);
+ SDValue HigherPart =
+ DAG.getNode(ISD::ADD, DL, Ty, Highest,
+ DAG.getNode(MipsISD::Higher, DL, Ty, Higher));
+ SDValue Cst = DAG.getConstant(16, DL, MVT::i32);
+ SDValue Shift = DAG.getNode(ISD::SHL, DL, Ty, HigherPart, Cst);
+ SDValue Add = DAG.getNode(ISD::ADD, DL, Ty, Shift,
+ DAG.getNode(MipsISD::Hi, DL, Ty, Hi));
+ SDValue Shift2 = DAG.getNode(ISD::SHL, DL, Ty, Add, Cst);
+
+ return DAG.getNode(ISD::ADD, DL, Ty, Shift2,
+ DAG.getNode(MipsISD::Lo, DL, Ty, Lo));
+ }
// This method creates the following nodes, which are necessary for
// computing a symbol's address using gp-relative addressing:
diff --git a/lib/Target/Mips/MipsInstrInfo.cpp b/lib/Target/Mips/MipsInstrInfo.cpp
index 19af1914c819..df62c66b75a3 100644
--- a/lib/Target/Mips/MipsInstrInfo.cpp
+++ b/lib/Target/Mips/MipsInstrInfo.cpp
@@ -482,7 +482,7 @@ MipsInstrInfo::genInstrWithNewOpc(unsigned NewOpc,
MIB->RemoveOperand(0);
for (unsigned J = 0, E = I->getDesc().getNumOperands(); J < E; ++J) {
- MIB.addOperand(I->getOperand(J));
+ MIB.add(I->getOperand(J));
}
MIB.addImm(0);
@@ -492,7 +492,7 @@ MipsInstrInfo::genInstrWithNewOpc(unsigned NewOpc,
if (BranchWithZeroOperand && (unsigned)ZeroOperandPosition == J)
continue;
- MIB.addOperand(I->getOperand(J));
+ MIB.add(I->getOperand(J));
}
}
@@ -501,3 +501,31 @@ MipsInstrInfo::genInstrWithNewOpc(unsigned NewOpc,
MIB.setMemRefs(I->memoperands_begin(), I->memoperands_end());
return MIB;
}
+
+bool MipsInstrInfo::findCommutedOpIndices(MachineInstr &MI, unsigned &SrcOpIdx1,
+ unsigned &SrcOpIdx2) const {
+ assert(!MI.isBundle() &&
+ "TargetInstrInfo::findCommutedOpIndices() can't handle bundles");
+
+ const MCInstrDesc &MCID = MI.getDesc();
+ if (!MCID.isCommutable())
+ return false;
+
+ switch (MI.getOpcode()) {
+ case Mips::DPADD_U_H:
+ case Mips::DPADD_U_W:
+ case Mips::DPADD_U_D:
+ case Mips::DPADD_S_H:
+ case Mips::DPADD_S_W:
+ case Mips::DPADD_S_D: {
+ // The first operand is both input and output, so it should not commute
+ if (!fixCommutedOpIndices(SrcOpIdx1, SrcOpIdx2, 2, 3))
+ return false;
+
+ if (!MI.getOperand(SrcOpIdx1).isReg() || !MI.getOperand(SrcOpIdx2).isReg())
+ return false;
+ return true;
+ }
+ }
+ return TargetInstrInfo::findCommutedOpIndices(MI, SrcOpIdx1, SrcOpIdx2);
+}
diff --git a/lib/Target/Mips/MipsInstrInfo.h b/lib/Target/Mips/MipsInstrInfo.h
index 347b9187d08c..45d700d8afd6 100644
--- a/lib/Target/Mips/MipsInstrInfo.h
+++ b/lib/Target/Mips/MipsInstrInfo.h
@@ -135,6 +135,9 @@ public:
MachineInstrBuilder genInstrWithNewOpc(unsigned NewOpc,
MachineBasicBlock::iterator I) const;
+ bool findCommutedOpIndices(MachineInstr &MI, unsigned &SrcOpIdx1,
+ unsigned &SrcOpIdx2) const override;
+
protected:
bool isZeroImm(const MachineOperand &op) const;
diff --git a/lib/Target/Mips/MipsInstrInfo.td b/lib/Target/Mips/MipsInstrInfo.td
index 5bc48336121a..b90077d7807d 100644
--- a/lib/Target/Mips/MipsInstrInfo.td
+++ b/lib/Target/Mips/MipsInstrInfo.td
@@ -59,10 +59,20 @@ def MipsTailCall : SDNode<"MipsISD::TailCall", SDT_MipsJmpLink,
// Hi and Lo nodes are used to handle global addresses. Used on
// MipsISelLowering to lower stuff like GlobalAddress, ExternalSymbol
// static model. (nothing to do with Mips Registers Hi and Lo)
+
+// Hi is the odd one out: on MIPS64 it can expand to either daddiu (when using
+// static relocations with 64-bit symbols) or lui (when using 32-bit symbols).
+def MipsHigher : SDNode<"MipsISD::Higher", SDTIntUnaryOp>;
+def MipsHighest : SDNode<"MipsISD::Highest", SDTIntUnaryOp>;
def MipsHi : SDNode<"MipsISD::Hi", SDTIntUnaryOp>;
def MipsLo : SDNode<"MipsISD::Lo", SDTIntUnaryOp>;
+
def MipsGPRel : SDNode<"MipsISD::GPRel", SDTIntUnaryOp>;
+// Hi node for accessing the GOT.
+def MipsGotHi : SDNode<"MipsISD::GotHi", SDTIntUnaryOp>;
+
// TlsGd node is used to handle General Dynamic TLS
def MipsTlsGd : SDNode<"MipsISD::TlsGd", SDTIntUnaryOp>;
@@ -128,6 +138,7 @@ def MipsSync : SDNode<"MipsISD::Sync", SDT_Sync, [SDNPHasChain,SDNPSideEffect]>;
def MipsExt : SDNode<"MipsISD::Ext", SDT_Ext>;
def MipsIns : SDNode<"MipsISD::Ins", SDT_Ins>;
+def MipsCIns : SDNode<"MipsISD::CIns", SDT_Ext>;
def MipsLWL : SDNode<"MipsISD::LWL", SDTMipsLoadLR,
[SDNPHasChain, SDNPMayLoad, SDNPMemOperand]>;
@@ -205,6 +216,10 @@ def HasCnMips : Predicate<"Subtarget->hasCnMips()">,
AssemblerPredicate<"FeatureCnMips">;
def NotCnMips : Predicate<"!Subtarget->hasCnMips()">,
AssemblerPredicate<"!FeatureCnMips">;
+def IsSym32 : Predicate<"Subtarget->hasSym32()">,
+              AssemblerPredicate<"FeatureSym32">;
+def IsSym64 : Predicate<"!Subtarget->hasSym32()">,
+              AssemblerPredicate<"!FeatureSym32">;
def RelocNotPIC : Predicate<"!TM.isPositionIndependent()">;
def RelocPIC : Predicate<"TM.isPositionIndependent()">;
def NoNaNsFPMath : Predicate<"TM.Options.NoNaNsFPMath">;
@@ -237,6 +252,14 @@ class PTR_32 { list<Predicate> PTRPredicates = [IsPTR32bit]; }
class PTR_64 { list<Predicate> PTRPredicates = [IsPTR64bit]; }
//===----------------------------------------------------------------------===//
+// Mips symbol size adjectives.
+// They are mutually exclusive.
+//===----------------------------------------------------------------------===//
+
+class SYM_32 { list<Predicate> SYMPredicates = [IsSym32]; }
+class SYM_64 { list<Predicate> SYMPredicates = [IsSym64]; }
+
+//===----------------------------------------------------------------------===//
// Mips ISA/ASE membership and instruction group membership adjectives.
// They are mutually exclusive.
//===----------------------------------------------------------------------===//
@@ -519,7 +542,7 @@ def UImm32CoercedAsmOperandClass : UImmAnyAsmOperandClass<33, []> {
def SImm32RelaxedAsmOperandClass
: SImmAsmOperandClass<32, [UImm32CoercedAsmOperandClass]> {
let Name = "SImm32_Relaxed";
- let PredicateMethod = "isAnyImm<32>";
+ let PredicateMethod = "isAnyImm<33>";
let DiagnosticType = "SImm32_Relaxed";
}
def SImm32AsmOperandClass
@@ -1150,6 +1173,10 @@ def immZExt5Plus33 : PatLeaf<(imm), [{
return isUInt<5>(N->getZExtValue() - 33);
}]>;
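+// Transform an immediate imm into 31 - imm (e.g. 8 becomes 23).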
+def immZExt5To31 : SDNodeXForm<imm, [{
+ return getImm(N, 31 - N->getZExtValue());
+}]>;
+
// True if (N + 1) fits in 16-bit field.
def immSExt16Plus1 : PatLeaf<(imm), [{
return isInt<17>(N->getSExtValue()) && isInt<16>(N->getSExtValue() + 1);
@@ -2281,9 +2308,38 @@ def SEQIMacro : MipsAsmPseudoInst<(outs GPR32Opnd:$rd),
def : MipsInstAlias<"seq $rd, $imm",
(SEQIMacro GPR32Opnd:$rd, GPR32Opnd:$rd, simm32:$imm), 0>,
NOT_ASE_CNMIPS;
+
+def MULImmMacro : MipsAsmPseudoInst<(outs), (ins GPR32Opnd:$rd, GPR32Opnd:$rs,
+ simm32_relaxed:$imm),
+ "mul\t$rd, $rs, $imm">,
+ ISA_MIPS1_NOT_32R6_64R6;
+def MULOMacro : MipsAsmPseudoInst<(outs), (ins GPR32Opnd:$rd, GPR32Opnd:$rs,
+ GPR32Opnd:$rt),
+ "mulo\t$rd, $rs, $rt">,
+ ISA_MIPS1_NOT_32R6_64R6;
+def MULOUMacro : MipsAsmPseudoInst<(outs), (ins GPR32Opnd:$rd, GPR32Opnd:$rs,
+ GPR32Opnd:$rt),
+ "mulou\t$rd, $rs, $rt">,
+ ISA_MIPS1_NOT_32R6_64R6;
+
//===----------------------------------------------------------------------===//
// Instruction aliases
//===----------------------------------------------------------------------===//
+
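+// Expands both the three-operand "op $rs, $rt, $imm" form and the two-operand
+// "op $rs, $imm" shorthand of a macro immediate to the given instruction.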
+multiclass OneOrTwoOperandMacroImmediateAlias<string Mnemonic,
+                                              Instruction Opcode,
+                                              RegisterOperand RO = GPR32Opnd,
+                                              Operand Imm = simm32_relaxed> {
+  def : MipsInstAlias<!strconcat(Mnemonic, " $rs, $rt, $imm"),
+                      (Opcode RO:$rs,
+                              RO:$rt,
+                              Imm:$imm), 0>;
+  def : MipsInstAlias<!strconcat(Mnemonic, " $rs, $imm"),
+                      (Opcode RO:$rs,
+                              RO:$rs,
+                              Imm:$imm), 0>;
+}
+
def : MipsInstAlias<"move $dst, $src",
(OR GPR32Opnd:$dst, GPR32Opnd:$src, ZERO), 1>,
GPR_32 {
@@ -2296,26 +2352,7 @@ def : MipsInstAlias<"move $dst, $src",
}
def : MipsInstAlias<"bal $offset", (BGEZAL ZERO, brtarget:$offset), 0>,
ISA_MIPS1_NOT_32R6_64R6;
-def : MipsInstAlias<
- "addu $rs, $rt, $imm",
- (ADDiu GPR32Opnd:$rs, GPR32Opnd:$rt, simm32_relaxed:$imm), 0>;
-def : MipsInstAlias<
- "addu $rs, $imm",
- (ADDiu GPR32Opnd:$rs, GPR32Opnd:$rs, simm32_relaxed:$imm), 0>;
-def : MipsInstAlias<
- "add $rs, $rt, $imm",
- (ADDi GPR32Opnd:$rs, GPR32Opnd:$rt, simm32_relaxed:$imm), 0>,
- ISA_MIPS1_NOT_32R6_64R6;
-def : MipsInstAlias<
- "add $rs, $imm",
- (ADDi GPR32Opnd:$rs, GPR32Opnd:$rs, simm32_relaxed:$imm), 0>,
- ISA_MIPS1_NOT_32R6_64R6;
-def : MipsInstAlias<
- "and $rs, $rt, $imm",
- (ANDi GPR32Opnd:$rs, GPR32Opnd:$rt, simm32_relaxed:$imm), 0>;
-def : MipsInstAlias<
- "and $rs, $imm",
- (ANDi GPR32Opnd:$rs, GPR32Opnd:$rs, simm32_relaxed:$imm), 0>;
+
def : MipsInstAlias<"j $rs", (JR GPR32Opnd:$rs), 0>;
let Predicates = [NotInMicroMips] in {
def : MipsInstAlias<"jalr $rs", (JALR RA, GPR32Opnd:$rs), 0>;
@@ -2343,36 +2380,26 @@ let AdditionalPredicates = [NotInMicroMips] in {
"sgtu $$rs, $rt",
(SLTu GPR32Opnd:$rs, GPR32Opnd:$rt, GPR32Opnd:$rs), 0>;
def : MipsInstAlias<
- "slt $rs, $rt, $imm",
- (SLTi GPR32Opnd:$rs, GPR32Opnd:$rt, simm32_relaxed:$imm), 0>;
- def : MipsInstAlias<
- "sltu $rt, $rs, $imm",
- (SLTiu GPR32Opnd:$rt, GPR32Opnd:$rs, simm32_relaxed:$imm), 0>;
- def : MipsInstAlias<
- "and $rs, $rt, $imm",
- (ANDi GPR32Opnd:$rs, GPR32Opnd:$rt, simm32_relaxed:$imm), 0>;
- def : MipsInstAlias<
- "and $rs, $imm",
- (ANDi GPR32Opnd:$rs, GPR32Opnd:$rs, simm32_relaxed:$imm), 0>;
- def : MipsInstAlias<
- "xor $rs, $rt, $imm",
- (XORi GPR32Opnd:$rs, GPR32Opnd:$rt, simm32_relaxed:$imm), 0>;
- def : MipsInstAlias<
- "xor $rs, $imm",
- (XORi GPR32Opnd:$rs, GPR32Opnd:$rs, simm32_relaxed:$imm), 0>;
- def : MipsInstAlias<
- "or $rs, $rt, $imm",
- (ORi GPR32Opnd:$rs, GPR32Opnd:$rt, simm32_relaxed:$imm), 0>;
- def : MipsInstAlias<
- "or $rs, $imm",
- (ORi GPR32Opnd:$rs, GPR32Opnd:$rs, simm32_relaxed:$imm), 0>;
- def : MipsInstAlias<
"not $rt, $rs",
(NOR GPR32Opnd:$rt, GPR32Opnd:$rs, ZERO), 0>;
def : MipsInstAlias<
"not $rt",
(NOR GPR32Opnd:$rt, GPR32Opnd:$rt, ZERO), 0>;
def : MipsInstAlias<"nop", (SLL ZERO, ZERO, 0), 1>;
+
+ defm : OneOrTwoOperandMacroImmediateAlias<"add", ADDi>, ISA_MIPS1_NOT_32R6_64R6;
+
+ defm : OneOrTwoOperandMacroImmediateAlias<"addu", ADDiu>;
+
+ defm : OneOrTwoOperandMacroImmediateAlias<"and", ANDi>, GPR_32;
+
+ defm : OneOrTwoOperandMacroImmediateAlias<"or", ORi>, GPR_32;
+
+ defm : OneOrTwoOperandMacroImmediateAlias<"xor", XORi>, GPR_32;
+
+ defm : OneOrTwoOperandMacroImmediateAlias<"slt", SLTi>, GPR_32;
+
+ defm : OneOrTwoOperandMacroImmediateAlias<"sltu", SLTiu>, GPR_32;
}
def : MipsInstAlias<"mfc0 $rt, $rd", (MFC0 GPR32Opnd:$rt, COP0Opnd:$rd, 0), 0>;
def : MipsInstAlias<"mtc0 $rt, $rd", (MTC0 COP0Opnd:$rd, GPR32Opnd:$rt, 0), 0>;
@@ -2445,6 +2472,14 @@ let AdditionalPredicates = [NotInMicroMips] in {
def : MipsInstAlias<"sdbbp", (SDBBP 0)>, ISA_MIPS32_NOT_32R6_64R6;
def : MipsInstAlias<"sync",
(SYNC 0), 1>, ISA_MIPS2;
+
+def : MipsInstAlias<"mulo $rs, $rt",
+ (MULOMacro GPR32Opnd:$rs, GPR32Opnd:$rs, GPR32Opnd:$rt), 0>,
+ ISA_MIPS1_NOT_32R6_64R6;
+def : MipsInstAlias<"mulou $rs, $rt",
+ (MULOUMacro GPR32Opnd:$rs, GPR32Opnd:$rs, GPR32Opnd:$rt), 0>,
+ ISA_MIPS1_NOT_32R6_64R6;
+
//===----------------------------------------------------------------------===//
// Assembler Pseudo Instructions
//===----------------------------------------------------------------------===//
@@ -2472,9 +2507,12 @@ def JalTwoReg : MipsAsmPseudoInst<(outs GPR32Opnd:$rd), (ins GPR32Opnd:$rs),
def JalOneReg : MipsAsmPseudoInst<(outs), (ins GPR32Opnd:$rs),
"jal\t$rs"> ;
-def NORImm : MipsAsmPseudoInst<
- (outs), (ins GPR32Opnd:$rs, GPR32Opnd:$rt, simm32:$imm),
- "nor\t$rs, $rt, $imm"> ;
+class NORIMM_DESC_BASE<RegisterOperand RO, DAGOperand Imm> :
+ MipsAsmPseudoInst<(outs RO:$rs), (ins RO:$rt, Imm:$imm),
+ "nor\t$rs, $rt, $imm">;
+def NORImm : NORIMM_DESC_BASE<GPR32Opnd, simm32_relaxed>, GPR_32;
+def : MipsInstAlias<"nor\t$rs, $imm", (NORImm GPR32Opnd:$rs, GPR32Opnd:$rs,
+ simm32_relaxed:$imm)>, GPR_32;
let hasDelaySlot = 1, isCTI = 1 in {
def BneImm : MipsAsmPseudoInst<(outs GPR32Opnd:$rt),
@@ -2512,6 +2550,9 @@ class CondBranchImmPseudo<string instr_asm> :
MipsAsmPseudoInst<(outs), (ins GPR32Opnd:$rs, imm64:$imm, brtarget:$offset),
!strconcat(instr_asm, "\t$rs, $imm, $offset")>;
+def BEQLImmMacro : CondBranchImmPseudo<"beql">, ISA_MIPS2_NOT_32R6_64R6;
+def BNELImmMacro : CondBranchImmPseudo<"bnel">, ISA_MIPS2_NOT_32R6_64R6;
+
def BLTImmMacro : CondBranchImmPseudo<"blt">;
def BLEImmMacro : CondBranchImmPseudo<"ble">;
def BGEImmMacro : CondBranchImmPseudo<"bge">;
@@ -2535,34 +2576,46 @@ def BGTULImmMacro : CondBranchImmPseudo<"bgtul">, ISA_MIPS2_NOT_32R6_64R6;
// Once the tablegen-erated errors are made better, this needs to be fixed and
// predicates needs to be restored.
-def SDivMacro : MipsAsmPseudoInst<(outs GPR32Opnd:$rd),
+def SDivMacro : MipsAsmPseudoInst<(outs GPR32NonZeroOpnd:$rd),
(ins GPR32Opnd:$rs, GPR32Opnd:$rt),
"div\t$rd, $rs, $rt">,
ISA_MIPS1_NOT_32R6_64R6;
+def SDivIMacro : MipsAsmPseudoInst<(outs GPR32Opnd:$rd),
+ (ins GPR32Opnd:$rs, simm32:$imm),
+ "div\t$rd, $rs, $imm">,
+ ISA_MIPS1_NOT_32R6_64R6;
def UDivMacro : MipsAsmPseudoInst<(outs GPR32Opnd:$rd),
(ins GPR32Opnd:$rs, GPR32Opnd:$rt),
"divu\t$rd, $rs, $rt">,
ISA_MIPS1_NOT_32R6_64R6;
-def : MipsInstAlias<"div $rt, $rs", (SDivMacro GPR32Opnd:$rt, GPR32Opnd:$rt,
- GPR32Opnd:$rs), 0>,
+def UDivIMacro : MipsAsmPseudoInst<(outs GPR32Opnd:$rd),
+ (ins GPR32Opnd:$rs, simm32:$imm),
+ "divu\t$rd, $rs, $imm">,
+ ISA_MIPS1_NOT_32R6_64R6;
+
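+// When the first operand is $zero the two-operand form selects the bare
+// divide instruction (results in HI/LO); otherwise it expands to the div
+// macro.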
+def : MipsInstAlias<"div $rs, $rt", (SDIV GPR32ZeroOpnd:$rs,
+ GPR32Opnd:$rt), 0>,
+ ISA_MIPS1_NOT_32R6_64R6;
+def : MipsInstAlias<"div $rs, $rt", (SDivMacro GPR32NonZeroOpnd:$rs,
+ GPR32NonZeroOpnd:$rs,
+ GPR32Opnd:$rt), 0>,
+ ISA_MIPS1_NOT_32R6_64R6;
+def : MipsInstAlias<"div $rd, $imm", (SDivIMacro GPR32Opnd:$rd, GPR32Opnd:$rd,
+ simm32:$imm), 0>,
+ ISA_MIPS1_NOT_32R6_64R6;
+
+def : MipsInstAlias<"divu $rt, $rs", (UDIV GPR32ZeroOpnd:$rt,
+ GPR32Opnd:$rs), 0>,
ISA_MIPS1_NOT_32R6_64R6;
-def : MipsInstAlias<"divu $rt, $rs", (UDivMacro GPR32Opnd:$rt, GPR32Opnd:$rt,
+def : MipsInstAlias<"divu $rt, $rs", (UDivMacro GPR32NonZeroOpnd:$rt,
+ GPR32NonZeroOpnd:$rt,
GPR32Opnd:$rs), 0>,
ISA_MIPS1_NOT_32R6_64R6;
-def DSDivMacro : MipsAsmPseudoInst<(outs GPR32Opnd:$rd),
- (ins GPR32Opnd:$rs, GPR32Opnd:$rt),
- "ddiv\t$rd, $rs, $rt">,
- ISA_MIPS64_NOT_64R6;
-def DUDivMacro : MipsAsmPseudoInst<(outs GPR32Opnd:$rd),
- (ins GPR32Opnd:$rs, GPR32Opnd:$rt),
- "ddivu\t$rd, $rs, $rt">,
- ISA_MIPS64_NOT_64R6;
-def : MipsInstAlias<"ddiv $rt, $rs", (DSDivMacro GPR32Opnd:$rt, GPR32Opnd:$rt,
- GPR32Opnd:$rs), 0>,
- ISA_MIPS64_NOT_64R6;
-def : MipsInstAlias<"ddivu $rt, $rs", (DUDivMacro GPR32Opnd:$rt, GPR32Opnd:$rt,
- GPR32Opnd:$rs), 0>,
- ISA_MIPS64_NOT_64R6;
+
+def : MipsInstAlias<"divu $rd, $imm", (UDivIMacro GPR32Opnd:$rd, GPR32Opnd:$rd,
+ simm32:$imm), 0>,
+ ISA_MIPS1_NOT_32R6_64R6;
def Ulh : MipsAsmPseudoInst<(outs GPR32Opnd:$rt), (ins mem:$addr),
"ulh\t$rt, $addr">; //, ISA_MIPS1_NOT_32R6_64R6;
@@ -2647,30 +2700,40 @@ def : MipsPat<(MipsTailCall (iPTR tglobaladdr:$dst)),
def : MipsPat<(MipsTailCall (iPTR texternalsym:$dst)),
(TAILCALL texternalsym:$dst)>;
// hi/lo relocs
-def : MipsPat<(MipsHi tglobaladdr:$in), (LUi tglobaladdr:$in)>;
-def : MipsPat<(MipsHi tblockaddress:$in), (LUi tblockaddress:$in)>;
-def : MipsPat<(MipsHi tjumptable:$in), (LUi tjumptable:$in)>;
-def : MipsPat<(MipsHi tconstpool:$in), (LUi tconstpool:$in)>;
-def : MipsPat<(MipsHi tglobaltlsaddr:$in), (LUi tglobaltlsaddr:$in)>;
-def : MipsPat<(MipsHi texternalsym:$in), (LUi texternalsym:$in)>;
-
-def : MipsPat<(MipsLo tglobaladdr:$in), (ADDiu ZERO, tglobaladdr:$in)>;
-def : MipsPat<(MipsLo tblockaddress:$in), (ADDiu ZERO, tblockaddress:$in)>;
-def : MipsPat<(MipsLo tjumptable:$in), (ADDiu ZERO, tjumptable:$in)>;
-def : MipsPat<(MipsLo tconstpool:$in), (ADDiu ZERO, tconstpool:$in)>;
-def : MipsPat<(MipsLo tglobaltlsaddr:$in), (ADDiu ZERO, tglobaltlsaddr:$in)>;
-def : MipsPat<(MipsLo texternalsym:$in), (ADDiu ZERO, texternalsym:$in)>;
-
-def : MipsPat<(add GPR32:$hi, (MipsLo tglobaladdr:$lo)),
- (ADDiu GPR32:$hi, tglobaladdr:$lo)>;
-def : MipsPat<(add GPR32:$hi, (MipsLo tblockaddress:$lo)),
- (ADDiu GPR32:$hi, tblockaddress:$lo)>;
-def : MipsPat<(add GPR32:$hi, (MipsLo tjumptable:$lo)),
- (ADDiu GPR32:$hi, tjumptable:$lo)>;
-def : MipsPat<(add GPR32:$hi, (MipsLo tconstpool:$lo)),
- (ADDiu GPR32:$hi, tconstpool:$lo)>;
-def : MipsPat<(add GPR32:$hi, (MipsLo tglobaltlsaddr:$lo)),
- (ADDiu GPR32:$hi, tglobaltlsaddr:$lo)>;
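+// The %hi/%lo patterns below are parameterized over the lui/addiu opcodes and
+// the zero register so that 64-bit variants can reuse them.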
+multiclass MipsHiLoRelocs<Instruction Lui, Instruction Addiu,
+ Register ZeroReg, RegisterOperand GPROpnd> {
+ def : MipsPat<(MipsHi tglobaladdr:$in), (Lui tglobaladdr:$in)>;
+ def : MipsPat<(MipsHi tblockaddress:$in), (Lui tblockaddress:$in)>;
+ def : MipsPat<(MipsHi tjumptable:$in), (Lui tjumptable:$in)>;
+ def : MipsPat<(MipsHi tconstpool:$in), (Lui tconstpool:$in)>;
+ def : MipsPat<(MipsHi tglobaltlsaddr:$in), (Lui tglobaltlsaddr:$in)>;
+ def : MipsPat<(MipsHi texternalsym:$in), (Lui texternalsym:$in)>;
+
+ def : MipsPat<(MipsLo tglobaladdr:$in), (Addiu ZeroReg, tglobaladdr:$in)>;
+ def : MipsPat<(MipsLo tblockaddress:$in),
+ (Addiu ZeroReg, tblockaddress:$in)>;
+ def : MipsPat<(MipsLo tjumptable:$in), (Addiu ZeroReg, tjumptable:$in)>;
+ def : MipsPat<(MipsLo tconstpool:$in), (Addiu ZeroReg, tconstpool:$in)>;
+ def : MipsPat<(MipsLo tglobaltlsaddr:$in),
+ (Addiu ZeroReg, tglobaltlsaddr:$in)>;
+ def : MipsPat<(MipsLo texternalsym:$in), (Addiu ZeroReg, texternalsym:$in)>;
+
+ def : MipsPat<(add GPROpnd:$hi, (MipsLo tglobaladdr:$lo)),
+ (Addiu GPROpnd:$hi, tglobaladdr:$lo)>;
+ def : MipsPat<(add GPROpnd:$hi, (MipsLo tblockaddress:$lo)),
+ (Addiu GPROpnd:$hi, tblockaddress:$lo)>;
+ def : MipsPat<(add GPROpnd:$hi, (MipsLo tjumptable:$lo)),
+ (Addiu GPROpnd:$hi, tjumptable:$lo)>;
+ def : MipsPat<(add GPROpnd:$hi, (MipsLo tconstpool:$lo)),
+ (Addiu GPROpnd:$hi, tconstpool:$lo)>;
+ def : MipsPat<(add GPROpnd:$hi, (MipsLo tglobaltlsaddr:$lo)),
+ (Addiu GPROpnd:$hi, tglobaltlsaddr:$lo)>;
+}
+
+defm : MipsHiLoRelocs<LUi, ADDiu, ZERO, GPR32Opnd>;
+
+def : MipsPat<(MipsGotHi tglobaladdr:$in), (LUi tglobaladdr:$in)>;
+def : MipsPat<(MipsGotHi texternalsym:$in), (LUi texternalsym:$in)>;
// gp_rel relocs
def : MipsPat<(add GPR32:$gp, (MipsGPRel tglobaladdr:$in)),
diff --git a/lib/Target/Mips/MipsLongBranch.cpp b/lib/Target/Mips/MipsLongBranch.cpp
index 1087d0e0140e..100503700a72 100644
--- a/lib/Target/Mips/MipsLongBranch.cpp
+++ b/lib/Target/Mips/MipsLongBranch.cpp
@@ -13,20 +13,31 @@
// FIXME: Fix pc-region jump instructions which cross 256MB segment boundaries.
//===----------------------------------------------------------------------===//
-#include "Mips.h"
+#include "MCTargetDesc/MipsABIInfo.h"
#include "MCTargetDesc/MipsBaseInfo.h"
#include "MCTargetDesc/MipsMCNaCl.h"
+#include "Mips.h"
+#include "MipsInstrInfo.h"
#include "MipsMachineFunction.h"
+#include "MipsSubtarget.h"
#include "MipsTargetMachine.h"
+#include "llvm/ADT/SmallVector.h"
#include "llvm/ADT/Statistic.h"
+#include "llvm/ADT/StringRef.h"
+#include "llvm/CodeGen/MachineBasicBlock.h"
+#include "llvm/CodeGen/MachineFunction.h"
#include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/CodeGen/MachineInstr.h"
#include "llvm/CodeGen/MachineInstrBuilder.h"
-#include "llvm/IR/Function.h"
+#include "llvm/CodeGen/MachineOperand.h"
+#include "llvm/IR/DebugLoc.h"
#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/ErrorHandling.h"
#include "llvm/Support/MathExtras.h"
-#include "llvm/Target/TargetInstrInfo.h"
#include "llvm/Target/TargetMachine.h"
-#include "llvm/Target/TargetRegisterInfo.h"
+#include <cassert>
+#include <cstdint>
+#include <iterator>
using namespace llvm;
@@ -47,21 +58,23 @@ static cl::opt<bool> ForceLongBranch(
cl::Hidden);
namespace {
+
typedef MachineBasicBlock::iterator Iter;
typedef MachineBasicBlock::reverse_iterator ReverseIter;
struct MBBInfo {
- uint64_t Size, Address;
- bool HasLongBranch;
- MachineInstr *Br;
+ uint64_t Size = 0;
+ uint64_t Address;
+ bool HasLongBranch = false;
+ MachineInstr *Br = nullptr;
- MBBInfo() : Size(0), HasLongBranch(false), Br(nullptr) {}
+ MBBInfo() = default;
};
class MipsLongBranch : public MachineFunctionPass {
-
public:
static char ID;
+
MipsLongBranch(TargetMachine &tm)
: MachineFunctionPass(ID), TM(tm), IsPIC(TM.isPositionIndependent()),
ABI(static_cast<const MipsTargetMachine &>(TM).getABI()) {}
@@ -92,13 +105,8 @@ namespace {
};
char MipsLongBranch::ID = 0;
-} // end of anonymous namespace
-/// createMipsLongBranchPass - Returns a pass that converts branches to long
-/// branches.
-FunctionPass *llvm::createMipsLongBranchPass(MipsTargetMachine &tm) {
- return new MipsLongBranch(tm);
-}
+} // end anonymous namespace
/// Iterate over list of Br's operands and search for a MachineBasicBlock
/// operand.
@@ -530,3 +538,9 @@ bool MipsLongBranch::runOnMachineFunction(MachineFunction &F) {
return true;
}
+
+/// createMipsLongBranchPass - Returns a pass that converts branches to long
+/// branches.
+FunctionPass *llvm::createMipsLongBranchPass(MipsTargetMachine &tm) {
+ return new MipsLongBranch(tm);
+}
diff --git a/lib/Target/Mips/MipsMachineFunction.cpp b/lib/Target/Mips/MipsMachineFunction.cpp
index d0609b15341d..5bf4c958c7b9 100644
--- a/lib/Target/Mips/MipsMachineFunction.cpp
+++ b/lib/Target/Mips/MipsMachineFunction.cpp
@@ -7,16 +7,15 @@
//
//===----------------------------------------------------------------------===//
-#include "MCTargetDesc/MipsBaseInfo.h"
-#include "MipsInstrInfo.h"
+#include "MCTargetDesc/MipsABIInfo.h"
#include "MipsMachineFunction.h"
#include "MipsSubtarget.h"
#include "MipsTargetMachine.h"
-#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/MachineFrameInfo.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
-#include "llvm/IR/Function.h"
+#include "llvm/CodeGen/PseudoSourceValue.h"
#include "llvm/Support/CommandLine.h"
-#include "llvm/Support/raw_ostream.h"
+#include "llvm/Target/TargetRegisterInfo.h"
using namespace llvm;
@@ -24,7 +23,7 @@ static cl::opt<bool>
FixGlobalBaseReg("mips-fix-global-base-reg", cl::Hidden, cl::init(true),
cl::desc("Always use $gp as the global base register."));
-MipsFunctionInfo::~MipsFunctionInfo() {}
+MipsFunctionInfo::~MipsFunctionInfo() = default;
bool MipsFunctionInfo::globalBaseRegSet() const {
return GlobalBaseReg;
@@ -101,4 +100,4 @@ int MipsFunctionInfo::getMoveF64ViaSpillFI(const TargetRegisterClass *RC) {
return MoveF64ViaSpillFI;
}
-void MipsFunctionInfo::anchor() { }
+void MipsFunctionInfo::anchor() {}
diff --git a/lib/Target/Mips/MipsMachineFunction.h b/lib/Target/Mips/MipsMachineFunction.h
index c9e5fddc1932..553a66703b26 100644
--- a/lib/Target/Mips/MipsMachineFunction.h
+++ b/lib/Target/Mips/MipsMachineFunction.h
@@ -1,4 +1,4 @@
-//===-- MipsMachineFunctionInfo.h - Private data used for Mips ----*- C++ -*-=//
+//===- MipsMachineFunctionInfo.h - Private data used for Mips ---*- C++ -*-===//
//
// The LLVM Compiler Infrastructure
//
@@ -15,12 +15,8 @@
#define LLVM_LIB_TARGET_MIPS_MIPSMACHINEFUNCTION_H
#include "Mips16HardFloatInfo.h"
-#include "llvm/CodeGen/MachineFrameInfo.h"
#include "llvm/CodeGen/MachineFunction.h"
#include "llvm/CodeGen/MachineMemOperand.h"
-#include "llvm/CodeGen/PseudoSourceValue.h"
-#include "llvm/Target/TargetFrameLowering.h"
-#include "llvm/Target/TargetMachine.h"
#include <map>
namespace llvm {
@@ -29,12 +25,9 @@ namespace llvm {
/// Mips target-specific information for each MachineFunction.
class MipsFunctionInfo : public MachineFunctionInfo {
public:
- MipsFunctionInfo(MachineFunction &MF)
- : MF(MF), SRetReturnReg(0), GlobalBaseReg(0), VarArgsFrameIndex(0),
- CallsEhReturn(false), IsISR(false), SaveS2(false),
- MoveF64ViaSpillFI(-1) {}
+ MipsFunctionInfo(MachineFunction &MF) : MF(MF) {}
- ~MipsFunctionInfo();
+ ~MipsFunctionInfo() override;
unsigned getSRetReturnReg() const { return SRetReturnReg; }
void setSRetReturnReg(unsigned Reg) { SRetReturnReg = Reg; }
@@ -81,25 +74,26 @@ public:
int getMoveF64ViaSpillFI(const TargetRegisterClass *RC);
- std::map<const char *, const llvm::Mips16HardFloatInfo::FuncSignature *>
+ std::map<const char *, const Mips16HardFloatInfo::FuncSignature *>
StubsNeeded;
private:
virtual void anchor();
MachineFunction& MF;
+
/// SRetReturnReg - Some subtargets require that sret lowering includes
/// returning the value of the returned struct in a register. This field
/// holds the virtual register into which the sret argument is passed.
- unsigned SRetReturnReg;
+ unsigned SRetReturnReg = 0;
/// GlobalBaseReg - keeps track of the virtual register initialized for
/// use as the global base register. This is used for PIC in some PIC
/// relocation models.
- unsigned GlobalBaseReg;
+ unsigned GlobalBaseReg = 0;
/// VarArgsFrameIndex - FrameIndex for start of varargs area.
- int VarArgsFrameIndex;
+ int VarArgsFrameIndex = 0;
/// True if function has a byval argument.
bool HasByvalArg;
@@ -108,25 +102,25 @@ private:
unsigned IncomingArgSize;
/// CallsEhReturn - Whether the function calls llvm.eh.return.
- bool CallsEhReturn;
+ bool CallsEhReturn = false;
/// Frame objects for spilling eh data registers.
int EhDataRegFI[4];
/// ISR - Whether the function is an Interrupt Service Routine.
- bool IsISR;
+ bool IsISR = false;
/// Frame objects for spilling C0_STATUS, C0_EPC
int ISRDataRegFI[2];
// saveS2
- bool SaveS2;
+ bool SaveS2 = false;
/// FrameIndex for expanding BuildPairF64 nodes to spill and reload when the
/// O32 FPXX ABI is enabled. -1 is used to denote invalid index.
- int MoveF64ViaSpillFI;
+ int MoveF64ViaSpillFI = -1;
};
-} // end of namespace llvm
+} // end namespace llvm
-#endif
+#endif // LLVM_LIB_TARGET_MIPS_MIPSMACHINEFUNCTION_H
diff --git a/lib/Target/Mips/MipsOptionRecord.h b/lib/Target/Mips/MipsOptionRecord.h
index 23f0b7070d62..4708784063d3 100644
--- a/lib/Target/Mips/MipsOptionRecord.h
+++ b/lib/Target/Mips/MipsOptionRecord.h
@@ -1,4 +1,4 @@
-//===-- MipsOptionRecord.h - Abstraction for storing information ----------===//
+//===- MipsOptionRecord.h - Abstraction for storing information -*- C++ -*-===//
//
// The LLVM Compiler Infrastructure
//
@@ -23,14 +23,16 @@
#include "MCTargetDesc/MipsMCTargetDesc.h"
#include "llvm/MC/MCContext.h"
#include "llvm/MC/MCRegisterInfo.h"
+#include <cstdint>
namespace llvm {
+
class MipsELFStreamer;
-class MCSubtargetInfo;
class MipsOptionRecord {
public:
- virtual ~MipsOptionRecord(){};
+ virtual ~MipsOptionRecord() = default;
+
virtual void EmitMipsOptionRecord() = 0;
};
@@ -53,7 +55,8 @@ public:
COP2RegClass = &(TRI->getRegClass(Mips::COP2RegClassID));
COP3RegClass = &(TRI->getRegClass(Mips::COP3RegClassID));
}
- ~MipsRegInfoRecord() override {}
+
+ ~MipsRegInfoRecord() override = default;
void EmitMipsOptionRecord() override;
void SetPhysRegUsed(unsigned Reg, const MCRegisterInfo *MCRegInfo);
@@ -74,5 +77,7 @@ private:
uint32_t ri_cprmask[4];
int64_t ri_gp_value;
};
-} // namespace llvm
-#endif
+
+} // end namespace llvm
+
+#endif // LLVM_LIB_TARGET_MIPS_MIPSOPTIONRECORD_H
diff --git a/lib/Target/Mips/MipsOs16.cpp b/lib/Target/Mips/MipsOs16.cpp
index 51ac5620f585..670b6c96e78e 100644
--- a/lib/Target/Mips/MipsOs16.cpp
+++ b/lib/Target/Mips/MipsOs16.cpp
@@ -57,7 +57,7 @@ static bool needsFPFromSig(Function &F) {
;
}
if (F.arg_size() >= 1) {
- Argument &Arg = F.getArgumentList().front();
+ Argument &Arg = *F.arg_begin();
switch (Arg.getType()->getTypeID()) {
case Type::FloatTyID:
case Type::DoubleTyID:
diff --git a/lib/Target/Mips/MipsRegisterInfo.td b/lib/Target/Mips/MipsRegisterInfo.td
index 8c82239ebbd3..ccfdcc89b078 100644
--- a/lib/Target/Mips/MipsRegisterInfo.td
+++ b/lib/Target/Mips/MipsRegisterInfo.td
@@ -290,6 +290,25 @@ class GPR32Class<list<ValueType> regTypes> :
K0, K1, GP, SP, FP, RA)>;
def GPR32 : GPR32Class<[i32]>;
+
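+// Register classes that single out $zero, so the assembler can distinguish
+// the bare divide instruction from the div macro form.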
+def GPR32ZERO : RegisterClass<"Mips", [i32], 32, (add
+ // Reserved
+ ZERO)>;
+
+def GPR32NONZERO : RegisterClass<"Mips", [i32], 32, (add
+ // Reserved
+ AT,
+ // Return Values and Arguments
+ V0, V1, A0, A1, A2, A3,
+ // Not preserved across procedure calls
+ T0, T1, T2, T3, T4, T5, T6, T7,
+ // Callee save
+ S0, S1, S2, S3, S4, S5, S6, S7,
+ // Not preserved across procedure calls
+ T8, T9,
+ // Reserved
+ K0, K1, GP, SP, FP, RA)>;
+
def DSPR : GPR32Class<[v4i8, v2i16]>;
def GPRMM16 : RegisterClass<"Mips", [i32], 32, (add
@@ -317,7 +336,7 @@ def GPRMM16MoveP : RegisterClass<"Mips", [i32], 32, (add
S0, S2, S3, S4)>;
def GPR64 : RegisterClass<"Mips", [i64], 64, (add
-// Reserved
+ // Reserved
ZERO_64, AT_64,
// Return Values and Arguments
V0_64, V1_64, A0_64, A1_64, A2_64, A3_64,
@@ -479,6 +498,16 @@ def GPR64AsmOperand : MipsAsmRegOperand {
let PredicateMethod = "isGPRAsmReg";
}
+def GPR32ZeroAsmOperand : MipsAsmRegOperand {
+ let Name = "GPR32ZeroAsmReg";
+ let PredicateMethod = "isGPRZeroAsmReg";
+}
+
+def GPR32NonZeroAsmOperand : MipsAsmRegOperand {
+ let Name = "GPR32NonZeroAsmReg";
+ let PredicateMethod = "isGPRNonZeroAsmReg";
+}
+
def GPR32AsmOperand : MipsAsmRegOperand {
let Name = "GPR32AsmReg";
let PredicateMethod = "isGPRAsmReg";
@@ -550,6 +579,14 @@ def MSACtrlAsmOperand : MipsAsmRegOperand {
let Name = "MSACtrlAsmReg";
}
+def GPR32ZeroOpnd : RegisterOperand<GPR32ZERO> {
+ let ParserMatchClass = GPR32ZeroAsmOperand;
+}
+
+def GPR32NonZeroOpnd : RegisterOperand<GPR32NONZERO> {
+ let ParserMatchClass = GPR32NonZeroAsmOperand;
+}
+
def GPR32Opnd : RegisterOperand<GPR32> {
let ParserMatchClass = GPR32AsmOperand;
}
diff --git a/lib/Target/Mips/MipsSEFrameLowering.cpp b/lib/Target/Mips/MipsSEFrameLowering.cpp
index 4996d070eb29..ef8d18c6deb1 100644
--- a/lib/Target/Mips/MipsSEFrameLowering.cpp
+++ b/lib/Target/Mips/MipsSEFrameLowering.cpp
@@ -11,27 +11,42 @@
//
//===----------------------------------------------------------------------===//
-#include "MipsSEFrameLowering.h"
-#include "MCTargetDesc/MipsBaseInfo.h"
+#include "MCTargetDesc/MipsABIInfo.h"
#include "MipsMachineFunction.h"
+#include "MipsRegisterInfo.h"
+#include "MipsSEFrameLowering.h"
#include "MipsSEInstrInfo.h"
#include "MipsSubtarget.h"
+#include "llvm/ADT/BitVector.h"
+#include "llvm/ADT/StringRef.h"
#include "llvm/ADT/StringSwitch.h"
+#include "llvm/CodeGen/MachineBasicBlock.h"
#include "llvm/CodeGen/MachineFrameInfo.h"
#include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/CodeGen/MachineInstr.h"
#include "llvm/CodeGen/MachineInstrBuilder.h"
#include "llvm/CodeGen/MachineModuleInfo.h"
+#include "llvm/CodeGen/MachineOperand.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
#include "llvm/CodeGen/RegisterScavenging.h"
-#include "llvm/IR/DataLayout.h"
+#include "llvm/IR/DebugLoc.h"
#include "llvm/IR/Function.h"
-#include "llvm/Target/TargetOptions.h"
+#include "llvm/MC/MCDwarf.h"
+#include "llvm/MC/MCRegisterInfo.h"
+#include "llvm/MC/MachineLocation.h"
+#include "llvm/Support/CodeGen.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/MathExtras.h"
+#include "llvm/Target/TargetInstrInfo.h"
+#include "llvm/Target/TargetRegisterInfo.h"
+#include "llvm/Target/TargetSubtargetInfo.h"
+#include <cassert>
+#include <cstdint>
+#include <utility>
+#include <vector>
using namespace llvm;
-namespace {
-typedef MachineBasicBlock::iterator Iter;
-
static std::pair<unsigned, unsigned> getMFHiLoOpc(unsigned Src) {
if (Mips::ACC64RegClass.contains(Src))
return std::make_pair((unsigned)Mips::PseudoMFHI,
@@ -47,6 +62,8 @@ static std::pair<unsigned, unsigned> getMFHiLoOpc(unsigned Src) {
return std::make_pair(0, 0);
}
+namespace {
+
/// Helper class to expand pseudos.
class ExpandPseudo {
public:
@@ -54,6 +71,8 @@ public:
bool expand();
private:
+ typedef MachineBasicBlock::iterator Iter;
+
bool expandInstr(MachineBasicBlock &MBB, Iter I);
void expandLoadCCond(MachineBasicBlock &MBB, Iter I);
void expandStoreCCond(MachineBasicBlock &MBB, Iter I);
@@ -74,7 +93,8 @@ private:
const MipsSEInstrInfo &TII;
const MipsRegisterInfo &RegInfo;
};
-}
+
+} // end anonymous namespace
ExpandPseudo::ExpandPseudo(MachineFunction &MF_)
: MF(MF_), MRI(MF.getRegInfo()),
@@ -419,7 +439,7 @@ void MipsSEFrameLowering::emitPrologue(MachineFunction &MF,
const std::vector<CalleeSavedInfo> &CSI = MFI.getCalleeSavedInfo();
- if (CSI.size()) {
+ if (!CSI.empty()) {
// Find the instruction past the last instruction that saves a callee-saved
// register to the stack.
for (unsigned i = 0; i < CSI.size(); ++i)
@@ -471,7 +491,7 @@ void MipsSEFrameLowering::emitPrologue(MachineFunction &MF,
} else {
// Reg is either in GPR32 or FGR32.
unsigned CFIIndex = MF.addFrameInst(MCCFIInstruction::createOffset(
- nullptr, MRI->getDwarfRegNum(Reg, 1), Offset));
+ nullptr, MRI->getDwarfRegNum(Reg, true), Offset));
BuildMI(MBB, MBBI, dl, TII.get(TargetOpcode::CFI_INSTRUCTION))
.addCFIIndex(CFIIndex);
}
@@ -534,7 +554,6 @@ void MipsSEFrameLowering::emitPrologue(MachineFunction &MF,
void MipsSEFrameLowering::emitInterruptPrologueStub(
MachineFunction &MF, MachineBasicBlock &MBB) const {
-
MipsFunctionInfo *MipsFI = MF.getInfo<MipsFunctionInfo>();
MachineBasicBlock::iterator MBBI = MBB.begin();
DebugLoc DL = MBBI != MBB.end() ? MBBI->getDebugLoc() : DebugLoc();
@@ -722,7 +741,6 @@ void MipsSEFrameLowering::emitEpilogue(MachineFunction &MF,
void MipsSEFrameLowering::emitInterruptEpilogueStub(
MachineFunction &MF, MachineBasicBlock &MBB) const {
-
MachineBasicBlock::iterator MBBI = MBB.getLastNonDebugInstr();
MipsFunctionInfo *MipsFI = MF.getInfo<MipsFunctionInfo>();
DebugLoc DL = MBBI != MBB.end() ? MBBI->getDebugLoc() : DebugLoc();
@@ -820,7 +838,6 @@ spillCalleeSavedRegisters(MachineBasicBlock &MBB,
bool
MipsSEFrameLowering::hasReservedCallFrame(const MachineFunction &MF) const {
const MachineFrameInfo &MFI = MF.getFrameInfo();
-
// Reserve call frame if the size of the maximum call frame fits into 16-bit
// immediate field and there are no variable sized objects on the stack.
// Make sure the second register scavenger spill slot can be accessed with one
diff --git a/lib/Target/Mips/MipsSEFrameLowering.h b/lib/Target/Mips/MipsSEFrameLowering.h
index 63cd3cebc56a..bf30deb1905e 100644
--- a/lib/Target/Mips/MipsSEFrameLowering.h
+++ b/lib/Target/Mips/MipsSEFrameLowering.h
@@ -1,4 +1,4 @@
-//===-- MipsSEFrameLowering.h - Mips32/64 frame lowering --------*- C++ -*-===//
+//===- MipsSEFrameLowering.h - Mips32/64 frame lowering ---------*- C++ -*-===//
//
// The LLVM Compiler Infrastructure
//
@@ -15,6 +15,8 @@
#define LLVM_LIB_TARGET_MIPS_MIPSSEFRAMELOWERING_H
#include "MipsFrameLowering.h"
+#include "llvm/CodeGen/MachineBasicBlock.h"
+#include <vector>
namespace llvm {
@@ -47,6 +49,7 @@ private:
void emitInterruptPrologueStub(MachineFunction &MF,
MachineBasicBlock &MBB) const;
};
-} // End llvm namespace
-#endif
+} // end namespace llvm
+
+#endif // LLVM_LIB_TARGET_MIPS_MIPSSEFRAMELOWERING_H
diff --git a/lib/Target/Mips/MipsSEISelDAGToDAG.cpp b/lib/Target/Mips/MipsSEISelDAGToDAG.cpp
index 92d3c001df94..c9cf9363b8c9 100644
--- a/lib/Target/Mips/MipsSEISelDAGToDAG.cpp
+++ b/lib/Target/Mips/MipsSEISelDAGToDAG.cpp
@@ -97,11 +97,13 @@ bool MipsSEDAGToDAGISel::replaceUsesWithZeroReg(MachineRegisterInfo *MRI,
// Check if MI is "addiu $dst, $zero, 0" or "daddiu $dst, $zero, 0".
if ((MI.getOpcode() == Mips::ADDiu) &&
(MI.getOperand(1).getReg() == Mips::ZERO) &&
+ (MI.getOperand(2).isImm()) &&
(MI.getOperand(2).getImm() == 0)) {
DstReg = MI.getOperand(0).getReg();
ZeroReg = Mips::ZERO;
} else if ((MI.getOpcode() == Mips::DADDiu) &&
(MI.getOperand(1).getReg() == Mips::ZERO_64) &&
+ (MI.getOperand(2).isImm()) &&
(MI.getOperand(2).getImm() == 0)) {
DstReg = MI.getOperand(0).getReg();
ZeroReg = Mips::ZERO_64;
@@ -690,7 +692,7 @@ bool MipsSEDAGToDAGISel::selectVSplatMaskL(SDValue N, SDValue &Imm) const {
// as the original value.
if (ImmValue == ~(~ImmValue & ~(~ImmValue + 1))) {
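+    // The immediate operand encodes the number of mask bits minus one.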
- Imm = CurDAG->getTargetConstant(ImmValue.countPopulation(), SDLoc(N),
+ Imm = CurDAG->getTargetConstant(ImmValue.countPopulation() - 1, SDLoc(N),
EltTy);
return true;
}
@@ -722,7 +724,7 @@ bool MipsSEDAGToDAGISel::selectVSplatMaskR(SDValue N, SDValue &Imm) const {
// Extract the run of set bits starting with bit zero, and test that the
// result is the same as the original value
if (ImmValue == (ImmValue & ~(ImmValue + 1))) {
- Imm = CurDAG->getTargetConstant(ImmValue.countPopulation(), SDLoc(N),
+ Imm = CurDAG->getTargetConstant(ImmValue.countPopulation() - 1, SDLoc(N),
EltTy);
return true;
}
@@ -932,6 +934,9 @@ bool MipsSEDAGToDAGISel::trySelect(SDNode *Node) {
// same set of registers. Similarly, ldi.h isn't capable of producing {
// 0x00000000, 0x00000001, 0x00000000, 0x00000001 } but 'ldi.d wd, 1' can.
+ const MipsABIInfo &ABI =
+ static_cast<const MipsTargetMachine &>(TM).getABI();
+
BuildVectorSDNode *BVN = cast<BuildVectorSDNode>(Node);
APInt SplatValue, SplatUndef;
unsigned SplatBitSize;
@@ -969,13 +974,233 @@ bool MipsSEDAGToDAGISel::trySelect(SDNode *Node) {
break;
}
- if (!SplatValue.isSignedIntN(10))
- return false;
-
- SDValue Imm = CurDAG->getTargetConstant(SplatValue, DL,
- ViaVecTy.getVectorElementType());
+ SDNode *Res;
- SDNode *Res = CurDAG->getMachineNode(LdiOp, DL, ViaVecTy, Imm);
+ // If we have a signed 10 bit integer, we can splat it directly.
+ //
+ // If we have something bigger we can synthesize the value into a GPR and
+ // splat from there.
+ if (SplatValue.isSignedIntN(10)) {
+ SDValue Imm = CurDAG->getTargetConstant(SplatValue, DL,
+ ViaVecTy.getVectorElementType());
+
+ Res = CurDAG->getMachineNode(LdiOp, DL, ViaVecTy, Imm);
+ } else if (SplatValue.isSignedIntN(16) &&
+ ((ABI.IsO32() && SplatBitSize < 64) ||
+ (ABI.IsN32() || ABI.IsN64()))) {
+ // Only handle signed 16 bit values when the element size is GPR width.
+ // MIPS64 can handle all the cases but MIPS32 would need to handle
+ // negative cases specifically here. Instead, handle those cases as
+ // 64bit values.
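+      //
+      // e.g. a v8i16 splat of 4000 is synthesized roughly as (illustrative):
+      //
+      //   addiu  $tmp, $zero, 4000
+      //   fill.h $wd, $tmp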
+
+ bool Is32BitSplat = ABI.IsO32() || SplatBitSize < 64;
+ const unsigned ADDiuOp = Is32BitSplat ? Mips::ADDiu : Mips::DADDiu;
+ const MVT SplatMVT = Is32BitSplat ? MVT::i32 : MVT::i64;
+ SDValue ZeroVal = CurDAG->getRegister(
+ Is32BitSplat ? Mips::ZERO : Mips::ZERO_64, SplatMVT);
+
+ const unsigned FILLOp =
+ SplatBitSize == 16
+ ? Mips::FILL_H
+ : (SplatBitSize == 32 ? Mips::FILL_W
+ : (SplatBitSize == 64 ? Mips::FILL_D : 0));
+
+ assert(FILLOp != 0 && "Unknown FILL Op for splat synthesis!");
+ assert((!ABI.IsO32() || (FILLOp != Mips::FILL_D)) &&
+ "Attempting to use fill.d on MIPS32!");
+
+ const unsigned Lo = SplatValue.getLoBits(16).getZExtValue();
+ SDValue LoVal = CurDAG->getTargetConstant(Lo, DL, SplatMVT);
+
+ Res = CurDAG->getMachineNode(ADDiuOp, DL, SplatMVT, ZeroVal, LoVal);
+ Res = CurDAG->getMachineNode(FILLOp, DL, ViaVecTy, SDValue(Res, 0));
+
+ } else if (SplatValue.isSignedIntN(32) && SplatBitSize == 32) {
+ // Only handle the cases where the splat size agrees with the size
+ // of the SplatValue here.
+ const unsigned Lo = SplatValue.getLoBits(16).getZExtValue();
+ const unsigned Hi = SplatValue.lshr(16).getLoBits(16).getZExtValue();
+ SDValue ZeroVal = CurDAG->getRegister(Mips::ZERO, MVT::i32);
+
+ SDValue LoVal = CurDAG->getTargetConstant(Lo, DL, MVT::i32);
+ SDValue HiVal = CurDAG->getTargetConstant(Hi, DL, MVT::i32);
+
+ if (Hi)
+ Res = CurDAG->getMachineNode(Mips::LUi, DL, MVT::i32, HiVal);
+
+ if (Lo)
+ Res = CurDAG->getMachineNode(Mips::ORi, DL, MVT::i32,
+ Hi ? SDValue(Res, 0) : ZeroVal, LoVal);
+
+ assert((Hi || Lo) && "Zero case reached 32 bit case splat synthesis!");
+ Res = CurDAG->getMachineNode(Mips::FILL_W, DL, MVT::v4i32, SDValue(Res, 0));
+
+ } else if (SplatValue.isSignedIntN(32) && SplatBitSize == 64 &&
+ (ABI.IsN32() || ABI.IsN64())) {
+      // N32 and N64 can perform some tricks that O32 can't for signed 32-bit
+      // integers because they have 64-bit registers. lui performs the
+      // necessary zero/sign extension.
+ const unsigned Lo = SplatValue.getLoBits(16).getZExtValue();
+ const unsigned Hi = SplatValue.lshr(16).getLoBits(16).getZExtValue();
+ SDValue ZeroVal = CurDAG->getRegister(Mips::ZERO, MVT::i32);
+
+ SDValue LoVal = CurDAG->getTargetConstant(Lo, DL, MVT::i32);
+ SDValue HiVal = CurDAG->getTargetConstant(Hi, DL, MVT::i32);
+
+ if (Hi)
+ Res = CurDAG->getMachineNode(Mips::LUi, DL, MVT::i32, HiVal);
+
+ if (Lo)
+ Res = CurDAG->getMachineNode(Mips::ORi, DL, MVT::i32,
+ Hi ? SDValue(Res, 0) : ZeroVal, LoVal);
+
+ Res = CurDAG->getMachineNode(
+ Mips::SUBREG_TO_REG, DL, MVT::i64,
+ CurDAG->getTargetConstant(((Hi >> 15) & 0x1), DL, MVT::i64),
+ SDValue(Res, 0),
+ CurDAG->getTargetConstant(Mips::sub_32, DL, MVT::i64));
+
+ Res =
+ CurDAG->getMachineNode(Mips::FILL_D, DL, MVT::v2i64, SDValue(Res, 0));
+
+ } else if (SplatValue.isSignedIntN(64)) {
+      // If we have a 64-bit splat value, we perform a similar sequence to the
+ // above:
+ //
+ // MIPS32: MIPS64:
+ // lui $res, %highest(val) lui $res, %highest(val)
+ // ori $res, $res, %higher(val) ori $res, $res, %higher(val)
+ // lui $res2, %hi(val) lui $res2, %hi(val)
+ // ori $res2, %res2, %lo(val) ori $res2, %res2, %lo(val)
+ // $res3 = fill $res2 dinsu $res, $res2, 0, 32
+ // $res4 = insert.w $res3[1], $res fill.d $res
+ // splat.d $res4, 0
+ //
+ // The ability to use dinsu is guaranteed as MSA requires MIPSR5. This saves
+ // having to materialize the value by shifts and ors.
+ //
+ // FIXME: Implement the preferred sequence for MIPS64R6:
+ //
+ // MIPS64R6:
+ // ori $res, $zero, %lo(val)
+ // daui $res, $res, %hi(val)
+ // dahi $res, $res, %higher(val)
+      //    dati  $res, $res, %highest(val)
+ // fill.d $res
+ //
+
+ const unsigned Lo = SplatValue.getLoBits(16).getZExtValue();
+ const unsigned Hi = SplatValue.lshr(16).getLoBits(16).getZExtValue();
+ const unsigned Higher = SplatValue.lshr(32).getLoBits(16).getZExtValue();
+ const unsigned Highest = SplatValue.lshr(48).getLoBits(16).getZExtValue();
+
+ SDValue LoVal = CurDAG->getTargetConstant(Lo, DL, MVT::i32);
+ SDValue HiVal = CurDAG->getTargetConstant(Hi, DL, MVT::i32);
+ SDValue HigherVal = CurDAG->getTargetConstant(Higher, DL, MVT::i32);
+ SDValue HighestVal = CurDAG->getTargetConstant(Highest, DL, MVT::i32);
+ SDValue ZeroVal = CurDAG->getRegister(Mips::ZERO, MVT::i32);
+
+ // Independent of whether we're targeting MIPS64 or not, the basic
+ // operations are the same. Also, directly use the $zero register if
+ // the 16 bit chunk is zero.
+ //
+ // For optimization purposes we always synthesize the splat value as
+      // an i32 value, then if we're targeting MIPS64, use SUBREG_TO_REG
+ // just before combining the values with dinsu to produce an i64. This
+ // enables SelectionDAG to aggressively share components of splat values
+ // where possible.
+ //
+      // FIXME: This is the general constant synthesis problem. This code
+      //        should be factored out into a class shared between all the
+      //        classes that need it. Specifically, for a 64-bit splat of a
+      //        negative number we can do better than LUi/ORi for the upper
+      //        32 bits.
+
+ if (Hi)
+ Res = CurDAG->getMachineNode(Mips::LUi, DL, MVT::i32, HiVal);
+
+ if (Lo)
+ Res = CurDAG->getMachineNode(Mips::ORi, DL, MVT::i32,
+ Hi ? SDValue(Res, 0) : ZeroVal, LoVal);
+
+ SDNode *HiRes;
+ if (Highest)
+ HiRes = CurDAG->getMachineNode(Mips::LUi, DL, MVT::i32, HighestVal);
+
+ if (Higher)
+ HiRes = CurDAG->getMachineNode(Mips::ORi, DL, MVT::i32,
+ Highest ? SDValue(HiRes, 0) : ZeroVal,
+ HigherVal);
+
+ if (ABI.IsO32()) {
+ Res = CurDAG->getMachineNode(Mips::FILL_W, DL, MVT::v4i32,
+ (Hi || Lo) ? SDValue(Res, 0) : ZeroVal);
+
+ Res = CurDAG->getMachineNode(
+ Mips::INSERT_W, DL, MVT::v4i32, SDValue(Res, 0),
+ (Highest || Higher) ? SDValue(HiRes, 0) : ZeroVal,
+ CurDAG->getTargetConstant(1, DL, MVT::i32));
+
+ const TargetLowering *TLI = getTargetLowering();
+ const TargetRegisterClass *RC =
+ TLI->getRegClassFor(ViaVecTy.getSimpleVT());
+
+ Res = CurDAG->getMachineNode(
+ Mips::COPY_TO_REGCLASS, DL, ViaVecTy, SDValue(Res, 0),
+ CurDAG->getTargetConstant(RC->getID(), DL, MVT::i32));
+
+ Res = CurDAG->getMachineNode(
+ Mips::SPLATI_D, DL, MVT::v2i64, SDValue(Res, 0),
+ CurDAG->getTargetConstant(0, DL, MVT::i32));
+ } else if (ABI.IsN64() || ABI.IsN32()) {
+
+ SDValue Zero64Val = CurDAG->getRegister(Mips::ZERO_64, MVT::i64);
+ const bool HiResNonZero = Highest || Higher;
+ const bool ResNonZero = Hi || Lo;
+
+ if (HiResNonZero)
+ HiRes = CurDAG->getMachineNode(
+ Mips::SUBREG_TO_REG, DL, MVT::i64,
+ CurDAG->getTargetConstant(((Highest >> 15) & 0x1), DL, MVT::i64),
+ SDValue(HiRes, 0),
+ CurDAG->getTargetConstant(Mips::sub_32, DL, MVT::i64));
+
+ if (ResNonZero)
+ Res = CurDAG->getMachineNode(
+ Mips::SUBREG_TO_REG, DL, MVT::i64,
+ CurDAG->getTargetConstant(((Hi >> 15) & 0x1), DL, MVT::i64),
+ SDValue(Res, 0),
+ CurDAG->getTargetConstant(Mips::sub_32, DL, MVT::i64));
+
+ // We have 3 cases:
+ // The HiRes is nonzero but Res is $zero => dsll32 HiRes, 0
+ // The Res is nonzero but HiRes is $zero => dinsu Res, $zero, 32, 32
+        // Both are nonzero => dinsu Res, HiRes, 32, 32
+ //
+ // The obvious "missing" case is when both are zero, but that case is
+ // handled by the ldi case.
+ if (ResNonZero) {
+ SDValue Ops[4] = {HiResNonZero ? SDValue(HiRes, 0) : Zero64Val,
+ CurDAG->getTargetConstant(64, DL, MVT::i32),
+ CurDAG->getTargetConstant(32, DL, MVT::i32),
+ SDValue(Res, 0)};
+
+ Res = CurDAG->getMachineNode(Mips::DINSU, DL, MVT::i64, Ops);
+ } else if (HiResNonZero) {
+ Res = CurDAG->getMachineNode(
+ Mips::DSLL32, DL, MVT::i64, SDValue(HiRes, 0),
+ CurDAG->getTargetConstant(0, DL, MVT::i32));
+ } else
+ llvm_unreachable(
+ "Zero splat value handled by non-zero 64bit splat synthesis!");
+
+ Res = CurDAG->getMachineNode(Mips::FILL_D, DL, MVT::v2i64, SDValue(Res, 0));
+ } else
+ llvm_unreachable("Unknown ABI in MipsISelDAGToDAG!");
+
+ } else
+ return false;
if (ResVecTy != ViaVecTy) {
// If LdiOp is writing to a different register class to ResVecTy, then
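
As a reference for the constant-synthesis branches above, the 16-bit chunking they materialize can be modelled with plain integer arithmetic. A standalone sketch (the chunk names mirror the %lo/%hi/%higher/%highest relocations in the comments; the value is an arbitrary example):

    #include <cassert>
    #include <cstdint>

    int main() {
      // Decompose a 64-bit splat constant into the four 16-bit chunks that
      // the lui/ori sequences build up.
      const uint64_t Val = 0x123456789ABCDEF0ull;
      const uint32_t Lo      = Val & 0xFFFF;
      const uint32_t Hi      = (Val >> 16) & 0xFFFF;
      const uint32_t Higher  = (Val >> 32) & 0xFFFF;
      const uint32_t Highest = (Val >> 48) & 0xFFFF;

      // lui/ori build two 32-bit halves...
      const uint32_t LoHalf = (Hi << 16) | Lo;          // lui Hi; ori Lo
      const uint32_t HiHalf = (Highest << 16) | Higher; // lui Highest; ori Higher

      // ...and dinsu (MIPS64) or insert.w (MIPS32) then combines them into
      // the 64-bit value that fill.d / splat.d broadcasts.
      const uint64_t Rebuilt = (uint64_t(HiHalf) << 32) | LoHalf;
      assert(Rebuilt == Val);
      return 0;
    }
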
diff --git a/lib/Target/Mips/MipsSEISelLowering.cpp b/lib/Target/Mips/MipsSEISelLowering.cpp
index f28e8b36fdbc..e2da8477295b 100644
--- a/lib/Target/Mips/MipsSEISelLowering.cpp
+++ b/lib/Target/Mips/MipsSEISelLowering.cpp
@@ -1123,7 +1123,8 @@ MipsSETargetLowering::PerformDAGCombine(SDNode *N, DAGCombinerInfo &DCI) const {
case ISD::MUL:
return performMULCombine(N, DAG, DCI, this);
case ISD::SHL:
- return performSHLCombine(N, DAG, DCI, Subtarget);
+ Val = performSHLCombine(N, DAG, DCI, Subtarget);
+ break;
case ISD::SRA:
return performSRACombine(N, DAG, DCI, Subtarget);
case ISD::SRL:
@@ -1643,7 +1644,7 @@ SDValue MipsSETargetLowering::lowerINTRINSIC_WO_CHAIN(SDValue Op,
if (Op->getConstantOperandVal(3) >= EltTy.getSizeInBits())
report_fatal_error("Immediate out of range");
APInt Mask = APInt::getHighBitsSet(EltTy.getSizeInBits(),
- Op->getConstantOperandVal(3));
+ Op->getConstantOperandVal(3) + 1);
return DAG.getNode(ISD::VSELECT, DL, VecTy,
DAG.getConstant(Mask, DL, VecTy, true),
Op->getOperand(2), Op->getOperand(1));
@@ -1658,7 +1659,7 @@ SDValue MipsSETargetLowering::lowerINTRINSIC_WO_CHAIN(SDValue Op,
if (Op->getConstantOperandVal(3) >= EltTy.getSizeInBits())
report_fatal_error("Immediate out of range");
APInt Mask = APInt::getLowBitsSet(EltTy.getSizeInBits(),
- Op->getConstantOperandVal(3));
+ Op->getConstantOperandVal(3) + 1);
return DAG.getNode(ISD::VSELECT, DL, VecTy,
DAG.getConstant(Mask, DL, VecTy, true),
Op->getOperand(2), Op->getOperand(1));
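
Both `+ 1` fixes above reflect that the intrinsic's immediate selects one more bit than its raw value. A standalone sketch with plain 32-bit masks, assuming these hunks are the MSA bit-insert (binsli/binsri-style) lowering; lowBitsSet/highBitsSet are hand-rolled stand-ins for APInt::getLowBitsSet/getHighBitsSet:

    #include <cassert>
    #include <cstdint>

    // Build a mask of the N lowest / N highest bits of a 32-bit element,
    // avoiding undefined shifts at the boundaries.
    static uint32_t lowBitsSet(unsigned N)  { return N >= 32 ? ~0u : (1u << N) - 1; }
    static uint32_t highBitsSet(unsigned N) { return N == 0 ? 0u : ~0u << (32 - N); }

    int main() {
      unsigned M = 7; // immediate operand: selects bits [M:0], i.e. M + 1 bits
      assert(lowBitsSet(M + 1)  == 0x000000FFu);
      assert(highBitsSet(M + 1) == 0xFF000000u);
      // Using M instead of M + 1 -- the bug being fixed -- selects one bit
      // too few:
      assert(lowBitsSet(M) == 0x0000007Fu);
      return 0;
    }
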
@@ -2529,11 +2530,10 @@ SDValue MipsSETargetLowering::lowerBUILD_VECTOR(SDValue Op,
SplatBitSize != 64)
return SDValue();
- // If the value fits into a simm10 then we can use ldi.[bhwd]
- // However, if it isn't an integer type we will have to bitcast from an
- // integer type first. Also, if there are any undefs, we must lower them
- // to defined values first.
- if (ResTy.isInteger() && !HasAnyUndefs && SplatValue.isSignedIntN(10))
+ // If the value isn't an integer type we will have to bitcast
+ // from an integer type first. Also, if there are any undefs, we must
+ // lower them to defined values first.
+ if (ResTy.isInteger() && !HasAnyUndefs)
return Op;
EVT ViaVecTy;
@@ -3628,7 +3628,7 @@ MipsSETargetLowering::emitLD_F16_PSEUDO(MachineInstr &MI,
MachineInstrBuilder MIB =
BuildMI(*BB, MI, DL, TII->get(UsingMips32 ? Mips::LH : Mips::LH64), Rt);
for (unsigned i = 1; i < MI.getNumOperands(); i++)
- MIB.addOperand(MI.getOperand(i));
+ MIB.add(MI.getOperand(i));
BuildMI(*BB, MI, DL, TII->get(Mips::FILL_H), Wd).addReg(Rt);
diff --git a/lib/Target/Mips/MipsSEInstrInfo.cpp b/lib/Target/Mips/MipsSEInstrInfo.cpp
index ea703d0edd96..91e712a7a54e 100644
--- a/lib/Target/Mips/MipsSEInstrInfo.cpp
+++ b/lib/Target/Mips/MipsSEInstrInfo.cpp
@@ -540,11 +540,20 @@ unsigned MipsSEInstrInfo::getAnalyzableBrOpc(unsigned Opc) const {
void MipsSEInstrInfo::expandRetRA(MachineBasicBlock &MBB,
MachineBasicBlock::iterator I) const {
+
+ MachineInstrBuilder MIB;
if (Subtarget.isGP64bit())
- BuildMI(MBB, I, I->getDebugLoc(), get(Mips::PseudoReturn64))
- .addReg(Mips::RA_64);
+ MIB = BuildMI(MBB, I, I->getDebugLoc(), get(Mips::PseudoReturn64))
+ .addReg(Mips::RA_64, RegState::Undef);
else
- BuildMI(MBB, I, I->getDebugLoc(), get(Mips::PseudoReturn)).addReg(Mips::RA);
+ MIB = BuildMI(MBB, I, I->getDebugLoc(), get(Mips::PseudoReturn))
+ .addReg(Mips::RA, RegState::Undef);
+
+ // Retain any imp-use flags.
+ for (auto & MO : I->operands()) {
+ if (MO.isImplicit())
+ MIB.add(MO);
+ }
}
void MipsSEInstrInfo::expandERet(MachineBasicBlock &MBB,
diff --git a/lib/Target/Mips/MipsSubtarget.cpp b/lib/Target/Mips/MipsSubtarget.cpp
index 3e7570ff46ed..8f5ecadecdea 100644
--- a/lib/Target/Mips/MipsSubtarget.cpp
+++ b/lib/Target/Mips/MipsSubtarget.cpp
@@ -70,8 +70,8 @@ MipsSubtarget::MipsSubtarget(const Triple &TT, const std::string &CPU,
HasMips4_32r2(false), HasMips5_32r2(false), InMips16Mode(false),
InMips16HardFloat(Mips16HardFloat), InMicroMipsMode(false), HasDSP(false),
HasDSPR2(false), HasDSPR3(false), AllowMixed16_32(Mixed16_32 | Mips_Os16),
- Os16(Mips_Os16), HasMSA(false), UseTCCInDIV(false), HasEVA(false), TM(TM),
- TargetTriple(TT), TSInfo(),
+ Os16(Mips_Os16), HasMSA(false), UseTCCInDIV(false), HasSym32(false),
+ HasEVA(false), TM(TM), TargetTriple(TT), TSInfo(),
InstrInfo(
MipsInstrInfo::create(initializeSubtargetDependencies(CPU, FS, TM))),
FrameLowering(MipsFrameLowering::create(*this)),
@@ -117,6 +117,9 @@ MipsSubtarget::MipsSubtarget(const Triple &TT, const std::string &CPU,
if (NoABICalls && TM.isPositionIndependent())
report_fatal_error("position-independent code requires '-mabicalls'");
+ if (isABI_N64() && !TM.isPositionIndependent() && !hasSym32())
+ NoABICalls = true;
+
// Set UseSmallSection.
UseSmallSection = GPOpt;
if (!NoABICalls && GPOpt) {
diff --git a/lib/Target/Mips/MipsSubtarget.h b/lib/Target/Mips/MipsSubtarget.h
index 38d3cee70477..cca2cb8a4660 100644
--- a/lib/Target/Mips/MipsSubtarget.h
+++ b/lib/Target/Mips/MipsSubtarget.h
@@ -142,6 +142,9 @@ class MipsSubtarget : public MipsGenSubtargetInfo {
// UseTCCInDIV -- Enables the use of trapping in the assembler.
bool UseTCCInDIV;
+ // Sym32 -- On Mips64 symbols are 32 bits.
+ bool HasSym32;
+
// HasEVA -- supports EVA ASE.
bool HasEVA;
@@ -229,7 +232,11 @@ public:
unsigned getGPRSizeInBytes() const { return isGP64bit() ? 8 : 4; }
bool isPTR64bit() const { return IsPTR64bit; }
bool isPTR32bit() const { return !IsPTR64bit; }
+ bool hasSym32() const {
+ return (HasSym32 && isABI_N64()) || isABI_N32() || isABI_O32();
+ }
bool isSingleFloat() const { return IsSingleFloat; }
+ bool isTargetELF() const { return TargetTriple.isOSBinFormatELF(); }
bool hasVFPU() const { return HasVFPU; }
bool inMips16Mode() const { return InMips16Mode; }
bool inMips16ModeDefault() const {
@@ -271,6 +278,8 @@ public:
bool isTargetNaCl() const { return TargetTriple.isOSNaCl(); }
+ bool isXRaySupported() const override { return true; }
+
// for now constant islands are on for the whole compilation unit but we only
// really use them if in addition we are in mips16 mode
static bool useConstantIslands();
diff --git a/lib/Target/Mips/MipsTargetMachine.cpp b/lib/Target/Mips/MipsTargetMachine.cpp
index bb48188e3b87..a45a9c4b41c3 100644
--- a/lib/Target/Mips/MipsTargetMachine.cpp
+++ b/lib/Target/Mips/MipsTargetMachine.cpp
@@ -11,27 +11,30 @@
//
//===----------------------------------------------------------------------===//
-#include "MipsTargetMachine.h"
+#include "MCTargetDesc/MipsABIInfo.h"
+#include "MCTargetDesc/MipsMCTargetDesc.h"
#include "Mips.h"
-#include "Mips16FrameLowering.h"
#include "Mips16ISelDAGToDAG.h"
-#include "Mips16ISelLowering.h"
-#include "Mips16InstrInfo.h"
-#include "MipsFrameLowering.h"
-#include "MipsInstrInfo.h"
-#include "MipsSEFrameLowering.h"
#include "MipsSEISelDAGToDAG.h"
-#include "MipsSEISelLowering.h"
-#include "MipsSEInstrInfo.h"
+#include "MipsSubtarget.h"
#include "MipsTargetObjectFile.h"
+#include "MipsTargetMachine.h"
+#include "llvm/ADT/Optional.h"
+#include "llvm/ADT/STLExtras.h"
+#include "llvm/ADT/StringRef.h"
#include "llvm/Analysis/TargetTransformInfo.h"
+#include "llvm/CodeGen/BasicTTIImpl.h"
+#include "llvm/CodeGen/MachineFunction.h"
#include "llvm/CodeGen/Passes.h"
#include "llvm/CodeGen/TargetPassConfig.h"
-#include "llvm/IR/LegacyPassManager.h"
+#include "llvm/IR/Attributes.h"
+#include "llvm/IR/Function.h"
+#include "llvm/Support/CodeGen.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/TargetRegistry.h"
#include "llvm/Support/raw_ostream.h"
-#include "llvm/Transforms/Scalar.h"
+#include "llvm/Target/TargetOptions.h"
+#include <string>
using namespace llvm;
@@ -48,7 +51,7 @@ extern "C" void LLVMInitializeMipsTarget() {
static std::string computeDataLayout(const Triple &TT, StringRef CPU,
const TargetOptions &Options,
bool isLittle) {
- std::string Ret = "";
+ std::string Ret;
MipsABIInfo ABI = MipsABIInfo::computeTargetABI(TT, CPU, Options.MCOptions);
// There are both little and big endian mips.
@@ -102,7 +105,7 @@ MipsTargetMachine::MipsTargetMachine(const Target &T, const Triple &TT,
: LLVMTargetMachine(T, computeDataLayout(TT, CPU, Options, isLittle), TT,
CPU, FS, Options, getEffectiveRelocModel(CM, RM), CM,
OL),
- isLittle(isLittle), TLOF(make_unique<MipsTargetObjectFile>()),
+ isLittle(isLittle), TLOF(llvm::make_unique<MipsTargetObjectFile>()),
ABI(MipsABIInfo::computeTargetABI(TT, CPU, Options.MCOptions)),
Subtarget(nullptr), DefaultSubtarget(TT, CPU, FS, isLittle, *this),
NoMips16Subtarget(TT, CPU, FS.empty() ? "-mips16" : FS.str() + ",-mips16",
@@ -113,9 +116,9 @@ MipsTargetMachine::MipsTargetMachine(const Target &T, const Triple &TT,
initAsmInfo();
}
-MipsTargetMachine::~MipsTargetMachine() {}
+MipsTargetMachine::~MipsTargetMachine() = default;
-void MipsebTargetMachine::anchor() { }
+void MipsebTargetMachine::anchor() {}
MipsebTargetMachine::MipsebTargetMachine(const Target &T, const Triple &TT,
StringRef CPU, StringRef FS,
@@ -125,7 +128,7 @@ MipsebTargetMachine::MipsebTargetMachine(const Target &T, const Triple &TT,
CodeGenOpt::Level OL)
: MipsTargetMachine(T, TT, CPU, FS, Options, RM, CM, OL, false) {}
-void MipselTargetMachine::anchor() { }
+void MipselTargetMachine::anchor() {}
MipselTargetMachine::MipselTargetMachine(const Target &T, const Triple &TT,
StringRef CPU, StringRef FS,
@@ -182,10 +185,10 @@ void MipsTargetMachine::resetSubtarget(MachineFunction *MF) {
Subtarget = const_cast<MipsSubtarget *>(getSubtargetImpl(*MF->getFunction()));
MF->setSubtarget(Subtarget);
- return;
}
namespace {
+
/// Mips Code Generator Pass Configuration Options.
class MipsPassConfig : public TargetPassConfig {
public:
@@ -209,11 +212,10 @@ public:
void addIRPasses() override;
bool addInstSelector() override;
void addPreEmitPass() override;
-
void addPreRegAlloc() override;
-
};
-} // namespace
+
+} // end anonymous namespace
TargetPassConfig *MipsTargetMachine::createPassConfig(PassManagerBase &PM) {
return new MipsPassConfig(this, PM);
diff --git a/lib/Target/Mips/MipsTargetMachine.h b/lib/Target/Mips/MipsTargetMachine.h
index e4cf17e2abd8..140d7133f879 100644
--- a/lib/Target/Mips/MipsTargetMachine.h
+++ b/lib/Target/Mips/MipsTargetMachine.h
@@ -1,4 +1,4 @@
-//===-- MipsTargetMachine.h - Define TargetMachine for Mips -----*- C++ -*-===//
+//===- MipsTargetMachine.h - Define TargetMachine for Mips ------*- C++ -*-===//
//
// The LLVM Compiler Infrastructure
//
@@ -16,15 +16,14 @@
#include "MCTargetDesc/MipsABIInfo.h"
#include "MipsSubtarget.h"
-#include "llvm/CodeGen/BasicTTIImpl.h"
-#include "llvm/CodeGen/Passes.h"
-#include "llvm/CodeGen/SelectionDAGISel.h"
-#include "llvm/Target/TargetFrameLowering.h"
+#include "llvm/ADT/Optional.h"
+#include "llvm/ADT/StringMap.h"
+#include "llvm/ADT/StringRef.h"
+#include "llvm/Support/CodeGen.h"
#include "llvm/Target/TargetMachine.h"
+#include <memory>
namespace llvm {
-class formatted_raw_ostream;
-class MipsRegisterInfo;
class MipsTargetMachine : public LLVMTargetMachine {
bool isLittle;
@@ -73,6 +72,7 @@ public:
///
class MipsebTargetMachine : public MipsTargetMachine {
virtual void anchor();
+
public:
MipsebTargetMachine(const Target &T, const Triple &TT, StringRef CPU,
StringRef FS, const TargetOptions &Options,
@@ -84,6 +84,7 @@ public:
///
class MipselTargetMachine : public MipsTargetMachine {
virtual void anchor();
+
public:
MipselTargetMachine(const Target &T, const Triple &TT, StringRef CPU,
StringRef FS, const TargetOptions &Options,
@@ -91,6 +92,6 @@ public:
CodeGenOpt::Level OL);
};
-} // End llvm namespace
+} // end namespace llvm
-#endif
+#endif // LLVM_LIB_TARGET_MIPS_MIPSTARGETMACHINE_H
diff --git a/lib/Target/NVPTX/CMakeLists.txt b/lib/Target/NVPTX/CMakeLists.txt
index 399ff1fd96e0..a8eecfcc138c 100644
--- a/lib/Target/NVPTX/CMakeLists.txt
+++ b/lib/Target/NVPTX/CMakeLists.txt
@@ -17,7 +17,6 @@ set(NVPTXCodeGen_sources
NVPTXISelDAGToDAG.cpp
NVPTXISelLowering.cpp
NVPTXImageOptimizer.cpp
- NVPTXInferAddressSpaces.cpp
NVPTXInstrInfo.cpp
NVPTXLowerAggrCopies.cpp
NVPTXLowerArgs.cpp
diff --git a/lib/Target/NVPTX/InstPrinter/NVPTXInstPrinter.cpp b/lib/Target/NVPTX/InstPrinter/NVPTXInstPrinter.cpp
index 4594c22b8701..b774fe169d71 100644
--- a/lib/Target/NVPTX/InstPrinter/NVPTXInstPrinter.cpp
+++ b/lib/Target/NVPTX/InstPrinter/NVPTXInstPrinter.cpp
@@ -61,6 +61,12 @@ void NVPTXInstPrinter::printRegName(raw_ostream &OS, unsigned RegNo) const {
case 6:
OS << "%fd";
break;
+ case 7:
+ OS << "%h";
+ break;
+ case 8:
+ OS << "%hh";
+ break;
}
unsigned VReg = RegNo & 0x0FFFFFFF;
@@ -247,8 +253,12 @@ void NVPTXInstPrinter::printLdStCode(const MCInst *MI, int OpNum,
O << "s";
else if (Imm == NVPTX::PTXLdStInstCode::Unsigned)
O << "u";
- else
+ else if (Imm == NVPTX::PTXLdStInstCode::Untyped)
+ O << "b";
+ else if (Imm == NVPTX::PTXLdStInstCode::Float)
O << "f";
+ else
+ llvm_unreachable("Unknown register type");
} else if (!strcmp(Modifier, "vec")) {
if (Imm == NVPTX::PTXLdStInstCode::V2)
O << ".v2";
diff --git a/lib/Target/NVPTX/LLVMBuild.txt b/lib/Target/NVPTX/LLVMBuild.txt
index 70a2de3441ce..ee8aaa998bb6 100644
--- a/lib/Target/NVPTX/LLVMBuild.txt
+++ b/lib/Target/NVPTX/LLVMBuild.txt
@@ -28,5 +28,5 @@ has_asmprinter = 1
type = Library
name = NVPTXCodeGen
parent = NVPTX
-required_libraries = Analysis AsmPrinter CodeGen Core MC NVPTXAsmPrinter NVPTXDesc NVPTXInfo Scalar SelectionDAG Support Target TransformUtils Vectorize
+required_libraries = Analysis AsmPrinter CodeGen Core IPO MC NVPTXAsmPrinter NVPTXDesc NVPTXInfo Scalar SelectionDAG Support Target TransformUtils Vectorize
add_to_library_groups = NVPTX
diff --git a/lib/Target/NVPTX/NVPTX.h b/lib/Target/NVPTX/NVPTX.h
index c455a437d8d5..902d1b25e7dd 100644
--- a/lib/Target/NVPTX/NVPTX.h
+++ b/lib/Target/NVPTX/NVPTX.h
@@ -45,10 +45,8 @@ FunctionPass *createNVPTXISelDag(NVPTXTargetMachine &TM,
llvm::CodeGenOpt::Level OptLevel);
ModulePass *createNVPTXAssignValidGlobalNamesPass();
ModulePass *createGenericToNVVMPass();
-FunctionPass *createNVPTXInferAddressSpacesPass();
FunctionPass *createNVVMIntrRangePass(unsigned int SmVersion);
FunctionPass *createNVVMReflectPass();
-FunctionPass *createNVVMReflectPass(const StringMap<int> &Mapping);
MachineFunctionPass *createNVPTXPrologEpilogPass();
MachineFunctionPass *createNVPTXReplaceImageHandlesPass();
FunctionPass *createNVPTXImageOptimizerPass();
@@ -108,7 +106,8 @@ enum AddressSpace {
enum FromType {
Unsigned = 0,
Signed,
- Float
+ Float,
+ Untyped
};
enum VecType {
Scalar = 1,
diff --git a/lib/Target/NVPTX/NVPTXAsmPrinter.cpp b/lib/Target/NVPTX/NVPTXAsmPrinter.cpp
index 3c2594c77f45..21e25de80dc7 100644
--- a/lib/Target/NVPTX/NVPTXAsmPrinter.cpp
+++ b/lib/Target/NVPTX/NVPTXAsmPrinter.cpp
@@ -320,6 +320,10 @@ bool NVPTXAsmPrinter::lowerOperand(const MachineOperand &MO,
switch (Cnt->getType()->getTypeID()) {
default: report_fatal_error("Unsupported FP type"); break;
+ case Type::HalfTyID:
+ MCOp = MCOperand::createExpr(
+ NVPTXFloatMCExpr::createConstantFPHalf(Val, OutContext));
+ break;
case Type::FloatTyID:
MCOp = MCOperand::createExpr(
NVPTXFloatMCExpr::createConstantFPSingle(Val, OutContext));
@@ -357,6 +361,10 @@ unsigned NVPTXAsmPrinter::encodeVirtualRegister(unsigned Reg) {
Ret = (5 << 28);
} else if (RC == &NVPTX::Float64RegsRegClass) {
Ret = (6 << 28);
+ } else if (RC == &NVPTX::Float16RegsRegClass) {
+ Ret = (7 << 28);
+ } else if (RC == &NVPTX::Float16x2RegsRegClass) {
+ Ret = (8 << 28);
} else {
report_fatal_error("Bad register class");
}
@@ -396,12 +404,15 @@ void NVPTXAsmPrinter::printReturnValStr(const Function *F, raw_ostream &O) {
unsigned size = 0;
if (auto *ITy = dyn_cast<IntegerType>(Ty)) {
size = ITy->getBitWidth();
- if (size < 32)
- size = 32;
} else {
assert(Ty->isFloatingPointTy() && "Floating point type expected here");
size = Ty->getPrimitiveSizeInBits();
}
+ // PTX ABI requires all scalar return values to be at least 32
+ // bits in size. fp16 normally uses .b16 as its storage type in
+ // PTX, so its size must be adjusted here, too.
+ if (size < 32)
+ size = 32;
O << ".param .b" << size << " func_retval0";
} else if (isa<PointerType>(Ty)) {
@@ -1221,7 +1232,8 @@ void NVPTXAsmPrinter::printModuleLevelGV(const GlobalVariable *GVar,
else
O << " .align " << GVar->getAlignment();
- if (ETy->isFloatingPointTy() || ETy->isIntegerTy() || ETy->isPointerTy()) {
+ if (ETy->isFloatingPointTy() || ETy->isPointerTy() ||
+ (ETy->isIntegerTy() && ETy->getScalarSizeInBits() <= 64)) {
O << " .";
// Special case: ABI requires that we use .u8 for predicates
if (ETy->isIntegerTy(1))
@@ -1262,6 +1274,7 @@ void NVPTXAsmPrinter::printModuleLevelGV(const GlobalVariable *GVar,
// targets that support these high level field accesses. Structs, arrays
// and vectors are lowered into arrays of bytes.
switch (ETy->getTypeID()) {
+ case Type::IntegerTyID: // Integers larger than 64 bits
case Type::StructTyID:
case Type::ArrayTyID:
case Type::VectorTyID:
@@ -1376,6 +1389,9 @@ NVPTXAsmPrinter::getPTXFundamentalTypeStr(Type *Ty, bool useB4PTR) const {
}
break;
}
+ case Type::HalfTyID:
+ // fp16 is stored as .b16 for compatibility with pre-sm_53 PTX assembly.
+ return "b16";
case Type::FloatTyID:
return "f32";
case Type::DoubleTyID:
@@ -1477,7 +1493,7 @@ void NVPTXAsmPrinter::printParamName(Function::const_arg_iterator I,
void NVPTXAsmPrinter::emitFunctionParamList(const Function *F, raw_ostream &O) {
const DataLayout &DL = getDataLayout();
- const AttributeSet &PAL = F->getAttributes();
+ const AttributeList &PAL = F->getAttributes();
const TargetLowering *TLI = nvptxSubtarget->getTargetLowering();
Function::const_arg_iterator I, E;
unsigned paramIndex = 0;
@@ -1534,7 +1550,7 @@ void NVPTXAsmPrinter::emitFunctionParamList(const Function *F, raw_ostream &O) {
}
}
- if (!PAL.hasAttribute(paramIndex + 1, Attribute::ByVal)) {
+ if (!PAL.hasParamAttribute(paramIndex, Attribute::ByVal)) {
if (Ty->isAggregateType() || Ty->isVectorTy()) {
// Just print .param .align <a> .b8 .param[size];
// <a> = PAL.getparamalignment
@@ -1601,6 +1617,11 @@ void NVPTXAsmPrinter::emitFunctionParamList(const Function *F, raw_ostream &O) {
sz = 32;
} else if (isa<PointerType>(Ty))
sz = thePointerTy.getSizeInBits();
+ else if (Ty->isHalfTy())
+ // PTX ABI requires all scalar parameters to be at least 32
+ // bits in size. fp16 normally uses .b16 as its storage type
+ // in PTX, so its size must be adjusted here, too.
+ sz = 32;
else
sz = Ty->getPrimitiveSizeInBits();
if (isABI)
@@ -1977,6 +1998,17 @@ void NVPTXAsmPrinter::bufferAggregateConstant(const Constant *CPV,
const DataLayout &DL = getDataLayout();
int Bytes;
+ // Integers of arbitrary width
+ if (const ConstantInt *CI = dyn_cast<ConstantInt>(CPV)) {
+ APInt Val = CI->getValue();
+ for (unsigned I = 0, E = DL.getTypeAllocSize(CPV->getType()); I < E; ++I) {
+ uint8_t Byte = Val.getLoBits(8).getZExtValue();
+ aggBuffer->addBytes(&Byte, 1, 1);
+ Val = Val.lshr(8);
+ }
+ return;
+ }
+
// Old constants
if (isa<ConstantArray>(CPV) || isa<ConstantVector>(CPV)) {
if (CPV->getNumOperands())
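
The new arbitrary-width path above peels one byte per iteration off the APInt. A standalone model with a plain uint64_t (Buffer stands in for aggBuffer; the value and size are arbitrary examples):

    #include <cassert>
    #include <cstdint>
    #include <vector>

    int main() {
      uint64_t Val = 0x0102030405060708ull; // stand-in for CI->getValue()
      const unsigned AllocSize = 8;         // stand-in for DL.getTypeAllocSize()

      std::vector<uint8_t> Buffer;
      for (unsigned I = 0; I < AllocSize; ++I) {
        Buffer.push_back(uint8_t(Val & 0xFF)); // Val.getLoBits(8)
        Val >>= 8;                             // Val = Val.lshr(8)
      }
      // Bytes come out least-significant first, matching the loop above.
      assert(Buffer.front() == 0x08 && Buffer.back() == 0x01);
      return 0;
    }
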
diff --git a/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp b/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp
index 43c478f4212f..274977254046 100644
--- a/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp
+++ b/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp
@@ -26,23 +26,6 @@ using namespace llvm;
#define DEBUG_TYPE "nvptx-isel"
-static cl::opt<int> UsePrecDivF32(
- "nvptx-prec-divf32", cl::ZeroOrMore, cl::Hidden,
- cl::desc("NVPTX Specifies: 0 use div.approx, 1 use div.full, 2 use"
- " IEEE Compliant F32 div.rnd if available."),
- cl::init(2));
-
-static cl::opt<bool>
-UsePrecSqrtF32("nvptx-prec-sqrtf32", cl::Hidden,
- cl::desc("NVPTX Specific: 0 use sqrt.approx, 1 use sqrt.rn."),
- cl::init(true));
-
-static cl::opt<bool>
-FtzEnabled("nvptx-f32ftz", cl::ZeroOrMore, cl::Hidden,
- cl::desc("NVPTX Specific: Flush f32 subnormals to sign-preserving zero."),
- cl::init(false));
-
-
/// createNVPTXISelDag - This pass converts a legalized DAG into a
/// NVPTX-specific DAG, ready for instruction scheduling.
FunctionPass *llvm::createNVPTXISelDag(NVPTXTargetMachine &TM,
@@ -57,45 +40,20 @@ NVPTXDAGToDAGISel::NVPTXDAGToDAGISel(NVPTXTargetMachine &tm,
}
bool NVPTXDAGToDAGISel::runOnMachineFunction(MachineFunction &MF) {
- Subtarget = &static_cast<const NVPTXSubtarget &>(MF.getSubtarget());
- return SelectionDAGISel::runOnMachineFunction(MF);
+ Subtarget = &static_cast<const NVPTXSubtarget &>(MF.getSubtarget());
+ return SelectionDAGISel::runOnMachineFunction(MF);
}
int NVPTXDAGToDAGISel::getDivF32Level() const {
- if (UsePrecDivF32.getNumOccurrences() > 0) {
- // If nvptx-prec-div32=N is used on the command-line, always honor it
- return UsePrecDivF32;
- } else {
- // Otherwise, use div.approx if fast math is enabled
- if (TM.Options.UnsafeFPMath)
- return 0;
- else
- return 2;
- }
+ return Subtarget->getTargetLowering()->getDivF32Level();
}
bool NVPTXDAGToDAGISel::usePrecSqrtF32() const {
- if (UsePrecSqrtF32.getNumOccurrences() > 0) {
- // If nvptx-prec-sqrtf32 is used on the command-line, always honor it
- return UsePrecSqrtF32;
- } else {
- // Otherwise, use sqrt.approx if fast math is enabled
- return !TM.Options.UnsafeFPMath;
- }
+ return Subtarget->getTargetLowering()->usePrecSqrtF32();
}
bool NVPTXDAGToDAGISel::useF32FTZ() const {
- if (FtzEnabled.getNumOccurrences() > 0) {
- // If nvptx-f32ftz is used on the command-line, always honor it
- return FtzEnabled;
- } else {
- const Function *F = MF->getFunction();
- // Otherwise, check for an nvptx-f32ftz attribute on the function
- if (F->hasFnAttribute("nvptx-f32ftz"))
- return F->getFnAttribute("nvptx-f32ftz").getValueAsString() == "true";
- else
- return false;
- }
+ return Subtarget->getTargetLowering()->useF32FTZ(*MF);
}
bool NVPTXDAGToDAGISel::allowFMA() const {
@@ -103,6 +61,11 @@ bool NVPTXDAGToDAGISel::allowFMA() const {
return TL->allowFMA(*MF, OptLevel);
}
+bool NVPTXDAGToDAGISel::allowUnsafeFPMath() const {
+ const NVPTXTargetLowering *TL = Subtarget->getTargetLowering();
+ return TL->allowUnsafeFPMath(*MF);
+}
+
/// Select - Select instructions not customized! Used for
/// expanded, promoted and normal instructions.
void NVPTXDAGToDAGISel::Select(SDNode *N) {
@@ -121,6 +84,14 @@ void NVPTXDAGToDAGISel::Select(SDNode *N) {
if (tryStore(N))
return;
break;
+ case ISD::EXTRACT_VECTOR_ELT:
+ if (tryEXTRACT_VECTOR_ELEMENT(N))
+ return;
+ break;
+ case NVPTXISD::SETP_F16X2:
+ SelectSETP_F16X2(N);
+ return;
+
case NVPTXISD::LoadV2:
case NVPTXISD::LoadV4:
if (tryLoadVector(N))
@@ -515,6 +486,10 @@ void NVPTXDAGToDAGISel::Select(SDNode *N) {
case ISD::ADDRSPACECAST:
SelectAddrSpaceCast(N);
return;
+ case ISD::ConstantFP:
+ if (tryConstantFP16(N))
+ return;
+ break;
default:
break;
}
@@ -536,6 +511,140 @@ bool NVPTXDAGToDAGISel::tryIntrinsicChain(SDNode *N) {
}
}
+// There's no way to specify FP16 immediates in .f16 ops, so we have to
+// load them into an .f16 register first.
+bool NVPTXDAGToDAGISel::tryConstantFP16(SDNode *N) {
+ if (N->getValueType(0) != MVT::f16)
+ return false;
+ SDValue Val = CurDAG->getTargetConstantFP(
+ cast<ConstantFPSDNode>(N)->getValueAPF(), SDLoc(N), MVT::f16);
+ SDNode *LoadConstF16 =
+ CurDAG->getMachineNode(NVPTX::LOAD_CONST_F16, SDLoc(N), MVT::f16, Val);
+ ReplaceNode(N, LoadConstF16);
+ return true;
+}
+
+// Map ISD:CONDCODE value to appropriate CmpMode expected by
+// NVPTXInstPrinter::printCmpMode()
+static unsigned getPTXCmpMode(const CondCodeSDNode &CondCode, bool FTZ) {
+ using NVPTX::PTXCmpMode::CmpMode;
+ unsigned PTXCmpMode = [](ISD::CondCode CC) {
+ switch (CC) {
+ default:
+ llvm_unreachable("Unexpected condition code.");
+ case ISD::SETOEQ:
+ return CmpMode::EQ;
+ case ISD::SETOGT:
+ return CmpMode::GT;
+ case ISD::SETOGE:
+ return CmpMode::GE;
+ case ISD::SETOLT:
+ return CmpMode::LT;
+ case ISD::SETOLE:
+ return CmpMode::LE;
+ case ISD::SETONE:
+ return CmpMode::NE;
+ case ISD::SETO:
+ return CmpMode::NUM;
+ case ISD::SETUO:
+ return CmpMode::NotANumber;
+ case ISD::SETUEQ:
+ return CmpMode::EQU;
+ case ISD::SETUGT:
+ return CmpMode::GTU;
+ case ISD::SETUGE:
+ return CmpMode::GEU;
+ case ISD::SETULT:
+ return CmpMode::LTU;
+ case ISD::SETULE:
+ return CmpMode::LEU;
+ case ISD::SETUNE:
+ return CmpMode::NEU;
+ case ISD::SETEQ:
+ return CmpMode::EQ;
+ case ISD::SETGT:
+ return CmpMode::GT;
+ case ISD::SETGE:
+ return CmpMode::GE;
+ case ISD::SETLT:
+ return CmpMode::LT;
+ case ISD::SETLE:
+ return CmpMode::LE;
+ case ISD::SETNE:
+ return CmpMode::NE;
+ }
+ }(CondCode.get());
+
+ if (FTZ)
+ PTXCmpMode |= NVPTX::PTXCmpMode::FTZ_FLAG;
+
+ return PTXCmpMode;
+}
+
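
The immediately-invoked lambda keeps the condition-code switch exhaustive while the FTZ bit is OR'ed in afterwards as an orthogonal flag. A minimal standalone model of the idiom (all enum names here are illustrative, not the real NVPTX ones):

    #include <cassert>

    enum class Cond { EQ, GT, Unordered };
    namespace Mode {
    enum : unsigned { EQ = 0, GT = 1, NotANumber = 2, FTZ_FLAG = 0x100 };
    }

    static unsigned getCmpMode(Cond C, bool FTZ) {
      unsigned M = [](Cond C) -> unsigned {
        switch (C) {
        case Cond::EQ:        return Mode::EQ;
        case Cond::GT:        return Mode::GT;
        case Cond::Unordered: return Mode::NotANumber;
        }
        return Mode::EQ; // unreachable; keeps compilers happy
      }(C);
      if (FTZ)
        M |= Mode::FTZ_FLAG; // flush-to-zero is independent of the comparison
      return M;
    }

    int main() {
      assert(getCmpMode(Cond::GT, /*FTZ=*/true) == (Mode::GT | Mode::FTZ_FLAG));
      assert(getCmpMode(Cond::EQ, /*FTZ=*/false) == Mode::EQ);
      return 0;
    }
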
+bool NVPTXDAGToDAGISel::SelectSETP_F16X2(SDNode *N) {
+ unsigned PTXCmpMode =
+ getPTXCmpMode(*cast<CondCodeSDNode>(N->getOperand(2)), useF32FTZ());
+ SDLoc DL(N);
+ SDNode *SetP = CurDAG->getMachineNode(
+ NVPTX::SETP_f16x2rr, DL, MVT::i1, MVT::i1, N->getOperand(0),
+ N->getOperand(1), CurDAG->getTargetConstant(PTXCmpMode, DL, MVT::i32));
+ ReplaceNode(N, SetP);
+ return true;
+}
+
+// Find all instances of extract_vector_elt that use this v2f16 vector
+// and coalesce them into a scattering move instruction.
+bool NVPTXDAGToDAGISel::tryEXTRACT_VECTOR_ELEMENT(SDNode *N) {
+ SDValue Vector = N->getOperand(0);
+
+ // We only care about f16x2 as it's the only real vector type we
+ // need to deal with.
+ if (Vector.getSimpleValueType() != MVT::v2f16)
+ return false;
+
+ // Find and record all uses of this vector that extract element 0 or 1.
+ SmallVector<SDNode *, 4> E0, E1;
+ for (const auto &U : Vector.getNode()->uses()) {
+ if (U->getOpcode() != ISD::EXTRACT_VECTOR_ELT)
+ continue;
+ if (U->getOperand(0) != Vector)
+ continue;
+ if (const ConstantSDNode *IdxConst =
+ dyn_cast<ConstantSDNode>(U->getOperand(1))) {
+ if (IdxConst->getZExtValue() == 0)
+ E0.push_back(U);
+ else if (IdxConst->getZExtValue() == 1)
+ E1.push_back(U);
+ else
+ llvm_unreachable("Invalid vector index.");
+ }
+ }
+
+ // There's no point scattering f16x2 if we only ever access one
+ // element of it.
+ if (E0.empty() || E1.empty())
+ return false;
+
+ unsigned Op = NVPTX::SplitF16x2;
+  // If the vector has been BITCAST'ed from i32, we can use the original
+  // value directly and avoid a register-to-register move.
+ SDValue Source = Vector;
+ if (Vector->getOpcode() == ISD::BITCAST) {
+ Op = NVPTX::SplitI32toF16x2;
+ Source = Vector->getOperand(0);
+ }
+ // Merge (f16 extractelt(V, 0), f16 extractelt(V,1))
+ // into f16,f16 SplitF16x2(V)
+ SDNode *ScatterOp =
+ CurDAG->getMachineNode(Op, SDLoc(N), MVT::f16, MVT::f16, Source);
+ for (auto *Node : E0)
+ ReplaceUses(SDValue(Node, 0), SDValue(ScatterOp, 0));
+ for (auto *Node : E1)
+ ReplaceUses(SDValue(Node, 0), SDValue(ScatterOp, 1));
+
+ return true;
+}
+
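
The use-scan above splits the vector only when both lanes are actually read. A standalone model of that decision (the index list stands in for the constant operands of the extract_vector_elt users):

    #include <cassert>
    #include <cstddef>
    #include <vector>

    int main() {
      // Lane touched by each extract_vector_elt user of a two-lane vector.
      std::vector<unsigned> UseIndices = {0, 1, 0};
      std::vector<std::size_t> E0, E1; // users of lane 0 / lane 1
      for (std::size_t U = 0; U < UseIndices.size(); ++U)
        (UseIndices[U] == 0 ? E0 : E1).push_back(U);

      // One split instruction can feed every user, but only pays off when
      // both result lanes have consumers.
      bool ShouldSplit = !E0.empty() && !E1.empty();
      assert(ShouldSplit);
      return 0;
    }
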
static unsigned int getCodeAddrSpace(MemSDNode *N) {
const Value *Src = N->getMemOperand()->getValue();
@@ -681,6 +790,35 @@ void NVPTXDAGToDAGISel::SelectAddrSpaceCast(SDNode *N) {
}
}
+// Helper function to reduce the amount of boilerplate code needed for
+// opcode selection.
+static Optional<unsigned> pickOpcodeForVT(
+ MVT::SimpleValueType VT, unsigned Opcode_i8, unsigned Opcode_i16,
+ unsigned Opcode_i32, Optional<unsigned> Opcode_i64, unsigned Opcode_f16,
+ unsigned Opcode_f16x2, unsigned Opcode_f32, Optional<unsigned> Opcode_f64) {
+ switch (VT) {
+ case MVT::i1:
+ case MVT::i8:
+ return Opcode_i8;
+ case MVT::i16:
+ return Opcode_i16;
+ case MVT::i32:
+ return Opcode_i32;
+ case MVT::i64:
+ return Opcode_i64;
+ case MVT::f16:
+ return Opcode_f16;
+ case MVT::v2f16:
+ return Opcode_f16x2;
+ case MVT::f32:
+ return Opcode_f32;
+ case MVT::f64:
+ return Opcode_f64;
+ default:
+ return None;
+ }
+}
+
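
pickOpcodeForVT collapses the per-type switches that the rest of this patch deletes: callers test the returned optional once and bail out on unsupported types. A standalone sketch of the same pattern using std::optional (the patch itself uses llvm::Optional and None; the mechanics are identical):

    #include <cassert>
    #include <optional>

    enum class VT { i8, i16, f16, f64 };

    // Opcodes are plain unsigned stand-ins; a gap in the instruction table is
    // expressed by passing std::nullopt for that slot.
    static std::optional<unsigned> pickOpcode(VT T, unsigned OpI8, unsigned OpI16,
                                              std::optional<unsigned> OpF16) {
      switch (T) {
      case VT::i8:  return OpI8;
      case VT::i16: return OpI16;
      case VT::f16: return OpF16; // may itself be empty
      default:      return std::nullopt;
      }
    }

    int main() {
      assert(pickOpcode(VT::i16, 10, 11, std::nullopt).value() == 11);
      assert(!pickOpcode(VT::f16, 10, 11, std::nullopt)); // table gap
      assert(!pickOpcode(VT::f64, 10, 11, 12));           // unsupported type
      return 0;
    }
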
bool NVPTXDAGToDAGISel::tryLoad(SDNode *N) {
SDLoc dl(N);
LoadSDNode *LD = cast<LoadSDNode>(N);
@@ -709,33 +847,32 @@ bool NVPTXDAGToDAGISel::tryLoad(SDNode *N) {
codeAddrSpace != NVPTX::PTXLdStInstCode::GENERIC)
isVolatile = false;
- // Vector Setting
- MVT SimpleVT = LoadedVT.getSimpleVT();
- unsigned vecType = NVPTX::PTXLdStInstCode::Scalar;
- if (SimpleVT.isVector()) {
- unsigned num = SimpleVT.getVectorNumElements();
- if (num == 2)
- vecType = NVPTX::PTXLdStInstCode::V2;
- else if (num == 4)
- vecType = NVPTX::PTXLdStInstCode::V4;
- else
- return false;
- }
-
// Type Setting: fromType + fromTypeWidth
//
// Sign : ISD::SEXTLOAD
// Unsign : ISD::ZEXTLOAD, ISD::NON_EXTLOAD or ISD::EXTLOAD and the
// type is integer
// Float : ISD::NON_EXTLOAD or ISD::EXTLOAD and the type is float
+ MVT SimpleVT = LoadedVT.getSimpleVT();
MVT ScalarVT = SimpleVT.getScalarType();
// Read at least 8 bits (predicates are stored as 8-bit values)
unsigned fromTypeWidth = std::max(8U, ScalarVT.getSizeInBits());
unsigned int fromType;
+
+ // Vector Setting
+ unsigned vecType = NVPTX::PTXLdStInstCode::Scalar;
+ if (SimpleVT.isVector()) {
+ assert(LoadedVT == MVT::v2f16 && "Unexpected vector type");
+ // v2f16 is loaded using ld.b32
+ fromTypeWidth = 32;
+ }
+
if ((LD->getExtensionType() == ISD::SEXTLOAD))
fromType = NVPTX::PTXLdStInstCode::Signed;
else if (ScalarVT.isFloatingPoint())
- fromType = NVPTX::PTXLdStInstCode::Float;
+ // f16 uses .b16 as its storage type.
+ fromType = ScalarVT.SimpleTy == MVT::f16 ? NVPTX::PTXLdStInstCode::Untyped
+ : NVPTX::PTXLdStInstCode::Float;
else
fromType = NVPTX::PTXLdStInstCode::Unsigned;
@@ -744,169 +881,72 @@ bool NVPTXDAGToDAGISel::tryLoad(SDNode *N) {
SDValue N1 = N->getOperand(1);
SDValue Addr;
SDValue Offset, Base;
- unsigned Opcode;
+ Optional<unsigned> Opcode;
MVT::SimpleValueType TargetVT = LD->getSimpleValueType(0).SimpleTy;
if (SelectDirectAddr(N1, Addr)) {
- switch (TargetVT) {
- case MVT::i8:
- Opcode = NVPTX::LD_i8_avar;
- break;
- case MVT::i16:
- Opcode = NVPTX::LD_i16_avar;
- break;
- case MVT::i32:
- Opcode = NVPTX::LD_i32_avar;
- break;
- case MVT::i64:
- Opcode = NVPTX::LD_i64_avar;
- break;
- case MVT::f32:
- Opcode = NVPTX::LD_f32_avar;
- break;
- case MVT::f64:
- Opcode = NVPTX::LD_f64_avar;
- break;
- default:
+ Opcode = pickOpcodeForVT(
+ TargetVT, NVPTX::LD_i8_avar, NVPTX::LD_i16_avar, NVPTX::LD_i32_avar,
+ NVPTX::LD_i64_avar, NVPTX::LD_f16_avar, NVPTX::LD_f16x2_avar,
+ NVPTX::LD_f32_avar, NVPTX::LD_f64_avar);
+ if (!Opcode)
return false;
- }
SDValue Ops[] = { getI32Imm(isVolatile, dl), getI32Imm(codeAddrSpace, dl),
getI32Imm(vecType, dl), getI32Imm(fromType, dl),
getI32Imm(fromTypeWidth, dl), Addr, Chain };
- NVPTXLD = CurDAG->getMachineNode(Opcode, dl, TargetVT, MVT::Other, Ops);
+ NVPTXLD = CurDAG->getMachineNode(Opcode.getValue(), dl, TargetVT,
+ MVT::Other, Ops);
} else if (TM.is64Bit() ? SelectADDRsi64(N1.getNode(), N1, Base, Offset)
: SelectADDRsi(N1.getNode(), N1, Base, Offset)) {
- switch (TargetVT) {
- case MVT::i8:
- Opcode = NVPTX::LD_i8_asi;
- break;
- case MVT::i16:
- Opcode = NVPTX::LD_i16_asi;
- break;
- case MVT::i32:
- Opcode = NVPTX::LD_i32_asi;
- break;
- case MVT::i64:
- Opcode = NVPTX::LD_i64_asi;
- break;
- case MVT::f32:
- Opcode = NVPTX::LD_f32_asi;
- break;
- case MVT::f64:
- Opcode = NVPTX::LD_f64_asi;
- break;
- default:
+ Opcode = pickOpcodeForVT(TargetVT, NVPTX::LD_i8_asi, NVPTX::LD_i16_asi,
+ NVPTX::LD_i32_asi, NVPTX::LD_i64_asi,
+ NVPTX::LD_f16_asi, NVPTX::LD_f16x2_asi,
+ NVPTX::LD_f32_asi, NVPTX::LD_f64_asi);
+ if (!Opcode)
return false;
- }
SDValue Ops[] = { getI32Imm(isVolatile, dl), getI32Imm(codeAddrSpace, dl),
getI32Imm(vecType, dl), getI32Imm(fromType, dl),
getI32Imm(fromTypeWidth, dl), Base, Offset, Chain };
- NVPTXLD = CurDAG->getMachineNode(Opcode, dl, TargetVT, MVT::Other, Ops);
+ NVPTXLD = CurDAG->getMachineNode(Opcode.getValue(), dl, TargetVT,
+ MVT::Other, Ops);
} else if (TM.is64Bit() ? SelectADDRri64(N1.getNode(), N1, Base, Offset)
: SelectADDRri(N1.getNode(), N1, Base, Offset)) {
- if (TM.is64Bit()) {
- switch (TargetVT) {
- case MVT::i8:
- Opcode = NVPTX::LD_i8_ari_64;
- break;
- case MVT::i16:
- Opcode = NVPTX::LD_i16_ari_64;
- break;
- case MVT::i32:
- Opcode = NVPTX::LD_i32_ari_64;
- break;
- case MVT::i64:
- Opcode = NVPTX::LD_i64_ari_64;
- break;
- case MVT::f32:
- Opcode = NVPTX::LD_f32_ari_64;
- break;
- case MVT::f64:
- Opcode = NVPTX::LD_f64_ari_64;
- break;
- default:
- return false;
- }
- } else {
- switch (TargetVT) {
- case MVT::i8:
- Opcode = NVPTX::LD_i8_ari;
- break;
- case MVT::i16:
- Opcode = NVPTX::LD_i16_ari;
- break;
- case MVT::i32:
- Opcode = NVPTX::LD_i32_ari;
- break;
- case MVT::i64:
- Opcode = NVPTX::LD_i64_ari;
- break;
- case MVT::f32:
- Opcode = NVPTX::LD_f32_ari;
- break;
- case MVT::f64:
- Opcode = NVPTX::LD_f64_ari;
- break;
- default:
- return false;
- }
- }
+ if (TM.is64Bit())
+ Opcode = pickOpcodeForVT(
+ TargetVT, NVPTX::LD_i8_ari_64, NVPTX::LD_i16_ari_64,
+ NVPTX::LD_i32_ari_64, NVPTX::LD_i64_ari_64, NVPTX::LD_f16_ari_64,
+ NVPTX::LD_f16x2_ari_64, NVPTX::LD_f32_ari_64, NVPTX::LD_f64_ari_64);
+ else
+ Opcode = pickOpcodeForVT(
+ TargetVT, NVPTX::LD_i8_ari, NVPTX::LD_i16_ari, NVPTX::LD_i32_ari,
+ NVPTX::LD_i64_ari, NVPTX::LD_f16_ari, NVPTX::LD_f16x2_ari,
+ NVPTX::LD_f32_ari, NVPTX::LD_f64_ari);
+ if (!Opcode)
+ return false;
SDValue Ops[] = { getI32Imm(isVolatile, dl), getI32Imm(codeAddrSpace, dl),
getI32Imm(vecType, dl), getI32Imm(fromType, dl),
getI32Imm(fromTypeWidth, dl), Base, Offset, Chain };
- NVPTXLD = CurDAG->getMachineNode(Opcode, dl, TargetVT, MVT::Other, Ops);
+ NVPTXLD = CurDAG->getMachineNode(Opcode.getValue(), dl, TargetVT,
+ MVT::Other, Ops);
} else {
- if (TM.is64Bit()) {
- switch (TargetVT) {
- case MVT::i8:
- Opcode = NVPTX::LD_i8_areg_64;
- break;
- case MVT::i16:
- Opcode = NVPTX::LD_i16_areg_64;
- break;
- case MVT::i32:
- Opcode = NVPTX::LD_i32_areg_64;
- break;
- case MVT::i64:
- Opcode = NVPTX::LD_i64_areg_64;
- break;
- case MVT::f32:
- Opcode = NVPTX::LD_f32_areg_64;
- break;
- case MVT::f64:
- Opcode = NVPTX::LD_f64_areg_64;
- break;
- default:
- return false;
- }
- } else {
- switch (TargetVT) {
- case MVT::i8:
- Opcode = NVPTX::LD_i8_areg;
- break;
- case MVT::i16:
- Opcode = NVPTX::LD_i16_areg;
- break;
- case MVT::i32:
- Opcode = NVPTX::LD_i32_areg;
- break;
- case MVT::i64:
- Opcode = NVPTX::LD_i64_areg;
- break;
- case MVT::f32:
- Opcode = NVPTX::LD_f32_areg;
- break;
- case MVT::f64:
- Opcode = NVPTX::LD_f64_areg;
- break;
- default:
- return false;
- }
- }
+ if (TM.is64Bit())
+ Opcode = pickOpcodeForVT(
+ TargetVT, NVPTX::LD_i8_areg_64, NVPTX::LD_i16_areg_64,
+ NVPTX::LD_i32_areg_64, NVPTX::LD_i64_areg_64, NVPTX::LD_f16_areg_64,
+ NVPTX::LD_f16x2_areg_64, NVPTX::LD_f32_areg_64,
+ NVPTX::LD_f64_areg_64);
+ else
+ Opcode = pickOpcodeForVT(
+ TargetVT, NVPTX::LD_i8_areg, NVPTX::LD_i16_areg, NVPTX::LD_i32_areg,
+ NVPTX::LD_i64_areg, NVPTX::LD_f16_areg, NVPTX::LD_f16x2_areg,
+ NVPTX::LD_f32_areg, NVPTX::LD_f64_areg);
+ if (!Opcode)
+ return false;
SDValue Ops[] = { getI32Imm(isVolatile, dl), getI32Imm(codeAddrSpace, dl),
getI32Imm(vecType, dl), getI32Imm(fromType, dl),
getI32Imm(fromTypeWidth, dl), N1, Chain };
- NVPTXLD = CurDAG->getMachineNode(Opcode, dl, TargetVT, MVT::Other, Ops);
+ NVPTXLD = CurDAG->getMachineNode(Opcode.getValue(), dl, TargetVT,
+ MVT::Other, Ops);
}
if (!NVPTXLD)
@@ -925,7 +965,7 @@ bool NVPTXDAGToDAGISel::tryLoadVector(SDNode *N) {
SDValue Chain = N->getOperand(0);
SDValue Op1 = N->getOperand(1);
SDValue Addr, Offset, Base;
- unsigned Opcode;
+ Optional<unsigned> Opcode;
SDLoc DL(N);
SDNode *LD;
MemSDNode *MemSD = cast<MemSDNode>(N);
@@ -968,7 +1008,8 @@ bool NVPTXDAGToDAGISel::tryLoadVector(SDNode *N) {
if (ExtensionType == ISD::SEXTLOAD)
FromType = NVPTX::PTXLdStInstCode::Signed;
else if (ScalarVT.isFloatingPoint())
- FromType = NVPTX::PTXLdStInstCode::Float;
+ FromType = ScalarVT.SimpleTy == MVT::f16 ? NVPTX::PTXLdStInstCode::Untyped
+ : NVPTX::PTXLdStInstCode::Float;
else
FromType = NVPTX::PTXLdStInstCode::Unsigned;
@@ -987,111 +1028,67 @@ bool NVPTXDAGToDAGISel::tryLoadVector(SDNode *N) {
EVT EltVT = N->getValueType(0);
+  // v8f16 is a special case. PTX doesn't have an ld.v8.f16
+  // instruction. Instead, we split the vector into v2f16 chunks and
+  // load them with ld.v4.b32.
+ if (EltVT == MVT::v2f16) {
+ assert(N->getOpcode() == NVPTXISD::LoadV4 && "Unexpected load opcode.");
+ EltVT = MVT::i32;
+ FromType = NVPTX::PTXLdStInstCode::Untyped;
+ FromTypeWidth = 32;
+ }
+
if (SelectDirectAddr(Op1, Addr)) {
switch (N->getOpcode()) {
default:
return false;
case NVPTXISD::LoadV2:
- switch (EltVT.getSimpleVT().SimpleTy) {
- default:
- return false;
- case MVT::i8:
- Opcode = NVPTX::LDV_i8_v2_avar;
- break;
- case MVT::i16:
- Opcode = NVPTX::LDV_i16_v2_avar;
- break;
- case MVT::i32:
- Opcode = NVPTX::LDV_i32_v2_avar;
- break;
- case MVT::i64:
- Opcode = NVPTX::LDV_i64_v2_avar;
- break;
- case MVT::f32:
- Opcode = NVPTX::LDV_f32_v2_avar;
- break;
- case MVT::f64:
- Opcode = NVPTX::LDV_f64_v2_avar;
- break;
- }
+ Opcode = pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy,
+ NVPTX::LDV_i8_v2_avar, NVPTX::LDV_i16_v2_avar,
+ NVPTX::LDV_i32_v2_avar, NVPTX::LDV_i64_v2_avar,
+ NVPTX::LDV_f16_v2_avar, NVPTX::LDV_f16x2_v2_avar,
+ NVPTX::LDV_f32_v2_avar, NVPTX::LDV_f64_v2_avar);
break;
case NVPTXISD::LoadV4:
- switch (EltVT.getSimpleVT().SimpleTy) {
- default:
- return false;
- case MVT::i8:
- Opcode = NVPTX::LDV_i8_v4_avar;
- break;
- case MVT::i16:
- Opcode = NVPTX::LDV_i16_v4_avar;
- break;
- case MVT::i32:
- Opcode = NVPTX::LDV_i32_v4_avar;
- break;
- case MVT::f32:
- Opcode = NVPTX::LDV_f32_v4_avar;
- break;
- }
+ Opcode =
+ pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy, NVPTX::LDV_i8_v4_avar,
+ NVPTX::LDV_i16_v4_avar, NVPTX::LDV_i32_v4_avar, None,
+ NVPTX::LDV_f16_v4_avar, NVPTX::LDV_f16x2_v4_avar,
+ NVPTX::LDV_f32_v4_avar, None);
break;
}
-
+ if (!Opcode)
+ return false;
SDValue Ops[] = { getI32Imm(IsVolatile, DL), getI32Imm(CodeAddrSpace, DL),
getI32Imm(VecType, DL), getI32Imm(FromType, DL),
getI32Imm(FromTypeWidth, DL), Addr, Chain };
- LD = CurDAG->getMachineNode(Opcode, DL, N->getVTList(), Ops);
+ LD = CurDAG->getMachineNode(Opcode.getValue(), DL, N->getVTList(), Ops);
} else if (TM.is64Bit() ? SelectADDRsi64(Op1.getNode(), Op1, Base, Offset)
: SelectADDRsi(Op1.getNode(), Op1, Base, Offset)) {
switch (N->getOpcode()) {
default:
return false;
case NVPTXISD::LoadV2:
- switch (EltVT.getSimpleVT().SimpleTy) {
- default:
- return false;
- case MVT::i8:
- Opcode = NVPTX::LDV_i8_v2_asi;
- break;
- case MVT::i16:
- Opcode = NVPTX::LDV_i16_v2_asi;
- break;
- case MVT::i32:
- Opcode = NVPTX::LDV_i32_v2_asi;
- break;
- case MVT::i64:
- Opcode = NVPTX::LDV_i64_v2_asi;
- break;
- case MVT::f32:
- Opcode = NVPTX::LDV_f32_v2_asi;
- break;
- case MVT::f64:
- Opcode = NVPTX::LDV_f64_v2_asi;
- break;
- }
+ Opcode = pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy,
+ NVPTX::LDV_i8_v2_asi, NVPTX::LDV_i16_v2_asi,
+ NVPTX::LDV_i32_v2_asi, NVPTX::LDV_i64_v2_asi,
+ NVPTX::LDV_f16_v2_asi, NVPTX::LDV_f16x2_v2_asi,
+ NVPTX::LDV_f32_v2_asi, NVPTX::LDV_f64_v2_asi);
break;
case NVPTXISD::LoadV4:
- switch (EltVT.getSimpleVT().SimpleTy) {
- default:
- return false;
- case MVT::i8:
- Opcode = NVPTX::LDV_i8_v4_asi;
- break;
- case MVT::i16:
- Opcode = NVPTX::LDV_i16_v4_asi;
- break;
- case MVT::i32:
- Opcode = NVPTX::LDV_i32_v4_asi;
- break;
- case MVT::f32:
- Opcode = NVPTX::LDV_f32_v4_asi;
- break;
- }
+ Opcode =
+ pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy, NVPTX::LDV_i8_v4_asi,
+ NVPTX::LDV_i16_v4_asi, NVPTX::LDV_i32_v4_asi, None,
+ NVPTX::LDV_f16_v4_asi, NVPTX::LDV_f16x2_v4_asi,
+ NVPTX::LDV_f32_v4_asi, None);
break;
}
-
+ if (!Opcode)
+ return false;
SDValue Ops[] = { getI32Imm(IsVolatile, DL), getI32Imm(CodeAddrSpace, DL),
getI32Imm(VecType, DL), getI32Imm(FromType, DL),
getI32Imm(FromTypeWidth, DL), Base, Offset, Chain };
- LD = CurDAG->getMachineNode(Opcode, DL, N->getVTList(), Ops);
+ LD = CurDAG->getMachineNode(Opcode.getValue(), DL, N->getVTList(), Ops);
} else if (TM.is64Bit() ? SelectADDRri64(Op1.getNode(), Op1, Base, Offset)
: SelectADDRri(Op1.getNode(), Op1, Base, Offset)) {
if (TM.is64Bit()) {
@@ -1099,46 +1096,19 @@ bool NVPTXDAGToDAGISel::tryLoadVector(SDNode *N) {
default:
return false;
case NVPTXISD::LoadV2:
- switch (EltVT.getSimpleVT().SimpleTy) {
- default:
- return false;
- case MVT::i8:
- Opcode = NVPTX::LDV_i8_v2_ari_64;
- break;
- case MVT::i16:
- Opcode = NVPTX::LDV_i16_v2_ari_64;
- break;
- case MVT::i32:
- Opcode = NVPTX::LDV_i32_v2_ari_64;
- break;
- case MVT::i64:
- Opcode = NVPTX::LDV_i64_v2_ari_64;
- break;
- case MVT::f32:
- Opcode = NVPTX::LDV_f32_v2_ari_64;
- break;
- case MVT::f64:
- Opcode = NVPTX::LDV_f64_v2_ari_64;
- break;
- }
+ Opcode = pickOpcodeForVT(
+ EltVT.getSimpleVT().SimpleTy, NVPTX::LDV_i8_v2_ari_64,
+ NVPTX::LDV_i16_v2_ari_64, NVPTX::LDV_i32_v2_ari_64,
+ NVPTX::LDV_i64_v2_ari_64, NVPTX::LDV_f16_v2_ari_64,
+ NVPTX::LDV_f16x2_v2_ari_64, NVPTX::LDV_f32_v2_ari_64,
+ NVPTX::LDV_f64_v2_ari_64);
break;
case NVPTXISD::LoadV4:
- switch (EltVT.getSimpleVT().SimpleTy) {
- default:
- return false;
- case MVT::i8:
- Opcode = NVPTX::LDV_i8_v4_ari_64;
- break;
- case MVT::i16:
- Opcode = NVPTX::LDV_i16_v4_ari_64;
- break;
- case MVT::i32:
- Opcode = NVPTX::LDV_i32_v4_ari_64;
- break;
- case MVT::f32:
- Opcode = NVPTX::LDV_f32_v4_ari_64;
- break;
- }
+ Opcode = pickOpcodeForVT(
+ EltVT.getSimpleVT().SimpleTy, NVPTX::LDV_i8_v4_ari_64,
+ NVPTX::LDV_i16_v4_ari_64, NVPTX::LDV_i32_v4_ari_64, None,
+ NVPTX::LDV_f16_v4_ari_64, NVPTX::LDV_f16x2_v4_ari_64,
+ NVPTX::LDV_f32_v4_ari_64, None);
break;
}
} else {
@@ -1146,101 +1116,47 @@ bool NVPTXDAGToDAGISel::tryLoadVector(SDNode *N) {
default:
return false;
case NVPTXISD::LoadV2:
- switch (EltVT.getSimpleVT().SimpleTy) {
- default:
- return false;
- case MVT::i8:
- Opcode = NVPTX::LDV_i8_v2_ari;
- break;
- case MVT::i16:
- Opcode = NVPTX::LDV_i16_v2_ari;
- break;
- case MVT::i32:
- Opcode = NVPTX::LDV_i32_v2_ari;
- break;
- case MVT::i64:
- Opcode = NVPTX::LDV_i64_v2_ari;
- break;
- case MVT::f32:
- Opcode = NVPTX::LDV_f32_v2_ari;
- break;
- case MVT::f64:
- Opcode = NVPTX::LDV_f64_v2_ari;
- break;
- }
+ Opcode = pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy,
+ NVPTX::LDV_i8_v2_ari, NVPTX::LDV_i16_v2_ari,
+ NVPTX::LDV_i32_v2_ari, NVPTX::LDV_i64_v2_ari,
+ NVPTX::LDV_f16_v2_ari, NVPTX::LDV_f16x2_v2_ari,
+ NVPTX::LDV_f32_v2_ari, NVPTX::LDV_f64_v2_ari);
break;
case NVPTXISD::LoadV4:
- switch (EltVT.getSimpleVT().SimpleTy) {
- default:
- return false;
- case MVT::i8:
- Opcode = NVPTX::LDV_i8_v4_ari;
- break;
- case MVT::i16:
- Opcode = NVPTX::LDV_i16_v4_ari;
- break;
- case MVT::i32:
- Opcode = NVPTX::LDV_i32_v4_ari;
- break;
- case MVT::f32:
- Opcode = NVPTX::LDV_f32_v4_ari;
- break;
- }
+ Opcode =
+ pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy, NVPTX::LDV_i8_v4_ari,
+ NVPTX::LDV_i16_v4_ari, NVPTX::LDV_i32_v4_ari, None,
+ NVPTX::LDV_f16_v4_ari, NVPTX::LDV_f16x2_v4_ari,
+ NVPTX::LDV_f32_v4_ari, None);
break;
}
}
-
+ if (!Opcode)
+ return false;
SDValue Ops[] = { getI32Imm(IsVolatile, DL), getI32Imm(CodeAddrSpace, DL),
getI32Imm(VecType, DL), getI32Imm(FromType, DL),
getI32Imm(FromTypeWidth, DL), Base, Offset, Chain };
- LD = CurDAG->getMachineNode(Opcode, DL, N->getVTList(), Ops);
+ LD = CurDAG->getMachineNode(Opcode.getValue(), DL, N->getVTList(), Ops);
} else {
if (TM.is64Bit()) {
switch (N->getOpcode()) {
default:
return false;
case NVPTXISD::LoadV2:
- switch (EltVT.getSimpleVT().SimpleTy) {
- default:
- return false;
- case MVT::i8:
- Opcode = NVPTX::LDV_i8_v2_areg_64;
- break;
- case MVT::i16:
- Opcode = NVPTX::LDV_i16_v2_areg_64;
- break;
- case MVT::i32:
- Opcode = NVPTX::LDV_i32_v2_areg_64;
- break;
- case MVT::i64:
- Opcode = NVPTX::LDV_i64_v2_areg_64;
- break;
- case MVT::f32:
- Opcode = NVPTX::LDV_f32_v2_areg_64;
- break;
- case MVT::f64:
- Opcode = NVPTX::LDV_f64_v2_areg_64;
- break;
- }
+ Opcode = pickOpcodeForVT(
+ EltVT.getSimpleVT().SimpleTy, NVPTX::LDV_i8_v2_areg_64,
+ NVPTX::LDV_i16_v2_areg_64, NVPTX::LDV_i32_v2_areg_64,
+ NVPTX::LDV_i64_v2_areg_64, NVPTX::LDV_f16_v2_areg_64,
+ NVPTX::LDV_f16x2_v2_areg_64, NVPTX::LDV_f32_v2_areg_64,
+ NVPTX::LDV_f64_v2_areg_64);
break;
case NVPTXISD::LoadV4:
- switch (EltVT.getSimpleVT().SimpleTy) {
- default:
- return false;
- case MVT::i8:
- Opcode = NVPTX::LDV_i8_v4_areg_64;
- break;
- case MVT::i16:
- Opcode = NVPTX::LDV_i16_v4_areg_64;
- break;
- case MVT::i32:
- Opcode = NVPTX::LDV_i32_v4_areg_64;
- break;
- case MVT::f32:
- Opcode = NVPTX::LDV_f32_v4_areg_64;
- break;
- }
+ Opcode = pickOpcodeForVT(
+ EltVT.getSimpleVT().SimpleTy, NVPTX::LDV_i8_v4_areg_64,
+ NVPTX::LDV_i16_v4_areg_64, NVPTX::LDV_i32_v4_areg_64, None,
+ NVPTX::LDV_f16_v4_areg_64, NVPTX::LDV_f16x2_v4_areg_64,
+ NVPTX::LDV_f32_v4_areg_64, None);
break;
}
} else {
@@ -1248,54 +1164,28 @@ bool NVPTXDAGToDAGISel::tryLoadVector(SDNode *N) {
default:
return false;
case NVPTXISD::LoadV2:
- switch (EltVT.getSimpleVT().SimpleTy) {
- default:
- return false;
- case MVT::i8:
- Opcode = NVPTX::LDV_i8_v2_areg;
- break;
- case MVT::i16:
- Opcode = NVPTX::LDV_i16_v2_areg;
- break;
- case MVT::i32:
- Opcode = NVPTX::LDV_i32_v2_areg;
- break;
- case MVT::i64:
- Opcode = NVPTX::LDV_i64_v2_areg;
- break;
- case MVT::f32:
- Opcode = NVPTX::LDV_f32_v2_areg;
- break;
- case MVT::f64:
- Opcode = NVPTX::LDV_f64_v2_areg;
- break;
- }
+ Opcode =
+ pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy, NVPTX::LDV_i8_v2_areg,
+ NVPTX::LDV_i16_v2_areg, NVPTX::LDV_i32_v2_areg,
+ NVPTX::LDV_i64_v2_areg, NVPTX::LDV_f16_v2_areg,
+ NVPTX::LDV_f16x2_v2_areg, NVPTX::LDV_f32_v2_areg,
+ NVPTX::LDV_f64_v2_areg);
break;
case NVPTXISD::LoadV4:
- switch (EltVT.getSimpleVT().SimpleTy) {
- default:
- return false;
- case MVT::i8:
- Opcode = NVPTX::LDV_i8_v4_areg;
- break;
- case MVT::i16:
- Opcode = NVPTX::LDV_i16_v4_areg;
- break;
- case MVT::i32:
- Opcode = NVPTX::LDV_i32_v4_areg;
- break;
- case MVT::f32:
- Opcode = NVPTX::LDV_f32_v4_areg;
- break;
- }
+ Opcode = pickOpcodeForVT(
+ EltVT.getSimpleVT().SimpleTy, NVPTX::LDV_i8_v4_areg,
+ NVPTX::LDV_i16_v4_areg, NVPTX::LDV_i32_v4_areg, None,
+ NVPTX::LDV_f16_v4_areg, NVPTX::LDV_f16x2_v4_areg,
+ NVPTX::LDV_f32_v4_areg, None);
break;
}
}
-
+ if (!Opcode)
+ return false;
SDValue Ops[] = { getI32Imm(IsVolatile, DL), getI32Imm(CodeAddrSpace, DL),
getI32Imm(VecType, DL), getI32Imm(FromType, DL),
getI32Imm(FromTypeWidth, DL), Op1, Chain };
- LD = CurDAG->getMachineNode(Opcode, DL, N->getVTList(), Ops);
+ LD = CurDAG->getMachineNode(Opcode.getValue(), DL, N->getVTList(), Ops);
}
MachineSDNode::mmo_iterator MemRefs0 = MF->allocateMemRefsArray(1);
@@ -1338,7 +1228,7 @@ bool NVPTXDAGToDAGISel::tryLDGLDU(SDNode *N) {
Mem = cast<MemSDNode>(N);
}
- unsigned Opcode;
+ Optional<unsigned> Opcode;
SDLoc DL(N);
SDNode *LD;
SDValue Base, Offset, Addr;
@@ -1366,142 +1256,72 @@ bool NVPTXDAGToDAGISel::tryLDGLDU(SDNode *N) {
default:
return false;
case ISD::INTRINSIC_W_CHAIN:
- if (IsLDG) {
- switch (EltVT.getSimpleVT().SimpleTy) {
- default:
- return false;
- case MVT::i8:
- Opcode = NVPTX::INT_PTX_LDG_GLOBAL_i8avar;
- break;
- case MVT::i16:
- Opcode = NVPTX::INT_PTX_LDG_GLOBAL_i16avar;
- break;
- case MVT::i32:
- Opcode = NVPTX::INT_PTX_LDG_GLOBAL_i32avar;
- break;
- case MVT::i64:
- Opcode = NVPTX::INT_PTX_LDG_GLOBAL_i64avar;
- break;
- case MVT::f32:
- Opcode = NVPTX::INT_PTX_LDG_GLOBAL_f32avar;
- break;
- case MVT::f64:
- Opcode = NVPTX::INT_PTX_LDG_GLOBAL_f64avar;
- break;
- }
- } else {
- switch (EltVT.getSimpleVT().SimpleTy) {
- default:
- return false;
- case MVT::i8:
- Opcode = NVPTX::INT_PTX_LDU_GLOBAL_i8avar;
- break;
- case MVT::i16:
- Opcode = NVPTX::INT_PTX_LDU_GLOBAL_i16avar;
- break;
- case MVT::i32:
- Opcode = NVPTX::INT_PTX_LDU_GLOBAL_i32avar;
- break;
- case MVT::i64:
- Opcode = NVPTX::INT_PTX_LDU_GLOBAL_i64avar;
- break;
- case MVT::f32:
- Opcode = NVPTX::INT_PTX_LDU_GLOBAL_f32avar;
- break;
- case MVT::f64:
- Opcode = NVPTX::INT_PTX_LDU_GLOBAL_f64avar;
- break;
- }
- }
+ if (IsLDG)
+ Opcode = pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy,
+ NVPTX::INT_PTX_LDG_GLOBAL_i8avar,
+ NVPTX::INT_PTX_LDG_GLOBAL_i16avar,
+ NVPTX::INT_PTX_LDG_GLOBAL_i32avar,
+ NVPTX::INT_PTX_LDG_GLOBAL_i64avar,
+ NVPTX::INT_PTX_LDG_GLOBAL_f16avar,
+ NVPTX::INT_PTX_LDG_GLOBAL_f16x2avar,
+ NVPTX::INT_PTX_LDG_GLOBAL_f32avar,
+ NVPTX::INT_PTX_LDG_GLOBAL_f64avar);
+ else
+ Opcode = pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy,
+ NVPTX::INT_PTX_LDU_GLOBAL_i8avar,
+ NVPTX::INT_PTX_LDU_GLOBAL_i16avar,
+ NVPTX::INT_PTX_LDU_GLOBAL_i32avar,
+ NVPTX::INT_PTX_LDU_GLOBAL_i64avar,
+ NVPTX::INT_PTX_LDU_GLOBAL_f16avar,
+ NVPTX::INT_PTX_LDU_GLOBAL_f16x2avar,
+ NVPTX::INT_PTX_LDU_GLOBAL_f32avar,
+ NVPTX::INT_PTX_LDU_GLOBAL_f64avar);
break;
case NVPTXISD::LDGV2:
- switch (EltVT.getSimpleVT().SimpleTy) {
- default:
- return false;
- case MVT::i8:
- Opcode = NVPTX::INT_PTX_LDG_G_v2i8_ELE_avar;
- break;
- case MVT::i16:
- Opcode = NVPTX::INT_PTX_LDG_G_v2i16_ELE_avar;
- break;
- case MVT::i32:
- Opcode = NVPTX::INT_PTX_LDG_G_v2i32_ELE_avar;
- break;
- case MVT::i64:
- Opcode = NVPTX::INT_PTX_LDG_G_v2i64_ELE_avar;
- break;
- case MVT::f32:
- Opcode = NVPTX::INT_PTX_LDG_G_v2f32_ELE_avar;
- break;
- case MVT::f64:
- Opcode = NVPTX::INT_PTX_LDG_G_v2f64_ELE_avar;
- break;
- }
+ Opcode = pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy,
+ NVPTX::INT_PTX_LDG_G_v2i8_ELE_avar,
+ NVPTX::INT_PTX_LDG_G_v2i16_ELE_avar,
+ NVPTX::INT_PTX_LDG_G_v2i32_ELE_avar,
+ NVPTX::INT_PTX_LDG_G_v2i64_ELE_avar,
+ NVPTX::INT_PTX_LDG_G_v2f16_ELE_avar,
+ NVPTX::INT_PTX_LDG_G_v2f16x2_ELE_avar,
+ NVPTX::INT_PTX_LDG_G_v2f32_ELE_avar,
+ NVPTX::INT_PTX_LDG_G_v2f64_ELE_avar);
break;
case NVPTXISD::LDUV2:
- switch (EltVT.getSimpleVT().SimpleTy) {
- default:
- return false;
- case MVT::i8:
- Opcode = NVPTX::INT_PTX_LDU_G_v2i8_ELE_avar;
- break;
- case MVT::i16:
- Opcode = NVPTX::INT_PTX_LDU_G_v2i16_ELE_avar;
- break;
- case MVT::i32:
- Opcode = NVPTX::INT_PTX_LDU_G_v2i32_ELE_avar;
- break;
- case MVT::i64:
- Opcode = NVPTX::INT_PTX_LDU_G_v2i64_ELE_avar;
- break;
- case MVT::f32:
- Opcode = NVPTX::INT_PTX_LDU_G_v2f32_ELE_avar;
- break;
- case MVT::f64:
- Opcode = NVPTX::INT_PTX_LDU_G_v2f64_ELE_avar;
- break;
- }
+ Opcode = pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy,
+ NVPTX::INT_PTX_LDU_G_v2i8_ELE_avar,
+ NVPTX::INT_PTX_LDU_G_v2i16_ELE_avar,
+ NVPTX::INT_PTX_LDU_G_v2i32_ELE_avar,
+ NVPTX::INT_PTX_LDU_G_v2i64_ELE_avar,
+ NVPTX::INT_PTX_LDU_G_v2f16_ELE_avar,
+ NVPTX::INT_PTX_LDU_G_v2f16x2_ELE_avar,
+ NVPTX::INT_PTX_LDU_G_v2f32_ELE_avar,
+ NVPTX::INT_PTX_LDU_G_v2f64_ELE_avar);
break;
case NVPTXISD::LDGV4:
- switch (EltVT.getSimpleVT().SimpleTy) {
- default:
- return false;
- case MVT::i8:
- Opcode = NVPTX::INT_PTX_LDG_G_v4i8_ELE_avar;
- break;
- case MVT::i16:
- Opcode = NVPTX::INT_PTX_LDG_G_v4i16_ELE_avar;
- break;
- case MVT::i32:
- Opcode = NVPTX::INT_PTX_LDG_G_v4i32_ELE_avar;
- break;
- case MVT::f32:
- Opcode = NVPTX::INT_PTX_LDG_G_v4f32_ELE_avar;
- break;
- }
+ Opcode = pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy,
+ NVPTX::INT_PTX_LDG_G_v4i8_ELE_avar,
+ NVPTX::INT_PTX_LDG_G_v4i16_ELE_avar,
+ NVPTX::INT_PTX_LDG_G_v4i32_ELE_avar, None,
+ NVPTX::INT_PTX_LDG_G_v4f16_ELE_avar,
+ NVPTX::INT_PTX_LDG_G_v4f16x2_ELE_avar,
+ NVPTX::INT_PTX_LDG_G_v4f32_ELE_avar, None);
break;
case NVPTXISD::LDUV4:
- switch (EltVT.getSimpleVT().SimpleTy) {
- default:
- return false;
- case MVT::i8:
- Opcode = NVPTX::INT_PTX_LDU_G_v4i8_ELE_avar;
- break;
- case MVT::i16:
- Opcode = NVPTX::INT_PTX_LDU_G_v4i16_ELE_avar;
- break;
- case MVT::i32:
- Opcode = NVPTX::INT_PTX_LDU_G_v4i32_ELE_avar;
- break;
- case MVT::f32:
- Opcode = NVPTX::INT_PTX_LDU_G_v4f32_ELE_avar;
- break;
- }
+ Opcode = pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy,
+ NVPTX::INT_PTX_LDU_G_v4i8_ELE_avar,
+ NVPTX::INT_PTX_LDU_G_v4i16_ELE_avar,
+ NVPTX::INT_PTX_LDU_G_v4i32_ELE_avar, None,
+ NVPTX::INT_PTX_LDU_G_v4f16_ELE_avar,
+ NVPTX::INT_PTX_LDU_G_v4f16x2_ELE_avar,
+ NVPTX::INT_PTX_LDU_G_v4f32_ELE_avar, None);
break;
}
-
+ if (!Opcode)
+ return false;
SDValue Ops[] = { Addr, Chain };
- LD = CurDAG->getMachineNode(Opcode, DL, InstVTList, Ops);
+ LD = CurDAG->getMachineNode(Opcode.getValue(), DL, InstVTList, Ops);
} else if (TM.is64Bit() ? SelectADDRri64(Op1.getNode(), Op1, Base, Offset)
: SelectADDRri(Op1.getNode(), Op1, Base, Offset)) {
if (TM.is64Bit()) {
@@ -1510,139 +1330,68 @@ bool NVPTXDAGToDAGISel::tryLDGLDU(SDNode *N) {
return false;
case ISD::LOAD:
case ISD::INTRINSIC_W_CHAIN:
- if (IsLDG) {
- switch (EltVT.getSimpleVT().SimpleTy) {
- default:
- return false;
- case MVT::i8:
- Opcode = NVPTX::INT_PTX_LDG_GLOBAL_i8ari64;
- break;
- case MVT::i16:
- Opcode = NVPTX::INT_PTX_LDG_GLOBAL_i16ari64;
- break;
- case MVT::i32:
- Opcode = NVPTX::INT_PTX_LDG_GLOBAL_i32ari64;
- break;
- case MVT::i64:
- Opcode = NVPTX::INT_PTX_LDG_GLOBAL_i64ari64;
- break;
- case MVT::f32:
- Opcode = NVPTX::INT_PTX_LDG_GLOBAL_f32ari64;
- break;
- case MVT::f64:
- Opcode = NVPTX::INT_PTX_LDG_GLOBAL_f64ari64;
- break;
- }
- } else {
- switch (EltVT.getSimpleVT().SimpleTy) {
- default:
- return false;
- case MVT::i8:
- Opcode = NVPTX::INT_PTX_LDU_GLOBAL_i8ari64;
- break;
- case MVT::i16:
- Opcode = NVPTX::INT_PTX_LDU_GLOBAL_i16ari64;
- break;
- case MVT::i32:
- Opcode = NVPTX::INT_PTX_LDU_GLOBAL_i32ari64;
- break;
- case MVT::i64:
- Opcode = NVPTX::INT_PTX_LDU_GLOBAL_i64ari64;
- break;
- case MVT::f32:
- Opcode = NVPTX::INT_PTX_LDU_GLOBAL_f32ari64;
- break;
- case MVT::f64:
- Opcode = NVPTX::INT_PTX_LDU_GLOBAL_f64ari64;
- break;
- }
- }
+ if (IsLDG)
+ Opcode = pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy,
+ NVPTX::INT_PTX_LDG_GLOBAL_i8ari64,
+ NVPTX::INT_PTX_LDG_GLOBAL_i16ari64,
+ NVPTX::INT_PTX_LDG_GLOBAL_i32ari64,
+ NVPTX::INT_PTX_LDG_GLOBAL_i64ari64,
+ NVPTX::INT_PTX_LDG_GLOBAL_f16ari64,
+ NVPTX::INT_PTX_LDG_GLOBAL_f16x2ari64,
+ NVPTX::INT_PTX_LDG_GLOBAL_f32ari64,
+ NVPTX::INT_PTX_LDG_GLOBAL_f64ari64);
+ else
+ Opcode = pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy,
+ NVPTX::INT_PTX_LDU_GLOBAL_i8ari64,
+ NVPTX::INT_PTX_LDU_GLOBAL_i16ari64,
+ NVPTX::INT_PTX_LDU_GLOBAL_i32ari64,
+ NVPTX::INT_PTX_LDU_GLOBAL_i64ari64,
+ NVPTX::INT_PTX_LDU_GLOBAL_f16ari64,
+ NVPTX::INT_PTX_LDU_GLOBAL_f16x2ari64,
+ NVPTX::INT_PTX_LDU_GLOBAL_f32ari64,
+ NVPTX::INT_PTX_LDU_GLOBAL_f64ari64);
break;
case NVPTXISD::LoadV2:
case NVPTXISD::LDGV2:
- switch (EltVT.getSimpleVT().SimpleTy) {
- default:
- return false;
- case MVT::i8:
- Opcode = NVPTX::INT_PTX_LDG_G_v2i8_ELE_ari64;
- break;
- case MVT::i16:
- Opcode = NVPTX::INT_PTX_LDG_G_v2i16_ELE_ari64;
- break;
- case MVT::i32:
- Opcode = NVPTX::INT_PTX_LDG_G_v2i32_ELE_ari64;
- break;
- case MVT::i64:
- Opcode = NVPTX::INT_PTX_LDG_G_v2i64_ELE_ari64;
- break;
- case MVT::f32:
- Opcode = NVPTX::INT_PTX_LDG_G_v2f32_ELE_ari64;
- break;
- case MVT::f64:
- Opcode = NVPTX::INT_PTX_LDG_G_v2f64_ELE_ari64;
- break;
- }
+ Opcode = pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy,
+ NVPTX::INT_PTX_LDG_G_v2i8_ELE_ari64,
+ NVPTX::INT_PTX_LDG_G_v2i16_ELE_ari64,
+ NVPTX::INT_PTX_LDG_G_v2i32_ELE_ari64,
+ NVPTX::INT_PTX_LDG_G_v2i64_ELE_ari64,
+ NVPTX::INT_PTX_LDG_G_v2f16_ELE_ari64,
+ NVPTX::INT_PTX_LDG_G_v2f16x2_ELE_ari64,
+ NVPTX::INT_PTX_LDG_G_v2f32_ELE_ari64,
+ NVPTX::INT_PTX_LDG_G_v2f64_ELE_ari64);
break;
case NVPTXISD::LDUV2:
- switch (EltVT.getSimpleVT().SimpleTy) {
- default:
- return false;
- case MVT::i8:
- Opcode = NVPTX::INT_PTX_LDU_G_v2i8_ELE_ari64;
- break;
- case MVT::i16:
- Opcode = NVPTX::INT_PTX_LDU_G_v2i16_ELE_ari64;
- break;
- case MVT::i32:
- Opcode = NVPTX::INT_PTX_LDU_G_v2i32_ELE_ari64;
- break;
- case MVT::i64:
- Opcode = NVPTX::INT_PTX_LDU_G_v2i64_ELE_ari64;
- break;
- case MVT::f32:
- Opcode = NVPTX::INT_PTX_LDU_G_v2f32_ELE_ari64;
- break;
- case MVT::f64:
- Opcode = NVPTX::INT_PTX_LDU_G_v2f64_ELE_ari64;
- break;
- }
+ Opcode = pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy,
+ NVPTX::INT_PTX_LDU_G_v2i8_ELE_ari64,
+ NVPTX::INT_PTX_LDU_G_v2i16_ELE_ari64,
+ NVPTX::INT_PTX_LDU_G_v2i32_ELE_ari64,
+ NVPTX::INT_PTX_LDU_G_v2i64_ELE_ari64,
+ NVPTX::INT_PTX_LDU_G_v2f16_ELE_ari64,
+ NVPTX::INT_PTX_LDU_G_v2f16x2_ELE_ari64,
+ NVPTX::INT_PTX_LDU_G_v2f32_ELE_ari64,
+ NVPTX::INT_PTX_LDU_G_v2f64_ELE_ari64);
break;
case NVPTXISD::LoadV4:
case NVPTXISD::LDGV4:
- switch (EltVT.getSimpleVT().SimpleTy) {
- default:
- return false;
- case MVT::i8:
- Opcode = NVPTX::INT_PTX_LDG_G_v4i8_ELE_ari64;
- break;
- case MVT::i16:
- Opcode = NVPTX::INT_PTX_LDG_G_v4i16_ELE_ari64;
- break;
- case MVT::i32:
- Opcode = NVPTX::INT_PTX_LDG_G_v4i32_ELE_ari64;
- break;
- case MVT::f32:
- Opcode = NVPTX::INT_PTX_LDG_G_v4f32_ELE_ari64;
- break;
- }
+ Opcode = pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy,
+ NVPTX::INT_PTX_LDG_G_v4i8_ELE_ari64,
+ NVPTX::INT_PTX_LDG_G_v4i16_ELE_ari64,
+ NVPTX::INT_PTX_LDG_G_v4i32_ELE_ari64, None,
+ NVPTX::INT_PTX_LDG_G_v4f16_ELE_ari64,
+ NVPTX::INT_PTX_LDG_G_v4f16x2_ELE_ari64,
+ NVPTX::INT_PTX_LDG_G_v4f32_ELE_ari64, None);
break;
case NVPTXISD::LDUV4:
- switch (EltVT.getSimpleVT().SimpleTy) {
- default:
- return false;
- case MVT::i8:
- Opcode = NVPTX::INT_PTX_LDU_G_v4i8_ELE_ari64;
- break;
- case MVT::i16:
- Opcode = NVPTX::INT_PTX_LDU_G_v4i16_ELE_ari64;
- break;
- case MVT::i32:
- Opcode = NVPTX::INT_PTX_LDU_G_v4i32_ELE_ari64;
- break;
- case MVT::f32:
- Opcode = NVPTX::INT_PTX_LDU_G_v4f32_ELE_ari64;
- break;
- }
+ Opcode = pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy,
+ NVPTX::INT_PTX_LDU_G_v4i8_ELE_ari64,
+ NVPTX::INT_PTX_LDU_G_v4i16_ELE_ari64,
+ NVPTX::INT_PTX_LDU_G_v4i32_ELE_ari64, None,
+ NVPTX::INT_PTX_LDU_G_v4f16_ELE_ari64,
+ NVPTX::INT_PTX_LDU_G_v4f16x2_ELE_ari64,
+ NVPTX::INT_PTX_LDU_G_v4f32_ELE_ari64, None);
break;
}
} else {
@@ -1651,146 +1400,75 @@ bool NVPTXDAGToDAGISel::tryLDGLDU(SDNode *N) {
return false;
case ISD::LOAD:
case ISD::INTRINSIC_W_CHAIN:
- if (IsLDG) {
- switch (EltVT.getSimpleVT().SimpleTy) {
- default:
- return false;
- case MVT::i8:
- Opcode = NVPTX::INT_PTX_LDG_GLOBAL_i8ari;
- break;
- case MVT::i16:
- Opcode = NVPTX::INT_PTX_LDG_GLOBAL_i16ari;
- break;
- case MVT::i32:
- Opcode = NVPTX::INT_PTX_LDG_GLOBAL_i32ari;
- break;
- case MVT::i64:
- Opcode = NVPTX::INT_PTX_LDG_GLOBAL_i64ari;
- break;
- case MVT::f32:
- Opcode = NVPTX::INT_PTX_LDG_GLOBAL_f32ari;
- break;
- case MVT::f64:
- Opcode = NVPTX::INT_PTX_LDG_GLOBAL_f64ari;
- break;
- }
- } else {
- switch (EltVT.getSimpleVT().SimpleTy) {
- default:
- return false;
- case MVT::i8:
- Opcode = NVPTX::INT_PTX_LDU_GLOBAL_i8ari;
- break;
- case MVT::i16:
- Opcode = NVPTX::INT_PTX_LDU_GLOBAL_i16ari;
- break;
- case MVT::i32:
- Opcode = NVPTX::INT_PTX_LDU_GLOBAL_i32ari;
- break;
- case MVT::i64:
- Opcode = NVPTX::INT_PTX_LDU_GLOBAL_i64ari;
- break;
- case MVT::f32:
- Opcode = NVPTX::INT_PTX_LDU_GLOBAL_f32ari;
- break;
- case MVT::f64:
- Opcode = NVPTX::INT_PTX_LDU_GLOBAL_f64ari;
- break;
- }
- }
+ if (IsLDG)
+ Opcode = pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy,
+ NVPTX::INT_PTX_LDG_GLOBAL_i8ari,
+ NVPTX::INT_PTX_LDG_GLOBAL_i16ari,
+ NVPTX::INT_PTX_LDG_GLOBAL_i32ari,
+ NVPTX::INT_PTX_LDG_GLOBAL_i64ari,
+ NVPTX::INT_PTX_LDG_GLOBAL_f16ari,
+ NVPTX::INT_PTX_LDG_GLOBAL_f16x2ari,
+ NVPTX::INT_PTX_LDG_GLOBAL_f32ari,
+ NVPTX::INT_PTX_LDG_GLOBAL_f64ari);
+ else
+ Opcode = pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy,
+ NVPTX::INT_PTX_LDU_GLOBAL_i8ari,
+ NVPTX::INT_PTX_LDU_GLOBAL_i16ari,
+ NVPTX::INT_PTX_LDU_GLOBAL_i32ari,
+ NVPTX::INT_PTX_LDU_GLOBAL_i64ari,
+ NVPTX::INT_PTX_LDU_GLOBAL_f16ari,
+ NVPTX::INT_PTX_LDU_GLOBAL_f16x2ari,
+ NVPTX::INT_PTX_LDU_GLOBAL_f32ari,
+ NVPTX::INT_PTX_LDU_GLOBAL_f64ari);
break;
case NVPTXISD::LoadV2:
case NVPTXISD::LDGV2:
- switch (EltVT.getSimpleVT().SimpleTy) {
- default:
- return false;
- case MVT::i8:
- Opcode = NVPTX::INT_PTX_LDG_G_v2i8_ELE_ari32;
- break;
- case MVT::i16:
- Opcode = NVPTX::INT_PTX_LDG_G_v2i16_ELE_ari32;
- break;
- case MVT::i32:
- Opcode = NVPTX::INT_PTX_LDG_G_v2i32_ELE_ari32;
- break;
- case MVT::i64:
- Opcode = NVPTX::INT_PTX_LDG_G_v2i64_ELE_ari32;
- break;
- case MVT::f32:
- Opcode = NVPTX::INT_PTX_LDG_G_v2f32_ELE_ari32;
- break;
- case MVT::f64:
- Opcode = NVPTX::INT_PTX_LDG_G_v2f64_ELE_ari32;
- break;
- }
+ Opcode = pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy,
+ NVPTX::INT_PTX_LDG_G_v2i8_ELE_ari32,
+ NVPTX::INT_PTX_LDG_G_v2i16_ELE_ari32,
+ NVPTX::INT_PTX_LDG_G_v2i32_ELE_ari32,
+ NVPTX::INT_PTX_LDG_G_v2i64_ELE_ari32,
+ NVPTX::INT_PTX_LDG_G_v2f16_ELE_ari32,
+ NVPTX::INT_PTX_LDG_G_v2f16x2_ELE_ari32,
+ NVPTX::INT_PTX_LDG_G_v2f32_ELE_ari32,
+ NVPTX::INT_PTX_LDG_G_v2f64_ELE_ari32);
break;
case NVPTXISD::LDUV2:
- switch (EltVT.getSimpleVT().SimpleTy) {
- default:
- return false;
- case MVT::i8:
- Opcode = NVPTX::INT_PTX_LDU_G_v2i8_ELE_ari32;
- break;
- case MVT::i16:
- Opcode = NVPTX::INT_PTX_LDU_G_v2i16_ELE_ari32;
- break;
- case MVT::i32:
- Opcode = NVPTX::INT_PTX_LDU_G_v2i32_ELE_ari32;
- break;
- case MVT::i64:
- Opcode = NVPTX::INT_PTX_LDU_G_v2i64_ELE_ari32;
- break;
- case MVT::f32:
- Opcode = NVPTX::INT_PTX_LDU_G_v2f32_ELE_ari32;
- break;
- case MVT::f64:
- Opcode = NVPTX::INT_PTX_LDU_G_v2f64_ELE_ari32;
- break;
- }
+ Opcode = pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy,
+ NVPTX::INT_PTX_LDU_G_v2i8_ELE_ari32,
+ NVPTX::INT_PTX_LDU_G_v2i16_ELE_ari32,
+ NVPTX::INT_PTX_LDU_G_v2i32_ELE_ari32,
+ NVPTX::INT_PTX_LDU_G_v2i64_ELE_ari32,
+ NVPTX::INT_PTX_LDU_G_v2f16_ELE_ari32,
+ NVPTX::INT_PTX_LDU_G_v2f16x2_ELE_ari32,
+ NVPTX::INT_PTX_LDU_G_v2f32_ELE_ari32,
+ NVPTX::INT_PTX_LDU_G_v2f64_ELE_ari32);
break;
case NVPTXISD::LoadV4:
case NVPTXISD::LDGV4:
- switch (EltVT.getSimpleVT().SimpleTy) {
- default:
- return false;
- case MVT::i8:
- Opcode = NVPTX::INT_PTX_LDG_G_v4i8_ELE_ari32;
- break;
- case MVT::i16:
- Opcode = NVPTX::INT_PTX_LDG_G_v4i16_ELE_ari32;
- break;
- case MVT::i32:
- Opcode = NVPTX::INT_PTX_LDG_G_v4i32_ELE_ari32;
- break;
- case MVT::f32:
- Opcode = NVPTX::INT_PTX_LDG_G_v4f32_ELE_ari32;
- break;
- }
+ Opcode = pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy,
+ NVPTX::INT_PTX_LDG_G_v4i8_ELE_ari32,
+ NVPTX::INT_PTX_LDG_G_v4i16_ELE_ari32,
+ NVPTX::INT_PTX_LDG_G_v4i32_ELE_ari32, None,
+ NVPTX::INT_PTX_LDG_G_v4f16_ELE_ari32,
+ NVPTX::INT_PTX_LDG_G_v4f16x2_ELE_ari32,
+ NVPTX::INT_PTX_LDG_G_v4f32_ELE_ari32, None);
break;
case NVPTXISD::LDUV4:
- switch (EltVT.getSimpleVT().SimpleTy) {
- default:
- return false;
- case MVT::i8:
- Opcode = NVPTX::INT_PTX_LDU_G_v4i8_ELE_ari32;
- break;
- case MVT::i16:
- Opcode = NVPTX::INT_PTX_LDU_G_v4i16_ELE_ari32;
- break;
- case MVT::i32:
- Opcode = NVPTX::INT_PTX_LDU_G_v4i32_ELE_ari32;
- break;
- case MVT::f32:
- Opcode = NVPTX::INT_PTX_LDU_G_v4f32_ELE_ari32;
- break;
- }
+ Opcode = pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy,
+ NVPTX::INT_PTX_LDU_G_v4i8_ELE_ari32,
+ NVPTX::INT_PTX_LDU_G_v4i16_ELE_ari32,
+ NVPTX::INT_PTX_LDU_G_v4i32_ELE_ari32, None,
+ NVPTX::INT_PTX_LDU_G_v4f16_ELE_ari32,
+ NVPTX::INT_PTX_LDU_G_v4f16x2_ELE_ari32,
+ NVPTX::INT_PTX_LDU_G_v4f32_ELE_ari32, None);
break;
}
}
-
- SDValue Ops[] = { Base, Offset, Chain };
-
- LD = CurDAG->getMachineNode(Opcode, DL, InstVTList, Ops);
+ if (!Opcode)
+ return false;
+ SDValue Ops[] = {Base, Offset, Chain};
+ LD = CurDAG->getMachineNode(Opcode.getValue(), DL, InstVTList, Ops);
} else {
if (TM.is64Bit()) {
switch (N->getOpcode()) {
@@ -1798,139 +1476,68 @@ bool NVPTXDAGToDAGISel::tryLDGLDU(SDNode *N) {
return false;
case ISD::LOAD:
case ISD::INTRINSIC_W_CHAIN:
- if (IsLDG) {
- switch (EltVT.getSimpleVT().SimpleTy) {
- default:
- return false;
- case MVT::i8:
- Opcode = NVPTX::INT_PTX_LDG_GLOBAL_i8areg64;
- break;
- case MVT::i16:
- Opcode = NVPTX::INT_PTX_LDG_GLOBAL_i16areg64;
- break;
- case MVT::i32:
- Opcode = NVPTX::INT_PTX_LDG_GLOBAL_i32areg64;
- break;
- case MVT::i64:
- Opcode = NVPTX::INT_PTX_LDG_GLOBAL_i64areg64;
- break;
- case MVT::f32:
- Opcode = NVPTX::INT_PTX_LDG_GLOBAL_f32areg64;
- break;
- case MVT::f64:
- Opcode = NVPTX::INT_PTX_LDG_GLOBAL_f64areg64;
- break;
- }
- } else {
- switch (EltVT.getSimpleVT().SimpleTy) {
- default:
- return false;
- case MVT::i8:
- Opcode = NVPTX::INT_PTX_LDU_GLOBAL_i8areg64;
- break;
- case MVT::i16:
- Opcode = NVPTX::INT_PTX_LDU_GLOBAL_i16areg64;
- break;
- case MVT::i32:
- Opcode = NVPTX::INT_PTX_LDU_GLOBAL_i32areg64;
- break;
- case MVT::i64:
- Opcode = NVPTX::INT_PTX_LDU_GLOBAL_i64areg64;
- break;
- case MVT::f32:
- Opcode = NVPTX::INT_PTX_LDU_GLOBAL_f32areg64;
- break;
- case MVT::f64:
- Opcode = NVPTX::INT_PTX_LDU_GLOBAL_f64areg64;
- break;
- }
- }
+ if (IsLDG)
+ Opcode = pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy,
+ NVPTX::INT_PTX_LDG_GLOBAL_i8areg64,
+ NVPTX::INT_PTX_LDG_GLOBAL_i16areg64,
+ NVPTX::INT_PTX_LDG_GLOBAL_i32areg64,
+ NVPTX::INT_PTX_LDG_GLOBAL_i64areg64,
+ NVPTX::INT_PTX_LDG_GLOBAL_f16areg64,
+ NVPTX::INT_PTX_LDG_GLOBAL_f16x2areg64,
+ NVPTX::INT_PTX_LDG_GLOBAL_f32areg64,
+ NVPTX::INT_PTX_LDG_GLOBAL_f64areg64);
+ else
+ Opcode = pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy,
+ NVPTX::INT_PTX_LDU_GLOBAL_i8areg64,
+ NVPTX::INT_PTX_LDU_GLOBAL_i16areg64,
+ NVPTX::INT_PTX_LDU_GLOBAL_i32areg64,
+ NVPTX::INT_PTX_LDU_GLOBAL_i64areg64,
+ NVPTX::INT_PTX_LDU_GLOBAL_f16areg64,
+ NVPTX::INT_PTX_LDU_GLOBAL_f16x2areg64,
+ NVPTX::INT_PTX_LDU_GLOBAL_f32areg64,
+ NVPTX::INT_PTX_LDU_GLOBAL_f64areg64);
break;
case NVPTXISD::LoadV2:
case NVPTXISD::LDGV2:
- switch (EltVT.getSimpleVT().SimpleTy) {
- default:
- return false;
- case MVT::i8:
- Opcode = NVPTX::INT_PTX_LDG_G_v2i8_ELE_areg64;
- break;
- case MVT::i16:
- Opcode = NVPTX::INT_PTX_LDG_G_v2i16_ELE_areg64;
- break;
- case MVT::i32:
- Opcode = NVPTX::INT_PTX_LDG_G_v2i32_ELE_areg64;
- break;
- case MVT::i64:
- Opcode = NVPTX::INT_PTX_LDG_G_v2i64_ELE_areg64;
- break;
- case MVT::f32:
- Opcode = NVPTX::INT_PTX_LDG_G_v2f32_ELE_areg64;
- break;
- case MVT::f64:
- Opcode = NVPTX::INT_PTX_LDG_G_v2f64_ELE_areg64;
- break;
- }
+ Opcode = pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy,
+ NVPTX::INT_PTX_LDG_G_v2i8_ELE_areg64,
+ NVPTX::INT_PTX_LDG_G_v2i16_ELE_areg64,
+ NVPTX::INT_PTX_LDG_G_v2i32_ELE_areg64,
+ NVPTX::INT_PTX_LDG_G_v2i64_ELE_areg64,
+ NVPTX::INT_PTX_LDG_G_v2f16_ELE_areg64,
+ NVPTX::INT_PTX_LDG_G_v2f16x2_ELE_areg64,
+ NVPTX::INT_PTX_LDG_G_v2f32_ELE_areg64,
+ NVPTX::INT_PTX_LDG_G_v2f64_ELE_areg64);
break;
case NVPTXISD::LDUV2:
- switch (EltVT.getSimpleVT().SimpleTy) {
- default:
- return false;
- case MVT::i8:
- Opcode = NVPTX::INT_PTX_LDU_G_v2i8_ELE_areg64;
- break;
- case MVT::i16:
- Opcode = NVPTX::INT_PTX_LDU_G_v2i16_ELE_areg64;
- break;
- case MVT::i32:
- Opcode = NVPTX::INT_PTX_LDU_G_v2i32_ELE_areg64;
- break;
- case MVT::i64:
- Opcode = NVPTX::INT_PTX_LDU_G_v2i64_ELE_areg64;
- break;
- case MVT::f32:
- Opcode = NVPTX::INT_PTX_LDU_G_v2f32_ELE_areg64;
- break;
- case MVT::f64:
- Opcode = NVPTX::INT_PTX_LDU_G_v2f64_ELE_areg64;
- break;
- }
+ Opcode = pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy,
+ NVPTX::INT_PTX_LDU_G_v2i8_ELE_areg64,
+ NVPTX::INT_PTX_LDU_G_v2i16_ELE_areg64,
+ NVPTX::INT_PTX_LDU_G_v2i32_ELE_areg64,
+ NVPTX::INT_PTX_LDU_G_v2i64_ELE_areg64,
+ NVPTX::INT_PTX_LDU_G_v2f16_ELE_areg64,
+ NVPTX::INT_PTX_LDU_G_v2f16x2_ELE_areg64,
+ NVPTX::INT_PTX_LDU_G_v2f32_ELE_areg64,
+ NVPTX::INT_PTX_LDU_G_v2f64_ELE_areg64);
break;
case NVPTXISD::LoadV4:
case NVPTXISD::LDGV4:
- switch (EltVT.getSimpleVT().SimpleTy) {
- default:
- return false;
- case MVT::i8:
- Opcode = NVPTX::INT_PTX_LDG_G_v4i8_ELE_areg64;
- break;
- case MVT::i16:
- Opcode = NVPTX::INT_PTX_LDG_G_v4i16_ELE_areg64;
- break;
- case MVT::i32:
- Opcode = NVPTX::INT_PTX_LDG_G_v4i32_ELE_areg64;
- break;
- case MVT::f32:
- Opcode = NVPTX::INT_PTX_LDG_G_v4f32_ELE_areg64;
- break;
- }
+ Opcode = pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy,
+ NVPTX::INT_PTX_LDG_G_v4i8_ELE_areg64,
+ NVPTX::INT_PTX_LDG_G_v4i16_ELE_areg64,
+ NVPTX::INT_PTX_LDG_G_v4i32_ELE_areg64, None,
+ NVPTX::INT_PTX_LDG_G_v4f16_ELE_areg64,
+ NVPTX::INT_PTX_LDG_G_v4f16x2_ELE_areg64,
+ NVPTX::INT_PTX_LDG_G_v4f32_ELE_areg64, None);
break;
case NVPTXISD::LDUV4:
- switch (EltVT.getSimpleVT().SimpleTy) {
- default:
- return false;
- case MVT::i8:
- Opcode = NVPTX::INT_PTX_LDU_G_v4i8_ELE_areg64;
- break;
- case MVT::i16:
- Opcode = NVPTX::INT_PTX_LDU_G_v4i16_ELE_areg64;
- break;
- case MVT::i32:
- Opcode = NVPTX::INT_PTX_LDU_G_v4i32_ELE_areg64;
- break;
- case MVT::f32:
- Opcode = NVPTX::INT_PTX_LDU_G_v4f32_ELE_areg64;
- break;
- }
+ Opcode = pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy,
+ NVPTX::INT_PTX_LDU_G_v4i8_ELE_areg64,
+ NVPTX::INT_PTX_LDU_G_v4i16_ELE_areg64,
+ NVPTX::INT_PTX_LDU_G_v4i32_ELE_areg64, None,
+ NVPTX::INT_PTX_LDU_G_v4f16_ELE_areg64,
+ NVPTX::INT_PTX_LDU_G_v4f16x2_ELE_areg64,
+ NVPTX::INT_PTX_LDU_G_v4f32_ELE_areg64, None);
break;
}
} else {
@@ -1939,145 +1546,75 @@ bool NVPTXDAGToDAGISel::tryLDGLDU(SDNode *N) {
return false;
case ISD::LOAD:
case ISD::INTRINSIC_W_CHAIN:
- if (IsLDG) {
- switch (EltVT.getSimpleVT().SimpleTy) {
- default:
- return false;
- case MVT::i8:
- Opcode = NVPTX::INT_PTX_LDG_GLOBAL_i8areg;
- break;
- case MVT::i16:
- Opcode = NVPTX::INT_PTX_LDG_GLOBAL_i16areg;
- break;
- case MVT::i32:
- Opcode = NVPTX::INT_PTX_LDG_GLOBAL_i32areg;
- break;
- case MVT::i64:
- Opcode = NVPTX::INT_PTX_LDG_GLOBAL_i64areg;
- break;
- case MVT::f32:
- Opcode = NVPTX::INT_PTX_LDG_GLOBAL_f32areg;
- break;
- case MVT::f64:
- Opcode = NVPTX::INT_PTX_LDG_GLOBAL_f64areg;
- break;
- }
- } else {
- switch (EltVT.getSimpleVT().SimpleTy) {
- default:
- return false;
- case MVT::i8:
- Opcode = NVPTX::INT_PTX_LDU_GLOBAL_i8areg;
- break;
- case MVT::i16:
- Opcode = NVPTX::INT_PTX_LDU_GLOBAL_i16areg;
- break;
- case MVT::i32:
- Opcode = NVPTX::INT_PTX_LDU_GLOBAL_i32areg;
- break;
- case MVT::i64:
- Opcode = NVPTX::INT_PTX_LDU_GLOBAL_i64areg;
- break;
- case MVT::f32:
- Opcode = NVPTX::INT_PTX_LDU_GLOBAL_f32areg;
- break;
- case MVT::f64:
- Opcode = NVPTX::INT_PTX_LDU_GLOBAL_f64areg;
- break;
- }
- }
+ if (IsLDG)
+ Opcode = pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy,
+ NVPTX::INT_PTX_LDG_GLOBAL_i8areg,
+ NVPTX::INT_PTX_LDG_GLOBAL_i16areg,
+ NVPTX::INT_PTX_LDG_GLOBAL_i32areg,
+ NVPTX::INT_PTX_LDG_GLOBAL_i64areg,
+ NVPTX::INT_PTX_LDG_GLOBAL_f16areg,
+ NVPTX::INT_PTX_LDG_GLOBAL_f16x2areg,
+ NVPTX::INT_PTX_LDG_GLOBAL_f32areg,
+ NVPTX::INT_PTX_LDG_GLOBAL_f64areg);
+ else
+ Opcode = pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy,
+ NVPTX::INT_PTX_LDU_GLOBAL_i8areg,
+ NVPTX::INT_PTX_LDU_GLOBAL_i16areg,
+ NVPTX::INT_PTX_LDU_GLOBAL_i32areg,
+ NVPTX::INT_PTX_LDU_GLOBAL_i64areg,
+ NVPTX::INT_PTX_LDU_GLOBAL_f16areg,
+ NVPTX::INT_PTX_LDU_GLOBAL_f16x2areg,
+ NVPTX::INT_PTX_LDU_GLOBAL_f32areg,
+ NVPTX::INT_PTX_LDU_GLOBAL_f64areg);
break;
case NVPTXISD::LoadV2:
case NVPTXISD::LDGV2:
- switch (EltVT.getSimpleVT().SimpleTy) {
- default:
- return false;
- case MVT::i8:
- Opcode = NVPTX::INT_PTX_LDG_G_v2i8_ELE_areg32;
- break;
- case MVT::i16:
- Opcode = NVPTX::INT_PTX_LDG_G_v2i16_ELE_areg32;
- break;
- case MVT::i32:
- Opcode = NVPTX::INT_PTX_LDG_G_v2i32_ELE_areg32;
- break;
- case MVT::i64:
- Opcode = NVPTX::INT_PTX_LDG_G_v2i64_ELE_areg32;
- break;
- case MVT::f32:
- Opcode = NVPTX::INT_PTX_LDG_G_v2f32_ELE_areg32;
- break;
- case MVT::f64:
- Opcode = NVPTX::INT_PTX_LDG_G_v2f64_ELE_areg32;
- break;
- }
+ Opcode = pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy,
+ NVPTX::INT_PTX_LDG_G_v2i8_ELE_areg32,
+ NVPTX::INT_PTX_LDG_G_v2i16_ELE_areg32,
+ NVPTX::INT_PTX_LDG_G_v2i32_ELE_areg32,
+ NVPTX::INT_PTX_LDG_G_v2i64_ELE_areg32,
+ NVPTX::INT_PTX_LDG_G_v2f16_ELE_areg32,
+ NVPTX::INT_PTX_LDG_G_v2f16x2_ELE_areg32,
+ NVPTX::INT_PTX_LDG_G_v2f32_ELE_areg32,
+ NVPTX::INT_PTX_LDG_G_v2f64_ELE_areg32);
break;
case NVPTXISD::LDUV2:
- switch (EltVT.getSimpleVT().SimpleTy) {
- default:
- return false;
- case MVT::i8:
- Opcode = NVPTX::INT_PTX_LDU_G_v2i8_ELE_areg32;
- break;
- case MVT::i16:
- Opcode = NVPTX::INT_PTX_LDU_G_v2i16_ELE_areg32;
- break;
- case MVT::i32:
- Opcode = NVPTX::INT_PTX_LDU_G_v2i32_ELE_areg32;
- break;
- case MVT::i64:
- Opcode = NVPTX::INT_PTX_LDU_G_v2i64_ELE_areg32;
- break;
- case MVT::f32:
- Opcode = NVPTX::INT_PTX_LDU_G_v2f32_ELE_areg32;
- break;
- case MVT::f64:
- Opcode = NVPTX::INT_PTX_LDU_G_v2f64_ELE_areg32;
- break;
- }
+ Opcode = pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy,
+ NVPTX::INT_PTX_LDU_G_v2i8_ELE_areg32,
+ NVPTX::INT_PTX_LDU_G_v2i16_ELE_areg32,
+ NVPTX::INT_PTX_LDU_G_v2i32_ELE_areg32,
+ NVPTX::INT_PTX_LDU_G_v2i64_ELE_areg32,
+ NVPTX::INT_PTX_LDU_G_v2f16_ELE_areg32,
+ NVPTX::INT_PTX_LDU_G_v2f16x2_ELE_areg32,
+ NVPTX::INT_PTX_LDU_G_v2f32_ELE_areg32,
+ NVPTX::INT_PTX_LDU_G_v2f64_ELE_areg32);
break;
case NVPTXISD::LoadV4:
case NVPTXISD::LDGV4:
- switch (EltVT.getSimpleVT().SimpleTy) {
- default:
- return false;
- case MVT::i8:
- Opcode = NVPTX::INT_PTX_LDG_G_v4i8_ELE_areg32;
- break;
- case MVT::i16:
- Opcode = NVPTX::INT_PTX_LDG_G_v4i16_ELE_areg32;
- break;
- case MVT::i32:
- Opcode = NVPTX::INT_PTX_LDG_G_v4i32_ELE_areg32;
- break;
- case MVT::f32:
- Opcode = NVPTX::INT_PTX_LDG_G_v4f32_ELE_areg32;
- break;
- }
+ Opcode = pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy,
+ NVPTX::INT_PTX_LDG_G_v4i8_ELE_areg32,
+ NVPTX::INT_PTX_LDG_G_v4i16_ELE_areg32,
+ NVPTX::INT_PTX_LDG_G_v4i32_ELE_areg32, None,
+ NVPTX::INT_PTX_LDG_G_v4f16_ELE_areg32,
+ NVPTX::INT_PTX_LDG_G_v4f16x2_ELE_areg32,
+ NVPTX::INT_PTX_LDG_G_v4f32_ELE_areg32, None);
break;
case NVPTXISD::LDUV4:
- switch (EltVT.getSimpleVT().SimpleTy) {
- default:
- return false;
- case MVT::i8:
- Opcode = NVPTX::INT_PTX_LDU_G_v4i8_ELE_areg32;
- break;
- case MVT::i16:
- Opcode = NVPTX::INT_PTX_LDU_G_v4i16_ELE_areg32;
- break;
- case MVT::i32:
- Opcode = NVPTX::INT_PTX_LDU_G_v4i32_ELE_areg32;
- break;
- case MVT::f32:
- Opcode = NVPTX::INT_PTX_LDU_G_v4f32_ELE_areg32;
- break;
- }
+ Opcode = pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy,
+ NVPTX::INT_PTX_LDU_G_v4i8_ELE_areg32,
+ NVPTX::INT_PTX_LDU_G_v4i16_ELE_areg32,
+ NVPTX::INT_PTX_LDU_G_v4i32_ELE_areg32, None,
+ NVPTX::INT_PTX_LDU_G_v4f16_ELE_areg32,
+ NVPTX::INT_PTX_LDU_G_v4f16x2_ELE_areg32,
+ NVPTX::INT_PTX_LDU_G_v4f32_ELE_areg32, None);
break;
}
}
-
+ if (!Opcode)
+ return false;
SDValue Ops[] = { Op1, Chain };
- LD = CurDAG->getMachineNode(Opcode, DL, InstVTList, Ops);
+ LD = CurDAG->getMachineNode(Opcode.getValue(), DL, InstVTList, Ops);
}
MachineSDNode::mmo_iterator MemRefs0 = MF->allocateMemRefsArray(1);
@@ -2151,24 +1688,23 @@ bool NVPTXDAGToDAGISel::tryStore(SDNode *N) {
// Vector Setting
MVT SimpleVT = StoreVT.getSimpleVT();
unsigned vecType = NVPTX::PTXLdStInstCode::Scalar;
- if (SimpleVT.isVector()) {
- unsigned num = SimpleVT.getVectorNumElements();
- if (num == 2)
- vecType = NVPTX::PTXLdStInstCode::V2;
- else if (num == 4)
- vecType = NVPTX::PTXLdStInstCode::V4;
- else
- return false;
- }
// Type Setting: toType + toTypeWidth
// - for integer type, always use 'u'
//
MVT ScalarVT = SimpleVT.getScalarType();
unsigned toTypeWidth = ScalarVT.getSizeInBits();
+ if (SimpleVT.isVector()) {
+ assert(StoreVT == MVT::v2f16 && "Unexpected vector type");
+ // v2f16 is stored using st.b32
+ toTypeWidth = 32;
+ }
+
unsigned int toType;
if (ScalarVT.isFloatingPoint())
- toType = NVPTX::PTXLdStInstCode::Float;
+ // f16 uses .b16 as its storage type.
+ toType = ScalarVT.SimpleTy == MVT::f16 ? NVPTX::PTXLdStInstCode::Untyped
+ : NVPTX::PTXLdStInstCode::Float;
else
toType = NVPTX::PTXLdStInstCode::Unsigned;
@@ -2178,173 +1714,73 @@ bool NVPTXDAGToDAGISel::tryStore(SDNode *N) {
SDValue N2 = N->getOperand(2);
SDValue Addr;
SDValue Offset, Base;
- unsigned Opcode;
+ Optional<unsigned> Opcode;
MVT::SimpleValueType SourceVT = N1.getNode()->getSimpleValueType(0).SimpleTy;
if (SelectDirectAddr(N2, Addr)) {
- switch (SourceVT) {
- case MVT::i8:
- Opcode = NVPTX::ST_i8_avar;
- break;
- case MVT::i16:
- Opcode = NVPTX::ST_i16_avar;
- break;
- case MVT::i32:
- Opcode = NVPTX::ST_i32_avar;
- break;
- case MVT::i64:
- Opcode = NVPTX::ST_i64_avar;
- break;
- case MVT::f32:
- Opcode = NVPTX::ST_f32_avar;
- break;
- case MVT::f64:
- Opcode = NVPTX::ST_f64_avar;
- break;
- default:
+ Opcode = pickOpcodeForVT(SourceVT, NVPTX::ST_i8_avar, NVPTX::ST_i16_avar,
+ NVPTX::ST_i32_avar, NVPTX::ST_i64_avar,
+ NVPTX::ST_f16_avar, NVPTX::ST_f16x2_avar,
+ NVPTX::ST_f32_avar, NVPTX::ST_f64_avar);
+ if (!Opcode)
return false;
- }
SDValue Ops[] = { N1, getI32Imm(isVolatile, dl),
getI32Imm(codeAddrSpace, dl), getI32Imm(vecType, dl),
getI32Imm(toType, dl), getI32Imm(toTypeWidth, dl), Addr,
Chain };
- NVPTXST = CurDAG->getMachineNode(Opcode, dl, MVT::Other, Ops);
+ NVPTXST = CurDAG->getMachineNode(Opcode.getValue(), dl, MVT::Other, Ops);
} else if (TM.is64Bit() ? SelectADDRsi64(N2.getNode(), N2, Base, Offset)
: SelectADDRsi(N2.getNode(), N2, Base, Offset)) {
- switch (SourceVT) {
- case MVT::i8:
- Opcode = NVPTX::ST_i8_asi;
- break;
- case MVT::i16:
- Opcode = NVPTX::ST_i16_asi;
- break;
- case MVT::i32:
- Opcode = NVPTX::ST_i32_asi;
- break;
- case MVT::i64:
- Opcode = NVPTX::ST_i64_asi;
- break;
- case MVT::f32:
- Opcode = NVPTX::ST_f32_asi;
- break;
- case MVT::f64:
- Opcode = NVPTX::ST_f64_asi;
- break;
- default:
+ Opcode = pickOpcodeForVT(SourceVT, NVPTX::ST_i8_asi, NVPTX::ST_i16_asi,
+ NVPTX::ST_i32_asi, NVPTX::ST_i64_asi,
+ NVPTX::ST_f16_asi, NVPTX::ST_f16x2_asi,
+ NVPTX::ST_f32_asi, NVPTX::ST_f64_asi);
+ if (!Opcode)
return false;
- }
SDValue Ops[] = { N1, getI32Imm(isVolatile, dl),
getI32Imm(codeAddrSpace, dl), getI32Imm(vecType, dl),
getI32Imm(toType, dl), getI32Imm(toTypeWidth, dl), Base,
Offset, Chain };
- NVPTXST = CurDAG->getMachineNode(Opcode, dl, MVT::Other, Ops);
+ NVPTXST = CurDAG->getMachineNode(Opcode.getValue(), dl, MVT::Other, Ops);
} else if (TM.is64Bit() ? SelectADDRri64(N2.getNode(), N2, Base, Offset)
: SelectADDRri(N2.getNode(), N2, Base, Offset)) {
- if (TM.is64Bit()) {
- switch (SourceVT) {
- case MVT::i8:
- Opcode = NVPTX::ST_i8_ari_64;
- break;
- case MVT::i16:
- Opcode = NVPTX::ST_i16_ari_64;
- break;
- case MVT::i32:
- Opcode = NVPTX::ST_i32_ari_64;
- break;
- case MVT::i64:
- Opcode = NVPTX::ST_i64_ari_64;
- break;
- case MVT::f32:
- Opcode = NVPTX::ST_f32_ari_64;
- break;
- case MVT::f64:
- Opcode = NVPTX::ST_f64_ari_64;
- break;
- default:
- return false;
- }
- } else {
- switch (SourceVT) {
- case MVT::i8:
- Opcode = NVPTX::ST_i8_ari;
- break;
- case MVT::i16:
- Opcode = NVPTX::ST_i16_ari;
- break;
- case MVT::i32:
- Opcode = NVPTX::ST_i32_ari;
- break;
- case MVT::i64:
- Opcode = NVPTX::ST_i64_ari;
- break;
- case MVT::f32:
- Opcode = NVPTX::ST_f32_ari;
- break;
- case MVT::f64:
- Opcode = NVPTX::ST_f64_ari;
- break;
- default:
- return false;
- }
- }
+ if (TM.is64Bit())
+ Opcode = pickOpcodeForVT(
+ SourceVT, NVPTX::ST_i8_ari_64, NVPTX::ST_i16_ari_64,
+ NVPTX::ST_i32_ari_64, NVPTX::ST_i64_ari_64, NVPTX::ST_f16_ari_64,
+ NVPTX::ST_f16x2_ari_64, NVPTX::ST_f32_ari_64, NVPTX::ST_f64_ari_64);
+ else
+ Opcode = pickOpcodeForVT(SourceVT, NVPTX::ST_i8_ari, NVPTX::ST_i16_ari,
+ NVPTX::ST_i32_ari, NVPTX::ST_i64_ari,
+ NVPTX::ST_f16_ari, NVPTX::ST_f16x2_ari,
+ NVPTX::ST_f32_ari, NVPTX::ST_f64_ari);
+ if (!Opcode)
+ return false;
+
SDValue Ops[] = { N1, getI32Imm(isVolatile, dl),
getI32Imm(codeAddrSpace, dl), getI32Imm(vecType, dl),
getI32Imm(toType, dl), getI32Imm(toTypeWidth, dl), Base,
Offset, Chain };
- NVPTXST = CurDAG->getMachineNode(Opcode, dl, MVT::Other, Ops);
+ NVPTXST = CurDAG->getMachineNode(Opcode.getValue(), dl, MVT::Other, Ops);
} else {
- if (TM.is64Bit()) {
- switch (SourceVT) {
- case MVT::i8:
- Opcode = NVPTX::ST_i8_areg_64;
- break;
- case MVT::i16:
- Opcode = NVPTX::ST_i16_areg_64;
- break;
- case MVT::i32:
- Opcode = NVPTX::ST_i32_areg_64;
- break;
- case MVT::i64:
- Opcode = NVPTX::ST_i64_areg_64;
- break;
- case MVT::f32:
- Opcode = NVPTX::ST_f32_areg_64;
- break;
- case MVT::f64:
- Opcode = NVPTX::ST_f64_areg_64;
- break;
- default:
- return false;
- }
- } else {
- switch (SourceVT) {
- case MVT::i8:
- Opcode = NVPTX::ST_i8_areg;
- break;
- case MVT::i16:
- Opcode = NVPTX::ST_i16_areg;
- break;
- case MVT::i32:
- Opcode = NVPTX::ST_i32_areg;
- break;
- case MVT::i64:
- Opcode = NVPTX::ST_i64_areg;
- break;
- case MVT::f32:
- Opcode = NVPTX::ST_f32_areg;
- break;
- case MVT::f64:
- Opcode = NVPTX::ST_f64_areg;
- break;
- default:
- return false;
- }
- }
+ if (TM.is64Bit())
+ Opcode =
+ pickOpcodeForVT(SourceVT, NVPTX::ST_i8_areg_64, NVPTX::ST_i16_areg_64,
+ NVPTX::ST_i32_areg_64, NVPTX::ST_i64_areg_64,
+ NVPTX::ST_f16_areg_64, NVPTX::ST_f16x2_areg_64,
+ NVPTX::ST_f32_areg_64, NVPTX::ST_f64_areg_64);
+ else
+ Opcode = pickOpcodeForVT(SourceVT, NVPTX::ST_i8_areg, NVPTX::ST_i16_areg,
+ NVPTX::ST_i32_areg, NVPTX::ST_i64_areg,
+ NVPTX::ST_f16_areg, NVPTX::ST_f16x2_areg,
+ NVPTX::ST_f32_areg, NVPTX::ST_f64_areg);
+ if (!Opcode)
+ return false;
SDValue Ops[] = { N1, getI32Imm(isVolatile, dl),
getI32Imm(codeAddrSpace, dl), getI32Imm(vecType, dl),
getI32Imm(toType, dl), getI32Imm(toTypeWidth, dl), N2,
Chain };
- NVPTXST = CurDAG->getMachineNode(Opcode, dl, MVT::Other, Ops);
+ NVPTXST = CurDAG->getMachineNode(Opcode.getValue(), dl, MVT::Other, Ops);
}
if (!NVPTXST)
@@ -2361,7 +1797,7 @@ bool NVPTXDAGToDAGISel::tryStoreVector(SDNode *N) {
SDValue Chain = N->getOperand(0);
SDValue Op1 = N->getOperand(1);
SDValue Addr, Offset, Base;
- unsigned Opcode;
+ Optional<unsigned> Opcode;
SDLoc DL(N);
SDNode *ST;
EVT EltVT = Op1.getValueType();
@@ -2391,7 +1827,8 @@ bool NVPTXDAGToDAGISel::tryStoreVector(SDNode *N) {
unsigned ToTypeWidth = ScalarVT.getSizeInBits();
unsigned ToType;
if (ScalarVT.isFloatingPoint())
- ToType = NVPTX::PTXLdStInstCode::Float;
+ ToType = ScalarVT.SimpleTy == MVT::f16 ? NVPTX::PTXLdStInstCode::Untyped
+ : NVPTX::PTXLdStInstCode::Float;
else
ToType = NVPTX::PTXLdStInstCode::Unsigned;
@@ -2418,6 +1855,16 @@ bool NVPTXDAGToDAGISel::tryStoreVector(SDNode *N) {
return false;
}
+ // v8f16 is a special case. PTX doesn't have an st.v8.f16
+ // instruction. Instead, we split the vector into v2f16 chunks and
+ // store them with st.v4.b32.
+ if (EltVT == MVT::v2f16) {
+ assert(N->getOpcode() == NVPTXISD::StoreV4 && "Unexpected store opcode.");
+ EltVT = MVT::i32;
+ ToType = NVPTX::PTXLdStInstCode::Untyped;
+ ToTypeWidth = 32;
+ }
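Concretely, after this rewrite a StoreV4 of v2f16 elements (i.e. an original v8f16 store) proceeds with EltVT = i32, ToType = Untyped and ToTypeWidth = 32, so the opcode selection below goes through the i32 column of pickOpcodeForVT and the result prints as an untyped st.v4.b32 of four 32-bit registers, each holding one bit-packed f16 pair.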
+
StOps.push_back(getI32Imm(IsVolatile, DL));
StOps.push_back(getI32Imm(CodeAddrSpace, DL));
StOps.push_back(getI32Imm(VecType, DL));
@@ -2429,46 +1876,18 @@ bool NVPTXDAGToDAGISel::tryStoreVector(SDNode *N) {
default:
return false;
case NVPTXISD::StoreV2:
- switch (EltVT.getSimpleVT().SimpleTy) {
- default:
- return false;
- case MVT::i8:
- Opcode = NVPTX::STV_i8_v2_avar;
- break;
- case MVT::i16:
- Opcode = NVPTX::STV_i16_v2_avar;
- break;
- case MVT::i32:
- Opcode = NVPTX::STV_i32_v2_avar;
- break;
- case MVT::i64:
- Opcode = NVPTX::STV_i64_v2_avar;
- break;
- case MVT::f32:
- Opcode = NVPTX::STV_f32_v2_avar;
- break;
- case MVT::f64:
- Opcode = NVPTX::STV_f64_v2_avar;
- break;
- }
+ Opcode = pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy,
+ NVPTX::STV_i8_v2_avar, NVPTX::STV_i16_v2_avar,
+ NVPTX::STV_i32_v2_avar, NVPTX::STV_i64_v2_avar,
+ NVPTX::STV_f16_v2_avar, NVPTX::STV_f16x2_v2_avar,
+ NVPTX::STV_f32_v2_avar, NVPTX::STV_f64_v2_avar);
break;
case NVPTXISD::StoreV4:
- switch (EltVT.getSimpleVT().SimpleTy) {
- default:
- return false;
- case MVT::i8:
- Opcode = NVPTX::STV_i8_v4_avar;
- break;
- case MVT::i16:
- Opcode = NVPTX::STV_i16_v4_avar;
- break;
- case MVT::i32:
- Opcode = NVPTX::STV_i32_v4_avar;
- break;
- case MVT::f32:
- Opcode = NVPTX::STV_f32_v4_avar;
- break;
- }
+ Opcode =
+ pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy, NVPTX::STV_i8_v4_avar,
+ NVPTX::STV_i16_v4_avar, NVPTX::STV_i32_v4_avar, None,
+ NVPTX::STV_f16_v4_avar, NVPTX::STV_f16x2_v4_avar,
+ NVPTX::STV_f32_v4_avar, None);
break;
}
StOps.push_back(Addr);
@@ -2478,46 +1897,18 @@ bool NVPTXDAGToDAGISel::tryStoreVector(SDNode *N) {
default:
return false;
case NVPTXISD::StoreV2:
- switch (EltVT.getSimpleVT().SimpleTy) {
- default:
- return false;
- case MVT::i8:
- Opcode = NVPTX::STV_i8_v2_asi;
- break;
- case MVT::i16:
- Opcode = NVPTX::STV_i16_v2_asi;
- break;
- case MVT::i32:
- Opcode = NVPTX::STV_i32_v2_asi;
- break;
- case MVT::i64:
- Opcode = NVPTX::STV_i64_v2_asi;
- break;
- case MVT::f32:
- Opcode = NVPTX::STV_f32_v2_asi;
- break;
- case MVT::f64:
- Opcode = NVPTX::STV_f64_v2_asi;
- break;
- }
+ Opcode = pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy,
+ NVPTX::STV_i8_v2_asi, NVPTX::STV_i16_v2_asi,
+ NVPTX::STV_i32_v2_asi, NVPTX::STV_i64_v2_asi,
+ NVPTX::STV_f16_v2_asi, NVPTX::STV_f16x2_v2_asi,
+ NVPTX::STV_f32_v2_asi, NVPTX::STV_f64_v2_asi);
break;
case NVPTXISD::StoreV4:
- switch (EltVT.getSimpleVT().SimpleTy) {
- default:
- return false;
- case MVT::i8:
- Opcode = NVPTX::STV_i8_v4_asi;
- break;
- case MVT::i16:
- Opcode = NVPTX::STV_i16_v4_asi;
- break;
- case MVT::i32:
- Opcode = NVPTX::STV_i32_v4_asi;
- break;
- case MVT::f32:
- Opcode = NVPTX::STV_f32_v4_asi;
- break;
- }
+ Opcode =
+ pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy, NVPTX::STV_i8_v4_asi,
+ NVPTX::STV_i16_v4_asi, NVPTX::STV_i32_v4_asi, None,
+ NVPTX::STV_f16_v4_asi, NVPTX::STV_f16x2_v4_asi,
+ NVPTX::STV_f32_v4_asi, None);
break;
}
StOps.push_back(Base);
@@ -2529,46 +1920,19 @@ bool NVPTXDAGToDAGISel::tryStoreVector(SDNode *N) {
default:
return false;
case NVPTXISD::StoreV2:
- switch (EltVT.getSimpleVT().SimpleTy) {
- default:
- return false;
- case MVT::i8:
- Opcode = NVPTX::STV_i8_v2_ari_64;
- break;
- case MVT::i16:
- Opcode = NVPTX::STV_i16_v2_ari_64;
- break;
- case MVT::i32:
- Opcode = NVPTX::STV_i32_v2_ari_64;
- break;
- case MVT::i64:
- Opcode = NVPTX::STV_i64_v2_ari_64;
- break;
- case MVT::f32:
- Opcode = NVPTX::STV_f32_v2_ari_64;
- break;
- case MVT::f64:
- Opcode = NVPTX::STV_f64_v2_ari_64;
- break;
- }
+ Opcode = pickOpcodeForVT(
+ EltVT.getSimpleVT().SimpleTy, NVPTX::STV_i8_v2_ari_64,
+ NVPTX::STV_i16_v2_ari_64, NVPTX::STV_i32_v2_ari_64,
+ NVPTX::STV_i64_v2_ari_64, NVPTX::STV_f16_v2_ari_64,
+ NVPTX::STV_f16x2_v2_ari_64, NVPTX::STV_f32_v2_ari_64,
+ NVPTX::STV_f64_v2_ari_64);
break;
case NVPTXISD::StoreV4:
- switch (EltVT.getSimpleVT().SimpleTy) {
- default:
- return false;
- case MVT::i8:
- Opcode = NVPTX::STV_i8_v4_ari_64;
- break;
- case MVT::i16:
- Opcode = NVPTX::STV_i16_v4_ari_64;
- break;
- case MVT::i32:
- Opcode = NVPTX::STV_i32_v4_ari_64;
- break;
- case MVT::f32:
- Opcode = NVPTX::STV_f32_v4_ari_64;
- break;
- }
+ Opcode = pickOpcodeForVT(
+ EltVT.getSimpleVT().SimpleTy, NVPTX::STV_i8_v4_ari_64,
+ NVPTX::STV_i16_v4_ari_64, NVPTX::STV_i32_v4_ari_64, None,
+ NVPTX::STV_f16_v4_ari_64, NVPTX::STV_f16x2_v4_ari_64,
+ NVPTX::STV_f32_v4_ari_64, None);
break;
}
} else {
@@ -2576,46 +1940,18 @@ bool NVPTXDAGToDAGISel::tryStoreVector(SDNode *N) {
default:
return false;
case NVPTXISD::StoreV2:
- switch (EltVT.getSimpleVT().SimpleTy) {
- default:
- return false;
- case MVT::i8:
- Opcode = NVPTX::STV_i8_v2_ari;
- break;
- case MVT::i16:
- Opcode = NVPTX::STV_i16_v2_ari;
- break;
- case MVT::i32:
- Opcode = NVPTX::STV_i32_v2_ari;
- break;
- case MVT::i64:
- Opcode = NVPTX::STV_i64_v2_ari;
- break;
- case MVT::f32:
- Opcode = NVPTX::STV_f32_v2_ari;
- break;
- case MVT::f64:
- Opcode = NVPTX::STV_f64_v2_ari;
- break;
- }
+ Opcode = pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy,
+ NVPTX::STV_i8_v2_ari, NVPTX::STV_i16_v2_ari,
+ NVPTX::STV_i32_v2_ari, NVPTX::STV_i64_v2_ari,
+ NVPTX::STV_f16_v2_ari, NVPTX::STV_f16x2_v2_ari,
+ NVPTX::STV_f32_v2_ari, NVPTX::STV_f64_v2_ari);
break;
case NVPTXISD::StoreV4:
- switch (EltVT.getSimpleVT().SimpleTy) {
- default:
- return false;
- case MVT::i8:
- Opcode = NVPTX::STV_i8_v4_ari;
- break;
- case MVT::i16:
- Opcode = NVPTX::STV_i16_v4_ari;
- break;
- case MVT::i32:
- Opcode = NVPTX::STV_i32_v4_ari;
- break;
- case MVT::f32:
- Opcode = NVPTX::STV_f32_v4_ari;
- break;
- }
+ Opcode =
+ pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy, NVPTX::STV_i8_v4_ari,
+ NVPTX::STV_i16_v4_ari, NVPTX::STV_i32_v4_ari, None,
+ NVPTX::STV_f16_v4_ari, NVPTX::STV_f16x2_v4_ari,
+ NVPTX::STV_f32_v4_ari, None);
break;
}
}
@@ -2627,46 +1963,19 @@ bool NVPTXDAGToDAGISel::tryStoreVector(SDNode *N) {
default:
return false;
case NVPTXISD::StoreV2:
- switch (EltVT.getSimpleVT().SimpleTy) {
- default:
- return false;
- case MVT::i8:
- Opcode = NVPTX::STV_i8_v2_areg_64;
- break;
- case MVT::i16:
- Opcode = NVPTX::STV_i16_v2_areg_64;
- break;
- case MVT::i32:
- Opcode = NVPTX::STV_i32_v2_areg_64;
- break;
- case MVT::i64:
- Opcode = NVPTX::STV_i64_v2_areg_64;
- break;
- case MVT::f32:
- Opcode = NVPTX::STV_f32_v2_areg_64;
- break;
- case MVT::f64:
- Opcode = NVPTX::STV_f64_v2_areg_64;
- break;
- }
+ Opcode = pickOpcodeForVT(
+ EltVT.getSimpleVT().SimpleTy, NVPTX::STV_i8_v2_areg_64,
+ NVPTX::STV_i16_v2_areg_64, NVPTX::STV_i32_v2_areg_64,
+ NVPTX::STV_i64_v2_areg_64, NVPTX::STV_f16_v2_areg_64,
+ NVPTX::STV_f16x2_v2_areg_64, NVPTX::STV_f32_v2_areg_64,
+ NVPTX::STV_f64_v2_areg_64);
break;
case NVPTXISD::StoreV4:
- switch (EltVT.getSimpleVT().SimpleTy) {
- default:
- return false;
- case MVT::i8:
- Opcode = NVPTX::STV_i8_v4_areg_64;
- break;
- case MVT::i16:
- Opcode = NVPTX::STV_i16_v4_areg_64;
- break;
- case MVT::i32:
- Opcode = NVPTX::STV_i32_v4_areg_64;
- break;
- case MVT::f32:
- Opcode = NVPTX::STV_f32_v4_areg_64;
- break;
- }
+ Opcode = pickOpcodeForVT(
+ EltVT.getSimpleVT().SimpleTy, NVPTX::STV_i8_v4_areg_64,
+ NVPTX::STV_i16_v4_areg_64, NVPTX::STV_i32_v4_areg_64, None,
+ NVPTX::STV_f16_v4_areg_64, NVPTX::STV_f16x2_v4_areg_64,
+ NVPTX::STV_f32_v4_areg_64, None);
break;
}
} else {
@@ -2674,55 +1983,31 @@ bool NVPTXDAGToDAGISel::tryStoreVector(SDNode *N) {
default:
return false;
case NVPTXISD::StoreV2:
- switch (EltVT.getSimpleVT().SimpleTy) {
- default:
- return false;
- case MVT::i8:
- Opcode = NVPTX::STV_i8_v2_areg;
- break;
- case MVT::i16:
- Opcode = NVPTX::STV_i16_v2_areg;
- break;
- case MVT::i32:
- Opcode = NVPTX::STV_i32_v2_areg;
- break;
- case MVT::i64:
- Opcode = NVPTX::STV_i64_v2_areg;
- break;
- case MVT::f32:
- Opcode = NVPTX::STV_f32_v2_areg;
- break;
- case MVT::f64:
- Opcode = NVPTX::STV_f64_v2_areg;
- break;
- }
+ Opcode =
+ pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy, NVPTX::STV_i8_v2_areg,
+ NVPTX::STV_i16_v2_areg, NVPTX::STV_i32_v2_areg,
+ NVPTX::STV_i64_v2_areg, NVPTX::STV_f16_v2_areg,
+ NVPTX::STV_f16x2_v2_areg, NVPTX::STV_f32_v2_areg,
+ NVPTX::STV_f64_v2_areg);
break;
case NVPTXISD::StoreV4:
- switch (EltVT.getSimpleVT().SimpleTy) {
- default:
- return false;
- case MVT::i8:
- Opcode = NVPTX::STV_i8_v4_areg;
- break;
- case MVT::i16:
- Opcode = NVPTX::STV_i16_v4_areg;
- break;
- case MVT::i32:
- Opcode = NVPTX::STV_i32_v4_areg;
- break;
- case MVT::f32:
- Opcode = NVPTX::STV_f32_v4_areg;
- break;
- }
+ Opcode =
+ pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy, NVPTX::STV_i8_v4_areg,
+ NVPTX::STV_i16_v4_areg, NVPTX::STV_i32_v4_areg, None,
+ NVPTX::STV_f16_v4_areg, NVPTX::STV_f16x2_v4_areg,
+ NVPTX::STV_f32_v4_areg, None);
break;
}
}
StOps.push_back(N2);
}
+ if (!Opcode)
+ return false;
+
StOps.push_back(Chain);
- ST = CurDAG->getMachineNode(Opcode, DL, MVT::Other, StOps);
+ ST = CurDAG->getMachineNode(Opcode.getValue(), DL, MVT::Other, StOps);
MachineSDNode::mmo_iterator MemRefs0 = MF->allocateMemRefsArray(1);
MemRefs0[0] = cast<MemSDNode>(N)->getMemOperand();
@@ -2757,87 +2042,36 @@ bool NVPTXDAGToDAGISel::tryLoadParam(SDNode *Node) {
EVT EltVT = Node->getValueType(0);
EVT MemVT = Mem->getMemoryVT();
- unsigned Opc = 0;
+ Optional<unsigned> Opcode;
switch (VecSize) {
default:
return false;
case 1:
- switch (MemVT.getSimpleVT().SimpleTy) {
- default:
- return false;
- case MVT::i1:
- Opc = NVPTX::LoadParamMemI8;
- break;
- case MVT::i8:
- Opc = NVPTX::LoadParamMemI8;
- break;
- case MVT::i16:
- Opc = NVPTX::LoadParamMemI16;
- break;
- case MVT::i32:
- Opc = NVPTX::LoadParamMemI32;
- break;
- case MVT::i64:
- Opc = NVPTX::LoadParamMemI64;
- break;
- case MVT::f32:
- Opc = NVPTX::LoadParamMemF32;
- break;
- case MVT::f64:
- Opc = NVPTX::LoadParamMemF64;
- break;
- }
+ Opcode = pickOpcodeForVT(MemVT.getSimpleVT().SimpleTy,
+ NVPTX::LoadParamMemI8, NVPTX::LoadParamMemI16,
+ NVPTX::LoadParamMemI32, NVPTX::LoadParamMemI64,
+ NVPTX::LoadParamMemF16, NVPTX::LoadParamMemF16x2,
+ NVPTX::LoadParamMemF32, NVPTX::LoadParamMemF64);
break;
case 2:
- switch (MemVT.getSimpleVT().SimpleTy) {
- default:
- return false;
- case MVT::i1:
- Opc = NVPTX::LoadParamMemV2I8;
- break;
- case MVT::i8:
- Opc = NVPTX::LoadParamMemV2I8;
- break;
- case MVT::i16:
- Opc = NVPTX::LoadParamMemV2I16;
- break;
- case MVT::i32:
- Opc = NVPTX::LoadParamMemV2I32;
- break;
- case MVT::i64:
- Opc = NVPTX::LoadParamMemV2I64;
- break;
- case MVT::f32:
- Opc = NVPTX::LoadParamMemV2F32;
- break;
- case MVT::f64:
- Opc = NVPTX::LoadParamMemV2F64;
- break;
- }
+ Opcode =
+ pickOpcodeForVT(MemVT.getSimpleVT().SimpleTy, NVPTX::LoadParamMemV2I8,
+ NVPTX::LoadParamMemV2I16, NVPTX::LoadParamMemV2I32,
+ NVPTX::LoadParamMemV2I64, NVPTX::LoadParamMemV2F16,
+ NVPTX::LoadParamMemV2F16x2, NVPTX::LoadParamMemV2F32,
+ NVPTX::LoadParamMemV2F64);
break;
case 4:
- switch (MemVT.getSimpleVT().SimpleTy) {
- default:
- return false;
- case MVT::i1:
- Opc = NVPTX::LoadParamMemV4I8;
- break;
- case MVT::i8:
- Opc = NVPTX::LoadParamMemV4I8;
- break;
- case MVT::i16:
- Opc = NVPTX::LoadParamMemV4I16;
- break;
- case MVT::i32:
- Opc = NVPTX::LoadParamMemV4I32;
- break;
- case MVT::f32:
- Opc = NVPTX::LoadParamMemV4F32;
- break;
- }
+ Opcode = pickOpcodeForVT(
+ MemVT.getSimpleVT().SimpleTy, NVPTX::LoadParamMemV4I8,
+ NVPTX::LoadParamMemV4I16, NVPTX::LoadParamMemV4I32, None,
+ NVPTX::LoadParamMemV4F16, NVPTX::LoadParamMemV4F16x2,
+ NVPTX::LoadParamMemV4F32, None);
break;
}
+ if (!Opcode)
+ return false;
SDVTList VTs;
if (VecSize == 1) {
@@ -2856,7 +2090,7 @@ bool NVPTXDAGToDAGISel::tryLoadParam(SDNode *Node) {
Ops.push_back(Chain);
Ops.push_back(Flag);
- ReplaceNode(Node, CurDAG->getMachineNode(Opc, DL, VTs, Ops));
+ ReplaceNode(Node, CurDAG->getMachineNode(Opcode.getValue(), DL, VTs, Ops));
return true;
}
@@ -2893,89 +2127,36 @@ bool NVPTXDAGToDAGISel::tryStoreRetval(SDNode *N) {
// Determine target opcode
// If we have an i1, use an 8-bit store. The lowering code in
// NVPTXISelLowering will have already emitted an upcast.
- unsigned Opcode = 0;
+ Optional<unsigned> Opcode;
switch (NumElts) {
default:
return false;
case 1:
- switch (Mem->getMemoryVT().getSimpleVT().SimpleTy) {
- default:
- return false;
- case MVT::i1:
- Opcode = NVPTX::StoreRetvalI8;
- break;
- case MVT::i8:
- Opcode = NVPTX::StoreRetvalI8;
- break;
- case MVT::i16:
- Opcode = NVPTX::StoreRetvalI16;
- break;
- case MVT::i32:
- Opcode = NVPTX::StoreRetvalI32;
- break;
- case MVT::i64:
- Opcode = NVPTX::StoreRetvalI64;
- break;
- case MVT::f32:
- Opcode = NVPTX::StoreRetvalF32;
- break;
- case MVT::f64:
- Opcode = NVPTX::StoreRetvalF64;
- break;
- }
+ Opcode = pickOpcodeForVT(Mem->getMemoryVT().getSimpleVT().SimpleTy,
+ NVPTX::StoreRetvalI8, NVPTX::StoreRetvalI16,
+ NVPTX::StoreRetvalI32, NVPTX::StoreRetvalI64,
+ NVPTX::StoreRetvalF16, NVPTX::StoreRetvalF16x2,
+ NVPTX::StoreRetvalF32, NVPTX::StoreRetvalF64);
break;
case 2:
- switch (Mem->getMemoryVT().getSimpleVT().SimpleTy) {
- default:
- return false;
- case MVT::i1:
- Opcode = NVPTX::StoreRetvalV2I8;
- break;
- case MVT::i8:
- Opcode = NVPTX::StoreRetvalV2I8;
- break;
- case MVT::i16:
- Opcode = NVPTX::StoreRetvalV2I16;
- break;
- case MVT::i32:
- Opcode = NVPTX::StoreRetvalV2I32;
- break;
- case MVT::i64:
- Opcode = NVPTX::StoreRetvalV2I64;
- break;
- case MVT::f32:
- Opcode = NVPTX::StoreRetvalV2F32;
- break;
- case MVT::f64:
- Opcode = NVPTX::StoreRetvalV2F64;
- break;
- }
+ Opcode = pickOpcodeForVT(Mem->getMemoryVT().getSimpleVT().SimpleTy,
+ NVPTX::StoreRetvalV2I8, NVPTX::StoreRetvalV2I16,
+ NVPTX::StoreRetvalV2I32, NVPTX::StoreRetvalV2I64,
+ NVPTX::StoreRetvalV2F16, NVPTX::StoreRetvalV2F16x2,
+ NVPTX::StoreRetvalV2F32, NVPTX::StoreRetvalV2F64);
break;
case 4:
- switch (Mem->getMemoryVT().getSimpleVT().SimpleTy) {
- default:
- return false;
- case MVT::i1:
- Opcode = NVPTX::StoreRetvalV4I8;
- break;
- case MVT::i8:
- Opcode = NVPTX::StoreRetvalV4I8;
- break;
- case MVT::i16:
- Opcode = NVPTX::StoreRetvalV4I16;
- break;
- case MVT::i32:
- Opcode = NVPTX::StoreRetvalV4I32;
- break;
- case MVT::f32:
- Opcode = NVPTX::StoreRetvalV4F32;
- break;
- }
+ Opcode = pickOpcodeForVT(Mem->getMemoryVT().getSimpleVT().SimpleTy,
+ NVPTX::StoreRetvalV4I8, NVPTX::StoreRetvalV4I16,
+ NVPTX::StoreRetvalV4I32, None,
+ NVPTX::StoreRetvalV4F16, NVPTX::StoreRetvalV4F16x2,
+ NVPTX::StoreRetvalV4F32, None);
break;
}
+ if (!Opcode)
+ return false;
- SDNode *Ret =
- CurDAG->getMachineNode(Opcode, DL, MVT::Other, Ops);
+ SDNode *Ret = CurDAG->getMachineNode(Opcode.getValue(), DL, MVT::Other, Ops);
MachineSDNode::mmo_iterator MemRefs0 = MF->allocateMemRefsArray(1);
MemRefs0[0] = cast<MemSDNode>(N)->getMemOperand();
cast<MachineSDNode>(Ret)->setMemRefs(MemRefs0, MemRefs0 + 1);
@@ -3024,88 +2205,36 @@ bool NVPTXDAGToDAGISel::tryStoreParam(SDNode *N) {
// Determine target opcode
// If we have an i1, use an 8-bit store. The lowering code in
// NVPTXISelLowering will have already emitted an upcast.
- unsigned Opcode = 0;
+ Optional<unsigned> Opcode;
switch (N->getOpcode()) {
default:
switch (NumElts) {
default:
return false;
case 1:
- switch (Mem->getMemoryVT().getSimpleVT().SimpleTy) {
- default:
- return false;
- case MVT::i1:
- Opcode = NVPTX::StoreParamI8;
- break;
- case MVT::i8:
- Opcode = NVPTX::StoreParamI8;
- break;
- case MVT::i16:
- Opcode = NVPTX::StoreParamI16;
- break;
- case MVT::i32:
- Opcode = NVPTX::StoreParamI32;
- break;
- case MVT::i64:
- Opcode = NVPTX::StoreParamI64;
- break;
- case MVT::f32:
- Opcode = NVPTX::StoreParamF32;
- break;
- case MVT::f64:
- Opcode = NVPTX::StoreParamF64;
- break;
- }
+ Opcode = pickOpcodeForVT(Mem->getMemoryVT().getSimpleVT().SimpleTy,
+ NVPTX::StoreParamI8, NVPTX::StoreParamI16,
+ NVPTX::StoreParamI32, NVPTX::StoreParamI64,
+ NVPTX::StoreParamF16, NVPTX::StoreParamF16x2,
+ NVPTX::StoreParamF32, NVPTX::StoreParamF64);
break;
case 2:
- switch (Mem->getMemoryVT().getSimpleVT().SimpleTy) {
- default:
- return false;
- case MVT::i1:
- Opcode = NVPTX::StoreParamV2I8;
- break;
- case MVT::i8:
- Opcode = NVPTX::StoreParamV2I8;
- break;
- case MVT::i16:
- Opcode = NVPTX::StoreParamV2I16;
- break;
- case MVT::i32:
- Opcode = NVPTX::StoreParamV2I32;
- break;
- case MVT::i64:
- Opcode = NVPTX::StoreParamV2I64;
- break;
- case MVT::f32:
- Opcode = NVPTX::StoreParamV2F32;
- break;
- case MVT::f64:
- Opcode = NVPTX::StoreParamV2F64;
- break;
- }
+ Opcode = pickOpcodeForVT(Mem->getMemoryVT().getSimpleVT().SimpleTy,
+ NVPTX::StoreParamV2I8, NVPTX::StoreParamV2I16,
+ NVPTX::StoreParamV2I32, NVPTX::StoreParamV2I64,
+ NVPTX::StoreParamV2F16, NVPTX::StoreParamV2F16x2,
+ NVPTX::StoreParamV2F32, NVPTX::StoreParamV2F64);
break;
case 4:
- switch (Mem->getMemoryVT().getSimpleVT().SimpleTy) {
- default:
- return false;
- case MVT::i1:
- Opcode = NVPTX::StoreParamV4I8;
- break;
- case MVT::i8:
- Opcode = NVPTX::StoreParamV4I8;
- break;
- case MVT::i16:
- Opcode = NVPTX::StoreParamV4I16;
- break;
- case MVT::i32:
- Opcode = NVPTX::StoreParamV4I32;
- break;
- case MVT::f32:
- Opcode = NVPTX::StoreParamV4F32;
- break;
- }
+ Opcode = pickOpcodeForVT(Mem->getMemoryVT().getSimpleVT().SimpleTy,
+ NVPTX::StoreParamV4I8, NVPTX::StoreParamV4I16,
+ NVPTX::StoreParamV4I32, None,
+ NVPTX::StoreParamV4F16, NVPTX::StoreParamV4F16x2,
+ NVPTX::StoreParamV4F32, None);
break;
}
+ if (!Opcode)
+ return false;
break;
// Special case: if we have a sign-extend/zero-extend node, insert the
// conversion instruction first, and use that as the value operand to
@@ -3132,7 +2261,7 @@ bool NVPTXDAGToDAGISel::tryStoreParam(SDNode *N) {
SDVTList RetVTs = CurDAG->getVTList(MVT::Other, MVT::Glue);
SDNode *Ret =
- CurDAG->getMachineNode(Opcode, DL, RetVTs, Ops);
+ CurDAG->getMachineNode(Opcode.getValue(), DL, RetVTs, Ops);
MachineSDNode::mmo_iterator MemRefs0 = MF->allocateMemRefsArray(1);
MemRefs0[0] = cast<MemSDNode>(N)->getMemOperand();
cast<MachineSDNode>(Ret)->setMemRefs(MemRefs0, MemRefs0 + 1);
diff --git a/lib/Target/NVPTX/NVPTXISelDAGToDAG.h b/lib/Target/NVPTX/NVPTXISelDAGToDAG.h
index 0591035a6aa8..8fc38e7c4612 100644
--- a/lib/Target/NVPTX/NVPTXISelDAGToDAG.h
+++ b/lib/Target/NVPTX/NVPTXISelDAGToDAG.h
@@ -34,6 +34,7 @@ class LLVM_LIBRARY_VISIBILITY NVPTXDAGToDAGISel : public SelectionDAGISel {
bool usePrecSqrtF32() const;
bool useF32FTZ() const;
bool allowFMA() const;
+ bool allowUnsafeFPMath() const;
public:
explicit NVPTXDAGToDAGISel(NVPTXTargetMachine &tm,
@@ -69,6 +70,9 @@ private:
bool tryTextureIntrinsic(SDNode *N);
bool trySurfaceIntrinsic(SDNode *N);
bool tryBFE(SDNode *N);
+ bool tryConstantFP16(SDNode *N);
+ bool SelectSETP_F16X2(SDNode *N);
+ bool tryEXTRACT_VECTOR_ELEMENT(SDNode *N);
inline SDValue getI32Imm(unsigned Imm, const SDLoc &DL) {
return CurDAG->getTargetConstant(Imm, DL, MVT::i32);
diff --git a/lib/Target/NVPTX/NVPTXISelLowering.cpp b/lib/Target/NVPTX/NVPTXISelLowering.cpp
index 7a760fd38d0f..4d06912054a2 100644
--- a/lib/Target/NVPTX/NVPTXISelLowering.cpp
+++ b/lib/Target/NVPTX/NVPTXISelLowering.cpp
@@ -79,6 +79,60 @@ FMAContractLevelOpt("nvptx-fma-level", cl::ZeroOrMore, cl::Hidden,
" 1: do it 2: do it aggressively"),
cl::init(2));
+static cl::opt<int> UsePrecDivF32(
+ "nvptx-prec-divf32", cl::ZeroOrMore, cl::Hidden,
+ cl::desc("NVPTX Specifies: 0 use div.approx, 1 use div.full, 2 use"
+ " IEEE Compliant F32 div.rnd if available."),
+ cl::init(2));
+
+static cl::opt<bool> UsePrecSqrtF32(
+ "nvptx-prec-sqrtf32", cl::Hidden,
+ cl::desc("NVPTX Specific: 0 use sqrt.approx, 1 use sqrt.rn."),
+ cl::init(true));
+
+static cl::opt<bool> FtzEnabled(
+ "nvptx-f32ftz", cl::ZeroOrMore, cl::Hidden,
+ cl::desc("NVPTX Specific: Flush f32 subnormals to sign-preserving zero."),
+ cl::init(false));
+
+int NVPTXTargetLowering::getDivF32Level() const {
+ if (UsePrecDivF32.getNumOccurrences() > 0) {
+ // If nvptx-prec-divf32=N is used on the command-line, always honor it
+ return UsePrecDivF32;
+ } else {
+ // Otherwise, use div.approx if fast math is enabled
+ if (getTargetMachine().Options.UnsafeFPMath)
+ return 0;
+ else
+ return 2;
+ }
+}
+
+bool NVPTXTargetLowering::usePrecSqrtF32() const {
+ if (UsePrecSqrtF32.getNumOccurrences() > 0) {
+ // If nvptx-prec-sqrtf32 is used on the command-line, always honor it
+ return UsePrecSqrtF32;
+ } else {
+ // Otherwise, use sqrt.approx if fast math is enabled
+ return !getTargetMachine().Options.UnsafeFPMath;
+ }
+}
+
+bool NVPTXTargetLowering::useF32FTZ(const MachineFunction &MF) const {
+ // TODO: Get rid of this flag; there can be only one way to do this.
+ if (FtzEnabled.getNumOccurrences() > 0) {
+ // If nvptx-f32ftz is used on the command-line, always honor it
+ return FtzEnabled;
+ } else {
+ const Function *F = MF.getFunction();
+ // Otherwise, check for an nvptx-f32ftz attribute on the function
+ if (F->hasFnAttribute("nvptx-f32ftz"))
+ return F->getFnAttribute("nvptx-f32ftz").getValueAsString() == "true";
+ else
+ return false;
+ }
+}
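These three hooks give the command-line flags absolute precedence and fall back to compile-level state otherwise. For example, -nvptx-prec-divf32=0 forces div.approx.f32 even without -enable-unsafe-fp-math; with no flags at all, an unsafe-FP-math compile gets div.approx and sqrt.approx while a default compile gets the IEEE-compliant forms, and FTZ additionally consults the per-function "nvptx-f32ftz" string attribute.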
+
static bool IsPTXVectorType(MVT VT) {
switch (VT.SimpleTy) {
default:
@@ -92,6 +146,9 @@ static bool IsPTXVectorType(MVT VT) {
case MVT::v2i32:
case MVT::v4i32:
case MVT::v2i64:
+ case MVT::v2f16:
+ case MVT::v4f16:
+ case MVT::v8f16: // <4 x f16x2>
case MVT::v2f32:
case MVT::v4f32:
case MVT::v2f64:
@@ -116,13 +173,24 @@ static void ComputePTXValueVTs(const TargetLowering &TLI, const DataLayout &DL,
for (unsigned i = 0, e = TempVTs.size(); i != e; ++i) {
EVT VT = TempVTs[i];
uint64_t Off = TempOffsets[i];
- if (VT.isVector())
- for (unsigned j = 0, je = VT.getVectorNumElements(); j != je; ++j) {
- ValueVTs.push_back(VT.getVectorElementType());
+ // Split vectors into individual elements, except for v2f16, which
+ // we will pass as a single scalar.
+ if (VT.isVector()) {
+ unsigned NumElts = VT.getVectorNumElements();
+ EVT EltVT = VT.getVectorElementType();
+ // Vectors with an even number of f16 elements will be passed to
+ // us as an array of v2f16 elements. We must match this so we
+ // stay in sync with Ins/Outs.
+ if (EltVT == MVT::f16 && NumElts % 2 == 0) {
+ EltVT = MVT::v2f16;
+ NumElts /= 2;
+ }
+ for (unsigned j = 0; j != NumElts; ++j) {
+ ValueVTs.push_back(EltVT);
if (Offsets)
- Offsets->push_back(Off+j*VT.getVectorElementType().getStoreSize());
+ Offsets->push_back(Off + j * EltVT.getStoreSize());
}
- else {
+ } else {
ValueVTs.push_back(VT);
if (Offsets)
Offsets->push_back(Off);
@@ -130,6 +198,125 @@ static void ComputePTXValueVTs(const TargetLowering &TLI, const DataLayout &DL,
}
}
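For example, an <8 x half> argument now flattens to four v2f16 pieces at offsets 0, 4, 8 and 12 (v2f16 has a 4-byte store size) instead of eight f16 scalars, while a <3 x half> argument, with its odd element count, still flattens to three f16 pieces.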
+// Check whether we can merge loads/stores of some of the pieces of a
+// flattened function parameter or return value into a single vector
+// load/store.
+//
+// The flattened parameter is represented as a list of EVTs and
+// offsets, and the whole structure is aligned to ParamAlignment. This
+// function determines whether we can load/store pieces of the
+// parameter starting at index Idx using a single vectorized op of
+// size AccessSize. If so, it returns the number of param pieces
+// covered by the vector op. Otherwise, it returns 1.
+static unsigned CanMergeParamLoadStoresStartingAt(
+ unsigned Idx, uint32_t AccessSize, const SmallVectorImpl<EVT> &ValueVTs,
+ const SmallVectorImpl<uint64_t> &Offsets, unsigned ParamAlignment) {
+ assert(isPowerOf2_32(AccessSize) && "must be a power of 2!");
+
+ // Can't vectorize if param alignment is not sufficient.
+ if (AccessSize > ParamAlignment)
+ return 1;
+ // Can't vectorize if offset is not aligned.
+ if (Offsets[Idx] & (AccessSize - 1))
+ return 1;
+
+ EVT EltVT = ValueVTs[Idx];
+ unsigned EltSize = EltVT.getStoreSize();
+
+ // Element is too large to vectorize.
+ if (EltSize >= AccessSize)
+ return 1;
+
+ unsigned NumElts = AccessSize / EltSize;
+ // Can't vectorize if AccessSize is not a multiple of EltSize.
+ if (AccessSize != EltSize * NumElts)
+ return 1;
+
+ // We don't have enough elements to vectorize.
+ if (Idx + NumElts > ValueVTs.size())
+ return 1;
+
+ // PTX ISA can only deal with 2- and 4-element vector ops.
+ if (NumElts != 4 && NumElts != 2)
+ return 1;
+
+ for (unsigned j = Idx + 1; j < Idx + NumElts; ++j) {
+ // Types do not match.
+ if (ValueVTs[j] != EltVT)
+ return 1;
+
+ // Elements are not contiguous.
+ if (Offsets[j] - Offsets[j - 1] != EltSize)
+ return 1;
+ }
+ // OK. We can vectorize ValueVTs[Idx..Idx+NumElts).
+ return NumElts;
+}
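Worked example: for two f16 pieces at offsets {0, 2} in a 4-byte-aligned parameter, a query with Idx = 0 and AccessSize = 4 passes every check (EltSize = 2, NumElts = 2, types match, offsets are contiguous) and returns 2, so the pair can be covered by one 32-bit access; the same query with ParamAlignment = 2 fails the alignment check and returns 1.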
+
+// Flags for tracking per-element vectorization state of loads/stores
+// of a flattened function parameter or return value.
+enum ParamVectorizationFlags {
+ PVF_INNER = 0x0, // Middle elements of a vector.
+ PVF_FIRST = 0x1, // First element of the vector.
+ PVF_LAST = 0x2, // Last element of the vector.
+ // Scalar is effectively a 1-element vector.
+ PVF_SCALAR = PVF_FIRST | PVF_LAST
+};
+
+// Computes whether and how we can vectorize the loads/stores of a
+// flattened function parameter or return value.
+//
+// The flattened parameter is represented as the list of ValueVTs and
+// Offsets, and is aligned to ParamAlignment bytes. We return a vector
+// of the same size as ValueVTs indicating how each piece should be
+// loaded/stored (i.e. as a scalar, or as part of a vector
+// load/store).
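+//
+// For example, four mergeable f32 pieces yield {PVF_FIRST, PVF_INNER,
+// PVF_INNER, PVF_LAST}, while an unmergeable piece stays PVF_SCALAR.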
+static SmallVector<ParamVectorizationFlags, 16>
+VectorizePTXValueVTs(const SmallVectorImpl<EVT> &ValueVTs,
+ const SmallVectorImpl<uint64_t> &Offsets,
+ unsigned ParamAlignment) {
+ // Set vector size to match ValueVTs and mark all elements as
+ // scalars by default.
+ SmallVector<ParamVectorizationFlags, 16> VectorInfo;
+ VectorInfo.assign(ValueVTs.size(), PVF_SCALAR);
+
+ // Check what we can vectorize using 128/64/32-bit accesses.
+ for (int I = 0, E = ValueVTs.size(); I != E; ++I) {
+ // Skip elements we've already processed.
+ assert(VectorInfo[I] == PVF_SCALAR && "Unexpected vector info state.");
+ for (unsigned AccessSize : {16, 8, 4, 2}) {
+ unsigned NumElts = CanMergeParamLoadStoresStartingAt(
+ I, AccessSize, ValueVTs, Offsets, ParamAlignment);
+ // Mark vectorized elements.
+ switch (NumElts) {
+ default:
+ llvm_unreachable("Unexpected return value");
+ case 1:
+ // Can't vectorize using this size, try next smaller size.
+ continue;
+ case 2:
+ assert(I + 1 < E && "Not enough elements.");
+ VectorInfo[I] = PVF_FIRST;
+ VectorInfo[I + 1] = PVF_LAST;
+ I += 1;
+ break;
+ case 4:
+ assert(I + 3 < E && "Not enough elements.");
+ VectorInfo[I] = PVF_FIRST;
+ VectorInfo[I + 1] = PVF_INNER;
+ VectorInfo[I + 2] = PVF_INNER;
+ VectorInfo[I + 3] = PVF_LAST;
+ I += 3;
+ break;
+ }
+ // Break out of the inner loop because we've already succeeded
+ // using largest possible AccessSize.
+ break;
+ }
+ }
+ return VectorInfo;
+}
+
// NVPTXTargetLowering Constructor.
NVPTXTargetLowering::NVPTXTargetLowering(const NVPTXTargetMachine &TM,
const NVPTXSubtarget &STI)
@@ -158,14 +345,32 @@ NVPTXTargetLowering::NVPTXTargetLowering(const NVPTXTargetMachine &TM,
else
setSchedulingPreference(Sched::Source);
+ auto setFP16OperationAction = [&](unsigned Op, MVT VT, LegalizeAction Action,
+ LegalizeAction NoF16Action) {
+ setOperationAction(Op, VT, STI.allowFP16Math() ? Action : NoF16Action);
+ };
+
addRegisterClass(MVT::i1, &NVPTX::Int1RegsRegClass);
addRegisterClass(MVT::i16, &NVPTX::Int16RegsRegClass);
addRegisterClass(MVT::i32, &NVPTX::Int32RegsRegClass);
addRegisterClass(MVT::i64, &NVPTX::Int64RegsRegClass);
addRegisterClass(MVT::f32, &NVPTX::Float32RegsRegClass);
addRegisterClass(MVT::f64, &NVPTX::Float64RegsRegClass);
+ addRegisterClass(MVT::f16, &NVPTX::Float16RegsRegClass);
+ addRegisterClass(MVT::v2f16, &NVPTX::Float16x2RegsRegClass);
+
+ // Conversion to/from FP16/FP16x2 is always legal.
+ setOperationAction(ISD::SINT_TO_FP, MVT::f16, Legal);
+ setOperationAction(ISD::FP_TO_SINT, MVT::f16, Legal);
+ setOperationAction(ISD::BUILD_VECTOR, MVT::v2f16, Custom);
+ setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v2f16, Custom);
+
+ setFP16OperationAction(ISD::SETCC, MVT::f16, Legal, Promote);
+ setFP16OperationAction(ISD::SETCC, MVT::v2f16, Legal, Expand);
// Operations not directly supported by NVPTX.
+ setOperationAction(ISD::SELECT_CC, MVT::f16, Expand);
+ setOperationAction(ISD::SELECT_CC, MVT::v2f16, Expand);
setOperationAction(ISD::SELECT_CC, MVT::f32, Expand);
setOperationAction(ISD::SELECT_CC, MVT::f64, Expand);
setOperationAction(ISD::SELECT_CC, MVT::i1, Expand);
@@ -173,6 +378,8 @@ NVPTXTargetLowering::NVPTXTargetLowering(const NVPTXTargetMachine &TM,
setOperationAction(ISD::SELECT_CC, MVT::i16, Expand);
setOperationAction(ISD::SELECT_CC, MVT::i32, Expand);
setOperationAction(ISD::SELECT_CC, MVT::i64, Expand);
+ setOperationAction(ISD::BR_CC, MVT::f16, Expand);
+ setOperationAction(ISD::BR_CC, MVT::v2f16, Expand);
setOperationAction(ISD::BR_CC, MVT::f32, Expand);
setOperationAction(ISD::BR_CC, MVT::f64, Expand);
setOperationAction(ISD::BR_CC, MVT::i1, Expand);
@@ -195,6 +402,9 @@ NVPTXTargetLowering::NVPTXTargetLowering(const NVPTXTargetMachine &TM,
setOperationAction(ISD::SRA_PARTS, MVT::i64 , Custom);
setOperationAction(ISD::SRL_PARTS, MVT::i64 , Custom);
+ setOperationAction(ISD::BITREVERSE, MVT::i32, Legal);
+ setOperationAction(ISD::BITREVERSE, MVT::i64, Legal);
+
if (STI.hasROT64()) {
setOperationAction(ISD::ROTL, MVT::i64, Legal);
setOperationAction(ISD::ROTR, MVT::i64, Legal);
@@ -259,6 +469,7 @@ NVPTXTargetLowering::NVPTXTargetLowering(const NVPTXTargetMachine &TM,
// This is legal in NVPTX
setOperationAction(ISD::ConstantFP, MVT::f64, Legal);
setOperationAction(ISD::ConstantFP, MVT::f32, Legal);
+ setOperationAction(ISD::ConstantFP, MVT::f16, Legal);
// TRAP can be lowered to PTX trap
setOperationAction(ISD::TRAP, MVT::Other, Legal);
@@ -278,15 +489,19 @@ NVPTXTargetLowering::NVPTXTargetLowering(const NVPTXTargetMachine &TM,
// Custom handling for i8 intrinsics
setOperationAction(ISD::INTRINSIC_W_CHAIN, MVT::i8, Custom);
- setOperationAction(ISD::CTLZ, MVT::i16, Legal);
- setOperationAction(ISD::CTLZ, MVT::i32, Legal);
- setOperationAction(ISD::CTLZ, MVT::i64, Legal);
+ for (const auto& Ty : {MVT::i16, MVT::i32, MVT::i64}) {
+ setOperationAction(ISD::SMIN, Ty, Legal);
+ setOperationAction(ISD::SMAX, Ty, Legal);
+ setOperationAction(ISD::UMIN, Ty, Legal);
+ setOperationAction(ISD::UMAX, Ty, Legal);
+
+ setOperationAction(ISD::CTPOP, Ty, Legal);
+ setOperationAction(ISD::CTLZ, Ty, Legal);
+ }
+
setOperationAction(ISD::CTTZ, MVT::i16, Expand);
setOperationAction(ISD::CTTZ, MVT::i32, Expand);
setOperationAction(ISD::CTTZ, MVT::i64, Expand);
- setOperationAction(ISD::CTPOP, MVT::i16, Legal);
- setOperationAction(ISD::CTPOP, MVT::i32, Legal);
- setOperationAction(ISD::CTPOP, MVT::i64, Legal);
// PTX does not directly support SELP of i1, so promote to i32 first
setOperationAction(ISD::SELECT, MVT::i1, Custom);
@@ -301,28 +516,60 @@ NVPTXTargetLowering::NVPTXTargetLowering(const NVPTXTargetMachine &TM,
setTargetDAGCombine(ISD::FADD);
setTargetDAGCombine(ISD::MUL);
setTargetDAGCombine(ISD::SHL);
- setTargetDAGCombine(ISD::SELECT);
setTargetDAGCombine(ISD::SREM);
setTargetDAGCombine(ISD::UREM);
- // Library functions. These default to Expand, but we have instructions
- // for them.
- setOperationAction(ISD::FCEIL, MVT::f32, Legal);
- setOperationAction(ISD::FCEIL, MVT::f64, Legal);
- setOperationAction(ISD::FFLOOR, MVT::f32, Legal);
- setOperationAction(ISD::FFLOOR, MVT::f64, Legal);
- setOperationAction(ISD::FNEARBYINT, MVT::f32, Legal);
- setOperationAction(ISD::FNEARBYINT, MVT::f64, Legal);
- setOperationAction(ISD::FRINT, MVT::f32, Legal);
- setOperationAction(ISD::FRINT, MVT::f64, Legal);
- setOperationAction(ISD::FROUND, MVT::f32, Legal);
- setOperationAction(ISD::FROUND, MVT::f64, Legal);
- setOperationAction(ISD::FTRUNC, MVT::f32, Legal);
- setOperationAction(ISD::FTRUNC, MVT::f64, Legal);
- setOperationAction(ISD::FMINNUM, MVT::f32, Legal);
- setOperationAction(ISD::FMINNUM, MVT::f64, Legal);
- setOperationAction(ISD::FMAXNUM, MVT::f32, Legal);
- setOperationAction(ISD::FMAXNUM, MVT::f64, Legal);
+  // setcc for f16x2 needs special handling to prevent the legalizer's
+  // attempt to scalarize it, since v2i1 is not legal.
+ if (STI.allowFP16Math())
+ setTargetDAGCombine(ISD::SETCC);
+
+  // Promote fp16 arithmetic if fp16 hardware isn't available or the
+  // user passed --nvptx-no-fp16-math. The flag is useful because,
+  // although sm_53+ GPUs have some FP16 support in hardware, only
+  // sm_53 and sm_60 have a full implementation. The others have only
+  // a token amount of FP16 hardware and are likely to run faster by
+  // using the fp32 units instead.
+ for (const auto &Op : {ISD::FADD, ISD::FMUL, ISD::FSUB, ISD::FMA}) {
+ setFP16OperationAction(Op, MVT::f16, Legal, Promote);
+ setFP16OperationAction(Op, MVT::v2f16, Legal, Expand);
+ }
+
+ // There's no neg.f16 instruction. Expand to (0-x).
+ setOperationAction(ISD::FNEG, MVT::f16, Expand);
+ setOperationAction(ISD::FNEG, MVT::v2f16, Expand);
+
+ // (would be) Library functions.
+
+ // These map to conversion instructions for scalar FP types.
+ for (const auto &Op : {ISD::FCEIL, ISD::FFLOOR, ISD::FNEARBYINT, ISD::FRINT,
+ ISD::FROUND, ISD::FTRUNC}) {
+ setOperationAction(Op, MVT::f16, Legal);
+ setOperationAction(Op, MVT::f32, Legal);
+ setOperationAction(Op, MVT::f64, Legal);
+ setOperationAction(Op, MVT::v2f16, Expand);
+ }
+
+ // 'Expand' implements FCOPYSIGN without calling an external library.
+ setOperationAction(ISD::FCOPYSIGN, MVT::f16, Expand);
+ setOperationAction(ISD::FCOPYSIGN, MVT::v2f16, Expand);
+ setOperationAction(ISD::FCOPYSIGN, MVT::f32, Expand);
+ setOperationAction(ISD::FCOPYSIGN, MVT::f64, Expand);
+
+ // These map to corresponding instructions for f32/f64. f16 must be
+ // promoted to f32. v2f16 is expanded to f16, which is then promoted
+ // to f32.
+ for (const auto &Op : {ISD::FDIV, ISD::FREM, ISD::FSQRT, ISD::FSIN, ISD::FCOS,
+ ISD::FABS, ISD::FMINNUM, ISD::FMAXNUM}) {
+ setOperationAction(Op, MVT::f16, Promote);
+ setOperationAction(Op, MVT::f32, Legal);
+ setOperationAction(Op, MVT::f64, Legal);
+ setOperationAction(Op, MVT::v2f16, Expand);
+ }
+ setOperationAction(ISD::FMINNUM, MVT::f16, Promote);
+ setOperationAction(ISD::FMAXNUM, MVT::f16, Promote);
+ setOperationAction(ISD::FMINNAN, MVT::f16, Promote);
+ setOperationAction(ISD::FMAXNAN, MVT::f16, Promote);
// No FEXP2, FLOG2. The PTX ex2 and log2 functions are always approximate.
// No FPOW or FREM in PTX.
@@ -434,6 +681,8 @@ const char *NVPTXTargetLowering::getTargetNodeName(unsigned Opcode) const {
return "NVPTXISD::FUN_SHFR_CLAMP";
case NVPTXISD::IMAD:
return "NVPTXISD::IMAD";
+ case NVPTXISD::SETP_F16X2:
+ return "NVPTXISD::SETP_F16X2";
case NVPTXISD::Dummy:
return "NVPTXISD::Dummy";
case NVPTXISD::MUL_WIDE_SIGNED:
@@ -932,10 +1181,60 @@ TargetLoweringBase::LegalizeTypeAction
NVPTXTargetLowering::getPreferredVectorAction(EVT VT) const {
if (VT.getVectorNumElements() != 1 && VT.getScalarType() == MVT::i1)
return TypeSplitVector;
-
+ if (VT == MVT::v2f16)
+ return TypeLegal;
return TargetLoweringBase::getPreferredVectorAction(VT);
}
+SDValue NVPTXTargetLowering::getSqrtEstimate(SDValue Operand, SelectionDAG &DAG,
+ int Enabled, int &ExtraSteps,
+ bool &UseOneConst,
+ bool Reciprocal) const {
+ if (!(Enabled == ReciprocalEstimate::Enabled ||
+ (Enabled == ReciprocalEstimate::Unspecified && !usePrecSqrtF32())))
+ return SDValue();
+
+ if (ExtraSteps == ReciprocalEstimate::Unspecified)
+ ExtraSteps = 0;
+
+ SDLoc DL(Operand);
+ EVT VT = Operand.getValueType();
+ bool Ftz = useF32FTZ(DAG.getMachineFunction());
+
+ auto MakeIntrinsicCall = [&](Intrinsic::ID IID) {
+ return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, VT,
+ DAG.getConstant(IID, DL, MVT::i32), Operand);
+ };
+
+ // The sqrt and rsqrt refinement processes assume we always start out with an
+ // approximation of the rsqrt. Therefore, if we're going to do any refinement
+ // (i.e. ExtraSteps > 0), we must return an rsqrt. But if we're *not* doing
+ // any refinement, we must return a regular sqrt.
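+  // The Newton-Raphson refinement steps themselves are inserted by the
+  // generic DAG combiner around the estimate we return here.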
+ if (Reciprocal || ExtraSteps > 0) {
+ if (VT == MVT::f32)
+ return MakeIntrinsicCall(Ftz ? Intrinsic::nvvm_rsqrt_approx_ftz_f
+ : Intrinsic::nvvm_rsqrt_approx_f);
+ else if (VT == MVT::f64)
+ return MakeIntrinsicCall(Intrinsic::nvvm_rsqrt_approx_d);
+ else
+ return SDValue();
+ } else {
+ if (VT == MVT::f32)
+ return MakeIntrinsicCall(Ftz ? Intrinsic::nvvm_sqrt_approx_ftz_f
+ : Intrinsic::nvvm_sqrt_approx_f);
+ else {
+ // There's no sqrt.approx.f64 instruction, so we emit
+ // reciprocal(rsqrt(x)). This is faster than
+ // select(x == 0, 0, x * rsqrt(x)). (In fact, it's faster than plain
+ // x * rsqrt(x).)
+ return DAG.getNode(
+ ISD::INTRINSIC_WO_CHAIN, DL, VT,
+ DAG.getConstant(Intrinsic::nvvm_rcp_approx_ftz_d, DL, MVT::i32),
+ MakeIntrinsicCall(Intrinsic::nvvm_rsqrt_approx_d));
+ }
+ }
+}
+
SDValue
NVPTXTargetLowering::LowerGlobalAddress(SDValue Op, SelectionDAG &DAG) const {
SDLoc dl(Op);
@@ -967,19 +1266,21 @@ std::string NVPTXTargetLowering::getPrototype(
unsigned size = 0;
if (auto *ITy = dyn_cast<IntegerType>(retTy)) {
size = ITy->getBitWidth();
- if (size < 32)
- size = 32;
} else {
assert(retTy->isFloatingPointTy() &&
"Floating point type expected here");
size = retTy->getPrimitiveSizeInBits();
}
+ // PTX ABI requires all scalar return values to be at least 32
+ // bits in size. fp16 normally uses .b16 as its storage type in
+ // PTX, so its size must be adjusted here, too.
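+      // E.g. an i8 or half return value is declared as ".param .b32 _".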
+ if (size < 32)
+ size = 32;
O << ".param .b" << size << " _";
} else if (isa<PointerType>(retTy)) {
O << ".param .b" << PtrVT.getSizeInBits() << " _";
- } else if ((retTy->getTypeID() == Type::StructTyID) ||
- isa<VectorType>(retTy)) {
+ } else if (retTy->isAggregateType() || retTy->isVectorTy()) {
auto &DL = CS->getCalledFunction()->getParent()->getDataLayout();
O << ".param .align " << retAlignment << " .b8 _["
<< DL.getTypeAllocSize(retTy) << "]";
@@ -1018,7 +1319,7 @@ std::string NVPTXTargetLowering::getPrototype(
OIdx += len - 1;
continue;
}
- // i8 types in IR will be i16 types in SDAG
+ // i8 types in IR will be i16 types in SDAG
assert((getValueType(DL, Ty) == Outs[OIdx].VT ||
(getValueType(DL, Ty) == MVT::i8 && Outs[OIdx].VT == MVT::i16)) &&
"type mismatch between callee prototype and arguments");
@@ -1028,8 +1329,13 @@ std::string NVPTXTargetLowering::getPrototype(
sz = cast<IntegerType>(Ty)->getBitWidth();
if (sz < 32)
sz = 32;
- } else if (isa<PointerType>(Ty))
+ } else if (isa<PointerType>(Ty)) {
sz = PtrVT.getSizeInBits();
+ } else if (Ty->isHalfTy())
+ // PTX ABI requires all scalar parameters to be at least 32
+ // bits in size. fp16 normally uses .b16 as its storage type
+ // in PTX, so its size must be adjusted here, too.
+ sz = 32;
else
sz = Ty->getPrimitiveSizeInBits();
O << ".param .b" << sz << " ";
@@ -1113,21 +1419,18 @@ SDValue NVPTXTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
SDValue Callee = CLI.Callee;
bool &isTailCall = CLI.IsTailCall;
ArgListTy &Args = CLI.getArgs();
- Type *retTy = CLI.RetTy;
+ Type *RetTy = CLI.RetTy;
ImmutableCallSite *CS = CLI.CS;
+ const DataLayout &DL = DAG.getDataLayout();
bool isABI = (STI.getSmVersion() >= 20);
assert(isABI && "Non-ABI compilation is not supported");
if (!isABI)
return Chain;
- MachineFunction &MF = DAG.getMachineFunction();
- const Function *F = MF.getFunction();
- auto &DL = MF.getDataLayout();
SDValue tempChain = Chain;
- Chain = DAG.getCALLSEQ_START(Chain,
- DAG.getIntPtrConstant(uniqueCallSite, dl, true),
- dl);
+ Chain = DAG.getCALLSEQ_START(
+ Chain, DAG.getIntPtrConstant(uniqueCallSite, dl, true), dl);
SDValue InFlag = Chain.getValue(1);
unsigned paramCount = 0;
@@ -1148,240 +1451,124 @@ SDValue NVPTXTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
Type *Ty = Args[i].Ty;
if (!Outs[OIdx].Flags.isByVal()) {
- if (Ty->isAggregateType()) {
- // aggregate
- SmallVector<EVT, 16> vtparts;
- SmallVector<uint64_t, 16> Offsets;
- ComputePTXValueVTs(*this, DAG.getDataLayout(), Ty, vtparts, &Offsets,
- 0);
-
- unsigned align =
- getArgumentAlignment(Callee, CS, Ty, paramCount + 1, DL);
+ SmallVector<EVT, 16> VTs;
+ SmallVector<uint64_t, 16> Offsets;
+ ComputePTXValueVTs(*this, DL, Ty, VTs, &Offsets);
+ unsigned ArgAlign =
+ getArgumentAlignment(Callee, CS, Ty, paramCount + 1, DL);
+ unsigned AllocSize = DL.getTypeAllocSize(Ty);
+ SDVTList DeclareParamVTs = DAG.getVTList(MVT::Other, MVT::Glue);
+ bool NeedAlign; // Does argument declaration specify alignment?
+ if (Ty->isAggregateType() || Ty->isVectorTy()) {
// declare .param .align <align> .b8 .param<n>[<size>];
- unsigned sz = DL.getTypeAllocSize(Ty);
- SDVTList DeclareParamVTs = DAG.getVTList(MVT::Other, MVT::Glue);
- SDValue DeclareParamOps[] = { Chain, DAG.getConstant(align, dl,
- MVT::i32),
- DAG.getConstant(paramCount, dl, MVT::i32),
- DAG.getConstant(sz, dl, MVT::i32),
- InFlag };
+ SDValue DeclareParamOps[] = {
+ Chain, DAG.getConstant(ArgAlign, dl, MVT::i32),
+ DAG.getConstant(paramCount, dl, MVT::i32),
+ DAG.getConstant(AllocSize, dl, MVT::i32), InFlag};
Chain = DAG.getNode(NVPTXISD::DeclareParam, dl, DeclareParamVTs,
DeclareParamOps);
- InFlag = Chain.getValue(1);
- for (unsigned j = 0, je = vtparts.size(); j != je; ++j) {
- EVT elemtype = vtparts[j];
- unsigned ArgAlign = GreatestCommonDivisor64(align, Offsets[j]);
- if (elemtype.isInteger() && (sz < 8))
- sz = 8;
- SDValue StVal = OutVals[OIdx];
- if (elemtype.getSizeInBits() < 16) {
- StVal = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i16, StVal);
- }
- SDVTList CopyParamVTs = DAG.getVTList(MVT::Other, MVT::Glue);
- SDValue CopyParamOps[] = { Chain,
- DAG.getConstant(paramCount, dl, MVT::i32),
- DAG.getConstant(Offsets[j], dl, MVT::i32),
- StVal, InFlag };
- Chain = DAG.getMemIntrinsicNode(NVPTXISD::StoreParam, dl,
- CopyParamVTs, CopyParamOps,
- elemtype, MachinePointerInfo(),
- ArgAlign);
- InFlag = Chain.getValue(1);
- ++OIdx;
+ NeedAlign = true;
+ } else {
+ // declare .param .b<size> .param<n>;
+ if ((VT.isInteger() || VT.isFloatingPoint()) && AllocSize < 4) {
+ // PTX ABI requires integral types to be at least 32 bits in
+ // size. FP16 is loaded/stored using i16, so it's handled
+ // here as well.
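+          // E.g. an i8 or f16 argument is declared as ".param .b32".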
+ AllocSize = 4;
}
- if (vtparts.size() > 0)
- --OIdx;
- ++paramCount;
- continue;
+ SDValue DeclareScalarParamOps[] = {
+ Chain, DAG.getConstant(paramCount, dl, MVT::i32),
+ DAG.getConstant(AllocSize * 8, dl, MVT::i32),
+ DAG.getConstant(0, dl, MVT::i32), InFlag};
+ Chain = DAG.getNode(NVPTXISD::DeclareScalarParam, dl, DeclareParamVTs,
+ DeclareScalarParamOps);
+ NeedAlign = false;
}
- if (Ty->isVectorTy()) {
- EVT ObjectVT = getValueType(DL, Ty);
- unsigned align =
- getArgumentAlignment(Callee, CS, Ty, paramCount + 1, DL);
- // declare .param .align <align> .b8 .param<n>[<size>];
- unsigned sz = DL.getTypeAllocSize(Ty);
- SDVTList DeclareParamVTs = DAG.getVTList(MVT::Other, MVT::Glue);
- SDValue DeclareParamOps[] = { Chain,
- DAG.getConstant(align, dl, MVT::i32),
- DAG.getConstant(paramCount, dl, MVT::i32),
- DAG.getConstant(sz, dl, MVT::i32),
- InFlag };
- Chain = DAG.getNode(NVPTXISD::DeclareParam, dl, DeclareParamVTs,
- DeclareParamOps);
- InFlag = Chain.getValue(1);
- unsigned NumElts = ObjectVT.getVectorNumElements();
- EVT EltVT = ObjectVT.getVectorElementType();
- EVT MemVT = EltVT;
- bool NeedExtend = false;
- if (EltVT.getSizeInBits() < 16) {
- NeedExtend = true;
- EltVT = MVT::i16;
+ InFlag = Chain.getValue(1);
+
+ // PTX Interoperability Guide 3.3(A): [Integer] Values shorter
+ // than 32-bits are sign extended or zero extended, depending on
+ // whether they are signed or unsigned types. This case applies
+ // only to scalar parameters and not to aggregate values.
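+      // E.g. a scalar i8 argument is extended to i32 before it is stored.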
+ bool ExtendIntegerParam =
+ Ty->isIntegerTy() && DL.getTypeAllocSizeInBits(Ty) < 32;
+
+ auto VectorInfo = VectorizePTXValueVTs(VTs, Offsets, ArgAlign);
+ SmallVector<SDValue, 6> StoreOperands;
+ for (unsigned j = 0, je = VTs.size(); j != je; ++j) {
+ // New store.
+ if (VectorInfo[j] & PVF_FIRST) {
+          assert(StoreOperands.empty() && "Unfinished preceding store.");
+ StoreOperands.push_back(Chain);
+ StoreOperands.push_back(DAG.getConstant(paramCount, dl, MVT::i32));
+ StoreOperands.push_back(DAG.getConstant(Offsets[j], dl, MVT::i32));
}
- // V1 store
- if (NumElts == 1) {
- SDValue Elt = OutVals[OIdx++];
- if (NeedExtend)
- Elt = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i16, Elt);
-
- SDVTList CopyParamVTs = DAG.getVTList(MVT::Other, MVT::Glue);
- SDValue CopyParamOps[] = { Chain,
- DAG.getConstant(paramCount, dl, MVT::i32),
- DAG.getConstant(0, dl, MVT::i32), Elt,
- InFlag };
- Chain = DAG.getMemIntrinsicNode(NVPTXISD::StoreParam, dl,
- CopyParamVTs, CopyParamOps,
- MemVT, MachinePointerInfo());
- InFlag = Chain.getValue(1);
- } else if (NumElts == 2) {
- SDValue Elt0 = OutVals[OIdx++];
- SDValue Elt1 = OutVals[OIdx++];
- if (NeedExtend) {
- Elt0 = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i16, Elt0);
- Elt1 = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i16, Elt1);
+ EVT EltVT = VTs[j];
+ SDValue StVal = OutVals[OIdx];
+ if (ExtendIntegerParam) {
+ assert(VTs.size() == 1 && "Scalar can't have multiple parts.");
+ // zext/sext to i32
+ StVal = DAG.getNode(Outs[OIdx].Flags.isSExt() ? ISD::SIGN_EXTEND
+ : ISD::ZERO_EXTEND,
+ dl, MVT::i32, StVal);
+ } else if (EltVT.getSizeInBits() < 16) {
+            // Use 16-bit registers for small stores, as that is the
+            // smallest general-purpose register size supported by NVPTX.
+ StVal = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i16, StVal);
+ }
+
+ // Record the value to store.
+ StoreOperands.push_back(StVal);
+
+ if (VectorInfo[j] & PVF_LAST) {
+ unsigned NumElts = StoreOperands.size() - 3;
+ NVPTXISD::NodeType Op;
+ switch (NumElts) {
+ case 1:
+ Op = NVPTXISD::StoreParam;
+ break;
+ case 2:
+ Op = NVPTXISD::StoreParamV2;
+ break;
+ case 4:
+ Op = NVPTXISD::StoreParamV4;
+ break;
+ default:
+ llvm_unreachable("Invalid vector info.");
}
- SDVTList CopyParamVTs = DAG.getVTList(MVT::Other, MVT::Glue);
- SDValue CopyParamOps[] = { Chain,
- DAG.getConstant(paramCount, dl, MVT::i32),
- DAG.getConstant(0, dl, MVT::i32), Elt0,
- Elt1, InFlag };
- Chain = DAG.getMemIntrinsicNode(NVPTXISD::StoreParamV2, dl,
- CopyParamVTs, CopyParamOps,
- MemVT, MachinePointerInfo());
- InFlag = Chain.getValue(1);
- } else {
- unsigned curOffset = 0;
- // V4 stores
- // We have at least 4 elements (<3 x Ty> expands to 4 elements) and
- // the
- // vector will be expanded to a power of 2 elements, so we know we can
- // always round up to the next multiple of 4 when creating the vector
- // stores.
- // e.g. 4 elem => 1 st.v4
- // 6 elem => 2 st.v4
- // 8 elem => 2 st.v4
- // 11 elem => 3 st.v4
- unsigned VecSize = 4;
- if (EltVT.getSizeInBits() == 64)
- VecSize = 2;
-
- // This is potentially only part of a vector, so assume all elements
- // are packed together.
- unsigned PerStoreOffset = MemVT.getStoreSizeInBits() / 8 * VecSize;
-
- for (unsigned i = 0; i < NumElts; i += VecSize) {
- // Get values
- SDValue StoreVal;
- SmallVector<SDValue, 8> Ops;
- Ops.push_back(Chain);
- Ops.push_back(DAG.getConstant(paramCount, dl, MVT::i32));
- Ops.push_back(DAG.getConstant(curOffset, dl, MVT::i32));
-
- unsigned Opc = NVPTXISD::StoreParamV2;
-
- StoreVal = OutVals[OIdx++];
- if (NeedExtend)
- StoreVal = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i16, StoreVal);
- Ops.push_back(StoreVal);
-
- if (i + 1 < NumElts) {
- StoreVal = OutVals[OIdx++];
- if (NeedExtend)
- StoreVal =
- DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i16, StoreVal);
- } else {
- StoreVal = DAG.getUNDEF(EltVT);
- }
- Ops.push_back(StoreVal);
-
- if (VecSize == 4) {
- Opc = NVPTXISD::StoreParamV4;
- if (i + 2 < NumElts) {
- StoreVal = OutVals[OIdx++];
- if (NeedExtend)
- StoreVal =
- DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i16, StoreVal);
- } else {
- StoreVal = DAG.getUNDEF(EltVT);
- }
- Ops.push_back(StoreVal);
-
- if (i + 3 < NumElts) {
- StoreVal = OutVals[OIdx++];
- if (NeedExtend)
- StoreVal =
- DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i16, StoreVal);
- } else {
- StoreVal = DAG.getUNDEF(EltVT);
- }
- Ops.push_back(StoreVal);
- }
+ StoreOperands.push_back(InFlag);
- Ops.push_back(InFlag);
+          // Adjust type of the store op if we've extended the scalar
+          // parameter value.
+ EVT TheStoreType = ExtendIntegerParam ? MVT::i32 : VTs[j];
+ unsigned EltAlign =
+ NeedAlign ? GreatestCommonDivisor64(ArgAlign, Offsets[j]) : 0;
- SDVTList CopyParamVTs = DAG.getVTList(MVT::Other, MVT::Glue);
- Chain = DAG.getMemIntrinsicNode(Opc, dl, CopyParamVTs, Ops,
- MemVT, MachinePointerInfo());
- InFlag = Chain.getValue(1);
- curOffset += PerStoreOffset;
- }
+ Chain = DAG.getMemIntrinsicNode(
+ Op, dl, DAG.getVTList(MVT::Other, MVT::Glue), StoreOperands,
+ TheStoreType, MachinePointerInfo(), EltAlign);
+ InFlag = Chain.getValue(1);
+
+ // Cleanup.
+ StoreOperands.clear();
}
- ++paramCount;
- --OIdx;
- continue;
- }
- // Plain scalar
- // for ABI, declare .param .b<size> .param<n>;
- unsigned sz = VT.getSizeInBits();
- bool needExtend = false;
- if (VT.isInteger()) {
- if (sz < 16)
- needExtend = true;
- if (sz < 32)
- sz = 32;
+ ++OIdx;
}
- SDVTList DeclareParamVTs = DAG.getVTList(MVT::Other, MVT::Glue);
- SDValue DeclareParamOps[] = { Chain,
- DAG.getConstant(paramCount, dl, MVT::i32),
- DAG.getConstant(sz, dl, MVT::i32),
- DAG.getConstant(0, dl, MVT::i32), InFlag };
- Chain = DAG.getNode(NVPTXISD::DeclareScalarParam, dl, DeclareParamVTs,
- DeclareParamOps);
- InFlag = Chain.getValue(1);
- SDValue OutV = OutVals[OIdx];
- if (needExtend) {
- // zext/sext i1 to i16
- unsigned opc = ISD::ZERO_EXTEND;
- if (Outs[OIdx].Flags.isSExt())
- opc = ISD::SIGN_EXTEND;
- OutV = DAG.getNode(opc, dl, MVT::i16, OutV);
- }
- SDVTList CopyParamVTs = DAG.getVTList(MVT::Other, MVT::Glue);
- SDValue CopyParamOps[] = { Chain,
- DAG.getConstant(paramCount, dl, MVT::i32),
- DAG.getConstant(0, dl, MVT::i32), OutV,
- InFlag };
-
- unsigned opcode = NVPTXISD::StoreParam;
- if (Outs[OIdx].Flags.isZExt() && VT.getSizeInBits() < 32)
- opcode = NVPTXISD::StoreParamU32;
- else if (Outs[OIdx].Flags.isSExt() && VT.getSizeInBits() < 32)
- opcode = NVPTXISD::StoreParamS32;
- Chain = DAG.getMemIntrinsicNode(opcode, dl, CopyParamVTs, CopyParamOps,
- VT, MachinePointerInfo());
-
- InFlag = Chain.getValue(1);
+ assert(StoreOperands.empty() && "Unfinished parameter store.");
+ if (VTs.size() > 0)
+ --OIdx;
++paramCount;
continue;
}
- // struct or vector
- SmallVector<EVT, 16> vtparts;
+
+ // ByVal arguments
+ SmallVector<EVT, 16> VTs;
SmallVector<uint64_t, 16> Offsets;
auto *PTy = dyn_cast<PointerType>(Args[i].Ty);
assert(PTy && "Type of a byval parameter should be pointer");
- ComputePTXValueVTs(*this, DAG.getDataLayout(), PTy->getElementType(),
- vtparts, &Offsets, 0);
+ ComputePTXValueVTs(*this, DL, PTy->getElementType(), VTs, &Offsets, 0);
// declare .param .align <align> .b8 .param<n>[<size>];
unsigned sz = Outs[OIdx].Flags.getByValSize();
@@ -1402,11 +1589,11 @@ SDValue NVPTXTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
Chain = DAG.getNode(NVPTXISD::DeclareParam, dl, DeclareParamVTs,
DeclareParamOps);
InFlag = Chain.getValue(1);
- for (unsigned j = 0, je = vtparts.size(); j != je; ++j) {
- EVT elemtype = vtparts[j];
+ for (unsigned j = 0, je = VTs.size(); j != je; ++j) {
+ EVT elemtype = VTs[j];
int curOffset = Offsets[j];
unsigned PartAlign = GreatestCommonDivisor64(ArgAlign, curOffset);
- auto PtrVT = getPointerTy(DAG.getDataLayout());
+ auto PtrVT = getPointerTy(DL);
SDValue srcAddr = DAG.getNode(ISD::ADD, dl, PtrVT, OutVals[OIdx],
DAG.getConstant(curOffset, dl, PtrVT));
SDValue theVal = DAG.getLoad(elemtype, dl, tempChain, srcAddr,
@@ -1434,18 +1621,18 @@ SDValue NVPTXTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
// Handle Result
if (Ins.size() > 0) {
SmallVector<EVT, 16> resvtparts;
- ComputeValueVTs(*this, DL, retTy, resvtparts);
+ ComputeValueVTs(*this, DL, RetTy, resvtparts);
// Declare
// .param .align 16 .b8 retval0[<size-in-bytes>], or
// .param .b<size-in-bits> retval0
- unsigned resultsz = DL.getTypeAllocSizeInBits(retTy);
+ unsigned resultsz = DL.getTypeAllocSizeInBits(RetTy);
// Emit ".param .b<size-in-bits> retval0" instead of byte arrays only for
// these three types to match the logic in
// NVPTXAsmPrinter::printReturnValStr and NVPTXTargetLowering::getPrototype.
// Plus, this behavior is consistent with nvcc's.
- if (retTy->isFloatingPointTy() || retTy->isIntegerTy() ||
- retTy->isPointerTy()) {
+ if (RetTy->isFloatingPointTy() || RetTy->isIntegerTy() ||
+ RetTy->isPointerTy()) {
// Scalar needs to be at least 32bit wide
if (resultsz < 32)
resultsz = 32;
@@ -1457,7 +1644,7 @@ SDValue NVPTXTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
DeclareRetOps);
InFlag = Chain.getValue(1);
} else {
- retAlignment = getArgumentAlignment(Callee, CS, retTy, 0, DL);
+ retAlignment = getArgumentAlignment(Callee, CS, RetTy, 0, DL);
SDVTList DeclareRetVTs = DAG.getVTList(MVT::Other, MVT::Glue);
SDValue DeclareRetOps[] = { Chain,
DAG.getConstant(retAlignment, dl, MVT::i32),
@@ -1478,8 +1665,7 @@ SDValue NVPTXTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
// The prototype is embedded in a string and put as the operand for a
// CallPrototype SDNode which will print out to the value of the string.
SDVTList ProtoVTs = DAG.getVTList(MVT::Other, MVT::Glue);
- std::string Proto =
- getPrototype(DAG.getDataLayout(), retTy, Args, Outs, retAlignment, CS);
+ std::string Proto = getPrototype(DL, RetTy, Args, Outs, retAlignment, CS);
const char *ProtoStr =
nvTM->getManagedStrPool()->getManagedString(Proto.c_str())->c_str();
SDValue ProtoOps[] = {
@@ -1544,175 +1730,84 @@ SDValue NVPTXTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
// Generate loads from param memory/moves from registers for result
if (Ins.size() > 0) {
- if (retTy && retTy->isVectorTy()) {
- EVT ObjectVT = getValueType(DL, retTy);
- unsigned NumElts = ObjectVT.getVectorNumElements();
- EVT EltVT = ObjectVT.getVectorElementType();
- assert(STI.getTargetLowering()->getNumRegisters(F->getContext(),
- ObjectVT) == NumElts &&
- "Vector was not scalarized");
- unsigned sz = EltVT.getSizeInBits();
- bool needTruncate = sz < 8;
-
- if (NumElts == 1) {
- // Just a simple load
- SmallVector<EVT, 4> LoadRetVTs;
- if (EltVT == MVT::i1 || EltVT == MVT::i8) {
- // If loading i1/i8 result, generate
- // load.b8 i16
- // if i1
- // trunc i16 to i1
- LoadRetVTs.push_back(MVT::i16);
- } else
- LoadRetVTs.push_back(EltVT);
- LoadRetVTs.push_back(MVT::Other);
- LoadRetVTs.push_back(MVT::Glue);
- SDValue LoadRetOps[] = {Chain, DAG.getConstant(1, dl, MVT::i32),
- DAG.getConstant(0, dl, MVT::i32), InFlag};
- SDValue retval = DAG.getMemIntrinsicNode(
- NVPTXISD::LoadParam, dl,
- DAG.getVTList(LoadRetVTs), LoadRetOps, EltVT, MachinePointerInfo());
- Chain = retval.getValue(1);
- InFlag = retval.getValue(2);
- SDValue Ret0 = retval;
- if (needTruncate)
- Ret0 = DAG.getNode(ISD::TRUNCATE, dl, EltVT, Ret0);
- InVals.push_back(Ret0);
- } else if (NumElts == 2) {
- // LoadV2
- SmallVector<EVT, 4> LoadRetVTs;
- if (EltVT == MVT::i1 || EltVT == MVT::i8) {
- // If loading i1/i8 result, generate
- // load.b8 i16
- // if i1
- // trunc i16 to i1
- LoadRetVTs.push_back(MVT::i16);
- LoadRetVTs.push_back(MVT::i16);
- } else {
- LoadRetVTs.push_back(EltVT);
- LoadRetVTs.push_back(EltVT);
- }
- LoadRetVTs.push_back(MVT::Other);
- LoadRetVTs.push_back(MVT::Glue);
- SDValue LoadRetOps[] = {Chain, DAG.getConstant(1, dl, MVT::i32),
- DAG.getConstant(0, dl, MVT::i32), InFlag};
- SDValue retval = DAG.getMemIntrinsicNode(
- NVPTXISD::LoadParamV2, dl,
- DAG.getVTList(LoadRetVTs), LoadRetOps, EltVT, MachinePointerInfo());
- Chain = retval.getValue(2);
- InFlag = retval.getValue(3);
- SDValue Ret0 = retval.getValue(0);
- SDValue Ret1 = retval.getValue(1);
- if (needTruncate) {
- Ret0 = DAG.getNode(ISD::TRUNCATE, dl, MVT::i1, Ret0);
- InVals.push_back(Ret0);
- Ret1 = DAG.getNode(ISD::TRUNCATE, dl, MVT::i1, Ret1);
- InVals.push_back(Ret1);
- } else {
- InVals.push_back(Ret0);
- InVals.push_back(Ret1);
- }
- } else {
- // Split into N LoadV4
- unsigned Ofst = 0;
- unsigned VecSize = 4;
- unsigned Opc = NVPTXISD::LoadParamV4;
- if (EltVT.getSizeInBits() == 64) {
- VecSize = 2;
- Opc = NVPTXISD::LoadParamV2;
- }
- EVT VecVT = EVT::getVectorVT(F->getContext(), EltVT, VecSize);
- for (unsigned i = 0; i < NumElts; i += VecSize) {
- SmallVector<EVT, 8> LoadRetVTs;
- if (EltVT == MVT::i1 || EltVT == MVT::i8) {
- // If loading i1/i8 result, generate
- // load.b8 i16
- // if i1
- // trunc i16 to i1
- for (unsigned j = 0; j < VecSize; ++j)
- LoadRetVTs.push_back(MVT::i16);
- } else {
- for (unsigned j = 0; j < VecSize; ++j)
- LoadRetVTs.push_back(EltVT);
- }
- LoadRetVTs.push_back(MVT::Other);
- LoadRetVTs.push_back(MVT::Glue);
- SDValue LoadRetOps[] = {Chain, DAG.getConstant(1, dl, MVT::i32),
- DAG.getConstant(Ofst, dl, MVT::i32), InFlag};
- SDValue retval = DAG.getMemIntrinsicNode(
- Opc, dl, DAG.getVTList(LoadRetVTs),
- LoadRetOps, EltVT, MachinePointerInfo());
- if (VecSize == 2) {
- Chain = retval.getValue(2);
- InFlag = retval.getValue(3);
- } else {
- Chain = retval.getValue(4);
- InFlag = retval.getValue(5);
- }
+ SmallVector<EVT, 16> VTs;
+ SmallVector<uint64_t, 16> Offsets;
+ ComputePTXValueVTs(*this, DL, RetTy, VTs, &Offsets, 0);
+ assert(VTs.size() == Ins.size() && "Bad value decomposition");
+
+ unsigned RetAlign = getArgumentAlignment(Callee, CS, RetTy, 0, DL);
+ auto VectorInfo = VectorizePTXValueVTs(VTs, Offsets, RetAlign);
+
+ SmallVector<EVT, 6> LoadVTs;
+ int VecIdx = -1; // Index of the first element of the vector.
+
+ // PTX Interoperability Guide 3.3(A): [Integer] Values shorter than
+ // 32-bits are sign extended or zero extended, depending on whether
+ // they are signed or unsigned types.
+ bool ExtendIntegerRetVal =
+ RetTy->isIntegerTy() && DL.getTypeAllocSizeInBits(RetTy) < 32;
+
+ for (unsigned i = 0, e = VTs.size(); i != e; ++i) {
+ bool needTruncate = false;
+ EVT TheLoadType = VTs[i];
+ EVT EltType = Ins[i].VT;
+ unsigned EltAlign = GreatestCommonDivisor64(RetAlign, Offsets[i]);
+ if (ExtendIntegerRetVal) {
+ TheLoadType = MVT::i32;
+ EltType = MVT::i32;
+ needTruncate = true;
+ } else if (TheLoadType.getSizeInBits() < 16) {
+ if (VTs[i].isInteger())
+ needTruncate = true;
+ EltType = MVT::i16;
+ }
- for (unsigned j = 0; j < VecSize; ++j) {
- if (i + j >= NumElts)
- break;
- SDValue Elt = retval.getValue(j);
- if (needTruncate)
- Elt = DAG.getNode(ISD::TRUNCATE, dl, EltVT, Elt);
- InVals.push_back(Elt);
- }
- Ofst += DL.getTypeAllocSize(VecVT.getTypeForEVT(F->getContext()));
- }
+ // Record index of the very first element of the vector.
+ if (VectorInfo[i] & PVF_FIRST) {
+ assert(VecIdx == -1 && LoadVTs.empty() && "Orphaned operand list.");
+ VecIdx = i;
}
- } else {
- SmallVector<EVT, 16> VTs;
- SmallVector<uint64_t, 16> Offsets;
- auto &DL = DAG.getDataLayout();
- ComputePTXValueVTs(*this, DL, retTy, VTs, &Offsets, 0);
- assert(VTs.size() == Ins.size() && "Bad value decomposition");
- unsigned RetAlign = getArgumentAlignment(Callee, CS, retTy, 0, DL);
- for (unsigned i = 0, e = Ins.size(); i != e; ++i) {
- unsigned sz = VTs[i].getSizeInBits();
- unsigned AlignI = GreatestCommonDivisor64(RetAlign, Offsets[i]);
- bool needTruncate = false;
- if (VTs[i].isInteger() && sz < 8) {
- sz = 8;
- needTruncate = true;
+
+ LoadVTs.push_back(EltType);
+
+ if (VectorInfo[i] & PVF_LAST) {
+ unsigned NumElts = LoadVTs.size();
+ LoadVTs.push_back(MVT::Other);
+ LoadVTs.push_back(MVT::Glue);
+ NVPTXISD::NodeType Op;
+ switch (NumElts) {
+ case 1:
+ Op = NVPTXISD::LoadParam;
+ break;
+ case 2:
+ Op = NVPTXISD::LoadParamV2;
+ break;
+ case 4:
+ Op = NVPTXISD::LoadParamV4;
+ break;
+ default:
+ llvm_unreachable("Invalid vector info.");
}
- SmallVector<EVT, 4> LoadRetVTs;
- EVT TheLoadType = VTs[i];
- if (retTy->isIntegerTy() && DL.getTypeAllocSizeInBits(retTy) < 32) {
- // This is for integer types only, and specifically not for
- // aggregates.
- LoadRetVTs.push_back(MVT::i32);
- TheLoadType = MVT::i32;
- needTruncate = true;
- } else if (sz < 16) {
- // If loading i1/i8 result, generate
- // load i8 (-> i16)
- // trunc i16 to i1/i8
-
- // FIXME: Do we need to set needTruncate to true here, too? We could
- // not figure out what this branch is for in D17872, so we left it
- // alone. The comment above about loading i1/i8 may be wrong, as the
- // branch above seems to cover integers of size < 32.
- LoadRetVTs.push_back(MVT::i16);
- } else
- LoadRetVTs.push_back(Ins[i].VT);
- LoadRetVTs.push_back(MVT::Other);
- LoadRetVTs.push_back(MVT::Glue);
-
- SDValue LoadRetOps[] = {Chain, DAG.getConstant(1, dl, MVT::i32),
- DAG.getConstant(Offsets[i], dl, MVT::i32),
- InFlag};
- SDValue retval = DAG.getMemIntrinsicNode(
- NVPTXISD::LoadParam, dl,
- DAG.getVTList(LoadRetVTs), LoadRetOps,
- TheLoadType, MachinePointerInfo(), AlignI);
- Chain = retval.getValue(1);
- InFlag = retval.getValue(2);
- SDValue Ret0 = retval.getValue(0);
- if (needTruncate)
- Ret0 = DAG.getNode(ISD::TRUNCATE, dl, Ins[i].VT, Ret0);
- InVals.push_back(Ret0);
+ SDValue LoadOperands[] = {
+ Chain, DAG.getConstant(1, dl, MVT::i32),
+ DAG.getConstant(Offsets[VecIdx], dl, MVT::i32), InFlag};
+ SDValue RetVal = DAG.getMemIntrinsicNode(
+ Op, dl, DAG.getVTList(LoadVTs), LoadOperands, TheLoadType,
+ MachinePointerInfo(), EltAlign);
+
+ for (unsigned j = 0; j < NumElts; ++j) {
+ SDValue Ret = RetVal.getValue(j);
+ if (needTruncate)
+ Ret = DAG.getNode(ISD::TRUNCATE, dl, Ins[VecIdx + j].VT, Ret);
+ InVals.push_back(Ret);
+ }
+ Chain = RetVal.getValue(NumElts);
+ InFlag = RetVal.getValue(NumElts + 1);
+
+ // Cleanup
+ VecIdx = -1;
+ LoadVTs.clear();
}
}
}
@@ -1752,6 +1847,55 @@ NVPTXTargetLowering::LowerCONCAT_VECTORS(SDValue Op, SelectionDAG &DAG) const {
return DAG.getBuildVector(Node->getValueType(0), dl, Ops);
}
+// We can initialize a constant f16x2 with a single .b32 move. Normally
+// it would get lowered as two constant loads and a vector-packing move:
+// mov.b16 %h1, 0x4000;
+// mov.b16 %h2, 0x3C00;
+// mov.b32 %hh2, {%h2, %h1};
+// Instead we want just a constant move:
+// mov.b32 %hh2, 0x40003C00
+//
+// This results in better SASS code with CUDA 7.x. Ptxas in CUDA 8.0
+// generates good SASS in both cases.
+SDValue NVPTXTargetLowering::LowerBUILD_VECTOR(SDValue Op,
+ SelectionDAG &DAG) const {
+ if (!(Op->getValueType(0) == MVT::v2f16 &&
+ isa<ConstantFPSDNode>(Op->getOperand(0)) &&
+ isa<ConstantFPSDNode>(Op->getOperand(1))))
+ return Op;
+
+ APInt E0 =
+ cast<ConstantFPSDNode>(Op->getOperand(0))->getValueAPF().bitcastToAPInt();
+ APInt E1 =
+ cast<ConstantFPSDNode>(Op->getOperand(1))->getValueAPF().bitcastToAPInt();
+ SDValue Const =
+ DAG.getConstant(E1.zext(32).shl(16) | E0.zext(32), SDLoc(Op), MVT::i32);
+ return DAG.getNode(ISD::BITCAST, SDLoc(Op), MVT::v2f16, Const);
+}
+
+SDValue NVPTXTargetLowering::LowerEXTRACT_VECTOR_ELT(SDValue Op,
+ SelectionDAG &DAG) const {
+ SDValue Index = Op->getOperand(1);
+ // Constant index will be matched by tablegen.
+ if (isa<ConstantSDNode>(Index.getNode()))
+ return Op;
+
+ // Extract individual elements and select one of them.
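+  // I.e. (extractelt v, i) becomes select(i == 0, v[0], v[1]).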
+ SDValue Vector = Op->getOperand(0);
+ EVT VectorVT = Vector.getValueType();
+ assert(VectorVT == MVT::v2f16 && "Unexpected vector type.");
+ EVT EltVT = VectorVT.getVectorElementType();
+
+ SDLoc dl(Op.getNode());
+ SDValue E0 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, EltVT, Vector,
+ DAG.getIntPtrConstant(0, dl));
+ SDValue E1 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, EltVT, Vector,
+ DAG.getIntPtrConstant(1, dl));
+ return DAG.getSelectCC(dl, Index, DAG.getIntPtrConstant(0, dl), E0, E1,
+ ISD::CondCode::SETEQ);
+}
+
/// LowerShiftRightParts - Lower SRL_PARTS, SRA_PARTS, which
/// 1) returns two i32 values and take a 2 x i32 value to shift plus a shift
/// amount, or
@@ -1885,8 +2029,11 @@ NVPTXTargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const {
case ISD::INTRINSIC_W_CHAIN:
return Op;
case ISD::BUILD_VECTOR:
+ return LowerBUILD_VECTOR(Op, DAG);
case ISD::EXTRACT_SUBVECTOR:
return Op;
+ case ISD::EXTRACT_VECTOR_ELT:
+ return LowerEXTRACT_VECTOR_ELT(Op, DAG);
case ISD::CONCAT_VECTORS:
return LowerCONCAT_VECTORS(Op, DAG);
case ISD::STORE:
@@ -1924,8 +2071,21 @@ SDValue NVPTXTargetLowering::LowerSelect(SDValue Op, SelectionDAG &DAG) const {
SDValue NVPTXTargetLowering::LowerLOAD(SDValue Op, SelectionDAG &DAG) const {
if (Op.getValueType() == MVT::i1)
return LowerLOADi1(Op, DAG);
- else
- return SDValue();
+
+  // v2f16 is legal, so we can't rely on the legalizer to handle
+  // unaligned loads; we have to handle them here.
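+  // E.g. a v2f16 load with alignment 2 is expanded into two scalar
+  // f16 loads here.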
+ if (Op.getValueType() == MVT::v2f16) {
+ LoadSDNode *Load = cast<LoadSDNode>(Op);
+ EVT MemVT = Load->getMemoryVT();
+ if (!allowsMemoryAccess(*DAG.getContext(), DAG.getDataLayout(), MemVT,
+ Load->getAddressSpace(), Load->getAlignment())) {
+ SDValue Ops[2];
+ std::tie(Ops[0], Ops[1]) = expandUnalignedLoad(Load, DAG);
+ return DAG.getMergeValues(Ops, SDLoc(Op));
+ }
+ }
+
+ return SDValue();
}
// v = ld i1* addr
@@ -1951,13 +2111,23 @@ SDValue NVPTXTargetLowering::LowerLOADi1(SDValue Op, SelectionDAG &DAG) const {
}
SDValue NVPTXTargetLowering::LowerSTORE(SDValue Op, SelectionDAG &DAG) const {
- EVT ValVT = Op.getOperand(1).getValueType();
- if (ValVT == MVT::i1)
+ StoreSDNode *Store = cast<StoreSDNode>(Op);
+ EVT VT = Store->getMemoryVT();
+
+ if (VT == MVT::i1)
return LowerSTOREi1(Op, DAG);
- else if (ValVT.isVector())
+
+  // v2f16 is legal, so we can't rely on the legalizer to handle
+  // unaligned stores; we have to handle them here.
+ if (VT == MVT::v2f16 &&
+ !allowsMemoryAccess(*DAG.getContext(), DAG.getDataLayout(), VT,
+ Store->getAddressSpace(), Store->getAlignment()))
+ return expandUnalignedStore(Store, DAG);
+
+ if (VT.isVector())
return LowerSTOREVector(Op, DAG);
- else
- return SDValue();
+
+ return SDValue();
}
SDValue
@@ -1980,12 +2150,15 @@ NVPTXTargetLowering::LowerSTOREVector(SDValue Op, SelectionDAG &DAG) const {
case MVT::v2i16:
case MVT::v2i32:
case MVT::v2i64:
+ case MVT::v2f16:
case MVT::v2f32:
case MVT::v2f64:
case MVT::v4i8:
case MVT::v4i16:
case MVT::v4i32:
+ case MVT::v4f16:
case MVT::v4f32:
+ case MVT::v8f16: // <4 x f16x2>
// This is a "native" vector type
break;
}
@@ -2016,6 +2189,7 @@ NVPTXTargetLowering::LowerSTOREVector(SDValue Op, SelectionDAG &DAG) const {
if (EltVT.getSizeInBits() < 16)
NeedExt = true;
+ bool StoreF16x2 = false;
switch (NumElts) {
default:
return SDValue();
@@ -2025,6 +2199,14 @@ NVPTXTargetLowering::LowerSTOREVector(SDValue Op, SelectionDAG &DAG) const {
case 4:
Opcode = NVPTXISD::StoreV4;
break;
+ case 8:
+    // v8f16 is a special case. PTX doesn't have an st.v8.f16
+    // instruction. Instead, we split the vector into v2f16 chunks and
+ // store them with st.v4.b32.
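+    // E.g. st.v4.b32 {r0, r1, r2, r3}, where each rN holds an f16x2 pair.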
+ assert(EltVT == MVT::f16 && "Wrong type for the vector.");
+ Opcode = NVPTXISD::StoreV4;
+ StoreF16x2 = true;
+ break;
}
SmallVector<SDValue, 8> Ops;
@@ -2032,23 +2214,36 @@ NVPTXTargetLowering::LowerSTOREVector(SDValue Op, SelectionDAG &DAG) const {
// First is the chain
Ops.push_back(N->getOperand(0));
- // Then the split values
- for (unsigned i = 0; i < NumElts; ++i) {
- SDValue ExtVal = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, EltVT, Val,
- DAG.getIntPtrConstant(i, DL));
- if (NeedExt)
- ExtVal = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i16, ExtVal);
- Ops.push_back(ExtVal);
+ if (StoreF16x2) {
+ // Combine f16,f16 -> v2f16
+ NumElts /= 2;
+ for (unsigned i = 0; i < NumElts; ++i) {
+ SDValue E0 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f16, Val,
+ DAG.getIntPtrConstant(i * 2, DL));
+ SDValue E1 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f16, Val,
+ DAG.getIntPtrConstant(i * 2 + 1, DL));
+ SDValue V2 = DAG.getNode(ISD::BUILD_VECTOR, DL, MVT::v2f16, E0, E1);
+ Ops.push_back(V2);
+ }
+ } else {
+ // Then the split values
+ for (unsigned i = 0; i < NumElts; ++i) {
+ SDValue ExtVal = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, EltVT, Val,
+ DAG.getIntPtrConstant(i, DL));
+ if (NeedExt)
+ ExtVal = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i16, ExtVal);
+ Ops.push_back(ExtVal);
+ }
}
// Then any remaining arguments
Ops.append(N->op_begin() + 2, N->op_end());
- SDValue NewSt = DAG.getMemIntrinsicNode(
- Opcode, DL, DAG.getVTList(MVT::Other), Ops,
- MemSD->getMemoryVT(), MemSD->getMemOperand());
+ SDValue NewSt =
+ DAG.getMemIntrinsicNode(Opcode, DL, DAG.getVTList(MVT::Other), Ops,
+ MemSD->getMemoryVT(), MemSD->getMemOperand());
- //return DCI.CombineTo(N, NewSt, true);
+ // return DCI.CombineTo(N, NewSt, true);
return NewSt;
}
@@ -2120,7 +2315,7 @@ SDValue NVPTXTargetLowering::LowerFormalArguments(
auto PtrVT = getPointerTy(DAG.getDataLayout());
const Function *F = MF.getFunction();
- const AttributeSet &PAL = F->getAttributes();
+ const AttributeList &PAL = F->getAttributes();
const TargetLowering *TLI = STI.getTargetLowering();
SDValue Root = DAG.getRoot();
@@ -2200,177 +2395,80 @@ SDValue NVPTXTargetLowering::LowerFormalArguments(
// to newly created nodes. The SDNodes for params have to
// appear in the same order as their order of appearance
// in the original function. "idx+1" holds that order.
- if (!PAL.hasAttribute(i + 1, Attribute::ByVal)) {
- if (Ty->isAggregateType()) {
- SmallVector<EVT, 16> vtparts;
- SmallVector<uint64_t, 16> offsets;
+ if (!PAL.hasParamAttribute(i, Attribute::ByVal)) {
+ bool aggregateIsPacked = false;
+ if (StructType *STy = dyn_cast<StructType>(Ty))
+ aggregateIsPacked = STy->isPacked();
- // NOTE: Here, we lose the ability to issue vector loads for vectors
- // that are a part of a struct. This should be investigated in the
- // future.
- ComputePTXValueVTs(*this, DAG.getDataLayout(), Ty, vtparts, &offsets,
- 0);
- assert(vtparts.size() > 0 && "empty aggregate type not expected");
- bool aggregateIsPacked = false;
- if (StructType *STy = dyn_cast<StructType>(Ty))
- aggregateIsPacked = STy->isPacked();
+ SmallVector<EVT, 16> VTs;
+ SmallVector<uint64_t, 16> Offsets;
+ ComputePTXValueVTs(*this, DL, Ty, VTs, &Offsets, 0);
+ assert(VTs.size() > 0 && "Unexpected empty type.");
+ auto VectorInfo =
+ VectorizePTXValueVTs(VTs, Offsets, DL.getABITypeAlignment(Ty));
- SDValue Arg = getParamSymbol(DAG, idx, PtrVT);
- for (unsigned parti = 0, parte = vtparts.size(); parti != parte;
- ++parti) {
- EVT partVT = vtparts[parti];
- Value *srcValue = Constant::getNullValue(
- PointerType::get(partVT.getTypeForEVT(F->getContext()),
- ADDRESS_SPACE_PARAM));
- SDValue srcAddr =
- DAG.getNode(ISD::ADD, dl, PtrVT, Arg,
- DAG.getConstant(offsets[parti], dl, PtrVT));
- unsigned partAlign = aggregateIsPacked
- ? 1
- : DL.getABITypeAlignment(
- partVT.getTypeForEVT(F->getContext()));
- SDValue p;
- if (Ins[InsIdx].VT.getSizeInBits() > partVT.getSizeInBits()) {
- ISD::LoadExtType ExtOp = Ins[InsIdx].Flags.isSExt() ?
- ISD::SEXTLOAD : ISD::ZEXTLOAD;
- p = DAG.getExtLoad(ExtOp, dl, Ins[InsIdx].VT, Root, srcAddr,
- MachinePointerInfo(srcValue), partVT, partAlign);
- } else {
- p = DAG.getLoad(partVT, dl, Root, srcAddr,
- MachinePointerInfo(srcValue), partAlign);
- }
- if (p.getNode())
- p.getNode()->setIROrder(idx + 1);
- InVals.push_back(p);
- ++InsIdx;
+ SDValue Arg = getParamSymbol(DAG, idx, PtrVT);
+ int VecIdx = -1; // Index of the first element of the current vector.
+ for (unsigned parti = 0, parte = VTs.size(); parti != parte; ++parti) {
+ if (VectorInfo[parti] & PVF_FIRST) {
+ assert(VecIdx == -1 && "Orphaned vector.");
+ VecIdx = parti;
}
- if (vtparts.size() > 0)
- --InsIdx;
- continue;
- }
- if (Ty->isVectorTy()) {
- EVT ObjectVT = getValueType(DL, Ty);
- SDValue Arg = getParamSymbol(DAG, idx, PtrVT);
- unsigned NumElts = ObjectVT.getVectorNumElements();
- assert(TLI->getNumRegisters(F->getContext(), ObjectVT) == NumElts &&
- "Vector was not scalarized");
- EVT EltVT = ObjectVT.getVectorElementType();
-
- // V1 load
- // f32 = load ...
- if (NumElts == 1) {
- // We only have one element, so just directly load it
- Value *SrcValue = Constant::getNullValue(PointerType::get(
- EltVT.getTypeForEVT(F->getContext()), ADDRESS_SPACE_PARAM));
- SDValue P = DAG.getLoad(
- EltVT, dl, Root, Arg, MachinePointerInfo(SrcValue),
- DL.getABITypeAlignment(EltVT.getTypeForEVT(F->getContext())),
- MachineMemOperand::MODereferenceable |
- MachineMemOperand::MOInvariant);
- if (P.getNode())
- P.getNode()->setIROrder(idx + 1);
- if (Ins[InsIdx].VT.getSizeInBits() > EltVT.getSizeInBits())
- P = DAG.getNode(ISD::ANY_EXTEND, dl, Ins[InsIdx].VT, P);
- InVals.push_back(P);
- ++InsIdx;
- } else if (NumElts == 2) {
- // V2 load
- // f32,f32 = load ...
- EVT VecVT = EVT::getVectorVT(F->getContext(), EltVT, 2);
- Value *SrcValue = Constant::getNullValue(PointerType::get(
- VecVT.getTypeForEVT(F->getContext()), ADDRESS_SPACE_PARAM));
- SDValue P = DAG.getLoad(
- VecVT, dl, Root, Arg, MachinePointerInfo(SrcValue),
- DL.getABITypeAlignment(VecVT.getTypeForEVT(F->getContext())),
- MachineMemOperand::MODereferenceable |
- MachineMemOperand::MOInvariant);
+ // That's the last element of this store op.
+ if (VectorInfo[parti] & PVF_LAST) {
+ unsigned NumElts = parti - VecIdx + 1;
+ EVT EltVT = VTs[parti];
+ // i1 is loaded/stored as i8.
+ EVT LoadVT = EltVT;
+ if (EltVT == MVT::i1)
+ LoadVT = MVT::i8;
+ else if (EltVT == MVT::v2f16)
+ // getLoad needs a vector type, but it can't handle
+ // vectors which contain v2f16 elements. So we must load
+ // using i32 here and then bitcast back.
+ LoadVT = MVT::i32;
+
+ EVT VecVT = EVT::getVectorVT(F->getContext(), LoadVT, NumElts);
+ SDValue VecAddr =
+ DAG.getNode(ISD::ADD, dl, PtrVT, Arg,
+ DAG.getConstant(Offsets[VecIdx], dl, PtrVT));
+ Value *srcValue = Constant::getNullValue(PointerType::get(
+ EltVT.getTypeForEVT(F->getContext()), ADDRESS_SPACE_PARAM));
+ SDValue P =
+ DAG.getLoad(VecVT, dl, Root, VecAddr,
+ MachinePointerInfo(srcValue), aggregateIsPacked,
+ MachineMemOperand::MODereferenceable |
+ MachineMemOperand::MOInvariant);
if (P.getNode())
P.getNode()->setIROrder(idx + 1);
-
- SDValue Elt0 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, EltVT, P,
- DAG.getIntPtrConstant(0, dl));
- SDValue Elt1 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, EltVT, P,
- DAG.getIntPtrConstant(1, dl));
-
- if (Ins[InsIdx].VT.getSizeInBits() > EltVT.getSizeInBits()) {
- Elt0 = DAG.getNode(ISD::ANY_EXTEND, dl, Ins[InsIdx].VT, Elt0);
- Elt1 = DAG.getNode(ISD::ANY_EXTEND, dl, Ins[InsIdx].VT, Elt1);
- }
-
- InVals.push_back(Elt0);
- InVals.push_back(Elt1);
- InsIdx += 2;
- } else {
- // V4 loads
- // We have at least 4 elements (<3 x Ty> expands to 4 elements) and
- // the vector will be expanded to a power of 2 elements, so we know we
- // can always round up to the next multiple of 4 when creating the
- // vector loads.
- // e.g. 4 elem => 1 ld.v4
- // 6 elem => 2 ld.v4
- // 8 elem => 2 ld.v4
- // 11 elem => 3 ld.v4
- unsigned VecSize = 4;
- if (EltVT.getSizeInBits() == 64) {
- VecSize = 2;
- }
- EVT VecVT = EVT::getVectorVT(F->getContext(), EltVT, VecSize);
- unsigned Ofst = 0;
- for (unsigned i = 0; i < NumElts; i += VecSize) {
- Value *SrcValue = Constant::getNullValue(
- PointerType::get(VecVT.getTypeForEVT(F->getContext()),
- ADDRESS_SPACE_PARAM));
- SDValue SrcAddr = DAG.getNode(ISD::ADD, dl, PtrVT, Arg,
- DAG.getConstant(Ofst, dl, PtrVT));
- SDValue P = DAG.getLoad(
- VecVT, dl, Root, SrcAddr, MachinePointerInfo(SrcValue),
- DL.getABITypeAlignment(VecVT.getTypeForEVT(F->getContext())),
- MachineMemOperand::MODereferenceable |
- MachineMemOperand::MOInvariant);
- if (P.getNode())
- P.getNode()->setIROrder(idx + 1);
-
- for (unsigned j = 0; j < VecSize; ++j) {
- if (i + j >= NumElts)
- break;
- SDValue Elt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, EltVT, P,
- DAG.getIntPtrConstant(j, dl));
- if (Ins[InsIdx].VT.getSizeInBits() > EltVT.getSizeInBits())
- Elt = DAG.getNode(ISD::ANY_EXTEND, dl, Ins[InsIdx].VT, Elt);
- InVals.push_back(Elt);
+ for (unsigned j = 0; j < NumElts; ++j) {
+ SDValue Elt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, LoadVT, P,
+ DAG.getIntPtrConstant(j, dl));
+ // We've loaded i1 as an i8 and now must truncate it back to i1
+ if (EltVT == MVT::i1)
+ Elt = DAG.getNode(ISD::TRUNCATE, dl, MVT::i1, Elt);
+ // v2f16 was loaded as an i32. Now we must bitcast it back.
+ else if (EltVT == MVT::v2f16)
+ Elt = DAG.getNode(ISD::BITCAST, dl, MVT::v2f16, Elt);
+              // Extend the element if necessary (e.g. an i8 is loaded
+              // into an i16 register).
+ if (Ins[InsIdx].VT.isInteger() &&
+ Ins[InsIdx].VT.getSizeInBits() > LoadVT.getSizeInBits()) {
+ unsigned Extend = Ins[InsIdx].Flags.isSExt() ? ISD::SIGN_EXTEND
+ : ISD::ZERO_EXTEND;
+ Elt = DAG.getNode(Extend, dl, Ins[InsIdx].VT, Elt);
}
- Ofst += DL.getTypeAllocSize(VecVT.getTypeForEVT(F->getContext()));
+ InVals.push_back(Elt);
}
- InsIdx += NumElts;
- }
- if (NumElts > 0)
- --InsIdx;
- continue;
- }
- // A plain scalar.
- EVT ObjectVT = getValueType(DL, Ty);
- // If ABI, load from the param symbol
- SDValue Arg = getParamSymbol(DAG, idx, PtrVT);
- Value *srcValue = Constant::getNullValue(PointerType::get(
- ObjectVT.getTypeForEVT(F->getContext()), ADDRESS_SPACE_PARAM));
- SDValue p;
- if (ObjectVT.getSizeInBits() < Ins[InsIdx].VT.getSizeInBits()) {
- ISD::LoadExtType ExtOp = Ins[InsIdx].Flags.isSExt() ?
- ISD::SEXTLOAD : ISD::ZEXTLOAD;
- p = DAG.getExtLoad(
- ExtOp, dl, Ins[InsIdx].VT, Root, Arg, MachinePointerInfo(srcValue),
- ObjectVT,
- DL.getABITypeAlignment(ObjectVT.getTypeForEVT(F->getContext())));
- } else {
- p = DAG.getLoad(
- Ins[InsIdx].VT, dl, Root, Arg, MachinePointerInfo(srcValue),
- DL.getABITypeAlignment(ObjectVT.getTypeForEVT(F->getContext())));
+ // Reset vector tracking state.
+ VecIdx = -1;
+ }
+ ++InsIdx;
}
- if (p.getNode())
- p.getNode()->setIROrder(idx + 1);
- InVals.push_back(p);
+ if (VTs.size() > 0)
+ --InsIdx;
continue;
}
@@ -2412,164 +2510,77 @@ NVPTXTargetLowering::LowerReturn(SDValue Chain, CallingConv::ID CallConv,
const SmallVectorImpl<SDValue> &OutVals,
const SDLoc &dl, SelectionDAG &DAG) const {
MachineFunction &MF = DAG.getMachineFunction();
- const Function *F = MF.getFunction();
- Type *RetTy = F->getReturnType();
- const DataLayout &TD = DAG.getDataLayout();
+ Type *RetTy = MF.getFunction()->getReturnType();
bool isABI = (STI.getSmVersion() >= 20);
assert(isABI && "Non-ABI compilation is not supported");
if (!isABI)
return Chain;
- if (VectorType *VTy = dyn_cast<VectorType>(RetTy)) {
- // If we have a vector type, the OutVals array will be the scalarized
- // components and we have combine them into 1 or more vector stores.
- unsigned NumElts = VTy->getNumElements();
- assert(NumElts == Outs.size() && "Bad scalarization of return value");
+  const DataLayout &DL = DAG.getDataLayout();
+ SmallVector<EVT, 16> VTs;
+ SmallVector<uint64_t, 16> Offsets;
+ ComputePTXValueVTs(*this, DL, RetTy, VTs, &Offsets);
+ assert(VTs.size() == OutVals.size() && "Bad return value decomposition");
+
+ auto VectorInfo = VectorizePTXValueVTs(
+ VTs, Offsets, RetTy->isSized() ? DL.getABITypeAlignment(RetTy) : 1);
+
+ // PTX Interoperability Guide 3.3(A): [Integer] Values shorter than
+ // 32-bits are sign extended or zero extended, depending on whether
+ // they are signed or unsigned types.
+ bool ExtendIntegerRetVal =
+ RetTy->isIntegerTy() && DL.getTypeAllocSizeInBits(RetTy) < 32;
+
+ SmallVector<SDValue, 6> StoreOperands;
+ for (unsigned i = 0, e = VTs.size(); i != e; ++i) {
+ // New load/store. Record chain and offset operands.
+ if (VectorInfo[i] & PVF_FIRST) {
+ assert(StoreOperands.empty() && "Orphaned operand list.");
+ StoreOperands.push_back(Chain);
+ StoreOperands.push_back(DAG.getConstant(Offsets[i], dl, MVT::i32));
+ }
- // const_cast can be removed in later LLVM versions
- EVT EltVT = getValueType(TD, RetTy).getVectorElementType();
- bool NeedExtend = false;
- if (EltVT.getSizeInBits() < 16)
- NeedExtend = true;
-
- // V1 store
- if (NumElts == 1) {
- SDValue StoreVal = OutVals[0];
- // We only have one element, so just directly store it
- if (NeedExtend)
- StoreVal = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i16, StoreVal);
- SDValue Ops[] = { Chain, DAG.getConstant(0, dl, MVT::i32), StoreVal };
- Chain = DAG.getMemIntrinsicNode(NVPTXISD::StoreRetval, dl,
- DAG.getVTList(MVT::Other), Ops,
- EltVT, MachinePointerInfo());
- } else if (NumElts == 2) {
- // V2 store
- SDValue StoreVal0 = OutVals[0];
- SDValue StoreVal1 = OutVals[1];
-
- if (NeedExtend) {
- StoreVal0 = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i16, StoreVal0);
- StoreVal1 = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i16, StoreVal1);
- }
+ SDValue RetVal = OutVals[i];
+ if (ExtendIntegerRetVal) {
+ RetVal = DAG.getNode(Outs[i].Flags.isSExt() ? ISD::SIGN_EXTEND
+ : ISD::ZERO_EXTEND,
+ dl, MVT::i32, RetVal);
+ } else if (RetVal.getValueSizeInBits() < 16) {
+      // Use 16-bit registers for small loads/stores, as that is the
+      // smallest general-purpose register size supported by NVPTX.
+ RetVal = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i16, RetVal);
+ }
- SDValue Ops[] = { Chain, DAG.getConstant(0, dl, MVT::i32), StoreVal0,
- StoreVal1 };
- Chain = DAG.getMemIntrinsicNode(NVPTXISD::StoreRetvalV2, dl,
- DAG.getVTList(MVT::Other), Ops,
- EltVT, MachinePointerInfo());
- } else {
- // V4 stores
- // We have at least 4 elements (<3 x Ty> expands to 4 elements) and the
- // vector will be expanded to a power of 2 elements, so we know we can
- // always round up to the next multiple of 4 when creating the vector
- // stores.
- // e.g. 4 elem => 1 st.v4
- // 6 elem => 2 st.v4
- // 8 elem => 2 st.v4
- // 11 elem => 3 st.v4
-
- unsigned VecSize = 4;
- if (OutVals[0].getValueSizeInBits() == 64)
- VecSize = 2;
-
- unsigned Offset = 0;
-
- EVT VecVT =
- EVT::getVectorVT(F->getContext(), EltVT, VecSize);
- unsigned PerStoreOffset =
- TD.getTypeAllocSize(VecVT.getTypeForEVT(F->getContext()));
-
- for (unsigned i = 0; i < NumElts; i += VecSize) {
- // Get values
- SDValue StoreVal;
- SmallVector<SDValue, 8> Ops;
- Ops.push_back(Chain);
- Ops.push_back(DAG.getConstant(Offset, dl, MVT::i32));
- unsigned Opc = NVPTXISD::StoreRetvalV2;
- EVT ExtendedVT = (NeedExtend) ? MVT::i16 : OutVals[0].getValueType();
-
- StoreVal = OutVals[i];
- if (NeedExtend)
- StoreVal = DAG.getNode(ISD::ZERO_EXTEND, dl, ExtendedVT, StoreVal);
- Ops.push_back(StoreVal);
-
- if (i + 1 < NumElts) {
- StoreVal = OutVals[i + 1];
- if (NeedExtend)
- StoreVal = DAG.getNode(ISD::ZERO_EXTEND, dl, ExtendedVT, StoreVal);
- } else {
- StoreVal = DAG.getUNDEF(ExtendedVT);
- }
- Ops.push_back(StoreVal);
-
- if (VecSize == 4) {
- Opc = NVPTXISD::StoreRetvalV4;
- if (i + 2 < NumElts) {
- StoreVal = OutVals[i + 2];
- if (NeedExtend)
- StoreVal =
- DAG.getNode(ISD::ZERO_EXTEND, dl, ExtendedVT, StoreVal);
- } else {
- StoreVal = DAG.getUNDEF(ExtendedVT);
- }
- Ops.push_back(StoreVal);
-
- if (i + 3 < NumElts) {
- StoreVal = OutVals[i + 3];
- if (NeedExtend)
- StoreVal =
- DAG.getNode(ISD::ZERO_EXTEND, dl, ExtendedVT, StoreVal);
- } else {
- StoreVal = DAG.getUNDEF(ExtendedVT);
- }
- Ops.push_back(StoreVal);
- }
+ // Record the value to return.
+ StoreOperands.push_back(RetVal);
- // Chain = DAG.getNode(Opc, dl, MVT::Other, &Ops[0], Ops.size());
- Chain =
- DAG.getMemIntrinsicNode(Opc, dl, DAG.getVTList(MVT::Other), Ops,
- EltVT, MachinePointerInfo());
- Offset += PerStoreOffset;
- }
- }
- } else {
- SmallVector<EVT, 16> ValVTs;
- SmallVector<uint64_t, 16> Offsets;
- ComputePTXValueVTs(*this, DAG.getDataLayout(), RetTy, ValVTs, &Offsets, 0);
- assert(ValVTs.size() == OutVals.size() && "Bad return value decomposition");
-
- for (unsigned i = 0, e = Outs.size(); i != e; ++i) {
- SDValue theVal = OutVals[i];
- EVT TheValType = theVal.getValueType();
- unsigned numElems = 1;
- if (TheValType.isVector())
- numElems = TheValType.getVectorNumElements();
- for (unsigned j = 0, je = numElems; j != je; ++j) {
- SDValue TmpVal = theVal;
- if (TheValType.isVector())
- TmpVal = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl,
- TheValType.getVectorElementType(), TmpVal,
- DAG.getIntPtrConstant(j, dl));
- EVT TheStoreType = ValVTs[i];
- if (RetTy->isIntegerTy() && TD.getTypeAllocSizeInBits(RetTy) < 32) {
- // The following zero-extension is for integer types only, and
- // specifically not for aggregates.
- TmpVal = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32, TmpVal);
- TheStoreType = MVT::i32;
- }
- else if (TmpVal.getValueSizeInBits() < 16)
- TmpVal = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i16, TmpVal);
-
- SDValue Ops[] = {
- Chain,
- DAG.getConstant(Offsets[i], dl, MVT::i32),
- TmpVal };
- Chain = DAG.getMemIntrinsicNode(NVPTXISD::StoreRetval, dl,
- DAG.getVTList(MVT::Other), Ops,
- TheStoreType,
- MachinePointerInfo());
+ // That's the last element of this store op.
+ if (VectorInfo[i] & PVF_LAST) {
+ NVPTXISD::NodeType Op;
+ unsigned NumElts = StoreOperands.size() - 2;
+ switch (NumElts) {
+ case 1:
+ Op = NVPTXISD::StoreRetval;
+ break;
+ case 2:
+ Op = NVPTXISD::StoreRetvalV2;
+ break;
+ case 4:
+ Op = NVPTXISD::StoreRetvalV4;
+ break;
+ default:
+ llvm_unreachable("Invalid vector info.");
}
+
+ // Adjust type of load/store op if we've extended the scalar
+ // return value.
+ EVT TheStoreType = ExtendIntegerRetVal ? MVT::i32 : VTs[i];
+ Chain = DAG.getMemIntrinsicNode(Op, dl, DAG.getVTList(MVT::Other),
+ StoreOperands, TheStoreType,
+ MachinePointerInfo(), 1);
+ // Clean up vector state.
+ StoreOperands.clear();
}
}
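
The store-emission loop above is driven entirely by the PVF_FIRST/PVF_LAST flags that VectorizePTXValueVTs attaches to each element. As a rough model of that contract — not the LLVM implementation; the plain arrays, the fixed chunk size, and the printf output are invented for illustration — the C++ sketch below tags elements the same way and shows how a consumer loop collects operands and emits one store per chunk:

    #include <cassert>
    #include <cstdio>
    #include <vector>

    // Simplified element flags, mirroring PVF_FIRST/PVF_LAST in the patch.
    enum ElemFlags { PVF_INNER = 0, PVF_FIRST = 1, PVF_LAST = 2 };

    // Tag N same-type elements into chunks of ChunkSize (1, 2, or 4).
    static std::vector<unsigned> tagChunks(unsigned N, unsigned ChunkSize) {
      std::vector<unsigned> Flags(N, PVF_INNER);
      for (unsigned i = 0; i < N; ++i) {
        if (i % ChunkSize == 0)
          Flags[i] |= PVF_FIRST;
        if (i % ChunkSize == ChunkSize - 1 || i == N - 1)
          Flags[i] |= PVF_LAST;
      }
      return Flags;
    }

    int main() {
      // Consumer loop shaped like the StoreRetval loop in the diff: gather
      // operands, then "emit" one store per chunk when PVF_LAST is seen.
      std::vector<unsigned> Flags = tagChunks(8, 4);
      std::vector<int> Pending;
      for (unsigned i = 0; i < Flags.size(); ++i) {
        if (Flags[i] & PVF_FIRST)
          assert(Pending.empty() && "Orphaned operand list.");
        Pending.push_back((int)i); // stand-in for OutVals[i]
        if (Flags[i] & PVF_LAST) {
          std::printf("st.v%zu of elements", Pending.size());
          for (int E : Pending)
            std::printf(" %d", E);
          std::printf("\n");
          Pending.clear();
        }
      }
    }
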
@@ -3863,27 +3874,35 @@ NVPTXTargetLowering::getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI,
bool NVPTXTargetLowering::allowFMA(MachineFunction &MF,
CodeGenOpt::Level OptLevel) const {
- const Function *F = MF.getFunction();
- const TargetOptions &TO = MF.getTarget().Options;
-
// Always honor command-line argument
- if (FMAContractLevelOpt.getNumOccurrences() > 0) {
+ if (FMAContractLevelOpt.getNumOccurrences() > 0)
return FMAContractLevelOpt > 0;
- } else if (OptLevel == 0) {
- // Do not contract if we're not optimizing the code
+
+ // Do not contract if we're not optimizing the code.
+ if (OptLevel == 0)
return false;
- } else if (TO.AllowFPOpFusion == FPOpFusion::Fast || TO.UnsafeFPMath) {
- // Honor TargetOptions flags that explicitly say fusion is okay
+
+ // Honor TargetOptions flags that explicitly say fusion is okay.
+ if (MF.getTarget().Options.AllowFPOpFusion == FPOpFusion::Fast)
return true;
- } else if (F->hasFnAttribute("unsafe-fp-math")) {
- // Check for unsafe-fp-math=true coming from Clang
+
+ return allowUnsafeFPMath(MF);
+}
+
+bool NVPTXTargetLowering::allowUnsafeFPMath(MachineFunction &MF) const {
+ // Honor TargetOptions flags that explicitly say unsafe math is okay.
+ if (MF.getTarget().Options.UnsafeFPMath)
+ return true;
+
+ // Allow unsafe math if unsafe-fp-math attribute explicitly says so.
+ const Function *F = MF.getFunction();
+ if (F->hasFnAttribute("unsafe-fp-math")) {
Attribute Attr = F->getFnAttribute("unsafe-fp-math");
StringRef Val = Attr.getValueAsString();
if (Val == "true")
return true;
}
- // We did not have a clear indication that fusion is allowed, so assume not
return false;
}
@@ -4088,67 +4107,6 @@ static SDValue PerformANDCombine(SDNode *N,
return SDValue();
}
-static SDValue PerformSELECTCombine(SDNode *N,
- TargetLowering::DAGCombinerInfo &DCI) {
- // Currently this detects patterns for integer min and max and
- // lowers them to PTX-specific intrinsics that enable hardware
- // support.
-
- const SDValue Cond = N->getOperand(0);
- if (Cond.getOpcode() != ISD::SETCC) return SDValue();
-
- const SDValue LHS = Cond.getOperand(0);
- const SDValue RHS = Cond.getOperand(1);
- const SDValue True = N->getOperand(1);
- const SDValue False = N->getOperand(2);
- if (!(LHS == True && RHS == False) && !(LHS == False && RHS == True))
- return SDValue();
-
- const EVT VT = N->getValueType(0);
- if (VT != MVT::i32 && VT != MVT::i64) return SDValue();
-
- const ISD::CondCode CC = cast<CondCodeSDNode>(Cond.getOperand(2))->get();
- SDValue Larger; // The larger of LHS and RHS when condition is true.
- switch (CC) {
- case ISD::SETULT:
- case ISD::SETULE:
- case ISD::SETLT:
- case ISD::SETLE:
- Larger = RHS;
- break;
-
- case ISD::SETGT:
- case ISD::SETGE:
- case ISD::SETUGT:
- case ISD::SETUGE:
- Larger = LHS;
- break;
-
- default:
- return SDValue();
- }
- const bool IsMax = (Larger == True);
- const bool IsSigned = ISD::isSignedIntSetCC(CC);
-
- unsigned IntrinsicId;
- if (VT == MVT::i32) {
- if (IsSigned)
- IntrinsicId = IsMax ? Intrinsic::nvvm_max_i : Intrinsic::nvvm_min_i;
- else
- IntrinsicId = IsMax ? Intrinsic::nvvm_max_ui : Intrinsic::nvvm_min_ui;
- } else {
- assert(VT == MVT::i64);
- if (IsSigned)
- IntrinsicId = IsMax ? Intrinsic::nvvm_max_ll : Intrinsic::nvvm_min_ll;
- else
- IntrinsicId = IsMax ? Intrinsic::nvvm_max_ull : Intrinsic::nvvm_min_ull;
- }
-
- SDLoc DL(N);
- return DCI.DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, VT,
- DCI.DAG.getConstant(IntrinsicId, DL, VT), LHS, RHS);
-}
-
static SDValue PerformREMCombine(SDNode *N,
TargetLowering::DAGCombinerInfo &DCI,
CodeGenOpt::Level OptLevel) {
@@ -4344,6 +4302,27 @@ static SDValue PerformSHLCombine(SDNode *N,
return SDValue();
}
+static SDValue PerformSETCCCombine(SDNode *N,
+ TargetLowering::DAGCombinerInfo &DCI) {
+ EVT CCType = N->getValueType(0);
+ SDValue A = N->getOperand(0);
+ SDValue B = N->getOperand(1);
+
+ if (CCType != MVT::v2i1 || A.getValueType() != MVT::v2f16)
+ return SDValue();
+
+ SDLoc DL(N);
+ // setp.f16x2 returns two scalar predicates, which we need to
+ // convert back to v2i1. The returned result will be scalarized by
+ // the legalizer, but the comparison will remain a single vector
+ // instruction.
+ SDValue CCNode = DCI.DAG.getNode(NVPTXISD::SETP_F16X2, DL,
+ DCI.DAG.getVTList(MVT::i1, MVT::i1),
+ {A, B, N->getOperand(2)});
+ return DCI.DAG.getNode(ISD::BUILD_VECTOR, DL, CCType, CCNode.getValue(0),
+ CCNode.getValue(1));
+}
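
As a model of the lanewise semantics the combine depends on: a single setp.f16x2 compares both lanes at once and produces two scalar predicates, which the BUILD_VECTOR then re-packs into the v2i1 the DAG expects. A plain C++ sketch of that behavior — float stands in for f16 (standard C++ has no half type), and the function name is invented:

    #include <array>
    #include <cstdio>

    // Emulates setp.lt.f16x2: one "instruction" comparing both lanes and
    // yielding two scalar predicates, packed back together here just as
    // the BUILD_VECTOR in the combine does.
    static std::array<bool, 2> setp_lt_f16x2(std::array<float, 2> A,
                                             std::array<float, 2> B) {
      return {A[0] < B[0], A[1] < B[1]};
    }

    int main() {
      std::array<float, 2> A = {1.0f, 4.0f};
      std::array<float, 2> B = {2.0f, 3.0f};
      std::array<bool, 2> P = setp_lt_f16x2(A, B); // the v2i1 result
      std::printf("p0=%d p1=%d\n", P[0], P[1]);    // prints p0=1 p1=0
    }
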
+
SDValue NVPTXTargetLowering::PerformDAGCombine(SDNode *N,
DAGCombinerInfo &DCI) const {
CodeGenOpt::Level OptLevel = getTargetMachine().getOptLevel();
@@ -4358,11 +4337,11 @@ SDValue NVPTXTargetLowering::PerformDAGCombine(SDNode *N,
return PerformSHLCombine(N, DCI, OptLevel);
case ISD::AND:
return PerformANDCombine(N, DCI);
- case ISD::SELECT:
- return PerformSELECTCombine(N, DCI);
case ISD::UREM:
case ISD::SREM:
return PerformREMCombine(N, DCI, OptLevel);
+ case ISD::SETCC:
+ return PerformSETCCCombine(N, DCI);
}
return SDValue();
}
@@ -4386,12 +4365,15 @@ static void ReplaceLoadVector(SDNode *N, SelectionDAG &DAG,
case MVT::v2i16:
case MVT::v2i32:
case MVT::v2i64:
+ case MVT::v2f16:
case MVT::v2f32:
case MVT::v2f64:
case MVT::v4i8:
case MVT::v4i16:
case MVT::v4i32:
+ case MVT::v4f16:
case MVT::v4f32:
+ case MVT::v8f16: // <4 x f16x2>
// This is a "native" vector type
break;
}
@@ -4425,6 +4407,7 @@ static void ReplaceLoadVector(SDNode *N, SelectionDAG &DAG,
unsigned Opcode = 0;
SDVTList LdResVTs;
+ bool LoadF16x2 = false;
switch (NumElts) {
default:
@@ -4439,6 +4422,18 @@ static void ReplaceLoadVector(SDNode *N, SelectionDAG &DAG,
LdResVTs = DAG.getVTList(ListVTs);
break;
}
+ case 8: {
+ // v8f16 is a special case. PTX doesn't have an ld.v8.f16
+ // instruction. Instead, we split the vector into v2f16 chunks and
+ // load them with ld.v4.b32.
+ assert(EltVT == MVT::f16 && "Unsupported v8 vector type.");
+ LoadF16x2 = true;
+ Opcode = NVPTXISD::LoadV4;
+ EVT ListVTs[] = {MVT::v2f16, MVT::v2f16, MVT::v2f16, MVT::v2f16,
+ MVT::Other};
+ LdResVTs = DAG.getVTList(ListVTs);
+ break;
+ }
}
// Copy regular operands
@@ -4452,13 +4447,26 @@ static void ReplaceLoadVector(SDNode *N, SelectionDAG &DAG,
LD->getMemoryVT(),
LD->getMemOperand());
- SmallVector<SDValue, 4> ScalarRes;
-
- for (unsigned i = 0; i < NumElts; ++i) {
- SDValue Res = NewLD.getValue(i);
- if (NeedTrunc)
- Res = DAG.getNode(ISD::TRUNCATE, DL, ResVT.getVectorElementType(), Res);
- ScalarRes.push_back(Res);
+ SmallVector<SDValue, 8> ScalarRes;
+ if (LoadF16x2) {
+ // Split v2f16 subvectors back into individual elements.
+ NumElts /= 2;
+ for (unsigned i = 0; i < NumElts; ++i) {
+ SDValue SubVector = NewLD.getValue(i);
+ SDValue E0 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, EltVT, SubVector,
+ DAG.getIntPtrConstant(0, DL));
+ SDValue E1 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, EltVT, SubVector,
+ DAG.getIntPtrConstant(1, DL));
+ ScalarRes.push_back(E0);
+ ScalarRes.push_back(E1);
+ }
+ } else {
+ for (unsigned i = 0; i < NumElts; ++i) {
+ SDValue Res = NewLD.getValue(i);
+ if (NeedTrunc)
+ Res = DAG.getNode(ISD::TRUNCATE, DL, ResVT.getVectorElementType(), Res);
+ ScalarRes.push_back(Res);
+ }
}
SDValue LoadChain = NewLD.getValue(NumElts);
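
To make the v8f16 path concrete: eight f16 values are fetched as four 32-bit words (one ld.v4.b32), and each word is then split into its two 16-bit lanes, mirroring the EXTRACT_VECTOR_ELT pairs above. A sketch, assuming a little-endian host and using raw uint16_t bit patterns as stand-ins for f16 values:

    #include <cstdint>
    #include <cstdio>
    #include <cstring>

    int main() {
      // Eight f16 values, stored as raw 16-bit patterns (1.0, 2.0, ...).
      uint16_t Halves[8] = {0x3C00, 0x4000, 0x4200, 0x4400,
                            0x4500, 0x4600, 0x4700, 0x4800};

      // One ld.v4.b32: view the same memory as four 32-bit words.
      uint32_t Words[4];
      std::memcpy(Words, Halves, sizeof(Words));

      // Split each v2f16 word back into its two scalar lanes, as the
      // LoadF16x2 path does with EXTRACT_VECTOR_ELT.
      for (int i = 0; i < 4; ++i) {
        uint16_t E0 = (uint16_t)(Words[i] & 0xFFFF); // lane 0: low half
        uint16_t E1 = (uint16_t)(Words[i] >> 16);    // lane 1: high half
        std::printf("word %d -> 0x%04X 0x%04X\n", i, E0, E1);
      }
    }
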
diff --git a/lib/Target/NVPTX/NVPTXISelLowering.h b/lib/Target/NVPTX/NVPTXISelLowering.h
index e433aed7781b..9d7b70d80c11 100644
--- a/lib/Target/NVPTX/NVPTXISelLowering.h
+++ b/lib/Target/NVPTX/NVPTXISelLowering.h
@@ -56,6 +56,7 @@ enum NodeType : unsigned {
MUL_WIDE_SIGNED,
MUL_WIDE_UNSIGNED,
IMAD,
+ SETP_F16X2,
Dummy,
LoadV2 = ISD::FIRST_TARGET_MEMORY_OPCODE,
@@ -73,7 +74,7 @@ enum NodeType : unsigned {
StoreParamV2,
StoreParamV4,
StoreParamS32, // to sext and store a <32bit value, not used currently
- StoreParamU32, // to zext and store a <32bit value, not used currently
+ StoreParamU32, // to zext and store a <32bit value, not used currently
StoreRetval,
StoreRetvalV2,
StoreRetvalV4,
@@ -510,17 +511,48 @@ public:
TargetLoweringBase::LegalizeTypeAction
getPreferredVectorAction(EVT VT) const override;
+ // Get the degree of precision we want from 32-bit floating point division
+ // operations.
+ //
+ // 0 - Use ptx div.approx
+ // 1 - Use ptx div.full (approximate, but less so than div.approx)
+ // 2 - Use IEEE-compliant div instructions, if available.
+ int getDivF32Level() const;
+
+ // Get whether we should use a precise or approximate 32-bit floating point
+ // sqrt instruction.
+ bool usePrecSqrtF32() const;
+
+ // Get whether we should use instructions that flush floating-point denormals
+ // to sign-preserving zero.
+ bool useF32FTZ(const MachineFunction &MF) const;
+
+ SDValue getSqrtEstimate(SDValue Operand, SelectionDAG &DAG, int Enabled,
+ int &ExtraSteps, bool &UseOneConst,
+ bool Reciprocal) const override;
+
+ unsigned combineRepeatedFPDivisors() const override { return 2; }
+
bool allowFMA(MachineFunction &MF, CodeGenOpt::Level OptLevel) const;
+ bool allowUnsafeFPMath(MachineFunction &MF) const;
bool isFMAFasterThanFMulAndFAdd(EVT) const override { return true; }
bool enableAggressiveFMAFusion(EVT VT) const override { return true; }
+ // The default is to transform llvm.ctlz(x, false) (where false indicates that
+ // x == 0 is not undefined behavior) into a branch that checks whether x is 0
+ // and avoids calling ctlz in that case. We have a dedicated ctlz
+ // instruction, so we say that ctlz is cheap to speculate.
+ bool isCheapToSpeculateCtlz() const override { return true; }
+
private:
const NVPTXSubtarget &STI; // cache the subtarget here
SDValue getParamSymbol(SelectionDAG &DAG, int idx, EVT) const;
+ SDValue LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerCONCAT_VECTORS(SDValue Op, SelectionDAG &DAG) const;
+ SDValue LowerEXTRACT_VECTOR_ELT(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerLOAD(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerLOADi1(SDValue Op, SelectionDAG &DAG) const;
diff --git a/lib/Target/NVPTX/NVPTXImageOptimizer.cpp b/lib/Target/NVPTX/NVPTXImageOptimizer.cpp
index 8d00bbb5e9c2..f12ed81b6d9f 100644
--- a/lib/Target/NVPTX/NVPTXImageOptimizer.cpp
+++ b/lib/Target/NVPTX/NVPTXImageOptimizer.cpp
@@ -96,9 +96,7 @@ bool NVPTXImageOptimizer::replaceIsTypePSampler(Instruction &I) {
// This is an OpenCL sampler, so it must be a samplerref
replaceWith(&I, ConstantInt::getTrue(I.getContext()));
return true;
- } else if (isImageWriteOnly(*TexHandle) ||
- isImageReadWrite(*TexHandle) ||
- isImageReadOnly(*TexHandle)) {
+ } else if (isImage(*TexHandle)) {
// This is an OpenCL image, so it cannot be a samplerref
replaceWith(&I, ConstantInt::getFalse(I.getContext()));
return true;
diff --git a/lib/Target/NVPTX/NVPTXInstrInfo.cpp b/lib/Target/NVPTX/NVPTXInstrInfo.cpp
index 7f89742a3215..3026f0be242d 100644
--- a/lib/Target/NVPTX/NVPTXInstrInfo.cpp
+++ b/lib/Target/NVPTX/NVPTXInstrInfo.cpp
@@ -52,6 +52,11 @@ void NVPTXInstrInfo::copyPhysReg(MachineBasicBlock &MBB,
} else if (DestRC == &NVPTX::Int64RegsRegClass) {
Op = (SrcRC == &NVPTX::Int64RegsRegClass ? NVPTX::IMOV64rr
: NVPTX::BITCONVERT_64_F2I);
+ } else if (DestRC == &NVPTX::Float16RegsRegClass) {
+ Op = (SrcRC == &NVPTX::Float16RegsRegClass ? NVPTX::FMOV16rr
+ : NVPTX::BITCONVERT_16_I2F);
+ } else if (DestRC == &NVPTX::Float16x2RegsRegClass) {
+ Op = NVPTX::IMOV32rr;
} else if (DestRC == &NVPTX::Float32RegsRegClass) {
Op = (SrcRC == &NVPTX::Float32RegsRegClass ? NVPTX::FMOV32rr
: NVPTX::BITCONVERT_32_I2F);
diff --git a/lib/Target/NVPTX/NVPTXInstrInfo.td b/lib/Target/NVPTX/NVPTXInstrInfo.td
index 0fbb0448e4c4..2b847414b8a8 100644
--- a/lib/Target/NVPTX/NVPTXInstrInfo.td
+++ b/lib/Target/NVPTX/NVPTXInstrInfo.td
@@ -18,6 +18,10 @@ let hasSideEffects = 0 in {
def NOP : NVPTXInst<(outs), (ins), "", []>;
}
+let OperandType = "OPERAND_IMMEDIATE" in {
+ def f16imm : Operand<f16>;
+}
+
// List of vector specific properties
def isVecLD : VecInstTypeEnum<1>;
def isVecST : VecInstTypeEnum<2>;
@@ -98,6 +102,9 @@ def CmpNAN_FTZ : PatLeaf<(i32 0x111)>;
def CmpMode : Operand<i32> {
let PrintMethod = "printCmpMode";
}
+def VecElement : Operand<i32> {
+ let PrintMethod = "printVecElement";
+}
//===----------------------------------------------------------------------===//
// NVPTX Instruction Predicate Definitions
@@ -134,6 +141,7 @@ def doMulWide : Predicate<"doMulWide">;
def allowFMA : Predicate<"allowFMA()">;
def noFMA : Predicate<"!allowFMA()">;
+def allowUnsafeFPMath : Predicate<"allowUnsafeFPMath()">;
def do_DIVF32_APPROX : Predicate<"getDivF32Level()==0">;
def do_DIVF32_FULL : Predicate<"getDivF32Level()==1">;
@@ -148,6 +156,7 @@ def true : Predicate<"true">;
def hasPTX31 : Predicate<"Subtarget->getPTXVersion() >= 31">;
+def useFP16Math: Predicate<"Subtarget->allowFP16Math()">;
//===----------------------------------------------------------------------===//
// Some Common Instruction Class Templates
@@ -239,11 +248,11 @@ multiclass F3<string OpcStr, SDNode OpNode> {
[(set Float32Regs:$dst, (OpNode Float32Regs:$a, fpimm:$b))]>;
}
-// Template for instructions which take three fp64 or fp32 args. The
+// Template for instructions which take three FP args. The
// instructions are named "<OpcStr>.f<Width>" (e.g. "add.f64").
//
// Also defines ftz (flush subnormal inputs and results to sign-preserving
-// zero) variants for fp32 functions.
+// zero) variants for fp32/fp16 functions.
//
// This multiclass should be used for nodes that can be folded to make fma ops.
// In this case, we use the ".rn" variant when FMA is disabled, as this behaves
@@ -286,6 +295,32 @@ multiclass F3_fma_component<string OpcStr, SDNode OpNode> {
[(set Float32Regs:$dst, (OpNode Float32Regs:$a, fpimm:$b))]>,
Requires<[allowFMA]>;
+ def f16rr_ftz :
+ NVPTXInst<(outs Float16Regs:$dst),
+ (ins Float16Regs:$a, Float16Regs:$b),
+ !strconcat(OpcStr, ".ftz.f16 \t$dst, $a, $b;"),
+ [(set Float16Regs:$dst, (OpNode Float16Regs:$a, Float16Regs:$b))]>,
+ Requires<[useFP16Math, allowFMA, doF32FTZ]>;
+ def f16rr :
+ NVPTXInst<(outs Float16Regs:$dst),
+ (ins Float16Regs:$a, Float16Regs:$b),
+ !strconcat(OpcStr, ".f16 \t$dst, $a, $b;"),
+ [(set Float16Regs:$dst, (OpNode Float16Regs:$a, Float16Regs:$b))]>,
+ Requires<[useFP16Math, allowFMA]>;
+
+ def f16x2rr_ftz :
+ NVPTXInst<(outs Float16x2Regs:$dst),
+ (ins Float16x2Regs:$a, Float16x2Regs:$b),
+ !strconcat(OpcStr, ".ftz.f16x2 \t$dst, $a, $b;"),
+ [(set Float16x2Regs:$dst, (OpNode Float16x2Regs:$a, Float16x2Regs:$b))]>,
+ Requires<[useFP16Math, allowFMA, doF32FTZ]>;
+ def f16x2rr :
+ NVPTXInst<(outs Float16x2Regs:$dst),
+ (ins Float16x2Regs:$a, Float16x2Regs:$b),
+ !strconcat(OpcStr, ".f16x2 \t$dst, $a, $b;"),
+ [(set Float16x2Regs:$dst, (OpNode Float16x2Regs:$a, Float16x2Regs:$b))]>,
+ Requires<[useFP16Math, allowFMA]>;
+
// These have strange names so we don't perturb existing mir tests.
def _rnf64rr :
NVPTXInst<(outs Float64Regs:$dst),
@@ -323,6 +358,30 @@ multiclass F3_fma_component<string OpcStr, SDNode OpNode> {
!strconcat(OpcStr, ".rn.f32 \t$dst, $a, $b;"),
[(set Float32Regs:$dst, (OpNode Float32Regs:$a, fpimm:$b))]>,
Requires<[noFMA]>;
+ def _rnf16rr_ftz :
+ NVPTXInst<(outs Float16Regs:$dst),
+ (ins Float16Regs:$a, Float16Regs:$b),
+ !strconcat(OpcStr, ".rn.ftz.f16 \t$dst, $a, $b;"),
+ [(set Float16Regs:$dst, (OpNode Float16Regs:$a, Float16Regs:$b))]>,
+ Requires<[useFP16Math, noFMA, doF32FTZ]>;
+ def _rnf16rr :
+ NVPTXInst<(outs Float16Regs:$dst),
+ (ins Float16Regs:$a, Float16Regs:$b),
+ !strconcat(OpcStr, ".rn.f16 \t$dst, $a, $b;"),
+ [(set Float16Regs:$dst, (OpNode Float16Regs:$a, Float16Regs:$b))]>,
+ Requires<[useFP16Math, noFMA]>;
+ def _rnf16x2rr_ftz :
+ NVPTXInst<(outs Float16x2Regs:$dst),
+ (ins Float16x2Regs:$a, Float16x2Regs:$b),
+ !strconcat(OpcStr, ".rn.ftz.f16x2 \t$dst, $a, $b;"),
+ [(set Float16x2Regs:$dst, (OpNode Float16x2Regs:$a, Float16x2Regs:$b))]>,
+ Requires<[useFP16Math, noFMA, doF32FTZ]>;
+ def _rnf16x2rr :
+ NVPTXInst<(outs Float16x2Regs:$dst),
+ (ins Float16x2Regs:$a, Float16x2Regs:$b),
+ !strconcat(OpcStr, ".rn.f16x2 \t$dst, $a, $b;"),
+ [(set Float16x2Regs:$dst, (OpNode Float16x2Regs:$a, Float16x2Regs:$b))]>,
+ Requires<[useFP16Math, noFMA]>;
}
// Template for operations which take two f32 or f64 operands. Provides three
@@ -358,57 +417,57 @@ let hasSideEffects = 0 in {
NVPTXInst<(outs RC:$dst),
(ins Int16Regs:$src, CvtMode:$mode),
!strconcat("cvt${mode:base}${mode:ftz}${mode:sat}.",
- FromName, ".s8\t$dst, $src;"), []>;
+ FromName, ".s8 \t$dst, $src;"), []>;
def _u8 :
NVPTXInst<(outs RC:$dst),
(ins Int16Regs:$src, CvtMode:$mode),
!strconcat("cvt${mode:base}${mode:ftz}${mode:sat}.",
- FromName, ".u8\t$dst, $src;"), []>;
+ FromName, ".u8 \t$dst, $src;"), []>;
def _s16 :
NVPTXInst<(outs RC:$dst),
(ins Int16Regs:$src, CvtMode:$mode),
!strconcat("cvt${mode:base}${mode:ftz}${mode:sat}.",
- FromName, ".s16\t$dst, $src;"), []>;
+ FromName, ".s16 \t$dst, $src;"), []>;
def _u16 :
NVPTXInst<(outs RC:$dst),
(ins Int16Regs:$src, CvtMode:$mode),
!strconcat("cvt${mode:base}${mode:ftz}${mode:sat}.",
- FromName, ".u16\t$dst, $src;"), []>;
- def _f16 :
- NVPTXInst<(outs RC:$dst),
- (ins Int16Regs:$src, CvtMode:$mode),
- !strconcat("cvt${mode:base}${mode:ftz}${mode:sat}.",
- FromName, ".f16\t$dst, $src;"), []>;
+ FromName, ".u16 \t$dst, $src;"), []>;
def _s32 :
NVPTXInst<(outs RC:$dst),
(ins Int32Regs:$src, CvtMode:$mode),
!strconcat("cvt${mode:base}${mode:ftz}${mode:sat}.",
- FromName, ".s32\t$dst, $src;"), []>;
+ FromName, ".s32 \t$dst, $src;"), []>;
def _u32 :
NVPTXInst<(outs RC:$dst),
(ins Int32Regs:$src, CvtMode:$mode),
!strconcat("cvt${mode:base}${mode:ftz}${mode:sat}.",
- FromName, ".u32\t$dst, $src;"), []>;
+ FromName, ".u32 \t$dst, $src;"), []>;
def _s64 :
NVPTXInst<(outs RC:$dst),
(ins Int64Regs:$src, CvtMode:$mode),
!strconcat("cvt${mode:base}${mode:ftz}${mode:sat}.",
- FromName, ".s64\t$dst, $src;"), []>;
+ FromName, ".s64 \t$dst, $src;"), []>;
def _u64 :
NVPTXInst<(outs RC:$dst),
(ins Int64Regs:$src, CvtMode:$mode),
!strconcat("cvt${mode:base}${mode:ftz}${mode:sat}.",
- FromName, ".u64\t$dst, $src;"), []>;
+ FromName, ".u64 \t$dst, $src;"), []>;
+ def _f16 :
+ NVPTXInst<(outs RC:$dst),
+ (ins Float16Regs:$src, CvtMode:$mode),
+ !strconcat("cvt${mode:base}${mode:ftz}${mode:sat}.",
+ FromName, ".f16 \t$dst, $src;"), []>;
def _f32 :
NVPTXInst<(outs RC:$dst),
(ins Float32Regs:$src, CvtMode:$mode),
!strconcat("cvt${mode:base}${mode:ftz}${mode:sat}.",
- FromName, ".f32\t$dst, $src;"), []>;
+ FromName, ".f32 \t$dst, $src;"), []>;
def _f64 :
NVPTXInst<(outs RC:$dst),
(ins Float64Regs:$src, CvtMode:$mode),
!strconcat("cvt${mode:base}${mode:ftz}${mode:sat}.",
- FromName, ".f64\t$dst, $src;"), []>;
+ FromName, ".f64 \t$dst, $src;"), []>;
}
// Generate cvts from all types to all types.
@@ -416,11 +475,11 @@ let hasSideEffects = 0 in {
defm CVT_u8 : CVT_FROM_ALL<"u8", Int16Regs>;
defm CVT_s16 : CVT_FROM_ALL<"s16", Int16Regs>;
defm CVT_u16 : CVT_FROM_ALL<"u16", Int16Regs>;
- defm CVT_f16 : CVT_FROM_ALL<"f16", Int16Regs>;
defm CVT_s32 : CVT_FROM_ALL<"s32", Int32Regs>;
defm CVT_u32 : CVT_FROM_ALL<"u32", Int32Regs>;
defm CVT_s64 : CVT_FROM_ALL<"s64", Int64Regs>;
defm CVT_u64 : CVT_FROM_ALL<"u64", Int64Regs>;
+ defm CVT_f16 : CVT_FROM_ALL<"f16", Float16Regs>;
defm CVT_f32 : CVT_FROM_ALL<"f32", Float32Regs>;
defm CVT_f64 : CVT_FROM_ALL<"f64", Float64Regs>;
@@ -458,7 +517,7 @@ multiclass ADD_SUB_i1<SDNode OpNode> {
defm ADD_i1 : ADD_SUB_i1<add>;
defm SUB_i1 : ADD_SUB_i1<sub>;
-// int16, int32, and int64 signed addition. Since nvptx is 2's compliment, we
+// int16, int32, and int64 signed addition. Since nvptx is 2's complement, we
// also use these for unsigned arithmetic.
defm ADD : I3<"add.s", add>;
defm SUB : I3<"sub.s", sub>;
@@ -485,6 +544,24 @@ defm UDIV : I3<"div.u", udiv>;
defm SREM : I3<"rem.s", srem>;
defm UREM : I3<"rem.u", urem>;
+// Integer absolute value. NumBits should be the bit width of RC minus one.
+// This idiom implements the algorithm at
+// http://graphics.stanford.edu/~seander/bithacks.html#IntegerAbs.
+multiclass ABS<RegisterClass RC, int NumBits, string SizeName> {
+ def : NVPTXInst<(outs RC:$dst), (ins RC:$a),
+ !strconcat("abs", SizeName, " \t$dst, $a;"),
+ [(set RC:$dst, (xor (add (sra RC:$a, (i32 NumBits)), RC:$a),
+ (sra RC:$a, (i32 NumBits))))]>;
+}
+defm ABS_16 : ABS<Int16Regs, 15, ".s16">;
+defm ABS_32 : ABS<Int32Regs, 31, ".s32">;
+defm ABS_64 : ABS<Int64Regs, 63, ".s64">;
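
The trick behind the ABS pattern is easy to verify in plain C++: with m = x >> (width - 1), m is 0 for non-negative x and all ones for negative x, so (x + m) ^ m leaves non-negative values unchanged and negates negative ones, per the linked bithacks page. A quick sanity check (assumes arithmetic right shift of signed values, which mainstream compilers provide):

    #include <cassert>
    #include <cstdint>

    // abs(x) via the (x + m) ^ m idiom, where m = x >> (bit width - 1).
    // This mirrors the (xor (add (sra a, NumBits), a), (sra a, NumBits))
    // pattern in the ABS multiclass. Note that INT32_MIN would wrap (the
    // add overflows), just as abs does in two's-complement hardware.
    static int32_t absBithack(int32_t X) {
      int32_t M = X >> 31; // 0 if X >= 0, all ones if X < 0
      return (X + M) ^ M;
    }

    int main() {
      assert(absBithack(5) == 5);
      assert(absBithack(-5) == 5);
      assert(absBithack(0) == 0);
      assert(absBithack(-123456) == 123456);
    }
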
+
+// Integer min/max.
+defm SMAX : I3<"max.s", smax>;
+defm UMAX : I3<"max.u", umax>;
+defm SMIN : I3<"min.s", smin>;
+defm UMIN : I3<"min.u", umin>;
//
// Wide multiplication
@@ -748,6 +825,15 @@ def DoubleConst1 : PatLeaf<(fpimm), [{
N->getValueAPF().convertToDouble() == 1.0;
}]>;
+// Loads an FP16 constant into a register.
+//
+// ptxas does not have hex representation for fp16, so we can't use
+// fp16 immediate values in .f16 instructions. Instead we have to load
+// the constant into a register using mov.b16.
+def LOAD_CONST_F16 :
+ NVPTXInst<(outs Float16Regs:$dst), (ins f16imm:$a),
+ "mov.b16 \t$dst, $a;", []>;
+
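Since the operand of LOAD_CONST_F16 is emitted as the raw half-precision bit pattern, it helps to see how a float constant maps onto those 16 bits. Below is a minimal, truncating float-to-f16 bit converter for normal, in-range values only — an illustration of the layout, not the rounding-aware conversion LLVM's APFloat performs:

    #include <cstdint>
    #include <cstdio>
    #include <cstring>

    // Truncating float->f16 bit conversion for normal, in-range values.
    // Real converters also handle rounding, subnormals, infinities, and
    // NaNs; this sketch deliberately does not.
    static uint16_t f32ToF16Bits(float F) {
      uint32_t B;
      std::memcpy(&B, &F, sizeof(B));
      uint16_t Sign = (uint16_t)((B >> 16) & 0x8000);
      int32_t Exp = (int32_t)((B >> 23) & 0xFF) - 127 + 15; // rebias
      uint16_t Mant = (uint16_t)((B >> 13) & 0x3FF); // keep top 10 bits
      return (uint16_t)(Sign | ((uint16_t)Exp << 10) | Mant);
    }

    int main() {
      // 1.0f -> 0x3C00: the pattern a "mov.b16 %h, 0x3C00;" would load.
      std::printf("0x%04X\n", f32ToF16Bits(1.0f));
    }
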
defm FADD : F3_fma_component<"add", fadd>;
defm FSUB : F3_fma_component<"sub", fsub>;
defm FMUL : F3_fma_component<"mul", fmul>;
@@ -908,18 +994,9 @@ def FDIV32ri_prec :
Requires<[reqPTX20]>;
//
-// F32 rsqrt
+// FMA
//
-def RSQRTF32approx1r : NVPTXInst<(outs Float32Regs:$dst), (ins Float32Regs:$b),
- "rsqrt.approx.f32 \t$dst, $b;", []>;
-
-// Convert 1.0f/sqrt(x) to rsqrt.approx.f32. (There is an rsqrt.approx.f64, but
-// it's emulated in software.)
-def: Pat<(fdiv FloatConst1, (int_nvvm_sqrt_f Float32Regs:$b)),
- (RSQRTF32approx1r Float32Regs:$b)>,
- Requires<[do_DIVF32_FULL, do_SQRTF32_APPROX, doNoF32FTZ]>;
-
multiclass FMA<string OpcStr, RegisterClass RC, Operand ImmCls, Predicate Pred> {
def rrr : NVPTXInst<(outs RC:$dst), (ins RC:$a, RC:$b, RC:$c),
!strconcat(OpcStr, " \t$dst, $a, $b, $c;"),
@@ -942,6 +1019,17 @@ multiclass FMA<string OpcStr, RegisterClass RC, Operand ImmCls, Predicate Pred>
Requires<[Pred]>;
}
+multiclass FMA_F16<string OpcStr, RegisterClass RC, Predicate Pred> {
+ def rrr : NVPTXInst<(outs RC:$dst), (ins RC:$a, RC:$b, RC:$c),
+ !strconcat(OpcStr, " \t$dst, $a, $b, $c;"),
+ [(set RC:$dst, (fma RC:$a, RC:$b, RC:$c))]>,
+ Requires<[useFP16Math, Pred]>;
+}
+
+defm FMA16_ftz : FMA_F16<"fma.rn.ftz.f16", Float16Regs, doF32FTZ>;
+defm FMA16 : FMA_F16<"fma.rn.f16", Float16Regs, true>;
+defm FMA16x2_ftz : FMA_F16<"fma.rn.ftz.f16x2", Float16x2Regs, doF32FTZ>;
+defm FMA16x2 : FMA_F16<"fma.rn.f16x2", Float16x2Regs, true>;
defm FMA32_ftz : FMA<"fma.rn.ftz.f32", Float32Regs, f32imm, doF32FTZ>;
defm FMA32 : FMA<"fma.rn.f32", Float32Regs, f32imm, true>;
defm FMA64 : FMA<"fma.rn.f64", Float64Regs, f64imm, true>;
@@ -949,10 +1037,12 @@ defm FMA64 : FMA<"fma.rn.f64", Float64Regs, f64imm, true>;
// sin/cos
def SINF: NVPTXInst<(outs Float32Regs:$dst), (ins Float32Regs:$src),
"sin.approx.f32 \t$dst, $src;",
- [(set Float32Regs:$dst, (fsin Float32Regs:$src))]>;
+ [(set Float32Regs:$dst, (fsin Float32Regs:$src))]>,
+ Requires<[allowUnsafeFPMath]>;
def COSF: NVPTXInst<(outs Float32Regs:$dst), (ins Float32Regs:$src),
"cos.approx.f32 \t$dst, $src;",
- [(set Float32Regs:$dst, (fcos Float32Regs:$src))]>;
+ [(set Float32Regs:$dst, (fcos Float32Regs:$src))]>,
+ Requires<[allowUnsafeFPMath]>;
// Lower (frem x, y) into (sub x, (mul (floor (div x, y)) y)),
// i.e. "poor man's fmod()"
@@ -1087,6 +1177,16 @@ defm SHL : SHIFT<"shl.b", shl>;
defm SRA : SHIFT<"shr.s", sra>;
defm SRL : SHIFT<"shr.u", srl>;
+// Bit-reverse
+def BREV32 :
+ NVPTXInst<(outs Int32Regs:$dst), (ins Int32Regs:$a),
+ "brev.b32 \t$dst, $a;",
+ [(set Int32Regs:$dst, (bitreverse Int32Regs:$a))]>;
+def BREV64 :
+ NVPTXInst<(outs Int64Regs:$dst), (ins Int64Regs:$a),
+ "brev.b64 \t$dst, $a;",
+ [(set Int64Regs:$dst, (bitreverse Int64Regs:$a))]>;
+
//
// Rotate: Use ptx shf instruction if available.
//
@@ -1294,15 +1394,15 @@ let hasSideEffects = 0 in {
def rr :
NVPTXInst<(outs Int1Regs:$dst), (ins RC:$a, RC:$b, CmpMode:$cmp),
!strconcat("setp${cmp:base}${cmp:ftz}.", TypeStr,
- "\t$dst, $a, $b;"), []>;
+ " \t$dst, $a, $b;"), []>;
def ri :
NVPTXInst<(outs Int1Regs:$dst), (ins RC:$a, ImmCls:$b, CmpMode:$cmp),
!strconcat("setp${cmp:base}${cmp:ftz}.", TypeStr,
- "\t$dst, $a, $b;"), []>;
+ " \t$dst, $a, $b;"), []>;
def ir :
NVPTXInst<(outs Int1Regs:$dst), (ins ImmCls:$a, RC:$b, CmpMode:$cmp),
!strconcat("setp${cmp:base}${cmp:ftz}.", TypeStr,
- "\t$dst, $a, $b;"), []>;
+ " \t$dst, $a, $b;"), []>;
}
}
@@ -1317,6 +1417,19 @@ defm SETP_s64 : SETP<"s64", Int64Regs, i64imm>;
defm SETP_u64 : SETP<"u64", Int64Regs, i64imm>;
defm SETP_f32 : SETP<"f32", Float32Regs, f32imm>;
defm SETP_f64 : SETP<"f64", Float64Regs, f64imm>;
+def SETP_f16rr :
+ NVPTXInst<(outs Int1Regs:$dst),
+ (ins Float16Regs:$a, Float16Regs:$b, CmpMode:$cmp),
+ "setp${cmp:base}${cmp:ftz}.f16 \t$dst, $a, $b;",
+ []>, Requires<[useFP16Math]>;
+
+def SETP_f16x2rr :
+ NVPTXInst<(outs Int1Regs:$p, Int1Regs:$q),
+ (ins Float16x2Regs:$a, Float16x2Regs:$b, CmpMode:$cmp),
+ "setp${cmp:base}${cmp:ftz}.f16x2 \t$p|$q, $a, $b;",
+ []>,
+ Requires<[useFP16Math]>;
+
// FIXME: This doesn't appear to be correct. The "set" mnemonic has the form
// "set.CmpOp{.ftz}.dtype.stype", where dtype is the type of the destination
@@ -1326,13 +1439,13 @@ let hasSideEffects = 0 in {
multiclass SET<string TypeStr, RegisterClass RC, Operand ImmCls> {
def rr : NVPTXInst<(outs Int32Regs:$dst),
(ins RC:$a, RC:$b, CmpMode:$cmp),
- !strconcat("set$cmp.", TypeStr, "\t$dst, $a, $b;"), []>;
+ !strconcat("set$cmp.", TypeStr, " \t$dst, $a, $b;"), []>;
def ri : NVPTXInst<(outs Int32Regs:$dst),
(ins RC:$a, ImmCls:$b, CmpMode:$cmp),
- !strconcat("set$cmp.", TypeStr, "\t$dst, $a, $b;"), []>;
+ !strconcat("set$cmp.", TypeStr, " \t$dst, $a, $b;"), []>;
def ir : NVPTXInst<(outs Int32Regs:$dst),
(ins ImmCls:$a, RC:$b, CmpMode:$cmp),
- !strconcat("set$cmp.", TypeStr, "\t$dst, $a, $b;"), []>;
+ !strconcat("set$cmp.", TypeStr, " \t$dst, $a, $b;"), []>;
}
}
@@ -1345,6 +1458,7 @@ defm SET_u32 : SET<"u32", Int32Regs, i32imm>;
defm SET_b64 : SET<"b64", Int64Regs, i64imm>;
defm SET_s64 : SET<"s64", Int64Regs, i64imm>;
defm SET_u64 : SET<"u64", Int64Regs, i64imm>;
+defm SET_f16 : SET<"f16", Float16Regs, f16imm>;
defm SET_f32 : SET<"f32", Float32Regs, f32imm>;
defm SET_f64 : SET<"f64", Float64Regs, f64imm>;
@@ -1360,16 +1474,16 @@ let hasSideEffects = 0 in {
multiclass SELP<string TypeStr, RegisterClass RC, Operand ImmCls> {
def rr : NVPTXInst<(outs RC:$dst),
(ins RC:$a, RC:$b, Int1Regs:$p),
- !strconcat("selp.", TypeStr, "\t$dst, $a, $b, $p;"), []>;
+ !strconcat("selp.", TypeStr, " \t$dst, $a, $b, $p;"), []>;
def ri : NVPTXInst<(outs RC:$dst),
(ins RC:$a, ImmCls:$b, Int1Regs:$p),
- !strconcat("selp.", TypeStr, "\t$dst, $a, $b, $p;"), []>;
+ !strconcat("selp.", TypeStr, " \t$dst, $a, $b, $p;"), []>;
def ir : NVPTXInst<(outs RC:$dst),
(ins ImmCls:$a, RC:$b, Int1Regs:$p),
- !strconcat("selp.", TypeStr, "\t$dst, $a, $b, $p;"), []>;
+ !strconcat("selp.", TypeStr, " \t$dst, $a, $b, $p;"), []>;
def ii : NVPTXInst<(outs RC:$dst),
(ins ImmCls:$a, ImmCls:$b, Int1Regs:$p),
- !strconcat("selp.", TypeStr, "\t$dst, $a, $b, $p;"), []>;
+ !strconcat("selp.", TypeStr, " \t$dst, $a, $b, $p;"), []>;
}
multiclass SELP_PATTERN<string TypeStr, RegisterClass RC, Operand ImmCls,
@@ -1377,22 +1491,22 @@ let hasSideEffects = 0 in {
def rr :
NVPTXInst<(outs RC:$dst),
(ins RC:$a, RC:$b, Int1Regs:$p),
- !strconcat("selp.", TypeStr, "\t$dst, $a, $b, $p;"),
+ !strconcat("selp.", TypeStr, " \t$dst, $a, $b, $p;"),
[(set RC:$dst, (select Int1Regs:$p, RC:$a, RC:$b))]>;
def ri :
NVPTXInst<(outs RC:$dst),
(ins RC:$a, ImmCls:$b, Int1Regs:$p),
- !strconcat("selp.", TypeStr, "\t$dst, $a, $b, $p;"),
+ !strconcat("selp.", TypeStr, " \t$dst, $a, $b, $p;"),
[(set RC:$dst, (select Int1Regs:$p, RC:$a, ImmNode:$b))]>;
def ir :
NVPTXInst<(outs RC:$dst),
(ins ImmCls:$a, RC:$b, Int1Regs:$p),
- !strconcat("selp.", TypeStr, "\t$dst, $a, $b, $p;"),
+ !strconcat("selp.", TypeStr, " \t$dst, $a, $b, $p;"),
[(set RC:$dst, (select Int1Regs:$p, ImmNode:$a, RC:$b))]>;
def ii :
NVPTXInst<(outs RC:$dst),
(ins ImmCls:$a, ImmCls:$b, Int1Regs:$p),
- !strconcat("selp.", TypeStr, "\t$dst, $a, $b, $p;"),
+ !strconcat("selp.", TypeStr, " \t$dst, $a, $b, $p;"),
[(set RC:$dst, (select Int1Regs:$p, ImmNode:$a, ImmNode:$b))]>;
}
}
@@ -1408,9 +1522,17 @@ defm SELP_u32 : SELP<"u32", Int32Regs, i32imm>;
defm SELP_b64 : SELP_PATTERN<"b64", Int64Regs, i64imm, imm>;
defm SELP_s64 : SELP<"s64", Int64Regs, i64imm>;
defm SELP_u64 : SELP<"u64", Int64Regs, i64imm>;
+defm SELP_f16 : SELP_PATTERN<"b16", Float16Regs, f16imm, fpimm>;
defm SELP_f32 : SELP_PATTERN<"f32", Float32Regs, f32imm, fpimm>;
defm SELP_f64 : SELP_PATTERN<"f64", Float64Regs, f64imm, fpimm>;
+def SELP_f16x2rr :
+ NVPTXInst<(outs Float16x2Regs:$dst),
+ (ins Float16x2Regs:$a, Float16x2Regs:$b, Int1Regs:$p),
+ "selp.b32 \t$dst, $a, $b, $p;",
+ [(set Float16x2Regs:$dst,
+ (select Int1Regs:$p, Float16x2Regs:$a, Float16x2Regs:$b))]>;
+
//-----------------------------------
// Data Movement (Load / Store, Move)
//-----------------------------------
@@ -1472,6 +1594,9 @@ let IsSimpleMove=1, hasSideEffects=0 in {
def IMOV64rr : NVPTXInst<(outs Int64Regs:$dst), (ins Int64Regs:$sss),
"mov.u64 \t$dst, $sss;", []>;
+ def FMOV16rr : NVPTXInst<(outs Float16Regs:$dst), (ins Float16Regs:$src),
+ // We have to use .b16 here as there's no mov.f16.
+ "mov.b16 \t$dst, $src;", []>;
def FMOV32rr : NVPTXInst<(outs Float32Regs:$dst), (ins Float32Regs:$src),
"mov.f32 \t$dst, $src;", []>;
def FMOV64rr : NVPTXInst<(outs Float64Regs:$dst), (ins Float64Regs:$src),
@@ -1633,6 +1758,26 @@ def : Pat<(i32 (setne Int1Regs:$a, Int1Regs:$b)),
multiclass FSET_FORMAT<PatFrag OpNode, PatLeaf Mode, PatLeaf ModeFTZ> {
+ // f16 -> pred
+ def : Pat<(i1 (OpNode Float16Regs:$a, Float16Regs:$b)),
+ (SETP_f16rr Float16Regs:$a, Float16Regs:$b, ModeFTZ)>,
+ Requires<[useFP16Math,doF32FTZ]>;
+ def : Pat<(i1 (OpNode Float16Regs:$a, Float16Regs:$b)),
+ (SETP_f16rr Float16Regs:$a, Float16Regs:$b, Mode)>,
+ Requires<[useFP16Math]>;
+ def : Pat<(i1 (OpNode Float16Regs:$a, fpimm:$b)),
+ (SETP_f16rr Float16Regs:$a, (LOAD_CONST_F16 fpimm:$b), ModeFTZ)>,
+ Requires<[useFP16Math,doF32FTZ]>;
+ def : Pat<(i1 (OpNode Float16Regs:$a, fpimm:$b)),
+ (SETP_f16rr Float16Regs:$a, (LOAD_CONST_F16 fpimm:$b), Mode)>,
+ Requires<[useFP16Math]>;
+ def : Pat<(i1 (OpNode fpimm:$a, Float16Regs:$b)),
+ (SETP_f16rr (LOAD_CONST_F16 fpimm:$a), Float16Regs:$b, ModeFTZ)>,
+ Requires<[useFP16Math,doF32FTZ]>;
+ def : Pat<(i1 (OpNode fpimm:$a, Float16Regs:$b)),
+ (SETP_f16rr (LOAD_CONST_F16 fpimm:$a), Float16Regs:$b, Mode)>,
+ Requires<[useFP16Math]>;
+
// f32 -> pred
def : Pat<(i1 (OpNode Float32Regs:$a, Float32Regs:$b)),
(SETP_f32rr Float32Regs:$a, Float32Regs:$b, ModeFTZ)>,
@@ -1658,6 +1803,26 @@ multiclass FSET_FORMAT<PatFrag OpNode, PatLeaf Mode, PatLeaf ModeFTZ> {
def : Pat<(i1 (OpNode fpimm:$a, Float64Regs:$b)),
(SETP_f64ir fpimm:$a, Float64Regs:$b, Mode)>;
+ // f16 -> i32
+ def : Pat<(i32 (OpNode Float16Regs:$a, Float16Regs:$b)),
+ (SET_f16rr Float16Regs:$a, Float16Regs:$b, ModeFTZ)>,
+ Requires<[useFP16Math, doF32FTZ]>;
+ def : Pat<(i32 (OpNode Float16Regs:$a, Float16Regs:$b)),
+ (SET_f16rr Float16Regs:$a, Float16Regs:$b, Mode)>,
+ Requires<[useFP16Math]>;
+ def : Pat<(i32 (OpNode Float16Regs:$a, fpimm:$b)),
+ (SET_f16rr Float16Regs:$a, (LOAD_CONST_F16 fpimm:$b), ModeFTZ)>,
+ Requires<[useFP16Math, doF32FTZ]>;
+ def : Pat<(i32 (OpNode Float16Regs:$a, fpimm:$b)),
+ (SET_f16rr Float16Regs:$a, (LOAD_CONST_F16 fpimm:$b), Mode)>,
+ Requires<[useFP16Math]>;
+ def : Pat<(i32 (OpNode fpimm:$a, Float16Regs:$b)),
+ (SET_f16ir (LOAD_CONST_F16 fpimm:$a), Float16Regs:$b, ModeFTZ)>,
+ Requires<[useFP16Math, doF32FTZ]>;
+ def : Pat<(i32 (OpNode fpimm:$a, Float16Regs:$b)),
+ (SET_f16ir (LOAD_CONST_F16 fpimm:$a), Float16Regs:$b, Mode)>,
+ Requires<[useFP16Math]>;
+
// f32 -> i32
def : Pat<(i32 (OpNode Float32Regs:$a, Float32Regs:$b)),
(SET_f32rr Float32Regs:$a, Float32Regs:$b, ModeFTZ)>,
@@ -1825,40 +1990,39 @@ def RETURNNode :
let mayLoad = 1 in {
class LoadParamMemInst<NVPTXRegClass regclass, string opstr> :
NVPTXInst<(outs regclass:$dst), (ins i32imm:$b),
- !strconcat(!strconcat("ld.param", opstr),
- "\t$dst, [retval0+$b];"),
+ !strconcat("ld.param", opstr, " \t$dst, [retval0+$b];"),
[]>;
class LoadParamV2MemInst<NVPTXRegClass regclass, string opstr> :
NVPTXInst<(outs regclass:$dst, regclass:$dst2), (ins i32imm:$b),
!strconcat("ld.param.v2", opstr,
- "\t{{$dst, $dst2}}, [retval0+$b];"), []>;
+ " \t{{$dst, $dst2}}, [retval0+$b];"), []>;
class LoadParamV4MemInst<NVPTXRegClass regclass, string opstr> :
NVPTXInst<(outs regclass:$dst, regclass:$dst2, regclass:$dst3,
regclass:$dst4),
(ins i32imm:$b),
!strconcat("ld.param.v4", opstr,
- "\t{{$dst, $dst2, $dst3, $dst4}}, [retval0+$b];"),
+ " \t{{$dst, $dst2, $dst3, $dst4}}, [retval0+$b];"),
[]>;
}
class LoadParamRegInst<NVPTXRegClass regclass, string opstr> :
NVPTXInst<(outs regclass:$dst), (ins i32imm:$b),
- !strconcat("mov", opstr, "\t$dst, retval$b;"),
+ !strconcat("mov", opstr, " \t$dst, retval$b;"),
[(set regclass:$dst, (LoadParam (i32 0), (i32 imm:$b)))]>;
let mayStore = 1 in {
class StoreParamInst<NVPTXRegClass regclass, string opstr> :
NVPTXInst<(outs), (ins regclass:$val, i32imm:$a, i32imm:$b),
- !strconcat("st.param", opstr, "\t[param$a+$b], $val;"),
+ !strconcat("st.param", opstr, " \t[param$a+$b], $val;"),
[]>;
class StoreParamV2Inst<NVPTXRegClass regclass, string opstr> :
NVPTXInst<(outs), (ins regclass:$val, regclass:$val2,
i32imm:$a, i32imm:$b),
!strconcat("st.param.v2", opstr,
- "\t[param$a+$b], {{$val, $val2}};"),
+ " \t[param$a+$b], {{$val, $val2}};"),
[]>;
class StoreParamV4Inst<NVPTXRegClass regclass, string opstr> :
@@ -1866,18 +2030,18 @@ let mayStore = 1 in {
regclass:$val4, i32imm:$a,
i32imm:$b),
!strconcat("st.param.v4", opstr,
- "\t[param$a+$b], {{$val, $val2, $val3, $val4}};"),
+ " \t[param$a+$b], {{$val, $val2, $val3, $val4}};"),
[]>;
class StoreRetvalInst<NVPTXRegClass regclass, string opstr> :
NVPTXInst<(outs), (ins regclass:$val, i32imm:$a),
- !strconcat("st.param", opstr, "\t[func_retval0+$a], $val;"),
+ !strconcat("st.param", opstr, " \t[func_retval0+$a], $val;"),
[]>;
class StoreRetvalV2Inst<NVPTXRegClass regclass, string opstr> :
NVPTXInst<(outs), (ins regclass:$val, regclass:$val2, i32imm:$a),
!strconcat("st.param.v2", opstr,
- "\t[func_retval0+$a], {{$val, $val2}};"),
+ " \t[func_retval0+$a], {{$val, $val2}};"),
[]>;
class StoreRetvalV4Inst<NVPTXRegClass regclass, string opstr> :
@@ -1885,7 +2049,7 @@ let mayStore = 1 in {
(ins regclass:$val, regclass:$val2, regclass:$val3,
regclass:$val4, i32imm:$a),
!strconcat("st.param.v4", opstr,
- "\t[func_retval0+$a], {{$val, $val2, $val3, $val4}};"),
+ " \t[func_retval0+$a], {{$val, $val2, $val3, $val4}};"),
[]>;
}
@@ -1941,10 +2105,16 @@ def LoadParamMemV2I8 : LoadParamV2MemInst<Int16Regs, ".b8">;
def LoadParamMemV4I32 : LoadParamV4MemInst<Int32Regs, ".b32">;
def LoadParamMemV4I16 : LoadParamV4MemInst<Int16Regs, ".b16">;
def LoadParamMemV4I8 : LoadParamV4MemInst<Int16Regs, ".b8">;
+def LoadParamMemF16 : LoadParamMemInst<Float16Regs, ".b16">;
+def LoadParamMemF16x2 : LoadParamMemInst<Float16x2Regs, ".b32">;
def LoadParamMemF32 : LoadParamMemInst<Float32Regs, ".f32">;
def LoadParamMemF64 : LoadParamMemInst<Float64Regs, ".f64">;
+def LoadParamMemV2F16 : LoadParamV2MemInst<Float16Regs, ".b16">;
+def LoadParamMemV2F16x2: LoadParamV2MemInst<Float16x2Regs, ".b32">;
def LoadParamMemV2F32 : LoadParamV2MemInst<Float32Regs, ".f32">;
def LoadParamMemV2F64 : LoadParamV2MemInst<Float64Regs, ".f64">;
+def LoadParamMemV4F16 : LoadParamV4MemInst<Float16Regs, ".b16">;
+def LoadParamMemV4F16x2: LoadParamV4MemInst<Float16x2Regs, ".b32">;
def LoadParamMemV4F32 : LoadParamV4MemInst<Float32Regs, ".f32">;
def StoreParamI64 : StoreParamInst<Int64Regs, ".b64">;
@@ -1961,10 +2131,16 @@ def StoreParamV4I32 : StoreParamV4Inst<Int32Regs, ".b32">;
def StoreParamV4I16 : StoreParamV4Inst<Int16Regs, ".b16">;
def StoreParamV4I8 : StoreParamV4Inst<Int16Regs, ".b8">;
+def StoreParamF16 : StoreParamInst<Float16Regs, ".b16">;
+def StoreParamF16x2 : StoreParamInst<Float16x2Regs, ".b32">;
def StoreParamF32 : StoreParamInst<Float32Regs, ".f32">;
def StoreParamF64 : StoreParamInst<Float64Regs, ".f64">;
+def StoreParamV2F16 : StoreParamV2Inst<Float16Regs, ".b16">;
+def StoreParamV2F16x2 : StoreParamV2Inst<Float16x2Regs, ".b32">;
def StoreParamV2F32 : StoreParamV2Inst<Float32Regs, ".f32">;
def StoreParamV2F64 : StoreParamV2Inst<Float64Regs, ".f64">;
+def StoreParamV4F16 : StoreParamV4Inst<Float16Regs, ".b16">;
+def StoreParamV4F16x2 : StoreParamV4Inst<Float16x2Regs, ".b32">;
def StoreParamV4F32 : StoreParamV4Inst<Float32Regs, ".f32">;
def StoreRetvalI64 : StoreRetvalInst<Int64Regs, ".b64">;
@@ -1981,9 +2157,15 @@ def StoreRetvalV4I8 : StoreRetvalV4Inst<Int16Regs, ".b8">;
def StoreRetvalF64 : StoreRetvalInst<Float64Regs, ".f64">;
def StoreRetvalF32 : StoreRetvalInst<Float32Regs, ".f32">;
+def StoreRetvalF16 : StoreRetvalInst<Float16Regs, ".b16">;
+def StoreRetvalF16x2 : StoreRetvalInst<Float16x2Regs, ".b32">;
def StoreRetvalV2F64 : StoreRetvalV2Inst<Float64Regs, ".f64">;
def StoreRetvalV2F32 : StoreRetvalV2Inst<Float32Regs, ".f32">;
+def StoreRetvalV2F16 : StoreRetvalV2Inst<Float16Regs, ".b16">;
+def StoreRetvalV2F16x2: StoreRetvalV2Inst<Float16x2Regs, ".b32">;
def StoreRetvalV4F32 : StoreRetvalV4Inst<Float32Regs, ".f32">;
+def StoreRetvalV4F16 : StoreRetvalV4Inst<Float16Regs, ".b16">;
+def StoreRetvalV4F16x2: StoreRetvalV4Inst<Float16x2Regs, ".b32">;
def CallArgBeginInst : NVPTXInst<(outs), (ins), "(", [(CallArgBegin)]>;
def CallArgEndInst1 : NVPTXInst<(outs), (ins), ");", [(CallArgEnd (i32 1))]>;
@@ -2057,17 +2239,18 @@ def DeclareScalarRegInst :
class MoveParamInst<NVPTXRegClass regclass, string asmstr> :
NVPTXInst<(outs regclass:$dst), (ins regclass:$src),
- !strconcat("mov", asmstr, "\t$dst, $src;"),
+ !strconcat("mov", asmstr, " \t$dst, $src;"),
[(set regclass:$dst, (MoveParam regclass:$src))]>;
def MoveParamI64 : MoveParamInst<Int64Regs, ".b64">;
def MoveParamI32 : MoveParamInst<Int32Regs, ".b32">;
def MoveParamI16 :
NVPTXInst<(outs Int16Regs:$dst), (ins Int16Regs:$src),
- "cvt.u16.u32\t$dst, $src;",
+ "cvt.u16.u32 \t$dst, $src;",
[(set Int16Regs:$dst, (MoveParam Int16Regs:$src))]>;
def MoveParamF64 : MoveParamInst<Float64Regs, ".f64">;
def MoveParamF32 : MoveParamInst<Float32Regs, ".f32">;
+def MoveParamF16 : MoveParamInst<Float16Regs, ".f16">;
class PseudoUseParamInst<NVPTXRegClass regclass> :
NVPTXInst<(outs), (ins regclass:$src),
@@ -2128,6 +2311,8 @@ let mayLoad=1, hasSideEffects=0 in {
defm LD_i16 : LD<Int16Regs>;
defm LD_i32 : LD<Int32Regs>;
defm LD_i64 : LD<Int64Regs>;
+ defm LD_f16 : LD<Float16Regs>;
+ defm LD_f16x2 : LD<Float16x2Regs>;
defm LD_f32 : LD<Float32Regs>;
defm LD_f64 : LD<Float64Regs>;
}
@@ -2176,6 +2361,8 @@ let mayStore=1, hasSideEffects=0 in {
defm ST_i16 : ST<Int16Regs>;
defm ST_i32 : ST<Int32Regs>;
defm ST_i64 : ST<Int64Regs>;
+ defm ST_f16 : ST<Float16Regs>;
+ defm ST_f16x2 : ST<Float16x2Regs>;
defm ST_f32 : ST<Float32Regs>;
defm ST_f64 : ST<Float64Regs>;
}
@@ -2262,6 +2449,8 @@ let mayLoad=1, hasSideEffects=0 in {
defm LDV_i16 : LD_VEC<Int16Regs>;
defm LDV_i32 : LD_VEC<Int32Regs>;
defm LDV_i64 : LD_VEC<Int64Regs>;
+ defm LDV_f16 : LD_VEC<Float16Regs>;
+ defm LDV_f16x2 : LD_VEC<Float16x2Regs>;
defm LDV_f32 : LD_VEC<Float32Regs>;
defm LDV_f64 : LD_VEC<Float64Regs>;
}
@@ -2355,28 +2544,53 @@ let mayStore=1, hasSideEffects=0 in {
defm STV_i16 : ST_VEC<Int16Regs>;
defm STV_i32 : ST_VEC<Int32Regs>;
defm STV_i64 : ST_VEC<Int64Regs>;
+ defm STV_f16 : ST_VEC<Float16Regs>;
+ defm STV_f16x2 : ST_VEC<Float16x2Regs>;
defm STV_f32 : ST_VEC<Float32Regs>;
defm STV_f64 : ST_VEC<Float64Regs>;
}
-
//---- Conversion ----
class F_BITCONVERT<string SzStr, NVPTXRegClass regclassIn,
NVPTXRegClass regclassOut> :
NVPTXInst<(outs regclassOut:$d), (ins regclassIn:$a),
- !strconcat("mov.b", !strconcat(SzStr, " \t $d, $a;")),
+ !strconcat("mov.b", !strconcat(SzStr, " \t$d, $a;")),
[(set regclassOut:$d, (bitconvert regclassIn:$a))]>;
+def BITCONVERT_16_I2F : F_BITCONVERT<"16", Int16Regs, Float16Regs>;
+def BITCONVERT_16_F2I : F_BITCONVERT<"16", Float16Regs, Int16Regs>;
def BITCONVERT_32_I2F : F_BITCONVERT<"32", Int32Regs, Float32Regs>;
def BITCONVERT_32_F2I : F_BITCONVERT<"32", Float32Regs, Int32Regs>;
def BITCONVERT_64_I2F : F_BITCONVERT<"64", Int64Regs, Float64Regs>;
def BITCONVERT_64_F2I : F_BITCONVERT<"64", Float64Regs, Int64Regs>;
+def BITCONVERT_32_I2F16x2 : F_BITCONVERT<"32", Int32Regs, Float16x2Regs>;
+def BITCONVERT_32_F16x22I : F_BITCONVERT<"32", Float16x2Regs, Int32Regs>;
// NOTE: pred->fp conversions are currently sub-optimal due to an issue in
// TableGen where we cannot specify floating-point literals in isel patterns.
// Therefore, we use an integer selp to select either 1 or 0 and then cvt to
// floating-point.
+// sint -> f16
+def : Pat<(f16 (sint_to_fp Int1Regs:$a)),
+ (CVT_f16_s32 (SELP_u32ii 1, 0, Int1Regs:$a), CvtRN)>;
+def : Pat<(f16 (sint_to_fp Int16Regs:$a)),
+ (CVT_f16_s16 Int16Regs:$a, CvtRN)>;
+def : Pat<(f16 (sint_to_fp Int32Regs:$a)),
+ (CVT_f16_s32 Int32Regs:$a, CvtRN)>;
+def : Pat<(f16 (sint_to_fp Int64Regs:$a)),
+ (CVT_f16_s64 Int64Regs:$a, CvtRN)>;
+
+// uint -> f16
+def : Pat<(f16 (uint_to_fp Int1Regs:$a)),
+ (CVT_f16_u32 (SELP_u32ii 1, 0, Int1Regs:$a), CvtRN)>;
+def : Pat<(f16 (uint_to_fp Int16Regs:$a)),
+ (CVT_f16_u16 Int16Regs:$a, CvtRN)>;
+def : Pat<(f16 (uint_to_fp Int32Regs:$a)),
+ (CVT_f16_u32 Int32Regs:$a, CvtRN)>;
+def : Pat<(f16 (uint_to_fp Int64Regs:$a)),
+ (CVT_f16_u64 Int64Regs:$a, CvtRN)>;
+
// sint -> f32
def : Pat<(f32 (sint_to_fp Int1Regs:$a)),
(CVT_f32_s32 (SELP_u32ii 1, 0, Int1Regs:$a), CvtRN)>;
@@ -2418,6 +2632,38 @@ def : Pat<(f64 (uint_to_fp Int64Regs:$a)),
(CVT_f64_u64 Int64Regs:$a, CvtRN)>;
+// f16 -> sint
+def : Pat<(i1 (fp_to_sint Float16Regs:$a)),
+ (SETP_b16ri (BITCONVERT_16_F2I Float16Regs:$a), 0, CmpEQ)>;
+def : Pat<(i16 (fp_to_sint Float16Regs:$a)),
+ (CVT_s16_f16 Float16Regs:$a, CvtRZI_FTZ)>, Requires<[doF32FTZ]>;
+def : Pat<(i16 (fp_to_sint Float16Regs:$a)),
+ (CVT_s16_f16 Float16Regs:$a, CvtRZI)>;
+def : Pat<(i32 (fp_to_sint Float16Regs:$a)),
+ (CVT_s32_f16 Float16Regs:$a, CvtRZI_FTZ)>, Requires<[doF32FTZ]>;
+def : Pat<(i32 (fp_to_sint Float16Regs:$a)),
+ (CVT_s32_f16 Float16Regs:$a, CvtRZI)>;
+def : Pat<(i64 (fp_to_sint Float16Regs:$a)),
+ (CVT_s64_f16 Float16Regs:$a, CvtRZI_FTZ)>, Requires<[doF32FTZ]>;
+def : Pat<(i64 (fp_to_sint Float16Regs:$a)),
+ (CVT_s64_f16 Float16Regs:$a, CvtRZI)>;
+
+// f16 -> uint
+def : Pat<(i1 (fp_to_uint Float16Regs:$a)),
+ (SETP_b16ri (BITCONVERT_16_F2I Float16Regs:$a), 0, CmpEQ)>;
+def : Pat<(i16 (fp_to_uint Float16Regs:$a)),
+ (CVT_u16_f16 Float16Regs:$a, CvtRZI_FTZ)>, Requires<[doF32FTZ]>;
+def : Pat<(i16 (fp_to_uint Float16Regs:$a)),
+ (CVT_u16_f16 Float16Regs:$a, CvtRZI)>;
+def : Pat<(i32 (fp_to_uint Float16Regs:$a)),
+ (CVT_u32_f16 Float16Regs:$a, CvtRZI_FTZ)>, Requires<[doF32FTZ]>;
+def : Pat<(i32 (fp_to_uint Float16Regs:$a)),
+ (CVT_u32_f16 Float16Regs:$a, CvtRZI)>;
+def : Pat<(i64 (fp_to_uint Float16Regs:$a)),
+ (CVT_u64_f16 Float16Regs:$a, CvtRZI_FTZ)>, Requires<[doF32FTZ]>;
+def : Pat<(i64 (fp_to_uint Float16Regs:$a)),
+ (CVT_u64_f16 Float16Regs:$a, CvtRZI)>;
+
// f32 -> sint
def : Pat<(i1 (fp_to_sint Float32Regs:$a)),
(SETP_b32ri (BITCONVERT_32_F2I Float32Regs:$a), 0, CmpEQ)>;
@@ -2562,6 +2808,9 @@ def : Pat<(select Int32Regs:$pred, Int32Regs:$a, Int32Regs:$b),
def : Pat<(select Int32Regs:$pred, Int64Regs:$a, Int64Regs:$b),
(SELP_b64rr Int64Regs:$a, Int64Regs:$b,
(SETP_b32ri (ANDb32ri Int32Regs:$pred, 1), 1, CmpEQ))>;
+def : Pat<(select Int32Regs:$pred, Float16Regs:$a, Float16Regs:$b),
+ (SELP_f16rr Float16Regs:$a, Float16Regs:$b,
+ (SETP_b32ri (ANDb32ri Int32Regs:$pred, 1), 1, CmpEQ))>;
def : Pat<(select Int32Regs:$pred, Float32Regs:$a, Float32Regs:$b),
(SELP_f32rr Float32Regs:$a, Float32Regs:$b,
(SETP_b32ri (ANDb32ri Int32Regs:$pred, 1), 1, CmpEQ))>;
@@ -2575,77 +2824,150 @@ let hasSideEffects = 0 in {
def V4I16toI64 : NVPTXInst<(outs Int64Regs:$d),
(ins Int16Regs:$s1, Int16Regs:$s2,
Int16Regs:$s3, Int16Regs:$s4),
- "mov.b64\t$d, {{$s1, $s2, $s3, $s4}};", []>;
+ "mov.b64 \t$d, {{$s1, $s2, $s3, $s4}};", []>;
def V2I16toI32 : NVPTXInst<(outs Int32Regs:$d),
(ins Int16Regs:$s1, Int16Regs:$s2),
- "mov.b32\t$d, {{$s1, $s2}};", []>;
+ "mov.b32 \t$d, {{$s1, $s2}};", []>;
def V2I32toI64 : NVPTXInst<(outs Int64Regs:$d),
(ins Int32Regs:$s1, Int32Regs:$s2),
- "mov.b64\t$d, {{$s1, $s2}};", []>;
+ "mov.b64 \t$d, {{$s1, $s2}};", []>;
def V2F32toF64 : NVPTXInst<(outs Float64Regs:$d),
(ins Float32Regs:$s1, Float32Regs:$s2),
- "mov.b64\t$d, {{$s1, $s2}};", []>;
+ "mov.b64 \t$d, {{$s1, $s2}};", []>;
// unpack a larger int register to a set of smaller int registers
def I64toV4I16 : NVPTXInst<(outs Int16Regs:$d1, Int16Regs:$d2,
Int16Regs:$d3, Int16Regs:$d4),
(ins Int64Regs:$s),
- "mov.b64\t{{$d1, $d2, $d3, $d4}}, $s;", []>;
+ "mov.b64 \t{{$d1, $d2, $d3, $d4}}, $s;", []>;
def I32toV2I16 : NVPTXInst<(outs Int16Regs:$d1, Int16Regs:$d2),
(ins Int32Regs:$s),
- "mov.b32\t{{$d1, $d2}}, $s;", []>;
+ "mov.b32 \t{{$d1, $d2}}, $s;", []>;
def I64toV2I32 : NVPTXInst<(outs Int32Regs:$d1, Int32Regs:$d2),
(ins Int64Regs:$s),
- "mov.b64\t{{$d1, $d2}}, $s;", []>;
+ "mov.b64 \t{{$d1, $d2}}, $s;", []>;
def F64toV2F32 : NVPTXInst<(outs Float32Regs:$d1, Float32Regs:$d2),
(ins Float64Regs:$s),
- "mov.b64\t{{$d1, $d2}}, $s;", []>;
+ "mov.b64 \t{{$d1, $d2}}, $s;", []>;
+
+}
+
+let hasSideEffects = 0 in {
+ // Extract an element of an f16x2 register. PTX does not provide any way
+ // to access the elements of an f16x2 vector directly, so we need to
+ // extract them using a temporary register.
+ def F16x2toF16_0 : NVPTXInst<(outs Float16Regs:$dst),
+ (ins Float16x2Regs:$src),
+ "{{ .reg .b16 \t%tmp_hi;\n\t"
+ " mov.b32 \t{$dst, %tmp_hi}, $src; }}",
+ [(set Float16Regs:$dst,
+ (extractelt (v2f16 Float16x2Regs:$src), 0))]>;
+ def F16x2toF16_1 : NVPTXInst<(outs Float16Regs:$dst),
+ (ins Float16x2Regs:$src),
+ "{{ .reg .b16 \t%tmp_lo;\n\t"
+ " mov.b32 \t{%tmp_lo, $dst}, $src; }}",
+ [(set Float16Regs:$dst,
+ (extractelt (v2f16 Float16x2Regs:$src), 1))]>;
+
+ // Coalesce two f16 registers into f16x2
+ def BuildF16x2 : NVPTXInst<(outs Float16x2Regs:$dst),
+ (ins Float16Regs:$a, Float16Regs:$b),
+ "mov.b32 \t$dst, {{$a, $b}};",
+ [(set Float16x2Regs:$dst,
+ (build_vector (f16 Float16Regs:$a), (f16 Float16Regs:$b)))]>;
+
+ // Directly initializing the underlying b32 register takes one less SASS
+ // instruction than a vector-packing move.
+ def BuildF16x2i : NVPTXInst<(outs Float16x2Regs:$dst), (ins i32imm:$src),
+ "mov.b32 \t$dst, $src;",
+ []>;
+
+ // Split f16x2 into two f16 registers.
+ def SplitF16x2 : NVPTXInst<(outs Float16Regs:$lo, Float16Regs:$hi),
+ (ins Float16x2Regs:$src),
+ "mov.b32 \t{{$lo, $hi}}, $src;",
+ []>;
+ // Split an i32 into two f16 registers.
+ def SplitI32toF16x2 : NVPTXInst<(outs Float16Regs:$lo, Float16Regs:$hi),
+ (ins Int32Regs:$src),
+ "mov.b32 \t{{$lo, $hi}}, $src;",
+ []>;
}
// Count leading zeros
let hasSideEffects = 0 in {
def CLZr32 : NVPTXInst<(outs Int32Regs:$d), (ins Int32Regs:$a),
- "clz.b32\t$d, $a;", []>;
+ "clz.b32 \t$d, $a;", []>;
def CLZr64 : NVPTXInst<(outs Int32Regs:$d), (ins Int64Regs:$a),
- "clz.b64\t$d, $a;", []>;
+ "clz.b64 \t$d, $a;", []>;
}
// 32-bit has a direct PTX instruction
def : Pat<(ctlz Int32Regs:$a), (CLZr32 Int32Regs:$a)>;
-// For 64-bit, the result in PTX is actually 32-bit so we zero-extend
-// to 64-bit to match the LLVM semantics
+// The return type of the ctlz ISD node is the same as its input, but the PTX
+// clz instruction always returns a 32-bit value. For ctlz.i64, convert the
+// PTX value to 64 bits to match the ISD node's semantics, unless we know we're
+// truncating back down to 32 bits.
def : Pat<(ctlz Int64Regs:$a), (CVT_u64_u32 (CLZr64 Int64Regs:$a), CvtNONE)>;
+def : Pat<(i32 (trunc (ctlz Int64Regs:$a))), (CLZr64 Int64Regs:$a)>;
-// For 16-bit, we zero-extend to 32-bit, then trunc the result back
-// to 16-bits (ctlz of a 16-bit value is guaranteed to require less
-// than 16 bits to store). We also need to subtract 16 because the
-// high-order 16 zeros were counted.
+// For 16-bit ctlz, we zero-extend to 32 bits, perform the count, then truncate
+// the result back to 16 bits if necessary. We also need to subtract 16 because
+// the high-order 16 zeros were counted.
+//
+// TODO: NVPTX has a mov.b32 b32reg, {imm, b16reg} instruction, which we could
+// use to save one SASS instruction (on sm_35 anyway):
+//
+// mov.b32 $tmp, {0xffff, $a}
+// ctlz.b32 $result, $tmp
+//
+// That is, instead of zero-extending the input to 32 bits, we'd "one-extend"
+// and then ctlz that value. This way we don't have to subtract 16 from the
+// result. Unfortunately today we don't have a way to generate
+// "mov b32reg, {b16imm, b16reg}", so we don't do this optimization.
def : Pat<(ctlz Int16Regs:$a),
- (SUBi16ri (CVT_u16_u32 (CLZr32
- (CVT_u32_u16 Int16Regs:$a, CvtNONE)),
- CvtNONE), 16)>;
+ (SUBi16ri (CVT_u16_u32
+ (CLZr32 (CVT_u32_u16 Int16Regs:$a, CvtNONE)), CvtNONE), 16)>;
+def : Pat<(i32 (zext (ctlz Int16Regs:$a))),
+ (SUBi32ri (CLZr32 (CVT_u32_u16 Int16Regs:$a, CvtNONE)), 16)>;
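
The "one-extend" idea in the TODO checks out with ordinary integer arithmetic: zero-extending a 16-bit input overcounts its leading zeros by exactly 16, while presetting the other half of the 32-bit word to all ones makes the 32-bit count match the 16-bit count with no adjustment. A quick exhaustive check using GCC/Clang's __builtin_clz (undefined for a zero input, so zero is skipped):

    #include <cassert>
    #include <cstdint>

    int main() {
      for (uint32_t X = 1; X <= 0xFFFF; ++X) {
        // Today's lowering: zero-extend, count, subtract the 16 high zeros.
        int ViaSub = __builtin_clz(X) - 16;
        // The TODO's alternative, mov.b32 $tmp, {0xffff, $a}: the input
        // lands in the high half and the low half is all ones, so no
        // subtraction is needed afterwards.
        int ViaOnes = __builtin_clz((X << 16) | 0xFFFFu);
        assert(ViaSub == ViaOnes);
      }
    }
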
// Population count
let hasSideEffects = 0 in {
def POPCr32 : NVPTXInst<(outs Int32Regs:$d), (ins Int32Regs:$a),
- "popc.b32\t$d, $a;", []>;
+ "popc.b32 \t$d, $a;", []>;
def POPCr64 : NVPTXInst<(outs Int32Regs:$d), (ins Int64Regs:$a),
- "popc.b64\t$d, $a;", []>;
+ "popc.b64 \t$d, $a;", []>;
}
// 32-bit has a direct PTX instruction
def : Pat<(ctpop Int32Regs:$a), (POPCr32 Int32Regs:$a)>;
-// For 64-bit, the result in PTX is actually 32-bit so we zero-extend
-// to 64-bit to match the LLVM semantics
+// For 64-bit, the result in PTX is actually 32-bit, so we zero-extend to 64-bit
+// to match the LLVM semantics. Just as with ctlz.i64, we provide a second
+// pattern that avoids the type conversion if we're truncating the result to
+// i32 anyway.
def : Pat<(ctpop Int64Regs:$a), (CVT_u64_u32 (POPCr64 Int64Regs:$a), CvtNONE)>;
+def : Pat<(i32 (trunc (ctpop Int64Regs:$a))), (POPCr64 Int64Regs:$a)>;
-// For 16-bit, we zero-extend to 32-bit, then trunc the result back
-// to 16-bits (ctpop of a 16-bit value is guaranteed to require less
-// than 16 bits to store)
+// For 16-bit, we zero-extend to 32 bits, then truncate the result back to 16 bits.
+// If we know that we're storing into an i32, we can avoid the final trunc.
def : Pat<(ctpop Int16Regs:$a),
(CVT_u16_u32 (POPCr32 (CVT_u32_u16 Int16Regs:$a, CvtNONE)), CvtNONE)>;
+def : Pat<(i32 (zext (ctpop Int16Regs:$a))),
+ (POPCr32 (CVT_u32_u16 Int16Regs:$a, CvtNONE))>;
+
+// fpround f32 -> f16
+def : Pat<(f16 (fpround Float32Regs:$a)),
+ (CVT_f16_f32 Float32Regs:$a, CvtRN_FTZ)>, Requires<[doF32FTZ]>;
+def : Pat<(f16 (fpround Float32Regs:$a)),
+ (CVT_f16_f32 Float32Regs:$a, CvtRN)>;
+
+// fpround f64 -> f16
+def : Pat<(f16 (fpround Float64Regs:$a)),
+ (CVT_f16_f64 Float64Regs:$a, CvtRN_FTZ)>, Requires<[doF32FTZ]>;
+def : Pat<(f16 (fpround Float64Regs:$a)),
+ (CVT_f16_f64 Float64Regs:$a, CvtRN)>;
// fpround f64 -> f32
def : Pat<(f32 (fpround Float64Regs:$a)),
@@ -2653,6 +2975,18 @@ def : Pat<(f32 (fpround Float64Regs:$a)),
def : Pat<(f32 (fpround Float64Regs:$a)),
(CVT_f32_f64 Float64Regs:$a, CvtRN)>;
+// fpextend f16 -> f32
+def : Pat<(f32 (fpextend Float16Regs:$a)),
+ (CVT_f32_f16 Float16Regs:$a, CvtNONE_FTZ)>, Requires<[doF32FTZ]>;
+def : Pat<(f32 (fpextend Float16Regs:$a)),
+ (CVT_f32_f16 Float16Regs:$a, CvtNONE)>;
+
+// fpextend f16 -> f64
+def : Pat<(f64 (fpextend Float16Regs:$a)),
+ (CVT_f64_f16 Float16Regs:$a, CvtNONE_FTZ)>, Requires<[doF32FTZ]>;
+def : Pat<(f64 (fpextend Float16Regs:$a)),
+ (CVT_f64_f16 Float16Regs:$a, CvtNONE)>;
+
// fpextend f32 -> f64
def : Pat<(f64 (fpextend Float32Regs:$a)),
(CVT_f64_f32 Float32Regs:$a, CvtNONE_FTZ)>, Requires<[doF32FTZ]>;
@@ -2664,6 +2998,10 @@ def retflag : SDNode<"NVPTXISD::RET_FLAG", SDTNone,
// fceil, ffloor, fround, ftrunc.
+def : Pat<(fceil Float16Regs:$a),
+ (CVT_f16_f16 Float16Regs:$a, CvtRPI_FTZ)>, Requires<[doF32FTZ]>;
+def : Pat<(fceil Float16Regs:$a),
+ (CVT_f16_f16 Float16Regs:$a, CvtRPI)>, Requires<[doNoF32FTZ]>;
def : Pat<(fceil Float32Regs:$a),
(CVT_f32_f32 Float32Regs:$a, CvtRPI_FTZ)>, Requires<[doF32FTZ]>;
def : Pat<(fceil Float32Regs:$a),
@@ -2671,6 +3009,10 @@ def : Pat<(fceil Float32Regs:$a),
def : Pat<(fceil Float64Regs:$a),
(CVT_f64_f64 Float64Regs:$a, CvtRPI)>;
+def : Pat<(ffloor Float16Regs:$a),
+ (CVT_f16_f16 Float16Regs:$a, CvtRMI_FTZ)>, Requires<[doF32FTZ]>;
+def : Pat<(ffloor Float16Regs:$a),
+ (CVT_f16_f16 Float16Regs:$a, CvtRMI)>, Requires<[doNoF32FTZ]>;
def : Pat<(ffloor Float32Regs:$a),
(CVT_f32_f32 Float32Regs:$a, CvtRMI_FTZ)>, Requires<[doF32FTZ]>;
def : Pat<(ffloor Float32Regs:$a),
@@ -2678,6 +3020,10 @@ def : Pat<(ffloor Float32Regs:$a),
def : Pat<(ffloor Float64Regs:$a),
(CVT_f64_f64 Float64Regs:$a, CvtRMI)>;
+def : Pat<(fround Float16Regs:$a),
+ (CVT_f16_f16 Float16Regs:$a, CvtRNI_FTZ)>, Requires<[doF32FTZ]>;
+def : Pat<(f16 (fround Float16Regs:$a)),
+ (CVT_f16_f16 Float16Regs:$a, CvtRNI)>, Requires<[doNoF32FTZ]>;
def : Pat<(fround Float32Regs:$a),
(CVT_f32_f32 Float32Regs:$a, CvtRNI_FTZ)>, Requires<[doF32FTZ]>;
def : Pat<(f32 (fround Float32Regs:$a)),
@@ -2685,6 +3031,10 @@ def : Pat<(f32 (fround Float32Regs:$a)),
def : Pat<(f64 (fround Float64Regs:$a)),
(CVT_f64_f64 Float64Regs:$a, CvtRNI)>;
+def : Pat<(ftrunc Float16Regs:$a),
+ (CVT_f16_f16 Float16Regs:$a, CvtRZI_FTZ)>, Requires<[doF32FTZ]>;
+def : Pat<(ftrunc Float16Regs:$a),
+ (CVT_f16_f16 Float16Regs:$a, CvtRZI)>, Requires<[doNoF32FTZ]>;
def : Pat<(ftrunc Float32Regs:$a),
(CVT_f32_f32 Float32Regs:$a, CvtRZI_FTZ)>, Requires<[doF32FTZ]>;
def : Pat<(ftrunc Float32Regs:$a),
@@ -2696,6 +3046,10 @@ def : Pat<(ftrunc Float64Regs:$a),
// strictly correct, because it causes us to ignore the rounding mode. But it
// matches what CUDA's "libm" does.
+def : Pat<(fnearbyint Float16Regs:$a),
+ (CVT_f16_f16 Float16Regs:$a, CvtRNI_FTZ)>, Requires<[doF32FTZ]>;
+def : Pat<(fnearbyint Float16Regs:$a),
+ (CVT_f16_f16 Float16Regs:$a, CvtRNI)>, Requires<[doNoF32FTZ]>;
def : Pat<(fnearbyint Float32Regs:$a),
(CVT_f32_f32 Float32Regs:$a, CvtRNI_FTZ)>, Requires<[doF32FTZ]>;
def : Pat<(fnearbyint Float32Regs:$a),
@@ -2703,6 +3057,10 @@ def : Pat<(fnearbyint Float32Regs:$a),
def : Pat<(fnearbyint Float64Regs:$a),
(CVT_f64_f64 Float64Regs:$a, CvtRNI)>;
+def : Pat<(frint Float16Regs:$a),
+ (CVT_f16_f16 Float16Regs:$a, CvtRNI_FTZ)>, Requires<[doF32FTZ]>;
+def : Pat<(frint Float16Regs:$a),
+ (CVT_f16_f16 Float16Regs:$a, CvtRNI)>, Requires<[doNoF32FTZ]>;
def : Pat<(frint Float32Regs:$a),
(CVT_f32_f32 Float32Regs:$a, CvtRNI_FTZ)>, Requires<[doF32FTZ]>;
def : Pat<(frint Float32Regs:$a),
diff --git a/lib/Target/NVPTX/NVPTXIntrinsics.td b/lib/Target/NVPTX/NVPTXIntrinsics.td
index b0408f12f5b1..8d228a9eeb74 100644
--- a/lib/Target/NVPTX/NVPTXIntrinsics.td
+++ b/lib/Target/NVPTX/NVPTXIntrinsics.td
@@ -36,33 +36,39 @@ let isConvergent = 1 in {
def INT_BARRIER0 : NVPTXInst<(outs), (ins),
"bar.sync \t0;",
[(int_nvvm_barrier0)]>;
+def INT_BARRIERN : NVPTXInst<(outs), (ins Int32Regs:$src1),
+ "bar.sync \t$src1;",
+ [(int_nvvm_barrier_n Int32Regs:$src1)]>;
+def INT_BARRIER : NVPTXInst<(outs), (ins Int32Regs:$src1, Int32Regs:$src2),
+ "bar.sync \t$src1, $src2;",
+ [(int_nvvm_barrier Int32Regs:$src1, Int32Regs:$src2)]>;
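[Editor's sketch, not part of the diff.] The new defs wire up the bar.sync forms that take a barrier id and an optional thread count in registers. Assuming the usual int_nvvm_* to llvm.nvvm.* name mapping, a hypothetical use would be:

; Assumed declarations, derived from the TableGen intrinsic names above.
define void @sync(i32 %id, i32 %cnt) {
  call void @llvm.nvvm.barrier.n(i32 %id)          ; bar.sync %r
  call void @llvm.nvvm.barrier(i32 %id, i32 %cnt)  ; bar.sync %r1, %r2
  ret void
}
declare void @llvm.nvvm.barrier.n(i32)
declare void @llvm.nvvm.barrier(i32, i32)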
def INT_BARRIER0_POPC : NVPTXInst<(outs Int32Regs:$dst), (ins Int32Regs:$pred),
!strconcat("{{ \n\t",
- !strconcat(".reg .pred \t%p1; \n\t",
- !strconcat("setp.ne.u32 \t%p1, $pred, 0; \n\t",
- !strconcat("bar.red.popc.u32 \t$dst, 0, %p1; \n\t",
- !strconcat("}}", ""))))),
+ ".reg .pred \t%p1; \n\t",
+ "setp.ne.u32 \t%p1, $pred, 0; \n\t",
+ "bar.red.popc.u32 \t$dst, 0, %p1; \n\t",
+ "}}"),
[(set Int32Regs:$dst, (int_nvvm_barrier0_popc Int32Regs:$pred))]>;
def INT_BARRIER0_AND : NVPTXInst<(outs Int32Regs:$dst), (ins Int32Regs:$pred),
!strconcat("{{ \n\t",
- !strconcat(".reg .pred \t%p1; \n\t",
- !strconcat(".reg .pred \t%p2; \n\t",
- !strconcat("setp.ne.u32 \t%p1, $pred, 0; \n\t",
- !strconcat("bar.red.and.pred \t%p2, 0, %p1; \n\t",
- !strconcat("selp.u32 \t$dst, 1, 0, %p2; \n\t",
- !strconcat("}}", ""))))))),
+ ".reg .pred \t%p1; \n\t",
+ ".reg .pred \t%p2; \n\t",
+ "setp.ne.u32 \t%p1, $pred, 0; \n\t",
+ "bar.red.and.pred \t%p2, 0, %p1; \n\t",
+ "selp.u32 \t$dst, 1, 0, %p2; \n\t",
+ "}}"),
[(set Int32Regs:$dst, (int_nvvm_barrier0_and Int32Regs:$pred))]>;
def INT_BARRIER0_OR : NVPTXInst<(outs Int32Regs:$dst), (ins Int32Regs:$pred),
!strconcat("{{ \n\t",
- !strconcat(".reg .pred \t%p1; \n\t",
- !strconcat(".reg .pred \t%p2; \n\t",
- !strconcat("setp.ne.u32 \t%p1, $pred, 0; \n\t",
- !strconcat("bar.red.or.pred \t%p2, 0, %p1; \n\t",
- !strconcat("selp.u32 \t$dst, 1, 0, %p2; \n\t",
- !strconcat("}}", ""))))))),
+ ".reg .pred \t%p1; \n\t",
+ ".reg .pred \t%p2; \n\t",
+ "setp.ne.u32 \t%p1, $pred, 0; \n\t",
+ "bar.red.or.pred \t%p2, 0, %p1; \n\t",
+ "selp.u32 \t$dst, 1, 0, %p2; \n\t",
+ "}}"),
[(set Int32Regs:$dst, (int_nvvm_barrier0_or Int32Regs:$pred))]>;
-def INT_BAR_SYNC : NVPTXInst<(outs), (ins i32imm:$i), "bar.sync\t$i;",
+def INT_BAR_SYNC : NVPTXInst<(outs), (ins i32imm:$i), "bar.sync \t$i;",
[(int_nvvm_bar_sync imm:$i)]>;
// shfl.{up,down,bfly,idx}.b32
@@ -187,16 +193,6 @@ class F_MATH_3<string OpcStr, NVPTXRegClass t_regclass,
// MISC
//
-def INT_NVVM_CLZ_I : F_MATH_1<"clz.b32 \t$dst, $src0;", Int32Regs, Int32Regs,
- int_nvvm_clz_i>;
-def INT_NVVM_CLZ_LL : F_MATH_1<"clz.b64 \t$dst, $src0;", Int32Regs, Int64Regs,
- int_nvvm_clz_ll>;
-
-def INT_NVVM_POPC_I : F_MATH_1<"popc.b32 \t$dst, $src0;", Int32Regs, Int32Regs,
- int_nvvm_popc_i>;
-def INT_NVVM_POPC_LL : F_MATH_1<"popc.b64 \t$dst, $src0;", Int32Regs, Int64Regs,
- int_nvvm_popc_ll>;
-
def INT_NVVM_PRMT : F_MATH_3<"prmt.b32 \t$dst, $src0, $src1, $src2;", Int32Regs,
Int32Regs, Int32Regs, Int32Regs, int_nvvm_prmt>;
@@ -204,26 +200,6 @@ def INT_NVVM_PRMT : F_MATH_3<"prmt.b32 \t$dst, $src0, $src1, $src2;", Int32Regs,
// Min Max
//
-def INT_NVVM_MIN_I : F_MATH_2<"min.s32 \t$dst, $src0, $src1;", Int32Regs,
- Int32Regs, Int32Regs, int_nvvm_min_i>;
-def INT_NVVM_MIN_UI : F_MATH_2<"min.u32 \t$dst, $src0, $src1;", Int32Regs,
- Int32Regs, Int32Regs, int_nvvm_min_ui>;
-
-def INT_NVVM_MIN_LL : F_MATH_2<"min.s64 \t$dst, $src0, $src1;", Int64Regs,
- Int64Regs, Int64Regs, int_nvvm_min_ll>;
-def INT_NVVM_MIN_ULL : F_MATH_2<"min.u64 \t$dst, $src0, $src1;", Int64Regs,
- Int64Regs, Int64Regs, int_nvvm_min_ull>;
-
-def INT_NVVM_MAX_I : F_MATH_2<"max.s32 \t$dst, $src0, $src1;", Int32Regs,
- Int32Regs, Int32Regs, int_nvvm_max_i>;
-def INT_NVVM_MAX_UI : F_MATH_2<"max.u32 \t$dst, $src0, $src1;", Int32Regs,
- Int32Regs, Int32Regs, int_nvvm_max_ui>;
-
-def INT_NVVM_MAX_LL : F_MATH_2<"max.s64 \t$dst, $src0, $src1;", Int64Regs,
- Int64Regs, Int64Regs, int_nvvm_max_ll>;
-def INT_NVVM_MAX_ULL : F_MATH_2<"max.u64 \t$dst, $src0, $src1;", Int64Regs,
- Int64Regs, Int64Regs, int_nvvm_max_ull>;
-
def INT_NVVM_FMIN_F : F_MATH_2<"min.f32 \t$dst, $src0, $src1;", Float32Regs,
Float32Regs, Float32Regs, int_nvvm_fmin_f>;
def INT_NVVM_FMIN_FTZ_F : F_MATH_2<"min.ftz.f32 \t$dst, $src0, $src1;",
@@ -239,6 +215,7 @@ def INT_NVVM_FMIN_D : F_MATH_2<"min.f64 \t$dst, $src0, $src1;", Float64Regs,
def INT_NVVM_FMAX_D : F_MATH_2<"max.f64 \t$dst, $src0, $src1;", Float64Regs,
Float64Regs, Float64Regs, int_nvvm_fmax_d>;
+
//
// Multiplication
//
@@ -321,15 +298,6 @@ def INT_NVVM_DIV_RP_D : F_MATH_2<"div.rp.f64 \t$dst, $src0, $src1;",
Float64Regs, Float64Regs, Float64Regs, int_nvvm_div_rp_d>;
//
-// Brev
-//
-
-def INT_NVVM_BREV32 : F_MATH_1<"brev.b32 \t$dst, $src0;", Int32Regs, Int32Regs,
- int_nvvm_brev32>;
-def INT_NVVM_BREV64 : F_MATH_1<"brev.b64 \t$dst, $src0;", Int64Regs, Int64Regs,
- int_nvvm_brev64>;
-
-//
// Sad
//
@@ -360,11 +328,6 @@ def : Pat<(int_nvvm_ceil_d Float64Regs:$a),
// Abs
//
-def INT_NVVM_ABS_I : F_MATH_1<"abs.s32 \t$dst, $src0;", Int32Regs, Int32Regs,
- int_nvvm_abs_i>;
-def INT_NVVM_ABS_LL : F_MATH_1<"abs.s64 \t$dst, $src0;", Int64Regs, Int64Regs,
- int_nvvm_abs_ll>;
-
def INT_NVVM_FABS_FTZ_F : F_MATH_1<"abs.ftz.f32 \t$dst, $src0;", Float32Regs,
Float32Regs, int_nvvm_fabs_ftz_f>;
def INT_NVVM_FABS_F : F_MATH_1<"abs.f32 \t$dst, $src0;", Float32Regs,
@@ -703,16 +666,18 @@ def : Pat<(int_nvvm_ui2f_rp Int32Regs:$a),
def INT_NVVM_LOHI_I2D : F_MATH_2<"mov.b64 \t$dst, {{$src0, $src1}};",
Float64Regs, Int32Regs, Int32Regs, int_nvvm_lohi_i2d>;
-def INT_NVVM_D2I_LO : F_MATH_1<!strconcat("{{\n\t",
- !strconcat(".reg .b32 %temp; \n\t",
- !strconcat("mov.b64 \t{$dst, %temp}, $src0;\n\t",
- "}}"))),
- Int32Regs, Float64Regs, int_nvvm_d2i_lo>;
-def INT_NVVM_D2I_HI : F_MATH_1<!strconcat("{{\n\t",
- !strconcat(".reg .b32 %temp; \n\t",
- !strconcat("mov.b64 \t{%temp, $dst}, $src0;\n\t",
- "}}"))),
- Int32Regs, Float64Regs, int_nvvm_d2i_hi>;
+def INT_NVVM_D2I_LO : F_MATH_1<
+ !strconcat("{{\n\t",
+ ".reg .b32 %temp; \n\t",
+ "mov.b64 \t{$dst, %temp}, $src0;\n\t",
+ "}}"),
+ Int32Regs, Float64Regs, int_nvvm_d2i_lo>;
+def INT_NVVM_D2I_HI : F_MATH_1<
+ !strconcat("{{\n\t",
+ ".reg .b32 %temp; \n\t",
+ "mov.b64 \t{%temp, $dst}, $src0;\n\t",
+ "}}"),
+ Int32Regs, Float64Regs, int_nvvm_d2i_hi>;
def : Pat<(int_nvvm_f2ll_rn_ftz Float32Regs:$a),
(CVT_s64_f32 Float32Regs:$a, CvtRNI_FTZ)>;
@@ -803,49 +768,10 @@ def : Pat<(int_nvvm_ull2d_rp Int64Regs:$a),
(CVT_f64_u64 Int64Regs:$a, CvtRP)>;
-// FIXME: Ideally, we could use these patterns instead of the scope-creating
-// patterns, but ptxas does not like these since .s16 is not compatible with
-// .f16. The solution is to use .bXX for all integer register types, but we
-// are not there yet.
-//def : Pat<(int_nvvm_f2h_rn_ftz Float32Regs:$a),
-// (CVT_f16_f32 Float32Regs:$a, CvtRN_FTZ)>;
-//def : Pat<(int_nvvm_f2h_rn Float32Regs:$a),
-// (CVT_f16_f32 Float32Regs:$a, CvtRN)>;
-//
-//def : Pat<(int_nvvm_h2f Int16Regs:$a),
-// (CVT_f32_f16 Int16Regs:$a, CvtNONE)>;
-
-def INT_NVVM_F2H_RN_FTZ : F_MATH_1<!strconcat("{{\n\t",
- !strconcat(".reg .b16 %temp;\n\t",
- !strconcat("cvt.rn.ftz.f16.f32 \t%temp, $src0;\n\t",
- !strconcat("mov.b16 \t$dst, %temp;\n",
- "}}")))),
- Int16Regs, Float32Regs, int_nvvm_f2h_rn_ftz>;
-def INT_NVVM_F2H_RN : F_MATH_1<!strconcat("{{\n\t",
- !strconcat(".reg .b16 %temp;\n\t",
- !strconcat("cvt.rn.f16.f32 \t%temp, $src0;\n\t",
- !strconcat("mov.b16 \t$dst, %temp;\n",
- "}}")))),
- Int16Regs, Float32Regs, int_nvvm_f2h_rn>;
-
-def INT_NVVM_H2F : F_MATH_1<!strconcat("{{\n\t",
- !strconcat(".reg .b16 %temp;\n\t",
- !strconcat("mov.b16 \t%temp, $src0;\n\t",
- !strconcat("cvt.f32.f16 \t$dst, %temp;\n\t",
- "}}")))),
- Float32Regs, Int16Regs, int_nvvm_h2f>;
-
-def : Pat<(f32 (f16_to_fp Int16Regs:$a)),
- (CVT_f32_f16 Int16Regs:$a, CvtNONE)>;
-def : Pat<(i16 (fp_to_f16 Float32Regs:$a)),
- (CVT_f16_f32 Float32Regs:$a, CvtRN_FTZ)>, Requires<[doF32FTZ]>;
-def : Pat<(i16 (fp_to_f16 Float32Regs:$a)),
- (CVT_f16_f32 Float32Regs:$a, CvtRN)>;
-
-def : Pat<(f64 (f16_to_fp Int16Regs:$a)),
- (CVT_f64_f16 Int16Regs:$a, CvtNONE)>;
-def : Pat<(i16 (fp_to_f16 Float64Regs:$a)),
- (CVT_f16_f64 Float64Regs:$a, CvtRN)>;
+def : Pat<(int_nvvm_f2h_rn_ftz Float32Regs:$a),
+ (BITCONVERT_16_F2I (CVT_f16_f32 Float32Regs:$a, CvtRN_FTZ))>;
+def : Pat<(int_nvvm_f2h_rn Float32Regs:$a),
+ (BITCONVERT_16_F2I (CVT_f16_f32 Float32Regs:$a, CvtRN))>;
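[Editor's sketch, not part of the diff.] With a real f16 register class, the f2h intrinsics no longer need the scoped mov.b16 sequence deleted above: the conversion lands in an f16 register and BITCONVERT_16_F2I moves the bits into the i16 result. A hedged example of an input that selects the new pattern:

; Assumed example; expected PTX: cvt.rn.f16.f32 into an f16 register,
; then mov.b16 into the i16 return register.
define i16 @f2h(float %x) {
  %h = call i16 @llvm.nvvm.f2h.rn(float %x)
  ret i16 %h
}
declare i16 @llvm.nvvm.f2h.rn(float)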
//
// Bitcast
@@ -882,20 +808,12 @@ multiclass F_ATOMIC_2_imp<NVPTXRegClass ptrclass, NVPTXRegClass regclass,
string SpaceStr, string TypeStr, string OpcStr, PatFrag IntOp,
Operand IMMType, SDNode IMM, Predicate Pred> {
def reg : NVPTXInst<(outs regclass:$dst), (ins ptrclass:$addr, regclass:$b),
- !strconcat("atom",
- !strconcat(SpaceStr,
- !strconcat(OpcStr,
- !strconcat(TypeStr,
- !strconcat(" \t$dst, [$addr], $b;", ""))))),
- [(set regclass:$dst, (IntOp ptrclass:$addr, regclass:$b))]>,
+ !strconcat("atom", SpaceStr, OpcStr, TypeStr, " \t$dst, [$addr], $b;"),
+ [(set regclass:$dst, (IntOp ptrclass:$addr, regclass:$b))]>,
Requires<[Pred]>;
def imm : NVPTXInst<(outs regclass:$dst), (ins ptrclass:$addr, IMMType:$b),
- !strconcat("atom",
- !strconcat(SpaceStr,
- !strconcat(OpcStr,
- !strconcat(TypeStr,
- !strconcat(" \t$dst, [$addr], $b;", ""))))),
- [(set regclass:$dst, (IntOp ptrclass:$addr, IMM:$b))]>,
+ !strconcat("atom", SpaceStr, OpcStr, TypeStr, " \t$dst, [$addr], $b;", ""),
+ [(set regclass:$dst, (IntOp ptrclass:$addr, IMM:$b))]>,
Requires<[Pred]>;
}
multiclass F_ATOMIC_2<NVPTXRegClass regclass, string SpaceStr, string TypeStr,
@@ -911,21 +829,13 @@ multiclass F_ATOMIC_2_NEG_imp<NVPTXRegClass ptrclass, NVPTXRegClass regclass,
string SpaceStr, string TypeStr, string OpcStr, PatFrag IntOp,
Operand IMMType, Predicate Pred> {
def reg : NVPTXInst<(outs regclass:$dst), (ins ptrclass:$addr, regclass:$b),
- !strconcat("{{ \n\t",
- !strconcat(".reg \t.s",
- !strconcat(TypeStr,
- !strconcat(" temp; \n\t",
- !strconcat("neg.s",
- !strconcat(TypeStr,
- !strconcat(" \ttemp, $b; \n\t",
- !strconcat("atom",
- !strconcat(SpaceStr,
- !strconcat(OpcStr,
- !strconcat(".u",
- !strconcat(TypeStr,
- !strconcat(" \t$dst, [$addr], temp; \n\t",
- !strconcat("}}", "")))))))))))))),
- [(set regclass:$dst, (IntOp ptrclass:$addr, regclass:$b))]>,
+ !strconcat(
+ "{{ \n\t",
+ ".reg \t.s", TypeStr, " temp; \n\t",
+ "neg.s", TypeStr, " \ttemp, $b; \n\t",
+ "atom", SpaceStr, OpcStr, ".u", TypeStr, " \t$dst, [$addr], temp; \n\t",
+ "}}"),
+ [(set regclass:$dst, (IntOp ptrclass:$addr, regclass:$b))]>,
Requires<[Pred]>;
}
multiclass F_ATOMIC_2_NEG<NVPTXRegClass regclass, string SpaceStr,
@@ -943,40 +853,26 @@ multiclass F_ATOMIC_3_imp<NVPTXRegClass ptrclass, NVPTXRegClass regclass,
Operand IMMType, Predicate Pred> {
def reg : NVPTXInst<(outs regclass:$dst),
(ins ptrclass:$addr, regclass:$b, regclass:$c),
- !strconcat("atom",
- !strconcat(SpaceStr,
- !strconcat(OpcStr,
- !strconcat(TypeStr,
- !strconcat(" \t$dst, [$addr], $b, $c;", ""))))),
- [(set regclass:$dst,
- (IntOp ptrclass:$addr, regclass:$b, regclass:$c))]>,
- Requires<[Pred]>;
+ !strconcat("atom", SpaceStr, OpcStr, TypeStr, " \t$dst, [$addr], $b, $c;"),
+ [(set regclass:$dst, (IntOp ptrclass:$addr, regclass:$b, regclass:$c))]>,
+ Requires<[Pred]>;
+
def imm1 : NVPTXInst<(outs regclass:$dst),
(ins ptrclass:$addr, IMMType:$b, regclass:$c),
- !strconcat("atom",
- !strconcat(SpaceStr,
- !strconcat(OpcStr,
- !strconcat(TypeStr,
- !strconcat(" \t$dst, [$addr], $b, $c;", ""))))),
- [(set regclass:$dst, (IntOp ptrclass:$addr, imm:$b, regclass:$c))]>,
+ !strconcat("atom", SpaceStr, OpcStr, TypeStr, " \t$dst, [$addr], $b, $c;"),
+ [(set regclass:$dst, (IntOp ptrclass:$addr, imm:$b, regclass:$c))]>,
Requires<[Pred]>;
+
def imm2 : NVPTXInst<(outs regclass:$dst),
(ins ptrclass:$addr, regclass:$b, IMMType:$c),
- !strconcat("atom",
- !strconcat(SpaceStr,
- !strconcat(OpcStr,
- !strconcat(TypeStr,
- !strconcat(" \t$dst, [$addr], $b, $c;", ""))))),
- [(set regclass:$dst, (IntOp ptrclass:$addr, regclass:$b, imm:$c))]>,
+ !strconcat("atom", SpaceStr, OpcStr, TypeStr, " \t$dst, [$addr], $b, $c;", ""),
+ [(set regclass:$dst, (IntOp ptrclass:$addr, regclass:$b, imm:$c))]>,
Requires<[Pred]>;
+
def imm3 : NVPTXInst<(outs regclass:$dst),
(ins ptrclass:$addr, IMMType:$b, IMMType:$c),
- !strconcat("atom",
- !strconcat(SpaceStr,
- !strconcat(OpcStr,
- !strconcat(TypeStr,
- !strconcat(" \t$dst, [$addr], $b, $c;", ""))))),
- [(set regclass:$dst, (IntOp ptrclass:$addr, imm:$b, imm:$c))]>,
+ !strconcat("atom", SpaceStr, OpcStr, TypeStr, " \t$dst, [$addr], $b, $c;"),
+ [(set regclass:$dst, (IntOp ptrclass:$addr, imm:$b, imm:$c))]>,
Requires<[Pred]>;
}
multiclass F_ATOMIC_3<NVPTXRegClass regclass, string SpaceStr, string TypeStr,
@@ -1607,6 +1503,8 @@ defm INT_PTX_LDU_GLOBAL_i8 : LDU_G<"u8 \t$result, [$src];", Int16Regs>;
defm INT_PTX_LDU_GLOBAL_i16 : LDU_G<"u16 \t$result, [$src];", Int16Regs>;
defm INT_PTX_LDU_GLOBAL_i32 : LDU_G<"u32 \t$result, [$src];", Int32Regs>;
defm INT_PTX_LDU_GLOBAL_i64 : LDU_G<"u64 \t$result, [$src];", Int64Regs>;
+defm INT_PTX_LDU_GLOBAL_f16 : LDU_G<"b16 \t$result, [$src];", Float16Regs>;
+defm INT_PTX_LDU_GLOBAL_f16x2 : LDU_G<"b32 \t$result, [$src];", Float16x2Regs>;
defm INT_PTX_LDU_GLOBAL_f32 : LDU_G<"f32 \t$result, [$src];", Float32Regs>;
defm INT_PTX_LDU_GLOBAL_f64 : LDU_G<"f64 \t$result, [$src];", Float64Regs>;
defm INT_PTX_LDU_GLOBAL_p32 : LDU_G<"u32 \t$result, [$src];", Int32Regs>;
@@ -1657,6 +1555,10 @@ defm INT_PTX_LDU_G_v2i16_ELE
: VLDU_G_ELE_V2<"v2.u16 \t{{$dst1, $dst2}}, [$src];", Int16Regs>;
defm INT_PTX_LDU_G_v2i32_ELE
: VLDU_G_ELE_V2<"v2.u32 \t{{$dst1, $dst2}}, [$src];", Int32Regs>;
+defm INT_PTX_LDU_G_v2f16_ELE
+ : VLDU_G_ELE_V2<"v2.b16 \t{{$dst1, $dst2}}, [$src];", Float16Regs>;
+defm INT_PTX_LDU_G_v2f16x2_ELE
+ : VLDU_G_ELE_V2<"v2.b32 \t{{$dst1, $dst2}}, [$src];", Float16x2Regs>;
defm INT_PTX_LDU_G_v2f32_ELE
: VLDU_G_ELE_V2<"v2.f32 \t{{$dst1, $dst2}}, [$src];", Float32Regs>;
defm INT_PTX_LDU_G_v2i64_ELE
@@ -1671,6 +1573,12 @@ defm INT_PTX_LDU_G_v4i16_ELE
defm INT_PTX_LDU_G_v4i32_ELE
: VLDU_G_ELE_V4<"v4.u32 \t{{$dst1, $dst2, $dst3, $dst4}}, [$src];",
Int32Regs>;
+defm INT_PTX_LDU_G_v4f16_ELE
+ : VLDU_G_ELE_V4<"v4.b16 \t{{$dst1, $dst2, $dst3, $dst4}}, [$src];",
+ Float16Regs>;
+defm INT_PTX_LDU_G_v4f16x2_ELE
+ : VLDU_G_ELE_V4<"v4.b32 \t{{$dst1, $dst2, $dst3, $dst4}}, [$src];",
+ Float16x2Regs>;
defm INT_PTX_LDU_G_v4f32_ELE
: VLDU_G_ELE_V4<"v4.f32 \t{{$dst1, $dst2, $dst3, $dst4}}, [$src];",
Float32Regs>;
@@ -1710,6 +1618,10 @@ defm INT_PTX_LDG_GLOBAL_i32
: LDG_G<"u32 \t$result, [$src];", Int32Regs>;
defm INT_PTX_LDG_GLOBAL_i64
: LDG_G<"u64 \t$result, [$src];", Int64Regs>;
+defm INT_PTX_LDG_GLOBAL_f16
+ : LDG_G<"b16 \t$result, [$src];", Float16Regs>;
+defm INT_PTX_LDG_GLOBAL_f16x2
+ : LDG_G<"b32 \t$result, [$src];", Float16x2Regs>;
defm INT_PTX_LDG_GLOBAL_f32
: LDG_G<"f32 \t$result, [$src];", Float32Regs>;
defm INT_PTX_LDG_GLOBAL_f64
@@ -1765,6 +1677,10 @@ defm INT_PTX_LDG_G_v2i16_ELE
: VLDG_G_ELE_V2<"v2.u16 \t{{$dst1, $dst2}}, [$src];", Int16Regs>;
defm INT_PTX_LDG_G_v2i32_ELE
: VLDG_G_ELE_V2<"v2.u32 \t{{$dst1, $dst2}}, [$src];", Int32Regs>;
+defm INT_PTX_LDG_G_v2f16_ELE
+ : VLDG_G_ELE_V2<"v2.b16 \t{{$dst1, $dst2}}, [$src];", Float16Regs>;
+defm INT_PTX_LDG_G_v2f16x2_ELE
+ : VLDG_G_ELE_V2<"v2.b32 \t{{$dst1, $dst2}}, [$src];", Float16x2Regs>;
defm INT_PTX_LDG_G_v2f32_ELE
: VLDG_G_ELE_V2<"v2.f32 \t{{$dst1, $dst2}}, [$src];", Float32Regs>;
defm INT_PTX_LDG_G_v2i64_ELE
@@ -1777,17 +1693,21 @@ defm INT_PTX_LDG_G_v4i16_ELE
: VLDG_G_ELE_V4<"v4.u16 \t{{$dst1, $dst2, $dst3, $dst4}}, [$src];", Int16Regs>;
defm INT_PTX_LDG_G_v4i32_ELE
: VLDG_G_ELE_V4<"v4.u32 \t{{$dst1, $dst2, $dst3, $dst4}}, [$src];", Int32Regs>;
+defm INT_PTX_LDG_G_v4f16_ELE
+ : VLDG_G_ELE_V4<"v4.b16 \t{{$dst1, $dst2, $dst3, $dst4}}, [$src];", Float16Regs>;
+defm INT_PTX_LDG_G_v4f16x2_ELE
+ : VLDG_G_ELE_V4<"v4.b32 \t{{$dst1, $dst2, $dst3, $dst4}}, [$src];", Float16x2Regs>;
defm INT_PTX_LDG_G_v4f32_ELE
: VLDG_G_ELE_V4<"v4.f32 \t{{$dst1, $dst2, $dst3, $dst4}}, [$src];", Float32Regs>;
multiclass NG_TO_G<string Str, Intrinsic Intrin> {
def _yes : NVPTXInst<(outs Int32Regs:$result), (ins Int32Regs:$src),
- !strconcat("cvta.", !strconcat(Str, ".u32 \t$result, $src;")),
+ !strconcat("cvta.", Str, ".u32 \t$result, $src;"),
[(set Int32Regs:$result, (Intrin Int32Regs:$src))]>,
Requires<[hasGenericLdSt]>;
def _yes_64 : NVPTXInst<(outs Int64Regs:$result), (ins Int64Regs:$src),
- !strconcat("cvta.", !strconcat(Str, ".u64 \t$result, $src;")),
+ !strconcat("cvta.", Str, ".u64 \t$result, $src;"),
[(set Int64Regs:$result, (Intrin Int64Regs:$src))]>,
Requires<[hasGenericLdSt]>;
@@ -1821,11 +1741,11 @@ multiclass NG_TO_G<string Str, Intrinsic Intrin> {
multiclass G_TO_NG<string Str, Intrinsic Intrin> {
def _yes : NVPTXInst<(outs Int32Regs:$result), (ins Int32Regs:$src),
- !strconcat("cvta.to.", !strconcat(Str, ".u32 \t$result, $src;")),
+ !strconcat("cvta.to.", Str, ".u32 \t$result, $src;"),
[(set Int32Regs:$result, (Intrin Int32Regs:$src))]>,
Requires<[hasGenericLdSt]>;
def _yes_64 : NVPTXInst<(outs Int64Regs:$result), (ins Int64Regs:$src),
- !strconcat("cvta.to.", !strconcat(Str, ".u64 \t$result, $src;")),
+ !strconcat("cvta.to.", Str, ".u64 \t$result, $src;"),
[(set Int64Regs:$result, (Intrin Int64Regs:$src))]>,
Requires<[hasGenericLdSt]>;
def _no : NVPTXInst<(outs Int32Regs:$result), (ins Int32Regs:$src),
@@ -1983,7 +1903,7 @@ def ISSPACEP_SHARED_64
// Special register reads
def MOV_SPECIAL : NVPTXInst<(outs Int32Regs:$d),
(ins SpecialRegs:$r),
- "mov.b32\t$d, $r;", []>;
+ "mov.b32 \t$d, $r;", []>;
def : Pat<(int_nvvm_read_ptx_sreg_envreg0), (MOV_SPECIAL ENVREG0)>;
def : Pat<(int_nvvm_read_ptx_sreg_envreg1), (MOV_SPECIAL ENVREG1)>;
@@ -2046,20 +1966,18 @@ def : Pat<(int_nvvm_rotate_b32 Int32Regs:$src, Int32Regs:$amt),
Requires<[noHWROT32]> ;
let hasSideEffects = 0 in {
- def GET_LO_INT64
- : NVPTXInst<(outs Int32Regs:$dst), (ins Int64Regs:$src),
- !strconcat("{{\n\t",
- !strconcat(".reg .b32 %dummy;\n\t",
- !strconcat("mov.b64 \t{$dst,%dummy}, $src;\n\t",
- !strconcat("}}", "")))),
+ def GET_LO_INT64 : NVPTXInst<(outs Int32Regs:$dst), (ins Int64Regs:$src),
+ !strconcat("{{\n\t",
+ ".reg .b32 %dummy;\n\t",
+ "mov.b64 \t{$dst,%dummy}, $src;\n\t",
+ "}}"),
[]> ;
- def GET_HI_INT64
- : NVPTXInst<(outs Int32Regs:$dst), (ins Int64Regs:$src),
- !strconcat("{{\n\t",
- !strconcat(".reg .b32 %dummy;\n\t",
- !strconcat("mov.b64 \t{%dummy,$dst}, $src;\n\t",
- !strconcat("}}", "")))),
+ def GET_HI_INT64 : NVPTXInst<(outs Int32Regs:$dst), (ins Int64Regs:$src),
+ !strconcat("{{\n\t",
+ ".reg .b32 %dummy;\n\t",
+ "mov.b64 \t{%dummy,$dst}, $src;\n\t",
+ "}}"),
[]> ;
}
@@ -2164,19 +2082,19 @@ def TEX_1D_F32_S32
: NVPTXInst<(outs Float32Regs:$r, Float32Regs:$g,
Float32Regs:$b, Float32Regs:$a),
(ins Int64Regs:$t, Int64Regs:$s, Int32Regs:$x),
- "tex.1d.v4.f32.s32\t\\{$r, $g, $b, $a\\}, [$t, $s, \\{$x\\}];",
+ "tex.1d.v4.f32.s32 \t\\{$r, $g, $b, $a\\}, [$t, $s, \\{$x\\}];",
[]>;
def TEX_1D_F32_F32
: NVPTXInst<(outs Float32Regs:$r, Float32Regs:$g,
Float32Regs:$b, Float32Regs:$a),
(ins Int64Regs:$t, Int64Regs:$s, Float32Regs:$x),
- "tex.1d.v4.f32.f32\t\\{$r, $g, $b, $a\\}, [$t, $s, \\{$x\\}];",
+ "tex.1d.v4.f32.f32 \t\\{$r, $g, $b, $a\\}, [$t, $s, \\{$x\\}];",
[]>;
def TEX_1D_F32_F32_LEVEL
: NVPTXInst<(outs Float32Regs:$r, Float32Regs:$g,
Float32Regs:$b, Float32Regs:$a),
(ins Int64Regs:$t, Int64Regs:$s, Float32Regs:$x, Float32Regs:$lod),
- "tex.level.1d.v4.f32.f32\t\\{$r, $g, $b, $a\\}, "
+ "tex.level.1d.v4.f32.f32 \t\\{$r, $g, $b, $a\\}, "
"[$t, $s, \\{$x\\}], $lod;",
[]>;
def TEX_1D_F32_F32_GRAD
@@ -2184,27 +2102,27 @@ def TEX_1D_F32_F32_GRAD
Float32Regs:$b, Float32Regs:$a),
(ins Int64Regs:$t, Int64Regs:$s, Float32Regs:$x,
Float32Regs:$gradx, Float32Regs:$grady),
- "tex.grad.1d.v4.f32.f32\t\\{$r, $g, $b, $a\\}, "
+ "tex.grad.1d.v4.f32.f32 \t\\{$r, $g, $b, $a\\}, "
"[$t, $s, \\{$x\\}], \\{$gradx\\}, \\{$grady\\};",
[]>;
def TEX_1D_S32_S32
: NVPTXInst<(outs Int32Regs:$r, Int32Regs:$g,
Int32Regs:$b, Int32Regs:$a),
(ins Int64Regs:$t, Int64Regs:$s, Int32Regs:$x),
- "tex.1d.v4.s32.s32\t\\{$r, $g, $b, $a\\}, [$t, $s, \\{$x\\}];",
+ "tex.1d.v4.s32.s32 \t\\{$r, $g, $b, $a\\}, [$t, $s, \\{$x\\}];",
[]>;
def TEX_1D_S32_F32
: NVPTXInst<(outs Int32Regs:$r, Int32Regs:$g,
Int32Regs:$b, Int32Regs:$a),
(ins Int64Regs:$t, Int64Regs:$s, Float32Regs:$x),
- "tex.1d.v4.s32.f32\t\\{$r, $g, $b, $a\\}, [$t, $s, \\{$x\\}];",
+ "tex.1d.v4.s32.f32 \t\\{$r, $g, $b, $a\\}, [$t, $s, \\{$x\\}];",
[]>;
def TEX_1D_S32_F32_LEVEL
: NVPTXInst<(outs Int32Regs:$r, Int32Regs:$g,
Int32Regs:$b, Int32Regs:$a),
(ins Int64Regs:$t, Int64Regs:$s, Float32Regs:$x,
Float32Regs:$lod),
- "tex.level.1d.v4.s32.f32\t\\{$r, $g, $b, $a\\}, "
+ "tex.level.1d.v4.s32.f32 \t\\{$r, $g, $b, $a\\}, "
"[$t, $s, \\{$x\\}], $lod;",
[]>;
def TEX_1D_S32_F32_GRAD
@@ -2212,27 +2130,27 @@ def TEX_1D_S32_F32_GRAD
Int32Regs:$b, Int32Regs:$a),
(ins Int64Regs:$t, Int64Regs:$s, Float32Regs:$x,
Float32Regs:$gradx, Float32Regs:$grady),
- "tex.grad.1d.v4.s32.f32\t\\{$r, $g, $b, $a\\}, "
+ "tex.grad.1d.v4.s32.f32 \t\\{$r, $g, $b, $a\\}, "
"[$t, $s, \\{$x\\}], \\{$gradx\\}, \\{$grady\\};",
[]>;
def TEX_1D_U32_S32
: NVPTXInst<(outs Int32Regs:$r, Int32Regs:$g,
Int32Regs:$b, Int32Regs:$a),
(ins Int64Regs:$t, Int64Regs:$s, Int32Regs:$x),
- "tex.1d.v4.u32.s32\t\\{$r, $g, $b, $a\\}, [$t, $s, \\{$x\\}];",
+ "tex.1d.v4.u32.s32 \t\\{$r, $g, $b, $a\\}, [$t, $s, \\{$x\\}];",
[]>;
def TEX_1D_U32_F32
: NVPTXInst<(outs Int32Regs:$r, Int32Regs:$g,
Int32Regs:$b, Int32Regs:$a),
(ins Int64Regs:$t, Int64Regs:$s, Float32Regs:$x),
- "tex.1d.v4.u32.f32\t\\{$r, $g, $b, $a\\}, [$t, $s, \\{$x\\}];",
+ "tex.1d.v4.u32.f32 \t\\{$r, $g, $b, $a\\}, [$t, $s, \\{$x\\}];",
[]>;
def TEX_1D_U32_F32_LEVEL
: NVPTXInst<(outs Int32Regs:$r, Int32Regs:$g,
Int32Regs:$b, Int32Regs:$a),
(ins Int64Regs:$t, Int64Regs:$s, Float32Regs:$x,
Float32Regs:$lod),
- "tex.level.1d.v4.u32.f32\t\\{$r, $g, $b, $a\\}, "
+ "tex.level.1d.v4.u32.f32 \t\\{$r, $g, $b, $a\\}, "
"[$t, $s, \\{$x\\}], $lod;",
[]>;
def TEX_1D_U32_F32_GRAD
@@ -2240,7 +2158,7 @@ def TEX_1D_U32_F32_GRAD
Int32Regs:$b, Int32Regs:$a),
(ins Int64Regs:$t, Int64Regs:$s, Float32Regs:$x,
Float32Regs:$gradx, Float32Regs:$grady),
- "tex.grad.1d.v4.u32.f32\t\\{$r, $g, $b, $a\\}, "
+ "tex.grad.1d.v4.u32.f32 \t\\{$r, $g, $b, $a\\}, "
"[$t, $s, \\{$x\\}], \\{$gradx\\}, \\{$grady\\};",
[]>;
@@ -2248,14 +2166,14 @@ def TEX_1D_ARRAY_F32_S32
: NVPTXInst<(outs Float32Regs:$r, Float32Regs:$g,
Float32Regs:$b, Float32Regs:$a),
(ins Int64Regs:$t, Int64Regs:$s, Int32Regs:$l, Int32Regs:$x),
- "tex.a1d.v4.f32.s32\t\\{$r, $g, $b, $a\\}, "
+ "tex.a1d.v4.f32.s32 \t\\{$r, $g, $b, $a\\}, "
"[$t, $s, \\{$l, $x\\}];",
[]>;
def TEX_1D_ARRAY_F32_F32
: NVPTXInst<(outs Float32Regs:$r, Float32Regs:$g,
Float32Regs:$b, Float32Regs:$a),
(ins Int64Regs:$t, Int64Regs:$s, Int32Regs:$l, Float32Regs:$x),
- "tex.a1d.v4.f32.f32\t\\{$r, $g, $b, $a\\}, "
+ "tex.a1d.v4.f32.f32 \t\\{$r, $g, $b, $a\\}, "
"[$t, $s, \\{$l, $x\\}];",
[]>;
def TEX_1D_ARRAY_F32_F32_LEVEL
@@ -2263,7 +2181,7 @@ def TEX_1D_ARRAY_F32_F32_LEVEL
Float32Regs:$b, Float32Regs:$a),
(ins Int64Regs:$t, Int64Regs:$s, Int32Regs:$l, Float32Regs:$x,
Float32Regs:$lod),
- "tex.level.a1d.v4.f32.f32\t\\{$r, $g, $b, $a\\}, "
+ "tex.level.a1d.v4.f32.f32 \t\\{$r, $g, $b, $a\\}, "
"[$t, $s, \\{$l, $x\\}], $lod;",
[]>;
def TEX_1D_ARRAY_F32_F32_GRAD
@@ -2271,21 +2189,21 @@ def TEX_1D_ARRAY_F32_F32_GRAD
Float32Regs:$b, Float32Regs:$a),
(ins Int64Regs:$t, Int64Regs:$s, Int32Regs:$l, Float32Regs:$x,
Float32Regs:$gradx, Float32Regs:$grady),
- "tex.grad.a1d.v4.f32.f32\t\\{$r, $g, $b, $a\\}, "
+ "tex.grad.a1d.v4.f32.f32 \t\\{$r, $g, $b, $a\\}, "
"[$t, $s, \\{$l, $x\\}], \\{$gradx\\}, \\{$grady\\};",
[]>;
def TEX_1D_ARRAY_S32_S32
: NVPTXInst<(outs Int32Regs:$r, Int32Regs:$g,
Int32Regs:$b, Int32Regs:$a),
(ins Int64Regs:$t, Int64Regs:$s, Int32Regs:$l, Int32Regs:$x),
- "tex.a1d.v4.s32.s32\t\\{$r, $g, $b, $a\\}, "
+ "tex.a1d.v4.s32.s32 \t\\{$r, $g, $b, $a\\}, "
"[$t, $s, \\{$l, $x\\}];",
[]>;
def TEX_1D_ARRAY_S32_F32
: NVPTXInst<(outs Int32Regs:$r, Int32Regs:$g,
Int32Regs:$b, Int32Regs:$a),
(ins Int64Regs:$t, Int64Regs:$s, Int32Regs:$l, Float32Regs:$x),
- "tex.a1d.v4.s32.f32\t\\{$r, $g, $b, $a\\}, "
+ "tex.a1d.v4.s32.f32 \t\\{$r, $g, $b, $a\\}, "
"[$t, $s, \\{$l, $x\\}];",
[]>;
def TEX_1D_ARRAY_S32_F32_LEVEL
@@ -2293,7 +2211,7 @@ def TEX_1D_ARRAY_S32_F32_LEVEL
Int32Regs:$b, Int32Regs:$a),
(ins Int64Regs:$t, Int64Regs:$s, Int32Regs:$l, Float32Regs:$x,
Float32Regs:$lod),
- "tex.level.a1d.v4.s32.f32\t\\{$r, $g, $b, $a\\}, "
+ "tex.level.a1d.v4.s32.f32 \t\\{$r, $g, $b, $a\\}, "
"[$t, $s, \\{$l, $x\\}], $lod;",
[]>;
def TEX_1D_ARRAY_S32_F32_GRAD
@@ -2301,21 +2219,21 @@ def TEX_1D_ARRAY_S32_F32_GRAD
Int32Regs:$b, Int32Regs:$a),
(ins Int64Regs:$t, Int64Regs:$s, Int32Regs:$l, Float32Regs:$x,
Float32Regs:$gradx, Float32Regs:$grady),
- "tex.grad.a1d.v4.s32.f32\t\\{$r, $g, $b, $a\\}, "
+ "tex.grad.a1d.v4.s32.f32 \t\\{$r, $g, $b, $a\\}, "
"[$t, $s, \\{$l, $x\\}], \\{$gradx\\}, \\{$grady\\};",
[]>;
def TEX_1D_ARRAY_U32_S32
: NVPTXInst<(outs Int32Regs:$r, Int32Regs:$g,
Int32Regs:$b, Int32Regs:$a),
(ins Int64Regs:$t, Int64Regs:$s, Int32Regs:$l, Int32Regs:$x),
- "tex.a1d.v4.u32.s32\t\\{$r, $g, $b, $a\\}, "
+ "tex.a1d.v4.u32.s32 \t\\{$r, $g, $b, $a\\}, "
"[$t, $s, \\{$l, $x\\}];",
[]>;
def TEX_1D_ARRAY_U32_F32
: NVPTXInst<(outs Int32Regs:$r, Int32Regs:$g,
Int32Regs:$b, Int32Regs:$a),
(ins Int64Regs:$t, Int64Regs:$s, Int32Regs:$l, Float32Regs:$x),
- "tex.a1d.v4.u32.f32\t\\{$r, $g, $b, $a\\}, "
+ "tex.a1d.v4.u32.f32 \t\\{$r, $g, $b, $a\\}, "
"[$t, $s, \\{$l, $x\\}];",
[]>;
def TEX_1D_ARRAY_U32_F32_LEVEL
@@ -2323,7 +2241,7 @@ def TEX_1D_ARRAY_U32_F32_LEVEL
Int32Regs:$b, Int32Regs:$a),
(ins Int64Regs:$t, Int64Regs:$s, Int32Regs:$l, Float32Regs:$x,
Float32Regs:$lod),
- "tex.level.a1d.v4.u32.f32\t\\{$r, $g, $b, $a\\}, "
+ "tex.level.a1d.v4.u32.f32 \t\\{$r, $g, $b, $a\\}, "
"[$t, $s, \\{$l, $x\\}], $lod;",
[]>;
def TEX_1D_ARRAY_U32_F32_GRAD
@@ -2331,7 +2249,7 @@ def TEX_1D_ARRAY_U32_F32_GRAD
Int32Regs:$b, Int32Regs:$a),
(ins Int64Regs:$t, Int64Regs:$s, Int32Regs:$l, Float32Regs:$x,
Float32Regs:$gradx, Float32Regs:$grady),
- "tex.grad.a1d.v4.u32.f32\t\\{$r, $g, $b, $a\\}, "
+ "tex.grad.a1d.v4.u32.f32 \t\\{$r, $g, $b, $a\\}, "
"[$t, $s, \\{$l, $x\\}], \\{$gradx\\}, \\{$grady\\};",
[]>;
@@ -2339,14 +2257,14 @@ def TEX_2D_F32_S32
: NVPTXInst<(outs Float32Regs:$r, Float32Regs:$g,
Float32Regs:$b, Float32Regs:$a),
(ins Int64Regs:$t, Int64Regs:$s, Int32Regs:$x, Int32Regs:$y),
- "tex.2d.v4.f32.s32\t\\{$r, $g, $b, $a\\}, "
+ "tex.2d.v4.f32.s32 \t\\{$r, $g, $b, $a\\}, "
"[$t, $s, \\{$x, $y\\}];",
[]>;
def TEX_2D_F32_F32
: NVPTXInst<(outs Float32Regs:$r, Float32Regs:$g,
Float32Regs:$b, Float32Regs:$a),
(ins Int64Regs:$t, Int64Regs:$s, Float32Regs:$x, Float32Regs:$y),
- "tex.2d.v4.f32.f32\t\\{$r, $g, $b, $a\\}, "
+ "tex.2d.v4.f32.f32 \t\\{$r, $g, $b, $a\\}, "
"[$t, $s, \\{$x, $y\\}];",
[]>;
def TEX_2D_F32_F32_LEVEL
@@ -2354,7 +2272,7 @@ def TEX_2D_F32_F32_LEVEL
Float32Regs:$b, Float32Regs:$a),
(ins Int64Regs:$t, Int64Regs:$s, Float32Regs:$x, Float32Regs:$y,
Float32Regs:$lod),
- "tex.level.2d.v4.f32.f32\t\\{$r, $g, $b, $a\\}, "
+ "tex.level.2d.v4.f32.f32 \t\\{$r, $g, $b, $a\\}, "
"[$t, $s, \\{$x, $y\\}], $lod;",
[]>;
def TEX_2D_F32_F32_GRAD
@@ -2363,7 +2281,7 @@ def TEX_2D_F32_F32_GRAD
(ins Int64Regs:$t, Int64Regs:$s, Float32Regs:$x, Float32Regs:$y,
Float32Regs:$gradx0, Float32Regs:$gradx1,
Float32Regs:$grady0, Float32Regs:$grady1),
- "tex.grad.2d.v4.f32.f32\t\\{$r, $g, $b, $a\\}, "
+ "tex.grad.2d.v4.f32.f32 \t\\{$r, $g, $b, $a\\}, "
"[$t, $s, \\{$x, $y\\}], \\{$gradx0, $gradx1\\}, "
"\\{$grady0, $grady1\\};",
[]>;
@@ -2371,14 +2289,14 @@ def TEX_2D_S32_S32
: NVPTXInst<(outs Int32Regs:$r, Int32Regs:$g,
Int32Regs:$b, Int32Regs:$a),
(ins Int64Regs:$t, Int64Regs:$s, Int32Regs:$x, Int32Regs:$y),
- "tex.2d.v4.s32.s32\t\\{$r, $g, $b, $a\\}, "
+ "tex.2d.v4.s32.s32 \t\\{$r, $g, $b, $a\\}, "
"[$t, $s, \\{$x, $y\\}];",
[]>;
def TEX_2D_S32_F32
: NVPTXInst<(outs Int32Regs:$r, Int32Regs:$g,
Int32Regs:$b, Int32Regs:$a),
(ins Int64Regs:$t, Int64Regs:$s, Float32Regs:$x, Float32Regs:$y),
- "tex.2d.v4.s32.f32\t\\{$r, $g, $b, $a\\}, "
+ "tex.2d.v4.s32.f32 \t\\{$r, $g, $b, $a\\}, "
"[$t, $s, \\{$x, $y\\}];",
[]>;
def TEX_2D_S32_F32_LEVEL
@@ -2386,7 +2304,7 @@ def TEX_2D_S32_F32_LEVEL
Int32Regs:$b, Int32Regs:$a),
(ins Int64Regs:$t, Int64Regs:$s, Float32Regs:$x, Float32Regs:$y,
Float32Regs:$lod),
- "tex.level.2d.v4.s32.f32\t\\{$r, $g, $b, $a\\}, "
+ "tex.level.2d.v4.s32.f32 \t\\{$r, $g, $b, $a\\}, "
"[$t, $s, \\{$x, $y\\}], $lod;",
[]>;
def TEX_2D_S32_F32_GRAD
@@ -2395,7 +2313,7 @@ def TEX_2D_S32_F32_GRAD
(ins Int64Regs:$t, Int64Regs:$s, Float32Regs:$x, Float32Regs:$y,
Float32Regs:$gradx0, Float32Regs:$gradx1,
Float32Regs:$grady0, Float32Regs:$grady1),
- "tex.grad.2d.v4.s32.f32\t\\{$r, $g, $b, $a\\}, "
+ "tex.grad.2d.v4.s32.f32 \t\\{$r, $g, $b, $a\\}, "
"[$t, $s, \\{$x, $y\\}], \\{$gradx0, $gradx1\\}, "
"\\{$grady0, $grady1\\};",
[]>;
@@ -2403,14 +2321,14 @@ def TEX_2D_U32_S32
: NVPTXInst<(outs Int32Regs:$r, Int32Regs:$g,
Int32Regs:$b, Int32Regs:$a),
(ins Int64Regs:$t, Int64Regs:$s, Int32Regs:$x, Int32Regs:$y),
- "tex.2d.v4.u32.s32\t\\{$r, $g, $b, $a\\}, "
+ "tex.2d.v4.u32.s32 \t\\{$r, $g, $b, $a\\}, "
"[$t, $s, \\{$x, $y\\}];",
[]>;
def TEX_2D_U32_F32
: NVPTXInst<(outs Int32Regs:$r, Int32Regs:$g,
Int32Regs:$b, Int32Regs:$a),
(ins Int64Regs:$t, Int64Regs:$s, Float32Regs:$x, Float32Regs:$y),
- "tex.2d.v4.u32.f32\t\\{$r, $g, $b, $a\\}, "
+ "tex.2d.v4.u32.f32 \t\\{$r, $g, $b, $a\\}, "
"[$t, $s, \\{$x, $y\\}];",
[]>;
def TEX_2D_U32_F32_LEVEL
@@ -2418,7 +2336,7 @@ def TEX_2D_U32_F32_LEVEL
Int32Regs:$b, Int32Regs:$a),
(ins Int64Regs:$t, Int64Regs:$s, Float32Regs:$x, Float32Regs:$y,
Float32Regs:$lod),
- "tex.level.2d.v4.u32.f32\t\\{$r, $g, $b, $a\\}, "
+ "tex.level.2d.v4.u32.f32 \t\\{$r, $g, $b, $a\\}, "
"[$t, $s, \\{$x, $y\\}], $lod;",
[]>;
def TEX_2D_U32_F32_GRAD
@@ -2427,7 +2345,7 @@ def TEX_2D_U32_F32_GRAD
(ins Int64Regs:$t, Int64Regs:$s, Float32Regs:$x, Float32Regs:$y,
Float32Regs:$gradx0, Float32Regs:$gradx1,
Float32Regs:$grady0, Float32Regs:$grady1),
- "tex.grad.2d.v4.u32.f32\t\\{$r, $g, $b, $a\\}, "
+ "tex.grad.2d.v4.u32.f32 \t\\{$r, $g, $b, $a\\}, "
"[$t, $s, \\{$x, $y\\}], \\{$gradx0, $gradx1\\}, "
"\\{$grady0, $grady1\\};",
[]>;
@@ -2437,7 +2355,7 @@ def TEX_2D_ARRAY_F32_S32
Float32Regs:$b, Float32Regs:$a),
(ins Int64Regs:$t, Int64Regs:$s, Int32Regs:$l, Int32Regs:$x,
Int32Regs:$y),
- "tex.a2d.v4.f32.s32\t\\{$r, $g, $b, $a\\}, "
+ "tex.a2d.v4.f32.s32 \t\\{$r, $g, $b, $a\\}, "
"[$t, $s, \\{$l, $x, $y, $y\\}];",
[]>;
def TEX_2D_ARRAY_F32_F32
@@ -2445,7 +2363,7 @@ def TEX_2D_ARRAY_F32_F32
Float32Regs:$b, Float32Regs:$a),
(ins Int64Regs:$t, Int64Regs:$s, Int32Regs:$l, Float32Regs:$x,
Float32Regs:$y),
- "tex.a2d.v4.f32.f32\t\\{$r, $g, $b, $a\\}, "
+ "tex.a2d.v4.f32.f32 \t\\{$r, $g, $b, $a\\}, "
"[$t, $s, \\{$l, $x, $y, $y\\}];",
[]>;
def TEX_2D_ARRAY_F32_F32_LEVEL
@@ -2453,7 +2371,7 @@ def TEX_2D_ARRAY_F32_F32_LEVEL
Float32Regs:$b, Float32Regs:$a),
(ins Int64Regs:$t, Int64Regs:$s, Int32Regs:$l, Float32Regs:$x,
Float32Regs:$y, Float32Regs:$lod),
- "tex.level.a2d.v4.f32.f32\t\\{$r, $g, $b, $a\\}, "
+ "tex.level.a2d.v4.f32.f32 \t\\{$r, $g, $b, $a\\}, "
"[$t, $s, \\{$l, $x, $y, $y\\}], $lod;",
[]>;
def TEX_2D_ARRAY_F32_F32_GRAD
@@ -2462,7 +2380,7 @@ def TEX_2D_ARRAY_F32_F32_GRAD
(ins Int64Regs:$t, Int64Regs:$s, Int32Regs:$l, Float32Regs:$x,
Float32Regs:$y, Float32Regs:$gradx0, Float32Regs:$gradx1,
Float32Regs:$grady0, Float32Regs:$grady1),
- "tex.grad.a2d.v4.f32.f32\t\\{$r, $g, $b, $a\\}, "
+ "tex.grad.a2d.v4.f32.f32 \t\\{$r, $g, $b, $a\\}, "
"[$t, $s, \\{$l, $x, $y, $y\\}], \\{$gradx0, $gradx1\\}, "
"\\{$grady0, $grady1\\};",
[]>;
@@ -2471,7 +2389,7 @@ def TEX_2D_ARRAY_S32_S32
Int32Regs:$b, Int32Regs:$a),
(ins Int64Regs:$t, Int64Regs:$s, Int32Regs:$l, Int32Regs:$x,
Int32Regs:$y),
- "tex.a2d.v4.s32.s32\t\\{$r, $g, $b, $a\\}, "
+ "tex.a2d.v4.s32.s32 \t\\{$r, $g, $b, $a\\}, "
"[$t, $s, \\{$l, $x, $y, $y\\}];",
[]>;
def TEX_2D_ARRAY_S32_F32
@@ -2479,7 +2397,7 @@ def TEX_2D_ARRAY_S32_F32
Int32Regs:$b, Int32Regs:$a),
(ins Int64Regs:$t, Int64Regs:$s, Int32Regs:$l, Float32Regs:$x,
Float32Regs:$y),
- "tex.a2d.v4.s32.f32\t\\{$r, $g, $b, $a\\}, "
+ "tex.a2d.v4.s32.f32 \t\\{$r, $g, $b, $a\\}, "
"[$t, $s, \\{$l, $x, $y, $y\\}];",
[]>;
def TEX_2D_ARRAY_S32_F32_LEVEL
@@ -2487,7 +2405,7 @@ def TEX_2D_ARRAY_S32_F32_LEVEL
Int32Regs:$b, Int32Regs:$a),
(ins Int64Regs:$t, Int64Regs:$s, Int32Regs:$l, Float32Regs:$x,
Float32Regs:$y, Float32Regs:$lod),
- "tex.level.a2d.v4.s32.f32\t\\{$r, $g, $b, $a\\}, "
+ "tex.level.a2d.v4.s32.f32 \t\\{$r, $g, $b, $a\\}, "
"[$t, $s, \\{$l, $x, $y, $y\\}], $lod;",
[]>;
def TEX_2D_ARRAY_S32_F32_GRAD
@@ -2497,7 +2415,7 @@ def TEX_2D_ARRAY_S32_F32_GRAD
Float32Regs:$y,
Float32Regs:$gradx0, Float32Regs:$gradx1,
Float32Regs:$grady0, Float32Regs:$grady1),
- "tex.grad.a2d.v4.s32.f32\t\\{$r, $g, $b, $a\\}, "
+ "tex.grad.a2d.v4.s32.f32 \t\\{$r, $g, $b, $a\\}, "
"[$t, $s, \\{$l, $x, $y, $y\\}], \\{$gradx0, $gradx1\\}, "
"\\{$grady0, $grady1\\};",
[]>;
@@ -2506,7 +2424,7 @@ def TEX_2D_ARRAY_U32_S32
Int32Regs:$b, Int32Regs:$a),
(ins Int64Regs:$t, Int64Regs:$s, Int32Regs:$l, Int32Regs:$x,
Int32Regs:$y),
- "tex.a2d.v4.u32.s32\t\\{$r, $g, $b, $a\\}, "
+ "tex.a2d.v4.u32.s32 \t\\{$r, $g, $b, $a\\}, "
"[$t, $s, \\{$l, $x, $y, $y\\}];",
[]>;
def TEX_2D_ARRAY_U32_F32
@@ -2514,7 +2432,7 @@ def TEX_2D_ARRAY_U32_F32
Int32Regs:$b, Int32Regs:$a),
(ins Int64Regs:$t, Int64Regs:$s, Int32Regs:$l, Float32Regs:$x,
Float32Regs:$y),
- "tex.a2d.v4.u32.f32\t\\{$r, $g, $b, $a\\}, "
+ "tex.a2d.v4.u32.f32 \t\\{$r, $g, $b, $a\\}, "
"[$t, $s, \\{$l, $x, $y, $y\\}];",
[]>;
def TEX_2D_ARRAY_U32_F32_LEVEL
@@ -2522,7 +2440,7 @@ def TEX_2D_ARRAY_U32_F32_LEVEL
Int32Regs:$b, Int32Regs:$a),
(ins Int64Regs:$t, Int64Regs:$s, Int32Regs:$l, Float32Regs:$x,
Float32Regs:$y, Float32Regs:$lod),
- "tex.level.a2d.v4.u32.f32\t\\{$r, $g, $b, $a\\}, "
+ "tex.level.a2d.v4.u32.f32 \t\\{$r, $g, $b, $a\\}, "
"[$t, $s, \\{$l, $x, $y, $y\\}], $lod;",
[]>;
def TEX_2D_ARRAY_U32_F32_GRAD
@@ -2532,7 +2450,7 @@ def TEX_2D_ARRAY_U32_F32_GRAD
Float32Regs:$y,
Float32Regs:$gradx0, Float32Regs:$gradx1,
Float32Regs:$grady0, Float32Regs:$grady1),
- "tex.grad.a2d.v4.u32.f32\t\\{$r, $g, $b, $a\\}, "
+ "tex.grad.a2d.v4.u32.f32 \t\\{$r, $g, $b, $a\\}, "
"[$t, $s, \\{$l, $x, $y, $y\\}], \\{$gradx0, $gradx1\\}, "
"\\{$grady0, $grady1\\};",
[]>;
@@ -2542,7 +2460,7 @@ def TEX_3D_F32_S32
Float32Regs:$b, Float32Regs:$a),
(ins Int64Regs:$t, Int64Regs:$s, Int32Regs:$x, Int32Regs:$y,
Int32Regs:$z),
- "tex.3d.v4.f32.s32\t\\{$r, $g, $b, $a\\}, "
+ "tex.3d.v4.f32.s32 \t\\{$r, $g, $b, $a\\}, "
"[$t, $s, \\{$x, $y, $z, $z\\}];",
[]>;
def TEX_3D_F32_F32
@@ -2550,7 +2468,7 @@ def TEX_3D_F32_F32
Float32Regs:$b, Float32Regs:$a),
(ins Int64Regs:$t, Int64Regs:$s, Float32Regs:$x, Float32Regs:$y,
Float32Regs:$z),
- "tex.3d.v4.f32.f32\t\\{$r, $g, $b, $a\\}, "
+ "tex.3d.v4.f32.f32 \t\\{$r, $g, $b, $a\\}, "
"[$t, $s, \\{$x, $y, $z, $z\\}];",
[]>;
def TEX_3D_F32_F32_LEVEL
@@ -2558,7 +2476,7 @@ def TEX_3D_F32_F32_LEVEL
Float32Regs:$b, Float32Regs:$a),
(ins Int64Regs:$t, Int64Regs:$s, Float32Regs:$x, Float32Regs:$y,
Float32Regs:$z, Float32Regs:$lod),
- "tex.level.3d.v4.f32.f32\t\\{$r, $g, $b, $a\\}, "
+ "tex.level.3d.v4.f32.f32 \t\\{$r, $g, $b, $a\\}, "
"[$t, $s, \\{$x, $y, $z, $z\\}], $lod;",
[]>;
def TEX_3D_F32_F32_GRAD
@@ -2569,7 +2487,7 @@ def TEX_3D_F32_F32_GRAD
Float32Regs:$gradx0, Float32Regs:$gradx1,
Float32Regs:$gradx2, Float32Regs:$grady0,
Float32Regs:$grady1, Float32Regs:$grady2),
- "tex.grad.3d.v4.f32.f32\t\\{$r, $g, $b, $a\\}, "
+ "tex.grad.3d.v4.f32.f32 \t\\{$r, $g, $b, $a\\}, "
"[$t, $s, \\{$x, $y, $z, $z\\}], "
"\\{$gradx0, $gradx1, $gradx2, $gradx2\\}, "
"\\{$grady0, $grady1, $grady2, $grady2\\};",
@@ -2579,7 +2497,7 @@ def TEX_3D_S32_S32
Int32Regs:$b, Int32Regs:$a),
(ins Int64Regs:$t, Int64Regs:$s, Int32Regs:$x, Int32Regs:$y,
Int32Regs:$z),
- "tex.3d.v4.s32.s32\t\\{$r, $g, $b, $a\\}, "
+ "tex.3d.v4.s32.s32 \t\\{$r, $g, $b, $a\\}, "
"[$t, $s, \\{$x, $y, $z, $z\\}];",
[]>;
def TEX_3D_S32_F32
@@ -2587,7 +2505,7 @@ def TEX_3D_S32_F32
Int32Regs:$b, Int32Regs:$a),
(ins Int64Regs:$t, Int64Regs:$s, Float32Regs:$x, Float32Regs:$y,
Float32Regs:$z),
- "tex.3d.v4.s32.f32\t\\{$r, $g, $b, $a\\}, "
+ "tex.3d.v4.s32.f32 \t\\{$r, $g, $b, $a\\}, "
"[$t, $s, \\{$x, $y, $z, $z\\}];",
[]>;
def TEX_3D_S32_F32_LEVEL
@@ -2595,7 +2513,7 @@ def TEX_3D_S32_F32_LEVEL
Int32Regs:$b, Int32Regs:$a),
(ins Int64Regs:$t, Int64Regs:$s, Float32Regs:$x, Float32Regs:$y,
Float32Regs:$z, Float32Regs:$lod),
- "tex.level.3d.v4.s32.f32\t\\{$r, $g, $b, $a\\}, "
+ "tex.level.3d.v4.s32.f32 \t\\{$r, $g, $b, $a\\}, "
"[$t, $s, \\{$x, $y, $z, $z\\}], $lod;",
[]>;
def TEX_3D_S32_F32_GRAD
@@ -2606,7 +2524,7 @@ def TEX_3D_S32_F32_GRAD
Float32Regs:$gradx0, Float32Regs:$gradx1,
Float32Regs:$gradx2, Float32Regs:$grady0,
Float32Regs:$grady1, Float32Regs:$grady2),
- "tex.grad.3d.v4.s32.f32\t\\{$r, $g, $b, $a\\}, "
+ "tex.grad.3d.v4.s32.f32 \t\\{$r, $g, $b, $a\\}, "
"[$t, $s, \\{$x, $y, $z, $z\\}], "
"\\{$gradx0, $gradx1, $gradx2, $gradx2\\}, "
"\\{$grady0, $grady1, $grady2, $grady2\\};",
@@ -2616,7 +2534,7 @@ def TEX_3D_U32_S32
Int32Regs:$b, Int32Regs:$a),
(ins Int64Regs:$t, Int64Regs:$s, Int32Regs:$x, Int32Regs:$y,
Int32Regs:$z),
- "tex.3d.v4.u32.s32\t\\{$r, $g, $b, $a\\}, "
+ "tex.3d.v4.u32.s32 \t\\{$r, $g, $b, $a\\}, "
"[$t, $s, \\{$x, $y, $z, $z\\}];",
[]>;
def TEX_3D_U32_F32
@@ -2624,7 +2542,7 @@ def TEX_3D_U32_F32
Int32Regs:$b, Int32Regs:$a),
(ins Int64Regs:$t, Int64Regs:$s, Float32Regs:$x, Float32Regs:$y,
Float32Regs:$z),
- "tex.3d.v4.u32.f32\t\\{$r, $g, $b, $a\\}, "
+ "tex.3d.v4.u32.f32 \t\\{$r, $g, $b, $a\\}, "
"[$t, $s, \\{$x, $y, $z, $z\\}];",
[]>;
def TEX_3D_U32_F32_LEVEL
@@ -2632,7 +2550,7 @@ def TEX_3D_U32_F32_LEVEL
Int32Regs:$b, Int32Regs:$a),
(ins Int64Regs:$t, Int64Regs:$s, Float32Regs:$x, Float32Regs:$y,
Float32Regs:$z, Float32Regs:$lod),
- "tex.level.3d.v4.u32.f32\t\\{$r, $g, $b, $a\\}, "
+ "tex.level.3d.v4.u32.f32 \t\\{$r, $g, $b, $a\\}, "
"[$t, $s, \\{$x, $y, $z, $z\\}], $lod;",
[]>;
def TEX_3D_U32_F32_GRAD
@@ -2643,7 +2561,7 @@ def TEX_3D_U32_F32_GRAD
Float32Regs:$gradx0, Float32Regs:$gradx1,
Float32Regs:$gradx2, Float32Regs:$grady0,
Float32Regs:$grady1, Float32Regs:$grady2),
- "tex.grad.3d.v4.u32.f32\t\\{$r, $g, $b, $a\\}, "
+ "tex.grad.3d.v4.u32.f32 \t\\{$r, $g, $b, $a\\}, "
"[$t, $s, \\{$x, $y, $z, $z\\}], "
"\\{$gradx0, $gradx1, $gradx2, $gradx2\\}, "
"\\{$grady0, $grady1, $grady2, $grady2\\};",
@@ -2654,7 +2572,7 @@ def TEX_CUBE_F32_F32
Float32Regs:$b, Float32Regs:$a),
(ins Int64Regs:$t, Int64Regs:$s,
Float32Regs:$x, Float32Regs:$y, Float32Regs:$z),
- "tex.cube.v4.f32.f32\t\\{$r, $g, $b, $a\\}, "
+ "tex.cube.v4.f32.f32 \t\\{$r, $g, $b, $a\\}, "
"[$t, $s, \\{$x, $y, $z, $z\\}];",
[]>;
def TEX_CUBE_F32_F32_LEVEL
@@ -2663,7 +2581,7 @@ def TEX_CUBE_F32_F32_LEVEL
(ins Int64Regs:$t, Int64Regs:$s,
Float32Regs:$x, Float32Regs:$y, Float32Regs:$z,
Float32Regs:$lod),
- "tex.level.cube.v4.f32.f32\t\\{$r, $g, $b, $a\\}, "
+ "tex.level.cube.v4.f32.f32 \t\\{$r, $g, $b, $a\\}, "
"[$t, $s, \\{$x, $y, $z, $z\\}], $lod;",
[]>;
def TEX_CUBE_S32_F32
@@ -2671,7 +2589,7 @@ def TEX_CUBE_S32_F32
Int32Regs:$b, Int32Regs:$a),
(ins Int64Regs:$t, Int64Regs:$s,
Float32Regs:$x, Float32Regs:$y, Float32Regs:$z),
- "tex.cube.v4.s32.f32\t\\{$r, $g, $b, $a\\}, "
+ "tex.cube.v4.s32.f32 \t\\{$r, $g, $b, $a\\}, "
"[$t, $s, \\{$x, $y, $z, $z\\}];",
[]>;
def TEX_CUBE_S32_F32_LEVEL
@@ -2680,7 +2598,7 @@ def TEX_CUBE_S32_F32_LEVEL
(ins Int64Regs:$t, Int64Regs:$s,
Float32Regs:$x, Float32Regs:$y, Float32Regs:$z,
Float32Regs:$lod),
- "tex.level.cube.v4.s32.f32\t\\{$r, $g, $b, $a\\}, "
+ "tex.level.cube.v4.s32.f32 \t\\{$r, $g, $b, $a\\}, "
"[$t, $s, \\{$x, $y, $z, $z\\}], $lod;",
[]>;
def TEX_CUBE_U32_F32
@@ -2688,7 +2606,7 @@ def TEX_CUBE_U32_F32
Int32Regs:$b, Int32Regs:$a),
(ins Int64Regs:$t, Int64Regs:$s,
Float32Regs:$x, Float32Regs:$y, Float32Regs:$z),
- "tex.cube.v4.u32.f32\t\\{$r, $g, $b, $a\\}, "
+ "tex.cube.v4.u32.f32 \t\\{$r, $g, $b, $a\\}, "
"[$t, $s, \\{$x, $y, $z, $z\\}];",
[]>;
def TEX_CUBE_U32_F32_LEVEL
@@ -2697,7 +2615,7 @@ def TEX_CUBE_U32_F32_LEVEL
(ins Int64Regs:$t, Int64Regs:$s,
Float32Regs:$x, Float32Regs:$y, Float32Regs:$z,
Float32Regs:$lod),
- "tex.level.cube.v4.u32.f32\t\\{$r, $g, $b, $a\\}, "
+ "tex.level.cube.v4.u32.f32 \t\\{$r, $g, $b, $a\\}, "
"[$t, $s, \\{$x, $y, $z, $z\\}], $lod;",
[]>;
@@ -2706,7 +2624,7 @@ def TEX_CUBE_ARRAY_F32_F32
Float32Regs:$b, Float32Regs:$a),
(ins Int64Regs:$t, Int64Regs:$s, Int32Regs:$l,
Float32Regs:$x, Float32Regs:$y, Float32Regs:$z),
- "tex.acube.v4.f32.f32\t\\{$r, $g, $b, $a\\}, "
+ "tex.acube.v4.f32.f32 \t\\{$r, $g, $b, $a\\}, "
"[$t, $s, \\{$l, $x, $y, $z\\}];",
[]>;
def TEX_CUBE_ARRAY_F32_F32_LEVEL
@@ -2715,7 +2633,7 @@ def TEX_CUBE_ARRAY_F32_F32_LEVEL
(ins Int64Regs:$t, Int64Regs:$s, Int32Regs:$l,
Float32Regs:$x, Float32Regs:$y, Float32Regs:$z,
Float32Regs:$lod),
- "tex.level.acube.v4.f32.f32\t\\{$r, $g, $b, $a\\}, "
+ "tex.level.acube.v4.f32.f32 \t\\{$r, $g, $b, $a\\}, "
"[$t, $s, \\{$l, $x, $y, $z\\}], $lod;",
[]>;
def TEX_CUBE_ARRAY_S32_F32
@@ -2723,7 +2641,7 @@ def TEX_CUBE_ARRAY_S32_F32
Int32Regs:$b, Int32Regs:$a),
(ins Int64Regs:$t, Int64Regs:$s, Int32Regs:$l,
Float32Regs:$x, Float32Regs:$y, Float32Regs:$z),
- "tex.acube.v4.s32.f32\t\\{$r, $g, $b, $a\\}, "
+ "tex.acube.v4.s32.f32 \t\\{$r, $g, $b, $a\\}, "
"[$t, $s, \\{$l, $x, $y, $z\\}];",
[]>;
def TEX_CUBE_ARRAY_S32_F32_LEVEL
@@ -2732,7 +2650,7 @@ def TEX_CUBE_ARRAY_S32_F32_LEVEL
(ins Int64Regs:$t, Int64Regs:$s, Int32Regs:$l,
Float32Regs:$x, Float32Regs:$y, Float32Regs:$z,
Float32Regs:$lod),
- "tex.level.acube.v4.s32.f32\t\\{$r, $g, $b, $a\\}, "
+ "tex.level.acube.v4.s32.f32 \t\\{$r, $g, $b, $a\\}, "
"[$t, $s, \\{$l, $x, $y, $z\\}], $lod;",
[]>;
def TEX_CUBE_ARRAY_U32_F32
@@ -2740,7 +2658,7 @@ def TEX_CUBE_ARRAY_U32_F32
Int32Regs:$b, Int32Regs:$a),
(ins Int64Regs:$t, Int64Regs:$s, Int32Regs:$l,
Float32Regs:$x, Float32Regs:$y, Float32Regs:$z),
- "tex.acube.v4.u32.f32\t\\{$r, $g, $b, $a\\}, "
+ "tex.acube.v4.u32.f32 \t\\{$r, $g, $b, $a\\}, "
"[$t, $s, \\{$l, $x, $y, $z\\}];",
[]>;
def TEX_CUBE_ARRAY_U32_F32_LEVEL
@@ -2749,7 +2667,7 @@ def TEX_CUBE_ARRAY_U32_F32_LEVEL
(ins Int64Regs:$t, Int64Regs:$s, Int32Regs:$l,
Float32Regs:$x, Float32Regs:$y, Float32Regs:$z,
Float32Regs:$lod),
- "tex.level.acube.v4.u32.f32\t\\{$r, $g, $b, $a\\}, "
+ "tex.level.acube.v4.u32.f32 \t\\{$r, $g, $b, $a\\}, "
"[$t, $s, \\{$l, $x, $y, $z\\}], $lod;",
[]>;
@@ -2757,84 +2675,84 @@ def TLD4_R_2D_F32_F32
: NVPTXInst<(outs Float32Regs:$v0, Float32Regs:$v1,
Float32Regs:$v2, Float32Regs:$v3),
(ins Int64Regs:$t, Int64Regs:$s, Float32Regs:$x, Float32Regs:$y),
- "tld4.r.2d.v4.f32.f32\t\\{$v0, $v1, $v2, $v3\\}, "
+ "tld4.r.2d.v4.f32.f32 \t\\{$v0, $v1, $v2, $v3\\}, "
"[$t, $s, \\{$x, $y\\}];",
[]>;
def TLD4_G_2D_F32_F32
: NVPTXInst<(outs Float32Regs:$v0, Float32Regs:$v1,
Float32Regs:$v2, Float32Regs:$v3),
(ins Int64Regs:$t, Int64Regs:$s, Float32Regs:$x, Float32Regs:$y),
- "tld4.g.2d.v4.f32.f32\t\\{$v0, $v1, $v2, $v3\\}, "
+ "tld4.g.2d.v4.f32.f32 \t\\{$v0, $v1, $v2, $v3\\}, "
"[$t, $s, \\{$x, $y\\}];",
[]>;
def TLD4_B_2D_F32_F32
: NVPTXInst<(outs Float32Regs:$v0, Float32Regs:$v1,
Float32Regs:$v2, Float32Regs:$v3),
(ins Int64Regs:$t, Int64Regs:$s, Float32Regs:$x, Float32Regs:$y),
- "tld4.b.2d.v4.f32.f32\t\\{$v0, $v1, $v2, $v3\\}, "
+ "tld4.b.2d.v4.f32.f32 \t\\{$v0, $v1, $v2, $v3\\}, "
"[$t, $s, \\{$x, $y\\}];",
[]>;
def TLD4_A_2D_F32_F32
: NVPTXInst<(outs Float32Regs:$v0, Float32Regs:$v1,
Float32Regs:$v2, Float32Regs:$v3),
(ins Int64Regs:$t, Int64Regs:$s, Float32Regs:$x, Float32Regs:$y),
- "tld4.a.2d.v4.f32.f32\t\\{$v0, $v1, $v2, $v3\\}, "
+ "tld4.a.2d.v4.f32.f32 \t\\{$v0, $v1, $v2, $v3\\}, "
"[$t, $s, \\{$x, $y\\}];",
[]>;
def TLD4_R_2D_S32_F32
: NVPTXInst<(outs Int32Regs:$v0, Int32Regs:$v1,
Int32Regs:$v2, Int32Regs:$v3),
(ins Int64Regs:$t, Int64Regs:$s, Float32Regs:$x, Float32Regs:$y),
- "tld4.r.2d.v4.s32.f32\t\\{$v0, $v1, $v2, $v3\\}, "
+ "tld4.r.2d.v4.s32.f32 \t\\{$v0, $v1, $v2, $v3\\}, "
"[$t, $s, \\{$x, $y\\}];",
[]>;
def TLD4_G_2D_S32_F32
: NVPTXInst<(outs Int32Regs:$v0, Int32Regs:$v1,
Int32Regs:$v2, Int32Regs:$v3),
(ins Int64Regs:$t, Int64Regs:$s, Float32Regs:$x, Float32Regs:$y),
- "tld4.g.2d.v4.s32.f32\t\\{$v0, $v1, $v2, $v3\\}, "
+ "tld4.g.2d.v4.s32.f32 \t\\{$v0, $v1, $v2, $v3\\}, "
"[$t, $s, \\{$x, $y\\}];",
[]>;
def TLD4_B_2D_S32_F32
: NVPTXInst<(outs Int32Regs:$v0, Int32Regs:$v1,
Int32Regs:$v2, Int32Regs:$v3),
(ins Int64Regs:$t, Int64Regs:$s, Float32Regs:$x, Float32Regs:$y),
- "tld4.b.2d.v4.s32.f32\t\\{$v0, $v1, $v2, $v3\\}, "
+ "tld4.b.2d.v4.s32.f32 \t\\{$v0, $v1, $v2, $v3\\}, "
"[$t, $s, \\{$x, $y\\}];",
[]>;
def TLD4_A_2D_S32_F32
: NVPTXInst<(outs Int32Regs:$v0, Int32Regs:$v1,
Int32Regs:$v2, Int32Regs:$v3),
(ins Int64Regs:$t, Int64Regs:$s, Float32Regs:$x, Float32Regs:$y),
- "tld4.a.2d.v4.s32.f32\t\\{$v0, $v1, $v2, $v3\\}, "
+ "tld4.a.2d.v4.s32.f32 \t\\{$v0, $v1, $v2, $v3\\}, "
"[$t, $s, \\{$x, $y\\}];",
[]>;
def TLD4_R_2D_U32_F32
: NVPTXInst<(outs Int32Regs:$v0, Int32Regs:$v1,
Int32Regs:$v2, Int32Regs:$v3),
(ins Int64Regs:$t, Int64Regs:$s, Float32Regs:$x, Float32Regs:$y),
- "tld4.r.2d.v4.u32.f32\t\\{$v0, $v1, $v2, $v3\\}, "
+ "tld4.r.2d.v4.u32.f32 \t\\{$v0, $v1, $v2, $v3\\}, "
"[$t, $s, \\{$x, $y\\}];",
[]>;
def TLD4_G_2D_U32_F32
: NVPTXInst<(outs Int32Regs:$v0, Int32Regs:$v1,
Int32Regs:$v2, Int32Regs:$v3),
(ins Int64Regs:$t, Int64Regs:$s, Float32Regs:$x, Float32Regs:$y),
- "tld4.g.2d.v4.u32.f32\t\\{$v0, $v1, $v2, $v3\\}, "
+ "tld4.g.2d.v4.u32.f32 \t\\{$v0, $v1, $v2, $v3\\}, "
"[$t, $s, \\{$x, $y\\}];",
[]>;
def TLD4_B_2D_U32_F32
: NVPTXInst<(outs Int32Regs:$v0, Int32Regs:$v1,
Int32Regs:$v2, Int32Regs:$v3),
(ins Int64Regs:$t, Int64Regs:$s, Float32Regs:$x, Float32Regs:$y),
- "tld4.b.2d.v4.u32.f32\t\\{$v0, $v1, $v2, $v3\\}, "
+ "tld4.b.2d.v4.u32.f32 \t\\{$v0, $v1, $v2, $v3\\}, "
"[$t, $s, \\{$x, $y\\}];",
[]>;
def TLD4_A_2D_U32_F32
: NVPTXInst<(outs Int32Regs:$v0, Int32Regs:$v1,
Int32Regs:$v2, Int32Regs:$v3),
(ins Int64Regs:$t, Int64Regs:$s, Float32Regs:$x, Float32Regs:$y),
- "tld4.a.2d.v4.u32.f32\t\\{$v0, $v1, $v2, $v3\\}, "
+ "tld4.a.2d.v4.u32.f32 \t\\{$v0, $v1, $v2, $v3\\}, "
"[$t, $s, \\{$x, $y\\}];",
[]>;
}
@@ -2847,19 +2765,19 @@ def TEX_UNIFIED_1D_F32_S32
: NVPTXInst<(outs Float32Regs:$r, Float32Regs:$g,
Float32Regs:$b, Float32Regs:$a),
(ins Int64Regs:$t, Int32Regs:$x),
- "tex.1d.v4.f32.s32\t\\{$r, $g, $b, $a\\}, [$t, \\{$x\\}];",
+ "tex.1d.v4.f32.s32 \t\\{$r, $g, $b, $a\\}, [$t, \\{$x\\}];",
[]>;
def TEX_UNIFIED_1D_F32_F32
: NVPTXInst<(outs Float32Regs:$r, Float32Regs:$g,
Float32Regs:$b, Float32Regs:$a),
(ins Int64Regs:$t, Float32Regs:$x),
- "tex.1d.v4.f32.f32\t\\{$r, $g, $b, $a\\}, [$t, \\{$x\\}];",
+ "tex.1d.v4.f32.f32 \t\\{$r, $g, $b, $a\\}, [$t, \\{$x\\}];",
[]>;
def TEX_UNIFIED_1D_F32_F32_LEVEL
: NVPTXInst<(outs Float32Regs:$r, Float32Regs:$g,
Float32Regs:$b, Float32Regs:$a),
(ins Int64Regs:$t, Float32Regs:$x, Float32Regs:$lod),
- "tex.level.1d.v4.f32.f32\t\\{$r, $g, $b, $a\\}, "
+ "tex.level.1d.v4.f32.f32 \t\\{$r, $g, $b, $a\\}, "
"[$t, \\{$x\\}], $lod;",
[]>;
def TEX_UNIFIED_1D_F32_F32_GRAD
@@ -2867,27 +2785,27 @@ def TEX_UNIFIED_1D_F32_F32_GRAD
Float32Regs:$b, Float32Regs:$a),
(ins Int64Regs:$t, Float32Regs:$x,
Float32Regs:$gradx, Float32Regs:$grady),
- "tex.grad.1d.v4.f32.f32\t\\{$r, $g, $b, $a\\}, "
+ "tex.grad.1d.v4.f32.f32 \t\\{$r, $g, $b, $a\\}, "
"[$t, \\{$x\\}], \\{$gradx\\}, \\{$grady\\};",
[]>;
def TEX_UNIFIED_1D_S32_S32
: NVPTXInst<(outs Int32Regs:$r, Int32Regs:$g,
Int32Regs:$b, Int32Regs:$a),
(ins Int64Regs:$t, Int32Regs:$x),
- "tex.1d.v4.s32.s32\t\\{$r, $g, $b, $a\\}, [$t, \\{$x\\}];",
+ "tex.1d.v4.s32.s32 \t\\{$r, $g, $b, $a\\}, [$t, \\{$x\\}];",
[]>;
def TEX_UNIFIED_1D_S32_F32
: NVPTXInst<(outs Int32Regs:$r, Int32Regs:$g,
Int32Regs:$b, Int32Regs:$a),
(ins Int64Regs:$t, Float32Regs:$x),
- "tex.1d.v4.s32.f32\t\\{$r, $g, $b, $a\\}, [$t, \\{$x\\}];",
+ "tex.1d.v4.s32.f32 \t\\{$r, $g, $b, $a\\}, [$t, \\{$x\\}];",
[]>;
def TEX_UNIFIED_1D_S32_F32_LEVEL
: NVPTXInst<(outs Int32Regs:$r, Int32Regs:$g,
Int32Regs:$b, Int32Regs:$a),
(ins Int64Regs:$t, Float32Regs:$x,
Float32Regs:$lod),
- "tex.level.1d.v4.s32.f32\t\\{$r, $g, $b, $a\\}, "
+ "tex.level.1d.v4.s32.f32 \t\\{$r, $g, $b, $a\\}, "
"[$t, \\{$x\\}], $lod;",
[]>;
def TEX_UNIFIED_1D_S32_F32_GRAD
@@ -2895,27 +2813,27 @@ def TEX_UNIFIED_1D_S32_F32_GRAD
Int32Regs:$b, Int32Regs:$a),
(ins Int64Regs:$t, Float32Regs:$x,
Float32Regs:$gradx, Float32Regs:$grady),
- "tex.grad.1d.v4.s32.f32\t\\{$r, $g, $b, $a\\}, "
+ "tex.grad.1d.v4.s32.f32 \t\\{$r, $g, $b, $a\\}, "
"[$t, \\{$x\\}], \\{$gradx\\}, \\{$grady\\};",
[]>;
def TEX_UNIFIED_1D_U32_S32
: NVPTXInst<(outs Int32Regs:$r, Int32Regs:$g,
Int32Regs:$b, Int32Regs:$a),
(ins Int64Regs:$t, Int32Regs:$x),
- "tex.1d.v4.u32.s32\t\\{$r, $g, $b, $a\\}, [$t, \\{$x\\}];",
+ "tex.1d.v4.u32.s32 \t\\{$r, $g, $b, $a\\}, [$t, \\{$x\\}];",
[]>;
def TEX_UNIFIED_1D_U32_F32
: NVPTXInst<(outs Int32Regs:$r, Int32Regs:$g,
Int32Regs:$b, Int32Regs:$a),
(ins Int64Regs:$t, Float32Regs:$x),
- "tex.1d.v4.u32.f32\t\\{$r, $g, $b, $a\\}, [$t, \\{$x\\}];",
+ "tex.1d.v4.u32.f32 \t\\{$r, $g, $b, $a\\}, [$t, \\{$x\\}];",
[]>;
def TEX_UNIFIED_1D_U32_F32_LEVEL
: NVPTXInst<(outs Int32Regs:$r, Int32Regs:$g,
Int32Regs:$b, Int32Regs:$a),
(ins Int64Regs:$t, Float32Regs:$x,
Float32Regs:$lod),
- "tex.level.1d.v4.u32.f32\t\\{$r, $g, $b, $a\\}, "
+ "tex.level.1d.v4.u32.f32 \t\\{$r, $g, $b, $a\\}, "
"[$t, \\{$x\\}], $lod;",
[]>;
def TEX_UNIFIED_1D_U32_F32_GRAD
@@ -2923,7 +2841,7 @@ def TEX_UNIFIED_1D_U32_F32_GRAD
Int32Regs:$b, Int32Regs:$a),
(ins Int64Regs:$t, Float32Regs:$x,
Float32Regs:$gradx, Float32Regs:$grady),
- "tex.grad.1d.v4.u32.f32\t\\{$r, $g, $b, $a\\}, "
+ "tex.grad.1d.v4.u32.f32 \t\\{$r, $g, $b, $a\\}, "
"[$t, \\{$x\\}], \\{$gradx\\}, \\{$grady\\};",
[]>;
@@ -2931,14 +2849,14 @@ def TEX_UNIFIED_1D_ARRAY_F32_S32
: NVPTXInst<(outs Float32Regs:$r, Float32Regs:$g,
Float32Regs:$b, Float32Regs:$a),
(ins Int64Regs:$t, Int32Regs:$l, Int32Regs:$x),
- "tex.a1d.v4.f32.s32\t\\{$r, $g, $b, $a\\}, "
+ "tex.a1d.v4.f32.s32 \t\\{$r, $g, $b, $a\\}, "
"[$t, \\{$l, $x\\}];",
[]>;
def TEX_UNIFIED_1D_ARRAY_F32_F32
: NVPTXInst<(outs Float32Regs:$r, Float32Regs:$g,
Float32Regs:$b, Float32Regs:$a),
(ins Int64Regs:$t, Int32Regs:$l, Float32Regs:$x),
- "tex.a1d.v4.f32.f32\t\\{$r, $g, $b, $a\\}, "
+ "tex.a1d.v4.f32.f32 \t\\{$r, $g, $b, $a\\}, "
"[$t, \\{$l, $x\\}];",
[]>;
def TEX_UNIFIED_1D_ARRAY_F32_F32_LEVEL
@@ -2946,7 +2864,7 @@ def TEX_UNIFIED_1D_ARRAY_F32_F32_LEVEL
Float32Regs:$b, Float32Regs:$a),
(ins Int64Regs:$t, Int32Regs:$l, Float32Regs:$x,
Float32Regs:$lod),
- "tex.level.a1d.v4.f32.f32\t\\{$r, $g, $b, $a\\}, "
+ "tex.level.a1d.v4.f32.f32 \t\\{$r, $g, $b, $a\\}, "
"[$t, \\{$l, $x\\}], $lod;",
[]>;
def TEX_UNIFIED_1D_ARRAY_F32_F32_GRAD
@@ -2954,21 +2872,21 @@ def TEX_UNIFIED_1D_ARRAY_F32_F32_GRAD
Float32Regs:$b, Float32Regs:$a),
(ins Int64Regs:$t, Int32Regs:$l, Float32Regs:$x,
Float32Regs:$gradx, Float32Regs:$grady),
- "tex.grad.a1d.v4.f32.f32\t\\{$r, $g, $b, $a\\}, "
+ "tex.grad.a1d.v4.f32.f32 \t\\{$r, $g, $b, $a\\}, "
"[$t, \\{$l, $x\\}], \\{$gradx\\}, \\{$grady\\};",
[]>;
def TEX_UNIFIED_1D_ARRAY_S32_S32
: NVPTXInst<(outs Int32Regs:$r, Int32Regs:$g,
Int32Regs:$b, Int32Regs:$a),
(ins Int64Regs:$t, Int32Regs:$l, Int32Regs:$x),
- "tex.a1d.v4.s32.s32\t\\{$r, $g, $b, $a\\}, "
+ "tex.a1d.v4.s32.s32 \t\\{$r, $g, $b, $a\\}, "
"[$t, \\{$l, $x\\}];",
[]>;
def TEX_UNIFIED_1D_ARRAY_S32_F32
: NVPTXInst<(outs Int32Regs:$r, Int32Regs:$g,
Int32Regs:$b, Int32Regs:$a),
(ins Int64Regs:$t, Int32Regs:$l, Float32Regs:$x),
- "tex.a1d.v4.s32.f32\t\\{$r, $g, $b, $a\\}, "
+ "tex.a1d.v4.s32.f32 \t\\{$r, $g, $b, $a\\}, "
"[$t, \\{$l, $x\\}];",
[]>;
def TEX_UNIFIED_1D_ARRAY_S32_F32_LEVEL
@@ -2976,7 +2894,7 @@ def TEX_UNIFIED_1D_ARRAY_S32_F32_LEVEL
Int32Regs:$b, Int32Regs:$a),
(ins Int64Regs:$t, Int32Regs:$l, Float32Regs:$x,
Float32Regs:$lod),
- "tex.level.a1d.v4.s32.f32\t\\{$r, $g, $b, $a\\}, "
+ "tex.level.a1d.v4.s32.f32 \t\\{$r, $g, $b, $a\\}, "
"[$t, \\{$l, $x\\}], $lod;",
[]>;
def TEX_UNIFIED_1D_ARRAY_S32_F32_GRAD
@@ -2984,21 +2902,21 @@ def TEX_UNIFIED_1D_ARRAY_S32_F32_GRAD
Int32Regs:$b, Int32Regs:$a),
(ins Int64Regs:$t, Int32Regs:$l, Float32Regs:$x,
Float32Regs:$gradx, Float32Regs:$grady),
- "tex.grad.a1d.v4.s32.f32\t\\{$r, $g, $b, $a\\}, "
+ "tex.grad.a1d.v4.s32.f32 \t\\{$r, $g, $b, $a\\}, "
"[$t, \\{$l, $x\\}], \\{$gradx\\}, \\{$grady\\};",
[]>;
def TEX_UNIFIED_1D_ARRAY_U32_S32
: NVPTXInst<(outs Int32Regs:$r, Int32Regs:$g,
Int32Regs:$b, Int32Regs:$a),
(ins Int64Regs:$t, Int32Regs:$l, Int32Regs:$x),
- "tex.a1d.v4.u32.s32\t\\{$r, $g, $b, $a\\}, "
+ "tex.a1d.v4.u32.s32 \t\\{$r, $g, $b, $a\\}, "
"[$t, \\{$l, $x\\}];",
[]>;
def TEX_UNIFIED_1D_ARRAY_U32_F32
: NVPTXInst<(outs Int32Regs:$r, Int32Regs:$g,
Int32Regs:$b, Int32Regs:$a),
(ins Int64Regs:$t, Int32Regs:$l, Float32Regs:$x),
- "tex.a1d.v4.u32.f32\t\\{$r, $g, $b, $a\\}, "
+ "tex.a1d.v4.u32.f32 \t\\{$r, $g, $b, $a\\}, "
"[$t, \\{$l, $x\\}];",
[]>;
def TEX_UNIFIED_1D_ARRAY_U32_F32_LEVEL
@@ -3006,7 +2924,7 @@ def TEX_UNIFIED_1D_ARRAY_U32_F32_LEVEL
Int32Regs:$b, Int32Regs:$a),
(ins Int64Regs:$t, Int32Regs:$l, Float32Regs:$x,
Float32Regs:$lod),
- "tex.level.a1d.v4.u32.f32\t\\{$r, $g, $b, $a\\}, "
+ "tex.level.a1d.v4.u32.f32 \t\\{$r, $g, $b, $a\\}, "
"[$t, \\{$l, $x\\}], $lod;",
[]>;
def TEX_UNIFIED_1D_ARRAY_U32_F32_GRAD
@@ -3014,7 +2932,7 @@ def TEX_UNIFIED_1D_ARRAY_U32_F32_GRAD
Int32Regs:$b, Int32Regs:$a),
(ins Int64Regs:$t, Int32Regs:$l, Float32Regs:$x,
Float32Regs:$gradx, Float32Regs:$grady),
- "tex.grad.a1d.v4.u32.f32\t\\{$r, $g, $b, $a\\}, "
+ "tex.grad.a1d.v4.u32.f32 \t\\{$r, $g, $b, $a\\}, "
"[$t, \\{$l, $x\\}], \\{$gradx\\}, \\{$grady\\};",
[]>;
@@ -3022,14 +2940,14 @@ def TEX_UNIFIED_2D_F32_S32
: NVPTXInst<(outs Float32Regs:$r, Float32Regs:$g,
Float32Regs:$b, Float32Regs:$a),
(ins Int64Regs:$t, Int32Regs:$x, Int32Regs:$y),
- "tex.2d.v4.f32.s32\t\\{$r, $g, $b, $a\\}, "
+ "tex.2d.v4.f32.s32 \t\\{$r, $g, $b, $a\\}, "
"[$t, \\{$x, $y\\}];",
[]>;
def TEX_UNIFIED_2D_F32_F32
: NVPTXInst<(outs Float32Regs:$r, Float32Regs:$g,
Float32Regs:$b, Float32Regs:$a),
(ins Int64Regs:$t, Float32Regs:$x, Float32Regs:$y),
- "tex.2d.v4.f32.f32\t\\{$r, $g, $b, $a\\}, "
+ "tex.2d.v4.f32.f32 \t\\{$r, $g, $b, $a\\}, "
"[$t, \\{$x, $y\\}];",
[]>;
def TEX_UNIFIED_2D_F32_F32_LEVEL
@@ -3037,7 +2955,7 @@ def TEX_UNIFIED_2D_F32_F32_LEVEL
Float32Regs:$b, Float32Regs:$a),
(ins Int64Regs:$t, Float32Regs:$x, Float32Regs:$y,
Float32Regs:$lod),
- "tex.level.2d.v4.f32.f32\t\\{$r, $g, $b, $a\\}, "
+ "tex.level.2d.v4.f32.f32 \t\\{$r, $g, $b, $a\\}, "
"[$t, \\{$x, $y\\}], $lod;",
[]>;
def TEX_UNIFIED_2D_F32_F32_GRAD
@@ -3046,7 +2964,7 @@ def TEX_UNIFIED_2D_F32_F32_GRAD
(ins Int64Regs:$t, Float32Regs:$x, Float32Regs:$y,
Float32Regs:$gradx0, Float32Regs:$gradx1,
Float32Regs:$grady0, Float32Regs:$grady1),
- "tex.grad.2d.v4.f32.f32\t\\{$r, $g, $b, $a\\}, "
+ "tex.grad.2d.v4.f32.f32 \t\\{$r, $g, $b, $a\\}, "
"[$t, \\{$x, $y\\}], \\{$gradx0, $gradx1\\}, "
"\\{$grady0, $grady1\\};",
[]>;
@@ -3054,14 +2972,14 @@ def TEX_UNIFIED_2D_S32_S32
: NVPTXInst<(outs Int32Regs:$r, Int32Regs:$g,
Int32Regs:$b, Int32Regs:$a),
(ins Int64Regs:$t, Int32Regs:$x, Int32Regs:$y),
- "tex.2d.v4.s32.s32\t\\{$r, $g, $b, $a\\}, "
+ "tex.2d.v4.s32.s32 \t\\{$r, $g, $b, $a\\}, "
"[$t, \\{$x, $y\\}];",
[]>;
def TEX_UNIFIED_2D_S32_F32
: NVPTXInst<(outs Int32Regs:$r, Int32Regs:$g,
Int32Regs:$b, Int32Regs:$a),
(ins Int64Regs:$t, Float32Regs:$x, Float32Regs:$y),
- "tex.2d.v4.s32.f32\t\\{$r, $g, $b, $a\\}, "
+ "tex.2d.v4.s32.f32 \t\\{$r, $g, $b, $a\\}, "
"[$t, \\{$x, $y\\}];",
[]>;
def TEX_UNIFIED_2D_S32_F32_LEVEL
@@ -3069,7 +2987,7 @@ def TEX_UNIFIED_2D_S32_F32_LEVEL
Int32Regs:$b, Int32Regs:$a),
(ins Int64Regs:$t, Float32Regs:$x, Float32Regs:$y,
Float32Regs:$lod),
- "tex.level.2d.v4.s32.f32\t\\{$r, $g, $b, $a\\}, "
+ "tex.level.2d.v4.s32.f32 \t\\{$r, $g, $b, $a\\}, "
"[$t, \\{$x, $y\\}], $lod;",
[]>;
def TEX_UNIFIED_2D_S32_F32_GRAD
@@ -3078,7 +2996,7 @@ def TEX_UNIFIED_2D_S32_F32_GRAD
(ins Int64Regs:$t, Float32Regs:$x, Float32Regs:$y,
Float32Regs:$gradx0, Float32Regs:$gradx1,
Float32Regs:$grady0, Float32Regs:$grady1),
- "tex.grad.2d.v4.s32.f32\t\\{$r, $g, $b, $a\\}, "
+ "tex.grad.2d.v4.s32.f32 \t\\{$r, $g, $b, $a\\}, "
"[$t, \\{$x, $y\\}], \\{$gradx0, $gradx1\\}, "
"\\{$grady0, $grady1\\};",
[]>;
@@ -3086,14 +3004,14 @@ def TEX_UNIFIED_2D_U32_S32
: NVPTXInst<(outs Int32Regs:$r, Int32Regs:$g,
Int32Regs:$b, Int32Regs:$a),
(ins Int64Regs:$t, Int32Regs:$x, Int32Regs:$y),
- "tex.2d.v4.u32.s32\t\\{$r, $g, $b, $a\\}, "
+ "tex.2d.v4.u32.s32 \t\\{$r, $g, $b, $a\\}, "
"[$t, \\{$x, $y\\}];",
[]>;
def TEX_UNIFIED_2D_U32_F32
: NVPTXInst<(outs Int32Regs:$r, Int32Regs:$g,
Int32Regs:$b, Int32Regs:$a),
(ins Int64Regs:$t, Float32Regs:$x, Float32Regs:$y),
- "tex.2d.v4.u32.f32\t\\{$r, $g, $b, $a\\}, "
+ "tex.2d.v4.u32.f32 \t\\{$r, $g, $b, $a\\}, "
"[$t, \\{$x, $y\\}];",
[]>;
def TEX_UNIFIED_2D_U32_F32_LEVEL
@@ -3101,7 +3019,7 @@ def TEX_UNIFIED_2D_U32_F32_LEVEL
Int32Regs:$b, Int32Regs:$a),
(ins Int64Regs:$t, Float32Regs:$x, Float32Regs:$y,
Float32Regs:$lod),
- "tex.level.2d.v4.u32.f32\t\\{$r, $g, $b, $a\\}, "
+ "tex.level.2d.v4.u32.f32 \t\\{$r, $g, $b, $a\\}, "
"[$t, \\{$x, $y\\}], $lod;",
[]>;
def TEX_UNIFIED_2D_U32_F32_GRAD
@@ -3110,7 +3028,7 @@ def TEX_UNIFIED_2D_U32_F32_GRAD
(ins Int64Regs:$t, Float32Regs:$x, Float32Regs:$y,
Float32Regs:$gradx0, Float32Regs:$gradx1,
Float32Regs:$grady0, Float32Regs:$grady1),
- "tex.grad.2d.v4.u32.f32\t\\{$r, $g, $b, $a\\}, "
+ "tex.grad.2d.v4.u32.f32 \t\\{$r, $g, $b, $a\\}, "
"[$t, \\{$x, $y\\}], \\{$gradx0, $gradx1\\}, "
"\\{$grady0, $grady1\\};",
[]>;
@@ -3120,7 +3038,7 @@ def TEX_UNIFIED_2D_ARRAY_F32_S32
Float32Regs:$b, Float32Regs:$a),
(ins Int64Regs:$t, Int32Regs:$l, Int32Regs:$x,
Int32Regs:$y),
- "tex.a2d.v4.f32.s32\t\\{$r, $g, $b, $a\\}, "
+ "tex.a2d.v4.f32.s32 \t\\{$r, $g, $b, $a\\}, "
"[$t, \\{$l, $x, $y, $y\\}];",
[]>;
def TEX_UNIFIED_2D_ARRAY_F32_F32
@@ -3128,7 +3046,7 @@ def TEX_UNIFIED_2D_ARRAY_F32_F32
Float32Regs:$b, Float32Regs:$a),
(ins Int64Regs:$t, Int32Regs:$l, Float32Regs:$x,
Float32Regs:$y),
- "tex.a2d.v4.f32.f32\t\\{$r, $g, $b, $a\\}, "
+ "tex.a2d.v4.f32.f32 \t\\{$r, $g, $b, $a\\}, "
"[$t, \\{$l, $x, $y, $y\\}];",
[]>;
def TEX_UNIFIED_2D_ARRAY_F32_F32_LEVEL
@@ -3136,7 +3054,7 @@ def TEX_UNIFIED_2D_ARRAY_F32_F32_LEVEL
Float32Regs:$b, Float32Regs:$a),
(ins Int64Regs:$t, Int32Regs:$l, Float32Regs:$x,
Float32Regs:$y, Float32Regs:$lod),
- "tex.level.a2d.v4.f32.f32\t\\{$r, $g, $b, $a\\}, "
+ "tex.level.a2d.v4.f32.f32 \t\\{$r, $g, $b, $a\\}, "
"[$t, \\{$l, $x, $y, $y\\}], $lod;",
[]>;
def TEX_UNIFIED_2D_ARRAY_F32_F32_GRAD
@@ -3145,7 +3063,7 @@ def TEX_UNIFIED_2D_ARRAY_F32_F32_GRAD
(ins Int64Regs:$t, Int32Regs:$l, Float32Regs:$x,
Float32Regs:$y, Float32Regs:$gradx0, Float32Regs:$gradx1,
Float32Regs:$grady0, Float32Regs:$grady1),
- "tex.grad.a2d.v4.f32.f32\t\\{$r, $g, $b, $a\\}, "
+ "tex.grad.a2d.v4.f32.f32 \t\\{$r, $g, $b, $a\\}, "
"[$t, \\{$l, $x, $y, $y\\}], \\{$gradx0, $gradx1\\}, "
"\\{$grady0, $grady1\\};",
[]>;
@@ -3154,7 +3072,7 @@ def TEX_UNIFIED_2D_ARRAY_S32_S32
Int32Regs:$b, Int32Regs:$a),
(ins Int64Regs:$t, Int32Regs:$l, Int32Regs:$x,
Int32Regs:$y),
- "tex.a2d.v4.s32.s32\t\\{$r, $g, $b, $a\\}, "
+ "tex.a2d.v4.s32.s32 \t\\{$r, $g, $b, $a\\}, "
"[$t, \\{$l, $x, $y, $y\\}];",
[]>;
def TEX_UNIFIED_2D_ARRAY_S32_F32
@@ -3162,7 +3080,7 @@ def TEX_UNIFIED_2D_ARRAY_S32_F32
Int32Regs:$b, Int32Regs:$a),
(ins Int64Regs:$t, Int32Regs:$l, Float32Regs:$x,
Float32Regs:$y),
- "tex.a2d.v4.s32.f32\t\\{$r, $g, $b, $a\\}, "
+ "tex.a2d.v4.s32.f32 \t\\{$r, $g, $b, $a\\}, "
"[$t, \\{$l, $x, $y, $y\\}];",
[]>;
def TEX_UNIFIED_2D_ARRAY_S32_F32_LEVEL
@@ -3170,7 +3088,7 @@ def TEX_UNIFIED_2D_ARRAY_S32_F32_LEVEL
Int32Regs:$b, Int32Regs:$a),
(ins Int64Regs:$t, Int32Regs:$l, Float32Regs:$x,
Float32Regs:$y, Float32Regs:$lod),
- "tex.level.a2d.v4.s32.f32\t\\{$r, $g, $b, $a\\}, "
+ "tex.level.a2d.v4.s32.f32 \t\\{$r, $g, $b, $a\\}, "
"[$t, \\{$l, $x, $y, $y\\}], $lod;",
[]>;
def TEX_UNIFIED_2D_ARRAY_S32_F32_GRAD
@@ -3180,7 +3098,7 @@ def TEX_UNIFIED_2D_ARRAY_S32_F32_GRAD
Float32Regs:$y,
Float32Regs:$gradx0, Float32Regs:$gradx1,
Float32Regs:$grady0, Float32Regs:$grady1),
- "tex.grad.a2d.v4.s32.f32\t\\{$r, $g, $b, $a\\}, "
+ "tex.grad.a2d.v4.s32.f32 \t\\{$r, $g, $b, $a\\}, "
"[$t, \\{$l, $x, $y, $y\\}], \\{$gradx0, $gradx1\\}, "
"\\{$grady0, $grady1\\};",
[]>;
@@ -3189,7 +3107,7 @@ def TEX_UNIFIED_2D_ARRAY_U32_S32
Int32Regs:$b, Int32Regs:$a),
(ins Int64Regs:$t, Int32Regs:$l, Int32Regs:$x,
Int32Regs:$y),
- "tex.a2d.v4.u32.s32\t\\{$r, $g, $b, $a\\}, "
+ "tex.a2d.v4.u32.s32 \t\\{$r, $g, $b, $a\\}, "
"[$t, \\{$l, $x, $y, $y\\}];",
[]>;
def TEX_UNIFIED_2D_ARRAY_U32_F32
@@ -3197,7 +3115,7 @@ def TEX_UNIFIED_2D_ARRAY_U32_F32
Int32Regs:$b, Int32Regs:$a),
(ins Int64Regs:$t, Int32Regs:$l, Float32Regs:$x,
Float32Regs:$y),
- "tex.a2d.v4.u32.f32\t\\{$r, $g, $b, $a\\}, "
+ "tex.a2d.v4.u32.f32 \t\\{$r, $g, $b, $a\\}, "
"[$t, \\{$l, $x, $y, $y\\}];",
[]>;
def TEX_UNIFIED_2D_ARRAY_U32_F32_LEVEL
@@ -3205,7 +3123,7 @@ def TEX_UNIFIED_2D_ARRAY_U32_F32_LEVEL
Int32Regs:$b, Int32Regs:$a),
(ins Int64Regs:$t, Int32Regs:$l, Float32Regs:$x,
Float32Regs:$y, Float32Regs:$lod),
- "tex.level.a2d.v4.u32.f32\t\\{$r, $g, $b, $a\\}, "
+ "tex.level.a2d.v4.u32.f32 \t\\{$r, $g, $b, $a\\}, "
"[$t, \\{$l, $x, $y, $y\\}], $lod;",
[]>;
def TEX_UNIFIED_2D_ARRAY_U32_F32_GRAD
@@ -3215,7 +3133,7 @@ def TEX_UNIFIED_2D_ARRAY_U32_F32_GRAD
Float32Regs:$y,
Float32Regs:$gradx0, Float32Regs:$gradx1,
Float32Regs:$grady0, Float32Regs:$grady1),
- "tex.grad.a2d.v4.u32.f32\t\\{$r, $g, $b, $a\\}, "
+ "tex.grad.a2d.v4.u32.f32 \t\\{$r, $g, $b, $a\\}, "
"[$t, \\{$l, $x, $y, $y\\}], \\{$gradx0, $gradx1\\}, "
"\\{$grady0, $grady1\\};",
[]>;
@@ -3225,7 +3143,7 @@ def TEX_UNIFIED_3D_F32_S32
Float32Regs:$b, Float32Regs:$a),
(ins Int64Regs:$t, Int32Regs:$x, Int32Regs:$y,
Int32Regs:$z),
- "tex.3d.v4.f32.s32\t\\{$r, $g, $b, $a\\}, "
+ "tex.3d.v4.f32.s32 \t\\{$r, $g, $b, $a\\}, "
"[$t, \\{$x, $y, $z, $z\\}];",
[]>;
def TEX_UNIFIED_3D_F32_F32
@@ -3233,7 +3151,7 @@ def TEX_UNIFIED_3D_F32_F32
Float32Regs:$b, Float32Regs:$a),
(ins Int64Regs:$t, Float32Regs:$x, Float32Regs:$y,
Float32Regs:$z),
- "tex.3d.v4.f32.f32\t\\{$r, $g, $b, $a\\}, "
+ "tex.3d.v4.f32.f32 \t\\{$r, $g, $b, $a\\}, "
"[$t, \\{$x, $y, $z, $z\\}];",
[]>;
def TEX_UNIFIED_3D_F32_F32_LEVEL
@@ -3241,7 +3159,7 @@ def TEX_UNIFIED_3D_F32_F32_LEVEL
Float32Regs:$b, Float32Regs:$a),
(ins Int64Regs:$t, Float32Regs:$x, Float32Regs:$y,
Float32Regs:$z, Float32Regs:$lod),
- "tex.level.3d.v4.f32.f32\t\\{$r, $g, $b, $a\\}, "
+ "tex.level.3d.v4.f32.f32 \t\\{$r, $g, $b, $a\\}, "
"[$t, \\{$x, $y, $z, $z\\}], $lod;",
[]>;
def TEX_UNIFIED_3D_F32_F32_GRAD
@@ -3252,7 +3170,7 @@ def TEX_UNIFIED_3D_F32_F32_GRAD
Float32Regs:$gradx0, Float32Regs:$gradx1,
Float32Regs:$gradx2, Float32Regs:$grady0,
Float32Regs:$grady1, Float32Regs:$grady2),
- "tex.grad.3d.v4.f32.f32\t\\{$r, $g, $b, $a\\}, "
+ "tex.grad.3d.v4.f32.f32 \t\\{$r, $g, $b, $a\\}, "
"[$t, \\{$x, $y, $z, $z\\}], "
"\\{$gradx0, $gradx1, $gradx2, $gradx2\\}, "
"\\{$grady0, $grady1, $grady2, $grady2\\};",
@@ -3262,7 +3180,7 @@ def TEX_UNIFIED_3D_S32_S32
Int32Regs:$b, Int32Regs:$a),
(ins Int64Regs:$t, Int32Regs:$x, Int32Regs:$y,
Int32Regs:$z),
- "tex.3d.v4.s32.s32\t\\{$r, $g, $b, $a\\}, "
+ "tex.3d.v4.s32.s32 \t\\{$r, $g, $b, $a\\}, "
"[$t, \\{$x, $y, $z, $z\\}];",
[]>;
def TEX_UNIFIED_3D_S32_F32
@@ -3270,7 +3188,7 @@ def TEX_UNIFIED_3D_S32_F32
Int32Regs:$b, Int32Regs:$a),
(ins Int64Regs:$t, Float32Regs:$x, Float32Regs:$y,
Float32Regs:$z),
- "tex.3d.v4.s32.f32\t\\{$r, $g, $b, $a\\}, "
+ "tex.3d.v4.s32.f32 \t\\{$r, $g, $b, $a\\}, "
"[$t, \\{$x, $y, $z, $z\\}];",
[]>;
def TEX_UNIFIED_3D_S32_F32_LEVEL
@@ -3278,7 +3196,7 @@ def TEX_UNIFIED_3D_S32_F32_LEVEL
Int32Regs:$b, Int32Regs:$a),
(ins Int64Regs:$t, Float32Regs:$x, Float32Regs:$y,
Float32Regs:$z, Float32Regs:$lod),
- "tex.level.3d.v4.s32.f32\t\\{$r, $g, $b, $a\\}, "
+ "tex.level.3d.v4.s32.f32 \t\\{$r, $g, $b, $a\\}, "
"[$t, \\{$x, $y, $z, $z\\}], $lod;",
[]>;
def TEX_UNIFIED_3D_S32_F32_GRAD
@@ -3289,7 +3207,7 @@ def TEX_UNIFIED_3D_S32_F32_GRAD
Float32Regs:$gradx0, Float32Regs:$gradx1,
Float32Regs:$gradx2, Float32Regs:$grady0,
Float32Regs:$grady1, Float32Regs:$grady2),
- "tex.grad.3d.v4.s32.f32\t\\{$r, $g, $b, $a\\}, "
+ "tex.grad.3d.v4.s32.f32 \t\\{$r, $g, $b, $a\\}, "
"[$t, \\{$x, $y, $z, $z\\}], "
"\\{$gradx0, $gradx1, $gradx2, $gradx2\\}, "
"\\{$grady0, $grady1, $grady2, $grady2\\};",
@@ -3299,7 +3217,7 @@ def TEX_UNIFIED_3D_U32_S32
Int32Regs:$b, Int32Regs:$a),
(ins Int64Regs:$t, Int32Regs:$x, Int32Regs:$y,
Int32Regs:$z),
- "tex.3d.v4.u32.s32\t\\{$r, $g, $b, $a\\}, "
+ "tex.3d.v4.u32.s32 \t\\{$r, $g, $b, $a\\}, "
"[$t, \\{$x, $y, $z, $z\\}];",
[]>;
def TEX_UNIFIED_3D_U32_F32
@@ -3307,7 +3225,7 @@ def TEX_UNIFIED_3D_U32_F32
Int32Regs:$b, Int32Regs:$a),
(ins Int64Regs:$t, Float32Regs:$x, Float32Regs:$y,
Float32Regs:$z),
- "tex.3d.v4.u32.f32\t\\{$r, $g, $b, $a\\}, "
+ "tex.3d.v4.u32.f32 \t\\{$r, $g, $b, $a\\}, "
"[$t, \\{$x, $y, $z, $z\\}];",
[]>;
def TEX_UNIFIED_3D_U32_F32_LEVEL
@@ -3315,7 +3233,7 @@ def TEX_UNIFIED_3D_U32_F32_LEVEL
Int32Regs:$b, Int32Regs:$a),
(ins Int64Regs:$t, Float32Regs:$x, Float32Regs:$y,
Float32Regs:$z, Float32Regs:$lod),
- "tex.level.3d.v4.u32.f32\t\\{$r, $g, $b, $a\\}, "
+ "tex.level.3d.v4.u32.f32 \t\\{$r, $g, $b, $a\\}, "
"[$t, \\{$x, $y, $z, $z\\}], $lod;",
[]>;
def TEX_UNIFIED_3D_U32_F32_GRAD
@@ -3326,7 +3244,7 @@ def TEX_UNIFIED_3D_U32_F32_GRAD
Float32Regs:$gradx0, Float32Regs:$gradx1,
Float32Regs:$gradx2, Float32Regs:$grady0,
Float32Regs:$grady1, Float32Regs:$grady2),
- "tex.grad.3d.v4.u32.f32\t\\{$r, $g, $b, $a\\}, "
+ "tex.grad.3d.v4.u32.f32 \t\\{$r, $g, $b, $a\\}, "
"[$t, \\{$x, $y, $z, $z\\}], "
"\\{$gradx0, $gradx1, $gradx2, $gradx2\\}, "
"\\{$grady0, $grady1, $grady2, $grady2\\};",
@@ -3337,7 +3255,7 @@ def TEX_UNIFIED_CUBE_F32_F32
Float32Regs:$b, Float32Regs:$a),
(ins Int64Regs:$t,
Float32Regs:$x, Float32Regs:$y, Float32Regs:$z),
- "tex.cube.v4.f32.f32\t\\{$r, $g, $b, $a\\}, "
+ "tex.cube.v4.f32.f32 \t\\{$r, $g, $b, $a\\}, "
"[$t, \\{$x, $y, $z, $z\\}];",
[]>;
def TEX_UNIFIED_CUBE_F32_F32_LEVEL
@@ -3346,7 +3264,7 @@ def TEX_UNIFIED_CUBE_F32_F32_LEVEL
(ins Int64Regs:$t,
Float32Regs:$x, Float32Regs:$y, Float32Regs:$z,
Float32Regs:$lod),
- "tex.level.cube.v4.f32.f32\t\\{$r, $g, $b, $a\\}, "
+ "tex.level.cube.v4.f32.f32 \t\\{$r, $g, $b, $a\\}, "
"[$t, \\{$x, $y, $z, $z\\}], $lod;",
[]>;
def TEX_UNIFIED_CUBE_S32_F32
@@ -3354,7 +3272,7 @@ def TEX_UNIFIED_CUBE_S32_F32
Int32Regs:$b, Int32Regs:$a),
(ins Int64Regs:$t,
Float32Regs:$x, Float32Regs:$y, Float32Regs:$z),
- "tex.cube.v4.s32.f32\t\\{$r, $g, $b, $a\\}, "
+ "tex.cube.v4.s32.f32 \t\\{$r, $g, $b, $a\\}, "
"[$t, \\{$x, $y, $z, $z\\}];",
[]>;
def TEX_UNIFIED_CUBE_S32_F32_LEVEL
@@ -3363,7 +3281,7 @@ def TEX_UNIFIED_CUBE_S32_F32_LEVEL
(ins Int64Regs:$t,
Float32Regs:$x, Float32Regs:$y, Float32Regs:$z,
Float32Regs:$lod),
- "tex.level.cube.v4.s32.f32\t\\{$r, $g, $b, $a\\}, "
+ "tex.level.cube.v4.s32.f32 \t\\{$r, $g, $b, $a\\}, "
"[$t, \\{$x, $y, $z, $z\\}], $lod;",
[]>;
def TEX_UNIFIED_CUBE_U32_F32
@@ -3371,7 +3289,7 @@ def TEX_UNIFIED_CUBE_U32_F32
Int32Regs:$b, Int32Regs:$a),
(ins Int64Regs:$t,
Float32Regs:$x, Float32Regs:$y, Float32Regs:$z),
- "tex.cube.v4.u32.f32\t\\{$r, $g, $b, $a\\}, "
+ "tex.cube.v4.u32.f32 \t\\{$r, $g, $b, $a\\}, "
"[$t, \\{$x, $y, $z, $z\\}];",
[]>;
def TEX_UNIFIED_CUBE_U32_F32_LEVEL
@@ -3380,7 +3298,7 @@ def TEX_UNIFIED_CUBE_U32_F32_LEVEL
(ins Int64Regs:$t,
Float32Regs:$x, Float32Regs:$y, Float32Regs:$z,
Float32Regs:$lod),
- "tex.level.cube.v4.u32.f32\t\\{$r, $g, $b, $a\\}, "
+ "tex.level.cube.v4.u32.f32 \t\\{$r, $g, $b, $a\\}, "
"[$t, \\{$x, $y, $z, $z\\}], $lod;",
[]>;
@@ -3389,7 +3307,7 @@ def TEX_UNIFIED_CUBE_ARRAY_F32_F32
Float32Regs:$b, Float32Regs:$a),
(ins Int64Regs:$t, Int32Regs:$l,
Float32Regs:$x, Float32Regs:$y, Float32Regs:$z),
- "tex.acube.v4.f32.f32\t\\{$r, $g, $b, $a\\}, "
+ "tex.acube.v4.f32.f32 \t\\{$r, $g, $b, $a\\}, "
"[$t, \\{$l, $x, $y, $z\\}];",
[]>;
def TEX_UNIFIED_CUBE_ARRAY_F32_F32_LEVEL
@@ -3398,7 +3316,7 @@ def TEX_UNIFIED_CUBE_ARRAY_F32_F32_LEVEL
(ins Int64Regs:$t, Int32Regs:$l,
Float32Regs:$x, Float32Regs:$y, Float32Regs:$z,
Float32Regs:$lod),
- "tex.level.acube.v4.f32.f32\t\\{$r, $g, $b, $a\\}, "
+ "tex.level.acube.v4.f32.f32 \t\\{$r, $g, $b, $a\\}, "
"[$t, \\{$l, $x, $y, $z\\}], $lod;",
[]>;
def TEX_UNIFIED_CUBE_ARRAY_S32_F32
@@ -3406,7 +3324,7 @@ def TEX_UNIFIED_CUBE_ARRAY_S32_F32
Int32Regs:$b, Int32Regs:$a),
(ins Int64Regs:$t, Int32Regs:$l,
Float32Regs:$x, Float32Regs:$y, Float32Regs:$z),
- "tex.acube.v4.s32.f32\t\\{$r, $g, $b, $a\\}, "
+ "tex.acube.v4.s32.f32 \t\\{$r, $g, $b, $a\\}, "
"[$t, \\{$l, $x, $y, $z\\}];",
[]>;
def TEX_UNIFIED_CUBE_ARRAY_S32_F32_LEVEL
@@ -3415,7 +3333,7 @@ def TEX_UNIFIED_CUBE_ARRAY_S32_F32_LEVEL
(ins Int64Regs:$t, Int32Regs:$l,
Float32Regs:$x, Float32Regs:$y, Float32Regs:$z,
Float32Regs:$lod),
- "tex.level.acube.v4.s32.f32\t\\{$r, $g, $b, $a\\}, "
+ "tex.level.acube.v4.s32.f32 \t\\{$r, $g, $b, $a\\}, "
"[$t, \\{$l, $x, $y, $z\\}], $lod;",
[]>;
def TEX_UNIFIED_CUBE_ARRAY_U32_F32
@@ -3423,7 +3341,7 @@ def TEX_UNIFIED_CUBE_ARRAY_U32_F32
Int32Regs:$b, Int32Regs:$a),
(ins Int64Regs:$t, Int32Regs:$l,
Float32Regs:$x, Float32Regs:$y, Float32Regs:$z),
- "tex.acube.v4.u32.f32\t\\{$r, $g, $b, $a\\}, "
+ "tex.acube.v4.u32.f32 \t\\{$r, $g, $b, $a\\}, "
"[$t, \\{$l, $x, $y, $z\\}];",
[]>;
def TEX_UNIFIED_CUBE_ARRAY_U32_F32_LEVEL
@@ -3432,7 +3350,7 @@ def TEX_UNIFIED_CUBE_ARRAY_U32_F32_LEVEL
(ins Int64Regs:$t, Int32Regs:$l,
Float32Regs:$x, Float32Regs:$y, Float32Regs:$z,
Float32Regs:$lod),
- "tex.level.acube.v4.u32.f32\t\\{$r, $g, $b, $a\\}, "
+ "tex.level.acube.v4.u32.f32 \t\\{$r, $g, $b, $a\\}, "
"[$t, \\{$l, $x, $y, $z\\}], $lod;",
[]>;
@@ -3440,84 +3358,84 @@ def TLD4_UNIFIED_R_2D_F32_F32
: NVPTXInst<(outs Float32Regs:$v0, Float32Regs:$v1,
Float32Regs:$v2, Float32Regs:$v3),
(ins Int64Regs:$t, Float32Regs:$x, Float32Regs:$y),
- "tld4.r.2d.v4.f32.f32\t\\{$v0, $v1, $v2, $v3\\}, "
+ "tld4.r.2d.v4.f32.f32 \t\\{$v0, $v1, $v2, $v3\\}, "
"[$t, \\{$x, $y\\}];",
[]>;
def TLD4_UNIFIED_G_2D_F32_F32
: NVPTXInst<(outs Float32Regs:$v0, Float32Regs:$v1,
Float32Regs:$v2, Float32Regs:$v3),
(ins Int64Regs:$t, Float32Regs:$x, Float32Regs:$y),
- "tld4.g.2d.v4.f32.f32\t\\{$v0, $v1, $v2, $v3\\}, "
+ "tld4.g.2d.v4.f32.f32 \t\\{$v0, $v1, $v2, $v3\\}, "
"[$t, \\{$x, $y\\}];",
[]>;
def TLD4_UNIFIED_B_2D_F32_F32
: NVPTXInst<(outs Float32Regs:$v0, Float32Regs:$v1,
Float32Regs:$v2, Float32Regs:$v3),
(ins Int64Regs:$t, Float32Regs:$x, Float32Regs:$y),
- "tld4.b.2d.v4.f32.f32\t\\{$v0, $v1, $v2, $v3\\}, "
+ "tld4.b.2d.v4.f32.f32 \t\\{$v0, $v1, $v2, $v3\\}, "
"[$t, \\{$x, $y\\}];",
[]>;
def TLD4_UNIFIED_A_2D_F32_F32
: NVPTXInst<(outs Float32Regs:$v0, Float32Regs:$v1,
Float32Regs:$v2, Float32Regs:$v3),
(ins Int64Regs:$t, Float32Regs:$x, Float32Regs:$y),
- "tld4.a.2d.v4.f32.f32\t\\{$v0, $v1, $v2, $v3\\}, "
+ "tld4.a.2d.v4.f32.f32 \t\\{$v0, $v1, $v2, $v3\\}, "
"[$t, \\{$x, $y\\}];",
[]>;
def TLD4_UNIFIED_R_2D_S32_F32
: NVPTXInst<(outs Int32Regs:$v0, Int32Regs:$v1,
Int32Regs:$v2, Int32Regs:$v3),
(ins Int64Regs:$t, Float32Regs:$x, Float32Regs:$y),
- "tld4.r.2d.v4.s32.f32\t\\{$v0, $v1, $v2, $v3\\}, "
+ "tld4.r.2d.v4.s32.f32 \t\\{$v0, $v1, $v2, $v3\\}, "
"[$t, \\{$x, $y\\}];",
[]>;
def TLD4_UNIFIED_G_2D_S32_F32
: NVPTXInst<(outs Int32Regs:$v0, Int32Regs:$v1,
Int32Regs:$v2, Int32Regs:$v3),
(ins Int64Regs:$t, Float32Regs:$x, Float32Regs:$y),
- "tld4.g.2d.v4.s32.f32\t\\{$v0, $v1, $v2, $v3\\}, "
+ "tld4.g.2d.v4.s32.f32 \t\\{$v0, $v1, $v2, $v3\\}, "
"[$t, \\{$x, $y\\}];",
[]>;
def TLD4_UNIFIED_B_2D_S32_F32
: NVPTXInst<(outs Int32Regs:$v0, Int32Regs:$v1,
Int32Regs:$v2, Int32Regs:$v3),
(ins Int64Regs:$t, Float32Regs:$x, Float32Regs:$y),
- "tld4.b.2d.v4.s32.f32\t\\{$v0, $v1, $v2, $v3\\}, "
+ "tld4.b.2d.v4.s32.f32 \t\\{$v0, $v1, $v2, $v3\\}, "
"[$t, \\{$x, $y\\}];",
[]>;
def TLD4_UNIFIED_A_2D_S32_F32
: NVPTXInst<(outs Int32Regs:$v0, Int32Regs:$v1,
Int32Regs:$v2, Int32Regs:$v3),
(ins Int64Regs:$t, Float32Regs:$x, Float32Regs:$y),
- "tld4.a.2d.v4.s32.f32\t\\{$v0, $v1, $v2, $v3\\}, "
+ "tld4.a.2d.v4.s32.f32 \t\\{$v0, $v1, $v2, $v3\\}, "
"[$t, \\{$x, $y\\}];",
[]>;
def TLD4_UNIFIED_R_2D_U32_F32
: NVPTXInst<(outs Int32Regs:$v0, Int32Regs:$v1,
Int32Regs:$v2, Int32Regs:$v3),
(ins Int64Regs:$t, Float32Regs:$x, Float32Regs:$y),
- "tld4.r.2d.v4.u32.f32\t\\{$v0, $v1, $v2, $v3\\}, "
+ "tld4.r.2d.v4.u32.f32 \t\\{$v0, $v1, $v2, $v3\\}, "
"[$t, \\{$x, $y\\}];",
[]>;
def TLD4_UNIFIED_G_2D_U32_F32
: NVPTXInst<(outs Int32Regs:$v0, Int32Regs:$v1,
Int32Regs:$v2, Int32Regs:$v3),
(ins Int64Regs:$t, Float32Regs:$x, Float32Regs:$y),
- "tld4.g.2d.v4.u32.f32\t\\{$v0, $v1, $v2, $v3\\}, "
+ "tld4.g.2d.v4.u32.f32 \t\\{$v0, $v1, $v2, $v3\\}, "
"[$t, \\{$x, $y\\}];",
[]>;
def TLD4_UNIFIED_B_2D_U32_F32
: NVPTXInst<(outs Int32Regs:$v0, Int32Regs:$v1,
Int32Regs:$v2, Int32Regs:$v3),
(ins Int64Regs:$t, Float32Regs:$x, Float32Regs:$y),
- "tld4.b.2d.v4.u32.f32\t\\{$v0, $v1, $v2, $v3\\}, "
+ "tld4.b.2d.v4.u32.f32 \t\\{$v0, $v1, $v2, $v3\\}, "
"[$t, \\{$x, $y\\}];",
[]>;
def TLD4_UNIFIED_A_2D_U32_F32
: NVPTXInst<(outs Int32Regs:$v0, Int32Regs:$v1,
Int32Regs:$v2, Int32Regs:$v3),
(ins Int64Regs:$t, Float32Regs:$x, Float32Regs:$y),
- "tld4.a.2d.v4.u32.f32\t\\{$v0, $v1, $v2, $v3\\}, "
+ "tld4.a.2d.v4.u32.f32 \t\\{$v0, $v1, $v2, $v3\\}, "
"[$t, \\{$x, $y\\}];",
[]>;
}
@@ -7172,12 +7090,12 @@ def : Pat<(int_nvvm_sust_p_3d_v4i32_trap
class PTX_READ_SREG_R64<string regname, Intrinsic intop>
: NVPTXInst<(outs Int64Regs:$d), (ins),
- !strconcat(!strconcat("mov.u64\t$d, %", regname), ";"),
+ !strconcat("mov.u64 \t$d, %", regname, ";"),
[(set Int64Regs:$d, (intop))]>;
class PTX_READ_SREG_R32<string regname, Intrinsic intop>
: NVPTXInst<(outs Int32Regs:$d), (ins),
- !strconcat(!strconcat("mov.u32\t$d, %", regname), ";"),
+ !strconcat("mov.u32 \t$d, %", regname, ";"),
[(set Int32Regs:$d, (intop))]>;
// TODO: Add reads of the vector versions of the special registers.
diff --git a/lib/Target/NVPTX/NVPTXLowerAggrCopies.cpp b/lib/Target/NVPTX/NVPTXLowerAggrCopies.cpp
index b925b632ee4a..3be291b48b8f 100644
--- a/lib/Target/NVPTX/NVPTXLowerAggrCopies.cpp
+++ b/lib/Target/NVPTX/NVPTXLowerAggrCopies.cpp
@@ -26,6 +26,7 @@
#include "llvm/IR/Module.h"
#include "llvm/Support/Debug.h"
#include "llvm/Transforms/Utils/BasicBlockUtils.h"
+#include "llvm/Transforms/Utils/LowerMemIntrinsics.h"
#define DEBUG_TYPE "nvptx"
@@ -54,188 +55,6 @@ struct NVPTXLowerAggrCopies : public FunctionPass {
char NVPTXLowerAggrCopies::ID = 0;
-// Lower memcpy to loop.
-void convertMemCpyToLoop(Instruction *ConvertedInst, Value *SrcAddr,
- Value *DstAddr, Value *CopyLen, bool SrcIsVolatile,
- bool DstIsVolatile, LLVMContext &Context,
- Function &F) {
- Type *TypeOfCopyLen = CopyLen->getType();
-
- BasicBlock *OrigBB = ConvertedInst->getParent();
- BasicBlock *NewBB =
- ConvertedInst->getParent()->splitBasicBlock(ConvertedInst, "split");
- BasicBlock *LoopBB = BasicBlock::Create(Context, "loadstoreloop", &F, NewBB);
-
- OrigBB->getTerminator()->setSuccessor(0, LoopBB);
- IRBuilder<> Builder(OrigBB->getTerminator());
-
- // SrcAddr and DstAddr are expected to be pointer types,
- // so no check is made here.
- unsigned SrcAS = cast<PointerType>(SrcAddr->getType())->getAddressSpace();
- unsigned DstAS = cast<PointerType>(DstAddr->getType())->getAddressSpace();
-
- // Cast pointers to (char *)
- SrcAddr = Builder.CreateBitCast(SrcAddr, Builder.getInt8PtrTy(SrcAS));
- DstAddr = Builder.CreateBitCast(DstAddr, Builder.getInt8PtrTy(DstAS));
-
- IRBuilder<> LoopBuilder(LoopBB);
- PHINode *LoopIndex = LoopBuilder.CreatePHI(TypeOfCopyLen, 0);
- LoopIndex->addIncoming(ConstantInt::get(TypeOfCopyLen, 0), OrigBB);
-
- // load from SrcAddr+LoopIndex
- // TODO: we can leverage the align parameter of llvm.memcpy for more efficient
- // word-sized loads and stores.
- Value *Element =
- LoopBuilder.CreateLoad(LoopBuilder.CreateInBoundsGEP(
- LoopBuilder.getInt8Ty(), SrcAddr, LoopIndex),
- SrcIsVolatile);
- // store at DstAddr+LoopIndex
- LoopBuilder.CreateStore(Element,
- LoopBuilder.CreateInBoundsGEP(LoopBuilder.getInt8Ty(),
- DstAddr, LoopIndex),
- DstIsVolatile);
-
- // The value for LoopIndex coming from backedge is (LoopIndex + 1)
- Value *NewIndex =
- LoopBuilder.CreateAdd(LoopIndex, ConstantInt::get(TypeOfCopyLen, 1));
- LoopIndex->addIncoming(NewIndex, LoopBB);
-
- LoopBuilder.CreateCondBr(LoopBuilder.CreateICmpULT(NewIndex, CopyLen), LoopBB,
- NewBB);
-}
-
-// Lower memmove to IR. memmove is required to correctly copy overlapping memory
-// regions; therefore, it has to check the relative positions of the source and
-// destination pointers and choose the copy direction accordingly.
-//
-// The code below is an IR rendition of this C function:
-//
-// void* memmove(void* dst, const void* src, size_t n) {
-// unsigned char* d = dst;
-// const unsigned char* s = src;
-// if (s < d) {
-// // copy backwards
-// while (n--) {
-// d[n] = s[n];
-// }
-// } else {
-// // copy forward
-// for (size_t i = 0; i < n; ++i) {
-// d[i] = s[i];
-// }
-// }
-// return dst;
-// }
-void convertMemMoveToLoop(Instruction *ConvertedInst, Value *SrcAddr,
- Value *DstAddr, Value *CopyLen, bool SrcIsVolatile,
- bool DstIsVolatile, LLVMContext &Context,
- Function &F) {
- Type *TypeOfCopyLen = CopyLen->getType();
- BasicBlock *OrigBB = ConvertedInst->getParent();
-
- // Create a comparison of src and dst, based on which we jump to either
- // the forward-copy part of the function (if src >= dst) or the backwards-copy
- // part (if src < dst).
- // SplitBlockAndInsertIfThenElse conveniently creates the basic if-then-else
- // structure. Its block terminators (unconditional branches) are replaced by
- // the appropriate conditional branches when the loop is built.
- ICmpInst *PtrCompare = new ICmpInst(ConvertedInst, ICmpInst::ICMP_ULT,
- SrcAddr, DstAddr, "compare_src_dst");
- TerminatorInst *ThenTerm, *ElseTerm;
- SplitBlockAndInsertIfThenElse(PtrCompare, ConvertedInst, &ThenTerm,
- &ElseTerm);
-
- // Each part of the function consists of two blocks:
- // copy_backwards: used to skip the loop when n == 0
- // copy_backwards_loop: the actual backwards loop BB
- // copy_forward: used to skip the loop when n == 0
- // copy_forward_loop: the actual forward loop BB
- BasicBlock *CopyBackwardsBB = ThenTerm->getParent();
- CopyBackwardsBB->setName("copy_backwards");
- BasicBlock *CopyForwardBB = ElseTerm->getParent();
- CopyForwardBB->setName("copy_forward");
- BasicBlock *ExitBB = ConvertedInst->getParent();
- ExitBB->setName("memmove_done");
-
- // Initial comparison of n == 0 that lets us skip the loops altogether. Shared
- // between both backwards and forward copy clauses.
- ICmpInst *CompareN =
- new ICmpInst(OrigBB->getTerminator(), ICmpInst::ICMP_EQ, CopyLen,
- ConstantInt::get(TypeOfCopyLen, 0), "compare_n_to_0");
-
- // Copying backwards.
- BasicBlock *LoopBB =
- BasicBlock::Create(Context, "copy_backwards_loop", &F, CopyForwardBB);
- IRBuilder<> LoopBuilder(LoopBB);
- PHINode *LoopPhi = LoopBuilder.CreatePHI(TypeOfCopyLen, 0);
- Value *IndexPtr = LoopBuilder.CreateSub(
- LoopPhi, ConstantInt::get(TypeOfCopyLen, 1), "index_ptr");
- Value *Element = LoopBuilder.CreateLoad(
- LoopBuilder.CreateInBoundsGEP(SrcAddr, IndexPtr), "element");
- LoopBuilder.CreateStore(Element,
- LoopBuilder.CreateInBoundsGEP(DstAddr, IndexPtr));
- LoopBuilder.CreateCondBr(
- LoopBuilder.CreateICmpEQ(IndexPtr, ConstantInt::get(TypeOfCopyLen, 0)),
- ExitBB, LoopBB);
- LoopPhi->addIncoming(IndexPtr, LoopBB);
- LoopPhi->addIncoming(CopyLen, CopyBackwardsBB);
- BranchInst::Create(ExitBB, LoopBB, CompareN, ThenTerm);
- ThenTerm->eraseFromParent();
-
- // Copying forward.
- BasicBlock *FwdLoopBB =
- BasicBlock::Create(Context, "copy_forward_loop", &F, ExitBB);
- IRBuilder<> FwdLoopBuilder(FwdLoopBB);
- PHINode *FwdCopyPhi = FwdLoopBuilder.CreatePHI(TypeOfCopyLen, 0, "index_ptr");
- Value *FwdElement = FwdLoopBuilder.CreateLoad(
- FwdLoopBuilder.CreateInBoundsGEP(SrcAddr, FwdCopyPhi), "element");
- FwdLoopBuilder.CreateStore(
- FwdElement, FwdLoopBuilder.CreateInBoundsGEP(DstAddr, FwdCopyPhi));
- Value *FwdIndexPtr = FwdLoopBuilder.CreateAdd(
- FwdCopyPhi, ConstantInt::get(TypeOfCopyLen, 1), "index_increment");
- FwdLoopBuilder.CreateCondBr(FwdLoopBuilder.CreateICmpEQ(FwdIndexPtr, CopyLen),
- ExitBB, FwdLoopBB);
- FwdCopyPhi->addIncoming(FwdIndexPtr, FwdLoopBB);
- FwdCopyPhi->addIncoming(ConstantInt::get(TypeOfCopyLen, 0), CopyForwardBB);
-
- BranchInst::Create(ExitBB, FwdLoopBB, CompareN, ElseTerm);
- ElseTerm->eraseFromParent();
-}
-
-// Lower memset to loop.
-void convertMemSetToLoop(Instruction *ConvertedInst, Value *DstAddr,
- Value *CopyLen, Value *SetValue, LLVMContext &Context,
- Function &F) {
- BasicBlock *OrigBB = ConvertedInst->getParent();
- BasicBlock *NewBB =
- ConvertedInst->getParent()->splitBasicBlock(ConvertedInst, "split");
- BasicBlock *LoopBB = BasicBlock::Create(Context, "loadstoreloop", &F, NewBB);
-
- OrigBB->getTerminator()->setSuccessor(0, LoopBB);
- IRBuilder<> Builder(OrigBB->getTerminator());
-
- // Cast pointer to the type of value getting stored
- unsigned dstAS = cast<PointerType>(DstAddr->getType())->getAddressSpace();
- DstAddr = Builder.CreateBitCast(DstAddr,
- PointerType::get(SetValue->getType(), dstAS));
-
- IRBuilder<> LoopBuilder(LoopBB);
- PHINode *LoopIndex = LoopBuilder.CreatePHI(CopyLen->getType(), 0);
- LoopIndex->addIncoming(ConstantInt::get(CopyLen->getType(), 0), OrigBB);
-
- LoopBuilder.CreateStore(
- SetValue,
- LoopBuilder.CreateInBoundsGEP(SetValue->getType(), DstAddr, LoopIndex),
- false);
-
- Value *NewIndex =
- LoopBuilder.CreateAdd(LoopIndex, ConstantInt::get(CopyLen->getType(), 1));
- LoopIndex->addIncoming(NewIndex, LoopBB);
-
- LoopBuilder.CreateCondBr(LoopBuilder.CreateICmpULT(NewIndex, CopyLen), LoopBB,
- NewBB);
-}
-
bool NVPTXLowerAggrCopies::runOnFunction(Function &F) {
SmallVector<LoadInst *, 4> AggrLoads;
SmallVector<MemIntrinsic *, 4> MemCalls;
@@ -287,13 +106,13 @@ bool NVPTXLowerAggrCopies::runOnFunction(Function &F) {
unsigned NumLoads = DL.getTypeStoreSize(LI->getType());
Value *CopyLen = ConstantInt::get(Type::getInt32Ty(Context), NumLoads);
- convertMemCpyToLoop(/* ConvertedInst */ SI,
- /* SrcAddr */ SrcAddr, /* DstAddr */ DstAddr,
- /* CopyLen */ CopyLen,
- /* SrcIsVolatile */ LI->isVolatile(),
- /* DstIsVolatile */ SI->isVolatile(),
- /* Context */ Context,
- /* Function F */ F);
+ createMemCpyLoop(/* ConvertedInst */ SI,
+ /* SrcAddr */ SrcAddr, /* DstAddr */ DstAddr,
+ /* CopyLen */ CopyLen,
+ /* SrcAlign */ LI->getAlignment(),
+ /* DestAlign */ SI->getAlignment(),
+ /* SrcIsVolatile */ LI->isVolatile(),
+ /* DstIsVolatile */ SI->isVolatile());
SI->eraseFromParent();
LI->eraseFromParent();
@@ -302,31 +121,11 @@ bool NVPTXLowerAggrCopies::runOnFunction(Function &F) {
// Transform mem* intrinsic calls.
for (MemIntrinsic *MemCall : MemCalls) {
if (MemCpyInst *Memcpy = dyn_cast<MemCpyInst>(MemCall)) {
- convertMemCpyToLoop(/* ConvertedInst */ Memcpy,
- /* SrcAddr */ Memcpy->getRawSource(),
- /* DstAddr */ Memcpy->getRawDest(),
- /* CopyLen */ Memcpy->getLength(),
- /* SrcIsVolatile */ Memcpy->isVolatile(),
- /* DstIsVolatile */ Memcpy->isVolatile(),
- /* Context */ Context,
- /* Function F */ F);
+ expandMemCpyAsLoop(Memcpy);
} else if (MemMoveInst *Memmove = dyn_cast<MemMoveInst>(MemCall)) {
- convertMemMoveToLoop(/* ConvertedInst */ Memmove,
- /* SrcAddr */ Memmove->getRawSource(),
- /* DstAddr */ Memmove->getRawDest(),
- /* CopyLen */ Memmove->getLength(),
- /* SrcIsVolatile */ Memmove->isVolatile(),
- /* DstIsVolatile */ Memmove->isVolatile(),
- /* Context */ Context,
- /* Function F */ F);
-
+ expandMemMoveAsLoop(Memmove);
} else if (MemSetInst *Memset = dyn_cast<MemSetInst>(MemCall)) {
- convertMemSetToLoop(/* ConvertedInst */ Memset,
- /* DstAddr */ Memset->getRawDest(),
- /* CopyLen */ Memset->getLength(),
- /* SetValue */ Memset->getValue(),
- /* Context */ Context,
- /* Function F */ F);
+ expandMemSetAsLoop(Memset);
}
MemCall->eraseFromParent();
}
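
For reference, the three expand* helpers called above come from the shared LowerMemIntrinsics utility that replaces the deleted target-local lowering code. A minimal sketch of the same pattern in isolation (assuming the single-argument overloads this patch calls; lowerAllMemIntrinsics is an illustrative name, not part of the patch):

#include "llvm/ADT/SmallVector.h"
#include "llvm/IR/Function.h"
#include "llvm/IR/InstIterator.h"
#include "llvm/IR/IntrinsicInst.h"
#include "llvm/Transforms/Utils/LowerMemIntrinsics.h"
using namespace llvm;

static bool lowerAllMemIntrinsics(Function &F) {
  // Collect first: expanding splits basic blocks, which would invalidate
  // a plain instruction iterator.
  SmallVector<MemIntrinsic *, 4> MemCalls;
  for (Instruction &I : instructions(F))
    if (auto *MI = dyn_cast<MemIntrinsic>(&I))
      MemCalls.push_back(MI);

  for (MemIntrinsic *MI : MemCalls) {
    if (auto *Memcpy = dyn_cast<MemCpyInst>(MI))
      expandMemCpyAsLoop(Memcpy);
    else if (auto *Memmove = dyn_cast<MemMoveInst>(MI))
      expandMemMoveAsLoop(Memmove);
    else if (auto *Memset = dyn_cast<MemSetInst>(MI))
      expandMemSetAsLoop(Memset);
    // The helpers leave the original call in place; drop it ourselves,
    // as the pass above does.
    MI->eraseFromParent();
  }
  return !MemCalls.empty();
}
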
diff --git a/lib/Target/NVPTX/NVPTXLowerArgs.cpp b/lib/Target/NVPTX/NVPTXLowerArgs.cpp
index 3f0c7be7863d..5b626cbcd5ba 100644
--- a/lib/Target/NVPTX/NVPTXLowerArgs.cpp
+++ b/lib/Target/NVPTX/NVPTXLowerArgs.cpp
@@ -159,7 +159,8 @@ void NVPTXLowerArgs::handleByValParam(Argument *Arg) {
assert(PType && "Expecting pointer type in handleByValParam");
Type *StructType = PType->getElementType();
- AllocaInst *AllocA = new AllocaInst(StructType, Arg->getName(), FirstInst);
+ unsigned AS = Func->getParent()->getDataLayout().getAllocaAddrSpace();
+ AllocaInst *AllocA = new AllocaInst(StructType, AS, Arg->getName(), FirstInst);
// Set the alignment to alignment of the byval parameter. This is because,
// later load/stores assume that alignment, and we are going to replace
// the use of the byval parameter with this alloca instruction.
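
The AllocaInst change reflects an IR API update: allocas now carry an explicit address space, taken from the DataLayout rather than implicitly defaulting to 0. A hedged sketch of the updated idiom (createLocalCopy and its arguments are illustrative):

#include "llvm/IR/DataLayout.h"
#include "llvm/IR/Function.h"
#include "llvm/IR/Instructions.h"
#include "llvm/IR/Module.h"
using namespace llvm;

static AllocaInst *createLocalCopy(Type *Ty, Function &F,
                                   Instruction *InsertBefore) {
  const DataLayout &DL = F.getParent()->getDataLayout();
  // Create the alloca in the address space the target declares for
  // allocas (0 for NVPTX; e.g. 5 on targets like AMDGPU).
  unsigned AS = DL.getAllocaAddrSpace();
  return new AllocaInst(Ty, AS, "local.copy", InsertBefore);
}
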
diff --git a/lib/Target/NVPTX/NVPTXMCExpr.cpp b/lib/Target/NVPTX/NVPTXMCExpr.cpp
index eab5ee80561e..86a28f7d0700 100644
--- a/lib/Target/NVPTX/NVPTXMCExpr.cpp
+++ b/lib/Target/NVPTX/NVPTXMCExpr.cpp
@@ -27,6 +27,13 @@ void NVPTXFloatMCExpr::printImpl(raw_ostream &OS, const MCAsmInfo *MAI) const {
switch (Kind) {
default: llvm_unreachable("Invalid kind!");
+ case VK_NVPTX_HALF_PREC_FLOAT:
+ // ptxas does not have a way to specify half-precision floats.
+ // Instead we have to print and load fp16 constants as .b16.
+ OS << "0x";
+ NumHex = 4;
+ APF.convert(APFloat::IEEEhalf(), APFloat::rmNearestTiesToEven, &Ignored);
+ break;
case VK_NVPTX_SINGLE_PREC_FLOAT:
OS << "0f";
NumHex = 8;
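
As the new comment notes, ptxas has no fp16 literal syntax, so half-precision constants are emitted as raw 16-bit hex patterns. A standalone sketch of the same conversion (the function name and choice of formatting helper are illustrative, not the patch's code):

#include "llvm/ADT/APFloat.h"
#include "llvm/Support/Format.h"
#include "llvm/Support/raw_ostream.h"
using namespace llvm;

static void printAsB16(raw_ostream &OS, APFloat APF) {
  bool Ignored;
  // Round to IEEE half the same way printImpl above does.
  APF.convert(APFloat::IEEEhalf(), APFloat::rmNearestTiesToEven, &Ignored);
  // The 16 payload bits, printed as exactly four hex digits.
  uint64_t Bits = APF.bitcastToAPInt().getZExtValue();
  OS << "0x" << format_hex_no_prefix(Bits, 4, /*Upper=*/true);
}
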
diff --git a/lib/Target/NVPTX/NVPTXMCExpr.h b/lib/Target/NVPTX/NVPTXMCExpr.h
index 7f833c42fa8f..95741d9b0451 100644
--- a/lib/Target/NVPTX/NVPTXMCExpr.h
+++ b/lib/Target/NVPTX/NVPTXMCExpr.h
@@ -22,8 +22,9 @@ class NVPTXFloatMCExpr : public MCTargetExpr {
public:
enum VariantKind {
VK_NVPTX_None,
- VK_NVPTX_SINGLE_PREC_FLOAT, // FP constant in single-precision
- VK_NVPTX_DOUBLE_PREC_FLOAT // FP constant in double-precision
+ VK_NVPTX_HALF_PREC_FLOAT, // FP constant in half-precision
+ VK_NVPTX_SINGLE_PREC_FLOAT, // FP constant in single-precision
+ VK_NVPTX_DOUBLE_PREC_FLOAT // FP constant in double-precision
};
private:
@@ -40,6 +41,11 @@ public:
static const NVPTXFloatMCExpr *create(VariantKind Kind, const APFloat &Flt,
MCContext &Ctx);
+ static const NVPTXFloatMCExpr *createConstantFPHalf(const APFloat &Flt,
+ MCContext &Ctx) {
+ return create(VK_NVPTX_HALF_PREC_FLOAT, Flt, Ctx);
+ }
+
static const NVPTXFloatMCExpr *createConstantFPSingle(const APFloat &Flt,
MCContext &Ctx) {
return create(VK_NVPTX_SINGLE_PREC_FLOAT, Flt, Ctx);
diff --git a/lib/Target/NVPTX/NVPTXPeephole.cpp b/lib/Target/NVPTX/NVPTXPeephole.cpp
index 49e639793efc..e10b046f7c97 100644
--- a/lib/Target/NVPTX/NVPTXPeephole.cpp
+++ b/lib/Target/NVPTX/NVPTXPeephole.cpp
@@ -113,7 +113,7 @@ static void CombineCVTAToLocal(MachineInstr &Root) {
BuildMI(MF, Root.getDebugLoc(), TII->get(Prev.getOpcode()),
Root.getOperand(0).getReg())
.addReg(NVPTX::VRFrameLocal)
- .addOperand(Prev.getOperand(2));
+ .add(Prev.getOperand(2));
MBB.insert((MachineBasicBlock::iterator)&Root, MIB);
diff --git a/lib/Target/NVPTX/NVPTXRegisterInfo.cpp b/lib/Target/NVPTX/NVPTXRegisterInfo.cpp
index 6cbf0604d7ef..8d46694fbe50 100644
--- a/lib/Target/NVPTX/NVPTXRegisterInfo.cpp
+++ b/lib/Target/NVPTX/NVPTXRegisterInfo.cpp
@@ -27,12 +27,19 @@ using namespace llvm;
namespace llvm {
std::string getNVPTXRegClassName(TargetRegisterClass const *RC) {
- if (RC == &NVPTX::Float32RegsRegClass) {
+ if (RC == &NVPTX::Float32RegsRegClass)
return ".f32";
- }
- if (RC == &NVPTX::Float64RegsRegClass) {
+ if (RC == &NVPTX::Float16RegsRegClass)
+ // Ideally fp16 registers should be .f16, but this syntax is only
+ // supported on sm_53+. On the other hand, .b16 registers are
+ // accepted for all supported fp16 instructions on all GPU
+ // variants, so we can use them instead.
+ return ".b16";
+ if (RC == &NVPTX::Float16x2RegsRegClass)
+ return ".b32";
+ if (RC == &NVPTX::Float64RegsRegClass)
return ".f64";
- } else if (RC == &NVPTX::Int64RegsRegClass) {
+ if (RC == &NVPTX::Int64RegsRegClass)
// We use untyped (.b) integer registers here as NVCC does.
// Correctness of generated code does not depend on register type,
// but using .s/.u registers runs into a ptxas bug that prevents
@@ -52,40 +59,37 @@ std::string getNVPTXRegClassName(TargetRegisterClass const *RC) {
// add.f16v2 rb32,rb32,rb32; // OK
// add.f16v2 rs32,rs32,rs32; // OK
return ".b64";
- } else if (RC == &NVPTX::Int32RegsRegClass) {
+ if (RC == &NVPTX::Int32RegsRegClass)
return ".b32";
- } else if (RC == &NVPTX::Int16RegsRegClass) {
+ if (RC == &NVPTX::Int16RegsRegClass)
return ".b16";
- } else if (RC == &NVPTX::Int1RegsRegClass) {
+ if (RC == &NVPTX::Int1RegsRegClass)
return ".pred";
- } else if (RC == &NVPTX::SpecialRegsRegClass) {
+ if (RC == &NVPTX::SpecialRegsRegClass)
return "!Special!";
- } else {
- return "INTERNAL";
- }
- return "";
+ return "INTERNAL";
}
std::string getNVPTXRegClassStr(TargetRegisterClass const *RC) {
- if (RC == &NVPTX::Float32RegsRegClass) {
+ if (RC == &NVPTX::Float32RegsRegClass)
return "%f";
- }
- if (RC == &NVPTX::Float64RegsRegClass) {
+ if (RC == &NVPTX::Float16RegsRegClass)
+ return "%h";
+ if (RC == &NVPTX::Float16x2RegsRegClass)
+ return "%hh";
+ if (RC == &NVPTX::Float64RegsRegClass)
return "%fd";
- } else if (RC == &NVPTX::Int64RegsRegClass) {
+ if (RC == &NVPTX::Int64RegsRegClass)
return "%rd";
- } else if (RC == &NVPTX::Int32RegsRegClass) {
+ if (RC == &NVPTX::Int32RegsRegClass)
return "%r";
- } else if (RC == &NVPTX::Int16RegsRegClass) {
+ if (RC == &NVPTX::Int16RegsRegClass)
return "%rs";
- } else if (RC == &NVPTX::Int1RegsRegClass) {
+ if (RC == &NVPTX::Int1RegsRegClass)
return "%p";
- } else if (RC == &NVPTX::SpecialRegsRegClass) {
+ if (RC == &NVPTX::SpecialRegsRegClass)
return "!Special!";
- } else {
- return "INTERNAL";
- }
- return "";
+ return "INTERNAL";
}
}
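
Roughly, the two helpers pair up at the asm printer's register-declaration site; a hypothetical caller (not in this patch, and the header name is assumed) would produce declarations like the following:

#include "NVPTXRegisterInfo.h"           // assumed: declares the helpers
#include "llvm/Support/raw_ostream.h"
using namespace llvm;

static void emitRegDecl(raw_ostream &O, const TargetRegisterClass *RC,
                        unsigned NumRegs) {
  // For Float16RegsRegClass this prints: .reg .b16 %h<5>;
  O << "\t.reg " << getNVPTXRegClassName(RC) << " "
    << getNVPTXRegClassStr(RC) << "<" << NumRegs << ">;\n";
}
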
diff --git a/lib/Target/NVPTX/NVPTXRegisterInfo.td b/lib/Target/NVPTX/NVPTXRegisterInfo.td
index ff6ccc457db7..f04764a9e9a3 100644
--- a/lib/Target/NVPTX/NVPTXRegisterInfo.td
+++ b/lib/Target/NVPTX/NVPTXRegisterInfo.td
@@ -36,6 +36,8 @@ foreach i = 0-4 in {
def RS#i : NVPTXReg<"%rs"#i>; // 16-bit
def R#i : NVPTXReg<"%r"#i>; // 32-bit
def RL#i : NVPTXReg<"%rd"#i>; // 64-bit
+ def H#i : NVPTXReg<"%h"#i>; // 16-bit float
+ def HH#i : NVPTXReg<"%hh"#i>; // 2x16-bit float
def F#i : NVPTXReg<"%f"#i>; // 32-bit float
def FL#i : NVPTXReg<"%fd"#i>; // 64-bit float
@@ -57,6 +59,8 @@ def Int1Regs : NVPTXRegClass<[i1], 8, (add (sequence "P%u", 0, 4))>;
def Int16Regs : NVPTXRegClass<[i16], 16, (add (sequence "RS%u", 0, 4))>;
def Int32Regs : NVPTXRegClass<[i32], 32, (add (sequence "R%u", 0, 4))>;
def Int64Regs : NVPTXRegClass<[i64], 64, (add (sequence "RL%u", 0, 4))>;
+def Float16Regs : NVPTXRegClass<[f16], 16, (add (sequence "H%u", 0, 4))>;
+def Float16x2Regs : NVPTXRegClass<[v2f16], 32, (add (sequence "HH%u", 0, 4))>;
def Float32Regs : NVPTXRegClass<[f32], 32, (add (sequence "F%u", 0, 4))>;
def Float64Regs : NVPTXRegClass<[f64], 64, (add (sequence "FL%u", 0, 4))>;
def Int32ArgRegs : NVPTXRegClass<[i32], 32, (add (sequence "ia%u", 0, 4))>;
diff --git a/lib/Target/NVPTX/NVPTXSection.h b/lib/Target/NVPTX/NVPTXSection.h
index b0472de980fc..d736eaa41301 100644
--- a/lib/Target/NVPTX/NVPTXSection.h
+++ b/lib/Target/NVPTX/NVPTXSection.h
@@ -31,7 +31,7 @@ public:
/// Override this, as NVPTX has its own way of printing the switch
/// to a section.
- void PrintSwitchToSection(const MCAsmInfo &MAI,
+ void PrintSwitchToSection(const MCAsmInfo &MAI, const Triple &T,
raw_ostream &OS,
const MCExpr *Subsection) const override {}
diff --git a/lib/Target/NVPTX/NVPTXSubtarget.cpp b/lib/Target/NVPTX/NVPTXSubtarget.cpp
index 6e1f427ed021..acbee86ae386 100644
--- a/lib/Target/NVPTX/NVPTXSubtarget.cpp
+++ b/lib/Target/NVPTX/NVPTXSubtarget.cpp
@@ -23,6 +23,11 @@ using namespace llvm;
#define GET_SUBTARGETINFO_CTOR
#include "NVPTXGenSubtargetInfo.inc"
+static cl::opt<bool>
+ NoF16Math("nvptx-no-f16-math", cl::ZeroOrMore, cl::Hidden,
+ cl::desc("NVPTX Specific: Disable generation of f16 math ops."),
+ cl::init(false));
+
// Pin the vtable to this file.
void NVPTXSubtarget::anchor() {}
@@ -57,3 +62,7 @@ bool NVPTXSubtarget::hasImageHandles() const {
// Disabled, otherwise
return false;
}
+
+bool NVPTXSubtarget::allowFP16Math() const {
+ return hasFP16Math() && !NoF16Math;
+}
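
allowFP16Math() lets the hidden flag veto hardware support. Illustrative only, not code from this patch: how such a predicate typically feeds legalization decisions, in the spirit of choosing Legal vs. Promote for f16 operations (helper name and include are assumptions):

#include "NVPTXSubtarget.h"               // assumed include
#include "llvm/Target/TargetLowering.h"
using namespace llvm;

// Hypothetical helper: keep f16 ops Legal only when f16 math is both
// supported (sm_53+) and not disabled via -nvptx-no-f16-math.
static TargetLoweringBase::LegalizeAction
actionForF16Op(const NVPTXSubtarget &STI) {
  return STI.allowFP16Math() ? TargetLoweringBase::Legal
                             : TargetLoweringBase::Promote;
}
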
diff --git a/lib/Target/NVPTX/NVPTXSubtarget.h b/lib/Target/NVPTX/NVPTXSubtarget.h
index da020a94bcdd..96618cf46373 100644
--- a/lib/Target/NVPTX/NVPTXSubtarget.h
+++ b/lib/Target/NVPTX/NVPTXSubtarget.h
@@ -101,6 +101,8 @@ public:
inline bool hasROT32() const { return hasHWROT32() || hasSWROT32(); }
inline bool hasROT64() const { return SmVersion >= 20; }
bool hasImageHandles() const;
+ bool hasFP16Math() const { return SmVersion >= 53; }
+ bool allowFP16Math() const;
unsigned int getSmVersion() const { return SmVersion; }
std::string getTargetName() const { return TargetName; }
diff --git a/lib/Target/NVPTX/NVPTXTargetMachine.cpp b/lib/Target/NVPTX/NVPTXTargetMachine.cpp
index eb357e0a4d50..ab5298d0dcfd 100644
--- a/lib/Target/NVPTX/NVPTXTargetMachine.cpp
+++ b/lib/Target/NVPTX/NVPTXTargetMachine.cpp
@@ -28,6 +28,7 @@
#include "llvm/Support/TargetRegistry.h"
#include "llvm/Target/TargetMachine.h"
#include "llvm/Target/TargetOptions.h"
+#include "llvm/Transforms/IPO/PassManagerBuilder.h"
#include "llvm/Transforms/Scalar.h"
#include "llvm/Transforms/Scalar/GVN.h"
#include "llvm/Transforms/Vectorize.h"
@@ -50,7 +51,6 @@ void initializeNVVMReflectPass(PassRegistry&);
void initializeGenericToNVVMPass(PassRegistry&);
void initializeNVPTXAllocaHoistingPass(PassRegistry &);
void initializeNVPTXAssignValidGlobalNamesPass(PassRegistry&);
-void initializeNVPTXInferAddressSpacesPass(PassRegistry &);
void initializeNVPTXLowerAggrCopiesPass(PassRegistry &);
void initializeNVPTXLowerArgsPass(PassRegistry &);
void initializeNVPTXLowerAllocaPass(PassRegistry &);
@@ -70,7 +70,6 @@ extern "C" void LLVMInitializeNVPTXTarget() {
initializeGenericToNVVMPass(PR);
initializeNVPTXAllocaHoistingPass(PR);
initializeNVPTXAssignValidGlobalNamesPass(PR);
- initializeNVPTXInferAddressSpacesPass(PR);
initializeNVPTXLowerArgsPass(PR);
initializeNVPTXLowerAllocaPass(PR);
initializeNVPTXLowerAggrCopiesPass(PR);
@@ -167,9 +166,13 @@ TargetPassConfig *NVPTXTargetMachine::createPassConfig(PassManagerBase &PM) {
return new NVPTXPassConfig(this, PM);
}
-void NVPTXTargetMachine::addEarlyAsPossiblePasses(PassManagerBase &PM) {
- PM.add(createNVVMReflectPass());
- PM.add(createNVVMIntrRangePass(Subtarget.getSmVersion()));
+void NVPTXTargetMachine::adjustPassManager(PassManagerBuilder &Builder) {
+ Builder.addExtension(
+ PassManagerBuilder::EP_EarlyAsPossible,
+ [&](const PassManagerBuilder &, legacy::PassManagerBase &PM) {
+ PM.add(createNVVMReflectPass());
+ PM.add(createNVVMIntrRangePass(Subtarget.getSmVersion()));
+ });
}
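
The old hook pushed passes into an already-built pass manager; the new one registers extension points on the PassManagerBuilder before the pipeline is populated. A hedged sketch of the consumer side (driver code, not part of this patch; names are illustrative):

#include "llvm/IR/LegacyPassManager.h"
#include "llvm/IR/Module.h"
#include "llvm/Target/TargetMachine.h"
#include "llvm/Transforms/IPO/PassManagerBuilder.h"
using namespace llvm;

static void runPipeline(TargetMachine &TM, Module &M) {
  PassManagerBuilder Builder;
  Builder.OptLevel = 2;
  // NVPTX's override registers NVVMReflect and NVVMIntrRange at
  // EP_EarlyAsPossible; they then run as part of the normal pipeline.
  TM.adjustPassManager(Builder);
  legacy::PassManager PM;
  Builder.populateModulePassManager(PM);
  PM.run(M);
}
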
TargetIRAnalysis NVPTXTargetMachine::getTargetIRAnalysis() {
@@ -190,7 +193,7 @@ void NVPTXPassConfig::addAddressSpaceInferencePasses() {
// be eliminated by SROA.
addPass(createSROAPass());
addPass(createNVPTXLowerAllocaPass());
- addPass(createNVPTXInferAddressSpacesPass());
+ addPass(createInferAddressSpacesPass());
}
void NVPTXPassConfig::addStraightLineScalarOptimizationPasses() {
diff --git a/lib/Target/NVPTX/NVPTXTargetMachine.h b/lib/Target/NVPTX/NVPTXTargetMachine.h
index 78a053831772..1ed8e3b1e935 100644
--- a/lib/Target/NVPTX/NVPTXTargetMachine.h
+++ b/lib/Target/NVPTX/NVPTXTargetMachine.h
@@ -61,7 +61,8 @@ public:
return TLOF.get();
}
- void addEarlyAsPossiblePasses(PassManagerBase &PM) override;
+ void adjustPassManager(PassManagerBuilder &) override;
+
TargetIRAnalysis getTargetIRAnalysis() override;
}; // NVPTXTargetMachine.
diff --git a/lib/Target/NVPTX/NVPTXTargetTransformInfo.h b/lib/Target/NVPTX/NVPTXTargetTransformInfo.h
index b6c271ae4cbc..03075b550429 100644
--- a/lib/Target/NVPTX/NVPTXTargetTransformInfo.h
+++ b/lib/Target/NVPTX/NVPTXTargetTransformInfo.h
@@ -45,6 +45,10 @@ public:
bool isSourceOfDivergence(const Value *V);
+ unsigned getFlatAddressSpace() const {
+ return AddressSpace::ADDRESS_SPACE_GENERIC;
+ }
+
// Increase the inlining cost threshold by a factor of 5, reflecting that
// calls are particularly expensive in NVPTX.
unsigned getInliningThresholdMultiplier() { return 5; }
diff --git a/lib/Target/NVPTX/NVVMReflect.cpp b/lib/Target/NVPTX/NVVMReflect.cpp
index c639c4dc0683..152b665d0fdc 100644
--- a/lib/Target/NVPTX/NVVMReflect.cpp
+++ b/lib/Target/NVPTX/NVVMReflect.cpp
@@ -10,11 +10,10 @@
// This pass replaces occurrences of __nvvm_reflect("foo") and llvm.nvvm.reflect
// with an integer.
//
-// We choose the value we use by looking, in this order, at:
-//
-// * the -nvvm-reflect-list flag, which has the format "foo=1,bar=42",
-// * the StringMap passed to the pass's constructor, and
-// * metadata in the module itself.
+// We choose the value we use by looking at metadata in the module itself. Note
+// that we intentionally only have one way to choose these values, because other
+// parts of LLVM (particularly, InstCombineCall) rely on being able to predict
+// the values chosen by this pass.
//
// If we see an unknown string, we replace its call with 0.
//
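
Concretely, once the pass has settled on a ReflectVal for a call site, the rewrite itself is a constant fold. A simplified sketch of that final step (the real pass also extracts the string argument and handles both the named function and the intrinsic):

#include "llvm/IR/Constants.h"
#include "llvm/IR/Instructions.h"
using namespace llvm;

static void replaceReflectCall(CallInst *Call, int ReflectVal) {
  // __nvvm_reflect returns an integer, so every use of the call can be
  // rewritten to the chosen constant and the call removed.
  Call->replaceAllUsesWith(ConstantInt::get(Call->getType(), ReflectVal));
  Call->eraseFromParent();
}
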
@@ -49,30 +48,17 @@ namespace llvm { void initializeNVVMReflectPass(PassRegistry &); }
namespace {
class NVVMReflect : public FunctionPass {
-private:
- StringMap<int> VarMap;
-
public:
static char ID;
- NVVMReflect() : NVVMReflect(StringMap<int>()) {}
-
- NVVMReflect(const StringMap<int> &Mapping)
- : FunctionPass(ID), VarMap(Mapping) {
+ NVVMReflect() : FunctionPass(ID) {
initializeNVVMReflectPass(*PassRegistry::getPassRegistry());
- setVarMap();
}
bool runOnFunction(Function &) override;
-
-private:
- void setVarMap();
};
}
FunctionPass *llvm::createNVVMReflectPass() { return new NVVMReflect(); }
-FunctionPass *llvm::createNVVMReflectPass(const StringMap<int> &Mapping) {
- return new NVVMReflect(Mapping);
-}
static cl::opt<bool>
NVVMReflectEnabled("nvvm-reflect-enable", cl::init(true), cl::Hidden,
@@ -83,35 +69,6 @@ INITIALIZE_PASS(NVVMReflect, "nvvm-reflect",
"Replace occurrences of __nvvm_reflect() calls with 0/1", false,
false)
-static cl::list<std::string>
-ReflectList("nvvm-reflect-list", cl::value_desc("name=<int>"), cl::Hidden,
- cl::desc("A list of string=num assignments"),
- cl::ValueRequired);
-
-/// The command line can look as follows:
-/// -nvvm-reflect-list a=1,b=2 -nvvm-reflect-list c=3,d=0 -R e=2
-/// The strings "a=1,b=2", "c=3,d=0", "e=2" are available in the
-/// ReflectList vector. First, each of ReflectList[i] is 'split'
-/// using "," as the delimiter. Then each part is split
-/// using "=" as the delimiter.
-void NVVMReflect::setVarMap() {
- for (unsigned i = 0, e = ReflectList.size(); i != e; ++i) {
- DEBUG(dbgs() << "Option : " << ReflectList[i] << "\n");
- SmallVector<StringRef, 4> NameValList;
- StringRef(ReflectList[i]).split(NameValList, ',');
- for (unsigned j = 0, ej = NameValList.size(); j != ej; ++j) {
- SmallVector<StringRef, 2> NameValPair;
- NameValList[j].split(NameValPair, '=');
- assert(NameValPair.size() == 2 && "name=val expected");
- std::stringstream ValStream(NameValPair[1]);
- int Val;
- ValStream >> Val;
- assert((!(ValStream.fail())) && "integer value expected");
- VarMap[NameValPair[0]] = Val;
- }
- }
-}
-
bool NVVMReflect::runOnFunction(Function &F) {
if (!NVVMReflectEnabled)
return false;
@@ -199,11 +156,10 @@ bool NVVMReflect::runOnFunction(Function &F) {
DEBUG(dbgs() << "Arg of _reflect : " << ReflectArg << "\n");
int ReflectVal = 0; // The default value is 0
- auto Iter = VarMap.find(ReflectArg);
- if (Iter != VarMap.end())
- ReflectVal = Iter->second;
- else if (ReflectArg == "__CUDA_FTZ") {
- // Try to pull __CUDA_FTZ from the nvvm-reflect-ftz module flag.
+ if (ReflectArg == "__CUDA_FTZ") {
+ // Try to pull __CUDA_FTZ from the nvvm-reflect-ftz module flag. Our
+ // choice here must be kept in sync with AutoUpgrade, which uses the same
+ // technique to detect whether ftz is enabled.
if (auto *Flag = mdconst::extract_or_null<ConstantInt>(
F.getParent()->getModuleFlag("nvvm-reflect-ftz")))
ReflectVal = Flag->getSExtValue();
diff --git a/lib/Target/PowerPC/CMakeLists.txt b/lib/Target/PowerPC/CMakeLists.txt
index 4842c3b7a656..7ca4c1999003 100644
--- a/lib/Target/PowerPC/CMakeLists.txt
+++ b/lib/Target/PowerPC/CMakeLists.txt
@@ -40,6 +40,7 @@ add_llvm_target(PowerPCCodeGen
PPCVSXCopy.cpp
PPCVSXFMAMutate.cpp
PPCVSXSwapRemoval.cpp
+ PPCExpandISEL.cpp
)
add_subdirectory(AsmParser)
diff --git a/lib/Target/PowerPC/MCTargetDesc/PPCAsmBackend.cpp b/lib/Target/PowerPC/MCTargetDesc/PPCAsmBackend.cpp
index 5847b3a52bfc..4863ac542736 100644
--- a/lib/Target/PowerPC/MCTargetDesc/PPCAsmBackend.cpp
+++ b/lib/Target/PowerPC/MCTargetDesc/PPCAsmBackend.cpp
@@ -114,7 +114,7 @@ public:
}
void applyFixup(const MCFixup &Fixup, char *Data, unsigned DataSize,
- uint64_t Value, bool IsPCRel) const override {
+ uint64_t Value, bool IsPCRel, MCContext &Ctx) const override {
Value = adjustFixupValue(Fixup.getKind(), Value);
if (!Value) return; // Doesn't change encoding.
diff --git a/lib/Target/PowerPC/MCTargetDesc/PPCMCCodeEmitter.cpp b/lib/Target/PowerPC/MCTargetDesc/PPCMCCodeEmitter.cpp
index 017d21af08a8..a00b56af0490 100644
--- a/lib/Target/PowerPC/MCTargetDesc/PPCMCCodeEmitter.cpp
+++ b/lib/Target/PowerPC/MCTargetDesc/PPCMCCodeEmitter.cpp
@@ -11,22 +11,28 @@
//
//===----------------------------------------------------------------------===//
-#include "PPCInstrInfo.h"
-#include "MCTargetDesc/PPCMCTargetDesc.h"
#include "MCTargetDesc/PPCFixupKinds.h"
+#include "PPCInstrInfo.h"
+#include "llvm/ADT/SmallVector.h"
#include "llvm/ADT/Statistic.h"
+#include "llvm/ADT/Triple.h"
#include "llvm/MC/MCAsmInfo.h"
#include "llvm/MC/MCCodeEmitter.h"
#include "llvm/MC/MCContext.h"
-#include "llvm/MC/MCExpr.h"
+#include "llvm/MC/MCFixup.h"
#include "llvm/MC/MCInst.h"
+#include "llvm/MC/MCInstrDesc.h"
#include "llvm/MC/MCInstrInfo.h"
#include "llvm/MC/MCRegisterInfo.h"
#include "llvm/MC/MCSubtargetInfo.h"
+#include "llvm/Support/Endian.h"
#include "llvm/Support/EndianStream.h"
#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/MathExtras.h"
#include "llvm/Support/raw_ostream.h"
-#include "llvm/Target/TargetOpcodes.h"
+#include <cassert>
+#include <cstdint>
+
using namespace llvm;
#define DEBUG_TYPE "mccodeemitter"
@@ -34,10 +40,8 @@ using namespace llvm;
STATISTIC(MCNumEmitted, "Number of MC instructions emitted");
namespace {
-class PPCMCCodeEmitter : public MCCodeEmitter {
- PPCMCCodeEmitter(const PPCMCCodeEmitter &) = delete;
- void operator=(const PPCMCCodeEmitter &) = delete;
+class PPCMCCodeEmitter : public MCCodeEmitter {
const MCInstrInfo &MCII;
const MCContext &CTX;
bool IsLittleEndian;
@@ -46,8 +50,9 @@ public:
PPCMCCodeEmitter(const MCInstrInfo &mcii, MCContext &ctx)
: MCII(mcii), CTX(ctx),
IsLittleEndian(ctx.getAsmInfo()->isLittleEndian()) {}
-
- ~PPCMCCodeEmitter() override {}
+ PPCMCCodeEmitter(const PPCMCCodeEmitter &) = delete;
+ void operator=(const PPCMCCodeEmitter &) = delete;
+ ~PPCMCCodeEmitter() override = default;
unsigned getDirectBrEncoding(const MCInst &MI, unsigned OpNo,
SmallVectorImpl<MCFixup> &Fixups,
@@ -103,6 +108,7 @@ public:
uint64_t getBinaryCodeForInstr(const MCInst &MI,
SmallVectorImpl<MCFixup> &Fixups,
const MCSubtargetInfo &STI) const;
+
void encodeInstruction(const MCInst &MI, raw_ostream &OS,
SmallVectorImpl<MCFixup> &Fixups,
const MCSubtargetInfo &STI) const override {
@@ -137,7 +143,7 @@ public:
}
break;
default:
- llvm_unreachable ("Invalid instruction size");
+ llvm_unreachable("Invalid instruction size");
}
++MCNumEmitted; // Keep track of the # of mi's emitted.
@@ -238,7 +244,6 @@ unsigned PPCMCCodeEmitter::getMemRIEncoding(const MCInst &MI, unsigned OpNo,
return RegBits;
}
-
unsigned PPCMCCodeEmitter::getMemRIXEncoding(const MCInst &MI, unsigned OpNo,
SmallVectorImpl<MCFixup> &Fixups,
const MCSubtargetInfo &STI) const {
@@ -286,7 +291,6 @@ unsigned PPCMCCodeEmitter::getSPE8DisEncoding(const MCInst &MI, unsigned OpNo,
return reverseBits(Imm | RegBits) >> 22;
}
-
unsigned PPCMCCodeEmitter::getSPE4DisEncoding(const MCInst &MI, unsigned OpNo,
SmallVectorImpl<MCFixup> &Fixups,
const MCSubtargetInfo &STI)
@@ -302,7 +306,6 @@ unsigned PPCMCCodeEmitter::getSPE4DisEncoding(const MCInst &MI, unsigned OpNo,
return reverseBits(Imm | RegBits) >> 22;
}
-
unsigned PPCMCCodeEmitter::getSPE2DisEncoding(const MCInst &MI, unsigned OpNo,
SmallVectorImpl<MCFixup> &Fixups,
const MCSubtargetInfo &STI)
@@ -318,7 +321,6 @@ unsigned PPCMCCodeEmitter::getSPE2DisEncoding(const MCInst &MI, unsigned OpNo,
return reverseBits(Imm | RegBits) >> 22;
}
-
unsigned PPCMCCodeEmitter::getTLSRegEncoding(const MCInst &MI, unsigned OpNo,
SmallVectorImpl<MCFixup> &Fixups,
const MCSubtargetInfo &STI) const {
@@ -383,7 +385,5 @@ getMachineOpValue(const MCInst &MI, const MCOperand &MO,
return MO.getImm();
}
-
-
#define ENABLE_INSTR_PREDICATE_VERIFIER
#include "PPCGenMCCodeEmitter.inc"
diff --git a/lib/Target/PowerPC/MCTargetDesc/PPCMCTargetDesc.cpp b/lib/Target/PowerPC/MCTargetDesc/PPCMCTargetDesc.cpp
index bbd10e5b260f..2d686f227919 100644
--- a/lib/Target/PowerPC/MCTargetDesc/PPCMCTargetDesc.cpp
+++ b/lib/Target/PowerPC/MCTargetDesc/PPCMCTargetDesc.cpp
@@ -11,22 +11,29 @@
//
//===----------------------------------------------------------------------===//
-#include "PPCMCTargetDesc.h"
#include "InstPrinter/PPCInstPrinter.h"
-#include "PPCMCAsmInfo.h"
+#include "MCTargetDesc/PPCMCAsmInfo.h"
+#include "MCTargetDesc/PPCMCTargetDesc.h"
#include "PPCTargetStreamer.h"
+#include "llvm/ADT/StringRef.h"
+#include "llvm/ADT/Triple.h"
+#include "llvm/MC/MCAssembler.h"
#include "llvm/MC/MCContext.h"
+#include "llvm/MC/MCDwarf.h"
#include "llvm/MC/MCELFStreamer.h"
#include "llvm/MC/MCExpr.h"
#include "llvm/MC/MCInstrInfo.h"
#include "llvm/MC/MCRegisterInfo.h"
#include "llvm/MC/MCStreamer.h"
#include "llvm/MC/MCSubtargetInfo.h"
+#include "llvm/MC/MCSymbol.h"
#include "llvm/MC/MCSymbolELF.h"
-#include "llvm/MC/MachineLocation.h"
+#include "llvm/Support/Casting.h"
+#include "llvm/Support/CodeGen.h"
#include "llvm/Support/ELF.h"
#include "llvm/Support/ErrorHandling.h"
#include "llvm/Support/FormattedStream.h"
+#include "llvm/Support/raw_ostream.h"
#include "llvm/Support/TargetRegistry.h"
using namespace llvm;
@@ -41,9 +48,10 @@ using namespace llvm;
#include "PPCGenRegisterInfo.inc"
// Pin the vtable to this file.
-PPCTargetStreamer::~PPCTargetStreamer() {}
PPCTargetStreamer::PPCTargetStreamer(MCStreamer &S) : MCTargetStreamer(S) {}
+PPCTargetStreamer::~PPCTargetStreamer() = default;
+
static MCInstrInfo *createPPCMCInstrInfo() {
MCInstrInfo *X = new MCInstrInfo();
InitPPCMCInstrInfo(X);
@@ -96,12 +104,14 @@ static void adjustCodeGenOpts(const Triple &TT, Reloc::Model RM,
}
namespace {
+
class PPCTargetAsmStreamer : public PPCTargetStreamer {
formatted_raw_ostream &OS;
public:
PPCTargetAsmStreamer(MCStreamer &S, formatted_raw_ostream &OS)
: PPCTargetStreamer(S), OS(OS) {}
+
void emitTCEntry(const MCSymbol &S) override {
OS << "\t.tc ";
OS << S.getName();
@@ -109,12 +119,15 @@ public:
OS << S.getName();
OS << '\n';
}
+
void emitMachine(StringRef CPU) override {
OS << "\t.machine " << CPU << '\n';
}
+
void emitAbiVersion(int AbiVersion) override {
OS << "\t.abiversion " << AbiVersion << '\n';
}
+
void emitLocalEntry(MCSymbolELF *S, const MCExpr *LocalOffset) override {
const MCAsmInfo *MAI = Streamer.getContext().getAsmInfo();
@@ -129,18 +142,22 @@ public:
class PPCTargetELFStreamer : public PPCTargetStreamer {
public:
PPCTargetELFStreamer(MCStreamer &S) : PPCTargetStreamer(S) {}
+
MCELFStreamer &getStreamer() {
return static_cast<MCELFStreamer &>(Streamer);
}
+
void emitTCEntry(const MCSymbol &S) override {
// Creates a R_PPC64_TOC relocation
Streamer.EmitValueToAlignment(8);
Streamer.EmitSymbolValue(&S, 8);
}
+
void emitMachine(StringRef CPU) override {
// FIXME: Is there anything to do in here or does this directive only
// limit the parser?
}
+
void emitAbiVersion(int AbiVersion) override {
MCAssembler &MCA = getStreamer().getAssembler();
unsigned Flags = MCA.getELFHeaderEFlags();
@@ -148,6 +165,7 @@ public:
Flags |= (AbiVersion & ELF::EF_PPC64_ABI);
MCA.setELFHeaderEFlags(Flags);
}
+
void emitLocalEntry(MCSymbolELF *S, const MCExpr *LocalOffset) override {
MCAssembler &MCA = getStreamer().getAssembler();
@@ -170,6 +188,7 @@ public:
if ((Flags & ELF::EF_PPC64_ABI) == 0)
MCA.setELFHeaderEFlags(Flags | 2);
}
+
void emitAssignment(MCSymbol *S, const MCExpr *Value) override {
auto *Symbol = cast<MCSymbolELF>(S);
// When encoding an assignment to set symbol A to symbol B, also copy
@@ -188,21 +207,26 @@ public:
class PPCTargetMachOStreamer : public PPCTargetStreamer {
public:
PPCTargetMachOStreamer(MCStreamer &S) : PPCTargetStreamer(S) {}
+
void emitTCEntry(const MCSymbol &S) override {
llvm_unreachable("Unknown pseudo-op: .tc");
}
+
void emitMachine(StringRef CPU) override {
// FIXME: We should update the CPUType, CPUSubType in the Object file if
// the new values are different from the defaults.
}
+
void emitAbiVersion(int AbiVersion) override {
llvm_unreachable("Unknown pseudo-op: .abiversion");
}
+
void emitLocalEntry(MCSymbolELF *S, const MCExpr *LocalOffset) override {
llvm_unreachable("Unknown pseudo-op: .localentry");
}
};
-}
+
+} // end anonymous namespace
static MCTargetStreamer *createAsmTargetStreamer(MCStreamer &S,
formatted_raw_ostream &OS,
diff --git a/lib/Target/PowerPC/MCTargetDesc/PPCMCTargetDesc.h b/lib/Target/PowerPC/MCTargetDesc/PPCMCTargetDesc.h
index 0989e0c8e268..893233ee2300 100644
--- a/lib/Target/PowerPC/MCTargetDesc/PPCMCTargetDesc.h
+++ b/lib/Target/PowerPC/MCTargetDesc/PPCMCTargetDesc.h
@@ -17,23 +17,22 @@
// GCC #defines PPC on Linux but we use it as our namespace name
#undef PPC
-#include "llvm/Support/DataTypes.h"
#include "llvm/Support/MathExtras.h"
+#include <cstdint>
namespace llvm {
+
class MCAsmBackend;
class MCCodeEmitter;
class MCContext;
class MCInstrInfo;
class MCObjectWriter;
class MCRegisterInfo;
-class MCSubtargetInfo;
class MCTargetOptions;
class Target;
class Triple;
class StringRef;
class raw_pwrite_stream;
-class raw_ostream;
Target &getThePPC32Target();
Target &getThePPC64Target();
@@ -83,7 +82,7 @@ static inline bool isRunOfOnes(unsigned Val, unsigned &MB, unsigned &ME) {
return false;
}
-} // End llvm namespace
+} // end namespace llvm
// Generated files will use "namespace PPC". To avoid symbol clash,
// undefine PPC here. PPC may be predefined on some hosts.
@@ -103,4 +102,4 @@ static inline bool isRunOfOnes(unsigned Val, unsigned &MB, unsigned &ME) {
#define GET_SUBTARGETINFO_ENUM
#include "PPCGenSubtargetInfo.inc"
-#endif
+#endif // LLVM_LIB_TARGET_POWERPC_MCTARGETDESC_PPCMCTARGETDESC_H
diff --git a/lib/Target/PowerPC/PPC.h b/lib/Target/PowerPC/PPC.h
index e01f49dce81e..38ae62b26757 100644
--- a/lib/Target/PowerPC/PPC.h
+++ b/lib/Target/PowerPC/PPC.h
@@ -45,11 +45,13 @@ namespace llvm {
FunctionPass *createPPCISelDag(PPCTargetMachine &TM);
FunctionPass *createPPCTLSDynamicCallPass();
FunctionPass *createPPCBoolRetToIntPass();
+ FunctionPass *createPPCExpandISELPass();
void LowerPPCMachineInstrToMCInst(const MachineInstr *MI, MCInst &OutMI,
AsmPrinter &AP, bool isDarwin);
void initializePPCVSXFMAMutatePass(PassRegistry&);
void initializePPCBoolRetToIntPass(PassRegistry&);
+ void initializePPCExpandISELPass(PassRegistry &);
extern char &PPCVSXFMAMutateID;
namespace PPCII {
diff --git a/lib/Target/PowerPC/PPCAsmPrinter.cpp b/lib/Target/PowerPC/PPCAsmPrinter.cpp
index f0e0ebc4946c..1f181d007f63 100644
--- a/lib/Target/PowerPC/PPCAsmPrinter.cpp
+++ b/lib/Target/PowerPC/PPCAsmPrinter.cpp
@@ -112,7 +112,9 @@ public:
void EmitTlsCall(const MachineInstr *MI, MCSymbolRefExpr::VariantKind VK);
bool runOnMachineFunction(MachineFunction &MF) override {
Subtarget = &MF.getSubtarget<PPCSubtarget>();
- return AsmPrinter::runOnMachineFunction(MF);
+ bool Changed = AsmPrinter::runOnMachineFunction(MF);
+ emitXRayTable();
+ return Changed;
}
};
@@ -134,6 +136,7 @@ public:
void EmitFunctionBodyStart() override;
void EmitFunctionBodyEnd() override;
+ void EmitInstruction(const MachineInstr *MI) override;
};
/// PPCDarwinAsmPrinter - PowerPC assembly printer, customized for Darwin/Mac
@@ -402,7 +405,7 @@ void PPCAsmPrinter::LowerPATCHPOINT(StackMaps &SM, const MachineInstr &MI) {
.addImm(CallTarget & 0xFFFF));
// Save the current TOC pointer before the remote call.
- int TOCSaveOffset = Subtarget->isELFv2ABI() ? 24 : 40;
+ int TOCSaveOffset = Subtarget->getFrameLowering()->getTOCSaveOffset();
EmitToStreamer(*OutStreamer, MCInstBuilder(PPC::STD)
.addReg(PPC::X2)
.addImm(TOCSaveOffset)
@@ -1046,6 +1049,97 @@ void PPCAsmPrinter::EmitInstruction(const MachineInstr *MI) {
EmitToStreamer(*OutStreamer, TmpInst);
}
+void PPCLinuxAsmPrinter::EmitInstruction(const MachineInstr *MI) {
+ if (!Subtarget->isPPC64())
+ return PPCAsmPrinter::EmitInstruction(MI);
+
+ switch (MI->getOpcode()) {
+ default:
+ return PPCAsmPrinter::EmitInstruction(MI);
+ case TargetOpcode::PATCHABLE_FUNCTION_ENTER: {
+ // .begin:
+ // b .end # lis 0, FuncId[16..31]
+ // nop # li 0, FuncId[0..15]
+ // std 0, -8(1)
+ // mflr 0
+ // bl __xray_FunctionEntry
+ // mtlr 0
+ // .end:
+ //
+ // Update compiler-rt/lib/xray/xray_powerpc64.cc accordingly when the
+ // number of instructions changes.
+ MCSymbol *BeginOfSled = OutContext.createTempSymbol();
+ MCSymbol *EndOfSled = OutContext.createTempSymbol();
+ OutStreamer->EmitLabel(BeginOfSled);
+ EmitToStreamer(*OutStreamer,
+ MCInstBuilder(PPC::B).addExpr(
+ MCSymbolRefExpr::create(EndOfSled, OutContext)));
+ EmitToStreamer(*OutStreamer, MCInstBuilder(PPC::NOP));
+ EmitToStreamer(
+ *OutStreamer,
+ MCInstBuilder(PPC::STD).addReg(PPC::X0).addImm(-8).addReg(PPC::X1));
+ EmitToStreamer(*OutStreamer, MCInstBuilder(PPC::MFLR8).addReg(PPC::X0));
+ EmitToStreamer(*OutStreamer,
+ MCInstBuilder(PPC::BL8_NOP)
+ .addExpr(MCSymbolRefExpr::create(
+ OutContext.getOrCreateSymbol("__xray_FunctionEntry"),
+ OutContext)));
+ EmitToStreamer(*OutStreamer, MCInstBuilder(PPC::MTLR8).addReg(PPC::X0));
+ OutStreamer->EmitLabel(EndOfSled);
+ recordSled(BeginOfSled, *MI, SledKind::FUNCTION_ENTER);
+ break;
+ }
+ case TargetOpcode::PATCHABLE_FUNCTION_EXIT: {
+ // .p2align 3
+ // .begin:
+ // b(lr)? # lis 0, FuncId[16..31]
+ // nop # li 0, FuncId[0..15]
+ // std 0, -8(1)
+ // mflr 0
+ // bl __xray_FunctionExit
+ // mtlr 0
+ // .end:
+ // b(lr)?
+ //
+ // Update compiler-rt/lib/xray/xray_powerpc64.cc accordingly when the
+ // number of instructions changes.
+ const MachineInstr *Next = [&] {
+ MachineBasicBlock::const_iterator It(MI);
+ assert(It != MI->getParent()->end());
+ ++It;
+ assert(It->isReturn());
+ return &*It;
+ }();
+ OutStreamer->EmitCodeAlignment(8);
+ MCSymbol *BeginOfSled = OutContext.createTempSymbol();
+ OutStreamer->EmitLabel(BeginOfSled);
+ MCInst TmpInst;
+ LowerPPCMachineInstrToMCInst(Next, TmpInst, *this, false);
+ EmitToStreamer(*OutStreamer, TmpInst);
+ EmitToStreamer(*OutStreamer, MCInstBuilder(PPC::NOP));
+ EmitToStreamer(
+ *OutStreamer,
+ MCInstBuilder(PPC::STD).addReg(PPC::X0).addImm(-8).addReg(PPC::X1));
+ EmitToStreamer(*OutStreamer, MCInstBuilder(PPC::MFLR8).addReg(PPC::X0));
+ EmitToStreamer(*OutStreamer,
+ MCInstBuilder(PPC::BL8_NOP)
+ .addExpr(MCSymbolRefExpr::create(
+ OutContext.getOrCreateSymbol("__xray_FunctionExit"),
+ OutContext)));
+ EmitToStreamer(*OutStreamer, MCInstBuilder(PPC::MTLR8).addReg(PPC::X0));
+ recordSled(BeginOfSled, *MI, SledKind::FUNCTION_EXIT);
+ break;
+ }
+ case TargetOpcode::PATCHABLE_TAIL_CALL:
+ case TargetOpcode::PATCHABLE_RET:
+ // PPC's tail call instruction, e.g. PPC::TCRETURNdi8, doesn't really
+ // lower to a PPC::B instruction. The PPC::B instruction is generated
+ // before it, and handled by the normal case.
+ llvm_unreachable("Tail call is handled in the normal case. See comments"
+ "around this assert.");
+ }
+}
+
void PPCLinuxAsmPrinter::EmitStartOfAsmFile(Module &M) {
if (static_cast<const PPCTargetMachine &>(TM).isELFv2ABI()) {
PPCTargetStreamer *TS =
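The entry/exit sleds emitted above reserve two patchable instructions (the leading branch and the nop) that the XRay runtime later rewrites to load the 32-bit function id into r0, per the lis/li comments in the sled. A hedged sketch of that id split, with illustrative names rather than the compiler-rt API:

#include <cstdint>

struct FuncIdHalves {
  uint16_t Hi; // replaces the "b .end" slot as "lis 0, Hi"
  uint16_t Lo; // replaces the "nop" slot as "li 0, Lo"
};

static FuncIdHalves splitXRayFuncId(uint32_t FuncId) {
  return {static_cast<uint16_t>(FuncId >> 16),
          static_cast<uint16_t>(FuncId & 0xFFFF)};
}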
diff --git a/lib/Target/PowerPC/PPCBranchSelector.cpp b/lib/Target/PowerPC/PPCBranchSelector.cpp
index ae76386fdfb6..b7d3154d0000 100644
--- a/lib/Target/PowerPC/PPCBranchSelector.cpp
+++ b/lib/Target/PowerPC/PPCBranchSelector.cpp
@@ -78,7 +78,7 @@ bool PPCBSel::runOnMachineFunction(MachineFunction &Fn) {
BlockSizes.resize(Fn.getNumBlockIDs());
auto GetAlignmentAdjustment =
- [TII](MachineBasicBlock &MBB, unsigned Offset) -> unsigned {
+ [](MachineBasicBlock &MBB, unsigned Offset) -> unsigned {
unsigned Align = MBB.getAlignment();
if (!Align)
return 0;
diff --git a/lib/Target/PowerPC/PPCCTRLoops.cpp b/lib/Target/PowerPC/PPCCTRLoops.cpp
index 2c62a0f1d909..70c4170653ae 100644
--- a/lib/Target/PowerPC/PPCCTRLoops.cpp
+++ b/lib/Target/PowerPC/PPCCTRLoops.cpp
@@ -298,15 +298,17 @@ bool PPCCTRLoops::mightUseCTR(const Triple &TT, BasicBlock *BB) {
return true;
else
continue; // ISD::FCOPYSIGN is never a library call.
- case Intrinsic::sqrt: Opcode = ISD::FSQRT; break;
- case Intrinsic::floor: Opcode = ISD::FFLOOR; break;
- case Intrinsic::ceil: Opcode = ISD::FCEIL; break;
- case Intrinsic::trunc: Opcode = ISD::FTRUNC; break;
- case Intrinsic::rint: Opcode = ISD::FRINT; break;
- case Intrinsic::nearbyint: Opcode = ISD::FNEARBYINT; break;
- case Intrinsic::round: Opcode = ISD::FROUND; break;
- case Intrinsic::minnum: Opcode = ISD::FMINNUM; break;
- case Intrinsic::maxnum: Opcode = ISD::FMAXNUM; break;
+ case Intrinsic::sqrt: Opcode = ISD::FSQRT; break;
+ case Intrinsic::floor: Opcode = ISD::FFLOOR; break;
+ case Intrinsic::ceil: Opcode = ISD::FCEIL; break;
+ case Intrinsic::trunc: Opcode = ISD::FTRUNC; break;
+ case Intrinsic::rint: Opcode = ISD::FRINT; break;
+ case Intrinsic::nearbyint: Opcode = ISD::FNEARBYINT; break;
+ case Intrinsic::round: Opcode = ISD::FROUND; break;
+ case Intrinsic::minnum: Opcode = ISD::FMINNUM; break;
+ case Intrinsic::maxnum: Opcode = ISD::FMAXNUM; break;
+ case Intrinsic::umul_with_overflow: Opcode = ISD::UMULO; break;
+ case Intrinsic::smul_with_overflow: Opcode = ISD::SMULO; break;
}
}
@@ -315,7 +317,7 @@ bool PPCCTRLoops::mightUseCTR(const Triple &TT, BasicBlock *BB) {
// (i.e. soft float or atomics). If adapting for targets that do,
// additional care is required here.
- LibFunc::Func Func;
+ LibFunc Func;
if (!F->hasLocalLinkage() && F->hasName() && LibInfo &&
LibInfo->getLibFunc(F->getName(), Func) &&
LibInfo->hasOptimizedCodeGen(Func)) {
@@ -329,50 +331,50 @@ bool PPCCTRLoops::mightUseCTR(const Triple &TT, BasicBlock *BB) {
switch (Func) {
default: return true;
- case LibFunc::copysign:
- case LibFunc::copysignf:
+ case LibFunc_copysign:
+ case LibFunc_copysignf:
continue; // ISD::FCOPYSIGN is never a library call.
- case LibFunc::copysignl:
+ case LibFunc_copysignl:
return true;
- case LibFunc::fabs:
- case LibFunc::fabsf:
- case LibFunc::fabsl:
+ case LibFunc_fabs:
+ case LibFunc_fabsf:
+ case LibFunc_fabsl:
continue; // ISD::FABS is never a library call.
- case LibFunc::sqrt:
- case LibFunc::sqrtf:
- case LibFunc::sqrtl:
+ case LibFunc_sqrt:
+ case LibFunc_sqrtf:
+ case LibFunc_sqrtl:
Opcode = ISD::FSQRT; break;
- case LibFunc::floor:
- case LibFunc::floorf:
- case LibFunc::floorl:
+ case LibFunc_floor:
+ case LibFunc_floorf:
+ case LibFunc_floorl:
Opcode = ISD::FFLOOR; break;
- case LibFunc::nearbyint:
- case LibFunc::nearbyintf:
- case LibFunc::nearbyintl:
+ case LibFunc_nearbyint:
+ case LibFunc_nearbyintf:
+ case LibFunc_nearbyintl:
Opcode = ISD::FNEARBYINT; break;
- case LibFunc::ceil:
- case LibFunc::ceilf:
- case LibFunc::ceill:
+ case LibFunc_ceil:
+ case LibFunc_ceilf:
+ case LibFunc_ceill:
Opcode = ISD::FCEIL; break;
- case LibFunc::rint:
- case LibFunc::rintf:
- case LibFunc::rintl:
+ case LibFunc_rint:
+ case LibFunc_rintf:
+ case LibFunc_rintl:
Opcode = ISD::FRINT; break;
- case LibFunc::round:
- case LibFunc::roundf:
- case LibFunc::roundl:
+ case LibFunc_round:
+ case LibFunc_roundf:
+ case LibFunc_roundl:
Opcode = ISD::FROUND; break;
- case LibFunc::trunc:
- case LibFunc::truncf:
- case LibFunc::truncl:
+ case LibFunc_trunc:
+ case LibFunc_truncf:
+ case LibFunc_truncl:
Opcode = ISD::FTRUNC; break;
- case LibFunc::fmin:
- case LibFunc::fminf:
- case LibFunc::fminl:
+ case LibFunc_fmin:
+ case LibFunc_fminf:
+ case LibFunc_fminl:
Opcode = ISD::FMINNUM; break;
- case LibFunc::fmax:
- case LibFunc::fmaxf:
- case LibFunc::fmaxl:
+ case LibFunc_fmax:
+ case LibFunc_fmaxf:
+ case LibFunc_fmaxl:
Opcode = ISD::FMAXNUM; break;
}
}
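The mechanical change in this hunk is the rename of the scoped LibFunc::Func enum to a plain LibFunc enum (LibFunc_sqrt and friends). A small usage sketch of the query pattern the pass relies on, assuming the post-rename TargetLibraryInfo API:

#include "llvm/Analysis/TargetLibraryInfo.h"
#include "llvm/IR/Function.h"
using namespace llvm;

// Returns true if F is a recognized sqrt-family libcall with optimized
// codegen, i.e. one the loop can treat as ISD::FSQRT rather than a call.
static bool lowersToFSQRT(const Function &F, const TargetLibraryInfo &TLI) {
  LibFunc Func;
  return F.hasName() && TLI.getLibFunc(F.getName(), Func) &&
         TLI.hasOptimizedCodeGen(Func) &&
         (Func == LibFunc_sqrt || Func == LibFunc_sqrtf ||
          Func == LibFunc_sqrtl);
}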
diff --git a/lib/Target/PowerPC/PPCExpandISEL.cpp b/lib/Target/PowerPC/PPCExpandISEL.cpp
new file mode 100644
index 000000000000..ebd414baf1d2
--- /dev/null
+++ b/lib/Target/PowerPC/PPCExpandISEL.cpp
@@ -0,0 +1,458 @@
+//===------------- PPCExpandISEL.cpp - Expand ISEL instruction ------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// A pass that expands the ISEL instruction into an if-then-else sequence.
+// This pass must be run post-RA since all operands must be physical registers.
+//
+//===----------------------------------------------------------------------===//
+
+#include "PPC.h"
+#include "PPCInstrInfo.h"
+#include "PPCSubtarget.h"
+#include "llvm/ADT/DenseMap.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/CodeGen/LivePhysRegs.h"
+#include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/raw_ostream.h"
+
+using namespace llvm;
+
+#define DEBUG_TYPE "ppc-expand-isel"
+
+STATISTIC(NumExpanded, "Number of ISEL instructions expanded");
+STATISTIC(NumRemoved, "Number of ISEL instructions removed");
+STATISTIC(NumFolded, "Number of ISEL instructions folded");
+
+// If -ppc-gen-isel=false is set, generating the ISEL instruction is
+// disabled on all PPC targets. Otherwise, if the user passed -misel or
+// the platform supports ISEL by default, the ISEL instruction is still
+// generated; otherwise it is expanded.
+static cl::opt<bool>
+ GenerateISEL("ppc-gen-isel",
+ cl::desc("Enable generating the ISEL instruction."),
+ cl::init(true), cl::Hidden);
+
+namespace {
+class PPCExpandISEL : public MachineFunctionPass {
+ DebugLoc dl;
+ MachineFunction *MF;
+ const TargetInstrInfo *TII;
+ bool IsTrueBlockRequired;
+ bool IsFalseBlockRequired;
+ MachineBasicBlock *TrueBlock;
+ MachineBasicBlock *FalseBlock;
+ MachineBasicBlock *NewSuccessor;
+ MachineBasicBlock::iterator TrueBlockI;
+ MachineBasicBlock::iterator FalseBlockI;
+
+ typedef SmallVector<MachineInstr *, 4> BlockISELList;
+ typedef SmallDenseMap<int, BlockISELList> ISELInstructionList;
+
+ // A map of MBB numbers to their lists of contained ISEL instructions.
+ ISELInstructionList ISELInstructions;
+
+ /// Initialize the object.
+ void initialize(MachineFunction &MFParam);
+
+ void handleSpecialCases(BlockISELList &BIL, MachineBasicBlock *MBB);
+ void reorganizeBlockLayout(BlockISELList &BIL, MachineBasicBlock *MBB);
+ void populateBlocks(BlockISELList &BIL);
+ void expandMergeableISELs(BlockISELList &BIL);
+ void expandAndMergeISELs();
+
+ bool canMerge(MachineInstr *PrevPushedMI, MachineInstr *MI);
+
+ /// Is this instruction an ISEL or ISEL8?
+ static bool isISEL(const MachineInstr &MI) {
+ return (MI.getOpcode() == PPC::ISEL || MI.getOpcode() == PPC::ISEL8);
+ }
+
+ /// Is this instruction an ISEL8?
+ static bool isISEL8(const MachineInstr &MI) {
+ return (MI.getOpcode() == PPC::ISEL8);
+ }
+
+ /// Are the two operands using the same register?
+ bool useSameRegister(const MachineOperand &Op1, const MachineOperand &Op2) {
+ return (Op1.getReg() == Op2.getReg());
+ }
+
+ ///
+ /// Collect all ISEL instructions from the current function.
+ ///
+ /// Walk the current function and collect all the ISEL instructions that are
+ /// found. The instructions are placed in the ISELInstructions vector.
+ ///
+ /// \return true if any ISEL instructions were found, false otherwise
+ ///
+ bool collectISELInstructions();
+
+public:
+ static char ID;
+ PPCExpandISEL() : MachineFunctionPass(ID) {
+ initializePPCExpandISELPass(*PassRegistry::getPassRegistry());
+ }
+
+ ///
+ /// Determine whether to generate the ISEL instruction or expand it.
+ ///
+ /// Expand the ISEL instruction into an if-then-else sequence when either
+ /// of the following two conditions holds:
+ /// (1) -ppc-gen-isel=false
+ /// (2) hasISEL() returns false
+ /// Otherwise, still generate the ISEL instruction.
+ /// The -ppc-gen-isel option is set to true by default, which means the ISEL
+ /// instruction is still generated by default on targets that support it.
+ ///
+ /// \return true if ISEL should be expanded into an if-then-else sequence;
+ /// false if the ISEL instruction should be generated, i.e. not expanded.
+ ///
+ static bool isExpandISELEnabled(const MachineFunction &MF);
+
+#ifndef NDEBUG
+ void DumpISELInstructions() const;
+#endif
+
+ bool runOnMachineFunction(MachineFunction &MF) override {
+ if (!isExpandISELEnabled(MF))
+ return false;
+
+ DEBUG(dbgs() << "Function: "; MF.dump(); dbgs() << "\n");
+ initialize(MF);
+
+ if (!collectISELInstructions()) {
+ DEBUG(dbgs() << "No ISEL instructions in this function\n");
+ return false;
+ }
+
+#ifndef NDEBUG
+ DumpISELInstructions();
+#endif
+
+ expandAndMergeISELs();
+
+ return true;
+ }
+};
+} // end anonymous namespace
+
+void PPCExpandISEL::initialize(MachineFunction &MFParam) {
+ MF = &MFParam;
+ TII = MF->getSubtarget().getInstrInfo();
+ ISELInstructions.clear();
+}
+
+bool PPCExpandISEL::isExpandISELEnabled(const MachineFunction &MF) {
+ return !GenerateISEL || !MF.getSubtarget<PPCSubtarget>().hasISEL();
+}
+
+bool PPCExpandISEL::collectISELInstructions() {
+ for (MachineBasicBlock &MBB : *MF) {
+ BlockISELList thisBlockISELs;
+ for (MachineInstr &MI : MBB)
+ if (isISEL(MI))
+ thisBlockISELs.push_back(&MI);
+ if (!thisBlockISELs.empty())
+ ISELInstructions.insert(std::make_pair(MBB.getNumber(), thisBlockISELs));
+ }
+ return !ISELInstructions.empty();
+}
+
+#ifndef NDEBUG
+void PPCExpandISEL::DumpISELInstructions() const {
+ for (const auto &I : ISELInstructions) {
+ DEBUG(dbgs() << "BB#" << I.first << ":\n");
+ for (const auto &VI : I.second)
+ DEBUG(dbgs() << " "; VI->print(dbgs()));
+ }
+}
+#endif
+
+/// Contiguous ISELs that have the same condition can be merged.
+bool PPCExpandISEL::canMerge(MachineInstr *PrevPushedMI, MachineInstr *MI) {
+ // Same Condition Register?
+ if (!useSameRegister(PrevPushedMI->getOperand(3), MI->getOperand(3)))
+ return false;
+
+ MachineBasicBlock::iterator PrevPushedMBBI = *PrevPushedMI;
+ MachineBasicBlock::iterator MBBI = *MI;
+ return (std::prev(MBBI) == PrevPushedMBBI); // Contiguous ISELs?
+}
+
+void PPCExpandISEL::expandAndMergeISELs() {
+ for (auto &BlockList : ISELInstructions) {
+ DEBUG(dbgs() << "Expanding ISEL instructions in BB#" << BlockList.first
+ << "\n");
+
+ BlockISELList &CurrentISELList = BlockList.second;
+ auto I = CurrentISELList.begin();
+ auto E = CurrentISELList.end();
+
+ while (I != E) {
+ BlockISELList SubISELList;
+
+ SubISELList.push_back(*I++);
+
+ // Collect the ISELs that can be merged together.
+ while (I != E && canMerge(SubISELList.back(), *I))
+ SubISELList.push_back(*I++);
+
+ expandMergeableISELs(SubISELList);
+ }
+ }
+}
+
+void PPCExpandISEL::handleSpecialCases(BlockISELList &BIL,
+ MachineBasicBlock *MBB) {
+ IsTrueBlockRequired = false;
+ IsFalseBlockRequired = false;
+
+ auto MI = BIL.begin();
+ while (MI != BIL.end()) {
+ assert(isISEL(**MI) && "Expecting an ISEL instruction");
+ DEBUG(dbgs() << "ISEL: " << **MI << "\n");
+
+ MachineOperand &Dest = (*MI)->getOperand(0);
+ MachineOperand &TrueValue = (*MI)->getOperand(1);
+ MachineOperand &FalseValue = (*MI)->getOperand(2);
+
+ // If at least one of the ISEL instructions satisfies the following
+ // condition, we need the True Block:
+ // The Dest Register and True Value Register are not the same.
+ // Similarly, if at least one of the ISEL instructions satisfies the
+ // following condition, we need the False Block:
+ // The Dest Register and False Value Register are not the same.
+
+ bool IsADDIInstRequired = !useSameRegister(Dest, TrueValue);
+ bool IsORIInstRequired = !useSameRegister(Dest, FalseValue);
+
+ // Special case 1, all registers used by ISEL are the same one.
+ if (!IsADDIInstRequired && !IsORIInstRequired) {
+ DEBUG(dbgs() << "Remove redudant ISEL instruction.");
+ NumRemoved++;
+ (*MI)->eraseFromParent();
+ // Setting MI to the erase result keeps the iterator valid and advanced.
+ MI = BIL.erase(MI);
+ continue;
+ }
+
+ // Special case 2, the two input registers used by ISEL are the same.
+ // Note 1: We favor merging ISEL expansions over folding a single one. If
+ // the passed list has multiple mergeable ISELs, we won't fold any.
+ // Note 2: There is no need to test for PPC::R0/PPC::X0 because PPC::ZERO/
+ // PPC::ZERO8 will be used for the first operand if the value is meant to
+ // be zero. In this case, the useSameRegister method will return false,
+ // thereby preventing this ISEL from being folded.
+
+ if (useSameRegister(TrueValue, FalseValue) && (BIL.size() == 1)) {
+ DEBUG(dbgs() << "Fold the ISEL instruction to an unconditonal copy.");
+ NumFolded++;
+ BuildMI(*MBB, (*MI), dl, TII->get(isISEL8(**MI) ? PPC::ADDI8 : PPC::ADDI))
+ .add(Dest)
+ .add(TrueValue)
+ .add(MachineOperand::CreateImm(0));
+ (*MI)->eraseFromParent();
+ // Setting MI to the erase result keeps the iterator valid and advanced.
+ MI = BIL.erase(MI);
+ continue;
+ }
+
+ IsTrueBlockRequired |= IsADDIInstRequired;
+ IsFalseBlockRequired |= IsORIInstRequired;
+ MI++;
+ }
+}
+
+void PPCExpandISEL::reorganizeBlockLayout(BlockISELList &BIL,
+ MachineBasicBlock *MBB) {
+ if (BIL.empty())
+ return;
+
+ assert((IsTrueBlockRequired || IsFalseBlockRequired) &&
+ "Should have been handled by special cases earlier!");
+
+ MachineBasicBlock *Successor = nullptr;
+ const BasicBlock *LLVM_BB = MBB->getBasicBlock();
+ MachineBasicBlock::iterator MBBI = (*BIL.back());
+ NewSuccessor = (MBBI != MBB->getLastNonDebugInstr() || !MBB->canFallThrough())
+ // Another BB is needed to move the instructions that
+ // follow this ISEL. If the ISEL is the last instruction
+ // in a block that can't fall through, we also need a block
+ // to branch to.
+ ? MF->CreateMachineBasicBlock(LLVM_BB)
+ : nullptr;
+
+ MachineFunction::iterator It = MBB->getIterator();
+ ++It; // Point to the successor block of MBB.
+
+ // If NewSuccessor is NULL then the last ISEL in this group is the last
+ // non-debug instruction in this block. Find the fall-through successor
+ // of this block to use when updating the CFG below.
+ if (!NewSuccessor) {
+ for (auto &Succ : MBB->successors()) {
+ if (MBB->isLayoutSuccessor(Succ)) {
+ Successor = Succ;
+ break;
+ }
+ }
+ } else
+ Successor = NewSuccessor;
+
+ // The FalseBlock and TrueBlock are inserted after the MBB block but before
+ // its successor.
+ // Note this needs to be done *after* the Successor is set up above.
+ if (IsFalseBlockRequired) {
+ FalseBlock = MF->CreateMachineBasicBlock(LLVM_BB);
+ MF->insert(It, FalseBlock);
+ }
+
+ if (IsTrueBlockRequired) {
+ TrueBlock = MF->CreateMachineBasicBlock(LLVM_BB);
+ MF->insert(It, TrueBlock);
+ }
+
+ if (NewSuccessor) {
+ MF->insert(It, NewSuccessor);
+
+ // Transfer the rest of this block into the new successor block.
+ NewSuccessor->splice(NewSuccessor->end(), MBB,
+ std::next(MachineBasicBlock::iterator(BIL.back())),
+ MBB->end());
+ NewSuccessor->transferSuccessorsAndUpdatePHIs(MBB);
+
+ // Copy the original liveIns of MBB to NewSuccessor.
+ for (auto &LI : MBB->liveins())
+ NewSuccessor->addLiveIn(LI);
+
+ // After splitting the NewSuccessor block, Regs defined but not killed
+ // in MBB should be treated as liveins of NewSuccessor.
+ // Note: We cannot use stepBackward here since we are using the register
+ // liveness state at the end of MBB (the liveOut of MBB) as the liveIn for
+ // NewSuccessor; doing otherwise would create a cyclic dependence.
+ LivePhysRegs LPR(MF->getSubtarget<PPCSubtarget>().getRegisterInfo());
+ SmallVector<std::pair<unsigned, const MachineOperand *>, 2> Clobbers;
+ for (MachineInstr &MI : *MBB)
+ LPR.stepForward(MI, Clobbers);
+ for (auto &LI : LPR)
+ NewSuccessor->addLiveIn(LI);
+ } else {
+ // Remove successor from MBB.
+ MBB->removeSuccessor(Successor);
+ }
+
+ // Note that this needs to be done *after* transferring the successors from
+ // MBB to the NewSuccessor block, otherwise these blocks will also be
+ // transferred as successors!
+ MBB->addSuccessor(IsTrueBlockRequired ? TrueBlock : Successor);
+ MBB->addSuccessor(IsFalseBlockRequired ? FalseBlock : Successor);
+
+ if (IsTrueBlockRequired) {
+ TrueBlockI = TrueBlock->begin();
+ TrueBlock->addSuccessor(Successor);
+ }
+
+ if (IsFalseBlockRequired) {
+ FalseBlockI = FalseBlock->begin();
+ FalseBlock->addSuccessor(Successor);
+ }
+
+ // Conditional branch to the TrueBlock or Successor
+ BuildMI(*MBB, BIL.back(), dl, TII->get(PPC::BC))
+ .add(BIL.back()->getOperand(3))
+ .addMBB(IsTrueBlockRequired ? TrueBlock : Successor);
+
+ // Jump over the true block to the new successor if the condition is false.
+ BuildMI(*(IsFalseBlockRequired ? FalseBlock : MBB),
+ (IsFalseBlockRequired ? FalseBlockI : BIL.back()), dl,
+ TII->get(PPC::B))
+ .addMBB(Successor);
+
+ if (IsFalseBlockRequired)
+ FalseBlockI = FalseBlock->begin(); // get the position of PPC::B
+}
+
+void PPCExpandISEL::populateBlocks(BlockISELList &BIL) {
+ for (auto &MI : BIL) {
+ assert(isISEL(*MI) && "Expecting an ISEL instruction");
+
+ MachineOperand &Dest = MI->getOperand(0); // location to store to
+ MachineOperand &TrueValue = MI->getOperand(1); // Value to store if
+ // condition is true
+ MachineOperand &FalseValue = MI->getOperand(2); // Value to store if
+ // condition is false
+ MachineOperand &ConditionRegister = MI->getOperand(3); // Condition
+
+ DEBUG(dbgs() << "Dest: " << Dest << "\n");
+ DEBUG(dbgs() << "TrueValue: " << TrueValue << "\n");
+ DEBUG(dbgs() << "FalseValue: " << FalseValue << "\n");
+ DEBUG(dbgs() << "ConditionRegister: " << ConditionRegister << "\n");
+
+ // If the Dest Register and True Value Register are not the same one, we
+ // need the True Block.
+ bool IsADDIInstRequired = !useSameRegister(Dest, TrueValue);
+ bool IsORIInstRequired = !useSameRegister(Dest, FalseValue);
+
+ if (IsADDIInstRequired) {
+ // Copy the result into the destination if the condition is true.
+ BuildMI(*TrueBlock, TrueBlockI, dl,
+ TII->get(isISEL8(*MI) ? PPC::ADDI8 : PPC::ADDI))
+ .add(Dest)
+ .add(TrueValue)
+ .add(MachineOperand::CreateImm(0));
+
+ // Add the LiveIn registers required by true block.
+ TrueBlock->addLiveIn(TrueValue.getReg());
+ }
+
+ if (IsORIInstRequired) {
+ // Add the LiveIn registers required by false block.
+ FalseBlock->addLiveIn(FalseValue.getReg());
+ }
+
+ if (NewSuccessor) {
+ // Add the LiveIn registers required by NewSuccessor block.
+ NewSuccessor->addLiveIn(Dest.getReg());
+ NewSuccessor->addLiveIn(TrueValue.getReg());
+ NewSuccessor->addLiveIn(FalseValue.getReg());
+ NewSuccessor->addLiveIn(ConditionRegister.getReg());
+ }
+
+ // Copy the value into the destination if the condition is false.
+ if (IsORIInstRequired)
+ BuildMI(*FalseBlock, FalseBlockI, dl,
+ TII->get(isISEL8(*MI) ? PPC::ORI8 : PPC::ORI))
+ .add(Dest)
+ .add(FalseValue)
+ .add(MachineOperand::CreateImm(0));
+
+ MI->eraseFromParent(); // Remove the ISEL instruction.
+
+ NumExpanded++;
+ }
+}
+
+void PPCExpandISEL::expandMergeableISELs(BlockISELList &BIL) {
+ // At this stage all the ISELs of BIL are in the same MBB.
+ MachineBasicBlock *MBB = BIL.back()->getParent();
+
+ handleSpecialCases(BIL, MBB);
+ reorganizeBlockLayout(BIL, MBB);
+ populateBlocks(BIL);
+}
+
+INITIALIZE_PASS(PPCExpandISEL, DEBUG_TYPE, "PowerPC Expand ISEL Generation",
+ false, false)
+char PPCExpandISEL::ID = 0;
+
+FunctionPass *llvm::createPPCExpandISELPass() { return new PPCExpandISEL(); }
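For reference, the transformation the new pass performs, reduced to plain C++ semantics, is a register-level conditional move rewritten as explicit control flow. This is a sketch of the semantics only, not the MIR the pass actually builds:

// ISEL computes Dest = CondBit ? TrueVal : FalseVal in one instruction.
// On subtargets without ISEL, the pass emits the branchy equivalent,
// using addi/ori with a zero immediate as plain register copies.
static unsigned expandedISELSemantics(bool CondBit, unsigned TrueVal,
                                      unsigned FalseVal) {
  unsigned Dest;
  if (CondBit)        // bc cr-bit, TrueBlock
    Dest = TrueVal;   // TrueBlock:  addi Dest, TrueVal, 0
  else
    Dest = FalseVal;  // FalseBlock: ori  Dest, FalseVal, 0
  return Dest;        // fall through to Successor / NewSuccessor
}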
diff --git a/lib/Target/PowerPC/PPCFrameLowering.cpp b/lib/Target/PowerPC/PPCFrameLowering.cpp
index e786ef9aee0e..4c9430a2eca0 100644
--- a/lib/Target/PowerPC/PPCFrameLowering.cpp
+++ b/lib/Target/PowerPC/PPCFrameLowering.cpp
@@ -433,8 +433,7 @@ unsigned PPCFrameLowering::determineFrameLayout(MachineFunction &MF,
unsigned MaxAlign = MFI.getMaxAlignment(); // algmt required by data in frame
unsigned AlignMask = std::max(MaxAlign, TargetAlign) - 1;
- const PPCRegisterInfo *RegInfo =
- static_cast<const PPCRegisterInfo *>(Subtarget.getRegisterInfo());
+ const PPCRegisterInfo *RegInfo = Subtarget.getRegisterInfo();
// If we are a leaf function, and use up to 224 bytes of stack space,
// don't have a frame pointer, calls, or dynamic alloca then we do not need
@@ -519,8 +518,7 @@ void PPCFrameLowering::replaceFPWithRealFP(MachineFunction &MF) const {
unsigned FPReg = is31 ? PPC::R31 : PPC::R1;
unsigned FP8Reg = is31 ? PPC::X31 : PPC::X1;
- const PPCRegisterInfo *RegInfo =
- static_cast<const PPCRegisterInfo *>(Subtarget.getRegisterInfo());
+ const PPCRegisterInfo *RegInfo = Subtarget.getRegisterInfo();
bool HasBP = RegInfo->hasBasePointer(MF);
unsigned BPReg = HasBP ? (unsigned) RegInfo->getBaseRegister(MF) : FPReg;
unsigned BP8Reg = HasBP ? (unsigned) PPC::X30 : FPReg;
@@ -616,8 +614,7 @@ PPCFrameLowering::findScratchRegister(MachineBasicBlock *MBB,
return true;
// Get the list of callee-saved registers for the target.
- const PPCRegisterInfo *RegInfo =
- static_cast<const PPCRegisterInfo *>(Subtarget.getRegisterInfo());
+ const PPCRegisterInfo *RegInfo = Subtarget.getRegisterInfo();
const MCPhysReg *CSRegs = RegInfo->getCalleeSavedRegs(MBB->getParent());
// Get all the available registers in the block.
@@ -663,8 +660,7 @@ PPCFrameLowering::findScratchRegister(MachineBasicBlock *MBB,
// and the stack frame is large, we need two scratch registers.
bool
PPCFrameLowering::twoUniqueScratchRegsRequired(MachineBasicBlock *MBB) const {
- const PPCRegisterInfo *RegInfo =
- static_cast<const PPCRegisterInfo *>(Subtarget.getRegisterInfo());
+ const PPCRegisterInfo *RegInfo = Subtarget.getRegisterInfo();
MachineFunction &MF = *(MBB->getParent());
bool HasBP = RegInfo->hasBasePointer(MF);
unsigned FrameSize = determineFrameLayout(MF, false);
@@ -694,10 +690,8 @@ void PPCFrameLowering::emitPrologue(MachineFunction &MF,
MachineBasicBlock &MBB) const {
MachineBasicBlock::iterator MBBI = MBB.begin();
MachineFrameInfo &MFI = MF.getFrameInfo();
- const PPCInstrInfo &TII =
- *static_cast<const PPCInstrInfo *>(Subtarget.getInstrInfo());
- const PPCRegisterInfo *RegInfo =
- static_cast<const PPCRegisterInfo *>(Subtarget.getRegisterInfo());
+ const PPCInstrInfo &TII = *Subtarget.getInstrInfo();
+ const PPCRegisterInfo *RegInfo = Subtarget.getRegisterInfo();
MachineModuleInfo &MMI = MF.getMMI();
const MCRegisterInfo *MRI = MMI.getContext().getRegisterInfo();
@@ -1221,10 +1215,8 @@ void PPCFrameLowering::emitEpilogue(MachineFunction &MF,
if (MBBI != MBB.end())
dl = MBBI->getDebugLoc();
- const PPCInstrInfo &TII =
- *static_cast<const PPCInstrInfo *>(Subtarget.getInstrInfo());
- const PPCRegisterInfo *RegInfo =
- static_cast<const PPCRegisterInfo *>(Subtarget.getRegisterInfo());
+ const PPCInstrInfo &TII = *Subtarget.getInstrInfo();
+ const PPCRegisterInfo *RegInfo = Subtarget.getRegisterInfo();
// Get alignment info so we know how to restore the SP.
const MachineFrameInfo &MFI = MF.getFrameInfo();
@@ -1550,8 +1542,7 @@ void PPCFrameLowering::createTailCallBranchInstr(MachineBasicBlock &MBB) const {
if (MBBI != MBB.end())
dl = MBBI->getDebugLoc();
- const PPCInstrInfo &TII =
- *static_cast<const PPCInstrInfo *>(Subtarget.getInstrInfo());
+ const PPCInstrInfo &TII = *Subtarget.getInstrInfo();
// Create branch instruction for pseudo tail call return instruction
unsigned RetOpcode = MBBI->getOpcode();
@@ -1589,8 +1580,7 @@ void PPCFrameLowering::determineCalleeSaves(MachineFunction &MF,
RegScavenger *RS) const {
TargetFrameLowering::determineCalleeSaves(MF, SavedRegs, RS);
- const PPCRegisterInfo *RegInfo =
- static_cast<const PPCRegisterInfo *>(Subtarget.getRegisterInfo());
+ const PPCRegisterInfo *RegInfo = Subtarget.getRegisterInfo();
// Save and clear the LR state.
PPCFunctionInfo *FI = MF.getInfo<PPCFunctionInfo>();
@@ -1793,8 +1783,7 @@ void PPCFrameLowering::processFunctionBeforeFrameFinalized(MachineFunction &MF,
MFI.setObjectOffset(FI, LowerBound + MFI.getObjectOffset(FI));
}
- const PPCRegisterInfo *RegInfo =
- static_cast<const PPCRegisterInfo *>(Subtarget.getRegisterInfo());
+ const PPCRegisterInfo *RegInfo = Subtarget.getRegisterInfo();
if (RegInfo->hasBasePointer(MF)) {
HasGPSaveArea = true;
@@ -1941,8 +1930,7 @@ PPCFrameLowering::spillCalleeSavedRegisters(MachineBasicBlock &MBB,
return false;
MachineFunction *MF = MBB.getParent();
- const PPCInstrInfo &TII =
- *static_cast<const PPCInstrInfo *>(Subtarget.getInstrInfo());
+ const PPCInstrInfo &TII = *Subtarget.getInstrInfo();
DebugLoc DL;
bool CRSpilled = false;
MachineInstrBuilder CRMIB;
@@ -2083,8 +2071,7 @@ PPCFrameLowering::restoreCalleeSavedRegisters(MachineBasicBlock &MBB,
return false;
MachineFunction *MF = MBB.getParent();
- const PPCInstrInfo &TII =
- *static_cast<const PPCInstrInfo *>(Subtarget.getInstrInfo());
+ const PPCInstrInfo &TII = *Subtarget.getInstrInfo();
bool CR2Spilled = false;
bool CR3Spilled = false;
bool CR4Spilled = false;
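The repeated static_cast removals in this file work because PPCSubtarget overrides the accessors with covariant return types, so callers already receive the derived pointer. A self-contained sketch of the idiom, with simplified names:

struct TargetRegisterInfoBase {
  virtual ~TargetRegisterInfoBase() = default;
};

struct PPCRegisterInfoSketch : TargetRegisterInfoBase {
  bool hasBasePointer() const { return false; }
};

struct SubtargetBase {
  virtual ~SubtargetBase() = default;
  virtual const TargetRegisterInfoBase *getRegisterInfo() const = 0;
};

struct PPCSubtargetSketch : SubtargetBase {
  PPCRegisterInfoSketch RI;
  // Covariant return type: overrides the base virtual but returns the
  // derived class, so no static_cast is needed at call sites.
  const PPCRegisterInfoSketch *getRegisterInfo() const override {
    return &RI;
  }
};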
diff --git a/lib/Target/PowerPC/PPCISelDAGToDAG.cpp b/lib/Target/PowerPC/PPCISelDAGToDAG.cpp
index 1e51c1f651c9..9c72638023bb 100644
--- a/lib/Target/PowerPC/PPCISelDAGToDAG.cpp
+++ b/lib/Target/PowerPC/PPCISelDAGToDAG.cpp
@@ -12,30 +12,57 @@
//
//===----------------------------------------------------------------------===//
-#include "PPC.h"
+#include "MCTargetDesc/PPCMCTargetDesc.h"
#include "MCTargetDesc/PPCPredicates.h"
+#include "PPC.h"
+#include "PPCISelLowering.h"
#include "PPCMachineFunctionInfo.h"
+#include "PPCSubtarget.h"
#include "PPCTargetMachine.h"
+#include "llvm/ADT/APInt.h"
+#include "llvm/ADT/DenseMap.h"
+#include "llvm/ADT/SmallPtrSet.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/ADT/STLExtras.h"
#include "llvm/Analysis/BranchProbabilityInfo.h"
#include "llvm/CodeGen/FunctionLoweringInfo.h"
+#include "llvm/CodeGen/ISDOpcodes.h"
+#include "llvm/CodeGen/MachineBasicBlock.h"
#include "llvm/CodeGen/MachineFunction.h"
#include "llvm/CodeGen/MachineInstrBuilder.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/CodeGen/MachineValueType.h"
#include "llvm/CodeGen/SelectionDAG.h"
#include "llvm/CodeGen/SelectionDAGISel.h"
-#include "llvm/IR/Constants.h"
+#include "llvm/CodeGen/SelectionDAGNodes.h"
+#include "llvm/CodeGen/ValueTypes.h"
+#include "llvm/IR/BasicBlock.h"
+#include "llvm/IR/DebugLoc.h"
#include "llvm/IR/Function.h"
-#include "llvm/IR/GlobalAlias.h"
#include "llvm/IR/GlobalValue.h"
-#include "llvm/IR/GlobalVariable.h"
-#include "llvm/IR/Intrinsics.h"
+#include "llvm/IR/InlineAsm.h"
+#include "llvm/IR/InstrTypes.h"
#include "llvm/IR/Module.h"
+#include "llvm/Support/Casting.h"
+#include "llvm/Support/CodeGen.h"
#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/Compiler.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/ErrorHandling.h"
#include "llvm/Support/MathExtras.h"
#include "llvm/Support/raw_ostream.h"
-#include "llvm/Target/TargetOptions.h"
+#include "llvm/Target/TargetInstrInfo.h"
+#include "llvm/Target/TargetRegisterInfo.h"
+#include <algorithm>
+#include <cassert>
+#include <cstdint>
+#include <iterator>
+#include <limits>
+#include <memory>
+#include <new>
+#include <tuple>
+#include <utility>
+
using namespace llvm;
#define DEBUG_TYPE "ppc-codegen"
@@ -60,6 +87,7 @@ static cl::opt<bool> EnableBranchHint(
cl::Hidden);
namespace {
+
//===--------------------------------------------------------------------===//
/// PPCDAGToDAGISel - PPC specific code to select PPC machine
/// instructions for SelectionDAG operations.
@@ -69,6 +97,7 @@ namespace {
const PPCSubtarget *PPCSubTarget;
const PPCTargetLowering *PPCLowering;
unsigned GlobalBaseReg;
+
public:
explicit PPCDAGToDAGISel(PPCTargetMachine &tm)
: SelectionDAGISel(tm), TM(tm) {}
@@ -184,7 +213,6 @@ namespace {
bool SelectInlineAsmMemoryOperand(const SDValue &Op,
unsigned ConstraintID,
std::vector<SDValue> &OutOps) override {
-
switch(ConstraintID) {
default:
errs() << "ConstraintID: " << ConstraintID << "\n";
@@ -237,7 +265,8 @@ private:
void transferMemOperands(SDNode *N, SDNode *Result);
};
-}
+
+} // end anonymous namespace
/// InsertVRSaveCode - Once the entire function has been instruction selected,
/// all virtual registers are created and all machine instructions are built,
@@ -303,7 +332,6 @@ void PPCDAGToDAGISel::InsertVRSaveCode(MachineFunction &Fn) {
}
}
-
/// getGlobalBaseReg - Output the instructions required to put the
/// base address to use for accessing globals into a register.
///
@@ -368,7 +396,6 @@ static bool isIntS16Immediate(SDValue Op, short &Imm) {
return isIntS16Immediate(Op.getNode(), Imm);
}
-
/// isInt32Immediate - This method tests to see if the node is a 32-bit constant
/// operand. If so Imm will receive the 32-bit value.
static bool isInt32Immediate(SDNode *N, unsigned &Imm) {
@@ -833,6 +860,7 @@ static SDNode *getInt64(SelectionDAG *CurDAG, SDNode *N) {
}
namespace {
+
class BitPermutationSelector {
struct ValueBit {
SDValue V;
@@ -898,14 +926,12 @@ class BitPermutationSelector {
// associated with each) used to choose the lowering method.
struct ValueRotInfo {
SDValue V;
- unsigned RLAmt;
- unsigned NumGroups;
- unsigned FirstGroupStartIdx;
- bool Repl32;
+ unsigned RLAmt = std::numeric_limits<unsigned>::max();
+ unsigned NumGroups = 0;
+ unsigned FirstGroupStartIdx = std::numeric_limits<unsigned>::max();
+ bool Repl32 = false;
- ValueRotInfo()
- : RLAmt(UINT32_MAX), NumGroups(0), FirstGroupStartIdx(UINT32_MAX),
- Repl32(false) {}
+ ValueRotInfo() = default;
// For sorting (in reverse order) by NumGroups, and then by
// FirstGroupStartIdx.
@@ -1985,7 +2011,8 @@ public:
return RNLM;
}
};
-} // anonymous namespace
+
+} // end anonymous namespace
bool PPCDAGToDAGISel::tryBitPermutation(SDNode *N) {
if (N->getValueType(0) != MVT::i32 &&
@@ -2450,7 +2477,6 @@ void PPCDAGToDAGISel::transferMemOperands(SDNode *N, SDNode *Result) {
cast<MachineSDNode>(Result)->setMemRefs(MemOp, MemOp + 1);
}
-
// Select - Convert the specified operand from a target-independent to a
// target-specific node if it hasn't already been changed.
void PPCDAGToDAGISel::Select(SDNode *N) {
@@ -2474,19 +2500,18 @@ void PPCDAGToDAGISel::Select(SDNode *N) {
switch (N->getOpcode()) {
default: break;
- case ISD::Constant: {
+ case ISD::Constant:
if (N->getValueType(0) == MVT::i64) {
ReplaceNode(N, getInt64(CurDAG, N));
return;
}
break;
- }
- case ISD::SETCC: {
+ case ISD::SETCC:
if (trySETCC(N))
return;
break;
- }
+
case PPCISD::GlobalBaseReg:
ReplaceNode(N, getGlobalBaseReg());
return;
@@ -2502,11 +2527,10 @@ void PPCDAGToDAGISel::Select(SDNode *N) {
return;
}
- case PPCISD::READ_TIME_BASE: {
+ case PPCISD::READ_TIME_BASE:
ReplaceNode(N, CurDAG->getMachineNode(PPC::ReadTB, dl, MVT::i32, MVT::i32,
MVT::Other, N->getOperand(0)));
return;
- }
case PPCISD::SRA_ADDZE: {
SDValue N0 = N->getOperand(0);
@@ -2690,6 +2714,19 @@ void PPCDAGToDAGISel::Select(SDNode *N) {
CurDAG->SelectNodeTo(N, PPC::RLDICL, MVT::i64, Ops);
return;
}
+ // If this is a negated 64-bit zero-extension mask,
+ // i.e. the immediate is a sequence of ones from the most significant side
+ // and all zeros for the remainder, we should use rldicr.
+ if (isInt64Immediate(N->getOperand(1).getNode(), Imm64) &&
+ isMask_64(~Imm64)) {
+ SDValue Val = N->getOperand(0);
+ MB = 63 - countTrailingOnes(~Imm64);
+ SH = 0;
+ SDValue Ops[] = { Val, getI32Imm(SH, dl), getI32Imm(MB, dl) };
+ CurDAG->SelectNodeTo(N, PPC::RLDICR, MVT::i64, Ops);
+ return;
+ }
+
// AND X, 0 -> 0, not "rlwinm 32".
if (isInt32Immediate(N->getOperand(1), Imm) && (Imm == 0)) {
ReplaceUses(SDValue(N, 0), N->getOperand(1));
@@ -2911,8 +2948,8 @@ void PPCDAGToDAGISel::Select(SDNode *N) {
CurDAG->SelectNodeTo(N, PPC::XXSEL, N->getValueType(0), Ops);
return;
}
-
break;
+
case ISD::VECTOR_SHUFFLE:
if (PPCSubTarget->hasVSX() && (N->getValueType(0) == MVT::v2f64 ||
N->getValueType(0) == MVT::v2i64)) {
@@ -2940,7 +2977,11 @@ void PPCDAGToDAGISel::Select(SDNode *N) {
SelectAddrIdxOnly(LD->getBasePtr(), Base, Offset)) {
SDValue Chain = LD->getChain();
SDValue Ops[] = { Base, Offset, Chain };
- CurDAG->SelectNodeTo(N, PPC::LXVDSX, N->getValueType(0), Ops);
+ SDNode *NewN = CurDAG->SelectNodeTo(N, PPC::LXVDSX,
+ N->getValueType(0), Ops);
+ MachineSDNode::mmo_iterator MemOp = MF->allocateMemRefsArray(1);
+ MemOp[0] = LD->getMemOperand();
+ cast<MachineSDNode>(NewN)->setMemRefs(MemOp, MemOp + 1);
return;
}
}
@@ -3088,7 +3129,7 @@ void PPCDAGToDAGISel::Select(SDNode *N) {
SDValue(Tmp, 0), GA));
return;
}
- case PPCISD::PPC32_PICGOT: {
+ case PPCISD::PPC32_PICGOT:
// Generate a PIC-safe GOT reference.
assert(!PPCSubTarget->isPPC64() && PPCSubTarget->isSVR4ABI() &&
"PPCISD::PPC32_PICGOT is only supported for 32-bit SVR4");
@@ -3096,7 +3137,7 @@ void PPCDAGToDAGISel::Select(SDNode *N) {
PPCLowering->getPointerTy(CurDAG->getDataLayout()),
MVT::i32);
return;
- }
+
case PPCISD::VADD_SPLAT: {
// This expands into one of three sequences, depending on whether
// the first operand is odd or even, positive or negative.
@@ -3139,7 +3180,6 @@ void PPCDAGToDAGISel::Select(SDNode *N) {
SDValue TmpVal = SDValue(Tmp, 0);
ReplaceNode(N, CurDAG->getMachineNode(Opc2, dl, VT, TmpVal, TmpVal));
return;
-
} else if (Elt > 0) {
// Elt is odd and positive, in the range [17,31].
//
@@ -3154,7 +3194,6 @@ void PPCDAGToDAGISel::Select(SDNode *N) {
ReplaceNode(N, CurDAG->getMachineNode(Opc3, dl, VT, SDValue(Tmp1, 0),
SDValue(Tmp2, 0)));
return;
-
} else {
// Elt is odd and negative, in the range [-31,-17].
//
@@ -3199,7 +3238,7 @@ SDValue PPCDAGToDAGISel::combineToCMPB(SDNode *N) {
EVT VT = N->getValueType(0);
SDValue RHS, LHS;
- bool BytesFound[8] = { 0, 0, 0, 0, 0, 0, 0, 0 };
+ bool BytesFound[8] = {false, false, false, false, false, false, false, false};
uint64_t Mask = 0, Alt = 0;
auto IsByteSelectCC = [this](SDValue O, unsigned &b,
@@ -3499,7 +3538,6 @@ void PPCDAGToDAGISel::PreprocessISelDAG() {
/// PostprocessISelDAG - Perform some late peephole optimizations
/// on the DAG representation.
void PPCDAGToDAGISel::PostprocessISelDAG() {
-
// Skip peepholes at -O0.
if (TM.getOptLevel() == CodeGenOpt::None)
return;
@@ -3515,10 +3553,6 @@ void PPCDAGToDAGISel::PostprocessISelDAG() {
// be folded with the isel so that we don't need to materialize a register
// containing zero.
bool PPCDAGToDAGISel::AllUsersSelectZero(SDNode *N) {
- // If we're not using isel, then this does not matter.
- if (!PPCSubTarget->hasISEL())
- return false;
-
for (SDNode::use_iterator UI = N->use_begin(), UE = N->use_end();
UI != UE; ++UI) {
SDNode *User = *UI;
@@ -4520,7 +4554,6 @@ void PPCDAGToDAGISel::PeepholePPC64() {
}
}
-
/// createPPCISelDag - This pass converts a legalized DAG into a
/// PowerPC-specific DAG, ready for instruction scheduling.
///
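One functional change in this file is the new AND pattern above: a mask that is all ones from the most-significant bit downward (e.g. 0xFFFFFFFF00000000) can be matched to rldicr. A sketch of the mask test, assuming C++20 <bit> in place of LLVM's isMask_64/countTrailingOnes:

#include <bit>
#include <cstdint>

// Returns true if Imm is ones from the MSB down to some bit and zeros
// below it; ME receives the index of the last one bit in PPC numbering
// (bit 0 = MSB), which becomes rldicr's mask-end operand.
static bool isHighOnesMask(uint64_t Imm, unsigned &ME) {
  uint64_t Inv = ~Imm; // must be a nonzero low-order run of ones: 2^k - 1
  if (Imm == 0 || Inv == 0 || (Inv & (Inv + 1)) != 0)
    return false;
  ME = 63 - std::countr_one(Inv); // 63 - countTrailingOnes(~Imm)
  return true;
}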
diff --git a/lib/Target/PowerPC/PPCISelLowering.cpp b/lib/Target/PowerPC/PPCISelLowering.cpp
index 2b9195b095e1..f7663d8e5185 100644
--- a/lib/Target/PowerPC/PPCISelLowering.cpp
+++ b/lib/Target/PowerPC/PPCISelLowering.cpp
@@ -11,39 +11,88 @@
//
//===----------------------------------------------------------------------===//
-#include "PPCISelLowering.h"
#include "MCTargetDesc/PPCPredicates.h"
+#include "PPC.h"
#include "PPCCallingConv.h"
#include "PPCCCState.h"
+#include "PPCFrameLowering.h"
+#include "PPCInstrInfo.h"
+#include "PPCISelLowering.h"
#include "PPCMachineFunctionInfo.h"
#include "PPCPerfectShuffle.h"
+#include "PPCRegisterInfo.h"
+#include "PPCSubtarget.h"
#include "PPCTargetMachine.h"
-#include "PPCTargetObjectFile.h"
-#include "llvm/ADT/STLExtras.h"
+#include "llvm/ADT/APFloat.h"
+#include "llvm/ADT/APInt.h"
+#include "llvm/ADT/ArrayRef.h"
+#include "llvm/ADT/DenseMap.h"
+#include "llvm/ADT/None.h"
+#include "llvm/ADT/SmallPtrSet.h"
+#include "llvm/ADT/SmallSet.h"
+#include "llvm/ADT/SmallVector.h"
#include "llvm/ADT/Statistic.h"
+#include "llvm/ADT/STLExtras.h"
+#include "llvm/ADT/StringRef.h"
#include "llvm/ADT/StringSwitch.h"
-#include "llvm/ADT/Triple.h"
#include "llvm/CodeGen/CallingConvLower.h"
+#include "llvm/CodeGen/ISDOpcodes.h"
+#include "llvm/CodeGen/MachineBasicBlock.h"
#include "llvm/CodeGen/MachineFrameInfo.h"
#include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/CodeGen/MachineInstr.h"
#include "llvm/CodeGen/MachineInstrBuilder.h"
#include "llvm/CodeGen/MachineJumpTableInfo.h"
#include "llvm/CodeGen/MachineLoopInfo.h"
+#include "llvm/CodeGen/MachineMemOperand.h"
+#include "llvm/CodeGen/MachineOperand.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/CodeGen/MachineValueType.h"
+#include "llvm/CodeGen/RuntimeLibcalls.h"
#include "llvm/CodeGen/SelectionDAG.h"
-#include "llvm/CodeGen/TargetLoweringObjectFileImpl.h"
+#include "llvm/CodeGen/SelectionDAGNodes.h"
+#include "llvm/CodeGen/ValueTypes.h"
#include "llvm/IR/CallingConv.h"
+#include "llvm/IR/CallSite.h"
+#include "llvm/IR/Constant.h"
#include "llvm/IR/Constants.h"
+#include "llvm/IR/DataLayout.h"
+#include "llvm/IR/DebugLoc.h"
#include "llvm/IR/DerivedTypes.h"
#include "llvm/IR/Function.h"
+#include "llvm/IR/GlobalValue.h"
+#include "llvm/IR/Instructions.h"
#include "llvm/IR/Intrinsics.h"
+#include "llvm/IR/IRBuilder.h"
+#include "llvm/IR/Module.h"
+#include "llvm/IR/Type.h"
+#include "llvm/IR/Use.h"
+#include "llvm/IR/Value.h"
+#include "llvm/MC/MCExpr.h"
+#include "llvm/MC/MCRegisterInfo.h"
+#include "llvm/Support/AtomicOrdering.h"
+#include "llvm/Support/BranchProbability.h"
+#include "llvm/Support/Casting.h"
+#include "llvm/Support/CodeGen.h"
#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/Compiler.h"
+#include "llvm/Support/Debug.h"
#include "llvm/Support/ErrorHandling.h"
#include "llvm/Support/Format.h"
#include "llvm/Support/MathExtras.h"
#include "llvm/Support/raw_ostream.h"
+#include "llvm/Target/TargetInstrInfo.h"
+#include "llvm/Target/TargetLowering.h"
+#include "llvm/Target/TargetMachine.h"
#include "llvm/Target/TargetOptions.h"
+#include "llvm/Target/TargetRegisterInfo.h"
+#include <algorithm>
+#include <cassert>
+#include <cstdint>
+#include <iterator>
#include <list>
+#include <utility>
+#include <vector>
using namespace llvm;
@@ -1525,7 +1574,6 @@ bool PPC::isSplatShuffleMask(ShuffleVectorSDNode *N, unsigned EltSize) {
bool PPC::isXXINSERTWMask(ShuffleVectorSDNode *N, unsigned &ShiftElts,
unsigned &InsertAtByte, bool &Swap, bool IsLE) {
-
// Check that the mask is shuffling words
for (unsigned i = 0; i < 4; ++i) {
unsigned B0 = N->getMaskElt(i*4);
@@ -1643,7 +1691,6 @@ SDValue PPC::get_VSPLTI_elt(SDNode *N, unsigned ByteSize, SelectionDAG &DAG) {
// If the element isn't a constant, bail fully out.
if (!isa<ConstantSDNode>(N->getOperand(i))) return SDValue();
-
if (!UniquedVals[i&(Multiple-1)].getNode())
UniquedVals[i&(Multiple-1)] = N->getOperand(i);
else if (UniquedVals[i&(Multiple-1)] != N->getOperand(i))
@@ -2026,7 +2073,6 @@ bool PPCTargetLowering::getPreIndexedAddressParts(SDNode *N, SDValue &Base,
}
if (SelectAddressRegReg(Ptr, Base, Offset, DAG)) {
-
// Common code will reject creating a pre-inc form if the base pointer
// is a frame index, or if N is a store and the base pointer is either
// the same as or a predecessor of the value being stored. Check for
@@ -2277,7 +2323,6 @@ SDValue PPCTargetLowering::LowerBlockAddress(SDValue Op,
SDValue PPCTargetLowering::LowerGlobalTLSAddress(SDValue Op,
SelectionDAG &DAG) const {
-
// FIXME: TLS addresses currently use medium model code sequences,
// which is the most useful form. Eventually support for small and
// large models could be added if users need it, at the cost of
@@ -2602,10 +2647,9 @@ SDValue PPCTargetLowering::LowerINIT_TRAMPOLINE(SDValue Op,
// Lower to a call to __trampoline_setup(Trmp, TrampSize, FPtr, ctx_reg)
TargetLowering::CallLoweringInfo CLI(DAG);
- CLI.setDebugLoc(dl).setChain(Chain)
- .setCallee(CallingConv::C, Type::getVoidTy(*DAG.getContext()),
- DAG.getExternalSymbol("__trampoline_setup", PtrVT),
- std::move(Args));
+ CLI.setDebugLoc(dl).setChain(Chain).setLibCallee(
+ CallingConv::C, Type::getVoidTy(*DAG.getContext()),
+ DAG.getExternalSymbol("__trampoline_setup", PtrVT), std::move(Args));
std::pair<SDValue, SDValue> CallResult = LowerCallTo(CLI);
return CallResult.second;
@@ -2737,7 +2781,7 @@ bool llvm::CC_PPC32_SVR4_Custom_AlignArgRegs(unsigned &ValNo, MVT &ValVT,
return false;
}
-bool
+bool
llvm::CC_PPC32_SVR4_Custom_SkipLastArgRegsPPCF128(unsigned &ValNo, MVT &ValVT,
MVT &LocVT,
CCValAssign::LocInfo &LocInfo,
@@ -2752,7 +2796,7 @@ llvm::CC_PPC32_SVR4_Custom_SkipLastArgRegsPPCF128(unsigned &ValNo, MVT &ValVT,
unsigned RegNum = State.getFirstUnallocated(ArgRegs);
int RegsLeft = NumArgRegs - RegNum;
- // Skip if there is not enough registers left for long double type (4 gpr regs
+ // Skip if there are not enough registers left for long double type (4 gpr regs
// in soft float mode) and put long double argument on the stack.
if (RegNum != NumArgRegs && RegsLeft < 4) {
for (int i = 0; i < RegsLeft; i++) {
@@ -4066,7 +4110,7 @@ needStackSlotPassParameters(const PPCSubtarget &Subtarget,
static bool
hasSameArgumentList(const Function *CallerFn, ImmutableCallSite *CS) {
- if (CS->arg_size() != CallerFn->getArgumentList().size())
+ if (CS->arg_size() != CallerFn->arg_size())
return false;
ImmutableCallSite::arg_iterator CalleeArgIter = CS->arg_begin();
@@ -4222,11 +4266,12 @@ namespace {
struct TailCallArgumentInfo {
SDValue Arg;
SDValue FrameIdxOp;
- int FrameIdx;
+ int FrameIdx = 0;
- TailCallArgumentInfo() : FrameIdx(0) {}
+ TailCallArgumentInfo() = default;
};
-}
+
+} // end anonymous namespace
/// StoreTailCallArgumentsToStackSlot - Stores arguments to their stack slot.
static void StoreTailCallArgumentsToStackSlot(
@@ -4406,7 +4451,6 @@ PrepareCall(SelectionDAG &DAG, SDValue &Callee, SDValue &InFlag, SDValue &Chain,
SmallVectorImpl<std::pair<unsigned, SDValue>> &RegsToPass,
SmallVectorImpl<SDValue> &Ops, std::vector<EVT> &NodeTys,
ImmutableCallSite *CS, const PPCSubtarget &Subtarget) {
-
bool isPPC64 = Subtarget.isPPC64();
bool isSVR4ABI = Subtarget.isSVR4ABI();
bool isELFv2ABI = Subtarget.isELFv2ABI();
@@ -4602,7 +4646,6 @@ SDValue PPCTargetLowering::LowerCallResult(
SDValue Chain, SDValue InFlag, CallingConv::ID CallConv, bool isVarArg,
const SmallVectorImpl<ISD::InputArg> &Ins, const SDLoc &dl,
SelectionDAG &DAG, SmallVectorImpl<SDValue> &InVals) const {
-
SmallVector<CCValAssign, 16> RVLocs;
CCState CCRetInfo(CallConv, isVarArg, DAG.getMachineFunction(), RVLocs,
*DAG.getContext());
@@ -4649,7 +4692,6 @@ SDValue PPCTargetLowering::FinishCall(
SDValue Chain, SDValue CallSeqStart, SDValue &Callee, int SPDiff,
unsigned NumBytes, const SmallVectorImpl<ISD::InputArg> &Ins,
SmallVectorImpl<SDValue> &InVals, ImmutableCallSite *CS) const {
-
std::vector<EVT> NodeTys;
SmallVector<SDValue, 8> Ops;
unsigned CallOpc = PrepareCall(DAG, Callee, InFlag, Chain, CallSeqStart, dl,
@@ -5059,7 +5101,6 @@ SDValue PPCTargetLowering::LowerCall_64SVR4(
const SmallVectorImpl<ISD::InputArg> &Ins, const SDLoc &dl,
SelectionDAG &DAG, SmallVectorImpl<SDValue> &InVals,
ImmutableCallSite *CS) const {
-
bool isELFv2ABI = Subtarget.isELFv2ABI();
bool isLittleEndian = Subtarget.isLittleEndian();
unsigned NumOps = Outs.size();
@@ -5105,10 +5146,30 @@ SDValue PPCTargetLowering::LowerCall_64SVR4(
};
const unsigned NumGPRs = array_lengthof(GPR);
- const unsigned NumFPRs = 13;
+ const unsigned NumFPRs = useSoftFloat() ? 0 : 13;
const unsigned NumVRs = array_lengthof(VR);
const unsigned NumQFPRs = NumFPRs;
+ // On ELFv2, we can avoid allocating the parameter area if all the arguments
+ // can be passed to the callee in registers.
+ // For the fast calling convention, there is another check below.
+ // Note: Keep this consistent with LowerFormalArguments_64SVR4().
+ bool HasParameterArea =
+     !isELFv2ABI || isVarArg || CallConv == CallingConv::Fast;
+ if (!HasParameterArea) {
+ unsigned ParamAreaSize = NumGPRs * PtrByteSize;
+ unsigned AvailableFPRs = NumFPRs;
+ unsigned AvailableVRs = NumVRs;
+ unsigned NumBytesTmp = NumBytes;
+ for (unsigned i = 0; i != NumOps; ++i) {
+ if (Outs[i].Flags.isNest()) continue;
+ if (CalculateStackSlotUsed(Outs[i].VT, Outs[i].ArgVT, Outs[i].Flags,
+ PtrByteSize, LinkageSize, ParamAreaSize,
+ NumBytesTmp, AvailableFPRs, AvailableVRs,
+ Subtarget.hasQPX()))
+ HasParameterArea = true;
+ }
+ }
+
// When using the fast calling convention, we don't provide backing for
// arguments that will be in registers.
unsigned NumGPRsUsed = 0, NumFPRsUsed = 0, NumVRsUsed = 0;
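The HasParameterArea computation above boils down to a conservative default plus a per-argument scan. A hedged pseudo-version of the decision; the real scan goes through CalculateStackSlotUsed, whose exact signature is not reproduced here:

// ELFv1, varargs, and fastcc calls always reserve the parameter save
// area; on ELFv2 it is only needed if some argument spills to memory.
static bool needsParameterArea(bool IsELFv2, bool IsVarArg, bool IsFastCC,
                               bool AnyArgNeedsMemorySlot) {
  if (!IsELFv2 || IsVarArg || IsFastCC)
    return true;
  return AnyArgNeedsMemorySlot;
}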
@@ -5176,13 +5237,18 @@ SDValue PPCTargetLowering::LowerCall_64SVR4(
unsigned NumBytesActuallyUsed = NumBytes;
- // The prolog code of the callee may store up to 8 GPR argument registers to
+ // In the old ELFv1 ABI,
+ // the prolog code of the callee may store up to 8 GPR argument registers to
// the stack, allowing va_start to index over them in memory if it is varargs.
// Because we cannot tell if this is needed on the caller side, we have to
// conservatively assume that it is needed. As such, make sure we have at
// least enough stack space for the caller to store the 8 GPRs.
- // FIXME: On ELFv2, it may be unnecessary to allocate the parameter area.
- NumBytes = std::max(NumBytes, LinkageSize + 8 * PtrByteSize);
+ // In the ELFv2 ABI, we allocate the parameter area iff a callee
+ // really requires memory operands, e.g. a vararg function.
+ if (HasParameterArea)
+ NumBytes = std::max(NumBytes, LinkageSize + 8 * PtrByteSize);
+ else
+ NumBytes = LinkageSize;
// Tail call needs the stack to be aligned.
if (getTargetMachine().Options.GuaranteedTailCallOpt &&
@@ -5401,6 +5467,8 @@ SDValue PPCTargetLowering::LowerCall_64SVR4(
if (CallConv == CallingConv::Fast)
ComputePtrOff();
+ assert(HasParameterArea &&
+ "Parameter area must exist to pass an argument in memory.");
LowerMemOpCallTo(DAG, MF, Chain, Arg, PtrOff, SPDiff, ArgOffset,
true, isTailCall, false, MemOpChains,
TailCallArguments, dl);
@@ -5486,6 +5554,8 @@ SDValue PPCTargetLowering::LowerCall_64SVR4(
PtrOff = DAG.getNode(ISD::ADD, dl, PtrVT, PtrOff, ConstFour);
}
+ assert(HasParameterArea &&
+ "Parameter area must exist to pass an argument in memory.");
LowerMemOpCallTo(DAG, MF, Chain, Arg, PtrOff, SPDiff, ArgOffset,
true, isTailCall, false, MemOpChains,
TailCallArguments, dl);
@@ -5520,6 +5590,8 @@ SDValue PPCTargetLowering::LowerCall_64SVR4(
// GPRs when within range. For now, we always put the value in both
// locations (or even all three).
if (isVarArg) {
+ assert(HasParameterArea &&
+ "Parameter area must exist if we have a varargs call.");
// We could elide this store in the case where the object fits
// entirely in R registers. Maybe later.
SDValue Store =
@@ -5552,6 +5624,8 @@ SDValue PPCTargetLowering::LowerCall_64SVR4(
if (CallConv == CallingConv::Fast)
ComputePtrOff();
+ assert(HasParameterArea &&
+ "Parameter area must exist to pass an argument in memory.");
LowerMemOpCallTo(DAG, MF, Chain, Arg, PtrOff, SPDiff, ArgOffset,
true, isTailCall, true, MemOpChains,
TailCallArguments, dl);
@@ -5572,6 +5646,8 @@ SDValue PPCTargetLowering::LowerCall_64SVR4(
case MVT::v4i1: {
bool IsF32 = Arg.getValueType().getSimpleVT().SimpleTy == MVT::v4f32;
if (isVarArg) {
+ assert(HasParameterArea &&
+ "Parameter area must exist if we have a varargs call.");
// We could elide this store in the case where the object fits
// entirely in R registers. Maybe later.
SDValue Store =
@@ -5604,6 +5680,8 @@ SDValue PPCTargetLowering::LowerCall_64SVR4(
if (CallConv == CallingConv::Fast)
ComputePtrOff();
+ assert(HasParameterArea &&
+ "Parameter area must exist to pass an argument in memory.");
LowerMemOpCallTo(DAG, MF, Chain, Arg, PtrOff, SPDiff, ArgOffset,
true, isTailCall, true, MemOpChains,
TailCallArguments, dl);
@@ -5618,7 +5696,8 @@ SDValue PPCTargetLowering::LowerCall_64SVR4(
}
}
- assert(NumBytesActuallyUsed == ArgOffset);
+ assert((!HasParameterArea || NumBytesActuallyUsed == ArgOffset) &&
+ "mismatch in size of parameter area");
(void)NumBytesActuallyUsed;
if (!MemOpChains.empty())
@@ -5673,7 +5752,6 @@ SDValue PPCTargetLowering::LowerCall_Darwin(
const SmallVectorImpl<ISD::InputArg> &Ins, const SDLoc &dl,
SelectionDAG &DAG, SmallVectorImpl<SDValue> &InVals,
ImmutableCallSite *CS) const {
-
unsigned NumOps = Outs.size();
EVT PtrVT = getPointerTy(DAG.getDataLayout());
@@ -6065,7 +6143,6 @@ PPCTargetLowering::LowerReturn(SDValue Chain, CallingConv::ID CallConv,
const SmallVectorImpl<ISD::OutputArg> &Outs,
const SmallVectorImpl<SDValue> &OutVals,
const SDLoc &dl, SelectionDAG &DAG) const {
-
SmallVector<CCValAssign, 16> RVLocs;
CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(), RVLocs,
*DAG.getContext());
@@ -7612,7 +7689,6 @@ SDValue PPCTargetLowering::LowerVECTOR_SHUFFLE(SDValue Op,
SDValue Swap = DAG.getNode(PPCISD::SWAP_NO_CHAIN, dl, MVT::v2f64, Conv);
return DAG.getNode(ISD::BITCAST, dl, MVT::v16i8, Swap);
}
-
}
if (Subtarget.hasQPX()) {
@@ -7792,24 +7868,39 @@ SDValue PPCTargetLowering::LowerVECTOR_SHUFFLE(SDValue Op,
static bool getVectorCompareInfo(SDValue Intrin, int &CompareOpc,
bool &isDot, const PPCSubtarget &Subtarget) {
unsigned IntrinsicID =
- cast<ConstantSDNode>(Intrin.getOperand(0))->getZExtValue();
+ cast<ConstantSDNode>(Intrin.getOperand(0))->getZExtValue();
CompareOpc = -1;
isDot = false;
switch (IntrinsicID) {
- default: return false;
- // Comparison predicates.
- case Intrinsic::ppc_altivec_vcmpbfp_p: CompareOpc = 966; isDot = 1; break;
- case Intrinsic::ppc_altivec_vcmpeqfp_p: CompareOpc = 198; isDot = 1; break;
- case Intrinsic::ppc_altivec_vcmpequb_p: CompareOpc = 6; isDot = 1; break;
- case Intrinsic::ppc_altivec_vcmpequh_p: CompareOpc = 70; isDot = 1; break;
- case Intrinsic::ppc_altivec_vcmpequw_p: CompareOpc = 134; isDot = 1; break;
+ default:
+ return false;
+ // Comparison predicates.
+ case Intrinsic::ppc_altivec_vcmpbfp_p:
+ CompareOpc = 966;
+ isDot = true;
+ break;
+ case Intrinsic::ppc_altivec_vcmpeqfp_p:
+ CompareOpc = 198;
+ isDot = true;
+ break;
+ case Intrinsic::ppc_altivec_vcmpequb_p:
+ CompareOpc = 6;
+ isDot = true;
+ break;
+ case Intrinsic::ppc_altivec_vcmpequh_p:
+ CompareOpc = 70;
+ isDot = true;
+ break;
+ case Intrinsic::ppc_altivec_vcmpequw_p:
+ CompareOpc = 134;
+ isDot = true;
+ break;
case Intrinsic::ppc_altivec_vcmpequd_p:
if (Subtarget.hasP8Altivec()) {
CompareOpc = 199;
- isDot = 1;
+ isDot = true;
} else
return false;
-
break;
case Intrinsic::ppc_altivec_vcmpneb_p:
case Intrinsic::ppc_altivec_vcmpneh_p:
@@ -7818,45 +7909,80 @@ static bool getVectorCompareInfo(SDValue Intrin, int &CompareOpc,
case Intrinsic::ppc_altivec_vcmpnezh_p:
case Intrinsic::ppc_altivec_vcmpnezw_p:
if (Subtarget.hasP9Altivec()) {
- switch(IntrinsicID) {
- default: llvm_unreachable("Unknown comparison intrinsic.");
- case Intrinsic::ppc_altivec_vcmpneb_p: CompareOpc = 7; break;
- case Intrinsic::ppc_altivec_vcmpneh_p: CompareOpc = 71; break;
- case Intrinsic::ppc_altivec_vcmpnew_p: CompareOpc = 135; break;
- case Intrinsic::ppc_altivec_vcmpnezb_p: CompareOpc = 263; break;
- case Intrinsic::ppc_altivec_vcmpnezh_p: CompareOpc = 327; break;
- case Intrinsic::ppc_altivec_vcmpnezw_p: CompareOpc = 391; break;
+ switch (IntrinsicID) {
+ default:
+ llvm_unreachable("Unknown comparison intrinsic.");
+ case Intrinsic::ppc_altivec_vcmpneb_p:
+ CompareOpc = 7;
+ break;
+ case Intrinsic::ppc_altivec_vcmpneh_p:
+ CompareOpc = 71;
+ break;
+ case Intrinsic::ppc_altivec_vcmpnew_p:
+ CompareOpc = 135;
+ break;
+ case Intrinsic::ppc_altivec_vcmpnezb_p:
+ CompareOpc = 263;
+ break;
+ case Intrinsic::ppc_altivec_vcmpnezh_p:
+ CompareOpc = 327;
+ break;
+ case Intrinsic::ppc_altivec_vcmpnezw_p:
+ CompareOpc = 391;
+ break;
}
- isDot = 1;
+ isDot = true;
} else
return false;
-
break;
- case Intrinsic::ppc_altivec_vcmpgefp_p: CompareOpc = 454; isDot = 1; break;
- case Intrinsic::ppc_altivec_vcmpgtfp_p: CompareOpc = 710; isDot = 1; break;
- case Intrinsic::ppc_altivec_vcmpgtsb_p: CompareOpc = 774; isDot = 1; break;
- case Intrinsic::ppc_altivec_vcmpgtsh_p: CompareOpc = 838; isDot = 1; break;
- case Intrinsic::ppc_altivec_vcmpgtsw_p: CompareOpc = 902; isDot = 1; break;
+ case Intrinsic::ppc_altivec_vcmpgefp_p:
+ CompareOpc = 454;
+ isDot = true;
+ break;
+ case Intrinsic::ppc_altivec_vcmpgtfp_p:
+ CompareOpc = 710;
+ isDot = true;
+ break;
+ case Intrinsic::ppc_altivec_vcmpgtsb_p:
+ CompareOpc = 774;
+ isDot = true;
+ break;
+ case Intrinsic::ppc_altivec_vcmpgtsh_p:
+ CompareOpc = 838;
+ isDot = true;
+ break;
+ case Intrinsic::ppc_altivec_vcmpgtsw_p:
+ CompareOpc = 902;
+ isDot = true;
+ break;
case Intrinsic::ppc_altivec_vcmpgtsd_p:
if (Subtarget.hasP8Altivec()) {
CompareOpc = 967;
- isDot = 1;
+ isDot = true;
} else
return false;
-
break;
- case Intrinsic::ppc_altivec_vcmpgtub_p: CompareOpc = 518; isDot = 1; break;
- case Intrinsic::ppc_altivec_vcmpgtuh_p: CompareOpc = 582; isDot = 1; break;
- case Intrinsic::ppc_altivec_vcmpgtuw_p: CompareOpc = 646; isDot = 1; break;
+ case Intrinsic::ppc_altivec_vcmpgtub_p:
+ CompareOpc = 518;
+ isDot = true;
+ break;
+ case Intrinsic::ppc_altivec_vcmpgtuh_p:
+ CompareOpc = 582;
+ isDot = true;
+ break;
+ case Intrinsic::ppc_altivec_vcmpgtuw_p:
+ CompareOpc = 646;
+ isDot = true;
+ break;
case Intrinsic::ppc_altivec_vcmpgtud_p:
if (Subtarget.hasP8Altivec()) {
CompareOpc = 711;
- isDot = 1;
+ isDot = true;
} else
return false;
-
break;
- // VSX predicate comparisons use the same infrastructure
+
+ // VSX predicate comparisons use the same infrastructure
case Intrinsic::ppc_vsx_xvcmpeqdp_p:
case Intrinsic::ppc_vsx_xvcmpgedp_p:
case Intrinsic::ppc_vsx_xvcmpgtdp_p:
@@ -7865,33 +7991,51 @@ static bool getVectorCompareInfo(SDValue Intrin, int &CompareOpc,
case Intrinsic::ppc_vsx_xvcmpgtsp_p:
if (Subtarget.hasVSX()) {
switch (IntrinsicID) {
- case Intrinsic::ppc_vsx_xvcmpeqdp_p: CompareOpc = 99; break;
- case Intrinsic::ppc_vsx_xvcmpgedp_p: CompareOpc = 115; break;
- case Intrinsic::ppc_vsx_xvcmpgtdp_p: CompareOpc = 107; break;
- case Intrinsic::ppc_vsx_xvcmpeqsp_p: CompareOpc = 67; break;
- case Intrinsic::ppc_vsx_xvcmpgesp_p: CompareOpc = 83; break;
- case Intrinsic::ppc_vsx_xvcmpgtsp_p: CompareOpc = 75; break;
+ case Intrinsic::ppc_vsx_xvcmpeqdp_p:
+ CompareOpc = 99;
+ break;
+ case Intrinsic::ppc_vsx_xvcmpgedp_p:
+ CompareOpc = 115;
+ break;
+ case Intrinsic::ppc_vsx_xvcmpgtdp_p:
+ CompareOpc = 107;
+ break;
+ case Intrinsic::ppc_vsx_xvcmpeqsp_p:
+ CompareOpc = 67;
+ break;
+ case Intrinsic::ppc_vsx_xvcmpgesp_p:
+ CompareOpc = 83;
+ break;
+ case Intrinsic::ppc_vsx_xvcmpgtsp_p:
+ CompareOpc = 75;
+ break;
}
- isDot = 1;
- }
- else
+ isDot = true;
+ } else
return false;
-
break;
- // Normal Comparisons.
- case Intrinsic::ppc_altivec_vcmpbfp: CompareOpc = 966; isDot = 0; break;
- case Intrinsic::ppc_altivec_vcmpeqfp: CompareOpc = 198; isDot = 0; break;
- case Intrinsic::ppc_altivec_vcmpequb: CompareOpc = 6; isDot = 0; break;
- case Intrinsic::ppc_altivec_vcmpequh: CompareOpc = 70; isDot = 0; break;
- case Intrinsic::ppc_altivec_vcmpequw: CompareOpc = 134; isDot = 0; break;
+ // Normal Comparisons.
+ case Intrinsic::ppc_altivec_vcmpbfp:
+ CompareOpc = 966;
+ break;
+ case Intrinsic::ppc_altivec_vcmpeqfp:
+ CompareOpc = 198;
+ break;
+ case Intrinsic::ppc_altivec_vcmpequb:
+ CompareOpc = 6;
+ break;
+ case Intrinsic::ppc_altivec_vcmpequh:
+ CompareOpc = 70;
+ break;
+ case Intrinsic::ppc_altivec_vcmpequw:
+ CompareOpc = 134;
+ break;
case Intrinsic::ppc_altivec_vcmpequd:
- if (Subtarget.hasP8Altivec()) {
+ if (Subtarget.hasP8Altivec())
CompareOpc = 199;
- isDot = 0;
- } else
+ else
return false;
-
break;
case Intrinsic::ppc_altivec_vcmpneb:
case Intrinsic::ppc_altivec_vcmpneh:
@@ -7899,43 +8043,67 @@ static bool getVectorCompareInfo(SDValue Intrin, int &CompareOpc,
case Intrinsic::ppc_altivec_vcmpnezb:
case Intrinsic::ppc_altivec_vcmpnezh:
case Intrinsic::ppc_altivec_vcmpnezw:
- if (Subtarget.hasP9Altivec()) {
+ if (Subtarget.hasP9Altivec())
switch (IntrinsicID) {
- default: llvm_unreachable("Unknown comparison intrinsic.");
- case Intrinsic::ppc_altivec_vcmpneb: CompareOpc = 7; break;
- case Intrinsic::ppc_altivec_vcmpneh: CompareOpc = 71; break;
- case Intrinsic::ppc_altivec_vcmpnew: CompareOpc = 135; break;
- case Intrinsic::ppc_altivec_vcmpnezb: CompareOpc = 263; break;
- case Intrinsic::ppc_altivec_vcmpnezh: CompareOpc = 327; break;
- case Intrinsic::ppc_altivec_vcmpnezw: CompareOpc = 391; break;
+ default:
+ llvm_unreachable("Unknown comparison intrinsic.");
+ case Intrinsic::ppc_altivec_vcmpneb:
+ CompareOpc = 7;
+ break;
+ case Intrinsic::ppc_altivec_vcmpneh:
+ CompareOpc = 71;
+ break;
+ case Intrinsic::ppc_altivec_vcmpnew:
+ CompareOpc = 135;
+ break;
+ case Intrinsic::ppc_altivec_vcmpnezb:
+ CompareOpc = 263;
+ break;
+ case Intrinsic::ppc_altivec_vcmpnezh:
+ CompareOpc = 327;
+ break;
+ case Intrinsic::ppc_altivec_vcmpnezw:
+ CompareOpc = 391;
+ break;
}
- isDot = 0;
- } else
+ else
return false;
break;
- case Intrinsic::ppc_altivec_vcmpgefp: CompareOpc = 454; isDot = 0; break;
- case Intrinsic::ppc_altivec_vcmpgtfp: CompareOpc = 710; isDot = 0; break;
- case Intrinsic::ppc_altivec_vcmpgtsb: CompareOpc = 774; isDot = 0; break;
- case Intrinsic::ppc_altivec_vcmpgtsh: CompareOpc = 838; isDot = 0; break;
- case Intrinsic::ppc_altivec_vcmpgtsw: CompareOpc = 902; isDot = 0; break;
+ case Intrinsic::ppc_altivec_vcmpgefp:
+ CompareOpc = 454;
+ break;
+ case Intrinsic::ppc_altivec_vcmpgtfp:
+ CompareOpc = 710;
+ break;
+ case Intrinsic::ppc_altivec_vcmpgtsb:
+ CompareOpc = 774;
+ break;
+ case Intrinsic::ppc_altivec_vcmpgtsh:
+ CompareOpc = 838;
+ break;
+ case Intrinsic::ppc_altivec_vcmpgtsw:
+ CompareOpc = 902;
+ break;
case Intrinsic::ppc_altivec_vcmpgtsd:
- if (Subtarget.hasP8Altivec()) {
+ if (Subtarget.hasP8Altivec())
CompareOpc = 967;
- isDot = 0;
- } else
+ else
return false;
-
break;
- case Intrinsic::ppc_altivec_vcmpgtub: CompareOpc = 518; isDot = 0; break;
- case Intrinsic::ppc_altivec_vcmpgtuh: CompareOpc = 582; isDot = 0; break;
- case Intrinsic::ppc_altivec_vcmpgtuw: CompareOpc = 646; isDot = 0; break;
+ case Intrinsic::ppc_altivec_vcmpgtub:
+ CompareOpc = 518;
+ break;
+ case Intrinsic::ppc_altivec_vcmpgtuh:
+ CompareOpc = 582;
+ break;
+ case Intrinsic::ppc_altivec_vcmpgtuw:
+ CompareOpc = 646;
+ break;
case Intrinsic::ppc_altivec_vcmpgtud:
- if (Subtarget.hasP8Altivec()) {
+ if (Subtarget.hasP8Altivec())
CompareOpc = 711;
- isDot = 0;
- } else
+ else
return false;
-
break;
}
return true;
@@ -8044,7 +8212,7 @@ SDValue PPCTargetLowering::LowerSIGN_EXTEND_INREG(SDValue Op,
}
SDValue PPCTargetLowering::LowerSCALAR_TO_VECTOR(SDValue Op,
- SelectionDAG &DAG) const {
+ SelectionDAG &DAG) const {
SDLoc dl(Op);
// Create a stack slot that is 16-byte aligned.
MachineFrameInfo &MFI = DAG.getMachineFunction().getFrameInfo();
@@ -9174,10 +9342,9 @@ PPCTargetLowering::EmitInstrWithCustomInserter(MachineInstr &MI,
MachineFunction *F = BB->getParent();
- if (Subtarget.hasISEL() &&
- (MI.getOpcode() == PPC::SELECT_CC_I4 ||
+ if (MI.getOpcode() == PPC::SELECT_CC_I4 ||
MI.getOpcode() == PPC::SELECT_CC_I8 ||
- MI.getOpcode() == PPC::SELECT_I4 || MI.getOpcode() == PPC::SELECT_I8)) {
+ MI.getOpcode() == PPC::SELECT_I4 || MI.getOpcode() == PPC::SELECT_I8) {
SmallVector<MachineOperand, 2> Cond;
if (MI.getOpcode() == PPC::SELECT_CC_I4 ||
MI.getOpcode() == PPC::SELECT_CC_I8)
@@ -9417,7 +9584,6 @@ PPCTargetLowering::EmitInstrWithCustomInserter(MachineInstr &MI,
BB = EmitAtomicBinary(MI, BB, 4, 0);
else if (MI.getOpcode() == PPC::ATOMIC_SWAP_I64)
BB = EmitAtomicBinary(MI, BB, 8, 0);
-
else if (MI.getOpcode() == PPC::ATOMIC_CMP_SWAP_I32 ||
MI.getOpcode() == PPC::ATOMIC_CMP_SWAP_I64 ||
(Subtarget.hasPartwordAtomics() &&
@@ -10028,14 +10194,12 @@ static bool findConsecutiveLoad(LoadSDNode *LD, SelectionDAG &DAG) {
return false;
}
-
/// This function is called when we have proved that a SETCC node can be replaced
/// by subtraction (and other supporting instructions) so that the result of
/// comparison is kept in a GPR instead of CR. This function is purely for
/// codegen purposes and has some flags to guide the codegen process.
static SDValue generateEquivalentSub(SDNode *N, int Size, bool Complement,
bool Swap, SDLoc &DL, SelectionDAG &DAG) {
-
assert(N->getOpcode() == ISD::SETCC && "ISD::SETCC Expected.");
// Zero extend the operands to the largest legal integer. Originally, they
@@ -10068,7 +10232,6 @@ static SDValue generateEquivalentSub(SDNode *N, int Size, bool Complement,
SDValue PPCTargetLowering::ConvertSETCCToSubtract(SDNode *N,
DAGCombinerInfo &DCI) const {
-
assert(N->getOpcode() == ISD::SETCC && "ISD::SETCC Expected.");
SelectionDAG &DAG = DCI.DAG;
@@ -11227,9 +11390,20 @@ SDValue PPCTargetLowering::PerformDAGCombine(SDNode *N,
if (BSwapOp.getValueType() == MVT::i16)
BSwapOp = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i32, BSwapOp);
+      // If the type of the BSWAP operand is wider than the stored memory
+      // width, it needs to be shifted right before the STBRX.
+ EVT mVT = cast<StoreSDNode>(N)->getMemoryVT();
+ if (Op1VT.bitsGT(mVT)) {
+ int Shift = Op1VT.getSizeInBits() - mVT.getSizeInBits();
+ BSwapOp = DAG.getNode(ISD::SRL, dl, Op1VT, BSwapOp,
+ DAG.getConstant(Shift, dl, MVT::i32));
+ // Need to truncate if this is a bswap of i64 stored as i32/i16.
+ if (Op1VT == MVT::i64)
+ BSwapOp = DAG.getNode(ISD::TRUNCATE, dl, MVT::i32, BSwapOp);
+ }
+
SDValue Ops[] = {
- N->getOperand(0), BSwapOp, N->getOperand(2),
- DAG.getValueType(N->getOperand(1).getValueType())
+ N->getOperand(0), BSwapOp, N->getOperand(2), DAG.getValueType(mVT)
};
return
DAG.getMemIntrinsicNode(PPCISD::STBRX, dl, DAG.getVTList(MVT::Other),
@@ -11570,7 +11744,7 @@ SDValue PPCTargetLowering::PerformDAGCombine(SDNode *N,
}
break;
- case ISD::INTRINSIC_W_CHAIN: {
+ case ISD::INTRINSIC_W_CHAIN:
// For little endian, VSX loads require generating lxvd2x/xxswapd.
// Not needed on ISA 3.0 based CPUs since we have a non-permuting load.
if (Subtarget.needsSwapsForVSXMemOps()) {
@@ -11583,8 +11757,7 @@ SDValue PPCTargetLowering::PerformDAGCombine(SDNode *N,
}
}
break;
- }
- case ISD::INTRINSIC_VOID: {
+ case ISD::INTRINSIC_VOID:
// For little endian, VSX stores require generating xxswapd/stxvd2x.
// Not needed on ISA 3.0 based CPUs since we have a non-permuting store.
if (Subtarget.needsSwapsForVSXMemOps()) {
@@ -11597,7 +11770,6 @@ SDValue PPCTargetLowering::PerformDAGCombine(SDNode *N,
}
}
break;
- }
case ISD::BSWAP:
// Turn BSWAP (LOAD) -> lhbrx/lwbrx.
if (ISD::isNON_EXTLoad(N->getOperand(0).getNode()) &&
@@ -11635,9 +11807,8 @@ SDValue PPCTargetLowering::PerformDAGCombine(SDNode *N,
// Return N so it doesn't get rechecked!
return SDValue(N, 0);
}
-
break;
- case PPCISD::VCMP: {
+ case PPCISD::VCMP:
// If a VCMPo node already exists with exactly the same operands as this
// node, use its result instead of this node (VCMPo computes both a CR6 and
// a normal output).
@@ -11687,7 +11858,6 @@ SDValue PPCTargetLowering::PerformDAGCombine(SDNode *N,
return SDValue(VCMPoNode, 0);
}
break;
- }
case ISD::BRCOND: {
SDValue Cond = N->getOperand(1);
SDValue Target = N->getOperand(2);
@@ -11847,6 +12017,7 @@ PPCTargetLowering::BuildSDIVPow2(SDNode *N, const APInt &Divisor,
void PPCTargetLowering::computeKnownBitsForTargetNode(const SDValue Op,
APInt &KnownZero,
APInt &KnownOne,
+ const APInt &DemandedElts,
const SelectionDAG &DAG,
unsigned Depth) const {
KnownZero = KnownOne = APInt(KnownZero.getBitWidth(), 0);
@@ -12295,7 +12466,6 @@ PPCTargetLowering::isOffsetFoldingLegal(const GlobalAddressSDNode *GA) const {
bool PPCTargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info,
const CallInst &I,
unsigned Intrinsic) const {
-
switch (Intrinsic) {
case Intrinsic::ppc_qpx_qvlfd:
case Intrinsic::ppc_qpx_qvlfs:
@@ -12753,7 +12923,6 @@ void PPCTargetLowering::insertSSPDeclarations(Module &M) const {
}
bool PPCTargetLowering::isFPImmLegal(const APFloat &Imm, EVT VT) const {
-
if (!VT.isSimple() || !Subtarget.hasVSX())
return false;
diff --git a/lib/Target/PowerPC/PPCISelLowering.h b/lib/Target/PowerPC/PPCISelLowering.h
index 05acd25ae5fc..6113eb58f421 100644
--- a/lib/Target/PowerPC/PPCISelLowering.h
+++ b/lib/Target/PowerPC/PPCISelLowering.h
@@ -17,13 +17,26 @@
#include "PPC.h"
#include "PPCInstrInfo.h"
-#include "PPCRegisterInfo.h"
#include "llvm/CodeGen/CallingConvLower.h"
+#include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/CodeGen/MachineMemOperand.h"
+#include "llvm/CodeGen/MachineValueType.h"
#include "llvm/CodeGen/SelectionDAG.h"
+#include "llvm/CodeGen/SelectionDAGNodes.h"
+#include "llvm/CodeGen/ValueTypes.h"
+#include "llvm/IR/Attributes.h"
+#include "llvm/IR/CallingConv.h"
+#include "llvm/IR/Function.h"
+#include "llvm/IR/InlineAsm.h"
+#include "llvm/IR/Metadata.h"
+#include "llvm/IR/Type.h"
#include "llvm/Target/TargetLowering.h"
+#include <utility>
namespace llvm {
+
namespace PPCISD {
+
enum NodeType : unsigned {
// Start the numbering where the builtin ops and target ops leave off.
FIRST_NUMBER = ISD::BUILTIN_OP_END,
@@ -398,10 +411,12 @@ namespace llvm {
/// the last operand.
TOC_ENTRY
};
- }
+
+ } // end namespace PPCISD
/// Define some predicates that are used for node matching.
namespace PPC {
+
/// isVPKUHUMShuffleMask - Return true if this is the shuffle mask for a
/// VPKUHUM instruction.
bool isVPKUHUMShuffleMask(ShuffleVectorSDNode *N, unsigned ShuffleKind,
@@ -465,7 +480,8 @@ namespace llvm {
/// If this is a qvaligni shuffle mask, return the shift
/// amount, otherwise return -1.
int isQVALIGNIShuffleMask(SDNode *N);
- }
+
+ } // end namespace PPC
class PPCTargetLowering : public TargetLowering {
const PPCSubtarget &Subtarget;
@@ -492,6 +508,7 @@ namespace llvm {
return TypeWidenVector;
return TargetLoweringBase::getPreferredVectorAction(VT);
}
+
bool useSoftFloat() const override;
MVT getScalarShiftAmountTy(const DataLayout &, EVT) const override {
@@ -514,6 +531,10 @@ namespace llvm {
return true;
}
+ bool convertSetCCLogicToBitwiseLogic(EVT VT) const override {
+ return VT.isScalarInteger();
+ }
+
bool supportSplitCSR(MachineFunction *MF) const override {
return
MF->getFunction()->getCallingConv() == CallingConv::CXX_FAST_TLS &&
@@ -587,6 +608,7 @@ namespace llvm {
void computeKnownBitsForTargetNode(const SDValue Op,
APInt &KnownZero,
APInt &KnownOne,
+ const APInt &DemandedElts,
const SelectionDAG &DAG,
unsigned Depth = 0) const override;
@@ -694,6 +716,10 @@ namespace llvm {
bool shouldConvertConstantLoadToIntImm(const APInt &Imm,
Type *Ty) const override;
+ bool convertSelectOfConstantsToMath() const override {
+ return true;
+ }
+
bool isOffsetFoldingLegal(const GlobalAddressSDNode *GA) const override;
bool getTgtMemIntrinsic(IntrinsicInfo &Info,
@@ -785,15 +811,13 @@ namespace llvm {
SDValue Chain;
SDValue ResChain;
MachinePointerInfo MPI;
- bool IsDereferenceable;
- bool IsInvariant;
- unsigned Alignment;
+ bool IsDereferenceable = false;
+ bool IsInvariant = false;
+ unsigned Alignment = 0;
AAMDNodes AAInfo;
- const MDNode *Ranges;
+ const MDNode *Ranges = nullptr;
- ReuseLoadInfo()
- : IsDereferenceable(false), IsInvariant(false), Alignment(0),
- Ranges(nullptr) {}
+ ReuseLoadInfo() = default;
MachineMemOperand::Flags MMOFlags() const {
MachineMemOperand::Flags F = MachineMemOperand::MONone;
@@ -906,15 +930,13 @@ namespace llvm {
const SDLoc &dl, SelectionDAG &DAG,
SmallVectorImpl<SDValue> &InVals) const override;
- SDValue
- LowerCall(TargetLowering::CallLoweringInfo &CLI,
- SmallVectorImpl<SDValue> &InVals) const override;
+ SDValue LowerCall(TargetLowering::CallLoweringInfo &CLI,
+ SmallVectorImpl<SDValue> &InVals) const override;
- bool
- CanLowerReturn(CallingConv::ID CallConv, MachineFunction &MF,
- bool isVarArg,
- const SmallVectorImpl<ISD::OutputArg> &Outs,
- LLVMContext &Context) const override;
+ bool CanLowerReturn(CallingConv::ID CallConv, MachineFunction &MF,
+ bool isVarArg,
+ const SmallVectorImpl<ISD::OutputArg> &Outs,
+ LLVMContext &Context) const override;
SDValue LowerReturn(SDValue Chain, CallingConv::ID CallConv, bool isVarArg,
const SmallVectorImpl<ISD::OutputArg> &Outs,
@@ -994,14 +1016,16 @@ namespace llvm {
CCAssignFn *useFastISelCCs(unsigned Flag) const;
SDValue
- combineElementTruncationToVectorTruncation(SDNode *N,
- DAGCombinerInfo &DCI) const;
+ combineElementTruncationToVectorTruncation(SDNode *N,
+ DAGCombinerInfo &DCI) const;
};
namespace PPC {
+
FastISel *createFastISel(FunctionLoweringInfo &FuncInfo,
const TargetLibraryInfo *LibInfo);
- }
+
+ } // end namespace PPC
bool CC_PPC32_SVR4_Custom_Dummy(unsigned &ValNo, MVT &ValVT, MVT &LocVT,
CCValAssign::LocInfo &LocInfo,
@@ -1026,6 +1050,7 @@ namespace llvm {
CCValAssign::LocInfo &LocInfo,
ISD::ArgFlagsTy &ArgFlags,
CCState &State);
-}
-#endif // LLVM_TARGET_POWERPC_PPC32ISELLOWERING_H
+} // end namespace llvm
+
+#endif // LLVM_TARGET_POWERPC_PPC32ISELLOWERING_H
diff --git a/lib/Target/PowerPC/PPCInstr64Bit.td b/lib/Target/PowerPC/PPCInstr64Bit.td
index fbec8787ef8d..997b96ca6ec8 100644
--- a/lib/Target/PowerPC/PPCInstr64Bit.td
+++ b/lib/Target/PowerPC/PPCInstr64Bit.td
@@ -253,11 +253,11 @@ def LDAT : X_RD5_RS5_IM5<31, 614, (outs g8rc:$rD), (ins g8rc:$rA, u5imm:$FC),
Requires<[IsISA3_0]>;
}
-let Defs = [CR0], mayStore = 1, hasSideEffects = 0 in
+let Defs = [CR0], mayStore = 1, mayLoad = 0, hasSideEffects = 0 in
def STDCX : XForm_1<31, 214, (outs), (ins g8rc:$rS, memrr:$dst),
"stdcx. $rS, $dst", IIC_LdStSTDCX, []>, isDOT;
-let mayStore = 1, hasSideEffects = 0 in
+let mayStore = 1, mayLoad = 0, hasSideEffects = 0 in
def STDAT : X_RD5_RS5_IM5<31, 742, (outs), (ins g8rc:$rS, g8rc:$rA, u5imm:$FC),
"stdat $rS, $rA, $FC", IIC_LdStStore>, isPPC64,
Requires<[IsISA3_0]>;
@@ -1082,7 +1082,7 @@ def STDBRX: XForm_8<31, 660, (outs), (ins g8rc:$rS, memrr:$dst),
}
// Stores with Update (pre-inc).
-let PPC970_Unit = 2, mayStore = 1 in {
+let PPC970_Unit = 2, mayStore = 1, mayLoad = 0 in {
let Interpretation64Bit = 1, isCodeGenOnly = 1 in {
def STBU8 : DForm_1<39, (outs ptr_rc_nor0:$ea_res), (ins g8rc:$rS, memri:$dst),
"stbu $rS, $dst", IIC_LdStStoreUpd, []>,
@@ -1232,6 +1232,10 @@ def : Pat<(srl i64:$rS, i32:$rB),
def : Pat<(shl i64:$rS, i32:$rB),
(SLD $rS, $rB)>;
+// SUBFIC
+def : Pat<(sub imm64SExt16:$imm, i64:$in),
+ (SUBFIC8 $in, imm:$imm)>;
+
// SHL/SRL
def : Pat<(shl i64:$in, (i32 imm:$imm)),
(RLDICR $in, imm:$imm, (SHL64 imm:$imm))>;
diff --git a/lib/Target/PowerPC/PPCInstrAltivec.td b/lib/Target/PowerPC/PPCInstrAltivec.td
index 5c022749ad64..c380766e9f5c 100644
--- a/lib/Target/PowerPC/PPCInstrAltivec.td
+++ b/lib/Target/PowerPC/PPCInstrAltivec.td
@@ -407,7 +407,7 @@ def MTVSCR : VXForm_5<1604, (outs), (ins vrrc:$vB),
"mtvscr $vB", IIC_LdStLoad,
[(int_ppc_altivec_mtvscr v4i32:$vB)]>;
-let PPC970_Unit = 2 in { // Loads.
+let PPC970_Unit = 2, mayLoad = 1, mayStore = 0 in { // Loads.
def LVEBX: XForm_1<31, 7, (outs vrrc:$vD), (ins memrr:$src),
"lvebx $vD, $src", IIC_LdStLoad,
[(set v16i8:$vD, (int_ppc_altivec_lvebx xoaddr:$src))]>;
@@ -434,7 +434,7 @@ def LVSR : XForm_1<31, 38, (outs vrrc:$vD), (ins memrr:$src),
[(set v16i8:$vD, (int_ppc_altivec_lvsr xoaddr:$src))]>,
PPC970_Unit_LSU;
-let PPC970_Unit = 2 in { // Stores.
+let PPC970_Unit = 2, mayStore = 1, mayLoad = 0 in { // Stores.
def STVEBX: XForm_8<31, 135, (outs), (ins vrrc:$rS, memrr:$dst),
"stvebx $rS, $dst", IIC_LdStStore,
[(int_ppc_altivec_stvebx v16i8:$rS, xoaddr:$dst)]>;
@@ -851,6 +851,10 @@ def V_SETALLONES : VXForm_3<908, (outs vrrc:$vD), (ins),
// Additional Altivec Patterns
//
+// Extended mnemonics
+def : InstAlias<"vmr $vD, $vA", (VOR vrrc:$vD, vrrc:$vA, vrrc:$vA)>;
+def : InstAlias<"vnot $vD, $vA", (VNOR vrrc:$vD, vrrc:$vA, vrrc:$vA)>;
+
// Loads.
def : Pat<(v4i32 (load xoaddr:$src)), (LVX xoaddr:$src)>;
diff --git a/lib/Target/PowerPC/PPCInstrInfo.cpp b/lib/Target/PowerPC/PPCInstrInfo.cpp
index 2e0b9355f82b..8e159f47ea2e 100644
--- a/lib/Target/PowerPC/PPCInstrInfo.cpp
+++ b/lib/Target/PowerPC/PPCInstrInfo.cpp
@@ -65,7 +65,9 @@ UseOldLatencyCalc("ppc-old-latency-calc", cl::Hidden,
void PPCInstrInfo::anchor() {}
PPCInstrInfo::PPCInstrInfo(PPCSubtarget &STI)
- : PPCGenInstrInfo(PPC::ADJCALLSTACKDOWN, PPC::ADJCALLSTACKUP),
+ : PPCGenInstrInfo(PPC::ADJCALLSTACKDOWN, PPC::ADJCALLSTACKUP,
+ /* CatchRetOpcode */ -1,
+ STI.isPPC64() ? PPC::BLR8 : PPC::BLR),
Subtarget(STI), RI(STI.getTargetMachine()) {}
/// CreateTargetHazardRecognizer - Return the hazard recognizer to use for
@@ -662,12 +664,14 @@ unsigned PPCInstrInfo::insertBranch(MachineBasicBlock &MBB,
(isPPC64 ? PPC::BDNZ8 : PPC::BDNZ) :
(isPPC64 ? PPC::BDZ8 : PPC::BDZ))).addMBB(TBB);
else if (Cond[0].getImm() == PPC::PRED_BIT_SET)
- BuildMI(&MBB, DL, get(PPC::BC)).addOperand(Cond[1]).addMBB(TBB);
+ BuildMI(&MBB, DL, get(PPC::BC)).add(Cond[1]).addMBB(TBB);
else if (Cond[0].getImm() == PPC::PRED_BIT_UNSET)
- BuildMI(&MBB, DL, get(PPC::BCn)).addOperand(Cond[1]).addMBB(TBB);
+ BuildMI(&MBB, DL, get(PPC::BCn)).add(Cond[1]).addMBB(TBB);
else // Conditional branch
BuildMI(&MBB, DL, get(PPC::BCC))
- .addImm(Cond[0].getImm()).addOperand(Cond[1]).addMBB(TBB);
+ .addImm(Cond[0].getImm())
+ .add(Cond[1])
+ .addMBB(TBB);
return 1;
}
@@ -677,12 +681,14 @@ unsigned PPCInstrInfo::insertBranch(MachineBasicBlock &MBB,
(isPPC64 ? PPC::BDNZ8 : PPC::BDNZ) :
(isPPC64 ? PPC::BDZ8 : PPC::BDZ))).addMBB(TBB);
else if (Cond[0].getImm() == PPC::PRED_BIT_SET)
- BuildMI(&MBB, DL, get(PPC::BC)).addOperand(Cond[1]).addMBB(TBB);
+ BuildMI(&MBB, DL, get(PPC::BC)).add(Cond[1]).addMBB(TBB);
else if (Cond[0].getImm() == PPC::PRED_BIT_UNSET)
- BuildMI(&MBB, DL, get(PPC::BCn)).addOperand(Cond[1]).addMBB(TBB);
+ BuildMI(&MBB, DL, get(PPC::BCn)).add(Cond[1]).addMBB(TBB);
else
BuildMI(&MBB, DL, get(PPC::BCC))
- .addImm(Cond[0].getImm()).addOperand(Cond[1]).addMBB(TBB);
+ .addImm(Cond[0].getImm())
+ .add(Cond[1])
+ .addMBB(TBB);
BuildMI(&MBB, DL, get(PPC::B)).addMBB(FBB);
return 2;
}
@@ -692,9 +698,6 @@ bool PPCInstrInfo::canInsertSelect(const MachineBasicBlock &MBB,
ArrayRef<MachineOperand> Cond,
unsigned TrueReg, unsigned FalseReg,
int &CondCycles, int &TrueCycles, int &FalseCycles) const {
- if (!Subtarget.hasISEL())
- return false;
-
if (Cond.size() != 2)
return false;
@@ -736,9 +739,6 @@ void PPCInstrInfo::insertSelect(MachineBasicBlock &MBB,
assert(Cond.size() == 2 &&
"PPC branch conditions have two components!");
- assert(Subtarget.hasISEL() &&
- "Cannot insert select on target without ISEL support");
-
// Get the register classes.
MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
const TargetRegisterClass *RC =
@@ -1493,7 +1493,7 @@ bool PPCInstrInfo::DefinesPredicate(MachineInstr &MI,
return Found;
}
-bool PPCInstrInfo::isPredicable(MachineInstr &MI) const {
+bool PPCInstrInfo::isPredicable(const MachineInstr &MI) const {
unsigned OpC = MI.getOpcode();
switch (OpC) {
default:
@@ -1836,8 +1836,7 @@ unsigned PPCInstrInfo::getInstSizeInBytes(const MachineInstr &MI) const {
PatchPointOpers Opers(&MI);
return Opers.getNumPatchBytes();
} else {
- const MCInstrDesc &Desc = get(Opcode);
- return Desc.getSize();
+ return get(Opcode).getSize();
}
}
diff --git a/lib/Target/PowerPC/PPCInstrInfo.h b/lib/Target/PowerPC/PPCInstrInfo.h
index 32b2f009a3f5..f11aed8fa268 100644
--- a/lib/Target/PowerPC/PPCInstrInfo.h
+++ b/lib/Target/PowerPC/PPCInstrInfo.h
@@ -253,7 +253,7 @@ public:
bool DefinesPredicate(MachineInstr &MI,
std::vector<MachineOperand> &Pred) const override;
- bool isPredicable(MachineInstr &MI) const override;
+ bool isPredicable(const MachineInstr &MI) const override;
// Comparison optimization.
diff --git a/lib/Target/PowerPC/PPCInstrInfo.td b/lib/Target/PowerPC/PPCInstrInfo.td
index f615cc7cc974..f004ce49cac0 100644
--- a/lib/Target/PowerPC/PPCInstrInfo.td
+++ b/lib/Target/PowerPC/PPCInstrInfo.td
@@ -114,9 +114,9 @@ def PPCfctiwuz: SDNode<"PPCISD::FCTIWUZ",SDTFPUnaryOp, []>;
def PPCstfiwx : SDNode<"PPCISD::STFIWX", SDT_PPCstfiwx,
[SDNPHasChain, SDNPMayStore]>;
def PPClfiwax : SDNode<"PPCISD::LFIWAX", SDT_PPClfiwx,
- [SDNPHasChain, SDNPMayLoad]>;
+ [SDNPHasChain, SDNPMayLoad, SDNPMemOperand]>;
def PPClfiwzx : SDNode<"PPCISD::LFIWZX", SDT_PPClfiwx,
- [SDNPHasChain, SDNPMayLoad]>;
+ [SDNPHasChain, SDNPMayLoad, SDNPMemOperand]>;
def PPClxsizx : SDNode<"PPCISD::LXSIZX", SDT_PPCLxsizx,
[SDNPHasChain, SDNPMayLoad]>;
def PPCstxsix : SDNode<"PPCISD::STXSIX", SDT_PPCstxsix,
@@ -243,7 +243,7 @@ def PPCcondbranch : SDNode<"PPCISD::COND_BRANCH", SDT_PPCcondbr,
[SDNPHasChain, SDNPOptInGlue]>;
def PPClbrx : SDNode<"PPCISD::LBRX", SDT_PPClbrx,
- [SDNPHasChain, SDNPMayLoad]>;
+ [SDNPHasChain, SDNPMayLoad, SDNPMemOperand]>;
def PPCstbrx : SDNode<"PPCISD::STBRX", SDT_PPCstbrx,
[SDNPHasChain, SDNPMayStore]>;
@@ -770,9 +770,10 @@ def spe2dis : Operand<iPTR> { // SPE displacement where the imm is 2-aligned.
}
// A single-register address. This is used with the SjLj
-// pseudo-instructions.
+// pseudo-instructions, which translate to LD/LWZ. These instructions require
+// G8RC_NOX0 registers.
def memr : Operand<iPTR> {
- let MIOperandInfo = (ops ptr_rc:$ptrreg);
+ let MIOperandInfo = (ops ptr_rc_nor0:$ptrreg);
}
def PPCTLSRegOperand : AsmOperandClass {
let Name = "TLSReg"; let PredicateMethod = "isTLSReg";
@@ -1648,7 +1649,7 @@ let usesCustomInserter = 1 in {
}
// Instructions to support atomic operations
-let mayLoad = 1, hasSideEffects = 0 in {
+let mayLoad = 1, mayStore = 0, hasSideEffects = 0 in {
def LBARX : XForm_1<31, 52, (outs gprc:$rD), (ins memrr:$src),
"lbarx $rD, $src", IIC_LdStLWARX, []>,
Requires<[HasPartwordAtomics]>;
@@ -1681,7 +1682,7 @@ def LWAT : X_RD5_RS5_IM5<31, 582, (outs gprc:$rD), (ins gprc:$rA, u5imm:$FC),
Requires<[IsISA3_0]>;
}
-let Defs = [CR0], mayStore = 1, hasSideEffects = 0 in {
+let Defs = [CR0], mayStore = 1, mayLoad = 0, hasSideEffects = 0 in {
def STBCX : XForm_1<31, 694, (outs), (ins gprc:$rS, memrr:$dst),
"stbcx. $rS, $dst", IIC_LdStSTWCX, []>,
isDOT, Requires<[HasPartwordAtomics]>;
@@ -1694,7 +1695,7 @@ def STWCX : XForm_1<31, 150, (outs), (ins gprc:$rS, memrr:$dst),
"stwcx. $rS, $dst", IIC_LdStSTWCX, []>, isDOT;
}
-let mayStore = 1, hasSideEffects = 0 in
+let mayStore = 1, mayLoad = 0, hasSideEffects = 0 in
def STWAT : X_RD5_RS5_IM5<31, 710, (outs), (ins gprc:$rS, gprc:$rA, u5imm:$FC),
"stwat $rS, $rA, $FC", IIC_LdStStore>,
Requires<[IsISA3_0]>;
@@ -1740,7 +1741,7 @@ def LFD : DForm_1<50, (outs f8rc:$rD), (ins memri:$src),
// Unindexed (r+i) Loads with Update (preinc).
-let mayLoad = 1, hasSideEffects = 0 in {
+let mayLoad = 1, mayStore = 0, hasSideEffects = 0 in {
def LBZU : DForm_1<35, (outs gprc:$rD, ptr_rc_nor0:$ea_result), (ins memri:$addr),
"lbzu $rD, $addr", IIC_LdStLoadUpd,
[]>, RegConstraint<"$addr.reg = $ea_result">,
@@ -1813,7 +1814,7 @@ def LFDUX : XForm_1<31, 631, (outs f8rc:$rD, ptr_rc_nor0:$ea_result),
// Indexed (r+r) Loads.
//
-let PPC970_Unit = 2 in {
+let PPC970_Unit = 2, mayLoad = 1, mayStore = 0 in {
def LBZX : XForm_1<31, 87, (outs gprc:$rD), (ins memrr:$src),
"lbzx $rD, $src", IIC_LdStLoad,
[(set i32:$rD, (zextloadi8 xaddr:$src))]>;
@@ -1827,8 +1828,6 @@ def LHZX : XForm_1<31, 279, (outs gprc:$rD), (ins memrr:$src),
def LWZX : XForm_1<31, 23, (outs gprc:$rD), (ins memrr:$src),
"lwzx $rD, $src", IIC_LdStLoad,
[(set i32:$rD, (load xaddr:$src))]>;
-
-
def LHBRX : XForm_1<31, 790, (outs gprc:$rD), (ins memrr:$src),
"lhbrx $rD, $src", IIC_LdStLoad,
[(set i32:$rD, (PPClbrx xoaddr:$src, i16))]>;
@@ -1860,7 +1859,7 @@ def LMW : DForm_1<46, (outs gprc:$rD), (ins memri:$src),
//
// Unindexed (r+i) Stores.
-let PPC970_Unit = 2 in {
+let PPC970_Unit = 2, mayStore = 1, mayLoad = 0 in {
def STB : DForm_1<38, (outs), (ins gprc:$rS, memri:$src),
"stb $rS, $src", IIC_LdStStore,
[(truncstorei8 i32:$rS, iaddr:$src)]>;
@@ -1879,7 +1878,7 @@ def STFD : DForm_1<54, (outs), (ins f8rc:$rS, memri:$dst),
}
// Unindexed (r+i) Stores with Update (preinc).
-let PPC970_Unit = 2, mayStore = 1 in {
+let PPC970_Unit = 2, mayStore = 1, mayLoad = 0 in {
def STBU : DForm_1<39, (outs ptr_rc_nor0:$ea_res), (ins gprc:$rS, memri:$dst),
"stbu $rS, $dst", IIC_LdStStoreUpd, []>,
RegConstraint<"$dst.reg = $ea_res">, NoEncode<"$ea_res">;
@@ -1948,7 +1947,7 @@ def STFDX : XForm_28<31, 727, (outs), (ins f8rc:$frS, memrr:$dst),
}
// Indexed (r+r) Stores with Update (preinc).
-let PPC970_Unit = 2, mayStore = 1 in {
+let PPC970_Unit = 2, mayStore = 1, mayLoad = 0 in {
def STBUX : XForm_8<31, 247, (outs ptr_rc_nor0:$ea_res), (ins gprc:$rS, memrr:$dst),
"stbux $rS, $dst", IIC_LdStStoreUpd, []>,
RegConstraint<"$dst.ptrreg = $ea_res">, NoEncode<"$ea_res">,
diff --git a/lib/Target/PowerPC/PPCInstrVSX.td b/lib/Target/PowerPC/PPCInstrVSX.td
index 0d9e3459f47e..13603732397a 100644
--- a/lib/Target/PowerPC/PPCInstrVSX.td
+++ b/lib/Target/PowerPC/PPCInstrVSX.td
@@ -62,7 +62,7 @@ def SDTVecConv : SDTypeProfile<1, 2, [
]>;
def PPClxvd2x : SDNode<"PPCISD::LXVD2X", SDT_PPClxvd2x,
- [SDNPHasChain, SDNPMayLoad]>;
+ [SDNPHasChain, SDNPMayLoad, SDNPMemOperand]>;
def PPCstxvd2x : SDNode<"PPCISD::STXVD2X", SDT_PPCstxvd2x,
[SDNPHasChain, SDNPMayStore]>;
def PPCxxswapd : SDNode<"PPCISD::XXSWAPD", SDT_PPCxxswapd, [SDNPHasChain]>;
@@ -117,7 +117,7 @@ let hasSideEffects = 0 in { // VSX instructions don't have side effects.
let Uses = [RM] in {
// Load indexed instructions
- let mayLoad = 1 in {
+ let mayLoad = 1, mayStore = 0 in {
let CodeSize = 3 in
def LXSDX : XX1Form<31, 588,
(outs vsfrc:$XT), (ins memrr:$src),
@@ -142,7 +142,7 @@ let Uses = [RM] in {
} // mayLoad
// Store indexed instructions
- let mayStore = 1 in {
+ let mayStore = 1, mayLoad = 0 in {
let CodeSize = 3 in
def STXSDX : XX1Form<31, 716,
(outs), (ins vsfrc:$XT, memrr:$dst),
@@ -1197,7 +1197,7 @@ let AddedComplexity = 400 in { // Prefer VSX patterns over non-VSX patterns.
[(set v4i32:$XT, (or v4i32:$XA, (vnot_ppc v4i32:$XB)))]>;
// VSX scalar loads introduced in ISA 2.07
- let mayLoad = 1 in {
+ let mayLoad = 1, mayStore = 0 in {
let CodeSize = 3 in
def LXSSPX : XX1Form<31, 524, (outs vssrc:$XT), (ins memrr:$src),
"lxsspx $XT, $src", IIC_LdStLFD,
@@ -1211,7 +1211,7 @@ let AddedComplexity = 400 in { // Prefer VSX patterns over non-VSX patterns.
} // mayLoad
// VSX scalar stores introduced in ISA 2.07
- let mayStore = 1 in {
+ let mayStore = 1, mayLoad = 0 in {
let CodeSize = 3 in
def STXSSPX : XX1Form<31, 652, (outs), (ins vssrc:$XT, memrr:$dst),
"stxsspx $XT, $dst", IIC_LdStSTFD,
@@ -1410,6 +1410,11 @@ let Predicates = [HasDirectMove] in {
"mfvsrd $rA, $XT", IIC_VecGeneral,
[(set i64:$rA, (PPCmfvsr f64:$XT))]>,
Requires<[In64BitMode]>;
+ let isCodeGenOnly = 1 in
+ def MFVRD : XX1_RS6_RD5_XO<31, 51, (outs g8rc:$rA), (ins vrrc:$XT),
+ "mfvsrd $rA, $XT", IIC_VecGeneral,
+ []>,
+ Requires<[In64BitMode]>;
def MFVSRWZ : XX1_RS6_RD5_XO<31, 115, (outs gprc:$rA), (ins vsfrc:$XT),
"mfvsrwz $rA, $XT", IIC_VecGeneral,
[(set i32:$rA, (PPCmfvsr f64:$XT))]>;
@@ -1440,6 +1445,13 @@ let Predicates = [IsISA3_0, HasDirectMove] in {
} // IsISA3_0, HasDirectMove
} // UseVSXReg = 1
+// We want to parse this from asm, but we don't want to emit this as it would
+// be emitted with a VSX reg. So leave Emit = 0 here.
+def : InstAlias<"mfvrd $rA, $XT",
+ (MFVRD g8rc:$rA, vrrc:$XT), 0>;
+def : InstAlias<"mffprd $rA, $src",
+ (MFVSRD g8rc:$rA, f8rc:$src)>;
+
/* Direct moves of various widths from GPR's into VSR's. Each move lines
the value up into element 0 (both BE and LE). Namely, entities smaller than
a doubleword are shifted left and moved for BE. For LE, they're moved, then
@@ -2186,7 +2198,7 @@ let AddedComplexity = 400, Predicates = [HasP9Vector] in {
} // UseVSXReg = 1
// Pattern for matching Vector HP -> Vector SP intrinsic. Defined as a
- // seperate pattern so that it can convert the input register class from
+ // separate pattern so that it can convert the input register class from
// VRRC(v8i16) to VSRC.
def : Pat<(v4f32 (int_ppc_vsx_xvcvhpsp v8i16:$A)),
(v4f32 (XVCVHPSP (COPY_TO_REGCLASS $A, VSRC)))>;
@@ -2335,7 +2347,7 @@ let AddedComplexity = 400, Predicates = [HasP9Vector] in {
// When adding new D-Form loads/stores, be sure to update the ImmToIdxMap in
// PPCRegisterInfo::PPCRegisterInfo and maybe save yourself some debugging.
- let mayLoad = 1 in {
+ let mayLoad = 1, mayStore = 0 in {
// Load Vector
def LXV : DQ_RD6_RS5_DQ12<61, 1, (outs vsrc:$XT), (ins memrix16:$src),
"lxv $XT, $src", IIC_LdStLFD, []>, UseVSXReg;
@@ -2383,7 +2395,7 @@ let AddedComplexity = 400, Predicates = [HasP9Vector] in {
// When adding new D-Form loads/stores, be sure to update the ImmToIdxMap in
// PPCRegisterInfo::PPCRegisterInfo and maybe save yourself some debugging.
- let mayStore = 1 in {
+ let mayStore = 1, mayLoad = 0 in {
// Store Vector
def STXV : DQ_RD6_RS5_DQ12<61, 5, (outs), (ins vsrc:$XT, memrix16:$dst),
"stxv $XT, $dst", IIC_LdStSTFD, []>, UseVSXReg;
diff --git a/lib/Target/PowerPC/PPCLoopPreIncPrep.cpp b/lib/Target/PowerPC/PPCLoopPreIncPrep.cpp
index 2c3e75523e8f..a349fa1b4090 100644
--- a/lib/Target/PowerPC/PPCLoopPreIncPrep.cpp
+++ b/lib/Target/PowerPC/PPCLoopPreIncPrep.cpp
@@ -39,6 +39,7 @@
#include "llvm/IR/Instructions.h"
#include "llvm/IR/IntrinsicInst.h"
#include "llvm/IR/Module.h"
+#include "llvm/IR/Type.h"
#include "llvm/IR/Value.h"
#include "llvm/Pass.h"
#include "llvm/Support/Casting.h"
@@ -72,9 +73,10 @@ namespace {
public:
static char ID; // Pass ID, replacement for typeid
- PPCLoopPreIncPrep() : FunctionPass(ID), TM(nullptr) {
+ PPCLoopPreIncPrep() : FunctionPass(ID) {
initializePPCLoopPreIncPrepPass(*PassRegistry::getPassRegistry());
}
+
PPCLoopPreIncPrep(PPCTargetMachine &TM) : FunctionPass(ID), TM(&TM) {
initializePPCLoopPreIncPrepPass(*PassRegistry::getPassRegistry());
}
@@ -93,7 +95,7 @@ namespace {
bool rotateLoop(Loop *L);
private:
- PPCTargetMachine *TM;
+ PPCTargetMachine *TM = nullptr;
DominatorTree *DT;
LoopInfo *LI;
ScalarEvolution *SE;
diff --git a/lib/Target/PowerPC/PPCMCInstLower.cpp b/lib/Target/PowerPC/PPCMCInstLower.cpp
index e527b018d4fb..541b98e01b99 100644
--- a/lib/Target/PowerPC/PPCMCInstLower.cpp
+++ b/lib/Target/PowerPC/PPCMCInstLower.cpp
@@ -148,7 +148,7 @@ void llvm::LowerPPCMachineInstrToMCInst(const MachineInstr *MI, MCInst &OutMI,
MCOperand MCOp;
switch (MO.getType()) {
default:
- MI->dump();
+ MI->print(errs());
llvm_unreachable("unknown operand type");
case MachineOperand::MO_Register:
assert(!MO.getSubReg() && "Subregs should be eliminated!");
diff --git a/lib/Target/PowerPC/PPCMIPeephole.cpp b/lib/Target/PowerPC/PPCMIPeephole.cpp
index 2413af3f7042..c6d2c3ebcc0f 100644
--- a/lib/Target/PowerPC/PPCMIPeephole.cpp
+++ b/lib/Target/PowerPC/PPCMIPeephole.cpp
@@ -147,9 +147,9 @@ bool PPCMIPeephole::simplifyCode(void) {
<< "Optimizing load-and-splat/splat "
"to load-and-splat/copy: ");
DEBUG(MI.dump());
- BuildMI(MBB, &MI, MI.getDebugLoc(),
- TII->get(PPC::COPY), MI.getOperand(0).getReg())
- .addOperand(MI.getOperand(1));
+ BuildMI(MBB, &MI, MI.getDebugLoc(), TII->get(PPC::COPY),
+ MI.getOperand(0).getReg())
+ .add(MI.getOperand(1));
ToErase = &MI;
Simplified = true;
}
@@ -169,9 +169,9 @@ bool PPCMIPeephole::simplifyCode(void) {
<< "Optimizing splat/swap or splat/splat "
"to splat/copy: ");
DEBUG(MI.dump());
- BuildMI(MBB, &MI, MI.getDebugLoc(),
- TII->get(PPC::COPY), MI.getOperand(0).getReg())
- .addOperand(MI.getOperand(1));
+ BuildMI(MBB, &MI, MI.getDebugLoc(), TII->get(PPC::COPY),
+ MI.getOperand(0).getReg())
+ .add(MI.getOperand(1));
ToErase = &MI;
Simplified = true;
}
@@ -194,9 +194,9 @@ bool PPCMIPeephole::simplifyCode(void) {
else if (Immed == 2 && FeedImmed == 2 && FeedReg1 == FeedReg2) {
DEBUG(dbgs() << "Optimizing swap/swap => copy: ");
DEBUG(MI.dump());
- BuildMI(MBB, &MI, MI.getDebugLoc(),
- TII->get(PPC::COPY), MI.getOperand(0).getReg())
- .addOperand(DefMI->getOperand(1));
+ BuildMI(MBB, &MI, MI.getDebugLoc(), TII->get(PPC::COPY),
+ MI.getOperand(0).getReg())
+ .add(DefMI->getOperand(1));
ToErase = &MI;
Simplified = true;
}
@@ -251,7 +251,7 @@ bool PPCMIPeephole::simplifyCode(void) {
DEBUG(MI.dump());
BuildMI(MBB, &MI, MI.getDebugLoc(), TII->get(PPC::COPY),
MI.getOperand(0).getReg())
- .addOperand(MI.getOperand(OpNo));
+ .add(MI.getOperand(OpNo));
ToErase = &MI;
Simplified = true;
}
diff --git a/lib/Target/PowerPC/PPCMachineFunctionInfo.cpp b/lib/Target/PowerPC/PPCMachineFunctionInfo.cpp
index 9d91e31165de..bc2d9a08b5e8 100644
--- a/lib/Target/PowerPC/PPCMachineFunctionInfo.cpp
+++ b/lib/Target/PowerPC/PPCMachineFunctionInfo.cpp
@@ -8,14 +8,13 @@
//===----------------------------------------------------------------------===//
#include "PPCMachineFunctionInfo.h"
+#include "llvm/ADT/Twine.h"
#include "llvm/IR/DataLayout.h"
#include "llvm/MC/MCContext.h"
-#include "llvm/Target/TargetMachine.h"
-#include "llvm/Target/TargetSubtargetInfo.h"
using namespace llvm;
-void PPCFunctionInfo::anchor() { }
+void PPCFunctionInfo::anchor() {}
MCSymbol *PPCFunctionInfo::getPICOffsetSymbol() const {
const DataLayout &DL = MF.getDataLayout();
diff --git a/lib/Target/PowerPC/PPCMachineFunctionInfo.h b/lib/Target/PowerPC/PPCMachineFunctionInfo.h
index 4c29aa06f048..202e10058b73 100644
--- a/lib/Target/PowerPC/PPCMachineFunctionInfo.h
+++ b/lib/Target/PowerPC/PPCMachineFunctionInfo.h
@@ -14,6 +14,7 @@
#ifndef LLVM_LIB_TARGET_POWERPC_PPCMACHINEFUNCTIONINFO_H
#define LLVM_LIB_TARGET_POWERPC_PPCMACHINEFUNCTIONINFO_H
+#include "llvm/ADT/SmallVector.h"
#include "llvm/CodeGen/MachineFunction.h"
namespace llvm {
@@ -26,17 +27,17 @@ class PPCFunctionInfo : public MachineFunctionInfo {
/// FramePointerSaveIndex - Frame index of where the old frame pointer is
/// stored. Also used as an anchor for instructions that need to be altered
/// when using frame pointers (dyna_add, dyna_sub.)
- int FramePointerSaveIndex;
+ int FramePointerSaveIndex = 0;
/// ReturnAddrSaveIndex - Frame index of where the return address is stored.
///
- int ReturnAddrSaveIndex;
+ int ReturnAddrSaveIndex = 0;
/// Frame index where the old base pointer is stored.
- int BasePointerSaveIndex;
+ int BasePointerSaveIndex = 0;
/// Frame index where the old PIC base pointer is stored.
- int PICBasePointerSaveIndex;
+ int PICBasePointerSaveIndex = 0;
/// MustSaveLR - Indicates whether LR is defined (or clobbered) in the current
/// function. This is only valid after the initial scan of the function by
@@ -44,54 +45,58 @@ class PPCFunctionInfo : public MachineFunctionInfo {
bool MustSaveLR;
/// Does this function have any stack spills.
- bool HasSpills;
+ bool HasSpills = false;
/// Does this function spill using instructions with only r+r (not r+i)
/// forms.
- bool HasNonRISpills;
+ bool HasNonRISpills = false;
/// SpillsCR - Indicates whether CR is spilled in the current function.
- bool SpillsCR;
+ bool SpillsCR = false;
/// Indicates whether VRSAVE is spilled in the current function.
- bool SpillsVRSAVE;
+ bool SpillsVRSAVE = false;
/// LRStoreRequired - The bool indicates whether there is some explicit use of
/// the LR/LR8 stack slot that is not obvious from scanning the code. This
/// requires that the code generator produce a store of LR to the stack on
/// entry, even though LR may otherwise apparently not be used.
- bool LRStoreRequired;
+ bool LRStoreRequired = false;
/// This function makes use of the PPC64 ELF TOC base pointer (register r2).
- bool UsesTOCBasePtr;
+ bool UsesTOCBasePtr = false;
/// MinReservedArea - This is the frame size that is at least reserved in a
/// potential caller (parameter+linkage area).
- unsigned MinReservedArea;
+ unsigned MinReservedArea = 0;
/// TailCallSPDelta - Stack pointer delta used when tail calling. Maximum
/// amount the stack pointer is adjusted to make the frame bigger for tail
/// calls. Used for creating an area before the register spill area.
- int TailCallSPDelta;
+ int TailCallSPDelta = 0;
/// HasFastCall - Does this function contain a fast call. Used to determine
/// how the caller's stack pointer should be calculated (epilog/dynamicalloc).
- bool HasFastCall;
+ bool HasFastCall = false;
/// VarArgsFrameIndex - FrameIndex for start of varargs area.
- int VarArgsFrameIndex;
+ int VarArgsFrameIndex = 0;
+
/// VarArgsStackOffset - StackOffset for start of stack
/// arguments.
- int VarArgsStackOffset;
+
+ int VarArgsStackOffset = 0;
+
/// VarArgsNumGPR - Index of the first unused integer
/// register for parameter passing.
- unsigned VarArgsNumGPR;
+ unsigned VarArgsNumGPR = 0;
+
/// VarArgsNumFPR - Index of the first unused double
/// register for parameter passing.
- unsigned VarArgsNumFPR;
+ unsigned VarArgsNumFPR = 0;
/// CRSpillFrameIndex - FrameIndex for CR spill slot for 32-bit SVR4.
- int CRSpillFrameIndex;
+ int CRSpillFrameIndex = 0;
/// If any of CR[2-4] need to be saved in the prologue and restored in the
/// epilogue then they are added to this array. This is used for the
@@ -102,35 +107,14 @@ class PPCFunctionInfo : public MachineFunctionInfo {
MachineFunction &MF;
/// Whether this uses the PIC Base register or not.
- bool UsesPICBase;
+ bool UsesPICBase = false;
/// True if this function has a subset of CSRs that is handled explicitly via
/// copies
- bool IsSplitCSR;
+ bool IsSplitCSR = false;
public:
- explicit PPCFunctionInfo(MachineFunction &MF)
- : FramePointerSaveIndex(0),
- ReturnAddrSaveIndex(0),
- BasePointerSaveIndex(0),
- PICBasePointerSaveIndex(0),
- HasSpills(false),
- HasNonRISpills(false),
- SpillsCR(false),
- SpillsVRSAVE(false),
- LRStoreRequired(false),
- UsesTOCBasePtr(false),
- MinReservedArea(0),
- TailCallSPDelta(0),
- HasFastCall(false),
- VarArgsFrameIndex(0),
- VarArgsStackOffset(0),
- VarArgsNumGPR(0),
- VarArgsNumFPR(0),
- CRSpillFrameIndex(0),
- MF(MF),
- UsesPICBase(0),
- IsSplitCSR(false) {}
+ explicit PPCFunctionInfo(MachineFunction &MF) : MF(MF) {}
int getFramePointerSaveIndex() const { return FramePointerSaveIndex; }
void setFramePointerSaveIndex(int Idx) { FramePointerSaveIndex = Idx; }
@@ -211,7 +195,6 @@ public:
MCSymbol *getTOCOffsetSymbol() const;
};
-} // end of namespace llvm
-
+} // end namespace llvm
-#endif
+#endif // LLVM_LIB_TARGET_POWERPC_PPCMACHINEFUNCTIONINFO_H
diff --git a/lib/Target/PowerPC/PPCRegisterInfo.cpp b/lib/Target/PowerPC/PPCRegisterInfo.cpp
index e49201402861..aad913924692 100644
--- a/lib/Target/PowerPC/PPCRegisterInfo.cpp
+++ b/lib/Target/PowerPC/PPCRegisterInfo.cpp
@@ -209,86 +209,67 @@ BitVector PPCRegisterInfo::getReservedRegs(const MachineFunction &MF) const {
// The ZERO register is not really a register, but the representation of r0
// when used in instructions that treat r0 as the constant 0.
- Reserved.set(PPC::ZERO);
- Reserved.set(PPC::ZERO8);
+ markSuperRegs(Reserved, PPC::ZERO);
// The FP register is also not really a register, but is the representation
// of the frame pointer register used by ISD::FRAMEADDR.
- Reserved.set(PPC::FP);
- Reserved.set(PPC::FP8);
+ markSuperRegs(Reserved, PPC::FP);
// The BP register is also not really a register, but is the representation
// of the base pointer register used by setjmp.
- Reserved.set(PPC::BP);
- Reserved.set(PPC::BP8);
+ markSuperRegs(Reserved, PPC::BP);
// The counter registers must be reserved so that counter-based loops can
// be correctly formed (and the mtctr instructions are not DCE'd).
- Reserved.set(PPC::CTR);
- Reserved.set(PPC::CTR8);
+ markSuperRegs(Reserved, PPC::CTR);
+ markSuperRegs(Reserved, PPC::CTR8);
- Reserved.set(PPC::R1);
- Reserved.set(PPC::LR);
- Reserved.set(PPC::LR8);
- Reserved.set(PPC::RM);
+ markSuperRegs(Reserved, PPC::R1);
+ markSuperRegs(Reserved, PPC::LR);
+ markSuperRegs(Reserved, PPC::LR8);
+ markSuperRegs(Reserved, PPC::RM);
if (!Subtarget.isDarwinABI() || !Subtarget.hasAltivec())
- Reserved.set(PPC::VRSAVE);
+ markSuperRegs(Reserved, PPC::VRSAVE);
// The SVR4 ABI reserves r2 and r13
if (Subtarget.isSVR4ABI()) {
- Reserved.set(PPC::R2); // System-reserved register
- Reserved.set(PPC::R13); // Small Data Area pointer register
+ // We only reserve r2 if we need to use the TOC pointer. If we have no
+ // explicit uses of the TOC pointer (meaning we're a leaf function with
+ // no constant-pool loads, etc.) and we have no potential uses inside an
+    // inline asm block, then we can treat r2 as an ordinary callee-saved
+ // register.
+ const PPCFunctionInfo *FuncInfo = MF.getInfo<PPCFunctionInfo>();
+ if (!TM.isPPC64() || FuncInfo->usesTOCBasePtr() || MF.hasInlineAsm())
+ markSuperRegs(Reserved, PPC::R2); // System-reserved register
+ markSuperRegs(Reserved, PPC::R13); // Small Data Area pointer register
}
// On PPC64, r13 is the thread pointer. Never allocate this register.
- if (TM.isPPC64()) {
- Reserved.set(PPC::R13);
-
- Reserved.set(PPC::X1);
- Reserved.set(PPC::X13);
-
- if (TFI->needsFP(MF))
- Reserved.set(PPC::X31);
-
- if (hasBasePointer(MF))
- Reserved.set(PPC::X30);
-
- // The 64-bit SVR4 ABI reserves r2 for the TOC pointer.
- if (Subtarget.isSVR4ABI()) {
- // We only reserve r2 if we need to use the TOC pointer. If we have no
- // explicit uses of the TOC pointer (meaning we're a leaf function with
- // no constant-pool loads, etc.) and we have no potential uses inside an
- // inline asm block, then we can treat r2 has an ordinary callee-saved
- // register.
- const PPCFunctionInfo *FuncInfo = MF.getInfo<PPCFunctionInfo>();
- if (FuncInfo->usesTOCBasePtr() || MF.hasInlineAsm())
- Reserved.set(PPC::X2);
- else
- Reserved.reset(PPC::R2);
- }
- }
+ if (TM.isPPC64())
+ markSuperRegs(Reserved, PPC::R13);
if (TFI->needsFP(MF))
- Reserved.set(PPC::R31);
+ markSuperRegs(Reserved, PPC::R31);
bool IsPositionIndependent = TM.isPositionIndependent();
if (hasBasePointer(MF)) {
if (Subtarget.isSVR4ABI() && !TM.isPPC64() && IsPositionIndependent)
- Reserved.set(PPC::R29);
+ markSuperRegs(Reserved, PPC::R29);
else
- Reserved.set(PPC::R30);
+ markSuperRegs(Reserved, PPC::R30);
}
if (Subtarget.isSVR4ABI() && !TM.isPPC64() && IsPositionIndependent)
- Reserved.set(PPC::R30);
+ markSuperRegs(Reserved, PPC::R30);
// Reserve Altivec registers when Altivec is unavailable.
if (!Subtarget.hasAltivec())
for (TargetRegisterClass::iterator I = PPC::VRRCRegClass.begin(),
IE = PPC::VRRCRegClass.end(); I != IE; ++I)
- Reserved.set(*I);
+ markSuperRegs(Reserved, *I);
+ assert(checkAllSuperRegsMarked(Reserved));
return Reserved;
}
diff --git a/lib/Target/PowerPC/PPCScheduleP8.td b/lib/Target/PowerPC/PPCScheduleP8.td
index 8e52da583a0d..79963dd6a3e9 100644
--- a/lib/Target/PowerPC/PPCScheduleP8.td
+++ b/lib/Target/PowerPC/PPCScheduleP8.td
@@ -377,7 +377,7 @@ def P8Itineraries : ProcessorItineraries<
InstrStage<1, [P8_FPU1, P8_FPU2]>],
[7, 1, 1]>,
InstrItinData<IIC_VecPerm , [InstrStage<1, [P8_DU1, P8_DU2], 0>,
- InstrStage<1, [P8_FPU2, P8_FPU2]>],
+ InstrStage<1, [P8_FPU1, P8_FPU2]>],
[3, 1, 1]>
]>;
diff --git a/lib/Target/PowerPC/PPCSubtarget.cpp b/lib/Target/PowerPC/PPCSubtarget.cpp
index e8a87e7f4437..ccf0f80c336b 100644
--- a/lib/Target/PowerPC/PPCSubtarget.cpp
+++ b/lib/Target/PowerPC/PPCSubtarget.cpp
@@ -220,8 +220,8 @@ bool PPCSubtarget::enableSubRegLiveness() const {
return UseSubRegLiveness;
}
-unsigned char PPCSubtarget::classifyGlobalReference(
- const GlobalValue *GV) const {
+unsigned char
+PPCSubtarget::classifyGlobalReference(const GlobalValue *GV) const {
// Note that currently we don't generate non-pic references.
// If a caller wants that, this will have to be updated.
@@ -229,23 +229,9 @@ unsigned char PPCSubtarget::classifyGlobalReference(
if (TM.getCodeModel() == CodeModel::Large)
return PPCII::MO_PIC_FLAG | PPCII::MO_NLP_FLAG;
- unsigned char flags = PPCII::MO_PIC_FLAG;
-
- // Only if the relocation mode is PIC do we have to worry about
- // interposition. In all other cases we can use a slightly looser standard to
- // decide how to access the symbol.
- if (TM.getRelocationModel() == Reloc::PIC_) {
- // If it's local, or it's non-default, it can't be interposed.
- if (!GV->hasLocalLinkage() &&
- GV->hasDefaultVisibility()) {
- flags |= PPCII::MO_NLP_FLAG;
- }
- return flags;
- }
-
- if (GV->isStrongDefinitionForLinker())
- return flags;
- return flags | PPCII::MO_NLP_FLAG;
+ if (TM.shouldAssumeDSOLocal(*GV->getParent(), GV))
+ return PPCII::MO_PIC_FLAG;
+ return PPCII::MO_PIC_FLAG | PPCII::MO_NLP_FLAG;
}
bool PPCSubtarget::isELFv2ABI() const { return TM.isELFv2ABI(); }
diff --git a/lib/Target/PowerPC/PPCSubtarget.h b/lib/Target/PowerPC/PPCSubtarget.h
index 7fd907990ceb..5a97f595ad8c 100644
--- a/lib/Target/PowerPC/PPCSubtarget.h
+++ b/lib/Target/PowerPC/PPCSubtarget.h
@@ -298,7 +298,9 @@ public:
bool isSVR4ABI() const { return !isDarwinABI(); }
bool isELFv2ABI() const;
- bool enableEarlyIfConversion() const override { return hasISEL(); }
+  /// Originally, this function returned hasISEL(). Now we always enable it,
+ /// but may expand the ISEL instruction later.
+ bool enableEarlyIfConversion() const override { return true; }
// Scheduling customization.
bool enableMachineScheduler() const override;
@@ -316,6 +318,8 @@ public:
/// classifyGlobalReference - Classify a global variable reference for the
/// current subtarget accourding to how we should reference it.
unsigned char classifyGlobalReference(const GlobalValue *GV) const;
+
+ bool isXRaySupported() const override { return IsPPC64 && IsLittleEndian; }
};
} // End llvm namespace
diff --git a/lib/Target/PowerPC/PPCTargetMachine.cpp b/lib/Target/PowerPC/PPCTargetMachine.cpp
index 91b1d24b2e41..7806d45b5457 100644
--- a/lib/Target/PowerPC/PPCTargetMachine.cpp
+++ b/lib/Target/PowerPC/PPCTargetMachine.cpp
@@ -11,21 +11,33 @@
//
//===----------------------------------------------------------------------===//
-#include "PPCTargetMachine.h"
+#include "MCTargetDesc/PPCMCTargetDesc.h"
#include "PPC.h"
+#include "PPCSubtarget.h"
#include "PPCTargetObjectFile.h"
+#include "PPCTargetMachine.h"
#include "PPCTargetTransformInfo.h"
-#include "llvm/CodeGen/LiveVariables.h"
+#include "llvm/ADT/Optional.h"
+#include "llvm/ADT/STLExtras.h"
+#include "llvm/ADT/StringRef.h"
+#include "llvm/ADT/Triple.h"
+#include "llvm/Analysis/TargetTransformInfo.h"
#include "llvm/CodeGen/Passes.h"
#include "llvm/CodeGen/TargetPassConfig.h"
+#include "llvm/IR/Attributes.h"
+#include "llvm/IR/DataLayout.h"
#include "llvm/IR/Function.h"
-#include "llvm/IR/LegacyPassManager.h"
-#include "llvm/MC/MCStreamer.h"
+#include "llvm/Pass.h"
+#include "llvm/Support/CodeGen.h"
#include "llvm/Support/CommandLine.h"
-#include "llvm/Support/FormattedStream.h"
#include "llvm/Support/TargetRegistry.h"
+#include "llvm/Target/TargetLoweringObjectFile.h"
#include "llvm/Target/TargetOptions.h"
#include "llvm/Transforms/Scalar.h"
+#include <cassert>
+#include <memory>
+#include <string>
+
using namespace llvm;
static cl::
@@ -80,6 +92,7 @@ extern "C" void LLVMInitializePowerPCTarget() {
PassRegistry &PR = *PassRegistry::getPassRegistry();
initializePPCBoolRetToIntPass(PR);
+ initializePPCExpandISELPass(PR);
}
/// Return the datalayout string of a subtarget.
@@ -149,9 +162,9 @@ static std::unique_ptr<TargetLoweringObjectFile> createTLOF(const Triple &TT) {
// If it isn't a Mach-O file then it's going to be a linux ELF
// object file.
if (TT.isOSDarwin())
- return make_unique<TargetLoweringObjectFileMachO>();
+ return llvm::make_unique<TargetLoweringObjectFileMachO>();
- return make_unique<PPC64LinuxTargetObjectFile>();
+ return llvm::make_unique<PPC64LinuxTargetObjectFile>();
}
static PPCTargetMachine::PPCABI computeTargetABI(const Triple &TT,
@@ -205,15 +218,13 @@ PPCTargetMachine::PPCTargetMachine(const Target &T, const Triple &TT,
computeFSAdditions(FS, OL, TT), Options,
getEffectiveRelocModel(TT, RM), CM, OL),
TLOF(createTLOF(getTargetTriple())),
- TargetABI(computeTargetABI(TT, Options)),
- Subtarget(TargetTriple, CPU, computeFSAdditions(FS, OL, TT), *this) {
-
+ TargetABI(computeTargetABI(TT, Options)) {
initAsmInfo();
}
-PPCTargetMachine::~PPCTargetMachine() {}
+PPCTargetMachine::~PPCTargetMachine() = default;
-void PPC32TargetMachine::anchor() { }
+void PPC32TargetMachine::anchor() {}
PPC32TargetMachine::PPC32TargetMachine(const Target &T, const Triple &TT,
StringRef CPU, StringRef FS,
@@ -223,7 +234,7 @@ PPC32TargetMachine::PPC32TargetMachine(const Target &T, const Triple &TT,
CodeGenOpt::Level OL)
: PPCTargetMachine(T, TT, CPU, FS, Options, RM, CM, OL) {}
-void PPC64TargetMachine::anchor() { }
+void PPC64TargetMachine::anchor() {}
PPC64TargetMachine::PPC64TargetMachine(const Target &T, const Triple &TT,
StringRef CPU, StringRef FS,
@@ -281,6 +292,7 @@ PPCTargetMachine::getSubtargetImpl(const Function &F) const {
//===----------------------------------------------------------------------===//
namespace {
+
/// PPC Code Generator Pass Configuration Options.
class PPCPassConfig : public TargetPassConfig {
public:
@@ -300,7 +312,8 @@ public:
void addPreSched2() override;
void addPreEmitPass() override;
};
-} // namespace
+
+} // end anonymous namespace
TargetPassConfig *PPCTargetMachine::createPassConfig(PassManagerBase &PM) {
return new PPCPassConfig(this, PM);
@@ -416,6 +429,8 @@ void PPCPassConfig::addPreSched2() {
}
void PPCPassConfig::addPreEmitPass() {
+ addPass(createPPCExpandISELPass());
+
if (getOptLevel() != CodeGenOpt::None)
addPass(createPPCEarlyReturnPass(), false);
// Must run branch selection immediately preceding the asm printer.
diff --git a/lib/Target/PowerPC/PPCTargetMachine.h b/lib/Target/PowerPC/PPCTargetMachine.h
index 59b4f1e30c0e..f2838351cee5 100644
--- a/lib/Target/PowerPC/PPCTargetMachine.h
+++ b/lib/Target/PowerPC/PPCTargetMachine.h
@@ -29,7 +29,6 @@ public:
private:
std::unique_ptr<TargetLoweringObjectFile> TLOF;
PPCABI TargetABI;
- PPCSubtarget Subtarget;
mutable StringMap<std::unique_ptr<PPCSubtarget>> SubtargetMap;
diff --git a/lib/Target/PowerPC/PPCTargetStreamer.h b/lib/Target/PowerPC/PPCTargetStreamer.h
index dbe7617d3542..310fea9ef09f 100644
--- a/lib/Target/PowerPC/PPCTargetStreamer.h
+++ b/lib/Target/PowerPC/PPCTargetStreamer.h
@@ -1,4 +1,4 @@
-//===-- PPCTargetStreamer.h - PPC Target Streamer --s-----------*- C++ -*--===//
+//===- PPCTargetStreamer.h - PPC Target Streamer ----------------*- C++ -*-===//
//
// The LLVM Compiler Infrastructure
//
@@ -10,18 +10,26 @@
#ifndef LLVM_LIB_TARGET_POWERPC_PPCTARGETSTREAMER_H
#define LLVM_LIB_TARGET_POWERPC_PPCTARGETSTREAMER_H
+#include "llvm/ADT/StringRef.h"
#include "llvm/MC/MCStreamer.h"
namespace llvm {
+
+class MCExpr;
+class MCSymbol;
+class MCSymbolELF;
+
class PPCTargetStreamer : public MCTargetStreamer {
public:
PPCTargetStreamer(MCStreamer &S);
~PPCTargetStreamer() override;
+
virtual void emitTCEntry(const MCSymbol &S) = 0;
virtual void emitMachine(StringRef CPU) = 0;
virtual void emitAbiVersion(int AbiVersion) = 0;
virtual void emitLocalEntry(MCSymbolELF *S, const MCExpr *LocalOffset) = 0;
};
-}
-#endif
+} // end namespace llvm
+
+#endif // LLVM_LIB_TARGET_POWERPC_PPCTARGETSTREAMER_H
diff --git a/lib/Target/PowerPC/PPCTargetTransformInfo.cpp b/lib/Target/PowerPC/PPCTargetTransformInfo.cpp
index f94d1eab097d..7ee1317bf72f 100644
--- a/lib/Target/PowerPC/PPCTargetTransformInfo.cpp
+++ b/lib/Target/PowerPC/PPCTargetTransformInfo.cpp
@@ -302,14 +302,16 @@ int PPCTTIImpl::getShuffleCost(TTI::ShuffleKind Kind, Type *Tp, int Index,
return LT.first;
}
-int PPCTTIImpl::getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src) {
+int PPCTTIImpl::getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src,
+ const Instruction *I) {
assert(TLI->InstructionOpcodeToISD(Opcode) && "Invalid opcode");
return BaseT::getCastInstrCost(Opcode, Dst, Src);
}
-int PPCTTIImpl::getCmpSelInstrCost(unsigned Opcode, Type *ValTy, Type *CondTy) {
- return BaseT::getCmpSelInstrCost(Opcode, ValTy, CondTy);
+int PPCTTIImpl::getCmpSelInstrCost(unsigned Opcode, Type *ValTy, Type *CondTy,
+ const Instruction *I) {
+ return BaseT::getCmpSelInstrCost(Opcode, ValTy, CondTy, I);
}
int PPCTTIImpl::getVectorInstrCost(unsigned Opcode, Type *Val, unsigned Index) {
@@ -352,7 +354,7 @@ int PPCTTIImpl::getVectorInstrCost(unsigned Opcode, Type *Val, unsigned Index) {
}
int PPCTTIImpl::getMemoryOpCost(unsigned Opcode, Type *Src, unsigned Alignment,
- unsigned AddressSpace) {
+ unsigned AddressSpace, const Instruction *I) {
// Legalize the type.
std::pair<int, MVT> LT = TLI->getTypeLegalizationCost(DL, Src);
assert((Opcode == Instruction::Load || Opcode == Instruction::Store) &&
@@ -401,6 +403,10 @@ int PPCTTIImpl::getMemoryOpCost(unsigned Opcode, Type *Src, unsigned Alignment,
if (IsVSXType || (ST->hasVSX() && IsAltivecType))
return Cost;
+  // Newer PPC cores support unaligned memory accesses.
+ if (TLI->allowsMisalignedMemoryAccesses(LT.second, 0))
+ return Cost;
+
// PPC in general does not support unaligned loads and stores. They'll need
// to be decomposed based on the alignment factor.
diff --git a/lib/Target/PowerPC/PPCTargetTransformInfo.h b/lib/Target/PowerPC/PPCTargetTransformInfo.h
index 30ee2814aba1..6ce70fbd8778 100644
--- a/lib/Target/PowerPC/PPCTargetTransformInfo.h
+++ b/lib/Target/PowerPC/PPCTargetTransformInfo.h
@@ -74,11 +74,13 @@ public:
TTI::OperandValueProperties Opd2PropInfo = TTI::OP_None,
ArrayRef<const Value *> Args = ArrayRef<const Value *>());
int getShuffleCost(TTI::ShuffleKind Kind, Type *Tp, int Index, Type *SubTp);
- int getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src);
- int getCmpSelInstrCost(unsigned Opcode, Type *ValTy, Type *CondTy);
+ int getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src,
+ const Instruction *I = nullptr);
+ int getCmpSelInstrCost(unsigned Opcode, Type *ValTy, Type *CondTy,
+ const Instruction *I = nullptr);
int getVectorInstrCost(unsigned Opcode, Type *Val, unsigned Index);
int getMemoryOpCost(unsigned Opcode, Type *Src, unsigned Alignment,
- unsigned AddressSpace);
+ unsigned AddressSpace, const Instruction *I = nullptr);
int getInterleavedMemoryOpCost(unsigned Opcode, Type *VecTy,
unsigned Factor,
ArrayRef<unsigned> Indices,
diff --git a/lib/Target/PowerPC/PPCVSXCopy.cpp b/lib/Target/PowerPC/PPCVSXCopy.cpp
index 3b5d8f094fd0..f3a0290da054 100644
--- a/lib/Target/PowerPC/PPCVSXCopy.cpp
+++ b/lib/Target/PowerPC/PPCVSXCopy.cpp
@@ -112,7 +112,7 @@ protected:
TII->get(TargetOpcode::SUBREG_TO_REG), NewVReg)
.addImm(1) // add 1, not 0, because there is no implicit clearing
// of the high bits.
- .addOperand(SrcMO)
+ .add(SrcMO)
.addImm(PPC::sub_64);
// The source of the original copy is now the new virtual register.
@@ -132,7 +132,7 @@ protected:
unsigned NewVReg = MRI.createVirtualRegister(DstRC);
BuildMI(MBB, MI, MI.getDebugLoc(), TII->get(TargetOpcode::COPY),
NewVReg)
- .addOperand(SrcMO);
+ .add(SrcMO);
// Transform the original copy into a subregister extraction copy.
SrcMO.setReg(NewVReg);
diff --git a/lib/Target/PowerPC/PPCVSXSwapRemoval.cpp b/lib/Target/PowerPC/PPCVSXSwapRemoval.cpp
index 8197285b7b1f..d3434b77be8a 100644
--- a/lib/Target/PowerPC/PPCVSXSwapRemoval.cpp
+++ b/lib/Target/PowerPC/PPCVSXSwapRemoval.cpp
@@ -522,7 +522,7 @@ bool PPCVSXSwapRemoval::gatherVectorInstructions() {
if (RelevantFunction) {
DEBUG(dbgs() << "Swap vector when first built\n\n");
- dumpSwapVector();
+ DEBUG(dumpSwapVector());
}
return RelevantFunction;
@@ -731,7 +731,7 @@ void PPCVSXSwapRemoval::recordUnoptimizableWebs() {
}
DEBUG(dbgs() << "Swap vector after web analysis:\n\n");
- dumpSwapVector();
+ DEBUG(dumpSwapVector());
}
// Walk the swap vector entries looking for swaps fed by permuting loads
@@ -936,9 +936,9 @@ bool PPCVSXSwapRemoval::removeSwaps() {
Changed = true;
MachineInstr *MI = SwapVector[EntryIdx].VSEMI;
MachineBasicBlock *MBB = MI->getParent();
- BuildMI(*MBB, MI, MI->getDebugLoc(),
- TII->get(TargetOpcode::COPY), MI->getOperand(0).getReg())
- .addOperand(MI->getOperand(1));
+ BuildMI(*MBB, MI, MI->getDebugLoc(), TII->get(TargetOpcode::COPY),
+ MI->getOperand(0).getReg())
+ .add(MI->getOperand(1));
DEBUG(dbgs() << format("Replaced %d with copy: ",
SwapVector[EntryIdx].VSEId));
@@ -951,77 +951,78 @@ bool PPCVSXSwapRemoval::removeSwaps() {
return Changed;
}
+#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
// For debug purposes, dump the contents of the swap vector.
-void PPCVSXSwapRemoval::dumpSwapVector() {
+LLVM_DUMP_METHOD void PPCVSXSwapRemoval::dumpSwapVector() {
for (unsigned EntryIdx = 0; EntryIdx < SwapVector.size(); ++EntryIdx) {
MachineInstr *MI = SwapVector[EntryIdx].VSEMI;
int ID = SwapVector[EntryIdx].VSEId;
- DEBUG(dbgs() << format("%6d", ID));
- DEBUG(dbgs() << format("%6d", EC->getLeaderValue(ID)));
- DEBUG(dbgs() << format(" BB#%3d", MI->getParent()->getNumber()));
- DEBUG(dbgs() << format(" %14s ",
- TII->getName(MI->getOpcode()).str().c_str()));
+ dbgs() << format("%6d", ID);
+ dbgs() << format("%6d", EC->getLeaderValue(ID));
+ dbgs() << format(" BB#%3d", MI->getParent()->getNumber());
+ dbgs() << format(" %14s ", TII->getName(MI->getOpcode()).str().c_str());
if (SwapVector[EntryIdx].IsLoad)
- DEBUG(dbgs() << "load ");
+ dbgs() << "load ";
if (SwapVector[EntryIdx].IsStore)
- DEBUG(dbgs() << "store ");
+ dbgs() << "store ";
if (SwapVector[EntryIdx].IsSwap)
- DEBUG(dbgs() << "swap ");
+ dbgs() << "swap ";
if (SwapVector[EntryIdx].MentionsPhysVR)
- DEBUG(dbgs() << "physreg ");
+ dbgs() << "physreg ";
if (SwapVector[EntryIdx].MentionsPartialVR)
- DEBUG(dbgs() << "partialreg ");
+ dbgs() << "partialreg ";
if (SwapVector[EntryIdx].IsSwappable) {
- DEBUG(dbgs() << "swappable ");
+ dbgs() << "swappable ";
switch(SwapVector[EntryIdx].SpecialHandling) {
default:
- DEBUG(dbgs() << "special:**unknown**");
+ dbgs() << "special:**unknown**";
break;
case SH_NONE:
break;
case SH_EXTRACT:
- DEBUG(dbgs() << "special:extract ");
+ dbgs() << "special:extract ";
break;
case SH_INSERT:
- DEBUG(dbgs() << "special:insert ");
+ dbgs() << "special:insert ";
break;
case SH_NOSWAP_LD:
- DEBUG(dbgs() << "special:load ");
+ dbgs() << "special:load ";
break;
case SH_NOSWAP_ST:
- DEBUG(dbgs() << "special:store ");
+ dbgs() << "special:store ";
break;
case SH_SPLAT:
- DEBUG(dbgs() << "special:splat ");
+ dbgs() << "special:splat ";
break;
case SH_XXPERMDI:
- DEBUG(dbgs() << "special:xxpermdi ");
+ dbgs() << "special:xxpermdi ";
break;
case SH_COPYWIDEN:
- DEBUG(dbgs() << "special:copywiden ");
+ dbgs() << "special:copywiden ";
break;
}
}
if (SwapVector[EntryIdx].WebRejected)
- DEBUG(dbgs() << "rejected ");
+ dbgs() << "rejected ";
if (SwapVector[EntryIdx].WillRemove)
- DEBUG(dbgs() << "remove ");
+ dbgs() << "remove ";
- DEBUG(dbgs() << "\n");
+ dbgs() << "\n";
// For no-asserts builds.
(void)MI;
(void)ID;
}
- DEBUG(dbgs() << "\n");
+ dbgs() << "\n";
}
+#endif
} // end anonymous namespace
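The rewrite above follows the standard LLVM dump idiom: the method body prints unconditionally, is marked LLVM_DUMP_METHOD, and is compiled only when asserts or LLVM_ENABLE_DUMP are enabled, while each call site is wrapped in DEBUG(...). A minimal sketch of the pattern with an invented class name:

#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
// Body prints unconditionally; filtering happens at the call site.
LLVM_DUMP_METHOD void MyPass::dumpState() {
  dbgs() << "current state\n";
}
#endif

// Call site: stripped from release builds along with the definition.
DEBUG(dumpState());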
diff --git a/lib/Target/RISCV/MCTargetDesc/RISCVAsmBackend.cpp b/lib/Target/RISCV/MCTargetDesc/RISCVAsmBackend.cpp
index f8ef142255c8..d6f2672271e9 100644
--- a/lib/Target/RISCV/MCTargetDesc/RISCVAsmBackend.cpp
+++ b/lib/Target/RISCV/MCTargetDesc/RISCVAsmBackend.cpp
@@ -33,7 +33,7 @@ public:
~RISCVAsmBackend() override {}
void applyFixup(const MCFixup &Fixup, char *Data, unsigned DataSize,
- uint64_t Value, bool IsPCRel) const override;
+ uint64_t Value, bool IsPCRel, MCContext &Ctx) const override;
MCObjectWriter *createObjectWriter(raw_pwrite_stream &OS) const override;
@@ -71,7 +71,7 @@ bool RISCVAsmBackend::writeNopData(uint64_t Count, MCObjectWriter *OW) const {
void RISCVAsmBackend::applyFixup(const MCFixup &Fixup, char *Data,
unsigned DataSize, uint64_t Value,
- bool IsPCRel) const {
+ bool IsPCRel, MCContext &Ctx) const {
return;
}
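applyFixup gains an MCContext & across all backends in this patch (see also the Sparc and SystemZ asm backends below), so out-of-range fixup values can be reported as proper diagnostics instead of assertions; the RISC-V stub simply ignores it for now. A hedged sketch of what a backend can do with it, using an invented backend name and a hypothetical 12-bit field:

// Hypothetical backend: diagnose a value that does not fit a 12-bit
// immediate field instead of asserting.
void MyAsmBackend::applyFixup(const MCFixup &Fixup, char *Data,
                              unsigned DataSize, uint64_t Value,
                              bool IsPCRel, MCContext &Ctx) const {
  if (!isInt<12>(int64_t(Value))) {
    Ctx.reportError(Fixup.getLoc(), "fixup value out of range");
    return;
  }
  // ... otherwise patch Data at Fixup.getOffset() ...
}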
diff --git a/lib/Target/RISCV/MCTargetDesc/RISCVMCTargetDesc.cpp b/lib/Target/RISCV/MCTargetDesc/RISCVMCTargetDesc.cpp
index 4fc69a7fcaba..41be0a2084b3 100644
--- a/lib/Target/RISCV/MCTargetDesc/RISCVMCTargetDesc.cpp
+++ b/lib/Target/RISCV/MCTargetDesc/RISCVMCTargetDesc.cpp
@@ -44,13 +44,12 @@ static MCRegisterInfo *createRISCVMCRegisterInfo(const Triple &TT) {
static MCAsmInfo *createRISCVMCAsmInfo(const MCRegisterInfo &MRI,
const Triple &TT) {
- MCAsmInfo *MAI = new RISCVMCAsmInfo(TT);
- return MAI;
+ return new RISCVMCAsmInfo(TT);
}
extern "C" void LLVMInitializeRISCVTargetMC() {
for (Target *T : {&getTheRISCV32Target(), &getTheRISCV64Target()}) {
- RegisterMCAsmInfoFn X(*T, createRISCVMCAsmInfo);
+ TargetRegistry::RegisterMCAsmInfo(*T, createRISCVMCAsmInfo);
TargetRegistry::RegisterMCInstrInfo(*T, createRISCVMCInstrInfo);
TargetRegistry::RegisterMCRegInfo(*T, createRISCVMCRegisterInfo);
TargetRegistry::RegisterMCAsmBackend(*T, createRISCVAsmBackend);
diff --git a/lib/Target/RISCV/RISCVInstrFormats.td b/lib/Target/RISCV/RISCVInstrFormats.td
index 1e9bc3bf9bc5..3fab7122f6f1 100644
--- a/lib/Target/RISCV/RISCVInstrFormats.td
+++ b/lib/Target/RISCV/RISCVInstrFormats.td
@@ -44,8 +44,9 @@ class RISCVInst<dag outs, dag ins, string asmstr, list<dag> pattern>
// Pseudo instructions
class Pseudo<dag outs, dag ins, string asmstr, list<dag> pattern>
- : RISCVInst<outs, ins, asmstr, pattern> {
+ : RISCVInst<outs, ins, "", pattern> {
let isPseudo = 1;
+ let isCodeGenOnly = 1;
}
class FR<bits<7> funct7, bits<3> funct3, bits<7> opcode, dag outs, dag ins,
diff --git a/lib/Target/RISCV/RISCVTargetMachine.cpp b/lib/Target/RISCV/RISCVTargetMachine.cpp
index afbbe004186e..a20331cd0a3e 100644
--- a/lib/Target/RISCV/RISCVTargetMachine.cpp
+++ b/lib/Target/RISCV/RISCVTargetMachine.cpp
@@ -32,7 +32,7 @@ static std::string computeDataLayout(const Triple &TT) {
return "e-m:e-i64:64-n32:64-S128";
} else {
assert(TT.isArch32Bit() && "only RV32 and RV64 are currently supported");
- return "e-m:e-i64:64-n32-S128";
+ return "e-m:e-p:32:32-i64:64-n32-S128";
}
}
@@ -51,7 +51,9 @@ RISCVTargetMachine::RISCVTargetMachine(const Target &T, const Triple &TT,
CodeGenOpt::Level OL)
: LLVMTargetMachine(T, computeDataLayout(TT), TT, CPU, FS, Options,
getEffectiveRelocModel(TT, RM), CM, OL),
- TLOF(make_unique<TargetLoweringObjectFileELF>()) {}
+ TLOF(make_unique<TargetLoweringObjectFileELF>()) {
+ initAsmInfo();
+}
TargetPassConfig *RISCVTargetMachine::createPassConfig(PassManagerBase &PM) {
return new TargetPassConfig(this, PM);
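The data layout change makes the 32-bit pointer width explicit for RV32 rather than relying on the default. Reading the string component by component:

// "e-m:e-p:32:32-i64:64-n32-S128", piece by piece:
//   e        little-endian
//   m:e      ELF-style symbol mangling
//   p:32:32  pointers are 32 bits wide with 32-bit ABI alignment
//   i64:64   i64 values are 64-bit aligned
//   n32      the native integer width is 32 bits
//   S128     the stack is 128-bit aligned
static const char RV32DataLayout[] = "e-m:e-p:32:32-i64:64-n32-S128";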
diff --git a/lib/Target/Sparc/AsmParser/SparcAsmParser.cpp b/lib/Target/Sparc/AsmParser/SparcAsmParser.cpp
index e775aa607b53..7e6dff6b7894 100644
--- a/lib/Target/Sparc/AsmParser/SparcAsmParser.cpp
+++ b/lib/Target/Sparc/AsmParser/SparcAsmParser.cpp
@@ -9,32 +9,49 @@
#include "MCTargetDesc/SparcMCExpr.h"
#include "MCTargetDesc/SparcMCTargetDesc.h"
+#include "llvm/ADT/SmallVector.h"
#include "llvm/ADT/STLExtras.h"
+#include "llvm/ADT/StringRef.h"
+#include "llvm/ADT/Triple.h"
#include "llvm/MC/MCContext.h"
+#include "llvm/MC/MCExpr.h"
#include "llvm/MC/MCInst.h"
#include "llvm/MC/MCObjectFileInfo.h"
+#include "llvm/MC/MCParser/MCAsmLexer.h"
+#include "llvm/MC/MCParser/MCAsmParser.h"
#include "llvm/MC/MCParser/MCParsedAsmOperand.h"
#include "llvm/MC/MCParser/MCTargetAsmParser.h"
#include "llvm/MC/MCRegisterInfo.h"
#include "llvm/MC/MCStreamer.h"
#include "llvm/MC/MCSubtargetInfo.h"
#include "llvm/MC/MCSymbol.h"
+#include "llvm/Support/Casting.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/SMLoc.h"
+#include "llvm/Support/raw_ostream.h"
#include "llvm/Support/TargetRegistry.h"
+#include <algorithm>
+#include <cassert>
+#include <cstdint>
+#include <memory>
using namespace llvm;
// The generated AsmMatcher SparcGenAsmMatcher uses "Sparc" as the target
// namespace. But SPARC backend uses "SP" as its namespace.
namespace llvm {
- namespace Sparc {
+namespace Sparc {
+
using namespace SP;
- }
-}
+
+} // end namespace Sparc
+} // end namespace llvm
namespace {
+
class SparcOperand;
-class SparcAsmParser : public MCTargetAsmParser {
+class SparcAsmParser : public MCTargetAsmParser {
MCAsmParser &Parser;
/// @name Auto-generated Match Functions
@@ -95,9 +112,10 @@ public:
// Initialize the set of available features.
setAvailableFeatures(ComputeAvailableFeatures(getSTI().getFeatureBits()));
}
-
};
+} // end anonymous namespace
+
static const MCPhysReg IntRegs[32] = {
Sparc::G0, Sparc::G1, Sparc::G2, Sparc::G3,
Sparc::G4, Sparc::G5, Sparc::G6, Sparc::G7,
@@ -166,6 +184,8 @@ public:
Sparc::C16_C17, Sparc::C18_C19, Sparc::C20_C21, Sparc::C22_C23,
Sparc::C24_C25, Sparc::C26_C27, Sparc::C28_C29, Sparc::C30_C31};
+namespace {
+
/// SparcOperand - Instances of this class represent a parsed Sparc machine
/// instruction.
class SparcOperand : public MCParsedAsmOperand {
@@ -219,6 +239,7 @@ private:
struct ImmOp Imm;
struct MemOp Mem;
};
+
public:
SparcOperand(KindTy K) : MCParsedAsmOperand(), Kind(K) {}
@@ -464,7 +485,7 @@ public:
}
};
-} // end namespace
+} // end anonymous namespace
bool SparcAsmParser::expandSET(MCInst &Inst, SMLoc IDLoc,
SmallVectorImpl<MCInst> &Instructions) {
@@ -591,9 +612,8 @@ bool SparcAsmParser::MatchAndEmitInstruction(SMLoc IDLoc, unsigned &Opcode,
llvm_unreachable("Implement any new match types added!");
}
-bool SparcAsmParser::
-ParseRegister(unsigned &RegNo, SMLoc &StartLoc, SMLoc &EndLoc)
-{
+bool SparcAsmParser::ParseRegister(unsigned &RegNo, SMLoc &StartLoc,
+ SMLoc &EndLoc) {
const AsmToken &Tok = Parser.getTok();
StartLoc = Tok.getLoc();
EndLoc = Tok.getEndLoc();
@@ -695,7 +715,7 @@ ParseDirective(AsmToken DirectiveID)
bool SparcAsmParser:: parseDirectiveWord(unsigned Size, SMLoc L) {
if (getLexer().isNot(AsmToken::EndOfStatement)) {
- for (;;) {
+ while (true) {
const MCExpr *Value;
if (getParser().parseExpression(Value))
return true;
@@ -717,7 +737,6 @@ bool SparcAsmParser:: parseDirectiveWord(unsigned Size, SMLoc L) {
OperandMatchResultTy
SparcAsmParser::parseMEMOperand(OperandVector &Operands) {
-
SMLoc S, E;
unsigned BaseReg = 0;
@@ -824,7 +843,6 @@ SparcAsmParser::parseOperand(OperandVector &Operands, StringRef Mnemonic) {
OperandMatchResultTy
SparcAsmParser::parseSparcAsmOperand(std::unique_ptr<SparcOperand> &Op,
bool isCall) {
-
SMLoc S = Parser.getTok().getLoc();
SMLoc E = SMLoc::getFromPointer(Parser.getTok().getLoc().getPointer() - 1);
const MCExpr *EVal;
@@ -910,11 +928,9 @@ SparcAsmParser::parseSparcAsmOperand(std::unique_ptr<SparcOperand> &Op,
OperandMatchResultTy
SparcAsmParser::parseBranchModifiers(OperandVector &Operands) {
-
// parse (,a|,pn|,pt)+
while (getLexer().is(AsmToken::Comma)) {
-
Parser.Lex(); // Eat the comma
if (!getLexer().is(AsmToken::Identifier))
@@ -929,10 +945,8 @@ SparcAsmParser::parseBranchModifiers(OperandVector &Operands) {
return MatchOperand_Success;
}
-bool SparcAsmParser::matchRegisterName(const AsmToken &Tok,
- unsigned &RegNo,
- unsigned &RegKind)
-{
+bool SparcAsmParser::matchRegisterName(const AsmToken &Tok, unsigned &RegNo,
+ unsigned &RegKind) {
int64_t intVal = 0;
RegNo = 0;
RegKind = SparcOperand::rk_None;
@@ -1211,8 +1225,7 @@ static bool hasGOTReference(const MCExpr *Expr) {
const SparcMCExpr *
SparcAsmParser::adjustPICRelocation(SparcMCExpr::VariantKind VK,
- const MCExpr *subExpr)
-{
+ const MCExpr *subExpr) {
// When in PIC mode, "%lo(...)" and "%hi(...)" behave differently.
// If the expression contains _GLOBAL_OFFSET_TABLE_, it is
// actually a %pc10 or %pc22 relocation. Otherwise, they are interpreted
@@ -1236,8 +1249,7 @@ SparcAsmParser::adjustPICRelocation(SparcMCExpr::VariantKind VK,
}
bool SparcAsmParser::matchSparcAsmModifiers(const MCExpr *&EVal,
- SMLoc &EndLoc)
-{
+ SMLoc &EndLoc) {
AsmToken Tok = Parser.getTok();
if (!Tok.is(AsmToken::Identifier))
return false;
diff --git a/lib/Target/Sparc/MCTargetDesc/SparcAsmBackend.cpp b/lib/Target/Sparc/MCTargetDesc/SparcAsmBackend.cpp
index 6106a6c32dc8..cc07547ede2c 100644
--- a/lib/Target/Sparc/MCTargetDesc/SparcAsmBackend.cpp
+++ b/lib/Target/Sparc/MCTargetDesc/SparcAsmBackend.cpp
@@ -274,7 +274,8 @@ namespace {
SparcAsmBackend(T), OSType(OSType) { }
void applyFixup(const MCFixup &Fixup, char *Data, unsigned DataSize,
- uint64_t Value, bool IsPCRel) const override {
+ uint64_t Value, bool IsPCRel,
+ MCContext &Ctx) const override {
Value = adjustFixupValue(Fixup.getKind(), Value);
if (!Value) return; // Doesn't change encoding.
diff --git a/lib/Target/Sparc/MCTargetDesc/SparcMCAsmInfo.cpp b/lib/Target/Sparc/MCTargetDesc/SparcMCAsmInfo.cpp
index 280c6d7937b2..3ed09898fb78 100644
--- a/lib/Target/Sparc/MCTargetDesc/SparcMCAsmInfo.cpp
+++ b/lib/Target/Sparc/MCTargetDesc/SparcMCAsmInfo.cpp
@@ -1,4 +1,4 @@
-//===-- SparcMCAsmInfo.cpp - Sparc asm properties -------------------------===//
+//===- SparcMCAsmInfo.cpp - Sparc asm properties --------------------------===//
//
// The LLVM Compiler Infrastructure
//
@@ -14,7 +14,10 @@
#include "SparcMCAsmInfo.h"
#include "SparcMCExpr.h"
#include "llvm/ADT/Triple.h"
+#include "llvm/MC/MCExpr.h"
#include "llvm/MC/MCStreamer.h"
+#include "llvm/MC/MCTargetOptions.h"
+#include "llvm/Support/Dwarf.h"
using namespace llvm;
diff --git a/lib/Target/Sparc/MCTargetDesc/SparcMCAsmInfo.h b/lib/Target/Sparc/MCTargetDesc/SparcMCAsmInfo.h
index ad441227600e..5e8d0cb50312 100644
--- a/lib/Target/Sparc/MCTargetDesc/SparcMCAsmInfo.h
+++ b/lib/Target/Sparc/MCTargetDesc/SparcMCAsmInfo.h
@@ -1,4 +1,4 @@
-//===-- SparcMCAsmInfo.h - Sparc asm properties ----------------*- C++ -*--===//
+//===- SparcMCAsmInfo.h - Sparc asm properties -----------------*- C++ -*--===//
//
// The LLVM Compiler Infrastructure
//
@@ -17,6 +17,7 @@
#include "llvm/MC/MCAsmInfoELF.h"
namespace llvm {
+
class Triple;
class SparcELFMCAsmInfo : public MCAsmInfoELF {
@@ -24,6 +25,7 @@ class SparcELFMCAsmInfo : public MCAsmInfoELF {
public:
explicit SparcELFMCAsmInfo(const Triple &TheTriple);
+
const MCExpr*
getExprForPersonalitySymbol(const MCSymbol *Sym, unsigned Encoding,
MCStreamer &Streamer) const override;
@@ -33,6 +35,6 @@ public:
};
-} // namespace llvm
+} // end namespace llvm
-#endif
+#endif // LLVM_LIB_TARGET_SPARC_MCTARGETDESC_SPARCMCASMINFO_H
diff --git a/lib/Target/Sparc/MCTargetDesc/SparcMCCodeEmitter.cpp b/lib/Target/Sparc/MCTargetDesc/SparcMCCodeEmitter.cpp
index 86341c61d1e2..684f66970dbe 100644
--- a/lib/Target/Sparc/MCTargetDesc/SparcMCCodeEmitter.cpp
+++ b/lib/Target/Sparc/MCTargetDesc/SparcMCCodeEmitter.cpp
@@ -11,20 +11,29 @@
//
//===----------------------------------------------------------------------===//
-#include "SparcMCExpr.h"
#include "MCTargetDesc/SparcFixupKinds.h"
+#include "SparcMCExpr.h"
#include "SparcMCTargetDesc.h"
+#include "llvm/ADT/SmallVector.h"
#include "llvm/ADT/Statistic.h"
+#include "llvm/MC/MCAsmInfo.h"
#include "llvm/MC/MCCodeEmitter.h"
#include "llvm/MC/MCContext.h"
#include "llvm/MC/MCExpr.h"
+#include "llvm/MC/MCFixup.h"
#include "llvm/MC/MCInst.h"
#include "llvm/MC/MCInstrInfo.h"
#include "llvm/MC/MCRegisterInfo.h"
+#include "llvm/MC/MCSubtargetInfo.h"
#include "llvm/MC/MCSymbol.h"
-#include "llvm/MC/MCAsmInfo.h"
+#include "llvm/MC/SubtargetFeature.h"
+#include "llvm/Support/Casting.h"
+#include "llvm/Support/Endian.h"
#include "llvm/Support/EndianStream.h"
+#include "llvm/Support/ErrorHandling.h"
#include "llvm/Support/raw_ostream.h"
+#include <cassert>
+#include <cstdint>
using namespace llvm;
@@ -33,17 +42,17 @@ using namespace llvm;
STATISTIC(MCNumEmitted, "Number of MC instructions emitted");
namespace {
+
class SparcMCCodeEmitter : public MCCodeEmitter {
- SparcMCCodeEmitter(const SparcMCCodeEmitter &) = delete;
- void operator=(const SparcMCCodeEmitter &) = delete;
const MCInstrInfo &MCII;
MCContext &Ctx;
public:
SparcMCCodeEmitter(const MCInstrInfo &mcii, MCContext &ctx)
: MCII(mcii), Ctx(ctx) {}
-
- ~SparcMCCodeEmitter() override {}
+ SparcMCCodeEmitter(const SparcMCCodeEmitter &) = delete;
+ SparcMCCodeEmitter &operator=(const SparcMCCodeEmitter &) = delete;
+ ~SparcMCCodeEmitter() override = default;
void encodeInstruction(const MCInst &MI, raw_ostream &OS,
SmallVectorImpl<MCFixup> &Fixups,
@@ -79,13 +88,8 @@ private:
void verifyInstructionPredicates(const MCInst &MI,
uint64_t AvailableFeatures) const;
};
-} // end anonymous namespace
-MCCodeEmitter *llvm::createSparcMCCodeEmitter(const MCInstrInfo &MCII,
- const MCRegisterInfo &MRI,
- MCContext &Ctx) {
- return new SparcMCCodeEmitter(MCII, Ctx);
-}
+} // end anonymous namespace
void SparcMCCodeEmitter::encodeInstruction(const MCInst &MI, raw_ostream &OS,
SmallVectorImpl<MCFixup> &Fixups,
@@ -121,12 +125,10 @@ void SparcMCCodeEmitter::encodeInstruction(const MCInst &MI, raw_ostream &OS,
++MCNumEmitted; // Keep track of the # of mi's emitted.
}
-
unsigned SparcMCCodeEmitter::
getMachineOpValue(const MCInst &MI, const MCOperand &MO,
SmallVectorImpl<MCFixup> &Fixups,
const MCSubtargetInfo &STI) const {
-
if (MO.isReg())
return Ctx.getRegisterInfo()->getEncodingValue(MO.getReg());
@@ -209,6 +211,7 @@ getBranchPredTargetOpValue(const MCInst &MI, unsigned OpNo,
(MCFixupKind)Sparc::fixup_sparc_br19));
return 0;
}
+
unsigned SparcMCCodeEmitter::
getBranchOnRegTargetOpValue(const MCInst &MI, unsigned OpNo,
SmallVectorImpl<MCFixup> &Fixups,
@@ -227,3 +230,9 @@ getBranchOnRegTargetOpValue(const MCInst &MI, unsigned OpNo,
#define ENABLE_INSTR_PREDICATE_VERIFIER
#include "SparcGenMCCodeEmitter.inc"
+
+MCCodeEmitter *llvm::createSparcMCCodeEmitter(const MCInstrInfo &MCII,
+ const MCRegisterInfo &MRI,
+ MCContext &Ctx) {
+ return new SparcMCCodeEmitter(MCII, Ctx);
+}
diff --git a/lib/Target/Sparc/SparcFrameLowering.cpp b/lib/Target/Sparc/SparcFrameLowering.cpp
index 122f830e0dc5..c07cc213c3ed 100644
--- a/lib/Target/Sparc/SparcFrameLowering.cpp
+++ b/lib/Target/Sparc/SparcFrameLowering.cpp
@@ -288,11 +288,11 @@ static bool LLVM_ATTRIBUTE_UNUSED verifyLeafProcRegUse(MachineRegisterInfo *MRI)
{
for (unsigned reg = SP::I0; reg <= SP::I7; ++reg)
- if (!MRI->reg_nodbg_empty(reg))
+ if (MRI->isPhysRegUsed(reg))
return false;
for (unsigned reg = SP::L0; reg <= SP::L7; ++reg)
- if (!MRI->reg_nodbg_empty(reg))
+ if (MRI->isPhysRegUsed(reg))
return false;
return true;
@@ -305,8 +305,8 @@ bool SparcFrameLowering::isLeafProc(MachineFunction &MF) const
MachineFrameInfo &MFI = MF.getFrameInfo();
return !(MFI.hasCalls() // has calls
- || !MRI.reg_nodbg_empty(SP::L0) // Too many registers needed
- || !MRI.reg_nodbg_empty(SP::O6) // %SP is used
+ || MRI.isPhysRegUsed(SP::L0) // Too many registers needed
+ || MRI.isPhysRegUsed(SP::O6) // %SP is used
|| hasFP(MF)); // need %FP
}
@@ -314,11 +314,10 @@ void SparcFrameLowering::remapRegsForLeafProc(MachineFunction &MF) const {
MachineRegisterInfo &MRI = MF.getRegInfo();
// Remap %i[0-7] to %o[0-7].
for (unsigned reg = SP::I0; reg <= SP::I7; ++reg) {
- if (MRI.reg_nodbg_empty(reg))
+ if (!MRI.isPhysRegUsed(reg))
continue;
unsigned mapped_reg = reg - SP::I0 + SP::O0;
- assert(MRI.reg_nodbg_empty(mapped_reg));
// Replace I register with O register.
MRI.replaceRegWith(reg, mapped_reg);
diff --git a/lib/Target/Sparc/SparcISelLowering.cpp b/lib/Target/Sparc/SparcISelLowering.cpp
index 2ac9aae2471b..455d1ee1564a 100644
--- a/lib/Target/Sparc/SparcISelLowering.cpp
+++ b/lib/Target/Sparc/SparcISelLowering.cpp
@@ -1877,6 +1877,7 @@ void SparcTargetLowering::computeKnownBitsForTargetNode
(const SDValue Op,
APInt &KnownZero,
APInt &KnownOne,
+ const APInt &DemandedElts,
const SelectionDAG &DAG,
unsigned Depth) const {
APInt KnownZero2, KnownOne2;
@@ -2177,8 +2178,8 @@ SparcTargetLowering::LowerF128Op(SDValue Op, SelectionDAG &DAG,
Entry.Node = RetPtr;
Entry.Ty = PointerType::getUnqual(RetTy);
if (!Subtarget->is64Bit())
- Entry.isSRet = true;
- Entry.isReturned = false;
+ Entry.IsSRet = true;
+ Entry.IsReturned = false;
Args.push_back(Entry);
RetTyABI = Type::getVoidTy(*DAG.getContext());
}
diff --git a/lib/Target/Sparc/SparcISelLowering.h b/lib/Target/Sparc/SparcISelLowering.h
index e0a421b83712..90d03984060c 100644
--- a/lib/Target/Sparc/SparcISelLowering.h
+++ b/lib/Target/Sparc/SparcISelLowering.h
@@ -68,6 +68,7 @@ namespace llvm {
void computeKnownBitsForTargetNode(const SDValue Op,
APInt &KnownZero,
APInt &KnownOne,
+ const APInt &DemandedElts,
const SelectionDAG &DAG,
unsigned Depth = 0) const override;
diff --git a/lib/Target/SystemZ/AsmParser/SystemZAsmParser.cpp b/lib/Target/SystemZ/AsmParser/SystemZAsmParser.cpp
index a94717c93456..3f91ca9035a6 100644
--- a/lib/Target/SystemZ/AsmParser/SystemZAsmParser.cpp
+++ b/lib/Target/SystemZ/AsmParser/SystemZAsmParser.cpp
@@ -8,16 +8,31 @@
//===----------------------------------------------------------------------===//
#include "MCTargetDesc/SystemZMCTargetDesc.h"
+#include "llvm/ADT/SmallVector.h"
#include "llvm/ADT/STLExtras.h"
+#include "llvm/ADT/StringRef.h"
#include "llvm/MC/MCContext.h"
#include "llvm/MC/MCExpr.h"
#include "llvm/MC/MCInst.h"
#include "llvm/MC/MCInstBuilder.h"
+#include "llvm/MC/MCParser/MCAsmLexer.h"
+#include "llvm/MC/MCParser/MCAsmParser.h"
+#include "llvm/MC/MCParser/MCAsmParserExtension.h"
#include "llvm/MC/MCParser/MCParsedAsmOperand.h"
#include "llvm/MC/MCParser/MCTargetAsmParser.h"
#include "llvm/MC/MCStreamer.h"
#include "llvm/MC/MCSubtargetInfo.h"
+#include "llvm/Support/Casting.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/SMLoc.h"
#include "llvm/Support/TargetRegistry.h"
+#include <algorithm>
+#include <cassert>
+#include <cstddef>
+#include <cstdint>
+#include <iterator>
+#include <memory>
+#include <string>
using namespace llvm;
@@ -31,6 +46,7 @@ static bool inRange(const MCExpr *Expr, int64_t MinValue, int64_t MaxValue) {
}
namespace {
+
enum RegisterKind {
GR32Reg,
GRH32Reg,
@@ -56,7 +72,6 @@ enum MemoryKind {
};
class SystemZOperand : public MCParsedAsmOperand {
-public:
private:
enum OperandKind {
KindInvalid,
@@ -140,12 +155,14 @@ public:
SMLoc EndLoc) {
return make_unique<SystemZOperand>(KindInvalid, StartLoc, EndLoc);
}
+
static std::unique_ptr<SystemZOperand> createToken(StringRef Str, SMLoc Loc) {
auto Op = make_unique<SystemZOperand>(KindToken, Loc, Loc);
Op->Token.Data = Str.data();
Op->Token.Length = Str.size();
return Op;
}
+
static std::unique_ptr<SystemZOperand>
createReg(RegisterKind Kind, unsigned Num, SMLoc StartLoc, SMLoc EndLoc) {
auto Op = make_unique<SystemZOperand>(KindReg, StartLoc, EndLoc);
@@ -153,12 +170,14 @@ public:
Op->Reg.Num = Num;
return Op;
}
+
static std::unique_ptr<SystemZOperand>
createImm(const MCExpr *Expr, SMLoc StartLoc, SMLoc EndLoc) {
auto Op = make_unique<SystemZOperand>(KindImm, StartLoc, EndLoc);
Op->Imm = Expr;
return Op;
}
+
static std::unique_ptr<SystemZOperand>
createMem(MemoryKind MemKind, RegisterKind RegKind, unsigned Base,
const MCExpr *Disp, unsigned Index, const MCExpr *LengthImm,
@@ -175,6 +194,7 @@ public:
Op->Mem.Length.Reg = LengthReg;
return Op;
}
+
static std::unique_ptr<SystemZOperand>
createImmTLS(const MCExpr *Imm, const MCExpr *Sym,
SMLoc StartLoc, SMLoc EndLoc) {
@@ -503,6 +523,7 @@ public:
return parsePCRel(Operands, -(1LL << 32), (1LL << 32) - 1, true);
}
};
+
} // end anonymous namespace
#define GET_REGISTER_MATCHER
diff --git a/lib/Target/SystemZ/Disassembler/SystemZDisassembler.cpp b/lib/Target/SystemZ/Disassembler/SystemZDisassembler.cpp
index 1806e015f61e..a281a0aa6bcc 100644
--- a/lib/Target/SystemZ/Disassembler/SystemZDisassembler.cpp
+++ b/lib/Target/SystemZ/Disassembler/SystemZDisassembler.cpp
@@ -7,12 +7,16 @@
//
//===----------------------------------------------------------------------===//
+#include "MCTargetDesc/SystemZMCTargetDesc.h"
#include "SystemZ.h"
#include "llvm/MC/MCDisassembler/MCDisassembler.h"
#include "llvm/MC/MCFixedLenDisassembler.h"
#include "llvm/MC/MCInst.h"
#include "llvm/MC/MCSubtargetInfo.h"
+#include "llvm/Support/MathExtras.h"
#include "llvm/Support/TargetRegistry.h"
+#include <cassert>
+#include <cstdint>
using namespace llvm;
@@ -21,17 +25,19 @@ using namespace llvm;
typedef MCDisassembler::DecodeStatus DecodeStatus;
namespace {
+
class SystemZDisassembler : public MCDisassembler {
public:
SystemZDisassembler(const MCSubtargetInfo &STI, MCContext &Ctx)
: MCDisassembler(STI, Ctx) {}
- ~SystemZDisassembler() override {}
+ ~SystemZDisassembler() override = default;
DecodeStatus getInstruction(MCInst &instr, uint64_t &Size,
ArrayRef<uint8_t> Bytes, uint64_t Address,
raw_ostream &VStream,
raw_ostream &CStream) const override;
};
+
} // end anonymous namespace
static MCDisassembler *createSystemZDisassembler(const Target &T,
diff --git a/lib/Target/SystemZ/InstPrinter/SystemZInstPrinter.cpp b/lib/Target/SystemZ/InstPrinter/SystemZInstPrinter.cpp
index 1207c7b327e8..6cd12e13e220 100644
--- a/lib/Target/SystemZ/InstPrinter/SystemZInstPrinter.cpp
+++ b/lib/Target/SystemZ/InstPrinter/SystemZInstPrinter.cpp
@@ -1,4 +1,4 @@
-//===-- SystemZInstPrinter.cpp - Convert SystemZ MCInst to assembly syntax ===//
+//===- SystemZInstPrinter.cpp - Convert SystemZ MCInst to assembly syntax -===//
//
// The LLVM Compiler Infrastructure
//
@@ -10,10 +10,13 @@
#include "SystemZInstPrinter.h"
#include "llvm/MC/MCExpr.h"
#include "llvm/MC/MCInst.h"
-#include "llvm/MC/MCInstrInfo.h"
#include "llvm/MC/MCSymbol.h"
+#include "llvm/Support/Casting.h"
#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/MathExtras.h"
#include "llvm/Support/raw_ostream.h"
+#include <cassert>
+#include <cstdint>
using namespace llvm;
diff --git a/lib/Target/SystemZ/InstPrinter/SystemZInstPrinter.h b/lib/Target/SystemZ/InstPrinter/SystemZInstPrinter.h
index 6336f5ee0efa..d65c661545eb 100644
--- a/lib/Target/SystemZ/InstPrinter/SystemZInstPrinter.h
+++ b/lib/Target/SystemZ/InstPrinter/SystemZInstPrinter.h
@@ -15,8 +15,10 @@
#define LLVM_LIB_TARGET_SYSTEMZ_INSTPRINTER_SYSTEMZINSTPRINTER_H
#include "llvm/MC/MCInstPrinter.h"
+#include <cstdint>
namespace llvm {
+
class MCOperand;
class SystemZInstPrinter : public MCInstPrinter {
@@ -70,6 +72,7 @@ private:
// This forms part of the instruction name rather than the operand list.
void printCond4Operand(const MCInst *MI, int OpNum, raw_ostream &O);
};
+
} // end namespace llvm
-#endif
+#endif // LLVM_LIB_TARGET_SYSTEMZ_INSTPRINTER_SYSTEMZINSTPRINTER_H
diff --git a/lib/Target/SystemZ/MCTargetDesc/SystemZMCAsmBackend.cpp b/lib/Target/SystemZ/MCTargetDesc/SystemZMCAsmBackend.cpp
index 9192448afd04..23b7d5b5d501 100644
--- a/lib/Target/SystemZ/MCTargetDesc/SystemZMCAsmBackend.cpp
+++ b/lib/Target/SystemZ/MCTargetDesc/SystemZMCAsmBackend.cpp
@@ -51,7 +51,7 @@ public:
}
const MCFixupKindInfo &getFixupKindInfo(MCFixupKind Kind) const override;
void applyFixup(const MCFixup &Fixup, char *Data, unsigned DataSize,
- uint64_t Value, bool IsPCRel) const override;
+ uint64_t Value, bool IsPCRel, MCContext &Ctx) const override;
bool mayNeedRelaxation(const MCInst &Inst) const override {
return false;
}
@@ -91,7 +91,7 @@ SystemZMCAsmBackend::getFixupKindInfo(MCFixupKind Kind) const {
void SystemZMCAsmBackend::applyFixup(const MCFixup &Fixup, char *Data,
unsigned DataSize, uint64_t Value,
- bool IsPCRel) const {
+ bool IsPCRel, MCContext &Ctx) const {
MCFixupKind Kind = Fixup.getKind();
unsigned Offset = Fixup.getOffset();
unsigned BitSize = getFixupKindInfo(Kind).TargetSize;
diff --git a/lib/Target/SystemZ/MCTargetDesc/SystemZMCCodeEmitter.cpp b/lib/Target/SystemZ/MCTargetDesc/SystemZMCCodeEmitter.cpp
index 7082abad716d..092eb4011adc 100644
--- a/lib/Target/SystemZ/MCTargetDesc/SystemZMCCodeEmitter.cpp
+++ b/lib/Target/SystemZ/MCTargetDesc/SystemZMCCodeEmitter.cpp
@@ -11,20 +11,28 @@
//
//===----------------------------------------------------------------------===//
-#include "MCTargetDesc/SystemZMCTargetDesc.h"
#include "MCTargetDesc/SystemZMCFixups.h"
+#include "MCTargetDesc/SystemZMCTargetDesc.h"
+#include "llvm/ADT/SmallVector.h"
#include "llvm/MC/MCCodeEmitter.h"
#include "llvm/MC/MCContext.h"
#include "llvm/MC/MCExpr.h"
+#include "llvm/MC/MCFixup.h"
#include "llvm/MC/MCInst.h"
#include "llvm/MC/MCInstrInfo.h"
#include "llvm/MC/MCRegisterInfo.h"
+#include "llvm/MC/MCSubtargetInfo.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/raw_ostream.h"
+#include <cassert>
+#include <cstdint>
using namespace llvm;
#define DEBUG_TYPE "mccodeemitter"
namespace {
+
class SystemZMCCodeEmitter : public MCCodeEmitter {
const MCInstrInfo &MCII;
MCContext &Ctx;
@@ -34,7 +42,7 @@ public:
: MCII(mcii), Ctx(ctx) {
}
- ~SystemZMCCodeEmitter() override {}
+ ~SystemZMCCodeEmitter() override = default;
// Override MCCodeEmitter.
void encodeInstruction(const MCInst &MI, raw_ostream &OS,
@@ -137,13 +145,8 @@ private:
void verifyInstructionPredicates(const MCInst &MI,
uint64_t AvailableFeatures) const;
};
-} // end anonymous namespace
-MCCodeEmitter *llvm::createSystemZMCCodeEmitter(const MCInstrInfo &MCII,
- const MCRegisterInfo &MRI,
- MCContext &Ctx) {
- return new SystemZMCCodeEmitter(MCII, Ctx);
-}
+} // end anonymous namespace
void SystemZMCCodeEmitter::
encodeInstruction(const MCInst &MI, raw_ostream &OS,
@@ -282,3 +285,9 @@ SystemZMCCodeEmitter::getPCRelEncoding(const MCInst &MI, unsigned OpNum,
#define ENABLE_INSTR_PREDICATE_VERIFIER
#include "SystemZGenMCCodeEmitter.inc"
+
+MCCodeEmitter *llvm::createSystemZMCCodeEmitter(const MCInstrInfo &MCII,
+ const MCRegisterInfo &MRI,
+ MCContext &Ctx) {
+ return new SystemZMCCodeEmitter(MCII, Ctx);
+}
diff --git a/lib/Target/SystemZ/MCTargetDesc/SystemZMCObjectWriter.cpp b/lib/Target/SystemZ/MCTargetDesc/SystemZMCObjectWriter.cpp
index 43a96e84289c..3de570bf30cc 100644
--- a/lib/Target/SystemZ/MCTargetDesc/SystemZMCObjectWriter.cpp
+++ b/lib/Target/SystemZ/MCTargetDesc/SystemZMCObjectWriter.cpp
@@ -7,35 +7,38 @@
//
//===----------------------------------------------------------------------===//
-#include "MCTargetDesc/SystemZMCTargetDesc.h"
#include "MCTargetDesc/SystemZMCFixups.h"
+#include "MCTargetDesc/SystemZMCTargetDesc.h"
#include "llvm/MC/MCELFObjectWriter.h"
#include "llvm/MC/MCExpr.h"
+#include "llvm/MC/MCFixup.h"
#include "llvm/MC/MCValue.h"
+#include "llvm/Support/ELF.h"
+#include "llvm/Support/ErrorHandling.h"
+#include <cassert>
+#include <cstdint>
using namespace llvm;
namespace {
+
class SystemZObjectWriter : public MCELFObjectTargetWriter {
public:
SystemZObjectWriter(uint8_t OSABI);
-
- ~SystemZObjectWriter() override;
+ ~SystemZObjectWriter() override = default;
protected:
// Override MCELFObjectTargetWriter.
unsigned getRelocType(MCContext &Ctx, const MCValue &Target,
const MCFixup &Fixup, bool IsPCRel) const override;
};
+
} // end anonymous namespace
SystemZObjectWriter::SystemZObjectWriter(uint8_t OSABI)
: MCELFObjectTargetWriter(/*Is64Bit=*/true, OSABI, ELF::EM_S390,
/*HasRelocationAddend=*/ true) {}
-SystemZObjectWriter::~SystemZObjectWriter() {
-}
-
// Return the relocation type for an absolute value of MCFixupKind Kind.
static unsigned getAbsoluteReloc(unsigned Kind) {
switch (Kind) {
diff --git a/lib/Target/SystemZ/SystemZElimCompare.cpp b/lib/Target/SystemZ/SystemZElimCompare.cpp
index b4c843f658aa..d70f9e90cd3e 100644
--- a/lib/Target/SystemZ/SystemZElimCompare.cpp
+++ b/lib/Target/SystemZ/SystemZElimCompare.cpp
@@ -13,15 +13,23 @@
//
//===----------------------------------------------------------------------===//
+#include "SystemZ.h"
+#include "SystemZInstrInfo.h"
#include "SystemZTargetMachine.h"
+#include "llvm/ADT/SmallVector.h"
#include "llvm/ADT/Statistic.h"
+#include "llvm/ADT/StringRef.h"
+#include "llvm/CodeGen/MachineBasicBlock.h"
+#include "llvm/CodeGen/MachineFunction.h"
#include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/CodeGen/MachineInstr.h"
#include "llvm/CodeGen/MachineInstrBuilder.h"
-#include "llvm/IR/Function.h"
-#include "llvm/Support/MathExtras.h"
-#include "llvm/Target/TargetInstrInfo.h"
-#include "llvm/Target/TargetMachine.h"
+#include "llvm/CodeGen/MachineOperand.h"
+#include "llvm/MC/MCInstrDesc.h"
#include "llvm/Target/TargetRegisterInfo.h"
+#include "llvm/Target/TargetSubtargetInfo.h"
+#include <cassert>
+#include <cstdint>
using namespace llvm;
@@ -33,11 +41,11 @@ STATISTIC(EliminatedComparisons, "Number of eliminated comparisons");
STATISTIC(FusedComparisons, "Number of fused compare-and-branch instructions");
namespace {
+
// Represents the references to a particular register in one or more
// instructions.
struct Reference {
- Reference()
- : Def(false), Use(false) {}
+ Reference() = default;
Reference &operator|=(const Reference &Other) {
Def |= Other.Def;
@@ -49,15 +57,16 @@ struct Reference {
// True if the register is defined or used in some form, either directly or
// via a sub- or super-register.
- bool Def;
- bool Use;
+ bool Def = false;
+ bool Use = false;
};
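The Reference cleanup above is the C++11 modernization applied throughout this patch: defaults move from a hand-written constructor into in-class member initializers, and the constructor becomes `= default`. Side-by-side for a generic flags struct:

// Before: defaults buried in the constructor's init list.
struct RefOld {
  RefOld() : Def(false), Use(false) {}
  bool Def;
  bool Use;
};

// After: defaults sit next to the declarations; ctor is defaulted.
struct RefNew {
  RefNew() = default;
  bool Def = false;
  bool Use = false;
};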
class SystemZElimCompare : public MachineFunctionPass {
public:
static char ID;
+
SystemZElimCompare(const SystemZTargetMachine &tm)
- : MachineFunctionPass(ID), TII(nullptr), TRI(nullptr) {}
+ : MachineFunctionPass(ID) {}
StringRef getPassName() const override {
return "SystemZ Comparison Elimination";
@@ -65,6 +74,7 @@ public:
bool processBlock(MachineBasicBlock &MBB);
bool runOnMachineFunction(MachineFunction &F) override;
+
MachineFunctionProperties getRequiredProperties() const override {
return MachineFunctionProperties().set(
MachineFunctionProperties::Property::NoVRegs);
@@ -84,16 +94,13 @@ private:
bool fuseCompareOperations(MachineInstr &Compare,
SmallVectorImpl<MachineInstr *> &CCUsers);
- const SystemZInstrInfo *TII;
- const TargetRegisterInfo *TRI;
+ const SystemZInstrInfo *TII = nullptr;
+ const TargetRegisterInfo *TRI = nullptr;
};
char SystemZElimCompare::ID = 0;
-} // end anonymous namespace
-FunctionPass *llvm::createSystemZElimComparePass(SystemZTargetMachine &TM) {
- return new SystemZElimCompare(TM);
-}
+} // end anonymous namespace
// Return true if CC is live out of MBB.
static bool isCCLiveOut(MachineBasicBlock &MBB) {
@@ -167,7 +174,7 @@ static unsigned getCompareSourceReg(MachineInstr &Compare) {
reg = Compare.getOperand(0).getReg();
else if (isLoadAndTestAsCmp(Compare))
reg = Compare.getOperand(1).getReg();
- assert (reg);
+ assert(reg);
return reg;
}
@@ -216,9 +223,7 @@ bool SystemZElimCompare::convertToBRCT(
Branch->RemoveOperand(0);
Branch->setDesc(TII->get(BRCT));
MachineInstrBuilder MIB(*Branch->getParent()->getParent(), Branch);
- MIB.addOperand(MI.getOperand(0))
- .addOperand(MI.getOperand(1))
- .addOperand(Target);
+ MIB.add(MI.getOperand(0)).add(MI.getOperand(1)).add(Target);
// Add a CC def to BRCT(G), since we may have to split them again if the
// branch displacement overflows. BRCTH has a 32-bit displacement, so
// this is not necessary there.
@@ -261,10 +266,10 @@ bool SystemZElimCompare::convertToLoadAndTrap(
Branch->RemoveOperand(0);
Branch->setDesc(TII->get(LATOpcode));
MachineInstrBuilder(*Branch->getParent()->getParent(), Branch)
- .addOperand(MI.getOperand(0))
- .addOperand(MI.getOperand(1))
- .addOperand(MI.getOperand(2))
- .addOperand(MI.getOperand(3));
+ .add(MI.getOperand(0))
+ .add(MI.getOperand(1))
+ .add(MI.getOperand(2))
+ .add(MI.getOperand(3));
MI.eraseFromParent();
return true;
}
@@ -368,10 +373,8 @@ static bool isCompareZero(MachineInstr &Compare) {
return true;
default:
-
if (isLoadAndTestAsCmp(Compare))
return true;
-
return Compare.getNumExplicitOperands() == 2 &&
Compare.getOperand(1).isImm() && Compare.getOperand(1).getImm() == 0;
}
@@ -502,15 +505,15 @@ bool SystemZElimCompare::fuseCompareOperations(
Branch->setDesc(TII->get(FusedOpcode));
MachineInstrBuilder MIB(*Branch->getParent()->getParent(), Branch);
for (unsigned I = 0; I < SrcNOps; I++)
- MIB.addOperand(Compare.getOperand(I));
- MIB.addOperand(CCMask);
+ MIB.add(Compare.getOperand(I));
+ MIB.add(CCMask);
if (Type == SystemZII::CompareAndBranch) {
// Only conditional branches define CC, as they may be converted back
// to a non-fused branch because of a long displacement. Conditional
// returns don't have that problem.
- MIB.addOperand(Target)
- .addReg(SystemZ::CC, RegState::ImplicitDefine | RegState::Dead);
+ MIB.add(Target).addReg(SystemZ::CC,
+ RegState::ImplicitDefine | RegState::Dead);
}
if (Type == SystemZII::CompareAndSibcall)
@@ -573,3 +576,7 @@ bool SystemZElimCompare::runOnMachineFunction(MachineFunction &F) {
return Changed;
}
+
+FunctionPass *llvm::createSystemZElimComparePass(SystemZTargetMachine &TM) {
+ return new SystemZElimCompare(TM);
+}
diff --git a/lib/Target/SystemZ/SystemZISelLowering.cpp b/lib/Target/SystemZ/SystemZISelLowering.cpp
index 2d0a06af18ae..84d3c7bed50a 100644
--- a/lib/Target/SystemZ/SystemZISelLowering.cpp
+++ b/lib/Target/SystemZ/SystemZISelLowering.cpp
@@ -194,6 +194,9 @@ SystemZTargetLowering::SystemZTargetLowering(const TargetMachine &TM,
setOperationAction(ISD::UMUL_LOHI, VT, Custom);
// Only z196 and above have native support for conversions to unsigned.
+ // On z10, promoting to i64 doesn't generate an inexact condition for
+ // values that are outside the i32 range but in the i64 range, so use
+ // the default expansion.
if (!Subtarget.hasFPExtension())
setOperationAction(ISD::FP_TO_UINT, VT, Expand);
}
@@ -344,9 +347,13 @@ SystemZTargetLowering::SystemZTargetLowering(const TargetMachine &TM,
// There should be no need to check for float types other than v2f64
// since <2 x f32> isn't a legal type.
setOperationAction(ISD::FP_TO_SINT, MVT::v2i64, Legal);
+ setOperationAction(ISD::FP_TO_SINT, MVT::v2f64, Legal);
setOperationAction(ISD::FP_TO_UINT, MVT::v2i64, Legal);
+ setOperationAction(ISD::FP_TO_UINT, MVT::v2f64, Legal);
setOperationAction(ISD::SINT_TO_FP, MVT::v2i64, Legal);
+ setOperationAction(ISD::SINT_TO_FP, MVT::v2f64, Legal);
setOperationAction(ISD::UINT_TO_FP, MVT::v2i64, Legal);
+ setOperationAction(ISD::UINT_TO_FP, MVT::v2f64, Legal);
}
// Handle floating-point types.
@@ -2789,8 +2796,9 @@ SDValue SystemZTargetLowering::lowerBITCAST(SDValue Op,
// but we need this case for bitcasts that are created during lowering
// and which are then lowered themselves.
if (auto *LoadN = dyn_cast<LoadSDNode>(In))
- return DAG.getLoad(ResVT, DL, LoadN->getChain(), LoadN->getBasePtr(),
- LoadN->getMemOperand());
+ if (ISD::isNormalLoad(LoadN))
+ return DAG.getLoad(ResVT, DL, LoadN->getChain(), LoadN->getBasePtr(),
+ LoadN->getMemOperand());
if (InVT == MVT::i32 && ResVT == MVT::f32) {
SDValue In64;
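Guarding the fold on ISD::isNormalLoad matters because DAG.getLoad builds a plain, non-extending, unindexed load; rebuilding an extending or indexed load that way would change semantics. What the predicate accepts, spelled out as an equivalent check (helper name invented):

// Equivalent in spirit to ISD::isNormalLoad: accept only plain loads.
static bool isPlainLoad(const SDNode *N) {
  const auto *Ld = dyn_cast<LoadSDNode>(N);
  return Ld && Ld->getExtensionType() == ISD::NON_EXTLOAD &&
         Ld->getAddressingMode() == ISD::UNINDEXED;
}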
@@ -3802,7 +3810,7 @@ namespace {
struct GeneralShuffle {
GeneralShuffle(EVT vt) : VT(vt) {}
void addUndef();
- void add(SDValue, unsigned);
+ bool add(SDValue, unsigned);
SDValue getNode(SelectionDAG &, const SDLoc &);
// The operands of the shuffle.
@@ -3828,8 +3836,10 @@ void GeneralShuffle::addUndef() {
// Add an extra element to the shuffle, taking it from element Elem of Op.
// A null Op indicates a vector input whose value will be calculated later;
// there is at most one such input per shuffle and it always has the same
-// type as the result.
-void GeneralShuffle::add(SDValue Op, unsigned Elem) {
+// type as the result. Aborts and returns false if the source vector elements
+// of an EXTRACT_VECTOR_ELT are smaller than the destination elements. Per
+// LLVM semantics they become implicitly extended, but this case is rare and
+// not optimized here.
+bool GeneralShuffle::add(SDValue Op, unsigned Elem) {
unsigned BytesPerElement = VT.getVectorElementType().getStoreSize();
// The source vector can have wider elements than the result,
@@ -3837,8 +3847,12 @@ void GeneralShuffle::add(SDValue Op, unsigned Elem) {
// We want the least significant part.
EVT FromVT = Op.getNode() ? Op.getValueType() : VT;
unsigned FromBytesPerElement = FromVT.getVectorElementType().getStoreSize();
- assert(FromBytesPerElement >= BytesPerElement &&
- "Invalid EXTRACT_VECTOR_ELT");
+
+ // Return false if the source elements are smaller than their destination
+ // elements.
+ if (FromBytesPerElement < BytesPerElement)
+ return false;
+
unsigned Byte = ((Elem * FromBytesPerElement) % SystemZ::VectorBytes +
(FromBytesPerElement - BytesPerElement));
@@ -3856,13 +3870,13 @@ void GeneralShuffle::add(SDValue Op, unsigned Elem) {
break;
if (NewByte < 0) {
addUndef();
- return;
+ return true;
}
Op = Op.getOperand(unsigned(NewByte) / SystemZ::VectorBytes);
Byte = unsigned(NewByte) % SystemZ::VectorBytes;
} else if (Op.isUndef()) {
addUndef();
- return;
+ return true;
} else
break;
}
@@ -3879,6 +3893,8 @@ void GeneralShuffle::add(SDValue Op, unsigned Elem) {
unsigned Base = OpNo * SystemZ::VectorBytes + Byte;
for (unsigned I = 0; I < BytesPerElement; ++I)
Bytes.push_back(Base + I);
+
+ return true;
}
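Changing the signature from `void add(...)` plus an assert to `bool add(...)` is a hardening pattern: instead of crashing on a rare but legal input, every caller checks the result and returns SDValue(), handing the node back to generic lowering. A small compilable example mirroring the shape of the refactor (all names invented):

#include <vector>

// Invented example: report failure instead of asserting, so callers can
// abandon the custom lowering gracefully.
struct Shuffle {
  std::vector<unsigned> Bytes;
  bool add(unsigned FromBytes, unsigned ToBytes, unsigned Elem) {
    if (FromBytes < ToBytes)   // previously: assert(FromBytes >= ToBytes)
      return false;            // caller bails out of the custom path
    Bytes.push_back(Elem * FromBytes);
    return true;
  }
};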
// Return SDNodes for the completed shuffle.
@@ -4110,12 +4126,14 @@ static SDValue tryBuildVectorShuffle(SelectionDAG &DAG,
if (Op.getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
Op.getOperand(1).getOpcode() == ISD::Constant) {
unsigned Elem = cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue();
- GS.add(Op.getOperand(0), Elem);
+ if (!GS.add(Op.getOperand(0), Elem))
+ return SDValue();
FoundOne = true;
} else if (Op.isUndef()) {
GS.addUndef();
} else {
- GS.add(SDValue(), ResidueOps.size());
+ if (!GS.add(SDValue(), ResidueOps.size()))
+ return SDValue();
ResidueOps.push_back(BVN->getOperand(I));
}
}
@@ -4354,9 +4372,9 @@ SDValue SystemZTargetLowering::lowerVECTOR_SHUFFLE(SDValue Op,
int Elt = VSN->getMaskElt(I);
if (Elt < 0)
GS.addUndef();
- else
- GS.add(Op.getOperand(unsigned(Elt) / NumElements),
- unsigned(Elt) % NumElements);
+ else if (!GS.add(Op.getOperand(unsigned(Elt) / NumElements),
+ unsigned(Elt) % NumElements))
+ return SDValue();
}
return GS.getNode(DAG, SDLoc(VSN));
}
@@ -4722,9 +4740,12 @@ const char *SystemZTargetLowering::getTargetNodeName(unsigned Opcode) const {
}
// Return true if VT is a vector whose elements are a whole number of bytes
-// in width.
-static bool canTreatAsByteVector(EVT VT) {
- return VT.isVector() && VT.getScalarSizeInBits() % 8 == 0;
+// in width. Also check for presence of vector support.
+bool SystemZTargetLowering::canTreatAsByteVector(EVT VT) const {
+ if (!Subtarget.hasVector())
+ return false;
+
+ return VT.isVector() && VT.getScalarSizeInBits() % 8 == 0 && VT.isSimple();
}
// Try to simplify an EXTRACT_VECTOR_ELT from a vector of type VecVT
@@ -4986,6 +5007,10 @@ SDValue SystemZTargetLowering::combineSTORE(
SDValue SystemZTargetLowering::combineEXTRACT_VECTOR_ELT(
SDNode *N, DAGCombinerInfo &DCI) const {
+
+ if (!Subtarget.hasVector())
+ return SDValue();
+
// Try to simplify a vector extraction.
if (auto *IndexN = dyn_cast<ConstantSDNode>(N->getOperand(1))) {
SDValue Op0 = N->getOperand(0);
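Both SystemZ combine changes above enforce the same rule: a target DAG combine must not manufacture node types the current subtarget cannot select, so each entry point now begins with a feature check. Minimal shape of the guard (names invented):

// Invented names; the guard shape used by the SystemZ combines above.
SDValue combineSomething(SDNode *N, SelectionDAG &DAG, bool HasVector) {
  if (!HasVector)
    return SDValue();   // decline: generic code keeps the original node
  // ... safe to build vector-typed nodes here ...
  return SDValue();
}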
@@ -5233,7 +5258,7 @@ static unsigned forceReg(MachineInstr &MI, MachineOperand &Base,
unsigned Reg = MRI.createVirtualRegister(&SystemZ::ADDR64BitRegClass);
BuildMI(*MBB, MI, MI.getDebugLoc(), TII->get(SystemZ::LA), Reg)
- .addOperand(Base)
+ .add(Base)
.addImm(0)
.addReg(0);
return Reg;
@@ -5322,8 +5347,11 @@ MachineBasicBlock *SystemZTargetLowering::emitCondStore(MachineInstr &MI,
if (Invert)
CCMask ^= CCValid;
BuildMI(*MBB, MI, DL, TII->get(STOCOpcode))
- .addReg(SrcReg).addOperand(Base).addImm(Disp)
- .addImm(CCValid).addImm(CCMask);
+ .addReg(SrcReg)
+ .add(Base)
+ .addImm(Disp)
+ .addImm(CCValid)
+ .addImm(CCMask);
MI.eraseFromParent();
return MBB;
}
@@ -5350,7 +5378,10 @@ MachineBasicBlock *SystemZTargetLowering::emitCondStore(MachineInstr &MI,
// # fallthrough to JoinMBB
MBB = FalseMBB;
BuildMI(MBB, DL, TII->get(StoreOpcode))
- .addReg(SrcReg).addOperand(Base).addImm(Disp).addReg(IndexReg);
+ .addReg(SrcReg)
+ .add(Base)
+ .addImm(Disp)
+ .addReg(IndexReg);
MBB->addSuccessor(JoinMBB);
MI.eraseFromParent();
@@ -5415,8 +5446,7 @@ MachineBasicBlock *SystemZTargetLowering::emitAtomicLoadBinary(
// %OrigVal = L Disp(%Base)
// # fall through to LoopMBB
MBB = StartMBB;
- BuildMI(MBB, DL, TII->get(LOpcode), OrigVal)
- .addOperand(Base).addImm(Disp).addReg(0);
+ BuildMI(MBB, DL, TII->get(LOpcode), OrigVal).add(Base).addImm(Disp).addReg(0);
MBB->addSuccessor(LoopMBB);
// LoopMBB:
@@ -5437,8 +5467,7 @@ MachineBasicBlock *SystemZTargetLowering::emitAtomicLoadBinary(
if (Invert) {
// Perform the operation normally and then invert every bit of the field.
unsigned Tmp = MRI.createVirtualRegister(RC);
- BuildMI(MBB, DL, TII->get(BinOpcode), Tmp)
- .addReg(RotatedOldVal).addOperand(Src2);
+ BuildMI(MBB, DL, TII->get(BinOpcode), Tmp).addReg(RotatedOldVal).add(Src2);
if (BitSize <= 32)
// XILF with the upper BitSize bits set.
BuildMI(MBB, DL, TII->get(SystemZ::XILF), RotatedNewVal)
@@ -5454,7 +5483,8 @@ MachineBasicBlock *SystemZTargetLowering::emitAtomicLoadBinary(
} else if (BinOpcode)
// A simple binary operation.
BuildMI(MBB, DL, TII->get(BinOpcode), RotatedNewVal)
- .addReg(RotatedOldVal).addOperand(Src2);
+ .addReg(RotatedOldVal)
+ .add(Src2);
else if (IsSubWord)
// Use RISBG to rotate Src2 into position and use it to replace the
// field in RotatedOldVal.
@@ -5465,7 +5495,10 @@ MachineBasicBlock *SystemZTargetLowering::emitAtomicLoadBinary(
BuildMI(MBB, DL, TII->get(SystemZ::RLL), NewVal)
.addReg(RotatedNewVal).addReg(NegBitShift).addImm(0);
BuildMI(MBB, DL, TII->get(CSOpcode), Dest)
- .addReg(OldVal).addReg(NewVal).addOperand(Base).addImm(Disp);
+ .addReg(OldVal)
+ .addReg(NewVal)
+ .add(Base)
+ .addImm(Disp);
BuildMI(MBB, DL, TII->get(SystemZ::BRC))
.addImm(SystemZ::CCMASK_CS).addImm(SystemZ::CCMASK_CS_NE).addMBB(LoopMBB);
MBB->addSuccessor(LoopMBB);
@@ -5533,8 +5566,7 @@ MachineBasicBlock *SystemZTargetLowering::emitAtomicLoadMinMax(
// %OrigVal = L Disp(%Base)
// # fall through to LoopMMB
MBB = StartMBB;
- BuildMI(MBB, DL, TII->get(LOpcode), OrigVal)
- .addOperand(Base).addImm(Disp).addReg(0);
+ BuildMI(MBB, DL, TII->get(LOpcode), OrigVal).add(Base).addImm(Disp).addReg(0);
MBB->addSuccessor(LoopMBB);
// LoopMBB:
@@ -5581,7 +5613,10 @@ MachineBasicBlock *SystemZTargetLowering::emitAtomicLoadMinMax(
BuildMI(MBB, DL, TII->get(SystemZ::RLL), NewVal)
.addReg(RotatedNewVal).addReg(NegBitShift).addImm(0);
BuildMI(MBB, DL, TII->get(CSOpcode), Dest)
- .addReg(OldVal).addReg(NewVal).addOperand(Base).addImm(Disp);
+ .addReg(OldVal)
+ .addReg(NewVal)
+ .add(Base)
+ .addImm(Disp);
BuildMI(MBB, DL, TII->get(SystemZ::BRC))
.addImm(SystemZ::CCMASK_CS).addImm(SystemZ::CCMASK_CS_NE).addMBB(LoopMBB);
MBB->addSuccessor(LoopMBB);
@@ -5642,7 +5677,9 @@ SystemZTargetLowering::emitAtomicCmpSwapW(MachineInstr &MI,
// # fall through to LoopMBB
MBB = StartMBB;
BuildMI(MBB, DL, TII->get(LOpcode), OrigOldVal)
- .addOperand(Base).addImm(Disp).addReg(0);
+ .add(Base)
+ .addImm(Disp)
+ .addReg(0);
MBB->addSuccessor(LoopMBB);
// LoopMBB:
@@ -5696,7 +5733,10 @@ SystemZTargetLowering::emitAtomicCmpSwapW(MachineInstr &MI,
BuildMI(MBB, DL, TII->get(SystemZ::RLL), StoreVal)
.addReg(RetrySwapVal).addReg(NegBitShift).addImm(-BitSize);
BuildMI(MBB, DL, TII->get(CSOpcode), RetryOldVal)
- .addReg(OldVal).addReg(StoreVal).addOperand(Base).addImm(Disp);
+ .addReg(OldVal)
+ .addReg(StoreVal)
+ .add(Base)
+ .addImm(Disp);
BuildMI(MBB, DL, TII->get(SystemZ::BRC))
.addImm(SystemZ::CCMASK_CS).addImm(SystemZ::CCMASK_CS_NE).addMBB(LoopMBB);
MBB->addSuccessor(LoopMBB);
@@ -5869,7 +5909,7 @@ MachineBasicBlock *SystemZTargetLowering::emitMemMemWrapper(
if (!isUInt<12>(DestDisp)) {
unsigned Reg = MRI.createVirtualRegister(&SystemZ::ADDR64BitRegClass);
BuildMI(*MBB, MI, MI.getDebugLoc(), TII->get(SystemZ::LAY), Reg)
- .addOperand(DestBase)
+ .add(DestBase)
.addImm(DestDisp)
.addReg(0);
DestBase = MachineOperand::CreateReg(Reg, false);
@@ -5878,15 +5918,18 @@ MachineBasicBlock *SystemZTargetLowering::emitMemMemWrapper(
if (!isUInt<12>(SrcDisp)) {
unsigned Reg = MRI.createVirtualRegister(&SystemZ::ADDR64BitRegClass);
BuildMI(*MBB, MI, MI.getDebugLoc(), TII->get(SystemZ::LAY), Reg)
- .addOperand(SrcBase)
+ .add(SrcBase)
.addImm(SrcDisp)
.addReg(0);
SrcBase = MachineOperand::CreateReg(Reg, false);
SrcDisp = 0;
}
BuildMI(*MBB, MI, DL, TII->get(Opcode))
- .addOperand(DestBase).addImm(DestDisp).addImm(ThisLength)
- .addOperand(SrcBase).addImm(SrcDisp);
+ .add(DestBase)
+ .addImm(DestDisp)
+ .addImm(ThisLength)
+ .add(SrcBase)
+ .addImm(SrcDisp);
DestDisp += ThisLength;
SrcDisp += ThisLength;
Length -= ThisLength;
diff --git a/lib/Target/SystemZ/SystemZISelLowering.h b/lib/Target/SystemZ/SystemZISelLowering.h
index 7a21a474c119..7d92a7355877 100644
--- a/lib/Target/SystemZ/SystemZISelLowering.h
+++ b/lib/Target/SystemZ/SystemZISelLowering.h
@@ -537,6 +537,7 @@ private:
unsigned UnpackHigh) const;
SDValue lowerShift(SDValue Op, SelectionDAG &DAG, unsigned ByScalar) const;
+ bool canTreatAsByteVector(EVT VT) const;
SDValue combineExtract(const SDLoc &DL, EVT ElemVT, EVT VecVT, SDValue OrigOp,
unsigned Index, DAGCombinerInfo &DCI,
bool Force) const;
diff --git a/lib/Target/SystemZ/SystemZInstrInfo.cpp b/lib/Target/SystemZ/SystemZInstrInfo.cpp
index 3565d5f2c49c..c8ff9558cc88 100644
--- a/lib/Target/SystemZ/SystemZInstrInfo.cpp
+++ b/lib/Target/SystemZ/SystemZInstrInfo.cpp
@@ -11,12 +11,33 @@
//
//===----------------------------------------------------------------------===//
-#include "SystemZInstrInfo.h"
+#include "MCTargetDesc/SystemZMCTargetDesc.h"
+#include "SystemZ.h"
#include "SystemZInstrBuilder.h"
-#include "SystemZTargetMachine.h"
-#include "llvm/CodeGen/LiveVariables.h"
+#include "SystemZInstrInfo.h"
+#include "SystemZSubtarget.h"
+#include "llvm/CodeGen/LiveInterval.h"
#include "llvm/CodeGen/LiveIntervalAnalysis.h"
+#include "llvm/CodeGen/LiveVariables.h"
+#include "llvm/CodeGen/MachineBasicBlock.h"
+#include "llvm/CodeGen/MachineFrameInfo.h"
+#include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/CodeGen/MachineInstr.h"
+#include "llvm/CodeGen/MachineMemOperand.h"
+#include "llvm/CodeGen/MachineOperand.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/CodeGen/SlotIndexes.h"
+#include "llvm/MC/MCInstrDesc.h"
+#include "llvm/MC/MCRegisterInfo.h"
+#include "llvm/Support/BranchProbability.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/MathExtras.h"
+#include "llvm/Target/TargetInstrInfo.h"
+#include "llvm/Target/TargetMachine.h"
+#include "llvm/Target/TargetSubtargetInfo.h"
+#include <cassert>
+#include <cstdint>
+#include <iterator>
using namespace llvm;
@@ -58,12 +79,25 @@ void SystemZInstrInfo::splitMove(MachineBasicBlock::iterator MI,
MachineInstr *EarlierMI = MF.CloneMachineInstr(&*MI);
MBB->insert(MI, EarlierMI);
- // Set up the two 64-bit registers.
+ // Set up the two 64-bit registers and remember super reg and its flags.
MachineOperand &HighRegOp = EarlierMI->getOperand(0);
MachineOperand &LowRegOp = MI->getOperand(0);
+ unsigned Reg128 = LowRegOp.getReg();
+ unsigned Reg128Killed = getKillRegState(LowRegOp.isKill());
+ unsigned Reg128Undef = getUndefRegState(LowRegOp.isUndef());
HighRegOp.setReg(RI.getSubReg(HighRegOp.getReg(), SystemZ::subreg_h64));
LowRegOp.setReg(RI.getSubReg(LowRegOp.getReg(), SystemZ::subreg_l64));
+ if (MI->mayStore()) {
+ // Add implicit uses of the super register in case one of the subregs is
+ // undefined. We could track liveness and skip storing an undefined
+ // subreg, but this is hopefully rare (discovered with llvm-stress).
+ // If Reg128 was killed, set kill flag on MI.
+ unsigned Reg128UndefImpl = (Reg128Undef | RegState::Implicit);
+ MachineInstrBuilder(MF, EarlierMI).addReg(Reg128, Reg128UndefImpl);
+ MachineInstrBuilder(MF, MI).addReg(Reg128, (Reg128UndefImpl | Reg128Killed));
+ }
+
// The address in the first (high) instruction is already correct.
// Adjust the offset in the second (low) instruction.
MachineOperand &HighOffsetOp = EarlierMI->getOperand(2);
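The splitMove fix attaches implicit uses of the original 128-bit register to both halves so liveness stays correct when one subregister is undef; the kill flag, if any, lands on the second (last) store. The getKillRegState/getUndefRegState helpers used above fold the optional flags into a single operand-state mask:

// Idiom used above: compose optional RegState flags for one operand.
unsigned State = RegState::Implicit
               | getUndefRegState(LowRegOp.isUndef())  // subreg may be undef
               | getKillRegState(LowRegOp.isKill());   // last use of Reg128
MachineInstrBuilder(MF, MI).addReg(Reg128, State);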
@@ -131,7 +165,8 @@ void SystemZInstrInfo::expandRIEPseudo(MachineInstr &MI, unsigned LowOpcode,
MI.setDesc(get(LowOpcodeK));
else {
emitGRX32Move(*MI.getParent(), MI, MI.getDebugLoc(), DestReg, SrcReg,
- SystemZ::LR, 32, MI.getOperand(1).isKill());
+ SystemZ::LR, 32, MI.getOperand(1).isKill(),
+ MI.getOperand(1).isUndef());
MI.setDesc(get(DestIsHigh ? HighOpcode : LowOpcode));
MI.getOperand(1).setReg(DestReg);
MI.tieOperands(0, 1);
@@ -185,9 +220,15 @@ void SystemZInstrInfo::expandLOCRPseudo(MachineInstr &MI, unsigned LowOpcode,
// are low registers, otherwise use RISB[LH]G.
void SystemZInstrInfo::expandZExtPseudo(MachineInstr &MI, unsigned LowOpcode,
unsigned Size) const {
- emitGRX32Move(*MI.getParent(), MI, MI.getDebugLoc(),
- MI.getOperand(0).getReg(), MI.getOperand(1).getReg(), LowOpcode,
- Size, MI.getOperand(1).isKill());
+ MachineInstrBuilder MIB =
+ emitGRX32Move(*MI.getParent(), MI, MI.getDebugLoc(),
+ MI.getOperand(0).getReg(), MI.getOperand(1).getReg(), LowOpcode,
+ Size, MI.getOperand(1).isKill(), MI.getOperand(1).isUndef());
+
+ // Keep the remaining operands as-is.
+ for (unsigned I = 2; I < MI.getNumOperands(); ++I)
+ MIB.add(MI.getOperand(I));
+
MI.eraseFromParent();
}
@@ -227,11 +268,13 @@ void SystemZInstrInfo::expandLoadStackGuard(MachineInstr *MI) const {
// are low registers, otherwise use RISB[LH]G. Size is the number of bits
// taken from the low end of SrcReg (8 for LLCR, 16 for LLHR and 32 for LR).
// KillSrc is true if this move is the last use of SrcReg.
-void SystemZInstrInfo::emitGRX32Move(MachineBasicBlock &MBB,
- MachineBasicBlock::iterator MBBI,
- const DebugLoc &DL, unsigned DestReg,
- unsigned SrcReg, unsigned LowLowOpcode,
- unsigned Size, bool KillSrc) const {
+MachineInstrBuilder
+SystemZInstrInfo::emitGRX32Move(MachineBasicBlock &MBB,
+ MachineBasicBlock::iterator MBBI,
+ const DebugLoc &DL, unsigned DestReg,
+ unsigned SrcReg, unsigned LowLowOpcode,
+ unsigned Size, bool KillSrc,
+ bool UndefSrc) const {
unsigned Opcode;
bool DestIsHigh = isHighReg(DestReg);
bool SrcIsHigh = isHighReg(SrcReg);
@@ -242,18 +285,16 @@ void SystemZInstrInfo::emitGRX32Move(MachineBasicBlock &MBB,
else if (!DestIsHigh && SrcIsHigh)
Opcode = SystemZ::RISBLH;
else {
- BuildMI(MBB, MBBI, DL, get(LowLowOpcode), DestReg)
- .addReg(SrcReg, getKillRegState(KillSrc));
- return;
+ return BuildMI(MBB, MBBI, DL, get(LowLowOpcode), DestReg)
+ .addReg(SrcReg, getKillRegState(KillSrc) | getUndefRegState(UndefSrc));
}
unsigned Rotate = (DestIsHigh != SrcIsHigh ? 32 : 0);
- BuildMI(MBB, MBBI, DL, get(Opcode), DestReg)
+ return BuildMI(MBB, MBBI, DL, get(Opcode), DestReg)
.addReg(DestReg, RegState::Undef)
- .addReg(SrcReg, getKillRegState(KillSrc))
+ .addReg(SrcReg, getKillRegState(KillSrc) | getUndefRegState(UndefSrc))
.addImm(32 - Size).addImm(128 + 31).addImm(Rotate);
}
-
MachineInstr *SystemZInstrInfo::commuteInstructionImpl(MachineInstr &MI,
bool NewMI,
unsigned OpIdx1,
@@ -282,7 +323,6 @@ MachineInstr *SystemZInstrInfo::commuteInstructionImpl(MachineInstr &MI,
}
}
-
// If MI is a simple load or store for a frame object, return the register
// it loads or stores and set FrameIndex to the index of the frame object.
// Return 0 otherwise.
@@ -586,7 +626,6 @@ bool SystemZInstrInfo::optimizeCompareInstr(
removeIPMBasedCompare(Compare, SrcReg, MRI, &RI);
}
-
bool SystemZInstrInfo::canInsertSelect(const MachineBasicBlock &MBB,
ArrayRef<MachineOperand> Pred,
unsigned TrueReg, unsigned FalseReg,
@@ -640,6 +679,12 @@ void SystemZInstrInfo::insertSelect(MachineBasicBlock &MBB,
else {
Opc = SystemZ::LOCR;
MRI.constrainRegClass(DstReg, &SystemZ::GR32BitRegClass);
+ unsigned TReg = MRI.createVirtualRegister(&SystemZ::GR32BitRegClass);
+ unsigned FReg = MRI.createVirtualRegister(&SystemZ::GR32BitRegClass);
+ BuildMI(MBB, I, DL, get(TargetOpcode::COPY), TReg).addReg(TrueReg);
+ BuildMI(MBB, I, DL, get(TargetOpcode::COPY), FReg).addReg(FalseReg);
+ TrueReg = TReg;
+ FalseReg = FReg;
}
} else if (SystemZ::GR64BitRegClass.hasSubClassEq(RC))
Opc = SystemZ::LOCGR;
@@ -706,7 +751,7 @@ bool SystemZInstrInfo::FoldImmediate(MachineInstr &UseMI, MachineInstr &DefMI,
return true;
}
-bool SystemZInstrInfo::isPredicable(MachineInstr &MI) const {
+bool SystemZInstrInfo::isPredicable(const MachineInstr &MI) const {
unsigned Opcode = MI.getOpcode();
if (Opcode == SystemZ::Return ||
Opcode == SystemZ::Trap ||
@@ -780,10 +825,11 @@ bool SystemZInstrInfo::PredicateInstruction(
MI.RemoveOperand(0);
MI.setDesc(get(SystemZ::CallBRCL));
MachineInstrBuilder(*MI.getParent()->getParent(), MI)
- .addImm(CCValid).addImm(CCMask)
- .addOperand(FirstOp)
- .addRegMask(RegMask)
- .addReg(SystemZ::CC, RegState::Implicit);
+ .addImm(CCValid)
+ .addImm(CCMask)
+ .add(FirstOp)
+ .addRegMask(RegMask)
+ .addReg(SystemZ::CC, RegState::Implicit);
return true;
}
if (Opcode == SystemZ::CallBR) {
@@ -813,7 +859,8 @@ void SystemZInstrInfo::copyPhysReg(MachineBasicBlock &MBB,
}
if (SystemZ::GRX32BitRegClass.contains(DestReg, SrcReg)) {
- emitGRX32Move(MBB, MBBI, DL, DestReg, SrcReg, SystemZ::LR, 32, KillSrc);
+ emitGRX32Move(MBB, MBBI, DL, DestReg, SrcReg, SystemZ::LR, 32, KillSrc,
+ false);
return;
}
@@ -888,15 +935,19 @@ static bool isSimpleBD12Move(const MachineInstr *MI, unsigned Flag) {
}
namespace {
+
struct LogicOp {
- LogicOp() : RegSize(0), ImmLSB(0), ImmSize(0) {}
+ LogicOp() = default;
LogicOp(unsigned regSize, unsigned immLSB, unsigned immSize)
: RegSize(regSize), ImmLSB(immLSB), ImmSize(immSize) {}
explicit operator bool() const { return RegSize; }
- unsigned RegSize, ImmLSB, ImmSize;
+ unsigned RegSize = 0;
+ unsigned ImmLSB = 0;
+ unsigned ImmSize = 0;
};
+
} // end anonymous namespace
static LogicOp interpretAndImmediate(unsigned Opcode) {
@@ -976,12 +1027,12 @@ MachineInstr *SystemZInstrInfo::convertToThreeAddress(
MachineInstrBuilder MIB(
*MF, MF->CreateMachineInstr(get(ThreeOperandOpcode), MI.getDebugLoc(),
/*NoImplicit=*/true));
- MIB.addOperand(Dest);
+ MIB.add(Dest);
// Keep the kill state, but drop the tied flag.
MIB.addReg(Src.getReg(), getKillRegState(Src.isKill()), Src.getSubReg());
// Keep the remaining operands as-is.
for (unsigned I = 2; I < NumOps; ++I)
- MIB.addOperand(MI.getOperand(I));
+ MIB.add(MI.getOperand(I));
MBB->insert(MI, MIB);
return finishConvertToThreeAddress(&MI, MIB, LV);
}
@@ -1009,7 +1060,7 @@ MachineInstr *SystemZInstrInfo::convertToThreeAddress(
MachineOperand &Src = MI.getOperand(1);
MachineInstrBuilder MIB =
BuildMI(*MBB, MI, MI.getDebugLoc(), get(NewOpcode))
- .addOperand(Dest)
+ .add(Dest)
.addReg(0)
.addReg(Src.getReg(), getKillRegState(Src.isKill()),
Src.getSubReg())
@@ -1040,7 +1091,7 @@ MachineInstr *SystemZInstrInfo::foldMemoryOperandImpl(
MCRegUnitIterator CCUnit(SystemZ::CC, TRI);
LiveRange &CCLiveRange = LIS->getRegUnit(*CCUnit);
++CCUnit;
- assert (!CCUnit.isValid() && "CC only has one reg unit.");
+ assert(!CCUnit.isValid() && "CC only has one reg unit.");
SlotIndex MISlot =
LIS->getSlotIndexes()->getInstructionIndex(MI).getRegSlot();
if (!CCLiveRange.liveAt(MISlot)) {
@@ -1091,7 +1142,7 @@ MachineInstr *SystemZInstrInfo::foldMemoryOperandImpl(
unsigned StoreOpcode = Op1IsGPR ? SystemZ::STG : SystemZ::STD;
return BuildMI(*InsertPt->getParent(), InsertPt, MI.getDebugLoc(),
get(StoreOpcode))
- .addOperand(MI.getOperand(1))
+ .add(MI.getOperand(1))
.addFrameIndex(FrameIndex)
.addImm(0)
.addReg(0);
@@ -1100,12 +1151,12 @@ MachineInstr *SystemZInstrInfo::foldMemoryOperandImpl(
// destination register instead.
if (OpNum == 1) {
unsigned LoadOpcode = Op0IsGPR ? SystemZ::LG : SystemZ::LD;
- unsigned Dest = MI.getOperand(0).getReg();
return BuildMI(*InsertPt->getParent(), InsertPt, MI.getDebugLoc(),
- get(LoadOpcode), Dest)
- .addFrameIndex(FrameIndex)
- .addImm(0)
- .addReg(0);
+ get(LoadOpcode))
+ .add(MI.getOperand(0))
+ .addFrameIndex(FrameIndex)
+ .addImm(0)
+ .addReg(0);
}
}
@@ -1132,7 +1183,7 @@ MachineInstr *SystemZInstrInfo::foldMemoryOperandImpl(
.addFrameIndex(FrameIndex)
.addImm(0)
.addImm(Size)
- .addOperand(MI.getOperand(1))
+ .add(MI.getOperand(1))
.addImm(MI.getOperand(2).getImm())
.addMemOperand(MMO);
}
@@ -1140,7 +1191,7 @@ MachineInstr *SystemZInstrInfo::foldMemoryOperandImpl(
if (isSimpleBD12Move(&MI, SystemZII::SimpleBDXStore)) {
return BuildMI(*InsertPt->getParent(), InsertPt, MI.getDebugLoc(),
get(SystemZ::MVC))
- .addOperand(MI.getOperand(1))
+ .add(MI.getOperand(1))
.addImm(MI.getOperand(2).getImm())
.addImm(Size)
.addFrameIndex(FrameIndex)
@@ -1164,7 +1215,7 @@ MachineInstr *SystemZInstrInfo::foldMemoryOperandImpl(
MachineInstrBuilder MIB = BuildMI(*InsertPt->getParent(), InsertPt,
MI.getDebugLoc(), get(MemOpcode));
for (unsigned I = 0; I < OpNum; ++I)
- MIB.addOperand(MI.getOperand(I));
+ MIB.add(MI.getOperand(I));
MIB.addFrameIndex(FrameIndex).addImm(Offset);
if (MemDesc.TSFlags & SystemZII::HasIndex)
MIB.addReg(0);
diff --git a/lib/Target/SystemZ/SystemZInstrInfo.h b/lib/Target/SystemZ/SystemZInstrInfo.h
index 794b193a501e..b8be1f5f3921 100644
--- a/lib/Target/SystemZ/SystemZInstrInfo.h
+++ b/lib/Target/SystemZ/SystemZInstrInfo.h
@@ -16,16 +16,22 @@
#include "SystemZ.h"
#include "SystemZRegisterInfo.h"
+#include "llvm/ADT/ArrayRef.h"
+#include "llvm/CodeGen/MachineBasicBlock.h"
+#include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
#include "llvm/Target/TargetInstrInfo.h"
+#include <cstdint>
#define GET_INSTRINFO_HEADER
#include "SystemZGenInstrInfo.inc"
namespace llvm {
-class SystemZTargetMachine;
+class SystemZSubtarget;
namespace SystemZII {
+
enum {
// See comments in SystemZInstrFormats.td.
SimpleBDXLoad = (1 << 0),
@@ -43,12 +49,15 @@ enum {
CCMaskLast = (1 << 19),
IsLogical = (1 << 20)
};
+
static inline unsigned getAccessSize(unsigned int Flags) {
return (Flags & AccessSizeMask) >> AccessSizeShift;
}
+
static inline unsigned getCCValues(unsigned int Flags) {
return (Flags & CCValuesMask) >> CCValuesShift;
}
+
static inline unsigned getCompareZeroCCMask(unsigned int Flags) {
return (Flags & CompareZeroCCMaskMask) >> CompareZeroCCMaskShift;
}
@@ -64,6 +73,7 @@ enum {
// @INDNTPOFF
MO_INDNTPOFF = (2 << 0)
};
+
// Classifies a branch.
enum BranchType {
// An instruction that branches on the current value of CC.
@@ -93,6 +103,7 @@ enum BranchType {
// the result is nonzero.
BranchCTG
};
+
// Information about a branch instruction.
struct Branch {
// The type of the branch.
@@ -111,6 +122,7 @@ struct Branch {
const MachineOperand *target)
: Type(type), CCValid(ccValid), CCMask(ccMask), Target(target) {}
};
+
// Kinds of fused compares in compare-and-* instructions. Together with type
// of the converted compare, this identifies the compare-and-*
// instruction.
@@ -127,9 +139,9 @@ enum FusedCompareType {
// Trap
CompareAndTrap
};
+
} // end namespace SystemZII
-class SystemZSubtarget;
class SystemZInstrInfo : public SystemZGenInstrInfo {
const SystemZRegisterInfo RI;
SystemZSubtarget &STI;
@@ -149,9 +161,13 @@ class SystemZInstrInfo : public SystemZGenInstrInfo {
void expandZExtPseudo(MachineInstr &MI, unsigned LowOpcode,
unsigned Size) const;
void expandLoadStackGuard(MachineInstr *MI) const;
- void emitGRX32Move(MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI,
- const DebugLoc &DL, unsigned DestReg, unsigned SrcReg,
- unsigned LowLowOpcode, unsigned Size, bool KillSrc) const;
+
+ MachineInstrBuilder
+ emitGRX32Move(MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI,
+ const DebugLoc &DL, unsigned DestReg, unsigned SrcReg,
+ unsigned LowLowOpcode, unsigned Size, bool KillSrc,
+ bool UndefSrc) const;
+
virtual void anchor();
protected:
@@ -203,7 +219,7 @@ public:
unsigned FalseReg) const override;
bool FoldImmediate(MachineInstr &UseMI, MachineInstr &DefMI, unsigned Reg,
MachineRegisterInfo *MRI) const override;
- bool isPredicable(MachineInstr &MI) const override;
+ bool isPredicable(const MachineInstr &MI) const override;
bool isProfitableToIfCvt(MachineBasicBlock &MBB, unsigned NumCycles,
unsigned ExtraPredCycles,
BranchProbability Probability) const override;
@@ -304,6 +320,7 @@ public:
areMemAccessesTriviallyDisjoint(MachineInstr &MIa, MachineInstr &MIb,
AliasAnalysis *AA = nullptr) const override;
};
+
} // end namespace llvm
-#endif
+#endif // LLVM_LIB_TARGET_SYSTEMZ_SYSTEMZINSTRINFO_H
diff --git a/lib/Target/SystemZ/SystemZInstrVector.td b/lib/Target/SystemZ/SystemZInstrVector.td
index 738ea7a33729..0158fe6aec08 100644
--- a/lib/Target/SystemZ/SystemZInstrVector.td
+++ b/lib/Target/SystemZ/SystemZInstrVector.td
@@ -56,17 +56,28 @@ def : VectorExtractSubreg<v4i32, VLGVF>;
//===----------------------------------------------------------------------===//
let Predicates = [FeatureVector] in {
- // Generate byte mask.
- def VZERO : InherentVRIa<"vzero", 0xE744, 0>;
- def VONE : InherentVRIa<"vone", 0xE744, 0xffff>;
- def VGBM : UnaryVRIa<"vgbm", 0xE744, z_byte_mask, v128b, imm32zx16>;
-
- // Generate mask.
- def VGM : BinaryVRIbGeneric<"vgm", 0xE746>;
- def VGMB : BinaryVRIb<"vgmb", 0xE746, z_rotate_mask, v128b, 0>;
- def VGMH : BinaryVRIb<"vgmh", 0xE746, z_rotate_mask, v128h, 1>;
- def VGMF : BinaryVRIb<"vgmf", 0xE746, z_rotate_mask, v128f, 2>;
- def VGMG : BinaryVRIb<"vgmg", 0xE746, z_rotate_mask, v128g, 3>;
+ let hasSideEffects = 0, isAsCheapAsAMove = 1, isMoveImm = 1,
+ isReMaterializable = 1 in {
+
+ // Generate byte mask.
+ def VZERO : InherentVRIa<"vzero", 0xE744, 0>;
+ def VONE : InherentVRIa<"vone", 0xE744, 0xffff>;
+ def VGBM : UnaryVRIa<"vgbm", 0xE744, z_byte_mask, v128b, imm32zx16>;
+
+ // Generate mask.
+ def VGM : BinaryVRIbGeneric<"vgm", 0xE746>;
+ def VGMB : BinaryVRIb<"vgmb", 0xE746, z_rotate_mask, v128b, 0>;
+ def VGMH : BinaryVRIb<"vgmh", 0xE746, z_rotate_mask, v128h, 1>;
+ def VGMF : BinaryVRIb<"vgmf", 0xE746, z_rotate_mask, v128f, 2>;
+ def VGMG : BinaryVRIb<"vgmg", 0xE746, z_rotate_mask, v128g, 3>;
+
+ // Replicate immediate.
+ def VREPI : UnaryVRIaGeneric<"vrepi", 0xE745, imm32sx16>;
+ def VREPIB : UnaryVRIa<"vrepib", 0xE745, z_replicate, v128b, imm32sx16, 0>;
+ def VREPIH : UnaryVRIa<"vrepih", 0xE745, z_replicate, v128h, imm32sx16, 1>;
+ def VREPIF : UnaryVRIa<"vrepif", 0xE745, z_replicate, v128f, imm32sx16, 2>;
+ def VREPIG : UnaryVRIa<"vrepig", 0xE745, z_replicate, v128g, imm32sx16, 3>;
+ }
// Load element immediate.
//
@@ -86,13 +97,6 @@ let Predicates = [FeatureVector] in {
def VLEIG : TernaryVRIa<"vleig", 0xE742, z_vector_insert,
v128g, v128g, imm64sx16, imm32zx1>;
}
-
- // Replicate immediate.
- def VREPI : UnaryVRIaGeneric<"vrepi", 0xE745, imm32sx16>;
- def VREPIB : UnaryVRIa<"vrepib", 0xE745, z_replicate, v128b, imm32sx16, 0>;
- def VREPIH : UnaryVRIa<"vrepih", 0xE745, z_replicate, v128h, imm32sx16, 1>;
- def VREPIF : UnaryVRIa<"vrepif", 0xE745, z_replicate, v128f, imm32sx16, 2>;
- def VREPIG : UnaryVRIa<"vrepig", 0xE745, z_replicate, v128g, imm32sx16, 3>;
}
//===----------------------------------------------------------------------===//
diff --git a/lib/Target/SystemZ/SystemZLongBranch.cpp b/lib/Target/SystemZ/SystemZLongBranch.cpp
index 14ff6afbd4ae..791f0334e0f1 100644
--- a/lib/Target/SystemZ/SystemZLongBranch.cpp
+++ b/lib/Target/SystemZ/SystemZLongBranch.cpp
@@ -53,15 +53,21 @@
//
//===----------------------------------------------------------------------===//
+#include "SystemZ.h"
+#include "SystemZInstrInfo.h"
#include "SystemZTargetMachine.h"
+#include "llvm/ADT/SmallVector.h"
#include "llvm/ADT/Statistic.h"
+#include "llvm/ADT/StringRef.h"
+#include "llvm/CodeGen/MachineBasicBlock.h"
+#include "llvm/CodeGen/MachineFunction.h"
#include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/CodeGen/MachineInstr.h"
#include "llvm/CodeGen/MachineInstrBuilder.h"
-#include "llvm/IR/Function.h"
-#include "llvm/Support/MathExtras.h"
-#include "llvm/Target/TargetInstrInfo.h"
-#include "llvm/Target/TargetMachine.h"
-#include "llvm/Target/TargetRegisterInfo.h"
+#include "llvm/IR/DebugLoc.h"
+#include "llvm/Support/ErrorHandling.h"
+#include <cassert>
+#include <cstdint>
using namespace llvm;
@@ -70,72 +76,72 @@ using namespace llvm;
STATISTIC(LongBranches, "Number of long branches.");
namespace {
+
// Represents positional information about a basic block.
struct MBBInfo {
// The address that we currently assume the block has.
- uint64_t Address;
+ uint64_t Address = 0;
// The size of the block in bytes, excluding terminators.
// This value never changes.
- uint64_t Size;
+ uint64_t Size = 0;
// The minimum alignment of the block, as a log2 value.
// This value never changes.
- unsigned Alignment;
+ unsigned Alignment = 0;
// The number of terminators in this block. This value never changes.
- unsigned NumTerminators;
+ unsigned NumTerminators = 0;
- MBBInfo()
- : Address(0), Size(0), Alignment(0), NumTerminators(0) {}
+ MBBInfo() = default;
};
// Represents the state of a block terminator.
struct TerminatorInfo {
// If this terminator is a relaxable branch, this points to the branch
// instruction, otherwise it is null.
- MachineInstr *Branch;
+ MachineInstr *Branch = nullptr;
// The address that we currently assume the terminator has.
- uint64_t Address;
+ uint64_t Address = 0;
// The current size of the terminator in bytes.
- uint64_t Size;
+ uint64_t Size = 0;
// If Branch is nonnull, this is the number of the target block,
// otherwise it is unused.
- unsigned TargetBlock;
+ unsigned TargetBlock = 0;
// If Branch is nonnull, this is the length of the longest relaxed form,
// otherwise it is zero.
- unsigned ExtraRelaxSize;
+ unsigned ExtraRelaxSize = 0;
- TerminatorInfo() : Branch(nullptr), Size(0), TargetBlock(0),
- ExtraRelaxSize(0) {}
+ TerminatorInfo() = default;
};
// Used to keep track of the current position while iterating over the blocks.
struct BlockPosition {
// The address that we assume this position has.
- uint64_t Address;
+ uint64_t Address = 0;
// The number of low bits in Address that are known to be the same
// as the runtime address.
unsigned KnownBits;
- BlockPosition(unsigned InitialAlignment)
- : Address(0), KnownBits(InitialAlignment) {}
+ BlockPosition(unsigned InitialAlignment) : KnownBits(InitialAlignment) {}
};
class SystemZLongBranch : public MachineFunctionPass {
public:
static char ID;
+
SystemZLongBranch(const SystemZTargetMachine &tm)
- : MachineFunctionPass(ID), TII(nullptr) {}
+ : MachineFunctionPass(ID) {}
StringRef getPassName() const override { return "SystemZ Long Branch"; }
bool runOnMachineFunction(MachineFunction &F) override;
+
MachineFunctionProperties getRequiredProperties() const override {
return MachineFunctionProperties().set(
MachineFunctionProperties::Property::NoVRegs);
@@ -155,7 +161,7 @@ private:
void relaxBranch(TerminatorInfo &Terminator);
void relaxBranches();
- const SystemZInstrInfo *TII;
+ const SystemZInstrInfo *TII = nullptr;
MachineFunction *MF;
SmallVector<MBBInfo, 16> MBBs;
SmallVector<TerminatorInfo, 16> Terminators;
@@ -165,11 +171,8 @@ char SystemZLongBranch::ID = 0;
const uint64_t MaxBackwardRange = 0x10000;
const uint64_t MaxForwardRange = 0xfffe;
-} // end anonymous namespace
-FunctionPass *llvm::createSystemZLongBranchPass(SystemZTargetMachine &TM) {
- return new SystemZLongBranch(TM);
-}
+} // end anonymous namespace
// Position describes the state immediately before Block. Update Block
// accordingly and move Position to the end of the block's non-terminator
@@ -354,13 +357,13 @@ void SystemZLongBranch::splitBranchOnCount(MachineInstr *MI,
MachineBasicBlock *MBB = MI->getParent();
DebugLoc DL = MI->getDebugLoc();
BuildMI(*MBB, MI, DL, TII->get(AddOpcode))
- .addOperand(MI->getOperand(0))
- .addOperand(MI->getOperand(1))
- .addImm(-1);
+ .add(MI->getOperand(0))
+ .add(MI->getOperand(1))
+ .addImm(-1);
MachineInstr *BRCL = BuildMI(*MBB, MI, DL, TII->get(SystemZ::BRCL))
- .addImm(SystemZ::CCMASK_ICMP)
- .addImm(SystemZ::CCMASK_CMP_NE)
- .addOperand(MI->getOperand(2));
+ .addImm(SystemZ::CCMASK_ICMP)
+ .addImm(SystemZ::CCMASK_CMP_NE)
+ .add(MI->getOperand(2));
// The implicit use of CC is a killing use.
BRCL->addRegisterKilled(SystemZ::CC, &TII->getRegisterInfo());
MI->eraseFromParent();
@@ -373,12 +376,12 @@ void SystemZLongBranch::splitCompareBranch(MachineInstr *MI,
MachineBasicBlock *MBB = MI->getParent();
DebugLoc DL = MI->getDebugLoc();
BuildMI(*MBB, MI, DL, TII->get(CompareOpcode))
- .addOperand(MI->getOperand(0))
- .addOperand(MI->getOperand(1));
+ .add(MI->getOperand(0))
+ .add(MI->getOperand(1));
MachineInstr *BRCL = BuildMI(*MBB, MI, DL, TII->get(SystemZ::BRCL))
- .addImm(SystemZ::CCMASK_ICMP)
- .addOperand(MI->getOperand(2))
- .addOperand(MI->getOperand(3));
+ .addImm(SystemZ::CCMASK_ICMP)
+ .add(MI->getOperand(2))
+ .add(MI->getOperand(3));
// The implicit use of CC is a killing use.
BRCL->addRegisterKilled(SystemZ::CC, &TII->getRegisterInfo());
MI->eraseFromParent();
@@ -463,3 +466,7 @@ bool SystemZLongBranch::runOnMachineFunction(MachineFunction &F) {
relaxBranches();
return true;
}
+
+FunctionPass *llvm::createSystemZLongBranchPass(SystemZTargetMachine &TM) {
+ return new SystemZLongBranch(TM);
+}
diff --git a/lib/Target/SystemZ/SystemZMachineScheduler.h b/lib/Target/SystemZ/SystemZMachineScheduler.h
index b919758b70e7..12357e0348a9 100644
--- a/lib/Target/SystemZ/SystemZMachineScheduler.h
+++ b/lib/Target/SystemZ/SystemZMachineScheduler.h
@@ -1,4 +1,4 @@
-//==-- SystemZMachineScheduler.h - SystemZ Scheduler Interface -*- C++ -*---==//
+//==- SystemZMachineScheduler.h - SystemZ Scheduler Interface ----*- C++ -*-==//
//
// The LLVM Compiler Infrastructure
//
@@ -14,10 +14,10 @@
// usage of processor resources.
//===----------------------------------------------------------------------===//
-#include "SystemZInstrInfo.h"
#include "SystemZHazardRecognizer.h"
#include "llvm/CodeGen/MachineScheduler.h"
-#include "llvm/Support/Debug.h"
+#include "llvm/CodeGen/ScheduleDAG.h"
+#include <set>
#ifndef LLVM_LIB_TARGET_SYSTEMZ_SYSTEMZMACHINESCHEDULER_H
#define LLVM_LIB_TARGET_SYSTEMZ_SYSTEMZMACHINESCHEDULER_H
@@ -28,29 +28,29 @@ namespace llvm {
/// A MachineSchedStrategy implementation for SystemZ post RA scheduling.
class SystemZPostRASchedStrategy : public MachineSchedStrategy {
- ScheduleDAGMI *DAG;
+ ScheduleDAGMI *DAG;
/// A candidate during instruction evaluation.
struct Candidate {
- SUnit *SU;
+ SUnit *SU = nullptr;
/// The decoding cost.
- int GroupingCost;
+ int GroupingCost = 0;
/// The processor resources cost.
- int ResourcesCost;
+ int ResourcesCost = 0;
- Candidate() : SU(nullptr), GroupingCost(0), ResourcesCost(0) {}
+ Candidate() = default;
Candidate(SUnit *SU_, SystemZHazardRecognizer &HazardRec);
// Compare two candidates.
bool operator<(const Candidate &other);
// Check if this node is free of cost ("as good as any").
- bool inline noCost() {
+ bool noCost() const {
return (GroupingCost <= 0 && !ResourcesCost);
}
- };
+ };
// A sorter for the Available set that makes sure that SUs are considered
// in the best order.
@@ -83,7 +83,7 @@ class SystemZPostRASchedStrategy : public MachineSchedStrategy {
// region.
SystemZHazardRecognizer HazardRec;
- public:
+public:
SystemZPostRASchedStrategy(const MachineSchedContext *C);
/// PostRA scheduling does not track pressure.
@@ -107,6 +107,6 @@ class SystemZPostRASchedStrategy : public MachineSchedStrategy {
void releaseBottomNode(SUnit *SU) override {};
};
-} // namespace llvm
+} // end namespace llvm
-#endif /* LLVM_LIB_TARGET_SYSTEMZ_SYSTEMZMACHINESCHEDULER_H */
+#endif // LLVM_LIB_TARGET_SYSTEMZ_SYSTEMZMACHINESCHEDULER_H
diff --git a/lib/Target/SystemZ/SystemZScheduleZ13.td b/lib/Target/SystemZ/SystemZScheduleZ13.td
index e97d61d8355d..7aee6f52e9a7 100644
--- a/lib/Target/SystemZ/SystemZScheduleZ13.td
+++ b/lib/Target/SystemZ/SystemZScheduleZ13.td
@@ -855,8 +855,8 @@ def : InstRW<[VecXsPm], (instregex "VZERO$")>;
def : InstRW<[VecXsPm], (instregex "VONE$")>;
def : InstRW<[VecXsPm], (instregex "VGBM$")>;
def : InstRW<[VecXsPm], (instregex "VGM(B|F|G|H)?$")>;
-def : InstRW<[VecXsPm], (instregex "VLEI(B|F|G|H)$")>;
def : InstRW<[VecXsPm], (instregex "VREPI(B|F|G|H)?$")>;
+def : InstRW<[VecXsPm], (instregex "VLEI(B|F|G|H)$")>;
//===----------------------------------------------------------------------===//
// Vector: Loads
diff --git a/lib/Target/SystemZ/SystemZShortenInst.cpp b/lib/Target/SystemZ/SystemZShortenInst.cpp
index 83882fc0310a..263aff8b7bfb 100644
--- a/lib/Target/SystemZ/SystemZShortenInst.cpp
+++ b/lib/Target/SystemZ/SystemZShortenInst.cpp
@@ -167,10 +167,10 @@ bool SystemZShortenInst::shortenFPConv(MachineInstr &MI, unsigned Opcode) {
MI.RemoveOperand(0);
MI.setDesc(TII->get(Opcode));
MachineInstrBuilder(*MI.getParent()->getParent(), &MI)
- .addOperand(Dest)
- .addOperand(Mode)
- .addOperand(Src)
- .addOperand(Suppress);
+ .add(Dest)
+ .add(Mode)
+ .add(Src)
+ .add(Suppress);
return true;
}
return false;
diff --git a/lib/Target/SystemZ/SystemZTargetMachine.cpp b/lib/Target/SystemZ/SystemZTargetMachine.cpp
index 33fdb8f90825..ede5005fa491 100644
--- a/lib/Target/SystemZ/SystemZTargetMachine.cpp
+++ b/lib/Target/SystemZ/SystemZTargetMachine.cpp
@@ -7,14 +7,25 @@
//
//===----------------------------------------------------------------------===//
+#include "MCTargetDesc/SystemZMCTargetDesc.h"
+#include "SystemZ.h"
+#include "SystemZMachineScheduler.h"
#include "SystemZTargetMachine.h"
#include "SystemZTargetTransformInfo.h"
-#include "SystemZMachineScheduler.h"
+#include "llvm/ADT/Optional.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/ADT/STLExtras.h"
+#include "llvm/ADT/StringRef.h"
+#include "llvm/Analysis/TargetTransformInfo.h"
#include "llvm/CodeGen/Passes.h"
+#include "llvm/CodeGen/TargetLoweringObjectFileImpl.h"
#include "llvm/CodeGen/TargetPassConfig.h"
+#include "llvm/IR/DataLayout.h"
+#include "llvm/Support/CodeGen.h"
#include "llvm/Support/TargetRegistry.h"
+#include "llvm/Target/TargetLoweringObjectFile.h"
#include "llvm/Transforms/Scalar.h"
-#include "llvm/CodeGen/TargetLoweringObjectFileImpl.h"
+#include <string>
using namespace llvm;
@@ -48,7 +59,7 @@ static bool UsesVectorABI(StringRef CPU, StringRef FS) {
static std::string computeDataLayout(const Triple &TT, StringRef CPU,
StringRef FS) {
bool VectorABI = UsesVectorABI(CPU, FS);
- std::string Ret = "";
+ std::string Ret;
// Big endian.
Ret += "E";
@@ -96,14 +107,15 @@ SystemZTargetMachine::SystemZTargetMachine(const Target &T, const Triple &TT,
CodeGenOpt::Level OL)
: LLVMTargetMachine(T, computeDataLayout(TT, CPU, FS), TT, CPU, FS, Options,
getEffectiveRelocModel(RM), CM, OL),
- TLOF(make_unique<TargetLoweringObjectFileELF>()),
+ TLOF(llvm::make_unique<TargetLoweringObjectFileELF>()),
Subtarget(TT, CPU, FS, *this) {
initAsmInfo();
}
-SystemZTargetMachine::~SystemZTargetMachine() {}
+SystemZTargetMachine::~SystemZTargetMachine() = default;
namespace {
+
/// SystemZ Code Generator Pass Configuration Options.
class SystemZPassConfig : public TargetPassConfig {
public:
@@ -116,7 +128,8 @@ public:
ScheduleDAGInstrs *
createPostMachineScheduler(MachineSchedContext *C) const override {
- return new ScheduleDAGMI(C, make_unique<SystemZPostRASchedStrategy>(C),
+ return new ScheduleDAGMI(C,
+ llvm::make_unique<SystemZPostRASchedStrategy>(C),
/*RemoveKillFlags=*/true);
}
@@ -126,6 +139,7 @@ public:
void addPreSched2() override;
void addPreEmitPass() override;
};
+
} // end anonymous namespace
void SystemZPassConfig::addIRPasses() {
@@ -157,7 +171,6 @@ void SystemZPassConfig::addPreSched2() {
}
void SystemZPassConfig::addPreEmitPass() {
-
// Do instruction shortening before compare elimination because some
// vector instructions will be shortened into opcodes that compare
// elimination recognizes.
diff --git a/lib/Target/SystemZ/SystemZTargetMachine.h b/lib/Target/SystemZ/SystemZTargetMachine.h
index 69cf9bc6e525..a10ca64fa632 100644
--- a/lib/Target/SystemZ/SystemZTargetMachine.h
+++ b/lib/Target/SystemZ/SystemZTargetMachine.h
@@ -1,4 +1,4 @@
-//==- SystemZTargetMachine.h - Define TargetMachine for SystemZ ---*- C++ -*-=//
+//=- SystemZTargetMachine.h - Define TargetMachine for SystemZ ----*- C++ -*-=//
//
// The LLVM Compiler Infrastructure
//
@@ -16,15 +16,18 @@
#define LLVM_LIB_TARGET_SYSTEMZ_SYSTEMZTARGETMACHINE_H
#include "SystemZSubtarget.h"
+#include "llvm/ADT/Optional.h"
+#include "llvm/ADT/StringRef.h"
+#include "llvm/Analysis/TargetTransformInfo.h"
+#include "llvm/Support/CodeGen.h"
#include "llvm/Target/TargetMachine.h"
+#include <memory>
namespace llvm {
-class TargetFrameLowering;
-
class SystemZTargetMachine : public LLVMTargetMachine {
std::unique_ptr<TargetLoweringObjectFile> TLOF;
- SystemZSubtarget Subtarget;
+ SystemZSubtarget Subtarget;
public:
SystemZTargetMachine(const Target &T, const Triple &TT, StringRef CPU,
@@ -34,20 +37,22 @@ public:
~SystemZTargetMachine() override;
const SystemZSubtarget *getSubtargetImpl() const { return &Subtarget; }
+
const SystemZSubtarget *getSubtargetImpl(const Function &) const override {
return &Subtarget;
}
+
// Override LLVMTargetMachine
TargetPassConfig *createPassConfig(PassManagerBase &PM) override;
TargetIRAnalysis getTargetIRAnalysis() override;
+
TargetLoweringObjectFile *getObjFileLowering() const override {
return TLOF.get();
}
bool targetSchedulesPostRAScheduling() const override { return true; };
-
};
} // end namespace llvm
-#endif
+#endif // LLVM_LIB_TARGET_SYSTEMZ_SYSTEMZTARGETMACHINE_H
diff --git a/lib/Target/SystemZ/SystemZTargetTransformInfo.cpp b/lib/Target/SystemZ/SystemZTargetTransformInfo.cpp
index b10c0e09a0d4..e74c9a80515d 100644
--- a/lib/Target/SystemZ/SystemZTargetTransformInfo.cpp
+++ b/lib/Target/SystemZ/SystemZTargetTransformInfo.cpp
@@ -259,11 +259,8 @@ void SystemZTTIImpl::getUnrollingPreferences(Loop *L,
}
}
if (isa<StoreInst>(&I)) {
- NumStores++;
Type *MemAccessTy = I.getOperand(0)->getType();
- if((MemAccessTy->isIntegerTy() || MemAccessTy->isFloatingPointTy()) &&
- (getDataLayout().getTypeSizeInBits(MemAccessTy) == 128))
- NumStores++; // 128 bit fp/int stores get split.
+ NumStores += getMemoryOpCost(Instruction::Store, MemAccessTy, 0, 0);
}
}
@@ -313,3 +310,547 @@ unsigned SystemZTTIImpl::getRegisterBitWidth(bool Vector) {
return 0;
}
+int SystemZTTIImpl::getArithmeticInstrCost(
+ unsigned Opcode, Type *Ty,
+ TTI::OperandValueKind Op1Info, TTI::OperandValueKind Op2Info,
+ TTI::OperandValueProperties Opd1PropInfo,
+ TTI::OperandValueProperties Opd2PropInfo,
+ ArrayRef<const Value *> Args) {
+
+ // TODO: return a good value for BB-VECTORIZER that includes the
+ // immediate loads, which we do not want to count for the loop
+ // vectorizer, since they are hopefully hoisted out of the loop. This
+ // would require a new parameter 'InLoop', but it is not clear whether
+ // constant args are common enough to motivate this.
+
+ unsigned ScalarBits = Ty->getScalarSizeInBits();
+
+ if (Ty->isVectorTy()) {
+ assert(ST->hasVector() && "getArithmeticInstrCost() called with vector type.");
+ unsigned VF = Ty->getVectorNumElements();
+ unsigned NumVectors = getNumberOfParts(Ty);
+
+ // These vector operations are custom handled, but are still supported
+ // with one instruction per vector, regardless of element size.
+ if (Opcode == Instruction::Shl || Opcode == Instruction::LShr ||
+ Opcode == Instruction::AShr) {
+ return NumVectors;
+ }
+
+ // These FP operations are supported with a single vector instruction for
+ // double (base implementation assumes float generally costs 2). For
+ // FP128, the scalar cost is 1, and there is no overhead since the values
+ // are already in scalar registers.
+ if (Opcode == Instruction::FAdd || Opcode == Instruction::FSub ||
+ Opcode == Instruction::FMul || Opcode == Instruction::FDiv) {
+ switch (ScalarBits) {
+ case 32: {
+ // Return the cost of multiple scalar invocations plus the cost of
+ // inserting and extracting the values.
+ unsigned ScalarCost = getArithmeticInstrCost(Opcode, Ty->getScalarType());
+ unsigned Cost = (VF * ScalarCost) + getScalarizationOverhead(Ty, Args);
+ // FIXME: VF 2 for these FP operations are currently just as
+ // expensive as for VF 4.
+ if (VF == 2)
+ Cost *= 2;
+ return Cost;
+ }
+ case 64:
+ case 128:
+ return NumVectors;
+ default:
+ break;
+ }
+ }
+
+ // There is no native support for FRem.
+ if (Opcode == Instruction::FRem) {
+ unsigned Cost = (VF * LIBCALL_COST) + getScalarizationOverhead(Ty, Args);
+ // FIXME: VF 2 for float is currently just as expensive as for VF 4.
+ if (VF == 2 && ScalarBits == 32)
+ Cost *= 2;
+ return Cost;
+ }
+ } else { // Scalar:
+ // These FP operations are supported with a dedicated instruction for
+ // float, double and fp128 (base implementation assumes float generally
+ // costs 2).
+ if (Opcode == Instruction::FAdd || Opcode == Instruction::FSub ||
+ Opcode == Instruction::FMul || Opcode == Instruction::FDiv)
+ return 1;
+
+ // There is no native support for FRem.
+ if (Opcode == Instruction::FRem)
+ return LIBCALL_COST;
+
+ if (Opcode == Instruction::LShr || Opcode == Instruction::AShr)
+ return (ScalarBits >= 32 ? 1 : 2 /*ext*/);
+
+ // Or requires one instruction, although it has custom handling for i64.
+ if (Opcode == Instruction::Or)
+ return 1;
+
+ if (Opcode == Instruction::Xor && ScalarBits == 1)
+ // 2 * ipm sequences ; xor ; shift ; compare
+ return 7;
+
+ // An extra extension for narrow types is needed.
+ if ((Opcode == Instruction::SDiv || Opcode == Instruction::SRem))
+ // sext of op(s) for narrow types
+ return (ScalarBits < 32 ? 4 : (ScalarBits == 32 ? 2 : 1));
+
+ if (Opcode == Instruction::UDiv || Opcode == Instruction::URem)
+ // Clearing of low 64 bit reg + sext of op(s) for narrow types + dl[g]r
+ return (ScalarBits < 32 ? 4 : 2);
+ }
+
+ // Fallback to the default implementation.
+ return BaseT::getArithmeticInstrCost(Opcode, Ty, Op1Info, Op2Info,
+ Opd1PropInfo, Opd2PropInfo, Args);
+}
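
A few of the scalar rules above, mirrored as a standalone sketch (the constants match the code; LIBCALL_COST is 30 in this patch):

#include <cassert>
unsigned scalarShiftCost(unsigned Bits) { return Bits >= 32 ? 1 : 2; }
unsigned scalarSDivCost(unsigned Bits) {
  return Bits < 32 ? 4 : (Bits == 32 ? 2 : 1);
}
unsigned scalarUDivCost(unsigned Bits) { return Bits < 32 ? 4 : 2; }
int main() {
  assert(scalarShiftCost(16) == 2); // narrow shifts pay for an extension
  assert(scalarSDivCost(32) == 2);  // sext of operand(s) plus the divide
  assert(scalarUDivCost(64) == 2);  // clear low 64-bit reg, then dlgr
}
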
+
+
+int SystemZTTIImpl::getShuffleCost(TTI::ShuffleKind Kind, Type *Tp, int Index,
+ Type *SubTp) {
+ assert(Tp->isVectorTy());
+ assert(ST->hasVector() && "getShuffleCost() called without vector support.");
+ unsigned NumVectors = getNumberOfParts(Tp);
+
+ // TODO: Since fp32 is expanded, the shuffle cost should always be 0.
+
+ // FP128 values are always in scalar registers, so there is no work
+ // involved with a shuffle, except for broadcast. In that case register
+ // moves are done with a single instruction per element.
+ if (Tp->getScalarType()->isFP128Ty())
+ return (Kind == TargetTransformInfo::SK_Broadcast ? NumVectors - 1 : 0);
+
+ switch (Kind) {
+ case TargetTransformInfo::SK_ExtractSubvector:
+ // ExtractSubvector Index indicates start offset.
+
+ // Extracting a subvector from the first index is a no-op.
+ return (Index == 0 ? 0 : NumVectors);
+
+ case TargetTransformInfo::SK_Broadcast:
+ // Loop vectorizer calls here to figure out the extra cost of
+ // broadcasting a loaded value to all elements of a vector. Since vlrep
+ // loads and replicates with a single instruction, adjust the returned
+ // value.
+ return NumVectors - 1;
+
+ default:
+ // SystemZ supports single-instruction permutation / replication.
+ return NumVectors;
+ }
+}
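
The resulting shuffle prices, restated as a sketch for a type spanning NumVectors 128-bit registers (fp128 excluded, per the early return above):

#include <cassert>
static int shuffleCostSketch(bool IsBroadcast, bool IsExtractAtZero,
                             int NumVectors) {
  if (IsExtractAtZero)
    return 0;              // extracting the leading subvector is a no-op
  if (IsBroadcast)
    return NumVectors - 1; // vlrep loads and replicates in one instruction
  return NumVectors;       // one permute/replicate per register otherwise
}
int main() { assert(shuffleCostSketch(true, false, 1) == 0); }
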
+
+// Return the log2 difference of the element sizes of the two vector types.
+static unsigned getElSizeLog2Diff(Type *Ty0, Type *Ty1) {
+ unsigned Bits0 = Ty0->getScalarSizeInBits();
+ unsigned Bits1 = Ty1->getScalarSizeInBits();
+
+ if (Bits1 > Bits0)
+ return (Log2_32(Bits1) - Log2_32(Bits0));
+
+ return (Log2_32(Bits0) - Log2_32(Bits1));
+}
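
A worked example of the helper, as a self-contained sketch: the result is the number of element-size halvings (or doublings) between the types, so i64 -> i8 gives |log2(64) - log2(8)| = 3.

#include <cassert>
static unsigned elSizeLog2DiffSketch(unsigned Bits0, unsigned Bits1) {
  unsigned L0 = 0, L1 = 0;
  for (unsigned V = Bits0; V >>= 1;) ++L0; // integer log2
  for (unsigned V = Bits1; V >>= 1;) ++L1;
  return L0 > L1 ? L0 - L1 : L1 - L0;
}
int main() { assert(elSizeLog2DiffSketch(64, 8) == 3); }
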
+
+// Return the number of instructions needed to truncate SrcTy to DstTy.
+unsigned SystemZTTIImpl::
+getVectorTruncCost(Type *SrcTy, Type *DstTy) {
+ assert(SrcTy->isVectorTy() && DstTy->isVectorTy());
+ assert(SrcTy->getPrimitiveSizeInBits() > DstTy->getPrimitiveSizeInBits() &&
+ "Packing must reduce size of vector type.");
+ assert(SrcTy->getVectorNumElements() == DstTy->getVectorNumElements() &&
+ "Packing should not change number of elements.");
+
+ // TODO: Since fp32 is expanded, the extract cost should always be 0.
+
+ unsigned NumParts = getNumberOfParts(SrcTy);
+ if (NumParts <= 2)
+ // Up to 2 vector registers can be truncated efficiently with pack or
+ // permute. The latter requires an immediate mask to be loaded, which
+ // typically gets hoisted out of a loop. TODO: return a good value for
+ // BB-VECTORIZER that includes the immediate loads, which we do not want
+ // to count for the loop vectorizer.
+ return 1;
+
+ unsigned Cost = 0;
+ unsigned Log2Diff = getElSizeLog2Diff(SrcTy, DstTy);
+ unsigned VF = SrcTy->getVectorNumElements();
+ for (unsigned P = 0; P < Log2Diff; ++P) {
+ if (NumParts > 1)
+ NumParts /= 2;
+ Cost += NumParts;
+ }
+
+ // Currently, isel emits a general mix of permute and pack instructions
+ // that follows the cost computation above, except for this case, which
+ // takes one instruction less:
+ if (VF == 8 && SrcTy->getScalarSizeInBits() == 64 &&
+ DstTy->getScalarSizeInBits() == 8)
+ Cost--;
+
+ return Cost;
+}
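
Worked example of the recurrence, as a standalone sketch: truncating <8 x i64> (four 128-bit parts) to <8 x i8> has Log2Diff 3, so the parts go 4 -> 2 -> 1 -> 1 for a cost of 2+1+1 = 4, and the special-cased pattern above then subtracts one, giving 3.

#include <cassert>
static unsigned vectorTruncCostSketch(unsigned NumParts, unsigned Log2Diff) {
  unsigned Cost = 0;
  for (unsigned P = 0; P < Log2Diff; ++P) {
    if (NumParts > 1)
      NumParts /= 2;
    Cost += NumParts; // one pack/permute per surviving register
  }
  return Cost;
}
int main() { assert(vectorTruncCostSketch(4, 3) == 4); }
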
+
+// Return the cost of converting a vector bitmask produced by a compare
+// (SrcTy), to the type of the select or extend instruction (DstTy).
+unsigned SystemZTTIImpl::
+getVectorBitmaskConversionCost(Type *SrcTy, Type *DstTy) {
+ assert(SrcTy->isVectorTy() && DstTy->isVectorTy() &&
+ "Should only be called with vector types.");
+
+ unsigned PackCost = 0;
+ unsigned SrcScalarBits = SrcTy->getScalarSizeInBits();
+ unsigned DstScalarBits = DstTy->getScalarSizeInBits();
+ unsigned Log2Diff = getElSizeLog2Diff(SrcTy, DstTy);
+ if (SrcScalarBits > DstScalarBits)
+ // The bitmask will be truncated.
+ PackCost = getVectorTruncCost(SrcTy, DstTy);
+ else if (SrcScalarBits < DstScalarBits) {
+ unsigned DstNumParts = getNumberOfParts(DstTy);
+ // Each vector select needs its part of the bitmask unpacked.
+ PackCost = Log2Diff * DstNumParts;
+ // Extra cost for moving part of mask before unpacking.
+ PackCost += DstNumParts - 1;
+ }
+
+ return PackCost;
+}
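
The widening direction in a compact sketch: one unpack per width doubling for each destination register, plus the moves that feed the upper parts. For example, <4 x i32> compare masks feeding <4 x i64> selects give Log2Diff 1 and DstNumParts 2, so 1*2 + 1 = 3.

#include <cassert>
static unsigned maskWidenCostSketch(unsigned Log2Diff, unsigned DstNumParts) {
  return Log2Diff * DstNumParts + (DstNumParts - 1);
}
int main() { assert(maskWidenCostSketch(1, 2) == 3); }
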
+
+// Return the type of the compared operands. This is needed to compute the
+// cost for a Select / ZExt or SExt instruction.
+static Type *getCmpOpsType(const Instruction *I, unsigned VF = 1) {
+ Type *OpTy = nullptr;
+ if (CmpInst *CI = dyn_cast<CmpInst>(I->getOperand(0)))
+ OpTy = CI->getOperand(0)->getType();
+ else if (Instruction *LogicI = dyn_cast<Instruction>(I->getOperand(0)))
+ if (CmpInst *CI0 = dyn_cast<CmpInst>(LogicI->getOperand(0)))
+ if (isa<CmpInst>(LogicI->getOperand(1)))
+ OpTy = CI0->getOperand(0)->getType();
+
+ if (OpTy != nullptr) {
+ if (VF == 1) {
+ assert(!OpTy->isVectorTy() && "Expected scalar type");
+ return OpTy;
+ }
+ // Return the potentially vectorized type based on 'I' and 'VF'. 'I' may
+ // be either scalar or already vectorized with a same or lesser VF.
+ Type *ElTy = OpTy->getScalarType();
+ return VectorType::get(ElTy, VF);
+ }
+
+ return nullptr;
+}
+
+int SystemZTTIImpl::getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src,
+ const Instruction *I) {
+ unsigned DstScalarBits = Dst->getScalarSizeInBits();
+ unsigned SrcScalarBits = Src->getScalarSizeInBits();
+
+ if (Src->isVectorTy()) {
+ assert(ST->hasVector() && "getCastInstrCost() called with vector type.");
+ assert(Dst->isVectorTy());
+ unsigned VF = Src->getVectorNumElements();
+ unsigned NumDstVectors = getNumberOfParts(Dst);
+ unsigned NumSrcVectors = getNumberOfParts(Src);
+
+ if (Opcode == Instruction::Trunc) {
+ if (Src->getScalarSizeInBits() == Dst->getScalarSizeInBits())
+ return 0; // Check for NOOP conversions.
+ return getVectorTruncCost(Src, Dst);
+ }
+
+ if (Opcode == Instruction::ZExt || Opcode == Instruction::SExt) {
+ if (SrcScalarBits >= 8) {
+ // ZExt/SExt will be handled with one unpack per doubling of width.
+ unsigned NumUnpacks = getElSizeLog2Diff(Src, Dst);
+
+ // For types that span multiple vector registers, some additional
+ // instructions are needed to set up the unpacking.
+ unsigned NumSrcVectorOps =
+ (NumUnpacks > 1 ? (NumDstVectors - NumSrcVectors)
+ : (NumDstVectors / 2));
+
+ return (NumUnpacks * NumDstVectors) + NumSrcVectorOps;
+ } else if (SrcScalarBits == 1) {
+ // This should be an extension of a compare i1 result.
+ // If we know the widths of the compared operands, get the cost of
+ // converting the compare bitmask to Dst. Otherwise assume the same
+ // widths.
+ unsigned Cost = 0;
+ Type *CmpOpTy = ((I != nullptr) ? getCmpOpsType(I, VF) : nullptr);
+ if (CmpOpTy != nullptr)
+ Cost = getVectorBitmaskConversionCost(CmpOpTy, Dst);
+ if (Opcode == Instruction::ZExt)
+ // One 'vn' per dst vector with an immediate mask.
+ Cost += NumDstVectors;
+ return Cost;
+ }
+ }
+
+ if (Opcode == Instruction::SIToFP || Opcode == Instruction::UIToFP ||
+ Opcode == Instruction::FPToSI || Opcode == Instruction::FPToUI) {
+ // TODO: Fix the base implementation, which could simplify things a bit
+ // here (it seems to miss differentiating between scalar and vector types).
+
+ // Only 64 bit vector conversions are natively supported.
+ if (SrcScalarBits == 64 && DstScalarBits == 64)
+ return NumDstVectors;
+
+ // Return the cost of multiple scalar invocations plus the cost of
+ // inserting and extracting the values. Base implementation does not
+ // realize float->int gets scalarized.
+ unsigned ScalarCost = getCastInstrCost(Opcode, Dst->getScalarType(),
+ Src->getScalarType());
+ unsigned TotCost = VF * ScalarCost;
+ bool NeedsInserts = true, NeedsExtracts = true;
+ // FP128 registers do not get inserted or extracted.
+ if (DstScalarBits == 128 &&
+ (Opcode == Instruction::SIToFP || Opcode == Instruction::UIToFP))
+ NeedsInserts = false;
+ if (SrcScalarBits == 128 &&
+ (Opcode == Instruction::FPToSI || Opcode == Instruction::FPToUI))
+ NeedsExtracts = false;
+
+ TotCost += getScalarizationOverhead(Dst, NeedsInserts, NeedsExtracts);
+
+ // FIXME: VF 2 for float<->i32 is currently just as expensive as for VF 4.
+ if (VF == 2 && SrcScalarBits == 32 && DstScalarBits == 32)
+ TotCost *= 2;
+
+ return TotCost;
+ }
+
+ if (Opcode == Instruction::FPTrunc) {
+ if (SrcScalarBits == 128) // fp128 -> double/float + inserts of elements.
+ return VF /*ldxbr/lexbr*/ + getScalarizationOverhead(Dst, true, false);
+ else // double -> float
+ return VF / 2 /*vledb*/ + std::max(1U, VF / 4 /*vperm*/);
+ }
+
+ if (Opcode == Instruction::FPExt) {
+ if (SrcScalarBits == 32 && DstScalarBits == 64) {
+ // float -> double is very rare and currently unoptimized. Instead of
+ // using vldeb, which can do two at a time, all conversions are
+ // scalarized.
+ return VF * 2;
+ }
+ // -> fp128. VF * lxdb/lxeb + extraction of elements.
+ return VF + getScalarizationOverhead(Src, false, true);
+ }
+ } else { // Scalar
+ assert(!Dst->isVectorTy());
+
+ if (Opcode == Instruction::SIToFP || Opcode == Instruction::UIToFP)
+ return (SrcScalarBits >= 32 ? 1 : 2 /*i8/i16 extend*/);
+
+ if ((Opcode == Instruction::ZExt || Opcode == Instruction::SExt) &&
+ Src->isIntegerTy(1)) {
+ // This should be extension of a compare i1 result, which is done with
+ // ipm and a varying sequence of instructions.
+ unsigned Cost = 0;
+ if (Opcode == Instruction::SExt)
+ Cost = (DstScalarBits < 64 ? 3 : 4);
+ if (Opcode == Instruction::ZExt)
+ Cost = 3;
+ Type *CmpOpTy = ((I != nullptr) ? getCmpOpsType(I) : nullptr);
+ if (CmpOpTy != nullptr && CmpOpTy->isFloatingPointTy())
+ // If operands of an fp type were compared, this costs +1.
+ Cost++;
+
+ return Cost;
+ }
+ }
+
+ return BaseT::getCastInstrCost(Opcode, Dst, Src, I);
+}
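
A worked example of the ZExt/SExt arithmetic above, as a sketch: widening <8 x i16> to <8 x i32> has one unpack per doubling (NumUnpacks 1), one source and two destination vectors, so NumSrcVectorOps is NumDstVectors/2 = 1 and the cost is 1*2 + 1 = 3.

#include <cassert>
static unsigned extCostSketch(unsigned NumUnpacks, unsigned NumSrcVectors,
                              unsigned NumDstVectors) {
  unsigned NumSrcVectorOps = NumUnpacks > 1 ? NumDstVectors - NumSrcVectors
                                            : NumDstVectors / 2;
  return NumUnpacks * NumDstVectors + NumSrcVectorOps;
}
int main() { assert(extCostSketch(1, 1, 2) == 3); } // <8 x i16> -> <8 x i32>
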
+
+int SystemZTTIImpl::getCmpSelInstrCost(unsigned Opcode, Type *ValTy, Type *CondTy,
+ const Instruction *I) {
+ if (ValTy->isVectorTy()) {
+ assert(ST->hasVector() && "getCmpSelInstrCost() called with vector type.");
+ assert(CondTy == nullptr || CondTy->isVectorTy());
+ unsigned VF = ValTy->getVectorNumElements();
+
+ // Called with a compare instruction.
+ if (Opcode == Instruction::ICmp || Opcode == Instruction::FCmp) {
+ unsigned PredicateExtraCost = 0;
+ if (I != nullptr) {
+ // Some predicates cost one or two extra instructions.
+ switch (cast<CmpInst>(I)->getPredicate()) {
+ case CmpInst::Predicate::ICMP_NE:
+ case CmpInst::Predicate::ICMP_UGE:
+ case CmpInst::Predicate::ICMP_ULE:
+ case CmpInst::Predicate::ICMP_SGE:
+ case CmpInst::Predicate::ICMP_SLE:
+ PredicateExtraCost = 1;
+ break;
+ case CmpInst::Predicate::FCMP_ONE:
+ case CmpInst::Predicate::FCMP_ORD:
+ case CmpInst::Predicate::FCMP_UEQ:
+ case CmpInst::Predicate::FCMP_UNO:
+ PredicateExtraCost = 2;
+ break;
+ default:
+ break;
+ }
+ }
+
+ // Float is handled with 2*vmr[lh]f + 2*vldeb + vfchdb for each pair of
+ // floats. FIXME: <2 x float> generates the same code as <4 x float>.
+ unsigned CmpCostPerVector = (ValTy->getScalarType()->isFloatTy() ? 10 : 1);
+ unsigned NumCmpVectors = getNumberOfParts(ValTy);
+
+ return NumCmpVectors * (CmpCostPerVector + PredicateExtraCost);
+ } else { // Called with a select instruction.
+ assert(Opcode == Instruction::Select);
+
+ // We can figure out the extra cost of packing / unpacking if the
+ // instruction was passed and the compare instruction is found.
+ unsigned PackCost = 0;
+ Type *CmpOpTy = ((I != nullptr) ? getCmpOpsType(I, VF) : nullptr);
+ if (CmpOpTy != nullptr)
+ PackCost =
+ getVectorBitmaskConversionCost(CmpOpTy, ValTy);
+
+ return getNumberOfParts(ValTy) /*vsel*/ + PackCost;
+ }
+ } else { // Scalar
+ switch (Opcode) {
+ case Instruction::ICmp: {
+ unsigned Cost = 1;
+ if (ValTy->isIntegerTy() && ValTy->getScalarSizeInBits() <= 16)
+ Cost += 2; // extend both operands
+ return Cost;
+ }
+ case Instruction::Select:
+ if (ValTy->isFloatingPointTy())
+ return 4; // No load on condition for FP, so this costs a conditional jump.
+ return 1; // Load On Condition.
+ }
+ }
+
+ return BaseT::getCmpSelInstrCost(Opcode, ValTy, CondTy, nullptr);
+}
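
One worked data point for the vector-compare pricing, as a sketch: an fcmp one on <4 x float> pays the float expansion (10 per 128-bit part) plus the 2-instruction penalty for the ONE predicate, so a single part costs 12.

#include <cassert>
static unsigned vecCmpCostSketch(unsigned NumParts, bool IsFloat,
                                 unsigned PredicateExtraCost) {
  unsigned CmpCostPerVector = IsFloat ? 10 : 1;
  return NumParts * (CmpCostPerVector + PredicateExtraCost);
}
int main() { assert(vecCmpCostSketch(1, true, 2) == 12); }
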
+
+int SystemZTTIImpl::
+getVectorInstrCost(unsigned Opcode, Type *Val, unsigned Index) {
+ // vlvgp will insert two grs into a vector register, so only count half the
+ // number of instructions.
+ if (Opcode == Instruction::InsertElement &&
+ Val->getScalarType()->isIntegerTy(64))
+ return ((Index % 2 == 0) ? 1 : 0);
+
+ if (Opcode == Instruction::ExtractElement) {
+ int Cost = ((Val->getScalarSizeInBits() == 1) ? 2 /*+test-under-mask*/ : 1);
+
+ // Give a slight penalty for moving out of vector pipeline to FXU unit.
+ if (Index == 0 && Val->getScalarType()->isIntegerTy())
+ Cost += 1;
+
+ return Cost;
+ }
+
+ return BaseT::getVectorInstrCost(Opcode, Val, Index);
+}
+
+int SystemZTTIImpl::getMemoryOpCost(unsigned Opcode, Type *Src,
+ unsigned Alignment, unsigned AddressSpace,
+ const Instruction *I) {
+ assert(!Src->isVoidTy() && "Invalid type");
+
+ if (!Src->isVectorTy() && Opcode == Instruction::Load &&
+ I != nullptr && I->hasOneUse()) {
+ const Instruction *UserI = cast<Instruction>(*I->user_begin());
+ unsigned Bits = Src->getScalarSizeInBits();
+ bool FoldsLoad = false;
+ switch (UserI->getOpcode()) {
+ case Instruction::ICmp:
+ case Instruction::Add:
+ case Instruction::Sub:
+ case Instruction::Mul:
+ case Instruction::SDiv:
+ case Instruction::UDiv:
+ case Instruction::And:
+ case Instruction::Or:
+ case Instruction::Xor:
+ // This also makes sense for float operations, but it is disabled for
+ // now due to regressions.
+ // case Instruction::FCmp:
+ // case Instruction::FAdd:
+ // case Instruction::FSub:
+ // case Instruction::FMul:
+ // case Instruction::FDiv:
+ FoldsLoad = (Bits == 32 || Bits == 64);
+ break;
+ }
+
+ if (FoldsLoad) {
+ assert(UserI->getNumOperands() == 2 && "Expected to only handle binops.");
+
+ // UserI can't fold two loads, so when both operands are single-use
+ // loads, return 0 cost for only one of the pair.
+ for (unsigned i = 0; i < 2; ++i) {
+ if (UserI->getOperand(i) == I)
+ continue;
+ if (LoadInst *LI = dyn_cast<LoadInst>(UserI->getOperand(i))) {
+ if (LI->hasOneUse())
+ return i == 0;
+ }
+ }
+
+ return 0;
+ }
+ }
+
+ unsigned NumOps = getNumberOfParts(Src);
+
+ if (Src->getScalarSizeInBits() == 128)
+ // 128-bit scalars are held in a pair of 64-bit registers.
+ NumOps *= 2;
+
+ return NumOps;
+}
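
The two-load pairing rule above, mirrored as a hypothetical standalone sketch: a lone foldable load is free, and when both operands of the user are single-use loads, exactly one of the pair is billed.

#include <cassert>
static unsigned foldableLoadCostSketch(unsigned MyOpIdx,
                                       bool SiblingIsFoldableLoad) {
  if (!SiblingIsFoldableLoad)
    return 0;                  // the lone load folds into the user for free
  return MyOpIdx == 1 ? 1 : 0; // two loads: bill exactly one of the pair
}
int main() {
  assert(foldableLoadCostSketch(0, true) +
             foldableLoadCostSketch(1, true) == 1);
}
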
+
+int SystemZTTIImpl::getInterleavedMemoryOpCost(unsigned Opcode, Type *VecTy,
+ unsigned Factor,
+ ArrayRef<unsigned> Indices,
+ unsigned Alignment,
+ unsigned AddressSpace) {
+ assert(isa<VectorType>(VecTy) &&
+ "Expect a vector type for interleaved memory op");
+
+ unsigned WideBits = (VecTy->isPtrOrPtrVectorTy() ?
+ (64U * VecTy->getVectorNumElements()) : VecTy->getPrimitiveSizeInBits());
+ assert(WideBits > 0 && "Could not compute size of vector");
+ int NumWideParts =
+ ((WideBits % 128U) ? ((WideBits / 128U) + 1) : (WideBits / 128U));
+
+ // How many source vectors are handled to produce a vectorized operand?
+ int NumElsPerVector = (VecTy->getVectorNumElements() / NumWideParts);
+ int NumSrcParts =
+ ((NumWideParts > NumElsPerVector) ? NumElsPerVector : NumWideParts);
+
+ // A load group may have gaps.
+ unsigned NumOperands =
+ ((Opcode == Instruction::Load) ? Indices.size() : Factor);
+
+ // Each needed permute takes two vectors as input.
+ if (NumSrcParts > 1)
+ NumSrcParts--;
+ int NumPermutes = NumSrcParts * NumOperands;
+
+ // Cost of load/store operations and the permutations needed.
+ return NumWideParts + NumPermutes;
+}
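
The interleaved-cost formula, mirrored standalone with one worked example (a factor-4 load of <16 x i32> using all four members: 4 wide parts plus 3*4 permutes = 16):

#include <algorithm>
#include <cassert>
static int interleavedCostSketch(unsigned WideBits, int NumElems,
                                 int NumOperands) {
  int NumWideParts = (WideBits + 127U) / 128U; // ceil to 128-bit registers
  int NumElsPerVector = NumElems / NumWideParts;
  int NumSrcParts = std::min(NumWideParts, NumElsPerVector);
  if (NumSrcParts > 1)
    --NumSrcParts;             // each permute consumes two source vectors
  return NumWideParts + NumSrcParts * NumOperands;
}
int main() { assert(interleavedCostSketch(512, 16, 4) == 16); }
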
diff --git a/lib/Target/SystemZ/SystemZTargetTransformInfo.h b/lib/Target/SystemZ/SystemZTargetTransformInfo.h
index f7d2d827f11b..3766ed45b8c4 100644
--- a/lib/Target/SystemZ/SystemZTargetTransformInfo.h
+++ b/lib/Target/SystemZ/SystemZTargetTransformInfo.h
@@ -27,6 +27,8 @@ class SystemZTTIImpl : public BasicTTIImplBase<SystemZTTIImpl> {
const SystemZSubtarget *getST() const { return ST; }
const SystemZTargetLowering *getTLI() const { return TLI; }
+ const unsigned LIBCALL_COST = 30;
+
public:
explicit SystemZTTIImpl(const SystemZTargetMachine *TM, const Function &F)
: BaseT(TM, F.getParent()->getDataLayout()), ST(TM->getSubtargetImpl(F)),
@@ -53,6 +55,32 @@ public:
unsigned getNumberOfRegisters(bool Vector);
unsigned getRegisterBitWidth(bool Vector);
+ bool supportsEfficientVectorElementLoadStore() { return true; }
+ bool enableInterleavedAccessVectorization() { return true; }
+
+ int getArithmeticInstrCost(
+ unsigned Opcode, Type *Ty,
+ TTI::OperandValueKind Opd1Info = TTI::OK_AnyValue,
+ TTI::OperandValueKind Opd2Info = TTI::OK_AnyValue,
+ TTI::OperandValueProperties Opd1PropInfo = TTI::OP_None,
+ TTI::OperandValueProperties Opd2PropInfo = TTI::OP_None,
+ ArrayRef<const Value *> Args = ArrayRef<const Value *>());
+ int getShuffleCost(TTI::ShuffleKind Kind, Type *Tp, int Index, Type *SubTp);
+ unsigned getVectorTruncCost(Type *SrcTy, Type *DstTy);
+ unsigned getVectorBitmaskConversionCost(Type *SrcTy, Type *DstTy);
+ int getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src,
+ const Instruction *I = nullptr);
+ int getCmpSelInstrCost(unsigned Opcode, Type *ValTy, Type *CondTy,
+ const Instruction *I = nullptr);
+ int getVectorInstrCost(unsigned Opcode, Type *Val, unsigned Index);
+ int getMemoryOpCost(unsigned Opcode, Type *Src, unsigned Alignment,
+ unsigned AddressSpace, const Instruction *I = nullptr);
+
+ int getInterleavedMemoryOpCost(unsigned Opcode, Type *VecTy,
+ unsigned Factor,
+ ArrayRef<unsigned> Indices,
+ unsigned Alignment,
+ unsigned AddressSpace);
/// @}
};
diff --git a/lib/Target/TargetLoweringObjectFile.cpp b/lib/Target/TargetLoweringObjectFile.cpp
index 375f8511f7ad..50272fda56de 100644
--- a/lib/Target/TargetLoweringObjectFile.cpp
+++ b/lib/Target/TargetLoweringObjectFile.cpp
@@ -264,10 +264,7 @@ bool TargetLoweringObjectFile::shouldPutJumpTableInFunctionSection(
// in discardable section
// FIXME: this isn't the right predicate, should be based on the MCSection
// for the function.
- if (F.isWeakForLinker())
- return true;
-
- return false;
+ return F.isWeakForLinker();
}
/// Given a mergable constant with the specified size and relocation
diff --git a/lib/Target/TargetMachine.cpp b/lib/Target/TargetMachine.cpp
index 8a6d28490e8c..e8fe0a2b218e 100644
--- a/lib/Target/TargetMachine.cpp
+++ b/lib/Target/TargetMachine.cpp
@@ -74,10 +74,10 @@ void TargetMachine::resetTargetOptions(const Function &F) const {
Options.X = DefaultOptions.X; \
} while (0)
- RESET_OPTION(LessPreciseFPMADOption, "less-precise-fpmad");
RESET_OPTION(UnsafeFPMath, "unsafe-fp-math");
RESET_OPTION(NoInfsFPMath, "no-infs-fp-math");
RESET_OPTION(NoNaNsFPMath, "no-nans-fp-math");
+ RESET_OPTION(NoSignedZerosFPMath, "no-signed-zeros-fp-math");
RESET_OPTION(NoTrappingFPMath, "no-trapping-math");
StringRef Denormal =
@@ -156,8 +156,11 @@ bool TargetMachine::shouldAssumeDSOLocal(const Module &M,
bool IsTLS = GV && GV->isThreadLocal();
bool IsAccessViaCopyRelocs =
Options.MCOptions.MCPIECopyRelocations && GV && isa<GlobalVariable>(GV);
- // Check if we can use copy relocations.
- if (!IsTLS && (RM == Reloc::Static || IsAccessViaCopyRelocs))
+ Triple::ArchType Arch = TT.getArch();
+ bool IsPPC =
+ Arch == Triple::ppc || Arch == Triple::ppc64 || Arch == Triple::ppc64le;
+ // Check if we can use copy relocations. PowerPC has no copy relocations.
+ if (!IsTLS && !IsPPC && (RM == Reloc::Static || IsAccessViaCopyRelocs))
return true;
}
@@ -198,7 +201,7 @@ CodeGenOpt::Level TargetMachine::getOptLevel() const { return OptLevel; }
void TargetMachine::setOptLevel(CodeGenOpt::Level Level) { OptLevel = Level; }
TargetIRAnalysis TargetMachine::getTargetIRAnalysis() {
- return TargetIRAnalysis([this](const Function &F) {
+ return TargetIRAnalysis([](const Function &F) {
return TargetTransformInfo(F.getParent()->getDataLayout());
});
}
diff --git a/lib/Target/WebAssembly/CMakeLists.txt b/lib/Target/WebAssembly/CMakeLists.txt
index d9c53ecc8d08..78b2cdb61b76 100644
--- a/lib/Target/WebAssembly/CMakeLists.txt
+++ b/lib/Target/WebAssembly/CMakeLists.txt
@@ -14,6 +14,7 @@ add_llvm_target(WebAssemblyCodeGen
WebAssemblyAsmPrinter.cpp
WebAssemblyCallIndirectFixup.cpp
WebAssemblyCFGStackify.cpp
+ WebAssemblyCFGSort.cpp
WebAssemblyExplicitLocals.cpp
WebAssemblyFastISel.cpp
WebAssemblyFixIrreducibleControlFlow.cpp
@@ -35,6 +36,7 @@ add_llvm_target(WebAssemblyCodeGen
WebAssemblyRegNumbering.cpp
WebAssemblyRegStackify.cpp
WebAssemblyReplacePhysRegs.cpp
+ WebAssemblyRuntimeLibcallSignatures.cpp
WebAssemblySelectionDAGInfo.cpp
WebAssemblySetP2AlignOperands.cpp
WebAssemblyStoreResults.cpp
diff --git a/lib/Target/WebAssembly/Disassembler/WebAssemblyDisassembler.cpp b/lib/Target/WebAssembly/Disassembler/WebAssemblyDisassembler.cpp
index b4763ca60ab6..b5f53114d3e1 100644
--- a/lib/Target/WebAssembly/Disassembler/WebAssemblyDisassembler.cpp
+++ b/lib/Target/WebAssembly/Disassembler/WebAssemblyDisassembler.cpp
@@ -63,89 +63,8 @@ extern "C" void LLVMInitializeWebAssemblyDisassembler() {
MCDisassembler::DecodeStatus WebAssemblyDisassembler::getInstruction(
MCInst &MI, uint64_t &Size, ArrayRef<uint8_t> Bytes, uint64_t /*Address*/,
raw_ostream &OS, raw_ostream &CS) const {
- Size = 0;
- uint64_t Pos = 0;
- // Read the opcode.
- if (Pos + sizeof(uint64_t) > Bytes.size())
- return MCDisassembler::Fail;
- uint64_t Opcode = support::endian::read64le(Bytes.data() + Pos);
- Pos += sizeof(uint64_t);
+ // TODO: Implement disassembly.
- if (Opcode >= WebAssembly::INSTRUCTION_LIST_END)
- return MCDisassembler::Fail;
-
- MI.setOpcode(Opcode);
- const MCInstrDesc &Desc = MCII->get(Opcode);
- unsigned NumFixedOperands = Desc.NumOperands;
-
- // If it's variadic, read the number of extra operands.
- unsigned NumExtraOperands = 0;
- if (Desc.isVariadic()) {
- if (Pos + sizeof(uint64_t) > Bytes.size())
- return MCDisassembler::Fail;
- NumExtraOperands = support::endian::read64le(Bytes.data() + Pos);
- Pos += sizeof(uint64_t);
- }
-
- // Read the fixed operands. These are described by the MCInstrDesc.
- for (unsigned i = 0; i < NumFixedOperands; ++i) {
- const MCOperandInfo &Info = Desc.OpInfo[i];
- switch (Info.OperandType) {
- case MCOI::OPERAND_IMMEDIATE:
- case WebAssembly::OPERAND_LOCAL:
- case WebAssembly::OPERAND_P2ALIGN:
- case WebAssembly::OPERAND_BASIC_BLOCK: {
- if (Pos + sizeof(uint64_t) > Bytes.size())
- return MCDisassembler::Fail;
- uint64_t Imm = support::endian::read64le(Bytes.data() + Pos);
- Pos += sizeof(uint64_t);
- MI.addOperand(MCOperand::createImm(Imm));
- break;
- }
- case MCOI::OPERAND_REGISTER: {
- if (Pos + sizeof(uint64_t) > Bytes.size())
- return MCDisassembler::Fail;
- uint64_t Reg = support::endian::read64le(Bytes.data() + Pos);
- Pos += sizeof(uint64_t);
- MI.addOperand(MCOperand::createReg(Reg));
- break;
- }
- case WebAssembly::OPERAND_F32IMM:
- case WebAssembly::OPERAND_F64IMM: {
- // TODO: MC converts all floating point immediate operands to double.
- // This is fine for numeric values, but may cause NaNs to change bits.
- if (Pos + sizeof(uint64_t) > Bytes.size())
- return MCDisassembler::Fail;
- uint64_t Bits = support::endian::read64le(Bytes.data() + Pos);
- Pos += sizeof(uint64_t);
- double Imm;
- memcpy(&Imm, &Bits, sizeof(Imm));
- MI.addOperand(MCOperand::createFPImm(Imm));
- break;
- }
- default:
- llvm_unreachable("unimplemented operand kind");
- }
- }
-
- // Read the extra operands.
- assert(NumExtraOperands == 0 || Desc.isVariadic());
- for (unsigned i = 0; i < NumExtraOperands; ++i) {
- if (Pos + sizeof(uint64_t) > Bytes.size())
- return MCDisassembler::Fail;
- if (Desc.TSFlags & WebAssemblyII::VariableOpIsImmediate) {
- // Decode extra immediate operands.
- uint64_t Imm = support::endian::read64le(Bytes.data() + Pos);
- MI.addOperand(MCOperand::createImm(Imm));
- } else {
- // Decode extra register operands.
- uint64_t Reg = support::endian::read64le(Bytes.data() + Pos);
- MI.addOperand(MCOperand::createReg(Reg));
- }
- Pos += sizeof(uint64_t);
- }
-
- Size = Pos;
- return MCDisassembler::Success;
+ return MCDisassembler::Fail;
}
diff --git a/lib/Target/WebAssembly/InstPrinter/WebAssemblyInstPrinter.cpp b/lib/Target/WebAssembly/InstPrinter/WebAssemblyInstPrinter.cpp
index 0af13cffdb04..f31dde0ce48f 100644
--- a/lib/Target/WebAssembly/InstPrinter/WebAssemblyInstPrinter.cpp
+++ b/lib/Target/WebAssembly/InstPrinter/WebAssemblyInstPrinter.cpp
@@ -242,3 +242,17 @@ const char *llvm::WebAssembly::TypeToString(MVT Ty) {
llvm_unreachable("unsupported type");
}
}
+
+const char *llvm::WebAssembly::TypeToString(wasm::ValType Type) {
+ switch (Type) {
+ case wasm::ValType::I32:
+ return "i32";
+ case wasm::ValType::I64:
+ return "i64";
+ case wasm::ValType::F32:
+ return "f32";
+ case wasm::ValType::F64:
+ return "f64";
+ }
+ llvm_unreachable("unsupported type");
+}
diff --git a/lib/Target/WebAssembly/InstPrinter/WebAssemblyInstPrinter.h b/lib/Target/WebAssembly/InstPrinter/WebAssemblyInstPrinter.h
index d11f99c1ff39..c6158720d62f 100644
--- a/lib/Target/WebAssembly/InstPrinter/WebAssemblyInstPrinter.h
+++ b/lib/Target/WebAssembly/InstPrinter/WebAssemblyInstPrinter.h
@@ -18,6 +18,7 @@
#include "llvm/ADT/SmallVector.h"
#include "llvm/CodeGen/MachineValueType.h"
#include "llvm/MC/MCInstPrinter.h"
+#include "llvm/Support/Wasm.h"
namespace llvm {
@@ -50,6 +51,7 @@ public:
namespace WebAssembly {
const char *TypeToString(MVT Ty);
+const char *TypeToString(wasm::ValType Type);
} // end namespace WebAssembly
diff --git a/lib/Target/WebAssembly/MCTargetDesc/CMakeLists.txt b/lib/Target/WebAssembly/MCTargetDesc/CMakeLists.txt
index fd41df7b9635..13c0fe915908 100644
--- a/lib/Target/WebAssembly/MCTargetDesc/CMakeLists.txt
+++ b/lib/Target/WebAssembly/MCTargetDesc/CMakeLists.txt
@@ -5,4 +5,5 @@ add_llvm_library(LLVMWebAssemblyDesc
WebAssemblyMCCodeEmitter.cpp
WebAssemblyMCTargetDesc.cpp
WebAssemblyTargetStreamer.cpp
+ WebAssemblyWasmObjectWriter.cpp
)
diff --git a/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyAsmBackend.cpp b/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyAsmBackend.cpp
index 97454a824a34..7c78285fbda4 100644
--- a/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyAsmBackend.cpp
+++ b/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyAsmBackend.cpp
@@ -13,6 +13,7 @@
//===----------------------------------------------------------------------===//
#include "MCTargetDesc/WebAssemblyMCTargetDesc.h"
+#include "MCTargetDesc/WebAssemblyFixupKinds.h"
#include "llvm/MC/MCAsmBackend.h"
#include "llvm/MC/MCAssembler.h"
#include "llvm/MC/MCDirectives.h"
@@ -22,21 +23,22 @@
#include "llvm/MC/MCObjectWriter.h"
#include "llvm/MC/MCSubtargetInfo.h"
#include "llvm/MC/MCSymbol.h"
+#include "llvm/MC/MCWasmObjectWriter.h"
#include "llvm/Support/ErrorHandling.h"
#include "llvm/Support/raw_ostream.h"
using namespace llvm;
namespace {
-class WebAssemblyAsmBackend final : public MCAsmBackend {
+class WebAssemblyAsmBackendELF final : public MCAsmBackend {
bool Is64Bit;
public:
- explicit WebAssemblyAsmBackend(bool Is64Bit)
+ explicit WebAssemblyAsmBackendELF(bool Is64Bit)
: MCAsmBackend(), Is64Bit(Is64Bit) {}
- ~WebAssemblyAsmBackend() override {}
+ ~WebAssemblyAsmBackendELF() override {}
void applyFixup(const MCFixup &Fixup, char *Data, unsigned DataSize,
- uint64_t Value, bool IsPCRel) const override;
+ uint64_t Value, bool IsPCRel, MCContext &Ctx) const override;
MCObjectWriter *createObjectWriter(raw_pwrite_stream &OS) const override;
@@ -61,6 +63,95 @@ public:
bool writeNopData(uint64_t Count, MCObjectWriter *OW) const override;
};
+class WebAssemblyAsmBackend final : public MCAsmBackend {
+ bool Is64Bit;
+
+public:
+ explicit WebAssemblyAsmBackend(bool Is64Bit)
+ : MCAsmBackend(), Is64Bit(Is64Bit) {}
+ ~WebAssemblyAsmBackend() override {}
+
+ unsigned getNumFixupKinds() const override {
+ return WebAssembly::NumTargetFixupKinds;
+ }
+
+ const MCFixupKindInfo &getFixupKindInfo(MCFixupKind Kind) const override;
+
+ void applyFixup(const MCFixup &Fixup, char *Data, unsigned DataSize,
+ uint64_t Value, bool IsPCRel, MCContext &Ctx) const override;
+
+ MCObjectWriter *createObjectWriter(raw_pwrite_stream &OS) const override;
+
+ // No instruction requires relaxation
+ bool fixupNeedsRelaxation(const MCFixup &Fixup, uint64_t Value,
+ const MCRelaxableFragment *DF,
+ const MCAsmLayout &Layout) const override {
+ return false;
+ }
+
+ bool mayNeedRelaxation(const MCInst &Inst) const override { return false; }
+
+ void relaxInstruction(const MCInst &Inst, const MCSubtargetInfo &STI,
+ MCInst &Res) const override {}
+
+ bool writeNopData(uint64_t Count, MCObjectWriter *OW) const override;
+};
+
+bool WebAssemblyAsmBackendELF::writeNopData(uint64_t Count,
+ MCObjectWriter *OW) const {
+ for (uint64_t i = 0; i < Count; ++i)
+ OW->write8(WebAssembly::Nop);
+
+ return true;
+}
+
+void WebAssemblyAsmBackendELF::applyFixup(const MCFixup &Fixup, char *Data,
+ unsigned DataSize, uint64_t Value,
+ bool IsPCRel, MCContext &Ctx) const {
+ const MCFixupKindInfo &Info = getFixupKindInfo(Fixup.getKind());
+ assert(Info.Flags == 0 && "WebAssembly does not use MCFixupKindInfo flags");
+
+ unsigned NumBytes = alignTo(Info.TargetSize, 8) / 8;
+ if (Value == 0)
+ return; // Doesn't change encoding.
+
+ // Shift the value into position.
+ Value <<= Info.TargetOffset;
+
+ unsigned Offset = Fixup.getOffset();
+ assert(Offset + NumBytes <= DataSize && "Invalid fixup offset!");
+
+ // For each byte of the fragment that the fixup touches, mask in the
+ // bits from the fixup value.
+ for (unsigned i = 0; i != NumBytes; ++i)
+ Data[Offset + i] |= uint8_t((Value >> (i * 8)) & 0xff);
+}
+
+MCObjectWriter *
+WebAssemblyAsmBackendELF::createObjectWriter(raw_pwrite_stream &OS) const {
+ return createWebAssemblyELFObjectWriter(OS, Is64Bit, 0);
+}
+
+const MCFixupKindInfo &
+WebAssemblyAsmBackend::getFixupKindInfo(MCFixupKind Kind) const {
+ const static MCFixupKindInfo Infos[WebAssembly::NumTargetFixupKinds] = {
+ // This table *must* be in the order that the fixup_* kinds are defined in
+ // WebAssemblyFixupKinds.h.
+ //
+ // Name Offset (bits) Size (bits) Flags
+ { "fixup_code_sleb128_i32", 0, 5*8, 0 },
+ { "fixup_code_sleb128_i64", 0, 10*8, 0 },
+ { "fixup_code_uleb128_i32", 0, 5*8, 0 },
+ };
+
+ if (Kind < FirstTargetFixupKind)
+ return MCAsmBackend::getFixupKindInfo(Kind);
+
+ assert(unsigned(Kind - FirstTargetFixupKind) < getNumFixupKinds() &&
+ "Invalid kind!");
+ return Infos[Kind - FirstTargetFixupKind];
+}
+
bool WebAssemblyAsmBackend::writeNopData(uint64_t Count,
MCObjectWriter *OW) const {
if (Count == 0)
@@ -74,11 +165,11 @@ bool WebAssemblyAsmBackend::writeNopData(uint64_t Count,
void WebAssemblyAsmBackend::applyFixup(const MCFixup &Fixup, char *Data,
unsigned DataSize, uint64_t Value,
- bool IsPCRel) const {
+ bool IsPCRel, MCContext &Ctx) const {
const MCFixupKindInfo &Info = getFixupKindInfo(Fixup.getKind());
assert(Info.Flags == 0 && "WebAssembly does not use MCFixupKindInfo flags");
- unsigned NumBytes = (Info.TargetSize + 7) / 8;
+ unsigned NumBytes = alignTo(Info.TargetSize, 8) / 8;
if (Value == 0)
return; // Doesn't change encoding.
@@ -96,10 +187,12 @@ void WebAssemblyAsmBackend::applyFixup(const MCFixup &Fixup, char *Data,
MCObjectWriter *
WebAssemblyAsmBackend::createObjectWriter(raw_pwrite_stream &OS) const {
- return createWebAssemblyELFObjectWriter(OS, Is64Bit, 0);
+ return createWebAssemblyWasmObjectWriter(OS, Is64Bit);
}
} // end anonymous namespace
MCAsmBackend *llvm::createWebAssemblyAsmBackend(const Triple &TT) {
+ if (TT.isOSBinFormatELF())
+ return new WebAssemblyAsmBackendELF(TT.isArch64Bit());
return new WebAssemblyAsmBackend(TT.isArch64Bit());
}
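
The 5*8- and 10*8-bit fixup sizes in the table above follow from LEB128's
worst case: ceil(32/7) = 5 bytes for a 32-bit value and ceil(64/7) = 10 for a
64-bit one, so the emitter reserves the maximum width up front and applyFixup
patches bytes in place without resizing code. A minimal standalone sketch of
padded ULEB128 emission (writePaddedULEB128 is an illustrative helper, not
LLVM API):

  #include <cstdint>
  #include <cstdio>

  // Encode Value as ULEB128, padding with continuation bytes up to PadTo.
  static unsigned writePaddedULEB128(uint64_t Value, uint8_t *Out,
                                     unsigned PadTo) {
    unsigned Count = 0;
    do {
      uint8_t Byte = Value & 0x7f;
      Value >>= 7;
      if (Value != 0 || Count + 1 < PadTo)
        Byte |= 0x80; // more bytes follow
      Out[Count++] = Byte;
    } while (Value != 0 || Count < PadTo);
    return Count;
  }

  int main() {
    uint8_t Buf[10];
    // A 32-bit index always occupies ceil(32/7) = 5 bytes when padded.
    unsigned N = writePaddedULEB128(3, Buf, 5);
    for (unsigned i = 0; i != N; ++i)
      printf("%02x ", Buf[i]); // prints: 83 80 80 80 00
    printf("\n");
  }
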
diff --git a/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyFixupKinds.h b/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyFixupKinds.h
new file mode 100644
index 000000000000..b0af63c924bd
--- /dev/null
+++ b/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyFixupKinds.h
@@ -0,0 +1,31 @@
+//=- WebAssemblyFixupKinds.h - WebAssembly Specific Fixup Entries -*- C++ -*-=//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_TARGET_WEBASSEMBLY_MCTARGETDESC_WEBASSEMBLYFIXUPKINDS_H
+#define LLVM_LIB_TARGET_WEBASSEMBLY_MCTARGETDESC_WEBASSEMBLYFIXUPKINDS_H
+
+#include "llvm/MC/MCFixup.h"
+
+namespace llvm {
+namespace WebAssembly {
+enum Fixups {
+ fixup_code_sleb128_i32 = FirstTargetFixupKind, // 32-bit signed
+ fixup_code_sleb128_i64, // 64-bit signed
+ fixup_code_uleb128_i32, // 32-bit unsigned
+
+ fixup_code_global_index, // 32-bit unsigned
+
+ // Marker
+ LastTargetFixupKind,
+ NumTargetFixupKinds = LastTargetFixupKind - FirstTargetFixupKind
+};
+} // end namespace WebAssembly
+} // end namespace llvm
+
+#endif
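
Target-specific fixups are numbered upward from FirstTargetFixupKind (128 in
LLVM's MCFixup.h), which is what lets WebAssemblyAsmBackend::getFixupKindInfo
index its Infos table with Kind - FirstTargetFixupKind. A toy model of the
same numbering scheme, using hypothetical standalone enums rather than the
real MC headers:

  #include <cassert>

  enum { FirstTargetFixupKind = 128 }; // generic MC kinds sit below this
  enum Fixups {
    fixup_code_sleb128_i32 = FirstTargetFixupKind,
    fixup_code_sleb128_i64,
    fixup_code_uleb128_i32,
    fixup_code_global_index,
    LastTargetFixupKind,
    NumTargetFixupKinds = LastTargetFixupKind - FirstTargetFixupKind
  };

  int main() {
    static_assert(NumTargetFixupKinds == 4, "four target-specific kinds");
    assert(fixup_code_uleb128_i32 - FirstTargetFixupKind == 2); // table slot
  }
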
diff --git a/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyMCAsmInfo.cpp b/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyMCAsmInfo.cpp
index d8c39216c53b..2dcec5263fa1 100644
--- a/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyMCAsmInfo.cpp
+++ b/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyMCAsmInfo.cpp
@@ -19,9 +19,9 @@ using namespace llvm;
#define DEBUG_TYPE "wasm-mc-asm-info"
-WebAssemblyMCAsmInfo::~WebAssemblyMCAsmInfo() {}
+WebAssemblyMCAsmInfoELF::~WebAssemblyMCAsmInfoELF() {}
-WebAssemblyMCAsmInfo::WebAssemblyMCAsmInfo(const Triple &T) {
+WebAssemblyMCAsmInfoELF::WebAssemblyMCAsmInfoELF(const Triple &T) {
PointerSize = CalleeSaveStackSlotSize = T.isArch64Bit() ? 8 : 4;
// TODO: What should MaxInstLength be?
@@ -51,3 +51,33 @@ WebAssemblyMCAsmInfo::WebAssemblyMCAsmInfo(const Triple &T) {
// WebAssembly's stack is never executable.
UsesNonexecutableStackSection = false;
}
+
+WebAssemblyMCAsmInfo::~WebAssemblyMCAsmInfo() {}
+
+WebAssemblyMCAsmInfo::WebAssemblyMCAsmInfo(const Triple &T) {
+ PointerSize = CalleeSaveStackSlotSize = T.isArch64Bit() ? 8 : 4;
+
+ // TODO: What should MaxInstLength be?
+
+ UseDataRegionDirectives = true;
+
+ // Use .skip instead of .zero because .zero is confusing when used with two
+ // arguments (it doesn't actually zero things out).
+ ZeroDirective = "\t.skip\t";
+
+ Data8bitsDirective = "\t.int8\t";
+ Data16bitsDirective = "\t.int16\t";
+ Data32bitsDirective = "\t.int32\t";
+ Data64bitsDirective = "\t.int64\t";
+
+ AlignmentIsInBytes = false;
+ COMMDirectiveAlignmentIsInBytes = false;
+ LCOMMDirectiveAlignmentType = LCOMM::Log2Alignment;
+
+ SupportsDebugInformation = true;
+
+ // For now, WebAssembly does not support exceptions.
+ ExceptionsType = ExceptionHandling::None;
+
+ // TODO: UseIntegratedAssembler?
+}
diff --git a/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyMCAsmInfo.h b/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyMCAsmInfo.h
index 2dcf2cd3c892..d9547096190e 100644
--- a/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyMCAsmInfo.h
+++ b/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyMCAsmInfo.h
@@ -16,12 +16,19 @@
#define LLVM_LIB_TARGET_WEBASSEMBLY_MCTARGETDESC_WEBASSEMBLYMCASMINFO_H
#include "llvm/MC/MCAsmInfoELF.h"
+#include "llvm/MC/MCAsmInfoWasm.h"
namespace llvm {
class Triple;
-class WebAssemblyMCAsmInfo final : public MCAsmInfoELF {
+class WebAssemblyMCAsmInfoELF final : public MCAsmInfoELF {
+public:
+ explicit WebAssemblyMCAsmInfoELF(const Triple &T);
+ ~WebAssemblyMCAsmInfoELF() override;
+};
+
+class WebAssemblyMCAsmInfo final : public MCAsmInfoWasm {
public:
explicit WebAssemblyMCAsmInfo(const Triple &T);
~WebAssemblyMCAsmInfo() override;
diff --git a/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyMCCodeEmitter.cpp b/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyMCCodeEmitter.cpp
index d0e0eecd3002..a0b008947491 100644
--- a/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyMCCodeEmitter.cpp
+++ b/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyMCCodeEmitter.cpp
@@ -13,6 +13,7 @@
//===----------------------------------------------------------------------===//
#include "MCTargetDesc/WebAssemblyMCTargetDesc.h"
+#include "MCTargetDesc/WebAssemblyFixupKinds.h"
#include "llvm/ADT/STLExtras.h"
#include "llvm/ADT/Statistic.h"
#include "llvm/MC/MCCodeEmitter.h"
@@ -35,6 +36,7 @@ STATISTIC(MCNumFixups, "Number of MC fixups created.");
namespace {
class WebAssemblyMCCodeEmitter final : public MCCodeEmitter {
const MCInstrInfo &MCII;
+ MCContext &Ctx;
// Implementation generated by tablegen.
uint64_t getBinaryCodeForInstr(const MCInst &MI,
@@ -46,12 +48,14 @@ class WebAssemblyMCCodeEmitter final : public MCCodeEmitter {
const MCSubtargetInfo &STI) const override;
public:
- explicit WebAssemblyMCCodeEmitter(const MCInstrInfo &mcii) : MCII(mcii) {}
+ WebAssemblyMCCodeEmitter(const MCInstrInfo &mcii, MCContext &ctx)
+ : MCII(mcii), Ctx(ctx) {}
};
} // end anonymous namespace
-MCCodeEmitter *llvm::createWebAssemblyMCCodeEmitter(const MCInstrInfo &MCII) {
- return new WebAssemblyMCCodeEmitter(MCII);
+MCCodeEmitter *llvm::createWebAssemblyMCCodeEmitter(const MCInstrInfo &MCII,
+ MCContext &Ctx) {
+ return new WebAssemblyMCCodeEmitter(MCII, Ctx);
}
void WebAssemblyMCCodeEmitter::encodeInstruction(
@@ -63,6 +67,13 @@ void WebAssemblyMCCodeEmitter::encodeInstruction(
assert(Binary < UINT8_MAX && "Multi-byte opcodes not supported yet");
OS << uint8_t(Binary);
+ // For br_table instructions, encode the size of the table. In the MCInst,
+ // there's an index operand, one operand for each table entry, and the
+ // default operand.
+ if (MI.getOpcode() == WebAssembly::BR_TABLE_I32 ||
+ MI.getOpcode() == WebAssembly::BR_TABLE_I64)
+ encodeULEB128(MI.getNumOperands() - 2, OS);
+
const MCInstrDesc &Desc = MCII.get(MI.getOpcode());
for (unsigned i = 0, e = MI.getNumOperands(); i < e; ++i) {
const MCOperand &MO = MI.getOperand(i);
@@ -77,6 +88,12 @@ void WebAssemblyMCCodeEmitter::encodeInstruction(
encodeSLEB128(int32_t(MO.getImm()), OS);
} else if (Info.OperandType == WebAssembly::OPERAND_I64IMM) {
encodeSLEB128(int64_t(MO.getImm()), OS);
+ } else if (Info.OperandType == WebAssembly::OPERAND_GLOBAL) {
+ Fixups.push_back(MCFixup::create(
+ OS.tell() - Start, MCConstantExpr::create(MO.getImm(), Ctx),
+ MCFixupKind(WebAssembly::fixup_code_global_index), MI.getLoc()));
+ ++MCNumFixups;
+ encodeULEB128(uint64_t(MO.getImm()), OS);
} else {
encodeULEB128(uint64_t(MO.getImm()), OS);
}
@@ -102,14 +119,28 @@ void WebAssemblyMCCodeEmitter::encodeInstruction(
support::endian::Writer<support::little>(OS).write<double>(d);
}
} else if (MO.isExpr()) {
+ const MCOperandInfo &Info = Desc.OpInfo[i];
+ llvm::MCFixupKind FixupKind;
+ size_t PaddedSize;
+ if (Info.OperandType == WebAssembly::OPERAND_I32IMM) {
+ FixupKind = MCFixupKind(WebAssembly::fixup_code_sleb128_i32);
+ PaddedSize = 5;
+ } else if (Info.OperandType == WebAssembly::OPERAND_I64IMM) {
+ FixupKind = MCFixupKind(WebAssembly::fixup_code_sleb128_i64);
+ PaddedSize = 10;
+ } else if (Info.OperandType == WebAssembly::OPERAND_FUNCTION32 ||
+ Info.OperandType == WebAssembly::OPERAND_OFFSET32 ||
+ Info.OperandType == WebAssembly::OPERAND_TYPEINDEX) {
+ FixupKind = MCFixupKind(WebAssembly::fixup_code_uleb128_i32);
+ PaddedSize = 5;
+ } else {
+ llvm_unreachable("unexpected symbolic operand kind");
+ }
Fixups.push_back(MCFixup::create(
OS.tell() - Start, MO.getExpr(),
- STI.getTargetTriple().isArch64Bit() ? FK_Data_8 : FK_Data_4,
- MI.getLoc()));
+ FixupKind, MI.getLoc()));
++MCNumFixups;
- encodeULEB128(STI.getTargetTriple().isArch64Bit() ? UINT64_MAX
- : uint64_t(UINT32_MAX),
- OS);
+ encodeULEB128(0, OS, PaddedSize - 1);
} else {
llvm_unreachable("unexpected operand kind");
}
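
For each symbolic operand the emitter records a fixup at the current offset
and then writes a zero placeholder of the operand's full padded width
(PaddedSize), so resolving the symbol later can never change the size of the
instruction stream. A minimal model of that bookkeeping, with illustrative
stand-in types rather than the real MCFixup/MCExpr:

  #include <cstddef>
  #include <cstdint>
  #include <string>
  #include <vector>

  struct PendingFixup {
    size_t Offset;      // byte offset of the placeholder in the code
    unsigned Kind;      // which fixup_code_* kind applies here
    std::string Symbol; // stand-in for the referenced MCExpr
  };

  struct CodeBuffer {
    std::vector<uint8_t> Bytes;
    std::vector<PendingFixup> Fixups;

    // Record a fixup, then reserve a 5-byte padded ULEB128 placeholder.
    void emitSymbolicULEB32(const std::string &Sym, unsigned Kind) {
      Fixups.push_back({Bytes.size(), Kind, Sym});
      for (int i = 0; i < 4; ++i)
        Bytes.push_back(0x80); // zero-value continuation bytes
      Bytes.push_back(0x00);   // final byte terminates the LEB128
    }
  };

  int main() {
    CodeBuffer CB;
    CB.emitSymbolicULEB32("callee", /*Kind=*/2);
    // CB.Bytes is now 80 80 80 80 00, with a fixup recorded at offset 0.
  }
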
diff --git a/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyMCTargetDesc.cpp b/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyMCTargetDesc.cpp
index 3dc1ded17116..9fd3ec81c258 100644
--- a/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyMCTargetDesc.cpp
+++ b/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyMCTargetDesc.cpp
@@ -36,6 +36,8 @@ using namespace llvm;
static MCAsmInfo *createMCAsmInfo(const MCRegisterInfo & /*MRI*/,
const Triple &TT) {
+ if (TT.isOSBinFormatELF())
+ return new WebAssemblyMCAsmInfoELF(TT);
return new WebAssemblyMCAsmInfo(TT);
}
@@ -71,8 +73,8 @@ static MCInstPrinter *createMCInstPrinter(const Triple & /*T*/,
static MCCodeEmitter *createCodeEmitter(const MCInstrInfo &MCII,
const MCRegisterInfo & /*MRI*/,
- MCContext & /*Ctx*/) {
- return createWebAssemblyMCCodeEmitter(MCII);
+ MCContext &Ctx) {
+ return createWebAssemblyMCCodeEmitter(MCII, Ctx);
}
static MCAsmBackend *createAsmBackend(const Target & /*T*/,
@@ -88,8 +90,12 @@ static MCSubtargetInfo *createMCSubtargetInfo(const Triple &TT, StringRef CPU,
}
static MCTargetStreamer *
-createObjectTargetStreamer(MCStreamer &S, const MCSubtargetInfo & /*STI*/) {
- return new WebAssemblyTargetELFStreamer(S);
+createObjectTargetStreamer(MCStreamer &S, const MCSubtargetInfo &STI) {
+ const Triple &TT = STI.getTargetTriple();
+ if (TT.isOSBinFormatELF())
+ return new WebAssemblyTargetELFStreamer(S);
+
+ return new WebAssemblyTargetWasmStreamer(S);
}
static MCTargetStreamer *createAsmTargetStreamer(MCStreamer &S,
@@ -135,12 +141,12 @@ extern "C" void LLVMInitializeWebAssemblyTargetMC() {
}
}
-WebAssembly::ValType WebAssembly::toValType(const MVT &Ty) {
+wasm::ValType WebAssembly::toValType(const MVT &Ty) {
switch (Ty.SimpleTy) {
- case MVT::i32: return WebAssembly::ValType::I32;
- case MVT::i64: return WebAssembly::ValType::I64;
- case MVT::f32: return WebAssembly::ValType::F32;
- case MVT::f64: return WebAssembly::ValType::F64;
+ case MVT::i32: return wasm::ValType::I32;
+ case MVT::i64: return wasm::ValType::I64;
+ case MVT::f32: return wasm::ValType::F32;
+ case MVT::f64: return wasm::ValType::F64;
default: llvm_unreachable("unexpected type");
}
}
diff --git a/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyMCTargetDesc.h b/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyMCTargetDesc.h
index 8583b772deab..795658ca96b4 100644
--- a/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyMCTargetDesc.h
+++ b/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyMCTargetDesc.h
@@ -17,6 +17,7 @@
#include "llvm/MC/MCInstrDesc.h"
#include "llvm/Support/DataTypes.h"
+#include "llvm/Support/Wasm.h"
namespace llvm {
@@ -34,19 +35,25 @@ class raw_pwrite_stream;
Target &getTheWebAssemblyTarget32();
Target &getTheWebAssemblyTarget64();
-MCCodeEmitter *createWebAssemblyMCCodeEmitter(const MCInstrInfo &MCII);
+MCCodeEmitter *createWebAssemblyMCCodeEmitter(const MCInstrInfo &MCII,
+ MCContext &Ctx);
MCAsmBackend *createWebAssemblyAsmBackend(const Triple &TT);
MCObjectWriter *createWebAssemblyELFObjectWriter(raw_pwrite_stream &OS,
bool Is64Bit, uint8_t OSABI);
+MCObjectWriter *createWebAssemblyWasmObjectWriter(raw_pwrite_stream &OS,
+ bool Is64Bit);
+
namespace WebAssembly {
enum OperandType {
/// Basic block label in a branch construct.
OPERAND_BASIC_BLOCK = MCOI::OPERAND_FIRST_TARGET,
/// Local index.
OPERAND_LOCAL,
+ /// Global index.
+ OPERAND_GLOBAL,
/// 32-bit integer immediates.
OPERAND_I32IMM,
/// 64-bit integer immediates.
@@ -62,7 +69,9 @@ enum OperandType {
/// p2align immediate for load and store address alignment.
OPERAND_P2ALIGN,
/// signature immediate for block/loop.
- OPERAND_SIGNATURE
+ OPERAND_SIGNATURE,
+ /// type signature immediate for call_indirect.
+ OPERAND_TYPEINDEX,
};
} // end namespace WebAssembly
@@ -141,40 +150,25 @@ static const unsigned StoreP2AlignOperandNo = 0;
/// This is used to indicate block signatures.
enum class ExprType {
- Void = 0x40,
- I32 = 0x7f,
- I64 = 0x7e,
- F32 = 0x7d,
- F64 = 0x7c,
- I8x16 = 0x7b,
- I16x8 = 0x7a,
- I32x4 = 0x79,
- F32x4 = 0x78,
- B8x16 = 0x77,
- B16x8 = 0x76,
- B32x4 = 0x75
-};
-
-/// This is used to indicate local types.
-enum class ValType {
- I32 = 0x7f,
- I64 = 0x7e,
- F32 = 0x7d,
- F64 = 0x7c,
- I8x16 = 0x7b,
- I16x8 = 0x7a,
- I32x4 = 0x79,
- F32x4 = 0x78,
- B8x16 = 0x77,
- B16x8 = 0x76,
- B32x4 = 0x75
+ Void = -0x40,
+ I32 = -0x01,
+ I64 = -0x02,
+ F32 = -0x03,
+ F64 = -0x04,
+ I8x16 = -0x05,
+ I16x8 = -0x06,
+ I32x4 = -0x07,
+ F32x4 = -0x08,
+ B8x16 = -0x09,
+ B16x8 = -0x0a,
+ B32x4 = -0x0b
};
/// Instruction opcodes emitted via means other than CodeGen.
static const unsigned Nop = 0x01;
static const unsigned End = 0x0b;
-ValType toValType(const MVT &Ty);
+wasm::ValType toValType(const MVT &Ty);
} // end namespace WebAssembly
} // end namespace llvm
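
The ExprType renumbering above is not a behavior change for the single-byte
cases: the streamer now writes these values as SLEB128 (see emitValueType in
this patch), and for values in [-64, 63] SLEB128 is one byte whose low seven
bits carry the value, so sleb128(-0x01) is exactly the old 0x7f, down to
sleb128(-0x40) being the old 0x40. A quick standalone check (the helper and
its arithmetic-shift assumption are illustrative, not LLVM code):

  #include <cassert>
  #include <cstdint>

  static uint8_t sleb128FirstByte(int64_t V) {
    uint8_t Byte = V & 0x7f;
    V >>= 7; // arithmetic shift assumed, as on mainstream compilers
    // Values in [-64, 63] fit in one byte with no continuation bit.
    assert((V == 0 && !(Byte & 0x40)) || (V == -1 && (Byte & 0x40)));
    return Byte;
  }

  int main() {
    assert(sleb128FirstByte(-0x01) == 0x7f); // I32, formerly 0x7f
    assert(sleb128FirstByte(-0x02) == 0x7e); // I64, formerly 0x7e
    assert(sleb128FirstByte(-0x40) == 0x40); // Void, formerly 0x40
  }
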
diff --git a/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyTargetStreamer.cpp b/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyTargetStreamer.cpp
index 3cee8b2a1844..ad59f2f40587 100644
--- a/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyTargetStreamer.cpp
+++ b/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyTargetStreamer.cpp
@@ -18,9 +18,11 @@
#include "WebAssemblyMCTargetDesc.h"
#include "llvm/MC/MCContext.h"
#include "llvm/MC/MCSectionELF.h"
+#include "llvm/MC/MCSectionWasm.h"
#include "llvm/MC/MCSubtargetInfo.h"
#include "llvm/MC/MCSymbolELF.h"
-#include "llvm/Support/ELF.h"
+#include "llvm/MC/MCSymbolWasm.h"
+#include "llvm/Support/Casting.h"
#include "llvm/Support/ErrorHandling.h"
#include "llvm/Support/FormattedStream.h"
using namespace llvm;
@@ -28,6 +30,10 @@ using namespace llvm;
WebAssemblyTargetStreamer::WebAssemblyTargetStreamer(MCStreamer &S)
: MCTargetStreamer(S) {}
+void WebAssemblyTargetStreamer::emitValueType(wasm::ValType Type) {
+ Streamer.EmitSLEB128IntValue(int32_t(Type));
+}
+
WebAssemblyTargetAsmStreamer::WebAssemblyTargetAsmStreamer(
MCStreamer &S, formatted_raw_ostream &OS)
: WebAssemblyTargetStreamer(S), OS(OS) {}
@@ -35,6 +41,9 @@ WebAssemblyTargetAsmStreamer::WebAssemblyTargetAsmStreamer(
WebAssemblyTargetELFStreamer::WebAssemblyTargetELFStreamer(MCStreamer &S)
: WebAssemblyTargetStreamer(S) {}
+WebAssemblyTargetWasmStreamer::WebAssemblyTargetWasmStreamer(MCStreamer &S)
+ : WebAssemblyTargetStreamer(S) {}
+
static void PrintTypes(formatted_raw_ostream &OS, ArrayRef<MVT> Types) {
bool First = true;
for (MVT Type : Types) {
@@ -47,14 +56,28 @@ static void PrintTypes(formatted_raw_ostream &OS, ArrayRef<MVT> Types) {
OS << '\n';
}
-void WebAssemblyTargetAsmStreamer::emitParam(ArrayRef<MVT> Types) {
- OS << "\t.param \t";
- PrintTypes(OS, Types);
+void WebAssemblyTargetAsmStreamer::emitParam(MCSymbol *Symbol,
+ ArrayRef<MVT> Types) {
+ if (!Types.empty()) {
+ OS << "\t.param \t";
+
+ // FIXME: Currently this applies to the "current" function; it may
+ // be cleaner to specify an explicit symbol as part of the directive.
+
+ PrintTypes(OS, Types);
+ }
}
-void WebAssemblyTargetAsmStreamer::emitResult(ArrayRef<MVT> Types) {
- OS << "\t.result \t";
- PrintTypes(OS, Types);
+void WebAssemblyTargetAsmStreamer::emitResult(MCSymbol *Symbol,
+ ArrayRef<MVT> Types) {
+ if (!Types.empty()) {
+ OS << "\t.result \t";
+
+ // FIXME: Currently this applies to the "current" function; it may
+ // be cleaner to specify an explicit symbol as part of the directive.
+
+ PrintTypes(OS, Types);
+ }
}
void WebAssemblyTargetAsmStreamer::emitLocal(ArrayRef<MVT> Types) {
@@ -64,6 +87,31 @@ void WebAssemblyTargetAsmStreamer::emitLocal(ArrayRef<MVT> Types) {
}
}
+void WebAssemblyTargetAsmStreamer::emitGlobal(
+ ArrayRef<wasm::Global> Globals) {
+ if (!Globals.empty()) {
+ OS << "\t.globalvar \t";
+
+ bool First = true;
+ for (const wasm::Global &G : Globals) {
+ if (First)
+ First = false;
+ else
+ OS << ", ";
+ OS << WebAssembly::TypeToString(G.Type);
+ if (!G.InitialModule.empty())
+ OS << '=' << G.InitialModule << ':' << G.InitialName;
+ else
+ OS << '=' << G.InitialValue;
+ }
+ OS << '\n';
+ }
+}
+
+void WebAssemblyTargetAsmStreamer::emitStackPointer(uint32_t Index) {
+ OS << "\t.stack_pointer\t" << Index << '\n';
+}
+
void WebAssemblyTargetAsmStreamer::emitEndFunc() { OS << "\t.endfunc\n"; }
void WebAssemblyTargetAsmStreamer::emitIndirectFunctionType(
@@ -88,18 +136,30 @@ void WebAssemblyTargetAsmStreamer::emitIndIdx(const MCExpr *Value) {
OS << "\t.indidx \t" << *Value << '\n';
}
-void WebAssemblyTargetELFStreamer::emitParam(ArrayRef<MVT> Types) {
+void WebAssemblyTargetELFStreamer::emitParam(MCSymbol *Symbol,
+ ArrayRef<MVT> Types) {
// Nothing to emit; params are declared as part of the function signature.
}
-void WebAssemblyTargetELFStreamer::emitResult(ArrayRef<MVT> Types) {
+void WebAssemblyTargetELFStreamer::emitResult(MCSymbol *Symbol,
+ ArrayRef<MVT> Types) {
// Nothing to emit; results are declared as part of the function signature.
}
void WebAssemblyTargetELFStreamer::emitLocal(ArrayRef<MVT> Types) {
Streamer.EmitULEB128IntValue(Types.size());
for (MVT Type : Types)
- Streamer.EmitIntValue(int64_t(WebAssembly::toValType(Type)), 1);
+ emitValueType(WebAssembly::toValType(Type));
+}
+
+void WebAssemblyTargetELFStreamer::emitGlobal(
+ ArrayRef<wasm::Global> Globals) {
+ llvm_unreachable(".globalvar encoding not yet implemented");
+}
+
+void WebAssemblyTargetELFStreamer::emitStackPointer(uint32_t Index) {
+ llvm_unreachable(".stack_pointer encoding not yet implemented");
}
void WebAssemblyTargetELFStreamer::emitEndFunc() {
@@ -117,4 +177,88 @@ void WebAssemblyTargetELFStreamer::emitIndirectFunctionType(
}
void WebAssemblyTargetELFStreamer::emitGlobalImport(StringRef name) {
-}
\ No newline at end of file
+}
+
+void WebAssemblyTargetWasmStreamer::emitParam(MCSymbol *Symbol,
+ ArrayRef<MVT> Types) {
+ SmallVector<wasm::ValType, 4> Params;
+ for (MVT Ty : Types)
+ Params.push_back(WebAssembly::toValType(Ty));
+
+ cast<MCSymbolWasm>(Symbol)->setParams(std::move(Params));
+}
+
+void WebAssemblyTargetWasmStreamer::emitResult(MCSymbol *Symbol,
+ ArrayRef<MVT> Types) {
+ SmallVector<wasm::ValType, 4> Returns;
+ for (MVT Ty : Types)
+ Returns.push_back(WebAssembly::toValType(Ty));
+
+ cast<MCSymbolWasm>(Symbol)->setReturns(std::move(Returns));
+}
+
+void WebAssemblyTargetWasmStreamer::emitLocal(ArrayRef<MVT> Types) {
+ SmallVector<std::pair<MVT, uint32_t>, 4> Grouped;
+ for (MVT Type : Types) {
+ if (Grouped.empty() || Grouped.back().first != Type)
+ Grouped.push_back(std::make_pair(Type, 1));
+ else
+ ++Grouped.back().second;
+ }
+
+ Streamer.EmitULEB128IntValue(Grouped.size());
+ for (auto Pair : Grouped) {
+ Streamer.EmitULEB128IntValue(Pair.second);
+ emitValueType(WebAssembly::toValType(Pair.first));
+ }
+}
+
+void WebAssemblyTargetWasmStreamer::emitGlobal(
+ ArrayRef<wasm::Global> Globals) {
+  // Encode the globals used by the function into the special .global_variables
+ // section. This will later be decoded and turned into contents for the
+ // Globals Section.
+ Streamer.PushSection();
+ Streamer.SwitchSection(Streamer.getContext()
+ .getWasmSection(".global_variables", 0, 0));
+ for (const wasm::Global &G : Globals) {
+ Streamer.EmitIntValue(int32_t(G.Type), 1);
+ Streamer.EmitIntValue(G.Mutable, 1);
+ if (G.InitialModule.empty()) {
+ Streamer.EmitIntValue(0, 1); // indicate that we have an int value
+ Streamer.EmitSLEB128IntValue(0);
+ } else {
+ Streamer.EmitIntValue(1, 1); // indicate that we have a module import
+ Streamer.EmitBytes(G.InitialModule);
+ Streamer.EmitIntValue(0, 1); // nul-terminate
+ Streamer.EmitBytes(G.InitialName);
+ Streamer.EmitIntValue(0, 1); // nul-terminate
+ }
+ }
+ Streamer.PopSection();
+}
+
+void WebAssemblyTargetWasmStreamer::emitStackPointer(uint32_t Index) {
+ Streamer.PushSection();
+ Streamer.SwitchSection(Streamer.getContext()
+ .getWasmSection(".stack_pointer", 0, 0));
+ Streamer.EmitIntValue(Index, 4);
+ Streamer.PopSection();
+}
+
+void WebAssemblyTargetWasmStreamer::emitEndFunc() {
+ llvm_unreachable(".end_func is not needed for direct wasm output");
+}
+
+void WebAssemblyTargetWasmStreamer::emitIndIdx(const MCExpr *Value) {
+ llvm_unreachable(".indidx encoding not yet implemented");
+}
+
+void WebAssemblyTargetWasmStreamer::emitIndirectFunctionType(
+    StringRef name, SmallVectorImpl<MVT> &Params,
+    SmallVectorImpl<MVT> &Results) {
+ // Nothing to emit here. TODO: Re-design how linking works and re-evaluate
+ // whether it's necessary for .o files to declare indirect function types.
+}
+
+void WebAssemblyTargetWasmStreamer::emitGlobalImport(StringRef name) {
+}
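
WebAssemblyTargetWasmStreamer::emitLocal above run-length encodes consecutive
locals of the same type, which is the shape the wasm binary format expects: a
group count, then (count, type) pairs. The same transformation in isolation
(groupRuns is an illustrative helper, with ints standing in for MVTs):

  #include <cassert>
  #include <cstdint>
  #include <utility>
  #include <vector>

  template <typename T>
  std::vector<std::pair<T, uint32_t>> groupRuns(const std::vector<T> &Items) {
    std::vector<std::pair<T, uint32_t>> Grouped;
    for (const T &Item : Items) {
      if (Grouped.empty() || Grouped.back().first != Item)
        Grouped.push_back({Item, 1});
      else
        ++Grouped.back().second;
    }
    return Grouped;
  }

  int main() {
    // Locals [i32, i32, i64, i64, i64] collapse into two groups.
    auto G = groupRuns<int>({32, 32, 64, 64, 64});
    assert(G.size() == 2 && G[0].second == 2 && G[1].second == 3);
  }
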
diff --git a/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyTargetStreamer.h b/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyTargetStreamer.h
index 23ac3190243a..68d6747298df 100644
--- a/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyTargetStreamer.h
+++ b/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyTargetStreamer.h
@@ -18,10 +18,12 @@
#include "llvm/CodeGen/MachineValueType.h"
#include "llvm/MC/MCStreamer.h"
+#include "llvm/Support/Wasm.h"
namespace llvm {
class MCELFStreamer;
+class MCWasmStreamer;
/// WebAssembly-specific streamer interface, to implement support
/// WebAssembly-specific assembly directives.
@@ -30,11 +32,15 @@ public:
explicit WebAssemblyTargetStreamer(MCStreamer &S);
/// .param
- virtual void emitParam(ArrayRef<MVT> Types) = 0;
+ virtual void emitParam(MCSymbol *Symbol, ArrayRef<MVT> Types) = 0;
/// .result
- virtual void emitResult(ArrayRef<MVT> Types) = 0;
+ virtual void emitResult(MCSymbol *Symbol, ArrayRef<MVT> Types) = 0;
/// .local
virtual void emitLocal(ArrayRef<MVT> Types) = 0;
+ /// .globalvar
+ virtual void emitGlobal(ArrayRef<wasm::Global> Globals) = 0;
+ /// .stack_pointer
+ virtual void emitStackPointer(uint32_t Index) = 0;
/// .endfunc
virtual void emitEndFunc() = 0;
/// .functype
@@ -47,6 +53,9 @@ public:
virtual void emitIndIdx(const MCExpr *Value) = 0;
/// .import_global
virtual void emitGlobalImport(StringRef name) = 0;
+
+protected:
+ void emitValueType(wasm::ValType Type);
};
/// This part is for ascii assembly output
@@ -56,9 +65,11 @@ class WebAssemblyTargetAsmStreamer final : public WebAssemblyTargetStreamer {
public:
WebAssemblyTargetAsmStreamer(MCStreamer &S, formatted_raw_ostream &OS);
- void emitParam(ArrayRef<MVT> Types) override;
- void emitResult(ArrayRef<MVT> Types) override;
+ void emitParam(MCSymbol *Symbol, ArrayRef<MVT> Types) override;
+ void emitResult(MCSymbol *Symbol, ArrayRef<MVT> Types) override;
void emitLocal(ArrayRef<MVT> Types) override;
+ void emitGlobal(ArrayRef<wasm::Global> Globals) override;
+ void emitStackPointer(uint32_t Index) override;
void emitEndFunc() override;
void emitIndirectFunctionType(StringRef name,
SmallVectorImpl<MVT> &Params,
@@ -72,9 +83,29 @@ class WebAssemblyTargetELFStreamer final : public WebAssemblyTargetStreamer {
public:
explicit WebAssemblyTargetELFStreamer(MCStreamer &S);
- void emitParam(ArrayRef<MVT> Types) override;
- void emitResult(ArrayRef<MVT> Types) override;
+ void emitParam(MCSymbol *Symbol, ArrayRef<MVT> Types) override;
+ void emitResult(MCSymbol *Symbol, ArrayRef<MVT> Types) override;
+ void emitLocal(ArrayRef<MVT> Types) override;
+ void emitGlobal(ArrayRef<wasm::Global> Globals) override;
+ void emitStackPointer(uint32_t Index) override;
+ void emitEndFunc() override;
+ void emitIndirectFunctionType(StringRef name,
+ SmallVectorImpl<MVT> &Params,
+ SmallVectorImpl<MVT> &Results) override;
+ void emitIndIdx(const MCExpr *Value) override;
+ void emitGlobalImport(StringRef name) override;
+};
+
+/// This part is for Wasm object output
+class WebAssemblyTargetWasmStreamer final : public WebAssemblyTargetStreamer {
+public:
+ explicit WebAssemblyTargetWasmStreamer(MCStreamer &S);
+
+ void emitParam(MCSymbol *Symbol, ArrayRef<MVT> Types) override;
+ void emitResult(MCSymbol *Symbol, ArrayRef<MVT> Types) override;
void emitLocal(ArrayRef<MVT> Types) override;
+ void emitGlobal(ArrayRef<wasm::Global> Globals) override;
+ void emitStackPointer(uint32_t Index) override;
void emitEndFunc() override;
void emitIndirectFunctionType(StringRef name,
SmallVectorImpl<MVT> &Params,
diff --git a/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyWasmObjectWriter.cpp b/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyWasmObjectWriter.cpp
new file mode 100644
index 000000000000..2846ec5e9337
--- /dev/null
+++ b/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyWasmObjectWriter.cpp
@@ -0,0 +1,92 @@
+//===-- WebAssemblyWasmObjectWriter.cpp - WebAssembly Wasm Writer ---------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+///
+/// \file
+/// \brief This file handles Wasm-specific object emission, converting LLVM's
+/// internal fixups into the appropriate relocations.
+///
+//===----------------------------------------------------------------------===//
+
+#include "MCTargetDesc/WebAssemblyMCTargetDesc.h"
+#include "MCTargetDesc/WebAssemblyFixupKinds.h"
+#include "llvm/MC/MCFixup.h"
+#include "llvm/MC/MCSymbolWasm.h"
+#include "llvm/MC/MCWasmObjectWriter.h"
+#include "llvm/Support/Casting.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/Wasm.h"
+using namespace llvm;
+
+namespace {
+class WebAssemblyWasmObjectWriter final : public MCWasmObjectTargetWriter {
+public:
+ explicit WebAssemblyWasmObjectWriter(bool Is64Bit);
+
+private:
+ unsigned getRelocType(MCContext &Ctx, const MCValue &Target,
+ const MCFixup &Fixup, bool IsPCRel) const override;
+};
+} // end anonymous namespace
+
+WebAssemblyWasmObjectWriter::WebAssemblyWasmObjectWriter(bool Is64Bit)
+ : MCWasmObjectTargetWriter(Is64Bit) {}
+
+// Test whether the given expression computes a function address.
+static bool IsFunctionExpr(const MCExpr *Expr) {
+  if (const MCSymbolRefExpr *SyExp = dyn_cast<MCSymbolRefExpr>(Expr))
+    return cast<MCSymbolWasm>(SyExp->getSymbol()).isFunction();
+
+  if (const MCBinaryExpr *BinOp = dyn_cast<MCBinaryExpr>(Expr))
+    return IsFunctionExpr(BinOp->getLHS()) != IsFunctionExpr(BinOp->getRHS());
+
+  if (const MCUnaryExpr *UnOp = dyn_cast<MCUnaryExpr>(Expr))
+    return IsFunctionExpr(UnOp->getSubExpr());
+
+ return false;
+}
+
+unsigned WebAssemblyWasmObjectWriter::getRelocType(MCContext &Ctx,
+ const MCValue &Target,
+ const MCFixup &Fixup,
+ bool IsPCRel) const {
+ // WebAssembly functions are not allocated in the data address space. To
+ // resolve a pointer to a function, we must use a special relocation type.
+ bool IsFunction = IsFunctionExpr(Fixup.getValue());
+
+ assert(!IsPCRel);
+ switch (unsigned(Fixup.getKind())) {
+ case WebAssembly::fixup_code_sleb128_i32:
+ if (IsFunction)
+ return wasm::R_WEBASSEMBLY_TABLE_INDEX_SLEB;
+ return wasm::R_WEBASSEMBLY_GLOBAL_ADDR_SLEB;
+ case WebAssembly::fixup_code_sleb128_i64:
+ llvm_unreachable("fixup_sleb128_i64 not implemented yet");
+ case WebAssembly::fixup_code_uleb128_i32:
+ if (IsFunction)
+ return wasm::R_WEBASSEMBLY_FUNCTION_INDEX_LEB;
+ return wasm::R_WEBASSEMBLY_GLOBAL_ADDR_LEB;
+ case FK_Data_4:
+ if (IsFunction)
+ return wasm::R_WEBASSEMBLY_TABLE_INDEX_I32;
+ return wasm::R_WEBASSEMBLY_GLOBAL_ADDR_I32;
+ case FK_Data_8:
+ llvm_unreachable("FK_Data_8 not implemented yet");
+ default:
+ llvm_unreachable("unimplemented fixup kind");
+ }
+}
+
+MCObjectWriter *llvm::createWebAssemblyWasmObjectWriter(raw_pwrite_stream &OS,
+ bool Is64Bit) {
+ MCWasmObjectTargetWriter *MOTW = new WebAssemblyWasmObjectWriter(Is64Bit);
+ return createWasmObjectWriter(MOTW, OS);
+}
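
One detail of IsFunctionExpr worth spelling out: the '!=' between the two
recursive calls is a boolean XOR, so an expression counts as a function
address when exactly one side of a binary operator does. A hypothetical
walk-through, assuming f and g are symbols marked isFunction():

  //   IsFunctionExpr(f)        --> true
  //   IsFunctionExpr(f + 4)    --> true  != false --> true
  //   IsFunctionExpr(f - g)    --> true  != true  --> false (plain number)
  //   IsFunctionExpr(base + 8) --> false != false --> false

This is why a function difference like f - g falls through to the
GLOBAL_ADDR relocation kinds in getRelocType rather than a TABLE_INDEX one.
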
diff --git a/lib/Target/WebAssembly/README.txt b/lib/Target/WebAssembly/README.txt
index 64991ad14071..3433b1553e8c 100644
--- a/lib/Target/WebAssembly/README.txt
+++ b/lib/Target/WebAssembly/README.txt
@@ -145,3 +145,24 @@ WebAssemblyRegStackify could be extended, or possibly rewritten, to take
advantage of the new opportunities.
//===---------------------------------------------------------------------===//
+
+Add support for mergeable sections in the Wasm writer, such as for strings and
+floating-point constants.
+
+//===---------------------------------------------------------------------===//
+
+The function @dynamic_alloca_redzone in test/CodeGen/WebAssembly/userstack.ll
+ends up with a tee_local in its prolog which has an unused result, requiring
+an extra drop:
+
+ get_global $push8=, 0
+ tee_local $push9=, 1, $pop8
+ drop $pop9
+ [...]
+
+The prologue code initially thinks it needs an FP register, but later it
+turns out to be unneeded, so this could be addressed either by being more
+clever about not inserting code for an FP in the first place, or by
+optimizing away the copy later.
+
+//===---------------------------------------------------------------------===//
diff --git a/lib/Target/WebAssembly/WebAssembly.h b/lib/Target/WebAssembly/WebAssembly.h
index 8738263ad847..e04c4db19c8c 100644
--- a/lib/Target/WebAssembly/WebAssembly.h
+++ b/lib/Target/WebAssembly/WebAssembly.h
@@ -46,6 +46,7 @@ FunctionPass *createWebAssemblyRegStackify();
FunctionPass *createWebAssemblyRegColoring();
FunctionPass *createWebAssemblyExplicitLocals();
FunctionPass *createWebAssemblyFixIrreducibleControlFlow();
+FunctionPass *createWebAssemblyCFGSort();
FunctionPass *createWebAssemblyCFGStackify();
FunctionPass *createWebAssemblyLowerBrUnless();
FunctionPass *createWebAssemblyRegNumbering();
diff --git a/lib/Target/WebAssembly/WebAssemblyAsmPrinter.cpp b/lib/Target/WebAssembly/WebAssemblyAsmPrinter.cpp
index 5b4b82eb5603..d9c2dba5bace 100644
--- a/lib/Target/WebAssembly/WebAssemblyAsmPrinter.cpp
+++ b/lib/Target/WebAssembly/WebAssemblyAsmPrinter.cpp
@@ -14,6 +14,7 @@
///
//===----------------------------------------------------------------------===//
+#include "WebAssemblyAsmPrinter.h"
#include "InstPrinter/WebAssemblyInstPrinter.h"
#include "MCTargetDesc/WebAssemblyMCTargetDesc.h"
#include "MCTargetDesc/WebAssemblyTargetStreamer.h"
@@ -21,13 +22,14 @@
#include "WebAssemblyMCInstLower.h"
#include "WebAssemblyMachineFunctionInfo.h"
#include "WebAssemblyRegisterInfo.h"
-#include "WebAssemblySubtarget.h"
#include "llvm/ADT/StringExtras.h"
#include "llvm/CodeGen/Analysis.h"
#include "llvm/CodeGen/AsmPrinter.h"
#include "llvm/CodeGen/MachineConstantPool.h"
#include "llvm/CodeGen/MachineInstr.h"
+#include "llvm/CodeGen/MachineModuleInfoImpls.h"
#include "llvm/IR/DataLayout.h"
+#include "llvm/IR/GlobalVariable.h"
#include "llvm/MC/MCContext.h"
#include "llvm/MC/MCStreamer.h"
#include "llvm/MC/MCSymbol.h"
@@ -38,56 +40,6 @@ using namespace llvm;
#define DEBUG_TYPE "asm-printer"
-namespace {
-
-class WebAssemblyAsmPrinter final : public AsmPrinter {
- const MachineRegisterInfo *MRI;
- WebAssemblyFunctionInfo *MFI;
-
-public:
- WebAssemblyAsmPrinter(TargetMachine &TM, std::unique_ptr<MCStreamer> Streamer)
- : AsmPrinter(TM, std::move(Streamer)), MRI(nullptr), MFI(nullptr) {}
-
-private:
- StringRef getPassName() const override {
- return "WebAssembly Assembly Printer";
- }
-
- //===------------------------------------------------------------------===//
- // MachineFunctionPass Implementation.
- //===------------------------------------------------------------------===//
-
- bool runOnMachineFunction(MachineFunction &MF) override {
- MRI = &MF.getRegInfo();
- MFI = MF.getInfo<WebAssemblyFunctionInfo>();
- return AsmPrinter::runOnMachineFunction(MF);
- }
-
- //===------------------------------------------------------------------===//
- // AsmPrinter Implementation.
- //===------------------------------------------------------------------===//
-
- void EmitEndOfAsmFile(Module &M) override;
- void EmitJumpTableInfo() override;
- void EmitConstantPool() override;
- void EmitFunctionBodyStart() override;
- void EmitFunctionBodyEnd() override;
- void EmitInstruction(const MachineInstr *MI) override;
- const MCExpr *lowerConstant(const Constant *CV) override;
- bool PrintAsmOperand(const MachineInstr *MI, unsigned OpNo,
- unsigned AsmVariant, const char *ExtraCode,
- raw_ostream &OS) override;
- bool PrintAsmMemoryOperand(const MachineInstr *MI, unsigned OpNo,
- unsigned AsmVariant, const char *ExtraCode,
- raw_ostream &OS) override;
-
- MVT getRegType(unsigned RegNo) const;
- std::string regToString(const MachineOperand &MO);
- WebAssemblyTargetStreamer *getTargetStreamer();
-};
-
-} // end anonymous namespace
-
//===----------------------------------------------------------------------===//
// Helpers.
//===----------------------------------------------------------------------===//
@@ -135,9 +87,19 @@ void WebAssemblyAsmPrinter::EmitEndOfAsmFile(Module &M) {
}
for (const auto &G : M.globals()) {
if (!G.hasInitializer() && G.hasExternalLinkage()) {
+ uint16_t Size = M.getDataLayout().getTypeAllocSize(G.getValueType());
getTargetStreamer()->emitGlobalImport(G.getGlobalIdentifier());
+ OutStreamer->emitELFSize(getSymbol(&G),
+ MCConstantExpr::create(Size, OutContext));
}
}
+
+ if (!TM.getTargetTriple().isOSBinFormatELF()) {
+ MachineModuleInfoWasm &MMIW = MMI->getObjFileInfo<MachineModuleInfoWasm>();
+ getTargetStreamer()->emitGlobal(MMIW.getGlobals());
+ if (MMIW.hasStackPointerGlobal())
+ getTargetStreamer()->emitStackPointer(MMIW.getStackPointerGlobal());
+ }
}
void WebAssemblyAsmPrinter::EmitConstantPool() {
@@ -150,8 +112,7 @@ void WebAssemblyAsmPrinter::EmitJumpTableInfo() {
}
void WebAssemblyAsmPrinter::EmitFunctionBodyStart() {
- if (!MFI->getParams().empty())
- getTargetStreamer()->emitParam(MFI->getParams());
+ getTargetStreamer()->emitParam(CurrentFnSym, MFI->getParams());
SmallVector<MVT, 4> ResultVTs;
const Function &F(*MF->getFunction());
@@ -169,23 +130,26 @@ void WebAssemblyAsmPrinter::EmitFunctionBodyStart() {
// If the return type needs to be legalized it will get converted into
// passing a pointer.
if (ResultVTs.size() == 1)
- getTargetStreamer()->emitResult(ResultVTs);
-
- // FIXME: When ExplicitLocals is enabled by default, we won't need
- // to define the locals here (and MFI can go back to being pointer-to-const).
- for (unsigned Idx = 0, IdxE = MRI->getNumVirtRegs(); Idx != IdxE; ++Idx) {
- unsigned VReg = TargetRegisterInfo::index2VirtReg(Idx);
- unsigned WAReg = MFI->getWAReg(VReg);
- // Don't declare unused registers.
- if (WAReg == WebAssemblyFunctionInfo::UnusedReg)
- continue;
- // Don't redeclare parameters.
- if (WAReg < MFI->getParams().size())
- continue;
- // Don't declare stackified registers.
- if (int(WAReg) < 0)
- continue;
- MFI->addLocal(getRegType(VReg));
+ getTargetStreamer()->emitResult(CurrentFnSym, ResultVTs);
+ else
+ getTargetStreamer()->emitResult(CurrentFnSym, ArrayRef<MVT>());
+
+ if (TM.getTargetTriple().isOSBinFormatELF()) {
+ assert(MFI->getLocals().empty());
+ for (unsigned Idx = 0, IdxE = MRI->getNumVirtRegs(); Idx != IdxE; ++Idx) {
+ unsigned VReg = TargetRegisterInfo::index2VirtReg(Idx);
+ unsigned WAReg = MFI->getWAReg(VReg);
+ // Don't declare unused registers.
+ if (WAReg == WebAssemblyFunctionInfo::UnusedReg)
+ continue;
+ // Don't redeclare parameters.
+ if (WAReg < MFI->getParams().size())
+ continue;
+ // Don't declare stackified registers.
+ if (int(WAReg) < 0)
+ continue;
+ MFI->addLocal(getRegType(VReg));
+ }
}
getTargetStreamer()->emitLocal(MFI->getLocals());
@@ -194,7 +158,8 @@ void WebAssemblyAsmPrinter::EmitFunctionBodyStart() {
}
void WebAssemblyAsmPrinter::EmitFunctionBodyEnd() {
- getTargetStreamer()->emitEndFunc();
+ if (TM.getTargetTriple().isOSBinFormatELF())
+ getTargetStreamer()->emitEndFunc();
}
void WebAssemblyAsmPrinter::EmitInstruction(const MachineInstr *MI) {
diff --git a/lib/Target/WebAssembly/WebAssemblyAsmPrinter.h b/lib/Target/WebAssembly/WebAssemblyAsmPrinter.h
new file mode 100644
index 000000000000..c8917b8d7e48
--- /dev/null
+++ b/lib/Target/WebAssembly/WebAssemblyAsmPrinter.h
@@ -0,0 +1,77 @@
+// WebAssemblyAsmPrinter.h - WebAssembly implementation of AsmPrinter-*- C++ -*-
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_TARGET_WEBASSEMBLY_WEBASSEMBLYASMPRINTER_H
+#define LLVM_LIB_TARGET_WEBASSEMBLY_WEBASSEMBLYASMPRINTER_H
+
+#include "WebAssemblySubtarget.h"
+#include "llvm/CodeGen/AsmPrinter.h"
+#include "llvm/MC/MCStreamer.h"
+#include "llvm/Target/TargetMachine.h"
+
+namespace llvm {
+class MCSymbol;
+class WebAssemblyFunctionInfo;
+class WebAssemblyTargetStreamer;
+class WebAssemblyMCInstLower;
+
+class LLVM_LIBRARY_VISIBILITY WebAssemblyAsmPrinter final : public AsmPrinter {
+ const WebAssemblySubtarget *Subtarget;
+ const MachineRegisterInfo *MRI;
+ WebAssemblyFunctionInfo *MFI;
+
+public:
+ explicit WebAssemblyAsmPrinter(TargetMachine &TM,
+ std::unique_ptr<MCStreamer> Streamer)
+ : AsmPrinter(TM, std::move(Streamer)),
+ Subtarget(nullptr), MRI(nullptr), MFI(nullptr) {}
+
+ StringRef getPassName() const override {
+ return "WebAssembly Assembly Printer";
+ }
+
+ const WebAssemblySubtarget &getSubtarget() const { return *Subtarget; }
+
+ //===------------------------------------------------------------------===//
+ // MachineFunctionPass Implementation.
+ //===------------------------------------------------------------------===//
+
+ bool runOnMachineFunction(MachineFunction &MF) override {
+ Subtarget = &MF.getSubtarget<WebAssemblySubtarget>();
+ MRI = &MF.getRegInfo();
+ MFI = MF.getInfo<WebAssemblyFunctionInfo>();
+ return AsmPrinter::runOnMachineFunction(MF);
+ }
+
+ //===------------------------------------------------------------------===//
+ // AsmPrinter Implementation.
+ //===------------------------------------------------------------------===//
+
+ void EmitEndOfAsmFile(Module &M) override;
+ void EmitJumpTableInfo() override;
+ void EmitConstantPool() override;
+ void EmitFunctionBodyStart() override;
+ void EmitFunctionBodyEnd() override;
+ void EmitInstruction(const MachineInstr *MI) override;
+ const MCExpr *lowerConstant(const Constant *CV) override;
+ bool PrintAsmOperand(const MachineInstr *MI, unsigned OpNo,
+ unsigned AsmVariant, const char *ExtraCode,
+ raw_ostream &OS) override;
+ bool PrintAsmMemoryOperand(const MachineInstr *MI, unsigned OpNo,
+ unsigned AsmVariant, const char *ExtraCode,
+ raw_ostream &OS) override;
+
+ MVT getRegType(unsigned RegNo) const;
+ std::string regToString(const MachineOperand &MO);
+ WebAssemblyTargetStreamer *getTargetStreamer();
+};
+
+} // end namespace llvm
+
+#endif
diff --git a/lib/Target/WebAssembly/WebAssemblyCFGSort.cpp b/lib/Target/WebAssembly/WebAssemblyCFGSort.cpp
new file mode 100644
index 000000000000..40e1928197bc
--- /dev/null
+++ b/lib/Target/WebAssembly/WebAssemblyCFGSort.cpp
@@ -0,0 +1,277 @@
+//===-- WebAssemblyCFGSort.cpp - CFG Sorting ------------------------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+///
+/// \file
+/// \brief This file implements a CFG sorting pass.
+///
+/// This pass reorders the blocks in a function to put them into topological
+/// order, ignoring loop backedges, and without any loop being interrupted
+/// by a block not dominated by the loop header, with special care to keep the
+/// order as similar as possible to the original order.
+///
+//===----------------------------------------------------------------------===//
+
+#include "WebAssembly.h"
+#include "MCTargetDesc/WebAssemblyMCTargetDesc.h"
+#include "WebAssemblySubtarget.h"
+#include "WebAssemblyUtilities.h"
+#include "llvm/ADT/PriorityQueue.h"
+#include "llvm/ADT/SetVector.h"
+#include "llvm/CodeGen/MachineDominators.h"
+#include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/CodeGen/MachineLoopInfo.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/CodeGen/Passes.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/raw_ostream.h"
+using namespace llvm;
+
+#define DEBUG_TYPE "wasm-cfg-sort"
+
+namespace {
+class WebAssemblyCFGSort final : public MachineFunctionPass {
+ StringRef getPassName() const override { return "WebAssembly CFG Sort"; }
+
+ void getAnalysisUsage(AnalysisUsage &AU) const override {
+ AU.setPreservesCFG();
+ AU.addRequired<MachineDominatorTree>();
+ AU.addPreserved<MachineDominatorTree>();
+ AU.addRequired<MachineLoopInfo>();
+ AU.addPreserved<MachineLoopInfo>();
+ MachineFunctionPass::getAnalysisUsage(AU);
+ }
+
+ bool runOnMachineFunction(MachineFunction &MF) override;
+
+public:
+ static char ID; // Pass identification, replacement for typeid
+ WebAssemblyCFGSort() : MachineFunctionPass(ID) {}
+};
+} // end anonymous namespace
+
+char WebAssemblyCFGSort::ID = 0;
+FunctionPass *llvm::createWebAssemblyCFGSort() {
+ return new WebAssemblyCFGSort();
+}
+
+static void MaybeUpdateTerminator(MachineBasicBlock *MBB) {
+#ifndef NDEBUG
+ bool AnyBarrier = false;
+#endif
+ bool AllAnalyzable = true;
+ for (const MachineInstr &Term : MBB->terminators()) {
+#ifndef NDEBUG
+ AnyBarrier |= Term.isBarrier();
+#endif
+ AllAnalyzable &= Term.isBranch() && !Term.isIndirectBranch();
+ }
+ assert((AnyBarrier || AllAnalyzable) &&
+ "AnalyzeBranch needs to analyze any block with a fallthrough");
+ if (AllAnalyzable)
+ MBB->updateTerminator();
+}
+
+namespace {
+/// Sort blocks by their number.
+struct CompareBlockNumbers {
+ bool operator()(const MachineBasicBlock *A,
+ const MachineBasicBlock *B) const {
+ return A->getNumber() > B->getNumber();
+ }
+};
+/// Sort blocks by their number in the opposite order.
+struct CompareBlockNumbersBackwards {
+ bool operator()(const MachineBasicBlock *A,
+ const MachineBasicBlock *B) const {
+ return A->getNumber() < B->getNumber();
+ }
+};
+/// Bookkeeping for a loop to help ensure that we don't mix blocks not dominated
+/// by the loop header among the loop's blocks.
+struct Entry {
+ const MachineLoop *Loop;
+ unsigned NumBlocksLeft;
+
+ /// List of blocks not dominated by Loop's header that are deferred until
+ /// after all of Loop's blocks have been seen.
+ std::vector<MachineBasicBlock *> Deferred;
+
+ explicit Entry(const MachineLoop *L)
+ : Loop(L), NumBlocksLeft(L->getNumBlocks()) {}
+};
+} // end anonymous namespace
+
+/// Sort the blocks, taking special care to make sure that loops are not
+/// interrupted by blocks not dominated by their header.
+/// TODO: There are many opportunities for improving the heuristics here.
+/// Explore them.
+static void SortBlocks(MachineFunction &MF, const MachineLoopInfo &MLI,
+ const MachineDominatorTree &MDT) {
+ // Prepare for a topological sort: Record the number of predecessors each
+ // block has, ignoring loop backedges.
+ MF.RenumberBlocks();
+ SmallVector<unsigned, 16> NumPredsLeft(MF.getNumBlockIDs(), 0);
+ for (MachineBasicBlock &MBB : MF) {
+ unsigned N = MBB.pred_size();
+ if (MachineLoop *L = MLI.getLoopFor(&MBB))
+ if (L->getHeader() == &MBB)
+ for (const MachineBasicBlock *Pred : MBB.predecessors())
+ if (L->contains(Pred))
+ --N;
+ NumPredsLeft[MBB.getNumber()] = N;
+ }
+
+ // Topological sort the CFG, with additional constraints:
+ // - Between a loop header and the last block in the loop, there can be
+ // no blocks not dominated by the loop header.
+ // - It's desirable to preserve the original block order when possible.
+ // We use two ready lists; Preferred and Ready. Preferred has recently
+  // processed successors, to help preserve block sequences from the original
+ // order. Ready has the remaining ready blocks.
+ PriorityQueue<MachineBasicBlock *, std::vector<MachineBasicBlock *>,
+ CompareBlockNumbers>
+ Preferred;
+ PriorityQueue<MachineBasicBlock *, std::vector<MachineBasicBlock *>,
+ CompareBlockNumbersBackwards>
+ Ready;
+ SmallVector<Entry, 4> Loops;
+ for (MachineBasicBlock *MBB = &MF.front();;) {
+ const MachineLoop *L = MLI.getLoopFor(MBB);
+ if (L) {
+ // If MBB is a loop header, add it to the active loop list. We can't put
+ // any blocks that it doesn't dominate until we see the end of the loop.
+ if (L->getHeader() == MBB)
+ Loops.push_back(Entry(L));
+ // For each active loop the block is in, decrement the count. If MBB is
+ // the last block in an active loop, take it off the list and pick up any
+ // blocks deferred because the header didn't dominate them.
+ for (Entry &E : Loops)
+ if (E.Loop->contains(MBB) && --E.NumBlocksLeft == 0)
+ for (auto DeferredBlock : E.Deferred)
+ Ready.push(DeferredBlock);
+ while (!Loops.empty() && Loops.back().NumBlocksLeft == 0)
+ Loops.pop_back();
+ }
+ // The main topological sort logic.
+ for (MachineBasicBlock *Succ : MBB->successors()) {
+ // Ignore backedges.
+ if (MachineLoop *SuccL = MLI.getLoopFor(Succ))
+ if (SuccL->getHeader() == Succ && SuccL->contains(MBB))
+ continue;
+ // Decrement the predecessor count. If it's now zero, it's ready.
+ if (--NumPredsLeft[Succ->getNumber()] == 0)
+ Preferred.push(Succ);
+ }
+ // Determine the block to follow MBB. First try to find a preferred block,
+ // to preserve the original block order when possible.
+ MachineBasicBlock *Next = nullptr;
+ while (!Preferred.empty()) {
+ Next = Preferred.top();
+ Preferred.pop();
+      // If Next isn't dominated by the top active loop header, defer it
+      // until that loop is done.
+ if (!Loops.empty() &&
+ !MDT.dominates(Loops.back().Loop->getHeader(), Next)) {
+ Loops.back().Deferred.push_back(Next);
+ Next = nullptr;
+ continue;
+ }
+ // If Next was originally ordered before MBB, and it isn't because it was
+ // loop-rotated above the header, it's not preferred.
+ if (Next->getNumber() < MBB->getNumber() &&
+ (!L || !L->contains(Next) ||
+ L->getHeader()->getNumber() < Next->getNumber())) {
+ Ready.push(Next);
+ Next = nullptr;
+ continue;
+ }
+ break;
+ }
+ // If we didn't find a suitable block in the Preferred list, check the
+ // general Ready list.
+ if (!Next) {
+ // If there are no more blocks to process, we're done.
+ if (Ready.empty()) {
+ MaybeUpdateTerminator(MBB);
+ break;
+ }
+ for (;;) {
+ Next = Ready.top();
+ Ready.pop();
+ // If Next isn't dominated by the top active loop header, defer it until
+ // that loop is done.
+ if (!Loops.empty() &&
+ !MDT.dominates(Loops.back().Loop->getHeader(), Next)) {
+ Loops.back().Deferred.push_back(Next);
+ continue;
+ }
+ break;
+ }
+ }
+ // Move the next block into place and iterate.
+ Next->moveAfter(MBB);
+ MaybeUpdateTerminator(MBB);
+ MBB = Next;
+ }
+ assert(Loops.empty() && "Active loop list not finished");
+ MF.RenumberBlocks();
+
+#ifndef NDEBUG
+ SmallSetVector<MachineLoop *, 8> OnStack;
+
+ // Insert a sentinel representing the degenerate loop that starts at the
+ // function entry block and includes the entire function as a "loop" that
+ // executes once.
+ OnStack.insert(nullptr);
+
+ for (auto &MBB : MF) {
+ assert(MBB.getNumber() >= 0 && "Renumbered blocks should be non-negative.");
+
+ MachineLoop *Loop = MLI.getLoopFor(&MBB);
+ if (Loop && &MBB == Loop->getHeader()) {
+ // Loop header. The loop predecessor should be sorted above, and the other
+ // predecessors should be backedges below.
+ for (auto Pred : MBB.predecessors())
+ assert(
+ (Pred->getNumber() < MBB.getNumber() || Loop->contains(Pred)) &&
+ "Loop header predecessors must be loop predecessors or backedges");
+ assert(OnStack.insert(Loop) && "Loops should be declared at most once.");
+ } else {
+ // Not a loop header. All predecessors should be sorted above.
+ for (auto Pred : MBB.predecessors())
+ assert(Pred->getNumber() < MBB.getNumber() &&
+ "Non-loop-header predecessors should be topologically sorted");
+ assert(OnStack.count(MLI.getLoopFor(&MBB)) &&
+ "Blocks must be nested in their loops");
+ }
+ while (OnStack.size() > 1 && &MBB == LoopBottom(OnStack.back()))
+ OnStack.pop_back();
+ }
+ assert(OnStack.pop_back_val() == nullptr &&
+ "The function entry block shouldn't actually be a loop header");
+ assert(OnStack.empty() &&
+ "Control flow stack pushes and pops should be balanced.");
+#endif
+}
+
+bool WebAssemblyCFGSort::runOnMachineFunction(MachineFunction &MF) {
+ DEBUG(dbgs() << "********** CFG Sorting **********\n"
+ "********** Function: "
+ << MF.getName() << '\n');
+
+ const auto &MLI = getAnalysis<MachineLoopInfo>();
+ auto &MDT = getAnalysis<MachineDominatorTree>();
+ // Liveness is not tracked for VALUE_STACK physreg.
+ MF.getRegInfo().invalidateLiveness();
+
+ // Sort the blocks, with contiguous loops.
+ SortBlocks(MF, MLI, MDT);
+
+ return true;
+}
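
The sort above is Kahn's topological sort with two ready lists: Preferred
pops the lowest-numbered block so runs of the original layout are extended,
and any preferred block that was originally ordered before the current one is
demoted to Ready. A toy model on a diamond CFG (loop handling omitted; plain
ints stand in for MachineBasicBlock numbers):

  #include <cstdio>
  #include <functional>
  #include <queue>
  #include <vector>

  int main() {
    // CFG: 0 -> {1, 2}, 1 -> {3}, 2 -> {3}; original order is 0, 1, 2, 3.
    std::vector<std::vector<int>> Succs = {{1, 2}, {3}, {3}, {}};
    std::vector<int> NumPredsLeft = {0, 1, 1, 2};

    std::priority_queue<int, std::vector<int>, std::greater<int>>
        Preferred; // pops lowest number, like CompareBlockNumbers
    std::priority_queue<int> Ready;

    int MBB = 0;
    printf("%d", MBB);
    for (;;) {
      for (int Succ : Succs[MBB])
        if (--NumPredsLeft[Succ] == 0)
          Preferred.push(Succ);
      int Next = -1;
      while (!Preferred.empty()) {
        Next = Preferred.top();
        Preferred.pop();
        if (Next < MBB) { // originally ordered earlier: demote
          Ready.push(Next);
          Next = -1;
          continue;
        }
        break;
      }
      if (Next < 0) {
        if (Ready.empty())
          break;
        Next = Ready.top();
        Ready.pop();
      }
      printf(" %d", Next);
      MBB = Next;
    }
    printf("\n"); // prints: 0 1 2 3
  }
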
diff --git a/lib/Target/WebAssembly/WebAssemblyCFGStackify.cpp b/lib/Target/WebAssembly/WebAssemblyCFGStackify.cpp
index 49b9754e6b62..bd11d1b46906 100644
--- a/lib/Target/WebAssembly/WebAssemblyCFGStackify.cpp
+++ b/lib/Target/WebAssembly/WebAssemblyCFGStackify.cpp
@@ -10,12 +10,7 @@
/// \file
/// \brief This file implements a CFG stacking pass.
///
-/// This pass reorders the blocks in a function to put them into topological
-/// order, ignoring loop backedges, and without any loop being interrupted
-/// by a block not dominated by the loop header, with special care to keep the
-/// order as similar as possible to the original order.
-///
-/// Then, it inserts BLOCK and LOOP markers to mark the start of scopes, since
+/// This pass inserts BLOCK and LOOP markers to mark the start of scopes, since
/// scope boundaries serve as the labels for WebAssembly's control transfers.
///
/// This is sufficient to convert arbitrary CFGs into a form that works on
@@ -28,8 +23,6 @@
#include "WebAssemblyMachineFunctionInfo.h"
#include "WebAssemblySubtarget.h"
#include "WebAssemblyUtilities.h"
-#include "llvm/ADT/PriorityQueue.h"
-#include "llvm/ADT/SetVector.h"
#include "llvm/CodeGen/MachineDominators.h"
#include "llvm/CodeGen/MachineFunction.h"
#include "llvm/CodeGen/MachineInstrBuilder.h"
@@ -68,217 +61,6 @@ FunctionPass *llvm::createWebAssemblyCFGStackify() {
return new WebAssemblyCFGStackify();
}
-/// Return the "bottom" block of a loop. This differs from
-/// MachineLoop::getBottomBlock in that it works even if the loop is
-/// discontiguous.
-static MachineBasicBlock *LoopBottom(const MachineLoop *Loop) {
- MachineBasicBlock *Bottom = Loop->getHeader();
- for (MachineBasicBlock *MBB : Loop->blocks())
- if (MBB->getNumber() > Bottom->getNumber())
- Bottom = MBB;
- return Bottom;
-}
-
-static void MaybeUpdateTerminator(MachineBasicBlock *MBB) {
-#ifndef NDEBUG
- bool AnyBarrier = false;
-#endif
- bool AllAnalyzable = true;
- for (const MachineInstr &Term : MBB->terminators()) {
-#ifndef NDEBUG
- AnyBarrier |= Term.isBarrier();
-#endif
- AllAnalyzable &= Term.isBranch() && !Term.isIndirectBranch();
- }
- assert((AnyBarrier || AllAnalyzable) &&
- "AnalyzeBranch needs to analyze any block with a fallthrough");
- if (AllAnalyzable)
- MBB->updateTerminator();
-}
-
-namespace {
-/// Sort blocks by their number.
-struct CompareBlockNumbers {
- bool operator()(const MachineBasicBlock *A,
- const MachineBasicBlock *B) const {
- return A->getNumber() > B->getNumber();
- }
-};
-/// Sort blocks by their number in the opposite order..
-struct CompareBlockNumbersBackwards {
- bool operator()(const MachineBasicBlock *A,
- const MachineBasicBlock *B) const {
- return A->getNumber() < B->getNumber();
- }
-};
-/// Bookkeeping for a loop to help ensure that we don't mix blocks not dominated
-/// by the loop header among the loop's blocks.
-struct Entry {
- const MachineLoop *Loop;
- unsigned NumBlocksLeft;
-
- /// List of blocks not dominated by Loop's header that are deferred until
- /// after all of Loop's blocks have been seen.
- std::vector<MachineBasicBlock *> Deferred;
-
- explicit Entry(const MachineLoop *L)
- : Loop(L), NumBlocksLeft(L->getNumBlocks()) {}
-};
-}
-
-/// Sort the blocks, taking special care to make sure that loops are not
-/// interrupted by blocks not dominated by their header.
-/// TODO: There are many opportunities for improving the heuristics here.
-/// Explore them.
-static void SortBlocks(MachineFunction &MF, const MachineLoopInfo &MLI,
- const MachineDominatorTree &MDT) {
- // Prepare for a topological sort: Record the number of predecessors each
- // block has, ignoring loop backedges.
- MF.RenumberBlocks();
- SmallVector<unsigned, 16> NumPredsLeft(MF.getNumBlockIDs(), 0);
- for (MachineBasicBlock &MBB : MF) {
- unsigned N = MBB.pred_size();
- if (MachineLoop *L = MLI.getLoopFor(&MBB))
- if (L->getHeader() == &MBB)
- for (const MachineBasicBlock *Pred : MBB.predecessors())
- if (L->contains(Pred))
- --N;
- NumPredsLeft[MBB.getNumber()] = N;
- }
-
- // Topological sort the CFG, with additional constraints:
- // - Between a loop header and the last block in the loop, there can be
- // no blocks not dominated by the loop header.
- // - It's desirable to preserve the original block order when possible.
- // We use two ready lists; Preferred and Ready. Preferred has recently
- // processed sucessors, to help preserve block sequences from the original
- // order. Ready has the remaining ready blocks.
- PriorityQueue<MachineBasicBlock *, std::vector<MachineBasicBlock *>,
- CompareBlockNumbers>
- Preferred;
- PriorityQueue<MachineBasicBlock *, std::vector<MachineBasicBlock *>,
- CompareBlockNumbersBackwards>
- Ready;
- SmallVector<Entry, 4> Loops;
- for (MachineBasicBlock *MBB = &MF.front();;) {
- const MachineLoop *L = MLI.getLoopFor(MBB);
- if (L) {
- // If MBB is a loop header, add it to the active loop list. We can't put
- // any blocks that it doesn't dominate until we see the end of the loop.
- if (L->getHeader() == MBB)
- Loops.push_back(Entry(L));
- // For each active loop the block is in, decrement the count. If MBB is
- // the last block in an active loop, take it off the list and pick up any
- // blocks deferred because the header didn't dominate them.
- for (Entry &E : Loops)
- if (E.Loop->contains(MBB) && --E.NumBlocksLeft == 0)
- for (auto DeferredBlock : E.Deferred)
- Ready.push(DeferredBlock);
- while (!Loops.empty() && Loops.back().NumBlocksLeft == 0)
- Loops.pop_back();
- }
- // The main topological sort logic.
- for (MachineBasicBlock *Succ : MBB->successors()) {
- // Ignore backedges.
- if (MachineLoop *SuccL = MLI.getLoopFor(Succ))
- if (SuccL->getHeader() == Succ && SuccL->contains(MBB))
- continue;
- // Decrement the predecessor count. If it's now zero, it's ready.
- if (--NumPredsLeft[Succ->getNumber()] == 0)
- Preferred.push(Succ);
- }
- // Determine the block to follow MBB. First try to find a preferred block,
- // to preserve the original block order when possible.
- MachineBasicBlock *Next = nullptr;
- while (!Preferred.empty()) {
- Next = Preferred.top();
- Preferred.pop();
- // If Next isn't dominated by the top active loop header, defer it until that
- // loop is done.
- if (!Loops.empty() &&
- !MDT.dominates(Loops.back().Loop->getHeader(), Next)) {
- Loops.back().Deferred.push_back(Next);
- Next = nullptr;
- continue;
- }
- // If Next was originally ordered before MBB, and that isn't simply because
- // it was loop-rotated above the header, it's not preferred.
- if (Next->getNumber() < MBB->getNumber() &&
- (!L || !L->contains(Next) ||
- L->getHeader()->getNumber() < Next->getNumber())) {
- Ready.push(Next);
- Next = nullptr;
- continue;
- }
- break;
- }
- // If we didn't find a suitable block in the Preferred list, check the
- // general Ready list.
- if (!Next) {
- // If there are no more blocks to process, we're done.
- if (Ready.empty()) {
- MaybeUpdateTerminator(MBB);
- break;
- }
- for (;;) {
- Next = Ready.top();
- Ready.pop();
- // If Next isn't dominated by the top active loop header, defer it until
- // that loop is done.
- if (!Loops.empty() &&
- !MDT.dominates(Loops.back().Loop->getHeader(), Next)) {
- Loops.back().Deferred.push_back(Next);
- continue;
- }
- break;
- }
- }
- // Move the next block into place and iterate.
- Next->moveAfter(MBB);
- MaybeUpdateTerminator(MBB);
- MBB = Next;
- }
- assert(Loops.empty() && "Active loop list not finished");
- MF.RenumberBlocks();
-
-#ifndef NDEBUG
- SmallSetVector<MachineLoop *, 8> OnStack;
-
- // Insert a sentinel representing the degenerate loop that starts at the
- // function entry block and includes the entire function as a "loop" that
- // executes once.
- OnStack.insert(nullptr);
-
- for (auto &MBB : MF) {
- assert(MBB.getNumber() >= 0 && "Renumbered blocks should be non-negative.");
-
- MachineLoop *Loop = MLI.getLoopFor(&MBB);
- if (Loop && &MBB == Loop->getHeader()) {
- // Loop header. The loop predecessor should be sorted above, and the other
- // predecessors should be backedges below.
- for (auto Pred : MBB.predecessors())
- assert(
- (Pred->getNumber() < MBB.getNumber() || Loop->contains(Pred)) &&
- "Loop header predecessors must be loop predecessors or backedges");
- assert(OnStack.insert(Loop) && "Loops should be declared at most once.");
- } else {
- // Not a loop header. All predecessors should be sorted above.
- for (auto Pred : MBB.predecessors())
- assert(Pred->getNumber() < MBB.getNumber() &&
- "Non-loop-header predecessors should be topologically sorted");
- assert(OnStack.count(MLI.getLoopFor(&MBB)) &&
- "Blocks must be nested in their loops");
- }
- while (OnStack.size() > 1 && &MBB == LoopBottom(OnStack.back()))
- OnStack.pop_back();
- }
- assert(OnStack.pop_back_val() == nullptr &&
- "The function entry block shouldn't actually be a loop header");
- assert(OnStack.empty() &&
- "Control flow stack pushes and pops should be balanced.");
-#endif
-}
-
/// Test whether Pred has any terminators explicitly branching to MBB, as
/// opposed to falling through. Note that it's possible (e.g. in unoptimized
/// code) for a branch instruction to both branch to a block and fall through
@@ -488,6 +270,15 @@ static void FixEndsAtEndOfFunction(
}
}
+// WebAssembly functions end with an end instruction, as if the function body
+// were a block.
+static void AppendEndToFunction(
+ MachineFunction &MF,
+ const WebAssemblyInstrInfo &TII) {
+ BuildMI(MF.back(), MF.back().end(), DebugLoc(),
+ TII.get(WebAssembly::END_FUNCTION));
+}
+
/// Insert LOOP and BLOCK markers at appropriate places.
static void PlaceMarkers(MachineFunction &MF, const MachineLoopInfo &MLI,
const WebAssemblyInstrInfo &TII,
@@ -555,6 +346,11 @@ static void PlaceMarkers(MachineFunction &MF, const MachineLoopInfo &MLI,
// Fix up block/loop signatures at the end of the function to conform to
// WebAssembly's rules.
FixEndsAtEndOfFunction(MF, MFI, BlockTops, LoopTops);
+
+ // Add an end instruction at the end of the function body.
+ if (!MF.getSubtarget<WebAssemblySubtarget>()
+ .getTargetTriple().isOSBinFormatELF())
+ AppendEndToFunction(MF, TII);
}
bool WebAssemblyCFGStackify::runOnMachineFunction(MachineFunction &MF) {
@@ -569,9 +365,6 @@ bool WebAssemblyCFGStackify::runOnMachineFunction(MachineFunction &MF) {
WebAssemblyFunctionInfo &MFI = *MF.getInfo<WebAssemblyFunctionInfo>();
MF.getRegInfo().invalidateLiveness();
- // Sort the blocks, with contiguous loops.
- SortBlocks(MF, MLI, MDT);
-
// Place the BLOCK and LOOP markers to indicate the beginnings of scopes.
PlaceMarkers(MF, MLI, TII, MDT, MFI);
diff --git a/lib/Target/WebAssembly/WebAssemblyCallIndirectFixup.cpp b/lib/Target/WebAssembly/WebAssemblyCallIndirectFixup.cpp
index fc0a01ca30e5..bc6360aafd61 100644
--- a/lib/Target/WebAssembly/WebAssemblyCallIndirectFixup.cpp
+++ b/lib/Target/WebAssembly/WebAssemblyCallIndirectFixup.cpp
@@ -97,15 +97,28 @@ bool WebAssemblyCallIndirectFixup::runOnMachineFunction(MachineFunction &MF) {
MI.setDesc(Desc);
// Rewrite argument order
- auto Uses = MI.explicit_uses();
- MachineInstr::mop_iterator it = Uses.begin();
- const MachineOperand MO = *it;
+ SmallVector<MachineOperand, 8> Ops;
+
+ // Set up a placeholder for the type signature immediate.
+ Ops.push_back(MachineOperand::CreateImm(0));
// Set up the flags immediate, which currently has no defined flags
// so it's always zero.
- it->ChangeToImmediate(0);
-
- MI.addOperand(MF, MO);
+ Ops.push_back(MachineOperand::CreateImm(0));
+
+ for (const MachineOperand &MO :
+ make_range(MI.operands_begin() +
+ MI.getDesc().getNumDefs() + 1,
+ MI.operands_begin() +
+ MI.getNumExplicitOperands()))
+ Ops.push_back(MO);
+ Ops.push_back(MI.getOperand(MI.getDesc().getNumDefs()));
+
+ // Replace the instruction's operands.
+ while (MI.getNumOperands() > MI.getDesc().getNumDefs())
+ MI.RemoveOperand(MI.getNumOperands() - 1);
+ for (const MachineOperand &MO : Ops)
+ MI.addOperand(MO);
DEBUG(dbgs() << " After transform: " << MI);
Changed = true;
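
The hunk above rewrites a call_indirect's operand list from [callee, args...]
to [typeindex, flags, args..., callee]. A minimal standalone sketch of that
reordering, using a plain std::vector and hypothetical names rather than the
MachineOperand API:

    #include <cassert>
    #include <vector>

    // Sketch: given explicit uses [callee, arg0, arg1, ...], produce the wasm
    // operand order [typeindex, flags, arg0, arg1, ..., callee].
    static std::vector<int> reorderCallIndirect(const std::vector<int> &Uses) {
      assert(!Uses.empty() && "expected at least the callee operand");
      std::vector<int> Ops;
      Ops.push_back(0); // placeholder for the type-signature immediate
      Ops.push_back(0); // flags immediate; no flags are defined yet, so zero
      Ops.insert(Ops.end(), Uses.begin() + 1, Uses.end()); // call arguments
      Ops.push_back(Uses.front()); // the callee moves to the end
      return Ops;
    }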
diff --git a/lib/Target/WebAssembly/WebAssemblyExplicitLocals.cpp b/lib/Target/WebAssembly/WebAssemblyExplicitLocals.cpp
index 04ede7ff110c..41249117ae0e 100644
--- a/lib/Target/WebAssembly/WebAssemblyExplicitLocals.cpp
+++ b/lib/Target/WebAssembly/WebAssemblyExplicitLocals.cpp
@@ -31,6 +31,14 @@ using namespace llvm;
#define DEBUG_TYPE "wasm-explicit-locals"
+// A command-line option to disable this pass. Note that this produces output
+// which is not valid WebAssembly, though it may be more convenient for
+// writing LLVM unit tests.
+static cl::opt<bool> DisableWebAssemblyExplicitLocals(
+ "disable-wasm-explicit-locals", cl::ReallyHidden,
+ cl::desc("WebAssembly: Disable emission of get_local/set_local."),
+ cl::init(false));
+
namespace {
class WebAssemblyExplicitLocals final : public MachineFunctionPass {
StringRef getPassName() const override {
@@ -60,7 +68,25 @@ FunctionPass *llvm::createWebAssemblyExplicitLocals() {
/// if it doesn't yet have one.
static unsigned getLocalId(DenseMap<unsigned, unsigned> &Reg2Local,
unsigned &CurLocal, unsigned Reg) {
- return Reg2Local.insert(std::make_pair(Reg, CurLocal++)).first->second;
+ auto P = Reg2Local.insert(std::make_pair(Reg, CurLocal));
+ if (P.second)
+ ++CurLocal;
+ return P.first->second;
+}
+
+/// Get the appropriate drop opcode for the given register class.
+static unsigned getDropOpcode(const TargetRegisterClass *RC) {
+ if (RC == &WebAssembly::I32RegClass)
+ return WebAssembly::DROP_I32;
+ if (RC == &WebAssembly::I64RegClass)
+ return WebAssembly::DROP_I64;
+ if (RC == &WebAssembly::F32RegClass)
+ return WebAssembly::DROP_F32;
+ if (RC == &WebAssembly::F64RegClass)
+ return WebAssembly::DROP_F64;
+ if (RC == &WebAssembly::V128RegClass)
+ return WebAssembly::DROP_V128;
+ llvm_unreachable("Unexpected register class");
}
/// Get the appropriate get_local opcode for the given register class.
@@ -146,6 +172,10 @@ bool WebAssemblyExplicitLocals::runOnMachineFunction(MachineFunction &MF) {
"********** Function: "
<< MF.getName() << '\n');
+ // Disable this pass if directed to do so.
+ if (DisableWebAssemblyExplicitLocals)
+ return false;
+
// Disable this pass if we aren't doing direct wasm object emission.
if (MF.getSubtarget<WebAssemblySubtarget>()
.getTargetTriple().isOSBinFormatELF())
@@ -176,6 +206,12 @@ bool WebAssemblyExplicitLocals::runOnMachineFunction(MachineFunction &MF) {
// Start assigning local numbers after the last parameter.
unsigned CurLocal = MFI.getParams().size();
+ // Precompute the set of registers that are unused, so that we can insert
+ // drops to their defs.
+ BitVector UseEmpty(MRI.getNumVirtRegs());
+ for (unsigned i = 0, e = MRI.getNumVirtRegs(); i < e; ++i)
+ UseEmpty[i] = MRI.use_empty(TargetRegisterInfo::index2VirtReg(i));
+
// Visit each instruction in the function.
for (MachineBasicBlock &MBB : MF) {
for (MachineBasicBlock::iterator I = MBB.begin(), E = MBB.end(); I != E;) {
@@ -224,15 +260,26 @@ bool WebAssemblyExplicitLocals::runOnMachineFunction(MachineFunction &MF) {
assert(MI.getDesc().getNumDefs() <= 1);
if (MI.getDesc().getNumDefs() == 1) {
unsigned OldReg = MI.getOperand(0).getReg();
- if (!MFI.isVRegStackified(OldReg) && !MRI.use_empty(OldReg)) {
- unsigned LocalId = getLocalId(Reg2Local, CurLocal, OldReg);
+ if (!MFI.isVRegStackified(OldReg)) {
const TargetRegisterClass *RC = MRI.getRegClass(OldReg);
unsigned NewReg = MRI.createVirtualRegister(RC);
auto InsertPt = std::next(MachineBasicBlock::iterator(&MI));
- unsigned Opc = getSetLocalOpcode(RC);
- BuildMI(MBB, InsertPt, MI.getDebugLoc(), TII->get(Opc))
- .addImm(LocalId)
- .addReg(NewReg);
+ if (MI.getOpcode() == WebAssembly::IMPLICIT_DEF) {
+ MI.eraseFromParent();
+ Changed = true;
+ continue;
+ }
+ if (UseEmpty[TargetRegisterInfo::virtReg2Index(OldReg)]) {
+ unsigned Opc = getDropOpcode(RC);
+ BuildMI(MBB, InsertPt, MI.getDebugLoc(), TII->get(Opc))
+ .addReg(NewReg);
+ } else {
+ unsigned LocalId = getLocalId(Reg2Local, CurLocal, OldReg);
+ unsigned Opc = getSetLocalOpcode(RC);
+ BuildMI(MBB, InsertPt, MI.getDebugLoc(), TII->get(Opc))
+ .addImm(LocalId)
+ .addReg(NewReg);
+ }
MI.getOperand(0).setReg(NewReg);
MFI.stackifyVReg(NewReg);
Changed = true;
@@ -278,13 +325,16 @@ bool WebAssemblyExplicitLocals::runOnMachineFunction(MachineFunction &MF) {
}
// Define the locals.
+ // TODO: Sort the locals for better compression.
+ MFI.setNumLocals(CurLocal - MFI.getParams().size());
for (size_t i = 0, e = MRI.getNumVirtRegs(); i < e; ++i) {
unsigned Reg = TargetRegisterInfo::index2VirtReg(i);
auto I = Reg2Local.find(Reg);
if (I == Reg2Local.end() || I->second < MFI.getParams().size())
continue;
- MFI.addLocal(typeForRegClass(MRI.getRegClass(Reg)));
+ MFI.setLocal(I->second - MFI.getParams().size(),
+ typeForRegClass(MRI.getRegClass(Reg)));
Changed = true;
}
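
The getLocalId change above makes local allocation idempotent: the counter
advances only when the map insertion actually takes place, so querying the
same register twice no longer burns an extra local index. A standalone sketch
of the insert-or-get pattern, with std::map standing in for DenseMap and a
hypothetical name:

    #include <map>

    // Sketch: map a virtual register to a stable local index, allocating a
    // fresh index only the first time the register is seen.
    static unsigned getOrAssignLocal(std::map<unsigned, unsigned> &Reg2Local,
                                     unsigned &CurLocal, unsigned Reg) {
      auto P = Reg2Local.insert({Reg, CurLocal});
      if (P.second) // newly inserted, so the candidate index is now taken
        ++CurLocal;
      return P.first->second;
    }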
diff --git a/lib/Target/WebAssembly/WebAssemblyFastISel.cpp b/lib/Target/WebAssembly/WebAssemblyFastISel.cpp
index bc7020fded8c..53698ff09b10 100644
--- a/lib/Target/WebAssembly/WebAssemblyFastISel.cpp
+++ b/lib/Target/WebAssembly/WebAssemblyFastISel.cpp
@@ -116,6 +116,8 @@ private:
case MVT::f32:
case MVT::f64:
return VT;
+ case MVT::f16:
+ return MVT::f32;
case MVT::v16i8:
case MVT::v8i16:
case MVT::v4i32:
@@ -594,12 +596,12 @@ bool WebAssemblyFastISel::fastLowerArguments() {
unsigned i = 0;
for (auto const &Arg : F->args()) {
- const AttributeSet &Attrs = F->getAttributes();
- if (Attrs.hasAttribute(i+1, Attribute::ByVal) ||
- Attrs.hasAttribute(i+1, Attribute::SwiftSelf) ||
- Attrs.hasAttribute(i+1, Attribute::SwiftError) ||
- Attrs.hasAttribute(i+1, Attribute::InAlloca) ||
- Attrs.hasAttribute(i+1, Attribute::Nest))
+ const AttributeList &Attrs = F->getAttributes();
+ if (Attrs.hasParamAttribute(i, Attribute::ByVal) ||
+ Attrs.hasParamAttribute(i, Attribute::SwiftSelf) ||
+ Attrs.hasParamAttribute(i, Attribute::SwiftError) ||
+ Attrs.hasParamAttribute(i, Attribute::InAlloca) ||
+ Attrs.hasParamAttribute(i, Attribute::Nest))
return false;
Type *ArgTy = Arg.getType();
@@ -744,19 +746,19 @@ bool WebAssemblyFastISel::selectCall(const Instruction *I) {
if (ArgTy == MVT::INVALID_SIMPLE_VALUE_TYPE)
return false;
- const AttributeSet &Attrs = Call->getAttributes();
- if (Attrs.hasAttribute(i+1, Attribute::ByVal) ||
- Attrs.hasAttribute(i+1, Attribute::SwiftSelf) ||
- Attrs.hasAttribute(i+1, Attribute::SwiftError) ||
- Attrs.hasAttribute(i+1, Attribute::InAlloca) ||
- Attrs.hasAttribute(i+1, Attribute::Nest))
+ const AttributeList &Attrs = Call->getAttributes();
+ if (Attrs.hasParamAttribute(i, Attribute::ByVal) ||
+ Attrs.hasParamAttribute(i, Attribute::SwiftSelf) ||
+ Attrs.hasParamAttribute(i, Attribute::SwiftError) ||
+ Attrs.hasParamAttribute(i, Attribute::InAlloca) ||
+ Attrs.hasParamAttribute(i, Attribute::Nest))
return false;
unsigned Reg;
- if (Attrs.hasAttribute(i+1, Attribute::SExt))
+ if (Attrs.hasParamAttribute(i, Attribute::SExt))
Reg = getRegForSignedValue(V);
- else if (Attrs.hasAttribute(i+1, Attribute::ZExt))
+ else if (Attrs.hasParamAttribute(i, Attribute::ZExt))
Reg = getRegForUnsignedValue(V);
else
Reg = getRegForValue(V);
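
Both FastISel hunks apply the same mechanical rule from the AttributeSet to
AttributeList migration: the old interface addressed parameter i at attribute
index i + 1, with index 0 reserved for the return value, while
hasParamAttribute takes the zero-based parameter number directly. A sketch of
the retired convention (hypothetical helper, not an LLVM API):

    // Sketch: the old 1-based attribute slot for a zero-based parameter
    // number. Slot 0 addressed the return value, so parameter i was slot i+1.
    static unsigned oldAttrSlotForParam(unsigned ParamNo) {
      return ParamNo + 1;
    }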
diff --git a/lib/Target/WebAssembly/WebAssemblyFixFunctionBitcasts.cpp b/lib/Target/WebAssembly/WebAssemblyFixFunctionBitcasts.cpp
index adf904ee0269..76a2ff3f9803 100644
--- a/lib/Target/WebAssembly/WebAssemblyFixFunctionBitcasts.cpp
+++ b/lib/Target/WebAssembly/WebAssemblyFixFunctionBitcasts.cpp
@@ -84,7 +84,7 @@ static void FindUses(Value *V, Function &F,
// - Call with fewer arguments than needed: arguments are filled in with undef
// - Return value is not needed: drop it
// - Return value needed but not present: supply an undef
-//
+//
// For now, return nullptr without creating a wrapper if the wrapper cannot
// be generated due to incompatible types.
static Function *CreateWrapper(Function *F, FunctionType *Ty) {
@@ -148,6 +148,11 @@ bool FixFunctionBitcasts::runOnModule(Module &M) {
if (!Ty)
continue;
+ // Wasm varargs are not ABI-compatible with non-varargs. Just ignore
+ // such casts for now.
+ if (Ty->isVarArg() || F->isVarArg())
+ continue;
+
auto Pair = Wrappers.insert(std::make_pair(std::make_pair(F, Ty), nullptr));
if (Pair.second)
Pair.first->second = CreateWrapper(F, Ty);
diff --git a/lib/Target/WebAssembly/WebAssemblyFrameLowering.cpp b/lib/Target/WebAssembly/WebAssemblyFrameLowering.cpp
index a6a2c0bf06ae..4209bc333f23 100644
--- a/lib/Target/WebAssembly/WebAssemblyFrameLowering.cpp
+++ b/lib/Target/WebAssembly/WebAssemblyFrameLowering.cpp
@@ -24,10 +24,11 @@
#include "WebAssemblyMachineFunctionInfo.h"
#include "WebAssemblySubtarget.h"
#include "WebAssemblyTargetMachine.h"
+#include "WebAssemblyUtilities.h"
#include "llvm/CodeGen/MachineFrameInfo.h"
#include "llvm/CodeGen/MachineFunction.h"
#include "llvm/CodeGen/MachineInstrBuilder.h"
-#include "llvm/CodeGen/MachineModuleInfo.h"
+#include "llvm/CodeGen/MachineModuleInfoImpls.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
#include "llvm/Support/Debug.h"
using namespace llvm;
@@ -101,25 +102,35 @@ static void writeSPToMemory(unsigned SrcReg, MachineFunction &MF,
MachineBasicBlock::iterator &InsertAddr,
MachineBasicBlock::iterator &InsertStore,
const DebugLoc &DL) {
- const char *ES = "__stack_pointer";
- auto *SPSymbol = MF.createExternalSymbolName(ES);
- MachineRegisterInfo &MRI = MF.getRegInfo();
- const TargetRegisterClass *PtrRC =
- MRI.getTargetRegisterInfo()->getPointerRegClass(MF);
- unsigned Zero = MRI.createVirtualRegister(PtrRC);
const auto *TII = MF.getSubtarget<WebAssemblySubtarget>().getInstrInfo();
- BuildMI(MBB, InsertAddr, DL, TII->get(WebAssembly::CONST_I32), Zero)
- .addImm(0);
- MachineMemOperand *MMO = MF.getMachineMemOperand(
- MachinePointerInfo(MF.getPSVManager().getExternalSymbolCallEntry(ES)),
- MachineMemOperand::MOStore, 4, 4);
- BuildMI(MBB, InsertStore, DL, TII->get(WebAssembly::STORE_I32))
- .addImm(2) // p2align
- .addExternalSymbol(SPSymbol)
- .addReg(Zero)
- .addReg(SrcReg)
- .addMemOperand(MMO);
+ if (MF.getSubtarget<WebAssemblySubtarget>()
+ .getTargetTriple().isOSBinFormatELF()) {
+ const char *ES = "__stack_pointer";
+ auto *SPSymbol = MF.createExternalSymbolName(ES);
+ MachineRegisterInfo &MRI = MF.getRegInfo();
+ const TargetRegisterClass *PtrRC =
+ MRI.getTargetRegisterInfo()->getPointerRegClass(MF);
+ unsigned Zero = MRI.createVirtualRegister(PtrRC);
+
+ BuildMI(MBB, InsertAddr, DL, TII->get(WebAssembly::CONST_I32), Zero)
+ .addImm(0);
+ MachineMemOperand *MMO = MF.getMachineMemOperand(
+ MachinePointerInfo(MF.getPSVManager().getExternalSymbolCallEntry(ES)),
+ MachineMemOperand::MOStore, 4, 4);
+ BuildMI(MBB, InsertStore, DL, TII->get(WebAssembly::STORE_I32))
+ .addImm(2) // p2align
+ .addExternalSymbol(SPSymbol)
+ .addReg(Zero)
+ .addReg(SrcReg)
+ .addMemOperand(MMO);
+ } else {
+ MachineModuleInfoWasm &MMIW =
+ MF.getMMI().getObjFileInfo<MachineModuleInfoWasm>();
+ BuildMI(MBB, InsertStore, DL, TII->get(WebAssembly::SET_GLOBAL_I32))
+ .addImm(MMIW.getStackPointerGlobal())
+ .addReg(SrcReg);
+ }
}
MachineBasicBlock::iterator
@@ -151,27 +162,50 @@ void WebAssemblyFrameLowering::emitPrologue(MachineFunction &MF,
auto &MRI = MF.getRegInfo();
auto InsertPt = MBB.begin();
+ while (InsertPt != MBB.end() && WebAssembly::isArgument(*InsertPt))
+ ++InsertPt;
DebugLoc DL;
const TargetRegisterClass *PtrRC =
MRI.getTargetRegisterInfo()->getPointerRegClass(MF);
- unsigned Zero = MRI.createVirtualRegister(PtrRC);
unsigned SPReg = WebAssembly::SP32;
if (StackSize)
SPReg = MRI.createVirtualRegister(PtrRC);
- const char *ES = "__stack_pointer";
- auto *SPSymbol = MF.createExternalSymbolName(ES);
- BuildMI(MBB, InsertPt, DL, TII->get(WebAssembly::CONST_I32), Zero)
- .addImm(0);
- MachineMemOperand *LoadMMO = MF.getMachineMemOperand(
- MachinePointerInfo(MF.getPSVManager().getExternalSymbolCallEntry(ES)),
- MachineMemOperand::MOLoad, 4, 4);
- // Load the SP value.
- BuildMI(MBB, InsertPt, DL, TII->get(WebAssembly::LOAD_I32), SPReg)
- .addImm(2) // p2align
- .addExternalSymbol(SPSymbol)
- .addReg(Zero) // addr
- .addMemOperand(LoadMMO);
+ if (MF.getSubtarget<WebAssemblySubtarget>()
+ .getTargetTriple().isOSBinFormatELF()) {
+ const char *ES = "__stack_pointer";
+ auto *SPSymbol = MF.createExternalSymbolName(ES);
+ unsigned Zero = MRI.createVirtualRegister(PtrRC);
+
+ BuildMI(MBB, InsertPt, DL, TII->get(WebAssembly::CONST_I32), Zero)
+ .addImm(0);
+ MachineMemOperand *LoadMMO = MF.getMachineMemOperand(
+ MachinePointerInfo(MF.getPSVManager().getExternalSymbolCallEntry(ES)),
+ MachineMemOperand::MOLoad, 4, 4);
+ // Load the SP value.
+ BuildMI(MBB, InsertPt, DL, TII->get(WebAssembly::LOAD_I32), SPReg)
+ .addImm(2) // p2align
+ .addExternalSymbol(SPSymbol)
+ .addReg(Zero) // addr
+ .addMemOperand(LoadMMO);
+ } else {
+ auto &MMIW = MF.getMMI().getObjFileInfo<MachineModuleInfoWasm>();
+ if (!MMIW.hasStackPointerGlobal()) {
+ MMIW.setStackPointerGlobal(MMIW.getGlobals().size());
+
+ // Create the stack-pointer global. For now, just use the
+ // Emscripten/Binaryen ABI names.
+ wasm::Global G;
+ G.Type = wasm::ValType::I32;
+ G.Mutable = true;
+ G.InitialValue = 0;
+ G.InitialModule = "env";
+ G.InitialName = "STACKTOP";
+ MMIW.addGlobal(G);
+ }
+ BuildMI(MBB, InsertPt, DL, TII->get(WebAssembly::GET_GLOBAL_I32), SPReg)
+ .addImm(MMIW.getStackPointerGlobal());
+ }
bool HasBP = hasBP(MF);
if (HasBP) {
diff --git a/lib/Target/WebAssembly/WebAssemblyISelLowering.cpp b/lib/Target/WebAssembly/WebAssemblyISelLowering.cpp
index 6a7f75a6b3a1..31a5ca1f4cc2 100644
--- a/lib/Target/WebAssembly/WebAssemblyISelLowering.cpp
+++ b/lib/Target/WebAssembly/WebAssemblyISelLowering.cpp
@@ -95,6 +95,11 @@ WebAssemblyTargetLowering::WebAssemblyTargetLowering(
// Support minnan and maxnan, which otherwise default to expand.
setOperationAction(ISD::FMINNAN, T, Legal);
setOperationAction(ISD::FMAXNAN, T, Legal);
+ // WebAssembly currently has no builtin f16 support.
+ setOperationAction(ISD::FP16_TO_FP, T, Expand);
+ setOperationAction(ISD::FP_TO_FP16, T, Expand);
+ setLoadExtAction(ISD::EXTLOAD, T, MVT::f16, Expand);
+ setTruncStoreAction(T, MVT::f16, Expand);
}
for (auto T : {MVT::i32, MVT::i64}) {
@@ -253,7 +258,8 @@ bool WebAssemblyTargetLowering::allowsMisalignedMemoryAccesses(
return true;
}
-bool WebAssemblyTargetLowering::isIntDivCheap(EVT VT, AttributeSet Attr) const {
+bool WebAssemblyTargetLowering::isIntDivCheap(EVT VT,
+ AttributeList Attr) const {
// The current thinking is that wasm engines will perform this optimization,
// so we can save on code size.
return true;
diff --git a/lib/Target/WebAssembly/WebAssemblyISelLowering.h b/lib/Target/WebAssembly/WebAssemblyISelLowering.h
index 5bc723028e63..99d3d0d558f5 100644
--- a/lib/Target/WebAssembly/WebAssemblyISelLowering.h
+++ b/lib/Target/WebAssembly/WebAssemblyISelLowering.h
@@ -58,7 +58,7 @@ class WebAssemblyTargetLowering final : public TargetLowering {
unsigned AS) const override;
bool allowsMisalignedMemoryAccesses(EVT, unsigned AddrSpace, unsigned Align,
bool *Fast) const override;
- bool isIntDivCheap(EVT VT, AttributeSet Attr) const override;
+ bool isIntDivCheap(EVT VT, AttributeList Attr) const override;
SDValue LowerCall(CallLoweringInfo &CLI,
SmallVectorImpl<SDValue> &InVals) const override;
diff --git a/lib/Target/WebAssembly/WebAssemblyInstrCall.td b/lib/Target/WebAssembly/WebAssemblyInstrCall.td
index 047f4be066c0..73d1d4be293b 100644
--- a/lib/Target/WebAssembly/WebAssemblyInstrCall.td
+++ b/lib/Target/WebAssembly/WebAssemblyInstrCall.td
@@ -30,13 +30,15 @@ multiclass CALL<WebAssemblyRegClass vt, string prefix> {
[(set vt:$dst, (WebAssemblycall1 (i32 imm:$callee)))],
!strconcat(prefix, "call\t$dst, $callee"),
0x10>;
+
let isCodeGenOnly = 1 in {
def PCALL_INDIRECT_#vt : I<(outs vt:$dst), (ins I32:$callee, variable_ops),
[(set vt:$dst, (WebAssemblycall1 I32:$callee))],
"PSEUDO CALL INDIRECT\t$callee">;
} // isCodeGenOnly = 1
- def CALL_INDIRECT_#vt : I<(outs vt:$dst), (ins i32imm:$flags, variable_ops),
+ def CALL_INDIRECT_#vt : I<(outs vt:$dst),
+ (ins TypeIndex:$type, i32imm:$flags, variable_ops),
[],
!strconcat(prefix, "call_indirect\t$dst"),
0x11>;
@@ -48,6 +50,7 @@ multiclass SIMD_CALL<ValueType vt, string prefix> {
(WebAssemblycall1 (i32 imm:$callee)))],
!strconcat(prefix, "call\t$dst, $callee"),
0x10>;
+
let isCodeGenOnly = 1 in {
def PCALL_INDIRECT_#vt : SIMD_I<(outs V128:$dst),
(ins I32:$callee, variable_ops),
@@ -57,7 +60,8 @@ multiclass SIMD_CALL<ValueType vt, string prefix> {
} // isCodeGenOnly = 1
def CALL_INDIRECT_#vt : SIMD_I<(outs V128:$dst),
- (ins i32imm:$flags, variable_ops),
+ (ins TypeIndex:$type, i32imm:$flags,
+ variable_ops),
[],
!strconcat(prefix, "call_indirect\t$dst"),
0x11>;
@@ -76,13 +80,15 @@ let Uses = [SP32, SP64], isCall = 1 in {
def CALL_VOID : I<(outs), (ins function32_op:$callee, variable_ops),
[(WebAssemblycall0 (i32 imm:$callee))],
"call \t$callee", 0x10>;
+
let isCodeGenOnly = 1 in {
def PCALL_INDIRECT_VOID : I<(outs), (ins I32:$callee, variable_ops),
[(WebAssemblycall0 I32:$callee)],
"PSEUDO CALL INDIRECT\t$callee">;
} // isCodeGenOnly = 1
- def CALL_INDIRECT_VOID : I<(outs), (ins i32imm:$flags, variable_ops),
+ def CALL_INDIRECT_VOID : I<(outs),
+ (ins TypeIndex:$type, i32imm:$flags, variable_ops),
[],
"call_indirect\t", 0x11>;
} // Uses = [SP32,SP64], isCall = 1
diff --git a/lib/Target/WebAssembly/WebAssemblyInstrControl.td b/lib/Target/WebAssembly/WebAssemblyInstrControl.td
index 1146431e6b77..39cb1ca336f2 100644
--- a/lib/Target/WebAssembly/WebAssemblyInstrControl.td
+++ b/lib/Target/WebAssembly/WebAssemblyInstrControl.td
@@ -64,9 +64,12 @@ let Uses = [VALUE_STACK], Defs = [VALUE_STACK] in {
def BLOCK : I<(outs), (ins Signature:$sig), [], "block \t$sig", 0x02>;
def LOOP : I<(outs), (ins Signature:$sig), [], "loop \t$sig", 0x03>;
-// END_BLOCK and END_LOOP are represented with the same opcode in wasm.
+// END_BLOCK, END_LOOP, and END_FUNCTION are represented with the same opcode
+// in wasm.
def END_BLOCK : I<(outs), (ins), [], "end_block", 0x0b>;
def END_LOOP : I<(outs), (ins), [], "end_loop", 0x0b>;
+let isTerminator = 1, isBarrier = 1 in
+def END_FUNCTION : I<(outs), (ins), [], "end_function", 0x0b>;
} // Uses = [VALUE_STACK], Defs = [VALUE_STACK]
multiclass RETURN<WebAssemblyRegClass vt> {
diff --git a/lib/Target/WebAssembly/WebAssemblyInstrFloat.td b/lib/Target/WebAssembly/WebAssemblyInstrFloat.td
index 030be0862a56..03c9c1f8d5c0 100644
--- a/lib/Target/WebAssembly/WebAssemblyInstrFloat.td
+++ b/lib/Target/WebAssembly/WebAssemblyInstrFloat.td
@@ -55,8 +55,8 @@ defm EQ : ComparisonFP<SETOEQ, "eq ", 0x5b, 0x61>;
defm NE : ComparisonFP<SETUNE, "ne ", 0x5c, 0x62>;
} // isCommutable = 1
defm LT : ComparisonFP<SETOLT, "lt ", 0x5d, 0x63>;
-defm LE : ComparisonFP<SETOLE, "le ", 0x5e, 0x64>;
-defm GT : ComparisonFP<SETOGT, "gt ", 0x5f, 0x65>;
+defm LE : ComparisonFP<SETOLE, "le ", 0x5f, 0x65>;
+defm GT : ComparisonFP<SETOGT, "gt ", 0x5e, 0x64>;
defm GE : ComparisonFP<SETOGE, "ge ", 0x60, 0x66>;
} // Defs = [ARGUMENTS]
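
The swap above is an encoding fix rather than a reordering: in the WebAssembly
binary format, gt precedes le, so the old definitions emitted le with gt's
opcode and vice versa. For reference, the spec opcode values as a C++ sketch:

    // WebAssembly MVP binary opcodes for ordered floating-point comparisons.
    enum WasmFPCompareOpcode : unsigned {
      F32Lt = 0x5d, F32Gt = 0x5e, F32Le = 0x5f, F32Ge = 0x60,
      F64Lt = 0x63, F64Gt = 0x64, F64Le = 0x65, F64Ge = 0x66,
    };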
diff --git a/lib/Target/WebAssembly/WebAssemblyInstrInfo.cpp b/lib/Target/WebAssembly/WebAssemblyInstrInfo.cpp
index 0e2d8bbaf64c..8846952e5af4 100644
--- a/lib/Target/WebAssembly/WebAssemblyInstrInfo.cpp
+++ b/lib/Target/WebAssembly/WebAssemblyInstrInfo.cpp
@@ -183,11 +183,9 @@ unsigned WebAssemblyInstrInfo::insertBranch(MachineBasicBlock &MBB,
assert(Cond.size() == 2 && "Expected a flag and a successor block");
if (Cond[0].getImm()) {
- BuildMI(&MBB, DL, get(WebAssembly::BR_IF)).addMBB(TBB).addOperand(Cond[1]);
+ BuildMI(&MBB, DL, get(WebAssembly::BR_IF)).addMBB(TBB).add(Cond[1]);
} else {
- BuildMI(&MBB, DL, get(WebAssembly::BR_UNLESS))
- .addMBB(TBB)
- .addOperand(Cond[1]);
+ BuildMI(&MBB, DL, get(WebAssembly::BR_UNLESS)).addMBB(TBB).add(Cond[1]);
}
if (!FBB)
return 1;
diff --git a/lib/Target/WebAssembly/WebAssemblyInstrInfo.td b/lib/Target/WebAssembly/WebAssemblyInstrInfo.td
index dcfd1a42c6aa..a601b575f579 100644
--- a/lib/Target/WebAssembly/WebAssemblyInstrInfo.td
+++ b/lib/Target/WebAssembly/WebAssemblyInstrInfo.td
@@ -74,6 +74,9 @@ def bb_op : Operand<OtherVT>;
let OperandType = "OPERAND_LOCAL" in
def local_op : Operand<i32>;
+let OperandType = "OPERAND_GLOBAL" in
+def global_op : Operand<i32>;
+
let OperandType = "OPERAND_I32IMM" in
def i32imm_op : Operand<i32>;
@@ -104,6 +107,9 @@ def Signature : Operand<i32> {
}
} // OperandType = "OPERAND_SIGNATURE"
+let OperandType = "OPERAND_TYPEINDEX" in
+def TypeIndex : Operand<i32>;
+
} // OperandNamespace = "WebAssembly"
//===----------------------------------------------------------------------===//
@@ -178,6 +184,18 @@ let hasSideEffects = 0 in {
def TEE_LOCAL_#vt : I<(outs vt:$res), (ins local_op:$local, vt:$src), [],
"tee_local\t$res, $local, $src", 0x22>;
+ // Unused values must be dropped in some contexts.
+ def DROP_#vt : I<(outs), (ins vt:$src), [],
+ "drop\t$src", 0x1a>;
+
+ let mayLoad = 1 in
+ def GET_GLOBAL_#vt : I<(outs vt:$res), (ins global_op:$local), [],
+ "get_global\t$res, $local", 0x23>;
+
+ let mayStore = 1 in
+ def SET_GLOBAL_#vt : I<(outs), (ins global_op:$local, vt:$src), [],
+ "set_global\t$local, $src", 0x24>;
+
} // hasSideEffects = 0
}
defm : LOCAL<I32>;
diff --git a/lib/Target/WebAssembly/WebAssemblyInstrMemory.td b/lib/Target/WebAssembly/WebAssemblyInstrMemory.td
index b606ebb0a68d..25d77bb1f234 100644
--- a/lib/Target/WebAssembly/WebAssemblyInstrMemory.td
+++ b/lib/Target/WebAssembly/WebAssemblyInstrMemory.td
@@ -673,9 +673,9 @@ def CURRENT_MEMORY_I32 : I<(outs I32:$dst), (ins i32imm:$flags),
Requires<[HasAddr32]>;
// Grow memory.
-def GROW_MEMORY_I32 : I<(outs), (ins i32imm:$flags, I32:$delta),
+def GROW_MEMORY_I32 : I<(outs I32:$dst), (ins i32imm:$flags, I32:$delta),
[],
- "grow_memory\t$delta", 0x40>,
+ "grow_memory\t$dst, $delta", 0x40>,
Requires<[HasAddr32]>;
} // Defs = [ARGUMENTS]
diff --git a/lib/Target/WebAssembly/WebAssemblyLowerBrUnless.cpp b/lib/Target/WebAssembly/WebAssemblyLowerBrUnless.cpp
index 7ea5d05a1b21..744a3ed427af 100644
--- a/lib/Target/WebAssembly/WebAssemblyLowerBrUnless.cpp
+++ b/lib/Target/WebAssembly/WebAssemblyLowerBrUnless.cpp
@@ -118,7 +118,7 @@ bool WebAssemblyLowerBrUnless::runOnMachineFunction(MachineFunction &MF) {
// delete the br_unless.
assert(Inverted);
BuildMI(MBB, MI, MI->getDebugLoc(), TII.get(WebAssembly::BR_IF))
- .addOperand(MI->getOperand(0))
+ .add(MI->getOperand(0))
.addReg(Cond);
MBB.erase(MI);
}
diff --git a/lib/Target/WebAssembly/WebAssemblyLowerEmscriptenEHSjLj.cpp b/lib/Target/WebAssembly/WebAssemblyLowerEmscriptenEHSjLj.cpp
index 72cb1ccbe668..947c0329bb6e 100644
--- a/lib/Target/WebAssembly/WebAssemblyLowerEmscriptenEHSjLj.cpp
+++ b/lib/Target/WebAssembly/WebAssemblyLowerEmscriptenEHSjLj.cpp
@@ -412,7 +412,7 @@ Value *WebAssemblyLowerEmscriptenEHSjLj::wrapInvoke(CallOrInvoke *CI) {
if (CI->doesNotReturn()) {
if (auto *F = dyn_cast<Function>(CI->getCalledValue()))
F->removeFnAttr(Attribute::NoReturn);
- CI->removeAttribute(AttributeSet::FunctionIndex, Attribute::NoReturn);
+ CI->removeAttribute(AttributeList::FunctionIndex, Attribute::NoReturn);
}
IRBuilder<> IRB(C);
@@ -435,25 +435,20 @@ Value *WebAssemblyLowerEmscriptenEHSjLj::wrapInvoke(CallOrInvoke *CI) {
// Because we added the pointer to the callee as first argument, all
// argument attribute indices have to be incremented by one.
- SmallVector<AttributeSet, 8> AttributesVec;
- const AttributeSet &InvokePAL = CI->getAttributes();
- CallSite::arg_iterator AI = CI->arg_begin();
- unsigned i = 1; // Argument attribute index starts from 1
- for (unsigned e = CI->getNumArgOperands(); i <= e; ++AI, ++i) {
- if (InvokePAL.hasAttributes(i)) {
- AttrBuilder B(InvokePAL, i);
- AttributesVec.push_back(AttributeSet::get(C, i + 1, B));
- }
- }
- // Add any return attributes.
- if (InvokePAL.hasAttributes(AttributeSet::ReturnIndex))
- AttributesVec.push_back(AttributeSet::get(C, InvokePAL.getRetAttributes()));
- // Add any function attributes.
- if (InvokePAL.hasAttributes(AttributeSet::FunctionIndex))
- AttributesVec.push_back(AttributeSet::get(C, InvokePAL.getFnAttributes()));
+ SmallVector<AttributeSet, 8> ArgAttributes;
+ const AttributeList &InvokeAL = CI->getAttributes();
+
+ // No attributes for the callee pointer.
+ ArgAttributes.push_back(AttributeSet());
+ // Copy the argument attributes from the original
+ for (unsigned i = 0, e = CI->getNumArgOperands(); i < e; ++i)
+ ArgAttributes.push_back(InvokeAL.getParamAttributes(i));
+
// Reconstruct the AttributesList based on the vector we constructed.
- AttributeSet NewCallPAL = AttributeSet::get(C, AttributesVec);
- NewCall->setAttributes(NewCallPAL);
+ AttributeList NewCallAL =
+ AttributeList::get(C, InvokeAL.getFnAttributes(),
+ InvokeAL.getRetAttributes(), ArgAttributes);
+ NewCall->setAttributes(NewCallAL);
CI->replaceAllUsesWith(NewCall);
@@ -624,7 +619,7 @@ void WebAssemblyLowerEmscriptenEHSjLj::createSetThrewFunction(Module &M) {
Function *F =
Function::Create(FTy, GlobalValue::ExternalLinkage, SetThrewFName, &M);
Argument *Arg1 = &*(F->arg_begin());
- Argument *Arg2 = &*(++F->arg_begin());
+ Argument *Arg2 = &*std::next(F->arg_begin());
Arg1->setName("threw");
Arg2->setName("value");
BasicBlock *EntryBB = BasicBlock::Create(C, "entry", F);
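
Note how the rewritten attribute plumbing above implements the "incremented by
one" rule from the comment: instead of re-indexing each attribute, it seeds
the per-argument list with a single empty entry for the new callee pointer. A
minimal sketch of that shift with plain strings and hypothetical names:

    #include <string>
    #include <vector>

    // Sketch: prepend one empty slot so the attribute entry for original
    // argument i lands at position i + 1, matching the inserted callee
    // pointer argument.
    static std::vector<std::string>
    shiftArgAttrs(const std::vector<std::string> &Orig) {
      std::vector<std::string> Shifted;
      Shifted.push_back(""); // no attributes for the new callee pointer
      Shifted.insert(Shifted.end(), Orig.begin(), Orig.end());
      return Shifted;
    }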
diff --git a/lib/Target/WebAssembly/WebAssemblyMCInstLower.cpp b/lib/Target/WebAssembly/WebAssemblyMCInstLower.cpp
index 022a448590ec..ff186eb91503 100644
--- a/lib/Target/WebAssembly/WebAssemblyMCInstLower.cpp
+++ b/lib/Target/WebAssembly/WebAssemblyMCInstLower.cpp
@@ -14,7 +14,10 @@
//===----------------------------------------------------------------------===//
#include "WebAssemblyMCInstLower.h"
+#include "WebAssemblyAsmPrinter.h"
#include "WebAssemblyMachineFunctionInfo.h"
+#include "WebAssemblyRuntimeLibcallSignatures.h"
+#include "WebAssemblyUtilities.h"
#include "llvm/CodeGen/AsmPrinter.h"
#include "llvm/CodeGen/MachineFunction.h"
#include "llvm/IR/Constants.h"
@@ -22,18 +25,85 @@
#include "llvm/MC/MCContext.h"
#include "llvm/MC/MCExpr.h"
#include "llvm/MC/MCInst.h"
+#include "llvm/MC/MCSymbolELF.h"
+#include "llvm/MC/MCSymbolWasm.h"
#include "llvm/Support/ErrorHandling.h"
#include "llvm/Support/raw_ostream.h"
using namespace llvm;
MCSymbol *
WebAssemblyMCInstLower::GetGlobalAddressSymbol(const MachineOperand &MO) const {
- return Printer.getSymbol(MO.getGlobal());
+ const GlobalValue *Global = MO.getGlobal();
+ MCSymbol *Sym = Printer.getSymbol(Global);
+ if (isa<MCSymbolELF>(Sym))
+ return Sym;
+
+ MCSymbolWasm *WasmSym = cast<MCSymbolWasm>(Sym);
+
+ if (const auto *FuncTy = dyn_cast<FunctionType>(Global->getValueType())) {
+ const MachineFunction &MF = *MO.getParent()->getParent()->getParent();
+ const TargetMachine &TM = MF.getTarget();
+ const Function &CurrentFunc = *MF.getFunction();
+
+ SmallVector<wasm::ValType, 4> Returns;
+ SmallVector<wasm::ValType, 4> Params;
+
+ wasm::ValType iPTR =
+ MF.getSubtarget<WebAssemblySubtarget>().hasAddr64() ?
+ wasm::ValType::I64 :
+ wasm::ValType::I32;
+
+ SmallVector<MVT, 4> ResultMVTs;
+ ComputeLegalValueVTs(CurrentFunc, TM, FuncTy->getReturnType(), ResultMVTs);
+ // WebAssembly can't currently handle returning tuples.
+ if (ResultMVTs.size() <= 1)
+ for (MVT ResultMVT : ResultMVTs)
+ Returns.push_back(WebAssembly::toValType(ResultMVT));
+ else
+ Params.push_back(iPTR);
+
+ for (Type *Ty : FuncTy->params()) {
+ SmallVector<MVT, 4> ParamMVTs;
+ ComputeLegalValueVTs(CurrentFunc, TM, Ty, ParamMVTs);
+ for (MVT ParamMVT : ParamMVTs)
+ Params.push_back(WebAssembly::toValType(ParamMVT));
+ }
+
+ if (FuncTy->isVarArg())
+ Params.push_back(iPTR);
+
+ WasmSym->setReturns(std::move(Returns));
+ WasmSym->setParams(std::move(Params));
+ WasmSym->setIsFunction(true);
+ }
+
+ return WasmSym;
}
MCSymbol *WebAssemblyMCInstLower::GetExternalSymbolSymbol(
const MachineOperand &MO) const {
- return Printer.GetExternalSymbolSymbol(MO.getSymbolName());
+ const char *Name = MO.getSymbolName();
+ MCSymbol *Sym = Printer.GetExternalSymbolSymbol(Name);
+ if (isa<MCSymbolELF>(Sym))
+ return Sym;
+
+ MCSymbolWasm *WasmSym = cast<MCSymbolWasm>(Sym);
+ const WebAssemblySubtarget &Subtarget = Printer.getSubtarget();
+
+ // __stack_pointer is a global variable; all other external symbols used by
+ // CodeGen are functions.
+ if (strcmp(Name, "__stack_pointer") == 0)
+ return WasmSym;
+
+ SmallVector<wasm::ValType, 4> Returns;
+ SmallVector<wasm::ValType, 4> Params;
+ GetSignature(Subtarget, Name, Returns, Params);
+
+ WasmSym->setReturns(std::move(Returns));
+ WasmSym->setParams(std::move(Params));
+ WasmSym->setIsFunction(true);
+
+ return WasmSym;
}
MCOperand WebAssemblyMCInstLower::LowerSymbolOperand(MCSymbol *Sym,
@@ -42,6 +112,9 @@ MCOperand WebAssemblyMCInstLower::LowerSymbolOperand(MCSymbol *Sym,
MCSymbolRefExpr::VariantKind VK =
IsFunc ? MCSymbolRefExpr::VK_WebAssembly_FUNCTION
: MCSymbolRefExpr::VK_None;
+ if (!isa<MCSymbolELF>(Sym))
+ cast<MCSymbolWasm>(Sym)->setIsFunction(IsFunc);
+
const MCExpr *Expr = MCSymbolRefExpr::create(Sym, VK, Ctx);
if (Offset != 0) {
@@ -54,20 +127,34 @@ MCOperand WebAssemblyMCInstLower::LowerSymbolOperand(MCSymbol *Sym,
return MCOperand::createExpr(Expr);
}
+// Return the WebAssembly type associated with the given register class.
+static wasm::ValType getType(const TargetRegisterClass *RC) {
+ if (RC == &WebAssembly::I32RegClass)
+ return wasm::ValType::I32;
+ if (RC == &WebAssembly::I64RegClass)
+ return wasm::ValType::I64;
+ if (RC == &WebAssembly::F32RegClass)
+ return wasm::ValType::F32;
+ if (RC == &WebAssembly::F64RegClass)
+ return wasm::ValType::F64;
+ llvm_unreachable("Unexpected register class");
+}
+
void WebAssemblyMCInstLower::Lower(const MachineInstr *MI,
MCInst &OutMI) const {
OutMI.setOpcode(MI->getOpcode());
+ const MCInstrDesc &Desc = MI->getDesc();
for (unsigned i = 0, e = MI->getNumOperands(); i != e; ++i) {
const MachineOperand &MO = MI->getOperand(i);
MCOperand MCOp;
switch (MO.getType()) {
default:
- MI->dump();
+ MI->print(errs());
llvm_unreachable("unknown operand type");
case MachineOperand::MO_MachineBasicBlock:
- MI->dump();
+ MI->print(errs());
llvm_unreachable("MachineBasicBlock operand should have been rewritten");
case MachineOperand::MO_Register: {
// Ignore all implicit register operands.
@@ -80,6 +167,41 @@ void WebAssemblyMCInstLower::Lower(const MachineInstr *MI,
break;
}
case MachineOperand::MO_Immediate:
+ if (i < Desc.NumOperands) {
+ const MCOperandInfo &Info = Desc.OpInfo[i];
+ if (Info.OperandType == WebAssembly::OPERAND_TYPEINDEX) {
+ MCSymbol *Sym = Printer.createTempSymbol("typeindex");
+ if (!isa<MCSymbolELF>(Sym)) {
+ SmallVector<wasm::ValType, 4> Returns;
+ SmallVector<wasm::ValType, 4> Params;
+
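+ // Reconstruct the callee's signature from register classes: the
+ // instruction's defs become results and its explicit register uses
+ // become parameters.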
+ const MachineRegisterInfo &MRI =
+ MI->getParent()->getParent()->getRegInfo();
+ for (const MachineOperand &MO : MI->defs())
+ Returns.push_back(getType(MRI.getRegClass(MO.getReg())));
+ for (const MachineOperand &MO : MI->explicit_uses())
+ if (MO.isReg())
+ Params.push_back(getType(MRI.getRegClass(MO.getReg())));
+
+ // call_indirect instructions have a callee operand at the end which
+ // doesn't count as a param.
+ if (WebAssembly::isCallIndirect(*MI))
+ Params.pop_back();
+
+ MCSymbolWasm *WasmSym = cast<MCSymbolWasm>(Sym);
+ WasmSym->setReturns(std::move(Returns));
+ WasmSym->setParams(std::move(Params));
+ WasmSym->setIsFunction(true);
+
+ const MCExpr *Expr =
+ MCSymbolRefExpr::create(WasmSym,
+ MCSymbolRefExpr::VK_WebAssembly_TYPEINDEX,
+ Ctx);
+ MCOp = MCOperand::createExpr(Expr);
+ break;
+ }
+ }
+ }
MCOp = MCOperand::createImm(MO.getImm());
break;
case MachineOperand::MO_FPImmediate: {
diff --git a/lib/Target/WebAssembly/WebAssemblyMCInstLower.h b/lib/Target/WebAssembly/WebAssemblyMCInstLower.h
index ab4ba1c28d53..d1d2794c3b8f 100644
--- a/lib/Target/WebAssembly/WebAssemblyMCInstLower.h
+++ b/lib/Target/WebAssembly/WebAssemblyMCInstLower.h
@@ -20,7 +20,7 @@
#include "llvm/Support/Compiler.h"
namespace llvm {
-class AsmPrinter;
+class WebAssemblyAsmPrinter;
class MCContext;
class MCSymbol;
class MachineInstr;
@@ -29,7 +29,7 @@ class MachineOperand;
/// This class is used to lower a MachineInstr into an MCInst.
class LLVM_LIBRARY_VISIBILITY WebAssemblyMCInstLower {
MCContext &Ctx;
- AsmPrinter &Printer;
+ WebAssemblyAsmPrinter &Printer;
MCSymbol *GetGlobalAddressSymbol(const MachineOperand &MO) const;
MCSymbol *GetExternalSymbolSymbol(const MachineOperand &MO) const;
@@ -37,7 +37,7 @@ class LLVM_LIBRARY_VISIBILITY WebAssemblyMCInstLower {
bool IsFunc) const;
public:
- WebAssemblyMCInstLower(MCContext &ctx, AsmPrinter &printer)
+ WebAssemblyMCInstLower(MCContext &ctx, WebAssemblyAsmPrinter &printer)
: Ctx(ctx), Printer(printer) {}
void Lower(const MachineInstr *MI, MCInst &OutMI) const;
};
diff --git a/lib/Target/WebAssembly/WebAssemblyMachineFunctionInfo.h b/lib/Target/WebAssembly/WebAssemblyMachineFunctionInfo.h
index 756619bebbed..1fcbb7791d4e 100644
--- a/lib/Target/WebAssembly/WebAssemblyMachineFunctionInfo.h
+++ b/lib/Target/WebAssembly/WebAssemblyMachineFunctionInfo.h
@@ -60,6 +60,8 @@ class WebAssemblyFunctionInfo final : public MachineFunctionInfo {
void addResult(MVT VT) { Results.push_back(VT); }
const std::vector<MVT> &getResults() const { return Results; }
+ void setNumLocals(size_t NumLocals) { Locals.resize(NumLocals, MVT::i32); }
+ void setLocal(size_t i, MVT VT) { Locals[i] = VT; }
void addLocal(MVT VT) { Locals.push_back(VT); }
const std::vector<MVT> &getLocals() const { return Locals; }
diff --git a/lib/Target/WebAssembly/WebAssemblyOptimizeReturned.cpp b/lib/Target/WebAssembly/WebAssemblyOptimizeReturned.cpp
index 96520aa5d28c..f4c9a4ef6b9c 100644
--- a/lib/Target/WebAssembly/WebAssemblyOptimizeReturned.cpp
+++ b/lib/Target/WebAssembly/WebAssemblyOptimizeReturned.cpp
@@ -54,7 +54,7 @@ FunctionPass *llvm::createWebAssemblyOptimizeReturned() {
void OptimizeReturned::visitCallSite(CallSite CS) {
for (unsigned i = 0, e = CS.getNumArgOperands(); i < e; ++i)
- if (CS.paramHasAttr(1 + i, Attribute::Returned)) {
+ if (CS.paramHasAttr(i, Attribute::Returned)) {
Instruction *Inst = CS.getInstruction();
Value *Arg = CS.getArgOperand(i);
// Ignore constants, globals, undef, etc.
diff --git a/lib/Target/WebAssembly/WebAssemblyPeephole.cpp b/lib/Target/WebAssembly/WebAssemblyPeephole.cpp
index 32dde88c2234..d2fbc5a22308 100644
--- a/lib/Target/WebAssembly/WebAssemblyPeephole.cpp
+++ b/lib/Target/WebAssembly/WebAssemblyPeephole.cpp
@@ -80,19 +80,31 @@ static bool MaybeRewriteToFallthrough(MachineInstr &MI, MachineBasicBlock &MBB,
return false;
if (&MBB != &MF.back())
return false;
- if (&MI != &MBB.back())
- return false;
+ if (MF.getSubtarget<WebAssemblySubtarget>()
+ .getTargetTriple().isOSBinFormatELF()) {
+ if (&MI != &MBB.back())
+ return false;
+ } else {
+ MachineBasicBlock::iterator End = MBB.end();
+ --End;
+ assert(End->getOpcode() == WebAssembly::END_FUNCTION);
+ --End;
+ if (&MI != &*End)
+ return false;
+ }
- // If the operand isn't stackified, insert a COPY to read the operand and
- // stackify it.
- MachineOperand &MO = MI.getOperand(0);
- unsigned Reg = MO.getReg();
- if (!MFI.isVRegStackified(Reg)) {
- unsigned NewReg = MRI.createVirtualRegister(MRI.getRegClass(Reg));
- BuildMI(MBB, MI, MI.getDebugLoc(), TII.get(CopyLocalOpc), NewReg)
- .addReg(Reg);
- MO.setReg(NewReg);
- MFI.stackifyVReg(NewReg);
+ if (FallthroughOpc != WebAssembly::FALLTHROUGH_RETURN_VOID) {
+ // If the operand isn't stackified, insert a COPY to read the operand and
+ // stackify it.
+ MachineOperand &MO = MI.getOperand(0);
+ unsigned Reg = MO.getReg();
+ if (!MFI.isVRegStackified(Reg)) {
+ unsigned NewReg = MRI.createVirtualRegister(MRI.getRegClass(Reg));
+ BuildMI(MBB, MI, MI.getDebugLoc(), TII.get(CopyLocalOpc), NewReg)
+ .addReg(Reg);
+ MO.setReg(NewReg);
+ MFI.stackifyVReg(NewReg);
+ }
}
// Rewrite the return.
@@ -127,7 +139,7 @@ bool WebAssemblyPeephole::runOnMachineFunction(MachineFunction &MF) {
if (Name == TLI.getLibcallName(RTLIB::MEMCPY) ||
Name == TLI.getLibcallName(RTLIB::MEMMOVE) ||
Name == TLI.getLibcallName(RTLIB::MEMSET)) {
- LibFunc::Func Func;
+ LibFunc Func;
if (LibInfo.getLibFunc(Name, Func)) {
const auto &Op2 = MI.getOperand(2);
if (!Op2.isReg())
@@ -188,9 +200,9 @@ bool WebAssemblyPeephole::runOnMachineFunction(MachineFunction &MF) {
WebAssembly::COPY_V128);
break;
case WebAssembly::RETURN_VOID:
- if (!DisableWebAssemblyFallthroughReturnOpt &&
- &MBB == &MF.back() && &MI == &MBB.back())
- MI.setDesc(TII.get(WebAssembly::FALLTHROUGH_RETURN_VOID));
+ Changed |= MaybeRewriteToFallthrough(
+ MI, MBB, MF, MFI, MRI, TII, WebAssembly::FALLTHROUGH_RETURN_VOID,
+ WebAssembly::INSTRUCTION_LIST_END);
break;
}
diff --git a/lib/Target/WebAssembly/WebAssemblyRegStackify.cpp b/lib/Target/WebAssembly/WebAssemblyRegStackify.cpp
index 32ee09e45796..57d454746b06 100644
--- a/lib/Target/WebAssembly/WebAssemblyRegStackify.cpp
+++ b/lib/Target/WebAssembly/WebAssemblyRegStackify.cpp
@@ -30,6 +30,7 @@
#include "llvm/CodeGen/MachineBlockFrequencyInfo.h"
#include "llvm/CodeGen/MachineDominators.h"
#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/MachineModuleInfoImpls.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
#include "llvm/CodeGen/Passes.h"
#include "llvm/Support/Debug.h"
@@ -152,7 +153,7 @@ static void QueryCallee(const MachineInstr &MI, unsigned CalleeOpNo, bool &Read,
}
// Determine whether MI reads memory, writes memory, has side effects,
-// and/or uses the __stack_pointer value.
+// and/or uses the stack pointer value.
static void Query(const MachineInstr &MI, AliasAnalysis &AA, bool &Read,
bool &Write, bool &Effects, bool &StackPointer) {
assert(!MI.isPosition());
@@ -169,15 +170,28 @@ static void Query(const MachineInstr &MI, AliasAnalysis &AA, bool &Read,
if (MI.mayStore()) {
Write = true;
- // Check for stores to __stack_pointer.
- for (auto MMO : MI.memoperands()) {
- const MachinePointerInfo &MPI = MMO->getPointerInfo();
- if (MPI.V.is<const PseudoSourceValue *>()) {
- auto PSV = MPI.V.get<const PseudoSourceValue *>();
- if (const ExternalSymbolPseudoSourceValue *EPSV =
- dyn_cast<ExternalSymbolPseudoSourceValue>(PSV))
- if (StringRef(EPSV->getSymbol()) == "__stack_pointer")
- StackPointer = true;
+ const MachineFunction &MF = *MI.getParent()->getParent();
+ if (MF.getSubtarget<WebAssemblySubtarget>()
+ .getTargetTriple().isOSBinFormatELF()) {
+ // Check for stores to __stack_pointer.
+ for (auto MMO : MI.memoperands()) {
+ const MachinePointerInfo &MPI = MMO->getPointerInfo();
+ if (MPI.V.is<const PseudoSourceValue *>()) {
+ auto PSV = MPI.V.get<const PseudoSourceValue *>();
+ if (const ExternalSymbolPseudoSourceValue *EPSV =
+ dyn_cast<ExternalSymbolPseudoSourceValue>(PSV))
+ if (StringRef(EPSV->getSymbol()) == "__stack_pointer")
+ StackPointer = true;
+ }
+ }
+ } else {
+ // Check for sets of the stack pointer.
+ const MachineModuleInfoWasm &MMIW =
+ MF.getMMI().getObjFileInfo<MachineModuleInfoWasm>();
+ if ((MI.getOpcode() == WebAssembly::SET_GLOBAL_I32 ||
+ MI.getOpcode() == WebAssembly::SET_GLOBAL_I64) &&
+ MI.getOperand(0).getImm() == MMIW.getStackPointerGlobal()) {
+ StackPointer = true;
}
}
} else if (MI.hasOrderedMemoryRef()) {
diff --git a/lib/Target/WebAssembly/WebAssemblyRuntimeLibcallSignatures.cpp b/lib/Target/WebAssembly/WebAssemblyRuntimeLibcallSignatures.cpp
new file mode 100644
index 000000000000..c02ef4a1c399
--- /dev/null
+++ b/lib/Target/WebAssembly/WebAssemblyRuntimeLibcallSignatures.cpp
@@ -0,0 +1,1302 @@
+//===-- WebAssemblyRuntimeLibcallSignatures.cpp - Runtime Libcall Signatures -//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+///
+/// \file
+/// \brief This file contains signature information for runtime libcalls.
+///
+/// CodeGen uses external symbols, which it refers to by name. The WebAssembly
+/// target needs type information for all functions. This file contains a big
+/// table providing type signatures for all runtime library functions that LLVM
+/// uses.
+///
+/// This is currently a fairly heavy-handed solution.
+///
+//===----------------------------------------------------------------------===//
+
+#include "WebAssemblyRuntimeLibcallSignatures.h"
+#include "WebAssemblySubtarget.h"
+#include "llvm/CodeGen/RuntimeLibcalls.h"
+
+using namespace llvm;
+
+namespace {
+
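+// Each enumerator encodes a signature as results_func_params; for example,
+// i32_func_f32_f32 is "i32 (f32, f32)", and iPTR stands for a pointer-sized
+// integer.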
+enum RuntimeLibcallSignature {
+ func,
+ f32_func_f32,
+ f32_func_f64,
+ f32_func_i32,
+ f32_func_i64,
+ f32_func_i16,
+ f64_func_f32,
+ f64_func_f64,
+ f64_func_i32,
+ f64_func_i64,
+ i32_func_f32,
+ i32_func_f64,
+ i32_func_i32,
+ i64_func_f32,
+ i64_func_f64,
+ i64_func_i64,
+ f32_func_f32_f32,
+ f32_func_f32_i32,
+ f32_func_i64_i64,
+ f64_func_f64_f64,
+ f64_func_f64_i32,
+ f64_func_i64_i64,
+ i16_func_f32,
+ i8_func_i8_i8,
+ func_f32_iPTR_iPTR,
+ func_f64_iPTR_iPTR,
+ i16_func_i16_i16,
+ i32_func_f32_f32,
+ i32_func_f64_f64,
+ i32_func_i32_i32,
+ i64_func_i64_i64,
+ i64_i64_func_f32,
+ i64_i64_func_f64,
+ i16_i16_func_i16_i16,
+ i32_i32_func_i32_i32,
+ i64_i64_func_i64_i64,
+ i64_i64_func_i64_i64_i64_i64,
+ i64_i64_i64_i64_func_i64_i64_i64_i64,
+ i64_i64_func_i64_i64_i32,
+ iPTR_func_iPTR_i32_iPTR,
+ iPTR_func_iPTR_iPTR_iPTR,
+ f32_func_f32_f32_f32,
+ f64_func_f64_f64_f64,
+ func_i64_i64_iPTR_iPTR,
+ func_iPTR_f32,
+ func_iPTR_f64,
+ func_iPTR_i32,
+ func_iPTR_i64,
+ func_iPTR_i64_i64,
+ func_iPTR_i64_i64_i64_i64,
+ func_iPTR_i64_i64_i64_i64_i64_i64,
+ i32_func_i64_i64,
+ i32_func_i64_i64_i64_i64,
+ unsupported
+};
+
+} // end anonymous namespace
+
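+// The table is indexed positionally by RTLIB::Libcall, so the /* NAME */
+// comments must remain in the exact order of the RTLIB enumeration for the
+// entries to line up.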
+static const RuntimeLibcallSignature
+RuntimeLibcallSignatures[RTLIB::UNKNOWN_LIBCALL] = {
+// Integer
+/* SHL_I16 */ i16_func_i16_i16,
+/* SHL_I32 */ i32_func_i32_i32,
+/* SHL_I64 */ i64_func_i64_i64,
+/* SHL_I128 */ i64_i64_func_i64_i64_i32,
+/* SRL_I16 */ i16_func_i16_i16,
+/* SRL_I32 */ i32_func_i32_i32,
+/* SRL_I64 */ i64_func_i64_i64,
+/* SRL_I128 */ i64_i64_func_i64_i64_i32,
+/* SRA_I16 */ i16_func_i16_i16,
+/* SRA_I32 */ i32_func_i32_i32,
+/* SRA_I64 */ i64_func_i64_i64,
+/* SRA_I128 */ i64_i64_func_i64_i64_i32,
+/* MUL_I8 */ i8_func_i8_i8,
+/* MUL_I16 */ i16_func_i16_i16,
+/* MUL_I32 */ i32_func_i32_i32,
+/* MUL_I64 */ i64_func_i64_i64,
+/* MUL_I128 */ i64_i64_func_i64_i64_i64_i64,
+/* MULO_I32 */ i32_func_i32_i32,
+/* MULO_I64 */ i64_func_i64_i64,
+/* MULO_I128 */ i64_i64_func_i64_i64_i64_i64,
+/* SDIV_I8 */ i8_func_i8_i8,
+/* SDIV_I16 */ i16_func_i16_i16,
+/* SDIV_I32 */ i32_func_i32_i32,
+/* SDIV_I64 */ i64_func_i64_i64,
+/* SDIV_I128 */ i64_i64_func_i64_i64_i64_i64,
+/* UDIV_I8 */ i8_func_i8_i8,
+/* UDIV_I16 */ i16_func_i16_i16,
+/* UDIV_I32 */ i32_func_i32_i32,
+/* UDIV_I64 */ i64_func_i64_i64,
+/* UDIV_I128 */ i64_i64_func_i64_i64_i64_i64,
+/* SREM_I8 */ i8_func_i8_i8,
+/* SREM_I16 */ i16_func_i16_i16,
+/* SREM_I32 */ i32_func_i32_i32,
+/* SREM_I64 */ i64_func_i64_i64,
+/* SREM_I128 */ i64_i64_func_i64_i64_i64_i64,
+/* UREM_I8 */ i8_func_i8_i8,
+/* UREM_I16 */ i16_func_i16_i16,
+/* UREM_I32 */ i32_func_i32_i32,
+/* UREM_I64 */ i64_func_i64_i64,
+/* UREM_I128 */ i64_i64_func_i64_i64_i64_i64,
+/* SDIVREM_I8 */ i8_func_i8_i8,
+/* SDIVREM_I16 */ i16_i16_func_i16_i16,
+/* SDIVREM_I32 */ i32_i32_func_i32_i32,
+/* SDIVREM_I64 */ i64_func_i64_i64,
+/* SDIVREM_I128 */ i64_i64_i64_i64_func_i64_i64_i64_i64,
+/* UDIVREM_I8 */ i8_func_i8_i8,
+/* UDIVREM_I16 */ i16_i16_func_i16_i16,
+/* UDIVREM_I32 */ i32_i32_func_i32_i32,
+/* UDIVREM_I64 */ i64_i64_func_i64_i64,
+/* UDIVREM_I128 */ i64_i64_i64_i64_func_i64_i64_i64_i64,
+/* NEG_I32 */ i32_func_i32,
+/* NEG_I64 */ i64_func_i64,
+
+// FLOATING POINT
+/* ADD_F32 */ f32_func_f32_f32,
+/* ADD_F64 */ f64_func_f64_f64,
+/* ADD_F80 */ unsupported,
+/* ADD_F128 */ func_iPTR_i64_i64_i64_i64,
+/* ADD_PPCF128 */ unsupported,
+/* SUB_F32 */ f32_func_f32_f32,
+/* SUB_F64 */ f64_func_f64_f64,
+/* SUB_F80 */ unsupported,
+/* SUB_F128 */ func_iPTR_i64_i64_i64_i64,
+/* SUB_PPCF128 */ unsupported,
+/* MUL_F32 */ f32_func_f32_f32,
+/* MUL_F64 */ f64_func_f64_f64,
+/* MUL_F80 */ unsupported,
+/* MUL_F128 */ func_iPTR_i64_i64_i64_i64,
+/* MUL_PPCF128 */ unsupported,
+/* DIV_F32 */ f32_func_f32_f32,
+/* DIV_F64 */ f64_func_f64_f64,
+/* DIV_F80 */ unsupported,
+/* DIV_F128 */ func_iPTR_i64_i64_i64_i64,
+/* DIV_PPCF128 */ unsupported,
+/* REM_F32 */ f32_func_f32_f32,
+/* REM_F64 */ f64_func_f64_f64,
+/* REM_F80 */ unsupported,
+/* REM_F128 */ func_iPTR_i64_i64_i64_i64,
+/* REM_PPCF128 */ unsupported,
+/* FMA_F32 */ f32_func_f32_f32_f32,
+/* FMA_F64 */ f64_func_f64_f64_f64,
+/* FMA_F80 */ unsupported,
+/* FMA_F128 */ func_iPTR_i64_i64_i64_i64_i64_i64,
+/* FMA_PPCF128 */ unsupported,
+/* POWI_F32 */ f32_func_f32_i32,
+/* POWI_F64 */ f64_func_f64_i32,
+/* POWI_F80 */ unsupported,
+/* POWI_F128 */ func_iPTR_i64_i64_i64_i64,
+/* POWI_PPCF128 */ unsupported,
+/* SQRT_F32 */ f32_func_f32,
+/* SQRT_F64 */ f64_func_f64,
+/* SQRT_F80 */ unsupported,
+/* SQRT_F128 */ func_iPTR_i64_i64,
+/* SQRT_PPCF128 */ unsupported,
+/* LOG_F32 */ f32_func_f32,
+/* LOG_F64 */ f64_func_f64,
+/* LOG_F80 */ unsupported,
+/* LOG_F128 */ func_iPTR_i64_i64,
+/* LOG_PPCF128 */ unsupported,
+/* LOG2_F32 */ f32_func_f32,
+/* LOG2_F64 */ f64_func_f64,
+/* LOG2_F80 */ unsupported,
+/* LOG2_F128 */ func_iPTR_i64_i64,
+/* LOG2_PPCF128 */ unsupported,
+/* LOG10_F32 */ f32_func_f32,
+/* LOG10_F64 */ f64_func_f64,
+/* LOG10_F80 */ unsupported,
+/* LOG10_F128 */ func_iPTR_i64_i64,
+/* LOG10_PPCF128 */ unsupported,
+/* EXP_F32 */ f32_func_f32,
+/* EXP_F64 */ f64_func_f64,
+/* EXP_F80 */ unsupported,
+/* EXP_F128 */ func_iPTR_i64_i64,
+/* EXP_PPCF128 */ unsupported,
+/* EXP2_F32 */ f32_func_f32,
+/* EXP2_F64 */ f64_func_f64,
+/* EXP2_F80 */ unsupported,
+/* EXP2_F128 */ func_iPTR_i64_i64,
+/* EXP2_PPCF128 */ unsupported,
+/* SIN_F32 */ f32_func_f32,
+/* SIN_F64 */ f64_func_f64,
+/* SIN_F80 */ unsupported,
+/* SIN_F128 */ func_iPTR_i64_i64,
+/* SIN_PPCF128 */ unsupported,
+/* COS_F32 */ f32_func_f32,
+/* COS_F64 */ f64_func_f64,
+/* COS_F80 */ unsupported,
+/* COS_F128 */ func_iPTR_i64_i64,
+/* COS_PPCF128 */ unsupported,
+/* SINCOS_F32 */ func_f32_iPTR_iPTR,
+/* SINCOS_F64 */ func_f64_iPTR_iPTR,
+/* SINCOS_F80 */ unsupported,
+/* SINCOS_F128 */ func_i64_i64_iPTR_iPTR,
+/* SINCOS_PPCF128 */ unsupported,
+/* POW_F32 */ f32_func_f32_f32,
+/* POW_F64 */ f64_func_f64_f64,
+/* POW_F80 */ unsupported,
+/* POW_F128 */ func_iPTR_i64_i64_i64_i64,
+/* POW_PPCF128 */ unsupported,
+/* CEIL_F32 */ f32_func_f32,
+/* CEIL_F64 */ f64_func_f64,
+/* CEIL_F80 */ unsupported,
+/* CEIL_F128 */ func_iPTR_i64_i64,
+/* CEIL_PPCF128 */ unsupported,
+/* TRUNC_F32 */ f32_func_f32,
+/* TRUNC_F64 */ f64_func_f64,
+/* TRUNC_F80 */ unsupported,
+/* TRUNC_F128 */ func_iPTR_i64_i64,
+/* TRUNC_PPCF128 */ unsupported,
+/* RINT_F32 */ f32_func_f32,
+/* RINT_F64 */ f64_func_f64,
+/* RINT_F80 */ unsupported,
+/* RINT_F128 */ func_iPTR_i64_i64,
+/* RINT_PPCF128 */ unsupported,
+/* NEARBYINT_F32 */ f32_func_f32,
+/* NEARBYINT_F64 */ f64_func_f64,
+/* NEARBYINT_F80 */ unsupported,
+/* NEARBYINT_F128 */ func_iPTR_i64_i64,
+/* NEARBYINT_PPCF128 */ unsupported,
+/* ROUND_F32 */ f32_func_f32,
+/* ROUND_F64 */ f64_func_f64,
+/* ROUND_F80 */ unsupported,
+/* ROUND_F128 */ func_iPTR_i64_i64,
+/* ROUND_PPCF128 */ unsupported,
+/* FLOOR_F32 */ f32_func_f32,
+/* FLOOR_F64 */ f64_func_f64,
+/* FLOOR_F80 */ unsupported,
+/* FLOOR_F128 */ func_iPTR_i64_i64,
+/* FLOOR_PPCF128 */ unsupported,
+/* COPYSIGN_F32 */ f32_func_f32_f32,
+/* COPYSIGN_F64 */ f64_func_f64_f64,
+/* COPYSIGN_F80 */ unsupported,
+/* COPYSIGN_F128 */ func_iPTR_i64_i64_i64_i64,
+/* COPYSIGN_PPCF128 */ unsupported,
+/* FMIN_F32 */ f32_func_f32_f32,
+/* FMIN_F64 */ f64_func_f64_f64,
+/* FMIN_F80 */ unsupported,
+/* FMIN_F128 */ func_iPTR_i64_i64_i64_i64,
+/* FMIN_PPCF128 */ unsupported,
+/* FMAX_F32 */ f32_func_f32_f32,
+/* FMAX_F64 */ f64_func_f64_f64,
+/* FMAX_F80 */ unsupported,
+/* FMAX_F128 */ func_iPTR_i64_i64_i64_i64,
+/* FMAX_PPCF128 */ unsupported,
+
+// CONVERSION
+/* FPEXT_F32_PPCF128 */ unsupported,
+/* FPEXT_F64_PPCF128 */ unsupported,
+/* FPEXT_F64_F128 */ func_iPTR_f64,
+/* FPEXT_F32_F128 */ func_iPTR_f32,
+/* FPEXT_F32_F64 */ f64_func_f32,
+/* FPEXT_F16_F32 */ f32_func_i16,
+/* FPROUND_F32_F16 */ i16_func_f32,
+/* FPROUND_F64_F16 */ unsupported,
+/* FPROUND_F80_F16 */ unsupported,
+/* FPROUND_F128_F16 */ unsupported,
+/* FPROUND_PPCF128_F16 */ unsupported,
+/* FPROUND_F64_F32 */ f32_func_f64,
+/* FPROUND_F80_F32 */ unsupported,
+/* FPROUND_F128_F32 */ f32_func_i64_i64,
+/* FPROUND_PPCF128_F32 */ unsupported,
+/* FPROUND_F80_F64 */ unsupported,
+/* FPROUND_F128_F64 */ f64_func_i64_i64,
+/* FPROUND_PPCF128_F64 */ unsupported,
+/* FPTOSINT_F32_I32 */ i32_func_f32,
+/* FPTOSINT_F32_I64 */ i64_func_f32,
+/* FPTOSINT_F32_I128 */ i64_i64_func_f32,
+/* FPTOSINT_F64_I32 */ i32_func_f64,
+/* FPTOSINT_F64_I64 */ i64_func_f64,
+/* FPTOSINT_F64_I128 */ i64_i64_func_f64,
+/* FPTOSINT_F80_I32 */ unsupported,
+/* FPTOSINT_F80_I64 */ unsupported,
+/* FPTOSINT_F80_I128 */ unsupported,
+/* FPTOSINT_F128_I32 */ i32_func_i64_i64,
+/* FPTOSINT_F128_I64 */ i64_func_i64_i64,
+/* FPTOSINT_F128_I128 */ i64_i64_func_i64_i64,
+/* FPTOSINT_PPCF128_I32 */ unsupported,
+/* FPTOSINT_PPCF128_I64 */ unsupported,
+/* FPTOSINT_PPCF128_I128 */ unsupported,
+/* FPTOUINT_F32_I32 */ i32_func_f32,
+/* FPTOUINT_F32_I64 */ i64_func_f32,
+/* FPTOUINT_F32_I128 */ i64_i64_func_f32,
+/* FPTOUINT_F64_I32 */ i32_func_f64,
+/* FPTOUINT_F64_I64 */ i64_func_f64,
+/* FPTOUINT_F64_I128 */ i64_i64_func_f64,
+/* FPTOUINT_F80_I32 */ unsupported,
+/* FPTOUINT_F80_I64 */ unsupported,
+/* FPTOUINT_F80_I128 */ unsupported,
+/* FPTOUINT_F128_I32 */ i32_func_i64_i64,
+/* FPTOUINT_F128_I64 */ i64_func_i64_i64,
+/* FPTOUINT_F128_I128 */ i64_i64_func_i64_i64,
+/* FPTOUINT_PPCF128_I32 */ unsupported,
+/* FPTOUINT_PPCF128_I64 */ unsupported,
+/* FPTOUINT_PPCF128_I128 */ unsupported,
+/* SINTTOFP_I32_F32 */ f32_func_i32,
+/* SINTTOFP_I32_F64 */ f64_func_i32,
+/* SINTTOFP_I32_F80 */ unsupported,
+/* SINTTOFP_I32_F128 */ func_iPTR_i32,
+/* SINTTOFP_I32_PPCF128 */ unsupported,
+/* SINTTOFP_I64_F32 */ f32_func_i64,
+/* SINTTOFP_I64_F64 */ f64_func_i64,
+/* SINTTOFP_I64_F80 */ unsupported,
+/* SINTTOFP_I64_F128 */ func_iPTR_i64,
+/* SINTTOFP_I64_PPCF128 */ unsupported,
+/* SINTTOFP_I128_F32 */ f32_func_i64_i64,
+/* SINTTOFP_I128_F64 */ f64_func_i64_i64,
+/* SINTTOFP_I128_F80 */ unsupported,
+/* SINTTOFP_I128_F128 */ func_iPTR_i64_i64,
+/* SINTTOFP_I128_PPCF128 */ unsupported,
+/* UINTTOFP_I32_F32 */ f32_func_i32,
+/* UINTTOFP_I32_F64 */ f64_func_i32,
+/* UINTTOFP_I32_F80 */ unsupported,
+/* UINTTOFP_I32_F128 */ func_iPTR_i32,
+/* UINTTOFP_I32_PPCF128 */ unsupported,
+/* UINTTOFP_I64_F32 */ f32_func_i64,
+/* UINTTOFP_I64_F64 */ f64_func_i64,
+/* UINTTOFP_I64_F80 */ unsupported,
+/* UINTTOFP_I64_F128 */ func_iPTR_i64,
+/* UINTTOFP_I64_PPCF128 */ unsupported,
+/* UINTTOFP_I128_F32 */ f32_func_i64_i64,
+/* UINTTOFP_I128_F64 */ f64_func_i64_i64,
+/* UINTTOFP_I128_F80 */ unsupported,
+/* UINTTOFP_I128_F128 */ func_iPTR_i64_i64,
+/* UINTTOFP_I128_PPCF128 */ unsupported,
+
+// COMPARISON
+/* OEQ_F32 */ i32_func_f32_f32,
+/* OEQ_F64 */ i32_func_f64_f64,
+/* OEQ_F128 */ i32_func_i64_i64_i64_i64,
+/* OEQ_PPCF128 */ unsupported,
+/* UNE_F32 */ i32_func_f32_f32,
+/* UNE_F64 */ i32_func_f64_f64,
+/* UNE_F128 */ i32_func_i64_i64_i64_i64,
+/* UNE_PPCF128 */ unsupported,
+/* OGE_F32 */ i32_func_f32_f32,
+/* OGE_F64 */ i32_func_f64_f64,
+/* OGE_F128 */ i32_func_i64_i64_i64_i64,
+/* OGE_PPCF128 */ unsupported,
+/* OLT_F32 */ i32_func_f32_f32,
+/* OLT_F64 */ i32_func_f64_f64,
+/* OLT_F128 */ i32_func_i64_i64_i64_i64,
+/* OLT_PPCF128 */ unsupported,
+/* OLE_F32 */ i32_func_f32_f32,
+/* OLE_F64 */ i32_func_f64_f64,
+/* OLE_F128 */ i32_func_i64_i64_i64_i64,
+/* OLE_PPCF128 */ unsupported,
+/* OGT_F32 */ i32_func_f32_f32,
+/* OGT_F64 */ i32_func_f64_f64,
+/* OGT_F128 */ i32_func_i64_i64_i64_i64,
+/* OGT_PPCF128 */ unsupported,
+/* UO_F32 */ i32_func_f32_f32,
+/* UO_F64 */ i32_func_f64_f64,
+/* UO_F128 */ i32_func_i64_i64_i64_i64,
+/* UO_PPCF128 */ unsupported,
+/* O_F32 */ i32_func_f32_f32,
+/* O_F64 */ i32_func_f64_f64,
+/* O_F128 */ i32_func_i64_i64_i64_i64,
+/* O_PPCF128 */ unsupported,
+
+// MEMORY
+/* MEMCPY */ iPTR_func_iPTR_iPTR_iPTR,
+/* MEMSET */ iPTR_func_iPTR_i32_iPTR,
+/* MEMMOVE */ iPTR_func_iPTR_iPTR_iPTR,
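+// Note: MEMSET's second argument is the 'int' fill value from C's
+// void *memset(void *s, int c, size_t n), hence the i32 in its signature.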
+
+// ELEMENT-WISE ATOMIC MEMORY
+/* MEMCPY_ELEMENT_ATOMIC_1 */ iPTR_func_iPTR_iPTR_iPTR,
+/* MEMCPY_ELEMENT_ATOMIC_2 */ iPTR_func_iPTR_iPTR_iPTR,
+/* MEMCPY_ELEMENT_ATOMIC_4 */ iPTR_func_iPTR_iPTR_iPTR,
+/* MEMCPY_ELEMENT_ATOMIC_8 */ iPTR_func_iPTR_iPTR_iPTR,
+/* MEMCPY_ELEMENT_ATOMIC_16 */ iPTR_func_iPTR_iPTR_iPTR,
+
+// EXCEPTION HANDLING
+/* UNWIND_RESUME */ unsupported,
+
+// Note: there are two sets of atomics libcalls; see
+// <http://llvm.org/docs/Atomics.html> for more information on the
+// difference between them.
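+//
+// For illustration only (these prototypes are not part of this file and the
+// exact types depend on the C runtime): '__sync_*' calls encode the access
+// size in the name, e.g.
+//   int __sync_val_compare_and_swap_4(int *p, int expected, int desired);
+// while '__atomic_*' calls take an explicit memory-order argument, e.g.
+//   int __atomic_fetch_add_4(int *p, int value, int memorder);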
+
+// Atomic '__sync_*' libcalls.
+/* SYNC_VAL_COMPARE_AND_SWAP_1 */ unsupported,
+/* SYNC_VAL_COMPARE_AND_SWAP_2 */ unsupported,
+/* SYNC_VAL_COMPARE_AND_SWAP_4 */ unsupported,
+/* SYNC_VAL_COMPARE_AND_SWAP_8 */ unsupported,
+/* SYNC_VAL_COMPARE_AND_SWAP_16 */ unsupported,
+/* SYNC_LOCK_TEST_AND_SET_1 */ unsupported,
+/* SYNC_LOCK_TEST_AND_SET_2 */ unsupported,
+/* SYNC_LOCK_TEST_AND_SET_4 */ unsupported,
+/* SYNC_LOCK_TEST_AND_SET_8 */ unsupported,
+/* SYNC_LOCK_TEST_AND_SET_16 */ unsupported,
+/* SYNC_FETCH_AND_ADD_1 */ unsupported,
+/* SYNC_FETCH_AND_ADD_2 */ unsupported,
+/* SYNC_FETCH_AND_ADD_4 */ unsupported,
+/* SYNC_FETCH_AND_ADD_8 */ unsupported,
+/* SYNC_FETCH_AND_ADD_16 */ unsupported,
+/* SYNC_FETCH_AND_SUB_1 */ unsupported,
+/* SYNC_FETCH_AND_SUB_2 */ unsupported,
+/* SYNC_FETCH_AND_SUB_4 */ unsupported,
+/* SYNC_FETCH_AND_SUB_8 */ unsupported,
+/* SYNC_FETCH_AND_SUB_16 */ unsupported,
+/* SYNC_FETCH_AND_AND_1 */ unsupported,
+/* SYNC_FETCH_AND_AND_2 */ unsupported,
+/* SYNC_FETCH_AND_AND_4 */ unsupported,
+/* SYNC_FETCH_AND_AND_8 */ unsupported,
+/* SYNC_FETCH_AND_AND_16 */ unsupported,
+/* SYNC_FETCH_AND_OR_1 */ unsupported,
+/* SYNC_FETCH_AND_OR_2 */ unsupported,
+/* SYNC_FETCH_AND_OR_4 */ unsupported,
+/* SYNC_FETCH_AND_OR_8 */ unsupported,
+/* SYNC_FETCH_AND_OR_16 */ unsupported,
+/* SYNC_FETCH_AND_XOR_1 */ unsupported,
+/* SYNC_FETCH_AND_XOR_2 */ unsupported,
+/* SYNC_FETCH_AND_XOR_4 */ unsupported,
+/* SYNC_FETCH_AND_XOR_8 */ unsupported,
+/* SYNC_FETCH_AND_XOR_16 */ unsupported,
+/* SYNC_FETCH_AND_NAND_1 */ unsupported,
+/* SYNC_FETCH_AND_NAND_2 */ unsupported,
+/* SYNC_FETCH_AND_NAND_4 */ unsupported,
+/* SYNC_FETCH_AND_NAND_8 */ unsupported,
+/* SYNC_FETCH_AND_NAND_16 */ unsupported,
+/* SYNC_FETCH_AND_MAX_1 */ unsupported,
+/* SYNC_FETCH_AND_MAX_2 */ unsupported,
+/* SYNC_FETCH_AND_MAX_4 */ unsupported,
+/* SYNC_FETCH_AND_MAX_8 */ unsupported,
+/* SYNC_FETCH_AND_MAX_16 */ unsupported,
+/* SYNC_FETCH_AND_UMAX_1 */ unsupported,
+/* SYNC_FETCH_AND_UMAX_2 */ unsupported,
+/* SYNC_FETCH_AND_UMAX_4 */ unsupported,
+/* SYNC_FETCH_AND_UMAX_8 */ unsupported,
+/* SYNC_FETCH_AND_UMAX_16 */ unsupported,
+/* SYNC_FETCH_AND_MIN_1 */ unsupported,
+/* SYNC_FETCH_AND_MIN_2 */ unsupported,
+/* SYNC_FETCH_AND_MIN_4 */ unsupported,
+/* SYNC_FETCH_AND_MIN_8 */ unsupported,
+/* SYNC_FETCH_AND_MIN_16 */ unsupported,
+/* SYNC_FETCH_AND_UMIN_1 */ unsupported,
+/* SYNC_FETCH_AND_UMIN_2 */ unsupported,
+/* SYNC_FETCH_AND_UMIN_4 */ unsupported,
+/* SYNC_FETCH_AND_UMIN_8 */ unsupported,
+/* SYNC_FETCH_AND_UMIN_16 */ unsupported,
+
+// Atomic '__atomic_*' libcalls.
+/* ATOMIC_LOAD */ unsupported,
+/* ATOMIC_LOAD_1 */ unsupported,
+/* ATOMIC_LOAD_2 */ unsupported,
+/* ATOMIC_LOAD_4 */ unsupported,
+/* ATOMIC_LOAD_8 */ unsupported,
+/* ATOMIC_LOAD_16 */ unsupported,
+
+/* ATOMIC_STORE */ unsupported,
+/* ATOMIC_STORE_1 */ unsupported,
+/* ATOMIC_STORE_2 */ unsupported,
+/* ATOMIC_STORE_4 */ unsupported,
+/* ATOMIC_STORE_8 */ unsupported,
+/* ATOMIC_STORE_16 */ unsupported,
+
+/* ATOMIC_EXCHANGE */ unsupported,
+/* ATOMIC_EXCHANGE_1 */ unsupported,
+/* ATOMIC_EXCHANGE_2 */ unsupported,
+/* ATOMIC_EXCHANGE_4 */ unsupported,
+/* ATOMIC_EXCHANGE_8 */ unsupported,
+/* ATOMIC_EXCHANGE_16 */ unsupported,
+
+/* ATOMIC_COMPARE_EXCHANGE */ unsupported,
+/* ATOMIC_COMPARE_EXCHANGE_1 */ unsupported,
+/* ATOMIC_COMPARE_EXCHANGE_2 */ unsupported,
+/* ATOMIC_COMPARE_EXCHANGE_4 */ unsupported,
+/* ATOMIC_COMPARE_EXCHANGE_8 */ unsupported,
+/* ATOMIC_COMPARE_EXCHANGE_16 */ unsupported,
+
+/* ATOMIC_FETCH_ADD_1 */ unsupported,
+/* ATOMIC_FETCH_ADD_2 */ unsupported,
+/* ATOMIC_FETCH_ADD_4 */ unsupported,
+/* ATOMIC_FETCH_ADD_8 */ unsupported,
+/* ATOMIC_FETCH_ADD_16 */ unsupported,
+
+/* ATOMIC_FETCH_SUB_1 */ unsupported,
+/* ATOMIC_FETCH_SUB_2 */ unsupported,
+/* ATOMIC_FETCH_SUB_4 */ unsupported,
+/* ATOMIC_FETCH_SUB_8 */ unsupported,
+/* ATOMIC_FETCH_SUB_16 */ unsupported,
+
+/* ATOMIC_FETCH_AND_1 */ unsupported,
+/* ATOMIC_FETCH_AND_2 */ unsupported,
+/* ATOMIC_FETCH_AND_4 */ unsupported,
+/* ATOMIC_FETCH_AND_8 */ unsupported,
+/* ATOMIC_FETCH_AND_16 */ unsupported,
+
+/* ATOMIC_FETCH_OR_1 */ unsupported,
+/* ATOMIC_FETCH_OR_2 */ unsupported,
+/* ATOMIC_FETCH_OR_4 */ unsupported,
+/* ATOMIC_FETCH_OR_8 */ unsupported,
+/* ATOMIC_FETCH_OR_16 */ unsupported,
+
+/* ATOMIC_FETCH_XOR_1 */ unsupported,
+/* ATOMIC_FETCH_XOR_2 */ unsupported,
+/* ATOMIC_FETCH_XOR_4 */ unsupported,
+/* ATOMIC_FETCH_XOR_8 */ unsupported,
+/* ATOMIC_FETCH_XOR_16 */ unsupported,
+
+/* ATOMIC_FETCH_NAND_1 */ unsupported,
+/* ATOMIC_FETCH_NAND_2 */ unsupported,
+/* ATOMIC_FETCH_NAND_4 */ unsupported,
+/* ATOMIC_FETCH_NAND_8 */ unsupported,
+/* ATOMIC_FETCH_NAND_16 */ unsupported,
+
+// Stack Protector Fail.
+/* STACKPROTECTOR_CHECK_FAIL */ func,
+
+// Deoptimization.
+/* DEOPTIMIZE */ unsupported,
+
+};
+
+static const char *
+RuntimeLibcallNames[RTLIB::UNKNOWN_LIBCALL] = {
+/* SHL_I16 */ "__ashlhi3",
+/* SHL_I32 */ "__ashlsi3",
+/* SHL_I64 */ "__ashldi3",
+/* SHL_I128 */ "__ashlti3",
+/* SRL_I16 */ "__lshrhi3",
+/* SRL_I32 */ "__lshrsi3",
+/* SRL_I64 */ "__lshrdi3",
+/* SRL_I128 */ "__lshrti3",
+/* SRA_I16 */ "__ashrhi3",
+/* SRA_I32 */ "__ashrsi3",
+/* SRA_I64 */ "__ashrdi3",
+/* SRA_I128 */ "__ashrti3",
+/* MUL_I8 */ "__mulqi3",
+/* MUL_I16 */ "__mulhi3",
+/* MUL_I32 */ "__mulsi3",
+/* MUL_I64 */ "__muldi3",
+/* MUL_I128 */ "__multi3",
+/* MULO_I32 */ "__mulosi4",
+/* MULO_I64 */ "__mulodi4",
+/* MULO_I128 */ "__muloti4",
+/* SDIV_I8 */ "__divqi3",
+/* SDIV_I16 */ "__divhi3",
+/* SDIV_I32 */ "__divsi3",
+/* SDIV_I64 */ "__divdi3",
+/* SDIV_I128 */ "__divti3",
+/* UDIV_I8 */ "__udivqi3",
+/* UDIV_I16 */ "__udivhi3",
+/* UDIV_I32 */ "__udivsi3",
+/* UDIV_I64 */ "__udivdi3",
+/* UDIV_I128 */ "__udivti3",
+/* SREM_I8 */ "__modqi3",
+/* SREM_I16 */ "__modhi3",
+/* SREM_I32 */ "__modsi3",
+/* SREM_I64 */ "__moddi3",
+/* SREM_I128 */ "__modti3",
+/* UREM_I8 */ "__umodqi3",
+/* UREM_I16 */ "__umodhi3",
+/* UREM_I32 */ "__umodsi3",
+/* UREM_I64 */ "__umoddi3",
+/* UREM_I128 */ "__umodti3",
+/* SDIVREM_I8 */ nullptr,
+/* SDIVREM_I16 */ nullptr,
+/* SDIVREM_I32 */ nullptr,
+/* SDIVREM_I64 */ nullptr,
+/* SDIVREM_I128 */ nullptr,
+/* UDIVREM_I8 */ nullptr,
+/* UDIVREM_I16 */ nullptr,
+/* UDIVREM_I32 */ nullptr,
+/* UDIVREM_I64 */ nullptr,
+/* UDIVREM_I128 */ nullptr,
+/* NEG_I32 */ "__negsi2",
+/* NEG_I64 */ "__negdi2",
+/* ADD_F32 */ "__addsf3",
+/* ADD_F64 */ "__adddf3",
+/* ADD_F80 */ nullptr,
+/* ADD_F128 */ "__addtf3",
+/* ADD_PPCF128 */ nullptr,
+/* SUB_F32 */ "__subsf3",
+/* SUB_F64 */ "__subdf3",
+/* SUB_F80 */ nullptr,
+/* SUB_F128 */ "__subtf3",
+/* SUB_PPCF128 */ nullptr,
+/* MUL_F32 */ "__mulsf3",
+/* MUL_F64 */ "__muldf3",
+/* MUL_F80 */ nullptr,
+/* MUL_F128 */ "__multf3",
+/* MUL_PPCF128 */ nullptr,
+/* DIV_F32 */ "__divsf3",
+/* DIV_F64 */ "__divdf3",
+/* DIV_F80 */ nullptr,
+/* DIV_F128 */ "__divtf3",
+/* DIV_PPCF128 */ nullptr,
+/* REM_F32 */ "fmodf",
+/* REM_F64 */ "fmod",
+/* REM_F80 */ nullptr,
+/* REM_F128 */ "fmodl",
+/* REM_PPCF128 */ nullptr,
+/* FMA_F32 */ "fmaf",
+/* FMA_F64 */ "fma",
+/* FMA_F80 */ nullptr,
+/* FMA_F128 */ "fmal",
+/* FMA_PPCF128 */ nullptr,
+/* POWI_F32 */ "__powisf2",
+/* POWI_F64 */ "__powidf2",
+/* POWI_F80 */ nullptr,
+/* POWI_F128 */ "__powitf2",
+/* POWI_PPCF128 */ nullptr,
+/* SQRT_F32 */ "sqrtf",
+/* SQRT_F64 */ "sqrt",
+/* SQRT_F80 */ nullptr,
+/* SQRT_F128 */ "sqrtl",
+/* SQRT_PPCF128 */ nullptr,
+/* LOG_F32 */ "logf",
+/* LOG_F64 */ "log",
+/* LOG_F80 */ nullptr,
+/* LOG_F128 */ "logl",
+/* LOG_PPCF128 */ nullptr,
+/* LOG2_F32 */ "log2f",
+/* LOG2_F64 */ "log2",
+/* LOG2_F80 */ nullptr,
+/* LOG2_F128 */ "log2l",
+/* LOG2_PPCF128 */ nullptr,
+/* LOG10_F32 */ "log10f",
+/* LOG10_F64 */ "log10",
+/* LOG10_F80 */ nullptr,
+/* LOG10_F128 */ "log10l",
+/* LOG10_PPCF128 */ nullptr,
+/* EXP_F32 */ "expf",
+/* EXP_F64 */ "exp",
+/* EXP_F80 */ nullptr,
+/* EXP_F128 */ "expl",
+/* EXP_PPCF128 */ nullptr,
+/* EXP2_F32 */ "exp2f",
+/* EXP2_F64 */ "exp2",
+/* EXP2_F80 */ nullptr,
+/* EXP2_F128 */ "exp2l",
+/* EXP2_PPCF128 */ nullptr,
+/* SIN_F32 */ "sinf",
+/* SIN_F64 */ "sin",
+/* SIN_F80 */ nullptr,
+/* SIN_F128 */ "sinl",
+/* SIN_PPCF128 */ nullptr,
+/* COS_F32 */ "cosf",
+/* COS_F64 */ "cos",
+/* COS_F80 */ nullptr,
+/* COS_F128 */ "cosl",
+/* COS_PPCF128 */ nullptr,
+/* SINCOS_F32 */ "sincosf",
+/* SINCOS_F64 */ "sincos",
+/* SINCOS_F80 */ nullptr,
+/* SINCOS_F128 */ "sincosl",
+/* SINCOS_PPCF128 */ nullptr,
+/* POW_F32 */ "powf",
+/* POW_F64 */ "pow",
+/* POW_F80 */ nullptr,
+/* POW_F128 */ "powl",
+/* POW_PPCF128 */ nullptr,
+/* CEIL_F32 */ "ceilf",
+/* CEIL_F64 */ "ceil",
+/* CEIL_F80 */ nullptr,
+/* CEIL_F128 */ "ceill",
+/* CEIL_PPCF128 */ nullptr,
+/* TRUNC_F32 */ "truncf",
+/* TRUNC_F64 */ "trunc",
+/* TRUNC_F80 */ nullptr,
+/* TRUNC_F128 */ "truncl",
+/* TRUNC_PPCF128 */ nullptr,
+/* RINT_F32 */ "rintf",
+/* RINT_F64 */ "rint",
+/* RINT_F80 */ nullptr,
+/* RINT_F128 */ "rintl",
+/* RINT_PPCF128 */ nullptr,
+/* NEARBYINT_F32 */ "nearbyintf",
+/* NEARBYINT_F64 */ "nearbyint",
+/* NEARBYINT_F80 */ nullptr,
+/* NEARBYINT_F128 */ "nearbyintl",
+/* NEARBYINT_PPCF128 */ nullptr,
+/* ROUND_F32 */ "roundf",
+/* ROUND_F64 */ "round",
+/* ROUND_F80 */ nullptr,
+/* ROUND_F128 */ "roundl",
+/* ROUND_PPCF128 */ nullptr,
+/* FLOOR_F32 */ "floorf",
+/* FLOOR_F64 */ "floor",
+/* FLOOR_F80 */ nullptr,
+/* FLOOR_F128 */ "floorl",
+/* FLOOR_PPCF128 */ nullptr,
+/* COPYSIGN_F32 */ "copysignf",
+/* COPYSIGN_F64 */ "copysign",
+/* COPYSIGN_F80 */ nullptr,
+/* COPYSIGN_F128 */ "copysignl",
+/* COPYSIGN_PPCF128 */ nullptr,
+/* FMIN_F32 */ "fminf",
+/* FMIN_F64 */ "fmin",
+/* FMIN_F80 */ nullptr,
+/* FMIN_F128 */ "fminl",
+/* FMIN_PPCF128 */ nullptr,
+/* FMAX_F32 */ "fmaxf",
+/* FMAX_F64 */ "fmax",
+/* FMAX_F80 */ nullptr,
+/* FMAX_F128 */ "fmaxl",
+/* FMAX_PPCF128 */ nullptr,
+/* FPEXT_F32_PPCF128 */ nullptr,
+/* FPEXT_F64_PPCF128 */ nullptr,
+/* FPEXT_F64_F128 */ "__extenddftf2",
+/* FPEXT_F32_F128 */ "__extendsftf2",
+/* FPEXT_F32_F64 */ "__extendsfdf2",
+/* FPEXT_F16_F32 */ "__gnu_h2f_ieee",
+/* FPROUND_F32_F16 */ "__gnu_f2h_ieee",
+/* FPROUND_F64_F16 */ nullptr,
+/* FPROUND_F80_F16 */ nullptr,
+/* FPROUND_F128_F16 */ nullptr,
+/* FPROUND_PPCF128_F16 */ nullptr,
+/* FPROUND_F64_F32 */ "__truncdfsf2",
+/* FPROUND_F80_F32 */ "__truncxfsf2",
+/* FPROUND_F128_F32 */ "__trunctfsf2",
+/* FPROUND_PPCF128_F32 */ nullptr,
+/* FPROUND_F80_F64 */ "__truncxfdf2",
+/* FPROUND_F128_F64 */ "__trunctfdf2",
+/* FPROUND_PPCF128_F64 */ nullptr,
+/* FPTOSINT_F32_I32 */ "__fixsfsi",
+/* FPTOSINT_F32_I64 */ "__fixsfdi",
+/* FPTOSINT_F32_I128 */ "__fixsfti",
+/* FPTOSINT_F64_I32 */ "__fixdfsi",
+/* FPTOSINT_F64_I64 */ "__fixdfdi",
+/* FPTOSINT_F64_I128 */ "__fixdfti",
+/* FPTOSINT_F80_I32 */ "__fixxfsi",
+/* FPTOSINT_F80_I64 */ "__fixxfdi",
+/* FPTOSINT_F80_I128 */ "__fixxfti",
+/* FPTOSINT_F128_I32 */ "__fixtfsi",
+/* FPTOSINT_F128_I64 */ "__fixtfdi",
+/* FPTOSINT_F128_I128 */ "__fixtfti",
+/* FPTOSINT_PPCF128_I32 */ nullptr,
+/* FPTOSINT_PPCF128_I64 */ nullptr,
+/* FPTOSINT_PPCF128_I128 */ nullptr,
+/* FPTOUINT_F32_I32 */ "__fixunssfsi",
+/* FPTOUINT_F32_I64 */ "__fixunssfdi",
+/* FPTOUINT_F32_I128 */ "__fixunssfti",
+/* FPTOUINT_F64_I32 */ "__fixunsdfsi",
+/* FPTOUINT_F64_I64 */ "__fixunsdfdi",
+/* FPTOUINT_F64_I128 */ "__fixunsdfti",
+/* FPTOUINT_F80_I32 */ "__fixunsxfsi",
+/* FPTOUINT_F80_I64 */ "__fixunsxfdi",
+/* FPTOUINT_F80_I128 */ "__fixunsxfti",
+/* FPTOUINT_F128_I32 */ "__fixunstfsi",
+/* FPTOUINT_F128_I64 */ "__fixunstfdi",
+/* FPTOUINT_F128_I128 */ "__fixunstfti",
+/* FPTOUINT_PPCF128_I32 */ nullptr,
+/* FPTOUINT_PPCF128_I64 */ nullptr,
+/* FPTOUINT_PPCF128_I128 */ nullptr,
+/* SINTTOFP_I32_F32 */ "__floatsisf",
+/* SINTTOFP_I32_F64 */ "__floatsidf",
+/* SINTTOFP_I32_F80 */ nullptr,
+/* SINTTOFP_I32_F128 */ "__floatsitf",
+/* SINTTOFP_I32_PPCF128 */ nullptr,
+/* SINTTOFP_I64_F32 */ "__floatdisf",
+/* SINTTOFP_I64_F64 */ "__floatdidf",
+/* SINTTOFP_I64_F80 */ nullptr,
+/* SINTTOFP_I64_F128 */ "__floatditf",
+/* SINTTOFP_I64_PPCF128 */ nullptr,
+/* SINTTOFP_I128_F32 */ "__floattisf",
+/* SINTTOFP_I128_F64 */ "__floattidf",
+/* SINTTOFP_I128_F80 */ nullptr,
+/* SINTTOFP_I128_F128 */ "__floattitf",
+/* SINTTOFP_I128_PPCF128 */ nullptr,
+/* UINTTOFP_I32_F32 */ "__floatunsisf",
+/* UINTTOFP_I32_F64 */ "__floatunsidf",
+/* UINTTOFP_I32_F80 */ nullptr,
+/* UINTTOFP_I32_F128 */ "__floatunsitf",
+/* UINTTOFP_I32_PPCF128 */ nullptr,
+/* UINTTOFP_I64_F32 */ "__floatundisf",
+/* UINTTOFP_I64_F64 */ "__floatundidf",
+/* UINTTOFP_I64_F80 */ nullptr,
+/* UINTTOFP_I64_F128 */ "__floatunditf",
+/* UINTTOFP_I64_PPCF128 */ nullptr,
+/* UINTTOFP_I128_F32 */ "__floatuntisf",
+/* UINTTOFP_I128_F64 */ "__floatuntidf",
+/* UINTTOFP_I128_F80 */ nullptr,
+/* UINTTOFP_I128_F128 */ "__floatuntitf",
+/* UINTTOFP_I128_PPCF128 */ nullptr,
+/* OEQ_F32 */ "__eqsf2",
+/* OEQ_F64 */ "__eqdf2",
+/* OEQ_F128 */ "__eqtf2",
+/* OEQ_PPCF128 */ nullptr,
+/* UNE_F32 */ "__nesf2",
+/* UNE_F64 */ "__nedf2",
+/* UNE_F128 */ "__netf2",
+/* UNE_PPCF128 */ nullptr,
+/* OGE_F32 */ "__gesf2",
+/* OGE_F64 */ "__gedf2",
+/* OGE_F128 */ "__getf2",
+/* OGE_PPCF128 */ nullptr,
+/* OLT_F32 */ "__ltsf2",
+/* OLT_F64 */ "__ltdf2",
+/* OLT_F128 */ "__lttf2",
+/* OLT_PPCF128 */ nullptr,
+/* OLE_F32 */ "__lesf2",
+/* OLE_F64 */ "__ledf2",
+/* OLE_F128 */ "__letf2",
+/* OLE_PPCF128 */ nullptr,
+/* OGT_F32 */ "__gtsf2",
+/* OGT_F64 */ "__gtdf2",
+/* OGT_F128 */ "__gttf2",
+/* OGT_PPCF128 */ nullptr,
+/* UO_F32 */ "__unordsf2",
+/* UO_F64 */ "__unorddf2",
+/* UO_F128 */ "__unordtf2",
+/* UO_PPCF128 */ nullptr,
+/* O_F32 */ "__unordsf2",
+/* O_F64 */ "__unorddf2",
+/* O_F128 */ "__unordtf2",
+/* O_PPCF128 */ nullptr,
+/* MEMCPY */ "memcpy",
+/* MEMMOVE */ "memset",
+/* MEMSET */ "memmove",
+/* MEMCPY_ELEMENT_ATOMIC_1 */ "MEMCPY_ELEMENT_ATOMIC_1",
+/* MEMCPY_ELEMENT_ATOMIC_2 */ "MEMCPY_ELEMENT_ATOMIC_2",
+/* MEMCPY_ELEMENT_ATOMIC_4 */ "MEMCPY_ELEMENT_ATOMIC_4",
+/* MEMCPY_ELEMENT_ATOMIC_8 */ "MEMCPY_ELEMENT_ATOMIC_8",
+/* MEMCPY_ELEMENT_ATOMIC_16 */ "MEMCPY_ELEMENT_ATOMIC_16",
+/* UNWIND_RESUME */ "_Unwind_Resume",
+/* SYNC_VAL_COMPARE_AND_SWAP_1 */ "__sync_val_compare_and_swap_1",
+/* SYNC_VAL_COMPARE_AND_SWAP_2 */ "__sync_val_compare_and_swap_2",
+/* SYNC_VAL_COMPARE_AND_SWAP_4 */ "__sync_val_compare_and_swap_4",
+/* SYNC_VAL_COMPARE_AND_SWAP_8 */ "__sync_val_compare_and_swap_8",
+/* SYNC_VAL_COMPARE_AND_SWAP_16 */ "__sync_val_compare_and_swap_16",
+/* SYNC_LOCK_TEST_AND_SET_1 */ "__sync_lock_test_and_set_1",
+/* SYNC_LOCK_TEST_AND_SET_2 */ "__sync_lock_test_and_set_2",
+/* SYNC_LOCK_TEST_AND_SET_4 */ "__sync_lock_test_and_set_4",
+/* SYNC_LOCK_TEST_AND_SET_8 */ "__sync_lock_test_and_set_8",
+/* SYNC_LOCK_TEST_AND_SET_16 */ "__sync_lock_test_and_set_16",
+/* SYNC_FETCH_AND_ADD_1 */ "__sync_fetch_and_add_1",
+/* SYNC_FETCH_AND_ADD_2 */ "__sync_fetch_and_add_2",
+/* SYNC_FETCH_AND_ADD_4 */ "__sync_fetch_and_add_4",
+/* SYNC_FETCH_AND_ADD_8 */ "__sync_fetch_and_add_8",
+/* SYNC_FETCH_AND_ADD_16 */ "__sync_fetch_and_add_16",
+/* SYNC_FETCH_AND_SUB_1 */ "__sync_fetch_and_sub_1",
+/* SYNC_FETCH_AND_SUB_2 */ "__sync_fetch_and_sub_2",
+/* SYNC_FETCH_AND_SUB_4 */ "__sync_fetch_and_sub_4",
+/* SYNC_FETCH_AND_SUB_8 */ "__sync_fetch_and_sub_8",
+/* SYNC_FETCH_AND_SUB_16 */ "__sync_fetch_and_sub_16",
+/* SYNC_FETCH_AND_AND_1 */ "__sync_fetch_and_and_1",
+/* SYNC_FETCH_AND_AND_2 */ "__sync_fetch_and_and_2",
+/* SYNC_FETCH_AND_AND_4 */ "__sync_fetch_and_and_4",
+/* SYNC_FETCH_AND_AND_8 */ "__sync_fetch_and_and_8",
+/* SYNC_FETCH_AND_AND_16 */ "__sync_fetch_and_and_16",
+/* SYNC_FETCH_AND_OR_1 */ "__sync_fetch_and_or_1",
+/* SYNC_FETCH_AND_OR_2 */ "__sync_fetch_and_or_2",
+/* SYNC_FETCH_AND_OR_4 */ "__sync_fetch_and_or_4",
+/* SYNC_FETCH_AND_OR_8 */ "__sync_fetch_and_or_8",
+/* SYNC_FETCH_AND_OR_16 */ "__sync_fetch_and_or_16",
+/* SYNC_FETCH_AND_XOR_1 */ "__sync_fetch_and_xor_1",
+/* SYNC_FETCH_AND_XOR_2 */ "__sync_fetch_and_xor_2",
+/* SYNC_FETCH_AND_XOR_4 */ "__sync_fetch_and_xor_4",
+/* SYNC_FETCH_AND_XOR_8 */ "__sync_fetch_and_xor_8",
+/* SYNC_FETCH_AND_XOR_16 */ "__sync_fetch_and_xor_16",
+/* SYNC_FETCH_AND_NAND_1 */ "__sync_fetch_and_nand_1",
+/* SYNC_FETCH_AND_NAND_2 */ "__sync_fetch_and_nand_2",
+/* SYNC_FETCH_AND_NAND_4 */ "__sync_fetch_and_nand_4",
+/* SYNC_FETCH_AND_NAND_8 */ "__sync_fetch_and_nand_8",
+/* SYNC_FETCH_AND_NAND_16 */ "__sync_fetch_and_nand_16",
+/* SYNC_FETCH_AND_MAX_1 */ "__sync_fetch_and_max_1",
+/* SYNC_FETCH_AND_MAX_2 */ "__sync_fetch_and_max_2",
+/* SYNC_FETCH_AND_MAX_4 */ "__sync_fetch_and_max_4",
+/* SYNC_FETCH_AND_MAX_8 */ "__sync_fetch_and_max_8",
+/* SYNC_FETCH_AND_MAX_16 */ "__sync_fetch_and_max_16",
+/* SYNC_FETCH_AND_UMAX_1 */ "__sync_fetch_and_umax_1",
+/* SYNC_FETCH_AND_UMAX_2 */ "__sync_fetch_and_umax_2",
+/* SYNC_FETCH_AND_UMAX_4 */ "__sync_fetch_and_umax_4",
+/* SYNC_FETCH_AND_UMAX_8 */ "__sync_fetch_and_umax_8",
+/* SYNC_FETCH_AND_UMAX_16 */ "__sync_fetch_and_umax_16",
+/* SYNC_FETCH_AND_MIN_1 */ "__sync_fetch_and_min_1",
+/* SYNC_FETCH_AND_MIN_2 */ "__sync_fetch_and_min_2",
+/* SYNC_FETCH_AND_MIN_4 */ "__sync_fetch_and_min_4",
+/* SYNC_FETCH_AND_MIN_8 */ "__sync_fetch_and_min_8",
+/* SYNC_FETCH_AND_MIN_16 */ "__sync_fetch_and_min_16",
+/* SYNC_FETCH_AND_UMIN_1 */ "__sync_fetch_and_umin_1",
+/* SYNC_FETCH_AND_UMIN_2 */ "__sync_fetch_and_umin_2",
+/* SYNC_FETCH_AND_UMIN_4 */ "__sync_fetch_and_umin_4",
+/* SYNC_FETCH_AND_UMIN_8 */ "__sync_fetch_and_umin_8",
+/* SYNC_FETCH_AND_UMIN_16 */ "__sync_fetch_and_umin_16",
+
+/* ATOMIC_LOAD */ "__atomic_load",
+/* ATOMIC_LOAD_1 */ "__atomic_load_1",
+/* ATOMIC_LOAD_2 */ "__atomic_load_2",
+/* ATOMIC_LOAD_4 */ "__atomic_load_4",
+/* ATOMIC_LOAD_8 */ "__atomic_load_8",
+/* ATOMIC_LOAD_16 */ "__atomic_load_16",
+
+/* ATOMIC_STORE */ "__atomic_store",
+/* ATOMIC_STORE_1 */ "__atomic_store_1",
+/* ATOMIC_STORE_2 */ "__atomic_store_2",
+/* ATOMIC_STORE_4 */ "__atomic_store_4",
+/* ATOMIC_STORE_8 */ "__atomic_store_8",
+/* ATOMIC_STORE_16 */ "__atomic_store_16",
+
+/* ATOMIC_EXCHANGE */ "__atomic_exchange",
+/* ATOMIC_EXCHANGE_1 */ "__atomic_exchange_1",
+/* ATOMIC_EXCHANGE_2 */ "__atomic_exchange_2",
+/* ATOMIC_EXCHANGE_4 */ "__atomic_exchange_4",
+/* ATOMIC_EXCHANGE_8 */ "__atomic_exchange_8",
+/* ATOMIC_EXCHANGE_16 */ "__atomic_exchange_16",
+
+/* ATOMIC_COMPARE_EXCHANGE */ "__atomic_compare_exchange",
+/* ATOMIC_COMPARE_EXCHANGE_1 */ "__atomic_compare_exchange_1",
+/* ATOMIC_COMPARE_EXCHANGE_2 */ "__atomic_compare_exchange_2",
+/* ATOMIC_COMPARE_EXCHANGE_4 */ "__atomic_compare_exchange_4",
+/* ATOMIC_COMPARE_EXCHANGE_8 */ "__atomic_compare_exchange_8",
+/* ATOMIC_COMPARE_EXCHANGE_16 */ "__atomic_compare_exchange_16",
+
+/* ATOMIC_FETCH_ADD_1 */ "__atomic_fetch_add_1",
+/* ATOMIC_FETCH_ADD_2 */ "__atomic_fetch_add_2",
+/* ATOMIC_FETCH_ADD_4 */ "__atomic_fetch_add_4",
+/* ATOMIC_FETCH_ADD_8 */ "__atomic_fetch_add_8",
+/* ATOMIC_FETCH_ADD_16 */ "__atomic_fetch_add_16",
+/* ATOMIC_FETCH_SUB_1 */ "__atomic_fetch_sub_1",
+/* ATOMIC_FETCH_SUB_2 */ "__atomic_fetch_sub_2",
+/* ATOMIC_FETCH_SUB_4 */ "__atomic_fetch_sub_4",
+/* ATOMIC_FETCH_SUB_8 */ "__atomic_fetch_sub_8",
+/* ATOMIC_FETCH_SUB_16 */ "__atomic_fetch_sub_16",
+/* ATOMIC_FETCH_AND_1 */ "__atomic_fetch_and_1",
+/* ATOMIC_FETCH_AND_2 */ "__atomic_fetch_and_2",
+/* ATOMIC_FETCH_AND_4 */ "__atomic_fetch_and_4",
+/* ATOMIC_FETCH_AND_8 */ "__atomic_fetch_and_8",
+/* ATOMIC_FETCH_AND_16 */ "__atomic_fetch_and_16",
+/* ATOMIC_FETCH_OR_1 */ "__atomic_fetch_or_1",
+/* ATOMIC_FETCH_OR_2 */ "__atomic_fetch_or_2",
+/* ATOMIC_FETCH_OR_4 */ "__atomic_fetch_or_4",
+/* ATOMIC_FETCH_OR_8 */ "__atomic_fetch_or_8",
+/* ATOMIC_FETCH_OR_16 */ "__atomic_fetch_or_16",
+/* ATOMIC_FETCH_XOR_1 */ "__atomic_fetch_xor_1",
+/* ATOMIC_FETCH_XOR_2 */ "__atomic_fetch_xor_2",
+/* ATOMIC_FETCH_XOR_4 */ "__atomic_fetch_xor_4",
+/* ATOMIC_FETCH_XOR_8 */ "__atomic_fetch_xor_8",
+/* ATOMIC_FETCH_XOR_16 */ "__atomic_fetch_xor_16",
+/* ATOMIC_FETCH_NAND_1 */ "__atomic_fetch_nand_1",
+/* ATOMIC_FETCH_NAND_2 */ "__atomic_fetch_nand_2",
+/* ATOMIC_FETCH_NAND_4 */ "__atomic_fetch_nand_4",
+/* ATOMIC_FETCH_NAND_8 */ "__atomic_fetch_nand_8",
+/* ATOMIC_FETCH_NAND_16 */ "__atomic_fetch_nand_16",
+
+/* STACKPROTECTOR_CHECK_FAIL */ "__stack_chk_fail",
+
+/* DEOPTIMIZE */ "__llvm_deoptimize",
+};
+
+void llvm::GetSignature(const WebAssemblySubtarget &Subtarget,
+ RTLIB::Libcall LC, SmallVectorImpl<wasm::ValType> &Rets,
+ SmallVectorImpl<wasm::ValType> &Params) {
+ assert(Rets.empty());
+ assert(Params.empty());
+
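+ // Pointer-sized (iPTR) values lower to i64 on wasm64 targets and to i32 on
+ // wasm32 targets.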
+ WebAssembly::ExprType iPTR = Subtarget.hasAddr64() ?
+ WebAssembly::ExprType::I64 :
+ WebAssembly::ExprType::I32;
+
+ switch (RuntimeLibcallSignatures[LC]) {
+ case func:
+ break;
+ case f32_func_f32:
+ Rets.push_back(wasm::ValType::F32);
+ Params.push_back(wasm::ValType::F32);
+ break;
+ case f32_func_f64:
+ Rets.push_back(wasm::ValType::F32);
+ Params.push_back(wasm::ValType::F64);
+ break;
+ case f32_func_i32:
+ Rets.push_back(wasm::ValType::F32);
+ Params.push_back(wasm::ValType::I32);
+ break;
+ case f32_func_i64:
+ Rets.push_back(wasm::ValType::F32);
+ Params.push_back(wasm::ValType::I64);
+ break;
+ case f32_func_i16:
+ Rets.push_back(wasm::ValType::F32);
+ Params.push_back(wasm::ValType::I32);
+ break;
+ case f64_func_f32:
+ Rets.push_back(wasm::ValType::F64);
+ Params.push_back(wasm::ValType::F32);
+ break;
+ case f64_func_f64:
+ Rets.push_back(wasm::ValType::F64);
+ Params.push_back(wasm::ValType::F64);
+ break;
+ case f64_func_i32:
+ Rets.push_back(wasm::ValType::F64);
+ Params.push_back(wasm::ValType::I32);
+ break;
+ case f64_func_i64:
+ Rets.push_back(wasm::ValType::F64);
+ Params.push_back(wasm::ValType::I64);
+ break;
+ case i32_func_f32:
+ Rets.push_back(wasm::ValType::I32);
+ Params.push_back(wasm::ValType::F32);
+ break;
+ case i32_func_f64:
+ Rets.push_back(wasm::ValType::I32);
+ Params.push_back(wasm::ValType::F64);
+ break;
+ case i32_func_i32:
+ Rets.push_back(wasm::ValType::I32);
+ Params.push_back(wasm::ValType::I32);
+ break;
+ case i64_func_f32:
+ Rets.push_back(wasm::ValType::I64);
+ Params.push_back(wasm::ValType::F32);
+ break;
+ case i64_func_f64:
+ Rets.push_back(wasm::ValType::I64);
+ Params.push_back(wasm::ValType::F64);
+ break;
+ case i64_func_i64:
+ Rets.push_back(wasm::ValType::I64);
+ Params.push_back(wasm::ValType::I64);
+ break;
+ case f32_func_f32_f32:
+ Rets.push_back(wasm::ValType::F32);
+ Params.push_back(wasm::ValType::F32);
+ Params.push_back(wasm::ValType::F32);
+ break;
+ case f32_func_f32_i32:
+ Rets.push_back(wasm::ValType::F32);
+ Params.push_back(wasm::ValType::F32);
+ Params.push_back(wasm::ValType::I32);
+ break;
+ case f32_func_i64_i64:
+ Rets.push_back(wasm::ValType::F32);
+ Params.push_back(wasm::ValType::I64);
+ Params.push_back(wasm::ValType::I64);
+ break;
+ case f64_func_f64_f64:
+ Rets.push_back(wasm::ValType::F64);
+ Params.push_back(wasm::ValType::F64);
+ Params.push_back(wasm::ValType::F64);
+ break;
+ case f64_func_f64_i32:
+ Rets.push_back(wasm::ValType::F64);
+ Params.push_back(wasm::ValType::F64);
+ Params.push_back(wasm::ValType::I32);
+ break;
+ case f64_func_i64_i64:
+ Rets.push_back(wasm::ValType::F64);
+ Params.push_back(wasm::ValType::I64);
+ Params.push_back(wasm::ValType::I64);
+ break;
+ case i16_func_f32:
+ Rets.push_back(wasm::ValType::I32);
+ Params.push_back(wasm::ValType::F32);
+ break;
+ case i8_func_i8_i8:
+ Rets.push_back(wasm::ValType::I32);
+ Params.push_back(wasm::ValType::I32);
+ Params.push_back(wasm::ValType::I32);
+ break;
+ case func_f32_iPTR_iPTR:
+ Params.push_back(wasm::ValType::F32);
+ Params.push_back(wasm::ValType(iPTR));
+ Params.push_back(wasm::ValType(iPTR));
+ break;
+ case func_f64_iPTR_iPTR:
+ Params.push_back(wasm::ValType::F64);
+ Params.push_back(wasm::ValType(iPTR));
+ Params.push_back(wasm::ValType(iPTR));
+ break;
+ case i16_func_i16_i16:
+ Rets.push_back(wasm::ValType::I32);
+ Params.push_back(wasm::ValType::I32);
+ Params.push_back(wasm::ValType::I32);
+ break;
+ case i32_func_f32_f32:
+ Rets.push_back(wasm::ValType::I32);
+ Params.push_back(wasm::ValType::F32);
+ Params.push_back(wasm::ValType::F32);
+ break;
+ case i32_func_f64_f64:
+ Rets.push_back(wasm::ValType::I32);
+ Params.push_back(wasm::ValType::F64);
+ Params.push_back(wasm::ValType::F64);
+ break;
+ case i32_func_i32_i32:
+ Rets.push_back(wasm::ValType::I32);
+ Params.push_back(wasm::ValType::I32);
+ Params.push_back(wasm::ValType::I32);
+ break;
+ case i64_func_i64_i64:
+ Rets.push_back(wasm::ValType::I64);
+ Params.push_back(wasm::ValType::I64);
+ Params.push_back(wasm::ValType::I64);
+ break;
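+ // For the multiple-result signatures below, results are currently returned
+ // indirectly through a pointer argument prepended to Params; the #if 0
+ // blocks record the direct lowering intended once wasm gains
+ // multiple-return-value support.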
+ case i64_i64_func_f32:
+#if 0 // TODO: Enable this when wasm gets multiple-return-value support.
+ Rets.push_back(wasm::ValType::I64);
+ Rets.push_back(wasm::ValType::I64);
+#else
+ Params.push_back(wasm::ValType(iPTR));
+#endif
+ Params.push_back(wasm::ValType::F32);
+ break;
+ case i64_i64_func_f64:
+#if 0 // TODO: Enable this when wasm gets multiple-return-value support.
+ Rets.push_back(wasm::ValType::I64);
+ Rets.push_back(wasm::ValType::I64);
+#else
+ Params.push_back(wasm::ValType(iPTR));
+#endif
+ Params.push_back(wasm::ValType::F64);
+ break;
+ case i16_i16_func_i16_i16:
+#if 0 // TODO: Enable this when wasm gets multiple-return-value support.
+ Rets.push_back(wasm::ValType::I32);
+ Rets.push_back(wasm::ValType::I32);
+#else
+ Params.push_back(wasm::ValType(iPTR));
+#endif
+ Params.push_back(wasm::ValType::I32);
+ Params.push_back(wasm::ValType::I32);
+ break;
+ case i32_i32_func_i32_i32:
+#if 0 // TODO: Enable this when wasm gets multiple-return-value support.
+ Rets.push_back(wasm::ValType::I32);
+ Rets.push_back(wasm::ValType::I32);
+#else
+ Params.push_back(wasm::ValType(iPTR));
+#endif
+ Params.push_back(wasm::ValType::I32);
+ Params.push_back(wasm::ValType::I32);
+ break;
+ case i64_i64_func_i64_i64:
+#if 0 // TODO: Enable this when wasm gets multiple-return-value support.
+ Rets.push_back(wasm::ValType::I64);
+ Rets.push_back(wasm::ValType::I64);
+#else
+ Params.push_back(wasm::ValType(iPTR));
+#endif
+ Params.push_back(wasm::ValType::I64);
+ Params.push_back(wasm::ValType::I64);
+ break;
+ case i64_i64_func_i64_i64_i64_i64:
+#if 0 // TODO: Enable this when wasm gets multiple-return-value support.
+ Rets.push_back(wasm::ValType::I64);
+ Rets.push_back(wasm::ValType::I64);
+#else
+ Params.push_back(wasm::ValType(iPTR));
+#endif
+ Params.push_back(wasm::ValType::I64);
+ Params.push_back(wasm::ValType::I64);
+ Params.push_back(wasm::ValType::I64);
+ Params.push_back(wasm::ValType::I64);
+ break;
+ case i64_i64_i64_i64_func_i64_i64_i64_i64:
+#if 0 // TODO: Enable this when wasm gets multiple-return-value support.
+ Rets.push_back(wasm::ValType::I64);
+ Rets.push_back(wasm::ValType::I64);
+ Rets.push_back(wasm::ValType::I64);
+ Rets.push_back(wasm::ValType::I64);
+#else
+ Params.push_back(wasm::ValType(iPTR));
+#endif
+ Params.push_back(wasm::ValType::I64);
+ Params.push_back(wasm::ValType::I64);
+ Params.push_back(wasm::ValType::I64);
+ Params.push_back(wasm::ValType::I64);
+ break;
+ case i64_i64_func_i64_i64_i32:
+#if 0 // TODO: Enable this when wasm gets multiple-return-value support.
+ Rets.push_back(wasm::ValType::I64);
+ Rets.push_back(wasm::ValType::I64);
+#else
+ Params.push_back(wasm::ValType(iPTR));
+#endif
+ Params.push_back(wasm::ValType::I64);
+ Params.push_back(wasm::ValType::I64);
+ Params.push_back(wasm::ValType::I32);
+ break;
+ case iPTR_func_iPTR_i32_iPTR:
+ Rets.push_back(wasm::ValType(iPTR));
+ Params.push_back(wasm::ValType(iPTR));
+ Params.push_back(wasm::ValType::I32);
+ Params.push_back(wasm::ValType(iPTR));
+ break;
+ case iPTR_func_iPTR_iPTR_iPTR:
+ Rets.push_back(wasm::ValType(iPTR));
+ Params.push_back(wasm::ValType(iPTR));
+ Params.push_back(wasm::ValType(iPTR));
+ Params.push_back(wasm::ValType(iPTR));
+ break;
+ case f32_func_f32_f32_f32:
+ Rets.push_back(wasm::ValType::F32);
+ Params.push_back(wasm::ValType::F32);
+ Params.push_back(wasm::ValType::F32);
+ Params.push_back(wasm::ValType::F32);
+ break;
+ case f64_func_f64_f64_f64:
+ Rets.push_back(wasm::ValType::F64);
+ Params.push_back(wasm::ValType::F64);
+ Params.push_back(wasm::ValType::F64);
+ Params.push_back(wasm::ValType::F64);
+ break;
+ case func_i64_i64_iPTR_iPTR:
+ Params.push_back(wasm::ValType::I64);
+ Params.push_back(wasm::ValType::I64);
+ Params.push_back(wasm::ValType(iPTR));
+ Params.push_back(wasm::ValType(iPTR));
+ break;
+ case func_iPTR_f32:
+ Params.push_back(wasm::ValType(iPTR));
+ Params.push_back(wasm::ValType::F32);
+ break;
+ case func_iPTR_f64:
+ Params.push_back(wasm::ValType(iPTR));
+ Params.push_back(wasm::ValType::F64);
+ break;
+ case func_iPTR_i32:
+ Params.push_back(wasm::ValType(iPTR));
+ Params.push_back(wasm::ValType::I32);
+ break;
+ case func_iPTR_i64:
+ Params.push_back(wasm::ValType(iPTR));
+ Params.push_back(wasm::ValType::I64);
+ break;
+ case func_iPTR_i64_i64:
+ Params.push_back(wasm::ValType(iPTR));
+ Params.push_back(wasm::ValType::I64);
+ Params.push_back(wasm::ValType::I64);
+ break;
+ case func_iPTR_i64_i64_i64_i64:
+ Params.push_back(wasm::ValType(iPTR));
+ Params.push_back(wasm::ValType::I64);
+ Params.push_back(wasm::ValType::I64);
+ Params.push_back(wasm::ValType::I64);
+ Params.push_back(wasm::ValType::I64);
+ break;
+ case func_iPTR_i64_i64_i64_i64_i64_i64:
+ Params.push_back(wasm::ValType(iPTR));
+ Params.push_back(wasm::ValType::I64);
+ Params.push_back(wasm::ValType::I64);
+ Params.push_back(wasm::ValType::I64);
+ Params.push_back(wasm::ValType::I64);
+ Params.push_back(wasm::ValType::I64);
+ Params.push_back(wasm::ValType::I64);
+ break;
+ case i32_func_i64_i64:
+ Rets.push_back(wasm::ValType::I32);
+ Params.push_back(wasm::ValType::I64);
+ Params.push_back(wasm::ValType::I64);
+ break;
+ case i32_func_i64_i64_i64_i64:
+ Rets.push_back(wasm::ValType::I32);
+ Params.push_back(wasm::ValType::I64);
+ Params.push_back(wasm::ValType::I64);
+ Params.push_back(wasm::ValType::I64);
+ Params.push_back(wasm::ValType::I64);
+ break;
+ case unsupported:
+ llvm_unreachable("unsupported runtime library signature");
+ }
+}
+
+void llvm::GetSignature(const WebAssemblySubtarget &Subtarget, const char *Name,
+ SmallVectorImpl<wasm::ValType> &Rets,
+ SmallVectorImpl<wasm::ValType> &Params) {
+ assert(strcmp(RuntimeLibcallNames[RTLIB::DEOPTIMIZE], "__llvm_deoptimize") ==
+ 0);
+
+ for (size_t i = 0, e = RTLIB::UNKNOWN_LIBCALL; i < e; ++i)
+ if (RuntimeLibcallNames[i] && strcmp(RuntimeLibcallNames[i], Name) == 0)
+ return GetSignature(Subtarget, RTLIB::Libcall(i), Rets, Params);
+
+ llvm_unreachable("unexpected runtime library name");
+}
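+
+// Example, using entries visible in the tables above: RTLIB::FMIN_F32
+// ("fminf") yields Rets == {F32} and Params == {F32, F32}, while
+// RTLIB::FMIN_F128 ("fminl") yields no Rets and
+// Params == {iPTR, I64, I64, I64, I64}, with the result stored through the
+// pointer argument.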
diff --git a/lib/Target/WebAssembly/WebAssemblyRuntimeLibcallSignatures.h b/lib/Target/WebAssembly/WebAssemblyRuntimeLibcallSignatures.h
new file mode 100644
index 000000000000..129067604784
--- /dev/null
+++ b/lib/Target/WebAssembly/WebAssemblyRuntimeLibcallSignatures.h
@@ -0,0 +1,37 @@
+//===-- WebAssemblyRuntimeLibcallSignatures.h - Libcall Signatures -*- C++ -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+///
+/// \file
+/// \brief This file provides signature information for runtime libcalls.
+///
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_TARGET_WEBASSEMBLY_RUNTIME_LIBCALL_SIGNATURES_H
+#define LLVM_LIB_TARGET_WEBASSEMBLY_RUNTIME_LIBCALL_SIGNATURES_H
+
+#include "MCTargetDesc/WebAssemblyMCTargetDesc.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/CodeGen/RuntimeLibcalls.h"
+
+namespace llvm {
+
+class WebAssemblySubtarget;
+
+extern void GetSignature(const WebAssemblySubtarget &Subtarget,
+ RTLIB::Libcall LC,
+ SmallVectorImpl<wasm::ValType> &Rets,
+ SmallVectorImpl<wasm::ValType> &Params);
+
+extern void GetSignature(const WebAssemblySubtarget &Subtarget,
+ const char *Name, SmallVectorImpl<wasm::ValType> &Rets,
+ SmallVectorImpl<wasm::ValType> &Params);
+
+} // end namespace llvm
+
+#endif // LLVM_LIB_TARGET_WEBASSEMBLY_RUNTIME_LIBCALL_SIGNATURES_H
diff --git a/lib/Target/WebAssembly/WebAssemblyStoreResults.cpp b/lib/Target/WebAssembly/WebAssemblyStoreResults.cpp
index 34ec6f2d34a7..a9aa781610ce 100644
--- a/lib/Target/WebAssembly/WebAssemblyStoreResults.cpp
+++ b/lib/Target/WebAssembly/WebAssemblyStoreResults.cpp
@@ -154,7 +154,7 @@ static bool optimizeCall(MachineBasicBlock &MBB, MachineInstr &MI,
if (!callReturnsInput)
return false;
- LibFunc::Func Func;
+ LibFunc Func;
if (!LibInfo.getLibFunc(Name, Func))
return false;
diff --git a/lib/Target/WebAssembly/WebAssemblyTargetMachine.cpp b/lib/Target/WebAssembly/WebAssemblyTargetMachine.cpp
index f5ef35a2ad40..44c794ef5da1 100644
--- a/lib/Target/WebAssembly/WebAssemblyTargetMachine.cpp
+++ b/lib/Target/WebAssembly/WebAssemblyTargetMachine.cpp
@@ -74,13 +74,25 @@ WebAssemblyTargetMachine::WebAssemblyTargetMachine(
: "e-m:e-p:32:32-i64:64-n32:64-S128",
TT, CPU, FS, Options, getEffectiveRelocModel(RM),
CM, OL),
- TLOF(make_unique<WebAssemblyTargetObjectFile>()) {
+ TLOF(TT.isOSBinFormatELF() ?
+ static_cast<TargetLoweringObjectFile*>(
+ new WebAssemblyTargetObjectFileELF()) :
+ static_cast<TargetLoweringObjectFile*>(
+ new WebAssemblyTargetObjectFile())) {
// WebAssembly type-checks instructions, but a noreturn function with a return
// type that doesn't match the context will cause a check failure. So we lower
// LLVM 'unreachable' to ISD::TRAP and then lower that to WebAssembly's
// 'unreachable' instruction, which is meant for that case.
this->Options.TrapUnreachable = true;
+ // WebAssembly treats each function as an independent unit. Force
+ // -ffunction-sections, effectively, so that we can emit them independently.
+ if (!TT.isOSBinFormatELF()) {
+ this->Options.FunctionSections = true;
+ this->Options.DataSections = true;
+ this->Options.UniqueSectionNames = true;
+ }
+
initAsmInfo();
// Note that we don't use setRequiresStructuredCFG(true). It disables
@@ -260,13 +272,19 @@ void WebAssemblyPassConfig::addPreEmitPass() {
addPass(createWebAssemblyRegColoring());
}
+ // Eliminate multiple-entry loops. Do this before inserting explicit get_local
+ // and set_local operators because we create a new variable that we want
+ // converted into a local.
+ addPass(createWebAssemblyFixIrreducibleControlFlow());
+
// Insert explicit get_local and set_local operators.
addPass(createWebAssemblyExplicitLocals());
- // Eliminate multiple-entry loops.
- addPass(createWebAssemblyFixIrreducibleControlFlow());
+ // Sort the blocks of the CFG into topological order, a prerequisite for
+ // BLOCK and LOOP markers.
+ addPass(createWebAssemblyCFGSort());
- // Put the CFG in structured form; insert BLOCK and LOOP markers.
+ // Insert BLOCK and LOOP markers.
addPass(createWebAssemblyCFGStackify());
// Lower br_unless into br_if.
diff --git a/lib/Target/WebAssembly/WebAssemblyTargetObjectFile.cpp b/lib/Target/WebAssembly/WebAssemblyTargetObjectFile.cpp
index 74e33b93e00d..b1fd108bc249 100644
--- a/lib/Target/WebAssembly/WebAssemblyTargetObjectFile.cpp
+++ b/lib/Target/WebAssembly/WebAssemblyTargetObjectFile.cpp
@@ -17,8 +17,14 @@
#include "WebAssemblyTargetMachine.h"
using namespace llvm;
-void WebAssemblyTargetObjectFile::Initialize(MCContext &Ctx,
- const TargetMachine &TM) {
+void WebAssemblyTargetObjectFileELF::Initialize(MCContext &Ctx,
+ const TargetMachine &TM) {
TargetLoweringObjectFileELF::Initialize(Ctx, TM);
InitializeELF(TM.Options.UseInitArray);
}
+
+void WebAssemblyTargetObjectFile::Initialize(MCContext &Ctx,
+ const TargetMachine &TM) {
+ TargetLoweringObjectFileWasm::Initialize(Ctx, TM);
+ InitializeWasm();
+}
diff --git a/lib/Target/WebAssembly/WebAssemblyTargetObjectFile.h b/lib/Target/WebAssembly/WebAssemblyTargetObjectFile.h
index 39e50c9c575d..ace87c9e442f 100644
--- a/lib/Target/WebAssembly/WebAssemblyTargetObjectFile.h
+++ b/lib/Target/WebAssembly/WebAssemblyTargetObjectFile.h
@@ -20,7 +20,13 @@
namespace llvm {
-class WebAssemblyTargetObjectFile final : public TargetLoweringObjectFileELF {
+class WebAssemblyTargetObjectFileELF final
+ : public TargetLoweringObjectFileELF {
+public:
+ void Initialize(MCContext &Ctx, const TargetMachine &TM) override;
+};
+
+class WebAssemblyTargetObjectFile final : public TargetLoweringObjectFileWasm {
public:
void Initialize(MCContext &Ctx, const TargetMachine &TM) override;
};
diff --git a/lib/Target/WebAssembly/WebAssemblyUtilities.cpp b/lib/Target/WebAssembly/WebAssemblyUtilities.cpp
index a0049c147d2c..e32772d491cf 100644
--- a/lib/Target/WebAssembly/WebAssemblyUtilities.cpp
+++ b/lib/Target/WebAssembly/WebAssemblyUtilities.cpp
@@ -15,6 +15,7 @@
#include "WebAssemblyUtilities.h"
#include "WebAssemblyMachineFunctionInfo.h"
#include "llvm/CodeGen/MachineInstr.h"
+#include "llvm/CodeGen/MachineLoopInfo.h"
using namespace llvm;
bool WebAssembly::isArgument(const MachineInstr &MI) {
@@ -69,3 +70,28 @@ bool WebAssembly::isChild(const MachineInstr &MI,
return TargetRegisterInfo::isVirtualRegister(Reg) &&
MFI.isVRegStackified(Reg);
}
+
+bool WebAssembly::isCallIndirect(const MachineInstr &MI) {
+ switch (MI.getOpcode()) {
+ case WebAssembly::CALL_INDIRECT_VOID:
+ case WebAssembly::CALL_INDIRECT_I32:
+ case WebAssembly::CALL_INDIRECT_I64:
+ case WebAssembly::CALL_INDIRECT_F32:
+ case WebAssembly::CALL_INDIRECT_F64:
+ case WebAssembly::CALL_INDIRECT_v16i8:
+ case WebAssembly::CALL_INDIRECT_v8i16:
+ case WebAssembly::CALL_INDIRECT_v4i32:
+ case WebAssembly::CALL_INDIRECT_v4f32:
+ return true;
+ default:
+ return false;
+ }
+}
+
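+// Choose the loop block with the highest layout number; unlike
+// MachineLoop::getBottomBlock, this also handles loops whose blocks are not
+// contiguous in layout order.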
+MachineBasicBlock *llvm::LoopBottom(const MachineLoop *Loop) {
+ MachineBasicBlock *Bottom = Loop->getHeader();
+ for (MachineBasicBlock *MBB : Loop->blocks())
+ if (MBB->getNumber() > Bottom->getNumber())
+ Bottom = MBB;
+ return Bottom;
+}
diff --git a/lib/Target/WebAssembly/WebAssemblyUtilities.h b/lib/Target/WebAssembly/WebAssemblyUtilities.h
index eb114403d14e..595491f1bf5b 100644
--- a/lib/Target/WebAssembly/WebAssemblyUtilities.h
+++ b/lib/Target/WebAssembly/WebAssemblyUtilities.h
@@ -18,7 +18,9 @@
namespace llvm {
+class MachineBasicBlock;
class MachineInstr;
+class MachineLoop;
class WebAssemblyFunctionInfo;
namespace WebAssembly {
@@ -27,8 +29,15 @@ bool isArgument(const MachineInstr &MI);
bool isCopy(const MachineInstr &MI);
bool isTee(const MachineInstr &MI);
bool isChild(const MachineInstr &MI, const WebAssemblyFunctionInfo &MFI);
+bool isCallIndirect(const MachineInstr &MI);
} // end namespace WebAssembly
+
+/// Return the "bottom" block of a loop. This differs from
+/// MachineLoop::getBottomBlock in that it works even if the loop is
+/// discontiguous.
+MachineBasicBlock *LoopBottom(const MachineLoop *Loop);
+
} // end namespace llvm
#endif
diff --git a/lib/Target/X86/AsmParser/X86AsmInstrumentation.cpp b/lib/Target/X86/AsmParser/X86AsmInstrumentation.cpp
index c38a7d1dd44d..788fac62626b 100644
--- a/lib/Target/X86/AsmParser/X86AsmInstrumentation.cpp
+++ b/lib/Target/X86/AsmParser/X86AsmInstrumentation.cpp
@@ -1,4 +1,4 @@
-//===-- X86AsmInstrumentation.cpp - Instrument X86 inline assembly C++ -*-===//
+//===-- X86AsmInstrumentation.cpp - Instrument X86 inline assembly --------===//
//
// The LLVM Compiler Infrastructure
//
@@ -7,24 +7,31 @@
//
//===----------------------------------------------------------------------===//
+#include "MCTargetDesc/X86MCTargetDesc.h"
#include "X86AsmInstrumentation.h"
-#include "MCTargetDesc/X86BaseInfo.h"
#include "X86Operand.h"
-#include "llvm/ADT/StringExtras.h"
+#include "llvm/ADT/Twine.h"
#include "llvm/ADT/Triple.h"
-#include "llvm/MC/MCAsmInfo.h"
#include "llvm/MC/MCContext.h"
+#include "llvm/MC/MCDwarf.h"
+#include "llvm/MC/MCExpr.h"
#include "llvm/MC/MCInst.h"
#include "llvm/MC/MCInstBuilder.h"
#include "llvm/MC/MCInstrInfo.h"
#include "llvm/MC/MCParser/MCParsedAsmOperand.h"
#include "llvm/MC/MCParser/MCTargetAsmParser.h"
+#include "llvm/MC/MCRegisterInfo.h"
#include "llvm/MC/MCStreamer.h"
#include "llvm/MC/MCSubtargetInfo.h"
#include "llvm/MC/MCTargetOptions.h"
#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/SMLoc.h"
#include <algorithm>
#include <cassert>
+#include <cstdint>
+#include <limits>
+#include <memory>
#include <vector>
// Following comment describes how assembly instrumentation works.
@@ -91,30 +98,35 @@
// register as a frame register and temporarily override the current CFA
// register.
-namespace llvm {
-namespace {
+using namespace llvm;
static cl::opt<bool> ClAsanInstrumentAssembly(
"asan-instrument-assembly",
cl::desc("instrument assembly with AddressSanitizer checks"), cl::Hidden,
cl::init(false));
-const int64_t MinAllowedDisplacement = std::numeric_limits<int32_t>::min();
-const int64_t MaxAllowedDisplacement = std::numeric_limits<int32_t>::max();
+static const int64_t MinAllowedDisplacement =
+ std::numeric_limits<int32_t>::min();
+static const int64_t MaxAllowedDisplacement =
+ std::numeric_limits<int32_t>::max();
-int64_t ApplyDisplacementBounds(int64_t Displacement) {
+static int64_t ApplyDisplacementBounds(int64_t Displacement) {
return std::max(std::min(MaxAllowedDisplacement, Displacement),
MinAllowedDisplacement);
}
-void CheckDisplacementBounds(int64_t Displacement) {
+static void CheckDisplacementBounds(int64_t Displacement) {
assert(Displacement >= MinAllowedDisplacement &&
Displacement <= MaxAllowedDisplacement);
}
-bool IsStackReg(unsigned Reg) { return Reg == X86::RSP || Reg == X86::ESP; }
+static bool IsStackReg(unsigned Reg) {
+ return Reg == X86::RSP || Reg == X86::ESP;
+}
-bool IsSmallMemAccess(unsigned AccessSize) { return AccessSize < 8; }
+static bool IsSmallMemAccess(unsigned AccessSize) { return AccessSize < 8; }
+
+namespace {
class X86AddressSanitizer : public X86AsmInstrumentation {
public:
@@ -178,7 +190,7 @@ public:
X86AddressSanitizer(const MCSubtargetInfo *&STI)
: X86AsmInstrumentation(STI), RepPrefix(false), OrigSPOffset(0) {}
- ~X86AddressSanitizer() override {}
+ ~X86AddressSanitizer() override = default;
// X86AsmInstrumentation implementation:
void InstrumentAndEmitInstruction(const MCInst &Inst,
@@ -255,9 +267,11 @@ protected:
bool is64BitMode() const {
return STI->getFeatureBits()[X86::Mode64Bit];
}
+
bool is32BitMode() const {
return STI->getFeatureBits()[X86::Mode32Bit];
}
+
bool is16BitMode() const {
return STI->getFeatureBits()[X86::Mode16Bit];
}
@@ -498,7 +512,7 @@ public:
X86AddressSanitizer32(const MCSubtargetInfo *&STI)
: X86AddressSanitizer(STI) {}
- ~X86AddressSanitizer32() override {}
+ ~X86AddressSanitizer32() override = default;
unsigned GetFrameReg(const MCContext &Ctx, MCStreamer &Out) {
unsigned FrameReg = GetFrameRegGeneric(Ctx, Out);
@@ -604,9 +618,9 @@ private:
EmitInstruction(
Out, MCInstBuilder(X86::PUSH32r).addReg(RegCtx.AddressReg(32)));
- MCSymbol *FnSym = Ctx.getOrCreateSymbol(llvm::Twine("__asan_report_") +
+ MCSymbol *FnSym = Ctx.getOrCreateSymbol(Twine("__asan_report_") +
(IsWrite ? "store" : "load") +
- llvm::Twine(AccessSize));
+ Twine(AccessSize));
const MCSymbolRefExpr *FnExpr =
MCSymbolRefExpr::create(FnSym, MCSymbolRefExpr::VK_PLT, Ctx);
EmitInstruction(Out, MCInstBuilder(X86::CALLpcrel32).addExpr(FnExpr));
@@ -756,7 +770,7 @@ public:
X86AddressSanitizer64(const MCSubtargetInfo *&STI)
: X86AddressSanitizer(STI) {}
- ~X86AddressSanitizer64() override {}
+ ~X86AddressSanitizer64() override = default;
unsigned GetFrameReg(const MCContext &Ctx, MCStreamer &Out) {
unsigned FrameReg = GetFrameRegGeneric(Ctx, Out);
@@ -875,15 +889,17 @@ private:
EmitInstruction(Out, MCInstBuilder(X86::MOV64rr).addReg(X86::RDI).addReg(
RegCtx.AddressReg(64)));
}
- MCSymbol *FnSym = Ctx.getOrCreateSymbol(llvm::Twine("__asan_report_") +
+ MCSymbol *FnSym = Ctx.getOrCreateSymbol(Twine("__asan_report_") +
(IsWrite ? "store" : "load") +
- llvm::Twine(AccessSize));
+ Twine(AccessSize));
const MCSymbolRefExpr *FnExpr =
MCSymbolRefExpr::create(FnSym, MCSymbolRefExpr::VK_PLT, Ctx);
EmitInstruction(Out, MCInstBuilder(X86::CALL64pcrel32).addExpr(FnExpr));
}
};
+} // end anonymous namespace
+
void X86AddressSanitizer64::InstrumentMemOperandSmall(
X86Operand &Op, unsigned AccessSize, bool IsWrite,
const RegisterContext &RegCtx, MCContext &Ctx, MCStreamer &Out) {
@@ -1022,12 +1038,10 @@ void X86AddressSanitizer64::InstrumentMOVSImpl(unsigned AccessSize,
RestoreFlags(Out);
}
-} // End anonymous namespace
-
X86AsmInstrumentation::X86AsmInstrumentation(const MCSubtargetInfo *&STI)
- : STI(STI), InitialFrameReg(0) {}
+ : STI(STI) {}
-X86AsmInstrumentation::~X86AsmInstrumentation() {}
+X86AsmInstrumentation::~X86AsmInstrumentation() = default;
void X86AsmInstrumentation::InstrumentAndEmitInstruction(
const MCInst &Inst, OperandVector &Operands, MCContext &Ctx,
@@ -1060,8 +1074,9 @@ unsigned X86AsmInstrumentation::GetFrameRegGeneric(const MCContext &Ctx,
}
X86AsmInstrumentation *
-CreateX86AsmInstrumentation(const MCTargetOptions &MCOptions,
- const MCContext &Ctx, const MCSubtargetInfo *&STI) {
+llvm::CreateX86AsmInstrumentation(const MCTargetOptions &MCOptions,
+ const MCContext &Ctx,
+ const MCSubtargetInfo *&STI) {
Triple T(STI->getTargetTriple());
const bool hasCompilerRTSupport = T.isOSLinux();
if (ClAsanInstrumentAssembly && hasCompilerRTSupport &&
@@ -1073,5 +1088,3 @@ CreateX86AsmInstrumentation(const MCTargetOptions &MCOptions,
}
return new X86AsmInstrumentation(STI);
}
-
-} // end llvm namespace
diff --git a/lib/Target/X86/AsmParser/X86AsmInstrumentation.h b/lib/Target/X86/AsmParser/X86AsmInstrumentation.h
index 470ceadb0aa6..97a55cd8ad98 100644
--- a/lib/Target/X86/AsmParser/X86AsmInstrumentation.h
+++ b/lib/Target/X86/AsmParser/X86AsmInstrumentation.h
@@ -1,4 +1,4 @@
-//===- X86AsmInstrumentation.h - Instrument X86 inline assembly *- C++ -*-===//
+//===- X86AsmInstrumentation.h - Instrument X86 inline assembly -*- C++ -*-===//
//
// The LLVM Compiler Infrastructure
//
@@ -11,7 +11,6 @@
#define LLVM_LIB_TARGET_X86_ASMPARSER_X86ASMINSTRUMENTATION_H
#include "llvm/ADT/SmallVector.h"
-
#include <memory>
namespace llvm {
@@ -23,7 +22,6 @@ class MCParsedAsmOperand;
class MCStreamer;
class MCSubtargetInfo;
class MCTargetOptions;
-
class X86AsmInstrumentation;
X86AsmInstrumentation *
@@ -43,7 +41,7 @@ public:
// Tries to instrument and emit instruction.
virtual void InstrumentAndEmitInstruction(
const MCInst &Inst,
- SmallVectorImpl<std::unique_ptr<MCParsedAsmOperand> > &Operands,
+ SmallVectorImpl<std::unique_ptr<MCParsedAsmOperand>> &Operands,
MCContext &Ctx, const MCInstrInfo &MII, MCStreamer &Out);
protected:
@@ -60,9 +58,9 @@ protected:
const MCSubtargetInfo *&STI;
- unsigned InitialFrameReg;
+ unsigned InitialFrameReg = 0;
};
-} // End llvm namespace
+} // end namespace llvm
-#endif
+#endif // LLVM_LIB_TARGET_X86_ASMPARSER_X86ASMINSTRUMENTATION_H
diff --git a/lib/Target/X86/AsmParser/X86AsmParser.cpp b/lib/Target/X86/AsmParser/X86AsmParser.cpp
index e692118f47fd..324da650e74e 100644
--- a/lib/Target/X86/AsmParser/X86AsmParser.cpp
+++ b/lib/Target/X86/AsmParser/X86AsmParser.cpp
@@ -98,6 +98,14 @@ private:
IC_REGISTER
};
+ enum IntelOperatorKind {
+ IOK_INVALID = 0,
+ IOK_LENGTH,
+ IOK_SIZE,
+ IOK_TYPE,
+ IOK_OFFSET
+ };
+
class InfixCalculator {
typedef std::pair< InfixCalculatorTok, int64_t > ICToken;
SmallVector<InfixCalculatorTok, 4> InfixOperatorStack;
@@ -704,10 +712,12 @@ private:
std::unique_ptr<X86Operand> ParseIntelOperand();
std::unique_ptr<X86Operand> ParseIntelOffsetOfOperator();
bool ParseIntelDotOperator(const MCExpr *Disp, const MCExpr *&NewDisp);
- std::unique_ptr<X86Operand> ParseIntelOperator(unsigned OpKind);
+ unsigned IdentifyIntelOperator(StringRef Name);
+ unsigned ParseIntelOperator(unsigned OpKind);
std::unique_ptr<X86Operand>
ParseIntelSegmentOverride(unsigned SegReg, SMLoc Start, unsigned Size);
std::unique_ptr<X86Operand> ParseRoundingModeOp(SMLoc Start, SMLoc End);
+ bool ParseIntelNamedOperator(StringRef Name, IntelExprStateMachine &SM);
bool ParseIntelExpression(IntelExprStateMachine &SM, SMLoc &End);
std::unique_ptr<X86Operand>
ParseIntelBracExpression(unsigned SegReg, SMLoc Start, int64_t ImmDisp,
@@ -814,6 +824,7 @@ private:
/// }
public:
+
X86AsmParser(const MCSubtargetInfo &sti, MCAsmParser &Parser,
const MCInstrInfo &mii, const MCTargetOptions &Options)
: MCTargetAsmParser(Options, sti), MII(mii), InstInfo(nullptr),
@@ -1266,10 +1277,12 @@ RewriteIntelBracExpression(SmallVectorImpl<AsmRewrite> &AsmRewrites,
}
}
// Remove all the ImmPrefix rewrites within the brackets.
+ // We may also have some Imm rewrites as a result of an operator being
+ // applied; remove them as well.
for (AsmRewrite &AR : AsmRewrites) {
if (AR.Loc.getPointer() < StartInBrac.getPointer())
continue;
- if (AR.Kind == AOK_ImmPrefix)
+ if (AR.Kind == AOK_ImmPrefix || AR.Kind == AOK_Imm)
AR.Kind = AOK_Delete;
}
const char *SymLocPtr = SymName.data();
@@ -1286,6 +1299,30 @@ RewriteIntelBracExpression(SmallVectorImpl<AsmRewrite> &AsmRewrites,
}
}
+// Some binary bitwise operators have named synonyms.
+// Query a candidate string for such a named operator and, if so, invoke
+// the appropriate handler.
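+// For example, in Intel syntax "1 SHL 2" parses the same as "1 << 2".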
+bool X86AsmParser::ParseIntelNamedOperator(StringRef Name, IntelExprStateMachine &SM) {
+ // A named operator must be all lowercase or all uppercase, not mixed case.
+ if (Name.compare(Name.lower()) && Name.compare(Name.upper()))
+ return false;
+ if (Name.equals_lower("not"))
+ SM.onNot();
+ else if (Name.equals_lower("or"))
+ SM.onOr();
+ else if (Name.equals_lower("shl"))
+ SM.onLShift();
+ else if (Name.equals_lower("shr"))
+ SM.onRShift();
+ else if (Name.equals_lower("xor"))
+ SM.onXor();
+ else if (Name.equals_lower("and"))
+ SM.onAnd();
+ else
+ return false;
+ return true;
+}
+
bool X86AsmParser::ParseIntelExpression(IntelExprStateMachine &SM, SMLoc &End) {
MCAsmParser &Parser = getParser();
const AsmToken &Tok = Parser.getTok();
@@ -1324,31 +1361,36 @@ bool X86AsmParser::ParseIntelExpression(IntelExprStateMachine &SM, SMLoc &End) {
const MCExpr *Val;
SMLoc IdentLoc = Tok.getLoc();
StringRef Identifier = Tok.getString();
+ UpdateLocLex = false;
if (TK != AsmToken::String && !ParseRegister(TmpReg, IdentLoc, End)) {
SM.onRegister(TmpReg);
- UpdateLocLex = false;
- break;
+ } else if (ParseIntelNamedOperator(Identifier, SM)) {
+ UpdateLocLex = true;
+ } else if (!isParsingInlineAsm()) {
+ if (getParser().parsePrimaryExpr(Val, End))
+ return Error(Tok.getLoc(), "Unexpected identifier!");
+ SM.onIdentifierExpr(Val, Identifier);
+ } else if (unsigned OpKind = IdentifyIntelOperator(Identifier)) {
+ if (OpKind == IOK_OFFSET)
+ return Error(IdentLoc, "parsing the OFFSET operator as part of "
+ "a compound immediate expression is not yet supported");
+ int64_t Val = ParseIntelOperator(OpKind);
+ if (!Val)
+ return true;
+ StringRef ErrMsg;
+ if (SM.onInteger(Val, ErrMsg))
+ return Error(IdentLoc, ErrMsg);
+ } else if (Identifier.find('.') != StringRef::npos &&
+ PrevTK == AsmToken::RBrac) {
+ return false;
} else {
- if (!isParsingInlineAsm()) {
- if (getParser().parsePrimaryExpr(Val, End))
- return Error(Tok.getLoc(), "Unexpected identifier!");
- } else {
- // This is a dot operator, not an adjacent identifier.
- if (Identifier.find('.') != StringRef::npos &&
- PrevTK == AsmToken::RBrac) {
- return false;
- } else {
- InlineAsmIdentifierInfo &Info = SM.getIdentifierInfo();
- if (ParseIntelIdentifier(Val, Identifier, Info,
- /*Unevaluated=*/false, End))
- return true;
- }
- }
+ InlineAsmIdentifierInfo &Info = SM.getIdentifierInfo();
+ if (ParseIntelIdentifier(Val, Identifier, Info,
+ /*Unevaluated=*/false, End))
+ return true;
SM.onIdentifierExpr(Val, Identifier);
- UpdateLocLex = false;
- break;
}
- return Error(Tok.getLoc(), "Unexpected identifier!");
+ break;
}
case AsmToken::Integer: {
StringRef ErrMsg;
@@ -1715,11 +1757,16 @@ std::unique_ptr<X86Operand> X86AsmParser::ParseIntelOffsetOfOperator() {
OffsetOfLoc, Identifier, Info.OpDecl);
}
-enum IntelOperatorKind {
- IOK_LENGTH,
- IOK_SIZE,
- IOK_TYPE
-};
+// Query a candidate string for being an Intel assembly operator.
+// Report back its kind, or IOK_INVALID if it does not evaluate to a known one.
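+// For example, both "SIZE" and "size" yield IOK_SIZE, while the mixed-case
+// "Size" yields IOK_INVALID.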
+unsigned X86AsmParser::IdentifyIntelOperator(StringRef Name) {
+ return StringSwitch<unsigned>(Name)
+ .Cases("TYPE","type",IOK_TYPE)
+ .Cases("SIZE","size",IOK_SIZE)
+ .Cases("LENGTH","length",IOK_LENGTH)
+ .Cases("OFFSET","offset",IOK_OFFSET)
+ .Default(IOK_INVALID);
+}
/// Parse the 'LENGTH', 'TYPE' and 'SIZE' operators. The LENGTH operator
/// returns the number of elements in an array. It returns the value 1 for
@@ -1727,7 +1774,7 @@ enum IntelOperatorKind {
/// variable. A variable's size is the product of its LENGTH and TYPE. The
/// TYPE operator returns the size of a C or C++ type or variable. If the
/// variable is an array, TYPE returns the size of a single element.
-std::unique_ptr<X86Operand> X86AsmParser::ParseIntelOperator(unsigned OpKind) {
+unsigned X86AsmParser::ParseIntelOperator(unsigned OpKind) {
MCAsmParser &Parser = getParser();
const AsmToken &Tok = Parser.getTok();
SMLoc TypeLoc = Tok.getLoc();
@@ -1739,11 +1786,13 @@ std::unique_ptr<X86Operand> X86AsmParser::ParseIntelOperator(unsigned OpKind) {
StringRef Identifier = Tok.getString();
if (ParseIntelIdentifier(Val, Identifier, Info,
/*Unevaluated=*/true, End))
- return nullptr;
-
- if (!Info.OpDecl)
- return ErrorOperand(Start, "unable to lookup expression");
+ return 0;
+ if (!Info.OpDecl) {
+ Error(Start, "unable to lookup expression");
+ return 0;
+ }
+
unsigned CVal = 0;
switch(OpKind) {
default: llvm_unreachable("Unexpected operand kind!");
@@ -1757,8 +1806,7 @@ std::unique_ptr<X86Operand> X86AsmParser::ParseIntelOperator(unsigned OpKind) {
unsigned Len = End.getPointer() - TypeLoc.getPointer();
InstInfo->AsmRewrites->emplace_back(AOK_Imm, TypeLoc, Len, CVal);
- const MCExpr *Imm = MCConstantExpr::create(CVal, getContext());
- return X86Operand::CreateImm(Imm, Start, End);
+ return CVal;
}
std::unique_ptr<X86Operand> X86AsmParser::ParseIntelOperand() {
@@ -1766,18 +1814,12 @@ std::unique_ptr<X86Operand> X86AsmParser::ParseIntelOperand() {
const AsmToken &Tok = Parser.getTok();
SMLoc Start, End;
- // Offset, length, type and size operators.
- if (isParsingInlineAsm()) {
- StringRef AsmTokStr = Tok.getString();
- if (AsmTokStr == "offset" || AsmTokStr == "OFFSET")
+ // FIXME: The OFFSET operator should be handled as part of an immediate
+ // expression, like the other operators; currently it is only supported as
+ // a stand-alone operand.
+ if (isParsingInlineAsm())
+ if (IdentifyIntelOperator(Tok.getString()) == IOK_OFFSET)
return ParseIntelOffsetOfOperator();
- if (AsmTokStr == "length" || AsmTokStr == "LENGTH")
- return ParseIntelOperator(IOK_LENGTH);
- if (AsmTokStr == "size" || AsmTokStr == "SIZE")
- return ParseIntelOperator(IOK_SIZE);
- if (AsmTokStr == "type" || AsmTokStr == "TYPE")
- return ParseIntelOperator(IOK_TYPE);
- }
bool PtrInOperand = false;
unsigned Size = getIntelMemOperandSize(Tok.getString());
@@ -2360,7 +2402,7 @@ bool X86AsmParser::ParseInstruction(ParseInstructionInfo &Info, StringRef Name,
Name == "lock" || Name == "rep" ||
Name == "repe" || Name == "repz" ||
Name == "repne" || Name == "repnz" ||
- Name == "rex64" || Name == "data16";
+ Name == "rex64" || Name == "data16" || Name == "data32";
bool CurlyAsEndOfStatement = false;
// This does the actual operand parsing. Don't parse any more if we have a
diff --git a/lib/Target/X86/AsmParser/X86Operand.h b/lib/Target/X86/AsmParser/X86Operand.h
index 9db1a8483bee..9f1fa6c65907 100644
--- a/lib/Target/X86/AsmParser/X86Operand.h
+++ b/lib/Target/X86/AsmParser/X86Operand.h
@@ -1,4 +1,4 @@
-//===-- X86Operand.h - Parsed X86 machine instruction --------------------===//
+//===- X86Operand.h - Parsed X86 machine instruction ------------*- C++ -*-===//
//
// The LLVM Compiler Infrastructure
//
@@ -11,12 +11,17 @@
#define LLVM_LIB_TARGET_X86_ASMPARSER_X86OPERAND_H
#include "X86AsmParserCommon.h"
+#include "llvm/ADT/STLExtras.h"
+#include "llvm/ADT/StringRef.h"
#include "llvm/MC/MCExpr.h"
#include "llvm/MC/MCInst.h"
#include "llvm/MC/MCRegisterInfo.h"
#include "llvm/MC/MCParser/MCParsedAsmOperand.h"
-#include "llvm/ADT/STLExtras.h"
-#include "MCTargetDesc/X86MCTargetDesc.h"
+#include "llvm/Support/Casting.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/SMLoc.h"
+#include <cassert>
+#include <memory>
namespace llvm {
@@ -74,11 +79,14 @@ struct X86Operand : public MCParsedAsmOperand {
/// getStartLoc - Get the location of the first token of this operand.
SMLoc getStartLoc() const override { return StartLoc; }
+
/// getEndLoc - Get the location of the last token of this operand.
SMLoc getEndLoc() const override { return EndLoc; }
+
/// getLocRange - Get the range between the first and last token of this
/// operand.
SMRange getLocRange() const { return SMRange(StartLoc, EndLoc); }
+
/// getOffsetOfLoc - Get the location of the offset operator.
SMLoc getOffsetOfLoc() const override { return OffsetOfLoc; }
@@ -271,6 +279,9 @@ struct X86Operand : public MCParsedAsmOperand {
bool isMem256_RC256X() const {
return isMem256() && isMemIndexReg(X86::YMM0, X86::YMM31);
}
+ bool isMem256_RC512() const {
+ return isMem256() && isMemIndexReg(X86::ZMM0, X86::ZMM31);
+ }
bool isMem512_RC256X() const {
return isMem512() && isMemIndexReg(X86::YMM0, X86::YMM31);
}
@@ -419,10 +430,12 @@ struct X86Operand : public MCParsedAsmOperand {
RegNo = getGR32FromGR64(RegNo);
Inst.addOperand(MCOperand::createReg(RegNo));
}
+
void addAVX512RCOperands(MCInst &Inst, unsigned N) const {
assert(N == 1 && "Invalid number of operands!");
addExpr(Inst, getImm());
}
+
void addImmOperands(MCInst &Inst, unsigned N) const {
assert(N == 1 && "Invalid number of operands!");
addExpr(Inst, getImm());
@@ -451,6 +464,7 @@ struct X86Operand : public MCParsedAsmOperand {
Inst.addOperand(MCOperand::createReg(getMemBaseReg()));
Inst.addOperand(MCOperand::createReg(getMemSegReg()));
}
+
void addDstIdxOperands(MCInst &Inst, unsigned N) const {
assert((N == 1) && "Invalid number of operands!");
Inst.addOperand(MCOperand::createReg(getMemBaseReg()));
@@ -541,6 +555,6 @@ struct X86Operand : public MCParsedAsmOperand {
}
};
-} // End of namespace llvm
+} // end namespace llvm
-#endif
+#endif // LLVM_LIB_TARGET_X86_ASMPARSER_X86OPERAND_H
diff --git a/lib/Target/X86/CMakeLists.txt b/lib/Target/X86/CMakeLists.txt
index 9dfd09022bdc..fc4adddc149b 100644
--- a/lib/Target/X86/CMakeLists.txt
+++ b/lib/Target/X86/CMakeLists.txt
@@ -10,11 +10,20 @@ tablegen(LLVM X86GenDAGISel.inc -gen-dag-isel)
tablegen(LLVM X86GenFastISel.inc -gen-fast-isel)
tablegen(LLVM X86GenCallingConv.inc -gen-callingconv)
tablegen(LLVM X86GenSubtargetInfo.inc -gen-subtarget)
+tablegen(LLVM X86GenEVEX2VEXTables.inc -gen-x86-EVEX2VEX-tables)
+if(LLVM_BUILD_GLOBAL_ISEL)
+ tablegen(LLVM X86GenRegisterBank.inc -gen-register-bank)
+ tablegen(LLVM X86GenGlobalISel.inc -gen-global-isel)
+endif()
+
add_public_tablegen_target(X86CommonTableGen)
# Add GlobalISel files if the build option was enabled.
set(GLOBAL_ISEL_FILES
X86CallLowering.cpp
+ X86LegalizerInfo.cpp
+ X86RegisterBankInfo.cpp
+ X86InstructionSelector.cpp
)
if(LLVM_BUILD_GLOBAL_ISEL)
@@ -43,6 +52,7 @@ set(sources
X86EvexToVex.cpp
X86MCInstLower.cpp
X86MachineFunctionInfo.cpp
+ X86MacroFusion.cpp
X86OptimizeLEAs.cpp
X86PadShortFunction.cpp
X86RegisterInfo.cpp
diff --git a/lib/Target/X86/Disassembler/X86Disassembler.cpp b/lib/Target/X86/Disassembler/X86Disassembler.cpp
index 0871888bbfcd..36ad23bb41c0 100644
--- a/lib/Target/X86/Disassembler/X86Disassembler.cpp
+++ b/lib/Target/X86/Disassembler/X86Disassembler.cpp
@@ -368,32 +368,49 @@ static void translateImmediate(MCInst &mcInst, uint64_t immediate,
bool isBranch = false;
uint64_t pcrel = 0;
- if (type == TYPE_RELv) {
+ if (type == TYPE_REL) {
isBranch = true;
pcrel = insn.startLocation +
insn.immediateOffset + insn.immediateSize;
- switch (insn.displacementSize) {
+ switch (operand.encoding) {
default:
break;
- case 1:
+ case ENCODING_Iv:
+ switch (insn.displacementSize) {
+ default:
+ break;
+ case 1:
+ if(immediate & 0x80)
+ immediate |= ~(0xffull);
+ break;
+ case 2:
+ if(immediate & 0x8000)
+ immediate |= ~(0xffffull);
+ break;
+ case 4:
+ if(immediate & 0x80000000)
+ immediate |= ~(0xffffffffull);
+ break;
+ case 8:
+ break;
+ }
+ break;
+ case ENCODING_IB:
if(immediate & 0x80)
immediate |= ~(0xffull);
break;
- case 2:
+ case ENCODING_IW:
if(immediate & 0x8000)
immediate |= ~(0xffffull);
break;
- case 4:
+ case ENCODING_ID:
if(immediate & 0x80000000)
immediate |= ~(0xffffffffull);
break;
- case 8:
- break;
}
}
// By default sign-extend all X86 immediates based on their encoding.
- else if (type == TYPE_IMM8 || type == TYPE_IMM16 || type == TYPE_IMM32 ||
- type == TYPE_IMM64 || type == TYPE_IMMv) {
+ else if (type == TYPE_IMM) {
switch (operand.encoding) {
default:
break;
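The masks in the cases above perform ordinary sign extension: when the top bit of the encoded width is set, every higher bit of the 64-bit immediate is filled with ones. A standalone sketch of the same computation:

#include <cassert>
#include <cstdint>

// Sign-extend the low `bytes` bytes of `imm` to 64 bits, mirroring the
// `immediate |= ~(0xff...ull)` idiom used in translateImmediate().
static uint64_t signExtend(uint64_t imm, unsigned bytes) {
  assert(bytes == 1 || bytes == 2 || bytes == 4 || bytes == 8);
  if (bytes == 8)
    return imm; // already full width
  uint64_t signBit = 1ull << (bytes * 8 - 1);
  uint64_t mask = (1ull << (bytes * 8)) - 1;
  return (imm & signBit) ? (imm | ~mask) : imm;
}
// e.g. signExtend(0x80, 1) == 0xffffffffffffff80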
@@ -620,38 +637,17 @@ static void translateImmediate(MCInst &mcInst, uint64_t immediate,
}
switch (type) {
- case TYPE_XMM32:
- case TYPE_XMM64:
- case TYPE_XMM128:
+ case TYPE_XMM:
mcInst.addOperand(MCOperand::createReg(X86::XMM0 + (immediate >> 4)));
return;
- case TYPE_XMM256:
+ case TYPE_YMM:
mcInst.addOperand(MCOperand::createReg(X86::YMM0 + (immediate >> 4)));
return;
- case TYPE_XMM512:
+ case TYPE_ZMM:
mcInst.addOperand(MCOperand::createReg(X86::ZMM0 + (immediate >> 4)));
return;
case TYPE_BNDR:
mcInst.addOperand(MCOperand::createReg(X86::BND0 + (immediate >> 4)));
- case TYPE_REL8:
- isBranch = true;
- pcrel = insn.startLocation + insn.immediateOffset + insn.immediateSize;
- if (immediate & 0x80)
- immediate |= ~(0xffull);
- break;
- case TYPE_REL16:
- isBranch = true;
- pcrel = insn.startLocation + insn.immediateOffset + insn.immediateSize;
- if (immediate & 0x8000)
- immediate |= ~(0xffffull);
- break;
- case TYPE_REL32:
- case TYPE_REL64:
- isBranch = true;
- pcrel = insn.startLocation + insn.immediateOffset + insn.immediateSize;
- if(immediate & 0x80000000)
- immediate |= ~(0xffffffffull);
- break;
default:
// operand is 64 bits wide. Do nothing.
break;
@@ -662,8 +658,7 @@ static void translateImmediate(MCInst &mcInst, uint64_t immediate,
mcInst, Dis))
mcInst.addOperand(MCOperand::createImm(immediate));
- if (type == TYPE_MOFFS8 || type == TYPE_MOFFS16 ||
- type == TYPE_MOFFS32 || type == TYPE_MOFFS64) {
+ if (type == TYPE_MOFFS) {
MCOperand segmentReg;
segmentReg = MCOperand::createReg(segmentRegnums[insn.segmentOverride]);
mcInst.addOperand(segmentReg);
@@ -767,7 +762,27 @@ static bool translateRMMemory(MCInst &mcInst, InternalInstruction &insn,
Opcode == X86::VPGATHERDQYrm ||
Opcode == X86::VPGATHERQQrm ||
Opcode == X86::VPGATHERDDrm ||
- Opcode == X86::VPGATHERQDrm);
+ Opcode == X86::VPGATHERQDrm ||
+ Opcode == X86::VGATHERDPDZ128rm ||
+ Opcode == X86::VGATHERDPDZ256rm ||
+ Opcode == X86::VGATHERDPSZ128rm ||
+ Opcode == X86::VGATHERQPDZ128rm ||
+ Opcode == X86::VGATHERQPSZ128rm ||
+ Opcode == X86::VPGATHERDDZ128rm ||
+ Opcode == X86::VPGATHERDQZ128rm ||
+ Opcode == X86::VPGATHERDQZ256rm ||
+ Opcode == X86::VPGATHERQDZ128rm ||
+ Opcode == X86::VPGATHERQQZ128rm ||
+ Opcode == X86::VSCATTERDPDZ128mr ||
+ Opcode == X86::VSCATTERDPDZ256mr ||
+ Opcode == X86::VSCATTERDPSZ128mr ||
+ Opcode == X86::VSCATTERQPDZ128mr ||
+ Opcode == X86::VSCATTERQPSZ128mr ||
+ Opcode == X86::VPSCATTERDDZ128mr ||
+ Opcode == X86::VPSCATTERDQZ128mr ||
+ Opcode == X86::VPSCATTERDQZ256mr ||
+ Opcode == X86::VPSCATTERQDZ128mr ||
+ Opcode == X86::VPSCATTERQQZ128mr);
bool IndexIs256 = (Opcode == X86::VGATHERQPDYrm ||
Opcode == X86::VGATHERDPSYrm ||
Opcode == X86::VGATHERQPSYrm ||
@@ -775,13 +790,49 @@ static bool translateRMMemory(MCInst &mcInst, InternalInstruction &insn,
Opcode == X86::VPGATHERDQZrm ||
Opcode == X86::VPGATHERQQYrm ||
Opcode == X86::VPGATHERDDYrm ||
- Opcode == X86::VPGATHERQDYrm);
+ Opcode == X86::VPGATHERQDYrm ||
+ Opcode == X86::VGATHERDPSZ256rm ||
+ Opcode == X86::VGATHERQPDZ256rm ||
+ Opcode == X86::VGATHERQPSZ256rm ||
+ Opcode == X86::VPGATHERDDZ256rm ||
+ Opcode == X86::VPGATHERQQZ256rm ||
+ Opcode == X86::VPGATHERQDZ256rm ||
+ Opcode == X86::VSCATTERDPDZmr ||
+ Opcode == X86::VPSCATTERDQZmr ||
+ Opcode == X86::VSCATTERDPSZ256mr ||
+ Opcode == X86::VSCATTERQPDZ256mr ||
+ Opcode == X86::VSCATTERQPSZ256mr ||
+ Opcode == X86::VPSCATTERDDZ256mr ||
+ Opcode == X86::VPSCATTERQQZ256mr ||
+ Opcode == X86::VPSCATTERQDZ256mr ||
+ Opcode == X86::VGATHERPF0DPDm ||
+ Opcode == X86::VGATHERPF1DPDm ||
+ Opcode == X86::VSCATTERPF0DPDm ||
+ Opcode == X86::VSCATTERPF1DPDm);
bool IndexIs512 = (Opcode == X86::VGATHERQPDZrm ||
Opcode == X86::VGATHERDPSZrm ||
Opcode == X86::VGATHERQPSZrm ||
Opcode == X86::VPGATHERQQZrm ||
Opcode == X86::VPGATHERDDZrm ||
- Opcode == X86::VPGATHERQDZrm);
+ Opcode == X86::VPGATHERQDZrm ||
+ Opcode == X86::VSCATTERQPDZmr ||
+ Opcode == X86::VSCATTERDPSZmr ||
+ Opcode == X86::VSCATTERQPSZmr ||
+ Opcode == X86::VPSCATTERQQZmr ||
+ Opcode == X86::VPSCATTERDDZmr ||
+ Opcode == X86::VPSCATTERQDZmr ||
+ Opcode == X86::VGATHERPF0DPSm ||
+ Opcode == X86::VGATHERPF0QPDm ||
+ Opcode == X86::VGATHERPF0QPSm ||
+ Opcode == X86::VGATHERPF1DPSm ||
+ Opcode == X86::VGATHERPF1QPDm ||
+ Opcode == X86::VGATHERPF1QPSm ||
+ Opcode == X86::VSCATTERPF0DPSm ||
+ Opcode == X86::VSCATTERPF0QPDm ||
+ Opcode == X86::VSCATTERPF0QPSm ||
+ Opcode == X86::VSCATTERPF1DPSm ||
+ Opcode == X86::VSCATTERPF1QPDm ||
+ Opcode == X86::VSCATTERPF1QPSm);
if (IndexIs128 || IndexIs256 || IndexIs512) {
unsigned IndexOffset = insn.sibIndex -
(insn.addressSize == 8 ? SIB_INDEX_RAX:SIB_INDEX_EAX);
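The remap this hunk starts is mechanical: the SIB index was decoded as an ordinal in the GPR bank, and gather/scatter instructions reinterpret that ordinal in the vector bank of the appropriate width. A sketch with hypothetical register-base constants:

// SibIndex was decoded relative to the GPR bank (RAX/EAX base); the same
// ordinal is rebased onto XMM0/YMM0/ZMM0 depending on the index width.
static unsigned remapIndexReg(unsigned SibIndex, unsigned GprBase,
                              unsigned VecBase) {
  unsigned IndexOffset = SibIndex - GprBase; // ordinal within the bank
  return VecBase + IndexOffset;              // e.g. rcx -> xmm1/ymm1/zmm1
}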
@@ -909,38 +960,15 @@ static bool translateRM(MCInst &mcInst, const OperandSpecifier &operand,
case TYPE_R64:
case TYPE_Rv:
case TYPE_MM64:
- case TYPE_XMM32:
- case TYPE_XMM64:
- case TYPE_XMM128:
- case TYPE_XMM256:
- case TYPE_XMM512:
- case TYPE_VK1:
- case TYPE_VK2:
- case TYPE_VK4:
- case TYPE_VK8:
- case TYPE_VK16:
- case TYPE_VK32:
- case TYPE_VK64:
+ case TYPE_XMM:
+ case TYPE_YMM:
+ case TYPE_ZMM:
+ case TYPE_VK:
case TYPE_DEBUGREG:
case TYPE_CONTROLREG:
case TYPE_BNDR:
return translateRMRegister(mcInst, insn);
case TYPE_M:
- case TYPE_M8:
- case TYPE_M16:
- case TYPE_M32:
- case TYPE_M64:
- case TYPE_M128:
- case TYPE_M256:
- case TYPE_M512:
- case TYPE_Mv:
- case TYPE_M32FP:
- case TYPE_M64FP:
- case TYPE_M80FP:
- case TYPE_M1616:
- case TYPE_M1632:
- case TYPE_M1664:
- case TYPE_LEA:
return translateRMMemory(mcInst, insn, Dis);
}
}
@@ -992,6 +1020,7 @@ static bool translateOperand(MCInst &mcInst, const OperandSpecifier &operand,
case ENCODING_WRITEMASK:
return translateMaskRegister(mcInst, insn.writemask);
CASE_ENCODING_RM:
+ CASE_ENCODING_VSIB:
return translateRM(mcInst, operand, insn, Dis);
case ENCODING_IB:
case ENCODING_IW:
diff --git a/lib/Target/X86/Disassembler/X86DisassemblerDecoder.cpp b/lib/Target/X86/Disassembler/X86DisassemblerDecoder.cpp
index ab64d6fcf70b..b7f637e9a8cd 100644
--- a/lib/Target/X86/Disassembler/X86DisassemblerDecoder.cpp
+++ b/lib/Target/X86/Disassembler/X86DisassemblerDecoder.cpp
@@ -650,11 +650,6 @@ static int readPrefixes(struct InternalInstruction* insn) {
insn->addressSize = (hasAdSize ? 4 : 8);
insn->displacementSize = 4;
insn->immediateSize = 4;
- } else if (insn->rexPrefix) {
- insn->registerSize = (hasOpSize ? 2 : 4);
- insn->addressSize = (hasAdSize ? 4 : 8);
- insn->displacementSize = (hasOpSize ? 2 : 4);
- insn->immediateSize = (hasOpSize ? 2 : 4);
} else {
insn->registerSize = (hasOpSize ? 2 : 4);
insn->addressSize = (hasAdSize ? 4 : 8);
@@ -1475,21 +1470,13 @@ static int readModRM(struct InternalInstruction* insn) {
return prefix##_EAX + index; \
case TYPE_R64: \
return prefix##_RAX + index; \
- case TYPE_XMM512: \
+ case TYPE_ZMM: \
return prefix##_ZMM0 + index; \
- case TYPE_XMM256: \
+ case TYPE_YMM: \
return prefix##_YMM0 + index; \
- case TYPE_XMM128: \
- case TYPE_XMM64: \
- case TYPE_XMM32: \
+ case TYPE_XMM: \
return prefix##_XMM0 + index; \
- case TYPE_VK1: \
- case TYPE_VK2: \
- case TYPE_VK4: \
- case TYPE_VK8: \
- case TYPE_VK16: \
- case TYPE_VK32: \
- case TYPE_VK64: \
+ case TYPE_VK: \
if (index > 7) \
*valid = 0; \
return prefix##_K0 + index; \
@@ -1562,6 +1549,7 @@ static int fixupReg(struct InternalInstruction *insn,
return -1;
break;
CASE_ENCODING_RM:
+ CASE_ENCODING_VSIB:
if (insn->eaBase >= insn->eaRegBase) {
insn->eaBase = (EABase)fixupRMValue(insn,
(OperandType)op->type,
@@ -1753,6 +1741,18 @@ static int readOperands(struct InternalInstruction* insn) {
case ENCODING_SI:
case ENCODING_DI:
break;
+ CASE_ENCODING_VSIB:
+    // VSIB can use the V2 bit, so check only the other bits.
+ if (needVVVV)
+ needVVVV = hasVVVV & ((insn->vvvv & 0xf) != 0);
+ if (readModRM(insn))
+ return -1;
+ if (fixupReg(insn, &Op))
+ return -1;
+ // Apply the AVX512 compressed displacement scaling factor.
+ if (Op.encoding != ENCODING_REG && insn->eaDisplacement == EA_DISP_8)
+ insn->displacement *= 1 << (Op.encoding - ENCODING_VSIB);
+ break;
case ENCODING_REG:
CASE_ENCODING_RM:
if (readModRM(insn))
@@ -1774,8 +1774,7 @@ static int readOperands(struct InternalInstruction* insn) {
}
if (readImmediate(insn, 1))
return -1;
- if (Op.type == TYPE_XMM128 ||
- Op.type == TYPE_XMM256)
+ if (Op.type == TYPE_XMM || Op.type == TYPE_YMM)
sawRegImm = 1;
break;
case ENCODING_IW:
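The displacement scaling above leans on the enumerator layout added to X86DisassemblerDecoderCommon.h further down in this patch: each ENCODING_VSIB_CD* value sits one slot past the previous, so the distance from ENCODING_VSIB is the log2 of the compressed-displacement scale. A sketch of the arithmetic under that assumption:

#include <cstdint>

// Mirrors the assumed enumerator order: VSIB, CD2, CD4, CD8, CD16, CD32, CD64.
enum VSIBEncoding { VSIB = 0, VSIB_CD2, VSIB_CD4, VSIB_CD8,
                    VSIB_CD16, VSIB_CD32, VSIB_CD64 };

static int64_t scaleDisp8(int64_t disp8, VSIBEncoding enc) {
  return disp8 * (int64_t(1) << (enc - VSIB)); // e.g. disp8 == 1, VSIB_CD32 -> 32
}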
diff --git a/lib/Target/X86/Disassembler/X86DisassemblerDecoderCommon.h b/lib/Target/X86/Disassembler/X86DisassemblerDecoderCommon.h
index 0a835b876d90..e0f4399b3687 100644
--- a/lib/Target/X86/Disassembler/X86DisassemblerDecoderCommon.h
+++ b/lib/Target/X86/Disassembler/X86DisassemblerDecoderCommon.h
@@ -339,6 +339,15 @@ enum ModRMDecisionType {
case ENCODING_RM_CD32: \
case ENCODING_RM_CD64
+#define CASE_ENCODING_VSIB \
+ case ENCODING_VSIB: \
+ case ENCODING_VSIB_CD2: \
+ case ENCODING_VSIB_CD4: \
+ case ENCODING_VSIB_CD8: \
+ case ENCODING_VSIB_CD16: \
+ case ENCODING_VSIB_CD32: \
+ case ENCODING_VSIB_CD64
+
// Physical encodings of instruction operands.
#define ENCODINGS \
ENUM_ENTRY(ENCODING_NONE, "") \
@@ -350,6 +359,13 @@ enum ModRMDecisionType {
ENUM_ENTRY(ENCODING_RM_CD16,"R/M operand with CDisp scaling of 16") \
ENUM_ENTRY(ENCODING_RM_CD32,"R/M operand with CDisp scaling of 32") \
ENUM_ENTRY(ENCODING_RM_CD64,"R/M operand with CDisp scaling of 64") \
+ ENUM_ENTRY(ENCODING_VSIB, "VSIB operand in ModR/M byte.") \
+ ENUM_ENTRY(ENCODING_VSIB_CD2, "VSIB operand with CDisp scaling of 2") \
+ ENUM_ENTRY(ENCODING_VSIB_CD4, "VSIB operand with CDisp scaling of 4") \
+ ENUM_ENTRY(ENCODING_VSIB_CD8, "VSIB operand with CDisp scaling of 8") \
+ ENUM_ENTRY(ENCODING_VSIB_CD16,"VSIB operand with CDisp scaling of 16") \
+ ENUM_ENTRY(ENCODING_VSIB_CD32,"VSIB operand with CDisp scaling of 32") \
+ ENUM_ENTRY(ENCODING_VSIB_CD64,"VSIB operand with CDisp scaling of 64") \
ENUM_ENTRY(ENCODING_VVVV, "Register operand in VEX.vvvv byte.") \
ENUM_ENTRY(ENCODING_WRITEMASK, "Register operand in EVEX.aaa byte.") \
ENUM_ENTRY(ENCODING_IB, "1-byte immediate") \
@@ -383,85 +399,38 @@ enum OperandEncoding {
// Semantic interpretations of instruction operands.
#define TYPES \
ENUM_ENTRY(TYPE_NONE, "") \
- ENUM_ENTRY(TYPE_REL8, "1-byte immediate address") \
- ENUM_ENTRY(TYPE_REL16, "2-byte") \
- ENUM_ENTRY(TYPE_REL32, "4-byte") \
- ENUM_ENTRY(TYPE_REL64, "8-byte") \
- ENUM_ENTRY(TYPE_PTR1616, "2+2-byte segment+offset address") \
- ENUM_ENTRY(TYPE_PTR1632, "2+4-byte") \
- ENUM_ENTRY(TYPE_PTR1664, "2+8-byte") \
+ ENUM_ENTRY(TYPE_REL, "immediate address") \
ENUM_ENTRY(TYPE_R8, "1-byte register operand") \
ENUM_ENTRY(TYPE_R16, "2-byte") \
ENUM_ENTRY(TYPE_R32, "4-byte") \
ENUM_ENTRY(TYPE_R64, "8-byte") \
- ENUM_ENTRY(TYPE_IMM8, "1-byte immediate operand") \
- ENUM_ENTRY(TYPE_IMM16, "2-byte") \
- ENUM_ENTRY(TYPE_IMM32, "4-byte") \
- ENUM_ENTRY(TYPE_IMM64, "8-byte") \
+ ENUM_ENTRY(TYPE_IMM, "immediate operand") \
ENUM_ENTRY(TYPE_IMM3, "1-byte immediate operand between 0 and 7") \
ENUM_ENTRY(TYPE_IMM5, "1-byte immediate operand between 0 and 31") \
ENUM_ENTRY(TYPE_AVX512ICC, "1-byte immediate operand for AVX512 icmp") \
ENUM_ENTRY(TYPE_UIMM8, "1-byte unsigned immediate operand") \
- ENUM_ENTRY(TYPE_RM8, "1-byte register or memory operand") \
- ENUM_ENTRY(TYPE_RM16, "2-byte") \
- ENUM_ENTRY(TYPE_RM32, "4-byte") \
- ENUM_ENTRY(TYPE_RM64, "8-byte") \
ENUM_ENTRY(TYPE_M, "Memory operand") \
- ENUM_ENTRY(TYPE_M8, "1-byte") \
- ENUM_ENTRY(TYPE_M16, "2-byte") \
- ENUM_ENTRY(TYPE_M32, "4-byte") \
- ENUM_ENTRY(TYPE_M64, "8-byte") \
- ENUM_ENTRY(TYPE_LEA, "Effective address") \
- ENUM_ENTRY(TYPE_M128, "16-byte (SSE/SSE2)") \
- ENUM_ENTRY(TYPE_M256, "256-byte (AVX)") \
- ENUM_ENTRY(TYPE_M1616, "2+2-byte segment+offset address") \
- ENUM_ENTRY(TYPE_M1632, "2+4-byte") \
- ENUM_ENTRY(TYPE_M1664, "2+8-byte") \
- ENUM_ENTRY(TYPE_SRCIDX8, "1-byte memory at source index") \
- ENUM_ENTRY(TYPE_SRCIDX16, "2-byte memory at source index") \
- ENUM_ENTRY(TYPE_SRCIDX32, "4-byte memory at source index") \
- ENUM_ENTRY(TYPE_SRCIDX64, "8-byte memory at source index") \
- ENUM_ENTRY(TYPE_DSTIDX8, "1-byte memory at destination index") \
- ENUM_ENTRY(TYPE_DSTIDX16, "2-byte memory at destination index") \
- ENUM_ENTRY(TYPE_DSTIDX32, "4-byte memory at destination index") \
- ENUM_ENTRY(TYPE_DSTIDX64, "8-byte memory at destination index") \
- ENUM_ENTRY(TYPE_MOFFS8, "1-byte memory offset (relative to segment " \
- "base)") \
- ENUM_ENTRY(TYPE_MOFFS16, "2-byte") \
- ENUM_ENTRY(TYPE_MOFFS32, "4-byte") \
- ENUM_ENTRY(TYPE_MOFFS64, "8-byte") \
- ENUM_ENTRY(TYPE_M32FP, "32-bit IEE754 memory floating-point operand") \
- ENUM_ENTRY(TYPE_M64FP, "64-bit") \
- ENUM_ENTRY(TYPE_M80FP, "80-bit extended") \
+ ENUM_ENTRY(TYPE_SRCIDX, "memory at source index") \
+ ENUM_ENTRY(TYPE_DSTIDX, "memory at destination index") \
+ ENUM_ENTRY(TYPE_MOFFS, "memory offset (relative to segment base)") \
ENUM_ENTRY(TYPE_ST, "Position on the floating-point stack") \
ENUM_ENTRY(TYPE_MM64, "8-byte MMX register") \
- ENUM_ENTRY(TYPE_XMM32, "4-byte XMM register or memory operand") \
- ENUM_ENTRY(TYPE_XMM64, "8-byte") \
- ENUM_ENTRY(TYPE_XMM128, "16-byte") \
- ENUM_ENTRY(TYPE_XMM256, "32-byte") \
- ENUM_ENTRY(TYPE_XMM512, "64-byte") \
- ENUM_ENTRY(TYPE_VK1, "1-bit") \
- ENUM_ENTRY(TYPE_VK2, "2-bit") \
- ENUM_ENTRY(TYPE_VK4, "4-bit") \
- ENUM_ENTRY(TYPE_VK8, "8-bit") \
- ENUM_ENTRY(TYPE_VK16, "16-bit") \
- ENUM_ENTRY(TYPE_VK32, "32-bit") \
- ENUM_ENTRY(TYPE_VK64, "64-bit") \
+ ENUM_ENTRY(TYPE_XMM, "16-byte") \
+ ENUM_ENTRY(TYPE_YMM, "32-byte") \
+ ENUM_ENTRY(TYPE_ZMM, "64-byte") \
+ ENUM_ENTRY(TYPE_VK, "mask register") \
ENUM_ENTRY(TYPE_SEGMENTREG, "Segment register operand") \
ENUM_ENTRY(TYPE_DEBUGREG, "Debug register operand") \
ENUM_ENTRY(TYPE_CONTROLREG, "Control register operand") \
ENUM_ENTRY(TYPE_BNDR, "MPX bounds register") \
\
- ENUM_ENTRY(TYPE_Mv, "Memory operand of operand size") \
ENUM_ENTRY(TYPE_Rv, "Register operand of operand size") \
- ENUM_ENTRY(TYPE_IMMv, "Immediate operand of operand size") \
ENUM_ENTRY(TYPE_RELv, "Immediate address of operand size") \
ENUM_ENTRY(TYPE_DUP0, "Duplicate of operand 0") \
ENUM_ENTRY(TYPE_DUP1, "operand 1") \
ENUM_ENTRY(TYPE_DUP2, "operand 2") \
ENUM_ENTRY(TYPE_DUP3, "operand 3") \
ENUM_ENTRY(TYPE_DUP4, "operand 4") \
- ENUM_ENTRY(TYPE_M512, "512-bit FPU/MMX/XMM/MXCSR state")
#define ENUM_ENTRY(n, d) n,
enum OperandType {
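TYPES and ENCODINGS above are X-macro lists: the table is written once, and each client re-defines ENUM_ENTRY to extract the column it needs. A minimal sketch of the technique with a hypothetical list:

#define COLORS \
  ENUM_ENTRY(RED, "red") \
  ENUM_ENTRY(GREEN, "green")

// Expansion 1: the enum itself (this is what `#define ENUM_ENTRY(n, d) n,`
// above does for OperandType).
#define ENUM_ENTRY(n, d) n,
enum Color { COLORS };
#undef ENUM_ENTRY

// Expansion 2: a parallel table of the descriptions.
#define ENUM_ENTRY(n, d) d,
static const char *ColorNames[] = { COLORS };
#undef ENUM_ENTRY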
diff --git a/lib/Target/X86/InstPrinter/X86ATTInstPrinter.cpp b/lib/Target/X86/InstPrinter/X86ATTInstPrinter.cpp
index 10b7e6ff5ee2..6aa700306744 100644
--- a/lib/Target/X86/InstPrinter/X86ATTInstPrinter.cpp
+++ b/lib/Target/X86/InstPrinter/X86ATTInstPrinter.cpp
@@ -12,19 +12,22 @@
//
//===----------------------------------------------------------------------===//
-#include "X86ATTInstPrinter.h"
#include "MCTargetDesc/X86BaseInfo.h"
-#include "MCTargetDesc/X86MCTargetDesc.h"
+#include "X86ATTInstPrinter.h"
#include "X86InstComments.h"
-#include "llvm/MC/MCAsmInfo.h"
#include "llvm/MC/MCExpr.h"
#include "llvm/MC/MCInst.h"
+#include "llvm/MC/MCInstrDesc.h"
#include "llvm/MC/MCInstrInfo.h"
-#include "llvm/MC/MCRegisterInfo.h"
#include "llvm/MC/MCSubtargetInfo.h"
+#include "llvm/Support/Casting.h"
#include "llvm/Support/ErrorHandling.h"
#include "llvm/Support/Format.h"
-#include "llvm/Support/FormattedStream.h"
+#include "llvm/Support/raw_ostream.h"
+#include <cassert>
+#include <cinttypes>
+#include <cstdint>
+
using namespace llvm;
#define DEBUG_TYPE "asm-printer"
@@ -61,6 +64,17 @@ void X86ATTInstPrinter::printInst(const MCInst *MI, raw_ostream &OS,
OS << "\tcallq\t";
printPCRelImm(MI, 0, OS);
}
+  // data16 and data32 share the same encoding, 0x66. data32 is valid only in
+  // 16-bit mode, while data16 is valid in the other modes.
+  // The Requires clause does not seem to be fully supported, which causes
+  // 0x66 to be interpreted as "data16" by the asm printer.
+  // Thus we adjust the opcode here in order to print the right instruction.
+ else if (MI->getOpcode() == X86::DATA16_PREFIX &&
+ (STI.getFeatureBits()[X86::Mode16Bit])) {
+ MCInst Data32MI(*MI);
+ Data32MI.setOpcode(X86::DATA32_PREFIX);
+ printInstruction(&Data32MI, OS);
+ }
// Try to print any aliases first.
else if (!printAliasInstr(MI, OS))
printInstruction(MI, OS);
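A minimal sketch of the ambiguity the comment describes (not printer code, just the mode rule): the prefix byte is fixed, and only the current mode's default operand size decides which mnemonic is right.

// 0x66 toggles the default operand size: in 16-bit mode it selects 32-bit
// operands ("data32"); in 32/64-bit modes it selects 16-bit ("data16").
static const char *prefixMnemonic(bool Mode16Bit) {
  return Mode16Bit ? "data32" : "data16"; // same encoding either way
}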
@@ -135,6 +149,7 @@ void X86ATTInstPrinter::printRoundingControl(const MCInst *MI, unsigned Op,
case 3: O << "{rz-sae}"; break;
}
}
+
/// printPCRelImm - This is used to print an immediate value that ends up
/// being encoded as a pc-relative value (e.g. for jumps and calls). These
/// print slightly differently than normal immediates. For example, a $ is not
diff --git a/lib/Target/X86/InstPrinter/X86ATTInstPrinter.h b/lib/Target/X86/InstPrinter/X86ATTInstPrinter.h
index bbb309076610..946c1c73f088 100644
--- a/lib/Target/X86/InstPrinter/X86ATTInstPrinter.h
+++ b/lib/Target/X86/InstPrinter/X86ATTInstPrinter.h
@@ -1,4 +1,4 @@
-//==- X86ATTInstPrinter.h - Convert X86 MCInst to assembly syntax -*- C++ -*-=//
+//=- X86ATTInstPrinter.h - Convert X86 MCInst to assembly syntax --*- C++ -*-=//
//
// The LLVM Compiler Infrastructure
//
@@ -137,6 +137,7 @@ public:
private:
bool HasCustomInstComment;
};
-}
-#endif
+} // end namespace llvm
+
+#endif // LLVM_LIB_TARGET_X86_INSTPRINTER_X86ATTINSTPRINTER_H
diff --git a/lib/Target/X86/InstPrinter/X86InstComments.cpp b/lib/Target/X86/InstPrinter/X86InstComments.cpp
index 8594addb5dd4..6e062ec59347 100644
--- a/lib/Target/X86/InstPrinter/X86InstComments.cpp
+++ b/lib/Target/X86/InstPrinter/X86InstComments.cpp
@@ -1189,8 +1189,6 @@ bool llvm::EmitAnyX86InstComments(const MCInst *MI, raw_ostream &OS,
OS << ']';
--i; // For loop increments element #.
}
- //MI->print(OS, 0);
- OS << "\n";
// We successfully added a comment to this instruction.
return true;
diff --git a/lib/Target/X86/InstPrinter/X86IntelInstPrinter.cpp b/lib/Target/X86/InstPrinter/X86IntelInstPrinter.cpp
index 4443edb8e342..a8c631ae282f 100644
--- a/lib/Target/X86/InstPrinter/X86IntelInstPrinter.cpp
+++ b/lib/Target/X86/InstPrinter/X86IntelInstPrinter.cpp
@@ -12,16 +12,18 @@
//
//===----------------------------------------------------------------------===//
-#include "X86IntelInstPrinter.h"
#include "MCTargetDesc/X86BaseInfo.h"
-#include "MCTargetDesc/X86MCTargetDesc.h"
#include "X86InstComments.h"
+#include "X86IntelInstPrinter.h"
#include "llvm/MC/MCExpr.h"
#include "llvm/MC/MCInst.h"
+#include "llvm/MC/MCInstrDesc.h"
#include "llvm/MC/MCInstrInfo.h"
+#include "llvm/Support/Casting.h"
#include "llvm/Support/ErrorHandling.h"
-#include "llvm/Support/FormattedStream.h"
-#include <cctype>
+#include <cassert>
+#include <cstdint>
+
using namespace llvm;
#define DEBUG_TYPE "asm-printer"
diff --git a/lib/Target/X86/InstPrinter/X86IntelInstPrinter.h b/lib/Target/X86/InstPrinter/X86IntelInstPrinter.h
index 20cd7ffb2e63..ace31186a054 100644
--- a/lib/Target/X86/InstPrinter/X86IntelInstPrinter.h
+++ b/lib/Target/X86/InstPrinter/X86IntelInstPrinter.h
@@ -157,6 +157,6 @@ public:
}
};
-}
+} // end namespace llvm
-#endif
+#endif // LLVM_LIB_TARGET_X86_INSTPRINTER_X86INTELINSTPRINTER_H
diff --git a/lib/Target/X86/MCTargetDesc/X86AsmBackend.cpp b/lib/Target/X86/MCTargetDesc/X86AsmBackend.cpp
index e83ec9f4045a..a713af6aadb5 100644
--- a/lib/Target/X86/MCTargetDesc/X86AsmBackend.cpp
+++ b/lib/Target/X86/MCTargetDesc/X86AsmBackend.cpp
@@ -109,7 +109,7 @@ public:
}
void applyFixup(const MCFixup &Fixup, char *Data, unsigned DataSize,
- uint64_t Value, bool IsPCRel) const override {
+ uint64_t Value, bool IsPCRel, MCContext &Ctx) const override {
unsigned Size = 1 << getFixupKindLog2Size(Fixup.getKind());
assert(Fixup.getOffset() + Size <= DataSize &&
diff --git a/lib/Target/X86/MCTargetDesc/X86BaseInfo.h b/lib/Target/X86/MCTargetDesc/X86BaseInfo.h
index aab552547fac..d8953da4abb2 100644
--- a/lib/Target/X86/MCTargetDesc/X86BaseInfo.h
+++ b/lib/Target/X86/MCTargetDesc/X86BaseInfo.h
@@ -212,7 +212,12 @@ namespace X86II {
/// the offset from beginning of section.
///
/// This is the TLS offset for the COFF/Windows TLS mechanism.
- MO_SECREL
+ MO_SECREL,
+
+ /// MO_ABS8 - On a symbol operand this indicates that the symbol is known
+ /// to be an absolute symbol in range [0,128), so we can use the @ABS8
+ /// symbol modifier.
+ MO_ABS8,
};
enum : uint64_t {
diff --git a/lib/Target/X86/MCTargetDesc/X86ELFObjectWriter.cpp b/lib/Target/X86/MCTargetDesc/X86ELFObjectWriter.cpp
index da69da51df10..0b73df3a2ff8 100644
--- a/lib/Target/X86/MCTargetDesc/X86ELFObjectWriter.cpp
+++ b/lib/Target/X86/MCTargetDesc/X86ELFObjectWriter.cpp
@@ -13,24 +13,28 @@
#include "llvm/MC/MCContext.h"
#include "llvm/MC/MCELFObjectWriter.h"
#include "llvm/MC/MCExpr.h"
+#include "llvm/MC/MCFixup.h"
#include "llvm/MC/MCValue.h"
#include "llvm/Support/ELF.h"
#include "llvm/Support/ErrorHandling.h"
+#include <cassert>
+#include <cstdint>
using namespace llvm;
namespace {
- class X86ELFObjectWriter : public MCELFObjectTargetWriter {
- public:
- X86ELFObjectWriter(bool IsELF64, uint8_t OSABI, uint16_t EMachine);
- ~X86ELFObjectWriter() override;
+class X86ELFObjectWriter : public MCELFObjectTargetWriter {
+public:
+ X86ELFObjectWriter(bool IsELF64, uint8_t OSABI, uint16_t EMachine);
+ ~X86ELFObjectWriter() override = default;
- protected:
- unsigned getRelocType(MCContext &Ctx, const MCValue &Target,
- const MCFixup &Fixup, bool IsPCRel) const override;
- };
-}
+protected:
+ unsigned getRelocType(MCContext &Ctx, const MCValue &Target,
+ const MCFixup &Fixup, bool IsPCRel) const override;
+};
+
+} // end anonymous namespace
X86ELFObjectWriter::X86ELFObjectWriter(bool IsELF64, uint8_t OSABI,
uint16_t EMachine)
@@ -40,9 +44,6 @@ X86ELFObjectWriter::X86ELFObjectWriter(bool IsELF64, uint8_t OSABI,
(EMachine != ELF::EM_386) &&
(EMachine != ELF::EM_IAMCU)) {}
-X86ELFObjectWriter::~X86ELFObjectWriter()
-{}
-
enum X86_64RelType { RT64_64, RT64_32, RT64_32S, RT64_16, RT64_8 };
static X86_64RelType getType64(unsigned Kind,
@@ -96,6 +97,7 @@ static unsigned getRelocType64(MCContext &Ctx, SMLoc Loc,
default:
llvm_unreachable("Unimplemented");
case MCSymbolRefExpr::VK_None:
+ case MCSymbolRefExpr::VK_X86_ABS8:
switch (Type) {
case RT64_64:
return IsPCRel ? ELF::R_X86_64_PC64 : ELF::R_X86_64_64;
@@ -219,6 +221,7 @@ static unsigned getRelocType32(MCContext &Ctx,
default:
llvm_unreachable("Unimplemented");
case MCSymbolRefExpr::VK_None:
+ case MCSymbolRefExpr::VK_X86_ABS8:
switch (Type) {
case RT32_32:
return IsPCRel ? ELF::R_386_PC32 : ELF::R_386_32;
diff --git a/lib/Target/X86/MCTargetDesc/X86MCCodeEmitter.cpp b/lib/Target/X86/MCTargetDesc/X86MCCodeEmitter.cpp
index 8045e7c6d872..10e2bbc64d3c 100644
--- a/lib/Target/X86/MCTargetDesc/X86MCCodeEmitter.cpp
+++ b/lib/Target/X86/MCTargetDesc/X86MCCodeEmitter.cpp
@@ -11,35 +11,43 @@
//
//===----------------------------------------------------------------------===//
-#include "MCTargetDesc/X86MCTargetDesc.h"
#include "MCTargetDesc/X86BaseInfo.h"
#include "MCTargetDesc/X86FixupKinds.h"
+#include "MCTargetDesc/X86MCTargetDesc.h"
+#include "llvm/ADT/SmallVector.h"
#include "llvm/MC/MCCodeEmitter.h"
#include "llvm/MC/MCContext.h"
#include "llvm/MC/MCExpr.h"
+#include "llvm/MC/MCFixup.h"
#include "llvm/MC/MCInst.h"
+#include "llvm/MC/MCInstrDesc.h"
#include "llvm/MC/MCInstrInfo.h"
#include "llvm/MC/MCRegisterInfo.h"
#include "llvm/MC/MCSubtargetInfo.h"
#include "llvm/MC/MCSymbol.h"
+#include "llvm/Support/ErrorHandling.h"
#include "llvm/Support/raw_ostream.h"
+#include <cassert>
+#include <cstdint>
+#include <cstdlib>
using namespace llvm;
#define DEBUG_TYPE "mccodeemitter"
namespace {
+
class X86MCCodeEmitter : public MCCodeEmitter {
- X86MCCodeEmitter(const X86MCCodeEmitter &) = delete;
- void operator=(const X86MCCodeEmitter &) = delete;
const MCInstrInfo &MCII;
MCContext &Ctx;
+
public:
X86MCCodeEmitter(const MCInstrInfo &mcii, MCContext &ctx)
: MCII(mcii), Ctx(ctx) {
}
-
- ~X86MCCodeEmitter() override {}
+ X86MCCodeEmitter(const X86MCCodeEmitter &) = delete;
+ X86MCCodeEmitter &operator=(const X86MCCodeEmitter &) = delete;
+ ~X86MCCodeEmitter() override = default;
bool is64BitMode(const MCSubtargetInfo &STI) const {
return STI.getFeatureBits()[X86::Mode64Bit];
@@ -106,8 +114,7 @@ public:
SmallVectorImpl<MCFixup> &Fixups,
int ImmOffset = 0) const;
- inline static uint8_t ModRMByte(unsigned Mod, unsigned RegOpcode,
- unsigned RM) {
+ static uint8_t ModRMByte(unsigned Mod, unsigned RegOpcode, unsigned RM) {
assert(Mod < 4 && RegOpcode < 8 && RM < 8 && "ModRM Fields out of range!");
return RM | (RegOpcode << 3) | (Mod << 6);
}
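ModRMByte() packs the three ModR/M fields into their architectural bit positions: mod in bits 7-6, reg/opcode in bits 5-3, r/m in bits 2-0. A worked instance:

#include <cstdint>

constexpr uint8_t modRM(unsigned Mod, unsigned RegOpcode, unsigned RM) {
  return static_cast<uint8_t>(RM | (RegOpcode << 3) | (Mod << 6));
}
// Mod=3 (register-direct), reg/opcode=2, r/m=1 -> 0b11'010'001 == 0xD1.
static_assert(modRM(3, 2, 1) == 0xD1, "ModR/M packing");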
@@ -149,12 +156,6 @@ public:
} // end anonymous namespace
-MCCodeEmitter *llvm::createX86MCCodeEmitter(const MCInstrInfo &MCII,
- const MCRegisterInfo &MRI,
- MCContext &Ctx) {
- return new X86MCCodeEmitter(MCII, Ctx);
-}
-
/// isDisp8 - Return true if this signed displacement fits in an 8-bit
/// sign-extended field.
static bool isDisp8(int Value) {
@@ -1436,7 +1437,7 @@ encodeInstruction(const MCInst &MI, raw_ostream &OS,
case X86II::MRM0r: case X86II::MRM1r:
case X86II::MRM2r: case X86II::MRM3r:
case X86II::MRM4r: case X86II::MRM5r:
- case X86II::MRM6r: case X86II::MRM7r: {
+ case X86II::MRM6r: case X86II::MRM7r:
if (HasVEX_4V) // Skip the register dst (which is encoded in VEX_VVVV).
++CurOp;
if (HasEVEX_K) // Skip writemask
@@ -1446,13 +1447,12 @@ encodeInstruction(const MCInst &MI, raw_ostream &OS,
(Form == X86II::MRMXr) ? 0 : Form-X86II::MRM0r,
CurByte, OS);
break;
- }
case X86II::MRMXm:
case X86II::MRM0m: case X86II::MRM1m:
case X86II::MRM2m: case X86II::MRM3m:
case X86II::MRM4m: case X86II::MRM5m:
- case X86II::MRM6m: case X86II::MRM7m: {
+ case X86II::MRM6m: case X86II::MRM7m:
if (HasVEX_4V) // Skip the register dst (which is encoded in VEX_VVVV).
++CurOp;
if (HasEVEX_K) // Skip writemask
@@ -1463,7 +1463,7 @@ encodeInstruction(const MCInst &MI, raw_ostream &OS,
Rex, CurByte, OS, Fixups, STI);
CurOp += X86::AddrNumOperands;
break;
- }
+
case X86II::MRM_C0: case X86II::MRM_C1: case X86II::MRM_C2:
case X86II::MRM_C3: case X86II::MRM_C4: case X86II::MRM_C5:
case X86II::MRM_C6: case X86II::MRM_C7: case X86II::MRM_C8:
@@ -1527,3 +1527,9 @@ encodeInstruction(const MCInst &MI, raw_ostream &OS,
}
#endif
}
+
+MCCodeEmitter *llvm::createX86MCCodeEmitter(const MCInstrInfo &MCII,
+ const MCRegisterInfo &MRI,
+ MCContext &Ctx) {
+ return new X86MCCodeEmitter(MCII, Ctx);
+}
diff --git a/lib/Target/X86/MCTargetDesc/X86WinCOFFObjectWriter.cpp b/lib/Target/X86/MCTargetDesc/X86WinCOFFObjectWriter.cpp
index 33376b6d1b90..d6777fc8aa6a 100644
--- a/lib/Target/X86/MCTargetDesc/X86WinCOFFObjectWriter.cpp
+++ b/lib/Target/X86/MCTargetDesc/X86WinCOFFObjectWriter.cpp
@@ -10,6 +10,7 @@
#include "MCTargetDesc/X86FixupKinds.h"
#include "MCTargetDesc/X86MCTargetDesc.h"
#include "llvm/MC/MCExpr.h"
+#include "llvm/MC/MCFixup.h"
#include "llvm/MC/MCValue.h"
#include "llvm/MC/MCWinCOFFObjectWriter.h"
#include "llvm/Support/COFF.h"
@@ -17,28 +18,24 @@
using namespace llvm;
-namespace llvm {
- class MCObjectWriter;
-}
-
namespace {
- class X86WinCOFFObjectWriter : public MCWinCOFFObjectTargetWriter {
- public:
- X86WinCOFFObjectWriter(bool Is64Bit);
- ~X86WinCOFFObjectWriter() override;
- unsigned getRelocType(const MCValue &Target, const MCFixup &Fixup,
- bool IsCrossSection,
- const MCAsmBackend &MAB) const override;
- };
-}
+class X86WinCOFFObjectWriter : public MCWinCOFFObjectTargetWriter {
+public:
+ X86WinCOFFObjectWriter(bool Is64Bit);
+ ~X86WinCOFFObjectWriter() override = default;
+
+ unsigned getRelocType(const MCValue &Target, const MCFixup &Fixup,
+ bool IsCrossSection,
+ const MCAsmBackend &MAB) const override;
+};
+
+} // end anonymous namespace
X86WinCOFFObjectWriter::X86WinCOFFObjectWriter(bool Is64Bit)
: MCWinCOFFObjectTargetWriter(Is64Bit ? COFF::IMAGE_FILE_MACHINE_AMD64
: COFF::IMAGE_FILE_MACHINE_I386) {}
-X86WinCOFFObjectWriter::~X86WinCOFFObjectWriter() {}
-
unsigned X86WinCOFFObjectWriter::getRelocType(const MCValue &Target,
const MCFixup &Fixup,
bool IsCrossSection,
diff --git a/lib/Target/X86/X86.h b/lib/Target/X86/X86.h
index 2cb80a482d06..fdcc7e1ab7b0 100644
--- a/lib/Target/X86/X86.h
+++ b/lib/Target/X86/X86.h
@@ -21,7 +21,10 @@ namespace llvm {
class FunctionPass;
class ImmutablePass;
+class InstructionSelector;
class PassRegistry;
+class X86RegisterBankInfo;
+class X86Subtarget;
class X86TargetMachine;
/// This pass converts a legalized DAG into a X86-specific DAG, ready for
@@ -92,6 +95,9 @@ void initializeFixupBWInstPassPass(PassRegistry &);
/// encoding when possible in order to reduce code size.
FunctionPass *createX86EvexToVexInsts();
+InstructionSelector *createX86InstructionSelector(X86Subtarget &,
+ X86RegisterBankInfo &);
+
void initializeEvexToVexInstPassPass(PassRegistry &);
} // End llvm namespace
diff --git a/lib/Target/X86/X86.td b/lib/Target/X86/X86.td
index 83a23d4ad680..8fcc8e31d5d4 100644
--- a/lib/Target/X86/X86.td
+++ b/lib/Target/X86/X86.td
@@ -187,8 +187,6 @@ def FeatureBMI2 : SubtargetFeature<"bmi2", "HasBMI2", "true",
"Support BMI2 instructions">;
def FeatureRTM : SubtargetFeature<"rtm", "HasRTM", "true",
"Support RTM instructions">;
-def FeatureHLE : SubtargetFeature<"hle", "HasHLE", "true",
- "Support HLE">;
def FeatureADX : SubtargetFeature<"adx", "HasADX", "true",
"Support ADX instructions">;
def FeatureSHA : SubtargetFeature<"sha", "HasSHA", "true",
@@ -202,6 +200,8 @@ def FeatureLAHFSAHF : SubtargetFeature<"sahf", "HasLAHFSAHF", "true",
"Support LAHF and SAHF instructions">;
def FeatureMWAITX : SubtargetFeature<"mwaitx", "HasMWAITX", "true",
"Enable MONITORX/MWAITX timer functionality">;
+def FeatureCLZERO : SubtargetFeature<"clzero", "HasCLZERO", "true",
+ "Enable Cache Line Zero">;
def FeatureMPX : SubtargetFeature<"mpx", "HasMPX", "true",
"Support MPX instructions">;
def FeatureLEAForSP : SubtargetFeature<"lea-sp", "UseLeaForSP", "true",
@@ -215,18 +215,10 @@ def FeatureSlowDivide64 : SubtargetFeature<"idivq-to-divl",
def FeaturePadShortFunctions : SubtargetFeature<"pad-short-functions",
"PadShortFunctions", "true",
"Pad short functions">;
-def FeatureINVPCID : SubtargetFeature<"invpcid", "HasInvPCId", "true",
- "Invalidate Process-Context Identifier">;
-def FeatureVMFUNC : SubtargetFeature<"vmfunc", "HasVMFUNC", "true",
- "VM Functions">;
-def FeatureSMAP : SubtargetFeature<"smap", "HasSMAP", "true",
- "Supervisor Mode Access Protection">;
def FeatureSGX : SubtargetFeature<"sgx", "HasSGX", "true",
"Enable Software Guard Extensions">;
def FeatureCLFLUSHOPT : SubtargetFeature<"clflushopt", "HasCLFLUSHOPT", "true",
"Flush A Cache Line Optimized">;
-def FeaturePCOMMIT : SubtargetFeature<"pcommit", "HasPCOMMIT", "true",
- "Enable Persistent Commit">;
def FeatureCLWB : SubtargetFeature<"clwb", "HasCLWB", "true",
"Cache Line Write Back">;
// TODO: This feature ought to be renamed.
@@ -246,11 +238,12 @@ def FeatureSlowIncDec : SubtargetFeature<"slow-incdec", "SlowIncDec", "true",
def FeatureSoftFloat
: SubtargetFeature<"soft-float", "UseSoftFloat", "true",
"Use software floating point features.">;
-// On at least some AMD processors, there is no performance hazard to writing
-// only the lower parts of a YMM register without clearing the upper part.
-def FeatureFastPartialYMMWrite
- : SubtargetFeature<"fast-partial-ymm-write", "HasFastPartialYMMWrite",
- "true", "Partial writes to YMM registers are fast">;
+// On some X86 processors, there is no performance hazard to writing only the
+// lower parts of a YMM or ZMM register without clearing the upper part.
+def FeatureFastPartialYMMorZMMWrite
+ : SubtargetFeature<"fast-partial-ymm-or-zmm-write",
+ "HasFastPartialYMMorZMMWrite",
+ "true", "Partial writes to YMM/ZMM registers are fast">;
// FeatureFastScalarFSQRT should be enabled if scalar FSQRT has shorter latency
// than the corresponding NR code. FeatureFastVectorFSQRT should be enabled if
// vector FSQRT has higher throughput than the corresponding NR code.
@@ -271,6 +264,15 @@ def FeatureFastLZCNT
"fast-lzcnt", "HasFastLZCNT", "true",
"LZCNT instructions are as fast as most simple integer ops">;
+
+// Sandy Bridge and newer processors can use SHLD with the same source on both
+// inputs to implement a rotate, avoiding the partial flag update of the
+// normal rotate instructions.
+def FeatureFastSHLDRotate
+ : SubtargetFeature<
+ "fast-shld-rotate", "HasFastSHLDRotate", "true",
+ "SHLD can be used as a faster rotate">;
+
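The rotate trick reads as follows: SHLD shifts the destination left and fills the vacated low bits from a second source, so when both operands name the same register the result is a rotate-left. A sketch of the value `shld $n, %reg, %reg` computes for 32-bit operands:

#include <cstdint>

static uint32_t shldSameSource(uint32_t x, unsigned n) {
  n &= 31; // the hardware masks the shift count
  return n == 0 ? x : (x << n) | (x >> (32 - n)); // rotate-left by n
}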
//===----------------------------------------------------------------------===//
// X86 processors supported.
//===----------------------------------------------------------------------===//
@@ -466,7 +468,8 @@ def SNBFeatures : ProcessorFeatures<[], [
FeatureXSAVE,
FeatureXSAVEOPT,
FeatureLAHFSAHF,
- FeatureFastScalarFSQRT
+ FeatureFastScalarFSQRT,
+ FeatureFastSHLDRotate
]>;
class SandyBridgeProc<string Name> : ProcModel<Name, SandyBridgeModel,
@@ -498,10 +501,6 @@ def HSWFeatures : ProcessorFeatures<IVBFeatures.Value, [
FeatureFMA,
FeatureLZCNT,
FeatureMOVBE,
- FeatureINVPCID,
- FeatureVMFUNC,
- FeatureRTM,
- FeatureHLE,
FeatureSlowIncDec
]>;
@@ -512,8 +511,7 @@ def : HaswellProc<"core-avx2">; // Legacy alias.
def BDWFeatures : ProcessorFeatures<HSWFeatures.Value, [
FeatureADX,
- FeatureRDSEED,
- FeatureSMAP
+ FeatureRDSEED
]>;
class BroadwellProc<string Name> : ProcModel<Name, HaswellModel,
BDWFeatures.Value, []>;
@@ -521,6 +519,7 @@ def : BroadwellProc<"broadwell">;
def SKLFeatures : ProcessorFeatures<BDWFeatures.Value, [
FeatureMPX,
+ FeatureRTM,
FeatureXSAVEC,
FeatureXSAVES,
FeatureSGX,
@@ -547,7 +546,8 @@ class KnightsLandingProc<string Name> : ProcModel<Name, HaswellModel,
FeatureLZCNT,
FeatureBMI,
FeatureBMI2,
- FeatureFMA
+ FeatureFMA,
+ FeatureFastPartialYMMorZMMWrite
]>;
def : KnightsLandingProc<"knl">;
@@ -558,7 +558,6 @@ def SKXFeatures : ProcessorFeatures<SKLFeatures.Value, [
FeatureBWI,
FeatureVLX,
FeaturePKU,
- FeaturePCOMMIT,
FeatureCLWB
]>;
@@ -662,7 +661,7 @@ def : ProcessorModel<"btver2", BtVer2Model, [
FeatureXSAVEOPT,
FeatureSlowSHLD,
FeatureLAHFSAHF,
- FeatureFastPartialYMMWrite
+ FeatureFastPartialYMMorZMMWrite
]>;
// Bulldozer
@@ -771,6 +770,7 @@ def: ProcessorModel<"znver1", BtVer2Model, [
FeatureBMI,
FeatureBMI2,
FeatureCLFLUSHOPT,
+ FeatureCLZERO,
FeatureCMPXCHG16B,
FeatureF16C,
FeatureFMA,
@@ -788,7 +788,6 @@ def: ProcessorModel<"znver1", BtVer2Model, [
FeatureRDRAND,
FeatureRDSEED,
FeatureSHA,
- FeatureSMAP,
FeatureSSE4A,
FeatureSlowSHLD,
FeatureX87,
@@ -824,6 +823,7 @@ def : ProcessorModel<"x86-64", SandyBridgeModel,
//===----------------------------------------------------------------------===//
include "X86RegisterInfo.td"
+include "X86RegisterBanks.td"
//===----------------------------------------------------------------------===//
// Instruction Descriptions
diff --git a/lib/Target/X86/X86AsmPrinter.h b/lib/Target/X86/X86AsmPrinter.h
index 6798253d0f6a..44bc373b0394 100644
--- a/lib/Target/X86/X86AsmPrinter.h
+++ b/lib/Target/X86/X86AsmPrinter.h
@@ -81,7 +81,7 @@ class LLVM_LIBRARY_VISIBILITY X86AsmPrinter : public AsmPrinter {
void LowerSTACKMAP(const MachineInstr &MI);
void LowerPATCHPOINT(const MachineInstr &MI, X86MCInstLower &MCIL);
void LowerSTATEPOINT(const MachineInstr &MI, X86MCInstLower &MCIL);
- void LowerFAULTING_LOAD_OP(const MachineInstr &MI, X86MCInstLower &MCIL);
+ void LowerFAULTING_OP(const MachineInstr &MI, X86MCInstLower &MCIL);
void LowerPATCHABLE_OP(const MachineInstr &MI, X86MCInstLower &MCIL);
void LowerTlsAddr(X86MCInstLower &MCInstLowering, const MachineInstr &MI);
@@ -92,6 +92,8 @@ class LLVM_LIBRARY_VISIBILITY X86AsmPrinter : public AsmPrinter {
void LowerPATCHABLE_RET(const MachineInstr &MI, X86MCInstLower &MCIL);
void LowerPATCHABLE_TAIL_CALL(const MachineInstr &MI, X86MCInstLower &MCIL);
+ void LowerFENTRY_CALL(const MachineInstr &MI, X86MCInstLower &MCIL);
+
// Helper function that emits the XRay sleds we've collected for a particular
// function.
void EmitXRayTable();
diff --git a/lib/Target/X86/X86CallFrameOptimization.cpp b/lib/Target/X86/X86CallFrameOptimization.cpp
index 78bd2add8c3b..765af67de160 100644
--- a/lib/Target/X86/X86CallFrameOptimization.cpp
+++ b/lib/Target/X86/X86CallFrameOptimization.cpp
@@ -17,22 +17,35 @@
//
//===----------------------------------------------------------------------===//
-#include <algorithm>
-
-#include "X86.h"
+#include "MCTargetDesc/X86BaseInfo.h"
+#include "X86FrameLowering.h"
#include "X86InstrInfo.h"
#include "X86MachineFunctionInfo.h"
+#include "X86RegisterInfo.h"
#include "X86Subtarget.h"
-#include "llvm/ADT/Statistic.h"
+#include "llvm/ADT/DenseSet.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/ADT/StringRef.h"
+#include "llvm/CodeGen/MachineBasicBlock.h"
+#include "llvm/CodeGen/MachineFrameInfo.h"
+#include "llvm/CodeGen/MachineFunction.h"
#include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/CodeGen/MachineInstr.h"
#include "llvm/CodeGen/MachineInstrBuilder.h"
-#include "llvm/CodeGen/MachineModuleInfo.h"
+#include "llvm/CodeGen/MachineOperand.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
-#include "llvm/CodeGen/Passes.h"
+#include "llvm/IR/DebugLoc.h"
#include "llvm/IR/Function.h"
-#include "llvm/Support/Debug.h"
-#include "llvm/Support/raw_ostream.h"
+#include "llvm/MC/MCDwarf.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/MathExtras.h"
#include "llvm/Target/TargetInstrInfo.h"
+#include "llvm/Target/TargetRegisterInfo.h"
+#include <cassert>
+#include <cstddef>
+#include <cstdint>
+#include <iterator>
using namespace llvm;
@@ -44,6 +57,7 @@ static cl::opt<bool>
cl::init(false), cl::Hidden);
namespace {
+
class X86CallFrameOptimization : public MachineFunctionPass {
public:
X86CallFrameOptimization() : MachineFunctionPass(ID) {}
@@ -53,30 +67,28 @@ public:
private:
// Information we know about a particular call site
struct CallContext {
- CallContext()
- : FrameSetup(nullptr), Call(nullptr), SPCopy(nullptr), ExpectedDist(0),
- MovVector(4, nullptr), NoStackParams(false), UsePush(false) {}
+ CallContext() : FrameSetup(nullptr), MovVector(4, nullptr) {}
// Iterator referring to the frame setup instruction
MachineBasicBlock::iterator FrameSetup;
// Actual call instruction
- MachineInstr *Call;
+ MachineInstr *Call = nullptr;
// A copy of the stack pointer
- MachineInstr *SPCopy;
+ MachineInstr *SPCopy = nullptr;
// The total displacement of all passed parameters
- int64_t ExpectedDist;
+ int64_t ExpectedDist = 0;
// The sequence of movs used to pass the parameters
SmallVector<MachineInstr *, 4> MovVector;
// True if this call site has no stack parameters
- bool NoStackParams;
+ bool NoStackParams = false;
// True if this call site can use push instructions
- bool UsePush;
+ bool UsePush = false;
};
typedef SmallVector<CallContext, 8> ContextVector;
@@ -102,7 +114,7 @@ private:
StringRef getPassName() const override { return "X86 Optimize Call Frame"; }
- const TargetInstrInfo *TII;
+ const X86InstrInfo *TII;
const X86FrameLowering *TFL;
const X86Subtarget *STI;
MachineRegisterInfo *MRI;
@@ -112,11 +124,8 @@ private:
};
char X86CallFrameOptimization::ID = 0;
-} // end anonymous namespace
-FunctionPass *llvm::createX86CallFrameOptimization() {
- return new X86CallFrameOptimization();
-}
+} // end anonymous namespace
// This checks whether the transformation is legal.
// Also returns false in cases where it's potentially legal, but
@@ -322,7 +331,6 @@ void X86CallFrameOptimization::collectCallInfo(MachineFunction &MF,
// transformation.
const X86RegisterInfo &RegInfo =
*static_cast<const X86RegisterInfo *>(STI->getRegisterInfo());
- unsigned FrameDestroyOpcode = TII->getCallFrameDestroyOpcode();
// We expect to enter this at the beginning of a call sequence
assert(I->getOpcode() == TII->getCallFrameSetupOpcode());
@@ -331,8 +339,7 @@ void X86CallFrameOptimization::collectCallInfo(MachineFunction &MF,
// How much do we adjust the stack? This puts an upper bound on
// the number of parameters actually passed on it.
- unsigned int MaxAdjust =
- FrameSetup->getOperand(0).getImm() >> Log2SlotSize;
+ unsigned int MaxAdjust = TII->getFrameSize(*FrameSetup) >> Log2SlotSize;
// A zero adjustment means no stack parameters
if (!MaxAdjust) {
@@ -425,7 +432,7 @@ void X86CallFrameOptimization::collectCallInfo(MachineFunction &MF,
return;
Context.Call = &*I;
- if ((++I)->getOpcode() != FrameDestroyOpcode)
+ if ((++I)->getOpcode() != TII->getCallFrameDestroyOpcode())
return;
// Now, go through the vector, and see that we don't have any gaps,
@@ -455,7 +462,7 @@ void X86CallFrameOptimization::adjustCallSequence(MachineFunction &MF,
// PEI will end up finalizing the handling of this.
MachineBasicBlock::iterator FrameSetup = Context.FrameSetup;
MachineBasicBlock &MBB = *(FrameSetup->getParent());
- FrameSetup->getOperand(1).setImm(Context.ExpectedDist);
+ TII->setFrameAdjustment(*FrameSetup, Context.ExpectedDist);
DebugLoc DL = FrameSetup->getDebugLoc();
bool Is64Bit = STI->is64Bit();
@@ -482,11 +489,10 @@ void X86CallFrameOptimization::adjustCallSequence(MachineFunction &MF,
if (isInt<8>(Val))
PushOpcode = Is64Bit ? X86::PUSH64i8 : X86::PUSH32i8;
}
- Push = BuildMI(MBB, Context.Call, DL, TII->get(PushOpcode))
- .addOperand(PushOp);
+ Push = BuildMI(MBB, Context.Call, DL, TII->get(PushOpcode)).add(PushOp);
break;
case X86::MOV32mr:
- case X86::MOV64mr:
+ case X86::MOV64mr: {
unsigned int Reg = PushOp.getReg();
// If storing a 32-bit vreg on 64-bit targets, extend to a 64-bit vreg
@@ -496,9 +502,9 @@ void X86CallFrameOptimization::adjustCallSequence(MachineFunction &MF,
Reg = MRI->createVirtualRegister(&X86::GR64RegClass);
BuildMI(MBB, Context.Call, DL, TII->get(X86::IMPLICIT_DEF), UndefReg);
BuildMI(MBB, Context.Call, DL, TII->get(X86::INSERT_SUBREG), Reg)
- .addReg(UndefReg)
- .addOperand(PushOp)
- .addImm(X86::sub_32bit);
+ .addReg(UndefReg)
+ .add(PushOp)
+ .addImm(X86::sub_32bit);
}
// If PUSHrmm is not slow on this target, try to fold the source of the
@@ -525,6 +531,7 @@ void X86CallFrameOptimization::adjustCallSequence(MachineFunction &MF,
}
break;
}
+ }
// For debugging, when using SP-based CFA, we need to adjust the CFA
// offset after each push.
@@ -584,3 +591,7 @@ MachineInstr *X86CallFrameOptimization::canFoldIntoRegPush(
return &DefMI;
}
+
+FunctionPass *llvm::createX86CallFrameOptimization() {
+ return new X86CallFrameOptimization();
+}
diff --git a/lib/Target/X86/X86CallLowering.cpp b/lib/Target/X86/X86CallLowering.cpp
index 5ae4962378d3..137ef166aaeb 100644
--- a/lib/Target/X86/X86CallLowering.cpp
+++ b/lib/Target/X86/X86CallLowering.cpp
@@ -14,12 +14,20 @@
//===----------------------------------------------------------------------===//
#include "X86CallLowering.h"
+#include "X86CallingConv.h"
#include "X86ISelLowering.h"
#include "X86InstrInfo.h"
+#include "X86TargetMachine.h"
+
#include "llvm/CodeGen/GlobalISel/MachineIRBuilder.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/CodeGen/MachineValueType.h"
+#include "llvm/Target/TargetSubtargetInfo.h"
using namespace llvm;
+#include "X86GenCallingConv.inc"
+
#ifndef LLVM_BUILD_GLOBAL_ISEL
#error "This shouldn't be built without GISel"
#endif
@@ -27,20 +35,183 @@ using namespace llvm;
X86CallLowering::X86CallLowering(const X86TargetLowering &TLI)
: CallLowering(&TLI) {}
+void X86CallLowering::splitToValueTypes(const ArgInfo &OrigArg,
+ SmallVectorImpl<ArgInfo> &SplitArgs,
+ const DataLayout &DL,
+ MachineRegisterInfo &MRI,
+ SplitArgTy PerformArgSplit) const {
+
+ const X86TargetLowering &TLI = *getTLI<X86TargetLowering>();
+ LLVMContext &Context = OrigArg.Ty->getContext();
+ EVT VT = TLI.getValueType(DL, OrigArg.Ty);
+ unsigned NumParts = TLI.getNumRegisters(Context, VT);
+
+ if (NumParts == 1) {
+    // Replace the original type (pointer -> GPR).
+ SplitArgs.emplace_back(OrigArg.Reg, VT.getTypeForEVT(Context),
+ OrigArg.Flags, OrigArg.IsFixed);
+ return;
+ }
+
+ SmallVector<uint64_t, 4> BitOffsets;
+ SmallVector<unsigned, 8> SplitRegs;
+
+ EVT PartVT = TLI.getRegisterType(Context, VT);
+ Type *PartTy = PartVT.getTypeForEVT(Context);
+
+ for (unsigned i = 0; i < NumParts; ++i) {
+ ArgInfo Info =
+ ArgInfo{MRI.createGenericVirtualRegister(getLLTForType(*PartTy, DL)),
+ PartTy, OrigArg.Flags};
+ SplitArgs.push_back(Info);
+ PerformArgSplit(Info.Reg, PartVT.getSizeInBits() * i);
+ }
+}
+
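A standalone mimic of the loop above (a hypothetical helper, not LLVM API): a value wider than one register is cut into register-sized parts, and the callback is told each part's bit offset, so an i64 on a 32-bit target reports offsets 0 and 32.

#include <cstdint>
#include <functional>

static void splitIntoParts(unsigned ValueBits, unsigned PartBits,
                           const std::function<void(unsigned, uint64_t)> &Split) {
  unsigned NumParts = ValueBits / PartBits;
  for (unsigned i = 0; i < NumParts; ++i)
    Split(/*PartIdx=*/i, /*BitOffset=*/uint64_t(PartBits) * i);
}
// splitIntoParts(64, 32, ...) invokes Split(0, 0) and Split(1, 32).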
+namespace {
+struct FuncReturnHandler : public CallLowering::ValueHandler {
+ FuncReturnHandler(MachineIRBuilder &MIRBuilder, MachineRegisterInfo &MRI,
+ MachineInstrBuilder &MIB, CCAssignFn *AssignFn)
+ : ValueHandler(MIRBuilder, MRI, AssignFn), MIB(MIB) {}
+
+ unsigned getStackAddress(uint64_t Size, int64_t Offset,
+ MachinePointerInfo &MPO) override {
+ llvm_unreachable("Don't know how to get a stack address yet");
+ }
+
+ void assignValueToReg(unsigned ValVReg, unsigned PhysReg,
+ CCValAssign &VA) override {
+ MIB.addUse(PhysReg, RegState::Implicit);
+ unsigned ExtReg = extendRegister(ValVReg, VA);
+ MIRBuilder.buildCopy(PhysReg, ExtReg);
+ }
+
+ void assignValueToAddress(unsigned ValVReg, unsigned Addr, uint64_t Size,
+ MachinePointerInfo &MPO, CCValAssign &VA) override {
+ llvm_unreachable("Don't know how to assign a value to an address yet");
+ }
+
+ MachineInstrBuilder &MIB;
+};
+} // end anonymous namespace
+
bool X86CallLowering::lowerReturn(MachineIRBuilder &MIRBuilder,
const Value *Val, unsigned VReg) const {
- // TODO: handle functions returning non-void values.
- if (Val)
- return false;
- MIRBuilder.buildInstr(X86::RET).addImm(0);
+ assert(((Val && VReg) || (!Val && !VReg)) && "Return value without a vreg");
+
+ auto MIB = MIRBuilder.buildInstrNoInsert(X86::RET).addImm(0);
+
+ if (VReg) {
+ MachineFunction &MF = MIRBuilder.getMF();
+ MachineRegisterInfo &MRI = MF.getRegInfo();
+ auto &DL = MF.getDataLayout();
+ const Function &F = *MF.getFunction();
+
+ ArgInfo OrigArg{VReg, Val->getType()};
+ setArgFlags(OrigArg, AttributeList::ReturnIndex, DL, F);
+
+ SmallVector<ArgInfo, 8> SplitArgs;
+ splitToValueTypes(OrigArg, SplitArgs, DL, MRI,
+ [&](unsigned Reg, uint64_t Offset) {
+ MIRBuilder.buildExtract(Reg, VReg, Offset);
+ });
+ FuncReturnHandler Handler(MIRBuilder, MRI, MIB, RetCC_X86);
+ if (!handleAssignments(MIRBuilder, SplitArgs, Handler))
+ return false;
+ }
+
+ MIRBuilder.insertInstr(MIB);
return true;
}
+namespace {
+struct FormalArgHandler : public CallLowering::ValueHandler {
+ FormalArgHandler(MachineIRBuilder &MIRBuilder, MachineRegisterInfo &MRI,
+ CCAssignFn *AssignFn, const DataLayout &DL)
+ : ValueHandler(MIRBuilder, MRI, AssignFn), DL(DL) {}
+
+ unsigned getStackAddress(uint64_t Size, int64_t Offset,
+ MachinePointerInfo &MPO) override {
+
+ auto &MFI = MIRBuilder.getMF().getFrameInfo();
+ int FI = MFI.CreateFixedObject(Size, Offset, true);
+ MPO = MachinePointerInfo::getFixedStack(MIRBuilder.getMF(), FI);
+
+ unsigned AddrReg = MRI.createGenericVirtualRegister(
+ LLT::pointer(0, DL.getPointerSizeInBits(0)));
+ MIRBuilder.buildFrameIndex(AddrReg, FI);
+ return AddrReg;
+ }
+
+ void assignValueToAddress(unsigned ValVReg, unsigned Addr, uint64_t Size,
+ MachinePointerInfo &MPO, CCValAssign &VA) override {
+
+ auto MMO = MIRBuilder.getMF().getMachineMemOperand(
+ MPO, MachineMemOperand::MOLoad | MachineMemOperand::MOInvariant, Size,
+ 0);
+ MIRBuilder.buildLoad(ValVReg, Addr, *MMO);
+ }
+
+ void assignValueToReg(unsigned ValVReg, unsigned PhysReg,
+ CCValAssign &VA) override {
+ MIRBuilder.getMBB().addLiveIn(PhysReg);
+ MIRBuilder.buildCopy(ValVReg, PhysReg);
+ }
+
+ const DataLayout &DL;
+};
+} // end anonymous namespace
+
bool X86CallLowering::lowerFormalArguments(MachineIRBuilder &MIRBuilder,
const Function &F,
ArrayRef<unsigned> VRegs) const {
- // TODO: handle functions with one or more arguments.
- return F.arg_empty();
+ if (F.arg_empty())
+ return true;
+
+  // TODO: handle variadic functions
+ if (F.isVarArg())
+ return false;
+
+ MachineFunction &MF = MIRBuilder.getMF();
+ MachineRegisterInfo &MRI = MF.getRegInfo();
+ auto DL = MF.getDataLayout();
+
+ SmallVector<ArgInfo, 8> SplitArgs;
+ unsigned Idx = 0;
+ for (auto &Arg : F.args()) {
+ ArgInfo OrigArg(VRegs[Idx], Arg.getType());
+ setArgFlags(OrigArg, Idx + 1, DL, F);
+ LLT Ty = MRI.getType(VRegs[Idx]);
+ unsigned Dst = VRegs[Idx];
+ bool Split = false;
+ splitToValueTypes(OrigArg, SplitArgs, DL, MRI,
+ [&](unsigned Reg, uint64_t Offset) {
+ if (!Split) {
+ Split = true;
+ Dst = MRI.createGenericVirtualRegister(Ty);
+ MIRBuilder.buildUndef(Dst);
+ }
+ unsigned Tmp = MRI.createGenericVirtualRegister(Ty);
+ MIRBuilder.buildInsert(Tmp, Dst, Reg, Offset);
+ Dst = Tmp;
+ });
+ if (Dst != VRegs[Idx])
+ MIRBuilder.buildCopy(VRegs[Idx], Dst);
+ Idx++;
+ }
+
+ MachineBasicBlock &MBB = MIRBuilder.getMBB();
+ if (!MBB.empty())
+ MIRBuilder.setInstr(*MBB.begin());
+
+ FormalArgHandler Handler(MIRBuilder, MRI, CC_X86, DL);
+ if (!handleAssignments(MIRBuilder, SplitArgs, Handler))
+ return false;
+
+ // Move back to the end of the basic block.
+ MIRBuilder.setMBB(MBB);
+
+ return true;
}
diff --git a/lib/Target/X86/X86CallLowering.h b/lib/Target/X86/X86CallLowering.h
index f2672f09d855..204e6974c702 100644
--- a/lib/Target/X86/X86CallLowering.h
+++ b/lib/Target/X86/X86CallLowering.h
@@ -34,6 +34,14 @@ public:
bool lowerFormalArguments(MachineIRBuilder &MIRBuilder, const Function &F,
ArrayRef<unsigned> VRegs) const override;
+private:
+  /// A function of this type is used to perform the value-splitting action.
+ typedef std::function<void(unsigned, uint64_t)> SplitArgTy;
+
+ void splitToValueTypes(const ArgInfo &OrigArgInfo,
+ SmallVectorImpl<ArgInfo> &SplitArgs,
+ const DataLayout &DL, MachineRegisterInfo &MRI,
+ SplitArgTy SplitArg) const;
};
} // End of namespace llvm;
#endif
diff --git a/lib/Target/X86/X86CallingConv.td b/lib/Target/X86/X86CallingConv.td
index cf7bc981b8a5..6781d761a1c4 100644
--- a/lib/Target/X86/X86CallingConv.td
+++ b/lib/Target/X86/X86CallingConv.td
@@ -1074,6 +1074,8 @@ def CSR_32_AllRegs_AVX512 : CalleeSavedRegs<(add CSR_32_AllRegs,
(sequence "K%u", 0, 7))>;
def CSR_64_AllRegs : CalleeSavedRegs<(add CSR_64_MostRegs, RAX)>;
+def CSR_64_AllRegs_NoSSE : CalleeSavedRegs<(add RAX, RBX, RCX, RDX, RSI, RDI, R8, R9,
+ R10, R11, R12, R13, R14, R15, RBP)>;
def CSR_64_AllRegs_AVX : CalleeSavedRegs<(sub (add CSR_64_MostRegs, RAX,
(sequence "YMM%u", 0, 15)),
(sequence "XMM%u", 0, 15))>;
diff --git a/lib/Target/X86/X86EvexToVex.cpp b/lib/Target/X86/X86EvexToVex.cpp
index bdd1ab537bb2..6472bbbc9016 100755
--- a/lib/Target/X86/X86EvexToVex.cpp
+++ b/lib/Target/X86/X86EvexToVex.cpp
@@ -20,16 +20,30 @@
//===---------------------------------------------------------------------===//
#include "InstPrinter/X86InstComments.h"
+#include "MCTargetDesc/X86BaseInfo.h"
#include "X86.h"
-#include "X86InstrBuilder.h"
#include "X86InstrInfo.h"
-#include "X86InstrTablesInfo.h"
-#include "X86MachineFunctionInfo.h"
#include "X86Subtarget.h"
-#include "X86TargetMachine.h"
+#include "llvm/ADT/DenseMap.h"
+#include "llvm/ADT/StringRef.h"
+#include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/CodeGen/MachineInstr.h"
+#include "llvm/CodeGen/MachineOperand.h"
+#include "llvm/MC/MCInstrDesc.h"
+#include "llvm/Pass.h"
+#include <cassert>
+#include <cstdint>
using namespace llvm;
+// Including the generated EVEX2VEX tables.
+struct X86EvexToVexCompressTableEntry {
+ uint16_t EvexOpcode;
+ uint16_t VexOpcode;
+};
+#include "X86GenEVEX2VEXTables.inc"
+
#define EVEX2VEX_DESC "Compressing EVEX instrs to VEX encoding when possible"
#define EVEX2VEX_NAME "x86-evex-to-vex-compress"
@@ -56,8 +70,6 @@ class EvexToVexInstPass : public MachineFunctionPass {
public:
static char ID;
- StringRef getPassName() const override { return EVEX2VEX_DESC; }
-
EvexToVexInstPass() : MachineFunctionPass(ID) {
initializeEvexToVexInstPassPass(*PassRegistry::getPassRegistry());
@@ -72,6 +84,8 @@ public:
}
}
+ StringRef getPassName() const override { return EVEX2VEX_DESC; }
+
/// Loop over all of the basic blocks, replacing EVEX instructions
/// with equivalent VEX instructions when possible, to reduce code size.
bool runOnMachineFunction(MachineFunction &MF) override;
@@ -88,13 +102,8 @@ private:
};
char EvexToVexInstPass::ID = 0;
-}
-INITIALIZE_PASS(EvexToVexInstPass, EVEX2VEX_NAME, EVEX2VEX_DESC, false, false)
-
-FunctionPass *llvm::createX86EvexToVexInsts() {
- return new EvexToVexInstPass();
-}
+} // end anonymous namespace
bool EvexToVexInstPass::runOnMachineFunction(MachineFunction &MF) {
TII = MF.getSubtarget<X86Subtarget>().getInstrInfo();
@@ -125,7 +134,6 @@ void EvexToVexInstPass::AddTableEntry(EvexToVexTableType &EvexToVexTable,
// For EVEX instructions that can be encoded using VEX encoding,
// replace them with the VEX encoding in order to reduce size.
bool EvexToVexInstPass::CompressEvexToVexImpl(MachineInstr &MI) const {
-
// VEX format.
// # of bytes: 0,2,3 1 1 0,1 0,1,2,4 0,1
// [Prefixes] [VEX] OPCODE ModR/M [SIB] [DISP] [IMM]
@@ -211,3 +219,9 @@ bool EvexToVexInstPass::CompressEvexToVexImpl(MachineInstr &MI) const {
MI.setAsmPrinterFlag(AC_EVEX_2_VEX);
return true;
}
+
+INITIALIZE_PASS(EvexToVexInstPass, EVEX2VEX_NAME, EVEX2VEX_DESC, false, false)
+
+FunctionPass *llvm::createX86EvexToVexInsts() {
+ return new EvexToVexInstPass();
+}
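The pass now carries its own copy of the generated compression tables. A rough sketch of how such a table becomes the lookup used by CompressEvexToVexImpl (the table name X86EvexToVex128CompressTable is assumed to come from the generated .inc; the real pass populates its maps in AddTableEntry):

static void buildTable(DenseMap<uint16_t, uint16_t> &Table) {
  // One entry per compressible EVEX opcode.
  for (const X86EvexToVexCompressTableEntry &E : X86EvexToVex128CompressTable)
    Table[E.EvexOpcode] = E.VexOpcode;
}

static bool compress(MachineInstr &MI, const X86InstrInfo *TII,
                     const DenseMap<uint16_t, uint16_t> &Table) {
  auto It = Table.find(MI.getOpcode());
  if (It == Table.end())
    return false;
  MI.setDesc(TII->get(It->second)); // re-encode with the VEX opcode
  return true;
}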
diff --git a/lib/Target/X86/X86ExpandPseudo.cpp b/lib/Target/X86/X86ExpandPseudo.cpp
index 985acf92a2d4..5dfd95f71301 100644
--- a/lib/Target/X86/X86ExpandPseudo.cpp
+++ b/lib/Target/X86/X86ExpandPseudo.cpp
@@ -77,9 +77,11 @@ bool X86ExpandPseudo::ExpandMI(MachineBasicBlock &MBB,
default:
return false;
case X86::TCRETURNdi:
+ case X86::TCRETURNdicc:
case X86::TCRETURNri:
case X86::TCRETURNmi:
case X86::TCRETURNdi64:
+ case X86::TCRETURNdi64cc:
case X86::TCRETURNri64:
case X86::TCRETURNmi64: {
bool isMem = Opcode == X86::TCRETURNmi || Opcode == X86::TCRETURNmi64;
@@ -97,6 +99,10 @@ bool X86ExpandPseudo::ExpandMI(MachineBasicBlock &MBB,
Offset = StackAdj - MaxTCDelta;
assert(Offset >= 0 && "Offset should never be negative");
+ if (Opcode == X86::TCRETURNdicc || Opcode == X86::TCRETURNdi64cc) {
+ assert(Offset == 0 && "Conditional tail call cannot adjust the stack.");
+ }
+
if (Offset) {
// Check for possible merge with preceding ADD instruction.
Offset += X86FL->mergeSPUpdates(MBB, MBBI, true);
@@ -105,12 +111,22 @@ bool X86ExpandPseudo::ExpandMI(MachineBasicBlock &MBB,
// Jump to label or value in register.
bool IsWin64 = STI->isTargetWin64();
- if (Opcode == X86::TCRETURNdi || Opcode == X86::TCRETURNdi64) {
+ if (Opcode == X86::TCRETURNdi || Opcode == X86::TCRETURNdicc ||
+ Opcode == X86::TCRETURNdi64 || Opcode == X86::TCRETURNdi64cc) {
unsigned Op;
switch (Opcode) {
case X86::TCRETURNdi:
Op = X86::TAILJMPd;
break;
+ case X86::TCRETURNdicc:
+ Op = X86::TAILJMPd_CC;
+ break;
+ case X86::TCRETURNdi64cc:
+ assert(!MBB.getParent()->hasWinCFI() &&
+ "Conditional tail calls confuse "
+ "the Win64 unwinder.");
+ Op = X86::TAILJMPd64_CC;
+ break;
default:
// Note: Win64 uses REX prefixes for indirect jumps out of functions, but
// not direct ones.
@@ -126,13 +142,17 @@ bool X86ExpandPseudo::ExpandMI(MachineBasicBlock &MBB,
MIB.addExternalSymbol(JumpTarget.getSymbolName(),
JumpTarget.getTargetFlags());
}
+ if (Op == X86::TAILJMPd_CC || Op == X86::TAILJMPd64_CC) {
+ MIB.addImm(MBBI->getOperand(2).getImm());
+ }
+
} else if (Opcode == X86::TCRETURNmi || Opcode == X86::TCRETURNmi64) {
unsigned Op = (Opcode == X86::TCRETURNmi)
? X86::TAILJMPm
: (IsWin64 ? X86::TAILJMPm64_REX : X86::TAILJMPm64);
MachineInstrBuilder MIB = BuildMI(MBB, MBBI, DL, TII->get(Op));
for (unsigned i = 0; i != 5; ++i)
- MIB.addOperand(MBBI->getOperand(i));
+ MIB.add(MBBI->getOperand(i));
} else if (Opcode == X86::TCRETURNri64) {
BuildMI(MBB, MBBI, DL,
TII->get(IsWin64 ? X86::TAILJMPr64_REX : X86::TAILJMPr64))
@@ -195,7 +215,7 @@ bool X86ExpandPseudo::ExpandMI(MachineBasicBlock &MBB,
MIB = BuildMI(MBB, MBBI, DL, TII->get(X86::RETL));
}
for (unsigned I = 1, E = MBBI->getNumOperands(); I != E; ++I)
- MIB.addOperand(MBBI->getOperand(I));
+ MIB.add(MBBI->getOperand(I));
MBB.erase(MBBI);
return true;
}
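The new TCRETURNdicc/TCRETURNdi64cc pseudos expand exactly like their unconditional counterparts, except that they must not adjust the stack (the not-taken path has to see an unchanged stack pointer, hence the Offset == 0 assert) and they carry the condition code as a trailing immediate. A sketch of just the opcode mapping (not the exact switch above):

static unsigned directTailJumpOpcode(unsigned Pseudo) {
  switch (Pseudo) {
  case X86::TCRETURNdi:     return X86::TAILJMPd;
  case X86::TCRETURNdicc:   return X86::TAILJMPd_CC;
  case X86::TCRETURNdi64cc: return X86::TAILJMPd64_CC;
  default:
    llvm_unreachable("not a direct tail-call pseudo handled here");
  }
}

For the _CC variants, the condition-code immediate (operand 2 of the pseudo) is appended to the new instruction, as the MIB.addImm call above shows.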
diff --git a/lib/Target/X86/X86FastISel.cpp b/lib/Target/X86/X86FastISel.cpp
index c890fdd1e519..036f5d2610e4 100644
--- a/lib/Target/X86/X86FastISel.cpp
+++ b/lib/Target/X86/X86FastISel.cpp
@@ -367,6 +367,10 @@ bool X86FastISel::X86FastEmitLoad(EVT VT, X86AddressMode &AM,
switch (VT.getSimpleVT().SimpleTy) {
default: return false;
case MVT::i1:
+ // TODO: Support this properly.
+ if (Subtarget->hasAVX512())
+ return false;
+ LLVM_FALLTHROUGH;
case MVT::i8:
Opc = X86::MOV8rm;
RC = &X86::GR8RegClass;
@@ -524,6 +528,7 @@ bool X86FastISel::X86FastEmitLoad(EVT VT, X86AddressMode &AM,
bool X86FastISel::X86FastEmitStore(EVT VT, unsigned ValReg, bool ValIsKill,
X86AddressMode &AM,
MachineMemOperand *MMO, bool Aligned) {
+ bool HasSSE1 = Subtarget->hasSSE1();
bool HasSSE2 = Subtarget->hasSSE2();
bool HasSSE4A = Subtarget->hasSSE4A();
bool HasAVX = Subtarget->hasAVX();
@@ -537,6 +542,16 @@ bool X86FastISel::X86FastEmitStore(EVT VT, unsigned ValReg, bool ValIsKill,
case MVT::f80: // No f80 support yet.
default: return false;
case MVT::i1: {
+ // In case ValReg is a K register, COPY to a GPR
+ if (MRI.getRegClass(ValReg) == &X86::VK1RegClass) {
+ unsigned KValReg = ValReg;
+ ValReg = createResultReg(&X86::GR32RegClass);
+ BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
+ TII.get(TargetOpcode::COPY), ValReg)
+ .addReg(KValReg);
+ ValReg = fastEmitInst_extractsubreg(MVT::i8, ValReg, /*Kill=*/true,
+ X86::sub_8bit);
+ }
// Mask out all but lowest bit.
unsigned AndResult = createResultReg(&X86::GR8RegClass);
BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
@@ -574,6 +589,9 @@ bool X86FastISel::X86FastEmitStore(EVT VT, unsigned ValReg, bool ValIsKill,
} else
Opc = X86::ST_Fp64m;
break;
+ case MVT::x86mmx:
+ Opc = (IsNonTemporal && HasSSE1) ? X86::MMX_MOVNTQmr : X86::MMX_MOVQ64mr;
+ break;
case MVT::v4f32:
if (Aligned) {
if (IsNonTemporal)
@@ -1268,6 +1286,16 @@ bool X86FastISel::X86SelectRet(const Instruction *I) {
if (SrcVT == MVT::i1) {
if (Outs[0].Flags.isSExt())
return false;
+ // In case SrcReg is a K register, COPY to a GPR
+ if (MRI.getRegClass(SrcReg) == &X86::VK1RegClass) {
+ unsigned KSrcReg = SrcReg;
+ SrcReg = createResultReg(&X86::GR32RegClass);
+ BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
+ TII.get(TargetOpcode::COPY), SrcReg)
+ .addReg(KSrcReg);
+ SrcReg = fastEmitInst_extractsubreg(MVT::i8, SrcReg, /*Kill=*/true,
+ X86::sub_8bit);
+ }
SrcReg = fastEmitZExtFromI1(MVT::i8, SrcReg, /*TODO: Kill=*/false);
SrcVT = MVT::i8;
}
@@ -1559,6 +1587,17 @@ bool X86FastISel::X86SelectZExt(const Instruction *I) {
// Handle zero-extension from i1 to i8, which is common.
MVT SrcVT = TLI.getSimpleValueType(DL, I->getOperand(0)->getType());
if (SrcVT == MVT::i1) {
+ // In case ResultReg is a K register, COPY to a GPR
+ if (MRI.getRegClass(ResultReg) == &X86::VK1RegClass) {
+ unsigned KResultReg = ResultReg;
+ ResultReg = createResultReg(&X86::GR32RegClass);
+ BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
+ TII.get(TargetOpcode::COPY), ResultReg)
+ .addReg(KResultReg);
+ ResultReg = fastEmitInst_extractsubreg(MVT::i8, ResultReg, /*Kill=*/true,
+ X86::sub_8bit);
+ }
+
// Set the high bits to zero.
ResultReg = fastEmitZExtFromI1(MVT::i8, ResultReg, /*TODO: Kill=*/false);
SrcVT = MVT::i8;
@@ -1740,10 +1779,12 @@ bool X86FastISel::X86SelectBranch(const Instruction *I) {
// In case OpReg is a K register, COPY to a GPR
if (MRI.getRegClass(OpReg) == &X86::VK1RegClass) {
unsigned KOpReg = OpReg;
- OpReg = createResultReg(&X86::GR8RegClass);
+ OpReg = createResultReg(&X86::GR32RegClass);
BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
TII.get(TargetOpcode::COPY), OpReg)
.addReg(KOpReg);
+ OpReg = fastEmitInst_extractsubreg(MVT::i8, OpReg, /*Kill=*/true,
+ X86::sub_8bit);
}
BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(X86::TEST8ri))
.addReg(OpReg)
@@ -2084,10 +2125,12 @@ bool X86FastISel::X86FastEmitCMoveSelect(MVT RetVT, const Instruction *I) {
// In case OpReg is a K register, COPY to a GPR
if (MRI.getRegClass(CondReg) == &X86::VK1RegClass) {
unsigned KCondReg = CondReg;
- CondReg = createResultReg(&X86::GR8RegClass);
+ CondReg = createResultReg(&X86::GR32RegClass);
BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
TII.get(TargetOpcode::COPY), CondReg)
.addReg(KCondReg, getKillRegState(CondIsKill));
+ CondReg = fastEmitInst_extractsubreg(MVT::i8, CondReg, /*Kill=*/true,
+ X86::sub_8bit);
}
BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(X86::TEST8ri))
.addReg(CondReg, getKillRegState(CondIsKill))
@@ -2297,10 +2340,12 @@ bool X86FastISel::X86FastEmitPseudoSelect(MVT RetVT, const Instruction *I) {
// In case OpReg is a K register, COPY to a GPR
if (MRI.getRegClass(CondReg) == &X86::VK1RegClass) {
unsigned KCondReg = CondReg;
- CondReg = createResultReg(&X86::GR8RegClass);
+ CondReg = createResultReg(&X86::GR32RegClass);
BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
TII.get(TargetOpcode::COPY), CondReg)
.addReg(KCondReg, getKillRegState(CondIsKill));
+ CondReg = fastEmitInst_extractsubreg(MVT::i8, CondReg, /*Kill=*/true,
+ X86::sub_8bit);
}
BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(X86::TEST8ri))
.addReg(CondReg, getKillRegState(CondIsKill))
@@ -2423,12 +2468,22 @@ bool X86FastISel::X86SelectFPExtOrFPTrunc(const Instruction *I,
if (OpReg == 0)
return false;
+ unsigned ImplicitDefReg = 0;
+ if (Subtarget->hasAVX()) {
+ ImplicitDefReg = createResultReg(RC);
+ BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
+ TII.get(TargetOpcode::IMPLICIT_DEF), ImplicitDefReg);
+
+ }
+
unsigned ResultReg = createResultReg(RC);
MachineInstrBuilder MIB;
MIB = BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(TargetOpc),
ResultReg);
+
if (Subtarget->hasAVX())
- MIB.addReg(OpReg);
+ MIB.addReg(ImplicitDefReg);
+
MIB.addReg(OpReg);
updateValueMap(I, ResultReg);
return true;
@@ -2461,7 +2516,8 @@ bool X86FastISel::X86SelectTrunc(const Instruction *I) {
EVT DstVT = TLI.getValueType(DL, I->getType());
// This code only handles truncation to byte.
- if (DstVT != MVT::i8 && DstVT != MVT::i1)
+ // TODO: Support truncate to i1 with AVX512.
+ if (DstVT != MVT::i8 && (DstVT != MVT::i1 || Subtarget->hasAVX512()))
return false;
if (!TLI.isTypeLegal(SrcVT))
return false;
@@ -3105,8 +3161,8 @@ static unsigned computeBytesPoppedByCalleeForSRet(const X86Subtarget *Subtarget,
return 0;
if (CS)
- if (CS->arg_empty() || !CS->paramHasAttr(1, Attribute::StructRet) ||
- CS->paramHasAttr(1, Attribute::InReg) || Subtarget->isTargetMCU())
+ if (CS->arg_empty() || !CS->paramHasAttr(0, Attribute::StructRet) ||
+ CS->paramHasAttr(0, Attribute::InReg) || Subtarget->isTargetMCU())
return 0;
return 4;
@@ -3266,6 +3322,16 @@ bool X86FastISel::fastLowerCall(CallLoweringInfo &CLI) {
// Handle zero-extension from i1 to i8, which is common.
if (ArgVT == MVT::i1) {
+ // In case SrcReg is a K register, COPY to a GPR
+ if (MRI.getRegClass(ArgReg) == &X86::VK1RegClass) {
+ unsigned KArgReg = ArgReg;
+ ArgReg = createResultReg(&X86::GR32RegClass);
+ BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
+ TII.get(TargetOpcode::COPY), ArgReg)
+ .addReg(KArgReg);
+ ArgReg = fastEmitInst_extractsubreg(MVT::i8, ArgReg, /*Kill=*/true,
+ X86::sub_8bit);
+ }
// Set the high bits to zero.
ArgReg = fastEmitZExtFromI1(MVT::i8, ArgReg, /*TODO: Kill=*/false);
ArgVT = MVT::i8;
@@ -3463,6 +3529,7 @@ bool X86FastISel::fastLowerCall(CallLoweringInfo &CLI) {
CCValAssign &VA = RVLocs[i];
EVT CopyVT = VA.getValVT();
unsigned CopyReg = ResultReg + i;
+ unsigned SrcReg = VA.getLocReg();
// If this is x86-64, and we disabled SSE, we can't return FP values
if ((CopyVT == MVT::f32 || CopyVT == MVT::f64) &&
@@ -3470,9 +3537,19 @@ bool X86FastISel::fastLowerCall(CallLoweringInfo &CLI) {
report_fatal_error("SSE register return with SSE disabled");
}
+ // If the return value is an i1 and AVX-512 is enabled, we need
+ // to do a fixup to make the copy legal.
+ if (CopyVT == MVT::i1 && SrcReg == X86::AL && Subtarget->hasAVX512()) {
+ // Need to copy to a GR32 first.
+ // TODO: MOVZX isn't great here. We don't care about the upper bits.
+ SrcReg = createResultReg(&X86::GR32RegClass);
+ BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
+ TII.get(X86::MOVZX32rr8), SrcReg).addReg(X86::AL);
+ }
+
// If we prefer to use the value in xmm registers, copy it out as f80 and
// use a truncate to move it from fp stack reg to xmm reg.
- if ((VA.getLocReg() == X86::FP0 || VA.getLocReg() == X86::FP1) &&
+ if ((SrcReg == X86::FP0 || SrcReg == X86::FP1) &&
isScalarFPTypeInSSEReg(VA.getValVT())) {
CopyVT = MVT::f80;
CopyReg = createResultReg(&X86::RFP80RegClass);
@@ -3480,7 +3557,7 @@ bool X86FastISel::fastLowerCall(CallLoweringInfo &CLI) {
// Copy out the result.
BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
- TII.get(TargetOpcode::COPY), CopyReg).addReg(VA.getLocReg());
+ TII.get(TargetOpcode::COPY), CopyReg).addReg(SrcReg);
InRegs.push_back(VA.getLocReg());
// Round the f80 to the right size, which also moves it to the appropriate
@@ -3601,6 +3678,13 @@ unsigned X86FastISel::X86MaterializeInt(const ConstantInt *CI, MVT VT) {
switch (VT.SimpleTy) {
default: llvm_unreachable("Unexpected value type");
case MVT::i1:
+ if (Subtarget->hasAVX512()) {
+ // Need to copy to a VK1 register.
+ unsigned ResultReg = createResultReg(&X86::VK1RegClass);
+ BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
+ TII.get(TargetOpcode::COPY), ResultReg).addReg(SrcReg);
+ return ResultReg;
+ }
case MVT::i8:
return fastEmitInst_extractsubreg(MVT::i8, SrcReg, /*Kill=*/true,
X86::sub_8bit);
@@ -3622,7 +3706,12 @@ unsigned X86FastISel::X86MaterializeInt(const ConstantInt *CI, MVT VT) {
unsigned Opc = 0;
switch (VT.SimpleTy) {
default: llvm_unreachable("Unexpected value type");
- case MVT::i1: VT = MVT::i8; LLVM_FALLTHROUGH;
+ case MVT::i1:
+ // TODO: Support this properly.
+ if (Subtarget->hasAVX512())
+ return 0;
+ VT = MVT::i8;
+ LLVM_FALLTHROUGH;
case MVT::i8: Opc = X86::MOV8ri; break;
case MVT::i16: Opc = X86::MOV16ri; break;
case MVT::i32: Opc = X86::MOV32ri; break;
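The VK1-to-GR8 fixup recurs throughout this file. A hypothetical helper capturing the idiom (FastISel keeps it inline; the name is invented): AVX-512 mask registers have no direct copy to GR8, so the bit is copied to a GR32 first and the low byte extracted.

unsigned X86FastISel::copyMaskBitToGR8(unsigned KReg) {
  // COPY the VK1 value into a 32-bit GPR first...
  unsigned Reg32 = createResultReg(&X86::GR32RegClass);
  BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
          TII.get(TargetOpcode::COPY), Reg32)
      .addReg(KReg);
  // ...then take its low byte as the i8 value.
  return fastEmitInst_extractsubreg(MVT::i8, Reg32, /*Kill=*/true,
                                    X86::sub_8bit);
}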
diff --git a/lib/Target/X86/X86FixupBWInsts.cpp b/lib/Target/X86/X86FixupBWInsts.cpp
index 8bde4bf98d66..c28746f96439 100644
--- a/lib/Target/X86/X86FixupBWInsts.cpp
+++ b/lib/Target/X86/X86FixupBWInsts.cpp
@@ -95,10 +95,9 @@ class FixupBWInstPass : public MachineFunctionPass {
// Change the MachineInstr \p MI into an equivalent 32-bit instruction if
// possible. Return the replacement instruction if OK, return nullptr
- // otherwise. Set WasCandidate to true or false depending on whether the
- // MI was a candidate for this sort of transformation.
- MachineInstr *tryReplaceInstr(MachineInstr *MI, MachineBasicBlock &MBB,
- bool &WasCandidate) const;
+ // otherwise.
+ MachineInstr *tryReplaceInstr(MachineInstr *MI, MachineBasicBlock &MBB) const;
+
public:
static char ID;
@@ -226,7 +225,7 @@ MachineInstr *FixupBWInstPass::tryReplaceLoad(unsigned New32BitOpcode,
unsigned NumArgs = MI->getNumOperands();
for (unsigned i = 1; i < NumArgs; ++i)
- MIB.addOperand(MI->getOperand(i));
+ MIB.add(MI->getOperand(i));
MIB->setMemRefs(MI->memoperands_begin(), MI->memoperands_end());
@@ -264,17 +263,13 @@ MachineInstr *FixupBWInstPass::tryReplaceCopy(MachineInstr *MI) const {
// Drop imp-defs/uses that would be redundant with the new def/use.
for (auto &Op : MI->implicit_operands())
if (Op.getReg() != (Op.isDef() ? NewDestReg : NewSrcReg))
- MIB.addOperand(Op);
+ MIB.add(Op);
return MIB;
}
-MachineInstr *FixupBWInstPass::tryReplaceInstr(
- MachineInstr *MI, MachineBasicBlock &MBB,
- bool &WasCandidate) const {
- MachineInstr *NewMI = nullptr;
- WasCandidate = false;
-
+MachineInstr *FixupBWInstPass::tryReplaceInstr(MachineInstr *MI,
+ MachineBasicBlock &MBB) const {
// See if this is an instruction of the type we are currently looking for.
switch (MI->getOpcode()) {
@@ -282,12 +277,9 @@ MachineInstr *FixupBWInstPass::tryReplaceInstr(
// Only replace 8-bit loads with the zero-extending versions if
// in an innermost loop and not optimizing for size. This takes
// an extra byte to encode, and provides limited performance upside.
- if (MachineLoop *ML = MLI->getLoopFor(&MBB)) {
- if (ML->begin() == ML->end() && !OptForSize) {
- NewMI = tryReplaceLoad(X86::MOVZX32rm8, MI);
- WasCandidate = true;
- }
- }
+ if (MachineLoop *ML = MLI->getLoopFor(&MBB))
+ if (ML->begin() == ML->end() && !OptForSize)
+ return tryReplaceLoad(X86::MOVZX32rm8, MI);
break;
case X86::MOV16rm:
@@ -295,9 +287,7 @@ MachineInstr *FixupBWInstPass::tryReplaceInstr(
// Code size is the same, and there is sometimes a perf advantage
// from eliminating a false dependence on the upper portion of
// the register.
- NewMI = tryReplaceLoad(X86::MOVZX32rm16, MI);
- WasCandidate = true;
- break;
+ return tryReplaceLoad(X86::MOVZX32rm16, MI);
case X86::MOV8rr:
case X86::MOV16rr:
@@ -305,16 +295,14 @@ MachineInstr *FixupBWInstPass::tryReplaceInstr(
// Code size is either less (16) or equal (8), and there is sometimes a
// perf advantage from eliminating a false dependence on the upper portion
// of the register.
- NewMI = tryReplaceCopy(MI);
- WasCandidate = true;
- break;
+ return tryReplaceCopy(MI);
default:
// nothing to do here.
break;
}
- return NewMI;
+ return nullptr;
}
void FixupBWInstPass::processBasicBlock(MachineFunction &MF,
@@ -338,18 +326,11 @@ void FixupBWInstPass::processBasicBlock(MachineFunction &MF,
// We run after PEI, so we need to AddPristinesAndCSRs.
LiveRegs.addLiveOuts(MBB);
- bool WasCandidate = false;
-
for (auto I = MBB.rbegin(); I != MBB.rend(); ++I) {
MachineInstr *MI = &*I;
- MachineInstr *NewMI = tryReplaceInstr(MI, MBB, WasCandidate);
-
- // Add this to replacements if it was a candidate, even if NewMI is
- // nullptr. We will revisit that in a bit.
- if (WasCandidate) {
+ if (MachineInstr *NewMI = tryReplaceInstr(MI, MBB))
MIReplacements.push_back(std::make_pair(MI, NewMI));
- }
// We're done with this instruction, update liveness for the next one.
LiveRegs.stepBackward(*MI);
@@ -359,9 +340,7 @@ void FixupBWInstPass::processBasicBlock(MachineFunction &MF,
MachineInstr *MI = MIReplacements.back().first;
MachineInstr *NewMI = MIReplacements.back().second;
MIReplacements.pop_back();
- if (NewMI) {
- MBB.insert(MI, NewMI);
- MBB.erase(MI);
- }
+ MBB.insert(MI, NewMI);
+ MBB.erase(MI);
}
}
diff --git a/lib/Target/X86/X86FixupLEAs.cpp b/lib/Target/X86/X86FixupLEAs.cpp
index 12095917ca30..2cd4c1a3e7b3 100644
--- a/lib/Target/X86/X86FixupLEAs.cpp
+++ b/lib/Target/X86/X86FixupLEAs.cpp
@@ -120,8 +120,8 @@ FixupLEAPass::postRAConvertToLEA(MachineFunction::iterator &MFI,
BuildMI(*MF, MI.getDebugLoc(),
TII->get(MI.getOpcode() == X86::MOV32rr ? X86::LEA32r
: X86::LEA64r))
- .addOperand(Dest)
- .addOperand(Src)
+ .add(Dest)
+ .add(Src)
.addImm(1)
.addReg(0)
.addImm(0)
@@ -287,8 +287,8 @@ bool FixupLEAPass::fixupIncDec(MachineBasicBlock::iterator &I,
MachineInstr *NewMI =
BuildMI(*MFI, I, MI.getDebugLoc(), TII->get(NewOpcode))
- .addOperand(MI.getOperand(0))
- .addOperand(MI.getOperand(1));
+ .add(MI.getOperand(0))
+ .add(MI.getOperand(1));
MFI->erase(I);
I = static_cast<MachineBasicBlock::iterator>(NewMI);
return true;
@@ -377,9 +377,9 @@ void FixupLEAPass::processInstructionForSLM(MachineBasicBlock::iterator &I,
const MachineOperand &Src1 = MI.getOperand(SrcR1 == DstR ? 1 : 3);
const MachineOperand &Src2 = MI.getOperand(SrcR1 == DstR ? 3 : 1);
NewMI = BuildMI(*MF, MI.getDebugLoc(), TII->get(addrr_opcode))
- .addOperand(Dst)
- .addOperand(Src1)
- .addOperand(Src2);
+ .add(Dst)
+ .add(Src1)
+ .add(Src2);
MFI->insert(I, NewMI);
DEBUG(NewMI->dump(););
}
@@ -387,8 +387,8 @@ void FixupLEAPass::processInstructionForSLM(MachineBasicBlock::iterator &I,
if (MI.getOperand(4).getImm() != 0) {
const MachineOperand &SrcR = MI.getOperand(SrcR1 == DstR ? 1 : 3);
NewMI = BuildMI(*MF, MI.getDebugLoc(), TII->get(addri_opcode))
- .addOperand(Dst)
- .addOperand(SrcR)
+ .add(Dst)
+ .add(SrcR)
.addImm(MI.getOperand(4).getImm());
MFI->insert(I, NewMI);
DEBUG(NewMI->dump(););
diff --git a/lib/Target/X86/X86FrameLowering.cpp b/lib/Target/X86/X86FrameLowering.cpp
index cd690442bb9f..78e0bca4158e 100644
--- a/lib/Target/X86/X86FrameLowering.cpp
+++ b/lib/Target/X86/X86FrameLowering.cpp
@@ -252,40 +252,76 @@ void X86FrameLowering::emitSPUpdate(MachineBasicBlock &MBB,
int64_t NumBytes, bool InEpilogue) const {
bool isSub = NumBytes < 0;
uint64_t Offset = isSub ? -NumBytes : NumBytes;
+ MachineInstr::MIFlag Flag =
+ isSub ? MachineInstr::FrameSetup : MachineInstr::FrameDestroy;
uint64_t Chunk = (1LL << 31) - 1;
DebugLoc DL = MBB.findDebugLoc(MBBI);
- while (Offset) {
- if (Offset > Chunk) {
- // Rather than emit a long series of instructions for large offsets,
- // load the offset into a register and do one sub/add
- unsigned Reg = 0;
+ if (Offset > Chunk) {
+ // Rather than emit a long series of instructions for large offsets,
+ // load the offset into a register and do one sub/add
+ unsigned Reg = 0;
+ unsigned Rax = (unsigned)(Is64Bit ? X86::RAX : X86::EAX);
- if (isSub && !isEAXLiveIn(MBB))
- Reg = (unsigned)(Is64Bit ? X86::RAX : X86::EAX);
+ if (isSub && !isEAXLiveIn(MBB))
+ Reg = Rax;
+ else
+ Reg = findDeadCallerSavedReg(MBB, MBBI, TRI, Is64Bit);
+
+ unsigned MovRIOpc = Is64Bit ? X86::MOV64ri : X86::MOV32ri;
+ unsigned AddSubRROpc =
+ isSub ? getSUBrrOpcode(Is64Bit) : getADDrrOpcode(Is64Bit);
+ if (Reg) {
+ BuildMI(MBB, MBBI, DL, TII.get(MovRIOpc), Reg)
+ .addImm(Offset)
+ .setMIFlag(Flag);
+ MachineInstr *MI = BuildMI(MBB, MBBI, DL, TII.get(AddSubRROpc), StackPtr)
+ .addReg(StackPtr)
+ .addReg(Reg);
+ MI->getOperand(3).setIsDead(); // The EFLAGS implicit def is dead.
+ return;
+ } else if (Offset > 8 * Chunk) {
+ // If we would need more than 8 add or sub instructions (a >16GB stack
+ // frame), it's worth spilling RAX to materialize this immediate.
+ // pushq %rax
+ // movabsq +-$Offset+-SlotSize, %rax
+ // addq %rsp, %rax
+ // xchg %rax, (%rsp)
+ // movq (%rsp), %rsp
+ assert(Is64Bit && "can't have 32-bit 16GB stack frame");
+ BuildMI(MBB, MBBI, DL, TII.get(X86::PUSH64r))
+ .addReg(Rax, RegState::Kill)
+ .setMIFlag(Flag);
+ // Subtract is not commutative, so negate the offset and always use add.
+ // Subtract 8 less and add 8 more to account for the PUSH we just did.
+ if (isSub)
+ Offset = -(Offset - SlotSize);
else
- Reg = findDeadCallerSavedReg(MBB, MBBI, TRI, Is64Bit);
-
- if (Reg) {
- unsigned Opc = Is64Bit ? X86::MOV64ri : X86::MOV32ri;
- BuildMI(MBB, MBBI, DL, TII.get(Opc), Reg)
- .addImm(Offset);
- Opc = isSub
- ? getSUBrrOpcode(Is64Bit)
- : getADDrrOpcode(Is64Bit);
- MachineInstr *MI = BuildMI(MBB, MBBI, DL, TII.get(Opc), StackPtr)
- .addReg(StackPtr)
- .addReg(Reg);
- MI->getOperand(3).setIsDead(); // The EFLAGS implicit def is dead.
- Offset = 0;
- continue;
- }
+ Offset = Offset + SlotSize;
+ BuildMI(MBB, MBBI, DL, TII.get(MovRIOpc), Rax)
+ .addImm(Offset)
+ .setMIFlag(Flag);
+ MachineInstr *MI = BuildMI(MBB, MBBI, DL, TII.get(X86::ADD64rr), Rax)
+ .addReg(Rax)
+ .addReg(StackPtr);
+ MI->getOperand(3).setIsDead(); // The EFLAGS implicit def is dead.
+ // Exchange the new SP in RAX with the top of the stack.
+ addRegOffset(
+ BuildMI(MBB, MBBI, DL, TII.get(X86::XCHG64rm), Rax).addReg(Rax),
+ StackPtr, false, 0);
+ // Load new SP from the top of the stack into RSP.
+ addRegOffset(BuildMI(MBB, MBBI, DL, TII.get(X86::MOV64rm), StackPtr),
+ StackPtr, false, 0);
+ return;
}
+ }
+ while (Offset) {
uint64_t ThisVal = std::min(Offset, Chunk);
- if (ThisVal == (Is64Bit ? 8 : 4)) {
- // Use push / pop instead.
+ if (ThisVal == SlotSize) {
+ // Use push / pop for slot sized adjustments as a size optimization. We
+ // need to find a dead register when using pop.
unsigned Reg = isSub
? (unsigned)(Is64Bit ? X86::RAX : X86::EAX)
: findDeadCallerSavedReg(MBB, MBBI, TRI, Is64Bit);
@@ -293,23 +329,16 @@ void X86FrameLowering::emitSPUpdate(MachineBasicBlock &MBB,
unsigned Opc = isSub
? (Is64Bit ? X86::PUSH64r : X86::PUSH32r)
: (Is64Bit ? X86::POP64r : X86::POP32r);
- MachineInstr *MI = BuildMI(MBB, MBBI, DL, TII.get(Opc))
- .addReg(Reg, getDefRegState(!isSub) | getUndefRegState(isSub));
- if (isSub)
- MI->setFlag(MachineInstr::FrameSetup);
- else
- MI->setFlag(MachineInstr::FrameDestroy);
+ BuildMI(MBB, MBBI, DL, TII.get(Opc))
+ .addReg(Reg, getDefRegState(!isSub) | getUndefRegState(isSub))
+ .setMIFlag(Flag);
Offset -= ThisVal;
continue;
}
}
- MachineInstrBuilder MI = BuildStackAdjustment(
- MBB, MBBI, DL, isSub ? -ThisVal : ThisVal, InEpilogue);
- if (isSub)
- MI.setMIFlag(MachineInstr::FrameSetup);
- else
- MI.setMIFlag(MachineInstr::FrameDestroy);
+ BuildStackAdjustment(MBB, MBBI, DL, isSub ? -ThisVal : ThisVal, InEpilogue)
+ .setMIFlag(Flag);
Offset -= ThisVal;
}
@@ -959,6 +988,16 @@ void X86FrameLowering::emitPrologue(MachineFunction &MF,
.getValueAsString()
.getAsInteger(0, StackProbeSize);
+ // Re-align the stack on 64-bit if the x86-interrupt calling convention is
+ // used and an error code was pushed, since the x86-64 ABI requires a 16-byte
+ // stack alignment.
+ if (Fn->getCallingConv() == CallingConv::X86_INTR && Is64Bit &&
+ Fn->arg_size() == 2) {
+ StackSize += 8;
+ MFI.setStackSize(StackSize);
+ emitSPUpdate(MBB, MBBI, -8, /*InEpilogue=*/false);
+ }
+
// If this is x86-64 and the Red Zone is not disabled, if we are a leaf
// function, and use up to 128 bytes of stack space, don't have a frame
// pointer, calls, or dynamic alloca then we do not need to adjust the
@@ -2587,8 +2626,8 @@ eliminateCallFramePseudoInstr(MachineFunction &MF, MachineBasicBlock &MBB,
unsigned Opcode = I->getOpcode();
bool isDestroy = Opcode == TII.getCallFrameDestroyOpcode();
DebugLoc DL = I->getDebugLoc();
- uint64_t Amount = !reserveCallFrame ? I->getOperand(0).getImm() : 0;
- uint64_t InternalAmt = (isDestroy || Amount) ? I->getOperand(1).getImm() : 0;
+ uint64_t Amount = !reserveCallFrame ? TII.getFrameSize(*I) : 0;
+ uint64_t InternalAmt = (isDestroy || Amount) ? TII.getFrameAdjustment(*I) : 0;
I = MBB.erase(I);
auto InsertPos = skipDebugInstructionsForward(I, MBB.end());
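A worked example of the offset fix-up in the spill-RAX path above, assuming isSub and a 20 GB adjustment on x86-64 (SlotSize == 8). The PUSH itself already moves RSP down by SlotSize, which is why the immediate is computed from Offset - SlotSize rather than Offset:

// pushq %rax                   // RSP -= 8; RAX saved on the stack
// movabsq $-0x4fffffff8, %rax  // -(Offset - SlotSize)
// addq %rsp, %rax              // RAX = old RSP - Offset, the new SP
// xchgq %rax, (%rsp)           // stash new SP, restore original RAX
// movq (%rsp), %rsp            // RSP = new SP
constexpr uint64_t SlotSize = 8;
constexpr uint64_t Offset = 0x500000000; // 20 GB
constexpr int64_t Imm = -(int64_t)(Offset - SlotSize);
static_assert(Imm == -0x4FFFFFFF8LL, "push already moved RSP by SlotSize");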
diff --git a/lib/Target/X86/X86FrameLowering.h b/lib/Target/X86/X86FrameLowering.h
index e1b04d6dc300..863dc8b22968 100644
--- a/lib/Target/X86/X86FrameLowering.h
+++ b/lib/Target/X86/X86FrameLowering.h
@@ -20,6 +20,7 @@ namespace llvm {
class MachineInstrBuilder;
class MCCFIInstruction;
+class X86InstrInfo;
class X86Subtarget;
class X86RegisterInfo;
@@ -30,7 +31,7 @@ public:
// Cached subtarget predicates.
const X86Subtarget &STI;
- const TargetInstrInfo &TII;
+ const X86InstrInfo &TII;
const X86RegisterInfo *TRI;
unsigned SlotSize;
diff --git a/lib/Target/X86/X86GenRegisterBankInfo.def b/lib/Target/X86/X86GenRegisterBankInfo.def
new file mode 100644
index 000000000000..06be142432f7
--- /dev/null
+++ b/lib/Target/X86/X86GenRegisterBankInfo.def
@@ -0,0 +1,104 @@
+//===- X86GenRegisterBankInfo.def ----------------------------*- C++ -*-==//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+/// \file
+/// This file defines all the static objects used by X86RegisterBankInfo.
+/// \todo This should be generated by TableGen.
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_BUILD_GLOBAL_ISEL
+#error "You shouldn't build this"
+#endif
+
+#ifdef GET_TARGET_REGBANK_INFO_IMPL
+RegisterBankInfo::PartialMapping X86GenRegisterBankInfo::PartMappings[]{
+ /* StartIdx, Length, RegBank */
+ // GPR value
+ {0, 8, X86::GPRRegBank}, // :0
+ {0, 16, X86::GPRRegBank}, // :1
+ {0, 32, X86::GPRRegBank}, // :2
+ {0, 64, X86::GPRRegBank}, // :3
+ // FR32/64 , xmm registers
+ {0, 32, X86::VECRRegBank}, // :4
+ {0, 64, X86::VECRRegBank}, // :5
+ // VR128/256/512
+ {0, 128, X86::VECRRegBank}, // :6
+ {0, 256, X86::VECRRegBank}, // :7
+ {0, 512, X86::VECRRegBank}, // :8
+};
+#endif // GET_TARGET_REGBANK_INFO_IMPL
+
+#ifdef GET_TARGET_REGBANK_INFO_CLASS
+enum PartialMappingIdx {
+ PMI_None = -1,
+ PMI_GPR8,
+ PMI_GPR16,
+ PMI_GPR32,
+ PMI_GPR64,
+ PMI_FP32,
+ PMI_FP64,
+ PMI_VEC128,
+ PMI_VEC256,
+ PMI_VEC512
+};
+#endif // GET_TARGET_REGBANK_INFO_CLASS
+
+#ifdef GET_TARGET_REGBANK_INFO_IMPL
+#define INSTR_3OP(INFO) INFO, INFO, INFO,
+#define BREAKDOWN(INDEX, NUM) \
+ { &X86GenRegisterBankInfo::PartMappings[INDEX], NUM }
+// ValueMappings.
+RegisterBankInfo::ValueMapping X86GenRegisterBankInfo::ValMappings[]{
+ /* BreakDown, NumBreakDowns */
+ // 3-operands instructions (all binary operations should end up with one of
+ // those mapping).
+ INSTR_3OP(BREAKDOWN(PMI_GPR8, 1)) // 0: GPR_8
+ INSTR_3OP(BREAKDOWN(PMI_GPR16, 1)) // 3: GPR_16
+ INSTR_3OP(BREAKDOWN(PMI_GPR32, 1)) // 6: GPR_32
+ INSTR_3OP(BREAKDOWN(PMI_GPR64, 1)) // 9: GPR_64
+ INSTR_3OP(BREAKDOWN(PMI_FP32, 1)) // 12: Fp32
+ INSTR_3OP(BREAKDOWN(PMI_FP64, 1)) // 15: Fp64
+ INSTR_3OP(BREAKDOWN(PMI_VEC128, 1)) // 18: Vec128
+ INSTR_3OP(BREAKDOWN(PMI_VEC256, 1)) // 21: Vec256
+ INSTR_3OP(BREAKDOWN(PMI_VEC512, 1)) // 24: Vec512
+};
+#undef INSTR_3OP
+#undef BREAKDOWN
+#endif // GET_TARGET_REGBANK_INFO_IMPL
+
+#ifdef GET_TARGET_REGBANK_INFO_CLASS
+enum ValueMappingIdx {
+ VMI_None = -1,
+ VMI_3OpsGpr8Idx = PMI_GPR8 * 3,
+ VMI_3OpsGpr16Idx = PMI_GPR16 * 3,
+ VMI_3OpsGpr32Idx = PMI_GPR32 * 3,
+ VMI_3OpsGpr64Idx = PMI_GPR64 * 3,
+ VMI_3OpsFp32Idx = PMI_FP32 * 3,
+ VMI_3OpsFp64Idx = PMI_FP64 * 3,
+ VMI_3OpsVec128Idx = PMI_VEC128 * 3,
+ VMI_3OpsVec256Idx = PMI_VEC256 * 3,
+ VMI_3OpsVec512Idx = PMI_VEC512 * 3,
+};
+#undef GET_TARGET_REGBANK_INFO_CLASS
+#endif // GET_TARGET_REGBANK_INFO_CLASS
+
+#ifdef GET_TARGET_REGBANK_INFO_IMPL
+#undef GET_TARGET_REGBANK_INFO_IMPL
+const RegisterBankInfo::ValueMapping *
+X86GenRegisterBankInfo::getValueMapping(PartialMappingIdx Idx,
+ unsigned NumOperands) {
+
+ // We can use the VMI_3Ops mapping for all these cases.
+ if (NumOperands <= 3 && (Idx >= PMI_GPR8 && Idx <= PMI_VEC512))
+ return &ValMappings[(unsigned)Idx * 3];
+
+ llvm_unreachable("Unsupported PartialMappingIdx.");
+}
+
+#endif // GET_TARGET_REGBANK_INFO_IMPL
+
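An illustrative use of the indexing scheme (assuming the enums and arrays above are visible at the point of use): each PartialMappingIdx owns three consecutive ValueMapping entries, one per operand of a 3-operand instruction, so an index is simply scaled by three.

// For a 32-bit GPR binary operation, all three operands share one mapping:
const RegisterBankInfo::ValueMapping *VM =
    X86GenRegisterBankInfo::getValueMapping(PMI_GPR32, /*NumOperands=*/3);
assert(VM == &X86GenRegisterBankInfo::ValMappings[VMI_3OpsGpr32Idx]);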
diff --git a/lib/Target/X86/X86ISelDAGToDAG.cpp b/lib/Target/X86/X86ISelDAGToDAG.cpp
index 8ab4c0616880..eb5c56ff2ff9 100644
--- a/lib/Target/X86/X86ISelDAGToDAG.cpp
+++ b/lib/Target/X86/X86ISelDAGToDAG.cpp
@@ -188,7 +188,6 @@ namespace {
private:
void Select(SDNode *N) override;
- bool tryGather(SDNode *N, unsigned Opc);
bool foldOffsetIntoAddress(uint64_t Offset, X86ISelAddressMode &AM);
bool matchLoadInAddress(LoadSDNode *N, X86ISelAddressMode &AM);
@@ -384,6 +383,16 @@ namespace {
bool ComplexPatternFuncMutatesDAG() const override {
return true;
}
+
+ bool isSExtAbsoluteSymbolRef(unsigned Width, SDNode *N) const;
+
+ /// Returns whether this is a relocatable immediate in the range
+ /// [-2^Width .. 2^Width-1].
+ template <unsigned Width> bool isSExtRelocImm(SDNode *N) const {
+ if (auto *CN = dyn_cast<ConstantSDNode>(N))
+ return isInt<Width>(CN->getSExtValue());
+ return isSExtAbsoluteSymbolRef(Width, N);
+ }
};
}
@@ -709,7 +718,8 @@ bool X86DAGToDAGISel::matchLoadInAddress(LoadSDNode *N, X86ISelAddressMode &AM){
// For more information see http://people.redhat.com/drepper/tls.pdf
if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Address))
if (C->getSExtValue() == 0 && AM.Segment.getNode() == nullptr &&
- Subtarget->isTargetGlibc())
+ (Subtarget->isTargetGlibc() || Subtarget->isTargetAndroid() ||
+ Subtarget->isTargetFuchsia()))
switch (N->getPointerInfo().getAddrSpace()) {
case 256:
AM.Segment = CurDAG->getRegister(X86::GS, MVT::i16);
@@ -1325,8 +1335,8 @@ bool X86DAGToDAGISel::matchAddressRecursively(SDValue N, X86ISelAddressMode &AM,
AM.Scale = 1;
// Insert the new nodes into the topological ordering.
- insertDAGNode(*CurDAG, N, Zero);
- insertDAGNode(*CurDAG, N, Neg);
+ insertDAGNode(*CurDAG, Handle.getValue(), Zero);
+ insertDAGNode(*CurDAG, Handle.getValue(), Neg);
return false;
}
@@ -1789,6 +1799,21 @@ SDNode *X86DAGToDAGISel::getGlobalBaseReg() {
return CurDAG->getRegister(GlobalBaseReg, TLI->getPointerTy(DL)).getNode();
}
+bool X86DAGToDAGISel::isSExtAbsoluteSymbolRef(unsigned Width, SDNode *N) const {
+ if (N->getOpcode() == ISD::TRUNCATE)
+ N = N->getOperand(0).getNode();
+ if (N->getOpcode() != X86ISD::Wrapper)
+ return false;
+
+ auto *GA = dyn_cast<GlobalAddressSDNode>(N->getOperand(0));
+ if (!GA)
+ return false;
+
+ Optional<ConstantRange> CR = GA->getGlobal()->getAbsoluteSymbolRange();
+ return CR && CR->getSignedMin().sge(-1ull << Width) &&
+ CR->getSignedMax().slt(1ull << Width);
+}
+
/// Test whether the given X86ISD::CMP node has any uses which require the SF
/// or OF bits to be accurate.
static bool hasNoSignedComparisonUses(SDNode *N) {
@@ -1905,6 +1930,8 @@ static bool isLoadIncOrDecStore(StoreSDNode *StoreNode, unsigned Opc,
SDValue Op = Chain.getOperand(i);
if (Op == Load.getValue(1)) {
ChainCheck = true;
+ // Drop Load, but keep its chain. No cycle check necessary.
+ ChainOps.push_back(Load.getOperand(0));
continue;
}
@@ -1954,39 +1981,6 @@ static unsigned getFusedLdStOpcode(EVT &LdVT, unsigned Opc) {
llvm_unreachable("unrecognized size for LdVT");
}
-/// Customized ISel for GATHER operations.
-bool X86DAGToDAGISel::tryGather(SDNode *Node, unsigned Opc) {
- // Operands of Gather: VSrc, Base, VIdx, VMask, Scale
- SDValue Chain = Node->getOperand(0);
- SDValue VSrc = Node->getOperand(2);
- SDValue Base = Node->getOperand(3);
- SDValue VIdx = Node->getOperand(4);
- SDValue VMask = Node->getOperand(5);
- ConstantSDNode *Scale = dyn_cast<ConstantSDNode>(Node->getOperand(6));
- if (!Scale)
- return false;
-
- SDVTList VTs = CurDAG->getVTList(VSrc.getValueType(), VSrc.getValueType(),
- MVT::Other);
-
- SDLoc DL(Node);
-
- // Memory Operands: Base, Scale, Index, Disp, Segment
- SDValue Disp = CurDAG->getTargetConstant(0, DL, MVT::i32);
- SDValue Segment = CurDAG->getRegister(0, MVT::i32);
- const SDValue Ops[] = { VSrc, Base, getI8Imm(Scale->getSExtValue(), DL), VIdx,
- Disp, Segment, VMask, Chain};
- SDNode *ResNode = CurDAG->getMachineNode(Opc, DL, VTs, Ops);
- // Node has 2 outputs: VDst and MVT::Other.
- // ResNode has 3 outputs: VDst, VMask_wb, and MVT::Other.
- // We replace VDst of Node with VDst of ResNode, and Other of Node with Other
- // of ResNode.
- ReplaceUses(SDValue(Node, 0), SDValue(ResNode, 0));
- ReplaceUses(SDValue(Node, 1), SDValue(ResNode, 2));
- CurDAG->RemoveDeadNode(Node);
- return true;
-}
-
void X86DAGToDAGISel::Select(SDNode *Node) {
MVT NVT = Node->getSimpleValueType(0);
unsigned Opc, MOpc;
@@ -2024,55 +2018,6 @@ void X86DAGToDAGISel::Select(SDNode *Node) {
}
break;
}
- case ISD::INTRINSIC_W_CHAIN: {
- unsigned IntNo = cast<ConstantSDNode>(Node->getOperand(1))->getZExtValue();
- switch (IntNo) {
- default: break;
- case Intrinsic::x86_avx2_gather_d_pd:
- case Intrinsic::x86_avx2_gather_d_pd_256:
- case Intrinsic::x86_avx2_gather_q_pd:
- case Intrinsic::x86_avx2_gather_q_pd_256:
- case Intrinsic::x86_avx2_gather_d_ps:
- case Intrinsic::x86_avx2_gather_d_ps_256:
- case Intrinsic::x86_avx2_gather_q_ps:
- case Intrinsic::x86_avx2_gather_q_ps_256:
- case Intrinsic::x86_avx2_gather_d_q:
- case Intrinsic::x86_avx2_gather_d_q_256:
- case Intrinsic::x86_avx2_gather_q_q:
- case Intrinsic::x86_avx2_gather_q_q_256:
- case Intrinsic::x86_avx2_gather_d_d:
- case Intrinsic::x86_avx2_gather_d_d_256:
- case Intrinsic::x86_avx2_gather_q_d:
- case Intrinsic::x86_avx2_gather_q_d_256: {
- if (!Subtarget->hasAVX2())
- break;
- unsigned Opc;
- switch (IntNo) {
- default: llvm_unreachable("Impossible intrinsic");
- case Intrinsic::x86_avx2_gather_d_pd: Opc = X86::VGATHERDPDrm; break;
- case Intrinsic::x86_avx2_gather_d_pd_256: Opc = X86::VGATHERDPDYrm; break;
- case Intrinsic::x86_avx2_gather_q_pd: Opc = X86::VGATHERQPDrm; break;
- case Intrinsic::x86_avx2_gather_q_pd_256: Opc = X86::VGATHERQPDYrm; break;
- case Intrinsic::x86_avx2_gather_d_ps: Opc = X86::VGATHERDPSrm; break;
- case Intrinsic::x86_avx2_gather_d_ps_256: Opc = X86::VGATHERDPSYrm; break;
- case Intrinsic::x86_avx2_gather_q_ps: Opc = X86::VGATHERQPSrm; break;
- case Intrinsic::x86_avx2_gather_q_ps_256: Opc = X86::VGATHERQPSYrm; break;
- case Intrinsic::x86_avx2_gather_d_q: Opc = X86::VPGATHERDQrm; break;
- case Intrinsic::x86_avx2_gather_d_q_256: Opc = X86::VPGATHERDQYrm; break;
- case Intrinsic::x86_avx2_gather_q_q: Opc = X86::VPGATHERQQrm; break;
- case Intrinsic::x86_avx2_gather_q_q_256: Opc = X86::VPGATHERQQYrm; break;
- case Intrinsic::x86_avx2_gather_d_d: Opc = X86::VPGATHERDDrm; break;
- case Intrinsic::x86_avx2_gather_d_d_256: Opc = X86::VPGATHERDDYrm; break;
- case Intrinsic::x86_avx2_gather_q_d: Opc = X86::VPGATHERQDrm; break;
- case Intrinsic::x86_avx2_gather_q_d_256: Opc = X86::VPGATHERQDYrm; break;
- }
- if (tryGather(Node, Opc))
- return;
- break;
- }
- }
- break;
- }
case X86ISD::GlobalBaseReg:
ReplaceNode(Node, getGlobalBaseReg());
return;
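isSExtRelocImm lets selection patterns accept not just constants but absolute symbols whose declared range fits the immediate field. The test in isolation (mirroring isSExtAbsoluteSymbolRef): with Width = 31, a symbol carrying an !absolute_symbol range of [0, 0x80000000) passes, since 0 >= -2^31 and 0x7fffffff < 2^31.

static bool fitsSExtImm(const ConstantRange &CR, unsigned Width) {
  // Accept ranges wholly inside [-2^Width, 2^Width - 1].
  return CR.getSignedMin().sge(-1ull << Width) &&
         CR.getSignedMax().slt(1ull << Width);
}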
diff --git a/lib/Target/X86/X86ISelLowering.cpp b/lib/Target/X86/X86ISelLowering.cpp
index 08fe2bad281e..7ff483063ec2 100644
--- a/lib/Target/X86/X86ISelLowering.cpp
+++ b/lib/Target/X86/X86ISelLowering.cpp
@@ -53,6 +53,7 @@
#include "llvm/Support/Debug.h"
#include "llvm/Support/ErrorHandling.h"
#include "llvm/Support/MathExtras.h"
+#include "llvm/Target/TargetLowering.h"
#include "llvm/Target/TargetOptions.h"
#include <algorithm>
#include <bitset>
@@ -70,6 +71,13 @@ static cl::opt<bool> ExperimentalVectorWideningLegalization(
"rather than promotion."),
cl::Hidden);
+static cl::opt<int> ExperimentalPrefLoopAlignment(
+ "x86-experimental-pref-loop-alignment", cl::init(4),
+ cl::desc("Sets the preferable loop alignment for experiments "
+ "(the last x86-experimental-pref-loop-alignment bits"
+ " of the loop header PC will be 0)."),
+ cl::Hidden);
+
X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
const X86Subtarget &STI)
: TargetLowering(TM), Subtarget(STI) {
@@ -427,7 +435,8 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
setOperationAction(ISD::ExternalSymbol , VT, Custom);
setOperationAction(ISD::BlockAddress , VT, Custom);
}
- // 64-bit addm sub, shl, sra, srl (iff 32-bit x86)
+
+ // 64-bit shl, sra, srl (iff 32-bit x86)
for (auto VT : { MVT::i32, MVT::i64 }) {
if (VT == MVT::i64 && !Subtarget.is64Bit())
continue;
@@ -782,6 +791,7 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
setOperationAction(ISD::SCALAR_TO_VECTOR, MVT::v16i8, Custom);
setOperationAction(ISD::SCALAR_TO_VECTOR, MVT::v8i16, Custom);
+ setOperationAction(ISD::SCALAR_TO_VECTOR, MVT::v4i32, Custom);
setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v8i16, Custom);
setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v4i32, Custom);
setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v4f32, Custom);
@@ -888,6 +898,9 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
}
if (!Subtarget.useSoftFloat() && Subtarget.hasSSSE3()) {
+ setOperationAction(ISD::ABS, MVT::v16i8, Legal);
+ setOperationAction(ISD::ABS, MVT::v8i16, Legal);
+ setOperationAction(ISD::ABS, MVT::v4i32, Legal);
setOperationAction(ISD::BITREVERSE, MVT::v16i8, Custom);
setOperationAction(ISD::CTLZ, MVT::v16i8, Custom);
setOperationAction(ISD::CTLZ, MVT::v8i16, Custom);
@@ -922,6 +935,14 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
// SSE41 brings specific instructions for doing vector sign extend even in
// cases where we don't have SRA.
+ setOperationAction(ISD::SIGN_EXTEND_VECTOR_INREG, MVT::v2i64, Legal);
+ setOperationAction(ISD::SIGN_EXTEND_VECTOR_INREG, MVT::v4i32, Legal);
+ setOperationAction(ISD::SIGN_EXTEND_VECTOR_INREG, MVT::v8i16, Legal);
+
+ setOperationAction(ISD::ZERO_EXTEND_VECTOR_INREG, MVT::v2i64, Legal);
+ setOperationAction(ISD::ZERO_EXTEND_VECTOR_INREG, MVT::v4i32, Legal);
+ setOperationAction(ISD::ZERO_EXTEND_VECTOR_INREG, MVT::v8i16, Legal);
+
for (MVT VT : MVT::integer_vector_valuetypes()) {
setLoadExtAction(ISD::SEXTLOAD, VT, MVT::v2i8, Custom);
setLoadExtAction(ISD::SEXTLOAD, VT, MVT::v2i16, Custom);
@@ -1065,6 +1086,7 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
setOperationAction(ISD::MULHS, MVT::v32i8, Custom);
for (auto VT : { MVT::v32i8, MVT::v16i16, MVT::v8i32 }) {
+ setOperationAction(ISD::ABS, VT, HasInt256 ? Legal : Custom);
setOperationAction(ISD::SMAX, VT, HasInt256 ? Legal : Custom);
setOperationAction(ISD::UMAX, VT, HasInt256 ? Legal : Custom);
setOperationAction(ISD::SMIN, VT, HasInt256 ? Legal : Custom);
@@ -1126,7 +1148,7 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
setOperationAction(ISD::INSERT_VECTOR_ELT, VT, Custom);
setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT, Custom);
setOperationAction(ISD::SCALAR_TO_VECTOR, VT, Custom);
- setOperationAction(ISD::INSERT_SUBVECTOR, VT, Custom);
+ setOperationAction(ISD::INSERT_SUBVECTOR, VT, Legal);
setOperationAction(ISD::CONCAT_VECTORS, VT, Custom);
}
@@ -1271,6 +1293,8 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
}
}
if (Subtarget.hasVLX()) {
+ setOperationAction(ISD::ABS, MVT::v4i64, Legal);
+ setOperationAction(ISD::ABS, MVT::v2i64, Legal);
setOperationAction(ISD::SINT_TO_FP, MVT::v8i32, Legal);
setOperationAction(ISD::UINT_TO_FP, MVT::v8i32, Legal);
setOperationAction(ISD::FP_TO_SINT, MVT::v8i32, Legal);
@@ -1357,16 +1381,17 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
setOperationAction(ISD::UMIN, MVT::v16i32, Legal);
setOperationAction(ISD::UMIN, MVT::v8i64, Legal);
- setOperationAction(ISD::ADD, MVT::v8i1, Expand);
- setOperationAction(ISD::ADD, MVT::v16i1, Expand);
- setOperationAction(ISD::SUB, MVT::v8i1, Expand);
- setOperationAction(ISD::SUB, MVT::v16i1, Expand);
- setOperationAction(ISD::MUL, MVT::v8i1, Expand);
- setOperationAction(ISD::MUL, MVT::v16i1, Expand);
+ setOperationAction(ISD::ADD, MVT::v8i1, Custom);
+ setOperationAction(ISD::ADD, MVT::v16i1, Custom);
+ setOperationAction(ISD::SUB, MVT::v8i1, Custom);
+ setOperationAction(ISD::SUB, MVT::v16i1, Custom);
+ setOperationAction(ISD::MUL, MVT::v8i1, Custom);
+ setOperationAction(ISD::MUL, MVT::v16i1, Custom);
setOperationAction(ISD::MUL, MVT::v16i32, Legal);
for (auto VT : { MVT::v16i32, MVT::v8i64 }) {
+ setOperationAction(ISD::ABS, VT, Legal);
setOperationAction(ISD::SRL, VT, Custom);
setOperationAction(ISD::SHL, VT, Custom);
setOperationAction(ISD::SRA, VT, Custom);
@@ -1441,7 +1466,7 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
setOperationAction(ISD::VSELECT, VT, Legal);
setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT, Custom);
setOperationAction(ISD::SCALAR_TO_VECTOR, VT, Custom);
- setOperationAction(ISD::INSERT_SUBVECTOR, VT, Custom);
+ setOperationAction(ISD::INSERT_SUBVECTOR, VT, Legal);
setOperationAction(ISD::MLOAD, VT, Legal);
setOperationAction(ISD::MSTORE, VT, Legal);
setOperationAction(ISD::MGATHER, VT, Legal);
@@ -1460,12 +1485,12 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
addRegisterClass(MVT::v32i1, &X86::VK32RegClass);
addRegisterClass(MVT::v64i1, &X86::VK64RegClass);
- setOperationAction(ISD::ADD, MVT::v32i1, Expand);
- setOperationAction(ISD::ADD, MVT::v64i1, Expand);
- setOperationAction(ISD::SUB, MVT::v32i1, Expand);
- setOperationAction(ISD::SUB, MVT::v64i1, Expand);
- setOperationAction(ISD::MUL, MVT::v32i1, Expand);
- setOperationAction(ISD::MUL, MVT::v64i1, Expand);
+ setOperationAction(ISD::ADD, MVT::v32i1, Custom);
+ setOperationAction(ISD::ADD, MVT::v64i1, Custom);
+ setOperationAction(ISD::SUB, MVT::v32i1, Custom);
+ setOperationAction(ISD::SUB, MVT::v64i1, Custom);
+ setOperationAction(ISD::MUL, MVT::v32i1, Custom);
+ setOperationAction(ISD::MUL, MVT::v64i1, Custom);
setOperationAction(ISD::SETCC, MVT::v32i1, Custom);
setOperationAction(ISD::SETCC, MVT::v64i1, Custom);
@@ -1479,8 +1504,8 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
setOperationAction(ISD::CONCAT_VECTORS, MVT::v64i8, Custom);
setOperationAction(ISD::INSERT_SUBVECTOR, MVT::v32i1, Custom);
setOperationAction(ISD::INSERT_SUBVECTOR, MVT::v64i1, Custom);
- setOperationAction(ISD::INSERT_SUBVECTOR, MVT::v32i16, Custom);
- setOperationAction(ISD::INSERT_SUBVECTOR, MVT::v64i8, Custom);
+ setOperationAction(ISD::INSERT_SUBVECTOR, MVT::v32i16, Legal);
+ setOperationAction(ISD::INSERT_SUBVECTOR, MVT::v64i8, Legal);
setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v32i16, Custom);
setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v64i8, Custom);
setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v32i1, Custom);
@@ -1546,6 +1571,7 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
for (auto VT : { MVT::v64i8, MVT::v32i16 }) {
setOperationAction(ISD::BUILD_VECTOR, VT, Custom);
setOperationAction(ISD::VSELECT, VT, Legal);
+ setOperationAction(ISD::ABS, VT, Legal);
setOperationAction(ISD::SRL, VT, Custom);
setOperationAction(ISD::SHL, VT, Custom);
setOperationAction(ISD::SRA, VT, Custom);
@@ -1574,9 +1600,9 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
addRegisterClass(MVT::v2i1, &X86::VK2RegClass);
for (auto VT : { MVT::v2i1, MVT::v4i1 }) {
- setOperationAction(ISD::ADD, VT, Expand);
- setOperationAction(ISD::SUB, VT, Expand);
- setOperationAction(ISD::MUL, VT, Expand);
+ setOperationAction(ISD::ADD, VT, Custom);
+ setOperationAction(ISD::SUB, VT, Custom);
+ setOperationAction(ISD::MUL, VT, Custom);
setOperationAction(ISD::VSELECT, VT, Expand);
setOperationAction(ISD::TRUNCATE, VT, Custom);
@@ -1671,6 +1697,7 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
// We have target-specific dag combine patterns for the following nodes:
setTargetDAGCombine(ISD::VECTOR_SHUFFLE);
setTargetDAGCombine(ISD::EXTRACT_VECTOR_ELT);
+ setTargetDAGCombine(ISD::INSERT_SUBVECTOR);
setTargetDAGCombine(ISD::BITCAST);
setTargetDAGCombine(ISD::VSELECT);
setTargetDAGCombine(ISD::SELECT);
@@ -1696,6 +1723,8 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
setTargetDAGCombine(ISD::ANY_EXTEND);
setTargetDAGCombine(ISD::SIGN_EXTEND);
setTargetDAGCombine(ISD::SIGN_EXTEND_INREG);
+ setTargetDAGCombine(ISD::SIGN_EXTEND_VECTOR_INREG);
+ setTargetDAGCombine(ISD::ZERO_EXTEND_VECTOR_INREG);
setTargetDAGCombine(ISD::SINT_TO_FP);
setTargetDAGCombine(ISD::UINT_TO_FP);
setTargetDAGCombine(ISD::SETCC);
@@ -1712,7 +1741,8 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
MaxStoresPerMemcpyOptSize = 4;
MaxStoresPerMemmove = 8; // For @llvm.memmove -> sequence of stores
MaxStoresPerMemmoveOptSize = 4;
- setPrefLoopAlignment(4); // 2^4 bytes.
+ // Set loop alignment to 2^ExperimentalPrefLoopAlignment bytes (default: 2^4).
+ setPrefLoopAlignment(ExperimentalPrefLoopAlignment);
// An out-of-order CPU can speculatively execute past a predictable branch,
// but a conditional move could be stalled by an expensive earlier operation.
@@ -1933,6 +1963,34 @@ bool X86TargetLowering::useSoftFloat() const {
return Subtarget.useSoftFloat();
}
+void X86TargetLowering::markLibCallAttributes(MachineFunction *MF, unsigned CC,
+ ArgListTy &Args) const {
+
+ // Only relabel X86-32 for C / Stdcall CCs.
+ if (Subtarget.is64Bit())
+ return;
+ if (CC != CallingConv::C && CC != CallingConv::X86_StdCall)
+ return;
+ unsigned ParamRegs = 0;
+ if (auto *M = MF->getFunction()->getParent())
+ ParamRegs = M->getNumberRegisterParameters();
+
+ // Mark the first N integer arguments as being passed in registers.
+ for (unsigned Idx = 0; Idx < Args.size(); Idx++) {
+ Type *T = Args[Idx].Ty;
+ if (T->isPointerTy() || T->isIntegerTy())
+ if (MF->getDataLayout().getTypeAllocSize(T) <= 8) {
+ unsigned numRegs = 1;
+ if (MF->getDataLayout().getTypeAllocSize(T) > 4)
+ numRegs = 2;
+ if (ParamRegs < numRegs)
+ return;
+ ParamRegs -= numRegs;
+ Args[Idx].IsInReg = true;
+ }
+ }
+}
+
const MCExpr *
X86TargetLowering::LowerCustomJumpTableEntry(const MachineJumpTableInfo *MJTI,
const MachineBasicBlock *MBB,
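markLibCallAttributes walks the libcall arguments in order, spending the module's register-parameter budget: an integer or pointer argument of at most 4 bytes costs one register, larger ones (up to 8 bytes) cost two, and marking stops as soon as the budget is exhausted. A simplified model of that consumption logic (argument types reduced to byte sizes; the helper is invented for illustration):

static unsigned countInRegArgs(unsigned ParamRegs, ArrayRef<unsigned> Sizes) {
  unsigned N = 0;
  for (unsigned Size : Sizes) {
    unsigned NumRegs = Size > 4 ? 2 : 1; // a 64-bit value needs a pair
    if (ParamRegs < NumRegs)
      break; // out of registers: this and later args stay in memory
    ParamRegs -= NumRegs;
    ++N;
  }
  return N;
}
// countInRegArgs(3, {4, 8, 4}) == 2: an i32 and an i64 are marked in-reg,
// and the trailing i32 is passed on the stack.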
@@ -2001,21 +2059,37 @@ unsigned X86TargetLowering::getAddressSpace() const {
return 256;
}
-Value *X86TargetLowering::getIRStackGuard(IRBuilder<> &IRB) const {
- // glibc has a special slot for the stack guard in tcbhead_t, use it instead
- // of the usual global variable (see sysdeps/{i386,x86_64}/nptl/tls.h)
- if (!Subtarget.isTargetGlibc())
- return TargetLowering::getIRStackGuard(IRB);
-
- // %fs:0x28, unless we're using a Kernel code model, in which case it's %gs:
- // %gs:0x14 on i386
- unsigned Offset = (Subtarget.is64Bit()) ? 0x28 : 0x14;
- unsigned AddressSpace = getAddressSpace();
+static bool hasStackGuardSlotTLS(const Triple &TargetTriple) {
+ return TargetTriple.isOSGlibc() || TargetTriple.isOSFuchsia() ||
+ (TargetTriple.isAndroid() && !TargetTriple.isAndroidVersionLT(17));
+}
+
+static Constant* SegmentOffset(IRBuilder<> &IRB,
+ unsigned Offset, unsigned AddressSpace) {
return ConstantExpr::getIntToPtr(
ConstantInt::get(Type::getInt32Ty(IRB.getContext()), Offset),
Type::getInt8PtrTy(IRB.getContext())->getPointerTo(AddressSpace));
}
+Value *X86TargetLowering::getIRStackGuard(IRBuilder<> &IRB) const {
+ // glibc, bionic, and Fuchsia have a special slot for the stack guard in
+ // tcbhead_t; use it instead of the usual global variable (see
+ // sysdeps/{i386,x86_64}/nptl/tls.h)
+ if (hasStackGuardSlotTLS(Subtarget.getTargetTriple())) {
+ if (Subtarget.isTargetFuchsia()) {
+ // <magenta/tls.h> defines MX_TLS_STACK_GUARD_OFFSET with this value.
+ return SegmentOffset(IRB, 0x10, getAddressSpace());
+ } else {
+ // %fs:0x28, unless we're using a Kernel code model, in which case
+ // it's %gs:0x28; %gs:0x14 on i386.
+ unsigned Offset = (Subtarget.is64Bit()) ? 0x28 : 0x14;
+ return SegmentOffset(IRB, Offset, getAddressSpace());
+ }
+ }
+
+ return TargetLowering::getIRStackGuard(IRB);
+}
+
void X86TargetLowering::insertSSPDeclarations(Module &M) const {
// MSVC CRT provides functionalities for stack protection.
if (Subtarget.getTargetTriple().isOSMSVCRT()) {
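A minimal sketch of what the glibc path above yields on x86-64. The stack-guard slot lives at %fs:0x28, and the X86 backend models segment bases as address spaces (256 = GS, 257 = FS, 258 = SS), so getIRStackGuard effectively returns an inttoptr constant that instruction selection folds into an FS-relative load:

// Assuming SegmentOffset and an IRBuilder<> IRB are in scope:
Value *GuardSlot = SegmentOffset(IRB, /*Offset=*/0x28, /*AddressSpace=*/257);
// i.e. a pointer of type i8** in addrspace(257) with value 0x28.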
@@ -2027,13 +2101,13 @@ void X86TargetLowering::insertSSPDeclarations(Module &M) const {
auto *SecurityCheckCookie = cast<Function>(
M.getOrInsertFunction("__security_check_cookie",
Type::getVoidTy(M.getContext()),
- Type::getInt8PtrTy(M.getContext()), nullptr));
+ Type::getInt8PtrTy(M.getContext())));
SecurityCheckCookie->setCallingConv(CallingConv::X86_FastCall);
SecurityCheckCookie->addAttribute(1, Attribute::AttrKind::InReg);
return;
}
- // glibc has a special slot for the stack guard.
- if (Subtarget.isTargetGlibc())
+ // glibc, bionic, and Fuchsia have a special slot for the stack guard.
+ if (hasStackGuardSlotTLS(Subtarget.getTargetTriple()))
return;
TargetLowering::insertSSPDeclarations(M);
}
@@ -2056,21 +2130,23 @@ Value *X86TargetLowering::getSafeStackPointerLocation(IRBuilder<> &IRB) const {
if (Subtarget.getTargetTriple().isOSContiki())
return getDefaultSafeStackPointerLocation(IRB, false);
- if (!Subtarget.isTargetAndroid())
- return TargetLowering::getSafeStackPointerLocation(IRB);
-
// Android provides a fixed TLS slot for the SafeStack pointer. See the
// definition of TLS_SLOT_SAFESTACK in
// https://android.googlesource.com/platform/bionic/+/master/libc/private/bionic_tls.h
- unsigned AddressSpace, Offset;
+ if (Subtarget.isTargetAndroid()) {
+ // %fs:0x48, unless we're using a Kernel code model, in which case it's %gs:
+ // %gs:0x24 on i386
+ unsigned Offset = (Subtarget.is64Bit()) ? 0x48 : 0x24;
+ return SegmentOffset(IRB, Offset, getAddressSpace());
+ }
- // %fs:0x48, unless we're using a Kernel code model, in which case it's %gs:
- // %gs:0x24 on i386
- Offset = (Subtarget.is64Bit()) ? 0x48 : 0x24;
- AddressSpace = getAddressSpace();
- return ConstantExpr::getIntToPtr(
- ConstantInt::get(Type::getInt32Ty(IRB.getContext()), Offset),
- Type::getInt8PtrTy(IRB.getContext())->getPointerTo(AddressSpace));
+ // Fuchsia is similar.
+ if (Subtarget.isTargetFuchsia()) {
+ // <magenta/tls.h> defines MX_TLS_UNSAFE_SP_OFFSET with this value.
+ return SegmentOffset(IRB, 0x18, getAddressSpace());
+ }
+
+ return TargetLowering::getSafeStackPointerLocation(IRB);
}
bool X86TargetLowering::isNoopAddrSpaceCast(unsigned SrcAS,
@@ -2179,6 +2255,11 @@ X86TargetLowering::LowerReturn(SDValue Chain, CallingConv::ID CallConv,
++I, ++OutsIndex) {
CCValAssign &VA = RVLocs[I];
assert(VA.isRegLoc() && "Can only return in registers!");
+
+ // Add the register to the CalleeSaveDisableRegs list.
+ if (CallConv == CallingConv::X86_RegCall)
+ MF.getRegInfo().disableCalleeSavedRegister(VA.getLocReg());
+
SDValue ValToCopy = OutVals[OutsIndex];
EVT ValVT = ValToCopy.getValueType();
@@ -2253,6 +2334,10 @@ X86TargetLowering::LowerReturn(SDValue Chain, CallingConv::ID CallConv,
assert(2 == RegsToPass.size() &&
"Expecting two registers after Pass64BitArgInRegs");
+
+ // Add the second register to the CalleeSaveDisableRegs list.
+ if (CallConv == CallingConv::X86_RegCall)
+ MF.getRegInfo().disableCalleeSavedRegister(RVLocs[I].getLocReg());
} else {
RegsToPass.push_back(std::make_pair(VA.getLocReg(), ValToCopy));
}
@@ -2309,6 +2394,10 @@ X86TargetLowering::LowerReturn(SDValue Chain, CallingConv::ID CallConv,
// RAX/EAX now acts like a return value.
RetOps.push_back(
DAG.getRegister(RetValReg, getPointerTy(DAG.getDataLayout())));
+
+ // Add the returned register to the CalleeSaveDisableRegs list.
+ if (CallConv == CallingConv::X86_RegCall)
+ MF.getRegInfo().disableCalleeSavedRegister(RetValReg);
}
const X86RegisterInfo *TRI = Subtarget.getRegisterInfo();
@@ -2444,7 +2533,7 @@ static SDValue getv64i1Argument(CCValAssign &VA, CCValAssign &NextVA,
// Convert the i32 type into v32i1 type
Hi = DAG.getBitcast(MVT::v32i1, ArgValueHi);
- // Concantenate the two values together
+ // Concatenate the two values together
return DAG.getNode(ISD::CONCAT_VECTORS, Dl, MVT::v64i1, Lo, Hi);
}
@@ -2488,8 +2577,10 @@ static SDValue lowerRegToMasks(const SDValue &ValArg, const EVT &ValVT,
SDValue X86TargetLowering::LowerCallResult(
SDValue Chain, SDValue InFlag, CallingConv::ID CallConv, bool isVarArg,
const SmallVectorImpl<ISD::InputArg> &Ins, const SDLoc &dl,
- SelectionDAG &DAG, SmallVectorImpl<SDValue> &InVals) const {
+ SelectionDAG &DAG, SmallVectorImpl<SDValue> &InVals,
+ uint32_t *RegMask) const {
+ const TargetRegisterInfo *TRI = Subtarget.getRegisterInfo();
// Assign locations to each value returned by this call.
SmallVector<CCValAssign, 16> RVLocs;
bool Is64Bit = Subtarget.is64Bit();
@@ -2503,6 +2594,14 @@ SDValue X86TargetLowering::LowerCallResult(
CCValAssign &VA = RVLocs[I];
EVT CopyVT = VA.getLocVT();
+ // In some calling conventions we need to remove the used registers
+ // from the register mask.
+ if (RegMask && CallConv == CallingConv::X86_RegCall) {
+ for (MCSubRegIterator SubRegs(VA.getLocReg(), TRI, /*IncludeSelf=*/true);
+ SubRegs.isValid(); ++SubRegs)
+ RegMask[*SubRegs / 32] &= ~(1u << (*SubRegs % 32));
+ }
+
// If this is x86-64, and we disabled SSE, we can't return FP values
if ((CopyVT == MVT::f32 || CopyVT == MVT::f64 || CopyVT == MVT::f128) &&
((Is64Bit || Ins[InsIndex].Flags.isInReg()) && !Subtarget.hasSSE1())) {
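A register mask is a bitvector packed into 32-bit words, one bit per physical register, where a set bit means the register is preserved across the call. The loop above clears the bit for each return register and all of its sub-registers, so RegCall return values are correctly treated as clobbered. The bit arithmetic in isolation (the register number 19 is illustrative):

static void markClobbered(uint32_t *RegMask, unsigned PhysReg) {
  RegMask[PhysReg / 32] &= ~(1u << (PhysReg % 32)); // clear 'preserved' bit
}
// markClobbered(RegMask, 19) clears bit 19 of word 0.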
@@ -2669,6 +2768,7 @@ X86TargetLowering::LowerMemArgument(SDValue Chain, CallingConv::ID CallConv,
CallConv, DAG.getTarget().Options.GuaranteedTailCallOpt);
bool isImmutable = !AlwaysUseMutable && !Flags.isByVal();
EVT ValVT;
+ MVT PtrVT = getPointerTy(DAG.getDataLayout());
// If value is passed by pointer we have address passed instead of the value
// itself. No need to extend if the mask value and location share the same
@@ -2686,13 +2786,16 @@ X86TargetLowering::LowerMemArgument(SDValue Chain, CallingConv::ID CallConv,
// taken by a return address.
int Offset = 0;
if (CallConv == CallingConv::X86_INTR) {
- const X86Subtarget& Subtarget =
- static_cast<const X86Subtarget&>(DAG.getSubtarget());
// X86 interrupts may take one or two arguments.
// On the stack there will be no return address as in a regular call.
// The offset of the last argument needs to be set to -4/-8 bytes.
// The offset of the first argument (when there are two) should be set to 0 bytes.
Offset = (Subtarget.is64Bit() ? 8 : 4) * ((i + 1) % Ins.size() - 1);
+ if (Subtarget.is64Bit() && Ins.size() == 2) {
+ // The stack pointer needs to be realigned for 64 bit handlers with error
+ // code, so the argument offset changes by 8 bytes.
+ Offset += 8;
+ }
}
// FIXME: For now, all byval parameter objects are marked mutable. This can be
@@ -2707,30 +2810,71 @@ X86TargetLowering::LowerMemArgument(SDValue Chain, CallingConv::ID CallConv,
if (CallConv == CallingConv::X86_INTR) {
MFI.setObjectOffset(FI, Offset);
}
- return DAG.getFrameIndex(FI, getPointerTy(DAG.getDataLayout()));
- } else {
- int FI = MFI.CreateFixedObject(ValVT.getSizeInBits()/8,
- VA.getLocMemOffset(), isImmutable);
-
- // Set SExt or ZExt flag.
- if (VA.getLocInfo() == CCValAssign::ZExt) {
- MFI.setObjectZExt(FI, true);
- } else if (VA.getLocInfo() == CCValAssign::SExt) {
- MFI.setObjectSExt(FI, true);
+ return DAG.getFrameIndex(FI, PtrVT);
+ }
+
+ // This is an argument in memory. We might be able to perform copy elision.
+ if (Flags.isCopyElisionCandidate()) {
+ EVT ArgVT = Ins[i].ArgVT;
+ SDValue PartAddr;
+ if (Ins[i].PartOffset == 0) {
+ // If this is a one-part value or the first part of a multi-part value,
+ // create a stack object for the entire argument value type and return a
+ // load from our portion of it. This assumes that if the first part of an
+ // argument is in memory, the rest will also be in memory.
+ int FI = MFI.CreateFixedObject(ArgVT.getStoreSize(), VA.getLocMemOffset(),
+ /*Immutable=*/false);
+ PartAddr = DAG.getFrameIndex(FI, PtrVT);
+ return DAG.getLoad(
+ ValVT, dl, Chain, PartAddr,
+ MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), FI));
+ } else {
+ // This is not the first piece of an argument in memory. See if there is
+ // already a fixed stack object including this offset. If so, assume it
+ // was created by the PartOffset == 0 branch above and create a load from
+ // the appropriate offset into it.
+ int64_t PartBegin = VA.getLocMemOffset();
+ int64_t PartEnd = PartBegin + ValVT.getSizeInBits() / 8;
+ int FI = MFI.getObjectIndexBegin();
+ for (; MFI.isFixedObjectIndex(FI); ++FI) {
+ int64_t ObjBegin = MFI.getObjectOffset(FI);
+ int64_t ObjEnd = ObjBegin + MFI.getObjectSize(FI);
+ if (ObjBegin <= PartBegin && PartEnd <= ObjEnd)
+ break;
+ }
+ if (MFI.isFixedObjectIndex(FI)) {
+ SDValue Addr =
+ DAG.getNode(ISD::ADD, dl, PtrVT, DAG.getFrameIndex(FI, PtrVT),
+ DAG.getIntPtrConstant(Ins[i].PartOffset, dl));
+ return DAG.getLoad(
+ ValVT, dl, Chain, Addr,
+ MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), FI,
+ Ins[i].PartOffset));
+ }
}
+ }
- // Adjust SP offset of interrupt parameter.
- if (CallConv == CallingConv::X86_INTR) {
- MFI.setObjectOffset(FI, Offset);
- }
+ int FI = MFI.CreateFixedObject(ValVT.getSizeInBits() / 8,
+ VA.getLocMemOffset(), isImmutable);
- SDValue FIN = DAG.getFrameIndex(FI, getPointerTy(DAG.getDataLayout()));
- SDValue Val = DAG.getLoad(
- ValVT, dl, Chain, FIN,
- MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), FI));
- return ExtendedInMem ?
- DAG.getNode(ISD::TRUNCATE, dl, VA.getValVT(), Val) : Val;
+ // Set SExt or ZExt flag.
+ if (VA.getLocInfo() == CCValAssign::ZExt) {
+ MFI.setObjectZExt(FI, true);
+ } else if (VA.getLocInfo() == CCValAssign::SExt) {
+ MFI.setObjectSExt(FI, true);
+ }
+
+ // Adjust SP offset of interrupt parameter.
+ if (CallConv == CallingConv::X86_INTR) {
+ MFI.setObjectOffset(FI, Offset);
}
+
+ SDValue FIN = DAG.getFrameIndex(FI, PtrVT);
+ SDValue Val = DAG.getLoad(
+ ValVT, dl, Chain, FIN,
+ MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), FI));
+ return ExtendedInMem ? DAG.getNode(ISD::TRUNCATE, dl, VA.getValVT(), Val)
+ : Val;
}
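// The PartOffset != 0 branch above scans the existing fixed stack objects for
// one whose byte range fully covers [PartBegin, PartEnd). A self-contained
// sketch of that interval-containment search, with a simple record type in
// place of MachineFrameInfo (illustrative, not LLVM API):
#include <cstdint>
#include <vector>

struct FixedObject { std::int64_t Begin, Size; };

// Return the index of an object covering [PartBegin, PartEnd), or -1.
static int findEnclosingObject(const std::vector<FixedObject> &Objs,
                               std::int64_t PartBegin, std::int64_t PartEnd) {
  for (int FI = 0, E = (int)Objs.size(); FI != E; ++FI) {
    std::int64_t ObjBegin = Objs[FI].Begin;
    std::int64_t ObjEnd = ObjBegin + Objs[FI].Size;
    if (ObjBegin <= PartBegin && PartEnd <= ObjEnd)
      return FI;
  }
  return -1;
}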
// FIXME: Get this from tablegen.
@@ -2781,12 +2925,14 @@ static ArrayRef<MCPhysReg> get64BitArgumentXMMs(MachineFunction &MF,
return makeArrayRef(std::begin(XMMArgRegs64Bit), std::end(XMMArgRegs64Bit));
}
+#ifndef NDEBUG
static bool isSortedByValueNo(const SmallVectorImpl<CCValAssign> &ArgLocs) {
return std::is_sorted(ArgLocs.begin(), ArgLocs.end(),
[](const CCValAssign &A, const CCValAssign &B) -> bool {
return A.getValNo() < B.getValNo();
});
}
+#endif
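// Wrapping the predicate in #ifndef NDEBUG matches its only remaining use
// sites: the llvm_unreachable checks below become assert()s, which compile
// away in release builds and would otherwise leave an unused static function
// warning. A sketch of the same pattern, with an illustrative Loc type:
#include <algorithm>
#include <cassert>
#include <vector>

struct Loc { unsigned ValNo; };

#ifndef NDEBUG
static bool isSortedByValNo(const std::vector<Loc> &Locs) {
  return std::is_sorted(Locs.begin(), Locs.end(),
                        [](const Loc &A, const Loc &B) {
                          return A.ValNo < B.ValNo;
                        });
}
#endif

static void checkLocs(const std::vector<Loc> &Locs) {
  assert(isSortedByValNo(Locs) && "locations must be sorted");
  (void)Locs; // release builds see no use
}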
SDValue X86TargetLowering::LowerFormalArguments(
SDValue Chain, CallingConv::ID CallConv, bool isVarArg,
@@ -2836,8 +2982,8 @@ SDValue X86TargetLowering::LowerFormalArguments(
// The next loop assumes that the locations are in the same order as the
// input arguments.
- if (!isSortedByValueNo(ArgLocs))
- llvm_unreachable("Argument Location list must be sorted before lowering");
+ assert(isSortedByValueNo(ArgLocs) &&
+ "Argument Location list must be sorted before lowering");
SDValue ArgValue;
for (unsigned I = 0, InsIndex = 0, E = ArgLocs.size(); I != E;
@@ -2853,7 +2999,7 @@ SDValue X86TargetLowering::LowerFormalArguments(
"Currently the only custom case is when we split v64i1 to 2 regs");
// v64i1 values, in the regcall calling convention, that are
- // compiled to 32 bit arch, are splited up into two registers.
+ // compiled for a 32-bit arch, are split up into two registers.
ArgValue =
getv64i1Argument(VA, ArgLocs[++I], Chain, DAG, dl, Subtarget);
} else {
@@ -3107,8 +3253,9 @@ SDValue X86TargetLowering::LowerFormalArguments(
MF.getTarget().Options.GuaranteedTailCallOpt)) {
FuncInfo->setBytesToPopOnReturn(StackSize); // Callee pops everything.
} else if (CallConv == CallingConv::X86_INTR && Ins.size() == 2) {
- // X86 interrupts must pop the error code if present
- FuncInfo->setBytesToPopOnReturn(Is64Bit ? 8 : 4);
+ // X86 interrupts must pop the error code (and the alignment padding) if
+ // present.
+ FuncInfo->setBytesToPopOnReturn(Is64Bit ? 16 : 4);
} else {
FuncInfo->setBytesToPopOnReturn(0); // Callee pops nothing.
// If this is an sret function, the return should pop the hidden pointer.
@@ -3146,6 +3293,12 @@ SDValue X86TargetLowering::LowerFormalArguments(
}
}
+ if (CallConv == CallingConv::X86_RegCall) {
+ const MachineRegisterInfo &MRI = MF.getRegInfo();
+ for (const auto &Pair : make_range(MRI.livein_begin(), MRI.livein_end()))
+ MF.getRegInfo().disableCalleeSavedRegister(Pair.first);
+ }
+
return Chain;
}
@@ -3348,8 +3501,8 @@ X86TargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
// The next loop assumes that the locations are in the same order as the
// input arguments.
- if (!isSortedByValueNo(ArgLocs))
- llvm_unreachable("Argument Location list must be sorted before lowering");
+ assert(isSortedByValueNo(ArgLocs) &&
+ "Argument Location list must be sorted before lowering");
// Walk the register/memloc assignments, inserting copies/loads. In the case
// of tail call optimization arguments are handle later.
@@ -3517,7 +3670,7 @@ X86TargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
if (VA.isRegLoc()) {
if (VA.needsCustom()) {
assert((CallConv == CallingConv::X86_RegCall) &&
- "Expecting custome case only in regcall calling convention");
+ "Expecting custom case only in regcall calling convention");
// This means that we are in a special case where one argument was
// passed through two register locations - skip the next location.
++I;
@@ -3662,7 +3815,32 @@ X86TargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
Mask = RegInfo->getNoPreservedMask();
}
- Ops.push_back(DAG.getRegisterMask(Mask));
+ // Define a new register mask from the existing mask.
+ uint32_t *RegMask = nullptr;
+
+ // In some calling conventions we need to remove the used physical registers
+ // from the reg mask.
+ if (CallConv == CallingConv::X86_RegCall) {
+ const TargetRegisterInfo *TRI = Subtarget.getRegisterInfo();
+
+ // Allocate a new Reg Mask and copy Mask.
+ RegMask = MF.allocateRegisterMask(TRI->getNumRegs());
+ unsigned RegMaskSize = (TRI->getNumRegs() + 31) / 32;
+ memcpy(RegMask, Mask, sizeof(uint32_t) * RegMaskSize);
+
+ // Make sure all sub-registers of the argument registers are reset
+ // in the RegMask.
+ for (auto const &RegPair : RegsToPass)
+ for (MCSubRegIterator SubRegs(RegPair.first, TRI, /*IncludeSelf=*/true);
+ SubRegs.isValid(); ++SubRegs)
+ RegMask[*SubRegs / 32] &= ~(1u << (*SubRegs % 32));
+
+ // Create the RegMask Operand according to our updated mask.
+ Ops.push_back(DAG.getRegisterMask(RegMask));
+ } else {
+ // Create the RegMask Operand according to the static mask.
+ Ops.push_back(DAG.getRegisterMask(Mask));
+ }
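// The mask copy above sizes the allocation with round-up division: one bit
// per physical register, 32 bits per word. A standalone sketch of the copy,
// using std::vector instead of MF.allocateRegisterMask (illustrative only):
#include <cstdint>
#include <cstring>
#include <vector>

static std::vector<std::uint32_t> copyRegMask(const std::uint32_t *Mask,
                                              unsigned NumRegs) {
  unsigned NumWords = (NumRegs + 31) / 32; // round up to whole words
  std::vector<std::uint32_t> Copy(NumWords);
  std::memcpy(Copy.data(), Mask, sizeof(std::uint32_t) * NumWords);
  return Copy;
}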
if (InFlag.getNode())
Ops.push_back(InFlag);
@@ -3715,8 +3893,8 @@ X86TargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
// Handle result values, copying them out of physregs into vregs that we
// return.
- return LowerCallResult(Chain, InFlag, CallConv, isVarArg,
- Ins, dl, DAG, InVals);
+ return LowerCallResult(Chain, InFlag, CallConv, isVarArg, Ins, dl, DAG,
+ InVals, RegMask);
}
//===----------------------------------------------------------------------===//
@@ -4132,6 +4310,7 @@ static bool isTargetShuffleVariableMask(unsigned Opcode) {
return true;
// 'Faux' Target Shuffles.
case ISD::AND:
+ case X86ISD::ANDNP:
return true;
}
}
@@ -4448,6 +4627,11 @@ bool X86TargetLowering::isCtlzFast() const {
return Subtarget.hasFastLZCNT();
}
+bool X86TargetLowering::isMaskAndCmp0FoldingBeneficial(
+ const Instruction &AndI) const {
+ return true;
+}
+
bool X86TargetLowering::hasAndNotCompare(SDValue Y) const {
if (!Subtarget.hasBMI())
return false;
@@ -4460,6 +4644,26 @@ bool X86TargetLowering::hasAndNotCompare(SDValue Y) const {
return true;
}
+MVT X86TargetLowering::hasFastEqualityCompare(unsigned NumBits) const {
+ MVT VT = MVT::getIntegerVT(NumBits);
+ if (isTypeLegal(VT))
+ return VT;
+
+ // PMOVMSKB can handle this.
+ if (NumBits == 128 && isTypeLegal(MVT::v16i8))
+ return MVT::v16i8;
+
+ // VPMOVMSKB can handle this.
+ if (NumBits == 256 && isTypeLegal(MVT::v32i8))
+ return MVT::v32i8;
+
+ // TODO: Allow 64-bit type for 32-bit target.
+ // TODO: 512-bit types should be allowed, but make sure that those
+ // cases are handled in combineVectorSizedSetCCEquality().
+
+ return MVT::INVALID_SIMPLE_VALUE_TYPE;
+}
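// Returning v16i8/v32i8 here lets memcmp-style equality checks expand to a
// vector compare plus a mask extraction instead of a scalar loop. The shape
// of the 128-bit case, written with SSE2 intrinsics (assumes an SSE2 target;
// this sketches the resulting codegen, not the DAG combine itself):
#include <emmintrin.h>

static bool equal16Bytes(const void *A, const void *B) {
  __m128i VA = _mm_loadu_si128((const __m128i *)A);
  __m128i VB = _mm_loadu_si128((const __m128i *)B);
  __m128i Eq = _mm_cmpeq_epi8(VA, VB);    // PCMPEQB: per-byte equality
  return _mm_movemask_epi8(Eq) == 0xFFFF; // PMOVMSKB: all 16 lanes equal
}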
+
/// Val is the undef sentinel value or equal to the specified value.
static bool isUndefOrEqual(int Val, int CmpVal) {
return ((Val == SM_SentinelUndef) || (Val == CmpVal));
@@ -4555,28 +4759,30 @@ static bool canWidenShuffleElements(ArrayRef<int> Mask,
SmallVectorImpl<int> &WidenedMask) {
WidenedMask.assign(Mask.size() / 2, 0);
for (int i = 0, Size = Mask.size(); i < Size; i += 2) {
+ int M0 = Mask[i];
+ int M1 = Mask[i + 1];
+
// If both elements are undef, it's trivial.
- if (Mask[i] == SM_SentinelUndef && Mask[i + 1] == SM_SentinelUndef) {
+ if (M0 == SM_SentinelUndef && M1 == SM_SentinelUndef) {
WidenedMask[i / 2] = SM_SentinelUndef;
continue;
}
// Check for an undef mask and a mask value properly aligned to fit with
// a pair of values. If we find such a case, use the non-undef mask's value.
- if (Mask[i] == SM_SentinelUndef && Mask[i + 1] >= 0 &&
- Mask[i + 1] % 2 == 1) {
- WidenedMask[i / 2] = Mask[i + 1] / 2;
+ if (M0 == SM_SentinelUndef && M1 >= 0 && (M1 % 2) == 1) {
+ WidenedMask[i / 2] = M1 / 2;
continue;
}
- if (Mask[i + 1] == SM_SentinelUndef && Mask[i] >= 0 && Mask[i] % 2 == 0) {
- WidenedMask[i / 2] = Mask[i] / 2;
+ if (M1 == SM_SentinelUndef && M0 >= 0 && (M0 % 2) == 0) {
+ WidenedMask[i / 2] = M0 / 2;
continue;
}
// When zeroing, we need to spread the zeroing across both lanes to widen.
- if (Mask[i] == SM_SentinelZero || Mask[i + 1] == SM_SentinelZero) {
- if ((Mask[i] == SM_SentinelZero || Mask[i] == SM_SentinelUndef) &&
- (Mask[i + 1] == SM_SentinelZero || Mask[i + 1] == SM_SentinelUndef)) {
+ if (M0 == SM_SentinelZero || M1 == SM_SentinelZero) {
+ if ((M0 == SM_SentinelZero || M0 == SM_SentinelUndef) &&
+ (M1 == SM_SentinelZero || M1 == SM_SentinelUndef)) {
WidenedMask[i / 2] = SM_SentinelZero;
continue;
}
@@ -4585,9 +4791,8 @@ static bool canWidenShuffleElements(ArrayRef<int> Mask,
// Finally check if the two mask values are adjacent and aligned with
// a pair.
- if (Mask[i] != SM_SentinelUndef && Mask[i] % 2 == 0 &&
- Mask[i] + 1 == Mask[i + 1]) {
- WidenedMask[i / 2] = Mask[i] / 2;
+ if (M0 != SM_SentinelUndef && (M0 % 2) == 0 && (M0 + 1) == M1) {
+ WidenedMask[i / 2] = M0 / 2;
continue;
}
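// The final check above is the core widening rule: an even/odd element pair
// (M0, M0 + 1) starting at an even index maps to the single double-width
// index M0 / 2. A minimal sketch of that rule with the sentinel cases
// ignored (illustrative, not the full function):
#include <cstddef>
#include <vector>

static bool widenMask(const std::vector<int> &Mask,
                      std::vector<int> &Widened) {
  Widened.assign(Mask.size() / 2, 0);
  for (std::size_t i = 0; i + 1 < Mask.size(); i += 2) {
    int M0 = Mask[i], M1 = Mask[i + 1];
    if ((M0 % 2) != 0 || M0 + 1 != M1)
      return false; // pair is not adjacent and even-aligned
    Widened[i / 2] = M0 / 2;
  }
  return true;
}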
@@ -4770,9 +4975,10 @@ static SDValue getConstVector(ArrayRef<int> Values, MVT VT, SelectionDAG &DAG,
return ConstsNode;
}
-static SDValue getConstVector(ArrayRef<APInt> Bits, SmallBitVector &Undefs,
+static SDValue getConstVector(ArrayRef<APInt> Bits, APInt &Undefs,
MVT VT, SelectionDAG &DAG, const SDLoc &dl) {
- assert(Bits.size() == Undefs.size() && "Unequal constant and undef arrays");
+ assert(Bits.size() == Undefs.getBitWidth() &&
+ "Unequal constant and undef arrays");
SmallVector<SDValue, 32> Ops;
bool Split = false;
@@ -4844,10 +5050,6 @@ static SDValue extractSubVector(SDValue Vec, unsigned IdxVal, SelectionDAG &DAG,
EVT ResultVT = EVT::getVectorVT(*DAG.getContext(), ElVT,
VT.getVectorNumElements()/Factor);
- // Extract from UNDEF is UNDEF.
- if (Vec.isUndef())
- return DAG.getUNDEF(ResultVT);
-
// Extract the relevant vectorWidth bits. Generate an EXTRACT_SUBVECTOR
unsigned ElemsPerChunk = vectorWidth / ElVT.getSizeInBits();
assert(isPowerOf2_32(ElemsPerChunk) && "Elements per chunk not power of 2");
@@ -4918,50 +5120,6 @@ static SDValue insertSubVector(SDValue Result, SDValue Vec, unsigned IdxVal,
static SDValue insert128BitVector(SDValue Result, SDValue Vec, unsigned IdxVal,
SelectionDAG &DAG, const SDLoc &dl) {
assert(Vec.getValueType().is128BitVector() && "Unexpected vector size!");
-
- // For insertion into the zero index (low half) of a 256-bit vector, it is
- // more efficient to generate a blend with immediate instead of an insert*128.
- // We are still creating an INSERT_SUBVECTOR below with an undef node to
- // extend the subvector to the size of the result vector. Make sure that
- // we are not recursing on that node by checking for undef here.
- if (IdxVal == 0 && Result.getValueType().is256BitVector() &&
- !Result.isUndef()) {
- EVT ResultVT = Result.getValueType();
- SDValue ZeroIndex = DAG.getIntPtrConstant(0, dl);
- SDValue Undef = DAG.getUNDEF(ResultVT);
- SDValue Vec256 = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, ResultVT, Undef,
- Vec, ZeroIndex);
-
- // The blend instruction, and therefore its mask, depend on the data type.
- MVT ScalarType = ResultVT.getVectorElementType().getSimpleVT();
- if (ScalarType.isFloatingPoint()) {
- // Choose either vblendps (float) or vblendpd (double).
- unsigned ScalarSize = ScalarType.getSizeInBits();
- assert((ScalarSize == 64 || ScalarSize == 32) && "Unknown float type");
- unsigned MaskVal = (ScalarSize == 64) ? 0x03 : 0x0f;
- SDValue Mask = DAG.getConstant(MaskVal, dl, MVT::i8);
- return DAG.getNode(X86ISD::BLENDI, dl, ResultVT, Result, Vec256, Mask);
- }
-
- const X86Subtarget &Subtarget =
- static_cast<const X86Subtarget &>(DAG.getSubtarget());
-
- // AVX2 is needed for 256-bit integer blend support.
- // Integers must be cast to 32-bit because there is only vpblendd;
- // vpblendw can't be used for this because it has a handicapped mask.
-
- // If we don't have AVX2, then cast to float. Using a wrong domain blend
- // is still more efficient than using the wrong domain vinsertf128 that
- // will be created by InsertSubVector().
- MVT CastVT = Subtarget.hasAVX2() ? MVT::v8i32 : MVT::v8f32;
-
- SDValue Mask = DAG.getConstant(0x0f, dl, MVT::i8);
- Result = DAG.getBitcast(CastVT, Result);
- Vec256 = DAG.getBitcast(CastVT, Vec256);
- Vec256 = DAG.getNode(X86ISD::BLENDI, dl, CastVT, Result, Vec256, Mask);
- return DAG.getBitcast(ResultVT, Vec256);
- }
-
return insertSubVector(Result, Vec, IdxVal, DAG, dl, 128);
}
@@ -5023,7 +5181,8 @@ static SDValue insert1BitVector(SDValue Op, SelectionDAG &DAG,
if (Vec.isUndef()) {
if (IdxVal != 0) {
SDValue ShiftBits = DAG.getConstant(IdxVal, dl, MVT::i8);
- WideSubVec = DAG.getNode(X86ISD::VSHLI, dl, WideOpVT, WideSubVec, ShiftBits);
+ WideSubVec = DAG.getNode(X86ISD::KSHIFTL, dl, WideOpVT, WideSubVec,
+ ShiftBits);
}
return ExtractSubVec(WideSubVec);
}
@@ -5032,9 +5191,9 @@ static SDValue insert1BitVector(SDValue Op, SelectionDAG &DAG,
NumElems = WideOpVT.getVectorNumElements();
unsigned ShiftLeft = NumElems - SubVecNumElems;
unsigned ShiftRight = NumElems - SubVecNumElems - IdxVal;
- Vec = DAG.getNode(X86ISD::VSHLI, dl, WideOpVT, WideSubVec,
- DAG.getConstant(ShiftLeft, dl, MVT::i8));
- Vec = ShiftRight ? DAG.getNode(X86ISD::VSRLI, dl, WideOpVT, Vec,
+ Vec = DAG.getNode(X86ISD::KSHIFTL, dl, WideOpVT, WideSubVec,
+ DAG.getConstant(ShiftLeft, dl, MVT::i8));
+ Vec = ShiftRight ? DAG.getNode(X86ISD::KSHIFTR, dl, WideOpVT, Vec,
DAG.getConstant(ShiftRight, dl, MVT::i8)) : Vec;
return ExtractSubVec(Vec);
}
@@ -5043,8 +5202,8 @@ static SDValue insert1BitVector(SDValue Op, SelectionDAG &DAG,
// Zero lower bits of the Vec
SDValue ShiftBits = DAG.getConstant(SubVecNumElems, dl, MVT::i8);
Vec = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, WideOpVT, Undef, Vec, ZeroIdx);
- Vec = DAG.getNode(X86ISD::VSRLI, dl, WideOpVT, Vec, ShiftBits);
- Vec = DAG.getNode(X86ISD::VSHLI, dl, WideOpVT, Vec, ShiftBits);
+ Vec = DAG.getNode(X86ISD::KSHIFTR, dl, WideOpVT, Vec, ShiftBits);
+ Vec = DAG.getNode(X86ISD::KSHIFTL, dl, WideOpVT, Vec, ShiftBits);
// Merge them together, SubVec should be zero extended.
WideSubVec = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, WideOpVT,
getZeroVector(WideOpVT, Subtarget, DAG, dl),
@@ -5056,12 +5215,12 @@ static SDValue insert1BitVector(SDValue Op, SelectionDAG &DAG,
// Simple case when we put subvector in the upper part
if (IdxVal + SubVecNumElems == NumElems) {
// Zero upper bits of the Vec
- WideSubVec = DAG.getNode(X86ISD::VSHLI, dl, WideOpVT, WideSubVec,
+ WideSubVec = DAG.getNode(X86ISD::KSHIFTL, dl, WideOpVT, WideSubVec,
DAG.getConstant(IdxVal, dl, MVT::i8));
SDValue ShiftBits = DAG.getConstant(SubVecNumElems, dl, MVT::i8);
Vec = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, WideOpVT, Undef, Vec, ZeroIdx);
- Vec = DAG.getNode(X86ISD::VSHLI, dl, WideOpVT, Vec, ShiftBits);
- Vec = DAG.getNode(X86ISD::VSRLI, dl, WideOpVT, Vec, ShiftBits);
+ Vec = DAG.getNode(X86ISD::KSHIFTL, dl, WideOpVT, Vec, ShiftBits);
+ Vec = DAG.getNode(X86ISD::KSHIFTR, dl, WideOpVT, Vec, ShiftBits);
Vec = DAG.getNode(ISD::OR, dl, WideOpVT, Vec, WideSubVec);
return ExtractSubVec(Vec);
}
@@ -5094,26 +5253,38 @@ static SDValue concat256BitVectors(SDValue V1, SDValue V2, EVT VT,
}
/// Returns a vector of specified type with all bits set.
-/// Always build ones vectors as <4 x i32> or <8 x i32>. For 256-bit types with
-/// no AVX2 support, use two <4 x i32> inserted in a <8 x i32> appropriately.
+/// Always build ones vectors as <4 x i32>, <8 x i32> or <16 x i32>.
/// Then bitcast to their original type, ensuring they get CSE'd.
-static SDValue getOnesVector(EVT VT, const X86Subtarget &Subtarget,
- SelectionDAG &DAG, const SDLoc &dl) {
+static SDValue getOnesVector(EVT VT, SelectionDAG &DAG, const SDLoc &dl) {
assert((VT.is128BitVector() || VT.is256BitVector() || VT.is512BitVector()) &&
"Expected a 128/256/512-bit vector type");
APInt Ones = APInt::getAllOnesValue(32);
unsigned NumElts = VT.getSizeInBits() / 32;
- SDValue Vec;
- if (!Subtarget.hasInt256() && NumElts == 8) {
- Vec = DAG.getConstant(Ones, dl, MVT::v4i32);
- Vec = concat128BitVectors(Vec, Vec, MVT::v8i32, 8, DAG, dl);
- } else {
- Vec = DAG.getConstant(Ones, dl, MVT::getVectorVT(MVT::i32, NumElts));
- }
+ SDValue Vec = DAG.getConstant(Ones, dl, MVT::getVectorVT(MVT::i32, NumElts));
return DAG.getBitcast(VT, Vec);
}
+static SDValue getExtendInVec(unsigned Opc, const SDLoc &DL, EVT VT, SDValue In,
+ SelectionDAG &DAG) {
+ EVT InVT = In.getValueType();
+ assert((X86ISD::VSEXT == Opc || X86ISD::VZEXT == Opc) && "Unexpected opcode");
+
+ if (VT.is128BitVector() && InVT.is128BitVector())
+ return X86ISD::VSEXT == Opc ? DAG.getSignExtendVectorInReg(In, DL, VT)
+ : DAG.getZeroExtendVectorInReg(In, DL, VT);
+
+ // For 256-bit vectors, we only need the lower (128-bit) input half.
+ // For 512-bit vectors, we only need the lower input half or quarter.
+ if (VT.getSizeInBits() > 128 && InVT.getSizeInBits() > 128) {
+ int Scale = VT.getScalarSizeInBits() / InVT.getScalarSizeInBits();
+ In = extractSubVector(In, 0, DAG, DL,
+ std::max(128, (int)VT.getSizeInBits() / Scale));
+ }
+
+ return DAG.getNode(Opc, DL, VT, In);
+}
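// The arithmetic behind the extract above: extending elements by a factor of
// Scale means an OutBits-wide result consumes only OutBits / Scale input
// bits, clamped to a whole 128-bit subvector. An illustrative helper (not
// LLVM API) showing the computation:
#include <algorithm>
#include <cassert>

static unsigned neededInputBits(unsigned OutBits, unsigned OutEltBits,
                                unsigned InEltBits) {
  assert(OutEltBits % InEltBits == 0 && "expected a whole extension scale");
  unsigned Scale = OutEltBits / InEltBits;
  return std::max(128u, OutBits / Scale);
}
// e.g. a 256-bit zext from 16-bit to 32-bit elements: 256 / 2 = 128 input bits.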
+
/// Generate unpacklo/unpackhi shuffle mask.
static void createUnpackShuffleMask(MVT VT, SmallVectorImpl<int> &Mask, bool Lo,
bool Unary) {
@@ -5199,9 +5370,10 @@ static const Constant *getTargetConstantFromNode(SDValue Op) {
// Extract raw constant bits from constant pools.
static bool getTargetConstantBitsFromNode(SDValue Op, unsigned EltSizeInBits,
- SmallBitVector &UndefElts,
- SmallVectorImpl<APInt> &EltBits) {
- assert(UndefElts.empty() && "Expected an empty UndefElts vector");
+ APInt &UndefElts,
+ SmallVectorImpl<APInt> &EltBits,
+ bool AllowWholeUndefs = true,
+ bool AllowPartialUndefs = true) {
assert(EltBits.empty() && "Expected an empty EltBits vector");
Op = peekThroughBitcasts(Op);
@@ -5211,56 +5383,83 @@ static bool getTargetConstantBitsFromNode(SDValue Op, unsigned EltSizeInBits,
assert((SizeInBits % EltSizeInBits) == 0 && "Can't split constant!");
unsigned NumElts = SizeInBits / EltSizeInBits;
+ unsigned SrcEltSizeInBits = VT.getScalarSizeInBits();
+ unsigned NumSrcElts = SizeInBits / SrcEltSizeInBits;
+
// Extract all the undef/constant element data and pack into single bitsets.
APInt UndefBits(SizeInBits, 0);
APInt MaskBits(SizeInBits, 0);
// Split the undef/constant single bitset data into the target elements.
auto SplitBitData = [&]() {
- UndefElts = SmallBitVector(NumElts, false);
+ // Don't split if we don't allow undef bits.
+ bool AllowUndefs = AllowWholeUndefs || AllowPartialUndefs;
+ if (UndefBits.getBoolValue() && !AllowUndefs)
+ return false;
+
+ UndefElts = APInt(NumElts, 0);
EltBits.resize(NumElts, APInt(EltSizeInBits, 0));
for (unsigned i = 0; i != NumElts; ++i) {
- APInt UndefEltBits = UndefBits.lshr(i * EltSizeInBits);
- UndefEltBits = UndefEltBits.zextOrTrunc(EltSizeInBits);
+ unsigned BitOffset = i * EltSizeInBits;
+ APInt UndefEltBits = UndefBits.extractBits(EltSizeInBits, BitOffset);
- // Only treat an element as UNDEF if all bits are UNDEF, otherwise
- // treat it as zero.
+ // Only treat an element as UNDEF if all bits are UNDEF.
if (UndefEltBits.isAllOnesValue()) {
- UndefElts[i] = true;
+ if (!AllowWholeUndefs)
+ return false;
+ UndefElts.setBit(i);
continue;
}
- APInt Bits = MaskBits.lshr(i * EltSizeInBits);
- Bits = Bits.zextOrTrunc(EltSizeInBits);
+ // If only some bits are UNDEF then treat them as zero (or bail if not
+ // supported).
+ if (UndefEltBits.getBoolValue() && !AllowPartialUndefs)
+ return false;
+
+ APInt Bits = MaskBits.extractBits(EltSizeInBits, BitOffset);
EltBits[i] = Bits.getZExtValue();
}
return true;
};
- auto ExtractConstantBits = [SizeInBits](const Constant *Cst, APInt &Mask,
- APInt &Undefs) {
+ // Collect constant bits and insert into mask/undef bit masks.
+ auto CollectConstantBits = [](const Constant *Cst, APInt &Mask, APInt &Undefs,
+ unsigned BitOffset) {
if (!Cst)
return false;
unsigned CstSizeInBits = Cst->getType()->getPrimitiveSizeInBits();
if (isa<UndefValue>(Cst)) {
- Mask = APInt::getNullValue(SizeInBits);
- Undefs = APInt::getLowBitsSet(SizeInBits, CstSizeInBits);
+ Undefs.setBits(BitOffset, BitOffset + CstSizeInBits);
return true;
}
if (auto *CInt = dyn_cast<ConstantInt>(Cst)) {
- Mask = CInt->getValue().zextOrTrunc(SizeInBits);
- Undefs = APInt::getNullValue(SizeInBits);
+ Mask.insertBits(CInt->getValue(), BitOffset);
return true;
}
if (auto *CFP = dyn_cast<ConstantFP>(Cst)) {
- Mask = CFP->getValueAPF().bitcastToAPInt().zextOrTrunc(SizeInBits);
- Undefs = APInt::getNullValue(SizeInBits);
+ Mask.insertBits(CFP->getValueAPF().bitcastToAPInt(), BitOffset);
return true;
}
return false;
};
+ // Extract constant bits from build vector.
+ if (ISD::isBuildVectorOfConstantSDNodes(Op.getNode())) {
+ for (unsigned i = 0, e = Op.getNumOperands(); i != e; ++i) {
+ const SDValue &Src = Op.getOperand(i);
+ unsigned BitOffset = i * SrcEltSizeInBits;
+ if (Src.isUndef()) {
+ UndefBits.setBits(BitOffset, BitOffset + SrcEltSizeInBits);
+ continue;
+ }
+ auto *Cst = cast<ConstantSDNode>(Src);
+ APInt Bits = Cst->getAPIntValue().zextOrTrunc(SrcEltSizeInBits);
+ MaskBits.insertBits(Bits, BitOffset);
+ }
+ return SplitBitData();
+ }
+
// Extract constant bits from constant pool vector.
if (auto *Cst = getTargetConstantFromNode(Op)) {
Type *CstTy = Cst->getType();
@@ -5268,117 +5467,59 @@ static bool getTargetConstantBitsFromNode(SDValue Op, unsigned EltSizeInBits,
return false;
unsigned CstEltSizeInBits = CstTy->getScalarSizeInBits();
- for (unsigned i = 0, e = CstTy->getVectorNumElements(); i != e; ++i) {
- APInt Bits, Undefs;
- if (!ExtractConstantBits(Cst->getAggregateElement(i), Bits, Undefs))
+ for (unsigned i = 0, e = CstTy->getVectorNumElements(); i != e; ++i)
+ if (!CollectConstantBits(Cst->getAggregateElement(i), MaskBits, UndefBits,
+ i * CstEltSizeInBits))
return false;
- MaskBits |= Bits.shl(i * CstEltSizeInBits);
- UndefBits |= Undefs.shl(i * CstEltSizeInBits);
- }
return SplitBitData();
}
// Extract constant bits from a broadcasted constant pool scalar.
if (Op.getOpcode() == X86ISD::VBROADCAST &&
- EltSizeInBits <= Op.getScalarValueSizeInBits()) {
+ EltSizeInBits <= SrcEltSizeInBits) {
if (auto *Broadcast = getTargetConstantFromNode(Op.getOperand(0))) {
- APInt Bits, Undefs;
- if (ExtractConstantBits(Broadcast, Bits, Undefs)) {
- unsigned NumBroadcastBits = Op.getScalarValueSizeInBits();
- unsigned NumBroadcastElts = SizeInBits / NumBroadcastBits;
- for (unsigned i = 0; i != NumBroadcastElts; ++i) {
- MaskBits |= Bits.shl(i * NumBroadcastBits);
- UndefBits |= Undefs.shl(i * NumBroadcastBits);
+ APInt Bits(SizeInBits, 0);
+ APInt Undefs(SizeInBits, 0);
+ if (CollectConstantBits(Broadcast, Bits, Undefs, 0)) {
+ for (unsigned i = 0; i != NumSrcElts; ++i) {
+ MaskBits |= Bits.shl(i * SrcEltSizeInBits);
+ UndefBits |= Undefs.shl(i * SrcEltSizeInBits);
}
return SplitBitData();
}
}
}
+ // Extract a rematerialized scalar constant insertion.
+ if (Op.getOpcode() == X86ISD::VZEXT_MOVL &&
+ Op.getOperand(0).getOpcode() == ISD::SCALAR_TO_VECTOR &&
+ isa<ConstantSDNode>(Op.getOperand(0).getOperand(0))) {
+ auto *CN = cast<ConstantSDNode>(Op.getOperand(0).getOperand(0));
+ MaskBits = CN->getAPIntValue().zextOrTrunc(SrcEltSizeInBits);
+ MaskBits = MaskBits.zext(SizeInBits);
+ return SplitBitData();
+ }
+
return false;
}
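// The rewrite above replaces shift/or and lshr/zextOrTrunc sequences with
// APInt::insertBits and APInt::extractBits at BitOffset = i * EltSizeInBits.
// The same pack/split, sketched on a single 64-bit word (illustrative; APInt
// handles arbitrary widths):
#include <cstdint>

static std::uint64_t insertBitsWord(std::uint64_t Dst, std::uint64_t Src,
                                    unsigned Off, unsigned Bits) {
  std::uint64_t FieldMask =
      (Bits >= 64 ? ~0ull : ((1ull << Bits) - 1)) << Off;
  return (Dst & ~FieldMask) | ((Src << Off) & FieldMask);
}

static std::uint64_t extractBitsWord(std::uint64_t Src, unsigned Off,
                                     unsigned Bits) {
  return (Src >> Off) & (Bits >= 64 ? ~0ull : ((1ull << Bits) - 1));
}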
-// TODO: Merge more of this with getTargetConstantBitsFromNode.
static bool getTargetShuffleMaskIndices(SDValue MaskNode,
unsigned MaskEltSizeInBits,
SmallVectorImpl<uint64_t> &RawMask) {
- MaskNode = peekThroughBitcasts(MaskNode);
-
- MVT VT = MaskNode.getSimpleValueType();
- assert(VT.isVector() && "Can't produce a non-vector with a build_vector!");
- unsigned NumMaskElts = VT.getSizeInBits() / MaskEltSizeInBits;
-
- // Split an APInt element into MaskEltSizeInBits sized pieces and
- // insert into the shuffle mask.
- auto SplitElementToMask = [&](APInt Element) {
- // Note that this is x86 and so always little endian: the low byte is
- // the first byte of the mask.
- int Split = VT.getScalarSizeInBits() / MaskEltSizeInBits;
- for (int i = 0; i < Split; ++i) {
- APInt RawElt = Element.getLoBits(MaskEltSizeInBits);
- Element = Element.lshr(MaskEltSizeInBits);
- RawMask.push_back(RawElt.getZExtValue());
- }
- };
-
- if (MaskNode.getOpcode() == X86ISD::VBROADCAST) {
- // TODO: Handle (MaskEltSizeInBits % VT.getScalarSizeInBits()) == 0
- // TODO: Handle (VT.getScalarSizeInBits() % MaskEltSizeInBits) == 0
- if (VT.getScalarSizeInBits() != MaskEltSizeInBits)
- return false;
- if (auto *CN = dyn_cast<ConstantSDNode>(MaskNode.getOperand(0))) {
- const APInt &MaskElement = CN->getAPIntValue();
- for (unsigned i = 0, e = VT.getVectorNumElements(); i != e; ++i) {
- APInt RawElt = MaskElement.getLoBits(MaskEltSizeInBits);
- RawMask.push_back(RawElt.getZExtValue());
- }
- }
+ APInt UndefElts;
+ SmallVector<APInt, 64> EltBits;
+
+ // Extract the raw target constant bits.
+ // FIXME: We currently don't support UNDEF bits or mask entries.
+ if (!getTargetConstantBitsFromNode(MaskNode, MaskEltSizeInBits, UndefElts,
+ EltBits, /* AllowWholeUndefs */ false,
+ /* AllowPartialUndefs */ false))
return false;
- }
- if (MaskNode.getOpcode() == X86ISD::VZEXT_MOVL &&
- MaskNode.getOperand(0).getOpcode() == ISD::SCALAR_TO_VECTOR) {
- SDValue MaskOp = MaskNode.getOperand(0).getOperand(0);
- if (auto *CN = dyn_cast<ConstantSDNode>(MaskOp)) {
- if ((MaskEltSizeInBits % VT.getScalarSizeInBits()) == 0) {
- RawMask.push_back(CN->getZExtValue());
- RawMask.append(NumMaskElts - 1, 0);
- return true;
- }
-
- if ((VT.getScalarSizeInBits() % MaskEltSizeInBits) == 0) {
- unsigned ElementSplit = VT.getScalarSizeInBits() / MaskEltSizeInBits;
- SplitElementToMask(CN->getAPIntValue());
- RawMask.append((VT.getVectorNumElements() - 1) * ElementSplit, 0);
- return true;
- }
- }
- return false;
- }
-
- if (MaskNode.getOpcode() != ISD::BUILD_VECTOR)
- return false;
-
- // We can always decode if the buildvector is all zero constants,
- // but can't use isBuildVectorAllZeros as it might contain UNDEFs.
- if (all_of(MaskNode->ops(), X86::isZeroNode)) {
- RawMask.append(NumMaskElts, 0);
- return true;
- }
-
- // TODO: Handle (MaskEltSizeInBits % VT.getScalarSizeInBits()) == 0
- if ((VT.getScalarSizeInBits() % MaskEltSizeInBits) != 0)
- return false;
-
- for (SDValue Op : MaskNode->ops()) {
- if (auto *CN = dyn_cast<ConstantSDNode>(Op.getNode()))
- SplitElementToMask(CN->getAPIntValue());
- else if (auto *CFN = dyn_cast<ConstantFPSDNode>(Op.getNode()))
- SplitElementToMask(CFN->getValueAPF().bitcastToAPInt());
- else
- return false;
- }
+ // Insert the extracted elements into the mask.
+ for (APInt Elt : EltBits)
+ RawMask.push_back(Elt.getZExtValue());
return true;
}
@@ -5405,6 +5546,7 @@ static bool getTargetShuffleMask(SDNode *N, MVT VT, bool AllowSentinelZero,
case X86ISD::BLENDI:
ImmN = N->getOperand(N->getNumOperands()-1);
DecodeBLENDMask(VT, cast<ConstantSDNode>(ImmN)->getZExtValue(), Mask);
+ IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1);
break;
case X86ISD::SHUFP:
ImmN = N->getOperand(N->getNumOperands()-1);
@@ -5473,8 +5615,18 @@ static bool getTargetShuffleMask(SDNode *N, MVT VT, bool AllowSentinelZero,
IsUnary = true;
break;
case X86ISD::VBROADCAST: {
- // We only decode broadcasts of same-sized vectors at the moment.
- if (N->getOperand(0).getValueType() == VT) {
+ SDValue N0 = N->getOperand(0);
+ // See if we're broadcasting from index 0 of an EXTRACT_SUBVECTOR. If so,
+ // add the pre-extracted value to the Ops vector.
+ if (N0.getOpcode() == ISD::EXTRACT_SUBVECTOR &&
+ N0.getOperand(0).getValueType() == VT &&
+ N0.getConstantOperandVal(1) == 0)
+ Ops.push_back(N0.getOperand(0));
+
+ // We only decode broadcasts of same-sized vectors, unless the broadcast
+ // came from an extract from the original width. If we found one, we
+ // pushed it onto the Ops vector above.
+ if (N0.getValueType() == VT || !Ops.empty()) {
DecodeVectorBroadcast(VT, Mask);
IsUnary = true;
break;
@@ -5669,6 +5821,19 @@ static bool setTargetShuffleZeroElements(SDValue N,
V1 = peekThroughBitcasts(V1);
V2 = peekThroughBitcasts(V2);
+ assert((VT.getSizeInBits() % Mask.size()) == 0 &&
+ "Illegal split of shuffle value type");
+ unsigned EltSizeInBits = VT.getSizeInBits() / Mask.size();
+
+ // Extract known constant input data.
+ APInt UndefSrcElts[2];
+ SmallVector<APInt, 32> SrcEltBits[2];
+ bool IsSrcConstant[2] = {
+ getTargetConstantBitsFromNode(V1, EltSizeInBits, UndefSrcElts[0],
+ SrcEltBits[0], true, false),
+ getTargetConstantBitsFromNode(V2, EltSizeInBits, UndefSrcElts[1],
+ SrcEltBits[1], true, false)};
+
for (int i = 0, Size = Mask.size(); i < Size; ++i) {
int M = Mask[i];
@@ -5677,6 +5842,7 @@ static bool setTargetShuffleZeroElements(SDValue N,
continue;
// Determine shuffle input and normalize the mask.
+ unsigned SrcIdx = M / Size;
SDValue V = M < Size ? V1 : V2;
M %= Size;
@@ -5686,39 +5852,27 @@ static bool setTargetShuffleZeroElements(SDValue N,
continue;
}
- // Currently we can only search BUILD_VECTOR for UNDEF/ZERO elements.
- if (V.getOpcode() != ISD::BUILD_VECTOR)
- continue;
-
- // If the BUILD_VECTOR has fewer elements then the (larger) source
- // element must be UNDEF/ZERO.
- // TODO: Is it worth testing the individual bits of a constant?
- if ((Size % V.getNumOperands()) == 0) {
- int Scale = Size / V->getNumOperands();
- SDValue Op = V.getOperand(M / Scale);
- if (Op.isUndef())
+ // SCALAR_TO_VECTOR - only the first element is defined, the rest are UNDEF.
+ // TODO: We currently only set UNDEF for integer types - floats use the same
+ // registers as vectors and many of the scalar folded loads rely on the
+ // SCALAR_TO_VECTOR pattern.
+ if (V.getOpcode() == ISD::SCALAR_TO_VECTOR &&
+ (Size % V.getValueType().getVectorNumElements()) == 0) {
+ int Scale = Size / V.getValueType().getVectorNumElements();
+ int Idx = M / Scale;
+ if (Idx != 0 && !VT.isFloatingPoint())
Mask[i] = SM_SentinelUndef;
- else if (X86::isZeroNode(Op))
+ else if (Idx == 0 && X86::isZeroNode(V.getOperand(0)))
Mask[i] = SM_SentinelZero;
continue;
}
- // If the BUILD_VECTOR has more elements then all the (smaller) source
- // elements must be all UNDEF or all ZERO.
- if ((V.getNumOperands() % Size) == 0) {
- int Scale = V->getNumOperands() / Size;
- bool AllUndef = true;
- bool AllZero = true;
- for (int j = 0; j < Scale; ++j) {
- SDValue Op = V.getOperand((M * Scale) + j);
- AllUndef &= Op.isUndef();
- AllZero &= X86::isZeroNode(Op);
- }
- if (AllUndef)
+ // Attempt to extract from the source's constant bits.
+ if (IsSrcConstant[SrcIdx]) {
+ if (UndefSrcElts[SrcIdx][M])
Mask[i] = SM_SentinelUndef;
- else if (AllZero)
+ else if (SrcEltBits[SrcIdx][M] == 0)
Mask[i] = SM_SentinelZero;
- continue;
}
}
@@ -5744,11 +5898,16 @@ static bool getFauxShuffleMask(SDValue N, SmallVectorImpl<int> &Mask,
unsigned Opcode = N.getOpcode();
switch (Opcode) {
- case ISD::AND: {
+ case ISD::AND:
+ case X86ISD::ANDNP: {
// Attempt to decode as a per-byte mask.
- SmallBitVector UndefElts;
+ APInt UndefElts;
SmallVector<APInt, 32> EltBits;
- if (!getTargetConstantBitsFromNode(N.getOperand(1), 8, UndefElts, EltBits))
+ SDValue N0 = N.getOperand(0);
+ SDValue N1 = N.getOperand(1);
+ bool IsAndN = (X86ISD::ANDNP == Opcode);
+ uint64_t ZeroMask = IsAndN ? 255 : 0;
+ if (!getTargetConstantBitsFromNode(IsAndN ? N0 : N1, 8, UndefElts, EltBits))
return false;
for (int i = 0, e = (int)EltBits.size(); i != e; ++i) {
if (UndefElts[i]) {
@@ -5758,9 +5917,55 @@ static bool getFauxShuffleMask(SDValue N, SmallVectorImpl<int> &Mask,
uint64_t ByteBits = EltBits[i].getZExtValue();
if (ByteBits != 0 && ByteBits != 255)
return false;
- Mask.push_back(ByteBits == 0 ? SM_SentinelZero : i);
+ Mask.push_back(ByteBits == ZeroMask ? SM_SentinelZero : i);
}
- Ops.push_back(N.getOperand(0));
+ Ops.push_back(IsAndN ? N1 : N0);
+ return true;
+ }
+ case ISD::SCALAR_TO_VECTOR: {
+ // Match against a scalar_to_vector of an extract from a similar vector.
+ SDValue N0 = N.getOperand(0);
+ if (N0.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
+ N0.getOperand(0).getValueType() != VT ||
+ !isa<ConstantSDNode>(N0.getOperand(1)) ||
+ NumElts <= N0.getConstantOperandVal(1) ||
+ !N->isOnlyUserOf(N0.getNode()))
+ return false;
+ Ops.push_back(N0.getOperand(0));
+ Mask.push_back(N0.getConstantOperandVal(1));
+ Mask.append(NumElts - 1, SM_SentinelUndef);
+ return true;
+ }
+ case X86ISD::PINSRB:
+ case X86ISD::PINSRW: {
+ SDValue InVec = N.getOperand(0);
+ SDValue InScl = N.getOperand(1);
+ uint64_t InIdx = N.getConstantOperandVal(2);
+ assert(InIdx < NumElts && "Illegal insertion index");
+
+ // Attempt to recognise a PINSR*(VEC, 0, Idx) shuffle pattern.
+ if (X86::isZeroNode(InScl)) {
+ Ops.push_back(InVec);
+ for (unsigned i = 0; i != NumElts; ++i)
+ Mask.push_back(i == InIdx ? SM_SentinelZero : (int)i);
+ return true;
+ }
+
+ // Attempt to recognise a PINSR*(ASSERTZEXT(PEXTR*)) shuffle pattern.
+ // TODO: Expand this to support INSERT_VECTOR_ELT/etc.
+ unsigned ExOp =
+ (X86ISD::PINSRB == Opcode ? X86ISD::PEXTRB : X86ISD::PEXTRW);
+ if (InScl.getOpcode() != ISD::AssertZext ||
+ InScl.getOperand(0).getOpcode() != ExOp)
+ return false;
+
+ SDValue ExVec = InScl.getOperand(0).getOperand(0);
+ uint64_t ExIdx = InScl.getOperand(0).getConstantOperandVal(1);
+ assert(ExIdx < NumElts && "Illegal extraction index");
+ Ops.push_back(InVec);
+ Ops.push_back(ExVec);
+ for (unsigned i = 0; i != NumElts; ++i)
+ Mask.push_back(i == InIdx ? NumElts + ExIdx : i);
return true;
}
case X86ISD::VSHLI:
@@ -5795,6 +6000,7 @@ static bool getFauxShuffleMask(SDValue N, SmallVectorImpl<int> &Mask,
}
return true;
}
+ case ISD::ZERO_EXTEND_VECTOR_INREG:
case X86ISD::VZEXT: {
// TODO - add support for VPMOVZX with smaller input vector types.
SDValue Src = N.getOperand(0);
@@ -5810,36 +6016,38 @@ static bool getFauxShuffleMask(SDValue N, SmallVectorImpl<int> &Mask,
return false;
}
+/// Removes unused shuffle source inputs and adjusts the shuffle mask
+/// accordingly.
+static void resolveTargetShuffleInputsAndMask(SmallVectorImpl<SDValue> &Inputs,
+ SmallVectorImpl<int> &Mask) {
+ int MaskWidth = Mask.size();
+ SmallVector<SDValue, 16> UsedInputs;
+ for (int i = 0, e = Inputs.size(); i < e; ++i) {
+ int lo = UsedInputs.size() * MaskWidth;
+ int hi = lo + MaskWidth;
+ if (any_of(Mask, [lo, hi](int i) { return (lo <= i) && (i < hi); })) {
+ UsedInputs.push_back(Inputs[i]);
+ continue;
+ }
+ for (int &M : Mask)
+ if (lo <= M)
+ M -= MaskWidth;
+ }
+ Inputs = UsedInputs;
+}
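// A worked example of the rebasing above: with two 4-wide inputs and mask
// {4, 5, 6, 7}, input 0 is unused; dropping it shifts every index at or past
// its slot down by MaskWidth, leaving {0, 1, 2, 3} over a single input.
#include <cassert>
#include <vector>

static void rebaseExample() {
  std::vector<int> Mask = {4, 5, 6, 7};
  const int MaskWidth = 4;
  const int lo = 0; // slot of the dropped input 0
  for (int &M : Mask)
    if (M >= lo)
      M -= MaskWidth;
  assert(Mask == (std::vector<int>{0, 1, 2, 3}));
}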
+
/// Calls setTargetShuffleZeroElements to resolve a target shuffle mask's inputs
/// and set the SM_SentinelUndef and SM_SentinelZero values. Then check the
/// remaining input indices in case we now have a unary shuffle and adjust the
-/// Op0/Op1 inputs accordingly.
+/// inputs accordingly.
/// Returns true if the target shuffle mask was decoded.
-static bool resolveTargetShuffleInputs(SDValue Op, SDValue &Op0, SDValue &Op1,
+static bool resolveTargetShuffleInputs(SDValue Op,
+ SmallVectorImpl<SDValue> &Inputs,
SmallVectorImpl<int> &Mask) {
- SmallVector<SDValue, 2> Ops;
- if (!setTargetShuffleZeroElements(Op, Mask, Ops))
- if (!getFauxShuffleMask(Op, Mask, Ops))
+ if (!setTargetShuffleZeroElements(Op, Mask, Inputs))
+ if (!getFauxShuffleMask(Op, Mask, Inputs))
return false;
- int NumElts = Mask.size();
- bool Op0InUse = any_of(Mask, [NumElts](int Idx) {
- return 0 <= Idx && Idx < NumElts;
- });
- bool Op1InUse = any_of(Mask, [NumElts](int Idx) { return NumElts <= Idx; });
-
- Op0 = Op0InUse ? Ops[0] : SDValue();
- Op1 = Op1InUse ? Ops[1] : SDValue();
-
- // We're only using Op1 - commute the mask and inputs.
- if (!Op0InUse && Op1InUse) {
- for (int &M : Mask)
- if (NumElts <= M)
- M -= NumElts;
- Op0 = Op1;
- Op1 = SDValue();
- }
-
+ resolveTargetShuffleInputsAndMask(Inputs, Mask);
return true;
}
@@ -5914,10 +6122,9 @@ static SDValue getShuffleScalarElt(SDNode *N, unsigned Index, SelectionDAG &DAG,
/// Custom lower build_vector of v16i8.
static SDValue LowerBuildVectorv16i8(SDValue Op, unsigned NonZeros,
- unsigned NumNonZero, unsigned NumZero,
- SelectionDAG &DAG,
- const X86Subtarget &Subtarget,
- const TargetLowering &TLI) {
+ unsigned NumNonZero, unsigned NumZero,
+ SelectionDAG &DAG,
+ const X86Subtarget &Subtarget) {
if (NumNonZero > 8)
return SDValue();
@@ -5928,18 +6135,26 @@ static SDValue LowerBuildVectorv16i8(SDValue Op, unsigned NonZeros,
// SSE4.1 - use PINSRB to insert each byte directly.
if (Subtarget.hasSSE41()) {
for (unsigned i = 0; i < 16; ++i) {
- bool isNonZero = (NonZeros & (1 << i)) != 0;
- if (isNonZero) {
+ bool IsNonZero = (NonZeros & (1 << i)) != 0;
+ if (IsNonZero) {
+ // If the build vector contains zeros or our first insertion is not the
+ // first index, then insert into a zero vector to break any register
+ // dependency; else use SCALAR_TO_VECTOR/VZEXT_MOVL.
if (First) {
- if (NumZero)
- V = getZeroVector(MVT::v16i8, Subtarget, DAG, dl);
- else
- V = DAG.getUNDEF(MVT::v16i8);
First = false;
+ if (NumZero || 0 != i)
+ V = getZeroVector(MVT::v16i8, Subtarget, DAG, dl);
+ else {
+ assert(0 == i && "Expected insertion into zero-index");
+ V = DAG.getAnyExtOrTrunc(Op.getOperand(i), dl, MVT::i32);
+ V = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4i32, V);
+ V = DAG.getNode(X86ISD::VZEXT_MOVL, dl, MVT::v4i32, V);
+ V = DAG.getBitcast(MVT::v16i8, V);
+ continue;
+ }
}
- V = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl,
- MVT::v16i8, V, Op.getOperand(i),
- DAG.getIntPtrConstant(i, dl));
+ V = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, MVT::v16i8, V,
+ Op.getOperand(i), DAG.getIntPtrConstant(i, dl));
}
}
@@ -5958,24 +6173,35 @@ static SDValue LowerBuildVectorv16i8(SDValue Op, unsigned NonZeros,
}
if ((i & 1) != 0) {
+ // FIXME: Investigate extending to i32 instead of just i16.
+ // FIXME: Investigate combining the first 4 bytes as an i32 instead.
SDValue ThisElt, LastElt;
- bool LastIsNonZero = (NonZeros & (1 << (i-1))) != 0;
+ bool LastIsNonZero = (NonZeros & (1 << (i - 1))) != 0;
if (LastIsNonZero) {
- LastElt = DAG.getNode(ISD::ZERO_EXTEND, dl,
- MVT::i16, Op.getOperand(i-1));
+ LastElt =
+ DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i16, Op.getOperand(i - 1));
}
if (ThisIsNonZero) {
ThisElt = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i16, Op.getOperand(i));
- ThisElt = DAG.getNode(ISD::SHL, dl, MVT::i16,
- ThisElt, DAG.getConstant(8, dl, MVT::i8));
+ ThisElt = DAG.getNode(ISD::SHL, dl, MVT::i16, ThisElt,
+ DAG.getConstant(8, dl, MVT::i8));
if (LastIsNonZero)
ThisElt = DAG.getNode(ISD::OR, dl, MVT::i16, ThisElt, LastElt);
} else
ThisElt = LastElt;
- if (ThisElt.getNode())
- V = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, MVT::v8i16, V, ThisElt,
- DAG.getIntPtrConstant(i/2, dl));
+ if (ThisElt) {
+ if (1 == i) {
+ V = NumZero ? DAG.getZExtOrTrunc(ThisElt, dl, MVT::i32)
+ : DAG.getAnyExtOrTrunc(ThisElt, dl, MVT::i32);
+ V = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4i32, V);
+ V = DAG.getNode(X86ISD::VZEXT_MOVL, dl, MVT::v4i32, V);
+ V = DAG.getBitcast(MVT::v8i16, V);
+ } else {
+ V = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, MVT::v8i16, V, ThisElt,
+ DAG.getIntPtrConstant(i / 2, dl));
+ }
+ }
}
}
@@ -5986,8 +6212,7 @@ static SDValue LowerBuildVectorv16i8(SDValue Op, unsigned NonZeros,
static SDValue LowerBuildVectorv8i16(SDValue Op, unsigned NonZeros,
unsigned NumNonZero, unsigned NumZero,
SelectionDAG &DAG,
- const X86Subtarget &Subtarget,
- const TargetLowering &TLI) {
+ const X86Subtarget &Subtarget) {
if (NumNonZero > 4)
return SDValue();
@@ -5995,18 +6220,26 @@ static SDValue LowerBuildVectorv8i16(SDValue Op, unsigned NonZeros,
SDValue V;
bool First = true;
for (unsigned i = 0; i < 8; ++i) {
- bool isNonZero = (NonZeros & (1 << i)) != 0;
- if (isNonZero) {
+ bool IsNonZero = (NonZeros & (1 << i)) != 0;
+ if (IsNonZero) {
+ // If the build vector contains zeros or our first insertion is not the
+ // first index, then insert into a zero vector to break any register
+ // dependency; else use SCALAR_TO_VECTOR/VZEXT_MOVL.
if (First) {
- if (NumZero)
- V = getZeroVector(MVT::v8i16, Subtarget, DAG, dl);
- else
- V = DAG.getUNDEF(MVT::v8i16);
First = false;
+ if (NumZero || 0 != i)
+ V = getZeroVector(MVT::v8i16, Subtarget, DAG, dl);
+ else {
+ assert(0 == i && "Expected insertion into zero-index");
+ V = DAG.getAnyExtOrTrunc(Op.getOperand(i), dl, MVT::i32);
+ V = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4i32, V);
+ V = DAG.getNode(X86ISD::VZEXT_MOVL, dl, MVT::v4i32, V);
+ V = DAG.getBitcast(MVT::v8i16, V);
+ continue;
+ }
}
- V = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl,
- MVT::v8i16, V, Op.getOperand(i),
- DAG.getIntPtrConstant(i, dl));
+ V = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, MVT::v8i16, V,
+ Op.getOperand(i), DAG.getIntPtrConstant(i, dl));
}
}
@@ -6015,8 +6248,7 @@ static SDValue LowerBuildVectorv8i16(SDValue Op, unsigned NonZeros,
/// Custom lower build_vector of v4i32 or v4f32.
static SDValue LowerBuildVectorv4x32(SDValue Op, SelectionDAG &DAG,
- const X86Subtarget &Subtarget,
- const TargetLowering &TLI) {
+ const X86Subtarget &Subtarget) {
// Find all zeroable elements.
std::bitset<4> Zeroable;
for (int i=0; i < 4; ++i) {
@@ -6212,7 +6444,7 @@ static SDValue LowerAsSplatVectorLoad(SDValue SrcOp, MVT VT, const SDLoc &dl,
///
/// Example: <load i32 *a, load i32 *a+4, zero, undef> -> zextload a
static SDValue EltsFromConsecutiveLoads(EVT VT, ArrayRef<SDValue> Elts,
- SDLoc &DL, SelectionDAG &DAG,
+ const SDLoc &DL, SelectionDAG &DAG,
bool isAfterLegalize) {
unsigned NumElems = Elts.size();
@@ -6376,14 +6608,14 @@ static SDValue EltsFromConsecutiveLoads(EVT VT, ArrayRef<SDValue> Elts,
return SDValue();
}
-static Constant *getConstantVector(MVT VT, APInt SplatValue,
+static Constant *getConstantVector(MVT VT, const APInt &SplatValue,
unsigned SplatBitSize, LLVMContext &C) {
unsigned ScalarSize = VT.getScalarSizeInBits();
unsigned NumElm = SplatBitSize / ScalarSize;
SmallVector<Constant *, 32> ConstantVec;
for (unsigned i = 0; i < NumElm; i++) {
- APInt Val = SplatValue.lshr(ScalarSize * i).trunc(ScalarSize);
+ APInt Val = SplatValue.extractBits(ScalarSize, ScalarSize * i);
Constant *Const;
if (VT.isFloatingPoint()) {
assert((ScalarSize == 32 || ScalarSize == 64) &&
@@ -6664,6 +6896,7 @@ static SDValue buildFromShuffleMostly(SDValue Op, SelectionDAG &DAG) {
SDValue ExtractedFromVec = Op.getOperand(i).getOperand(0);
SDValue ExtIdx = Op.getOperand(i).getOperand(1);
+
// Quit if non-constant index.
if (!isa<ConstantSDNode>(ExtIdx))
return SDValue();
@@ -6694,11 +6927,10 @@ static SDValue buildFromShuffleMostly(SDValue Op, SelectionDAG &DAG) {
VecIn2 = VecIn2.getNode() ? VecIn2 : DAG.getUNDEF(VT);
SDValue NV = DAG.getVectorShuffle(VT, DL, VecIn1, VecIn2, Mask);
- for (unsigned i = 0, e = InsertIndices.size(); i != e; ++i) {
- unsigned Idx = InsertIndices[i];
+
+ for (unsigned Idx : InsertIndices)
NV = DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, VT, NV, Op.getOperand(Idx),
DAG.getIntPtrConstant(Idx, DL));
- }
return NV;
}
@@ -7347,7 +7579,7 @@ static SDValue materializeVectorConstant(SDValue Op, SelectionDAG &DAG,
(VT == MVT::v8i32 && Subtarget.hasInt256()))
return Op;
- return getOnesVector(VT, Subtarget, DAG, DL);
+ return getOnesVector(VT, DAG, DL);
}
return SDValue();
@@ -7418,7 +7650,7 @@ X86TargetLowering::LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG) const {
// a constant pool load than it is to do a movd + shuffle.
if (ExtVT == MVT::i64 && !Subtarget.is64Bit() &&
(!IsAllConstants || Idx == 0)) {
- if (DAG.MaskedValueIsZero(Item, APInt::getBitsSet(64, 32, 64))) {
+ if (DAG.MaskedValueIsZero(Item, APInt::getHighBitsSet(64, 32))) {
// Handle SSE only.
assert(VT == MVT::v2i64 && "Expected an SSE value type!");
MVT VecVT = MVT::v4i32;
@@ -7561,17 +7793,17 @@ X86TargetLowering::LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG) const {
// If element VT is < 32 bits, convert it to inserts into a zero vector.
if (EVTBits == 8 && NumElems == 16)
if (SDValue V = LowerBuildVectorv16i8(Op, NonZeros, NumNonZero, NumZero,
- DAG, Subtarget, *this))
+ DAG, Subtarget))
return V;
if (EVTBits == 16 && NumElems == 8)
if (SDValue V = LowerBuildVectorv8i16(Op, NonZeros, NumNonZero, NumZero,
- DAG, Subtarget, *this))
+ DAG, Subtarget))
return V;
// If element VT is == 32 bits and has 4 elems, try to generate an INSERTPS
if (EVTBits == 32 && NumElems == 4)
- if (SDValue V = LowerBuildVectorv4x32(Op, DAG, Subtarget, *this))
+ if (SDValue V = LowerBuildVectorv4x32(Op, DAG, Subtarget))
return V;
// If element VT is == 32 bits, turn it into a number of shuffles.
@@ -7767,7 +7999,7 @@ static SDValue LowerCONCAT_VECTORSvXi1(SDValue Op,
SDValue IdxVal = DAG.getIntPtrConstant(NumElems/2, dl);
if (V1.isUndef())
- V2 = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, ResVT, Undef, V2, IdxVal);
+ return DAG.getNode(ISD::INSERT_SUBVECTOR, dl, ResVT, Undef, V2, IdxVal);
if (IsZeroV1)
return DAG.getNode(ISD::INSERT_SUBVECTOR, dl, ResVT, ZeroVec, V2, IdxVal);
@@ -7956,7 +8188,7 @@ static bool isShuffleEquivalent(SDValue V1, SDValue V2, ArrayRef<int> Mask,
ExpectedBV->getOperand(ExpectedMask[i] % Size))
return false;
}
-}
+ }
return true;
}
@@ -7986,6 +8218,41 @@ static bool isTargetShuffleEquivalent(ArrayRef<int> Mask,
return true;
}
+// Merges a general DAG shuffle mask and zeroable bit mask into a target shuffle
+// mask.
+static SmallVector<int, 64> createTargetShuffleMask(ArrayRef<int> Mask,
+ const APInt &Zeroable) {
+ int NumElts = Mask.size();
+ assert(NumElts == (int)Zeroable.getBitWidth() && "Mismatch mask sizes");
+
+ SmallVector<int, 64> TargetMask(NumElts, SM_SentinelUndef);
+ for (int i = 0; i != NumElts; ++i) {
+ int M = Mask[i];
+ if (M == SM_SentinelUndef)
+ continue;
+ assert(0 <= M && M < (2 * NumElts) && "Out of range shuffle index");
+ TargetMask[i] = (Zeroable[i] ? SM_SentinelZero : M);
+ }
+ return TargetMask;
+}
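// The merge above in miniature: zeroable lanes override the source index with
// the zero sentinel, undef lanes stay undef, everything else passes through.
// Sentinel values here are illustrative stand-ins for SM_SentinelUndef/Zero:
#include <cstddef>
#include <vector>

enum { kUndef = -1, kZero = -2 };

static std::vector<int> mergeZeroable(const std::vector<int> &Mask,
                                      const std::vector<bool> &Zeroable) {
  std::vector<int> Target(Mask.size(), kUndef);
  for (std::size_t i = 0; i != Mask.size(); ++i)
    if (Mask[i] != kUndef)
      Target[i] = Zeroable[i] ? kZero : Mask[i];
  return Target;
}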
+
+// Check if the shuffle mask is suitable for the AVX vpunpcklwd or vpunpckhwd
+// instructions.
+static bool isUnpackWdShuffleMask(ArrayRef<int> Mask, MVT VT) {
+ if (VT != MVT::v8i32 && VT != MVT::v8f32)
+ return false;
+
+ SmallVector<int, 8> Unpcklwd;
+ createUnpackShuffleMask(MVT::v8i16, Unpcklwd, /* Lo = */ true,
+ /* Unary = */ false);
+ SmallVector<int, 8> Unpckhwd;
+ createUnpackShuffleMask(MVT::v8i16, Unpckhwd, /* Lo = */ false,
+ /* Unary = */ false);
+ bool IsUnpackwdMask = (isTargetShuffleEquivalent(Mask, Unpcklwd) ||
+ isTargetShuffleEquivalent(Mask, Unpckhwd));
+ return IsUnpackwdMask;
+}
+
/// \brief Get a 4-lane 8-bit shuffle immediate for a mask.
///
/// This helper function produces an 8-bit shuffle immediate corresponding to
@@ -8009,7 +8276,7 @@ static unsigned getV4X86ShuffleImm(ArrayRef<int> Mask) {
return Imm;
}
-static SDValue getV4X86ShuffleImm8ForMask(ArrayRef<int> Mask, SDLoc DL,
+static SDValue getV4X86ShuffleImm8ForMask(ArrayRef<int> Mask, const SDLoc &DL,
SelectionDAG &DAG) {
return DAG.getConstant(getV4X86ShuffleImm(Mask), DL, MVT::i8);
}
@@ -8022,9 +8289,9 @@ static SDValue getV4X86ShuffleImm8ForMask(ArrayRef<int> Mask, SDLoc DL,
/// zero. Many x86 shuffles can zero lanes cheaply and we often want to handle
/// as many lanes with this technique as possible to simplify the remaining
/// shuffle.
-static SmallBitVector computeZeroableShuffleElements(ArrayRef<int> Mask,
- SDValue V1, SDValue V2) {
- SmallBitVector Zeroable(Mask.size(), false);
+static APInt computeZeroableShuffleElements(ArrayRef<int> Mask,
+ SDValue V1, SDValue V2) {
+ APInt Zeroable(Mask.size(), 0);
V1 = peekThroughBitcasts(V1);
V2 = peekThroughBitcasts(V2);
@@ -8039,7 +8306,7 @@ static SmallBitVector computeZeroableShuffleElements(ArrayRef<int> Mask,
int M = Mask[i];
// Handle the easy cases.
if (M < 0 || (M >= 0 && M < Size && V1IsZero) || (M >= Size && V2IsZero)) {
- Zeroable[i] = true;
+ Zeroable.setBit(i);
continue;
}
@@ -8057,17 +8324,19 @@ static SmallBitVector computeZeroableShuffleElements(ArrayRef<int> Mask,
int Scale = Size / V->getNumOperands();
SDValue Op = V.getOperand(M / Scale);
if (Op.isUndef() || X86::isZeroNode(Op))
- Zeroable[i] = true;
+ Zeroable.setBit(i);
else if (ConstantSDNode *Cst = dyn_cast<ConstantSDNode>(Op)) {
APInt Val = Cst->getAPIntValue();
Val = Val.lshr((M % Scale) * ScalarSizeInBits);
Val = Val.getLoBits(ScalarSizeInBits);
- Zeroable[i] = (Val == 0);
+ if (Val == 0)
+ Zeroable.setBit(i);
} else if (ConstantFPSDNode *Cst = dyn_cast<ConstantFPSDNode>(Op)) {
APInt Val = Cst->getValueAPF().bitcastToAPInt();
Val = Val.lshr((M % Scale) * ScalarSizeInBits);
Val = Val.getLoBits(ScalarSizeInBits);
- Zeroable[i] = (Val == 0);
+ if (Val == 0)
+ Zeroable.setBit(i);
}
continue;
}
@@ -8081,7 +8350,8 @@ static SmallBitVector computeZeroableShuffleElements(ArrayRef<int> Mask,
SDValue Op = V.getOperand((M * Scale) + j);
AllZeroable &= (Op.isUndef() || X86::isZeroNode(Op));
}
- Zeroable[i] = AllZeroable;
+ if (AllZeroable)
+ Zeroable.setBit(i);
continue;
}
}
@@ -8096,19 +8366,20 @@ static SmallBitVector computeZeroableShuffleElements(ArrayRef<int> Mask,
//
// The function looks for a sub-mask whose nonzero elements are in
// increasing order. If such a sub-mask exists, the function returns true.
-static bool isNonZeroElementsInOrder(const SmallBitVector Zeroable,
- ArrayRef<int> Mask,const EVT &VectorType,
+static bool isNonZeroElementsInOrder(const APInt &Zeroable,
+ ArrayRef<int> Mask, const EVT &VectorType,
bool &IsZeroSideLeft) {
int NextElement = -1;
// Check if the Mask's nonzero elements are in increasing order.
- for (int i = 0, e = Zeroable.size(); i < e; i++) {
+ for (int i = 0, e = Mask.size(); i < e; i++) {
// Checks if the mask's zero elements are built from only zeros.
- if (Mask[i] == -1)
+ assert(Mask[i] >= -1 && "Out of bound mask element!");
+ if (Mask[i] < 0)
return false;
if (Zeroable[i])
continue;
// Find the lowest nonzero element.
- if (NextElement == -1) {
+ if (NextElement < 0) {
NextElement = Mask[i] != 0 ? VectorType.getVectorNumElements() : 0;
IsZeroSideLeft = NextElement != 0;
}
@@ -8124,7 +8395,7 @@ static bool isNonZeroElementsInOrder(const SmallBitVector Zeroable,
static SDValue lowerVectorShuffleWithPSHUFB(const SDLoc &DL, MVT VT,
ArrayRef<int> Mask, SDValue V1,
SDValue V2,
- const SmallBitVector &Zeroable,
+ const APInt &Zeroable,
const X86Subtarget &Subtarget,
SelectionDAG &DAG) {
int Size = Mask.size();
@@ -8179,19 +8450,9 @@ static SDValue getMaskNode(SDValue Mask, MVT MaskVT,
const X86Subtarget &Subtarget, SelectionDAG &DAG,
const SDLoc &dl);
-// Function convertBitVectorToUnsigned - The function gets SmallBitVector
-// as argument and convert him to unsigned.
-// The output of the function is not(zeroable)
-static unsigned convertBitVectorToUnsiged(const SmallBitVector &Zeroable) {
- unsigned convertBit = 0;
- for (int i = 0, e = Zeroable.size(); i < e; i++)
- convertBit |= !(Zeroable[i]) << i;
- return convertBit;
-}
-
// X86 has dedicated shuffle that can be lowered to VEXPAND
static SDValue lowerVectorShuffleToEXPAND(const SDLoc &DL, MVT VT,
- const SmallBitVector &Zeroable,
+ const APInt &Zeroable,
ArrayRef<int> Mask, SDValue &V1,
SDValue &V2, SelectionDAG &DAG,
const X86Subtarget &Subtarget) {
@@ -8199,7 +8460,7 @@ static SDValue lowerVectorShuffleToEXPAND(const SDLoc &DL, MVT VT,
if (!isNonZeroElementsInOrder(Zeroable, Mask, V1.getValueType(),
IsLeftZeroSide))
return SDValue();
- unsigned VEXPANDMask = convertBitVectorToUnsiged(Zeroable);
+ unsigned VEXPANDMask = (~Zeroable).getZExtValue();
MVT IntegerType =
MVT::getIntegerVT(std::max((int)VT.getVectorNumElements(), 8));
SDValue MaskNode = DAG.getConstant(VEXPANDMask, DL, IntegerType);
@@ -8215,6 +8476,91 @@ static SDValue lowerVectorShuffleToEXPAND(const SDLoc &DL, MVT VT,
ZeroVector);
}
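// The (~Zeroable).getZExtValue() change in lowerVectorShuffleToEXPAND above
// exploits that the VEXPAND execution mask is exactly the complement of the
// zeroable lane set: a set bit consumes the next source element, a clear bit
// writes zero. Sketched with std::bitset at a fixed 8 lanes (illustrative):
#include <bitset>
#include <cstdint>

static std::uint8_t expandMaskFromZeroable(std::bitset<8> Zeroable) {
  return static_cast<std::uint8_t>((~Zeroable).to_ulong());
}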
+static bool matchVectorShuffleWithUNPCK(MVT VT, SDValue &V1, SDValue &V2,
+ unsigned &UnpackOpcode, bool IsUnary,
+ ArrayRef<int> TargetMask, SDLoc &DL,
+ SelectionDAG &DAG,
+ const X86Subtarget &Subtarget) {
+ int NumElts = VT.getVectorNumElements();
+
+ bool Undef1 = true, Undef2 = true, Zero1 = true, Zero2 = true;
+ for (int i = 0; i != NumElts; i += 2) {
+ int M1 = TargetMask[i + 0];
+ int M2 = TargetMask[i + 1];
+ Undef1 &= (SM_SentinelUndef == M1);
+ Undef2 &= (SM_SentinelUndef == M2);
+ Zero1 &= isUndefOrZero(M1);
+ Zero2 &= isUndefOrZero(M2);
+ }
+ assert(!((Undef1 || Zero1) && (Undef2 || Zero2)) &&
+ "Zeroable shuffle detected");
+
+ // Attempt to match the target mask against the unpack lo/hi mask patterns.
+ SmallVector<int, 64> Unpckl, Unpckh;
+ createUnpackShuffleMask(VT, Unpckl, /* Lo = */ true, IsUnary);
+ if (isTargetShuffleEquivalent(TargetMask, Unpckl)) {
+ UnpackOpcode = X86ISD::UNPCKL;
+ V2 = (Undef2 ? DAG.getUNDEF(VT) : (IsUnary ? V1 : V2));
+ V1 = (Undef1 ? DAG.getUNDEF(VT) : V1);
+ return true;
+ }
+
+ createUnpackShuffleMask(VT, Unpckh, /* Lo = */ false, IsUnary);
+ if (isTargetShuffleEquivalent(TargetMask, Unpckh)) {
+ UnpackOpcode = X86ISD::UNPCKH;
+ V2 = (Undef2 ? DAG.getUNDEF(VT) : (IsUnary ? V1 : V2));
+ V1 = (Undef1 ? DAG.getUNDEF(VT) : V1);
+ return true;
+ }
+
+ // If an unary shuffle, attempt to match as an unpack lo/hi with zero.
+ if (IsUnary && (Zero1 || Zero2)) {
+ // Don't bother if we can blend instead.
+ if ((Subtarget.hasSSE41() || VT == MVT::v2i64 || VT == MVT::v2f64) &&
+ isSequentialOrUndefOrZeroInRange(TargetMask, 0, NumElts, 0))
+ return false;
+
+ bool MatchLo = true, MatchHi = true;
+ for (int i = 0; (i != NumElts) && (MatchLo || MatchHi); ++i) {
+ int M = TargetMask[i];
+
+ // Ignore if the input is known to be zero or the index is undef.
+ if ((((i & 1) == 0) && Zero1) || (((i & 1) == 1) && Zero2) ||
+ (M == SM_SentinelUndef))
+ continue;
+
+ MatchLo &= (M == Unpckl[i]);
+ MatchHi &= (M == Unpckh[i]);
+ }
+
+ if (MatchLo || MatchHi) {
+ UnpackOpcode = MatchLo ? X86ISD::UNPCKL : X86ISD::UNPCKH;
+ V2 = Zero2 ? getZeroVector(VT, Subtarget, DAG, DL) : V1;
+ V1 = Zero1 ? getZeroVector(VT, Subtarget, DAG, DL) : V1;
+ return true;
+ }
+ }
+
+ // If a binary shuffle, commute and try again.
+ if (!IsUnary) {
+ ShuffleVectorSDNode::commuteMask(Unpckl);
+ if (isTargetShuffleEquivalent(TargetMask, Unpckl)) {
+ UnpackOpcode = X86ISD::UNPCKL;
+ std::swap(V1, V2);
+ return true;
+ }
+
+ ShuffleVectorSDNode::commuteMask(Unpckh);
+ if (isTargetShuffleEquivalent(TargetMask, Unpckh)) {
+ UnpackOpcode = X86ISD::UNPCKH;
+ std::swap(V1, V2);
+ return true;
+ }
+ }
+
+ return false;
+}
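// Reference pattern the matcher above tests against: UNPCKL/UNPCKH interleave
// the low/high halves of their two inputs, so for N elements the lo mask is
// {0, N, 1, N+1, ...} and the hi mask starts at N/2. A unary unpack reuses
// input 0 for both operands. Sketch for a single 128-bit register, ignoring
// the per-lane repetition of wider vectors (illustrative, not LLVM API):
#include <vector>

static std::vector<int> unpackMask(int NumElts, bool Lo, bool Unary) {
  std::vector<int> Mask;
  int Base = Lo ? 0 : NumElts / 2;
  for (int i = 0; i != NumElts / 2; ++i) {
    Mask.push_back(Base + i);                         // element from input 0
    Mask.push_back(Base + i + (Unary ? 0 : NumElts)); // element from input 1
  }
  return Mask;
}
// e.g. unpackMask(4, /*Lo=*/true, /*Unary=*/false) yields {0, 4, 1, 5}.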
+
// X86 has dedicated unpack instructions that can handle specific blend
// operations: UNPCKH and UNPCKL.
static SDValue lowerVectorShuffleWithUNPCK(const SDLoc &DL, MVT VT,
@@ -8248,13 +8594,12 @@ static SDValue lowerVectorShuffleWithUNPCK(const SDLoc &DL, MVT VT,
/// one of the inputs being zeroable.
static SDValue lowerVectorShuffleAsBitMask(const SDLoc &DL, MVT VT, SDValue V1,
SDValue V2, ArrayRef<int> Mask,
- const SmallBitVector &Zeroable,
+ const APInt &Zeroable,
SelectionDAG &DAG) {
assert(!VT.isFloatingPoint() && "Floating point types are not supported");
MVT EltVT = VT.getVectorElementType();
SDValue Zero = DAG.getConstant(0, DL, EltVT);
- SDValue AllOnes =
- DAG.getConstant(APInt::getAllOnesValue(EltVT.getSizeInBits()), DL, EltVT);
+ SDValue AllOnes = DAG.getAllOnesConstant(DL, EltVT);
SmallVector<SDValue, 16> VMaskOps(Mask.size(), Zero);
SDValue V;
for (int i = 0, Size = Mask.size(); i < Size; ++i) {
@@ -8286,10 +8631,8 @@ static SDValue lowerVectorShuffleAsBitBlend(const SDLoc &DL, MVT VT, SDValue V1,
SelectionDAG &DAG) {
assert(VT.isInteger() && "Only supports integer vector types!");
MVT EltVT = VT.getVectorElementType();
- int NumEltBits = EltVT.getSizeInBits();
SDValue Zero = DAG.getConstant(0, DL, EltVT);
- SDValue AllOnes = DAG.getConstant(APInt::getAllOnesValue(NumEltBits), DL,
- EltVT);
+ SDValue AllOnes = DAG.getAllOnesConstant(DL, EltVT);
SmallVector<SDValue, 16> MaskOps;
for (int i = 0, Size = Mask.size(); i < Size; ++i) {
if (Mask[i] >= 0 && Mask[i] != i && Mask[i] != i + Size)
@@ -8307,51 +8650,81 @@ static SDValue lowerVectorShuffleAsBitBlend(const SDLoc &DL, MVT VT, SDValue V1,
return DAG.getNode(ISD::OR, DL, VT, V1, V2);
}
-/// \brief Try to emit a blend instruction for a shuffle.
-///
-/// This doesn't do any checks for the availability of instructions for blending
-/// these values. It relies on the availability of the X86ISD::BLENDI pattern to
-/// be matched in the backend with the type given. What it does check for is
-/// that the shuffle mask is a blend, or convertible into a blend with zero.
-static SDValue lowerVectorShuffleAsBlend(const SDLoc &DL, MVT VT, SDValue V1,
- SDValue V2, ArrayRef<int> Original,
- const SmallBitVector &Zeroable,
- const X86Subtarget &Subtarget,
- SelectionDAG &DAG) {
- bool V1IsZero = ISD::isBuildVectorAllZeros(V1.getNode());
- bool V2IsZero = ISD::isBuildVectorAllZeros(V2.getNode());
- SmallVector<int, 8> Mask(Original.begin(), Original.end());
- bool ForceV1Zero = false, ForceV2Zero = false;
+static SDValue getVectorMaskingNode(SDValue Op, SDValue Mask,
+ SDValue PreservedSrc,
+ const X86Subtarget &Subtarget,
+ SelectionDAG &DAG);
+
+static bool matchVectorShuffleAsBlend(SDValue V1, SDValue V2,
+ MutableArrayRef<int> TargetMask,
+ bool &ForceV1Zero, bool &ForceV2Zero,
+ uint64_t &BlendMask) {
+ bool V1IsZeroOrUndef =
+ V1.isUndef() || ISD::isBuildVectorAllZeros(V1.getNode());
+ bool V2IsZeroOrUndef =
+ V2.isUndef() || ISD::isBuildVectorAllZeros(V2.getNode());
+
+ BlendMask = 0;
+ ForceV1Zero = false, ForceV2Zero = false;
+ assert(TargetMask.size() <= 64 && "Shuffle mask too big for blend mask");
// Attempt to generate the binary blend mask. If an input is zero then
// we can use any lane.
// TODO: generalize the zero matching to any scalar like isShuffleEquivalent.
- unsigned BlendMask = 0;
- for (int i = 0, Size = Mask.size(); i < Size; ++i) {
- int M = Mask[i];
- if (M < 0)
+ for (int i = 0, Size = TargetMask.size(); i < Size; ++i) {
+ int M = TargetMask[i];
+ if (M == SM_SentinelUndef)
continue;
if (M == i)
continue;
if (M == i + Size) {
- BlendMask |= 1u << i;
+ BlendMask |= 1ull << i;
continue;
}
- if (Zeroable[i]) {
- if (V1IsZero) {
+ if (M == SM_SentinelZero) {
+ if (V1IsZeroOrUndef) {
ForceV1Zero = true;
- Mask[i] = i;
+ TargetMask[i] = i;
continue;
}
- if (V2IsZero) {
+ if (V2IsZeroOrUndef) {
ForceV2Zero = true;
- BlendMask |= 1u << i;
- Mask[i] = i + Size;
+ BlendMask |= 1ull << i;
+ TargetMask[i] = i + Size;
continue;
}
}
- return SDValue(); // Shuffled input!
+ return false;
}
+ return true;
+}
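// Worked example: for TargetMask {0, 5, 2, 7} (Size = 4), lanes 1 and 3 take
// M == i + Size from V2, so BlendMask becomes 0b1010; lanes 0 and 2 satisfy
// M == i and stay clear. An SM_SentinelZero lane is redirected to whichever
// input is already zero/undef via ForceV1Zero/ForceV2Zero rather than
// failing the match.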
+
+static uint64_t scaleVectorShuffleBlendMask(uint64_t BlendMask, int Size,
+                                            int Scale) {
+ uint64_t ScaledMask = 0;
+ for (int i = 0; i != Size; ++i)
+ if (BlendMask & (1ull << i))
+ ScaledMask |= ((1ull << Scale) - 1) << (i * Scale);
+ return ScaledMask;
+}
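// Usage sketch: scaleVectorShuffleBlendMask(0b0101, /*Size=*/4, /*Scale=*/2)
// == 0b00110011: each set bit expands into Scale contiguous set bits, so a
// v4i64 blend mask becomes the equivalent blend mask on v8i32 elements.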
+
+/// \brief Try to emit a blend instruction for a shuffle.
+///
+/// This doesn't do any checks for the availability of instructions for blending
+/// these values. It relies on the availability of the X86ISD::BLENDI pattern to
+/// be matched in the backend with the type given. What it does check for is
+/// that the shuffle mask is a blend, or convertible into a blend with zero.
+static SDValue lowerVectorShuffleAsBlend(const SDLoc &DL, MVT VT, SDValue V1,
+ SDValue V2, ArrayRef<int> Original,
+ const APInt &Zeroable,
+ const X86Subtarget &Subtarget,
+ SelectionDAG &DAG) {
+ SmallVector<int, 64> Mask = createTargetShuffleMask(Original, Zeroable);
+
+ uint64_t BlendMask = 0;
+ bool ForceV1Zero = false, ForceV2Zero = false;
+ if (!matchVectorShuffleAsBlend(V1, V2, Mask, ForceV1Zero, ForceV2Zero,
+ BlendMask))
+ return SDValue();
// Create a REAL zero vector - ISD::isBuildVectorAllZeros allows UNDEFs.
if (ForceV1Zero)
@@ -8359,15 +8732,6 @@ static SDValue lowerVectorShuffleAsBlend(const SDLoc &DL, MVT VT, SDValue V1,
if (ForceV2Zero)
V2 = getZeroVector(VT, Subtarget, DAG, DL);
- auto ScaleBlendMask = [](unsigned BlendMask, int Size, int Scale) {
- unsigned ScaledMask = 0;
- for (int i = 0; i != Size; ++i)
- if (BlendMask & (1u << i))
- for (int j = 0; j != Scale; ++j)
- ScaledMask |= 1u << (i * Scale + j);
- return ScaledMask;
- };
-
switch (VT.SimpleTy) {
case MVT::v2f64:
case MVT::v4f32:
@@ -8387,7 +8751,7 @@ static SDValue lowerVectorShuffleAsBlend(const SDLoc &DL, MVT VT, SDValue V1,
if (Subtarget.hasAVX2()) {
// Scale the blend by the number of 32-bit dwords per element.
int Scale = VT.getScalarSizeInBits() / 32;
- BlendMask = ScaleBlendMask(BlendMask, Mask.size(), Scale);
+ BlendMask = scaleVectorShuffleBlendMask(BlendMask, Mask.size(), Scale);
MVT BlendVT = VT.getSizeInBits() > 128 ? MVT::v8i32 : MVT::v4i32;
V1 = DAG.getBitcast(BlendVT, V1);
V2 = DAG.getBitcast(BlendVT, V2);
@@ -8400,7 +8764,7 @@ static SDValue lowerVectorShuffleAsBlend(const SDLoc &DL, MVT VT, SDValue V1,
// For integer shuffles we need to expand the mask and cast the inputs to
// v8i16s prior to blending.
int Scale = 8 / VT.getVectorNumElements();
- BlendMask = ScaleBlendMask(BlendMask, Mask.size(), Scale);
+ BlendMask = scaleVectorShuffleBlendMask(BlendMask, Mask.size(), Scale);
V1 = DAG.getBitcast(MVT::v8i16, V1);
V2 = DAG.getBitcast(MVT::v8i16, V2);
return DAG.getBitcast(VT,
@@ -8417,7 +8781,7 @@ static SDValue lowerVectorShuffleAsBlend(const SDLoc &DL, MVT VT, SDValue V1,
BlendMask = 0;
for (int i = 0; i < 8; ++i)
if (RepeatedMask[i] >= 8)
- BlendMask |= 1u << i;
+ BlendMask |= 1ull << i;
return DAG.getNode(X86ISD::BLENDI, DL, MVT::v16i16, V1, V2,
DAG.getConstant(BlendMask, DL, MVT::i8));
}
@@ -8428,6 +8792,13 @@ static SDValue lowerVectorShuffleAsBlend(const SDLoc &DL, MVT VT, SDValue V1,
assert((VT.is128BitVector() || Subtarget.hasAVX2()) &&
"256-bit byte-blends require AVX2 support!");
+ if (Subtarget.hasBWI() && Subtarget.hasVLX()) {
+ MVT IntegerType =
+ MVT::getIntegerVT(std::max((int)VT.getVectorNumElements(), 8));
+ SDValue MaskNode = DAG.getConstant(BlendMask, DL, IntegerType);
+ return getVectorMaskingNode(V2, MaskNode, V1, Subtarget, DAG);
+ }
+
// Attempt to lower to a bitmask if we can. VPAND is faster than VPBLENDVB.
if (SDValue Masked =
lowerVectorShuffleAsBitMask(DL, VT, V1, V2, Mask, Zeroable, DAG))
@@ -8465,7 +8836,17 @@ static SDValue lowerVectorShuffleAsBlend(const SDLoc &DL, MVT VT, SDValue V1,
VT, DAG.getNode(ISD::VSELECT, DL, BlendVT,
DAG.getBuildVector(BlendVT, DL, VSELECTMask), V1, V2));
}
-
+ case MVT::v16f32:
+ case MVT::v8f64:
+ case MVT::v8i64:
+ case MVT::v16i32:
+ case MVT::v32i16:
+ case MVT::v64i8: {
+ MVT IntegerType =
+ MVT::getIntegerVT(std::max((int)VT.getVectorNumElements(), 8));
+ SDValue MaskNode = DAG.getConstant(BlendMask, DL, IntegerType);
+ return getVectorMaskingNode(V2, MaskNode, V1, Subtarget, DAG);
+ }
default:
llvm_unreachable("Not a supported integer vector type!");
}
@@ -8503,7 +8884,7 @@ static SDValue lowerVectorShuffleAsBlendAndPermute(const SDLoc &DL, MVT VT,
return DAG.getVectorShuffle(VT, DL, V, DAG.getUNDEF(VT), PermuteMask);
}
-/// \brief Generic routine to decompose a shuffle and blend into indepndent
+/// \brief Generic routine to decompose a shuffle and blend into independent
/// blends and permutes.
///
/// This matches the extremely common pattern for handling combined
@@ -8757,7 +9138,7 @@ static SDValue lowerVectorShuffleAsRotate(const SDLoc &DL, MVT VT,
static int matchVectorShuffleAsShift(MVT &ShiftVT, unsigned &Opcode,
unsigned ScalarSizeInBits,
ArrayRef<int> Mask, int MaskOffset,
- const SmallBitVector &Zeroable,
+ const APInt &Zeroable,
const X86Subtarget &Subtarget) {
int Size = Mask.size();
unsigned SizeInBits = Size * ScalarSizeInBits;
@@ -8819,7 +9200,7 @@ static int matchVectorShuffleAsShift(MVT &ShiftVT, unsigned &Opcode,
static SDValue lowerVectorShuffleAsShift(const SDLoc &DL, MVT VT, SDValue V1,
SDValue V2, ArrayRef<int> Mask,
- const SmallBitVector &Zeroable,
+ const APInt &Zeroable,
const X86Subtarget &Subtarget,
SelectionDAG &DAG) {
int Size = Mask.size();
@@ -8855,12 +9236,12 @@ static SDValue lowerVectorShuffleAsShift(const SDLoc &DL, MVT VT, SDValue V1,
/// \brief Try to lower a vector shuffle using SSE4a EXTRQ/INSERTQ.
static SDValue lowerVectorShuffleWithSSE4A(const SDLoc &DL, MVT VT, SDValue V1,
SDValue V2, ArrayRef<int> Mask,
- const SmallBitVector &Zeroable,
+ const APInt &Zeroable,
SelectionDAG &DAG) {
int Size = Mask.size();
int HalfSize = Size / 2;
assert(Size == (int)VT.getVectorNumElements() && "Unexpected mask size");
- assert(!Zeroable.all() && "Fully zeroable shuffle mask");
+ assert(!Zeroable.isAllOnesValue() && "Fully zeroable shuffle mask");
// Upper half must be undefined.
if (!isUndefInRange(Mask, HalfSize, HalfSize))
@@ -8987,7 +9368,7 @@ static SDValue lowerVectorShuffleWithSSE4A(const SDLoc &DL, MVT VT, SDValue V1,
/// Given a specific number of elements, element bit width, and extension
/// stride, produce either a zero or any extension based on the available
/// features of the subtarget. The extended elements are consecutive and
-/// begin and can start from an offseted element index in the input; to
+/// can begin at an offset element index in the input; to
/// avoid excess shuffling the offset must either be in the bottom lane
/// or at the start of a higher lane. All extended elements must be from
/// the same lane.
@@ -9027,21 +9408,14 @@ static SDValue lowerVectorShuffleAsSpecificZeroOrAnyExtend(
// Found a valid zext mask! Try various lowering strategies based on the
// input type and available ISA extensions.
if (Subtarget.hasSSE41()) {
- // Not worth offseting 128-bit vectors if scale == 2, a pattern using
+ // Not worth offsetting 128-bit vectors if scale == 2, a pattern using
// PUNPCK will catch this in a later shuffle match.
if (Offset && Scale == 2 && VT.is128BitVector())
return SDValue();
MVT ExtVT = MVT::getVectorVT(MVT::getIntegerVT(EltBits * Scale),
NumElements / Scale);
InputV = ShuffleOffset(InputV);
-
- // For 256-bit vectors, we only need the lower (128-bit) input half.
- // For 512-bit vectors, we only need the lower input half or quarter.
- if (VT.getSizeInBits() > 128)
- InputV = extractSubVector(InputV, 0, DAG, DL,
- std::max(128, (int)VT.getSizeInBits() / Scale));
-
- InputV = DAG.getNode(X86ISD::VZEXT, DL, ExtVT, InputV);
+ InputV = getExtendInVec(X86ISD::VZEXT, DL, ExtVT, InputV, DAG);
return DAG.getBitcast(VT, InputV);
}
@@ -9158,7 +9532,7 @@ static SDValue lowerVectorShuffleAsSpecificZeroOrAnyExtend(
/// are both incredibly common and often quite performance sensitive.
static SDValue lowerVectorShuffleAsZeroOrAnyExtend(
const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef<int> Mask,
- const SmallBitVector &Zeroable, const X86Subtarget &Subtarget,
+ const APInt &Zeroable, const X86Subtarget &Subtarget,
SelectionDAG &DAG) {
int Bits = VT.getSizeInBits();
int NumLanes = Bits / 128;
@@ -9314,7 +9688,7 @@ static bool isShuffleFoldableLoad(SDValue V) {
/// across all subtarget feature sets.
static SDValue lowerVectorShuffleAsElementInsertion(
const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef<int> Mask,
- const SmallBitVector &Zeroable, const X86Subtarget &Subtarget,
+ const APInt &Zeroable, const X86Subtarget &Subtarget,
SelectionDAG &DAG) {
MVT ExtVT = VT;
MVT EltVT = VT.getVectorElementType();
@@ -9612,7 +9986,16 @@ static SDValue lowerVectorShuffleAsBroadcast(const SDLoc &DL, MVT VT,
if (((BroadcastIdx * EltSize) % 128) != 0)
return SDValue();
- MVT ExtVT = MVT::getVectorVT(VT.getScalarType(), 128 / EltSize);
+ // The shuffle input might have been a bitcast we looked through; look at
+ // the original input vector. Emit an EXTRACT_SUBVECTOR of that type; we'll
+ // later bitcast it to BroadcastVT.
+ MVT SrcVT = V.getSimpleValueType();
+ assert(SrcVT.getScalarSizeInBits() == BroadcastVT.getScalarSizeInBits() &&
+ "Unexpected vector element size");
+ assert((SrcVT.is256BitVector() || SrcVT.is512BitVector()) &&
+ "Unexpected vector size");
+
+ MVT ExtVT = MVT::getVectorVT(SrcVT.getScalarType(), 128 / EltSize);
V = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, ExtVT, V,
DAG.getIntPtrConstant(BroadcastIdx, DL));
}
@@ -9642,6 +10025,12 @@ static SDValue lowerVectorShuffleAsBroadcast(const SDLoc &DL, MVT VT,
BroadcastVT = MVT::getVectorVT(MVT::f64, NumBroadcastElts);
}
+ // We only support broadcasting from 128-bit vectors to minimize the
+ // number of patterns we need to deal with in isel. So extract down to
+ // 128 bits.
+ if (SrcVT.getSizeInBits() > 128)
+ V = extract128BitVector(V, 0, DAG, DL);
+
return DAG.getBitcast(VT, DAG.getNode(Opcode, DL, BroadcastVT, V));
}
@@ -9653,7 +10042,7 @@ static SDValue lowerVectorShuffleAsBroadcast(const SDLoc &DL, MVT VT,
// elements are zeroable.
static bool matchVectorShuffleAsInsertPS(SDValue &V1, SDValue &V2,
unsigned &InsertPSMask,
- const SmallBitVector &Zeroable,
+ const APInt &Zeroable,
ArrayRef<int> Mask,
SelectionDAG &DAG) {
assert(V1.getSimpleValueType().is128BitVector() && "Bad operand type!");
@@ -9742,7 +10131,7 @@ static bool matchVectorShuffleAsInsertPS(SDValue &V1, SDValue &V2,
static SDValue lowerVectorShuffleAsInsertPS(const SDLoc &DL, SDValue V1,
SDValue V2, ArrayRef<int> Mask,
- const SmallBitVector &Zeroable,
+ const APInt &Zeroable,
SelectionDAG &DAG) {
assert(V1.getSimpleValueType() == MVT::v4f32 && "Bad operand type!");
assert(V2.getSimpleValueType() == MVT::v4f32 && "Bad operand type!");
@@ -9877,7 +10266,7 @@ static SDValue lowerVectorShuffleAsPermuteAndUnpack(const SDLoc &DL, MVT VT,
/// it is better to avoid lowering through this for integer vectors where
/// possible.
static SDValue lowerV2F64VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
- const SmallBitVector &Zeroable,
+ const APInt &Zeroable,
SDValue V1, SDValue V2,
const X86Subtarget &Subtarget,
SelectionDAG &DAG) {
@@ -9959,7 +10348,7 @@ static SDValue lowerV2F64VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
/// it falls back to the floating point shuffle operation with appropriate bit
/// casting.
static SDValue lowerV2I64VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
- const SmallBitVector &Zeroable,
+ const APInt &Zeroable,
SDValue V1, SDValue V2,
const X86Subtarget &Subtarget,
SelectionDAG &DAG) {
@@ -10178,7 +10567,7 @@ static SDValue lowerVectorShuffleWithSHUFPS(const SDLoc &DL, MVT VT,
/// domain crossing penalties, as these are sufficient to implement all v4f32
/// shuffles.
static SDValue lowerV4F32VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
- const SmallBitVector &Zeroable,
+ const APInt &Zeroable,
SDValue V1, SDValue V2,
const X86Subtarget &Subtarget,
SelectionDAG &DAG) {
@@ -10261,7 +10650,7 @@ static SDValue lowerV4F32VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
/// We try to handle these with integer-domain shuffles where we can, but for
/// blends we use the floating point domain blend instructions.
static SDValue lowerV4I32VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
- const SmallBitVector &Zeroable,
+ const APInt &Zeroable,
SDValue V1, SDValue V2,
const X86Subtarget &Subtarget,
SelectionDAG &DAG) {
@@ -10353,7 +10742,7 @@ static SDValue lowerV4I32VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
// We implement this with SHUFPS because it can blend from two vectors.
// Because we're going to eventually use SHUFPS, we use SHUFPS even to build
- // up the inputs, bypassing domain shift penalties that we would encur if we
+ // up the inputs, bypassing domain shift penalties that we would incur if we
// directly used PSHUFD on Nehalem and older. For newer chips, this isn't
// relevant.
SDValue CastV1 = DAG.getBitcast(MVT::v4f32, V1);
@@ -10384,18 +10773,16 @@ static SDValue lowerV8I16GeneralSingleInputVectorShuffle(
assert(VT.getVectorElementType() == MVT::i16 && "Bad input type!");
MVT PSHUFDVT = MVT::getVectorVT(MVT::i32, VT.getVectorNumElements() / 2);
- assert(Mask.size() == 8 && "Shuffle mask length doen't match!");
+ assert(Mask.size() == 8 && "Shuffle mask length doesn't match!");
MutableArrayRef<int> LoMask = Mask.slice(0, 4);
MutableArrayRef<int> HiMask = Mask.slice(4, 4);
SmallVector<int, 4> LoInputs;
- std::copy_if(LoMask.begin(), LoMask.end(), std::back_inserter(LoInputs),
- [](int M) { return M >= 0; });
+ copy_if(LoMask, std::back_inserter(LoInputs), [](int M) { return M >= 0; });
std::sort(LoInputs.begin(), LoInputs.end());
LoInputs.erase(std::unique(LoInputs.begin(), LoInputs.end()), LoInputs.end());
SmallVector<int, 4> HiInputs;
- std::copy_if(HiMask.begin(), HiMask.end(), std::back_inserter(HiInputs),
- [](int M) { return M >= 0; });
+ copy_if(HiMask, std::back_inserter(HiInputs), [](int M) { return M >= 0; });
std::sort(HiInputs.begin(), HiInputs.end());
HiInputs.erase(std::unique(HiInputs.begin(), HiInputs.end()), HiInputs.end());
int NumLToL =
@@ -10574,7 +10961,7 @@ static SDValue lowerV8I16GeneralSingleInputVectorShuffle(
};
if ((NumLToL == 3 && NumHToL == 1) || (NumLToL == 1 && NumHToL == 3))
return balanceSides(LToLInputs, HToLInputs, HToHInputs, LToHInputs, 0, 4);
- else if ((NumHToH == 3 && NumLToH == 1) || (NumHToH == 1 && NumLToH == 3))
+ if ((NumHToH == 3 && NumLToH == 1) || (NumHToH == 1 && NumLToH == 3))
return balanceSides(HToHInputs, LToHInputs, LToLInputs, HToLInputs, 4, 0);
// At this point there are at most two inputs to the low and high halves from
@@ -10830,7 +11217,7 @@ static SDValue lowerV8I16GeneralSingleInputVectorShuffle(
/// blend if only one input is used.
static SDValue lowerVectorShuffleAsBlendOfPSHUFBs(
const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef<int> Mask,
- const SmallBitVector &Zeroable, SelectionDAG &DAG, bool &V1InUse,
+ const APInt &Zeroable, SelectionDAG &DAG, bool &V1InUse,
bool &V2InUse) {
SDValue V1Mask[16];
SDValue V2Mask[16];
@@ -10891,7 +11278,7 @@ static SDValue lowerVectorShuffleAsBlendOfPSHUFBs(
/// halves of the inputs separately (making them have relatively few inputs)
/// and then concatenate them.
static SDValue lowerV8I16VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
- const SmallBitVector &Zeroable,
+ const APInt &Zeroable,
SDValue V1, SDValue V2,
const X86Subtarget &Subtarget,
SelectionDAG &DAG) {
@@ -11075,7 +11462,7 @@ static int canLowerByDroppingEvenElements(ArrayRef<int> Mask,
/// the existing lowering for v8i16 blends on each half, finally PACK-ing them
/// back together.
static SDValue lowerV16I8VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
- const SmallBitVector &Zeroable,
+ const APInt &Zeroable,
SDValue V1, SDValue V2,
const X86Subtarget &Subtarget,
SelectionDAG &DAG) {
@@ -11132,14 +11519,13 @@ static SDValue lowerV16I8VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
if (!canWidenViaDuplication(Mask))
return SDValue();
SmallVector<int, 4> LoInputs;
- std::copy_if(Mask.begin(), Mask.end(), std::back_inserter(LoInputs),
- [](int M) { return M >= 0 && M < 8; });
+ copy_if(Mask, std::back_inserter(LoInputs),
+ [](int M) { return M >= 0 && M < 8; });
std::sort(LoInputs.begin(), LoInputs.end());
LoInputs.erase(std::unique(LoInputs.begin(), LoInputs.end()),
LoInputs.end());
SmallVector<int, 4> HiInputs;
- std::copy_if(Mask.begin(), Mask.end(), std::back_inserter(HiInputs),
- [](int M) { return M >= 8; });
+ copy_if(Mask, std::back_inserter(HiInputs), [](int M) { return M >= 8; });
std::sort(HiInputs.begin(), HiInputs.end());
HiInputs.erase(std::unique(HiInputs.begin(), HiInputs.end()),
HiInputs.end());
@@ -11193,7 +11579,7 @@ static SDValue lowerV16I8VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
PostDupI16Shuffle[i / 2] = MappedMask;
else
assert(PostDupI16Shuffle[i / 2] == MappedMask &&
- "Conflicting entrties in the original shuffle!");
+ "Conflicting entries in the original shuffle!");
}
return DAG.getBitcast(
MVT::v16i8,
@@ -11365,7 +11751,7 @@ static SDValue lowerV16I8VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
/// dispatches to the lowering routines accordingly.
static SDValue lower128BitVectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
MVT VT, SDValue V1, SDValue V2,
- const SmallBitVector &Zeroable,
+ const APInt &Zeroable,
const X86Subtarget &Subtarget,
SelectionDAG &DAG) {
switch (VT.SimpleTy) {
@@ -11621,7 +12007,7 @@ static SDValue lowerVectorShuffleAsLanePermuteAndBlend(const SDLoc &DL, MVT VT,
/// \brief Handle lowering 2-lane 128-bit shuffles.
static SDValue lowerV2X128VectorShuffle(const SDLoc &DL, MVT VT, SDValue V1,
SDValue V2, ArrayRef<int> Mask,
- const SmallBitVector &Zeroable,
+ const APInt &Zeroable,
const X86Subtarget &Subtarget,
SelectionDAG &DAG) {
SmallVector<int, 4> WidenedMask;
@@ -12091,7 +12477,7 @@ static bool matchVectorShuffleWithSHUFPD(MVT VT, SDValue &V1, SDValue &V2,
unsigned &ShuffleImm,
ArrayRef<int> Mask) {
int NumElts = VT.getVectorNumElements();
- assert(VT.getScalarType() == MVT::f64 &&
+ assert(VT.getScalarSizeInBits() == 64 &&
(NumElts == 2 || NumElts == 4 || NumElts == 8) &&
"Unexpected data type for VSHUFPD");
@@ -12127,6 +12513,9 @@ static bool matchVectorShuffleWithSHUFPD(MVT VT, SDValue &V1, SDValue &V2,
static SDValue lowerVectorShuffleWithSHUFPD(const SDLoc &DL, MVT VT,
ArrayRef<int> Mask, SDValue V1,
SDValue V2, SelectionDAG &DAG) {
+ assert((VT == MVT::v2f64 || VT == MVT::v4f64 || VT == MVT::v8f64) &&
+ "Unexpected data type for VSHUFPD");
+
unsigned Immediate = 0;
if (!matchVectorShuffleWithSHUFPD(VT, V1, V2, Immediate, Mask))
return SDValue();
@@ -12153,7 +12542,7 @@ static SDValue lowerVectorShuffleWithPERMV(const SDLoc &DL, MVT VT,
/// Also ends up handling lowering of 4-lane 64-bit integer shuffles when AVX2
/// isn't available.
static SDValue lowerV4F64VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
- const SmallBitVector &Zeroable,
+ const APInt &Zeroable,
SDValue V1, SDValue V2,
const X86Subtarget &Subtarget,
SelectionDAG &DAG) {
@@ -12250,7 +12639,7 @@ static SDValue lowerV4F64VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
/// This routine is only called when we have AVX2 and thus a reasonable
/// instruction set for v4i64 shuffling..
static SDValue lowerV4I64VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
- const SmallBitVector &Zeroable,
+ const APInt &Zeroable,
SDValue V1, SDValue V2,
const X86Subtarget &Subtarget,
SelectionDAG &DAG) {
@@ -12338,7 +12727,7 @@ static SDValue lowerV4I64VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
/// Also ends up handling lowering of 8-lane 32-bit integer shuffles when AVX2
/// isn't available.
static SDValue lowerV8F32VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
- const SmallBitVector &Zeroable,
+ const APInt &Zeroable,
SDValue V1, SDValue V2,
const X86Subtarget &Subtarget,
SelectionDAG &DAG) {
@@ -12414,6 +12803,14 @@ static SDValue lowerV8F32VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
V1, V2, DAG, Subtarget))
return V;
+ // For non-AVX512, if the mask is of 16-bit elements in lane, try to split
+ // since after the split we get more efficient code, using vpunpcklwd and
+ // vpunpckhwd instead of vblend.
+ if (!Subtarget.hasAVX512() && isUnpackWdShuffleMask(Mask, MVT::v8f32))
+ if (SDValue V = lowerVectorShuffleAsSplitOrBlend(DL, MVT::v8f32, V1, V2,
+ Mask, DAG))
+ return V;
+
// If we have AVX2 then we always want to lower with a blend because at v8 we
// can fully permute the elements.
if (Subtarget.hasAVX2())
@@ -12429,7 +12826,7 @@ static SDValue lowerV8F32VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
/// This routine is only called when we have AVX2 and thus a reasonable
/// instruction set for v8i32 shuffling..
static SDValue lowerV8I32VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
- const SmallBitVector &Zeroable,
+ const APInt &Zeroable,
SDValue V1, SDValue V2,
const X86Subtarget &Subtarget,
SelectionDAG &DAG) {
@@ -12445,6 +12842,15 @@ static SDValue lowerV8I32VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
DL, MVT::v8i32, V1, V2, Mask, Zeroable, Subtarget, DAG))
return ZExt;
+ // For non-AVX512, if the mask is of 16-bit elements in lane, try to split
+ // since after the split we get more efficient code than vblend by using
+ // vpunpcklwd and vpunpckhwd.
+ if (isUnpackWdShuffleMask(Mask, MVT::v8i32) && !V2.isUndef() &&
+ !Subtarget.hasAVX512())
+ if (SDValue V =
+ lowerVectorShuffleAsSplitOrBlend(DL, MVT::v8i32, V1, V2, Mask, DAG))
+ return V;
+
if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v8i32, V1, V2, Mask,
Zeroable, Subtarget, DAG))
return Blend;
@@ -12533,7 +12939,7 @@ static SDValue lowerV8I32VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
/// This routine is only called when we have AVX2 and thus a reasonable
/// instruction set for v16i16 shuffling..
static SDValue lowerV16I16VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
- const SmallBitVector &Zeroable,
+ const APInt &Zeroable,
SDValue V1, SDValue V2,
const X86Subtarget &Subtarget,
SelectionDAG &DAG) {
@@ -12619,7 +13025,7 @@ static SDValue lowerV16I16VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
/// This routine is only called when we have AVX2 and thus a reasonable
/// instruction set for v32i8 shuffling..
static SDValue lowerV32I8VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
- const SmallBitVector &Zeroable,
+ const APInt &Zeroable,
SDValue V1, SDValue V2,
const X86Subtarget &Subtarget,
SelectionDAG &DAG) {
@@ -12692,7 +13098,7 @@ static SDValue lowerV32I8VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
/// together based on the available instructions.
static SDValue lower256BitVectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
MVT VT, SDValue V1, SDValue V2,
- const SmallBitVector &Zeroable,
+ const APInt &Zeroable,
const X86Subtarget &Subtarget,
SelectionDAG &DAG) {
// If we have a single input to the zero element, insert that into V1 if we
@@ -12844,7 +13250,7 @@ static SDValue lowerV4X128VectorShuffle(const SDLoc &DL, MVT VT,
/// \brief Handle lowering of 8-lane 64-bit floating point shuffles.
static SDValue lowerV8F64VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
- const SmallBitVector &Zeroable,
+ const APInt &Zeroable,
SDValue V1, SDValue V2,
const X86Subtarget &Subtarget,
SelectionDAG &DAG) {
@@ -12891,12 +13297,16 @@ static SDValue lowerV8F64VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
V2, DAG, Subtarget))
return V;
+ if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v8f64, V1, V2, Mask,
+ Zeroable, Subtarget, DAG))
+ return Blend;
+
return lowerVectorShuffleWithPERMV(DL, MVT::v8f64, Mask, V1, V2, DAG);
}
/// \brief Handle lowering of 16-lane 32-bit floating point shuffles.
-static SDValue lowerV16F32VectorShuffle(SDLoc DL, ArrayRef<int> Mask,
- const SmallBitVector &Zeroable,
+static SDValue lowerV16F32VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
+ const APInt &Zeroable,
SDValue V1, SDValue V2,
const X86Subtarget &Subtarget,
SelectionDAG &DAG) {
@@ -12925,6 +13335,10 @@ static SDValue lowerV16F32VectorShuffle(SDLoc DL, ArrayRef<int> Mask,
lowerVectorShuffleWithUNPCK(DL, MVT::v16f32, Mask, V1, V2, DAG))
return Unpck;
+ if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v16f32, V1, V2, Mask,
+ Zeroable, Subtarget, DAG))
+ return Blend;
+
// Otherwise, fall back to a SHUFPS sequence.
return lowerVectorShuffleWithSHUFPS(DL, MVT::v16f32, RepeatedMask, V1, V2, DAG);
}
@@ -12938,7 +13352,7 @@ static SDValue lowerV16F32VectorShuffle(SDLoc DL, ArrayRef<int> Mask,
/// \brief Handle lowering of 8-lane 64-bit integer shuffles.
static SDValue lowerV8I64VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
- const SmallBitVector &Zeroable,
+ const APInt &Zeroable,
SDValue V1, SDValue V2,
const X86Subtarget &Subtarget,
SelectionDAG &DAG) {
@@ -12994,12 +13408,16 @@ static SDValue lowerV8I64VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
V2, DAG, Subtarget))
return V;
+ if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v8i64, V1, V2, Mask,
+ Zeroable, Subtarget, DAG))
+ return Blend;
+
return lowerVectorShuffleWithPERMV(DL, MVT::v8i64, Mask, V1, V2, DAG);
}
/// \brief Handle lowering of 16-lane 32-bit integer shuffles.
static SDValue lowerV16I32VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
- const SmallBitVector &Zeroable,
+ const APInt &Zeroable,
SDValue V1, SDValue V2,
const X86Subtarget &Subtarget,
SelectionDAG &DAG) {
@@ -13062,12 +13480,15 @@ static SDValue lowerV16I32VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
V1, V2, DAG, Subtarget))
return V;
+ if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v16i32, V1, V2, Mask,
+ Zeroable, Subtarget, DAG))
+ return Blend;
return lowerVectorShuffleWithPERMV(DL, MVT::v16i32, Mask, V1, V2, DAG);
}
/// \brief Handle lowering of 32-lane 16-bit integer shuffles.
static SDValue lowerV32I16VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
- const SmallBitVector &Zeroable,
+ const APInt &Zeroable,
SDValue V1, SDValue V2,
const X86Subtarget &Subtarget,
SelectionDAG &DAG) {
@@ -13109,12 +13530,16 @@ static SDValue lowerV32I16VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
}
}
+ if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v32i16, V1, V2, Mask,
+ Zeroable, Subtarget, DAG))
+ return Blend;
+
return lowerVectorShuffleWithPERMV(DL, MVT::v32i16, Mask, V1, V2, DAG);
}
/// \brief Handle lowering of 64-lane 8-bit integer shuffles.
static SDValue lowerV64I8VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
- const SmallBitVector &Zeroable,
+ const APInt &Zeroable,
SDValue V1, SDValue V2,
const X86Subtarget &Subtarget,
SelectionDAG &DAG) {
@@ -13159,6 +13584,10 @@ static SDValue lowerV64I8VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
DL, MVT::v64i8, V1, V2, Mask, Subtarget, DAG))
return V;
+ if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v64i8, V1, V2, Mask,
+ Zeroable, Subtarget, DAG))
+ return Blend;
+
// FIXME: Implement direct support for this type!
return splitAndLowerVectorShuffle(DL, MVT::v64i8, V1, V2, Mask, DAG);
}
@@ -13170,7 +13599,7 @@ static SDValue lowerV64I8VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
/// together based on the available instructions.
static SDValue lower512BitVectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
MVT VT, SDValue V1, SDValue V2,
- const SmallBitVector &Zeroable,
+ const APInt &Zeroable,
const X86Subtarget &Subtarget,
SelectionDAG &DAG) {
assert(Subtarget.hasAVX512() &&
@@ -13251,7 +13680,7 @@ static SDValue lower1BitVectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
if (ISD::isBuildVectorAllZeros(V1.getNode()))
V1 = getZeroVector(ExtVT, Subtarget, DAG, DL);
else if (ISD::isBuildVectorAllOnes(V1.getNode()))
- V1 = getOnesVector(ExtVT, Subtarget, DAG, DL);
+ V1 = getOnesVector(ExtVT, DAG, DL);
else
V1 = DAG.getNode(ISD::SIGN_EXTEND, DL, ExtVT, V1);
@@ -13260,7 +13689,7 @@ static SDValue lower1BitVectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
else if (ISD::isBuildVectorAllZeros(V2.getNode()))
V2 = getZeroVector(ExtVT, Subtarget, DAG, DL);
else if (ISD::isBuildVectorAllOnes(V2.getNode()))
- V2 = getOnesVector(ExtVT, Subtarget, DAG, DL);
+ V2 = getOnesVector(ExtVT, DAG, DL);
else
V2 = DAG.getNode(ISD::SIGN_EXTEND, DL, ExtVT, V2);
@@ -13392,8 +13821,8 @@ static SDValue lowerVectorShuffle(SDValue Op, const X86Subtarget &Subtarget,
// We actually see shuffles that are entirely re-arrangements of a set of
// zero inputs. This mostly happens while decomposing complex shuffles into
// simple ones. Directly lower these as a buildvector of zeros.
- SmallBitVector Zeroable = computeZeroableShuffleElements(Mask, V1, V2);
- if (Zeroable.all())
+ APInt Zeroable = computeZeroableShuffleElements(Mask, V1, V2);
+ if (Zeroable.isAllOnesValue())
return getZeroVector(VT, Subtarget, DAG, DL);
// Try to collapse shuffles into using a vector type with fewer elements but
@@ -13569,10 +13998,14 @@ X86TargetLowering::ExtractBitFromMaskVector(SDValue Op, SelectionDAG &DAG) const
"Unexpected vector type in ExtractBitFromMaskVector");
// variable index can't be handled in mask registers,
- // extend vector to VR512
+ // extend vector to VR512/128
if (!isa<ConstantSDNode>(Idx)) {
- MVT ExtVT = (VecVT == MVT::v8i1 ? MVT::v8i64 : MVT::v16i32);
- SDValue Ext = DAG.getNode(ISD::ZERO_EXTEND, dl, ExtVT, Vec);
+ unsigned NumElts = VecVT.getVectorNumElements();
+ // Extending v8i1/v16i1 to 512 bits gets better performance on KNL
+ // than extending to 128/256 bits.
+ unsigned VecSize = (NumElts <= 4 ? 128 : 512);
+ MVT ExtVT = MVT::getVectorVT(MVT::getIntegerVT(VecSize/NumElts), NumElts);
+ SDValue Ext = DAG.getNode(ISD::SIGN_EXTEND, dl, ExtVT, Vec);
SDValue Elt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl,
ExtVT.getVectorElementType(), Ext, Idx);
return DAG.getNode(ISD::TRUNCATE, dl, EltVT, Elt);
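  // Illustrative example: a variable-index extract from v8i1 or v16i1
  // sign-extends to v8i64 or v16i32 (VecSize = 512), extracts the wide
  // element, and truncates back to i1; v2i1/v4i1 take the 128-bit path
  // (v2i64/v4i32).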
@@ -13590,9 +14023,9 @@ X86TargetLowering::ExtractBitFromMaskVector(SDValue Op, SelectionDAG &DAG) const
}
unsigned MaxSift = VecVT.getVectorNumElements() - 1;
if (MaxSift - IdxVal)
- Vec = DAG.getNode(X86ISD::VSHLI, dl, VecVT, Vec,
+ Vec = DAG.getNode(X86ISD::KSHIFTL, dl, VecVT, Vec,
DAG.getConstant(MaxSift - IdxVal, dl, MVT::i8));
- Vec = DAG.getNode(X86ISD::VSRLI, dl, VecVT, Vec,
+ Vec = DAG.getNode(X86ISD::KSHIFTR, dl, VecVT, Vec,
DAG.getConstant(MaxSift, dl, MVT::i8));
return DAG.getNode(X86ISD::VEXTRACT, dl, MVT::i1, Vec,
DAG.getIntPtrConstant(0, dl));
@@ -13610,24 +14043,36 @@ X86TargetLowering::LowerEXTRACT_VECTOR_ELT(SDValue Op,
return ExtractBitFromMaskVector(Op, DAG);
if (!isa<ConstantSDNode>(Idx)) {
- if (VecVT.is512BitVector() ||
- (VecVT.is256BitVector() && Subtarget.hasInt256() &&
- VecVT.getScalarSizeInBits() == 32)) {
-
- MVT MaskEltVT =
- MVT::getIntegerVT(VecVT.getScalarSizeInBits());
- MVT MaskVT = MVT::getVectorVT(MaskEltVT, VecVT.getSizeInBits() /
- MaskEltVT.getSizeInBits());
+ // It's more profitable to go through memory (1 cycle throughput)
+ // than to use a VMOVD + VPERMV/PSHUFB sequence (2/3 cycles throughput).
+ // The IACA tool was used to get the performance estimates
+ // (https://software.intel.com/en-us/articles/intel-architecture-code-analyzer).
+ //
+ // Example: extractelement <16 x i8> %a, i32 %i
+ //
+ // Block Throughput: 3.00 Cycles
+ // Throughput Bottleneck: Port5
+ //
+ // | Num Of | Ports pressure in cycles | |
+ // | Uops | 0 - DV | 5 | 6 | 7 | |
+ // ---------------------------------------------
+ // | 1 | | 1.0 | | | CP | vmovd xmm1, edi
+ // | 1 | | 1.0 | | | CP | vpshufb xmm0, xmm0, xmm1
+ // | 2 | 1.0 | 1.0 | | | CP | vpextrb eax, xmm0, 0x0
+ // Total Num Of Uops: 4
+ //
+ //
+ // Block Throughput: 1.00 Cycles
+ // Throughput Bottleneck: PORT2_AGU, PORT3_AGU, Port4
+ //
+ // | | Ports pressure in cycles | |
+ // |Uops| 1 | 2 - D |3 - D | 4 | 5 | |
+ // ---------------------------------------------------------
+ // |2^ | | 0.5 | 0.5 |1.0| |CP| vmovaps xmmword ptr [rsp-0x18], xmm0
+ // |1 |0.5| | | |0.5| | lea rax, ptr [rsp-0x18]
+ // |1 | |0.5, 0.5|0.5, 0.5| | |CP| mov al, byte ptr [rdi+rax*1]
+ // Total Num Of Uops: 4
- Idx = DAG.getZExtOrTrunc(Idx, dl, MaskEltVT);
- auto PtrVT = getPointerTy(DAG.getDataLayout());
- SDValue Mask = DAG.getNode(X86ISD::VINSERT, dl, MaskVT,
- getZeroVector(MaskVT, Subtarget, DAG, dl), Idx,
- DAG.getConstant(0, dl, PtrVT));
- SDValue Perm = DAG.getNode(X86ISD::VPERMV, dl, VecVT, Mask, Vec);
- return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, Op.getValueType(), Perm,
- DAG.getConstant(0, dl, PtrVT));
- }
return SDValue();
}
@@ -13675,7 +14120,33 @@ X86TargetLowering::LowerEXTRACT_VECTOR_ELT(SDValue Op,
if (SDValue Res = LowerEXTRACT_VECTOR_ELT_SSE4(Op, DAG))
return Res;
- // TODO: handle v16i8.
+ // TODO: We only extract a single element from v16i8; we can probably afford
+ // to be more aggressive here before using the default approach of spilling
+ // to the stack.
+ if (VT.getSizeInBits() == 8 && Op->isOnlyUserOf(Vec.getNode())) {
+ // Extract either the lowest i32 or any i16, and extract the sub-byte.
+ int DWordIdx = IdxVal / 4;
+ if (DWordIdx == 0) {
+ SDValue Res = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i32,
+ DAG.getBitcast(MVT::v4i32, Vec),
+ DAG.getIntPtrConstant(DWordIdx, dl));
+ int ShiftVal = (IdxVal % 4) * 8;
+ if (ShiftVal != 0)
+ Res = DAG.getNode(ISD::SRL, dl, MVT::i32, Res,
+ DAG.getConstant(ShiftVal, dl, MVT::i32));
+ return DAG.getNode(ISD::TRUNCATE, dl, VT, Res);
+ }
+
+ int WordIdx = IdxVal / 2;
+ SDValue Res = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i16,
+ DAG.getBitcast(MVT::v8i16, Vec),
+ DAG.getIntPtrConstant(WordIdx, dl));
+ int ShiftVal = (IdxVal % 2) * 8;
+ if (ShiftVal != 0)
+ Res = DAG.getNode(ISD::SRL, dl, MVT::i16, Res,
+ DAG.getConstant(ShiftVal, dl, MVT::i16));
+ return DAG.getNode(ISD::TRUNCATE, dl, VT, Res);
+ }
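  // Worked example: extracting byte 5 misses the DWordIdx == 0 fast path
  // (5 / 4 == 1), so the word path extracts i16 element 2 (bytes 4-5), shifts
  // right by (5 % 2) * 8 == 8 bits, and truncates the result to i8.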
if (VT.getSizeInBits() == 32) {
if (IdxVal == 0)
@@ -13734,7 +14205,7 @@ X86TargetLowering::InsertBitToMaskVector(SDValue Op, SelectionDAG &DAG) const {
if(Vec.isUndef()) {
if (IdxVal)
- EltInVec = DAG.getNode(X86ISD::VSHLI, dl, VecVT, EltInVec,
+ EltInVec = DAG.getNode(X86ISD::KSHIFTL, dl, VecVT, EltInVec,
DAG.getConstant(IdxVal, dl, MVT::i8));
return EltInVec;
}
@@ -13744,21 +14215,21 @@ X86TargetLowering::InsertBitToMaskVector(SDValue Op, SelectionDAG &DAG) const {
if (IdxVal == 0 ) {
// EltInVec already at correct index and other bits are 0.
// Clean the first bit in source vector.
- Vec = DAG.getNode(X86ISD::VSRLI, dl, VecVT, Vec,
+ Vec = DAG.getNode(X86ISD::KSHIFTR, dl, VecVT, Vec,
DAG.getConstant(1 , dl, MVT::i8));
- Vec = DAG.getNode(X86ISD::VSHLI, dl, VecVT, Vec,
+ Vec = DAG.getNode(X86ISD::KSHIFTL, dl, VecVT, Vec,
DAG.getConstant(1, dl, MVT::i8));
return DAG.getNode(ISD::OR, dl, VecVT, Vec, EltInVec);
}
if (IdxVal == NumElems -1) {
// Move the bit to the last position inside the vector.
- EltInVec = DAG.getNode(X86ISD::VSHLI, dl, VecVT, EltInVec,
+ EltInVec = DAG.getNode(X86ISD::KSHIFTL, dl, VecVT, EltInVec,
DAG.getConstant(IdxVal, dl, MVT::i8));
// Clean the last bit in the source vector.
- Vec = DAG.getNode(X86ISD::VSHLI, dl, VecVT, Vec,
+ Vec = DAG.getNode(X86ISD::KSHIFTL, dl, VecVT, Vec,
DAG.getConstant(1, dl, MVT::i8));
- Vec = DAG.getNode(X86ISD::VSRLI, dl, VecVT, Vec,
+ Vec = DAG.getNode(X86ISD::KSHIFTR, dl, VecVT, Vec,
DAG.getConstant(1 , dl, MVT::i8));
return DAG.getNode(ISD::OR, dl, VecVT, Vec, EltInVec);
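  // In rough assembly terms (a sketch assuming v8i1 in k-registers and
  // AVX512DQ byte shifts), the IdxVal == NumElems - 1 path above becomes:
  //   kshiftlb k1, k1, 7   ; move the new bit to the top position
  //   kshiftlb k0, k0, 1   ; shift the old top bit out...
  //   kshiftrb k0, k0, 1   ; ...and shift back, clearing it
  //   korb     k0, k0, k1  ; merge the new bit in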
@@ -13790,17 +14261,21 @@ SDValue X86TargetLowering::LowerINSERT_VECTOR_ELT(SDValue Op,
auto *N2C = cast<ConstantSDNode>(N2);
unsigned IdxVal = N2C->getZExtValue();
- // If we are clearing out a element, we do this more efficiently with a
- // blend shuffle than a costly integer insertion.
- // TODO: would other rematerializable values (e.g. allbits) benefit as well?
+ bool IsZeroElt = X86::isZeroNode(N1);
+ bool IsAllOnesElt = VT.isInteger() && llvm::isAllOnesConstant(N1);
+
+ // If we are inserting an element, see if we can do this more efficiently
+ // with a blend shuffle against a rematerializable vector than with a costly
+ // integer insertion.
// TODO: pre-SSE41 targets will tend to use bit masking - this could still
// be beneficial if we are inserting several zeros and can combine the masks.
- if (X86::isZeroNode(N1) && Subtarget.hasSSE41() && NumElts <= 8) {
- SmallVector<int, 8> ClearMask;
+ if ((IsZeroElt || IsAllOnesElt) && Subtarget.hasSSE41() && NumElts <= 8) {
+ SmallVector<int, 8> BlendMask;
for (unsigned i = 0; i != NumElts; ++i)
- ClearMask.push_back(i == IdxVal ? i + NumElts : i);
- SDValue ZeroVector = getZeroVector(VT, Subtarget, DAG, dl);
- return DAG.getVectorShuffle(VT, dl, N0, ZeroVector, ClearMask);
+ BlendMask.push_back(i == IdxVal ? i + NumElts : i);
+ SDValue CstVector = IsZeroElt ? getZeroVector(VT, Subtarget, DAG, dl)
+ : DAG.getConstant(-1, dl, VT);
+ return DAG.getVectorShuffle(VT, dl, N0, CstVector, BlendMask);
}
// If the vector is wider than 128 bits, extract the 128-bit subvector, insert
@@ -13837,25 +14312,27 @@ SDValue X86TargetLowering::LowerINSERT_VECTOR_ELT(SDValue Op,
}
assert(VT.is128BitVector() && "Only 128-bit vector types should be left!");
- if (Subtarget.hasSSE41()) {
- if (EltVT.getSizeInBits() == 8 || EltVT.getSizeInBits() == 16) {
- unsigned Opc;
- if (VT == MVT::v8i16) {
- Opc = X86ISD::PINSRW;
- } else {
- assert(VT == MVT::v16i8);
- Opc = X86ISD::PINSRB;
- }
-
- // Transform it so it match pinsr{b,w} which expects a GR32 as its second
- // argument.
- if (N1.getValueType() != MVT::i32)
- N1 = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i32, N1);
- if (N2.getValueType() != MVT::i32)
- N2 = DAG.getIntPtrConstant(IdxVal, dl);
- return DAG.getNode(Opc, dl, VT, N0, N1, N2);
+ // Transform it so it matches pinsr{b,w}, which expects a GR32 as its second
+ // argument. SSE41 is required for pinsrb.
+ if (VT == MVT::v8i16 || (VT == MVT::v16i8 && Subtarget.hasSSE41())) {
+ unsigned Opc;
+ if (VT == MVT::v8i16) {
+ assert(Subtarget.hasSSE2() && "SSE2 required for PINSRW");
+ Opc = X86ISD::PINSRW;
+ } else {
+ assert(VT == MVT::v16i8 && "PINSRB requires v16i8 vector");
+ assert(Subtarget.hasSSE41() && "SSE41 required for PINSRB");
+ Opc = X86ISD::PINSRB;
}
+ if (N1.getValueType() != MVT::i32)
+ N1 = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i32, N1);
+ if (N2.getValueType() != MVT::i32)
+ N2 = DAG.getIntPtrConstant(IdxVal, dl);
+ return DAG.getNode(Opc, dl, VT, N0, N1, N2);
+ }
+
+ if (Subtarget.hasSSE41()) {
if (EltVT == MVT::f32) {
// Bits [7:6] of the constant are the source select. This will always be
// zero here. The DAG Combiner may combine an extract_elt index into
@@ -13885,36 +14362,29 @@ SDValue X86TargetLowering::LowerINSERT_VECTOR_ELT(SDValue Op,
return DAG.getNode(X86ISD::INSERTPS, dl, VT, N0, N1, N2);
}
- if (EltVT == MVT::i32 || EltVT == MVT::i64) {
- // PINSR* works with constant index.
+ // PINSR* works with constant index.
+ if (EltVT == MVT::i32 || EltVT == MVT::i64)
return Op;
- }
}
- if (EltVT == MVT::i8)
- return SDValue();
-
- if (EltVT.getSizeInBits() == 16) {
- // Transform it so it match pinsrw which expects a 16-bit value in a GR32
- // as its second argument.
- if (N1.getValueType() != MVT::i32)
- N1 = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i32, N1);
- if (N2.getValueType() != MVT::i32)
- N2 = DAG.getIntPtrConstant(IdxVal, dl);
- return DAG.getNode(X86ISD::PINSRW, dl, VT, N0, N1, N2);
- }
return SDValue();
}
-static SDValue LowerSCALAR_TO_VECTOR(SDValue Op, SelectionDAG &DAG) {
+static SDValue LowerSCALAR_TO_VECTOR(SDValue Op, const X86Subtarget &Subtarget,
+ SelectionDAG &DAG) {
SDLoc dl(Op);
MVT OpVT = Op.getSimpleValueType();
+ // It's always cheaper to replace an xor+movd pair with xorps, and doing so
+ // simplifies further combines.
+ if (X86::isZeroNode(Op.getOperand(0)))
+ return getZeroVector(OpVT, Subtarget, DAG, dl);
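  // E.g. (sketch): (v4i32 (scalar_to_vector (i32 0))) now becomes a zero
  // vector that selects to a single "xorps xmm0, xmm0" rather than
  // materializing the scalar zero and inserting it with
  // "xor eax, eax" + "movd xmm0, eax".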
+
// If this is a 256-bit vector result, first insert into a 128-bit
// vector and then insert into the 256-bit vector.
if (!OpVT.is128BitVector()) {
// Insert into a 128-bit vector.
- unsigned SizeFactor = OpVT.getSizeInBits()/128;
+ unsigned SizeFactor = OpVT.getSizeInBits() / 128;
MVT VT128 = MVT::getVectorVT(OpVT.getVectorElementType(),
OpVT.getVectorNumElements() / SizeFactor);
@@ -13923,9 +14393,13 @@ static SDValue LowerSCALAR_TO_VECTOR(SDValue Op, SelectionDAG &DAG) {
// Insert the 128-bit vector.
return insert128BitVector(DAG.getUNDEF(OpVT), Op, 0, DAG, dl);
}
+ assert(OpVT.is128BitVector() && "Expected an SSE type!");
+
+ // Pass through a v4i32 SCALAR_TO_VECTOR as that's what we use in tblgen.
+ if (OpVT == MVT::v4i32)
+ return Op;
SDValue AnyExt = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i32, Op.getOperand(0));
- assert(OpVT.is128BitVector() && "Expected an SSE type!");
return DAG.getBitcast(
OpVT, DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4i32, AnyExt));
}
@@ -13947,20 +14421,14 @@ static SDValue LowerEXTRACT_SUBVECTOR(SDValue Op, const X86Subtarget &Subtarget,
In.getSimpleValueType().is512BitVector()) &&
"Can only extract from 256-bit or 512-bit vectors");
- if (ResVT.is128BitVector())
- return extract128BitVector(In, IdxVal, DAG, dl);
- if (ResVT.is256BitVector())
- return extract256BitVector(In, IdxVal, DAG, dl);
-
- llvm_unreachable("Unimplemented!");
-}
+ // If the input is a buildvector, just emit a smaller one.
+ unsigned ElemsPerChunk = ResVT.getVectorNumElements();
+ if (In.getOpcode() == ISD::BUILD_VECTOR)
+ return DAG.getNode(ISD::BUILD_VECTOR, dl, ResVT,
+ makeArrayRef(In->op_begin() + IdxVal, ElemsPerChunk));
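  // E.g. extracting the v4i32 subvector at index 4 of a v8i32 BUILD_VECTOR
  // rebuilds a smaller BUILD_VECTOR from operands 4..7 of the original node.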
-static bool areOnlyUsersOf(SDNode *N, ArrayRef<SDValue> ValidUsers) {
- for (SDNode::use_iterator I = N->use_begin(), E = N->use_end(); I != E; ++I)
- if (llvm::all_of(ValidUsers,
- [&I](SDValue V) { return V.getNode() != *I; }))
- return false;
- return true;
+ // Everything else is legal.
+ return Op;
}
// Lower a node with an INSERT_SUBVECTOR opcode. This may result in a
@@ -13968,83 +14436,9 @@ static bool areOnlyUsersOf(SDNode *N, ArrayRef<SDValue> ValidUsers) {
// the upper bits of a vector.
static SDValue LowerINSERT_SUBVECTOR(SDValue Op, const X86Subtarget &Subtarget,
SelectionDAG &DAG) {
- assert(Subtarget.hasAVX() && "INSERT_SUBVECTOR requires AVX");
-
- SDLoc dl(Op);
- SDValue Vec = Op.getOperand(0);
- SDValue SubVec = Op.getOperand(1);
- SDValue Idx = Op.getOperand(2);
-
- unsigned IdxVal = cast<ConstantSDNode>(Idx)->getZExtValue();
- MVT OpVT = Op.getSimpleValueType();
- MVT SubVecVT = SubVec.getSimpleValueType();
-
- if (OpVT.getVectorElementType() == MVT::i1)
- return insert1BitVector(Op, DAG, Subtarget);
-
- assert((OpVT.is256BitVector() || OpVT.is512BitVector()) &&
- "Can only insert into 256-bit or 512-bit vectors");
+ assert(Op.getSimpleValueType().getVectorElementType() == MVT::i1);
- // Fold two 16-byte or 32-byte subvector loads into one 32-byte or 64-byte
- // load:
- // (insert_subvector (insert_subvector undef, (load16 addr), 0),
- // (load16 addr + 16), Elts/2)
- // --> load32 addr
- // or:
- // (insert_subvector (insert_subvector undef, (load32 addr), 0),
- // (load32 addr + 32), Elts/2)
- // --> load64 addr
- // or a 16-byte or 32-byte broadcast:
- // (insert_subvector (insert_subvector undef, (load16 addr), 0),
- // (load16 addr), Elts/2)
- // --> X86SubVBroadcast(load16 addr)
- // or:
- // (insert_subvector (insert_subvector undef, (load32 addr), 0),
- // (load32 addr), Elts/2)
- // --> X86SubVBroadcast(load32 addr)
- if ((IdxVal == OpVT.getVectorNumElements() / 2) &&
- Vec.getOpcode() == ISD::INSERT_SUBVECTOR &&
- OpVT.getSizeInBits() == SubVecVT.getSizeInBits() * 2) {
- auto *Idx2 = dyn_cast<ConstantSDNode>(Vec.getOperand(2));
- if (Idx2 && Idx2->getZExtValue() == 0) {
- SDValue SubVec2 = Vec.getOperand(1);
- // If needed, look through bitcasts to get to the load.
- if (auto *FirstLd = dyn_cast<LoadSDNode>(peekThroughBitcasts(SubVec2))) {
- bool Fast;
- unsigned Alignment = FirstLd->getAlignment();
- unsigned AS = FirstLd->getAddressSpace();
- const X86TargetLowering *TLI = Subtarget.getTargetLowering();
- if (TLI->allowsMemoryAccess(*DAG.getContext(), DAG.getDataLayout(),
- OpVT, AS, Alignment, &Fast) && Fast) {
- SDValue Ops[] = {SubVec2, SubVec};
- if (SDValue Ld = EltsFromConsecutiveLoads(OpVT, Ops, dl, DAG, false))
- return Ld;
- }
- }
- // If lower/upper loads are the same and the only users of the load, then
- // lower to a VBROADCASTF128/VBROADCASTI128/etc.
- if (auto *Ld = dyn_cast<LoadSDNode>(peekThroughOneUseBitcasts(SubVec2))) {
- if (SubVec2 == SubVec && ISD::isNormalLoad(Ld) &&
- areOnlyUsersOf(SubVec2.getNode(), {Op, Vec})) {
- return DAG.getNode(X86ISD::SUBV_BROADCAST, dl, OpVT, SubVec);
- }
- }
- // If this is subv_broadcast insert into both halves, use a larger
- // subv_broadcast.
- if (SubVec.getOpcode() == X86ISD::SUBV_BROADCAST && SubVec == SubVec2) {
- return DAG.getNode(X86ISD::SUBV_BROADCAST, dl, OpVT,
- SubVec.getOperand(0));
- }
- }
- }
-
- if (SubVecVT.is128BitVector())
- return insert128BitVector(Vec, SubVec, IdxVal, DAG, dl);
-
- if (SubVecVT.is256BitVector())
- return insert256BitVector(Vec, SubVec, IdxVal, DAG, dl);
-
- llvm_unreachable("Unimplemented!");
+ return insert1BitVector(Op, DAG, Subtarget);
}
// Returns the appropriate wrapper opcode for a global reference.
@@ -14062,7 +14456,7 @@ unsigned X86TargetLowering::getGlobalWrapperKind(const GlobalValue *GV) const {
}
// ConstantPool, JumpTable, GlobalAddress, and ExternalSymbol are lowered as
-// their target countpart wrapped in the X86ISD::Wrapper node. Suppose N is
+// their target counterpart wrapped in the X86ISD::Wrapper node. Suppose N is
// one of the above mentioned nodes. It has to be wrapped because otherwise
// Select(N) returns N. So the raw TargetGlobalAddress nodes, etc. can only
// be used to form addressing mode. These wrapped nodes will be selected
@@ -14438,7 +14832,7 @@ X86TargetLowering::LowerGlobalTLSAddress(SDValue Op, SelectionDAG &DAG) const {
Subtarget.isTargetWindowsItanium() ||
Subtarget.isTargetWindowsGNU()) {
// Just use the implicit TLS architecture
- // Need to generate someting similar to:
+ // Need to generate something similar to:
// mov rdx, qword [gs:abs 58H]; Load pointer to ThreadLocalStorage
// ; from TEB
// mov ecx, dword [rel _tls_index]: Load index (from C runtime)
@@ -15489,32 +15883,21 @@ SDValue X86TargetLowering::LowerTRUNCATE(SDValue Op, SelectionDAG &DAG) const {
// word to byte only under BWI
if (InVT == MVT::v16i16 && !Subtarget.hasBWI()) // v16i16 -> v16i8
return DAG.getNode(X86ISD::VTRUNC, DL, VT,
- DAG.getNode(X86ISD::VSEXT, DL, MVT::v16i32, In));
+ getExtendInVec(X86ISD::VSEXT, DL, MVT::v16i32, In, DAG));
return DAG.getNode(X86ISD::VTRUNC, DL, VT, In);
}
- // Truncate with PACKSS if we are truncating a vector comparison result.
- // TODO: We should be able to support other operations as long as we
- // we are saturating+packing zero/all bits only.
- auto IsPackableComparison = [](SDValue V) {
- unsigned Opcode = V.getOpcode();
- return (Opcode == X86ISD::PCMPGT || Opcode == X86ISD::PCMPEQ ||
- Opcode == X86ISD::CMPP);
- };
-
- if (IsPackableComparison(In) || (In.getOpcode() == ISD::CONCAT_VECTORS &&
- all_of(In->ops(), IsPackableComparison))) {
+ // Truncate with PACKSS if we are truncating a vector zero/all-bits result.
+ if (InVT.getScalarSizeInBits() == DAG.ComputeNumSignBits(In))
if (SDValue V = truncateVectorCompareWithPACKSS(VT, In, DL, DAG, Subtarget))
return V;
- }
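  // (Why the sign-bit check suffices: if ComputeNumSignBits equals the scalar
  // bit width, every element is all-zeros or all-ones, so the signed
  // saturation in PACKSS reproduces the exact 0/-1 value at the narrower
  // width.)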
if ((VT == MVT::v4i32) && (InVT == MVT::v4i64)) {
// On AVX2, v4i64 -> v4i32 becomes VPERMD.
if (Subtarget.hasInt256()) {
static const int ShufMask[] = {0, 2, 4, 6, -1, -1, -1, -1};
In = DAG.getBitcast(MVT::v8i32, In);
- In = DAG.getVectorShuffle(MVT::v8i32, DL, In, DAG.getUNDEF(MVT::v8i32),
- ShufMask);
+ In = DAG.getVectorShuffle(MVT::v8i32, DL, In, In, ShufMask);
return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, In,
DAG.getIntPtrConstant(0, DL));
}
@@ -15530,30 +15913,20 @@ SDValue X86TargetLowering::LowerTRUNCATE(SDValue Op, SelectionDAG &DAG) const {
}
if ((VT == MVT::v8i16) && (InVT == MVT::v8i32)) {
- // On AVX2, v8i32 -> v8i16 becomed PSHUFB.
+ // On AVX2, v8i32 -> v8i16 becomes PSHUFB.
if (Subtarget.hasInt256()) {
In = DAG.getBitcast(MVT::v32i8, In);
- SmallVector<SDValue,32> pshufbMask;
- for (unsigned i = 0; i < 2; ++i) {
- pshufbMask.push_back(DAG.getConstant(0x0, DL, MVT::i8));
- pshufbMask.push_back(DAG.getConstant(0x1, DL, MVT::i8));
- pshufbMask.push_back(DAG.getConstant(0x4, DL, MVT::i8));
- pshufbMask.push_back(DAG.getConstant(0x5, DL, MVT::i8));
- pshufbMask.push_back(DAG.getConstant(0x8, DL, MVT::i8));
- pshufbMask.push_back(DAG.getConstant(0x9, DL, MVT::i8));
- pshufbMask.push_back(DAG.getConstant(0xc, DL, MVT::i8));
- pshufbMask.push_back(DAG.getConstant(0xd, DL, MVT::i8));
- for (unsigned j = 0; j < 8; ++j)
- pshufbMask.push_back(DAG.getConstant(0x80, DL, MVT::i8));
- }
- SDValue BV = DAG.getBuildVector(MVT::v32i8, DL, pshufbMask);
- In = DAG.getNode(X86ISD::PSHUFB, DL, MVT::v32i8, In, BV);
+ // The PSHUFB mask:
+ static const int ShufMask1[] = { 0, 1, 4, 5, 8, 9, 12, 13,
+ -1, -1, -1, -1, -1, -1, -1, -1,
+ 16, 17, 20, 21, 24, 25, 28, 29,
+ -1, -1, -1, -1, -1, -1, -1, -1 };
+ In = DAG.getVectorShuffle(MVT::v32i8, DL, In, In, ShufMask1);
In = DAG.getBitcast(MVT::v4i64, In);
- static const int ShufMask[] = {0, 2, -1, -1};
- In = DAG.getVectorShuffle(MVT::v4i64, DL, In, DAG.getUNDEF(MVT::v4i64),
- ShufMask);
+ static const int ShufMask2[] = {0, 2, -1, -1};
+ In = DAG.getVectorShuffle(MVT::v4i64, DL, In, In, ShufMask2);
In = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v2i64, In,
DAG.getIntPtrConstant(0, DL));
return DAG.getBitcast(VT, In);
@@ -15572,9 +15945,8 @@ SDValue X86TargetLowering::LowerTRUNCATE(SDValue Op, SelectionDAG &DAG) const {
static const int ShufMask1[] = {0, 1, 4, 5, 8, 9, 12, 13,
-1, -1, -1, -1, -1, -1, -1, -1};
- SDValue Undef = DAG.getUNDEF(MVT::v16i8);
- OpLo = DAG.getVectorShuffle(MVT::v16i8, DL, OpLo, Undef, ShufMask1);
- OpHi = DAG.getVectorShuffle(MVT::v16i8, DL, OpHi, Undef, ShufMask1);
+ OpLo = DAG.getVectorShuffle(MVT::v16i8, DL, OpLo, OpLo, ShufMask1);
+ OpHi = DAG.getVectorShuffle(MVT::v16i8, DL, OpHi, OpHi, ShufMask1);
OpLo = DAG.getBitcast(MVT::v4i32, OpLo);
OpHi = DAG.getBitcast(MVT::v4i32, OpHi);
@@ -15598,17 +15970,14 @@ SDValue X86TargetLowering::LowerTRUNCATE(SDValue Op, SelectionDAG &DAG) const {
// Prepare truncation shuffle mask
for (unsigned i = 0; i != NumElems; ++i)
MaskVec[i] = i * 2;
- SDValue V = DAG.getVectorShuffle(NVT, DL, DAG.getBitcast(NVT, In),
- DAG.getUNDEF(NVT), MaskVec);
+ In = DAG.getBitcast(NVT, In);
+ SDValue V = DAG.getVectorShuffle(NVT, DL, In, In, MaskVec);
return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, V,
DAG.getIntPtrConstant(0, DL));
}
-SDValue X86TargetLowering::LowerFP_TO_INT(SDValue Op,
- const X86Subtarget &Subtarget,
- SelectionDAG &DAG) const {
+SDValue X86TargetLowering::LowerFP_TO_INT(SDValue Op, SelectionDAG &DAG) const {
bool IsSigned = Op.getOpcode() == ISD::FP_TO_SINT;
-
MVT VT = Op.getSimpleValueType();
if (VT.isVector()) {
@@ -15616,8 +15985,7 @@ SDValue X86TargetLowering::LowerFP_TO_INT(SDValue Op,
SDValue Src = Op.getOperand(0);
SDLoc dl(Op);
if (VT == MVT::v2i64 && Src.getSimpleValueType() == MVT::v2f32) {
- return DAG.getNode(IsSigned ? X86ISD::CVTTP2SI : X86ISD::CVTTP2UI,
- dl, VT,
+ return DAG.getNode(IsSigned ? X86ISD::CVTTP2SI : X86ISD::CVTTP2UI, dl, VT,
DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4f32, Src,
DAG.getUNDEF(MVT::v2f32)));
}
@@ -15891,7 +16259,7 @@ static SDValue LowerVectorAllZeroTest(SDValue Op, const X86Subtarget &Subtarget,
for (unsigned i = 0, e = VecIns.size(); i < e; ++i)
VecIns[i] = DAG.getBitcast(TestVT, VecIns[i]);
- // If more than one full vectors are evaluated, OR them first before PTEST.
+ // If more than one full vector is evaluated, OR them first before PTEST.
for (unsigned Slot = 0, e = VecIns.size(); e - Slot > 1; Slot += 2, e += 1) {
// Each iteration will OR 2 nodes and append the result until there is only
// 1 node left, i.e. the final OR'd value of all vectors.
@@ -15900,8 +16268,7 @@ static SDValue LowerVectorAllZeroTest(SDValue Op, const X86Subtarget &Subtarget,
VecIns.push_back(DAG.getNode(ISD::OR, DL, TestVT, LHS, RHS));
}
- return DAG.getNode(X86ISD::PTEST, DL, MVT::i32,
- VecIns.back(), VecIns.back());
+ return DAG.getNode(X86ISD::PTEST, DL, MVT::i32, VecIns.back(), VecIns.back());
}
/// \brief return true if \c Op has a use that doesn't just read flags.
@@ -16366,7 +16733,7 @@ SDValue X86TargetLowering::getRecipEstimate(SDValue Op, SelectionDAG &DAG,
}
/// If we have at least two divisions that use the same divisor, convert to
-/// multplication by a reciprocal. This may need to be adjusted for a given
+/// multiplication by a reciprocal. This may need to be adjusted for a given
/// CPU if a division's cost is not at least twice the cost of a multiplication.
/// This is because we still need one division to calculate the reciprocal and
/// then we need two multiplies by that reciprocal as replacements for the
@@ -17241,12 +17608,14 @@ SDValue X86TargetLowering::LowerSELECT(SDValue Op, SelectionDAG &DAG) const {
// (select (x == 0), y, -1) -> ~(sign_bit (x - 1)) | y
// (select (x != 0), y, -1) -> (sign_bit (x - 1)) | y
// (select (x != 0), -1, y) -> ~(sign_bit (x - 1)) | y
+ // (select ((and (x, 0x1)) == 0), y, (z ^ y)) -> (-(and (x, 0x1)) & z) ^ y
+ // (select ((and (x, 0x1)) == 0), y, (z | y)) -> (-(and (x, 0x1)) & z) | y
if (Cond.getOpcode() == X86ISD::SETCC &&
Cond.getOperand(1).getOpcode() == X86ISD::CMP &&
isNullConstant(Cond.getOperand(1).getOperand(1))) {
SDValue Cmp = Cond.getOperand(1);
-
- unsigned CondCode =cast<ConstantSDNode>(Cond.getOperand(0))->getZExtValue();
+ unsigned CondCode =
+ cast<ConstantSDNode>(Cond.getOperand(0))->getZExtValue();
if ((isAllOnesConstant(Op1) || isAllOnesConstant(Op2)) &&
(CondCode == X86::COND_E || CondCode == X86::COND_NE)) {
@@ -17283,6 +17652,43 @@ SDValue X86TargetLowering::LowerSELECT(SDValue Op, SelectionDAG &DAG) const {
if (!isNullConstant(Op2))
Res = DAG.getNode(ISD::OR, DL, Res.getValueType(), Res, Y);
return Res;
+ } else if (!Subtarget.hasCMov() && CondCode == X86::COND_E &&
+ Cmp.getOperand(0).getOpcode() == ISD::AND &&
+ isOneConstant(Cmp.getOperand(0).getOperand(1))) {
+ SDValue CmpOp0 = Cmp.getOperand(0);
+ SDValue Src1, Src2;
+ // Returns true if Op2 is an XOR or OR operator and one of its operands
+ // is equal to Op1, i.e. the select has one of the forms
+ // (a, a op b) or (b, a op b).
+ auto isOrXorPattern = [&]() {
+ if ((Op2.getOpcode() == ISD::XOR || Op2.getOpcode() == ISD::OR) &&
+ (Op2.getOperand(0) == Op1 || Op2.getOperand(1) == Op1)) {
+ Src1 =
+ Op2.getOperand(0) == Op1 ? Op2.getOperand(1) : Op2.getOperand(0);
+ Src2 = Op1;
+ return true;
+ }
+ return false;
+ };
+
+ if (isOrXorPattern()) {
+ SDValue Neg;
+ unsigned int CmpSz = CmpOp0.getSimpleValueType().getSizeInBits();
+ // We need a mask of all zeros or all ones, with the same size as the
+ // other operands.
+ if (CmpSz > VT.getSizeInBits())
+ Neg = DAG.getNode(ISD::TRUNCATE, DL, VT, CmpOp0);
+ else if (CmpSz < VT.getSizeInBits())
+ Neg = DAG.getNode(ISD::AND, DL, VT,
+ DAG.getNode(ISD::ANY_EXTEND, DL, VT, CmpOp0.getOperand(0)),
+ DAG.getConstant(1, DL, VT));
+ else
+ Neg = CmpOp0;
+ SDValue Mask = DAG.getNode(ISD::SUB, DL, VT, DAG.getConstant(0, DL, VT),
+ Neg); // -(and (x, 0x1))
+ SDValue And = DAG.getNode(ISD::AND, DL, VT, Mask, Src1); // Mask & z
+ return DAG.getNode(Op2.getOpcode(), DL, VT, And, Src2); // (Mask & z) op y
+ }
}
}
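The new branchless pattern above relies on -(x & 1) being all-zeros when the low bit is clear and all-ones when it is set. A standalone check of the identity (illustrative; not part of the patch):

    #include <cassert>
    #include <cstdint>

    // Check (select ((x & 1) == 0), y, z ^ y) == (-(x & 1) & z) ^ y:
    // -(x & 1) is 0 or ~0, so the AND selects 0 or z before the XOR.
    int main() {
      for (uint32_t x = 0; x < 4; ++x)
        for (uint32_t y : {0u, 1u, 0xDEADBEEFu})
          for (uint32_t z : {0u, 7u, 0xCAFEBABEu}) {
            uint32_t sel = ((x & 1) == 0) ? y : (z ^ y);
            uint32_t branchless = ((0u - (x & 1)) & z) ^ y;
            assert(sel == branchless);
          }
      return 0;
    }

The same reasoning holds with | in place of ^, matching the second comment, since 0 | y == y.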
@@ -17423,17 +17829,10 @@ static SDValue LowerSIGN_EXTEND_AVX512(SDValue Op,
// SKX processor
if ((InVTElt == MVT::i1) &&
- (((Subtarget.hasBWI() && Subtarget.hasVLX() &&
- VT.getSizeInBits() <= 256 && VTElt.getSizeInBits() <= 16)) ||
-
- ((Subtarget.hasBWI() && VT.is512BitVector() &&
- VTElt.getSizeInBits() <= 16)) ||
+ (((Subtarget.hasBWI() && VTElt.getSizeInBits() <= 16)) ||
- ((Subtarget.hasDQI() && Subtarget.hasVLX() &&
- VT.getSizeInBits() <= 256 && VTElt.getSizeInBits() >= 32)) ||
+ ((Subtarget.hasDQI() && VTElt.getSizeInBits() >= 32))))
- ((Subtarget.hasDQI() && VT.is512BitVector() &&
- VTElt.getSizeInBits() >= 32))))
return DAG.getNode(X86ISD::VSEXT, dl, VT, In);
unsigned NumElts = VT.getVectorNumElements();
@@ -17441,8 +17840,8 @@ static SDValue LowerSIGN_EXTEND_AVX512(SDValue Op,
if (VT.is512BitVector() && InVTElt != MVT::i1 &&
(NumElts == 8 || NumElts == 16 || Subtarget.hasBWI())) {
if (In.getOpcode() == X86ISD::VSEXT || In.getOpcode() == X86ISD::VZEXT)
- return DAG.getNode(In.getOpcode(), dl, VT, In.getOperand(0));
- return DAG.getNode(X86ISD::VSEXT, dl, VT, In);
+ return getExtendInVec(In.getOpcode(), dl, VT, In.getOperand(0), DAG);
+ return getExtendInVec(X86ISD::VSEXT, dl, VT, In, DAG);
}
if (InVTElt != MVT::i1)
@@ -17454,10 +17853,10 @@ static SDValue LowerSIGN_EXTEND_AVX512(SDValue Op,
SDValue V;
if (Subtarget.hasDQI()) {
- V = DAG.getNode(X86ISD::VSEXT, dl, ExtVT, In);
+ V = getExtendInVec(X86ISD::VSEXT, dl, ExtVT, In, DAG);
assert(!VT.is512BitVector() && "Unexpected vector type");
} else {
- SDValue NegOne = getOnesVector(ExtVT, Subtarget, DAG, dl);
+ SDValue NegOne = getOnesVector(ExtVT, DAG, dl);
SDValue Zero = getZeroVector(ExtVT, Subtarget, DAG, dl);
V = DAG.getNode(ISD::VSELECT, dl, ExtVT, In, NegOne, Zero);
if (ExtVT == VT)
@@ -17506,11 +17905,15 @@ static SDValue LowerEXTEND_VECTOR_INREG(SDValue Op,
assert((Op.getOpcode() != ISD::ZERO_EXTEND_VECTOR_INREG ||
InVT == MVT::v64i8) && "Zero extend only for v64i8 input!");
- // SSE41 targets can use the pmovsx* instructions directly.
- unsigned ExtOpc = Op.getOpcode() == ISD::SIGN_EXTEND_VECTOR_INREG ?
- X86ISD::VSEXT : X86ISD::VZEXT;
- if (Subtarget.hasSSE41())
+ // SSE41 targets can use the pmovsx* instructions directly for 128-bit
+ // results, so those cases are legal and shouldn't occur here. AVX2/AVX512
+ // pmovsx* instructions still need to be handled here for 256/512-bit results.
+ if (Subtarget.hasInt256()) {
+ assert(VT.getSizeInBits() > 128 && "Unexpected 128-bit vector extension");
+ unsigned ExtOpc = Op.getOpcode() == ISD::SIGN_EXTEND_VECTOR_INREG ?
+ X86ISD::VSEXT : X86ISD::VZEXT;
return DAG.getNode(ExtOpc, dl, VT, In);
+ }
// We should only get here for sign extend.
assert(Op.getOpcode() == ISD::SIGN_EXTEND_VECTOR_INREG &&
@@ -17595,8 +17998,8 @@ static SDValue LowerSIGN_EXTEND(SDValue Op, const X86Subtarget &Subtarget,
MVT HalfVT = MVT::getVectorVT(VT.getVectorElementType(),
VT.getVectorNumElements() / 2);
- OpLo = DAG.getNode(X86ISD::VSEXT, dl, HalfVT, OpLo);
- OpHi = DAG.getNode(X86ISD::VSEXT, dl, HalfVT, OpHi);
+ OpLo = DAG.getSignExtendVectorInReg(OpLo, dl, HalfVT);
+ OpHi = DAG.getSignExtendVectorInReg(OpHi, dl, HalfVT);
return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, OpLo, OpHi);
}
@@ -17674,7 +18077,8 @@ static SDValue LowerExtended1BitVectorLoad(SDValue Op,
MVT VT = Op.getValueType().getSimpleVT();
unsigned NumElts = VT.getVectorNumElements();
- if ((Subtarget.hasVLX() && Subtarget.hasBWI() && Subtarget.hasDQI()) ||
+ if ((Subtarget.hasBWI() && NumElts >= 32) ||
+ (Subtarget.hasDQI() && NumElts < 16) ||
NumElts == 16) {
// Load and extend - everything is legal
if (NumElts < 8) {
@@ -17703,7 +18107,7 @@ static SDValue LowerExtended1BitVectorLoad(SDValue Op,
if (NumElts <= 8) {
// A subset, assume that we have only AVX-512F
- unsigned NumBitsToLoad = NumElts < 8 ? 8 : NumElts;
+ unsigned NumBitsToLoad = 8;
MVT TypeToLoad = MVT::getIntegerVT(NumBitsToLoad);
SDValue Load = DAG.getLoad(TypeToLoad, dl, Ld->getChain(),
Ld->getBasePtr(),
@@ -17911,7 +18315,7 @@ static SDValue LowerExtendedLoad(SDValue Op, const X86Subtarget &Subtarget,
if (Ext == ISD::SEXTLOAD) {
// If we have SSE4.1, we can directly emit a VSEXT node.
if (Subtarget.hasSSE41()) {
- SDValue Sext = DAG.getNode(X86ISD::VSEXT, dl, RegVT, SlicedVec);
+ SDValue Sext = getExtendInVec(X86ISD::VSEXT, dl, RegVT, SlicedVec, DAG);
DAG.ReplaceAllUsesOfValueWith(SDValue(Ld, 1), TF);
return Sext;
}
@@ -18469,6 +18873,11 @@ static SDValue getTargetVShiftByConstNode(unsigned Opc, const SDLoc &dl, MVT VT,
SelectionDAG &DAG) {
MVT ElementType = VT.getVectorElementType();
+ // Bitcast the source vector to the output type; this is mainly necessary
+ // for vXi8/vXi64 shifts.
+ if (VT != SrcOp.getSimpleValueType())
+ SrcOp = DAG.getBitcast(VT, SrcOp);
+
// Fold this packed shift into its first operand if ShiftAmt is 0.
if (ShiftAmt == 0)
return SrcOp;
@@ -18485,9 +18894,8 @@ static SDValue getTargetVShiftByConstNode(unsigned Opc, const SDLoc &dl, MVT VT,
&& "Unknown target vector shift-by-constant node");
// Fold this packed vector shift into a build vector if SrcOp is a
- // vector of Constants or UNDEFs, and SrcOp valuetype is the same as VT.
- if (VT == SrcOp.getSimpleValueType() &&
- ISD::isBuildVectorOfConstantSDNodes(SrcOp.getNode())) {
+ // vector of Constants or UNDEFs.
+ if (ISD::isBuildVectorOfConstantSDNodes(SrcOp.getNode())) {
SmallVector<SDValue, 8> Elts;
unsigned NumElts = SrcOp->getNumOperands();
ConstantSDNode *ND;
@@ -18578,11 +18986,11 @@ static SDValue getTargetVShiftNode(unsigned Opc, const SDLoc &dl, MVT VT,
ShAmt.getOperand(0).getSimpleValueType() == MVT::i16) {
ShAmt = ShAmt.getOperand(0);
ShAmt = DAG.getNode(ISD::SCALAR_TO_VECTOR, SDLoc(ShAmt), MVT::v8i16, ShAmt);
- ShAmt = DAG.getNode(X86ISD::VZEXT, SDLoc(ShAmt), MVT::v2i64, ShAmt);
+ ShAmt = DAG.getZeroExtendVectorInReg(ShAmt, SDLoc(ShAmt), MVT::v2i64);
} else if (Subtarget.hasSSE41() &&
ShAmt.getOpcode() == ISD::EXTRACT_VECTOR_ELT) {
ShAmt = DAG.getNode(ISD::SCALAR_TO_VECTOR, SDLoc(ShAmt), MVT::v4i32, ShAmt);
- ShAmt = DAG.getNode(X86ISD::VZEXT, SDLoc(ShAmt), MVT::v2i64, ShAmt);
+ ShAmt = DAG.getZeroExtendVectorInReg(ShAmt, SDLoc(ShAmt), MVT::v2i64);
} else {
SmallVector<SDValue, 4> ShOps = {ShAmt, DAG.getConstant(0, dl, SVT),
DAG.getUNDEF(SVT), DAG.getUNDEF(SVT)};
@@ -18853,6 +19261,14 @@ static SDValue LowerINTRINSIC_WO_CHAIN(SDValue Op, const X86Subtarget &Subtarget
SDValue Src2 = Op.getOperand(2);
SDValue passThru = Op.getOperand(3);
SDValue Mask = Op.getOperand(4);
+ unsigned IntrWithRoundingModeOpcode = IntrData->Opc1;
+ if (IntrWithRoundingModeOpcode != 0) {
+ SDValue Rnd = Op.getOperand(5);
+ if (!isRoundModeCurDirection(Rnd))
+ return getScalarMaskingNode(DAG.getNode(IntrWithRoundingModeOpcode,
+ dl, VT, Src1, Src2, Rnd),
+ Mask, passThru, Subtarget, DAG);
+ }
return getScalarMaskingNode(DAG.getNode(IntrData->Opc0, dl, VT, Src1, Src2),
Mask, passThru, Subtarget, DAG);
}
@@ -19306,6 +19722,15 @@ static SDValue LowerINTRINSIC_WO_CHAIN(SDValue Op, const X86Subtarget &Subtarget
Src2, Src1);
return DAG.getBitcast(VT, Res);
}
+ case MASK_BINOP: {
+ MVT VT = Op.getSimpleValueType();
+ MVT MaskVT = MVT::getVectorVT(MVT::i1, VT.getSizeInBits());
+
+ SDValue Src1 = getMaskNode(Op.getOperand(1), MaskVT, Subtarget, DAG, dl);
+ SDValue Src2 = getMaskNode(Op.getOperand(2), MaskVT, Subtarget, DAG, dl);
+ SDValue Res = DAG.getNode(IntrData->Opc0, dl, MaskVT, Src1, Src2);
+ return DAG.getBitcast(VT, Res);
+ }
case FIXUPIMMS:
case FIXUPIMMS_MASKZ:
case FIXUPIMM:
@@ -19478,6 +19903,33 @@ static SDValue LowerINTRINSIC_WO_CHAIN(SDValue Op, const X86Subtarget &Subtarget
return DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32, SetCC);
}
+ case Intrinsic::x86_avx512_knot_w: {
+ SDValue LHS = DAG.getBitcast(MVT::v16i1, Op.getOperand(1));
+ SDValue RHS = DAG.getConstant(1, dl, MVT::v16i1);
+ SDValue Res = DAG.getNode(ISD::XOR, dl, MVT::v16i1, LHS, RHS);
+ return DAG.getBitcast(MVT::i16, Res);
+ }
+
+ case Intrinsic::x86_avx512_kandn_w: {
+ SDValue LHS = DAG.getBitcast(MVT::v16i1, Op.getOperand(1));
+ // Invert LHS for the not.
+ LHS = DAG.getNode(ISD::XOR, dl, MVT::v16i1, LHS,
+ DAG.getConstant(1, dl, MVT::v16i1));
+ SDValue RHS = DAG.getBitcast(MVT::v16i1, Op.getOperand(2));
+ SDValue Res = DAG.getNode(ISD::AND, dl, MVT::v16i1, LHS, RHS);
+ return DAG.getBitcast(MVT::i16, Res);
+ }
+
+ case Intrinsic::x86_avx512_kxnor_w: {
+ SDValue LHS = DAG.getBitcast(MVT::v16i1, Op.getOperand(1));
+ SDValue RHS = DAG.getBitcast(MVT::v16i1, Op.getOperand(2));
+ SDValue Res = DAG.getNode(ISD::XOR, dl, MVT::v16i1, LHS, RHS);
+ // Invert result for the not.
+ Res = DAG.getNode(ISD::XOR, dl, MVT::v16i1, Res,
+ DAG.getConstant(1, dl, MVT::v16i1));
+ return DAG.getBitcast(MVT::i16, Res);
+ }
+
case Intrinsic::x86_sse42_pcmpistria128:
case Intrinsic::x86_sse42_pcmpestria128:
case Intrinsic::x86_sse42_pcmpistric128:
@@ -19603,6 +20055,28 @@ static SDValue LowerINTRINSIC_WO_CHAIN(SDValue Op, const X86Subtarget &Subtarget
}
}
+static SDValue getAVX2GatherNode(unsigned Opc, SDValue Op, SelectionDAG &DAG,
+ SDValue Src, SDValue Mask, SDValue Base,
+ SDValue Index, SDValue ScaleOp, SDValue Chain,
+ const X86Subtarget &Subtarget) {
+ SDLoc dl(Op);
+ auto *C = cast<ConstantSDNode>(ScaleOp);
+ SDValue Scale = DAG.getTargetConstant(C->getZExtValue(), dl, MVT::i8);
+ EVT MaskVT = Mask.getValueType();
+ SDVTList VTs = DAG.getVTList(Op.getValueType(), MaskVT, MVT::Other);
+ SDValue Disp = DAG.getTargetConstant(0, dl, MVT::i32);
+ SDValue Segment = DAG.getRegister(0, MVT::i32);
+ // If source is undef or we know it won't be used, use a zero vector
+ // to break register dependency.
+ // TODO: use undef instead and let ExecutionDepsFix deal with it?
+ if (Src.isUndef() || ISD::isBuildVectorAllOnes(Mask.getNode()))
+ Src = getZeroVector(Op.getSimpleValueType(), Subtarget, DAG, dl);
+ SDValue Ops[] = {Src, Base, Scale, Index, Disp, Segment, Mask, Chain};
+ SDNode *Res = DAG.getMachineNode(Opc, dl, VTs, Ops);
+ SDValue RetOps[] = { SDValue(Res, 0), SDValue(Res, 2) };
+ return DAG.getMergeValues(RetOps, dl);
+}
+
static SDValue getGatherNode(unsigned Opc, SDValue Op, SelectionDAG &DAG,
SDValue Src, SDValue Mask, SDValue Base,
SDValue Index, SDValue ScaleOp, SDValue Chain,
@@ -19617,7 +20091,10 @@ static SDValue getGatherNode(unsigned Opc, SDValue Op, SelectionDAG &DAG,
SDVTList VTs = DAG.getVTList(Op.getValueType(), MaskVT, MVT::Other);
SDValue Disp = DAG.getTargetConstant(0, dl, MVT::i32);
SDValue Segment = DAG.getRegister(0, MVT::i32);
- if (Src.isUndef())
+ // If source is undef or we know it won't be used, use a zero vector
+ // to break register dependency.
+ // TODO: use undef instead and let ExecutionDepsFix deal with it?
+ if (Src.isUndef() || ISD::isBuildVectorAllOnes(VMask.getNode()))
Src = getZeroVector(Op.getSimpleValueType(), Subtarget, DAG, dl);
SDValue Ops[] = {Src, VMask, Base, Scale, Index, Disp, Segment, Chain};
SDNode *Res = DAG.getMachineNode(Opc, dl, VTs, Ops);
@@ -19656,7 +20133,6 @@ static SDValue getPrefetchNode(unsigned Opc, SDValue Op, SelectionDAG &DAG,
MVT MaskVT =
MVT::getVectorVT(MVT::i1, Index.getSimpleValueType().getVectorNumElements());
SDValue VMask = getMaskNode(Mask, MaskVT, Subtarget, DAG, dl);
- //SDVTList VTs = DAG.getVTList(MVT::Other);
SDValue Ops[] = {VMask, Base, Scale, Index, Disp, Segment, Chain};
SDNode *Res = DAG.getMachineNode(Opc, dl, MVT::Other, Ops);
return SDValue(Res, 0);
@@ -19928,6 +20404,16 @@ static SDValue LowerINTRINSIC_W_CHAIN(SDValue Op, const X86Subtarget &Subtarget,
return DAG.getNode(ISD::MERGE_VALUES, dl, Op->getVTList(), Result, isValid,
SDValue(Result.getNode(), 2));
}
+ case GATHER_AVX2: {
+ SDValue Chain = Op.getOperand(0);
+ SDValue Src = Op.getOperand(2);
+ SDValue Base = Op.getOperand(3);
+ SDValue Index = Op.getOperand(4);
+ SDValue Mask = Op.getOperand(5);
+ SDValue Scale = Op.getOperand(6);
+ return getAVX2GatherNode(IntrData->Opc0, Op, DAG, Src, Mask, Base, Index,
+ Scale, Chain, Subtarget);
+ }
case GATHER: {
// gather(v1, mask, index, base, scale);
SDValue Chain = Op.getOperand(0);
@@ -19953,8 +20439,9 @@ static SDValue LowerINTRINSIC_W_CHAIN(SDValue Op, const X86Subtarget &Subtarget,
case PREFETCH: {
SDValue Hint = Op.getOperand(6);
unsigned HintVal = cast<ConstantSDNode>(Hint)->getZExtValue();
- assert(HintVal < 2 && "Wrong prefetch hint in intrinsic: should be 0 or 1");
- unsigned Opcode = (HintVal ? IntrData->Opc1 : IntrData->Opc0);
+ assert((HintVal == 2 || HintVal == 3) &&
+ "Wrong prefetch hint in intrinsic: should be 2 or 3");
+ unsigned Opcode = (HintVal == 2 ? IntrData->Opc1 : IntrData->Opc0);
SDValue Chain = Op.getOperand(0);
SDValue Mask = Op.getOperand(2);
SDValue Index = Op.getOperand(3);
@@ -20368,7 +20855,7 @@ SDValue X86TargetLowering::LowerINIT_TRAMPOLINE(SDValue Op,
// Check that ECX wasn't needed by an 'inreg' parameter.
FunctionType *FTy = Func->getFunctionType();
- const AttributeSet &Attrs = Func->getAttributes();
+ const AttributeList &Attrs = Func->getAttributes();
if (!Attrs.isEmpty() && !Func->isVarArg()) {
unsigned InRegCount = 0;
@@ -20802,9 +21289,10 @@ static SDValue Lower512IntArith(SDValue Op, SelectionDAG &DAG) {
DAG.getNode(Op.getOpcode(), dl, NewVT, LHS2, RHS2));
}
-static SDValue LowerADD(SDValue Op, SelectionDAG &DAG) {
- if (Op.getValueType() == MVT::i1)
- return DAG.getNode(ISD::XOR, SDLoc(Op), Op.getValueType(),
+static SDValue LowerADD_SUB(SDValue Op, SelectionDAG &DAG) {
+ MVT VT = Op.getSimpleValueType();
+ if (VT.getScalarType() == MVT::i1)
+ return DAG.getNode(ISD::XOR, SDLoc(Op), VT,
Op.getOperand(0), Op.getOperand(1));
assert(Op.getSimpleValueType().is256BitVector() &&
Op.getSimpleValueType().isInteger() &&
@@ -20812,14 +21300,23 @@ static SDValue LowerADD(SDValue Op, SelectionDAG &DAG) {
return Lower256IntArith(Op, DAG);
}
-static SDValue LowerSUB(SDValue Op, SelectionDAG &DAG) {
- if (Op.getValueType() == MVT::i1)
- return DAG.getNode(ISD::XOR, SDLoc(Op), Op.getValueType(),
- Op.getOperand(0), Op.getOperand(1));
+static SDValue LowerABS(SDValue Op, SelectionDAG &DAG) {
assert(Op.getSimpleValueType().is256BitVector() &&
Op.getSimpleValueType().isInteger() &&
"Only handle AVX 256-bit vector integer operation");
- return Lower256IntArith(Op, DAG);
+ MVT VT = Op.getSimpleValueType();
+ unsigned NumElems = VT.getVectorNumElements();
+
+ SDLoc dl(Op);
+ SDValue Src = Op.getOperand(0);
+ SDValue Lo = extract128BitVector(Src, 0, DAG, dl);
+ SDValue Hi = extract128BitVector(Src, NumElems / 2, DAG, dl);
+
+ MVT EltVT = VT.getVectorElementType();
+ MVT NewVT = MVT::getVectorVT(EltVT, NumElems / 2);
+ return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT,
+ DAG.getNode(ISD::ABS, dl, NewVT, Lo),
+ DAG.getNode(ISD::ABS, dl, NewVT, Hi));
}
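LowerADD_SUB and (below) LowerMUL special-case i1 because arithmetic on a single bit is arithmetic mod 2: addition and subtraction both reduce to XOR, and multiplication reduces to AND. A quick exhaustive check of those identities (illustrative only):

    #include <cassert>

    int main() {
      for (unsigned a = 0; a < 2; ++a)
        for (unsigned b = 0; b < 2; ++b) {
          assert(((a + b) & 1) == (a ^ b)); // i1 add == xor
          assert(((a - b) & 1) == (a ^ b)); // i1 sub == xor
          assert(((a * b) & 1) == (a & b)); // i1 mul == and
        }
      return 0;
    }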
static SDValue LowerMINMAX(SDValue Op, SelectionDAG &DAG) {
@@ -20834,7 +21331,7 @@ static SDValue LowerMUL(SDValue Op, const X86Subtarget &Subtarget,
SDLoc dl(Op);
MVT VT = Op.getSimpleValueType();
- if (VT == MVT::i1)
+ if (VT.getScalarType() == MVT::i1)
return DAG.getNode(ISD::AND, dl, VT, Op.getOperand(0), Op.getOperand(1));
// Decompose 256-bit ops into smaller 128-bit ops.
@@ -20874,8 +21371,8 @@ static SDValue LowerMUL(SDValue Op, const X86Subtarget &Subtarget,
// Extract the lo parts and sign extend to i16
SDValue ALo, BLo;
if (Subtarget.hasSSE41()) {
- ALo = DAG.getNode(X86ISD::VSEXT, dl, ExVT, A);
- BLo = DAG.getNode(X86ISD::VSEXT, dl, ExVT, B);
+ ALo = DAG.getSignExtendVectorInReg(A, dl, ExVT);
+ BLo = DAG.getSignExtendVectorInReg(B, dl, ExVT);
} else {
const int ShufMask[] = {-1, 0, -1, 1, -1, 2, -1, 3,
-1, 4, -1, 5, -1, 6, -1, 7};
@@ -20894,8 +21391,8 @@ static SDValue LowerMUL(SDValue Op, const X86Subtarget &Subtarget,
-1, -1, -1, -1, -1, -1, -1, -1};
AHi = DAG.getVectorShuffle(VT, dl, A, A, ShufMask);
BHi = DAG.getVectorShuffle(VT, dl, B, B, ShufMask);
- AHi = DAG.getNode(X86ISD::VSEXT, dl, ExVT, AHi);
- BHi = DAG.getNode(X86ISD::VSEXT, dl, ExVT, BHi);
+ AHi = DAG.getSignExtendVectorInReg(AHi, dl, ExVT);
+ BHi = DAG.getSignExtendVectorInReg(BHi, dl, ExVT);
} else {
const int ShufMask[] = {-1, 8, -1, 9, -1, 10, -1, 11,
-1, 12, -1, 13, -1, 14, -1, 15};
@@ -21056,8 +21553,8 @@ static SDValue LowerMULH(SDValue Op, const X86Subtarget &Subtarget,
DAG.getVectorShuffle(MVT::v16i16, dl, Lo, Hi, HiMask));
}
- SDValue ExA = DAG.getNode(ExSSE41, dl, MVT::v16i16, A);
- SDValue ExB = DAG.getNode(ExSSE41, dl, MVT::v16i16, B);
+ SDValue ExA = getExtendInVec(ExSSE41, dl, MVT::v16i16, A, DAG);
+ SDValue ExB = getExtendInVec(ExSSE41, dl, MVT::v16i16, B, DAG);
SDValue Mul = DAG.getNode(ISD::MUL, dl, MVT::v16i16, ExA, ExB);
SDValue MulH = DAG.getNode(ISD::SRL, dl, MVT::v16i16, Mul,
DAG.getConstant(8, dl, MVT::v16i16));
@@ -21073,8 +21570,8 @@ static SDValue LowerMULH(SDValue Op, const X86Subtarget &Subtarget,
// Extract the lo parts and zero/sign extend to i16.
SDValue ALo, BLo;
if (Subtarget.hasSSE41()) {
- ALo = DAG.getNode(ExSSE41, dl, ExVT, A);
- BLo = DAG.getNode(ExSSE41, dl, ExVT, B);
+ ALo = getExtendInVec(ExSSE41, dl, ExVT, A, DAG);
+ BLo = getExtendInVec(ExSSE41, dl, ExVT, B, DAG);
} else {
const int ShufMask[] = {-1, 0, -1, 1, -1, 2, -1, 3,
-1, 4, -1, 5, -1, 6, -1, 7};
@@ -21093,8 +21590,8 @@ static SDValue LowerMULH(SDValue Op, const X86Subtarget &Subtarget,
-1, -1, -1, -1, -1, -1, -1, -1};
AHi = DAG.getVectorShuffle(VT, dl, A, A, ShufMask);
BHi = DAG.getVectorShuffle(VT, dl, B, B, ShufMask);
- AHi = DAG.getNode(ExSSE41, dl, ExVT, AHi);
- BHi = DAG.getNode(ExSSE41, dl, ExVT, BHi);
+ AHi = getExtendInVec(ExSSE41, dl, ExVT, AHi, DAG);
+ BHi = getExtendInVec(ExSSE41, dl, ExVT, BHi, DAG);
} else {
const int ShufMask[] = {-1, 8, -1, 9, -1, 10, -1, 11,
-1, 12, -1, 13, -1, 14, -1, 15};
@@ -21148,8 +21645,8 @@ SDValue X86TargetLowering::LowerWin64_i128OP(SDValue Op, SelectionDAG &DAG) cons
MachinePointerInfo(), /* Alignment = */ 16);
Type *ArgTy = ArgVT.getTypeForEVT(*DAG.getContext());
Entry.Ty = PointerType::get(ArgTy,0);
- Entry.isSExt = false;
- Entry.isZExt = false;
+ Entry.IsSExt = false;
+ Entry.IsZExt = false;
Args.push_back(Entry);
}
@@ -21157,11 +21654,15 @@ SDValue X86TargetLowering::LowerWin64_i128OP(SDValue Op, SelectionDAG &DAG) cons
getPointerTy(DAG.getDataLayout()));
TargetLowering::CallLoweringInfo CLI(DAG);
- CLI.setDebugLoc(dl).setChain(InChain)
- .setCallee(getLibcallCallingConv(LC),
- static_cast<EVT>(MVT::v2i64).getTypeForEVT(*DAG.getContext()),
- Callee, std::move(Args))
- .setInRegister().setSExtResult(isSigned).setZExtResult(!isSigned);
+ CLI.setDebugLoc(dl)
+ .setChain(InChain)
+ .setLibCallee(
+ getLibcallCallingConv(LC),
+ static_cast<EVT>(MVT::v2i64).getTypeForEVT(*DAG.getContext()), Callee,
+ std::move(Args))
+ .setInRegister()
+ .setSExtResult(isSigned)
+ .setZExtResult(!isSigned);
std::pair<SDValue, SDValue> CallInfo = LowerCallTo(CLI);
return DAG.getBitcast(VT, CallInfo.first);
@@ -21269,15 +21770,15 @@ static bool SupportedVectorShiftWithImm(MVT VT, const X86Subtarget &Subtarget,
if (VT.getScalarSizeInBits() < 16)
return false;
- if (VT.is512BitVector() &&
+ if (VT.is512BitVector() && Subtarget.hasAVX512() &&
(VT.getScalarSizeInBits() > 16 || Subtarget.hasBWI()))
return true;
- bool LShift = VT.is128BitVector() ||
- (VT.is256BitVector() && Subtarget.hasInt256());
+ bool LShift = (VT.is128BitVector() && Subtarget.hasSSE2()) ||
+ (VT.is256BitVector() && Subtarget.hasInt256());
- bool AShift = LShift && (Subtarget.hasVLX() ||
- (VT != MVT::v2i64 && VT != MVT::v4i64));
+ bool AShift = LShift && (Subtarget.hasAVX512() ||
+ (VT != MVT::v2i64 && VT != MVT::v4i64));
return (Opcode == ISD::SRA) ? AShift : LShift;
}
@@ -21301,7 +21802,7 @@ static bool SupportedVectorVarShift(MVT VT, const X86Subtarget &Subtarget,
if (VT.getScalarSizeInBits() == 16 && !Subtarget.hasBWI())
return false;
- if (VT.is512BitVector() || Subtarget.hasVLX())
+ if (Subtarget.hasAVX512())
return true;
bool LShift = VT.is128BitVector() || VT.is256BitVector();
@@ -22062,10 +22563,10 @@ static SDValue LowerXALUO(SDValue Op, SelectionDAG &DAG) {
// An add of one will be selected as an INC. Note that INC doesn't
// set CF, so we can't do this for UADDO.
if (isOneConstant(RHS)) {
- BaseOp = X86ISD::INC;
- Cond = X86::COND_O;
- break;
- }
+ BaseOp = X86ISD::INC;
+ Cond = X86::COND_O;
+ break;
+ }
BaseOp = X86ISD::ADD;
Cond = X86::COND_O;
break;
@@ -22077,10 +22578,10 @@ static SDValue LowerXALUO(SDValue Op, SelectionDAG &DAG) {
// A subtract of one will be selected as a DEC. Note that DEC doesn't
// set CF, so we can't do this for USUBO.
if (isOneConstant(RHS)) {
- BaseOp = X86ISD::DEC;
- Cond = X86::COND_O;
- break;
- }
+ BaseOp = X86ISD::DEC;
+ Cond = X86::COND_O;
+ break;
+ }
BaseOp = X86ISD::SUB;
Cond = X86::COND_O;
break;
@@ -22470,7 +22971,7 @@ static SDValue LowerVectorCTPOPInRegLUT(SDValue Op, const SDLoc &DL,
// index into a in-register pre-computed pop count table. We then split up the
// input vector in two new ones: (1) a vector with only the shifted-right
// higher nibbles for each byte and (2) a vector with the lower nibbles (and
- // masked out higher ones) for each byte. PSHUB is used separately with both
+ // masked out higher ones) for each byte. PSHUFB is used separately with both
// to index the in-register table. Next, both are added and the result is an
// i8 vector where each element contains the pop count for its input byte.
//
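A scalar model of the PSHUFB-based popcount this comment describes (illustrative sketch; the patch performs the same steps on whole vectors):

    #include <cstdint>

    // Byte popcount via a 16-entry nibble table: PSHUFB performs the two
    // table lookups lane-by-lane across the vector, PADDB the final add.
    uint8_t popCountByteLUT(uint8_t V) {
      static const uint8_t LUT[16] = {0, 1, 1, 2, 1, 2, 2, 3,
                                      1, 2, 2, 3, 2, 3, 3, 4};
      uint8_t LoNibble = V & 0xF; // lower nibble, higher bits masked out
      uint8_t HiNibble = V >> 4;  // shifted-right higher nibble
      return LUT[LoNibble] + LUT[HiNibble];
    }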
@@ -22867,8 +23368,8 @@ static SDValue LowerFSINCOS(SDValue Op, const X86Subtarget &Subtarget,
Entry.Node = Arg;
Entry.Ty = ArgTy;
- Entry.isSExt = false;
- Entry.isZExt = false;
+ Entry.IsSExt = false;
+ Entry.IsZExt = false;
Args.push_back(Entry);
bool isF64 = ArgVT == MVT::f64;
@@ -22885,8 +23386,9 @@ static SDValue LowerFSINCOS(SDValue Op, const X86Subtarget &Subtarget,
: (Type*)VectorType::get(ArgTy, 4);
TargetLowering::CallLoweringInfo CLI(DAG);
- CLI.setDebugLoc(dl).setChain(DAG.getEntryNode())
- .setCallee(CallingConv::C, RetTy, Callee, std::move(Args));
+ CLI.setDebugLoc(dl)
+ .setChain(DAG.getEntryNode())
+ .setLibCallee(CallingConv::C, RetTy, Callee, std::move(Args));
std::pair<SDValue, SDValue> CallResult = TLI.LowerCallTo(CLI);
@@ -23086,7 +23588,7 @@ static SDValue LowerMLOAD(SDValue Op, const X86Subtarget &Subtarget,
// Mask element has to be i1.
MVT MaskEltTy = Mask.getSimpleValueType().getScalarType();
assert((MaskEltTy == MVT::i1 || VT.getVectorNumElements() <= 4) &&
- "We handle 4x32, 4x64 and 2x64 vectors only in this casse");
+ "We handle 4x32, 4x64 and 2x64 vectors only in this case");
MVT WideMaskVT = MVT::getVectorVT(MaskEltTy, NumEltsInWideVec);
@@ -23142,7 +23644,7 @@ static SDValue LowerMSTORE(SDValue Op, const X86Subtarget &Subtarget,
// Mask element has to be i1.
MVT MaskEltTy = Mask.getSimpleValueType().getScalarType();
assert((MaskEltTy == MVT::i1 || VT.getVectorNumElements() <= 4) &&
- "We handle 4x32, 4x64 and 2x64 vectors only in this casse");
+ "We handle 4x32, 4x64 and 2x64 vectors only in this case");
MVT WideMaskVT = MVT::getVectorVT(MaskEltTy, NumEltsInWideVec);
@@ -23202,7 +23704,7 @@ static SDValue LowerMGATHER(SDValue Op, const X86Subtarget &Subtarget,
Mask = ExtendToType(Mask, ExtMaskVT, DAG, true);
Mask = DAG.getNode(ISD::TRUNCATE, dl, MaskBitVT, Mask);
- // The pass-thru value
+ // The pass-through value
MVT NewVT = MVT::getVectorVT(VT.getScalarType(), NumElts);
Src0 = ExtendToType(Src0, NewVT, DAG);
@@ -23284,7 +23786,7 @@ SDValue X86TargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const {
case ISD::INSERT_VECTOR_ELT: return LowerINSERT_VECTOR_ELT(Op, DAG);
case ISD::EXTRACT_SUBVECTOR: return LowerEXTRACT_SUBVECTOR(Op,Subtarget,DAG);
case ISD::INSERT_SUBVECTOR: return LowerINSERT_SUBVECTOR(Op, Subtarget,DAG);
- case ISD::SCALAR_TO_VECTOR: return LowerSCALAR_TO_VECTOR(Op, DAG);
+ case ISD::SCALAR_TO_VECTOR: return LowerSCALAR_TO_VECTOR(Op, Subtarget,DAG);
case ISD::ConstantPool: return LowerConstantPool(Op, DAG);
case ISD::GlobalAddress: return LowerGlobalAddress(Op, DAG);
case ISD::GlobalTLSAddress: return LowerGlobalTLSAddress(Op, DAG);
@@ -23303,7 +23805,7 @@ SDValue X86TargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const {
case ISD::SIGN_EXTEND_VECTOR_INREG:
return LowerEXTEND_VECTOR_INREG(Op, Subtarget, DAG);
case ISD::FP_TO_SINT:
- case ISD::FP_TO_UINT: return LowerFP_TO_INT(Op, Subtarget, DAG);
+ case ISD::FP_TO_UINT: return LowerFP_TO_INT(Op, DAG);
case ISD::FP_EXTEND: return LowerFP_EXTEND(Op, DAG);
case ISD::LOAD: return LowerExtendedLoad(Op, Subtarget, DAG);
case ISD::FABS:
@@ -23360,12 +23862,13 @@ SDValue X86TargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const {
case ISD::ADDE:
case ISD::SUBC:
case ISD::SUBE: return LowerADDC_ADDE_SUBC_SUBE(Op, DAG);
- case ISD::ADD: return LowerADD(Op, DAG);
- case ISD::SUB: return LowerSUB(Op, DAG);
+ case ISD::ADD:
+ case ISD::SUB: return LowerADD_SUB(Op, DAG);
case ISD::SMAX:
case ISD::SMIN:
case ISD::UMAX:
case ISD::UMIN: return LowerMINMAX(Op, DAG);
+ case ISD::ABS: return LowerABS(Op, DAG);
case ISD::FSINCOS: return LowerFSINCOS(Op, Subtarget, DAG);
case ISD::MLOAD: return LowerMLOAD(Op, Subtarget, DAG);
case ISD::MSTORE: return LowerMSTORE(Op, Subtarget, DAG);
@@ -23768,7 +24271,6 @@ const char *X86TargetLowering::getTargetNodeName(unsigned Opcode) const {
case X86ISD::INSERTPS: return "X86ISD::INSERTPS";
case X86ISD::PINSRB: return "X86ISD::PINSRB";
case X86ISD::PINSRW: return "X86ISD::PINSRW";
- case X86ISD::MMX_PINSRW: return "X86ISD::MMX_PINSRW";
case X86ISD::PSHUFB: return "X86ISD::PSHUFB";
case X86ISD::ANDNP: return "X86ISD::ANDNP";
case X86ISD::BLENDI: return "X86ISD::BLENDI";
@@ -23779,16 +24281,19 @@ const char *X86TargetLowering::getTargetNodeName(unsigned Opcode) const {
case X86ISD::HSUB: return "X86ISD::HSUB";
case X86ISD::FHADD: return "X86ISD::FHADD";
case X86ISD::FHSUB: return "X86ISD::FHSUB";
- case X86ISD::ABS: return "X86ISD::ABS";
case X86ISD::CONFLICT: return "X86ISD::CONFLICT";
case X86ISD::FMAX: return "X86ISD::FMAX";
+ case X86ISD::FMAXS: return "X86ISD::FMAXS";
case X86ISD::FMAX_RND: return "X86ISD::FMAX_RND";
+ case X86ISD::FMAXS_RND: return "X86ISD::FMAXS_RND";
case X86ISD::FMIN: return "X86ISD::FMIN";
+ case X86ISD::FMINS: return "X86ISD::FMINS";
case X86ISD::FMIN_RND: return "X86ISD::FMIN_RND";
+ case X86ISD::FMINS_RND: return "X86ISD::FMINS_RND";
case X86ISD::FMAXC: return "X86ISD::FMAXC";
case X86ISD::FMINC: return "X86ISD::FMINC";
case X86ISD::FRSQRT: return "X86ISD::FRSQRT";
- case X86ISD::FRSQRTS: return "X86ISD::FRSQRTS";
+ case X86ISD::FRSQRTS: return "X86ISD::FRSQRTS";
case X86ISD::FRCP: return "X86ISD::FRCP";
case X86ISD::FRCPS: return "X86ISD::FRCPS";
case X86ISD::EXTRQI: return "X86ISD::EXTRQI";
@@ -23827,7 +24332,6 @@ const char *X86TargetLowering::getTargetNodeName(unsigned Opcode) const {
case X86ISD::VTRUNCSTOREUS: return "X86ISD::VTRUNCSTOREUS";
case X86ISD::VMTRUNCSTORES: return "X86ISD::VMTRUNCSTORES";
case X86ISD::VMTRUNCSTOREUS: return "X86ISD::VMTRUNCSTOREUS";
- case X86ISD::VINSERT: return "X86ISD::VINSERT";
case X86ISD::VFPEXT: return "X86ISD::VFPEXT";
case X86ISD::VFPEXT_RND: return "X86ISD::VFPEXT_RND";
case X86ISD::VFPEXTS_RND: return "X86ISD::VFPEXTS_RND";
@@ -23876,6 +24380,8 @@ const char *X86TargetLowering::getTargetNodeName(unsigned Opcode) const {
case X86ISD::TESTNM: return "X86ISD::TESTNM";
case X86ISD::KORTEST: return "X86ISD::KORTEST";
case X86ISD::KTEST: return "X86ISD::KTEST";
+ case X86ISD::KSHIFTL: return "X86ISD::KSHIFTL";
+ case X86ISD::KSHIFTR: return "X86ISD::KSHIFTR";
case X86ISD::PACKSS: return "X86ISD::PACKSS";
case X86ISD::PACKUS: return "X86ISD::PACKUS";
case X86ISD::PALIGNR: return "X86ISD::PALIGNR";
@@ -23976,9 +24482,13 @@ const char *X86TargetLowering::getTargetNodeName(unsigned Opcode) const {
case X86ISD::RSQRT28: return "X86ISD::RSQRT28";
case X86ISD::RSQRT28S: return "X86ISD::RSQRT28S";
case X86ISD::FADD_RND: return "X86ISD::FADD_RND";
+ case X86ISD::FADDS_RND: return "X86ISD::FADDS_RND";
case X86ISD::FSUB_RND: return "X86ISD::FSUB_RND";
+ case X86ISD::FSUBS_RND: return "X86ISD::FSUBS_RND";
case X86ISD::FMUL_RND: return "X86ISD::FMUL_RND";
+ case X86ISD::FMULS_RND: return "X86ISD::FMULS_RND";
case X86ISD::FDIV_RND: return "X86ISD::FDIV_RND";
+ case X86ISD::FDIVS_RND: return "X86ISD::FDIVS_RND";
case X86ISD::FSQRT_RND: return "X86ISD::FSQRT_RND";
case X86ISD::FSQRTS_RND: return "X86ISD::FSQRTS_RND";
case X86ISD::FGETEXP_RND: return "X86ISD::FGETEXP_RND";
@@ -24302,7 +24812,7 @@ static MachineBasicBlock *emitPCMPSTRM(MachineInstr &MI, MachineBasicBlock *BB,
for (unsigned i = 1; i < NumArgs; ++i) {
MachineOperand &Op = MI.getOperand(i);
if (!(Op.isReg() && Op.isImplicit()))
- MIB.addOperand(Op);
+ MIB.add(Op);
}
if (MI.hasOneMemOperand())
MIB->setMemRefs(MI.memoperands_begin(), MI.memoperands_end());
@@ -24338,7 +24848,7 @@ static MachineBasicBlock *emitPCMPSTRI(MachineInstr &MI, MachineBasicBlock *BB,
for (unsigned i = 1; i < NumArgs; ++i) {
MachineOperand &Op = MI.getOperand(i);
if (!(Op.isReg() && Op.isImplicit()))
- MIB.addOperand(Op);
+ MIB.add(Op);
}
if (MI.hasOneMemOperand())
MIB->setMemRefs(MI.memoperands_begin(), MI.memoperands_end());
@@ -24398,7 +24908,7 @@ static MachineBasicBlock *emitMonitor(MachineInstr &MI, MachineBasicBlock *BB,
unsigned MemReg = Subtarget.is64Bit() ? X86::RAX : X86::EAX;
MachineInstrBuilder MIB = BuildMI(*BB, MI, dl, TII->get(MemOpc), MemReg);
for (int i = 0; i < X86::AddrNumOperands; ++i)
- MIB.addOperand(MI.getOperand(i));
+ MIB.add(MI.getOperand(i));
unsigned ValOps = X86::AddrNumOperands;
BuildMI(*BB, MI, dl, TII->get(TargetOpcode::COPY), X86::ECX)
@@ -24413,6 +24923,26 @@ static MachineBasicBlock *emitMonitor(MachineInstr &MI, MachineBasicBlock *BB,
return BB;
}
+static MachineBasicBlock *emitClzero(MachineInstr *MI, MachineBasicBlock *BB,
+ const X86Subtarget &Subtarget) {
+ DebugLoc dl = MI->getDebugLoc();
+ const TargetInstrInfo *TII = Subtarget.getInstrInfo();
+ // Address into RAX/EAX
+ unsigned MemOpc = Subtarget.is64Bit() ? X86::LEA64r : X86::LEA32r;
+ unsigned MemReg = Subtarget.is64Bit() ? X86::RAX : X86::EAX;
+ MachineInstrBuilder MIB = BuildMI(*BB, MI, dl, TII->get(MemOpc), MemReg);
+ for (int i = 0; i < X86::AddrNumOperands; ++i)
+ MIB.add(MI->getOperand(i));
+
+ // The instruction doesn't actually take any operands though.
+ BuildMI(*BB, MI, dl, TII->get(X86::CLZEROr));
+
+ MI->eraseFromParent(); // The pseudo is gone now.
+ return BB;
+}
+
+
+
MachineBasicBlock *
X86TargetLowering::EmitVAARG64WithCustomInserter(MachineInstr &MI,
MachineBasicBlock *MBB) const {
@@ -24536,12 +25066,12 @@ X86TargetLowering::EmitVAARG64WithCustomInserter(MachineInstr &MI,
// Load the offset value into a register
OffsetReg = MRI.createVirtualRegister(OffsetRegClass);
BuildMI(thisMBB, DL, TII->get(X86::MOV32rm), OffsetReg)
- .addOperand(Base)
- .addOperand(Scale)
- .addOperand(Index)
- .addDisp(Disp, UseFPOffset ? 4 : 0)
- .addOperand(Segment)
- .setMemRefs(MMOBegin, MMOEnd);
+ .add(Base)
+ .add(Scale)
+ .add(Index)
+ .addDisp(Disp, UseFPOffset ? 4 : 0)
+ .add(Segment)
+ .setMemRefs(MMOBegin, MMOEnd);
// Check if there is enough room left to pull this argument.
BuildMI(thisMBB, DL, TII->get(X86::CMP32ri))
@@ -24561,12 +25091,12 @@ X86TargetLowering::EmitVAARG64WithCustomInserter(MachineInstr &MI,
// Read the reg_save_area address.
unsigned RegSaveReg = MRI.createVirtualRegister(AddrRegClass);
BuildMI(offsetMBB, DL, TII->get(X86::MOV64rm), RegSaveReg)
- .addOperand(Base)
- .addOperand(Scale)
- .addOperand(Index)
- .addDisp(Disp, 16)
- .addOperand(Segment)
- .setMemRefs(MMOBegin, MMOEnd);
+ .add(Base)
+ .add(Scale)
+ .add(Index)
+ .addDisp(Disp, 16)
+ .add(Segment)
+ .setMemRefs(MMOBegin, MMOEnd);
// Zero-extend the offset
unsigned OffsetReg64 = MRI.createVirtualRegister(AddrRegClass);
@@ -24588,13 +25118,13 @@ X86TargetLowering::EmitVAARG64WithCustomInserter(MachineInstr &MI,
// Store it back into the va_list.
BuildMI(offsetMBB, DL, TII->get(X86::MOV32mr))
- .addOperand(Base)
- .addOperand(Scale)
- .addOperand(Index)
- .addDisp(Disp, UseFPOffset ? 4 : 0)
- .addOperand(Segment)
- .addReg(NextOffsetReg)
- .setMemRefs(MMOBegin, MMOEnd);
+ .add(Base)
+ .add(Scale)
+ .add(Index)
+ .addDisp(Disp, UseFPOffset ? 4 : 0)
+ .add(Segment)
+ .addReg(NextOffsetReg)
+ .setMemRefs(MMOBegin, MMOEnd);
// Jump to endMBB
BuildMI(offsetMBB, DL, TII->get(X86::JMP_1))
@@ -24608,12 +25138,12 @@ X86TargetLowering::EmitVAARG64WithCustomInserter(MachineInstr &MI,
// Load the overflow_area address into a register.
unsigned OverflowAddrReg = MRI.createVirtualRegister(AddrRegClass);
BuildMI(overflowMBB, DL, TII->get(X86::MOV64rm), OverflowAddrReg)
- .addOperand(Base)
- .addOperand(Scale)
- .addOperand(Index)
- .addDisp(Disp, 8)
- .addOperand(Segment)
- .setMemRefs(MMOBegin, MMOEnd);
+ .add(Base)
+ .add(Scale)
+ .add(Index)
+ .addDisp(Disp, 8)
+ .add(Segment)
+ .setMemRefs(MMOBegin, MMOEnd);
// If we need to align it, do so. Otherwise, just copy the address
// to OverflowDestReg.
@@ -24644,13 +25174,13 @@ X86TargetLowering::EmitVAARG64WithCustomInserter(MachineInstr &MI,
// Store the new overflow address.
BuildMI(overflowMBB, DL, TII->get(X86::MOV64mr))
- .addOperand(Base)
- .addOperand(Scale)
- .addOperand(Index)
- .addDisp(Disp, 8)
- .addOperand(Segment)
- .addReg(NextAddrReg)
- .setMemRefs(MMOBegin, MMOEnd);
+ .add(Base)
+ .add(Scale)
+ .add(Index)
+ .addDisp(Disp, 8)
+ .add(Segment)
+ .addReg(NextAddrReg)
+ .setMemRefs(MMOBegin, MMOEnd);
// If we branched, emit the PHI to the front of endMBB.
if (offsetMBB) {
@@ -24867,7 +25397,7 @@ X86TargetLowering::EmitLoweredSelect(MachineInstr &MI,
//
// (CMOV (CMOV F, T, cc1), T, cc2)
//
- // to two successives branches. For that, we look for another CMOV as the
+ // to two successive branches. For that, we look for another CMOV as the
// following instruction.
//
// Without this, we would add a PHI between the two jumps, which ends up
@@ -25123,12 +25653,12 @@ X86TargetLowering::EmitLoweredAtomicFP(MachineInstr &MI,
// instruction using the same address operands.
if (Operand.isReg())
Operand.setIsKill(false);
- MIB.addOperand(Operand);
+ MIB.add(Operand);
}
MachineInstr *FOpMI = MIB;
MIB = BuildMI(*BB, MI, DL, TII->get(MOp));
for (int i = 0; i < X86::AddrNumOperands; ++i)
- MIB.addOperand(MI.getOperand(i));
+ MIB.add(MI.getOperand(i));
MIB.addReg(FOpMI->getOperand(0).getReg(), RegState::Kill);
MI.eraseFromParent(); // The pseudo instruction is gone now.
return BB;
@@ -25508,7 +26038,7 @@ X86TargetLowering::emitEHSjLjSetJmp(MachineInstr &MI,
if (i == X86::AddrDisp)
MIB.addDisp(MI.getOperand(MemOpndSlot + i), LabelOffset);
else
- MIB.addOperand(MI.getOperand(MemOpndSlot + i));
+ MIB.add(MI.getOperand(MemOpndSlot + i));
}
if (!UseImmLabel)
MIB.addReg(LabelReg);
@@ -25591,7 +26121,7 @@ X86TargetLowering::emitEHSjLjLongJmp(MachineInstr &MI,
// Reload FP
MIB = BuildMI(*MBB, MI, DL, TII->get(PtrLoadOpc), FP);
for (unsigned i = 0; i < X86::AddrNumOperands; ++i)
- MIB.addOperand(MI.getOperand(i));
+ MIB.add(MI.getOperand(i));
MIB.setMemRefs(MMOBegin, MMOEnd);
// Reload IP
MIB = BuildMI(*MBB, MI, DL, TII->get(PtrLoadOpc), Tmp);
@@ -25599,7 +26129,7 @@ X86TargetLowering::emitEHSjLjLongJmp(MachineInstr &MI,
if (i == X86::AddrDisp)
MIB.addDisp(MI.getOperand(i), LabelOffset);
else
- MIB.addOperand(MI.getOperand(i));
+ MIB.add(MI.getOperand(i));
}
MIB.setMemRefs(MMOBegin, MMOEnd);
// Reload SP
@@ -25608,7 +26138,7 @@ X86TargetLowering::emitEHSjLjLongJmp(MachineInstr &MI,
if (i == X86::AddrDisp)
MIB.addDisp(MI.getOperand(i), SPOffset);
else
- MIB.addOperand(MI.getOperand(i));
+ MIB.add(MI.getOperand(i));
}
MIB.setMemRefs(MMOBegin, MMOEnd);
// Jump
@@ -25625,7 +26155,7 @@ void X86TargetLowering::SetupEntryBlockForSjLj(MachineInstr &MI,
DebugLoc DL = MI.getDebugLoc();
MachineFunction *MF = MBB->getParent();
MachineRegisterInfo *MRI = &MF->getRegInfo();
- const TargetInstrInfo *TII = Subtarget.getInstrInfo();
+ const X86InstrInfo *TII = Subtarget.getInstrInfo();
MVT PVT = getPointerTy(MF->getDataLayout());
assert((PVT == MVT::i64 || PVT == MVT::i32) && "Invalid Pointer Size!");
@@ -25644,8 +26174,6 @@ void X86TargetLowering::SetupEntryBlockForSjLj(MachineInstr &MI,
VR = MRI->createVirtualRegister(TRC);
Op = (PVT == MVT::i64) ? X86::MOV64mr : X86::MOV32mr;
- /* const X86InstrInfo *XII = static_cast<const X86InstrInfo *>(TII); */
-
if (Subtarget.is64Bit())
BuildMI(*MBB, MI, DL, TII->get(X86::LEA64r), VR)
.addReg(X86::RIP)
@@ -25655,7 +26183,7 @@ void X86TargetLowering::SetupEntryBlockForSjLj(MachineInstr &MI,
.addReg(0);
else
BuildMI(*MBB, MI, DL, TII->get(X86::LEA32r), VR)
- .addReg(0) /* XII->getGlobalBaseReg(MF) */
+ .addReg(0) /* TII->getGlobalBaseReg(MF) */
.addImm(1)
.addReg(0)
.addMBB(DispatchBB, Subtarget.classifyBlockAddressReference())
@@ -25677,7 +26205,7 @@ X86TargetLowering::EmitSjLjDispatchBlock(MachineInstr &MI,
MachineFunction *MF = BB->getParent();
MachineFrameInfo &MFI = MF->getFrameInfo();
MachineRegisterInfo *MRI = &MF->getRegInfo();
- const TargetInstrInfo *TII = Subtarget.getInstrInfo();
+ const X86InstrInfo *TII = Subtarget.getInstrInfo();
int FI = MFI.getFunctionContextIndex();
// Get a mapping of the call site numbers to all of the landing pads they're
@@ -25749,9 +26277,7 @@ X86TargetLowering::EmitSjLjDispatchBlock(MachineInstr &MI,
MF->getOrCreateJumpTableInfo(getJumpTableEncoding());
unsigned MJTI = JTI->createJumpTableIndex(LPadList);
- const X86InstrInfo *XII = static_cast<const X86InstrInfo *>(TII);
- const X86RegisterInfo &RI = XII->getRegisterInfo();
-
+ const X86RegisterInfo &RI = TII->getRegisterInfo();
// Add a register mask with no preserved registers. This results in all
// registers being marked as clobbered.
if (RI.hasBasePointer(*MF)) {
@@ -25799,8 +26325,7 @@ X86TargetLowering::EmitSjLjDispatchBlock(MachineInstr &MI,
// N.B. the order the invoke BBs are processed in doesn't matter here.
SmallVector<MachineBasicBlock *, 64> MBBLPads;
- const MCPhysReg *SavedRegs =
- Subtarget.getRegisterInfo()->getCalleeSavedRegs(MF);
+ const MCPhysReg *SavedRegs = MF->getRegInfo().getCalleeSavedRegs();
for (MachineBasicBlock *MBB : InvokeBBs) {
// Remove the landing pad successor from the invoke block and replace it
// with the new dispatch block.
@@ -26033,6 +26558,11 @@ X86TargetLowering::EmitInstrWithCustomInserter(MachineInstr &MI,
return emitMonitor(MI, BB, Subtarget, X86::MONITORrrr);
case X86::MONITORX:
return emitMonitor(MI, BB, Subtarget, X86::MONITORXrrr);
+
+ // Cache line zero
+ case X86::CLZERO:
+ return emitClzero(&MI, BB, Subtarget);
+
// PKU feature
case X86::WRPKRU:
return emitWRPKRU(MI, BB, Subtarget);
@@ -26137,10 +26667,12 @@ X86TargetLowering::EmitInstrWithCustomInserter(MachineInstr &MI,
void X86TargetLowering::computeKnownBitsForTargetNode(const SDValue Op,
APInt &KnownZero,
APInt &KnownOne,
+ const APInt &DemandedElts,
const SelectionDAG &DAG,
unsigned Depth) const {
unsigned BitWidth = KnownZero.getBitWidth();
unsigned Opc = Op.getOpcode();
+ EVT VT = Op.getValueType();
assert((Opc >= ISD::BUILTIN_OP_END ||
Opc == ISD::INTRINSIC_WO_CHAIN ||
Opc == ISD::INTRINSIC_W_CHAIN ||
@@ -26167,44 +26699,91 @@ void X86TargetLowering::computeKnownBitsForTargetNode(const SDValue Op,
break;
LLVM_FALLTHROUGH;
case X86ISD::SETCC:
- KnownZero |= APInt::getHighBitsSet(BitWidth, BitWidth - 1);
+ KnownZero.setBits(1, BitWidth);
break;
case X86ISD::MOVMSK: {
unsigned NumLoBits = Op.getOperand(0).getValueType().getVectorNumElements();
- KnownZero = APInt::getHighBitsSet(BitWidth, BitWidth - NumLoBits);
+ KnownZero.setBits(NumLoBits, BitWidth);
+ break;
+ }
+ case X86ISD::VSHLI:
+ case X86ISD::VSRLI: {
+ if (auto *ShiftImm = dyn_cast<ConstantSDNode>(Op.getOperand(1))) {
+ if (ShiftImm->getAPIntValue().uge(VT.getScalarSizeInBits())) {
+ KnownZero = APInt::getAllOnesValue(BitWidth);
+ break;
+ }
+
+ DAG.computeKnownBits(Op.getOperand(0), KnownZero, KnownOne, Depth + 1);
+ unsigned ShAmt = ShiftImm->getZExtValue();
+ if (Opc == X86ISD::VSHLI) {
+ KnownZero = KnownZero << ShAmt;
+ KnownOne = KnownOne << ShAmt;
+ // Low bits are known zero.
+ KnownZero.setLowBits(ShAmt);
+ } else {
+ KnownZero = KnownZero.lshr(ShAmt);
+ KnownOne = KnownOne.lshr(ShAmt);
+ // High bits are known zero.
+ KnownZero.setHighBits(ShAmt);
+ }
+ }
break;
}
case X86ISD::VZEXT: {
SDValue N0 = Op.getOperand(0);
- unsigned NumElts = Op.getValueType().getVectorNumElements();
- unsigned InNumElts = N0.getValueType().getVectorNumElements();
- unsigned InBitWidth = N0.getValueType().getScalarSizeInBits();
+ unsigned NumElts = VT.getVectorNumElements();
+
+ EVT SrcVT = N0.getValueType();
+ unsigned InNumElts = SrcVT.getVectorNumElements();
+ unsigned InBitWidth = SrcVT.getScalarSizeInBits();
+ assert(InNumElts >= NumElts && "Illegal VZEXT input");
KnownZero = KnownOne = APInt(InBitWidth, 0);
- APInt DemandedElts = APInt::getLowBitsSet(InNumElts, NumElts);
- DAG.computeKnownBits(N0, KnownZero, KnownOne, DemandedElts, Depth + 1);
+ APInt DemandedSrcElts = APInt::getLowBitsSet(InNumElts, NumElts);
+ DAG.computeKnownBits(N0, KnownZero, KnownOne, DemandedSrcElts, Depth + 1);
KnownOne = KnownOne.zext(BitWidth);
KnownZero = KnownZero.zext(BitWidth);
- KnownZero |= APInt::getHighBitsSet(BitWidth, BitWidth - InBitWidth);
+ KnownZero.setBits(InBitWidth, BitWidth);
break;
}
}
}
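The VSHLI/VSRLI case added above follows the usual known-bits rule for constant shifts: shift both masks by the amount and mark the vacated bits as known zero. A scalar model of the left-shift direction (illustrative only):

    #include <cassert>
    #include <cstdint>

    int main() {
      uint16_t KnownZero = 0xFF00; // high byte known to be 0
      uint16_t KnownOne  = 0x0001; // low bit known to be 1
      unsigned ShAmt = 4;
      // Left shift: both masks shift left, low ShAmt bits become known zero.
      uint16_t NewZero = static_cast<uint16_t>(KnownZero << ShAmt);
      uint16_t NewOne  = static_cast<uint16_t>(KnownOne << ShAmt);
      NewZero |= (1u << ShAmt) - 1;
      assert(NewZero == 0xF00F && NewOne == 0x0010);
      return 0;
    }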
unsigned X86TargetLowering::ComputeNumSignBitsForTargetNode(
- SDValue Op, const SelectionDAG &DAG, unsigned Depth) const {
- // SETCC_CARRY sets the dest to ~0 for true or 0 for false.
- if (Op.getOpcode() == X86ISD::SETCC_CARRY)
- return Op.getScalarValueSizeInBits();
+ SDValue Op, const APInt &DemandedElts, const SelectionDAG &DAG,
+ unsigned Depth) const {
+ unsigned VTBits = Op.getScalarValueSizeInBits();
+ unsigned Opcode = Op.getOpcode();
+ switch (Opcode) {
+ case X86ISD::SETCC_CARRY:
+ // SETCC_CARRY sets the dest to ~0 for true or 0 for false.
+ return VTBits;
- if (Op.getOpcode() == X86ISD::VSEXT) {
- EVT VT = Op.getValueType();
- EVT SrcVT = Op.getOperand(0).getValueType();
- unsigned Tmp = DAG.ComputeNumSignBits(Op.getOperand(0), Depth + 1);
- Tmp += VT.getScalarSizeInBits() - SrcVT.getScalarSizeInBits();
+ case X86ISD::VSEXT: {
+ SDValue Src = Op.getOperand(0);
+ unsigned Tmp = DAG.ComputeNumSignBits(Src, Depth + 1);
+ Tmp += VTBits - Src.getScalarValueSizeInBits();
return Tmp;
}
+ case X86ISD::VSRAI: {
+ SDValue Src = Op.getOperand(0);
+ unsigned Tmp = DAG.ComputeNumSignBits(Src, Depth + 1);
+ APInt ShiftVal = cast<ConstantSDNode>(Op.getOperand(1))->getAPIntValue();
+ ShiftVal += Tmp;
+ return ShiftVal.uge(VTBits) ? VTBits : ShiftVal.getZExtValue();
+ }
+
+ case X86ISD::PCMPGT:
+ case X86ISD::PCMPEQ:
+ case X86ISD::CMPP:
+ case X86ISD::VPCOM:
+ case X86ISD::VPCOMU:
+ // Vector compares return zero/all-bits result values.
+ return VTBits;
+ }
+
// Fallback case.
return 1;
}
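The VSRAI case above uses the fact that an arithmetic right shift by ShAmt duplicates the sign bit, so the result has min(SignBits + ShAmt, VTBits) sign bits. A numeric illustration (assumes the usual two's-complement behavior of >> on signed values):

    #include <algorithm>
    #include <cassert>
    #include <cstdint>

    int main() {
      int16_t X = -2;                  // 0xFFFE: 15 sign bits
      unsigned Known = 15, ShAmt = 4, VTBits = 16;
      unsigned Result = std::min(Known + ShAmt, VTBits);
      int16_t Shifted = static_cast<int16_t>(X >> 4); // -1: 16 sign bits
      assert(Result == 16 && Shifted == -1);
      return 0;
    }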
@@ -26228,24 +26807,17 @@ bool X86TargetLowering::isGAPlusOffset(SDNode *N,
// instructions.
// TODO: Investigate sharing more of this with shuffle lowering.
static bool matchUnaryVectorShuffle(MVT MaskVT, ArrayRef<int> Mask,
- bool FloatDomain,
+ bool AllowFloatDomain, bool AllowIntDomain,
+ SDValue &V1, SDLoc &DL, SelectionDAG &DAG,
const X86Subtarget &Subtarget,
unsigned &Shuffle, MVT &SrcVT, MVT &DstVT) {
unsigned NumMaskElts = Mask.size();
unsigned MaskEltSize = MaskVT.getScalarSizeInBits();
- // Match against a VZEXT_MOVL instruction, SSE1 only supports 32-bits (MOVSS).
- if (((MaskEltSize == 32) || (MaskEltSize == 64 && Subtarget.hasSSE2())) &&
- isUndefOrEqual(Mask[0], 0) &&
- isUndefOrZeroInRange(Mask, 1, NumMaskElts - 1)) {
- Shuffle = X86ISD::VZEXT_MOVL;
- SrcVT = DstVT = !Subtarget.hasSSE2() ? MVT::v4f32 : MaskVT;
- return true;
- }
-
- // Match against a VZEXT instruction.
- // TODO: Add 256/512-bit vector support.
- if (!FloatDomain && MaskVT.is128BitVector() && Subtarget.hasSSE41()) {
+ // Match against a ZERO_EXTEND_VECTOR_INREG/VZEXT instruction.
+ // TODO: Add 512-bit vector support (split AVX512F and AVX512BW).
+ if (AllowIntDomain && ((MaskVT.is128BitVector() && Subtarget.hasSSE41()) ||
+ (MaskVT.is256BitVector() && Subtarget.hasInt256()))) {
unsigned MaxScale = 64 / MaskEltSize;
for (unsigned Scale = 2; Scale <= MaxScale; Scale *= 2) {
bool Match = true;
@@ -26255,19 +26827,32 @@ static bool matchUnaryVectorShuffle(MVT MaskVT, ArrayRef<int> Mask,
Match &= isUndefOrZeroInRange(Mask, (i * Scale) + 1, Scale - 1);
}
if (Match) {
- SrcVT = MaskVT;
+ unsigned SrcSize = std::max(128u, NumDstElts * MaskEltSize);
+ SrcVT = MVT::getVectorVT(MaskVT.getScalarType(), SrcSize / MaskEltSize);
+ if (SrcVT != MaskVT)
+ V1 = extractSubVector(V1, 0, DAG, DL, SrcSize);
DstVT = MVT::getIntegerVT(Scale * MaskEltSize);
DstVT = MVT::getVectorVT(DstVT, NumDstElts);
- Shuffle = X86ISD::VZEXT;
+ Shuffle = SrcVT != MaskVT ? unsigned(X86ISD::VZEXT)
+ : unsigned(ISD::ZERO_EXTEND_VECTOR_INREG);
return true;
}
}
}
+ // Match against a VZEXT_MOVL instruction, SSE1 only supports 32-bits (MOVSS).
+ if (((MaskEltSize == 32) || (MaskEltSize == 64 && Subtarget.hasSSE2())) &&
+ isUndefOrEqual(Mask[0], 0) &&
+ isUndefOrZeroInRange(Mask, 1, NumMaskElts - 1)) {
+ Shuffle = X86ISD::VZEXT_MOVL;
+ SrcVT = DstVT = !Subtarget.hasSSE2() ? MVT::v4f32 : MaskVT;
+ return true;
+ }
+
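The Scale loop above recognizes zero-extension shuffle masks: for scale S, lane i*S must select source element i (or be undef) and the intervening lanes must be zero or undef. A scalar model of that test, using hypothetical sentinels in place of SM_SentinelUndef/SM_SentinelZero (sketch only):

    #include <cassert>
    #include <vector>

    // -1 stands for an undef lane, -2 for a zeroed lane (hypothetical
    // stand-ins for the SM_Sentinel* values used by the real code).
    static bool isZExtMask(const std::vector<int> &Mask, unsigned Scale) {
      for (unsigned i = 0, e = Mask.size() / Scale; i != e; ++i) {
        if (Mask[i * Scale] != -1 && Mask[i * Scale] != (int)i)
          return false; // must pick the i'th source element (or undef)
        for (unsigned j = 1; j != Scale; ++j)
          if (Mask[i * Scale + j] != -1 && Mask[i * Scale + j] != -2)
            return false; // trailing lanes must be zero/undef
      }
      return true;
    }

    int main() {
      assert(isZExtMask({0, -2, 1, -2}, 2));  // v4i32 mask == zext to 2 lanes
      assert(!isZExtMask({0, 1, -2, -2}, 2)); // not an extension pattern
      return 0;
    }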
// Check if we have SSE3 which will let us use MOVDDUP etc. The
// instructions are no slower than UNPCKLPD but have the option to
// fold the input operand, even when it is an unaligned memory load.
- if (MaskVT.is128BitVector() && Subtarget.hasSSE3() && FloatDomain) {
+ if (MaskVT.is128BitVector() && Subtarget.hasSSE3() && AllowFloatDomain) {
if (isTargetShuffleEquivalent(Mask, {0, 0})) {
Shuffle = X86ISD::MOVDDUP;
SrcVT = DstVT = MVT::v2f64;
@@ -26285,7 +26870,7 @@ static bool matchUnaryVectorShuffle(MVT MaskVT, ArrayRef<int> Mask,
}
}
- if (MaskVT.is256BitVector() && FloatDomain) {
+ if (MaskVT.is256BitVector() && AllowFloatDomain) {
assert(Subtarget.hasAVX() && "AVX required for 256-bit vector shuffles");
if (isTargetShuffleEquivalent(Mask, {0, 0, 2, 2})) {
Shuffle = X86ISD::MOVDDUP;
@@ -26304,7 +26889,7 @@ static bool matchUnaryVectorShuffle(MVT MaskVT, ArrayRef<int> Mask,
}
}
- if (MaskVT.is512BitVector() && FloatDomain) {
+ if (MaskVT.is512BitVector() && AllowFloatDomain) {
assert(Subtarget.hasAVX512() &&
"AVX512 required for 512-bit vector shuffles");
if (isTargetShuffleEquivalent(Mask, {0, 0, 2, 2, 4, 4, 6, 6})) {
@@ -26343,24 +26928,26 @@ static bool matchUnaryVectorShuffle(MVT MaskVT, ArrayRef<int> Mask,
// permute instructions.
// TODO: Investigate sharing more of this with shuffle lowering.
static bool matchUnaryPermuteVectorShuffle(MVT MaskVT, ArrayRef<int> Mask,
- bool FloatDomain,
+ bool AllowFloatDomain,
+ bool AllowIntDomain,
const X86Subtarget &Subtarget,
unsigned &Shuffle, MVT &ShuffleVT,
unsigned &PermuteImm) {
unsigned NumMaskElts = Mask.size();
bool ContainsZeros = false;
- SmallBitVector Zeroable(NumMaskElts, false);
+ APInt Zeroable(NumMaskElts, false);
for (unsigned i = 0; i != NumMaskElts; ++i) {
int M = Mask[i];
- Zeroable[i] = isUndefOrZero(M);
+ if (isUndefOrZero(M))
+ Zeroable.setBit(i);
ContainsZeros |= (M == SM_SentinelZero);
}
// Attempt to match against byte/bit shifts.
// FIXME: Add 512-bit support.
- if (!FloatDomain && ((MaskVT.is128BitVector() && Subtarget.hasSSE2()) ||
- (MaskVT.is256BitVector() && Subtarget.hasAVX2()))) {
+ if (AllowIntDomain && ((MaskVT.is128BitVector() && Subtarget.hasSSE2()) ||
+ (MaskVT.is256BitVector() && Subtarget.hasAVX2()))) {
int ShiftAmt = matchVectorShuffleAsShift(ShuffleVT, Shuffle,
MaskVT.getScalarSizeInBits(), Mask,
0, Zeroable, Subtarget);
@@ -26423,19 +27010,21 @@ static bool matchUnaryPermuteVectorShuffle(MVT MaskVT, ArrayRef<int> Mask,
// AVX introduced the VPERMILPD/VPERMILPS float permutes, before then we
// had to use 2-input SHUFPD/SHUFPS shuffles (not handled here).
- if (FloatDomain && !Subtarget.hasAVX())
+ if ((AllowFloatDomain && !AllowIntDomain) && !Subtarget.hasAVX())
return false;
// Pre-AVX2 we must use float shuffles on 256-bit vectors.
- if (MaskVT.is256BitVector() && !Subtarget.hasAVX2())
- FloatDomain = true;
+ if (MaskVT.is256BitVector() && !Subtarget.hasAVX2()) {
+ AllowFloatDomain = true;
+ AllowIntDomain = false;
+ }
// Check for lane crossing permutes.
if (is128BitLaneCrossingShuffleMask(MaskEltVT, Mask)) {
// PERMPD/PERMQ permutes within a 256-bit vector (AVX2+).
if (Subtarget.hasAVX2() && MaskVT.is256BitVector() && Mask.size() == 4) {
Shuffle = X86ISD::VPERMI;
- ShuffleVT = (FloatDomain ? MVT::v4f64 : MVT::v4i64);
+ ShuffleVT = (AllowFloatDomain ? MVT::v4f64 : MVT::v4i64);
PermuteImm = getV4X86ShuffleImm(Mask);
return true;
}
@@ -26443,7 +27032,7 @@ static bool matchUnaryPermuteVectorShuffle(MVT MaskVT, ArrayRef<int> Mask,
SmallVector<int, 4> RepeatedMask;
if (is256BitLaneRepeatedShuffleMask(MVT::v8f64, Mask, RepeatedMask)) {
Shuffle = X86ISD::VPERMI;
- ShuffleVT = (FloatDomain ? MVT::v8f64 : MVT::v8i64);
+ ShuffleVT = (AllowFloatDomain ? MVT::v8f64 : MVT::v8i64);
PermuteImm = getV4X86ShuffleImm(RepeatedMask);
return true;
}
@@ -26452,7 +27041,7 @@ static bool matchUnaryPermuteVectorShuffle(MVT MaskVT, ArrayRef<int> Mask,
}
// VPERMILPD can permute with a non-repeating shuffle.
- if (FloatDomain && MaskScalarSizeInBits == 64) {
+ if (AllowFloatDomain && MaskScalarSizeInBits == 64) {
Shuffle = X86ISD::VPERMILPI;
ShuffleVT = MVT::getVectorVT(MVT::f64, Mask.size());
PermuteImm = 0;
@@ -26476,8 +27065,8 @@ static bool matchUnaryPermuteVectorShuffle(MVT MaskVT, ArrayRef<int> Mask,
if (MaskScalarSizeInBits == 64)
scaleShuffleMask(2, RepeatedMask, WordMask);
- Shuffle = (FloatDomain ? X86ISD::VPERMILPI : X86ISD::PSHUFD);
- ShuffleVT = (FloatDomain ? MVT::f32 : MVT::i32);
+ Shuffle = (AllowFloatDomain ? X86ISD::VPERMILPI : X86ISD::PSHUFD);
+ ShuffleVT = (AllowFloatDomain ? MVT::f32 : MVT::i32);
ShuffleVT = MVT::getVectorVT(ShuffleVT, InputSizeInBits / 32);
PermuteImm = getV4X86ShuffleImm(WordMask);
return true;
@@ -26487,34 +27076,36 @@ static bool matchUnaryPermuteVectorShuffle(MVT MaskVT, ArrayRef<int> Mask,
// shuffle instructions.
// TODO: Investigate sharing more of this with shuffle lowering.
static bool matchBinaryVectorShuffle(MVT MaskVT, ArrayRef<int> Mask,
- bool FloatDomain, SDValue &V1, SDValue &V2,
+ bool AllowFloatDomain, bool AllowIntDomain,
+ SDValue &V1, SDValue &V2, SDLoc &DL,
+ SelectionDAG &DAG,
const X86Subtarget &Subtarget,
unsigned &Shuffle, MVT &ShuffleVT,
bool IsUnary) {
unsigned EltSizeInBits = MaskVT.getScalarSizeInBits();
if (MaskVT.is128BitVector()) {
- if (isTargetShuffleEquivalent(Mask, {0, 0}) && FloatDomain) {
+ if (isTargetShuffleEquivalent(Mask, {0, 0}) && AllowFloatDomain) {
V2 = V1;
Shuffle = X86ISD::MOVLHPS;
ShuffleVT = MVT::v4f32;
return true;
}
- if (isTargetShuffleEquivalent(Mask, {1, 1}) && FloatDomain) {
+ if (isTargetShuffleEquivalent(Mask, {1, 1}) && AllowFloatDomain) {
V2 = V1;
Shuffle = X86ISD::MOVHLPS;
ShuffleVT = MVT::v4f32;
return true;
}
if (isTargetShuffleEquivalent(Mask, {0, 3}) && Subtarget.hasSSE2() &&
- (FloatDomain || !Subtarget.hasSSE41())) {
+ (AllowFloatDomain || !Subtarget.hasSSE41())) {
std::swap(V1, V2);
Shuffle = X86ISD::MOVSD;
ShuffleVT = MaskVT;
return true;
}
if (isTargetShuffleEquivalent(Mask, {4, 1, 2, 3}) &&
- (FloatDomain || !Subtarget.hasSSE41())) {
+ (AllowFloatDomain || !Subtarget.hasSSE41())) {
Shuffle = X86ISD::MOVSS;
ShuffleVT = MaskVT;
return true;
@@ -26527,57 +27118,12 @@ static bool matchBinaryVectorShuffle(MVT MaskVT, ArrayRef<int> Mask,
(MaskVT.is256BitVector() && 32 <= EltSizeInBits && Subtarget.hasAVX()) ||
(MaskVT.is256BitVector() && Subtarget.hasAVX2()) ||
(MaskVT.is512BitVector() && Subtarget.hasAVX512())) {
- MVT LegalVT = MaskVT;
- if (LegalVT.is256BitVector() && !Subtarget.hasAVX2())
- LegalVT = (32 == EltSizeInBits ? MVT::v8f32 : MVT::v4f64);
-
- SmallVector<int, 64> Unpckl, Unpckh;
- if (IsUnary) {
- createUnpackShuffleMask(MaskVT, Unpckl, true, true);
- if (isTargetShuffleEquivalent(Mask, Unpckl)) {
- V2 = V1;
- Shuffle = X86ISD::UNPCKL;
- ShuffleVT = LegalVT;
- return true;
- }
-
- createUnpackShuffleMask(MaskVT, Unpckh, false, true);
- if (isTargetShuffleEquivalent(Mask, Unpckh)) {
- V2 = V1;
- Shuffle = X86ISD::UNPCKH;
- ShuffleVT = LegalVT;
- return true;
- }
- } else {
- createUnpackShuffleMask(MaskVT, Unpckl, true, false);
- if (isTargetShuffleEquivalent(Mask, Unpckl)) {
- Shuffle = X86ISD::UNPCKL;
- ShuffleVT = LegalVT;
- return true;
- }
-
- createUnpackShuffleMask(MaskVT, Unpckh, false, false);
- if (isTargetShuffleEquivalent(Mask, Unpckh)) {
- Shuffle = X86ISD::UNPCKH;
- ShuffleVT = LegalVT;
- return true;
- }
-
- ShuffleVectorSDNode::commuteMask(Unpckl);
- if (isTargetShuffleEquivalent(Mask, Unpckl)) {
- std::swap(V1, V2);
- Shuffle = X86ISD::UNPCKL;
- ShuffleVT = LegalVT;
- return true;
- }
-
- ShuffleVectorSDNode::commuteMask(Unpckh);
- if (isTargetShuffleEquivalent(Mask, Unpckh)) {
- std::swap(V1, V2);
- Shuffle = X86ISD::UNPCKH;
- ShuffleVT = LegalVT;
- return true;
- }
+ if (matchVectorShuffleWithUNPCK(MaskVT, V1, V2, Shuffle, IsUnary, Mask, DL,
+ DAG, Subtarget)) {
+ ShuffleVT = MaskVT;
+ if (ShuffleVT.is256BitVector() && !Subtarget.hasAVX2())
+ ShuffleVT = (32 == EltSizeInBits ? MVT::v8f32 : MVT::v4f64);
+ return true;
}
}
@@ -26585,17 +27131,19 @@ static bool matchBinaryVectorShuffle(MVT MaskVT, ArrayRef<int> Mask,
}
static bool matchBinaryPermuteVectorShuffle(MVT MaskVT, ArrayRef<int> Mask,
- bool FloatDomain,
- SDValue &V1, SDValue &V2,
- SDLoc &DL, SelectionDAG &DAG,
+ bool AllowFloatDomain,
+ bool AllowIntDomain,
+ SDValue &V1, SDValue &V2, SDLoc &DL,
+ SelectionDAG &DAG,
const X86Subtarget &Subtarget,
unsigned &Shuffle, MVT &ShuffleVT,
unsigned &PermuteImm) {
unsigned NumMaskElts = Mask.size();
+ unsigned EltSizeInBits = MaskVT.getScalarSizeInBits();
// Attempt to match against PALIGNR byte rotate.
- if (!FloatDomain && ((MaskVT.is128BitVector() && Subtarget.hasSSSE3()) ||
- (MaskVT.is256BitVector() && Subtarget.hasAVX2()))) {
+ if (AllowIntDomain && ((MaskVT.is128BitVector() && Subtarget.hasSSSE3()) ||
+ (MaskVT.is256BitVector() && Subtarget.hasAVX2()))) {
int ByteRotation = matchVectorShuffleAsByteRotate(MaskVT, V1, V2, Mask);
if (0 < ByteRotation) {
Shuffle = X86ISD::PALIGNR;
@@ -26606,77 +27154,74 @@ static bool matchBinaryPermuteVectorShuffle(MVT MaskVT, ArrayRef<int> Mask,
}
// Attempt to combine to X86ISD::BLENDI.
- if (NumMaskElts <= 8 && ((Subtarget.hasSSE41() && MaskVT.is128BitVector()) ||
- (Subtarget.hasAVX() && MaskVT.is256BitVector()))) {
- // Determine a type compatible with X86ISD::BLENDI.
- // TODO - add 16i16 support (requires lane duplication).
- MVT BlendVT = MaskVT;
- if (Subtarget.hasAVX2()) {
- if (BlendVT == MVT::v4i64)
- BlendVT = MVT::v8i32;
- else if (BlendVT == MVT::v2i64)
- BlendVT = MVT::v4i32;
- } else {
- if (BlendVT == MVT::v2i64 || BlendVT == MVT::v4i32)
- BlendVT = MVT::v8i16;
- else if (BlendVT == MVT::v4i64)
- BlendVT = MVT::v4f64;
- else if (BlendVT == MVT::v8i32)
- BlendVT = MVT::v8f32;
- }
-
- unsigned BlendSize = BlendVT.getVectorNumElements();
- unsigned MaskRatio = BlendSize / NumMaskElts;
-
- // Can we blend with zero?
- if (isSequentialOrUndefOrZeroInRange(Mask, /*Pos*/ 0, /*Size*/ NumMaskElts,
- /*Low*/ 0) &&
- NumMaskElts <= BlendVT.getVectorNumElements()) {
- PermuteImm = 0;
- for (unsigned i = 0; i != BlendSize; ++i)
- if (Mask[i / MaskRatio] < 0)
- PermuteImm |= 1u << i;
-
- V2 = getZeroVector(BlendVT, Subtarget, DAG, DL);
- Shuffle = X86ISD::BLENDI;
- ShuffleVT = BlendVT;
- return true;
- }
-
- // Attempt to match as a binary blend.
- if (NumMaskElts <= BlendVT.getVectorNumElements()) {
- bool MatchBlend = true;
- for (int i = 0; i != (int)NumMaskElts; ++i) {
- int M = Mask[i];
- if (M == SM_SentinelUndef)
- continue;
- else if (M == SM_SentinelZero)
- MatchBlend = false;
- else if ((M != i) && (M != (i + (int)NumMaskElts)))
- MatchBlend = false;
- }
+ if ((NumMaskElts <= 8 && ((Subtarget.hasSSE41() && MaskVT.is128BitVector()) ||
+ (Subtarget.hasAVX() && MaskVT.is256BitVector()))) ||
+ (MaskVT == MVT::v16i16 && Subtarget.hasAVX2())) {
+ uint64_t BlendMask = 0;
+ bool ForceV1Zero = false, ForceV2Zero = false;
+ SmallVector<int, 8> TargetMask(Mask.begin(), Mask.end());
+ if (matchVectorShuffleAsBlend(V1, V2, TargetMask, ForceV1Zero, ForceV2Zero,
+ BlendMask)) {
+ if (MaskVT == MVT::v16i16) {
+ // We can only use v16i16 PBLENDW if the mask repeats in each 128-bit lane.
+ SmallVector<int, 8> RepeatedMask;
+ if (isRepeatedTargetShuffleMask(128, MaskVT, TargetMask,
+ RepeatedMask)) {
+ assert(RepeatedMask.size() == 8 &&
+ "Repeated mask size doesn't match!");
+ PermuteImm = 0;
+ for (int i = 0; i < 8; ++i)
+ if (RepeatedMask[i] >= 8)
+ PermuteImm |= 1 << i;
+ V1 = ForceV1Zero ? getZeroVector(MaskVT, Subtarget, DAG, DL) : V1;
+ V2 = ForceV2Zero ? getZeroVector(MaskVT, Subtarget, DAG, DL) : V2;
+ Shuffle = X86ISD::BLENDI;
+ ShuffleVT = MaskVT;
+ return true;
+ }
+ } else {
+ // Determine a type compatible with X86ISD::BLENDI.
+ ShuffleVT = MaskVT;
+ if (Subtarget.hasAVX2()) {
+ if (ShuffleVT == MVT::v4i64)
+ ShuffleVT = MVT::v8i32;
+ else if (ShuffleVT == MVT::v2i64)
+ ShuffleVT = MVT::v4i32;
+ } else {
+ if (ShuffleVT == MVT::v2i64 || ShuffleVT == MVT::v4i32)
+ ShuffleVT = MVT::v8i16;
+ else if (ShuffleVT == MVT::v4i64)
+ ShuffleVT = MVT::v4f64;
+ else if (ShuffleVT == MVT::v8i32)
+ ShuffleVT = MVT::v8f32;
+ }
- if (MatchBlend) {
- PermuteImm = 0;
- for (unsigned i = 0; i != BlendSize; ++i)
- if ((int)NumMaskElts <= Mask[i / MaskRatio])
- PermuteImm |= 1u << i;
+ if (!ShuffleVT.isFloatingPoint()) {
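+ // The blend mask is per ShuffleVT element, so each original mask bit is
+ // repeated Scale times, e.g. a v2i64 blend mask 0b10 becomes 0b1100 as
+ // v4i32.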
+ int Scale = EltSizeInBits / ShuffleVT.getScalarSizeInBits();
+ BlendMask =
+ scaleVectorShuffleBlendMask(BlendMask, NumMaskElts, Scale);
+ ShuffleVT = MVT::getIntegerVT(EltSizeInBits / Scale);
+ ShuffleVT = MVT::getVectorVT(ShuffleVT, NumMaskElts * Scale);
+ }
+ V1 = ForceV1Zero ? getZeroVector(MaskVT, Subtarget, DAG, DL) : V1;
+ V2 = ForceV2Zero ? getZeroVector(MaskVT, Subtarget, DAG, DL) : V2;
+ PermuteImm = (unsigned)BlendMask;
Shuffle = X86ISD::BLENDI;
- ShuffleVT = BlendVT;
return true;
}
}
}
// Attempt to combine to INSERTPS.
- if (Subtarget.hasSSE41() && MaskVT == MVT::v4f32) {
- SmallBitVector Zeroable(4, false);
+ if (AllowFloatDomain && EltSizeInBits == 32 && Subtarget.hasSSE41() &&
+ MaskVT.is128BitVector()) {
+ APInt Zeroable(4, 0);
for (unsigned i = 0; i != NumMaskElts; ++i)
if (Mask[i] < 0)
- Zeroable[i] = true;
+ Zeroable.setBit(i);
- if (Zeroable.any() &&
+ if (Zeroable.getBoolValue() &&
matchVectorShuffleAsInsertPS(V1, V2, PermuteImm, Zeroable, Mask, DAG)) {
Shuffle = X86ISD::INSERTPS;
ShuffleVT = MVT::v4f32;
@@ -26685,22 +27230,26 @@ static bool matchBinaryPermuteVectorShuffle(MVT MaskVT, ArrayRef<int> Mask,
}
// Attempt to combine to SHUFPD.
- if ((MaskVT == MVT::v2f64 && Subtarget.hasSSE2()) ||
- (MaskVT == MVT::v4f64 && Subtarget.hasAVX()) ||
- (MaskVT == MVT::v8f64 && Subtarget.hasAVX512())) {
+ if (AllowFloatDomain && EltSizeInBits == 64 &&
+ ((MaskVT.is128BitVector() && Subtarget.hasSSE2()) ||
+ (MaskVT.is256BitVector() && Subtarget.hasAVX()) ||
+ (MaskVT.is512BitVector() && Subtarget.hasAVX512()))) {
if (matchVectorShuffleWithSHUFPD(MaskVT, V1, V2, PermuteImm, Mask)) {
Shuffle = X86ISD::SHUFP;
- ShuffleVT = MaskVT;
+ ShuffleVT = MVT::getVectorVT(MVT::f64, MaskVT.getSizeInBits() / 64);
return true;
}
}
// Attempt to combine to SHUFPS.
- if ((MaskVT == MVT::v4f32 && Subtarget.hasSSE1()) ||
- (MaskVT == MVT::v8f32 && Subtarget.hasAVX()) ||
- (MaskVT == MVT::v16f32 && Subtarget.hasAVX512())) {
+ if (AllowFloatDomain && EltSizeInBits == 32 &&
+ ((MaskVT.is128BitVector() && Subtarget.hasSSE1()) ||
+ (MaskVT.is256BitVector() && Subtarget.hasAVX()) ||
+ (MaskVT.is512BitVector() && Subtarget.hasAVX512()))) {
SmallVector<int, 4> RepeatedMask;
if (isRepeatedTargetShuffleMask(128, MaskVT, Mask, RepeatedMask)) {
+ // Match each half of the repeated mask to determine if it just references
+ // one of the vectors, is zeroable, or is entirely undef.
auto MatchHalf = [&](unsigned Offset, int &S0, int &S1) {
int M0 = RepeatedMask[Offset];
int M1 = RepeatedMask[Offset + 1];
@@ -26732,7 +27281,7 @@ static bool matchBinaryPermuteVectorShuffle(MVT MaskVT, ArrayRef<int> Mask,
V1 = Lo;
V2 = Hi;
Shuffle = X86ISD::SHUFP;
- ShuffleVT = MaskVT;
+ ShuffleVT = MVT::getVectorVT(MVT::f32, MaskVT.getSizeInBits() / 32);
PermuteImm = getV4X86ShuffleImm(ShufMask);
return true;
}
@@ -26764,7 +27313,8 @@ static bool combineX86ShuffleChain(ArrayRef<SDValue> Inputs, SDValue Root,
// here, we're not going to remove the operands we find.
bool UnaryShuffle = (Inputs.size() == 1);
SDValue V1 = peekThroughBitcasts(Inputs[0]);
- SDValue V2 = (UnaryShuffle ? V1 : peekThroughBitcasts(Inputs[1]));
+ SDValue V2 = (UnaryShuffle ? DAG.getUNDEF(V1.getValueType())
+ : peekThroughBitcasts(Inputs[1]));
MVT VT1 = V1.getSimpleValueType();
MVT VT2 = V2.getSimpleValueType();
@@ -26853,6 +27403,11 @@ static bool combineX86ShuffleChain(ArrayRef<SDValue> Inputs, SDValue Root,
MVT ShuffleSrcVT, ShuffleVT;
unsigned Shuffle, PermuteImm;
+ // Which shuffle domains are permitted?
+ // Permit domain crossing at higher combine depths.
+ bool AllowFloatDomain = FloatDomain || (Depth > 3);
+ bool AllowIntDomain = !FloatDomain || (Depth > 3);
+
if (UnaryShuffle) {
// If we are shuffling a X86ISD::VZEXT_LOAD then we can use the load
// directly if we don't shuffle the lower element and we shuffle the upper
@@ -26869,8 +27424,9 @@ static bool combineX86ShuffleChain(ArrayRef<SDValue> Inputs, SDValue Root,
}
}
- if (matchUnaryVectorShuffle(MaskVT, Mask, FloatDomain, Subtarget, Shuffle,
- ShuffleSrcVT, ShuffleVT)) {
+ if (matchUnaryVectorShuffle(MaskVT, Mask, AllowFloatDomain, AllowIntDomain,
+ V1, DL, DAG, Subtarget, Shuffle, ShuffleSrcVT,
+ ShuffleVT)) {
if (Depth == 1 && Root.getOpcode() == Shuffle)
return false; // Nothing to do!
if (IsEVEXShuffle && (NumRootElts != ShuffleVT.getVectorNumElements()))
@@ -26884,8 +27440,9 @@ static bool combineX86ShuffleChain(ArrayRef<SDValue> Inputs, SDValue Root,
return true;
}
- if (matchUnaryPermuteVectorShuffle(MaskVT, Mask, FloatDomain, Subtarget,
- Shuffle, ShuffleVT, PermuteImm)) {
+ if (matchUnaryPermuteVectorShuffle(MaskVT, Mask, AllowFloatDomain,
+ AllowIntDomain, Subtarget, Shuffle,
+ ShuffleVT, PermuteImm)) {
if (Depth == 1 && Root.getOpcode() == Shuffle)
return false; // Nothing to do!
if (IsEVEXShuffle && (NumRootElts != ShuffleVT.getVectorNumElements()))
@@ -26901,8 +27458,9 @@ static bool combineX86ShuffleChain(ArrayRef<SDValue> Inputs, SDValue Root,
}
}
- if (matchBinaryVectorShuffle(MaskVT, Mask, FloatDomain, V1, V2, Subtarget,
- Shuffle, ShuffleVT, UnaryShuffle)) {
+ if (matchBinaryVectorShuffle(MaskVT, Mask, AllowFloatDomain, AllowIntDomain,
+ V1, V2, DL, DAG, Subtarget, Shuffle, ShuffleVT,
+ UnaryShuffle)) {
if (Depth == 1 && Root.getOpcode() == Shuffle)
return false; // Nothing to do!
if (IsEVEXShuffle && (NumRootElts != ShuffleVT.getVectorNumElements()))
@@ -26918,8 +27476,9 @@ static bool combineX86ShuffleChain(ArrayRef<SDValue> Inputs, SDValue Root,
return true;
}
- if (matchBinaryPermuteVectorShuffle(MaskVT, Mask, FloatDomain, V1, V2, DL,
- DAG, Subtarget, Shuffle, ShuffleVT,
+ if (matchBinaryPermuteVectorShuffle(MaskVT, Mask, AllowFloatDomain,
+ AllowIntDomain, V1, V2, DL, DAG,
+ Subtarget, Shuffle, ShuffleVT,
PermuteImm)) {
if (Depth == 1 && Root.getOpcode() == Shuffle)
return false; // Nothing to do!
@@ -27039,12 +27598,12 @@ static bool combineX86ShuffleChain(ArrayRef<SDValue> Inputs, SDValue Root,
DAG.getTargetLoweringInfo().isTypeLegal(MaskVT)) {
APInt Zero = APInt::getNullValue(MaskEltSizeInBits);
APInt AllOnes = APInt::getAllOnesValue(MaskEltSizeInBits);
- SmallBitVector UndefElts(NumMaskElts, false);
+ APInt UndefElts(NumMaskElts, 0);
SmallVector<APInt, 64> EltBits(NumMaskElts, Zero);
for (unsigned i = 0; i != NumMaskElts; ++i) {
int M = Mask[i];
if (M == SM_SentinelUndef) {
- UndefElts[i] = true;
+ UndefElts.setBit(i);
continue;
}
if (M == SM_SentinelZero)
@@ -27228,8 +27787,8 @@ static bool combineX86ShufflesConstants(const SmallVectorImpl<SDValue> &Ops,
// Extract constant bits from each source op.
bool OneUseConstantOp = false;
- SmallVector<SmallBitVector, 4> UndefEltsOps(NumOps);
- SmallVector<SmallVector<APInt, 8>, 4> RawBitsOps(NumOps);
+ SmallVector<APInt, 16> UndefEltsOps(NumOps);
+ SmallVector<SmallVector<APInt, 16>, 16> RawBitsOps(NumOps);
for (unsigned i = 0; i != NumOps; ++i) {
SDValue SrcOp = Ops[i];
OneUseConstantOp |= SrcOp.hasOneUse();
@@ -27245,18 +27804,18 @@ static bool combineX86ShufflesConstants(const SmallVectorImpl<SDValue> &Ops,
return false;
// Shuffle the constant bits according to the mask.
- SmallBitVector UndefElts(NumMaskElts, false);
- SmallBitVector ZeroElts(NumMaskElts, false);
- SmallBitVector ConstantElts(NumMaskElts, false);
+ APInt UndefElts(NumMaskElts, 0);
+ APInt ZeroElts(NumMaskElts, 0);
+ APInt ConstantElts(NumMaskElts, 0);
SmallVector<APInt, 8> ConstantBitData(NumMaskElts,
APInt::getNullValue(MaskSizeInBits));
for (unsigned i = 0; i != NumMaskElts; ++i) {
int M = Mask[i];
if (M == SM_SentinelUndef) {
- UndefElts[i] = true;
+ UndefElts.setBit(i);
continue;
} else if (M == SM_SentinelZero) {
- ZeroElts[i] = true;
+ ZeroElts.setBit(i);
continue;
}
assert(0 <= M && M < (int)(NumMaskElts * NumOps));
@@ -27266,21 +27825,21 @@ static bool combineX86ShufflesConstants(const SmallVectorImpl<SDValue> &Ops,
auto &SrcUndefElts = UndefEltsOps[SrcOpIdx];
if (SrcUndefElts[SrcMaskIdx]) {
- UndefElts[i] = true;
+ UndefElts.setBit(i);
continue;
}
auto &SrcEltBits = RawBitsOps[SrcOpIdx];
APInt &Bits = SrcEltBits[SrcMaskIdx];
if (!Bits) {
- ZeroElts[i] = true;
+ ZeroElts.setBit(i);
continue;
}
- ConstantElts[i] = true;
+ ConstantElts.setBit(i);
ConstantBitData[i] = Bits;
}
- assert((UndefElts | ZeroElts | ConstantElts).count() == NumMaskElts);
+ assert((UndefElts | ZeroElts | ConstantElts).isAllOnesValue());
// Create the constant data.
MVT MaskSVT;
@@ -27330,6 +27889,7 @@ static bool combineX86ShufflesConstants(const SmallVectorImpl<SDValue> &Ops,
static bool combineX86ShufflesRecursively(ArrayRef<SDValue> SrcOps,
int SrcOpIndex, SDValue Root,
ArrayRef<int> RootMask,
+ ArrayRef<const SDNode*> SrcNodes,
int Depth, bool HasVariableMask,
SelectionDAG &DAG,
TargetLowering::DAGCombinerInfo &DCI,
@@ -27353,13 +27913,17 @@ static bool combineX86ShufflesRecursively(ArrayRef<SDValue> SrcOps,
"Can only combine shuffles of the same vector register size.");
// Extract target shuffle mask and resolve sentinels and inputs.
- SDValue Input0, Input1;
- SmallVector<int, 16> OpMask;
- if (!resolveTargetShuffleInputs(Op, Input0, Input1, OpMask))
+ SmallVector<int, 64> OpMask;
+ SmallVector<SDValue, 2> OpInputs;
+ if (!resolveTargetShuffleInputs(Op, OpInputs, OpMask))
return false;
+ assert(OpInputs.size() <= 2 && "Too many shuffle inputs");
+ SDValue Input0 = (OpInputs.size() > 0 ? OpInputs[0] : SDValue());
+ SDValue Input1 = (OpInputs.size() > 1 ? OpInputs[1] : SDValue());
+
// Add the inputs to the Ops list, avoiding duplicates.
- SmallVector<SDValue, 8> Ops(SrcOps.begin(), SrcOps.end());
+ SmallVector<SDValue, 16> Ops(SrcOps.begin(), SrcOps.end());
int InputIdx0 = -1, InputIdx1 = -1;
for (int i = 0, e = Ops.size(); i < e; ++i) {
@@ -27392,8 +27956,7 @@ static bool combineX86ShufflesRecursively(ArrayRef<SDValue> SrcOps,
(RootRatio == 1) != (OpRatio == 1)) &&
"Must not have a ratio for both incoming and op masks!");
- SmallVector<int, 16> Mask;
- Mask.reserve(MaskWidth);
+ SmallVector<int, 64> Mask((unsigned)MaskWidth, SM_SentinelUndef);
// Merge this shuffle operation's mask into our accumulated mask. Note that
// this shuffle's mask will be the first applied to the input, followed by the
@@ -27403,7 +27966,7 @@ static bool combineX86ShufflesRecursively(ArrayRef<SDValue> SrcOps,
int RootIdx = i / RootRatio;
if (RootMask[RootIdx] < 0) {
// This is a zero or undef lane, we're done.
- Mask.push_back(RootMask[RootIdx]);
+ Mask[i] = RootMask[RootIdx];
continue;
}
@@ -27413,7 +27976,7 @@ static bool combineX86ShufflesRecursively(ArrayRef<SDValue> SrcOps,
// than the SrcOp we're currently inserting.
if ((RootMaskedIdx < (SrcOpIndex * MaskWidth)) ||
(((SrcOpIndex + 1) * MaskWidth) <= RootMaskedIdx)) {
- Mask.push_back(RootMaskedIdx);
+ Mask[i] = RootMaskedIdx;
continue;
}
@@ -27423,7 +27986,7 @@ static bool combineX86ShufflesRecursively(ArrayRef<SDValue> SrcOps,
if (OpMask[OpIdx] < 0) {
// The incoming lanes are zero or undef, it doesn't matter which ones we
// are using.
- Mask.push_back(OpMask[OpIdx]);
+ Mask[i] = OpMask[OpIdx];
continue;
}
@@ -27439,7 +28002,7 @@ static bool combineX86ShufflesRecursively(ArrayRef<SDValue> SrcOps,
OpMaskedIdx += InputIdx1 * MaskWidth;
}
- Mask.push_back(OpMaskedIdx);
+ Mask[i] = OpMaskedIdx;
}
// Handle the all undef/zero cases early.
@@ -27457,28 +28020,25 @@ static bool combineX86ShufflesRecursively(ArrayRef<SDValue> SrcOps,
}
// Remove unused shuffle source ops.
- SmallVector<SDValue, 8> UsedOps;
- for (int i = 0, e = Ops.size(); i < e; ++i) {
- int lo = UsedOps.size() * MaskWidth;
- int hi = lo + MaskWidth;
- if (any_of(Mask, [lo, hi](int i) { return (lo <= i) && (i < hi); })) {
- UsedOps.push_back(Ops[i]);
- continue;
- }
- for (int &M : Mask)
- if (lo <= M)
- M -= MaskWidth;
- }
- assert(!UsedOps.empty() && "Shuffle with no inputs detected");
- Ops = UsedOps;
+ resolveTargetShuffleInputsAndMask(Ops, Mask);
+ assert(!Ops.empty() && "Shuffle with no inputs detected");
HasVariableMask |= isTargetShuffleVariableMask(Op.getOpcode());
- // See if we can recurse into each shuffle source op (if it's a target shuffle).
+ // Update the list of shuffle nodes that have been combined so far.
+ SmallVector<const SDNode *, 16> CombinedNodes(SrcNodes.begin(),
+ SrcNodes.end());
+ CombinedNodes.push_back(Op.getNode());
+
+ // See if we can recurse into each shuffle source op (if it's a target
+ // shuffle). The source op should only be combined if it either has a
+ // single use (i.e. current Op) or all its users have already been combined.
for (int i = 0, e = Ops.size(); i < e; ++i)
- if (Ops[i].getNode()->hasOneUse() || Op->isOnlyUserOf(Ops[i].getNode()))
- if (combineX86ShufflesRecursively(Ops, i, Root, Mask, Depth + 1,
- HasVariableMask, DAG, DCI, Subtarget))
+ if (Ops[i].getNode()->hasOneUse() ||
+ SDNode::areOnlyUsersOf(CombinedNodes, Ops[i].getNode()))
+ if (combineX86ShufflesRecursively(Ops, i, Root, Mask, CombinedNodes,
+ Depth + 1, HasVariableMask, DAG, DCI,
+ Subtarget))
return true;
// Attempt to constant fold all of the constant source ops.
@@ -27495,7 +28055,7 @@ static bool combineX86ShufflesRecursively(ArrayRef<SDValue> SrcOps,
// elements, and shrink them to the half-width mask. It does this in a loop
// so it will reduce the size of the mask to the minimal width mask which
// performs an equivalent shuffle.
- SmallVector<int, 16> WidenedMask;
+ SmallVector<int, 64> WidenedMask;
while (Mask.size() > 1 && canWidenShuffleElements(Mask, WidenedMask)) {
Mask = std::move(WidenedMask);
}
@@ -27561,8 +28121,7 @@ static SmallVector<int, 4> getPSHUFShuffleMask(SDValue N) {
/// altering anything.
static SDValue
combineRedundantDWordShuffle(SDValue N, MutableArrayRef<int> Mask,
- SelectionDAG &DAG,
- TargetLowering::DAGCombinerInfo &DCI) {
+ SelectionDAG &DAG) {
assert(N.getOpcode() == X86ISD::PSHUFD &&
"Called with something other than an x86 128-bit half shuffle!");
SDLoc DL(N);
@@ -27842,19 +28401,20 @@ static SDValue combineTargetShuffle(SDValue N, SelectionDAG &DAG,
}
case X86ISD::MOVSD:
case X86ISD::MOVSS: {
- bool isFloat = VT.isFloatingPoint();
SDValue V0 = peekThroughBitcasts(N->getOperand(0));
SDValue V1 = peekThroughBitcasts(N->getOperand(1));
- bool isFloat0 = V0.getSimpleValueType().isFloatingPoint();
- bool isFloat1 = V1.getSimpleValueType().isFloatingPoint();
bool isZero0 = ISD::isBuildVectorAllZeros(V0.getNode());
bool isZero1 = ISD::isBuildVectorAllZeros(V1.getNode());
- assert(!(isZero0 && isZero1) && "Zeroable shuffle detected.");
+ if (isZero0 && isZero1)
+ return SDValue();
// We often lower to MOVSD/MOVSS from integer as well as native float
// types; remove unnecessary domain-crossing bitcasts if we can to make it
// easier to combine shuffles later on. We've already accounted for the
// domain switching cost when we decided to lower with it.
+ bool isFloat = VT.isFloatingPoint();
+ bool isFloat0 = V0.getSimpleValueType().isFloatingPoint();
+ bool isFloat1 = V1.getSimpleValueType().isFloatingPoint();
if ((isFloat != isFloat0 || isZero0) && (isFloat != isFloat1 || isZero1)) {
MVT NewVT = isFloat ? (X86ISD::MOVSD == Opcode ? MVT::v2i64 : MVT::v4i32)
: (X86ISD::MOVSD == Opcode ? MVT::v2f64 : MVT::v4f32);
@@ -28025,7 +28585,7 @@ static SDValue combineTargetShuffle(SDValue N, SelectionDAG &DAG,
break;
case X86ISD::PSHUFD:
- if (SDValue NewN = combineRedundantDWordShuffle(N, Mask, DAG, DCI))
+ if (SDValue NewN = combineRedundantDWordShuffle(N, Mask, DAG))
return NewN;
break;
@@ -28173,12 +28733,7 @@ static SDValue combineShuffle(SDNode *N, SelectionDAG &DAG,
const X86Subtarget &Subtarget) {
SDLoc dl(N);
EVT VT = N->getValueType(0);
-
- // Don't create instructions with illegal types after legalize types has run.
const TargetLowering &TLI = DAG.getTargetLoweringInfo();
- if (!DCI.isBeforeLegalize() && !TLI.isTypeLegal(VT.getVectorElementType()))
- return SDValue();
-
// If we have legalized the vector types, look for blends of FADD and FSUB
// nodes that we can fuse into an ADDSUB node.
if (TLI.isTypeLegal(VT))
@@ -28249,11 +28804,18 @@ static SDValue combineShuffle(SDNode *N, SelectionDAG &DAG,
// load4, <0, 1, 2, 3> into a 128-bit load if the load addresses are
// consecutive, non-overlapping, and in the right order.
SmallVector<SDValue, 16> Elts;
- for (unsigned i = 0, e = VT.getVectorNumElements(); i != e; ++i)
- Elts.push_back(getShuffleScalarElt(N, i, DAG, 0));
+ for (unsigned i = 0, e = VT.getVectorNumElements(); i != e; ++i) {
+ if (SDValue Elt = getShuffleScalarElt(N, i, DAG, 0)) {
+ Elts.push_back(Elt);
+ continue;
+ }
+ Elts.clear();
+ break;
+ }
- if (SDValue LD = EltsFromConsecutiveLoads(VT, Elts, dl, DAG, true))
- return LD;
+ if (Elts.size() == VT.getVectorNumElements())
+ if (SDValue LD = EltsFromConsecutiveLoads(VT, Elts, dl, DAG, true))
+ return LD;
// For AVX2, we sometimes want to combine
// (vector_shuffle <mask> (concat_vectors t1, undef)
@@ -28276,7 +28838,7 @@ static SDValue combineShuffle(SDNode *N, SelectionDAG &DAG,
// a particular chain.
SmallVector<int, 1> NonceMask; // Just a placeholder.
NonceMask.push_back(0);
- if (combineX86ShufflesRecursively({Op}, 0, Op, NonceMask,
+ if (combineX86ShufflesRecursively({Op}, 0, Op, NonceMask, {},
/*Depth*/ 1, /*HasVarMask*/ false, DAG,
DCI, Subtarget))
return SDValue(); // This routine will use CombineTo to replace N.
@@ -28303,18 +28865,13 @@ static SDValue XFormVExtractWithShuffleIntoLoad(SDNode *N, SelectionDAG &DAG,
EVT OriginalVT = InVec.getValueType();
- if (InVec.getOpcode() == ISD::BITCAST) {
- // Don't duplicate a load with other uses.
- if (!InVec.hasOneUse())
- return SDValue();
- EVT BCVT = InVec.getOperand(0).getValueType();
- if (!BCVT.isVector() ||
- BCVT.getVectorNumElements() != OriginalVT.getVectorNumElements())
- return SDValue();
- InVec = InVec.getOperand(0);
- }
+ // Peek through bitcasts, but don't duplicate a load with other uses.
+ InVec = peekThroughOneUseBitcasts(InVec);
EVT CurrentVT = InVec.getValueType();
+ if (!CurrentVT.isVector() ||
+ CurrentVT.getVectorNumElements() != OriginalVT.getVectorNumElements())
+ return SDValue();
if (!isTargetShuffle(InVec.getOpcode()))
return SDValue();
@@ -28393,19 +28950,41 @@ static SDValue combineBitcast(SDNode *N, SelectionDAG &DAG,
const X86Subtarget &Subtarget) {
SDValue N0 = N->getOperand(0);
EVT VT = N->getValueType(0);
+ EVT SrcVT = N0.getValueType();
+
+ // Since MMX types are special and don't usually play with other vector types,
+ // it's better to handle them early to be sure we emit efficient code by
+ // avoiding store-load conversions.
- // Detect bitcasts between i32 to x86mmx low word. Since MMX types are
- // special and don't usually play with other vector types, it's better to
- // handle them early to be sure we emit efficient code by avoiding
- // store-load conversions.
+ // Detect bitcasts from i32 to the x86mmx low word.
if (VT == MVT::x86mmx && N0.getOpcode() == ISD::BUILD_VECTOR &&
- N0.getValueType() == MVT::v2i32 &&
- isNullConstant(N0.getOperand(1))) {
+ SrcVT == MVT::v2i32 && isNullConstant(N0.getOperand(1))) {
SDValue N00 = N0->getOperand(0);
if (N00.getValueType() == MVT::i32)
return DAG.getNode(X86ISD::MMX_MOVW2D, SDLoc(N00), VT, N00);
}
+ // Detect bitcasts from an element or subvector extraction to x86mmx.
+ if (VT == MVT::x86mmx &&
+ (N0.getOpcode() == ISD::EXTRACT_VECTOR_ELT ||
+ N0.getOpcode() == ISD::EXTRACT_SUBVECTOR) &&
+ isNullConstant(N0.getOperand(1))) {
+ SDValue N00 = N0->getOperand(0);
+ if (N00.getValueType().is128BitVector())
+ return DAG.getNode(X86ISD::MOVDQ2Q, SDLoc(N00), VT,
+ DAG.getBitcast(MVT::v2i64, N00));
+ }
+
+ // Detect bitcasts from FP_TO_SINT to x86mmx.
+ if (VT == MVT::x86mmx && SrcVT == MVT::v2i32 &&
+ N0.getOpcode() == ISD::FP_TO_SINT) {
+ SDLoc DL(N0);
+ SDValue Res = DAG.getNode(ISD::CONCAT_VECTORS, DL, MVT::v4i32, N0,
+ DAG.getUNDEF(MVT::v2i32));
+ return DAG.getNode(X86ISD::MOVDQ2Q, DL, VT,
+ DAG.getBitcast(MVT::v2i64, Res));
+ }
+
// Convert a bitcasted integer logic operation that has one bitcasted
// floating-point operand into a floating-point logic operation. This may
// create a load of a constant, but that is cheaper than materializing the
@@ -28511,12 +29090,18 @@ static bool detectZextAbsDiff(const SDValue &Select, SDValue &Op0,
if (SetCC.getOpcode() != ISD::SETCC)
return false;
ISD::CondCode CC = cast<CondCodeSDNode>(SetCC.getOperand(2))->get();
- if (CC != ISD::SETGT)
+ if (CC != ISD::SETGT && CC != ISD::SETLT)
return false;
SDValue SelectOp1 = Select->getOperand(1);
SDValue SelectOp2 = Select->getOperand(2);
+ // The following instructions assume SelectOp1 is the subtraction operand
+ // and SelectOp2 is the negation operand.
+ // In the case of SETLT this is the other way around.
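+ // e.g. (vselect (setlt x, 0), (sub 0, x), x) has the negation first, so
+ // swapping restores the (vselect (setgt x, -1), x, (sub 0, x)) form.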
+ if (CC == ISD::SETLT)
+ std::swap(SelectOp1, SelectOp2);
+
// The second operand of the select should be the negation of the first
// operand, which is implemented as 0 - SelectOp1.
if (!(SelectOp2.getOpcode() == ISD::SUB &&
@@ -28529,8 +29114,17 @@ static bool detectZextAbsDiff(const SDValue &Select, SDValue &Op0,
if (SetCC.getOperand(0) != SelectOp1)
return false;
- // The second operand of the comparison can be either -1 or 0.
- if (!(ISD::isBuildVectorAllZeros(SetCC.getOperand(1).getNode()) ||
+ // In the SETLT case, the second operand of the comparison can be either 1 or 0.
+ APInt SplatVal;
+ if ((CC == ISD::SETLT) &&
+ !((ISD::isConstantSplatVector(SetCC.getOperand(1).getNode(), SplatVal) &&
+ SplatVal == 1) ||
+ (ISD::isBuildVectorAllZeros(SetCC.getOperand(1).getNode()))))
+ return false;
+
+ // In the SETGT case, the second operand of the comparison can be either -1 or 0.
+ if ((CC == ISD::SETGT) &&
+ !(ISD::isBuildVectorAllZeros(SetCC.getOperand(1).getNode()) ||
ISD::isBuildVectorAllOnes(SetCC.getOperand(1).getNode())))
return false;
@@ -28576,17 +29170,92 @@ static SDValue createPSADBW(SelectionDAG &DAG, const SDValue &Zext0,
return DAG.getNode(X86ISD::PSADBW, DL, SadVT, SadOp0, SadOp1);
}
+// Attempt to replace an all_of/any_of style horizontal reduction with a MOVMSK.
+static SDValue combineHorizontalPredicateResult(SDNode *Extract,
+ SelectionDAG &DAG,
+ const X86Subtarget &Subtarget) {
+ // Bail without SSE2 or with AVX512VL (which uses predicate registers).
+ if (!Subtarget.hasSSE2() || Subtarget.hasVLX())
+ return SDValue();
+
+ EVT ExtractVT = Extract->getValueType(0);
+ unsigned BitWidth = ExtractVT.getSizeInBits();
+ if (ExtractVT != MVT::i64 && ExtractVT != MVT::i32 && ExtractVT != MVT::i16 &&
+ ExtractVT != MVT::i8)
+ return SDValue();
+
+ // Check for OR(any_of) and AND(all_of) horizontal reduction patterns.
+ for (ISD::NodeType Op : {ISD::OR, ISD::AND}) {
+ SDValue Match = matchBinOpReduction(Extract, Op);
+ if (!Match)
+ continue;
+
+ // EXTRACT_VECTOR_ELT can require implicit extension of the vector element
+ // which we can't support here for now.
+ if (Match.getScalarValueSizeInBits() != BitWidth)
+ continue;
+
+ // We require AVX2 for PMOVMSKB for v16i16/v32i8.
+ unsigned MatchSizeInBits = Match.getValueSizeInBits();
+ if (!(MatchSizeInBits == 128 ||
+ (MatchSizeInBits == 256 &&
+ ((Subtarget.hasAVX() && BitWidth >= 32) || Subtarget.hasAVX2()))))
+ return SDValue();
+
+ // Don't bother performing this for 2-element vectors.
+ if (Match.getValueType().getVectorNumElements() <= 2)
+ return SDValue();
+
+ // Check that we are extracting a reduction of all sign bits.
+ if (DAG.ComputeNumSignBits(Match) != BitWidth)
+ return SDValue();
+
+ // For 32/64 bit comparisons use MOVMSKPS/MOVMSKPD, else PMOVMSKB.
+ MVT MaskVT;
+ if (64 == BitWidth || 32 == BitWidth)
+ MaskVT = MVT::getVectorVT(MVT::getFloatingPointVT(BitWidth),
+ MatchSizeInBits / BitWidth);
+ else
+ MaskVT = MVT::getVectorVT(MVT::i8, MatchSizeInBits / 8);
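+ // e.g. a 128-bit match with 32-bit elements uses v4f32 (MOVMSKPS), while
+ // 8/16-bit elements use v16i8 (PMOVMSKB).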
+
+ APInt CompareBits;
+ ISD::CondCode CondCode;
+ if (Op == ISD::OR) {
+ // any_of -> MOVMSK != 0
+ CompareBits = APInt::getNullValue(32);
+ CondCode = ISD::CondCode::SETNE;
+ } else {
+ // all_of -> MOVMSK == ((1 << NumElts) - 1)
+ CompareBits = APInt::getLowBitsSet(32, MaskVT.getVectorNumElements());
+ CondCode = ISD::CondCode::SETEQ;
+ }
+
+ // Perform the select as i32/i64 and then truncate to avoid partial register
+ // stalls.
+ unsigned ResWidth = std::max(BitWidth, 32u);
+ EVT ResVT = EVT::getIntegerVT(*DAG.getContext(), ResWidth);
+ SDLoc DL(Extract);
+ SDValue Zero = DAG.getConstant(0, DL, ResVT);
+ SDValue Ones = DAG.getAllOnesConstant(DL, ResVT);
+ SDValue Res = DAG.getBitcast(MaskVT, Match);
+ Res = DAG.getNode(X86ISD::MOVMSK, DL, MVT::i32, Res);
+ Res = DAG.getSelectCC(DL, Res, DAG.getConstant(CompareBits, DL, MVT::i32),
+ Ones, Zero, CondCode);
+ return DAG.getSExtOrTrunc(Res, DL, ExtractVT);
+ }
+
+ return SDValue();
+}
+
static SDValue combineBasicSADPattern(SDNode *Extract, SelectionDAG &DAG,
const X86Subtarget &Subtarget) {
// PSADBW is only supported on SSE2 and up.
if (!Subtarget.hasSSE2())
return SDValue();
- // Verify the type we're extracting from is appropriate
- // TODO: There's nothing special about i32, any integer type above i16 should
- // work just as well.
+ // Verify the type we're extracting from is any integer type above i16.
EVT VT = Extract->getOperand(0).getValueType();
- if (!VT.isSimple() || !(VT.getVectorElementType() == MVT::i32))
+ if (!VT.isSimple() || !(VT.getVectorElementType().getSizeInBits() > 16))
return SDValue();
unsigned RegSize = 128;
@@ -28595,15 +29264,28 @@ static SDValue combineBasicSADPattern(SDNode *Extract, SelectionDAG &DAG,
else if (Subtarget.hasAVX2())
RegSize = 256;
- // We only handle v16i32 for SSE2 / v32i32 for AVX2 / v64i32 for AVX512.
+ // We handle up to v16i* for SSE2 / v32i* for AVX2 / v64i* for AVX512.
// TODO: We should be able to handle larger vectors by splitting them before
// feeding them into several SADs, and then reducing over those.
- if (VT.getSizeInBits() / 4 > RegSize)
+ if (RegSize / VT.getVectorNumElements() < 8)
return SDValue();
// Match shuffle + add pyramid.
SDValue Root = matchBinOpReduction(Extract, ISD::ADD);
+ // The operand is expected to be zero extended from i8
+ // (verified in detectZextAbsDiff).
+ // In order to convert to i64 and above, an additional any/zero/sign
+ // extend is expected.
+ // The zero extend from 32 bits has no mathematical effect on the result.
+ // Also, the sign extend is effectively a zero extend here
+ // (it extends the sign bit, which is zero).
+ // So it is correct to skip the sign/zero extend instruction.
+ if (Root && (Root.getOpcode() == ISD::SIGN_EXTEND ||
+ Root.getOpcode() == ISD::ZERO_EXTEND ||
+ Root.getOpcode() == ISD::ANY_EXTEND))
+ Root = Root.getOperand(0);
+
// If there was a match, we want Root to be a select that is the root of an
// abs-diff pattern.
if (!Root || (Root.getOpcode() != ISD::VSELECT))
@@ -28614,7 +29296,7 @@ static SDValue combineBasicSADPattern(SDNode *Extract, SelectionDAG &DAG,
if (!detectZextAbsDiff(Root, Zext0, Zext1))
return SDValue();
- // Create the SAD instruction
+ // Create the SAD instruction.
SDLoc DL(Extract);
SDValue SAD = createPSADBW(DAG, Zext0, Zext1, DL);
@@ -28636,13 +29318,103 @@ static SDValue combineBasicSADPattern(SDNode *Extract, SelectionDAG &DAG,
}
}
- // Return the lowest i32.
- MVT ResVT = MVT::getVectorVT(MVT::i32, SadVT.getSizeInBits() / 32);
+ MVT Type = Extract->getSimpleValueType(0);
+ unsigned TypeSizeInBits = Type.getSizeInBits();
+ // Return the lowest TypeSizeInBits bits.
+ MVT ResVT = MVT::getVectorVT(Type, SadVT.getSizeInBits() / TypeSizeInBits);
SAD = DAG.getNode(ISD::BITCAST, DL, ResVT, SAD);
- return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i32, SAD,
+ return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, Type, SAD,
Extract->getOperand(1));
}
+// Attempt to peek through a target shuffle and extract the scalar from the
+// source.
+static SDValue combineExtractWithShuffle(SDNode *N, SelectionDAG &DAG,
+ TargetLowering::DAGCombinerInfo &DCI,
+ const X86Subtarget &Subtarget) {
+ if (DCI.isBeforeLegalizeOps())
+ return SDValue();
+
+ SDValue Src = N->getOperand(0);
+ SDValue Idx = N->getOperand(1);
+
+ EVT VT = N->getValueType(0);
+ EVT SrcVT = Src.getValueType();
+ EVT SrcSVT = SrcVT.getVectorElementType();
+ unsigned NumSrcElts = SrcVT.getVectorNumElements();
+
+ // Don't attempt this for boolean mask vectors or unknown extraction indices.
+ if (SrcSVT == MVT::i1 || !isa<ConstantSDNode>(Idx))
+ return SDValue();
+
+ // Resolve the target shuffle inputs and mask.
+ SmallVector<int, 16> Mask;
+ SmallVector<SDValue, 2> Ops;
+ if (!resolveTargetShuffleInputs(peekThroughBitcasts(Src), Ops, Mask))
+ return SDValue();
+
+ // Attempt to narrow/widen the shuffle mask to the correct size.
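+ // e.g. a v2i64 mask <0,3> scales up to the v4i32 mask <0,1,6,7>, and a
+ // v4i32 mask <0,1,6,7> widens back to the v2i64 mask <0,3>.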
+ if (Mask.size() != NumSrcElts) {
+ if ((NumSrcElts % Mask.size()) == 0) {
+ SmallVector<int, 16> ScaledMask;
+ int Scale = NumSrcElts / Mask.size();
+ scaleShuffleMask(Scale, Mask, ScaledMask);
+ Mask = std::move(ScaledMask);
+ } else if ((Mask.size() % NumSrcElts) == 0) {
+ SmallVector<int, 16> WidenedMask;
+ while (Mask.size() > NumSrcElts &&
+ canWidenShuffleElements(Mask, WidenedMask))
+ Mask = std::move(WidenedMask);
+ // TODO - investigate support for wider shuffle masks with known upper
+ // undef/zero elements for implicit zero-extension.
+ }
+ }
+
+ // Check if narrowing/widening failed.
+ if (Mask.size() != NumSrcElts)
+ return SDValue();
+
+ int SrcIdx = Mask[N->getConstantOperandVal(1)];
+ SDLoc dl(N);
+
+ // If the shuffle source element is undef/zero then we can just accept it.
+ if (SrcIdx == SM_SentinelUndef)
+ return DAG.getUNDEF(VT);
+
+ if (SrcIdx == SM_SentinelZero)
+ return VT.isFloatingPoint() ? DAG.getConstantFP(0.0, dl, VT)
+ : DAG.getConstant(0, dl, VT);
+
+ SDValue SrcOp = Ops[SrcIdx / Mask.size()];
+ SrcOp = DAG.getBitcast(SrcVT, SrcOp);
+ SrcIdx = SrcIdx % Mask.size();
+
+ // We can only extract other elements from 128-bit vectors and in certain
+ // circumstances, depending on SSE-level.
+ // TODO: Investigate using extract_subvector for larger vectors.
+ // TODO: Investigate float/double extraction if it will be just stored.
+ if ((SrcVT == MVT::v4i32 || SrcVT == MVT::v2i64) &&
+ ((SrcIdx == 0 && Subtarget.hasSSE2()) || Subtarget.hasSSE41())) {
+ assert(SrcSVT == VT && "Unexpected extraction type");
+ return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, SrcSVT, SrcOp,
+ DAG.getIntPtrConstant(SrcIdx, dl));
+ }
+
+ if ((SrcVT == MVT::v8i16 && Subtarget.hasSSE2()) ||
+ (SrcVT == MVT::v16i8 && Subtarget.hasSSE41())) {
+ assert(VT.getSizeInBits() >= SrcSVT.getSizeInBits() &&
+ "Unexpected extraction type");
+ unsigned OpCode = (SrcVT == MVT::v8i16 ? X86ISD::PEXTRW : X86ISD::PEXTRB);
+ SDValue ExtOp = DAG.getNode(OpCode, dl, MVT::i32, SrcOp,
+ DAG.getIntPtrConstant(SrcIdx, dl));
+ SDValue Assert = DAG.getNode(ISD::AssertZext, dl, MVT::i32, ExtOp,
+ DAG.getValueType(SrcSVT));
+ return DAG.getZExtOrTrunc(Assert, dl, VT);
+ }
+
+ return SDValue();
+}
+
/// Detect vector gather/scatter index generation and convert it from being a
/// bunch of shuffles and extracts into a somewhat faster sequence.
/// For i686, the best sequence is apparently storing the value and loading
@@ -28653,14 +29425,29 @@ static SDValue combineExtractVectorElt(SDNode *N, SelectionDAG &DAG,
if (SDValue NewOp = XFormVExtractWithShuffleIntoLoad(N, DAG, DCI))
return NewOp;
+ if (SDValue NewOp = combineExtractWithShuffle(N, DAG, DCI, Subtarget))
+ return NewOp;
+
SDValue InputVector = N->getOperand(0);
+ SDValue EltIdx = N->getOperand(1);
+
+ EVT SrcVT = InputVector.getValueType();
+ EVT VT = N->getValueType(0);
SDLoc dl(InputVector);
+
+ // Detect mmx extraction of all bits as an i64. It works better as a bitcast.
+ if (InputVector.getOpcode() == ISD::BITCAST && InputVector.hasOneUse() &&
+ VT == MVT::i64 && SrcVT == MVT::v1i64 && isNullConstant(EltIdx)) {
+ SDValue MMXSrc = InputVector.getOperand(0);
+
+ // The bitcast source is a direct mmx result.
+ if (MMXSrc.getValueType() == MVT::x86mmx)
+ return DAG.getBitcast(VT, InputVector);
+ }
+
// Detect mmx to i32 conversion through a v2i32 elt extract.
if (InputVector.getOpcode() == ISD::BITCAST && InputVector.hasOneUse() &&
- N->getValueType(0) == MVT::i32 &&
- InputVector.getValueType() == MVT::v2i32 &&
- isa<ConstantSDNode>(N->getOperand(1)) &&
- N->getConstantOperandVal(1) == 0) {
+ VT == MVT::i32 && SrcVT == MVT::v2i32 && isNullConstant(EltIdx)) {
SDValue MMXSrc = InputVector.getOperand(0);
// The bitcast source is a direct mmx result.
@@ -28668,15 +29455,11 @@ static SDValue combineExtractVectorElt(SDNode *N, SelectionDAG &DAG,
return DAG.getNode(X86ISD::MMX_MOVD2W, dl, MVT::i32, MMXSrc);
}
- EVT VT = N->getValueType(0);
-
- if (VT == MVT::i1 && isa<ConstantSDNode>(N->getOperand(1)) &&
- InputVector.getOpcode() == ISD::BITCAST &&
+ if (VT == MVT::i1 && InputVector.getOpcode() == ISD::BITCAST &&
+ isa<ConstantSDNode>(EltIdx) &&
isa<ConstantSDNode>(InputVector.getOperand(0))) {
- uint64_t ExtractedElt =
- cast<ConstantSDNode>(N->getOperand(1))->getZExtValue();
- uint64_t InputValue =
- cast<ConstantSDNode>(InputVector.getOperand(0))->getZExtValue();
+ uint64_t ExtractedElt = N->getConstantOperandVal(1);
+ uint64_t InputValue = InputVector.getConstantOperandVal(0);
uint64_t Res = (InputValue >> ExtractedElt) & 1;
return DAG.getConstant(Res, dl, MVT::i1);
}
@@ -28687,9 +29470,13 @@ static SDValue combineExtractVectorElt(SDNode *N, SelectionDAG &DAG,
if (SDValue SAD = combineBasicSADPattern(N, DAG, Subtarget))
return SAD;
+ // Attempt to replace an all_of/any_of horizontal reduction with a MOVMSK.
+ if (SDValue Cmp = combineHorizontalPredicateResult(N, DAG, Subtarget))
+ return Cmp;
+
// Only operate on vectors of 4 elements, where the alternative shuffling
// gets to be more expensive.
- if (InputVector.getValueType() != MVT::v4i32)
+ if (SrcVT != MVT::v4i32)
return SDValue();
// Check whether every use of InputVector is an EXTRACT_VECTOR_ELT with a
@@ -28717,9 +29504,7 @@ static SDValue combineExtractVectorElt(SDNode *N, SelectionDAG &DAG,
return SDValue();
// Record which element was extracted.
- ExtractedElements |=
- 1 << cast<ConstantSDNode>(Extract->getOperand(1))->getZExtValue();
-
+ ExtractedElements |= 1 << Extract->getConstantOperandVal(1);
Uses.push_back(Extract);
}
@@ -28752,11 +29537,11 @@ static SDValue combineExtractVectorElt(SDNode *N, SelectionDAG &DAG,
DAG.getNode(ISD::SRA, dl, MVT::i64, TopHalf, ShAmt));
} else {
// Store the value to a temporary stack slot.
- SDValue StackPtr = DAG.CreateStackTemporary(InputVector.getValueType());
+ SDValue StackPtr = DAG.CreateStackTemporary(SrcVT);
SDValue Ch = DAG.getStore(DAG.getEntryNode(), dl, InputVector, StackPtr,
MachinePointerInfo());
- EVT ElementType = InputVector.getValueType().getVectorElementType();
+ EVT ElementType = SrcVT.getVectorElementType();
unsigned EltSize = ElementType.getSizeInBits() / 8;
// Replace each use (extract) with a load of the appropriate element.
@@ -28779,8 +29564,7 @@ static SDValue combineExtractVectorElt(SDNode *N, SelectionDAG &DAG,
UE = Uses.end(); UI != UE; ++UI) {
SDNode *Extract = *UI;
- SDValue Idx = Extract->getOperand(1);
- uint64_t IdxVal = cast<ConstantSDNode>(Idx)->getZExtValue();
+ uint64_t IdxVal = Extract->getConstantOperandVal(1);
DAG.ReplaceAllUsesOfValueWith(SDValue(Extract, 0), Vals[IdxVal]);
}
@@ -28788,6 +29572,16 @@ static SDValue combineExtractVectorElt(SDNode *N, SelectionDAG &DAG,
return SDValue();
}
+// TODO - merge with combineExtractVectorElt once it can handle the implicit
+// zero-extension of X86ISD::PINSRW/X86ISD::PINSRB in:
+// XFormVExtractWithShuffleIntoLoad, combineHorizontalPredicateResult and
+// combineBasicSADPattern.
+static SDValue combineExtractVectorElt_SSE(SDNode *N, SelectionDAG &DAG,
+ TargetLowering::DAGCombinerInfo &DCI,
+ const X86Subtarget &Subtarget) {
+ return combineExtractWithShuffle(N, DAG, DCI, Subtarget);
+}
+
/// If a vector select has an operand that is -1 or 0, try to simplify the
/// select to a bitwise logic operation.
static SDValue
@@ -28812,12 +29606,11 @@ combineVSelectWithAllOnesOrZeros(SDNode *N, SelectionDAG &DAG,
// This situation only applies to avx512.
if (FValIsAllZeros && Subtarget.hasAVX512() && Cond.hasOneUse() &&
CondVT.getVectorElementType() == MVT::i1) {
- //Invert the cond to not(cond) : xor(op,allones)=not(op)
- SDValue CondNew = DAG.getNode(ISD::XOR, DL, Cond.getValueType(), Cond,
- DAG.getConstant(APInt::getAllOnesValue(CondVT.getScalarSizeInBits()),
- DL, CondVT));
- //Vselect cond, op1, op2 = Vselect not(cond), op2, op1
- return DAG.getNode(ISD::VSELECT, DL, VT, CondNew, RHS, LHS);
+ // Invert the cond to not(cond) : xor(op,allones)=not(op)
+ SDValue CondNew = DAG.getNode(ISD::XOR, DL, Cond.getValueType(), Cond,
+ DAG.getAllOnesConstant(DL, CondVT));
+ // Vselect cond, op1, op2 = Vselect not(cond), op2, op1
+ return DAG.getNode(ISD::VSELECT, DL, VT, CondNew, RHS, LHS);
}
// To use the condition operand as a bitwise mask, it must have elements that
@@ -28920,18 +29713,6 @@ static SDValue combineSelectOfTwoConstants(SDNode *N, SelectionDAG &DAG) {
DAG.getConstant(ShAmt, DL, MVT::i8));
}
- // Optimize Cond ? cst+1 : cst -> zext(setcc(C)+cst.
- if (FalseC->getAPIntValue() + 1 == TrueC->getAPIntValue()) {
- if (NeedsCondInvert) // Invert the condition if needed.
- Cond = DAG.getNode(ISD::XOR, DL, Cond.getValueType(), Cond,
- DAG.getConstant(1, DL, Cond.getValueType()));
-
- // Zero extend the condition if needed.
- Cond = DAG.getNode(ISD::ZERO_EXTEND, DL, FalseC->getValueType(0), Cond);
- return DAG.getNode(ISD::ADD, DL, Cond.getValueType(), Cond,
- SDValue(FalseC, 0));
- }
-
// Optimize cases that will turn into an LEA instruction. This requires
// an i32 or i64 and an efficient multiplier (1, 2, 3, 4, 5, 8, 9).
if (N->getValueType(0) == MVT::i32 || N->getValueType(0) == MVT::i64) {
@@ -29049,7 +29830,7 @@ static bool combineBitcastForMaskedOp(SDValue OrigOp, SelectionDAG &DAG,
return false;
MVT OpEltVT = Op.getSimpleValueType().getVectorElementType();
// Only change element size, not type.
- if (VT.isInteger() != OpEltVT.isInteger())
+ if (EltVT.isInteger() != OpEltVT.isInteger())
return false;
uint64_t Imm = cast<ConstantSDNode>(Op.getOperand(2))->getZExtValue();
Imm = (Imm * OpEltVT.getSizeInBits()) / EltSize;
@@ -29063,7 +29844,7 @@ static bool combineBitcastForMaskedOp(SDValue OrigOp, SelectionDAG &DAG,
DCI.AddToWorklist(Op1.getNode());
DCI.CombineTo(OrigOp.getNode(),
DAG.getNode(Opcode, DL, VT, Op0, Op1,
- DAG.getConstant(Imm, DL, MVT::i8)));
+ DAG.getIntPtrConstant(Imm, DL)));
return true;
}
case ISD::EXTRACT_SUBVECTOR: {
@@ -29072,7 +29853,7 @@ static bool combineBitcastForMaskedOp(SDValue OrigOp, SelectionDAG &DAG,
return false;
MVT OpEltVT = Op.getSimpleValueType().getVectorElementType();
// Only change element size, not type.
- if (VT.isInteger() != OpEltVT.isInteger())
+ if (EltVT.isInteger() != OpEltVT.isInteger())
return false;
uint64_t Imm = cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue();
Imm = (Imm * OpEltVT.getSizeInBits()) / EltSize;
@@ -29084,7 +29865,23 @@ static bool combineBitcastForMaskedOp(SDValue OrigOp, SelectionDAG &DAG,
DCI.AddToWorklist(Op0.getNode());
DCI.CombineTo(OrigOp.getNode(),
DAG.getNode(Opcode, DL, VT, Op0,
- DAG.getConstant(Imm, DL, MVT::i8)));
+ DAG.getIntPtrConstant(Imm, DL)));
+ return true;
+ }
+ case X86ISD::SUBV_BROADCAST: {
+ unsigned EltSize = EltVT.getSizeInBits();
+ if (EltSize != 32 && EltSize != 64)
+ return false;
+ // Only change element size, not type.
+ if (VT.isInteger() != Op.getSimpleValueType().isInteger())
+ return false;
+ SDValue Op0 = Op.getOperand(0);
+ MVT Op0VT = MVT::getVectorVT(EltVT,
+ Op0.getSimpleValueType().getSizeInBits() / EltSize);
+ Op0 = DAG.getBitcast(Op0VT, Op.getOperand(0));
+ DCI.AddToWorklist(Op0.getNode());
+ DCI.CombineTo(OrigOp.getNode(),
+ DAG.getNode(Opcode, DL, VT, Op0));
return true;
}
}
@@ -29370,8 +30167,8 @@ static SDValue combineSelect(SDNode *N, SelectionDAG &DAG,
// If this is a *dynamic* select (non-constant condition) and we can match
// this node with one of the variable blend instructions, restructure the
- // condition so that the blends can use the high bit of each element and use
- // SimplifyDemandedBits to simplify the condition operand.
+ // condition so that blends can use the high (sign) bit of each element and
+ // use SimplifyDemandedBits to simplify the condition operand.
if (N->getOpcode() == ISD::VSELECT && DCI.isBeforeLegalizeOps() &&
!DCI.isBeforeLegalize() &&
!ISD::isBuildVectorOfConstantSDNodes(Cond.getNode())) {
@@ -29406,49 +30203,45 @@ static SDValue combineSelect(SDNode *N, SelectionDAG &DAG,
return SDValue();
assert(BitWidth >= 8 && BitWidth <= 64 && "Invalid mask size");
- APInt DemandedMask = APInt::getHighBitsSet(BitWidth, 1);
-
+ APInt DemandedMask(APInt::getSignBit(BitWidth));
APInt KnownZero, KnownOne;
TargetLowering::TargetLoweringOpt TLO(DAG, DCI.isBeforeLegalize(),
DCI.isBeforeLegalizeOps());
if (TLO.ShrinkDemandedConstant(Cond, DemandedMask) ||
TLI.SimplifyDemandedBits(Cond, DemandedMask, KnownZero, KnownOne,
TLO)) {
- // If we changed the computation somewhere in the DAG, this change
- // will affect all users of Cond.
- // Make sure it is fine and update all the nodes so that we do not
- // use the generic VSELECT anymore. Otherwise, we may perform
- // wrong optimizations as we messed up with the actual expectation
+ // If we changed the computation somewhere in the DAG, this change will
+ // affect all users of Cond. Make sure it is fine and update all the nodes
+ // so that we do not use the generic VSELECT anymore. Otherwise, we may
+ // perform wrong optimizations as we messed with the actual expectation
// for the vector boolean values.
if (Cond != TLO.Old) {
- // Check all uses of that condition operand to check whether it will be
- // consumed by non-BLEND instructions, which may depend on all bits are
- // set properly.
- for (SDNode::use_iterator I = Cond->use_begin(), E = Cond->use_end();
- I != E; ++I)
- if (I->getOpcode() != ISD::VSELECT)
- // TODO: Add other opcodes eventually lowered into BLEND.
+ // Check all uses of the condition operand to see whether it will be
+ // consumed by non-BLEND instructions. Those may require that all bits
+ // are set properly.
+ for (SDNode *U : Cond->uses()) {
+ // TODO: Add other opcodes eventually lowered into BLEND.
+ if (U->getOpcode() != ISD::VSELECT)
return SDValue();
+ }
- // Update all the users of the condition, before committing the change,
- // so that the VSELECT optimizations that expect the correct vector
- // boolean value will not be triggered.
- for (SDNode::use_iterator I = Cond->use_begin(), E = Cond->use_end();
- I != E; ++I)
- DAG.ReplaceAllUsesOfValueWith(
- SDValue(*I, 0),
- DAG.getNode(X86ISD::SHRUNKBLEND, SDLoc(*I), I->getValueType(0),
- Cond, I->getOperand(1), I->getOperand(2)));
+ // Update all users of the condition before committing the change, so
+ // that the VSELECT optimizations that expect the correct vector boolean
+ // value will not be triggered.
+ for (SDNode *U : Cond->uses()) {
+ SDValue SB = DAG.getNode(X86ISD::SHRUNKBLEND, SDLoc(U),
+ U->getValueType(0), Cond, U->getOperand(1),
+ U->getOperand(2));
+ DAG.ReplaceAllUsesOfValueWith(SDValue(U, 0), SB);
+ }
DCI.CommitTargetLoweringOpt(TLO);
return SDValue();
}
- // At this point, only Cond is changed. Change the condition
- // just for N to keep the opportunity to optimize all other
- // users their own way.
- DAG.ReplaceAllUsesOfValueWith(
- SDValue(N, 0),
- DAG.getNode(X86ISD::SHRUNKBLEND, SDLoc(N), N->getValueType(0),
- TLO.New, N->getOperand(1), N->getOperand(2)));
+ // Only Cond (rather than other nodes in the computation chain) was
+ // changed. Change the condition just for N to keep the opportunity to
+ // optimize all other users in their own way.
+ SDValue SB = DAG.getNode(X86ISD::SHRUNKBLEND, DL, VT, TLO.New, LHS, RHS);
+ DAG.ReplaceAllUsesOfValueWith(SDValue(N, 0), SB);
return SDValue();
}
}
@@ -29456,7 +30249,7 @@ static SDValue combineSelect(SDNode *N, SelectionDAG &DAG,
// Look for vselects with LHS/RHS being bitcasted from an operation that
// can be executed on another type. Push the bitcast to the inputs of
// the operation. This exposes opportunities for using masking instructions.
- if (N->getOpcode() == ISD::VSELECT && !DCI.isBeforeLegalizeOps() &&
+ if (N->getOpcode() == ISD::VSELECT && DCI.isAfterLegalizeVectorOps() &&
CondVT.getVectorElementType() == MVT::i1) {
if (combineBitcastForMaskedOp(LHS, DAG, DCI))
return SDValue(N, 0);
@@ -30208,22 +31001,37 @@ static SDValue combineMul(SDNode *N, SelectionDAG &DAG,
}
if (!NewMul) {
- assert(MulAmt != 0 && MulAmt != (VT == MVT::i64 ? UINT64_MAX : UINT32_MAX)
- && "Both cases that could cause potential overflows should have "
- "already been handled.");
- if (isPowerOf2_64(MulAmt - 1))
- // (mul x, 2^N + 1) => (add (shl x, N), x)
- NewMul = DAG.getNode(ISD::ADD, DL, VT, N->getOperand(0),
- DAG.getNode(ISD::SHL, DL, VT, N->getOperand(0),
- DAG.getConstant(Log2_64(MulAmt - 1), DL,
- MVT::i8)));
-
- else if (isPowerOf2_64(MulAmt + 1))
- // (mul x, 2^N - 1) => (sub (shl x, N), x)
- NewMul = DAG.getNode(ISD::SUB, DL, VT, DAG.getNode(ISD::SHL, DL, VT,
- N->getOperand(0),
- DAG.getConstant(Log2_64(MulAmt + 1),
- DL, MVT::i8)), N->getOperand(0));
+ assert(MulAmt != 0 &&
+ MulAmt != (VT == MVT::i64 ? UINT64_MAX : UINT32_MAX) &&
+ "Both cases that could cause potential overflows should have "
+ "already been handled.");
+ int64_t SignMulAmt = C->getSExtValue();
+ if ((SignMulAmt != INT64_MIN) && (SignMulAmt != INT64_MAX) &&
+ (SignMulAmt != -INT64_MAX)) {
+ int NumSign = SignMulAmt > 0 ? 1 : -1;
+ bool IsPowerOf2_64PlusOne = isPowerOf2_64(NumSign * SignMulAmt - 1);
+ bool IsPowerOf2_64MinusOne = isPowerOf2_64(NumSign * SignMulAmt + 1);
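+ // e.g. both 9 and -9 match the 2^N + 1 form and both 7 and -7 match the
+ // 2^N - 1 form; negative amounts are negated via the final subtract below.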
+ if (IsPowerOf2_64PlusOne) {
+ // (mul x, 2^N + 1) => (add (shl x, N), x)
+ NewMul = DAG.getNode(
+ ISD::ADD, DL, VT, N->getOperand(0),
+ DAG.getNode(ISD::SHL, DL, VT, N->getOperand(0),
+ DAG.getConstant(Log2_64(NumSign * SignMulAmt - 1), DL,
+ MVT::i8)));
+ } else if (IsPowerOf2_64MinusOne) {
+ // (mul x, 2^N - 1) => (sub (shl x, N), x)
+ NewMul = DAG.getNode(
+ ISD::SUB, DL, VT,
+ DAG.getNode(ISD::SHL, DL, VT, N->getOperand(0),
+ DAG.getConstant(Log2_64(NumSign * SignMulAmt + 1), DL,
+ MVT::i8)),
+ N->getOperand(0));
+ }
+ // To negate, subtract the result from zero.
+ if ((IsPowerOf2_64PlusOne || IsPowerOf2_64MinusOne) && NumSign == -1)
+ NewMul =
+ DAG.getNode(ISD::SUB, DL, VT, DAG.getConstant(0, DL, VT), NewMul);
+ }
}
if (NewMul)
@@ -30396,42 +31204,95 @@ static SDValue combineShift(SDNode* N, SelectionDAG &DAG,
return SDValue();
}
-static SDValue combineVectorShift(SDNode *N, SelectionDAG &DAG,
- TargetLowering::DAGCombinerInfo &DCI,
- const X86Subtarget &Subtarget) {
- assert((X86ISD::VSHLI == N->getOpcode() || X86ISD::VSRLI == N->getOpcode()) &&
- "Unexpected opcode");
+static SDValue combineVectorShiftImm(SDNode *N, SelectionDAG &DAG,
+ TargetLowering::DAGCombinerInfo &DCI,
+ const X86Subtarget &Subtarget) {
+ unsigned Opcode = N->getOpcode();
+ assert((X86ISD::VSHLI == Opcode || X86ISD::VSRAI == Opcode ||
+ X86ISD::VSRLI == Opcode) &&
+ "Unexpected shift opcode");
+ bool LogicalShift = X86ISD::VSHLI == Opcode || X86ISD::VSRLI == Opcode;
EVT VT = N->getValueType(0);
+ SDValue N0 = N->getOperand(0);
+ SDValue N1 = N->getOperand(1);
unsigned NumBitsPerElt = VT.getScalarSizeInBits();
-
- // This fails for mask register (vXi1) shifts.
- if ((NumBitsPerElt % 8) != 0)
- return SDValue();
+ assert(VT == N0.getValueType() && (NumBitsPerElt % 8) == 0 &&
+ "Unexpected value type");
// Out of range logical bit shifts are guaranteed to be zero.
- APInt ShiftVal = cast<ConstantSDNode>(N->getOperand(1))->getAPIntValue();
- if (ShiftVal.zextOrTrunc(8).uge(NumBitsPerElt))
- return getZeroVector(VT.getSimpleVT(), Subtarget, DAG, SDLoc(N));
+ // Out of range arithmetic bit shifts splat the sign bit.
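+ // e.g. (VSRAI v4i32 X, 36) is equivalent to (VSRAI v4i32 X, 31).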
+ APInt ShiftVal = cast<ConstantSDNode>(N1)->getAPIntValue();
+ if (ShiftVal.zextOrTrunc(8).uge(NumBitsPerElt)) {
+ if (LogicalShift)
+ return getZeroVector(VT.getSimpleVT(), Subtarget, DAG, SDLoc(N));
+ else
+ ShiftVal = NumBitsPerElt - 1;
+ }
// Shift N0 by zero -> N0.
if (!ShiftVal)
- return N->getOperand(0);
+ return N0;
// Shift zero -> zero.
- if (ISD::isBuildVectorAllZeros(N->getOperand(0).getNode()))
+ if (ISD::isBuildVectorAllZeros(N0.getNode()))
return getZeroVector(VT.getSimpleVT(), Subtarget, DAG, SDLoc(N));
+ // fold (VSRLI (VSRAI X, Y), 31) -> (VSRLI X, 31).
+ // This VSRLI only looks at the sign bit, which is unmodified by VSRAI.
+ // TODO - support other sra opcodes as needed.
+ if (Opcode == X86ISD::VSRLI && (ShiftVal + 1) == NumBitsPerElt &&
+ N0.getOpcode() == X86ISD::VSRAI)
+ return DAG.getNode(X86ISD::VSRLI, SDLoc(N), VT, N0.getOperand(0), N1);
+
// We can decode 'whole byte' logical bit shifts as shuffles.
- if ((ShiftVal.getZExtValue() % 8) == 0) {
+ if (LogicalShift && (ShiftVal.getZExtValue() % 8) == 0) {
SDValue Op(N, 0);
SmallVector<int, 1> NonceMask; // Just a placeholder.
NonceMask.push_back(0);
- if (combineX86ShufflesRecursively({Op}, 0, Op, NonceMask,
+ if (combineX86ShufflesRecursively({Op}, 0, Op, NonceMask, {},
/*Depth*/ 1, /*HasVarMask*/ false, DAG,
DCI, Subtarget))
return SDValue(); // This routine will use CombineTo to replace N.
}
+ // Constant Folding.
+ APInt UndefElts;
+ SmallVector<APInt, 32> EltBits;
+ if (N->isOnlyUserOf(N0.getNode()) &&
+ getTargetConstantBitsFromNode(N0, NumBitsPerElt, UndefElts, EltBits)) {
+ assert(EltBits.size() == VT.getVectorNumElements() &&
+ "Unexpected shift value type");
+ unsigned ShiftImm = ShiftVal.getZExtValue();
+ for (APInt &Elt : EltBits) {
+ if (X86ISD::VSHLI == Opcode)
+ Elt = Elt.shl(ShiftImm);
+ else if (X86ISD::VSRAI == Opcode)
+ Elt = Elt.ashr(ShiftImm);
+ else
+ Elt = Elt.lshr(ShiftImm);
+ }
+ return getConstVector(EltBits, UndefElts, VT.getSimpleVT(), DAG, SDLoc(N));
+ }
+
+ return SDValue();
+}
+
+static SDValue combineVectorInsert(SDNode *N, SelectionDAG &DAG,
+ TargetLowering::DAGCombinerInfo &DCI,
+ const X86Subtarget &Subtarget) {
+ assert(
+ ((N->getOpcode() == X86ISD::PINSRB && N->getValueType(0) == MVT::v16i8) ||
+ (N->getOpcode() == X86ISD::PINSRW &&
+ N->getValueType(0) == MVT::v8i16)) &&
+ "Unexpected vector insertion");
+
+ // Attempt to combine PINSRB/PINSRW patterns to a shuffle.
+ SDValue Op(N, 0);
+ SmallVector<int, 1> NonceMask; // Just a placeholder.
+ NonceMask.push_back(0);
+ combineX86ShufflesRecursively({Op}, 0, Op, NonceMask, {},
+ /*Depth*/ 1, /*HasVarMask*/ false, DAG,
+ DCI, Subtarget);
return SDValue();
}
@@ -30550,33 +31411,15 @@ static SDValue combineANDXORWithAllOnesIntoANDNP(SDNode *N, SelectionDAG &DAG) {
if (VT != MVT::v2i64 && VT != MVT::v4i64 && VT != MVT::v8i64)
return SDValue();
- // Canonicalize XOR to the left.
- if (N1.getOpcode() == ISD::XOR)
- std::swap(N0, N1);
+ if (N0.getOpcode() == ISD::XOR &&
+ ISD::isBuildVectorAllOnes(N0.getOperand(1).getNode()))
+ return DAG.getNode(X86ISD::ANDNP, DL, VT, N0.getOperand(0), N1);
- if (N0.getOpcode() != ISD::XOR)
- return SDValue();
-
- SDValue N00 = N0->getOperand(0);
- SDValue N01 = N0->getOperand(1);
+ if (N1.getOpcode() == ISD::XOR &&
+ ISD::isBuildVectorAllOnes(N1.getOperand(1).getNode()))
+ return DAG.getNode(X86ISD::ANDNP, DL, VT, N1.getOperand(0), N0);
- N01 = peekThroughBitcasts(N01);
-
- // Either match a direct AllOnes for 128, 256, and 512-bit vectors, or an
- // insert_subvector building a 256-bit AllOnes vector.
- if (!ISD::isBuildVectorAllOnes(N01.getNode())) {
- if (!VT.is256BitVector() || N01->getOpcode() != ISD::INSERT_SUBVECTOR)
- return SDValue();
-
- SDValue V1 = N01->getOperand(0);
- SDValue V2 = N01->getOperand(1);
- if (V1.getOpcode() != ISD::INSERT_SUBVECTOR ||
- !V1.getOperand(0).isUndef() ||
- !ISD::isBuildVectorAllOnes(V1.getOperand(1).getNode()) ||
- !ISD::isBuildVectorAllOnes(V2.getNode()))
- return SDValue();
- }
- return DAG.getNode(X86ISD::ANDNP, DL, VT, N00, N1);
+ return SDValue();
}
// On AVX/AVX2 the type v8i1 is legalized to v8i16, which is an XMM sized
@@ -30696,38 +31539,34 @@ static SDValue convertIntLogicToFPLogic(SDNode *N, SelectionDAG &DAG,
return SDValue();
}
-/// If this is a PCMPEQ or PCMPGT result that is bitwise-anded with 1 (this is
-/// the x86 lowering of a SETCC + ZEXT), replace the 'and' with a shift-right to
-/// eliminate loading the vector constant mask value. This relies on the fact
-/// that a PCMP always creates an all-ones or all-zeros bitmask per element.
-static SDValue combinePCMPAnd1(SDNode *N, SelectionDAG &DAG) {
+/// If this is a zero/all-bits result that is bitwise-anded with a low-bits
+/// mask (Mask == 1 for the x86 lowering of a SETCC + ZEXT), replace the 'and'
+/// with a shift-right to eliminate loading the vector constant mask value.
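+/// e.g. for v4i32, ANDing a compare result with 1 becomes a logical
+/// shift-right of the compare result by 31.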
+static SDValue combineAndMaskToShift(SDNode *N, SelectionDAG &DAG,
+ const X86Subtarget &Subtarget) {
SDValue Op0 = peekThroughBitcasts(N->getOperand(0));
SDValue Op1 = peekThroughBitcasts(N->getOperand(1));
+ EVT VT0 = Op0.getValueType();
+ EVT VT1 = Op1.getValueType();
- // TODO: Use AssertSext to mark any nodes that have the property of producing
- // all-ones or all-zeros. Then check for that node rather than particular
- // opcodes.
- if (Op0.getOpcode() != X86ISD::PCMPEQ && Op0.getOpcode() != X86ISD::PCMPGT)
+ if (VT0 != VT1 || !VT0.isSimple() || !VT0.isInteger())
return SDValue();
- // The existence of the PCMP node guarantees that we have the required SSE2 or
- // AVX2 for a shift of this vector type, but there is no vector shift by
- // immediate for a vector with byte elements (PSRLB). 512-bit vectors use the
- // masked compare nodes, so they should not make it here.
- EVT VT0 = Op0.getValueType();
- EVT VT1 = Op1.getValueType();
- unsigned EltBitWidth = VT0.getScalarSizeInBits();
- if (VT0 != VT1 || EltBitWidth == 8)
+ APInt SplatVal;
+ if (!ISD::isConstantSplatVector(Op1.getNode(), SplatVal) ||
+ !SplatVal.isMask())
return SDValue();
- assert(VT0.getSizeInBits() == 128 || VT0.getSizeInBits() == 256);
+ if (!SupportedVectorShiftWithImm(VT0.getSimpleVT(), Subtarget, ISD::SRL))
+ return SDValue();
- APInt SplatVal;
- if (!ISD::isConstantSplatVector(Op1.getNode(), SplatVal) || SplatVal != 1)
+ unsigned EltBitWidth = VT0.getScalarSizeInBits();
+ if (EltBitWidth != DAG.ComputeNumSignBits(Op0))
return SDValue();
SDLoc DL(N);
- SDValue ShAmt = DAG.getConstant(EltBitWidth - 1, DL, MVT::i8);
+ unsigned ShiftVal = SplatVal.countTrailingOnes();
+ SDValue ShAmt = DAG.getConstant(EltBitWidth - ShiftVal, DL, MVT::i8);
SDValue Shift = DAG.getNode(X86ISD::VSRLI, DL, VT0, Op0, ShAmt);
return DAG.getBitcast(N->getValueType(0), Shift);
}
@@ -30747,7 +31586,7 @@ static SDValue combineAnd(SDNode *N, SelectionDAG &DAG,
if (SDValue R = combineANDXORWithAllOnesIntoANDNP(N, DAG))
return R;
- if (SDValue ShiftRight = combinePCMPAnd1(N, DAG))
+ if (SDValue ShiftRight = combineAndMaskToShift(N, DAG, Subtarget))
return ShiftRight;
EVT VT = N->getValueType(0);
@@ -30760,7 +31599,7 @@ static SDValue combineAnd(SDNode *N, SelectionDAG &DAG,
SDValue Op(N, 0);
SmallVector<int, 1> NonceMask; // Just a placeholder.
NonceMask.push_back(0);
- if (combineX86ShufflesRecursively({Op}, 0, Op, NonceMask,
+ if (combineX86ShufflesRecursively({Op}, 0, Op, NonceMask, {},
/*Depth*/ 1, /*HasVarMask*/ false, DAG,
DCI, Subtarget))
return SDValue(); // This routine will use CombineTo to replace N.
@@ -30969,7 +31808,7 @@ static SDValue combineOrCmpEqZeroToCtlzSrl(SDNode *N, SelectionDAG &DAG,
return N->getOpcode() == X86ISD::SETCC && N->hasOneUse() &&
X86::CondCode(N->getConstantOperandVal(0)) == X86::COND_E &&
N->getOperand(1).getOpcode() == X86ISD::CMP &&
- N->getOperand(1).getConstantOperandVal(1) == 0 &&
+ isNullConstant(N->getOperand(1).getOperand(1)) &&
N->getOperand(1).getValueType().bitsGE(MVT::i32);
};
@@ -31272,6 +32111,74 @@ static SDValue foldVectorXorShiftIntoCmp(SDNode *N, SelectionDAG &DAG,
return DAG.getNode(X86ISD::PCMPGT, SDLoc(N), VT, Shift.getOperand(0), Ones);
}
+/// Check if truncation with saturation from type \p SrcVT to \p DstVT
+/// is valid for the given \p Subtarget.
+static bool isSATValidOnAVX512Subtarget(EVT SrcVT, EVT DstVT,
+ const X86Subtarget &Subtarget) {
+ if (!Subtarget.hasAVX512())
+ return false;
+
+ // FIXME: Scalar type may be supported if we move it to a vector register.
+ if (!SrcVT.isVector() || !SrcVT.isSimple() || SrcVT.getSizeInBits() > 512)
+ return false;
+
+ EVT SrcElVT = SrcVT.getScalarType();
+ EVT DstElVT = DstVT.getScalarType();
+ if (SrcElVT.getSizeInBits() < 16 || SrcElVT.getSizeInBits() > 64)
+ return false;
+ if (DstElVT.getSizeInBits() < 8 || DstElVT.getSizeInBits() > 32)
+ return false;
+ if (SrcVT.is512BitVector() || Subtarget.hasVLX())
+ return SrcElVT.getSizeInBits() >= 32 || Subtarget.hasBWI();
+ return false;
+}
+
+/// Detect a pattern of truncation with saturation:
+/// (truncate (umin (x, unsigned_max_of_dest_type)) to dest_type).
+/// Return the source value to be truncated or SDValue() if the pattern was not
+/// matched.
+static SDValue detectUSatPattern(SDValue In, EVT VT) {
+ if (In.getOpcode() != ISD::UMIN)
+ return SDValue();
+
+ // Saturation with truncation. We truncate from InVT to VT.
+ assert(In.getScalarValueSizeInBits() > VT.getScalarSizeInBits() &&
+ "Unexpected types for truncate operation");
+
+ APInt C;
+ if (ISD::isConstantSplatVector(In.getOperand(1).getNode(), C)) {
+ // C should be equal to UINT32_MAX / UINT16_MAX / UINT8_MAX according to
+ // the element size of the destination type.
+ return C.isMask(VT.getScalarSizeInBits()) ? In.getOperand(0) :
+ SDValue();
+ }
+ return SDValue();
+}
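detectUSatPattern recognizes the scalar idiom below lifted to vectors: clamping with umin at the destination's unsigned max before truncating is exactly a saturating truncate. A standalone C++ sketch (names are illustrative, not part of the patch):

#include <algorithm>
#include <cassert>
#include <cstdint>

// trunc(umin(X, UINT8_MAX)): the pattern a VPMOVUS* instruction does in one step.
static uint8_t TruncUSat(uint16_t X) {
  return (uint8_t)std::min<uint16_t>(X, 0xFF);
}

int main() {
  assert(TruncUSat(42) == 42);
  assert(TruncUSat(300) == 255); // saturates instead of wrapping to 44
  return 0;
}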
+
+/// Detect a pattern of truncation with saturation:
+/// (truncate (umin (x, unsigned_max_of_dest_type)) to dest_type).
+/// The types should allow use of the VPMOVUS* instructions on AVX512.
+/// Return the source value to be truncated or SDValue() if the pattern was not
+/// matched.
+static SDValue detectAVX512USatPattern(SDValue In, EVT VT,
+ const X86Subtarget &Subtarget) {
+ if (!isSATValidOnAVX512Subtarget(In.getValueType(), VT, Subtarget))
+ return SDValue();
+ return detectUSatPattern(In, VT);
+}
+
+static SDValue
+combineTruncateWithUSat(SDValue In, EVT VT, SDLoc &DL, SelectionDAG &DAG,
+ const X86Subtarget &Subtarget) {
+ const TargetLowering &TLI = DAG.getTargetLoweringInfo();
+ if (!TLI.isTypeLegal(In.getValueType()) || !TLI.isTypeLegal(VT))
+ return SDValue();
+ if (auto USatVal = detectUSatPattern(In, VT))
+ if (isSATValidOnAVX512Subtarget(In.getValueType(), VT, Subtarget))
+ return DAG.getNode(X86ISD::VTRUNCUS, DL, VT, USatVal);
+ return SDValue();
+}
+
/// This function detects the AVG pattern between vectors of unsigned i8/i16,
/// which is c = (a + b + 1) / 2, and replaces this operation with the efficient
/// X86ISD::AVG instruction.
@@ -31664,7 +32571,7 @@ static SDValue combineMaskedLoad(SDNode *N, SelectionDAG &DAG,
Mld->getBasePtr(), NewMask, WideSrc0,
Mld->getMemoryVT(), Mld->getMemOperand(),
ISD::NON_EXTLOAD);
- SDValue NewVec = DAG.getNode(X86ISD::VSEXT, dl, VT, WideLd);
+ SDValue NewVec = getExtendInVec(X86ISD::VSEXT, dl, VT, WideLd, DAG);
return DCI.CombineTo(N, NewVec, WideLd.getValue(1), true);
}
@@ -31838,6 +32745,12 @@ static SDValue combineStore(SDNode *N, SelectionDAG &DAG,
St->getPointerInfo(), St->getAlignment(),
St->getMemOperand()->getFlags());
+ if (SDValue Val =
+ detectAVX512USatPattern(St->getValue(), St->getMemoryVT(), Subtarget))
+ return EmitTruncSStore(false /* Unsigned saturation */, St->getChain(),
+ dl, Val, St->getBasePtr(),
+ St->getMemoryVT(), St->getMemOperand(), DAG);
+
const TargetLowering &TLI = DAG.getTargetLoweringInfo();
unsigned NumElems = VT.getVectorNumElements();
assert(StVT != VT && "Cannot truncate to the same type");
@@ -32198,13 +33111,30 @@ static SDValue combineTruncatedArithmetic(SDNode *N, SelectionDAG &DAG,
EVT VT = N->getValueType(0);
EVT SrcVT = Src.getValueType();
- auto IsRepeatedOpOrOneUseConstant = [](SDValue Op0, SDValue Op1) {
- // TODO: Add extra cases where we can truncate both inputs for the
- // cost of one (or none).
- // e.g. TRUNC( BINOP( EXT( X ), EXT( Y ) ) ) --> BINOP( X, Y )
+ auto IsRepeatedOpOrFreeTruncation = [VT](SDValue Op0, SDValue Op1) {
+ unsigned TruncSizeInBits = VT.getScalarSizeInBits();
+
+ // Repeated operand, so we are only trading one output truncation for
+ // one input truncation.
if (Op0 == Op1)
return true;
+ // See if either operand has been extended from a smaller/equal size to
+ // the truncation size, allowing a truncation to combine with the extend.
+ unsigned Opcode0 = Op0.getOpcode();
+ if ((Opcode0 == ISD::ANY_EXTEND || Opcode0 == ISD::SIGN_EXTEND ||
+ Opcode0 == ISD::ZERO_EXTEND) &&
+ Op0.getOperand(0).getScalarValueSizeInBits() <= TruncSizeInBits)
+ return true;
+
+ unsigned Opcode1 = Op1.getOpcode();
+ if ((Opcode1 == ISD::ANY_EXTEND || Opcode1 == ISD::SIGN_EXTEND ||
+ Opcode1 == ISD::ZERO_EXTEND) &&
+ Op1.getOperand(0).getScalarValueSizeInBits() <= TruncSizeInBits)
+ return true;
+
+ // See if either operand is a single-use constant which can be constant
+ // folded.
SDValue BC0 = peekThroughOneUseBitcasts(Op0);
SDValue BC1 = peekThroughOneUseBitcasts(Op1);
return ISD::isBuildVectorOfConstantSDNodes(BC0.getNode()) ||
@@ -32236,7 +33166,7 @@ static SDValue combineTruncatedArithmetic(SDNode *N, SelectionDAG &DAG,
SDValue Op0 = Src.getOperand(0);
SDValue Op1 = Src.getOperand(1);
if (TLI.isOperationLegalOrPromote(Opcode, VT) &&
- IsRepeatedOpOrOneUseConstant(Op0, Op1))
+ IsRepeatedOpOrFreeTruncation(Op0, Op1))
return TruncateArithmetic(Op0, Op1);
break;
}
@@ -32252,7 +33182,7 @@ static SDValue combineTruncatedArithmetic(SDNode *N, SelectionDAG &DAG,
SDValue Op0 = Src.getOperand(0);
SDValue Op1 = Src.getOperand(1);
if (TLI.isOperationLegal(Opcode, VT) &&
- IsRepeatedOpOrOneUseConstant(Op0, Op1))
+ IsRepeatedOpOrFreeTruncation(Op0, Op1))
return TruncateArithmetic(Op0, Op1);
break;
}
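The free-truncation reasoning above follows from modular arithmetic: add, sub, and mul commute with truncation, so TRUNC(BINOP(EXT(X), EXT(Y))) == BINOP(X, Y). A standalone C++ check (illustrative only):

#include <cassert>
#include <cstdint>

int main() {
  uint8_t X = 200, Y = 130;
  // Widening, operating, then truncating matches operating at the narrow width.
  assert((uint8_t)((uint32_t)X * (uint32_t)Y) == (uint8_t)(X * Y));
  assert((uint8_t)((uint32_t)X + (uint32_t)Y) == (uint8_t)(X + Y));
  return 0;
}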
@@ -32458,6 +33388,10 @@ static SDValue combineTruncate(SDNode *N, SelectionDAG &DAG,
if (SDValue Avg = detectAVGPattern(Src, VT, DAG, Subtarget, DL))
return Avg;
+ // Try to combine truncation with unsigned saturation.
+ if (SDValue Val = combineTruncateWithUSat(Src, VT, DL, DAG, Subtarget))
+ return Val;
+
// The bitcast source is a direct mmx result.
// Detect bitcasts between i32 to x86mmx
if (Src.getOpcode() == ISD::BITCAST && VT == MVT::i32) {
@@ -32804,6 +33738,34 @@ static SDValue combineFMinNumFMaxNum(SDNode *N, SelectionDAG &DAG,
return DAG.getNode(SelectOpcode, DL, VT, IsOp0Nan, Op1, MinOrMax);
}
+/// Do target-specific dag combines on X86ISD::ANDNP nodes.
+static SDValue combineAndnp(SDNode *N, SelectionDAG &DAG,
+ TargetLowering::DAGCombinerInfo &DCI,
+ const X86Subtarget &Subtarget) {
+ // ANDNP(0, x) -> x
+ if (ISD::isBuildVectorAllZeros(N->getOperand(0).getNode()))
+ return N->getOperand(1);
+
+ // ANDNP(x, 0) -> 0
+ if (ISD::isBuildVectorAllZeros(N->getOperand(1).getNode()))
+ return getZeroVector(N->getSimpleValueType(0), Subtarget, DAG, SDLoc(N));
+
+ EVT VT = N->getValueType(0);
+
+ // Attempt to recursively combine a bitmask ANDNP with shuffles.
+ if (VT.isVector() && (VT.getScalarSizeInBits() % 8) == 0) {
+ SDValue Op(N, 0);
+ SmallVector<int, 1> NonceMask; // Just a placeholder.
+ NonceMask.push_back(0);
+ if (combineX86ShufflesRecursively({Op}, 0, Op, NonceMask, {},
+ /*Depth*/ 1, /*HasVarMask*/ false, DAG,
+ DCI, Subtarget))
+ return SDValue(); // This routine will use CombineTo to replace N.
+ }
+
+ return SDValue();
+}
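X86ISD::ANDNP computes (~A) & B, which is why an all-zeros first operand folds to B and an all-zeros second operand folds to zero. The same operation is exposed by the SSE2 andnot intrinsic, shown in this standalone sketch (not part of the patch):

#include <cassert>
#include <cstdint>
#include <emmintrin.h>

int main() {
  __m128i Zero = _mm_setzero_si128();
  __m128i B = _mm_set1_epi32(0x12345678);
  uint32_t Out[4];
  _mm_storeu_si128((__m128i *)Out, _mm_andnot_si128(Zero, B)); // (~0) & B == B
  assert(Out[0] == 0x12345678u);
  return 0;
}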
+
static SDValue combineBT(SDNode *N, SelectionDAG &DAG,
TargetLowering::DAGCombinerInfo &DCI) {
// BT ignores high bits in the bit index operand.
@@ -33065,13 +34027,22 @@ static SDValue combineSext(SDNode *N, SelectionDAG &DAG,
if (!DCI.isBeforeLegalizeOps()) {
if (InVT == MVT::i1) {
SDValue Zero = DAG.getConstant(0, DL, VT);
- SDValue AllOnes =
- DAG.getConstant(APInt::getAllOnesValue(VT.getSizeInBits()), DL, VT);
+ SDValue AllOnes = DAG.getAllOnesConstant(DL, VT);
return DAG.getNode(ISD::SELECT, DL, VT, N0, AllOnes, Zero);
}
return SDValue();
}
+ if (InVT == MVT::i1 && N0.getOpcode() == ISD::XOR &&
+ isAllOnesConstant(N0.getOperand(1)) && N0.hasOneUse()) {
+ // Inverting and sign-extending a boolean is the same as zero-extending and
+ // then subtracting 1, because 0 becomes -1 and 1 becomes 0. The subtract is
+ // efficiently lowered with an LEA or a DEC. This is the same as:
+ // select Bool, 0, -1.
+ // sext (xor Bool, -1) --> sub (zext Bool), 1
+ SDValue Zext = DAG.getNode(ISD::ZERO_EXTEND, DL, VT, N0.getOperand(0));
+ return DAG.getNode(ISD::SUB, DL, VT, Zext, DAG.getConstant(1, DL, VT));
+ }
+
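The fold above is easy to verify exhaustively over i1: sign-extending an inverted boolean gives -1 or 0, the same as zero-extending and subtracting 1. A standalone sketch:

#include <cassert>

int main() {
  for (int B : {0, 1})
    assert(-(B ^ 1) == B - 1); // sext(xor B, -1) == sub(zext B, 1) for i1 B
  return 0;
}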
if (SDValue V = combineToExtendVectorInReg(N, DAG, DCI, Subtarget))
return V;
@@ -33212,8 +34183,47 @@ static SDValue combineZext(SDNode *N, SelectionDAG &DAG,
return SDValue();
}
-/// Optimize x == -y --> x+y == 0
-/// x != -y --> x+y != 0
+/// Try to map a 128-bit or larger integer comparison to vector instructions
+/// before type legalization splits it up into chunks.
+static SDValue combineVectorSizedSetCCEquality(SDNode *SetCC, SelectionDAG &DAG,
+ const X86Subtarget &Subtarget) {
+ ISD::CondCode CC = cast<CondCodeSDNode>(SetCC->getOperand(2))->get();
+ assert((CC == ISD::SETNE || CC == ISD::SETEQ) && "Bad comparison predicate");
+
+ // We're looking for an oversized integer equality comparison, but ignore a
+ // comparison with zero because that gets special treatment in EmitTest().
+ SDValue X = SetCC->getOperand(0);
+ SDValue Y = SetCC->getOperand(1);
+ EVT OpVT = X.getValueType();
+ unsigned OpSize = OpVT.getSizeInBits();
+ if (!OpVT.isScalarInteger() || OpSize < 128 || isNullConstant(Y))
+ return SDValue();
+
+ // TODO: Use PXOR + PTEST for SSE4.1 or later?
+ // TODO: Add support for AVX-512.
+ EVT VT = SetCC->getValueType(0);
+ SDLoc DL(SetCC);
+ if ((OpSize == 128 && Subtarget.hasSSE2()) ||
+ (OpSize == 256 && Subtarget.hasAVX2())) {
+ EVT VecVT = OpSize == 128 ? MVT::v16i8 : MVT::v32i8;
+ SDValue VecX = DAG.getBitcast(VecVT, X);
+ SDValue VecY = DAG.getBitcast(VecVT, Y);
+
+ // If all bytes match (bitmask is 0x(FFFF)FFFF), that's equality.
+ // setcc i128 X, Y, eq --> setcc (pmovmskb (pcmpeqb X, Y)), 0xFFFF, eq
+ // setcc i128 X, Y, ne --> setcc (pmovmskb (pcmpeqb X, Y)), 0xFFFF, ne
+ // setcc i256 X, Y, eq --> setcc (vpmovmskb (vpcmpeqb X, Y)), 0xFFFFFFFF, eq
+ // setcc i256 X, Y, ne --> setcc (vpmovmskb (vpcmpeqb X, Y)), 0xFFFFFFFF, ne
+ SDValue Cmp = DAG.getNode(X86ISD::PCMPEQ, DL, VecVT, VecX, VecY);
+ SDValue MovMsk = DAG.getNode(X86ISD::MOVMSK, DL, MVT::i32, Cmp);
+ SDValue FFFFs = DAG.getConstant(OpSize == 128 ? 0xFFFF : 0xFFFFFFFF, DL,
+ MVT::i32);
+ return DAG.getSetCC(DL, VT, MovMsk, FFFFs, CC);
+ }
+
+ return SDValue();
+}
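The new combine mirrors what hand-written SSE2 equality loops do: compare all bytes at once and test the 16-bit byte mask against 0xFFFF. A standalone sketch with the corresponding intrinsics (illustrative, not LLVM code; the helper name is hypothetical):

#include <cassert>
#include <cstring>
#include <emmintrin.h>

static bool Equal16(const void *A, const void *B) {
  __m128i VA = _mm_loadu_si128((const __m128i *)A);
  __m128i VB = _mm_loadu_si128((const __m128i *)B);
  // pcmpeqb + pmovmskb: one bit per byte, all ones iff all 16 bytes match.
  return _mm_movemask_epi8(_mm_cmpeq_epi8(VA, VB)) == 0xFFFF;
}

int main() {
  char X[16] = "fifteen chars..", Y[16] = "fifteen chars..";
  assert(Equal16(X, Y) && memcmp(X, Y, 16) == 0);
  return 0;
}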
+
static SDValue combineSetCC(SDNode *N, SelectionDAG &DAG,
const X86Subtarget &Subtarget) {
ISD::CondCode CC = cast<CondCodeSDNode>(N->getOperand(2))->get();
@@ -33222,21 +34232,27 @@ static SDValue combineSetCC(SDNode *N, SelectionDAG &DAG,
EVT VT = N->getValueType(0);
SDLoc DL(N);
- if ((CC == ISD::SETNE || CC == ISD::SETEQ) && LHS.getOpcode() == ISD::SUB)
- if (isNullConstant(LHS.getOperand(0)) && LHS.hasOneUse()) {
- SDValue addV = DAG.getNode(ISD::ADD, DL, LHS.getValueType(), RHS,
- LHS.getOperand(1));
- return DAG.getSetCC(DL, N->getValueType(0), addV,
- DAG.getConstant(0, DL, addV.getValueType()), CC);
+ if (CC == ISD::SETNE || CC == ISD::SETEQ) {
+ EVT OpVT = LHS.getValueType();
+ // 0-x == y --> x+y == 0
+ // 0-x != y --> x+y != 0
+ if (LHS.getOpcode() == ISD::SUB && isNullConstant(LHS.getOperand(0)) &&
+ LHS.hasOneUse()) {
+ SDValue Add = DAG.getNode(ISD::ADD, DL, OpVT, RHS, LHS.getOperand(1));
+ return DAG.getSetCC(DL, VT, Add, DAG.getConstant(0, DL, OpVT), CC);
}
- if ((CC == ISD::SETNE || CC == ISD::SETEQ) && RHS.getOpcode() == ISD::SUB)
- if (isNullConstant(RHS.getOperand(0)) && RHS.hasOneUse()) {
- SDValue addV = DAG.getNode(ISD::ADD, DL, RHS.getValueType(), LHS,
- RHS.getOperand(1));
- return DAG.getSetCC(DL, N->getValueType(0), addV,
- DAG.getConstant(0, DL, addV.getValueType()), CC);
+ // x == 0-y --> x+y == 0
+ // x != 0-y --> x+y != 0
+ if (RHS.getOpcode() == ISD::SUB && isNullConstant(RHS.getOperand(0)) &&
+ RHS.hasOneUse()) {
+ SDValue Add = DAG.getNode(ISD::ADD, DL, OpVT, LHS, RHS.getOperand(1));
+ return DAG.getSetCC(DL, VT, Add, DAG.getConstant(0, DL, OpVT), CC);
}
+ if (SDValue V = combineVectorSizedSetCCEquality(N, DAG, Subtarget))
+ return V;
+ }
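Both negation rewrites above rely on two's complement: X == -Y holds exactly when X + Y wraps to zero, including the INT_MIN edge case. A standalone check:

#include <cassert>
#include <cstdint>

int main() {
  uint32_t X = 0x80000000u, Y = 0x80000000u; // negating INT_MIN wraps to itself
  assert(X == 0u - Y && X + Y == 0u);
  return 0;
}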
+
if (VT.getScalarType() == MVT::i1 &&
(CC == ISD::SETNE || CC == ISD::SETEQ || ISD::isSignedIntSetCC(CC))) {
bool IsSEXT0 =
@@ -33293,56 +34309,13 @@ static SDValue combineGatherScatter(SDNode *N, SelectionDAG &DAG) {
return SDValue();
}
-// Helper function of performSETCCCombine. It is to materialize "setb reg"
-// as "sbb reg,reg", since it can be extended without zext and produces
-// an all-ones bit which is more useful than 0/1 in some cases.
-static SDValue MaterializeSETB(const SDLoc &DL, SDValue EFLAGS,
- SelectionDAG &DAG, MVT VT) {
- if (VT == MVT::i8)
- return DAG.getNode(ISD::AND, DL, VT,
- DAG.getNode(X86ISD::SETCC_CARRY, DL, MVT::i8,
- DAG.getConstant(X86::COND_B, DL, MVT::i8),
- EFLAGS),
- DAG.getConstant(1, DL, VT));
- assert (VT == MVT::i1 && "Unexpected type for SECCC node");
- return DAG.getNode(ISD::TRUNCATE, DL, MVT::i1,
- DAG.getNode(X86ISD::SETCC_CARRY, DL, MVT::i8,
- DAG.getConstant(X86::COND_B, DL, MVT::i8),
- EFLAGS));
-}
-
// Optimize RES = X86ISD::SETCC CONDCODE, EFLAG_INPUT
static SDValue combineX86SetCC(SDNode *N, SelectionDAG &DAG,
- TargetLowering::DAGCombinerInfo &DCI,
const X86Subtarget &Subtarget) {
SDLoc DL(N);
X86::CondCode CC = X86::CondCode(N->getConstantOperandVal(0));
SDValue EFLAGS = N->getOperand(1);
- if (CC == X86::COND_A) {
- // Try to convert COND_A into COND_B in an attempt to facilitate
- // materializing "setb reg".
- //
- // Do not flip "e > c", where "c" is a constant, because Cmp instruction
- // cannot take an immediate as its first operand.
- //
- if (EFLAGS.getOpcode() == X86ISD::SUB && EFLAGS.hasOneUse() &&
- EFLAGS.getValueType().isInteger() &&
- !isa<ConstantSDNode>(EFLAGS.getOperand(1))) {
- SDValue NewSub = DAG.getNode(X86ISD::SUB, SDLoc(EFLAGS),
- EFLAGS.getNode()->getVTList(),
- EFLAGS.getOperand(1), EFLAGS.getOperand(0));
- SDValue NewEFLAGS = SDValue(NewSub.getNode(), EFLAGS.getResNo());
- return MaterializeSETB(DL, NewEFLAGS, DAG, N->getSimpleValueType(0));
- }
- }
-
- // Materialize "setb reg" as "sbb reg,reg", since it can be extended without
- // a zext and produces an all-ones bit which is more useful than 0/1 in some
- // cases.
- if (CC == X86::COND_B)
- return MaterializeSETB(DL, EFLAGS, DAG, N->getSimpleValueType(0));
-
// Try to simplify the EFLAGS and condition code operands.
if (SDValue Flags = combineSetCCEFLAGS(EFLAGS, CC, DAG))
return getSETCC(CC, Flags, DL, DAG);
@@ -33352,7 +34325,6 @@ static SDValue combineX86SetCC(SDNode *N, SelectionDAG &DAG,
/// Optimize branch condition evaluation.
static SDValue combineBrCond(SDNode *N, SelectionDAG &DAG,
- TargetLowering::DAGCombinerInfo &DCI,
const X86Subtarget &Subtarget) {
SDLoc DL(N);
SDValue EFLAGS = N->getOperand(3);
@@ -33538,45 +34510,159 @@ static SDValue combineADC(SDNode *N, SelectionDAG &DAG,
return SDValue();
}
-/// fold (add Y, (sete X, 0)) -> adc 0, Y
-/// (add Y, (setne X, 0)) -> sbb -1, Y
-/// (sub (sete X, 0), Y) -> sbb 0, Y
-/// (sub (setne X, 0), Y) -> adc -1, Y
-static SDValue OptimizeConditionalInDecrement(SDNode *N, SelectionDAG &DAG) {
+/// Materialize "setb reg" as "sbb reg,reg", since it produces an all-ones bit
+/// which is more useful than 0/1 in some cases.
+static SDValue materializeSBB(SDNode *N, SDValue EFLAGS, SelectionDAG &DAG) {
SDLoc DL(N);
+ // "Condition code B" is also known as "the carry flag" (CF).
+ SDValue CF = DAG.getConstant(X86::COND_B, DL, MVT::i8);
+ SDValue SBB = DAG.getNode(X86ISD::SETCC_CARRY, DL, MVT::i8, CF, EFLAGS);
+ MVT VT = N->getSimpleValueType(0);
+ if (VT == MVT::i8)
+ return DAG.getNode(ISD::AND, DL, VT, SBB, DAG.getConstant(1, DL, VT));
- // Look through ZExts.
- SDValue Ext = N->getOperand(N->getOpcode() == ISD::SUB ? 1 : 0);
- if (Ext.getOpcode() != ISD::ZERO_EXTEND || !Ext.hasOneUse())
- return SDValue();
+ assert(VT == MVT::i1 && "Unexpected type for SETCC node");
+ return DAG.getNode(ISD::TRUNCATE, DL, MVT::i1, SBB);
+}
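In C terms, the value materializeSBB produces is -(unsigned)Cond: zero or an all-ones mask, which is often more useful than a 0/1 byte. A standalone sketch:

#include <cassert>
#include <cstdint>

int main() {
  uint32_t A = 1, B = 2;
  uint32_t Mask = -(uint32_t)(A < B); // what "sbb reg,reg" leaves after a cmp
  assert(Mask == 0xFFFFFFFFu);
  return 0;
}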
+
+/// If this is an add or subtract where one operand is produced by a cmp+setcc,
+/// then try to convert it to an ADC or SBB. This replaces TEST+SET+{ADD/SUB}
+/// with CMP+{ADC, SBB}.
+static SDValue combineAddOrSubToADCOrSBB(SDNode *N, SelectionDAG &DAG) {
+ bool IsSub = N->getOpcode() == ISD::SUB;
+ SDValue X = N->getOperand(0);
+ SDValue Y = N->getOperand(1);
+
+ // If this is an add, canonicalize a zext operand to the RHS.
+ // TODO: Incomplete? What if both sides are zexts?
+ if (!IsSub && X.getOpcode() == ISD::ZERO_EXTEND &&
+ Y.getOpcode() != ISD::ZERO_EXTEND)
+ std::swap(X, Y);
+
+ // Look through a one-use zext.
+ bool PeekedThroughZext = false;
+ if (Y.getOpcode() == ISD::ZERO_EXTEND && Y.hasOneUse()) {
+ Y = Y.getOperand(0);
+ PeekedThroughZext = true;
+ }
- SDValue SetCC = Ext.getOperand(0);
- if (SetCC.getOpcode() != X86ISD::SETCC || !SetCC.hasOneUse())
+ // If this is an add, canonicalize a setcc operand to the RHS.
+ // TODO: Incomplete? What if both sides are setcc?
+ // TODO: Should we allow peeking through a zext of the other operand?
+ if (!IsSub && !PeekedThroughZext && X.getOpcode() == X86ISD::SETCC &&
+ Y.getOpcode() != X86ISD::SETCC)
+ std::swap(X, Y);
+
+ if (Y.getOpcode() != X86ISD::SETCC || !Y.hasOneUse())
return SDValue();
- X86::CondCode CC = (X86::CondCode)SetCC.getConstantOperandVal(0);
+ SDLoc DL(N);
+ EVT VT = N->getValueType(0);
+ X86::CondCode CC = (X86::CondCode)Y.getConstantOperandVal(0);
+
+ if (CC == X86::COND_B) {
+ // X + SETB Z --> X + (mask SBB Z, Z)
+ // X - SETB Z --> X - (mask SBB Z, Z)
+ // TODO: Produce ADC/SBB here directly and avoid SETCC_CARRY?
+ SDValue SBB = materializeSBB(Y.getNode(), Y.getOperand(1), DAG);
+ if (SBB.getValueSizeInBits() != VT.getSizeInBits())
+ SBB = DAG.getZExtOrTrunc(SBB, DL, VT);
+ return DAG.getNode(IsSub ? ISD::SUB : ISD::ADD, DL, VT, X, SBB);
+ }
+
+ if (CC == X86::COND_A) {
+ SDValue EFLAGS = Y->getOperand(1);
+ // Try to convert COND_A into COND_B in an attempt to facilitate
+ // materializing "setb reg".
+ //
+ // Do not flip "e > c", where "c" is a constant, because Cmp instruction
+ // cannot take an immediate as its first operand.
+ //
+ if (EFLAGS.getOpcode() == X86ISD::SUB && EFLAGS.hasOneUse() &&
+ EFLAGS.getValueType().isInteger() &&
+ !isa<ConstantSDNode>(EFLAGS.getOperand(1))) {
+ SDValue NewSub = DAG.getNode(X86ISD::SUB, SDLoc(EFLAGS),
+ EFLAGS.getNode()->getVTList(),
+ EFLAGS.getOperand(1), EFLAGS.getOperand(0));
+ SDValue NewEFLAGS = SDValue(NewSub.getNode(), EFLAGS.getResNo());
+ SDValue SBB = materializeSBB(Y.getNode(), NewEFLAGS, DAG);
+ if (SBB.getValueSizeInBits() != VT.getSizeInBits())
+ SBB = DAG.getZExtOrTrunc(SBB, DL, VT);
+ return DAG.getNode(IsSub ? ISD::SUB : ISD::ADD, DL, VT, X, SBB);
+ }
+ }
+
if (CC != X86::COND_E && CC != X86::COND_NE)
return SDValue();
- SDValue Cmp = SetCC.getOperand(1);
+ SDValue Cmp = Y.getOperand(1);
if (Cmp.getOpcode() != X86ISD::CMP || !Cmp.hasOneUse() ||
!X86::isZeroNode(Cmp.getOperand(1)) ||
!Cmp.getOperand(0).getValueType().isInteger())
return SDValue();
- SDValue CmpOp0 = Cmp.getOperand(0);
- SDValue NewCmp = DAG.getNode(X86ISD::CMP, DL, MVT::i32, CmpOp0,
- DAG.getConstant(1, DL, CmpOp0.getValueType()));
+ // (cmp Z, 1) sets the carry flag if Z is 0.
+ SDValue Z = Cmp.getOperand(0);
+ SDValue NewCmp = DAG.getNode(X86ISD::CMP, DL, MVT::i32, Z,
+ DAG.getConstant(1, DL, Z.getValueType()));
+
+ SDVTList VTs = DAG.getVTList(N->getValueType(0), MVT::i32);
- SDValue OtherVal = N->getOperand(N->getOpcode() == ISD::SUB ? 0 : 1);
+ // X - (Z != 0) --> sub X, (zext(setne Z, 0)) --> adc X, -1, (cmp Z, 1)
+ // X + (Z != 0) --> add X, (zext(setne Z, 0)) --> sbb X, -1, (cmp Z, 1)
if (CC == X86::COND_NE)
- return DAG.getNode(N->getOpcode() == ISD::SUB ? X86ISD::ADC : X86ISD::SBB,
- DL, OtherVal.getValueType(), OtherVal,
- DAG.getConstant(-1ULL, DL, OtherVal.getValueType()),
- NewCmp);
- return DAG.getNode(N->getOpcode() == ISD::SUB ? X86ISD::SBB : X86ISD::ADC,
- DL, OtherVal.getValueType(), OtherVal,
- DAG.getConstant(0, DL, OtherVal.getValueType()), NewCmp);
+ return DAG.getNode(IsSub ? X86ISD::ADC : X86ISD::SBB, DL, VTs, X,
+ DAG.getConstant(-1ULL, DL, VT), NewCmp);
+
+ // X - (Z == 0) --> sub X, (zext(sete Z, 0)) --> sbb X, 0, (cmp Z, 1)
+ // X + (Z == 0) --> add X, (zext(sete Z, 0)) --> adc X, 0, (cmp Z, 1)
+ return DAG.getNode(IsSub ? X86ISD::SBB : X86ISD::ADC, DL, VTs, X,
+ DAG.getConstant(0, DL, VT), NewCmp);
+}
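The arithmetic behind the final two rewrites: for unsigned Z, "cmp Z, 1" sets the carry flag exactly when Z == 0, and ADC/SBB fold that flag into the add or subtract. A standalone check of both identities (illustrative only):

#include <cassert>
#include <cstdint>

int main() {
  for (uint32_t Z : {0u, 7u})
    for (uint32_t X : {0u, 5u}) {
      uint32_t CF = Z < 1;                // carry out of "cmp Z, 1"
      assert(X + (Z != 0) == X + 1 - CF); // sbb X, -1 --> X - (-1) - CF
      assert(X + (Z == 0) == X + 0 + CF); // adc X, 0  --> X + 0 + CF
    }
  return 0;
}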
+
+static SDValue combineLoopMAddPattern(SDNode *N, SelectionDAG &DAG,
+ const X86Subtarget &Subtarget) {
+ SDValue MulOp = N->getOperand(0);
+ SDValue Phi = N->getOperand(1);
+
+ if (MulOp.getOpcode() != ISD::MUL)
+ std::swap(MulOp, Phi);
+ if (MulOp.getOpcode() != ISD::MUL)
+ return SDValue();
+
+ ShrinkMode Mode;
+ if (!canReduceVMulWidth(MulOp.getNode(), DAG, Mode))
+ return SDValue();
+
+ EVT VT = N->getValueType(0);
+
+ unsigned RegSize = 128;
+ if (Subtarget.hasBWI())
+ RegSize = 512;
+ else if (Subtarget.hasAVX2())
+ RegSize = 256;
+ unsigned VectorSize = VT.getVectorNumElements() * 16;
+ // If the vector size is less than 128 bits or greater than the supported
+ // RegSize, do not use PMADD.
+ if (VectorSize < 128 || VectorSize > RegSize)
+ return SDValue();
+
+ SDLoc DL(N);
+ EVT ReducedVT = EVT::getVectorVT(*DAG.getContext(), MVT::i16,
+ VT.getVectorNumElements());
+ EVT MAddVT = EVT::getVectorVT(*DAG.getContext(), MVT::i32,
+ VT.getVectorNumElements() / 2);
+
+ // Shrink the operands of mul.
+ SDValue N0 = DAG.getNode(ISD::TRUNCATE, DL, ReducedVT, MulOp->getOperand(0));
+ SDValue N1 = DAG.getNode(ISD::TRUNCATE, DL, ReducedVT, MulOp->getOperand(1));
+
+ // The madd vector size is half of the original vector size.
+ SDValue Madd = DAG.getNode(X86ISD::VPMADDWD, DL, MAddVT, N0, N1);
+ // Fill the rest of the output with 0
+ SDValue Zero = getZeroVector(Madd.getSimpleValueType(), Subtarget, DAG, DL);
+ SDValue Concat = DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, Madd, Zero);
+ return DAG.getNode(ISD::ADD, DL, VT, Concat, Phi);
}
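VPMADDWD multiplies signed 16-bit lanes and adds adjacent products into 32-bit lanes, which is why the mul operands are shrunk to i16 and the madd result is half as wide. The corresponding SSE2 intrinsic, in a standalone sketch:

#include <cassert>
#include <cstdint>
#include <emmintrin.h>

int main() {
  __m128i A = _mm_set1_epi16(3), B = _mm_set1_epi16(4);
  int32_t Out[4];
  _mm_storeu_si128((__m128i *)Out, _mm_madd_epi16(A, B)); // each lane: 3*4 + 3*4
  assert(Out[0] == 24);
  return 0;
}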
static SDValue combineLoopSADPattern(SDNode *N, SelectionDAG &DAG,
@@ -33656,6 +34742,8 @@ static SDValue combineAdd(SDNode *N, SelectionDAG &DAG,
if (Flags->hasVectorReduction()) {
if (SDValue Sad = combineLoopSADPattern(N, DAG, Subtarget))
return Sad;
+ if (SDValue MAdd = combineLoopMAddPattern(N, DAG, Subtarget))
+ return MAdd;
}
EVT VT = N->getValueType(0);
SDValue Op0 = N->getOperand(0);
@@ -33667,7 +34755,7 @@ static SDValue combineAdd(SDNode *N, SelectionDAG &DAG,
isHorizontalBinOp(Op0, Op1, true))
return DAG.getNode(X86ISD::HADD, SDLoc(N), VT, Op0, Op1);
- return OptimizeConditionalInDecrement(N, DAG);
+ return combineAddOrSubToADCOrSBB(N, DAG);
}
static SDValue combineSub(SDNode *N, SelectionDAG &DAG,
@@ -33700,36 +34788,44 @@ static SDValue combineSub(SDNode *N, SelectionDAG &DAG,
isHorizontalBinOp(Op0, Op1, false))
return DAG.getNode(X86ISD::HSUB, SDLoc(N), VT, Op0, Op1);
- return OptimizeConditionalInDecrement(N, DAG);
+ return combineAddOrSubToADCOrSBB(N, DAG);
}
static SDValue combineVSZext(SDNode *N, SelectionDAG &DAG,
TargetLowering::DAGCombinerInfo &DCI,
const X86Subtarget &Subtarget) {
+ if (DCI.isBeforeLegalize())
+ return SDValue();
+
SDLoc DL(N);
unsigned Opcode = N->getOpcode();
MVT VT = N->getSimpleValueType(0);
MVT SVT = VT.getVectorElementType();
+ unsigned NumElts = VT.getVectorNumElements();
+ unsigned EltSizeInBits = SVT.getSizeInBits();
+
SDValue Op = N->getOperand(0);
MVT OpVT = Op.getSimpleValueType();
MVT OpEltVT = OpVT.getVectorElementType();
- unsigned InputBits = OpEltVT.getSizeInBits() * VT.getVectorNumElements();
+ unsigned OpEltSizeInBits = OpEltVT.getSizeInBits();
+ unsigned InputBits = OpEltSizeInBits * NumElts;
// Perform any constant folding.
// FIXME: Reduce constant pool usage and don't fold when OptSize is enabled.
- if (ISD::isBuildVectorOfConstantSDNodes(Op.getNode())) {
- unsigned NumDstElts = VT.getVectorNumElements();
- SmallBitVector Undefs(NumDstElts, false);
- SmallVector<APInt, 4> Vals(NumDstElts, APInt(SVT.getSizeInBits(), 0));
- for (unsigned i = 0; i != NumDstElts; ++i) {
- SDValue OpElt = Op.getOperand(i);
- if (OpElt.getOpcode() == ISD::UNDEF) {
- Undefs[i] = true;
+ APInt UndefElts;
+ SmallVector<APInt, 64> EltBits;
+ if (getTargetConstantBitsFromNode(Op, OpEltSizeInBits, UndefElts, EltBits)) {
+ APInt Undefs(NumElts, 0);
+ SmallVector<APInt, 4> Vals(NumElts, APInt(EltSizeInBits, 0));
+ bool IsZEXT =
+ (Opcode == X86ISD::VZEXT) || (Opcode == ISD::ZERO_EXTEND_VECTOR_INREG);
+ for (unsigned i = 0; i != NumElts; ++i) {
+ if (UndefElts[i]) {
+ Undefs.setBit(i);
continue;
}
- APInt Cst = cast<ConstantSDNode>(OpElt.getNode())->getAPIntValue();
- Vals[i] = Opcode == X86ISD::VZEXT ? Cst.zextOrTrunc(SVT.getSizeInBits())
- : Cst.sextOrTrunc(SVT.getSizeInBits());
+ Vals[i] = IsZEXT ? EltBits[i].zextOrTrunc(EltSizeInBits)
+ : EltBits[i].sextOrTrunc(EltSizeInBits);
}
return getConstVector(Vals, Undefs, VT, DAG, DL);
}
@@ -33829,7 +34925,7 @@ static SDValue combineVectorCompare(SDNode *N, SelectionDAG &DAG,
if (N->getOperand(0) == N->getOperand(1)) {
if (N->getOpcode() == X86ISD::PCMPEQ)
- return getOnesVector(VT, Subtarget, DAG, DL);
+ return getOnesVector(VT, DAG, DL);
if (N->getOpcode() == X86ISD::PCMPGT)
return getZeroVector(VT, Subtarget, DAG, DL);
}
@@ -33837,6 +34933,98 @@ static SDValue combineVectorCompare(SDNode *N, SelectionDAG &DAG,
return SDValue();
}
+static SDValue combineInsertSubvector(SDNode *N, SelectionDAG &DAG,
+ TargetLowering::DAGCombinerInfo &DCI,
+ const X86Subtarget &Subtarget) {
+ if (DCI.isBeforeLegalizeOps())
+ return SDValue();
+
+ SDLoc dl(N);
+ SDValue Vec = N->getOperand(0);
+ SDValue SubVec = N->getOperand(1);
+ SDValue Idx = N->getOperand(2);
+
+ unsigned IdxVal = cast<ConstantSDNode>(Idx)->getZExtValue();
+ MVT OpVT = N->getSimpleValueType(0);
+ MVT SubVecVT = SubVec.getSimpleValueType();
+
+ // If this is an insert of an extract, combine to a shuffle. Don't do this
+ // if the insert or extract can be represented with a subvector operation.
+ if (SubVec.getOpcode() == ISD::EXTRACT_SUBVECTOR &&
+ SubVec.getOperand(0).getSimpleValueType() == OpVT &&
+ (IdxVal != 0 || !Vec.isUndef())) {
+ int ExtIdxVal = cast<ConstantSDNode>(SubVec.getOperand(1))->getZExtValue();
+ if (ExtIdxVal != 0) {
+ int VecNumElts = OpVT.getVectorNumElements();
+ int SubVecNumElts = SubVecVT.getVectorNumElements();
+ SmallVector<int, 64> Mask(VecNumElts);
+ // First create an identity shuffle mask.
+ for (int i = 0; i != VecNumElts; ++i)
+ Mask[i] = i;
+ // Now insert the extracted portion.
+ for (int i = 0; i != SubVecNumElts; ++i)
+ Mask[i + IdxVal] = i + ExtIdxVal + VecNumElts;
+
+ return DAG.getVectorShuffle(OpVT, dl, Vec, SubVec.getOperand(0), Mask);
+ }
+ }
+
+ // Fold two 16-byte or 32-byte subvector loads into one 32-byte or 64-byte
+ // load:
+ // (insert_subvector (insert_subvector undef, (load16 addr), 0),
+ // (load16 addr + 16), Elts/2)
+ // --> load32 addr
+ // or:
+ // (insert_subvector (insert_subvector undef, (load32 addr), 0),
+ // (load32 addr + 32), Elts/2)
+ // --> load64 addr
+ // or a 16-byte or 32-byte broadcast:
+ // (insert_subvector (insert_subvector undef, (load16 addr), 0),
+ // (load16 addr), Elts/2)
+ // --> X86SubVBroadcast(load16 addr)
+ // or:
+ // (insert_subvector (insert_subvector undef, (load32 addr), 0),
+ // (load32 addr), Elts/2)
+ // --> X86SubVBroadcast(load32 addr)
+ if ((IdxVal == OpVT.getVectorNumElements() / 2) &&
+ Vec.getOpcode() == ISD::INSERT_SUBVECTOR &&
+ OpVT.getSizeInBits() == SubVecVT.getSizeInBits() * 2) {
+ auto *Idx2 = dyn_cast<ConstantSDNode>(Vec.getOperand(2));
+ if (Idx2 && Idx2->getZExtValue() == 0) {
+ SDValue SubVec2 = Vec.getOperand(1);
+ // If needed, look through bitcasts to get to the load.
+ if (auto *FirstLd = dyn_cast<LoadSDNode>(peekThroughBitcasts(SubVec2))) {
+ bool Fast;
+ unsigned Alignment = FirstLd->getAlignment();
+ unsigned AS = FirstLd->getAddressSpace();
+ const X86TargetLowering *TLI = Subtarget.getTargetLowering();
+ if (TLI->allowsMemoryAccess(*DAG.getContext(), DAG.getDataLayout(),
+ OpVT, AS, Alignment, &Fast) && Fast) {
+ SDValue Ops[] = {SubVec2, SubVec};
+ if (SDValue Ld = EltsFromConsecutiveLoads(OpVT, Ops, dl, DAG, false))
+ return Ld;
+ }
+ }
+ // If the lower and upper loads are the same and they are the only users of
+ // the load, then lower to a VBROADCASTF128/VBROADCASTI128/etc.
+ if (auto *Ld = dyn_cast<LoadSDNode>(peekThroughOneUseBitcasts(SubVec2))) {
+ if (SubVec2 == SubVec && ISD::isNormalLoad(Ld) &&
+ SDNode::areOnlyUsersOf({N, Vec.getNode()}, SubVec2.getNode())) {
+ return DAG.getNode(X86ISD::SUBV_BROADCAST, dl, OpVT, SubVec);
+ }
+ }
+ // If this is a subv_broadcast inserted into both halves, use a larger
+ // subv_broadcast.
+ if (SubVec.getOpcode() == X86ISD::SUBV_BROADCAST && SubVec == SubVec2) {
+ return DAG.getNode(X86ISD::SUBV_BROADCAST, dl, OpVT,
+ SubVec.getOperand(0));
+ }
+ }
+ }
+
+ return SDValue();
+}
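The shuffle-mask construction in the insert-of-extract case can be checked in isolation: start from the identity permutation, then point the insertion window at the extracted elements of the second operand (indices offset by VecNumElts). A standalone sketch with assumed sizes:

#include <cassert>
#include <vector>

int main() {
  const int VecNumElts = 8, SubVecNumElts = 2, IdxVal = 4, ExtIdxVal = 2;
  std::vector<int> Mask(VecNumElts);
  for (int i = 0; i != VecNumElts; ++i)
    Mask[i] = i;                                   // identity shuffle
  for (int i = 0; i != SubVecNumElts; ++i)
    Mask[i + IdxVal] = i + ExtIdxVal + VecNumElts; // elements of operand 2
  assert(Mask[4] == 10 && Mask[5] == 11 && Mask[0] == 0);
  return 0;
}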
+
SDValue X86TargetLowering::PerformDAGCombine(SDNode *N,
DAGCombinerInfo &DCI) const {
@@ -33845,6 +35033,11 @@ SDValue X86TargetLowering::PerformDAGCombine(SDNode *N,
default: break;
case ISD::EXTRACT_VECTOR_ELT:
return combineExtractVectorElt(N, DAG, DCI, Subtarget);
+ case X86ISD::PEXTRW:
+ case X86ISD::PEXTRB:
+ return combineExtractVectorElt_SSE(N, DAG, DCI, Subtarget);
+ case ISD::INSERT_SUBVECTOR:
+ return combineInsertSubvector(N, DAG, DCI, Subtarget);
case ISD::VSELECT:
case ISD::SELECT:
case X86ISD::SHRUNKBLEND: return combineSelect(N, DAG, DCI, Subtarget);
@@ -33870,6 +35063,7 @@ SDValue X86TargetLowering::PerformDAGCombine(SDNode *N,
case ISD::FSUB: return combineFaddFsub(N, DAG, Subtarget);
case ISD::FNEG: return combineFneg(N, DAG, Subtarget);
case ISD::TRUNCATE: return combineTruncate(N, DAG, Subtarget);
+ case X86ISD::ANDNP: return combineAndnp(N, DAG, DCI, Subtarget);
case X86ISD::FAND: return combineFAnd(N, DAG, Subtarget);
case X86ISD::FANDN: return combineFAndn(N, DAG, Subtarget);
case X86ISD::FXOR:
@@ -33884,12 +35078,18 @@ SDValue X86TargetLowering::PerformDAGCombine(SDNode *N,
case ISD::SIGN_EXTEND: return combineSext(N, DAG, DCI, Subtarget);
case ISD::SIGN_EXTEND_INREG: return combineSignExtendInReg(N, DAG, Subtarget);
case ISD::SETCC: return combineSetCC(N, DAG, Subtarget);
- case X86ISD::SETCC: return combineX86SetCC(N, DAG, DCI, Subtarget);
- case X86ISD::BRCOND: return combineBrCond(N, DAG, DCI, Subtarget);
+ case X86ISD::SETCC: return combineX86SetCC(N, DAG, Subtarget);
+ case X86ISD::BRCOND: return combineBrCond(N, DAG, Subtarget);
case X86ISD::VSHLI:
- case X86ISD::VSRLI: return combineVectorShift(N, DAG, DCI, Subtarget);
+ case X86ISD::VSRAI:
+ case X86ISD::VSRLI:
+ return combineVectorShiftImm(N, DAG, DCI, Subtarget);
+ case ISD::SIGN_EXTEND_VECTOR_INREG:
+ case ISD::ZERO_EXTEND_VECTOR_INREG:
case X86ISD::VSEXT:
case X86ISD::VZEXT: return combineVSZext(N, DAG, DCI, Subtarget);
+ case X86ISD::PINSRB:
+ case X86ISD::PINSRW: return combineVectorInsert(N, DAG, DCI, Subtarget);
case X86ISD::SHUFP: // Handle all target specific shuffles
case X86ISD::INSERTPS:
case X86ISD::PALIGNR:
@@ -34717,10 +35917,20 @@ X86TargetLowering::getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI,
return Res;
}
- // 'A' means EAX + EDX.
+ // 'A' means [ER]AX + [ER]DX.
if (Constraint == "A") {
- Res.first = X86::EAX;
- Res.second = &X86::GR32_ADRegClass;
+ if (Subtarget.is64Bit()) {
+ Res.first = X86::RAX;
+ Res.second = &X86::GR64_ADRegClass;
+ } else if (Subtarget.is32Bit()) {
+ Res.first = X86::EAX;
+ Res.second = &X86::GR32_ADRegClass;
+ } else if (Subtarget.is16Bit()) {
+ Res.first = X86::AX;
+ Res.second = &X86::GR16_ADRegClass;
+ } else {
+ llvm_unreachable("Expecting 64, 32 or 16 bit subtarget");
+ }
return Res;
}
return Res;
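The "A" constraint names the AX/DX register pair; the classic use is 32-bit code reading the time-stamp counter, where "=A" binds a 64-bit value to EDX:EAX. A GCC/Clang sketch that is valid on 32-bit x86 only (on x86-64 the constraint's pair semantics differ, which is what the change above addresses):

#include <cstdint>

// i386-only: "=A" ties the 64-bit result to the EDX:EAX pair.
static inline uint64_t ReadTSC32() {
  uint64_t V;
  asm volatile("rdtsc" : "=A"(V));
  return V;
}

int main() {
  (void)ReadTSC32();
  return 0;
}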
@@ -34812,7 +36022,7 @@ int X86TargetLowering::getScalingFactorCost(const DataLayout &DL,
return -1;
}
-bool X86TargetLowering::isIntDivCheap(EVT VT, AttributeSet Attr) const {
+bool X86TargetLowering::isIntDivCheap(EVT VT, AttributeList Attr) const {
// Integer division on x86 is expensive. However, when aggressively optimizing
// for code size, we prefer to use a div instruction, as it is usually smaller
// than the alternative sequence.
@@ -34820,8 +36030,8 @@ bool X86TargetLowering::isIntDivCheap(EVT VT, AttributeSet Attr) const {
// integer division, leaving the division as-is is a loss even in terms of
// size, because it will have to be scalarized, while the alternative code
// sequence can be performed in vector form.
- bool OptSize = Attr.hasAttribute(AttributeSet::FunctionIndex,
- Attribute::MinSize);
+ bool OptSize =
+ Attr.hasAttribute(AttributeList::FunctionIndex, Attribute::MinSize);
return OptSize && !VT.isVector();
}
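The alternative sequence the comment refers to is the usual strength reduction of division by a constant into a widening multiply and shift: faster than div, but larger. For example, x/10 for 32-bit unsigned x, checked in a standalone sketch of the well-known magic-number identity:

#include <cassert>
#include <cstdint>

int main() {
  for (uint32_t X : {0u, 9u, 10u, 123456789u, 4294967295u})
    assert(X / 10 == (uint32_t)(((uint64_t)X * 0xCCCCCCCDull) >> 35));
  return 0;
}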
diff --git a/lib/Target/X86/X86ISelLowering.h b/lib/Target/X86/X86ISelLowering.h
index 37f9353042b1..ab4910daca02 100644
--- a/lib/Target/X86/X86ISelLowering.h
+++ b/lib/Target/X86/X86ISelLowering.h
@@ -149,8 +149,7 @@ namespace llvm {
WrapperRIP,
/// Copies a 64-bit value from the low word of an XMM vector
- /// to an MMX vector. If you think this is too close to the previous
- /// mnemonic, so do I; blame Intel.
+ /// to an MMX vector.
MOVDQ2Q,
/// Copies a 32-bit value from the low word of an MMX
@@ -179,7 +178,7 @@ namespace llvm {
/// Insert the lower 16-bits of a 32-bit value to a vector,
/// corresponds to X86::PINSRW.
- PINSRW, MMX_PINSRW,
+ PINSRW,
/// Shuffle 16 8-bit values within a vector.
PSHUFB,
@@ -195,21 +194,21 @@ namespace llvm {
/// Blend where the selector is an immediate.
BLENDI,
- /// Blend where the condition has been shrunk.
- /// This is used to emphasize that the condition mask is
- /// no more valid for generic VSELECT optimizations.
+ /// Dynamic (non-constant condition) vector blend where only the sign bits
+ /// of the condition elements are used. This is used to enforce that the
+ /// condition mask is not valid for generic VSELECT optimizations.
SHRUNKBLEND,
/// Combined add and sub on an FP vector.
ADDSUB,
// FP vector ops with rounding mode.
- FADD_RND,
- FSUB_RND,
- FMUL_RND,
- FDIV_RND,
- FMAX_RND,
- FMIN_RND,
+ FADD_RND, FADDS_RND,
+ FSUB_RND, FSUBS_RND,
+ FMUL_RND, FMULS_RND,
+ FDIV_RND, FDIVS_RND,
+ FMAX_RND, FMAXS_RND,
+ FMIN_RND, FMINS_RND,
FSQRT_RND, FSQRTS_RND,
// FP vector get exponent.
@@ -239,9 +238,6 @@ namespace llvm {
FHADD,
FHSUB,
- // Integer absolute value
- ABS,
-
// Detect Conflicts Within a Vector
CONFLICT,
@@ -251,6 +247,9 @@ namespace llvm {
/// Commutative FMIN and FMAX.
FMAXC, FMINC,
+ /// Scalar intrinsic floating point max and min.
+ FMAXS, FMINS,
+
/// Floating point reciprocal-sqrt and reciprocal approximation.
/// Note that these typically require refinement
/// in order to obtain suitable precision.
@@ -320,6 +319,9 @@ namespace llvm {
// Vector shift elements by immediate
VSHLI, VSRLI, VSRAI,
+ // Shifts of mask registers.
+ KSHIFTL, KSHIFTR,
+
// Bit rotate by immediate
VROTLI, VROTRI,
@@ -443,8 +445,7 @@ namespace llvm {
// Broadcast subvector to vector.
SUBV_BROADCAST,
- // Insert/Extract vector element.
- VINSERT,
+ // Extract vector element.
VEXTRACT,
/// SSE4A Extraction and Insertion.
@@ -686,6 +687,9 @@ namespace llvm {
unsigned getJumpTableEncoding() const override;
bool useSoftFloat() const override;
+ void markLibCallAttributes(MachineFunction *MF, unsigned CC,
+ ArgListTy &Args) const override;
+
MVT getScalarShiftAmountTy(const DataLayout &, EVT) const override {
return MVT::i8;
}
@@ -806,8 +810,17 @@ namespace llvm {
return false;
}
+ bool isMaskAndCmp0FoldingBeneficial(const Instruction &AndI) const override;
+
bool hasAndNotCompare(SDValue Y) const override;
+ bool convertSetCCLogicToBitwiseLogic(EVT VT) const override {
+ return VT.isScalarInteger();
+ }
+
+ /// Vector-sized comparisons are fast using PCMPEQ + PMOVMSK or PTEST.
+ MVT hasFastEqualityCompare(unsigned NumBits) const override;
+
/// Return the value type to use for ISD::SETCC.
EVT getSetCCResultType(const DataLayout &DL, LLVMContext &Context,
EVT VT) const override;
@@ -817,11 +830,13 @@ namespace llvm {
void computeKnownBitsForTargetNode(const SDValue Op,
APInt &KnownZero,
APInt &KnownOne,
+ const APInt &DemandedElts,
const SelectionDAG &DAG,
unsigned Depth = 0) const override;
/// Determine the number of bits in the operation that are sign bits.
unsigned ComputeNumSignBitsForTargetNode(SDValue Op,
+ const APInt &DemandedElts,
const SelectionDAG &DAG,
unsigned Depth) const override;
@@ -984,6 +999,10 @@ namespace llvm {
bool shouldConvertConstantLoadToIntImm(const APInt &Imm,
Type *Ty) const override;
+ bool convertSelectOfConstantsToMath() const override {
+ return true;
+ }
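Returning true here lets the generic combiner turn a select between two constants into arithmetic on the condition bit, as in this standalone illustration:

#include <cassert>

int main() {
  for (bool C : {false, true})
    assert((C ? 3 : 4) == 4 - (int)C); // select C, 3, 4 --> sub 4, zext(C)
  return 0;
}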
+
/// Return true if EXTRACT_SUBVECTOR is cheap for this result type
/// with this index.
bool isExtractSubvectorCheap(EVT ResVT, unsigned Index) const override;
@@ -1035,7 +1054,7 @@ namespace llvm {
/// \brief Customize the preferred legalization strategy for certain types.
LegalizeTypeAction getPreferredVectorAction(EVT VT) const override;
- bool isIntDivCheap(EVT VT, AttributeSet Attr) const override;
+ bool isIntDivCheap(EVT VT, AttributeList Attr) const override;
bool supportSwiftError() const override;
@@ -1076,7 +1095,8 @@ namespace llvm {
CallingConv::ID CallConv, bool isVarArg,
const SmallVectorImpl<ISD::InputArg> &Ins,
const SDLoc &dl, SelectionDAG &DAG,
- SmallVectorImpl<SDValue> &InVals) const;
+ SmallVectorImpl<SDValue> &InVals,
+ uint32_t *RegMask) const;
SDValue LowerMemArgument(SDValue Chain, CallingConv::ID CallConv,
const SmallVectorImpl<ISD::InputArg> &ArgInfo,
const SDLoc &dl, SelectionDAG &DAG,
@@ -1138,8 +1158,7 @@ namespace llvm {
SDValue LowerUINT_TO_FP_i32(SDValue Op, SelectionDAG &DAG) const;
SDValue lowerUINT_TO_FP_vec(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerTRUNCATE(SDValue Op, SelectionDAG &DAG) const;
- SDValue LowerFP_TO_INT(SDValue Op, const X86Subtarget &Subtarget,
- SelectionDAG &DAG) const;
+ SDValue LowerFP_TO_INT(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerToBT(SDValue And, ISD::CondCode CC, const SDLoc &dl,
SelectionDAG &DAG) const;
SDValue LowerSETCC(SDValue Op, SelectionDAG &DAG) const;
diff --git a/lib/Target/X86/X86Instr3DNow.td b/lib/Target/X86/X86Instr3DNow.td
index ba1aede3c1a0..08b501ff20bf 100644
--- a/lib/Target/X86/X86Instr3DNow.td
+++ b/lib/Target/X86/X86Instr3DNow.td
@@ -38,7 +38,9 @@ multiclass I3DNow_binop_rm<bits<8> opc, string Mn> {
def rm : I3DNow_binop<opc, MRMSrcMem, (ins VR64:$src1, i64mem:$src2), Mn, []>;
}
-multiclass I3DNow_binop_rm_int<bits<8> opc, string Mn, string Ver = ""> {
+multiclass I3DNow_binop_rm_int<bits<8> opc, string Mn, bit Commutable = 0,
+ string Ver = ""> {
+ let isCommutable = Commutable in
def rr : I3DNow_binop<opc, MRMSrcReg, (ins VR64:$src1, VR64:$src2), Mn,
[(set VR64:$dst, (!cast<Intrinsic>(
!strconcat("int_x86_3dnow", Ver, "_", Mn)) VR64:$src1, VR64:$src2))]>;
@@ -63,25 +65,25 @@ multiclass I3DNow_conv_rm_int<bits<8> opc, string Mn, string Ver = ""> {
(bitconvert (load_mmx addr:$src))))]>;
}
-defm PAVGUSB : I3DNow_binop_rm_int<0xBF, "pavgusb">;
+defm PAVGUSB : I3DNow_binop_rm_int<0xBF, "pavgusb", 1>;
defm PF2ID : I3DNow_conv_rm_int<0x1D, "pf2id">;
defm PFACC : I3DNow_binop_rm_int<0xAE, "pfacc">;
-defm PFADD : I3DNow_binop_rm_int<0x9E, "pfadd">;
-defm PFCMPEQ : I3DNow_binop_rm_int<0xB0, "pfcmpeq">;
+defm PFADD : I3DNow_binop_rm_int<0x9E, "pfadd", 1>;
+defm PFCMPEQ : I3DNow_binop_rm_int<0xB0, "pfcmpeq", 1>;
defm PFCMPGE : I3DNow_binop_rm_int<0x90, "pfcmpge">;
defm PFCMPGT : I3DNow_binop_rm_int<0xA0, "pfcmpgt">;
defm PFMAX : I3DNow_binop_rm_int<0xA4, "pfmax">;
defm PFMIN : I3DNow_binop_rm_int<0x94, "pfmin">;
-defm PFMUL : I3DNow_binop_rm_int<0xB4, "pfmul">;
+defm PFMUL : I3DNow_binop_rm_int<0xB4, "pfmul", 1>;
defm PFRCP : I3DNow_conv_rm_int<0x96, "pfrcp">;
defm PFRCPIT1 : I3DNow_binop_rm_int<0xA6, "pfrcpit1">;
defm PFRCPIT2 : I3DNow_binop_rm_int<0xB6, "pfrcpit2">;
defm PFRSQIT1 : I3DNow_binop_rm_int<0xA7, "pfrsqit1">;
defm PFRSQRT : I3DNow_conv_rm_int<0x97, "pfrsqrt">;
-defm PFSUB : I3DNow_binop_rm_int<0x9A, "pfsub">;
-defm PFSUBR : I3DNow_binop_rm_int<0xAA, "pfsubr">;
+defm PFSUB : I3DNow_binop_rm_int<0x9A, "pfsub", 1>;
+defm PFSUBR : I3DNow_binop_rm_int<0xAA, "pfsubr", 1>;
defm PI2FD : I3DNow_conv_rm_int<0x0D, "pi2fd">;
-defm PMULHRW : I3DNow_binop_rm_int<0xB7, "pmulhrw">;
+defm PMULHRW : I3DNow_binop_rm_int<0xB7, "pmulhrw", 1>;
def FEMMS : I3DNow<0x0E, RawFrm, (outs), (ins), "femms",
@@ -98,6 +100,6 @@ def PREFETCHW : I<0x0D, MRM1m, (outs), (ins i8mem:$addr), "prefetchw\t$addr",
// "3DNowA" instructions
defm PF2IW : I3DNow_conv_rm_int<0x1C, "pf2iw", "a">;
defm PI2FW : I3DNow_conv_rm_int<0x0C, "pi2fw", "a">;
-defm PFNACC : I3DNow_binop_rm_int<0x8A, "pfnacc", "a">;
-defm PFPNACC : I3DNow_binop_rm_int<0x8E, "pfpnacc", "a">;
+defm PFNACC : I3DNow_binop_rm_int<0x8A, "pfnacc", 0, "a">;
+defm PFPNACC : I3DNow_binop_rm_int<0x8E, "pfpnacc", 0, "a">;
defm PSWAPD : I3DNow_conv_rm_int<0xBB, "pswapd", "a">;
diff --git a/lib/Target/X86/X86InstrAVX512.td b/lib/Target/X86/X86InstrAVX512.td
index 230d1700b8d2..c38c13bb9757 100644
--- a/lib/Target/X86/X86InstrAVX512.td
+++ b/lib/Target/X86/X86InstrAVX512.td
@@ -34,13 +34,6 @@ class X86VectorVTInfo<int numelts, ValueType eltvt, RegisterClass rc,
ValueType KVT = !cast<ValueType>(!if (!eq (NumElts, 1), "i1",
"v" # NumElts # "i1"));
- // The GPR register class that can hold the write mask. Use GR8 for fewer
- // than 8 elements. Use shift-right and equal to work around the lack of
- // !lt in tablegen.
- RegisterClass MRC =
- !cast<RegisterClass>("GR" #
- !if (!eq (!srl(NumElts, 3), 0), 8, NumElts));
-
// Suffix used in the instruction mnemonic.
string Suffix = suffix;
@@ -69,6 +62,9 @@ class X86VectorVTInfo<int numelts, ValueType eltvt, RegisterClass rc,
// The corresponding memory operand, e.g. i512mem for VR512.
X86MemOperand MemOp = !cast<X86MemOperand>(TypeVariantName # Size # "mem");
X86MemOperand ScalarMemOp = !cast<X86MemOperand>(EltVT # "mem");
+ // FP scalar memory operand for intrinsics - ssmem/sdmem.
+ Operand IntScalarMemOp = !if (!eq (EltTypeName, "f32"), !cast<Operand>("ssmem"),
+ !if (!eq (EltTypeName, "f64"), !cast<Operand>("sdmem"), ?));
// Load patterns
// Note: For 128/256-bit integer VT we choose loadv2i64/loadv4i64
@@ -89,6 +85,12 @@ class X86VectorVTInfo<int numelts, ValueType eltvt, RegisterClass rc,
PatFrag ScalarLdFrag = !cast<PatFrag>("load" # EltVT);
+ ComplexPattern ScalarIntMemCPat = !if (!eq (EltTypeName, "f32"),
+ !cast<ComplexPattern>("sse_load_f32"),
+ !if (!eq (EltTypeName, "f64"),
+ !cast<ComplexPattern>("sse_load_f64"),
+ ?));
+
// The corresponding float type, e.g. v16f32 for v16i32
// Note: For EltSize < 32, FloatVT is illegal and TableGen
// fails to compile, so we choose FloatVT = VT
@@ -207,7 +209,7 @@ multiclass AVX512_maskable_custom<bits<8> O, Format F,
Pattern, itin>;
// Prefer over VMOV*rrk Pat<>
- let AddedComplexity = 20, isCommutable = IsKCommutable in
+ let isCommutable = IsKCommutable in
def NAME#k: AVX512<O, F, Outs, MaskingIns,
OpcodeStr#"\t{"#AttSrcAsm#", $dst {${mask}}|"#
"$dst {${mask}}, "#IntelSrcAsm#"}",
@@ -219,7 +221,7 @@ multiclass AVX512_maskable_custom<bits<8> O, Format F,
// A zero mask does not add any restrictions to the commute-operands
// transformation, so it is OK to use IsCommutable instead of IsKCommutable.
- let AddedComplexity = 30, isCommutable = IsCommutable in // Prefer over VMOV*rrkz Pat<>
+ let isCommutable = IsCommutable in // Prefer over VMOV*rrkz Pat<>
def NAME#kz: AVX512<O, F, Outs, ZeroMaskingIns,
OpcodeStr#"\t{"#AttSrcAsm#", $dst {${mask}} {z}|"#
"$dst {${mask}} {z}, "#IntelSrcAsm#"}",
@@ -250,6 +252,23 @@ multiclass AVX512_maskable_common<bits<8> O, Format F, X86VectorVTInfo _,
MaskingConstraint, NoItinerary, IsCommutable,
IsKCommutable>;
+// Similar to AVX512_maskable_common, but with scalar types.
+multiclass AVX512_maskable_fp_common<bits<8> O, Format F, X86VectorVTInfo _,
+ dag Outs,
+ dag Ins, dag MaskingIns, dag ZeroMaskingIns,
+ string OpcodeStr,
+ string AttSrcAsm, string IntelSrcAsm,
+ SDNode Select = vselect,
+ string MaskingConstraint = "",
+ InstrItinClass itin = NoItinerary,
+ bit IsCommutable = 0,
+ bit IsKCommutable = 0> :
+ AVX512_maskable_custom<O, F, Outs, Ins, MaskingIns, ZeroMaskingIns, OpcodeStr,
+ AttSrcAsm, IntelSrcAsm,
+ [], [], [],
+ MaskingConstraint, NoItinerary, IsCommutable,
+ IsKCommutable>;
+
// This multiclass generates the unconditional/non-masking, the masking, and
// the zero-masking variants of the vector instruction. In the masking case,
// the preserved vector elements come from a new dummy input operand tied to
// $dst.
@@ -460,7 +479,7 @@ def AVX512_512_SEXT_MASK_64 : I<0, Pseudo, (outs VR512:$dst),
}
let isReMaterializable = 1, isAsCheapAsAMove = 1, canFoldAsLoad = 1,
- isPseudo = 1, Predicates = [HasVLX], SchedRW = [WriteZero] in {
+ isPseudo = 1, Predicates = [HasAVX512], SchedRW = [WriteZero] in {
def AVX512_128_SET0 : I<0, Pseudo, (outs VR128X:$dst), (ins), "",
[(set VR128X:$dst, (v4i32 immAllZerosV))]>;
def AVX512_256_SET0 : I<0, Pseudo, (outs VR256X:$dst), (ins), "",
@@ -470,7 +489,7 @@ def AVX512_256_SET0 : I<0, Pseudo, (outs VR256X:$dst), (ins), "",
// Alias instructions that map fld0 to xorps for sse or vxorps for avx.
// This is expanded by ExpandPostRAPseudos.
let isReMaterializable = 1, isAsCheapAsAMove = 1, canFoldAsLoad = 1,
- isPseudo = 1, SchedRW = [WriteZero], Predicates = [HasVLX, HasDQI] in {
+ isPseudo = 1, SchedRW = [WriteZero], Predicates = [HasAVX512] in {
def AVX512_FsFLD0SS : I<0, Pseudo, (outs FR32X:$dst), (ins), "",
[(set FR32X:$dst, fp32imm0)]>;
def AVX512_FsFLD0SD : I<0, Pseudo, (outs FR64X:$dst), (ins), "",
@@ -484,7 +503,7 @@ multiclass vinsert_for_size<int Opcode, X86VectorVTInfo From, X86VectorVTInfo To
PatFrag vinsert_insert> {
let ExeDomain = To.ExeDomain in {
defm rr : AVX512_maskable<Opcode, MRMSrcReg, To, (outs To.RC:$dst),
- (ins To.RC:$src1, From.RC:$src2, i32u8imm:$src3),
+ (ins To.RC:$src1, From.RC:$src2, u8imm:$src3),
"vinsert" # From.EltTypeName # "x" # From.NumElts,
"$src3, $src2, $src1", "$src1, $src2, $src3",
(vinsert_insert:$src3 (To.VT To.RC:$src1),
@@ -492,7 +511,7 @@ multiclass vinsert_for_size<int Opcode, X86VectorVTInfo From, X86VectorVTInfo To
(iPTR imm))>, AVX512AIi8Base, EVEX_4V;
defm rm : AVX512_maskable<Opcode, MRMSrcMem, To, (outs To.RC:$dst),
- (ins To.RC:$src1, From.MemOp:$src2, i32u8imm:$src3),
+ (ins To.RC:$src1, From.MemOp:$src2, u8imm:$src3),
"vinsert" # From.EltTypeName # "x" # From.NumElts,
"$src3, $src2, $src1", "$src1, $src2, $src3",
(vinsert_insert:$src3 (To.VT To.RC:$src1),
@@ -625,14 +644,14 @@ multiclass vextract_for_size<int Opcode,
// vextract_extract), we are interested only in patterns without a mask;
// the intrinsics' pattern match is generated below.
defm rr : AVX512_maskable_in_asm<Opcode, MRMDestReg, To, (outs To.RC:$dst),
- (ins From.RC:$src1, i32u8imm:$idx),
+ (ins From.RC:$src1, u8imm:$idx),
"vextract" # To.EltTypeName # "x" # To.NumElts,
"$idx, $src1", "$src1, $idx",
[(set To.RC:$dst, (vextract_extract:$idx (From.VT From.RC:$src1),
(iPTR imm)))]>,
AVX512AIi8Base, EVEX;
def mr : AVX512AIi8<Opcode, MRMDestMem, (outs),
- (ins To.MemOp:$dst, From.RC:$src1, i32u8imm:$idx),
+ (ins To.MemOp:$dst, From.RC:$src1, u8imm:$idx),
"vextract" # To.EltTypeName # "x" # To.NumElts #
"\t{$idx, $src1, $dst|$dst, $src1, $idx}",
[(store (To.VT (vextract_extract:$idx
@@ -642,7 +661,7 @@ multiclass vextract_for_size<int Opcode,
let mayStore = 1, hasSideEffects = 0 in
def mrk : AVX512AIi8<Opcode, MRMDestMem, (outs),
(ins To.MemOp:$dst, To.KRCWM:$mask,
- From.RC:$src1, i32u8imm:$idx),
+ From.RC:$src1, u8imm:$idx),
"vextract" # To.EltTypeName # "x" # To.NumElts #
"\t{$idx, $src1, $dst {${mask}}|"
"$dst {${mask}}, $src1, $idx}",
@@ -846,32 +865,20 @@ def VEXTRACTPSZmr : AVX512AIi8<0x17, MRMDestMem, (outs),
// broadcast with a scalar argument.
multiclass avx512_broadcast_scalar<bits<8> opc, string OpcodeStr,
X86VectorVTInfo DestInfo, X86VectorVTInfo SrcInfo> {
-
- let isCodeGenOnly = 1 in {
- def r_s : I< opc, MRMSrcReg, (outs DestInfo.RC:$dst),
- (ins SrcInfo.FRC:$src), OpcodeStr#"\t{$src, $dst|$dst, $src}",
- [(set DestInfo.RC:$dst, (DestInfo.VT (X86VBroadcast SrcInfo.FRC:$src)))]>,
- Requires<[HasAVX512]>, T8PD, EVEX;
-
- let Constraints = "$src0 = $dst" in
- def rk_s : I< opc, MRMSrcReg, (outs DestInfo.RC:$dst),
- (ins DestInfo.RC:$src0, DestInfo.KRCWM:$mask, SrcInfo.FRC:$src),
- OpcodeStr#"\t{$src, $dst {${mask}} |$dst {${mask}}, $src}",
- [(set DestInfo.RC:$dst,
- (vselect DestInfo.KRCWM:$mask,
- (DestInfo.VT (X86VBroadcast SrcInfo.FRC:$src)),
- DestInfo.RC:$src0))]>,
- Requires<[HasAVX512]>, T8PD, EVEX, EVEX_K;
-
- def rkz_s : I< opc, MRMSrcReg, (outs DestInfo.RC:$dst),
- (ins DestInfo.KRCWM:$mask, SrcInfo.FRC:$src),
- OpcodeStr#"\t{$src, $dst {${mask}} {z}|$dst {${mask}} {z}, $src}",
- [(set DestInfo.RC:$dst,
- (vselect DestInfo.KRCWM:$mask,
- (DestInfo.VT (X86VBroadcast SrcInfo.FRC:$src)),
- DestInfo.ImmAllZerosV))]>,
- Requires<[HasAVX512]>, T8PD, EVEX, EVEX_KZ;
- } // let isCodeGenOnly = 1 in
+ def : Pat<(DestInfo.VT (X86VBroadcast SrcInfo.FRC:$src)),
+ (!cast<Instruction>(NAME#DestInfo.ZSuffix#r)
+ (COPY_TO_REGCLASS SrcInfo.FRC:$src, SrcInfo.RC))>;
+ def : Pat<(DestInfo.VT (vselect DestInfo.KRCWM:$mask,
+ (X86VBroadcast SrcInfo.FRC:$src),
+ DestInfo.RC:$src0)),
+ (!cast<Instruction>(NAME#DestInfo.ZSuffix#rk)
+ DestInfo.RC:$src0, DestInfo.KRCWM:$mask,
+ (COPY_TO_REGCLASS SrcInfo.FRC:$src, SrcInfo.RC))>;
+ def : Pat<(DestInfo.VT (vselect DestInfo.KRCWM:$mask,
+ (X86VBroadcast SrcInfo.FRC:$src),
+ DestInfo.ImmAllZerosV)),
+ (!cast<Instruction>(NAME#DestInfo.ZSuffix#rkz)
+ DestInfo.KRCWM:$mask, (COPY_TO_REGCLASS SrcInfo.FRC:$src, SrcInfo.RC))>;
}
multiclass avx512_broadcast_rm<bits<8> opc, string OpcodeStr,
@@ -892,7 +899,6 @@ multiclass avx512_broadcast_rm<bits<8> opc, string OpcodeStr,
(SrcInfo.VT (scalar_to_vector
(SrcInfo.ScalarLdFrag addr:$src))))),
(!cast<Instruction>(NAME#DestInfo.ZSuffix#m) addr:$src)>;
- let AddedComplexity = 20 in
def : Pat<(DestInfo.VT (vselect DestInfo.KRCWM:$mask,
(X86VBroadcast
(SrcInfo.VT (scalar_to_vector
@@ -900,7 +906,6 @@ multiclass avx512_broadcast_rm<bits<8> opc, string OpcodeStr,
DestInfo.RC:$src0)),
(!cast<Instruction>(NAME#DestInfo.ZSuffix#mk)
DestInfo.RC:$src0, DestInfo.KRCWM:$mask, addr:$src)>;
- let AddedComplexity = 30 in
def : Pat<(DestInfo.VT (vselect DestInfo.KRCWM:$mask,
(X86VBroadcast
(SrcInfo.VT (scalar_to_vector
@@ -951,39 +956,42 @@ def : Pat<(int_x86_avx512_vbroadcast_sd_512 addr:$src),
(VBROADCASTSDZm addr:$src)>;
multiclass avx512_int_broadcast_reg<bits<8> opc, X86VectorVTInfo _,
+ SDPatternOperator OpNode,
RegisterClass SrcRC> {
+ let ExeDomain = _.ExeDomain in
defm r : AVX512_maskable<opc, MRMSrcReg, _, (outs _.RC:$dst),
(ins SrcRC:$src),
"vpbroadcast"##_.Suffix, "$src", "$src",
- (_.VT (X86VBroadcast SrcRC:$src))>, T8PD, EVEX;
+ (_.VT (OpNode SrcRC:$src))>, T8PD, EVEX;
}
multiclass avx512_int_broadcast_reg_vl<bits<8> opc, AVX512VLVectorVTInfo _,
+ SDPatternOperator OpNode,
RegisterClass SrcRC, Predicate prd> {
let Predicates = [prd] in
- defm Z : avx512_int_broadcast_reg<opc, _.info512, SrcRC>, EVEX_V512;
+ defm Z : avx512_int_broadcast_reg<opc, _.info512, OpNode, SrcRC>, EVEX_V512;
let Predicates = [prd, HasVLX] in {
- defm Z256 : avx512_int_broadcast_reg<opc, _.info256, SrcRC>, EVEX_V256;
- defm Z128 : avx512_int_broadcast_reg<opc, _.info128, SrcRC>, EVEX_V128;
+ defm Z256 : avx512_int_broadcast_reg<opc, _.info256, OpNode, SrcRC>, EVEX_V256;
+ defm Z128 : avx512_int_broadcast_reg<opc, _.info128, OpNode, SrcRC>, EVEX_V128;
}
}
let isCodeGenOnly = 1 in {
-defm VPBROADCASTBr : avx512_int_broadcast_reg_vl<0x7A, avx512vl_i8_info, GR8,
- HasBWI>;
-defm VPBROADCASTWr : avx512_int_broadcast_reg_vl<0x7B, avx512vl_i16_info, GR16,
- HasBWI>;
+defm VPBROADCASTBr : avx512_int_broadcast_reg_vl<0x7A, avx512vl_i8_info,
+ X86VBroadcast, GR8, HasBWI>;
+defm VPBROADCASTWr : avx512_int_broadcast_reg_vl<0x7B, avx512vl_i16_info,
+ X86VBroadcast, GR16, HasBWI>;
}
let isAsmParserOnly = 1 in {
defm VPBROADCASTBr_Alt : avx512_int_broadcast_reg_vl<0x7A, avx512vl_i8_info,
- GR32, HasBWI>;
+ null_frag, GR32, HasBWI>;
defm VPBROADCASTWr_Alt : avx512_int_broadcast_reg_vl<0x7B, avx512vl_i16_info,
- GR32, HasBWI>;
+ null_frag, GR32, HasBWI>;
}
-defm VPBROADCASTDr : avx512_int_broadcast_reg_vl<0x7C, avx512vl_i32_info, GR32,
- HasAVX512>;
-defm VPBROADCASTQr : avx512_int_broadcast_reg_vl<0x7C, avx512vl_i64_info, GR64,
- HasAVX512>, VEX_W;
+defm VPBROADCASTDr : avx512_int_broadcast_reg_vl<0x7C, avx512vl_i32_info,
+ X86VBroadcast, GR32, HasAVX512>;
+defm VPBROADCASTQr : avx512_int_broadcast_reg_vl<0x7C, avx512vl_i64_info,
+ X86VBroadcast, GR64, HasAVX512>, VEX_W;
def : Pat <(v16i32 (X86vzext VK16WM:$mask)),
(VPBROADCASTDrZrkz VK16WM:$mask, (i32 (MOV32ri 0x1)))>;
@@ -1035,7 +1043,18 @@ multiclass avx512_subvec_broadcast_rm<bits<8> opc, string OpcodeStr,
AVX5128IBase, EVEX;
}
+let Predicates = [HasAVX512] in {
+ // 32-bit targets will fail to load an i64 directly but can use ZEXT_LOAD.
+ def : Pat<(v8i64 (X86VBroadcast (v8i64 (X86vzload addr:$src)))),
+ (VPBROADCASTQZm addr:$src)>;
+}
+
let Predicates = [HasVLX, HasBWI] in {
+ // 32-bit targets will fail to load an i64 directly but can use ZEXT_LOAD.
+ def : Pat<(v2i64 (X86VBroadcast (v2i64 (X86vzload addr:$src)))),
+ (VPBROADCASTQZ128m addr:$src)>;
+ def : Pat<(v4i64 (X86VBroadcast (v4i64 (X86vzload addr:$src)))),
+ (VPBROADCASTQZ256m addr:$src)>;
// loadi16 is tricky to fold, because !isTypeDesirableForOp, justifiably.
// This means we'll encounter truncated i32 loads; match that here.
def : Pat<(v8i16 (X86VBroadcast (i16 (trunc (i32 (load addr:$src)))))),
@@ -1075,18 +1094,12 @@ def : Pat<(v64i8 (X86SubVBroadcast (bc_v32i8 (loadv4i64 addr:$src)))),
// Provide fallback in case the load node that is used in the patterns above
// is used by additional users, which prevents the pattern selection.
-def : Pat<(v16f32 (X86SubVBroadcast (v8f32 VR256X:$src))),
- (VINSERTF64x4Zrr (INSERT_SUBREG (v16f32 (IMPLICIT_DEF)), VR256X:$src, sub_ymm),
- (v8f32 VR256X:$src), 1)>;
def : Pat<(v8f64 (X86SubVBroadcast (v4f64 VR256X:$src))),
(VINSERTF64x4Zrr (INSERT_SUBREG (v8f64 (IMPLICIT_DEF)), VR256X:$src, sub_ymm),
(v4f64 VR256X:$src), 1)>;
def : Pat<(v8i64 (X86SubVBroadcast (v4i64 VR256X:$src))),
(VINSERTI64x4Zrr (INSERT_SUBREG (v8i64 (IMPLICIT_DEF)), VR256X:$src, sub_ymm),
(v4i64 VR256X:$src), 1)>;
-def : Pat<(v16i32 (X86SubVBroadcast (v8i32 VR256X:$src))),
- (VINSERTI64x4Zrr (INSERT_SUBREG (v16i32 (IMPLICIT_DEF)), VR256X:$src, sub_ymm),
- (v8i32 VR256X:$src), 1)>;
def : Pat<(v32i16 (X86SubVBroadcast (v16i16 VR256X:$src))),
(VINSERTI64x4Zrr (INSERT_SUBREG (v32i16 (IMPLICIT_DEF)), VR256X:$src, sub_ymm),
(v16i16 VR256X:$src), 1)>;
@@ -1098,46 +1111,6 @@ def : Pat<(v32i16 (X86SubVBroadcast (bc_v8i16 (loadv2i64 addr:$src)))),
(VBROADCASTI32X4rm addr:$src)>;
def : Pat<(v64i8 (X86SubVBroadcast (bc_v16i8 (loadv2i64 addr:$src)))),
(VBROADCASTI32X4rm addr:$src)>;
-
-// Provide fallback in case the load node that is used in the patterns above
-// is used by additional users, which prevents the pattern selection.
-def : Pat<(v8f64 (X86SubVBroadcast (v2f64 VR128X:$src))),
- (VINSERTF64x4Zrr
- (VINSERTF32x4Zrr (INSERT_SUBREG (v8f64 (IMPLICIT_DEF)),
- VR128X:$src, sub_xmm),
- VR128X:$src, 1),
- (EXTRACT_SUBREG
- (v8f64 (VINSERTF32x4Zrr (INSERT_SUBREG (v8f64 (IMPLICIT_DEF)),
- VR128X:$src, sub_xmm),
- VR128X:$src, 1)), sub_ymm), 1)>;
-def : Pat<(v8i64 (X86SubVBroadcast (v2i64 VR128X:$src))),
- (VINSERTI64x4Zrr
- (VINSERTI32x4Zrr (INSERT_SUBREG (v8i64 (IMPLICIT_DEF)),
- VR128X:$src, sub_xmm),
- VR128X:$src, 1),
- (EXTRACT_SUBREG
- (v8i64 (VINSERTI32x4Zrr (INSERT_SUBREG (v8i64 (IMPLICIT_DEF)),
- VR128X:$src, sub_xmm),
- VR128X:$src, 1)), sub_ymm), 1)>;
-
-def : Pat<(v32i16 (X86SubVBroadcast (v8i16 VR128X:$src))),
- (VINSERTI64x4Zrr
- (VINSERTI32x4Zrr (INSERT_SUBREG (v32i16 (IMPLICIT_DEF)),
- VR128X:$src, sub_xmm),
- VR128X:$src, 1),
- (EXTRACT_SUBREG
- (v32i16 (VINSERTI32x4Zrr (INSERT_SUBREG (v32i16 (IMPLICIT_DEF)),
- VR128X:$src, sub_xmm),
- VR128X:$src, 1)), sub_ymm), 1)>;
-def : Pat<(v64i8 (X86SubVBroadcast (v16i8 VR128X:$src))),
- (VINSERTI64x4Zrr
- (VINSERTI32x4Zrr (INSERT_SUBREG (v64i8 (IMPLICIT_DEF)),
- VR128X:$src, sub_xmm),
- VR128X:$src, 1),
- (EXTRACT_SUBREG
- (v64i8 (VINSERTI32x4Zrr (INSERT_SUBREG (v64i8 (IMPLICIT_DEF)),
- VR128X:$src, sub_xmm),
- VR128X:$src, 1)), sub_ymm), 1)>;
}
let Predicates = [HasVLX] in {
@@ -1209,25 +1182,6 @@ def : Pat<(v8f64 (X86SubVBroadcast (loadv2f64 addr:$src))),
def : Pat<(v8i64 (X86SubVBroadcast (loadv2i64 addr:$src))),
(VBROADCASTI32X4rm addr:$src)>;
-def : Pat<(v16f32 (X86SubVBroadcast (v4f32 VR128X:$src))),
- (VINSERTF64x4Zrr
- (VINSERTF32x4Zrr (INSERT_SUBREG (v16f32 (IMPLICIT_DEF)),
- VR128X:$src, sub_xmm),
- VR128X:$src, 1),
- (EXTRACT_SUBREG
- (v16f32 (VINSERTF32x4Zrr (INSERT_SUBREG (v16f32 (IMPLICIT_DEF)),
- VR128X:$src, sub_xmm),
- VR128X:$src, 1)), sub_ymm), 1)>;
-def : Pat<(v16i32 (X86SubVBroadcast (v4i32 VR128X:$src))),
- (VINSERTI64x4Zrr
- (VINSERTI32x4Zrr (INSERT_SUBREG (v16i32 (IMPLICIT_DEF)),
- VR128X:$src, sub_xmm),
- VR128X:$src, 1),
- (EXTRACT_SUBREG
- (v16i32 (VINSERTI32x4Zrr (INSERT_SUBREG (v16i32 (IMPLICIT_DEF)),
- VR128X:$src, sub_xmm),
- VR128X:$src, 1)), sub_ymm), 1)>;
-
def : Pat<(v16f32 (X86SubVBroadcast (loadv8f32 addr:$src))),
(VBROADCASTF64X4rm addr:$src)>;
def : Pat<(v16i32 (X86SubVBroadcast (bc_v8i32 (loadv4i64 addr:$src)))),
@@ -1265,25 +1219,6 @@ def : Pat<(v16f32 (X86SubVBroadcast (v8f32 VR256X:$src))),
def : Pat<(v16i32 (X86SubVBroadcast (v8i32 VR256X:$src))),
(VINSERTI32x8Zrr (INSERT_SUBREG (v16i32 (IMPLICIT_DEF)), VR256X:$src, sub_ymm),
(v8i32 VR256X:$src), 1)>;
-
-def : Pat<(v16f32 (X86SubVBroadcast (v4f32 VR128X:$src))),
- (VINSERTF32x8Zrr
- (VINSERTF32x4Zrr (INSERT_SUBREG (v16f32 (IMPLICIT_DEF)),
- VR128X:$src, sub_xmm),
- VR128X:$src, 1),
- (EXTRACT_SUBREG
- (v16f32 (VINSERTF32x4Zrr (INSERT_SUBREG (v16f32 (IMPLICIT_DEF)),
- VR128X:$src, sub_xmm),
- VR128X:$src, 1)), sub_ymm), 1)>;
-def : Pat<(v16i32 (X86SubVBroadcast (v4i32 VR128X:$src))),
- (VINSERTI32x8Zrr
- (VINSERTI32x4Zrr (INSERT_SUBREG (v16i32 (IMPLICIT_DEF)),
- VR128X:$src, sub_xmm),
- VR128X:$src, 1),
- (EXTRACT_SUBREG
- (v16i32 (VINSERTI32x4Zrr (INSERT_SUBREG (v16i32 (IMPLICIT_DEF)),
- VR128X:$src, sub_xmm),
- VR128X:$src, 1)), sub_ymm), 1)>;
}
multiclass avx512_common_broadcast_32x2<bits<8> opc, string OpcodeStr,
@@ -1310,6 +1245,13 @@ defm VBROADCASTI32X2 : avx512_common_broadcast_i32x2<0x59, "vbroadcasti32x2",
defm VBROADCASTF32X2 : avx512_common_broadcast_32x2<0x19, "vbroadcastf32x2",
avx512vl_f32_info, avx512vl_f64_info>;
+let Predicates = [HasVLX] in {
+def : Pat<(v8f32 (X86VBroadcast (v8f32 VR256X:$src))),
+ (VBROADCASTSSZ256r (EXTRACT_SUBREG (v8f32 VR256X:$src), sub_xmm))>;
+def : Pat<(v4f64 (X86VBroadcast (v4f64 VR256X:$src))),
+ (VBROADCASTSDZ256r (EXTRACT_SUBREG (v4f64 VR256X:$src), sub_xmm))>;
+}
+
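A hedged C approximation of the new VLX patterns above (assuming -mavx512f -mavx512vl): a broadcast whose source is a 256-bit register is served by VBROADCASTSS ymm, xmm after extracting the low subregister.

  #include <immintrin.h>
  __m256 bcast_lane0(__m256 v) {
    return _mm256_broadcastss_ps(_mm256_castps256_ps128(v));
  }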
def : Pat<(v16f32 (X86VBroadcast (v16f32 VR512:$src))),
(VBROADCASTSSZr (EXTRACT_SUBREG (v16f32 VR512:$src), sub_xmm))>;
def : Pat<(v16f32 (X86VBroadcast (v8f32 VR256X:$src))),
@@ -1604,13 +1546,13 @@ multiclass avx512_cmp_scalar<X86VectorVTInfo _, SDNode OpNode, SDNode OpNodeRnd>
(OpNode (_.VT _.RC:$src1),
(_.VT _.RC:$src2),
imm:$cc)>, EVEX_4V;
+ let mayLoad = 1 in
defm rm_Int : AVX512_maskable_cmp<0xC2, MRMSrcMem, _,
(outs _.KRC:$dst),
- (ins _.RC:$src1, _.ScalarMemOp:$src2, AVXCC:$cc),
+ (ins _.RC:$src1, _.IntScalarMemOp:$src2, AVXCC:$cc),
"vcmp${cc}"#_.Suffix,
"$src2, $src1", "$src1, $src2",
- (OpNode (_.VT _.RC:$src1),
- (_.VT (scalar_to_vector (_.ScalarLdFrag addr:$src2))),
+ (OpNode (_.VT _.RC:$src1), _.ScalarIntMemCPat:$src2,
imm:$cc)>, EVEX_4V, EVEX_CD8<_.EltSize, CD8VT1>;
defm rrb_Int : AVX512_maskable_cmp<0xC2, MRMSrcReg, _,
@@ -1629,6 +1571,7 @@ multiclass avx512_cmp_scalar<X86VectorVTInfo _, SDNode OpNode, SDNode OpNodeRnd>
(ins _.RC:$src1, _.RC:$src2, u8imm:$cc),
"vcmp"#_.Suffix,
"$cc, $src2, $src1", "$src1, $src2, $cc">, EVEX_4V;
+ let mayLoad = 1 in
defm rmi_alt : AVX512_maskable_cmp_alt<0xC2, MRMSrcMem, _,
(outs _.KRC:$dst),
(ins _.RC:$src1, _.ScalarMemOp:$src2, u8imm:$cc),
@@ -1667,8 +1610,10 @@ multiclass avx512_cmp_scalar<X86VectorVTInfo _, SDNode OpNode, SDNode OpNodeRnd>
}
let Predicates = [HasAVX512] in {
+ let ExeDomain = SSEPackedSingle in
defm VCMPSSZ : avx512_cmp_scalar<f32x_info, X86cmpms, X86cmpmsRnd>,
AVX512XSIi8Base;
+ let ExeDomain = SSEPackedDouble in
defm VCMPSDZ : avx512_cmp_scalar<f64x_info, X86cmpms, X86cmpmsRnd>,
AVX512XDIi8Base, VEX_W;
}
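For context, a small C sketch of the scalar compare-into-mask this multiclass implements (assuming -mavx512f); with the IntScalarMemOp operand the memory form only reads the scalar, not a full 128 bits:

  #include <immintrin.h>
  __mmask8 lt_mask(__m128 a, const float *p) {
    __m128 b = _mm_load_ss(p);                 // only 32 bits are read
    return _mm_cmp_ss_mask(a, b, _CMP_LT_OS);  // VCMPSS k, xmm, m32
  }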
@@ -2087,22 +2032,20 @@ multiclass avx512_scalar_fpclass<bits<8> opc, string OpcodeStr, SDNode OpNode,
[(set _.KRC:$dst,(or _.KRCWM:$mask,
(OpNode (_.VT _.RC:$src1),
(i32 imm:$src2))))], NoItinerary>, EVEX_K;
- let AddedComplexity = 20 in {
- def rm : AVX512<opc, MRMSrcMem, (outs _.KRC:$dst),
- (ins _.MemOp:$src1, i32u8imm:$src2),
- OpcodeStr##_.Suffix##
- "\t{$src2, $src1, $dst|$dst, $src1, $src2}",
- [(set _.KRC:$dst,
- (OpNode (_.VT (bitconvert (_.LdFrag addr:$src1))),
- (i32 imm:$src2)))], NoItinerary>;
- def rmk : AVX512<opc, MRMSrcMem, (outs _.KRC:$dst),
- (ins _.KRCWM:$mask, _.MemOp:$src1, i32u8imm:$src2),
- OpcodeStr##_.Suffix##
- "\t{$src2, $src1, $dst {${mask}}|$dst {${mask}}, $src1, $src2}",
- [(set _.KRC:$dst,(or _.KRCWM:$mask,
+ def rm : AVX512<opc, MRMSrcMem, (outs _.KRC:$dst),
+ (ins _.MemOp:$src1, i32u8imm:$src2),
+ OpcodeStr##_.Suffix##
+ "\t{$src2, $src1, $dst|$dst, $src1, $src2}",
+ [(set _.KRC:$dst,
(OpNode (_.VT (bitconvert (_.LdFrag addr:$src1))),
- (i32 imm:$src2))))], NoItinerary>, EVEX_K;
- }
+ (i32 imm:$src2)))], NoItinerary>;
+ def rmk : AVX512<opc, MRMSrcMem, (outs _.KRC:$dst),
+ (ins _.KRCWM:$mask, _.MemOp:$src1, i32u8imm:$src2),
+ OpcodeStr##_.Suffix##
+ "\t{$src2, $src1, $dst {${mask}}|$dst {${mask}}, $src1, $src2}",
+ [(set _.KRC:$dst,(or _.KRCWM:$mask,
+ (OpNode (_.VT (bitconvert (_.LdFrag addr:$src1))),
+ (i32 imm:$src2))))], NoItinerary>, EVEX_K;
}
}
@@ -2242,28 +2185,26 @@ let Predicates = [HasBWI] in {
// GR from/to mask register
def : Pat<(v16i1 (bitconvert (i16 GR16:$src))),
- (COPY_TO_REGCLASS GR16:$src, VK16)>;
+ (COPY_TO_REGCLASS (i32 (INSERT_SUBREG (IMPLICIT_DEF), GR16:$src, sub_16bit)), VK16)>;
def : Pat<(i16 (bitconvert (v16i1 VK16:$src))),
- (COPY_TO_REGCLASS VK16:$src, GR16)>;
+ (EXTRACT_SUBREG (i32 (COPY_TO_REGCLASS VK16:$src, GR32)), sub_16bit)>;
def : Pat<(v8i1 (bitconvert (i8 GR8:$src))),
- (COPY_TO_REGCLASS GR8:$src, VK8)>;
+ (COPY_TO_REGCLASS (i32 (INSERT_SUBREG (IMPLICIT_DEF), GR8:$src, sub_8bit)), VK8)>;
def : Pat<(i8 (bitconvert (v8i1 VK8:$src))),
- (COPY_TO_REGCLASS VK8:$src, GR8)>;
+ (EXTRACT_SUBREG (i32 (COPY_TO_REGCLASS VK8:$src, GR32)), sub_8bit)>;
def : Pat<(i32 (zext (i16 (bitconvert (v16i1 VK16:$src))))),
(KMOVWrk VK16:$src)>;
def : Pat<(i32 (anyext (i16 (bitconvert (v16i1 VK16:$src))))),
- (i32 (INSERT_SUBREG (IMPLICIT_DEF),
- (i16 (COPY_TO_REGCLASS VK16:$src, GR16)), sub_16bit))>;
+ (COPY_TO_REGCLASS VK16:$src, GR32)>;
def : Pat<(i32 (zext (i8 (bitconvert (v8i1 VK8:$src))))),
- (MOVZX32rr8 (COPY_TO_REGCLASS VK8:$src, GR8))>, Requires<[NoDQI]>;
+ (MOVZX32rr8 (EXTRACT_SUBREG (i32 (COPY_TO_REGCLASS VK8:$src, GR32)), sub_8bit))>, Requires<[NoDQI]>;
def : Pat<(i32 (zext (i8 (bitconvert (v8i1 VK8:$src))))),
(KMOVBrk VK8:$src)>, Requires<[HasDQI]>;
def : Pat<(i32 (anyext (i8 (bitconvert (v8i1 VK8:$src))))),
- (i32 (INSERT_SUBREG (IMPLICIT_DEF),
- (i8 (COPY_TO_REGCLASS VK8:$src, GR8)), sub_8bit))>;
+ (COPY_TO_REGCLASS VK8:$src, GR32)>;
def : Pat<(v32i1 (bitconvert (i32 GR32:$src))),
(COPY_TO_REGCLASS GR32:$src, VK32)>;
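These GR-to-mask moves go through GR32 because KMOVW only transfers to and from 32-bit GPRs; a rough C sketch (assuming -mavx512f) of an i16 bitconvert reaching a mask register:

  #include <immintrin.h>
  __m512 masked_add(__m512 a, __m512 b, unsigned short bits) {
    __mmask16 m = (__mmask16)bits; // i16 -> v16i1: widened to GR32, then KMOVW
    return _mm512_mask_add_ps(a, m, a, b);
  }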
@@ -2296,20 +2237,20 @@ let Predicates = [HasDQI] in {
let Predicates = [HasAVX512, NoDQI] in {
def : Pat<(store VK1:$src, addr:$dst),
(MOV8mr addr:$dst,
- (EXTRACT_SUBREG (KMOVWrk (COPY_TO_REGCLASS VK1:$src, VK16)),
- sub_8bit))>;
+ (i8 (EXTRACT_SUBREG (i32 (COPY_TO_REGCLASS VK1:$src, GR32)),
+ sub_8bit)))>;
def : Pat<(store VK2:$src, addr:$dst),
(MOV8mr addr:$dst,
- (EXTRACT_SUBREG (KMOVWrk (COPY_TO_REGCLASS VK2:$src, VK16)),
- sub_8bit))>;
+ (i8 (EXTRACT_SUBREG (i32 (COPY_TO_REGCLASS VK2:$src, GR32)),
+ sub_8bit)))>;
def : Pat<(store VK4:$src, addr:$dst),
(MOV8mr addr:$dst,
- (EXTRACT_SUBREG (KMOVWrk (COPY_TO_REGCLASS VK4:$src, VK16)),
- sub_8bit))>;
+ (i8 (EXTRACT_SUBREG (i32 (COPY_TO_REGCLASS VK4:$src, GR32)),
+ sub_8bit)))>;
def : Pat<(store VK8:$src, addr:$dst),
(MOV8mr addr:$dst,
- (EXTRACT_SUBREG (KMOVWrk (COPY_TO_REGCLASS VK8:$src, VK16)),
- sub_8bit))>;
+ (i8 (EXTRACT_SUBREG (i32 (COPY_TO_REGCLASS VK8:$src, GR32)),
+ sub_8bit)))>;
def : Pat<(v8i1 (load addr:$src)),
(COPY_TO_REGCLASS (MOVZX32rm8 addr:$src), VK8)>;
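A sketch of the store path without DQI (assumption: -mavx512f only, so no KMOVB is available):

  #include <immintrin.h>
  void store_mask8(unsigned char *p, __mmask8 m) {
    *p = (unsigned char)m; // mask to GR32, then MOV8mr of the low byte
  }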
@@ -2340,44 +2281,41 @@ let Predicates = [HasBWI] in {
let Predicates = [HasAVX512] in {
def : Pat<(i1 (trunc (i64 GR64:$src))),
- (COPY_TO_REGCLASS (KMOVWkr (AND32ri8 (EXTRACT_SUBREG $src, sub_32bit),
- (i32 1))), VK1)>;
+ (COPY_TO_REGCLASS (AND32ri8 (EXTRACT_SUBREG $src, sub_32bit),
+ (i32 1)), VK1)>;
def : Pat<(i1 (trunc (i32 GR32:$src))),
- (COPY_TO_REGCLASS (KMOVWkr (AND32ri8 $src, (i32 1))), VK1)>;
+ (COPY_TO_REGCLASS (AND32ri8 $src, (i32 1)), VK1)>;
def : Pat<(i1 (trunc (i32 (assertzext_i1 GR32:$src)))),
(COPY_TO_REGCLASS GR32:$src, VK1)>;
def : Pat<(i1 (trunc (i8 GR8:$src))),
(COPY_TO_REGCLASS
- (KMOVWkr (AND32ri8 (INSERT_SUBREG (i32 (IMPLICIT_DEF)),
- GR8:$src, sub_8bit), (i32 1))),
- VK1)>;
+ (AND32ri8 (INSERT_SUBREG (i32 (IMPLICIT_DEF)),
+ GR8:$src, sub_8bit), (i32 1)), VK1)>;
def : Pat<(i1 (trunc (i16 GR16:$src))),
(COPY_TO_REGCLASS
- (KMOVWkr (AND32ri8 (INSERT_SUBREG (i32 (IMPLICIT_DEF)),
- GR16:$src, sub_16bit), (i32 1))),
- VK1)>;
+ (AND32ri8 (INSERT_SUBREG (i32 (IMPLICIT_DEF)),
+ GR16:$src, sub_16bit), (i32 1)), VK1)>;
def : Pat<(i32 (zext VK1:$src)),
- (AND32ri8 (KMOVWrk (COPY_TO_REGCLASS VK1:$src, VK16)), (i32 1))>;
+ (AND32ri8 (COPY_TO_REGCLASS VK1:$src, GR32), (i32 1))>;
def : Pat<(i32 (anyext VK1:$src)),
(COPY_TO_REGCLASS VK1:$src, GR32)>;
def : Pat<(i8 (zext VK1:$src)),
(EXTRACT_SUBREG
- (AND32ri8 (KMOVWrk
- (COPY_TO_REGCLASS VK1:$src, VK16)), (i32 1)), sub_8bit)>;
+ (AND32ri8 (COPY_TO_REGCLASS VK1:$src, GR32), (i32 1)), sub_8bit)>;
def : Pat<(i8 (anyext VK1:$src)),
(EXTRACT_SUBREG (i32 (COPY_TO_REGCLASS VK1:$src, GR32)), sub_8bit)>;
def : Pat<(i64 (zext VK1:$src)),
- (AND64ri8 (SUBREG_TO_REG (i64 0),
- (KMOVWrk (COPY_TO_REGCLASS VK1:$src, VK16)), sub_32bit), (i64 1))>;
+ (SUBREG_TO_REG (i64 0),
+ (AND32ri8 (COPY_TO_REGCLASS VK1:$src, GR32), (i32 1)), sub_32bit)>;
def : Pat<(i64 (anyext VK1:$src)),
(INSERT_SUBREG (i64 (IMPLICIT_DEF)),
@@ -2385,8 +2323,7 @@ let Predicates = [HasAVX512] in {
def : Pat<(i16 (zext VK1:$src)),
(EXTRACT_SUBREG
- (AND32ri8 (KMOVWrk (COPY_TO_REGCLASS VK1:$src, VK16)), (i32 1)),
- sub_16bit)>;
+ (AND32ri8 (COPY_TO_REGCLASS VK1:$src, GR32), (i32 1)), sub_16bit)>;
def : Pat<(i16 (anyext VK1:$src)),
(EXTRACT_SUBREG (i32 (COPY_TO_REGCLASS VK1:$src, GR32)), sub_16bit)>;
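For the i1-truncate patterns, a hedged C sketch (assuming -mavx512f) of a scalar condition becoming a one-bit mask; dropping the KMOVWkr lets the AND stay on the GPR side:

  #include <immintrin.h>
  __m128 pick_low(__m128 a, __m128 b, int cond) {
    __mmask8 m = (__mmask8)(cond & 1);   // i1 trunc: AND32ri8, then copy to VK1
    return _mm_mask_move_ss(a, m, a, b); // low lane = cond ? b : a
  }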
@@ -2440,15 +2377,6 @@ multiclass avx512_mask_unop_all<bits<8> opc, string OpcodeStr,
defm KNOT : avx512_mask_unop_all<0x44, "knot", vnot>;
-multiclass avx512_mask_unop_int<string IntName, string InstName> {
- let Predicates = [HasAVX512] in
- def : Pat<(!cast<Intrinsic>("int_x86_avx512_"##IntName##"_w")
- (i16 GR16:$src)),
- (COPY_TO_REGCLASS (!cast<Instruction>(InstName##"Wrr")
- (v16i1 (COPY_TO_REGCLASS GR16:$src, VK16))), GR16)>;
-}
-defm : avx512_mask_unop_int<"knot", "KNOT">;
-
// KNL does not support KMOVB; the 8-bit mask is promoted to 16-bit.
let Predicates = [HasAVX512, NoDQI] in
def : Pat<(vnot VK8:$src),
@@ -2497,21 +2425,6 @@ defm KXOR : avx512_mask_binop_all<0x47, "kxor", xor, 1>;
defm KANDN : avx512_mask_binop_all<0x42, "kandn", vandn, 0>;
defm KADD : avx512_mask_binop_all<0x4A, "kadd", add, 1, HasDQI>;
-multiclass avx512_mask_binop_int<string IntName, string InstName> {
- let Predicates = [HasAVX512] in
- def : Pat<(!cast<Intrinsic>("int_x86_avx512_"##IntName##"_w")
- (i16 GR16:$src1), (i16 GR16:$src2)),
- (COPY_TO_REGCLASS (!cast<Instruction>(InstName##"Wrr")
- (v16i1 (COPY_TO_REGCLASS GR16:$src1, VK16)),
- (v16i1 (COPY_TO_REGCLASS GR16:$src2, VK16))), GR16)>;
-}
-
-defm : avx512_mask_binop_int<"kand", "KAND">;
-defm : avx512_mask_binop_int<"kandn", "KANDN">;
-defm : avx512_mask_binop_int<"kor", "KOR">;
-defm : avx512_mask_binop_int<"kxnor", "KXNOR">;
-defm : avx512_mask_binop_int<"kxor", "KXOR">;
-
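The deleted avx512_mask_*_int multiclasses existed only for the old GR16-based intrinsic forms; mask operations written in C still reach the K-instructions through the plain SDNode patterns, with 8-bit masks promoted as noted above. A sketch (assuming -mavx512f):

  #include <immintrin.h>
  __mmask16 mask_ops(__mmask16 a, __mmask16 b) {
    return _mm512_kand(_mm512_knot(a), b); // KNOTW + KANDW
  }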
multiclass avx512_binop_pat<SDPatternOperator VOpNode, SDPatternOperator OpNode,
Instruction Inst> {
// With AVX512F, 8-bit mask is promoted to 16-bit mask,
@@ -2613,8 +2526,8 @@ multiclass avx512_mask_shiftop_w<bits<8> opc1, bits<8> opc2, string OpcodeStr,
}
}
-defm KSHIFTL : avx512_mask_shiftop_w<0x32, 0x33, "kshiftl", X86vshli>;
-defm KSHIFTR : avx512_mask_shiftop_w<0x30, 0x31, "kshiftr", X86vsrli>;
+defm KSHIFTL : avx512_mask_shiftop_w<0x32, 0x33, "kshiftl", X86kshiftl>;
+defm KSHIFTR : avx512_mask_shiftop_w<0x30, 0x31, "kshiftr", X86kshiftr>;
// Mask setting all 0s or 1s
multiclass avx512_mask_setop<RegisterClass KRC, ValueType VT, PatFrag Val> {
@@ -2625,7 +2538,6 @@ multiclass avx512_mask_setop<RegisterClass KRC, ValueType VT, PatFrag Val> {
}
multiclass avx512_mask_setop_w<PatFrag Val> {
- defm B : avx512_mask_setop<VK8, v8i1, Val>;
defm W : avx512_mask_setop<VK16, v16i1, Val>;
defm D : avx512_mask_setop<VK32, v32i1, Val>;
defm Q : avx512_mask_setop<VK64, v64i1, Val>;
@@ -2642,9 +2554,11 @@ let Predicates = [HasAVX512] in {
def : Pat<(v8i1 immAllOnesV), (COPY_TO_REGCLASS (KSET1W), VK8)>;
def : Pat<(v4i1 immAllOnesV), (COPY_TO_REGCLASS (KSET1W), VK4)>;
def : Pat<(v2i1 immAllOnesV), (COPY_TO_REGCLASS (KSET1W), VK2)>;
- def : Pat<(i1 0), (COPY_TO_REGCLASS (KSET0W), VK1)>;
- def : Pat<(i1 1), (COPY_TO_REGCLASS (KSHIFTRWri (KSET1W), (i8 15)), VK1)>;
- def : Pat<(i1 -1), (COPY_TO_REGCLASS (KSHIFTRWri (KSET1W), (i8 15)), VK1)>;
+ let AddedComplexity = 10 in { // To optimize isel table.
+ def : Pat<(i1 0), (COPY_TO_REGCLASS (KSET0W), VK1)>;
+ def : Pat<(i1 1), (COPY_TO_REGCLASS (KSHIFTRWri (KSET1W), (i8 15)), VK1)>;
+ def : Pat<(i1 -1), (COPY_TO_REGCLASS (KSHIFTRWri (KSET1W), (i8 15)), VK1)>;
+ }
}
// Patterns for kmask insert_subvector/extract_subvector to/from index=0
@@ -2695,12 +2609,12 @@ def : Pat<(v32i1 (extract_subvector (v64i1 VK64:$src), (iPTR 32))),
// Patterns for kmask shift
multiclass mask_shift_lowering<RegisterClass RC, ValueType VT> {
- def : Pat<(VT (X86vshli RC:$src, (i8 imm:$imm))),
+ def : Pat<(VT (X86kshiftl RC:$src, (i8 imm:$imm))),
(VT (COPY_TO_REGCLASS
(KSHIFTLWri (COPY_TO_REGCLASS RC:$src, VK16),
(I8Imm $imm)),
RC))>;
- def : Pat<(VT (X86vsrli RC:$src, (i8 imm:$imm))),
+ def : Pat<(VT (X86kshiftr RC:$src, (i8 imm:$imm))),
(VT (COPY_TO_REGCLASS
(KSHIFTRWri (COPY_TO_REGCLASS RC:$src, VK16),
(I8Imm $imm)),
@@ -2738,7 +2652,7 @@ multiclass avx512_load<bits<8> opc, string OpcodeStr, X86VectorVTInfo _,
[(set _.RC:$dst, (_.VT (bitconvert (ld_frag addr:$src))))],
_.ExeDomain>, EVEX;
- let Constraints = "$src0 = $dst" in {
+ let Constraints = "$src0 = $dst", isConvertibleToThreeAddress = 1 in {
def rrk : AVX512PI<opc, MRMSrcReg, (outs _.RC:$dst),
(ins _.RC:$src0, _.KRCWM:$mask, _.RC:$src1),
!strconcat(OpcodeStr, "\t{$src1, ${dst} {${mask}}|",
@@ -3160,6 +3074,10 @@ def VMOV64toSDZrr : AVX512BI<0x6E, MRMSrcReg, (outs FR64X:$dst), (ins GR64:$src)
"vmovq\t{$src, $dst|$dst, $src}",
[(set FR64X:$dst, (bitconvert GR64:$src))],
IIC_SSE_MOVDQ>, EVEX, VEX_W, Sched<[WriteMove]>;
+def VMOV64toSDZrm : AVX512XSI<0x7E, MRMSrcMem, (outs FR64X:$dst), (ins i64mem:$src),
+ "vmovq\t{$src, $dst|$dst, $src}",
+ [(set FR64X:$dst, (bitconvert (loadi64 addr:$src)))]>,
+ EVEX, VEX_W, EVEX_CD8<8, CD8VT8>;
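A sketch of what the new load form catches (assuming x86-64 with -mavx512f): an i64 load whose only use is a bitconvert to f64 can now stay a single vector-domain VMOVQ:

  #include <string.h>
  double load_as_double_bits(const long long *p) {
    long long v = *p;
    double d;
    memcpy(&d, &v, sizeof d); // bitconvert (loadi64 addr): VMOVQ xmm, m64
    return d;
  }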
def VMOVSDto64Zrr : AVX512BI<0x7E, MRMDestReg, (outs GR64:$dst), (ins FR64X:$src),
"vmovq\t{$src, $dst|$dst, $src}",
[(set GR64:$dst, (bitconvert FR64X:$src))],
@@ -3272,20 +3190,22 @@ multiclass avx512_move_scalar<string asm, SDNode OpNode,
(scalar_to_vector _.FRC:$src2))))],
_.ExeDomain,IIC_SSE_MOV_S_RR>, EVEX_4V;
def rrkz : AVX512PI<0x10, MRMSrcReg, (outs _.RC:$dst),
- (ins _.KRCWM:$mask, _.RC:$src1, _.RC:$src2),
+ (ins _.KRCWM:$mask, _.RC:$src1, _.FRC:$src2),
!strconcat(asm, "\t{$src2, $src1, $dst {${mask}} {z}|",
"$dst {${mask}} {z}, $src1, $src2}"),
[(set _.RC:$dst, (_.VT (X86selects _.KRCWM:$mask,
- (_.VT (OpNode _.RC:$src1, _.RC:$src2)),
+ (_.VT (OpNode _.RC:$src1,
+ (scalar_to_vector _.FRC:$src2))),
_.ImmAllZerosV)))],
_.ExeDomain,IIC_SSE_MOV_S_RR>, EVEX_4V, EVEX_KZ;
let Constraints = "$src0 = $dst" in
def rrk : AVX512PI<0x10, MRMSrcReg, (outs _.RC:$dst),
- (ins _.RC:$src0, _.KRCWM:$mask, _.RC:$src1, _.RC:$src2),
+ (ins _.RC:$src0, _.KRCWM:$mask, _.RC:$src1, _.FRC:$src2),
!strconcat(asm, "\t{$src2, $src1, $dst {${mask}}|",
"$dst {${mask}}, $src1, $src2}"),
[(set _.RC:$dst, (_.VT (X86selects _.KRCWM:$mask,
- (_.VT (OpNode _.RC:$src1, _.RC:$src2)),
+ (_.VT (OpNode _.RC:$src1,
+ (scalar_to_vector _.FRC:$src2))),
(_.VT _.RC:$src0))))],
_.ExeDomain,IIC_SSE_MOV_S_RR>, EVEX_4V, EVEX_K;
let canFoldAsLoad = 1, isReMaterializable = 1 in
@@ -3335,8 +3255,7 @@ def : Pat<(_.VT (OpNode _.RC:$src0,
(COPY_TO_REGCLASS (!cast<Instruction>(InstrStr#rrk)
(COPY_TO_REGCLASS _.FRC:$src2, _.RC),
(COPY_TO_REGCLASS GR32:$mask, VK1WM),
- (_.VT _.RC:$src0),
- (COPY_TO_REGCLASS _.FRC:$src1, _.RC)),
+ (_.VT _.RC:$src0), _.FRC:$src1),
_.RC)>;
def : Pat<(_.VT (OpNode _.RC:$src0,
@@ -3346,10 +3265,8 @@ def : Pat<(_.VT (OpNode _.RC:$src0,
(_.EltVT ZeroFP))))))),
(COPY_TO_REGCLASS (!cast<Instruction>(InstrStr#rrkz)
(COPY_TO_REGCLASS GR32:$mask, VK1WM),
- (_.VT _.RC:$src0),
- (COPY_TO_REGCLASS _.FRC:$src1, _.RC)),
+ (_.VT _.RC:$src0), _.FRC:$src1),
_.RC)>;
-
}
multiclass avx512_store_scalar_lowering<string InstrStr, AVX512VLVectorVTInfo _,
@@ -3359,14 +3276,31 @@ def : Pat<(masked_store addr:$dst, Mask,
(_.info512.VT (insert_subvector undef,
(_.info256.VT (insert_subvector undef,
(_.info128.VT _.info128.RC:$src),
- (i64 0))),
- (i64 0)))),
+ (iPTR 0))),
+ (iPTR 0)))),
(!cast<Instruction>(InstrStr#mrk) addr:$dst,
(i1 (COPY_TO_REGCLASS MaskRC:$mask, VK1WM)),
(COPY_TO_REGCLASS _.info128.RC:$src, _.info128.FRC))>;
}
+multiclass avx512_store_scalar_lowering_subreg<string InstrStr,
+ AVX512VLVectorVTInfo _,
+ dag Mask, RegisterClass MaskRC,
+ SubRegIndex subreg> {
+
+def : Pat<(masked_store addr:$dst, Mask,
+ (_.info512.VT (insert_subvector undef,
+ (_.info256.VT (insert_subvector undef,
+ (_.info128.VT _.info128.RC:$src),
+ (iPTR 0))),
+ (iPTR 0)))),
+ (!cast<Instruction>(InstrStr#mrk) addr:$dst,
+ (i1 (COPY_TO_REGCLASS (i32 (INSERT_SUBREG (IMPLICIT_DEF), MaskRC:$mask, subreg)), VK1WM)),
+ (COPY_TO_REGCLASS _.info128.RC:$src, _.info128.FRC))>;
+
+}
+
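A hedged C sketch of the masked scalar store this multiclass lowers (assuming -mavx512f); the GR8/GR16 mask is first widened to GR32 with INSERT_SUBREG, matching the pattern above:

  #include <immintrin.h>
  void store_low(float *p, unsigned char k, __m128 v) {
    _mm_mask_store_ss(p, (__mmask8)k, v); // VMOVSS m32 {k1}, xmm
  }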
multiclass avx512_load_scalar_lowering<string InstrStr, AVX512VLVectorVTInfo _,
dag Mask, RegisterClass MaskRC> {
@@ -3374,7 +3308,7 @@ def : Pat<(_.info128.VT (extract_subvector
(_.info512.VT (masked_load addr:$srcAddr, Mask,
(_.info512.VT (bitconvert
(v16i32 immAllZerosV))))),
- (i64 0))),
+ (iPTR 0))),
(!cast<Instruction>(InstrStr#rmkz)
(i1 (COPY_TO_REGCLASS MaskRC:$mask, VK1WM)),
addr:$srcAddr)>;
@@ -3384,53 +3318,81 @@ def : Pat<(_.info128.VT (extract_subvector
(_.info512.VT (insert_subvector undef,
(_.info256.VT (insert_subvector undef,
(_.info128.VT (X86vzmovl _.info128.RC:$src)),
- (i64 0))),
- (i64 0))))),
- (i64 0))),
+ (iPTR 0))),
+ (iPTR 0))))),
+ (iPTR 0))),
(!cast<Instruction>(InstrStr#rmk) _.info128.RC:$src,
(i1 (COPY_TO_REGCLASS MaskRC:$mask, VK1WM)),
addr:$srcAddr)>;
}
+multiclass avx512_load_scalar_lowering_subreg<string InstrStr,
+ AVX512VLVectorVTInfo _,
+ dag Mask, RegisterClass MaskRC,
+ SubRegIndex subreg> {
+
+def : Pat<(_.info128.VT (extract_subvector
+ (_.info512.VT (masked_load addr:$srcAddr, Mask,
+ (_.info512.VT (bitconvert
+ (v16i32 immAllZerosV))))),
+ (iPTR 0))),
+ (!cast<Instruction>(InstrStr#rmkz)
+ (i1 (COPY_TO_REGCLASS (i32 (INSERT_SUBREG (IMPLICIT_DEF), MaskRC:$mask, subreg)), VK1WM)),
+ addr:$srcAddr)>;
+
+def : Pat<(_.info128.VT (extract_subvector
+ (_.info512.VT (masked_load addr:$srcAddr, Mask,
+ (_.info512.VT (insert_subvector undef,
+ (_.info256.VT (insert_subvector undef,
+ (_.info128.VT (X86vzmovl _.info128.RC:$src)),
+ (iPTR 0))),
+ (iPTR 0))))),
+ (iPTR 0))),
+ (!cast<Instruction>(InstrStr#rmk) _.info128.RC:$src,
+ (i1 (COPY_TO_REGCLASS (i32 (INSERT_SUBREG (IMPLICIT_DEF), MaskRC:$mask, subreg)), VK1WM)),
+ addr:$srcAddr)>;
+
+}
+
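And the matching masked scalar load, again as a sketch under -mavx512f:

  #include <immintrin.h>
  __m128 load_low_z(const float *p, unsigned char k) {
    return _mm_maskz_load_ss((__mmask8)k, p); // VMOVSS xmm {k1}{z}, m32
  }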
defm : avx512_move_scalar_lowering<"VMOVSSZ", X86Movss, fp32imm0, v4f32x_info>;
defm : avx512_move_scalar_lowering<"VMOVSDZ", X86Movsd, fp64imm0, v2f64x_info>;
defm : avx512_store_scalar_lowering<"VMOVSSZ", avx512vl_f32_info,
(v16i1 (bitconvert (i16 (trunc (and GR32:$mask, (i32 1)))))), GR32>;
-defm : avx512_store_scalar_lowering<"VMOVSSZ", avx512vl_f32_info,
- (v16i1 (bitconvert (i16 (and GR16:$mask, (i16 1))))), GR16>;
-defm : avx512_store_scalar_lowering<"VMOVSDZ", avx512vl_f64_info,
- (v8i1 (bitconvert (i8 (and GR8:$mask, (i8 1))))), GR8>;
+defm : avx512_store_scalar_lowering_subreg<"VMOVSSZ", avx512vl_f32_info,
+ (v16i1 (bitconvert (i16 (and GR16:$mask, (i16 1))))), GR16, sub_16bit>;
+defm : avx512_store_scalar_lowering_subreg<"VMOVSDZ", avx512vl_f64_info,
+ (v8i1 (bitconvert (i8 (and GR8:$mask, (i8 1))))), GR8, sub_8bit>;
defm : avx512_load_scalar_lowering<"VMOVSSZ", avx512vl_f32_info,
(v16i1 (bitconvert (i16 (trunc (and GR32:$mask, (i32 1)))))), GR32>;
-defm : avx512_load_scalar_lowering<"VMOVSSZ", avx512vl_f32_info,
- (v16i1 (bitconvert (i16 (and GR16:$mask, (i16 1))))), GR16>;
-defm : avx512_load_scalar_lowering<"VMOVSDZ", avx512vl_f64_info,
- (v8i1 (bitconvert (i8 (and GR8:$mask, (i8 1))))), GR8>;
+defm : avx512_load_scalar_lowering_subreg<"VMOVSSZ", avx512vl_f32_info,
+ (v16i1 (bitconvert (i16 (and GR16:$mask, (i16 1))))), GR16, sub_16bit>;
+defm : avx512_load_scalar_lowering_subreg<"VMOVSDZ", avx512vl_f64_info,
+ (v8i1 (bitconvert (i8 (and GR8:$mask, (i8 1))))), GR8, sub_8bit>;
def : Pat<(f32 (X86selects VK1WM:$mask, (f32 FR32X:$src1), (f32 FR32X:$src2))),
(COPY_TO_REGCLASS (VMOVSSZrrk (COPY_TO_REGCLASS FR32X:$src2, VR128X),
- VK1WM:$mask, (v4f32 (IMPLICIT_DEF)),(COPY_TO_REGCLASS FR32X:$src1, VR128X)), FR32X)>;
+ VK1WM:$mask, (v4f32 (IMPLICIT_DEF)), FR32X:$src1), FR32X)>;
def : Pat<(f64 (X86selects VK1WM:$mask, (f64 FR64X:$src1), (f64 FR64X:$src2))),
(COPY_TO_REGCLASS (VMOVSDZrrk (COPY_TO_REGCLASS FR64X:$src2, VR128X),
- VK1WM:$mask, (v2f64 (IMPLICIT_DEF)), (COPY_TO_REGCLASS FR64X:$src1, VR128X)), FR64X)>;
+ VK1WM:$mask, (v2f64 (IMPLICIT_DEF)), FR64X:$src1), FR64X)>;
def : Pat<(int_x86_avx512_mask_store_ss addr:$dst, VR128X:$src, GR8:$mask),
- (VMOVSSZmrk addr:$dst, (i1 (COPY_TO_REGCLASS GR8:$mask, VK1WM)),
+ (VMOVSSZmrk addr:$dst, (i1 (COPY_TO_REGCLASS (i32 (INSERT_SUBREG (IMPLICIT_DEF), GR8:$mask, sub_8bit)), VK1WM)),
(COPY_TO_REGCLASS VR128X:$src, FR32X))>;
let hasSideEffects = 0 in
defm VMOVSSZrr_REV : AVX512_maskable_in_asm<0x11, MRMDestReg, f32x_info,
- (outs VR128X:$dst), (ins VR128X:$src1, VR128X:$src2),
+ (outs VR128X:$dst), (ins VR128X:$src1, FR32X:$src2),
"vmovss.s", "$src2, $src1", "$src1, $src2", []>,
XS, EVEX_4V, VEX_LIG;
let hasSideEffects = 0 in
-defm VMOVSSDrr_REV : AVX512_maskable_in_asm<0x11, MRMDestReg, f64x_info,
- (outs VR128X:$dst), (ins VR128X:$src1, VR128X:$src2),
+defm VMOVSDZrr_REV : AVX512_maskable_in_asm<0x11, MRMDestReg, f64x_info,
+ (outs VR128X:$dst), (ins VR128X:$src1, FR64X:$src2),
"vmovsd.s", "$src2, $src1", "$src1, $src2", []>,
XD, EVEX_4V, VEX_LIG, VEX_W;
@@ -3439,31 +3401,31 @@ let Predicates = [HasAVX512] in {
// Move a scalar to XMM zero-extended: zero a VR128X, then do a
// MOVS{S,D} into the lower bits.
def : Pat<(v4f32 (X86vzmovl (v4f32 (scalar_to_vector FR32X:$src)))),
- (VMOVSSZrr (v4f32 (V_SET0)), FR32X:$src)>;
+ (VMOVSSZrr (v4f32 (AVX512_128_SET0)), FR32X:$src)>;
def : Pat<(v4f32 (X86vzmovl (v4f32 VR128X:$src))),
- (VMOVSSZrr (v4f32 (V_SET0)), (COPY_TO_REGCLASS VR128X:$src, FR32X))>;
+ (VMOVSSZrr (v4f32 (AVX512_128_SET0)), (COPY_TO_REGCLASS VR128X:$src, FR32X))>;
def : Pat<(v4i32 (X86vzmovl (v4i32 VR128X:$src))),
- (VMOVSSZrr (v4i32 (V_SET0)), (COPY_TO_REGCLASS VR128X:$src, FR32X))>;
+ (VMOVSSZrr (v4i32 (AVX512_128_SET0)), (COPY_TO_REGCLASS VR128X:$src, FR32X))>;
def : Pat<(v2f64 (X86vzmovl (v2f64 (scalar_to_vector FR64X:$src)))),
- (VMOVSDZrr (v2f64 (V_SET0)), FR64X:$src)>;
+ (VMOVSDZrr (v2f64 (AVX512_128_SET0)), FR64X:$src)>;
}
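A minimal C sketch of the zero-extending move described above (assuming -mavx512f):

  #include <immintrin.h>
  __m128 scalar_to_vec(float x) {
    // Zeroed register, then MOVSS into the low lane; with this patch the
    // zero idiom is AVX512_128_SET0 instead of the SSE V_SET0.
    return _mm_set_ss(x);
  }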
// Move low f32 and clear high bits.
def : Pat<(v8f32 (X86vzmovl (v8f32 VR256X:$src))),
(SUBREG_TO_REG (i32 0),
- (VMOVSSZrr (v4f32 (V_SET0)),
+ (VMOVSSZrr (v4f32 (AVX512_128_SET0)),
(EXTRACT_SUBREG (v8f32 VR256X:$src), sub_xmm)), sub_xmm)>;
def : Pat<(v8i32 (X86vzmovl (v8i32 VR256X:$src))),
(SUBREG_TO_REG (i32 0),
- (VMOVSSZrr (v4i32 (V_SET0)),
+ (VMOVSSZrr (v4i32 (AVX512_128_SET0)),
(EXTRACT_SUBREG (v8i32 VR256X:$src), sub_xmm)), sub_xmm)>;
def : Pat<(v16f32 (X86vzmovl (v16f32 VR512:$src))),
(SUBREG_TO_REG (i32 0),
- (VMOVSSZrr (v4f32 (V_SET0)),
+ (VMOVSSZrr (v4f32 (AVX512_128_SET0)),
(EXTRACT_SUBREG (v16f32 VR512:$src), sub_xmm)), sub_xmm)>;
def : Pat<(v16i32 (X86vzmovl (v16i32 VR512:$src))),
(SUBREG_TO_REG (i32 0),
- (VMOVSSZrr (v4i32 (V_SET0)),
+ (VMOVSSZrr (v4i32 (AVX512_128_SET0)),
(EXTRACT_SUBREG (v16i32 VR512:$src), sub_xmm)), sub_xmm)>;
let AddedComplexity = 20 in {
@@ -3525,11 +3487,11 @@ let Predicates = [HasAVX512] in {
}
def : Pat<(v8f32 (X86vzmovl (insert_subvector undef,
(v4f32 (scalar_to_vector FR32X:$src)), (iPTR 0)))),
- (SUBREG_TO_REG (i32 0), (v4f32 (VMOVSSZrr (v4f32 (V_SET0)),
+ (SUBREG_TO_REG (i32 0), (v4f32 (VMOVSSZrr (v4f32 (AVX512_128_SET0)),
FR32X:$src)), sub_xmm)>;
def : Pat<(v4f64 (X86vzmovl (insert_subvector undef,
(v2f64 (scalar_to_vector FR64X:$src)), (iPTR 0)))),
- (SUBREG_TO_REG (i64 0), (v2f64 (VMOVSDZrr (v2f64 (V_SET0)),
+ (SUBREG_TO_REG (i64 0), (v2f64 (VMOVSDZrr (v2f64 (AVX512_128_SET0)),
FR64X:$src)), sub_xmm)>;
def : Pat<(v4i64 (X86vzmovl (insert_subvector undef,
(v2i64 (scalar_to_vector (loadi64 addr:$src))), (iPTR 0)))),
@@ -3538,18 +3500,18 @@ let Predicates = [HasAVX512] in {
// Move low f64 and clear high bits.
def : Pat<(v4f64 (X86vzmovl (v4f64 VR256X:$src))),
(SUBREG_TO_REG (i32 0),
- (VMOVSDZrr (v2f64 (V_SET0)),
+ (VMOVSDZrr (v2f64 (AVX512_128_SET0)),
(EXTRACT_SUBREG (v4f64 VR256X:$src), sub_xmm)), sub_xmm)>;
def : Pat<(v8f64 (X86vzmovl (v8f64 VR512:$src))),
(SUBREG_TO_REG (i32 0),
- (VMOVSDZrr (v2f64 (V_SET0)),
+ (VMOVSDZrr (v2f64 (AVX512_128_SET0)),
(EXTRACT_SUBREG (v8f64 VR512:$src), sub_xmm)), sub_xmm)>;
def : Pat<(v4i64 (X86vzmovl (v4i64 VR256X:$src))),
- (SUBREG_TO_REG (i32 0), (VMOVSDZrr (v2i64 (V_SET0)),
+ (SUBREG_TO_REG (i32 0), (VMOVSDZrr (v2i64 (AVX512_128_SET0)),
(EXTRACT_SUBREG (v4i64 VR256X:$src), sub_xmm)), sub_xmm)>;
def : Pat<(v8i64 (X86vzmovl (v8i64 VR512:$src))),
- (SUBREG_TO_REG (i32 0), (VMOVSDZrr (v2i64 (V_SET0)),
+ (SUBREG_TO_REG (i32 0), (VMOVSDZrr (v2i64 (AVX512_128_SET0)),
(EXTRACT_SUBREG (v8i64 VR512:$src), sub_xmm)), sub_xmm)>;
// Extract and store.
@@ -3582,10 +3544,6 @@ let Predicates = [HasAVX512] in {
(VMOVSDZrr VR128X:$src1, (COPY_TO_REGCLASS VR128X:$src2, FR64X))>;
def : Pat<(v2f64 (X86Movsd VR128X:$src1, VR128X:$src2)),
(VMOVSDZrr VR128X:$src1, (COPY_TO_REGCLASS VR128X:$src2, FR64X))>;
- def : Pat<(v4f32 (X86Movsd VR128X:$src1, VR128X:$src2)),
- (VMOVSDZrr VR128X:$src1, (COPY_TO_REGCLASS VR128X:$src2, FR64X))>;
- def : Pat<(v4i32 (X86Movsd VR128X:$src1, VR128X:$src2)),
- (VMOVSDZrr VR128X:$src1, (COPY_TO_REGCLASS VR128X:$src2, FR64X))>;
// 256-bit variants
def : Pat<(v4i64 (X86Movsd VR256X:$src1, VR256X:$src2)),
@@ -3635,6 +3593,8 @@ let Predicates = [HasAVX512] in {
}
// AVX 128-bit movd/movq instructions write zeros in the high 128-bit part.
let AddedComplexity = 20 in {
+ def : Pat<(v2i64 (X86vzmovl (v2i64 (scalar_to_vector (zextloadi64i32 addr:$src))))),
+ (VMOVDI2PDIZrm addr:$src)>;
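The new zextloadi64i32 pattern corresponds to C like the following (a sketch, assuming x86-64 with -mavx512f), where one VMOVD both loads the i32 and zero-fills the rest of the vector:

  #include <immintrin.h>
  __m128i widen_u32(const unsigned *p) {
    return _mm_cvtsi64_si128((long long)*p); // ideally one VMOVD xmm, m32
  }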
def : Pat<(v4i32 (X86vzmovl (v4i32 (scalar_to_vector (loadi32 addr:$src))))),
(VMOVDI2PDIZrm addr:$src)>;
def : Pat<(v4i32 (X86vzmovl (bc_v4i32 (loadv4f32 addr:$src)))),
@@ -3669,42 +3629,26 @@ let Predicates = [HasAVX512] in {
def : Pat<(v8i64 (X86vzload addr:$src)),
(SUBREG_TO_REG (i64 0), (VMOVQI2PQIZrm addr:$src), sub_xmm)>;
}
-
-def : Pat<(v16i32 (X86Vinsert (v16i32 immAllZerosV), GR32:$src2, (iPTR 0))),
- (SUBREG_TO_REG (i32 0), (VMOVDI2PDIZrr GR32:$src2), sub_xmm)>;
-
-def : Pat<(v8i64 (X86Vinsert (bc_v8i64 (v16i32 immAllZerosV)), GR64:$src2, (iPTR 0))),
- (SUBREG_TO_REG (i32 0), (VMOV64toPQIZrr GR64:$src2), sub_xmm)>;
-
-def : Pat<(v16i32 (X86Vinsert undef, GR32:$src2, (iPTR 0))),
- (SUBREG_TO_REG (i32 0), (VMOVDI2PDIZrr GR32:$src2), sub_xmm)>;
-
-def : Pat<(v8i64 (X86Vinsert undef, GR64:$src2, (iPTR 0))),
- (SUBREG_TO_REG (i32 0), (VMOV64toPQIZrr GR64:$src2), sub_xmm)>;
-
//===----------------------------------------------------------------------===//
// AVX-512 - Non-temporals
//===----------------------------------------------------------------------===//
let SchedRW = [WriteLoad] in {
def VMOVNTDQAZrm : AVX512PI<0x2A, MRMSrcMem, (outs VR512:$dst),
(ins i512mem:$src), "vmovntdqa\t{$src, $dst|$dst, $src}",
- [(set VR512:$dst, (int_x86_avx512_movntdqa addr:$src))],
- SSEPackedInt>, EVEX, T8PD, EVEX_V512,
+ [], SSEPackedInt>, EVEX, T8PD, EVEX_V512,
EVEX_CD8<64, CD8VF>;
let Predicates = [HasVLX] in {
def VMOVNTDQAZ256rm : AVX512PI<0x2A, MRMSrcMem, (outs VR256X:$dst),
(ins i256mem:$src),
"vmovntdqa\t{$src, $dst|$dst, $src}",
- [(set VR256X:$dst, (int_x86_avx2_movntdqa addr:$src))],
- SSEPackedInt>, EVEX, T8PD, EVEX_V256,
+ [], SSEPackedInt>, EVEX, T8PD, EVEX_V256,
EVEX_CD8<64, CD8VF>;
def VMOVNTDQAZ128rm : AVX512PI<0x2A, MRMSrcMem, (outs VR128X:$dst),
(ins i128mem:$src),
"vmovntdqa\t{$src, $dst|$dst, $src}",
- [(set VR128X:$dst, (int_x86_sse41_movntdqa addr:$src))],
- SSEPackedInt>, EVEX, T8PD, EVEX_V128,
+ [], SSEPackedInt>, EVEX, T8PD, EVEX_V128,
EVEX_CD8<64, CD8VF>;
}
}
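The explicit intrinsic patterns are dropped here, presumably because the movntdqa intrinsics are handled before isel; C-level non-temporal loads still select these definitions. A sketch (assuming -mavx512f):

  #include <immintrin.h>
  __m512i nt_load(const void *p) {
    return _mm512_stream_load_si512((void *)p); // VMOVNTDQA zmm, m512
  }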
@@ -4150,8 +4094,7 @@ let Predicates = [HasDQI, NoVLX] in {
//===----------------------------------------------------------------------===//
multiclass avx512_logic_rm<bits<8> opc, string OpcodeStr, SDNode OpNode,
- X86VectorVTInfo _, OpndItins itins,
- bit IsCommutable = 0> {
+ X86VectorVTInfo _, bit IsCommutable = 0> {
defm rr : AVX512_maskable_logic<opc, MRMSrcReg, _, (outs _.RC:$dst),
(ins _.RC:$src1, _.RC:$src2), OpcodeStr,
"$src2, $src1", "$src1, $src2",
@@ -4159,7 +4102,7 @@ multiclass avx512_logic_rm<bits<8> opc, string OpcodeStr, SDNode OpNode,
(bitconvert (_.VT _.RC:$src2)))),
(_.VT (bitconvert (_.i64VT (OpNode _.RC:$src1,
_.RC:$src2)))),
- itins.rr, IsCommutable>,
+ IIC_SSE_BIT_P_RR, IsCommutable>,
AVX512BIBase, EVEX_4V;
defm rm : AVX512_maskable_logic<opc, MRMSrcMem, _, (outs _.RC:$dst),
@@ -4169,14 +4112,13 @@ multiclass avx512_logic_rm<bits<8> opc, string OpcodeStr, SDNode OpNode,
(bitconvert (_.LdFrag addr:$src2)))),
(_.VT (bitconvert (_.i64VT (OpNode _.RC:$src1,
(bitconvert (_.LdFrag addr:$src2)))))),
- itins.rm>,
+ IIC_SSE_BIT_P_RM>,
AVX512BIBase, EVEX_4V;
}
multiclass avx512_logic_rmb<bits<8> opc, string OpcodeStr, SDNode OpNode,
- X86VectorVTInfo _, OpndItins itins,
- bit IsCommutable = 0> :
- avx512_logic_rm<opc, OpcodeStr, OpNode, _, itins, IsCommutable> {
+ X86VectorVTInfo _, bit IsCommutable = 0> :
+ avx512_logic_rm<opc, OpcodeStr, OpNode, _, IsCommutable> {
defm rmb : AVX512_maskable_logic<opc, MRMSrcMem, _, (outs _.RC:$dst),
(ins _.RC:$src1, _.ScalarMemOp:$src2), OpcodeStr,
"${src2}"##_.BroadcastStr##", $src1",
@@ -4189,58 +4131,48 @@ multiclass avx512_logic_rmb<bits<8> opc, string OpcodeStr, SDNode OpNode,
(bitconvert
(_.VT (X86VBroadcast
(_.ScalarLdFrag addr:$src2)))))))),
- itins.rm>,
+ IIC_SSE_BIT_P_RM>,
AVX512BIBase, EVEX_4V, EVEX_B;
}
multiclass avx512_logic_rmb_vl<bits<8> opc, string OpcodeStr, SDNode OpNode,
- AVX512VLVectorVTInfo VTInfo, OpndItins itins,
- Predicate prd, bit IsCommutable = 0> {
- let Predicates = [prd] in
- defm Z : avx512_logic_rmb<opc, OpcodeStr, OpNode, VTInfo.info512, itins,
+ AVX512VLVectorVTInfo VTInfo,
+ bit IsCommutable = 0> {
+ let Predicates = [HasAVX512] in
+ defm Z : avx512_logic_rmb<opc, OpcodeStr, OpNode, VTInfo.info512,
IsCommutable>, EVEX_V512;
- let Predicates = [prd, HasVLX] in {
- defm Z256 : avx512_logic_rmb<opc, OpcodeStr, OpNode, VTInfo.info256, itins,
+ let Predicates = [HasAVX512, HasVLX] in {
+ defm Z256 : avx512_logic_rmb<opc, OpcodeStr, OpNode, VTInfo.info256,
IsCommutable>, EVEX_V256;
- defm Z128 : avx512_logic_rmb<opc, OpcodeStr, OpNode, VTInfo.info128, itins,
+ defm Z128 : avx512_logic_rmb<opc, OpcodeStr, OpNode, VTInfo.info128,
IsCommutable>, EVEX_V128;
}
}
multiclass avx512_logic_rm_vl_d<bits<8> opc, string OpcodeStr, SDNode OpNode,
- OpndItins itins, Predicate prd,
bit IsCommutable = 0> {
defm NAME : avx512_logic_rmb_vl<opc, OpcodeStr, OpNode, avx512vl_i32_info,
- itins, prd, IsCommutable>, EVEX_CD8<32, CD8VF>;
+ IsCommutable>, EVEX_CD8<32, CD8VF>;
}
multiclass avx512_logic_rm_vl_q<bits<8> opc, string OpcodeStr, SDNode OpNode,
- OpndItins itins, Predicate prd,
bit IsCommutable = 0> {
defm NAME : avx512_logic_rmb_vl<opc, OpcodeStr, OpNode, avx512vl_i64_info,
- itins, prd, IsCommutable>,
- VEX_W, EVEX_CD8<64, CD8VF>;
+ IsCommutable>,
+ VEX_W, EVEX_CD8<64, CD8VF>;
}
multiclass avx512_logic_rm_vl_dq<bits<8> opc_d, bits<8> opc_q, string OpcodeStr,
- SDNode OpNode, OpndItins itins, Predicate prd,
- bit IsCommutable = 0> {
- defm Q : avx512_logic_rm_vl_q<opc_q, OpcodeStr#"q", OpNode, itins, prd,
- IsCommutable>;
-
- defm D : avx512_logic_rm_vl_d<opc_d, OpcodeStr#"d", OpNode, itins, prd,
- IsCommutable>;
+ SDNode OpNode, bit IsCommutable = 0> {
+ defm Q : avx512_logic_rm_vl_q<opc_q, OpcodeStr#"q", OpNode, IsCommutable>;
+ defm D : avx512_logic_rm_vl_d<opc_d, OpcodeStr#"d", OpNode, IsCommutable>;
}
-defm VPAND : avx512_logic_rm_vl_dq<0xDB, 0xDB, "vpand", and,
- SSE_INTALU_ITINS_P, HasAVX512, 1>;
-defm VPOR : avx512_logic_rm_vl_dq<0xEB, 0xEB, "vpor", or,
- SSE_INTALU_ITINS_P, HasAVX512, 1>;
-defm VPXOR : avx512_logic_rm_vl_dq<0xEF, 0xEF, "vpxor", xor,
- SSE_INTALU_ITINS_P, HasAVX512, 1>;
-defm VPANDN : avx512_logic_rm_vl_dq<0xDF, 0xDF, "vpandn", X86andnp,
- SSE_INTALU_ITINS_P, HasAVX512, 0>;
+defm VPAND : avx512_logic_rm_vl_dq<0xDB, 0xDB, "vpand", and, 1>;
+defm VPOR : avx512_logic_rm_vl_dq<0xEB, 0xEB, "vpor", or, 1>;
+defm VPXOR : avx512_logic_rm_vl_dq<0xEF, 0xEF, "vpxor", xor, 1>;
+defm VPANDN : avx512_logic_rm_vl_dq<0xDF, 0xDF, "vpandn", X86andnp>;
//===----------------------------------------------------------------------===//
// AVX-512 FP arithmetic
@@ -4252,16 +4184,16 @@ multiclass avx512_fp_scalar<bits<8> opc, string OpcodeStr,X86VectorVTInfo _,
defm rr_Int : AVX512_maskable_scalar<opc, MRMSrcReg, _, (outs _.RC:$dst),
(ins _.RC:$src1, _.RC:$src2), OpcodeStr,
"$src2, $src1", "$src1, $src2",
- (VecNode (_.VT _.RC:$src1), (_.VT _.RC:$src2),
- (i32 FROUND_CURRENT)),
+ (_.VT (VecNode _.RC:$src1, _.RC:$src2,
+ (i32 FROUND_CURRENT))),
itins.rr>;
defm rm_Int : AVX512_maskable_scalar<opc, MRMSrcMem, _, (outs _.RC:$dst),
- (ins _.RC:$src1, _.ScalarMemOp:$src2), OpcodeStr,
+ (ins _.RC:$src1, _.IntScalarMemOp:$src2), OpcodeStr,
"$src2, $src1", "$src1, $src2",
- (VecNode (_.VT _.RC:$src1),
- (_.VT (scalar_to_vector (_.ScalarLdFrag addr:$src2))),
- (i32 FROUND_CURRENT)),
+ (_.VT (VecNode _.RC:$src1,
+ _.ScalarIntMemCPat:$src2,
+ (i32 FROUND_CURRENT))),
itins.rm>;
let isCodeGenOnly = 1, Predicates = [HasAVX512] in {
def rr : I< opc, MRMSrcReg, (outs _.FRC:$dst),
@@ -4291,13 +4223,43 @@ multiclass avx512_fp_scalar_round<bits<8> opc, string OpcodeStr,X86VectorVTInfo
EVEX_B, EVEX_RC;
}
multiclass avx512_fp_scalar_sae<bits<8> opc, string OpcodeStr,X86VectorVTInfo _,
- SDNode VecNode, OpndItins itins, bit IsCommutable> {
- let ExeDomain = _.ExeDomain in
+ SDNode OpNode, SDNode VecNode, SDNode SaeNode,
+ OpndItins itins, bit IsCommutable> {
+ let ExeDomain = _.ExeDomain in {
+ defm rr_Int : AVX512_maskable_scalar<opc, MRMSrcReg, _, (outs _.RC:$dst),
+ (ins _.RC:$src1, _.RC:$src2), OpcodeStr,
+ "$src2, $src1", "$src1, $src2",
+ (_.VT (VecNode _.RC:$src1, _.RC:$src2)),
+ itins.rr>;
+
+ defm rm_Int : AVX512_maskable_scalar<opc, MRMSrcMem, _, (outs _.RC:$dst),
+ (ins _.RC:$src1, _.IntScalarMemOp:$src2), OpcodeStr,
+ "$src2, $src1", "$src1, $src2",
+ (_.VT (VecNode _.RC:$src1,
+ _.ScalarIntMemCPat:$src2)),
+ itins.rm>;
+
+ let isCodeGenOnly = 1, Predicates = [HasAVX512] in {
+ def rr : I< opc, MRMSrcReg, (outs _.FRC:$dst),
+ (ins _.FRC:$src1, _.FRC:$src2),
+ OpcodeStr#"\t{$src2, $src1, $dst|$dst, $src1, $src2}",
+ [(set _.FRC:$dst, (OpNode _.FRC:$src1, _.FRC:$src2))],
+ itins.rr> {
+ let isCommutable = IsCommutable;
+ }
+ def rm : I< opc, MRMSrcMem, (outs _.FRC:$dst),
+ (ins _.FRC:$src1, _.ScalarMemOp:$src2),
+ OpcodeStr#"\t{$src2, $src1, $dst|$dst, $src1, $src2}",
+ [(set _.FRC:$dst, (OpNode _.FRC:$src1,
+ (_.ScalarLdFrag addr:$src2)))], itins.rm>;
+ }
+
defm rrb : AVX512_maskable_scalar<opc, MRMSrcReg, _, (outs _.RC:$dst),
(ins _.RC:$src1, _.RC:$src2), OpcodeStr,
"{sae}, $src2, $src1", "$src1, $src2, {sae}",
- (VecNode (_.VT _.RC:$src1), (_.VT _.RC:$src2),
+ (SaeNode (_.VT _.RC:$src1), (_.VT _.RC:$src2),
(i32 FROUND_NO_EXC))>, EVEX_B;
+ }
}
multiclass avx512_binop_s_round<bits<8> opc, string OpcodeStr, SDNode OpNode,
@@ -4316,31 +4278,29 @@ multiclass avx512_binop_s_round<bits<8> opc, string OpcodeStr, SDNode OpNode,
}
multiclass avx512_binop_s_sae<bits<8> opc, string OpcodeStr, SDNode OpNode,
- SDNode VecNode,
+ SDNode VecNode, SDNode SaeNode,
SizeItins itins, bit IsCommutable> {
- defm SSZ : avx512_fp_scalar<opc, OpcodeStr#"ss", f32x_info, OpNode, VecNode,
- itins.s, IsCommutable>,
- avx512_fp_scalar_sae<opc, OpcodeStr#"ss", f32x_info, VecNode,
- itins.s, IsCommutable>,
+ defm SSZ : avx512_fp_scalar_sae<opc, OpcodeStr#"ss", f32x_info, OpNode,
+ VecNode, SaeNode, itins.s, IsCommutable>,
XS, EVEX_4V, VEX_LIG, EVEX_CD8<32, CD8VT1>;
- defm SDZ : avx512_fp_scalar<opc, OpcodeStr#"sd", f64x_info, OpNode, VecNode,
- itins.d, IsCommutable>,
- avx512_fp_scalar_sae<opc, OpcodeStr#"sd", f64x_info, VecNode,
- itins.d, IsCommutable>,
+ defm SDZ : avx512_fp_scalar_sae<opc, OpcodeStr#"sd", f64x_info, OpNode,
+ VecNode, SaeNode, itins.d, IsCommutable>,
XD, VEX_W, EVEX_4V, VEX_LIG, EVEX_CD8<64, CD8VT1>;
}
-defm VADD : avx512_binop_s_round<0x58, "vadd", fadd, X86faddRnd, SSE_ALU_ITINS_S, 1>;
-defm VMUL : avx512_binop_s_round<0x59, "vmul", fmul, X86fmulRnd, SSE_MUL_ITINS_S, 1>;
-defm VSUB : avx512_binop_s_round<0x5C, "vsub", fsub, X86fsubRnd, SSE_ALU_ITINS_S, 0>;
-defm VDIV : avx512_binop_s_round<0x5E, "vdiv", fdiv, X86fdivRnd, SSE_DIV_ITINS_S, 0>;
-defm VMIN : avx512_binop_s_sae <0x5D, "vmin", X86fmin, X86fminRnd, SSE_ALU_ITINS_S, 0>;
-defm VMAX : avx512_binop_s_sae <0x5F, "vmax", X86fmax, X86fmaxRnd, SSE_ALU_ITINS_S, 0>;
+defm VADD : avx512_binop_s_round<0x58, "vadd", fadd, X86faddRnds, SSE_ALU_ITINS_S, 1>;
+defm VMUL : avx512_binop_s_round<0x59, "vmul", fmul, X86fmulRnds, SSE_MUL_ITINS_S, 1>;
+defm VSUB : avx512_binop_s_round<0x5C, "vsub", fsub, X86fsubRnds, SSE_ALU_ITINS_S, 0>;
+defm VDIV : avx512_binop_s_round<0x5E, "vdiv", fdiv, X86fdivRnds, SSE_DIV_ITINS_S, 0>;
+defm VMIN : avx512_binop_s_sae <0x5D, "vmin", X86fmin, X86fmins, X86fminRnds,
+ SSE_ALU_ITINS_S, 0>;
+defm VMAX : avx512_binop_s_sae <0x5F, "vmax", X86fmax, X86fmaxs, X86fmaxRnds,
+ SSE_ALU_ITINS_S, 0>;
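A brief C sketch of the scalar form (assuming -mavx512f); the comment below explains the commutability caveat:

  #include <immintrin.h>
  __m128 min_low(__m128 a, __m128 b) {
    // MINSS returns the second operand when the inputs are unordered or
    // are oppositely-signed zeros, so the operand order is significant.
    return _mm_min_ss(a, b); // VMINSS
  }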
// MIN/MAX nodes are commutable under "unsafe-fp-math". In this case we use
// X86fminc and X86fmaxc instead of X86fmin and X86fmax.
multiclass avx512_comutable_binop_s<bits<8> opc, string OpcodeStr,
X86VectorVTInfo _, SDNode OpNode, OpndItins itins> {
- let isCodeGenOnly = 1, Predicates = [HasAVX512] in {
+ let isCodeGenOnly = 1, Predicates = [HasAVX512], ExeDomain = _.ExeDomain in {
def rr : I< opc, MRMSrcReg, (outs _.FRC:$dst),
(ins _.FRC:$src1, _.FRC:$src2),
OpcodeStr#"\t{$src2, $src1, $dst|$dst, $src1, $src2}",
@@ -4598,6 +4558,7 @@ let Predicates = [HasVLX,HasDQI] in {
multiclass avx512_fp_scalef_p<bits<8> opc, string OpcodeStr, SDNode OpNode,
X86VectorVTInfo _> {
+ let ExeDomain = _.ExeDomain in {
defm rr: AVX512_maskable<opc, MRMSrcReg, _, (outs _.RC:$dst),
(ins _.RC:$src1, _.RC:$src2), OpcodeStr##_.Suffix,
"$src2, $src1", "$src1, $src2",
@@ -4613,10 +4574,12 @@ multiclass avx512_fp_scalef_p<bits<8> opc, string OpcodeStr, SDNode OpNode,
(OpNode _.RC:$src1, (_.VT (X86VBroadcast
(_.ScalarLdFrag addr:$src2))), (i32 FROUND_CURRENT))>,
EVEX_4V, EVEX_B;
+ }
}
multiclass avx512_fp_scalef_scalar<bits<8> opc, string OpcodeStr, SDNode OpNode,
X86VectorVTInfo _> {
+ let ExeDomain = _.ExeDomain in {
defm rr: AVX512_maskable_scalar<opc, MRMSrcReg, _, (outs _.RC:$dst),
(ins _.RC:$src1, _.RC:$src2), OpcodeStr##_.Suffix,
"$src2, $src1", "$src1, $src2",
@@ -4627,6 +4590,7 @@ multiclass avx512_fp_scalef_scalar<bits<8> opc, string OpcodeStr, SDNode OpNode,
(OpNode _.RC:$src1,
(_.VT (scalar_to_vector (_.ScalarLdFrag addr:$src2))),
(i32 FROUND_CURRENT))>;
+ }
}
multiclass avx512_fp_scalef_all<bits<8> opc, bits<8> opcScaler, string OpcodeStr, SDNode OpNode, SDNode OpNodeScal> {
@@ -4899,6 +4863,33 @@ defm VPSLL : avx512_shift_types<0xF2, 0xF3, 0xF1, "vpsll", X86vshl>;
defm VPSRA : avx512_shift_types<0xE2, 0xE2, 0xE1, "vpsra", X86vsra>;
defm VPSRL : avx512_shift_types<0xD2, 0xD3, 0xD1, "vpsrl", X86vsrl>;
+// Use the 512-bit VPSRA/VPSRAI versions to implement v2i64/v4i64 when VLX is unavailable.
+let Predicates = [HasAVX512, NoVLX] in {
+ def : Pat<(v4i64 (X86vsra (v4i64 VR256X:$src1), (v2i64 VR128X:$src2))),
+ (EXTRACT_SUBREG (v8i64
+ (VPSRAQZrr
+ (v8i64 (INSERT_SUBREG (IMPLICIT_DEF), VR256X:$src1, sub_ymm)),
+ VR128X:$src2)), sub_ymm)>;
+
+ def : Pat<(v2i64 (X86vsra (v2i64 VR128X:$src1), (v2i64 VR128X:$src2))),
+ (EXTRACT_SUBREG (v8i64
+ (VPSRAQZrr
+ (v8i64 (INSERT_SUBREG (IMPLICIT_DEF), VR128X:$src1, sub_xmm)),
+ VR128X:$src2)), sub_xmm)>;
+
+ def : Pat<(v4i64 (X86vsrai (v4i64 VR256X:$src1), (i8 imm:$src2))),
+ (EXTRACT_SUBREG (v8i64
+ (VPSRAQZri
+ (v8i64 (INSERT_SUBREG (IMPLICIT_DEF), VR256X:$src1, sub_ymm)),
+ imm:$src2)), sub_ymm)>;
+
+ def : Pat<(v2i64 (X86vsrai (v2i64 VR128X:$src1), (i8 imm:$src2))),
+ (EXTRACT_SUBREG (v8i64
+ (VPSRAQZri
+ (v8i64 (INSERT_SUBREG (IMPLICIT_DEF), VR128X:$src1, sub_xmm)),
+ imm:$src2)), sub_xmm)>;
+}
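In C terms (a sketch, assuming Clang or GCC with -mavx512f but not -mavx512vl), the widening these patterns perform looks like:

  // AVX/AVX2 have no 64-bit arithmetic right shift; with AVX512F alone
  // the ymm operand is inserted into a zmm, VPSRAQ runs, and the low
  // ymm is extracted again.
  typedef long long v4i64 __attribute__((vector_size(32)));
  v4i64 sra64(v4i64 v) { return v >> 3; }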
+
//===-------------------------------------------------------------------===//
// Variable Bit Shifts
//===-------------------------------------------------------------------===//
@@ -4932,6 +4923,7 @@ multiclass avx512_var_shift_mb<bits<8> opc, string OpcodeStr, SDNode OpNode,
SSE_INTSHIFT_ITINS_P.rm>, AVX5128IBase, EVEX_B,
EVEX_4V, EVEX_CD8<_.EltSize, CD8VF>;
}
+
multiclass avx512_var_shift_sizes<bits<8> opc, string OpcodeStr, SDNode OpNode,
AVX512VLVectorVTInfo _> {
let Predicates = [HasAVX512] in
@@ -4955,12 +4947,13 @@ multiclass avx512_var_shift_types<bits<8> opc, string OpcodeStr,
}
// Use the 512-bit version to implement the 128/256-bit variants when VLX is unavailable.
-multiclass avx512_var_shift_w_lowering<AVX512VLVectorVTInfo _, SDNode OpNode> {
- let Predicates = [HasBWI, NoVLX] in {
+multiclass avx512_var_shift_lowering<AVX512VLVectorVTInfo _, string OpcodeStr,
+ SDNode OpNode, list<Predicate> p> {
+ let Predicates = p in {
def : Pat<(_.info256.VT (OpNode (_.info256.VT _.info256.RC:$src1),
(_.info256.VT _.info256.RC:$src2))),
(EXTRACT_SUBREG
- (!cast<Instruction>(NAME#"WZrr")
+ (!cast<Instruction>(OpcodeStr#"Zrr")
(INSERT_SUBREG (_.info512.VT (IMPLICIT_DEF)), VR256X:$src1, sub_ymm),
(INSERT_SUBREG (_.info512.VT (IMPLICIT_DEF)), VR256X:$src2, sub_ymm)),
sub_ymm)>;
@@ -4968,13 +4961,12 @@ multiclass avx512_var_shift_w_lowering<AVX512VLVectorVTInfo _, SDNode OpNode> {
def : Pat<(_.info128.VT (OpNode (_.info128.VT _.info128.RC:$src1),
(_.info128.VT _.info128.RC:$src2))),
(EXTRACT_SUBREG
- (!cast<Instruction>(NAME#"WZrr")
+ (!cast<Instruction>(OpcodeStr#"Zrr")
(INSERT_SUBREG (_.info512.VT (IMPLICIT_DEF)), VR128X:$src1, sub_xmm),
(INSERT_SUBREG (_.info512.VT (IMPLICIT_DEF)), VR128X:$src2, sub_xmm)),
sub_xmm)>;
}
}
-
multiclass avx512_var_shift_w<bits<8> opc, string OpcodeStr,
SDNode OpNode> {
let Predicates = [HasBWI] in
@@ -4990,19 +4982,22 @@ multiclass avx512_var_shift_w<bits<8> opc, string OpcodeStr,
}
defm VPSLLV : avx512_var_shift_types<0x47, "vpsllv", shl>,
- avx512_var_shift_w<0x12, "vpsllvw", shl>,
- avx512_var_shift_w_lowering<avx512vl_i16_info, shl>;
+ avx512_var_shift_w<0x12, "vpsllvw", shl>;
defm VPSRAV : avx512_var_shift_types<0x46, "vpsrav", sra>,
- avx512_var_shift_w<0x11, "vpsravw", sra>,
- avx512_var_shift_w_lowering<avx512vl_i16_info, sra>;
+ avx512_var_shift_w<0x11, "vpsravw", sra>;
defm VPSRLV : avx512_var_shift_types<0x45, "vpsrlv", srl>,
- avx512_var_shift_w<0x10, "vpsrlvw", srl>,
- avx512_var_shift_w_lowering<avx512vl_i16_info, srl>;
+ avx512_var_shift_w<0x10, "vpsrlvw", srl>;
+
defm VPRORV : avx512_var_shift_types<0x14, "vprorv", rotr>;
defm VPROLV : avx512_var_shift_types<0x15, "vprolv", rotl>;
+defm : avx512_var_shift_lowering<avx512vl_i64_info, "VPSRAVQ", sra, [HasAVX512, NoVLX]>;
+defm : avx512_var_shift_lowering<avx512vl_i16_info, "VPSLLVW", shl, [HasBWI, NoVLX]>;
+defm : avx512_var_shift_lowering<avx512vl_i16_info, "VPSRAVW", sra, [HasBWI, NoVLX]>;
+defm : avx512_var_shift_lowering<avx512vl_i16_info, "VPSRLVW", srl, [HasBWI, NoVLX]>;
+
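A sketch of the same widening trick for the variable word shifts (assuming -mavx512bw without -mavx512vl):

  typedef short v16i16 __attribute__((vector_size(32)));
  v16i16 var_shl(v16i16 a, v16i16 b) {
    return a << b; // widened to VPSLLVW zmm, then extracted back to ymm
  }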
// Special handling for VPSRAV intrinsics.
multiclass avx512_var_shift_int_lowering<string InstrStr, X86VectorVTInfo _,
list<Predicate> p> {
@@ -5013,7 +5008,6 @@ multiclass avx512_var_shift_int_lowering<string InstrStr, X86VectorVTInfo _,
def : Pat<(_.VT (X86vsrav _.RC:$src1, (bitconvert (_.LdFrag addr:$src2)))),
(!cast<Instruction>(InstrStr#_.ZSuffix##rm)
_.RC:$src1, addr:$src2)>;
- let AddedComplexity = 20 in {
def : Pat<(_.VT (vselect _.KRCWM:$mask,
(X86vsrav _.RC:$src1, _.RC:$src2), _.RC:$src0)),
(!cast<Instruction>(InstrStr#_.ZSuffix#rrk) _.RC:$src0,
@@ -5023,8 +5017,6 @@ multiclass avx512_var_shift_int_lowering<string InstrStr, X86VectorVTInfo _,
_.RC:$src0)),
(!cast<Instruction>(InstrStr#_.ZSuffix##rmk) _.RC:$src0,
_.KRC:$mask, _.RC:$src1, addr:$src2)>;
- }
- let AddedComplexity = 30 in {
def : Pat<(_.VT (vselect _.KRCWM:$mask,
(X86vsrav _.RC:$src1, _.RC:$src2), _.ImmAllZerosV)),
(!cast<Instruction>(InstrStr#_.ZSuffix#rrkz) _.KRC:$mask,
@@ -5034,7 +5026,6 @@ multiclass avx512_var_shift_int_lowering<string InstrStr, X86VectorVTInfo _,
_.ImmAllZerosV)),
(!cast<Instruction>(InstrStr#_.ZSuffix##rmkz) _.KRC:$mask,
_.RC:$src1, addr:$src2)>;
- }
}
}
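A small C sketch of the X86vsrav node these patterns serve (assuming -mavx2 for the unmasked form; the masked variants come from the vselect patterns above):

  #include <immintrin.h>
  __m256i srav(__m256i a, __m256i counts) {
    return _mm256_srav_epi32(a, counts); // VPSRAVD
  }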
@@ -5046,14 +5037,12 @@ multiclass avx512_var_shift_int_lowering_mb<string InstrStr, X86VectorVTInfo _,
(X86VBroadcast (_.ScalarLdFrag addr:$src2)))),
(!cast<Instruction>(InstrStr#_.ZSuffix##rmb)
_.RC:$src1, addr:$src2)>;
- let AddedComplexity = 20 in
def : Pat<(_.VT (vselect _.KRCWM:$mask,
(X86vsrav _.RC:$src1,
(X86VBroadcast (_.ScalarLdFrag addr:$src2))),
_.RC:$src0)),
(!cast<Instruction>(InstrStr#_.ZSuffix##rmbk) _.RC:$src0,
_.KRC:$mask, _.RC:$src1, addr:$src2)>;
- let AddedComplexity = 30 in
def : Pat<(_.VT (vselect _.KRCWM:$mask,
(X86vsrav _.RC:$src1,
(X86VBroadcast (_.ScalarLdFrag addr:$src2))),
@@ -5251,6 +5240,7 @@ let Predicates = [HasAVX512] in {
//===----------------------------------------------------------------------===//
multiclass avx512_mov_hilo_packed<bits<8> opc, string OpcodeStr, SDNode OpNode,
X86VectorVTInfo _> {
+ let ExeDomain = _.ExeDomain in
def rm : AVX512<opc, MRMSrcMem, (outs _.RC:$dst),
(ins _.RC:$src1, f64mem:$src2),
!strconcat(OpcodeStr,
@@ -5599,7 +5589,7 @@ multiclass avx512_fma3s_common<bits<8> opc, string OpcodeStr, X86VectorVTInfo _,
"$src3, $src2", "$src2, $src3", RHS_VEC_r, 1, 1>, AVX512FMA3Base;
defm m_Int: AVX512_maskable_3src_scalar<opc, MRMSrcMem, _, (outs _.RC:$dst),
- (ins _.RC:$src2, _.ScalarMemOp:$src3), OpcodeStr,
+ (ins _.RC:$src2, _.IntScalarMemOp:$src3), OpcodeStr,
"$src3, $src2", "$src2, $src3", RHS_VEC_m, 1, 1>, AVX512FMA3Base;
defm rb_Int: AVX512_maskable_3src_scalar<opc, MRMSrcReg, _, (outs _.RC:$dst),
@@ -5625,13 +5615,13 @@ multiclass avx512_fma3s_common<bits<8> opc, string OpcodeStr, X86VectorVTInfo _,
multiclass avx512_fma3s_all<bits<8> opc213, bits<8> opc231, bits<8> opc132,
string OpcodeStr, SDNode OpNode, SDNode OpNodeRnds1,
SDNode OpNodeRnds3, X86VectorVTInfo _ , string SUFF> {
-
+ let ExeDomain = _.ExeDomain in {
defm NAME#213#SUFF#Z: avx512_fma3s_common<opc213, OpcodeStr#"213"#_.Suffix , _ ,
// Operands for the intrinsic are in 1-2-3 order to preserve passthru
// semantics.
(_.VT (OpNodeRnds1 _.RC:$src1, _.RC:$src2, _.RC:$src3, (i32 FROUND_CURRENT))),
(_.VT (OpNodeRnds1 _.RC:$src1, _.RC:$src2,
- (_.VT (scalar_to_vector(_.ScalarLdFrag addr:$src3))), (i32 FROUND_CURRENT))),
+ _.ScalarIntMemCPat:$src3, (i32 FROUND_CURRENT))),
(_.VT (OpNodeRnds1 _.RC:$src1, _.RC:$src2, _.RC:$src3,
(i32 imm:$rc))),
(set _.FRC:$dst, (_.EltVT (OpNode _.FRC:$src2, _.FRC:$src1,
@@ -5641,8 +5631,7 @@ multiclass avx512_fma3s_all<bits<8> opc213, bits<8> opc231, bits<8> opc132,
defm NAME#231#SUFF#Z: avx512_fma3s_common<opc231, OpcodeStr#"231"#_.Suffix , _ ,
(_.VT (OpNodeRnds3 _.RC:$src2, _.RC:$src3, _.RC:$src1, (i32 FROUND_CURRENT))),
- (_.VT (OpNodeRnds3 _.RC:$src2,
- (_.VT (scalar_to_vector(_.ScalarLdFrag addr:$src3))),
+ (_.VT (OpNodeRnds3 _.RC:$src2, _.ScalarIntMemCPat:$src3,
_.RC:$src1, (i32 FROUND_CURRENT))),
(_.VT ( OpNodeRnds3 _.RC:$src2, _.RC:$src3, _.RC:$src1,
(i32 imm:$rc))),
@@ -5653,8 +5642,7 @@ multiclass avx512_fma3s_all<bits<8> opc213, bits<8> opc231, bits<8> opc132,
defm NAME#132#SUFF#Z: avx512_fma3s_common<opc132, OpcodeStr#"132"#_.Suffix , _ ,
(_.VT (OpNodeRnds1 _.RC:$src1, _.RC:$src3, _.RC:$src2, (i32 FROUND_CURRENT))),
- (_.VT (OpNodeRnds1 _.RC:$src1,
- (_.VT (scalar_to_vector(_.ScalarLdFrag addr:$src3))),
+ (_.VT (OpNodeRnds1 _.RC:$src1, _.ScalarIntMemCPat:$src3,
_.RC:$src2, (i32 FROUND_CURRENT))),
(_.VT (OpNodeRnds1 _.RC:$src1, _.RC:$src3, _.RC:$src2,
(i32 imm:$rc))),
@@ -5662,6 +5650,7 @@ multiclass avx512_fma3s_all<bits<8> opc213, bits<8> opc231, bits<8> opc132,
_.FRC:$src2))),
(set _.FRC:$dst, (_.EltVT (OpNode _.FRC:$src1,
(_.ScalarLdFrag addr:$src3), _.FRC:$src2)))>;
+ }
}
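The 1-2-3 operand order called out above is visible at the C level (a sketch, assuming -mavx512f): when the mask bit is clear, the result keeps the low element of the first source.

  #include <immintrin.h>
  __m128 fma_mask(__m128 a, __mmask8 k, __m128 b, __m128 c) {
    // k[0] == 0 keeps a's low element: exactly the passthru that the
    // pattern's operand order preserves.
    return _mm_mask_fmadd_ss(a, k, b, c); // VFMADD213SS {k1}
  }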
multiclass avx512_fma3s<bits<8> opc213, bits<8> opc231, bits<8> opc132,
@@ -5692,6 +5681,7 @@ defm VFNMSUB : avx512_fma3s<0xAF, 0xBF, 0x9F, "vfnmsub", X86Fnmsub,
let Constraints = "$src1 = $dst" in {
multiclass avx512_pmadd52_rm<bits<8> opc, string OpcodeStr, SDNode OpNode,
X86VectorVTInfo _> {
+ let ExeDomain = _.ExeDomain in {
defm r: AVX512_maskable_3src<opc, MRMSrcReg, _, (outs _.RC:$dst),
(ins _.RC:$src2, _.RC:$src3),
OpcodeStr, "$src3, $src2", "$src2, $src3",
@@ -5711,6 +5701,7 @@ multiclass avx512_pmadd52_rm<bits<8> opc, string OpcodeStr, SDNode OpNode,
(OpNode _.RC:$src1,
_.RC:$src2,(_.VT (X86VBroadcast (_.ScalarLdFrag addr:$src3))))>,
AVX512FMA3Base, EVEX_B;
+ }
}
} // Constraints = "$src1 = $dst"
@@ -5878,10 +5869,10 @@ multiclass avx512_cvt_s_int_round<bits<8> opc, X86VectorVTInfo SrcVT ,
!strconcat(asm,"\t{$rc, $src, $dst|$dst, $src, $rc}"),
[(set DstVT.RC:$dst, (OpNode (SrcVT.VT SrcVT.RC:$src),(i32 imm:$rc)))]>,
EVEX, VEX_LIG, EVEX_B, EVEX_RC;
- def rm : SI<opc, MRMSrcMem, (outs DstVT.RC:$dst), (ins SrcVT.ScalarMemOp:$src),
+ def rm : SI<opc, MRMSrcMem, (outs DstVT.RC:$dst), (ins SrcVT.IntScalarMemOp:$src),
!strconcat(asm,"\t{$src, $dst|$dst, $src}"),
[(set DstVT.RC:$dst, (OpNode
- (SrcVT.VT (scalar_to_vector (SrcVT.ScalarLdFrag addr:$src))),
+ (SrcVT.VT SrcVT.ScalarIntMemCPat:$src),
(i32 FROUND_CURRENT)))]>,
EVEX, VEX_LIG;
} // Predicates = [HasAVX512]
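A C sketch of the scalar conversion with explicit rounding that these defs cover (assuming -mavx512f); the IntScalarMemOp change lets the memory form fold a genuine scalar load:

  #include <immintrin.h>
  int cvt_rn(__m128 a) {
    return _mm_cvt_roundss_si32(a, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC);
  }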
@@ -5918,20 +5909,20 @@ defm VCVTSD2USI64Z: avx512_cvt_s_int_round<0x79, f64x_info, i64x_info,
let Predicates = [HasAVX512] in {
def : Pat<(i32 (int_x86_sse_cvtss2si (v4f32 VR128X:$src))),
(VCVTSS2SIZrr VR128X:$src)>;
- def : Pat<(i32 (int_x86_sse_cvtss2si (sse_load_f32 addr:$src))),
- (VCVTSS2SIZrm addr:$src)>;
+ def : Pat<(i32 (int_x86_sse_cvtss2si sse_load_f32:$src)),
+ (VCVTSS2SIZrm sse_load_f32:$src)>;
def : Pat<(i64 (int_x86_sse_cvtss2si64 (v4f32 VR128X:$src))),
(VCVTSS2SI64Zrr VR128X:$src)>;
- def : Pat<(i64 (int_x86_sse_cvtss2si64 (sse_load_f32 addr:$src))),
- (VCVTSS2SI64Zrm addr:$src)>;
+ def : Pat<(i64 (int_x86_sse_cvtss2si64 sse_load_f32:$src)),
+ (VCVTSS2SI64Zrm sse_load_f32:$src)>;
def : Pat<(i32 (int_x86_sse2_cvtsd2si (v2f64 VR128X:$src))),
(VCVTSD2SIZrr VR128X:$src)>;
- def : Pat<(i32 (int_x86_sse2_cvtsd2si (sse_load_f64 addr:$src))),
- (VCVTSD2SIZrm addr:$src)>;
+ def : Pat<(i32 (int_x86_sse2_cvtsd2si sse_load_f64:$src)),
+ (VCVTSD2SIZrm sse_load_f64:$src)>;
def : Pat<(i64 (int_x86_sse2_cvtsd2si64 (v2f64 VR128X:$src))),
(VCVTSD2SI64Zrr VR128X:$src)>;
- def : Pat<(i64 (int_x86_sse2_cvtsd2si64 (sse_load_f64 addr:$src))),
- (VCVTSD2SI64Zrm addr:$src)>;
+ def : Pat<(i64 (int_x86_sse2_cvtsd2si64 sse_load_f64:$src)),
+ (VCVTSD2SI64Zrm sse_load_f64:$src)>;
} // HasAVX512
let Predicates = [HasAVX512] in {
@@ -6018,7 +6009,7 @@ let Predicates = [HasAVX512] in {
EVEX,VEX_LIG , EVEX_B;
let mayLoad = 1, hasSideEffects = 0 in
def rm_Int : AVX512<opc, MRMSrcMem, (outs _DstRC.RC:$dst),
- (ins _SrcRC.MemOp:$src),
+ (ins _SrcRC.IntScalarMemOp:$src),
!strconcat(asm,"\t{$src, $dst|$dst, $src}"),
[]>, EVEX, VEX_LIG;
@@ -6055,47 +6046,58 @@ defm VCVTTSD2USI64Z: avx512_cvt_s_all<0x78, "vcvttsd2usi", f64x_info, i64x_info,
let Predicates = [HasAVX512] in {
def : Pat<(i32 (int_x86_sse_cvttss2si (v4f32 VR128X:$src))),
(VCVTTSS2SIZrr_Int VR128X:$src)>;
- def : Pat<(i32 (int_x86_sse_cvttss2si (sse_load_f32 addr:$src))),
- (VCVTTSS2SIZrm_Int addr:$src)>;
+ def : Pat<(i32 (int_x86_sse_cvttss2si sse_load_f32:$src)),
+ (VCVTTSS2SIZrm_Int ssmem:$src)>;
def : Pat<(i64 (int_x86_sse_cvttss2si64 (v4f32 VR128X:$src))),
(VCVTTSS2SI64Zrr_Int VR128X:$src)>;
- def : Pat<(i64 (int_x86_sse_cvttss2si64 (sse_load_f32 addr:$src))),
- (VCVTTSS2SI64Zrm_Int addr:$src)>;
+ def : Pat<(i64 (int_x86_sse_cvttss2si64 sse_load_f32:$src)),
+ (VCVTTSS2SI64Zrm_Int ssmem:$src)>;
def : Pat<(i32 (int_x86_sse2_cvttsd2si (v2f64 VR128X:$src))),
(VCVTTSD2SIZrr_Int VR128X:$src)>;
- def : Pat<(i32 (int_x86_sse2_cvttsd2si (sse_load_f64 addr:$src))),
- (VCVTTSD2SIZrm_Int addr:$src)>;
+ def : Pat<(i32 (int_x86_sse2_cvttsd2si sse_load_f64:$src)),
+ (VCVTTSD2SIZrm_Int sdmem:$src)>;
def : Pat<(i64 (int_x86_sse2_cvttsd2si64 (v2f64 VR128X:$src))),
(VCVTTSD2SI64Zrr_Int VR128X:$src)>;
- def : Pat<(i64 (int_x86_sse2_cvttsd2si64 (sse_load_f64 addr:$src))),
- (VCVTTSD2SI64Zrm_Int addr:$src)>;
+ def : Pat<(i64 (int_x86_sse2_cvttsd2si64 sse_load_f64:$src)),
+ (VCVTTSD2SI64Zrm_Int sdmem:$src)>;
} // HasAVX512
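The ComplexPattern operands (sse_load_f32/sse_load_f64) let the truncating conversions fold partial vector loads; a sketch (assuming -mavx512f):

  #include <immintrin.h>
  int cvtt_from_mem(const float *p) {
    __m128 v = _mm_load_ss(p);   // matches sse_load_f32
    return _mm_cvttss_si32(v);   // ideally folds to VCVTTSS2SI r32, m32
  }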
//===----------------------------------------------------------------------===//
// AVX-512 Convert form float to double and back
//===----------------------------------------------------------------------===//
multiclass avx512_cvt_fp_scalar<bits<8> opc, string OpcodeStr, X86VectorVTInfo _,
X86VectorVTInfo _Src, SDNode OpNode> {
- defm rr : AVX512_maskable_scalar<opc, MRMSrcReg, _, (outs _.RC:$dst),
+ defm rr_Int : AVX512_maskable_scalar<opc, MRMSrcReg, _, (outs _.RC:$dst),
(ins _.RC:$src1, _Src.RC:$src2), OpcodeStr,
"$src2, $src1", "$src1, $src2",
(_.VT (OpNode (_.VT _.RC:$src1),
(_Src.VT _Src.RC:$src2),
(i32 FROUND_CURRENT)))>,
EVEX_4V, VEX_LIG, Sched<[WriteCvtF2F]>;
- defm rm : AVX512_maskable_scalar<opc, MRMSrcMem, _, (outs _.RC:$dst),
- (ins _Src.RC:$src1, _Src.ScalarMemOp:$src2), OpcodeStr,
+ defm rm_Int : AVX512_maskable_scalar<opc, MRMSrcMem, _, (outs _.RC:$dst),
+ (ins _.RC:$src1, _Src.IntScalarMemOp:$src2), OpcodeStr,
"$src2, $src1", "$src1, $src2",
(_.VT (OpNode (_.VT _.RC:$src1),
- (_Src.VT (scalar_to_vector
- (_Src.ScalarLdFrag addr:$src2))),
+ (_Src.VT _Src.ScalarIntMemCPat:$src2),
(i32 FROUND_CURRENT)))>,
EVEX_4V, VEX_LIG, Sched<[WriteCvtF2FLd, ReadAfterLd]>;
+
+ let isCodeGenOnly = 1, hasSideEffects = 0 in {
+ def rr : I<opc, MRMSrcReg, (outs _.FRC:$dst),
+ (ins _.FRC:$src1, _Src.FRC:$src2),
+ OpcodeStr#"\t{$src2, $src1, $dst|$dst, $src1, $src2}", []>,
+ EVEX_4V, VEX_LIG, Sched<[WriteCvtF2F]>;
+ let mayLoad = 1 in
+ def rm : I<opc, MRMSrcMem, (outs _.FRC:$dst),
+ (ins _.FRC:$src1, _Src.ScalarMemOp:$src2),
+ OpcodeStr#"\t{$src2, $src1, $dst|$dst, $src1, $src2}", []>,
+ EVEX_4V, VEX_LIG, Sched<[WriteCvtF2FLd, ReadAfterLd]>;
+ }
}
// Scalar Conversion with SAE - suppress all exceptions
multiclass avx512_cvt_fp_sae_scalar<bits<8> opc, string OpcodeStr, X86VectorVTInfo _,
X86VectorVTInfo _Src, SDNode OpNodeRnd> {
- defm rrb : AVX512_maskable_scalar<opc, MRMSrcReg, _, (outs _.RC:$dst),
+ defm rrb_Int : AVX512_maskable_scalar<opc, MRMSrcReg, _, (outs _.RC:$dst),
(ins _.RC:$src1, _Src.RC:$src2), OpcodeStr,
"{sae}, $src2, $src1", "$src1, $src2, {sae}",
(_.VT (OpNodeRnd (_.VT _.RC:$src1),
@@ -6107,7 +6109,7 @@ multiclass avx512_cvt_fp_sae_scalar<bits<8> opc, string OpcodeStr, X86VectorVTIn
// Scalar Conversion with rounding control (RC)
multiclass avx512_cvt_fp_rc_scalar<bits<8> opc, string OpcodeStr, X86VectorVTInfo _,
X86VectorVTInfo _Src, SDNode OpNodeRnd> {
- defm rrb : AVX512_maskable_scalar<opc, MRMSrcReg, _, (outs _.RC:$dst),
+ defm rrb_Int : AVX512_maskable_scalar<opc, MRMSrcReg, _, (outs _.RC:$dst),
(ins _.RC:$src1, _Src.RC:$src2, AVX512RC:$rc), OpcodeStr,
"$rc, $src2, $src1", "$src1, $src2, $rc",
(_.VT (OpNodeRnd (_.VT _.RC:$src1),
@@ -6140,39 +6142,36 @@ defm VCVTSS2SD : avx512_cvt_fp_scalar_ss2sd<0x5A, "vcvtss2sd",
X86fpextRnd,f32x_info, f64x_info >;
def : Pat<(f64 (fpextend FR32X:$src)),
- (COPY_TO_REGCLASS (VCVTSS2SDZrr (COPY_TO_REGCLASS FR32X:$src, VR128X),
- (COPY_TO_REGCLASS FR32X:$src, VR128X)), VR128X)>,
+ (VCVTSS2SDZrr (COPY_TO_REGCLASS FR32X:$src, FR64X), FR32X:$src)>,
Requires<[HasAVX512]>;
def : Pat<(f64 (fpextend (loadf32 addr:$src))),
- (COPY_TO_REGCLASS (VCVTSS2SDZrm (v4f32 (IMPLICIT_DEF)), addr:$src), VR128X)>,
+ (VCVTSS2SDZrm (f64 (IMPLICIT_DEF)), addr:$src)>,
Requires<[HasAVX512]>;
def : Pat<(f64 (extloadf32 addr:$src)),
- (COPY_TO_REGCLASS (VCVTSS2SDZrm (v4f32 (IMPLICIT_DEF)), addr:$src), VR128X)>,
+ (VCVTSS2SDZrm (f64 (IMPLICIT_DEF)), addr:$src)>,
Requires<[HasAVX512, OptForSize]>;
def : Pat<(f64 (extloadf32 addr:$src)),
- (COPY_TO_REGCLASS (VCVTSS2SDZrr (v4f32 (IMPLICIT_DEF)),
- (COPY_TO_REGCLASS (VMOVSSZrm addr:$src), VR128X)), VR128X)>,
+ (VCVTSS2SDZrr (f64 (IMPLICIT_DEF)), (VMOVSSZrm addr:$src))>,
Requires<[HasAVX512, OptForSpeed]>;
def : Pat<(f32 (fpround FR64X:$src)),
- (COPY_TO_REGCLASS (VCVTSD2SSZrr (COPY_TO_REGCLASS FR64X:$src, VR128X),
- (COPY_TO_REGCLASS FR64X:$src, VR128X)), VR128X)>,
+ (VCVTSD2SSZrr (COPY_TO_REGCLASS FR64X:$src, FR32X), FR64X:$src)>,
Requires<[HasAVX512]>;
def : Pat<(v4f32 (X86Movss
(v4f32 VR128X:$dst),
(v4f32 (scalar_to_vector
(f32 (fpround (f64 (extractelt VR128X:$src, (iPTR 0))))))))),
- (VCVTSD2SSZrr VR128X:$dst, VR128X:$src)>,
+ (VCVTSD2SSZrr_Int VR128X:$dst, VR128X:$src)>,
Requires<[HasAVX512]>;
def : Pat<(v2f64 (X86Movsd
(v2f64 VR128X:$dst),
(v2f64 (scalar_to_vector
(f64 (fpextend (f32 (extractelt VR128X:$src, (iPTR 0))))))))),
- (VCVTSS2SDZrr VR128X:$dst, VR128X:$src)>,
+ (VCVTSS2SDZrr_Int VR128X:$dst, VR128X:$src)>,
Requires<[HasAVX512]>;
//===----------------------------------------------------------------------===//
@@ -6808,7 +6807,7 @@ let Predicates = [HasAVX512] in {
let Predicates = [HasVLX] in {
defm VCVTPS2PHZ256 : avx512_cvtps2ph<v8i16x_info, v8f32x_info, f128mem>,
EVEX, EVEX_V256, EVEX_CD8<32, CD8VH>;
- defm VCVTPS2PHZ128 : avx512_cvtps2ph<v8i16x_info, v4f32x_info, f128mem>,
+ defm VCVTPS2PHZ128 : avx512_cvtps2ph<v8i16x_info, v4f32x_info, f64mem>,
EVEX, EVEX_V128, EVEX_CD8<32, CD8VH>;
}
}
@@ -6917,7 +6916,7 @@ let Defs = [EFLAGS], Predicates = [HasAVX512] in {
/// avx512_fp14_s rcp14ss, rcp14sd, rsqrt14ss, rsqrt14sd
multiclass avx512_fp14_s<bits<8> opc, string OpcodeStr, SDNode OpNode,
X86VectorVTInfo _> {
- let AddedComplexity = 20 , Predicates = [HasAVX512] in {
+ let Predicates = [HasAVX512], ExeDomain = _.ExeDomain in {
defm rr : AVX512_maskable_scalar<opc, MRMSrcReg, _, (outs _.RC:$dst),
(ins _.RC:$src1, _.RC:$src2), OpcodeStr,
"$src2, $src1", "$src1, $src2",
@@ -6942,6 +6941,7 @@ defm VRSQRT14SD : avx512_fp14_s<0x4F, "vrsqrt14sd", X86frsqrt14s, f64x_info>,
/// avx512_fp14_p rcp14ps, rcp14pd, rsqrt14ps, rsqrt14pd
multiclass avx512_fp14_p<bits<8> opc, string OpcodeStr, SDNode OpNode,
X86VectorVTInfo _> {
+ let ExeDomain = _.ExeDomain in {
defm r: AVX512_maskable<opc, MRMSrcReg, _, (outs _.RC:$dst),
(ins _.RC:$src), OpcodeStr, "$src", "$src",
(_.FloatVT (OpNode _.RC:$src))>, EVEX, T8PD;
@@ -6955,6 +6955,7 @@ multiclass avx512_fp14_p<bits<8> opc, string OpcodeStr, SDNode OpNode,
(OpNode (_.FloatVT
(X86VBroadcast (_.ScalarLdFrag addr:$src))))>,
EVEX, T8PD, EVEX_B;
+ }
}
multiclass avx512_fp14_p_vl_all<bits<8> opc, string OpcodeStr, SDNode OpNode> {
@@ -6986,7 +6987,7 @@ defm VRCP14 : avx512_fp14_p_vl_all<0x4C, "vrcp14", X86frcp>;
/// avx512_fp28_s rcp28ss, rcp28sd, rsqrt28ss, rsqrt28sd
multiclass avx512_fp28_s<bits<8> opc, string OpcodeStr,X86VectorVTInfo _,
SDNode OpNode> {
-
+ let ExeDomain = _.ExeDomain in {
defm r : AVX512_maskable_scalar<opc, MRMSrcReg, _, (outs _.RC:$dst),
(ins _.RC:$src1, _.RC:$src2), OpcodeStr,
"$src2, $src1", "$src1, $src2",
@@ -7005,6 +7006,7 @@ multiclass avx512_fp28_s<bits<8> opc, string OpcodeStr,X86VectorVTInfo _,
(OpNode (_.VT _.RC:$src1),
(_.VT (scalar_to_vector (_.ScalarLdFrag addr:$src2))),
(i32 FROUND_CURRENT))>;
+ }
}
multiclass avx512_eri_s<bits<8> opc, string OpcodeStr, SDNode OpNode> {
@@ -7024,7 +7026,7 @@ defm VGETEXP : avx512_eri_s<0x43, "vgetexp", X86fgetexpRnds>, T8PD, EVEX_4V;
multiclass avx512_fp28_p<bits<8> opc, string OpcodeStr, X86VectorVTInfo _,
SDNode OpNode> {
-
+ let ExeDomain = _.ExeDomain in {
defm r : AVX512_maskable<opc, MRMSrcReg, _, (outs _.RC:$dst),
(ins _.RC:$src), OpcodeStr, "$src", "$src",
(OpNode (_.VT _.RC:$src), (i32 FROUND_CURRENT))>;
@@ -7041,9 +7043,11 @@ multiclass avx512_fp28_p<bits<8> opc, string OpcodeStr, X86VectorVTInfo _,
(OpNode (_.FloatVT
(X86VBroadcast (_.ScalarLdFrag addr:$src))),
(i32 FROUND_CURRENT))>, EVEX_B;
+ }
}
multiclass avx512_fp28_p_round<bits<8> opc, string OpcodeStr, X86VectorVTInfo _,
SDNode OpNode> {
+ let ExeDomain = _.ExeDomain in
defm rb : AVX512_maskable<opc, MRMSrcReg, _, (outs _.RC:$dst),
(ins _.RC:$src), OpcodeStr,
"{sae}, $src", "$src, {sae}",
@@ -7084,6 +7088,7 @@ defm VGETEXP : avx512_eri<0x42, "vgetexp", X86fgetexpRnd>,
multiclass avx512_sqrt_packed_round<bits<8> opc, string OpcodeStr,
SDNode OpNodeRnd, X86VectorVTInfo _>{
+ let ExeDomain = _.ExeDomain in
defm rb: AVX512_maskable<opc, MRMSrcReg, _, (outs _.RC:$dst),
(ins _.RC:$src, AVX512RC:$rc), OpcodeStr, "$rc, $src", "$src, $rc",
(_.VT (OpNodeRnd _.RC:$src, (i32 imm:$rc)))>,
@@ -7092,6 +7097,7 @@ multiclass avx512_sqrt_packed_round<bits<8> opc, string OpcodeStr,
multiclass avx512_sqrt_packed<bits<8> opc, string OpcodeStr,
SDNode OpNode, X86VectorVTInfo _>{
+ let ExeDomain = _.ExeDomain in {
defm r: AVX512_maskable<opc, MRMSrcReg, _, (outs _.RC:$dst),
(ins _.RC:$src), OpcodeStr, "$src", "$src",
(_.FloatVT (OpNode _.RC:$src))>, EVEX;
@@ -7106,6 +7112,7 @@ multiclass avx512_sqrt_packed<bits<8> opc, string OpcodeStr,
(OpNode (_.FloatVT
(X86VBroadcast (_.ScalarLdFrag addr:$src))))>,
EVEX, EVEX_B;
+ }
}
multiclass avx512_sqrt_packed_all<bits<8> opc, string OpcodeStr,
@@ -7143,7 +7150,7 @@ multiclass avx512_sqrt_packed_all_round<bits<8> opc, string OpcodeStr,
multiclass avx512_sqrt_scalar<bits<8> opc, string OpcodeStr,X86VectorVTInfo _,
string SUFF, SDNode OpNode, SDNode OpNodeRnd> {
-
+ let ExeDomain = _.ExeDomain in {
defm r_Int : AVX512_maskable_scalar<opc, MRMSrcReg, _, (outs _.RC:$dst),
(ins _.RC:$src1, _.RC:$src2), OpcodeStr,
"$src2, $src1", "$src1, $src2",
@@ -7176,6 +7183,7 @@ multiclass avx512_sqrt_scalar<bits<8> opc, string OpcodeStr,X86VectorVTInfo _,
(ins _.FRC:$src1, _.ScalarMemOp:$src2),
OpcodeStr#"\t{$src2, $src1, $dst|$dst, $src1, $src2}", []>;
}
+ }
def : Pat<(_.EltVT (OpNode _.FRC:$src)),
(!cast<Instruction>(NAME#SUFF#Zr)
@@ -7480,11 +7488,11 @@ multiclass avx512_extend_common<bits<8> opc, string OpcodeStr,
}
multiclass avx512_extend_BW<bits<8> opc, string OpcodeStr,
- SDPatternOperator OpNode,
+ SDPatternOperator OpNode, SDPatternOperator InVecNode,
string ExtTy,PatFrag LdFrag = !cast<PatFrag>(ExtTy#"extloadvi8")> {
let Predicates = [HasVLX, HasBWI] in {
defm Z128: avx512_extend_common<opc, OpcodeStr, v8i16x_info,
- v16i8x_info, i64mem, LdFrag, OpNode>,
+ v16i8x_info, i64mem, LdFrag, InVecNode>,
EVEX_CD8<8, CD8VH>, T8PD, EVEX_V128;
defm Z256: avx512_extend_common<opc, OpcodeStr, v16i16x_info,
@@ -7499,11 +7507,11 @@ multiclass avx512_extend_BW<bits<8> opc, string OpcodeStr,
}
multiclass avx512_extend_BD<bits<8> opc, string OpcodeStr,
- SDPatternOperator OpNode,
+ SDPatternOperator OpNode, SDPatternOperator InVecNode,
string ExtTy,PatFrag LdFrag = !cast<PatFrag>(ExtTy#"extloadvi8")> {
let Predicates = [HasVLX, HasAVX512] in {
defm Z128: avx512_extend_common<opc, OpcodeStr, v4i32x_info,
- v16i8x_info, i32mem, LdFrag, OpNode>,
+ v16i8x_info, i32mem, LdFrag, InVecNode>,
EVEX_CD8<8, CD8VQ>, T8PD, EVEX_V128;
defm Z256: avx512_extend_common<opc, OpcodeStr, v8i32x_info,
@@ -7518,11 +7526,11 @@ multiclass avx512_extend_BD<bits<8> opc, string OpcodeStr,
}
multiclass avx512_extend_BQ<bits<8> opc, string OpcodeStr,
- SDPatternOperator OpNode,
+ SDPatternOperator OpNode, SDPatternOperator InVecNode,
string ExtTy,PatFrag LdFrag = !cast<PatFrag>(ExtTy#"extloadvi8")> {
let Predicates = [HasVLX, HasAVX512] in {
defm Z128: avx512_extend_common<opc, OpcodeStr, v2i64x_info,
- v16i8x_info, i16mem, LdFrag, OpNode>,
+ v16i8x_info, i16mem, LdFrag, InVecNode>,
EVEX_CD8<8, CD8VO>, T8PD, EVEX_V128;
defm Z256: avx512_extend_common<opc, OpcodeStr, v4i64x_info,
@@ -7537,11 +7545,11 @@ multiclass avx512_extend_BQ<bits<8> opc, string OpcodeStr,
}
multiclass avx512_extend_WD<bits<8> opc, string OpcodeStr,
- SDPatternOperator OpNode,
+ SDPatternOperator OpNode, SDPatternOperator InVecNode,
string ExtTy,PatFrag LdFrag = !cast<PatFrag>(ExtTy#"extloadvi16")> {
let Predicates = [HasVLX, HasAVX512] in {
defm Z128: avx512_extend_common<opc, OpcodeStr, v4i32x_info,
- v8i16x_info, i64mem, LdFrag, OpNode>,
+ v8i16x_info, i64mem, LdFrag, InVecNode>,
EVEX_CD8<16, CD8VH>, T8PD, EVEX_V128;
defm Z256: avx512_extend_common<opc, OpcodeStr, v8i32x_info,
@@ -7556,11 +7564,11 @@ multiclass avx512_extend_WD<bits<8> opc, string OpcodeStr,
}
multiclass avx512_extend_WQ<bits<8> opc, string OpcodeStr,
- SDPatternOperator OpNode,
+ SDPatternOperator OpNode, SDPatternOperator InVecNode,
string ExtTy,PatFrag LdFrag = !cast<PatFrag>(ExtTy#"extloadvi16")> {
let Predicates = [HasVLX, HasAVX512] in {
defm Z128: avx512_extend_common<opc, OpcodeStr, v2i64x_info,
- v8i16x_info, i32mem, LdFrag, OpNode>,
+ v8i16x_info, i32mem, LdFrag, InVecNode>,
EVEX_CD8<16, CD8VQ>, T8PD, EVEX_V128;
defm Z256: avx512_extend_common<opc, OpcodeStr, v4i64x_info,
@@ -7575,12 +7583,12 @@ multiclass avx512_extend_WQ<bits<8> opc, string OpcodeStr,
}
multiclass avx512_extend_DQ<bits<8> opc, string OpcodeStr,
- SDPatternOperator OpNode,
+ SDPatternOperator OpNode, SDPatternOperator InVecNode,
string ExtTy,PatFrag LdFrag = !cast<PatFrag>(ExtTy#"extloadvi32")> {
let Predicates = [HasVLX, HasAVX512] in {
defm Z128: avx512_extend_common<opc, OpcodeStr, v2i64x_info,
- v4i32x_info, i64mem, LdFrag, OpNode>,
+ v4i32x_info, i64mem, LdFrag, InVecNode>,
EVEX_CD8<32, CD8VH>, T8PD, EVEX_V128;
defm Z256: avx512_extend_common<opc, OpcodeStr, v4i64x_info,
@@ -7594,19 +7602,19 @@ multiclass avx512_extend_DQ<bits<8> opc, string OpcodeStr,
}
}
-defm VPMOVZXBW : avx512_extend_BW<0x30, "vpmovzxbw", X86vzext, "z">;
-defm VPMOVZXBD : avx512_extend_BD<0x31, "vpmovzxbd", X86vzext, "z">;
-defm VPMOVZXBQ : avx512_extend_BQ<0x32, "vpmovzxbq", X86vzext, "z">;
-defm VPMOVZXWD : avx512_extend_WD<0x33, "vpmovzxwd", X86vzext, "z">;
-defm VPMOVZXWQ : avx512_extend_WQ<0x34, "vpmovzxwq", X86vzext, "z">;
-defm VPMOVZXDQ : avx512_extend_DQ<0x35, "vpmovzxdq", X86vzext, "z">;
+defm VPMOVZXBW : avx512_extend_BW<0x30, "vpmovzxbw", X86vzext, zext_invec, "z">;
+defm VPMOVZXBD : avx512_extend_BD<0x31, "vpmovzxbd", X86vzext, zext_invec, "z">;
+defm VPMOVZXBQ : avx512_extend_BQ<0x32, "vpmovzxbq", X86vzext, zext_invec, "z">;
+defm VPMOVZXWD : avx512_extend_WD<0x33, "vpmovzxwd", X86vzext, zext_invec, "z">;
+defm VPMOVZXWQ : avx512_extend_WQ<0x34, "vpmovzxwq", X86vzext, zext_invec, "z">;
+defm VPMOVZXDQ : avx512_extend_DQ<0x35, "vpmovzxdq", X86vzext, zext_invec, "z">;
-defm VPMOVSXBW: avx512_extend_BW<0x20, "vpmovsxbw", X86vsext, "s">;
-defm VPMOVSXBD: avx512_extend_BD<0x21, "vpmovsxbd", X86vsext, "s">;
-defm VPMOVSXBQ: avx512_extend_BQ<0x22, "vpmovsxbq", X86vsext, "s">;
-defm VPMOVSXWD: avx512_extend_WD<0x23, "vpmovsxwd", X86vsext, "s">;
-defm VPMOVSXWQ: avx512_extend_WQ<0x24, "vpmovsxwq", X86vsext, "s">;
-defm VPMOVSXDQ: avx512_extend_DQ<0x25, "vpmovsxdq", X86vsext, "s">;
+defm VPMOVSXBW: avx512_extend_BW<0x20, "vpmovsxbw", X86vsext, sext_invec, "s">;
+defm VPMOVSXBD: avx512_extend_BD<0x21, "vpmovsxbd", X86vsext, sext_invec, "s">;
+defm VPMOVSXBQ: avx512_extend_BQ<0x22, "vpmovsxbq", X86vsext, sext_invec, "s">;
+defm VPMOVSXWD: avx512_extend_WD<0x23, "vpmovsxwd", X86vsext, sext_invec, "s">;
+defm VPMOVSXWQ: avx512_extend_WQ<0x24, "vpmovsxwq", X86vsext, sext_invec, "s">;
+defm VPMOVSXDQ: avx512_extend_DQ<0x25, "vpmovsxdq", X86vsext, sext_invec, "s">;
// EXTLOAD patterns, implemented using vpmovz
multiclass avx512_ext_lowering<string InstrStr, X86VectorVTInfo To,
@@ -7649,69 +7657,69 @@ let Predicates = [HasAVX512] in {
defm : avx512_ext_lowering<"DQZ", v8i64_info, v8i32x_info, extloadvi32>;
}
-multiclass AVX512_pmovx_patterns<string OpcPrefix, string ExtTy,
- SDNode ExtOp, PatFrag ExtLoad16> {
+multiclass AVX512_pmovx_patterns<string OpcPrefix, SDNode ExtOp,
+ SDNode InVecOp, PatFrag ExtLoad16> {
// 128-bit patterns
let Predicates = [HasVLX, HasBWI] in {
- def : Pat<(v8i16 (ExtOp (bc_v16i8 (v2i64 (scalar_to_vector (loadi64 addr:$src)))))),
+ def : Pat<(v8i16 (InVecOp (bc_v16i8 (v2i64 (scalar_to_vector (loadi64 addr:$src)))))),
(!cast<I>(OpcPrefix#BWZ128rm) addr:$src)>;
- def : Pat<(v8i16 (ExtOp (bc_v16i8 (v2f64 (scalar_to_vector (loadf64 addr:$src)))))),
+ def : Pat<(v8i16 (InVecOp (bc_v16i8 (v2f64 (scalar_to_vector (loadf64 addr:$src)))))),
(!cast<I>(OpcPrefix#BWZ128rm) addr:$src)>;
- def : Pat<(v8i16 (ExtOp (v16i8 (vzmovl_v2i64 addr:$src)))),
+ def : Pat<(v8i16 (InVecOp (v16i8 (vzmovl_v2i64 addr:$src)))),
(!cast<I>(OpcPrefix#BWZ128rm) addr:$src)>;
- def : Pat<(v8i16 (ExtOp (v16i8 (vzload_v2i64 addr:$src)))),
+ def : Pat<(v8i16 (InVecOp (v16i8 (vzload_v2i64 addr:$src)))),
(!cast<I>(OpcPrefix#BWZ128rm) addr:$src)>;
- def : Pat<(v8i16 (ExtOp (bc_v16i8 (loadv2i64 addr:$src)))),
+ def : Pat<(v8i16 (InVecOp (bc_v16i8 (loadv2i64 addr:$src)))),
(!cast<I>(OpcPrefix#BWZ128rm) addr:$src)>;
}
let Predicates = [HasVLX] in {
- def : Pat<(v4i32 (ExtOp (bc_v16i8 (v4i32 (scalar_to_vector (loadi32 addr:$src)))))),
+ def : Pat<(v4i32 (InVecOp (bc_v16i8 (v4i32 (scalar_to_vector (loadi32 addr:$src)))))),
(!cast<I>(OpcPrefix#BDZ128rm) addr:$src)>;
- def : Pat<(v4i32 (ExtOp (v16i8 (vzmovl_v4i32 addr:$src)))),
+ def : Pat<(v4i32 (InVecOp (v16i8 (vzmovl_v4i32 addr:$src)))),
(!cast<I>(OpcPrefix#BDZ128rm) addr:$src)>;
- def : Pat<(v4i32 (ExtOp (v16i8 (vzload_v2i64 addr:$src)))),
+ def : Pat<(v4i32 (InVecOp (v16i8 (vzload_v2i64 addr:$src)))),
(!cast<I>(OpcPrefix#BDZ128rm) addr:$src)>;
- def : Pat<(v4i32 (ExtOp (bc_v16i8 (loadv2i64 addr:$src)))),
+ def : Pat<(v4i32 (InVecOp (bc_v16i8 (loadv2i64 addr:$src)))),
(!cast<I>(OpcPrefix#BDZ128rm) addr:$src)>;
- def : Pat<(v2i64 (ExtOp (bc_v16i8 (v4i32 (scalar_to_vector (ExtLoad16 addr:$src)))))),
+ def : Pat<(v2i64 (InVecOp (bc_v16i8 (v4i32 (scalar_to_vector (ExtLoad16 addr:$src)))))),
(!cast<I>(OpcPrefix#BQZ128rm) addr:$src)>;
- def : Pat<(v2i64 (ExtOp (v16i8 (vzmovl_v4i32 addr:$src)))),
+ def : Pat<(v2i64 (InVecOp (v16i8 (vzmovl_v4i32 addr:$src)))),
(!cast<I>(OpcPrefix#BQZ128rm) addr:$src)>;
- def : Pat<(v2i64 (ExtOp (v16i8 (vzload_v2i64 addr:$src)))),
+ def : Pat<(v2i64 (InVecOp (v16i8 (vzload_v2i64 addr:$src)))),
(!cast<I>(OpcPrefix#BQZ128rm) addr:$src)>;
- def : Pat<(v2i64 (ExtOp (bc_v16i8 (loadv2i64 addr:$src)))),
+ def : Pat<(v2i64 (InVecOp (bc_v16i8 (loadv2i64 addr:$src)))),
(!cast<I>(OpcPrefix#BQZ128rm) addr:$src)>;
- def : Pat<(v4i32 (ExtOp (bc_v8i16 (v2i64 (scalar_to_vector (loadi64 addr:$src)))))),
+ def : Pat<(v4i32 (InVecOp (bc_v8i16 (v2i64 (scalar_to_vector (loadi64 addr:$src)))))),
(!cast<I>(OpcPrefix#WDZ128rm) addr:$src)>;
- def : Pat<(v4i32 (ExtOp (bc_v8i16 (v2f64 (scalar_to_vector (loadf64 addr:$src)))))),
+ def : Pat<(v4i32 (InVecOp (bc_v8i16 (v2f64 (scalar_to_vector (loadf64 addr:$src)))))),
(!cast<I>(OpcPrefix#WDZ128rm) addr:$src)>;
- def : Pat<(v4i32 (ExtOp (v8i16 (vzmovl_v2i64 addr:$src)))),
+ def : Pat<(v4i32 (InVecOp (v8i16 (vzmovl_v2i64 addr:$src)))),
(!cast<I>(OpcPrefix#WDZ128rm) addr:$src)>;
- def : Pat<(v4i32 (ExtOp (v8i16 (vzload_v2i64 addr:$src)))),
+ def : Pat<(v4i32 (InVecOp (v8i16 (vzload_v2i64 addr:$src)))),
(!cast<I>(OpcPrefix#WDZ128rm) addr:$src)>;
- def : Pat<(v4i32 (ExtOp (bc_v8i16 (loadv2i64 addr:$src)))),
+ def : Pat<(v4i32 (InVecOp (bc_v8i16 (loadv2i64 addr:$src)))),
(!cast<I>(OpcPrefix#WDZ128rm) addr:$src)>;
- def : Pat<(v2i64 (ExtOp (bc_v8i16 (v4i32 (scalar_to_vector (loadi32 addr:$src)))))),
+ def : Pat<(v2i64 (InVecOp (bc_v8i16 (v4i32 (scalar_to_vector (loadi32 addr:$src)))))),
(!cast<I>(OpcPrefix#WQZ128rm) addr:$src)>;
- def : Pat<(v2i64 (ExtOp (v8i16 (vzmovl_v4i32 addr:$src)))),
+ def : Pat<(v2i64 (InVecOp (v8i16 (vzmovl_v4i32 addr:$src)))),
(!cast<I>(OpcPrefix#WQZ128rm) addr:$src)>;
- def : Pat<(v2i64 (ExtOp (v8i16 (vzload_v2i64 addr:$src)))),
+ def : Pat<(v2i64 (InVecOp (v8i16 (vzload_v2i64 addr:$src)))),
(!cast<I>(OpcPrefix#WQZ128rm) addr:$src)>;
- def : Pat<(v2i64 (ExtOp (bc_v8i16 (loadv2i64 addr:$src)))),
+ def : Pat<(v2i64 (InVecOp (bc_v8i16 (loadv2i64 addr:$src)))),
(!cast<I>(OpcPrefix#WQZ128rm) addr:$src)>;
- def : Pat<(v2i64 (ExtOp (bc_v4i32 (v2i64 (scalar_to_vector (loadi64 addr:$src)))))),
+ def : Pat<(v2i64 (InVecOp (bc_v4i32 (v2i64 (scalar_to_vector (loadi64 addr:$src)))))),
(!cast<I>(OpcPrefix#DQZ128rm) addr:$src)>;
- def : Pat<(v2i64 (ExtOp (bc_v4i32 (v2f64 (scalar_to_vector (loadf64 addr:$src)))))),
+ def : Pat<(v2i64 (InVecOp (bc_v4i32 (v2f64 (scalar_to_vector (loadf64 addr:$src)))))),
(!cast<I>(OpcPrefix#DQZ128rm) addr:$src)>;
- def : Pat<(v2i64 (ExtOp (v4i32 (vzmovl_v2i64 addr:$src)))),
+ def : Pat<(v2i64 (InVecOp (v4i32 (vzmovl_v2i64 addr:$src)))),
(!cast<I>(OpcPrefix#DQZ128rm) addr:$src)>;
- def : Pat<(v2i64 (ExtOp (v4i32 (vzload_v2i64 addr:$src)))),
+ def : Pat<(v2i64 (InVecOp (v4i32 (vzload_v2i64 addr:$src)))),
(!cast<I>(OpcPrefix#DQZ128rm) addr:$src)>;
- def : Pat<(v2i64 (ExtOp (bc_v4i32 (loadv2i64 addr:$src)))),
+ def : Pat<(v2i64 (InVecOp (bc_v4i32 (loadv2i64 addr:$src)))),
(!cast<I>(OpcPrefix#DQZ128rm) addr:$src)>;
}
// 256-bit patterns
@@ -7790,8 +7798,8 @@ multiclass AVX512_pmovx_patterns<string OpcPrefix, string ExtTy,
}
}
-defm : AVX512_pmovx_patterns<"VPMOVSX", "s", X86vsext, extloadi32i16>;
-defm : AVX512_pmovx_patterns<"VPMOVZX", "z", X86vzext, loadi16_anyext>;
+defm : AVX512_pmovx_patterns<"VPMOVSX", X86vsext, sext_invec, extloadi32i16>;
+defm : AVX512_pmovx_patterns<"VPMOVZX", X86vzext, zext_invec, loadi16_anyext>;
//===----------------------------------------------------------------------===//
// GATHER - SCATTER Operations
@@ -7832,7 +7840,7 @@ multiclass avx512_gather_d_ps<bits<8> dopc, bits<8> qopc,
AVX512VLVectorVTInfo _, string OpcodeStr, string SUFF> {
defm NAME##D##SUFF##Z: avx512_gather<dopc, OpcodeStr##"d", _.info512, vz512mem,
mgatherv16i32>, EVEX_V512;
- defm NAME##Q##SUFF##Z: avx512_gather<qopc, OpcodeStr##"q", _.info256, vz512mem,
+ defm NAME##Q##SUFF##Z: avx512_gather<qopc, OpcodeStr##"q", _.info256, vz256xmem,
mgatherv8i64>, EVEX_V512;
let Predicates = [HasVLX] in {
defm NAME##D##SUFF##Z256: avx512_gather<dopc, OpcodeStr##"d", _.info256,
@@ -7889,7 +7897,7 @@ multiclass avx512_scatter_d_ps<bits<8> dopc, bits<8> qopc,
AVX512VLVectorVTInfo _, string OpcodeStr, string SUFF> {
defm NAME##D##SUFF##Z: avx512_scatter<dopc, OpcodeStr##"d", _.info512, vz512mem,
mscatterv16i32>, EVEX_V512;
- defm NAME##Q##SUFF##Z: avx512_scatter<qopc, OpcodeStr##"q", _.info256, vz512mem,
+ defm NAME##Q##SUFF##Z: avx512_scatter<qopc, OpcodeStr##"q", _.info256, vz256xmem,
mscatterv8i64>, EVEX_V512;
let Predicates = [HasVLX] in {
defm NAME##D##SUFF##Z256: avx512_scatter<dopc, OpcodeStr##"d", _.info256,
@@ -7922,7 +7930,7 @@ defm VGATHERPF0DPS: avx512_gather_scatter_prefetch<0xC6, MRM1m, "vgatherpf0dps",
VK16WM, vz512mem>, EVEX_V512, EVEX_CD8<32, CD8VT1>;
defm VGATHERPF0QPS: avx512_gather_scatter_prefetch<0xC7, MRM1m, "vgatherpf0qps",
- VK8WM, vz512mem>, EVEX_V512, EVEX_CD8<64, CD8VT1>;
+ VK8WM, vz256xmem>, EVEX_V512, EVEX_CD8<64, CD8VT1>;
defm VGATHERPF0DPD: avx512_gather_scatter_prefetch<0xC6, MRM1m, "vgatherpf0dpd",
VK8WM, vy512mem>, EVEX_V512, VEX_W, EVEX_CD8<32, CD8VT1>;
@@ -7934,7 +7942,7 @@ defm VGATHERPF1DPS: avx512_gather_scatter_prefetch<0xC6, MRM2m, "vgatherpf1dps",
VK16WM, vz512mem>, EVEX_V512, EVEX_CD8<32, CD8VT1>;
defm VGATHERPF1QPS: avx512_gather_scatter_prefetch<0xC7, MRM2m, "vgatherpf1qps",
- VK8WM, vz512mem>, EVEX_V512, EVEX_CD8<64, CD8VT1>;
+ VK8WM, vz256xmem>, EVEX_V512, EVEX_CD8<64, CD8VT1>;
defm VGATHERPF1DPD: avx512_gather_scatter_prefetch<0xC6, MRM2m, "vgatherpf1dpd",
VK8WM, vy512mem>, EVEX_V512, VEX_W, EVEX_CD8<32, CD8VT1>;
@@ -7946,7 +7954,7 @@ defm VSCATTERPF0DPS: avx512_gather_scatter_prefetch<0xC6, MRM5m, "vscatterpf0dps
VK16WM, vz512mem>, EVEX_V512, EVEX_CD8<32, CD8VT1>;
defm VSCATTERPF0QPS: avx512_gather_scatter_prefetch<0xC7, MRM5m, "vscatterpf0qps",
- VK8WM, vz512mem>, EVEX_V512, EVEX_CD8<64, CD8VT1>;
+ VK8WM, vz256xmem>, EVEX_V512, EVEX_CD8<64, CD8VT1>;
defm VSCATTERPF0DPD: avx512_gather_scatter_prefetch<0xC6, MRM5m, "vscatterpf0dpd",
VK8WM, vy512mem>, EVEX_V512, VEX_W, EVEX_CD8<32, CD8VT1>;
@@ -7958,7 +7966,7 @@ defm VSCATTERPF1DPS: avx512_gather_scatter_prefetch<0xC6, MRM6m, "vscatterpf1dps
VK16WM, vz512mem>, EVEX_V512, EVEX_CD8<32, CD8VT1>;
defm VSCATTERPF1QPS: avx512_gather_scatter_prefetch<0xC7, MRM6m, "vscatterpf1qps",
- VK8WM, vz512mem>, EVEX_V512, EVEX_CD8<64, CD8VT1>;
+ VK8WM, vz256xmem>, EVEX_V512, EVEX_CD8<64, CD8VT1>;
defm VSCATTERPF1DPD: avx512_gather_scatter_prefetch<0xC6, MRM6m, "vscatterpf1dpd",
VK8WM, vy512mem>, EVEX_V512, VEX_W, EVEX_CD8<32, CD8VT1>;
@@ -7982,6 +7990,17 @@ def rr : AVX512XS8I<opc, MRMSrcReg, (outs Vec.RC:$dst), (ins Vec.KRC:$src),
[(set Vec.RC:$dst, (Vec.VT (X86vsext Vec.KRC:$src)))]>, EVEX;
}
+// Use the 512-bit version to implement the 128/256-bit variants when VLX is
+// not available.
+multiclass avx512_convert_mask_to_vector_lowering<X86VectorVTInfo X86Info,
+ X86VectorVTInfo _> {
+
+ def : Pat<(X86Info.VT (X86vsext (X86Info.KVT X86Info.KRC:$src))),
+ (X86Info.VT (EXTRACT_SUBREG
+ (_.VT (!cast<Instruction>(NAME#"Zrr")
+ (_.KVT (COPY_TO_REGCLASS X86Info.KRC:$src,_.KRC)))),
+ X86Info.SubRegIdx))>;
+}
+
multiclass cvt_mask_by_elt_width<bits<8> opc, AVX512VLVectorVTInfo VTInfo,
string OpcodeStr, Predicate prd> {
let Predicates = [prd] in
@@ -7991,20 +8010,17 @@ let Predicates = [prd] in
defm Z256 : cvt_by_vec_width<opc, VTInfo.info256, OpcodeStr>, EVEX_V256;
defm Z128 : cvt_by_vec_width<opc, VTInfo.info128, OpcodeStr>, EVEX_V128;
}
-}
+let Predicates = [prd, NoVLX] in {
+ defm Z256_Alt : avx512_convert_mask_to_vector_lowering<VTInfo.info256,VTInfo.info512>;
+ defm Z128_Alt : avx512_convert_mask_to_vector_lowering<VTInfo.info128,VTInfo.info512>;
+ }
-multiclass avx512_convert_mask_to_vector<string OpcodeStr> {
- defm NAME##B : cvt_mask_by_elt_width<0x28, avx512vl_i8_info, OpcodeStr,
- HasBWI>;
- defm NAME##W : cvt_mask_by_elt_width<0x28, avx512vl_i16_info, OpcodeStr,
- HasBWI>, VEX_W;
- defm NAME##D : cvt_mask_by_elt_width<0x38, avx512vl_i32_info, OpcodeStr,
- HasDQI>;
- defm NAME##Q : cvt_mask_by_elt_width<0x38, avx512vl_i64_info, OpcodeStr,
- HasDQI>, VEX_W;
}
-defm VPMOVM2 : avx512_convert_mask_to_vector<"vpmovm2">;
+defm VPMOVM2B : cvt_mask_by_elt_width<0x28, avx512vl_i8_info, "vpmovm2" , HasBWI>;
+defm VPMOVM2W : cvt_mask_by_elt_width<0x28, avx512vl_i16_info, "vpmovm2", HasBWI> , VEX_W;
+defm VPMOVM2D : cvt_mask_by_elt_width<0x38, avx512vl_i32_info, "vpmovm2", HasDQI>;
+defm VPMOVM2Q : cvt_mask_by_elt_width<0x38, avx512vl_i64_info, "vpmovm2", HasDQI> , VEX_W;
multiclass convert_vector_to_mask_common<bits<8> opc, X86VectorVTInfo _, string OpcodeStr > {
def rr : AVX512XS8I<opc, MRMSrcReg, (outs _.KRC:$dst), (ins _.RC:$src),
@@ -8319,6 +8335,7 @@ multiclass avx512_fp_sae_packed_imm<bits<8> opc, string OpcodeStr,
//handle scalar instruction reg_vec1 = op(reg_vec2,reg_vec3,imm),{sae}
multiclass avx512_fp_sae_scalar_imm<bits<8> opc, string OpcodeStr,
SDNode OpNode, X86VectorVTInfo _> {
+ let ExeDomain = _.ExeDomain in
defm NAME#rrib : AVX512_maskable_scalar<opc, MRMSrcReg, _, (outs _.RC:$dst),
(ins _.RC:$src1, _.RC:$src2, i32u8imm:$src3),
OpcodeStr, "$src3, {sae}, $src2, $src1",
@@ -8466,6 +8483,39 @@ defm VSHUFI32X4 : avx512_shuff_packed_128<"vshufi32x4",avx512vl_i32_info, 0x43>,
defm VSHUFI64X2 : avx512_shuff_packed_128<"vshufi64x2",avx512vl_i64_info, 0x43>,
AVX512AIi8Base, EVEX_4V, EVEX_CD8<64, CD8VF>, VEX_W;
+let Predicates = [HasAVX512] in {
+// Provide a fallback in case the load node used in the broadcast patterns
+// above has additional users, which prevents those patterns from matching.
+def : Pat<(v8f64 (X86SubVBroadcast (v2f64 VR128X:$src))),
+ (VSHUFF64X2Zrri (INSERT_SUBREG (v8f64 (IMPLICIT_DEF)), VR128X:$src, sub_xmm),
+ (INSERT_SUBREG (v8f64 (IMPLICIT_DEF)), VR128X:$src, sub_xmm),
+ 0)>;
+def : Pat<(v8i64 (X86SubVBroadcast (v2i64 VR128X:$src))),
+ (VSHUFI64X2Zrri (INSERT_SUBREG (v8i64 (IMPLICIT_DEF)), VR128X:$src, sub_xmm),
+ (INSERT_SUBREG (v8i64 (IMPLICIT_DEF)), VR128X:$src, sub_xmm),
+ 0)>;
+
+def : Pat<(v16f32 (X86SubVBroadcast (v4f32 VR128X:$src))),
+ (VSHUFF32X4Zrri (INSERT_SUBREG (v16f32 (IMPLICIT_DEF)), VR128X:$src, sub_xmm),
+ (INSERT_SUBREG (v16f32 (IMPLICIT_DEF)), VR128X:$src, sub_xmm),
+ 0)>;
+def : Pat<(v16i32 (X86SubVBroadcast (v4i32 VR128X:$src))),
+ (VSHUFI32X4Zrri (INSERT_SUBREG (v16i32 (IMPLICIT_DEF)), VR128X:$src, sub_xmm),
+ (INSERT_SUBREG (v16i32 (IMPLICIT_DEF)), VR128X:$src, sub_xmm),
+ 0)>;
+
+def : Pat<(v32i16 (X86SubVBroadcast (v8i16 VR128X:$src))),
+ (VSHUFI32X4Zrri (INSERT_SUBREG (v32i16 (IMPLICIT_DEF)), VR128X:$src, sub_xmm),
+ (INSERT_SUBREG (v32i16 (IMPLICIT_DEF)), VR128X:$src, sub_xmm),
+ 0)>;
+
+def : Pat<(v64i8 (X86SubVBroadcast (v16i8 VR128X:$src))),
+ (VSHUFI32X4Zrri (INSERT_SUBREG (v64i8 (IMPLICIT_DEF)), VR128X:$src, sub_xmm),
+ (INSERT_SUBREG (v64i8 (IMPLICIT_DEF)), VR128X:$src, sub_xmm),
+ 0)>;
+}
+
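[Editor's illustration, not part of the patch: the fallback above reproduces a 128-bit-to-512-bit subvector broadcast with a cross-lane shuffle whose immediate selects lane 0 for every destination lane. In Intel intrinsics terms, assuming AVX-512F is available, the equivalent would be:]

    #include <immintrin.h>

    // Broadcast a 128-bit vector to all four 128-bit lanes of a 512-bit
    // vector, mirroring the VSHUFF64X2-with-immediate-0 fallback above.
    static __m512d broadcast128(__m128d X) {
      __m512d V = _mm512_castpd128_pd512(X); // upper 384 bits undefined
      return _mm512_shuffle_f64x2(V, V, 0);  // pick lane 0 for every lane
    }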
multiclass avx512_valign<string OpcodeStr, AVX512VLVectorVTInfo VTInfo_I> {
defm NAME: avx512_common_3Op_imm8<OpcodeStr, VTInfo_I, 0x03, X86VAlign>,
AVX512AIi8Base, EVEX_4V;
@@ -8503,6 +8553,7 @@ defm VDBPSADBW: avx512_common_3Op_rm_imm8<0x42, X86dbpsadbw, "vdbpsadbw" ,
multiclass avx512_unary_rm<bits<8> opc, string OpcodeStr, SDNode OpNode,
X86VectorVTInfo _> {
+ let ExeDomain = _.ExeDomain in {
defm rr : AVX512_maskable<opc, MRMSrcReg, _, (outs _.RC:$dst),
(ins _.RC:$src1), OpcodeStr,
"$src1", "$src1",
@@ -8513,6 +8564,7 @@ multiclass avx512_unary_rm<bits<8> opc, string OpcodeStr, SDNode OpNode,
"$src1", "$src1",
(_.VT (OpNode (bitconvert (_.LdFrag addr:$src1))))>,
EVEX, AVX5128IBase, EVEX_CD8<_.EltSize, CD8VF>;
+ }
}
multiclass avx512_unary_rmb<bits<8> opc, string OpcodeStr, SDNode OpNode,
@@ -8577,66 +8629,7 @@ multiclass avx512_unary_rm_vl_all<bits<8> opc_b, bits<8> opc_w,
HasBWI>;
}
-defm VPABS : avx512_unary_rm_vl_all<0x1C, 0x1D, 0x1E, 0x1F, "vpabs", X86Abs>;
-
-def avx512_v16i1sextv16i8 : PatLeaf<(v16i8 (X86pcmpgt (bc_v16i8 (v4i32 immAllZerosV)),
- VR128X:$src))>;
-def avx512_v8i1sextv8i16 : PatLeaf<(v8i16 (X86vsrai VR128X:$src, (i8 15)))>;
-def avx512_v4i1sextv4i32 : PatLeaf<(v4i32 (X86vsrai VR128X:$src, (i8 31)))>;
-def avx512_v32i1sextv32i8 : PatLeaf<(v32i8 (X86pcmpgt (bc_v32i8 (v8i32 immAllZerosV)),
- VR256X:$src))>;
-def avx512_v16i1sextv16i16: PatLeaf<(v16i16 (X86vsrai VR256X:$src, (i8 15)))>;
-def avx512_v8i1sextv8i32 : PatLeaf<(v8i32 (X86vsrai VR256X:$src, (i8 31)))>;
-
-let Predicates = [HasBWI, HasVLX] in {
- def : Pat<(xor
- (bc_v2i64 (avx512_v16i1sextv16i8)),
- (bc_v2i64 (add (v16i8 VR128X:$src), (avx512_v16i1sextv16i8)))),
- (VPABSBZ128rr VR128X:$src)>;
- def : Pat<(xor
- (bc_v2i64 (avx512_v8i1sextv8i16)),
- (bc_v2i64 (add (v8i16 VR128X:$src), (avx512_v8i1sextv8i16)))),
- (VPABSWZ128rr VR128X:$src)>;
- def : Pat<(xor
- (bc_v4i64 (avx512_v32i1sextv32i8)),
- (bc_v4i64 (add (v32i8 VR256X:$src), (avx512_v32i1sextv32i8)))),
- (VPABSBZ256rr VR256X:$src)>;
- def : Pat<(xor
- (bc_v4i64 (avx512_v16i1sextv16i16)),
- (bc_v4i64 (add (v16i16 VR256X:$src), (avx512_v16i1sextv16i16)))),
- (VPABSWZ256rr VR256X:$src)>;
-}
-let Predicates = [HasAVX512, HasVLX] in {
- def : Pat<(xor
- (bc_v2i64 (avx512_v4i1sextv4i32)),
- (bc_v2i64 (add (v4i32 VR128X:$src), (avx512_v4i1sextv4i32)))),
- (VPABSDZ128rr VR128X:$src)>;
- def : Pat<(xor
- (bc_v4i64 (avx512_v8i1sextv8i32)),
- (bc_v4i64 (add (v8i32 VR256X:$src), (avx512_v8i1sextv8i32)))),
- (VPABSDZ256rr VR256X:$src)>;
-}
-
-let Predicates = [HasAVX512] in {
-def : Pat<(xor
- (bc_v8i64 (v16i1sextv16i32)),
- (bc_v8i64 (add (v16i32 VR512:$src), (v16i1sextv16i32)))),
- (VPABSDZrr VR512:$src)>;
-def : Pat<(xor
- (bc_v8i64 (v8i1sextv8i64)),
- (bc_v8i64 (add (v8i64 VR512:$src), (v8i1sextv8i64)))),
- (VPABSQZrr VR512:$src)>;
-}
-let Predicates = [HasBWI] in {
-def : Pat<(xor
- (bc_v8i64 (v64i1sextv64i8)),
- (bc_v8i64 (add (v64i8 VR512:$src), (v64i1sextv64i8)))),
- (VPABSBZrr VR512:$src)>;
-def : Pat<(xor
- (bc_v8i64 (v32i1sextv32i16)),
- (bc_v8i64 (add (v32i16 VR512:$src), (v32i1sextv32i16)))),
- (VPABSWZrr VR512:$src)>;
-}
+defm VPABS : avx512_unary_rm_vl_all<0x1C, 0x1D, 0x1E, 0x1F, "vpabs", abs>;
multiclass avx512_ctlz<bits<8> opc, string OpcodeStr, Predicate prd>{
@@ -8663,6 +8656,7 @@ defm VMOVSLDUP : avx512_replicate<0x12, "vmovsldup", X86Movsldup>;
multiclass avx512_movddup_128<bits<8> opc, string OpcodeStr, SDNode OpNode,
X86VectorVTInfo _> {
+ let ExeDomain = _.ExeDomain in {
defm rr : AVX512_maskable<opc, MRMSrcReg, _, (outs _.RC:$dst),
(ins _.RC:$src), OpcodeStr, "$src", "$src",
(_.VT (OpNode (_.VT _.RC:$src)))>, EVEX;
@@ -8671,6 +8665,7 @@ multiclass avx512_movddup_128<bits<8> opc, string OpcodeStr, SDNode OpNode,
(_.VT (OpNode (_.VT (scalar_to_vector
(_.ScalarLdFrag addr:$src)))))>,
EVEX, EVEX_CD8<_.EltSize, CD8VH>;
+ }
}
multiclass avx512_movddup_common<bits<8> opc, string OpcodeStr, SDNode OpNode,
@@ -8947,6 +8942,68 @@ multiclass avx512_psadbw_packed_all<bits<8> opc, SDNode OpNode,
defm VPSADBW : avx512_psadbw_packed_all<0xf6, X86psadbw, "vpsadbw",
HasBWI>, EVEX_4V;
+// Transforms to swizzle an immediate to enable better matching when the
+// memory operand isn't in the right place.
+def VPTERNLOG321_imm8 : SDNodeXForm<imm, [{
+ // Convert a VPTERNLOG immediate by swapping operand 0 and operand 2.
+ uint8_t Imm = N->getZExtValue();
+ // Swap bits 1/4 and 3/6.
+ uint8_t NewImm = Imm & 0xa5;
+ if (Imm & 0x02) NewImm |= 0x10;
+ if (Imm & 0x10) NewImm |= 0x02;
+ if (Imm & 0x08) NewImm |= 0x40;
+ if (Imm & 0x40) NewImm |= 0x08;
+ return getI8Imm(NewImm, SDLoc(N));
+}]>;
+def VPTERNLOG213_imm8 : SDNodeXForm<imm, [{
+  // Convert a VPTERNLOG immediate by swapping operand 0 and operand 1.
+ uint8_t Imm = N->getZExtValue();
+ // Swap bits 2/4 and 3/5.
+ uint8_t NewImm = Imm & 0xc3;
+ if (Imm & 0x04) NewImm |= 0x10;
+ if (Imm & 0x10) NewImm |= 0x04;
+ if (Imm & 0x08) NewImm |= 0x20;
+ if (Imm & 0x20) NewImm |= 0x08;
+ return getI8Imm(NewImm, SDLoc(N));
+}]>;
+def VPTERNLOG132_imm8 : SDNodeXForm<imm, [{
+ // Convert a VPTERNLOG immediate by swapping operand 1 and operand 2.
+ uint8_t Imm = N->getZExtValue();
+ // Swap bits 1/2 and 5/6.
+ uint8_t NewImm = Imm & 0x99;
+ if (Imm & 0x02) NewImm |= 0x04;
+ if (Imm & 0x04) NewImm |= 0x02;
+ if (Imm & 0x20) NewImm |= 0x40;
+ if (Imm & 0x40) NewImm |= 0x20;
+ return getI8Imm(NewImm, SDLoc(N));
+}]>;
+def VPTERNLOG231_imm8 : SDNodeXForm<imm, [{
+  // Convert a VPTERNLOG immediate by moving operand 0 to the end.
+ uint8_t Imm = N->getZExtValue();
+ // Move bits 1->2, 2->4, 3->6, 4->1, 5->3, 6->5
+ uint8_t NewImm = Imm & 0x81;
+ if (Imm & 0x02) NewImm |= 0x04;
+ if (Imm & 0x04) NewImm |= 0x10;
+ if (Imm & 0x08) NewImm |= 0x40;
+ if (Imm & 0x10) NewImm |= 0x02;
+ if (Imm & 0x20) NewImm |= 0x08;
+ if (Imm & 0x40) NewImm |= 0x20;
+ return getI8Imm(NewImm, SDLoc(N));
+}]>;
+def VPTERNLOG312_imm8 : SDNodeXForm<imm, [{
+ // Convert a VPTERNLOG immediate by moving operand 2 to the beginning.
+ uint8_t Imm = N->getZExtValue();
+ // Move bits 1->4, 2->1, 3->5, 4->2, 5->6, 6->3
+ uint8_t NewImm = Imm & 0x81;
+ if (Imm & 0x02) NewImm |= 0x10;
+ if (Imm & 0x04) NewImm |= 0x02;
+ if (Imm & 0x08) NewImm |= 0x20;
+ if (Imm & 0x10) NewImm |= 0x04;
+ if (Imm & 0x20) NewImm |= 0x40;
+ if (Imm & 0x40) NewImm |= 0x08;
+ return getI8Imm(NewImm, SDLoc(N));
+}]>;
+
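[Editor's aside, not part of the patch: the swizzles above permute the VPTERNLOG truth table, whose immediate bit at index (src1<<2 | src2<<1 | src3) gives the result for that input combination. A standalone C++ sanity check for the 321 and 213 transforms, under that assumption:]

    #include <cassert>
    #include <cstdint>

    // Reference semantics: bit (a<<2 | b<<1 | c) of the immediate is the
    // result for inputs a (operand 0), b (operand 1), c (operand 2).
    static bool ternlog(uint8_t Imm, bool A, bool B, bool C) {
      return (Imm >> ((A << 2) | (B << 1) | C)) & 1;
    }

    // Same swizzle as VPTERNLOG321_imm8: swap bits 1/4 and 3/6.
    static uint8_t swapOp0Op2(uint8_t Imm) {
      uint8_t NewImm = Imm & 0xa5;
      if (Imm & 0x02) NewImm |= 0x10;
      if (Imm & 0x10) NewImm |= 0x02;
      if (Imm & 0x08) NewImm |= 0x40;
      if (Imm & 0x40) NewImm |= 0x08;
      return NewImm;
    }

    // Same swizzle as VPTERNLOG213_imm8: swap bits 2/4 and 3/5.
    static uint8_t swapOp0Op1(uint8_t Imm) {
      uint8_t NewImm = Imm & 0xc3;
      if (Imm & 0x04) NewImm |= 0x10;
      if (Imm & 0x10) NewImm |= 0x04;
      if (Imm & 0x08) NewImm |= 0x20;
      if (Imm & 0x20) NewImm |= 0x08;
      return NewImm;
    }

    int main() {
      for (unsigned Imm = 0; Imm < 256; ++Imm)
        for (unsigned V = 0; V < 8; ++V) {
          bool A = V & 4, B = V & 2, C = V & 1;
          // Swapping operands 0 and 2 with the 321 immediate is a no-op.
          assert(ternlog(Imm, A, B, C) == ternlog(swapOp0Op2(Imm), C, B, A));
          // Swapping operands 0 and 1 with the 213 immediate is a no-op.
          assert(ternlog(Imm, A, B, C) == ternlog(swapOp0Op1(Imm), B, A, C));
        }
      return 0;
    }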
multiclass avx512_ternlog<bits<8> opc, string OpcodeStr, SDNode OpNode,
X86VectorVTInfo _>{
let Constraints = "$src1 = $dst", ExeDomain = _.ExeDomain in {
@@ -8975,6 +9032,141 @@ multiclass avx512_ternlog<bits<8> opc, string OpcodeStr, SDNode OpNode,
(i8 imm:$src4)), 1, 0>, EVEX_B,
AVX512AIi8Base, EVEX_4V, EVEX_CD8<_.EltSize, CD8VF>;
}// Constraints = "$src1 = $dst"
+
+ // Additional patterns for matching passthru operand in other positions.
+ def : Pat<(_.VT (vselect _.KRCWM:$mask,
+ (OpNode _.RC:$src3, _.RC:$src2, _.RC:$src1, (i8 imm:$src4)),
+ _.RC:$src1)),
+ (!cast<Instruction>(NAME#_.ZSuffix#rrik) _.RC:$src1, _.KRCWM:$mask,
+ _.RC:$src2, _.RC:$src3, (VPTERNLOG321_imm8 imm:$src4))>;
+ def : Pat<(_.VT (vselect _.KRCWM:$mask,
+ (OpNode _.RC:$src2, _.RC:$src1, _.RC:$src3, (i8 imm:$src4)),
+ _.RC:$src1)),
+ (!cast<Instruction>(NAME#_.ZSuffix#rrik) _.RC:$src1, _.KRCWM:$mask,
+ _.RC:$src2, _.RC:$src3, (VPTERNLOG213_imm8 imm:$src4))>;
+
+ // Additional patterns for matching loads in other positions.
+ def : Pat<(_.VT (OpNode (bitconvert (_.LdFrag addr:$src3)),
+ _.RC:$src2, _.RC:$src1, (i8 imm:$src4))),
+ (!cast<Instruction>(NAME#_.ZSuffix#rmi) _.RC:$src1, _.RC:$src2,
+ addr:$src3, (VPTERNLOG321_imm8 imm:$src4))>;
+ def : Pat<(_.VT (OpNode _.RC:$src1,
+ (bitconvert (_.LdFrag addr:$src3)),
+ _.RC:$src2, (i8 imm:$src4))),
+ (!cast<Instruction>(NAME#_.ZSuffix#rmi) _.RC:$src1, _.RC:$src2,
+ addr:$src3, (VPTERNLOG132_imm8 imm:$src4))>;
+
+ // Additional patterns for matching zero masking with loads in other
+ // positions.
+ def : Pat<(_.VT (vselect _.KRCWM:$mask,
+ (OpNode (bitconvert (_.LdFrag addr:$src3)),
+ _.RC:$src2, _.RC:$src1, (i8 imm:$src4)),
+ _.ImmAllZerosV)),
+ (!cast<Instruction>(NAME#_.ZSuffix#rmikz) _.RC:$src1, _.KRCWM:$mask,
+ _.RC:$src2, addr:$src3, (VPTERNLOG321_imm8 imm:$src4))>;
+ def : Pat<(_.VT (vselect _.KRCWM:$mask,
+ (OpNode _.RC:$src1, (bitconvert (_.LdFrag addr:$src3)),
+ _.RC:$src2, (i8 imm:$src4)),
+ _.ImmAllZerosV)),
+ (!cast<Instruction>(NAME#_.ZSuffix#rmikz) _.RC:$src1, _.KRCWM:$mask,
+ _.RC:$src2, addr:$src3, (VPTERNLOG132_imm8 imm:$src4))>;
+
+ // Additional patterns for matching masked loads with different
+ // operand orders.
+ def : Pat<(_.VT (vselect _.KRCWM:$mask,
+ (OpNode _.RC:$src1, (bitconvert (_.LdFrag addr:$src3)),
+ _.RC:$src2, (i8 imm:$src4)),
+ _.RC:$src1)),
+ (!cast<Instruction>(NAME#_.ZSuffix#rmik) _.RC:$src1, _.KRCWM:$mask,
+ _.RC:$src2, addr:$src3, (VPTERNLOG132_imm8 imm:$src4))>;
+ def : Pat<(_.VT (vselect _.KRCWM:$mask,
+ (OpNode (bitconvert (_.LdFrag addr:$src3)),
+ _.RC:$src2, _.RC:$src1, (i8 imm:$src4)),
+ _.RC:$src1)),
+ (!cast<Instruction>(NAME#_.ZSuffix#rmik) _.RC:$src1, _.KRCWM:$mask,
+ _.RC:$src2, addr:$src3, (VPTERNLOG321_imm8 imm:$src4))>;
+ def : Pat<(_.VT (vselect _.KRCWM:$mask,
+ (OpNode _.RC:$src2, _.RC:$src1,
+ (bitconvert (_.LdFrag addr:$src3)), (i8 imm:$src4)),
+ _.RC:$src1)),
+ (!cast<Instruction>(NAME#_.ZSuffix#rmik) _.RC:$src1, _.KRCWM:$mask,
+ _.RC:$src2, addr:$src3, (VPTERNLOG213_imm8 imm:$src4))>;
+ def : Pat<(_.VT (vselect _.KRCWM:$mask,
+ (OpNode _.RC:$src2, (bitconvert (_.LdFrag addr:$src3)),
+ _.RC:$src1, (i8 imm:$src4)),
+ _.RC:$src1)),
+ (!cast<Instruction>(NAME#_.ZSuffix#rmik) _.RC:$src1, _.KRCWM:$mask,
+ _.RC:$src2, addr:$src3, (VPTERNLOG231_imm8 imm:$src4))>;
+ def : Pat<(_.VT (vselect _.KRCWM:$mask,
+ (OpNode (bitconvert (_.LdFrag addr:$src3)),
+ _.RC:$src1, _.RC:$src2, (i8 imm:$src4)),
+ _.RC:$src1)),
+ (!cast<Instruction>(NAME#_.ZSuffix#rmik) _.RC:$src1, _.KRCWM:$mask,
+ _.RC:$src2, addr:$src3, (VPTERNLOG312_imm8 imm:$src4))>;
+
+ // Additional patterns for matching broadcasts in other positions.
+ def : Pat<(_.VT (OpNode (X86VBroadcast (_.ScalarLdFrag addr:$src3)),
+ _.RC:$src2, _.RC:$src1, (i8 imm:$src4))),
+ (!cast<Instruction>(NAME#_.ZSuffix#rmbi) _.RC:$src1, _.RC:$src2,
+ addr:$src3, (VPTERNLOG321_imm8 imm:$src4))>;
+ def : Pat<(_.VT (OpNode _.RC:$src1,
+ (X86VBroadcast (_.ScalarLdFrag addr:$src3)),
+ _.RC:$src2, (i8 imm:$src4))),
+ (!cast<Instruction>(NAME#_.ZSuffix#rmbi) _.RC:$src1, _.RC:$src2,
+ addr:$src3, (VPTERNLOG132_imm8 imm:$src4))>;
+
+ // Additional patterns for matching zero masking with broadcasts in other
+ // positions.
+ def : Pat<(_.VT (vselect _.KRCWM:$mask,
+ (OpNode (X86VBroadcast (_.ScalarLdFrag addr:$src3)),
+ _.RC:$src2, _.RC:$src1, (i8 imm:$src4)),
+ _.ImmAllZerosV)),
+ (!cast<Instruction>(NAME#_.ZSuffix#rmbikz) _.RC:$src1,
+ _.KRCWM:$mask, _.RC:$src2, addr:$src3,
+ (VPTERNLOG321_imm8 imm:$src4))>;
+ def : Pat<(_.VT (vselect _.KRCWM:$mask,
+ (OpNode _.RC:$src1,
+ (X86VBroadcast (_.ScalarLdFrag addr:$src3)),
+ _.RC:$src2, (i8 imm:$src4)),
+ _.ImmAllZerosV)),
+ (!cast<Instruction>(NAME#_.ZSuffix#rmbikz) _.RC:$src1,
+ _.KRCWM:$mask, _.RC:$src2, addr:$src3,
+ (VPTERNLOG132_imm8 imm:$src4))>;
+
+ // Additional patterns for matching masked broadcasts with different
+ // operand orders.
+ def : Pat<(_.VT (vselect _.KRCWM:$mask,
+ (OpNode _.RC:$src1,
+ (X86VBroadcast (_.ScalarLdFrag addr:$src3)),
+ _.RC:$src2, (i8 imm:$src4)),
+ _.RC:$src1)),
+ (!cast<Instruction>(NAME#_.ZSuffix#rmbik) _.RC:$src1, _.KRCWM:$mask,
+ _.RC:$src2, addr:$src3, (VPTERNLOG132_imm8 imm:$src4))>;
+ def : Pat<(_.VT (vselect _.KRCWM:$mask,
+ (OpNode (X86VBroadcast (_.ScalarLdFrag addr:$src3)),
+ _.RC:$src2, _.RC:$src1, (i8 imm:$src4)),
+ _.RC:$src1)),
+ (!cast<Instruction>(NAME#_.ZSuffix#rmik) _.RC:$src1, _.KRCWM:$mask,
+ _.RC:$src2, addr:$src3, (VPTERNLOG321_imm8 imm:$src4))>;
+ def : Pat<(_.VT (vselect _.KRCWM:$mask,
+ (OpNode _.RC:$src2, _.RC:$src1,
+ (X86VBroadcast (_.ScalarLdFrag addr:$src3)),
+ (i8 imm:$src4)), _.RC:$src1)),
+ (!cast<Instruction>(NAME#_.ZSuffix#rmik) _.RC:$src1, _.KRCWM:$mask,
+ _.RC:$src2, addr:$src3, (VPTERNLOG213_imm8 imm:$src4))>;
+ def : Pat<(_.VT (vselect _.KRCWM:$mask,
+ (OpNode _.RC:$src2,
+ (X86VBroadcast (_.ScalarLdFrag addr:$src3)),
+ _.RC:$src1, (i8 imm:$src4)),
+ _.RC:$src1)),
+ (!cast<Instruction>(NAME#_.ZSuffix#rmik) _.RC:$src1, _.KRCWM:$mask,
+ _.RC:$src2, addr:$src3, (VPTERNLOG231_imm8 imm:$src4))>;
+ def : Pat<(_.VT (vselect _.KRCWM:$mask,
+ (OpNode (X86VBroadcast (_.ScalarLdFrag addr:$src3)),
+ _.RC:$src1, _.RC:$src2, (i8 imm:$src4)),
+ _.RC:$src1)),
+ (!cast<Instruction>(NAME#_.ZSuffix#rmik) _.RC:$src1, _.KRCWM:$mask,
+ _.RC:$src2, addr:$src3, (VPTERNLOG312_imm8 imm:$src4))>;
}
multiclass avx512_common_ternlog<string OpcodeStr, AVX512VLVectorVTInfo _>{
diff --git a/lib/Target/X86/X86InstrBuilder.h b/lib/Target/X86/X86InstrBuilder.h
index ba970bc2048e..dcce7b9951f2 100644
--- a/lib/Target/X86/X86InstrBuilder.h
+++ b/lib/Target/X86/X86InstrBuilder.h
@@ -147,7 +147,7 @@ addOffset(const MachineInstrBuilder &MIB, int Offset) {
static inline const MachineInstrBuilder &
addOffset(const MachineInstrBuilder &MIB, const MachineOperand& Offset) {
- return MIB.addImm(1).addReg(0).addOperand(Offset).addReg(0);
+ return MIB.addImm(1).addReg(0).add(Offset).addReg(0);
}
/// addRegOffset - This function is used to add a memory reference of the form
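[Editor's note: addOffset appends the scale (1), index (0), displacement, and segment (0) operands of an x86 memory reference; the base register comes from the caller, e.g. via addRegOffset. A hedged sketch of typical use, assuming the in-tree backend headers and that X86::MOV32rm is visible at the call site:]

    #include "X86InstrBuilder.h"
    using namespace llvm;

    // Sketch: emit DstReg = load dword [BaseReg + Disp] before iterator I.
    static void emitLoadRegOffset(MachineBasicBlock &MBB,
                                  MachineBasicBlock::iterator I,
                                  const DebugLoc &DL,
                                  const TargetInstrInfo &TII, unsigned DstReg,
                                  unsigned BaseReg, int Disp) {
      // addRegOffset = addReg(Base) followed by addOffset, yielding the full
      // base/scale/index/disp/segment operand sequence.
      addRegOffset(BuildMI(MBB, I, DL, TII.get(X86::MOV32rm), DstReg),
                   BaseReg, /*isKill=*/false, Disp);
    }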
diff --git a/lib/Target/X86/X86InstrCMovSetCC.td b/lib/Target/X86/X86InstrCMovSetCC.td
index c73c95019f8d..b85abfb9ca7f 100644
--- a/lib/Target/X86/X86InstrCMovSetCC.td
+++ b/lib/Target/X86/X86InstrCMovSetCC.td
@@ -110,3 +110,9 @@ defm SETGE : SETCC<0x9D, "setge", X86_COND_GE>; // signed greater or equal
defm SETLE : SETCC<0x9E, "setle", X86_COND_LE>; // signed less than or equal
defm SETG : SETCC<0x9F, "setg", X86_COND_G>; // signed greater than
+// SALC is an undocumented instruction. Information about it can be found at
+// http://www.rcollins.org/secrets/opcodes/SALC.html
+// Set AL if carry.
+let Uses = [EFLAGS], Defs = [AL] in {
+ def SALC : I<0xD6, RawFrm, (outs), (ins), "salc", []>, Requires<[Not64BitMode]>;
+}
diff --git a/lib/Target/X86/X86InstrCompiler.td b/lib/Target/X86/X86InstrCompiler.td
index 3c27eb8077d0..e592c2b3c0aa 100644
--- a/lib/Target/X86/X86InstrCompiler.td
+++ b/lib/Target/X86/X86InstrCompiler.td
@@ -259,20 +259,20 @@ def MORESTACK_RET_RESTORE_R10 : I<0, Pseudo, (outs), (ins),
// Alias instruction mapping movr0 to xor.
// FIXME: remove when we can teach regalloc that xor reg, reg is ok.
let Defs = [EFLAGS], isReMaterializable = 1, isAsCheapAsAMove = 1,
- isPseudo = 1, AddedComplexity = 20 in
+ isPseudo = 1, AddedComplexity = 10 in
def MOV32r0 : I<0, Pseudo, (outs GR32:$dst), (ins), "",
[(set GR32:$dst, 0)], IIC_ALU_NONMEM>, Sched<[WriteZero]>;
// Other widths can also make use of the 32-bit xor, which may have a smaller
// encoding and avoid partial register updates.
+let AddedComplexity = 10 in {
def : Pat<(i8 0), (EXTRACT_SUBREG (MOV32r0), sub_8bit)>;
def : Pat<(i16 0), (EXTRACT_SUBREG (MOV32r0), sub_16bit)>;
-def : Pat<(i64 0), (SUBREG_TO_REG (i64 0), (MOV32r0), sub_32bit)> {
- let AddedComplexity = 20;
+def : Pat<(i64 0), (SUBREG_TO_REG (i64 0), (MOV32r0), sub_32bit)>;
}
let Predicates = [OptForSize, NotSlowIncDec, Not64BitMode],
- AddedComplexity = 15 in {
+ AddedComplexity = 10 in {
// Pseudo instructions for materializing 1 and -1 using XOR+INC/DEC,
// which only require 3 bytes compared to MOV32ri which requires 5.
let Defs = [EFLAGS], isReMaterializable = 1, isPseudo = 1 in {
@@ -287,7 +287,7 @@ let Predicates = [OptForSize, NotSlowIncDec, Not64BitMode],
def : Pat<(i16 -1), (EXTRACT_SUBREG (MOV32r_1), sub_16bit)>;
}
-let isReMaterializable = 1, isPseudo = 1, AddedComplexity = 10 in {
+let isReMaterializable = 1, isPseudo = 1, AddedComplexity = 5 in {
// AddedComplexity higher than MOV64ri but lower than MOV32r0 and MOV32r1.
// FIXME: Add itinerary class and Schedule.
def MOV32ImmSExti8 : I<0, Pseudo, (outs GR32:$dst), (ins i32i8imm:$src), "",
@@ -772,11 +772,11 @@ defm LCMPXCHG8B : LCMPXCHG_UnOp<0xC7, MRM1m, "cmpxchg8b",
// the pseudo. The argument feeding EBX is ebx_input.
//
// The additional argument, $ebx_save, is a temporary register used to
-// save the value of RBX accross the actual instruction.
+// save the value of RBX across the actual instruction.
//
// To make sure the register assigned to $ebx_save does not interfere with
// the definition of the actual instruction, we use a definition $dst which
-// is tied to $rbx_save. That way, the live-range of $rbx_save spans accross
+// is tied to $rbx_save. That way, the live-range of $rbx_save spans across
// the instruction and we are sure we will have a valid register to restore
// the value of RBX.
let Defs = [EAX, EDX, EBX, EFLAGS], Uses = [EAX, ECX, EDX],
@@ -1743,6 +1743,12 @@ def : Pat<(X86sub_flag 0, GR16:$src), (NEG16r GR16:$src)>;
def : Pat<(X86sub_flag 0, GR32:$src), (NEG32r GR32:$src)>;
def : Pat<(X86sub_flag 0, GR64:$src), (NEG64r GR64:$src)>;
+// sub reg, relocImm
+def : Pat<(X86sub_flag GR64:$src1, i64relocImmSExt8_su:$src2),
+ (SUB64ri8 GR64:$src1, i64relocImmSExt8_su:$src2)>;
+def : Pat<(X86sub_flag GR64:$src1, i64relocImmSExt32_su:$src2),
+ (SUB64ri32 GR64:$src1, i64relocImmSExt32_su:$src2)>;
+
// mul reg, reg
def : Pat<(mul GR16:$src1, GR16:$src2),
(IMUL16rr GR16:$src1, GR16:$src2)>;
diff --git a/lib/Target/X86/X86InstrControl.td b/lib/Target/X86/X86InstrControl.td
index 2f260c48df47..4ea223e82be9 100644
--- a/lib/Target/X86/X86InstrControl.td
+++ b/lib/Target/X86/X86InstrControl.td
@@ -264,6 +264,21 @@ let isCall = 1, isTerminator = 1, isReturn = 1, isBarrier = 1,
"jmp{l}\t{*}$dst", [], IIC_JMP_MEM>;
}
+// Conditional tail calls are similar to the above, but they are branches
+// rather than barriers, and they use EFLAGS.
+let isCall = 1, isTerminator = 1, isReturn = 1, isBranch = 1,
+ isCodeGenOnly = 1, SchedRW = [WriteJumpLd] in
+ let Uses = [ESP, EFLAGS] in {
+ def TCRETURNdicc : PseudoI<(outs),
+ (ins i32imm_pcrel:$dst, i32imm:$offset, i32imm:$cond), []>;
+
+ // This gets substituted to a conditional jump instruction in MC lowering.
+ def TAILJMPd_CC : Ii32PCRel<0x80, RawFrm, (outs),
+ (ins i32imm_pcrel:$dst, i32imm:$cond),
+ "",
+ [], IIC_JMP_REL>;
+}
+
//===----------------------------------------------------------------------===//
// Call Instructions...
@@ -325,3 +340,19 @@ let isCall = 1, isTerminator = 1, isReturn = 1, isBarrier = 1,
"rex64 jmp{q}\t{*}$dst", [], IIC_JMP_MEM>;
}
}
+
+// Conditional tail calls are similar to the above, but they are branches
+// rather than barriers, and they use EFLAGS.
+let isCall = 1, isTerminator = 1, isReturn = 1, isBranch = 1,
+ isCodeGenOnly = 1, SchedRW = [WriteJumpLd] in
+ let Uses = [RSP, EFLAGS] in {
+ def TCRETURNdi64cc : PseudoI<(outs),
+ (ins i64i32imm_pcrel:$dst, i32imm:$offset,
+ i32imm:$cond), []>;
+
+ // This gets substituted to a conditional jump instruction in MC lowering.
+ def TAILJMPd64_CC : Ii32PCRel<0x80, RawFrm, (outs),
+ (ins i64i32imm_pcrel:$dst, i32imm:$cond),
+ "",
+ [], IIC_JMP_REL>;
+}
diff --git a/lib/Target/X86/X86InstrFMA.td b/lib/Target/X86/X86InstrFMA.td
index 4b19f801dae1..1941ae57f0f1 100644
--- a/lib/Target/X86/X86InstrFMA.td
+++ b/lib/Target/X86/X86InstrFMA.td
@@ -191,13 +191,15 @@ multiclass fma3s_rm_int<bits<8> opc, string OpcodeStr,
multiclass fma3s_forms<bits<8> opc132, bits<8> opc213, bits<8> opc231,
string OpStr, string PackTy, string Suff,
SDNode OpNode, RegisterClass RC,
- X86MemOperand x86memop> {
- defm NAME#132#Suff : fma3s_rm<opc132, !strconcat(OpStr, "132", PackTy),
- x86memop, RC>;
- defm NAME#213#Suff : fma3s_rm<opc213, !strconcat(OpStr, "213", PackTy),
- x86memop, RC, OpNode>;
- defm NAME#231#Suff : fma3s_rm<opc231, !strconcat(OpStr, "231", PackTy),
- x86memop, RC>;
+ X86MemOperand x86memop> {
+ let Predicates = [HasFMA, NoAVX512] in {
+ defm NAME#132#Suff : fma3s_rm<opc132, !strconcat(OpStr, "132", PackTy),
+ x86memop, RC>;
+ defm NAME#213#Suff : fma3s_rm<opc213, !strconcat(OpStr, "213", PackTy),
+ x86memop, RC, OpNode>;
+ defm NAME#231#Suff : fma3s_rm<opc231, !strconcat(OpStr, "231", PackTy),
+ x86memop, RC>;
+ }
}
// The FMA 213 form is created for lowering of scalar FMA intrinsics
diff --git a/lib/Target/X86/X86InstrFMA3Info.cpp b/lib/Target/X86/X86InstrFMA3Info.cpp
index db83497ee69d..00ef65cdb6bd 100644
--- a/lib/Target/X86/X86InstrFMA3Info.cpp
+++ b/lib/Target/X86/X86InstrFMA3Info.cpp
@@ -16,11 +16,14 @@
#include "X86InstrInfo.h"
#include "llvm/Support/ManagedStatic.h"
#include "llvm/Support/Threading.h"
+#include <cassert>
+#include <cstdint>
+
using namespace llvm;
/// This flag is used in the call to llvm::call_once() below to make the
/// initialization of the map 'OpcodeToGroup' thread safe.
-LLVM_DEFINE_ONCE_FLAG(InitGroupsOnceFlag);
+static llvm::once_flag InitGroupsOnceFlag;
static ManagedStatic<X86InstrFMA3Info> X86InstrFMA3InfoObj;
X86InstrFMA3Info *X86InstrFMA3Info::getX86InstrFMA3Info() {
diff --git a/lib/Target/X86/X86InstrFMA3Info.h b/lib/Target/X86/X86InstrFMA3Info.h
index 025cee3b2b90..e3568160da46 100644
--- a/lib/Target/X86/X86InstrFMA3Info.h
+++ b/lib/Target/X86/X86InstrFMA3Info.h
@@ -1,4 +1,4 @@
-//===-- X86InstrFMA3Info.h - X86 FMA3 Instruction Information -------------===//
+//===- X86InstrFMA3Info.h - X86 FMA3 Instruction Information ----*- C++ -*-===//
//
// The LLVM Compiler Infrastructure
//
@@ -18,9 +18,11 @@
#include "X86.h"
#include "llvm/ADT/DenseMap.h"
#include <cassert>
+#include <cstdint>
#include <set>
namespace llvm {
+
/// This class is used to group {132, 213, 231} forms of FMA opcodes together.
/// Each of the groups has either 3 register opcodes, 3 memory opcodes,
/// or 6 register and memory opcodes. Also, each group has an attributes field
@@ -201,7 +203,7 @@ public:
static X86InstrFMA3Info *getX86InstrFMA3Info();
/// Constructor. Just creates an object of the class.
- X86InstrFMA3Info() {}
+ X86InstrFMA3Info() = default;
/// Destructor. Deallocates the memory used for FMA3 Groups.
~X86InstrFMA3Info() {
@@ -310,6 +312,7 @@ public:
return rm_iterator(getX86InstrFMA3Info()->OpcodeToGroup.end());
}
};
-} // namespace llvm
-#endif
+} // end namespace llvm
+
+#endif // LLVM_LIB_TARGET_X86_UTILS_X86INSTRFMA3INFO_H
diff --git a/lib/Target/X86/X86InstrFPStack.td b/lib/Target/X86/X86InstrFPStack.td
index 10f3839ea8ed..11b1d070ef2f 100644
--- a/lib/Target/X86/X86InstrFPStack.td
+++ b/lib/Target/X86/X86InstrFPStack.td
@@ -631,6 +631,9 @@ let Defs = [FPSW] in
def FNINIT : I<0xDB, MRM_E3, (outs), (ins), "fninit", [], IIC_FNINIT>;
def FFREE : FPI<0xDD, MRM0r, (outs), (ins RST:$reg),
"ffree\t$reg", IIC_FFREE>;
+def FFREEP : FPI<0xDF, MRM0r, (outs), (ins RST:$reg),
+ "ffreep\t$reg", IIC_FFREE>;
+
// Clear exceptions
let Defs = [FPSW] in
@@ -665,15 +668,16 @@ def FCOMPP : I<0xDE, MRM_D9, (outs), (ins), "fcompp", [], IIC_FCOMPP>;
let Predicates = [HasFXSR] in {
def FXSAVE : I<0xAE, MRM0m, (outs), (ins opaque512mem:$dst),
- "fxsave\t$dst", [(int_x86_fxsave addr:$dst)], IIC_FXSAVE>, TB;
+ "fxsave\t$dst", [(int_x86_fxsave addr:$dst)], IIC_FXSAVE>, TB;
def FXSAVE64 : RI<0xAE, MRM0m, (outs), (ins opaque512mem:$dst),
- "fxsave64\t$dst", [(int_x86_fxsave64 addr:$dst)],
- IIC_FXSAVE>, TB, Requires<[In64BitMode]>;
+ "fxsave64\t$dst", [(int_x86_fxsave64 addr:$dst)],
+ IIC_FXSAVE>, TB, Requires<[In64BitMode]>;
def FXRSTOR : I<0xAE, MRM1m, (outs), (ins opaque512mem:$src),
- "fxrstor\t$src", [(int_x86_fxrstor addr:$src)], IIC_FXRSTOR>, TB;
+ "fxrstor\t$src", [(int_x86_fxrstor addr:$src)], IIC_FXRSTOR>,
+ TB;
def FXRSTOR64 : RI<0xAE, MRM1m, (outs), (ins opaque512mem:$src),
- "fxrstor64\t$src", [(int_x86_fxrstor64 addr:$src)],
- IIC_FXRSTOR>, TB, Requires<[In64BitMode]>;
+ "fxrstor64\t$src", [(int_x86_fxrstor64 addr:$src)],
+ IIC_FXRSTOR>, TB, Requires<[In64BitMode]>;
} // Predicates = [FeatureFXSR]
} // SchedRW
diff --git a/lib/Target/X86/X86InstrFormats.td b/lib/Target/X86/X86InstrFormats.td
index 610756aa37da..c2fe786732dc 100644
--- a/lib/Target/X86/X86InstrFormats.td
+++ b/lib/Target/X86/X86InstrFormats.td
@@ -199,7 +199,8 @@ class TAPS : TA { Prefix OpPrefix = PS; }
class TAPD : TA { Prefix OpPrefix = PD; }
class TAXD : TA { Prefix OpPrefix = XD; }
class VEX { Encoding OpEnc = EncVEX; }
-class VEX_W { bit hasVEX_WPrefix = 1; }
+class VEX_W { bits<2> VEX_WPrefix = 1; }
+class VEX_WIG { bits<2> VEX_WPrefix = 2; }
class VEX_4V : VEX { bit hasVEX_4V = 1; }
class VEX_L { bit hasVEX_L = 1; }
class VEX_LIG { bit ignoresVEX_L = 1; }
@@ -270,7 +271,7 @@ class X86Inst<bits<8> opcod, Format f, ImmType i, dag outs, dag ins,
bit hasREPPrefix = 0; // Does this inst have a REP prefix?
Encoding OpEnc = EncNormal; // Encoding used by this instruction
bits<2> OpEncBits = OpEnc.Value;
- bit hasVEX_WPrefix = 0; // Does this inst set the VEX_W field?
+ bits<2> VEX_WPrefix = 0; // Does this inst set the VEX_W field?
bit hasVEX_4V = 0; // Does this inst require the VEX.VVVV field?
bit hasVEX_L = 0; // Does this inst use large (256-bit) registers?
bit ignoresVEX_L = 0; // Does this instruction ignore the L-bit
@@ -317,7 +318,8 @@ class X86Inst<bits<8> opcod, Format f, ImmType i, dag outs, dag ins,
let TSFlags{28-27} = ExeDomain.Value;
let TSFlags{30-29} = OpEncBits;
let TSFlags{38-31} = Opcode;
- let TSFlags{39} = hasVEX_WPrefix;
+  // Currently there is no need for a second bit in TSFlags - W Ignore is
+  // equivalent to 0.
+ let TSFlags{39} = VEX_WPrefix{0};
let TSFlags{40} = hasVEX_4V;
let TSFlags{41} = hasVEX_L;
let TSFlags{42} = hasEVEX_K;
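[Editor's illustration, assuming only the low bit is ever consumed, as the comment above states: the three VEX_WPrefix values collapse onto one TSFlags bit because W-ignore encodes the same as W=0.]

    #include <cstdint>

    enum VEXWPrefix : uint8_t { VEX_W0 = 0, VEX_W1 = 1, VEX_WIG = 2 };

    // TSFlags bit 39 stores only VEX_WPrefix{0}; VEX_WIG (2) therefore reads
    // back as W=0, which is the intended equivalence.
    static inline bool tsFlagsWBit(VEXWPrefix P) { return P & 1; }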
@@ -453,7 +455,7 @@ class SI_Int<bits<8> o, Format F, dag outs, dag ins, string asm,
Domain d = GenericDomain>
: I<o, F, outs, ins, asm, pattern, itin, d> {
let Predicates = !if(!eq(OpEnc.Value, EncEVEX.Value), [HasAVX512],
- !if(!eq(OpEnc.Value, EncVEX.Value), [HasAVX],
+ !if(!eq(OpEnc.Value, EncVEX.Value), [UseAVX],
!if(!eq(OpPrefix.Value, XS.Value), [UseSSE1],
!if(!eq(OpPrefix.Value, XD.Value), [UseSSE2],
!if(!eq(OpPrefix.Value, PD.Value), [UseSSE2],
diff --git a/lib/Target/X86/X86InstrFragmentsSIMD.td b/lib/Target/X86/X86InstrFragmentsSIMD.td
index c5689d7c698c..9867ba84bb9b 100644
--- a/lib/Target/X86/X86InstrFragmentsSIMD.td
+++ b/lib/Target/X86/X86InstrFragmentsSIMD.td
@@ -27,21 +27,19 @@ def MMX_X86movw2d : SDNode<"X86ISD::MMX_MOVW2D", SDTypeProfile<1, 1,
//===----------------------------------------------------------------------===//
def load_mmx : PatFrag<(ops node:$ptr), (x86mmx (load node:$ptr))>;
-def load_mvmmx : PatFrag<(ops node:$ptr),
- (x86mmx (MMX_X86movw2d (load node:$ptr)))>;
//===----------------------------------------------------------------------===//
// SSE specific DAG Nodes.
//===----------------------------------------------------------------------===//
-def SDTX86VFCMP : SDTypeProfile<1, 3, [SDTCisFP<0>, SDTCisSameAs<1, 2>,
- SDTCisFP<1>, SDTCisVT<3, i8>,
- SDTCisVec<1>]>;
-def SDTX86CmpTestSae : SDTypeProfile<1, 3, [SDTCisVT<0, i32>,
- SDTCisSameAs<1, 2>, SDTCisInt<3>]>;
+def SDTX86VFCMP : SDTypeProfile<1, 3, [SDTCisFP<0>, SDTCisVec<0>,
+ SDTCisSameAs<0, 1>, SDTCisSameAs<1, 2>,
+ SDTCisVT<3, i8>]>;
def X86fmin : SDNode<"X86ISD::FMIN", SDTFPBinOp>;
def X86fmax : SDNode<"X86ISD::FMAX", SDTFPBinOp>;
+def X86fmins : SDNode<"X86ISD::FMINS", SDTFPBinOp>;
+def X86fmaxs : SDNode<"X86ISD::FMAXS", SDTFPBinOp>;
// Commutative and Associative FMIN and FMAX.
def X86fminc : SDNode<"X86ISD::FMINC", SDTFPBinOp,
@@ -200,6 +198,15 @@ def X86vshli : SDNode<"X86ISD::VSHLI", SDTIntShiftOp>;
def X86vsrli : SDNode<"X86ISD::VSRLI", SDTIntShiftOp>;
def X86vsrai : SDNode<"X86ISD::VSRAI", SDTIntShiftOp>;
+def X86kshiftl : SDNode<"X86ISD::KSHIFTL",
+ SDTypeProfile<1, 2, [SDTCVecEltisVT<0, i1>,
+ SDTCisSameAs<0, 1>,
+ SDTCisVT<2, i8>]>>;
+def X86kshiftr : SDNode<"X86ISD::KSHIFTR",
+ SDTypeProfile<1, 2, [SDTCVecEltisVT<0, i1>,
+ SDTCisSameAs<0, 1>,
+ SDTCisVT<2, i8>]>>;
+
def X86vrotli : SDNode<"X86ISD::VROTLI", SDTIntShiftOp>;
def X86vrotri : SDNode<"X86ISD::VROTRI", SDTIntShiftOp>;
@@ -230,10 +237,11 @@ def X86vpermil2 : SDNode<"X86ISD::VPERMIL2",
SDTCisSameAs<0,2>,
SDTCisSameSizeAs<0,3>,
SDTCisSameNumEltsAs<0, 3>,
+ SDTCisFP<0>, SDTCisInt<3>,
SDTCisVT<4, i8>]>>;
def X86vpperm : SDNode<"X86ISD::VPPERM",
SDTypeProfile<1, 3, [SDTCisVT<0, v16i8>, SDTCisSameAs<0,1>,
- SDTCisSameAs<0,2>]>>;
+ SDTCisSameAs<0,2>, SDTCisSameAs<0, 3>]>>;
def SDTX86CmpPTest : SDTypeProfile<1, 2, [SDTCisVT<0, i32>,
SDTCisVec<1>,
@@ -300,13 +308,17 @@ def SDTShuff2Op : SDTypeProfile<1, 2, [SDTCisVec<0>, SDTCisSameAs<0,1>,
def SDTShuff2OpM : SDTypeProfile<1, 2, [SDTCisVec<0>, SDTCisSameAs<0,1>,
SDTCisSameSizeAs<0,2>,
- SDTCisSameNumEltsAs<0,2>]>;
+ SDTCisSameNumEltsAs<0,2>,
+ SDTCisFP<0>, SDTCisInt<2>]>;
def SDTShuff2OpI : SDTypeProfile<1, 2, [SDTCisVec<0>,
SDTCisSameAs<0,1>, SDTCisVT<2, i8>]>;
def SDTShuff3OpI : SDTypeProfile<1, 3, [SDTCisVec<0>, SDTCisSameAs<0,1>,
SDTCisSameAs<0,2>, SDTCisVT<3, i8>]>;
-def SDTFPBinOpImmRound: SDTypeProfile<1, 4, [SDTCisVec<0>, SDTCisSameAs<0,1>,
- SDTCisSameAs<0,2>, SDTCisVT<3, i32>, SDTCisVT<4, i32>]>;
+def SDTFPBinOpImmRound: SDTypeProfile<1, 4, [SDTCisFP<0>, SDTCisVec<0>,
+ SDTCisSameAs<0,1>,
+ SDTCisSameAs<0,2>,
+ SDTCisVT<3, i32>,
+ SDTCisVT<4, i32>]>;
def SDTFPTernaryOpImmRound: SDTypeProfile<1, 5, [SDTCisFP<0>, SDTCisSameAs<0,1>,
SDTCisSameAs<0,2>,
SDTCisInt<3>,
@@ -314,8 +326,10 @@ def SDTFPTernaryOpImmRound: SDTypeProfile<1, 5, [SDTCisFP<0>, SDTCisSameAs<0,1>,
SDTCisSameNumEltsAs<0, 3>,
SDTCisVT<4, i32>,
SDTCisVT<5, i32>]>;
-def SDTFPUnaryOpImmRound: SDTypeProfile<1, 3, [SDTCisVec<0>, SDTCisSameAs<0,1>,
- SDTCisVT<2, i32>, SDTCisVT<3, i32>]>;
+def SDTFPUnaryOpImmRound: SDTypeProfile<1, 3, [SDTCisFP<0>, SDTCisVec<0>,
+ SDTCisSameAs<0,1>,
+ SDTCisVT<2, i32>,
+ SDTCisVT<3, i32>]>;
def SDTVBroadcast : SDTypeProfile<1, 1, [SDTCisVec<0>]>;
def SDTVBroadcastm : SDTypeProfile<1, 1, [SDTCisVec<0>,
@@ -324,9 +338,9 @@ def SDTVBroadcastm : SDTypeProfile<1, 1, [SDTCisVec<0>,
def SDTBlend : SDTypeProfile<1, 3, [SDTCisVec<0>, SDTCisSameAs<0,1>,
SDTCisSameAs<1,2>, SDTCisVT<3, i8>]>;
-def SDTTernlog : SDTypeProfile<1, 4, [SDTCisVec<0>, SDTCisSameAs<0,1>,
- SDTCisSameAs<0,2>, SDTCisSameAs<0,3>,
- SDTCisVT<4, i8>]>;
+def SDTTernlog : SDTypeProfile<1, 4, [SDTCisInt<0>, SDTCisVec<0>,
+ SDTCisSameAs<0,1>, SDTCisSameAs<0,2>,
+ SDTCisSameAs<0,3>, SDTCisVT<4, i8>]>;
def SDTFPBinOpRound : SDTypeProfile<1, 3, [ // fadd_round, fmul_round, etc.
SDTCisSameAs<0, 1>, SDTCisSameAs<0, 2>, SDTCisFP<0>, SDTCisVT<3, i32>]>;
@@ -334,16 +348,13 @@ def SDTFPBinOpRound : SDTypeProfile<1, 3, [ // fadd_round, fmul_round, etc.
def SDTFPUnaryOpRound : SDTypeProfile<1, 2, [ // fsqrt_round, fgetexp_round, etc.
SDTCisSameAs<0, 1>, SDTCisFP<0>, SDTCisVT<2, i32>]>;
-def SDTFma : SDTypeProfile<1, 3, [SDTCisSameAs<0,1>,
- SDTCisSameAs<1,2>, SDTCisSameAs<1,3>]>;
def SDTFmaRound : SDTypeProfile<1, 4, [SDTCisSameAs<0,1>,
SDTCisSameAs<1,2>, SDTCisSameAs<1,3>,
- SDTCisVT<4, i32>]>;
+ SDTCisFP<0>, SDTCisVT<4, i32>]>;
def X86PAlignr : SDNode<"X86ISD::PALIGNR", SDTShuff3OpI>;
def X86VAlign : SDNode<"X86ISD::VALIGN", SDTShuff3OpI>;
-def X86Abs : SDNode<"X86ISD::ABS", SDTIntUnaryOp>;
def X86Conflict : SDNode<"X86ISD::CONFLICT", SDTIntUnaryOp>;
def X86PShufd : SDNode<"X86ISD::PSHUFD", SDTShuff2OpI>;
@@ -367,17 +378,28 @@ def X86Movhlps : SDNode<"X86ISD::MOVHLPS", SDTShuff2Op>;
def X86Movlps : SDNode<"X86ISD::MOVLPS", SDTShuff2Op>;
def X86Movlpd : SDNode<"X86ISD::MOVLPD", SDTShuff2Op>;
-def SDTPack : SDTypeProfile<1, 2, [SDTCisVec<0>, SDTCisVec<1>,
+def SDTPack : SDTypeProfile<1, 2, [SDTCisVec<0>, SDTCisInt<0>,
+ SDTCisVec<1>, SDTCisInt<1>,
SDTCisSameSizeAs<0,1>,
- SDTCisSameAs<1,2>]>;
+ SDTCisSameAs<1,2>,
+ SDTCisOpSmallerThanOp<0, 1>]>;
def X86Packss : SDNode<"X86ISD::PACKSS", SDTPack>;
def X86Packus : SDNode<"X86ISD::PACKUS", SDTPack>;
def X86Unpckl : SDNode<"X86ISD::UNPCKL", SDTShuff2Op>;
def X86Unpckh : SDNode<"X86ISD::UNPCKH", SDTShuff2Op>;
-def X86vpmaddubsw : SDNode<"X86ISD::VPMADDUBSW" , SDTPack>;
-def X86vpmaddwd : SDNode<"X86ISD::VPMADDWD" , SDTPack, [SDNPCommutative]>;
+def X86vpmaddubsw : SDNode<"X86ISD::VPMADDUBSW",
+ SDTypeProfile<1, 2, [SDTCVecEltisVT<0, i16>,
+ SDTCVecEltisVT<1, i8>,
+ SDTCisSameSizeAs<0,1>,
+ SDTCisSameAs<1,2>]>>;
+def X86vpmaddwd : SDNode<"X86ISD::VPMADDWD",
+ SDTypeProfile<1, 2, [SDTCVecEltisVT<0, i32>,
+ SDTCVecEltisVT<1, i16>,
+ SDTCisSameSizeAs<0,1>,
+ SDTCisSameAs<1,2>]>,
+ [SDNPCommutative]>;
def X86VPermilpv : SDNode<"X86ISD::VPERMILPV", SDTShuff2OpM>;
def X86VPermilpi : SDNode<"X86ISD::VPERMILPI", SDTShuff2OpI>;
@@ -414,8 +436,8 @@ def X86VReduce : SDNode<"X86ISD::VREDUCE", SDTFPUnaryOpImmRound>;
def X86VRndScale : SDNode<"X86ISD::VRNDSCALE", SDTFPUnaryOpImmRound>;
def X86VGetMant : SDNode<"X86ISD::VGETMANT", SDTFPUnaryOpImmRound>;
def X86Vfpclass : SDNode<"X86ISD::VFPCLASS",
- SDTypeProfile<1, 2, [SDTCisVec<0>, SDTCVecEltisVT<0, i1>,
- SDTCisVec<1>, SDTCisFP<1>,
+ SDTypeProfile<1, 2, [SDTCVecEltisVT<0, i1>,
+ SDTCisFP<1>,
SDTCisSameNumEltsAs<0,1>,
SDTCisVT<2, i32>]>, []>;
def X86Vfpclasss : SDNode<"X86ISD::VFPCLASSS",
@@ -428,9 +450,6 @@ def X86SubVBroadcast : SDNode<"X86ISD::SUBV_BROADCAST",
def X86VBroadcast : SDNode<"X86ISD::VBROADCAST", SDTVBroadcast>;
def X86VBroadcastm : SDNode<"X86ISD::VBROADCASTM", SDTVBroadcastm>;
-def X86Vinsert : SDNode<"X86ISD::VINSERT", SDTypeProfile<1, 3,
- [SDTCisSameAs<0, 1>, SDTCisEltOfVec<2, 1>,
- SDTCisPtrTy<3>]>, []>;
def X86Vextract : SDNode<"X86ISD::VEXTRACT", SDTypeProfile<1, 2,
[SDTCisEltOfVec<0, 1>, SDTCisVec<1>,
SDTCisPtrTy<2>]>, []>;
@@ -440,24 +459,30 @@ def X86Blendi : SDNode<"X86ISD::BLENDI", SDTBlend>;
def X86Addsub : SDNode<"X86ISD::ADDSUB", SDTFPBinOp>;
def X86faddRnd : SDNode<"X86ISD::FADD_RND", SDTFPBinOpRound>;
+def X86faddRnds : SDNode<"X86ISD::FADDS_RND", SDTFPBinOpRound>;
def X86fsubRnd : SDNode<"X86ISD::FSUB_RND", SDTFPBinOpRound>;
+def X86fsubRnds : SDNode<"X86ISD::FSUBS_RND", SDTFPBinOpRound>;
def X86fmulRnd : SDNode<"X86ISD::FMUL_RND", SDTFPBinOpRound>;
+def X86fmulRnds : SDNode<"X86ISD::FMULS_RND", SDTFPBinOpRound>;
def X86fdivRnd : SDNode<"X86ISD::FDIV_RND", SDTFPBinOpRound>;
-def X86fmaxRnd : SDNode<"X86ISD::FMAX_RND", SDTFPBinOpRound>;
+def X86fdivRnds : SDNode<"X86ISD::FDIVS_RND", SDTFPBinOpRound>;
+def X86fmaxRnd : SDNode<"X86ISD::FMAX_RND", SDTFPBinOpRound>;
+def X86fmaxRnds : SDNode<"X86ISD::FMAXS_RND", SDTFPBinOpRound>;
+def X86fminRnd : SDNode<"X86ISD::FMIN_RND", SDTFPBinOpRound>;
+def X86fminRnds : SDNode<"X86ISD::FMINS_RND", SDTFPBinOpRound>;
def X86scalef : SDNode<"X86ISD::SCALEF", SDTFPBinOpRound>;
def X86scalefs : SDNode<"X86ISD::SCALEFS", SDTFPBinOpRound>;
-def X86fminRnd : SDNode<"X86ISD::FMIN_RND", SDTFPBinOpRound>;
def X86fsqrtRnd : SDNode<"X86ISD::FSQRT_RND", SDTFPUnaryOpRound>;
def X86fsqrtRnds : SDNode<"X86ISD::FSQRTS_RND", SDTFPBinOpRound>;
def X86fgetexpRnd : SDNode<"X86ISD::FGETEXP_RND", SDTFPUnaryOpRound>;
def X86fgetexpRnds : SDNode<"X86ISD::FGETEXPS_RND", SDTFPBinOpRound>;
-def X86Fmadd : SDNode<"X86ISD::FMADD", SDTFma>;
-def X86Fnmadd : SDNode<"X86ISD::FNMADD", SDTFma>;
-def X86Fmsub : SDNode<"X86ISD::FMSUB", SDTFma>;
-def X86Fnmsub : SDNode<"X86ISD::FNMSUB", SDTFma>;
-def X86Fmaddsub : SDNode<"X86ISD::FMADDSUB", SDTFma>;
-def X86Fmsubadd : SDNode<"X86ISD::FMSUBADD", SDTFma>;
+def X86Fmadd : SDNode<"X86ISD::FMADD", SDTFPTernaryOp>;
+def X86Fnmadd : SDNode<"X86ISD::FNMADD", SDTFPTernaryOp>;
+def X86Fmsub : SDNode<"X86ISD::FMSUB", SDTFPTernaryOp>;
+def X86Fnmsub : SDNode<"X86ISD::FNMSUB", SDTFPTernaryOp>;
+def X86Fmaddsub : SDNode<"X86ISD::FMADDSUB", SDTFPTernaryOp>;
+def X86Fmsubadd : SDNode<"X86ISD::FMSUBADD", SDTFPTernaryOp>;
def X86FmaddRnd : SDNode<"X86ISD::FMADD_RND", SDTFmaRound>;
def X86FnmaddRnd : SDNode<"X86ISD::FNMADD_RND", SDTFmaRound>;
@@ -478,8 +503,10 @@ def X86FnmaddRnds3 : SDNode<"X86ISD::FNMADDS3_RND", SDTFmaRound>;
def X86FmsubRnds3 : SDNode<"X86ISD::FMSUBS3_RND", SDTFmaRound>;
def X86FnmsubRnds3 : SDNode<"X86ISD::FNMSUBS3_RND", SDTFmaRound>;
-def x86vpmadd52l : SDNode<"X86ISD::VPMADD52L", SDTFma>;
-def x86vpmadd52h : SDNode<"X86ISD::VPMADD52H", SDTFma>;
+def SDTIFma : SDTypeProfile<1, 3, [SDTCisInt<0>, SDTCisSameAs<0,1>,
+ SDTCisSameAs<1,2>, SDTCisSameAs<1,3>]>;
+def x86vpmadd52l : SDNode<"X86ISD::VPMADD52L", SDTIFma>;
+def x86vpmadd52h : SDNode<"X86ISD::VPMADD52H", SDTIFma>;
def X86rsqrt28 : SDNode<"X86ISD::RSQRT28", SDTFPUnaryOpRound>;
def X86rcp28 : SDNode<"X86ISD::RCP28", SDTFPUnaryOpRound>;
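
The SDTypeProfile changes above tighten the type constraints TableGen uses to infer and validate the types in instruction-selection patterns: SDTCisFP<0> requires a floating-point result, SDTCisVT<3, i32> pins an operand to one specific value type, and SDTCisSameAs<0,1> ties two positions together. As a rough conceptual sketch only (plain C++, not the actual TableGen machinery; every name below is hypothetical), a constraint list amounts to a sequence of checks over the result and operand types, with index 0 naming the result and indices 1..N the operands:

// Illustrative sketch of constraint checking; not LLVM code.
#include <cassert>
#include <vector>

enum class Kind { FP, Int, SameAs, IsVT };

struct TypeInfo { bool isFP; bool isInt; int vt; };

struct Constraint {
  Kind kind;
  int opA; // result/operand index the constraint applies to
  int opB; // second index (SameAs) or required VT (IsVT)
};

static bool check(const std::vector<TypeInfo> &types,
                  const std::vector<Constraint> &cs) {
  for (const Constraint &c : cs) {
    const TypeInfo &a = types[c.opA];
    switch (c.kind) {
    case Kind::FP:     if (!a.isFP) return false; break;
    case Kind::Int:    if (!a.isInt) return false; break;
    case Kind::SameAs: if (a.vt != types[c.opB].vt) return false; break;
    case Kind::IsVT:   if (a.vt != c.opB) return false; break;
    }
  }
  return true;
}

int main() {
  // Rough analogue of SDTFPBinOpRound: the result and both FP operands
  // share a type, and the rounding-mode operand is a fixed integer VT.
  enum { f32 = 1, i32 = 2 };
  std::vector<TypeInfo> fadd = {{true, false, f32},  // result
                                {true, false, f32},  // op 1
                                {true, false, f32},  // op 2
                                {false, true, i32}}; // op 3: rounding mode
  std::vector<Constraint> cs = {{Kind::FP, 0, 0},
                                {Kind::SameAs, 0, 1},
                                {Kind::SameAs, 0, 2},
                                {Kind::IsVT, 3, i32}};
  assert(check(fadd, cs));
  return 0;
}

The practical effect of tightening a profile (e.g. adding SDTCisFP<0> or splitting SDTFma into FP and integer variants) is that mis-typed patterns are rejected at build time and TableGen has enough information to infer the remaining types without ambiguity.
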
diff --git a/lib/Target/X86/X86InstrInfo.cpp b/lib/Target/X86/X86InstrInfo.cpp
index 627b6120b048..7b456fd68343 100644
--- a/lib/Target/X86/X86InstrInfo.cpp
+++ b/lib/Target/X86/X86InstrInfo.cpp
@@ -414,17 +414,22 @@ X86InstrInfo::X86InstrInfo(X86Subtarget &STI)
{ X86::VEXTRACTI64x2Zrr,X86::VEXTRACTI64x2Zmr, TB_FOLDED_STORE },
{ X86::VEXTRACTI64x4Zrr,X86::VEXTRACTI64x4Zmr, TB_FOLDED_STORE },
{ X86::VEXTRACTPSZrr, X86::VEXTRACTPSZmr, TB_FOLDED_STORE },
- { X86::VMOVPDI2DIZrr, X86::VMOVPDI2DIZmr, TB_FOLDED_STORE },
{ X86::VMOVAPDZrr, X86::VMOVAPDZmr, TB_FOLDED_STORE | TB_ALIGN_64 },
{ X86::VMOVAPSZrr, X86::VMOVAPSZmr, TB_FOLDED_STORE | TB_ALIGN_64 },
{ X86::VMOVDQA32Zrr, X86::VMOVDQA32Zmr, TB_FOLDED_STORE | TB_ALIGN_64 },
{ X86::VMOVDQA64Zrr, X86::VMOVDQA64Zmr, TB_FOLDED_STORE | TB_ALIGN_64 },
- { X86::VMOVUPDZrr, X86::VMOVUPDZmr, TB_FOLDED_STORE },
- { X86::VMOVUPSZrr, X86::VMOVUPSZmr, TB_FOLDED_STORE },
{ X86::VMOVDQU8Zrr, X86::VMOVDQU8Zmr, TB_FOLDED_STORE },
{ X86::VMOVDQU16Zrr, X86::VMOVDQU16Zmr, TB_FOLDED_STORE },
{ X86::VMOVDQU32Zrr, X86::VMOVDQU32Zmr, TB_FOLDED_STORE },
{ X86::VMOVDQU64Zrr, X86::VMOVDQU64Zmr, TB_FOLDED_STORE },
+ { X86::VMOVPDI2DIZrr, X86::VMOVPDI2DIZmr, TB_FOLDED_STORE },
+ { X86::VMOVPQIto64Zrr, X86::VMOVPQI2QIZmr, TB_FOLDED_STORE },
+ { X86::VMOVSDto64Zrr, X86::VMOVSDto64Zmr, TB_FOLDED_STORE },
+ { X86::VMOVSS2DIZrr, X86::VMOVSS2DIZmr, TB_FOLDED_STORE },
+ { X86::VMOVUPDZrr, X86::VMOVUPDZmr, TB_FOLDED_STORE },
+ { X86::VMOVUPSZrr, X86::VMOVUPSZmr, TB_FOLDED_STORE },
+ { X86::VPEXTRDZrr, X86::VPEXTRDZmr, TB_FOLDED_STORE },
+ { X86::VPEXTRQZrr, X86::VPEXTRQZmr, TB_FOLDED_STORE },
{ X86::VPMOVDBZrr, X86::VPMOVDBZmr, TB_FOLDED_STORE },
{ X86::VPMOVDWZrr, X86::VPMOVDWZmr, TB_FOLDED_STORE },
{ X86::VPMOVQDZrr, X86::VPMOVQDZmr, TB_FOLDED_STORE },
@@ -867,11 +872,10 @@ X86InstrInfo::X86InstrInfo(X86Subtarget &STI)
// AVX-512 foldable instructions
{ X86::VBROADCASTSSZr, X86::VBROADCASTSSZm, TB_NO_REVERSE },
- { X86::VBROADCASTSSZr_s, X86::VBROADCASTSSZm, TB_NO_REVERSE },
{ X86::VBROADCASTSDZr, X86::VBROADCASTSDZm, TB_NO_REVERSE },
- { X86::VBROADCASTSDZr_s, X86::VBROADCASTSDZm, TB_NO_REVERSE },
{ X86::VMOV64toPQIZrr, X86::VMOVQI2PQIZrm, 0 },
- { X86::VMOVZPQILo2PQIZrr,X86::VMOVQI2PQIZrm, TB_NO_REVERSE },
+ { X86::VMOV64toSDZrr, X86::VMOV64toSDZrm, 0 },
+ { X86::VMOVDI2PDIZrr, X86::VMOVDI2PDIZrm, 0 },
{ X86::VMOVDI2SSZrr, X86::VMOVDI2SSZrm, 0 },
{ X86::VMOVAPDZrr, X86::VMOVAPDZrm, TB_ALIGN_64 },
{ X86::VMOVAPSZrr, X86::VMOVAPSZrm, TB_ALIGN_64 },
@@ -883,8 +887,11 @@ X86InstrInfo::X86InstrInfo(X86Subtarget &STI)
{ X86::VMOVDQU64Zrr, X86::VMOVDQU64Zrm, 0 },
{ X86::VMOVUPDZrr, X86::VMOVUPDZrm, 0 },
{ X86::VMOVUPSZrr, X86::VMOVUPSZrm, 0 },
+ { X86::VMOVZPQILo2PQIZrr,X86::VMOVQI2PQIZrm, TB_NO_REVERSE },
+ { X86::VPABSBZrr, X86::VPABSBZrm, 0 },
{ X86::VPABSDZrr, X86::VPABSDZrm, 0 },
{ X86::VPABSQZrr, X86::VPABSQZrm, 0 },
+ { X86::VPABSWZrr, X86::VPABSWZrm, 0 },
{ X86::VPERMILPDZri, X86::VPERMILPDZmi, 0 },
{ X86::VPERMILPSZri, X86::VPERMILPSZmi, 0 },
{ X86::VPERMPDZri, X86::VPERMPDZmi, 0 },
@@ -904,12 +911,21 @@ X86InstrInfo::X86InstrInfo(X86Subtarget &STI)
{ X86::VPSHUFDZri, X86::VPSHUFDZmi, 0 },
{ X86::VPSHUFHWZri, X86::VPSHUFHWZmi, 0 },
{ X86::VPSHUFLWZri, X86::VPSHUFLWZmi, 0 },
+ { X86::VPSLLDQZ512rr, X86::VPSLLDQZ512rm, 0 },
+ { X86::VPSLLDZri, X86::VPSLLDZmi, 0 },
+ { X86::VPSLLQZri, X86::VPSLLQZmi, 0 },
+ { X86::VPSLLWZri, X86::VPSLLWZmi, 0 },
+ { X86::VPSRADZri, X86::VPSRADZmi, 0 },
+ { X86::VPSRAQZri, X86::VPSRAQZmi, 0 },
+ { X86::VPSRAWZri, X86::VPSRAWZmi, 0 },
+ { X86::VPSRLDQZ512rr, X86::VPSRLDQZ512rm, 0 },
+ { X86::VPSRLDZri, X86::VPSRLDZmi, 0 },
+ { X86::VPSRLQZri, X86::VPSRLQZmi, 0 },
+ { X86::VPSRLWZri, X86::VPSRLWZmi, 0 },
// AVX-512 foldable instructions (256-bit versions)
{ X86::VBROADCASTSSZ256r, X86::VBROADCASTSSZ256m, TB_NO_REVERSE },
- { X86::VBROADCASTSSZ256r_s, X86::VBROADCASTSSZ256m, TB_NO_REVERSE },
{ X86::VBROADCASTSDZ256r, X86::VBROADCASTSDZ256m, TB_NO_REVERSE },
- { X86::VBROADCASTSDZ256r_s, X86::VBROADCASTSDZ256m, TB_NO_REVERSE },
{ X86::VMOVAPDZ256rr, X86::VMOVAPDZ256rm, TB_ALIGN_32 },
{ X86::VMOVAPSZ256rr, X86::VMOVAPSZ256rm, TB_ALIGN_32 },
{ X86::VMOVDQA32Z256rr, X86::VMOVDQA32Z256rm, TB_ALIGN_32 },
@@ -920,6 +936,10 @@ X86InstrInfo::X86InstrInfo(X86Subtarget &STI)
{ X86::VMOVDQU64Z256rr, X86::VMOVDQU64Z256rm, 0 },
{ X86::VMOVUPDZ256rr, X86::VMOVUPDZ256rm, 0 },
{ X86::VMOVUPSZ256rr, X86::VMOVUPSZ256rm, 0 },
+ { X86::VPABSBZ256rr, X86::VPABSBZ256rm, 0 },
+ { X86::VPABSDZ256rr, X86::VPABSDZ256rm, 0 },
+ { X86::VPABSQZ256rr, X86::VPABSQZ256rm, 0 },
+ { X86::VPABSWZ256rr, X86::VPABSWZ256rm, 0 },
{ X86::VPERMILPDZ256ri, X86::VPERMILPDZ256mi, 0 },
{ X86::VPERMILPSZ256ri, X86::VPERMILPSZ256mi, 0 },
{ X86::VPERMPDZ256ri, X86::VPERMPDZ256mi, 0 },
@@ -939,10 +959,20 @@ X86InstrInfo::X86InstrInfo(X86Subtarget &STI)
{ X86::VPSHUFDZ256ri, X86::VPSHUFDZ256mi, 0 },
{ X86::VPSHUFHWZ256ri, X86::VPSHUFHWZ256mi, 0 },
{ X86::VPSHUFLWZ256ri, X86::VPSHUFLWZ256mi, 0 },
+ { X86::VPSLLDQZ256rr, X86::VPSLLDQZ256rm, 0 },
+ { X86::VPSLLDZ256ri, X86::VPSLLDZ256mi, 0 },
+ { X86::VPSLLQZ256ri, X86::VPSLLQZ256mi, 0 },
+ { X86::VPSLLWZ256ri, X86::VPSLLWZ256mi, 0 },
+ { X86::VPSRADZ256ri, X86::VPSRADZ256mi, 0 },
+ { X86::VPSRAQZ256ri, X86::VPSRAQZ256mi, 0 },
+ { X86::VPSRAWZ256ri, X86::VPSRAWZ256mi, 0 },
+ { X86::VPSRLDQZ256rr, X86::VPSRLDQZ256rm, 0 },
+ { X86::VPSRLDZ256ri, X86::VPSRLDZ256mi, 0 },
+ { X86::VPSRLQZ256ri, X86::VPSRLQZ256mi, 0 },
+ { X86::VPSRLWZ256ri, X86::VPSRLWZ256mi, 0 },
// AVX-512 foldable instructions (128-bit versions)
{ X86::VBROADCASTSSZ128r, X86::VBROADCASTSSZ128m, TB_NO_REVERSE },
- { X86::VBROADCASTSSZ128r_s, X86::VBROADCASTSSZ128m, TB_NO_REVERSE },
{ X86::VMOVAPDZ128rr, X86::VMOVAPDZ128rm, TB_ALIGN_16 },
{ X86::VMOVAPSZ128rr, X86::VMOVAPSZ128rm, TB_ALIGN_16 },
{ X86::VMOVDQA32Z128rr, X86::VMOVDQA32Z128rm, TB_ALIGN_16 },
@@ -953,6 +983,10 @@ X86InstrInfo::X86InstrInfo(X86Subtarget &STI)
{ X86::VMOVDQU64Z128rr, X86::VMOVDQU64Z128rm, 0 },
{ X86::VMOVUPDZ128rr, X86::VMOVUPDZ128rm, 0 },
{ X86::VMOVUPSZ128rr, X86::VMOVUPSZ128rm, 0 },
+ { X86::VPABSBZ128rr, X86::VPABSBZ128rm, 0 },
+ { X86::VPABSDZ128rr, X86::VPABSDZ128rm, 0 },
+ { X86::VPABSQZ128rr, X86::VPABSQZ128rm, 0 },
+ { X86::VPABSWZ128rr, X86::VPABSWZ128rm, 0 },
{ X86::VPERMILPDZ128ri, X86::VPERMILPDZ128mi, 0 },
{ X86::VPERMILPSZ128ri, X86::VPERMILPSZ128mi, 0 },
{ X86::VPMOVSXBDZ128rr, X86::VPMOVSXBDZ128rm, TB_NO_REVERSE },
@@ -970,6 +1004,17 @@ X86InstrInfo::X86InstrInfo(X86Subtarget &STI)
{ X86::VPSHUFDZ128ri, X86::VPSHUFDZ128mi, 0 },
{ X86::VPSHUFHWZ128ri, X86::VPSHUFHWZ128mi, 0 },
{ X86::VPSHUFLWZ128ri, X86::VPSHUFLWZ128mi, 0 },
+ { X86::VPSLLDQZ128rr, X86::VPSLLDQZ128rm, 0 },
+ { X86::VPSLLDZ128ri, X86::VPSLLDZ128mi, 0 },
+ { X86::VPSLLQZ128ri, X86::VPSLLQZ128mi, 0 },
+ { X86::VPSLLWZ128ri, X86::VPSLLWZ128mi, 0 },
+ { X86::VPSRADZ128ri, X86::VPSRADZ128mi, 0 },
+ { X86::VPSRAQZ128ri, X86::VPSRAQZ128mi, 0 },
+ { X86::VPSRAWZ128ri, X86::VPSRAWZ128mi, 0 },
+ { X86::VPSRLDQZ128rr, X86::VPSRLDQZ128rm, 0 },
+ { X86::VPSRLDZ128ri, X86::VPSRLDZ128mi, 0 },
+ { X86::VPSRLQZ128ri, X86::VPSRLQZ128mi, 0 },
+ { X86::VPSRLWZ128ri, X86::VPSRLWZ128mi, 0 },
// F16C foldable instructions
{ X86::VCVTPH2PSrr, X86::VCVTPH2PSrm, 0 },
@@ -1170,18 +1215,18 @@ X86InstrInfo::X86InstrInfo(X86Subtarget &STI)
{ X86::PINSRWrri, X86::PINSRWrmi, 0 },
{ X86::PMADDUBSWrr, X86::PMADDUBSWrm, TB_ALIGN_16 },
{ X86::PMADDWDrr, X86::PMADDWDrm, TB_ALIGN_16 },
+ { X86::PMAXSBrr, X86::PMAXSBrm, TB_ALIGN_16 },
+ { X86::PMAXSDrr, X86::PMAXSDrm, TB_ALIGN_16 },
{ X86::PMAXSWrr, X86::PMAXSWrm, TB_ALIGN_16 },
{ X86::PMAXUBrr, X86::PMAXUBrm, TB_ALIGN_16 },
- { X86::PMINSWrr, X86::PMINSWrm, TB_ALIGN_16 },
- { X86::PMINUBrr, X86::PMINUBrm, TB_ALIGN_16 },
+ { X86::PMAXUDrr, X86::PMAXUDrm, TB_ALIGN_16 },
+ { X86::PMAXUWrr, X86::PMAXUWrm, TB_ALIGN_16 },
{ X86::PMINSBrr, X86::PMINSBrm, TB_ALIGN_16 },
{ X86::PMINSDrr, X86::PMINSDrm, TB_ALIGN_16 },
+ { X86::PMINSWrr, X86::PMINSWrm, TB_ALIGN_16 },
+ { X86::PMINUBrr, X86::PMINUBrm, TB_ALIGN_16 },
{ X86::PMINUDrr, X86::PMINUDrm, TB_ALIGN_16 },
{ X86::PMINUWrr, X86::PMINUWrm, TB_ALIGN_16 },
- { X86::PMAXSBrr, X86::PMAXSBrm, TB_ALIGN_16 },
- { X86::PMAXSDrr, X86::PMAXSDrm, TB_ALIGN_16 },
- { X86::PMAXUDrr, X86::PMAXUDrm, TB_ALIGN_16 },
- { X86::PMAXUWrr, X86::PMAXUWrm, TB_ALIGN_16 },
{ X86::PMULDQrr, X86::PMULDQrm, TB_ALIGN_16 },
{ X86::PMULHRSWrr, X86::PMULHRSWrm, TB_ALIGN_16 },
{ X86::PMULHUWrr, X86::PMULHUWrm, TB_ALIGN_16 },
@@ -1340,8 +1385,6 @@ X86InstrInfo::X86InstrInfo(X86Subtarget &STI)
{ X86::PMULHRWrr, X86::PMULHRWrm, 0 },
// AVX 128-bit versions of foldable instructions
- { X86::VCVTSD2SSrr, X86::VCVTSD2SSrm, 0 },
- { X86::Int_VCVTSD2SSrr, X86::Int_VCVTSD2SSrm, TB_NO_REVERSE },
{ X86::VCVTSI2SD64rr, X86::VCVTSI2SD64rm, 0 },
{ X86::Int_VCVTSI2SD64rr, X86::Int_VCVTSI2SD64rm, 0 },
{ X86::VCVTSI2SDrr, X86::VCVTSI2SDrm, 0 },
@@ -1350,8 +1393,6 @@ X86InstrInfo::X86InstrInfo(X86Subtarget &STI)
{ X86::Int_VCVTSI2SS64rr, X86::Int_VCVTSI2SS64rm, 0 },
{ X86::VCVTSI2SSrr, X86::VCVTSI2SSrm, 0 },
{ X86::Int_VCVTSI2SSrr, X86::Int_VCVTSI2SSrm, 0 },
- { X86::VCVTSS2SDrr, X86::VCVTSS2SDrm, 0 },
- { X86::Int_VCVTSS2SDrr, X86::Int_VCVTSS2SDrm, TB_NO_REVERSE },
{ X86::VADDPDrr, X86::VADDPDrm, 0 },
{ X86::VADDPSrr, X86::VADDPSrm, 0 },
{ X86::VADDSDrr, X86::VADDSDrm, 0 },
@@ -1458,18 +1499,18 @@ X86InstrInfo::X86InstrInfo(X86Subtarget &STI)
{ X86::VPINSRWrri, X86::VPINSRWrmi, 0 },
{ X86::VPMADDUBSWrr, X86::VPMADDUBSWrm, 0 },
{ X86::VPMADDWDrr, X86::VPMADDWDrm, 0 },
+ { X86::VPMAXSBrr, X86::VPMAXSBrm, 0 },
+ { X86::VPMAXSDrr, X86::VPMAXSDrm, 0 },
{ X86::VPMAXSWrr, X86::VPMAXSWrm, 0 },
{ X86::VPMAXUBrr, X86::VPMAXUBrm, 0 },
- { X86::VPMINSWrr, X86::VPMINSWrm, 0 },
- { X86::VPMINUBrr, X86::VPMINUBrm, 0 },
+ { X86::VPMAXUDrr, X86::VPMAXUDrm, 0 },
+ { X86::VPMAXUWrr, X86::VPMAXUWrm, 0 },
{ X86::VPMINSBrr, X86::VPMINSBrm, 0 },
{ X86::VPMINSDrr, X86::VPMINSDrm, 0 },
+ { X86::VPMINSWrr, X86::VPMINSWrm, 0 },
+ { X86::VPMINUBrr, X86::VPMINUBrm, 0 },
{ X86::VPMINUDrr, X86::VPMINUDrm, 0 },
{ X86::VPMINUWrr, X86::VPMINUWrm, 0 },
- { X86::VPMAXSBrr, X86::VPMAXSBrm, 0 },
- { X86::VPMAXSDrr, X86::VPMAXSDrm, 0 },
- { X86::VPMAXUDrr, X86::VPMAXUDrm, 0 },
- { X86::VPMAXUWrr, X86::VPMAXUWrm, 0 },
{ X86::VPMULDQrr, X86::VPMULDQrm, 0 },
{ X86::VPMULHRSWrr, X86::VPMULHRSWrm, 0 },
{ X86::VPMULHUWrr, X86::VPMULHUWrm, 0 },
@@ -1626,18 +1667,18 @@ X86InstrInfo::X86InstrInfo(X86Subtarget &STI)
{ X86::VPHSUBWYrr, X86::VPHSUBWYrm, 0 },
{ X86::VPMADDUBSWYrr, X86::VPMADDUBSWYrm, 0 },
{ X86::VPMADDWDYrr, X86::VPMADDWDYrm, 0 },
+ { X86::VPMAXSBYrr, X86::VPMAXSBYrm, 0 },
+ { X86::VPMAXSDYrr, X86::VPMAXSDYrm, 0 },
{ X86::VPMAXSWYrr, X86::VPMAXSWYrm, 0 },
{ X86::VPMAXUBYrr, X86::VPMAXUBYrm, 0 },
- { X86::VPMINSWYrr, X86::VPMINSWYrm, 0 },
- { X86::VPMINUBYrr, X86::VPMINUBYrm, 0 },
+ { X86::VPMAXUDYrr, X86::VPMAXUDYrm, 0 },
+ { X86::VPMAXUWYrr, X86::VPMAXUWYrm, 0 },
{ X86::VPMINSBYrr, X86::VPMINSBYrm, 0 },
{ X86::VPMINSDYrr, X86::VPMINSDYrm, 0 },
+ { X86::VPMINSWYrr, X86::VPMINSWYrm, 0 },
+ { X86::VPMINUBYrr, X86::VPMINUBYrm, 0 },
{ X86::VPMINUDYrr, X86::VPMINUDYrm, 0 },
{ X86::VPMINUWYrr, X86::VPMINUWYrm, 0 },
- { X86::VPMAXSBYrr, X86::VPMAXSBYrm, 0 },
- { X86::VPMAXSDYrr, X86::VPMAXSDYrm, 0 },
- { X86::VPMAXUDYrr, X86::VPMAXUDYrm, 0 },
- { X86::VPMAXUWYrr, X86::VPMAXUWYrm, 0 },
{ X86::VMPSADBWYrri, X86::VMPSADBWYrmi, 0 },
{ X86::VPMULDQYrr, X86::VPMULDQYrm, 0 },
{ X86::VPMULHRSWYrr, X86::VPMULHRSWYrm, 0 },
@@ -1732,7 +1773,7 @@ X86InstrInfo::X86InstrInfo(X86Subtarget &STI)
// XOP foldable instructions
{ X86::VPCMOVrrr, X86::VPCMOVrmr, 0 },
- { X86::VPCMOVrrrY, X86::VPCMOVrmrY, 0 },
+ { X86::VPCMOVYrrr, X86::VPCMOVYrmr, 0 },
{ X86::VPCOMBri, X86::VPCOMBmi, 0 },
{ X86::VPCOMDri, X86::VPCOMDmi, 0 },
{ X86::VPCOMQri, X86::VPCOMQmi, 0 },
@@ -1742,9 +1783,9 @@ X86InstrInfo::X86InstrInfo(X86Subtarget &STI)
{ X86::VPCOMUQri, X86::VPCOMUQmi, 0 },
{ X86::VPCOMUWri, X86::VPCOMUWmi, 0 },
{ X86::VPERMIL2PDrr, X86::VPERMIL2PDmr, 0 },
- { X86::VPERMIL2PDrrY, X86::VPERMIL2PDmrY, 0 },
+ { X86::VPERMIL2PDYrr, X86::VPERMIL2PDYmr, 0 },
{ X86::VPERMIL2PSrr, X86::VPERMIL2PSmr, 0 },
- { X86::VPERMIL2PSrrY, X86::VPERMIL2PSmrY, 0 },
+ { X86::VPERMIL2PSYrr, X86::VPERMIL2PSYmr, 0 },
{ X86::VPMACSDDrr, X86::VPMACSDDrm, 0 },
{ X86::VPMACSDQHrr, X86::VPMACSDQHrm, 0 },
{ X86::VPMACSDQLrr, X86::VPMACSDQLrm, 0 },
@@ -1800,8 +1841,6 @@ X86InstrInfo::X86InstrInfo(X86Subtarget &STI)
{ X86::VANDNPSZrr, X86::VANDNPSZrm, 0 },
{ X86::VANDPDZrr, X86::VANDPDZrm, 0 },
{ X86::VANDPSZrr, X86::VANDPSZrm, 0 },
- { X86::VBROADCASTSSZrkz, X86::VBROADCASTSSZmkz, TB_NO_REVERSE },
- { X86::VBROADCASTSDZrkz, X86::VBROADCASTSDZmkz, TB_NO_REVERSE },
{ X86::VCMPPDZrri, X86::VCMPPDZrmi, 0 },
{ X86::VCMPPSZrri, X86::VCMPPSZrmi, 0 },
{ X86::VCMPSDZrr, X86::VCMPSDZrm, 0 },
@@ -1842,6 +1881,7 @@ X86InstrInfo::X86InstrInfo(X86Subtarget &STI)
{ X86::VMINSDZrr_Int, X86::VMINSDZrm_Int, TB_NO_REVERSE },
{ X86::VMINSSZrr, X86::VMINSSZrm, 0 },
{ X86::VMINSSZrr_Int, X86::VMINSSZrm_Int, TB_NO_REVERSE },
+ { X86::VMOVLHPSZrr, X86::VMOVHPSZ128rm, TB_NO_REVERSE },
{ X86::VMULPDZrr, X86::VMULPDZrm, 0 },
{ X86::VMULPSZrr, X86::VMULPSZrm, 0 },
{ X86::VMULSDZrr, X86::VMULSDZrm, 0 },
@@ -1850,6 +1890,10 @@ X86InstrInfo::X86InstrInfo(X86Subtarget &STI)
{ X86::VMULSSZrr_Int, X86::VMULSSZrm_Int, TB_NO_REVERSE },
{ X86::VORPDZrr, X86::VORPDZrm, 0 },
{ X86::VORPSZrr, X86::VORPSZrm, 0 },
+ { X86::VPACKSSDWZrr, X86::VPACKSSDWZrm, 0 },
+ { X86::VPACKSSWBZrr, X86::VPACKSSWBZrm, 0 },
+ { X86::VPACKUSDWZrr, X86::VPACKUSDWZrm, 0 },
+ { X86::VPACKUSWBZrr, X86::VPACKUSWBZrm, 0 },
{ X86::VPADDBZrr, X86::VPADDBZrm, 0 },
{ X86::VPADDDZrr, X86::VPADDDZrm, 0 },
{ X86::VPADDQZrr, X86::VPADDQZrm, 0 },
@@ -1863,6 +1907,8 @@ X86InstrInfo::X86InstrInfo(X86Subtarget &STI)
{ X86::VPANDNDZrr, X86::VPANDNDZrm, 0 },
{ X86::VPANDNQZrr, X86::VPANDNQZrm, 0 },
{ X86::VPANDQZrr, X86::VPANDQZrm, 0 },
+ { X86::VPAVGBZrr, X86::VPAVGBZrm, 0 },
+ { X86::VPAVGWZrr, X86::VPAVGWZrm, 0 },
{ X86::VPCMPBZrri, X86::VPCMPBZrmi, 0 },
{ X86::VPCMPDZrri, X86::VPCMPDZrmi, 0 },
{ X86::VPCMPEQBZrr, X86::VPCMPEQBZrm, 0 },
@@ -1887,26 +1933,55 @@ X86InstrInfo::X86InstrInfo(X86Subtarget &STI)
{ X86::VPERMPSZrr, X86::VPERMPSZrm, 0 },
{ X86::VPERMQZrr, X86::VPERMQZrm, 0 },
{ X86::VPERMWZrr, X86::VPERMWZrm, 0 },
+ { X86::VPINSRBZrr, X86::VPINSRBZrm, 0 },
+ { X86::VPINSRDZrr, X86::VPINSRDZrm, 0 },
+ { X86::VPINSRQZrr, X86::VPINSRQZrm, 0 },
+ { X86::VPINSRWZrr, X86::VPINSRWZrm, 0 },
{ X86::VPMADDUBSWZrr, X86::VPMADDUBSWZrm, 0 },
{ X86::VPMADDWDZrr, X86::VPMADDWDZrm, 0 },
+ { X86::VPMAXSBZrr, X86::VPMAXSBZrm, 0 },
{ X86::VPMAXSDZrr, X86::VPMAXSDZrm, 0 },
{ X86::VPMAXSQZrr, X86::VPMAXSQZrm, 0 },
+ { X86::VPMAXSWZrr, X86::VPMAXSWZrm, 0 },
+ { X86::VPMAXUBZrr, X86::VPMAXUBZrm, 0 },
{ X86::VPMAXUDZrr, X86::VPMAXUDZrm, 0 },
{ X86::VPMAXUQZrr, X86::VPMAXUQZrm, 0 },
+ { X86::VPMAXUWZrr, X86::VPMAXUWZrm, 0 },
+ { X86::VPMINSBZrr, X86::VPMINSBZrm, 0 },
{ X86::VPMINSDZrr, X86::VPMINSDZrm, 0 },
{ X86::VPMINSQZrr, X86::VPMINSQZrm, 0 },
+ { X86::VPMINSWZrr, X86::VPMINSWZrm, 0 },
+ { X86::VPMINUBZrr, X86::VPMINUBZrm, 0 },
{ X86::VPMINUDZrr, X86::VPMINUDZrm, 0 },
{ X86::VPMINUQZrr, X86::VPMINUQZrm, 0 },
+ { X86::VPMINUWZrr, X86::VPMINUWZrm, 0 },
{ X86::VPMULDQZrr, X86::VPMULDQZrm, 0 },
+ { X86::VPMULLDZrr, X86::VPMULLDZrm, 0 },
+ { X86::VPMULLQZrr, X86::VPMULLQZrm, 0 },
+ { X86::VPMULLWZrr, X86::VPMULLWZrm, 0 },
{ X86::VPMULUDQZrr, X86::VPMULUDQZrm, 0 },
{ X86::VPORDZrr, X86::VPORDZrm, 0 },
{ X86::VPORQZrr, X86::VPORQZrm, 0 },
+ { X86::VPSADBWZ512rr, X86::VPSADBWZ512rm, 0 },
{ X86::VPSHUFBZrr, X86::VPSHUFBZrm, 0 },
+ { X86::VPSLLDZrr, X86::VPSLLDZrm, 0 },
+ { X86::VPSLLQZrr, X86::VPSLLQZrm, 0 },
{ X86::VPSLLVDZrr, X86::VPSLLVDZrm, 0 },
{ X86::VPSLLVQZrr, X86::VPSLLVQZrm, 0 },
+ { X86::VPSLLVWZrr, X86::VPSLLVWZrm, 0 },
+ { X86::VPSLLWZrr, X86::VPSLLWZrm, 0 },
+ { X86::VPSRADZrr, X86::VPSRADZrm, 0 },
+ { X86::VPSRAQZrr, X86::VPSRAQZrm, 0 },
{ X86::VPSRAVDZrr, X86::VPSRAVDZrm, 0 },
+ { X86::VPSRAVQZrr, X86::VPSRAVQZrm, 0 },
+ { X86::VPSRAVWZrr, X86::VPSRAVWZrm, 0 },
+ { X86::VPSRAWZrr, X86::VPSRAWZrm, 0 },
+ { X86::VPSRLDZrr, X86::VPSRLDZrm, 0 },
+ { X86::VPSRLQZrr, X86::VPSRLQZrm, 0 },
{ X86::VPSRLVDZrr, X86::VPSRLVDZrm, 0 },
{ X86::VPSRLVQZrr, X86::VPSRLVQZrm, 0 },
+ { X86::VPSRLVWZrr, X86::VPSRLVWZrm, 0 },
+ { X86::VPSRLWZrr, X86::VPSRLWZrm, 0 },
{ X86::VPSUBBZrr, X86::VPSUBBZrm, 0 },
{ X86::VPSUBDZrr, X86::VPSUBDZrm, 0 },
{ X86::VPSUBQZrr, X86::VPSUBQZrm, 0 },
@@ -1957,9 +2032,6 @@ X86InstrInfo::X86InstrInfo(X86Subtarget &STI)
{ X86::VANDPDZ256rr, X86::VANDPDZ256rm, 0 },
{ X86::VANDPSZ128rr, X86::VANDPSZ128rm, 0 },
{ X86::VANDPSZ256rr, X86::VANDPSZ256rm, 0 },
- { X86::VBROADCASTSSZ128rkz, X86::VBROADCASTSSZ128mkz, TB_NO_REVERSE },
- { X86::VBROADCASTSSZ256rkz, X86::VBROADCASTSSZ256mkz, TB_NO_REVERSE },
- { X86::VBROADCASTSDZ256rkz, X86::VBROADCASTSDZ256mkz, TB_NO_REVERSE },
{ X86::VCMPPDZ128rri, X86::VCMPPDZ128rmi, 0 },
{ X86::VCMPPDZ256rri, X86::VCMPPDZ256rmi, 0 },
{ X86::VCMPPSZ128rri, X86::VCMPPSZ128rmi, 0 },
@@ -1996,6 +2068,14 @@ X86InstrInfo::X86InstrInfo(X86Subtarget &STI)
{ X86::VORPDZ256rr, X86::VORPDZ256rm, 0 },
{ X86::VORPSZ128rr, X86::VORPSZ128rm, 0 },
{ X86::VORPSZ256rr, X86::VORPSZ256rm, 0 },
+ { X86::VPACKSSDWZ256rr, X86::VPACKSSDWZ256rm, 0 },
+ { X86::VPACKSSDWZ128rr, X86::VPACKSSDWZ128rm, 0 },
+ { X86::VPACKSSWBZ256rr, X86::VPACKSSWBZ256rm, 0 },
+ { X86::VPACKSSWBZ128rr, X86::VPACKSSWBZ128rm, 0 },
+ { X86::VPACKUSDWZ256rr, X86::VPACKUSDWZ256rm, 0 },
+ { X86::VPACKUSDWZ128rr, X86::VPACKUSDWZ128rm, 0 },
+ { X86::VPACKUSWBZ256rr, X86::VPACKUSWBZ256rm, 0 },
+ { X86::VPACKUSWBZ128rr, X86::VPACKUSWBZ128rm, 0 },
{ X86::VPADDBZ128rr, X86::VPADDBZ128rm, 0 },
{ X86::VPADDBZ256rr, X86::VPADDBZ256rm, 0 },
{ X86::VPADDDZ128rr, X86::VPADDDZ128rm, 0 },
@@ -2022,6 +2102,10 @@ X86InstrInfo::X86InstrInfo(X86Subtarget &STI)
{ X86::VPANDNQZ256rr, X86::VPANDNQZ256rm, 0 },
{ X86::VPANDQZ128rr, X86::VPANDQZ128rm, 0 },
{ X86::VPANDQZ256rr, X86::VPANDQZ256rm, 0 },
+ { X86::VPAVGBZ128rr, X86::VPAVGBZ128rm, 0 },
+ { X86::VPAVGBZ256rr, X86::VPAVGBZ256rm, 0 },
+ { X86::VPAVGWZ128rr, X86::VPAVGWZ128rm, 0 },
+ { X86::VPAVGWZ256rr, X86::VPAVGWZ256rm, 0 },
{ X86::VPCMPBZ128rri, X86::VPCMPBZ128rmi, 0 },
{ X86::VPCMPBZ256rri, X86::VPCMPBZ256rmi, 0 },
{ X86::VPCMPDZ128rri, X86::VPCMPDZ128rmi, 0 },
@@ -2070,12 +2154,92 @@ X86InstrInfo::X86InstrInfo(X86Subtarget &STI)
{ X86::VPMADDUBSWZ256rr, X86::VPMADDUBSWZ256rm, 0 },
{ X86::VPMADDWDZ128rr, X86::VPMADDWDZ128rm, 0 },
{ X86::VPMADDWDZ256rr, X86::VPMADDWDZ256rm, 0 },
+ { X86::VPMAXSBZ128rr, X86::VPMAXSBZ128rm, 0 },
+ { X86::VPMAXSBZ256rr, X86::VPMAXSBZ256rm, 0 },
+ { X86::VPMAXSDZ128rr, X86::VPMAXSDZ128rm, 0 },
+ { X86::VPMAXSDZ256rr, X86::VPMAXSDZ256rm, 0 },
+ { X86::VPMAXSQZ128rr, X86::VPMAXSQZ128rm, 0 },
+ { X86::VPMAXSQZ256rr, X86::VPMAXSQZ256rm, 0 },
+ { X86::VPMAXSWZ128rr, X86::VPMAXSWZ128rm, 0 },
+ { X86::VPMAXSWZ256rr, X86::VPMAXSWZ256rm, 0 },
+ { X86::VPMAXUBZ128rr, X86::VPMAXUBZ128rm, 0 },
+ { X86::VPMAXUBZ256rr, X86::VPMAXUBZ256rm, 0 },
+ { X86::VPMAXUDZ128rr, X86::VPMAXUDZ128rm, 0 },
+ { X86::VPMAXUDZ256rr, X86::VPMAXUDZ256rm, 0 },
+ { X86::VPMAXUQZ128rr, X86::VPMAXUQZ128rm, 0 },
+ { X86::VPMAXUQZ256rr, X86::VPMAXUQZ256rm, 0 },
+ { X86::VPMAXUWZ128rr, X86::VPMAXUWZ128rm, 0 },
+ { X86::VPMAXUWZ256rr, X86::VPMAXUWZ256rm, 0 },
+ { X86::VPMINSBZ128rr, X86::VPMINSBZ128rm, 0 },
+ { X86::VPMINSBZ256rr, X86::VPMINSBZ256rm, 0 },
+ { X86::VPMINSDZ128rr, X86::VPMINSDZ128rm, 0 },
+ { X86::VPMINSDZ256rr, X86::VPMINSDZ256rm, 0 },
+ { X86::VPMINSQZ128rr, X86::VPMINSQZ128rm, 0 },
+ { X86::VPMINSQZ256rr, X86::VPMINSQZ256rm, 0 },
+ { X86::VPMINSWZ128rr, X86::VPMINSWZ128rm, 0 },
+ { X86::VPMINSWZ256rr, X86::VPMINSWZ256rm, 0 },
+ { X86::VPMINUBZ128rr, X86::VPMINUBZ128rm, 0 },
+ { X86::VPMINUBZ256rr, X86::VPMINUBZ256rm, 0 },
+ { X86::VPMINUDZ128rr, X86::VPMINUDZ128rm, 0 },
+ { X86::VPMINUDZ256rr, X86::VPMINUDZ256rm, 0 },
+ { X86::VPMINUQZ128rr, X86::VPMINUQZ128rm, 0 },
+ { X86::VPMINUQZ256rr, X86::VPMINUQZ256rm, 0 },
+ { X86::VPMINUWZ128rr, X86::VPMINUWZ128rm, 0 },
+ { X86::VPMINUWZ256rr, X86::VPMINUWZ256rm, 0 },
+ { X86::VPMULDQZ128rr, X86::VPMULDQZ128rm, 0 },
+ { X86::VPMULDQZ256rr, X86::VPMULDQZ256rm, 0 },
+ { X86::VPMULLDZ128rr, X86::VPMULLDZ128rm, 0 },
+ { X86::VPMULLDZ256rr, X86::VPMULLDZ256rm, 0 },
+ { X86::VPMULLQZ128rr, X86::VPMULLQZ128rm, 0 },
+ { X86::VPMULLQZ256rr, X86::VPMULLQZ256rm, 0 },
+ { X86::VPMULLWZ128rr, X86::VPMULLWZ128rm, 0 },
+ { X86::VPMULLWZ256rr, X86::VPMULLWZ256rm, 0 },
+ { X86::VPMULUDQZ128rr, X86::VPMULUDQZ128rm, 0 },
+ { X86::VPMULUDQZ256rr, X86::VPMULUDQZ256rm, 0 },
{ X86::VPORDZ128rr, X86::VPORDZ128rm, 0 },
{ X86::VPORDZ256rr, X86::VPORDZ256rm, 0 },
{ X86::VPORQZ128rr, X86::VPORQZ128rm, 0 },
{ X86::VPORQZ256rr, X86::VPORQZ256rm, 0 },
+ { X86::VPSADBWZ128rr, X86::VPSADBWZ128rm, 0 },
+ { X86::VPSADBWZ256rr, X86::VPSADBWZ256rm, 0 },
{ X86::VPSHUFBZ128rr, X86::VPSHUFBZ128rm, 0 },
{ X86::VPSHUFBZ256rr, X86::VPSHUFBZ256rm, 0 },
+ { X86::VPSLLDZ128rr, X86::VPSLLDZ128rm, 0 },
+ { X86::VPSLLDZ256rr, X86::VPSLLDZ256rm, 0 },
+ { X86::VPSLLQZ128rr, X86::VPSLLQZ128rm, 0 },
+ { X86::VPSLLQZ256rr, X86::VPSLLQZ256rm, 0 },
+ { X86::VPSLLVDZ128rr, X86::VPSLLVDZ128rm, 0 },
+ { X86::VPSLLVDZ256rr, X86::VPSLLVDZ256rm, 0 },
+ { X86::VPSLLVQZ128rr, X86::VPSLLVQZ128rm, 0 },
+ { X86::VPSLLVQZ256rr, X86::VPSLLVQZ256rm, 0 },
+ { X86::VPSLLVWZ128rr, X86::VPSLLVWZ128rm, 0 },
+ { X86::VPSLLVWZ256rr, X86::VPSLLVWZ256rm, 0 },
+ { X86::VPSLLWZ128rr, X86::VPSLLWZ128rm, 0 },
+ { X86::VPSLLWZ256rr, X86::VPSLLWZ256rm, 0 },
+ { X86::VPSRADZ128rr, X86::VPSRADZ128rm, 0 },
+ { X86::VPSRADZ256rr, X86::VPSRADZ256rm, 0 },
+ { X86::VPSRAQZ128rr, X86::VPSRAQZ128rm, 0 },
+ { X86::VPSRAQZ256rr, X86::VPSRAQZ256rm, 0 },
+ { X86::VPSRAVDZ128rr, X86::VPSRAVDZ128rm, 0 },
+ { X86::VPSRAVDZ256rr, X86::VPSRAVDZ256rm, 0 },
+ { X86::VPSRAVQZ128rr, X86::VPSRAVQZ128rm, 0 },
+ { X86::VPSRAVQZ256rr, X86::VPSRAVQZ256rm, 0 },
+ { X86::VPSRAVWZ128rr, X86::VPSRAVWZ128rm, 0 },
+ { X86::VPSRAVWZ256rr, X86::VPSRAVWZ256rm, 0 },
+ { X86::VPSRAWZ128rr, X86::VPSRAWZ128rm, 0 },
+ { X86::VPSRAWZ256rr, X86::VPSRAWZ256rm, 0 },
+ { X86::VPSRLDZ128rr, X86::VPSRLDZ128rm, 0 },
+ { X86::VPSRLDZ256rr, X86::VPSRLDZ256rm, 0 },
+ { X86::VPSRLQZ128rr, X86::VPSRLQZ128rm, 0 },
+ { X86::VPSRLQZ256rr, X86::VPSRLQZ256rm, 0 },
+ { X86::VPSRLVDZ128rr, X86::VPSRLVDZ128rm, 0 },
+ { X86::VPSRLVDZ256rr, X86::VPSRLVDZ256rm, 0 },
+ { X86::VPSRLVQZ128rr, X86::VPSRLVQZ128rm, 0 },
+ { X86::VPSRLVQZ256rr, X86::VPSRLVQZ256rm, 0 },
+ { X86::VPSRLVWZ128rr, X86::VPSRLVWZ128rm, 0 },
+ { X86::VPSRLVWZ256rr, X86::VPSRLVWZ256rm, 0 },
+ { X86::VPSRLWZ128rr, X86::VPSRLWZ128rm, 0 },
+ { X86::VPSRLWZ256rr, X86::VPSRLWZ256rm, 0 },
{ X86::VPSUBBZ128rr, X86::VPSUBBZ128rm, 0 },
{ X86::VPSUBBZ256rr, X86::VPSUBBZ256rm, 0 },
{ X86::VPSUBDZ128rr, X86::VPSUBDZ128rm, 0 },
@@ -2112,6 +2276,10 @@ X86InstrInfo::X86InstrInfo(X86Subtarget &STI)
{ X86::VPXORDZ256rr, X86::VPXORDZ256rm, 0 },
{ X86::VPXORQZ128rr, X86::VPXORQZ128rm, 0 },
{ X86::VPXORQZ256rr, X86::VPXORQZ256rm, 0 },
+ { X86::VSHUFPDZ128rri, X86::VSHUFPDZ128rmi, 0 },
+ { X86::VSHUFPDZ256rri, X86::VSHUFPDZ256rmi, 0 },
+ { X86::VSHUFPSZ128rri, X86::VSHUFPSZ128rmi, 0 },
+ { X86::VSHUFPSZ256rri, X86::VSHUFPSZ256rmi, 0 },
{ X86::VSUBPDZ128rr, X86::VSUBPDZ128rm, 0 },
{ X86::VSUBPDZ256rr, X86::VSUBPDZ256rm, 0 },
{ X86::VSUBPSZ128rr, X86::VSUBPSZ128rm, 0 },
@@ -2130,6 +2298,12 @@ X86InstrInfo::X86InstrInfo(X86Subtarget &STI)
{ X86::VXORPSZ256rr, X86::VXORPSZ256rm, 0 },
// AVX-512 masked foldable instructions
+ { X86::VBROADCASTSSZrkz, X86::VBROADCASTSSZmkz, TB_NO_REVERSE },
+ { X86::VBROADCASTSDZrkz, X86::VBROADCASTSDZmkz, TB_NO_REVERSE },
+ { X86::VPABSBZrrkz, X86::VPABSBZrmkz, 0 },
+ { X86::VPABSDZrrkz, X86::VPABSDZrmkz, 0 },
+ { X86::VPABSQZrrkz, X86::VPABSQZrmkz, 0 },
+ { X86::VPABSWZrrkz, X86::VPABSWZrmkz, 0 },
{ X86::VPERMILPDZrikz, X86::VPERMILPDZmikz, 0 },
{ X86::VPERMILPSZrikz, X86::VPERMILPSZmikz, 0 },
{ X86::VPERMPDZrikz, X86::VPERMPDZmikz, 0 },
@@ -2149,8 +2323,23 @@ X86InstrInfo::X86InstrInfo(X86Subtarget &STI)
{ X86::VPSHUFDZrikz, X86::VPSHUFDZmikz, 0 },
{ X86::VPSHUFHWZrikz, X86::VPSHUFHWZmikz, 0 },
{ X86::VPSHUFLWZrikz, X86::VPSHUFLWZmikz, 0 },
+ { X86::VPSLLDZrikz, X86::VPSLLDZmikz, 0 },
+ { X86::VPSLLQZrikz, X86::VPSLLQZmikz, 0 },
+ { X86::VPSLLWZrikz, X86::VPSLLWZmikz, 0 },
+ { X86::VPSRADZrikz, X86::VPSRADZmikz, 0 },
+ { X86::VPSRAQZrikz, X86::VPSRAQZmikz, 0 },
+ { X86::VPSRAWZrikz, X86::VPSRAWZmikz, 0 },
+ { X86::VPSRLDZrikz, X86::VPSRLDZmikz, 0 },
+ { X86::VPSRLQZrikz, X86::VPSRLQZmikz, 0 },
+ { X86::VPSRLWZrikz, X86::VPSRLWZmikz, 0 },
// AVX-512VL 256-bit masked foldable instructions
+ { X86::VBROADCASTSDZ256rkz, X86::VBROADCASTSDZ256mkz, TB_NO_REVERSE },
+ { X86::VBROADCASTSSZ256rkz, X86::VBROADCASTSSZ256mkz, TB_NO_REVERSE },
+ { X86::VPABSBZ256rrkz, X86::VPABSBZ256rmkz, 0 },
+ { X86::VPABSDZ256rrkz, X86::VPABSDZ256rmkz, 0 },
+ { X86::VPABSQZ256rrkz, X86::VPABSQZ256rmkz, 0 },
+ { X86::VPABSWZ256rrkz, X86::VPABSWZ256rmkz, 0 },
{ X86::VPERMILPDZ256rikz, X86::VPERMILPDZ256mikz, 0 },
{ X86::VPERMILPSZ256rikz, X86::VPERMILPSZ256mikz, 0 },
{ X86::VPERMPDZ256rikz, X86::VPERMPDZ256mikz, 0 },
@@ -2170,8 +2359,22 @@ X86InstrInfo::X86InstrInfo(X86Subtarget &STI)
{ X86::VPSHUFDZ256rikz, X86::VPSHUFDZ256mikz, 0 },
{ X86::VPSHUFHWZ256rikz, X86::VPSHUFHWZ256mikz, 0 },
{ X86::VPSHUFLWZ256rikz, X86::VPSHUFLWZ256mikz, 0 },
+ { X86::VPSLLDZ256rikz, X86::VPSLLDZ256mikz, 0 },
+ { X86::VPSLLQZ256rikz, X86::VPSLLQZ256mikz, 0 },
+ { X86::VPSLLWZ256rikz, X86::VPSLLWZ256mikz, 0 },
+ { X86::VPSRADZ256rikz, X86::VPSRADZ256mikz, 0 },
+ { X86::VPSRAQZ256rikz, X86::VPSRAQZ256mikz, 0 },
+ { X86::VPSRAWZ256rikz, X86::VPSRAWZ256mikz, 0 },
+ { X86::VPSRLDZ256rikz, X86::VPSRLDZ256mikz, 0 },
+ { X86::VPSRLQZ256rikz, X86::VPSRLQZ256mikz, 0 },
+ { X86::VPSRLWZ256rikz, X86::VPSRLWZ256mikz, 0 },
// AVX-512VL 128-bit masked foldable instructions
+ { X86::VBROADCASTSSZ128rkz, X86::VBROADCASTSSZ128mkz, TB_NO_REVERSE },
+ { X86::VPABSBZ128rrkz, X86::VPABSBZ128rmkz, 0 },
+ { X86::VPABSDZ128rrkz, X86::VPABSDZ128rmkz, 0 },
+ { X86::VPABSQZ128rrkz, X86::VPABSQZ128rmkz, 0 },
+ { X86::VPABSWZ128rrkz, X86::VPABSWZ128rmkz, 0 },
{ X86::VPERMILPDZ128rikz, X86::VPERMILPDZ128mikz, 0 },
{ X86::VPERMILPSZ128rikz, X86::VPERMILPSZ128mikz, 0 },
{ X86::VPMOVSXBDZ128rrkz, X86::VPMOVSXBDZ128rmkz, TB_NO_REVERSE },
@@ -2189,6 +2392,15 @@ X86InstrInfo::X86InstrInfo(X86Subtarget &STI)
{ X86::VPSHUFDZ128rikz, X86::VPSHUFDZ128mikz, 0 },
{ X86::VPSHUFHWZ128rikz, X86::VPSHUFHWZ128mikz, 0 },
{ X86::VPSHUFLWZ128rikz, X86::VPSHUFLWZ128mikz, 0 },
+ { X86::VPSLLDZ128rikz, X86::VPSLLDZ128mikz, 0 },
+ { X86::VPSLLQZ128rikz, X86::VPSLLQZ128mikz, 0 },
+ { X86::VPSLLWZ128rikz, X86::VPSLLWZ128mikz, 0 },
+ { X86::VPSRADZ128rikz, X86::VPSRADZ128mikz, 0 },
+ { X86::VPSRAQZ128rikz, X86::VPSRAQZ128mikz, 0 },
+ { X86::VPSRAWZ128rikz, X86::VPSRAWZ128mikz, 0 },
+ { X86::VPSRLDZ128rikz, X86::VPSRLDZ128mikz, 0 },
+ { X86::VPSRLQZ128rikz, X86::VPSRLQZ128mikz, 0 },
+ { X86::VPSRLWZ128rikz, X86::VPSRLWZ128mikz, 0 },
// AES foldable instructions
{ X86::AESDECLASTrr, X86::AESDECLASTrm, TB_ALIGN_16 },
@@ -2262,23 +2474,14 @@ X86InstrInfo::X86InstrInfo(X86Subtarget &STI)
// XOP foldable instructions
{ X86::VPCMOVrrr, X86::VPCMOVrrm, 0 },
- { X86::VPCMOVrrrY, X86::VPCMOVrrmY, 0 },
+ { X86::VPCMOVYrrr, X86::VPCMOVYrrm, 0 },
{ X86::VPERMIL2PDrr, X86::VPERMIL2PDrm, 0 },
- { X86::VPERMIL2PDrrY, X86::VPERMIL2PDrmY, 0 },
+ { X86::VPERMIL2PDYrr, X86::VPERMIL2PDYrm, 0 },
{ X86::VPERMIL2PSrr, X86::VPERMIL2PSrm, 0 },
- { X86::VPERMIL2PSrrY, X86::VPERMIL2PSrmY, 0 },
+ { X86::VPERMIL2PSYrr, X86::VPERMIL2PSYrm, 0 },
{ X86::VPPERMrrr, X86::VPPERMrrm, 0 },
// AVX-512 instructions with 3 source operands.
- { X86::VBLENDMPDZrr, X86::VBLENDMPDZrm, 0 },
- { X86::VBLENDMPSZrr, X86::VBLENDMPSZrm, 0 },
- { X86::VPBLENDMDZrr, X86::VPBLENDMDZrm, 0 },
- { X86::VPBLENDMQZrr, X86::VPBLENDMQZrm, 0 },
- { X86::VBROADCASTSSZrk, X86::VBROADCASTSSZmk, TB_NO_REVERSE },
- { X86::VBROADCASTSDZrk, X86::VBROADCASTSDZmk, TB_NO_REVERSE },
- { X86::VBROADCASTSSZ256rk, X86::VBROADCASTSSZ256mk, TB_NO_REVERSE },
- { X86::VBROADCASTSDZ256rk, X86::VBROADCASTSDZ256mk, TB_NO_REVERSE },
- { X86::VBROADCASTSSZ128rk, X86::VBROADCASTSSZ128mk, TB_NO_REVERSE },
{ X86::VPERMI2Brr, X86::VPERMI2Brm, 0 },
{ X86::VPERMI2Drr, X86::VPERMI2Drm, 0 },
{ X86::VPERMI2PSrr, X86::VPERMI2PSrm, 0 },
@@ -2329,6 +2532,8 @@ X86InstrInfo::X86InstrInfo(X86Subtarget &STI)
// AVX-512 masked instructions
{ X86::VADDPDZrrkz, X86::VADDPDZrmkz, 0 },
{ X86::VADDPSZrrkz, X86::VADDPSZrmkz, 0 },
+ { X86::VADDSDZrr_Intkz, X86::VADDSDZrm_Intkz, TB_NO_REVERSE },
+ { X86::VADDSSZrr_Intkz, X86::VADDSSZrm_Intkz, TB_NO_REVERSE },
{ X86::VALIGNDZrrikz, X86::VALIGNDZrmikz, 0 },
{ X86::VALIGNQZrrikz, X86::VALIGNQZrmikz, 0 },
{ X86::VANDNPDZrrkz, X86::VANDNPDZrmkz, 0 },
@@ -2337,6 +2542,8 @@ X86InstrInfo::X86InstrInfo(X86Subtarget &STI)
{ X86::VANDPSZrrkz, X86::VANDPSZrmkz, 0 },
{ X86::VDIVPDZrrkz, X86::VDIVPDZrmkz, 0 },
{ X86::VDIVPSZrrkz, X86::VDIVPSZrmkz, 0 },
+ { X86::VDIVSDZrr_Intkz, X86::VDIVSDZrm_Intkz, TB_NO_REVERSE },
+ { X86::VDIVSSZrr_Intkz, X86::VDIVSSZrm_Intkz, TB_NO_REVERSE },
{ X86::VINSERTF32x4Zrrkz, X86::VINSERTF32x4Zrmkz, 0 },
{ X86::VINSERTF32x8Zrrkz, X86::VINSERTF32x8Zrmkz, 0 },
{ X86::VINSERTF64x2Zrrkz, X86::VINSERTF64x2Zrmkz, 0 },
@@ -2349,14 +2556,24 @@ X86InstrInfo::X86InstrInfo(X86Subtarget &STI)
{ X86::VMAXCPSZrrkz, X86::VMAXCPSZrmkz, 0 },
{ X86::VMAXPDZrrkz, X86::VMAXPDZrmkz, 0 },
{ X86::VMAXPSZrrkz, X86::VMAXPSZrmkz, 0 },
+ { X86::VMAXSDZrr_Intkz, X86::VMAXSDZrm_Intkz, 0 },
+ { X86::VMAXSSZrr_Intkz, X86::VMAXSSZrm_Intkz, 0 },
{ X86::VMINCPDZrrkz, X86::VMINCPDZrmkz, 0 },
{ X86::VMINCPSZrrkz, X86::VMINCPSZrmkz, 0 },
{ X86::VMINPDZrrkz, X86::VMINPDZrmkz, 0 },
{ X86::VMINPSZrrkz, X86::VMINPSZrmkz, 0 },
+ { X86::VMINSDZrr_Intkz, X86::VMINSDZrm_Intkz, 0 },
+ { X86::VMINSSZrr_Intkz, X86::VMINSSZrm_Intkz, 0 },
{ X86::VMULPDZrrkz, X86::VMULPDZrmkz, 0 },
{ X86::VMULPSZrrkz, X86::VMULPSZrmkz, 0 },
+ { X86::VMULSDZrr_Intkz, X86::VMULSDZrm_Intkz, TB_NO_REVERSE },
+ { X86::VMULSSZrr_Intkz, X86::VMULSSZrm_Intkz, TB_NO_REVERSE },
{ X86::VORPDZrrkz, X86::VORPDZrmkz, 0 },
{ X86::VORPSZrrkz, X86::VORPSZrmkz, 0 },
+ { X86::VPACKSSDWZrrkz, X86::VPACKSSDWZrmkz, 0 },
+ { X86::VPACKSSWBZrrkz, X86::VPACKSSWBZrmkz, 0 },
+ { X86::VPACKUSDWZrrkz, X86::VPACKUSDWZrmkz, 0 },
+ { X86::VPACKUSWBZrrkz, X86::VPACKUSWBZrmkz, 0 },
{ X86::VPADDBZrrkz, X86::VPADDBZrmkz, 0 },
{ X86::VPADDDZrrkz, X86::VPADDDZrmkz, 0 },
{ X86::VPADDQZrrkz, X86::VPADDQZrmkz, 0 },
@@ -2370,6 +2587,8 @@ X86InstrInfo::X86InstrInfo(X86Subtarget &STI)
{ X86::VPANDNDZrrkz, X86::VPANDNDZrmkz, 0 },
{ X86::VPANDNQZrrkz, X86::VPANDNQZrmkz, 0 },
{ X86::VPANDQZrrkz, X86::VPANDQZrmkz, 0 },
+ { X86::VPAVGBZrrkz, X86::VPAVGBZrmkz, 0 },
+ { X86::VPAVGWZrrkz, X86::VPAVGWZrmkz, 0 },
{ X86::VPERMBZrrkz, X86::VPERMBZrmkz, 0 },
{ X86::VPERMDZrrkz, X86::VPERMDZrmkz, 0 },
{ X86::VPERMILPDZrrkz, X86::VPERMILPDZrmkz, 0 },
@@ -2380,9 +2599,48 @@ X86InstrInfo::X86InstrInfo(X86Subtarget &STI)
{ X86::VPERMWZrrkz, X86::VPERMWZrmkz, 0 },
{ X86::VPMADDUBSWZrrkz, X86::VPMADDUBSWZrmkz, 0 },
{ X86::VPMADDWDZrrkz, X86::VPMADDWDZrmkz, 0 },
+ { X86::VPMAXSBZrrkz, X86::VPMAXSBZrmkz, 0 },
+ { X86::VPMAXSDZrrkz, X86::VPMAXSDZrmkz, 0 },
+ { X86::VPMAXSQZrrkz, X86::VPMAXSQZrmkz, 0 },
+ { X86::VPMAXSWZrrkz, X86::VPMAXSWZrmkz, 0 },
+ { X86::VPMAXUBZrrkz, X86::VPMAXUBZrmkz, 0 },
+ { X86::VPMAXUDZrrkz, X86::VPMAXUDZrmkz, 0 },
+ { X86::VPMAXUQZrrkz, X86::VPMAXUQZrmkz, 0 },
+ { X86::VPMAXUWZrrkz, X86::VPMAXUWZrmkz, 0 },
+ { X86::VPMINSBZrrkz, X86::VPMINSBZrmkz, 0 },
+ { X86::VPMINSDZrrkz, X86::VPMINSDZrmkz, 0 },
+ { X86::VPMINSQZrrkz, X86::VPMINSQZrmkz, 0 },
+ { X86::VPMINSWZrrkz, X86::VPMINSWZrmkz, 0 },
+ { X86::VPMINUBZrrkz, X86::VPMINUBZrmkz, 0 },
+ { X86::VPMINUDZrrkz, X86::VPMINUDZrmkz, 0 },
+ { X86::VPMINUQZrrkz, X86::VPMINUQZrmkz, 0 },
+ { X86::VPMINUWZrrkz, X86::VPMINUWZrmkz, 0 },
+ { X86::VPMULLDZrrkz, X86::VPMULLDZrmkz, 0 },
+ { X86::VPMULLQZrrkz, X86::VPMULLQZrmkz, 0 },
+ { X86::VPMULLWZrrkz, X86::VPMULLWZrmkz, 0 },
+ { X86::VPMULDQZrrkz, X86::VPMULDQZrmkz, 0 },
+ { X86::VPMULUDQZrrkz, X86::VPMULUDQZrmkz, 0 },
{ X86::VPORDZrrkz, X86::VPORDZrmkz, 0 },
{ X86::VPORQZrrkz, X86::VPORQZrmkz, 0 },
{ X86::VPSHUFBZrrkz, X86::VPSHUFBZrmkz, 0 },
+ { X86::VPSLLDZrrkz, X86::VPSLLDZrmkz, 0 },
+ { X86::VPSLLQZrrkz, X86::VPSLLQZrmkz, 0 },
+ { X86::VPSLLVDZrrkz, X86::VPSLLVDZrmkz, 0 },
+ { X86::VPSLLVQZrrkz, X86::VPSLLVQZrmkz, 0 },
+ { X86::VPSLLVWZrrkz, X86::VPSLLVWZrmkz, 0 },
+ { X86::VPSLLWZrrkz, X86::VPSLLWZrmkz, 0 },
+ { X86::VPSRADZrrkz, X86::VPSRADZrmkz, 0 },
+ { X86::VPSRAQZrrkz, X86::VPSRAQZrmkz, 0 },
+ { X86::VPSRAVDZrrkz, X86::VPSRAVDZrmkz, 0 },
+ { X86::VPSRAVQZrrkz, X86::VPSRAVQZrmkz, 0 },
+ { X86::VPSRAVWZrrkz, X86::VPSRAVWZrmkz, 0 },
+ { X86::VPSRAWZrrkz, X86::VPSRAWZrmkz, 0 },
+ { X86::VPSRLDZrrkz, X86::VPSRLDZrmkz, 0 },
+ { X86::VPSRLQZrrkz, X86::VPSRLQZrmkz, 0 },
+ { X86::VPSRLVDZrrkz, X86::VPSRLVDZrmkz, 0 },
+ { X86::VPSRLVQZrrkz, X86::VPSRLVQZrmkz, 0 },
+ { X86::VPSRLVWZrrkz, X86::VPSRLVWZrmkz, 0 },
+ { X86::VPSRLWZrrkz, X86::VPSRLWZrmkz, 0 },
{ X86::VPSUBBZrrkz, X86::VPSUBBZrmkz, 0 },
{ X86::VPSUBDZrrkz, X86::VPSUBDZrmkz, 0 },
{ X86::VPSUBQZrrkz, X86::VPSUBQZrmkz, 0 },
@@ -2401,8 +2659,12 @@ X86InstrInfo::X86InstrInfo(X86Subtarget &STI)
{ X86::VPUNPCKLWDZrrkz, X86::VPUNPCKLWDZrmkz, 0 },
{ X86::VPXORDZrrkz, X86::VPXORDZrmkz, 0 },
{ X86::VPXORQZrrkz, X86::VPXORQZrmkz, 0 },
+ { X86::VSHUFPDZrrikz, X86::VSHUFPDZrmikz, 0 },
+ { X86::VSHUFPSZrrikz, X86::VSHUFPSZrmikz, 0 },
{ X86::VSUBPDZrrkz, X86::VSUBPDZrmkz, 0 },
{ X86::VSUBPSZrrkz, X86::VSUBPSZrmkz, 0 },
+ { X86::VSUBSDZrr_Intkz, X86::VSUBSDZrm_Intkz, TB_NO_REVERSE },
+ { X86::VSUBSSZrr_Intkz, X86::VSUBSSZrm_Intkz, TB_NO_REVERSE },
{ X86::VUNPCKHPDZrrkz, X86::VUNPCKHPDZrmkz, 0 },
{ X86::VUNPCKHPSZrrkz, X86::VUNPCKHPSZrmkz, 0 },
{ X86::VUNPCKLPDZrrkz, X86::VUNPCKLPDZrmkz, 0 },
@@ -2437,6 +2699,10 @@ X86InstrInfo::X86InstrInfo(X86Subtarget &STI)
{ X86::VMULPSZ256rrkz, X86::VMULPSZ256rmkz, 0 },
{ X86::VORPDZ256rrkz, X86::VORPDZ256rmkz, 0 },
{ X86::VORPSZ256rrkz, X86::VORPSZ256rmkz, 0 },
+ { X86::VPACKSSDWZ256rrkz, X86::VPACKSSDWZ256rmkz, 0 },
+ { X86::VPACKSSWBZ256rrkz, X86::VPACKSSWBZ256rmkz, 0 },
+ { X86::VPACKUSDWZ256rrkz, X86::VPACKUSDWZ256rmkz, 0 },
+ { X86::VPACKUSWBZ256rrkz, X86::VPACKUSWBZ256rmkz, 0 },
{ X86::VPADDBZ256rrkz, X86::VPADDBZ256rmkz, 0 },
{ X86::VPADDDZ256rrkz, X86::VPADDDZ256rmkz, 0 },
{ X86::VPADDQZ256rrkz, X86::VPADDQZ256rmkz, 0 },
@@ -2450,6 +2716,8 @@ X86InstrInfo::X86InstrInfo(X86Subtarget &STI)
{ X86::VPANDNDZ256rrkz, X86::VPANDNDZ256rmkz, 0 },
{ X86::VPANDNQZ256rrkz, X86::VPANDNQZ256rmkz, 0 },
{ X86::VPANDQZ256rrkz, X86::VPANDQZ256rmkz, 0 },
+ { X86::VPAVGBZ256rrkz, X86::VPAVGBZ256rmkz, 0 },
+ { X86::VPAVGWZ256rrkz, X86::VPAVGWZ256rmkz, 0 },
{ X86::VPERMBZ256rrkz, X86::VPERMBZ256rmkz, 0 },
{ X86::VPERMDZ256rrkz, X86::VPERMDZ256rmkz, 0 },
{ X86::VPERMILPDZ256rrkz, X86::VPERMILPDZ256rmkz, 0 },
@@ -2460,9 +2728,48 @@ X86InstrInfo::X86InstrInfo(X86Subtarget &STI)
{ X86::VPERMWZ256rrkz, X86::VPERMWZ256rmkz, 0 },
{ X86::VPMADDUBSWZ256rrkz, X86::VPMADDUBSWZ256rmkz, 0 },
{ X86::VPMADDWDZ256rrkz, X86::VPMADDWDZ256rmkz, 0 },
+ { X86::VPMAXSBZ256rrkz, X86::VPMAXSBZ256rmkz, 0 },
+ { X86::VPMAXSDZ256rrkz, X86::VPMAXSDZ256rmkz, 0 },
+ { X86::VPMAXSQZ256rrkz, X86::VPMAXSQZ256rmkz, 0 },
+ { X86::VPMAXSWZ256rrkz, X86::VPMAXSWZ256rmkz, 0 },
+ { X86::VPMAXUBZ256rrkz, X86::VPMAXUBZ256rmkz, 0 },
+ { X86::VPMAXUDZ256rrkz, X86::VPMAXUDZ256rmkz, 0 },
+ { X86::VPMAXUQZ256rrkz, X86::VPMAXUQZ256rmkz, 0 },
+ { X86::VPMAXUWZ256rrkz, X86::VPMAXUWZ256rmkz, 0 },
+ { X86::VPMINSBZ256rrkz, X86::VPMINSBZ256rmkz, 0 },
+ { X86::VPMINSDZ256rrkz, X86::VPMINSDZ256rmkz, 0 },
+ { X86::VPMINSQZ256rrkz, X86::VPMINSQZ256rmkz, 0 },
+ { X86::VPMINSWZ256rrkz, X86::VPMINSWZ256rmkz, 0 },
+ { X86::VPMINUBZ256rrkz, X86::VPMINUBZ256rmkz, 0 },
+ { X86::VPMINUDZ256rrkz, X86::VPMINUDZ256rmkz, 0 },
+ { X86::VPMINUQZ256rrkz, X86::VPMINUQZ256rmkz, 0 },
+ { X86::VPMINUWZ256rrkz, X86::VPMINUWZ256rmkz, 0 },
+ { X86::VPMULDQZ256rrkz, X86::VPMULDQZ256rmkz, 0 },
+ { X86::VPMULLDZ256rrkz, X86::VPMULLDZ256rmkz, 0 },
+ { X86::VPMULLQZ256rrkz, X86::VPMULLQZ256rmkz, 0 },
+ { X86::VPMULLWZ256rrkz, X86::VPMULLWZ256rmkz, 0 },
+ { X86::VPMULUDQZ256rrkz, X86::VPMULUDQZ256rmkz, 0 },
{ X86::VPORDZ256rrkz, X86::VPORDZ256rmkz, 0 },
{ X86::VPORQZ256rrkz, X86::VPORQZ256rmkz, 0 },
{ X86::VPSHUFBZ256rrkz, X86::VPSHUFBZ256rmkz, 0 },
+ { X86::VPSLLDZ256rrkz, X86::VPSLLDZ256rmkz, 0 },
+ { X86::VPSLLQZ256rrkz, X86::VPSLLQZ256rmkz, 0 },
+ { X86::VPSLLVDZ256rrkz, X86::VPSLLVDZ256rmkz, 0 },
+ { X86::VPSLLVQZ256rrkz, X86::VPSLLVQZ256rmkz, 0 },
+ { X86::VPSLLVWZ256rrkz, X86::VPSLLVWZ256rmkz, 0 },
+ { X86::VPSLLWZ256rrkz, X86::VPSLLWZ256rmkz, 0 },
+ { X86::VPSRADZ256rrkz, X86::VPSRADZ256rmkz, 0 },
+ { X86::VPSRAQZ256rrkz, X86::VPSRAQZ256rmkz, 0 },
+ { X86::VPSRAVDZ256rrkz, X86::VPSRAVDZ256rmkz, 0 },
+ { X86::VPSRAVQZ256rrkz, X86::VPSRAVQZ256rmkz, 0 },
+ { X86::VPSRAVWZ256rrkz, X86::VPSRAVWZ256rmkz, 0 },
+ { X86::VPSRAWZ256rrkz, X86::VPSRAWZ256rmkz, 0 },
+ { X86::VPSRLDZ256rrkz, X86::VPSRLDZ256rmkz, 0 },
+ { X86::VPSRLQZ256rrkz, X86::VPSRLQZ256rmkz, 0 },
+ { X86::VPSRLVDZ256rrkz, X86::VPSRLVDZ256rmkz, 0 },
+ { X86::VPSRLVQZ256rrkz, X86::VPSRLVQZ256rmkz, 0 },
+ { X86::VPSRLVWZ256rrkz, X86::VPSRLVWZ256rmkz, 0 },
+ { X86::VPSRLWZ256rrkz, X86::VPSRLWZ256rmkz, 0 },
{ X86::VPSUBBZ256rrkz, X86::VPSUBBZ256rmkz, 0 },
{ X86::VPSUBDZ256rrkz, X86::VPSUBDZ256rmkz, 0 },
{ X86::VPSUBQZ256rrkz, X86::VPSUBQZ256rmkz, 0 },
@@ -2481,6 +2788,8 @@ X86InstrInfo::X86InstrInfo(X86Subtarget &STI)
{ X86::VPUNPCKLWDZ256rrkz, X86::VPUNPCKLWDZ256rmkz, 0 },
{ X86::VPXORDZ256rrkz, X86::VPXORDZ256rmkz, 0 },
{ X86::VPXORQZ256rrkz, X86::VPXORQZ256rmkz, 0 },
+ { X86::VSHUFPDZ256rrikz, X86::VSHUFPDZ256rmikz, 0 },
+ { X86::VSHUFPSZ256rrikz, X86::VSHUFPSZ256rmikz, 0 },
{ X86::VSUBPDZ256rrkz, X86::VSUBPDZ256rmkz, 0 },
{ X86::VSUBPSZ256rrkz, X86::VSUBPSZ256rmkz, 0 },
{ X86::VUNPCKHPDZ256rrkz, X86::VUNPCKHPDZ256rmkz, 0 },
@@ -2513,6 +2822,10 @@ X86InstrInfo::X86InstrInfo(X86Subtarget &STI)
{ X86::VMULPSZ128rrkz, X86::VMULPSZ128rmkz, 0 },
{ X86::VORPDZ128rrkz, X86::VORPDZ128rmkz, 0 },
{ X86::VORPSZ128rrkz, X86::VORPSZ128rmkz, 0 },
+ { X86::VPACKSSDWZ128rrkz, X86::VPACKSSDWZ128rmkz, 0 },
+ { X86::VPACKSSWBZ128rrkz, X86::VPACKSSWBZ128rmkz, 0 },
+ { X86::VPACKUSDWZ128rrkz, X86::VPACKUSDWZ128rmkz, 0 },
+ { X86::VPACKUSWBZ128rrkz, X86::VPACKUSWBZ128rmkz, 0 },
{ X86::VPADDBZ128rrkz, X86::VPADDBZ128rmkz, 0 },
{ X86::VPADDDZ128rrkz, X86::VPADDDZ128rmkz, 0 },
{ X86::VPADDQZ128rrkz, X86::VPADDQZ128rmkz, 0 },
@@ -2526,15 +2839,56 @@ X86InstrInfo::X86InstrInfo(X86Subtarget &STI)
{ X86::VPANDNDZ128rrkz, X86::VPANDNDZ128rmkz, 0 },
{ X86::VPANDNQZ128rrkz, X86::VPANDNQZ128rmkz, 0 },
{ X86::VPANDQZ128rrkz, X86::VPANDQZ128rmkz, 0 },
+ { X86::VPAVGBZ128rrkz, X86::VPAVGBZ128rmkz, 0 },
+ { X86::VPAVGWZ128rrkz, X86::VPAVGWZ128rmkz, 0 },
{ X86::VPERMBZ128rrkz, X86::VPERMBZ128rmkz, 0 },
{ X86::VPERMILPDZ128rrkz, X86::VPERMILPDZ128rmkz, 0 },
{ X86::VPERMILPSZ128rrkz, X86::VPERMILPSZ128rmkz, 0 },
{ X86::VPERMWZ128rrkz, X86::VPERMWZ128rmkz, 0 },
{ X86::VPMADDUBSWZ128rrkz, X86::VPMADDUBSWZ128rmkz, 0 },
{ X86::VPMADDWDZ128rrkz, X86::VPMADDWDZ128rmkz, 0 },
+ { X86::VPMAXSBZ128rrkz, X86::VPMAXSBZ128rmkz, 0 },
+ { X86::VPMAXSDZ128rrkz, X86::VPMAXSDZ128rmkz, 0 },
+ { X86::VPMAXSQZ128rrkz, X86::VPMAXSQZ128rmkz, 0 },
+ { X86::VPMAXSWZ128rrkz, X86::VPMAXSWZ128rmkz, 0 },
+ { X86::VPMAXUBZ128rrkz, X86::VPMAXUBZ128rmkz, 0 },
+ { X86::VPMAXUDZ128rrkz, X86::VPMAXUDZ128rmkz, 0 },
+ { X86::VPMAXUQZ128rrkz, X86::VPMAXUQZ128rmkz, 0 },
+ { X86::VPMAXUWZ128rrkz, X86::VPMAXUWZ128rmkz, 0 },
+ { X86::VPMINSBZ128rrkz, X86::VPMINSBZ128rmkz, 0 },
+ { X86::VPMINSDZ128rrkz, X86::VPMINSDZ128rmkz, 0 },
+ { X86::VPMINSQZ128rrkz, X86::VPMINSQZ128rmkz, 0 },
+ { X86::VPMINSWZ128rrkz, X86::VPMINSWZ128rmkz, 0 },
+ { X86::VPMINUBZ128rrkz, X86::VPMINUBZ128rmkz, 0 },
+ { X86::VPMINUDZ128rrkz, X86::VPMINUDZ128rmkz, 0 },
+ { X86::VPMINUQZ128rrkz, X86::VPMINUQZ128rmkz, 0 },
+ { X86::VPMINUWZ128rrkz, X86::VPMINUWZ128rmkz, 0 },
+ { X86::VPMULDQZ128rrkz, X86::VPMULDQZ128rmkz, 0 },
+ { X86::VPMULLDZ128rrkz, X86::VPMULLDZ128rmkz, 0 },
+ { X86::VPMULLQZ128rrkz, X86::VPMULLQZ128rmkz, 0 },
+ { X86::VPMULLWZ128rrkz, X86::VPMULLWZ128rmkz, 0 },
+ { X86::VPMULUDQZ128rrkz, X86::VPMULUDQZ128rmkz, 0 },
{ X86::VPORDZ128rrkz, X86::VPORDZ128rmkz, 0 },
{ X86::VPORQZ128rrkz, X86::VPORQZ128rmkz, 0 },
{ X86::VPSHUFBZ128rrkz, X86::VPSHUFBZ128rmkz, 0 },
+ { X86::VPSLLDZ128rrkz, X86::VPSLLDZ128rmkz, 0 },
+ { X86::VPSLLQZ128rrkz, X86::VPSLLQZ128rmkz, 0 },
+ { X86::VPSLLVDZ128rrkz, X86::VPSLLVDZ128rmkz, 0 },
+ { X86::VPSLLVQZ128rrkz, X86::VPSLLVQZ128rmkz, 0 },
+ { X86::VPSLLVWZ128rrkz, X86::VPSLLVWZ128rmkz, 0 },
+ { X86::VPSLLWZ128rrkz, X86::VPSLLWZ128rmkz, 0 },
+ { X86::VPSRADZ128rrkz, X86::VPSRADZ128rmkz, 0 },
+ { X86::VPSRAQZ128rrkz, X86::VPSRAQZ128rmkz, 0 },
+ { X86::VPSRAVDZ128rrkz, X86::VPSRAVDZ128rmkz, 0 },
+ { X86::VPSRAVQZ128rrkz, X86::VPSRAVQZ128rmkz, 0 },
+ { X86::VPSRAVWZ128rrkz, X86::VPSRAVWZ128rmkz, 0 },
+ { X86::VPSRAWZ128rrkz, X86::VPSRAWZ128rmkz, 0 },
+ { X86::VPSRLDZ128rrkz, X86::VPSRLDZ128rmkz, 0 },
+ { X86::VPSRLQZ128rrkz, X86::VPSRLQZ128rmkz, 0 },
+ { X86::VPSRLVDZ128rrkz, X86::VPSRLVDZ128rmkz, 0 },
+ { X86::VPSRLVQZ128rrkz, X86::VPSRLVQZ128rmkz, 0 },
+ { X86::VPSRLVWZ128rrkz, X86::VPSRLVWZ128rmkz, 0 },
+ { X86::VPSRLWZ128rrkz, X86::VPSRLWZ128rmkz, 0 },
{ X86::VPSUBBZ128rrkz, X86::VPSUBBZ128rmkz, 0 },
{ X86::VPSUBDZ128rrkz, X86::VPSUBDZ128rmkz, 0 },
{ X86::VPSUBQZ128rrkz, X86::VPSUBQZ128rmkz, 0 },
@@ -2553,6 +2907,8 @@ X86InstrInfo::X86InstrInfo(X86Subtarget &STI)
{ X86::VPUNPCKLWDZ128rrkz, X86::VPUNPCKLWDZ128rmkz, 0 },
{ X86::VPXORDZ128rrkz, X86::VPXORDZ128rmkz, 0 },
{ X86::VPXORQZ128rrkz, X86::VPXORQZ128rmkz, 0 },
+ { X86::VSHUFPDZ128rrikz, X86::VSHUFPDZ128rmikz, 0 },
+ { X86::VSHUFPSZ128rrikz, X86::VSHUFPSZ128rmikz, 0 },
{ X86::VSUBPDZ128rrkz, X86::VSUBPDZ128rmkz, 0 },
{ X86::VSUBPSZ128rrkz, X86::VSUBPSZ128rmkz, 0 },
{ X86::VUNPCKHPDZ128rrkz, X86::VUNPCKHPDZ128rmkz, 0 },
@@ -2563,6 +2919,12 @@ X86InstrInfo::X86InstrInfo(X86Subtarget &STI)
{ X86::VXORPSZ128rrkz, X86::VXORPSZ128rmkz, 0 },
// AVX-512 masked foldable instructions
+ { X86::VBROADCASTSSZrk, X86::VBROADCASTSSZmk, TB_NO_REVERSE },
+ { X86::VBROADCASTSDZrk, X86::VBROADCASTSDZmk, TB_NO_REVERSE },
+ { X86::VPABSBZrrk, X86::VPABSBZrmk, 0 },
+ { X86::VPABSDZrrk, X86::VPABSDZrmk, 0 },
+ { X86::VPABSQZrrk, X86::VPABSQZrmk, 0 },
+ { X86::VPABSWZrrk, X86::VPABSWZrmk, 0 },
{ X86::VPERMILPDZrik, X86::VPERMILPDZmik, 0 },
{ X86::VPERMILPSZrik, X86::VPERMILPSZmik, 0 },
{ X86::VPERMPDZrik, X86::VPERMPDZmik, 0 },
@@ -2582,8 +2944,23 @@ X86InstrInfo::X86InstrInfo(X86Subtarget &STI)
{ X86::VPSHUFDZrik, X86::VPSHUFDZmik, 0 },
{ X86::VPSHUFHWZrik, X86::VPSHUFHWZmik, 0 },
{ X86::VPSHUFLWZrik, X86::VPSHUFLWZmik, 0 },
+ { X86::VPSLLDZrik, X86::VPSLLDZmik, 0 },
+ { X86::VPSLLQZrik, X86::VPSLLQZmik, 0 },
+ { X86::VPSLLWZrik, X86::VPSLLWZmik, 0 },
+ { X86::VPSRADZrik, X86::VPSRADZmik, 0 },
+ { X86::VPSRAQZrik, X86::VPSRAQZmik, 0 },
+ { X86::VPSRAWZrik, X86::VPSRAWZmik, 0 },
+ { X86::VPSRLDZrik, X86::VPSRLDZmik, 0 },
+ { X86::VPSRLQZrik, X86::VPSRLQZmik, 0 },
+ { X86::VPSRLWZrik, X86::VPSRLWZmik, 0 },
// AVX-512VL 256-bit masked foldable instructions
+ { X86::VBROADCASTSSZ256rk, X86::VBROADCASTSSZ256mk, TB_NO_REVERSE },
+ { X86::VBROADCASTSDZ256rk, X86::VBROADCASTSDZ256mk, TB_NO_REVERSE },
+ { X86::VPABSBZ256rrk, X86::VPABSBZ256rmk, 0 },
+ { X86::VPABSDZ256rrk, X86::VPABSDZ256rmk, 0 },
+ { X86::VPABSQZ256rrk, X86::VPABSQZ256rmk, 0 },
+ { X86::VPABSWZ256rrk, X86::VPABSWZ256rmk, 0 },
{ X86::VPERMILPDZ256rik, X86::VPERMILPDZ256mik, 0 },
{ X86::VPERMILPSZ256rik, X86::VPERMILPSZ256mik, 0 },
{ X86::VPERMPDZ256rik, X86::VPERMPDZ256mik, 0 },
@@ -2603,8 +2980,22 @@ X86InstrInfo::X86InstrInfo(X86Subtarget &STI)
{ X86::VPSHUFDZ256rik, X86::VPSHUFDZ256mik, 0 },
{ X86::VPSHUFHWZ256rik, X86::VPSHUFHWZ256mik, 0 },
{ X86::VPSHUFLWZ256rik, X86::VPSHUFLWZ256mik, 0 },
+ { X86::VPSLLDZ256rik, X86::VPSLLDZ256mik, 0 },
+ { X86::VPSLLQZ256rik, X86::VPSLLQZ256mik, 0 },
+ { X86::VPSLLWZ256rik, X86::VPSLLWZ256mik, 0 },
+ { X86::VPSRADZ256rik, X86::VPSRADZ256mik, 0 },
+ { X86::VPSRAQZ256rik, X86::VPSRAQZ256mik, 0 },
+ { X86::VPSRAWZ256rik, X86::VPSRAWZ256mik, 0 },
+ { X86::VPSRLDZ256rik, X86::VPSRLDZ256mik, 0 },
+ { X86::VPSRLQZ256rik, X86::VPSRLQZ256mik, 0 },
+ { X86::VPSRLWZ256rik, X86::VPSRLWZ256mik, 0 },
// AVX-512VL 128-bit masked foldable instructions
+ { X86::VBROADCASTSSZ128rk, X86::VBROADCASTSSZ128mk, TB_NO_REVERSE },
+ { X86::VPABSBZ128rrk, X86::VPABSBZ128rmk, 0 },
+ { X86::VPABSDZ128rrk, X86::VPABSDZ128rmk, 0 },
+ { X86::VPABSQZ128rrk, X86::VPABSQZ128rmk, 0 },
+ { X86::VPABSWZ128rrk, X86::VPABSWZ128rmk, 0 },
{ X86::VPERMILPDZ128rik, X86::VPERMILPDZ128mik, 0 },
{ X86::VPERMILPSZ128rik, X86::VPERMILPSZ128mik, 0 },
{ X86::VPMOVSXBDZ128rrk, X86::VPMOVSXBDZ128rmk, TB_NO_REVERSE },
@@ -2622,6 +3013,15 @@ X86InstrInfo::X86InstrInfo(X86Subtarget &STI)
{ X86::VPSHUFDZ128rik, X86::VPSHUFDZ128mik, 0 },
{ X86::VPSHUFHWZ128rik, X86::VPSHUFHWZ128mik, 0 },
{ X86::VPSHUFLWZ128rik, X86::VPSHUFLWZ128mik, 0 },
+ { X86::VPSLLDZ128rik, X86::VPSLLDZ128mik, 0 },
+ { X86::VPSLLQZ128rik, X86::VPSLLQZ128mik, 0 },
+ { X86::VPSLLWZ128rik, X86::VPSLLWZ128mik, 0 },
+ { X86::VPSRADZ128rik, X86::VPSRADZ128mik, 0 },
+ { X86::VPSRAQZ128rik, X86::VPSRAQZ128mik, 0 },
+ { X86::VPSRAWZ128rik, X86::VPSRAWZ128mik, 0 },
+ { X86::VPSRLDZ128rik, X86::VPSRLDZ128mik, 0 },
+ { X86::VPSRLQZ128rik, X86::VPSRLQZ128mik, 0 },
+ { X86::VPSRLWZ128rik, X86::VPSRLWZ128mik, 0 },
};
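
Each entry in the fold tables above pairs a register-form opcode with the opcode to use when one operand is folded into a memory reference, plus flags such as TB_ALIGN_16 (the memory operand must be suitably aligned), TB_FOLDED_STORE (the fold produces a store), or TB_NO_REVERSE (the fold cannot be undone). A minimal sketch of how such a table can be consumed (C++; hypothetical names, not the surrounding LLVM code, which registers these arrays into a map as the loop below shows):

// Illustrative lookup over a {RegOp, MemOp, Flags} table; not LLVM code.
#include <cstddef>
#include <cstdint>
#include <optional>

struct FoldEntry {
  unsigned RegOp;  // register-register opcode
  unsigned MemOp;  // opcode with one operand folded to memory
  uint16_t Flags;  // alignment / store / reversibility requirements
};

static std::optional<FoldEntry>
lookupFold(const FoldEntry *Table, size_t N, unsigned RegOp) {
  // Linear scan for clarity; tables of this size are better served by a
  // sorted array with binary search or a hash map built once at startup.
  for (size_t I = 0; I != N; ++I)
    if (Table[I].RegOp == RegOp)
      return Table[I];
  return std::nullopt;
}

Keeping the entries as flat, statically initialized arrays (as done here) avoids dynamic initialization cost and lets the constructor below insert them all into the real lookup structure in one pass.
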
for (X86MemoryFoldTableEntry Entry : MemoryFoldTable3) {
@@ -2651,6 +3051,8 @@ X86InstrInfo::X86InstrInfo(X86Subtarget &STI)
// AVX-512 foldable masked instructions
{ X86::VADDPDZrrk, X86::VADDPDZrmk, 0 },
{ X86::VADDPSZrrk, X86::VADDPSZrmk, 0 },
+ { X86::VADDSDZrr_Intk, X86::VADDSDZrm_Intk, TB_NO_REVERSE },
+ { X86::VADDSSZrr_Intk, X86::VADDSSZrm_Intk, TB_NO_REVERSE },
{ X86::VALIGNDZrrik, X86::VALIGNDZrmik, 0 },
{ X86::VALIGNQZrrik, X86::VALIGNQZrmik, 0 },
{ X86::VANDNPDZrrk, X86::VANDNPDZrmk, 0 },
@@ -2659,6 +3061,8 @@ X86InstrInfo::X86InstrInfo(X86Subtarget &STI)
{ X86::VANDPSZrrk, X86::VANDPSZrmk, 0 },
{ X86::VDIVPDZrrk, X86::VDIVPDZrmk, 0 },
{ X86::VDIVPSZrrk, X86::VDIVPSZrmk, 0 },
+ { X86::VDIVSDZrr_Intk, X86::VDIVSDZrm_Intk, TB_NO_REVERSE },
+ { X86::VDIVSSZrr_Intk, X86::VDIVSSZrm_Intk, TB_NO_REVERSE },
{ X86::VINSERTF32x4Zrrk, X86::VINSERTF32x4Zrmk, 0 },
{ X86::VINSERTF32x8Zrrk, X86::VINSERTF32x8Zrmk, 0 },
{ X86::VINSERTF64x2Zrrk, X86::VINSERTF64x2Zrmk, 0 },
@@ -2671,14 +3075,24 @@ X86InstrInfo::X86InstrInfo(X86Subtarget &STI)
{ X86::VMAXCPSZrrk, X86::VMAXCPSZrmk, 0 },
{ X86::VMAXPDZrrk, X86::VMAXPDZrmk, 0 },
{ X86::VMAXPSZrrk, X86::VMAXPSZrmk, 0 },
+ { X86::VMAXSDZrr_Intk, X86::VMAXSDZrm_Intk, 0 },
+ { X86::VMAXSSZrr_Intk, X86::VMAXSSZrm_Intk, 0 },
{ X86::VMINCPDZrrk, X86::VMINCPDZrmk, 0 },
{ X86::VMINCPSZrrk, X86::VMINCPSZrmk, 0 },
{ X86::VMINPDZrrk, X86::VMINPDZrmk, 0 },
{ X86::VMINPSZrrk, X86::VMINPSZrmk, 0 },
+ { X86::VMINSDZrr_Intk, X86::VMINSDZrm_Intk, 0 },
+ { X86::VMINSSZrr_Intk, X86::VMINSSZrm_Intk, 0 },
{ X86::VMULPDZrrk, X86::VMULPDZrmk, 0 },
{ X86::VMULPSZrrk, X86::VMULPSZrmk, 0 },
+ { X86::VMULSDZrr_Intk, X86::VMULSDZrm_Intk, TB_NO_REVERSE },
+ { X86::VMULSSZrr_Intk, X86::VMULSSZrm_Intk, TB_NO_REVERSE },
{ X86::VORPDZrrk, X86::VORPDZrmk, 0 },
{ X86::VORPSZrrk, X86::VORPSZrmk, 0 },
+ { X86::VPACKSSDWZrrk, X86::VPACKSSDWZrmk, 0 },
+ { X86::VPACKSSWBZrrk, X86::VPACKSSWBZrmk, 0 },
+ { X86::VPACKUSDWZrrk, X86::VPACKUSDWZrmk, 0 },
+ { X86::VPACKUSWBZrrk, X86::VPACKUSWBZrmk, 0 },
{ X86::VPADDBZrrk, X86::VPADDBZrmk, 0 },
{ X86::VPADDDZrrk, X86::VPADDDZrmk, 0 },
{ X86::VPADDQZrrk, X86::VPADDQZrmk, 0 },
@@ -2692,6 +3106,8 @@ X86InstrInfo::X86InstrInfo(X86Subtarget &STI)
{ X86::VPANDNDZrrk, X86::VPANDNDZrmk, 0 },
{ X86::VPANDNQZrrk, X86::VPANDNQZrmk, 0 },
{ X86::VPANDQZrrk, X86::VPANDQZrmk, 0 },
+ { X86::VPAVGBZrrk, X86::VPAVGBZrmk, 0 },
+ { X86::VPAVGWZrrk, X86::VPAVGWZrmk, 0 },
{ X86::VPERMBZrrk, X86::VPERMBZrmk, 0 },
{ X86::VPERMDZrrk, X86::VPERMDZrmk, 0 },
{ X86::VPERMI2Brrk, X86::VPERMI2Brmk, 0 },
@@ -2714,9 +3130,48 @@ X86InstrInfo::X86InstrInfo(X86Subtarget &STI)
{ X86::VPERMWZrrk, X86::VPERMWZrmk, 0 },
{ X86::VPMADDUBSWZrrk, X86::VPMADDUBSWZrmk, 0 },
{ X86::VPMADDWDZrrk, X86::VPMADDWDZrmk, 0 },
+ { X86::VPMAXSBZrrk, X86::VPMAXSBZrmk, 0 },
+ { X86::VPMAXSDZrrk, X86::VPMAXSDZrmk, 0 },
+ { X86::VPMAXSQZrrk, X86::VPMAXSQZrmk, 0 },
+ { X86::VPMAXSWZrrk, X86::VPMAXSWZrmk, 0 },
+ { X86::VPMAXUBZrrk, X86::VPMAXUBZrmk, 0 },
+ { X86::VPMAXUDZrrk, X86::VPMAXUDZrmk, 0 },
+ { X86::VPMAXUQZrrk, X86::VPMAXUQZrmk, 0 },
+ { X86::VPMAXUWZrrk, X86::VPMAXUWZrmk, 0 },
+ { X86::VPMINSBZrrk, X86::VPMINSBZrmk, 0 },
+ { X86::VPMINSDZrrk, X86::VPMINSDZrmk, 0 },
+ { X86::VPMINSQZrrk, X86::VPMINSQZrmk, 0 },
+ { X86::VPMINSWZrrk, X86::VPMINSWZrmk, 0 },
+ { X86::VPMINUBZrrk, X86::VPMINUBZrmk, 0 },
+ { X86::VPMINUDZrrk, X86::VPMINUDZrmk, 0 },
+ { X86::VPMINUQZrrk, X86::VPMINUQZrmk, 0 },
+ { X86::VPMINUWZrrk, X86::VPMINUWZrmk, 0 },
+ { X86::VPMULDQZrrk, X86::VPMULDQZrmk, 0 },
+ { X86::VPMULLDZrrk, X86::VPMULLDZrmk, 0 },
+ { X86::VPMULLQZrrk, X86::VPMULLQZrmk, 0 },
+ { X86::VPMULLWZrrk, X86::VPMULLWZrmk, 0 },
+ { X86::VPMULUDQZrrk, X86::VPMULUDQZrmk, 0 },
{ X86::VPORDZrrk, X86::VPORDZrmk, 0 },
{ X86::VPORQZrrk, X86::VPORQZrmk, 0 },
{ X86::VPSHUFBZrrk, X86::VPSHUFBZrmk, 0 },
+ { X86::VPSLLDZrrk, X86::VPSLLDZrmk, 0 },
+ { X86::VPSLLQZrrk, X86::VPSLLQZrmk, 0 },
+ { X86::VPSLLVDZrrk, X86::VPSLLVDZrmk, 0 },
+ { X86::VPSLLVQZrrk, X86::VPSLLVQZrmk, 0 },
+ { X86::VPSLLVWZrrk, X86::VPSLLVWZrmk, 0 },
+ { X86::VPSLLWZrrk, X86::VPSLLWZrmk, 0 },
+ { X86::VPSRADZrrk, X86::VPSRADZrmk, 0 },
+ { X86::VPSRAQZrrk, X86::VPSRAQZrmk, 0 },
+ { X86::VPSRAVDZrrk, X86::VPSRAVDZrmk, 0 },
+ { X86::VPSRAVQZrrk, X86::VPSRAVQZrmk, 0 },
+ { X86::VPSRAVWZrrk, X86::VPSRAVWZrmk, 0 },
+ { X86::VPSRAWZrrk, X86::VPSRAWZrmk, 0 },
+ { X86::VPSRLDZrrk, X86::VPSRLDZrmk, 0 },
+ { X86::VPSRLQZrrk, X86::VPSRLQZrmk, 0 },
+ { X86::VPSRLVDZrrk, X86::VPSRLVDZrmk, 0 },
+ { X86::VPSRLVQZrrk, X86::VPSRLVQZrmk, 0 },
+ { X86::VPSRLVWZrrk, X86::VPSRLVWZrmk, 0 },
+ { X86::VPSRLWZrrk, X86::VPSRLWZrmk, 0 },
{ X86::VPSUBBZrrk, X86::VPSUBBZrmk, 0 },
{ X86::VPSUBDZrrk, X86::VPSUBDZrmk, 0 },
{ X86::VPSUBQZrrk, X86::VPSUBQZrmk, 0 },
@@ -2736,8 +3191,12 @@ X86InstrInfo::X86InstrInfo(X86Subtarget &STI)
{ X86::VPUNPCKLWDZrrk, X86::VPUNPCKLWDZrmk, 0 },
{ X86::VPXORDZrrk, X86::VPXORDZrmk, 0 },
{ X86::VPXORQZrrk, X86::VPXORQZrmk, 0 },
+ { X86::VSHUFPDZrrik, X86::VSHUFPDZrmik, 0 },
+ { X86::VSHUFPSZrrik, X86::VSHUFPSZrmik, 0 },
{ X86::VSUBPDZrrk, X86::VSUBPDZrmk, 0 },
{ X86::VSUBPSZrrk, X86::VSUBPSZrmk, 0 },
+ { X86::VSUBSDZrr_Intk, X86::VSUBSDZrm_Intk, TB_NO_REVERSE },
+ { X86::VSUBSSZrr_Intk, X86::VSUBSSZrm_Intk, TB_NO_REVERSE },
{ X86::VUNPCKHPDZrrk, X86::VUNPCKHPDZrmk, 0 },
{ X86::VUNPCKHPSZrrk, X86::VUNPCKHPSZrmk, 0 },
{ X86::VUNPCKLPDZrrk, X86::VUNPCKLPDZrmk, 0 },
@@ -2772,6 +3231,10 @@ X86InstrInfo::X86InstrInfo(X86Subtarget &STI)
{ X86::VMULPSZ256rrk, X86::VMULPSZ256rmk, 0 },
{ X86::VORPDZ256rrk, X86::VORPDZ256rmk, 0 },
{ X86::VORPSZ256rrk, X86::VORPSZ256rmk, 0 },
+ { X86::VPACKSSDWZ256rrk, X86::VPACKSSDWZ256rmk, 0 },
+ { X86::VPACKSSWBZ256rrk, X86::VPACKSSWBZ256rmk, 0 },
+ { X86::VPACKUSDWZ256rrk, X86::VPACKUSDWZ256rmk, 0 },
+ { X86::VPACKUSWBZ256rrk, X86::VPACKUSWBZ256rmk, 0 },
{ X86::VPADDBZ256rrk, X86::VPADDBZ256rmk, 0 },
{ X86::VPADDDZ256rrk, X86::VPADDDZ256rmk, 0 },
{ X86::VPADDQZ256rrk, X86::VPADDQZ256rmk, 0 },
@@ -2785,6 +3248,8 @@ X86InstrInfo::X86InstrInfo(X86Subtarget &STI)
{ X86::VPANDNDZ256rrk, X86::VPANDNDZ256rmk, 0 },
{ X86::VPANDNQZ256rrk, X86::VPANDNQZ256rmk, 0 },
{ X86::VPANDQZ256rrk, X86::VPANDQZ256rmk, 0 },
+ { X86::VPAVGBZ256rrk, X86::VPAVGBZ256rmk, 0 },
+ { X86::VPAVGWZ256rrk, X86::VPAVGWZ256rmk, 0 },
{ X86::VPERMBZ256rrk, X86::VPERMBZ256rmk, 0 },
{ X86::VPERMDZ256rrk, X86::VPERMDZ256rmk, 0 },
{ X86::VPERMI2B256rrk, X86::VPERMI2B256rmk, 0 },
@@ -2807,9 +3272,48 @@ X86InstrInfo::X86InstrInfo(X86Subtarget &STI)
{ X86::VPERMWZ256rrk, X86::VPERMWZ256rmk, 0 },
{ X86::VPMADDUBSWZ256rrk, X86::VPMADDUBSWZ256rmk, 0 },
{ X86::VPMADDWDZ256rrk, X86::VPMADDWDZ256rmk, 0 },
+ { X86::VPMAXSBZ256rrk, X86::VPMAXSBZ256rmk, 0 },
+ { X86::VPMAXSDZ256rrk, X86::VPMAXSDZ256rmk, 0 },
+ { X86::VPMAXSQZ256rrk, X86::VPMAXSQZ256rmk, 0 },
+ { X86::VPMAXSWZ256rrk, X86::VPMAXSWZ256rmk, 0 },
+ { X86::VPMAXUBZ256rrk, X86::VPMAXUBZ256rmk, 0 },
+ { X86::VPMAXUDZ256rrk, X86::VPMAXUDZ256rmk, 0 },
+ { X86::VPMAXUQZ256rrk, X86::VPMAXUQZ256rmk, 0 },
+ { X86::VPMAXUWZ256rrk, X86::VPMAXUWZ256rmk, 0 },
+ { X86::VPMINSBZ256rrk, X86::VPMINSBZ256rmk, 0 },
+ { X86::VPMINSDZ256rrk, X86::VPMINSDZ256rmk, 0 },
+ { X86::VPMINSQZ256rrk, X86::VPMINSQZ256rmk, 0 },
+ { X86::VPMINSWZ256rrk, X86::VPMINSWZ256rmk, 0 },
+ { X86::VPMINUBZ256rrk, X86::VPMINUBZ256rmk, 0 },
+ { X86::VPMINUDZ256rrk, X86::VPMINUDZ256rmk, 0 },
+ { X86::VPMINUQZ256rrk, X86::VPMINUQZ256rmk, 0 },
+ { X86::VPMINUWZ256rrk, X86::VPMINUWZ256rmk, 0 },
+ { X86::VPMULDQZ256rrk, X86::VPMULDQZ256rmk, 0 },
+ { X86::VPMULLDZ256rrk, X86::VPMULLDZ256rmk, 0 },
+ { X86::VPMULLQZ256rrk, X86::VPMULLQZ256rmk, 0 },
+ { X86::VPMULLWZ256rrk, X86::VPMULLWZ256rmk, 0 },
+ { X86::VPMULUDQZ256rrk, X86::VPMULUDQZ256rmk, 0 },
{ X86::VPORDZ256rrk, X86::VPORDZ256rmk, 0 },
{ X86::VPORQZ256rrk, X86::VPORQZ256rmk, 0 },
{ X86::VPSHUFBZ256rrk, X86::VPSHUFBZ256rmk, 0 },
+ { X86::VPSLLDZ256rrk, X86::VPSLLDZ256rmk, 0 },
+ { X86::VPSLLQZ256rrk, X86::VPSLLQZ256rmk, 0 },
+ { X86::VPSLLVDZ256rrk, X86::VPSLLVDZ256rmk, 0 },
+ { X86::VPSLLVQZ256rrk, X86::VPSLLVQZ256rmk, 0 },
+ { X86::VPSLLVWZ256rrk, X86::VPSLLVWZ256rmk, 0 },
+ { X86::VPSLLWZ256rrk, X86::VPSLLWZ256rmk, 0 },
+ { X86::VPSRADZ256rrk, X86::VPSRADZ256rmk, 0 },
+ { X86::VPSRAQZ256rrk, X86::VPSRAQZ256rmk, 0 },
+ { X86::VPSRAVDZ256rrk, X86::VPSRAVDZ256rmk, 0 },
+ { X86::VPSRAVQZ256rrk, X86::VPSRAVQZ256rmk, 0 },
+ { X86::VPSRAVWZ256rrk, X86::VPSRAVWZ256rmk, 0 },
+ { X86::VPSRAWZ256rrk, X86::VPSRAWZ256rmk, 0 },
+ { X86::VPSRLDZ256rrk, X86::VPSRLDZ256rmk, 0 },
+ { X86::VPSRLQZ256rrk, X86::VPSRLQZ256rmk, 0 },
+ { X86::VPSRLVDZ256rrk, X86::VPSRLVDZ256rmk, 0 },
+ { X86::VPSRLVQZ256rrk, X86::VPSRLVQZ256rmk, 0 },
+ { X86::VPSRLVWZ256rrk, X86::VPSRLVWZ256rmk, 0 },
+ { X86::VPSRLWZ256rrk, X86::VPSRLWZ256rmk, 0 },
{ X86::VPSUBBZ256rrk, X86::VPSUBBZ256rmk, 0 },
{ X86::VPSUBDZ256rrk, X86::VPSUBDZ256rmk, 0 },
{ X86::VPSUBQZ256rrk, X86::VPSUBQZ256rmk, 0 },
@@ -2830,6 +3334,8 @@ X86InstrInfo::X86InstrInfo(X86Subtarget &STI)
{ X86::VPUNPCKLWDZ256rrk, X86::VPUNPCKLWDZ256rmk, 0 },
{ X86::VPXORDZ256rrk, X86::VPXORDZ256rmk, 0 },
{ X86::VPXORQZ256rrk, X86::VPXORQZ256rmk, 0 },
+ { X86::VSHUFPDZ256rrik, X86::VSHUFPDZ256rmik, 0 },
+ { X86::VSHUFPSZ256rrik, X86::VSHUFPSZ256rmik, 0 },
{ X86::VSUBPDZ256rrk, X86::VSUBPDZ256rmk, 0 },
{ X86::VSUBPSZ256rrk, X86::VSUBPSZ256rmk, 0 },
{ X86::VUNPCKHPDZ256rrk, X86::VUNPCKHPDZ256rmk, 0 },
@@ -2862,6 +3368,10 @@ X86InstrInfo::X86InstrInfo(X86Subtarget &STI)
{ X86::VMULPSZ128rrk, X86::VMULPSZ128rmk, 0 },
{ X86::VORPDZ128rrk, X86::VORPDZ128rmk, 0 },
{ X86::VORPSZ128rrk, X86::VORPSZ128rmk, 0 },
+ { X86::VPACKSSDWZ128rrk, X86::VPACKSSDWZ128rmk, 0 },
+ { X86::VPACKSSWBZ128rrk, X86::VPACKSSWBZ128rmk, 0 },
+ { X86::VPACKUSDWZ128rrk, X86::VPACKUSDWZ128rmk, 0 },
+ { X86::VPACKUSWBZ128rrk, X86::VPACKUSWBZ128rmk, 0 },
{ X86::VPADDBZ128rrk, X86::VPADDBZ128rmk, 0 },
{ X86::VPADDDZ128rrk, X86::VPADDDZ128rmk, 0 },
{ X86::VPADDQZ128rrk, X86::VPADDQZ128rmk, 0 },
@@ -2875,6 +3385,8 @@ X86InstrInfo::X86InstrInfo(X86Subtarget &STI)
{ X86::VPANDNDZ128rrk, X86::VPANDNDZ128rmk, 0 },
{ X86::VPANDNQZ128rrk, X86::VPANDNQZ128rmk, 0 },
{ X86::VPANDQZ128rrk, X86::VPANDQZ128rmk, 0 },
+ { X86::VPAVGBZ128rrk, X86::VPAVGBZ128rmk, 0 },
+ { X86::VPAVGWZ128rrk, X86::VPAVGWZ128rmk, 0 },
{ X86::VPERMBZ128rrk, X86::VPERMBZ128rmk, 0 },
{ X86::VPERMI2B128rrk, X86::VPERMI2B128rmk, 0 },
{ X86::VPERMI2D128rrk, X86::VPERMI2D128rmk, 0 },
@@ -2893,9 +3405,48 @@ X86InstrInfo::X86InstrInfo(X86Subtarget &STI)
{ X86::VPERMWZ128rrk, X86::VPERMWZ128rmk, 0 },
{ X86::VPMADDUBSWZ128rrk, X86::VPMADDUBSWZ128rmk, 0 },
{ X86::VPMADDWDZ128rrk, X86::VPMADDWDZ128rmk, 0 },
+ { X86::VPMAXSBZ128rrk, X86::VPMAXSBZ128rmk, 0 },
+ { X86::VPMAXSDZ128rrk, X86::VPMAXSDZ128rmk, 0 },
+ { X86::VPMAXSQZ128rrk, X86::VPMAXSQZ128rmk, 0 },
+ { X86::VPMAXSWZ128rrk, X86::VPMAXSWZ128rmk, 0 },
+ { X86::VPMAXUBZ128rrk, X86::VPMAXUBZ128rmk, 0 },
+ { X86::VPMAXUDZ128rrk, X86::VPMAXUDZ128rmk, 0 },
+ { X86::VPMAXUQZ128rrk, X86::VPMAXUQZ128rmk, 0 },
+ { X86::VPMAXUWZ128rrk, X86::VPMAXUWZ128rmk, 0 },
+ { X86::VPMINSBZ128rrk, X86::VPMINSBZ128rmk, 0 },
+ { X86::VPMINSDZ128rrk, X86::VPMINSDZ128rmk, 0 },
+ { X86::VPMINSQZ128rrk, X86::VPMINSQZ128rmk, 0 },
+ { X86::VPMINSWZ128rrk, X86::VPMINSWZ128rmk, 0 },
+ { X86::VPMINUBZ128rrk, X86::VPMINUBZ128rmk, 0 },
+ { X86::VPMINUDZ128rrk, X86::VPMINUDZ128rmk, 0 },
+ { X86::VPMINUQZ128rrk, X86::VPMINUQZ128rmk, 0 },
+ { X86::VPMINUWZ128rrk, X86::VPMINUWZ128rmk, 0 },
+ { X86::VPMULDQZ128rrk, X86::VPMULDQZ128rmk, 0 },
+ { X86::VPMULLDZ128rrk, X86::VPMULLDZ128rmk, 0 },
+ { X86::VPMULLQZ128rrk, X86::VPMULLQZ128rmk, 0 },
+ { X86::VPMULLWZ128rrk, X86::VPMULLWZ128rmk, 0 },
+ { X86::VPMULUDQZ128rrk, X86::VPMULUDQZ128rmk, 0 },
{ X86::VPORDZ128rrk, X86::VPORDZ128rmk, 0 },
{ X86::VPORQZ128rrk, X86::VPORQZ128rmk, 0 },
{ X86::VPSHUFBZ128rrk, X86::VPSHUFBZ128rmk, 0 },
+ { X86::VPSLLDZ128rrk, X86::VPSLLDZ128rmk, 0 },
+ { X86::VPSLLQZ128rrk, X86::VPSLLQZ128rmk, 0 },
+ { X86::VPSLLVDZ128rrk, X86::VPSLLVDZ128rmk, 0 },
+ { X86::VPSLLVQZ128rrk, X86::VPSLLVQZ128rmk, 0 },
+ { X86::VPSLLVWZ128rrk, X86::VPSLLVWZ128rmk, 0 },
+ { X86::VPSLLWZ128rrk, X86::VPSLLWZ128rmk, 0 },
+ { X86::VPSRADZ128rrk, X86::VPSRADZ128rmk, 0 },
+ { X86::VPSRAQZ128rrk, X86::VPSRAQZ128rmk, 0 },
+ { X86::VPSRAVDZ128rrk, X86::VPSRAVDZ128rmk, 0 },
+ { X86::VPSRAVQZ128rrk, X86::VPSRAVQZ128rmk, 0 },
+ { X86::VPSRAVWZ128rrk, X86::VPSRAVWZ128rmk, 0 },
+ { X86::VPSRAWZ128rrk, X86::VPSRAWZ128rmk, 0 },
+ { X86::VPSRLDZ128rrk, X86::VPSRLDZ128rmk, 0 },
+ { X86::VPSRLQZ128rrk, X86::VPSRLQZ128rmk, 0 },
+ { X86::VPSRLVDZ128rrk, X86::VPSRLVDZ128rmk, 0 },
+ { X86::VPSRLVQZ128rrk, X86::VPSRLVQZ128rmk, 0 },
+ { X86::VPSRLVWZ128rrk, X86::VPSRLVWZ128rmk, 0 },
+ { X86::VPSRLWZ128rrk, X86::VPSRLWZ128rmk, 0 },
{ X86::VPSUBBZ128rrk, X86::VPSUBBZ128rmk, 0 },
{ X86::VPSUBDZ128rrk, X86::VPSUBDZ128rmk, 0 },
{ X86::VPSUBQZ128rrk, X86::VPSUBQZ128rmk, 0 },
@@ -2916,6 +3467,8 @@ X86InstrInfo::X86InstrInfo(X86Subtarget &STI)
{ X86::VPUNPCKLWDZ128rrk, X86::VPUNPCKLWDZ128rmk, 0 },
{ X86::VPXORDZ128rrk, X86::VPXORDZ128rmk, 0 },
{ X86::VPXORQZ128rrk, X86::VPXORQZ128rmk, 0 },
+ { X86::VSHUFPDZ128rrik, X86::VSHUFPDZ128rmik, 0 },
+ { X86::VSHUFPSZ128rrik, X86::VSHUFPSZ128rmik, 0 },
{ X86::VSUBPDZ128rrk, X86::VSUBPDZ128rmk, 0 },
{ X86::VSUBPSZ128rrk, X86::VSUBPSZ128rmk, 0 },
{ X86::VUNPCKHPDZ128rrk, X86::VUNPCKHPDZ128rmk, 0 },
@@ -3063,18 +3616,13 @@ int X86InstrInfo::getSPAdjust(const MachineInstr &MI) const {
const MachineFunction *MF = MI.getParent()->getParent();
const TargetFrameLowering *TFI = MF->getSubtarget().getFrameLowering();
- if (MI.getOpcode() == getCallFrameSetupOpcode() ||
- MI.getOpcode() == getCallFrameDestroyOpcode()) {
+ if (isFrameInstr(MI)) {
unsigned StackAlign = TFI->getStackAlignment();
- int SPAdj =
- (MI.getOperand(0).getImm() + StackAlign - 1) / StackAlign * StackAlign;
-
- SPAdj -= MI.getOperand(1).getImm();
-
- if (MI.getOpcode() == getCallFrameSetupOpcode())
- return SPAdj;
- else
- return -SPAdj;
+ int SPAdj = alignTo(getFrameSize(MI), StackAlign);
+ SPAdj -= getFrameAdjustment(MI);
+ if (!isFrameSetup(MI))
+ SPAdj = -SPAdj;
+ return SPAdj;
}
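
The rewritten arithmetic is worth a sanity check in isolation. A minimal standalone sketch, assuming getFrameSize(MI) reads operand 0's immediate and getFrameAdjustment(MI) reads operand 1's; the free functions below are stand-ins invented for the example:

#include <cassert>
#include <cstdint>

// Same rounding the deleted open-coded expression performed.
static int64_t alignTo(int64_t Value, int64_t Align) {
  return (Value + Align - 1) / Align * Align;
}

// FrameSize stands in for getFrameSize(MI), InternalAdj for
// getFrameAdjustment(MI).
static int64_t spAdjust(int64_t FrameSize, int64_t InternalAdj,
                        int64_t StackAlign, bool IsSetup) {
  int64_t SPAdj = alignTo(FrameSize, StackAlign) - InternalAdj;
  return IsSetup ? SPAdj : -SPAdj;
}

int main() {
  // 20 bytes of outgoing arguments, 16-byte stack alignment, 8 bytes
  // already adjusted inside the setup..destroy sequence (e.g. a push).
  assert(spAdjust(20, 8, 16, /*IsSetup=*/true) == 24);   // alignTo(20,16)=32
  assert(spAdjust(20, 8, 16, /*IsSetup=*/false) == -24); // destroy negates
}
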
// To know whether a call adjusts the stack, we need information
@@ -3569,7 +4117,7 @@ void X86InstrInfo::reMaterialize(MachineBasicBlock &MBB,
const DebugLoc &DL = Orig.getDebugLoc();
BuildMI(MBB, I, DL, get(X86::MOV32ri))
- .addOperand(Orig.getOperand(0))
+ .add(Orig.getOperand(0))
.addImm(Value);
} else {
MachineInstr *MI = MBB.getParent()->CloneMachineInstr(&Orig);
@@ -3654,10 +4202,10 @@ bool X86InstrInfo::classifyLEAReg(MachineInstr &MI, const MachineOperand &Src,
// Virtual register of the wrong class, we have to create a temporary 64-bit
// vreg to feed into the LEA.
NewSrc = MF.getRegInfo().createVirtualRegister(RC);
- MachineInstr *Copy = BuildMI(*MI.getParent(), MI, MI.getDebugLoc(),
- get(TargetOpcode::COPY))
- .addReg(NewSrc, RegState::Define | RegState::Undef, X86::sub_32bit)
- .addOperand(Src);
+ MachineInstr *Copy =
+ BuildMI(*MI.getParent(), MI, MI.getDebugLoc(), get(TargetOpcode::COPY))
+ .addReg(NewSrc, RegState::Define | RegState::Undef, X86::sub_32bit)
+ .add(Src);
// Which is obviously going to be dead after we're done with it.
isKill = true;
@@ -3823,10 +4371,10 @@ X86InstrInfo::convertToThreeAddress(MachineFunction::iterator &MFI,
return nullptr;
NewMI = BuildMI(MF, MI.getDebugLoc(), get(X86::LEA64r))
- .addOperand(Dest)
+ .add(Dest)
.addReg(0)
.addImm(1ULL << ShAmt)
- .addOperand(Src)
+ .add(Src)
.addImm(0)
.addReg(0);
break;
@@ -3848,14 +4396,14 @@ X86InstrInfo::convertToThreeAddress(MachineFunction::iterator &MFI,
MachineInstrBuilder MIB =
BuildMI(MF, MI.getDebugLoc(), get(Opc))
- .addOperand(Dest)
+ .add(Dest)
.addReg(0)
.addImm(1ULL << ShAmt)
.addReg(SrcReg, getKillRegState(isKill) | getUndefRegState(isUndef))
.addImm(0)
.addReg(0);
if (ImplicitOp.getReg() != 0)
- MIB.addOperand(ImplicitOp);
+ MIB.add(ImplicitOp);
NewMI = MIB;
break;
@@ -3869,10 +4417,10 @@ X86InstrInfo::convertToThreeAddress(MachineFunction::iterator &MFI,
return is64Bit ? convertToThreeAddressWithLEA(MIOpc, MFI, MI, LV)
: nullptr;
NewMI = BuildMI(MF, MI.getDebugLoc(), get(X86::LEA16r))
- .addOperand(Dest)
+ .add(Dest)
.addReg(0)
.addImm(1ULL << ShAmt)
- .addOperand(Src)
+ .add(Src)
.addImm(0)
.addReg(0);
break;
@@ -3891,11 +4439,11 @@ X86InstrInfo::convertToThreeAddress(MachineFunction::iterator &MFI,
MachineInstrBuilder MIB =
BuildMI(MF, MI.getDebugLoc(), get(Opc))
- .addOperand(Dest)
+ .add(Dest)
.addReg(SrcReg,
getKillRegState(isKill) | getUndefRegState(isUndef));
if (ImplicitOp.getReg() != 0)
- MIB.addOperand(ImplicitOp);
+ MIB.add(ImplicitOp);
NewMI = addOffset(MIB, 1);
break;
@@ -3905,10 +4453,8 @@ X86InstrInfo::convertToThreeAddress(MachineFunction::iterator &MFI,
return is64Bit ? convertToThreeAddressWithLEA(MIOpc, MFI, MI, LV)
: nullptr;
assert(MI.getNumOperands() >= 2 && "Unknown inc instruction!");
- NewMI = addOffset(BuildMI(MF, MI.getDebugLoc(), get(X86::LEA16r))
- .addOperand(Dest)
- .addOperand(Src),
- 1);
+ NewMI = addOffset(
+ BuildMI(MF, MI.getDebugLoc(), get(X86::LEA16r)).add(Dest).add(Src), 1);
break;
case X86::DEC64r:
case X86::DEC32r: {
@@ -3924,11 +4470,11 @@ X86InstrInfo::convertToThreeAddress(MachineFunction::iterator &MFI,
return nullptr;
MachineInstrBuilder MIB = BuildMI(MF, MI.getDebugLoc(), get(Opc))
- .addOperand(Dest)
+ .add(Dest)
.addReg(SrcReg, getUndefRegState(isUndef) |
getKillRegState(isKill));
if (ImplicitOp.getReg() != 0)
- MIB.addOperand(ImplicitOp);
+ MIB.add(ImplicitOp);
NewMI = addOffset(MIB, -1);
@@ -3939,10 +4485,8 @@ X86InstrInfo::convertToThreeAddress(MachineFunction::iterator &MFI,
return is64Bit ? convertToThreeAddressWithLEA(MIOpc, MFI, MI, LV)
: nullptr;
assert(MI.getNumOperands() >= 2 && "Unknown dec instruction!");
- NewMI = addOffset(BuildMI(MF, MI.getDebugLoc(), get(X86::LEA16r))
- .addOperand(Dest)
- .addOperand(Src),
- -1);
+ NewMI = addOffset(
+ BuildMI(MF, MI.getDebugLoc(), get(X86::LEA16r)).add(Dest).add(Src), -1);
break;
case X86::ADD64rr:
case X86::ADD64rr_DB:
@@ -3970,12 +4514,11 @@ X86InstrInfo::convertToThreeAddress(MachineFunction::iterator &MFI,
SrcReg2, isKill2, isUndef2, ImplicitOp2, LV))
return nullptr;
- MachineInstrBuilder MIB =
- BuildMI(MF, MI.getDebugLoc(), get(Opc)).addOperand(Dest);
+ MachineInstrBuilder MIB = BuildMI(MF, MI.getDebugLoc(), get(Opc)).add(Dest);
if (ImplicitOp.getReg() != 0)
- MIB.addOperand(ImplicitOp);
+ MIB.add(ImplicitOp);
if (ImplicitOp2.getReg() != 0)
- MIB.addOperand(ImplicitOp2);
+ MIB.add(ImplicitOp2);
NewMI = addRegReg(MIB, SrcReg, isKill, SrcReg2, isKill2);
@@ -3995,9 +4538,8 @@ X86InstrInfo::convertToThreeAddress(MachineFunction::iterator &MFI,
assert(MI.getNumOperands() >= 3 && "Unknown add instruction!");
unsigned Src2 = MI.getOperand(2).getReg();
bool isKill2 = MI.getOperand(2).isKill();
- NewMI = addRegReg(
- BuildMI(MF, MI.getDebugLoc(), get(X86::LEA16r)).addOperand(Dest),
- Src.getReg(), Src.isKill(), Src2, isKill2);
+ NewMI = addRegReg(BuildMI(MF, MI.getDebugLoc(), get(X86::LEA16r)).add(Dest),
+ Src.getReg(), Src.isKill(), Src2, isKill2);
// Preserve undefness of the operands.
bool isUndef = MI.getOperand(1).isUndef();
@@ -4014,10 +4556,9 @@ X86InstrInfo::convertToThreeAddress(MachineFunction::iterator &MFI,
case X86::ADD64ri32_DB:
case X86::ADD64ri8_DB:
assert(MI.getNumOperands() >= 3 && "Unknown add instruction!");
- NewMI = addOffset(BuildMI(MF, MI.getDebugLoc(), get(X86::LEA64r))
- .addOperand(Dest)
- .addOperand(Src),
- MI.getOperand(2));
+ NewMI = addOffset(
+ BuildMI(MF, MI.getDebugLoc(), get(X86::LEA64r)).add(Dest).add(Src),
+ MI.getOperand(2));
break;
case X86::ADD32ri:
case X86::ADD32ri8:
@@ -4034,11 +4575,11 @@ X86InstrInfo::convertToThreeAddress(MachineFunction::iterator &MFI,
return nullptr;
MachineInstrBuilder MIB = BuildMI(MF, MI.getDebugLoc(), get(Opc))
- .addOperand(Dest)
+ .add(Dest)
.addReg(SrcReg, getUndefRegState(isUndef) |
getKillRegState(isKill));
if (ImplicitOp.getReg() != 0)
- MIB.addOperand(ImplicitOp);
+ MIB.add(ImplicitOp);
NewMI = addOffset(MIB, MI.getOperand(2));
break;
@@ -4051,12 +4592,136 @@ X86InstrInfo::convertToThreeAddress(MachineFunction::iterator &MFI,
return is64Bit ? convertToThreeAddressWithLEA(MIOpc, MFI, MI, LV)
: nullptr;
assert(MI.getNumOperands() >= 3 && "Unknown add instruction!");
- NewMI = addOffset(BuildMI(MF, MI.getDebugLoc(), get(X86::LEA16r))
- .addOperand(Dest)
- .addOperand(Src),
- MI.getOperand(2));
+ NewMI = addOffset(
+ BuildMI(MF, MI.getDebugLoc(), get(X86::LEA16r)).add(Dest).add(Src),
+ MI.getOperand(2));
+ break;
+
+ case X86::VMOVDQU8Z128rmk:
+ case X86::VMOVDQU8Z256rmk:
+ case X86::VMOVDQU8Zrmk:
+ case X86::VMOVDQU16Z128rmk:
+ case X86::VMOVDQU16Z256rmk:
+ case X86::VMOVDQU16Zrmk:
+ case X86::VMOVDQU32Z128rmk: case X86::VMOVDQA32Z128rmk:
+ case X86::VMOVDQU32Z256rmk: case X86::VMOVDQA32Z256rmk:
+ case X86::VMOVDQU32Zrmk: case X86::VMOVDQA32Zrmk:
+ case X86::VMOVDQU64Z128rmk: case X86::VMOVDQA64Z128rmk:
+ case X86::VMOVDQU64Z256rmk: case X86::VMOVDQA64Z256rmk:
+ case X86::VMOVDQU64Zrmk: case X86::VMOVDQA64Zrmk:
+ case X86::VMOVUPDZ128rmk: case X86::VMOVAPDZ128rmk:
+ case X86::VMOVUPDZ256rmk: case X86::VMOVAPDZ256rmk:
+ case X86::VMOVUPDZrmk: case X86::VMOVAPDZrmk:
+ case X86::VMOVUPSZ128rmk: case X86::VMOVAPSZ128rmk:
+ case X86::VMOVUPSZ256rmk: case X86::VMOVAPSZ256rmk:
+ case X86::VMOVUPSZrmk: case X86::VMOVAPSZrmk: {
+ unsigned Opc;
+ switch (MIOpc) {
+ default: llvm_unreachable("Unreachable!");
+ case X86::VMOVDQU8Z128rmk: Opc = X86::VPBLENDMBZ128rmk; break;
+ case X86::VMOVDQU8Z256rmk: Opc = X86::VPBLENDMBZ256rmk; break;
+ case X86::VMOVDQU8Zrmk: Opc = X86::VPBLENDMBZrmk; break;
+ case X86::VMOVDQU16Z128rmk: Opc = X86::VPBLENDMWZ128rmk; break;
+ case X86::VMOVDQU16Z256rmk: Opc = X86::VPBLENDMWZ256rmk; break;
+ case X86::VMOVDQU16Zrmk: Opc = X86::VPBLENDMWZrmk; break;
+ case X86::VMOVDQU32Z128rmk: Opc = X86::VPBLENDMDZ128rmk; break;
+ case X86::VMOVDQU32Z256rmk: Opc = X86::VPBLENDMDZ256rmk; break;
+ case X86::VMOVDQU32Zrmk: Opc = X86::VPBLENDMDZrmk; break;
+ case X86::VMOVDQU64Z128rmk: Opc = X86::VPBLENDMQZ128rmk; break;
+ case X86::VMOVDQU64Z256rmk: Opc = X86::VPBLENDMQZ256rmk; break;
+ case X86::VMOVDQU64Zrmk: Opc = X86::VPBLENDMQZrmk; break;
+ case X86::VMOVUPDZ128rmk: Opc = X86::VBLENDMPDZ128rmk; break;
+ case X86::VMOVUPDZ256rmk: Opc = X86::VBLENDMPDZ256rmk; break;
+ case X86::VMOVUPDZrmk: Opc = X86::VBLENDMPDZrmk; break;
+ case X86::VMOVUPSZ128rmk: Opc = X86::VBLENDMPSZ128rmk; break;
+ case X86::VMOVUPSZ256rmk: Opc = X86::VBLENDMPSZ256rmk; break;
+ case X86::VMOVUPSZrmk: Opc = X86::VBLENDMPSZrmk; break;
+ case X86::VMOVDQA32Z128rmk: Opc = X86::VPBLENDMDZ128rmk; break;
+ case X86::VMOVDQA32Z256rmk: Opc = X86::VPBLENDMDZ256rmk; break;
+ case X86::VMOVDQA32Zrmk: Opc = X86::VPBLENDMDZrmk; break;
+ case X86::VMOVDQA64Z128rmk: Opc = X86::VPBLENDMQZ128rmk; break;
+ case X86::VMOVDQA64Z256rmk: Opc = X86::VPBLENDMQZ256rmk; break;
+ case X86::VMOVDQA64Zrmk: Opc = X86::VPBLENDMQZrmk; break;
+ case X86::VMOVAPDZ128rmk: Opc = X86::VBLENDMPDZ128rmk; break;
+ case X86::VMOVAPDZ256rmk: Opc = X86::VBLENDMPDZ256rmk; break;
+ case X86::VMOVAPDZrmk: Opc = X86::VBLENDMPDZrmk; break;
+ case X86::VMOVAPSZ128rmk: Opc = X86::VBLENDMPSZ128rmk; break;
+ case X86::VMOVAPSZ256rmk: Opc = X86::VBLENDMPSZ256rmk; break;
+ case X86::VMOVAPSZrmk: Opc = X86::VBLENDMPSZrmk; break;
+ }
+
+ NewMI = BuildMI(MF, MI.getDebugLoc(), get(Opc))
+ .add(Dest)
+ .add(MI.getOperand(2))
+ .add(Src)
+ .add(MI.getOperand(3))
+ .add(MI.getOperand(4))
+ .add(MI.getOperand(5))
+ .add(MI.getOperand(6))
+ .add(MI.getOperand(7));
break;
}
+ case X86::VMOVDQU8Z128rrk:
+ case X86::VMOVDQU8Z256rrk:
+ case X86::VMOVDQU8Zrrk:
+ case X86::VMOVDQU16Z128rrk:
+ case X86::VMOVDQU16Z256rrk:
+ case X86::VMOVDQU16Zrrk:
+ case X86::VMOVDQU32Z128rrk: case X86::VMOVDQA32Z128rrk:
+ case X86::VMOVDQU32Z256rrk: case X86::VMOVDQA32Z256rrk:
+ case X86::VMOVDQU32Zrrk: case X86::VMOVDQA32Zrrk:
+ case X86::VMOVDQU64Z128rrk: case X86::VMOVDQA64Z128rrk:
+ case X86::VMOVDQU64Z256rrk: case X86::VMOVDQA64Z256rrk:
+ case X86::VMOVDQU64Zrrk: case X86::VMOVDQA64Zrrk:
+ case X86::VMOVUPDZ128rrk: case X86::VMOVAPDZ128rrk:
+ case X86::VMOVUPDZ256rrk: case X86::VMOVAPDZ256rrk:
+ case X86::VMOVUPDZrrk: case X86::VMOVAPDZrrk:
+ case X86::VMOVUPSZ128rrk: case X86::VMOVAPSZ128rrk:
+ case X86::VMOVUPSZ256rrk: case X86::VMOVAPSZ256rrk:
+ case X86::VMOVUPSZrrk: case X86::VMOVAPSZrrk: {
+ unsigned Opc;
+ switch (MIOpc) {
+ default: llvm_unreachable("Unreachable!");
+ case X86::VMOVDQU8Z128rrk: Opc = X86::VPBLENDMBZ128rrk; break;
+ case X86::VMOVDQU8Z256rrk: Opc = X86::VPBLENDMBZ256rrk; break;
+ case X86::VMOVDQU8Zrrk: Opc = X86::VPBLENDMBZrrk; break;
+ case X86::VMOVDQU16Z128rrk: Opc = X86::VPBLENDMWZ128rrk; break;
+ case X86::VMOVDQU16Z256rrk: Opc = X86::VPBLENDMWZ256rrk; break;
+ case X86::VMOVDQU16Zrrk: Opc = X86::VPBLENDMWZrrk; break;
+ case X86::VMOVDQU32Z128rrk: Opc = X86::VPBLENDMDZ128rrk; break;
+ case X86::VMOVDQU32Z256rrk: Opc = X86::VPBLENDMDZ256rrk; break;
+ case X86::VMOVDQU32Zrrk: Opc = X86::VPBLENDMDZrrk; break;
+ case X86::VMOVDQU64Z128rrk: Opc = X86::VPBLENDMQZ128rrk; break;
+ case X86::VMOVDQU64Z256rrk: Opc = X86::VPBLENDMQZ256rrk; break;
+ case X86::VMOVDQU64Zrrk: Opc = X86::VPBLENDMQZrrk; break;
+ case X86::VMOVUPDZ128rrk: Opc = X86::VBLENDMPDZ128rrk; break;
+ case X86::VMOVUPDZ256rrk: Opc = X86::VBLENDMPDZ256rrk; break;
+ case X86::VMOVUPDZrrk: Opc = X86::VBLENDMPDZrrk; break;
+ case X86::VMOVUPSZ128rrk: Opc = X86::VBLENDMPSZ128rrk; break;
+ case X86::VMOVUPSZ256rrk: Opc = X86::VBLENDMPSZ256rrk; break;
+ case X86::VMOVUPSZrrk: Opc = X86::VBLENDMPSZrrk; break;
+ case X86::VMOVDQA32Z128rrk: Opc = X86::VPBLENDMDZ128rrk; break;
+ case X86::VMOVDQA32Z256rrk: Opc = X86::VPBLENDMDZ256rrk; break;
+ case X86::VMOVDQA32Zrrk: Opc = X86::VPBLENDMDZrrk; break;
+ case X86::VMOVDQA64Z128rrk: Opc = X86::VPBLENDMQZ128rrk; break;
+ case X86::VMOVDQA64Z256rrk: Opc = X86::VPBLENDMQZ256rrk; break;
+ case X86::VMOVDQA64Zrrk: Opc = X86::VPBLENDMQZrrk; break;
+ case X86::VMOVAPDZ128rrk: Opc = X86::VBLENDMPDZ128rrk; break;
+ case X86::VMOVAPDZ256rrk: Opc = X86::VBLENDMPDZ256rrk; break;
+ case X86::VMOVAPDZrrk: Opc = X86::VBLENDMPDZrrk; break;
+ case X86::VMOVAPSZ128rrk: Opc = X86::VBLENDMPSZ128rrk; break;
+ case X86::VMOVAPSZ256rrk: Opc = X86::VBLENDMPSZ256rrk; break;
+ case X86::VMOVAPSZrrk: Opc = X86::VBLENDMPSZrrk; break;
+ }
+
+ NewMI = BuildMI(MF, MI.getDebugLoc(), get(Opc))
+ .add(Dest)
+ .add(MI.getOperand(2))
+ .add(Src)
+ .add(MI.getOperand(3));
+ break;
+ }
+ }
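
Both new case groups exploit the fact that a masked move and a masked blend compute the same per-lane select; the move just ties the passthru operand to the destination, while BLENDM reads it as an ordinary source, which is what makes a three-address form possible. A self-contained sketch of the shared lane semantics (the scalar containers are stand-ins, not the backend's representation):

#include <array>
#include <bitset>
#include <cstdint>

using V16 = std::array<uint32_t, 16>;

// Lane semantics shared by, e.g., VMOVDQA32Zrrk and VPBLENDMDZrrk: lane i
// takes Src when mask bit i is set, otherwise the passthru value survives.
V16 blendLanes(const V16 &Passthru, std::bitset<16> K, const V16 &Src) {
  V16 Dst;
  for (unsigned i = 0; i != 16; ++i)
    Dst[i] = K[i] ? Src[i] : Passthru[i];
  return Dst;
}
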
if (!NewMI) return nullptr;
@@ -4337,6 +5002,18 @@ MachineInstr *X86InstrInfo::commuteInstructionImpl(MachineInstr &MI, bool NewMI,
return TargetInstrInfo::commuteInstructionImpl(WorkingMI, /*NewMI=*/false,
OpIdx1, OpIdx2);
}
+ case X86::PFSUBrr:
+ case X86::PFSUBRrr: {
+ // PFSUB x, y: x = x - y
+ // PFSUBR x, y: x = y - x
+ unsigned Opc =
+ (X86::PFSUBRrr == MI.getOpcode() ? X86::PFSUBrr : X86::PFSUBRrr);
+ auto &WorkingMI = cloneIfNew(MI);
+ WorkingMI.setDesc(get(Opc));
+ return TargetInstrInfo::commuteInstructionImpl(WorkingMI, /*NewMI=*/false,
+ OpIdx1, OpIdx2);
+ }
case X86::BLENDPDrri:
case X86::BLENDPSrri:
case X86::PBLENDWrri:
@@ -4606,18 +5283,30 @@ MachineInstr *X86InstrInfo::commuteInstructionImpl(MachineInstr &MI, bool NewMI,
case X86::VPTERNLOGQZrri: case X86::VPTERNLOGQZrmi:
case X86::VPTERNLOGQZ128rri: case X86::VPTERNLOGQZ128rmi:
case X86::VPTERNLOGQZ256rri: case X86::VPTERNLOGQZ256rmi:
- case X86::VPTERNLOGDZrrik: case X86::VPTERNLOGDZrmik:
- case X86::VPTERNLOGDZ128rrik: case X86::VPTERNLOGDZ128rmik:
- case X86::VPTERNLOGDZ256rrik: case X86::VPTERNLOGDZ256rmik:
- case X86::VPTERNLOGQZrrik: case X86::VPTERNLOGQZrmik:
- case X86::VPTERNLOGQZ128rrik: case X86::VPTERNLOGQZ128rmik:
- case X86::VPTERNLOGQZ256rrik: case X86::VPTERNLOGQZ256rmik:
+ case X86::VPTERNLOGDZrrik:
+ case X86::VPTERNLOGDZ128rrik:
+ case X86::VPTERNLOGDZ256rrik:
+ case X86::VPTERNLOGQZrrik:
+ case X86::VPTERNLOGQZ128rrik:
+ case X86::VPTERNLOGQZ256rrik:
case X86::VPTERNLOGDZrrikz: case X86::VPTERNLOGDZrmikz:
case X86::VPTERNLOGDZ128rrikz: case X86::VPTERNLOGDZ128rmikz:
case X86::VPTERNLOGDZ256rrikz: case X86::VPTERNLOGDZ256rmikz:
case X86::VPTERNLOGQZrrikz: case X86::VPTERNLOGQZrmikz:
case X86::VPTERNLOGQZ128rrikz: case X86::VPTERNLOGQZ128rmikz:
- case X86::VPTERNLOGQZ256rrikz: case X86::VPTERNLOGQZ256rmikz: {
+ case X86::VPTERNLOGQZ256rrikz: case X86::VPTERNLOGQZ256rmikz:
+ case X86::VPTERNLOGDZ128rmbi:
+ case X86::VPTERNLOGDZ256rmbi:
+ case X86::VPTERNLOGDZrmbi:
+ case X86::VPTERNLOGQZ128rmbi:
+ case X86::VPTERNLOGQZ256rmbi:
+ case X86::VPTERNLOGQZrmbi:
+ case X86::VPTERNLOGDZ128rmbikz:
+ case X86::VPTERNLOGDZ256rmbikz:
+ case X86::VPTERNLOGDZrmbikz:
+ case X86::VPTERNLOGQZ128rmbikz:
+ case X86::VPTERNLOGQZ256rmbikz:
+ case X86::VPTERNLOGQZrmbikz: {
auto &WorkingMI = cloneIfNew(MI);
if (!commuteVPTERNLOG(WorkingMI, OpIdx1, OpIdx2))
return nullptr;
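
Commuting any VPTERNLOG form, including the broadcast variants added above, also requires rewriting the imm8 inside commuteVPTERNLOG (not shown in this hunk): the immediate is a truth table indexed by the three sources, so swapping two sources permutes the table's index bits. A hedged standalone model of that permutation for the first two sources:

#include <cassert>
#include <cstdint>

// Imm8 bit at index (a<<2)|(b<<1)|c holds the result for inputs a, b, c.
uint8_t swapFirstTwoSources(uint8_t Imm) {
  uint8_t New = 0;
  for (unsigned Idx = 0; Idx != 8; ++Idx) {
    unsigned A = (Idx >> 2) & 1, B = (Idx >> 1) & 1, C = Idx & 1;
    New |= ((Imm >> Idx) & 1) << ((B << 2) | (A << 1) | C);
  }
  return New;
}

int main() {
  // 0xCA computes a ? b : c; with the first two sources swapped it must
  // compute b ? a : c, whose truth table is 0xE2.
  assert(swapFirstTwoSources(0xCA) == 0xE2);
}
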
@@ -4798,18 +5487,30 @@ bool X86InstrInfo::findCommutedOpIndices(MachineInstr &MI, unsigned &SrcOpIdx1,
case X86::VPTERNLOGQZrri: case X86::VPTERNLOGQZrmi:
case X86::VPTERNLOGQZ128rri: case X86::VPTERNLOGQZ128rmi:
case X86::VPTERNLOGQZ256rri: case X86::VPTERNLOGQZ256rmi:
- case X86::VPTERNLOGDZrrik: case X86::VPTERNLOGDZrmik:
- case X86::VPTERNLOGDZ128rrik: case X86::VPTERNLOGDZ128rmik:
- case X86::VPTERNLOGDZ256rrik: case X86::VPTERNLOGDZ256rmik:
- case X86::VPTERNLOGQZrrik: case X86::VPTERNLOGQZrmik:
- case X86::VPTERNLOGQZ128rrik: case X86::VPTERNLOGQZ128rmik:
- case X86::VPTERNLOGQZ256rrik: case X86::VPTERNLOGQZ256rmik:
+ case X86::VPTERNLOGDZrrik:
+ case X86::VPTERNLOGDZ128rrik:
+ case X86::VPTERNLOGDZ256rrik:
+ case X86::VPTERNLOGQZrrik:
+ case X86::VPTERNLOGQZ128rrik:
+ case X86::VPTERNLOGQZ256rrik:
case X86::VPTERNLOGDZrrikz: case X86::VPTERNLOGDZrmikz:
case X86::VPTERNLOGDZ128rrikz: case X86::VPTERNLOGDZ128rmikz:
case X86::VPTERNLOGDZ256rrikz: case X86::VPTERNLOGDZ256rmikz:
case X86::VPTERNLOGQZrrikz: case X86::VPTERNLOGQZrmikz:
case X86::VPTERNLOGQZ128rrikz: case X86::VPTERNLOGQZ128rmikz:
case X86::VPTERNLOGQZ256rrikz: case X86::VPTERNLOGQZ256rmikz:
+ case X86::VPTERNLOGDZ128rmbi:
+ case X86::VPTERNLOGDZ256rmbi:
+ case X86::VPTERNLOGDZrmbi:
+ case X86::VPTERNLOGQZ128rmbi:
+ case X86::VPTERNLOGQZ256rmbi:
+ case X86::VPTERNLOGQZrmbi:
+ case X86::VPTERNLOGDZ128rmbikz:
+ case X86::VPTERNLOGDZ256rmbikz:
+ case X86::VPTERNLOGDZrmbikz:
+ case X86::VPTERNLOGQZ128rmbikz:
+ case X86::VPTERNLOGQZ256rmbikz:
+ case X86::VPTERNLOGQZrmbikz:
return findThreeSrcCommutedOpIndices(MI, SrcOpIdx1, SrcOpIdx2);
default:
const X86InstrFMA3Group *FMA3Group =
@@ -5108,6 +5809,95 @@ bool X86InstrInfo::isUnpredicatedTerminator(const MachineInstr &MI) const {
return !isPredicated(MI);
}
+bool X86InstrInfo::isUnconditionalTailCall(const MachineInstr &MI) const {
+ switch (MI.getOpcode()) {
+ case X86::TCRETURNdi:
+ case X86::TCRETURNri:
+ case X86::TCRETURNmi:
+ case X86::TCRETURNdi64:
+ case X86::TCRETURNri64:
+ case X86::TCRETURNmi64:
+ return true;
+ default:
+ return false;
+ }
+}
+
+bool X86InstrInfo::canMakeTailCallConditional(
+ SmallVectorImpl<MachineOperand> &BranchCond,
+ const MachineInstr &TailCall) const {
+ if (TailCall.getOpcode() != X86::TCRETURNdi &&
+ TailCall.getOpcode() != X86::TCRETURNdi64) {
+ // Only direct calls can be done with a conditional branch.
+ return false;
+ }
+
+ const MachineFunction *MF = TailCall.getParent()->getParent();
+ if (Subtarget.isTargetWin64() && MF->hasWinCFI()) {
+ // Conditional tail calls confuse the Win64 unwinder.
+ return false;
+ }
+
+ assert(BranchCond.size() == 1);
+ if (BranchCond[0].getImm() > X86::LAST_VALID_COND) {
+ // Can't make a conditional tail call with this condition.
+ return false;
+ }
+
+ const X86MachineFunctionInfo *X86FI = MF->getInfo<X86MachineFunctionInfo>();
+ if (X86FI->getTCReturnAddrDelta() != 0 ||
+ TailCall.getOperand(1).getImm() != 0) {
+ // A conditional tail call cannot do any stack adjustment.
+ return false;
+ }
+
+ return true;
+}
+
+void X86InstrInfo::replaceBranchWithTailCall(
+ MachineBasicBlock &MBB, SmallVectorImpl<MachineOperand> &BranchCond,
+ const MachineInstr &TailCall) const {
+ assert(canMakeTailCallConditional(BranchCond, TailCall));
+
+ MachineBasicBlock::iterator I = MBB.end();
+ while (I != MBB.begin()) {
+ --I;
+ if (I->isDebugValue())
+ continue;
+ assert(I->isBranch() && "Can't find the branch to replace!");
+
+ X86::CondCode CC = getCondFromBranchOpc(I->getOpcode());
+ assert(BranchCond.size() == 1);
+ if (CC != BranchCond[0].getImm())
+ continue;
+
+ break;
+ }
+
+ unsigned Opc = TailCall.getOpcode() == X86::TCRETURNdi ? X86::TCRETURNdicc
+ : X86::TCRETURNdi64cc;
+
+ auto MIB = BuildMI(MBB, I, MBB.findDebugLoc(I), get(Opc));
+ MIB.add(TailCall.getOperand(0)); // Destination.
+ MIB.addImm(0); // Stack offset (not used).
+ MIB.add(BranchCond[0]); // Condition.
+ MIB.copyImplicitOps(TailCall); // Regmask and (imp-used) parameters.
+
+ // Add implicit uses and defs of all live regs potentially clobbered by the
+ // call. This way they still appear live across the call.
+ LivePhysRegs LiveRegs(&getRegisterInfo());
+ LiveRegs.addLiveOuts(MBB);
+ SmallVector<std::pair<unsigned, const MachineOperand *>, 8> Clobbers;
+ LiveRegs.stepForward(*MIB, Clobbers);
+ for (const auto &C : Clobbers) {
+ MIB.addReg(C.first, RegState::Implicit);
+ MIB.addReg(C.first, RegState::Implicit | RegState::Define);
+ }
+
+ I->eraseFromParent();
+}
+
// Given a MBB and its TBB, find the FBB which was a fallthrough MBB (it may
// not be a fallthrough MBB now due to layout changes). Return nullptr if the
// fallthrough MBB cannot be identified.
@@ -5514,8 +6304,6 @@ static unsigned CopyToFromAsymmetricReg(unsigned &DestReg, unsigned &SrcReg,
// SrcReg(MaskReg) -> DestReg(GR64)
// SrcReg(MaskReg) -> DestReg(GR32)
- // SrcReg(MaskReg) -> DestReg(GR16)
- // SrcReg(MaskReg) -> DestReg(GR8)
 // All KMASK RegClasses hold the same k registers, so the register can be
 // tested against any of them.
if (X86::VK16RegClass.contains(SrcReg)) {
@@ -5525,20 +6313,10 @@ static unsigned CopyToFromAsymmetricReg(unsigned &DestReg, unsigned &SrcReg,
}
if (X86::GR32RegClass.contains(DestReg))
return Subtarget.hasBWI() ? X86::KMOVDrk : X86::KMOVWrk;
- if (X86::GR16RegClass.contains(DestReg)) {
- DestReg = getX86SubSuperRegister(DestReg, 32);
- return X86::KMOVWrk;
- }
- if (X86::GR8RegClass.contains(DestReg)) {
- DestReg = getX86SubSuperRegister(DestReg, 32);
- return Subtarget.hasDQI() ? X86::KMOVBrk : X86::KMOVWrk;
- }
}
// SrcReg(GR64) -> DestReg(MaskReg)
// SrcReg(GR32) -> DestReg(MaskReg)
- // SrcReg(GR16) -> DestReg(MaskReg)
- // SrcReg(GR8) -> DestReg(MaskReg)
 // All KMASK RegClasses hold the same k registers, so the register can be
 // tested against any of them.
if (X86::VK16RegClass.contains(DestReg)) {
@@ -5548,14 +6326,6 @@ static unsigned CopyToFromAsymmetricReg(unsigned &DestReg, unsigned &SrcReg,
}
if (X86::GR32RegClass.contains(SrcReg))
return Subtarget.hasBWI() ? X86::KMOVDkr : X86::KMOVWkr;
- if (X86::GR16RegClass.contains(SrcReg)) {
- SrcReg = getX86SubSuperRegister(SrcReg, 32);
- return X86::KMOVWkr;
- }
- if (X86::GR8RegClass.contains(SrcReg)) {
- SrcReg = getX86SubSuperRegister(SrcReg, 32);
- return Subtarget.hasDQI() ? X86::KMOVBkr : X86::KMOVWkr;
- }
}
@@ -5965,7 +6735,7 @@ void X86InstrInfo::storeRegToAddr(MachineFunction &MF, unsigned SrcReg,
DebugLoc DL;
MachineInstrBuilder MIB = BuildMI(MF, DL, get(Opc));
for (unsigned i = 0, e = Addr.size(); i != e; ++i)
- MIB.addOperand(Addr[i]);
+ MIB.add(Addr[i]);
MIB.addReg(SrcReg, getKillRegState(isKill));
(*MIB).setMemRefs(MMOBegin, MMOEnd);
NewMIs.push_back(MIB);
@@ -6000,7 +6770,7 @@ void X86InstrInfo::loadRegFromAddr(MachineFunction &MF, unsigned DestReg,
DebugLoc DL;
MachineInstrBuilder MIB = BuildMI(MF, DL, get(Opc), DestReg);
for (unsigned i = 0, e = Addr.size(); i != e; ++i)
- MIB.addOperand(Addr[i]);
+ MIB.add(Addr[i]);
(*MIB).setMemRefs(MMOBegin, MMOEnd);
NewMIs.push_back(MIB);
}
@@ -6017,12 +6787,14 @@ bool X86InstrInfo::analyzeCompare(const MachineInstr &MI, unsigned &SrcReg,
case X86::CMP16ri:
case X86::CMP16ri8:
case X86::CMP8ri:
- if (!MI.getOperand(1).isImm())
- return false;
SrcReg = MI.getOperand(0).getReg();
SrcReg2 = 0;
- CmpMask = ~0;
- CmpValue = MI.getOperand(1).getImm();
+ if (MI.getOperand(1).isImm()) {
+ CmpMask = ~0;
+ CmpValue = MI.getOperand(1).getImm();
+ } else {
+ CmpMask = CmpValue = 0;
+ }
return true;
// A SUB can be used to perform comparison.
case X86::SUB64rm:
@@ -6031,7 +6803,7 @@ bool X86InstrInfo::analyzeCompare(const MachineInstr &MI, unsigned &SrcReg,
case X86::SUB8rm:
SrcReg = MI.getOperand(1).getReg();
SrcReg2 = 0;
- CmpMask = ~0;
+ CmpMask = 0;
CmpValue = 0;
return true;
case X86::SUB64rr:
@@ -6040,7 +6812,7 @@ bool X86InstrInfo::analyzeCompare(const MachineInstr &MI, unsigned &SrcReg,
case X86::SUB8rr:
SrcReg = MI.getOperand(1).getReg();
SrcReg2 = MI.getOperand(2).getReg();
- CmpMask = ~0;
+ CmpMask = 0;
CmpValue = 0;
return true;
case X86::SUB64ri32:
@@ -6050,12 +6822,14 @@ bool X86InstrInfo::analyzeCompare(const MachineInstr &MI, unsigned &SrcReg,
case X86::SUB16ri:
case X86::SUB16ri8:
case X86::SUB8ri:
- if (!MI.getOperand(2).isImm())
- return false;
SrcReg = MI.getOperand(1).getReg();
SrcReg2 = 0;
- CmpMask = ~0;
- CmpValue = MI.getOperand(2).getImm();
+ if (MI.getOperand(2).isImm()) {
+ CmpMask = ~0;
+ CmpValue = MI.getOperand(2).getImm();
+ } else {
+ CmpMask = CmpValue = 0;
+ }
return true;
case X86::CMP64rr:
case X86::CMP32rr:
@@ -6063,7 +6837,7 @@ bool X86InstrInfo::analyzeCompare(const MachineInstr &MI, unsigned &SrcReg,
case X86::CMP8rr:
SrcReg = MI.getOperand(0).getReg();
SrcReg2 = MI.getOperand(1).getReg();
- CmpMask = ~0;
+ CmpMask = 0;
CmpValue = 0;
return true;
case X86::TEST8rr:
@@ -6089,8 +6863,8 @@ bool X86InstrInfo::analyzeCompare(const MachineInstr &MI, unsigned &SrcReg,
/// SrcReg, SrcRegs: register operands for FlagI.
/// ImmValue: immediate for FlagI if it takes an immediate.
inline static bool isRedundantFlagInstr(MachineInstr &FlagI, unsigned SrcReg,
- unsigned SrcReg2, int ImmValue,
- MachineInstr &OI) {
+ unsigned SrcReg2, int ImmMask,
+ int ImmValue, MachineInstr &OI) {
if (((FlagI.getOpcode() == X86::CMP64rr && OI.getOpcode() == X86::SUB64rr) ||
(FlagI.getOpcode() == X86::CMP32rr && OI.getOpcode() == X86::SUB32rr) ||
(FlagI.getOpcode() == X86::CMP16rr && OI.getOpcode() == X86::SUB16rr) ||
@@ -6101,7 +6875,8 @@ inline static bool isRedundantFlagInstr(MachineInstr &FlagI, unsigned SrcReg,
OI.getOperand(2).getReg() == SrcReg)))
return true;
- if (((FlagI.getOpcode() == X86::CMP64ri32 &&
+ if (ImmMask != 0 &&
+ ((FlagI.getOpcode() == X86::CMP64ri32 &&
OI.getOpcode() == X86::SUB64ri32) ||
(FlagI.getOpcode() == X86::CMP64ri8 &&
OI.getOpcode() == X86::SUB64ri8) ||
@@ -6288,7 +7063,7 @@ bool X86InstrInfo::optimizeCompareInstr(MachineInstr &CmpInstr, unsigned SrcReg,
// If we are comparing against zero, check whether we can use MI to update
// EFLAGS. If MI is not in the same BB as CmpInstr, do not optimize.
- bool IsCmpZero = (SrcReg2 == 0 && CmpValue == 0);
+ bool IsCmpZero = (CmpMask != 0 && CmpValue == 0);
if (IsCmpZero && MI->getParent() != CmpInstr.getParent())
return false;
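
After this change CmpMask doubles as a validity flag for CmpValue: analyzeCompare reports CmpMask == 0 instead of failing outright when a compare's operand is not an immediate, and the consumers only trust CmpValue when the mask is nonzero. A minimal model of the new contract (the struct and names are invented for illustration):

#include <cassert>

struct CmpInfo {
  int Mask;  // ~0 when Value carries a real immediate, 0 otherwise
  int Value; // meaningful only when Mask != 0
};

bool isCmpAgainstZero(const CmpInfo &C) { return C.Mask != 0 && C.Value == 0; }

int main() {
  assert(isCmpAgainstZero({~0, 0}));  // cmp $0, %reg
  assert(!isCmpAgainstZero({0, 0}));  // symbolic operand: no known immediate
  assert(!isCmpAgainstZero({~0, 5})); // cmp $5, %reg
}
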
@@ -6338,8 +7113,8 @@ bool X86InstrInfo::optimizeCompareInstr(MachineInstr &CmpInstr, unsigned SrcReg,
for (; RI != RE; ++RI) {
MachineInstr &Instr = *RI;
// Check whether CmpInstr can be made redundant by the current instruction.
- if (!IsCmpZero &&
- isRedundantFlagInstr(CmpInstr, SrcReg, SrcReg2, CmpValue, Instr)) {
+ if (!IsCmpZero && isRedundantFlagInstr(CmpInstr, SrcReg, SrcReg2, CmpMask,
+ CmpValue, Instr)) {
Sub = &Instr;
break;
}
@@ -6764,14 +7539,33 @@ bool X86InstrInfo::expandPostRAPseudo(MachineInstr &MI) const {
assert(HasAVX && "AVX not supported");
return Expand2AddrUndef(MIB, get(X86::VXORPSYrr));
case X86::AVX512_128_SET0:
- return Expand2AddrUndef(MIB, get(X86::VPXORDZ128rr));
- case X86::AVX512_256_SET0:
- return Expand2AddrUndef(MIB, get(X86::VPXORDZ256rr));
+ case X86::AVX512_FsFLD0SS:
+ case X86::AVX512_FsFLD0SD: {
+ bool HasVLX = Subtarget.hasVLX();
+ unsigned SrcReg = MIB->getOperand(0).getReg();
+ const TargetRegisterInfo *TRI = &getRegisterInfo();
+ if (HasVLX || TRI->getEncodingValue(SrcReg) < 16)
+ return Expand2AddrUndef(MIB,
+ get(HasVLX ? X86::VPXORDZ128rr : X86::VXORPSrr));
+ // Extended register without VLX. Use a larger XOR.
+ SrcReg = TRI->getMatchingSuperReg(SrcReg, X86::sub_xmm, &X86::VR512RegClass);
+ MIB->getOperand(0).setReg(SrcReg);
+ return Expand2AddrUndef(MIB, get(X86::VPXORDZrr));
+ }
+ case X86::AVX512_256_SET0: {
+ bool HasVLX = Subtarget.hasVLX();
+ unsigned SrcReg = MIB->getOperand(0).getReg();
+ const TargetRegisterInfo *TRI = &getRegisterInfo();
+ if (HasVLX || TRI->getEncodingValue(SrcReg) < 16)
+ return Expand2AddrUndef(MIB,
+ get(HasVLX ? X86::VPXORDZ256rr : X86::VXORPSYrr));
+ // Extended register without VLX. Use a larger XOR.
+ SrcReg = TRI->getMatchingSuperReg(SrcReg, X86::sub_ymm, &X86::VR512RegClass);
+ MIB->getOperand(0).setReg(SrcReg);
+ return Expand2AddrUndef(MIB, get(X86::VPXORDZrr));
+ }
case X86::AVX512_512_SET0:
return Expand2AddrUndef(MIB, get(X86::VPXORDZrr));
- case X86::AVX512_FsFLD0SS:
- case X86::AVX512_FsFLD0SD:
- return Expand2AddrUndef(MIB, get(X86::VXORPSZ128rr));
case X86::V_SETALLONES:
return Expand2AddrUndef(MIB, get(HasAVX ? X86::VPCMPEQDrr : X86::PCMPEQDrr));
case X86::AVX2_SETALLONES:
@@ -6838,11 +7632,9 @@ bool X86InstrInfo::expandPostRAPseudo(MachineInstr &MI) const {
// registers, since it is not usable as a write mask.
// FIXME: A more advanced approach would be to choose the best input mask
// register based on context.
- case X86::KSET0B:
case X86::KSET0W: return Expand2AddrKreg(MIB, get(X86::KXORWrr), X86::K0);
case X86::KSET0D: return Expand2AddrKreg(MIB, get(X86::KXORDrr), X86::K0);
case X86::KSET0Q: return Expand2AddrKreg(MIB, get(X86::KXORQrr), X86::K0);
- case X86::KSET1B:
case X86::KSET1W: return Expand2AddrKreg(MIB, get(X86::KXNORWrr), X86::K0);
case X86::KSET1D: return Expand2AddrKreg(MIB, get(X86::KXNORDrr), X86::K0);
case X86::KSET1Q: return Expand2AddrKreg(MIB, get(X86::KXNORQrr), X86::K0);
@@ -6860,7 +7652,7 @@ static void addOperands(MachineInstrBuilder &MIB, ArrayRef<MachineOperand> MOs,
if (NumAddrOps < 4) {
 // FrameIndex only - add an immediate offset (whether it's zero or not).
for (unsigned i = 0; i != NumAddrOps; ++i)
- MIB.addOperand(MOs[i]);
+ MIB.add(MOs[i]);
addOffset(MIB, PtrOffset);
} else {
// General Memory Addressing - we need to add any offset to an existing
@@ -6871,7 +7663,7 @@ static void addOperands(MachineInstrBuilder &MIB, ArrayRef<MachineOperand> MOs,
if (i == 3 && PtrOffset != 0) {
MIB.addDisp(MO, PtrOffset);
} else {
- MIB.addOperand(MO);
+ MIB.add(MO);
}
}
}
@@ -6893,11 +7685,11 @@ static MachineInstr *FuseTwoAddrInst(MachineFunction &MF, unsigned Opcode,
unsigned NumOps = MI.getDesc().getNumOperands() - 2;
for (unsigned i = 0; i != NumOps; ++i) {
MachineOperand &MO = MI.getOperand(i + 2);
- MIB.addOperand(MO);
+ MIB.add(MO);
}
for (unsigned i = NumOps + 2, e = MI.getNumOperands(); i != e; ++i) {
MachineOperand &MO = MI.getOperand(i);
- MIB.addOperand(MO);
+ MIB.add(MO);
}
MachineBasicBlock *MBB = InsertPt->getParent();
@@ -6922,7 +7714,7 @@ static MachineInstr *FuseInst(MachineFunction &MF, unsigned Opcode,
assert(MO.isReg() && "Expected to fold into reg operand!");
addOperands(MIB, MOs, PtrOffset);
} else {
- MIB.addOperand(MO);
+ MIB.add(MO);
}
}
@@ -7226,7 +8018,7 @@ static bool hasPartialRegUpdate(unsigned Opcode) {
return false;
}
-/// Inform the ExeDepsFix pass how many idle
+/// Inform the ExecutionDepsFix pass how many idle
/// instructions we would like before a partial register update.
unsigned X86InstrInfo::getPartialRegUpdateClearance(
const MachineInstr &MI, unsigned OpNum,
@@ -7344,11 +8136,15 @@ static bool hasUndefRegUpdate(unsigned Opcode) {
case X86::VCVTUSI642SDZrrb_Int:
case X86::VCVTUSI642SDZrm_Int:
case X86::VCVTSD2SSZrr:
- case X86::VCVTSD2SSZrrb:
+ case X86::VCVTSD2SSZrr_Int:
+ case X86::VCVTSD2SSZrrb_Int:
case X86::VCVTSD2SSZrm:
+ case X86::VCVTSD2SSZrm_Int:
case X86::VCVTSS2SDZrr:
- case X86::VCVTSS2SDZrrb:
+ case X86::VCVTSS2SDZrr_Int:
+ case X86::VCVTSS2SDZrrb_Int:
case X86::VCVTSS2SDZrm:
+ case X86::VCVTSS2SDZrm_Int:
case X86::VRNDSCALESDr:
case X86::VRNDSCALESDrb:
case X86::VRNDSCALESDm:
@@ -7375,8 +8171,8 @@ static bool hasUndefRegUpdate(unsigned Opcode) {
return false;
}
-/// Inform the ExeDepsFix pass how many idle instructions we would like before
-/// certain undef register reads.
+/// Inform the ExecutionDepsFix pass how many idle instructions we would like
+/// before certain undef register reads.
///
/// This catches the VCVTSI2SD family of instructions:
///
@@ -7522,6 +8318,12 @@ static bool isNonFoldablePartialRegisterLoad(const MachineInstr &LoadMI,
case X86::MINSSrr_Int: case X86::VMINSSrr_Int: case X86::VMINSSZrr_Int:
case X86::MULSSrr_Int: case X86::VMULSSrr_Int: case X86::VMULSSZrr_Int:
case X86::SUBSSrr_Int: case X86::VSUBSSrr_Int: case X86::VSUBSSZrr_Int:
+ case X86::VADDSSZrr_Intk: case X86::VADDSSZrr_Intkz:
+ case X86::VDIVSSZrr_Intk: case X86::VDIVSSZrr_Intkz:
+ case X86::VMAXSSZrr_Intk: case X86::VMAXSSZrr_Intkz:
+ case X86::VMINSSZrr_Intk: case X86::VMINSSZrr_Intkz:
+ case X86::VMULSSZrr_Intk: case X86::VMULSSZrr_Intkz:
+ case X86::VSUBSSZrr_Intk: case X86::VSUBSSZrr_Intkz:
case X86::VFMADDSS4rr_Int: case X86::VFNMADDSS4rr_Int:
case X86::VFMSUBSS4rr_Int: case X86::VFNMSUBSS4rr_Int:
case X86::VFMADD132SSr_Int: case X86::VFNMADD132SSr_Int:
@@ -7536,6 +8338,18 @@ static bool isNonFoldablePartialRegisterLoad(const MachineInstr &LoadMI,
case X86::VFMSUB132SSZr_Int: case X86::VFNMSUB132SSZr_Int:
case X86::VFMSUB213SSZr_Int: case X86::VFNMSUB213SSZr_Int:
case X86::VFMSUB231SSZr_Int: case X86::VFNMSUB231SSZr_Int:
+ case X86::VFMADD132SSZr_Intk: case X86::VFNMADD132SSZr_Intk:
+ case X86::VFMADD213SSZr_Intk: case X86::VFNMADD213SSZr_Intk:
+ case X86::VFMADD231SSZr_Intk: case X86::VFNMADD231SSZr_Intk:
+ case X86::VFMSUB132SSZr_Intk: case X86::VFNMSUB132SSZr_Intk:
+ case X86::VFMSUB213SSZr_Intk: case X86::VFNMSUB213SSZr_Intk:
+ case X86::VFMSUB231SSZr_Intk: case X86::VFNMSUB231SSZr_Intk:
+ case X86::VFMADD132SSZr_Intkz: case X86::VFNMADD132SSZr_Intkz:
+ case X86::VFMADD213SSZr_Intkz: case X86::VFNMADD213SSZr_Intkz:
+ case X86::VFMADD231SSZr_Intkz: case X86::VFNMADD231SSZr_Intkz:
+ case X86::VFMSUB132SSZr_Intkz: case X86::VFNMSUB132SSZr_Intkz:
+ case X86::VFMSUB213SSZr_Intkz: case X86::VFNMSUB213SSZr_Intkz:
+ case X86::VFMSUB231SSZr_Intkz: case X86::VFNMSUB231SSZr_Intkz:
return false;
default:
return true;
@@ -7555,6 +8369,12 @@ static bool isNonFoldablePartialRegisterLoad(const MachineInstr &LoadMI,
case X86::MINSDrr_Int: case X86::VMINSDrr_Int: case X86::VMINSDZrr_Int:
case X86::MULSDrr_Int: case X86::VMULSDrr_Int: case X86::VMULSDZrr_Int:
case X86::SUBSDrr_Int: case X86::VSUBSDrr_Int: case X86::VSUBSDZrr_Int:
+ case X86::VADDSDZrr_Intk: case X86::VADDSDZrr_Intkz:
+ case X86::VDIVSDZrr_Intk: case X86::VDIVSDZrr_Intkz:
+ case X86::VMAXSDZrr_Intk: case X86::VMAXSDZrr_Intkz:
+ case X86::VMINSDZrr_Intk: case X86::VMINSDZrr_Intkz:
+ case X86::VMULSDZrr_Intk: case X86::VMULSDZrr_Intkz:
+ case X86::VSUBSDZrr_Intk: case X86::VSUBSDZrr_Intkz:
case X86::VFMADDSD4rr_Int: case X86::VFNMADDSD4rr_Int:
case X86::VFMSUBSD4rr_Int: case X86::VFNMSUBSD4rr_Int:
case X86::VFMADD132SDr_Int: case X86::VFNMADD132SDr_Int:
@@ -7569,6 +8389,18 @@ static bool isNonFoldablePartialRegisterLoad(const MachineInstr &LoadMI,
case X86::VFMSUB132SDZr_Int: case X86::VFNMSUB132SDZr_Int:
case X86::VFMSUB213SDZr_Int: case X86::VFNMSUB213SDZr_Int:
case X86::VFMSUB231SDZr_Int: case X86::VFNMSUB231SDZr_Int:
+ case X86::VFMADD132SDZr_Intk: case X86::VFNMADD132SDZr_Intk:
+ case X86::VFMADD213SDZr_Intk: case X86::VFNMADD213SDZr_Intk:
+ case X86::VFMADD231SDZr_Intk: case X86::VFNMADD231SDZr_Intk:
+ case X86::VFMSUB132SDZr_Intk: case X86::VFNMSUB132SDZr_Intk:
+ case X86::VFMSUB213SDZr_Intk: case X86::VFNMSUB213SDZr_Intk:
+ case X86::VFMSUB231SDZr_Intk: case X86::VFNMSUB231SDZr_Intk:
+ case X86::VFMADD132SDZr_Intkz: case X86::VFNMADD132SDZr_Intkz:
+ case X86::VFMADD213SDZr_Intkz: case X86::VFNMADD213SDZr_Intkz:
+ case X86::VFMADD231SDZr_Intkz: case X86::VFNMADD231SDZr_Intkz:
+ case X86::VFMSUB132SDZr_Intkz: case X86::VFNMSUB132SDZr_Intkz:
+ case X86::VFMSUB213SDZr_Intkz: case X86::VFNMSUB213SDZr_Intkz:
+ case X86::VFMSUB231SDZr_Intkz: case X86::VFNMSUB231SDZr_Intkz:
return false;
default:
return true;
@@ -7800,11 +8632,11 @@ bool X86InstrInfo::unfoldMemoryOperand(
if (FoldedStore)
MIB.addReg(Reg, RegState::Define);
for (MachineOperand &BeforeOp : BeforeOps)
- MIB.addOperand(BeforeOp);
+ MIB.add(BeforeOp);
if (FoldedLoad)
MIB.addReg(Reg);
for (MachineOperand &AfterOp : AfterOps)
- MIB.addOperand(AfterOp);
+ MIB.add(AfterOp);
for (MachineOperand &ImpOp : ImpOps) {
MIB.addReg(ImpOp.getReg(),
getDefRegState(ImpOp.isDef()) |
@@ -8143,28 +8975,29 @@ X86InstrInfo::areLoadsFromSameBasePtr(SDNode *Load1, SDNode *Load2,
break;
}
- // Check if chain operands and base addresses match.
- if (Load1->getOperand(0) != Load2->getOperand(0) ||
- Load1->getOperand(5) != Load2->getOperand(5))
+ // Lambda to check whether both loads have the same value at a given operand
+ // index.
+ auto HasSameOp = [&](int I) {
+ return Load1->getOperand(I) == Load2->getOperand(I);
+ };
+
+ // All operands except the displacement should match.
+ if (!HasSameOp(X86::AddrBaseReg) || !HasSameOp(X86::AddrScaleAmt) ||
+ !HasSameOp(X86::AddrIndexReg) || !HasSameOp(X86::AddrSegmentReg))
return false;
- // Segment operands should match as well.
- if (Load1->getOperand(4) != Load2->getOperand(4))
+
+ // The chain operand (operand 5) must match as well.
+ if (!HasSameOp(5))
return false;
- // Scale should be 1, Index should be Reg0.
- if (Load1->getOperand(1) == Load2->getOperand(1) &&
- Load1->getOperand(2) == Load2->getOperand(2)) {
- if (cast<ConstantSDNode>(Load1->getOperand(1))->getZExtValue() != 1)
- return false;
- // Now let's examine the displacements.
- if (isa<ConstantSDNode>(Load1->getOperand(3)) &&
- isa<ConstantSDNode>(Load2->getOperand(3))) {
- Offset1 = cast<ConstantSDNode>(Load1->getOperand(3))->getSExtValue();
- Offset2 = cast<ConstantSDNode>(Load2->getOperand(3))->getSExtValue();
- return true;
- }
- }
- return false;
+ // Now check whether the displacements are constants.
+ auto Disp1 = dyn_cast<ConstantSDNode>(Load1->getOperand(X86::AddrDisp));
+ auto Disp2 = dyn_cast<ConstantSDNode>(Load2->getOperand(X86::AddrDisp));
+ if (!Disp1 || !Disp2)
+ return false;
+
+ Offset1 = Disp1->getSExtValue();
+ Offset2 = Disp2->getSExtValue();
+ return true;
}
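
Note that the rewrite also relaxes the old requirement that the scale be exactly 1; the scales now merely have to match. The slots being compared are the standard five-part x86 address plus the SDNode chain; a stand-in restatement with plain arrays instead of SDNode operands:

#include <cstdint>

// Slots: [0]=BaseReg [1]=ScaleAmt [2]=IndexReg [3]=Disp [4]=SegmentReg,
// with a chain identifier carried in slot 5 for this sketch.
bool sameBaseDifferentDisp(const int64_t (&A)[6], const int64_t (&B)[6]) {
  return A[0] == B[0] && A[1] == B[1] && A[2] == B[2] && A[4] == B[4] &&
         A[5] == B[5]; // slot 3, the displacement, may differ
}
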
bool X86InstrInfo::shouldScheduleLoadsNear(SDNode *Load1, SDNode *Load2,
@@ -8215,165 +9048,6 @@ bool X86InstrInfo::shouldScheduleLoadsNear(SDNode *Load1, SDNode *Load2,
return true;
}
-bool X86InstrInfo::shouldScheduleAdjacent(const MachineInstr &First,
- const MachineInstr &Second) const {
- // Check if this processor supports macro-fusion. Since this is a minor
- // heuristic, we haven't specifically reserved a feature. hasAVX is a decent
- // proxy for SandyBridge+.
- if (!Subtarget.hasAVX())
- return false;
-
- enum {
- FuseTest,
- FuseCmp,
- FuseInc
- } FuseKind;
-
- switch (Second.getOpcode()) {
- default:
- return false;
- case X86::JE_1:
- case X86::JNE_1:
- case X86::JL_1:
- case X86::JLE_1:
- case X86::JG_1:
- case X86::JGE_1:
- FuseKind = FuseInc;
- break;
- case X86::JB_1:
- case X86::JBE_1:
- case X86::JA_1:
- case X86::JAE_1:
- FuseKind = FuseCmp;
- break;
- case X86::JS_1:
- case X86::JNS_1:
- case X86::JP_1:
- case X86::JNP_1:
- case X86::JO_1:
- case X86::JNO_1:
- FuseKind = FuseTest;
- break;
- }
- switch (First.getOpcode()) {
- default:
- return false;
- case X86::TEST8rr:
- case X86::TEST16rr:
- case X86::TEST32rr:
- case X86::TEST64rr:
- case X86::TEST8ri:
- case X86::TEST16ri:
- case X86::TEST32ri:
- case X86::TEST32i32:
- case X86::TEST64i32:
- case X86::TEST64ri32:
- case X86::TEST8rm:
- case X86::TEST16rm:
- case X86::TEST32rm:
- case X86::TEST64rm:
- case X86::TEST8ri_NOREX:
- case X86::AND16i16:
- case X86::AND16ri:
- case X86::AND16ri8:
- case X86::AND16rm:
- case X86::AND16rr:
- case X86::AND32i32:
- case X86::AND32ri:
- case X86::AND32ri8:
- case X86::AND32rm:
- case X86::AND32rr:
- case X86::AND64i32:
- case X86::AND64ri32:
- case X86::AND64ri8:
- case X86::AND64rm:
- case X86::AND64rr:
- case X86::AND8i8:
- case X86::AND8ri:
- case X86::AND8rm:
- case X86::AND8rr:
- return true;
- case X86::CMP16i16:
- case X86::CMP16ri:
- case X86::CMP16ri8:
- case X86::CMP16rm:
- case X86::CMP16rr:
- case X86::CMP32i32:
- case X86::CMP32ri:
- case X86::CMP32ri8:
- case X86::CMP32rm:
- case X86::CMP32rr:
- case X86::CMP64i32:
- case X86::CMP64ri32:
- case X86::CMP64ri8:
- case X86::CMP64rm:
- case X86::CMP64rr:
- case X86::CMP8i8:
- case X86::CMP8ri:
- case X86::CMP8rm:
- case X86::CMP8rr:
- case X86::ADD16i16:
- case X86::ADD16ri:
- case X86::ADD16ri8:
- case X86::ADD16ri8_DB:
- case X86::ADD16ri_DB:
- case X86::ADD16rm:
- case X86::ADD16rr:
- case X86::ADD16rr_DB:
- case X86::ADD32i32:
- case X86::ADD32ri:
- case X86::ADD32ri8:
- case X86::ADD32ri8_DB:
- case X86::ADD32ri_DB:
- case X86::ADD32rm:
- case X86::ADD32rr:
- case X86::ADD32rr_DB:
- case X86::ADD64i32:
- case X86::ADD64ri32:
- case X86::ADD64ri32_DB:
- case X86::ADD64ri8:
- case X86::ADD64ri8_DB:
- case X86::ADD64rm:
- case X86::ADD64rr:
- case X86::ADD64rr_DB:
- case X86::ADD8i8:
- case X86::ADD8mi:
- case X86::ADD8mr:
- case X86::ADD8ri:
- case X86::ADD8rm:
- case X86::ADD8rr:
- case X86::SUB16i16:
- case X86::SUB16ri:
- case X86::SUB16ri8:
- case X86::SUB16rm:
- case X86::SUB16rr:
- case X86::SUB32i32:
- case X86::SUB32ri:
- case X86::SUB32ri8:
- case X86::SUB32rm:
- case X86::SUB32rr:
- case X86::SUB64i32:
- case X86::SUB64ri32:
- case X86::SUB64ri8:
- case X86::SUB64rm:
- case X86::SUB64rr:
- case X86::SUB8i8:
- case X86::SUB8ri:
- case X86::SUB8rm:
- case X86::SUB8rr:
- return FuseKind == FuseCmp || FuseKind == FuseInc;
- case X86::INC16r:
- case X86::INC32r:
- case X86::INC64r:
- case X86::INC8r:
- case X86::DEC16r:
- case X86::DEC32r:
- case X86::DEC64r:
- case X86::DEC8r:
- return FuseKind == FuseInc;
- }
-}
-
bool X86InstrInfo::
reverseBranchCondition(SmallVectorImpl<MachineOperand> &Cond) const {
assert(Cond.size() == 1 && "Invalid X86 branch condition!");
@@ -8424,6 +9098,7 @@ static const uint16_t ReplaceableInstrs[][3] = {
{ X86::MOVUPSmr, X86::MOVUPDmr, X86::MOVDQUmr },
{ X86::MOVUPSrm, X86::MOVUPDrm, X86::MOVDQUrm },
{ X86::MOVLPSmr, X86::MOVLPDmr, X86::MOVPQI2QImr },
+ { X86::MOVSDmr, X86::MOVSDmr, X86::MOVPQI2QImr },
{ X86::MOVSSmr, X86::MOVSSmr, X86::MOVPDI2DImr },
{ X86::MOVSDrm, X86::MOVSDrm, X86::MOVQI2PQIrm },
{ X86::MOVSSrm, X86::MOVSSrm, X86::MOVDI2PDIrm },
@@ -8443,6 +9118,7 @@ static const uint16_t ReplaceableInstrs[][3] = {
{ X86::VMOVUPSmr, X86::VMOVUPDmr, X86::VMOVDQUmr },
{ X86::VMOVUPSrm, X86::VMOVUPDrm, X86::VMOVDQUrm },
{ X86::VMOVLPSmr, X86::VMOVLPDmr, X86::VMOVPQI2QImr },
+ { X86::VMOVSDmr, X86::VMOVSDmr, X86::VMOVPQI2QImr },
{ X86::VMOVSSmr, X86::VMOVSSmr, X86::VMOVPDI2DImr },
{ X86::VMOVSDrm, X86::VMOVSDrm, X86::VMOVQI2PQIrm },
{ X86::VMOVSSrm, X86::VMOVSSrm, X86::VMOVDI2PDIrm },
@@ -8465,7 +9141,7 @@ static const uint16_t ReplaceableInstrs[][3] = {
// AVX512 support
{ X86::VMOVLPSZ128mr, X86::VMOVLPDZ128mr, X86::VMOVPQI2QIZmr },
{ X86::VMOVNTPSZ128mr, X86::VMOVNTPDZ128mr, X86::VMOVNTDQZ128mr },
- { X86::VMOVNTPSZ128mr, X86::VMOVNTPDZ128mr, X86::VMOVNTDQZ128mr },
+ { X86::VMOVNTPSZ256mr, X86::VMOVNTPDZ256mr, X86::VMOVNTDQZ256mr },
{ X86::VMOVNTPSZmr, X86::VMOVNTPDZmr, X86::VMOVNTDQZmr },
{ X86::VMOVSDZmr, X86::VMOVSDZmr, X86::VMOVPQI2QIZmr },
{ X86::VMOVSSZmr, X86::VMOVSSZmr, X86::VMOVPDI2DIZmr },
@@ -8493,10 +9169,6 @@ static const uint16_t ReplaceableInstrsAVX2[][3] = {
{ X86::VORPSYrr, X86::VORPDYrr, X86::VPORYrr },
{ X86::VXORPSYrm, X86::VXORPDYrm, X86::VPXORYrm },
{ X86::VXORPSYrr, X86::VXORPDYrr, X86::VPXORYrr },
- { X86::VEXTRACTF128mr, X86::VEXTRACTF128mr, X86::VEXTRACTI128mr },
- { X86::VEXTRACTF128rr, X86::VEXTRACTF128rr, X86::VEXTRACTI128rr },
- { X86::VINSERTF128rm, X86::VINSERTF128rm, X86::VINSERTI128rm },
- { X86::VINSERTF128rr, X86::VINSERTF128rr, X86::VINSERTI128rr },
{ X86::VPERM2F128rm, X86::VPERM2F128rm, X86::VPERM2I128rm },
{ X86::VPERM2F128rr, X86::VPERM2F128rr, X86::VPERM2I128rr },
{ X86::VBROADCASTSSrm, X86::VBROADCASTSSrm, X86::VPBROADCASTDrm},
@@ -8508,6 +9180,14 @@ static const uint16_t ReplaceableInstrsAVX2[][3] = {
{ X86::VBROADCASTF128, X86::VBROADCASTF128, X86::VBROADCASTI128 },
};
+static const uint16_t ReplaceableInstrsAVX2InsertExtract[][3] = {
+ //PackedSingle PackedDouble PackedInt
+ { X86::VEXTRACTF128mr, X86::VEXTRACTF128mr, X86::VEXTRACTI128mr },
+ { X86::VEXTRACTF128rr, X86::VEXTRACTF128rr, X86::VEXTRACTI128rr },
+ { X86::VINSERTF128rm, X86::VINSERTF128rm, X86::VINSERTI128rm },
+ { X86::VINSERTF128rr, X86::VINSERTF128rr, X86::VINSERTI128rr },
+};
+
static const uint16_t ReplaceableInstrsAVX512[][4] = {
// Two integer columns for 64-bit and 32-bit elements.
//PackedSingle PackedDouble PackedInt PackedInt
@@ -8769,16 +9449,25 @@ X86InstrInfo::getExecutionDomain(const MachineInstr &MI) const {
validDomains = 0xe;
} else if (lookup(opcode, domain, ReplaceableInstrsAVX2)) {
validDomains = Subtarget.hasAVX2() ? 0xe : 0x6;
+ } else if (lookup(opcode, domain, ReplaceableInstrsAVX2InsertExtract)) {
+ // Insert/extract instructions should only affect the domain if AVX2
+ // is enabled.
+ if (!Subtarget.hasAVX2())
+ return std::make_pair(0, 0);
+ validDomains = 0xe;
} else if (lookupAVX512(opcode, domain, ReplaceableInstrsAVX512)) {
validDomains = 0xe;
- } else if (lookupAVX512(opcode, domain, ReplaceableInstrsAVX512DQ)) {
- validDomains = Subtarget.hasDQI() ? 0xe : 0x8;
- } else if (const uint16_t *table = lookupAVX512(opcode, domain,
+ } else if (Subtarget.hasDQI() && lookupAVX512(opcode, domain,
+ ReplaceableInstrsAVX512DQ)) {
+ validDomains = 0xe;
+ } else if (Subtarget.hasDQI()) {
+ if (const uint16_t *table = lookupAVX512(opcode, domain,
ReplaceableInstrsAVX512DQMasked)) {
- if (domain == 1 || (domain == 3 && table[3] == opcode))
- validDomains = Subtarget.hasDQI() ? 0xa : 0x8;
- else
- validDomains = Subtarget.hasDQI() ? 0xc : 0x8;
+ if (domain == 1 || (domain == 3 && table[3] == opcode))
+ validDomains = 0xa;
+ else
+ validDomains = 0xc;
+ }
}
}
return std::make_pair(domain, validDomains);
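
The hex constants threaded through this function are 4-bit reachability masks over the execution domains (bit 1 = PackedSingle, bit 2 = PackedDouble, bit 3 = PackedInt; bit 0, the generic domain, stays clear). A compile-time check of the encodings used above:

#include <initializer_list>

constexpr unsigned maskFor(std::initializer_list<unsigned> Domains) {
  unsigned M = 0;
  for (unsigned D : Domains)
    M |= 1u << D;
  return M;
}

static_assert(maskFor({1, 2, 3}) == 0xe, "PS|PD|PI");
static_assert(maskFor({1, 2}) == 0x6, "PS|PD: AVX1, no 256-bit int ops");
static_assert(maskFor({1, 3}) == 0xa, "PS|PI");
static_assert(maskFor({2, 3}) == 0xc, "PD|PI");
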
@@ -8794,6 +9483,11 @@ void X86InstrInfo::setExecutionDomain(MachineInstr &MI, unsigned Domain) const {
"256-bit vector operations only available in AVX2");
table = lookup(MI.getOpcode(), dom, ReplaceableInstrsAVX2);
}
+ if (!table) { // try the other table
+ assert(Subtarget.hasAVX2() &&
+ "256-bit insert/extract only available in AVX2");
+ table = lookup(MI.getOpcode(), dom, ReplaceableInstrsAVX2InsertExtract);
+ }
if (!table) { // try the AVX512 table
assert(Subtarget.hasAVX512() && "Requires AVX-512");
table = lookupAVX512(MI.getOpcode(), dom, ReplaceableInstrsAVX512);
@@ -9457,28 +10151,6 @@ X86InstrInfo::getSerializableDirectMachineOperandTargetFlags() const {
return makeArrayRef(TargetFlags);
}
-bool X86InstrInfo::isTailCall(const MachineInstr &Inst) const {
- switch (Inst.getOpcode()) {
- case X86::TCRETURNdi:
- case X86::TCRETURNmi:
- case X86::TCRETURNri:
- case X86::TCRETURNdi64:
- case X86::TCRETURNmi64:
- case X86::TCRETURNri64:
- case X86::TAILJMPd:
- case X86::TAILJMPm:
- case X86::TAILJMPr:
- case X86::TAILJMPd64:
- case X86::TAILJMPm64:
- case X86::TAILJMPr64:
- case X86::TAILJMPm64_REX:
- case X86::TAILJMPr64_REX:
- return true;
- default:
- return false;
- }
-}
-
namespace {
/// Create Global Base Reg pass. This initializes the PIC
/// global base register for x86-32.
@@ -9665,3 +10337,124 @@ namespace {
char LDTLSCleanup::ID = 0;
FunctionPass*
llvm::createCleanupLocalDynamicTLSPass() { return new LDTLSCleanup(); }
+
+unsigned X86InstrInfo::getOutliningBenefit(size_t SequenceSize,
+ size_t Occurrences,
+ bool CanBeTailCall) const {
+ unsigned NotOutlinedSize = SequenceSize * Occurrences;
+ unsigned OutlinedSize;
+
+ // Is it a tail call?
+ if (CanBeTailCall) {
+ // If yes, we don't have to include a return instruction; it's already in
+ // our sequence. So we have one occurrence of the sequence + #Occurrences
+ // calls.
+ OutlinedSize = SequenceSize + Occurrences;
+ } else {
+ // If not, add one for the return instruction.
+ OutlinedSize = (SequenceSize + 1) + Occurrences;
+ }
+
+ // Return the number of instructions saved by outlining this sequence.
+ return NotOutlinedSize > OutlinedSize ? NotOutlinedSize - OutlinedSize : 0;
+}
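
The accounting is in instruction counts, not bytes. The same math as a standalone function with a worked example:

#include <cassert>
#include <cstddef>

unsigned outliningBenefit(size_t SequenceSize, size_t Occurrences,
                          bool CanBeTailCall) {
  unsigned NotOutlined = SequenceSize * Occurrences;
  // One copy of the sequence (plus a return if it isn't a tail call), plus
  // one call per occurrence.
  unsigned Outlined =
      (CanBeTailCall ? SequenceSize : SequenceSize + 1) + Occurrences;
  return NotOutlined > Outlined ? NotOutlined - Outlined : 0;
}

int main() {
  assert(outliningBenefit(10, 3, false) == 16); // 30 before vs 11 + 3 calls
  assert(outliningBenefit(2, 2, false) == 0);   // 4 vs 5: not profitable
}
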
+
+bool X86InstrInfo::isFunctionSafeToOutlineFrom(MachineFunction &MF) const {
+ return MF.getFunction()->hasFnAttribute(Attribute::NoRedZone);
+}
+
+X86GenInstrInfo::MachineOutlinerInstrType
+X86InstrInfo::getOutliningType(MachineInstr &MI) const {
+
+ // Don't allow debug values to impact outlining type.
+ if (MI.isDebugValue() || MI.isIndirectDebugValue())
+ return MachineOutlinerInstrType::Invisible;
+
+ // Is this a tail call? If yes, we can outline as a tail call.
+ if (isTailCall(MI))
+ return MachineOutlinerInstrType::Legal;
+
+ // Is this the terminator of a basic block?
+ if (MI.isTerminator() || MI.isReturn()) {
+
+ // Does its parent have any successors in its MachineFunction?
+ if (MI.getParent()->succ_empty())
+ return MachineOutlinerInstrType::Legal;
+
+ // It does, so we can't tail call it.
+ return MachineOutlinerInstrType::Illegal;
+ }
+
+ // Don't outline anything that modifies or reads from the stack pointer.
+ //
+ // FIXME: There are instructions which are being manually built without
+ // explicit uses/defs so we also have to check the MCInstrDesc. We should be
+ // able to remove the extra checks once those are fixed up. For example,
+ // sometimes we might get something like %RAX<def> = POP64r 1. This won't be
+ // caught by modifiesRegister or readsRegister even though the instruction
+ // really ought to be formed so that modifiesRegister/readsRegister would
+ // catch it.
+ if (MI.modifiesRegister(X86::RSP, &RI) || MI.readsRegister(X86::RSP, &RI) ||
+ MI.getDesc().hasImplicitUseOfPhysReg(X86::RSP) ||
+ MI.getDesc().hasImplicitDefOfPhysReg(X86::RSP))
+ return MachineOutlinerInstrType::Illegal;
+
+ // Outlined calls change the instruction pointer, so don't read from it.
+ if (MI.readsRegister(X86::RIP, &RI) ||
+ MI.getDesc().hasImplicitUseOfPhysReg(X86::RIP) ||
+ MI.getDesc().hasImplicitDefOfPhysReg(X86::RIP))
+ return MachineOutlinerInstrType::Illegal;
+
+ // Positions can't safely be outlined.
+ if (MI.isPosition())
+ return MachineOutlinerInstrType::Illegal;
+
+ // Make sure none of the operands of this instruction do anything tricky.
+ for (const MachineOperand &MOP : MI.operands())
+ if (MOP.isCPI() || MOP.isJTI() || MOP.isCFIIndex() || MOP.isFI() ||
+ MOP.isTargetIndex())
+ return MachineOutlinerInstrType::Illegal;
+
+ return MachineOutlinerInstrType::Legal;
+}
+
+void X86InstrInfo::insertOutlinerEpilogue(MachineBasicBlock &MBB,
+ MachineFunction &MF,
+ bool IsTailCall) const {
+
+ // If we're a tail call, we already have a return, so don't do anything.
+ if (IsTailCall)
+ return;
+
+ // We're a normal call, so our sequence doesn't have a return instruction.
+ // Add it in.
+ MachineInstr *retq = BuildMI(MF, DebugLoc(), get(X86::RETQ));
+ MBB.insert(MBB.end(), retq);
+}
+
+void X86InstrInfo::insertOutlinerPrologue(MachineBasicBlock &MBB,
+ MachineFunction &MF,
+ bool IsTailCall) const {
+}
+
+MachineBasicBlock::iterator
+X86InstrInfo::insertOutlinedCall(Module &M, MachineBasicBlock &MBB,
+ MachineBasicBlock::iterator &It,
+ MachineFunction &MF,
+ bool IsTailCall) const {
+ // Is it a tail call?
+ if (IsTailCall) {
+ // Yes, just insert a JMP.
+ It = MBB.insert(It,
+ BuildMI(MF, DebugLoc(), get(X86::JMP_1))
+ .addGlobalAddress(M.getNamedValue(MF.getName())));
+ } else {
+ // No, insert a call.
+ It = MBB.insert(It,
+ BuildMI(MF, DebugLoc(), get(X86::CALL64pcrel32))
+ .addGlobalAddress(M.getNamedValue(MF.getName())));
+ }
+
+ return It;
+}
diff --git a/lib/Target/X86/X86InstrInfo.h b/lib/Target/X86/X86InstrInfo.h
index acfdef4da7a3..2fee48570ce1 100644
--- a/lib/Target/X86/X86InstrInfo.h
+++ b/lib/Target/X86/X86InstrInfo.h
@@ -182,6 +182,20 @@ public:
///
const X86RegisterInfo &getRegisterInfo() const { return RI; }
+ /// Returns the stack pointer adjustment that happens inside the frame
+ /// setup..destroy sequence (e.g. by pushes, or inside the callee).
+ int64_t getFrameAdjustment(const MachineInstr &I) const {
+ assert(isFrameInstr(I));
+ return I.getOperand(1).getImm();
+ }
+
+ /// Sets the stack pointer adjustment made inside the frame setup..destroy
+ /// sequence delimited by this instruction.
+ void setFrameAdjustment(MachineInstr &I, int64_t V) const {
+ assert(isFrameInstr(I));
+ I.getOperand(1).setImm(V);
+ }
+
/// getSPAdjust - This returns the stack pointer adjustment made by
/// this instruction. For x86, we need to handle more complex call
/// sequences involving PUSHes.
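Per the accessors added above, the adjustment lives in operand 1 of the frame setup/destroy pseudos (ADJCALLSTACKDOWN/UP on x86). A toy model of that bookkeeping, with a plain operand vector standing in for MachineInstr (illustrative only):

    #include <cassert>
    #include <cstdint>
    #include <vector>

    struct ToyFrameInstr {
      bool IsFrameInstr;               // true for the setup/destroy pseudos
      std::vector<int64_t> Operands;   // operand 1 carries the SP adjustment
    };

    static int64_t getFrameAdjustment(const ToyFrameInstr &I) {
      assert(I.IsFrameInstr && "not a frame setup/destroy pseudo");
      return I.Operands[1];
    }

    static void setFrameAdjustment(ToyFrameInstr &I, int64_t V) {
      assert(I.IsFrameInstr && "not a frame setup/destroy pseudo");
      I.Operands[1] = V;
    }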
@@ -316,6 +330,13 @@ public:
// Branch analysis.
bool isUnpredicatedTerminator(const MachineInstr &MI) const override;
+ bool isUnconditionalTailCall(const MachineInstr &MI) const override;
+ bool canMakeTailCallConditional(SmallVectorImpl<MachineOperand> &Cond,
+ const MachineInstr &TailCall) const override;
+ void replaceBranchWithTailCall(MachineBasicBlock &MBB,
+ SmallVectorImpl<MachineOperand> &Cond,
+ const MachineInstr &TailCall) const override;
+
bool analyzeBranch(MachineBasicBlock &MBB, MachineBasicBlock *&TBB,
MachineBasicBlock *&FBB,
SmallVectorImpl<MachineOperand> &Cond,
@@ -436,9 +457,6 @@ public:
int64_t Offset1, int64_t Offset2,
unsigned NumLoads) const override;
- bool shouldScheduleAdjacent(const MachineInstr &First,
- const MachineInstr &Second) const override;
-
void getNoopForMachoTarget(MCInst &NopInst) const override;
bool
@@ -539,8 +557,28 @@ public:
ArrayRef<std::pair<unsigned, const char *>>
getSerializableDirectMachineOperandTargetFlags() const override;
- bool isTailCall(const MachineInstr &Inst) const override;
+ unsigned getOutliningBenefit(size_t SequenceSize,
+ size_t Occurrences,
+ bool CanBeTailCall) const override;
+
+ bool isFunctionSafeToOutlineFrom(MachineFunction &MF) const override;
+
+ llvm::X86GenInstrInfo::MachineOutlinerInstrType
+ getOutliningType(MachineInstr &MI) const override;
+
+ void insertOutlinerEpilogue(MachineBasicBlock &MBB,
+ MachineFunction &MF,
+ bool IsTailCall) const override;
+
+ void insertOutlinerPrologue(MachineBasicBlock &MBB,
+ MachineFunction &MF,
+ bool IsTailCall) const override;
+ MachineBasicBlock::iterator
+ insertOutlinedCall(Module &M, MachineBasicBlock &MBB,
+ MachineBasicBlock::iterator &It,
+ MachineFunction &MF,
+ bool IsTailCall) const override;
protected:
/// Commutes the operands in the given instruction by changing the operands
/// order and/or changing the instruction's opcode and/or the immediate value
diff --git a/lib/Target/X86/X86InstrInfo.td b/lib/Target/X86/X86InstrInfo.td
index 38036715a25a..e31d2769047b 100644
--- a/lib/Target/X86/X86InstrInfo.td
+++ b/lib/Target/X86/X86InstrInfo.td
@@ -318,6 +318,7 @@ let RenderMethod = "addMemOperands", SuperClasses = [X86MemAsmOperand] in {
def X86Mem128_RC256XOperand : AsmOperandClass { let Name = "Mem128_RC256X"; }
def X86Mem256_RC256XOperand : AsmOperandClass { let Name = "Mem256_RC256X"; }
def X86Mem512_RC256XOperand : AsmOperandClass { let Name = "Mem512_RC256X"; }
+ def X86Mem256_RC512Operand : AsmOperandClass { let Name = "Mem256_RC512"; }
def X86Mem512_RC512Operand : AsmOperandClass { let Name = "Mem512_RC512"; }
}
@@ -374,9 +375,10 @@ def vy256mem : X86VMemOperand<VR256, "printi256mem", X86Mem256_RC256Operand>;
def vx64xmem : X86VMemOperand<VR128X, "printi64mem", X86Mem64_RC128XOperand>;
def vx128xmem : X86VMemOperand<VR128X, "printi128mem", X86Mem128_RC128XOperand>;
def vx256xmem : X86VMemOperand<VR128X, "printi256mem", X86Mem256_RC128XOperand>;
-def vy128xmem : X86VMemOperand<VR256, "printi128mem", X86Mem128_RC256XOperand>;
+def vy128xmem : X86VMemOperand<VR256X, "printi128mem", X86Mem128_RC256XOperand>;
def vy256xmem : X86VMemOperand<VR256X, "printi256mem", X86Mem256_RC256XOperand>;
def vy512mem : X86VMemOperand<VR256X, "printi512mem", X86Mem512_RC256XOperand>;
+def vz256xmem : X86VMemOperand<VR512, "printi256mem", X86Mem256_RC512Operand>;
def vz512mem : X86VMemOperand<VR512, "printi512mem", X86Mem512_RC512Operand>;
// A version of i8mem for use on x86-64 and x32 that uses a NOREX GPR instead
@@ -831,7 +833,6 @@ def HasXSAVEC : Predicate<"Subtarget->hasXSAVEC()">;
def HasXSAVES : Predicate<"Subtarget->hasXSAVES()">;
def HasPCLMUL : Predicate<"Subtarget->hasPCLMUL()">;
def HasFMA : Predicate<"Subtarget->hasFMA()">;
-def UseFMAOnAVX : Predicate<"Subtarget->hasFMA() && !Subtarget->hasAVX512()">;
def HasFMA4 : Predicate<"Subtarget->hasFMA4()">;
def HasXOP : Predicate<"Subtarget->hasXOP()">;
def HasTBM : Predicate<"Subtarget->hasTBM()">;
@@ -848,8 +849,6 @@ def HasVBMI : Predicate<"Subtarget->hasVBMI()">,
def HasIFMA : Predicate<"Subtarget->hasIFMA()">,
AssemblerPredicate<"FeatureIFMA", "AVX-512 IFMA ISA">;
def HasRTM : Predicate<"Subtarget->hasRTM()">;
-def HasHLE : Predicate<"Subtarget->hasHLE()">;
-def HasTSX : Predicate<"Subtarget->hasRTM() || Subtarget->hasHLE()">;
def HasADX : Predicate<"Subtarget->hasADX()">;
def HasSHA : Predicate<"Subtarget->hasSHA()">;
def HasPRFCHW : Predicate<"Subtarget->hasPRFCHW()">;
@@ -857,9 +856,11 @@ def HasRDSEED : Predicate<"Subtarget->hasRDSEED()">;
def HasPrefetchW : Predicate<"Subtarget->hasPRFCHW()">;
def HasLAHFSAHF : Predicate<"Subtarget->hasLAHFSAHF()">;
def HasMWAITX : Predicate<"Subtarget->hasMWAITX()">;
+def HasCLZERO : Predicate<"Subtarget->hasCLZERO()">;
def FPStackf32 : Predicate<"!Subtarget->hasSSE1()">;
def FPStackf64 : Predicate<"!Subtarget->hasSSE2()">;
def HasMPX : Predicate<"Subtarget->hasMPX()">;
+def HasCLFLUSHOPT : Predicate<"Subtarget->hasCLFLUSHOPT()">;
def HasCmpxchg16b: Predicate<"Subtarget->hasCmpxchg16b()">;
def Not64BitMode : Predicate<"!Subtarget->is64Bit()">,
AssemblerPredicate<"!Mode64Bit", "Not 64-bit mode">;
@@ -895,6 +896,7 @@ def FavorMemIndirectCall : Predicate<"!Subtarget->callRegIndirect()">;
def NotSlowIncDec : Predicate<"!Subtarget->slowIncDec()">;
def HasFastMem32 : Predicate<"!Subtarget->isUnalignedMem32Slow()">;
def HasFastLZCNT : Predicate<"Subtarget->hasFastLZCNT()">;
+def HasFastSHLDRotate : Predicate<"Subtarget->hasFastSHLDRotate()">;
def HasMFence : Predicate<"Subtarget->hasMFence()">;
//===----------------------------------------------------------------------===//
@@ -931,6 +933,15 @@ def i32immSExt8 : ImmLeaf<i32, [{ return isInt<8>(Imm); }]>;
def i64immSExt8 : ImmLeaf<i64, [{ return isInt<8>(Imm); }]>;
def i64immSExt32 : ImmLeaf<i64, [{ return isInt<32>(Imm); }]>;
+// FIXME: Ideally we would just replace the above i*immSExt* matchers with
+// relocImm-based matchers, but then FastISel would be unable to use them.
+def i64relocImmSExt8 : PatLeaf<(i64 relocImm), [{
+ return isSExtRelocImm<8>(N);
+}]>;
+def i64relocImmSExt32 : PatLeaf<(i64 relocImm), [{
+ return isSExtRelocImm<32>(N);
+}]>;
+
// If we have multiple users of an immediate, it's much smaller to reuse
// the register, rather than encode the immediate in every instruction.
// This has the risk of increasing register pressure from stretched live
@@ -971,6 +982,13 @@ def i64immSExt8_su : PatLeaf<(i64immSExt8), [{
return !shouldAvoidImmediateInstFormsForSize(N);
}]>;
+def i64relocImmSExt8_su : PatLeaf<(i64relocImmSExt8), [{
+ return !shouldAvoidImmediateInstFormsForSize(N);
+}]>;
+def i64relocImmSExt32_su : PatLeaf<(i64relocImmSExt32), [{
+ return !shouldAvoidImmediateInstFormsForSize(N);
+}]>;
+
// i64immZExt32 predicate - True if the 64-bit immediate fits in a 32-bit
// unsigned field.
def i64immZExt32 : ImmLeaf<i64, [{ return isUInt<32>(Imm); }]>;
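All of the immediate matchers in this block reduce to range checks; isInt<N>/isUInt<N> are the LLVM helpers behind them, and the relocImm variants additionally accept a relocatable expression (omitted below). A standalone sketch of the range tests themselves:

    #include <cstdint>
    #include <cstdio>

    // True if V is representable as an N-bit sign-extended immediate.
    template <unsigned N> static bool isIntN(int64_t V) {
      return V >= -(INT64_C(1) << (N - 1)) && V < (INT64_C(1) << (N - 1));
    }
    // True if V fits in an N-bit unsigned field.
    template <unsigned N> static bool isUIntN(uint64_t V) {
      return V < (UINT64_C(1) << N);
    }

    int main() {
      printf("%d %d\n", isIntN<8>(-128), isIntN<8>(128));        // 1 0
      printf("%d %d\n", isUIntN<32>(0xffffffffULL),
                        isUIntN<32>(0x100000000ULL));            // 1 0
    }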
@@ -1106,13 +1124,15 @@ def POP32r : I<0x58, AddRegFrm, (outs GR32:$reg), (ins), "pop{l}\t$reg", [],
IIC_POP_REG>, OpSize32, Requires<[Not64BitMode]>;
def POP16rmr: I<0x8F, MRM0r, (outs GR16:$reg), (ins), "pop{w}\t$reg", [],
IIC_POP_REG>, OpSize16;
-def POP16rmm: I<0x8F, MRM0m, (outs), (ins i16mem:$dst), "pop{w}\t$dst", [],
- IIC_POP_MEM>, OpSize16;
def POP32rmr: I<0x8F, MRM0r, (outs GR32:$reg), (ins), "pop{l}\t$reg", [],
IIC_POP_REG>, OpSize32, Requires<[Not64BitMode]>;
+} // mayLoad, SchedRW
+let mayStore = 1, mayLoad = 1, SchedRW = [WriteRMW] in {
+def POP16rmm: I<0x8F, MRM0m, (outs), (ins i16mem:$dst), "pop{w}\t$dst", [],
+ IIC_POP_MEM>, OpSize16;
def POP32rmm: I<0x8F, MRM0m, (outs), (ins i32mem:$dst), "pop{l}\t$dst", [],
IIC_POP_MEM>, OpSize32, Requires<[Not64BitMode]>;
-} // mayLoad, SchedRW
+} // mayStore, mayLoad, WriteRMW
let mayStore = 1, SchedRW = [WriteStore] in {
def PUSH16r : I<0x50, AddRegFrm, (outs), (ins GR16:$reg), "push{w}\t$reg",[],
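The POP reclassification above matches what the instruction really does: pop with a memory destination both loads (the stack slot) and stores (the destination), hence mayLoad plus mayStore and the read-modify-write scheduling class. A scalar model of pop m64 (illustrative; note RSP is bumped before the destination address would be computed):

    #include <cstdint>

    static void pop64_to_mem(uint64_t *&RSP, uint64_t *Dst) {
      uint64_t V = *RSP;  // the load half  -> mayLoad
      ++RSP;              // stack pointer adjustment
      *Dst = V;           // the store half -> mayStore
    }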
@@ -1194,9 +1214,10 @@ def POP64r : I<0x58, AddRegFrm, (outs GR64:$reg), (ins), "pop{q}\t$reg", [],
IIC_POP_REG>, OpSize32, Requires<[In64BitMode]>;
def POP64rmr: I<0x8F, MRM0r, (outs GR64:$reg), (ins), "pop{q}\t$reg", [],
IIC_POP_REG>, OpSize32, Requires<[In64BitMode]>;
+} // mayLoad, SchedRW
+let mayLoad = 1, mayStore = 1, SchedRW = [WriteRMW] in
def POP64rmm: I<0x8F, MRM0m, (outs), (ins i64mem:$dst), "pop{q}\t$dst", [],
IIC_POP_MEM>, OpSize32, Requires<[In64BitMode]>;
-} // mayLoad, SchedRW
let mayStore = 1, SchedRW = [WriteStore] in {
def PUSH64r : I<0x50, AddRegFrm, (outs), (ins GR64:$reg), "push{q}\t$reg", [],
IIC_PUSH_REG>, OpSize32, Requires<[In64BitMode]>;
@@ -1965,7 +1986,12 @@ def REX64_PREFIX : I<0x48, RawFrm, (outs), (ins), "rex64", []>,
Requires<[In64BitMode]>;
// Data16 instruction prefix
-def DATA16_PREFIX : I<0x66, RawFrm, (outs), (ins), "data16", []>;
+def DATA16_PREFIX : I<0x66, RawFrm, (outs), (ins), "data16", []>,
+ Requires<[Not16BitMode]>;
+
+// Data32 instruction prefix
+def DATA32_PREFIX : I<0x66, RawFrm, (outs), (ins), "data32", []>,
+ Requires<[In16BitMode]>;
// Repeat string operation instruction prefixes
// These use the DF flag in the EFLAGS register to inc or dec ESI/EDI, with ECX as the count
@@ -2079,6 +2105,7 @@ def BOUNDS32rm : I<0x62, MRMSrcMem, (outs GR32:$dst), (ins i32mem:$src),
def ARPL16rr : I<0x63, MRMDestReg, (outs GR16:$dst), (ins GR16:$src),
"arpl\t{$src, $dst|$dst, $src}", [], IIC_ARPL_REG>,
Requires<[Not64BitMode]>;
+let mayStore = 1 in
def ARPL16mr : I<0x63, MRMDestMem, (outs), (ins i16mem:$dst, GR16:$src),
"arpl\t{$src, $dst|$dst, $src}", [], IIC_ARPL_MEM>,
Requires<[Not64BitMode]>;
@@ -2448,8 +2475,19 @@ def : InstAlias<"monitorx\t{%rax, %rcx, %rdx|rdx, rcx, rax}", (MONITORXrrr)>,
//===----------------------------------------------------------------------===//
// CLZERO Instruction
//
-let Uses = [EAX] in
-def CLZEROr : I<0x01, MRM_FC, (outs), (ins), "clzero", []>, TB;
+let SchedRW = [WriteSystem] in {
+ let Uses = [EAX] in
+ def CLZEROr : I<0x01, MRM_FC, (outs), (ins), "clzero", [], IIC_SSE_CLZERO>,
+ TB, Requires<[HasCLZERO]>;
+
+ let usesCustomInserter = 1 in {
+ def CLZERO : PseudoI<(outs), (ins i32mem:$src1),
+ [(int_x86_clzero addr:$src1)]>, Requires<[HasCLZERO]>;
+ }
+} // SchedRW
+
+def : InstAlias<"clzero\t{%eax|eax}", (CLZEROr)>, Requires<[Not64BitMode]>;
+def : InstAlias<"clzero\t{%rax|rax}", (CLZEROr)>, Requires<[In64BitMode]>;
//===----------------------------------------------------------------------===//
// Pattern fragments to auto generate TBM instructions.
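For context: CLZERO zeroes the whole cache line containing the address in (E/R)AX, which is why the real instruction takes no explicit operands while the pseudo carries the pointer for the custom inserter. A rough portable analogue, assuming a 64-byte line (the actual line size is implementation-defined):

    #include <cstdint>
    #include <cstring>

    static void clzero_model(void *Addr) {
      constexpr uintptr_t LineSize = 64;  // assumption: typical line size
      uintptr_t Base = reinterpret_cast<uintptr_t>(Addr) & ~(LineSize - 1);
      std::memset(reinterpret_cast<void *>(Base), 0, LineSize);
    }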
@@ -2522,10 +2560,10 @@ let Predicates = [HasTBM] in {
// Memory Instructions
//
+let Predicates = [HasCLFLUSHOPT] in
def CLFLUSHOPT : I<0xAE, MRM7m, (outs), (ins i8mem:$src),
"clflushopt\t$src", [(int_x86_clflushopt addr:$src)]>, PD;
def CLWB : I<0xAE, MRM6m, (outs), (ins i8mem:$src), "clwb\t$src", []>, PD;
-def PCOMMIT : I<0xAE, MRM_F8, (outs), (ins), "pcommit", []>, PD;
//===----------------------------------------------------------------------===//
@@ -2977,7 +3015,7 @@ def : InstAlias<"mov\t{$mem, $seg|$seg, $mem}", (MOV32sm SEGMENT_REG:$seg, i32me
def : InstAlias<"mov\t{$seg, $mem|$mem, $seg}", (MOV32ms i32mem:$mem, SEGMENT_REG:$seg), 0>;
// Match 'movq <largeimm>, <reg>' as an alias for movabsq.
-def : InstAlias<"movq\t{$imm, $reg|$reg, $imm}", (MOV64ri GR64:$reg, i64imm:$imm), 0>;
+def : InstAlias<"mov{q}\t{$imm, $reg|$reg, $imm}", (MOV64ri GR64:$reg, i64imm:$imm), 0>;
// Match 'movq GR64, MMX' as an alias for movd.
def : InstAlias<"movq\t{$src, $dst|$dst, $src}",
diff --git a/lib/Target/X86/X86InstrMMX.td b/lib/Target/X86/X86InstrMMX.td
index 0bb106823983..dc3800ce381b 100644
--- a/lib/Target/X86/X86InstrMMX.td
+++ b/lib/Target/X86/X86InstrMMX.td
@@ -294,6 +294,7 @@ def MMX_MOVQ64rm : MMXI<0x6F, MRMSrcMem, (outs VR64:$dst), (ins i64mem:$src),
[(set VR64:$dst, (load_mmx addr:$src))],
IIC_MMX_MOVQ_RM>;
} // SchedRW
+
let SchedRW = [WriteStore] in
def MMX_MOVQ64mr : MMXI<0x7F, MRMDestMem, (outs), (ins i64mem:$dst, VR64:$src),
"movq\t{$src, $dst|$dst, $src}",
@@ -378,7 +379,6 @@ defm MMX_PHADD : SS3I_binop_rm_int_mm<0x02, "phaddd", int_x86_ssse3_phadd_d,
defm MMX_PHADDSW : SS3I_binop_rm_int_mm<0x03, "phaddsw",int_x86_ssse3_phadd_sw,
MMX_PHADDSUBW>;
-
// -- Subtraction
defm MMX_PSUBB : MMXI_binop_rm_int<0xF8, "psubb", int_x86_mmx_psub_b,
MMX_INTALU_ITINS>;
@@ -479,13 +479,6 @@ defm MMX_PSRLQ : MMXI_binop_rmi_int<0xD3, 0x73, MRM2r, "psrlq",
int_x86_mmx_psrl_q, int_x86_mmx_psrli_q,
MMX_SHIFT_ITINS>;
-def : Pat<(int_x86_mmx_psrl_w VR64:$src1, (load_mvmmx addr:$src2)),
- (MMX_PSRLWrm VR64:$src1, addr:$src2)>;
-def : Pat<(int_x86_mmx_psrl_d VR64:$src1, (load_mvmmx addr:$src2)),
- (MMX_PSRLDrm VR64:$src1, addr:$src2)>;
-def : Pat<(int_x86_mmx_psrl_q VR64:$src1, (load_mvmmx addr:$src2)),
- (MMX_PSRLQrm VR64:$src1, addr:$src2)>;
-
defm MMX_PSLLW : MMXI_binop_rmi_int<0xF1, 0x71, MRM6r, "psllw",
int_x86_mmx_psll_w, int_x86_mmx_pslli_w,
MMX_SHIFT_ITINS>;
@@ -496,13 +489,6 @@ defm MMX_PSLLQ : MMXI_binop_rmi_int<0xF3, 0x73, MRM6r, "psllq",
int_x86_mmx_psll_q, int_x86_mmx_pslli_q,
MMX_SHIFT_ITINS>;
-def : Pat<(int_x86_mmx_psll_w VR64:$src1, (load_mvmmx addr:$src2)),
- (MMX_PSLLWrm VR64:$src1, addr:$src2)>;
-def : Pat<(int_x86_mmx_psll_d VR64:$src1, (load_mvmmx addr:$src2)),
- (MMX_PSLLDrm VR64:$src1, addr:$src2)>;
-def : Pat<(int_x86_mmx_psll_q VR64:$src1, (load_mvmmx addr:$src2)),
- (MMX_PSLLQrm VR64:$src1, addr:$src2)>;
-
defm MMX_PSRAW : MMXI_binop_rmi_int<0xE1, 0x71, MRM4r, "psraw",
int_x86_mmx_psra_w, int_x86_mmx_psrai_w,
MMX_SHIFT_ITINS>;
@@ -510,11 +496,6 @@ defm MMX_PSRAD : MMXI_binop_rmi_int<0xE2, 0x72, MRM4r, "psrad",
int_x86_mmx_psra_d, int_x86_mmx_psrai_d,
MMX_SHIFT_ITINS>;
-def : Pat<(int_x86_mmx_psra_w VR64:$src1, (load_mvmmx addr:$src2)),
- (MMX_PSRAWrm VR64:$src1, addr:$src2)>;
-def : Pat<(int_x86_mmx_psra_d VR64:$src1, (load_mvmmx addr:$src2)),
- (MMX_PSRADrm VR64:$src1, addr:$src2)>;
-
// Comparison Instructions
defm MMX_PCMPEQB : MMXI_binop_rm_int<0x74, "pcmpeqb", int_x86_mmx_pcmpeq_b,
MMX_INTALU_ITINS>;
@@ -576,9 +557,6 @@ def MMX_PSHUFWmi : MMXIi8<0x70, MRMSrcMem,
imm:$src2))],
IIC_MMX_PSHUF>, Sched<[WriteShuffleLd]>;
-
-
-
// -- Conversion Instructions
defm MMX_CVTPS2PI : sse12_cvt_pint<0x2D, VR128, VR64, int_x86_sse_cvtps2pi,
f64mem, load, "cvtps2pi\t{$src, $dst|$dst, $src}",
@@ -639,7 +617,6 @@ def MMX_PMOVMSKBrr : MMXI<0xD7, MRMSrcReg, (outs GR32orGR64:$dst),
[(set GR32orGR64:$dst,
(int_x86_mmx_pmovmskb VR64:$src))]>;
-
// Low word of XMM to MMX.
def MMX_X86movdq2q : SDNode<"X86ISD::MOVDQ2Q", SDTypeProfile<1, 1,
[SDTCisVT<0, x86mmx>, SDTCisVT<1, v2i64>]>>;
@@ -670,6 +647,16 @@ def : Pat<(f64 (bitconvert (x86mmx VR64:$src))),
(MMX_MOVQ2FR64rr VR64:$src)>;
def : Pat<(x86mmx (bitconvert (f64 FR64:$src))),
(MMX_MOVFR642Qrr FR64:$src)>;
+def : Pat<(x86mmx (MMX_X86movdq2q
+ (bc_v2i64 (v4i32 (int_x86_sse2_cvtps2dq VR128:$src))))),
+ (MMX_CVTPS2PIirr VR128:$src)>;
+def : Pat<(x86mmx (MMX_X86movdq2q
+ (bc_v2i64 (v4i32 (fp_to_sint (v4f32 VR128:$src)))))),
+ (MMX_CVTTPS2PIirr VR128:$src)>;
+def : Pat<(x86mmx (MMX_X86movdq2q
+ (bc_v2i64 (v4i32 (X86cvtp2Int (v2f64 VR128:$src)))))),
+ (MMX_CVTPD2PIirr VR128:$src)>;
+def : Pat<(x86mmx (MMX_X86movdq2q
+ (bc_v2i64 (v4i32 (X86cvttp2si (v2f64 VR128:$src)))))),
+ (MMX_CVTTPD2PIirr VR128:$src)>;
}
-
-
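The new patterns above fold "convert in XMM, then MOVDQ2Q the low half" into the direct MMX conversions; both routes produce the same two low-lane results. A scalar sketch of the CVTPS2PI case, using std::lrintf to model the default round-to-nearest-even mode:

    #include <cmath>
    #include <cstdint>
    #include <cstdio>

    int main() {
      float Src[4] = {1.5f, -2.5f, 3.0f, 4.0f};
      // Route 1: cvtps2dq converts all four lanes, movdq2q keeps the low two.
      // Route 2: cvtps2pi converts just the low two lanes directly.
      int32_t Lo0 = (int32_t)std::lrintf(Src[0]);  // 2  (ties round to even)
      int32_t Lo1 = (int32_t)std::lrintf(Src[1]);  // -2 (ties round to even)
      printf("%d %d\n", Lo0, Lo1);
    }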
diff --git a/lib/Target/X86/X86InstrMPX.td b/lib/Target/X86/X86InstrMPX.td
index 309f601d1fce..104ba2a174db 100644
--- a/lib/Target/X86/X86InstrMPX.td
+++ b/lib/Target/X86/X86InstrMPX.td
@@ -14,6 +14,7 @@
//===----------------------------------------------------------------------===//
multiclass mpx_bound_make<bits<8> opc, string OpcodeStr> {
+let mayLoad = 1 in {
def 32rm: I<opc, MRMSrcMem, (outs BNDR:$dst), (ins i32mem:$src),
OpcodeStr#"\t{$src, $dst|$dst, $src}", []>,
Requires<[HasMPX, Not64BitMode]>;
@@ -21,16 +22,19 @@ multiclass mpx_bound_make<bits<8> opc, string OpcodeStr> {
OpcodeStr#"\t{$src, $dst|$dst, $src}", []>,
Requires<[HasMPX, In64BitMode]>;
}
+}
defm BNDMK : mpx_bound_make<0x1B, "bndmk">, XS;
multiclass mpx_bound_check<bits<8> opc, string OpcodeStr> {
+let mayLoad = 1 in {
def 32rm: I<opc, MRMSrcMem, (outs), (ins BNDR:$src1, i32mem:$src2),
OpcodeStr#"\t{$src2, $src1|$src1, $src2}", []>,
Requires<[HasMPX, Not64BitMode]>;
def 64rm: RI<opc, MRMSrcMem, (outs), (ins BNDR:$src1, i64mem:$src2),
OpcodeStr#"\t{$src2, $src1|$src1, $src2}", []>,
Requires<[HasMPX, In64BitMode]>;
+}
def 32rr: I<opc, MRMSrcReg, (outs), (ins BNDR:$src1, GR32:$src2),
OpcodeStr#"\t{$src2, $src1|$src1, $src2}", []>,
Requires<[HasMPX, Not64BitMode]>;
@@ -45,16 +49,18 @@ defm BNDCN : mpx_bound_check<0x1B, "bndcn">, XD;
def BNDMOVRMrr : I<0x1A, MRMSrcReg, (outs BNDR:$dst), (ins BNDR:$src),
"bndmov\t{$src, $dst|$dst, $src}", []>, PD,
Requires<[HasMPX]>;
+let mayLoad = 1 in {
def BNDMOVRM32rm : I<0x1A, MRMSrcMem, (outs BNDR:$dst), (ins i64mem:$src),
"bndmov\t{$src, $dst|$dst, $src}", []>, PD,
Requires<[HasMPX, Not64BitMode]>;
def BNDMOVRM64rm : RI<0x1A, MRMSrcMem, (outs BNDR:$dst), (ins i128mem:$src),
"bndmov\t{$src, $dst|$dst, $src}", []>, PD,
Requires<[HasMPX, In64BitMode]>;
-
+}
def BNDMOVMRrr : I<0x1B, MRMDestReg, (outs BNDR:$dst), (ins BNDR:$src),
"bndmov\t{$src, $dst|$dst, $src}", []>, PD,
Requires<[HasMPX]>;
+let mayStore = 1 in {
def BNDMOVMR32mr : I<0x1B, MRMDestMem, (outs), (ins i64mem:$dst, BNDR:$src),
"bndmov\t{$src, $dst|$dst, $src}", []>, PD,
Requires<[HasMPX, Not64BitMode]>;
@@ -65,6 +71,8 @@ def BNDMOVMR64mr : RI<0x1B, MRMDestMem, (outs), (ins i128mem:$dst, BNDR:$src),
def BNDSTXmr: I<0x1B, MRMDestMem, (outs), (ins i64mem:$dst, BNDR:$src),
"bndstx\t{$src, $dst|$dst, $src}", []>, PS,
Requires<[HasMPX]>;
+}
+let mayLoad = 1 in
def BNDLDXrm: I<0x1A, MRMSrcMem, (outs BNDR:$dst), (ins i64mem:$src),
"bndldx\t{$src, $dst|$dst, $src}", []>, PS,
Requires<[HasMPX]>;
diff --git a/lib/Target/X86/X86InstrSSE.td b/lib/Target/X86/X86InstrSSE.td
index 1812d01711d1..e1bf28cbf612 100644
--- a/lib/Target/X86/X86InstrSSE.td
+++ b/lib/Target/X86/X86InstrSSE.td
@@ -259,8 +259,8 @@ multiclass sse12_fp_scalar<bits<8> opc, string OpcodeStr, SDNode OpNode,
/// sse12_fp_scalar_int - SSE 1 & 2 scalar instructions intrinsics class
multiclass sse12_fp_scalar_int<bits<8> opc, string OpcodeStr,
- SDPatternOperator Int, RegisterClass RC,
- string asm, Operand memopr,
+ SDPatternOperator OpNode, RegisterClass RC,
+ ValueType VT, string asm, Operand memopr,
ComplexPattern mem_cpat, Domain d,
OpndItins itins, bit Is2Addr = 1> {
let isCodeGenOnly = 1, hasSideEffects = 0 in {
@@ -268,14 +268,14 @@ let isCodeGenOnly = 1, hasSideEffects = 0 in {
!if(Is2Addr,
!strconcat(asm, "\t{$src2, $dst|$dst, $src2}"),
!strconcat(asm, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
- [(set RC:$dst, (Int RC:$src1, RC:$src2))], itins.rr, d>,
+ [(set RC:$dst, (VT (OpNode RC:$src1, RC:$src2)))], itins.rr, d>,
Sched<[itins.Sched]>;
let mayLoad = 1 in
def rm_Int : SI_Int<opc, MRMSrcMem, (outs RC:$dst), (ins RC:$src1, memopr:$src2),
!if(Is2Addr,
!strconcat(asm, "\t{$src2, $dst|$dst, $src2}"),
!strconcat(asm, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
- [(set RC:$dst, (Int RC:$src1, mem_cpat:$src2))], itins.rm, d>,
+ [(set RC:$dst, (VT (OpNode RC:$src1, mem_cpat:$src2)))], itins.rm, d>,
Sched<[itins.Sched.Folded, ReadAfterLd]>;
}
}
@@ -446,9 +446,9 @@ def : Pat<(v4f64 (bitconvert (v8f32 VR256:$src))), (v4f64 VR256:$src)>;
let isReMaterializable = 1, isAsCheapAsAMove = 1, canFoldAsLoad = 1,
isPseudo = 1, SchedRW = [WriteZero] in {
def FsFLD0SS : I<0, Pseudo, (outs FR32:$dst), (ins), "",
- [(set FR32:$dst, fp32imm0)]>, Requires<[HasSSE1, NoVLX_Or_NoDQI]>;
+ [(set FR32:$dst, fp32imm0)]>, Requires<[HasSSE1, NoAVX512]>;
def FsFLD0SD : I<0, Pseudo, (outs FR64:$dst), (ins), "",
- [(set FR64:$dst, fpimm0)]>, Requires<[HasSSE2, NoVLX_Or_NoDQI]>;
+ [(set FR64:$dst, fpimm0)]>, Requires<[HasSSE2, NoAVX512]>;
}
//===----------------------------------------------------------------------===//
@@ -461,12 +461,12 @@ let isReMaterializable = 1, isAsCheapAsAMove = 1, canFoldAsLoad = 1,
// We set canFoldAsLoad because this can be converted to a constant-pool
// load of an all-zeros value if folding it would be beneficial.
let isReMaterializable = 1, isAsCheapAsAMove = 1, canFoldAsLoad = 1,
- isPseudo = 1, Predicates = [NoVLX], SchedRW = [WriteZero] in {
+ isPseudo = 1, SchedRW = [WriteZero] in {
def V_SET0 : I<0, Pseudo, (outs VR128:$dst), (ins), "",
[(set VR128:$dst, (v4f32 immAllZerosV))]>;
}
-let Predicates = [NoVLX] in
+let Predicates = [NoAVX512] in
def : Pat<(v4i32 immAllZerosV), (V_SET0)>;
@@ -475,7 +475,7 @@ def : Pat<(v4i32 immAllZerosV), (V_SET0)>;
// at the rename stage without using any execution unit, so SET0PSY
// and SET0PDY can be used for vector int instructions without penalty
let isReMaterializable = 1, isAsCheapAsAMove = 1, canFoldAsLoad = 1,
- isPseudo = 1, Predicates = [HasAVX, NoVLX], SchedRW = [WriteZero] in {
+ isPseudo = 1, Predicates = [NoAVX512], SchedRW = [WriteZero] in {
def AVX_SET0 : I<0, Pseudo, (outs VR256:$dst), (ins), "",
[(set VR256:$dst, (v8i32 immAllZerosV))]>;
}
@@ -491,7 +491,6 @@ let isReMaterializable = 1, isAsCheapAsAMove = 1, canFoldAsLoad = 1,
[(set VR256:$dst, (v8i32 immAllOnesV))]>;
}
-
//===----------------------------------------------------------------------===//
// SSE 1 & 2 - Move FP Scalar Instructions
//
@@ -527,12 +526,12 @@ multiclass sse12_move<RegisterClass RC, SDNode OpNode, ValueType vt,
// AVX
defm V#NAME : sse12_move_rr<RC, OpNode, vt, x86memop, OpcodeStr,
"\t{$src2, $src1, $dst|$dst, $src1, $src2}", d>,
- VEX_4V, VEX_LIG;
+ VEX_4V, VEX_LIG, VEX_WIG;
def V#NAME#mr : SI<0x11, MRMDestMem, (outs), (ins x86memop:$dst, RC:$src),
!strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
[(store RC:$src, addr:$dst)], IIC_SSE_MOV_S_MR, d>,
- VEX, VEX_LIG, Sched<[WriteStore]>;
+ VEX, VEX_LIG, Sched<[WriteStore]>, VEX_WIG;
// SSE1 & 2
let Constraints = "$src1 = $dst" in {
defm NAME : sse12_move_rr<RC, OpNode, vt, x86memop, OpcodeStr,
@@ -552,7 +551,7 @@ multiclass sse12_move_rm<RegisterClass RC, X86MemOperand x86memop,
def V#NAME#rm : SI<0x10, MRMSrcMem, (outs RC:$dst), (ins x86memop:$src),
!strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
[(set RC:$dst, (mem_pat addr:$src))],
- IIC_SSE_MOV_S_RM, d>, VEX, VEX_LIG, Sched<[WriteLoad]>;
+ IIC_SSE_MOV_S_RM, d>, VEX, VEX_LIG, Sched<[WriteLoad]>, VEX_WIG;
def NAME#rm : SI<0x10, MRMSrcMem, (outs RC:$dst), (ins x86memop:$src),
!strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
[(set RC:$dst, (mem_pat addr:$src))],
@@ -644,10 +643,6 @@ let Predicates = [UseAVX] in {
(VMOVSDrr VR128:$src1, (COPY_TO_REGCLASS VR128:$src2, FR64))>;
def : Pat<(v2f64 (X86Movsd VR128:$src1, VR128:$src2)),
(VMOVSDrr VR128:$src1, (COPY_TO_REGCLASS VR128:$src2, FR64))>;
- def : Pat<(v4f32 (X86Movsd VR128:$src1, VR128:$src2)),
- (VMOVSDrr VR128:$src1, (COPY_TO_REGCLASS VR128:$src2, FR64))>;
- def : Pat<(v4i32 (X86Movsd VR128:$src1, VR128:$src2)),
- (VMOVSDrr VR128:$src1, (COPY_TO_REGCLASS VR128:$src2, FR64))>;
// 256-bit variants
def : Pat<(v4i64 (X86Movsd VR256:$src1, VR256:$src2)),
@@ -738,10 +733,6 @@ let Predicates = [UseSSE2] in {
(MOVSDrr VR128:$src1, (COPY_TO_REGCLASS VR128:$src2, FR64))>;
def : Pat<(v2f64 (X86Movsd VR128:$src1, VR128:$src2)),
(MOVSDrr VR128:$src1, (COPY_TO_REGCLASS VR128:$src2, FR64))>;
- def : Pat<(v4f32 (X86Movsd VR128:$src1, VR128:$src2)),
- (MOVSDrr VR128:$src1, (COPY_TO_REGCLASS VR128:$src2, FR64))>;
- def : Pat<(v4i32 (X86Movsd VR128:$src1, VR128:$src2)),
- (MOVSDrr VR128:$src1, (COPY_TO_REGCLASS VR128:$src2, FR64))>;
// FIXME: Instead of a X86Movlps there should be a X86Movsd here, the problem
// is during lowering, where it's not possible to recognize the fold because
@@ -786,29 +777,29 @@ let canFoldAsLoad = 1, isReMaterializable = 1 in
let Predicates = [HasAVX, NoVLX] in {
defm VMOVAPS : sse12_mov_packed<0x28, VR128, f128mem, alignedloadv4f32,
"movaps", SSEPackedSingle, SSE_MOVA_ITINS>,
- PS, VEX;
+ PS, VEX, VEX_WIG;
defm VMOVAPD : sse12_mov_packed<0x28, VR128, f128mem, alignedloadv2f64,
"movapd", SSEPackedDouble, SSE_MOVA_ITINS>,
- PD, VEX;
+ PD, VEX, VEX_WIG;
defm VMOVUPS : sse12_mov_packed<0x10, VR128, f128mem, loadv4f32,
"movups", SSEPackedSingle, SSE_MOVU_ITINS>,
- PS, VEX;
+ PS, VEX, VEX_WIG;
defm VMOVUPD : sse12_mov_packed<0x10, VR128, f128mem, loadv2f64,
"movupd", SSEPackedDouble, SSE_MOVU_ITINS>,
- PD, VEX;
+ PD, VEX, VEX_WIG;
defm VMOVAPSY : sse12_mov_packed<0x28, VR256, f256mem, alignedloadv8f32,
"movaps", SSEPackedSingle, SSE_MOVA_ITINS>,
- PS, VEX, VEX_L;
+ PS, VEX, VEX_L, VEX_WIG;
defm VMOVAPDY : sse12_mov_packed<0x28, VR256, f256mem, alignedloadv4f64,
"movapd", SSEPackedDouble, SSE_MOVA_ITINS>,
- PD, VEX, VEX_L;
+ PD, VEX, VEX_L, VEX_WIG;
defm VMOVUPSY : sse12_mov_packed<0x10, VR256, f256mem, loadv8f32,
"movups", SSEPackedSingle, SSE_MOVU_ITINS>,
- PS, VEX, VEX_L;
+ PS, VEX, VEX_L, VEX_WIG;
defm VMOVUPDY : sse12_mov_packed<0x10, VR256, f256mem, loadv4f64,
"movupd", SSEPackedDouble, SSE_MOVU_ITINS>,
- PD, VEX, VEX_L;
+ PD, VEX, VEX_L, VEX_WIG;
}
let Predicates = [UseSSE1] in {
@@ -832,35 +823,35 @@ let SchedRW = [WriteStore], Predicates = [HasAVX, NoVLX] in {
def VMOVAPSmr : VPSI<0x29, MRMDestMem, (outs), (ins f128mem:$dst, VR128:$src),
"movaps\t{$src, $dst|$dst, $src}",
[(alignedstore (v4f32 VR128:$src), addr:$dst)],
- IIC_SSE_MOVA_P_MR>, VEX;
+ IIC_SSE_MOVA_P_MR>, VEX, VEX_WIG;
def VMOVAPDmr : VPDI<0x29, MRMDestMem, (outs), (ins f128mem:$dst, VR128:$src),
"movapd\t{$src, $dst|$dst, $src}",
[(alignedstore (v2f64 VR128:$src), addr:$dst)],
- IIC_SSE_MOVA_P_MR>, VEX;
+ IIC_SSE_MOVA_P_MR>, VEX, VEX_WIG;
def VMOVUPSmr : VPSI<0x11, MRMDestMem, (outs), (ins f128mem:$dst, VR128:$src),
"movups\t{$src, $dst|$dst, $src}",
[(store (v4f32 VR128:$src), addr:$dst)],
- IIC_SSE_MOVU_P_MR>, VEX;
+ IIC_SSE_MOVU_P_MR>, VEX, VEX_WIG;
def VMOVUPDmr : VPDI<0x11, MRMDestMem, (outs), (ins f128mem:$dst, VR128:$src),
"movupd\t{$src, $dst|$dst, $src}",
[(store (v2f64 VR128:$src), addr:$dst)],
- IIC_SSE_MOVU_P_MR>, VEX;
+ IIC_SSE_MOVU_P_MR>, VEX, VEX_WIG;
def VMOVAPSYmr : VPSI<0x29, MRMDestMem, (outs), (ins f256mem:$dst, VR256:$src),
"movaps\t{$src, $dst|$dst, $src}",
[(alignedstore256 (v8f32 VR256:$src), addr:$dst)],
- IIC_SSE_MOVA_P_MR>, VEX, VEX_L;
+ IIC_SSE_MOVA_P_MR>, VEX, VEX_L, VEX_WIG;
def VMOVAPDYmr : VPDI<0x29, MRMDestMem, (outs), (ins f256mem:$dst, VR256:$src),
"movapd\t{$src, $dst|$dst, $src}",
[(alignedstore256 (v4f64 VR256:$src), addr:$dst)],
- IIC_SSE_MOVA_P_MR>, VEX, VEX_L;
+ IIC_SSE_MOVA_P_MR>, VEX, VEX_L, VEX_WIG;
def VMOVUPSYmr : VPSI<0x11, MRMDestMem, (outs), (ins f256mem:$dst, VR256:$src),
"movups\t{$src, $dst|$dst, $src}",
[(store (v8f32 VR256:$src), addr:$dst)],
- IIC_SSE_MOVU_P_MR>, VEX, VEX_L;
+ IIC_SSE_MOVU_P_MR>, VEX, VEX_L, VEX_WIG;
def VMOVUPDYmr : VPDI<0x11, MRMDestMem, (outs), (ins f256mem:$dst, VR256:$src),
"movupd\t{$src, $dst|$dst, $src}",
[(store (v4f64 VR256:$src), addr:$dst)],
- IIC_SSE_MOVU_P_MR>, VEX, VEX_L;
+ IIC_SSE_MOVU_P_MR>, VEX, VEX_L, VEX_WIG;
} // SchedRW
// For disassembler
@@ -869,35 +860,35 @@ let isCodeGenOnly = 1, ForceDisassemble = 1, hasSideEffects = 0,
def VMOVAPSrr_REV : VPSI<0x29, MRMDestReg, (outs VR128:$dst),
(ins VR128:$src),
"movaps\t{$src, $dst|$dst, $src}", [],
- IIC_SSE_MOVA_P_RR>, VEX;
+ IIC_SSE_MOVA_P_RR>, VEX, VEX_WIG;
def VMOVAPDrr_REV : VPDI<0x29, MRMDestReg, (outs VR128:$dst),
(ins VR128:$src),
"movapd\t{$src, $dst|$dst, $src}", [],
- IIC_SSE_MOVA_P_RR>, VEX;
+ IIC_SSE_MOVA_P_RR>, VEX, VEX_WIG;
def VMOVUPSrr_REV : VPSI<0x11, MRMDestReg, (outs VR128:$dst),
(ins VR128:$src),
"movups\t{$src, $dst|$dst, $src}", [],
- IIC_SSE_MOVU_P_RR>, VEX;
+ IIC_SSE_MOVU_P_RR>, VEX, VEX_WIG;
def VMOVUPDrr_REV : VPDI<0x11, MRMDestReg, (outs VR128:$dst),
(ins VR128:$src),
"movupd\t{$src, $dst|$dst, $src}", [],
- IIC_SSE_MOVU_P_RR>, VEX;
+ IIC_SSE_MOVU_P_RR>, VEX, VEX_WIG;
def VMOVAPSYrr_REV : VPSI<0x29, MRMDestReg, (outs VR256:$dst),
(ins VR256:$src),
"movaps\t{$src, $dst|$dst, $src}", [],
- IIC_SSE_MOVA_P_RR>, VEX, VEX_L;
+ IIC_SSE_MOVA_P_RR>, VEX, VEX_L, VEX_WIG;
def VMOVAPDYrr_REV : VPDI<0x29, MRMDestReg, (outs VR256:$dst),
(ins VR256:$src),
"movapd\t{$src, $dst|$dst, $src}", [],
- IIC_SSE_MOVA_P_RR>, VEX, VEX_L;
+ IIC_SSE_MOVA_P_RR>, VEX, VEX_L, VEX_WIG;
def VMOVUPSYrr_REV : VPSI<0x11, MRMDestReg, (outs VR256:$dst),
(ins VR256:$src),
"movups\t{$src, $dst|$dst, $src}", [],
- IIC_SSE_MOVU_P_RR>, VEX, VEX_L;
+ IIC_SSE_MOVU_P_RR>, VEX, VEX_L, VEX_WIG;
def VMOVUPDYrr_REV : VPDI<0x11, MRMDestReg, (outs VR256:$dst),
(ins VR256:$src),
"movupd\t{$src, $dst|$dst, $src}", [],
- IIC_SSE_MOVU_P_RR>, VEX, VEX_L;
+ IIC_SSE_MOVU_P_RR>, VEX, VEX_L, VEX_WIG;
}
// Aliases to help the assembler pick two byte VEX encodings by swapping the
@@ -955,24 +946,10 @@ let isCodeGenOnly = 1, ForceDisassemble = 1, hasSideEffects = 0,
IIC_SSE_MOVU_P_RR>;
}
-// Use vmovaps/vmovups for AVX integer load/store.
let Predicates = [HasAVX, NoVLX] in {
- // 128-bit load/store
- def : Pat<(alignedloadv2i64 addr:$src),
- (VMOVAPSrm addr:$src)>;
- def : Pat<(loadv2i64 addr:$src),
- (VMOVUPSrm addr:$src)>;
-
- def : Pat<(alignedstore (v2i64 VR128:$src), addr:$dst),
- (VMOVAPSmr addr:$dst, VR128:$src)>;
- def : Pat<(alignedstore (v4i32 VR128:$src), addr:$dst),
- (VMOVAPSmr addr:$dst, VR128:$src)>;
- def : Pat<(store (v2i64 VR128:$src), addr:$dst),
- (VMOVUPSmr addr:$dst, VR128:$src)>;
- def : Pat<(store (v4i32 VR128:$src), addr:$dst),
- (VMOVUPSmr addr:$dst, VR128:$src)>;
-
- // 256-bit load/store
+ // 256-bit loads/stores need to use the floating point forms in case we
+ // don't have AVX2. Execution domain fixing will convert them to integer
+ // if AVX2 is available and changing the domain is beneficial.
def : Pat<(alignedloadv4i64 addr:$src),
(VMOVAPSYrm addr:$src)>;
def : Pat<(loadv4i64 addr:$src),
@@ -981,10 +958,18 @@ let Predicates = [HasAVX, NoVLX] in {
(VMOVAPSYmr addr:$dst, VR256:$src)>;
def : Pat<(alignedstore256 (v8i32 VR256:$src), addr:$dst),
(VMOVAPSYmr addr:$dst, VR256:$src)>;
+ def : Pat<(alignedstore256 (v16i16 VR256:$src), addr:$dst),
+ (VMOVAPSYmr addr:$dst, VR256:$src)>;
+ def : Pat<(alignedstore256 (v32i8 VR256:$src), addr:$dst),
+ (VMOVAPSYmr addr:$dst, VR256:$src)>;
def : Pat<(store (v4i64 VR256:$src), addr:$dst),
(VMOVUPSYmr addr:$dst, VR256:$src)>;
def : Pat<(store (v8i32 VR256:$src), addr:$dst),
(VMOVUPSYmr addr:$dst, VR256:$src)>;
+ def : Pat<(store (v16i16 VR256:$src), addr:$dst),
+ (VMOVUPSYmr addr:$dst, VR256:$src)>;
+ def : Pat<(store (v32i8 VR256:$src), addr:$dst),
+ (VMOVUPSYmr addr:$dst, VR256:$src)>;
// Special patterns for storing subvector extracts of lower 128-bits
// Its cheaper to just use VMOVAPS/VMOVUPS instead of VEXTRACTF128mr
@@ -994,18 +979,6 @@ let Predicates = [HasAVX, NoVLX] in {
def : Pat<(alignedstore (v4f32 (extract_subvector
(v8f32 VR256:$src), (iPTR 0))), addr:$dst),
(VMOVAPSmr addr:$dst, (v4f32 (EXTRACT_SUBREG VR256:$src,sub_xmm)))>;
- def : Pat<(alignedstore (v2i64 (extract_subvector
- (v4i64 VR256:$src), (iPTR 0))), addr:$dst),
- (VMOVAPDmr addr:$dst, (v2i64 (EXTRACT_SUBREG VR256:$src,sub_xmm)))>;
- def : Pat<(alignedstore (v4i32 (extract_subvector
- (v8i32 VR256:$src), (iPTR 0))), addr:$dst),
- (VMOVAPSmr addr:$dst, (v4i32 (EXTRACT_SUBREG VR256:$src,sub_xmm)))>;
- def : Pat<(alignedstore (v8i16 (extract_subvector
- (v16i16 VR256:$src), (iPTR 0))), addr:$dst),
- (VMOVAPSmr addr:$dst, (v8i16 (EXTRACT_SUBREG VR256:$src,sub_xmm)))>;
- def : Pat<(alignedstore (v16i8 (extract_subvector
- (v32i8 VR256:$src), (iPTR 0))), addr:$dst),
- (VMOVAPSmr addr:$dst, (v16i8 (EXTRACT_SUBREG VR256:$src,sub_xmm)))>;
def : Pat<(store (v2f64 (extract_subvector
(v4f64 VR256:$src), (iPTR 0))), addr:$dst),
@@ -1013,40 +986,6 @@ let Predicates = [HasAVX, NoVLX] in {
def : Pat<(store (v4f32 (extract_subvector
(v8f32 VR256:$src), (iPTR 0))), addr:$dst),
(VMOVUPSmr addr:$dst, (v4f32 (EXTRACT_SUBREG VR256:$src,sub_xmm)))>;
- def : Pat<(store (v2i64 (extract_subvector
- (v4i64 VR256:$src), (iPTR 0))), addr:$dst),
- (VMOVUPDmr addr:$dst, (v2i64 (EXTRACT_SUBREG VR256:$src,sub_xmm)))>;
- def : Pat<(store (v4i32 (extract_subvector
- (v8i32 VR256:$src), (iPTR 0))), addr:$dst),
- (VMOVUPSmr addr:$dst, (v4i32 (EXTRACT_SUBREG VR256:$src,sub_xmm)))>;
- def : Pat<(store (v8i16 (extract_subvector
- (v16i16 VR256:$src), (iPTR 0))), addr:$dst),
- (VMOVUPSmr addr:$dst, (v8i16 (EXTRACT_SUBREG VR256:$src,sub_xmm)))>;
- def : Pat<(store (v16i8 (extract_subvector
- (v32i8 VR256:$src), (iPTR 0))), addr:$dst),
- (VMOVUPSmr addr:$dst, (v16i8 (EXTRACT_SUBREG VR256:$src,sub_xmm)))>;
-}
-
-let Predicates = [HasAVX, NoVLX] in {
- // 128-bit load/store
- def : Pat<(alignedstore (v8i16 VR128:$src), addr:$dst),
- (VMOVAPSmr addr:$dst, VR128:$src)>;
- def : Pat<(alignedstore (v16i8 VR128:$src), addr:$dst),
- (VMOVAPSmr addr:$dst, VR128:$src)>;
- def : Pat<(store (v8i16 VR128:$src), addr:$dst),
- (VMOVUPSmr addr:$dst, VR128:$src)>;
- def : Pat<(store (v16i8 VR128:$src), addr:$dst),
- (VMOVUPSmr addr:$dst, VR128:$src)>;
-
- // 256-bit load/store
- def : Pat<(alignedstore256 (v16i16 VR256:$src), addr:$dst),
- (VMOVAPSYmr addr:$dst, VR256:$src)>;
- def : Pat<(alignedstore256 (v32i8 VR256:$src), addr:$dst),
- (VMOVAPSYmr addr:$dst, VR256:$src)>;
- def : Pat<(store (v16i16 VR256:$src), addr:$dst),
- (VMOVUPSYmr addr:$dst, VR256:$src)>;
- def : Pat<(store (v32i8 VR256:$src), addr:$dst),
- (VMOVUPSYmr addr:$dst, VR256:$src)>;
}
// Use movaps / movups for SSE integer load / store (one byte shorter).
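Execution domain fixing, mentioned in the comment above, is a late pass that rewrites an instruction into an equivalent opcode from another execution domain when that avoids a domain-crossing penalty. A toy table-driven sketch of the idea (opcode names taken from this file; the real pass works over opcode numbers):

    #include <array>
    #include <cstdio>

    // Each row holds equivalent opcodes in the {float, double, integer} domains.
    struct ReplaceableRow { const char *Ops[3]; };

    static const std::array<ReplaceableRow, 2> Table = {{
        {{"VMOVAPSYrm", "VMOVAPDYrm", "VMOVDQAYrm"}},  // aligned 256-bit load
        {{"VMOVUPSYrm", "VMOVUPDYrm", "VMOVDQUYrm"}},  // unaligned 256-bit load
    }};

    int main() {
      unsigned WantedDomain = 2;  // integer domain, only legal with AVX2
      printf("%s\n", Table[0].Ops[WantedDomain]);  // VMOVDQAYrm
    }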
@@ -1107,7 +1046,7 @@ multiclass sse12_mov_hilo_packed<bits<8>opc, SDNode psnode, SDNode pdnode,
let Predicates = [UseAVX] in
defm V#NAME : sse12_mov_hilo_packed_base<opc, psnode, pdnode, base_opc,
"\t{$src2, $src1, $dst|$dst, $src1, $src2}",
- itin>, VEX_4V;
+ itin>, VEX_4V, VEX_WIG;
let Constraints = "$src1 = $dst" in
defm NAME : sse12_mov_hilo_packed_base<opc, psnode, pdnode, base_opc,
@@ -1126,12 +1065,12 @@ def VMOVLPSmr : VPSI<0x13, MRMDestMem, (outs), (ins f64mem:$dst, VR128:$src),
"movlps\t{$src, $dst|$dst, $src}",
[(store (f64 (extractelt (bc_v2f64 (v4f32 VR128:$src)),
(iPTR 0))), addr:$dst)],
- IIC_SSE_MOV_LH>, VEX;
+ IIC_SSE_MOV_LH>, VEX, VEX_WIG;
def VMOVLPDmr : VPDI<0x13, MRMDestMem, (outs), (ins f64mem:$dst, VR128:$src),
"movlpd\t{$src, $dst|$dst, $src}",
[(store (f64 (extractelt (v2f64 VR128:$src),
(iPTR 0))), addr:$dst)],
- IIC_SSE_MOV_LH>, VEX;
+ IIC_SSE_MOV_LH>, VEX, VEX_WIG;
}// UseAVX
def MOVLPSmr : PSI<0x13, MRMDestMem, (outs), (ins f64mem:$dst, VR128:$src),
"movlps\t{$src, $dst|$dst, $src}",
@@ -1238,12 +1177,12 @@ def VMOVHPSmr : VPSI<0x17, MRMDestMem, (outs), (ins f64mem:$dst, VR128:$src),
[(store (f64 (extractelt
(X86Unpckh (bc_v2f64 (v4f32 VR128:$src)),
(bc_v2f64 (v4f32 VR128:$src))),
- (iPTR 0))), addr:$dst)], IIC_SSE_MOV_LH>, VEX;
+ (iPTR 0))), addr:$dst)], IIC_SSE_MOV_LH>, VEX, VEX_WIG;
def VMOVHPDmr : VPDI<0x17, MRMDestMem, (outs), (ins f64mem:$dst, VR128:$src),
"movhpd\t{$src, $dst|$dst, $src}",
[(store (f64 (extractelt
(v2f64 (X86Unpckh VR128:$src, VR128:$src)),
- (iPTR 0))), addr:$dst)], IIC_SSE_MOV_LH>, VEX;
+ (iPTR 0))), addr:$dst)], IIC_SSE_MOV_LH>, VEX, VEX_WIG;
} // UseAVX
def MOVHPSmr : PSI<0x17, MRMDestMem, (outs), (ins f64mem:$dst, VR128:$src),
"movhps\t{$src, $dst|$dst, $src}",
@@ -1343,14 +1282,14 @@ let AddedComplexity = 20, Predicates = [UseAVX] in {
[(set VR128:$dst,
(v4f32 (X86Movlhps VR128:$src1, VR128:$src2)))],
IIC_SSE_MOV_LH>,
- VEX_4V, Sched<[WriteFShuffle]>;
+ VEX_4V, Sched<[WriteFShuffle]>, VEX_WIG;
def VMOVHLPSrr : VPSI<0x12, MRMSrcReg, (outs VR128:$dst),
(ins VR128:$src1, VR128:$src2),
"movhlps\t{$src2, $src1, $dst|$dst, $src1, $src2}",
[(set VR128:$dst,
(v4f32 (X86Movhlps VR128:$src1, VR128:$src2)))],
IIC_SSE_MOV_LH>,
- VEX_4V, Sched<[WriteFShuffle]>;
+ VEX_4V, Sched<[WriteFShuffle]>, VEX_WIG;
}
let Constraints = "$src1 = $dst", AddedComplexity = 20 in {
def MOVLHPSrr : PSI<0x16, MRMSrcReg, (outs VR128:$dst),
@@ -1725,11 +1664,11 @@ defm CVTSS2SI64 : sse12_cvt_sint<0x2D, VR128, GR64, int_x86_sse_cvtss2si64,
defm VCVTDQ2PS : sse12_cvt_p<0x5B, VR128, i128mem, v4f32, v4i32, loadv2i64,
"vcvtdq2ps\t{$src, $dst|$dst, $src}",
SSEPackedSingle, SSE_CVT_PS>,
- PS, VEX, Requires<[HasAVX, NoVLX]>;
+ PS, VEX, Requires<[HasAVX, NoVLX]>, VEX_WIG;
defm VCVTDQ2PSY : sse12_cvt_p<0x5B, VR256, i256mem, v8f32, v8i32, loadv4i64,
"vcvtdq2ps\t{$src, $dst|$dst, $src}",
SSEPackedSingle, SSE_CVT_PS>,
- PS, VEX, VEX_L, Requires<[HasAVX, NoVLX]>;
+ PS, VEX, VEX_L, Requires<[HasAVX, NoVLX]>, VEX_WIG;
defm CVTDQ2PS : sse12_cvt_p<0x5B, VR128, i128mem, v4f32, v4i32, memopv2i64,
"cvtdq2ps\t{$src, $dst|$dst, $src}",
@@ -1777,20 +1716,21 @@ def : InstAlias<"cvtsd2si{q}\t{$src, $dst|$dst, $src}",
// Convert scalar double to scalar single
let hasSideEffects = 0, Predicates = [UseAVX] in {
def VCVTSD2SSrr : VSDI<0x5A, MRMSrcReg, (outs FR32:$dst),
- (ins FR64:$src1, FR64:$src2),
+ (ins FR32:$src1, FR64:$src2),
"cvtsd2ss\t{$src2, $src1, $dst|$dst, $src1, $src2}", [],
IIC_SSE_CVT_Scalar_RR>, VEX_4V, VEX_LIG,
- Sched<[WriteCvtF2F]>;
+ Sched<[WriteCvtF2F]>, VEX_WIG;
let mayLoad = 1 in
def VCVTSD2SSrm : I<0x5A, MRMSrcMem, (outs FR32:$dst),
- (ins FR64:$src1, f64mem:$src2),
+ (ins FR32:$src1, f64mem:$src2),
"vcvtsd2ss\t{$src2, $src1, $dst|$dst, $src1, $src2}",
[], IIC_SSE_CVT_Scalar_RM>,
XD, Requires<[HasAVX, OptForSize]>, VEX_4V, VEX_LIG,
- Sched<[WriteCvtF2FLd, ReadAfterLd]>;
+ Sched<[WriteCvtF2FLd, ReadAfterLd]>, VEX_WIG;
}
-def : Pat<(f32 (fpround FR64:$src)), (VCVTSD2SSrr FR64:$src, FR64:$src)>,
+def : Pat<(f32 (fpround FR64:$src)),
+ (VCVTSD2SSrr (COPY_TO_REGCLASS FR64:$src, FR32), FR64:$src)>,
Requires<[UseAVX]>;
def CVTSD2SSrr : SDI<0x5A, MRMSrcReg, (outs FR32:$dst), (ins FR64:$src),
@@ -1810,15 +1750,15 @@ def Int_VCVTSD2SSrr: I<0x5A, MRMSrcReg,
"vcvtsd2ss\t{$src2, $src1, $dst|$dst, $src1, $src2}",
[(set VR128:$dst,
(int_x86_sse2_cvtsd2ss VR128:$src1, VR128:$src2))],
- IIC_SSE_CVT_Scalar_RR>, XD, VEX_4V, Requires<[HasAVX]>,
- Sched<[WriteCvtF2F]>;
+ IIC_SSE_CVT_Scalar_RR>, XD, VEX_4V, VEX_WIG,
+ Requires<[HasAVX]>, Sched<[WriteCvtF2F]>;
def Int_VCVTSD2SSrm: I<0x5A, MRMSrcMem,
(outs VR128:$dst), (ins VR128:$src1, sdmem:$src2),
"vcvtsd2ss\t{$src2, $src1, $dst|$dst, $src1, $src2}",
[(set VR128:$dst, (int_x86_sse2_cvtsd2ss
VR128:$src1, sse_load_f64:$src2))],
- IIC_SSE_CVT_Scalar_RM>, XD, VEX_4V, Requires<[HasAVX]>,
- Sched<[WriteCvtF2FLd, ReadAfterLd]>;
+ IIC_SSE_CVT_Scalar_RM>, XD, VEX_4V, VEX_WIG,
+ Requires<[HasAVX]>, Sched<[WriteCvtF2FLd, ReadAfterLd]>;
let Constraints = "$src1 = $dst" in {
def Int_CVTSD2SSrr: I<0x5A, MRMSrcReg,
@@ -1842,30 +1782,30 @@ def Int_CVTSD2SSrm: I<0x5A, MRMSrcMem,
// SSE2 instructions with XS prefix
let hasSideEffects = 0, Predicates = [UseAVX] in {
def VCVTSS2SDrr : I<0x5A, MRMSrcReg, (outs FR64:$dst),
- (ins FR32:$src1, FR32:$src2),
+ (ins FR64:$src1, FR32:$src2),
"vcvtss2sd\t{$src2, $src1, $dst|$dst, $src1, $src2}",
[], IIC_SSE_CVT_Scalar_RR>,
XS, Requires<[HasAVX]>, VEX_4V, VEX_LIG,
- Sched<[WriteCvtF2F]>;
+ Sched<[WriteCvtF2F]>, VEX_WIG;
let mayLoad = 1 in
def VCVTSS2SDrm : I<0x5A, MRMSrcMem, (outs FR64:$dst),
- (ins FR32:$src1, f32mem:$src2),
+ (ins FR64:$src1, f32mem:$src2),
"vcvtss2sd\t{$src2, $src1, $dst|$dst, $src1, $src2}",
[], IIC_SSE_CVT_Scalar_RM>,
XS, VEX_4V, VEX_LIG, Requires<[HasAVX, OptForSize]>,
- Sched<[WriteCvtF2FLd, ReadAfterLd]>;
+ Sched<[WriteCvtF2FLd, ReadAfterLd]>, VEX_WIG;
}
def : Pat<(f64 (fpextend FR32:$src)),
- (VCVTSS2SDrr FR32:$src, FR32:$src)>, Requires<[UseAVX]>;
+ (VCVTSS2SDrr (COPY_TO_REGCLASS FR32:$src, FR64), FR32:$src)>, Requires<[UseAVX]>;
def : Pat<(fpextend (loadf32 addr:$src)),
- (VCVTSS2SDrm (f32 (IMPLICIT_DEF)), addr:$src)>, Requires<[UseAVX]>;
+ (VCVTSS2SDrm (f64 (IMPLICIT_DEF)), addr:$src)>, Requires<[UseAVX]>;
def : Pat<(extloadf32 addr:$src),
- (VCVTSS2SDrm (f32 (IMPLICIT_DEF)), addr:$src)>,
+ (VCVTSS2SDrm (f64 (IMPLICIT_DEF)), addr:$src)>,
Requires<[UseAVX, OptForSize]>;
def : Pat<(extloadf32 addr:$src),
- (VCVTSS2SDrr (f32 (IMPLICIT_DEF)), (VMOVSSrm addr:$src))>,
+ (VCVTSS2SDrr (f64 (IMPLICIT_DEF)), (VMOVSSrm addr:$src))>,
Requires<[UseAVX, OptForSpeed]>;
def CVTSS2SDrr : I<0x5A, MRMSrcReg, (outs FR64:$dst), (ins FR32:$src),
@@ -1895,15 +1835,15 @@ def Int_VCVTSS2SDrr: I<0x5A, MRMSrcReg,
"vcvtss2sd\t{$src2, $src1, $dst|$dst, $src1, $src2}",
[(set VR128:$dst,
(int_x86_sse2_cvtss2sd VR128:$src1, VR128:$src2))],
- IIC_SSE_CVT_Scalar_RR>, XS, VEX_4V, Requires<[HasAVX]>,
- Sched<[WriteCvtF2F]>;
+ IIC_SSE_CVT_Scalar_RR>, XS, VEX_4V, VEX_WIG,
+ Requires<[HasAVX]>, Sched<[WriteCvtF2F]>;
def Int_VCVTSS2SDrm: I<0x5A, MRMSrcMem,
(outs VR128:$dst), (ins VR128:$src1, ssmem:$src2),
"vcvtss2sd\t{$src2, $src1, $dst|$dst, $src1, $src2}",
[(set VR128:$dst,
(int_x86_sse2_cvtss2sd VR128:$src1, sse_load_f32:$src2))],
- IIC_SSE_CVT_Scalar_RM>, XS, VEX_4V, Requires<[HasAVX]>,
- Sched<[WriteCvtF2FLd, ReadAfterLd]>;
+ IIC_SSE_CVT_Scalar_RM>, XS, VEX_4V, VEX_WIG,
+ Requires<[HasAVX]>, Sched<[WriteCvtF2FLd, ReadAfterLd]>;
let Constraints = "$src1 = $dst" in { // SSE2 instructions with XS prefix
def Int_CVTSS2SDrr: I<0x5A, MRMSrcReg,
(outs VR128:$dst), (ins VR128:$src1, VR128:$src2),
@@ -1999,22 +1939,22 @@ def : Pat<(v4f32 (X86Movss
def VCVTPS2DQrr : VPDI<0x5B, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
"cvtps2dq\t{$src, $dst|$dst, $src}",
[(set VR128:$dst, (int_x86_sse2_cvtps2dq VR128:$src))],
- IIC_SSE_CVT_PS_RR>, VEX, Sched<[WriteCvtF2I]>;
+ IIC_SSE_CVT_PS_RR>, VEX, Sched<[WriteCvtF2I]>, VEX_WIG;
def VCVTPS2DQrm : VPDI<0x5B, MRMSrcMem, (outs VR128:$dst), (ins f128mem:$src),
"cvtps2dq\t{$src, $dst|$dst, $src}",
[(set VR128:$dst,
(int_x86_sse2_cvtps2dq (loadv4f32 addr:$src)))],
- IIC_SSE_CVT_PS_RM>, VEX, Sched<[WriteCvtF2ILd]>;
+ IIC_SSE_CVT_PS_RM>, VEX, Sched<[WriteCvtF2ILd]>, VEX_WIG;
def VCVTPS2DQYrr : VPDI<0x5B, MRMSrcReg, (outs VR256:$dst), (ins VR256:$src),
"cvtps2dq\t{$src, $dst|$dst, $src}",
[(set VR256:$dst,
(int_x86_avx_cvt_ps2dq_256 VR256:$src))],
- IIC_SSE_CVT_PS_RR>, VEX, VEX_L, Sched<[WriteCvtF2I]>;
+ IIC_SSE_CVT_PS_RR>, VEX, VEX_L, Sched<[WriteCvtF2I]>, VEX_WIG;
def VCVTPS2DQYrm : VPDI<0x5B, MRMSrcMem, (outs VR256:$dst), (ins f256mem:$src),
"cvtps2dq\t{$src, $dst|$dst, $src}",
[(set VR256:$dst,
(int_x86_avx_cvt_ps2dq_256 (loadv8f32 addr:$src)))],
- IIC_SSE_CVT_PS_RM>, VEX, VEX_L, Sched<[WriteCvtF2ILd]>;
+ IIC_SSE_CVT_PS_RM>, VEX, VEX_L, Sched<[WriteCvtF2ILd]>, VEX_WIG;
def CVTPS2DQrr : PDI<0x5B, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
"cvtps2dq\t{$src, $dst|$dst, $src}",
[(set VR128:$dst, (int_x86_sse2_cvtps2dq VR128:$src))],
@@ -2035,7 +1975,7 @@ def VCVTPD2DQrr : SDI<0xE6, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
"vcvtpd2dq\t{$src, $dst|$dst, $src}",
[(set VR128:$dst,
(v4i32 (X86cvtp2Int (v2f64 VR128:$src))))]>,
- VEX, Sched<[WriteCvtF2I]>;
+ VEX, Sched<[WriteCvtF2I]>, VEX_WIG;
// XMM only
def : InstAlias<"vcvtpd2dqx\t{$src, $dst|$dst, $src}",
@@ -2044,7 +1984,7 @@ def VCVTPD2DQrm : SDI<0xE6, MRMSrcMem, (outs VR128:$dst), (ins f128mem:$src),
"vcvtpd2dq{x}\t{$src, $dst|$dst, $src}",
[(set VR128:$dst,
(v4i32 (X86cvtp2Int (loadv2f64 addr:$src))))]>, VEX,
- Sched<[WriteCvtF2ILd]>;
+ Sched<[WriteCvtF2ILd]>, VEX_WIG;
def : InstAlias<"vcvtpd2dqx\t{$src, $dst|$dst, $src}",
(VCVTPD2DQrm VR128:$dst, f128mem:$src), 0>;
@@ -2053,12 +1993,12 @@ def VCVTPD2DQYrr : SDI<0xE6, MRMSrcReg, (outs VR128:$dst), (ins VR256:$src),
"vcvtpd2dq\t{$src, $dst|$dst, $src}",
[(set VR128:$dst,
(v4i32 (X86cvtp2Int (v4f64 VR256:$src))))]>,
- VEX, VEX_L, Sched<[WriteCvtF2I]>;
+ VEX, VEX_L, Sched<[WriteCvtF2I]>, VEX_WIG;
def VCVTPD2DQYrm : SDI<0xE6, MRMSrcMem, (outs VR128:$dst), (ins f256mem:$src),
"vcvtpd2dq{y}\t{$src, $dst|$dst, $src}",
[(set VR128:$dst,
(v4i32 (X86cvtp2Int (loadv4f64 addr:$src))))]>,
- VEX, VEX_L, Sched<[WriteCvtF2ILd]>;
+ VEX, VEX_L, Sched<[WriteCvtF2ILd]>, VEX_WIG;
def : InstAlias<"vcvtpd2dqy\t{$src, $dst|$dst, $src}",
(VCVTPD2DQYrr VR128:$dst, VR256:$src), 0>;
def : InstAlias<"vcvtpd2dqy\t{$src, $dst|$dst, $src}",
@@ -2083,23 +2023,23 @@ def VCVTTPS2DQrr : VS2SI<0x5B, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
"cvttps2dq\t{$src, $dst|$dst, $src}",
[(set VR128:$dst,
(v4i32 (fp_to_sint (v4f32 VR128:$src))))],
- IIC_SSE_CVT_PS_RR>, VEX, Sched<[WriteCvtF2I]>;
+ IIC_SSE_CVT_PS_RR>, VEX, Sched<[WriteCvtF2I]>, VEX_WIG;
def VCVTTPS2DQrm : VS2SI<0x5B, MRMSrcMem, (outs VR128:$dst), (ins f128mem:$src),
"cvttps2dq\t{$src, $dst|$dst, $src}",
[(set VR128:$dst,
(v4i32 (fp_to_sint (loadv4f32 addr:$src))))],
- IIC_SSE_CVT_PS_RM>, VEX, Sched<[WriteCvtF2ILd]>;
+ IIC_SSE_CVT_PS_RM>, VEX, Sched<[WriteCvtF2ILd]>, VEX_WIG;
def VCVTTPS2DQYrr : VS2SI<0x5B, MRMSrcReg, (outs VR256:$dst), (ins VR256:$src),
"cvttps2dq\t{$src, $dst|$dst, $src}",
[(set VR256:$dst,
(v8i32 (fp_to_sint (v8f32 VR256:$src))))],
- IIC_SSE_CVT_PS_RR>, VEX, VEX_L, Sched<[WriteCvtF2I]>;
+ IIC_SSE_CVT_PS_RR>, VEX, VEX_L, Sched<[WriteCvtF2I]>, VEX_WIG;
def VCVTTPS2DQYrm : VS2SI<0x5B, MRMSrcMem, (outs VR256:$dst), (ins f256mem:$src),
"cvttps2dq\t{$src, $dst|$dst, $src}",
[(set VR256:$dst,
(v8i32 (fp_to_sint (loadv8f32 addr:$src))))],
IIC_SSE_CVT_PS_RM>, VEX, VEX_L,
- Sched<[WriteCvtF2ILd]>;
+ Sched<[WriteCvtF2ILd]>, VEX_WIG;
}
def CVTTPS2DQrr : S2SI<0x5B, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
@@ -2118,7 +2058,7 @@ def VCVTTPD2DQrr : VPDI<0xE6, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
"cvttpd2dq\t{$src, $dst|$dst, $src}",
[(set VR128:$dst,
(v4i32 (X86cvttp2si (v2f64 VR128:$src))))],
- IIC_SSE_CVT_PD_RR>, VEX, Sched<[WriteCvtF2I]>;
+ IIC_SSE_CVT_PD_RR>, VEX, Sched<[WriteCvtF2I]>, VEX_WIG;
// The assembler can recognize rr 256-bit instructions by seeing a ymm
// register, but the same isn't true when using memory operands instead.
@@ -2132,7 +2072,7 @@ def VCVTTPD2DQrm : VPDI<0xE6, MRMSrcMem, (outs VR128:$dst), (ins f128mem:$src),
"cvttpd2dq{x}\t{$src, $dst|$dst, $src}",
[(set VR128:$dst,
(v4i32 (X86cvttp2si (loadv2f64 addr:$src))))],
- IIC_SSE_CVT_PD_RM>, VEX, Sched<[WriteCvtF2ILd]>;
+ IIC_SSE_CVT_PD_RM>, VEX, Sched<[WriteCvtF2ILd]>, VEX_WIG;
def : InstAlias<"vcvttpd2dqx\t{$src, $dst|$dst, $src}",
(VCVTTPD2DQrm VR128:$dst, f128mem:$src), 0>;
@@ -2142,12 +2082,12 @@ def VCVTTPD2DQYrr : VPDI<0xE6, MRMSrcReg, (outs VR128:$dst), (ins VR256:$src),
"cvttpd2dq\t{$src, $dst|$dst, $src}",
[(set VR128:$dst,
(v4i32 (fp_to_sint (v4f64 VR256:$src))))],
- IIC_SSE_CVT_PD_RR>, VEX, VEX_L, Sched<[WriteCvtF2I]>;
+ IIC_SSE_CVT_PD_RR>, VEX, VEX_L, Sched<[WriteCvtF2I]>, VEX_WIG;
def VCVTTPD2DQYrm : VPDI<0xE6, MRMSrcMem, (outs VR128:$dst), (ins f256mem:$src),
"cvttpd2dq{y}\t{$src, $dst|$dst, $src}",
[(set VR128:$dst,
(v4i32 (fp_to_sint (loadv4f64 addr:$src))))],
- IIC_SSE_CVT_PD_RM>, VEX, VEX_L, Sched<[WriteCvtF2ILd]>;
+ IIC_SSE_CVT_PD_RM>, VEX, VEX_L, Sched<[WriteCvtF2ILd]>, VEX_WIG;
}
def : InstAlias<"vcvttpd2dqy\t{$src, $dst|$dst, $src}",
(VCVTTPD2DQYrr VR128:$dst, VR256:$src), 0>;
@@ -2193,19 +2133,19 @@ let Predicates = [HasAVX, NoVLX] in {
def VCVTPS2PDrr : I<0x5A, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
"vcvtps2pd\t{$src, $dst|$dst, $src}",
[(set VR128:$dst, (v2f64 (X86vfpext (v4f32 VR128:$src))))],
- IIC_SSE_CVT_PD_RR>, PS, VEX, Sched<[WriteCvtF2F]>;
+ IIC_SSE_CVT_PD_RR>, PS, VEX, Sched<[WriteCvtF2F]>, VEX_WIG;
def VCVTPS2PDrm : I<0x5A, MRMSrcMem, (outs VR128:$dst), (ins f64mem:$src),
"vcvtps2pd\t{$src, $dst|$dst, $src}",
[(set VR128:$dst, (v2f64 (extloadv2f32 addr:$src)))],
- IIC_SSE_CVT_PD_RM>, PS, VEX, Sched<[WriteCvtF2FLd]>;
+ IIC_SSE_CVT_PD_RM>, PS, VEX, Sched<[WriteCvtF2FLd]>, VEX_WIG;
def VCVTPS2PDYrr : I<0x5A, MRMSrcReg, (outs VR256:$dst), (ins VR128:$src),
"vcvtps2pd\t{$src, $dst|$dst, $src}",
[(set VR256:$dst, (v4f64 (fpextend (v4f32 VR128:$src))))],
- IIC_SSE_CVT_PD_RR>, PS, VEX, VEX_L, Sched<[WriteCvtF2F]>;
+ IIC_SSE_CVT_PD_RR>, PS, VEX, VEX_L, Sched<[WriteCvtF2F]>, VEX_WIG;
def VCVTPS2PDYrm : I<0x5A, MRMSrcMem, (outs VR256:$dst), (ins f128mem:$src),
"vcvtps2pd\t{$src, $dst|$dst, $src}",
[(set VR256:$dst, (v4f64 (extloadv4f32 addr:$src)))],
- IIC_SSE_CVT_PD_RM>, PS, VEX, VEX_L, Sched<[WriteCvtF2FLd]>;
+ IIC_SSE_CVT_PD_RM>, PS, VEX, VEX_L, Sched<[WriteCvtF2FLd]>, VEX_WIG;
}
let Predicates = [UseSSE2] in {
@@ -2225,30 +2165,30 @@ let hasSideEffects = 0, mayLoad = 1 in
def VCVTDQ2PDrm : S2SI<0xE6, MRMSrcMem, (outs VR128:$dst), (ins i64mem:$src),
"vcvtdq2pd\t{$src, $dst|$dst, $src}",
[(set VR128:$dst,
- (v2f64 (X86VSintToFP (bc_v4i32 (loadv2i64 addr:$src)))))]>,
- VEX, Sched<[WriteCvtI2FLd]>;
+ (v2f64 (X86VSintToFP (bc_v4i32 (v2i64 (X86vzload addr:$src))))))]>,
+ VEX, Sched<[WriteCvtI2FLd]>, VEX_WIG;
def VCVTDQ2PDrr : S2SI<0xE6, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
"vcvtdq2pd\t{$src, $dst|$dst, $src}",
[(set VR128:$dst,
(v2f64 (X86VSintToFP (v4i32 VR128:$src))))]>,
- VEX, Sched<[WriteCvtI2F]>;
+ VEX, Sched<[WriteCvtI2F]>, VEX_WIG;
def VCVTDQ2PDYrm : S2SI<0xE6, MRMSrcMem, (outs VR256:$dst), (ins i128mem:$src),
"vcvtdq2pd\t{$src, $dst|$dst, $src}",
[(set VR256:$dst,
(v4f64 (sint_to_fp (bc_v4i32 (loadv2i64 addr:$src)))))]>,
- VEX, VEX_L, Sched<[WriteCvtI2FLd]>;
+ VEX, VEX_L, Sched<[WriteCvtI2FLd]>, VEX_WIG;
def VCVTDQ2PDYrr : S2SI<0xE6, MRMSrcReg, (outs VR256:$dst), (ins VR128:$src),
"vcvtdq2pd\t{$src, $dst|$dst, $src}",
[(set VR256:$dst,
(v4f64 (sint_to_fp (v4i32 VR128:$src))))]>,
- VEX, VEX_L, Sched<[WriteCvtI2F]>;
+ VEX, VEX_L, Sched<[WriteCvtI2F]>, VEX_WIG;
}
let hasSideEffects = 0, mayLoad = 1 in
def CVTDQ2PDrm : S2SI<0xE6, MRMSrcMem, (outs VR128:$dst), (ins i64mem:$src),
"cvtdq2pd\t{$src, $dst|$dst, $src}",
[(set VR128:$dst,
- (v2f64 (X86VSintToFP (bc_v4i32 (loadv2i64 addr:$src)))))],
+ (v2f64 (X86VSintToFP (bc_v4i32 (v2i64 (X86vzload addr:$src))))))],
IIC_SSE_CVT_PD_RR>, Sched<[WriteCvtI2FLd]>;
def CVTDQ2PDrr : S2SI<0xE6, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
"cvtdq2pd\t{$src, $dst|$dst, $src}",
@@ -2276,7 +2216,7 @@ let Predicates = [HasAVX, NoVLX] in
def VCVTPD2PSrr : VPDI<0x5A, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
"cvtpd2ps\t{$src, $dst|$dst, $src}",
[(set VR128:$dst, (X86vfpround (v2f64 VR128:$src)))],
- IIC_SSE_CVT_PD_RR>, VEX, Sched<[WriteCvtF2F]>;
+ IIC_SSE_CVT_PD_RR>, VEX, Sched<[WriteCvtF2F]>, VEX_WIG;
// XMM only
def : InstAlias<"vcvtpd2psx\t{$src, $dst|$dst, $src}",
@@ -2285,7 +2225,7 @@ let Predicates = [HasAVX, NoVLX] in
def VCVTPD2PSrm : VPDI<0x5A, MRMSrcMem, (outs VR128:$dst), (ins f128mem:$src),
"cvtpd2ps{x}\t{$src, $dst|$dst, $src}",
[(set VR128:$dst, (X86vfpround (loadv2f64 addr:$src)))],
- IIC_SSE_CVT_PD_RM>, VEX, Sched<[WriteCvtF2FLd]>;
+ IIC_SSE_CVT_PD_RM>, VEX, Sched<[WriteCvtF2FLd]>, VEX_WIG;
def : InstAlias<"vcvtpd2psx\t{$src, $dst|$dst, $src}",
(VCVTPD2PSrm VR128:$dst, f128mem:$src), 0>;
@@ -2294,11 +2234,11 @@ let Predicates = [HasAVX, NoVLX] in {
def VCVTPD2PSYrr : VPDI<0x5A, MRMSrcReg, (outs VR128:$dst), (ins VR256:$src),
"cvtpd2ps\t{$src, $dst|$dst, $src}",
[(set VR128:$dst, (fpround VR256:$src))],
- IIC_SSE_CVT_PD_RR>, VEX, VEX_L, Sched<[WriteCvtF2F]>;
+ IIC_SSE_CVT_PD_RR>, VEX, VEX_L, Sched<[WriteCvtF2F]>, VEX_WIG;
def VCVTPD2PSYrm : VPDI<0x5A, MRMSrcMem, (outs VR128:$dst), (ins f256mem:$src),
"cvtpd2ps{y}\t{$src, $dst|$dst, $src}",
[(set VR128:$dst, (fpround (loadv4f64 addr:$src)))],
- IIC_SSE_CVT_PD_RM>, VEX, VEX_L, Sched<[WriteCvtF2FLd]>;
+ IIC_SSE_CVT_PD_RM>, VEX, VEX_L, Sched<[WriteCvtF2FLd]>, VEX_WIG;
}
def : InstAlias<"vcvtpd2psy\t{$src, $dst|$dst, $src}",
(VCVTPD2PSYrr VR128:$dst, VR256:$src), 0>;
@@ -2368,21 +2308,25 @@ multiclass sse12_cmp_scalar<RegisterClass RC, X86MemOperand x86memop,
}
}
+let ExeDomain = SSEPackedSingle in
defm VCMPSS : sse12_cmp_scalar<FR32, f32mem, AVXCC, X86cmps, f32, loadf32,
"cmp${cc}ss\t{$src2, $src1, $dst|$dst, $src1, $src2}",
"cmpss\t{$cc, $src2, $src1, $dst|$dst, $src1, $src2, $cc}",
- SSE_ALU_F32S, i8immZExt5>, XS, VEX_4V, VEX_LIG;
+ SSE_ALU_F32S, i8immZExt5>, XS, VEX_4V, VEX_LIG, VEX_WIG;
+let ExeDomain = SSEPackedDouble in
defm VCMPSD : sse12_cmp_scalar<FR64, f64mem, AVXCC, X86cmps, f64, loadf64,
"cmp${cc}sd\t{$src2, $src1, $dst|$dst, $src1, $src2}",
"cmpsd\t{$cc, $src2, $src1, $dst|$dst, $src1, $src2, $cc}",
SSE_ALU_F32S, i8immZExt5>, // same latency as 32 bit compare
- XD, VEX_4V, VEX_LIG;
+ XD, VEX_4V, VEX_LIG, VEX_WIG;
let Constraints = "$src1 = $dst" in {
+ let ExeDomain = SSEPackedSingle in
defm CMPSS : sse12_cmp_scalar<FR32, f32mem, SSECC, X86cmps, f32, loadf32,
"cmp${cc}ss\t{$src2, $dst|$dst, $src2}",
"cmpss\t{$cc, $src2, $dst|$dst, $src2, $cc}", SSE_ALU_F32S,
i8immZExt3>, XS;
+ let ExeDomain = SSEPackedDouble in
defm CMPSD : sse12_cmp_scalar<FR64, f64mem, SSECC, X86cmps, f64, loadf64,
"cmp${cc}sd\t{$src2, $dst|$dst, $src2}",
"cmpsd\t{$cc, $src2, $dst|$dst, $src2, $cc}",
@@ -2398,6 +2342,7 @@ multiclass sse12_cmp_scalar_int<Operand memop, Operand CC,
VR128:$src, immLeaf:$cc))],
itins.rr>,
Sched<[itins.Sched]>;
+let mayLoad = 1 in
def rm : SIi8<0xC2, MRMSrcMem, (outs VR128:$dst),
(ins VR128:$src1, memop:$src, CC:$cc), asm,
[(set VR128:$dst, (Int VR128:$src1,
@@ -2408,18 +2353,22 @@ multiclass sse12_cmp_scalar_int<Operand memop, Operand CC,
let isCodeGenOnly = 1 in {
// Aliases to match intrinsics which expect XMM operand(s).
+ let ExeDomain = SSEPackedSingle in
defm Int_VCMPSS : sse12_cmp_scalar_int<ssmem, AVXCC, int_x86_sse_cmp_ss,
"cmp${cc}ss\t{$src, $src1, $dst|$dst, $src1, $src}",
SSE_ALU_F32S, i8immZExt5, sse_load_f32>,
XS, VEX_4V;
+ let ExeDomain = SSEPackedDouble in
defm Int_VCMPSD : sse12_cmp_scalar_int<sdmem, AVXCC, int_x86_sse2_cmp_sd,
"cmp${cc}sd\t{$src, $src1, $dst|$dst, $src1, $src}",
SSE_ALU_F32S, i8immZExt5, sse_load_f64>, // same latency as f32
XD, VEX_4V;
let Constraints = "$src1 = $dst" in {
+ let ExeDomain = SSEPackedSingle in
defm Int_CMPSS : sse12_cmp_scalar_int<ssmem, SSECC, int_x86_sse_cmp_ss,
"cmp${cc}ss\t{$src, $dst|$dst, $src}",
SSE_ALU_F32S, i8immZExt3, sse_load_f32>, XS;
+ let ExeDomain = SSEPackedDouble in
defm Int_CMPSD : sse12_cmp_scalar_int<sdmem, SSECC, int_x86_sse2_cmp_sd,
"cmp${cc}sd\t{$src, $dst|$dst, $src}",
SSE_ALU_F64S, i8immZExt3, sse_load_f64>,
@@ -2437,6 +2386,7 @@ multiclass sse12_ord_cmp<bits<8> opc, RegisterClass RC, SDNode OpNode,
[(set EFLAGS, (OpNode (vt RC:$src1), RC:$src2))],
IIC_SSE_COMIS_RR>,
Sched<[WriteFAdd]>;
+let mayLoad = 1 in
def rm: SI<opc, MRMSrcMem, (outs), (ins RC:$src1, x86memop:$src2),
!strconcat(OpcodeStr, "\t{$src2, $src1|$src1, $src2}"),
[(set EFLAGS, (OpNode (vt RC:$src1),
@@ -2454,6 +2404,7 @@ multiclass sse12_ord_cmp_int<bits<8> opc, RegisterClass RC, SDNode OpNode,
[(set EFLAGS, (OpNode (vt RC:$src1), RC:$src2))],
IIC_SSE_COMIS_RR>,
Sched<[WriteFAdd]>;
+let mayLoad = 1 in
def rm: SI<opc, MRMSrcMem, (outs), (ins RC:$src1, memop:$src2),
!strconcat(OpcodeStr, "\t{$src2, $src1|$src1, $src2}"),
[(set EFLAGS, (OpNode (vt RC:$src1),
@@ -2464,26 +2415,26 @@ multiclass sse12_ord_cmp_int<bits<8> opc, RegisterClass RC, SDNode OpNode,
let Defs = [EFLAGS] in {
defm VUCOMISS : sse12_ord_cmp<0x2E, FR32, X86cmp, f32, f32mem, loadf32,
- "ucomiss">, PS, VEX, VEX_LIG;
+ "ucomiss">, PS, VEX, VEX_LIG, VEX_WIG;
defm VUCOMISD : sse12_ord_cmp<0x2E, FR64, X86cmp, f64, f64mem, loadf64,
- "ucomisd">, PD, VEX, VEX_LIG;
+ "ucomisd">, PD, VEX, VEX_LIG, VEX_WIG;
let Pattern = []<dag> in {
defm VCOMISS : sse12_ord_cmp<0x2F, FR32, undef, f32, f32mem, loadf32,
- "comiss">, PS, VEX, VEX_LIG;
+ "comiss">, PS, VEX, VEX_LIG, VEX_WIG;
defm VCOMISD : sse12_ord_cmp<0x2F, FR64, undef, f64, f64mem, loadf64,
- "comisd">, PD, VEX, VEX_LIG;
+ "comisd">, PD, VEX, VEX_LIG, VEX_WIG;
}
let isCodeGenOnly = 1 in {
defm Int_VUCOMISS : sse12_ord_cmp_int<0x2E, VR128, X86ucomi, v4f32, ssmem,
- sse_load_f32, "ucomiss">, PS, VEX;
+ sse_load_f32, "ucomiss">, PS, VEX, VEX_WIG;
defm Int_VUCOMISD : sse12_ord_cmp_int<0x2E, VR128, X86ucomi, v2f64, sdmem,
- sse_load_f64, "ucomisd">, PD, VEX;
+ sse_load_f64, "ucomisd">, PD, VEX, VEX_WIG;
defm Int_VCOMISS : sse12_ord_cmp_int<0x2F, VR128, X86comi, v4f32, ssmem,
- sse_load_f32, "comiss">, PS, VEX;
+ sse_load_f32, "comiss">, PS, VEX, VEX_WIG;
defm Int_VCOMISD : sse12_ord_cmp_int<0x2F, VR128, X86comi, v2f64, sdmem,
- sse_load_f64, "comisd">, PD, VEX;
+ sse_load_f64, "comisd">, PD, VEX, VEX_WIG;
}
defm UCOMISS : sse12_ord_cmp<0x2E, FR32, X86cmp, f32, f32mem, loadf32,
"ucomiss">, PS;
@@ -2512,18 +2463,19 @@ let Defs = [EFLAGS] in {
// sse12_cmp_packed - sse 1 & 2 compare packed instructions
multiclass sse12_cmp_packed<RegisterClass RC, X86MemOperand x86memop,
- Operand CC, Intrinsic Int, string asm,
+ Operand CC, ValueType VT, string asm,
string asm_alt, Domain d, ImmLeaf immLeaf,
PatFrag ld_frag, OpndItins itins = SSE_ALU_F32P> {
let isCommutable = 1 in
def rri : PIi8<0xC2, MRMSrcReg,
(outs RC:$dst), (ins RC:$src1, RC:$src2, CC:$cc), asm,
- [(set RC:$dst, (Int RC:$src1, RC:$src2, immLeaf:$cc))],
+ [(set RC:$dst, (VT (X86cmpp RC:$src1, RC:$src2, immLeaf:$cc)))],
itins.rr, d>,
Sched<[WriteFAdd]>;
def rmi : PIi8<0xC2, MRMSrcMem,
(outs RC:$dst), (ins RC:$src1, x86memop:$src2, CC:$cc), asm,
- [(set RC:$dst, (Int RC:$src1, (ld_frag addr:$src2), immLeaf:$cc))],
+ [(set RC:$dst,
+ (VT (X86cmpp RC:$src1, (ld_frag addr:$src2), immLeaf:$cc)))],
itins.rm, d>,
Sched<[WriteFAddLd, ReadAfterLd]>;
@@ -2540,67 +2492,33 @@ multiclass sse12_cmp_packed<RegisterClass RC, X86MemOperand x86memop,
}
}
-defm VCMPPS : sse12_cmp_packed<VR128, f128mem, AVXCC, int_x86_sse_cmp_ps,
+defm VCMPPS : sse12_cmp_packed<VR128, f128mem, AVXCC, v4f32,
"cmp${cc}ps\t{$src2, $src1, $dst|$dst, $src1, $src2}",
"cmpps\t{$cc, $src2, $src1, $dst|$dst, $src1, $src2, $cc}",
- SSEPackedSingle, i8immZExt5, loadv4f32>, PS, VEX_4V;
-defm VCMPPD : sse12_cmp_packed<VR128, f128mem, AVXCC, int_x86_sse2_cmp_pd,
+ SSEPackedSingle, i8immZExt5, loadv4f32>, PS, VEX_4V, VEX_WIG;
+defm VCMPPD : sse12_cmp_packed<VR128, f128mem, AVXCC, v2f64,
"cmp${cc}pd\t{$src2, $src1, $dst|$dst, $src1, $src2}",
"cmppd\t{$cc, $src2, $src1, $dst|$dst, $src1, $src2, $cc}",
- SSEPackedDouble, i8immZExt5, loadv2f64>, PD, VEX_4V;
-defm VCMPPSY : sse12_cmp_packed<VR256, f256mem, AVXCC, int_x86_avx_cmp_ps_256,
+ SSEPackedDouble, i8immZExt5, loadv2f64>, PD, VEX_4V, VEX_WIG;
+defm VCMPPSY : sse12_cmp_packed<VR256, f256mem, AVXCC, v8f32,
"cmp${cc}ps\t{$src2, $src1, $dst|$dst, $src1, $src2}",
"cmpps\t{$cc, $src2, $src1, $dst|$dst, $src1, $src2, $cc}",
SSEPackedSingle, i8immZExt5, loadv8f32>, PS, VEX_4V, VEX_L;
-defm VCMPPDY : sse12_cmp_packed<VR256, f256mem, AVXCC, int_x86_avx_cmp_pd_256,
+defm VCMPPDY : sse12_cmp_packed<VR256, f256mem, AVXCC, v4f64,
"cmp${cc}pd\t{$src2, $src1, $dst|$dst, $src1, $src2}",
"cmppd\t{$cc, $src2, $src1, $dst|$dst, $src1, $src2, $cc}",
SSEPackedDouble, i8immZExt5, loadv4f64>, PD, VEX_4V, VEX_L;
let Constraints = "$src1 = $dst" in {
- defm CMPPS : sse12_cmp_packed<VR128, f128mem, SSECC, int_x86_sse_cmp_ps,
+ defm CMPPS : sse12_cmp_packed<VR128, f128mem, SSECC, v4f32,
"cmp${cc}ps\t{$src2, $dst|$dst, $src2}",
"cmpps\t{$cc, $src2, $dst|$dst, $src2, $cc}",
SSEPackedSingle, i8immZExt5, memopv4f32, SSE_ALU_F32P>, PS;
- defm CMPPD : sse12_cmp_packed<VR128, f128mem, SSECC, int_x86_sse2_cmp_pd,
+ defm CMPPD : sse12_cmp_packed<VR128, f128mem, SSECC, v2f64,
"cmp${cc}pd\t{$src2, $dst|$dst, $src2}",
"cmppd\t{$cc, $src2, $dst|$dst, $src2, $cc}",
SSEPackedDouble, i8immZExt5, memopv2f64, SSE_ALU_F64P>, PD;
}
-let Predicates = [HasAVX] in {
-def : Pat<(v4f32 (X86cmpp (v4f32 VR128:$src1), VR128:$src2, imm:$cc)),
- (VCMPPSrri (v4f32 VR128:$src1), (v4f32 VR128:$src2), imm:$cc)>;
-def : Pat<(v4f32 (X86cmpp (v4f32 VR128:$src1), (loadv4f32 addr:$src2), imm:$cc)),
- (VCMPPSrmi (v4f32 VR128:$src1), addr:$src2, imm:$cc)>;
-def : Pat<(v2f64 (X86cmpp (v2f64 VR128:$src1), VR128:$src2, imm:$cc)),
- (VCMPPDrri VR128:$src1, VR128:$src2, imm:$cc)>;
-def : Pat<(v2f64 (X86cmpp (v2f64 VR128:$src1), (loadv2f64 addr:$src2), imm:$cc)),
- (VCMPPDrmi VR128:$src1, addr:$src2, imm:$cc)>;
-
-def : Pat<(v8f32 (X86cmpp (v8f32 VR256:$src1), VR256:$src2, imm:$cc)),
- (VCMPPSYrri (v8f32 VR256:$src1), (v8f32 VR256:$src2), imm:$cc)>;
-def : Pat<(v8f32 (X86cmpp (v8f32 VR256:$src1), (loadv8f32 addr:$src2), imm:$cc)),
- (VCMPPSYrmi (v8f32 VR256:$src1), addr:$src2, imm:$cc)>;
-def : Pat<(v4f64 (X86cmpp (v4f64 VR256:$src1), VR256:$src2, imm:$cc)),
- (VCMPPDYrri VR256:$src1, VR256:$src2, imm:$cc)>;
-def : Pat<(v4f64 (X86cmpp (v4f64 VR256:$src1), (loadv4f64 addr:$src2), imm:$cc)),
- (VCMPPDYrmi VR256:$src1, addr:$src2, imm:$cc)>;
-}
-
-let Predicates = [UseSSE1] in {
-def : Pat<(v4f32 (X86cmpp (v4f32 VR128:$src1), VR128:$src2, imm:$cc)),
- (CMPPSrri (v4f32 VR128:$src1), (v4f32 VR128:$src2), imm:$cc)>;
-def : Pat<(v4f32 (X86cmpp (v4f32 VR128:$src1), (memopv4f32 addr:$src2), imm:$cc)),
- (CMPPSrmi (v4f32 VR128:$src1), addr:$src2, imm:$cc)>;
-}
-
-let Predicates = [UseSSE2] in {
-def : Pat<(v2f64 (X86cmpp (v2f64 VR128:$src1), VR128:$src2, imm:$cc)),
- (CMPPDrri VR128:$src1, VR128:$src2, imm:$cc)>;
-def : Pat<(v2f64 (X86cmpp (v2f64 VR128:$src1), (memopv2f64 addr:$src2), imm:$cc)),
- (CMPPDrmi VR128:$src1, addr:$src2, imm:$cc)>;
-}
-
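// Why the standalone patterns above became redundant: sse12_cmp_packed now
// takes a ValueType and instantiates the X86cmpp match itself, so each
// CMPPS/CMPPD/VCMPPS*/VCMPPD* def carries a pattern of the shape
//
//   [(set RC:$dst, (VT (X86cmpp RC:$src1, RC:$src2, immLeaf:$cc)))]
//
// (a reduced sketch of the pattern emitted by the multiclass), and
// instruction selection reaches the instructions directly.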
//===----------------------------------------------------------------------===//
// SSE 1 & 2 - Shuffle Instructions
//===----------------------------------------------------------------------===//
@@ -2624,16 +2542,16 @@ multiclass sse12_shuffle<RegisterClass RC, X86MemOperand x86memop,
let Predicates = [HasAVX, NoVLX] in {
defm VSHUFPS : sse12_shuffle<VR128, f128mem, v4f32,
"shufps\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}",
- loadv4f32, SSEPackedSingle>, PS, VEX_4V;
+ loadv4f32, SSEPackedSingle>, PS, VEX_4V, VEX_WIG;
defm VSHUFPSY : sse12_shuffle<VR256, f256mem, v8f32,
"shufps\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}",
- loadv8f32, SSEPackedSingle>, PS, VEX_4V, VEX_L;
+ loadv8f32, SSEPackedSingle>, PS, VEX_4V, VEX_L, VEX_WIG;
defm VSHUFPD : sse12_shuffle<VR128, f128mem, v2f64,
"shufpd\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}",
- loadv2f64, SSEPackedDouble>, PD, VEX_4V;
+ loadv2f64, SSEPackedDouble>, PD, VEX_4V, VEX_WIG;
defm VSHUFPDY : sse12_shuffle<VR256, f256mem, v4f64,
"shufpd\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}",
- loadv4f64, SSEPackedDouble>, PD, VEX_4V, VEX_L;
+ loadv4f64, SSEPackedDouble>, PD, VEX_4V, VEX_L, VEX_WIG;
}
let Constraints = "$src1 = $dst" in {
defm SHUFPS : sse12_shuffle<VR128, f128mem, v4f32,
@@ -2715,29 +2633,29 @@ multiclass sse12_unpack_interleave<bits<8> opc, SDNode OpNode, ValueType vt,
let Predicates = [HasAVX, NoVLX] in {
defm VUNPCKHPS: sse12_unpack_interleave<0x15, X86Unpckh, v4f32, loadv4f32,
VR128, f128mem, "unpckhps\t{$src2, $src1, $dst|$dst, $src1, $src2}",
- SSEPackedSingle>, PS, VEX_4V;
+ SSEPackedSingle>, PS, VEX_4V, VEX_WIG;
defm VUNPCKHPD: sse12_unpack_interleave<0x15, X86Unpckh, v2f64, loadv2f64,
VR128, f128mem, "unpckhpd\t{$src2, $src1, $dst|$dst, $src1, $src2}",
- SSEPackedDouble>, PD, VEX_4V;
+ SSEPackedDouble>, PD, VEX_4V, VEX_WIG;
defm VUNPCKLPS: sse12_unpack_interleave<0x14, X86Unpckl, v4f32, loadv4f32,
VR128, f128mem, "unpcklps\t{$src2, $src1, $dst|$dst, $src1, $src2}",
- SSEPackedSingle>, PS, VEX_4V;
+ SSEPackedSingle>, PS, VEX_4V, VEX_WIG;
defm VUNPCKLPD: sse12_unpack_interleave<0x14, X86Unpckl, v2f64, loadv2f64,
VR128, f128mem, "unpcklpd\t{$src2, $src1, $dst|$dst, $src1, $src2}",
- SSEPackedDouble>, PD, VEX_4V;
+ SSEPackedDouble>, PD, VEX_4V, VEX_WIG;
defm VUNPCKHPSY: sse12_unpack_interleave<0x15, X86Unpckh, v8f32, loadv8f32,
VR256, f256mem, "unpckhps\t{$src2, $src1, $dst|$dst, $src1, $src2}",
- SSEPackedSingle>, PS, VEX_4V, VEX_L;
+ SSEPackedSingle>, PS, VEX_4V, VEX_L, VEX_WIG;
defm VUNPCKHPDY: sse12_unpack_interleave<0x15, X86Unpckh, v4f64, loadv4f64,
VR256, f256mem, "unpckhpd\t{$src2, $src1, $dst|$dst, $src1, $src2}",
- SSEPackedDouble>, PD, VEX_4V, VEX_L;
+ SSEPackedDouble>, PD, VEX_4V, VEX_L, VEX_WIG;
defm VUNPCKLPSY: sse12_unpack_interleave<0x14, X86Unpckl, v8f32, loadv8f32,
VR256, f256mem, "unpcklps\t{$src2, $src1, $dst|$dst, $src1, $src2}",
- SSEPackedSingle>, PS, VEX_4V, VEX_L;
+ SSEPackedSingle>, PS, VEX_4V, VEX_L, VEX_WIG;
defm VUNPCKLPDY: sse12_unpack_interleave<0x14, X86Unpckl, v4f64, loadv4f64,
VR256, f256mem, "unpcklpd\t{$src2, $src1, $dst|$dst, $src1, $src2}",
- SSEPackedDouble>, PD, VEX_4V, VEX_L;
+ SSEPackedDouble>, PD, VEX_4V, VEX_L, VEX_WIG;
}// Predicates = [HasAVX, NoVLX]
let Constraints = "$src1 = $dst" in {
defm UNPCKHPS: sse12_unpack_interleave<0x15, X86Unpckh, v4f32, memopv4f32,
@@ -2789,13 +2707,13 @@ multiclass sse12_extr_sign_mask<RegisterClass RC, ValueType vt,
let Predicates = [HasAVX] in {
defm VMOVMSKPS : sse12_extr_sign_mask<VR128, v4f32, "movmskps",
- SSEPackedSingle>, PS, VEX;
+ SSEPackedSingle>, PS, VEX, VEX_WIG;
defm VMOVMSKPD : sse12_extr_sign_mask<VR128, v2f64, "movmskpd",
- SSEPackedDouble>, PD, VEX;
+ SSEPackedDouble>, PD, VEX, VEX_WIG;
defm VMOVMSKPSY : sse12_extr_sign_mask<VR256, v8f32, "movmskps",
- SSEPackedSingle>, PS, VEX, VEX_L;
+ SSEPackedSingle>, PS, VEX, VEX_L, VEX_WIG;
defm VMOVMSKPDY : sse12_extr_sign_mask<VR256, v4f64, "movmskpd",
- SSEPackedDouble>, PD, VEX, VEX_L;
+ SSEPackedDouble>, PD, VEX, VEX_L, VEX_WIG;
}
defm MOVMSKPS : sse12_extr_sign_mask<VR128, v4f32, "movmskps",
@@ -2839,7 +2757,7 @@ multiclass PDI_binop_all<bits<8> opc, string OpcodeStr, SDNode Opcode,
OpndItins itins, bit IsCommutable = 0, Predicate prd> {
let Predicates = [HasAVX, prd] in
defm V#NAME : PDI_binop_rm<opc, !strconcat("v", OpcodeStr), Opcode, OpVT128,
- VR128, loadv2i64, i128mem, itins, IsCommutable, 0>, VEX_4V;
+ VR128, loadv2i64, i128mem, itins, IsCommutable, 0>, VEX_4V, VEX_WIG;
let Constraints = "$src1 = $dst" in
defm NAME : PDI_binop_rm<opc, OpcodeStr, Opcode, OpVT128, VR128,
@@ -2848,7 +2766,7 @@ let Constraints = "$src1 = $dst" in
let Predicates = [HasAVX2, prd] in
defm V#NAME#Y : PDI_binop_rm<opc, !strconcat("v", OpcodeStr), Opcode,
OpVT256, VR256, loadv4i64, i256mem, itins,
- IsCommutable, 0>, VEX_4V, VEX_L;
+ IsCommutable, 0>, VEX_4V, VEX_L, VEX_WIG;
}
// These are ordered here for pattern ordering requirements with the fp versions
@@ -2876,7 +2794,7 @@ multiclass sse12_fp_packed_logical<bits<8> opc, string OpcodeStr,
[(set VR256:$dst, (OpNode (bc_v4i64 (v8f32 VR256:$src1)),
(bc_v4i64 (v8f32 VR256:$src2))))],
[(set VR256:$dst, (OpNode (bc_v4i64 (v8f32 VR256:$src1)),
- (loadv4i64 addr:$src2)))], 0>, PS, VEX_4V, VEX_L;
+ (loadv4i64 addr:$src2)))], 0>, PS, VEX_4V, VEX_L, VEX_WIG;
defm V#NAME#PDY : sse12_fp_packed_logical_rm<opc, VR256, SSEPackedDouble,
!strconcat(OpcodeStr, "pd"), f256mem,
@@ -2884,14 +2802,14 @@ multiclass sse12_fp_packed_logical<bits<8> opc, string OpcodeStr,
(bc_v4i64 (v4f64 VR256:$src2))))],
[(set VR256:$dst, (OpNode (bc_v4i64 (v4f64 VR256:$src1)),
(loadv4i64 addr:$src2)))], 0>,
- PD, VEX_4V, VEX_L;
+ PD, VEX_4V, VEX_L, VEX_WIG;
defm V#NAME#PS : sse12_fp_packed_logical_rm<opc, VR128, SSEPackedSingle,
!strconcat(OpcodeStr, "ps"), f128mem,
[(set VR128:$dst, (OpNode (bc_v2i64 (v4f32 VR128:$src1)),
(bc_v2i64 (v4f32 VR128:$src2))))],
[(set VR128:$dst, (OpNode (bc_v2i64 (v4f32 VR128:$src1)),
- (loadv2i64 addr:$src2)))], 0>, PS, VEX_4V;
+ (loadv2i64 addr:$src2)))], 0>, PS, VEX_4V, VEX_WIG;
defm V#NAME#PD : sse12_fp_packed_logical_rm<opc, VR128, SSEPackedDouble,
!strconcat(OpcodeStr, "pd"), f128mem,
@@ -2899,7 +2817,7 @@ multiclass sse12_fp_packed_logical<bits<8> opc, string OpcodeStr,
(bc_v2i64 (v2f64 VR128:$src2))))],
[(set VR128:$dst, (OpNode (bc_v2i64 (v2f64 VR128:$src1)),
(loadv2i64 addr:$src2)))], 0>,
- PD, VEX_4V;
+ PD, VEX_4V, VEX_WIG;
}
let Constraints = "$src1 = $dst" in {
@@ -3065,17 +2983,17 @@ multiclass basic_sse12_fp_binop_p<bits<8> opc, string OpcodeStr,
let Predicates = [HasAVX, NoVLX] in {
defm V#NAME#PS : sse12_fp_packed<opc, !strconcat(OpcodeStr, "ps"), OpNode,
VR128, v4f32, f128mem, loadv4f32,
- SSEPackedSingle, itins.s, 0>, PS, VEX_4V;
+ SSEPackedSingle, itins.s, 0>, PS, VEX_4V, VEX_WIG;
defm V#NAME#PD : sse12_fp_packed<opc, !strconcat(OpcodeStr, "pd"), OpNode,
VR128, v2f64, f128mem, loadv2f64,
- SSEPackedDouble, itins.d, 0>, PD, VEX_4V;
+ SSEPackedDouble, itins.d, 0>, PD, VEX_4V, VEX_WIG;
defm V#NAME#PSY : sse12_fp_packed<opc, !strconcat(OpcodeStr, "ps"),
OpNode, VR256, v8f32, f256mem, loadv8f32,
- SSEPackedSingle, itins.s, 0>, PS, VEX_4V, VEX_L;
+ SSEPackedSingle, itins.s, 0>, PS, VEX_4V, VEX_L, VEX_WIG;
defm V#NAME#PDY : sse12_fp_packed<opc, !strconcat(OpcodeStr, "pd"),
OpNode, VR256, v4f64, f256mem, loadv4f64,
- SSEPackedDouble, itins.d, 0>, PD, VEX_4V, VEX_L;
+ SSEPackedDouble, itins.d, 0>, PD, VEX_4V, VEX_L, VEX_WIG;
}
let Constraints = "$src1 = $dst" in {
@@ -3092,10 +3010,10 @@ multiclass basic_sse12_fp_binop_s<bits<8> opc, string OpcodeStr, SDNode OpNode,
SizeItins itins> {
defm V#NAME#SS : sse12_fp_scalar<opc, !strconcat(OpcodeStr, "ss"),
OpNode, FR32, f32mem, SSEPackedSingle, itins.s, 0>,
- XS, VEX_4V, VEX_LIG;
+ XS, VEX_4V, VEX_LIG, VEX_WIG;
defm V#NAME#SD : sse12_fp_scalar<opc, !strconcat(OpcodeStr, "sd"),
OpNode, FR64, f64mem, SSEPackedDouble, itins.d, 0>,
- XD, VEX_4V, VEX_LIG;
+ XD, VEX_4V, VEX_LIG, VEX_WIG;
let Constraints = "$src1 = $dst" in {
defm SS : sse12_fp_scalar<opc, !strconcat(OpcodeStr, "ss"),
@@ -3108,21 +3026,20 @@ multiclass basic_sse12_fp_binop_s<bits<8> opc, string OpcodeStr, SDNode OpNode,
}
multiclass basic_sse12_fp_binop_s_int<bits<8> opc, string OpcodeStr,
- SDPatternOperator IntSS,
- SDPatternOperator IntSD,
+ SDPatternOperator OpNode,
SizeItins itins> {
- defm V#NAME#SS : sse12_fp_scalar_int<opc, OpcodeStr, IntSS, VR128,
+ defm V#NAME#SS : sse12_fp_scalar_int<opc, OpcodeStr, OpNode, VR128, v4f32,
!strconcat(OpcodeStr, "ss"), ssmem, sse_load_f32,
- SSEPackedSingle, itins.s, 0>, XS, VEX_4V, VEX_LIG;
- defm V#NAME#SD : sse12_fp_scalar_int<opc, OpcodeStr, IntSD, VR128,
+ SSEPackedSingle, itins.s, 0>, XS, VEX_4V, VEX_LIG, VEX_WIG;
+ defm V#NAME#SD : sse12_fp_scalar_int<opc, OpcodeStr, OpNode, VR128, v2f64,
!strconcat(OpcodeStr, "sd"), sdmem, sse_load_f64,
- SSEPackedDouble, itins.d, 0>, XD, VEX_4V, VEX_LIG;
+ SSEPackedDouble, itins.d, 0>, XD, VEX_4V, VEX_LIG, VEX_WIG;
let Constraints = "$src1 = $dst" in {
- defm SS : sse12_fp_scalar_int<opc, OpcodeStr, IntSS, VR128,
+ defm SS : sse12_fp_scalar_int<opc, OpcodeStr, OpNode, VR128, v4f32,
!strconcat(OpcodeStr, "ss"), ssmem, sse_load_f32,
SSEPackedSingle, itins.s>, XS;
- defm SD : sse12_fp_scalar_int<opc, OpcodeStr, IntSD, VR128,
+ defm SD : sse12_fp_scalar_int<opc, OpcodeStr, OpNode, VR128, v2f64,
!strconcat(OpcodeStr, "sd"), sdmem, sse_load_f64,
SSEPackedDouble, itins.d>, XD;
}
@@ -3131,29 +3048,23 @@ multiclass basic_sse12_fp_binop_s_int<bits<8> opc, string OpcodeStr,
// Binary Arithmetic instructions
defm ADD : basic_sse12_fp_binop_p<0x58, "add", fadd, SSE_ALU_ITINS_P>,
basic_sse12_fp_binop_s<0x58, "add", fadd, SSE_ALU_ITINS_S>,
- basic_sse12_fp_binop_s_int<0x58, "add", null_frag, null_frag,
- SSE_ALU_ITINS_S>;
+ basic_sse12_fp_binop_s_int<0x58, "add", null_frag, SSE_ALU_ITINS_S>;
defm MUL : basic_sse12_fp_binop_p<0x59, "mul", fmul, SSE_MUL_ITINS_P>,
basic_sse12_fp_binop_s<0x59, "mul", fmul, SSE_MUL_ITINS_S>,
- basic_sse12_fp_binop_s_int<0x59, "mul", null_frag, null_frag,
- SSE_MUL_ITINS_S>;
+ basic_sse12_fp_binop_s_int<0x59, "mul", null_frag, SSE_MUL_ITINS_S>;
let isCommutable = 0 in {
defm SUB : basic_sse12_fp_binop_p<0x5C, "sub", fsub, SSE_ALU_ITINS_P>,
basic_sse12_fp_binop_s<0x5C, "sub", fsub, SSE_ALU_ITINS_S>,
- basic_sse12_fp_binop_s_int<0x5C, "sub", null_frag, null_frag,
- SSE_ALU_ITINS_S>;
+ basic_sse12_fp_binop_s_int<0x5C, "sub", null_frag, SSE_ALU_ITINS_S>;
defm DIV : basic_sse12_fp_binop_p<0x5E, "div", fdiv, SSE_DIV_ITINS_P>,
basic_sse12_fp_binop_s<0x5E, "div", fdiv, SSE_DIV_ITINS_S>,
- basic_sse12_fp_binop_s_int<0x5E, "div", null_frag, null_frag,
- SSE_DIV_ITINS_S>;
+ basic_sse12_fp_binop_s_int<0x5E, "div", null_frag, SSE_DIV_ITINS_S>;
defm MAX : basic_sse12_fp_binop_p<0x5F, "max", X86fmax, SSE_ALU_ITINS_P>,
basic_sse12_fp_binop_s<0x5F, "max", X86fmax, SSE_ALU_ITINS_S>,
- basic_sse12_fp_binop_s_int<0x5F, "max", int_x86_sse_max_ss,
- int_x86_sse2_max_sd, SSE_ALU_ITINS_S>;
+ basic_sse12_fp_binop_s_int<0x5F, "max", X86fmaxs, SSE_ALU_ITINS_S>;
defm MIN : basic_sse12_fp_binop_p<0x5D, "min", X86fmin, SSE_ALU_ITINS_P>,
basic_sse12_fp_binop_s<0x5D, "min", X86fmin, SSE_ALU_ITINS_S>,
- basic_sse12_fp_binop_s_int<0x5D, "min", int_x86_sse_min_ss,
- int_x86_sse2_min_sd, SSE_ALU_ITINS_S>;
+ basic_sse12_fp_binop_s_int<0x5D, "min", X86fmins, SSE_ALU_ITINS_S>;
}
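// A sketch of the null_frag convention used above (assuming its usual
// TableGen semantics): passing null_frag leaves the *_Int variants defined
// for encoding and memory-folding purposes but with no selection pattern,
// e.g.
//
//   defm FOO : basic_sse12_fp_binop_s_int<0x58, "foo", null_frag,
//                                         SSE_ALU_ITINS_S>;  // hypothetical
//
// MIN/MAX instead pass X86fmins/X86fmaxs because scalar FP min/max keep a
// dedicated node; their NaN and signed-zero ordering has no plain-IR form.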
let isCodeGenOnly = 1 in {
@@ -3400,7 +3311,7 @@ multiclass sse_fp_unop_s<bits<8> opc, string OpcodeStr, RegisterClass RC,
Sched<[itins.Sched.Folded, ReadAfterLd]>,
Requires<[target, OptForSize]>;
- let isCodeGenOnly = 1, Constraints = "$src1 = $dst" in {
+ let isCodeGenOnly = 1, Constraints = "$src1 = $dst", ExeDomain = d in {
def r_Int : I<opc, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src1, VR128:$src2),
!strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
[]>, Sched<[itins.Sched.Folded, ReadAfterLd]>;
@@ -3444,7 +3355,7 @@ multiclass avx_fp_unop_s<bits<8> opc, string OpcodeStr, RegisterClass RC,
def m : I<opc, MRMSrcMem, (outs RC:$dst), (ins RC:$src1, x86memop:$src2),
!strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
[], itins.rm, d>, Sched<[itins.Sched.Folded, ReadAfterLd]>;
- let isCodeGenOnly = 1 in {
+ let isCodeGenOnly = 1, ExeDomain = d in {
def r_Int : I<opc, MRMSrcReg, (outs VR128:$dst),
(ins VR128:$src1, VR128:$src2),
!strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
@@ -3465,7 +3376,7 @@ multiclass avx_fp_unop_s<bits<8> opc, string OpcodeStr, RegisterClass RC,
// which has a clobber before the rcp, vs.
// vrcpss mem, %xmm0, %xmm0
// TODO: In theory, we could fold the load, and avoid the stall caused by
- // the partial register store, either in ExeDepFix or with smarter RA.
+ // the partial register store, either in ExecutionDepsFix or with smarter RA.
let Predicates = [UseAVX] in {
def : Pat<(OpNode RC:$src), (!cast<Instruction>("V"#NAME#Suffix##r)
(ScalarVT (IMPLICIT_DEF)), RC:$src)>;
@@ -3495,22 +3406,22 @@ let Predicates = prds in {
!strconcat("v", OpcodeStr,
"ps\t{$src, $dst|$dst, $src}"),
[(set VR128:$dst, (v4f32 (OpNode VR128:$src)))],
- itins.rr>, VEX, Sched<[itins.Sched]>;
+ itins.rr>, VEX, Sched<[itins.Sched]>, VEX_WIG;
def V#NAME#PSm : PSI<opc, MRMSrcMem, (outs VR128:$dst), (ins f128mem:$src),
!strconcat("v", OpcodeStr,
"ps\t{$src, $dst|$dst, $src}"),
[(set VR128:$dst, (OpNode (loadv4f32 addr:$src)))],
- itins.rm>, VEX, Sched<[itins.Sched.Folded]>;
+ itins.rm>, VEX, Sched<[itins.Sched.Folded]>, VEX_WIG;
def V#NAME#PSYr : PSI<opc, MRMSrcReg, (outs VR256:$dst), (ins VR256:$src),
!strconcat("v", OpcodeStr,
"ps\t{$src, $dst|$dst, $src}"),
[(set VR256:$dst, (v8f32 (OpNode VR256:$src)))],
- itins.rr>, VEX, VEX_L, Sched<[itins.Sched]>;
+ itins.rr>, VEX, VEX_L, Sched<[itins.Sched]>, VEX_WIG;
def V#NAME#PSYm : PSI<opc, MRMSrcMem, (outs VR256:$dst), (ins f256mem:$src),
!strconcat("v", OpcodeStr,
"ps\t{$src, $dst|$dst, $src}"),
[(set VR256:$dst, (OpNode (loadv8f32 addr:$src)))],
- itins.rm>, VEX, VEX_L, Sched<[itins.Sched.Folded]>;
+ itins.rm>, VEX, VEX_L, Sched<[itins.Sched.Folded]>, VEX_WIG;
}
def PSr : PSI<opc, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
@@ -3531,22 +3442,22 @@ let Predicates = [HasAVX] in {
!strconcat("v", OpcodeStr,
"pd\t{$src, $dst|$dst, $src}"),
[(set VR128:$dst, (v2f64 (OpNode VR128:$src)))],
- itins.rr>, VEX, Sched<[itins.Sched]>;
+ itins.rr>, VEX, Sched<[itins.Sched]>, VEX_WIG;
def V#NAME#PDm : PDI<opc, MRMSrcMem, (outs VR128:$dst), (ins f128mem:$src),
!strconcat("v", OpcodeStr,
"pd\t{$src, $dst|$dst, $src}"),
[(set VR128:$dst, (OpNode (loadv2f64 addr:$src)))],
- itins.rm>, VEX, Sched<[itins.Sched.Folded]>;
+ itins.rm>, VEX, Sched<[itins.Sched.Folded]>, VEX_WIG;
def V#NAME#PDYr : PDI<opc, MRMSrcReg, (outs VR256:$dst), (ins VR256:$src),
!strconcat("v", OpcodeStr,
"pd\t{$src, $dst|$dst, $src}"),
[(set VR256:$dst, (v4f64 (OpNode VR256:$src)))],
- itins.rr>, VEX, VEX_L, Sched<[itins.Sched]>;
+ itins.rr>, VEX, VEX_L, Sched<[itins.Sched]>, VEX_WIG;
def V#NAME#PDYm : PDI<opc, MRMSrcMem, (outs VR256:$dst), (ins f256mem:$src),
!strconcat("v", OpcodeStr,
"pd\t{$src, $dst|$dst, $src}"),
[(set VR256:$dst, (OpNode (loadv4f64 addr:$src)))],
- itins.rm>, VEX, VEX_L, Sched<[itins.Sched.Folded]>;
+ itins.rm>, VEX, VEX_L, Sched<[itins.Sched.Folded]>, VEX_WIG;
}
def PDr : PDI<opc, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
@@ -3567,7 +3478,7 @@ multiclass sse1_fp_unop_s<bits<8> opc, string OpcodeStr, SDNode OpNode,
defm V#NAME#SS : avx_fp_unop_s<opc, "v"#OpcodeStr##ss, FR32, v4f32, f32,
f32mem,
!cast<Intrinsic>("int_x86_sse_"##OpcodeStr##_ss), OpNode,
- SSEPackedSingle, itins, "SS">, XS, VEX_4V, VEX_LIG;
+ SSEPackedSingle, itins, "SS">, XS, VEX_4V, VEX_LIG, VEX_WIG;
}
multiclass sse2_fp_unop_s<bits<8> opc, string OpcodeStr, SDNode OpNode,
@@ -3579,7 +3490,7 @@ multiclass sse2_fp_unop_s<bits<8> opc, string OpcodeStr, SDNode OpNode,
f64mem,
!cast<Intrinsic>("int_x86_sse2_"##OpcodeStr##_sd),
OpNode, SSEPackedDouble, itins, "SD">,
- XD, VEX_4V, VEX_LIG;
+ XD, VEX_4V, VEX_LIG, VEX_WIG;
}
// Square root.
@@ -3647,41 +3558,41 @@ def VMOVNTPSmr : VPSI<0x2B, MRMDestMem, (outs),
"movntps\t{$src, $dst|$dst, $src}",
[(alignednontemporalstore (v4f32 VR128:$src),
addr:$dst)],
- IIC_SSE_MOVNT>, VEX;
+ IIC_SSE_MOVNT>, VEX, VEX_WIG;
def VMOVNTPDmr : VPDI<0x2B, MRMDestMem, (outs),
(ins f128mem:$dst, VR128:$src),
"movntpd\t{$src, $dst|$dst, $src}",
[(alignednontemporalstore (v2f64 VR128:$src),
addr:$dst)],
- IIC_SSE_MOVNT>, VEX;
+ IIC_SSE_MOVNT>, VEX, VEX_WIG;
let ExeDomain = SSEPackedInt in
def VMOVNTDQmr : VPDI<0xE7, MRMDestMem, (outs),
- (ins f128mem:$dst, VR128:$src),
+ (ins i128mem:$dst, VR128:$src),
"movntdq\t{$src, $dst|$dst, $src}",
[(alignednontemporalstore (v2i64 VR128:$src),
addr:$dst)],
- IIC_SSE_MOVNT>, VEX;
+ IIC_SSE_MOVNT>, VEX, VEX_WIG;
def VMOVNTPSYmr : VPSI<0x2B, MRMDestMem, (outs),
(ins f256mem:$dst, VR256:$src),
"movntps\t{$src, $dst|$dst, $src}",
[(alignednontemporalstore (v8f32 VR256:$src),
addr:$dst)],
- IIC_SSE_MOVNT>, VEX, VEX_L;
+ IIC_SSE_MOVNT>, VEX, VEX_L, VEX_WIG;
def VMOVNTPDYmr : VPDI<0x2B, MRMDestMem, (outs),
(ins f256mem:$dst, VR256:$src),
"movntpd\t{$src, $dst|$dst, $src}",
[(alignednontemporalstore (v4f64 VR256:$src),
addr:$dst)],
- IIC_SSE_MOVNT>, VEX, VEX_L;
+ IIC_SSE_MOVNT>, VEX, VEX_L, VEX_WIG;
let ExeDomain = SSEPackedInt in
def VMOVNTDQYmr : VPDI<0xE7, MRMDestMem, (outs),
- (ins f256mem:$dst, VR256:$src),
+ (ins i256mem:$dst, VR256:$src),
"movntdq\t{$src, $dst|$dst, $src}",
[(alignednontemporalstore (v4i64 VR256:$src),
addr:$dst)],
- IIC_SSE_MOVNT>, VEX, VEX_L;
+ IIC_SSE_MOVNT>, VEX, VEX_L, VEX_WIG;
}
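// Note on the f128mem -> i128mem switch above (an assumption about intent):
// both operand classes describe a 128-bit memory reference and encode
// identically; they differ in operand printing/parsing, so the change makes
// the integer-domain movntdq operands match their v2i64/v4i64 patterns:
//
//   (ins i128mem:$dst, VR128:$src)  // integer-typed 128-bit memory operand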
def MOVNTPSmr : PSI<0x2B, MRMDestMem, (outs), (ins f128mem:$dst, VR128:$src),
@@ -3797,20 +3708,18 @@ def : Pat<(X86MFence), (MFENCE)>;
//===----------------------------------------------------------------------===//
def VLDMXCSR : VPSI<0xAE, MRM2m, (outs), (ins i32mem:$src),
- "ldmxcsr\t$src", [(int_x86_sse_ldmxcsr addr:$src)],
- IIC_SSE_LDMXCSR>, VEX, Sched<[WriteLoad]>;
+ "ldmxcsr\t$src", [(int_x86_sse_ldmxcsr addr:$src)],
+ IIC_SSE_LDMXCSR>, VEX, Sched<[WriteLoad]>, VEX_WIG;
def VSTMXCSR : VPSI<0xAE, MRM3m, (outs), (ins i32mem:$dst),
- "stmxcsr\t$dst", [(int_x86_sse_stmxcsr addr:$dst)],
- IIC_SSE_STMXCSR>, VEX, Sched<[WriteStore]>;
+ "stmxcsr\t$dst", [(int_x86_sse_stmxcsr addr:$dst)],
+ IIC_SSE_STMXCSR>, VEX, Sched<[WriteStore]>, VEX_WIG;
-let Predicates = [UseSSE1] in {
def LDMXCSR : I<0xAE, MRM2m, (outs), (ins i32mem:$src),
- "ldmxcsr\t$src", [(int_x86_sse_ldmxcsr addr:$src)],
- IIC_SSE_LDMXCSR>, TB, Sched<[WriteLoad]>;
+ "ldmxcsr\t$src", [(int_x86_sse_ldmxcsr addr:$src)],
+ IIC_SSE_LDMXCSR>, TB, Sched<[WriteLoad]>;
def STMXCSR : I<0xAE, MRM3m, (outs), (ins i32mem:$dst),
- "stmxcsr\t$dst", [(int_x86_sse_stmxcsr addr:$dst)],
- IIC_SSE_STMXCSR>, TB, Sched<[WriteStore]>;
-}
+ "stmxcsr\t$dst", [(int_x86_sse_stmxcsr addr:$dst)],
+ IIC_SSE_STMXCSR>, TB, Sched<[WriteStore]>;
//===---------------------------------------------------------------------===//
// SSE2 - Move Aligned/Unaligned Packed Integer Instructions
@@ -3821,16 +3730,16 @@ let ExeDomain = SSEPackedInt in { // SSE integer instructions
let hasSideEffects = 0, SchedRW = [WriteMove] in {
def VMOVDQArr : VPDI<0x6F, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
"movdqa\t{$src, $dst|$dst, $src}", [], IIC_SSE_MOVA_P_RR>,
- VEX;
+ VEX, VEX_WIG;
def VMOVDQAYrr : VPDI<0x6F, MRMSrcReg, (outs VR256:$dst), (ins VR256:$src),
"movdqa\t{$src, $dst|$dst, $src}", [], IIC_SSE_MOVA_P_RR>,
- VEX, VEX_L;
+ VEX, VEX_L, VEX_WIG;
def VMOVDQUrr : VSSI<0x6F, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
"movdqu\t{$src, $dst|$dst, $src}", [], IIC_SSE_MOVU_P_RR>,
- VEX;
+ VEX, VEX_WIG;
def VMOVDQUYrr : VSSI<0x6F, MRMSrcReg, (outs VR256:$dst), (ins VR256:$src),
"movdqu\t{$src, $dst|$dst, $src}", [], IIC_SSE_MOVU_P_RR>,
- VEX, VEX_L;
+ VEX, VEX_L, VEX_WIG;
}
// For Disassembler
@@ -3839,54 +3748,58 @@ let isCodeGenOnly = 1, ForceDisassemble = 1, hasSideEffects = 0,
def VMOVDQArr_REV : VPDI<0x7F, MRMDestReg, (outs VR128:$dst), (ins VR128:$src),
"movdqa\t{$src, $dst|$dst, $src}", [],
IIC_SSE_MOVA_P_RR>,
- VEX;
+ VEX, VEX_WIG;
def VMOVDQAYrr_REV : VPDI<0x7F, MRMDestReg, (outs VR256:$dst), (ins VR256:$src),
"movdqa\t{$src, $dst|$dst, $src}", [],
- IIC_SSE_MOVA_P_RR>, VEX, VEX_L;
+ IIC_SSE_MOVA_P_RR>, VEX, VEX_L, VEX_WIG;
def VMOVDQUrr_REV : VSSI<0x7F, MRMDestReg, (outs VR128:$dst), (ins VR128:$src),
"movdqu\t{$src, $dst|$dst, $src}", [],
IIC_SSE_MOVU_P_RR>,
- VEX;
+ VEX, VEX_WIG;
def VMOVDQUYrr_REV : VSSI<0x7F, MRMDestReg, (outs VR256:$dst), (ins VR256:$src),
"movdqu\t{$src, $dst|$dst, $src}", [],
- IIC_SSE_MOVU_P_RR>, VEX, VEX_L;
+ IIC_SSE_MOVU_P_RR>, VEX, VEX_L, VEX_WIG;
}
let canFoldAsLoad = 1, mayLoad = 1, isReMaterializable = 1,
hasSideEffects = 0, SchedRW = [WriteLoad] in {
+let Predicates = [HasAVX,NoVLX] in
def VMOVDQArm : VPDI<0x6F, MRMSrcMem, (outs VR128:$dst), (ins i128mem:$src),
- "movdqa\t{$src, $dst|$dst, $src}", [], IIC_SSE_MOVA_P_RM>,
- VEX;
+ "movdqa\t{$src, $dst|$dst, $src}",
+ [(set VR128:$dst, (alignedloadv2i64 addr:$src))],
+ IIC_SSE_MOVA_P_RM>, VEX, VEX_WIG;
def VMOVDQAYrm : VPDI<0x6F, MRMSrcMem, (outs VR256:$dst), (ins i256mem:$src),
"movdqa\t{$src, $dst|$dst, $src}", [], IIC_SSE_MOVA_P_RM>,
- VEX, VEX_L;
-let Predicates = [HasAVX] in {
- def VMOVDQUrm : I<0x6F, MRMSrcMem, (outs VR128:$dst), (ins i128mem:$src),
- "vmovdqu\t{$src, $dst|$dst, $src}",[], IIC_SSE_MOVU_P_RM>,
- XS, VEX;
- def VMOVDQUYrm : I<0x6F, MRMSrcMem, (outs VR256:$dst), (ins i256mem:$src),
- "vmovdqu\t{$src, $dst|$dst, $src}",[], IIC_SSE_MOVU_P_RM>,
- XS, VEX, VEX_L;
-}
+ VEX, VEX_L, VEX_WIG;
+let Predicates = [HasAVX,NoVLX] in
+def VMOVDQUrm : I<0x6F, MRMSrcMem, (outs VR128:$dst), (ins i128mem:$src),
+ "vmovdqu\t{$src, $dst|$dst, $src}",
+ [(set VR128:$dst, (loadv2i64 addr:$src))],
+ IIC_SSE_MOVU_P_RM>, XS, VEX, VEX_WIG;
+def VMOVDQUYrm : I<0x6F, MRMSrcMem, (outs VR256:$dst), (ins i256mem:$src),
+ "vmovdqu\t{$src, $dst|$dst, $src}",[], IIC_SSE_MOVU_P_RM>,
+ XS, VEX, VEX_L, VEX_WIG;
}
let mayStore = 1, hasSideEffects = 0, SchedRW = [WriteStore] in {
+let Predicates = [HasAVX,NoVLX] in
def VMOVDQAmr : VPDI<0x7F, MRMDestMem, (outs),
(ins i128mem:$dst, VR128:$src),
- "movdqa\t{$src, $dst|$dst, $src}", [], IIC_SSE_MOVA_P_MR>,
- VEX;
+ "movdqa\t{$src, $dst|$dst, $src}",
+ [(alignedstore (v2i64 VR128:$src), addr:$dst)],
+ IIC_SSE_MOVA_P_MR>, VEX, VEX_WIG;
def VMOVDQAYmr : VPDI<0x7F, MRMDestMem, (outs),
(ins i256mem:$dst, VR256:$src),
"movdqa\t{$src, $dst|$dst, $src}", [], IIC_SSE_MOVA_P_MR>,
- VEX, VEX_L;
-let Predicates = [HasAVX] in {
+ VEX, VEX_L, VEX_WIG;
+let Predicates = [HasAVX,NoVLX] in
def VMOVDQUmr : I<0x7F, MRMDestMem, (outs), (ins i128mem:$dst, VR128:$src),
- "vmovdqu\t{$src, $dst|$dst, $src}",[], IIC_SSE_MOVU_P_MR>,
- XS, VEX;
+ "vmovdqu\t{$src, $dst|$dst, $src}",
+ [(store (v2i64 VR128:$src), addr:$dst)], IIC_SSE_MOVU_P_MR>,
+ XS, VEX, VEX_WIG;
def VMOVDQUYmr : I<0x7F, MRMDestMem, (outs), (ins i256mem:$dst, VR256:$src),
"vmovdqu\t{$src, $dst|$dst, $src}",[], IIC_SSE_MOVU_P_MR>,
- XS, VEX, VEX_L;
-}
+ XS, VEX, VEX_L, VEX_WIG;
}
let SchedRW = [WriteMove] in {
@@ -3949,6 +3862,50 @@ def : InstAlias<"vmovdqu\t{$src, $dst|$dst, $src}",
def : InstAlias<"vmovdqu\t{$src, $dst|$dst, $src}",
(VMOVDQUYrr_REV VR256L:$dst, VR256H:$src), 0>;
+let Predicates = [HasAVX, NoVLX] in {
+ // Additional patterns for other integer sizes.
+ def : Pat<(alignedstore (v4i32 VR128:$src), addr:$dst),
+ (VMOVDQAmr addr:$dst, VR128:$src)>;
+ def : Pat<(alignedstore (v8i16 VR128:$src), addr:$dst),
+ (VMOVDQAmr addr:$dst, VR128:$src)>;
+ def : Pat<(alignedstore (v16i8 VR128:$src), addr:$dst),
+ (VMOVDQAmr addr:$dst, VR128:$src)>;
+ def : Pat<(store (v4i32 VR128:$src), addr:$dst),
+ (VMOVDQUmr addr:$dst, VR128:$src)>;
+ def : Pat<(store (v8i16 VR128:$src), addr:$dst),
+ (VMOVDQUmr addr:$dst, VR128:$src)>;
+ def : Pat<(store (v16i8 VR128:$src), addr:$dst),
+ (VMOVDQUmr addr:$dst, VR128:$src)>;
+
+ // Special patterns for storing subvector extracts of the lower 128 bits.
+ // It's cheaper to just use VMOVDQA/VMOVDQU instead of VEXTRACTF128mr.
+ def : Pat<(alignedstore (v2i64 (extract_subvector
+ (v4i64 VR256:$src), (iPTR 0))), addr:$dst),
+ (VMOVDQAmr addr:$dst, (v2i64 (EXTRACT_SUBREG VR256:$src,sub_xmm)))>;
+ def : Pat<(alignedstore (v4i32 (extract_subvector
+ (v8i32 VR256:$src), (iPTR 0))), addr:$dst),
+ (VMOVDQAmr addr:$dst, (v4i32 (EXTRACT_SUBREG VR256:$src,sub_xmm)))>;
+ def : Pat<(alignedstore (v8i16 (extract_subvector
+ (v16i16 VR256:$src), (iPTR 0))), addr:$dst),
+ (VMOVDQAmr addr:$dst, (v8i16 (EXTRACT_SUBREG VR256:$src,sub_xmm)))>;
+ def : Pat<(alignedstore (v16i8 (extract_subvector
+ (v32i8 VR256:$src), (iPTR 0))), addr:$dst),
+ (VMOVDQAmr addr:$dst, (v16i8 (EXTRACT_SUBREG VR256:$src,sub_xmm)))>;
+
+ def : Pat<(store (v2i64 (extract_subvector
+ (v4i64 VR256:$src), (iPTR 0))), addr:$dst),
+ (VMOVDQUmr addr:$dst, (v2i64 (EXTRACT_SUBREG VR256:$src,sub_xmm)))>;
+ def : Pat<(store (v4i32 (extract_subvector
+ (v8i32 VR256:$src), (iPTR 0))), addr:$dst),
+ (VMOVDQUmr addr:$dst, (v4i32 (EXTRACT_SUBREG VR256:$src,sub_xmm)))>;
+ def : Pat<(store (v8i16 (extract_subvector
+ (v16i16 VR256:$src), (iPTR 0))), addr:$dst),
+ (VMOVDQUmr addr:$dst, (v8i16 (EXTRACT_SUBREG VR256:$src,sub_xmm)))>;
+ def : Pat<(store (v16i8 (extract_subvector
+ (v32i8 VR256:$src), (iPTR 0))), addr:$dst),
+ (VMOVDQUmr addr:$dst, (v16i8 (EXTRACT_SUBREG VR256:$src,sub_xmm)))>;
+}
+
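// A worked example of the trade-off the comment above describes (assembly
// shown only for illustration): for the low xmm half of a ymm register,
//
//   vextractf128 $0, %ymm0, (%rdi)  // needs an immediate, longer encoding
//   vmovdqa      %xmm0, (%rdi)      // plain 16-byte store, shorter encoding
//
// write identical memory, so the patterns select the cheaper plain store on
// the sub_xmm subregister instead of VEXTRACTF128mr.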
//===---------------------------------------------------------------------===//
// SSE2 - Packed Integer Arithmetic Instructions
//===---------------------------------------------------------------------===//
@@ -4037,12 +3994,12 @@ defm PAVGW : PDI_binop_all<0xE3, "pavgw", X86avg, v8i16, v16i16,
let Predicates = [HasAVX, NoVLX_Or_NoBWI] in
defm VPMADDWD : PDI_binop_rm2<0xF5, "vpmaddwd", X86vpmaddwd, v4i32, v8i16, VR128,
- loadv2i64, i128mem, SSE_PMADD, 0>, VEX_4V;
+ loadv2i64, i128mem, SSE_PMADD, 0>, VEX_4V, VEX_WIG;
let Predicates = [HasAVX2, NoVLX_Or_NoBWI] in
defm VPMADDWDY : PDI_binop_rm2<0xF5, "vpmaddwd", X86vpmaddwd, v8i32, v16i16,
VR256, loadv4i64, i256mem, SSE_PMADD,
- 0>, VEX_4V, VEX_L;
+ 0>, VEX_4V, VEX_L, VEX_WIG;
let Constraints = "$src1 = $dst" in
defm PMADDWD : PDI_binop_rm2<0xF5, "pmaddwd", X86vpmaddwd, v4i32, v8i16, VR128,
memopv2i64, i128mem, SSE_PMADD>;
@@ -4050,11 +4007,11 @@ defm PMADDWD : PDI_binop_rm2<0xF5, "pmaddwd", X86vpmaddwd, v4i32, v8i16, VR128,
let Predicates = [HasAVX, NoVLX_Or_NoBWI] in
defm VPSADBW : PDI_binop_rm2<0xF6, "vpsadbw", X86psadbw, v2i64, v16i8, VR128,
loadv2i64, i128mem, SSE_INTMUL_ITINS_P, 0>,
- VEX_4V;
+ VEX_4V, VEX_WIG;
let Predicates = [HasAVX2, NoVLX_Or_NoBWI] in
defm VPSADBWY : PDI_binop_rm2<0xF6, "vpsadbw", X86psadbw, v4i64, v32i8, VR256,
loadv4i64, i256mem, SSE_INTMUL_ITINS_P, 0>,
- VEX_4V, VEX_L;
+ VEX_4V, VEX_L, VEX_WIG;
let Constraints = "$src1 = $dst" in
defm PSADBW : PDI_binop_rm2<0xF6, "psadbw", X86psadbw, v2i64, v16i8, VR128,
memopv2i64, i128mem, SSE_INTALU_ITINS_P>;
@@ -4062,11 +4019,11 @@ defm PSADBW : PDI_binop_rm2<0xF6, "psadbw", X86psadbw, v2i64, v16i8, VR128,
let Predicates = [HasAVX, NoVLX] in
defm VPMULUDQ : PDI_binop_rm2<0xF4, "vpmuludq", X86pmuludq, v2i64, v4i32, VR128,
loadv2i64, i128mem, SSE_INTMUL_ITINS_P, 0>,
- VEX_4V;
+ VEX_4V, VEX_WIG;
let Predicates = [HasAVX2, NoVLX] in
defm VPMULUDQY : PDI_binop_rm2<0xF4, "vpmuludq", X86pmuludq, v4i64, v8i32,
VR256, loadv4i64, i256mem,
- SSE_INTMUL_ITINS_P, 0>, VEX_4V, VEX_L;
+ SSE_INTMUL_ITINS_P, 0>, VEX_4V, VEX_L, VEX_WIG;
let Constraints = "$src1 = $dst" in
defm PMULUDQ : PDI_binop_rm2<0xF4, "pmuludq", X86pmuludq, v2i64, v4i32, VR128,
memopv2i64, i128mem, SSE_INTMUL_ITINS_P>;
@@ -4113,11 +4070,11 @@ multiclass PDI_binop_rmi_all<bits<8> opc, bits<8> opc2, Format ImmForm,
let Predicates = [HasAVX, prd] in
defm V#NAME : PDI_binop_rmi<opc, opc2, ImmForm, !strconcat("v", OpcodeStr),
OpNode, OpNode2, VR128, DstVT128, SrcVT,
- loadv2i64, 0>, VEX_4V;
+ loadv2i64, 0>, VEX_4V, VEX_WIG;
let Predicates = [HasAVX2, prd] in
defm V#NAME#Y : PDI_binop_rmi<opc, opc2, ImmForm, !strconcat("v", OpcodeStr),
OpNode, OpNode2, VR256, DstVT256, SrcVT,
- loadv2i64, 0>, VEX_4V, VEX_L;
+ loadv2i64, 0>, VEX_4V, VEX_L, VEX_WIG;
let Constraints = "$src1 = $dst" in
defm NAME : PDI_binop_rmi<opc, opc2, ImmForm, OpcodeStr, OpNode, OpNode2,
VR128, DstVT128, SrcVT, memopv2i64>;
@@ -4138,10 +4095,10 @@ multiclass PDI_binop_ri_all<bits<8> opc, Format ImmForm, string OpcodeStr,
SDNode OpNode> {
let Predicates = [HasAVX, NoVLX_Or_NoBWI] in
defm V#NAME : PDI_binop_ri<opc, ImmForm, !strconcat("v", OpcodeStr), OpNode,
- VR128, v16i8, 0>, VEX_4V;
+ VR128, v16i8, 0>, VEX_4V, VEX_WIG;
let Predicates = [HasAVX2, NoVLX_Or_NoBWI] in
defm V#NAME#Y : PDI_binop_ri<opc, ImmForm, !strconcat("v", OpcodeStr), OpNode,
- VR256, v32i8, 0>, VEX_4V, VEX_L;
+ VR256, v32i8, 0>, VEX_4V, VEX_L, VEX_WIG;
let Constraints = "$src1 = $dst" in
defm NAME : PDI_binop_ri<opc, ImmForm, OpcodeStr, OpNode, VR128, v16i8>;
}
@@ -4202,7 +4159,7 @@ let Predicates = [HasAVX, prd] in {
"\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
[(set VR128:$dst,
(vt128 (OpNode VR128:$src1, (i8 imm:$src2))))],
- IIC_SSE_PSHUF_RI>, VEX, Sched<[WriteShuffle]>;
+ IIC_SSE_PSHUF_RI>, VEX, Sched<[WriteShuffle]>, VEX_WIG;
def V#NAME#mi : Ii8<0x70, MRMSrcMem, (outs VR128:$dst),
(ins i128mem:$src1, u8imm:$src2),
!strconcat("v", OpcodeStr,
@@ -4210,7 +4167,7 @@ let Predicates = [HasAVX, prd] in {
[(set VR128:$dst,
(vt128 (OpNode (bitconvert (loadv2i64 addr:$src1)),
(i8 imm:$src2))))], IIC_SSE_PSHUF_MI>, VEX,
- Sched<[WriteShuffleLd]>;
+ Sched<[WriteShuffleLd]>, VEX_WIG;
}
let Predicates = [HasAVX2, prd] in {
@@ -4220,7 +4177,7 @@ let Predicates = [HasAVX2, prd] in {
"\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
[(set VR256:$dst,
(vt256 (OpNode VR256:$src1, (i8 imm:$src2))))],
- IIC_SSE_PSHUF_RI>, VEX, VEX_L, Sched<[WriteShuffle]>;
+ IIC_SSE_PSHUF_RI>, VEX, VEX_L, Sched<[WriteShuffle]>, VEX_WIG;
def V#NAME#Ymi : Ii8<0x70, MRMSrcMem, (outs VR256:$dst),
(ins i256mem:$src1, u8imm:$src2),
!strconcat("v", OpcodeStr,
@@ -4228,7 +4185,7 @@ let Predicates = [HasAVX2, prd] in {
[(set VR256:$dst,
(vt256 (OpNode (bitconvert (loadv4i64 addr:$src1)),
(i8 imm:$src2))))], IIC_SSE_PSHUF_MI>, VEX, VEX_L,
- Sched<[WriteShuffleLd]>;
+ Sched<[WriteShuffleLd]>, VEX_WIG;
}
let Predicates = [UseSSE2] in {
@@ -4257,20 +4214,6 @@ defm PSHUFHW : sse2_pshuffle<"pshufhw", v8i16, v16i16, X86PShufhw,
defm PSHUFLW : sse2_pshuffle<"pshuflw", v8i16, v16i16, X86PShuflw,
NoVLX_Or_NoBWI>, XD;
-let Predicates = [HasAVX] in {
- def : Pat<(v4f32 (X86PShufd (loadv4f32 addr:$src1), (i8 imm:$imm))),
- (VPSHUFDmi addr:$src1, imm:$imm)>;
- def : Pat<(v4f32 (X86PShufd VR128:$src1, (i8 imm:$imm))),
- (VPSHUFDri VR128:$src1, imm:$imm)>;
-}
-
-let Predicates = [UseSSE2] in {
- def : Pat<(v4f32 (X86PShufd (memopv4f32 addr:$src1), (i8 imm:$imm))),
- (PSHUFDmi addr:$src1, imm:$imm)>;
- def : Pat<(v4f32 (X86PShufd VR128:$src1, (i8 imm:$imm))),
- (PSHUFDri VR128:$src1, imm:$imm)>;
-}
-
//===---------------------------------------------------------------------===//
// Packed Integer Pack Instructions (SSE & AVX)
//===---------------------------------------------------------------------===//
@@ -4364,24 +4307,24 @@ multiclass sse4_pack_y<bits<8> opc, string OpcodeStr, ValueType OutVT,
let Predicates = [HasAVX, NoVLX_Or_NoBWI] in {
defm VPACKSSWB : sse2_pack<0x63, "vpacksswb", v16i8, v8i16, X86Packss,
- loadv2i64, 0>, VEX_4V;
+ loadv2i64, 0>, VEX_4V, VEX_WIG;
defm VPACKSSDW : sse2_pack<0x6B, "vpackssdw", v8i16, v4i32, X86Packss,
- loadv2i64, 0>, VEX_4V;
+ loadv2i64, 0>, VEX_4V, VEX_WIG;
defm VPACKUSWB : sse2_pack<0x67, "vpackuswb", v16i8, v8i16, X86Packus,
- loadv2i64, 0>, VEX_4V;
+ loadv2i64, 0>, VEX_4V, VEX_WIG;
defm VPACKUSDW : sse4_pack<0x2B, "vpackusdw", v8i16, v4i32, X86Packus,
loadv2i64, 0>, VEX_4V;
}
let Predicates = [HasAVX2, NoVLX_Or_NoBWI] in {
defm VPACKSSWB : sse2_pack_y<0x63, "vpacksswb", v32i8, v16i16, X86Packss>,
- VEX_4V, VEX_L;
+ VEX_4V, VEX_L, VEX_WIG;
defm VPACKSSDW : sse2_pack_y<0x6B, "vpackssdw", v16i16, v8i32, X86Packss>,
- VEX_4V, VEX_L;
+ VEX_4V, VEX_L, VEX_WIG;
defm VPACKUSWB : sse2_pack_y<0x67, "vpackuswb", v32i8, v16i16, X86Packus>,
- VEX_4V, VEX_L;
+ VEX_4V, VEX_L, VEX_WIG;
defm VPACKUSDW : sse4_pack_y<0x2B, "vpackusdw", v16i16, v8i32, X86Packus>,
VEX_4V, VEX_L;
}
@@ -4443,44 +4386,44 @@ multiclass sse2_unpack_y<bits<8> opc, string OpcodeStr, ValueType vt,
let Predicates = [HasAVX, NoVLX_Or_NoBWI] in {
defm VPUNPCKLBW : sse2_unpack<0x60, "vpunpcklbw", v16i8, X86Unpckl,
- loadv2i64, 0>, VEX_4V;
+ loadv2i64, 0>, VEX_4V, VEX_WIG;
defm VPUNPCKLWD : sse2_unpack<0x61, "vpunpcklwd", v8i16, X86Unpckl,
- loadv2i64, 0>, VEX_4V;
+ loadv2i64, 0>, VEX_4V, VEX_WIG;
defm VPUNPCKHBW : sse2_unpack<0x68, "vpunpckhbw", v16i8, X86Unpckh,
- loadv2i64, 0>, VEX_4V;
+ loadv2i64, 0>, VEX_4V, VEX_WIG;
defm VPUNPCKHWD : sse2_unpack<0x69, "vpunpckhwd", v8i16, X86Unpckh,
- loadv2i64, 0>, VEX_4V;
+ loadv2i64, 0>, VEX_4V, VEX_WIG;
}
let Predicates = [HasAVX, NoVLX] in {
defm VPUNPCKLDQ : sse2_unpack<0x62, "vpunpckldq", v4i32, X86Unpckl,
- loadv2i64, 0>, VEX_4V;
+ loadv2i64, 0>, VEX_4V, VEX_WIG;
defm VPUNPCKLQDQ : sse2_unpack<0x6C, "vpunpcklqdq", v2i64, X86Unpckl,
- loadv2i64, 0>, VEX_4V;
+ loadv2i64, 0>, VEX_4V, VEX_WIG;
defm VPUNPCKHDQ : sse2_unpack<0x6A, "vpunpckhdq", v4i32, X86Unpckh,
- loadv2i64, 0>, VEX_4V;
+ loadv2i64, 0>, VEX_4V, VEX_WIG;
defm VPUNPCKHQDQ : sse2_unpack<0x6D, "vpunpckhqdq", v2i64, X86Unpckh,
- loadv2i64, 0>, VEX_4V;
+ loadv2i64, 0>, VEX_4V, VEX_WIG;
}
let Predicates = [HasAVX2, NoVLX_Or_NoBWI] in {
defm VPUNPCKLBW : sse2_unpack_y<0x60, "vpunpcklbw", v32i8, X86Unpckl>,
- VEX_4V, VEX_L;
+ VEX_4V, VEX_L, VEX_WIG;
defm VPUNPCKLWD : sse2_unpack_y<0x61, "vpunpcklwd", v16i16, X86Unpckl>,
- VEX_4V, VEX_L;
+ VEX_4V, VEX_L, VEX_WIG;
defm VPUNPCKHBW : sse2_unpack_y<0x68, "vpunpckhbw", v32i8, X86Unpckh>,
- VEX_4V, VEX_L;
+ VEX_4V, VEX_L, VEX_WIG;
defm VPUNPCKHWD : sse2_unpack_y<0x69, "vpunpckhwd", v16i16, X86Unpckh>,
- VEX_4V, VEX_L;
+ VEX_4V, VEX_L, VEX_WIG;
}
let Predicates = [HasAVX2, NoVLX] in {
defm VPUNPCKLDQ : sse2_unpack_y<0x62, "vpunpckldq", v8i32, X86Unpckl>,
- VEX_4V, VEX_L;
+ VEX_4V, VEX_L, VEX_WIG;
defm VPUNPCKLQDQ : sse2_unpack_y<0x6C, "vpunpcklqdq", v4i64, X86Unpckl>,
- VEX_4V, VEX_L;
+ VEX_4V, VEX_L, VEX_WIG;
defm VPUNPCKHDQ : sse2_unpack_y<0x6A, "vpunpckhdq", v8i32, X86Unpckh>,
- VEX_4V, VEX_L;
+ VEX_4V, VEX_L, VEX_WIG;
defm VPUNPCKHQDQ : sse2_unpack_y<0x6D, "vpunpckhqdq", v4i64, X86Unpckh>,
- VEX_4V, VEX_L;
+ VEX_4V, VEX_L, VEX_WIG;
}
let Constraints = "$src1 = $dst" in {
@@ -4565,14 +4508,14 @@ def VPMOVMSKBrr : VPDI<0xD7, MRMSrcReg, (outs GR32orGR64:$dst),
(ins VR128:$src),
"pmovmskb\t{$src, $dst|$dst, $src}",
[(set GR32orGR64:$dst, (X86movmsk (v16i8 VR128:$src)))],
- IIC_SSE_MOVMSK>, VEX;
+ IIC_SSE_MOVMSK>, VEX, VEX_WIG;
let Predicates = [HasAVX2] in {
def VPMOVMSKBYrr : VPDI<0xD7, MRMSrcReg, (outs GR32orGR64:$dst),
(ins VR256:$src),
"pmovmskb\t{$src, $dst|$dst, $src}",
[(set GR32orGR64:$dst, (X86movmsk (v32i8 VR256:$src)))]>,
- VEX, VEX_L;
+ VEX, VEX_L, VEX_WIG;
}
def PMOVMSKBrr : PDI<0xD7, MRMSrcReg, (outs GR32orGR64:$dst), (ins VR128:$src),
@@ -4593,13 +4536,13 @@ def VMASKMOVDQU : VPDI<0xF7, MRMSrcReg, (outs),
(ins VR128:$src, VR128:$mask),
"maskmovdqu\t{$mask, $src|$src, $mask}",
[(int_x86_sse2_maskmov_dqu VR128:$src, VR128:$mask, EDI)],
- IIC_SSE_MASKMOV>, VEX;
+ IIC_SSE_MASKMOV>, VEX, VEX_WIG;
let Uses = [RDI], Predicates = [HasAVX,In64BitMode] in
def VMASKMOVDQU64 : VPDI<0xF7, MRMSrcReg, (outs),
(ins VR128:$src, VR128:$mask),
"maskmovdqu\t{$mask, $src|$src, $mask}",
[(int_x86_sse2_maskmov_dqu VR128:$src, VR128:$mask, RDI)],
- IIC_SSE_MASKMOV>, VEX;
+ IIC_SSE_MASKMOV>, VEX, VEX_WIG;
let Uses = [EDI], Predicates = [UseSSE2,Not64BitMode] in
def MASKMOVDQU : PDI<0xF7, MRMSrcReg, (outs), (ins VR128:$src, VR128:$mask),
@@ -4725,19 +4668,6 @@ def MOVPDI2DImr : S2I<0x7E, MRMDestMem, (outs), (ins i32mem:$dst, VR128:$src),
(iPTR 0))), addr:$dst)],
IIC_SSE_MOVDQ>, Sched<[WriteStore]>;
} // ExeDomain = SSEPackedInt
-
-def : Pat<(v8i32 (X86Vinsert (v8i32 immAllZerosV), GR32:$src2, (iPTR 0))),
- (SUBREG_TO_REG (i32 0), (VMOVDI2PDIrr GR32:$src2), sub_xmm)>;
-
-def : Pat<(v4i64 (X86Vinsert (bc_v4i64 (v8i32 immAllZerosV)), GR64:$src2, (iPTR 0))),
- (SUBREG_TO_REG (i32 0), (VMOV64toPQIrr GR64:$src2), sub_xmm)>;
-
-def : Pat<(v8i32 (X86Vinsert undef, GR32:$src2, (iPTR 0))),
- (SUBREG_TO_REG (i32 0), (VMOVDI2PDIrr GR32:$src2), sub_xmm)>;
-
-def : Pat<(v4i64 (X86Vinsert undef, GR64:$src2, (iPTR 0))),
- (SUBREG_TO_REG (i32 0), (VMOV64toPQIrr GR64:$src2), sub_xmm)>;
-
//===---------------------------------------------------------------------===//
// Move Packed Doubleword Int first element to Doubleword Int
//
@@ -4758,12 +4688,12 @@ def MOVPQIto64rr : RS2I<0x7E, MRMDestReg, (outs GR64:$dst), (ins VR128:$src),
} //SchedRW
let isCodeGenOnly = 1, ForceDisassemble = 1, hasSideEffects = 0, mayStore = 1 in
-def VMOVPQIto64rm : VRS2I<0x7E, MRMDestMem, (outs),
+def VMOVPQIto64mr : VRS2I<0x7E, MRMDestMem, (outs),
(ins i64mem:$dst, VR128:$src),
"movq\t{$src, $dst|$dst, $src}",
[], IIC_SSE_MOVDQ>, VEX, Sched<[WriteStore]>;
let isCodeGenOnly = 1, ForceDisassemble = 1, hasSideEffects = 0, mayStore = 1 in
-def MOVPQIto64rm : RS2I<0x7E, MRMDestMem, (outs), (ins i64mem:$dst, VR128:$src),
+def MOVPQIto64mr : RS2I<0x7E, MRMDestMem, (outs), (ins i64mem:$dst, VR128:$src),
"mov{d|q}\t{$src, $dst|$dst, $src}",
[], IIC_SSE_MOVDQ>, Sched<[WriteStore]>;
} // ExeDomain = SSEPackedInt
@@ -4837,6 +4767,8 @@ let Predicates = [UseAVX] in {
// AVX 128-bit movd/movq instructions write zeros in the high 128-bit part.
// These instructions also write zeros in the high part of a 256-bit register.
let AddedComplexity = 20 in {
+ def : Pat<(v2i64 (X86vzmovl (v2i64 (scalar_to_vector (zextloadi64i32 addr:$src))))),
+ (VMOVDI2PDIrm addr:$src)>;
def : Pat<(v4i32 (X86vzmovl (v4i32 (scalar_to_vector (loadi32 addr:$src))))),
(VMOVDI2PDIrm addr:$src)>;
def : Pat<(v4i32 (X86vzmovl (bc_v4i32 (loadv4f32 addr:$src)))),
@@ -4866,6 +4798,8 @@ let Predicates = [UseSSE2] in {
(MOV64toPQIrr GR64:$src)>;
}
let AddedComplexity = 20 in {
+ def : Pat<(v2i64 (X86vzmovl (v2i64 (scalar_to_vector (zextloadi64i32 addr:$src))))),
+ (MOVDI2PDIrm addr:$src)>;
def : Pat<(v4i32 (X86vzmovl (v4i32 (scalar_to_vector (loadi32 addr:$src))))),
(MOVDI2PDIrm addr:$src)>;
def : Pat<(v4i32 (X86vzmovl (bc_v4i32 (loadv4f32 addr:$src)))),
@@ -4903,7 +4837,7 @@ def VMOVQI2PQIrm : I<0x7E, MRMSrcMem, (outs VR128:$dst), (ins i64mem:$src),
"vmovq\t{$src, $dst|$dst, $src}",
[(set VR128:$dst,
(v2i64 (scalar_to_vector (loadi64 addr:$src))))]>, XS,
- VEX, Requires<[UseAVX]>;
+ VEX, Requires<[UseAVX]>, VEX_WIG;
def MOVQI2PQIrm : I<0x7E, MRMSrcMem, (outs VR128:$dst), (ins i64mem:$src),
"movq\t{$src, $dst|$dst, $src}",
[(set VR128:$dst,
@@ -4920,7 +4854,7 @@ def VMOVPQI2QImr : VS2I<0xD6, MRMDestMem, (outs), (ins i64mem:$dst, VR128:$src),
"movq\t{$src, $dst|$dst, $src}",
[(store (i64 (extractelt (v2i64 VR128:$src),
(iPTR 0))), addr:$dst)],
- IIC_SSE_MOVDQ>, VEX;
+ IIC_SSE_MOVDQ>, VEX, VEX_WIG;
def MOVPQI2QImr : S2I<0xD6, MRMDestMem, (outs), (ins i64mem:$dst, VR128:$src),
"movq\t{$src, $dst|$dst, $src}",
[(store (i64 (extractelt (v2i64 VR128:$src),
@@ -4932,7 +4866,7 @@ def MOVPQI2QImr : S2I<0xD6, MRMDestMem, (outs), (ins i64mem:$dst, VR128:$src),
let isCodeGenOnly = 1, ForceDisassemble = 1, hasSideEffects = 0,
SchedRW = [WriteVecLogic] in {
def VMOVPQI2QIrr : VS2I<0xD6, MRMDestReg, (outs VR128:$dst), (ins VR128:$src),
- "movq\t{$src, $dst|$dst, $src}", [], IIC_SSE_MOVQ_RR>, VEX;
+ "movq\t{$src, $dst|$dst, $src}", [], IIC_SSE_MOVQ_RR>, VEX, VEX_WIG;
def MOVPQI2QIrr : S2I<0xD6, MRMDestReg, (outs VR128:$dst), (ins VR128:$src),
"movq\t{$src, $dst|$dst, $src}", [], IIC_SSE_MOVQ_RR>;
}
@@ -4978,7 +4912,7 @@ def VMOVZPQILo2PQIrr : I<0x7E, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
"vmovq\t{$src, $dst|$dst, $src}",
[(set VR128:$dst, (v2i64 (X86vzmovl (v2i64 VR128:$src))))],
IIC_SSE_MOVQ_RR>,
- XS, VEX, Requires<[UseAVX]>;
+ XS, VEX, Requires<[UseAVX]>, VEX_WIG;
let AddedComplexity = 15 in
def MOVZPQILo2PQIrr : I<0x7E, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
"movq\t{$src, $dst|$dst, $src}",
@@ -5016,13 +4950,13 @@ def rm : S3SI<op, MRMSrcMem, (outs RC:$dst), (ins x86memop:$src),
let Predicates = [HasAVX, NoVLX] in {
defm VMOVSHDUP : sse3_replicate_sfp<0x16, X86Movshdup, "vmovshdup",
- v4f32, VR128, loadv4f32, f128mem>, VEX;
+ v4f32, VR128, loadv4f32, f128mem>, VEX, VEX_WIG;
defm VMOVSLDUP : sse3_replicate_sfp<0x12, X86Movsldup, "vmovsldup",
- v4f32, VR128, loadv4f32, f128mem>, VEX;
+ v4f32, VR128, loadv4f32, f128mem>, VEX, VEX_WIG;
defm VMOVSHDUPY : sse3_replicate_sfp<0x16, X86Movshdup, "vmovshdup",
- v8f32, VR256, loadv8f32, f256mem>, VEX, VEX_L;
+ v8f32, VR256, loadv8f32, f256mem>, VEX, VEX_L, VEX_WIG;
defm VMOVSLDUPY : sse3_replicate_sfp<0x12, X86Movsldup, "vmovsldup",
- v8f32, VR256, loadv8f32, f256mem>, VEX, VEX_L;
+ v8f32, VR256, loadv8f32, f256mem>, VEX, VEX_L, VEX_WIG;
}
defm MOVSHDUP : sse3_replicate_sfp<0x16, X86Movshdup, "movshdup", v4f32, VR128,
memopv4f32, f128mem>;
@@ -5090,8 +5024,8 @@ def rm : S3DI<0x12, MRMSrcMem, (outs VR256:$dst), (ins f256mem:$src),
}
let Predicates = [HasAVX, NoVLX] in {
- defm VMOVDDUP : sse3_replicate_dfp<"vmovddup">, VEX;
- defm VMOVDDUPY : sse3_replicate_dfp_y<"vmovddup">, VEX, VEX_L;
+ defm VMOVDDUP : sse3_replicate_dfp<"vmovddup">, VEX, VEX_WIG;
+ defm VMOVDDUPY : sse3_replicate_dfp_y<"vmovddup">, VEX, VEX_L, VEX_WIG;
}
defm MOVDDUP : sse3_replicate_dfp<"movddup">;
@@ -5108,16 +5042,6 @@ let Predicates = [HasAVX, NoVLX] in {
(VMOVDDUPYrr VR256:$src)>;
}
-let Predicates = [HasAVX] in {
- def : Pat<(X86Movddup (bc_v2f64 (loadv4f32 addr:$src))),
- (VMOVDDUPrm addr:$src)>, Requires<[HasAVX]>;
- def : Pat<(X86Movddup (bc_v2f64 (loadv2i64 addr:$src))),
- (VMOVDDUPrm addr:$src)>, Requires<[HasAVX]>;
- def : Pat<(X86Movddup (bc_v2f64
- (v2i64 (scalar_to_vector (loadi64 addr:$src))))),
- (VMOVDDUPrm addr:$src)>, Requires<[HasAVX]>;
-}
-
let Predicates = [HasAVX, NoVLX] in
def : Pat<(v2f64 (X86VBroadcast (loadf64 addr:$src))),
(VMOVDDUPrm addr:$src)>;
@@ -5128,13 +5052,6 @@ def : Pat<(v2i64 (X86VBroadcast (loadi64 addr:$src))),
let Predicates = [UseSSE3] in {
def : Pat<(X86Movddup (memopv2f64 addr:$src)),
(MOVDDUPrm addr:$src)>;
- def : Pat<(X86Movddup (bc_v2f64 (memopv4f32 addr:$src))),
- (MOVDDUPrm addr:$src)>;
- def : Pat<(X86Movddup (bc_v2f64 (memopv2i64 addr:$src))),
- (MOVDDUPrm addr:$src)>;
- def : Pat<(X86Movddup (bc_v2f64
- (v2i64 (scalar_to_vector (loadi64 addr:$src))))),
- (MOVDDUPrm addr:$src)>;
}
//===---------------------------------------------------------------------===//
@@ -5145,11 +5062,11 @@ let SchedRW = [WriteLoad] in {
let Predicates = [HasAVX] in {
def VLDDQUrm : S3DI<0xF0, MRMSrcMem, (outs VR128:$dst), (ins i128mem:$src),
"vlddqu\t{$src, $dst|$dst, $src}",
- [(set VR128:$dst, (int_x86_sse3_ldu_dq addr:$src))]>, VEX;
+ [(set VR128:$dst, (int_x86_sse3_ldu_dq addr:$src))]>, VEX, VEX_WIG;
def VLDDQUYrm : S3DI<0xF0, MRMSrcMem, (outs VR256:$dst), (ins i256mem:$src),
"vlddqu\t{$src, $dst|$dst, $src}",
[(set VR256:$dst, (int_x86_avx_ldu_dq_256 addr:$src))]>,
- VEX, VEX_L;
+ VEX, VEX_L, VEX_WIG;
}
def LDDQUrm : S3DI<0xF0, MRMSrcMem, (outs VR128:$dst), (ins i128mem:$src),
"lddqu\t{$src, $dst|$dst, $src}",
@@ -5183,15 +5100,15 @@ multiclass sse3_addsub<Intrinsic Int, string OpcodeStr, RegisterClass RC,
let Predicates = [HasAVX] in {
let ExeDomain = SSEPackedSingle in {
defm VADDSUBPS : sse3_addsub<int_x86_sse3_addsub_ps, "vaddsubps", VR128,
- f128mem, SSE_ALU_F32P, loadv4f32, 0>, XD, VEX_4V;
+ f128mem, SSE_ALU_F32P, loadv4f32, 0>, XD, VEX_4V, VEX_WIG;
defm VADDSUBPSY : sse3_addsub<int_x86_avx_addsub_ps_256, "vaddsubps", VR256,
- f256mem, SSE_ALU_F32P, loadv8f32, 0>, XD, VEX_4V, VEX_L;
+ f256mem, SSE_ALU_F32P, loadv8f32, 0>, XD, VEX_4V, VEX_L, VEX_WIG;
}
let ExeDomain = SSEPackedDouble in {
defm VADDSUBPD : sse3_addsub<int_x86_sse3_addsub_pd, "vaddsubpd", VR128,
- f128mem, SSE_ALU_F64P, loadv2f64, 0>, PD, VEX_4V;
+ f128mem, SSE_ALU_F64P, loadv2f64, 0>, PD, VEX_4V, VEX_WIG;
defm VADDSUBPDY : sse3_addsub<int_x86_avx_addsub_pd_256, "vaddsubpd", VR256,
- f256mem, SSE_ALU_F64P, loadv4f64, 0>, PD, VEX_4V, VEX_L;
+ f256mem, SSE_ALU_F64P, loadv4f64, 0>, PD, VEX_4V, VEX_L, VEX_WIG;
}
}
let Constraints = "$src1 = $dst", Predicates = [UseSSE3] in {
@@ -5278,23 +5195,23 @@ multiclass S3_Int<bits<8> o, string OpcodeStr, ValueType vt, RegisterClass RC,
let Predicates = [HasAVX] in {
let ExeDomain = SSEPackedSingle in {
defm VHADDPS : S3D_Int<0x7C, "vhaddps", v4f32, VR128, f128mem,
- X86fhadd, loadv4f32, 0>, VEX_4V;
+ X86fhadd, loadv4f32, 0>, VEX_4V, VEX_WIG;
defm VHSUBPS : S3D_Int<0x7D, "vhsubps", v4f32, VR128, f128mem,
- X86fhsub, loadv4f32, 0>, VEX_4V;
+ X86fhsub, loadv4f32, 0>, VEX_4V, VEX_WIG;
defm VHADDPSY : S3D_Int<0x7C, "vhaddps", v8f32, VR256, f256mem,
- X86fhadd, loadv8f32, 0>, VEX_4V, VEX_L;
+ X86fhadd, loadv8f32, 0>, VEX_4V, VEX_L, VEX_WIG;
defm VHSUBPSY : S3D_Int<0x7D, "vhsubps", v8f32, VR256, f256mem,
- X86fhsub, loadv8f32, 0>, VEX_4V, VEX_L;
+ X86fhsub, loadv8f32, 0>, VEX_4V, VEX_L, VEX_WIG;
}
let ExeDomain = SSEPackedDouble in {
defm VHADDPD : S3_Int <0x7C, "vhaddpd", v2f64, VR128, f128mem,
- X86fhadd, loadv2f64, 0>, VEX_4V;
+ X86fhadd, loadv2f64, 0>, VEX_4V, VEX_WIG;
defm VHSUBPD : S3_Int <0x7D, "vhsubpd", v2f64, VR128, f128mem,
- X86fhsub, loadv2f64, 0>, VEX_4V;
+ X86fhsub, loadv2f64, 0>, VEX_4V, VEX_WIG;
defm VHADDPDY : S3_Int <0x7C, "vhaddpd", v4f64, VR256, f256mem,
- X86fhadd, loadv4f64, 0>, VEX_4V, VEX_L;
+ X86fhadd, loadv4f64, 0>, VEX_4V, VEX_L, VEX_WIG;
defm VHSUBPDY : S3_Int <0x7D, "vhsubpd", v4f64, VR256, f256mem,
- X86fhsub, loadv4f64, 0>, VEX_4V, VEX_L;
+ X86fhsub, loadv4f64, 0>, VEX_4V, VEX_L, VEX_WIG;
}
}
@@ -5352,84 +5269,24 @@ multiclass SS3I_unop_rm_y<bits<8> opc, string OpcodeStr, ValueType vt,
Sched<[WriteVecALULd]>;
}
-// Helper fragments to match sext vXi1 to vXiY.
-def v16i1sextv16i8 : PatLeaf<(v16i8 (X86pcmpgt (bc_v16i8 (v4i32 immAllZerosV)),
- VR128:$src))>;
-def v8i1sextv8i16 : PatLeaf<(v8i16 (X86vsrai VR128:$src, (i8 15)))>;
-def v4i1sextv4i32 : PatLeaf<(v4i32 (X86vsrai VR128:$src, (i8 31)))>;
-def v32i1sextv32i8 : PatLeaf<(v32i8 (X86pcmpgt (bc_v32i8 (v8i32 immAllZerosV)),
- VR256:$src))>;
-def v16i1sextv16i16: PatLeaf<(v16i16 (X86vsrai VR256:$src, (i8 15)))>;
-def v8i1sextv8i32 : PatLeaf<(v8i32 (X86vsrai VR256:$src, (i8 31)))>;
-
-let Predicates = [HasAVX, NoVLX_Or_NoBWI] in {
- defm VPABSB : SS3I_unop_rm<0x1C, "vpabsb", v16i8, X86Abs, loadv2i64>, VEX;
- defm VPABSW : SS3I_unop_rm<0x1D, "vpabsw", v8i16, X86Abs, loadv2i64>, VEX;
-}
-let Predicates = [HasAVX, NoVLX] in {
- defm VPABSD : SS3I_unop_rm<0x1E, "vpabsd", v4i32, X86Abs, loadv2i64>, VEX;
-}
-
let Predicates = [HasAVX, NoVLX_Or_NoBWI] in {
- def : Pat<(xor
- (bc_v2i64 (v16i1sextv16i8)),
- (bc_v2i64 (add (v16i8 VR128:$src), (v16i1sextv16i8)))),
- (VPABSBrr VR128:$src)>;
- def : Pat<(xor
- (bc_v2i64 (v8i1sextv8i16)),
- (bc_v2i64 (add (v8i16 VR128:$src), (v8i1sextv8i16)))),
- (VPABSWrr VR128:$src)>;
+ defm VPABSB : SS3I_unop_rm<0x1C, "vpabsb", v16i8, abs, loadv2i64>, VEX, VEX_WIG;
+ defm VPABSW : SS3I_unop_rm<0x1D, "vpabsw", v8i16, abs, loadv2i64>, VEX, VEX_WIG;
}
let Predicates = [HasAVX, NoVLX] in {
- def : Pat<(xor
- (bc_v2i64 (v4i1sextv4i32)),
- (bc_v2i64 (add (v4i32 VR128:$src), (v4i1sextv4i32)))),
- (VPABSDrr VR128:$src)>;
-}
-
-let Predicates = [HasAVX2, NoVLX_Or_NoBWI] in {
- defm VPABSB : SS3I_unop_rm_y<0x1C, "vpabsb", v32i8, X86Abs>, VEX, VEX_L;
- defm VPABSW : SS3I_unop_rm_y<0x1D, "vpabsw", v16i16, X86Abs>, VEX, VEX_L;
-}
-let Predicates = [HasAVX2, NoVLX] in {
- defm VPABSD : SS3I_unop_rm_y<0x1E, "vpabsd", v8i32, X86Abs>, VEX, VEX_L;
+ defm VPABSD : SS3I_unop_rm<0x1E, "vpabsd", v4i32, abs, loadv2i64>, VEX, VEX_WIG;
}
-
let Predicates = [HasAVX2, NoVLX_Or_NoBWI] in {
- def : Pat<(xor
- (bc_v4i64 (v32i1sextv32i8)),
- (bc_v4i64 (add (v32i8 VR256:$src), (v32i1sextv32i8)))),
- (VPABSBYrr VR256:$src)>;
- def : Pat<(xor
- (bc_v4i64 (v16i1sextv16i16)),
- (bc_v4i64 (add (v16i16 VR256:$src), (v16i1sextv16i16)))),
- (VPABSWYrr VR256:$src)>;
+ defm VPABSB : SS3I_unop_rm_y<0x1C, "vpabsb", v32i8, abs>, VEX, VEX_L, VEX_WIG;
+ defm VPABSW : SS3I_unop_rm_y<0x1D, "vpabsw", v16i16, abs>, VEX, VEX_L, VEX_WIG;
}
let Predicates = [HasAVX2, NoVLX] in {
- def : Pat<(xor
- (bc_v4i64 (v8i1sextv8i32)),
- (bc_v4i64 (add (v8i32 VR256:$src), (v8i1sextv8i32)))),
- (VPABSDYrr VR256:$src)>;
+ defm VPABSD : SS3I_unop_rm_y<0x1E, "vpabsd", v8i32, abs>, VEX, VEX_L, VEX_WIG;
}
-defm PABSB : SS3I_unop_rm<0x1C, "pabsb", v16i8, X86Abs, memopv2i64>;
-defm PABSW : SS3I_unop_rm<0x1D, "pabsw", v8i16, X86Abs, memopv2i64>;
-defm PABSD : SS3I_unop_rm<0x1E, "pabsd", v4i32, X86Abs, memopv2i64>;
-
-let Predicates = [UseSSSE3] in {
- def : Pat<(xor
- (bc_v2i64 (v16i1sextv16i8)),
- (bc_v2i64 (add (v16i8 VR128:$src), (v16i1sextv16i8)))),
- (PABSBrr VR128:$src)>;
- def : Pat<(xor
- (bc_v2i64 (v8i1sextv8i16)),
- (bc_v2i64 (add (v8i16 VR128:$src), (v8i1sextv8i16)))),
- (PABSWrr VR128:$src)>;
- def : Pat<(xor
- (bc_v2i64 (v4i1sextv4i32)),
- (bc_v2i64 (add (v4i32 VR128:$src), (v4i1sextv4i32)))),
- (PABSDrr VR128:$src)>;
-}
+defm PABSB : SS3I_unop_rm<0x1C, "pabsb", v16i8, abs, memopv2i64>;
+defm PABSW : SS3I_unop_rm<0x1D, "pabsw", v8i16, abs, memopv2i64>;
+defm PABSD : SS3I_unop_rm<0x1E, "pabsd", v4i32, abs, memopv2i64>;
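// The deleted sext/xor patterns matched the classic branchless abs
// expansion, which a generic abs node now represents directly (a sketch of
// the identity; the recognition itself lives outside this file):
//
//   mask   = x >>s (w - 1)   // arithmetic shift: all-ones iff x is negative
//   abs(x) = (x + mask) ^ mask
//
// so PABSB/PABSW/PABSD select from the plain abs operator above.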
//===---------------------------------------------------------------------===//
// SSSE3 - Packed Binary Operator Instructions
@@ -5527,45 +5384,45 @@ let ImmT = NoImm, Predicates = [HasAVX, NoVLX_Or_NoBWI] in {
let isCommutable = 0 in {
defm VPSHUFB : SS3I_binop_rm<0x00, "vpshufb", X86pshufb, v16i8, v16i8,
VR128, loadv2i64, i128mem,
- SSE_PSHUFB, 0>, VEX_4V;
+ SSE_PSHUFB, 0>, VEX_4V, VEX_WIG;
defm VPMADDUBSW : SS3I_binop_rm<0x04, "vpmaddubsw", X86vpmaddubsw, v8i16,
v16i8, VR128, loadv2i64, i128mem,
- SSE_PMADD, 0>, VEX_4V;
+ SSE_PMADD, 0>, VEX_4V, VEX_WIG;
}
defm VPMULHRSW : SS3I_binop_rm<0x0B, "vpmulhrsw", X86mulhrs, v8i16, v8i16,
VR128, loadv2i64, i128mem,
- SSE_PMULHRSW, 0>, VEX_4V;
+ SSE_PMULHRSW, 0>, VEX_4V, VEX_WIG;
}
let ImmT = NoImm, Predicates = [HasAVX] in {
let isCommutable = 0 in {
defm VPHADDW : SS3I_binop_rm<0x01, "vphaddw", X86hadd, v8i16, v8i16, VR128,
loadv2i64, i128mem,
- SSE_PHADDSUBW, 0>, VEX_4V;
+ SSE_PHADDSUBW, 0>, VEX_4V, VEX_WIG;
defm VPHADDD : SS3I_binop_rm<0x02, "vphaddd", X86hadd, v4i32, v4i32, VR128,
loadv2i64, i128mem,
- SSE_PHADDSUBD, 0>, VEX_4V;
+ SSE_PHADDSUBD, 0>, VEX_4V, VEX_WIG;
defm VPHSUBW : SS3I_binop_rm<0x05, "vphsubw", X86hsub, v8i16, v8i16, VR128,
loadv2i64, i128mem,
- SSE_PHADDSUBW, 0>, VEX_4V;
+ SSE_PHADDSUBW, 0>, VEX_4V, VEX_WIG;
defm VPHSUBD : SS3I_binop_rm<0x06, "vphsubd", X86hsub, v4i32, v4i32, VR128,
loadv2i64, i128mem,
SSE_PHADDSUBD, 0>, VEX_4V;
defm VPSIGNB : SS3I_binop_rm_int<0x08, "vpsignb",
int_x86_ssse3_psign_b_128,
- SSE_PSIGN, loadv2i64, 0>, VEX_4V;
+ SSE_PSIGN, loadv2i64, 0>, VEX_4V, VEX_WIG;
defm VPSIGNW : SS3I_binop_rm_int<0x09, "vpsignw",
int_x86_ssse3_psign_w_128,
- SSE_PSIGN, loadv2i64, 0>, VEX_4V;
+ SSE_PSIGN, loadv2i64, 0>, VEX_4V, VEX_WIG;
defm VPSIGND : SS3I_binop_rm_int<0x0A, "vpsignd",
int_x86_ssse3_psign_d_128,
- SSE_PSIGN, loadv2i64, 0>, VEX_4V;
+ SSE_PSIGN, loadv2i64, 0>, VEX_4V, VEX_WIG;
defm VPHADDSW : SS3I_binop_rm_int<0x03, "vphaddsw",
int_x86_ssse3_phadd_sw_128,
- SSE_PHADDSUBSW, loadv2i64, 0>, VEX_4V;
+ SSE_PHADDSUBSW, loadv2i64, 0>, VEX_4V, VEX_WIG;
defm VPHSUBSW : SS3I_binop_rm_int<0x07, "vphsubsw",
int_x86_ssse3_phsub_sw_128,
- SSE_PHADDSUBSW, loadv2i64, 0>, VEX_4V;
+ SSE_PHADDSUBSW, loadv2i64, 0>, VEX_4V, VEX_WIG;
}
}
@@ -5573,42 +5430,42 @@ let ImmT = NoImm, Predicates = [HasAVX2, NoVLX_Or_NoBWI] in {
let isCommutable = 0 in {
defm VPSHUFBY : SS3I_binop_rm<0x00, "vpshufb", X86pshufb, v32i8, v32i8,
VR256, loadv4i64, i256mem,
- SSE_PSHUFB, 0>, VEX_4V, VEX_L;
+ SSE_PSHUFB, 0>, VEX_4V, VEX_L, VEX_WIG;
defm VPMADDUBSWY : SS3I_binop_rm<0x04, "vpmaddubsw", X86vpmaddubsw, v16i16,
v32i8, VR256, loadv4i64, i256mem,
- SSE_PMADD, 0>, VEX_4V, VEX_L;
+ SSE_PMADD, 0>, VEX_4V, VEX_L, VEX_WIG;
}
defm VPMULHRSWY : SS3I_binop_rm<0x0B, "vpmulhrsw", X86mulhrs, v16i16, v16i16,
VR256, loadv4i64, i256mem,
- SSE_PMULHRSW, 0>, VEX_4V, VEX_L;
+ SSE_PMULHRSW, 0>, VEX_4V, VEX_L, VEX_WIG;
}
let ImmT = NoImm, Predicates = [HasAVX2] in {
let isCommutable = 0 in {
defm VPHADDWY : SS3I_binop_rm<0x01, "vphaddw", X86hadd, v16i16, v16i16,
VR256, loadv4i64, i256mem,
- SSE_PHADDSUBW, 0>, VEX_4V, VEX_L;
+ SSE_PHADDSUBW, 0>, VEX_4V, VEX_L, VEX_WIG;
defm VPHADDDY : SS3I_binop_rm<0x02, "vphaddd", X86hadd, v8i32, v8i32, VR256,
loadv4i64, i256mem,
- SSE_PHADDSUBW, 0>, VEX_4V, VEX_L;
+ SSE_PHADDSUBW, 0>, VEX_4V, VEX_L, VEX_WIG;
defm VPHSUBWY : SS3I_binop_rm<0x05, "vphsubw", X86hsub, v16i16, v16i16,
VR256, loadv4i64, i256mem,
- SSE_PHADDSUBW, 0>, VEX_4V, VEX_L;
+ SSE_PHADDSUBW, 0>, VEX_4V, VEX_L, VEX_WIG;
defm VPHSUBDY : SS3I_binop_rm<0x06, "vphsubd", X86hsub, v8i32, v8i32, VR256,
loadv4i64, i256mem,
SSE_PHADDSUBW, 0>, VEX_4V, VEX_L;
defm VPSIGNBY : SS3I_binop_rm_int_y<0x08, "vpsignb", int_x86_avx2_psign_b,
- WriteVecALU>, VEX_4V, VEX_L;
+ WriteVecALU>, VEX_4V, VEX_L, VEX_WIG;
defm VPSIGNWY : SS3I_binop_rm_int_y<0x09, "vpsignw", int_x86_avx2_psign_w,
- WriteVecALU>, VEX_4V, VEX_L;
+ WriteVecALU>, VEX_4V, VEX_L, VEX_WIG;
defm VPSIGNDY : SS3I_binop_rm_int_y<0x0A, "vpsignd", int_x86_avx2_psign_d,
- WriteVecALU>, VEX_4V, VEX_L;
+ WriteVecALU>, VEX_4V, VEX_L, VEX_WIG;
defm VPHADDSW : SS3I_binop_rm_int_y<0x03, "vphaddsw",
int_x86_avx2_phadd_sw,
- WriteVecALU>, VEX_4V, VEX_L;
+ WriteVecALU>, VEX_4V, VEX_L, VEX_WIG;
defm VPHSUBSW : SS3I_binop_rm_int_y<0x07, "vphsubsw",
int_x86_avx2_phsub_sw,
- WriteVecALU>, VEX_4V, VEX_L;
+ WriteVecALU>, VEX_4V, VEX_L, VEX_WIG;
}
}
@@ -5686,9 +5543,9 @@ multiclass ssse3_palignr_y<string asm, bit Is2Addr = 1> {
}
let Predicates = [HasAVX] in
- defm VPALIGNR : ssse3_palignr<"vpalignr", 0>, VEX_4V;
+ defm VPALIGNR : ssse3_palignr<"vpalignr", 0>, VEX_4V, VEX_WIG;
let Predicates = [HasAVX2] in
- defm VPALIGNR : ssse3_palignr_y<"vpalignr", 0>, VEX_4V, VEX_L;
+ defm VPALIGNR : ssse3_palignr_y<"vpalignr", 0>, VEX_4V, VEX_L, VEX_WIG;
let Constraints = "$src1 = $dst", Predicates = [UseSSSE3] in
defm PALIGNR : ssse3_palignr<"palignr">;
@@ -5779,10 +5636,10 @@ multiclass SS41I_pmovx_rm_all<bits<8> opc, string OpcodeStr,
defm NAME : SS41I_pmovx_rrrm<opc, OpcodeStr, MemOp, VR128, VR128, SSEItins>;
let Predicates = [HasAVX, prd] in
defm V#NAME : SS41I_pmovx_rrrm<opc, !strconcat("v", OpcodeStr), MemOp,
- VR128, VR128, AVXItins>, VEX;
+ VR128, VR128, AVXItins>, VEX, VEX_WIG;
let Predicates = [HasAVX2, prd] in
defm V#NAME#Y : SS41I_pmovx_rrrm<opc, !strconcat("v", OpcodeStr), MemYOp,
- VR256, VR128, AVX2Itins>, VEX, VEX_L;
+ VR256, VR128, AVX2Itins>, VEX, VEX_L, VEX_WIG;
}
multiclass SS41I_pmovx_rm<bits<8> opc, string OpcodeStr, X86MemOperand MemOp,
@@ -6010,12 +5867,12 @@ multiclass SS41I_pmovx_patterns<string OpcPrefix, string ExtTy,
}
}
-defm : SS41I_pmovx_patterns<"VPMOVSX", "s", X86vsext, extloadi32i16>;
-defm : SS41I_pmovx_patterns<"VPMOVZX", "z", X86vzext, loadi16_anyext>;
+defm : SS41I_pmovx_patterns<"VPMOVSX", "s", sext_invec, extloadi32i16>;
+defm : SS41I_pmovx_patterns<"VPMOVZX", "z", zext_invec, loadi16_anyext>;
let Predicates = [UseSSE41] in {
- defm : SS41I_pmovx_patterns<"PMOVSX", "s", X86vsext, extloadi32i16>;
- defm : SS41I_pmovx_patterns<"PMOVZX", "z", X86vzext, loadi16_anyext>;
+ defm : SS41I_pmovx_patterns<"PMOVSX", "s", sext_invec, extloadi32i16>;
+ defm : SS41I_pmovx_patterns<"PMOVZX", "z", zext_invec, loadi16_anyext>;
}
//===----------------------------------------------------------------------===//
@@ -6103,20 +5960,20 @@ multiclass SS41I_extract64<bits<8> opc, string OpcodeStr> {
"\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
[(set GR64:$dst,
(extractelt (v2i64 VR128:$src1), imm:$src2))]>,
- Sched<[WriteShuffle]>, REX_W;
+ Sched<[WriteShuffle]>;
let SchedRW = [WriteShuffleLd, WriteRMW] in
def mr : SS4AIi8<opc, MRMDestMem, (outs),
(ins i64mem:$dst, VR128:$src1, u8imm:$src2),
!strconcat(OpcodeStr,
"\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
[(store (extractelt (v2i64 VR128:$src1), imm:$src2),
- addr:$dst)]>, REX_W;
+ addr:$dst)]>;
}
let Predicates = [HasAVX, NoDQI] in
defm VPEXTRQ : SS41I_extract64<0x16, "vpextrq">, VEX, VEX_W;
-defm PEXTRQ : SS41I_extract64<0x16, "pextrq">;
+defm PEXTRQ : SS41I_extract64<0x16, "pextrq">, REX_W;
/// SS41I_extractf32 - SSE 4.1 extract 32 bits fp value to int reg or memory
/// destination
@@ -6140,7 +5997,7 @@ multiclass SS41I_extractf32<bits<8> opc, string OpcodeStr,
let ExeDomain = SSEPackedSingle in {
let Predicates = [UseAVX] in
- defm VEXTRACTPS : SS41I_extractf32<0x17, "vextractps">, VEX;
+ defm VEXTRACTPS : SS41I_extractf32<0x17, "vextractps">, VEX, VEX_WIG;
defm EXTRACTPS : SS41I_extractf32<0x17, "extractps", SSE_EXTRACT_ITINS>;
}
@@ -6268,7 +6125,7 @@ multiclass SS41I_insertf32<bits<8> opc, string asm, bit Is2Addr = 1,
let ExeDomain = SSEPackedSingle in {
let Predicates = [UseAVX] in
- defm VINSERTPS : SS41I_insertf32<0x21, "vinsertps", 0>, VEX_4V;
+ defm VINSERTPS : SS41I_insertf32<0x21, "vinsertps", 0>, VEX_4V, VEX_WIG;
let Constraints = "$src1 = $dst" in
defm INSERTPS : SS41I_insertf32<0x21, "insertps", 1, SSE_INSERT_ITINS>;
}
@@ -6461,14 +6318,14 @@ let Predicates = [HasAVX] in {
defm VROUND : sse41_fp_unop_p<0x08, 0x09, "vround", f128mem, VR128,
loadv4f32, loadv2f64,
int_x86_sse41_round_ps,
- int_x86_sse41_round_pd>, VEX;
+ int_x86_sse41_round_pd>, VEX, VEX_WIG;
defm VROUNDY : sse41_fp_unop_p<0x08, 0x09, "vround", f256mem, VR256,
loadv8f32, loadv4f64,
int_x86_avx_round_ps_256,
- int_x86_avx_round_pd_256>, VEX, VEX_L;
+ int_x86_avx_round_pd_256>, VEX, VEX_L, VEX_WIG;
defm VROUND : sse41_fp_binop_s<0x0A, 0x0B, "vround",
int_x86_sse41_round_ss,
- int_x86_sse41_round_sd, 0>, VEX_4V, VEX_LIG;
+ int_x86_sse41_round_sd, 0>, VEX_4V, VEX_LIG, VEX_WIG;
defm VROUND : avx_fp_unop_rm<0x0A, 0x0B, "vround">, VEX_4V, VEX_LIG;
}
@@ -6606,20 +6463,20 @@ let Defs = [EFLAGS], Predicates = [HasAVX] in {
def VPTESTrr : SS48I<0x17, MRMSrcReg, (outs), (ins VR128:$src1, VR128:$src2),
"vptest\t{$src2, $src1|$src1, $src2}",
[(set EFLAGS, (X86ptest VR128:$src1, (v2i64 VR128:$src2)))]>,
- Sched<[WriteVecLogic]>, VEX;
+ Sched<[WriteVecLogic]>, VEX, VEX_WIG;
def VPTESTrm : SS48I<0x17, MRMSrcMem, (outs), (ins VR128:$src1, f128mem:$src2),
"vptest\t{$src2, $src1|$src1, $src2}",
[(set EFLAGS,(X86ptest VR128:$src1, (loadv2i64 addr:$src2)))]>,
- Sched<[WriteVecLogicLd, ReadAfterLd]>, VEX;
+ Sched<[WriteVecLogicLd, ReadAfterLd]>, VEX, VEX_WIG;
def VPTESTYrr : SS48I<0x17, MRMSrcReg, (outs), (ins VR256:$src1, VR256:$src2),
"vptest\t{$src2, $src1|$src1, $src2}",
[(set EFLAGS, (X86ptest VR256:$src1, (v4i64 VR256:$src2)))]>,
- Sched<[WriteVecLogic]>, VEX, VEX_L;
+ Sched<[WriteVecLogic]>, VEX, VEX_L, VEX_WIG;
def VPTESTYrm : SS48I<0x17, MRMSrcMem, (outs), (ins VR256:$src1, i256mem:$src2),
"vptest\t{$src2, $src1|$src1, $src2}",
[(set EFLAGS,(X86ptest VR256:$src1, (loadv4i64 addr:$src2)))]>,
- Sched<[WriteVecLogicLd, ReadAfterLd]>, VEX, VEX_L;
+ Sched<[WriteVecLogicLd, ReadAfterLd]>, VEX, VEX_L, VEX_WIG;
}
let Defs = [EFLAGS] in {
@@ -6722,7 +6579,7 @@ multiclass SS41I_unop_rm_int_v16<bits<8> opc, string OpcodeStr,
let Predicates = [HasAVX] in
defm VPHMINPOSUW : SS41I_unop_rm_int_v16 <0x41, "vphminposuw",
int_x86_sse41_phminposuw, loadv2i64,
- WriteVecIMul>, VEX;
+ WriteVecIMul>, VEX, VEX_WIG;
defm PHMINPOSUW : SS41I_unop_rm_int_v16 <0x41, "phminposuw",
int_x86_sse41_phminposuw, memopv2i64,
WriteVecIMul>;
@@ -6778,65 +6635,65 @@ multiclass SS48I_binop_rm2<bits<8> opc, string OpcodeStr, SDNode OpNode,
let Predicates = [HasAVX, NoVLX] in {
defm VPMINSD : SS48I_binop_rm<0x39, "vpminsd", smin, v4i32, VR128,
loadv2i64, i128mem, 0, SSE_INTALU_ITINS_P>,
- VEX_4V;
+ VEX_4V, VEX_WIG;
defm VPMINUD : SS48I_binop_rm<0x3B, "vpminud", umin, v4i32, VR128,
loadv2i64, i128mem, 0, SSE_INTALU_ITINS_P>,
- VEX_4V;
+ VEX_4V, VEX_WIG;
defm VPMAXSD : SS48I_binop_rm<0x3D, "vpmaxsd", smax, v4i32, VR128,
loadv2i64, i128mem, 0, SSE_INTALU_ITINS_P>,
- VEX_4V;
+ VEX_4V, VEX_WIG;
defm VPMAXUD : SS48I_binop_rm<0x3F, "vpmaxud", umax, v4i32, VR128,
loadv2i64, i128mem, 0, SSE_INTALU_ITINS_P>,
- VEX_4V;
+ VEX_4V, VEX_WIG;
defm VPMULDQ : SS48I_binop_rm2<0x28, "vpmuldq", X86pmuldq, v2i64, v4i32,
VR128, loadv2i64, i128mem,
- SSE_INTMUL_ITINS_P, 1, 0>, VEX_4V;
+ SSE_INTMUL_ITINS_P, 1, 0>, VEX_4V, VEX_WIG;
}
let Predicates = [HasAVX, NoVLX_Or_NoBWI] in {
defm VPMINSB : SS48I_binop_rm<0x38, "vpminsb", smin, v16i8, VR128,
loadv2i64, i128mem, 0, SSE_INTALU_ITINS_P>,
- VEX_4V;
+ VEX_4V, VEX_WIG;
defm VPMINUW : SS48I_binop_rm<0x3A, "vpminuw", umin, v8i16, VR128,
loadv2i64, i128mem, 0, SSE_INTALU_ITINS_P>,
- VEX_4V;
+ VEX_4V, VEX_WIG;
defm VPMAXSB : SS48I_binop_rm<0x3C, "vpmaxsb", smax, v16i8, VR128,
loadv2i64, i128mem, 0, SSE_INTALU_ITINS_P>,
- VEX_4V;
+ VEX_4V, VEX_WIG;
defm VPMAXUW : SS48I_binop_rm<0x3E, "vpmaxuw", umax, v8i16, VR128,
loadv2i64, i128mem, 0, SSE_INTALU_ITINS_P>,
- VEX_4V;
+ VEX_4V, VEX_WIG;
}
let Predicates = [HasAVX2, NoVLX] in {
defm VPMINSDY : SS48I_binop_rm<0x39, "vpminsd", smin, v8i32, VR256,
loadv4i64, i256mem, 0, SSE_INTALU_ITINS_P>,
- VEX_4V, VEX_L;
+ VEX_4V, VEX_L, VEX_WIG;
defm VPMINUDY : SS48I_binop_rm<0x3B, "vpminud", umin, v8i32, VR256,
loadv4i64, i256mem, 0, SSE_INTALU_ITINS_P>,
- VEX_4V, VEX_L;
+ VEX_4V, VEX_L, VEX_WIG;
defm VPMAXSDY : SS48I_binop_rm<0x3D, "vpmaxsd", smax, v8i32, VR256,
loadv4i64, i256mem, 0, SSE_INTALU_ITINS_P>,
- VEX_4V, VEX_L;
+ VEX_4V, VEX_L, VEX_WIG;
defm VPMAXUDY : SS48I_binop_rm<0x3F, "vpmaxud", umax, v8i32, VR256,
loadv4i64, i256mem, 0, SSE_INTALU_ITINS_P>,
- VEX_4V, VEX_L;
+ VEX_4V, VEX_L, VEX_WIG;
defm VPMULDQY : SS48I_binop_rm2<0x28, "vpmuldq", X86pmuldq, v4i64, v8i32,
VR256, loadv4i64, i256mem,
- SSE_INTMUL_ITINS_P, 1, 0>, VEX_4V, VEX_L;
+ SSE_INTMUL_ITINS_P, 1, 0>, VEX_4V, VEX_L, VEX_WIG;
}
let Predicates = [HasAVX2, NoVLX_Or_NoBWI] in {
defm VPMINSBY : SS48I_binop_rm<0x38, "vpminsb", smin, v32i8, VR256,
loadv4i64, i256mem, 0, SSE_INTALU_ITINS_P>,
- VEX_4V, VEX_L;
+ VEX_4V, VEX_L, VEX_WIG;
defm VPMINUWY : SS48I_binop_rm<0x3A, "vpminuw", umin, v16i16, VR256,
loadv4i64, i256mem, 0, SSE_INTALU_ITINS_P>,
- VEX_4V, VEX_L;
+ VEX_4V, VEX_L, VEX_WIG;
defm VPMAXSBY : SS48I_binop_rm<0x3C, "vpmaxsb", smax, v32i8, VR256,
loadv4i64, i256mem, 0, SSE_INTALU_ITINS_P>,
- VEX_4V, VEX_L;
+ VEX_4V, VEX_L, VEX_WIG;
defm VPMAXUWY : SS48I_binop_rm<0x3E, "vpmaxuw", umax, v16i16, VR256,
loadv4i64, i256mem, 0, SSE_INTALU_ITINS_P>,
- VEX_4V, VEX_L;
+ VEX_4V, VEX_L, VEX_WIG;
}
let Constraints = "$src1 = $dst" in {
@@ -6864,18 +6721,18 @@ let Constraints = "$src1 = $dst" in {
let Predicates = [HasAVX, NoVLX] in {
defm VPMULLD : SS48I_binop_rm<0x40, "vpmulld", mul, v4i32, VR128,
loadv2i64, i128mem, 0, SSE_PMULLD_ITINS>,
- VEX_4V;
+ VEX_4V, VEX_WIG;
defm VPCMPEQQ : SS48I_binop_rm<0x29, "vpcmpeqq", X86pcmpeq, v2i64, VR128,
loadv2i64, i128mem, 0, SSE_INTALU_ITINS_P>,
- VEX_4V;
+ VEX_4V, VEX_WIG;
}
let Predicates = [HasAVX2] in {
defm VPMULLDY : SS48I_binop_rm<0x40, "vpmulld", mul, v8i32, VR256,
loadv4i64, i256mem, 0, SSE_PMULLD_ITINS>,
- VEX_4V, VEX_L;
+ VEX_4V, VEX_L, VEX_WIG;
defm VPCMPEQQY : SS48I_binop_rm<0x29, "vpcmpeqq", X86pcmpeq, v4i64, VR256,
loadv4i64, i256mem, 0, SSE_INTALU_ITINS_P>,
- VEX_4V, VEX_L;
+ VEX_4V, VEX_L, VEX_WIG;
}
let Constraints = "$src1 = $dst" in {
@@ -6945,52 +6802,52 @@ let Predicates = [HasAVX] in {
let isCommutable = 0 in {
defm VMPSADBW : SS41I_binop_rmi_int<0x42, "vmpsadbw", int_x86_sse41_mpsadbw,
VR128, loadv2i64, i128mem, 0,
- DEFAULT_ITINS_MPSADSCHED>, VEX_4V;
+ DEFAULT_ITINS_MPSADSCHED>, VEX_4V, VEX_WIG;
}
let ExeDomain = SSEPackedSingle in {
defm VBLENDPS : SS41I_binop_rmi<0x0C, "vblendps", X86Blendi, v4f32,
VR128, loadv4f32, f128mem, 0,
- DEFAULT_ITINS_FBLENDSCHED>, VEX_4V;
+ DEFAULT_ITINS_FBLENDSCHED>, VEX_4V, VEX_WIG;
defm VBLENDPSY : SS41I_binop_rmi<0x0C, "vblendps", X86Blendi, v8f32,
VR256, loadv8f32, f256mem, 0,
- DEFAULT_ITINS_FBLENDSCHED>, VEX_4V, VEX_L;
+ DEFAULT_ITINS_FBLENDSCHED>, VEX_4V, VEX_L, VEX_WIG;
}
let ExeDomain = SSEPackedDouble in {
defm VBLENDPD : SS41I_binop_rmi<0x0D, "vblendpd", X86Blendi, v2f64,
VR128, loadv2f64, f128mem, 0,
- DEFAULT_ITINS_FBLENDSCHED>, VEX_4V;
+ DEFAULT_ITINS_FBLENDSCHED>, VEX_4V, VEX_WIG;
defm VBLENDPDY : SS41I_binop_rmi<0x0D, "vblendpd", X86Blendi, v4f64,
VR256, loadv4f64, f256mem, 0,
- DEFAULT_ITINS_FBLENDSCHED>, VEX_4V, VEX_L;
+ DEFAULT_ITINS_FBLENDSCHED>, VEX_4V, VEX_L, VEX_WIG;
}
defm VPBLENDW : SS41I_binop_rmi<0x0E, "vpblendw", X86Blendi, v8i16,
VR128, loadv2i64, i128mem, 0,
- DEFAULT_ITINS_BLENDSCHED>, VEX_4V;
+ DEFAULT_ITINS_BLENDSCHED>, VEX_4V, VEX_WIG;
let ExeDomain = SSEPackedSingle in
defm VDPPS : SS41I_binop_rmi_int<0x40, "vdpps", int_x86_sse41_dpps,
VR128, loadv4f32, f128mem, 0,
- SSE_DPPS_ITINS>, VEX_4V;
+ SSE_DPPS_ITINS>, VEX_4V, VEX_WIG;
let ExeDomain = SSEPackedDouble in
defm VDPPD : SS41I_binop_rmi_int<0x41, "vdppd", int_x86_sse41_dppd,
VR128, loadv2f64, f128mem, 0,
- SSE_DPPS_ITINS>, VEX_4V;
+ SSE_DPPS_ITINS>, VEX_4V, VEX_WIG;
let ExeDomain = SSEPackedSingle in
defm VDPPSY : SS41I_binop_rmi_int<0x40, "vdpps", int_x86_avx_dp_ps_256,
VR256, loadv8f32, i256mem, 0,
- SSE_DPPS_ITINS>, VEX_4V, VEX_L;
+ SSE_DPPS_ITINS>, VEX_4V, VEX_L, VEX_WIG;
}
let Predicates = [HasAVX2] in {
let isCommutable = 0 in {
defm VMPSADBWY : SS41I_binop_rmi_int<0x42, "vmpsadbw", int_x86_avx2_mpsadbw,
VR256, loadv4i64, i256mem, 0,
- DEFAULT_ITINS_MPSADSCHED>, VEX_4V, VEX_L;
+ DEFAULT_ITINS_MPSADSCHED>, VEX_4V, VEX_L, VEX_WIG;
}
defm VPBLENDWY : SS41I_binop_rmi<0x0E, "vpblendw", X86Blendi, v16i16,
VR256, loadv4i64, i256mem, 0,
- DEFAULT_ITINS_BLENDSCHED>, VEX_4V, VEX_L;
+ DEFAULT_ITINS_BLENDSCHED>, VEX_4V, VEX_L, VEX_WIG;
}
let Constraints = "$src1 = $dst" in {
@@ -7020,6 +6877,19 @@ let Constraints = "$src1 = $dst" in {
SSE_DPPD_ITINS>;
}
+// For insertion into the zero index (low half) of a 256-bit vector, it is
+// more efficient to generate a blend with an immediate than an insert*128.
+let Predicates = [HasAVX] in {
+def : Pat<(insert_subvector (v4f64 VR256:$src1), (v2f64 VR128:$src2), (iPTR 0)),
+ (VBLENDPDYrri VR256:$src1,
+ (INSERT_SUBREG (v4f64 (IMPLICIT_DEF)),
+ VR128:$src2, sub_xmm), 0x3)>;
+def : Pat<(insert_subvector (v8f32 VR256:$src1), (v4f32 VR128:$src2), (iPTR 0)),
+ (VBLENDPSYrri VR256:$src1,
+ (INSERT_SUBREG (v8f32 (IMPLICIT_DEF)),
+ VR128:$src2, sub_xmm), 0xf)>;
+}
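+// A note on the immediates above (editor's sketch): the widened 128-bit value
+// is the blend's second source, so imm 0x3 (the two low f64 lanes) or 0xf
+// (the four low f32 lanes) takes the low half from $src2 and keeps the upper
+// half of $src1.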
+
/// SS41I_quaternary_int_avx - AVX SSE 4.1 with 4 operators
multiclass SS41I_quaternary_int_avx<bits<8> opc, string OpcodeStr,
RegisterClass RC, X86MemOperand x86memop,
@@ -7165,14 +7035,14 @@ let Uses = [XMM0], Constraints = "$src1 = $dst" in {
def rr0 : SS48I<opc, MRMSrcReg, (outs VR128:$dst),
(ins VR128:$src1, VR128:$src2),
!strconcat(OpcodeStr,
- "\t{$src2, $dst|$dst, $src2}"),
+ "\t{%xmm0, $src2, $dst|$dst, $src2, xmm0}"),
[(set VR128:$dst, (IntId VR128:$src1, VR128:$src2, XMM0))],
itins.rr>, Sched<[itins.Sched]>;
def rm0 : SS48I<opc, MRMSrcMem, (outs VR128:$dst),
(ins VR128:$src1, x86memop:$src2),
!strconcat(OpcodeStr,
- "\t{$src2, $dst|$dst, $src2}"),
+ "\t{%xmm0, $src2, $dst|$dst, $src2, xmm0}"),
[(set VR128:$dst,
(IntId VR128:$src1,
(bitconvert (mem_frag addr:$src2)), XMM0))],
@@ -7193,18 +7063,18 @@ defm PBLENDVB : SS41I_ternary_int<0x10, "pblendvb", memopv2i64, i128mem,
DEFAULT_ITINS_VARBLENDSCHED>;
// Aliases with the implicit xmm0 argument
-def : InstAlias<"blendvpd\t{%xmm0, $src2, $dst|$dst, $src2, xmm0}",
- (BLENDVPDrr0 VR128:$dst, VR128:$src2)>;
-def : InstAlias<"blendvpd\t{%xmm0, $src2, $dst|$dst, $src2, xmm0}",
- (BLENDVPDrm0 VR128:$dst, f128mem:$src2)>;
-def : InstAlias<"blendvps\t{%xmm0, $src2, $dst|$dst, $src2, xmm0}",
- (BLENDVPSrr0 VR128:$dst, VR128:$src2)>;
-def : InstAlias<"blendvps\t{%xmm0, $src2, $dst|$dst, $src2, xmm0}",
- (BLENDVPSrm0 VR128:$dst, f128mem:$src2)>;
-def : InstAlias<"pblendvb\t{%xmm0, $src2, $dst|$dst, $src2, xmm0}",
- (PBLENDVBrr0 VR128:$dst, VR128:$src2)>;
-def : InstAlias<"pblendvb\t{%xmm0, $src2, $dst|$dst, $src2, xmm0}",
- (PBLENDVBrm0 VR128:$dst, i128mem:$src2)>;
+def : InstAlias<"blendvpd\t{$src2, $dst|$dst, $src2}",
+ (BLENDVPDrr0 VR128:$dst, VR128:$src2), 0>;
+def : InstAlias<"blendvpd\t{$src2, $dst|$dst, $src2}",
+ (BLENDVPDrm0 VR128:$dst, f128mem:$src2), 0>;
+def : InstAlias<"blendvps\t{$src2, $dst|$dst, $src2}",
+ (BLENDVPSrr0 VR128:$dst, VR128:$src2), 0>;
+def : InstAlias<"blendvps\t{$src2, $dst|$dst, $src2}",
+ (BLENDVPSrm0 VR128:$dst, f128mem:$src2), 0>;
+def : InstAlias<"pblendvb\t{$src2, $dst|$dst, $src2}",
+ (PBLENDVBrr0 VR128:$dst, VR128:$src2), 0>;
+def : InstAlias<"pblendvb\t{$src2, $dst|$dst, $src2}",
+ (PBLENDVBrm0 VR128:$dst, i128mem:$src2), 0>;
let Predicates = [UseSSE41] in {
def : Pat<(v16i8 (vselect (v16i8 XMM0), (v16i8 VR128:$src1),
@@ -7228,17 +7098,14 @@ let AddedComplexity = 400 in { // Prefer non-temporal versions
let SchedRW = [WriteLoad] in {
let Predicates = [HasAVX, NoVLX] in
def VMOVNTDQArm : SS48I<0x2A, MRMSrcMem, (outs VR128:$dst), (ins i128mem:$src),
- "vmovntdqa\t{$src, $dst|$dst, $src}",
- [(set VR128:$dst, (int_x86_sse41_movntdqa addr:$src))]>,
- VEX;
+ "vmovntdqa\t{$src, $dst|$dst, $src}", []>,
+ VEX, VEX_WIG;
let Predicates = [HasAVX2, NoVLX] in
def VMOVNTDQAYrm : SS48I<0x2A, MRMSrcMem, (outs VR256:$dst), (ins i256mem:$src),
- "vmovntdqa\t{$src, $dst|$dst, $src}",
- [(set VR256:$dst, (int_x86_avx2_movntdqa addr:$src))]>,
- VEX, VEX_L;
+ "vmovntdqa\t{$src, $dst|$dst, $src}", []>,
+ VEX, VEX_L, VEX_WIG;
def MOVNTDQArm : SS48I<0x2A, MRMSrcMem, (outs VR128:$dst), (ins i128mem:$src),
- "movntdqa\t{$src, $dst|$dst, $src}",
- [(set VR128:$dst, (int_x86_sse41_movntdqa addr:$src))]>;
+ "movntdqa\t{$src, $dst|$dst, $src}", []>;
} // SchedRW
let Predicates = [HasAVX2, NoVLX] in {
@@ -7295,11 +7162,11 @@ multiclass SS42I_binop_rm<bits<8> opc, string OpcodeStr, SDNode OpNode,
let Predicates = [HasAVX] in
defm VPCMPGTQ : SS42I_binop_rm<0x37, "vpcmpgtq", X86pcmpgt, v2i64, VR128,
- loadv2i64, i128mem, 0>, VEX_4V;
+ loadv2i64, i128mem, 0>, VEX_4V, VEX_WIG;
let Predicates = [HasAVX2] in
defm VPCMPGTQY : SS42I_binop_rm<0x37, "vpcmpgtq", X86pcmpgt, v4i64, VR256,
- loadv4i64, i256mem, 0>, VEX_4V, VEX_L;
+ loadv4i64, i256mem, 0>, VEX_4V, VEX_L, VEX_WIG;
let Constraints = "$src1 = $dst" in
defm PCMPGTQ : SS42I_binop_rm<0x37, "pcmpgtq", X86pcmpgt, v2i64, VR128,
@@ -7323,7 +7190,7 @@ multiclass pseudo_pcmpistrm<string asm, PatFrag ld_frag> {
let Defs = [EFLAGS], usesCustomInserter = 1 in {
defm VPCMPISTRM128 : pseudo_pcmpistrm<"#VPCMPISTRM128", loadv2i64>,
- Requires<[HasAVX]>;
+ Requires<[HasAVX]>, VEX_WIG;
defm PCMPISTRM128 : pseudo_pcmpistrm<"#PCMPISTRM128", memopv2i64>,
Requires<[UseSSE42]>;
}
@@ -7397,7 +7264,7 @@ multiclass pseudo_pcmpistri<string asm, PatFrag ld_frag> {
let Defs = [EFLAGS], usesCustomInserter = 1 in {
defm VPCMPISTRI : pseudo_pcmpistri<"#VPCMPISTRI", loadv2i64>,
- Requires<[HasAVX]>;
+ Requires<[HasAVX]>, VEX_WIG;
defm PCMPISTRI : pseudo_pcmpistri<"#PCMPISTRI", memopv2i64>,
Requires<[UseSSE42]>;
}
@@ -7515,14 +7382,18 @@ multiclass SHAI_binop<bits<8> Opc, string OpcodeStr, Intrinsic IntId,
bit UsesXMM0 = 0> {
def rr : I<Opc, MRMSrcReg, (outs VR128:$dst),
(ins VR128:$src1, VR128:$src2),
- !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
+ !if(UsesXMM0,
+ !strconcat(OpcodeStr, "\t{%xmm0, $src2, $dst|$dst, $src2, xmm0}"),
+ !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}")),
[!if(UsesXMM0,
(set VR128:$dst, (IntId VR128:$src1, VR128:$src2, XMM0)),
(set VR128:$dst, (IntId VR128:$src1, VR128:$src2)))]>, T8;
def rm : I<Opc, MRMSrcMem, (outs VR128:$dst),
(ins VR128:$src1, i128mem:$src2),
- !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
+ !if(UsesXMM0,
+ !strconcat(OpcodeStr, "\t{%xmm0, $src2, $dst|$dst, $src2, xmm0}"),
+ !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}")),
[!if(UsesXMM0,
(set VR128:$dst, (IntId VR128:$src1,
(bc_v4i32 (memopv2i64 addr:$src2)), XMM0)),
@@ -7557,10 +7428,10 @@ let Constraints = "$src1 = $dst", Predicates = [HasSHA] in {
}
// Aliases with explicit %xmm0
-def : InstAlias<"sha256rnds2\t{%xmm0, $src2, $dst|$dst, $src2, xmm0}",
- (SHA256RNDS2rr VR128:$dst, VR128:$src2)>;
-def : InstAlias<"sha256rnds2\t{%xmm0, $src2, $dst|$dst, $src2, xmm0}",
- (SHA256RNDS2rm VR128:$dst, i128mem:$src2)>;
+def : InstAlias<"sha256rnds2\t{$src2, $dst|$dst, $src2}",
+ (SHA256RNDS2rr VR128:$dst, VR128:$src2), 0>;
+def : InstAlias<"sha256rnds2\t{$src2, $dst|$dst, $src2}",
+ (SHA256RNDS2rm VR128:$dst, i128mem:$src2), 0>;
//===----------------------------------------------------------------------===//
// AES-NI Instructions
@@ -7588,13 +7459,13 @@ multiclass AESI_binop_rm_int<bits<8> opc, string OpcodeStr, Intrinsic IntId128,
// Perform One Round of an AES Encryption/Decryption Flow
let Predicates = [HasAVX, HasAES] in {
defm VAESENC : AESI_binop_rm_int<0xDC, "vaesenc",
- int_x86_aesni_aesenc, loadv2i64, 0>, VEX_4V;
+ int_x86_aesni_aesenc, loadv2i64, 0>, VEX_4V, VEX_WIG;
defm VAESENCLAST : AESI_binop_rm_int<0xDD, "vaesenclast",
- int_x86_aesni_aesenclast, loadv2i64, 0>, VEX_4V;
+ int_x86_aesni_aesenclast, loadv2i64, 0>, VEX_4V, VEX_WIG;
defm VAESDEC : AESI_binop_rm_int<0xDE, "vaesdec",
- int_x86_aesni_aesdec, loadv2i64, 0>, VEX_4V;
+ int_x86_aesni_aesdec, loadv2i64, 0>, VEX_4V, VEX_WIG;
defm VAESDECLAST : AESI_binop_rm_int<0xDF, "vaesdeclast",
- int_x86_aesni_aesdeclast, loadv2i64, 0>, VEX_4V;
+ int_x86_aesni_aesdeclast, loadv2i64, 0>, VEX_4V, VEX_WIG;
}
let Constraints = "$src1 = $dst" in {
@@ -7615,12 +7486,12 @@ let Predicates = [HasAVX, HasAES] in {
"vaesimc\t{$src1, $dst|$dst, $src1}",
[(set VR128:$dst,
(int_x86_aesni_aesimc VR128:$src1))]>, Sched<[WriteAESIMC]>,
- VEX;
+ VEX, VEX_WIG;
def VAESIMCrm : AES8I<0xDB, MRMSrcMem, (outs VR128:$dst),
(ins i128mem:$src1),
"vaesimc\t{$src1, $dst|$dst, $src1}",
[(set VR128:$dst, (int_x86_aesni_aesimc (loadv2i64 addr:$src1)))]>,
- Sched<[WriteAESIMCLd]>, VEX;
+ Sched<[WriteAESIMCLd]>, VEX, VEX_WIG;
}
def AESIMCrr : AES8I<0xDB, MRMSrcReg, (outs VR128:$dst),
(ins VR128:$src1),
@@ -7640,13 +7511,13 @@ let Predicates = [HasAVX, HasAES] in {
"vaeskeygenassist\t{$src2, $src1, $dst|$dst, $src1, $src2}",
[(set VR128:$dst,
(int_x86_aesni_aeskeygenassist VR128:$src1, imm:$src2))]>,
- Sched<[WriteAESKeyGen]>, VEX;
+ Sched<[WriteAESKeyGen]>, VEX, VEX_WIG;
def VAESKEYGENASSIST128rm : AESAI<0xDF, MRMSrcMem, (outs VR128:$dst),
(ins i128mem:$src1, u8imm:$src2),
"vaeskeygenassist\t{$src2, $src1, $dst|$dst, $src1, $src2}",
[(set VR128:$dst,
(int_x86_aesni_aeskeygenassist (loadv2i64 addr:$src1), imm:$src2))]>,
- Sched<[WriteAESKeyGenLd]>, VEX;
+ Sched<[WriteAESKeyGenLd]>, VEX, VEX_WIG;
}
def AESKEYGENASSIST128rr : AESAI<0xDF, MRMSrcReg, (outs VR128:$dst),
(ins VR128:$src1, u8imm:$src2),
@@ -7672,14 +7543,14 @@ def VPCLMULQDQrr : AVXPCLMULIi8<0x44, MRMSrcReg, (outs VR128:$dst),
"vpclmulqdq\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}",
[(set VR128:$dst,
(int_x86_pclmulqdq VR128:$src1, VR128:$src2, imm:$src3))]>,
- Sched<[WriteCLMul]>;
+ Sched<[WriteCLMul]>, VEX_WIG;
def VPCLMULQDQrm : AVXPCLMULIi8<0x44, MRMSrcMem, (outs VR128:$dst),
(ins VR128:$src1, i128mem:$src2, u8imm:$src3),
"vpclmulqdq\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}",
[(set VR128:$dst, (int_x86_pclmulqdq VR128:$src1,
(loadv2i64 addr:$src2), imm:$src3))]>,
- Sched<[WriteCLMulLd, ReadAfterLd]>;
+ Sched<[WriteCLMulLd, ReadAfterLd]>, VEX_WIG;
// Carry-less Multiplication instructions
let Constraints = "$src1 = $dst" in {
@@ -7879,6 +7750,15 @@ def VINSERTF128rm : AVXAIi8<0x18, MRMSrcMem, (outs VR256:$dst),
[]>, Sched<[WriteFShuffleLd, ReadAfterLd]>, VEX_4V, VEX_L;
}
+
+// Without AVX2 we need to concatenate two v4i32 V_SETALLONES to create a
+// 256-bit all-ones value.
+let Predicates = [HasAVX1Only] in
+def : Pat<(v8i32 immAllOnesV),
+ (VINSERTF128rr
+ (INSERT_SUBREG (v8i32 (IMPLICIT_DEF)), (V_SETALLONES), sub_xmm),
+ (V_SETALLONES), 1)>;
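+// V_SETALLONES is the 128-bit all-ones pseudo (materialized as, in effect, a
+// pcmpeqd of a register with itself), so this should expand to a
+// compare-with-self plus a vinsertf128 rather than a constant-pool load.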
+
multiclass vinsert_lowering<string InstrStr, ValueType From, ValueType To,
PatFrag memop_frag> {
def : Pat<(vinsert128_insert:$ins (To VR256:$src1), (From VR128:$src2),
@@ -8029,41 +7909,6 @@ let ExeDomain = SSEPackedDouble in {
loadv4i64, v4f64, v4i64>, VEX_L;
}
-let Predicates = [HasAVX, NoVLX] in {
-def : Pat<(v8f32 (X86VPermilpv VR256:$src1, (v8i32 VR256:$src2))),
- (VPERMILPSYrr VR256:$src1, VR256:$src2)>;
-def : Pat<(v8f32 (X86VPermilpv VR256:$src1, (bc_v8i32 (loadv4i64 addr:$src2)))),
- (VPERMILPSYrm VR256:$src1, addr:$src2)>;
-def : Pat<(v4f64 (X86VPermilpv VR256:$src1, (v4i64 VR256:$src2))),
- (VPERMILPDYrr VR256:$src1, VR256:$src2)>;
-def : Pat<(v4f64 (X86VPermilpv VR256:$src1, (loadv4i64 addr:$src2))),
- (VPERMILPDYrm VR256:$src1, addr:$src2)>;
-
-def : Pat<(v8i32 (X86VPermilpi VR256:$src1, (i8 imm:$imm))),
- (VPERMILPSYri VR256:$src1, imm:$imm)>;
-def : Pat<(v4i64 (X86VPermilpi VR256:$src1, (i8 imm:$imm))),
- (VPERMILPDYri VR256:$src1, imm:$imm)>;
-def : Pat<(v8i32 (X86VPermilpi (bc_v8i32 (loadv4i64 addr:$src1)),
- (i8 imm:$imm))),
- (VPERMILPSYmi addr:$src1, imm:$imm)>;
-def : Pat<(v4i64 (X86VPermilpi (loadv4i64 addr:$src1), (i8 imm:$imm))),
- (VPERMILPDYmi addr:$src1, imm:$imm)>;
-
-def : Pat<(v4f32 (X86VPermilpv VR128:$src1, (v4i32 VR128:$src2))),
- (VPERMILPSrr VR128:$src1, VR128:$src2)>;
-def : Pat<(v4f32 (X86VPermilpv VR128:$src1, (bc_v4i32 (loadv2i64 addr:$src2)))),
- (VPERMILPSrm VR128:$src1, addr:$src2)>;
-def : Pat<(v2f64 (X86VPermilpv VR128:$src1, (v2i64 VR128:$src2))),
- (VPERMILPDrr VR128:$src1, VR128:$src2)>;
-def : Pat<(v2f64 (X86VPermilpv VR128:$src1, (loadv2i64 addr:$src2))),
- (VPERMILPDrm VR128:$src1, addr:$src2)>;
-
-def : Pat<(v2i64 (X86VPermilpi VR128:$src1, (i8 imm:$imm))),
- (VPERMILPDri VR128:$src1, imm:$imm)>;
-def : Pat<(v2i64 (X86VPermilpi (loadv2i64 addr:$src1), (i8 imm:$imm))),
- (VPERMILPDmi addr:$src1, imm:$imm)>;
-}
-
//===----------------------------------------------------------------------===//
// VPERM2F128 - Permute Floating-Point Values in 128-bit chunks
//
@@ -8118,15 +7963,16 @@ def : Pat<(v16i16 (X86VPerm2x128 VR256:$src1,
//===----------------------------------------------------------------------===//
// VZERO - Zero YMM registers
//
+// Note: these instructions do not affect YMM16-YMM31.
let Defs = [YMM0, YMM1, YMM2, YMM3, YMM4, YMM5, YMM6, YMM7,
YMM8, YMM9, YMM10, YMM11, YMM12, YMM13, YMM14, YMM15] in {
// Zero All YMM registers
def VZEROALL : I<0x77, RawFrm, (outs), (ins), "vzeroall",
- [(int_x86_avx_vzeroall)]>, PS, VEX, VEX_L, Requires<[HasAVX]>;
+ [(int_x86_avx_vzeroall)]>, PS, VEX, VEX_L, Requires<[HasAVX]>, VEX_WIG;
// Zero Upper bits of YMM registers
def VZEROUPPER : I<0x77, RawFrm, (outs), (ins), "vzeroupper",
- [(int_x86_avx_vzeroupper)]>, PS, VEX, Requires<[HasAVX]>;
+ [(int_x86_avx_vzeroupper)]>, PS, VEX, Requires<[HasAVX]>, VEX_WIG;
}
//===----------------------------------------------------------------------===//
@@ -8235,6 +8081,46 @@ defm VPBLENDD : AVX2_binop_rmi<0x02, "vpblendd", X86Blendi, v4i32,
defm VPBLENDDY : AVX2_binop_rmi<0x02, "vpblendd", X86Blendi, v8i32,
VR256, loadv4i64, i256mem>, VEX_L;
+// For insertion into the zero index (low half) of a 256-bit vector, it is
+// more efficient to generate a blend with an immediate than an insert*128.
+let Predicates = [HasAVX2] in {
+def : Pat<(insert_subvector (v8i32 VR256:$src1), (v4i32 VR128:$src2), (iPTR 0)),
+ (VPBLENDDYrri VR256:$src1,
+ (INSERT_SUBREG (v8i32 (IMPLICIT_DEF)),
+ VR128:$src2, sub_xmm), 0xf)>;
+def : Pat<(insert_subvector (v4i64 VR256:$src1), (v2i64 VR128:$src2), (iPTR 0)),
+ (VPBLENDDYrri VR256:$src1,
+ (INSERT_SUBREG (v8i32 (IMPLICIT_DEF)),
+ VR128:$src2, sub_xmm), 0xf)>;
+def : Pat<(insert_subvector (v16i16 VR256:$src1), (v8i16 VR128:$src2), (iPTR 0)),
+ (VPBLENDDYrri VR256:$src1,
+ (INSERT_SUBREG (v8i32 (IMPLICIT_DEF)),
+ VR128:$src2, sub_xmm), 0xf)>;
+def : Pat<(insert_subvector (v32i8 VR256:$src1), (v16i8 VR128:$src2), (iPTR 0)),
+ (VPBLENDDYrri VR256:$src1,
+ (INSERT_SUBREG (v8i32 (IMPLICIT_DEF)),
+ VR128:$src2, sub_xmm), 0xf)>;
+}
+
+let Predicates = [HasAVX1Only] in {
+def : Pat<(insert_subvector (v8i32 VR256:$src1), (v4i32 VR128:$src2), (iPTR 0)),
+ (VBLENDPSYrri VR256:$src1,
+ (INSERT_SUBREG (v8i32 (IMPLICIT_DEF)),
+ VR128:$src2, sub_xmm), 0xf)>;
+def : Pat<(insert_subvector (v4i64 VR256:$src1), (v2i64 VR128:$src2), (iPTR 0)),
+ (VBLENDPSYrri VR256:$src1,
+ (INSERT_SUBREG (v8i32 (IMPLICIT_DEF)),
+ VR128:$src2, sub_xmm), 0xf)>;
+def : Pat<(insert_subvector (v16i16 VR256:$src1), (v8i16 VR128:$src2), (iPTR 0)),
+ (VBLENDPSYrri VR256:$src1,
+ (INSERT_SUBREG (v8i32 (IMPLICIT_DEF)),
+ VR128:$src2, sub_xmm), 0xf)>;
+def : Pat<(insert_subvector (v32i8 VR256:$src1), (v16i8 VR128:$src2), (iPTR 0)),
+ (VBLENDPSYrri VR256:$src1,
+ (INSERT_SUBREG (v8i32 (IMPLICIT_DEF)),
+ VR128:$src2, sub_xmm), 0xf)>;
+}
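+// VPBLENDD requires AVX2, so under AVX1-only the integer inserts above are
+// routed through VBLENDPSY instead; the float blend is bitwise-exact for this
+// lane-select use, at worst costing a domain crossing.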
+
//===----------------------------------------------------------------------===//
// VPBROADCAST - Load from memory and broadcast to all elements of the
// destination operand
@@ -8282,6 +8168,11 @@ defm VPBROADCASTQ : avx2_broadcast<0x59, "vpbroadcastq", i64mem, loadi64,
v2i64, v4i64, NoVLX>;
let Predicates = [HasAVX2, NoVLX_Or_NoBWI] in {
+  // 32-bit targets will fail to load an i64 directly but can use ZEXT_LOAD.
+ def : Pat<(v2i64 (X86VBroadcast (v2i64 (X86vzload addr:$src)))),
+ (VPBROADCASTQrm addr:$src)>;
+ def : Pat<(v4i64 (X86VBroadcast (v4i64 (X86vzload addr:$src)))),
+ (VPBROADCASTQYrm addr:$src)>;
  // loadi16 is tricky to fold because !isTypeDesirableForOp justifiably
  // rejects i16 operations.
// This means we'll encounter truncated i32 loads; match that here.
def : Pat<(v8i16 (X86VBroadcast (i16 (trunc (i32 (load addr:$src)))))),
@@ -8296,7 +8187,7 @@ let Predicates = [HasAVX2, NoVLX_Or_NoBWI] in {
(VPBROADCASTWYrm addr:$src)>;
}
-let Predicates = [HasAVX2] in {
+let Predicates = [HasAVX2, NoVLX] in {
// Provide aliases for broadcast from the same register class that
// automatically does the extract.
def : Pat<(v8f32 (X86VBroadcast (v8f32 VR256:$src))),
@@ -8343,18 +8234,13 @@ let Predicates = [HasAVX2, NoVLX_Or_NoBWI] in {
}
let Predicates = [HasAVX2, NoVLX] in {
def : Pat<(v4i32 (X86VBroadcast GR32:$src)),
- (VBROADCASTSSrr (COPY_TO_REGCLASS GR32:$src, VR128))>;
+ (VPBROADCASTDrr (COPY_TO_REGCLASS GR32:$src, VR128))>;
def : Pat<(v8i32 (X86VBroadcast GR32:$src)),
- (VBROADCASTSSYrr (COPY_TO_REGCLASS GR32:$src, VR128))>;
- def : Pat<(v4i64 (X86VBroadcast GR64:$src)),
- (VBROADCASTSDYrr (COPY_TO_REGCLASS GR64:$src, VR128))>;
-
- // The patterns for VPBROADCASTD are not needed because they would match
- // the exact same thing as VBROADCASTSS patterns.
-
+ (VPBROADCASTDYrr (COPY_TO_REGCLASS GR32:$src, VR128))>;
def : Pat<(v2i64 (X86VBroadcast GR64:$src)),
- (VPBROADCASTQrr (COPY_TO_REGCLASS GR64:$src, VR128))>;
- // The v4i64 pattern is not needed because VBROADCASTSDYrr already match.
+ (VPBROADCASTQrr (COPY_TO_REGCLASS GR64:$src, VR128))>;
+ def : Pat<(v4i64 (X86VBroadcast GR64:$src)),
+ (VPBROADCASTQYrr (COPY_TO_REGCLASS GR64:$src, VR128))>;
}
// AVX1 broadcast patterns
@@ -8377,15 +8263,15 @@ let Predicates = [HasAVX, NoVLX] in {
let Predicates = [HasAVX1Only] in {
def : Pat<(v4f32 (X86VBroadcast FR32:$src)),
- (VPSHUFDri (COPY_TO_REGCLASS FR32:$src, VR128), 0)>;
+ (VPERMILPSri (COPY_TO_REGCLASS FR32:$src, VR128), 0)>;
def : Pat<(v8f32 (X86VBroadcast FR32:$src)),
(VINSERTF128rr (INSERT_SUBREG (v8f32 (IMPLICIT_DEF)),
- (VPSHUFDri (COPY_TO_REGCLASS FR32:$src, VR128), 0), sub_xmm),
- (VPSHUFDri (COPY_TO_REGCLASS FR32:$src, VR128), 0), 1)>;
+ (VPERMILPSri (COPY_TO_REGCLASS FR32:$src, VR128), 0), sub_xmm),
+ (VPERMILPSri (COPY_TO_REGCLASS FR32:$src, VR128), 0), 1)>;
def : Pat<(v4f64 (X86VBroadcast FR64:$src)),
(VINSERTF128rr (INSERT_SUBREG (v4f64 (IMPLICIT_DEF)),
- (VPSHUFDri (COPY_TO_REGCLASS FR64:$src, VR128), 0x44), sub_xmm),
- (VPSHUFDri (COPY_TO_REGCLASS FR64:$src, VR128), 0x44), 1)>;
+ (VMOVDDUPrr (COPY_TO_REGCLASS FR64:$src, VR128)), sub_xmm),
+ (VMOVDDUPrr (COPY_TO_REGCLASS FR64:$src, VR128)), 1)>;
def : Pat<(v4i32 (X86VBroadcast GR32:$src)),
(VPSHUFDri (COPY_TO_REGCLASS GR32:$src, VR128), 0)>;
@@ -8399,7 +8285,7 @@ let Predicates = [HasAVX1Only] in {
(VPSHUFDri (COPY_TO_REGCLASS GR64:$src, VR128), 0x44), 1)>;
def : Pat<(v2i64 (X86VBroadcast i64:$src)),
- (VMOVDDUPrr (COPY_TO_REGCLASS GR64:$src, VR128))>;
+ (VPSHUFDri (COPY_TO_REGCLASS GR64:$src, VR128), 0x44)>;
}
//===----------------------------------------------------------------------===//
@@ -8407,7 +8293,8 @@ let Predicates = [HasAVX1Only] in {
//
multiclass avx2_perm<bits<8> opc, string OpcodeStr, PatFrag mem_frag,
- ValueType OpVT, X86FoldableSchedWrite Sched> {
+ ValueType OpVT, X86FoldableSchedWrite Sched,
+ X86MemOperand memOp> {
let Predicates = [HasAVX2, NoVLX] in {
def Yrr : AVX28I<opc, MRMSrcReg, (outs VR256:$dst),
(ins VR256:$src1, VR256:$src2),
@@ -8417,7 +8304,7 @@ multiclass avx2_perm<bits<8> opc, string OpcodeStr, PatFrag mem_frag,
(OpVT (X86VPermv VR256:$src1, VR256:$src2)))]>,
Sched<[Sched]>, VEX_4V, VEX_L;
def Yrm : AVX28I<opc, MRMSrcMem, (outs VR256:$dst),
- (ins VR256:$src1, i256mem:$src2),
+ (ins VR256:$src1, memOp:$src2),
!strconcat(OpcodeStr,
"\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
[(set VR256:$dst,
@@ -8427,12 +8314,15 @@ multiclass avx2_perm<bits<8> opc, string OpcodeStr, PatFrag mem_frag,
}
}
-defm VPERMD : avx2_perm<0x36, "vpermd", loadv4i64, v8i32, WriteShuffle256>;
+defm VPERMD : avx2_perm<0x36, "vpermd", loadv4i64, v8i32, WriteShuffle256,
+ i256mem>;
let ExeDomain = SSEPackedSingle in
-defm VPERMPS : avx2_perm<0x16, "vpermps", loadv8f32, v8f32, WriteFShuffle256>;
+defm VPERMPS : avx2_perm<0x16, "vpermps", loadv8f32, v8f32, WriteFShuffle256,
+ f256mem>;
multiclass avx2_perm_imm<bits<8> opc, string OpcodeStr, PatFrag mem_frag,
- ValueType OpVT, X86FoldableSchedWrite Sched> {
+ ValueType OpVT, X86FoldableSchedWrite Sched,
+ X86MemOperand memOp> {
let Predicates = [HasAVX2, NoVLX] in {
def Yri : AVX2AIi8<opc, MRMSrcReg, (outs VR256:$dst),
(ins VR256:$src1, u8imm:$src2),
@@ -8442,7 +8332,7 @@ multiclass avx2_perm_imm<bits<8> opc, string OpcodeStr, PatFrag mem_frag,
(OpVT (X86VPermi VR256:$src1, (i8 imm:$src2))))]>,
Sched<[Sched]>, VEX, VEX_L;
def Ymi : AVX2AIi8<opc, MRMSrcMem, (outs VR256:$dst),
- (ins i256mem:$src1, u8imm:$src2),
+ (ins memOp:$src1, u8imm:$src2),
!strconcat(OpcodeStr,
"\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
[(set VR256:$dst,
@@ -8453,10 +8343,10 @@ multiclass avx2_perm_imm<bits<8> opc, string OpcodeStr, PatFrag mem_frag,
}
defm VPERMQ : avx2_perm_imm<0x00, "vpermq", loadv4i64, v4i64,
- WriteShuffle256>, VEX_W;
+ WriteShuffle256, i256mem>, VEX_W;
let ExeDomain = SSEPackedDouble in
defm VPERMPD : avx2_perm_imm<0x01, "vpermpd", loadv4f64, v4f64,
- WriteFShuffle256>, VEX_W;
+ WriteFShuffle256, f256mem>, VEX_W;
//===----------------------------------------------------------------------===//
// VPERM2I128 - Permute Floating-Point Values in 128-bit chunks
diff --git a/lib/Target/X86/X86InstrShiftRotate.td b/lib/Target/X86/X86InstrShiftRotate.td
index e2be73532157..0efb383e1c8d 100644
--- a/lib/Target/X86/X86InstrShiftRotate.td
+++ b/lib/Target/X86/X86InstrShiftRotate.td
@@ -340,75 +340,71 @@ def SAR64m1 : RI<0xD1, MRM7m, (outs), (ins i64mem:$dst),
let hasSideEffects = 0 in {
let Constraints = "$src1 = $dst", SchedRW = [WriteShift] in {
+
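+// RCL/RCR rotate through the carry flag, so every form reads EFLAGS; the
+// variable-count forms additionally read CL.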
+let Uses = [CL, EFLAGS] in {
+def RCL8rCL : I<0xD2, MRM2r, (outs GR8:$dst), (ins GR8:$src1),
+ "rcl{b}\t{%cl, $dst|$dst, cl}", [], IIC_SR>;
+def RCL16rCL : I<0xD3, MRM2r, (outs GR16:$dst), (ins GR16:$src1),
+ "rcl{w}\t{%cl, $dst|$dst, cl}", [], IIC_SR>, OpSize16;
+def RCL32rCL : I<0xD3, MRM2r, (outs GR32:$dst), (ins GR32:$src1),
+ "rcl{l}\t{%cl, $dst|$dst, cl}", [], IIC_SR>, OpSize32;
+def RCL64rCL : RI<0xD3, MRM2r, (outs GR64:$dst), (ins GR64:$src1),
+ "rcl{q}\t{%cl, $dst|$dst, cl}", [], IIC_SR>;
+} // Uses = [CL, EFLAGS]
+
+let Uses = [EFLAGS] in {
def RCL8r1 : I<0xD0, MRM2r, (outs GR8:$dst), (ins GR8:$src1),
"rcl{b}\t$dst", [], IIC_SR>;
def RCL8ri : Ii8<0xC0, MRM2r, (outs GR8:$dst), (ins GR8:$src1, u8imm:$cnt),
"rcl{b}\t{$cnt, $dst|$dst, $cnt}", [], IIC_SR>;
-let Uses = [CL] in
-def RCL8rCL : I<0xD2, MRM2r, (outs GR8:$dst), (ins GR8:$src1),
- "rcl{b}\t{%cl, $dst|$dst, cl}", [], IIC_SR>;
-
def RCL16r1 : I<0xD1, MRM2r, (outs GR16:$dst), (ins GR16:$src1),
"rcl{w}\t$dst", [], IIC_SR>, OpSize16;
def RCL16ri : Ii8<0xC1, MRM2r, (outs GR16:$dst), (ins GR16:$src1, u8imm:$cnt),
"rcl{w}\t{$cnt, $dst|$dst, $cnt}", [], IIC_SR>, OpSize16;
-let Uses = [CL] in
-def RCL16rCL : I<0xD3, MRM2r, (outs GR16:$dst), (ins GR16:$src1),
- "rcl{w}\t{%cl, $dst|$dst, cl}", [], IIC_SR>, OpSize16;
-
def RCL32r1 : I<0xD1, MRM2r, (outs GR32:$dst), (ins GR32:$src1),
"rcl{l}\t$dst", [], IIC_SR>, OpSize32;
def RCL32ri : Ii8<0xC1, MRM2r, (outs GR32:$dst), (ins GR32:$src1, u8imm:$cnt),
"rcl{l}\t{$cnt, $dst|$dst, $cnt}", [], IIC_SR>, OpSize32;
-let Uses = [CL] in
-def RCL32rCL : I<0xD3, MRM2r, (outs GR32:$dst), (ins GR32:$src1),
- "rcl{l}\t{%cl, $dst|$dst, cl}", [], IIC_SR>, OpSize32;
-
-
def RCL64r1 : RI<0xD1, MRM2r, (outs GR64:$dst), (ins GR64:$src1),
"rcl{q}\t$dst", [], IIC_SR>;
def RCL64ri : RIi8<0xC1, MRM2r, (outs GR64:$dst), (ins GR64:$src1, u8imm:$cnt),
"rcl{q}\t{$cnt, $dst|$dst, $cnt}", [], IIC_SR>;
-let Uses = [CL] in
-def RCL64rCL : RI<0xD3, MRM2r, (outs GR64:$dst), (ins GR64:$src1),
- "rcl{q}\t{%cl, $dst|$dst, cl}", [], IIC_SR>;
+} // Uses = [EFLAGS]
+let Uses = [CL, EFLAGS] in {
+def RCR8rCL : I<0xD2, MRM3r, (outs GR8:$dst), (ins GR8:$src1),
+ "rcr{b}\t{%cl, $dst|$dst, cl}", [], IIC_SR>;
+def RCR16rCL : I<0xD3, MRM3r, (outs GR16:$dst), (ins GR16:$src1),
+ "rcr{w}\t{%cl, $dst|$dst, cl}", [], IIC_SR>, OpSize16;
+def RCR32rCL : I<0xD3, MRM3r, (outs GR32:$dst), (ins GR32:$src1),
+ "rcr{l}\t{%cl, $dst|$dst, cl}", [], IIC_SR>, OpSize32;
+def RCR64rCL : RI<0xD3, MRM3r, (outs GR64:$dst), (ins GR64:$src1),
+ "rcr{q}\t{%cl, $dst|$dst, cl}", [], IIC_SR>;
+} // Uses = [CL, EFLAGS]
+let Uses = [EFLAGS] in {
def RCR8r1 : I<0xD0, MRM3r, (outs GR8:$dst), (ins GR8:$src1),
"rcr{b}\t$dst", [], IIC_SR>;
def RCR8ri : Ii8<0xC0, MRM3r, (outs GR8:$dst), (ins GR8:$src1, u8imm:$cnt),
"rcr{b}\t{$cnt, $dst|$dst, $cnt}", [], IIC_SR>;
-let Uses = [CL] in
-def RCR8rCL : I<0xD2, MRM3r, (outs GR8:$dst), (ins GR8:$src1),
- "rcr{b}\t{%cl, $dst|$dst, cl}", [], IIC_SR>;
-
def RCR16r1 : I<0xD1, MRM3r, (outs GR16:$dst), (ins GR16:$src1),
"rcr{w}\t$dst", [], IIC_SR>, OpSize16;
def RCR16ri : Ii8<0xC1, MRM3r, (outs GR16:$dst), (ins GR16:$src1, u8imm:$cnt),
"rcr{w}\t{$cnt, $dst|$dst, $cnt}", [], IIC_SR>, OpSize16;
-let Uses = [CL] in
-def RCR16rCL : I<0xD3, MRM3r, (outs GR16:$dst), (ins GR16:$src1),
- "rcr{w}\t{%cl, $dst|$dst, cl}", [], IIC_SR>, OpSize16;
-
def RCR32r1 : I<0xD1, MRM3r, (outs GR32:$dst), (ins GR32:$src1),
"rcr{l}\t$dst", [], IIC_SR>, OpSize32;
def RCR32ri : Ii8<0xC1, MRM3r, (outs GR32:$dst), (ins GR32:$src1, u8imm:$cnt),
"rcr{l}\t{$cnt, $dst|$dst, $cnt}", [], IIC_SR>, OpSize32;
-let Uses = [CL] in
-def RCR32rCL : I<0xD3, MRM3r, (outs GR32:$dst), (ins GR32:$src1),
- "rcr{l}\t{%cl, $dst|$dst, cl}", [], IIC_SR>, OpSize32;
-
def RCR64r1 : RI<0xD1, MRM3r, (outs GR64:$dst), (ins GR64:$src1),
"rcr{q}\t$dst", [], IIC_SR>;
def RCR64ri : RIi8<0xC1, MRM3r, (outs GR64:$dst), (ins GR64:$src1, u8imm:$cnt),
"rcr{q}\t{$cnt, $dst|$dst, $cnt}", [], IIC_SR>;
-let Uses = [CL] in
-def RCR64rCL : RI<0xD3, MRM3r, (outs GR64:$dst), (ins GR64:$src1),
- "rcr{q}\t{%cl, $dst|$dst, cl}", [], IIC_SR>;
+} // Uses = [EFLAGS]
} // Constraints = "$src1 = $dst"
-let SchedRW = [WriteShiftLd, WriteRMW] in {
+let SchedRW = [WriteShiftLd, WriteRMW], mayStore = 1 in {
+let Uses = [EFLAGS] in {
def RCL8m1 : I<0xD0, MRM2m, (outs), (ins i8mem:$dst),
"rcl{b}\t$dst", [], IIC_SR>;
def RCL8mi : Ii8<0xC0, MRM2m, (outs), (ins i8mem:$dst, u8imm:$cnt),
@@ -442,8 +438,9 @@ def RCR64m1 : RI<0xD1, MRM3m, (outs), (ins i64mem:$dst),
"rcr{q}\t$dst", [], IIC_SR>;
def RCR64mi : RIi8<0xC1, MRM3m, (outs), (ins i64mem:$dst, u8imm:$cnt),
"rcr{q}\t{$cnt, $dst|$dst, $cnt}", [], IIC_SR>;
+} // Uses = [EFLAGS]
-let Uses = [CL] in {
+let Uses = [CL, EFLAGS] in {
def RCL8mCL : I<0xD2, MRM2m, (outs), (ins i8mem:$dst),
"rcl{b}\t{%cl, $dst|$dst, cl}", [], IIC_SR>;
def RCL16mCL : I<0xD3, MRM2m, (outs), (ins i16mem:$dst),
@@ -461,7 +458,7 @@ def RCR32mCL : I<0xD3, MRM3m, (outs), (ins i32mem:$dst),
"rcr{l}\t{%cl, $dst|$dst, cl}", [], IIC_SR>, OpSize32;
def RCR64mCL : RI<0xD3, MRM3m, (outs), (ins i64mem:$dst),
"rcr{q}\t{%cl, $dst|$dst, cl}", [], IIC_SR>;
-}
+} // Uses = [CL, EFLAGS]
} // SchedRW
} // hasSideEffects = 0
@@ -665,19 +662,19 @@ def ROR64mi : RIi8<0xC1, MRM1m, (outs), (ins i64mem:$dst, u8imm:$src),
// Rotate by 1
def ROR8m1 : I<0xD0, MRM1m, (outs), (ins i8mem :$dst),
"ror{b}\t$dst",
- [(store (rotr (loadi8 addr:$dst), (i8 1)), addr:$dst)],
+ [(store (rotl (loadi8 addr:$dst), (i8 7)), addr:$dst)],
IIC_SR>;
def ROR16m1 : I<0xD1, MRM1m, (outs), (ins i16mem:$dst),
"ror{w}\t$dst",
- [(store (rotr (loadi16 addr:$dst), (i8 1)), addr:$dst)],
+ [(store (rotl (loadi16 addr:$dst), (i8 15)), addr:$dst)],
IIC_SR>, OpSize16;
def ROR32m1 : I<0xD1, MRM1m, (outs), (ins i32mem:$dst),
"ror{l}\t$dst",
- [(store (rotr (loadi32 addr:$dst), (i8 1)), addr:$dst)],
+ [(store (rotl (loadi32 addr:$dst), (i8 31)), addr:$dst)],
IIC_SR>, OpSize32;
def ROR64m1 : RI<0xD1, MRM1m, (outs), (ins i64mem:$dst),
"ror{q}\t$dst",
- [(store (rotr (loadi64 addr:$dst), (i8 1)), addr:$dst)],
+ [(store (rotl (loadi64 addr:$dst), (i8 63)), addr:$dst)],
IIC_SR>;
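// A rotate right by 1 is equivalent to a rotate left by width-1 (e.g. for
// i32, rotr x, 1 == rotl x, 31), so these memory forms still match once the
// rotate reaches instruction selection in rotl form.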
} // SchedRW
@@ -849,6 +846,15 @@ def SHRD64mri8 : RIi8<0xAC, MRMDestMem,
} // Defs = [EFLAGS]
+// On Sandy Bridge and newer Intel processors, rotates by immediate are faster
+// when emitted as SHLD, which avoids the partial flag update incurred by the
+// normal rotate instructions.
+let Predicates = [HasFastSHLDRotate], AddedComplexity = 5 in {
+ def : Pat<(rotl GR32:$src, (i8 imm:$shamt)),
+ (SHLD32rri8 GR32:$src, GR32:$src, imm:$shamt)>;
+ def : Pat<(rotl GR64:$src, (i8 imm:$shamt)),
+ (SHLD64rri8 GR64:$src, GR64:$src, imm:$shamt)>;
+}
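+// SHLD with identical register operands is a plain rotate left, e.g.
+// shld $5, %eax, %eax == rol $5, %eax, but unlike ROL it fully redefines the
+// arithmetic flags, so no partial EFLAGS merge is required.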
+
def ROT32L2R_imm8 : SDNodeXForm<imm, [{
// Convert a ROTL shamt to a ROTR shamt on a 32-bit integer.
return getI8Imm(32 - N->getZExtValue(), SDLoc(N));
diff --git a/lib/Target/X86/X86InstrSystem.td b/lib/Target/X86/X86InstrSystem.td
index 9265d64b3230..2e5350ce979e 100644
--- a/lib/Target/X86/X86InstrSystem.td
+++ b/lib/Target/X86/X86InstrSystem.td
@@ -173,27 +173,28 @@ def MOV32rs : I<0x8C, MRMDestReg, (outs GR32:$dst), (ins SEGMENT_REG:$src),
"mov{l}\t{$src, $dst|$dst, $src}", [], IIC_MOV_REG_SR>, OpSize32;
def MOV64rs : RI<0x8C, MRMDestReg, (outs GR64:$dst), (ins SEGMENT_REG:$src),
"mov{q}\t{$src, $dst|$dst, $src}", [], IIC_MOV_REG_SR>;
-
+let mayStore = 1 in {
def MOV16ms : I<0x8C, MRMDestMem, (outs), (ins i16mem:$dst, SEGMENT_REG:$src),
"mov{w}\t{$src, $dst|$dst, $src}", [], IIC_MOV_MEM_SR>, OpSize16;
def MOV32ms : I<0x8C, MRMDestMem, (outs), (ins i32mem:$dst, SEGMENT_REG:$src),
"mov{l}\t{$src, $dst|$dst, $src}", [], IIC_MOV_MEM_SR>, OpSize32;
def MOV64ms : RI<0x8C, MRMDestMem, (outs), (ins i64mem:$dst, SEGMENT_REG:$src),
"mov{q}\t{$src, $dst|$dst, $src}", [], IIC_MOV_MEM_SR>;
-
+}
def MOV16sr : I<0x8E, MRMSrcReg, (outs SEGMENT_REG:$dst), (ins GR16:$src),
"mov{w}\t{$src, $dst|$dst, $src}", [], IIC_MOV_SR_REG>, OpSize16;
def MOV32sr : I<0x8E, MRMSrcReg, (outs SEGMENT_REG:$dst), (ins GR32:$src),
"mov{l}\t{$src, $dst|$dst, $src}", [], IIC_MOV_SR_REG>, OpSize32;
def MOV64sr : RI<0x8E, MRMSrcReg, (outs SEGMENT_REG:$dst), (ins GR64:$src),
"mov{q}\t{$src, $dst|$dst, $src}", [], IIC_MOV_SR_REG>;
-
+let mayLoad = 1 in {
def MOV16sm : I<0x8E, MRMSrcMem, (outs SEGMENT_REG:$dst), (ins i16mem:$src),
"mov{w}\t{$src, $dst|$dst, $src}", [], IIC_MOV_SR_MEM>, OpSize16;
def MOV32sm : I<0x8E, MRMSrcMem, (outs SEGMENT_REG:$dst), (ins i32mem:$src),
"mov{l}\t{$src, $dst|$dst, $src}", [], IIC_MOV_SR_MEM>, OpSize32;
def MOV64sm : RI<0x8E, MRMSrcMem, (outs SEGMENT_REG:$dst), (ins i64mem:$src),
"mov{q}\t{$src, $dst|$dst, $src}", [], IIC_MOV_SR_MEM>;
+}
} // SchedRW
//===----------------------------------------------------------------------===//
@@ -202,6 +203,7 @@ def MOV64sm : RI<0x8E, MRMSrcMem, (outs SEGMENT_REG:$dst), (ins i64mem:$src),
let SchedRW = [WriteSystem] in {
def SWAPGS : I<0x01, MRM_F8, (outs), (ins), "swapgs", [], IIC_SWAPGS>, TB;
+let mayLoad = 1 in
def LAR16rm : I<0x02, MRMSrcMem, (outs GR16:$dst), (ins i16mem:$src),
"lar{w}\t{$src, $dst|$dst, $src}", [], IIC_LAR_RM>, TB,
OpSize16;
@@ -210,6 +212,7 @@ def LAR16rr : I<0x02, MRMSrcReg, (outs GR16:$dst), (ins GR16:$src),
OpSize16;
// The i16mem operand in LAR32rm and the GR32 operand in LAR32rr are not typos.
+let mayLoad = 1 in
def LAR32rm : I<0x02, MRMSrcMem, (outs GR32:$dst), (ins i16mem:$src),
"lar{l}\t{$src, $dst|$dst, $src}", [], IIC_LAR_RM>, TB,
OpSize32;
@@ -217,23 +220,27 @@ def LAR32rr : I<0x02, MRMSrcReg, (outs GR32:$dst), (ins GR32:$src),
"lar{l}\t{$src, $dst|$dst, $src}", [], IIC_LAR_RR>, TB,
OpSize32;
// The i16mem operand in LAR64rm and the GR32 operand in LAR64rr are not typos.
+let mayLoad = 1 in
def LAR64rm : RI<0x02, MRMSrcMem, (outs GR64:$dst), (ins i16mem:$src),
"lar{q}\t{$src, $dst|$dst, $src}", [], IIC_LAR_RM>, TB;
def LAR64rr : RI<0x02, MRMSrcReg, (outs GR64:$dst), (ins GR32:$src),
"lar{q}\t{$src, $dst|$dst, $src}", [], IIC_LAR_RR>, TB;
+let mayLoad = 1 in
def LSL16rm : I<0x03, MRMSrcMem, (outs GR16:$dst), (ins i16mem:$src),
"lsl{w}\t{$src, $dst|$dst, $src}", [], IIC_LSL_RM>, TB,
OpSize16;
def LSL16rr : I<0x03, MRMSrcReg, (outs GR16:$dst), (ins GR16:$src),
"lsl{w}\t{$src, $dst|$dst, $src}", [], IIC_LSL_RR>, TB,
OpSize16;
+let mayLoad = 1 in
def LSL32rm : I<0x03, MRMSrcMem, (outs GR32:$dst), (ins i32mem:$src),
"lsl{l}\t{$src, $dst|$dst, $src}", [], IIC_LSL_RM>, TB,
OpSize32;
def LSL32rr : I<0x03, MRMSrcReg, (outs GR32:$dst), (ins GR32:$src),
"lsl{l}\t{$src, $dst|$dst, $src}", [], IIC_LSL_RR>, TB,
OpSize32;
+let mayLoad = 1 in
def LSL64rm : RI<0x03, MRMSrcMem, (outs GR64:$dst), (ins i64mem:$src),
"lsl{q}\t{$src, $dst|$dst, $src}", [], IIC_LSL_RM>, TB;
def LSL64rr : RI<0x03, MRMSrcReg, (outs GR64:$dst), (ins GR64:$src),
@@ -248,11 +255,13 @@ def STR32r : I<0x00, MRM1r, (outs GR32:$dst), (ins),
"str{l}\t$dst", [], IIC_STR>, TB, OpSize32;
def STR64r : RI<0x00, MRM1r, (outs GR64:$dst), (ins),
"str{q}\t$dst", [], IIC_STR>, TB;
+let mayStore = 1 in
def STRm : I<0x00, MRM1m, (outs), (ins i16mem:$dst),
"str{w}\t$dst", [], IIC_STR>, TB;
def LTRr : I<0x00, MRM3r, (outs), (ins GR16:$src),
"ltr{w}\t$src", [], IIC_LTR>, TB;
+let mayLoad = 1 in
def LTRm : I<0x00, MRM3m, (outs), (ins i16mem:$src),
"ltr{w}\t$src", [], IIC_LTR>, TB;
@@ -377,12 +386,14 @@ def LGS64rm : RI<0xb5, MRMSrcMem, (outs GR64:$dst), (ins opaque80mem:$src),
def VERRr : I<0x00, MRM4r, (outs), (ins GR16:$seg),
"verr\t$seg", [], IIC_VERR>, TB;
-def VERRm : I<0x00, MRM4m, (outs), (ins i16mem:$seg),
- "verr\t$seg", [], IIC_VERR>, TB;
def VERWr : I<0x00, MRM5r, (outs), (ins GR16:$seg),
"verw\t$seg", [], IIC_VERW_MEM>, TB;
+let mayLoad = 1 in {
+def VERRm : I<0x00, MRM4m, (outs), (ins i16mem:$seg),
+ "verr\t$seg", [], IIC_VERR>, TB;
def VERWm : I<0x00, MRM5m, (outs), (ins i16mem:$seg),
"verw\t$seg", [], IIC_VERW_REG>, TB;
+}
} // SchedRW
//===----------------------------------------------------------------------===//
@@ -403,6 +414,7 @@ def SIDT64m : I<0x01, MRM1m, (outs), (ins opaque80mem:$dst),
"sidt{q}\t$dst", []>, TB, Requires <[In64BitMode]>;
def SLDT16r : I<0x00, MRM0r, (outs GR16:$dst), (ins),
"sldt{w}\t$dst", [], IIC_SLDT>, TB, OpSize16;
+let mayStore = 1 in
def SLDT16m : I<0x00, MRM0m, (outs), (ins i16mem:$dst),
"sldt{w}\t$dst", [], IIC_SLDT>, TB;
def SLDT32r : I<0x00, MRM0r, (outs GR32:$dst), (ins),
@@ -412,6 +424,7 @@ def SLDT32r : I<0x00, MRM0r, (outs GR32:$dst), (ins),
// extension.
def SLDT64r : RI<0x00, MRM0r, (outs GR64:$dst), (ins),
"sldt{q}\t$dst", [], IIC_SLDT>, TB;
+let mayStore = 1 in
def SLDT64m : RI<0x00, MRM0m, (outs), (ins i16mem:$dst),
"sldt{q}\t$dst", [], IIC_SLDT>, TB;
@@ -429,6 +442,7 @@ def LIDT64m : I<0x01, MRM3m, (outs), (ins opaque80mem:$src),
"lidt{q}\t$src", [], IIC_LIDT>, TB, Requires<[In64BitMode]>;
def LLDT16r : I<0x00, MRM2r, (outs), (ins GR16:$src),
"lldt{w}\t$src", [], IIC_LLDT_REG>, TB;
+let mayLoad = 1 in
def LLDT16m : I<0x00, MRM2m, (outs), (ins i16mem:$src),
"lldt{w}\t$src", [], IIC_LLDT_MEM>, TB;
} // SchedRW
@@ -459,6 +473,7 @@ def SMSW16m : I<0x01, MRM4m, (outs), (ins i16mem:$dst),
def LMSW16r : I<0x01, MRM6r, (outs), (ins GR16:$src),
"lmsw{w}\t$src", [], IIC_LMSW_MEM>, TB;
+let mayLoad = 1 in
def LMSW16m : I<0x01, MRM6m, (outs), (ins i16mem:$src),
"lmsw{w}\t$src", [], IIC_LMSW_REG>, TB;
diff --git a/lib/Target/X86/X86InstrTSX.td b/lib/Target/X86/X86InstrTSX.td
index 7267d752653e..38ac8be94483 100644
--- a/lib/Target/X86/X86InstrTSX.td
+++ b/lib/Target/X86/X86InstrTSX.td
@@ -25,9 +25,9 @@ def XBEGIN : I<0, Pseudo, (outs GR32:$dst), (ins),
let isBranch = 1, isTerminator = 1, Defs = [EAX] in {
def XBEGIN_2 : Ii16PCRel<0xc7, MRM_F8, (outs), (ins brtarget16:$dst),
- "xbegin\t$dst", []>, OpSize16, Requires<[HasRTM]>;
+ "xbegin\t$dst", []>, OpSize16;
def XBEGIN_4 : Ii32PCRel<0xc7, MRM_F8, (outs), (ins brtarget32:$dst),
- "xbegin\t$dst", []>, OpSize32, Requires<[HasRTM]>;
+ "xbegin\t$dst", []>, OpSize32;
}
def XEND : I<0x01, MRM_D5, (outs), (ins),
@@ -35,7 +35,7 @@ def XEND : I<0x01, MRM_D5, (outs), (ins),
let Defs = [EFLAGS] in
def XTEST : I<0x01, MRM_D6, (outs), (ins),
- "xtest", [(set EFLAGS, (X86xtest))]>, TB, Requires<[HasTSX]>;
+ "xtest", [(set EFLAGS, (X86xtest))]>, TB, Requires<[HasRTM]>;
def XABORT : Ii8<0xc6, MRM_F8, (outs), (ins i8imm:$imm),
"xabort\t$imm",
@@ -44,7 +44,7 @@ def XABORT : Ii8<0xc6, MRM_F8, (outs), (ins i8imm:$imm),
// HLE prefixes
let isAsmParserOnly = 1 in {
-def XACQUIRE_PREFIX : I<0xF2, RawFrm, (outs), (ins), "xacquire", []>, Requires<[HasHLE]>;
-def XRELEASE_PREFIX : I<0xF3, RawFrm, (outs), (ins), "xrelease", []>, Requires<[HasHLE]>;
+def XACQUIRE_PREFIX : I<0xF2, RawFrm, (outs), (ins), "xacquire", []>;
+def XRELEASE_PREFIX : I<0xF3, RawFrm, (outs), (ins), "xrelease", []>;
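+// These pseudos reuse the REPNE/REP prefix bytes (0xF2/0xF3), which pre-HLE
+// processors ignore on lockable instructions; that backward compatibility is
+// presumably why the HasHLE predicate could be dropped and the prefixes
+// accepted unconditionally.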
}
diff --git a/lib/Target/X86/X86InstrTablesInfo.h b/lib/Target/X86/X86InstrTablesInfo.h
deleted file mode 100755
index 415a891bfd97..000000000000
--- a/lib/Target/X86/X86InstrTablesInfo.h
+++ /dev/null
@@ -1,1162 +0,0 @@
-//===-- X86InstrTablesInfo.h - X86 Instruction Tables -----------*- C++ -*-===//
-//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
-//
-//===----------------------------------------------------------------------===//
-//
-// This file contains related X86 Instruction Information Tables.
-//
-//===----------------------------------------------------------------------===//
-
-#ifndef LLVM_LIB_TARGET_X86_X86INSTRTABLESINFO_H
-#define LLVM_LIB_TARGET_X86_X86INSTRTABLESINFO_H
-
-using namespace llvm;
-
-struct X86EvexToVexCompressTableEntry {
- uint16_t EvexOpcode;
- uint16_t VexOpcode;
-};
-
-
-
-// X86 EVEX encoded instructions that have a VEX 128 encoding
-// (table format: <EVEX opcode, VEX-128 opcode>).
-static const X86EvexToVexCompressTableEntry X86EvexToVex128CompressTable[] = {
- // EVEX scalar with corresponding VEX.
- { X86::Int_VCOMISDZrm , X86::Int_VCOMISDrm },
- { X86::Int_VCOMISDZrr , X86::Int_VCOMISDrr },
- { X86::Int_VCOMISSZrm , X86::Int_VCOMISSrm },
- { X86::Int_VCOMISSZrr , X86::Int_VCOMISSrr },
- { X86::Int_VUCOMISDZrm , X86::Int_VUCOMISDrm },
- { X86::Int_VUCOMISDZrr , X86::Int_VUCOMISDrr },
- { X86::Int_VUCOMISSZrm , X86::Int_VUCOMISSrm },
- { X86::Int_VUCOMISSZrr , X86::Int_VUCOMISSrr },
- { X86::VADDSDZrm , X86::VADDSDrm },
- { X86::VADDSDZrm_Int , X86::VADDSDrm_Int },
- { X86::VADDSDZrr , X86::VADDSDrr },
- { X86::VADDSDZrr_Int , X86::VADDSDrr_Int },
- { X86::VADDSSZrm , X86::VADDSSrm },
- { X86::VADDSSZrm_Int , X86::VADDSSrm_Int },
- { X86::VADDSSZrr , X86::VADDSSrr },
- { X86::VADDSSZrr_Int , X86::VADDSSrr_Int },
- { X86::VCOMISDZrm , X86::VCOMISDrm },
- { X86::VCOMISDZrr , X86::VCOMISDrr },
- { X86::VCOMISSZrm , X86::VCOMISSrm },
- { X86::VCOMISSZrr , X86::VCOMISSrr },
- { X86::VCVTSD2SI64Zrm , X86::VCVTSD2SI64rm },
- { X86::VCVTSD2SI64Zrr , X86::VCVTSD2SI64rr },
- { X86::VCVTSD2SIZrm , X86::VCVTSD2SIrm },
- { X86::VCVTSD2SIZrr , X86::VCVTSD2SIrr },
- { X86::VCVTSD2SSZrm , X86::VCVTSD2SSrm },
- { X86::VCVTSD2SSZrr , X86::VCVTSD2SSrr },
- { X86::VCVTSI2SDZrm , X86::VCVTSI2SDrm },
- { X86::VCVTSI2SDZrm_Int , X86::Int_VCVTSI2SDrm },
- { X86::VCVTSI2SDZrr , X86::VCVTSI2SDrr },
- { X86::VCVTSI2SDZrr_Int , X86::Int_VCVTSI2SDrr },
- { X86::VCVTSI2SSZrm , X86::VCVTSI2SSrm },
- { X86::VCVTSI2SSZrm_Int , X86::Int_VCVTSI2SSrm },
- { X86::VCVTSI2SSZrr , X86::VCVTSI2SSrr },
- { X86::VCVTSI2SSZrr_Int , X86::Int_VCVTSI2SSrr },
- { X86::VCVTSS2SDZrm , X86::VCVTSS2SDrm },
- { X86::VCVTSS2SDZrr , X86::VCVTSS2SDrr },
- { X86::VCVTSS2SI64Zrm , X86::VCVTSS2SI64rm },
- { X86::VCVTSS2SI64Zrr , X86::VCVTSS2SI64rr },
- { X86::VCVTSS2SIZrm , X86::VCVTSS2SIrm },
- { X86::VCVTSS2SIZrr , X86::VCVTSS2SIrr },
- { X86::VCVTTSD2SI64Zrm , X86::VCVTTSD2SI64rm },
- { X86::VCVTTSD2SI64Zrm_Int , X86::Int_VCVTTSD2SI64rm },
- { X86::VCVTTSD2SI64Zrr , X86::VCVTTSD2SI64rr },
- { X86::VCVTTSD2SI64Zrr_Int , X86::Int_VCVTTSD2SI64rr },
- { X86::VCVTTSD2SIZrm , X86::VCVTTSD2SIrm },
- { X86::VCVTTSD2SIZrm_Int , X86::Int_VCVTTSD2SIrm },
- { X86::VCVTTSD2SIZrr , X86::VCVTTSD2SIrr },
- { X86::VCVTTSD2SIZrr_Int , X86::Int_VCVTTSD2SIrr },
- { X86::VCVTTSS2SI64Zrm , X86::VCVTTSS2SI64rm },
- { X86::VCVTTSS2SI64Zrm_Int , X86::Int_VCVTTSS2SI64rm },
- { X86::VCVTTSS2SI64Zrr , X86::VCVTTSS2SI64rr },
- { X86::VCVTTSS2SI64Zrr_Int , X86::Int_VCVTTSS2SI64rr },
- { X86::VCVTTSS2SIZrm , X86::VCVTTSS2SIrm },
- { X86::VCVTTSS2SIZrm_Int , X86::Int_VCVTTSS2SIrm },
- { X86::VCVTTSS2SIZrr , X86::VCVTTSS2SIrr },
- { X86::VCVTTSS2SIZrr_Int , X86::Int_VCVTTSS2SIrr },
- { X86::VDIVSDZrm , X86::VDIVSDrm },
- { X86::VDIVSDZrm_Int , X86::VDIVSDrm_Int },
- { X86::VDIVSDZrr , X86::VDIVSDrr },
- { X86::VDIVSDZrr_Int , X86::VDIVSDrr_Int },
- { X86::VDIVSSZrm , X86::VDIVSSrm },
- { X86::VDIVSSZrm_Int , X86::VDIVSSrm_Int },
- { X86::VDIVSSZrr , X86::VDIVSSrr },
- { X86::VDIVSSZrr_Int , X86::VDIVSSrr_Int },
- { X86::VFMADD132SDZm , X86::VFMADD132SDm },
- { X86::VFMADD132SDZm_Int , X86::VFMADD132SDm_Int },
- { X86::VFMADD132SDZr , X86::VFMADD132SDr },
- { X86::VFMADD132SDZr_Int , X86::VFMADD132SDr_Int },
- { X86::VFMADD132SSZm , X86::VFMADD132SSm },
- { X86::VFMADD132SSZm_Int , X86::VFMADD132SSm_Int },
- { X86::VFMADD132SSZr , X86::VFMADD132SSr },
- { X86::VFMADD132SSZr_Int , X86::VFMADD132SSr_Int },
- { X86::VFMADD213SDZm , X86::VFMADD213SDm },
- { X86::VFMADD213SDZm_Int , X86::VFMADD213SDm_Int },
- { X86::VFMADD213SDZr , X86::VFMADD213SDr },
- { X86::VFMADD213SDZr_Int , X86::VFMADD213SDr_Int },
- { X86::VFMADD213SSZm , X86::VFMADD213SSm },
- { X86::VFMADD213SSZm_Int , X86::VFMADD213SSm_Int },
- { X86::VFMADD213SSZr , X86::VFMADD213SSr },
- { X86::VFMADD213SSZr_Int , X86::VFMADD213SSr_Int },
- { X86::VFMADD231SDZm , X86::VFMADD231SDm },
- { X86::VFMADD231SDZm_Int , X86::VFMADD231SDm_Int },
- { X86::VFMADD231SDZr , X86::VFMADD231SDr },
- { X86::VFMADD231SDZr_Int , X86::VFMADD231SDr_Int },
- { X86::VFMADD231SSZm , X86::VFMADD231SSm },
- { X86::VFMADD231SSZm_Int , X86::VFMADD231SSm_Int },
- { X86::VFMADD231SSZr , X86::VFMADD231SSr },
- { X86::VFMADD231SSZr_Int , X86::VFMADD231SSr_Int },
- { X86::VFMSUB132SDZm , X86::VFMSUB132SDm },
- { X86::VFMSUB132SDZm_Int , X86::VFMSUB132SDm_Int },
- { X86::VFMSUB132SDZr , X86::VFMSUB132SDr },
- { X86::VFMSUB132SDZr_Int , X86::VFMSUB132SDr_Int },
- { X86::VFMSUB132SSZm , X86::VFMSUB132SSm },
- { X86::VFMSUB132SSZm_Int , X86::VFMSUB132SSm_Int },
- { X86::VFMSUB132SSZr , X86::VFMSUB132SSr },
- { X86::VFMSUB132SSZr_Int , X86::VFMSUB132SSr_Int },
- { X86::VFMSUB213SDZm , X86::VFMSUB213SDm },
- { X86::VFMSUB213SDZm_Int , X86::VFMSUB213SDm_Int },
- { X86::VFMSUB213SDZr , X86::VFMSUB213SDr },
- { X86::VFMSUB213SDZr_Int , X86::VFMSUB213SDr_Int },
- { X86::VFMSUB213SSZm , X86::VFMSUB213SSm },
- { X86::VFMSUB213SSZm_Int , X86::VFMSUB213SSm_Int },
- { X86::VFMSUB213SSZr , X86::VFMSUB213SSr },
- { X86::VFMSUB213SSZr_Int , X86::VFMSUB213SSr_Int },
- { X86::VFMSUB231SDZm , X86::VFMSUB231SDm },
- { X86::VFMSUB231SDZm_Int , X86::VFMSUB231SDm_Int },
- { X86::VFMSUB231SDZr , X86::VFMSUB231SDr },
- { X86::VFMSUB231SDZr_Int , X86::VFMSUB231SDr_Int },
- { X86::VFMSUB231SSZm , X86::VFMSUB231SSm },
- { X86::VFMSUB231SSZm_Int , X86::VFMSUB231SSm_Int },
- { X86::VFMSUB231SSZr , X86::VFMSUB231SSr },
- { X86::VFMSUB231SSZr_Int , X86::VFMSUB231SSr_Int },
- { X86::VFNMADD132SDZm , X86::VFNMADD132SDm },
- { X86::VFNMADD132SDZm_Int , X86::VFNMADD132SDm_Int },
- { X86::VFNMADD132SDZr , X86::VFNMADD132SDr },
- { X86::VFNMADD132SDZr_Int , X86::VFNMADD132SDr_Int },
- { X86::VFNMADD132SSZm , X86::VFNMADD132SSm },
- { X86::VFNMADD132SSZm_Int , X86::VFNMADD132SSm_Int },
- { X86::VFNMADD132SSZr , X86::VFNMADD132SSr },
- { X86::VFNMADD132SSZr_Int , X86::VFNMADD132SSr_Int },
- { X86::VFNMADD213SDZm , X86::VFNMADD213SDm },
- { X86::VFNMADD213SDZm_Int , X86::VFNMADD213SDm_Int },
- { X86::VFNMADD213SDZr , X86::VFNMADD213SDr },
- { X86::VFNMADD213SDZr_Int , X86::VFNMADD213SDr_Int },
- { X86::VFNMADD213SSZm , X86::VFNMADD213SSm },
- { X86::VFNMADD213SSZm_Int , X86::VFNMADD213SSm_Int },
- { X86::VFNMADD213SSZr , X86::VFNMADD213SSr },
- { X86::VFNMADD213SSZr_Int , X86::VFNMADD213SSr_Int },
- { X86::VFNMADD231SDZm , X86::VFNMADD231SDm },
- { X86::VFNMADD231SDZm_Int , X86::VFNMADD231SDm_Int },
- { X86::VFNMADD231SDZr , X86::VFNMADD231SDr },
- { X86::VFNMADD231SDZr_Int , X86::VFNMADD231SDr_Int },
- { X86::VFNMADD231SSZm , X86::VFNMADD231SSm },
- { X86::VFNMADD231SSZm_Int , X86::VFNMADD231SSm_Int },
- { X86::VFNMADD231SSZr , X86::VFNMADD231SSr },
- { X86::VFNMADD231SSZr_Int , X86::VFNMADD231SSr_Int },
- { X86::VFNMSUB132SDZm , X86::VFNMSUB132SDm },
- { X86::VFNMSUB132SDZm_Int , X86::VFNMSUB132SDm_Int },
- { X86::VFNMSUB132SDZr , X86::VFNMSUB132SDr },
- { X86::VFNMSUB132SDZr_Int , X86::VFNMSUB132SDr_Int },
- { X86::VFNMSUB132SSZm , X86::VFNMSUB132SSm },
- { X86::VFNMSUB132SSZm_Int , X86::VFNMSUB132SSm_Int },
- { X86::VFNMSUB132SSZr , X86::VFNMSUB132SSr },
- { X86::VFNMSUB132SSZr_Int , X86::VFNMSUB132SSr_Int },
- { X86::VFNMSUB213SDZm , X86::VFNMSUB213SDm },
- { X86::VFNMSUB213SDZm_Int , X86::VFNMSUB213SDm_Int },
- { X86::VFNMSUB213SDZr , X86::VFNMSUB213SDr },
- { X86::VFNMSUB213SDZr_Int , X86::VFNMSUB213SDr_Int },
- { X86::VFNMSUB213SSZm , X86::VFNMSUB213SSm },
- { X86::VFNMSUB213SSZm_Int , X86::VFNMSUB213SSm_Int },
- { X86::VFNMSUB213SSZr , X86::VFNMSUB213SSr },
- { X86::VFNMSUB213SSZr_Int , X86::VFNMSUB213SSr_Int },
- { X86::VFNMSUB231SDZm , X86::VFNMSUB231SDm },
- { X86::VFNMSUB231SDZm_Int , X86::VFNMSUB231SDm_Int },
- { X86::VFNMSUB231SDZr , X86::VFNMSUB231SDr },
- { X86::VFNMSUB231SDZr_Int , X86::VFNMSUB231SDr_Int },
- { X86::VFNMSUB231SSZm , X86::VFNMSUB231SSm },
- { X86::VFNMSUB231SSZm_Int , X86::VFNMSUB231SSm_Int },
- { X86::VFNMSUB231SSZr , X86::VFNMSUB231SSr },
- { X86::VFNMSUB231SSZr_Int , X86::VFNMSUB231SSr_Int },
- { X86::VMAXCSDZrm , X86::VMAXCSDrm },
- { X86::VMAXCSDZrr , X86::VMAXCSDrr },
- { X86::VMAXCSSZrm , X86::VMAXCSSrm },
- { X86::VMAXCSSZrr , X86::VMAXCSSrr },
- { X86::VMAXSDZrm , X86::VMAXSDrm },
- { X86::VMAXSDZrm_Int , X86::VMAXSDrm_Int },
- { X86::VMAXSDZrr , X86::VMAXSDrr },
- { X86::VMAXSDZrr_Int , X86::VMAXSDrr_Int },
- { X86::VMAXSSZrm , X86::VMAXSSrm },
- { X86::VMAXSSZrm_Int , X86::VMAXSSrm_Int },
- { X86::VMAXSSZrr , X86::VMAXSSrr },
- { X86::VMAXSSZrr_Int , X86::VMAXSSrr_Int },
- { X86::VMINCSDZrm , X86::VMINCSDrm },
- { X86::VMINCSDZrr , X86::VMINCSDrr },
- { X86::VMINCSSZrm , X86::VMINCSSrm },
- { X86::VMINCSSZrr , X86::VMINCSSrr },
- { X86::VMINSDZrm , X86::VMINSDrm },
- { X86::VMINSDZrm_Int , X86::VMINSDrm_Int },
- { X86::VMINSDZrr , X86::VMINSDrr },
- { X86::VMINSDZrr_Int , X86::VMINSDrr_Int },
- { X86::VMINSSZrm , X86::VMINSSrm },
- { X86::VMINSSZrm_Int , X86::VMINSSrm_Int },
- { X86::VMINSSZrr , X86::VMINSSrr },
- { X86::VMINSSZrr_Int , X86::VMINSSrr_Int },
- { X86::VMOV64toSDZrr , X86::VMOV64toSDrr },
- { X86::VMOVDI2SSZrm , X86::VMOVDI2SSrm },
- { X86::VMOVDI2SSZrr , X86::VMOVDI2SSrr },
- { X86::VMOVSDZmr , X86::VMOVSDmr },
- { X86::VMOVSDZrm , X86::VMOVSDrm },
- { X86::VMOVSDZrr , X86::VMOVSDrr },
- { X86::VMOVSSZmr , X86::VMOVSSmr },
- { X86::VMOVSSZrm , X86::VMOVSSrm },
- { X86::VMOVSSZrr , X86::VMOVSSrr },
- { X86::VMOVSSZrr_REV , X86::VMOVSSrr_REV },
- { X86::VMULSDZrm , X86::VMULSDrm },
- { X86::VMULSDZrm_Int , X86::VMULSDrm_Int },
- { X86::VMULSDZrr , X86::VMULSDrr },
- { X86::VMULSDZrr_Int , X86::VMULSDrr_Int },
- { X86::VMULSSZrm , X86::VMULSSrm },
- { X86::VMULSSZrm_Int , X86::VMULSSrm_Int },
- { X86::VMULSSZrr , X86::VMULSSrr },
- { X86::VMULSSZrr_Int , X86::VMULSSrr_Int },
- { X86::VSQRTSDZm , X86::VSQRTSDm },
- { X86::VSQRTSDZm_Int , X86::VSQRTSDm_Int },
- { X86::VSQRTSDZr , X86::VSQRTSDr },
- { X86::VSQRTSDZr_Int , X86::VSQRTSDr_Int },
- { X86::VSQRTSSZm , X86::VSQRTSSm },
- { X86::VSQRTSSZm_Int , X86::VSQRTSSm_Int },
- { X86::VSQRTSSZr , X86::VSQRTSSr },
- { X86::VSQRTSSZr_Int , X86::VSQRTSSr_Int },
- { X86::VSUBSDZrm , X86::VSUBSDrm },
- { X86::VSUBSDZrm_Int , X86::VSUBSDrm_Int },
- { X86::VSUBSDZrr , X86::VSUBSDrr },
- { X86::VSUBSDZrr_Int , X86::VSUBSDrr_Int },
- { X86::VSUBSSZrm , X86::VSUBSSrm },
- { X86::VSUBSSZrm_Int , X86::VSUBSSrm_Int },
- { X86::VSUBSSZrr , X86::VSUBSSrr },
- { X86::VSUBSSZrr_Int , X86::VSUBSSrr_Int },
- { X86::VUCOMISDZrm , X86::VUCOMISDrm },
- { X86::VUCOMISDZrr , X86::VUCOMISDrr },
- { X86::VUCOMISSZrm , X86::VUCOMISSrm },
- { X86::VUCOMISSZrr , X86::VUCOMISSrr },
-
- { X86::VMOV64toPQIZrr , X86::VMOV64toPQIrr },
- { X86::VMOV64toSDZrr , X86::VMOV64toSDrr },
- { X86::VMOVDI2PDIZrm , X86::VMOVDI2PDIrm },
- { X86::VMOVDI2PDIZrr , X86::VMOVDI2PDIrr },
- { X86::VMOVLHPSZrr , X86::VMOVLHPSrr },
- { X86::VMOVHLPSZrr , X86::VMOVHLPSrr },
- { X86::VMOVPDI2DIZmr , X86::VMOVPDI2DImr },
- { X86::VMOVPDI2DIZrr , X86::VMOVPDI2DIrr },
- { X86::VMOVPQI2QIZmr , X86::VMOVPQI2QImr },
- { X86::VMOVPQIto64Zrr , X86::VMOVPQIto64rr },
- { X86::VMOVQI2PQIZrm , X86::VMOVQI2PQIrm },
- { X86::VMOVZPQILo2PQIZrr , X86::VMOVZPQILo2PQIrr },
-
- { X86::VPEXTRBZmr , X86::VPEXTRBmr },
- { X86::VPEXTRBZrr , X86::VPEXTRBrr },
- { X86::VPEXTRDZmr , X86::VPEXTRDmr },
- { X86::VPEXTRDZrr , X86::VPEXTRDrr },
- { X86::VPEXTRQZmr , X86::VPEXTRQmr },
- { X86::VPEXTRQZrr , X86::VPEXTRQrr },
- { X86::VPEXTRWZmr , X86::VPEXTRWmr },
- { X86::VPEXTRWZrr , X86::VPEXTRWri },
-
- { X86::VPINSRBZrm , X86::VPINSRBrm },
- { X86::VPINSRBZrr , X86::VPINSRBrr },
- { X86::VPINSRDZrm , X86::VPINSRDrm },
- { X86::VPINSRDZrr , X86::VPINSRDrr },
- { X86::VPINSRQZrm , X86::VPINSRQrm },
- { X86::VPINSRQZrr , X86::VPINSRQrr },
- { X86::VPINSRWZrm , X86::VPINSRWrmi },
- { X86::VPINSRWZrr , X86::VPINSRWrri },
-
- // EVEX 128 with corresponding VEX.
- { X86::VADDPDZ128rm , X86::VADDPDrm },
- { X86::VADDPDZ128rr , X86::VADDPDrr },
- { X86::VADDPSZ128rm , X86::VADDPSrm },
- { X86::VADDPSZ128rr , X86::VADDPSrr },
- { X86::VANDNPDZ128rm , X86::VANDNPDrm },
- { X86::VANDNPDZ128rr , X86::VANDNPDrr },
- { X86::VANDNPSZ128rm , X86::VANDNPSrm },
- { X86::VANDNPSZ128rr , X86::VANDNPSrr },
- { X86::VANDPDZ128rm , X86::VANDPDrm },
- { X86::VANDPDZ128rr , X86::VANDPDrr },
- { X86::VANDPSZ128rm , X86::VANDPSrm },
- { X86::VANDPSZ128rr , X86::VANDPSrr },
- { X86::VBROADCASTSSZ128m , X86::VBROADCASTSSrm },
- { X86::VBROADCASTSSZ128r , X86::VBROADCASTSSrr },
- { X86::VBROADCASTSSZ128r_s , X86::VBROADCASTSSrr },
- { X86::VCVTDQ2PDZ128rm , X86::VCVTDQ2PDrm },
- { X86::VCVTDQ2PDZ128rr , X86::VCVTDQ2PDrr },
- { X86::VCVTDQ2PSZ128rm , X86::VCVTDQ2PSrm },
- { X86::VCVTDQ2PSZ128rr , X86::VCVTDQ2PSrr },
- { X86::VCVTPD2DQZ128rm , X86::VCVTPD2DQrm },
- { X86::VCVTPD2DQZ128rr , X86::VCVTPD2DQrr },
- { X86::VCVTPD2PSZ128rm , X86::VCVTPD2PSrm },
- { X86::VCVTPD2PSZ128rr , X86::VCVTPD2PSrr },
- { X86::VCVTPH2PSZ128rm , X86::VCVTPH2PSrm },
- { X86::VCVTPH2PSZ128rr , X86::VCVTPH2PSrr },
- { X86::VCVTPS2DQZ128rm , X86::VCVTPS2DQrm },
- { X86::VCVTPS2DQZ128rr , X86::VCVTPS2DQrr },
- { X86::VCVTPS2PDZ128rm , X86::VCVTPS2PDrm },
- { X86::VCVTPS2PDZ128rr , X86::VCVTPS2PDrr },
- { X86::VCVTPS2PHZ128mr , X86::VCVTPS2PHmr },
- { X86::VCVTPS2PHZ128rr , X86::VCVTPS2PHrr },
- { X86::VCVTTPD2DQZ128rm , X86::VCVTTPD2DQrm },
- { X86::VCVTTPD2DQZ128rr , X86::VCVTTPD2DQrr },
- { X86::VCVTTPS2DQZ128rm , X86::VCVTTPS2DQrm },
- { X86::VCVTTPS2DQZ128rr , X86::VCVTTPS2DQrr },
- { X86::VDIVPDZ128rm , X86::VDIVPDrm },
- { X86::VDIVPDZ128rr , X86::VDIVPDrr },
- { X86::VDIVPSZ128rm , X86::VDIVPSrm },
- { X86::VDIVPSZ128rr , X86::VDIVPSrr },
- { X86::VFMADD132PDZ128m , X86::VFMADD132PDm },
- { X86::VFMADD132PDZ128r , X86::VFMADD132PDr },
- { X86::VFMADD132PSZ128m , X86::VFMADD132PSm },
- { X86::VFMADD132PSZ128r , X86::VFMADD132PSr },
- { X86::VFMADD213PDZ128m , X86::VFMADD213PDm },
- { X86::VFMADD213PDZ128r , X86::VFMADD213PDr },
- { X86::VFMADD213PSZ128m , X86::VFMADD213PSm },
- { X86::VFMADD213PSZ128r , X86::VFMADD213PSr },
- { X86::VFMADD231PDZ128m , X86::VFMADD231PDm },
- { X86::VFMADD231PDZ128r , X86::VFMADD231PDr },
- { X86::VFMADD231PSZ128m , X86::VFMADD231PSm },
- { X86::VFMADD231PSZ128r , X86::VFMADD231PSr },
- { X86::VFMADDSUB132PDZ128m , X86::VFMADDSUB132PDm },
- { X86::VFMADDSUB132PDZ128r , X86::VFMADDSUB132PDr },
- { X86::VFMADDSUB132PSZ128m , X86::VFMADDSUB132PSm },
- { X86::VFMADDSUB132PSZ128r , X86::VFMADDSUB132PSr },
- { X86::VFMADDSUB213PDZ128m , X86::VFMADDSUB213PDm },
- { X86::VFMADDSUB213PDZ128r , X86::VFMADDSUB213PDr },
- { X86::VFMADDSUB213PSZ128m , X86::VFMADDSUB213PSm },
- { X86::VFMADDSUB213PSZ128r , X86::VFMADDSUB213PSr },
- { X86::VFMADDSUB231PDZ128m , X86::VFMADDSUB231PDm },
- { X86::VFMADDSUB231PDZ128r , X86::VFMADDSUB231PDr },
- { X86::VFMADDSUB231PSZ128m , X86::VFMADDSUB231PSm },
- { X86::VFMADDSUB231PSZ128r , X86::VFMADDSUB231PSr },
- { X86::VFMSUB132PDZ128m , X86::VFMSUB132PDm },
- { X86::VFMSUB132PDZ128r , X86::VFMSUB132PDr },
- { X86::VFMSUB132PSZ128m , X86::VFMSUB132PSm },
- { X86::VFMSUB132PSZ128r , X86::VFMSUB132PSr },
- { X86::VFMSUB213PDZ128m , X86::VFMSUB213PDm },
- { X86::VFMSUB213PDZ128r , X86::VFMSUB213PDr },
- { X86::VFMSUB213PSZ128m , X86::VFMSUB213PSm },
- { X86::VFMSUB213PSZ128r , X86::VFMSUB213PSr },
- { X86::VFMSUB231PDZ128m , X86::VFMSUB231PDm },
- { X86::VFMSUB231PDZ128r , X86::VFMSUB231PDr },
- { X86::VFMSUB231PSZ128m , X86::VFMSUB231PSm },
- { X86::VFMSUB231PSZ128r , X86::VFMSUB231PSr },
- { X86::VFMSUBADD132PDZ128m , X86::VFMSUBADD132PDm },
- { X86::VFMSUBADD132PDZ128r , X86::VFMSUBADD132PDr },
- { X86::VFMSUBADD132PSZ128m , X86::VFMSUBADD132PSm },
- { X86::VFMSUBADD132PSZ128r , X86::VFMSUBADD132PSr },
- { X86::VFMSUBADD213PDZ128m , X86::VFMSUBADD213PDm },
- { X86::VFMSUBADD213PDZ128r , X86::VFMSUBADD213PDr },
- { X86::VFMSUBADD213PSZ128m , X86::VFMSUBADD213PSm },
- { X86::VFMSUBADD213PSZ128r , X86::VFMSUBADD213PSr },
- { X86::VFMSUBADD231PDZ128m , X86::VFMSUBADD231PDm },
- { X86::VFMSUBADD231PDZ128r , X86::VFMSUBADD231PDr },
- { X86::VFMSUBADD231PSZ128m , X86::VFMSUBADD231PSm },
- { X86::VFMSUBADD231PSZ128r , X86::VFMSUBADD231PSr },
- { X86::VFNMADD132PDZ128m , X86::VFNMADD132PDm },
- { X86::VFNMADD132PDZ128r , X86::VFNMADD132PDr },
- { X86::VFNMADD132PSZ128m , X86::VFNMADD132PSm },
- { X86::VFNMADD132PSZ128r , X86::VFNMADD132PSr },
- { X86::VFNMADD213PDZ128m , X86::VFNMADD213PDm },
- { X86::VFNMADD213PDZ128r , X86::VFNMADD213PDr },
- { X86::VFNMADD213PSZ128m , X86::VFNMADD213PSm },
- { X86::VFNMADD213PSZ128r , X86::VFNMADD213PSr },
- { X86::VFNMADD231PDZ128m , X86::VFNMADD231PDm },
- { X86::VFNMADD231PDZ128r , X86::VFNMADD231PDr },
- { X86::VFNMADD231PSZ128m , X86::VFNMADD231PSm },
- { X86::VFNMADD231PSZ128r , X86::VFNMADD231PSr },
- { X86::VFNMSUB132PDZ128m , X86::VFNMSUB132PDm },
- { X86::VFNMSUB132PDZ128r , X86::VFNMSUB132PDr },
- { X86::VFNMSUB132PSZ128m , X86::VFNMSUB132PSm },
- { X86::VFNMSUB132PSZ128r , X86::VFNMSUB132PSr },
- { X86::VFNMSUB213PDZ128m , X86::VFNMSUB213PDm },
- { X86::VFNMSUB213PDZ128r , X86::VFNMSUB213PDr },
- { X86::VFNMSUB213PSZ128m , X86::VFNMSUB213PSm },
- { X86::VFNMSUB213PSZ128r , X86::VFNMSUB213PSr },
- { X86::VFNMSUB231PDZ128m , X86::VFNMSUB231PDm },
- { X86::VFNMSUB231PDZ128r , X86::VFNMSUB231PDr },
- { X86::VFNMSUB231PSZ128m , X86::VFNMSUB231PSm },
- { X86::VFNMSUB231PSZ128r , X86::VFNMSUB231PSr },
- { X86::VMAXCPDZ128rm , X86::VMAXCPDrm },
- { X86::VMAXCPDZ128rr , X86::VMAXCPDrr },
- { X86::VMAXCPSZ128rm , X86::VMAXCPSrm },
- { X86::VMAXCPSZ128rr , X86::VMAXCPSrr },
- { X86::VMAXPDZ128rm , X86::VMAXPDrm },
- { X86::VMAXPDZ128rr , X86::VMAXPDrr },
- { X86::VMAXPSZ128rm , X86::VMAXPSrm },
- { X86::VMAXPSZ128rr , X86::VMAXPSrr },
- { X86::VMINCPDZ128rm , X86::VMINCPDrm },
- { X86::VMINCPDZ128rr , X86::VMINCPDrr },
- { X86::VMINCPSZ128rm , X86::VMINCPSrm },
- { X86::VMINCPSZ128rr , X86::VMINCPSrr },
- { X86::VMINPDZ128rm , X86::VMINPDrm },
- { X86::VMINPDZ128rr , X86::VMINPDrr },
- { X86::VMINPSZ128rm , X86::VMINPSrm },
- { X86::VMINPSZ128rr , X86::VMINPSrr },
- { X86::VMOVAPDZ128mr , X86::VMOVAPDmr },
- { X86::VMOVAPDZ128rm , X86::VMOVAPDrm },
- { X86::VMOVAPDZ128rr , X86::VMOVAPDrr },
- { X86::VMOVAPDZ128rr_REV , X86::VMOVAPDrr_REV },
- { X86::VMOVAPSZ128mr , X86::VMOVAPSmr },
- { X86::VMOVAPSZ128rm , X86::VMOVAPSrm },
- { X86::VMOVAPSZ128rr , X86::VMOVAPSrr },
- { X86::VMOVAPSZ128rr_REV , X86::VMOVAPSrr_REV },
- { X86::VMOVDDUPZ128rm , X86::VMOVDDUPrm },
- { X86::VMOVDDUPZ128rr , X86::VMOVDDUPrr },
- { X86::VMOVDQA32Z128mr , X86::VMOVDQAmr },
- { X86::VMOVDQA32Z128rm , X86::VMOVDQArm },
- { X86::VMOVDQA32Z128rr , X86::VMOVDQArr },
- { X86::VMOVDQA32Z128rr_REV , X86::VMOVDQArr_REV },
- { X86::VMOVDQA64Z128mr , X86::VMOVDQAmr },
- { X86::VMOVDQA64Z128rm , X86::VMOVDQArm },
- { X86::VMOVDQA64Z128rr , X86::VMOVDQArr },
- { X86::VMOVDQA64Z128rr_REV , X86::VMOVDQArr_REV },
- { X86::VMOVDQU16Z128mr , X86::VMOVDQUmr },
- { X86::VMOVDQU16Z128rm , X86::VMOVDQUrm },
- { X86::VMOVDQU16Z128rr , X86::VMOVDQUrr },
- { X86::VMOVDQU16Z128rr_REV , X86::VMOVDQUrr_REV },
- { X86::VMOVDQU32Z128mr , X86::VMOVDQUmr },
- { X86::VMOVDQU32Z128rm , X86::VMOVDQUrm },
- { X86::VMOVDQU32Z128rr , X86::VMOVDQUrr },
- { X86::VMOVDQU32Z128rr_REV , X86::VMOVDQUrr_REV },
- { X86::VMOVDQU64Z128mr , X86::VMOVDQUmr },
- { X86::VMOVDQU64Z128rm , X86::VMOVDQUrm },
- { X86::VMOVDQU64Z128rr , X86::VMOVDQUrr },
- { X86::VMOVDQU64Z128rr_REV , X86::VMOVDQUrr_REV },
- { X86::VMOVDQU8Z128mr , X86::VMOVDQUmr },
- { X86::VMOVDQU8Z128rm , X86::VMOVDQUrm },
- { X86::VMOVDQU8Z128rr , X86::VMOVDQUrr },
- { X86::VMOVDQU8Z128rr_REV , X86::VMOVDQUrr_REV },
- { X86::VMOVHPDZ128mr , X86::VMOVHPDmr },
- { X86::VMOVHPDZ128rm , X86::VMOVHPDrm },
- { X86::VMOVHPSZ128mr , X86::VMOVHPSmr },
- { X86::VMOVHPSZ128rm , X86::VMOVHPSrm },
- { X86::VMOVLPDZ128mr , X86::VMOVLPDmr },
- { X86::VMOVLPDZ128rm , X86::VMOVLPDrm },
- { X86::VMOVLPSZ128mr , X86::VMOVLPSmr },
- { X86::VMOVLPSZ128rm , X86::VMOVLPSrm },
- { X86::VMOVNTDQAZ128rm , X86::VMOVNTDQArm },
- { X86::VMOVNTDQZ128mr , X86::VMOVNTDQmr },
- { X86::VMOVNTPDZ128mr , X86::VMOVNTPDmr },
- { X86::VMOVNTPSZ128mr , X86::VMOVNTPSmr },
- { X86::VMOVSHDUPZ128rm , X86::VMOVSHDUPrm },
- { X86::VMOVSHDUPZ128rr , X86::VMOVSHDUPrr },
- { X86::VMOVSLDUPZ128rm , X86::VMOVSLDUPrm },
- { X86::VMOVSLDUPZ128rr , X86::VMOVSLDUPrr },
- { X86::VMOVUPDZ128mr , X86::VMOVUPDmr },
- { X86::VMOVUPDZ128rm , X86::VMOVUPDrm },
- { X86::VMOVUPDZ128rr , X86::VMOVUPDrr },
- { X86::VMOVUPDZ128rr_REV , X86::VMOVUPDrr_REV },
- { X86::VMOVUPSZ128mr , X86::VMOVUPSmr },
- { X86::VMOVUPSZ128rm , X86::VMOVUPSrm },
- { X86::VMOVUPSZ128rr , X86::VMOVUPSrr },
- { X86::VMOVUPSZ128rr_REV , X86::VMOVUPSrr_REV },
- { X86::VMULPDZ128rm , X86::VMULPDrm },
- { X86::VMULPDZ128rr , X86::VMULPDrr },
- { X86::VMULPSZ128rm , X86::VMULPSrm },
- { X86::VMULPSZ128rr , X86::VMULPSrr },
- { X86::VORPDZ128rm , X86::VORPDrm },
- { X86::VORPDZ128rr , X86::VORPDrr },
- { X86::VORPSZ128rm , X86::VORPSrm },
- { X86::VORPSZ128rr , X86::VORPSrr },
- { X86::VPABSBZ128rm , X86::VPABSBrm },
- { X86::VPABSBZ128rr , X86::VPABSBrr },
- { X86::VPABSDZ128rm , X86::VPABSDrm },
- { X86::VPABSDZ128rr , X86::VPABSDrr },
- { X86::VPABSWZ128rm , X86::VPABSWrm },
- { X86::VPABSWZ128rr , X86::VPABSWrr },
- { X86::VPACKSSDWZ128rm , X86::VPACKSSDWrm },
- { X86::VPACKSSDWZ128rr , X86::VPACKSSDWrr },
- { X86::VPACKSSWBZ128rm , X86::VPACKSSWBrm },
- { X86::VPACKSSWBZ128rr , X86::VPACKSSWBrr },
- { X86::VPACKUSDWZ128rm , X86::VPACKUSDWrm },
- { X86::VPACKUSDWZ128rr , X86::VPACKUSDWrr },
- { X86::VPACKUSWBZ128rm , X86::VPACKUSWBrm },
- { X86::VPACKUSWBZ128rr , X86::VPACKUSWBrr },
- { X86::VPADDBZ128rm , X86::VPADDBrm },
- { X86::VPADDBZ128rr , X86::VPADDBrr },
- { X86::VPADDDZ128rm , X86::VPADDDrm },
- { X86::VPADDDZ128rr , X86::VPADDDrr },
- { X86::VPADDQZ128rm , X86::VPADDQrm },
- { X86::VPADDQZ128rr , X86::VPADDQrr },
- { X86::VPADDSBZ128rm , X86::VPADDSBrm },
- { X86::VPADDSBZ128rr , X86::VPADDSBrr },
- { X86::VPADDSWZ128rm , X86::VPADDSWrm },
- { X86::VPADDSWZ128rr , X86::VPADDSWrr },
- { X86::VPADDUSBZ128rm , X86::VPADDUSBrm },
- { X86::VPADDUSBZ128rr , X86::VPADDUSBrr },
- { X86::VPADDUSWZ128rm , X86::VPADDUSWrm },
- { X86::VPADDUSWZ128rr , X86::VPADDUSWrr },
- { X86::VPADDWZ128rm , X86::VPADDWrm },
- { X86::VPADDWZ128rr , X86::VPADDWrr },
- { X86::VPALIGNRZ128rmi , X86::VPALIGNRrmi },
- { X86::VPALIGNRZ128rri , X86::VPALIGNRrri },
- { X86::VPANDDZ128rm , X86::VPANDrm },
- { X86::VPANDDZ128rr , X86::VPANDrr },
- { X86::VPANDQZ128rm , X86::VPANDrm },
- { X86::VPANDQZ128rr , X86::VPANDrr },
- { X86::VPAVGBZ128rm , X86::VPAVGBrm },
- { X86::VPAVGBZ128rr , X86::VPAVGBrr },
- { X86::VPAVGWZ128rm , X86::VPAVGWrm },
- { X86::VPAVGWZ128rr , X86::VPAVGWrr },
- { X86::VPBROADCASTBZ128m , X86::VPBROADCASTBrm },
- { X86::VPBROADCASTBZ128r , X86::VPBROADCASTBrr },
- { X86::VPBROADCASTDZ128m , X86::VPBROADCASTDrm },
- { X86::VPBROADCASTDZ128r , X86::VPBROADCASTDrr },
- { X86::VPBROADCASTQZ128m , X86::VPBROADCASTQrm },
- { X86::VPBROADCASTQZ128r , X86::VPBROADCASTQrr },
- { X86::VPBROADCASTWZ128m , X86::VPBROADCASTWrm },
- { X86::VPBROADCASTWZ128r , X86::VPBROADCASTWrr },
- { X86::VPERMILPDZ128mi , X86::VPERMILPDmi },
- { X86::VPERMILPDZ128ri , X86::VPERMILPDri },
- { X86::VPERMILPDZ128rm , X86::VPERMILPDrm },
- { X86::VPERMILPDZ128rr , X86::VPERMILPDrr },
- { X86::VPERMILPSZ128mi , X86::VPERMILPSmi },
- { X86::VPERMILPSZ128ri , X86::VPERMILPSri },
- { X86::VPERMILPSZ128rm , X86::VPERMILPSrm },
- { X86::VPERMILPSZ128rr , X86::VPERMILPSrr },
- { X86::VPMADDUBSWZ128rm , X86::VPMADDUBSWrm },
- { X86::VPMADDUBSWZ128rr , X86::VPMADDUBSWrr },
- { X86::VPMADDWDZ128rm , X86::VPMADDWDrm },
- { X86::VPMADDWDZ128rr , X86::VPMADDWDrr },
- { X86::VPMAXSBZ128rm , X86::VPMAXSBrm },
- { X86::VPMAXSBZ128rr , X86::VPMAXSBrr },
- { X86::VPMAXSDZ128rm , X86::VPMAXSDrm },
- { X86::VPMAXSDZ128rr , X86::VPMAXSDrr },
- { X86::VPMAXSWZ128rm , X86::VPMAXSWrm },
- { X86::VPMAXSWZ128rr , X86::VPMAXSWrr },
- { X86::VPMAXUBZ128rm , X86::VPMAXUBrm },
- { X86::VPMAXUBZ128rr , X86::VPMAXUBrr },
- { X86::VPMAXUDZ128rm , X86::VPMAXUDrm },
- { X86::VPMAXUDZ128rr , X86::VPMAXUDrr },
- { X86::VPMAXUWZ128rm , X86::VPMAXUWrm },
- { X86::VPMAXUWZ128rr , X86::VPMAXUWrr },
- { X86::VPMINSBZ128rm , X86::VPMINSBrm },
- { X86::VPMINSBZ128rr , X86::VPMINSBrr },
- { X86::VPMINSDZ128rm , X86::VPMINSDrm },
- { X86::VPMINSDZ128rr , X86::VPMINSDrr },
- { X86::VPMINSWZ128rm , X86::VPMINSWrm },
- { X86::VPMINSWZ128rr , X86::VPMINSWrr },
- { X86::VPMINUBZ128rm , X86::VPMINUBrm },
- { X86::VPMINUBZ128rr , X86::VPMINUBrr },
- { X86::VPMINUDZ128rm , X86::VPMINUDrm },
- { X86::VPMINUDZ128rr , X86::VPMINUDrr },
- { X86::VPMINUWZ128rm , X86::VPMINUWrm },
- { X86::VPMINUWZ128rr , X86::VPMINUWrr },
- { X86::VPMOVSXBDZ128rm , X86::VPMOVSXBDrm },
- { X86::VPMOVSXBDZ128rr , X86::VPMOVSXBDrr },
- { X86::VPMOVSXBQZ128rm , X86::VPMOVSXBQrm },
- { X86::VPMOVSXBQZ128rr , X86::VPMOVSXBQrr },
- { X86::VPMOVSXBWZ128rm , X86::VPMOVSXBWrm },
- { X86::VPMOVSXBWZ128rr , X86::VPMOVSXBWrr },
- { X86::VPMOVSXDQZ128rm , X86::VPMOVSXDQrm },
- { X86::VPMOVSXDQZ128rr , X86::VPMOVSXDQrr },
- { X86::VPMOVSXWDZ128rm , X86::VPMOVSXWDrm },
- { X86::VPMOVSXWDZ128rr , X86::VPMOVSXWDrr },
- { X86::VPMOVSXWQZ128rm , X86::VPMOVSXWQrm },
- { X86::VPMOVSXWQZ128rr , X86::VPMOVSXWQrr },
- { X86::VPMOVZXBDZ128rm , X86::VPMOVZXBDrm },
- { X86::VPMOVZXBDZ128rr , X86::VPMOVZXBDrr },
- { X86::VPMOVZXBQZ128rm , X86::VPMOVZXBQrm },
- { X86::VPMOVZXBQZ128rr , X86::VPMOVZXBQrr },
- { X86::VPMOVZXBWZ128rm , X86::VPMOVZXBWrm },
- { X86::VPMOVZXBWZ128rr , X86::VPMOVZXBWrr },
- { X86::VPMOVZXDQZ128rm , X86::VPMOVZXDQrm },
- { X86::VPMOVZXDQZ128rr , X86::VPMOVZXDQrr },
- { X86::VPMOVZXWDZ128rm , X86::VPMOVZXWDrm },
- { X86::VPMOVZXWDZ128rr , X86::VPMOVZXWDrr },
- { X86::VPMOVZXWQZ128rm , X86::VPMOVZXWQrm },
- { X86::VPMOVZXWQZ128rr , X86::VPMOVZXWQrr },
- { X86::VPMULDQZ128rm , X86::VPMULDQrm },
- { X86::VPMULDQZ128rr , X86::VPMULDQrr },
- { X86::VPMULHRSWZ128rm , X86::VPMULHRSWrm },
- { X86::VPMULHRSWZ128rr , X86::VPMULHRSWrr },
- { X86::VPMULHUWZ128rm , X86::VPMULHUWrm },
- { X86::VPMULHUWZ128rr , X86::VPMULHUWrr },
- { X86::VPMULHWZ128rm , X86::VPMULHWrm },
- { X86::VPMULHWZ128rr , X86::VPMULHWrr },
- { X86::VPMULLDZ128rm , X86::VPMULLDrm },
- { X86::VPMULLDZ128rr , X86::VPMULLDrr },
- { X86::VPMULLWZ128rm , X86::VPMULLWrm },
- { X86::VPMULLWZ128rr , X86::VPMULLWrr },
- { X86::VPMULUDQZ128rm , X86::VPMULUDQrm },
- { X86::VPMULUDQZ128rr , X86::VPMULUDQrr },
- { X86::VPORDZ128rm , X86::VPORrm },
- { X86::VPORDZ128rr , X86::VPORrr },
- { X86::VPORQZ128rm , X86::VPORrm },
- { X86::VPORQZ128rr , X86::VPORrr },
- { X86::VPSADBWZ128rm , X86::VPSADBWrm },
- { X86::VPSADBWZ128rr , X86::VPSADBWrr },
- { X86::VPSHUFBZ128rm , X86::VPSHUFBrm },
- { X86::VPSHUFBZ128rr , X86::VPSHUFBrr },
- { X86::VPSHUFDZ128mi , X86::VPSHUFDmi },
- { X86::VPSHUFDZ128ri , X86::VPSHUFDri },
- { X86::VPSHUFHWZ128mi , X86::VPSHUFHWmi },
- { X86::VPSHUFHWZ128ri , X86::VPSHUFHWri },
- { X86::VPSHUFLWZ128mi , X86::VPSHUFLWmi },
- { X86::VPSHUFLWZ128ri , X86::VPSHUFLWri },
- { X86::VPSLLDQZ128rr , X86::VPSLLDQri },
- { X86::VPSLLDZ128ri , X86::VPSLLDri },
- { X86::VPSLLDZ128rm , X86::VPSLLDrm },
- { X86::VPSLLDZ128rr , X86::VPSLLDrr },
- { X86::VPSLLQZ128ri , X86::VPSLLQri },
- { X86::VPSLLQZ128rm , X86::VPSLLQrm },
- { X86::VPSLLQZ128rr , X86::VPSLLQrr },
- { X86::VPSLLVDZ128rm , X86::VPSLLVDrm },
- { X86::VPSLLVDZ128rr , X86::VPSLLVDrr },
- { X86::VPSLLVQZ128rm , X86::VPSLLVQrm },
- { X86::VPSLLVQZ128rr , X86::VPSLLVQrr },
- { X86::VPSLLWZ128ri , X86::VPSLLWri },
- { X86::VPSLLWZ128rm , X86::VPSLLWrm },
- { X86::VPSLLWZ128rr , X86::VPSLLWrr },
- { X86::VPSRADZ128ri , X86::VPSRADri },
- { X86::VPSRADZ128rm , X86::VPSRADrm },
- { X86::VPSRADZ128rr , X86::VPSRADrr },
- { X86::VPSRAVDZ128rm , X86::VPSRAVDrm },
- { X86::VPSRAVDZ128rr , X86::VPSRAVDrr },
- { X86::VPSRAWZ128ri , X86::VPSRAWri },
- { X86::VPSRAWZ128rm , X86::VPSRAWrm },
- { X86::VPSRAWZ128rr , X86::VPSRAWrr },
- { X86::VPSRLDQZ128rr , X86::VPSRLDQri },
- { X86::VPSRLDZ128ri , X86::VPSRLDri },
- { X86::VPSRLDZ128rm , X86::VPSRLDrm },
- { X86::VPSRLDZ128rr , X86::VPSRLDrr },
- { X86::VPSRLQZ128ri , X86::VPSRLQri },
- { X86::VPSRLQZ128rm , X86::VPSRLQrm },
- { X86::VPSRLQZ128rr , X86::VPSRLQrr },
- { X86::VPSRLVDZ128rm , X86::VPSRLVDrm },
- { X86::VPSRLVDZ128rr , X86::VPSRLVDrr },
- { X86::VPSRLVQZ128rm , X86::VPSRLVQrm },
- { X86::VPSRLVQZ128rr , X86::VPSRLVQrr },
- { X86::VPSRLWZ128ri , X86::VPSRLWri },
- { X86::VPSRLWZ128rm , X86::VPSRLWrm },
- { X86::VPSRLWZ128rr , X86::VPSRLWrr },
- { X86::VPSUBBZ128rm , X86::VPSUBBrm },
- { X86::VPSUBBZ128rr , X86::VPSUBBrr },
- { X86::VPSUBDZ128rm , X86::VPSUBDrm },
- { X86::VPSUBDZ128rr , X86::VPSUBDrr },
- { X86::VPSUBQZ128rm , X86::VPSUBQrm },
- { X86::VPSUBQZ128rr , X86::VPSUBQrr },
- { X86::VPSUBSBZ128rm , X86::VPSUBSBrm },
- { X86::VPSUBSBZ128rr , X86::VPSUBSBrr },
- { X86::VPSUBSWZ128rm , X86::VPSUBSWrm },
- { X86::VPSUBSWZ128rr , X86::VPSUBSWrr },
- { X86::VPSUBUSBZ128rm , X86::VPSUBUSBrm },
- { X86::VPSUBUSBZ128rr , X86::VPSUBUSBrr },
- { X86::VPSUBUSWZ128rm , X86::VPSUBUSWrm },
- { X86::VPSUBUSWZ128rr , X86::VPSUBUSWrr },
- { X86::VPSUBWZ128rm , X86::VPSUBWrm },
- { X86::VPSUBWZ128rr , X86::VPSUBWrr },
- { X86::VPUNPCKHBWZ128rm , X86::VPUNPCKHBWrm },
- { X86::VPUNPCKHBWZ128rr , X86::VPUNPCKHBWrr },
- { X86::VPUNPCKHDQZ128rm , X86::VPUNPCKHDQrm },
- { X86::VPUNPCKHDQZ128rr , X86::VPUNPCKHDQrr },
- { X86::VPUNPCKHQDQZ128rm , X86::VPUNPCKHQDQrm },
- { X86::VPUNPCKHQDQZ128rr , X86::VPUNPCKHQDQrr },
- { X86::VPUNPCKHWDZ128rm , X86::VPUNPCKHWDrm },
- { X86::VPUNPCKHWDZ128rr , X86::VPUNPCKHWDrr },
- { X86::VPUNPCKLBWZ128rm , X86::VPUNPCKLBWrm },
- { X86::VPUNPCKLBWZ128rr , X86::VPUNPCKLBWrr },
- { X86::VPUNPCKLDQZ128rm , X86::VPUNPCKLDQrm },
- { X86::VPUNPCKLDQZ128rr , X86::VPUNPCKLDQrr },
- { X86::VPUNPCKLQDQZ128rm , X86::VPUNPCKLQDQrm },
- { X86::VPUNPCKLQDQZ128rr , X86::VPUNPCKLQDQrr },
- { X86::VPUNPCKLWDZ128rm , X86::VPUNPCKLWDrm },
- { X86::VPUNPCKLWDZ128rr , X86::VPUNPCKLWDrr },
- { X86::VPXORDZ128rm , X86::VPXORrm },
- { X86::VPXORDZ128rr , X86::VPXORrr },
- { X86::VPXORQZ128rm , X86::VPXORrm },
- { X86::VPXORQZ128rr , X86::VPXORrr },
- { X86::VSHUFPDZ128rmi , X86::VSHUFPDrmi },
- { X86::VSHUFPDZ128rri , X86::VSHUFPDrri },
- { X86::VSHUFPSZ128rmi , X86::VSHUFPSrmi },
- { X86::VSHUFPSZ128rri , X86::VSHUFPSrri },
- { X86::VSQRTPDZ128m , X86::VSQRTPDm },
- { X86::VSQRTPDZ128r , X86::VSQRTPDr },
- { X86::VSQRTPSZ128m , X86::VSQRTPSm },
- { X86::VSQRTPSZ128r , X86::VSQRTPSr },
- { X86::VSUBPDZ128rm , X86::VSUBPDrm },
- { X86::VSUBPDZ128rr , X86::VSUBPDrr },
- { X86::VSUBPSZ128rm , X86::VSUBPSrm },
- { X86::VSUBPSZ128rr , X86::VSUBPSrr },
- { X86::VUNPCKHPDZ128rm , X86::VUNPCKHPDrm },
- { X86::VUNPCKHPDZ128rr , X86::VUNPCKHPDrr },
- { X86::VUNPCKHPSZ128rm , X86::VUNPCKHPSrm },
- { X86::VUNPCKHPSZ128rr , X86::VUNPCKHPSrr },
- { X86::VUNPCKLPDZ128rm , X86::VUNPCKLPDrm },
- { X86::VUNPCKLPDZ128rr , X86::VUNPCKLPDrr },
- { X86::VUNPCKLPSZ128rm , X86::VUNPCKLPSrm },
- { X86::VUNPCKLPSZ128rr , X86::VUNPCKLPSrr },
- { X86::VXORPDZ128rm , X86::VXORPDrm },
- { X86::VXORPDZ128rr , X86::VXORPDrr },
- { X86::VXORPSZ128rm , X86::VXORPSrm },
- { X86::VXORPSZ128rr , X86::VXORPSrr },
-};
-
-
-// X86 EVEX encoded instructions that have a VEX 256 encoding
-// (table format: <EVEX opcode, VEX-256 opcode>).
- static const X86EvexToVexCompressTableEntry X86EvexToVex256CompressTable[] = {
- { X86::VADDPDZ256rm , X86::VADDPDYrm },
- { X86::VADDPDZ256rr , X86::VADDPDYrr },
- { X86::VADDPSZ256rm , X86::VADDPSYrm },
- { X86::VADDPSZ256rr , X86::VADDPSYrr },
- { X86::VANDNPDZ256rm , X86::VANDNPDYrm },
- { X86::VANDNPDZ256rr , X86::VANDNPDYrr },
- { X86::VANDNPSZ256rm , X86::VANDNPSYrm },
- { X86::VANDNPSZ256rr , X86::VANDNPSYrr },
- { X86::VANDPDZ256rm , X86::VANDPDYrm },
- { X86::VANDPDZ256rr , X86::VANDPDYrr },
- { X86::VANDPSZ256rm , X86::VANDPSYrm },
- { X86::VANDPSZ256rr , X86::VANDPSYrr },
- { X86::VBROADCASTSDZ256m , X86::VBROADCASTSDYrm },
- { X86::VBROADCASTSDZ256r , X86::VBROADCASTSDYrr },
- { X86::VBROADCASTSDZ256r_s , X86::VBROADCASTSDYrr },
- { X86::VBROADCASTSSZ256m , X86::VBROADCASTSSYrm },
- { X86::VBROADCASTSSZ256r , X86::VBROADCASTSSYrr },
- { X86::VBROADCASTSSZ256r_s , X86::VBROADCASTSSYrr },
- { X86::VCVTDQ2PDZ256rm , X86::VCVTDQ2PDYrm },
- { X86::VCVTDQ2PDZ256rr , X86::VCVTDQ2PDYrr },
- { X86::VCVTDQ2PSZ256rm , X86::VCVTDQ2PSYrm },
- { X86::VCVTDQ2PSZ256rr , X86::VCVTDQ2PSYrr },
- { X86::VCVTPD2DQZ256rm , X86::VCVTPD2DQYrm },
- { X86::VCVTPD2DQZ256rr , X86::VCVTPD2DQYrr },
- { X86::VCVTPD2PSZ256rm , X86::VCVTPD2PSYrm },
- { X86::VCVTPD2PSZ256rr , X86::VCVTPD2PSYrr },
- { X86::VCVTPH2PSZ256rm , X86::VCVTPH2PSYrm },
- { X86::VCVTPH2PSZ256rr , X86::VCVTPH2PSYrr },
- { X86::VCVTPS2DQZ256rm , X86::VCVTPS2DQYrm },
- { X86::VCVTPS2DQZ256rr , X86::VCVTPS2DQYrr },
- { X86::VCVTPS2PDZ256rm , X86::VCVTPS2PDYrm },
- { X86::VCVTPS2PDZ256rr , X86::VCVTPS2PDYrr },
- { X86::VCVTPS2PHZ256mr , X86::VCVTPS2PHYmr },
- { X86::VCVTPS2PHZ256rr , X86::VCVTPS2PHYrr },
- { X86::VCVTTPD2DQZ256rm , X86::VCVTTPD2DQYrm },
- { X86::VCVTTPD2DQZ256rr , X86::VCVTTPD2DQYrr },
- { X86::VCVTTPS2DQZ256rm , X86::VCVTTPS2DQYrm },
- { X86::VCVTTPS2DQZ256rr , X86::VCVTTPS2DQYrr },
- { X86::VDIVPDZ256rm , X86::VDIVPDYrm },
- { X86::VDIVPDZ256rr , X86::VDIVPDYrr },
- { X86::VDIVPSZ256rm , X86::VDIVPSYrm },
- { X86::VDIVPSZ256rr , X86::VDIVPSYrr },
- { X86::VEXTRACTF32x4Z256mr , X86::VEXTRACTF128mr },
- { X86::VEXTRACTF64x2Z256mr , X86::VEXTRACTF128mr },
- { X86::VEXTRACTF32x4Z256rr , X86::VEXTRACTF128rr },
- { X86::VEXTRACTF64x2Z256rr , X86::VEXTRACTF128rr },
- { X86::VEXTRACTI32x4Z256mr , X86::VEXTRACTI128mr },
- { X86::VEXTRACTI64x2Z256mr , X86::VEXTRACTI128mr },
- { X86::VEXTRACTI32x4Z256rr , X86::VEXTRACTI128rr },
- { X86::VEXTRACTI64x2Z256rr , X86::VEXTRACTI128rr },
- { X86::VFMADD132PDZ256m , X86::VFMADD132PDYm },
- { X86::VFMADD132PDZ256r , X86::VFMADD132PDYr },
- { X86::VFMADD132PSZ256m , X86::VFMADD132PSYm },
- { X86::VFMADD132PSZ256r , X86::VFMADD132PSYr },
- { X86::VFMADD213PDZ256m , X86::VFMADD213PDYm },
- { X86::VFMADD213PDZ256r , X86::VFMADD213PDYr },
- { X86::VFMADD213PSZ256m , X86::VFMADD213PSYm },
- { X86::VFMADD213PSZ256r , X86::VFMADD213PSYr },
- { X86::VFMADD231PDZ256m , X86::VFMADD231PDYm },
- { X86::VFMADD231PDZ256r , X86::VFMADD231PDYr },
- { X86::VFMADD231PSZ256m , X86::VFMADD231PSYm },
- { X86::VFMADD231PSZ256r , X86::VFMADD231PSYr },
- { X86::VFMADDSUB132PDZ256m , X86::VFMADDSUB132PDYm },
- { X86::VFMADDSUB132PDZ256r , X86::VFMADDSUB132PDYr },
- { X86::VFMADDSUB132PSZ256m , X86::VFMADDSUB132PSYm },
- { X86::VFMADDSUB132PSZ256r , X86::VFMADDSUB132PSYr },
- { X86::VFMADDSUB213PDZ256m , X86::VFMADDSUB213PDYm },
- { X86::VFMADDSUB213PDZ256r , X86::VFMADDSUB213PDYr },
- { X86::VFMADDSUB213PSZ256m , X86::VFMADDSUB213PSYm },
- { X86::VFMADDSUB213PSZ256r , X86::VFMADDSUB213PSYr },
- { X86::VFMADDSUB231PDZ256m , X86::VFMADDSUB231PDYm },
- { X86::VFMADDSUB231PDZ256r , X86::VFMADDSUB231PDYr },
- { X86::VFMADDSUB231PSZ256m , X86::VFMADDSUB231PSYm },
- { X86::VFMADDSUB231PSZ256r , X86::VFMADDSUB231PSYr },
- { X86::VFMSUB132PDZ256m , X86::VFMSUB132PDYm },
- { X86::VFMSUB132PDZ256r , X86::VFMSUB132PDYr },
- { X86::VFMSUB132PSZ256m , X86::VFMSUB132PSYm },
- { X86::VFMSUB132PSZ256r , X86::VFMSUB132PSYr },
- { X86::VFMSUB213PDZ256m , X86::VFMSUB213PDYm },
- { X86::VFMSUB213PDZ256r , X86::VFMSUB213PDYr },
- { X86::VFMSUB213PSZ256m , X86::VFMSUB213PSYm },
- { X86::VFMSUB213PSZ256r , X86::VFMSUB213PSYr },
- { X86::VFMSUB231PDZ256m , X86::VFMSUB231PDYm },
- { X86::VFMSUB231PDZ256r , X86::VFMSUB231PDYr },
- { X86::VFMSUB231PSZ256m , X86::VFMSUB231PSYm },
- { X86::VFMSUB231PSZ256r , X86::VFMSUB231PSYr },
- { X86::VFMSUBADD132PDZ256m , X86::VFMSUBADD132PDYm },
- { X86::VFMSUBADD132PDZ256r , X86::VFMSUBADD132PDYr },
- { X86::VFMSUBADD132PSZ256m , X86::VFMSUBADD132PSYm },
- { X86::VFMSUBADD132PSZ256r , X86::VFMSUBADD132PSYr },
- { X86::VFMSUBADD213PDZ256m , X86::VFMSUBADD213PDYm },
- { X86::VFMSUBADD213PDZ256r , X86::VFMSUBADD213PDYr },
- { X86::VFMSUBADD213PSZ256m , X86::VFMSUBADD213PSYm },
- { X86::VFMSUBADD213PSZ256r , X86::VFMSUBADD213PSYr },
- { X86::VFMSUBADD231PDZ256m , X86::VFMSUBADD231PDYm },
- { X86::VFMSUBADD231PDZ256r , X86::VFMSUBADD231PDYr },
- { X86::VFMSUBADD231PSZ256m , X86::VFMSUBADD231PSYm },
- { X86::VFMSUBADD231PSZ256r , X86::VFMSUBADD231PSYr },
- { X86::VFNMADD132PDZ256m , X86::VFNMADD132PDYm },
- { X86::VFNMADD132PDZ256r , X86::VFNMADD132PDYr },
- { X86::VFNMADD132PSZ256m , X86::VFNMADD132PSYm },
- { X86::VFNMADD132PSZ256r , X86::VFNMADD132PSYr },
- { X86::VFNMADD213PDZ256m , X86::VFNMADD213PDYm },
- { X86::VFNMADD213PDZ256r , X86::VFNMADD213PDYr },
- { X86::VFNMADD213PSZ256m , X86::VFNMADD213PSYm },
- { X86::VFNMADD213PSZ256r , X86::VFNMADD213PSYr },
- { X86::VFNMADD231PDZ256m , X86::VFNMADD231PDYm },
- { X86::VFNMADD231PDZ256r , X86::VFNMADD231PDYr },
- { X86::VFNMADD231PSZ256m , X86::VFNMADD231PSYm },
- { X86::VFNMADD231PSZ256r , X86::VFNMADD231PSYr },
- { X86::VFNMSUB132PDZ256m , X86::VFNMSUB132PDYm },
- { X86::VFNMSUB132PDZ256r , X86::VFNMSUB132PDYr },
- { X86::VFNMSUB132PSZ256m , X86::VFNMSUB132PSYm },
- { X86::VFNMSUB132PSZ256r , X86::VFNMSUB132PSYr },
- { X86::VFNMSUB213PDZ256m , X86::VFNMSUB213PDYm },
- { X86::VFNMSUB213PDZ256r , X86::VFNMSUB213PDYr },
- { X86::VFNMSUB213PSZ256m , X86::VFNMSUB213PSYm },
- { X86::VFNMSUB213PSZ256r , X86::VFNMSUB213PSYr },
- { X86::VFNMSUB231PDZ256m , X86::VFNMSUB231PDYm },
- { X86::VFNMSUB231PDZ256r , X86::VFNMSUB231PDYr },
- { X86::VFNMSUB231PSZ256m , X86::VFNMSUB231PSYm },
- { X86::VFNMSUB231PSZ256r , X86::VFNMSUB231PSYr },
- { X86::VINSERTF32x4Z256rm , X86::VINSERTF128rm },
- { X86::VINSERTF64x2Z256rm , X86::VINSERTF128rm },
- { X86::VINSERTF32x4Z256rr , X86::VINSERTF128rr },
- { X86::VINSERTF64x2Z256rr , X86::VINSERTF128rr },
- { X86::VINSERTI32x4Z256rm , X86::VINSERTI128rm },
- { X86::VINSERTI64x2Z256rm , X86::VINSERTI128rm },
- { X86::VINSERTI32x4Z256rr , X86::VINSERTI128rr },
- { X86::VINSERTI64x2Z256rr , X86::VINSERTI128rr },
- { X86::VMAXCPDZ256rm , X86::VMAXCPDYrm },
- { X86::VMAXCPDZ256rr , X86::VMAXCPDYrr },
- { X86::VMAXCPSZ256rm , X86::VMAXCPSYrm },
- { X86::VMAXCPSZ256rr , X86::VMAXCPSYrr },
- { X86::VMAXPDZ256rm , X86::VMAXPDYrm },
- { X86::VMAXPDZ256rr , X86::VMAXPDYrr },
- { X86::VMAXPSZ256rm , X86::VMAXPSYrm },
- { X86::VMAXPSZ256rr , X86::VMAXPSYrr },
- { X86::VMINCPDZ256rm , X86::VMINCPDYrm },
- { X86::VMINCPDZ256rr , X86::VMINCPDYrr },
- { X86::VMINCPSZ256rm , X86::VMINCPSYrm },
- { X86::VMINCPSZ256rr , X86::VMINCPSYrr },
- { X86::VMINPDZ256rm , X86::VMINPDYrm },
- { X86::VMINPDZ256rr , X86::VMINPDYrr },
- { X86::VMINPSZ256rm , X86::VMINPSYrm },
- { X86::VMINPSZ256rr , X86::VMINPSYrr },
- { X86::VMOVAPDZ256mr , X86::VMOVAPDYmr },
- { X86::VMOVAPDZ256rm , X86::VMOVAPDYrm },
- { X86::VMOVAPDZ256rr , X86::VMOVAPDYrr },
- { X86::VMOVAPDZ256rr_REV , X86::VMOVAPDYrr_REV },
- { X86::VMOVAPSZ256mr , X86::VMOVAPSYmr },
- { X86::VMOVAPSZ256rm , X86::VMOVAPSYrm },
- { X86::VMOVAPSZ256rr , X86::VMOVAPSYrr },
- { X86::VMOVAPSZ256rr_REV , X86::VMOVAPSYrr_REV },
- { X86::VMOVDDUPZ256rm , X86::VMOVDDUPYrm },
- { X86::VMOVDDUPZ256rr , X86::VMOVDDUPYrr },
- { X86::VMOVDQA32Z256mr , X86::VMOVDQAYmr },
- { X86::VMOVDQA32Z256rm , X86::VMOVDQAYrm },
- { X86::VMOVDQA32Z256rr , X86::VMOVDQAYrr },
- { X86::VMOVDQA32Z256rr_REV , X86::VMOVDQAYrr_REV },
- { X86::VMOVDQA64Z256mr , X86::VMOVDQAYmr },
- { X86::VMOVDQA64Z256rm , X86::VMOVDQAYrm },
- { X86::VMOVDQA64Z256rr , X86::VMOVDQAYrr },
- { X86::VMOVDQA64Z256rr_REV , X86::VMOVDQAYrr_REV },
- { X86::VMOVDQU16Z256mr , X86::VMOVDQUYmr },
- { X86::VMOVDQU16Z256rm , X86::VMOVDQUYrm },
- { X86::VMOVDQU16Z256rr , X86::VMOVDQUYrr },
- { X86::VMOVDQU16Z256rr_REV , X86::VMOVDQUYrr_REV },
- { X86::VMOVDQU32Z256mr , X86::VMOVDQUYmr },
- { X86::VMOVDQU32Z256rm , X86::VMOVDQUYrm },
- { X86::VMOVDQU32Z256rr , X86::VMOVDQUYrr },
- { X86::VMOVDQU32Z256rr_REV , X86::VMOVDQUYrr_REV },
- { X86::VMOVDQU64Z256mr , X86::VMOVDQUYmr },
- { X86::VMOVDQU64Z256rm , X86::VMOVDQUYrm },
- { X86::VMOVDQU64Z256rr , X86::VMOVDQUYrr },
- { X86::VMOVDQU64Z256rr_REV , X86::VMOVDQUYrr_REV },
- { X86::VMOVDQU8Z256mr , X86::VMOVDQUYmr },
- { X86::VMOVDQU8Z256rm , X86::VMOVDQUYrm },
- { X86::VMOVDQU8Z256rr , X86::VMOVDQUYrr },
- { X86::VMOVDQU8Z256rr_REV , X86::VMOVDQUYrr_REV },
- { X86::VMOVNTDQAZ256rm , X86::VMOVNTDQAYrm },
- { X86::VMOVNTDQZ256mr , X86::VMOVNTDQYmr },
- { X86::VMOVNTPDZ256mr , X86::VMOVNTPDYmr },
- { X86::VMOVNTPSZ256mr , X86::VMOVNTPSYmr },
- { X86::VMOVSHDUPZ256rm , X86::VMOVSHDUPYrm },
- { X86::VMOVSHDUPZ256rr , X86::VMOVSHDUPYrr },
- { X86::VMOVSLDUPZ256rm , X86::VMOVSLDUPYrm },
- { X86::VMOVSLDUPZ256rr , X86::VMOVSLDUPYrr },
- { X86::VMOVUPDZ256mr , X86::VMOVUPDYmr },
- { X86::VMOVUPDZ256rm , X86::VMOVUPDYrm },
- { X86::VMOVUPDZ256rr , X86::VMOVUPDYrr },
- { X86::VMOVUPDZ256rr_REV , X86::VMOVUPDYrr_REV },
- { X86::VMOVUPSZ256mr , X86::VMOVUPSYmr },
- { X86::VMOVUPSZ256rm , X86::VMOVUPSYrm },
- { X86::VMOVUPSZ256rr , X86::VMOVUPSYrr },
- { X86::VMOVUPSZ256rr_REV , X86::VMOVUPSYrr_REV },
- { X86::VMULPDZ256rm , X86::VMULPDYrm },
- { X86::VMULPDZ256rr , X86::VMULPDYrr },
- { X86::VMULPSZ256rm , X86::VMULPSYrm },
- { X86::VMULPSZ256rr , X86::VMULPSYrr },
- { X86::VORPDZ256rm , X86::VORPDYrm },
- { X86::VORPDZ256rr , X86::VORPDYrr },
- { X86::VORPSZ256rm , X86::VORPSYrm },
- { X86::VORPSZ256rr , X86::VORPSYrr },
- { X86::VPABSBZ256rm , X86::VPABSBYrm },
- { X86::VPABSBZ256rr , X86::VPABSBYrr },
- { X86::VPABSDZ256rm , X86::VPABSDYrm },
- { X86::VPABSDZ256rr , X86::VPABSDYrr },
- { X86::VPABSWZ256rm , X86::VPABSWYrm },
- { X86::VPABSWZ256rr , X86::VPABSWYrr },
- { X86::VPACKSSDWZ256rm , X86::VPACKSSDWYrm },
- { X86::VPACKSSDWZ256rr , X86::VPACKSSDWYrr },
- { X86::VPACKSSWBZ256rm , X86::VPACKSSWBYrm },
- { X86::VPACKSSWBZ256rr , X86::VPACKSSWBYrr },
- { X86::VPACKUSDWZ256rm , X86::VPACKUSDWYrm },
- { X86::VPACKUSDWZ256rr , X86::VPACKUSDWYrr },
- { X86::VPACKUSWBZ256rm , X86::VPACKUSWBYrm },
- { X86::VPACKUSWBZ256rr , X86::VPACKUSWBYrr },
- { X86::VPADDBZ256rm , X86::VPADDBYrm },
- { X86::VPADDBZ256rr , X86::VPADDBYrr },
- { X86::VPADDDZ256rm , X86::VPADDDYrm },
- { X86::VPADDDZ256rr , X86::VPADDDYrr },
- { X86::VPADDQZ256rm , X86::VPADDQYrm },
- { X86::VPADDQZ256rr , X86::VPADDQYrr },
- { X86::VPADDSBZ256rm , X86::VPADDSBYrm },
- { X86::VPADDSBZ256rr , X86::VPADDSBYrr },
- { X86::VPADDSWZ256rm , X86::VPADDSWYrm },
- { X86::VPADDSWZ256rr , X86::VPADDSWYrr },
- { X86::VPADDUSBZ256rm , X86::VPADDUSBYrm },
- { X86::VPADDUSBZ256rr , X86::VPADDUSBYrr },
- { X86::VPADDUSWZ256rm , X86::VPADDUSWYrm },
- { X86::VPADDUSWZ256rr , X86::VPADDUSWYrr },
- { X86::VPADDWZ256rm , X86::VPADDWYrm },
- { X86::VPADDWZ256rr , X86::VPADDWYrr },
- { X86::VPALIGNRZ256rmi , X86::VPALIGNRYrmi },
- { X86::VPALIGNRZ256rri , X86::VPALIGNRYrri },
- { X86::VPANDDZ256rm , X86::VPANDYrm },
- { X86::VPANDDZ256rr , X86::VPANDYrr },
- { X86::VPANDQZ256rm , X86::VPANDYrm },
- { X86::VPANDQZ256rr , X86::VPANDYrr },
- { X86::VPAVGBZ256rm , X86::VPAVGBYrm },
- { X86::VPAVGBZ256rr , X86::VPAVGBYrr },
- { X86::VPAVGWZ256rm , X86::VPAVGWYrm },
- { X86::VPAVGWZ256rr , X86::VPAVGWYrr },
- { X86::VPBROADCASTBZ256m , X86::VPBROADCASTBYrm },
- { X86::VPBROADCASTBZ256r , X86::VPBROADCASTBYrr },
- { X86::VPBROADCASTDZ256m , X86::VPBROADCASTDYrm },
- { X86::VPBROADCASTDZ256r , X86::VPBROADCASTDYrr },
- { X86::VPBROADCASTQZ256m , X86::VPBROADCASTQYrm },
- { X86::VPBROADCASTQZ256r , X86::VPBROADCASTQYrr },
- { X86::VPBROADCASTWZ256m , X86::VPBROADCASTWYrm },
- { X86::VPBROADCASTWZ256r , X86::VPBROADCASTWYrr },
- { X86::VPERMDZ256rm , X86::VPERMDYrm },
- { X86::VPERMDZ256rr , X86::VPERMDYrr },
- { X86::VPERMILPDZ256mi , X86::VPERMILPDYmi },
- { X86::VPERMILPDZ256ri , X86::VPERMILPDYri },
- { X86::VPERMILPDZ256rm , X86::VPERMILPDYrm },
- { X86::VPERMILPDZ256rr , X86::VPERMILPDYrr },
- { X86::VPERMILPSZ256mi , X86::VPERMILPSYmi },
- { X86::VPERMILPSZ256ri , X86::VPERMILPSYri },
- { X86::VPERMILPSZ256rm , X86::VPERMILPSYrm },
- { X86::VPERMILPSZ256rr , X86::VPERMILPSYrr },
- { X86::VPERMPDZ256mi , X86::VPERMPDYmi },
- { X86::VPERMPDZ256ri , X86::VPERMPDYri },
- { X86::VPERMPSZ256rm , X86::VPERMPSYrm },
- { X86::VPERMPSZ256rr , X86::VPERMPSYrr },
- { X86::VPERMQZ256mi , X86::VPERMQYmi },
- { X86::VPERMQZ256ri , X86::VPERMQYri },
- { X86::VPMADDUBSWZ256rm , X86::VPMADDUBSWYrm },
- { X86::VPMADDUBSWZ256rr , X86::VPMADDUBSWYrr },
- { X86::VPMADDWDZ256rm , X86::VPMADDWDYrm },
- { X86::VPMADDWDZ256rr , X86::VPMADDWDYrr },
- { X86::VPMAXSBZ256rm , X86::VPMAXSBYrm },
- { X86::VPMAXSBZ256rr , X86::VPMAXSBYrr },
- { X86::VPMAXSDZ256rm , X86::VPMAXSDYrm },
- { X86::VPMAXSDZ256rr , X86::VPMAXSDYrr },
- { X86::VPMAXSWZ256rm , X86::VPMAXSWYrm },
- { X86::VPMAXSWZ256rr , X86::VPMAXSWYrr },
- { X86::VPMAXUBZ256rm , X86::VPMAXUBYrm },
- { X86::VPMAXUBZ256rr , X86::VPMAXUBYrr },
- { X86::VPMAXUDZ256rm , X86::VPMAXUDYrm },
- { X86::VPMAXUDZ256rr , X86::VPMAXUDYrr },
- { X86::VPMAXUWZ256rm , X86::VPMAXUWYrm },
- { X86::VPMAXUWZ256rr , X86::VPMAXUWYrr },
- { X86::VPMINSBZ256rm , X86::VPMINSBYrm },
- { X86::VPMINSBZ256rr , X86::VPMINSBYrr },
- { X86::VPMINSDZ256rm , X86::VPMINSDYrm },
- { X86::VPMINSDZ256rr , X86::VPMINSDYrr },
- { X86::VPMINSWZ256rm , X86::VPMINSWYrm },
- { X86::VPMINSWZ256rr , X86::VPMINSWYrr },
- { X86::VPMINUBZ256rm , X86::VPMINUBYrm },
- { X86::VPMINUBZ256rr , X86::VPMINUBYrr },
- { X86::VPMINUDZ256rm , X86::VPMINUDYrm },
- { X86::VPMINUDZ256rr , X86::VPMINUDYrr },
- { X86::VPMINUWZ256rm , X86::VPMINUWYrm },
- { X86::VPMINUWZ256rr , X86::VPMINUWYrr },
- { X86::VPMOVSXBDZ256rm , X86::VPMOVSXBDYrm },
- { X86::VPMOVSXBDZ256rr , X86::VPMOVSXBDYrr },
- { X86::VPMOVSXBQZ256rm , X86::VPMOVSXBQYrm },
- { X86::VPMOVSXBQZ256rr , X86::VPMOVSXBQYrr },
- { X86::VPMOVSXBWZ256rm , X86::VPMOVSXBWYrm },
- { X86::VPMOVSXBWZ256rr , X86::VPMOVSXBWYrr },
- { X86::VPMOVSXDQZ256rm , X86::VPMOVSXDQYrm },
- { X86::VPMOVSXDQZ256rr , X86::VPMOVSXDQYrr },
- { X86::VPMOVSXWDZ256rm , X86::VPMOVSXWDYrm },
- { X86::VPMOVSXWDZ256rr , X86::VPMOVSXWDYrr },
- { X86::VPMOVSXWQZ256rm , X86::VPMOVSXWQYrm },
- { X86::VPMOVSXWQZ256rr , X86::VPMOVSXWQYrr },
- { X86::VPMOVZXBDZ256rm , X86::VPMOVZXBDYrm },
- { X86::VPMOVZXBDZ256rr , X86::VPMOVZXBDYrr },
- { X86::VPMOVZXBQZ256rm , X86::VPMOVZXBQYrm },
- { X86::VPMOVZXBQZ256rr , X86::VPMOVZXBQYrr },
- { X86::VPMOVZXBWZ256rm , X86::VPMOVZXBWYrm },
- { X86::VPMOVZXBWZ256rr , X86::VPMOVZXBWYrr },
- { X86::VPMOVZXDQZ256rm , X86::VPMOVZXDQYrm },
- { X86::VPMOVZXDQZ256rr , X86::VPMOVZXDQYrr },
- { X86::VPMOVZXWDZ256rm , X86::VPMOVZXWDYrm },
- { X86::VPMOVZXWDZ256rr , X86::VPMOVZXWDYrr },
- { X86::VPMOVZXWQZ256rm , X86::VPMOVZXWQYrm },
- { X86::VPMOVZXWQZ256rr , X86::VPMOVZXWQYrr },
- { X86::VPMULDQZ256rm , X86::VPMULDQYrm },
- { X86::VPMULDQZ256rr , X86::VPMULDQYrr },
- { X86::VPMULHRSWZ256rm , X86::VPMULHRSWYrm },
- { X86::VPMULHRSWZ256rr , X86::VPMULHRSWYrr },
- { X86::VPMULHUWZ256rm , X86::VPMULHUWYrm },
- { X86::VPMULHUWZ256rr , X86::VPMULHUWYrr },
- { X86::VPMULHWZ256rm , X86::VPMULHWYrm },
- { X86::VPMULHWZ256rr , X86::VPMULHWYrr },
- { X86::VPMULLDZ256rm , X86::VPMULLDYrm },
- { X86::VPMULLDZ256rr , X86::VPMULLDYrr },
- { X86::VPMULLWZ256rm , X86::VPMULLWYrm },
- { X86::VPMULLWZ256rr , X86::VPMULLWYrr },
- { X86::VPMULUDQZ256rm , X86::VPMULUDQYrm },
- { X86::VPMULUDQZ256rr , X86::VPMULUDQYrr },
- { X86::VPORDZ256rm , X86::VPORYrm },
- { X86::VPORDZ256rr , X86::VPORYrr },
- { X86::VPORQZ256rm , X86::VPORYrm },
- { X86::VPORQZ256rr , X86::VPORYrr },
- { X86::VPSADBWZ256rm , X86::VPSADBWYrm },
- { X86::VPSADBWZ256rr , X86::VPSADBWYrr },
- { X86::VPSHUFBZ256rm , X86::VPSHUFBYrm },
- { X86::VPSHUFBZ256rr , X86::VPSHUFBYrr },
- { X86::VPSHUFDZ256mi , X86::VPSHUFDYmi },
- { X86::VPSHUFDZ256ri , X86::VPSHUFDYri },
- { X86::VPSHUFHWZ256mi , X86::VPSHUFHWYmi },
- { X86::VPSHUFHWZ256ri , X86::VPSHUFHWYri },
- { X86::VPSHUFLWZ256mi , X86::VPSHUFLWYmi },
- { X86::VPSHUFLWZ256ri , X86::VPSHUFLWYri },
- { X86::VPSLLDQZ256rr , X86::VPSLLDQYri },
- { X86::VPSLLDZ256ri , X86::VPSLLDYri },
- { X86::VPSLLDZ256rm , X86::VPSLLDYrm },
- { X86::VPSLLDZ256rr , X86::VPSLLDYrr },
- { X86::VPSLLQZ256ri , X86::VPSLLQYri },
- { X86::VPSLLQZ256rm , X86::VPSLLQYrm },
- { X86::VPSLLQZ256rr , X86::VPSLLQYrr },
- { X86::VPSLLVDZ256rm , X86::VPSLLVDYrm },
- { X86::VPSLLVDZ256rr , X86::VPSLLVDYrr },
- { X86::VPSLLVQZ256rm , X86::VPSLLVQYrm },
- { X86::VPSLLVQZ256rr , X86::VPSLLVQYrr },
- { X86::VPSLLWZ256ri , X86::VPSLLWYri },
- { X86::VPSLLWZ256rm , X86::VPSLLWYrm },
- { X86::VPSLLWZ256rr , X86::VPSLLWYrr },
- { X86::VPSRADZ256ri , X86::VPSRADYri },
- { X86::VPSRADZ256rm , X86::VPSRADYrm },
- { X86::VPSRADZ256rr , X86::VPSRADYrr },
- { X86::VPSRAVDZ256rm , X86::VPSRAVDYrm },
- { X86::VPSRAVDZ256rr , X86::VPSRAVDYrr },
- { X86::VPSRAWZ256ri , X86::VPSRAWYri },
- { X86::VPSRAWZ256rm , X86::VPSRAWYrm },
- { X86::VPSRAWZ256rr , X86::VPSRAWYrr },
- { X86::VPSRLDQZ256rr , X86::VPSRLDQYri },
- { X86::VPSRLDZ256ri , X86::VPSRLDYri },
- { X86::VPSRLDZ256rm , X86::VPSRLDYrm },
- { X86::VPSRLDZ256rr , X86::VPSRLDYrr },
- { X86::VPSRLQZ256ri , X86::VPSRLQYri },
- { X86::VPSRLQZ256rm , X86::VPSRLQYrm },
- { X86::VPSRLQZ256rr , X86::VPSRLQYrr },
- { X86::VPSRLVDZ256rm , X86::VPSRLVDYrm },
- { X86::VPSRLVDZ256rr , X86::VPSRLVDYrr },
- { X86::VPSRLVQZ256rm , X86::VPSRLVQYrm },
- { X86::VPSRLVQZ256rr , X86::VPSRLVQYrr },
- { X86::VPSRLWZ256ri , X86::VPSRLWYri },
- { X86::VPSRLWZ256rm , X86::VPSRLWYrm },
- { X86::VPSRLWZ256rr , X86::VPSRLWYrr },
- { X86::VPSUBBZ256rm , X86::VPSUBBYrm },
- { X86::VPSUBBZ256rr , X86::VPSUBBYrr },
- { X86::VPSUBDZ256rm , X86::VPSUBDYrm },
- { X86::VPSUBDZ256rr , X86::VPSUBDYrr },
- { X86::VPSUBQZ256rm , X86::VPSUBQYrm },
- { X86::VPSUBQZ256rr , X86::VPSUBQYrr },
- { X86::VPSUBSBZ256rm , X86::VPSUBSBYrm },
- { X86::VPSUBSBZ256rr , X86::VPSUBSBYrr },
- { X86::VPSUBSWZ256rm , X86::VPSUBSWYrm },
- { X86::VPSUBSWZ256rr , X86::VPSUBSWYrr },
- { X86::VPSUBUSBZ256rm , X86::VPSUBUSBYrm },
- { X86::VPSUBUSBZ256rr , X86::VPSUBUSBYrr },
- { X86::VPSUBUSWZ256rm , X86::VPSUBUSWYrm },
- { X86::VPSUBUSWZ256rr , X86::VPSUBUSWYrr },
- { X86::VPSUBWZ256rm , X86::VPSUBWYrm },
- { X86::VPSUBWZ256rr , X86::VPSUBWYrr },
- { X86::VPUNPCKHBWZ256rm , X86::VPUNPCKHBWYrm },
- { X86::VPUNPCKHBWZ256rr , X86::VPUNPCKHBWYrr },
- { X86::VPUNPCKHDQZ256rm , X86::VPUNPCKHDQYrm },
- { X86::VPUNPCKHDQZ256rr , X86::VPUNPCKHDQYrr },
- { X86::VPUNPCKHQDQZ256rm , X86::VPUNPCKHQDQYrm },
- { X86::VPUNPCKHQDQZ256rr , X86::VPUNPCKHQDQYrr },
- { X86::VPUNPCKHWDZ256rm , X86::VPUNPCKHWDYrm },
- { X86::VPUNPCKHWDZ256rr , X86::VPUNPCKHWDYrr },
- { X86::VPUNPCKLBWZ256rm , X86::VPUNPCKLBWYrm },
- { X86::VPUNPCKLBWZ256rr , X86::VPUNPCKLBWYrr },
- { X86::VPUNPCKLDQZ256rm , X86::VPUNPCKLDQYrm },
- { X86::VPUNPCKLDQZ256rr , X86::VPUNPCKLDQYrr },
- { X86::VPUNPCKLQDQZ256rm , X86::VPUNPCKLQDQYrm },
- { X86::VPUNPCKLQDQZ256rr , X86::VPUNPCKLQDQYrr },
- { X86::VPUNPCKLWDZ256rm , X86::VPUNPCKLWDYrm },
- { X86::VPUNPCKLWDZ256rr , X86::VPUNPCKLWDYrr },
- { X86::VPXORDZ256rm , X86::VPXORYrm },
- { X86::VPXORDZ256rr , X86::VPXORYrr },
- { X86::VPXORQZ256rm , X86::VPXORYrm },
- { X86::VPXORQZ256rr , X86::VPXORYrr },
- { X86::VSHUFPDZ256rmi , X86::VSHUFPDYrmi },
- { X86::VSHUFPDZ256rri , X86::VSHUFPDYrri },
- { X86::VSHUFPSZ256rmi , X86::VSHUFPSYrmi },
- { X86::VSHUFPSZ256rri , X86::VSHUFPSYrri },
- { X86::VSQRTPDZ256m , X86::VSQRTPDYm },
- { X86::VSQRTPDZ256r , X86::VSQRTPDYr },
- { X86::VSQRTPSZ256m , X86::VSQRTPSYm },
- { X86::VSQRTPSZ256r , X86::VSQRTPSYr },
- { X86::VSUBPDZ256rm , X86::VSUBPDYrm },
- { X86::VSUBPDZ256rr , X86::VSUBPDYrr },
- { X86::VSUBPSZ256rm , X86::VSUBPSYrm },
- { X86::VSUBPSZ256rr , X86::VSUBPSYrr },
- { X86::VUNPCKHPDZ256rm , X86::VUNPCKHPDYrm },
- { X86::VUNPCKHPDZ256rr , X86::VUNPCKHPDYrr },
- { X86::VUNPCKHPSZ256rm , X86::VUNPCKHPSYrm },
- { X86::VUNPCKHPSZ256rr , X86::VUNPCKHPSYrr },
- { X86::VUNPCKLPDZ256rm , X86::VUNPCKLPDYrm },
- { X86::VUNPCKLPDZ256rr , X86::VUNPCKLPDYrr },
- { X86::VUNPCKLPSZ256rm , X86::VUNPCKLPSYrm },
- { X86::VUNPCKLPSZ256rr , X86::VUNPCKLPSYrr },
- { X86::VXORPDZ256rm , X86::VXORPDYrm },
- { X86::VXORPDZ256rr , X86::VXORPDYrr },
- { X86::VXORPSZ256rm , X86::VXORPSYrm },
- { X86::VXORPSZ256rr , X86::VXORPSYrr },
-};
-
-#endif
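The tables deleted above pair each EVEX-encoded opcode with its shorter VEX equivalent, so a compression pass can rewrite an instruction with a simple table lookup. A minimal sketch of such a lookup in C++, assuming the table is kept sorted by EVEX opcode; the field names and the helper function are illustrative assumptions, not taken from this patch:

#include <algorithm>
#include <cstddef>
#include <cstdint>

struct X86EvexToVexCompressTableEntry {
  uint16_t EvexOpcode; // field name assumed, not from this patch
  uint16_t VexOpcode;  // field name assumed, not from this patch
};

// Binary-search a table sorted by EVEX opcode; returns 0 when the
// instruction has no VEX equivalent in the table.
static uint16_t lookupVexOpcode(const X86EvexToVexCompressTableEntry *Table,
                                size_t N, uint16_t EvexOpc) {
  const X86EvexToVexCompressTableEntry *End = Table + N;
  const X86EvexToVexCompressTableEntry *I = std::lower_bound(
      Table, End, EvexOpc,
      [](const X86EvexToVexCompressTableEntry &E, uint16_t Opc) {
        return E.EvexOpcode < Opc;
      });
  return (I != End && I->EvexOpcode == EvexOpc) ? I->VexOpcode : 0;
}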
diff --git a/lib/Target/X86/X86InstrVMX.td b/lib/Target/X86/X86InstrVMX.td
index 2ea27a934b47..315a69e6a2a2 100644
--- a/lib/Target/X86/X86InstrVMX.td
+++ b/lib/Target/X86/X86InstrVMX.td
@@ -43,22 +43,26 @@ def VMPTRLDm : I<0xC7, MRM6m, (outs), (ins i64mem:$vmcs),
"vmptrld\t$vmcs", []>, PS;
def VMPTRSTm : I<0xC7, MRM7m, (outs), (ins i64mem:$vmcs),
"vmptrst\t$vmcs", []>, TB;
-def VMREAD64rm : I<0x78, MRMDestMem, (outs), (ins i64mem:$dst, GR64:$src),
- "vmread{q}\t{$src, $dst|$dst, $src}", []>, PS, Requires<[In64BitMode]>;
def VMREAD64rr : I<0x78, MRMDestReg, (outs GR64:$dst), (ins GR64:$src),
"vmread{q}\t{$src, $dst|$dst, $src}", []>, PS, Requires<[In64BitMode]>;
-def VMREAD32rm : I<0x78, MRMDestMem, (outs), (ins i32mem:$dst, GR32:$src),
- "vmread{l}\t{$src, $dst|$dst, $src}", []>, PS, Requires<[Not64BitMode]>;
def VMREAD32rr : I<0x78, MRMDestReg, (outs GR32:$dst), (ins GR32:$src),
"vmread{l}\t{$src, $dst|$dst, $src}", []>, PS, Requires<[Not64BitMode]>;
-def VMWRITE64rm : I<0x79, MRMSrcMem, (outs GR64:$dst), (ins i64mem:$src),
- "vmwrite{q}\t{$src, $dst|$dst, $src}", []>, PS, Requires<[In64BitMode]>;
+let mayStore = 1 in {
+def VMREAD64mr : I<0x78, MRMDestMem, (outs), (ins i64mem:$dst, GR64:$src),
+ "vmread{q}\t{$src, $dst|$dst, $src}", []>, PS, Requires<[In64BitMode]>;
+def VMREAD32mr : I<0x78, MRMDestMem, (outs), (ins i32mem:$dst, GR32:$src),
+ "vmread{l}\t{$src, $dst|$dst, $src}", []>, PS, Requires<[Not64BitMode]>;
+}
def VMWRITE64rr : I<0x79, MRMSrcReg, (outs GR64:$dst), (ins GR64:$src),
"vmwrite{q}\t{$src, $dst|$dst, $src}", []>, PS, Requires<[In64BitMode]>;
-def VMWRITE32rm : I<0x79, MRMSrcMem, (outs GR32:$dst), (ins i32mem:$src),
- "vmwrite{l}\t{$src, $dst|$dst, $src}", []>, PS, Requires<[Not64BitMode]>;
def VMWRITE32rr : I<0x79, MRMSrcReg, (outs GR32:$dst), (ins GR32:$src),
"vmwrite{l}\t{$src, $dst|$dst, $src}", []>, PS, Requires<[Not64BitMode]>;
+let mayLoad = 1 in {
+def VMWRITE64rm : I<0x79, MRMSrcMem, (outs GR64:$dst), (ins i64mem:$src),
+ "vmwrite{q}\t{$src, $dst|$dst, $src}", []>, PS, Requires<[In64BitMode]>;
+def VMWRITE32rm : I<0x79, MRMSrcMem, (outs GR32:$dst), (ins i32mem:$src),
+ "vmwrite{l}\t{$src, $dst|$dst, $src}", []>, PS, Requires<[Not64BitMode]>;
+}
// 0F 01 C4
def VMXOFF : I<0x01, MRM_C4, (outs), (ins), "vmxoff", []>, TB;
def VMXON : I<0xC7, MRM6m, (outs), (ins i64mem:$vmxon),
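The VMX change above moves the memory forms under explicit mayStore/mayLoad blocks (and renames VMREAD*rm to VMREAD*mr, matching the memory-destination operand convention), because these defs have empty patterns and TableGen cannot infer their memory behavior from a pattern. A short C++ sketch of why the flags matter; the helper is hypothetical, but mayLoad() and mayStore() are the real MachineInstr queries that downstream passes rely on:

#include "llvm/CodeGen/MachineInstr.h"

// Passes consult these flags, not the selection pattern, when deciding
// whether an instruction may touch memory; hand-written defs with empty
// patterns therefore have to set the flags explicitly.
static bool touchesMemory(const llvm::MachineInstr &MI) {
  return MI.mayLoad() || MI.mayStore();
}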
diff --git a/lib/Target/X86/X86InstrXOP.td b/lib/Target/X86/X86InstrXOP.td
index 2b296e1e5b85..53224431c0e9 100644
--- a/lib/Target/X86/X86InstrXOP.td
+++ b/lib/Target/X86/X86InstrXOP.td
@@ -183,6 +183,27 @@ let ExeDomain = SSEPackedInt in {
defm VPMACSDD : xop4opm2<0x9E, "vpmacsdd", int_x86_xop_vpmacsdd>;
}
+// IFMA patterns - for cases where we can safely ignore the overflow bits from
+// the multiply or easily match with existing intrinsics.
+let Predicates = [HasXOP] in {
+ def : Pat<(v8i16 (add (mul (v8i16 VR128:$src1), (v8i16 VR128:$src2)),
+ (v8i16 VR128:$src3))),
+ (VPMACSWWrr VR128:$src1, VR128:$src2, VR128:$src3)>;
+ def : Pat<(v4i32 (add (mul (v4i32 VR128:$src1), (v4i32 VR128:$src2)),
+ (v4i32 VR128:$src3))),
+ (VPMACSDDrr VR128:$src1, VR128:$src2, VR128:$src3)>;
+ def : Pat<(v2i64 (add (X86pmuldq (X86PShufd (v4i32 VR128:$src1), (i8 -11)),
+ (X86PShufd (v4i32 VR128:$src2), (i8 -11))),
+ (v2i64 VR128:$src3))),
+ (VPMACSDQHrr VR128:$src1, VR128:$src2, VR128:$src3)>;
+ def : Pat<(v2i64 (add (X86pmuldq (v4i32 VR128:$src1), (v4i32 VR128:$src2)),
+ (v2i64 VR128:$src3))),
+ (VPMACSDQLrr VR128:$src1, VR128:$src2, VR128:$src3)>;
+ def : Pat<(v4i32 (add (X86vpmaddwd (v8i16 VR128:$src1), (v8i16 VR128:$src2)),
+ (v4i32 VR128:$src3))),
+ (VPMADCSWDrr VR128:$src1, VR128:$src2, VR128:$src3)>;
+}
+
// Instruction where second source can be memory, third must be imm8
multiclass xopvpcom<bits<8> opc, string Suffix, SDNode OpNode, ValueType vt128> {
let isCommutable = 1 in
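The HasXOP patterns added in the hunk above map generic multiply-accumulate DAGs straight onto the XOP VPMACS*/VPMADCSWD instructions, so the multiply-add shape no longer has to be expressed through intrinsics. A rough illustration in C++ using Clang/GCC vector extensions; the typedef and the assumed -mxop compile flag are illustrative, not part of the patch:

// With the patterns above, this generic multiply-add on a v8i16 vector
// can be selected directly to vpmacsww, no XOP intrinsic required.
typedef short v8i16 __attribute__((vector_size(16)));

v8i16 macc16(v8i16 a, v8i16 b, v8i16 c) {
  return a * b + c; // matches (add (mul a, b), c) -> VPMACSWWrr
}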
@@ -269,159 +290,87 @@ let ExeDomain = SSEPackedInt in {
}
// Instruction where either second or third source can be memory
-multiclass xop4op_int<bits<8> opc, string OpcodeStr,
- Intrinsic Int128, Intrinsic Int256> {
- // 128-bit Instruction
- def rrr : IXOPi8Reg<opc, MRMSrcReg, (outs VR128:$dst),
- (ins VR128:$src1, VR128:$src2, VR128:$src3),
+multiclass xop4op_int<bits<8> opc, string OpcodeStr, RegisterClass RC,
+ X86MemOperand x86memop, ValueType VT> {
+ def rrr : IXOPi8Reg<opc, MRMSrcReg, (outs RC:$dst),
+ (ins RC:$src1, RC:$src2, RC:$src3),
!strconcat(OpcodeStr,
"\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"),
- [(set VR128:$dst, (Int128 VR128:$src1, VR128:$src2, VR128:$src3))]>,
- XOP_4V;
- def rrm : IXOPi8Reg<opc, MRMSrcMemOp4, (outs VR128:$dst),
- (ins VR128:$src1, VR128:$src2, i128mem:$src3),
+ [(set RC:$dst, (VT (or (and RC:$src3, RC:$src1),
+ (X86andnp RC:$src3, RC:$src2))))]>, XOP_4V;
+ def rrm : IXOPi8Reg<opc, MRMSrcMemOp4, (outs RC:$dst),
+ (ins RC:$src1, RC:$src2, x86memop:$src3),
!strconcat(OpcodeStr,
"\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"),
- [(set VR128:$dst,
- (Int128 VR128:$src1, VR128:$src2,
- (bitconvert (loadv2i64 addr:$src3))))]>,
+ [(set RC:$dst, (VT (or (and (load addr:$src3), RC:$src1),
+ (X86andnp (load addr:$src3), RC:$src2))))]>,
XOP_4V, VEX_W;
- def rmr : IXOPi8Reg<opc, MRMSrcMem, (outs VR128:$dst),
- (ins VR128:$src1, i128mem:$src2, VR128:$src3),
+ def rmr : IXOPi8Reg<opc, MRMSrcMem, (outs RC:$dst),
+ (ins RC:$src1, x86memop:$src2, RC:$src3),
!strconcat(OpcodeStr,
"\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"),
- [(set VR128:$dst,
- (Int128 VR128:$src1, (bitconvert (loadv2i64 addr:$src2)),
- VR128:$src3))]>,
+ [(set RC:$dst, (VT (or (and RC:$src3, RC:$src1),
+ (X86andnp RC:$src3, (load addr:$src2)))))]>,
XOP_4V;
// For disassembler
let isCodeGenOnly = 1, ForceDisassemble = 1, hasSideEffects = 0 in
- def rrr_REV : IXOPi8Reg<opc, MRMSrcRegOp4, (outs VR128:$dst),
- (ins VR128:$src1, VR128:$src2, VR128:$src3),
+ def rrr_REV : IXOPi8Reg<opc, MRMSrcRegOp4, (outs RC:$dst),
+ (ins RC:$src1, RC:$src2, RC:$src3),
!strconcat(OpcodeStr,
"\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"),
[]>, XOP_4V, VEX_W;
-
- // 256-bit Instruction
- def rrrY : IXOPi8Reg<opc, MRMSrcReg, (outs VR256:$dst),
- (ins VR256:$src1, VR256:$src2, VR256:$src3),
- !strconcat(OpcodeStr,
- "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"),
- [(set VR256:$dst, (Int256 VR256:$src1, VR256:$src2, VR256:$src3))]>,
- XOP_4V, VEX_L;
- def rrmY : IXOPi8Reg<opc, MRMSrcMemOp4, (outs VR256:$dst),
- (ins VR256:$src1, VR256:$src2, i256mem:$src3),
- !strconcat(OpcodeStr,
- "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"),
- [(set VR256:$dst,
- (Int256 VR256:$src1, VR256:$src2,
- (bitconvert (loadv4i64 addr:$src3))))]>,
- XOP_4V, VEX_W, VEX_L;
- def rmrY : IXOPi8Reg<opc, MRMSrcMem, (outs VR256:$dst),
- (ins VR256:$src1, f256mem:$src2, VR256:$src3),
- !strconcat(OpcodeStr,
- "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"),
- [(set VR256:$dst,
- (Int256 VR256:$src1, (bitconvert (loadv4i64 addr:$src2)),
- VR256:$src3))]>,
- XOP_4V, VEX_L;
- // For disassembler
- let isCodeGenOnly = 1, ForceDisassemble = 1, hasSideEffects = 0 in
- def rrrY_REV : IXOPi8Reg<opc, MRMSrcRegOp4, (outs VR256:$dst),
- (ins VR256:$src1, VR256:$src2, VR256:$src3),
- !strconcat(OpcodeStr,
- "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"),
- []>, XOP_4V, VEX_W, VEX_L;
}
let ExeDomain = SSEPackedInt in {
- defm VPCMOV : xop4op_int<0xA2, "vpcmov",
- int_x86_xop_vpcmov, int_x86_xop_vpcmov_256>;
+ defm VPCMOV : xop4op_int<0xA2, "vpcmov", VR128, i128mem, v2i64>;
+ defm VPCMOVY : xop4op_int<0xA2, "vpcmov", VR256, i256mem, v4i64>, VEX_L;
}
-let Predicates = [HasXOP] in {
- def : Pat<(v2i64 (or (and VR128:$src3, VR128:$src1),
- (X86andnp VR128:$src3, VR128:$src2))),
- (VPCMOVrrr VR128:$src1, VR128:$src2, VR128:$src3)>;
-
- def : Pat<(v4i64 (or (and VR256:$src3, VR256:$src1),
- (X86andnp VR256:$src3, VR256:$src2))),
- (VPCMOVrrrY VR256:$src1, VR256:$src2, VR256:$src3)>;
-}
-
-multiclass xop5op<bits<8> opc, string OpcodeStr, SDNode OpNode,
- ValueType vt128, ValueType vt256,
- ValueType id128, ValueType id256,
- PatFrag ld_128, PatFrag ld_256> {
- def rr : IXOP5<opc, MRMSrcReg, (outs VR128:$dst),
- (ins VR128:$src1, VR128:$src2, VR128:$src3, u8imm:$src4),
+multiclass xop_vpermil2<bits<8> Opc, string OpcodeStr, RegisterClass RC,
+ X86MemOperand intmemop, X86MemOperand fpmemop,
+ ValueType VT, PatFrag FPLdFrag,
+ PatFrag IntLdFrag> {
+ def rr : IXOP5<Opc, MRMSrcReg, (outs RC:$dst),
+ (ins RC:$src1, RC:$src2, RC:$src3, u8imm:$src4),
!strconcat(OpcodeStr,
"\t{$src4, $src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3, $src4}"),
- [(set VR128:$dst,
- (vt128 (OpNode (vt128 VR128:$src1), (vt128 VR128:$src2),
- (id128 VR128:$src3), (i8 imm:$src4))))]>;
- def rm : IXOP5<opc, MRMSrcMemOp4, (outs VR128:$dst),
- (ins VR128:$src1, VR128:$src2, i128mem:$src3, u8imm:$src4),
+ [(set RC:$dst,
+ (VT (X86vpermil2 RC:$src1, RC:$src2, RC:$src3, (i8 imm:$src4))))]>;
+ def rm : IXOP5<Opc, MRMSrcMemOp4, (outs RC:$dst),
+ (ins RC:$src1, RC:$src2, intmemop:$src3, u8imm:$src4),
!strconcat(OpcodeStr,
"\t{$src4, $src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3, $src4}"),
- [(set VR128:$dst,
- (vt128 (OpNode (vt128 VR128:$src1), (vt128 VR128:$src2),
- (id128 (bitconvert (loadv2i64 addr:$src3))),
- (i8 imm:$src4))))]>,
- VEX_W;
- def mr : IXOP5<opc, MRMSrcMem, (outs VR128:$dst),
- (ins VR128:$src1, f128mem:$src2, VR128:$src3, u8imm:$src4),
+ [(set RC:$dst,
+ (VT (X86vpermil2 RC:$src1, RC:$src2,
+ (bitconvert (IntLdFrag addr:$src3)),
+ (i8 imm:$src4))))]>, VEX_W;
+ def mr : IXOP5<Opc, MRMSrcMem, (outs RC:$dst),
+ (ins RC:$src1, fpmemop:$src2, RC:$src3, u8imm:$src4),
!strconcat(OpcodeStr,
"\t{$src4, $src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3, $src4}"),
- [(set VR128:$dst,
- (vt128 (OpNode (vt128 VR128:$src1),
- (vt128 (bitconvert (ld_128 addr:$src2))),
- (id128 VR128:$src3), (i8 imm:$src4))))]>;
+ [(set RC:$dst,
+ (VT (X86vpermil2 RC:$src1, (FPLdFrag addr:$src2),
+ RC:$src3, (i8 imm:$src4))))]>;
// For disassembler
let isCodeGenOnly = 1, ForceDisassemble = 1, hasSideEffects = 0 in
- def rr_REV : IXOP5<opc, MRMSrcRegOp4, (outs VR128:$dst),
- (ins VR128:$src1, VR128:$src2, VR128:$src3, u8imm:$src4),
+ def rr_REV : IXOP5<Opc, MRMSrcRegOp4, (outs RC:$dst),
+ (ins RC:$src1, RC:$src2, RC:$src3, u8imm:$src4),
!strconcat(OpcodeStr,
"\t{$src4, $src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3, $src4}"),
[]>, VEX_W;
-
- def rrY : IXOP5<opc, MRMSrcReg, (outs VR256:$dst),
- (ins VR256:$src1, VR256:$src2, VR256:$src3, u8imm:$src4),
- !strconcat(OpcodeStr,
- "\t{$src4, $src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3, $src4}"),
- [(set VR256:$dst,
- (vt256 (OpNode (vt256 VR256:$src1), (vt256 VR256:$src2),
- (id256 VR256:$src3), (i8 imm:$src4))))]>, VEX_L;
- def rmY : IXOP5<opc, MRMSrcMemOp4, (outs VR256:$dst),
- (ins VR256:$src1, VR256:$src2, i256mem:$src3, u8imm:$src4),
- !strconcat(OpcodeStr,
- "\t{$src4, $src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3, $src4}"),
- [(set VR256:$dst,
- (vt256 (OpNode (vt256 VR256:$src1), (vt256 VR256:$src2),
- (id256 (bitconvert (loadv4i64 addr:$src3))),
- (i8 imm:$src4))))]>, VEX_W, VEX_L;
- def mrY : IXOP5<opc, MRMSrcMem, (outs VR256:$dst),
- (ins VR256:$src1, f256mem:$src2, VR256:$src3, u8imm:$src4),
- !strconcat(OpcodeStr,
- "\t{$src4, $src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3, $src4}"),
- [(set VR256:$dst,
- (vt256 (OpNode (vt256 VR256:$src1),
- (vt256 (bitconvert (ld_256 addr:$src2))),
- (id256 VR256:$src3), (i8 imm:$src4))))]>, VEX_L;
- // For disassembler
- let isCodeGenOnly = 1, ForceDisassemble = 1, hasSideEffects = 0 in
- def rrY_REV : IXOP5<opc, MRMSrcRegOp4, (outs VR256:$dst),
- (ins VR256:$src1, VR256:$src2, VR256:$src3, u8imm:$src4),
- !strconcat(OpcodeStr,
- "\t{$src4, $src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3, $src4}"),
- []>, VEX_W, VEX_L;
}
-let ExeDomain = SSEPackedDouble in
- defm VPERMIL2PD : xop5op<0x49, "vpermil2pd", X86vpermil2, v2f64, v4f64,
- v2i64, v4i64, loadv2f64, loadv4f64>;
+let ExeDomain = SSEPackedDouble in {
+ defm VPERMIL2PD : xop_vpermil2<0x49, "vpermil2pd", VR128, i128mem, f128mem,
+ v2f64, loadv2f64, loadv2i64>;
+ defm VPERMIL2PDY : xop_vpermil2<0x49, "vpermil2pd", VR256, i256mem, f256mem,
+ v4f64, loadv4f64, loadv4i64>, VEX_L;
+}
-let ExeDomain = SSEPackedSingle in
- defm VPERMIL2PS : xop5op<0x48, "vpermil2ps", X86vpermil2, v4f32, v8f32,
- v4i32, v8i32, loadv4f32, loadv8f32>;
+let ExeDomain = SSEPackedSingle in {
+ defm VPERMIL2PS : xop_vpermil2<0x48, "vpermil2ps", VR128, i128mem, f128mem,
+ v4f32, loadv4f32, loadv2i64>;
+ defm VPERMIL2PSY : xop_vpermil2<0x48, "vpermil2ps", VR256, i256mem, f256mem,
+ v8f32, loadv8f32, loadv4i64>, VEX_L;
+}
diff --git a/lib/Target/X86/X86InstructionSelector.cpp b/lib/Target/X86/X86InstructionSelector.cpp
new file mode 100644
index 000000000000..6cc5e8b63597
--- /dev/null
+++ b/lib/Target/X86/X86InstructionSelector.cpp
@@ -0,0 +1,516 @@
+//===- X86InstructionSelector.cpp -----------------------------*- C++ -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+/// \file
+/// This file implements the targeting of the InstructionSelector class for
+/// X86.
+/// \todo This should be generated by TableGen.
+//===----------------------------------------------------------------------===//
+
+#include "X86InstrBuilder.h"
+#include "X86InstrInfo.h"
+#include "X86RegisterBankInfo.h"
+#include "X86RegisterInfo.h"
+#include "X86Subtarget.h"
+#include "X86TargetMachine.h"
+#include "llvm/CodeGen/GlobalISel/InstructionSelector.h"
+#include "llvm/CodeGen/MachineBasicBlock.h"
+#include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/CodeGen/MachineInstr.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/MachineOperand.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/IR/Type.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/raw_ostream.h"
+
+#define DEBUG_TYPE "X86-isel"
+
+using namespace llvm;
+
+#ifndef LLVM_BUILD_GLOBAL_ISEL
+#error "You shouldn't build this"
+#endif
+
+namespace {
+
+class X86InstructionSelector : public InstructionSelector {
+public:
+ X86InstructionSelector(const X86Subtarget &STI,
+ const X86RegisterBankInfo &RBI);
+
+ bool select(MachineInstr &I) const override;
+
+private:
+ /// tblgen-erated 'select' implementation, used as the initial selector for
+ /// the patterns that don't require complex C++.
+ bool selectImpl(MachineInstr &I) const;
+
+ // TODO: remove after selectImpl supports patterns with predicates.
+ unsigned getFAddOp(LLT &Ty, const RegisterBank &RB) const;
+ unsigned getFSubOp(LLT &Ty, const RegisterBank &RB) const;
+ unsigned getAddOp(LLT &Ty, const RegisterBank &RB) const;
+ unsigned getSubOp(LLT &Ty, const RegisterBank &RB) const;
+ unsigned getLoadStoreOp(LLT &Ty, const RegisterBank &RB, unsigned Opc,
+ uint64_t Alignment) const;
+
+ bool selectBinaryOp(MachineInstr &I, MachineRegisterInfo &MRI,
+ MachineFunction &MF) const;
+ bool selectLoadStoreOp(MachineInstr &I, MachineRegisterInfo &MRI,
+ MachineFunction &MF) const;
+ bool selectFrameIndex(MachineInstr &I, MachineRegisterInfo &MRI,
+ MachineFunction &MF) const;
+ bool selectConstant(MachineInstr &I, MachineRegisterInfo &MRI,
+ MachineFunction &MF) const;
+
+ const X86Subtarget &STI;
+ const X86InstrInfo &TII;
+ const X86RegisterInfo &TRI;
+ const X86RegisterBankInfo &RBI;
+
+#define GET_GLOBALISEL_TEMPORARIES_DECL
+#include "X86GenGlobalISel.inc"
+#undef GET_GLOBALISEL_TEMPORARIES_DECL
+};
+
+} // end anonymous namespace
+
+#define GET_GLOBALISEL_IMPL
+#include "X86GenGlobalISel.inc"
+#undef GET_GLOBALISEL_IMPL
+
+X86InstructionSelector::X86InstructionSelector(const X86Subtarget &STI,
+ const X86RegisterBankInfo &RBI)
+ : InstructionSelector(), STI(STI), TII(*STI.getInstrInfo()),
+ TRI(*STI.getRegisterInfo()), RBI(RBI)
+#define GET_GLOBALISEL_TEMPORARIES_INIT
+#include "X86GenGlobalISel.inc"
+#undef GET_GLOBALISEL_TEMPORARIES_INIT
+{
+}
+
+// FIXME: This should be target-independent, inferred from the types declared
+// for each class in the bank.
+static const TargetRegisterClass *
+getRegClassForTypeOnBank(LLT Ty, const RegisterBank &RB) {
+ if (RB.getID() == X86::GPRRegBankID) {
+ if (Ty.getSizeInBits() == 32)
+ return &X86::GR32RegClass;
+ if (Ty.getSizeInBits() == 64)
+ return &X86::GR64RegClass;
+ }
+ if (RB.getID() == X86::VECRRegBankID) {
+ if (Ty.getSizeInBits() == 32)
+ return &X86::FR32XRegClass;
+ if (Ty.getSizeInBits() == 64)
+ return &X86::FR64XRegClass;
+ if (Ty.getSizeInBits() == 128)
+ return &X86::VR128XRegClass;
+ if (Ty.getSizeInBits() == 256)
+ return &X86::VR256XRegClass;
+ if (Ty.getSizeInBits() == 512)
+ return &X86::VR512RegClass;
+ }
+
+ llvm_unreachable("Unknown RegBank!");
+}
+
+// Set X86 Opcode and constrain DestReg.
+static bool selectCopy(MachineInstr &I, const TargetInstrInfo &TII,
+ MachineRegisterInfo &MRI, const TargetRegisterInfo &TRI,
+ const RegisterBankInfo &RBI) {
+
+ unsigned DstReg = I.getOperand(0).getReg();
+ if (TargetRegisterInfo::isPhysicalRegister(DstReg)) {
+ assert(I.isCopy() && "Generic operators do not allow physical registers");
+ return true;
+ }
+
+ const RegisterBank &RegBank = *RBI.getRegBank(DstReg, MRI, TRI);
+ const unsigned DstSize = MRI.getType(DstReg).getSizeInBits();
+ (void)DstSize;
+ unsigned SrcReg = I.getOperand(1).getReg();
+ const unsigned SrcSize = RBI.getSizeInBits(SrcReg, MRI, TRI);
+ (void)SrcSize;
+ assert((!TargetRegisterInfo::isPhysicalRegister(SrcReg) || I.isCopy()) &&
+ "No phys reg on generic operators");
+ assert((DstSize == SrcSize ||
+ // Copies are a means to set up initial types; the number of
+ // bits may not exactly match.
+ (TargetRegisterInfo::isPhysicalRegister(SrcReg) &&
+ DstSize <= RBI.getSizeInBits(SrcReg, MRI, TRI))) &&
+ "Copy with different width?!");
+
+ const TargetRegisterClass *RC = nullptr;
+
+ switch (RegBank.getID()) {
+ case X86::GPRRegBankID:
+ assert((DstSize <= 64) && "GPRs cannot get more than 64-bit width values.");
+ RC = getRegClassForTypeOnBank(MRI.getType(DstReg), RegBank);
+ break;
+ case X86::VECRRegBankID:
+ RC = getRegClassForTypeOnBank(MRI.getType(DstReg), RegBank);
+ break;
+ default:
+ llvm_unreachable("Unknown RegBank!");
+ }
+
+ // No need to constrain SrcReg. It will get constrained when
+ // we hit another of its uses or defs.
+ // Copies do not have constraints.
+ const TargetRegisterClass *OldRC = MRI.getRegClassOrNull(DstReg);
+ if (!OldRC || !RC->hasSubClassEq(OldRC)) {
+ if (!RBI.constrainGenericRegister(DstReg, *RC, MRI)) {
+ DEBUG(dbgs() << "Failed to constrain " << TII.getName(I.getOpcode())
+ << " operand\n");
+ return false;
+ }
+ }
+ I.setDesc(TII.get(X86::COPY));
+ return true;
+}
+
+bool X86InstructionSelector::select(MachineInstr &I) const {
+ assert(I.getParent() && "Instruction should be in a basic block!");
+ assert(I.getParent()->getParent() && "Instruction should be in a function!");
+
+ MachineBasicBlock &MBB = *I.getParent();
+ MachineFunction &MF = *MBB.getParent();
+ MachineRegisterInfo &MRI = MF.getRegInfo();
+
+ unsigned Opcode = I.getOpcode();
+ if (!isPreISelGenericOpcode(Opcode)) {
+ // Certain non-generic instructions also need some special handling.
+
+ if (I.isCopy())
+ return selectCopy(I, TII, MRI, TRI, RBI);
+
+ // TODO: handle more cases - LOAD_STACK_GUARD, PHI
+ return true;
+ }
+
+ assert(I.getNumOperands() == I.getNumExplicitOperands() &&
+ "Generic instruction has unexpected implicit operands\n");
+
+ // TODO: This should be implemented by tblgen, pattern with predicate not
+ // supported yet.
+ if (selectBinaryOp(I, MRI, MF))
+ return true;
+ if (selectLoadStoreOp(I, MRI, MF))
+ return true;
+ if (selectFrameIndex(I, MRI, MF))
+ return true;
+ if (selectConstant(I, MRI, MF))
+ return true;
+
+ return selectImpl(I);
+}
+
+unsigned X86InstructionSelector::getFAddOp(LLT &Ty,
+ const RegisterBank &RB) const {
+
+ if (X86::VECRRegBankID != RB.getID())
+ return TargetOpcode::G_FADD;
+
+ if (Ty == LLT::scalar(32)) {
+ if (STI.hasAVX512()) {
+ return X86::VADDSSZrr;
+ } else if (STI.hasAVX()) {
+ return X86::VADDSSrr;
+ } else if (STI.hasSSE1()) {
+ return X86::ADDSSrr;
+ }
+ } else if (Ty == LLT::scalar(64)) {
+ if (STI.hasAVX512()) {
+ return X86::VADDSDZrr;
+ } else if (STI.hasAVX()) {
+ return X86::VADDSDrr;
+ } else if (STI.hasSSE2()) {
+ return X86::ADDSDrr;
+ }
+ } else if (Ty == LLT::vector(4, 32)) {
+ if ((STI.hasAVX512()) && (STI.hasVLX())) {
+ return X86::VADDPSZ128rr;
+ } else if (STI.hasAVX()) {
+ return X86::VADDPSrr;
+ } else if (STI.hasSSE1()) {
+ return X86::ADDPSrr;
+ }
+ }
+
+ return TargetOpcode::G_FADD;
+}
+
+unsigned X86InstructionSelector::getFSubOp(LLT &Ty,
+ const RegisterBank &RB) const {
+
+ if (X86::VECRRegBankID != RB.getID())
+ return TargetOpcode::G_FSUB;
+
+ if (Ty == LLT::scalar(32)) {
+ if (STI.hasAVX512()) {
+ return X86::VSUBSSZrr;
+ } else if (STI.hasAVX()) {
+ return X86::VSUBSSrr;
+ } else if (STI.hasSSE1()) {
+ return X86::SUBSSrr;
+ }
+ } else if (Ty == LLT::scalar(64)) {
+ if (STI.hasAVX512()) {
+ return X86::VSUBSDZrr;
+ } else if (STI.hasAVX()) {
+ return X86::VSUBSDrr;
+ } else if (STI.hasSSE2()) {
+ return X86::SUBSDrr;
+ }
+ } else if (Ty == LLT::vector(4, 32)) {
+ if ((STI.hasAVX512()) && (STI.hasVLX())) {
+ return X86::VSUBPSZ128rr;
+ } else if (STI.hasAVX()) {
+ return X86::VSUBPSrr;
+ } else if (STI.hasSSE1()) {
+ return X86::SUBPSrr;
+ }
+ }
+
+ return TargetOpcode::G_FSUB;
+}
+
+unsigned X86InstructionSelector::getAddOp(LLT &Ty,
+ const RegisterBank &RB) const {
+
+ if (X86::VECRRegBankID != RB.getID())
+ return TargetOpcode::G_ADD;
+
+ if (Ty == LLT::vector(4, 32)) {
+ if (STI.hasAVX512() && STI.hasVLX()) {
+ return X86::VPADDDZ128rr;
+ } else if (STI.hasAVX()) {
+ return X86::VPADDDrr;
+ } else if (STI.hasSSE2()) {
+ return X86::PADDDrr;
+ }
+ }
+
+ return TargetOpcode::G_ADD;
+}
+
+unsigned X86InstructionSelector::getSubOp(LLT &Ty,
+ const RegisterBank &RB) const {
+
+ if (X86::VECRRegBankID != RB.getID())
+ return TargetOpcode::G_SUB;
+
+ if (Ty == LLT::vector(4, 32)) {
+ if (STI.hasAVX512() && STI.hasVLX()) {
+ return X86::VPSUBDZ128rr;
+ } else if (STI.hasAVX()) {
+ return X86::VPSUBDrr;
+ } else if (STI.hasSSE2()) {
+ return X86::PSUBDrr;
+ }
+ }
+
+ return TargetOpcode::G_SUB;
+}
+
+bool X86InstructionSelector::selectBinaryOp(MachineInstr &I,
+ MachineRegisterInfo &MRI,
+ MachineFunction &MF) const {
+
+ const unsigned DefReg = I.getOperand(0).getReg();
+ LLT Ty = MRI.getType(DefReg);
+ const RegisterBank &RB = *RBI.getRegBank(DefReg, MRI, TRI);
+
+ unsigned NewOpc = I.getOpcode();
+
+ switch (NewOpc) {
+ case TargetOpcode::G_FADD:
+ NewOpc = getFAddOp(Ty, RB);
+ break;
+ case TargetOpcode::G_FSUB:
+ NewOpc = getFSubOp(Ty, RB);
+ break;
+ case TargetOpcode::G_ADD:
+ NewOpc = getAddOp(Ty, RB);
+ break;
+ case TargetOpcode::G_SUB:
+ NewOpc = getSubOp(Ty, RB);
+ break;
+ default:
+ break;
+ }
+
+ if (NewOpc == I.getOpcode())
+ return false;
+
+ I.setDesc(TII.get(NewOpc));
+
+ return constrainSelectedInstRegOperands(I, TII, TRI, RBI);
+}
+
+unsigned X86InstructionSelector::getLoadStoreOp(LLT &Ty, const RegisterBank &RB,
+ unsigned Opc,
+ uint64_t Alignment) const {
+ bool Isload = (Opc == TargetOpcode::G_LOAD);
+ bool HasAVX = STI.hasAVX();
+ bool HasAVX512 = STI.hasAVX512();
+ bool HasVLX = STI.hasVLX();
+
+ if (Ty == LLT::scalar(8)) {
+ if (X86::GPRRegBankID == RB.getID())
+ return Isload ? X86::MOV8rm : X86::MOV8mr;
+ } else if (Ty == LLT::scalar(16)) {
+ if (X86::GPRRegBankID == RB.getID())
+ return Isload ? X86::MOV16rm : X86::MOV16mr;
+ } else if (Ty == LLT::scalar(32)) {
+ if (X86::GPRRegBankID == RB.getID())
+ return Isload ? X86::MOV32rm : X86::MOV32mr;
+ if (X86::VECRRegBankID == RB.getID())
+ return Isload ? (HasAVX512 ? X86::VMOVSSZrm
+ : HasAVX ? X86::VMOVSSrm : X86::MOVSSrm)
+ : (HasAVX512 ? X86::VMOVSSZmr
+ : HasAVX ? X86::VMOVSSmr : X86::MOVSSmr);
+ } else if (Ty == LLT::scalar(64)) {
+ if (X86::GPRRegBankID == RB.getID())
+ return Isload ? X86::MOV64rm : X86::MOV64mr;
+ if (X86::VECRRegBankID == RB.getID())
+ return Isload ? (HasAVX512 ? X86::VMOVSDZrm
+ : HasAVX ? X86::VMOVSDrm : X86::MOVSDrm)
+ : (HasAVX512 ? X86::VMOVSDZmr
+ : HasAVX ? X86::VMOVSDmr : X86::MOVSDmr);
+ } else if (Ty.isVector() && Ty.getSizeInBits() == 128) {
+ if (Alignment >= 16)
+ return Isload ? (HasVLX ? X86::VMOVAPSZ128rm
+ : HasAVX512
+ ? X86::VMOVAPSZ128rm_NOVLX
+ : HasAVX ? X86::VMOVAPSrm : X86::MOVAPSrm)
+ : (HasVLX ? X86::VMOVAPSZ128mr
+ : HasAVX512
+ ? X86::VMOVAPSZ128mr_NOVLX
+ : HasAVX ? X86::VMOVAPSmr : X86::MOVAPSmr);
+ else
+ return Isload ? (HasVLX ? X86::VMOVUPSZ128rm
+ : HasAVX512
+ ? X86::VMOVUPSZ128rm_NOVLX
+ : HasAVX ? X86::VMOVUPSrm : X86::MOVUPSrm)
+ : (HasVLX ? X86::VMOVUPSZ128mr
+ : HasAVX512
+ ? X86::VMOVUPSZ128mr_NOVLX
+ : HasAVX ? X86::VMOVUPSmr : X86::MOVUPSmr);
+ }
+ return Opc;
+}
+
+bool X86InstructionSelector::selectLoadStoreOp(MachineInstr &I,
+ MachineRegisterInfo &MRI,
+ MachineFunction &MF) const {
+
+ unsigned Opc = I.getOpcode();
+
+ if (Opc != TargetOpcode::G_STORE && Opc != TargetOpcode::G_LOAD)
+ return false;
+
+ const unsigned DefReg = I.getOperand(0).getReg();
+ LLT Ty = MRI.getType(DefReg);
+ const RegisterBank &RB = *RBI.getRegBank(DefReg, MRI, TRI);
+
+ auto &MemOp = **I.memoperands_begin();
+ unsigned NewOpc = getLoadStoreOp(Ty, RB, Opc, MemOp.getAlignment());
+ if (NewOpc == Opc)
+ return false;
+
+ I.setDesc(TII.get(NewOpc));
+ MachineInstrBuilder MIB(MF, I);
+ if (Opc == TargetOpcode::G_LOAD)
+ addOffset(MIB, 0);
+ else {
+ // G_STORE takes (VAL, Addr); X86 store instructions take (Addr, VAL).
+ I.RemoveOperand(0);
+ addOffset(MIB, 0).addUse(DefReg);
+ }
+ return constrainSelectedInstRegOperands(I, TII, TRI, RBI);
+}
+
+bool X86InstructionSelector::selectFrameIndex(MachineInstr &I,
+ MachineRegisterInfo &MRI,
+ MachineFunction &MF) const {
+ if (I.getOpcode() != TargetOpcode::G_FRAME_INDEX)
+ return false;
+
+ const unsigned DefReg = I.getOperand(0).getReg();
+ LLT Ty = MRI.getType(DefReg);
+
+ // Use LEA to calculate frame index.
+ unsigned NewOpc;
+ if (Ty == LLT::pointer(0, 64))
+ NewOpc = X86::LEA64r;
+ else if (Ty == LLT::pointer(0, 32))
+ NewOpc = STI.isTarget64BitILP32() ? X86::LEA64_32r : X86::LEA32r;
+ else
+ llvm_unreachable("Can't select G_FRAME_INDEX, unsupported type.");
+
+ I.setDesc(TII.get(NewOpc));
+ MachineInstrBuilder MIB(MF, I);
+ addOffset(MIB, 0);
+
+ return constrainSelectedInstRegOperands(I, TII, TRI, RBI);
+}
+
+bool X86InstructionSelector::selectConstant(MachineInstr &I,
+ MachineRegisterInfo &MRI,
+ MachineFunction &MF) const {
+ if (I.getOpcode() != TargetOpcode::G_CONSTANT)
+ return false;
+
+ const unsigned DefReg = I.getOperand(0).getReg();
+ LLT Ty = MRI.getType(DefReg);
+
+ assert(Ty.isScalar() && "invalid element type.");
+
+ uint64_t Val = 0;
+ if (I.getOperand(1).isCImm()) {
+ Val = I.getOperand(1).getCImm()->getZExtValue();
+ I.getOperand(1).ChangeToImmediate(Val);
+ } else if (I.getOperand(1).isImm()) {
+ Val = I.getOperand(1).getImm();
+ } else
+ llvm_unreachable("Unsupported operand type.");
+
+ unsigned NewOpc;
+ switch (Ty.getSizeInBits()) {
+ case 8:
+ NewOpc = X86::MOV8ri;
+ break;
+ case 16:
+ NewOpc = X86::MOV16ri;
+ break;
+ case 32:
+ NewOpc = X86::MOV32ri;
+ break;
+ case 64: {
+ // TODO: if isUInt<32>(Val), X86::MOV32ri can be used instead.
+ if (isInt<32>(Val))
+ NewOpc = X86::MOV64ri32;
+ else
+ NewOpc = X86::MOV64ri;
+ break;
+ }
+ default:
+ llvm_unreachable("Can't select G_CONSTANT, unsupported type.");
+ }
+
+ I.setDesc(TII.get(NewOpc));
+ return constrainSelectedInstRegOperands(I, TII, TRI, RBI);
+}
+
+InstructionSelector *
+llvm::createX86InstructionSelector(X86Subtarget &Subtarget,
+ X86RegisterBankInfo &RBI) {
+ return new X86InstructionSelector(Subtarget, RBI);
+}
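The 64-bit G_CONSTANT case above turns on whether the value survives sign-extension from 32 bits. A minimal standalone sketch of that check (the sample values and the printout are illustrative, not part of the patch; isInt<32> is the real helper from llvm/Support/MathExtras.h that selectConstant calls):

#include "llvm/Support/MathExtras.h"
#include <cstdint>
#include <cstdio>

int main() {
  // 0xffffffffffffffff is -1 as an int64_t, so it still selects the compact
  // MOV64ri32 encoding even though it fails isUInt<32>.
  for (uint64_t Val : {UINT64_C(42), UINT64_C(0x7fffffff),
                       UINT64_C(0x80000000), UINT64_C(0xffffffffffffffff)})
    std::printf("%#llx -> %s\n", (unsigned long long)Val,
                llvm::isInt<32>((int64_t)Val) ? "MOV64ri32" : "MOV64ri");
  return 0;
}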
diff --git a/lib/Target/X86/X86InterleavedAccess.cpp b/lib/Target/X86/X86InterleavedAccess.cpp
index d9edf4676faf..806d6cc888f0 100644
--- a/lib/Target/X86/X86InterleavedAccess.cpp
+++ b/lib/Target/X86/X86InterleavedAccess.cpp
@@ -19,6 +19,7 @@
using namespace llvm;
+namespace {
/// \brief This class holds necessary information to represent an interleaved
/// access group and supports utilities to lower the group into
/// X86-specific instructions/intrinsics.
@@ -27,7 +28,6 @@ using namespace llvm;
/// %wide.vec = load <8 x i32>, <8 x i32>* %ptr
/// %v0 = shuffle <8 x i32> %wide.vec, <8 x i32> undef, <0, 2, 4, 6>
/// %v1 = shuffle <8 x i32> %wide.vec, <8 x i32> undef, <1, 3, 5, 7>
-
class X86InterleavedAccessGroup {
/// \brief Reference to the wide-load instruction of an interleaved access
/// group.
@@ -95,6 +95,7 @@ public:
/// instructions/intrinsics.
bool lowerIntoOptimizedSequence();
};
+} // end anonymous namespace
bool X86InterleavedAccessGroup::isSupported() const {
VectorType *ShuffleVecTy = Shuffles[0]->getType();
diff --git a/lib/Target/X86/X86IntrinsicsInfo.h b/lib/Target/X86/X86IntrinsicsInfo.h
index 63a02af02faa..2a40399ba571 100644
--- a/lib/Target/X86/X86IntrinsicsInfo.h
+++ b/lib/Target/X86/X86IntrinsicsInfo.h
@@ -36,7 +36,7 @@ enum IntrinsicType : uint16_t {
TRUNCATE_TO_MEM_VI8, TRUNCATE_TO_MEM_VI16, TRUNCATE_TO_MEM_VI32,
EXPAND_FROM_MEM,
TERLOG_OP_MASK, TERLOG_OP_MASKZ, BROADCASTM, KUNPCK, FIXUPIMM, FIXUPIMM_MASKZ, FIXUPIMMS,
- FIXUPIMMS_MASKZ, CONVERT_MASK_TO_VEC, CONVERT_TO_MASK
+ FIXUPIMMS_MASKZ, CONVERT_MASK_TO_VEC, CONVERT_TO_MASK, GATHER_AVX2, MASK_BINOP,
};
struct IntrinsicData {
@@ -67,6 +67,23 @@ static const IntrinsicData IntrinsicsWithChain[] = {
X86_INTRINSIC_DATA(addcarryx_u32, ADX, X86ISD::ADC, 0),
X86_INTRINSIC_DATA(addcarryx_u64, ADX, X86ISD::ADC, 0),
+ X86_INTRINSIC_DATA(avx2_gather_d_d, GATHER_AVX2, X86::VPGATHERDDrm, 0),
+ X86_INTRINSIC_DATA(avx2_gather_d_d_256, GATHER_AVX2, X86::VPGATHERDDYrm, 0),
+ X86_INTRINSIC_DATA(avx2_gather_d_pd, GATHER_AVX2, X86::VGATHERDPDrm, 0),
+ X86_INTRINSIC_DATA(avx2_gather_d_pd_256, GATHER_AVX2, X86::VGATHERDPDYrm, 0),
+ X86_INTRINSIC_DATA(avx2_gather_d_ps, GATHER_AVX2, X86::VGATHERDPSrm, 0),
+ X86_INTRINSIC_DATA(avx2_gather_d_ps_256, GATHER_AVX2, X86::VGATHERDPSYrm, 0),
+ X86_INTRINSIC_DATA(avx2_gather_d_q, GATHER_AVX2, X86::VPGATHERDQrm, 0),
+ X86_INTRINSIC_DATA(avx2_gather_d_q_256, GATHER_AVX2, X86::VPGATHERDQYrm, 0),
+ X86_INTRINSIC_DATA(avx2_gather_q_d, GATHER_AVX2, X86::VPGATHERQDrm, 0),
+ X86_INTRINSIC_DATA(avx2_gather_q_d_256, GATHER_AVX2, X86::VPGATHERQDYrm, 0),
+ X86_INTRINSIC_DATA(avx2_gather_q_pd, GATHER_AVX2, X86::VGATHERQPDrm, 0),
+ X86_INTRINSIC_DATA(avx2_gather_q_pd_256, GATHER_AVX2, X86::VGATHERQPDYrm, 0),
+ X86_INTRINSIC_DATA(avx2_gather_q_ps, GATHER_AVX2, X86::VGATHERQPSrm, 0),
+ X86_INTRINSIC_DATA(avx2_gather_q_ps_256, GATHER_AVX2, X86::VGATHERQPSYrm, 0),
+ X86_INTRINSIC_DATA(avx2_gather_q_q, GATHER_AVX2, X86::VPGATHERQQrm, 0),
+ X86_INTRINSIC_DATA(avx2_gather_q_q_256, GATHER_AVX2, X86::VPGATHERQQYrm, 0),
+
X86_INTRINSIC_DATA(avx512_gather_dpd_512, GATHER, X86::VGATHERDPDZrm, 0),
X86_INTRINSIC_DATA(avx512_gather_dpi_512, GATHER, X86::VPGATHERDDZrm, 0),
X86_INTRINSIC_DATA(avx512_gather_dpq_512, GATHER, X86::VPGATHERDQZrm, 0),
@@ -325,6 +342,8 @@ static const IntrinsicData* getIntrinsicWithChain(uint16_t IntNo) {
* the alphabetical order.
*/
static const IntrinsicData IntrinsicsWithoutChain[] = {
+ X86_INTRINSIC_DATA(avx_cmp_pd_256, INTR_TYPE_3OP, X86ISD::CMPP, 0),
+ X86_INTRINSIC_DATA(avx_cmp_ps_256, INTR_TYPE_3OP, X86ISD::CMPP, 0),
X86_INTRINSIC_DATA(avx_cvt_pd2_ps_256,CVTPD2PS, ISD::FP_ROUND, 0),
X86_INTRINSIC_DATA(avx_cvt_pd2dq_256, INTR_TYPE_1OP, X86ISD::CVTP2SI, 0),
X86_INTRINSIC_DATA(avx_cvtdq2_ps_256, INTR_TYPE_1OP, ISD::SINT_TO_FP, 0),
@@ -351,9 +370,9 @@ static const IntrinsicData IntrinsicsWithoutChain[] = {
X86_INTRINSIC_DATA(avx_vpermilvar_pd_256, INTR_TYPE_2OP, X86ISD::VPERMILPV, 0),
X86_INTRINSIC_DATA(avx_vpermilvar_ps, INTR_TYPE_2OP, X86ISD::VPERMILPV, 0),
X86_INTRINSIC_DATA(avx_vpermilvar_ps_256, INTR_TYPE_2OP, X86ISD::VPERMILPV, 0),
- X86_INTRINSIC_DATA(avx2_pabs_b, INTR_TYPE_1OP, X86ISD::ABS, 0),
- X86_INTRINSIC_DATA(avx2_pabs_d, INTR_TYPE_1OP, X86ISD::ABS, 0),
- X86_INTRINSIC_DATA(avx2_pabs_w, INTR_TYPE_1OP, X86ISD::ABS, 0),
+ X86_INTRINSIC_DATA(avx2_pabs_b, INTR_TYPE_1OP, ISD::ABS, 0),
+ X86_INTRINSIC_DATA(avx2_pabs_d, INTR_TYPE_1OP, ISD::ABS, 0),
+ X86_INTRINSIC_DATA(avx2_pabs_w, INTR_TYPE_1OP, ISD::ABS, 0),
X86_INTRINSIC_DATA(avx2_packssdw, INTR_TYPE_2OP, X86ISD::PACKSS, 0),
X86_INTRINSIC_DATA(avx2_packsswb, INTR_TYPE_2OP, X86ISD::PACKSS, 0),
X86_INTRINSIC_DATA(avx2_packusdw, INTR_TYPE_2OP, X86ISD::PACKUS, 0),
@@ -421,18 +440,6 @@ static const IntrinsicData IntrinsicsWithoutChain[] = {
X86_INTRINSIC_DATA(avx512_cvtd2mask_128, CONVERT_TO_MASK, X86ISD::CVT2MASK, 0),
X86_INTRINSIC_DATA(avx512_cvtd2mask_256, CONVERT_TO_MASK, X86ISD::CVT2MASK, 0),
X86_INTRINSIC_DATA(avx512_cvtd2mask_512, CONVERT_TO_MASK, X86ISD::CVT2MASK, 0),
- X86_INTRINSIC_DATA(avx512_cvtmask2b_128, CONVERT_MASK_TO_VEC, X86ISD::VSEXT, 0),
- X86_INTRINSIC_DATA(avx512_cvtmask2b_256, CONVERT_MASK_TO_VEC, X86ISD::VSEXT, 0),
- X86_INTRINSIC_DATA(avx512_cvtmask2b_512, CONVERT_MASK_TO_VEC, X86ISD::VSEXT, 0),
- X86_INTRINSIC_DATA(avx512_cvtmask2d_128, CONVERT_MASK_TO_VEC, X86ISD::VSEXT, 0),
- X86_INTRINSIC_DATA(avx512_cvtmask2d_256, CONVERT_MASK_TO_VEC, X86ISD::VSEXT, 0),
- X86_INTRINSIC_DATA(avx512_cvtmask2d_512, CONVERT_MASK_TO_VEC, X86ISD::VSEXT, 0),
- X86_INTRINSIC_DATA(avx512_cvtmask2q_128, CONVERT_MASK_TO_VEC, X86ISD::VSEXT, 0),
- X86_INTRINSIC_DATA(avx512_cvtmask2q_256, CONVERT_MASK_TO_VEC, X86ISD::VSEXT, 0),
- X86_INTRINSIC_DATA(avx512_cvtmask2q_512, CONVERT_MASK_TO_VEC, X86ISD::VSEXT, 0),
- X86_INTRINSIC_DATA(avx512_cvtmask2w_128, CONVERT_MASK_TO_VEC, X86ISD::VSEXT, 0),
- X86_INTRINSIC_DATA(avx512_cvtmask2w_256, CONVERT_MASK_TO_VEC, X86ISD::VSEXT, 0),
- X86_INTRINSIC_DATA(avx512_cvtmask2w_512, CONVERT_MASK_TO_VEC, X86ISD::VSEXT, 0),
X86_INTRINSIC_DATA(avx512_cvtq2mask_128, CONVERT_TO_MASK, X86ISD::CVT2MASK, 0),
X86_INTRINSIC_DATA(avx512_cvtq2mask_256, CONVERT_TO_MASK, X86ISD::CVT2MASK, 0),
X86_INTRINSIC_DATA(avx512_cvtq2mask_512, CONVERT_TO_MASK, X86ISD::CVT2MASK, 0),
@@ -455,18 +462,20 @@ static const IntrinsicData IntrinsicsWithoutChain[] = {
X86_INTRINSIC_DATA(avx512_cvtw2mask_512, CONVERT_TO_MASK, X86ISD::CVT2MASK, 0),
X86_INTRINSIC_DATA(avx512_exp2_pd, INTR_TYPE_1OP_MASK_RM, X86ISD::EXP2, 0),
X86_INTRINSIC_DATA(avx512_exp2_ps, INTR_TYPE_1OP_MASK_RM, X86ISD::EXP2, 0),
+ X86_INTRINSIC_DATA(avx512_kand_w, MASK_BINOP, ISD::AND, 0),
+ X86_INTRINSIC_DATA(avx512_kor_w, MASK_BINOP, ISD::OR, 0),
X86_INTRINSIC_DATA(avx512_kunpck_bw, KUNPCK, ISD::CONCAT_VECTORS, 0),
X86_INTRINSIC_DATA(avx512_kunpck_dq, KUNPCK, ISD::CONCAT_VECTORS, 0),
X86_INTRINSIC_DATA(avx512_kunpck_wd, KUNPCK, ISD::CONCAT_VECTORS, 0),
-
+ X86_INTRINSIC_DATA(avx512_kxor_w, MASK_BINOP, ISD::XOR, 0),
X86_INTRINSIC_DATA(avx512_mask_add_pd_512, INTR_TYPE_2OP_MASK, ISD::FADD,
X86ISD::FADD_RND),
X86_INTRINSIC_DATA(avx512_mask_add_ps_512, INTR_TYPE_2OP_MASK, ISD::FADD,
X86ISD::FADD_RND),
X86_INTRINSIC_DATA(avx512_mask_add_sd_round, INTR_TYPE_SCALAR_MASK_RM,
- X86ISD::FADD_RND, 0),
+ X86ISD::FADDS_RND, 0),
X86_INTRINSIC_DATA(avx512_mask_add_ss_round, INTR_TYPE_SCALAR_MASK_RM,
- X86ISD::FADD_RND, 0),
+ X86ISD::FADDS_RND, 0),
X86_INTRINSIC_DATA(avx512_mask_broadcastf32x2_256, BRCST32x2_TO_VEC,
X86ISD::VBROADCAST, 0),
X86_INTRINSIC_DATA(avx512_mask_broadcastf32x2_512, BRCST32x2_TO_VEC,
@@ -720,9 +729,9 @@ static const IntrinsicData IntrinsicsWithoutChain[] = {
X86_INTRINSIC_DATA(avx512_mask_div_ps_512, INTR_TYPE_2OP_MASK, ISD::FDIV,
X86ISD::FDIV_RND),
X86_INTRINSIC_DATA(avx512_mask_div_sd_round, INTR_TYPE_SCALAR_MASK_RM,
- X86ISD::FDIV_RND, 0),
+ X86ISD::FDIVS_RND, 0),
X86_INTRINSIC_DATA(avx512_mask_div_ss_round, INTR_TYPE_SCALAR_MASK_RM,
- X86ISD::FDIV_RND, 0),
+ X86ISD::FDIVS_RND, 0),
X86_INTRINSIC_DATA(avx512_mask_expand_d_128, COMPRESS_EXPAND_IN_REG,
X86ISD::EXPAND, 0),
X86_INTRINSIC_DATA(avx512_mask_expand_d_256, COMPRESS_EXPAND_IN_REG,
@@ -795,74 +804,42 @@ static const IntrinsicData IntrinsicsWithoutChain[] = {
X86ISD::VGETMANTS, 0),
X86_INTRINSIC_DATA(avx512_mask_getmant_ss, INTR_TYPE_3OP_SCALAR_MASK_RM,
X86ISD::VGETMANTS, 0),
- X86_INTRINSIC_DATA(avx512_mask_lzcnt_d_128, INTR_TYPE_1OP_MASK,
- ISD::CTLZ, 0),
- X86_INTRINSIC_DATA(avx512_mask_lzcnt_d_256, INTR_TYPE_1OP_MASK,
- ISD::CTLZ, 0),
- X86_INTRINSIC_DATA(avx512_mask_lzcnt_d_512, INTR_TYPE_1OP_MASK,
- ISD::CTLZ, 0),
- X86_INTRINSIC_DATA(avx512_mask_lzcnt_q_128, INTR_TYPE_1OP_MASK,
- ISD::CTLZ, 0),
- X86_INTRINSIC_DATA(avx512_mask_lzcnt_q_256, INTR_TYPE_1OP_MASK,
- ISD::CTLZ, 0),
- X86_INTRINSIC_DATA(avx512_mask_lzcnt_q_512, INTR_TYPE_1OP_MASK,
- ISD::CTLZ, 0),
- X86_INTRINSIC_DATA(avx512_mask_max_pd_128, INTR_TYPE_2OP_MASK, X86ISD::FMAX, 0),
- X86_INTRINSIC_DATA(avx512_mask_max_pd_256, INTR_TYPE_2OP_MASK, X86ISD::FMAX, 0),
X86_INTRINSIC_DATA(avx512_mask_max_pd_512, INTR_TYPE_2OP_MASK, X86ISD::FMAX,
X86ISD::FMAX_RND),
- X86_INTRINSIC_DATA(avx512_mask_max_ps_128, INTR_TYPE_2OP_MASK, X86ISD::FMAX, 0),
- X86_INTRINSIC_DATA(avx512_mask_max_ps_256, INTR_TYPE_2OP_MASK, X86ISD::FMAX, 0),
X86_INTRINSIC_DATA(avx512_mask_max_ps_512, INTR_TYPE_2OP_MASK, X86ISD::FMAX,
X86ISD::FMAX_RND),
- X86_INTRINSIC_DATA(avx512_mask_max_sd_round, INTR_TYPE_SCALAR_MASK_RM,
- X86ISD::FMAX_RND, 0),
- X86_INTRINSIC_DATA(avx512_mask_max_ss_round, INTR_TYPE_SCALAR_MASK_RM,
- X86ISD::FMAX_RND, 0),
- X86_INTRINSIC_DATA(avx512_mask_min_pd_128, INTR_TYPE_2OP_MASK, X86ISD::FMIN, 0),
- X86_INTRINSIC_DATA(avx512_mask_min_pd_256, INTR_TYPE_2OP_MASK, X86ISD::FMIN, 0),
+ X86_INTRINSIC_DATA(avx512_mask_max_sd_round, INTR_TYPE_SCALAR_MASK,
+ X86ISD::FMAXS, X86ISD::FMAXS_RND),
+ X86_INTRINSIC_DATA(avx512_mask_max_ss_round, INTR_TYPE_SCALAR_MASK,
+ X86ISD::FMAXS, X86ISD::FMAXS_RND),
X86_INTRINSIC_DATA(avx512_mask_min_pd_512, INTR_TYPE_2OP_MASK, X86ISD::FMIN,
X86ISD::FMIN_RND),
- X86_INTRINSIC_DATA(avx512_mask_min_ps_128, INTR_TYPE_2OP_MASK, X86ISD::FMIN, 0),
- X86_INTRINSIC_DATA(avx512_mask_min_ps_256, INTR_TYPE_2OP_MASK, X86ISD::FMIN, 0),
X86_INTRINSIC_DATA(avx512_mask_min_ps_512, INTR_TYPE_2OP_MASK, X86ISD::FMIN,
X86ISD::FMIN_RND),
- X86_INTRINSIC_DATA(avx512_mask_min_sd_round, INTR_TYPE_SCALAR_MASK_RM,
- X86ISD::FMIN_RND, 0),
- X86_INTRINSIC_DATA(avx512_mask_min_ss_round, INTR_TYPE_SCALAR_MASK_RM,
- X86ISD::FMIN_RND, 0),
+ X86_INTRINSIC_DATA(avx512_mask_min_sd_round, INTR_TYPE_SCALAR_MASK,
+ X86ISD::FMINS, X86ISD::FMINS_RND),
+ X86_INTRINSIC_DATA(avx512_mask_min_ss_round, INTR_TYPE_SCALAR_MASK,
+ X86ISD::FMINS, X86ISD::FMINS_RND),
X86_INTRINSIC_DATA(avx512_mask_mul_pd_512, INTR_TYPE_2OP_MASK, ISD::FMUL,
X86ISD::FMUL_RND),
X86_INTRINSIC_DATA(avx512_mask_mul_ps_512, INTR_TYPE_2OP_MASK, ISD::FMUL,
X86ISD::FMUL_RND),
X86_INTRINSIC_DATA(avx512_mask_mul_sd_round, INTR_TYPE_SCALAR_MASK_RM,
- X86ISD::FMUL_RND, 0),
+ X86ISD::FMULS_RND, 0),
X86_INTRINSIC_DATA(avx512_mask_mul_ss_round, INTR_TYPE_SCALAR_MASK_RM,
- X86ISD::FMUL_RND, 0),
- X86_INTRINSIC_DATA(avx512_mask_pabs_b_128, INTR_TYPE_1OP_MASK, X86ISD::ABS, 0),
- X86_INTRINSIC_DATA(avx512_mask_pabs_b_256, INTR_TYPE_1OP_MASK, X86ISD::ABS, 0),
- X86_INTRINSIC_DATA(avx512_mask_pabs_b_512, INTR_TYPE_1OP_MASK, X86ISD::ABS, 0),
- X86_INTRINSIC_DATA(avx512_mask_pabs_d_128, INTR_TYPE_1OP_MASK, X86ISD::ABS, 0),
- X86_INTRINSIC_DATA(avx512_mask_pabs_d_256, INTR_TYPE_1OP_MASK, X86ISD::ABS, 0),
- X86_INTRINSIC_DATA(avx512_mask_pabs_d_512, INTR_TYPE_1OP_MASK, X86ISD::ABS, 0),
- X86_INTRINSIC_DATA(avx512_mask_pabs_q_128, INTR_TYPE_1OP_MASK, X86ISD::ABS, 0),
- X86_INTRINSIC_DATA(avx512_mask_pabs_q_256, INTR_TYPE_1OP_MASK, X86ISD::ABS, 0),
- X86_INTRINSIC_DATA(avx512_mask_pabs_q_512, INTR_TYPE_1OP_MASK, X86ISD::ABS, 0),
- X86_INTRINSIC_DATA(avx512_mask_pabs_w_128, INTR_TYPE_1OP_MASK, X86ISD::ABS, 0),
- X86_INTRINSIC_DATA(avx512_mask_pabs_w_256, INTR_TYPE_1OP_MASK, X86ISD::ABS, 0),
- X86_INTRINSIC_DATA(avx512_mask_pabs_w_512, INTR_TYPE_1OP_MASK, X86ISD::ABS, 0),
- X86_INTRINSIC_DATA(avx512_mask_packssdw_128, INTR_TYPE_2OP_MASK, X86ISD::PACKSS, 0),
- X86_INTRINSIC_DATA(avx512_mask_packssdw_256, INTR_TYPE_2OP_MASK, X86ISD::PACKSS, 0),
- X86_INTRINSIC_DATA(avx512_mask_packssdw_512, INTR_TYPE_2OP_MASK, X86ISD::PACKSS, 0),
- X86_INTRINSIC_DATA(avx512_mask_packsswb_128, INTR_TYPE_2OP_MASK, X86ISD::PACKSS, 0),
- X86_INTRINSIC_DATA(avx512_mask_packsswb_256, INTR_TYPE_2OP_MASK, X86ISD::PACKSS, 0),
- X86_INTRINSIC_DATA(avx512_mask_packsswb_512, INTR_TYPE_2OP_MASK, X86ISD::PACKSS, 0),
- X86_INTRINSIC_DATA(avx512_mask_packusdw_128, INTR_TYPE_2OP_MASK, X86ISD::PACKUS, 0),
- X86_INTRINSIC_DATA(avx512_mask_packusdw_256, INTR_TYPE_2OP_MASK, X86ISD::PACKUS, 0),
- X86_INTRINSIC_DATA(avx512_mask_packusdw_512, INTR_TYPE_2OP_MASK, X86ISD::PACKUS, 0),
- X86_INTRINSIC_DATA(avx512_mask_packuswb_128, INTR_TYPE_2OP_MASK, X86ISD::PACKUS, 0),
- X86_INTRINSIC_DATA(avx512_mask_packuswb_256, INTR_TYPE_2OP_MASK, X86ISD::PACKUS, 0),
- X86_INTRINSIC_DATA(avx512_mask_packuswb_512, INTR_TYPE_2OP_MASK, X86ISD::PACKUS, 0),
+ X86ISD::FMULS_RND, 0),
+ X86_INTRINSIC_DATA(avx512_mask_pabs_b_128, INTR_TYPE_1OP_MASK, ISD::ABS, 0),
+ X86_INTRINSIC_DATA(avx512_mask_pabs_b_256, INTR_TYPE_1OP_MASK, ISD::ABS, 0),
+ X86_INTRINSIC_DATA(avx512_mask_pabs_b_512, INTR_TYPE_1OP_MASK, ISD::ABS, 0),
+ X86_INTRINSIC_DATA(avx512_mask_pabs_d_128, INTR_TYPE_1OP_MASK, ISD::ABS, 0),
+ X86_INTRINSIC_DATA(avx512_mask_pabs_d_256, INTR_TYPE_1OP_MASK, ISD::ABS, 0),
+ X86_INTRINSIC_DATA(avx512_mask_pabs_d_512, INTR_TYPE_1OP_MASK, ISD::ABS, 0),
+ X86_INTRINSIC_DATA(avx512_mask_pabs_q_128, INTR_TYPE_1OP_MASK, ISD::ABS, 0),
+ X86_INTRINSIC_DATA(avx512_mask_pabs_q_256, INTR_TYPE_1OP_MASK, ISD::ABS, 0),
+ X86_INTRINSIC_DATA(avx512_mask_pabs_q_512, INTR_TYPE_1OP_MASK, ISD::ABS, 0),
+ X86_INTRINSIC_DATA(avx512_mask_pabs_w_128, INTR_TYPE_1OP_MASK, ISD::ABS, 0),
+ X86_INTRINSIC_DATA(avx512_mask_pabs_w_256, INTR_TYPE_1OP_MASK, ISD::ABS, 0),
+ X86_INTRINSIC_DATA(avx512_mask_pabs_w_512, INTR_TYPE_1OP_MASK, ISD::ABS, 0),
X86_INTRINSIC_DATA(avx512_mask_padds_b_128, INTR_TYPE_2OP_MASK, X86ISD::ADDS, 0),
X86_INTRINSIC_DATA(avx512_mask_padds_b_256, INTR_TYPE_2OP_MASK, X86ISD::ADDS, 0),
X86_INTRINSIC_DATA(avx512_mask_padds_b_512, INTR_TYPE_2OP_MASK, X86ISD::ADDS, 0),
@@ -1191,9 +1168,9 @@ static const IntrinsicData IntrinsicsWithoutChain[] = {
X86_INTRINSIC_DATA(avx512_mask_sub_ps_512, INTR_TYPE_2OP_MASK, ISD::FSUB,
X86ISD::FSUB_RND),
X86_INTRINSIC_DATA(avx512_mask_sub_sd_round, INTR_TYPE_SCALAR_MASK_RM,
- X86ISD::FSUB_RND, 0),
+ X86ISD::FSUBS_RND, 0),
X86_INTRINSIC_DATA(avx512_mask_sub_ss_round, INTR_TYPE_SCALAR_MASK_RM,
- X86ISD::FSUB_RND, 0),
+ X86ISD::FSUBS_RND, 0),
X86_INTRINSIC_DATA(avx512_mask_ucmp_b_128, CMP_MASK_CC, X86ISD::CMPMU, 0),
X86_INTRINSIC_DATA(avx512_mask_ucmp_b_256, CMP_MASK_CC, X86ISD::CMPMU, 0),
X86_INTRINSIC_DATA(avx512_mask_ucmp_b_512, CMP_MASK_CC, X86ISD::CMPMU, 0),
@@ -1486,6 +1463,10 @@ static const IntrinsicData IntrinsicsWithoutChain[] = {
X86ISD::VPMADD52L, 0),
X86_INTRINSIC_DATA(avx512_maskz_vpmadd52l_uq_512, FMA_OP_MASKZ,
X86ISD::VPMADD52L, 0),
+ X86_INTRINSIC_DATA(avx512_packssdw_512, INTR_TYPE_2OP, X86ISD::PACKSS, 0),
+ X86_INTRINSIC_DATA(avx512_packsswb_512, INTR_TYPE_2OP, X86ISD::PACKSS, 0),
+ X86_INTRINSIC_DATA(avx512_packusdw_512, INTR_TYPE_2OP, X86ISD::PACKUS, 0),
+ X86_INTRINSIC_DATA(avx512_packuswb_512, INTR_TYPE_2OP, X86ISD::PACKUS, 0),
X86_INTRINSIC_DATA(avx512_pmul_dq_512, INTR_TYPE_2OP, X86ISD::PMULDQ, 0),
X86_INTRINSIC_DATA(avx512_pmulu_dq_512, INTR_TYPE_2OP, X86ISD::PMULUDQ, 0),
X86_INTRINSIC_DATA(avx512_psad_bw_512, INTR_TYPE_2OP, X86ISD::PSADBW, 0),
@@ -1613,6 +1594,7 @@ static const IntrinsicData IntrinsicsWithoutChain[] = {
X86_INTRINSIC_DATA(fma_vfnmsub_pd_256, INTR_TYPE_3OP, X86ISD::FNMSUB, 0),
X86_INTRINSIC_DATA(fma_vfnmsub_ps, INTR_TYPE_3OP, X86ISD::FNMSUB, 0),
X86_INTRINSIC_DATA(fma_vfnmsub_ps_256, INTR_TYPE_3OP, X86ISD::FNMSUB, 0),
+ X86_INTRINSIC_DATA(sse_cmp_ps, INTR_TYPE_3OP, X86ISD::CMPP, 0),
X86_INTRINSIC_DATA(sse_comieq_ss, COMI, X86ISD::COMI, ISD::SETEQ),
X86_INTRINSIC_DATA(sse_comige_ss, COMI, X86ISD::COMI, ISD::SETGE),
X86_INTRINSIC_DATA(sse_comigt_ss, COMI, X86ISD::COMI, ISD::SETGT),
@@ -1620,7 +1602,9 @@ static const IntrinsicData IntrinsicsWithoutChain[] = {
X86_INTRINSIC_DATA(sse_comilt_ss, COMI, X86ISD::COMI, ISD::SETLT),
X86_INTRINSIC_DATA(sse_comineq_ss, COMI, X86ISD::COMI, ISD::SETNE),
X86_INTRINSIC_DATA(sse_max_ps, INTR_TYPE_2OP, X86ISD::FMAX, 0),
+ X86_INTRINSIC_DATA(sse_max_ss, INTR_TYPE_2OP, X86ISD::FMAXS, 0),
X86_INTRINSIC_DATA(sse_min_ps, INTR_TYPE_2OP, X86ISD::FMIN, 0),
+ X86_INTRINSIC_DATA(sse_min_ss, INTR_TYPE_2OP, X86ISD::FMINS, 0),
X86_INTRINSIC_DATA(sse_movmsk_ps, INTR_TYPE_1OP, X86ISD::MOVMSK, 0),
X86_INTRINSIC_DATA(sse_rcp_ps, INTR_TYPE_1OP, X86ISD::FRCP, 0),
X86_INTRINSIC_DATA(sse_rsqrt_ps, INTR_TYPE_1OP, X86ISD::FRSQRT, 0),
@@ -1631,6 +1615,7 @@ static const IntrinsicData IntrinsicsWithoutChain[] = {
X86_INTRINSIC_DATA(sse_ucomile_ss, COMI, X86ISD::UCOMI, ISD::SETLE),
X86_INTRINSIC_DATA(sse_ucomilt_ss, COMI, X86ISD::UCOMI, ISD::SETLT),
X86_INTRINSIC_DATA(sse_ucomineq_ss, COMI, X86ISD::UCOMI, ISD::SETNE),
+ X86_INTRINSIC_DATA(sse2_cmp_pd, INTR_TYPE_3OP, X86ISD::CMPP, 0),
X86_INTRINSIC_DATA(sse2_comieq_sd, COMI, X86ISD::COMI, ISD::SETEQ),
X86_INTRINSIC_DATA(sse2_comige_sd, COMI, X86ISD::COMI, ISD::SETGE),
X86_INTRINSIC_DATA(sse2_comigt_sd, COMI, X86ISD::COMI, ISD::SETGT),
@@ -1643,7 +1628,9 @@ static const IntrinsicData IntrinsicsWithoutChain[] = {
X86_INTRINSIC_DATA(sse2_cvttpd2dq, INTR_TYPE_1OP, X86ISD::CVTTP2SI, 0),
X86_INTRINSIC_DATA(sse2_cvttps2dq, INTR_TYPE_1OP, ISD::FP_TO_SINT, 0),
X86_INTRINSIC_DATA(sse2_max_pd, INTR_TYPE_2OP, X86ISD::FMAX, 0),
+ X86_INTRINSIC_DATA(sse2_max_sd, INTR_TYPE_2OP, X86ISD::FMAXS, 0),
X86_INTRINSIC_DATA(sse2_min_pd, INTR_TYPE_2OP, X86ISD::FMIN, 0),
+ X86_INTRINSIC_DATA(sse2_min_sd, INTR_TYPE_2OP, X86ISD::FMINS, 0),
X86_INTRINSIC_DATA(sse2_movmsk_pd, INTR_TYPE_1OP, X86ISD::MOVMSK, 0),
X86_INTRINSIC_DATA(sse2_packssdw_128, INTR_TYPE_2OP, X86ISD::PACKSS, 0),
X86_INTRINSIC_DATA(sse2_packsswb_128, INTR_TYPE_2OP, X86ISD::PACKSS, 0),
@@ -1696,9 +1683,9 @@ static const IntrinsicData IntrinsicsWithoutChain[] = {
X86_INTRINSIC_DATA(sse41_pmuldq, INTR_TYPE_2OP, X86ISD::PMULDQ, 0),
X86_INTRINSIC_DATA(sse4a_extrqi, INTR_TYPE_3OP, X86ISD::EXTRQI, 0),
X86_INTRINSIC_DATA(sse4a_insertqi, INTR_TYPE_4OP, X86ISD::INSERTQI, 0),
- X86_INTRINSIC_DATA(ssse3_pabs_b_128, INTR_TYPE_1OP, X86ISD::ABS, 0),
- X86_INTRINSIC_DATA(ssse3_pabs_d_128, INTR_TYPE_1OP, X86ISD::ABS, 0),
- X86_INTRINSIC_DATA(ssse3_pabs_w_128, INTR_TYPE_1OP, X86ISD::ABS, 0),
+ X86_INTRINSIC_DATA(ssse3_pabs_b_128, INTR_TYPE_1OP, ISD::ABS, 0),
+ X86_INTRINSIC_DATA(ssse3_pabs_d_128, INTR_TYPE_1OP, ISD::ABS, 0),
+ X86_INTRINSIC_DATA(ssse3_pabs_w_128, INTR_TYPE_1OP, ISD::ABS, 0),
X86_INTRINSIC_DATA(ssse3_phadd_d_128, INTR_TYPE_2OP, X86ISD::HADD, 0),
X86_INTRINSIC_DATA(ssse3_phadd_w_128, INTR_TYPE_2OP, X86ISD::HADD, 0),
X86_INTRINSIC_DATA(ssse3_phsub_d_128, INTR_TYPE_2OP, X86ISD::HSUB, 0),
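The intrinsic tables are kept sorted because lookup is a binary search on the intrinsic id, which is why the new avx2_gather_* and avx512_k*_w rows are spliced in alphabetically rather than appended. A hedged sketch of the lookup shape (the helper name is invented here; the Id field and ordering follow the IntrinsicData comparator implied by the "alphabetical order" comment):

#include <algorithm>
#include <cstdint>

// Illustrative only: locate the row for IntNo in a sorted IntrinsicData
// table; returns nullptr when the intrinsic has no entry.
static const IntrinsicData *lookupIntrinsic(const IntrinsicData *Begin,
                                            const IntrinsicData *End,
                                            uint16_t IntNo) {
  const IntrinsicData *I = std::lower_bound(
      Begin, End, IntNo,
      [](const IntrinsicData &D, uint16_t Id) { return D.Id < Id; });
  return (I != End && I->Id == IntNo) ? I : nullptr;
}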
diff --git a/lib/Target/X86/X86LegalizerInfo.cpp b/lib/Target/X86/X86LegalizerInfo.cpp
new file mode 100644
index 000000000000..c2dc762fec5e
--- /dev/null
+++ b/lib/Target/X86/X86LegalizerInfo.cpp
@@ -0,0 +1,142 @@
+//===- X86LegalizerInfo.cpp -----------------------------------*- C++ -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+/// \file
+/// This file implements the targeting of the MachineLegalizer class for X86.
+/// \todo This should be generated by TableGen.
+//===----------------------------------------------------------------------===//
+
+#include "X86LegalizerInfo.h"
+#include "X86Subtarget.h"
+#include "X86TargetMachine.h"
+#include "llvm/CodeGen/ValueTypes.h"
+#include "llvm/IR/DerivedTypes.h"
+#include "llvm/IR/Type.h"
+#include "llvm/Target/TargetOpcodes.h"
+
+using namespace llvm;
+using namespace TargetOpcode;
+
+#ifndef LLVM_BUILD_GLOBAL_ISEL
+#error "You shouldn't build this"
+#endif
+
+X86LegalizerInfo::X86LegalizerInfo(const X86Subtarget &STI,
+ const X86TargetMachine &TM)
+ : Subtarget(STI), TM(TM) {
+
+ setLegalizerInfo32bit();
+ setLegalizerInfo64bit();
+ setLegalizerInfoSSE1();
+ setLegalizerInfoSSE2();
+
+ computeTables();
+}
+
+void X86LegalizerInfo::setLegalizerInfo32bit() {
+
+ if (Subtarget.is64Bit())
+ return;
+
+ const LLT p0 = LLT::pointer(0, 32);
+ const LLT s1 = LLT::scalar(1);
+ const LLT s8 = LLT::scalar(8);
+ const LLT s16 = LLT::scalar(16);
+ const LLT s32 = LLT::scalar(32);
+ const LLT s64 = LLT::scalar(64);
+
+ for (unsigned BinOp : {G_ADD, G_SUB})
+ for (auto Ty : {s8, s16, s32})
+ setAction({BinOp, Ty}, Legal);
+
+ for (unsigned MemOp : {G_LOAD, G_STORE}) {
+ for (auto Ty : {s8, s16, s32, p0})
+ setAction({MemOp, Ty}, Legal);
+
+ // And everything's fine in addrspace 0.
+ setAction({MemOp, 1, p0}, Legal);
+ }
+
+ // Pointer-handling
+ setAction({G_FRAME_INDEX, p0}, Legal);
+
+ // Constants
+ for (auto Ty : {s8, s16, s32, p0})
+ setAction({TargetOpcode::G_CONSTANT, Ty}, Legal);
+
+ setAction({TargetOpcode::G_CONSTANT, s1}, WidenScalar);
+ setAction({TargetOpcode::G_CONSTANT, s64}, NarrowScalar);
+}
+
+void X86LegalizerInfo::setLegalizerInfo64bit() {
+
+ if (!Subtarget.is64Bit())
+ return;
+
+ const LLT p0 = LLT::pointer(0, TM.getPointerSize() * 8);
+ const LLT s1 = LLT::scalar(1);
+ const LLT s8 = LLT::scalar(8);
+ const LLT s16 = LLT::scalar(16);
+ const LLT s32 = LLT::scalar(32);
+ const LLT s64 = LLT::scalar(64);
+
+ for (unsigned BinOp : {G_ADD, G_SUB})
+ for (auto Ty : {s8, s16, s32, s64})
+ setAction({BinOp, Ty}, Legal);
+
+ for (unsigned MemOp : {G_LOAD, G_STORE}) {
+ for (auto Ty : {s8, s16, s32, s64, p0})
+ setAction({MemOp, Ty}, Legal);
+
+ // And everything's fine in addrspace 0.
+ setAction({MemOp, 1, p0}, Legal);
+ }
+
+ // Pointer-handling
+ setAction({G_FRAME_INDEX, p0}, Legal);
+
+ // Constants
+ for (auto Ty : {s8, s16, s32, s64, p0})
+ setAction({TargetOpcode::G_CONSTANT, Ty}, Legal);
+
+ setAction({TargetOpcode::G_CONSTANT, s1}, WidenScalar);
+}
+
+void X86LegalizerInfo::setLegalizerInfoSSE1() {
+ if (!Subtarget.hasSSE1())
+ return;
+
+ const LLT s32 = LLT::scalar(32);
+ const LLT v4s32 = LLT::vector(4, 32);
+ const LLT v2s64 = LLT::vector(2, 64);
+
+ for (unsigned BinOp : {G_FADD, G_FSUB, G_FMUL, G_FDIV})
+ for (auto Ty : {s32, v4s32})
+ setAction({BinOp, Ty}, Legal);
+
+ for (unsigned MemOp : {G_LOAD, G_STORE})
+ for (auto Ty : {v4s32, v2s64})
+ setAction({MemOp, Ty}, Legal);
+}
+
+void X86LegalizerInfo::setLegalizerInfoSSE2() {
+ if (!Subtarget.hasSSE2())
+ return;
+
+ const LLT s64 = LLT::scalar(64);
+ const LLT v4s32 = LLT::vector(4, 32);
+ const LLT v2s64 = LLT::vector(2, 64);
+
+ for (unsigned BinOp : {G_FADD, G_FSUB, G_FMUL, G_FDIV})
+ for (auto Ty : {s64, v2s64})
+ setAction({BinOp, Ty}, Legal);
+
+ for (unsigned BinOp : {G_ADD, G_SUB})
+ for (auto Ty : {v4s32})
+ setAction({BinOp, Ty}, Legal);
+}
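The whole file is variations on one pattern: declare an LLT, map (opcode, type) pairs to an action, then freeze the tables with computeTables(). A hedged sketch of the same pattern for a hypothetical target, using only the API already visible above (the class name and the rules are illustrative):

#include "llvm/CodeGen/GlobalISel/LegalizerInfo.h"
#include "llvm/Target/TargetOpcodes.h"

namespace {
class ToyLegalizerInfo : public llvm::LegalizerInfo {
public:
  ToyLegalizerInfo() {
    using namespace llvm;
    const LLT s8 = LLT::scalar(8);
    const LLT s16 = LLT::scalar(16);
    const LLT s32 = LLT::scalar(32);
    // 16/32-bit adds are natively selectable; narrower ones get widened.
    for (auto Ty : {s16, s32})
      setAction({TargetOpcode::G_ADD, Ty}, Legal);
    setAction({TargetOpcode::G_ADD, s8}, WidenScalar);
    computeTables(); // Must run once, after all actions are registered.
  }
};
} // end anonymous namespace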
diff --git a/lib/Target/X86/X86LegalizerInfo.h b/lib/Target/X86/X86LegalizerInfo.h
new file mode 100644
index 000000000000..3f00898b4232
--- /dev/null
+++ b/lib/Target/X86/X86LegalizerInfo.h
@@ -0,0 +1,43 @@
+//===- X86LegalizerInfo.h -------------------------------------*- C++ -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+/// \file
+/// This file declares the targeting of the MachineLegalizer class for X86.
+/// \todo This should be generated by TableGen.
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_TARGET_X86_X86MACHINELEGALIZER_H
+#define LLVM_LIB_TARGET_X86_X86MACHINELEGALIZER_H
+
+#include "llvm/CodeGen/GlobalISel/LegalizerInfo.h"
+
+namespace llvm {
+
+class X86Subtarget;
+class X86TargetMachine;
+
+/// This class provides the legalization rules for the X86 target.
+class X86LegalizerInfo : public LegalizerInfo {
+private:
+ /// Keep a reference to the X86Subtarget around so that we can
+ /// make the right decision when generating code for different targets.
+ const X86Subtarget &Subtarget;
+ const X86TargetMachine &TM;
+
+public:
+ X86LegalizerInfo(const X86Subtarget &STI, const X86TargetMachine &TM);
+
+private:
+ void setLegalizerInfo32bit();
+ void setLegalizerInfo64bit();
+ void setLegalizerInfoSSE1();
+ void setLegalizerInfoSSE2();
+};
+} // namespace llvm
+#endif
diff --git a/lib/Target/X86/X86MCInstLower.cpp b/lib/Target/X86/X86MCInstLower.cpp
index feeb2fd5993c..550e3543a71e 100644
--- a/lib/Target/X86/X86MCInstLower.cpp
+++ b/lib/Target/X86/X86MCInstLower.cpp
@@ -102,7 +102,7 @@ void X86AsmPrinter::StackMapShadowTracker::emitShadowPadding(
}
void X86AsmPrinter::EmitAndCountInstruction(MCInst &Inst) {
- OutStreamer->EmitInstruction(Inst, getSubtargetInfo());
+ OutStreamer->EmitInstruction(Inst, getSubtargetInfo(), EnablePrintSchedInfo);
SMShadowTracker.count(Inst, getSubtargetInfo(), CodeEmitter.get());
}
@@ -215,6 +215,7 @@ MCOperand X86MCInstLower::LowerSymbolOperand(const MachineOperand &MO,
case X86II::MO_GOT: RefKind = MCSymbolRefExpr::VK_GOT; break;
case X86II::MO_GOTOFF: RefKind = MCSymbolRefExpr::VK_GOTOFF; break;
case X86II::MO_PLT: RefKind = MCSymbolRefExpr::VK_PLT; break;
+ case X86II::MO_ABS8: RefKind = MCSymbolRefExpr::VK_X86_ABS8; break;
case X86II::MO_PIC_BASE_OFFSET:
case X86II::MO_DARWIN_NONLAZY_PIC_BASE:
Expr = MCSymbolRefExpr::create(Sym, Ctx);
@@ -357,7 +358,7 @@ X86MCInstLower::LowerMachineOperand(const MachineInstr *MI,
const MachineOperand &MO) const {
switch (MO.getType()) {
default:
- MI->dump();
+ MI->print(errs());
llvm_unreachable("unknown operand type");
case MachineOperand::MO_Register:
// Ignore all implicit register operands.
@@ -498,11 +499,16 @@ ReSimplify:
break;
}
- // TAILJMPd, TAILJMPd64 - Lower to the correct jump instruction.
+ // TAILJMPd, TAILJMPd64, TAILJMPd_CC, TAILJMPd64_CC - Lower to the correct
+ // jump instruction.
{ unsigned Opcode;
case X86::TAILJMPr: Opcode = X86::JMP32r; goto SetTailJmpOpcode;
case X86::TAILJMPd:
case X86::TAILJMPd64: Opcode = X86::JMP_1; goto SetTailJmpOpcode;
+ case X86::TAILJMPd_CC:
+ case X86::TAILJMPd64_CC:
+ Opcode = X86::GetCondBranchFromCond(
+ static_cast<X86::CondCode>(MI->getOperand(1).getImm()));
+ goto SetTailJmpOpcode;
SetTailJmpOpcode:
MCOperand Saved = OutMI.getOperand(0);
@@ -888,30 +894,47 @@ void X86AsmPrinter::LowerSTATEPOINT(const MachineInstr &MI,
SM.recordStatepoint(MI);
}
-void X86AsmPrinter::LowerFAULTING_LOAD_OP(const MachineInstr &MI,
- X86MCInstLower &MCIL) {
- // FAULTING_LOAD_OP <def>, <MBB handler>, <load opcode>, <load operands>
+void X86AsmPrinter::LowerFAULTING_OP(const MachineInstr &FaultingMI,
+ X86MCInstLower &MCIL) {
+ // FAULTING_OP <def>, <faulting kind>, <MBB handler>,
+ // <opcode>, <operands>
- unsigned LoadDefRegister = MI.getOperand(0).getReg();
- MCSymbol *HandlerLabel = MI.getOperand(1).getMBB()->getSymbol();
- unsigned LoadOpcode = MI.getOperand(2).getImm();
- unsigned LoadOperandsBeginIdx = 3;
+ unsigned DefRegister = FaultingMI.getOperand(0).getReg();
+ FaultMaps::FaultKind FK =
+ static_cast<FaultMaps::FaultKind>(FaultingMI.getOperand(1).getImm());
+ MCSymbol *HandlerLabel = FaultingMI.getOperand(2).getMBB()->getSymbol();
+ unsigned Opcode = FaultingMI.getOperand(3).getImm();
+ unsigned OperandsBeginIdx = 4;
- FM.recordFaultingOp(FaultMaps::FaultingLoad, HandlerLabel);
+ assert(FK < FaultMaps::FaultKindMax && "Invalid Faulting Kind!");
+ FM.recordFaultingOp(FK, HandlerLabel);
- MCInst LoadMI;
- LoadMI.setOpcode(LoadOpcode);
+ MCInst MI;
+ MI.setOpcode(Opcode);
- if (LoadDefRegister != X86::NoRegister)
- LoadMI.addOperand(MCOperand::createReg(LoadDefRegister));
+ if (DefRegister != X86::NoRegister)
+ MI.addOperand(MCOperand::createReg(DefRegister));
- for (auto I = MI.operands_begin() + LoadOperandsBeginIdx,
- E = MI.operands_end();
+ for (auto I = FaultingMI.operands_begin() + OperandsBeginIdx,
+ E = FaultingMI.operands_end();
I != E; ++I)
- if (auto MaybeOperand = MCIL.LowerMachineOperand(&MI, *I))
- LoadMI.addOperand(MaybeOperand.getValue());
+ if (auto MaybeOperand = MCIL.LowerMachineOperand(&FaultingMI, *I))
+ MI.addOperand(MaybeOperand.getValue());
+
+ OutStreamer->EmitInstruction(MI, getSubtargetInfo());
+}
- OutStreamer->EmitInstruction(LoadMI, getSubtargetInfo());
+void X86AsmPrinter::LowerFENTRY_CALL(const MachineInstr &MI,
+ X86MCInstLower &MCIL) {
+ bool Is64Bits = Subtarget->is64Bit();
+ MCContext &Ctx = OutStreamer->getContext();
+ MCSymbol *fentry = Ctx.getOrCreateSymbol("__fentry__");
+ const MCSymbolRefExpr *Op =
+ MCSymbolRefExpr::create(fentry, MCSymbolRefExpr::VK_None, Ctx);
+
+ EmitAndCountInstruction(
+ MCInstBuilder(Is64Bits ? X86::CALL64pcrel32 : X86::CALLpcrel32)
+ .addExpr(Op));
}
void X86AsmPrinter::LowerPATCHABLE_OP(const MachineInstr &MI,
@@ -1276,9 +1299,11 @@ void X86AsmPrinter::EmitInstruction(const MachineInstr *MI) {
case X86::TAILJMPr:
case X86::TAILJMPm:
case X86::TAILJMPd:
+ case X86::TAILJMPd_CC:
case X86::TAILJMPr64:
case X86::TAILJMPm64:
case X86::TAILJMPd64:
+ case X86::TAILJMPd64_CC:
case X86::TAILJMPr64_REX:
case X86::TAILJMPm64_REX:
// Lower these as normal, but add some comments.
@@ -1367,8 +1392,11 @@ void X86AsmPrinter::EmitInstruction(const MachineInstr *MI) {
case TargetOpcode::STATEPOINT:
return LowerSTATEPOINT(*MI, MCInstLowering);
- case TargetOpcode::FAULTING_LOAD_OP:
- return LowerFAULTING_LOAD_OP(*MI, MCInstLowering);
+ case TargetOpcode::FAULTING_OP:
+ return LowerFAULTING_OP(*MI, MCInstLowering);
+
+ case TargetOpcode::FENTRY_CALL:
+ return LowerFENTRY_CALL(*MI, MCInstLowering);
case TargetOpcode::PATCHABLE_OP:
return LowerPATCHABLE_OP(*MI, MCInstLowering);
@@ -1501,7 +1529,8 @@ void X86AsmPrinter::EmitInstruction(const MachineInstr *MI) {
SmallVector<int, 64> Mask;
DecodePSHUFBMask(C, Mask);
if (!Mask.empty())
- OutStreamer->AddComment(getShuffleComment(MI, SrcIdx, SrcIdx, Mask));
+ OutStreamer->AddComment(getShuffleComment(MI, SrcIdx, SrcIdx, Mask),
+ !EnablePrintSchedInfo);
}
break;
}
@@ -1572,15 +1601,16 @@ void X86AsmPrinter::EmitInstruction(const MachineInstr *MI) {
SmallVector<int, 16> Mask;
DecodeVPERMILPMask(C, ElSize, Mask);
if (!Mask.empty())
- OutStreamer->AddComment(getShuffleComment(MI, SrcIdx, SrcIdx, Mask));
+ OutStreamer->AddComment(getShuffleComment(MI, SrcIdx, SrcIdx, Mask),
+ !EnablePrintSchedInfo);
}
break;
}
case X86::VPERMIL2PDrm:
case X86::VPERMIL2PSrm:
- case X86::VPERMIL2PDrmY:
- case X86::VPERMIL2PSrmY: {
+ case X86::VPERMIL2PDYrm:
+ case X86::VPERMIL2PSYrm: {
if (!OutStreamer->isVerboseAsm())
break;
assert(MI->getNumOperands() >= 8 &&
@@ -1593,8 +1623,8 @@ void X86AsmPrinter::EmitInstruction(const MachineInstr *MI) {
unsigned ElSize;
switch (MI->getOpcode()) {
default: llvm_unreachable("Invalid opcode");
- case X86::VPERMIL2PSrm: case X86::VPERMIL2PSrmY: ElSize = 32; break;
- case X86::VPERMIL2PDrm: case X86::VPERMIL2PDrmY: ElSize = 64; break;
+ case X86::VPERMIL2PSrm: case X86::VPERMIL2PSYrm: ElSize = 32; break;
+ case X86::VPERMIL2PDrm: case X86::VPERMIL2PDYrm: ElSize = 64; break;
}
const MachineOperand &MaskOp = MI->getOperand(6);
@@ -1602,7 +1632,8 @@ void X86AsmPrinter::EmitInstruction(const MachineInstr *MI) {
SmallVector<int, 16> Mask;
DecodeVPERMIL2PMask(C, (unsigned)CtrlOp.getImm(), ElSize, Mask);
if (!Mask.empty())
- OutStreamer->AddComment(getShuffleComment(MI, 1, 2, Mask));
+ OutStreamer->AddComment(getShuffleComment(MI, 1, 2, Mask),
+ !EnablePrintSchedInfo);
}
break;
}
@@ -1618,7 +1649,8 @@ void X86AsmPrinter::EmitInstruction(const MachineInstr *MI) {
SmallVector<int, 16> Mask;
DecodeVPPERMMask(C, Mask);
if (!Mask.empty())
- OutStreamer->AddComment(getShuffleComment(MI, 1, 2, Mask));
+ OutStreamer->AddComment(getShuffleComment(MI, 1, 2, Mask),
+ !EnablePrintSchedInfo);
}
break;
}
@@ -1678,7 +1710,7 @@ void X86AsmPrinter::EmitInstruction(const MachineInstr *MI) {
CS << "?";
}
CS << "]";
- OutStreamer->AddComment(CS.str());
+ OutStreamer->AddComment(CS.str(), !EnablePrintSchedInfo);
} else if (auto *CV = dyn_cast<ConstantVector>(C)) {
CS << "<";
for (int i = 0, NumOperands = CV->getNumOperands(); i < NumOperands; ++i) {
@@ -1710,7 +1742,7 @@ void X86AsmPrinter::EmitInstruction(const MachineInstr *MI) {
}
}
CS << ">";
- OutStreamer->AddComment(CS.str());
+ OutStreamer->AddComment(CS.str(), !EnablePrintSchedInfo);
}
}
break;
diff --git a/lib/Target/X86/X86MachineFunctionInfo.cpp b/lib/Target/X86/X86MachineFunctionInfo.cpp
index c9e636f1eb00..3fcb642424ad 100644
--- a/lib/Target/X86/X86MachineFunctionInfo.cpp
+++ b/lib/Target/X86/X86MachineFunctionInfo.cpp
@@ -9,6 +9,7 @@
#include "X86MachineFunctionInfo.h"
#include "X86RegisterInfo.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
#include "llvm/Target/TargetSubtargetInfo.h"
using namespace llvm;
@@ -20,11 +21,8 @@ void X86MachineFunctionInfo::setRestoreBasePointer(const MachineFunction *MF) {
const X86RegisterInfo *RegInfo = static_cast<const X86RegisterInfo *>(
MF->getSubtarget().getRegisterInfo());
unsigned SlotSize = RegInfo->getSlotSize();
- for (const MCPhysReg *CSR =
- RegInfo->X86RegisterInfo::getCalleeSavedRegs(MF);
- unsigned Reg = *CSR;
- ++CSR)
- {
+ for (const MCPhysReg *CSR = MF->getRegInfo().getCalleeSavedRegs();
+ unsigned Reg = *CSR; ++CSR) {
if (X86::GR64RegClass.contains(Reg) || X86::GR32RegClass.contains(Reg))
RestoreBasePointerOffset -= SlotSize;
}
diff --git a/lib/Target/X86/X86MacroFusion.cpp b/lib/Target/X86/X86MacroFusion.cpp
new file mode 100644
index 000000000000..dd21e2b7c4a1
--- /dev/null
+++ b/lib/Target/X86/X86MacroFusion.cpp
@@ -0,0 +1,271 @@
+//===- X86MacroFusion.cpp - X86 Macro Fusion ------------------------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains the X86 implementation of the DAG scheduling mutation to
+// pair instructions back to back.
+//
+//===----------------------------------------------------------------------===//
+
+#include "X86MacroFusion.h"
+#include "X86Subtarget.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Target/TargetInstrInfo.h"
+
+#define DEBUG_TYPE "misched"
+
+STATISTIC(NumFused, "Number of instr pairs fused");
+
+using namespace llvm;
+
+static cl::opt<bool> EnableMacroFusion("x86-misched-fusion", cl::Hidden,
+ cl::desc("Enable scheduling for macro fusion."), cl::init(true));
+
+namespace {
+
+/// \brief Verify that the instruction pair, First and Second,
+/// should be scheduled back to back. If either instruction is unspecified,
+/// then verify that the other instruction may be part of a pair at all.
+static bool shouldScheduleAdjacent(const X86Subtarget &ST,
+ const MachineInstr *First,
+ const MachineInstr *Second) {
+ // Check if this processor supports macro-fusion. Since this is a minor
+ // heuristic, we haven't specifically reserved a feature. hasAVX is a decent
+ // proxy for SandyBridge+.
+ if (!ST.hasAVX())
+ return false;
+
+ enum {
+ FuseTest,
+ FuseCmp,
+ FuseInc
+ } FuseKind;
+
+ assert((First || Second) && "At least one instr must be specified");
+ unsigned FirstOpcode = First
+ ? First->getOpcode()
+ : static_cast<unsigned>(X86::INSTRUCTION_LIST_END);
+ unsigned SecondOpcode = Second
+ ? Second->getOpcode()
+ : static_cast<unsigned>(X86::INSTRUCTION_LIST_END);
+
+ switch (SecondOpcode) {
+ default:
+ return false;
+ case X86::JE_1:
+ case X86::JNE_1:
+ case X86::JL_1:
+ case X86::JLE_1:
+ case X86::JG_1:
+ case X86::JGE_1:
+ FuseKind = FuseInc;
+ break;
+ case X86::JB_1:
+ case X86::JBE_1:
+ case X86::JA_1:
+ case X86::JAE_1:
+ FuseKind = FuseCmp;
+ break;
+ case X86::JS_1:
+ case X86::JNS_1:
+ case X86::JP_1:
+ case X86::JNP_1:
+ case X86::JO_1:
+ case X86::JNO_1:
+ FuseKind = FuseTest;
+ break;
+ }
+
+ switch (FirstOpcode) {
+ default:
+ return false;
+ case X86::TEST8rr:
+ case X86::TEST16rr:
+ case X86::TEST32rr:
+ case X86::TEST64rr:
+ case X86::TEST8ri:
+ case X86::TEST16ri:
+ case X86::TEST32ri:
+ case X86::TEST32i32:
+ case X86::TEST64i32:
+ case X86::TEST64ri32:
+ case X86::TEST8rm:
+ case X86::TEST16rm:
+ case X86::TEST32rm:
+ case X86::TEST64rm:
+ case X86::TEST8ri_NOREX:
+ case X86::AND16i16:
+ case X86::AND16ri:
+ case X86::AND16ri8:
+ case X86::AND16rm:
+ case X86::AND16rr:
+ case X86::AND32i32:
+ case X86::AND32ri:
+ case X86::AND32ri8:
+ case X86::AND32rm:
+ case X86::AND32rr:
+ case X86::AND64i32:
+ case X86::AND64ri32:
+ case X86::AND64ri8:
+ case X86::AND64rm:
+ case X86::AND64rr:
+ case X86::AND8i8:
+ case X86::AND8ri:
+ case X86::AND8rm:
+ case X86::AND8rr:
+ return true;
+ case X86::CMP16i16:
+ case X86::CMP16ri:
+ case X86::CMP16ri8:
+ case X86::CMP16rm:
+ case X86::CMP16rr:
+ case X86::CMP32i32:
+ case X86::CMP32ri:
+ case X86::CMP32ri8:
+ case X86::CMP32rm:
+ case X86::CMP32rr:
+ case X86::CMP64i32:
+ case X86::CMP64ri32:
+ case X86::CMP64ri8:
+ case X86::CMP64rm:
+ case X86::CMP64rr:
+ case X86::CMP8i8:
+ case X86::CMP8ri:
+ case X86::CMP8rm:
+ case X86::CMP8rr:
+ case X86::ADD16i16:
+ case X86::ADD16ri:
+ case X86::ADD16ri8:
+ case X86::ADD16ri8_DB:
+ case X86::ADD16ri_DB:
+ case X86::ADD16rm:
+ case X86::ADD16rr:
+ case X86::ADD16rr_DB:
+ case X86::ADD32i32:
+ case X86::ADD32ri:
+ case X86::ADD32ri8:
+ case X86::ADD32ri8_DB:
+ case X86::ADD32ri_DB:
+ case X86::ADD32rm:
+ case X86::ADD32rr:
+ case X86::ADD32rr_DB:
+ case X86::ADD64i32:
+ case X86::ADD64ri32:
+ case X86::ADD64ri32_DB:
+ case X86::ADD64ri8:
+ case X86::ADD64ri8_DB:
+ case X86::ADD64rm:
+ case X86::ADD64rr:
+ case X86::ADD64rr_DB:
+ case X86::ADD8i8:
+ case X86::ADD8mi:
+ case X86::ADD8mr:
+ case X86::ADD8ri:
+ case X86::ADD8rm:
+ case X86::ADD8rr:
+ case X86::SUB16i16:
+ case X86::SUB16ri:
+ case X86::SUB16ri8:
+ case X86::SUB16rm:
+ case X86::SUB16rr:
+ case X86::SUB32i32:
+ case X86::SUB32ri:
+ case X86::SUB32ri8:
+ case X86::SUB32rm:
+ case X86::SUB32rr:
+ case X86::SUB64i32:
+ case X86::SUB64ri32:
+ case X86::SUB64ri8:
+ case X86::SUB64rm:
+ case X86::SUB64rr:
+ case X86::SUB8i8:
+ case X86::SUB8ri:
+ case X86::SUB8rm:
+ case X86::SUB8rr:
+ return FuseKind == FuseCmp || FuseKind == FuseInc;
+ case X86::INC16r:
+ case X86::INC32r:
+ case X86::INC64r:
+ case X86::INC8r:
+ case X86::DEC16r:
+ case X86::DEC32r:
+ case X86::DEC64r:
+ case X86::DEC8r:
+ return FuseKind == FuseInc;
+ case X86::INSTRUCTION_LIST_END:
+ return true;
+ }
+}
+
+/// \brief Post-process the DAG to create cluster edges between instructions
+/// that may be fused by the processor into a single operation.
+class X86MacroFusion : public ScheduleDAGMutation {
+public:
+ X86MacroFusion() {}
+
+ void apply(ScheduleDAGInstrs *DAGInstrs) override;
+};
+
+void X86MacroFusion::apply(ScheduleDAGInstrs *DAGInstrs) {
+ ScheduleDAGMI *DAG = static_cast<ScheduleDAGMI*>(DAGInstrs);
+ const X86Subtarget &ST = DAG->MF.getSubtarget<X86Subtarget>();
+
+ // For now, assume targets can only fuse with the branch.
+ SUnit &ExitSU = DAG->ExitSU;
+ MachineInstr *Branch = ExitSU.getInstr();
+ if (!Branch || !shouldScheduleAdjacent(ST, nullptr, Branch))
+ return;
+
+ for (SDep &PredDep : ExitSU.Preds) {
+ if (PredDep.isWeak())
+ continue;
+ SUnit &SU = *PredDep.getSUnit();
+ MachineInstr &Pred = *SU.getInstr();
+ if (!shouldScheduleAdjacent(ST, &Pred, Branch))
+ continue;
+
+ // Create a single weak edge from SU to ExitSU. The only effect is to cause
+ // bottom-up scheduling to heavily prioritize the clustered SU. There is no
+ // need to copy predecessor edges from ExitSU to SU, since top-down
+ // scheduling cannot prioritize ExitSU anyway. To defer top-down scheduling
+ // of SU, we could create an artificial edge from the deepest root, but it
+ // hasn't been needed yet.
+ bool Success = DAG->addEdge(&ExitSU, SDep(&SU, SDep::Cluster));
+ (void)Success;
+ assert(Success && "No DAG nodes should be reachable from ExitSU");
+
+ // Adjust latency of data deps between the nodes.
+ for (SDep &PredDep : ExitSU.Preds)
+ if (PredDep.getSUnit() == &SU)
+ PredDep.setLatency(0);
+ for (SDep &SuccDep : SU.Succs)
+ if (SuccDep.getSUnit() == &ExitSU)
+ SuccDep.setLatency(0);
+
+ ++NumFused;
+ DEBUG(dbgs() << DAG->MF.getName() << "(): Macro fuse ";
+ SU.print(dbgs(), DAG);
+ dbgs() << " - ExitSU"
+ << " / " << DAG->TII->getName(Pred.getOpcode()) << " - "
+ << DAG->TII->getName(Branch->getOpcode()) << '\n';);
+
+ break;
+ }
+}
+
+} // end anonymous namespace
+
+namespace llvm {
+
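+/// Create a DAG mutation that enables X86 macro fusion. May return nullptr
+/// when macro fusion is disabled; addMutation() is expected to treat a null
+/// mutation as a no-op.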
+std::unique_ptr<ScheduleDAGMutation>
+createX86MacroFusionDAGMutation() {
+ return EnableMacroFusion ? make_unique<X86MacroFusion>() : nullptr;
+}
+
+} // end namespace llvm
diff --git a/lib/Target/X86/X86MacroFusion.h b/lib/Target/X86/X86MacroFusion.h
new file mode 100644
index 000000000000..e630f802e8e6
--- /dev/null
+++ b/lib/Target/X86/X86MacroFusion.h
@@ -0,0 +1,30 @@
+//===- X86MacroFusion.h - X86 Macro Fusion --------------------------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains the X86 definition of the DAG scheduling mutation to pair
+// instructions back to back.
+//
+//===----------------------------------------------------------------------===//
+
+#include "X86InstrInfo.h"
+#include "llvm/CodeGen/MachineScheduler.h"
+
+//===----------------------------------------------------------------------===//
+// X86MacroFusion - DAG post-processing to encourage fusion of macro ops.
+//===----------------------------------------------------------------------===//
+
+namespace llvm {
+
+/// Note that you have to add:
+/// DAG.addMutation(createX86MacroFusionDAGMutation());
+/// to X86PassConfig::createMachineScheduler() to have an effect.
+std::unique_ptr<ScheduleDAGMutation>
+createX86MacroFusionDAGMutation();
+
+} // end namespace llvm
diff --git a/lib/Target/X86/X86OptimizeLEAs.cpp b/lib/Target/X86/X86OptimizeLEAs.cpp
index e1447006cd18..debb192732e5 100644
--- a/lib/Target/X86/X86OptimizeLEAs.cpp
+++ b/lib/Target/X86/X86OptimizeLEAs.cpp
@@ -389,9 +389,6 @@ bool OptimizeLEAPass::isReplaceable(const MachineInstr &First,
assert(isLEA(First) && isLEA(Last) &&
"The function works only with LEA instructions");
- // Get new address displacement.
- AddrDispShift = getAddrDispShift(Last, 1, First, 1);
-
// Make sure that LEA def registers belong to the same class. There may be
// instructions (like MOV8mr_NOREX) which allow a limited set of registers to
// be used as their operands, so we must be sure that replacing one LEA
@@ -400,10 +397,13 @@ bool OptimizeLEAPass::isReplaceable(const MachineInstr &First,
MRI->getRegClass(Last.getOperand(0).getReg()))
return false;
+ // Get new address displacement.
+ AddrDispShift = getAddrDispShift(Last, 1, First, 1);
+
// Loop over all uses of the Last LEA to check that its def register is
// used only as address base for memory accesses. If so, it can be
// replaced, otherwise - no.
- for (auto &MO : MRI->use_operands(Last.getOperand(0).getReg())) {
+ for (auto &MO : MRI->use_nodbg_operands(Last.getOperand(0).getReg())) {
MachineInstr &MI = *MO.getParent();
// Get the number of the first memory operand.
@@ -563,8 +563,9 @@ bool OptimizeLEAPass::removeRedundantLEAs(MemOpMap &LEAs) {
// Loop over all uses of the Last LEA and update their operands. Note
// that the correctness of this has already been checked in the
// isReplaceable function.
- for (auto UI = MRI->use_begin(Last.getOperand(0).getReg()),
- UE = MRI->use_end();
+ unsigned LastVReg = Last.getOperand(0).getReg();
+ for (auto UI = MRI->use_nodbg_begin(LastVReg),
+ UE = MRI->use_nodbg_end();
UI != UE;) {
MachineOperand &MO = *UI++;
MachineInstr &MI = *MO.getParent();
@@ -586,6 +587,9 @@ bool OptimizeLEAPass::removeRedundantLEAs(MemOpMap &LEAs) {
Op.setOffset(Op.getOffset() + AddrDispShift);
}
+ // Mark debug values referring to Last LEA as undefined.
+ MRI->markUsesInDebugValueAsUndef(LastVReg);
+
// Since we can possibly extend register lifetime, clear kill flags.
MRI->clearKillFlags(First.getOperand(0).getReg());
@@ -594,7 +598,7 @@ bool OptimizeLEAPass::removeRedundantLEAs(MemOpMap &LEAs) {
// By this moment, all of the Last LEA's uses must be replaced. So we
// can freely remove it.
- assert(MRI->use_empty(Last.getOperand(0).getReg()) &&
+ assert(MRI->use_empty(LastVReg) &&
"The LEA's def register must have no uses");
Last.eraseFromParent();
diff --git a/lib/Target/X86/X86RegisterBankInfo.cpp b/lib/Target/X86/X86RegisterBankInfo.cpp
new file mode 100644
index 000000000000..d395c826e6bf
--- /dev/null
+++ b/lib/Target/X86/X86RegisterBankInfo.cpp
@@ -0,0 +1,243 @@
+//===- X86RegisterBankInfo.cpp -----------------------------------*- C++ -*-==//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+/// \file
+/// This file implements the targeting of the RegisterBankInfo class for X86.
+/// \todo This should be generated by TableGen.
+//===----------------------------------------------------------------------===//
+
+#include "X86RegisterBankInfo.h"
+#include "X86InstrInfo.h"
+#include "llvm/CodeGen/GlobalISel/RegisterBank.h"
+#include "llvm/CodeGen/GlobalISel/RegisterBankInfo.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/Target/TargetRegisterInfo.h"
+
+#define GET_TARGET_REGBANK_IMPL
+#include "X86GenRegisterBank.inc"
+
+using namespace llvm;
+// This file will be TableGen'ed at some point.
+#define GET_TARGET_REGBANK_INFO_IMPL
+#include "X86GenRegisterBankInfo.def"
+
+#ifndef LLVM_BUILD_GLOBAL_ISEL
+#error "You shouldn't build this"
+#endif
+
+X86RegisterBankInfo::X86RegisterBankInfo(const TargetRegisterInfo &TRI)
+ : X86GenRegisterBankInfo() {
+
+ // Validate RegBank initialization.
+ const RegisterBank &RBGPR = getRegBank(X86::GPRRegBankID);
+ (void)RBGPR;
+ assert(&X86::GPRRegBank == &RBGPR && "Incorrect RegBank initialization.");
+
+ // The GPR register bank is fully defined by all the registers in
+ // GR64 + its subclasses.
+ assert(RBGPR.covers(*TRI.getRegClass(X86::GR64RegClassID)) &&
+ "Subclass not added?");
+ assert(RBGPR.getSize() == 64 && "GPRs should hold up to 64-bit");
+}
+
+const RegisterBank &X86RegisterBankInfo::getRegBankFromRegClass(
+ const TargetRegisterClass &RC) const {
+
+ if (X86::GR8RegClass.hasSubClassEq(&RC) ||
+ X86::GR16RegClass.hasSubClassEq(&RC) ||
+ X86::GR32RegClass.hasSubClassEq(&RC) ||
+ X86::GR64RegClass.hasSubClassEq(&RC))
+ return getRegBank(X86::GPRRegBankID);
+
+ if (X86::FR32XRegClass.hasSubClassEq(&RC) ||
+ X86::FR64XRegClass.hasSubClassEq(&RC) ||
+ X86::VR128XRegClass.hasSubClassEq(&RC) ||
+ X86::VR256XRegClass.hasSubClassEq(&RC) ||
+ X86::VR512RegClass.hasSubClassEq(&RC))
+ return getRegBank(X86::VECRRegBankID);
+
+ llvm_unreachable("Unsupported register kind yet.");
+}
+
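+// Map a value of type Ty onto a partial-mapping index: scalar integers and
+// pointers select a GPR mapping by bit width, FP scalars an FP mapping, and
+// vectors a VEC mapping.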
+X86GenRegisterBankInfo::PartialMappingIdx
+X86GenRegisterBankInfo::getPartialMappingIdx(const LLT &Ty, bool isFP) {
+ if ((Ty.isScalar() && !isFP) || Ty.isPointer()) {
+ switch (Ty.getSizeInBits()) {
+ case 8:
+ return PMI_GPR8;
+ case 16:
+ return PMI_GPR16;
+ case 32:
+ return PMI_GPR32;
+ case 64:
+ return PMI_GPR64;
+ default:
+ llvm_unreachable("Unsupported register size.");
+ }
+ } else if (Ty.isScalar()) {
+ switch (Ty.getSizeInBits()) {
+ case 32:
+ return PMI_FP32;
+ case 64:
+ return PMI_FP64;
+ default:
+ llvm_unreachable("Unsupported register size.");
+ }
+ } else {
+ switch (Ty.getSizeInBits()) {
+ case 128:
+ return PMI_VEC128;
+ case 256:
+ return PMI_VEC256;
+ case 512:
+ return PMI_VEC512;
+ default:
+ llvm_unreachable("Unsupported register size.");
+ }
+ }
+
+ return PMI_None;
+}
+
+void X86RegisterBankInfo::getInstrPartialMappingIdxs(
+ const MachineInstr &MI, const MachineRegisterInfo &MRI, const bool isFP,
+ SmallVectorImpl<PartialMappingIdx> &OpRegBankIdx) {
+
+ unsigned NumOperands = MI.getNumOperands();
+ for (unsigned Idx = 0; Idx < NumOperands; ++Idx) {
+ auto &MO = MI.getOperand(Idx);
+ if (!MO.isReg())
+ OpRegBankIdx[Idx] = PMI_None;
+ else
+ OpRegBankIdx[Idx] = getPartialMappingIdx(MRI.getType(MO.getReg()), isFP);
+ }
+}
+
+bool X86RegisterBankInfo::getInstrValueMapping(
+ const MachineInstr &MI,
+ const SmallVectorImpl<PartialMappingIdx> &OpRegBankIdx,
+ SmallVectorImpl<const ValueMapping *> &OpdsMapping) {
+
+ unsigned NumOperands = MI.getNumOperands();
+ for (unsigned Idx = 0; Idx < NumOperands; ++Idx) {
+ if (!MI.getOperand(Idx).isReg())
+ continue;
+
+ auto Mapping = getValueMapping(OpRegBankIdx[Idx], 1);
+ if (!Mapping->isValid())
+ return false;
+
+ OpdsMapping[Idx] = Mapping;
+ }
+ return true;
+}
+
+RegisterBankInfo::InstructionMapping
+X86RegisterBankInfo::getSameOperandsMapping(const MachineInstr &MI, bool isFP) {
+ const MachineFunction &MF = *MI.getParent()->getParent();
+ const MachineRegisterInfo &MRI = MF.getRegInfo();
+
+ unsigned NumOperands = MI.getNumOperands();
+ LLT Ty = MRI.getType(MI.getOperand(0).getReg());
+
+ if (NumOperands != 3 || (Ty != MRI.getType(MI.getOperand(1).getReg())) ||
+ (Ty != MRI.getType(MI.getOperand(2).getReg())))
+ llvm_unreachable("Unsupported operand mapping yet.");
+
+ auto Mapping = getValueMapping(getPartialMappingIdx(Ty, isFP), 3);
+ return InstructionMapping{DefaultMappingID, 1, Mapping, NumOperands};
+}
+
+RegisterBankInfo::InstructionMapping
+X86RegisterBankInfo::getInstrMapping(const MachineInstr &MI) const {
+ const MachineFunction &MF = *MI.getParent()->getParent();
+ const MachineRegisterInfo &MRI = MF.getRegInfo();
+ auto Opc = MI.getOpcode();
+
+ // Try the default logic for non-generic instructions that are either copies
+ // or already have some operands assigned to banks.
+ if (!isPreISelGenericOpcode(Opc)) {
+ InstructionMapping Mapping = getInstrMappingImpl(MI);
+ if (Mapping.isValid())
+ return Mapping;
+ }
+
+ switch (Opc) {
+ case TargetOpcode::G_ADD:
+ case TargetOpcode::G_SUB:
+ return getSameOperandsMapping(MI, false);
+ case TargetOpcode::G_FADD:
+ case TargetOpcode::G_FSUB:
+ case TargetOpcode::G_FMUL:
+ case TargetOpcode::G_FDIV:
+ return getSameOperandsMapping(MI, true);
+ default:
+ break;
+ }
+
+ unsigned NumOperands = MI.getNumOperands();
+
+ // Track the bank of each register; use the NotFP mapping (all scalars in GPRs).
+ SmallVector<PartialMappingIdx, 4> OpRegBankIdx(NumOperands);
+ getInstrPartialMappingIdxs(MI, MRI, /* isFP */ false, OpRegBankIdx);
+
+ // Finally construct the computed mapping.
+ SmallVector<const ValueMapping *, 8> OpdsMapping(NumOperands);
+ if (!getInstrValueMapping(MI, OpRegBankIdx, OpdsMapping))
+ return InstructionMapping();
+
+ return InstructionMapping{DefaultMappingID, /* Cost */ 1,
+ getOperandsMapping(OpdsMapping), NumOperands};
+}
+
+void X86RegisterBankInfo::applyMappingImpl(
+ const OperandsMapper &OpdMapper) const {
+ return applyDefaultMapping(OpdMapper);
+}
+
+RegisterBankInfo::InstructionMappings
+X86RegisterBankInfo::getInstrAlternativeMappings(const MachineInstr &MI) const {
+
+ const MachineFunction &MF = *MI.getParent()->getParent();
+ const TargetSubtargetInfo &STI = MF.getSubtarget();
+ const TargetRegisterInfo &TRI = *STI.getRegisterInfo();
+ const MachineRegisterInfo &MRI = MF.getRegInfo();
+
+ switch (MI.getOpcode()) {
+ case TargetOpcode::G_LOAD:
+ case TargetOpcode::G_STORE: {
+ // We are going to try to map 32/64-bit loads/stores to PMI_FP32/PMI_FP64.
+ unsigned Size = getSizeInBits(MI.getOperand(0).getReg(), MRI, TRI);
+ if (Size != 32 && Size != 64)
+ break;
+
+ unsigned NumOperands = MI.getNumOperands();
+
+ // Track the bank of each register; use the FP mapping (all scalars in VEC).
+ SmallVector<PartialMappingIdx, 4> OpRegBankIdx(NumOperands);
+ getInstrPartialMappingIdxs(MI, MRI, /* isFP */ true, OpRegBankIdx);
+
+ // Finally construct the computed mapping.
+ SmallVector<const ValueMapping *, 8> OpdsMapping(NumOperands);
+ if (!getInstrValueMapping(MI, OpRegBankIdx, OpdsMapping))
+ break;
+
+ RegisterBankInfo::InstructionMapping Mapping = InstructionMapping{
+ /*ID*/ 1, /*Cost*/ 1, getOperandsMapping(OpdsMapping), NumOperands};
+ InstructionMappings AltMappings;
+ AltMappings.emplace_back(std::move(Mapping));
+ return AltMappings;
+ }
+ default:
+ break;
+ }
+ return RegisterBankInfo::getInstrAlternativeMappings(MI);
+}
diff --git a/lib/Target/X86/X86RegisterBankInfo.h b/lib/Target/X86/X86RegisterBankInfo.h
new file mode 100644
index 000000000000..a1e01a9ab949
--- /dev/null
+++ b/lib/Target/X86/X86RegisterBankInfo.h
@@ -0,0 +1,81 @@
+//===- X86RegisterBankInfo ---------------------------------------*- C++ -*-==//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+/// \file
+/// This file declares the targeting of the RegisterBankInfo class for X86.
+/// \todo This should be generated by TableGen.
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_TARGET_X86_X86REGISTERBANKINFO_H
+#define LLVM_LIB_TARGET_X86_X86REGISTERBANKINFO_H
+
+#include "llvm/CodeGen/GlobalISel/RegisterBankInfo.h"
+
+#define GET_REGBANK_DECLARATIONS
+#include "X86GenRegisterBank.inc"
+
+namespace llvm {
+
+class LLT;
+
+class X86GenRegisterBankInfo : public RegisterBankInfo {
+protected:
+#define GET_TARGET_REGBANK_CLASS
+#include "X86GenRegisterBank.inc"
+#define GET_TARGET_REGBANK_INFO_CLASS
+#include "X86GenRegisterBankInfo.def"
+
+ static RegisterBankInfo::PartialMapping PartMappings[];
+ static RegisterBankInfo::ValueMapping ValMappings[];
+
+ static PartialMappingIdx getPartialMappingIdx(const LLT &Ty, bool isFP);
+ static const RegisterBankInfo::ValueMapping *
+ getValueMapping(PartialMappingIdx Idx, unsigned NumOperands);
+};
+
+class TargetRegisterInfo;
+
+/// This class provides the information for the target register banks.
+class X86RegisterBankInfo final : public X86GenRegisterBankInfo {
+private:
+ /// Get an instruction mapping.
+ /// \return An InstructionMappings with a statically allocated
+ /// OperandsMapping.
+ static InstructionMapping getSameOperandsMapping(const MachineInstr &MI,
+ bool isFP);
+
+ /// Track the bank of each instruction operand (register).
+ static void
+ getInstrPartialMappingIdxs(const MachineInstr &MI,
+ const MachineRegisterInfo &MRI, const bool isFP,
+ SmallVectorImpl<PartialMappingIdx> &OpRegBankIdx);
+
+ /// Construct the instruction ValueMapping from PartialMappingIdxs
+ /// \return true if mapping succeeded.
+ static bool
+ getInstrValueMapping(const MachineInstr &MI,
+ const SmallVectorImpl<PartialMappingIdx> &OpRegBankIdx,
+ SmallVectorImpl<const ValueMapping *> &OpdsMapping);
+
+public:
+ X86RegisterBankInfo(const TargetRegisterInfo &TRI);
+
+ const RegisterBank &
+ getRegBankFromRegClass(const TargetRegisterClass &RC) const override;
+
+ InstructionMappings
+ getInstrAlternativeMappings(const MachineInstr &MI) const override;
+
+ /// See RegisterBankInfo::applyMapping.
+ void applyMappingImpl(const OperandsMapper &OpdMapper) const override;
+
+ InstructionMapping getInstrMapping(const MachineInstr &MI) const override;
+};
+
+} // end namespace llvm
+
+#endif // LLVM_LIB_TARGET_X86_X86REGISTERBANKINFO_H
diff --git a/lib/Target/X86/X86RegisterBanks.td b/lib/Target/X86/X86RegisterBanks.td
new file mode 100644
index 000000000000..6d17cd53a0c1
--- /dev/null
+++ b/lib/Target/X86/X86RegisterBanks.td
@@ -0,0 +1,17 @@
+//=- X86RegisterBanks.td - Describe the X86 Register Banks ----*- tablegen -*-=//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file describes the register banks used by the X86 GlobalISel backend.
+//
+//===----------------------------------------------------------------------===//
+
+/// General Purpose Registers: RAX, RCX,...
+def GPRRegBank : RegisterBank<"GPR", [GR64]>;
+
+/// Floating Point/Vector Registers
+def VECRRegBank : RegisterBank<"VECR", [VR512]>;
diff --git a/lib/Target/X86/X86RegisterInfo.cpp b/lib/Target/X86/X86RegisterInfo.cpp
index 65f438f94b04..9bab9a4cf3ba 100644
--- a/lib/Target/X86/X86RegisterInfo.cpp
+++ b/lib/Target/X86/X86RegisterInfo.cpp
@@ -80,7 +80,7 @@ X86RegisterInfo::X86RegisterInfo(const Triple &TT)
bool
X86RegisterInfo::trackLivenessAfterRegAlloc(const MachineFunction &MF) const {
- // ExeDepsFixer and PostRAScheduler require liveness.
+ // ExecutionDepsFixer and PostRAScheduler require liveness.
return true;
}
@@ -337,7 +337,9 @@ X86RegisterInfo::getCalleeSavedRegs(const MachineFunction *MF) const {
return CSR_64_AllRegs_AVX512_SaveList;
if (HasAVX)
return CSR_64_AllRegs_AVX_SaveList;
- return CSR_64_AllRegs_SaveList;
+ if (HasSSE)
+ return CSR_64_AllRegs_SaveList;
+ return CSR_64_AllRegs_NoSSE_SaveList;
} else {
if (HasAVX512)
return CSR_32_AllRegs_AVX512_SaveList;
@@ -447,7 +449,9 @@ X86RegisterInfo::getCallPreservedMask(const MachineFunction &MF,
return CSR_64_AllRegs_AVX512_RegMask;
if (HasAVX)
return CSR_64_AllRegs_AVX_RegMask;
- return CSR_64_AllRegs_RegMask;
+ if (HasSSE)
+ return CSR_64_AllRegs_RegMask;
+ return CSR_64_AllRegs_NoSSE_RegMask;
} else {
if (HasAVX512)
return CSR_32_AllRegs_AVX512_RegMask;
diff --git a/lib/Target/X86/X86RegisterInfo.td b/lib/Target/X86/X86RegisterInfo.td
index 372a15aff15a..c177ba1d52f7 100644
--- a/lib/Target/X86/X86RegisterInfo.td
+++ b/lib/Target/X86/X86RegisterInfo.td
@@ -189,22 +189,22 @@ def XMM13: X86Reg<"xmm13", 13>, DwarfRegNum<[30, -2, -2]>;
def XMM14: X86Reg<"xmm14", 14>, DwarfRegNum<[31, -2, -2]>;
def XMM15: X86Reg<"xmm15", 15>, DwarfRegNum<[32, -2, -2]>;
-def XMM16: X86Reg<"xmm16", 16>, DwarfRegNum<[60, -2, -2]>;
-def XMM17: X86Reg<"xmm17", 17>, DwarfRegNum<[61, -2, -2]>;
-def XMM18: X86Reg<"xmm18", 18>, DwarfRegNum<[62, -2, -2]>;
-def XMM19: X86Reg<"xmm19", 19>, DwarfRegNum<[63, -2, -2]>;
-def XMM20: X86Reg<"xmm20", 20>, DwarfRegNum<[64, -2, -2]>;
-def XMM21: X86Reg<"xmm21", 21>, DwarfRegNum<[65, -2, -2]>;
-def XMM22: X86Reg<"xmm22", 22>, DwarfRegNum<[66, -2, -2]>;
-def XMM23: X86Reg<"xmm23", 23>, DwarfRegNum<[67, -2, -2]>;
-def XMM24: X86Reg<"xmm24", 24>, DwarfRegNum<[68, -2, -2]>;
-def XMM25: X86Reg<"xmm25", 25>, DwarfRegNum<[69, -2, -2]>;
-def XMM26: X86Reg<"xmm26", 26>, DwarfRegNum<[70, -2, -2]>;
-def XMM27: X86Reg<"xmm27", 27>, DwarfRegNum<[71, -2, -2]>;
-def XMM28: X86Reg<"xmm28", 28>, DwarfRegNum<[72, -2, -2]>;
-def XMM29: X86Reg<"xmm29", 29>, DwarfRegNum<[73, -2, -2]>;
-def XMM30: X86Reg<"xmm30", 30>, DwarfRegNum<[74, -2, -2]>;
-def XMM31: X86Reg<"xmm31", 31>, DwarfRegNum<[75, -2, -2]>;
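+// Per the x86-64 psABI, XMM16-XMM31 are assigned DWARF register numbers 67-82.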
+def XMM16: X86Reg<"xmm16", 16>, DwarfRegNum<[67, -2, -2]>;
+def XMM17: X86Reg<"xmm17", 17>, DwarfRegNum<[68, -2, -2]>;
+def XMM18: X86Reg<"xmm18", 18>, DwarfRegNum<[69, -2, -2]>;
+def XMM19: X86Reg<"xmm19", 19>, DwarfRegNum<[70, -2, -2]>;
+def XMM20: X86Reg<"xmm20", 20>, DwarfRegNum<[71, -2, -2]>;
+def XMM21: X86Reg<"xmm21", 21>, DwarfRegNum<[72, -2, -2]>;
+def XMM22: X86Reg<"xmm22", 22>, DwarfRegNum<[73, -2, -2]>;
+def XMM23: X86Reg<"xmm23", 23>, DwarfRegNum<[74, -2, -2]>;
+def XMM24: X86Reg<"xmm24", 24>, DwarfRegNum<[75, -2, -2]>;
+def XMM25: X86Reg<"xmm25", 25>, DwarfRegNum<[76, -2, -2]>;
+def XMM26: X86Reg<"xmm26", 26>, DwarfRegNum<[77, -2, -2]>;
+def XMM27: X86Reg<"xmm27", 27>, DwarfRegNum<[78, -2, -2]>;
+def XMM28: X86Reg<"xmm28", 28>, DwarfRegNum<[79, -2, -2]>;
+def XMM29: X86Reg<"xmm29", 29>, DwarfRegNum<[80, -2, -2]>;
+def XMM30: X86Reg<"xmm30", 30>, DwarfRegNum<[81, -2, -2]>;
+def XMM31: X86Reg<"xmm31", 31>, DwarfRegNum<[82, -2, -2]>;
} // CostPerUse
@@ -437,8 +437,10 @@ def LOW32_ADDR_ACCESS : RegisterClass<"X86", [i32], 32, (add GR32, RIP)>;
def LOW32_ADDR_ACCESS_RBP : RegisterClass<"X86", [i32], 32,
(add LOW32_ADDR_ACCESS, RBP)>;
-// A class to support the 'A' assembler constraint: EAX then EDX.
+// A class to support the 'A' assembler constraint: [ER]AX then [ER]DX.
+def GR16_AD : RegisterClass<"X86", [i16], 16, (add AX, DX)>;
def GR32_AD : RegisterClass<"X86", [i32], 32, (add EAX, EDX)>;
+def GR64_AD : RegisterClass<"X86", [i64], 64, (add RAX, RDX)>;
// Scalar SSE2 floating point registers.
def FR32 : RegisterClass<"X86", [f32], 32, (sequence "XMM%u", 0, 15)>;
diff --git a/lib/Target/X86/X86Schedule.td b/lib/Target/X86/X86Schedule.td
index 35257f89100c..7f7efd7cad3f 100644
--- a/lib/Target/X86/X86Schedule.td
+++ b/lib/Target/X86/X86Schedule.td
@@ -366,6 +366,7 @@ def IIC_SSE_MWAIT : InstrItinClass;
def IIC_SSE_MONITOR : InstrItinClass;
def IIC_SSE_MWAITX : InstrItinClass;
def IIC_SSE_MONITORX : InstrItinClass;
+def IIC_SSE_CLZERO : InstrItinClass;
def IIC_SSE_PREFETCH : InstrItinClass;
def IIC_SSE_PAUSE : InstrItinClass;
diff --git a/lib/Target/X86/X86SelectionDAGInfo.cpp b/lib/Target/X86/X86SelectionDAGInfo.cpp
index f031a281e5dd..9da8a18965ea 100644
--- a/lib/Target/X86/X86SelectionDAGInfo.cpp
+++ b/lib/Target/X86/X86SelectionDAGInfo.cpp
@@ -85,10 +85,12 @@ SDValue X86SelectionDAGInfo::EmitTargetCodeForMemset(
Args.push_back(Entry);
TargetLowering::CallLoweringInfo CLI(DAG);
- CLI.setDebugLoc(dl).setChain(Chain)
- .setCallee(CallingConv::C, Type::getVoidTy(*DAG.getContext()),
- DAG.getExternalSymbol(bzeroEntry, IntPtr), std::move(Args))
- .setDiscardResult();
+ CLI.setDebugLoc(dl)
+ .setChain(Chain)
+ .setLibCallee(CallingConv::C, Type::getVoidTy(*DAG.getContext()),
+ DAG.getExternalSymbol(bzeroEntry, IntPtr),
+ std::move(Args))
+ .setDiscardResult();
std::pair<SDValue,SDValue> CallResult = TLI.LowerCallTo(CLI);
return CallResult.second;
diff --git a/lib/Target/X86/X86ShuffleDecodeConstantPool.cpp b/lib/Target/X86/X86ShuffleDecodeConstantPool.cpp
index 11115524c810..2cebb76022ef 100644
--- a/lib/Target/X86/X86ShuffleDecodeConstantPool.cpp
+++ b/lib/Target/X86/X86ShuffleDecodeConstantPool.cpp
@@ -14,7 +14,7 @@
#include "X86ShuffleDecodeConstantPool.h"
#include "Utils/X86ShuffleDecode.h"
-#include "llvm/ADT/SmallBitVector.h"
+#include "llvm/ADT/APInt.h"
#include "llvm/CodeGen/MachineValueType.h"
#include "llvm/IR/Constants.h"
@@ -25,7 +25,7 @@
namespace llvm {
static bool extractConstantMask(const Constant *C, unsigned MaskEltSizeInBits,
- SmallBitVector &UndefElts,
+ APInt &UndefElts,
SmallVectorImpl<uint64_t> &RawMask) {
// It is not an error for shuffle masks to not be a vector of
// MaskEltSizeInBits because the constant pool uniques constants by their
@@ -49,6 +49,33 @@ static bool extractConstantMask(const Constant *C, unsigned MaskEltSizeInBits,
unsigned CstEltSizeInBits = CstTy->getScalarSizeInBits();
unsigned NumCstElts = CstTy->getVectorNumElements();
+ assert((CstSizeInBits % MaskEltSizeInBits) == 0 &&
+ "Unaligned shuffle mask size");
+
+ unsigned NumMaskElts = CstSizeInBits / MaskEltSizeInBits;
+ UndefElts = APInt(NumMaskElts, 0);
+ RawMask.resize(NumMaskElts, 0);
+
+ // Fast path - if the constants match the mask size then copy direct.
+ if (MaskEltSizeInBits == CstEltSizeInBits) {
+ assert(NumCstElts == NumMaskElts && "Unaligned shuffle mask size");
+ for (unsigned i = 0; i != NumMaskElts; ++i) {
+ Constant *COp = C->getAggregateElement(i);
+ if (!COp || (!isa<UndefValue>(COp) && !isa<ConstantInt>(COp)))
+ return false;
+
+ if (isa<UndefValue>(COp)) {
+ UndefElts.setBit(i);
+ RawMask[i] = 0;
+ continue;
+ }
+
+ auto *Elt = cast<ConstantInt>(COp);
+ RawMask[i] = Elt->getValue().getZExtValue();
+ }
+ return true;
+ }
+
// Extract all the undef/constant element data and pack into single bitsets.
APInt UndefBits(CstSizeInBits, 0);
APInt MaskBits(CstSizeInBits, 0);
@@ -57,39 +84,30 @@ static bool extractConstantMask(const Constant *C, unsigned MaskEltSizeInBits,
if (!COp || (!isa<UndefValue>(COp) && !isa<ConstantInt>(COp)))
return false;
+ unsigned BitOffset = i * CstEltSizeInBits;
+
if (isa<UndefValue>(COp)) {
- APInt EltUndef = APInt::getLowBitsSet(CstSizeInBits, CstEltSizeInBits);
- UndefBits |= EltUndef.shl(i * CstEltSizeInBits);
+ UndefBits.setBits(BitOffset, BitOffset + CstEltSizeInBits);
continue;
}
- APInt EltBits = cast<ConstantInt>(COp)->getValue();
- EltBits = EltBits.zextOrTrunc(CstSizeInBits);
- MaskBits |= EltBits.shl(i * CstEltSizeInBits);
+ MaskBits.insertBits(cast<ConstantInt>(COp)->getValue(), BitOffset);
}
// Now extract the undef/constant bit data into the raw shuffle masks.
- assert((CstSizeInBits % MaskEltSizeInBits) == 0 &&
- "Unaligned shuffle mask size");
-
- unsigned NumMaskElts = CstSizeInBits / MaskEltSizeInBits;
- UndefElts = SmallBitVector(NumMaskElts, false);
- RawMask.resize(NumMaskElts, 0);
-
for (unsigned i = 0; i != NumMaskElts; ++i) {
- APInt EltUndef = UndefBits.lshr(i * MaskEltSizeInBits);
- EltUndef = EltUndef.zextOrTrunc(MaskEltSizeInBits);
+ unsigned BitOffset = i * MaskEltSizeInBits;
+ APInt EltUndef = UndefBits.extractBits(MaskEltSizeInBits, BitOffset);
// Only treat the element as UNDEF if all bits are UNDEF, otherwise
// treat it as zero.
if (EltUndef.isAllOnesValue()) {
- UndefElts[i] = true;
+ UndefElts.setBit(i);
RawMask[i] = 0;
continue;
}
- APInt EltBits = MaskBits.lshr(i * MaskEltSizeInBits);
- EltBits = EltBits.zextOrTrunc(MaskEltSizeInBits);
+ APInt EltBits = MaskBits.extractBits(MaskEltSizeInBits, BitOffset);
RawMask[i] = EltBits.getZExtValue();
}
@@ -104,8 +122,8 @@ void DecodePSHUFBMask(const Constant *C, SmallVectorImpl<int> &ShuffleMask) {
"Unexpected vector size.");
// The shuffle mask requires a byte vector.
- SmallBitVector UndefElts;
- SmallVector<uint64_t, 32> RawMask;
+ APInt UndefElts;
+ SmallVector<uint64_t, 64> RawMask;
if (!extractConstantMask(C, 8, UndefElts, RawMask))
return;
@@ -145,8 +163,8 @@ void DecodeVPERMILPMask(const Constant *C, unsigned ElSize,
assert((ElSize == 32 || ElSize == 64) && "Unexpected vector element size.");
// The shuffle mask requires elements the same size as the target.
- SmallBitVector UndefElts;
- SmallVector<uint64_t, 8> RawMask;
+ APInt UndefElts;
+ SmallVector<uint64_t, 16> RawMask;
if (!extractConstantMask(C, ElSize, UndefElts, RawMask))
return;
@@ -180,7 +198,7 @@ void DecodeVPERMIL2PMask(const Constant *C, unsigned M2Z, unsigned ElSize,
assert((MaskTySize == 128 || MaskTySize == 256) && "Unexpected vector size.");
// The shuffle mask requires elements the same size as the target.
- SmallBitVector UndefElts;
+ APInt UndefElts;
SmallVector<uint64_t, 8> RawMask;
if (!extractConstantMask(C, ElSize, UndefElts, RawMask))
return;
@@ -231,8 +249,8 @@ void DecodeVPPERMMask(const Constant *C, SmallVectorImpl<int> &ShuffleMask) {
"Unexpected vector size.");
// The shuffle mask requires a byte vector.
- SmallBitVector UndefElts;
- SmallVector<uint64_t, 32> RawMask;
+ APInt UndefElts;
+ SmallVector<uint64_t, 16> RawMask;
if (!extractConstantMask(C, 8, UndefElts, RawMask))
return;
@@ -286,8 +304,8 @@ void DecodeVPERMVMask(const Constant *C, unsigned ElSize,
"Unexpected vector element size.");
// The shuffle mask requires elements the same size as the target.
- SmallBitVector UndefElts;
- SmallVector<uint64_t, 8> RawMask;
+ APInt UndefElts;
+ SmallVector<uint64_t, 64> RawMask;
if (!extractConstantMask(C, ElSize, UndefElts, RawMask))
return;
@@ -314,8 +332,8 @@ void DecodeVPERMV3Mask(const Constant *C, unsigned ElSize,
"Unexpected vector element size.");
// The shuffle mask requires elements the same size as the target.
- SmallBitVector UndefElts;
- SmallVector<uint64_t, 8> RawMask;
+ APInt UndefElts;
+ SmallVector<uint64_t, 64> RawMask;
if (!extractConstantMask(C, ElSize, UndefElts, RawMask))
return;
diff --git a/lib/Target/X86/X86Subtarget.cpp b/lib/Target/X86/X86Subtarget.cpp
index 586bb7bd7b1a..92a68759195c 100644
--- a/lib/Target/X86/X86Subtarget.cpp
+++ b/lib/Target/X86/X86Subtarget.cpp
@@ -11,19 +11,23 @@
//
//===----------------------------------------------------------------------===//
+#include "MCTargetDesc/X86BaseInfo.h"
#include "X86Subtarget.h"
-#include "X86InstrInfo.h"
#include "X86TargetMachine.h"
+#include "llvm/ADT/Triple.h"
#include "llvm/IR/Attributes.h"
+#include "llvm/IR/ConstantRange.h"
#include "llvm/IR/Function.h"
#include "llvm/IR/GlobalValue.h"
+#include "llvm/Support/Casting.h"
+#include "llvm/Support/CodeGen.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/ErrorHandling.h"
-#include "llvm/Support/Host.h"
#include "llvm/Support/raw_ostream.h"
#include "llvm/Target/TargetMachine.h"
-#include "llvm/Target/TargetOptions.h"
+#include <cassert>
+#include <string>
#if defined(_MSC_VER)
#include <intrin.h>
@@ -93,8 +97,17 @@ unsigned char X86Subtarget::classifyGlobalReference(const GlobalValue *GV,
return X86II::MO_NO_FLAG;
// Absolute symbols can be referenced directly.
- if (GV && GV->isAbsoluteSymbolRef())
- return X86II::MO_NO_FLAG;
+ if (GV) {
+ if (Optional<ConstantRange> CR = GV->getAbsoluteSymbolRange()) {
+ // See if we can use the 8-bit immediate form. Note that some instructions
+ // will sign extend the immediate operand, so to be conservative we only
+ // accept the range [0,128).
+ if (CR->getUnsignedMax().ult(128))
+ return X86II::MO_ABS8;
+ return X86II::MO_NO_FLAG;
+ }
+ }
if (TM.shouldAssumeDSOLocal(M, GV))
return classifyLocalReference(GV);
@@ -195,7 +208,6 @@ void X86Subtarget::initSubtargetFeatures(StringRef CPU, StringRef FS) {
FullFS = "+sahf";
}
-
// Parse features string and set the CPU.
ParseSubtargetFeatures(CPUName, FullFS);
@@ -263,7 +275,6 @@ void X86Subtarget::initializeEnvironment() {
HasVBMI = false;
HasIFMA = false;
HasRTM = false;
- HasHLE = false;
HasERI = false;
HasCDI = false;
HasPFI = false;
@@ -277,6 +288,7 @@ void X86Subtarget::initializeEnvironment() {
HasRDSEED = false;
HasLAHFSAHF = false;
HasMWAITX = false;
+ HasCLZERO = false;
HasMPX = false;
IsBTMemSlow = false;
IsPMULLDSlow = false;
@@ -286,10 +298,11 @@ void X86Subtarget::initializeEnvironment() {
HasSSEUnalignedMem = false;
HasCmpxchg16b = false;
UseLeaForSP = false;
- HasFastPartialYMMWrite = false;
+ HasFastPartialYMMorZMMWrite = false;
HasFastScalarFSQRT = false;
HasFastVectorFSQRT = false;
HasFastLZCNT = false;
+ HasFastSHLDRotate = false;
HasSlowDivide32 = false;
HasSlowDivide64 = false;
PadShortFunctions = false;
@@ -321,7 +334,7 @@ X86Subtarget::X86Subtarget(const Triple &TT, StringRef CPU, StringRef FS,
TargetTriple.getEnvironment() != Triple::CODE16),
In16BitMode(TargetTriple.getArch() == Triple::x86 &&
TargetTriple.getEnvironment() == Triple::CODE16),
- TSInfo(), InstrInfo(initializeSubtargetDependencies(CPU, FS)),
+ InstrInfo(initializeSubtargetDependencies(CPU, FS)),
TLInfo(TM, *this), FrameLowering(*this, getStackAlignment()) {
// Determine the PICStyle based on the target selected.
if (!isPositionIndependent())
@@ -359,4 +372,3 @@ const RegisterBankInfo *X86Subtarget::getRegBankInfo() const {
bool X86Subtarget::enableEarlyIfConversion() const {
return hasCMov() && X86EarlyIfConv;
}
-
diff --git a/lib/Target/X86/X86Subtarget.h b/lib/Target/X86/X86Subtarget.h
index d80dc4a9b5e8..d0d88d326949 100644
--- a/lib/Target/X86/X86Subtarget.h
+++ b/lib/Target/X86/X86Subtarget.h
@@ -18,33 +18,36 @@
#include "X86ISelLowering.h"
#include "X86InstrInfo.h"
#include "X86SelectionDAGInfo.h"
+#include "llvm/ADT/StringRef.h"
#include "llvm/ADT/Triple.h"
#include "llvm/CodeGen/GlobalISel/GISelAccessor.h"
#include "llvm/IR/CallingConv.h"
+#include "llvm/MC/MCInstrItineraries.h"
+#include "llvm/Target/TargetMachine.h"
#include "llvm/Target/TargetSubtargetInfo.h"
-#include <string>
+#include <memory>
#define GET_SUBTARGETINFO_HEADER
#include "X86GenSubtargetInfo.inc"
namespace llvm {
+
class GlobalValue;
-class StringRef;
-class TargetMachine;
/// The X86 backend supports a number of different styles of PIC.
///
namespace PICStyles {
+
enum Style {
StubPIC, // Used on i386-darwin in pic mode.
GOT, // Used on 32 bit elf on when in pic mode.
RIPRel, // Used on X86-64 when in pic mode.
None // Set when not in pic mode.
};
-}
-class X86Subtarget final : public X86GenSubtargetInfo {
+} // end namespace PICStyles
+class X86Subtarget final : public X86GenSubtargetInfo {
protected:
enum X86SSEEnum {
NoSSE, SSE1, SSE2, SSE3, SSSE3, SSE41, SSE42, AVX, AVX2, AVX512F
@@ -96,10 +99,13 @@ protected:
/// Target has XSAVE instructions
bool HasXSAVE;
+
/// Target has XSAVEOPT instructions
bool HasXSAVEOPT;
+
/// Target has XSAVEC instructions
bool HasXSAVEC;
+
/// Target has XSAVES instructions
bool HasXSAVES;
@@ -148,9 +154,6 @@ protected:
/// Processor has RTM instructions.
bool HasRTM;
- /// Processor has HLE.
- bool HasHLE;
-
/// Processor has ADX instructions.
bool HasADX;
@@ -169,6 +172,9 @@ protected:
/// Processor has MONITORX/MWAITX instructions.
bool HasMWAITX;
+ /// Processor has Cache Line Zero instruction
+ bool HasCLZERO;
+
/// Processor has Prefetch with intent to Write instruction
bool HasPFPREFETCHWT1;
@@ -201,8 +207,8 @@ protected:
bool UseLeaForSP;
/// True if there is no performance penalty to writing only the lower parts
- /// of a YMM register without clearing the upper part.
- bool HasFastPartialYMMWrite;
+ /// of a YMM or ZMM register without clearing the upper part.
+ bool HasFastPartialYMMorZMMWrite;
/// True if hardware SQRTSS instruction is at least as fast (latency) as
/// RSQRTSS followed by a Newton-Raphson iteration.
@@ -223,6 +229,9 @@ protected:
/// True if LZCNT instruction is fast.
bool HasFastLZCNT;
+ /// True if SHLD based rotate is fast.
+ bool HasFastSHLDRotate;
+
/// True if the short functions should be padded to prevent
/// a stall when returning too early.
bool PadShortFunctions;
@@ -265,24 +274,12 @@ protected:
/// Processor supports MPX - Memory Protection Extensions
bool HasMPX;
- /// Processor supports Invalidate Process-Context Identifier
- bool HasInvPCId;
-
- /// Processor has VM Functions
- bool HasVMFUNC;
-
- /// Processor has Supervisor Mode Access Protection
- bool HasSMAP;
-
/// Processor has Software Guard Extensions
bool HasSGX;
/// Processor supports Flush Cache Line instruction
bool HasCLFLUSHOPT;
- /// Processor has Persistent Commit feature
- bool HasPCOMMIT;
-
/// Processor supports Cache Line Write Back instruction
bool HasCLWB;
@@ -307,8 +304,8 @@ protected:
/// This is used to avoid ifndefs spreading around while GISel is
/// an optional library.
std::unique_ptr<GISelAccessor> GISel;
-private:
+private:
/// Override the stack alignment.
unsigned StackAlignOverride;
@@ -341,13 +338,17 @@ public:
const X86TargetLowering *getTargetLowering() const override {
return &TLInfo;
}
+
const X86InstrInfo *getInstrInfo() const override { return &InstrInfo; }
+
const X86FrameLowering *getFrameLowering() const override {
return &FrameLowering;
}
+
const X86SelectionDAGInfo *getSelectionDAGInfo() const override {
return &TSInfo;
}
+
const X86RegisterInfo *getRegisterInfo() const override {
return &getInstrInfo()->getRegisterInfo();
}
@@ -370,12 +371,14 @@ public:
const InstructionSelector *getInstructionSelector() const override;
const LegalizerInfo *getLegalizerInfo() const override;
const RegisterBankInfo *getRegBankInfo() const override;
+
private:
/// Initialize the full set of dependencies so we can use an initializer
/// list for X86Subtarget.
X86Subtarget &initializeSubtargetDependencies(StringRef CPU, StringRef FS);
void initializeEnvironment();
void initSubtargetFeatures(StringRef CPU, StringRef FS);
+
public:
/// Is this x86_64? (disregarding specific ABI / programming model)
bool is64Bit() const {
@@ -432,9 +435,9 @@ public:
bool hasPCLMUL() const { return HasPCLMUL; }
// Prefer FMA4 to FMA - its better for commutation/memory folding and
// has equal or better performance on all supported targets.
- bool hasFMA() const { return HasFMA && !HasFMA4; }
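+ // All AVX-512 capable processors support FMA, so treat AVX-512 as implying
+ // the FMA feature even when the flag is not set explicitly.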
+ bool hasFMA() const { return (HasFMA || hasAVX512()) && !HasFMA4; }
bool hasFMA4() const { return HasFMA4; }
- bool hasAnyFMA() const { return hasFMA() || hasFMA4() || hasAVX512(); }
+ bool hasAnyFMA() const { return hasFMA() || hasFMA4(); }
bool hasXOP() const { return HasXOP; }
bool hasTBM() const { return HasTBM; }
bool hasMOVBE() const { return HasMOVBE; }
@@ -447,13 +450,13 @@ public:
bool hasVBMI() const { return HasVBMI; }
bool hasIFMA() const { return HasIFMA; }
bool hasRTM() const { return HasRTM; }
- bool hasHLE() const { return HasHLE; }
bool hasADX() const { return HasADX; }
bool hasSHA() const { return HasSHA; }
bool hasPRFCHW() const { return HasPRFCHW; }
bool hasRDSEED() const { return HasRDSEED; }
bool hasLAHFSAHF() const { return HasLAHFSAHF; }
bool hasMWAITX() const { return HasMWAITX; }
+ bool hasCLZERO() const { return HasCLZERO; }
bool isBTMemSlow() const { return IsBTMemSlow; }
bool isSHLDSlow() const { return IsSHLDSlow; }
bool isPMULLDSlow() const { return IsPMULLDSlow; }
@@ -462,10 +465,13 @@ public:
bool hasSSEUnalignedMem() const { return HasSSEUnalignedMem; }
bool hasCmpxchg16b() const { return HasCmpxchg16b; }
bool useLeaForSP() const { return UseLeaForSP; }
- bool hasFastPartialYMMWrite() const { return HasFastPartialYMMWrite; }
+ bool hasFastPartialYMMorZMMWrite() const {
+ return HasFastPartialYMMorZMMWrite;
+ }
bool hasFastScalarFSQRT() const { return HasFastScalarFSQRT; }
bool hasFastVectorFSQRT() const { return HasFastVectorFSQRT; }
bool hasFastLZCNT() const { return HasFastLZCNT; }
+ bool hasFastSHLDRotate() const { return HasFastSHLDRotate; }
bool hasSlowDivide32() const { return HasSlowDivide32; }
bool hasSlowDivide64() const { return HasSlowDivide64; }
bool padShortFunctions() const { return PadShortFunctions; }
@@ -481,8 +487,9 @@ public:
bool hasVLX() const { return HasVLX; }
bool hasPKU() const { return HasPKU; }
bool hasMPX() const { return HasMPX; }
+ bool hasCLFLUSHOPT() const { return HasCLFLUSHOPT; }
- virtual bool isXRaySupported() const override { return is64Bit(); }
+ bool isXRaySupported() const override { return is64Bit(); }
bool isAtom() const { return X86ProcFamily == IntelAtom; }
bool isSLM() const { return X86ProcFamily == IntelSLM; }
@@ -513,6 +520,7 @@ public:
bool isTargetNaCl32() const { return isTargetNaCl() && !is64Bit(); }
bool isTargetNaCl64() const { return isTargetNaCl() && is64Bit(); }
bool isTargetMCU() const { return TargetTriple.isOSIAMCU(); }
+ bool isTargetFuchsia() const { return TargetTriple.isOSFuchsia(); }
bool isTargetWindowsMSVC() const {
return TargetTriple.isWindowsMSVCEnvironment();
@@ -616,6 +624,9 @@ public:
/// Enable the MachineScheduler pass for all X86 subtargets.
bool enableMachineScheduler() const override { return true; }
+ // TODO: Update the regression tests and return true.
+ bool supportPrintSchedInfo() const override { return false; }
+
bool enableEarlyIfConversion() const override;
/// Return the instruction itineraries based on the subtarget selection.
@@ -628,6 +639,6 @@ public:
}
};
-} // End llvm namespace
+} // end namespace llvm
-#endif
+#endif // LLVM_LIB_TARGET_X86_X86SUBTARGET_H
diff --git a/lib/Target/X86/X86TargetMachine.cpp b/lib/Target/X86/X86TargetMachine.cpp
index aa5cfc64e9eb..03a1958121ab 100644
--- a/lib/Target/X86/X86TargetMachine.cpp
+++ b/lib/Target/X86/X86TargetMachine.cpp
@@ -11,22 +11,47 @@
//
//===----------------------------------------------------------------------===//
-#include "X86TargetMachine.h"
+#include "MCTargetDesc/X86MCTargetDesc.h"
#include "X86.h"
#include "X86CallLowering.h"
+#include "X86LegalizerInfo.h"
+#ifdef LLVM_BUILD_GLOBAL_ISEL
+#include "X86RegisterBankInfo.h"
+#endif
+#include "X86MacroFusion.h"
+#include "X86Subtarget.h"
+#include "X86TargetMachine.h"
#include "X86TargetObjectFile.h"
#include "X86TargetTransformInfo.h"
+#include "llvm/ADT/Optional.h"
+#include "llvm/ADT/SmallString.h"
+#include "llvm/ADT/STLExtras.h"
+#include "llvm/ADT/StringRef.h"
+#include "llvm/ADT/Triple.h"
+#include "llvm/Analysis/TargetTransformInfo.h"
+#include "llvm/CodeGen/ExecutionDepsFix.h"
+#include "llvm/CodeGen/GlobalISel/CallLowering.h"
#include "llvm/CodeGen/GlobalISel/GISelAccessor.h"
#include "llvm/CodeGen/GlobalISel/IRTranslator.h"
+#include "llvm/CodeGen/GlobalISel/InstructionSelect.h"
+#include "llvm/CodeGen/GlobalISel/Legalizer.h"
+#include "llvm/CodeGen/GlobalISel/RegBankSelect.h"
#include "llvm/CodeGen/MachineScheduler.h"
#include "llvm/CodeGen/Passes.h"
#include "llvm/CodeGen/TargetPassConfig.h"
+#include "llvm/IR/Attributes.h"
+#include "llvm/IR/DataLayout.h"
#include "llvm/IR/Function.h"
-#include "llvm/IR/LegacyPassManager.h"
+#include "llvm/Pass.h"
+#include "llvm/Support/CodeGen.h"
#include "llvm/Support/CommandLine.h"
-#include "llvm/Support/FormattedStream.h"
+#include "llvm/Support/ErrorHandling.h"
#include "llvm/Support/TargetRegistry.h"
+#include "llvm/Target/TargetLoweringObjectFile.h"
#include "llvm/Target/TargetOptions.h"
+#include <memory>
+#include <string>
+
using namespace llvm;
static cl::opt<bool> EnableMachineCombinerPass("x86-machine-combiner",
@@ -34,8 +59,11 @@ static cl::opt<bool> EnableMachineCombinerPass("x86-machine-combiner",
cl::init(true), cl::Hidden);
namespace llvm {
+
void initializeWinEHStatePassPass(PassRegistry &);
-}
+void initializeX86ExecutionDepsFixPass(PassRegistry &);
+
+} // end namespace llvm
extern "C" void LLVMInitializeX86Target() {
// Register the target.
@@ -47,27 +75,28 @@ extern "C" void LLVMInitializeX86Target() {
initializeWinEHStatePassPass(PR);
initializeFixupBWInstPassPass(PR);
initializeEvexToVexInstPassPass(PR);
+ initializeX86ExecutionDepsFixPass(PR);
}
static std::unique_ptr<TargetLoweringObjectFile> createTLOF(const Triple &TT) {
if (TT.isOSBinFormatMachO()) {
if (TT.getArch() == Triple::x86_64)
- return make_unique<X86_64MachoTargetObjectFile>();
- return make_unique<TargetLoweringObjectFileMachO>();
+ return llvm::make_unique<X86_64MachoTargetObjectFile>();
+ return llvm::make_unique<TargetLoweringObjectFileMachO>();
}
if (TT.isOSFreeBSD())
- return make_unique<X86FreeBSDTargetObjectFile>();
+ return llvm::make_unique<X86FreeBSDTargetObjectFile>();
if (TT.isOSLinux() || TT.isOSNaCl())
- return make_unique<X86LinuxNaClTargetObjectFile>();
+ return llvm::make_unique<X86LinuxNaClTargetObjectFile>();
if (TT.isOSFuchsia())
- return make_unique<X86FuchsiaTargetObjectFile>();
+ return llvm::make_unique<X86FuchsiaTargetObjectFile>();
if (TT.isOSBinFormatELF())
- return make_unique<X86ELFTargetObjectFile>();
+ return llvm::make_unique<X86ELFTargetObjectFile>();
if (TT.isKnownWindowsMSVCEnvironment() || TT.isWindowsCoreCLREnvironment())
- return make_unique<X86WindowsTargetObjectFile>();
+ return llvm::make_unique<X86WindowsTargetObjectFile>();
if (TT.isOSBinFormatCOFF())
- return make_unique<TargetLoweringObjectFileCOFF>();
+ return llvm::make_unique<TargetLoweringObjectFileCOFF>();
llvm_unreachable("unknown subtarget type");
}
@@ -177,31 +206,37 @@ X86TargetMachine::X86TargetMachine(const Target &T, const Triple &TT,
initAsmInfo();
}
-X86TargetMachine::~X86TargetMachine() {}
+X86TargetMachine::~X86TargetMachine() = default;
#ifdef LLVM_BUILD_GLOBAL_ISEL
namespace {
+
struct X86GISelActualAccessor : public GISelAccessor {
- std::unique_ptr<CallLowering> CL;
- X86GISelActualAccessor(CallLowering* CL): CL(CL) {}
+ std::unique_ptr<CallLowering> CallLoweringInfo;
+ std::unique_ptr<LegalizerInfo> Legalizer;
+ std::unique_ptr<RegisterBankInfo> RegBankInfo;
+ std::unique_ptr<InstructionSelector> InstSelector;
+
const CallLowering *getCallLowering() const override {
- return CL.get();
+ return CallLoweringInfo.get();
}
+
const InstructionSelector *getInstructionSelector() const override {
- //TODO: Implement
- return nullptr;
+ return InstSelector.get();
}
+
const LegalizerInfo *getLegalizerInfo() const override {
- //TODO: Implement
- return nullptr;
+ return Legalizer.get();
}
+
const RegisterBankInfo *getRegBankInfo() const override {
- //TODO: Implement
- return nullptr;
+ return RegBankInfo.get();
}
};
-} // End anonymous namespace.
+
+} // end anonymous namespace
#endif
+
const X86Subtarget *
X86TargetMachine::getSubtargetImpl(const Function &F) const {
Attribute CPUAttr = F.getFnAttribute("target-cpu");
@@ -244,8 +279,14 @@ X86TargetMachine::getSubtargetImpl(const Function &F) const {
#ifndef LLVM_BUILD_GLOBAL_ISEL
GISelAccessor *GISel = new GISelAccessor();
#else
- X86GISelActualAccessor *GISel = new X86GISelActualAccessor(
- new X86CallLowering(*I->getTargetLowering()));
+ X86GISelActualAccessor *GISel = new X86GISelActualAccessor();
+
+ GISel->CallLoweringInfo.reset(new X86CallLowering(*I->getTargetLowering()));
+ GISel->Legalizer.reset(new X86LegalizerInfo(*I, *this));
+
+ auto *RBI = new X86RegisterBankInfo(*I->getRegisterInfo());
+ GISel->RegBankInfo.reset(RBI);
+ GISel->InstSelector.reset(createX86InstructionSelector(*I, *RBI));
#endif
I->setGISelAccessor(*GISel);
}
@@ -270,12 +311,12 @@ TargetIRAnalysis X86TargetMachine::getTargetIRAnalysis() {
});
}
-
//===----------------------------------------------------------------------===//
// Pass Pipeline Configuration
//===----------------------------------------------------------------------===//
namespace {
+
/// X86 Code Generator Pass Configuration Options.
class X86PassConfig : public TargetPassConfig {
public:
@@ -289,7 +330,7 @@ public:
ScheduleDAGInstrs *
createMachineScheduler(MachineSchedContext *C) const override {
ScheduleDAGMILive *DAG = createGenericSchedLive(C);
- DAG->addMutation(createMacroFusionDAGMutation(DAG->TII));
+ DAG->addMutation(createX86MacroFusionDAGMutation());
return DAG;
}
@@ -301,14 +342,28 @@ public:
bool addRegBankSelect() override;
bool addGlobalInstructionSelect() override;
#endif
-bool addILPOpts() override;
+ bool addILPOpts() override;
bool addPreISel() override;
void addPreRegAlloc() override;
void addPostRegAlloc() override;
void addPreEmitPass() override;
void addPreSched2() override;
};
-} // namespace
+
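+/// Runs the generic execution-dependency fix over the X86 vector register
+/// file (VR128X, which covers XMM0-XMM31).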
+class X86ExecutionDepsFix : public ExecutionDepsFix {
+public:
+ static char ID;
+ X86ExecutionDepsFix() : ExecutionDepsFix(ID, X86::VR128XRegClass) {}
+ StringRef getPassName() const override {
+ return "X86 Execution Dependency Fix";
+ }
+};
+char X86ExecutionDepsFix::ID;
+
+} // end anonymous namespace
+
+INITIALIZE_PASS(X86ExecutionDepsFix, "x86-execution-deps-fix",
+ "X86 Execution Dependency Fix", false, false)
TargetPassConfig *X86TargetMachine::createPassConfig(PassManagerBase &PM) {
return new X86PassConfig(this, PM);
@@ -343,17 +398,17 @@ bool X86PassConfig::addIRTranslator() {
}
bool X86PassConfig::addLegalizeMachineIR() {
- //TODO: Implement
+ addPass(new Legalizer());
return false;
}
bool X86PassConfig::addRegBankSelect() {
- //TODO: Implement
+ addPass(new RegBankSelect());
return false;
}
bool X86PassConfig::addGlobalInstructionSelect() {
- //TODO: Implement
+ addPass(new InstructionSelect());
return false;
}
#endif
@@ -391,7 +446,7 @@ void X86PassConfig::addPreSched2() { addPass(createX86ExpandPseudoPass()); }
void X86PassConfig::addPreEmitPass() {
if (getOptLevel() != CodeGenOpt::None)
- addPass(createExecutionDependencyFixPass(&X86::VR128XRegClass));
+ addPass(new X86ExecutionDepsFix());
if (UseVZeroUpper)
addPass(createX86IssueVZeroUpperPass());
diff --git a/lib/Target/X86/X86TargetMachine.h b/lib/Target/X86/X86TargetMachine.h
index d756d07926dd..cf933f52604e 100644
--- a/lib/Target/X86/X86TargetMachine.h
+++ b/lib/Target/X86/X86TargetMachine.h
@@ -13,14 +13,20 @@
#ifndef LLVM_LIB_TARGET_X86_X86TARGETMACHINE_H
#define LLVM_LIB_TARGET_X86_X86TARGETMACHINE_H
-#include "X86InstrInfo.h"
+
#include "X86Subtarget.h"
-#include "llvm/IR/DataLayout.h"
+#include "llvm/ADT/Optional.h"
+#include "llvm/ADT/StringMap.h"
+#include "llvm/Analysis/TargetTransformInfo.h"
+#include "llvm/Support/CodeGen.h"
#include "llvm/Target/TargetMachine.h"
+#include <memory>
namespace llvm {
class StringRef;
+class X86Subtarget;
+class X86RegisterBankInfo;
class X86TargetMachine final : public LLVMTargetMachine {
std::unique_ptr<TargetLoweringObjectFile> TLOF;
@@ -32,17 +38,19 @@ public:
Optional<Reloc::Model> RM, CodeModel::Model CM,
CodeGenOpt::Level OL);
~X86TargetMachine() override;
+
const X86Subtarget *getSubtargetImpl(const Function &F) const override;
TargetIRAnalysis getTargetIRAnalysis() override;
// Set up the pass pipeline.
TargetPassConfig *createPassConfig(PassManagerBase &PM) override;
+
TargetLoweringObjectFile *getObjFileLowering() const override {
return TLOF.get();
}
};
-} // End llvm namespace
+} // end namespace llvm
-#endif
+#endif // LLVM_LIB_TARGET_X86_X86TARGETMACHINE_H
diff --git a/lib/Target/X86/X86TargetTransformInfo.cpp b/lib/Target/X86/X86TargetTransformInfo.cpp
index 5715d826862e..b742fb472372 100644
--- a/lib/Target/X86/X86TargetTransformInfo.cpp
+++ b/lib/Target/X86/X86TargetTransformInfo.cpp
@@ -78,7 +78,7 @@ unsigned X86TTIImpl::getNumberOfRegisters(bool Vector) {
return 8;
}
-unsigned X86TTIImpl::getRegisterBitWidth(bool Vector) {
+unsigned X86TTIImpl::getRegisterBitWidth(bool Vector) const {
if (Vector) {
if (ST->hasAVX512())
return 512;
@@ -95,6 +95,10 @@ unsigned X86TTIImpl::getRegisterBitWidth(bool Vector) {
return 32;
}
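+// Vector loads and stores can use the widest legal vector register, so simply
+// forward the vector register bit width.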
+unsigned X86TTIImpl::getLoadStoreVecRegBitWidth(unsigned) const {
+ return getRegisterBitWidth(true);
+}
+
unsigned X86TTIImpl::getMaxInterleaveFactor(unsigned VF) {
// If the loop will not be vectorized, don't interleave the loop.
// Let regular unroll to unroll the loop, which saves the overflow
@@ -114,7 +118,7 @@ unsigned X86TTIImpl::getMaxInterleaveFactor(unsigned VF) {
}
int X86TTIImpl::getArithmeticInstrCost(
- unsigned Opcode, Type *Ty,
+ unsigned Opcode, Type *Ty,
TTI::OperandValueKind Op1Info, TTI::OperandValueKind Op2Info,
TTI::OperandValueProperties Opd1PropInfo,
TTI::OperandValueProperties Opd2PropInfo,
@@ -207,6 +211,10 @@ int X86TTIImpl::getArithmeticInstrCost(
}
static const CostTblEntry AVX512UniformConstCostTable[] = {
+ { ISD::SRA, MVT::v2i64, 1 }, // vpsraq
+ { ISD::SRA, MVT::v4i64, 1 }, // vpsraq
+ { ISD::SRA, MVT::v8i64, 1 }, // vpsraq
+
{ ISD::SDIV, MVT::v16i32, 15 }, // vpmuldq sequence
{ ISD::UDIV, MVT::v16i32, 15 }, // vpmuludq sequence
};
@@ -319,6 +327,14 @@ int X86TTIImpl::getArithmeticInstrCost(
return LT.first * Entry->Cost;
static const CostTblEntry AVX512BWCostTable[] = {
+ { ISD::SHL, MVT::v8i16, 1 }, // vpsllvw
+ { ISD::SRL, MVT::v8i16, 1 }, // vpsrlvw
+ { ISD::SRA, MVT::v8i16, 1 }, // vpsravw
+
+ { ISD::SHL, MVT::v16i16, 1 }, // vpsllvw
+ { ISD::SRL, MVT::v16i16, 1 }, // vpsrlvw
+ { ISD::SRA, MVT::v16i16, 1 }, // vpsravw
+
{ ISD::SHL, MVT::v32i16, 1 }, // vpsllvw
{ ISD::SRL, MVT::v32i16, 1 }, // vpsrlvw
{ ISD::SRA, MVT::v32i16, 1 }, // vpsravw
@@ -347,8 +363,12 @@ int X86TTIImpl::getArithmeticInstrCost(
{ ISD::SHL, MVT::v16i32, 1 },
{ ISD::SRL, MVT::v16i32, 1 },
{ ISD::SRA, MVT::v16i32, 1 },
+
{ ISD::SHL, MVT::v8i64, 1 },
{ ISD::SRL, MVT::v8i64, 1 },
+
+ { ISD::SRA, MVT::v2i64, 1 },
+ { ISD::SRA, MVT::v4i64, 1 },
{ ISD::SRA, MVT::v8i64, 1 },
{ ISD::MUL, MVT::v32i8, 13 }, // extend/pmullw/trunc sequence.
@@ -595,7 +615,6 @@ int X86TTIImpl::getArithmeticInstrCost(
{ ISD::SHL, MVT::v16i8, 26 }, // cmpgtb sequence.
{ ISD::SHL, MVT::v8i16, 32 }, // cmpgtb sequence.
{ ISD::SHL, MVT::v4i32, 2*5 }, // We optimized this using mul.
- { ISD::SHL, MVT::v8i32, 2*2*5 }, // We optimized this using mul.
{ ISD::SHL, MVT::v2i64, 4 }, // splat+shuffle sequence.
{ ISD::SHL, MVT::v4i64, 2*4 }, // splat+shuffle sequence.
@@ -804,7 +823,14 @@ int X86TTIImpl::getShuffleCost(TTI::ShuffleKind Kind, Type *Tp, int Index,
{ TTI::SK_Reverse, MVT::v32i8, 2 }, // vperm2i128 + pshufb
{ TTI::SK_Alternate, MVT::v16i16, 1 }, // vpblendw
- { TTI::SK_Alternate, MVT::v32i8, 1 } // vpblendvb
+ { TTI::SK_Alternate, MVT::v32i8, 1 }, // vpblendvb
+
+ { TTI::SK_PermuteSingleSrc, MVT::v4i64, 1 }, // vpermq
+ { TTI::SK_PermuteSingleSrc, MVT::v8i32, 1 }, // vpermd
+ { TTI::SK_PermuteSingleSrc, MVT::v16i16, 4 }, // vperm2i128 + 2 * vpshufb
+ // + vpblendvb
+ { TTI::SK_PermuteSingleSrc, MVT::v32i8, 4 } // vperm2i128 + 2 * vpshufb
+ // + vpblendvb
};
if (ST->hasAVX2())
@@ -861,7 +887,10 @@ int X86TTIImpl::getShuffleCost(TTI::ShuffleKind Kind, Type *Tp, int Index,
{ TTI::SK_Reverse, MVT::v16i8, 1 }, // pshufb
{ TTI::SK_Alternate, MVT::v8i16, 3 }, // pshufb + pshufb + por
- { TTI::SK_Alternate, MVT::v16i8, 3 } // pshufb + pshufb + por
+ { TTI::SK_Alternate, MVT::v16i8, 3 }, // pshufb + pshufb + por
+
+ { TTI::SK_PermuteSingleSrc, MVT::v8i16, 1 }, // pshufb
+ { TTI::SK_PermuteSingleSrc, MVT::v16i8, 1 } // pshufb
};
if (ST->hasSSSE3())
@@ -886,7 +915,10 @@ int X86TTIImpl::getShuffleCost(TTI::ShuffleKind Kind, Type *Tp, int Index,
{ TTI::SK_Alternate, MVT::v2f64, 1 }, // movsd
{ TTI::SK_Alternate, MVT::v4i32, 2 }, // 2*shufps
{ TTI::SK_Alternate, MVT::v8i16, 3 }, // pand + pandn + por
- { TTI::SK_Alternate, MVT::v16i8, 3 } // pand + pandn + por
+ { TTI::SK_Alternate, MVT::v16i8, 3 }, // pand + pandn + por
+
+ { TTI::SK_PermuteSingleSrc, MVT::v2i64, 1 }, // pshufd
+ { TTI::SK_PermuteSingleSrc, MVT::v4i32, 1 } // pshufd
};
if (ST->hasSSE2())
@@ -906,7 +938,8 @@ int X86TTIImpl::getShuffleCost(TTI::ShuffleKind Kind, Type *Tp, int Index,
return BaseT::getShuffleCost(Kind, Tp, Index, SubTp);
}
-int X86TTIImpl::getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src) {
+int X86TTIImpl::getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src,
+ const Instruction *I) {
int ISD = TLI->InstructionOpcodeToISD(Opcode);
assert(ISD && "Invalid opcode");
@@ -1272,7 +1305,8 @@ int X86TTIImpl::getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src) {
return BaseT::getCastInstrCost(Opcode, Dst, Src);
}
-int X86TTIImpl::getCmpSelInstrCost(unsigned Opcode, Type *ValTy, Type *CondTy) {
+int X86TTIImpl::getCmpSelInstrCost(unsigned Opcode, Type *ValTy, Type *CondTy,
+ const Instruction *I) {
// Legalize the type.
std::pair<int, MVT> LT = TLI->getTypeLegalizationCost(DL, ValTy);
@@ -1338,11 +1372,12 @@ int X86TTIImpl::getCmpSelInstrCost(unsigned Opcode, Type *ValTy, Type *CondTy) {
if (const auto *Entry = CostTableLookup(SSE2CostTbl, ISD, MTy))
return LT.first * Entry->Cost;
- return BaseT::getCmpSelInstrCost(Opcode, ValTy, CondTy);
+ return BaseT::getCmpSelInstrCost(Opcode, ValTy, CondTy, I);
}
int X86TTIImpl::getIntrinsicInstrCost(Intrinsic::ID IID, Type *RetTy,
- ArrayRef<Type *> Tys, FastMathFlags FMF) {
+ ArrayRef<Type *> Tys, FastMathFlags FMF,
+ unsigned ScalarizationCostPassed) {
// Costs should match the codegen from:
// BITREVERSE: llvm\test\CodeGen\X86\vector-bitreverse.ll
// BSWAP: llvm\test\CodeGen\X86\bswap-vector.ll
@@ -1418,8 +1453,8 @@ int X86TTIImpl::getIntrinsicInstrCost(Intrinsic::ID IID, Type *RetTy,
{ ISD::FSQRT, MVT::v4f64, 43 }, // SNB from http://www.agner.org/
};
static const CostTblEntry SSE42CostTbl[] = {
- { ISD::FSQRT, MVT::f32, 18 }, // Nehalem from http://www.agner.org/
- { ISD::FSQRT, MVT::v4f32, 18 }, // Nehalem from http://www.agner.org/
+ { ISD::FSQRT, MVT::f32, 18 }, // Nehalem from http://www.agner.org/
+ { ISD::FSQRT, MVT::v4f32, 18 }, // Nehalem from http://www.agner.org/
};
static const CostTblEntry SSSE3CostTbl[] = {
{ ISD::BITREVERSE, MVT::v2i64, 5 },
@@ -1443,6 +1478,10 @@ int X86TTIImpl::getIntrinsicInstrCost(Intrinsic::ID IID, Type *RetTy,
{ ISD::CTTZ, MVT::v16i8, 9 }
};
static const CostTblEntry SSE2CostTbl[] = {
+ { ISD::BITREVERSE, MVT::v2i64, 29 },
+ { ISD::BITREVERSE, MVT::v4i32, 27 },
+ { ISD::BITREVERSE, MVT::v8i16, 27 },
+ { ISD::BITREVERSE, MVT::v16i8, 20 },
{ ISD::BSWAP, MVT::v2i64, 7 },
{ ISD::BSWAP, MVT::v4i32, 7 },
{ ISD::BSWAP, MVT::v8i16, 7 },
@@ -1462,8 +1501,16 @@ int X86TTIImpl::getIntrinsicInstrCost(Intrinsic::ID IID, Type *RetTy,
{ ISD::FSQRT, MVT::v2f64, 32 }, // Nehalem from http://www.agner.org/
};
static const CostTblEntry SSE1CostTbl[] = {
- { ISD::FSQRT, MVT::f32, 28 }, // Pentium III from http://www.agner.org/
- { ISD::FSQRT, MVT::v4f32, 56 }, // Pentium III from http://www.agner.org/
+ { ISD::FSQRT, MVT::f32, 28 }, // Pentium III from http://www.agner.org/
+ { ISD::FSQRT, MVT::v4f32, 56 }, // Pentium III from http://www.agner.org/
+ };
+ static const CostTblEntry X64CostTbl[] = { // 64-bit targets
+ { ISD::BITREVERSE, MVT::i64, 14 }
+ };
+ static const CostTblEntry X86CostTbl[] = { // 32 or 64-bit targets
+ { ISD::BITREVERSE, MVT::i32, 14 },
+ { ISD::BITREVERSE, MVT::i16, 14 },
+ { ISD::BITREVERSE, MVT::i8, 11 }
};
unsigned ISD = ISD::DELETED_NODE;
@@ -1523,12 +1570,19 @@ int X86TTIImpl::getIntrinsicInstrCost(Intrinsic::ID IID, Type *RetTy,
if (const auto *Entry = CostTableLookup(SSE1CostTbl, ISD, MTy))
return LT.first * Entry->Cost;
- return BaseT::getIntrinsicInstrCost(IID, RetTy, Tys, FMF);
+ if (ST->is64Bit())
+ if (const auto *Entry = CostTableLookup(X64CostTbl, ISD, MTy))
+ return LT.first * Entry->Cost;
+
+ if (const auto *Entry = CostTableLookup(X86CostTbl, ISD, MTy))
+ return LT.first * Entry->Cost;
+
+ return BaseT::getIntrinsicInstrCost(IID, RetTy, Tys, FMF,
+ ScalarizationCostPassed);
}
int X86TTIImpl::getIntrinsicInstrCost(Intrinsic::ID IID, Type *RetTy,
- ArrayRef<Value *> Args, FastMathFlags FMF) {
- return BaseT::getIntrinsicInstrCost(IID, RetTy, Args, FMF);
+ ArrayRef<Value *> Args, FastMathFlags FMF, unsigned VF) {
+ return BaseT::getIntrinsicInstrCost(IID, RetTy, Args, FMF, VF);
}
int X86TTIImpl::getVectorInstrCost(unsigned Opcode, Type *Val, unsigned Index) {
@@ -1562,22 +1616,8 @@ int X86TTIImpl::getVectorInstrCost(unsigned Opcode, Type *Val, unsigned Index) {
return BaseT::getVectorInstrCost(Opcode, Val, Index) + RegisterFileMoveCost;
}
-int X86TTIImpl::getScalarizationOverhead(Type *Ty, bool Insert, bool Extract) {
- assert (Ty->isVectorTy() && "Can only scalarize vectors");
- int Cost = 0;
-
- for (int i = 0, e = Ty->getVectorNumElements(); i < e; ++i) {
- if (Insert)
- Cost += getVectorInstrCost(Instruction::InsertElement, Ty, i);
- if (Extract)
- Cost += getVectorInstrCost(Instruction::ExtractElement, Ty, i);
- }
-
- return Cost;
-}
-
int X86TTIImpl::getMemoryOpCost(unsigned Opcode, Type *Src, unsigned Alignment,
- unsigned AddressSpace) {
+ unsigned AddressSpace, const Instruction *I) {
// Handle non-power-of-two vectors such as <3 x float>
if (VectorType *VTy = dyn_cast<VectorType>(Src)) {
unsigned NumElem = VTy->getVectorNumElements();
@@ -2132,7 +2172,7 @@ bool X86TTIImpl::enableInterleavedAccessVectorization() {
// TODO: We expect this to be beneficial regardless of arch,
// but there are currently some unexplained performance artifacts on Atom.
// As a temporary solution, disable on Atom.
- return !(ST->isAtom() || ST->isSLM());
+ return !(ST->isAtom());
}
// Get estimation for interleaved load/store operations and strided load.
diff --git a/lib/Target/X86/X86TargetTransformInfo.h b/lib/Target/X86/X86TargetTransformInfo.h
index ecaaf951cff7..9bef9e80c395 100644
--- a/lib/Target/X86/X86TargetTransformInfo.h
+++ b/lib/Target/X86/X86TargetTransformInfo.h
@@ -33,8 +33,6 @@ class X86TTIImpl : public BasicTTIImplBase<X86TTIImpl> {
const X86Subtarget *ST;
const X86TargetLowering *TLI;
- int getScalarizationOverhead(Type *Ty, bool Insert, bool Extract);
-
const X86Subtarget *getST() const { return ST; }
const X86TargetLowering *getTLI() const { return TLI; }
@@ -53,7 +51,8 @@ public:
/// @{
unsigned getNumberOfRegisters(bool Vector);
- unsigned getRegisterBitWidth(bool Vector);
+ unsigned getRegisterBitWidth(bool Vector) const;
+ unsigned getLoadStoreVecRegBitWidth(unsigned AS) const;
unsigned getMaxInterleaveFactor(unsigned VF);
int getArithmeticInstrCost(
unsigned Opcode, Type *Ty,
@@ -63,11 +62,13 @@ public:
TTI::OperandValueProperties Opd2PropInfo = TTI::OP_None,
ArrayRef<const Value *> Args = ArrayRef<const Value *>());
int getShuffleCost(TTI::ShuffleKind Kind, Type *Tp, int Index, Type *SubTp);
- int getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src);
- int getCmpSelInstrCost(unsigned Opcode, Type *ValTy, Type *CondTy);
+ int getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src,
+ const Instruction *I = nullptr);
+ int getCmpSelInstrCost(unsigned Opcode, Type *ValTy, Type *CondTy,
+ const Instruction *I = nullptr);
int getVectorInstrCost(unsigned Opcode, Type *Val, unsigned Index);
int getMemoryOpCost(unsigned Opcode, Type *Src, unsigned Alignment,
- unsigned AddressSpace);
+ unsigned AddressSpace, const Instruction *I = nullptr);
int getMaskedMemoryOpCost(unsigned Opcode, Type *Src, unsigned Alignment,
unsigned AddressSpace);
int getGatherScatterOpCost(unsigned Opcode, Type *DataTy, Value *Ptr,
@@ -76,9 +77,11 @@ public:
const SCEV *Ptr);
int getIntrinsicInstrCost(Intrinsic::ID IID, Type *RetTy,
- ArrayRef<Type *> Tys, FastMathFlags FMF);
+ ArrayRef<Type *> Tys, FastMathFlags FMF,
+ unsigned ScalarizationCostPassed = UINT_MAX);
int getIntrinsicInstrCost(Intrinsic::ID IID, Type *RetTy,
- ArrayRef<Value *> Args, FastMathFlags FMF);
+ ArrayRef<Value *> Args, FastMathFlags FMF,
+ unsigned VF = 1);
int getReductionCost(unsigned Opcode, Type *Ty, bool IsPairwiseForm);
diff --git a/lib/Target/X86/X86VZeroUpper.cpp b/lib/Target/X86/X86VZeroUpper.cpp
index 9766b84be652..d17dfac6a997 100644
--- a/lib/Target/X86/X86VZeroUpper.cpp
+++ b/lib/Target/X86/X86VZeroUpper.cpp
@@ -56,11 +56,11 @@ namespace {
// Core algorithm state:
// BlockState - Each block is either:
- // - PASS_THROUGH: There are neither YMM dirtying instructions nor
+ // - PASS_THROUGH: There are neither YMM/ZMM dirtying instructions nor
// vzeroupper instructions in this block.
// - EXITS_CLEAN: There is (or will be) a vzeroupper instruction in this
- // block that will ensure that YMM is clean on exit.
- // - EXITS_DIRTY: An instruction in the block dirties YMM and no
+ // block that will ensure that YMM/ZMM is clean on exit.
+ // - EXITS_DIRTY: An instruction in the block dirties YMM/ZMM and no
// subsequent vzeroupper in the block clears it.
//
// AddedToDirtySuccessors - This flag is raised when a block is added to the
@@ -97,6 +97,7 @@ FunctionPass *llvm::createX86IssueVZeroUpperPass() {
return new VZeroUpperInserter();
}
+#ifndef NDEBUG
const char* VZeroUpperInserter::getBlockExitStateName(BlockExitState ST) {
switch (ST) {
case PASS_THROUGH: return "Pass-through";
@@ -105,52 +106,56 @@ const char* VZeroUpperInserter::getBlockExitStateName(BlockExitState ST) {
}
llvm_unreachable("Invalid block exit state.");
}
+#endif
-static bool isYmmReg(unsigned Reg) {
- return (Reg >= X86::YMM0 && Reg <= X86::YMM15);
+/// VZEROUPPER cleans state that is related to Y/ZMM0-15 only.
+/// Thus, there is no need to check for Y/ZMM16 and above.
+static bool isYmmOrZmmReg(unsigned Reg) {
+ return (Reg >= X86::YMM0 && Reg <= X86::YMM15) ||
+ (Reg >= X86::ZMM0 && Reg <= X86::ZMM15);
}
-static bool checkFnHasLiveInYmm(MachineRegisterInfo &MRI) {
+static bool checkFnHasLiveInYmmOrZmm(MachineRegisterInfo &MRI) {
for (MachineRegisterInfo::livein_iterator I = MRI.livein_begin(),
E = MRI.livein_end(); I != E; ++I)
- if (isYmmReg(I->first))
+ if (isYmmOrZmmReg(I->first))
return true;
return false;
}
-static bool clobbersAllYmmRegs(const MachineOperand &MO) {
+static bool clobbersAllYmmAndZmmRegs(const MachineOperand &MO) {
for (unsigned reg = X86::YMM0; reg <= X86::YMM15; ++reg) {
if (!MO.clobbersPhysReg(reg))
return false;
}
+ for (unsigned reg = X86::ZMM0; reg <= X86::ZMM15; ++reg) {
+ if (!MO.clobbersPhysReg(reg))
+ return false;
+ }
return true;
}
-static bool hasYmmReg(MachineInstr &MI) {
+static bool hasYmmOrZmmReg(MachineInstr &MI) {
for (const MachineOperand &MO : MI.operands()) {
- if (MI.isCall() && MO.isRegMask() && !clobbersAllYmmRegs(MO))
+ if (MI.isCall() && MO.isRegMask() && !clobbersAllYmmAndZmmRegs(MO))
return true;
if (!MO.isReg())
continue;
if (MO.isDebug())
continue;
- if (isYmmReg(MO.getReg()))
+ if (isYmmOrZmmReg(MO.getReg()))
return true;
}
return false;
}
-/// Check if any YMM register will be clobbered by this instruction.
-static bool callClobbersAnyYmmReg(MachineInstr &MI) {
+/// Check if given call instruction has a RegMask operand.
+static bool callHasRegMask(MachineInstr &MI) {
assert(MI.isCall() && "Can only be called on call instructions.");
for (const MachineOperand &MO : MI.operands()) {
- if (!MO.isRegMask())
- continue;
- for (unsigned reg = X86::YMM0; reg <= X86::YMM15; ++reg) {
- if (MO.clobbersPhysReg(reg))
- return true;
- }
+ if (MO.isRegMask())
+ return true;
}
return false;
}
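// Editor's note: a self-contained sketch (hypothetical, heavily simplified
// from the pass above) of the per-block exit-state computation described in
// the BlockState comment: a block starts as PASS_THROUGH, any YMM/ZMM
// dirtying instruction drives it to EXITS_DIRTY, and a vzeroupper (existing
// or inserted) brings it back to EXITS_CLEAN.
namespace vzu_sketch {
enum BlockExitState { PASS_THROUGH, EXITS_CLEAN, EXITS_DIRTY };
enum InstKind { REGULAR, DIRTIES_YMM_OR_ZMM, VZERO };

static BlockExitState blockExitState(const InstKind *Insts, unsigned N) {
  BlockExitState State = PASS_THROUGH;
  for (unsigned I = 0; I != N; ++I) {
    if (Insts[I] == VZERO)
      State = EXITS_CLEAN;          // upper halves are clean from here on
    else if (Insts[I] == DIRTIES_YMM_OR_ZMM)
      State = EXITS_DIRTY;          // dirty until a later vzeroupper
  }
  return State;
}
} // end namespace vzu_sketch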
@@ -175,17 +180,20 @@ void VZeroUpperInserter::addDirtySuccessor(MachineBasicBlock &MBB) {
/// Loop over all of the instructions in the basic block, inserting vzeroupper
/// instructions before function calls.
void VZeroUpperInserter::processBasicBlock(MachineBasicBlock &MBB) {
-
// Start by assuming that the block is PASS_THROUGH which implies no unguarded
// calls.
BlockExitState CurState = PASS_THROUGH;
BlockStates[MBB.getNumber()].FirstUnguardedCall = MBB.end();
for (MachineInstr &MI : MBB) {
+ bool IsCall = MI.isCall();
+ bool IsReturn = MI.isReturn();
+ bool IsControlFlow = IsCall || IsReturn;
+
// No need for vzeroupper before iret in an interrupt handler function;
- // epilogue will restore YMM registers if needed.
- bool IsReturnFromX86INTR = IsX86INTR && MI.isReturn();
- bool IsControlFlow = MI.isCall() || MI.isReturn();
+ // the epilogue will restore YMM/ZMM registers if needed.
+ if (IsX86INTR && IsReturn)
+ continue;
// An existing VZERO* instruction resets the state.
if (MI.getOpcode() == X86::VZEROALL || MI.getOpcode() == X86::VZEROUPPER) {
@@ -194,30 +202,30 @@ void VZeroUpperInserter::processBasicBlock(MachineBasicBlock &MBB) {
}
// Shortcut: don't need to check regular instructions in dirty state.
- if ((!IsControlFlow || IsReturnFromX86INTR) && CurState == EXITS_DIRTY)
+ if (!IsControlFlow && CurState == EXITS_DIRTY)
continue;
- if (hasYmmReg(MI)) {
- // We found a ymm-using instruction; this could be an AVX instruction,
- // or it could be control flow.
+ if (hasYmmOrZmmReg(MI)) {
+ // We found a ymm/zmm-using instruction; this could be an AVX/AVX512
+ // instruction, or it could be control flow.
CurState = EXITS_DIRTY;
continue;
}
// Check for control-flow out of the current function (which might
// indirectly execute SSE instructions).
- if (!IsControlFlow || IsReturnFromX86INTR)
+ if (!IsControlFlow)
continue;
- // If the call won't clobber any YMM register, skip it as well. It usually
- // happens on helper function calls (such as '_chkstk', '_ftol2') where
- // standard calling convention is not used (RegMask is not used to mark
- // register clobbered and register usage (def/imp-def/use) is well-defined
- // and explicitly specified.
- if (MI.isCall() && !callClobbersAnyYmmReg(MI))
+ // If the call has no RegMask, skip it as well. This usually happens on
+ // helper function calls (such as '_chkstk', '_ftol2') where a standard
+ // calling convention is not used (RegMask is not used to mark clobbered
+ // registers, and register usage (def/imp-def/use) is well-defined and
+ // explicitly specified).
+ if (IsCall && !callHasRegMask(MI))
continue;
- // The VZEROUPPER instruction resets the upper 128 bits of all AVX
+ // The VZEROUPPER instruction resets the upper 128 bits of YMM0-YMM15
// registers. In addition, the processor changes back to Clean state, after
// which execution of SSE instructions or AVX instructions has no transition
// penalty. Add the VZEROUPPER instruction before any function call/return
@@ -226,7 +234,7 @@ void VZeroUpperInserter::processBasicBlock(MachineBasicBlock &MBB) {
// predecessor block.
if (CurState == EXITS_DIRTY) {
// After the inserted VZEROUPPER the state becomes clean again, but
- // other YMM may appear before other subsequent calls or even before
+ // other YMM/ZMM may appear before other subsequent calls or even before
// the end of the BB.
insertVZeroUpper(MI, MBB);
CurState = EXITS_CLEAN;
@@ -257,30 +265,32 @@ void VZeroUpperInserter::processBasicBlock(MachineBasicBlock &MBB) {
/// function calls.
bool VZeroUpperInserter::runOnMachineFunction(MachineFunction &MF) {
const X86Subtarget &ST = MF.getSubtarget<X86Subtarget>();
- if (!ST.hasAVX() || ST.hasAVX512() || ST.hasFastPartialYMMWrite())
+ if (!ST.hasAVX() || ST.hasFastPartialYMMorZMMWrite())
return false;
TII = ST.getInstrInfo();
MachineRegisterInfo &MRI = MF.getRegInfo();
EverMadeChange = false;
IsX86INTR = MF.getFunction()->getCallingConv() == CallingConv::X86_INTR;
- bool FnHasLiveInYmm = checkFnHasLiveInYmm(MRI);
-
- // Fast check: if the function doesn't use any ymm registers, we don't need
- // to insert any VZEROUPPER instructions. This is constant-time, so it is
- // cheap in the common case of no ymm use.
- bool YMMUsed = FnHasLiveInYmm;
- if (!YMMUsed) {
- const TargetRegisterClass *RC = &X86::VR256RegClass;
- for (TargetRegisterClass::iterator i = RC->begin(), e = RC->end(); i != e;
- i++) {
- if (!MRI.reg_nodbg_empty(*i)) {
- YMMUsed = true;
- break;
+ bool FnHasLiveInYmmOrZmm = checkFnHasLiveInYmmOrZmm(MRI);
+
+ // Fast check: if the function doesn't use any ymm/zmm registers, we don't
+ // need to insert any VZEROUPPER instructions. This is constant-time, so it
+ // is cheap in the common case of no ymm/zmm use.
+ bool YmmOrZmmUsed = FnHasLiveInYmmOrZmm;
+ const TargetRegisterClass *RCs[2] = {&X86::VR256RegClass, &X86::VR512RegClass};
+ for (auto *RC : RCs) {
+ if (!YmmOrZmmUsed) {
+ for (TargetRegisterClass::iterator i = RC->begin(), e = RC->end(); i != e;
+ i++) {
+ if (!MRI.reg_nodbg_empty(*i)) {
+ YmmOrZmmUsed = true;
+ break;
+ }
}
}
}
- if (!YMMUsed) {
+ if (!YmmOrZmmUsed) {
return false;
}
@@ -294,9 +304,9 @@ bool VZeroUpperInserter::runOnMachineFunction(MachineFunction &MF) {
for (MachineBasicBlock &MBB : MF)
processBasicBlock(MBB);
- // If any YMM regs are live-in to this function, add the entry block to the
- // DirtySuccessors list
- if (FnHasLiveInYmm)
+ // If any YMM/ZMM regs are live-in to this function, add the entry block to
+ // the DirtySuccessors list
+ if (FnHasLiveInYmmOrZmm)
addDirtySuccessor(MF.front());
// Re-visit all blocks that are successors of EXITS_DIRTY blocks. Add
diff --git a/lib/Target/XCore/InstPrinter/XCoreInstPrinter.cpp b/lib/Target/XCore/InstPrinter/XCoreInstPrinter.cpp
index 500c84d2a418..b03c1852281d 100644
--- a/lib/Target/XCore/InstPrinter/XCoreInstPrinter.cpp
+++ b/lib/Target/XCore/InstPrinter/XCoreInstPrinter.cpp
@@ -12,13 +12,15 @@
//===----------------------------------------------------------------------===//
#include "XCoreInstPrinter.h"
-#include "llvm/ADT/StringExtras.h"
+#include "llvm/ADT/StringRef.h"
#include "llvm/MC/MCExpr.h"
#include "llvm/MC/MCInst.h"
-#include "llvm/MC/MCInstrInfo.h"
#include "llvm/MC/MCSymbol.h"
+#include "llvm/Support/Casting.h"
#include "llvm/Support/ErrorHandling.h"
#include "llvm/Support/raw_ostream.h"
+#include <cassert>
+
using namespace llvm;
#define DEBUG_TYPE "asm-printer"
diff --git a/lib/Target/XCore/InstPrinter/XCoreInstPrinter.h b/lib/Target/XCore/InstPrinter/XCoreInstPrinter.h
index dc513f7b225b..8a7efe2e39c6 100644
--- a/lib/Target/XCore/InstPrinter/XCoreInstPrinter.h
+++ b/lib/Target/XCore/InstPrinter/XCoreInstPrinter.h
@@ -15,6 +15,8 @@
#ifndef LLVM_LIB_TARGET_XCORE_INSTPRINTER_XCOREINSTPRINTER_H
#define LLVM_LIB_TARGET_XCORE_INSTPRINTER_XCOREINSTPRINTER_H
+
+#include "llvm/ADT/StringRef.h"
#include "llvm/MC/MCInstPrinter.h"
namespace llvm {
@@ -32,12 +34,14 @@ public:
void printRegName(raw_ostream &OS, unsigned RegNo) const override;
void printInst(const MCInst *MI, raw_ostream &O, StringRef Annot,
const MCSubtargetInfo &STI) override;
+
private:
void printInlineJT(const MCInst *MI, int opNum, raw_ostream &O);
void printInlineJT32(const MCInst *MI, int opNum, raw_ostream &O);
void printOperand(const MCInst *MI, unsigned OpNo, raw_ostream &O);
void printMemOperand(const MCInst *MI, int opNum, raw_ostream &O);
};
+
} // end namespace llvm
-#endif
+#endif // LLVM_LIB_TARGET_XCORE_INSTPRINTER_XCOREINSTPRINTER_H
diff --git a/lib/Target/XCore/MCTargetDesc/XCoreMCTargetDesc.cpp b/lib/Target/XCore/MCTargetDesc/XCoreMCTargetDesc.cpp
index c5859b7786f7..5fc58d831319 100644
--- a/lib/Target/XCore/MCTargetDesc/XCoreMCTargetDesc.cpp
+++ b/lib/Target/XCore/MCTargetDesc/XCoreMCTargetDesc.cpp
@@ -11,15 +11,19 @@
//
//===----------------------------------------------------------------------===//
-#include "XCoreMCTargetDesc.h"
#include "InstPrinter/XCoreInstPrinter.h"
-#include "XCoreMCAsmInfo.h"
+#include "MCTargetDesc/XCoreMCAsmInfo.h"
+#include "MCTargetDesc/XCoreMCTargetDesc.h"
#include "XCoreTargetStreamer.h"
+#include "llvm/ADT/StringRef.h"
+#include "llvm/MC/MCDwarf.h"
#include "llvm/MC/MCInstrInfo.h"
#include "llvm/MC/MCRegisterInfo.h"
#include "llvm/MC/MCSubtargetInfo.h"
+#include "llvm/Support/CodeGen.h"
#include "llvm/Support/ErrorHandling.h"
#include "llvm/Support/FormattedStream.h"
+#include "llvm/Support/raw_ostream.h"
#include "llvm/Support/TargetRegistry.h"
using namespace llvm;
@@ -79,20 +83,25 @@ static MCInstPrinter *createXCoreMCInstPrinter(const Triple &T,
}
XCoreTargetStreamer::XCoreTargetStreamer(MCStreamer &S) : MCTargetStreamer(S) {}
-XCoreTargetStreamer::~XCoreTargetStreamer() {}
+
+XCoreTargetStreamer::~XCoreTargetStreamer() = default;
namespace {
class XCoreTargetAsmStreamer : public XCoreTargetStreamer {
formatted_raw_ostream &OS;
+
public:
XCoreTargetAsmStreamer(MCStreamer &S, formatted_raw_ostream &OS);
+
void emitCCTopData(StringRef Name) override;
void emitCCTopFunction(StringRef Name) override;
void emitCCBottomData(StringRef Name) override;
void emitCCBottomFunction(StringRef Name) override;
};
+} // end anonymous namespace
+
XCoreTargetAsmStreamer::XCoreTargetAsmStreamer(MCStreamer &S,
formatted_raw_ostream &OS)
: XCoreTargetStreamer(S), OS(OS) {}
@@ -112,7 +121,6 @@ void XCoreTargetAsmStreamer::emitCCBottomData(StringRef Name) {
void XCoreTargetAsmStreamer::emitCCBottomFunction(StringRef Name) {
OS << "\t.cc_bottom " << Name << ".function\n";
}
-}
static MCTargetStreamer *createTargetAsmStreamer(MCStreamer &S,
formatted_raw_ostream &OS,
diff --git a/lib/Target/XCore/MCTargetDesc/XCoreMCTargetDesc.h b/lib/Target/XCore/MCTargetDesc/XCoreMCTargetDesc.h
index ac0f3fefbae7..1dc384fadf69 100644
--- a/lib/Target/XCore/MCTargetDesc/XCoreMCTargetDesc.h
+++ b/lib/Target/XCore/MCTargetDesc/XCoreMCTargetDesc.h
@@ -14,13 +14,13 @@
#ifndef LLVM_LIB_TARGET_XCORE_MCTARGETDESC_XCOREMCTARGETDESC_H
#define LLVM_LIB_TARGET_XCORE_MCTARGETDESC_XCOREMCTARGETDESC_H
-#include "llvm/Support/DataTypes.h"
-
namespace llvm {
+
class Target;
+
Target &getTheXCoreTarget();
-} // End llvm namespace
+} // end namespace llvm
// Defines symbolic names for XCore registers. This defines a mapping from
// register name to register number.
@@ -36,4 +36,4 @@ Target &getTheXCoreTarget();
#define GET_SUBTARGETINFO_ENUM
#include "XCoreGenSubtargetInfo.inc"
-#endif
+#endif // LLVM_LIB_TARGET_XCORE_MCTARGETDESC_XCOREMCTARGETDESC_H
diff --git a/lib/Target/XCore/XCoreFrameLowering.cpp b/lib/Target/XCore/XCoreFrameLowering.cpp
index e0e2e0319964..a752357400b3 100644
--- a/lib/Target/XCore/XCoreFrameLowering.cpp
+++ b/lib/Target/XCore/XCoreFrameLowering.cpp
@@ -238,7 +238,7 @@ void XCoreFrameLowering::emitPrologue(MachineFunction &MF,
report_fatal_error("emitPrologue unsupported alignment: "
+ Twine(MFI.getMaxAlignment()));
- const AttributeSet &PAL = MF.getFunction()->getAttributes();
+ const AttributeList &PAL = MF.getFunction()->getAttributes();
if (PAL.hasAttrSomewhere(Attribute::Nest))
BuildMI(MBB, MBBI, dl, TII.get(XCore::LDWSP_ru6), XCore::R11).addImm(0);
// FIX: Needs addMemOperand() but can't use getFixedStack() or getStack().
diff --git a/lib/Target/XCore/XCoreISelLowering.cpp b/lib/Target/XCore/XCoreISelLowering.cpp
index 9244d594460f..45437815fa37 100644
--- a/lib/Target/XCore/XCoreISelLowering.cpp
+++ b/lib/Target/XCore/XCoreISelLowering.cpp
@@ -483,7 +483,7 @@ LowerLOAD(SDValue Op, SelectionDAG &DAG) const {
Args.push_back(Entry);
TargetLowering::CallLoweringInfo CLI(DAG);
- CLI.setDebugLoc(DL).setChain(Chain).setCallee(
+ CLI.setDebugLoc(DL).setChain(Chain).setLibCallee(
CallingConv::C, IntPtrTy,
DAG.getExternalSymbol("__misaligned_load",
getPointerTy(DAG.getDataLayout())),
@@ -1824,6 +1824,7 @@ SDValue XCoreTargetLowering::PerformDAGCombine(SDNode *N,
void XCoreTargetLowering::computeKnownBitsForTargetNode(const SDValue Op,
APInt &KnownZero,
APInt &KnownOne,
+ const APInt &DemandedElts,
const SelectionDAG &DAG,
unsigned Depth) const {
KnownZero = KnownOne = APInt(KnownZero.getBitWidth(), 0);
diff --git a/lib/Target/XCore/XCoreISelLowering.h b/lib/Target/XCore/XCoreISelLowering.h
index 41813bbb8156..188f4f1fa06b 100644
--- a/lib/Target/XCore/XCoreISelLowering.h
+++ b/lib/Target/XCore/XCoreISelLowering.h
@@ -202,6 +202,7 @@ namespace llvm {
void computeKnownBitsForTargetNode(const SDValue Op,
APInt &KnownZero,
APInt &KnownOne,
+ const APInt &DemandedElts,
const SelectionDAG &DAG,
unsigned Depth = 0) const override;
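// Editor's note: a minimal sketch (hypothetical helper, not XCore's actual
// override) of what the new DemandedElts parameter enables: a target can
// bail out, or compute tighter bits, when only some vector lanes are live.
// Assumes llvm::APInt from "llvm/ADT/APInt.h".
static void computeKnownBitsSketch(llvm::APInt &KnownZero,
                                   llvm::APInt &KnownOne,
                                   const llvm::APInt &DemandedElts) {
  // Start from "nothing known", exactly as the XCore override above does.
  KnownZero = KnownOne = llvm::APInt(KnownZero.getBitWidth(), 0);
  if (DemandedElts == 0)
    return; // no lanes demanded: nothing useful to report
  // Per-node logic would go here, consulting only the demanded lanes.
}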
diff --git a/lib/Target/XCore/XCoreMachineFunctionInfo.h b/lib/Target/XCore/XCoreMachineFunctionInfo.h
index cdcc52fdc32d..cf469ec3cf1a 100644
--- a/lib/Target/XCore/XCoreMachineFunctionInfo.h
+++ b/lib/Target/XCore/XCoreMachineFunctionInfo.h
@@ -14,50 +14,39 @@
#ifndef LLVM_LIB_TARGET_XCORE_XCOREMACHINEFUNCTIONINFO_H
#define LLVM_LIB_TARGET_XCORE_XCOREMACHINEFUNCTIONINFO_H
+#include "llvm/CodeGen/MachineBasicBlock.h"
#include "llvm/CodeGen/MachineFrameInfo.h"
#include "llvm/CodeGen/MachineFunction.h"
+#include <cassert>
+#include <utility>
#include <vector>
namespace llvm {
-// Forward declarations
-class Function;
-
/// XCoreFunctionInfo - This class is derived from MachineFunctionInfo and
/// contains private XCore target-specific information for each
/// MachineFunction.
class XCoreFunctionInfo : public MachineFunctionInfo {
- virtual void anchor();
- bool LRSpillSlotSet;
+ bool LRSpillSlotSet = false;
int LRSpillSlot;
- bool FPSpillSlotSet;
+ bool FPSpillSlotSet = false;
int FPSpillSlot;
- bool EHSpillSlotSet;
+ bool EHSpillSlotSet = false;
int EHSpillSlot[2];
unsigned ReturnStackOffset;
- bool ReturnStackOffsetSet;
- int VarArgsFrameIndex;
- mutable int CachedEStackSize;
+ bool ReturnStackOffsetSet = false;
+ int VarArgsFrameIndex = 0;
+ mutable int CachedEStackSize = -1;
std::vector<std::pair<MachineBasicBlock::iterator, CalleeSavedInfo>>
SpillLabels;
+ virtual void anchor();
+
public:
- XCoreFunctionInfo() :
- LRSpillSlotSet(false),
- FPSpillSlotSet(false),
- EHSpillSlotSet(false),
- ReturnStackOffsetSet(false),
- VarArgsFrameIndex(0),
- CachedEStackSize(-1) {}
+ XCoreFunctionInfo() = default;
- explicit XCoreFunctionInfo(MachineFunction &MF) :
- LRSpillSlotSet(false),
- FPSpillSlotSet(false),
- EHSpillSlotSet(false),
- ReturnStackOffsetSet(false),
- VarArgsFrameIndex(0),
- CachedEStackSize(-1) {}
+ explicit XCoreFunctionInfo(MachineFunction &MF) {}
- ~XCoreFunctionInfo() {}
+ ~XCoreFunctionInfo() override = default;
void setVarArgsFrameIndex(int off) { VarArgsFrameIndex = off; }
int getVarArgsFrameIndex() const { return VarArgsFrameIndex; }
@@ -101,6 +90,7 @@ public:
return SpillLabels;
}
};
-} // End llvm namespace
-#endif
+} // end namespace llvm
+
+#endif // LLVM_LIB_TARGET_XCORE_XCOREMACHINEFUNCTIONINFO_H
diff --git a/lib/Target/XCore/XCoreSelectionDAGInfo.cpp b/lib/Target/XCore/XCoreSelectionDAGInfo.cpp
index c03b0afceba3..646309e02de8 100644
--- a/lib/Target/XCore/XCoreSelectionDAGInfo.cpp
+++ b/lib/Target/XCore/XCoreSelectionDAGInfo.cpp
@@ -35,11 +35,11 @@ SDValue XCoreSelectionDAGInfo::EmitTargetCodeForMemcpy(
TargetLowering::CallLoweringInfo CLI(DAG);
CLI.setDebugLoc(dl)
.setChain(Chain)
- .setCallee(TLI.getLibcallCallingConv(RTLIB::MEMCPY),
- Type::getVoidTy(*DAG.getContext()),
- DAG.getExternalSymbol("__memcpy_4",
- TLI.getPointerTy(DAG.getDataLayout())),
- std::move(Args))
+ .setLibCallee(TLI.getLibcallCallingConv(RTLIB::MEMCPY),
+ Type::getVoidTy(*DAG.getContext()),
+ DAG.getExternalSymbol(
+ "__memcpy_4", TLI.getPointerTy(DAG.getDataLayout())),
+ std::move(Args))
.setDiscardResult();
std::pair<SDValue,SDValue> CallResult = TLI.LowerCallTo(CLI);
diff --git a/lib/Target/XCore/XCoreTargetMachine.cpp b/lib/Target/XCore/XCoreTargetMachine.cpp
index bf3138f2164a..e28e05c7f6a8 100644
--- a/lib/Target/XCore/XCoreTargetMachine.cpp
+++ b/lib/Target/XCore/XCoreTargetMachine.cpp
@@ -10,15 +10,19 @@
//
//===----------------------------------------------------------------------===//
+#include "MCTargetDesc/XCoreMCTargetDesc.h"
+#include "XCore.h"
#include "XCoreTargetMachine.h"
#include "XCoreTargetObjectFile.h"
#include "XCoreTargetTransformInfo.h"
-#include "XCore.h"
+#include "llvm/ADT/Optional.h"
+#include "llvm/ADT/STLExtras.h"
+#include "llvm/Analysis/TargetTransformInfo.h"
#include "llvm/CodeGen/Passes.h"
#include "llvm/CodeGen/TargetPassConfig.h"
-#include "llvm/IR/Module.h"
-#include "llvm/IR/LegacyPassManager.h"
+#include "llvm/Support/CodeGen.h"
#include "llvm/Support/TargetRegistry.h"
+
using namespace llvm;
static Reloc::Model getEffectiveRelocModel(Optional<Reloc::Model> RM) {
@@ -38,14 +42,15 @@ XCoreTargetMachine::XCoreTargetMachine(const Target &T, const Triple &TT,
: LLVMTargetMachine(
T, "e-m:e-p:32:32-i1:8:32-i8:8:32-i16:16:32-i64:32-f64:32-a:0:32-n32",
TT, CPU, FS, Options, getEffectiveRelocModel(RM), CM, OL),
- TLOF(make_unique<XCoreTargetObjectFile>()),
+ TLOF(llvm::make_unique<XCoreTargetObjectFile>()),
Subtarget(TT, CPU, FS, *this) {
initAsmInfo();
}
-XCoreTargetMachine::~XCoreTargetMachine() {}
+XCoreTargetMachine::~XCoreTargetMachine() = default;
namespace {
+
/// XCore Code Generator Pass Configuration Options.
class XCorePassConfig : public TargetPassConfig {
public:
@@ -61,7 +66,8 @@ public:
bool addInstSelector() override;
void addPreEmitPass() override;
};
-} // namespace
+
+} // end anonymous namespace
TargetPassConfig *XCoreTargetMachine::createPassConfig(PassManagerBase &PM) {
return new XCorePassConfig(this, PM);
diff --git a/lib/Target/XCore/XCoreTargetMachine.h b/lib/Target/XCore/XCoreTargetMachine.h
index 4bd25bc8776c..2b53f01a996d 100644
--- a/lib/Target/XCore/XCoreTargetMachine.h
+++ b/lib/Target/XCore/XCoreTargetMachine.h
@@ -15,13 +15,19 @@
#define LLVM_LIB_TARGET_XCORE_XCORETARGETMACHINE_H
#include "XCoreSubtarget.h"
+#include "llvm/Analysis/TargetTransformInfo.h"
+#include "llvm/ADT/Optional.h"
+#include "llvm/ADT/StringRef.h"
+#include "llvm/Support/CodeGen.h"
#include "llvm/Target/TargetMachine.h"
+#include <memory>
namespace llvm {
class XCoreTargetMachine : public LLVMTargetMachine {
std::unique_ptr<TargetLoweringObjectFile> TLOF;
XCoreSubtarget Subtarget;
+
public:
XCoreTargetMachine(const Target &T, const Triple &TT, StringRef CPU,
StringRef FS, const TargetOptions &Options,
@@ -38,6 +44,7 @@ public:
TargetPassConfig *createPassConfig(PassManagerBase &PM) override;
TargetIRAnalysis getTargetIRAnalysis() override;
+
TargetLoweringObjectFile *getObjFileLowering() const override {
return TLOF.get();
}
@@ -45,4 +52,4 @@ public:
} // end namespace llvm
-#endif
+#endif // LLVM_LIB_TARGET_XCORE_XCORETARGETMACHINE_H
diff --git a/lib/Transforms/Coroutines/CoroElide.cpp b/lib/Transforms/Coroutines/CoroElide.cpp
index 99974d8da64c..c6ac3f614ff7 100644
--- a/lib/Transforms/Coroutines/CoroElide.cpp
+++ b/lib/Transforms/Coroutines/CoroElide.cpp
@@ -92,7 +92,7 @@ static void removeTailCallAttribute(AllocaInst *Frame, AAResults &AA) {
// Given a resume function @f.resume(%f.frame* %frame), returns %f.frame type.
static Type *getFrameType(Function *Resume) {
- auto *ArgType = Resume->getArgumentList().front().getType();
+ auto *ArgType = Resume->arg_begin()->getType();
return cast<PointerType>(ArgType)->getElementType();
}
@@ -127,7 +127,8 @@ void Lowerer::elideHeapAllocations(Function *F, Type *FrameTy, AAResults &AA) {
// is spilled into the coroutine frame and recreate the alignment information
// here. Possibly we will need to do a mini SROA here and break the coroutine
// frame into individual AllocaInst recreating the original alignment.
- auto *Frame = new AllocaInst(FrameTy, "", InsertPt);
+ const DataLayout &DL = F->getParent()->getDataLayout();
+ auto *Frame = new AllocaInst(FrameTy, DL.getAllocaAddrSpace(), "", InsertPt);
auto *FrameVoidPtr =
new BitCastInst(Frame, Type::getInt8PtrTy(C), "vFrame", InsertPt);
diff --git a/lib/Transforms/Coroutines/CoroFrame.cpp b/lib/Transforms/Coroutines/CoroFrame.cpp
index bb28558a29e2..19e6789dfa74 100644
--- a/lib/Transforms/Coroutines/CoroFrame.cpp
+++ b/lib/Transforms/Coroutines/CoroFrame.cpp
@@ -133,6 +133,7 @@ struct SuspendCrossingInfo {
};
} // end anonymous namespace
+#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
LLVM_DUMP_METHOD void SuspendCrossingInfo::dump(StringRef Label,
BitVector const &BV) const {
dbgs() << Label << ":";
@@ -151,6 +152,7 @@ LLVM_DUMP_METHOD void SuspendCrossingInfo::dump() const {
}
dbgs() << "\n";
}
+#endif
SuspendCrossingInfo::SuspendCrossingInfo(Function &F, coro::Shape &Shape)
: Mapping(F) {
@@ -420,15 +422,31 @@ static Instruction *insertSpills(SpillInfo &Spills, coro::Shape &Shape) {
report_fatal_error("Coroutines cannot handle non static allocas yet");
} else {
// Otherwise, create a store instruction storing the value into the
- // coroutine frame. For, argument, we will place the store instruction
- // right after the coroutine frame pointer instruction, i.e. bitcase of
- // coro.begin from i8* to %f.frame*. For all other values, the spill is
- // placed immediately after the definition.
- Builder.SetInsertPoint(
- isa<Argument>(CurrentValue)
- ? FramePtr->getNextNode()
- : dyn_cast<Instruction>(E.def())->getNextNode());
+ // coroutine frame.
+
+ Instruction *InsertPt = nullptr;
+ if (isa<Argument>(CurrentValue)) {
+ // For arguments, we will place the store instruction right after
+ // the coroutine frame pointer instruction, i.e. bitcast of
+ // coro.begin from i8* to %f.frame*.
+ InsertPt = FramePtr->getNextNode();
+ } else if (auto *II = dyn_cast<InvokeInst>(CurrentValue)) {
+ // If we are spilling the result of an invoke instruction, split the
+ // normal edge and insert the spill in the new block.
+ auto *NewBB = SplitEdge(II->getParent(), II->getNormalDest());
+ InsertPt = NewBB->getTerminator();
+ } else if (isa<PHINode>(CurrentValue)) {
+ // Skip past PHI nodes and EH pad instructions.
+ InsertPt =
+ &*cast<Instruction>(E.def())->getParent()->getFirstInsertionPt();
+ } else {
+ // For all other values, the spill is placed immediately after
+ // the definition.
+ assert(!isa<TerminatorInst>(E.def()) && "unexpected terminator");
+ InsertPt = cast<Instruction>(E.def())->getNextNode();
+ }
+ Builder.SetInsertPoint(InsertPt);
auto *G = Builder.CreateConstInBoundsGEP2_32(
FrameTy, FramePtr, 0, Index,
CurrentValue->getName() + Twine(".spill.addr"));
@@ -484,7 +502,7 @@ static void rewritePHIs(BasicBlock &BB) {
// loop:
// %n.val = phi i32[%n, %entry], [%inc, %loop]
//
- // It will create:
+ // It will create:
//
// loop.from.entry:
// %n.loop.pre = phi i32 [%n, %entry]
@@ -687,13 +705,12 @@ void coro::buildCoroutineFrame(Function &F, Shape &Shape) {
Spills.emplace_back(&I, U);
// Rewrite materializable instructions to be materialized at the use point.
- std::sort(Spills.begin(), Spills.end());
DEBUG(dump("Materializations", Spills));
rewriteMaterializableInstructions(Builder, Spills);
// Collect the spills for arguments and other not-materializable values.
Spills.clear();
- for (Argument &A : F.getArgumentList())
+ for (Argument &A : F.args())
for (User *U : A.users())
if (Checker.isDefinitionAcrossSuspend(A, U))
Spills.emplace_back(&A, U);
@@ -719,7 +736,6 @@ void coro::buildCoroutineFrame(Function &F, Shape &Shape) {
Spills.emplace_back(&I, U);
}
}
- std::sort(Spills.begin(), Spills.end());
DEBUG(dump("Spills", Spills));
moveSpillUsesAfterCoroBegin(F, Spills, Shape.CoroBegin);
Shape.FrameTy = buildFrameType(F, Shape, Spills);
diff --git a/lib/Transforms/Coroutines/CoroInstr.h b/lib/Transforms/Coroutines/CoroInstr.h
index e03cef4bfc46..5c666bdfea1f 100644
--- a/lib/Transforms/Coroutines/CoroInstr.h
+++ b/lib/Transforms/Coroutines/CoroInstr.h
@@ -23,6 +23,9 @@
// the Coroutine library.
//===----------------------------------------------------------------------===//
+#ifndef LLVM_LIB_TRANSFORMS_COROUTINES_COROINSTR_H
+#define LLVM_LIB_TRANSFORMS_COROUTINES_COROINSTR_H
+
#include "llvm/IR/GlobalVariable.h"
#include "llvm/IR/IntrinsicInst.h"
@@ -316,3 +319,5 @@ public:
};
} // End namespace llvm.
+
+#endif
diff --git a/lib/Transforms/Coroutines/CoroSplit.cpp b/lib/Transforms/Coroutines/CoroSplit.cpp
index 7a3f4f60bae9..ab648f884c5b 100644
--- a/lib/Transforms/Coroutines/CoroSplit.cpp
+++ b/lib/Transforms/Coroutines/CoroSplit.cpp
@@ -22,6 +22,7 @@
#include "CoroInternal.h"
#include "llvm/Analysis/CallGraphSCCPass.h"
#include "llvm/IR/DebugInfoMetadata.h"
+#include "llvm/IR/InstIterator.h"
#include "llvm/IR/IRBuilder.h"
#include "llvm/IR/LegacyPassManager.h"
#include "llvm/IR/Verifier.h"
@@ -144,6 +145,33 @@ static void replaceFallthroughCoroEnd(IntrinsicInst *End,
BB->getTerminator()->eraseFromParent();
}
+// In resumers, we replace an unwind coro.end with 'true' to force an
+// immediate unwind to the caller.
+static void replaceUnwindCoroEnds(coro::Shape &Shape, ValueToValueMapTy &VMap) {
+ if (Shape.CoroEnds.empty())
+ return;
+
+ LLVMContext &Context = Shape.CoroEnds.front()->getContext();
+ auto *True = ConstantInt::getTrue(Context);
+ for (CoroEndInst *CE : Shape.CoroEnds) {
+ if (!CE->isUnwind())
+ continue;
+
+ auto *NewCE = cast<IntrinsicInst>(VMap[CE]);
+
+ // If coro.end has an associated bundle, add cleanupret instruction.
+ if (auto Bundle = NewCE->getOperandBundle(LLVMContext::OB_funclet)) {
+ Value *FromPad = Bundle->Inputs[0];
+ auto *CleanupRet = CleanupReturnInst::Create(FromPad, nullptr, NewCE);
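+ // Editor's note: splitBasicBlock moves NewCE (and everything after it)
+ // into a new block and leaves an unconditional branch behind CleanupRet;
+ // erasing that branch below makes the cleanupret the block terminator.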
+ NewCE->getParent()->splitBasicBlock(NewCE);
+ CleanupRet->getParent()->getTerminator()->eraseFromParent();
+ }
+
+ NewCE->replaceAllUsesWith(True);
+ NewCE->eraseFromParent();
+ }
+}
+
// Rewrite final suspend point handling. We do not use suspend index to
// represent the final suspend point. Instead we zero-out ResumeFnAddr in the
// coroutine frame, since it is undefined behavior to resume a coroutine
@@ -157,9 +185,9 @@ static void handleFinalSuspend(IRBuilder<> &Builder, Value *FramePtr,
coro::Shape &Shape, SwitchInst *Switch,
bool IsDestroy) {
assert(Shape.HasFinalSuspend);
- auto FinalCase = --Switch->case_end();
- BasicBlock *ResumeBB = FinalCase.getCaseSuccessor();
- Switch->removeCase(FinalCase);
+ auto FinalCaseIt = std::prev(Switch->case_end());
+ BasicBlock *ResumeBB = FinalCaseIt->getCaseSuccessor();
+ Switch->removeCase(FinalCaseIt);
if (IsDestroy) {
BasicBlock *OldSwitchBB = Switch->getParent();
auto *NewSwitchBB = OldSwitchBB->splitBasicBlock(Switch, "Switch");
@@ -195,7 +223,7 @@ static Function *createClone(Function &F, Twine Suffix, coro::Shape &Shape,
// Replace all args with undefs. The buildCoroutineFrame algorithm has
// already rewritten accesses to the args that occur after suspend points
// with loads and stores to/from the coroutine frame.
- for (Argument &A : F.getArgumentList())
+ for (Argument &A : F.args())
VMap[&A] = UndefValue::get(A.getType());
SmallVector<ReturnInst *, 4> Returns;
@@ -216,9 +244,9 @@ static Function *createClone(Function &F, Twine Suffix, coro::Shape &Shape,
// Remove old return attributes.
NewF->removeAttributes(
- AttributeSet::ReturnIndex,
- AttributeSet::get(
- NewF->getContext(), AttributeSet::ReturnIndex,
+ AttributeList::ReturnIndex,
+ AttributeList::get(
+ NewF->getContext(), AttributeList::ReturnIndex,
AttributeFuncs::typeIncompatible(NewF->getReturnType())));
// Make AllocaSpillBlock the new entry block.
@@ -236,7 +264,7 @@ static Function *createClone(Function &F, Twine Suffix, coro::Shape &Shape,
IRBuilder<> Builder(&NewF->getEntryBlock().front());
// Remap frame pointer.
- Argument *NewFramePtr = &NewF->getArgumentList().front();
+ Argument *NewFramePtr = &*NewF->arg_begin();
Value *OldFramePtr = cast<Value>(VMap[Shape.FramePtr]);
NewFramePtr->takeName(OldFramePtr);
OldFramePtr->replaceAllUsesWith(NewFramePtr);
@@ -270,9 +298,7 @@ static Function *createClone(Function &F, Twine Suffix, coro::Shape &Shape,
// Remove coro.end intrinsics.
replaceFallthroughCoroEnd(Shape.CoroEnds.front(), VMap);
- // FIXME: coming in upcoming patches:
- // replaceUnwindCoroEnds(Shape.CoroEnds, VMap);
-
+ replaceUnwindCoroEnds(Shape, VMap);
// Eliminate coro.free from the clones, replacing it with 'null' in cleanup,
// to suppress deallocation code.
coro::replaceCoroFree(cast<CoroIdInst>(VMap[Shape.CoroBegin->getId()]),
@@ -284,8 +310,16 @@ static Function *createClone(Function &F, Twine Suffix, coro::Shape &Shape,
}
static void removeCoroEnds(coro::Shape &Shape) {
- for (CoroEndInst *CE : Shape.CoroEnds)
+ if (Shape.CoroEnds.empty())
+ return;
+
+ LLVMContext &Context = Shape.CoroEnds.front()->getContext();
+ auto *False = ConstantInt::getFalse(Context);
+
+ for (CoroEndInst *CE : Shape.CoroEnds) {
+ CE->replaceAllUsesWith(False);
CE->eraseFromParent();
+ }
}
static void replaceFrameSize(coro::Shape &Shape) {
diff --git a/lib/Transforms/Coroutines/Coroutines.cpp b/lib/Transforms/Coroutines/Coroutines.cpp
index 877ec34b4d3b..ea48043f9381 100644
--- a/lib/Transforms/Coroutines/Coroutines.cpp
+++ b/lib/Transforms/Coroutines/Coroutines.cpp
@@ -245,9 +245,9 @@ void coro::Shape::buildFrom(Function &F) {
if (CoroBegin)
report_fatal_error(
"coroutine should have exactly one defining @llvm.coro.begin");
- CB->addAttribute(AttributeSet::ReturnIndex, Attribute::NonNull);
- CB->addAttribute(AttributeSet::ReturnIndex, Attribute::NoAlias);
- CB->removeAttribute(AttributeSet::FunctionIndex,
+ CB->addAttribute(AttributeList::ReturnIndex, Attribute::NonNull);
+ CB->addAttribute(AttributeList::ReturnIndex, Attribute::NoAlias);
+ CB->removeAttribute(AttributeList::FunctionIndex,
Attribute::NoDuplicate);
CoroBegin = CB;
}
diff --git a/lib/Transforms/IPO/ArgumentPromotion.cpp b/lib/Transforms/IPO/ArgumentPromotion.cpp
index 65b7bad3b1ed..a2c8a32dfe86 100644
--- a/lib/Transforms/IPO/ArgumentPromotion.cpp
+++ b/lib/Transforms/IPO/ArgumentPromotion.cpp
@@ -29,8 +29,9 @@
//
//===----------------------------------------------------------------------===//
-#include "llvm/Transforms/IPO.h"
+#include "llvm/Transforms/IPO/ArgumentPromotion.h"
#include "llvm/ADT/DepthFirstIterator.h"
+#include "llvm/ADT/Optional.h"
#include "llvm/ADT/Statistic.h"
#include "llvm/ADT/StringExtras.h"
#include "llvm/Analysis/AliasAnalysis.h"
@@ -38,6 +39,7 @@
#include "llvm/Analysis/BasicAliasAnalysis.h"
#include "llvm/Analysis/CallGraph.h"
#include "llvm/Analysis/CallGraphSCCPass.h"
+#include "llvm/Analysis/LazyCallGraph.h"
#include "llvm/Analysis/Loads.h"
#include "llvm/Analysis/TargetLibraryInfo.h"
#include "llvm/IR/CFG.h"
@@ -51,323 +53,400 @@
#include "llvm/IR/Module.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/raw_ostream.h"
+#include "llvm/Transforms/IPO.h"
#include <set>
using namespace llvm;
#define DEBUG_TYPE "argpromotion"
-STATISTIC(NumArgumentsPromoted , "Number of pointer arguments promoted");
+STATISTIC(NumArgumentsPromoted, "Number of pointer arguments promoted");
STATISTIC(NumAggregatesPromoted, "Number of aggregate arguments promoted");
-STATISTIC(NumByValArgsPromoted , "Number of byval arguments promoted");
-STATISTIC(NumArgumentsDead , "Number of dead pointer args eliminated");
+STATISTIC(NumByValArgsPromoted, "Number of byval arguments promoted");
+STATISTIC(NumArgumentsDead, "Number of dead pointer args eliminated");
-namespace {
- /// ArgPromotion - The 'by reference' to 'by value' argument promotion pass.
- ///
- struct ArgPromotion : public CallGraphSCCPass {
- void getAnalysisUsage(AnalysisUsage &AU) const override {
- AU.addRequired<AssumptionCacheTracker>();
- AU.addRequired<TargetLibraryInfoWrapperPass>();
- getAAResultsAnalysisUsage(AU);
- CallGraphSCCPass::getAnalysisUsage(AU);
- }
+/// A vector used to hold the indices of a single GEP instruction
+typedef std::vector<uint64_t> IndicesVector;
- bool runOnSCC(CallGraphSCC &SCC) override;
- static char ID; // Pass identification, replacement for typeid
- explicit ArgPromotion(unsigned maxElements = 3)
- : CallGraphSCCPass(ID), maxElements(maxElements) {
- initializeArgPromotionPass(*PassRegistry::getPassRegistry());
- }
+/// doPromotion - This method actually performs the promotion of the specified
+/// arguments, and returns the new function. At this point, we know that it's
+/// safe to do so.
+static Function *
+doPromotion(Function *F, SmallPtrSetImpl<Argument *> &ArgsToPromote,
+ SmallPtrSetImpl<Argument *> &ByValArgsToTransform,
+ Optional<function_ref<void(CallSite OldCS, CallSite NewCS)>>
+ ReplaceCallSite) {
- private:
+ // Start by computing a new prototype for the function, which is the same as
+ // the old function, but has modified arguments.
+ FunctionType *FTy = F->getFunctionType();
+ std::vector<Type *> Params;
- using llvm::Pass::doInitialization;
- bool doInitialization(CallGraph &CG) override;
- /// The maximum number of elements to expand, or 0 for unlimited.
- unsigned maxElements;
- };
-}
+ typedef std::set<std::pair<Type *, IndicesVector>> ScalarizeTable;
-/// A vector used to hold the indices of a single GEP instruction
-typedef std::vector<uint64_t> IndicesVector;
+ // ScalarizedElements - If we are promoting a pointer that has elements
+ // accessed out of it, keep track of which elements are accessed so that we
+ // can add one argument for each.
+ //
+ // Arguments that are directly loaded will have a zero element value here, to
+ // handle cases where there are both a direct load and GEP accesses.
+ //
+ std::map<Argument *, ScalarizeTable> ScalarizedElements;
-static CallGraphNode *
-PromoteArguments(CallGraphNode *CGN, CallGraph &CG,
- function_ref<AAResults &(Function &F)> AARGetter,
- unsigned MaxElements);
-static bool isDenselyPacked(Type *type, const DataLayout &DL);
-static bool canPaddingBeAccessed(Argument *Arg);
-static bool isSafeToPromoteArgument(Argument *Arg, bool isByVal, AAResults &AAR,
- unsigned MaxElements);
-static CallGraphNode *
-DoPromotion(Function *F, SmallPtrSetImpl<Argument *> &ArgsToPromote,
- SmallPtrSetImpl<Argument *> &ByValArgsToTransform, CallGraph &CG);
+ // OriginalLoads - Keep track of a representative load instruction from the
+ // original function so that we can tell the alias analysis implementation
+ // what the new GEP/Load instructions we are inserting look like.
+ // We need to keep the original loads for each argument and the elements
+ // of the argument that are accessed.
+ std::map<std::pair<Argument *, IndicesVector>, LoadInst *> OriginalLoads;
-char ArgPromotion::ID = 0;
-INITIALIZE_PASS_BEGIN(ArgPromotion, "argpromotion",
- "Promote 'by reference' arguments to scalars", false, false)
-INITIALIZE_PASS_DEPENDENCY(AssumptionCacheTracker)
-INITIALIZE_PASS_DEPENDENCY(CallGraphWrapperPass)
-INITIALIZE_PASS_DEPENDENCY(TargetLibraryInfoWrapperPass)
-INITIALIZE_PASS_END(ArgPromotion, "argpromotion",
- "Promote 'by reference' arguments to scalars", false, false)
+ // ArgAttrVec - Keep track of the parameter attributes for the arguments
+ // that we are *not* promoting. For the ones that we do promote, the
+ // parameter attributes are lost.
+ SmallVector<AttributeSet, 8> ArgAttrVec;
+ AttributeList PAL = F->getAttributes();
-Pass *llvm::createArgumentPromotionPass(unsigned maxElements) {
- return new ArgPromotion(maxElements);
-}
+ // First, determine the new argument list
+ unsigned ArgIndex = 0;
+ for (Function::arg_iterator I = F->arg_begin(), E = F->arg_end(); I != E;
+ ++I, ++ArgIndex) {
+ if (ByValArgsToTransform.count(&*I)) {
+ // Simple byval argument? Just add all the struct element types.
+ Type *AgTy = cast<PointerType>(I->getType())->getElementType();
+ StructType *STy = cast<StructType>(AgTy);
+ Params.insert(Params.end(), STy->element_begin(), STy->element_end());
+ ArgAttrVec.insert(ArgAttrVec.end(), STy->getNumElements(),
+ AttributeSet());
+ ++NumByValArgsPromoted;
+ } else if (!ArgsToPromote.count(&*I)) {
+ // Unchanged argument
+ Params.push_back(I->getType());
+ ArgAttrVec.push_back(PAL.getParamAttributes(ArgIndex));
+ } else if (I->use_empty()) {
+ // Dead argument (which are always marked as promotable)
+ ++NumArgumentsDead;
+ } else {
+ // Okay, this is being promoted. This means that the only uses are loads
+ // or GEPs which are only used by loads.
-static bool runImpl(CallGraphSCC &SCC, CallGraph &CG,
- function_ref<AAResults &(Function &F)> AARGetter,
- unsigned MaxElements) {
- bool Changed = false, LocalChange;
+ // In this table, we will track which indices are loaded from the argument
+ // (where direct loads are tracked as no indices).
+ ScalarizeTable &ArgIndices = ScalarizedElements[&*I];
+ for (User *U : I->users()) {
+ Instruction *UI = cast<Instruction>(U);
+ Type *SrcTy;
+ if (LoadInst *L = dyn_cast<LoadInst>(UI))
+ SrcTy = L->getType();
+ else
+ SrcTy = cast<GetElementPtrInst>(UI)->getSourceElementType();
+ IndicesVector Indices;
+ Indices.reserve(UI->getNumOperands() - 1);
+ // Since loads will only have a single operand, and GEPs only a single
+ // non-index operand, this will record direct loads without any indices,
+ // and gep+loads with the GEP indices.
+ for (User::op_iterator II = UI->op_begin() + 1, IE = UI->op_end();
+ II != IE; ++II)
+ Indices.push_back(cast<ConstantInt>(*II)->getSExtValue());
+ // GEPs with a single 0 index can be merged with direct loads
+ if (Indices.size() == 1 && Indices.front() == 0)
+ Indices.clear();
+ ArgIndices.insert(std::make_pair(SrcTy, Indices));
+ LoadInst *OrigLoad;
+ if (LoadInst *L = dyn_cast<LoadInst>(UI))
+ OrigLoad = L;
+ else
+ // Take any load; we will use it only to update alias analysis.
+ OrigLoad = cast<LoadInst>(UI->user_back());
+ OriginalLoads[std::make_pair(&*I, Indices)] = OrigLoad;
+ }
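+ // Editor's note: a minimal sketch (illustration only, not part of this
+ // patch) of the index canonicalization applied above. A GEP whose single
+ // index is 0 addresses the same memory as a direct load of the pointer,
+ // so it is recorded with an empty index list and shares that slot:
+ //
+ //   IndicesVector canonicalize(IndicesVector Idx) {
+ //     if (Idx.size() == 1 && Idx.front() == 0)
+ //       Idx.clear(); // "gep %p, 0" loads the same address as "%p"
+ //     return Idx;
+ //   }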
- do { // Iterate until we stop promoting from this SCC.
- LocalChange = false;
- // Attempt to promote arguments from all functions in this SCC.
- for (CallGraphNode *OldNode : SCC) {
- if (CallGraphNode *NewNode =
- PromoteArguments(OldNode, CG, AARGetter, MaxElements)) {
- LocalChange = true;
- SCC.ReplaceNode(OldNode, NewNode);
+ // Add a parameter to the function for each element passed in.
+ for (const auto &ArgIndex : ArgIndices) {
+ // Not allowed to dereference ->begin() if size() is 0.
+ Params.push_back(GetElementPtrInst::getIndexedType(
+ cast<PointerType>(I->getType()->getScalarType())->getElementType(),
+ ArgIndex.second));
+ ArgAttrVec.push_back(AttributeSet());
+ assert(Params.back());
}
+
+ if (ArgIndices.size() == 1 && ArgIndices.begin()->second.empty())
+ ++NumArgumentsPromoted;
+ else
+ ++NumAggregatesPromoted;
}
- Changed |= LocalChange; // Remember that we changed something.
- } while (LocalChange);
-
- return Changed;
-}
+ }
-bool ArgPromotion::runOnSCC(CallGraphSCC &SCC) {
- if (skipSCC(SCC))
- return false;
+ Type *RetTy = FTy->getReturnType();
- // Get the callgraph information that we need to update to reflect our
- // changes.
- CallGraph &CG = getAnalysis<CallGraphWrapperPass>().getCallGraph();
+ // Construct the new function type using the new arguments.
+ FunctionType *NFTy = FunctionType::get(RetTy, Params, FTy->isVarArg());
- // We compute dedicated AA results for each function in the SCC as needed. We
- // use a lambda referencing external objects so that they live long enough to
- // be queried, but we re-use them each time.
- Optional<BasicAAResult> BAR;
- Optional<AAResults> AAR;
- auto AARGetter = [&](Function &F) -> AAResults & {
- BAR.emplace(createLegacyPMBasicAAResult(*this, F));
- AAR.emplace(createLegacyPMAAResults(*this, F, *BAR));
- return *AAR;
- };
-
- return runImpl(SCC, CG, AARGetter, maxElements);
-}
+ // Create the new function body and insert it into the module.
+ Function *NF = Function::Create(NFTy, F->getLinkage(), F->getName());
+ NF->copyAttributesFrom(F);
-/// \brief Checks if a type could have padding bytes.
-static bool isDenselyPacked(Type *type, const DataLayout &DL) {
+ // Patch the pointer to LLVM function in debug info descriptor.
+ NF->setSubprogram(F->getSubprogram());
+ F->setSubprogram(nullptr);
- // There is no size information, so be conservative.
- if (!type->isSized())
- return false;
+ DEBUG(dbgs() << "ARG PROMOTION: Promoting to:" << *NF << "\n"
+ << "From: " << *F);
- // If the alloc size is not equal to the storage size, then there are padding
- // bytes. For x86_fp80 on x86-64, size: 80 alloc size: 128.
- if (DL.getTypeSizeInBits(type) != DL.getTypeAllocSizeInBits(type))
- return false;
+ // Recompute the parameter attributes list based on the new arguments for
+ // the function.
+ NF->setAttributes(AttributeList::get(F->getContext(), PAL.getFnAttributes(),
+ PAL.getRetAttributes(), ArgAttrVec));
+ ArgAttrVec.clear();
- if (!isa<CompositeType>(type))
- return true;
+ F->getParent()->getFunctionList().insert(F->getIterator(), NF);
+ NF->takeName(F);
- // For homogenous sequential types, check for padding within members.
- if (SequentialType *seqTy = dyn_cast<SequentialType>(type))
- return isDenselyPacked(seqTy->getElementType(), DL);
+ // Loop over all of the callers of the function, transforming the call sites
+ // to pass in the loaded pointers.
+ //
+ SmallVector<Value *, 16> Args;
+ while (!F->use_empty()) {
+ CallSite CS(F->user_back());
+ assert(CS.getCalledFunction() == F);
+ Instruction *Call = CS.getInstruction();
+ const AttributeList &CallPAL = CS.getAttributes();
- // Check for padding within and between elements of a struct.
- StructType *StructTy = cast<StructType>(type);
- const StructLayout *Layout = DL.getStructLayout(StructTy);
- uint64_t StartPos = 0;
- for (unsigned i = 0, E = StructTy->getNumElements(); i < E; ++i) {
- Type *ElTy = StructTy->getElementType(i);
- if (!isDenselyPacked(ElTy, DL))
- return false;
- if (StartPos != Layout->getElementOffsetInBits(i))
- return false;
- StartPos += DL.getTypeAllocSizeInBits(ElTy);
- }
+ // Loop over the operands, inserting GEP and loads in the caller as
+ // appropriate.
+ CallSite::arg_iterator AI = CS.arg_begin();
+ ArgIndex = 1;
+ for (Function::arg_iterator I = F->arg_begin(), E = F->arg_end(); I != E;
+ ++I, ++AI, ++ArgIndex)
+ if (!ArgsToPromote.count(&*I) && !ByValArgsToTransform.count(&*I)) {
+ Args.push_back(*AI); // Unmodified argument
+ ArgAttrVec.push_back(CallPAL.getAttributes(ArgIndex));
+ } else if (ByValArgsToTransform.count(&*I)) {
+ // Emit a GEP and load for each element of the struct.
+ Type *AgTy = cast<PointerType>(I->getType())->getElementType();
+ StructType *STy = cast<StructType>(AgTy);
+ Value *Idxs[2] = {
+ ConstantInt::get(Type::getInt32Ty(F->getContext()), 0), nullptr};
+ for (unsigned i = 0, e = STy->getNumElements(); i != e; ++i) {
+ Idxs[1] = ConstantInt::get(Type::getInt32Ty(F->getContext()), i);
+ Value *Idx = GetElementPtrInst::Create(
+ STy, *AI, Idxs, (*AI)->getName() + "." + Twine(i), Call);
+ // TODO: Tell AA about the new values?
+ Args.push_back(new LoadInst(Idx, Idx->getName() + ".val", Call));
+ ArgAttrVec.push_back(AttributeSet());
+ }
+ } else if (!I->use_empty()) {
+ // Non-dead argument: insert GEPs and loads as appropriate.
+ ScalarizeTable &ArgIndices = ScalarizedElements[&*I];
+ // Store the Value* version of the indices in here, but declare it now
+ // for reuse.
+ std::vector<Value *> Ops;
+ for (const auto &ArgIndex : ArgIndices) {
+ Value *V = *AI;
+ LoadInst *OrigLoad =
+ OriginalLoads[std::make_pair(&*I, ArgIndex.second)];
+ if (!ArgIndex.second.empty()) {
+ Ops.reserve(ArgIndex.second.size());
+ Type *ElTy = V->getType();
+ for (unsigned long II : ArgIndex.second) {
+ // Use i32 to index structs, and i64 for others (pointers/arrays).
+ // This satisfies GEP constraints.
+ Type *IdxTy =
+ (ElTy->isStructTy() ? Type::getInt32Ty(F->getContext())
+ : Type::getInt64Ty(F->getContext()));
+ Ops.push_back(ConstantInt::get(IdxTy, II));
+ // Keep track of the type we're currently indexing.
+ if (auto *ElPTy = dyn_cast<PointerType>(ElTy))
+ ElTy = ElPTy->getElementType();
+ else
+ ElTy = cast<CompositeType>(ElTy)->getTypeAtIndex(II);
+ }
+ // And create a GEP to extract those indices.
+ V = GetElementPtrInst::Create(ArgIndex.first, V, Ops,
+ V->getName() + ".idx", Call);
+ Ops.clear();
+ }
+ // Since we're replacing a load, make sure we take the alignment
+ // of the previous load.
+ LoadInst *newLoad = new LoadInst(V, V->getName() + ".val", Call);
+ newLoad->setAlignment(OrigLoad->getAlignment());
+ // Transfer the AA info too.
+ AAMDNodes AAInfo;
+ OrigLoad->getAAMetadata(AAInfo);
+ newLoad->setAAMetadata(AAInfo);
- return true;
-}
+ Args.push_back(newLoad);
+ ArgAttrVec.push_back(AttributeSet());
+ }
+ }
-/// \brief Checks if the padding bytes of an argument could be accessed.
-static bool canPaddingBeAccessed(Argument *arg) {
+ // Push any varargs arguments on the list.
+ for (; AI != CS.arg_end(); ++AI, ++ArgIndex) {
+ Args.push_back(*AI);
+ ArgAttrVec.push_back(CallPAL.getAttributes(ArgIndex));
+ }
- assert(arg->hasByValAttr());
+ SmallVector<OperandBundleDef, 1> OpBundles;
+ CS.getOperandBundlesAsDefs(OpBundles);
- // Track all the pointers to the argument to make sure they are not captured.
- SmallPtrSet<Value *, 16> PtrValues;
- PtrValues.insert(arg);
+ CallSite NewCS;
+ if (InvokeInst *II = dyn_cast<InvokeInst>(Call)) {
+ NewCS = InvokeInst::Create(NF, II->getNormalDest(), II->getUnwindDest(),
+ Args, OpBundles, "", Call);
+ } else {
+ auto *NewCall = CallInst::Create(NF, Args, OpBundles, "", Call);
+ NewCall->setTailCallKind(cast<CallInst>(Call)->getTailCallKind());
+ NewCS = NewCall;
+ }
+ NewCS.setCallingConv(CS.getCallingConv());
+ NewCS.setAttributes(
+ AttributeList::get(F->getContext(), CallPAL.getFnAttributes(),
+ CallPAL.getRetAttributes(), ArgAttrVec));
+ NewCS->setDebugLoc(Call->getDebugLoc());
+ uint64_t W;
+ if (Call->extractProfTotalWeight(W))
+ NewCS->setProfWeight(W);
+ Args.clear();
+ ArgAttrVec.clear();
- // Track all of the stores.
- SmallVector<StoreInst *, 16> Stores;
+ // Update the callgraph to know that the callsite has been transformed.
+ if (ReplaceCallSite)
+ (*ReplaceCallSite)(CS, NewCS);
- // Scan through the uses recursively to make sure the pointer is always used
- // sanely.
- SmallVector<Value *, 16> WorkList;
- WorkList.insert(WorkList.end(), arg->user_begin(), arg->user_end());
- while (!WorkList.empty()) {
- Value *V = WorkList.back();
- WorkList.pop_back();
- if (isa<GetElementPtrInst>(V) || isa<PHINode>(V)) {
- if (PtrValues.insert(V).second)
- WorkList.insert(WorkList.end(), V->user_begin(), V->user_end());
- } else if (StoreInst *Store = dyn_cast<StoreInst>(V)) {
- Stores.push_back(Store);
- } else if (!isa<LoadInst>(V)) {
- return true;
+ if (!Call->use_empty()) {
+ Call->replaceAllUsesWith(NewCS.getInstruction());
+ NewCS->takeName(Call);
}
- }
-// Check to make sure the pointers aren't captured
- for (StoreInst *Store : Stores)
- if (PtrValues.count(Store->getValueOperand()))
- return true;
-
- return false;
-}
+ // Finally, remove the old call from the program, reducing the use-count of
+ // F.
+ Call->eraseFromParent();
+ }
-/// PromoteArguments - This method checks the specified function to see if there
-/// are any promotable arguments and if it is safe to promote the function (for
-/// example, all callers are direct). If safe to promote some arguments, it
-/// calls the DoPromotion method.
-///
-static CallGraphNode *
-PromoteArguments(CallGraphNode *CGN, CallGraph &CG,
- function_ref<AAResults &(Function &F)> AARGetter,
- unsigned MaxElements) {
- Function *F = CGN->getFunction();
+ const DataLayout &DL = F->getParent()->getDataLayout();
- // Make sure that it is local to this module.
- if (!F || !F->hasLocalLinkage()) return nullptr;
+ // Since we have now created the new function, splice the body of the old
+ // function right into the new function, leaving the old rotting hulk of the
+ // function empty.
+ NF->getBasicBlockList().splice(NF->begin(), F->getBasicBlockList());
- // Don't promote arguments for variadic functions. Adding, removing, or
- // changing non-pack parameters can change the classification of pack
- // parameters. Frontends encode that classification at the call site in the
- // IR, while in the callee the classification is determined dynamically based
- // on the number of registers consumed so far.
- if (F->isVarArg()) return nullptr;
+ // Loop over the argument list, transferring uses of the old arguments over
+ // to the new arguments, and transferring the names over as well.
+ //
+ for (Function::arg_iterator I = F->arg_begin(), E = F->arg_end(),
+ I2 = NF->arg_begin();
+ I != E; ++I) {
+ if (!ArgsToPromote.count(&*I) && !ByValArgsToTransform.count(&*I)) {
+ // If this is an unmodified argument, move the name and users over to the
+ // new version.
+ I->replaceAllUsesWith(&*I2);
+ I2->takeName(&*I);
+ ++I2;
+ continue;
+ }
- // First check: see if there are any pointer arguments! If not, quick exit.
- SmallVector<Argument*, 16> PointerArgs;
- for (Argument &I : F->args())
- if (I.getType()->isPointerTy())
- PointerArgs.push_back(&I);
- if (PointerArgs.empty()) return nullptr;
+ if (ByValArgsToTransform.count(&*I)) {
+ // In the callee, we create an alloca, and store each of the new incoming
+ // arguments into the alloca.
+ Instruction *InsertPt = &NF->begin()->front();
- // Second check: make sure that all callers are direct callers. We can't
- // transform functions that have indirect callers. Also see if the function
- // is self-recursive.
- bool isSelfRecursive = false;
- for (Use &U : F->uses()) {
- CallSite CS(U.getUser());
- // Must be a direct call.
- if (CS.getInstruction() == nullptr || !CS.isCallee(&U)) return nullptr;
-
- if (CS.getInstruction()->getParent()->getParent() == F)
- isSelfRecursive = true;
- }
-
- const DataLayout &DL = F->getParent()->getDataLayout();
+ // Just add all the struct element types.
+ Type *AgTy = cast<PointerType>(I->getType())->getElementType();
+ Value *TheAlloca = new AllocaInst(AgTy, DL.getAllocaAddrSpace(), nullptr,
+ "", InsertPt);
+ StructType *STy = cast<StructType>(AgTy);
+ Value *Idxs[2] = {ConstantInt::get(Type::getInt32Ty(F->getContext()), 0),
+ nullptr};
- AAResults &AAR = AARGetter(*F);
+ for (unsigned i = 0, e = STy->getNumElements(); i != e; ++i) {
+ Idxs[1] = ConstantInt::get(Type::getInt32Ty(F->getContext()), i);
+ Value *Idx = GetElementPtrInst::Create(
+ AgTy, TheAlloca, Idxs, TheAlloca->getName() + "." + Twine(i),
+ InsertPt);
+ I2->setName(I->getName() + "." + Twine(i));
+ new StoreInst(&*I2++, Idx, InsertPt);
+ }
- // Check to see which arguments are promotable. If an argument is promotable,
- // add it to ArgsToPromote.
- SmallPtrSet<Argument*, 8> ArgsToPromote;
- SmallPtrSet<Argument*, 8> ByValArgsToTransform;
- for (Argument *PtrArg : PointerArgs) {
- Type *AgTy = cast<PointerType>(PtrArg->getType())->getElementType();
+ // Anything that used the arg should now use the alloca.
+ I->replaceAllUsesWith(TheAlloca);
+ TheAlloca->takeName(&*I);
- // Replace sret attribute with noalias. This reduces register pressure by
- // avoiding a register copy.
- if (PtrArg->hasStructRetAttr()) {
- unsigned ArgNo = PtrArg->getArgNo();
- F->setAttributes(
- F->getAttributes()
- .removeAttribute(F->getContext(), ArgNo + 1, Attribute::StructRet)
- .addAttribute(F->getContext(), ArgNo + 1, Attribute::NoAlias));
- for (Use &U : F->uses()) {
- CallSite CS(U.getUser());
- CS.setAttributes(
- CS.getAttributes()
- .removeAttribute(F->getContext(), ArgNo + 1,
- Attribute::StructRet)
- .addAttribute(F->getContext(), ArgNo + 1, Attribute::NoAlias));
+ // If the alloca is used in a call, we must clear the tail flag since
+ // the callee now uses an alloca from the caller.
+ for (User *U : TheAlloca->users()) {
+ CallInst *Call = dyn_cast<CallInst>(U);
+ if (!Call)
+ continue;
+ Call->setTailCall(false);
}
+ continue;
}
- // If this is a byval argument, and if the aggregate type is small, just
- // pass the elements, which is always safe, if the passed value is densely
- // packed or if we can prove the padding bytes are never accessed. This does
- // not apply to inalloca.
- bool isSafeToPromote =
- PtrArg->hasByValAttr() &&
- (isDenselyPacked(AgTy, DL) || !canPaddingBeAccessed(PtrArg));
- if (isSafeToPromote) {
- if (StructType *STy = dyn_cast<StructType>(AgTy)) {
- if (MaxElements > 0 && STy->getNumElements() > MaxElements) {
- DEBUG(dbgs() << "argpromotion disable promoting argument '"
- << PtrArg->getName() << "' because it would require adding more"
- << " than " << MaxElements << " arguments to the function.\n");
- continue;
- }
-
- // If all the elements are single-value types, we can promote it.
- bool AllSimple = true;
- for (const auto *EltTy : STy->elements()) {
- if (!EltTy->isSingleValueType()) {
- AllSimple = false;
- break;
- }
+ if (I->use_empty())
+ continue;
+
+ // Otherwise, if we promoted this argument, then all users are load
+ // instructions (or GEPs with only load users), and all loads should be
+ // using the new argument that we added.
+ ScalarizeTable &ArgIndices = ScalarizedElements[&*I];
+
+ while (!I->use_empty()) {
+ if (LoadInst *LI = dyn_cast<LoadInst>(I->user_back())) {
+ assert(ArgIndices.begin()->second.empty() &&
+ "Load element should sort to front!");
+ I2->setName(I->getName() + ".val");
+ LI->replaceAllUsesWith(&*I2);
+ LI->eraseFromParent();
+ DEBUG(dbgs() << "*** Promoted load of argument '" << I->getName()
+ << "' in function '" << F->getName() << "'\n");
+ } else {
+ GetElementPtrInst *GEP = cast<GetElementPtrInst>(I->user_back());
+ IndicesVector Operands;
+ Operands.reserve(GEP->getNumIndices());
+ for (User::op_iterator II = GEP->idx_begin(), IE = GEP->idx_end();
+ II != IE; ++II)
+ Operands.push_back(cast<ConstantInt>(*II)->getSExtValue());
+
+ // GEPs with a single 0 index can be merged with direct loads
+ if (Operands.size() == 1 && Operands.front() == 0)
+ Operands.clear();
+
+ Function::arg_iterator TheArg = I2;
+ for (ScalarizeTable::iterator It = ArgIndices.begin();
+ It->second != Operands; ++It, ++TheArg) {
+ assert(It != ArgIndices.end() && "GEP not handled??");
}
- // Safe to transform, don't even bother trying to "promote" it.
- // Passing the elements as a scalar will allow sroa to hack on
- // the new alloca we introduce.
- if (AllSimple) {
- ByValArgsToTransform.insert(PtrArg);
- continue;
+ std::string NewName = I->getName();
+ for (unsigned i = 0, e = Operands.size(); i != e; ++i) {
+ NewName += "." + utostr(Operands[i]);
}
- }
- }
+ NewName += ".val";
+ TheArg->setName(NewName);
- // If the argument is a recursive type and we're in a recursive
- // function, we could end up infinitely peeling the function argument.
- if (isSelfRecursive) {
- if (StructType *STy = dyn_cast<StructType>(AgTy)) {
- bool RecursiveType = false;
- for (const auto *EltTy : STy->elements()) {
- if (EltTy == PtrArg->getType()) {
- RecursiveType = true;
- break;
- }
+ DEBUG(dbgs() << "*** Promoted agg argument '" << TheArg->getName()
+ << "' of function '" << NF->getName() << "'\n");
+
+ // All of the uses must be load instructions. Replace them all with
+ // the matching argument found above (TheArg).
+ while (!GEP->use_empty()) {
+ LoadInst *L = cast<LoadInst>(GEP->user_back());
+ L->replaceAllUsesWith(&*TheArg);
+ L->eraseFromParent();
}
- if (RecursiveType)
- continue;
+ GEP->eraseFromParent();
}
}
-
- // Otherwise, see if we can promote the pointer to its value.
- if (isSafeToPromoteArgument(PtrArg, PtrArg->hasByValOrInAllocaAttr(), AAR,
- MaxElements))
- ArgsToPromote.insert(PtrArg);
- }
- // No promotable pointer arguments.
- if (ArgsToPromote.empty() && ByValArgsToTransform.empty())
- return nullptr;
+ // Increment I2 past all of the arguments added for this promoted pointer.
+ std::advance(I2, ArgIndices.size());
+ }
- return DoPromotion(F, ArgsToPromote, ByValArgsToTransform, CG);
+ return NF;
}
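+
+// Naming of the promoted pieces, sketched for a hypothetical argument %s that
+// is loaded both directly and through a GEP at index 1 (matching the
+// ".val" / ".<idx>.val" suffixes assigned above):
+//
+//   direct load (empty index list)  -> new argument %s.val
+//   load of a GEP at index 1        -> new argument %s.1.val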
/// AllCallersPassInValidPointerForArgument - Return true if we can prove that
/// all callers pass in a valid pointer for the specified function argument.
-static bool AllCallersPassInValidPointerForArgument(Argument *Arg) {
+static bool allCallersPassInValidPointerForArgument(Argument *Arg) {
Function *Callee = Arg->getParent();
const DataLayout &DL = Callee->getParent()->getDataLayout();
@@ -390,26 +469,25 @@ static bool AllCallersPassInValidPointerForArgument(Argument *Arg) {
/// elements in Prefix is the same as the corresponding elements in Longer.
///
/// This means it also returns true when Prefix and Longer are equal!
-static bool IsPrefix(const IndicesVector &Prefix, const IndicesVector &Longer) {
+static bool isPrefix(const IndicesVector &Prefix, const IndicesVector &Longer) {
if (Prefix.size() > Longer.size())
return false;
return std::equal(Prefix.begin(), Prefix.end(), Longer.begin());
}
-
/// Checks if Indices, or a prefix of Indices, is in Set.
-static bool PrefixIn(const IndicesVector &Indices,
+static bool prefixIn(const IndicesVector &Indices,
std::set<IndicesVector> &Set) {
- std::set<IndicesVector>::iterator Low;
- Low = Set.upper_bound(Indices);
- if (Low != Set.begin())
- Low--;
- // Low is now the last element smaller than or equal to Indices. This means
- // it points to a prefix of Indices (possibly Indices itself), if such
- // prefix exists.
- //
- // This load is safe if any prefix of its operands is safe to load.
- return Low != Set.end() && IsPrefix(*Low, Indices);
+ std::set<IndicesVector>::iterator Low;
+ Low = Set.upper_bound(Indices);
+ if (Low != Set.begin())
+ Low--;
+ // Low is now the last element smaller than or equal to Indices. This means
+ // it points to a prefix of Indices (possibly Indices itself), if such
+ // prefix exists.
+ //
+ // This load is safe if any prefix of its operands is safe to load.
+ return Low != Set.end() && isPrefix(*Low, Indices);
}
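+
+// A minimal sketch of the prefix semantics used here, assuming IndicesVector
+// holds constant GEP indices:
+//
+//   IndicesVector A = {0};     // a direct load of the argument
+//   IndicesVector B = {0, 1};  // a load through a GEP at indices 0, 1
+//   isPrefix(A, B);            // true:  A matches the first element of B
+//   isPrefix(B, A);            // false: B is longer than A
+//   isPrefix(A, A);            // true:  equal vectors count as prefixes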
/// Mark the given indices (ToMark) as safe in the given set of indices
@@ -417,7 +495,7 @@ static bool PrefixIn(const IndicesVector &Indices,
/// is already a prefix of Indices in Safe, Indices are implicitly marked safe
/// already. Furthermore, any indices that Indices is itself a prefix of are
/// removed from Safe (since they are implicitly safe because of Indices now).
-static void MarkIndicesSafe(const IndicesVector &ToMark,
+static void markIndicesSafe(const IndicesVector &ToMark,
std::set<IndicesVector> &Safe) {
std::set<IndicesVector>::iterator Low;
Low = Safe.upper_bound(ToMark);
@@ -428,7 +506,7 @@ static void MarkIndicesSafe(const IndicesVector &ToMark,
// means it points to a prefix of Indices (possibly Indices itself), if
// such prefix exists.
if (Low != Safe.end()) {
- if (IsPrefix(*Low, ToMark))
+ if (isPrefix(*Low, ToMark))
// If there is already a prefix of these indices (or exactly these
// indices) marked as safe, don't bother adding these indices
return;
@@ -441,7 +519,7 @@ static void MarkIndicesSafe(const IndicesVector &ToMark,
++Low;
// If ToMark is a prefix of longer index list(s), remove those
std::set<IndicesVector>::iterator End = Safe.end();
- while (Low != End && IsPrefix(ToMark, *Low)) {
+ while (Low != End && isPrefix(ToMark, *Low)) {
std::set<IndicesVector>::iterator Remove = Low;
++Low;
Safe.erase(Remove);
@@ -486,7 +564,7 @@ static bool isSafeToPromoteArgument(Argument *Arg, bool isByValOrInAlloca,
GEPIndicesSet ToPromote;
// If the pointer is always valid, any load with first index 0 is valid.
- if (isByValOrInAlloca || AllCallersPassInValidPointerForArgument(Arg))
+ if (isByValOrInAlloca || allCallersPassInValidPointerForArgument(Arg))
SafeToUnconditionallyLoad.insert(IndicesVector(1, 0));
// First, iterate the entry block and mark loads of (geps of) arguments as
@@ -512,25 +590,26 @@ static bool isSafeToPromoteArgument(Argument *Arg, bool isByValOrInAlloca,
return false;
// Indices checked out, mark them as safe
- MarkIndicesSafe(Indices, SafeToUnconditionallyLoad);
+ markIndicesSafe(Indices, SafeToUnconditionallyLoad);
Indices.clear();
}
} else if (V == Arg) {
// Direct loads are equivalent to a GEP with a single 0 index.
- MarkIndicesSafe(IndicesVector(1, 0), SafeToUnconditionallyLoad);
+ markIndicesSafe(IndicesVector(1, 0), SafeToUnconditionallyLoad);
}
}
// Now, iterate all uses of the argument to see if there are any uses that are
// not (GEP+)loads, or any (GEP+)loads that are not safe to promote.
- SmallVector<LoadInst*, 16> Loads;
+ SmallVector<LoadInst *, 16> Loads;
IndicesVector Operands;
for (Use &U : Arg->uses()) {
User *UR = U.getUser();
Operands.clear();
if (LoadInst *LI = dyn_cast<LoadInst>(UR)) {
// Don't hack volatile/atomic loads
- if (!LI->isSimple()) return false;
+ if (!LI->isSimple())
+ return false;
Loads.push_back(LI);
// Direct loads are equivalent to a GEP with a zero index and then a load.
Operands.push_back(0);
@@ -547,30 +626,31 @@ static bool isSafeToPromoteArgument(Argument *Arg, bool isByValOrInAlloca,
}
// Ensure that all of the indices are constants.
- for (User::op_iterator i = GEP->idx_begin(), e = GEP->idx_end();
- i != e; ++i)
+ for (User::op_iterator i = GEP->idx_begin(), e = GEP->idx_end(); i != e;
+ ++i)
if (ConstantInt *C = dyn_cast<ConstantInt>(*i))
Operands.push_back(C->getSExtValue());
else
- return false; // Not a constant operand GEP!
+ return false; // Not a constant operand GEP!
// Ensure that the only users of the GEP are load instructions.
for (User *GEPU : GEP->users())
if (LoadInst *LI = dyn_cast<LoadInst>(GEPU)) {
// Don't hack volatile/atomic loads
- if (!LI->isSimple()) return false;
+ if (!LI->isSimple())
+ return false;
Loads.push_back(LI);
} else {
// Other uses than load?
return false;
}
} else {
- return false; // Not a load or a GEP.
+ return false; // Not a load or a GEP.
}
// Now, see if it is safe to promote this load / loads of this GEP. Loading
// is safe if Operands, or a prefix of Operands, is marked as safe.
- if (!PrefixIn(Operands, SafeToUnconditionallyLoad))
+ if (!prefixIn(Operands, SafeToUnconditionallyLoad))
return false;
// See if we are already promoting a load with these indices. If not, check
@@ -579,8 +659,10 @@ static bool isSafeToPromoteArgument(Argument *Arg, bool isByValOrInAlloca,
if (ToPromote.find(Operands) == ToPromote.end()) {
if (MaxElements > 0 && ToPromote.size() == MaxElements) {
DEBUG(dbgs() << "argpromotion not promoting argument '"
- << Arg->getName() << "' because it would require adding more "
- << "than " << MaxElements << " arguments to the function.\n");
+ << Arg->getName()
+ << "' because it would require adding more "
+ << "than " << MaxElements
+ << " arguments to the function.\n");
// We limit aggregate promotion to only promoting up to a fixed number
// of elements of the aggregate.
return false;
@@ -589,7 +671,8 @@ static bool isSafeToPromoteArgument(Argument *Arg, bool isByValOrInAlloca,
}
}
- if (Loads.empty()) return true; // No users, this is a dead argument.
+ if (Loads.empty())
+ return true; // No users, this is a dead argument.
// Okay, now we know that the argument is only used by load instructions and
// it is safe to unconditionally perform all of them. Use alias analysis to
@@ -598,7 +681,7 @@ static bool isSafeToPromoteArgument(Argument *Arg, bool isByValOrInAlloca,
// Because there could be several/many load instructions, remember which
// blocks we know to be transparent to the load.
- df_iterator_default_set<BasicBlock*, 16> TranspBlocks;
+ df_iterator_default_set<BasicBlock *, 16> TranspBlocks;
for (LoadInst *Load : Loads) {
// Check to see if the load is invalidated from the start of the block to
@@ -607,7 +690,7 @@ static bool isSafeToPromoteArgument(Argument *Arg, bool isByValOrInAlloca,
MemoryLocation Loc = MemoryLocation::get(Load);
if (AAR.canInstructionRangeModRef(BB->front(), *Load, Loc, MRI_Mod))
- return false; // Pointer is invalidated!
+ return false; // Pointer is invalidated!
// Now check every path from the entry block to the load for transparency.
// To do this, we perform a depth first search on the inverse CFG from the
@@ -625,416 +708,352 @@ static bool isSafeToPromoteArgument(Argument *Arg, bool isByValOrInAlloca,
return true;
}
-/// DoPromotion - This method actually performs the promotion of the specified
-/// arguments, and returns the new function. At this point, we know that it's
-/// safe to do so.
-static CallGraphNode *
-DoPromotion(Function *F, SmallPtrSetImpl<Argument *> &ArgsToPromote,
- SmallPtrSetImpl<Argument *> &ByValArgsToTransform, CallGraph &CG) {
+/// \brief Checks if a type could have padding bytes.
+static bool isDenselyPacked(Type *type, const DataLayout &DL) {
- // Start by computing a new prototype for the function, which is the same as
- // the old function, but has modified arguments.
- FunctionType *FTy = F->getFunctionType();
- std::vector<Type*> Params;
+ // There is no size information, so be conservative.
+ if (!type->isSized())
+ return false;
- typedef std::set<std::pair<Type *, IndicesVector>> ScalarizeTable;
+ // If the alloc size is not equal to the storage size, then there are padding
+ // bytes. For x86_fp80 on x86-64, size: 80 alloc size: 128.
+ if (DL.getTypeSizeInBits(type) != DL.getTypeAllocSizeInBits(type))
+ return false;
- // ScalarizedElements - If we are promoting a pointer that has elements
- // accessed out of it, keep track of which elements are accessed so that we
- // can add one argument for each.
- //
- // Arguments that are directly loaded will have a zero element value here, to
- // handle cases where there are both a direct load and GEP accesses.
- //
- std::map<Argument*, ScalarizeTable> ScalarizedElements;
+ if (!isa<CompositeType>(type))
+ return true;
- // OriginalLoads - Keep track of a representative load instruction from the
- // original function so that we can tell the alias analysis implementation
- // what the new GEP/Load instructions we are inserting look like.
- // We need to keep the original loads for each argument and the elements
- // of the argument that are accessed.
- std::map<std::pair<Argument*, IndicesVector>, LoadInst*> OriginalLoads;
+ // For homogeneous sequential types, check for padding within members.
+ if (SequentialType *seqTy = dyn_cast<SequentialType>(type))
+ return isDenselyPacked(seqTy->getElementType(), DL);
- // Attribute - Keep track of the parameter attributes for the arguments
- // that we are *not* promoting. For the ones that we do promote, the parameter
- // attributes are lost
- SmallVector<AttributeSet, 8> AttributesVec;
- const AttributeSet &PAL = F->getAttributes();
+ // Check for padding within and between elements of a struct.
+ StructType *StructTy = cast<StructType>(type);
+ const StructLayout *Layout = DL.getStructLayout(StructTy);
+ uint64_t StartPos = 0;
+ for (unsigned i = 0, E = StructTy->getNumElements(); i < E; ++i) {
+ Type *ElTy = StructTy->getElementType(i);
+ if (!isDenselyPacked(ElTy, DL))
+ return false;
+ if (StartPos != Layout->getElementOffsetInBits(i))
+ return false;
+ StartPos += DL.getTypeAllocSizeInBits(ElTy);
+ }
- // Add any return attributes.
- if (PAL.hasAttributes(AttributeSet::ReturnIndex))
- AttributesVec.push_back(AttributeSet::get(F->getContext(),
- PAL.getRetAttributes()));
+ return true;
+}
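+
+// A rough illustration of the check above (assuming a typical x86-64
+// DataLayout; the exact answers depend on the module's layout string):
+//
+//   { i32, i32 }  -> densely packed: no padding between or after elements
+//   { i32, i64 }  -> not packed: 4 padding bytes follow the i32 member
+//   [4 x i16]     -> packed, because the i16 element type is itself packed
+//   x86_fp80      -> not packed: 80 bits stored in a 128-bit alloc slot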
- // First, determine the new argument list
- unsigned ArgIndex = 1;
- for (Function::arg_iterator I = F->arg_begin(), E = F->arg_end(); I != E;
- ++I, ++ArgIndex) {
- if (ByValArgsToTransform.count(&*I)) {
- // Simple byval argument? Just add all the struct element types.
- Type *AgTy = cast<PointerType>(I->getType())->getElementType();
- StructType *STy = cast<StructType>(AgTy);
- Params.insert(Params.end(), STy->element_begin(), STy->element_end());
- ++NumByValArgsPromoted;
- } else if (!ArgsToPromote.count(&*I)) {
- // Unchanged argument
- Params.push_back(I->getType());
- AttributeSet attrs = PAL.getParamAttributes(ArgIndex);
- if (attrs.hasAttributes(ArgIndex)) {
- AttrBuilder B(attrs, ArgIndex);
- AttributesVec.
- push_back(AttributeSet::get(F->getContext(), Params.size(), B));
- }
- } else if (I->use_empty()) {
- // Dead argument (which are always marked as promotable)
- ++NumArgumentsDead;
- } else {
- // Okay, this is being promoted. This means that the only uses are loads
- // or GEPs which are only used by loads
+/// \brief Checks if the padding bytes of an argument could be accessed.
+static bool canPaddingBeAccessed(Argument *arg) {
- // In this table, we will track which indices are loaded from the argument
- // (where direct loads are tracked as no indices).
- ScalarizeTable &ArgIndices = ScalarizedElements[&*I];
- for (User *U : I->users()) {
- Instruction *UI = cast<Instruction>(U);
- Type *SrcTy;
- if (LoadInst *L = dyn_cast<LoadInst>(UI))
- SrcTy = L->getType();
- else
- SrcTy = cast<GetElementPtrInst>(UI)->getSourceElementType();
- IndicesVector Indices;
- Indices.reserve(UI->getNumOperands() - 1);
- // Since loads will only have a single operand, and GEPs only a single
- // non-index operand, this will record direct loads without any indices,
- // and gep+loads with the GEP indices.
- for (User::op_iterator II = UI->op_begin() + 1, IE = UI->op_end();
- II != IE; ++II)
- Indices.push_back(cast<ConstantInt>(*II)->getSExtValue());
- // GEPs with a single 0 index can be merged with direct loads
- if (Indices.size() == 1 && Indices.front() == 0)
- Indices.clear();
- ArgIndices.insert(std::make_pair(SrcTy, Indices));
- LoadInst *OrigLoad;
- if (LoadInst *L = dyn_cast<LoadInst>(UI))
- OrigLoad = L;
- else
- // Take any load, we will use it only to update Alias Analysis
- OrigLoad = cast<LoadInst>(UI->user_back());
- OriginalLoads[std::make_pair(&*I, Indices)] = OrigLoad;
- }
+ assert(arg->hasByValAttr());
- // Add a parameter to the function for each element passed in.
- for (const auto &ArgIndex : ArgIndices) {
- // not allowed to dereference ->begin() if size() is 0
- Params.push_back(GetElementPtrInst::getIndexedType(
- cast<PointerType>(I->getType()->getScalarType())->getElementType(),
- ArgIndex.second));
- assert(Params.back());
- }
+ // Track all the pointers to the argument to make sure they are not captured.
+ SmallPtrSet<Value *, 16> PtrValues;
+ PtrValues.insert(arg);
- if (ArgIndices.size() == 1 && ArgIndices.begin()->second.empty())
- ++NumArgumentsPromoted;
- else
- ++NumAggregatesPromoted;
+ // Track all of the stores.
+ SmallVector<StoreInst *, 16> Stores;
+
+ // Scan through the uses recursively to make sure the pointer is always used
+ // sanely.
+ SmallVector<Value *, 16> WorkList;
+ WorkList.insert(WorkList.end(), arg->user_begin(), arg->user_end());
+ while (!WorkList.empty()) {
+ Value *V = WorkList.back();
+ WorkList.pop_back();
+ if (isa<GetElementPtrInst>(V) || isa<PHINode>(V)) {
+ if (PtrValues.insert(V).second)
+ WorkList.insert(WorkList.end(), V->user_begin(), V->user_end());
+ } else if (StoreInst *Store = dyn_cast<StoreInst>(V)) {
+ Stores.push_back(Store);
+ } else if (!isa<LoadInst>(V)) {
+ return true;
}
}
- // Add any function attributes.
- if (PAL.hasAttributes(AttributeSet::FunctionIndex))
- AttributesVec.push_back(AttributeSet::get(FTy->getContext(),
- PAL.getFnAttributes()));
+ // Check to make sure the pointers aren't captured
+ for (StoreInst *Store : Stores)
+ if (PtrValues.count(Store->getValueOperand()))
+ return true;
- Type *RetTy = FTy->getReturnType();
+ return false;
+}
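+
+// A sketch of the cases above in IR form, assuming %arg is the byval
+// argument under inspection:
+//
+//   %p = getelementptr %struct.S, %struct.S* %arg, i32 0, i32 1 ; tracked
+//   %v = load i32, i32* %p                   ; load: fine
+//   store i32 0, i32* %p                     ; storing *to* %p: fine
+//   store %struct.S* %arg, %struct.S** %q    ; storing %arg itself escapes
+//                                            ; the pointer -> return true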
- // Construct the new function type using the new arguments.
- FunctionType *NFTy = FunctionType::get(RetTy, Params, FTy->isVarArg());
+/// promoteArguments - This method checks the specified function to see if there
+/// are any promotable arguments and if it is safe to promote the function (for
+/// example, all callers are direct). If safe to promote some arguments, it
+/// calls the doPromotion method.
+///
+static Function *
+promoteArguments(Function *F, function_ref<AAResults &(Function &F)> AARGetter,
+ unsigned MaxElements,
+ Optional<function_ref<void(CallSite OldCS, CallSite NewCS)>>
+ ReplaceCallSite) {
+ // Make sure that it is local to this module.
+ if (!F->hasLocalLinkage())
+ return nullptr;
- // Create the new function body and insert it into the module.
- Function *NF = Function::Create(NFTy, F->getLinkage(), F->getName());
- NF->copyAttributesFrom(F);
+ // Don't promote arguments for variadic functions. Adding, removing, or
+ // changing non-pack parameters can change the classification of pack
+ // parameters. Frontends encode that classification at the call site in the
+ // IR, while in the callee the classification is determined dynamically based
+ // on the number of registers consumed so far.
+ if (F->isVarArg())
+ return nullptr;
- // Patch the pointer to LLVM function in debug info descriptor.
- NF->setSubprogram(F->getSubprogram());
- F->setSubprogram(nullptr);
+ // First check: see if there are any pointer arguments! If not, quick exit.
+ SmallVector<Argument *, 16> PointerArgs;
+ for (Argument &I : F->args())
+ if (I.getType()->isPointerTy())
+ PointerArgs.push_back(&I);
+ if (PointerArgs.empty())
+ return nullptr;
- DEBUG(dbgs() << "ARG PROMOTION: Promoting to:" << *NF << "\n"
- << "From: " << *F);
-
- // Recompute the parameter attributes list based on the new arguments for
- // the function.
- NF->setAttributes(AttributeSet::get(F->getContext(), AttributesVec));
- AttributesVec.clear();
+ // Second check: make sure that all callers are direct callers. We can't
+ // transform functions that have indirect callers. Also see if the function
+ // is self-recursive.
+ bool isSelfRecursive = false;
+ for (Use &U : F->uses()) {
+ CallSite CS(U.getUser());
+ // Must be a direct call.
+ if (CS.getInstruction() == nullptr || !CS.isCallee(&U))
+ return nullptr;
- F->getParent()->getFunctionList().insert(F->getIterator(), NF);
- NF->takeName(F);
+ if (CS.getInstruction()->getParent()->getParent() == F)
+ isSelfRecursive = true;
+ }
- // Get a new callgraph node for NF.
- CallGraphNode *NF_CGN = CG.getOrInsertFunction(NF);
+ const DataLayout &DL = F->getParent()->getDataLayout();
- // Loop over all of the callers of the function, transforming the call sites
- // to pass in the loaded pointers.
- //
- SmallVector<Value*, 16> Args;
- while (!F->use_empty()) {
- CallSite CS(F->user_back());
- assert(CS.getCalledFunction() == F);
- Instruction *Call = CS.getInstruction();
- const AttributeSet &CallPAL = CS.getAttributes();
+ AAResults &AAR = AARGetter(*F);
- // Add any return attributes.
- if (CallPAL.hasAttributes(AttributeSet::ReturnIndex))
- AttributesVec.push_back(AttributeSet::get(F->getContext(),
- CallPAL.getRetAttributes()));
+ // Check to see which arguments are promotable. If an argument is promotable,
+ // add it to ArgsToPromote.
+ SmallPtrSet<Argument *, 8> ArgsToPromote;
+ SmallPtrSet<Argument *, 8> ByValArgsToTransform;
+ for (Argument *PtrArg : PointerArgs) {
+ Type *AgTy = cast<PointerType>(PtrArg->getType())->getElementType();
- // Loop over the operands, inserting GEP and loads in the caller as
- // appropriate.
- CallSite::arg_iterator AI = CS.arg_begin();
- ArgIndex = 1;
- for (Function::arg_iterator I = F->arg_begin(), E = F->arg_end();
- I != E; ++I, ++AI, ++ArgIndex)
- if (!ArgsToPromote.count(&*I) && !ByValArgsToTransform.count(&*I)) {
- Args.push_back(*AI); // Unmodified argument
+ // Replace sret attribute with noalias. This reduces register pressure by
+ // avoiding a register copy.
+ if (PtrArg->hasStructRetAttr()) {
+ unsigned ArgNo = PtrArg->getArgNo();
+ F->setAttributes(
+ F->getAttributes()
+ .removeAttribute(F->getContext(), ArgNo + 1, Attribute::StructRet)
+ .addAttribute(F->getContext(), ArgNo + 1, Attribute::NoAlias));
+ for (Use &U : F->uses()) {
+ CallSite CS(U.getUser());
+ CS.setAttributes(
+ CS.getAttributes()
+ .removeAttribute(F->getContext(), ArgNo + 1,
+ Attribute::StructRet)
+ .addAttribute(F->getContext(), ArgNo + 1, Attribute::NoAlias));
+ }
+ }
- if (CallPAL.hasAttributes(ArgIndex)) {
- AttrBuilder B(CallPAL, ArgIndex);
- AttributesVec.
- push_back(AttributeSet::get(F->getContext(), Args.size(), B));
- }
- } else if (ByValArgsToTransform.count(&*I)) {
- // Emit a GEP and load for each element of the struct.
- Type *AgTy = cast<PointerType>(I->getType())->getElementType();
- StructType *STy = cast<StructType>(AgTy);
- Value *Idxs[2] = {
- ConstantInt::get(Type::getInt32Ty(F->getContext()), 0), nullptr };
- for (unsigned i = 0, e = STy->getNumElements(); i != e; ++i) {
- Idxs[1] = ConstantInt::get(Type::getInt32Ty(F->getContext()), i);
- Value *Idx = GetElementPtrInst::Create(
- STy, *AI, Idxs, (*AI)->getName() + "." + Twine(i), Call);
- // TODO: Tell AA about the new values?
- Args.push_back(new LoadInst(Idx, Idx->getName()+".val", Call));
+ // If this is a byval argument, and if the aggregate type is small, just
+ // pass the elements, which is always safe if the passed value is densely
+ // packed or if we can prove the padding bytes are never accessed. This does
+ // not apply to inalloca.
+ bool isSafeToPromote =
+ PtrArg->hasByValAttr() &&
+ (isDenselyPacked(AgTy, DL) || !canPaddingBeAccessed(PtrArg));
+ if (isSafeToPromote) {
+ if (StructType *STy = dyn_cast<StructType>(AgTy)) {
+ if (MaxElements > 0 && STy->getNumElements() > MaxElements) {
+ DEBUG(dbgs() << "argpromotion disable promoting argument '"
+ << PtrArg->getName()
+ << "' because it would require adding more"
+ << " than " << MaxElements
+ << " arguments to the function.\n");
+ continue;
}
- } else if (!I->use_empty()) {
- // Non-dead argument: insert GEPs and loads as appropriate.
- ScalarizeTable &ArgIndices = ScalarizedElements[&*I];
- // Store the Value* version of the indices in here, but declare it now
- // for reuse.
- std::vector<Value*> Ops;
- for (const auto &ArgIndex : ArgIndices) {
- Value *V = *AI;
- LoadInst *OrigLoad =
- OriginalLoads[std::make_pair(&*I, ArgIndex.second)];
- if (!ArgIndex.second.empty()) {
- Ops.reserve(ArgIndex.second.size());
- Type *ElTy = V->getType();
- for (unsigned long II : ArgIndex.second) {
- // Use i32 to index structs, and i64 for others (pointers/arrays).
- // This satisfies GEP constraints.
- Type *IdxTy = (ElTy->isStructTy() ?
- Type::getInt32Ty(F->getContext()) :
- Type::getInt64Ty(F->getContext()));
- Ops.push_back(ConstantInt::get(IdxTy, II));
- // Keep track of the type we're currently indexing.
- if (auto *ElPTy = dyn_cast<PointerType>(ElTy))
- ElTy = ElPTy->getElementType();
- else
- ElTy = cast<CompositeType>(ElTy)->getTypeAtIndex(II);
- }
- // And create a GEP to extract those indices.
- V = GetElementPtrInst::Create(ArgIndex.first, V, Ops,
- V->getName() + ".idx", Call);
- Ops.clear();
+
+ // If all the elements are single-value types, we can promote it.
+ bool AllSimple = true;
+ for (const auto *EltTy : STy->elements()) {
+ if (!EltTy->isSingleValueType()) {
+ AllSimple = false;
+ break;
}
- // Since we're replacing a load make sure we take the alignment
- // of the previous load.
- LoadInst *newLoad = new LoadInst(V, V->getName()+".val", Call);
- newLoad->setAlignment(OrigLoad->getAlignment());
- // Transfer the AA info too.
- AAMDNodes AAInfo;
- OrigLoad->getAAMetadata(AAInfo);
- newLoad->setAAMetadata(AAInfo);
+ }
- Args.push_back(newLoad);
+ // Safe to transform; don't even bother trying to "promote" it.
+ // Passing the elements as scalars will allow SROA to hack on
+ // the new alloca we introduce.
+ if (AllSimple) {
+ ByValArgsToTransform.insert(PtrArg);
+ continue;
}
}
+ }
- // Push any varargs arguments on the list.
- for (; AI != CS.arg_end(); ++AI, ++ArgIndex) {
- Args.push_back(*AI);
- if (CallPAL.hasAttributes(ArgIndex)) {
- AttrBuilder B(CallPAL, ArgIndex);
- AttributesVec.
- push_back(AttributeSet::get(F->getContext(), Args.size(), B));
+ // If the argument is a recursive type and we're in a recursive
+ // function, we could end up infinitely peeling the function argument.
+ if (isSelfRecursive) {
+ if (StructType *STy = dyn_cast<StructType>(AgTy)) {
+ bool RecursiveType = false;
+ for (const auto *EltTy : STy->elements()) {
+ if (EltTy == PtrArg->getType()) {
+ RecursiveType = true;
+ break;
+ }
+ }
+ if (RecursiveType)
+ continue;
}
}
- // Add any function attributes.
- if (CallPAL.hasAttributes(AttributeSet::FunctionIndex))
- AttributesVec.push_back(AttributeSet::get(Call->getContext(),
- CallPAL.getFnAttributes()));
+ // Otherwise, see if we can promote the pointer to its value.
+ if (isSafeToPromoteArgument(PtrArg, PtrArg->hasByValOrInAllocaAttr(), AAR,
+ MaxElements))
+ ArgsToPromote.insert(PtrArg);
+ }
+
+ // No promotable pointer arguments.
+ if (ArgsToPromote.empty() && ByValArgsToTransform.empty())
+ return nullptr;
- SmallVector<OperandBundleDef, 1> OpBundles;
- CS.getOperandBundlesAsDefs(OpBundles);
+ return doPromotion(F, ArgsToPromote, ByValArgsToTransform, ReplaceCallSite);
+}
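+
+// The net effect, sketched on a hypothetical function @f with a single
+// promotable pointer argument:
+//
+//   define internal i32 @f(i32* %p) {      define internal i32 @f(i32 %p.val) {
+//     %v = load i32, i32* %p          =>     ret i32 %p.val
+//     ret i32 %v                           }
+//   }
+//
+// Each caller is rewritten to perform the load and pass the value directly.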
- Instruction *New;
- if (InvokeInst *II = dyn_cast<InvokeInst>(Call)) {
- New = InvokeInst::Create(NF, II->getNormalDest(), II->getUnwindDest(),
- Args, OpBundles, "", Call);
- cast<InvokeInst>(New)->setCallingConv(CS.getCallingConv());
- cast<InvokeInst>(New)->setAttributes(AttributeSet::get(II->getContext(),
- AttributesVec));
- } else {
- New = CallInst::Create(NF, Args, OpBundles, "", Call);
- cast<CallInst>(New)->setCallingConv(CS.getCallingConv());
- cast<CallInst>(New)->setAttributes(AttributeSet::get(New->getContext(),
- AttributesVec));
- cast<CallInst>(New)->setTailCallKind(
- cast<CallInst>(Call)->getTailCallKind());
- }
- New->setDebugLoc(Call->getDebugLoc());
- Args.clear();
- AttributesVec.clear();
+PreservedAnalyses ArgumentPromotionPass::run(LazyCallGraph::SCC &C,
+ CGSCCAnalysisManager &AM,
+ LazyCallGraph &CG,
+ CGSCCUpdateResult &UR) {
+ bool Changed = false, LocalChange;
- // Update the callgraph to know that the callsite has been transformed.
- CallGraphNode *CalleeNode = CG[Call->getParent()->getParent()];
- CalleeNode->replaceCallEdge(CS, CallSite(New), NF_CGN);
+ // Iterate until we stop promoting from this SCC.
+ do {
+ LocalChange = false;
- if (!Call->use_empty()) {
- Call->replaceAllUsesWith(New);
- New->takeName(Call);
+ for (LazyCallGraph::Node &N : C) {
+ Function &OldF = N.getFunction();
+
+ FunctionAnalysisManager &FAM =
+ AM.getResult<FunctionAnalysisManagerCGSCCProxy>(C, CG).getManager();
+ // FIXME: This lambda must only be used with this function. We should
+ // skip the lambda and just get the AA results directly.
+ auto AARGetter = [&](Function &F) -> AAResults & {
+ assert(&F == &OldF && "Called with an unexpected function!");
+ return FAM.getResult<AAManager>(F);
+ };
+
+ Function *NewF = promoteArguments(&OldF, AARGetter, 3u, None);
+ if (!NewF)
+ continue;
+ LocalChange = true;
+
+ // Directly substitute the functions in the call graph. Note that this
+ // requires the old function to be completely dead and fully
+ // replaced by the new function. It does no call graph updates; it merely
+ // swaps out the particular function mapped to a particular node in the
+ // graph.
+ C.getOuterRefSCC().replaceNodeFunction(N, *NewF);
+ OldF.eraseFromParent();
}
- // Finally, remove the old call from the program, reducing the use-count of
- // F.
- Call->eraseFromParent();
- }
-
- // Since we have now created the new function, splice the body of the old
- // function right into the new function, leaving the old rotting hulk of the
- // function empty.
- NF->getBasicBlockList().splice(NF->begin(), F->getBasicBlockList());
+ Changed |= LocalChange;
+ } while (LocalChange);
- // Loop over the argument list, transferring uses of the old arguments over to
- // the new arguments, also transferring over the names as well.
- //
- for (Function::arg_iterator I = F->arg_begin(), E = F->arg_end(),
- I2 = NF->arg_begin(); I != E; ++I) {
- if (!ArgsToPromote.count(&*I) && !ByValArgsToTransform.count(&*I)) {
- // If this is an unmodified argument, move the name and users over to the
- // new version.
- I->replaceAllUsesWith(&*I2);
- I2->takeName(&*I);
- ++I2;
- continue;
- }
+ if (!Changed)
+ return PreservedAnalyses::all();
- if (ByValArgsToTransform.count(&*I)) {
- // In the callee, we create an alloca, and store each of the new incoming
- // arguments into the alloca.
- Instruction *InsertPt = &NF->begin()->front();
+ return PreservedAnalyses::none();
+}
- // Just add all the struct element types.
- Type *AgTy = cast<PointerType>(I->getType())->getElementType();
- Value *TheAlloca = new AllocaInst(AgTy, nullptr, "", InsertPt);
- StructType *STy = cast<StructType>(AgTy);
- Value *Idxs[2] = {
- ConstantInt::get(Type::getInt32Ty(F->getContext()), 0), nullptr };
+namespace {
+/// ArgPromotion - The 'by reference' to 'by value' argument promotion pass.
+///
+struct ArgPromotion : public CallGraphSCCPass {
+ void getAnalysisUsage(AnalysisUsage &AU) const override {
+ AU.addRequired<AssumptionCacheTracker>();
+ AU.addRequired<TargetLibraryInfoWrapperPass>();
+ getAAResultsAnalysisUsage(AU);
+ CallGraphSCCPass::getAnalysisUsage(AU);
+ }
- for (unsigned i = 0, e = STy->getNumElements(); i != e; ++i) {
- Idxs[1] = ConstantInt::get(Type::getInt32Ty(F->getContext()), i);
- Value *Idx = GetElementPtrInst::Create(
- AgTy, TheAlloca, Idxs, TheAlloca->getName() + "." + Twine(i),
- InsertPt);
- I2->setName(I->getName()+"."+Twine(i));
- new StoreInst(&*I2++, Idx, InsertPt);
- }
+ bool runOnSCC(CallGraphSCC &SCC) override;
+ static char ID; // Pass identification, replacement for typeid
+ explicit ArgPromotion(unsigned MaxElements = 3)
+ : CallGraphSCCPass(ID), MaxElements(MaxElements) {
+ initializeArgPromotionPass(*PassRegistry::getPassRegistry());
+ }
- // Anything that used the arg should now use the alloca.
- I->replaceAllUsesWith(TheAlloca);
- TheAlloca->takeName(&*I);
+private:
+ using llvm::Pass::doInitialization;
+ bool doInitialization(CallGraph &CG) override;
+ /// The maximum number of elements to expand, or 0 for unlimited.
+ unsigned MaxElements;
+};
+}
- // If the alloca is used in a call, we must clear the tail flag since
- // the callee now uses an alloca from the caller.
- for (User *U : TheAlloca->users()) {
- CallInst *Call = dyn_cast<CallInst>(U);
- if (!Call)
- continue;
- Call->setTailCall(false);
- }
- continue;
- }
+char ArgPromotion::ID = 0;
+INITIALIZE_PASS_BEGIN(ArgPromotion, "argpromotion",
+ "Promote 'by reference' arguments to scalars", false,
+ false)
+INITIALIZE_PASS_DEPENDENCY(AssumptionCacheTracker)
+INITIALIZE_PASS_DEPENDENCY(CallGraphWrapperPass)
+INITIALIZE_PASS_DEPENDENCY(TargetLibraryInfoWrapperPass)
+INITIALIZE_PASS_END(ArgPromotion, "argpromotion",
+ "Promote 'by reference' arguments to scalars", false, false)
- if (I->use_empty())
- continue;
+Pass *llvm::createArgumentPromotionPass(unsigned MaxElements) {
+ return new ArgPromotion(MaxElements);
+}
- // Otherwise, if we promoted this argument, then all users are load
- // instructions (or GEPs with only load users), and all loads should be
- // using the new argument that we added.
- ScalarizeTable &ArgIndices = ScalarizedElements[&*I];
+bool ArgPromotion::runOnSCC(CallGraphSCC &SCC) {
+ if (skipSCC(SCC))
+ return false;
- while (!I->use_empty()) {
- if (LoadInst *LI = dyn_cast<LoadInst>(I->user_back())) {
- assert(ArgIndices.begin()->second.empty() &&
- "Load element should sort to front!");
- I2->setName(I->getName()+".val");
- LI->replaceAllUsesWith(&*I2);
- LI->eraseFromParent();
- DEBUG(dbgs() << "*** Promoted load of argument '" << I->getName()
- << "' in function '" << F->getName() << "'\n");
- } else {
- GetElementPtrInst *GEP = cast<GetElementPtrInst>(I->user_back());
- IndicesVector Operands;
- Operands.reserve(GEP->getNumIndices());
- for (User::op_iterator II = GEP->idx_begin(), IE = GEP->idx_end();
- II != IE; ++II)
- Operands.push_back(cast<ConstantInt>(*II)->getSExtValue());
+ // Get the callgraph information that we need to update to reflect our
+ // changes.
+ CallGraph &CG = getAnalysis<CallGraphWrapperPass>().getCallGraph();
- // GEPs with a single 0 index can be merged with direct loads
- if (Operands.size() == 1 && Operands.front() == 0)
- Operands.clear();
+ LegacyAARGetter AARGetter(*this);
- Function::arg_iterator TheArg = I2;
- for (ScalarizeTable::iterator It = ArgIndices.begin();
- It->second != Operands; ++It, ++TheArg) {
- assert(It != ArgIndices.end() && "GEP not handled??");
- }
+ bool Changed = false, LocalChange;
- std::string NewName = I->getName();
- for (unsigned i = 0, e = Operands.size(); i != e; ++i) {
- NewName += "." + utostr(Operands[i]);
- }
- NewName += ".val";
- TheArg->setName(NewName);
+ // Iterate until we stop promoting from this SCC.
+ do {
+ LocalChange = false;
+ // Attempt to promote arguments from all functions in this SCC.
+ for (CallGraphNode *OldNode : SCC) {
+ Function *OldF = OldNode->getFunction();
+ if (!OldF)
+ continue;
+
+ auto ReplaceCallSite = [&](CallSite OldCS, CallSite NewCS) {
+ Function *Caller = OldCS.getInstruction()->getParent()->getParent();
+ CallGraphNode *NewCalleeNode =
+ CG.getOrInsertFunction(NewCS.getCalledFunction());
+ CallGraphNode *CallerNode = CG[Caller];
+ CallerNode->replaceCallEdge(OldCS, NewCS, NewCalleeNode);
+ };
+
+ if (Function *NewF = promoteArguments(OldF, AARGetter, MaxElements,
+ {ReplaceCallSite})) {
+ LocalChange = true;
- DEBUG(dbgs() << "*** Promoted agg argument '" << TheArg->getName()
- << "' of function '" << NF->getName() << "'\n");
+ // Update the call graph for the newly promoted function.
+ CallGraphNode *NewNode = CG.getOrInsertFunction(NewF);
+ NewNode->stealCalledFunctionsFrom(OldNode);
+ if (OldNode->getNumReferences() == 0)
+ delete CG.removeFunctionFromModule(OldNode);
+ else
+ OldF->setLinkage(Function::ExternalLinkage);
- // All of the uses must be load instructions. Replace them all with
- // the argument specified by ArgNo.
- while (!GEP->use_empty()) {
- LoadInst *L = cast<LoadInst>(GEP->user_back());
- L->replaceAllUsesWith(&*TheArg);
- L->eraseFromParent();
- }
- GEP->eraseFromParent();
+ // And update the SCC we're iterating as well.
+ SCC.ReplaceNode(OldNode, NewNode);
}
}
+ // Remember that we changed something.
+ Changed |= LocalChange;
+ } while (LocalChange);
- // Increment I2 past all of the arguments added for this promoted pointer.
- std::advance(I2, ArgIndices.size());
- }
-
- NF_CGN->stealCalledFunctionsFrom(CG[F]);
-
- // Now that the old function is dead, delete it. If there is a dangling
- // reference to the CallgraphNode, just leave the dead function around for
- // someone else to nuke.
- CallGraphNode *CGN = CG[F];
- if (CGN->getNumReferences() == 0)
- delete CG.removeFunctionFromModule(CGN);
- else
- F->setLinkage(Function::ExternalLinkage);
-
- return NF_CGN;
+ return Changed;
}
bool ArgPromotion::doInitialization(CallGraph &CG) {
diff --git a/lib/Transforms/IPO/ConstantMerge.cpp b/lib/Transforms/IPO/ConstantMerge.cpp
index d75ed206ad23..62b5a9c9ba26 100644
--- a/lib/Transforms/IPO/ConstantMerge.cpp
+++ b/lib/Transforms/IPO/ConstantMerge.cpp
@@ -60,6 +60,23 @@ static bool IsBetterCanonical(const GlobalVariable &A,
return A.hasGlobalUnnamedAddr();
}
+static bool hasMetadataOtherThanDebugLoc(const GlobalVariable *GV) {
+ SmallVector<std::pair<unsigned, MDNode *>, 4> MDs;
+ GV->getAllMetadata(MDs);
+ for (const auto &V : MDs)
+ if (V.first != LLVMContext::MD_dbg)
+ return true;
+ return false;
+}
+
+static void copyDebugLocMetadata(const GlobalVariable *From,
+ GlobalVariable *To) {
+ SmallVector<DIGlobalVariableExpression *, 1> MDs;
+ From->getDebugInfo(MDs);
+ for (auto MD : MDs)
+ To->addDebugInfo(MD);
+}
+
static unsigned getAlignment(GlobalVariable *GV) {
unsigned Align = GV->getAlignment();
if (Align)
@@ -113,6 +130,10 @@ static bool mergeConstants(Module &M) {
if (GV->isWeakForLinker())
continue;
+ // Don't touch globals with metadata other than !dbg.
+ if (hasMetadataOtherThanDebugLoc(GV))
+ continue;
+
Constant *Init = GV->getInitializer();
// Check to see if the initializer is already known.
@@ -155,6 +176,9 @@ static bool mergeConstants(Module &M) {
if (!Slot->hasGlobalUnnamedAddr() && !GV->hasGlobalUnnamedAddr())
continue;
+ if (hasMetadataOtherThanDebugLoc(GV))
+ continue;
+
if (!GV->hasGlobalUnnamedAddr())
Slot->setUnnamedAddr(GlobalValue::UnnamedAddr::None);
@@ -178,6 +202,8 @@ static bool mergeConstants(Module &M) {
getAlignment(Replacements[i].second)));
}
+ copyDebugLocMetadata(Replacements[i].first, Replacements[i].second);
+
// Eliminate any uses of the dead global.
Replacements[i].first->replaceAllUsesWith(Replacements[i].second);
diff --git a/lib/Transforms/IPO/CrossDSOCFI.cpp b/lib/Transforms/IPO/CrossDSOCFI.cpp
index ba2e60dee3bc..1b111de06157 100644
--- a/lib/Transforms/IPO/CrossDSOCFI.cpp
+++ b/lib/Transforms/IPO/CrossDSOCFI.cpp
@@ -98,8 +98,11 @@ void CrossDSOCFI::buildCFICheck(Module &M) {
LLVMContext &Ctx = M.getContext();
Constant *C = M.getOrInsertFunction(
"__cfi_check", Type::getVoidTy(Ctx), Type::getInt64Ty(Ctx),
- Type::getInt8PtrTy(Ctx), Type::getInt8PtrTy(Ctx), nullptr);
+ Type::getInt8PtrTy(Ctx), Type::getInt8PtrTy(Ctx));
Function *F = dyn_cast<Function>(C);
+ // Take over the existing function. The frontend emits a weak stub so that the
+ // linker knows about the symbol; this pass replaces the function body.
+ F->deleteBody();
F->setAlignment(4096);
auto args = F->arg_begin();
Value &CallSiteTypeId = *(args++);
@@ -117,7 +120,7 @@ void CrossDSOCFI::buildCFICheck(Module &M) {
IRBuilder<> IRBFail(TrapBB);
Constant *CFICheckFailFn = M.getOrInsertFunction(
"__cfi_check_fail", Type::getVoidTy(Ctx), Type::getInt8PtrTy(Ctx),
- Type::getInt8PtrTy(Ctx), nullptr);
+ Type::getInt8PtrTy(Ctx));
IRBFail.CreateCall(CFICheckFailFn, {&CFICheckFailData, &Addr});
IRBFail.CreateBr(ExitBB);
diff --git a/lib/Transforms/IPO/DeadArgumentElimination.cpp b/lib/Transforms/IPO/DeadArgumentElimination.cpp
index 1a5ed4692211..375b74c494d9 100644
--- a/lib/Transforms/IPO/DeadArgumentElimination.cpp
+++ b/lib/Transforms/IPO/DeadArgumentElimination.cpp
@@ -166,41 +166,43 @@ bool DeadArgumentEliminationPass::DeleteDeadVarargs(Function &Fn) {
Args.assign(CS.arg_begin(), CS.arg_begin() + NumArgs);
// Drop any attributes that were on the vararg arguments.
- AttributeSet PAL = CS.getAttributes();
+ AttributeList PAL = CS.getAttributes();
if (!PAL.isEmpty() && PAL.getSlotIndex(PAL.getNumSlots() - 1) > NumArgs) {
- SmallVector<AttributeSet, 8> AttributesVec;
+ SmallVector<AttributeList, 8> AttributesVec;
for (unsigned i = 0; PAL.getSlotIndex(i) <= NumArgs; ++i)
AttributesVec.push_back(PAL.getSlotAttributes(i));
- if (PAL.hasAttributes(AttributeSet::FunctionIndex))
- AttributesVec.push_back(AttributeSet::get(Fn.getContext(),
- PAL.getFnAttributes()));
- PAL = AttributeSet::get(Fn.getContext(), AttributesVec);
+ if (PAL.hasAttributes(AttributeList::FunctionIndex))
+ AttributesVec.push_back(AttributeList::get(Fn.getContext(),
+ AttributeList::FunctionIndex,
+ PAL.getFnAttributes()));
+ PAL = AttributeList::get(Fn.getContext(), AttributesVec);
}
SmallVector<OperandBundleDef, 1> OpBundles;
CS.getOperandBundlesAsDefs(OpBundles);
- Instruction *New;
+ CallSite NewCS;
if (InvokeInst *II = dyn_cast<InvokeInst>(Call)) {
- New = InvokeInst::Create(NF, II->getNormalDest(), II->getUnwindDest(),
- Args, OpBundles, "", Call);
- cast<InvokeInst>(New)->setCallingConv(CS.getCallingConv());
- cast<InvokeInst>(New)->setAttributes(PAL);
+ NewCS = InvokeInst::Create(NF, II->getNormalDest(), II->getUnwindDest(),
+ Args, OpBundles, "", Call);
} else {
- New = CallInst::Create(NF, Args, OpBundles, "", Call);
- cast<CallInst>(New)->setCallingConv(CS.getCallingConv());
- cast<CallInst>(New)->setAttributes(PAL);
- cast<CallInst>(New)->setTailCallKind(
- cast<CallInst>(Call)->getTailCallKind());
+ NewCS = CallInst::Create(NF, Args, OpBundles, "", Call);
+ cast<CallInst>(NewCS.getInstruction())
+ ->setTailCallKind(cast<CallInst>(Call)->getTailCallKind());
}
- New->setDebugLoc(Call->getDebugLoc());
+ NewCS.setCallingConv(CS.getCallingConv());
+ NewCS.setAttributes(PAL);
+ NewCS->setDebugLoc(Call->getDebugLoc());
+ uint64_t W;
+ if (Call->extractProfTotalWeight(W))
+ NewCS->setProfWeight(W);
Args.clear();
if (!Call->use_empty())
- Call->replaceAllUsesWith(New);
+ Call->replaceAllUsesWith(NewCS.getInstruction());
- New->takeName(Call);
+ NewCS->takeName(Call);
// Finally, remove the old call from the program, reducing the use-count of
// F.
@@ -681,8 +683,8 @@ bool DeadArgumentEliminationPass::RemoveDeadStuffFromFunction(Function *F) {
bool HasLiveReturnedArg = false;
// Set up to build a new list of parameter attributes.
- SmallVector<AttributeSet, 8> AttributesVec;
- const AttributeSet &PAL = F->getAttributes();
+ SmallVector<AttributeSet, 8> ArgAttrVec;
+ const AttributeList &PAL = F->getAttributes();
// Remember which arguments are still alive.
SmallVector<bool, 10> ArgAlive(FTy->getNumParams(), false);
@@ -696,16 +698,8 @@ bool DeadArgumentEliminationPass::RemoveDeadStuffFromFunction(Function *F) {
if (LiveValues.erase(Arg)) {
Params.push_back(I->getType());
ArgAlive[i] = true;
-
- // Get the original parameter attributes (skipping the first one, that is
- // for the return value.
- if (PAL.hasAttributes(i + 1)) {
- AttrBuilder B(PAL, i + 1);
- if (B.contains(Attribute::Returned))
- HasLiveReturnedArg = true;
- AttributesVec.
- push_back(AttributeSet::get(F->getContext(), Params.size(), B));
- }
+ ArgAttrVec.push_back(PAL.getParamAttributes(i));
+ HasLiveReturnedArg |= PAL.hasParamAttribute(i, Attribute::Returned);
} else {
++NumArgumentsEliminated;
DEBUG(dbgs() << "DeadArgumentEliminationPass - Removing argument " << i
@@ -779,30 +773,24 @@ bool DeadArgumentEliminationPass::RemoveDeadStuffFromFunction(Function *F) {
assert(NRetTy && "No new return type found?");
// The existing function return attributes.
- AttributeSet RAttrs = PAL.getRetAttributes();
+ AttrBuilder RAttrs(PAL.getRetAttributes());
// Remove any incompatible attributes, but only if we removed all return
// values. Otherwise, ensure that we don't have any conflicting attributes
// here. Currently, this should not be possible, but special handling might be
// required when new return value attributes are added.
if (NRetTy->isVoidTy())
- RAttrs = RAttrs.removeAttributes(NRetTy->getContext(),
- AttributeSet::ReturnIndex,
- AttributeFuncs::typeIncompatible(NRetTy));
+ RAttrs.remove(AttributeFuncs::typeIncompatible(NRetTy));
else
- assert(!AttrBuilder(RAttrs, AttributeSet::ReturnIndex).
- overlaps(AttributeFuncs::typeIncompatible(NRetTy)) &&
+ assert(!RAttrs.overlaps(AttributeFuncs::typeIncompatible(NRetTy)) &&
"Return attributes no longer compatible?");
- if (RAttrs.hasAttributes(AttributeSet::ReturnIndex))
- AttributesVec.push_back(AttributeSet::get(NRetTy->getContext(), RAttrs));
-
- if (PAL.hasAttributes(AttributeSet::FunctionIndex))
- AttributesVec.push_back(AttributeSet::get(F->getContext(),
- PAL.getFnAttributes()));
+ AttributeSet RetAttrs = AttributeSet::get(F->getContext(), RAttrs);
// Reconstruct the AttributesList based on the vector we constructed.
- AttributeSet NewPAL = AttributeSet::get(F->getContext(), AttributesVec);
+ assert(ArgAttrVec.size() == Params.size());
+ AttributeList NewPAL = AttributeList::get(
+ F->getContext(), PAL.getFnAttributes(), RetAttrs, ArgAttrVec);
// Create the new function type based on the recomputed parameters.
FunctionType *NFTy = FunctionType::get(NRetTy, Params, FTy->isVarArg());
@@ -829,18 +817,14 @@ bool DeadArgumentEliminationPass::RemoveDeadStuffFromFunction(Function *F) {
CallSite CS(F->user_back());
Instruction *Call = CS.getInstruction();
- AttributesVec.clear();
- const AttributeSet &CallPAL = CS.getAttributes();
-
- // The call return attributes.
- AttributeSet RAttrs = CallPAL.getRetAttributes();
+ ArgAttrVec.clear();
+ const AttributeList &CallPAL = CS.getAttributes();
- // Adjust in case the function was changed to return void.
- RAttrs = RAttrs.removeAttributes(NRetTy->getContext(),
- AttributeSet::ReturnIndex,
- AttributeFuncs::typeIncompatible(NF->getReturnType()));
- if (RAttrs.hasAttributes(AttributeSet::ReturnIndex))
- AttributesVec.push_back(AttributeSet::get(NF->getContext(), RAttrs));
+ // Adjust the call return attributes in case the function was changed to
+ // return void.
+ AttrBuilder RAttrs(CallPAL.getRetAttributes());
+ RAttrs.remove(AttributeFuncs::typeIncompatible(NRetTy));
+ AttributeSet RetAttrs = AttributeSet::get(F->getContext(), RAttrs);
// Declare these outside of the loops, so we can reuse them for the second
// loop, which loops over the varargs.
@@ -852,57 +836,55 @@ bool DeadArgumentEliminationPass::RemoveDeadStuffFromFunction(Function *F) {
if (ArgAlive[i]) {
Args.push_back(*I);
// Get original parameter attributes, but skip return attributes.
- if (CallPAL.hasAttributes(i + 1)) {
- AttrBuilder B(CallPAL, i + 1);
+ AttributeSet Attrs = CallPAL.getParamAttributes(i);
+ if (NRetTy != RetTy && Attrs.hasAttribute(Attribute::Returned)) {
// If the return type has changed, then get rid of 'returned' on the
// call site. The alternative is to make all 'returned' attributes on
// call sites keep the return value alive just like 'returned'
- // attributes on function declaration but it's less clearly a win
- // and this is not an expected case anyway
- if (NRetTy != RetTy && B.contains(Attribute::Returned))
- B.removeAttribute(Attribute::Returned);
- AttributesVec.
- push_back(AttributeSet::get(F->getContext(), Args.size(), B));
+ // attributes on function declarations, but it's less clearly a win and
+ // this is not an expected case anyway.
+ ArgAttrVec.push_back(AttributeSet::get(
+ F->getContext(),
+ AttrBuilder(Attrs).removeAttribute(Attribute::Returned)));
+ } else {
+ // Otherwise, use the original attributes.
+ ArgAttrVec.push_back(Attrs);
}
}
// Push any varargs arguments on the list. Don't forget their attributes.
for (CallSite::arg_iterator E = CS.arg_end(); I != E; ++I, ++i) {
Args.push_back(*I);
- if (CallPAL.hasAttributes(i + 1)) {
- AttrBuilder B(CallPAL, i + 1);
- AttributesVec.
- push_back(AttributeSet::get(F->getContext(), Args.size(), B));
- }
+ ArgAttrVec.push_back(CallPAL.getParamAttributes(i));
}
- if (CallPAL.hasAttributes(AttributeSet::FunctionIndex))
- AttributesVec.push_back(AttributeSet::get(Call->getContext(),
- CallPAL.getFnAttributes()));
-
// Reconstruct the AttributesList based on the vector we constructed.
- AttributeSet NewCallPAL = AttributeSet::get(F->getContext(), AttributesVec);
+ assert(ArgAttrVec.size() == Args.size());
+ AttributeList NewCallPAL = AttributeList::get(
+ F->getContext(), CallPAL.getFnAttributes(), RetAttrs, ArgAttrVec);
SmallVector<OperandBundleDef, 1> OpBundles;
CS.getOperandBundlesAsDefs(OpBundles);
- Instruction *New;
+ CallSite NewCS;
if (InvokeInst *II = dyn_cast<InvokeInst>(Call)) {
- New = InvokeInst::Create(NF, II->getNormalDest(), II->getUnwindDest(),
- Args, OpBundles, "", Call->getParent());
- cast<InvokeInst>(New)->setCallingConv(CS.getCallingConv());
- cast<InvokeInst>(New)->setAttributes(NewCallPAL);
+ NewCS = InvokeInst::Create(NF, II->getNormalDest(), II->getUnwindDest(),
+ Args, OpBundles, "", Call->getParent());
} else {
- New = CallInst::Create(NF, Args, OpBundles, "", Call);
- cast<CallInst>(New)->setCallingConv(CS.getCallingConv());
- cast<CallInst>(New)->setAttributes(NewCallPAL);
- cast<CallInst>(New)->setTailCallKind(
- cast<CallInst>(Call)->getTailCallKind());
+ NewCS = CallInst::Create(NF, Args, OpBundles, "", Call);
+ cast<CallInst>(NewCS.getInstruction())
+ ->setTailCallKind(cast<CallInst>(Call)->getTailCallKind());
}
- New->setDebugLoc(Call->getDebugLoc());
-
+ NewCS.setCallingConv(CS.getCallingConv());
+ NewCS.setAttributes(NewCallPAL);
+ NewCS->setDebugLoc(Call->getDebugLoc());
+ uint64_t W;
+ if (Call->extractProfTotalWeight(W))
+ NewCS->setProfWeight(W);
Args.clear();
+ ArgAttrVec.clear();
+ Instruction *New = NewCS.getInstruction();
if (!Call->use_empty()) {
if (New->getType() == Call->getType()) {
// Return type not changed? Just replace users then.
diff --git a/lib/Transforms/IPO/FunctionAttrs.cpp b/lib/Transforms/IPO/FunctionAttrs.cpp
index 402a66552c24..4d13b3f40688 100644
--- a/lib/Transforms/IPO/FunctionAttrs.cpp
+++ b/lib/Transforms/IPO/FunctionAttrs.cpp
@@ -49,31 +49,35 @@ STATISTIC(NumNoAlias, "Number of function returns marked noalias");
STATISTIC(NumNonNullReturn, "Number of function returns marked nonnull");
STATISTIC(NumNoRecurse, "Number of functions marked as norecurse");
-namespace {
-typedef SmallSetVector<Function *, 8> SCCNodeSet;
-}
+// FIXME: This is disabled by default to avoid exposing security vulnerabilities
+// in C/C++ code compiled by clang:
+// http://lists.llvm.org/pipermail/cfe-dev/2017-January/052066.html
+static cl::opt<bool> EnableNonnullArgPropagation(
+ "enable-nonnull-arg-prop", cl::Hidden,
+ cl::desc("Try to propagate nonnull argument attributes from callsites to "
+ "caller functions."));
namespace {
-/// The three kinds of memory access relevant to 'readonly' and
-/// 'readnone' attributes.
-enum MemoryAccessKind {
- MAK_ReadNone = 0,
- MAK_ReadOnly = 1,
- MAK_MayWrite = 2
-};
+typedef SmallSetVector<Function *, 8> SCCNodeSet;
}
-static MemoryAccessKind checkFunctionMemoryAccess(Function &F, AAResults &AAR,
+/// Returns the memory access attribute for function F using AAR for AA results,
+/// where SCCNodes is the current SCC.
+///
+/// If ThisBody is true, this function may examine the function body and will
+/// return a result pertaining to this copy of the function. If it is false, the
+/// result will be based only on AA results for the function declaration; it
+/// will be assumed that some other (perhaps less optimized) version of the
+/// function may be selected at link time.
+static MemoryAccessKind checkFunctionMemoryAccess(Function &F, bool ThisBody,
+ AAResults &AAR,
const SCCNodeSet &SCCNodes) {
FunctionModRefBehavior MRB = AAR.getModRefBehavior(&F);
if (MRB == FMRB_DoesNotAccessMemory)
// Already perfect!
return MAK_ReadNone;
- // Non-exact function definitions may not be selected at link time, and an
- // alternative version that writes to memory may be selected. See the comment
- // on GlobalValue::isDefinitionExact for more details.
- if (!F.hasExactDefinition()) {
+ if (!ThisBody) {
if (AliasAnalysis::onlyReadsMemory(MRB))
return MAK_ReadOnly;
@@ -172,9 +176,14 @@ static MemoryAccessKind checkFunctionMemoryAccess(Function &F, AAResults &AAR,
return ReadsMemory ? MAK_ReadOnly : MAK_ReadNone;
}
+MemoryAccessKind llvm::computeFunctionBodyMemoryAccess(Function &F,
+ AAResults &AAR) {
+ return checkFunctionMemoryAccess(F, /*ThisBody=*/true, AAR, {});
+}
+
/// Deduce readonly/readnone attributes for the SCC.
template <typename AARGetterT>
-static bool addReadAttrs(const SCCNodeSet &SCCNodes, AARGetterT AARGetter) {
+static bool addReadAttrs(const SCCNodeSet &SCCNodes, AARGetterT &&AARGetter) {
// Check if any of the functions in the SCC read or write memory. If they
// write memory then they can't be marked readnone or readonly.
bool ReadsMemory = false;
@@ -182,7 +191,11 @@ static bool addReadAttrs(const SCCNodeSet &SCCNodes, AARGetterT AARGetter) {
// Call the callable parameter to look up AA results for this function.
AAResults &AAR = AARGetter(*F);
- switch (checkFunctionMemoryAccess(*F, AAR, SCCNodes)) {
+ // Non-exact function definitions may not be selected at link time, and an
+ // alternative version that writes to memory may be selected. See the
+ // comment on GlobalValue::isDefinitionExact for more details.
+ switch (checkFunctionMemoryAccess(*F, F->hasExactDefinition(),
+ AAR, SCCNodes)) {
case MAK_MayWrite:
return false;
case MAK_ReadOnly:
@@ -212,11 +225,11 @@ static bool addReadAttrs(const SCCNodeSet &SCCNodes, AARGetterT AARGetter) {
AttrBuilder B;
B.addAttribute(Attribute::ReadOnly).addAttribute(Attribute::ReadNone);
F->removeAttributes(
- AttributeSet::FunctionIndex,
- AttributeSet::get(F->getContext(), AttributeSet::FunctionIndex, B));
+ AttributeList::FunctionIndex,
+ AttributeList::get(F->getContext(), AttributeList::FunctionIndex, B));
// Add in the new attribute.
- F->addAttribute(AttributeSet::FunctionIndex,
+ F->addAttribute(AttributeList::FunctionIndex,
ReadsMemory ? Attribute::ReadOnly : Attribute::ReadNone);
if (ReadsMemory)
@@ -522,7 +535,7 @@ static bool addArgumentReturnedAttrs(const SCCNodeSet &SCCNodes) {
if (Value *RetArg = FindRetArg()) {
auto *A = cast<Argument>(RetArg);
- A->addAttr(AttributeSet::get(F->getContext(), A->getArgNo() + 1, B));
+ A->addAttr(AttributeList::get(F->getContext(), A->getArgNo() + 1, B));
++NumReturned;
Changed = true;
}
@@ -531,6 +544,49 @@ static bool addArgumentReturnedAttrs(const SCCNodeSet &SCCNodes) {
return Changed;
}
+/// If a callsite has arguments that are also arguments to the parent function,
+/// try to propagate attributes from the callsite's arguments to the parent's
+/// arguments. This may be important because inlining can cause information loss
+/// when attribute knowledge disappears with the inlined call.
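+///
+/// Illustrative example (hypothetical IR): given
+///   declare void @callee(i32* nonnull)
+///   define void @caller(i32* %p) { call void @callee(i32* %p) ... }
+/// a call in the entry block that is guaranteed to execute lets %p be
+/// marked nonnull on @caller's own argument list as well.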
+static bool addArgumentAttrsFromCallsites(Function &F) {
+ if (!EnableNonnullArgPropagation)
+ return false;
+
+ bool Changed = false;
+
+ // For an argument attribute to transfer from a callsite to the parent, the
+ // call must be guaranteed to execute every time the parent is called.
+ // Conservatively, just check for calls in the entry block that are guaranteed
+ // to execute.
+ // TODO: This could be enhanced by testing if the callsite post-dominates the
+ // entry block or by doing simple forward walks or backward walks to the
+ // callsite.
+ BasicBlock &Entry = F.getEntryBlock();
+ for (Instruction &I : Entry) {
+ if (auto CS = CallSite(&I)) {
+ if (auto *CalledFunc = CS.getCalledFunction()) {
+ for (auto &CSArg : CalledFunc->args()) {
+ if (!CSArg.hasNonNullAttr())
+ continue;
+
+ // If the non-null callsite argument operand is an argument to 'F'
+ // (the caller) and the call is guaranteed to execute, then the value
+ // must be non-null throughout 'F'.
+ auto *FArg = dyn_cast<Argument>(CS.getArgOperand(CSArg.getArgNo()));
+ if (FArg && !FArg->hasNonNullAttr()) {
+ FArg->addAttr(Attribute::NonNull);
+ Changed = true;
+ }
+ }
+ }
+ }
+ if (!isGuaranteedToTransferExecutionToSuccessor(&I))
+ break;
+ }
+
+ return Changed;
+}
+
/// Deduce nocapture attributes for the SCC.
static bool addArgumentAttrs(const SCCNodeSet &SCCNodes) {
bool Changed = false;
@@ -549,6 +605,8 @@ static bool addArgumentAttrs(const SCCNodeSet &SCCNodes) {
if (!F->hasExactDefinition())
continue;
+ Changed |= addArgumentAttrsFromCallsites(*F);
+
// Functions that are readonly (or readnone) and nounwind and don't return
// a value can't capture arguments. Don't analyze them.
if (F->onlyReadsMemory() && F->doesNotThrow() &&
@@ -556,7 +614,7 @@ static bool addArgumentAttrs(const SCCNodeSet &SCCNodes) {
for (Function::arg_iterator A = F->arg_begin(), E = F->arg_end(); A != E;
++A) {
if (A->getType()->isPointerTy() && !A->hasNoCaptureAttr()) {
- A->addAttr(AttributeSet::get(F->getContext(), A->getArgNo() + 1, B));
+ A->addAttr(AttributeList::get(F->getContext(), A->getArgNo() + 1, B));
++NumNoCapture;
Changed = true;
}
@@ -576,7 +634,7 @@ static bool addArgumentAttrs(const SCCNodeSet &SCCNodes) {
if (Tracker.Uses.empty()) {
// If it's trivially not captured, mark it nocapture now.
A->addAttr(
- AttributeSet::get(F->getContext(), A->getArgNo() + 1, B));
+ AttributeList::get(F->getContext(), A->getArgNo() + 1, B));
++NumNoCapture;
Changed = true;
} else {
@@ -604,7 +662,7 @@ static bool addArgumentAttrs(const SCCNodeSet &SCCNodes) {
if (R != Attribute::None) {
AttrBuilder B;
B.addAttribute(R);
- A->addAttr(AttributeSet::get(A->getContext(), A->getArgNo() + 1, B));
+ A->addAttr(AttributeList::get(A->getContext(), A->getArgNo() + 1, B));
Changed = true;
R == Attribute::ReadOnly ? ++NumReadOnlyArg : ++NumReadNoneArg;
}
@@ -629,7 +687,7 @@ static bool addArgumentAttrs(const SCCNodeSet &SCCNodes) {
if (ArgumentSCC[0]->Uses.size() == 1 &&
ArgumentSCC[0]->Uses[0] == ArgumentSCC[0]) {
Argument *A = ArgumentSCC[0]->Definition;
- A->addAttr(AttributeSet::get(A->getContext(), A->getArgNo() + 1, B));
+ A->addAttr(AttributeList::get(A->getContext(), A->getArgNo() + 1, B));
++NumNoCapture;
Changed = true;
}
@@ -671,7 +729,7 @@ static bool addArgumentAttrs(const SCCNodeSet &SCCNodes) {
for (unsigned i = 0, e = ArgumentSCC.size(); i != e; ++i) {
Argument *A = ArgumentSCC[i]->Definition;
- A->addAttr(AttributeSet::get(A->getContext(), A->getArgNo() + 1, B));
+ A->addAttr(AttributeList::get(A->getContext(), A->getArgNo() + 1, B));
++NumNoCapture;
Changed = true;
}
@@ -708,8 +766,9 @@ static bool addArgumentAttrs(const SCCNodeSet &SCCNodes) {
for (unsigned i = 0, e = ArgumentSCC.size(); i != e; ++i) {
Argument *A = ArgumentSCC[i]->Definition;
// Clear out existing readonly/readnone attributes
- A->removeAttr(AttributeSet::get(A->getContext(), A->getArgNo() + 1, R));
- A->addAttr(AttributeSet::get(A->getContext(), A->getArgNo() + 1, B));
+ A->removeAttr(
+ AttributeList::get(A->getContext(), A->getArgNo() + 1, R));
+ A->addAttr(AttributeList::get(A->getContext(), A->getArgNo() + 1, B));
ReadAttr == Attribute::ReadOnly ? ++NumReadOnlyArg : ++NumReadNoneArg;
Changed = true;
}
@@ -769,7 +828,7 @@ static bool isFunctionMallocLike(Function *F, const SCCNodeSet &SCCNodes) {
case Instruction::Call:
case Instruction::Invoke: {
CallSite CS(RVI);
- if (CS.paramHasAttr(0, Attribute::NoAlias))
+ if (CS.hasRetAttr(Attribute::NoAlias))
break;
if (CS.getCalledFunction() && SCCNodes.count(CS.getCalledFunction()))
break;
@@ -905,7 +964,7 @@ static bool addNonNullAttrs(const SCCNodeSet &SCCNodes) {
// pointers.
for (Function *F : SCCNodes) {
// Already nonnull.
- if (F->getAttributes().hasAttribute(AttributeSet::ReturnIndex,
+ if (F->getAttributes().hasAttribute(AttributeList::ReturnIndex,
Attribute::NonNull))
continue;
@@ -926,7 +985,7 @@ static bool addNonNullAttrs(const SCCNodeSet &SCCNodes) {
// Mark the function eagerly since we may discover a function
// which prevents us from speculating about the entire SCC
DEBUG(dbgs() << "Eagerly marking " << F->getName() << " as nonnull\n");
- F->addAttribute(AttributeSet::ReturnIndex, Attribute::NonNull);
+ F->addAttribute(AttributeList::ReturnIndex, Attribute::NonNull);
++NumNonNullReturn;
MadeChange = true;
}
@@ -939,13 +998,13 @@ static bool addNonNullAttrs(const SCCNodeSet &SCCNodes) {
if (SCCReturnsNonNull) {
for (Function *F : SCCNodes) {
- if (F->getAttributes().hasAttribute(AttributeSet::ReturnIndex,
+ if (F->getAttributes().hasAttribute(AttributeList::ReturnIndex,
Attribute::NonNull) ||
!F->getReturnType()->isPointerTy())
continue;
DEBUG(dbgs() << "SCC marking " << F->getName() << " as nonnull\n");
- F->addAttribute(AttributeSet::ReturnIndex, Attribute::NonNull);
+ F->addAttribute(AttributeList::ReturnIndex, Attribute::NonNull);
++NumNonNullReturn;
MadeChange = true;
}
@@ -1163,19 +1222,7 @@ static bool runImpl(CallGraphSCC &SCC, AARGetterT AARGetter) {
bool PostOrderFunctionAttrsLegacyPass::runOnSCC(CallGraphSCC &SCC) {
if (skipSCC(SCC))
return false;
-
- // We compute dedicated AA results for each function in the SCC as needed. We
- // use a lambda referencing external objects so that they live long enough to
- // be queried, but we re-use them each time.
- Optional<BasicAAResult> BAR;
- Optional<AAResults> AAR;
- auto AARGetter = [&](Function &F) -> AAResults & {
- BAR.emplace(createLegacyPMBasicAAResult(*this, F));
- AAR.emplace(createLegacyPMAAResults(*this, F, *BAR));
- return *AAR;
- };
-
- return runImpl(SCC, AARGetter);
+ return runImpl(SCC, LegacyAARGetter(*this));
}
namespace {
@@ -1275,16 +1322,9 @@ PreservedAnalyses
ReversePostOrderFunctionAttrsPass::run(Module &M, ModuleAnalysisManager &AM) {
auto &CG = AM.getResult<CallGraphAnalysis>(M);
- bool Changed = deduceFunctionAttributeInRPO(M, CG);
-
- // CallGraphAnalysis holds AssertingVH and must be invalidated eagerly so
- // that other passes don't delete stuff from under it.
- // FIXME: We need to invalidate this to avoid PR28400. Is there a better
- // solution?
- AM.invalidate<CallGraphAnalysis>(M);
-
- if (!Changed)
+ if (!deduceFunctionAttributeInRPO(M, CG))
return PreservedAnalyses::all();
+
PreservedAnalyses PA;
PA.preserve<CallGraphAnalysis>();
return PA;
diff --git a/lib/Transforms/IPO/FunctionImport.cpp b/lib/Transforms/IPO/FunctionImport.cpp
index 6b32f6c31f72..d66411f04cc4 100644
--- a/lib/Transforms/IPO/FunctionImport.cpp
+++ b/lib/Transforms/IPO/FunctionImport.cpp
@@ -75,12 +75,6 @@ static cl::opt<bool> PrintImports("print-imports", cl::init(false), cl::Hidden,
static cl::opt<bool> ComputeDead("compute-dead", cl::init(true), cl::Hidden,
cl::desc("Compute dead symbols"));
-// Temporary allows the function import pass to disable always linking
-// referenced discardable symbols.
-static cl::opt<bool>
- DontForceImportReferencedDiscardableSymbols("disable-force-link-odr",
- cl::init(false), cl::Hidden);
-
static cl::opt<bool> EnableImportMetadata(
"enable-import-metadata", cl::init(
#if !defined(NDEBUG)
@@ -124,7 +118,7 @@ namespace {
static const GlobalValueSummary *
selectCallee(const ModuleSummaryIndex &Index,
const GlobalValueSummaryList &CalleeSummaryList,
- unsigned Threshold) {
+ unsigned Threshold, StringRef CallerModulePath) {
auto It = llvm::find_if(
CalleeSummaryList,
[&](const std::unique_ptr<GlobalValueSummary> &SummaryPtr) {
@@ -145,6 +139,21 @@ selectCallee(const ModuleSummaryIndex &Index,
auto *Summary = cast<FunctionSummary>(GVSummary);
+ // If this is a local function, make sure we import the copy
+ // in the caller's module. The only time a local function can
+ // share an entry in the index is if there is a local with the same name
+ // in another module that had the same source file name (in a different
+ // directory), where each was compiled in its own directory so there
+ // was no distinguishing path.
+ // However, do the import from another module if there is only one
+ // entry in the list - in that case this must be a reference due
+ // to indirect call profile data, since a function pointer can point to
+ // a local in another module.
+ if (GlobalValue::isLocalLinkage(Summary->linkage()) &&
+ CalleeSummaryList.size() > 1 &&
+ Summary->modulePath() != CallerModulePath)
+ return false;
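+
+ // Illustrative scenario (hypothetical): a/foo.c and b/foo.c each define
+ // "static void f()" and each was compiled as plain "foo.c"; both locals
+ // then collapse to one GUID with two summary entries, and only the entry
+ // whose modulePath() matches the caller's module may be imported.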
+
if (Summary->instCount() > Threshold)
return false;
@@ -163,11 +172,13 @@ selectCallee(const ModuleSummaryIndex &Index,
/// null if there's no match.
static const GlobalValueSummary *selectCallee(GlobalValue::GUID GUID,
unsigned Threshold,
- const ModuleSummaryIndex &Index) {
+ const ModuleSummaryIndex &Index,
+ StringRef CallerModulePath) {
auto CalleeSummaryList = Index.findGlobalValueSummaryList(GUID);
if (CalleeSummaryList == Index.end())
return nullptr; // This function does not have a summary
- return selectCallee(Index, CalleeSummaryList->second, Threshold);
+ return selectCallee(Index, CalleeSummaryList->second, Threshold,
+ CallerModulePath);
}
using EdgeInfo = std::tuple<const FunctionSummary *, unsigned /* Threshold */,
@@ -186,6 +197,15 @@ static void computeImportForFunction(
auto GUID = Edge.first.getGUID();
DEBUG(dbgs() << " edge -> " << GUID << " Threshold:" << Threshold << "\n");
+ if (Index.findGlobalValueSummaryList(GUID) == Index.end()) {
+ // For SamplePGO, the indirect call targets for local functions will
+ // have their original names annotated in the profile. We try to find the
+ // corresponding PGOFuncName as the GUID.
+ GUID = Index.getGUIDFromOriginalID(GUID);
+ if (GUID == 0)
+ continue;
+ }
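+
+ // Illustrative case (hypothetical): the profile names a local "foo" by the
+ // GUID of its original name, while the index keys it by the GUID of its
+ // PGOFuncName (e.g. "foo.c:foo"); getGUIDFromOriginalID bridges the two
+ // and returns 0 when no mapping exists.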
+
if (DefinedGVSummaries.count(GUID)) {
DEBUG(dbgs() << "ignored! Target already in destination module.\n");
continue;
@@ -202,7 +222,8 @@ static void computeImportForFunction(
const auto NewThreshold =
Threshold * GetBonusMultiplier(Edge.second.Hotness);
- auto *CalleeSummary = selectCallee(GUID, NewThreshold, Index);
+ auto *CalleeSummary =
+ selectCallee(GUID, NewThreshold, Index, Summary.modulePath());
if (!CalleeSummary) {
DEBUG(dbgs() << "ignored! No qualifying callee with summary found.\n");
continue;
@@ -522,6 +543,23 @@ llvm::EmitImportsFiles(StringRef ModulePath, StringRef OutputFilename,
/// Fixup WeakForLinker linkages in \p TheModule based on summary analysis.
void llvm::thinLTOResolveWeakForLinkerModule(
Module &TheModule, const GVSummaryMapTy &DefinedGlobals) {
+ auto ConvertToDeclaration = [](GlobalValue &GV) {
+ DEBUG(dbgs() << "Converting to a declaration: `" << GV.getName() << "\n");
+ if (Function *F = dyn_cast<Function>(&GV)) {
+ F->deleteBody();
+ F->clearMetadata();
+ } else if (GlobalVariable *V = dyn_cast<GlobalVariable>(&GV)) {
+ V->setInitializer(nullptr);
+ V->setLinkage(GlobalValue::ExternalLinkage);
+ V->clearMetadata();
+ } else
+ // For now we don't resolve or drop aliases. Once we do we'll
+ // need to add support here for creating either a function or
+ // variable declaration, and return the new GlobalValue* for
+ // the caller to use.
+ llvm_unreachable("Expected function or variable");
+ };
+
auto updateLinkage = [&](GlobalValue &GV) {
if (!GlobalValue::isWeakForLinker(GV.getLinkage()))
return;
@@ -532,18 +570,25 @@ void llvm::thinLTOResolveWeakForLinkerModule(
auto NewLinkage = GS->second->linkage();
if (NewLinkage == GV.getLinkage())
return;
- DEBUG(dbgs() << "ODR fixing up linkage for `" << GV.getName() << "` from "
- << GV.getLinkage() << " to " << NewLinkage << "\n");
- GV.setLinkage(NewLinkage);
- // Remove functions converted to available_externally from comdats,
+ // Check for a non-prevailing def that has interposable linkage
+ // (e.g. non-odr weak or linkonce). In that case we can't simply
+ // convert to available_externally, since it would lose the
+ // interposable property and possibly get inlined. Simply drop
+ // the definition in that case.
+ if (GlobalValue::isAvailableExternallyLinkage(NewLinkage) &&
+ GlobalValue::isInterposableLinkage(GV.getLinkage()))
+ ConvertToDeclaration(GV);
+ else {
+ DEBUG(dbgs() << "ODR fixing up linkage for `" << GV.getName() << "` from "
+ << GV.getLinkage() << " to " << NewLinkage << "\n");
+ GV.setLinkage(NewLinkage);
+ }
+ // Remove declarations from comdats, including available_externally
// as this is a declaration for the linker, and will be dropped eventually.
// It is illegal for comdats to contain declarations.
auto *GO = dyn_cast_or_null<GlobalObject>(&GV);
- if (GO && GO->isDeclarationForLinker() && GO->hasComdat()) {
- assert(GO->hasAvailableExternallyLinkage() &&
- "Expected comdat on definition (possibly available external)");
+ if (GO && GO->isDeclarationForLinker() && GO->hasComdat())
GO->setComdat(nullptr);
- }
};
// Process functions and globals now
@@ -562,7 +607,7 @@ void llvm::thinLTOInternalizeModule(Module &TheModule,
// the current module.
StringSet<> AsmUndefinedRefs;
ModuleSymbolTable::CollectAsmSymbols(
- Triple(TheModule.getTargetTriple()), TheModule.getModuleInlineAsm(),
+ TheModule,
[&AsmUndefinedRefs](StringRef Name, object::BasicSymbolRef::Flags Flags) {
if (Flags & object::BasicSymbolRef::SF_Undefined)
AsmUndefinedRefs.insert(Name);
@@ -617,14 +662,12 @@ void llvm::thinLTOInternalizeModule(Module &TheModule,
// index.
//
Expected<bool> FunctionImporter::importFunctions(
- Module &DestModule, const FunctionImporter::ImportMapTy &ImportList,
- bool ForceImportReferencedDiscardableSymbols) {
+ Module &DestModule, const FunctionImporter::ImportMapTy &ImportList) {
DEBUG(dbgs() << "Starting import for Module "
<< DestModule.getModuleIdentifier() << "\n");
unsigned ImportedCount = 0;
- // Linker that will be used for importing function
- Linker TheLinker(DestModule);
+ IRMover Mover(DestModule);
// Do the actual import of functions now, one Module at a time
std::set<StringRef> ModuleNameOrderedList;
for (auto &FunctionsToImportPerModule : ImportList) {
@@ -648,7 +691,7 @@ Expected<bool> FunctionImporter::importFunctions(
auto &ImportGUIDs = FunctionsToImportPerModule->second;
// Find the globals to import
- DenseSet<const GlobalValue *> GlobalsToImport;
+ SetVector<GlobalValue *> GlobalsToImport;
for (Function &F : *SrcModule) {
if (!F.hasName())
continue;
@@ -687,6 +730,13 @@ Expected<bool> FunctionImporter::importFunctions(
}
}
for (GlobalAlias &GA : SrcModule->aliases()) {
+ // FIXME: This should eventually be controlled entirely by the summary.
+ if (FunctionImportGlobalProcessing::doImportAsDefinition(
+ &GA, &GlobalsToImport)) {
+ GlobalsToImport.insert(&GA);
+ continue;
+ }
+
if (!GA.hasName())
continue;
auto GUID = GA.getGUID();
@@ -731,12 +781,9 @@ Expected<bool> FunctionImporter::importFunctions(
<< " from " << SrcModule->getSourceFileName() << "\n";
}
- // Instruct the linker that the client will take care of linkonce resolution
- unsigned Flags = Linker::Flags::None;
- if (!ForceImportReferencedDiscardableSymbols)
- Flags |= Linker::Flags::DontForceLinkLinkonceODR;
-
- if (TheLinker.linkInModule(std::move(SrcModule), Flags, &GlobalsToImport))
+ if (Mover.move(std::move(SrcModule), GlobalsToImport.getArrayRef(),
+ [](GlobalValue &, IRMover::ValueAdder) {},
+ /*IsPerformingImport=*/true))
report_fatal_error("Function Import: link error");
ImportedCount += GlobalsToImport.size();
@@ -796,8 +843,7 @@ static bool doImportingForModule(Module &M) {
return loadFile(Identifier, M.getContext());
};
FunctionImporter Importer(*Index, ModuleLoader);
- Expected<bool> Result = Importer.importFunctions(
- M, ImportList, !DontForceImportReferencedDiscardableSymbols);
+ Expected<bool> Result = Importer.importFunctions(M, ImportList);
// FIXME: Probably need to propagate Errors through the pass manager.
if (!Result) {
diff --git a/lib/Transforms/IPO/GlobalDCE.cpp b/lib/Transforms/IPO/GlobalDCE.cpp
index 7a04de3d12db..c91e8b454927 100644
--- a/lib/Transforms/IPO/GlobalDCE.cpp
+++ b/lib/Transforms/IPO/GlobalDCE.cpp
@@ -25,7 +25,7 @@
#include "llvm/Transforms/IPO.h"
#include "llvm/Transforms/Utils/CtorUtils.h"
#include "llvm/Transforms/Utils/GlobalStatus.h"
-#include <unordered_map>
+
using namespace llvm;
#define DEBUG_TYPE "globaldce"
@@ -50,7 +50,14 @@ namespace {
if (skipModule(M))
return false;
+ // We need a minimally functional dummy module analysis manager. It needs
+ // to at least know about the possibility of proxying a function analysis
+ // manager.
+ FunctionAnalysisManager DummyFAM;
ModuleAnalysisManager DummyMAM;
+ DummyMAM.registerPass(
+ [&] { return FunctionAnalysisManagerModuleProxy(DummyFAM); });
+
auto PA = Impl.run(M, DummyMAM);
return !PA.areAllPreserved();
}
@@ -78,9 +85,67 @@ static bool isEmptyFunction(Function *F) {
return RI.getReturnValue() == nullptr;
}
-PreservedAnalyses GlobalDCEPass::run(Module &M, ModuleAnalysisManager &) {
+/// Compute the set of GlobalValues that depend on V.
+/// The recursion stops as soon as a GlobalValue is encountered.
+void GlobalDCEPass::ComputeDependencies(Value *V,
+ SmallPtrSetImpl<GlobalValue *> &Deps) {
+ if (auto *I = dyn_cast<Instruction>(V)) {
+ Function *Parent = I->getParent()->getParent();
+ Deps.insert(Parent);
+ } else if (auto *GV = dyn_cast<GlobalValue>(V)) {
+ Deps.insert(GV);
+ } else if (auto *CE = dyn_cast<Constant>(V)) {
+ // Avoid walking the whole tree of a big ConstantExpr multiple times.
+ auto Where = ConstantDependenciesCache.find(CE);
+ if (Where != ConstantDependenciesCache.end()) {
+ auto const &K = Where->second;
+ Deps.insert(K.begin(), K.end());
+ } else {
+ SmallPtrSetImpl<GlobalValue *> &LocalDeps = ConstantDependenciesCache[CE];
+ for (User *CEUser : CE->users())
+ ComputeDependencies(CEUser, LocalDeps);
+ Deps.insert(LocalDeps.begin(), LocalDeps.end());
+ }
+ }
+}
+
+void GlobalDCEPass::UpdateGVDependencies(GlobalValue &GV) {
+ SmallPtrSet<GlobalValue *, 8> Deps;
+ for (User *User : GV.users())
+ ComputeDependencies(User, Deps);
+ Deps.erase(&GV); // Remove self-reference.
+ for (GlobalValue *GVU : Deps) {
+ GVDependencies.insert(std::make_pair(GVU, &GV));
+ }
+}
+
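+// Illustrative example (hypothetical module): for
+//   @g = internal global i32 0
+//   define void @f() { ...; store i32 1, i32* @g; ... }
+// walking @g's users reaches the store, whose parent function @f is the
+// recorded dependency; GVDependencies then maps @f -> @g, so marking @f
+// live later marks @g live as well.
+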
+/// Mark a GlobalValue as live.
+void GlobalDCEPass::MarkLive(GlobalValue &GV,
+ SmallVectorImpl<GlobalValue *> *Updates) {
+ auto const Ret = AliveGlobals.insert(&GV);
+ if (!Ret.second)
+ return;
+
+ if (Updates)
+ Updates->push_back(&GV);
+ if (Comdat *C = GV.getComdat()) {
+ for (auto &&CM : make_range(ComdatMembers.equal_range(C)))
+ MarkLive(*CM.second, Updates); // Recursion depth is only two because only
+ // globals in the same comdat are visited.
+ }
+}
+
+PreservedAnalyses GlobalDCEPass::run(Module &M, ModuleAnalysisManager &MAM) {
bool Changed = false;
+ // The algorithm first computes the set L of global variables that are
+ // trivially live. Then it walks the initialization of these variables to
+ // compute the globals used to initialize them, which effectively builds a
+ // directed graph where nodes are global variables, and an edge from A to B
+ // means B is used to initialize A. Finally, it propagates the liveness
+ // information through the graph starting from the nodes in L. Nodes not
+ // marked as alive are discarded.
+
// Remove empty functions from the global ctors list.
Changed |= optimizeGlobalCtorsList(M, isEmptyFunction);
@@ -103,21 +168,39 @@ PreservedAnalyses GlobalDCEPass::run(Module &M, ModuleAnalysisManager &) {
// initializer.
if (!GO.isDeclaration() && !GO.hasAvailableExternallyLinkage())
if (!GO.isDiscardableIfUnused())
- GlobalIsNeeded(&GO);
+ MarkLive(GO);
+
+ UpdateGVDependencies(GO);
}
+ // Compute direct dependencies of aliases.
for (GlobalAlias &GA : M.aliases()) {
Changed |= RemoveUnusedGlobalValue(GA);
// Externally visible aliases are needed.
if (!GA.isDiscardableIfUnused())
- GlobalIsNeeded(&GA);
+ MarkLive(GA);
+
+ UpdateGVDependencies(GA);
}
+ // Compute direct dependencies of ifuncs.
for (GlobalIFunc &GIF : M.ifuncs()) {
Changed |= RemoveUnusedGlobalValue(GIF);
// Externally visible ifuncs are needed.
if (!GIF.isDiscardableIfUnused())
- GlobalIsNeeded(&GIF);
+ MarkLive(GIF);
+
+ UpdateGVDependencies(GIF);
+ }
+
+ // Propagate liveness from collected Global Values through the computed
+ // dependencies.
+ SmallVector<GlobalValue *, 8> NewLiveGVs{AliveGlobals.begin(),
+ AliveGlobals.end()};
+ while (!NewLiveGVs.empty()) {
+ GlobalValue *LGV = NewLiveGVs.pop_back_val();
+ for (auto &&GVD : make_range(GVDependencies.equal_range(LGV)))
+ MarkLive(*GVD.second, &NewLiveGVs);
}
// Now that all globals which are needed are in the AliveGlobals set, we loop
@@ -154,7 +237,7 @@ PreservedAnalyses GlobalDCEPass::run(Module &M, ModuleAnalysisManager &) {
GA.setAliasee(nullptr);
}
- // The third pass drops targets of ifuncs which are dead...
+ // The fourth pass drops targets of ifuncs which are dead...
std::vector<GlobalIFunc*> DeadIFuncs;
for (GlobalIFunc &GIF : M.ifuncs())
if (!AliveGlobals.count(&GIF)) {
@@ -188,7 +271,8 @@ PreservedAnalyses GlobalDCEPass::run(Module &M, ModuleAnalysisManager &) {
// Make sure that all memory is released
AliveGlobals.clear();
- SeenConstants.clear();
+ ConstantDependenciesCache.clear();
+ GVDependencies.clear();
ComdatMembers.clear();
if (Changed)
@@ -196,60 +280,6 @@ PreservedAnalyses GlobalDCEPass::run(Module &M, ModuleAnalysisManager &) {
return PreservedAnalyses::all();
}
-/// GlobalIsNeeded - the specific global value as needed, and
-/// recursively mark anything that it uses as also needed.
-void GlobalDCEPass::GlobalIsNeeded(GlobalValue *G) {
- // If the global is already in the set, no need to reprocess it.
- if (!AliveGlobals.insert(G).second)
- return;
-
- if (Comdat *C = G->getComdat()) {
- for (auto &&CM : make_range(ComdatMembers.equal_range(C)))
- GlobalIsNeeded(CM.second);
- }
-
- if (GlobalVariable *GV = dyn_cast<GlobalVariable>(G)) {
- // If this is a global variable, we must make sure to add any global values
- // referenced by the initializer to the alive set.
- if (GV->hasInitializer())
- MarkUsedGlobalsAsNeeded(GV->getInitializer());
- } else if (GlobalIndirectSymbol *GIS = dyn_cast<GlobalIndirectSymbol>(G)) {
- // The target of a global alias or ifunc is needed.
- MarkUsedGlobalsAsNeeded(GIS->getIndirectSymbol());
- } else {
- // Otherwise this must be a function object. We have to scan the body of
- // the function looking for constants and global values which are used as
- // operands. Any operands of these types must be processed to ensure that
- // any globals used will be marked as needed.
- Function *F = cast<Function>(G);
-
- for (Use &U : F->operands())
- MarkUsedGlobalsAsNeeded(cast<Constant>(U.get()));
-
- for (BasicBlock &BB : *F)
- for (Instruction &I : BB)
- for (Use &U : I.operands())
- if (GlobalValue *GV = dyn_cast<GlobalValue>(U))
- GlobalIsNeeded(GV);
- else if (Constant *C = dyn_cast<Constant>(U))
- MarkUsedGlobalsAsNeeded(C);
- }
-}
-
-void GlobalDCEPass::MarkUsedGlobalsAsNeeded(Constant *C) {
- if (GlobalValue *GV = dyn_cast<GlobalValue>(C))
- return GlobalIsNeeded(GV);
-
- // Loop over all of the operands of the constant, adding any globals they
- // use to the list of needed globals.
- for (Use &U : C->operands()) {
- // If we've already processed this constant there's no need to do it again.
- Constant *Op = dyn_cast<Constant>(U);
- if (Op && SeenConstants.insert(Op).second)
- MarkUsedGlobalsAsNeeded(Op);
- }
-}
-
// RemoveUnusedGlobalValue - Loop over all of the uses of the specified
// GlobalValue, looking for the constant pointer ref that may be pointing to it.
// If found, check to see if the constant pointer ref is safe to destroy, and if
diff --git a/lib/Transforms/IPO/GlobalOpt.cpp b/lib/Transforms/IPO/GlobalOpt.cpp
index 5b0d5e3bc01e..ade4f21ceb52 100644
--- a/lib/Transforms/IPO/GlobalOpt.cpp
+++ b/lib/Transforms/IPO/GlobalOpt.cpp
@@ -1819,12 +1819,14 @@ static bool processInternalGlobal(
GS.AccessingFunction->doesNotRecurse() &&
isPointerValueDeadOnEntryToFunction(GS.AccessingFunction, GV,
LookupDomTree)) {
+ const DataLayout &DL = GV->getParent()->getDataLayout();
+
DEBUG(dbgs() << "LOCALIZING GLOBAL: " << *GV << "\n");
Instruction &FirstI = const_cast<Instruction&>(*GS.AccessingFunction
->getEntryBlock().begin());
Type *ElemTy = GV->getValueType();
// FIXME: Pass Global's alignment when globals have alignment
- AllocaInst *Alloca = new AllocaInst(ElemTy, nullptr,
+ AllocaInst *Alloca = new AllocaInst(ElemTy, DL.getAllocaAddrSpace(), nullptr,
GV->getName(), &FirstI);
if (!isa<UndefValue>(GV->getInitializer()))
new StoreInst(GV->getInitializer(), Alloca, &FirstI);
@@ -1977,7 +1979,7 @@ static void ChangeCalleesToFastCall(Function *F) {
}
}
-static AttributeSet StripNest(LLVMContext &C, const AttributeSet &Attrs) {
+static AttributeList StripNest(LLVMContext &C, const AttributeList &Attrs) {
for (unsigned i = 0, e = Attrs.getNumSlots(); i != e; ++i) {
unsigned Index = Attrs.getSlotIndex(i);
if (!Attrs.getSlotAttributes(i).hasAttribute(Index, Attribute::Nest))
@@ -2387,7 +2389,7 @@ OptimizeGlobalAliases(Module &M,
}
static Function *FindCXAAtExit(Module &M, TargetLibraryInfo *TLI) {
- LibFunc::Func F = LibFunc::cxa_atexit;
+ LibFunc F = LibFunc_cxa_atexit;
if (!TLI->has(F))
return nullptr;
@@ -2396,7 +2398,7 @@ static Function *FindCXAAtExit(Module &M, TargetLibraryInfo *TLI) {
return nullptr;
// Make sure that the function has the correct prototype.
- if (!TLI->getLibFunc(*Fn, F) || F != LibFunc::cxa_atexit)
+ if (!TLI->getLibFunc(*Fn, F) || F != LibFunc_cxa_atexit)
return nullptr;
return Fn;
diff --git a/lib/Transforms/IPO/GlobalSplit.cpp b/lib/Transforms/IPO/GlobalSplit.cpp
index bbbd096e89c0..4705ebe265ae 100644
--- a/lib/Transforms/IPO/GlobalSplit.cpp
+++ b/lib/Transforms/IPO/GlobalSplit.cpp
@@ -85,7 +85,16 @@ bool splitGlobal(GlobalVariable &GV) {
uint64_t ByteOffset = cast<ConstantInt>(
cast<ConstantAsMetadata>(Type->getOperand(0))->getValue())
->getZExtValue();
- if (ByteOffset < SplitBegin || ByteOffset >= SplitEnd)
+ // Type metadata may be attached one byte after the end of the vtable, for
+ // classes without virtual methods in the Itanium ABI. AFAIK, it is never
+ // attached to the first byte of a vtable. Subtract one to get the right
+ // slice.
+ // This is making an assumption that vtable groups are the only kinds of
+ // global variables that !type metadata can be attached to, and that they
+ // are either Itanium ABI vtable groups or contain a single vtable (i.e.
+ // Microsoft ABI vtables).
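+ // Illustrative layout (hypothetical): with a first slice at [0,16) and a
+ // second at [16,32), !type metadata at ByteOffset 16 really describes the
+ // first slice (one past its end); the ByteOffset - 1 adjustment below
+ // assigns it there.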
+ uint64_t AttachedTo = (ByteOffset == 0) ? ByteOffset : ByteOffset - 1;
+ if (AttachedTo < SplitBegin || AttachedTo >= SplitEnd)
continue;
SplitGV->addMetadata(
LLVMContext::MD_type,
diff --git a/lib/Transforms/IPO/IPConstantPropagation.cpp b/lib/Transforms/IPO/IPConstantPropagation.cpp
index 916135e33cd5..349807496dc2 100644
--- a/lib/Transforms/IPO/IPConstantPropagation.cpp
+++ b/lib/Transforms/IPO/IPConstantPropagation.cpp
@@ -136,7 +136,13 @@ static bool PropagateConstantReturn(Function &F) {
// For more details, see GlobalValue::mayBeDerefined.
if (!F.isDefinitionExact())
return false;
-
+
+ // Don't touch naked functions. They may contain asm returning a
+ // value we don't see, so we may end up interprocedurally propagating
+ // the return value incorrectly.
+ if (F.hasFnAttribute(Attribute::Naked))
+ return false;
+
// Check to see if this function returns a constant.
SmallVector<Value *,4> RetVals;
StructType *STy = dyn_cast<StructType>(F.getReturnType());
diff --git a/lib/Transforms/IPO/InlineSimple.cpp b/lib/Transforms/IPO/InlineSimple.cpp
index 1770445b413f..50e7cc89a3b3 100644
--- a/lib/Transforms/IPO/InlineSimple.cpp
+++ b/lib/Transforms/IPO/InlineSimple.cpp
@@ -48,7 +48,7 @@ public:
}
explicit SimpleInliner(InlineParams Params)
- : LegacyInlinerBase(ID), Params(Params) {
+ : LegacyInlinerBase(ID), Params(std::move(Params)) {
initializeSimpleInlinerPass(*PassRegistry::getPassRegistry());
}
@@ -61,7 +61,8 @@ public:
[&](Function &F) -> AssumptionCache & {
return ACT->getAssumptionCache(F);
};
- return llvm::getInlineCost(CS, Params, TTI, GetAssumptionCache, PSI);
+ return llvm::getInlineCost(CS, Params, TTI, GetAssumptionCache,
+ /*GetBFI=*/None, PSI);
}
bool runOnSCC(CallGraphSCC &SCC) override;
@@ -92,8 +93,12 @@ Pass *llvm::createFunctionInliningPass(int Threshold) {
}
Pass *llvm::createFunctionInliningPass(unsigned OptLevel,
- unsigned SizeOptLevel) {
- return new SimpleInliner(llvm::getInlineParams(OptLevel, SizeOptLevel));
+ unsigned SizeOptLevel,
+ bool DisableInlineHotCallSite) {
+ auto Param = llvm::getInlineParams(OptLevel, SizeOptLevel);
+ if (DisableInlineHotCallSite)
+ Param.HotCallSiteThreshold = 0;
+ return new SimpleInliner(Param);
}
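
// Illustrative call (hypothetical driver code):
//   createFunctionInliningPass(/*OptLevel=*/2, /*SizeOptLevel=*/0,
//                              /*DisableInlineHotCallSite=*/true)
// zeroes HotCallSiteThreshold, so hot call sites are effectively never
// inlined by this pass instance.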
Pass *llvm::createFunctionInliningPass(InlineParams &Params) {
diff --git a/lib/Transforms/IPO/Inliner.cpp b/lib/Transforms/IPO/Inliner.cpp
index 3f4731c937d1..6c83c99ae3be 100644
--- a/lib/Transforms/IPO/Inliner.cpp
+++ b/lib/Transforms/IPO/Inliner.cpp
@@ -19,6 +19,7 @@
#include "llvm/Analysis/AliasAnalysis.h"
#include "llvm/Analysis/AssumptionCache.h"
#include "llvm/Analysis/BasicAliasAnalysis.h"
+#include "llvm/Analysis/BlockFrequencyInfo.h"
#include "llvm/Analysis/CallGraph.h"
#include "llvm/Analysis/InlineCost.h"
#include "llvm/Analysis/OptimizationDiagnosticInfo.h"
@@ -260,8 +261,8 @@ static bool InlineCallIfPossible(
/// Return true if inlining of CS can block the caller from being
/// inlined which is proved to be more beneficial. \p IC is the
/// estimated inline cost associated with callsite \p CS.
-/// \p TotalAltCost will be set to the estimated cost of inlining the caller
-/// if \p CS is suppressed for inlining.
+/// \p TotalSecondaryCost will be set to the estimated cost of inlining the
+/// caller if \p CS is suppressed for inlining.
static bool
shouldBeDeferred(Function *Caller, CallSite CS, InlineCost IC,
int &TotalSecondaryCost,
@@ -288,7 +289,7 @@ shouldBeDeferred(Function *Caller, CallSite CS, InlineCost IC,
// treating them as truly abstract units etc.
TotalSecondaryCost = 0;
// The candidate cost to be imposed upon the current function.
- int CandidateCost = IC.getCost() - (InlineConstants::CallPenalty + 1);
+ int CandidateCost = IC.getCost() - 1;
// This bool tracks what happens if we do NOT inline C into B.
bool callerWillBeRemoved = Caller->hasLocalLinkage();
// This bool tracks what happens if we DO inline C into B.
@@ -325,7 +326,7 @@ shouldBeDeferred(Function *Caller, CallSite CS, InlineCost IC,
// one is set very low by getInlineCost, in anticipation that Caller will
// be removed entirely. We did not account for this above unless there
// is only one caller of Caller.
- if (callerWillBeRemoved && !Caller->use_empty())
+ if (callerWillBeRemoved && !Caller->hasOneUse())
TotalSecondaryCost -= InlineConstants::LastCallToStaticBonus;
if (inliningPreventsSomeOuterInline && TotalSecondaryCost < IC.getCost())
@@ -342,6 +343,7 @@ static bool shouldInline(CallSite CS,
InlineCost IC = GetInlineCost(CS);
Instruction *Call = CS.getInstruction();
Function *Callee = CS.getCalledFunction();
+ Function *Caller = CS.getCaller();
if (IC.isAlways()) {
DEBUG(dbgs() << " Inlining: cost=always"
@@ -355,19 +357,20 @@ static bool shouldInline(CallSite CS,
if (IC.isNever()) {
DEBUG(dbgs() << " NOT Inlining: cost=never"
<< ", Call: " << *CS.getInstruction() << "\n");
- ORE.emit(OptimizationRemarkAnalysis(DEBUG_TYPE, "NeverInline", Call)
- << NV("Callee", Callee)
- << " should never be inlined (cost=never)");
+ ORE.emit(OptimizationRemarkMissed(DEBUG_TYPE, "NeverInline", Call)
+ << NV("Callee", Callee) << " not inlined into "
+ << NV("Caller", Caller)
+ << " because it should never be inlined (cost=never)");
return false;
}
- Function *Caller = CS.getCaller();
if (!IC) {
DEBUG(dbgs() << " NOT Inlining: cost=" << IC.getCost()
<< ", thres=" << (IC.getCostDelta() + IC.getCost())
<< ", Call: " << *CS.getInstruction() << "\n");
- ORE.emit(OptimizationRemarkAnalysis(DEBUG_TYPE, "TooCostly", Call)
- << NV("Callee", Callee) << " too costly to inline (cost="
+ ORE.emit(OptimizationRemarkMissed(DEBUG_TYPE, "TooCostly", Call)
+ << NV("Callee", Callee) << " not inlined into "
+ << NV("Caller", Caller) << " because too costly to inline (cost="
<< NV("Cost", IC.getCost()) << ", threshold="
<< NV("Threshold", IC.getCostDelta() + IC.getCost()) << ")");
return false;
@@ -378,8 +381,8 @@ static bool shouldInline(CallSite CS,
DEBUG(dbgs() << " NOT Inlining: " << *CS.getInstruction()
<< " Cost = " << IC.getCost()
<< ", outer Cost = " << TotalSecondaryCost << '\n');
- ORE.emit(OptimizationRemarkAnalysis(DEBUG_TYPE,
- "IncreaseCostInOtherContexts", Call)
+ ORE.emit(OptimizationRemarkMissed(DEBUG_TYPE, "IncreaseCostInOtherContexts",
+ Call)
<< "Not inlining. Cost of inlining " << NV("Callee", Callee)
<< " increases the cost of inlining " << NV("Caller", Caller)
<< " in other contexts");
@@ -552,16 +555,11 @@ inlineCallsImpl(CallGraphSCC &SCC, CallGraph &CG,
// If the policy determines that we should inline this function,
// try to do so.
- using namespace ore;
- if (!shouldInline(CS, GetInlineCost, ORE)) {
- ORE.emit(
- OptimizationRemarkMissed(DEBUG_TYPE, "NotInlined", DLoc, Block)
- << NV("Callee", Callee) << " will not be inlined into "
- << NV("Caller", Caller));
+ if (!shouldInline(CS, GetInlineCost, ORE))
continue;
- }
// Attempt to inline the function.
+ using namespace ore;
if (!InlineCallIfPossible(CS, InlineInfo, InlinedArrayAllocas,
InlineHistoryID, InsertLifetime, AARGetter,
ImportedFunctionsStats)) {
@@ -638,22 +636,12 @@ bool LegacyInlinerBase::inlineCalls(CallGraphSCC &SCC) {
ACT = &getAnalysis<AssumptionCacheTracker>();
PSI = getAnalysis<ProfileSummaryInfoWrapperPass>().getPSI();
auto &TLI = getAnalysis<TargetLibraryInfoWrapperPass>().getTLI();
- // We compute dedicated AA results for each function in the SCC as needed. We
- // use a lambda referencing external objects so that they live long enough to
- // be queried, but we re-use them each time.
- Optional<BasicAAResult> BAR;
- Optional<AAResults> AAR;
- auto AARGetter = [&](Function &F) -> AAResults & {
- BAR.emplace(createLegacyPMBasicAAResult(*this, F));
- AAR.emplace(createLegacyPMAAResults(*this, F, *BAR));
- return *AAR;
- };
auto GetAssumptionCache = [&](Function &F) -> AssumptionCache & {
return ACT->getAssumptionCache(F);
};
return inlineCallsImpl(SCC, CG, GetAssumptionCache, PSI, TLI, InsertLifetime,
[this](CallSite CS) { return getInlineCost(CS); },
- AARGetter, ImportedFunctionsStats);
+ LegacyAARGetter(*this), ImportedFunctionsStats);
}
/// Remove now-dead linkonce functions at the end of
@@ -750,9 +738,6 @@ bool LegacyInlinerBase::removeDeadFunctions(CallGraph &CG,
PreservedAnalyses InlinerPass::run(LazyCallGraph::SCC &InitialC,
CGSCCAnalysisManager &AM, LazyCallGraph &CG,
CGSCCUpdateResult &UR) {
- FunctionAnalysisManager &FAM =
- AM.getResult<FunctionAnalysisManagerCGSCCProxy>(InitialC, CG)
- .getManager();
const ModuleAnalysisManager &MAM =
AM.getResult<ModuleAnalysisManagerCGSCCProxy>(InitialC, CG).getManager();
bool Changed = false;
@@ -761,35 +746,52 @@ PreservedAnalyses InlinerPass::run(LazyCallGraph::SCC &InitialC,
Module &M = *InitialC.begin()->getFunction().getParent();
ProfileSummaryInfo *PSI = MAM.getCachedResult<ProfileSummaryAnalysis>(M);
- std::function<AssumptionCache &(Function &)> GetAssumptionCache =
- [&](Function &F) -> AssumptionCache & {
- return FAM.getResult<AssumptionAnalysis>(F);
- };
-
- // Setup the data structure used to plumb customization into the
- // `InlineFunction` routine.
- InlineFunctionInfo IFI(/*cg=*/nullptr, &GetAssumptionCache);
+ // We use a single common worklist for calls across the entire SCC. We
+ // process these in-order and append new calls introduced during inlining to
+ // the end.
+ //
+ // Note that this particular order of processing is actually critical to
+ // avoid very bad behaviors. Consider *highly connected* call graphs where
+ // each function contains a small amount of code and a couple of calls to
+ // other functions. Because the LLVM inliner is fundamentally a bottom-up
+ // inliner, it can handle gracefully the fact that these all appear to be
+ // reasonable inlining candidates as it will flatten things until they become
+ // too big to inline, and then move on and flatten another batch.
+ //
+ // However, when processing call edges *within* an SCC we cannot rely on this
+ // bottom-up behavior. As a consequence, with heavily connected *SCCs* of
+ // functions we can end up incrementally inlining N calls into each of
+ // N functions because each incremental inlining decision looks good and we
+ // don't have a topological ordering to prevent explosions.
+ //
+ // To compensate for this, we don't process transitive edges made immediate
+ // by inlining until we've done one pass of inlining across the entire SCC.
+ // Large, highly connected SCCs still lead to some amount of code bloat in
+ // this model, but it is uniformly spread across all the functions in the SCC
+ // and eventually they all become too large to inline, rather than
+ // incrementally making a single function grow in a super-linear fashion.
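+ //
+ // Illustrative worst case (hypothetical): an SCC {f1, ..., fN} in which
+ // every fi calls every fj; eagerly chasing edges made immediate by
+ // inlining would let each function absorb O(N) bodies per visit, which
+ // the one-pass-at-a-time policy above avoids.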
+ SmallVector<std::pair<CallSite, int>, 16> Calls;
- auto GetInlineCost = [&](CallSite CS) {
- Function &Callee = *CS.getCalledFunction();
- auto &CalleeTTI = FAM.getResult<TargetIRAnalysis>(Callee);
- return getInlineCost(CS, Params, CalleeTTI, GetAssumptionCache, PSI);
- };
+ // Populate the initial list of calls in this SCC.
+ for (auto &N : InitialC) {
+ // We want to generally process call sites top-down in order for
+ // simplifications stemming from replacing the call with the returned value
+ // after inlining to be visible to subsequent inlining decisions.
+ // FIXME: Using the instruction sequence is a really bad way to do this.
+ // Instead we should do an actual RPO walk of the function body.
+ for (Instruction &I : instructions(N.getFunction()))
+ if (auto CS = CallSite(&I))
+ if (Function *Callee = CS.getCalledFunction())
+ if (!Callee->isDeclaration())
+ Calls.push_back({CS, -1});
+ }
+ if (Calls.empty())
+ return PreservedAnalyses::all();
- // We use a worklist of nodes to process so that we can handle if the SCC
- // structure changes and some nodes are no longer part of the current SCC. We
- // also need to use an updatable pointer for the SCC as a consequence.
- SmallVector<LazyCallGraph::Node *, 16> Nodes;
- for (auto &N : InitialC)
- Nodes.push_back(&N);
+ // Capture updatable variables for the current SCC and RefSCC.
auto *C = &InitialC;
auto *RC = &C->getOuterRefSCC();
- // We also use a secondary worklist of call sites within a particular node to
- // allow quickly continuing to inline through newly inlined call sites where
- // possible.
- SmallVector<std::pair<CallSite, int>, 16> Calls;
-
// When inlining a callee produces new call sites, we want to keep track of
// the fact that they were inlined from the callee. This allows us to avoid
// infinite inlining in some obscure cases. To represent this, we use an
@@ -805,34 +807,58 @@ PreservedAnalyses InlinerPass::run(LazyCallGraph::SCC &InitialC,
// defer deleting these to make it easier to handle the call graph updates.
SmallVector<Function *, 4> DeadFunctions;
- do {
- auto &N = *Nodes.pop_back_val();
+ // Loop forward over all of the calls. Note that we cannot cache the size as
+ // inlining can introduce new calls that need to be processed.
+ for (int i = 0; i < (int)Calls.size(); ++i) {
+ // We expect the calls to typically be batched with sequences of calls that
+ // have the same caller, so we first set up some shared infrastructure for
+ // this caller. We also do any pruning we can at this layer on the caller
+ // alone.
+ Function &F = *Calls[i].first.getCaller();
+ LazyCallGraph::Node &N = *CG.lookup(F);
if (CG.lookupSCC(N) != C)
continue;
- Function &F = N.getFunction();
if (F.hasFnAttribute(Attribute::OptimizeNone))
continue;
+ DEBUG(dbgs() << "Inlining calls in: " << F.getName() << "\n");
+
+ // Get a FunctionAnalysisManager via a proxy for this particular node. We
+ // do this each time we visit a node as the SCC may have changed and as
+ // we're going to mutate this particular function we want to make sure the
+ // proxy is in place to forward any invalidation events. We can use the
+ // manager we get here for looking up results for functions other than this
+ // node however because those functions aren't going to be mutated by this
+ // pass.
+ FunctionAnalysisManager &FAM =
+ AM.getResult<FunctionAnalysisManagerCGSCCProxy>(*C, CG)
+ .getManager();
+ std::function<AssumptionCache &(Function &)> GetAssumptionCache =
+ [&](Function &F) -> AssumptionCache & {
+ return FAM.getResult<AssumptionAnalysis>(F);
+ };
+ auto GetBFI = [&](Function &F) -> BlockFrequencyInfo & {
+ return FAM.getResult<BlockFrequencyAnalysis>(F);
+ };
+
+ auto GetInlineCost = [&](CallSite CS) {
+ Function &Callee = *CS.getCalledFunction();
+ auto &CalleeTTI = FAM.getResult<TargetIRAnalysis>(Callee);
+ return getInlineCost(CS, Params, CalleeTTI, GetAssumptionCache, {GetBFI},
+ PSI);
+ };
+
// Get the remarks emission analysis for the caller.
auto &ORE = FAM.getResult<OptimizationRemarkEmitterAnalysis>(F);
- // We want to generally process call sites top-down in order for
- // simplifications stemming from replacing the call with the returned value
- // after inlining to be visible to subsequent inlining decisions. So we
- // walk the function backwards and then process the back of the vector.
- // FIXME: Using reverse is a really bad way to do this. Instead we should
- // do an actual PO walk of the function body.
- for (Instruction &I : reverse(instructions(F)))
- if (auto CS = CallSite(&I))
- if (Function *Callee = CS.getCalledFunction())
- if (!Callee->isDeclaration())
- Calls.push_back({CS, -1});
-
+ // Now process as many calls as we have within this caller in the sequence.
+ // We bail out as soon as the caller has to change so we can update the
+ // call graph and prepare the context of that new caller.
bool DidInline = false;
- while (!Calls.empty()) {
+ for (; i < (int)Calls.size() && Calls[i].first.getCaller() == &F; ++i) {
int InlineHistoryID;
CallSite CS;
- std::tie(CS, InlineHistoryID) = Calls.pop_back_val();
+ std::tie(CS, InlineHistoryID) = Calls[i];
Function &Callee = *CS.getCalledFunction();
if (InlineHistoryID != -1 &&
@@ -843,6 +869,13 @@ PreservedAnalyses InlinerPass::run(LazyCallGraph::SCC &InitialC,
if (!shouldInline(CS, GetInlineCost, ORE))
continue;
+ // Setup the data structure used to plumb customization into the
+ // `InlineFunction` routine.
+ InlineFunctionInfo IFI(
+ /*cg=*/nullptr, &GetAssumptionCache,
+ &FAM.getResult<BlockFrequencyAnalysis>(*(CS.getCaller())),
+ &FAM.getResult<BlockFrequencyAnalysis>(Callee));
+
if (!InlineFunction(CS, IFI))
continue;
DidInline = true;
@@ -870,6 +903,12 @@ PreservedAnalyses InlinerPass::run(LazyCallGraph::SCC &InitialC,
// made dead by this operation on other functions).
Callee.removeDeadConstantUsers();
if (Callee.use_empty()) {
+ Calls.erase(
+ std::remove_if(Calls.begin() + i + 1, Calls.end(),
+ [&Callee](const std::pair<CallSite, int> &Call) {
+ return Call.first.getCaller() == &Callee;
+ }),
+ Calls.end());
// Clear the body and queue the function itself for deletion when we
// finish inlining and call graph updates.
// Note that after this point, it is an error to do anything other
@@ -882,6 +921,10 @@ PreservedAnalyses InlinerPass::run(LazyCallGraph::SCC &InitialC,
}
}
+ // Back the call index up by one to put us in a good position to go around
+ // the outer loop.
+ --i;
+
if (!DidInline)
continue;
Changed = true;
@@ -896,8 +939,8 @@ PreservedAnalyses InlinerPass::run(LazyCallGraph::SCC &InitialC,
// below.
for (Function *InlinedCallee : InlinedCallees) {
LazyCallGraph::Node &CalleeN = *CG.lookup(*InlinedCallee);
- for (LazyCallGraph::Edge &E : CalleeN)
- RC->insertTrivialRefEdge(N, *E.getNode());
+ for (LazyCallGraph::Edge &E : *CalleeN)
+ RC->insertTrivialRefEdge(N, E.getNode());
}
InlinedCallees.clear();
@@ -908,8 +951,9 @@ PreservedAnalyses InlinerPass::run(LazyCallGraph::SCC &InitialC,
// re-use the exact same logic for updating the call graph to reflect the
// change..
C = &updateCGAndAnalysisManagerForFunctionPass(CG, *C, N, AM, UR);
+ DEBUG(dbgs() << "Updated inlining SCC: " << *C << "\n");
RC = &C->getOuterRefSCC();
- } while (!Nodes.empty());
+ }
// Now that we've finished inlining all of the calls across this SCC, delete
// all of the trivially dead functions, updating the call graph and the CGSCC
@@ -920,8 +964,13 @@ PreservedAnalyses InlinerPass::run(LazyCallGraph::SCC &InitialC,
// sets.
for (Function *DeadF : DeadFunctions) {
// Get the necessary information out of the call graph and nuke the
- // function there.
+ // function there. Also, clear out any cached analyses.
auto &DeadC = *CG.lookupSCC(*CG.lookup(*DeadF));
+ FunctionAnalysisManager &FAM =
+ AM.getResult<FunctionAnalysisManagerCGSCCProxy>(DeadC, CG)
+ .getManager();
+ FAM.clear(*DeadF);
+ AM.clear(DeadC);
auto &DeadRC = DeadC.getOuterRefSCC();
CG.removeDeadFunction(*DeadF);
diff --git a/lib/Transforms/IPO/LowerTypeTests.cpp b/lib/Transforms/IPO/LowerTypeTests.cpp
index deb7e819480b..785207efbe5c 100644
--- a/lib/Transforms/IPO/LowerTypeTests.cpp
+++ b/lib/Transforms/IPO/LowerTypeTests.cpp
@@ -42,8 +42,6 @@
using namespace llvm;
using namespace lowertypetests;
-using SummaryAction = LowerTypeTestsSummaryAction;
-
#define DEBUG_TYPE "lowertypetests"
STATISTIC(ByteArraySizeBits, "Byte array size in bits");
@@ -57,13 +55,13 @@ static cl::opt<bool> AvoidReuse(
cl::desc("Try to avoid reuse of byte array addresses using aliases"),
cl::Hidden, cl::init(true));
-static cl::opt<SummaryAction> ClSummaryAction(
+static cl::opt<PassSummaryAction> ClSummaryAction(
"lowertypetests-summary-action",
cl::desc("What to do with the summary when running this pass"),
- cl::values(clEnumValN(SummaryAction::None, "none", "Do nothing"),
- clEnumValN(SummaryAction::Import, "import",
+ cl::values(clEnumValN(PassSummaryAction::None, "none", "Do nothing"),
+ clEnumValN(PassSummaryAction::Import, "import",
"Import typeid resolutions from summary and globals"),
- clEnumValN(SummaryAction::Export, "export",
+ clEnumValN(PassSummaryAction::Export, "export",
"Export typeid resolutions to summary and globals")),
cl::Hidden);
@@ -234,8 +232,8 @@ public:
class LowerTypeTestsModule {
Module &M;
- SummaryAction Action;
- ModuleSummaryIndex *Summary;
+ ModuleSummaryIndex *ExportSummary;
+ const ModuleSummaryIndex *ImportSummary;
bool LinkerSubsectionsViaSymbols;
Triple::ArchType Arch;
@@ -253,15 +251,21 @@ class LowerTypeTestsModule {
// Indirect function call index assignment counter for WebAssembly
uint64_t IndirectIndex = 1;
- // Mapping from type identifiers to the call sites that test them.
- DenseMap<Metadata *, std::vector<CallInst *>> TypeTestCallSites;
+ // Mapping from type identifiers to the call sites that test them, as well as
+ // whether the type identifier needs to be exported to ThinLTO backends as
+ // part of the regular LTO phase of the ThinLTO pipeline (see exportTypeId).
+ struct TypeIdUserInfo {
+ std::vector<CallInst *> CallSites;
+ bool IsExported = false;
+ };
+ DenseMap<Metadata *, TypeIdUserInfo> TypeIdUsers;
/// This structure describes how to lower type tests for a particular type
/// identifier. It is either built directly from the global analysis (during
/// regular LTO or the regular LTO phase of ThinLTO), or indirectly using type
/// identifier summaries and external symbol references (in ThinLTO backends).
struct TypeIdLowering {
- TypeTestResolution::Kind TheKind;
+ TypeTestResolution::Kind TheKind = TypeTestResolution::Unsat;
/// All except Unsat: the start address within the combined global.
Constant *OffsetedGlobal;
@@ -274,9 +278,6 @@ class LowerTypeTestsModule {
/// covering members of this type identifier as a multiple of 2^AlignLog2.
Constant *SizeM1;
- /// ByteArray, Inline, AllOnes: range of SizeM1 expressed as a bit width.
- unsigned SizeM1BitWidth;
-
/// ByteArray: the byte array to test the address against.
Constant *TheByteArray;
@@ -291,6 +292,10 @@ class LowerTypeTestsModule {
Function *WeakInitializerFn = nullptr;
+ void exportTypeId(StringRef TypeId, const TypeIdLowering &TIL);
+ TypeIdLowering importTypeId(StringRef TypeId);
+ void importTypeTest(CallInst *CI);
+
BitSetInfo
buildBitSet(Metadata *TypeId,
const DenseMap<GlobalTypeMember *, uint64_t> &GlobalLayout);
@@ -327,8 +332,8 @@ class LowerTypeTestsModule {
void createJumpTable(Function *F, ArrayRef<GlobalTypeMember *> Functions);
public:
- LowerTypeTestsModule(Module &M, SummaryAction Action,
- ModuleSummaryIndex *Summary);
+ LowerTypeTestsModule(Module &M, ModuleSummaryIndex *ExportSummary,
+ const ModuleSummaryIndex *ImportSummary);
bool lower();
// Lower the module using the action and summary passed as command line
@@ -341,15 +346,17 @@ struct LowerTypeTests : public ModulePass {
bool UseCommandLine = false;
- SummaryAction Action;
- ModuleSummaryIndex *Summary;
+ ModuleSummaryIndex *ExportSummary;
+ const ModuleSummaryIndex *ImportSummary;
LowerTypeTests() : ModulePass(ID), UseCommandLine(true) {
initializeLowerTypeTestsPass(*PassRegistry::getPassRegistry());
}
- LowerTypeTests(SummaryAction Action, ModuleSummaryIndex *Summary)
- : ModulePass(ID), Action(Action), Summary(Summary) {
+ LowerTypeTests(ModuleSummaryIndex *ExportSummary,
+ const ModuleSummaryIndex *ImportSummary)
+ : ModulePass(ID), ExportSummary(ExportSummary),
+ ImportSummary(ImportSummary) {
initializeLowerTypeTestsPass(*PassRegistry::getPassRegistry());
}
@@ -358,7 +365,7 @@ struct LowerTypeTests : public ModulePass {
return false;
if (UseCommandLine)
return LowerTypeTestsModule::runForTesting(M);
- return LowerTypeTestsModule(M, Action, Summary).lower();
+ return LowerTypeTestsModule(M, ExportSummary, ImportSummary).lower();
}
};
@@ -368,9 +375,10 @@ INITIALIZE_PASS(LowerTypeTests, "lowertypetests", "Lower type metadata", false,
false)
char LowerTypeTests::ID = 0;
-ModulePass *llvm::createLowerTypeTestsPass(SummaryAction Action,
- ModuleSummaryIndex *Summary) {
- return new LowerTypeTests(Action, Summary);
+ModulePass *
+llvm::createLowerTypeTestsPass(ModuleSummaryIndex *ExportSummary,
+ const ModuleSummaryIndex *ImportSummary) {
+ return new LowerTypeTests(ExportSummary, ImportSummary);
}
/// Build a bit set for TypeId using the object layouts in
@@ -494,10 +502,11 @@ Value *LowerTypeTestsModule::createBitSetTest(IRBuilder<> &B,
return createMaskedBitTest(B, TIL.InlineBits, BitOffset);
} else {
Constant *ByteArray = TIL.TheByteArray;
- if (!LinkerSubsectionsViaSymbols && AvoidReuse) {
+ if (!LinkerSubsectionsViaSymbols && AvoidReuse && !ImportSummary) {
// Each use of the byte array uses a different alias. This makes the
// backend less likely to reuse previously computed byte array addresses,
// improving the security of the CFI mechanism based on this pass.
+ // This won't work when importing because TheByteArray is external.
ByteArray = GlobalAlias::create(Int8Ty, 0, GlobalValue::PrivateLinkage,
"bits_use", ByteArray, &M);
}
@@ -593,8 +602,7 @@ Value *LowerTypeTestsModule::lowerTypeTestCall(Metadata *TypeId, CallInst *CI,
IntPtrTy));
Value *BitOffset = B.CreateOr(OffsetSHR, OffsetSHL);
- Constant *BitSizeConst = ConstantExpr::getZExt(TIL.SizeM1, IntPtrTy);
- Value *OffsetInRange = B.CreateICmpULE(BitOffset, BitSizeConst);
+ Value *OffsetInRange = B.CreateICmpULE(BitOffset, TIL.SizeM1);
// If the bit set is all ones, testing against it is unnecessary.
if (TIL.TheKind == TypeTestResolution::AllOnes)
@@ -687,6 +695,123 @@ void LowerTypeTestsModule::buildBitSetsFromGlobalVariables(
}
}
+/// Export the given type identifier so that ThinLTO backends may import it.
+/// Type identifiers are exported by adding coarse-grained information about how
+/// to test the type identifier to the summary, and creating symbols in the
+/// object file (aliases and absolute symbols) containing fine-grained
+/// information about the type identifier.
+void LowerTypeTestsModule::exportTypeId(StringRef TypeId,
+ const TypeIdLowering &TIL) {
+ TypeTestResolution &TTRes =
+ ExportSummary->getOrInsertTypeIdSummary(TypeId).TTRes;
+ TTRes.TheKind = TIL.TheKind;
+
+ auto ExportGlobal = [&](StringRef Name, Constant *C) {
+ GlobalAlias *GA =
+ GlobalAlias::create(Int8Ty, 0, GlobalValue::ExternalLinkage,
+ "__typeid_" + TypeId + "_" + Name, C, &M);
+ GA->setVisibility(GlobalValue::HiddenVisibility);
+ };
+
+ if (TIL.TheKind != TypeTestResolution::Unsat)
+ ExportGlobal("global_addr", TIL.OffsetedGlobal);
+
+ if (TIL.TheKind == TypeTestResolution::ByteArray ||
+ TIL.TheKind == TypeTestResolution::Inline ||
+ TIL.TheKind == TypeTestResolution::AllOnes) {
+ ExportGlobal("align", ConstantExpr::getIntToPtr(TIL.AlignLog2, Int8PtrTy));
+ ExportGlobal("size_m1", ConstantExpr::getIntToPtr(TIL.SizeM1, Int8PtrTy));
+
+ uint64_t BitSize = cast<ConstantInt>(TIL.SizeM1)->getZExtValue() + 1;
+ if (TIL.TheKind == TypeTestResolution::Inline)
+ TTRes.SizeM1BitWidth = (BitSize <= 32) ? 5 : 6;
+ else
+ TTRes.SizeM1BitWidth = (BitSize <= 128) ? 7 : 32;
+ }
+
+ if (TIL.TheKind == TypeTestResolution::ByteArray) {
+ ExportGlobal("byte_array", TIL.TheByteArray);
+ ExportGlobal("bit_mask", TIL.BitMask);
+ }
+
+ if (TIL.TheKind == TypeTestResolution::Inline)
+ ExportGlobal("inline_bits",
+ ConstantExpr::getIntToPtr(TIL.InlineBits, Int8PtrTy));
+}
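+
+// Illustrative output (hypothetical type id "_ZTS1A"): hidden aliases such
+// as __typeid__ZTS1A_global_addr, __typeid__ZTS1A_align and, for byte-array
+// kinds, __typeid__ZTS1A_byte_array / __typeid__ZTS1A_bit_mask, which
+// importTypeId below resolves in ThinLTO backends.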
+
+LowerTypeTestsModule::TypeIdLowering
+LowerTypeTestsModule::importTypeId(StringRef TypeId) {
+ const TypeIdSummary *TidSummary = ImportSummary->getTypeIdSummary(TypeId);
+ if (!TidSummary)
+ return {}; // Unsat: no globals match this type id.
+ const TypeTestResolution &TTRes = TidSummary->TTRes;
+
+ TypeIdLowering TIL;
+ TIL.TheKind = TTRes.TheKind;
+
+ auto ImportGlobal = [&](StringRef Name, unsigned AbsWidth) {
+ Constant *C =
+ M.getOrInsertGlobal(("__typeid_" + TypeId + "_" + Name).str(), Int8Ty);
+ auto *GV = dyn_cast<GlobalVariable>(C);
+ // We only need to set metadata if the global is newly created, in which
+ // case it would not have hidden visibility.
+ if (!GV || GV->getVisibility() == GlobalValue::HiddenVisibility)
+ return C;
+
+ GV->setVisibility(GlobalValue::HiddenVisibility);
+ auto SetAbsRange = [&](uint64_t Min, uint64_t Max) {
+ auto *MinC = ConstantAsMetadata::get(ConstantInt::get(IntPtrTy, Min));
+ auto *MaxC = ConstantAsMetadata::get(ConstantInt::get(IntPtrTy, Max));
+ GV->setMetadata(LLVMContext::MD_absolute_symbol,
+ MDNode::get(M.getContext(), {MinC, MaxC}));
+ };
+ if (AbsWidth == IntPtrTy->getBitWidth())
+ SetAbsRange(~0ull, ~0ull); // Full set.
+ else if (AbsWidth)
+ SetAbsRange(0, 1ull << AbsWidth);
+ return C;
+ };
+
+ if (TIL.TheKind != TypeTestResolution::Unsat)
+ TIL.OffsetedGlobal = ImportGlobal("global_addr", 0);
+
+ if (TIL.TheKind == TypeTestResolution::ByteArray ||
+ TIL.TheKind == TypeTestResolution::Inline ||
+ TIL.TheKind == TypeTestResolution::AllOnes) {
+ TIL.AlignLog2 = ConstantExpr::getPtrToInt(ImportGlobal("align", 8), Int8Ty);
+ TIL.SizeM1 = ConstantExpr::getPtrToInt(
+ ImportGlobal("size_m1", TTRes.SizeM1BitWidth), IntPtrTy);
+ }
+
+ if (TIL.TheKind == TypeTestResolution::ByteArray) {
+ TIL.TheByteArray = ImportGlobal("byte_array", 0);
+ TIL.BitMask = ImportGlobal("bit_mask", 8);
+ }
+
+ if (TIL.TheKind == TypeTestResolution::Inline)
+ TIL.InlineBits = ConstantExpr::getPtrToInt(
+ ImportGlobal("inline_bits", 1 << TTRes.SizeM1BitWidth),
+ TTRes.SizeM1BitWidth <= 5 ? Int32Ty : Int64Ty);
+
+ return TIL;
+}
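
The !absolute_symbol metadata attached by ImportGlobal encodes the value range the linker may assign to each imported symbol: [0, 2^AbsWidth) for narrow constants, the {-1, -1} "full set" encoding when AbsWidth matches the pointer width, and no range at all when AbsWidth is 0. A small sketch of that selection, assuming a 64-bit pointer width for illustration:

// Sketch of the absolute-symbol range selection in importTypeId's
// ImportGlobal helper. Standalone C++17, not the LLVM API.
#include <cstdint>
#include <iostream>
#include <optional>
#include <utility>

// Returns the {Min, Max} pair stored in !absolute_symbol metadata, or
// nothing when AbsWidth is 0 (no range is attached in that case).
std::optional<std::pair<uint64_t, uint64_t>> absRange(unsigned AbsWidth,
                                                      unsigned PtrBits = 64) {
  if (AbsWidth == PtrBits)
    return std::pair<uint64_t, uint64_t>{~0ull, ~0ull}; // {-1, -1}: full set
  if (AbsWidth)
    return std::pair<uint64_t, uint64_t>{0, 1ull << AbsWidth}; // [0, 2^AbsWidth)
  return std::nullopt;
}

int main() {
  if (auto R = absRange(8)) // e.g. the "align" symbol
    std::cout << '[' << R->first << ", " << R->second << ")\n"; // [0, 256)
}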
+
+void LowerTypeTestsModule::importTypeTest(CallInst *CI) {
+ auto TypeIdMDVal = dyn_cast<MetadataAsValue>(CI->getArgOperand(1));
+ if (!TypeIdMDVal)
+ report_fatal_error("Second argument of llvm.type.test must be metadata");
+
+ auto TypeIdStr = dyn_cast<MDString>(TypeIdMDVal->getMetadata());
+ if (!TypeIdStr)
+ report_fatal_error(
+ "Second argument of llvm.type.test must be a metadata string");
+
+ TypeIdLowering TIL = importTypeId(TypeIdStr->getString());
+ Value *Lowered = lowerTypeTestCall(TypeIdStr, CI, TIL);
+ CI->replaceAllUsesWith(Lowered);
+ CI->eraseFromParent();
+}
+
void LowerTypeTestsModule::lowerTypeTestCalls(
ArrayRef<Metadata *> TypeIds, Constant *CombinedGlobalAddr,
const DenseMap<GlobalTypeMember *, uint64_t> &GlobalLayout) {
@@ -708,16 +833,12 @@ void LowerTypeTestsModule::lowerTypeTestCalls(
TIL.OffsetedGlobal = ConstantExpr::getGetElementPtr(
Int8Ty, CombinedGlobalAddr, ConstantInt::get(IntPtrTy, BSI.ByteOffset)),
TIL.AlignLog2 = ConstantInt::get(Int8Ty, BSI.AlignLog2);
+ TIL.SizeM1 = ConstantInt::get(IntPtrTy, BSI.BitSize - 1);
if (BSI.isAllOnes()) {
TIL.TheKind = (BSI.BitSize == 1) ? TypeTestResolution::Single
: TypeTestResolution::AllOnes;
- TIL.SizeM1BitWidth = (BSI.BitSize <= 128) ? 7 : 32;
- TIL.SizeM1 = ConstantInt::get((BSI.BitSize <= 128) ? Int8Ty : Int32Ty,
- BSI.BitSize - 1);
} else if (BSI.BitSize <= 64) {
TIL.TheKind = TypeTestResolution::Inline;
- TIL.SizeM1BitWidth = (BSI.BitSize <= 32) ? 5 : 6;
- TIL.SizeM1 = ConstantInt::get(Int8Ty, BSI.BitSize - 1);
uint64_t InlineBits = 0;
for (auto Bit : BSI.Bits)
InlineBits |= uint64_t(1) << Bit;
@@ -728,17 +849,19 @@ void LowerTypeTestsModule::lowerTypeTestCalls(
(BSI.BitSize <= 32) ? Int32Ty : Int64Ty, InlineBits);
} else {
TIL.TheKind = TypeTestResolution::ByteArray;
- TIL.SizeM1BitWidth = (BSI.BitSize <= 128) ? 7 : 32;
- TIL.SizeM1 = ConstantInt::get((BSI.BitSize <= 128) ? Int8Ty : Int32Ty,
- BSI.BitSize - 1);
++NumByteArraysCreated;
ByteArrayInfo *BAI = createByteArray(BSI);
TIL.TheByteArray = BAI->ByteArray;
TIL.BitMask = BAI->MaskGlobal;
}
+ TypeIdUserInfo &TIUI = TypeIdUsers[TypeId];
+
+ if (TIUI.IsExported)
+ exportTypeId(cast<MDString>(TypeId)->getString(), TIL);
+
// Lower each call to llvm.type.test for this type identifier.
- for (CallInst *CI : TypeTestCallSites[TypeId]) {
+ for (CallInst *CI : TIUI.CallSites) {
++NumTypeTestCallsLowered;
Value *Lowered = lowerTypeTestCall(TypeId, CI, TIL);
CI->replaceAllUsesWith(Lowered);
@@ -757,9 +880,9 @@ void LowerTypeTestsModule::verifyTypeMDNode(GlobalObject *GO, MDNode *Type) {
report_fatal_error(
"A member of a type identifier may not have an explicit section");
- if (isa<GlobalVariable>(GO) && GO->isDeclarationForLinker())
- report_fatal_error(
- "A global var member of a type identifier must be a definition");
+ // FIXME: We previously checked that a global variable member of a type
+ // identifier must be a definition, but the IR linker may leave type
+ // metadata on declarations. We should restore this check after fixing
+ // PR31759.
auto OffsetConstMD = dyn_cast<ConstantAsMetadata>(Type->getOperand(0));
if (!OffsetConstMD)
@@ -1173,13 +1296,11 @@ void LowerTypeTestsModule::buildBitSetsFromDisjointSet(
}
/// Lower all type tests in this module.
-LowerTypeTestsModule::LowerTypeTestsModule(Module &M, SummaryAction Action,
- ModuleSummaryIndex *Summary)
- : M(M), Action(Action), Summary(Summary) {
- // FIXME: Use these fields.
- (void)this->Action;
- (void)this->Summary;
-
+LowerTypeTestsModule::LowerTypeTestsModule(
+ Module &M, ModuleSummaryIndex *ExportSummary,
+ const ModuleSummaryIndex *ImportSummary)
+ : M(M), ExportSummary(ExportSummary), ImportSummary(ImportSummary) {
+ assert(!(ExportSummary && ImportSummary));
Triple TargetTriple(M.getTargetTriple());
LinkerSubsectionsViaSymbols = TargetTriple.isMacOSX();
Arch = TargetTriple.getArch();
@@ -1203,7 +1324,11 @@ bool LowerTypeTestsModule::runForTesting(Module &M) {
ExitOnErr(errorCodeToError(In.error()));
}
- bool Changed = LowerTypeTestsModule(M, ClSummaryAction, &Summary).lower();
+ bool Changed =
+ LowerTypeTestsModule(
+ M, ClSummaryAction == PassSummaryAction::Export ? &Summary : nullptr,
+ ClSummaryAction == PassSummaryAction::Import ? &Summary : nullptr)
+ .lower();
if (!ClWriteSummary.empty()) {
ExitOnError ExitOnErr("-lowertypetests-write-summary: " + ClWriteSummary +
@@ -1222,9 +1347,18 @@ bool LowerTypeTestsModule::runForTesting(Module &M) {
bool LowerTypeTestsModule::lower() {
Function *TypeTestFunc =
M.getFunction(Intrinsic::getName(Intrinsic::type_test));
- if (!TypeTestFunc || TypeTestFunc->use_empty())
+ if ((!TypeTestFunc || TypeTestFunc->use_empty()) && !ExportSummary)
return false;
+ if (ImportSummary) {
+ for (auto UI = TypeTestFunc->use_begin(), UE = TypeTestFunc->use_end();
+ UI != UE;) {
+ auto *CI = cast<CallInst>((*UI++).getUser());
+ importTypeTest(CI);
+ }
+ return true;
+ }
+
// Equivalence class set containing type identifiers and the globals that
// reference them. This is used to partition the set of type identifiers in
// the module into disjoint sets.
@@ -1248,6 +1382,9 @@ bool LowerTypeTestsModule::lower() {
unsigned I = 0;
SmallVector<MDNode *, 2> Types;
for (GlobalObject &GO : M.global_objects()) {
+ if (isa<GlobalVariable>(GO) && GO.isDeclarationForLinker())
+ continue;
+
Types.clear();
GO.getMetadata(LLVMContext::MD_type, Types);
if (Types.empty())
@@ -1262,33 +1399,57 @@ bool LowerTypeTestsModule::lower() {
}
}
- for (const Use &U : TypeTestFunc->uses()) {
- auto CI = cast<CallInst>(U.getUser());
+ auto AddTypeIdUse = [&](Metadata *TypeId) -> TypeIdUserInfo & {
+ // Add the call site to the list of call sites for this type identifier. We
+ // also use TypeIdUsers to keep track of whether we have seen this type
+ // identifier before. If we have, we don't need to re-add the referenced
+ // globals to the equivalence class.
+ auto Ins = TypeIdUsers.insert({TypeId, {}});
+ if (Ins.second) {
+ // Add the type identifier to the equivalence class.
+ GlobalClassesTy::iterator GCI = GlobalClasses.insert(TypeId);
+ GlobalClassesTy::member_iterator CurSet = GlobalClasses.findLeader(GCI);
+
+ // Add the referenced globals to the type identifier's equivalence class.
+ for (GlobalTypeMember *GTM : TypeIdInfo[TypeId].RefGlobals)
+ CurSet = GlobalClasses.unionSets(
+ CurSet, GlobalClasses.findLeader(GlobalClasses.insert(GTM)));
+ }
+
+ return Ins.first->second;
+ };
- auto BitSetMDVal = dyn_cast<MetadataAsValue>(CI->getArgOperand(1));
- if (!BitSetMDVal)
- report_fatal_error("Second argument of llvm.type.test must be metadata");
- auto BitSet = BitSetMDVal->getMetadata();
+ if (TypeTestFunc) {
+ for (const Use &U : TypeTestFunc->uses()) {
+ auto CI = cast<CallInst>(U.getUser());
- // Add the call site to the list of call sites for this type identifier. We
- // also use TypeTestCallSites to keep track of whether we have seen this
- // type identifier before. If we have, we don't need to re-add the
- // referenced globals to the equivalence class.
- std::pair<DenseMap<Metadata *, std::vector<CallInst *>>::iterator, bool>
- Ins = TypeTestCallSites.insert(
- std::make_pair(BitSet, std::vector<CallInst *>()));
- Ins.first->second.push_back(CI);
- if (!Ins.second)
- continue;
+ auto TypeIdMDVal = dyn_cast<MetadataAsValue>(CI->getArgOperand(1));
+ if (!TypeIdMDVal)
+ report_fatal_error("Second argument of llvm.type.test must be metadata");
+ auto TypeId = TypeIdMDVal->getMetadata();
+ AddTypeIdUse(TypeId).CallSites.push_back(CI);
+ }
+ }
- // Add the type identifier to the equivalence class.
- GlobalClassesTy::iterator GCI = GlobalClasses.insert(BitSet);
- GlobalClassesTy::member_iterator CurSet = GlobalClasses.findLeader(GCI);
+ if (ExportSummary) {
+ DenseMap<GlobalValue::GUID, TinyPtrVector<Metadata *>> MetadataByGUID;
+ for (auto &P : TypeIdInfo) {
+ if (auto *TypeId = dyn_cast<MDString>(P.first))
+ MetadataByGUID[GlobalValue::getGUID(TypeId->getString())].push_back(
+ TypeId);
+ }
- // Add the referenced globals to the type identifier's equivalence class.
- for (GlobalTypeMember *GTM : TypeIdInfo[BitSet].RefGlobals)
- CurSet = GlobalClasses.unionSets(
- CurSet, GlobalClasses.findLeader(GlobalClasses.insert(GTM)));
+ for (auto &P : *ExportSummary) {
+ for (auto &S : P.second) {
+ auto *FS = dyn_cast<FunctionSummary>(S.get());
+ if (!FS)
+ continue;
+ // FIXME: Only add live functions.
+ for (GlobalValue::GUID G : FS->type_tests())
+ for (Metadata *MD : MetadataByGUID[G])
+ AddTypeIdUse(MD).IsExported = true;
+ }
+ }
}
if (GlobalClasses.empty())
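
The grouping above is a classic union-find: each new type identifier is unioned with every global it references, so type identifiers that share a global collapse into one disjoint set and are lowered together. A tiny self-contained sketch of that behavior (hypothetical names, plain C++ in place of llvm::EquivalenceClasses):

// Minimal union-find sketch of how type ids and the globals they reference
// collapse into disjoint sets. All names here are hypothetical.
#include <iostream>
#include <map>
#include <string>

std::map<std::string, std::string> Parent;

std::string find(const std::string &X) {
  if (Parent.find(X) == Parent.end())
    Parent[X] = X;
  if (Parent[X] != X)
    Parent[X] = find(Parent[X]); // path compression
  return Parent[X];
}

void unite(const std::string &A, const std::string &B) {
  std::string RootA = find(A), RootB = find(B);
  Parent[RootA] = RootB;
}

int main() {
  // typeid1 is referenced by vtables A and B; typeid2 only by B.
  unite("typeid1", "A");
  unite("typeid1", "B");
  unite("typeid2", "B");
  // All four names now share one leader: they form a single disjoint set
  // and will be laid out and lowered together.
  std::cout << (find("typeid2") == find("A")) << '\n'; // 1
}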
@@ -1349,8 +1510,9 @@ bool LowerTypeTestsModule::lower() {
PreservedAnalyses LowerTypeTestsPass::run(Module &M,
ModuleAnalysisManager &AM) {
- bool Changed =
- LowerTypeTestsModule(M, SummaryAction::None, /*Summary=*/nullptr).lower();
+ bool Changed = LowerTypeTestsModule(M, /*ExportSummary=*/nullptr,
+ /*ImportSummary=*/nullptr)
+ .lower();
if (!Changed)
return PreservedAnalyses::all();
return PreservedAnalyses::none();
diff --git a/lib/Transforms/IPO/MergeFunctions.cpp b/lib/Transforms/IPO/MergeFunctions.cpp
index e0bb0eb42b59..771770ddc060 100644
--- a/lib/Transforms/IPO/MergeFunctions.cpp
+++ b/lib/Transforms/IPO/MergeFunctions.cpp
@@ -96,8 +96,10 @@
#include "llvm/IR/CallSite.h"
#include "llvm/IR/Constants.h"
#include "llvm/IR/DataLayout.h"
+#include "llvm/IR/DebugInfo.h"
#include "llvm/IR/IRBuilder.h"
#include "llvm/IR/Instructions.h"
+#include "llvm/IR/IntrinsicInst.h"
#include "llvm/IR/LLVMContext.h"
#include "llvm/IR/Module.h"
#include "llvm/IR/ValueHandle.h"
@@ -127,6 +129,26 @@ static cl::opt<unsigned> NumFunctionsForSanityCheck(
"'0' disables this check. Works only with '-debug' key."),
cl::init(0), cl::Hidden);
+// Under option -mergefunc-preserve-debug-info we:
+// - Do not create a new function for a thunk.
+// - Retain the debug info for a thunk's parameters (and associated
+// instructions for the debug info) from the entry block.
+// Note: -debug will display the algorithm at work.
+// - Create debug-info for the call (to the shared implementation) made by
+// a thunk and its return value.
+// - Erase the rest of the function, retaining the (minimally sized) entry
+// block to create a thunk.
+// - Leave a thunk's call sites pointing at the thunk even when both occur
+// within the same translation unit, to aid debuggability. Note that this
+// behavior differs from the underlying -mergefunc implementation, which
+// modifies the thunk's call sites to point to the shared implementation
+// when both occur within the same translation unit.
+static cl::opt<bool>
+ MergeFunctionsPDI("mergefunc-preserve-debug-info", cl::Hidden,
+ cl::init(false),
+ cl::desc("Preserve debug info in thunk when mergefunc "
+ "transformations are made."));
+
namespace {
class FunctionNode {
@@ -215,8 +237,21 @@ private:
/// Replace G with a thunk or an alias to F. Deletes G.
void writeThunkOrAlias(Function *F, Function *G);
- /// Replace G with a simple tail call to bitcast(F). Also replace direct uses
- /// of G with bitcast(F). Deletes G.
+ /// Fill PDIUnrelatedWL with instructions from the entry block that are
+ /// unrelated to parameter-related debug info.
+ void filterInstsUnrelatedToPDI(BasicBlock *GEntryBlock,
+ std::vector<Instruction *> &PDIUnrelatedWL);
+
+ /// Erase the rest of the CFG (i.e. barring the entry block).
+ void eraseTail(Function *G);
+
+ /// Erase from the entry block the instructions in PDIUnrelatedWL, as they
+ /// are unrelated to the parameter debug info.
+ void eraseInstsUnrelatedToPDI(std::vector<Instruction *> &PDIUnrelatedWL);
+
+ /// Replace G with a simple tail call to bitcast(F). Also (unless
+ /// MergeFunctionsPDI holds) replace direct uses of G with bitcast(F)
+ /// and delete G.
void writeThunk(Function *F, Function *G);
/// Replace G with an alias to F. Deletes G.
@@ -269,8 +304,7 @@ bool MergeFunctions::doSanityCheck(std::vector<WeakVH> &Worklist) {
if (Res1 != -Res2) {
dbgs() << "MERGEFUNC-SANITY: Non-symmetric; triple: " << TripleNumber
<< "\n";
- F1->dump();
- F2->dump();
+ dbgs() << *F1 << '\n' << *F2 << '\n';
Valid = false;
}
@@ -305,9 +339,7 @@ bool MergeFunctions::doSanityCheck(std::vector<WeakVH> &Worklist) {
<< TripleNumber << "\n";
dbgs() << "Res1, Res3, Res4: " << Res1 << ", " << Res3 << ", "
<< Res4 << "\n";
- F1->dump();
- F2->dump();
- F3->dump();
+ dbgs() << *F1 << '\n' << *F2 << '\n' << *F3 << '\n';
Valid = false;
}
}
@@ -400,19 +432,15 @@ void MergeFunctions::replaceDirectCallers(Function *Old, Function *New) {
// Transferring other attributes may help other optimizations, but that
// should be done uniformly and not in this ad-hoc way.
auto &Context = New->getContext();
- auto NewFuncAttrs = New->getAttributes();
- auto CallSiteAttrs = CS.getAttributes();
-
- CallSiteAttrs = CallSiteAttrs.addAttributes(
- Context, AttributeSet::ReturnIndex, NewFuncAttrs.getRetAttributes());
-
- for (unsigned argIdx = 0; argIdx < CS.arg_size(); argIdx++) {
- AttributeSet Attrs = NewFuncAttrs.getParamAttributes(argIdx);
- if (Attrs.getNumSlots())
- CallSiteAttrs = CallSiteAttrs.addAttributes(Context, argIdx, Attrs);
- }
-
- CS.setAttributes(CallSiteAttrs);
+ auto NewPAL = New->getAttributes();
+ SmallVector<AttributeSet, 4> NewArgAttrs;
+ for (unsigned argIdx = 0; argIdx < CS.arg_size(); argIdx++)
+ NewArgAttrs.push_back(NewPAL.getParamAttributes(argIdx));
+ // Don't transfer function attributes from the callee to the call site.
+ // They typically aren't relevant to the calling convention or ABI.
+ CS.setAttributes(AttributeList::get(Context, /*FnAttrs=*/AttributeSet(),
+ NewPAL.getRetAttributes(),
+ NewArgAttrs));
remove(CS.getInstruction()->getParent()->getParent());
U->set(BitcastNew);
@@ -461,51 +489,242 @@ static Value *createCast(IRBuilder<> &Builder, Value *V, Type *DestTy) {
return Builder.CreateBitCast(V, DestTy);
}
-// Replace G with a simple tail call to bitcast(F). Also replace direct uses
-// of G with bitcast(F). Deletes G.
+// Erase from the entry block the instructions in PDIUnrelatedWL, as they are
+// unrelated to the parameter debug info.
+void MergeFunctions::eraseInstsUnrelatedToPDI(
+ std::vector<Instruction *> &PDIUnrelatedWL) {
+
+ DEBUG(dbgs() << " Erasing instructions (in reverse order of appearance in "
+ "entry block) unrelated to parameter debug info from entry "
+ "block: {\n");
+ while (!PDIUnrelatedWL.empty()) {
+ Instruction *I = PDIUnrelatedWL.back();
+ DEBUG(dbgs() << " Deleting Instruction: ");
+ DEBUG(I->print(dbgs()));
+ DEBUG(dbgs() << "\n");
+ I->eraseFromParent();
+ PDIUnrelatedWL.pop_back();
+ }
+ DEBUG(dbgs() << " } // Done erasing instructions unrelated to parameter "
+ "debug info from entry block. \n");
+}
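
Erasing from the back of the worklist is deliberate: instructions were collected in program order, so popping from the back deletes users before the values they depend on. A standalone sketch of the pattern, with strings standing in for instructions:

// Reverse-order erase sketch: the worklist was filled in program order, so
// popping from the back removes users before their operands.
#include <iostream>
#include <string>
#include <vector>

int main() {
  std::vector<std::string> PDIUnrelatedWL = {"%a = alloca i32",
                                             "%b = load i32, %a",
                                             "%c = add i32 %b, 1"};
  while (!PDIUnrelatedWL.empty()) {
    std::cout << "Deleting: " << PDIUnrelatedWL.back() << '\n';
    PDIUnrelatedWL.pop_back(); // "%c" goes first, its operand "%b" later
  }
}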
+
+// Reduce G to its entry block.
+void MergeFunctions::eraseTail(Function *G) {
+
+ std::vector<BasicBlock *> WorklistBB;
+ for (Function::iterator BBI = std::next(G->begin()), BBE = G->end();
+ BBI != BBE; ++BBI) {
+ BBI->dropAllReferences();
+ WorklistBB.push_back(&*BBI);
+ }
+ while (!WorklistBB.empty()) {
+ BasicBlock *BB = WorklistBB.back();
+ BB->eraseFromParent();
+ WorklistBB.pop_back();
+ }
+}
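
Dropping all references before erasing any block matters because non-entry blocks may reference each other (branches, phis); deleting one while another still points at it would leave dangling uses. A sketch of the same drop-then-erase pattern on a toy block graph:

// Drop-then-erase sketch: clear cross-references first, delete second,
// so no node is destroyed while another still points at it.
#include <memory>
#include <vector>

struct Block {
  std::vector<Block *> Succs; // references into other blocks
  void dropAllReferences() { Succs.clear(); }
};

int main() {
  std::vector<std::unique_ptr<Block>> CFG;
  for (int i = 0; i < 3; ++i)
    CFG.push_back(std::make_unique<Block>());
  CFG[0]->Succs = {CFG[1].get(), CFG[2].get()};
  CFG[1]->Succs = {CFG[2].get()};
  CFG[2]->Succs = {CFG[1].get()}; // cycle between blocks 1 and 2

  // Phase 1: break every edge out of the blocks to be deleted.
  for (size_t i = 1; i < CFG.size(); ++i)
    CFG[i]->dropAllReferences();
  // Phase 2: now deletion order no longer matters.
  CFG.resize(1); // erase all non-entry blocks
}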
+
+// We consider the following instructions from the entry block to be related
+// to parameter debug info:
+// - @llvm.dbg.declare
+// - stores from the incoming parameters to locations on the stack-frame
+// - allocas that create these locations on the stack-frame
+// - @llvm.dbg.value
+// - the entry block's terminator
+// The rest are unrelated to debug info for the parameters; fill up
+// PDIUnrelatedWL with such instructions.
+void MergeFunctions::filterInstsUnrelatedToPDI(
+ BasicBlock *GEntryBlock, std::vector<Instruction *> &PDIUnrelatedWL) {
+
+ std::set<Instruction *> PDIRelated;
+ for (BasicBlock::iterator BI = GEntryBlock->begin(), BIE = GEntryBlock->end();
+ BI != BIE; ++BI) {
+ if (auto *DVI = dyn_cast<DbgValueInst>(&*BI)) {
+ DEBUG(dbgs() << " Deciding: ");
+ DEBUG(BI->print(dbgs()));
+ DEBUG(dbgs() << "\n");
+ DILocalVariable *DILocVar = DVI->getVariable();
+ if (DILocVar->isParameter()) {
+ DEBUG(dbgs() << " Include (parameter): ");
+ DEBUG(BI->print(dbgs()));
+ DEBUG(dbgs() << "\n");
+ PDIRelated.insert(&*BI);
+ } else {
+ DEBUG(dbgs() << " Delete (!parameter): ");
+ DEBUG(BI->print(dbgs()));
+ DEBUG(dbgs() << "\n");
+ }
+ } else if (auto *DDI = dyn_cast<DbgDeclareInst>(&*BI)) {
+ DEBUG(dbgs() << " Deciding: ");
+ DEBUG(BI->print(dbgs()));
+ DEBUG(dbgs() << "\n");
+ DILocalVariable *DILocVar = DDI->getVariable();
+ if (DILocVar->isParameter()) {
+ DEBUG(dbgs() << " Parameter: ");
+ DEBUG(DILocVar->print(dbgs()));
+ AllocaInst *AI = dyn_cast_or_null<AllocaInst>(DDI->getAddress());
+ if (AI) {
+ DEBUG(dbgs() << " Processing alloca users: ");
+ DEBUG(dbgs() << "\n");
+ for (User *U : AI->users()) {
+ if (StoreInst *SI = dyn_cast<StoreInst>(U)) {
+ if (Value *Arg = SI->getValueOperand()) {
+ if (dyn_cast<Argument>(Arg)) {
+ DEBUG(dbgs() << " Include: ");
+ DEBUG(AI->print(dbgs()));
+ DEBUG(dbgs() << "\n");
+ PDIRelated.insert(AI);
+ DEBUG(dbgs() << " Include (parameter): ");
+ DEBUG(SI->print(dbgs()));
+ DEBUG(dbgs() << "\n");
+ PDIRelated.insert(SI);
+ DEBUG(dbgs() << " Include: ");
+ DEBUG(BI->print(dbgs()));
+ DEBUG(dbgs() << "\n");
+ PDIRelated.insert(&*BI);
+ } else {
+ DEBUG(dbgs() << " Delete (!parameter): ");
+ DEBUG(SI->print(dbgs()));
+ DEBUG(dbgs() << "\n");
+ }
+ }
+ } else {
+ DEBUG(dbgs() << " Defer: ");
+ DEBUG(U->print(dbgs()));
+ DEBUG(dbgs() << "\n");
+ }
+ }
+ } else {
+ DEBUG(dbgs() << " Delete (alloca NULL): ");
+ DEBUG(BI->print(dbgs()));
+ DEBUG(dbgs() << "\n");
+ }
+ } else {
+ DEBUG(dbgs() << " Delete (!parameter): ");
+ DEBUG(BI->print(dbgs()));
+ DEBUG(dbgs() << "\n");
+ }
+ } else if (dyn_cast<TerminatorInst>(BI) == GEntryBlock->getTerminator()) {
+ DEBUG(dbgs() << " Will Include Terminator: ");
+ DEBUG(BI->print(dbgs()));
+ DEBUG(dbgs() << "\n");
+ PDIRelated.insert(&*BI);
+ } else {
+ DEBUG(dbgs() << " Defer: ");
+ DEBUG(BI->print(dbgs()));
+ DEBUG(dbgs() << "\n");
+ }
+ }
+ DEBUG(dbgs()
+ << " Report parameter debug info related/related instructions: {\n");
+ for (BasicBlock::iterator BI = GEntryBlock->begin(), BE = GEntryBlock->end();
+ BI != BE; ++BI) {
+
+ Instruction *I = &*BI;
+ if (PDIRelated.find(I) == PDIRelated.end()) {
+ DEBUG(dbgs() << " !PDIRelated: ");
+ DEBUG(I->print(dbgs()));
+ DEBUG(dbgs() << "\n");
+ PDIUnrelatedWL.push_back(I);
+ } else {
+ DEBUG(dbgs() << " PDIRelated: ");
+ DEBUG(I->print(dbgs()));
+ DEBUG(dbgs() << "\n");
+ }
+ }
+ DEBUG(dbgs() << " }\n");
+}
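
The partition is computed as a complement: the first walk collects everything PDI-related into a set, and a second walk pushes whatever is absent from that set onto the worklist. A compact sketch of the two-pass partition, again with strings standing in for instructions:

// Two-pass partition sketch: mark related items, then collect the rest.
#include <iostream>
#include <set>
#include <string>
#include <vector>

int main() {
  std::vector<std::string> EntryBlock = {"%p.addr = alloca", "dbg.declare %p",
                                         "store %p", "%t = mul", "ret"};
  std::set<std::string> PDIRelated = {"%p.addr = alloca", "dbg.declare %p",
                                      "store %p", "ret"};
  std::vector<std::string> PDIUnrelatedWL;
  for (const std::string &I : EntryBlock)
    if (!PDIRelated.count(I))
      PDIUnrelatedWL.push_back(I); // only "%t = mul" lands here
  for (const std::string &I : PDIUnrelatedWL)
    std::cout << "unrelated: " << I << '\n';
}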
+
+// Replace G with a simple tail call to bitcast(F). Also (unless
+// MergeFunctionsPDI holds) replace direct uses of G with bitcast(F)
+// and delete G. Under MergeFunctionsPDI, we use G itself to create
+// the thunk, as we preserve the debug info (and associated instructions)
+// from G's entry block pertaining to G's incoming arguments, which are
+// passed on as corresponding arguments in the call that G makes to F.
+// For better debuggability, under MergeFunctionsPDI, we do not modify G's
+// call sites to point to F even when within the same translation unit.
void MergeFunctions::writeThunk(Function *F, Function *G) {
- if (!G->isInterposable()) {
- // Redirect direct callers of G to F.
+ if (!G->isInterposable() && !MergeFunctionsPDI) {
+ // Redirect direct callers of G to F. (See note on MergeFunctionsPDI
+ // above).
replaceDirectCallers(G, F);
}
// If G was internal then we may have replaced all uses of G with F. If so,
- // stop here and delete G. There's no need for a thunk.
- if (G->hasLocalLinkage() && G->use_empty()) {
+ // stop here and delete G. There's no need for a thunk. (See note on
+ // MergeFunctionsPDI above).
+ if (G->hasLocalLinkage() && G->use_empty() && !MergeFunctionsPDI) {
G->eraseFromParent();
return;
}
- Function *NewG = Function::Create(G->getFunctionType(), G->getLinkage(), "",
- G->getParent());
- BasicBlock *BB = BasicBlock::Create(F->getContext(), "", NewG);
- IRBuilder<> Builder(BB);
+ BasicBlock *GEntryBlock = nullptr;
+ std::vector<Instruction *> PDIUnrelatedWL;
+ BasicBlock *BB = nullptr;
+ Function *NewG = nullptr;
+ if (MergeFunctionsPDI) {
+ DEBUG(dbgs() << "writeThunk: (MergeFunctionsPDI) Do not create a new "
+ "function as thunk; retain original: "
+ << G->getName() << "()\n");
+ GEntryBlock = &G->getEntryBlock();
+ DEBUG(dbgs() << "writeThunk: (MergeFunctionsPDI) filter parameter related "
+ "debug info for "
+ << G->getName() << "() {\n");
+ filterInstsUnrelatedToPDI(GEntryBlock, PDIUnrelatedWL);
+ GEntryBlock->getTerminator()->eraseFromParent();
+ BB = GEntryBlock;
+ } else {
+ NewG = Function::Create(G->getFunctionType(), G->getLinkage(), "",
+ G->getParent());
+ BB = BasicBlock::Create(F->getContext(), "", NewG);
+ }
+ IRBuilder<> Builder(BB);
+ Function *H = MergeFunctionsPDI ? G : NewG;
SmallVector<Value *, 16> Args;
unsigned i = 0;
FunctionType *FFTy = F->getFunctionType();
- for (Argument & AI : NewG->args()) {
+ for (Argument & AI : H->args()) {
Args.push_back(createCast(Builder, &AI, FFTy->getParamType(i)));
++i;
}
CallInst *CI = Builder.CreateCall(F, Args);
+ ReturnInst *RI = nullptr;
CI->setTailCall();
CI->setCallingConv(F->getCallingConv());
CI->setAttributes(F->getAttributes());
- if (NewG->getReturnType()->isVoidTy()) {
- Builder.CreateRetVoid();
+ if (H->getReturnType()->isVoidTy()) {
+ RI = Builder.CreateRetVoid();
} else {
- Builder.CreateRet(createCast(Builder, CI, NewG->getReturnType()));
+ RI = Builder.CreateRet(createCast(Builder, CI, H->getReturnType()));
}
- NewG->copyAttributesFrom(G);
- NewG->takeName(G);
- removeUsers(G);
- G->replaceAllUsesWith(NewG);
- G->eraseFromParent();
+ if (MergeFunctionsPDI) {
+ DISubprogram *DIS = G->getSubprogram();
+ if (DIS) {
+ DebugLoc CIDbgLoc = DebugLoc::get(DIS->getScopeLine(), 0, DIS);
+ DebugLoc RIDbgLoc = DebugLoc::get(DIS->getScopeLine(), 0, DIS);
+ CI->setDebugLoc(CIDbgLoc);
+ RI->setDebugLoc(RIDbgLoc);
+ } else {
+ DEBUG(dbgs() << "writeThunk: (MergeFunctionsPDI) No DISubprogram for "
+ << G->getName() << "()\n");
+ }
+ eraseTail(G);
+ eraseInstsUnrelatedToPDI(PDIUnrelatedWL);
+ DEBUG(dbgs() << "} // End of parameter related debug info filtering for: "
+ << G->getName() << "()\n");
+ } else {
+ NewG->copyAttributesFrom(G);
+ NewG->takeName(G);
+ removeUsers(G);
+ G->replaceAllUsesWith(NewG);
+ G->eraseFromParent();
+ }
- DEBUG(dbgs() << "writeThunk: " << NewG->getName() << '\n');
+ DEBUG(dbgs() << "writeThunk: " << H->getName() << '\n');
++NumThunksWritten;
}
diff --git a/lib/Transforms/IPO/PartialInlining.cpp b/lib/Transforms/IPO/PartialInlining.cpp
index 7ef3fc1fc2a7..a2f6e5639d9d 100644
--- a/lib/Transforms/IPO/PartialInlining.cpp
+++ b/lib/Transforms/IPO/PartialInlining.cpp
@@ -33,7 +33,7 @@ STATISTIC(NumPartialInlined, "Number of functions partially inlined");
namespace {
struct PartialInlinerImpl {
- PartialInlinerImpl(InlineFunctionInfo IFI) : IFI(IFI) {}
+ PartialInlinerImpl(InlineFunctionInfo IFI) : IFI(std::move(IFI)) {}
bool run(Module &M);
Function *unswitchFunction(Function *F);
diff --git a/lib/Transforms/IPO/PassManagerBuilder.cpp b/lib/Transforms/IPO/PassManagerBuilder.cpp
index 941efb210d1c..f11b58d1adc4 100644
--- a/lib/Transforms/IPO/PassManagerBuilder.cpp
+++ b/lib/Transforms/IPO/PassManagerBuilder.cpp
@@ -93,10 +93,6 @@ static cl::opt<CFLAAType>
clEnumValN(CFLAAType::Both, "both",
"Enable both variants of CFL-AA")));
-static cl::opt<bool>
-EnableMLSM("mlsm", cl::init(true), cl::Hidden,
- cl::desc("Enable motion of merged load and store"));
-
static cl::opt<bool> EnableLoopInterchange(
"enable-loopinterchange", cl::init(false), cl::Hidden,
cl::desc("Enable the new, experimental LoopInterchange Pass"));
@@ -141,8 +137,8 @@ static cl::opt<int> PreInlineThreshold(
"(default = 75)"));
static cl::opt<bool> EnableGVNHoist(
- "enable-gvn-hoist", cl::init(false), cl::Hidden,
- cl::desc("Enable the GVN hoisting pass"));
+ "enable-gvn-hoist", cl::init(true), cl::Hidden,
+ cl::desc("Enable the GVN hoisting pass (default = on)"));
static cl::opt<bool>
DisableLibCallsShrinkWrap("disable-libcalls-shrinkwrap", cl::init(false),
@@ -172,6 +168,7 @@ PassManagerBuilder::PassManagerBuilder() {
PGOInstrUse = RunPGOInstrUse;
PrepareForThinLTO = EnablePrepareForThinLTO;
PerformThinLTO = false;
+ DivergentTarget = false;
}
PassManagerBuilder::~PassManagerBuilder() {
@@ -248,8 +245,6 @@ void PassManagerBuilder::populateFunctionPassManager(
FPM.add(createCFGSimplificationPass());
FPM.add(createSROAPass());
FPM.add(createEarlyCSEPass());
- if(EnableGVNHoist)
- FPM.add(createGVNHoistPass());
FPM.add(createLowerExpectIntrinsicPass());
}
@@ -294,6 +289,8 @@ void PassManagerBuilder::addFunctionSimplificationPasses(
// Break up aggregate allocas, using SSAUpdater.
MPM.add(createSROAPass());
MPM.add(createEarlyCSEPass()); // Catch trivial redundancies
+ if (EnableGVNHoist)
+ MPM.add(createGVNHoistPass());
// Speculative execution if the target has divergent branches; otherwise nop.
MPM.add(createSpeculativeExecutionIfHasBranchDivergencePass());
MPM.add(createJumpThreadingPass()); // Thread jumps.
@@ -305,29 +302,34 @@ void PassManagerBuilder::addFunctionSimplificationPasses(
MPM.add(createLibCallsShrinkWrapPass());
addExtensionsToPM(EP_Peephole, MPM);
+ // Optimize memory intrinsic calls based on the profiled size information.
+ if (SizeLevel == 0)
+ MPM.add(createPGOMemOPSizeOptLegacyPass());
+
MPM.add(createTailCallEliminationPass()); // Eliminate tail calls
MPM.add(createCFGSimplificationPass()); // Merge & remove BBs
MPM.add(createReassociatePass()); // Reassociate expressions
// Rotate Loop - disable header duplication at -Oz
MPM.add(createLoopRotatePass(SizeLevel == 2 ? 0 : -1));
MPM.add(createLICMPass()); // Hoist loop invariants
- MPM.add(createLoopUnswitchPass(SizeLevel || OptLevel < 3));
+ MPM.add(createLoopUnswitchPass(SizeLevel || OptLevel < 3, DivergentTarget));
MPM.add(createCFGSimplificationPass());
addInstructionCombiningPass(MPM);
MPM.add(createIndVarSimplifyPass()); // Canonicalize indvars
MPM.add(createLoopIdiomPass()); // Recognize idioms like memset.
+ addExtensionsToPM(EP_LateLoopOptimizations, MPM);
MPM.add(createLoopDeletionPass()); // Delete dead loops
+
if (EnableLoopInterchange) {
MPM.add(createLoopInterchangePass()); // Interchange loops
MPM.add(createCFGSimplificationPass());
}
if (!DisableUnrollLoops)
- MPM.add(createSimpleLoopUnrollPass()); // Unroll small loops
+ MPM.add(createSimpleLoopUnrollPass(OptLevel)); // Unroll small loops
addExtensionsToPM(EP_LoopOptimizerEnd, MPM);
if (OptLevel > 1) {
- if (EnableMLSM)
- MPM.add(createMergedLoadStoreMotionPass()); // Merge ld/st in diamonds
+ MPM.add(createMergedLoadStoreMotionPass()); // Merge ld/st in diamonds
MPM.add(NewGVN ? createNewGVNPass()
: createGVNPass(DisableGVNLoadPRE)); // Remove redundancies
}
@@ -369,7 +371,7 @@ void PassManagerBuilder::addFunctionSimplificationPasses(
// BBVectorize may have significantly shortened a loop body; unroll again.
if (!DisableUnrollLoops)
- MPM.add(createLoopUnrollPass());
+ MPM.add(createLoopUnrollPass(OptLevel));
}
}
@@ -434,7 +436,16 @@ void PassManagerBuilder::populateModulePassManager(
// earlier in the pass pipeline, here before globalopt. Otherwise imported
// available_externally functions look unreferenced and are removed.
if (PerformThinLTO)
- MPM.add(createPGOIndirectCallPromotionLegacyPass(/*InLTO = */ true));
+ MPM.add(createPGOIndirectCallPromotionLegacyPass(/*InLTO = */ true,
+ !PGOSampleUse.empty()));
+
+ // For SamplePGO in the ThinLTO compile phase, we do not want to unroll
+ // loops, as doing so would change the CFG too much, making the second
+ // profile annotation in the backend more difficult.
+ bool PrepareForThinLTOUsingPGOSampleProfile =
+ PrepareForThinLTO && !PGOSampleUse.empty();
+ if (PrepareForThinLTOUsingPGOSampleProfile)
+ DisableUnrollLoops = true;
if (!DisableUnitAtATime) {
// Infer attributes about declarations if possible.
@@ -454,14 +465,18 @@ void PassManagerBuilder::populateModulePassManager(
MPM.add(createCFGSimplificationPass()); // Clean up after IPCP & DAE
}
- if (!PerformThinLTO) {
+ // For SamplePGO in the ThinLTO compile phase, we do not want to do indirect
+ // call promotion, as it would change the CFG too much, making the second
+ // profile annotation in the backend more difficult.
+ if (!PerformThinLTO && !PrepareForThinLTOUsingPGOSampleProfile) {
/// PGO instrumentation is added during the compile phase for ThinLTO; do
/// not run it a second time.
addPGOInstrPasses(MPM);
// Indirect call promotion that promotes intra-module targets only.
// For ThinLTO this is done earlier due to interactions with globalopt
// for imported functions.
- MPM.add(createPGOIndirectCallPromotionLegacyPass());
+ MPM.add(
+ createPGOIndirectCallPromotionLegacyPass(false, !PGOSampleUse.empty()));
}
if (EnableNonLTOGlobalsModRef)
@@ -589,7 +604,7 @@ void PassManagerBuilder::populateModulePassManager(
MPM.add(createCorrelatedValuePropagationPass());
addInstructionCombiningPass(MPM);
MPM.add(createLICMPass());
- MPM.add(createLoopUnswitchPass(SizeLevel || OptLevel < 3));
+ MPM.add(createLoopUnswitchPass(SizeLevel || OptLevel < 3, DivergentTarget));
MPM.add(createCFGSimplificationPass());
addInstructionCombiningPass(MPM);
}
@@ -615,16 +630,16 @@ void PassManagerBuilder::populateModulePassManager(
// BBVectorize may have significantly shortened a loop body; unroll again.
if (!DisableUnrollLoops)
- MPM.add(createLoopUnrollPass());
+ MPM.add(createLoopUnrollPass(OptLevel));
}
}
addExtensionsToPM(EP_Peephole, MPM);
- MPM.add(createCFGSimplificationPass());
+ MPM.add(createLateCFGSimplificationPass()); // Switches to lookup tables
addInstructionCombiningPass(MPM);
if (!DisableUnrollLoops) {
- MPM.add(createLoopUnrollPass()); // Unroll small loops
+ MPM.add(createLoopUnrollPass(OptLevel)); // Unroll small loops
// LoopUnroll may generate some redundancy to clean up.
addInstructionCombiningPass(MPM);
@@ -684,7 +699,8 @@ void PassManagerBuilder::addLTOOptimizationPasses(legacy::PassManagerBase &PM) {
// left by the earlier promotion pass that promotes intra-module targets.
// This two-step promotion is to save the compile time. For LTO, it should
// produce the same result as if we only do promotion here.
- PM.add(createPGOIndirectCallPromotionLegacyPass(true));
+ PM.add(
+ createPGOIndirectCallPromotionLegacyPass(true, !PGOSampleUse.empty()));
// Propagate constants at call sites into the functions they call. This
// opens opportunities for globalopt (and inlining) by substituting function
@@ -703,7 +719,7 @@ void PassManagerBuilder::addLTOOptimizationPasses(legacy::PassManagerBase &PM) {
PM.add(createGlobalSplitPass());
// Apply whole-program devirtualization and virtual constant propagation.
- PM.add(createWholeProgramDevirtPass());
+ PM.add(createWholeProgramDevirtPass(ExportSummary, nullptr));
// That's all we need at opt level 1.
if (OptLevel == 1)
@@ -759,8 +775,7 @@ void PassManagerBuilder::addLTOOptimizationPasses(legacy::PassManagerBase &PM) {
PM.add(createGlobalsAAWrapperPass()); // IP alias analysis.
PM.add(createLICMPass()); // Hoist loop invariants.
- if (EnableMLSM)
- PM.add(createMergedLoadStoreMotionPass()); // Merge ld/st in diamonds.
+ PM.add(createMergedLoadStoreMotionPass()); // Merge ld/st in diamonds.
PM.add(NewGVN ? createNewGVNPass()
: createGVNPass(DisableGVNLoadPRE)); // Remove redundancies.
PM.add(createMemCpyOptPass()); // Remove dead memcpys.
@@ -775,11 +790,11 @@ void PassManagerBuilder::addLTOOptimizationPasses(legacy::PassManagerBase &PM) {
PM.add(createLoopInterchangePass());
if (!DisableUnrollLoops)
- PM.add(createSimpleLoopUnrollPass()); // Unroll small loops
+ PM.add(createSimpleLoopUnrollPass(OptLevel)); // Unroll small loops
PM.add(createLoopVectorizePass(true, LoopVectorize));
// The vectorizer may have significantly shortened a loop body; unroll again.
if (!DisableUnrollLoops)
- PM.add(createLoopUnrollPass());
+ PM.add(createLoopUnrollPass(OptLevel));
// Now that we've optimized loops (in particular loop induction variables),
// we may have exposed more scalar opportunities. Run parts of the scalar
@@ -833,6 +848,23 @@ void PassManagerBuilder::populateThinLTOPassManager(
if (VerifyInput)
PM.add(createVerifierPass());
+ if (ImportSummary) {
+ // These passes import type identifier resolutions for whole-program
+ // devirtualization and CFI. They must run early because other passes may
+ // disturb the specific instruction patterns that these passes look for,
+ // creating dependencies on resolutions that may not appear in the summary.
+ //
+ // For example, GVN may transform the pattern assume(type.test) appearing in
+ // two basic blocks into assume(phi(type.test, type.test)), which would
+ // transform a dependency on a WPD resolution into a dependency on a type
+ // identifier resolution for CFI.
+ //
+ // Also, WPD has access to more precise information than ICP and can
+ // devirtualize more effectively, so it should operate on the IR first.
+ PM.add(createWholeProgramDevirtPass(nullptr, ImportSummary));
+ PM.add(createLowerTypeTestsPass(nullptr, ImportSummary));
+ }
+
populateModulePassManager(PM);
if (VerifyOutput)
@@ -857,8 +889,7 @@ void PassManagerBuilder::populateLTOPassManager(legacy::PassManagerBase &PM) {
// Lower type metadata and the type.test intrinsic. This pass supports Clang's
// control flow integrity mechanisms (-fsanitize=cfi*) and needs to run at
// link time if CFI is enabled. The pass does nothing if CFI is disabled.
- PM.add(createLowerTypeTestsPass(LowerTypeTestsSummaryAction::None,
- /*Summary=*/nullptr));
+ PM.add(createLowerTypeTestsPass(ExportSummary, nullptr));
if (OptLevel != 0)
addLateLTOOptimizationPasses(PM);
diff --git a/lib/Transforms/IPO/SampleProfile.cpp b/lib/Transforms/IPO/SampleProfile.cpp
index 6a43f8dbac48..3371de6e3d14 100644
--- a/lib/Transforms/IPO/SampleProfile.cpp
+++ b/lib/Transforms/IPO/SampleProfile.cpp
@@ -35,6 +35,7 @@
#include "llvm/IR/DiagnosticInfo.h"
#include "llvm/IR/Dominators.h"
#include "llvm/IR/Function.h"
+#include "llvm/IR/GlobalValue.h"
#include "llvm/IR/InstIterator.h"
#include "llvm/IR/Instructions.h"
#include "llvm/IR/IntrinsicInst.h"
@@ -43,6 +44,7 @@
#include "llvm/IR/Metadata.h"
#include "llvm/IR/Module.h"
#include "llvm/Pass.h"
+#include "llvm/ProfileData/InstrProf.h"
#include "llvm/ProfileData/SampleProfReader.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/Debug.h"
@@ -50,6 +52,7 @@
#include "llvm/Support/Format.h"
#include "llvm/Support/raw_ostream.h"
#include "llvm/Transforms/IPO.h"
+#include "llvm/Transforms/Instrumentation.h"
#include "llvm/Transforms/Utils/Cloning.h"
#include <cctype>
@@ -159,8 +162,11 @@ protected:
ErrorOr<uint64_t> getInstWeight(const Instruction &I);
ErrorOr<uint64_t> getBlockWeight(const BasicBlock *BB);
const FunctionSamples *findCalleeFunctionSamples(const Instruction &I) const;
+ std::vector<const FunctionSamples *>
+ findIndirectCallFunctionSamples(const Instruction &I) const;
const FunctionSamples *findFunctionSamples(const Instruction &I) const;
- bool inlineHotFunctions(Function &F);
+ bool inlineHotFunctions(Function &F,
+ DenseSet<GlobalValue::GUID> &ImportGUIDs);
void printEdgeWeight(raw_ostream &OS, Edge E);
void printBlockWeight(raw_ostream &OS, const BasicBlock *BB) const;
void printBlockEquivalence(raw_ostream &OS, const BasicBlock *BB);
@@ -173,7 +179,7 @@ protected:
void buildEdges(Function &F);
bool propagateThroughEdges(Function &F, bool UpdateBlockCount);
void computeDominanceAndLoopInfo(Function &F);
- unsigned getOffset(unsigned L, unsigned H) const;
+ unsigned getOffset(const DILocation *DIL) const;
void clearFunctionData();
/// \brief Map basic blocks to their computed weights.
@@ -326,11 +332,12 @@ SampleCoverageTracker::countUsedRecords(const FunctionSamples *FS) const {
// If there are inlined callsites in this function, count the samples found
// in the respective bodies. However, do not bother counting callees with 0
// total samples; these are callees that were never invoked at runtime.
- for (const auto &I : FS->getCallsiteSamples()) {
- const FunctionSamples *CalleeSamples = &I.second;
- if (callsiteIsHot(FS, CalleeSamples))
- Count += countUsedRecords(CalleeSamples);
- }
+ for (const auto &I : FS->getCallsiteSamples())
+ for (const auto &J : I.second) {
+ const FunctionSamples *CalleeSamples = &J.second;
+ if (callsiteIsHot(FS, CalleeSamples))
+ Count += countUsedRecords(CalleeSamples);
+ }
return Count;
}
@@ -343,11 +350,12 @@ SampleCoverageTracker::countBodyRecords(const FunctionSamples *FS) const {
unsigned Count = FS->getBodySamples().size();
// Only count records in hot callsites.
- for (const auto &I : FS->getCallsiteSamples()) {
- const FunctionSamples *CalleeSamples = &I.second;
- if (callsiteIsHot(FS, CalleeSamples))
- Count += countBodyRecords(CalleeSamples);
- }
+ for (const auto &I : FS->getCallsiteSamples())
+ for (const auto &J : I.second) {
+ const FunctionSamples *CalleeSamples = &J.second;
+ if (callsiteIsHot(FS, CalleeSamples))
+ Count += countBodyRecords(CalleeSamples);
+ }
return Count;
}
@@ -362,11 +370,12 @@ SampleCoverageTracker::countBodySamples(const FunctionSamples *FS) const {
Total += I.second.getSamples();
// Only count samples in hot callsites.
- for (const auto &I : FS->getCallsiteSamples()) {
- const FunctionSamples *CalleeSamples = &I.second;
- if (callsiteIsHot(FS, CalleeSamples))
- Total += countBodySamples(CalleeSamples);
- }
+ for (const auto &I : FS->getCallsiteSamples())
+ for (const auto &J : I.second) {
+ const FunctionSamples *CalleeSamples = &J.second;
+ if (callsiteIsHot(FS, CalleeSamples))
+ Total += countBodySamples(CalleeSamples);
+ }
return Total;
}
@@ -398,15 +407,11 @@ void SampleProfileLoader::clearFunctionData() {
CoverageTracker.clear();
}
-/// \brief Returns the offset of lineno \p L to head_lineno \p H
-///
-/// \param L Lineno
-/// \param H Header lineno of the function
-///
-/// \returns offset to the header lineno. 16 bits are used to represent offset.
+/// Returns the line offset to the start line of the subprogram.
/// We assume that a single function will not exceed 65535 LOC.
-unsigned SampleProfileLoader::getOffset(unsigned L, unsigned H) const {
- return (L - H) & 0xffff;
+unsigned SampleProfileLoader::getOffset(const DILocation *DIL) const {
+ return (DIL->getLine() - DIL->getScope()->getSubprogram()->getLine()) &
+ 0xffff;
}
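
The offset is the instruction's line minus the subprogram's start line, masked to 16 bits to match the profile encoding. A quick standalone check of the arithmetic:

// Line-offset sketch: profiles key samples by (line - function start line)
// masked to 16 bits, per the 65535-LOC assumption above.
#include <cstdint>
#include <iostream>

uint32_t getOffset(uint32_t InstLine, uint32_t SubprogramLine) {
  return (InstLine - SubprogramLine) & 0xffff;
}

int main() {
  std::cout << getOffset(120, 100) << '\n'; // 20
  // Unsigned wraparound keeps the mask well-defined even if debug info
  // places an instruction "before" the subprogram's start line.
  std::cout << getOffset(100, 120) << '\n'; // 65516
}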
/// \brief Print the weight of edge \p E on stream \p OS.
@@ -451,8 +456,7 @@ void SampleProfileLoader::printBlockWeight(raw_ostream &OS,
/// \param Inst Instruction to query.
///
/// \returns the weight of \p Inst.
-ErrorOr<uint64_t>
-SampleProfileLoader::getInstWeight(const Instruction &Inst) {
+ErrorOr<uint64_t> SampleProfileLoader::getInstWeight(const Instruction &Inst) {
const DebugLoc &DLoc = Inst.getDebugLoc();
if (!DLoc)
return std::error_code();
@@ -470,19 +474,14 @@ SampleProfileLoader::getInstWeight(const Instruction &Inst) {
// If a call/invoke instruction is inlined in profile, but not inlined here,
// it means that the inlined callsite has no sample, thus the call
// instruction should have 0 count.
- bool IsCall = isa<CallInst>(Inst) || isa<InvokeInst>(Inst);
- if (IsCall && findCalleeFunctionSamples(Inst))
+ if ((isa<CallInst>(Inst) || isa<InvokeInst>(Inst)) &&
+ findCalleeFunctionSamples(Inst))
return 0;
const DILocation *DIL = DLoc;
- unsigned Lineno = DLoc.getLine();
- unsigned HeaderLineno = DIL->getScope()->getSubprogram()->getLine();
-
- uint32_t LineOffset = getOffset(Lineno, HeaderLineno);
- uint32_t Discriminator = DIL->getDiscriminator();
- ErrorOr<uint64_t> R = IsCall
- ? FS->findCallSamplesAt(LineOffset, Discriminator)
- : FS->findSamplesAt(LineOffset, Discriminator);
+ uint32_t LineOffset = getOffset(DIL);
+ uint32_t Discriminator = DIL->getBaseDiscriminator();
+ ErrorOr<uint64_t> R = FS->findSamplesAt(LineOffset, Discriminator);
if (R) {
bool FirstMark =
CoverageTracker.markSamplesUsed(FS, LineOffset, Discriminator, R.get());
@@ -491,13 +490,14 @@ SampleProfileLoader::getInstWeight(const Instruction &Inst) {
LLVMContext &Ctx = F->getContext();
emitOptimizationRemark(
Ctx, DEBUG_TYPE, *F, DLoc,
- Twine("Applied ") + Twine(*R) + " samples from profile (offset: " +
- Twine(LineOffset) +
+ Twine("Applied ") + Twine(*R) +
+ " samples from profile (offset: " + Twine(LineOffset) +
((Discriminator) ? Twine(".") + Twine(Discriminator) : "") + ")");
}
- DEBUG(dbgs() << " " << Lineno << "." << DIL->getDiscriminator() << ":"
- << Inst << " (line offset: " << Lineno - HeaderLineno << "."
- << DIL->getDiscriminator() << " - weight: " << R.get()
+ DEBUG(dbgs() << " " << DLoc.getLine() << "."
+ << DIL->getBaseDiscriminator() << ":" << Inst
+ << " (line offset: " << LineOffset << "."
+ << DIL->getBaseDiscriminator() << " - weight: " << R.get()
<< ")\n");
}
return R;
@@ -511,8 +511,7 @@ SampleProfileLoader::getInstWeight(const Instruction &Inst) {
/// \param BB The basic block to query.
///
/// \returns the weight for \p BB.
-ErrorOr<uint64_t>
-SampleProfileLoader::getBlockWeight(const BasicBlock *BB) {
+ErrorOr<uint64_t> SampleProfileLoader::getBlockWeight(const BasicBlock *BB) {
uint64_t Max = 0;
bool HasWeight = false;
for (auto &I : BB->getInstList()) {
@@ -565,16 +564,49 @@ SampleProfileLoader::findCalleeFunctionSamples(const Instruction &Inst) const {
if (!DIL) {
return nullptr;
}
- DISubprogram *SP = DIL->getScope()->getSubprogram();
- if (!SP)
- return nullptr;
+
+ StringRef CalleeName;
+ if (const CallInst *CI = dyn_cast<CallInst>(&Inst))
+ if (Function *Callee = CI->getCalledFunction())
+ CalleeName = Callee->getName();
const FunctionSamples *FS = findFunctionSamples(Inst);
if (FS == nullptr)
return nullptr;
- return FS->findFunctionSamplesAt(LineLocation(
- getOffset(DIL->getLine(), SP->getLine()), DIL->getDiscriminator()));
+ return FS->findFunctionSamplesAt(
+ LineLocation(getOffset(DIL), DIL->getBaseDiscriminator()), CalleeName);
+}
+
+/// Returns a vector of FunctionSamples that are the indirect call targets
+/// of \p Inst. The vector is sorted in descending order of total sample count.
+std::vector<const FunctionSamples *>
+SampleProfileLoader::findIndirectCallFunctionSamples(
+ const Instruction &Inst) const {
+ const DILocation *DIL = Inst.getDebugLoc();
+ std::vector<const FunctionSamples *> R;
+
+ if (!DIL) {
+ return R;
+ }
+
+ const FunctionSamples *FS = findFunctionSamples(Inst);
+ if (FS == nullptr)
+ return R;
+
+ if (const FunctionSamplesMap *M = FS->findFunctionSamplesMapAt(
+ LineLocation(getOffset(DIL), DIL->getBaseDiscriminator()))) {
+ if (M->size() == 0)
+ return R;
+ for (const auto &NameFS : *M) {
+ R.push_back(&NameFS.second);
+ }
+ std::sort(R.begin(), R.end(),
+ [](const FunctionSamples *L, const FunctionSamples *R) {
+ return L->getTotalSamples() > R->getTotalSamples();
+ });
+ }
+ return R;
}
/// \brief Get the FunctionSamples for an instruction.
@@ -588,23 +620,23 @@ SampleProfileLoader::findCalleeFunctionSamples(const Instruction &Inst) const {
/// \returns the FunctionSamples pointer to the inlined instance.
const FunctionSamples *
SampleProfileLoader::findFunctionSamples(const Instruction &Inst) const {
- SmallVector<LineLocation, 10> S;
+ SmallVector<std::pair<LineLocation, StringRef>, 10> S;
const DILocation *DIL = Inst.getDebugLoc();
- if (!DIL) {
+ if (!DIL)
return Samples;
- }
+
+ const DILocation *PrevDIL = DIL;
for (DIL = DIL->getInlinedAt(); DIL; DIL = DIL->getInlinedAt()) {
- DISubprogram *SP = DIL->getScope()->getSubprogram();
- if (!SP)
- return nullptr;
- S.push_back(LineLocation(getOffset(DIL->getLine(), SP->getLine()),
- DIL->getDiscriminator()));
+ S.push_back(std::make_pair(
+ LineLocation(getOffset(DIL), DIL->getBaseDiscriminator()),
+ PrevDIL->getScope()->getSubprogram()->getLinkageName()));
+ PrevDIL = DIL;
}
if (S.size() == 0)
return Samples;
const FunctionSamples *FS = Samples;
for (int i = S.size() - 1; i >= 0 && FS != nullptr; i--) {
- FS = FS->findFunctionSamplesAt(S[i]);
+ FS = FS->findFunctionSamplesAt(S[i].first, S[i].second);
}
return FS;
}
@@ -614,14 +646,17 @@ SampleProfileLoader::findFunctionSamples(const Instruction &Inst) const {
/// Iteratively traverse all callsites of the function \p F, and find if
/// the corresponding inlined instance exists and is hot in profile. If
/// it is hot enough, inline the callsites and add new callsites of the
-/// callee into the caller.
-///
-/// TODO: investigate the possibility of not invoking InlineFunction directly.
+/// callee into the caller. If the call is an indirect call, first promote
+/// it to a direct call. Each indirect call is limited to a single target.
///
/// \param F function to perform iterative inlining.
+/// \param ImportGUIDs a set to be updated to include all GUIDs that come
+/// from a different module but are inlined in the profiled binary.
///
/// \returns True if there is any inline happened.
-bool SampleProfileLoader::inlineHotFunctions(Function &F) {
+bool SampleProfileLoader::inlineHotFunctions(
+ Function &F, DenseSet<GlobalValue::GUID> &ImportGUIDs) {
+ DenseSet<Instruction *> PromotedInsns;
bool Changed = false;
LLVMContext &Ctx = F.getContext();
std::function<AssumptionCache &(Function &)> GetAssumptionCache = [&](
@@ -647,18 +682,42 @@ bool SampleProfileLoader::inlineHotFunctions(Function &F) {
}
for (auto I : CIS) {
InlineFunctionInfo IFI(nullptr, ACT ? &GetAssumptionCache : nullptr);
- CallSite CS(I);
- Function *CalledFunction = CS.getCalledFunction();
- if (!CalledFunction || !CalledFunction->getSubprogram())
+ Function *CalledFunction = CallSite(I).getCalledFunction();
+ Instruction *DI = I;
+ if (!CalledFunction && !PromotedInsns.count(I) &&
+ CallSite(I).isIndirectCall())
+ for (const auto *FS : findIndirectCallFunctionSamples(*I)) {
+ auto CalleeFunctionName = FS->getName();
+ const char *Reason = "Callee function not available";
+ CalledFunction = F.getParent()->getFunction(CalleeFunctionName);
+ if (CalledFunction && isLegalToPromote(I, CalledFunction, &Reason)) {
+ // The indirect target was promoted and inlined in the profile; as a
+ // result, we do not have profile info for the branch probability.
+ // We set the probability to 80% taken to indicate that the static
+ // call is likely taken.
+ DI = dyn_cast<Instruction>(
+ promoteIndirectCall(I, CalledFunction, 80, 100, false)
+ ->stripPointerCasts());
+ PromotedInsns.insert(I);
+ } else {
+ DEBUG(dbgs() << "\nFailed to promote indirect call to "
+ << CalleeFunctionName << " because " << Reason
+ << "\n");
+ continue;
+ }
+ }
+ if (!CalledFunction || !CalledFunction->getSubprogram()) {
+ findCalleeFunctionSamples(*I)->findImportedFunctions(
+ ImportGUIDs, F.getParent(),
+ Samples->getTotalSamples() * SampleProfileHotThreshold / 100);
continue;
+ }
DebugLoc DLoc = I->getDebugLoc();
- uint64_t NumSamples = findCalleeFunctionSamples(*I)->getTotalSamples();
- if (InlineFunction(CS, IFI)) {
+ if (InlineFunction(CallSite(DI), IFI)) {
LocalChanged = true;
emitOptimizationRemark(Ctx, DEBUG_TYPE, F, DLoc,
Twine("inlined hot callee '") +
- CalledFunction->getName() + "' with " +
- Twine(NumSamples) + " samples into '" +
+ CalledFunction->getName() + "' into '" +
F.getName() + "'");
}
}
@@ -994,6 +1053,26 @@ void SampleProfileLoader::buildEdges(Function &F) {
}
}
+/// Sorts the CallTargetMap \p M by count in descending order and stores the
+/// sorted result in \p Sorted. Returns the sum of the counts.
+static uint64_t SortCallTargets(SmallVector<InstrProfValueData, 2> &Sorted,
+ const SampleRecord::CallTargetMap &M) {
+ Sorted.clear();
+ uint64_t Sum = 0;
+ for (auto I = M.begin(); I != M.end(); ++I) {
+ Sum += I->getValue();
+ Sorted.push_back({Function::getGUID(I->getKey()), I->getValue()});
+ }
+ std::sort(Sorted.begin(), Sorted.end(),
+ [](const InstrProfValueData &L, const InstrProfValueData &R) {
+ if (L.Count == R.Count)
+ return L.Value > R.Value;
+ else
+ return L.Count > R.Count;
+ });
+ return Sum;
+}
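
The comparator sorts by descending count and breaks ties on the GUID, also descending, so the resulting order is deterministic across runs. A self-contained sketch using the same comparator shape (the struct below is a stand-in for InstrProfValueData):

// Sketch of SortCallTargets' ordering on a stand-in for InstrProfValueData.
#include <algorithm>
#include <cstdint>
#include <iostream>
#include <vector>

struct ValueData {
  uint64_t Value; // target GUID
  uint64_t Count; // sample count
};

int main() {
  std::vector<ValueData> Targets = {{7, 50}, {9, 50}, {3, 200}};
  uint64_t Sum = 0;
  for (const ValueData &V : Targets)
    Sum += V.Count;
  std::sort(Targets.begin(), Targets.end(),
            [](const ValueData &L, const ValueData &R) {
              if (L.Count == R.Count)
                return L.Value > R.Value; // deterministic tie-break
              return L.Count > R.Count;   // hottest target first
            });
  for (const ValueData &V : Targets)
    std::cout << V.Value << ": " << V.Count << '\n'; // 3:200, 9:50, 7:50
  std::cout << "total = " << Sum << '\n';            // 300
}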
+
/// \brief Propagate weights into edges
///
/// The following rules are applied to every block BB in the CFG:
@@ -1015,10 +1094,6 @@ void SampleProfileLoader::propagateWeights(Function &F) {
bool Changed = true;
unsigned I = 0;
- // Add an entry count to the function using the samples gathered
- // at the function entry.
- F.setEntryCount(Samples->getHeadSamples() + 1);
-
// If BB weight is larger than its corresponding loop's header BB weight,
// use the BB weight to replace the loop header BB weight.
for (auto &BI : F) {
@@ -1071,13 +1146,32 @@ void SampleProfileLoader::propagateWeights(Function &F) {
if (BlockWeights[BB]) {
for (auto &I : BB->getInstList()) {
- if (CallInst *CI = dyn_cast<CallInst>(&I)) {
- if (!dyn_cast<IntrinsicInst>(&I)) {
- SmallVector<uint32_t, 1> Weights;
- Weights.push_back(BlockWeights[BB]);
- CI->setMetadata(LLVMContext::MD_prof,
- MDB.createBranchWeights(Weights));
- }
+ if (!isa<CallInst>(I) && !isa<InvokeInst>(I))
+ continue;
+ CallSite CS(&I);
+ if (!CS.getCalledFunction()) {
+ const DebugLoc &DLoc = I.getDebugLoc();
+ if (!DLoc)
+ continue;
+ const DILocation *DIL = DLoc;
+ uint32_t LineOffset = getOffset(DIL);
+ uint32_t Discriminator = DIL->getBaseDiscriminator();
+
+ const FunctionSamples *FS = findFunctionSamples(I);
+ if (!FS)
+ continue;
+ auto T = FS->findCallTargetMapAt(LineOffset, Discriminator);
+ if (!T || T.get().size() == 0)
+ continue;
+ SmallVector<InstrProfValueData, 2> SortedCallTargets;
+ uint64_t Sum = SortCallTargets(SortedCallTargets, T.get());
+ annotateValueSite(*I.getParent()->getParent()->getParent(), I,
+ SortedCallTargets, Sum, IPVK_IndirectCallTarget,
+ SortedCallTargets.size());
+ } else if (!dyn_cast<IntrinsicInst>(&I)) {
+ SmallVector<uint32_t, 1> Weights;
+ Weights.push_back(BlockWeights[BB]);
+ I.setMetadata(LLVMContext::MD_prof, MDB.createBranchWeights(Weights));
}
}
}
@@ -1115,9 +1209,13 @@ void SampleProfileLoader::propagateWeights(Function &F) {
}
}
+ uint64_t TempWeight;
// Only set weights if there is at least one non-zero weight.
// In any other case, let the analyzer set weights.
- if (MaxWeight > 0) {
+ // Do not set weights if weights are already present. In ThinLTO, the
+ // profile annotation is done twice. If the first annotation already set
+ // the weights, the second pass does not need to set them.
+ if (MaxWeight > 0 && !TI->extractProfTotalWeight(TempWeight)) {
DEBUG(dbgs() << "SUCCESS. Found non-zero weights.\n");
TI->setMetadata(llvm::LLVMContext::MD_prof,
MDB.createBranchWeights(Weights));
@@ -1228,12 +1326,19 @@ bool SampleProfileLoader::emitAnnotations(Function &F) {
DEBUG(dbgs() << "Line number for the first instruction in " << F.getName()
<< ": " << getFunctionLoc(F) << "\n");
- Changed |= inlineHotFunctions(F);
+ DenseSet<GlobalValue::GUID> ImportGUIDs;
+ Changed |= inlineHotFunctions(F, ImportGUIDs);
// Compute basic block weights.
Changed |= computeBlockWeights(F);
if (Changed) {
+ // Add an entry count to the function using the samples gathered at the
+ // function entry. Also set the GUIDs that come from a different
+ // module but are inlined in the profiled binary. This aims to make
+ // the IR match the profiled binary before annotation.
+ F.setEntryCount(Samples->getHeadSamples() + 1, &ImportGUIDs);
+
// Compute dominance and loop info needed for propagation.
computeDominanceAndLoopInfo(F);
@@ -1329,7 +1434,7 @@ bool SampleProfileLoaderLegacyPass::runOnModule(Module &M) {
bool SampleProfileLoader::runOnFunction(Function &F) {
F.setEntryCount(0);
Samples = Reader->getSamplesFor(F);
- if (!Samples->empty())
+ if (Samples && !Samples->empty())
return emitAnnotations(F);
return false;
}
diff --git a/lib/Transforms/IPO/StripSymbols.cpp b/lib/Transforms/IPO/StripSymbols.cpp
index 8f6f161428e8..fb64367eef91 100644
--- a/lib/Transforms/IPO/StripSymbols.cpp
+++ b/lib/Transforms/IPO/StripSymbols.cpp
@@ -323,6 +323,14 @@ bool StripDeadDebugInfo::runOnModule(Module &M) {
LiveGVs.insert(GVE);
}
+ std::set<DICompileUnit *> LiveCUs;
+ // Any CU referenced from a subprogram is live.
+ for (DISubprogram *SP : F.subprograms()) {
+ if (SP->getUnit())
+ LiveCUs.insert(SP->getUnit());
+ }
+
+ bool HasDeadCUs = false;
for (DICompileUnit *DIC : F.compile_units()) {
// Create our live global variable list.
bool GlobalVariableChange = false;
@@ -341,6 +349,11 @@ bool StripDeadDebugInfo::runOnModule(Module &M) {
GlobalVariableChange = true;
}
+ if (!LiveGlobalVariables.empty())
+ LiveCUs.insert(DIC);
+ else if (!LiveCUs.count(DIC))
+ HasDeadCUs = true;
+
// If we found dead global variables, replace the current global
// variable list with our new live global variable list.
if (GlobalVariableChange) {
@@ -352,5 +365,16 @@ bool StripDeadDebugInfo::runOnModule(Module &M) {
LiveGlobalVariables.clear();
}
+ if (HasDeadCUs) {
+ // Delete the old node and replace it with a new one.
+ NamedMDNode *NMD = M.getOrInsertNamedMetadata("llvm.dbg.cu");
+ NMD->clearOperands();
+ if (!LiveCUs.empty()) {
+ for (DICompileUnit *CU : LiveCUs)
+ NMD->addOperand(CU);
+ }
+ Changed = true;
+ }
+
return Changed;
}
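
The liveness rule has two sources: any CU referenced from a subprogram is live up front, and a CU that still owns live global variables becomes live during the walk; only if some CU ends up in neither set is llvm.dbg.cu rebuilt. A small sketch of that computation:

// Sketch: compute the set of live compile units from two liveness sources.
// CU names are hypothetical.
#include <iostream>
#include <set>
#include <string>
#include <utility>
#include <vector>

int main() {
  std::set<std::string> LiveCUs;
  // Source 1: CUs referenced from subprograms.
  std::vector<std::string> SubprogramUnits = {"cu.a", "cu.b"};
  for (const std::string &CU : SubprogramUnits)
    LiveCUs.insert(CU);
  // Source 2: CUs that still own live global variables.
  std::vector<std::pair<std::string, bool>> AllCUs = {
      {"cu.a", false}, {"cu.b", true}, {"cu.dead", false}};
  bool HasDeadCUs = false;
  for (const auto &CU : AllCUs) {
    if (CU.second)
      LiveCUs.insert(CU.first);
    else if (!LiveCUs.count(CU.first))
      HasDeadCUs = true;
  }
  if (HasDeadCUs) // rebuild the list with only live CUs
    for (const std::string &CU : LiveCUs)
      std::cout << CU << '\n'; // cu.a, cu.b
}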
diff --git a/lib/Transforms/IPO/ThinLTOBitcodeWriter.cpp b/lib/Transforms/IPO/ThinLTOBitcodeWriter.cpp
index 3680cfc813a1..65deb82cd2a5 100644
--- a/lib/Transforms/IPO/ThinLTOBitcodeWriter.cpp
+++ b/lib/Transforms/IPO/ThinLTOBitcodeWriter.cpp
@@ -14,16 +14,21 @@
//
//===----------------------------------------------------------------------===//
-#include "llvm/Transforms/IPO.h"
+#include "llvm/Analysis/BasicAliasAnalysis.h"
#include "llvm/Analysis/ModuleSummaryAnalysis.h"
#include "llvm/Analysis/TypeMetadataUtils.h"
#include "llvm/Bitcode/BitcodeWriter.h"
#include "llvm/IR/Constants.h"
+#include "llvm/IR/DebugInfo.h"
#include "llvm/IR/Intrinsics.h"
#include "llvm/IR/Module.h"
#include "llvm/IR/PassManager.h"
#include "llvm/Pass.h"
+#include "llvm/Support/FileSystem.h"
#include "llvm/Support/ScopedPrinter.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/Transforms/IPO.h"
+#include "llvm/Transforms/IPO/FunctionAttrs.h"
#include "llvm/Transforms/Utils/Cloning.h"
using namespace llvm;
@@ -41,23 +46,14 @@ namespace {
std::string getModuleId(Module *M) {
MD5 Md5;
bool ExportsSymbols = false;
- auto AddGlobal = [&](GlobalValue &GV) {
+ for (auto &GV : M->global_values()) {
if (GV.isDeclaration() || GV.getName().startswith("llvm.") ||
!GV.hasExternalLinkage())
- return;
+ continue;
ExportsSymbols = true;
Md5.update(GV.getName());
Md5.update(ArrayRef<uint8_t>{0});
- };
-
- for (auto &F : *M)
- AddGlobal(F);
- for (auto &GV : M->globals())
- AddGlobal(GV);
- for (auto &GA : M->aliases())
- AddGlobal(GA);
- for (auto &IF : M->ifuncs())
- AddGlobal(IF);
+ }
if (!ExportsSymbols)
return "";
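
Each contributing name is followed by a NUL byte in the hash input, so {"ab", "c"} and {"a", "bc"} produce different module ids, and a module exporting no symbols gets an empty id, which the caller treats as "write a regular LTO module". A sketch of the scheme, with std::hash standing in for the MD5 used by the real code:

// Module-id sketch: hash NUL-separated exported names; an empty result means
// "no exports". std::hash stands in for MD5 purely for illustration.
#include <functional>
#include <iostream>
#include <string>
#include <vector>

std::string getModuleId(const std::vector<std::string> &ExportedNames) {
  if (ExportedNames.empty())
    return ""; // no exports: caller writes a regular LTO module instead
  std::string Buf;
  for (const std::string &Name : ExportedNames) {
    Buf += Name;
    Buf += '\0'; // separator prevents concatenation collisions
  }
  return "$" + std::to_string(std::hash<std::string>{}(Buf));
}

int main() {
  std::cout << getModuleId({"foo", "bar"}) << '\n';
  std::cout << (getModuleId({}) == "") << '\n'; // 1
}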
@@ -73,15 +69,21 @@ std::string getModuleId(Module *M) {
// Promote each local-linkage entity defined by ExportM and used by ImportM by
// changing visibility and appending the given ModuleId.
void promoteInternals(Module &ExportM, Module &ImportM, StringRef ModuleId) {
- auto PromoteInternal = [&](GlobalValue &ExportGV) {
+ DenseMap<const Comdat *, Comdat *> RenamedComdats;
+ for (auto &ExportGV : ExportM.global_values()) {
if (!ExportGV.hasLocalLinkage())
- return;
+ continue;
- GlobalValue *ImportGV = ImportM.getNamedValue(ExportGV.getName());
+ auto Name = ExportGV.getName();
+ GlobalValue *ImportGV = ImportM.getNamedValue(Name);
if (!ImportGV || ImportGV->use_empty())
- return;
+ continue;
+
+ std::string NewName = (Name + ModuleId).str();
- std::string NewName = (ExportGV.getName() + ModuleId).str();
+ if (const auto *C = ExportGV.getComdat())
+ if (C->getName() == Name)
+ RenamedComdats.try_emplace(C, ExportM.getOrInsertComdat(NewName));
ExportGV.setName(NewName);
ExportGV.setLinkage(GlobalValue::ExternalLinkage);
@@ -89,16 +91,15 @@ void promoteInternals(Module &ExportM, Module &ImportM, StringRef ModuleId) {
ImportGV->setName(NewName);
ImportGV->setVisibility(GlobalValue::HiddenVisibility);
- };
+ }
- for (auto &F : ExportM)
- PromoteInternal(F);
- for (auto &GV : ExportM.globals())
- PromoteInternal(GV);
- for (auto &GA : ExportM.aliases())
- PromoteInternal(GA);
- for (auto &IF : ExportM.ifuncs())
- PromoteInternal(IF);
+ if (!RenamedComdats.empty())
+ for (auto &GO : ExportM.global_objects())
+ if (auto *C = GO.getComdat()) {
+ auto Replacement = RenamedComdats.find(C);
+ if (Replacement != RenamedComdats.end())
+ GO.setComdat(Replacement->second);
+ }
}
// Promote all internal (i.e. distinct) type ids used by the module by replacing
@@ -194,24 +195,7 @@ void simplifyExternals(Module &M) {
}
void filterModule(
- Module *M, std::function<bool(const GlobalValue *)> ShouldKeepDefinition) {
- for (Function &F : *M) {
- if (ShouldKeepDefinition(&F))
- continue;
-
- F.deleteBody();
- F.clearMetadata();
- }
-
- for (GlobalVariable &GV : M->globals()) {
- if (ShouldKeepDefinition(&GV))
- continue;
-
- GV.setInitializer(nullptr);
- GV.setLinkage(GlobalValue::ExternalLinkage);
- GV.clearMetadata();
- }
-
+ Module *M, function_ref<bool(const GlobalValue *)> ShouldKeepDefinition) {
for (Module::alias_iterator I = M->alias_begin(), E = M->alias_end();
I != E;) {
GlobalAlias *GA = &*I++;
@@ -219,7 +203,7 @@ void filterModule(
continue;
GlobalObject *GO;
- if (I->getValueType()->isFunctionTy())
+ if (GA->getValueType()->isFunctionTy())
GO = Function::Create(cast<FunctionType>(GA->getValueType()),
GlobalValue::ExternalLinkage, "", M);
else
@@ -231,53 +215,168 @@ void filterModule(
GA->replaceAllUsesWith(GO);
GA->eraseFromParent();
}
+
+ for (Function &F : *M) {
+ if (ShouldKeepDefinition(&F))
+ continue;
+
+ F.deleteBody();
+ F.setComdat(nullptr);
+ F.clearMetadata();
+ }
+
+ for (GlobalVariable &GV : M->globals()) {
+ if (ShouldKeepDefinition(&GV))
+ continue;
+
+ GV.setInitializer(nullptr);
+ GV.setLinkage(GlobalValue::ExternalLinkage);
+ GV.setComdat(nullptr);
+ GV.clearMetadata();
+ }
+}
+
+void forEachVirtualFunction(Constant *C, function_ref<void(Function *)> Fn) {
+ if (auto *F = dyn_cast<Function>(C))
+ return Fn(F);
+ if (isa<GlobalValue>(C))
+ return;
+ for (Value *Op : C->operands())
+ forEachVirtualFunction(cast<Constant>(Op), Fn);
}
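A minimal usage sketch for this helper (the variable names are illustrative, not from the patch): the recursion descends through nested constant aggregates, invokes Fn once per Function it reaches, and stops at any other GlobalValue so that referenced globals are not traversed.

  // Illustrative only: enumerate the virtual functions named by a vtable's
  // initializer. VTableGV is assumed to be a defined GlobalVariable.
  forEachVirtualFunction(VTableGV->getInitializer(), [&](Function *F) {
    errs() << "virtual function: " << F->getName() << "\n";
  });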
// If it's possible to split M into regular and thin LTO parts, do so and write
// a multi-module bitcode file with the two parts to OS. Otherwise, write only a
// regular LTO bitcode file to OS.
-void splitAndWriteThinLTOBitcode(raw_ostream &OS, Module &M) {
+void splitAndWriteThinLTOBitcode(
+ raw_ostream &OS, raw_ostream *ThinLinkOS,
+ function_ref<AAResults &(Function &)> AARGetter, Module &M) {
std::string ModuleId = getModuleId(&M);
if (ModuleId.empty()) {
// We couldn't generate a module ID for this module, so just write it out as a
// regular LTO module.
WriteBitcodeToFile(&M, OS);
+ if (ThinLinkOS)
+ // We don't have a ThinLTO part, but still write the module to the
+ // ThinLinkOS if requested so that the expected output file is produced.
+ WriteBitcodeToFile(&M, *ThinLinkOS);
return;
}
promoteTypeIds(M, ModuleId);
- auto IsInMergedM = [&](const GlobalValue *GV) {
- auto *GVar = dyn_cast<GlobalVariable>(GV->getBaseObject());
- if (!GVar)
- return false;
-
+ // Returns whether a global has attached type metadata. Such globals may
+ // participate in CFI or whole-program devirtualization, so they need to
+ // appear in the merged module instead of the thin LTO module.
+ auto HasTypeMetadata = [&](const GlobalObject *GO) {
SmallVector<MDNode *, 1> MDs;
- GVar->getMetadata(LLVMContext::MD_type, MDs);
+ GO->getMetadata(LLVMContext::MD_type, MDs);
return !MDs.empty();
};
+ // Collect the set of virtual functions that are eligible for virtual constant
+ // propagation. Each eligible function must not access memory, must return
+ // an integer of width <=64 bits, must take at least one argument, must not
+ // use its first argument (assumed to be "this"), and all arguments other
+ // than the first must be integers of width <=64 bits.
+ //
+ // Note that we test whether this copy of the function is readnone, rather
+ // than testing function attributes, which must hold for any copy of the
+ // function, even a less optimized version substituted at link time. This is
+ // sound because the virtual constant propagation optimizations effectively
+ // inline all implementations of the virtual function into each call site,
+ // rather than using function attributes to perform local optimization.
+ std::set<const Function *> EligibleVirtualFns;
+ // If any member of a comdat lives in MergedM, put all members of that
+ // comdat in MergedM to keep the comdat together.
+ DenseSet<const Comdat *> MergedMComdats;
+ for (GlobalVariable &GV : M.globals())
+ if (HasTypeMetadata(&GV)) {
+ if (const auto *C = GV.getComdat())
+ MergedMComdats.insert(C);
+ forEachVirtualFunction(GV.getInitializer(), [&](Function *F) {
+ auto *RT = dyn_cast<IntegerType>(F->getReturnType());
+ if (!RT || RT->getBitWidth() > 64 || F->arg_empty() ||
+ !F->arg_begin()->use_empty())
+ return;
+ for (auto &Arg : make_range(std::next(F->arg_begin()), F->arg_end())) {
+ auto *ArgT = dyn_cast<IntegerType>(Arg.getType());
+ if (!ArgT || ArgT->getBitWidth() > 64)
+ return;
+ }
+ if (computeFunctionBodyMemoryAccess(*F, AARGetter(*F)) == MAK_ReadNone)
+ EligibleVirtualFns.insert(F);
+ });
+ }
+
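For instance (hypothetical signatures), a readnone function of type i32 (%struct.A*, i32) whose body never touches its first argument is eligible under the rules above; a function returning i128, taking a pointer-typed second argument, or reading memory is not.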
ValueToValueMapTy VMap;
- std::unique_ptr<Module> MergedM(CloneModule(&M, VMap, IsInMergedM));
+ std::unique_ptr<Module> MergedM(
+ CloneModule(&M, VMap, [&](const GlobalValue *GV) -> bool {
+ if (const auto *C = GV->getComdat())
+ if (MergedMComdats.count(C))
+ return true;
+ if (auto *F = dyn_cast<Function>(GV))
+ return EligibleVirtualFns.count(F);
+ if (auto *GVar = dyn_cast_or_null<GlobalVariable>(GV->getBaseObject()))
+ return HasTypeMetadata(GVar);
+ return false;
+ }));
+ StripDebugInfo(*MergedM);
+
+ for (Function &F : *MergedM)
+ if (!F.isDeclaration()) {
+ // Reset the linkage of all functions eligible for virtual constant
+ // propagation. The canonical definitions live in the thin LTO module so
+ // that they can be imported.
+ F.setLinkage(GlobalValue::AvailableExternallyLinkage);
+ F.setComdat(nullptr);
+ }
- filterModule(&M, [&](const GlobalValue *GV) { return !IsInMergedM(GV); });
+ // Remove all globals with type metadata, globals with comdats that live in
+ // MergedM, and aliases pointing to such globals from the thin LTO module.
+ filterModule(&M, [&](const GlobalValue *GV) {
+ if (auto *GVar = dyn_cast_or_null<GlobalVariable>(GV->getBaseObject()))
+ if (HasTypeMetadata(GVar))
+ return false;
+ if (const auto *C = GV->getComdat())
+ if (MergedMComdats.count(C))
+ return false;
+ return true;
+ });
promoteInternals(*MergedM, M, ModuleId);
promoteInternals(M, *MergedM, ModuleId);
simplifyExternals(*MergedM);
- SmallVector<char, 0> Buffer;
- BitcodeWriter W(Buffer);
// FIXME: Try to re-use BSI and PFI from the original module here.
ModuleSummaryIndex Index = buildModuleSummaryIndex(M, nullptr, nullptr);
- W.writeModule(&M, /*ShouldPreserveUseListOrder=*/false, &Index,
- /*GenerateHash=*/true);
- W.writeModule(MergedM.get());
+ SmallVector<char, 0> Buffer;
+ BitcodeWriter W(Buffer);
+ // Save the module hash produced for the full bitcode, which will
+ // be used in the backends, and use that in the minimized bitcode
+ // produced for the full link.
+ ModuleHash ModHash = {{0}};
+ W.writeModule(&M, /*ShouldPreserveUseListOrder=*/false, &Index,
+ /*GenerateHash=*/true, &ModHash);
+ W.writeModule(MergedM.get());
OS << Buffer;
+
+ // If a minimized bitcode module was requested for the thin link,
+ // strip the debug info (the merged module was already stripped above)
+ // and write it to the given OS.
+ if (ThinLinkOS) {
+ Buffer.clear();
+ BitcodeWriter W2(Buffer);
+ StripDebugInfo(M);
+ W2.writeModule(&M, /*ShouldPreserveUseListOrder=*/false, &Index,
+ /*GenerateHash=*/false, &ModHash);
+ W2.writeModule(MergedM.get());
+ *ThinLinkOS << Buffer;
+ }
}
// Returns whether this module needs to be split because it uses type metadata.
@@ -292,28 +391,45 @@ bool requiresSplit(Module &M) {
return false;
}
-void writeThinLTOBitcode(raw_ostream &OS, Module &M,
- const ModuleSummaryIndex *Index) {
+void writeThinLTOBitcode(raw_ostream &OS, raw_ostream *ThinLinkOS,
+ function_ref<AAResults &(Function &)> AARGetter,
+ Module &M, const ModuleSummaryIndex *Index) {
// See if this module has any type metadata. If so, we need to split it.
if (requiresSplit(M))
- return splitAndWriteThinLTOBitcode(OS, M);
+ return splitAndWriteThinLTOBitcode(OS, ThinLinkOS, AARGetter, M);
// Otherwise we can just write it out as a regular module.
+
+ // Save the module hash produced for the full bitcode, which will
+ // be used in the backends, and use that in the minimized bitcode
+ // produced for the full link.
+ ModuleHash ModHash = {{0}};
WriteBitcodeToFile(&M, OS, /*ShouldPreserveUseListOrder=*/false, Index,
- /*GenerateHash=*/true);
+ /*GenerateHash=*/true, &ModHash);
+ // If a minimized bitcode module was requested for the thin link,
+ // strip the debug info and write it to the given OS.
+ if (ThinLinkOS) {
+ StripDebugInfo(M);
+ WriteBitcodeToFile(&M, *ThinLinkOS, /*ShouldPreserveUseListOrder=*/false,
+ Index,
+ /*GenerateHash=*/false, &ModHash);
+ }
}
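A hedged sketch of how a driver might wire the two output streams (the file names, wrapper function, and error handling are assumptions, not part of this patch):

  // Illustrative only: emit the full module to a.bc and, in the same run,
  // a minimized module for the thin link to a.thinlink.bc.
  void writeBoth(Module &M) {
    std::error_code EC;
    raw_fd_ostream OS("a.bc", EC, sys::fs::F_None);
    raw_fd_ostream ThinLinkOS("a.thinlink.bc", EC, sys::fs::F_None);
    legacy::PassManager PM;
    PM.add(createWriteThinLTOBitcodePass(OS, &ThinLinkOS));
    PM.run(M);
  }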
class WriteThinLTOBitcode : public ModulePass {
raw_ostream &OS; // raw_ostream to print on
+ // The output stream on which to emit a minimized module for use
+ // just in the thin link, if requested.
+ raw_ostream *ThinLinkOS;
public:
static char ID; // Pass identification, replacement for typeid
- WriteThinLTOBitcode() : ModulePass(ID), OS(dbgs()) {
+ WriteThinLTOBitcode() : ModulePass(ID), OS(dbgs()), ThinLinkOS(nullptr) {
initializeWriteThinLTOBitcodePass(*PassRegistry::getPassRegistry());
}
- explicit WriteThinLTOBitcode(raw_ostream &o)
- : ModulePass(ID), OS(o) {
+ explicit WriteThinLTOBitcode(raw_ostream &o, raw_ostream *ThinLinkOS)
+ : ModulePass(ID), OS(o), ThinLinkOS(ThinLinkOS) {
initializeWriteThinLTOBitcodePass(*PassRegistry::getPassRegistry());
}
@@ -322,12 +438,14 @@ public:
bool runOnModule(Module &M) override {
const ModuleSummaryIndex *Index =
&(getAnalysis<ModuleSummaryIndexWrapperPass>().getIndex());
- writeThinLTOBitcode(OS, M, Index);
+ writeThinLTOBitcode(OS, ThinLinkOS, LegacyAARGetter(*this), M, Index);
return true;
}
void getAnalysisUsage(AnalysisUsage &AU) const override {
AU.setPreservesAll();
+ AU.addRequired<AssumptionCacheTracker>();
AU.addRequired<ModuleSummaryIndexWrapperPass>();
+ AU.addRequired<TargetLibraryInfoWrapperPass>();
}
};
} // anonymous namespace
@@ -335,10 +453,13 @@ public:
char WriteThinLTOBitcode::ID = 0;
INITIALIZE_PASS_BEGIN(WriteThinLTOBitcode, "write-thinlto-bitcode",
"Write ThinLTO Bitcode", false, true)
+INITIALIZE_PASS_DEPENDENCY(AssumptionCacheTracker)
INITIALIZE_PASS_DEPENDENCY(ModuleSummaryIndexWrapperPass)
+INITIALIZE_PASS_DEPENDENCY(TargetLibraryInfoWrapperPass)
INITIALIZE_PASS_END(WriteThinLTOBitcode, "write-thinlto-bitcode",
"Write ThinLTO Bitcode", false, true)
-ModulePass *llvm::createWriteThinLTOBitcodePass(raw_ostream &Str) {
- return new WriteThinLTOBitcode(Str);
+ModulePass *llvm::createWriteThinLTOBitcodePass(raw_ostream &Str,
+ raw_ostream *ThinLinkOS) {
+ return new WriteThinLTOBitcode(Str, ThinLinkOS);
}
diff --git a/lib/Transforms/IPO/WholeProgramDevirt.cpp b/lib/Transforms/IPO/WholeProgramDevirt.cpp
index 844cc0f70eed..cb7d487b68b0 100644
--- a/lib/Transforms/IPO/WholeProgramDevirt.cpp
+++ b/lib/Transforms/IPO/WholeProgramDevirt.cpp
@@ -25,6 +25,20 @@
// returns 0, or a single vtable's function returns 1, replace each virtual
// call with a comparison of the vptr against that vtable's address.
//
+// This pass is intended to be used during the regular and thin LTO pipelines.
+// During regular LTO, the pass determines the best optimization for each
+// virtual call and applies the resolutions directly to virtual calls that are
+// eligible for virtual call optimization (i.e. calls that use either the
+// llvm.assume(llvm.type.test) or llvm.type.checked.load intrinsics). During
+// ThinLTO, the pass operates in two phases:
+// - Export phase: this is run during the thin link over a single merged module
+// that contains all vtables with !type metadata that participate in the link.
+// The pass computes a resolution for each virtual call and stores it in the
+// type identifier summary.
+// - Import phase: this is run during the thin backends over the individual
+// modules. The pass applies the resolutions previously computed during the
+// export phase to each eligible virtual call.
+//
//===----------------------------------------------------------------------===//
#include "llvm/Transforms/IPO/WholeProgramDevirt.h"
@@ -35,6 +49,8 @@
#include "llvm/ADT/iterator_range.h"
#include "llvm/ADT/MapVector.h"
#include "llvm/ADT/SmallVector.h"
+#include "llvm/Analysis/AliasAnalysis.h"
+#include "llvm/Analysis/BasicAliasAnalysis.h"
#include "llvm/Analysis/TypeMetadataUtils.h"
#include "llvm/IR/CallSite.h"
#include "llvm/IR/Constants.h"
@@ -54,12 +70,16 @@
#include "llvm/IR/LLVMContext.h"
#include "llvm/IR/Metadata.h"
#include "llvm/IR/Module.h"
+#include "llvm/IR/ModuleSummaryIndexYAML.h"
#include "llvm/Pass.h"
#include "llvm/PassRegistry.h"
#include "llvm/PassSupport.h"
#include "llvm/Support/Casting.h"
+#include "llvm/Support/Error.h"
+#include "llvm/Support/FileSystem.h"
#include "llvm/Support/MathExtras.h"
#include "llvm/Transforms/IPO.h"
+#include "llvm/Transforms/IPO/FunctionAttrs.h"
#include "llvm/Transforms/Utils/Evaluator.h"
#include <algorithm>
#include <cstddef>
@@ -72,6 +92,26 @@ using namespace wholeprogramdevirt;
#define DEBUG_TYPE "wholeprogramdevirt"
+static cl::opt<PassSummaryAction> ClSummaryAction(
+ "wholeprogramdevirt-summary-action",
+ cl::desc("What to do with the summary when running this pass"),
+ cl::values(clEnumValN(PassSummaryAction::None, "none", "Do nothing"),
+ clEnumValN(PassSummaryAction::Import, "import",
+ "Import typeid resolutions from summary and globals"),
+ clEnumValN(PassSummaryAction::Export, "export",
+ "Export typeid resolutions to summary and globals")),
+ cl::Hidden);
+
+static cl::opt<std::string> ClReadSummary(
+ "wholeprogramdevirt-read-summary",
+ cl::desc("Read summary from given YAML file before running pass"),
+ cl::Hidden);
+
+static cl::opt<std::string> ClWriteSummary(
+ "wholeprogramdevirt-write-summary",
+ cl::desc("Write summary to given YAML file after running pass"),
+ cl::Hidden);
+
// Find the minimum offset that we may store a value of size Size bits at. If
// IsAfter is set, look for an offset before the object, otherwise look for an
// offset after the object.
@@ -259,15 +299,92 @@ struct VirtualCallSite {
}
};
+// Call site information collected for a specific VTableSlot and possibly a list
+// of constant integer arguments. The grouping by arguments is handled by the
+// VTableSlotInfo class.
+struct CallSiteInfo {
+ /// The set of call sites for this slot. Used during regular LTO and the
+ /// import phase of ThinLTO (as well as the export phase of ThinLTO for any
+ /// call sites that appear in the merged module itself); in each of these
+ /// cases we are directly operating on the call sites at the IR level.
+ std::vector<VirtualCallSite> CallSites;
+
+ // These fields are used during the export phase of ThinLTO and reflect
+ // information collected from function summaries.
+
+ /// Whether any function summary contains an llvm.assume(llvm.type.test) for
+ /// this slot.
+ bool SummaryHasTypeTestAssumeUsers;
+
+ /// CFI-specific: a vector containing the list of function summaries that use
+ /// the llvm.type.checked.load intrinsic and therefore will require
+ /// resolutions for llvm.type.test in order to implement CFI checks if
+ /// devirtualization was unsuccessful. If devirtualization was successful, the
+ /// pass will clear this vector by calling markDevirt(). If at the end of the
+ /// pass the vector is non-empty, we will need to add a use of llvm.type.test
+ /// to each of the function summaries in the vector.
+ std::vector<FunctionSummary *> SummaryTypeCheckedLoadUsers;
+
+ bool isExported() const {
+ return SummaryHasTypeTestAssumeUsers ||
+ !SummaryTypeCheckedLoadUsers.empty();
+ }
+
+ /// As explained in the comment for SummaryTypeCheckedLoadUsers.
+ void markDevirt() { SummaryTypeCheckedLoadUsers.clear(); }
+};
+
+// Call site information collected for a specific VTableSlot.
+struct VTableSlotInfo {
+ // The set of call sites which do not have all constant integer arguments
+ // (excluding "this").
+ CallSiteInfo CSInfo;
+
+ // The set of call sites with all constant integer arguments (excluding
+ // "this"), grouped by argument list.
+ std::map<std::vector<uint64_t>, CallSiteInfo> ConstCSInfo;
+
+ void addCallSite(Value *VTable, CallSite CS, unsigned *NumUnsafeUses);
+
+private:
+ CallSiteInfo &findCallSiteInfo(CallSite CS);
+};
+
+CallSiteInfo &VTableSlotInfo::findCallSiteInfo(CallSite CS) {
+ std::vector<uint64_t> Args;
+ auto *CI = dyn_cast<IntegerType>(CS.getType());
+ if (!CI || CI->getBitWidth() > 64 || CS.arg_empty())
+ return CSInfo;
+ for (auto &&Arg : make_range(CS.arg_begin() + 1, CS.arg_end())) {
+ auto *CI = dyn_cast<ConstantInt>(Arg);
+ if (!CI || CI->getBitWidth() > 64)
+ return CSInfo;
+ Args.push_back(CI->getZExtValue());
+ }
+ return ConstCSInfo[Args];
+}
+
+void VTableSlotInfo::addCallSite(Value *VTable, CallSite CS,
+ unsigned *NumUnsafeUses) {
+ findCallSiteInfo(CS).CallSites.push_back({VTable, CS, NumUnsafeUses});
+}
+
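To illustrate the bucketing (hypothetical IR; %fp is a loaded vtable entry): a call returning an integer of width <=64 whose non-'this' arguments are all such integer constants is keyed by those values, while anything else lands in the fallback bucket.

  %r1 = call i32 %fp(i8* %obj, i32 1, i32 2)  ; -> ConstCSInfo[{1, 2}]
  %r2 = call i32 %fp(i8* %obj, i32 %n)        ; -> CSInfo (non-constant arg)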
struct DevirtModule {
Module &M;
+ function_ref<AAResults &(Function &)> AARGetter;
+
+ ModuleSummaryIndex *ExportSummary;
+ const ModuleSummaryIndex *ImportSummary;
+
IntegerType *Int8Ty;
PointerType *Int8PtrTy;
IntegerType *Int32Ty;
+ IntegerType *Int64Ty;
+ IntegerType *IntPtrTy;
bool RemarksEnabled;
- MapVector<VTableSlot, std::vector<VirtualCallSite>> CallSlots;
+ MapVector<VTableSlot, VTableSlotInfo> CallSlots;
// This map keeps track of the number of "unsafe" uses of a loaded function
// pointer. The key is the associated llvm.type.test intrinsic call generated
@@ -279,11 +396,18 @@ struct DevirtModule {
// true.
std::map<CallInst *, unsigned> NumUnsafeUsesForTypeTest;
- DevirtModule(Module &M)
- : M(M), Int8Ty(Type::getInt8Ty(M.getContext())),
+ DevirtModule(Module &M, function_ref<AAResults &(Function &)> AARGetter,
+ ModuleSummaryIndex *ExportSummary,
+ const ModuleSummaryIndex *ImportSummary)
+ : M(M), AARGetter(AARGetter), ExportSummary(ExportSummary),
+ ImportSummary(ImportSummary), Int8Ty(Type::getInt8Ty(M.getContext())),
Int8PtrTy(Type::getInt8PtrTy(M.getContext())),
Int32Ty(Type::getInt32Ty(M.getContext())),
- RemarksEnabled(areRemarksEnabled()) {}
+ Int64Ty(Type::getInt64Ty(M.getContext())),
+ IntPtrTy(M.getDataLayout().getIntPtrType(M.getContext(), 0)),
+ RemarksEnabled(areRemarksEnabled()) {
+ assert(!(ExportSummary && ImportSummary));
+ }
bool areRemarksEnabled();
@@ -298,57 +422,169 @@ struct DevirtModule {
tryFindVirtualCallTargets(std::vector<VirtualCallTarget> &TargetsForSlot,
const std::set<TypeMemberInfo> &TypeMemberInfos,
uint64_t ByteOffset);
+
+ void applySingleImplDevirt(VTableSlotInfo &SlotInfo, Constant *TheFn,
+ bool &IsExported);
bool trySingleImplDevirt(MutableArrayRef<VirtualCallTarget> TargetsForSlot,
- MutableArrayRef<VirtualCallSite> CallSites);
+ VTableSlotInfo &SlotInfo,
+ WholeProgramDevirtResolution *Res);
+
bool tryEvaluateFunctionsWithArgs(
MutableArrayRef<VirtualCallTarget> TargetsForSlot,
- ArrayRef<ConstantInt *> Args);
- bool tryUniformRetValOpt(IntegerType *RetType,
- MutableArrayRef<VirtualCallTarget> TargetsForSlot,
- MutableArrayRef<VirtualCallSite> CallSites);
+ ArrayRef<uint64_t> Args);
+
+ void applyUniformRetValOpt(CallSiteInfo &CSInfo, StringRef FnName,
+ uint64_t TheRetVal);
+ bool tryUniformRetValOpt(MutableArrayRef<VirtualCallTarget> TargetsForSlot,
+ CallSiteInfo &CSInfo,
+ WholeProgramDevirtResolution::ByArg *Res);
+
+ // Returns the global symbol name that is used to export information about the
+ // given vtable slot and list of arguments.
+ std::string getGlobalName(VTableSlot Slot, ArrayRef<uint64_t> Args,
+ StringRef Name);
+
+ // This function is called during the export phase to create a symbol
+ // definition containing information about the given vtable slot and list of
+ // arguments.
+ void exportGlobal(VTableSlot Slot, ArrayRef<uint64_t> Args, StringRef Name,
+ Constant *C);
+
+ // This function is called during the import phase to create a reference to
+ // the symbol definition created during the export phase.
+ Constant *importGlobal(VTableSlot Slot, ArrayRef<uint64_t> Args,
+ StringRef Name, unsigned AbsWidth = 0);
+
+ void applyUniqueRetValOpt(CallSiteInfo &CSInfo, StringRef FnName, bool IsOne,
+ Constant *UniqueMemberAddr);
bool tryUniqueRetValOpt(unsigned BitWidth,
MutableArrayRef<VirtualCallTarget> TargetsForSlot,
- MutableArrayRef<VirtualCallSite> CallSites);
+ CallSiteInfo &CSInfo,
+ WholeProgramDevirtResolution::ByArg *Res,
+ VTableSlot Slot, ArrayRef<uint64_t> Args);
+
+ void applyVirtualConstProp(CallSiteInfo &CSInfo, StringRef FnName,
+ Constant *Byte, Constant *Bit);
bool tryVirtualConstProp(MutableArrayRef<VirtualCallTarget> TargetsForSlot,
- ArrayRef<VirtualCallSite> CallSites);
+ VTableSlotInfo &SlotInfo,
+ WholeProgramDevirtResolution *Res, VTableSlot Slot);
void rebuildGlobal(VTableBits &B);
+ // Apply the summary resolution for Slot to all virtual calls in SlotInfo.
+ void importResolution(VTableSlot Slot, VTableSlotInfo &SlotInfo);
+
+ // If we were able to eliminate all unsafe uses for a type checked load,
+ // eliminate the associated type tests by replacing them with true.
+ void removeRedundantTypeTests();
+
bool run();
+
+ // Lower the module using the action and summary passed as command line
+ // arguments. For testing purposes only.
+ static bool runForTesting(Module &M,
+ function_ref<AAResults &(Function &)> AARGetter);
};
struct WholeProgramDevirt : public ModulePass {
static char ID;
- WholeProgramDevirt() : ModulePass(ID) {
+ bool UseCommandLine = false;
+
+ ModuleSummaryIndex *ExportSummary;
+ const ModuleSummaryIndex *ImportSummary;
+
+ WholeProgramDevirt() : ModulePass(ID), UseCommandLine(true) {
+ initializeWholeProgramDevirtPass(*PassRegistry::getPassRegistry());
+ }
+
+ WholeProgramDevirt(ModuleSummaryIndex *ExportSummary,
+ const ModuleSummaryIndex *ImportSummary)
+ : ModulePass(ID), ExportSummary(ExportSummary),
+ ImportSummary(ImportSummary) {
initializeWholeProgramDevirtPass(*PassRegistry::getPassRegistry());
}
bool runOnModule(Module &M) override {
if (skipModule(M))
return false;
+ if (UseCommandLine)
+ return DevirtModule::runForTesting(M, LegacyAARGetter(*this));
+ return DevirtModule(M, LegacyAARGetter(*this), ExportSummary, ImportSummary)
+ .run();
+ }
- return DevirtModule(M).run();
+ void getAnalysisUsage(AnalysisUsage &AU) const override {
+ AU.addRequired<AssumptionCacheTracker>();
+ AU.addRequired<TargetLibraryInfoWrapperPass>();
}
};
} // end anonymous namespace
-INITIALIZE_PASS(WholeProgramDevirt, "wholeprogramdevirt",
- "Whole program devirtualization", false, false)
+INITIALIZE_PASS_BEGIN(WholeProgramDevirt, "wholeprogramdevirt",
+ "Whole program devirtualization", false, false)
+INITIALIZE_PASS_DEPENDENCY(AssumptionCacheTracker)
+INITIALIZE_PASS_DEPENDENCY(TargetLibraryInfoWrapperPass)
+INITIALIZE_PASS_END(WholeProgramDevirt, "wholeprogramdevirt",
+ "Whole program devirtualization", false, false)
char WholeProgramDevirt::ID = 0;
-ModulePass *llvm::createWholeProgramDevirtPass() {
- return new WholeProgramDevirt;
+ModulePass *
+llvm::createWholeProgramDevirtPass(ModuleSummaryIndex *ExportSummary,
+ const ModuleSummaryIndex *ImportSummary) {
+ return new WholeProgramDevirt(ExportSummary, ImportSummary);
}
PreservedAnalyses WholeProgramDevirtPass::run(Module &M,
- ModuleAnalysisManager &) {
- if (!DevirtModule(M).run())
+ ModuleAnalysisManager &AM) {
+ auto &FAM = AM.getResult<FunctionAnalysisManagerModuleProxy>(M).getManager();
+ auto AARGetter = [&](Function &F) -> AAResults & {
+ return FAM.getResult<AAManager>(F);
+ };
+ if (!DevirtModule(M, AARGetter, nullptr, nullptr).run())
return PreservedAnalyses::all();
return PreservedAnalyses::none();
}
+bool DevirtModule::runForTesting(
+ Module &M, function_ref<AAResults &(Function &)> AARGetter) {
+ ModuleSummaryIndex Summary;
+
+ // Handle the command-line summary arguments. This code is for testing
+ // purposes only, so we handle errors directly.
+ if (!ClReadSummary.empty()) {
+ ExitOnError ExitOnErr("-wholeprogramdevirt-read-summary: " + ClReadSummary +
+ ": ");
+ auto ReadSummaryFile =
+ ExitOnErr(errorOrToExpected(MemoryBuffer::getFile(ClReadSummary)));
+
+ yaml::Input In(ReadSummaryFile->getBuffer());
+ In >> Summary;
+ ExitOnErr(errorCodeToError(In.error()));
+ }
+
+ bool Changed =
+ DevirtModule(
+ M, AARGetter,
+ ClSummaryAction == PassSummaryAction::Export ? &Summary : nullptr,
+ ClSummaryAction == PassSummaryAction::Import ? &Summary : nullptr)
+ .run();
+
+ if (!ClWriteSummary.empty()) {
+ ExitOnError ExitOnErr(
+ "-wholeprogramdevirt-write-summary: " + ClWriteSummary + ": ");
+ std::error_code EC;
+ raw_fd_ostream OS(ClWriteSummary, EC, sys::fs::F_Text);
+ ExitOnErr(errorCodeToError(EC));
+
+ yaml::Output Out(OS);
+ Out << Summary;
+ }
+
+ return Changed;
+}
+
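Concretely, the testing hooks above correspond to opt invocations along these lines (file names are illustrative):

  opt -wholeprogramdevirt -wholeprogramdevirt-summary-action=export \
      -wholeprogramdevirt-write-summary=summary.yaml merged.bc -o merged.out.bc
  opt -wholeprogramdevirt -wholeprogramdevirt-summary-action=import \
      -wholeprogramdevirt-read-summary=summary.yaml backend.bc -o backend.out.bc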
void DevirtModule::buildTypeIdentifierMap(
std::vector<VTableBits> &Bits,
DenseMap<Metadata *, std::set<TypeMemberInfo>> &TypeIdMap) {
@@ -443,9 +679,31 @@ bool DevirtModule::tryFindVirtualCallTargets(
return !TargetsForSlot.empty();
}
+void DevirtModule::applySingleImplDevirt(VTableSlotInfo &SlotInfo,
+ Constant *TheFn, bool &IsExported) {
+ auto Apply = [&](CallSiteInfo &CSInfo) {
+ for (auto &&VCallSite : CSInfo.CallSites) {
+ if (RemarksEnabled)
+ VCallSite.emitRemark("single-impl", TheFn->getName());
+ VCallSite.CS.setCalledFunction(ConstantExpr::getBitCast(
+ TheFn, VCallSite.CS.getCalledValue()->getType()));
+ // This use is no longer unsafe.
+ if (VCallSite.NumUnsafeUses)
+ --*VCallSite.NumUnsafeUses;
+ }
+ if (CSInfo.isExported()) {
+ IsExported = true;
+ CSInfo.markDevirt();
+ }
+ };
+ Apply(SlotInfo.CSInfo);
+ for (auto &P : SlotInfo.ConstCSInfo)
+ Apply(P.second);
+}
+
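Per call site the rewrite looks like this in IR (the callee and types are illustrative):

  ; before: indirect call through the loaded virtual function pointer
  %r = call i32 %fptr(i8* %obj)
  ; after: direct call, bitcast to the call site's function pointer type
  %r = call i32 bitcast (i32 (%struct.A*)* @_ZN1A1fEv to i32 (i8*)*)(i8* %obj)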
bool DevirtModule::trySingleImplDevirt(
MutableArrayRef<VirtualCallTarget> TargetsForSlot,
- MutableArrayRef<VirtualCallSite> CallSites) {
+ VTableSlotInfo &SlotInfo, WholeProgramDevirtResolution *Res) {
// See if the program contains a single implementation of this virtual
// function.
Function *TheFn = TargetsForSlot[0].Fn;
@@ -453,39 +711,51 @@ bool DevirtModule::trySingleImplDevirt(
if (TheFn != Target.Fn)
return false;
+ // If so, update each call site to call that implementation directly.
if (RemarksEnabled)
TargetsForSlot[0].WasDevirt = true;
- // If so, update each call site to call that implementation directly.
- for (auto &&VCallSite : CallSites) {
- if (RemarksEnabled)
- VCallSite.emitRemark("single-impl", TheFn->getName());
- VCallSite.CS.setCalledFunction(ConstantExpr::getBitCast(
- TheFn, VCallSite.CS.getCalledValue()->getType()));
- // This use is no longer unsafe.
- if (VCallSite.NumUnsafeUses)
- --*VCallSite.NumUnsafeUses;
+
+ bool IsExported = false;
+ applySingleImplDevirt(SlotInfo, TheFn, IsExported);
+ if (!IsExported)
+ return false;
+
+ // If the only implementation has local linkage, we must promote to external
+ // to make it visible to thin LTO objects. We can only get here during the
+ // ThinLTO export phase.
+ if (TheFn->hasLocalLinkage()) {
+ TheFn->setLinkage(GlobalValue::ExternalLinkage);
+ TheFn->setVisibility(GlobalValue::HiddenVisibility);
+ TheFn->setName(TheFn->getName() + "$merged");
}
+
+ Res->TheKind = WholeProgramDevirtResolution::SingleImpl;
+ Res->SingleImplName = TheFn->getName();
+
return true;
}
bool DevirtModule::tryEvaluateFunctionsWithArgs(
MutableArrayRef<VirtualCallTarget> TargetsForSlot,
- ArrayRef<ConstantInt *> Args) {
+ ArrayRef<uint64_t> Args) {
// Evaluate each function and store the result in each target's RetVal
// field.
for (VirtualCallTarget &Target : TargetsForSlot) {
if (Target.Fn->arg_size() != Args.size() + 1)
return false;
- for (unsigned I = 0; I != Args.size(); ++I)
- if (Target.Fn->getFunctionType()->getParamType(I + 1) !=
- Args[I]->getType())
- return false;
Evaluator Eval(M.getDataLayout(), nullptr);
SmallVector<Constant *, 2> EvalArgs;
EvalArgs.push_back(
Constant::getNullValue(Target.Fn->getFunctionType()->getParamType(0)));
- EvalArgs.insert(EvalArgs.end(), Args.begin(), Args.end());
+ for (unsigned I = 0; I != Args.size(); ++I) {
+ auto *ArgTy = dyn_cast<IntegerType>(
+ Target.Fn->getFunctionType()->getParamType(I + 1));
+ if (!ArgTy)
+ return false;
+ EvalArgs.push_back(ConstantInt::get(ArgTy, Args[I]));
+ }
+
Constant *RetVal;
if (!Eval.EvaluateFunction(Target.Fn, RetVal, EvalArgs) ||
!isa<ConstantInt>(RetVal))
@@ -495,9 +765,18 @@ bool DevirtModule::tryEvaluateFunctionsWithArgs(
return true;
}
+void DevirtModule::applyUniformRetValOpt(CallSiteInfo &CSInfo, StringRef FnName,
+ uint64_t TheRetVal) {
+ for (auto Call : CSInfo.CallSites)
+ Call.replaceAndErase(
+ "uniform-ret-val", FnName, RemarksEnabled,
+ ConstantInt::get(cast<IntegerType>(Call.CS.getType()), TheRetVal));
+ CSInfo.markDevirt();
+}
+
bool DevirtModule::tryUniformRetValOpt(
- IntegerType *RetType, MutableArrayRef<VirtualCallTarget> TargetsForSlot,
- MutableArrayRef<VirtualCallSite> CallSites) {
+ MutableArrayRef<VirtualCallTarget> TargetsForSlot, CallSiteInfo &CSInfo,
+ WholeProgramDevirtResolution::ByArg *Res) {
// Uniform return value optimization. If all functions return the same
// constant, replace all calls with that constant.
uint64_t TheRetVal = TargetsForSlot[0].RetVal;
@@ -505,19 +784,77 @@ bool DevirtModule::tryUniformRetValOpt(
if (Target.RetVal != TheRetVal)
return false;
- auto TheRetValConst = ConstantInt::get(RetType, TheRetVal);
- for (auto Call : CallSites)
- Call.replaceAndErase("uniform-ret-val", TargetsForSlot[0].Fn->getName(),
- RemarksEnabled, TheRetValConst);
+ if (CSInfo.isExported()) {
+ Res->TheKind = WholeProgramDevirtResolution::ByArg::UniformRetVal;
+ Res->Info = TheRetVal;
+ }
+
+ applyUniformRetValOpt(CSInfo, TargetsForSlot[0].Fn->getName(), TheRetVal);
if (RemarksEnabled)
for (auto &&Target : TargetsForSlot)
Target.WasDevirt = true;
return true;
}
+std::string DevirtModule::getGlobalName(VTableSlot Slot,
+ ArrayRef<uint64_t> Args,
+ StringRef Name) {
+ std::string FullName = "__typeid_";
+ raw_string_ostream OS(FullName);
+ OS << cast<MDString>(Slot.TypeID)->getString() << '_' << Slot.ByteOffset;
+ for (uint64_t Arg : Args)
+ OS << '_' << Arg;
+ OS << '_' << Name;
+ return OS.str();
+}
+
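For example (illustrative values), the slot {type id "_ZTS1A", byte offset 8} with constant argument list {1} and Name "byte" yields the symbol "__typeid__ZTS1A_8_1_byte".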
+void DevirtModule::exportGlobal(VTableSlot Slot, ArrayRef<uint64_t> Args,
+ StringRef Name, Constant *C) {
+ GlobalAlias *GA = GlobalAlias::create(Int8Ty, 0, GlobalValue::ExternalLinkage,
+ getGlobalName(Slot, Args, Name), C, &M);
+ GA->setVisibility(GlobalValue::HiddenVisibility);
+}
+
+Constant *DevirtModule::importGlobal(VTableSlot Slot, ArrayRef<uint64_t> Args,
+ StringRef Name, unsigned AbsWidth) {
+ Constant *C = M.getOrInsertGlobal(getGlobalName(Slot, Args, Name), Int8Ty);
+ auto *GV = dyn_cast<GlobalVariable>(C);
+ // We only need to set metadata if the global is newly created, in which
+ // case it would not have hidden visibility.
+ if (!GV || GV->getVisibility() == GlobalValue::HiddenVisibility)
+ return C;
+
+ GV->setVisibility(GlobalValue::HiddenVisibility);
+ auto SetAbsRange = [&](uint64_t Min, uint64_t Max) {
+ auto *MinC = ConstantAsMetadata::get(ConstantInt::get(IntPtrTy, Min));
+ auto *MaxC = ConstantAsMetadata::get(ConstantInt::get(IntPtrTy, Max));
+ GV->setMetadata(LLVMContext::MD_absolute_symbol,
+ MDNode::get(M.getContext(), {MinC, MaxC}));
+ };
+ if (AbsWidth == IntPtrTy->getBitWidth())
+ SetAbsRange(~0ull, ~0ull); // Full set.
+ else if (AbsWidth)
+ SetAbsRange(0, 1ull << AbsWidth);
+ return GV;
+}
+
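On the import side this produces declarations along these lines (illustrative; AbsWidth == 8 encodes the half-open range [0, 256), while AbsWidth equal to the pointer width encodes the full set):

  @__typeid__ZTS1A_8_1_bit = external hidden global i8, !absolute_symbol !0
  !0 = !{i64 0, i64 256}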
+void DevirtModule::applyUniqueRetValOpt(CallSiteInfo &CSInfo, StringRef FnName,
+ bool IsOne,
+ Constant *UniqueMemberAddr) {
+ for (auto &&Call : CSInfo.CallSites) {
+ IRBuilder<> B(Call.CS.getInstruction());
+ Value *Cmp = B.CreateICmp(IsOne ? ICmpInst::ICMP_EQ : ICmpInst::ICMP_NE,
+ Call.VTable, UniqueMemberAddr);
+ Cmp = B.CreateZExt(Cmp, Call.CS->getType());
+ Call.replaceAndErase("unique-ret-val", FnName, RemarksEnabled, Cmp);
+ }
+ CSInfo.markDevirt();
+}
+
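In IR terms the rewrite is roughly the following (vtable symbol and offset are hypothetical; for IsOne the predicate is eq, otherwise ne):

  %cmp = icmp eq i8* %vtable, getelementptr (i8, i8* bitcast ([3 x i8*]* @vtB to i8*), i64 16)
  %r = zext i1 %cmp to i32    ; widened to the call's return type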
bool DevirtModule::tryUniqueRetValOpt(
unsigned BitWidth, MutableArrayRef<VirtualCallTarget> TargetsForSlot,
- MutableArrayRef<VirtualCallSite> CallSites) {
+ CallSiteInfo &CSInfo, WholeProgramDevirtResolution::ByArg *Res,
+ VTableSlot Slot, ArrayRef<uint64_t> Args) {
// IsOne controls whether we look for a 0 or a 1.
auto tryUniqueRetValOptFor = [&](bool IsOne) {
const TypeMemberInfo *UniqueMember = nullptr;
@@ -533,16 +870,23 @@ bool DevirtModule::tryUniqueRetValOpt(
// checked for a uniform return value in tryUniformRetValOpt.
assert(UniqueMember);
- // Replace each call with the comparison.
- for (auto &&Call : CallSites) {
- IRBuilder<> B(Call.CS.getInstruction());
- Value *OneAddr = B.CreateBitCast(UniqueMember->Bits->GV, Int8PtrTy);
- OneAddr = B.CreateConstGEP1_64(OneAddr, UniqueMember->Offset);
- Value *Cmp = B.CreateICmp(IsOne ? ICmpInst::ICMP_EQ : ICmpInst::ICMP_NE,
- Call.VTable, OneAddr);
- Call.replaceAndErase("unique-ret-val", TargetsForSlot[0].Fn->getName(),
- RemarksEnabled, Cmp);
+ Constant *UniqueMemberAddr =
+ ConstantExpr::getBitCast(UniqueMember->Bits->GV, Int8PtrTy);
+ UniqueMemberAddr = ConstantExpr::getGetElementPtr(
+ Int8Ty, UniqueMemberAddr,
+ ConstantInt::get(Int64Ty, UniqueMember->Offset));
+
+ if (CSInfo.isExported()) {
+ Res->TheKind = WholeProgramDevirtResolution::ByArg::UniqueRetVal;
+ Res->Info = IsOne;
+
+ exportGlobal(Slot, Args, "unique_member", UniqueMemberAddr);
}
+
+ // Replace each call with the comparison.
+ applyUniqueRetValOpt(CSInfo, TargetsForSlot[0].Fn->getName(), IsOne,
+ UniqueMemberAddr);
+
// Update devirtualization statistics for targets.
if (RemarksEnabled)
for (auto &&Target : TargetsForSlot)
@@ -560,9 +904,30 @@ bool DevirtModule::tryUniqueRetValOpt(
return false;
}
+void DevirtModule::applyVirtualConstProp(CallSiteInfo &CSInfo, StringRef FnName,
+ Constant *Byte, Constant *Bit) {
+ for (auto Call : CSInfo.CallSites) {
+ auto *RetType = cast<IntegerType>(Call.CS.getType());
+ IRBuilder<> B(Call.CS.getInstruction());
+ Value *Addr = B.CreateGEP(Int8Ty, Call.VTable, Byte);
+ if (RetType->getBitWidth() == 1) {
+ Value *Bits = B.CreateLoad(Addr);
+ Value *BitsAndBit = B.CreateAnd(Bits, Bit);
+ auto IsBitSet = B.CreateICmpNE(BitsAndBit, ConstantInt::get(Int8Ty, 0));
+ Call.replaceAndErase("virtual-const-prop-1-bit", FnName, RemarksEnabled,
+ IsBitSet);
+ } else {
+ Value *ValAddr = B.CreateBitCast(Addr, RetType->getPointerTo());
+ Value *Val = B.CreateLoad(RetType, ValAddr);
+ Call.replaceAndErase("virtual-const-prop", FnName, RemarksEnabled, Val);
+ }
+ }
+ CSInfo.markDevirt();
+}
+
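Illustratively, a 1-bit return becomes a bit test against the byte at vtable+Byte, and a wider integer return becomes a direct load (offset and bit values hypothetical):

  %addr = getelementptr i8, i8* %vtable, i32 -17
  %bits = load i8, i8* %addr
  %and  = and i8 %bits, 2         ; Bit == 1 << OffsetBit
  %r    = icmp ne i8 %and, 0      ; the devirtualized i1 result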
bool DevirtModule::tryVirtualConstProp(
- MutableArrayRef<VirtualCallTarget> TargetsForSlot,
- ArrayRef<VirtualCallSite> CallSites) {
+ MutableArrayRef<VirtualCallTarget> TargetsForSlot, VTableSlotInfo &SlotInfo,
+ WholeProgramDevirtResolution *Res, VTableSlot Slot) {
// This only works if the function returns an integer.
auto RetType = dyn_cast<IntegerType>(TargetsForSlot[0].Fn->getReturnType());
if (!RetType)
@@ -571,55 +936,38 @@ bool DevirtModule::tryVirtualConstProp(
if (BitWidth > 64)
return false;
- // Make sure that each function does not access memory, takes at least one
- // argument, does not use its first argument (which we assume is 'this'),
- // and has the same return type.
+ // Make sure that each function is defined, does not access memory, takes at
+ // least one argument, does not use its first argument (which we assume is
+ // 'this'), and has the same return type.
+ //
+ // Note that we test whether this copy of the function is readnone, rather
+ // than testing function attributes, which must hold for any copy of the
+ // function, even a less optimized version substituted at link time. This is
+ // sound because the virtual constant propagation optimizations effectively
+ // inline all implementations of the virtual function into each call site,
+ // rather than using function attributes to perform local optimization.
for (VirtualCallTarget &Target : TargetsForSlot) {
- if (!Target.Fn->doesNotAccessMemory() || Target.Fn->arg_empty() ||
- !Target.Fn->arg_begin()->use_empty() ||
+ if (Target.Fn->isDeclaration() ||
+ computeFunctionBodyMemoryAccess(*Target.Fn, AARGetter(*Target.Fn)) !=
+ MAK_ReadNone ||
+ Target.Fn->arg_empty() || !Target.Fn->arg_begin()->use_empty() ||
Target.Fn->getReturnType() != RetType)
return false;
}
- // Group call sites by the list of constant arguments they pass.
- // The comparator ensures deterministic ordering.
- struct ByAPIntValue {
- bool operator()(const std::vector<ConstantInt *> &A,
- const std::vector<ConstantInt *> &B) const {
- return std::lexicographical_compare(
- A.begin(), A.end(), B.begin(), B.end(),
- [](ConstantInt *AI, ConstantInt *BI) {
- return AI->getValue().ult(BI->getValue());
- });
- }
- };
- std::map<std::vector<ConstantInt *>, std::vector<VirtualCallSite>,
- ByAPIntValue>
- VCallSitesByConstantArg;
- for (auto &&VCallSite : CallSites) {
- std::vector<ConstantInt *> Args;
- if (VCallSite.CS.getType() != RetType)
- continue;
- for (auto &&Arg :
- make_range(VCallSite.CS.arg_begin() + 1, VCallSite.CS.arg_end())) {
- if (!isa<ConstantInt>(Arg))
- break;
- Args.push_back(cast<ConstantInt>(&Arg));
- }
- if (Args.size() + 1 != VCallSite.CS.arg_size())
- continue;
-
- VCallSitesByConstantArg[Args].push_back(VCallSite);
- }
-
- for (auto &&CSByConstantArg : VCallSitesByConstantArg) {
+ for (auto &&CSByConstantArg : SlotInfo.ConstCSInfo) {
if (!tryEvaluateFunctionsWithArgs(TargetsForSlot, CSByConstantArg.first))
continue;
- if (tryUniformRetValOpt(RetType, TargetsForSlot, CSByConstantArg.second))
+ WholeProgramDevirtResolution::ByArg *ResByArg = nullptr;
+ if (Res)
+ ResByArg = &Res->ResByArg[CSByConstantArg.first];
+
+ if (tryUniformRetValOpt(TargetsForSlot, CSByConstantArg.second, ResByArg))
continue;
- if (tryUniqueRetValOpt(BitWidth, TargetsForSlot, CSByConstantArg.second))
+ if (tryUniqueRetValOpt(BitWidth, TargetsForSlot, CSByConstantArg.second,
+ ResByArg, Slot, CSByConstantArg.first))
continue;
// Find an allocation offset in bits in all vtables associated with the
@@ -659,26 +1007,20 @@ bool DevirtModule::tryVirtualConstProp(
for (auto &&Target : TargetsForSlot)
Target.WasDevirt = true;
- // Rewrite each call to a load from OffsetByte/OffsetBit.
- for (auto Call : CSByConstantArg.second) {
- IRBuilder<> B(Call.CS.getInstruction());
- Value *Addr = B.CreateConstGEP1_64(Call.VTable, OffsetByte);
- if (BitWidth == 1) {
- Value *Bits = B.CreateLoad(Addr);
- Value *Bit = ConstantInt::get(Int8Ty, 1ULL << OffsetBit);
- Value *BitsAndBit = B.CreateAnd(Bits, Bit);
- auto IsBitSet = B.CreateICmpNE(BitsAndBit, ConstantInt::get(Int8Ty, 0));
- Call.replaceAndErase("virtual-const-prop-1-bit",
- TargetsForSlot[0].Fn->getName(),
- RemarksEnabled, IsBitSet);
- } else {
- Value *ValAddr = B.CreateBitCast(Addr, RetType->getPointerTo());
- Value *Val = B.CreateLoad(RetType, ValAddr);
- Call.replaceAndErase("virtual-const-prop",
- TargetsForSlot[0].Fn->getName(),
- RemarksEnabled, Val);
- }
+ Constant *ByteConst = ConstantInt::get(Int32Ty, OffsetByte);
+ Constant *BitConst = ConstantInt::get(Int8Ty, 1ULL << OffsetBit);
+
+ if (CSByConstantArg.second.isExported()) {
+ ResByArg->TheKind = WholeProgramDevirtResolution::ByArg::VirtualConstProp;
+ exportGlobal(Slot, CSByConstantArg.first, "byte",
+ ConstantExpr::getIntToPtr(ByteConst, Int8PtrTy));
+ exportGlobal(Slot, CSByConstantArg.first, "bit",
+ ConstantExpr::getIntToPtr(BitConst, Int8PtrTy));
}
+
+ // Rewrite each call to a load from OffsetByte/OffsetBit.
+ applyVirtualConstProp(CSByConstantArg.second,
+ TargetsForSlot[0].Fn->getName(), ByteConst, BitConst);
}
return true;
}
@@ -733,7 +1075,11 @@ bool DevirtModule::areRemarksEnabled() {
if (FL.empty())
return false;
const Function &Fn = FL.front();
- auto DI = OptimizationRemark(DEBUG_TYPE, Fn, DebugLoc(), "");
+
+ const auto &BBL = Fn.getBasicBlockList();
+ if (BBL.empty())
+ return false;
+ auto DI = OptimizationRemark(DEBUG_TYPE, "", DebugLoc(), &BBL.front());
return DI.isEnabled();
}
@@ -766,8 +1112,8 @@ void DevirtModule::scanTypeTestUsers(Function *TypeTestFunc,
Value *Ptr = CI->getArgOperand(0)->stripPointerCasts();
if (SeenPtrs.insert(Ptr).second) {
for (DevirtCallSite Call : DevirtCalls) {
- CallSlots[{TypeId, Call.Offset}].push_back(
- {CI->getArgOperand(0), Call.CS, nullptr});
+ CallSlots[{TypeId, Call.Offset}].addCallSite(CI->getArgOperand(0),
+ Call.CS, nullptr);
}
}
}
@@ -853,14 +1199,79 @@ void DevirtModule::scanTypeCheckedLoadUsers(Function *TypeCheckedLoadFunc) {
if (HasNonCallUses)
++NumUnsafeUses;
for (DevirtCallSite Call : DevirtCalls) {
- CallSlots[{TypeId, Call.Offset}].push_back(
- {Ptr, Call.CS, &NumUnsafeUses});
+ CallSlots[{TypeId, Call.Offset}].addCallSite(Ptr, Call.CS,
+ &NumUnsafeUses);
}
CI->eraseFromParent();
}
}
+void DevirtModule::importResolution(VTableSlot Slot, VTableSlotInfo &SlotInfo) {
+ const TypeIdSummary *TidSummary =
+ ImportSummary->getTypeIdSummary(cast<MDString>(Slot.TypeID)->getString());
+ if (!TidSummary)
+ return;
+ auto ResI = TidSummary->WPDRes.find(Slot.ByteOffset);
+ if (ResI == TidSummary->WPDRes.end())
+ return;
+ const WholeProgramDevirtResolution &Res = ResI->second;
+
+ if (Res.TheKind == WholeProgramDevirtResolution::SingleImpl) {
+ // The type of the function in the declaration is irrelevant because every
+ // call site will cast it to the correct type.
+ auto *SingleImpl = M.getOrInsertFunction(
+ Res.SingleImplName, Type::getVoidTy(M.getContext()));
+
+ // This is the import phase so we should not be exporting anything.
+ bool IsExported = false;
+ applySingleImplDevirt(SlotInfo, SingleImpl, IsExported);
+ assert(!IsExported);
+ }
+
+ for (auto &CSByConstantArg : SlotInfo.ConstCSInfo) {
+ auto I = Res.ResByArg.find(CSByConstantArg.first);
+ if (I == Res.ResByArg.end())
+ continue;
+ auto &ResByArg = I->second;
+ // FIXME: We should figure out what to do about the "function name" argument
+ // to the apply* functions, as the function names are unavailable during the
+ // importing phase. For now we just pass the empty string. This does not
+ // impact correctness because the function names are just used for remarks.
+ switch (ResByArg.TheKind) {
+ case WholeProgramDevirtResolution::ByArg::UniformRetVal:
+ applyUniformRetValOpt(CSByConstantArg.second, "", ResByArg.Info);
+ break;
+ case WholeProgramDevirtResolution::ByArg::UniqueRetVal: {
+ Constant *UniqueMemberAddr =
+ importGlobal(Slot, CSByConstantArg.first, "unique_member");
+ applyUniqueRetValOpt(CSByConstantArg.second, "", ResByArg.Info,
+ UniqueMemberAddr);
+ break;
+ }
+ case WholeProgramDevirtResolution::ByArg::VirtualConstProp: {
+ Constant *Byte = importGlobal(Slot, CSByConstantArg.first, "byte", 32);
+ Byte = ConstantExpr::getPtrToInt(Byte, Int32Ty);
+ Constant *Bit = importGlobal(Slot, CSByConstantArg.first, "bit", 8);
+ Bit = ConstantExpr::getPtrToInt(Bit, Int8Ty);
+ applyVirtualConstProp(CSByConstantArg.second, "", Byte, Bit);
+ break;
+ }
+ default:
+ break;
+ }
+ }
+}
+
+void DevirtModule::removeRedundantTypeTests() {
+ auto True = ConstantInt::getTrue(M.getContext());
+ for (auto &&U : NumUnsafeUsesForTypeTest) {
+ if (U.second == 0) {
+ U.first->replaceAllUsesWith(True);
+ U.first->eraseFromParent();
+ }
+ }
+}
+
bool DevirtModule::run() {
Function *TypeTestFunc =
M.getFunction(Intrinsic::getName(Intrinsic::type_test));
@@ -868,7 +1279,11 @@ bool DevirtModule::run() {
M.getFunction(Intrinsic::getName(Intrinsic::type_checked_load));
Function *AssumeFunc = M.getFunction(Intrinsic::getName(Intrinsic::assume));
- if ((!TypeTestFunc || TypeTestFunc->use_empty() || !AssumeFunc ||
+ // Normally if there are no users of the devirtualization intrinsics in the
+ // module, this pass has nothing to do. But if we are exporting, we also need
+ // to handle any users that appear only in the function summaries.
+ if (!ExportSummary &&
+ (!TypeTestFunc || TypeTestFunc->use_empty() || !AssumeFunc ||
AssumeFunc->use_empty()) &&
(!TypeCheckedLoadFunc || TypeCheckedLoadFunc->use_empty()))
return false;
@@ -879,6 +1294,17 @@ bool DevirtModule::run() {
if (TypeCheckedLoadFunc)
scanTypeCheckedLoadUsers(TypeCheckedLoadFunc);
+ if (ImportSummary) {
+ for (auto &S : CallSlots)
+ importResolution(S.first, S.second);
+
+ removeRedundantTypeTests();
+
+ // The rest of the code is only necessary when exporting or during regular
+ // LTO, so we are done.
+ return true;
+ }
+
// Rebuild type metadata into a map for easy lookup.
std::vector<VTableBits> Bits;
DenseMap<Metadata *, std::set<TypeMemberInfo>> TypeIdMap;
@@ -886,6 +1312,53 @@ bool DevirtModule::run() {
if (TypeIdMap.empty())
return true;
+ // Collect information from summary about which calls to try to devirtualize.
+ if (ExportSummary) {
+ DenseMap<GlobalValue::GUID, TinyPtrVector<Metadata *>> MetadataByGUID;
+ for (auto &P : TypeIdMap) {
+ if (auto *TypeId = dyn_cast<MDString>(P.first))
+ MetadataByGUID[GlobalValue::getGUID(TypeId->getString())].push_back(
+ TypeId);
+ }
+
+ for (auto &P : *ExportSummary) {
+ for (auto &S : P.second) {
+ auto *FS = dyn_cast<FunctionSummary>(S.get());
+ if (!FS)
+ continue;
+ // FIXME: Only add live functions.
+ for (FunctionSummary::VFuncId VF : FS->type_test_assume_vcalls()) {
+ for (Metadata *MD : MetadataByGUID[VF.GUID]) {
+ CallSlots[{MD, VF.Offset}].CSInfo.SummaryHasTypeTestAssumeUsers =
+ true;
+ }
+ }
+ for (FunctionSummary::VFuncId VF : FS->type_checked_load_vcalls()) {
+ for (Metadata *MD : MetadataByGUID[VF.GUID]) {
+ CallSlots[{MD, VF.Offset}]
+ .CSInfo.SummaryTypeCheckedLoadUsers.push_back(FS);
+ }
+ }
+ for (const FunctionSummary::ConstVCall &VC :
+ FS->type_test_assume_const_vcalls()) {
+ for (Metadata *MD : MetadataByGUID[VC.VFunc.GUID]) {
+ CallSlots[{MD, VC.VFunc.Offset}]
+ .ConstCSInfo[VC.Args]
+ .SummaryHasTypeTestAssumeUsers = true;
+ }
+ }
+ for (const FunctionSummary::ConstVCall &VC :
+ FS->type_checked_load_const_vcalls()) {
+ for (Metadata *MD : MetadataByGUID[VC.VFunc.GUID]) {
+ CallSlots[{MD, VC.VFunc.Offset}]
+ .ConstCSInfo[VC.Args]
+ .SummaryTypeCheckedLoadUsers.push_back(FS);
+ }
+ }
+ }
+ }
+ }
+
// For each (type, offset) pair:
bool DidVirtualConstProp = false;
std::map<std::string, Function*> DevirtTargets;
@@ -894,19 +1367,39 @@ bool DevirtModule::run() {
// function implementation at offset S.first.ByteOffset, and add to
// TargetsForSlot.
std::vector<VirtualCallTarget> TargetsForSlot;
- if (!tryFindVirtualCallTargets(TargetsForSlot, TypeIdMap[S.first.TypeID],
- S.first.ByteOffset))
- continue;
-
- if (!trySingleImplDevirt(TargetsForSlot, S.second) &&
- tryVirtualConstProp(TargetsForSlot, S.second))
+ if (tryFindVirtualCallTargets(TargetsForSlot, TypeIdMap[S.first.TypeID],
+ S.first.ByteOffset)) {
+ WholeProgramDevirtResolution *Res = nullptr;
+ if (ExportSummary && isa<MDString>(S.first.TypeID))
+ Res = &ExportSummary
+ ->getOrInsertTypeIdSummary(
+ cast<MDString>(S.first.TypeID)->getString())
+ .WPDRes[S.first.ByteOffset];
+
+ if (!trySingleImplDevirt(TargetsForSlot, S.second, Res) &&
+ tryVirtualConstProp(TargetsForSlot, S.second, Res, S.first))
DidVirtualConstProp = true;
- // Collect functions devirtualized at least for one call site for stats.
- if (RemarksEnabled)
- for (const auto &T : TargetsForSlot)
- if (T.WasDevirt)
- DevirtTargets[T.Fn->getName()] = T.Fn;
+ // Collect functions devirtualized at least for one call site for stats.
+ if (RemarksEnabled)
+ for (const auto &T : TargetsForSlot)
+ if (T.WasDevirt)
+ DevirtTargets[T.Fn->getName()] = T.Fn;
+ }
+
+ // CFI-specific: if we are exporting and any llvm.type.checked.load
+ // intrinsics were *not* devirtualized, we need to add the resulting
+ // llvm.type.test intrinsics to the function summaries so that the
+ // LowerTypeTests pass will export them.
+ if (ExportSummary && isa<MDString>(S.first.TypeID)) {
+ auto GUID =
+ GlobalValue::getGUID(cast<MDString>(S.first.TypeID)->getString());
+ for (auto FS : S.second.CSInfo.SummaryTypeCheckedLoadUsers)
+ FS->addTypeTest(GUID);
+ for (auto &CCS : S.second.ConstCSInfo)
+ for (auto FS : CCS.second.SummaryTypeCheckedLoadUsers)
+ FS->addTypeTest(GUID);
+ }
}
if (RemarksEnabled) {
@@ -914,23 +1407,12 @@ bool DevirtModule::run() {
for (const auto &DT : DevirtTargets) {
Function *F = DT.second;
DISubprogram *SP = F->getSubprogram();
- DebugLoc DL = SP ? DebugLoc::get(SP->getScopeLine(), 0, SP) : DebugLoc();
- emitOptimizationRemark(F->getContext(), DEBUG_TYPE, *F, DL,
+ emitOptimizationRemark(F->getContext(), DEBUG_TYPE, *F, SP,
Twine("devirtualized ") + F->getName());
}
}
- // If we were able to eliminate all unsafe uses for a type checked load,
- // eliminate the type test by replacing it with true.
- if (TypeCheckedLoadFunc) {
- auto True = ConstantInt::getTrue(M.getContext());
- for (auto &&U : NumUnsafeUsesForTypeTest) {
- if (U.second == 0) {
- U.first->replaceAllUsesWith(True);
- U.first->eraseFromParent();
- }
- }
- }
+ removeRedundantTypeTests();
// Rebuild each global we touched as part of virtual constant propagation to
// include the before and after bytes.
diff --git a/lib/Transforms/InstCombine/InstCombineAddSub.cpp b/lib/Transforms/InstCombine/InstCombineAddSub.cpp
index 2d34c1cc74bd..174ec8036274 100644
--- a/lib/Transforms/InstCombine/InstCombineAddSub.cpp
+++ b/lib/Transforms/InstCombine/InstCombineAddSub.cpp
@@ -902,7 +902,7 @@ bool InstCombiner::WillNotOverflowSignedAdd(Value *LHS, Value *RHS,
APInt RHSKnownOne(BitWidth, 0);
computeKnownBits(RHS, RHSKnownZero, RHSKnownOne, 0, &CxtI);
- // Addition of two 2's compliment numbers having opposite signs will never
+ // Addition of two 2's complement numbers having opposite signs will never
// overflow.
if ((LHSKnownOne[BitWidth - 1] && RHSKnownZero[BitWidth - 1]) ||
(LHSKnownZero[BitWidth - 1] && RHSKnownOne[BitWidth - 1]))
@@ -939,7 +939,7 @@ bool InstCombiner::WillNotOverflowSignedSub(Value *LHS, Value *RHS,
APInt RHSKnownOne(BitWidth, 0);
computeKnownBits(RHS, RHSKnownZero, RHSKnownOne, 0, &CxtI);
- // Subtraction of two 2's compliment numbers having identical signs will
+ // Subtraction of two 2's complement numbers having identical signs will
// never overflow.
if ((LHSKnownOne[BitWidth - 1] && RHSKnownOne[BitWidth - 1]) ||
(LHSKnownZero[BitWidth - 1] && RHSKnownZero[BitWidth - 1]))
@@ -1042,43 +1042,42 @@ Instruction *InstCombiner::visitAdd(BinaryOperator &I) {
if (Value *V = SimplifyUsingDistributiveLaws(I))
return replaceInstUsesWith(I, V);
- const APInt *Val;
- if (match(RHS, m_APInt(Val))) {
- // X + (signbit) --> X ^ signbit
- if (Val->isSignBit())
+ const APInt *RHSC;
+ if (match(RHS, m_APInt(RHSC))) {
+ if (RHSC->isSignBit()) {
+ // If wrapping is not allowed, then the addition must set the sign bit:
+ // X + (signbit) --> X | signbit
+ if (I.hasNoSignedWrap() || I.hasNoUnsignedWrap())
+ return BinaryOperator::CreateOr(LHS, RHS);
+
+ // If wrapping is allowed, then the addition flips the sign bit of LHS:
+ // X + (signbit) --> X ^ signbit
return BinaryOperator::CreateXor(LHS, RHS);
+ }
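Worked i8 example: X + 0x80 always flips the top bit of X. nuw means the add cannot carry out of bit 7, and nsw means the signed sum X + (-128) stays >= -128; either way X's top bit must already be 0, so X | 0x80, X ^ 0x80, and X + 0x80 all agree, and the 'or' form additionally records that the sign bit of the result is known set.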
// Is this add the last step in a convoluted sext?
Value *X;
const APInt *C;
if (match(LHS, m_ZExt(m_Xor(m_Value(X), m_APInt(C)))) &&
C->isMinSignedValue() &&
- C->sext(LHS->getType()->getScalarSizeInBits()) == *Val) {
+ C->sext(LHS->getType()->getScalarSizeInBits()) == *RHSC) {
// add(zext(xor i16 X, -32768), -32768) --> sext X
return CastInst::Create(Instruction::SExt, X, LHS->getType());
}
- if (Val->isNegative() &&
+ if (RHSC->isNegative() &&
match(LHS, m_ZExt(m_NUWAdd(m_Value(X), m_APInt(C)))) &&
- Val->sge(-C->sext(Val->getBitWidth()))) {
+ RHSC->sge(-C->sext(RHSC->getBitWidth()))) {
// (add (zext (add nuw X, C)), Val) -> (zext (add nuw X, C+Val))
- return CastInst::Create(
- Instruction::ZExt,
- Builder->CreateNUWAdd(
- X, Constant::getIntegerValue(X->getType(),
- *C + Val->trunc(C->getBitWidth()))),
- I.getType());
+ Constant *NewC =
+ ConstantInt::get(X->getType(), *C + RHSC->trunc(C->getBitWidth()));
+ return new ZExtInst(Builder->CreateNUWAdd(X, NewC), I.getType());
}
}
// FIXME: Use the match above instead of dyn_cast to allow these transforms
// for splat vectors.
if (ConstantInt *CI = dyn_cast<ConstantInt>(RHS)) {
- // See if SimplifyDemandedBits can simplify this. This handles stuff like
- // (X & 254)+1 -> (X&254)|1
- if (SimplifyDemandedInstructionBits(I))
- return &I;
-
// zext(bool) + C -> bool ? C + 1 : C
if (ZExtInst *ZI = dyn_cast<ZExtInst>(LHS))
if (ZI->getSrcTy()->isIntegerTy(1))
@@ -1129,8 +1128,8 @@ Instruction *InstCombiner::visitAdd(BinaryOperator &I) {
}
}
- if (isa<Constant>(RHS) && isa<PHINode>(LHS))
- if (Instruction *NV = FoldOpIntoPhi(I))
+ if (isa<Constant>(RHS))
+ if (Instruction *NV = foldOpWithConstantIntoOperand(I))
return NV;
if (I.getType()->getScalarType()->isIntegerTy(1))
@@ -1201,11 +1200,6 @@ Instruction *InstCombiner::visitAdd(BinaryOperator &I) {
return BinaryOperator::CreateAnd(NewAdd, C2);
}
}
-
- // Try to fold constant add into select arguments.
- if (SelectInst *SI = dyn_cast<SelectInst>(LHS))
- if (Instruction *R = FoldOpIntoSelect(I, SI))
- return R;
}
// add (select X 0 (sub n A)) A --> select X A n
@@ -1253,7 +1247,7 @@ Instruction *InstCombiner::visitAdd(BinaryOperator &I) {
// (add (sext x), (sext y)) --> (sext (add int x, y))
if (SExtInst *RHSConv = dyn_cast<SExtInst>(RHS)) {
- // Only do this if x/y have the same type, if at last one of them has a
+ // Only do this if x/y have the same type, if at least one of them has a
// single use (so we don't increase the number of sexts), and if the
// integer add will not overflow.
if (LHSConv->getOperand(0)->getType() ==
@@ -1290,7 +1284,7 @@ Instruction *InstCombiner::visitAdd(BinaryOperator &I) {
// (add (zext x), (zext y)) --> (zext (add int x, y))
if (auto *RHSConv = dyn_cast<ZExtInst>(RHS)) {
- // Only do this if x/y have the same type, if at last one of them has a
+ // Only do this if x/y have the same type, if at least one of them has a
// single use (so we don't increase the number of zexts), and if the
// integer add will not overflow.
if (LHSConv->getOperand(0)->getType() ==
@@ -1311,13 +1305,11 @@ Instruction *InstCombiner::visitAdd(BinaryOperator &I) {
{
Value *A = nullptr, *B = nullptr;
if (match(RHS, m_Xor(m_Value(A), m_Value(B))) &&
- (match(LHS, m_And(m_Specific(A), m_Specific(B))) ||
- match(LHS, m_And(m_Specific(B), m_Specific(A)))))
+ match(LHS, m_c_And(m_Specific(A), m_Specific(B))))
return BinaryOperator::CreateOr(A, B);
if (match(LHS, m_Xor(m_Value(A), m_Value(B))) &&
- (match(RHS, m_And(m_Specific(A), m_Specific(B))) ||
- match(RHS, m_And(m_Specific(B), m_Specific(A)))))
+ match(RHS, m_c_And(m_Specific(A), m_Specific(B))))
return BinaryOperator::CreateOr(A, B);
}
@@ -1325,8 +1317,7 @@ Instruction *InstCombiner::visitAdd(BinaryOperator &I) {
{
Value *A = nullptr, *B = nullptr;
if (match(RHS, m_Or(m_Value(A), m_Value(B))) &&
- (match(LHS, m_And(m_Specific(A), m_Specific(B))) ||
- match(LHS, m_And(m_Specific(B), m_Specific(A))))) {
+ match(LHS, m_c_And(m_Specific(A), m_Specific(B)))) {
auto *New = BinaryOperator::CreateAdd(A, B);
New->setHasNoSignedWrap(I.hasNoSignedWrap());
New->setHasNoUnsignedWrap(I.hasNoUnsignedWrap());
@@ -1334,8 +1325,7 @@ Instruction *InstCombiner::visitAdd(BinaryOperator &I) {
}
if (match(LHS, m_Or(m_Value(A), m_Value(B))) &&
- (match(RHS, m_And(m_Specific(A), m_Specific(B))) ||
- match(RHS, m_And(m_Specific(B), m_Specific(A))))) {
+ match(RHS, m_c_And(m_Specific(A), m_Specific(B)))) {
auto *New = BinaryOperator::CreateAdd(A, B);
New->setHasNoSignedWrap(I.hasNoSignedWrap());
New->setHasNoUnsignedWrap(I.hasNoUnsignedWrap());
@@ -1394,6 +1384,8 @@ Instruction *InstCombiner::visitFAdd(BinaryOperator &I) {
// Check for (fadd double (sitofp x), y), see if we can merge this into an
// integer add followed by a promotion.
if (SIToFPInst *LHSConv = dyn_cast<SIToFPInst>(LHS)) {
+ Value *LHSIntVal = LHSConv->getOperand(0);
+
// (fadd double (sitofp x), fpcst) --> (sitofp (add int x, intcst))
// ... if the constant fits in the integer value. This is useful for things
// like (double)(x & 1234) + 4.0 -> (double)((X & 1234)+4) which no longer
@@ -1401,12 +1393,12 @@ Instruction *InstCombiner::visitFAdd(BinaryOperator &I) {
// instcombined.
if (ConstantFP *CFP = dyn_cast<ConstantFP>(RHS)) {
Constant *CI =
- ConstantExpr::getFPToSI(CFP, LHSConv->getOperand(0)->getType());
+ ConstantExpr::getFPToSI(CFP, LHSIntVal->getType());
if (LHSConv->hasOneUse() &&
ConstantExpr::getSIToFP(CI, I.getType()) == CFP &&
- WillNotOverflowSignedAdd(LHSConv->getOperand(0), CI, I)) {
+ WillNotOverflowSignedAdd(LHSIntVal, CI, I)) {
// Insert the new integer add.
- Value *NewAdd = Builder->CreateNSWAdd(LHSConv->getOperand(0),
+ Value *NewAdd = Builder->CreateNSWAdd(LHSIntVal,
CI, "addconv");
return new SIToFPInst(NewAdd, I.getType());
}
@@ -1414,17 +1406,17 @@ Instruction *InstCombiner::visitFAdd(BinaryOperator &I) {
// (fadd double (sitofp x), (sitofp y)) --> (sitofp (add int x, y))
if (SIToFPInst *RHSConv = dyn_cast<SIToFPInst>(RHS)) {
- // Only do this if x/y have the same type, if at last one of them has a
+ Value *RHSIntVal = RHSConv->getOperand(0);
+
+ // Only do this if x/y have the same type, if at least one of them has a
// single use (so we don't increase the number of int->fp conversions),
// and if the integer add will not overflow.
- if (LHSConv->getOperand(0)->getType() ==
- RHSConv->getOperand(0)->getType() &&
+ if (LHSIntVal->getType() == RHSIntVal->getType() &&
(LHSConv->hasOneUse() || RHSConv->hasOneUse()) &&
- WillNotOverflowSignedAdd(LHSConv->getOperand(0),
- RHSConv->getOperand(0), I)) {
+ WillNotOverflowSignedAdd(LHSIntVal, RHSIntVal, I)) {
// Insert the new integer add.
- Value *NewAdd = Builder->CreateNSWAdd(LHSConv->getOperand(0),
- RHSConv->getOperand(0),"addconv");
+ Value *NewAdd = Builder->CreateNSWAdd(LHSIntVal,
+ RHSIntVal, "addconv");
return new SIToFPInst(NewAdd, I.getType());
}
}
@@ -1562,7 +1554,7 @@ Instruction *InstCombiner::visitSub(BinaryOperator &I) {
return Res;
}
- if (I.getType()->isIntegerTy(1))
+ if (I.getType()->getScalarType()->isIntegerTy(1))
return BinaryOperator::CreateXor(Op0, Op1);
// Replace (-1 - A) with (~A).
@@ -1580,14 +1572,16 @@ Instruction *InstCombiner::visitSub(BinaryOperator &I) {
if (Instruction *R = FoldOpIntoSelect(I, SI))
return R;
+ // Try to fold constant sub into PHI values.
+ if (PHINode *PN = dyn_cast<PHINode>(Op1))
+ if (Instruction *R = foldOpIntoPhi(I, PN))
+ return R;
+
// C-(X+C2) --> (C-C2)-X
Constant *C2;
if (match(Op1, m_Add(m_Value(X), m_Constant(C2))))
return BinaryOperator::CreateSub(ConstantExpr::getSub(C, C2), X);
- if (SimplifyDemandedInstructionBits(I))
- return &I;
-
// Fold (sub 0, (zext bool to B)) --> (sext bool to B)
if (C->isNullValue() && match(Op1, m_ZExt(m_Value(X))))
if (X->getType()->getScalarType()->isIntegerTy(1))
@@ -1622,11 +1616,11 @@ Instruction *InstCombiner::visitSub(BinaryOperator &I) {
// Turn this into a xor if LHS is 2^n-1 and the remaining bits are known
// zero.
- if ((*Op0C + 1).isPowerOf2()) {
- APInt KnownZero(BitWidth, 0);
- APInt KnownOne(BitWidth, 0);
- computeKnownBits(&I, KnownZero, KnownOne, 0, &I);
- if ((*Op0C | KnownZero).isAllOnesValue())
+ if (Op0C->isMask()) {
+ APInt RHSKnownZero(BitWidth, 0);
+ APInt RHSKnownOne(BitWidth, 0);
+ computeKnownBits(Op1, RHSKnownZero, RHSKnownOne, 0, &I);
+ if ((*Op0C | RHSKnownZero).isAllOnesValue())
return BinaryOperator::CreateXor(Op1, Op0);
}
}
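The rewritten guard checks the same identity as before, but against the right operand: when Op0C is a mask 0...01...1 and Op1 has no bits outside it, the subtraction borrows nothing, so M - X == M ^ X. A standalone check with a hypothetical 4-bit mask (plain C++):

#include <cassert>
#include <cstdint>

int main() {
  const uint8_t M = 0x0F; // low-bit mask, as Op0C->isMask() requires
  for (uint8_t X = 0; X <= M; ++X) // bits outside M known zero
    // With no borrows possible, subtracting clears exactly X's bits.
    assert((uint8_t)(M - X) == (uint8_t)(M ^ X));
  return 0;
}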
@@ -1634,8 +1628,7 @@ Instruction *InstCombiner::visitSub(BinaryOperator &I) {
{
Value *Y;
// X-(X+Y) == -Y X-(Y+X) == -Y
- if (match(Op1, m_Add(m_Specific(Op0), m_Value(Y))) ||
- match(Op1, m_Add(m_Value(Y), m_Specific(Op0))))
+ if (match(Op1, m_c_Add(m_Specific(Op0), m_Value(Y))))
return BinaryOperator::CreateNeg(Y);
// (X-Y)-X == -Y
@@ -1645,18 +1638,16 @@ Instruction *InstCombiner::visitSub(BinaryOperator &I) {
// (sub (or A, B) (xor A, B)) --> (and A, B)
{
- Value *A = nullptr, *B = nullptr;
+ Value *A, *B;
if (match(Op1, m_Xor(m_Value(A), m_Value(B))) &&
- (match(Op0, m_Or(m_Specific(A), m_Specific(B))) ||
- match(Op0, m_Or(m_Specific(B), m_Specific(A)))))
+ match(Op0, m_c_Or(m_Specific(A), m_Specific(B))))
return BinaryOperator::CreateAnd(A, B);
}
- if (Op0->hasOneUse()) {
- Value *Y = nullptr;
+ {
+ Value *Y;
// ((X | Y) - X) --> (~X & Y)
- if (match(Op0, m_Or(m_Value(Y), m_Specific(Op1))) ||
- match(Op0, m_Or(m_Specific(Op1), m_Value(Y))))
+ if (match(Op0, m_OneUse(m_c_Or(m_Value(Y), m_Specific(Op1)))))
return BinaryOperator::CreateAnd(
Y, Builder->CreateNot(Op1, Op1->getName() + ".not"));
}
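The one-use requirement moved into the matcher, but the underlying identity is unchanged: X's bits are always present in X | Y, so the subtraction never borrows and strips exactly those bits. A standalone 8-bit check (plain C++, not part of the patch):

#include <cassert>
#include <cstdint>

int main() {
  for (unsigned X = 0; X < 256; ++X)
    for (unsigned Y = 0; Y < 256; ++Y)
      // ((X | Y) - X) --> (~X & Y)
      assert((uint8_t)((X | Y) - X) == (uint8_t)(~X & Y));
  return 0;
}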
@@ -1664,7 +1655,6 @@ Instruction *InstCombiner::visitSub(BinaryOperator &I) {
if (Op1->hasOneUse()) {
Value *X = nullptr, *Y = nullptr, *Z = nullptr;
Constant *C = nullptr;
- Constant *CI = nullptr;
// (X - (Y - Z)) --> (X + (Z - Y)).
if (match(Op1, m_Sub(m_Value(Y), m_Value(Z))))
@@ -1673,8 +1663,7 @@ Instruction *InstCombiner::visitSub(BinaryOperator &I) {
// (X - (X & Y)) --> (X & ~Y)
//
- if (match(Op1, m_And(m_Value(Y), m_Specific(Op0))) ||
- match(Op1, m_And(m_Specific(Op0), m_Value(Y))))
+ if (match(Op1, m_c_And(m_Value(Y), m_Specific(Op0))))
return BinaryOperator::CreateAnd(Op0,
Builder->CreateNot(Y, Y->getName() + ".not"));
@@ -1702,14 +1691,14 @@ Instruction *InstCombiner::visitSub(BinaryOperator &I) {
// X - A*-B -> X + A*B
// X - -A*B -> X + A*B
Value *A, *B;
- if (match(Op1, m_Mul(m_Value(A), m_Neg(m_Value(B)))) ||
- match(Op1, m_Mul(m_Neg(m_Value(A)), m_Value(B))))
+ Constant *CI;
+ if (match(Op1, m_c_Mul(m_Value(A), m_Neg(m_Value(B)))))
return BinaryOperator::CreateAdd(Op0, Builder->CreateMul(A, B));
// X - A*CI -> X + A*-CI
- // X - CI*A -> X + A*-CI
- if (match(Op1, m_Mul(m_Value(A), m_Constant(CI))) ||
- match(Op1, m_Mul(m_Constant(CI), m_Value(A)))) {
+ // No need to handle commuted multiply because multiply handling will
+ // ensure the constant is moved to the right-hand side.
+ if (match(Op1, m_Mul(m_Value(A), m_Constant(CI)))) {
Value *NewMul = Builder->CreateMul(A, ConstantExpr::getNeg(CI));
return BinaryOperator::CreateAdd(Op0, NewMul);
}
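Both rewrites depend on negation distributing over multiplication in two's-complement arithmetic: X - A*C == X + A*(-C) modulo 2^n. A brute-force check at 8 bits (a standalone sketch, not LLVM code):

#include <cassert>
#include <cstdint>

int main() {
  for (unsigned X = 0; X < 256; ++X)
    for (unsigned A = 0; A < 256; ++A)
      for (unsigned C = 0; C < 256; ++C) {
        uint8_t Sub = (uint8_t)(X - A * C);
        // Negating the constant and adding gives the same wrapped result.
        uint8_t Add = (uint8_t)(X + A * (uint8_t)(0 - C));
        assert(Sub == Add);
      }
  return 0;
}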
diff --git a/lib/Transforms/InstCombine/InstCombineAndOrXor.cpp b/lib/Transforms/InstCombine/InstCombineAndOrXor.cpp
index da5384a86aac..b2a41c699202 100644
--- a/lib/Transforms/InstCombine/InstCombineAndOrXor.cpp
+++ b/lib/Transforms/InstCombine/InstCombineAndOrXor.cpp
@@ -137,9 +137,8 @@ Value *InstCombiner::SimplifyBSwap(BinaryOperator &I) {
}
/// This handles expressions of the form ((val OP C1) & C2), where
-/// the Op parameter is 'OP', OpRHS is 'C1', and AndRHS is 'C2'. Op is
-/// guaranteed to be a binary operator.
-Instruction *InstCombiner::OptAndOp(Instruction *Op,
+/// the Op parameter is 'OP', OpRHS is 'C1', and AndRHS is 'C2'.
+Instruction *InstCombiner::OptAndOp(BinaryOperator *Op,
ConstantInt *OpRHS,
ConstantInt *AndRHS,
BinaryOperator &TheAnd) {
@@ -149,6 +148,7 @@ Instruction *InstCombiner::OptAndOp(Instruction *Op,
Together = ConstantExpr::getAnd(AndRHS, OpRHS);
switch (Op->getOpcode()) {
+ default: break;
case Instruction::Xor:
if (Op->hasOneUse()) {
// (X ^ C1) & C2 --> (X & C2) ^ (C1&C2)
@@ -159,13 +159,6 @@ Instruction *InstCombiner::OptAndOp(Instruction *Op,
break;
case Instruction::Or:
if (Op->hasOneUse()){
- if (Together != OpRHS) {
- // (X | C1) & C2 --> (X | (C1&C2)) & C2
- Value *Or = Builder->CreateOr(X, Together);
- Or->takeName(Op);
- return BinaryOperator::CreateAnd(Or, AndRHS);
- }
-
ConstantInt *TogetherCI = dyn_cast<ConstantInt>(Together);
if (TogetherCI && !TogetherCI->isZero()){
// (X | C1) & C2 --> (X & (C2^(C1&C2))) | C1
@@ -302,178 +295,91 @@ Value *InstCombiner::insertRangeTest(Value *V, const APInt &Lo, const APInt &Hi,
return Builder->CreateICmp(Pred, VMinusLo, HiMinusLo);
}
-/// Returns true iff Val consists of one contiguous run of 1s with any number
-/// of 0s on either side. The 1s are allowed to wrap from LSB to MSB,
-/// so 0x000FFF0, 0x0000FFFF, and 0xFF0000FF are all runs. 0x0F0F0000 is
-/// not, since all 1s are not contiguous.
-static bool isRunOfOnes(ConstantInt *Val, uint32_t &MB, uint32_t &ME) {
- const APInt& V = Val->getValue();
- uint32_t BitWidth = Val->getType()->getBitWidth();
- if (!APIntOps::isShiftedMask(BitWidth, V)) return false;
-
- // look for the first zero bit after the run of ones
- MB = BitWidth - ((V - 1) ^ V).countLeadingZeros();
- // look for the first non-zero bit
- ME = V.getActiveBits();
- return true;
-}
-
-/// This is part of an expression (LHS +/- RHS) & Mask, where isSub determines
-/// whether the operator is a sub. If we can fold one of the following xforms:
+/// Classify (icmp eq (A & B), C) and (icmp ne (A & B), C) as matching patterns
+/// that can be simplified.
+/// One of A and B is considered the mask. The other is the value. This is
+/// described as the "AMask" or "BMask" part of the enum. If the enum contains
+/// only "Mask", then both A and B can be considered masks. If A is the mask,
+/// then it was proven that (A & C) == C. This is trivial if C == A or C == 0.
+/// If both A and C are constants, this proof is also easy.
+/// For the following explanations, we assume that A is the mask.
///
-/// ((A & N) +/- B) & Mask -> (A +/- B) & Mask iff N&Mask == Mask
-/// ((A | N) +/- B) & Mask -> (A +/- B) & Mask iff N&Mask == 0
-/// ((A ^ N) +/- B) & Mask -> (A +/- B) & Mask iff N&Mask == 0
+/// "AllOnes" declares that the comparison is true only if (A & B) == A or all
+/// bits of A are set in B.
+/// Example: (icmp eq (A & 3), 3) -> AMask_AllOnes
///
-/// return (A +/- B).
+/// "AllZeros" declares that the comparison is true only if (A & B) == 0 or all
+/// bits of A are cleared in B.
+/// Example: (icmp eq (A & 3), 0) -> Mask_AllZeros
+///
+/// "Mixed" declares that (A & B) == C and C might or might not contain any
+/// number of one bits and zero bits.
+/// Example: (icmp eq (A & 3), 1) -> AMask_Mixed
+///
+/// "Not" means that in above descriptions "==" should be replaced by "!=".
+/// Example: (icmp ne (A & 3), 3) -> AMask_NotAllOnes
///
-Value *InstCombiner::FoldLogicalPlusAnd(Value *LHS, Value *RHS,
- ConstantInt *Mask, bool isSub,
- Instruction &I) {
- Instruction *LHSI = dyn_cast<Instruction>(LHS);
- if (!LHSI || LHSI->getNumOperands() != 2 ||
- !isa<ConstantInt>(LHSI->getOperand(1))) return nullptr;
-
- ConstantInt *N = cast<ConstantInt>(LHSI->getOperand(1));
-
- switch (LHSI->getOpcode()) {
- default: return nullptr;
- case Instruction::And:
- if (ConstantExpr::getAnd(N, Mask) == Mask) {
- // If the AndRHS is a power of two minus one (0+1+), this is simple.
- if ((Mask->getValue().countLeadingZeros() +
- Mask->getValue().countPopulation()) ==
- Mask->getValue().getBitWidth())
- break;
-
- // Otherwise, if Mask is 0+1+0+, and if B is known to have the low 0+
- // part, we don't need any explicit masks to take them out of A. If that
- // is all N is, ignore it.
- uint32_t MB = 0, ME = 0;
- if (isRunOfOnes(Mask, MB, ME)) { // begin/end bit of run, inclusive
- uint32_t BitWidth = cast<IntegerType>(RHS->getType())->getBitWidth();
- APInt Mask(APInt::getLowBitsSet(BitWidth, MB-1));
- if (MaskedValueIsZero(RHS, Mask, 0, &I))
- break;
- }
- }
- return nullptr;
- case Instruction::Or:
- case Instruction::Xor:
- // If the AndRHS is a power of two minus one (0+1+), and N&Mask == 0
- if ((Mask->getValue().countLeadingZeros() +
- Mask->getValue().countPopulation()) == Mask->getValue().getBitWidth()
- && ConstantExpr::getAnd(N, Mask)->isNullValue())
- break;
- return nullptr;
- }
-
- if (isSub)
- return Builder->CreateSub(LHSI->getOperand(0), RHS, "fold");
- return Builder->CreateAdd(LHSI->getOperand(0), RHS, "fold");
-}
-
-/// enum for classifying (icmp eq (A & B), C) and (icmp ne (A & B), C)
-/// One of A and B is considered the mask, the other the value. This is
-/// described as the "AMask" or "BMask" part of the enum. If the enum
-/// contains only "Mask", then both A and B can be considered masks.
-/// If A is the mask, then it was proven, that (A & C) == C. This
-/// is trivial if C == A, or C == 0. If both A and C are constants, this
-/// proof is also easy.
-/// For the following explanations we assume that A is the mask.
-/// The part "AllOnes" declares, that the comparison is true only
-/// if (A & B) == A, or all bits of A are set in B.
-/// Example: (icmp eq (A & 3), 3) -> FoldMskICmp_AMask_AllOnes
-/// The part "AllZeroes" declares, that the comparison is true only
-/// if (A & B) == 0, or all bits of A are cleared in B.
-/// Example: (icmp eq (A & 3), 0) -> FoldMskICmp_Mask_AllZeroes
-/// The part "Mixed" declares, that (A & B) == C and C might or might not
-/// contain any number of one bits and zero bits.
-/// Example: (icmp eq (A & 3), 1) -> FoldMskICmp_AMask_Mixed
-/// The Part "Not" means, that in above descriptions "==" should be replaced
-/// by "!=".
-/// Example: (icmp ne (A & 3), 3) -> FoldMskICmp_AMask_NotAllOnes
/// If the mask A contains a single bit, then the following are equivalent:
/// (icmp eq (A & B), A) equals (icmp ne (A & B), 0)
/// (icmp ne (A & B), A) equals (icmp eq (A & B), 0)
enum MaskedICmpType {
- FoldMskICmp_AMask_AllOnes = 1,
- FoldMskICmp_AMask_NotAllOnes = 2,
- FoldMskICmp_BMask_AllOnes = 4,
- FoldMskICmp_BMask_NotAllOnes = 8,
- FoldMskICmp_Mask_AllZeroes = 16,
- FoldMskICmp_Mask_NotAllZeroes = 32,
- FoldMskICmp_AMask_Mixed = 64,
- FoldMskICmp_AMask_NotMixed = 128,
- FoldMskICmp_BMask_Mixed = 256,
- FoldMskICmp_BMask_NotMixed = 512
+ AMask_AllOnes = 1,
+ AMask_NotAllOnes = 2,
+ BMask_AllOnes = 4,
+ BMask_NotAllOnes = 8,
+ Mask_AllZeros = 16,
+ Mask_NotAllZeros = 32,
+ AMask_Mixed = 64,
+ AMask_NotMixed = 128,
+ BMask_Mixed = 256,
+ BMask_NotMixed = 512
};
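The doc comment's last claim, that a single-bit mask makes the AllOnes form coincide with the negated AllZeros form, can be verified exhaustively. A standalone sketch (plain C++; the particular bit chosen is arbitrary):

#include <cassert>
#include <cstdint>

int main() {
  const uint8_t A = 0x08; // a single-bit mask
  for (unsigned B = 0; B < 256; ++B) {
    // (A & B) is either 0 or A, so the two predicates coincide.
    assert(((A & B) == A) == ((A & B) != 0));
    assert(((A & B) != A) == ((A & B) == 0));
  }
  return 0;
}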
-/// Return the set of pattern classes (from MaskedICmpType)
-/// that (icmp SCC (A & B), C) satisfies.
-static unsigned getTypeOfMaskedICmp(Value* A, Value* B, Value* C,
- ICmpInst::Predicate SCC)
-{
+/// Return the set of patterns (from MaskedICmpType) that (icmp SCC (A & B), C)
+/// satisfies.
+static unsigned getMaskedICmpType(Value *A, Value *B, Value *C,
+ ICmpInst::Predicate Pred) {
ConstantInt *ACst = dyn_cast<ConstantInt>(A);
ConstantInt *BCst = dyn_cast<ConstantInt>(B);
ConstantInt *CCst = dyn_cast<ConstantInt>(C);
- bool icmp_eq = (SCC == ICmpInst::ICMP_EQ);
- bool icmp_abit = (ACst && !ACst->isZero() &&
- ACst->getValue().isPowerOf2());
- bool icmp_bbit = (BCst && !BCst->isZero() &&
- BCst->getValue().isPowerOf2());
- unsigned result = 0;
+ bool IsEq = (Pred == ICmpInst::ICMP_EQ);
+ bool IsAPow2 = (ACst && !ACst->isZero() && ACst->getValue().isPowerOf2());
+ bool IsBPow2 = (BCst && !BCst->isZero() && BCst->getValue().isPowerOf2());
+ unsigned MaskVal = 0;
if (CCst && CCst->isZero()) {
// if C is zero, then both A and B qualify as masks
- result |= (icmp_eq ? (FoldMskICmp_Mask_AllZeroes |
- FoldMskICmp_AMask_Mixed |
- FoldMskICmp_BMask_Mixed)
- : (FoldMskICmp_Mask_NotAllZeroes |
- FoldMskICmp_AMask_NotMixed |
- FoldMskICmp_BMask_NotMixed));
- if (icmp_abit)
- result |= (icmp_eq ? (FoldMskICmp_AMask_NotAllOnes |
- FoldMskICmp_AMask_NotMixed)
- : (FoldMskICmp_AMask_AllOnes |
- FoldMskICmp_AMask_Mixed));
- if (icmp_bbit)
- result |= (icmp_eq ? (FoldMskICmp_BMask_NotAllOnes |
- FoldMskICmp_BMask_NotMixed)
- : (FoldMskICmp_BMask_AllOnes |
- FoldMskICmp_BMask_Mixed));
- return result;
+ MaskVal |= (IsEq ? (Mask_AllZeros | AMask_Mixed | BMask_Mixed)
+ : (Mask_NotAllZeros | AMask_NotMixed | BMask_NotMixed));
+ if (IsAPow2)
+ MaskVal |= (IsEq ? (AMask_NotAllOnes | AMask_NotMixed)
+ : (AMask_AllOnes | AMask_Mixed));
+ if (IsBPow2)
+ MaskVal |= (IsEq ? (BMask_NotAllOnes | BMask_NotMixed)
+ : (BMask_AllOnes | BMask_Mixed));
+ return MaskVal;
}
+
if (A == C) {
- result |= (icmp_eq ? (FoldMskICmp_AMask_AllOnes |
- FoldMskICmp_AMask_Mixed)
- : (FoldMskICmp_AMask_NotAllOnes |
- FoldMskICmp_AMask_NotMixed));
- if (icmp_abit)
- result |= (icmp_eq ? (FoldMskICmp_Mask_NotAllZeroes |
- FoldMskICmp_AMask_NotMixed)
- : (FoldMskICmp_Mask_AllZeroes |
- FoldMskICmp_AMask_Mixed));
- } else if (ACst && CCst &&
- ConstantExpr::getAnd(ACst, CCst) == CCst) {
- result |= (icmp_eq ? FoldMskICmp_AMask_Mixed
- : FoldMskICmp_AMask_NotMixed);
+ MaskVal |= (IsEq ? (AMask_AllOnes | AMask_Mixed)
+ : (AMask_NotAllOnes | AMask_NotMixed));
+ if (IsAPow2)
+ MaskVal |= (IsEq ? (Mask_NotAllZeros | AMask_NotMixed)
+ : (Mask_AllZeros | AMask_Mixed));
+ } else if (ACst && CCst && ConstantExpr::getAnd(ACst, CCst) == CCst) {
+ MaskVal |= (IsEq ? AMask_Mixed : AMask_NotMixed);
}
+
if (B == C) {
- result |= (icmp_eq ? (FoldMskICmp_BMask_AllOnes |
- FoldMskICmp_BMask_Mixed)
- : (FoldMskICmp_BMask_NotAllOnes |
- FoldMskICmp_BMask_NotMixed));
- if (icmp_bbit)
- result |= (icmp_eq ? (FoldMskICmp_Mask_NotAllZeroes |
- FoldMskICmp_BMask_NotMixed)
- : (FoldMskICmp_Mask_AllZeroes |
- FoldMskICmp_BMask_Mixed));
- } else if (BCst && CCst &&
- ConstantExpr::getAnd(BCst, CCst) == CCst) {
- result |= (icmp_eq ? FoldMskICmp_BMask_Mixed
- : FoldMskICmp_BMask_NotMixed);
- }
- return result;
+ MaskVal |= (IsEq ? (BMask_AllOnes | BMask_Mixed)
+ : (BMask_NotAllOnes | BMask_NotMixed));
+ if (IsBPow2)
+ MaskVal |= (IsEq ? (Mask_NotAllZeros | BMask_NotMixed)
+ : (Mask_AllZeros | BMask_Mixed));
+ } else if (BCst && CCst && ConstantExpr::getAnd(BCst, CCst) == CCst) {
+ MaskVal |= (IsEq ? BMask_Mixed : BMask_NotMixed);
+ }
+
+ return MaskVal;
}
/// Convert an analysis of a masked ICmp into its equivalent if all boolean
@@ -482,32 +388,30 @@ static unsigned getTypeOfMaskedICmp(Value* A, Value* B, Value* C,
/// involves swapping those bits over.
static unsigned conjugateICmpMask(unsigned Mask) {
unsigned NewMask;
- NewMask = (Mask & (FoldMskICmp_AMask_AllOnes | FoldMskICmp_BMask_AllOnes |
- FoldMskICmp_Mask_AllZeroes | FoldMskICmp_AMask_Mixed |
- FoldMskICmp_BMask_Mixed))
+ NewMask = (Mask & (AMask_AllOnes | BMask_AllOnes | Mask_AllZeros |
+ AMask_Mixed | BMask_Mixed))
<< 1;
- NewMask |=
- (Mask & (FoldMskICmp_AMask_NotAllOnes | FoldMskICmp_BMask_NotAllOnes |
- FoldMskICmp_Mask_NotAllZeroes | FoldMskICmp_AMask_NotMixed |
- FoldMskICmp_BMask_NotMixed))
- >> 1;
+ NewMask |= (Mask & (AMask_NotAllOnes | BMask_NotAllOnes | Mask_NotAllZeros |
+ AMask_NotMixed | BMask_NotMixed))
+ >> 1;
return NewMask;
}
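Each "Not" class sits one bit above its positive counterpart, so conjugation is the pairwise bit swap implemented above, and applying it twice must be the identity. A standalone check of that involution (plain C++ mirroring the enum values, not LLVM code):

#include <cassert>

int main() {
  // Positive classes occupy bits 1|4|16|64|256; each "Not" class is the
  // next bit up, so conjugation is a pairwise bit swap.
  const unsigned PosBits = 1 | 4 | 16 | 64 | 256;
  const unsigned NegBits = PosBits << 1;
  for (unsigned Mask = 0; Mask < 1024; ++Mask) {
    unsigned Conj = ((Mask & PosBits) << 1) | ((Mask & NegBits) >> 1);
    unsigned Twice = ((Conj & PosBits) << 1) | ((Conj & NegBits) >> 1);
    assert(Twice == Mask); // conjugation is an involution
  }
  return 0;
}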
-/// Handle (icmp(A & B) ==/!= C) &/| (icmp(A & D) ==/!= E)
-/// Return the set of pattern classes (from MaskedICmpType)
-/// that both LHS and RHS satisfy.
-static unsigned foldLogOpOfMaskedICmpsHelper(Value*& A,
- Value*& B, Value*& C,
- Value*& D, Value*& E,
- ICmpInst *LHS, ICmpInst *RHS,
- ICmpInst::Predicate &LHSCC,
- ICmpInst::Predicate &RHSCC) {
- if (LHS->getOperand(0)->getType() != RHS->getOperand(0)->getType()) return 0;
+/// Handle (icmp(A & B) ==/!= C) &/| (icmp(A & D) ==/!= E).
+/// Return the set of pattern classes (from MaskedICmpType) that both LHS and
+/// RHS satisfy.
+static unsigned getMaskedTypeForICmpPair(Value *&A, Value *&B, Value *&C,
+ Value *&D, Value *&E, ICmpInst *LHS,
+ ICmpInst *RHS,
+ ICmpInst::Predicate &PredL,
+ ICmpInst::Predicate &PredR) {
+ if (LHS->getOperand(0)->getType() != RHS->getOperand(0)->getType())
+ return 0;
// vectors are not (yet?) supported
- if (LHS->getOperand(0)->getType()->isVectorTy()) return 0;
+ if (LHS->getOperand(0)->getType()->isVectorTy())
+ return 0;
// Here comes the tricky part:
// LHS might be of the form L11 & L12 == X, X == L21 & L22,
@@ -517,9 +421,9 @@ static unsigned foldLogOpOfMaskedICmpsHelper(Value*& A,
// above.
Value *L1 = LHS->getOperand(0);
Value *L2 = LHS->getOperand(1);
- Value *L11,*L12,*L21,*L22;
+ Value *L11, *L12, *L21, *L22;
// Check whether the icmp can be decomposed into a bit test.
- if (decomposeBitTestICmp(LHS, LHSCC, L11, L12, L2)) {
+ if (decomposeBitTestICmp(LHS, PredL, L11, L12, L2)) {
L21 = L22 = L1 = nullptr;
} else {
// Look for ANDs in the LHS icmp.
@@ -543,22 +447,26 @@ static unsigned foldLogOpOfMaskedICmpsHelper(Value*& A,
}
// Bail if LHS was an icmp that can't be decomposed into an equality.
- if (!ICmpInst::isEquality(LHSCC))
+ if (!ICmpInst::isEquality(PredL))
return 0;
Value *R1 = RHS->getOperand(0);
Value *R2 = RHS->getOperand(1);
- Value *R11,*R12;
- bool ok = false;
- if (decomposeBitTestICmp(RHS, RHSCC, R11, R12, R2)) {
+ Value *R11, *R12;
+ bool Ok = false;
+ if (decomposeBitTestICmp(RHS, PredR, R11, R12, R2)) {
if (R11 == L11 || R11 == L12 || R11 == L21 || R11 == L22) {
- A = R11; D = R12;
+ A = R11;
+ D = R12;
} else if (R12 == L11 || R12 == L12 || R12 == L21 || R12 == L22) {
- A = R12; D = R11;
+ A = R12;
+ D = R11;
} else {
return 0;
}
- E = R2; R1 = nullptr; ok = true;
+ E = R2;
+ R1 = nullptr;
+ Ok = true;
} else if (R1->getType()->isIntegerTy()) {
if (!match(R1, m_And(m_Value(R11), m_Value(R12)))) {
// As before, model no mask as a trivial mask if it'll let us do an
@@ -568,46 +476,62 @@ static unsigned foldLogOpOfMaskedICmpsHelper(Value*& A,
}
if (R11 == L11 || R11 == L12 || R11 == L21 || R11 == L22) {
- A = R11; D = R12; E = R2; ok = true;
+ A = R11;
+ D = R12;
+ E = R2;
+ Ok = true;
} else if (R12 == L11 || R12 == L12 || R12 == L21 || R12 == L22) {
- A = R12; D = R11; E = R2; ok = true;
+ A = R12;
+ D = R11;
+ E = R2;
+ Ok = true;
}
}
// Bail if RHS was an icmp that can't be decomposed into an equality.
- if (!ICmpInst::isEquality(RHSCC))
+ if (!ICmpInst::isEquality(PredR))
return 0;
// Look for ANDs on the right side of the RHS icmp.
- if (!ok && R2->getType()->isIntegerTy()) {
+ if (!Ok && R2->getType()->isIntegerTy()) {
if (!match(R2, m_And(m_Value(R11), m_Value(R12)))) {
R11 = R2;
R12 = Constant::getAllOnesValue(R2->getType());
}
if (R11 == L11 || R11 == L12 || R11 == L21 || R11 == L22) {
- A = R11; D = R12; E = R1; ok = true;
+ A = R11;
+ D = R12;
+ E = R1;
+ Ok = true;
} else if (R12 == L11 || R12 == L12 || R12 == L21 || R12 == L22) {
- A = R12; D = R11; E = R1; ok = true;
+ A = R12;
+ D = R11;
+ E = R1;
+ Ok = true;
} else {
return 0;
}
}
- if (!ok)
+ if (!Ok)
return 0;
if (L11 == A) {
- B = L12; C = L2;
+ B = L12;
+ C = L2;
} else if (L12 == A) {
- B = L11; C = L2;
+ B = L11;
+ C = L2;
} else if (L21 == A) {
- B = L22; C = L1;
+ B = L22;
+ C = L1;
} else if (L22 == A) {
- B = L21; C = L1;
+ B = L21;
+ C = L1;
}
- unsigned LeftType = getTypeOfMaskedICmp(A, B, C, LHSCC);
- unsigned RightType = getTypeOfMaskedICmp(A, D, E, RHSCC);
+ unsigned LeftType = getMaskedICmpType(A, B, C, PredL);
+ unsigned RightType = getMaskedICmpType(A, D, E, PredR);
return LeftType & RightType;
}
@@ -616,12 +540,14 @@ static unsigned foldLogOpOfMaskedICmpsHelper(Value*& A,
static Value *foldLogOpOfMaskedICmps(ICmpInst *LHS, ICmpInst *RHS, bool IsAnd,
llvm::InstCombiner::BuilderTy *Builder) {
Value *A = nullptr, *B = nullptr, *C = nullptr, *D = nullptr, *E = nullptr;
- ICmpInst::Predicate LHSCC = LHS->getPredicate(), RHSCC = RHS->getPredicate();
- unsigned Mask = foldLogOpOfMaskedICmpsHelper(A, B, C, D, E, LHS, RHS,
- LHSCC, RHSCC);
- if (Mask == 0) return nullptr;
- assert(ICmpInst::isEquality(LHSCC) && ICmpInst::isEquality(RHSCC) &&
- "foldLogOpOfMaskedICmpsHelper must return an equality predicate.");
+ ICmpInst::Predicate PredL = LHS->getPredicate(), PredR = RHS->getPredicate();
+ unsigned Mask =
+ getMaskedTypeForICmpPair(A, B, C, D, E, LHS, RHS, PredL, PredR);
+ if (Mask == 0)
+ return nullptr;
+
+ assert(ICmpInst::isEquality(PredL) && ICmpInst::isEquality(PredR) &&
+ "Expected equality predicates for masked type of icmps.");
// In full generality:
// (icmp (A & B) Op C) | (icmp (A & D) Op E)
@@ -642,7 +568,7 @@ static Value *foldLogOpOfMaskedICmps(ICmpInst *LHS, ICmpInst *RHS, bool IsAnd,
Mask = conjugateICmpMask(Mask);
}
- if (Mask & FoldMskICmp_Mask_AllZeroes) {
+ if (Mask & Mask_AllZeros) {
// (icmp eq (A & B), 0) & (icmp eq (A & D), 0)
// -> (icmp eq (A & (B|D)), 0)
Value *NewOr = Builder->CreateOr(B, D);
@@ -653,14 +579,14 @@ static Value *foldLogOpOfMaskedICmps(ICmpInst *LHS, ICmpInst *RHS, bool IsAnd,
Value *Zero = Constant::getNullValue(A->getType());
return Builder->CreateICmp(NewCC, NewAnd, Zero);
}
- if (Mask & FoldMskICmp_BMask_AllOnes) {
+ if (Mask & BMask_AllOnes) {
// (icmp eq (A & B), B) & (icmp eq (A & D), D)
// -> (icmp eq (A & (B|D)), (B|D))
Value *NewOr = Builder->CreateOr(B, D);
Value *NewAnd = Builder->CreateAnd(A, NewOr);
return Builder->CreateICmp(NewCC, NewAnd, NewOr);
}
- if (Mask & FoldMskICmp_AMask_AllOnes) {
+ if (Mask & AMask_AllOnes) {
// (icmp eq (A & B), A) & (icmp eq (A & D), A)
// -> (icmp eq (A & (B&D)), A)
Value *NewAnd1 = Builder->CreateAnd(B, D);
@@ -672,11 +598,13 @@ static Value *foldLogOpOfMaskedICmps(ICmpInst *LHS, ICmpInst *RHS, bool IsAnd,
// their actual values. This isn't strictly necessary, just a "handle the
// easy cases for now" decision.
ConstantInt *BCst = dyn_cast<ConstantInt>(B);
- if (!BCst) return nullptr;
+ if (!BCst)
+ return nullptr;
ConstantInt *DCst = dyn_cast<ConstantInt>(D);
- if (!DCst) return nullptr;
+ if (!DCst)
+ return nullptr;
- if (Mask & (FoldMskICmp_Mask_NotAllZeroes | FoldMskICmp_BMask_NotAllOnes)) {
+ if (Mask & (Mask_NotAllZeros | BMask_NotAllOnes)) {
// (icmp ne (A & B), 0) & (icmp ne (A & D), 0) and
// (icmp ne (A & B), B) & (icmp ne (A & D), D)
// -> (icmp ne (A & B), 0) or (icmp ne (A & D), 0)
@@ -689,7 +617,8 @@ static Value *foldLogOpOfMaskedICmps(ICmpInst *LHS, ICmpInst *RHS, bool IsAnd,
else if (NewMask == DCst->getValue())
return RHS;
}
- if (Mask & FoldMskICmp_AMask_NotAllOnes) {
+
+ if (Mask & AMask_NotAllOnes) {
// (icmp ne (A & B), B) & (icmp ne (A & D), D)
// -> (icmp ne (A & B), A) or (icmp ne (A & D), A)
// Only valid if one of the masks is a superset of the other (check "B|D" is
@@ -701,7 +630,8 @@ static Value *foldLogOpOfMaskedICmps(ICmpInst *LHS, ICmpInst *RHS, bool IsAnd,
else if (NewMask == DCst->getValue())
return RHS;
}
- if (Mask & FoldMskICmp_BMask_Mixed) {
+
+ if (Mask & BMask_Mixed) {
// (icmp eq (A & B), C) & (icmp eq (A & D), E)
// We already know that B & C == C && D & E == E.
// If we can prove that (B & D) & (C ^ E) == 0, that is, the bits of
@@ -713,23 +643,28 @@ static Value *foldLogOpOfMaskedICmps(ICmpInst *LHS, ICmpInst *RHS, bool IsAnd,
// (icmp ne (A & B), B) & (icmp eq (A & D), D)
// with B and D, having a single bit set.
ConstantInt *CCst = dyn_cast<ConstantInt>(C);
- if (!CCst) return nullptr;
+ if (!CCst)
+ return nullptr;
ConstantInt *ECst = dyn_cast<ConstantInt>(E);
- if (!ECst) return nullptr;
- if (LHSCC != NewCC)
+ if (!ECst)
+ return nullptr;
+ if (PredL != NewCC)
CCst = cast<ConstantInt>(ConstantExpr::getXor(BCst, CCst));
- if (RHSCC != NewCC)
+ if (PredR != NewCC)
ECst = cast<ConstantInt>(ConstantExpr::getXor(DCst, ECst));
+
// If there is a conflict, we should actually return a false for the
// whole construct.
if (((BCst->getValue() & DCst->getValue()) &
(CCst->getValue() ^ ECst->getValue())) != 0)
return ConstantInt::get(LHS->getType(), !IsAnd);
+
Value *NewOr1 = Builder->CreateOr(B, D);
Value *NewOr2 = ConstantExpr::getOr(CCst, ECst);
Value *NewAnd = Builder->CreateAnd(A, NewOr1);
return Builder->CreateICmp(NewCC, NewAnd, NewOr2);
}
+
return nullptr;
}
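The "positive" folds in this function are plain set algebra; the Mask_AllZeros case, for instance, says A is disjoint from B and from D exactly when it is disjoint from their union. A brute-force check at 8 bits (standalone C++, not part of the patch):

#include <cassert>

int main() {
  for (unsigned A = 0; A < 256; ++A)
    for (unsigned B = 0; B < 256; ++B)
      for (unsigned D = 0; D < 256; ++D) {
        // (icmp eq (A & B), 0) & (icmp eq (A & D), 0)
        //   <=> (icmp eq (A & (B | D)), 0)
        bool Orig = ((A & B) == 0) && ((A & D) == 0);
        bool Folded = (A & (B | D)) == 0;
        assert(Orig == Folded);
      }
  return 0;
}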
@@ -789,12 +724,67 @@ Value *InstCombiner::simplifyRangeCheck(ICmpInst *Cmp0, ICmpInst *Cmp1,
return Builder->CreateICmp(NewPred, Input, RangeEnd);
}
+static Value *
+foldAndOrOfEqualityCmpsWithConstants(ICmpInst *LHS, ICmpInst *RHS,
+ bool JoinedByAnd,
+ InstCombiner::BuilderTy *Builder) {
+ Value *X = LHS->getOperand(0);
+ if (X != RHS->getOperand(0))
+ return nullptr;
+
+ const APInt *C1, *C2;
+ if (!match(LHS->getOperand(1), m_APInt(C1)) ||
+ !match(RHS->getOperand(1), m_APInt(C2)))
+ return nullptr;
+
+ // We only handle (X != C1 && X != C2) and (X == C1 || X == C2).
+ ICmpInst::Predicate Pred = LHS->getPredicate();
+ if (Pred != RHS->getPredicate())
+ return nullptr;
+ if (JoinedByAnd && Pred != ICmpInst::ICMP_NE)
+ return nullptr;
+ if (!JoinedByAnd && Pred != ICmpInst::ICMP_EQ)
+ return nullptr;
+
+ // The larger unsigned constant goes on the right.
+ if (C1->ugt(*C2))
+ std::swap(C1, C2);
+
+ APInt Xor = *C1 ^ *C2;
+ if (Xor.isPowerOf2()) {
+ // If C1 and C2 differ by only one bit, then set that bit in X and
+ // compare against the larger constant:
+ // (X == C1 || X == C2) --> (X | (C1 ^ C2)) == C2
+ // (X != C1 && X != C2) --> (X | (C1 ^ C2)) != C2
+ // We choose an 'or' with a Pow2 constant rather than the inverse mask with
+ // 'and' because that may lead to smaller codegen from a smaller constant.
+ Value *Or = Builder->CreateOr(X, ConstantInt::get(X->getType(), Xor));
+ return Builder->CreateICmp(Pred, Or, ConstantInt::get(X->getType(), *C2));
+ }
+
+ // Special case: get the ordering right when the values wrap around zero.
+ // I.e., we assumed the constants were unsigned when swapping earlier.
+ if (*C1 == 0 && C2->isAllOnesValue())
+ std::swap(C1, C2);
+
+ if (*C1 == *C2 - 1) {
+ // (X == 13 || X == 14) --> X - 13 <=u 1
+ // (X != 13 && X != 14) --> X - 13 >u 1
+ // An 'add' is the canonical IR form, so favor that over a 'sub'.
+ Value *Add = Builder->CreateAdd(X, ConstantInt::get(X->getType(), -(*C1)));
+ auto NewPred = JoinedByAnd ? ICmpInst::ICMP_UGT : ICmpInst::ICMP_ULE;
+ return Builder->CreateICmp(NewPred, Add, ConstantInt::get(X->getType(), 1));
+ }
+
+ return nullptr;
+}
+
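Both rewrites in this new helper are easy to validate exhaustively at a small width: the 'or' trick when the constants differ in one bit, and the add-based range test when they are consecutive. A standalone check at 8 bits (plain C++, not part of the patch):

#include <cassert>
#include <cstdint>

int main() {
  for (unsigned C1 = 0; C1 < 256; ++C1)
    for (unsigned C2 = 0; C2 < 256; ++C2) {
      unsigned Lo = C1 < C2 ? C1 : C2, Hi = C1 < C2 ? C2 : C1;
      unsigned Xor = Lo ^ Hi;
      bool OneBitApart = Xor != 0 && (Xor & (Xor - 1)) == 0;
      for (unsigned X = 0; X < 256; ++X) {
        bool Orig = (X == C1) || (X == C2);
        if (OneBitApart) // (X == C1 || X == C2) --> (X | (C1 ^ C2)) == Hi
          assert(Orig == ((X | Xor) == Hi));
        if (Hi == Lo + 1) // (X == 13 || X == 14) --> X - 13 <=u 1
          assert(Orig == ((uint8_t)(X - Lo) <= 1));
      }
    }
  return 0;
}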
/// Fold (icmp)&(icmp) if possible.
Value *InstCombiner::FoldAndOfICmps(ICmpInst *LHS, ICmpInst *RHS) {
- ICmpInst::Predicate LHSCC = LHS->getPredicate(), RHSCC = RHS->getPredicate();
+ ICmpInst::Predicate PredL = LHS->getPredicate(), PredR = RHS->getPredicate();
// (icmp1 A, B) & (icmp2 A, B) --> (icmp3 A, B)
- if (PredicatesFoldable(LHSCC, RHSCC)) {
+ if (PredicatesFoldable(PredL, PredR)) {
if (LHS->getOperand(0) == RHS->getOperand(1) &&
LHS->getOperand(1) == RHS->getOperand(0))
LHS->swapOperands();
@@ -819,86 +809,90 @@ Value *InstCombiner::FoldAndOfICmps(ICmpInst *LHS, ICmpInst *RHS) {
if (Value *V = simplifyRangeCheck(RHS, LHS, /*Inverted=*/false))
return V;
+ if (Value *V = foldAndOrOfEqualityCmpsWithConstants(LHS, RHS, true, Builder))
+ return V;
+
// This only handles icmp of constants: (icmp1 A, C1) & (icmp2 B, C2).
- Value *Val = LHS->getOperand(0), *Val2 = RHS->getOperand(0);
- ConstantInt *LHSCst = dyn_cast<ConstantInt>(LHS->getOperand(1));
- ConstantInt *RHSCst = dyn_cast<ConstantInt>(RHS->getOperand(1));
- if (!LHSCst || !RHSCst) return nullptr;
+ Value *LHS0 = LHS->getOperand(0), *RHS0 = RHS->getOperand(0);
+ ConstantInt *LHSC = dyn_cast<ConstantInt>(LHS->getOperand(1));
+ ConstantInt *RHSC = dyn_cast<ConstantInt>(RHS->getOperand(1));
+ if (!LHSC || !RHSC)
+ return nullptr;
- if (LHSCst == RHSCst && LHSCC == RHSCC) {
+ if (LHSC == RHSC && PredL == PredR) {
// (icmp ult A, C) & (icmp ult B, C) --> (icmp ult (A|B), C)
// where C is a power of 2 or
// (icmp eq A, 0) & (icmp eq B, 0) --> (icmp eq (A|B), 0)
- if ((LHSCC == ICmpInst::ICMP_ULT && LHSCst->getValue().isPowerOf2()) ||
- (LHSCC == ICmpInst::ICMP_EQ && LHSCst->isZero())) {
- Value *NewOr = Builder->CreateOr(Val, Val2);
- return Builder->CreateICmp(LHSCC, NewOr, LHSCst);
+ if ((PredL == ICmpInst::ICMP_ULT && LHSC->getValue().isPowerOf2()) ||
+ (PredL == ICmpInst::ICMP_EQ && LHSC->isZero())) {
+ Value *NewOr = Builder->CreateOr(LHS0, RHS0);
+ return Builder->CreateICmp(PredL, NewOr, LHSC);
}
}
// (trunc x) == C1 & (and x, CA) == C2 -> (and x, CA|CMAX) == C1|C2
// where CMAX is the all ones value for the truncated type,
// iff the lower bits of C2 and CA are zero.
- if (LHSCC == ICmpInst::ICMP_EQ && LHSCC == RHSCC &&
- LHS->hasOneUse() && RHS->hasOneUse()) {
+ if (PredL == ICmpInst::ICMP_EQ && PredL == PredR && LHS->hasOneUse() &&
+ RHS->hasOneUse()) {
Value *V;
- ConstantInt *AndCst, *SmallCst = nullptr, *BigCst = nullptr;
+ ConstantInt *AndC, *SmallC = nullptr, *BigC = nullptr;
// (trunc x) == C1 & (and x, CA) == C2
// (and x, CA) == C2 & (trunc x) == C1
- if (match(Val2, m_Trunc(m_Value(V))) &&
- match(Val, m_And(m_Specific(V), m_ConstantInt(AndCst)))) {
- SmallCst = RHSCst;
- BigCst = LHSCst;
- } else if (match(Val, m_Trunc(m_Value(V))) &&
- match(Val2, m_And(m_Specific(V), m_ConstantInt(AndCst)))) {
- SmallCst = LHSCst;
- BigCst = RHSCst;
+ if (match(RHS0, m_Trunc(m_Value(V))) &&
+ match(LHS0, m_And(m_Specific(V), m_ConstantInt(AndC)))) {
+ SmallC = RHSC;
+ BigC = LHSC;
+ } else if (match(LHS0, m_Trunc(m_Value(V))) &&
+ match(RHS0, m_And(m_Specific(V), m_ConstantInt(AndC)))) {
+ SmallC = LHSC;
+ BigC = RHSC;
}
- if (SmallCst && BigCst) {
- unsigned BigBitSize = BigCst->getType()->getBitWidth();
- unsigned SmallBitSize = SmallCst->getType()->getBitWidth();
+ if (SmallC && BigC) {
+ unsigned BigBitSize = BigC->getType()->getBitWidth();
+ unsigned SmallBitSize = SmallC->getType()->getBitWidth();
// Check that the low bits are zero.
APInt Low = APInt::getLowBitsSet(BigBitSize, SmallBitSize);
- if ((Low & AndCst->getValue()) == 0 && (Low & BigCst->getValue()) == 0) {
- Value *NewAnd = Builder->CreateAnd(V, Low | AndCst->getValue());
- APInt N = SmallCst->getValue().zext(BigBitSize) | BigCst->getValue();
- Value *NewVal = ConstantInt::get(AndCst->getType()->getContext(), N);
- return Builder->CreateICmp(LHSCC, NewAnd, NewVal);
+ if ((Low & AndC->getValue()) == 0 && (Low & BigC->getValue()) == 0) {
+ Value *NewAnd = Builder->CreateAnd(V, Low | AndC->getValue());
+ APInt N = SmallC->getValue().zext(BigBitSize) | BigC->getValue();
+ Value *NewVal = ConstantInt::get(AndC->getType()->getContext(), N);
+ return Builder->CreateICmp(PredL, NewAnd, NewVal);
}
}
}
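The widened compare merges the truncation test and the masked test into one, provided the low SmallBitSize bits of both AndC and BigC are zero so the two halves cannot interfere. A standalone check with hypothetical constants, widening i8 into i16 (plain C++):

#include <cassert>
#include <cstdint>

int main() {
  const uint16_t AndC = 0xF000, BigC = 0x3000; // low 8 bits are zero
  const uint8_t SmallC = 0x5A;
  const uint16_t CMax = 0x00FF; // all-ones at the truncated width
  for (unsigned X = 0; X < 65536; ++X) {
    bool Orig = ((uint8_t)X == SmallC) && ((X & AndC) == BigC);
    // (trunc x) == C1 & (x & CA) == C2 --> (x & (CA|CMAX)) == zext(C1)|C2
    bool Folded = (X & (AndC | CMax)) == ((uint16_t)SmallC | BigC);
    assert(Orig == Folded);
  }
  return 0;
}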
// From here on, we only handle:
// (icmp1 A, C1) & (icmp2 A, C2) --> something simpler.
- if (Val != Val2) return nullptr;
+ if (LHS0 != RHS0)
+ return nullptr;
- // ICMP_[US][GL]E X, CST is folded to ICMP_[US][GL]T elsewhere.
- if (LHSCC == ICmpInst::ICMP_UGE || LHSCC == ICmpInst::ICMP_ULE ||
- RHSCC == ICmpInst::ICMP_UGE || RHSCC == ICmpInst::ICMP_ULE ||
- LHSCC == ICmpInst::ICMP_SGE || LHSCC == ICmpInst::ICMP_SLE ||
- RHSCC == ICmpInst::ICMP_SGE || RHSCC == ICmpInst::ICMP_SLE)
+ // ICMP_[US][GL]E X, C is folded to ICMP_[US][GL]T elsewhere.
+ if (PredL == ICmpInst::ICMP_UGE || PredL == ICmpInst::ICMP_ULE ||
+ PredR == ICmpInst::ICMP_UGE || PredR == ICmpInst::ICMP_ULE ||
+ PredL == ICmpInst::ICMP_SGE || PredL == ICmpInst::ICMP_SLE ||
+ PredR == ICmpInst::ICMP_SGE || PredR == ICmpInst::ICMP_SLE)
return nullptr;
// We can't fold (ugt x, C) & (sgt x, C2).
- if (!PredicatesFoldable(LHSCC, RHSCC))
+ if (!PredicatesFoldable(PredL, PredR))
return nullptr;
// Ensure that the larger constant is on the RHS.
bool ShouldSwap;
- if (CmpInst::isSigned(LHSCC) ||
- (ICmpInst::isEquality(LHSCC) &&
- CmpInst::isSigned(RHSCC)))
- ShouldSwap = LHSCst->getValue().sgt(RHSCst->getValue());
+ if (CmpInst::isSigned(PredL) ||
+ (ICmpInst::isEquality(PredL) && CmpInst::isSigned(PredR)))
+ ShouldSwap = LHSC->getValue().sgt(RHSC->getValue());
else
- ShouldSwap = LHSCst->getValue().ugt(RHSCst->getValue());
+ ShouldSwap = LHSC->getValue().ugt(RHSC->getValue());
if (ShouldSwap) {
std::swap(LHS, RHS);
- std::swap(LHSCst, RHSCst);
- std::swap(LHSCC, RHSCC);
+ std::swap(LHSC, RHSC);
+ std::swap(PredL, PredR);
}
// At this point, we know we have two icmp instructions
@@ -907,113 +901,95 @@ Value *InstCombiner::FoldAndOfICmps(ICmpInst *LHS, ICmpInst *RHS) {
// icmp eq, icmp ne, icmp [su]lt, and icmp [SU]gt here. We also know
// (from the icmp folding check above), that the two constants
// are not equal and that the larger constant is on the RHS
- assert(LHSCst != RHSCst && "Compares not folded above?");
+ assert(LHSC != RHSC && "Compares not folded above?");
- switch (LHSCC) {
- default: llvm_unreachable("Unknown integer condition code!");
+ switch (PredL) {
+ default:
+ llvm_unreachable("Unknown integer condition code!");
case ICmpInst::ICMP_EQ:
- switch (RHSCC) {
- default: llvm_unreachable("Unknown integer condition code!");
- case ICmpInst::ICMP_NE: // (X == 13 & X != 15) -> X == 13
- case ICmpInst::ICMP_ULT: // (X == 13 & X < 15) -> X == 13
- case ICmpInst::ICMP_SLT: // (X == 13 & X < 15) -> X == 13
+ switch (PredR) {
+ default:
+ llvm_unreachable("Unknown integer condition code!");
+ case ICmpInst::ICMP_NE: // (X == 13 & X != 15) -> X == 13
+ case ICmpInst::ICMP_ULT: // (X == 13 & X < 15) -> X == 13
+ case ICmpInst::ICMP_SLT: // (X == 13 & X < 15) -> X == 13
return LHS;
}
case ICmpInst::ICMP_NE:
- switch (RHSCC) {
- default: llvm_unreachable("Unknown integer condition code!");
+ switch (PredR) {
+ default:
+ llvm_unreachable("Unknown integer condition code!");
case ICmpInst::ICMP_ULT:
- if (LHSCst == SubOne(RHSCst)) // (X != 13 & X u< 14) -> X < 13
- return Builder->CreateICmpULT(Val, LHSCst);
- if (LHSCst->isNullValue()) // (X != 0 & X u< 14) -> X-1 u< 13
- return insertRangeTest(Val, LHSCst->getValue() + 1, RHSCst->getValue(),
+ if (LHSC == SubOne(RHSC)) // (X != 13 & X u< 14) -> X < 13
+ return Builder->CreateICmpULT(LHS0, LHSC);
+ if (LHSC->isNullValue()) // (X != 0 & X u< 14) -> X-1 u< 13
+ return insertRangeTest(LHS0, LHSC->getValue() + 1, RHSC->getValue(),
false, true);
- break; // (X != 13 & X u< 15) -> no change
+ break; // (X != 13 & X u< 15) -> no change
case ICmpInst::ICMP_SLT:
- if (LHSCst == SubOne(RHSCst)) // (X != 13 & X s< 14) -> X < 13
- return Builder->CreateICmpSLT(Val, LHSCst);
- break; // (X != 13 & X s< 15) -> no change
- case ICmpInst::ICMP_EQ: // (X != 13 & X == 15) -> X == 15
- case ICmpInst::ICMP_UGT: // (X != 13 & X u> 15) -> X u> 15
- case ICmpInst::ICMP_SGT: // (X != 13 & X s> 15) -> X s> 15
+ if (LHSC == SubOne(RHSC)) // (X != 13 & X s< 14) -> X < 13
+ return Builder->CreateICmpSLT(LHS0, LHSC);
+ break; // (X != 13 & X s< 15) -> no change
+ case ICmpInst::ICMP_EQ: // (X != 13 & X == 15) -> X == 15
+ case ICmpInst::ICMP_UGT: // (X != 13 & X u> 15) -> X u> 15
+ case ICmpInst::ICMP_SGT: // (X != 13 & X s> 15) -> X s> 15
return RHS;
case ICmpInst::ICMP_NE:
- // Special case to get the ordering right when the values wrap around
- // zero.
- if (LHSCst->getValue() == 0 && RHSCst->getValue().isAllOnesValue())
- std::swap(LHSCst, RHSCst);
- if (LHSCst == SubOne(RHSCst)){// (X != 13 & X != 14) -> X-13 >u 1
- Constant *AddCST = ConstantExpr::getNeg(LHSCst);
- Value *Add = Builder->CreateAdd(Val, AddCST, Val->getName()+".off");
- return Builder->CreateICmpUGT(Add, ConstantInt::get(Add->getType(), 1),
- Val->getName()+".cmp");
- }
- break; // (X != 13 & X != 15) -> no change
+ // Potential folds for this case should already be handled.
+ break;
}
break;
case ICmpInst::ICMP_ULT:
- switch (RHSCC) {
- default: llvm_unreachable("Unknown integer condition code!");
- case ICmpInst::ICMP_EQ: // (X u< 13 & X == 15) -> false
- case ICmpInst::ICMP_UGT: // (X u< 13 & X u> 15) -> false
+ switch (PredR) {
+ default:
+ llvm_unreachable("Unknown integer condition code!");
+ case ICmpInst::ICMP_EQ: // (X u< 13 & X == 15) -> false
+ case ICmpInst::ICMP_UGT: // (X u< 13 & X u> 15) -> false
return ConstantInt::get(CmpInst::makeCmpResultType(LHS->getType()), 0);
- case ICmpInst::ICMP_SGT: // (X u< 13 & X s> 15) -> no change
- break;
- case ICmpInst::ICMP_NE: // (X u< 13 & X != 15) -> X u< 13
- case ICmpInst::ICMP_ULT: // (X u< 13 & X u< 15) -> X u< 13
+ case ICmpInst::ICMP_NE: // (X u< 13 & X != 15) -> X u< 13
+ case ICmpInst::ICMP_ULT: // (X u< 13 & X u< 15) -> X u< 13
return LHS;
- case ICmpInst::ICMP_SLT: // (X u< 13 & X s< 15) -> no change
- break;
}
break;
case ICmpInst::ICMP_SLT:
- switch (RHSCC) {
- default: llvm_unreachable("Unknown integer condition code!");
- case ICmpInst::ICMP_UGT: // (X s< 13 & X u> 15) -> no change
- break;
- case ICmpInst::ICMP_NE: // (X s< 13 & X != 15) -> X < 13
- case ICmpInst::ICMP_SLT: // (X s< 13 & X s< 15) -> X < 13
+ switch (PredR) {
+ default:
+ llvm_unreachable("Unknown integer condition code!");
+ case ICmpInst::ICMP_NE: // (X s< 13 & X != 15) -> X < 13
+ case ICmpInst::ICMP_SLT: // (X s< 13 & X s< 15) -> X < 13
return LHS;
- case ICmpInst::ICMP_ULT: // (X s< 13 & X u< 15) -> no change
- break;
}
break;
case ICmpInst::ICMP_UGT:
- switch (RHSCC) {
- default: llvm_unreachable("Unknown integer condition code!");
- case ICmpInst::ICMP_EQ: // (X u> 13 & X == 15) -> X == 15
- case ICmpInst::ICMP_UGT: // (X u> 13 & X u> 15) -> X u> 15
+ switch (PredR) {
+ default:
+ llvm_unreachable("Unknown integer condition code!");
+ case ICmpInst::ICMP_EQ: // (X u> 13 & X == 15) -> X == 15
+ case ICmpInst::ICMP_UGT: // (X u> 13 & X u> 15) -> X u> 15
return RHS;
- case ICmpInst::ICMP_SGT: // (X u> 13 & X s> 15) -> no change
- break;
case ICmpInst::ICMP_NE:
- if (RHSCst == AddOne(LHSCst)) // (X u> 13 & X != 14) -> X u> 14
- return Builder->CreateICmp(LHSCC, Val, RHSCst);
- break; // (X u> 13 & X != 15) -> no change
- case ICmpInst::ICMP_ULT: // (X u> 13 & X u< 15) -> (X-14) <u 1
- return insertRangeTest(Val, LHSCst->getValue() + 1, RHSCst->getValue(),
+ if (RHSC == AddOne(LHSC)) // (X u> 13 & X != 14) -> X u> 14
+ return Builder->CreateICmp(PredL, LHS0, RHSC);
+ break; // (X u> 13 & X != 15) -> no change
+ case ICmpInst::ICMP_ULT: // (X u> 13 & X u< 15) -> (X-14) <u 1
+ return insertRangeTest(LHS0, LHSC->getValue() + 1, RHSC->getValue(),
false, true);
- case ICmpInst::ICMP_SLT: // (X u> 13 & X s< 15) -> no change
- break;
}
break;
case ICmpInst::ICMP_SGT:
- switch (RHSCC) {
- default: llvm_unreachable("Unknown integer condition code!");
- case ICmpInst::ICMP_EQ: // (X s> 13 & X == 15) -> X == 15
- case ICmpInst::ICMP_SGT: // (X s> 13 & X s> 15) -> X s> 15
+ switch (PredR) {
+ default:
+ llvm_unreachable("Unknown integer condition code!");
+ case ICmpInst::ICMP_EQ: // (X s> 13 & X == 15) -> X == 15
+ case ICmpInst::ICMP_SGT: // (X s> 13 & X s> 15) -> X s> 15
return RHS;
- case ICmpInst::ICMP_UGT: // (X s> 13 & X u> 15) -> no change
- break;
case ICmpInst::ICMP_NE:
- if (RHSCst == AddOne(LHSCst)) // (X s> 13 & X != 14) -> X s> 14
- return Builder->CreateICmp(LHSCC, Val, RHSCst);
- break; // (X s> 13 & X != 15) -> no change
- case ICmpInst::ICMP_SLT: // (X s> 13 & X s< 15) -> (X-14) s< 1
- return insertRangeTest(Val, LHSCst->getValue() + 1, RHSCst->getValue(),
- true, true);
- case ICmpInst::ICMP_ULT: // (X s> 13 & X u< 15) -> no change
- break;
+ if (RHSC == AddOne(LHSC)) // (X s> 13 & X != 14) -> X s> 14
+ return Builder->CreateICmp(PredL, LHS0, RHSC);
+ break; // (X s> 13 & X != 15) -> no change
+ case ICmpInst::ICMP_SLT: // (X s> 13 & X s< 15) -> (X-14) s< 1
+ return insertRangeTest(LHS0, LHSC->getValue() + 1, RHSC->getValue(), true,
+ true);
}
break;
}
@@ -1314,39 +1290,11 @@ Instruction *InstCombiner::visitAnd(BinaryOperator &I) {
break;
}
- case Instruction::Add:
- // ((A & N) + B) & AndRHS -> (A + B) & AndRHS iff N&AndRHS == AndRHS.
- // ((A | N) + B) & AndRHS -> (A + B) & AndRHS iff N&AndRHS == 0
- // ((A ^ N) + B) & AndRHS -> (A + B) & AndRHS iff N&AndRHS == 0
- if (Value *V = FoldLogicalPlusAnd(Op0LHS, Op0RHS, AndRHS, false, I))
- return BinaryOperator::CreateAnd(V, AndRHS);
- if (Value *V = FoldLogicalPlusAnd(Op0RHS, Op0LHS, AndRHS, false, I))
- return BinaryOperator::CreateAnd(V, AndRHS); // Add commutes
- break;
-
case Instruction::Sub:
- // ((A & N) - B) & AndRHS -> (A - B) & AndRHS iff N&AndRHS == AndRHS.
- // ((A | N) - B) & AndRHS -> (A - B) & AndRHS iff N&AndRHS == 0
- // ((A ^ N) - B) & AndRHS -> (A - B) & AndRHS iff N&AndRHS == 0
- if (Value *V = FoldLogicalPlusAnd(Op0LHS, Op0RHS, AndRHS, true, I))
- return BinaryOperator::CreateAnd(V, AndRHS);
-
// -x & 1 -> x & 1
if (AndRHSMask == 1 && match(Op0LHS, m_Zero()))
return BinaryOperator::CreateAnd(Op0RHS, AndRHS);
- // (A - N) & AndRHS -> -N & AndRHS iff A&AndRHS==0 and AndRHS
- // has 1's for all bits that the subtraction with A might affect.
- if (Op0I->hasOneUse() && !match(Op0LHS, m_Zero())) {
- uint32_t BitWidth = AndRHSMask.getBitWidth();
- uint32_t Zeros = AndRHSMask.countLeadingZeros();
- APInt Mask = APInt::getLowBitsSet(BitWidth, BitWidth - Zeros);
-
- if (MaskedValueIsZero(Op0LHS, Mask, 0, &I)) {
- Value *NewNeg = Builder->CreateNeg(Op0RHS);
- return BinaryOperator::CreateAnd(NewNeg, AndRHS);
- }
- }
break;
case Instruction::Shl:
@@ -1361,6 +1309,33 @@ Instruction *InstCombiner::visitAnd(BinaryOperator &I) {
break;
}
+ // ((C1 OP zext(X)) & C2) -> zext((trunc(C1) OP X) & C2) if C2 fits in the
+ // bitwidth of X and OP behaves well when given trunc(C1) and X.
+ switch (Op0I->getOpcode()) {
+ default:
+ break;
+ case Instruction::Xor:
+ case Instruction::Or:
+ case Instruction::Mul:
+ case Instruction::Add:
+ case Instruction::Sub:
+ Value *X;
+ ConstantInt *C1;
+ if (match(Op0I, m_c_BinOp(m_ZExt(m_Value(X)), m_ConstantInt(C1)))) {
+ if (AndRHSMask.isIntN(X->getType()->getScalarSizeInBits())) {
+ auto *TruncC1 = ConstantExpr::getTrunc(C1, X->getType());
+ Value *BinOp;
+ if (isa<ZExtInst>(Op0LHS))
+ BinOp = Builder->CreateBinOp(Op0I->getOpcode(), X, TruncC1);
+ else
+ BinOp = Builder->CreateBinOp(Op0I->getOpcode(), TruncC1, X);
+ auto *TruncC2 = ConstantExpr::getTrunc(AndRHS, X->getType());
+ auto *And = Builder->CreateAnd(BinOp, TruncC2);
+ return new ZExtInst(And, I.getType());
+ }
+ }
+ }
+
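For the listed opcodes the low bits of the result depend only on the low bits of the operands, so when AndRHS fits in X's width the whole computation can be done narrow and zero-extended afterwards. A standalone check for the 'sub' case with hypothetical constants (plain C++):

#include <cassert>
#include <cstdint>

int main() {
  const uint16_t C1 = 0x1234; // wide constant
  const uint16_t C2 = 0x00FF; // fits in the 8-bit width of X
  for (unsigned X = 0; X < 256; ++X) {
    // Wide form: operate on zext(X) at 16 bits, then mask.
    uint16_t Wide = (uint16_t)((uint16_t)(C1 - X) & C2);
    // Narrow form: operate at 8 bits, mask, then zero-extend.
    uint8_t Diff = (uint8_t)((uint8_t)C1 - (uint8_t)X);
    uint16_t Narrow = (uint16_t)(Diff & (uint8_t)C2);
    assert(Wide == Narrow);
  }
  return 0;
}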
if (ConstantInt *Op0CI = dyn_cast<ConstantInt>(Op0I->getOperand(1)))
if (Instruction *Res = OptAndOp(Op0I, Op0CI, AndRHS, I))
return Res;
@@ -1381,10 +1356,11 @@ Instruction *InstCombiner::visitAnd(BinaryOperator &I) {
return BinaryOperator::CreateAnd(NewCast, C3);
}
}
+ }
+ if (isa<Constant>(Op1))
if (Instruction *FoldedLogic = foldOpWithConstantIntoOperand(I))
return FoldedLogic;
- }
if (Instruction *DeMorgan = matchDeMorgansLaws(I, Builder))
return DeMorgan;
@@ -1630,15 +1606,15 @@ static Value *matchSelectFromAndOr(Value *A, Value *C, Value *B, Value *D,
/// Fold (icmp)|(icmp) if possible.
Value *InstCombiner::FoldOrOfICmps(ICmpInst *LHS, ICmpInst *RHS,
Instruction *CxtI) {
- ICmpInst::Predicate LHSCC = LHS->getPredicate(), RHSCC = RHS->getPredicate();
+ ICmpInst::Predicate PredL = LHS->getPredicate(), PredR = RHS->getPredicate();
// Fold (iszero(A & K1) | iszero(A & K2)) -> (A & (K1 | K2)) != (K1 | K2)
// if K1 and K2 are a one-bit mask.
- ConstantInt *LHSCst = dyn_cast<ConstantInt>(LHS->getOperand(1));
- ConstantInt *RHSCst = dyn_cast<ConstantInt>(RHS->getOperand(1));
+ ConstantInt *LHSC = dyn_cast<ConstantInt>(LHS->getOperand(1));
+ ConstantInt *RHSC = dyn_cast<ConstantInt>(RHS->getOperand(1));
- if (LHS->getPredicate() == ICmpInst::ICMP_EQ && LHSCst && LHSCst->isZero() &&
- RHS->getPredicate() == ICmpInst::ICMP_EQ && RHSCst && RHSCst->isZero()) {
+ if (LHS->getPredicate() == ICmpInst::ICMP_EQ && LHSC && LHSC->isZero() &&
+ RHS->getPredicate() == ICmpInst::ICMP_EQ && RHSC && RHSC->isZero()) {
BinaryOperator *LAnd = dyn_cast<BinaryOperator>(LHS->getOperand(0));
BinaryOperator *RAnd = dyn_cast<BinaryOperator>(RHS->getOperand(0));
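With K1 and K2 single-bit masks, "either bit of A is clear" is the complement of "both bits are set", which is exactly what the combined compare tests. A standalone check with hypothetical bits (plain C++, not part of the patch):

#include <cassert>
#include <cstdint>

int main() {
  const uint8_t K1 = 0x04, K2 = 0x20; // one-bit masks
  for (unsigned A = 0; A < 256; ++A) {
    bool Orig = ((A & K1) == 0) || ((A & K2) == 0);
    // (A & (K1 | K2)) != (K1 | K2) is false only when both bits are set.
    bool Folded = (A & (K1 | K2)) != (K1 | K2);
    assert(Orig == Folded);
  }
  return 0;
}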
@@ -1680,52 +1656,52 @@ Value *InstCombiner::FoldOrOfICmps(ICmpInst *LHS, ICmpInst *RHS,
// 4) LowRange1 ^ LowRange2 and HighRange1 ^ HighRange2 are one-bit mask.
// This implies all values in the two ranges differ by exactly one bit.
- if ((LHSCC == ICmpInst::ICMP_ULT || LHSCC == ICmpInst::ICMP_ULE) &&
- LHSCC == RHSCC && LHSCst && RHSCst && LHS->hasOneUse() &&
- RHS->hasOneUse() && LHSCst->getType() == RHSCst->getType() &&
- LHSCst->getValue() == (RHSCst->getValue())) {
+ if ((PredL == ICmpInst::ICMP_ULT || PredL == ICmpInst::ICMP_ULE) &&
+ PredL == PredR && LHSC && RHSC && LHS->hasOneUse() && RHS->hasOneUse() &&
+ LHSC->getType() == RHSC->getType() &&
+ LHSC->getValue() == (RHSC->getValue())) {
Value *LAdd = LHS->getOperand(0);
Value *RAdd = RHS->getOperand(0);
Value *LAddOpnd, *RAddOpnd;
- ConstantInt *LAddCst, *RAddCst;
- if (match(LAdd, m_Add(m_Value(LAddOpnd), m_ConstantInt(LAddCst))) &&
- match(RAdd, m_Add(m_Value(RAddOpnd), m_ConstantInt(RAddCst))) &&
- LAddCst->getValue().ugt(LHSCst->getValue()) &&
- RAddCst->getValue().ugt(LHSCst->getValue())) {
-
- APInt DiffCst = LAddCst->getValue() ^ RAddCst->getValue();
- if (LAddOpnd == RAddOpnd && DiffCst.isPowerOf2()) {
- ConstantInt *MaxAddCst = nullptr;
- if (LAddCst->getValue().ult(RAddCst->getValue()))
- MaxAddCst = RAddCst;
+ ConstantInt *LAddC, *RAddC;
+ if (match(LAdd, m_Add(m_Value(LAddOpnd), m_ConstantInt(LAddC))) &&
+ match(RAdd, m_Add(m_Value(RAddOpnd), m_ConstantInt(RAddC))) &&
+ LAddC->getValue().ugt(LHSC->getValue()) &&
+ RAddC->getValue().ugt(LHSC->getValue())) {
+
+ APInt DiffC = LAddC->getValue() ^ RAddC->getValue();
+ if (LAddOpnd == RAddOpnd && DiffC.isPowerOf2()) {
+ ConstantInt *MaxAddC = nullptr;
+ if (LAddC->getValue().ult(RAddC->getValue()))
+ MaxAddC = RAddC;
else
- MaxAddCst = LAddCst;
+ MaxAddC = LAddC;
- APInt RRangeLow = -RAddCst->getValue();
- APInt RRangeHigh = RRangeLow + LHSCst->getValue();
- APInt LRangeLow = -LAddCst->getValue();
- APInt LRangeHigh = LRangeLow + LHSCst->getValue();
+ APInt RRangeLow = -RAddC->getValue();
+ APInt RRangeHigh = RRangeLow + LHSC->getValue();
+ APInt LRangeLow = -LAddC->getValue();
+ APInt LRangeHigh = LRangeLow + LHSC->getValue();
APInt LowRangeDiff = RRangeLow ^ LRangeLow;
APInt HighRangeDiff = RRangeHigh ^ LRangeHigh;
APInt RangeDiff = LRangeLow.sgt(RRangeLow) ? LRangeLow - RRangeLow
: RRangeLow - LRangeLow;
if (LowRangeDiff.isPowerOf2() && LowRangeDiff == HighRangeDiff &&
- RangeDiff.ugt(LHSCst->getValue())) {
- Value *MaskCst = ConstantInt::get(LAddCst->getType(), ~DiffCst);
+ RangeDiff.ugt(LHSC->getValue())) {
+ Value *MaskC = ConstantInt::get(LAddC->getType(), ~DiffC);
- Value *NewAnd = Builder->CreateAnd(LAddOpnd, MaskCst);
- Value *NewAdd = Builder->CreateAdd(NewAnd, MaxAddCst);
- return (Builder->CreateICmp(LHS->getPredicate(), NewAdd, LHSCst));
+ Value *NewAnd = Builder->CreateAnd(LAddOpnd, MaskC);
+ Value *NewAdd = Builder->CreateAdd(NewAnd, MaxAddC);
+ return (Builder->CreateICmp(LHS->getPredicate(), NewAdd, LHSC));
}
}
}
}
// (icmp1 A, B) | (icmp2 A, B) --> (icmp3 A, B)
- if (PredicatesFoldable(LHSCC, RHSCC)) {
+ if (PredicatesFoldable(PredL, PredR)) {
if (LHS->getOperand(0) == RHS->getOperand(1) &&
LHS->getOperand(1) == RHS->getOperand(0))
LHS->swapOperands();
@@ -1743,25 +1719,25 @@ Value *InstCombiner::FoldOrOfICmps(ICmpInst *LHS, ICmpInst *RHS,
if (Value *V = foldLogOpOfMaskedICmps(LHS, RHS, false, Builder))
return V;
- Value *Val = LHS->getOperand(0), *Val2 = RHS->getOperand(0);
+ Value *LHS0 = LHS->getOperand(0), *RHS0 = RHS->getOperand(0);
if (LHS->hasOneUse() || RHS->hasOneUse()) {
// (icmp eq B, 0) | (icmp ult A, B) -> (icmp ule A, B-1)
// (icmp eq B, 0) | (icmp ugt B, A) -> (icmp ule A, B-1)
Value *A = nullptr, *B = nullptr;
- if (LHSCC == ICmpInst::ICMP_EQ && LHSCst && LHSCst->isZero()) {
- B = Val;
- if (RHSCC == ICmpInst::ICMP_ULT && Val == RHS->getOperand(1))
- A = Val2;
- else if (RHSCC == ICmpInst::ICMP_UGT && Val == Val2)
+ if (PredL == ICmpInst::ICMP_EQ && LHSC && LHSC->isZero()) {
+ B = LHS0;
+ if (PredR == ICmpInst::ICMP_ULT && LHS0 == RHS->getOperand(1))
+ A = RHS0;
+ else if (PredR == ICmpInst::ICMP_UGT && LHS0 == RHS0)
A = RHS->getOperand(1);
}
// (icmp ult A, B) | (icmp eq B, 0) -> (icmp ule A, B-1)
// (icmp ugt B, A) | (icmp eq B, 0) -> (icmp ule A, B-1)
- else if (RHSCC == ICmpInst::ICMP_EQ && RHSCst && RHSCst->isZero()) {
- B = Val2;
- if (LHSCC == ICmpInst::ICMP_ULT && Val2 == LHS->getOperand(1))
- A = Val;
- else if (LHSCC == ICmpInst::ICMP_UGT && Val2 == Val)
+ else if (PredR == ICmpInst::ICMP_EQ && RHSC && RHSC->isZero()) {
+ B = RHS0;
+ if (PredL == ICmpInst::ICMP_ULT && RHS0 == LHS->getOperand(1))
+ A = LHS0;
+ else if (PredL == ICmpInst::ICMP_UGT && LHS0 == RHS0)
A = LHS->getOperand(1);
}
if (A && B)
@@ -1778,54 +1754,58 @@ Value *InstCombiner::FoldOrOfICmps(ICmpInst *LHS, ICmpInst *RHS,
if (Value *V = simplifyRangeCheck(RHS, LHS, /*Inverted=*/true))
return V;
+ if (Value *V = foldAndOrOfEqualityCmpsWithConstants(LHS, RHS, false, Builder))
+ return V;
+
// This only handles icmp of constants: (icmp1 A, C1) | (icmp2 B, C2).
- if (!LHSCst || !RHSCst) return nullptr;
+ if (!LHSC || !RHSC)
+ return nullptr;
- if (LHSCst == RHSCst && LHSCC == RHSCC) {
+ if (LHSC == RHSC && PredL == PredR) {
// (icmp ne A, 0) | (icmp ne B, 0) --> (icmp ne (A|B), 0)
- if (LHSCC == ICmpInst::ICMP_NE && LHSCst->isZero()) {
- Value *NewOr = Builder->CreateOr(Val, Val2);
- return Builder->CreateICmp(LHSCC, NewOr, LHSCst);
+ if (PredL == ICmpInst::ICMP_NE && LHSC->isZero()) {
+ Value *NewOr = Builder->CreateOr(LHS0, RHS0);
+ return Builder->CreateICmp(PredL, NewOr, LHSC);
}
}
// (icmp ult (X + CA), C1) | (icmp eq X, C2) -> (icmp ule (X + CA), C1)
// iff C2 + CA == C1.
- if (LHSCC == ICmpInst::ICMP_ULT && RHSCC == ICmpInst::ICMP_EQ) {
- ConstantInt *AddCst;
- if (match(Val, m_Add(m_Specific(Val2), m_ConstantInt(AddCst))))
- if (RHSCst->getValue() + AddCst->getValue() == LHSCst->getValue())
- return Builder->CreateICmpULE(Val, LHSCst);
+ if (PredL == ICmpInst::ICMP_ULT && PredR == ICmpInst::ICMP_EQ) {
+ ConstantInt *AddC;
+ if (match(LHS0, m_Add(m_Specific(RHS0), m_ConstantInt(AddC))))
+ if (RHSC->getValue() + AddC->getValue() == LHSC->getValue())
+ return Builder->CreateICmpULE(LHS0, LHSC);
}
// From here on, we only handle:
// (icmp1 A, C1) | (icmp2 A, C2) --> something simpler.
- if (Val != Val2) return nullptr;
+ if (LHS0 != RHS0)
+ return nullptr;
- // ICMP_[US][GL]E X, CST is folded to ICMP_[US][GL]T elsewhere.
- if (LHSCC == ICmpInst::ICMP_UGE || LHSCC == ICmpInst::ICMP_ULE ||
- RHSCC == ICmpInst::ICMP_UGE || RHSCC == ICmpInst::ICMP_ULE ||
- LHSCC == ICmpInst::ICMP_SGE || LHSCC == ICmpInst::ICMP_SLE ||
- RHSCC == ICmpInst::ICMP_SGE || RHSCC == ICmpInst::ICMP_SLE)
+ // ICMP_[US][GL]E X, C is folded to ICMP_[US][GL]T elsewhere.
+ if (PredL == ICmpInst::ICMP_UGE || PredL == ICmpInst::ICMP_ULE ||
+ PredR == ICmpInst::ICMP_UGE || PredR == ICmpInst::ICMP_ULE ||
+ PredL == ICmpInst::ICMP_SGE || PredL == ICmpInst::ICMP_SLE ||
+ PredR == ICmpInst::ICMP_SGE || PredR == ICmpInst::ICMP_SLE)
return nullptr;
// We can't fold (ugt x, C) | (sgt x, C2).
- if (!PredicatesFoldable(LHSCC, RHSCC))
+ if (!PredicatesFoldable(PredL, PredR))
return nullptr;
// Ensure that the larger constant is on the RHS.
bool ShouldSwap;
- if (CmpInst::isSigned(LHSCC) ||
- (ICmpInst::isEquality(LHSCC) &&
- CmpInst::isSigned(RHSCC)))
- ShouldSwap = LHSCst->getValue().sgt(RHSCst->getValue());
+ if (CmpInst::isSigned(PredL) ||
+ (ICmpInst::isEquality(PredL) && CmpInst::isSigned(PredR)))
+ ShouldSwap = LHSC->getValue().sgt(RHSC->getValue());
else
- ShouldSwap = LHSCst->getValue().ugt(RHSCst->getValue());
+ ShouldSwap = LHSC->getValue().ugt(RHSC->getValue());
if (ShouldSwap) {
std::swap(LHS, RHS);
- std::swap(LHSCst, RHSCst);
- std::swap(LHSCC, RHSCC);
+ std::swap(LHSC, RHSC);
+ std::swap(PredL, PredR);
}
// At this point, we know we have two icmp instructions
@@ -1834,127 +1814,98 @@ Value *InstCombiner::FoldOrOfICmps(ICmpInst *LHS, ICmpInst *RHS,
// ICMP_EQ, ICMP_NE, ICMP_LT, and ICMP_GT here. We also know (from the
// icmp folding check above), that the two constants are not
// equal.
- assert(LHSCst != RHSCst && "Compares not folded above?");
+ assert(LHSC != RHSC && "Compares not folded above?");
- switch (LHSCC) {
- default: llvm_unreachable("Unknown integer condition code!");
+ switch (PredL) {
+ default:
+ llvm_unreachable("Unknown integer condition code!");
case ICmpInst::ICMP_EQ:
- switch (RHSCC) {
- default: llvm_unreachable("Unknown integer condition code!");
+ switch (PredR) {
+ default:
+ llvm_unreachable("Unknown integer condition code!");
case ICmpInst::ICMP_EQ:
- if (LHS->getOperand(0) == RHS->getOperand(0)) {
- // if LHSCst and RHSCst differ only by one bit:
- // (A == C1 || A == C2) -> (A | (C1 ^ C2)) == C2
- assert(LHSCst->getValue().ule(LHSCst->getValue()));
-
- APInt Xor = LHSCst->getValue() ^ RHSCst->getValue();
- if (Xor.isPowerOf2()) {
- Value *Cst = Builder->getInt(Xor);
- Value *Or = Builder->CreateOr(LHS->getOperand(0), Cst);
- return Builder->CreateICmp(ICmpInst::ICMP_EQ, Or, RHSCst);
- }
- }
-
- if (LHSCst == SubOne(RHSCst)) {
- // (X == 13 | X == 14) -> X-13 <u 2
- Constant *AddCST = ConstantExpr::getNeg(LHSCst);
- Value *Add = Builder->CreateAdd(Val, AddCST, Val->getName()+".off");
- AddCST = ConstantExpr::getSub(AddOne(RHSCst), LHSCst);
- return Builder->CreateICmpULT(Add, AddCST);
- }
-
- break; // (X == 13 | X == 15) -> no change
- case ICmpInst::ICMP_UGT: // (X == 13 | X u> 14) -> no change
- case ICmpInst::ICMP_SGT: // (X == 13 | X s> 14) -> no change
+ // Potential folds for this case should already be handled.
+ break;
+ case ICmpInst::ICMP_UGT: // (X == 13 | X u> 14) -> no change
+ case ICmpInst::ICMP_SGT: // (X == 13 | X s> 14) -> no change
break;
- case ICmpInst::ICMP_NE: // (X == 13 | X != 15) -> X != 15
- case ICmpInst::ICMP_ULT: // (X == 13 | X u< 15) -> X u< 15
- case ICmpInst::ICMP_SLT: // (X == 13 | X s< 15) -> X s< 15
+ case ICmpInst::ICMP_NE: // (X == 13 | X != 15) -> X != 15
+ case ICmpInst::ICMP_ULT: // (X == 13 | X u< 15) -> X u< 15
+ case ICmpInst::ICMP_SLT: // (X == 13 | X s< 15) -> X s< 15
return RHS;
}
break;
case ICmpInst::ICMP_NE:
- switch (RHSCC) {
- default: llvm_unreachable("Unknown integer condition code!");
- case ICmpInst::ICMP_EQ: // (X != 13 | X == 15) -> X != 13
- case ICmpInst::ICMP_UGT: // (X != 13 | X u> 15) -> X != 13
- case ICmpInst::ICMP_SGT: // (X != 13 | X s> 15) -> X != 13
+ switch (PredR) {
+ default:
+ llvm_unreachable("Unknown integer condition code!");
+ case ICmpInst::ICMP_EQ: // (X != 13 | X == 15) -> X != 13
+ case ICmpInst::ICMP_UGT: // (X != 13 | X u> 15) -> X != 13
+ case ICmpInst::ICMP_SGT: // (X != 13 | X s> 15) -> X != 13
return LHS;
- case ICmpInst::ICMP_NE: // (X != 13 | X != 15) -> true
- case ICmpInst::ICMP_ULT: // (X != 13 | X u< 15) -> true
- case ICmpInst::ICMP_SLT: // (X != 13 | X s< 15) -> true
+ case ICmpInst::ICMP_NE: // (X != 13 | X != 15) -> true
+ case ICmpInst::ICMP_ULT: // (X != 13 | X u< 15) -> true
+ case ICmpInst::ICMP_SLT: // (X != 13 | X s< 15) -> true
return Builder->getTrue();
}
case ICmpInst::ICMP_ULT:
- switch (RHSCC) {
- default: llvm_unreachable("Unknown integer condition code!");
- case ICmpInst::ICMP_EQ: // (X u< 13 | X == 14) -> no change
+ switch (PredR) {
+ default:
+ llvm_unreachable("Unknown integer condition code!");
+ case ICmpInst::ICMP_EQ: // (X u< 13 | X == 14) -> no change
break;
- case ICmpInst::ICMP_UGT: // (X u< 13 | X u> 15) -> (X-13) u> 2
- // If RHSCst is [us]MAXINT, it is always false. Not handling
+ case ICmpInst::ICMP_UGT: // (X u< 13 | X u> 15) -> (X-13) u> 2
+ // If RHSC is [us]MAXINT, it is always false. Not handling
// this can cause overflow.
- if (RHSCst->isMaxValue(false))
+ if (RHSC->isMaxValue(false))
return LHS;
- return insertRangeTest(Val, LHSCst->getValue(), RHSCst->getValue() + 1,
+ return insertRangeTest(LHS0, LHSC->getValue(), RHSC->getValue() + 1,
false, false);
- case ICmpInst::ICMP_SGT: // (X u< 13 | X s> 15) -> no change
- break;
- case ICmpInst::ICMP_NE: // (X u< 13 | X != 15) -> X != 15
- case ICmpInst::ICMP_ULT: // (X u< 13 | X u< 15) -> X u< 15
+ case ICmpInst::ICMP_NE: // (X u< 13 | X != 15) -> X != 15
+ case ICmpInst::ICMP_ULT: // (X u< 13 | X u< 15) -> X u< 15
return RHS;
- case ICmpInst::ICMP_SLT: // (X u< 13 | X s< 15) -> no change
- break;
}
break;
case ICmpInst::ICMP_SLT:
- switch (RHSCC) {
- default: llvm_unreachable("Unknown integer condition code!");
- case ICmpInst::ICMP_EQ: // (X s< 13 | X == 14) -> no change
+ switch (PredR) {
+ default:
+ llvm_unreachable("Unknown integer condition code!");
+ case ICmpInst::ICMP_EQ: // (X s< 13 | X == 14) -> no change
break;
- case ICmpInst::ICMP_SGT: // (X s< 13 | X s> 15) -> (X-13) s> 2
- // If RHSCst is [us]MAXINT, it is always false. Not handling
+ case ICmpInst::ICMP_SGT: // (X s< 13 | X s> 15) -> (X-13) s> 2
+        // If RHSC is [us]MAXINT, the RHS compare is always false. Not
        // handling this case would let the range test below overflow.
- if (RHSCst->isMaxValue(true))
+ if (RHSC->isMaxValue(true))
return LHS;
- return insertRangeTest(Val, LHSCst->getValue(), RHSCst->getValue() + 1,
- true, false);
- case ICmpInst::ICMP_UGT: // (X s< 13 | X u> 15) -> no change
- break;
- case ICmpInst::ICMP_NE: // (X s< 13 | X != 15) -> X != 15
- case ICmpInst::ICMP_SLT: // (X s< 13 | X s< 15) -> X s< 15
+ return insertRangeTest(LHS0, LHSC->getValue(), RHSC->getValue() + 1, true,
+ false);
+ case ICmpInst::ICMP_NE: // (X s< 13 | X != 15) -> X != 15
+ case ICmpInst::ICMP_SLT: // (X s< 13 | X s< 15) -> X s< 15
return RHS;
- case ICmpInst::ICMP_ULT: // (X s< 13 | X u< 15) -> no change
- break;
}
break;
case ICmpInst::ICMP_UGT:
- switch (RHSCC) {
- default: llvm_unreachable("Unknown integer condition code!");
- case ICmpInst::ICMP_EQ: // (X u> 13 | X == 15) -> X u> 13
- case ICmpInst::ICMP_UGT: // (X u> 13 | X u> 15) -> X u> 13
+ switch (PredR) {
+ default:
+ llvm_unreachable("Unknown integer condition code!");
+ case ICmpInst::ICMP_EQ: // (X u> 13 | X == 15) -> X u> 13
+ case ICmpInst::ICMP_UGT: // (X u> 13 | X u> 15) -> X u> 13
return LHS;
- case ICmpInst::ICMP_SGT: // (X u> 13 | X s> 15) -> no change
- break;
- case ICmpInst::ICMP_NE: // (X u> 13 | X != 15) -> true
- case ICmpInst::ICMP_ULT: // (X u> 13 | X u< 15) -> true
+ case ICmpInst::ICMP_NE: // (X u> 13 | X != 15) -> true
+ case ICmpInst::ICMP_ULT: // (X u> 13 | X u< 15) -> true
return Builder->getTrue();
- case ICmpInst::ICMP_SLT: // (X u> 13 | X s< 15) -> no change
- break;
}
break;
case ICmpInst::ICMP_SGT:
- switch (RHSCC) {
- default: llvm_unreachable("Unknown integer condition code!");
- case ICmpInst::ICMP_EQ: // (X s> 13 | X == 15) -> X > 13
- case ICmpInst::ICMP_SGT: // (X s> 13 | X s> 15) -> X > 13
+ switch (PredR) {
+ default:
+ llvm_unreachable("Unknown integer condition code!");
+ case ICmpInst::ICMP_EQ: // (X s> 13 | X == 15) -> X > 13
+ case ICmpInst::ICMP_SGT: // (X s> 13 | X s> 15) -> X > 13
return LHS;
- case ICmpInst::ICMP_UGT: // (X s> 13 | X u> 15) -> no change
- break;
- case ICmpInst::ICMP_NE: // (X s> 13 | X != 15) -> true
- case ICmpInst::ICMP_SLT: // (X s> 13 | X s< 15) -> true
+ case ICmpInst::ICMP_NE: // (X s> 13 | X != 15) -> true
+ case ICmpInst::ICMP_SLT: // (X s> 13 | X s< 15) -> true
return Builder->getTrue();
- case ICmpInst::ICMP_ULT: // (X s> 13 | X u< 15) -> no change
- break;
}
break;
}
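As a sanity check (standalone C++, not part of the patch), the unsigned-wraparound identity behind the range folds above, e.g. (X u< 13 | X u> 15) -> (X-13) u> 2, can be verified exhaustively over i8:

#include <cassert>
#include <cstdint>

int main() {
  // (X u< 13 || X u> 15) == ((X - 13) u> 2): the subtraction wraps for
  // X < 13, which is exactly what makes the single compare sufficient.
  for (unsigned V = 0; V < 256; ++V) {
    uint8_t X = static_cast<uint8_t>(V);
    assert(((X < 13) || (X > 15)) == (static_cast<uint8_t>(X - 13) > 2));
  }
  return 0;
}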
@@ -2100,17 +2051,6 @@ Instruction *InstCombiner::visitOr(BinaryOperator &I) {
if (ConstantInt *RHS = dyn_cast<ConstantInt>(Op1)) {
ConstantInt *C1 = nullptr; Value *X = nullptr;
- // (X & C1) | C2 --> (X | C2) & (C1|C2)
- // iff (C1 & C2) == 0.
- if (match(Op0, m_And(m_Value(X), m_ConstantInt(C1))) &&
- (RHS->getValue() & C1->getValue()) != 0 &&
- Op0->hasOneUse()) {
- Value *Or = Builder->CreateOr(X, RHS);
- Or->takeName(Op0);
- return BinaryOperator::CreateAnd(Or,
- Builder->getInt(RHS->getValue() | C1->getValue()));
- }
-
// (X ^ C1) | C2 --> (X | C2) ^ (C1&~C2)
if (match(Op0, m_Xor(m_Value(X), m_ConstantInt(C1))) &&
Op0->hasOneUse()) {
@@ -2119,45 +2059,51 @@ Instruction *InstCombiner::visitOr(BinaryOperator &I) {
return BinaryOperator::CreateXor(Or,
Builder->getInt(C1->getValue() & ~RHS->getValue()));
}
+ }
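For reference, the (X ^ C1) | C2 --> (X | C2) ^ (C1&~C2) rewrite kept above is an unconditional bitwise identity; a standalone exhaustive check over i8 (not part of the patch):

#include <cassert>
#include <cstdint>

int main() {
  // Per bit: where C2 is set both sides are 1; elsewhere both reduce to X ^ C1.
  for (unsigned X = 0; X < 256; ++X)
    for (unsigned C1 = 0; C1 < 256; ++C1)
      for (unsigned C2 = 0; C2 < 256; ++C2)
        assert(static_cast<uint8_t>((X ^ C1) | C2) ==
               static_cast<uint8_t>((X | C2) ^ (C1 & ~C2)));
  return 0;
}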
+ if (isa<Constant>(Op1))
if (Instruction *FoldedLogic = foldOpWithConstantIntoOperand(I))
return FoldedLogic;
- }
// Given an OR instruction, check to see if this is a bswap.
if (Instruction *BSwap = MatchBSwap(I))
return BSwap;
- Value *A = nullptr, *B = nullptr;
- ConstantInt *C1 = nullptr, *C2 = nullptr;
+ {
+ Value *A;
+ const APInt *C;
+ // (X^C)|Y -> (X|Y)^C iff Y&C == 0
+ if (match(Op0, m_OneUse(m_Xor(m_Value(A), m_APInt(C)))) &&
+ MaskedValueIsZero(Op1, *C, 0, &I)) {
+ Value *NOr = Builder->CreateOr(A, Op1);
+ NOr->takeName(Op0);
+ return BinaryOperator::CreateXor(NOr,
+ cast<Instruction>(Op0)->getOperand(1));
+ }
- // (X^C)|Y -> (X|Y)^C iff Y&C == 0
- if (Op0->hasOneUse() &&
- match(Op0, m_Xor(m_Value(A), m_ConstantInt(C1))) &&
- MaskedValueIsZero(Op1, C1->getValue(), 0, &I)) {
- Value *NOr = Builder->CreateOr(A, Op1);
- NOr->takeName(Op0);
- return BinaryOperator::CreateXor(NOr, C1);
+ // Y|(X^C) -> (X|Y)^C iff Y&C == 0
+ if (match(Op1, m_OneUse(m_Xor(m_Value(A), m_APInt(C)))) &&
+ MaskedValueIsZero(Op0, *C, 0, &I)) {
+ Value *NOr = Builder->CreateOr(A, Op0);
+ NOr->takeName(Op0);
+ return BinaryOperator::CreateXor(NOr,
+ cast<Instruction>(Op1)->getOperand(1));
+ }
}
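The (X^C)|Y fold is sound only under the Y&C == 0 side condition that MaskedValueIsZero establishes; a standalone exhaustive check over i8 (not part of the patch):

#include <cassert>
#include <cstdint>

int main() {
  // When Y & C == 0, the or and the xor act on disjoint bits and commute.
  for (unsigned X = 0; X < 256; ++X)
    for (unsigned C = 0; C < 256; ++C)
      for (unsigned Y = 0; Y < 256; ++Y)
        if ((Y & C) == 0)
          assert(static_cast<uint8_t>((X ^ C) | Y) ==
                 static_cast<uint8_t>((X | Y) ^ C));
  return 0;
}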
- // Y|(X^C) -> (X|Y)^C iff Y&C == 0
- if (Op1->hasOneUse() &&
- match(Op1, m_Xor(m_Value(A), m_ConstantInt(C1))) &&
- MaskedValueIsZero(Op0, C1->getValue(), 0, &I)) {
- Value *NOr = Builder->CreateOr(A, Op0);
- NOr->takeName(Op0);
- return BinaryOperator::CreateXor(NOr, C1);
- }
+ Value *A, *B;
// ((~A & B) | A) -> (A | B)
- if (match(Op0, m_And(m_Not(m_Value(A)), m_Value(B))) &&
- match(Op1, m_Specific(A)))
- return BinaryOperator::CreateOr(A, B);
+ if (match(Op0, m_c_And(m_Not(m_Specific(Op1)), m_Value(A))))
+ return BinaryOperator::CreateOr(A, Op1);
+ if (match(Op1, m_c_And(m_Not(m_Specific(Op0)), m_Value(A))))
+ return BinaryOperator::CreateOr(Op0, A);
// ((A & B) | ~A) -> (~A | B)
- if (match(Op0, m_And(m_Value(A), m_Value(B))) &&
- match(Op1, m_Not(m_Specific(A))))
- return BinaryOperator::CreateOr(Builder->CreateNot(A), B);
+ // The NOT is guaranteed to be in the RHS by complexity ordering.
+ if (match(Op1, m_Not(m_Value(A))) &&
+ match(Op0, m_c_And(m_Specific(A), m_Value(B))))
+ return BinaryOperator::CreateOr(Op1, B);
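Both absorption folds above are plain Boolean identities; a standalone i8 check (not part of the patch):

#include <cassert>
#include <cstdint>

int main() {
  for (unsigned A = 0; A < 256; ++A)
    for (unsigned B = 0; B < 256; ++B) {
      // ((~A & B) | A) == (A | B)
      assert(static_cast<uint8_t>((~A & B) | A) ==
             static_cast<uint8_t>(A | B));
      // ((A & B) | ~A) == (~A | B)
      assert(static_cast<uint8_t>((A & B) | ~A) ==
             static_cast<uint8_t>(~A | B));
    }
  return 0;
}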
// (A & ~B) | (A ^ B) -> (A ^ B)
// (~B & A) | (A ^ B) -> (A ^ B)
@@ -2177,8 +2123,8 @@ Instruction *InstCombiner::visitOr(BinaryOperator &I) {
if (match(Op0, m_And(m_Value(A), m_Value(C))) &&
match(Op1, m_And(m_Value(B), m_Value(D)))) {
Value *V1 = nullptr, *V2 = nullptr;
- C1 = dyn_cast<ConstantInt>(C);
- C2 = dyn_cast<ConstantInt>(D);
+ ConstantInt *C1 = dyn_cast<ConstantInt>(C);
+ ConstantInt *C2 = dyn_cast<ConstantInt>(D);
if (C1 && C2) { // (A & C1)|(B & C2)
if ((C1->getValue() & C2->getValue()) == 0) {
// ((V | N) & C1) | (V & C2) --> (V|N) & (C1|C2)
@@ -2403,6 +2349,7 @@ Instruction *InstCombiner::visitOr(BinaryOperator &I) {
// be simplified by a later pass either, so we try swapping the inner/outer
// ORs in the hopes that we'll be able to simplify it this way.
// (X|C) | V --> (X|V) | C
+ ConstantInt *C1;
if (Op0->hasOneUse() && !isa<ConstantInt>(Op1) &&
match(Op0, m_Or(m_Value(A), m_ConstantInt(C1)))) {
Value *Inner = Builder->CreateOr(A, Op1);
@@ -2493,23 +2440,22 @@ Instruction *InstCombiner::visitXor(BinaryOperator &I) {
}
}
- if (Constant *RHS = dyn_cast<Constant>(Op1)) {
- if (RHS->isAllOnesValue() && Op0->hasOneUse())
- // xor (cmp A, B), true = not (cmp A, B) = !cmp A, B
- if (CmpInst *CI = dyn_cast<CmpInst>(Op0))
- return CmpInst::Create(CI->getOpcode(),
- CI->getInversePredicate(),
- CI->getOperand(0), CI->getOperand(1));
+ // xor (cmp A, B), true = not (cmp A, B) = !cmp A, B
+ ICmpInst::Predicate Pred;
+ if (match(Op0, m_OneUse(m_Cmp(Pred, m_Value(), m_Value()))) &&
+ match(Op1, m_AllOnes())) {
+ cast<CmpInst>(Op0)->setPredicate(CmpInst::getInversePredicate(Pred));
+ return replaceInstUsesWith(I, Op0);
}
- if (ConstantInt *RHS = dyn_cast<ConstantInt>(Op1)) {
+ if (ConstantInt *RHSC = dyn_cast<ConstantInt>(Op1)) {
// fold (xor(zext(cmp)), 1) and (xor(sext(cmp)), -1) to ext(!cmp).
if (CastInst *Op0C = dyn_cast<CastInst>(Op0)) {
if (CmpInst *CI = dyn_cast<CmpInst>(Op0C->getOperand(0))) {
if (CI->hasOneUse() && Op0C->hasOneUse()) {
Instruction::CastOps Opcode = Op0C->getOpcode();
if ((Opcode == Instruction::ZExt || Opcode == Instruction::SExt) &&
- (RHS == ConstantExpr::getCast(Opcode, Builder->getTrue(),
+ (RHSC == ConstantExpr::getCast(Opcode, Builder->getTrue(),
Op0C->getDestTy()))) {
CI->setPredicate(CI->getInversePredicate());
return CastInst::Create(Opcode, CI, Op0C->getType());
@@ -2520,26 +2466,23 @@ Instruction *InstCombiner::visitXor(BinaryOperator &I) {
if (BinaryOperator *Op0I = dyn_cast<BinaryOperator>(Op0)) {
// ~(c-X) == X-c-1 == X+(-c-1)
- if (Op0I->getOpcode() == Instruction::Sub && RHS->isAllOnesValue())
+ if (Op0I->getOpcode() == Instruction::Sub && RHSC->isAllOnesValue())
if (Constant *Op0I0C = dyn_cast<Constant>(Op0I->getOperand(0))) {
Constant *NegOp0I0C = ConstantExpr::getNeg(Op0I0C);
- Constant *ConstantRHS = ConstantExpr::getSub(NegOp0I0C,
- ConstantInt::get(I.getType(), 1));
- return BinaryOperator::CreateAdd(Op0I->getOperand(1), ConstantRHS);
+ return BinaryOperator::CreateAdd(Op0I->getOperand(1),
+ SubOne(NegOp0I0C));
}
if (ConstantInt *Op0CI = dyn_cast<ConstantInt>(Op0I->getOperand(1))) {
if (Op0I->getOpcode() == Instruction::Add) {
          // ~(X+c) --> (-c-1)-X
- if (RHS->isAllOnesValue()) {
+ if (RHSC->isAllOnesValue()) {
Constant *NegOp0CI = ConstantExpr::getNeg(Op0CI);
- return BinaryOperator::CreateSub(
- ConstantExpr::getSub(NegOp0CI,
- ConstantInt::get(I.getType(), 1)),
- Op0I->getOperand(0));
- } else if (RHS->getValue().isSignBit()) {
+ return BinaryOperator::CreateSub(SubOne(NegOp0CI),
+ Op0I->getOperand(0));
+ } else if (RHSC->getValue().isSignBit()) {
// (X + C) ^ signbit -> (X + C + signbit)
- Constant *C = Builder->getInt(RHS->getValue() + Op0CI->getValue());
+ Constant *C = Builder->getInt(RHSC->getValue() + Op0CI->getValue());
return BinaryOperator::CreateAdd(Op0I->getOperand(0), C);
}
@@ -2547,10 +2490,10 @@ Instruction *InstCombiner::visitXor(BinaryOperator &I) {
// (X|C1)^C2 -> X^(C1|C2) iff X&~C1 == 0
if (MaskedValueIsZero(Op0I->getOperand(0), Op0CI->getValue(),
0, &I)) {
- Constant *NewRHS = ConstantExpr::getOr(Op0CI, RHS);
+ Constant *NewRHS = ConstantExpr::getOr(Op0CI, RHSC);
// Anything in both C1 and C2 is known to be zero, remove it from
// NewRHS.
- Constant *CommonBits = ConstantExpr::getAnd(Op0CI, RHS);
+ Constant *CommonBits = ConstantExpr::getAnd(Op0CI, RHSC);
NewRHS = ConstantExpr::getAnd(NewRHS,
ConstantExpr::getNot(CommonBits));
Worklist.Add(Op0I);
@@ -2568,7 +2511,7 @@ Instruction *InstCombiner::visitXor(BinaryOperator &I) {
E1->getOpcode() == Instruction::Xor &&
(C1 = dyn_cast<ConstantInt>(E1->getOperand(1)))) {
// fold (C1 >> C2) ^ C3
- ConstantInt *C2 = Op0CI, *C3 = RHS;
+ ConstantInt *C2 = Op0CI, *C3 = RHSC;
APInt FoldConst = C1->getValue().lshr(C2->getValue());
FoldConst ^= C3->getValue();
// Prepare the two operands.
@@ -2582,27 +2525,26 @@ Instruction *InstCombiner::visitXor(BinaryOperator &I) {
}
}
}
+ }
+ if (isa<Constant>(Op1))
if (Instruction *FoldedLogic = foldOpWithConstantIntoOperand(I))
return FoldedLogic;
- }
- BinaryOperator *Op1I = dyn_cast<BinaryOperator>(Op1);
- if (Op1I) {
+ {
Value *A, *B;
- if (match(Op1I, m_Or(m_Value(A), m_Value(B)))) {
- if (A == Op0) { // B^(B|A) == (A|B)^B
- Op1I->swapOperands();
- I.swapOperands();
- std::swap(Op0, Op1);
- } else if (B == Op0) { // B^(A|B) == (A|B)^B
+ if (match(Op1, m_OneUse(m_Or(m_Value(A), m_Value(B))))) {
+ if (A == Op0) { // A^(A|B) == A^(B|A)
+ cast<BinaryOperator>(Op1)->swapOperands();
+ std::swap(A, B);
+ }
+ if (B == Op0) { // A^(B|A) == (B|A)^A
I.swapOperands(); // Simplified below.
std::swap(Op0, Op1);
}
- } else if (match(Op1I, m_And(m_Value(A), m_Value(B))) &&
- Op1I->hasOneUse()){
+ } else if (match(Op1, m_OneUse(m_And(m_Value(A), m_Value(B))))) {
if (A == Op0) { // A^(A&B) -> A^(B&A)
- Op1I->swapOperands();
+ cast<BinaryOperator>(Op1)->swapOperands();
std::swap(A, B);
}
if (B == Op0) { // A^(B&A) -> (B&A)^A
@@ -2612,65 +2554,63 @@ Instruction *InstCombiner::visitXor(BinaryOperator &I) {
}
}
- BinaryOperator *Op0I = dyn_cast<BinaryOperator>(Op0);
- if (Op0I) {
+ {
Value *A, *B;
- if (match(Op0I, m_Or(m_Value(A), m_Value(B))) &&
- Op0I->hasOneUse()) {
+ if (match(Op0, m_OneUse(m_Or(m_Value(A), m_Value(B))))) {
if (A == Op1) // (B|A)^B == (A|B)^B
std::swap(A, B);
if (B == Op1) // (A|B)^B == A & ~B
return BinaryOperator::CreateAnd(A, Builder->CreateNot(Op1));
- } else if (match(Op0I, m_And(m_Value(A), m_Value(B))) &&
- Op0I->hasOneUse()){
+ } else if (match(Op0, m_OneUse(m_And(m_Value(A), m_Value(B))))) {
if (A == Op1) // (A&B)^A -> (B&A)^A
std::swap(A, B);
+ const APInt *C;
if (B == Op1 && // (B&A)^A == ~B & A
- !isa<ConstantInt>(Op1)) { // Canonical form is (B&C)^C
+ !match(Op1, m_APInt(C))) { // Canonical form is (B&C)^C
return BinaryOperator::CreateAnd(Builder->CreateNot(A), Op1);
}
}
}
- if (Op0I && Op1I) {
+ {
Value *A, *B, *C, *D;
// (A & B)^(A | B) -> A ^ B
- if (match(Op0I, m_And(m_Value(A), m_Value(B))) &&
- match(Op1I, m_Or(m_Value(C), m_Value(D)))) {
+ if (match(Op0, m_And(m_Value(A), m_Value(B))) &&
+ match(Op1, m_Or(m_Value(C), m_Value(D)))) {
if ((A == C && B == D) || (A == D && B == C))
return BinaryOperator::CreateXor(A, B);
}
// (A | B)^(A & B) -> A ^ B
- if (match(Op0I, m_Or(m_Value(A), m_Value(B))) &&
- match(Op1I, m_And(m_Value(C), m_Value(D)))) {
+ if (match(Op0, m_Or(m_Value(A), m_Value(B))) &&
+ match(Op1, m_And(m_Value(C), m_Value(D)))) {
if ((A == C && B == D) || (A == D && B == C))
return BinaryOperator::CreateXor(A, B);
}
// (A | ~B) ^ (~A | B) -> A ^ B
// (~B | A) ^ (~A | B) -> A ^ B
- if (match(Op0I, m_c_Or(m_Value(A), m_Not(m_Value(B)))) &&
- match(Op1I, m_Or(m_Not(m_Specific(A)), m_Specific(B))))
+ if (match(Op0, m_c_Or(m_Value(A), m_Not(m_Value(B)))) &&
+ match(Op1, m_Or(m_Not(m_Specific(A)), m_Specific(B))))
return BinaryOperator::CreateXor(A, B);
// (~A | B) ^ (A | ~B) -> A ^ B
- if (match(Op0I, m_Or(m_Not(m_Value(A)), m_Value(B))) &&
- match(Op1I, m_Or(m_Specific(A), m_Not(m_Specific(B))))) {
+ if (match(Op0, m_Or(m_Not(m_Value(A)), m_Value(B))) &&
+ match(Op1, m_Or(m_Specific(A), m_Not(m_Specific(B))))) {
return BinaryOperator::CreateXor(A, B);
}
// (A & ~B) ^ (~A & B) -> A ^ B
// (~B & A) ^ (~A & B) -> A ^ B
- if (match(Op0I, m_c_And(m_Value(A), m_Not(m_Value(B)))) &&
- match(Op1I, m_And(m_Not(m_Specific(A)), m_Specific(B))))
+ if (match(Op0, m_c_And(m_Value(A), m_Not(m_Value(B)))) &&
+ match(Op1, m_And(m_Not(m_Specific(A)), m_Specific(B))))
return BinaryOperator::CreateXor(A, B);
// (~A & B) ^ (A & ~B) -> A ^ B
- if (match(Op0I, m_And(m_Not(m_Value(A)), m_Value(B))) &&
- match(Op1I, m_And(m_Specific(A), m_Not(m_Specific(B))))) {
+ if (match(Op0, m_And(m_Not(m_Value(A)), m_Value(B))) &&
+ match(Op1, m_And(m_Specific(A), m_Not(m_Specific(B))))) {
return BinaryOperator::CreateXor(A, B);
}
// (A ^ C)^(A | B) -> ((~A) & B) ^ C
- if (match(Op0I, m_Xor(m_Value(D), m_Value(C))) &&
- match(Op1I, m_Or(m_Value(A), m_Value(B)))) {
+ if (match(Op0, m_Xor(m_Value(D), m_Value(C))) &&
+ match(Op1, m_Or(m_Value(A), m_Value(B)))) {
if (D == A)
return BinaryOperator::CreateXor(
Builder->CreateAnd(Builder->CreateNot(A), B), C);
@@ -2679,8 +2619,8 @@ Instruction *InstCombiner::visitXor(BinaryOperator &I) {
Builder->CreateAnd(Builder->CreateNot(B), A), C);
}
// (A | B)^(A ^ C) -> ((~A) & B) ^ C
- if (match(Op0I, m_Or(m_Value(A), m_Value(B))) &&
- match(Op1I, m_Xor(m_Value(D), m_Value(C)))) {
+ if (match(Op0, m_Or(m_Value(A), m_Value(B))) &&
+ match(Op1, m_Xor(m_Value(D), m_Value(C)))) {
if (D == A)
return BinaryOperator::CreateXor(
Builder->CreateAnd(Builder->CreateNot(A), B), C);
@@ -2689,12 +2629,12 @@ Instruction *InstCombiner::visitXor(BinaryOperator &I) {
Builder->CreateAnd(Builder->CreateNot(B), A), C);
}
// (A & B) ^ (A ^ B) -> (A | B)
- if (match(Op0I, m_And(m_Value(A), m_Value(B))) &&
- match(Op1I, m_Xor(m_Specific(A), m_Specific(B))))
+ if (match(Op0, m_And(m_Value(A), m_Value(B))) &&
+ match(Op1, m_c_Xor(m_Specific(A), m_Specific(B))))
return BinaryOperator::CreateOr(A, B);
// (A ^ B) ^ (A & B) -> (A | B)
- if (match(Op0I, m_Xor(m_Value(A), m_Value(B))) &&
- match(Op1I, m_And(m_Specific(A), m_Specific(B))))
+ if (match(Op0, m_Xor(m_Value(A), m_Value(B))) &&
+ match(Op1, m_c_And(m_Specific(A), m_Specific(B))))
return BinaryOperator::CreateOr(A, B);
}
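The xor identities this block now matches commutatively are all verifiable the same way; a standalone i8 check of the main ones (not part of the patch):

#include <cassert>
#include <cstdint>

int main() {
  for (unsigned A = 0; A < 256; ++A)
    for (unsigned B = 0; B < 256; ++B) {
      uint8_t AxB = static_cast<uint8_t>(A ^ B);
      assert(static_cast<uint8_t>((A & B) ^ (A | B)) == AxB);
      assert(static_cast<uint8_t>((A | ~B) ^ (~A | B)) == AxB);
      assert(static_cast<uint8_t>((A & ~B) ^ (~A & B)) == AxB);
      assert(static_cast<uint8_t>((A & B) ^ (A ^ B)) ==
             static_cast<uint8_t>(A | B));
    }
  return 0;
}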
diff --git a/lib/Transforms/InstCombine/InstCombineCalls.cpp b/lib/Transforms/InstCombine/InstCombineCalls.cpp
index 2ef82ba3ed8c..69484f47223f 100644
--- a/lib/Transforms/InstCombine/InstCombineCalls.cpp
+++ b/lib/Transforms/InstCombine/InstCombineCalls.cpp
@@ -60,6 +60,12 @@ using namespace PatternMatch;
STATISTIC(NumSimplified, "Number of library calls simplified");
+static cl::opt<unsigned> UnfoldElementAtomicMemcpyMaxElements(
+ "unfold-element-atomic-memcpy-max-elements",
+ cl::init(16),
+ cl::desc("Maximum number of elements in atomic memcpy the optimizer is "
+ "allowed to unfold"));
+
/// Return the specified type promoted as it would be to pass through a va_arg
/// area.
static Type *getPromotedType(Type *Ty) {
@@ -70,27 +76,6 @@ static Type *getPromotedType(Type *Ty) {
return Ty;
}
-/// Given an aggregate type which ultimately holds a single scalar element,
-/// like {{{type}}} or [1 x type], return type.
-static Type *reduceToSingleValueType(Type *T) {
- while (!T->isSingleValueType()) {
- if (StructType *STy = dyn_cast<StructType>(T)) {
- if (STy->getNumElements() == 1)
- T = STy->getElementType(0);
- else
- break;
- } else if (ArrayType *ATy = dyn_cast<ArrayType>(T)) {
- if (ATy->getNumElements() == 1)
- T = ATy->getElementType();
- else
- break;
- } else
- break;
- }
-
- return T;
-}
-
/// Return a constant boolean vector that has true elements in all positions
/// where the input constant data vector has an element with the sign bit set.
static Constant *getNegativeIsTrueBoolVec(ConstantDataVector *V) {
@@ -108,6 +93,78 @@ static Constant *getNegativeIsTrueBoolVec(ConstantDataVector *V) {
return ConstantVector::get(BoolVec);
}
+Instruction *
+InstCombiner::SimplifyElementAtomicMemCpy(ElementAtomicMemCpyInst *AMI) {
+  // Try to unfold this intrinsic into a sequence of explicit atomic loads and
+  // stores.
+  // First, check that the number of elements is a compile-time constant.
+ auto *NumElementsCI = dyn_cast<ConstantInt>(AMI->getNumElements());
+ if (!NumElementsCI)
+ return nullptr;
+
+ // Check that there are not too many elements.
+ uint64_t NumElements = NumElementsCI->getZExtValue();
+ if (NumElements >= UnfoldElementAtomicMemcpyMaxElements)
+ return nullptr;
+
+  // Don't unfold into illegal integers.
+  uint64_t ElementSizeInBits = AMI->getElementSizeInBytes() * 8;
+  if (!getDataLayout().isLegalInteger(ElementSizeInBits))
+ return nullptr;
+
+  // Cast the source and destination to the correct type. Intrinsic input
+  // arguments are usually represented as i8*.
+  // Operands are often explicitly cast to i8*, and we could simply strip
+  // those casts instead of inserting new ones. However, it's easier to rely
+  // on other InstCombine rules, which cover the trivial cases anyway.
+ Value *Src = AMI->getRawSource();
+ Value *Dst = AMI->getRawDest();
+  Type *ElementPointerType = Type::getIntNPtrTy(
+      AMI->getContext(), ElementSizeInBits,
+      Src->getType()->getPointerAddressSpace());
+
+ Value *SrcCasted = Builder->CreatePointerCast(Src, ElementPointerType,
+ "memcpy_unfold.src_casted");
+ Value *DstCasted = Builder->CreatePointerCast(Dst, ElementPointerType,
+ "memcpy_unfold.dst_casted");
+
+ for (uint64_t i = 0; i < NumElements; ++i) {
+ // Get current element addresses
+ ConstantInt *ElementIdxCI =
+ ConstantInt::get(AMI->getContext(), APInt(64, i));
+ Value *SrcElementAddr =
+ Builder->CreateGEP(SrcCasted, ElementIdxCI, "memcpy_unfold.src_addr");
+ Value *DstElementAddr =
+ Builder->CreateGEP(DstCasted, ElementIdxCI, "memcpy_unfold.dst_addr");
+
+ // Load from the source. Transfer alignment information and mark load as
+ // unordered atomic.
+ LoadInst *Load = Builder->CreateLoad(SrcElementAddr, "memcpy_unfold.val");
+ Load->setOrdering(AtomicOrdering::Unordered);
+    // We know the alignment of the first element. The verifier also
+    // guarantees that the element size is less than or equal to the first
+    // element's alignment and that both values are powers of two.
+ // This means that all subsequent accesses are at least element size
+ // aligned.
+ // TODO: We can infer better alignment but there is no evidence that this
+ // will matter.
+ Load->setAlignment(i == 0 ? AMI->getSrcAlignment()
+ : AMI->getElementSizeInBytes());
+ Load->setDebugLoc(AMI->getDebugLoc());
+
+ // Store loaded value via unordered atomic store.
+ StoreInst *Store = Builder->CreateStore(Load, DstElementAddr);
+ Store->setOrdering(AtomicOrdering::Unordered);
+ Store->setAlignment(i == 0 ? AMI->getDstAlignment()
+ : AMI->getElementSizeInBytes());
+ Store->setDebugLoc(AMI->getDebugLoc());
+ }
+
+  // Set the number of elements of the copy to 0; the instruction will be
+  // deleted on the next iteration.
+ AMI->setNumElements(Constant::getNullValue(NumElementsCI->getType()));
+ return AMI;
+}
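The effect of the unfolding is easiest to see in a C++ analogue. A sketch (hypothetical, not part of the patch) of what a four-element i32 copy becomes, using relaxed atomics as the nearest std::atomic stand-in for LLVM's weaker unordered ordering:

#include <atomic>
#include <cstdint>

// Each element is copied with its own element-wide atomic load and store;
// the first access carries the intrinsic's alignment, later ones are at
// least element-size aligned.
void unfoldedCopy(std::atomic<uint32_t> *Dst, const std::atomic<uint32_t> *Src) {
  for (unsigned I = 0; I != 4; ++I) {
    uint32_t V = Src[I].load(std::memory_order_relaxed);
    Dst[I].store(V, std::memory_order_relaxed);
  }
}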
+
Instruction *InstCombiner::SimplifyMemTransfer(MemIntrinsic *MI) {
unsigned DstAlign = getKnownAlignment(MI->getArgOperand(0), DL, MI, &AC, &DT);
unsigned SrcAlign = getKnownAlignment(MI->getArgOperand(1), DL, MI, &AC, &DT);
@@ -144,41 +201,19 @@ Instruction *InstCombiner::SimplifyMemTransfer(MemIntrinsic *MI) {
Type *NewSrcPtrTy = PointerType::get(IntType, SrcAddrSp);
Type *NewDstPtrTy = PointerType::get(IntType, DstAddrSp);
- // Memcpy forces the use of i8* for the source and destination. That means
- // that if you're using memcpy to move one double around, you'll get a cast
- // from double* to i8*. We'd much rather use a double load+store rather than
- // an i64 load+store, here because this improves the odds that the source or
- // dest address will be promotable. See if we can find a better type than the
- // integer datatype.
- Value *StrippedDest = MI->getArgOperand(0)->stripPointerCasts();
+ // If the memcpy has metadata describing the members, see if we can get the
+ // TBAA tag describing our copy.
MDNode *CopyMD = nullptr;
- if (StrippedDest != MI->getArgOperand(0)) {
- Type *SrcETy = cast<PointerType>(StrippedDest->getType())
- ->getElementType();
- if (SrcETy->isSized() && DL.getTypeStoreSize(SrcETy) == Size) {
- // The SrcETy might be something like {{{double}}} or [1 x double]. Rip
- // down through these levels if so.
- SrcETy = reduceToSingleValueType(SrcETy);
-
- if (SrcETy->isSingleValueType()) {
- NewSrcPtrTy = PointerType::get(SrcETy, SrcAddrSp);
- NewDstPtrTy = PointerType::get(SrcETy, DstAddrSp);
-
- // If the memcpy has metadata describing the members, see if we can
- // get the TBAA tag describing our copy.
- if (MDNode *M = MI->getMetadata(LLVMContext::MD_tbaa_struct)) {
- if (M->getNumOperands() == 3 && M->getOperand(0) &&
- mdconst::hasa<ConstantInt>(M->getOperand(0)) &&
- mdconst::extract<ConstantInt>(M->getOperand(0))->isNullValue() &&
- M->getOperand(1) &&
- mdconst::hasa<ConstantInt>(M->getOperand(1)) &&
- mdconst::extract<ConstantInt>(M->getOperand(1))->getValue() ==
- Size &&
- M->getOperand(2) && isa<MDNode>(M->getOperand(2)))
- CopyMD = cast<MDNode>(M->getOperand(2));
- }
- }
- }
+ if (MDNode *M = MI->getMetadata(LLVMContext::MD_tbaa_struct)) {
+ if (M->getNumOperands() == 3 && M->getOperand(0) &&
+ mdconst::hasa<ConstantInt>(M->getOperand(0)) &&
+ mdconst::extract<ConstantInt>(M->getOperand(0))->isNullValue() &&
+ M->getOperand(1) &&
+ mdconst::hasa<ConstantInt>(M->getOperand(1)) &&
+ mdconst::extract<ConstantInt>(M->getOperand(1))->getValue() ==
+ Size &&
+ M->getOperand(2) && isa<MDNode>(M->getOperand(2)))
+ CopyMD = cast<MDNode>(M->getOperand(2));
}
// If the memcpy/memmove provides better alignment info than we can
@@ -510,6 +545,131 @@ static Value *simplifyX86varShift(const IntrinsicInst &II,
return Builder.CreateAShr(Vec, ShiftVec);
}
+static Value *simplifyX86muldq(const IntrinsicInst &II,
+ InstCombiner::BuilderTy &Builder) {
+ Value *Arg0 = II.getArgOperand(0);
+ Value *Arg1 = II.getArgOperand(1);
+ Type *ResTy = II.getType();
+ assert(Arg0->getType()->getScalarSizeInBits() == 32 &&
+ Arg1->getType()->getScalarSizeInBits() == 32 &&
+ ResTy->getScalarSizeInBits() == 64 && "Unexpected muldq/muludq types");
+
+ // muldq/muludq(undef, undef) -> zero (matches generic mul behavior)
+ if (isa<UndefValue>(Arg0) || isa<UndefValue>(Arg1))
+ return ConstantAggregateZero::get(ResTy);
+
+ // Constant folding.
+ // PMULDQ = (mul(vXi64 sext(shuffle<0,2,..>(Arg0)),
+ // vXi64 sext(shuffle<0,2,..>(Arg1))))
+ // PMULUDQ = (mul(vXi64 zext(shuffle<0,2,..>(Arg0)),
+ // vXi64 zext(shuffle<0,2,..>(Arg1))))
+ if (!isa<Constant>(Arg0) || !isa<Constant>(Arg1))
+ return nullptr;
+
+ unsigned NumElts = ResTy->getVectorNumElements();
+ assert(Arg0->getType()->getVectorNumElements() == (2 * NumElts) &&
+ Arg1->getType()->getVectorNumElements() == (2 * NumElts) &&
+ "Unexpected muldq/muludq types");
+
+ unsigned IntrinsicID = II.getIntrinsicID();
+ bool IsSigned = (Intrinsic::x86_sse41_pmuldq == IntrinsicID ||
+ Intrinsic::x86_avx2_pmul_dq == IntrinsicID ||
+ Intrinsic::x86_avx512_pmul_dq_512 == IntrinsicID);
+
+ SmallVector<unsigned, 16> ShuffleMask;
+ for (unsigned i = 0; i != NumElts; ++i)
+ ShuffleMask.push_back(i * 2);
+
+ auto *LHS = Builder.CreateShuffleVector(Arg0, Arg0, ShuffleMask);
+ auto *RHS = Builder.CreateShuffleVector(Arg1, Arg1, ShuffleMask);
+
+ if (IsSigned) {
+ LHS = Builder.CreateSExt(LHS, ResTy);
+ RHS = Builder.CreateSExt(RHS, ResTy);
+ } else {
+ LHS = Builder.CreateZExt(LHS, ResTy);
+ RHS = Builder.CreateZExt(RHS, ResTy);
+ }
+
+ return Builder.CreateMul(LHS, RHS);
+}
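At the scalar level each result lane multiplies the even 32-bit source lanes after zero- or sign-extension, which is what the shuffle-then-extend expansion above models. A one-lane reference (standalone, not part of the patch):

#include <cstdint>

// One PMULUDQ lane: zero-extend both 32-bit inputs, multiply in 64 bits.
uint64_t pmuludqLane(uint32_t A, uint32_t B) {
  return static_cast<uint64_t>(A) * static_cast<uint64_t>(B);
}

// One PMULDQ lane: the same, but sign-extended.
int64_t pmuldqLane(int32_t A, int32_t B) {
  return static_cast<int64_t>(A) * static_cast<int64_t>(B);
}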
+
+static Value *simplifyX86pack(IntrinsicInst &II, InstCombiner &IC,
+ InstCombiner::BuilderTy &Builder, bool IsSigned) {
+ Value *Arg0 = II.getArgOperand(0);
+ Value *Arg1 = II.getArgOperand(1);
+ Type *ResTy = II.getType();
+
+  // Fast path: if both inputs are undef, the result is undef.
+ if (isa<UndefValue>(Arg0) && isa<UndefValue>(Arg1))
+ return UndefValue::get(ResTy);
+
+ Type *ArgTy = Arg0->getType();
+ unsigned NumLanes = ResTy->getPrimitiveSizeInBits() / 128;
+ unsigned NumDstElts = ResTy->getVectorNumElements();
+ unsigned NumSrcElts = ArgTy->getVectorNumElements();
+ assert(NumDstElts == (2 * NumSrcElts) && "Unexpected packing types");
+
+ unsigned NumDstEltsPerLane = NumDstElts / NumLanes;
+ unsigned NumSrcEltsPerLane = NumSrcElts / NumLanes;
+ unsigned DstScalarSizeInBits = ResTy->getScalarSizeInBits();
+ assert(ArgTy->getScalarSizeInBits() == (2 * DstScalarSizeInBits) &&
+ "Unexpected packing types");
+
+ // Constant folding.
+ auto *Cst0 = dyn_cast<Constant>(Arg0);
+ auto *Cst1 = dyn_cast<Constant>(Arg1);
+ if (!Cst0 || !Cst1)
+ return nullptr;
+
+ SmallVector<Constant *, 32> Vals;
+ for (unsigned Lane = 0; Lane != NumLanes; ++Lane) {
+ for (unsigned Elt = 0; Elt != NumDstEltsPerLane; ++Elt) {
+ unsigned SrcIdx = Lane * NumSrcEltsPerLane + Elt % NumSrcEltsPerLane;
+ auto *Cst = (Elt >= NumSrcEltsPerLane) ? Cst1 : Cst0;
+ auto *COp = Cst->getAggregateElement(SrcIdx);
+ if (COp && isa<UndefValue>(COp)) {
+ Vals.push_back(UndefValue::get(ResTy->getScalarType()));
+ continue;
+ }
+
+ auto *CInt = dyn_cast_or_null<ConstantInt>(COp);
+ if (!CInt)
+ return nullptr;
+
+ APInt Val = CInt->getValue();
+ assert(Val.getBitWidth() == ArgTy->getScalarSizeInBits() &&
+ "Unexpected constant bitwidth");
+
+ if (IsSigned) {
+ // PACKSS: Truncate signed value with signed saturation.
+ // Source values less than dst minint are saturated to minint.
+ // Source values greater than dst maxint are saturated to maxint.
+ if (Val.isSignedIntN(DstScalarSizeInBits))
+ Val = Val.trunc(DstScalarSizeInBits);
+ else if (Val.isNegative())
+ Val = APInt::getSignedMinValue(DstScalarSizeInBits);
+ else
+ Val = APInt::getSignedMaxValue(DstScalarSizeInBits);
+ } else {
+ // PACKUS: Truncate signed value with unsigned saturation.
+ // Source values less than zero are saturated to zero.
+ // Source values greater than dst maxuint are saturated to maxuint.
+ if (Val.isIntN(DstScalarSizeInBits))
+ Val = Val.trunc(DstScalarSizeInBits);
+ else if (Val.isNegative())
+ Val = APInt::getNullValue(DstScalarSizeInBits);
+ else
+ Val = APInt::getAllOnesValue(DstScalarSizeInBits);
+ }
+
+ Vals.push_back(ConstantInt::get(ResTy->getScalarType(), Val));
+ }
+ }
+
+ return ConstantVector::get(Vals);
+}
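The per-element saturation implemented above matches the usual scalar definition; a reference for the i16 -> i8 case (standalone, not part of the patch):

#include <cstdint>

// PACKSS element: truncate with signed saturation.
int8_t packssElt(int16_t V) {
  if (V < INT8_MIN) return INT8_MIN;
  if (V > INT8_MAX) return INT8_MAX;
  return static_cast<int8_t>(V);
}

// PACKUS element: signed input, unsigned saturation.
uint8_t packusElt(int16_t V) {
  if (V < 0) return 0;
  if (V > UINT8_MAX) return UINT8_MAX;
  return static_cast<uint8_t>(V);
}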
+
static Value *simplifyX86movmsk(const IntrinsicInst &II,
InstCombiner::BuilderTy &Builder) {
Value *Arg = II.getArgOperand(0);
@@ -1330,6 +1490,27 @@ static bool simplifyX86MaskedStore(IntrinsicInst &II, InstCombiner &IC) {
return true;
}
+// Constant fold llvm.amdgcn.fmed3 intrinsics for standard inputs.
+//
+// A single NaN input is folded to minnum, so we rely on that folding for
+// handling NaNs.
+static APFloat fmed3AMDGCN(const APFloat &Src0, const APFloat &Src1,
+ const APFloat &Src2) {
+ APFloat Max3 = maxnum(maxnum(Src0, Src1), Src2);
+
+ APFloat::cmpResult Cmp0 = Max3.compare(Src0);
+ assert(Cmp0 != APFloat::cmpUnordered && "nans handled separately");
+ if (Cmp0 == APFloat::cmpEqual)
+ return maxnum(Src1, Src2);
+
+ APFloat::cmpResult Cmp1 = Max3.compare(Src1);
+ assert(Cmp1 != APFloat::cmpUnordered && "nans handled separately");
+ if (Cmp1 == APFloat::cmpEqual)
+ return maxnum(Src0, Src2);
+
+ return maxnum(Src0, Src1);
+}
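For finite inputs this computes the median of the three values; a standalone cross-check against a min/max formulation (not part of the patch; NaNs excluded, since the caller folds a NaN operand to minnum before constant folding):

#include <algorithm>
#include <cassert>

static double med3(double A, double B, double C) {
  return std::max(std::min(A, B), std::min(std::max(A, B), C));
}

int main() {
  assert(med3(1.0, 5.0, 3.0) == 3.0);
  assert(med3(5.0, 1.0, 3.0) == 3.0);
  assert(med3(3.0, 3.0, 1.0) == 3.0); // a tie resolves to the repeated value
  return 0;
}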
+
// Returns true iff the 2 intrinsics have the same operands, limiting the
// comparison to the first NumOperands.
static bool haveSameOperands(const IntrinsicInst &I, const IntrinsicInst &E,
@@ -1373,6 +1554,254 @@ static bool removeTriviallyEmptyRange(IntrinsicInst &I, unsigned StartID,
return false;
}
+// Convert NVVM intrinsics to target-generic LLVM code where possible.
+static Instruction *SimplifyNVVMIntrinsic(IntrinsicInst *II, InstCombiner &IC) {
+ // Each NVVM intrinsic we can simplify can be replaced with one of:
+ //
+ // * an LLVM intrinsic,
+ // * an LLVM cast operation,
+ // * an LLVM binary operation, or
+ // * ad-hoc LLVM IR for the particular operation.
+
+ // Some transformations are only valid when the module's
+ // flush-denormals-to-zero (ftz) setting is true/false, whereas other
+ // transformations are valid regardless of the module's ftz setting.
+ enum FtzRequirementTy {
+ FTZ_Any, // Any ftz setting is ok.
+ FTZ_MustBeOn, // Transformation is valid only if ftz is on.
+ FTZ_MustBeOff, // Transformation is valid only if ftz is off.
+ };
+ // Classes of NVVM intrinsics that can't be replaced one-to-one with a
+ // target-generic intrinsic, cast op, or binary op but that we can nonetheless
+ // simplify.
+ enum SpecialCase {
+ SPC_Reciprocal,
+ };
+
+ // SimplifyAction is a poor-man's variant (plus an additional flag) that
+ // represents how to replace an NVVM intrinsic with target-generic LLVM IR.
+ struct SimplifyAction {
+ // Invariant: At most one of these Optionals has a value.
+ Optional<Intrinsic::ID> IID;
+ Optional<Instruction::CastOps> CastOp;
+ Optional<Instruction::BinaryOps> BinaryOp;
+ Optional<SpecialCase> Special;
+
+ FtzRequirementTy FtzRequirement = FTZ_Any;
+
+ SimplifyAction() = default;
+
+ SimplifyAction(Intrinsic::ID IID, FtzRequirementTy FtzReq)
+ : IID(IID), FtzRequirement(FtzReq) {}
+
+ // Cast operations don't have anything to do with FTZ, so we skip that
+ // argument.
+ SimplifyAction(Instruction::CastOps CastOp) : CastOp(CastOp) {}
+
+ SimplifyAction(Instruction::BinaryOps BinaryOp, FtzRequirementTy FtzReq)
+ : BinaryOp(BinaryOp), FtzRequirement(FtzReq) {}
+
+ SimplifyAction(SpecialCase Special, FtzRequirementTy FtzReq)
+ : Special(Special), FtzRequirement(FtzReq) {}
+ };
+
+  // Try to generate a SimplifyAction describing how to replace our
+  // IntrinsicInst with target-generic LLVM IR.
+ const SimplifyAction Action = [II]() -> SimplifyAction {
+ switch (II->getIntrinsicID()) {
+
+ // NVVM intrinsics that map directly to LLVM intrinsics.
+ case Intrinsic::nvvm_ceil_d:
+ return {Intrinsic::ceil, FTZ_Any};
+ case Intrinsic::nvvm_ceil_f:
+ return {Intrinsic::ceil, FTZ_MustBeOff};
+ case Intrinsic::nvvm_ceil_ftz_f:
+ return {Intrinsic::ceil, FTZ_MustBeOn};
+ case Intrinsic::nvvm_fabs_d:
+ return {Intrinsic::fabs, FTZ_Any};
+ case Intrinsic::nvvm_fabs_f:
+ return {Intrinsic::fabs, FTZ_MustBeOff};
+ case Intrinsic::nvvm_fabs_ftz_f:
+ return {Intrinsic::fabs, FTZ_MustBeOn};
+ case Intrinsic::nvvm_floor_d:
+ return {Intrinsic::floor, FTZ_Any};
+ case Intrinsic::nvvm_floor_f:
+ return {Intrinsic::floor, FTZ_MustBeOff};
+ case Intrinsic::nvvm_floor_ftz_f:
+ return {Intrinsic::floor, FTZ_MustBeOn};
+ case Intrinsic::nvvm_fma_rn_d:
+ return {Intrinsic::fma, FTZ_Any};
+ case Intrinsic::nvvm_fma_rn_f:
+ return {Intrinsic::fma, FTZ_MustBeOff};
+ case Intrinsic::nvvm_fma_rn_ftz_f:
+ return {Intrinsic::fma, FTZ_MustBeOn};
+ case Intrinsic::nvvm_fmax_d:
+ return {Intrinsic::maxnum, FTZ_Any};
+ case Intrinsic::nvvm_fmax_f:
+ return {Intrinsic::maxnum, FTZ_MustBeOff};
+ case Intrinsic::nvvm_fmax_ftz_f:
+ return {Intrinsic::maxnum, FTZ_MustBeOn};
+ case Intrinsic::nvvm_fmin_d:
+ return {Intrinsic::minnum, FTZ_Any};
+ case Intrinsic::nvvm_fmin_f:
+ return {Intrinsic::minnum, FTZ_MustBeOff};
+ case Intrinsic::nvvm_fmin_ftz_f:
+ return {Intrinsic::minnum, FTZ_MustBeOn};
+ case Intrinsic::nvvm_round_d:
+ return {Intrinsic::round, FTZ_Any};
+ case Intrinsic::nvvm_round_f:
+ return {Intrinsic::round, FTZ_MustBeOff};
+ case Intrinsic::nvvm_round_ftz_f:
+ return {Intrinsic::round, FTZ_MustBeOn};
+ case Intrinsic::nvvm_sqrt_rn_d:
+ return {Intrinsic::sqrt, FTZ_Any};
+ case Intrinsic::nvvm_sqrt_f:
+ // nvvm_sqrt_f is a special case. For most intrinsics, foo_ftz_f is the
+ // ftz version, and foo_f is the non-ftz version. But nvvm_sqrt_f adopts
+ // the ftz-ness of the surrounding code. sqrt_rn_f and sqrt_rn_ftz_f are
+ // the versions with explicit ftz-ness.
+ return {Intrinsic::sqrt, FTZ_Any};
+ case Intrinsic::nvvm_sqrt_rn_f:
+ return {Intrinsic::sqrt, FTZ_MustBeOff};
+ case Intrinsic::nvvm_sqrt_rn_ftz_f:
+ return {Intrinsic::sqrt, FTZ_MustBeOn};
+ case Intrinsic::nvvm_trunc_d:
+ return {Intrinsic::trunc, FTZ_Any};
+ case Intrinsic::nvvm_trunc_f:
+ return {Intrinsic::trunc, FTZ_MustBeOff};
+ case Intrinsic::nvvm_trunc_ftz_f:
+ return {Intrinsic::trunc, FTZ_MustBeOn};
+
+ // NVVM intrinsics that map to LLVM cast operations.
+ //
+  // Note that LLVM's target-generic conversion operators correspond to the rz
+  // (round to zero) versions of the nvvm conversion intrinsics, even though
+  // almost everything else here uses the rn (round to nearest even) nvvm ops.
+ case Intrinsic::nvvm_d2i_rz:
+ case Intrinsic::nvvm_f2i_rz:
+ case Intrinsic::nvvm_d2ll_rz:
+ case Intrinsic::nvvm_f2ll_rz:
+ return {Instruction::FPToSI};
+ case Intrinsic::nvvm_d2ui_rz:
+ case Intrinsic::nvvm_f2ui_rz:
+ case Intrinsic::nvvm_d2ull_rz:
+ case Intrinsic::nvvm_f2ull_rz:
+ return {Instruction::FPToUI};
+ case Intrinsic::nvvm_i2d_rz:
+ case Intrinsic::nvvm_i2f_rz:
+ case Intrinsic::nvvm_ll2d_rz:
+ case Intrinsic::nvvm_ll2f_rz:
+ return {Instruction::SIToFP};
+ case Intrinsic::nvvm_ui2d_rz:
+ case Intrinsic::nvvm_ui2f_rz:
+ case Intrinsic::nvvm_ull2d_rz:
+ case Intrinsic::nvvm_ull2f_rz:
+ return {Instruction::UIToFP};
+
+ // NVVM intrinsics that map to LLVM binary ops.
+ case Intrinsic::nvvm_add_rn_d:
+ return {Instruction::FAdd, FTZ_Any};
+ case Intrinsic::nvvm_add_rn_f:
+ return {Instruction::FAdd, FTZ_MustBeOff};
+ case Intrinsic::nvvm_add_rn_ftz_f:
+ return {Instruction::FAdd, FTZ_MustBeOn};
+ case Intrinsic::nvvm_mul_rn_d:
+ return {Instruction::FMul, FTZ_Any};
+ case Intrinsic::nvvm_mul_rn_f:
+ return {Instruction::FMul, FTZ_MustBeOff};
+ case Intrinsic::nvvm_mul_rn_ftz_f:
+ return {Instruction::FMul, FTZ_MustBeOn};
+ case Intrinsic::nvvm_div_rn_d:
+ return {Instruction::FDiv, FTZ_Any};
+ case Intrinsic::nvvm_div_rn_f:
+ return {Instruction::FDiv, FTZ_MustBeOff};
+ case Intrinsic::nvvm_div_rn_ftz_f:
+ return {Instruction::FDiv, FTZ_MustBeOn};
+
+ // The remainder of cases are NVVM intrinsics that map to LLVM idioms, but
+ // need special handling.
+ //
+  // We seem to be missing intrinsics for rcp.approx.{ftz.}f32, which is just
+ // as well.
+ case Intrinsic::nvvm_rcp_rn_d:
+ return {SPC_Reciprocal, FTZ_Any};
+ case Intrinsic::nvvm_rcp_rn_f:
+ return {SPC_Reciprocal, FTZ_MustBeOff};
+ case Intrinsic::nvvm_rcp_rn_ftz_f:
+ return {SPC_Reciprocal, FTZ_MustBeOn};
+
+ // We do not currently simplify intrinsics that give an approximate answer.
+ // These include:
+ //
+ // - nvvm_cos_approx_{f,ftz_f}
+ // - nvvm_ex2_approx_{d,f,ftz_f}
+ // - nvvm_lg2_approx_{d,f,ftz_f}
+ // - nvvm_sin_approx_{f,ftz_f}
+ // - nvvm_sqrt_approx_{f,ftz_f}
+ // - nvvm_rsqrt_approx_{d,f,ftz_f}
+ // - nvvm_div_approx_{ftz_d,ftz_f,f}
+ // - nvvm_rcp_approx_ftz_d
+ //
+ // Ideally we'd encode them as e.g. "fast call @llvm.cos", where "fast"
+ // means that fastmath is enabled in the intrinsic. Unfortunately only
+ // binary operators (currently) have a fastmath bit in SelectionDAG, so this
+ // information gets lost and we can't select on it.
+ //
+  // TODO: div and rcp are lowered to a binary op, so we could in theory
+  // lower them to "fast fdiv".
+
+ default:
+ return {};
+ }
+ }();
+
+  // If Action.FtzRequirement is not satisfied by the function's ftz state, we
+  // can bail out now. (Notice that in the case that IID is not an NVVM
+  // intrinsic, we don't have to look up any function attributes, as
+  // FtzRequirement will be FTZ_Any.)
+ if (Action.FtzRequirement != FTZ_Any) {
+ bool FtzEnabled =
+ II->getFunction()->getFnAttribute("nvptx-f32ftz").getValueAsString() ==
+ "true";
+
+ if (FtzEnabled != (Action.FtzRequirement == FTZ_MustBeOn))
+ return nullptr;
+ }
+
+ // Simplify to target-generic intrinsic.
+ if (Action.IID) {
+ SmallVector<Value *, 4> Args(II->arg_operands());
+ // All the target-generic intrinsics currently of interest to us have one
+ // type argument, equal to that of the nvvm intrinsic's argument.
+ Type *Tys[] = {II->getArgOperand(0)->getType()};
+ return CallInst::Create(
+ Intrinsic::getDeclaration(II->getModule(), *Action.IID, Tys), Args);
+ }
+
+ // Simplify to target-generic binary op.
+ if (Action.BinaryOp)
+ return BinaryOperator::Create(*Action.BinaryOp, II->getArgOperand(0),
+ II->getArgOperand(1), II->getName());
+
+ // Simplify to target-generic cast op.
+ if (Action.CastOp)
+ return CastInst::Create(*Action.CastOp, II->getArgOperand(0), II->getType(),
+ II->getName());
+
+ // All that's left are the special cases.
+ if (!Action.Special)
+ return nullptr;
+
+ switch (*Action.Special) {
+ case SPC_Reciprocal:
+ // Simplify reciprocal.
+ return BinaryOperator::Create(
+ Instruction::FDiv, ConstantFP::get(II->getArgOperand(0)->getType(), 1),
+ II->getArgOperand(0), II->getName());
+ }
+ llvm_unreachable("All SpecialCase enumerators should be handled in switch.");
+}
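The ftz gating reduces to a small predicate; a minimal model (standalone sketch, not part of the patch) of the check performed above against the nvptx-f32ftz attribute:

enum FtzRequirementTy { FTZ_Any, FTZ_MustBeOn, FTZ_MustBeOff };

// Returns true if a rewrite with requirement Req may fire given the
// function's ftz state.
static bool ftzGateAllows(FtzRequirementTy Req, bool FtzEnabled) {
  if (Req == FTZ_Any)
    return true; // valid regardless of the ftz setting
  return FtzEnabled == (Req == FTZ_MustBeOn);
}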
+
Instruction *InstCombiner::visitVAStartInst(VAStartInst &I) {
removeTriviallyEmptyRange(I, Intrinsic::vastart, Intrinsic::vaend, *this);
return nullptr;
@@ -1462,6 +1891,18 @@ Instruction *InstCombiner::visitCallInst(CallInst &CI) {
if (Changed) return II;
}
+ if (auto *AMI = dyn_cast<ElementAtomicMemCpyInst>(II)) {
+ if (Constant *C = dyn_cast<Constant>(AMI->getNumElements()))
+ if (C->isNullValue())
+ return eraseInstFromFunction(*AMI);
+
+ if (Instruction *I = SimplifyElementAtomicMemCpy(AMI))
+ return I;
+ }
+
+ if (Instruction *I = SimplifyNVVMIntrinsic(II, *this))
+ return I;
+
auto SimplifyDemandedVectorEltsLow = [this](Value *Op, unsigned Width,
unsigned DemandedWidth) {
APInt UndefElts(Width, 0);
@@ -1581,8 +2022,21 @@ Instruction *InstCombiner::visitCallInst(CallInst &CI) {
return replaceInstUsesWith(*II, V);
break;
}
- case Intrinsic::fma:
case Intrinsic::fmuladd: {
+ // Canonicalize fast fmuladd to the separate fmul + fadd.
+ if (II->hasUnsafeAlgebra()) {
+ BuilderTy::FastMathFlagGuard Guard(*Builder);
+ Builder->setFastMathFlags(II->getFastMathFlags());
+ Value *Mul = Builder->CreateFMul(II->getArgOperand(0),
+ II->getArgOperand(1));
+ Value *Add = Builder->CreateFAdd(Mul, II->getArgOperand(2));
+ Add->takeName(II);
+ return replaceInstUsesWith(*II, Add);
+ }
+
+ LLVM_FALLTHROUGH;
+ }
+ case Intrinsic::fma: {
Value *Src0 = II->getArgOperand(0);
Value *Src1 = II->getArgOperand(1);
@@ -1631,6 +2085,26 @@ Instruction *InstCombiner::visitCallInst(CallInst &CI) {
return SelectInst::Create(Cond, Call0, Call1);
}
+ LLVM_FALLTHROUGH;
+ }
+ case Intrinsic::ceil:
+ case Intrinsic::floor:
+ case Intrinsic::round:
+ case Intrinsic::nearbyint:
+ case Intrinsic::rint:
+ case Intrinsic::trunc: {
+ Value *ExtSrc;
+ if (match(II->getArgOperand(0), m_FPExt(m_Value(ExtSrc))) &&
+ II->getArgOperand(0)->hasOneUse()) {
+      // intrin (fpext x) -> fpext (intrin x), since fpext is exact and the
+      // operation gives the same result at the narrower type.
+ Value *F = Intrinsic::getDeclaration(II->getModule(), II->getIntrinsicID(),
+ { ExtSrc->getType() });
+ CallInst *NewFabs = Builder->CreateCall(F, ExtSrc);
+ NewFabs->copyFastMathFlags(II);
+ NewFabs->takeName(II);
+ return new FPExtInst(NewFabs, II->getType());
+ }
+
break;
}
case Intrinsic::cos:
@@ -1863,6 +2337,37 @@ Instruction *InstCombiner::visitCallInst(CallInst &CI) {
return II;
break;
}
+ case Intrinsic::x86_avx512_mask_cmp_pd_128:
+ case Intrinsic::x86_avx512_mask_cmp_pd_256:
+ case Intrinsic::x86_avx512_mask_cmp_pd_512:
+ case Intrinsic::x86_avx512_mask_cmp_ps_128:
+ case Intrinsic::x86_avx512_mask_cmp_ps_256:
+ case Intrinsic::x86_avx512_mask_cmp_ps_512: {
+ // Folding cmp(sub(a,b),0) -> cmp(a,b) and cmp(0,sub(a,b)) -> cmp(b,a)
+ Value *Arg0 = II->getArgOperand(0);
+ Value *Arg1 = II->getArgOperand(1);
+ bool Arg0IsZero = match(Arg0, m_Zero());
+ if (Arg0IsZero)
+ std::swap(Arg0, Arg1);
+ Value *A, *B;
+    // This fold requires only the NINF (no infinities) fast-math flag,
+    // since inf minus inf is nan.
+    // NSZ (no signed zeros) is not needed because zeros of any sign are
+    // equal for both compares.
+ // NNAN is not needed because nans compare the same for both compares.
+ // The compare intrinsic uses the above assumptions and therefore
+ // doesn't require additional flags.
+ if ((match(Arg0, m_OneUse(m_FSub(m_Value(A), m_Value(B)))) &&
+ match(Arg1, m_Zero()) &&
+ cast<Instruction>(Arg0)->getFastMathFlags().noInfs())) {
+ if (Arg0IsZero)
+ std::swap(A, B);
+ II->setArgOperand(0, A);
+ II->setArgOperand(1, B);
+ return II;
+ }
+ break;
+ }
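The NINF requirement exists because the subtraction can manufacture a NaN from infinite inputs even though the direct compare is well defined; a standalone demonstration (not part of the patch):

#include <cassert>
#include <cmath>
#include <limits>

int main() {
  double Inf = std::numeric_limits<double>::infinity();
  assert(!(Inf < Inf));          // direct compare: ordered and false
  assert(std::isnan(Inf - Inf)); // sub form: NaN, compares unordered
  return 0;
}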
case Intrinsic::x86_avx512_mask_add_ps_512:
case Intrinsic::x86_avx512_mask_div_ps_512:
@@ -2130,6 +2635,9 @@ Instruction *InstCombiner::visitCallInst(CallInst &CI) {
case Intrinsic::x86_avx2_pmulu_dq:
case Intrinsic::x86_avx512_pmul_dq_512:
case Intrinsic::x86_avx512_pmulu_dq_512: {
+ if (Value *V = simplifyX86muldq(*II, *Builder))
+ return replaceInstUsesWith(*II, V);
+
unsigned VWidth = II->getType()->getVectorNumElements();
APInt UndefElts(VWidth, 0);
APInt DemandedElts = APInt::getAllOnesValue(VWidth);
@@ -2141,6 +2649,64 @@ Instruction *InstCombiner::visitCallInst(CallInst &CI) {
break;
}
+ case Intrinsic::x86_sse2_packssdw_128:
+ case Intrinsic::x86_sse2_packsswb_128:
+ case Intrinsic::x86_avx2_packssdw:
+ case Intrinsic::x86_avx2_packsswb:
+ case Intrinsic::x86_avx512_packssdw_512:
+ case Intrinsic::x86_avx512_packsswb_512:
+ if (Value *V = simplifyX86pack(*II, *this, *Builder, true))
+ return replaceInstUsesWith(*II, V);
+ break;
+
+ case Intrinsic::x86_sse2_packuswb_128:
+ case Intrinsic::x86_sse41_packusdw:
+ case Intrinsic::x86_avx2_packusdw:
+ case Intrinsic::x86_avx2_packuswb:
+ case Intrinsic::x86_avx512_packusdw_512:
+ case Intrinsic::x86_avx512_packuswb_512:
+ if (Value *V = simplifyX86pack(*II, *this, *Builder, false))
+ return replaceInstUsesWith(*II, V);
+ break;
+
+ case Intrinsic::x86_pclmulqdq: {
+ if (auto *C = dyn_cast<ConstantInt>(II->getArgOperand(2))) {
+ unsigned Imm = C->getZExtValue();
+
+ bool MadeChange = false;
+ Value *Arg0 = II->getArgOperand(0);
+ Value *Arg1 = II->getArgOperand(1);
+ unsigned VWidth = Arg0->getType()->getVectorNumElements();
+ APInt DemandedElts(VWidth, 0);
+
+ APInt UndefElts1(VWidth, 0);
+ DemandedElts = (Imm & 0x01) ? 2 : 1;
+ if (Value *V = SimplifyDemandedVectorElts(Arg0, DemandedElts,
+ UndefElts1)) {
+ II->setArgOperand(0, V);
+ MadeChange = true;
+ }
+
+ APInt UndefElts2(VWidth, 0);
+ DemandedElts = (Imm & 0x10) ? 2 : 1;
+ if (Value *V = SimplifyDemandedVectorElts(Arg1, DemandedElts,
+ UndefElts2)) {
+ II->setArgOperand(1, V);
+ MadeChange = true;
+ }
+
+      // If either demanded input element is undef, fold the result to zero.
+ if (UndefElts1[(Imm & 0x01) ? 1 : 0] ||
+ UndefElts2[(Imm & 0x10) ? 1 : 0])
+ return replaceInstUsesWith(*II,
+ ConstantAggregateZero::get(II->getType()));
+
+ if (MadeChange)
+ return II;
+ }
+ break;
+ }
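The immediate decoding above follows the PCLMULQDQ convention: bit 0 of the immediate selects which i64 lane of the first operand participates, and bit 4 selects for the second. A scalar model (standalone, not part of the patch):

// Which of the two i64 lanes of an operand the immediate demands.
static unsigned demandedLane(unsigned Imm, bool SecondOperand) {
  return (Imm & (SecondOperand ? 0x10 : 0x01)) ? 1 : 0;
}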
+
case Intrinsic::x86_sse41_insertps:
if (Value *V = simplifyX86insertps(*II, *Builder))
return replaceInstUsesWith(*II, V);
@@ -2531,9 +3097,14 @@ Instruction *InstCombiner::visitCallInst(CallInst &CI) {
break;
}
-
case Intrinsic::amdgcn_rcp: {
- if (const ConstantFP *C = dyn_cast<ConstantFP>(II->getArgOperand(0))) {
+ Value *Src = II->getArgOperand(0);
+
+ // TODO: Move to ConstantFolding/InstSimplify?
+ if (isa<UndefValue>(Src))
+ return replaceInstUsesWith(CI, Src);
+
+ if (const ConstantFP *C = dyn_cast<ConstantFP>(Src)) {
const APFloat &ArgVal = C->getValueAPF();
APFloat Val(ArgVal.getSemantics(), 1.0);
APFloat::opStatus Status = Val.divide(ArgVal,
@@ -2546,6 +3117,14 @@ Instruction *InstCombiner::visitCallInst(CallInst &CI) {
break;
}
+ case Intrinsic::amdgcn_rsq: {
+ Value *Src = II->getArgOperand(0);
+
+ // TODO: Move to ConstantFolding/InstSimplify?
+ if (isa<UndefValue>(Src))
+ return replaceInstUsesWith(CI, Src);
+ break;
+ }
case Intrinsic::amdgcn_frexp_mant:
case Intrinsic::amdgcn_frexp_exp: {
Value *Src = II->getArgOperand(0);
@@ -2650,6 +3229,274 @@ Instruction *InstCombiner::visitCallInst(CallInst &CI) {
return replaceInstUsesWith(*II, ConstantInt::get(II->getType(), Result));
}
+ case Intrinsic::amdgcn_cvt_pkrtz: {
+ Value *Src0 = II->getArgOperand(0);
+ Value *Src1 = II->getArgOperand(1);
+ if (const ConstantFP *C0 = dyn_cast<ConstantFP>(Src0)) {
+ if (const ConstantFP *C1 = dyn_cast<ConstantFP>(Src1)) {
+ const fltSemantics &HalfSem
+ = II->getType()->getScalarType()->getFltSemantics();
+ bool LosesInfo;
+ APFloat Val0 = C0->getValueAPF();
+ APFloat Val1 = C1->getValueAPF();
+ Val0.convert(HalfSem, APFloat::rmTowardZero, &LosesInfo);
+ Val1.convert(HalfSem, APFloat::rmTowardZero, &LosesInfo);
+
+ Constant *Folded = ConstantVector::get({
+ ConstantFP::get(II->getContext(), Val0),
+ ConstantFP::get(II->getContext(), Val1) });
+ return replaceInstUsesWith(*II, Folded);
+ }
+ }
+
+ if (isa<UndefValue>(Src0) && isa<UndefValue>(Src1))
+ return replaceInstUsesWith(*II, UndefValue::get(II->getType()));
+
+ break;
+ }
+ case Intrinsic::amdgcn_ubfe:
+ case Intrinsic::amdgcn_sbfe: {
+ // Decompose simple cases into standard shifts.
+ Value *Src = II->getArgOperand(0);
+ if (isa<UndefValue>(Src))
+ return replaceInstUsesWith(*II, Src);
+
+ unsigned Width;
+ Type *Ty = II->getType();
+ unsigned IntSize = Ty->getIntegerBitWidth();
+
+ ConstantInt *CWidth = dyn_cast<ConstantInt>(II->getArgOperand(2));
+ if (CWidth) {
+ Width = CWidth->getZExtValue();
+ if ((Width & (IntSize - 1)) == 0)
+ return replaceInstUsesWith(*II, ConstantInt::getNullValue(Ty));
+
+ if (Width >= IntSize) {
+ // Hardware ignores high bits, so remove those.
+ II->setArgOperand(2, ConstantInt::get(CWidth->getType(),
+ Width & (IntSize - 1)));
+ return II;
+ }
+ }
+
+ unsigned Offset;
+ ConstantInt *COffset = dyn_cast<ConstantInt>(II->getArgOperand(1));
+ if (COffset) {
+ Offset = COffset->getZExtValue();
+ if (Offset >= IntSize) {
+ II->setArgOperand(1, ConstantInt::get(COffset->getType(),
+ Offset & (IntSize - 1)));
+ return II;
+ }
+ }
+
+ bool Signed = II->getIntrinsicID() == Intrinsic::amdgcn_sbfe;
+
+ // TODO: Also emit sub if only width is constant.
+ if (!CWidth && COffset && Offset == 0) {
+ Constant *KSize = ConstantInt::get(COffset->getType(), IntSize);
+ Value *ShiftVal = Builder->CreateSub(KSize, II->getArgOperand(2));
+ ShiftVal = Builder->CreateZExt(ShiftVal, II->getType());
+
+ Value *Shl = Builder->CreateShl(Src, ShiftVal);
+ Value *RightShift = Signed ?
+ Builder->CreateAShr(Shl, ShiftVal) :
+ Builder->CreateLShr(Shl, ShiftVal);
+ RightShift->takeName(II);
+ return replaceInstUsesWith(*II, RightShift);
+ }
+
+ if (!CWidth || !COffset)
+ break;
+
+ // TODO: This allows folding to undef when the hardware has specific
+ // behavior?
+ if (Offset + Width < IntSize) {
+ Value *Shl = Builder->CreateShl(Src, IntSize - Offset - Width);
+ Value *RightShift = Signed ?
+ Builder->CreateAShr(Shl, IntSize - Width) :
+ Builder->CreateLShr(Shl, IntSize - Width);
+ RightShift->takeName(II);
+ return replaceInstUsesWith(*II, RightShift);
+ }
+
+ Value *RightShift = Signed ?
+ Builder->CreateAShr(Src, Offset) :
+ Builder->CreateLShr(Src, Offset);
+
+ RightShift->takeName(II);
+ return replaceInstUsesWith(*II, RightShift);
+ }
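With in-range constant operands the extract reduces to the pair of shifts emitted above; a 32-bit scalar reference (standalone, not part of the patch, assuming the preconditions already handled earlier: Width != 0 and Offset + Width < 32):

#include <cstdint>

uint32_t ubfe32(uint32_t Src, unsigned Offset, unsigned Width) {
  return (Src << (32 - Offset - Width)) >> (32 - Width);
}

int32_t sbfe32(int32_t Src, unsigned Offset, unsigned Width) {
  // Shift left in the unsigned domain to avoid signed-overflow UB, then use
  // an arithmetic right shift to replicate the field's sign bit.
  int32_t Shl = static_cast<int32_t>(static_cast<uint32_t>(Src)
                                     << (32 - Offset - Width));
  return Shl >> (32 - Width);
}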
+ case Intrinsic::amdgcn_exp:
+ case Intrinsic::amdgcn_exp_compr: {
+ ConstantInt *En = dyn_cast<ConstantInt>(II->getArgOperand(1));
+ if (!En) // Illegal.
+ break;
+
+ unsigned EnBits = En->getZExtValue();
+ if (EnBits == 0xf)
+ break; // All inputs enabled.
+
+ bool IsCompr = II->getIntrinsicID() == Intrinsic::amdgcn_exp_compr;
+ bool Changed = false;
+ for (int I = 0; I < (IsCompr ? 2 : 4); ++I) {
+ if ((!IsCompr && (EnBits & (1 << I)) == 0) ||
+ (IsCompr && ((EnBits & (0x3 << (2 * I))) == 0))) {
+ Value *Src = II->getArgOperand(I + 2);
+ if (!isa<UndefValue>(Src)) {
+ II->setArgOperand(I + 2, UndefValue::get(Src->getType()));
+ Changed = true;
+ }
+ }
+ }
+
+ if (Changed)
+ return II;
+
+    break;
+  }
+ case Intrinsic::amdgcn_fmed3: {
+ // Note this does not preserve proper sNaN behavior if IEEE-mode is enabled
+ // for the shader.
+
+ Value *Src0 = II->getArgOperand(0);
+ Value *Src1 = II->getArgOperand(1);
+ Value *Src2 = II->getArgOperand(2);
+
+ bool Swap = false;
+ // Canonicalize constants to RHS operands.
+ //
+ // fmed3(c0, x, c1) -> fmed3(x, c0, c1)
+ if (isa<Constant>(Src0) && !isa<Constant>(Src1)) {
+ std::swap(Src0, Src1);
+ Swap = true;
+ }
+
+ if (isa<Constant>(Src1) && !isa<Constant>(Src2)) {
+ std::swap(Src1, Src2);
+ Swap = true;
+ }
+
+ if (isa<Constant>(Src0) && !isa<Constant>(Src1)) {
+ std::swap(Src0, Src1);
+ Swap = true;
+ }
+
+ if (Swap) {
+ II->setArgOperand(0, Src0);
+ II->setArgOperand(1, Src1);
+ II->setArgOperand(2, Src2);
+ return II;
+ }
+
+ if (match(Src2, m_NaN()) || isa<UndefValue>(Src2)) {
+ CallInst *NewCall = Builder->CreateMinNum(Src0, Src1);
+ NewCall->copyFastMathFlags(II);
+ NewCall->takeName(II);
+ return replaceInstUsesWith(*II, NewCall);
+ }
+
+ if (const ConstantFP *C0 = dyn_cast<ConstantFP>(Src0)) {
+ if (const ConstantFP *C1 = dyn_cast<ConstantFP>(Src1)) {
+ if (const ConstantFP *C2 = dyn_cast<ConstantFP>(Src2)) {
+ APFloat Result = fmed3AMDGCN(C0->getValueAPF(), C1->getValueAPF(),
+ C2->getValueAPF());
+ return replaceInstUsesWith(*II,
+ ConstantFP::get(Builder->getContext(), Result));
+ }
+ }
+ }
+
+ break;
+ }
+ case Intrinsic::amdgcn_icmp:
+ case Intrinsic::amdgcn_fcmp: {
+ const ConstantInt *CC = dyn_cast<ConstantInt>(II->getArgOperand(2));
+ if (!CC)
+ break;
+
+ // Guard against invalid arguments.
+ int64_t CCVal = CC->getZExtValue();
+ bool IsInteger = II->getIntrinsicID() == Intrinsic::amdgcn_icmp;
+ if ((IsInteger && (CCVal < CmpInst::FIRST_ICMP_PREDICATE ||
+ CCVal > CmpInst::LAST_ICMP_PREDICATE)) ||
+ (!IsInteger && (CCVal < CmpInst::FIRST_FCMP_PREDICATE ||
+ CCVal > CmpInst::LAST_FCMP_PREDICATE)))
+ break;
+
+ Value *Src0 = II->getArgOperand(0);
+ Value *Src1 = II->getArgOperand(1);
+
+ if (auto *CSrc0 = dyn_cast<Constant>(Src0)) {
+ if (auto *CSrc1 = dyn_cast<Constant>(Src1)) {
+ Constant *CCmp = ConstantExpr::getCompare(CCVal, CSrc0, CSrc1);
+ return replaceInstUsesWith(*II,
+ ConstantExpr::getSExt(CCmp, II->getType()));
+ }
+
+ // Canonicalize constants to RHS.
+ CmpInst::Predicate SwapPred
+ = CmpInst::getSwappedPredicate(static_cast<CmpInst::Predicate>(CCVal));
+ II->setArgOperand(0, Src1);
+ II->setArgOperand(1, Src0);
+ II->setArgOperand(2, ConstantInt::get(CC->getType(),
+ static_cast<int>(SwapPred)));
+ return II;
+ }
+
+ if (CCVal != CmpInst::ICMP_EQ && CCVal != CmpInst::ICMP_NE)
+ break;
+
+ // Canonicalize compare eq with true value to compare != 0
+ // llvm.amdgcn.icmp(zext (i1 x), 1, eq)
+ // -> llvm.amdgcn.icmp(zext (i1 x), 0, ne)
+ // llvm.amdgcn.icmp(sext (i1 x), -1, eq)
+ // -> llvm.amdgcn.icmp(sext (i1 x), 0, ne)
+ Value *ExtSrc;
+ if (CCVal == CmpInst::ICMP_EQ &&
+ ((match(Src1, m_One()) && match(Src0, m_ZExt(m_Value(ExtSrc)))) ||
+ (match(Src1, m_AllOnes()) && match(Src0, m_SExt(m_Value(ExtSrc))))) &&
+ ExtSrc->getType()->isIntegerTy(1)) {
+ II->setArgOperand(1, ConstantInt::getNullValue(Src1->getType()));
+ II->setArgOperand(2, ConstantInt::get(CC->getType(), CmpInst::ICMP_NE));
+ return II;
+ }
+
+ CmpInst::Predicate SrcPred;
+ Value *SrcLHS;
+ Value *SrcRHS;
+
+ // Fold compare eq/ne with 0 from a compare result as the predicate to the
+ // intrinsic. The typical use is a wave vote function in the library, which
+ // will be fed from a user code condition compared with 0. Fold in the
+ // redundant compare.
+
+ // llvm.amdgcn.icmp([sz]ext ([if]cmp pred a, b), 0, ne)
+ // -> llvm.amdgcn.[if]cmp(a, b, pred)
+ //
+ // llvm.amdgcn.icmp([sz]ext ([if]cmp pred a, b), 0, eq)
+ // -> llvm.amdgcn.[if]cmp(a, b, inv pred)
+ if (match(Src1, m_Zero()) &&
+ match(Src0,
+ m_ZExtOrSExt(m_Cmp(SrcPred, m_Value(SrcLHS), m_Value(SrcRHS))))) {
+ if (CCVal == CmpInst::ICMP_EQ)
+ SrcPred = CmpInst::getInversePredicate(SrcPred);
+
+ Intrinsic::ID NewIID = CmpInst::isFPPredicate(SrcPred) ?
+ Intrinsic::amdgcn_fcmp : Intrinsic::amdgcn_icmp;
+
+ Value *NewF = Intrinsic::getDeclaration(II->getModule(), NewIID,
+ SrcLHS->getType());
+ Value *Args[] = { SrcLHS, SrcRHS,
+ ConstantInt::get(CC->getType(), SrcPred) };
+ CallInst *NewCall = Builder->CreateCall(NewF, Args);
+ NewCall->takeName(II);
+ return replaceInstUsesWith(*II, NewCall);
+ }
+
+ break;
+ }
case Intrinsic::stackrestore: {
// If the save is right next to the restore, remove the restore. This can
// happen when variable allocas are DCE'd.
@@ -2790,7 +3637,7 @@ Instruction *InstCombiner::visitCallInst(CallInst &CI) {
// isKnownNonNull -> nonnull attribute
if (isKnownNonNullAt(DerivedPtr, II, &DT))
- II->addAttribute(AttributeSet::ReturnIndex, Attribute::NonNull);
+ II->addAttribute(AttributeList::ReturnIndex, Attribute::NonNull);
}
// TODO: bitcast(relocate(p)) -> relocate(bitcast(p))
@@ -2799,11 +3646,38 @@ Instruction *InstCombiner::visitCallInst(CallInst &CI) {
// TODO: relocate((gep p, C, C2, ...)) -> gep(relocate(p), C, C2, ...)
break;
}
- }
+ case Intrinsic::experimental_guard: {
+ // Is this guard followed by another guard?
+ Instruction *NextInst = II->getNextNode();
+ Value *NextCond = nullptr;
+ if (match(NextInst,
+ m_Intrinsic<Intrinsic::experimental_guard>(m_Value(NextCond)))) {
+ Value *CurrCond = II->getArgOperand(0);
+
+      // Remove a guard that is immediately preceded by an identical guard.
+ if (CurrCond == NextCond)
+ return eraseInstFromFunction(*NextInst);
+
+ // Otherwise canonicalize guard(a); guard(b) -> guard(a & b).
+ II->setArgOperand(0, Builder->CreateAnd(CurrCond, NextCond));
+ return eraseInstFromFunction(*NextInst);
+ }
+ break;
+ }
+ }
return visitCallSite(II);
}
+// Fence instruction simplification
+Instruction *InstCombiner::visitFenceInst(FenceInst &FI) {
+ // Remove identical consecutive fences.
+ if (auto *NFI = dyn_cast<FenceInst>(FI.getNextNode()))
+ if (FI.isIdenticalTo(NFI))
+ return eraseInstFromFunction(FI);
+ return nullptr;
+}
+
// InvokeInst simplification
//
Instruction *InstCombiner::visitInvokeInst(InvokeInst &II) {
@@ -2950,7 +3824,7 @@ Instruction *InstCombiner::visitCallSite(CallSite CS) {
for (Value *V : CS.args()) {
if (V->getType()->isPointerTy() &&
- !CS.paramHasAttr(ArgNo + 1, Attribute::NonNull) &&
+ !CS.paramHasAttr(ArgNo, Attribute::NonNull) &&
isKnownNonNullAt(V, CS.getInstruction(), &DT))
Indices.push_back(ArgNo + 1);
ArgNo++;
@@ -2959,7 +3833,7 @@ Instruction *InstCombiner::visitCallSite(CallSite CS) {
assert(ArgNo == CS.arg_size() && "sanity check");
if (!Indices.empty()) {
- AttributeSet AS = CS.getAttributes();
+ AttributeList AS = CS.getAttributes();
LLVMContext &Ctx = CS.getInstruction()->getContext();
AS = AS.addAttribute(Ctx, Indices,
Attribute::get(Ctx, Attribute::NonNull));
@@ -3081,7 +3955,7 @@ bool InstCombiner::transformConstExprCastCall(CallSite CS) {
return false;
Instruction *Caller = CS.getInstruction();
- const AttributeSet &CallerPAL = CS.getAttributes();
+ const AttributeList &CallerPAL = CS.getAttributes();
// Okay, this is a cast from a function to a different type. Unless doing so
// would cause a type conversion of one of our arguments, change this call to
@@ -3108,7 +3982,7 @@ bool InstCombiner::transformConstExprCastCall(CallSite CS) {
}
if (!CallerPAL.isEmpty() && !Caller->use_empty()) {
- AttrBuilder RAttrs(CallerPAL, AttributeSet::ReturnIndex);
+ AttrBuilder RAttrs(CallerPAL, AttributeList::ReturnIndex);
if (RAttrs.overlaps(AttributeFuncs::typeIncompatible(NewRetTy)))
return false; // Attribute not compatible with transformed value.
}
@@ -3149,8 +4023,8 @@ bool InstCombiner::transformConstExprCastCall(CallSite CS) {
if (!CastInst::isBitOrNoopPointerCastable(ActTy, ParamTy, DL))
return false; // Cannot transform this parameter value.
- if (AttrBuilder(CallerPAL.getParamAttributes(i + 1), i + 1).
- overlaps(AttributeFuncs::typeIncompatible(ParamTy)))
+ if (AttrBuilder(CallerPAL.getParamAttributes(i))
+ .overlaps(AttributeFuncs::typeIncompatible(ParamTy)))
return false; // Attribute not compatible with transformed value.
if (CS.isInAllocaArgument(i))
@@ -3158,9 +4032,7 @@ bool InstCombiner::transformConstExprCastCall(CallSite CS) {
// If the parameter is passed as a byval argument, then we have to have a
// sized type and the sized type has to have the same size as the old type.
- if (ParamTy != ActTy &&
- CallerPAL.getParamAttributes(i + 1).hasAttribute(i + 1,
- Attribute::ByVal)) {
+ if (ParamTy != ActTy && CallerPAL.hasParamAttribute(i, Attribute::ByVal)) {
PointerType *ParamPTy = dyn_cast<PointerType>(ParamTy);
if (!ParamPTy || !ParamPTy->getElementType()->isSized())
return false;
@@ -3205,7 +4077,7 @@ bool InstCombiner::transformConstExprCastCall(CallSite CS) {
break;
// Check if it has an attribute that's incompatible with varargs.
- AttributeSet PAttrs = CallerPAL.getSlotAttributes(i - 1);
+ AttributeList PAttrs = CallerPAL.getSlotAttributes(i - 1);
if (PAttrs.hasAttribute(Index, Attribute::StructRet))
return false;
}
@@ -3213,44 +4085,37 @@ bool InstCombiner::transformConstExprCastCall(CallSite CS) {
// Okay, we decided that this is a safe thing to do: go ahead and start
// inserting cast instructions as necessary.
- std::vector<Value*> Args;
+ SmallVector<Value *, 8> Args;
+ SmallVector<AttributeSet, 8> ArgAttrs;
Args.reserve(NumActualArgs);
- SmallVector<AttributeSet, 8> attrVec;
- attrVec.reserve(NumCommonArgs);
+ ArgAttrs.reserve(NumActualArgs);
// Get any return attributes.
- AttrBuilder RAttrs(CallerPAL, AttributeSet::ReturnIndex);
+ AttrBuilder RAttrs(CallerPAL, AttributeList::ReturnIndex);
// If the return value is not being used, the type may not be compatible
// with the existing attributes. Wipe out any problematic attributes.
RAttrs.remove(AttributeFuncs::typeIncompatible(NewRetTy));
- // Add the new return attributes.
- if (RAttrs.hasAttributes())
- attrVec.push_back(AttributeSet::get(Caller->getContext(),
- AttributeSet::ReturnIndex, RAttrs));
-
AI = CS.arg_begin();
for (unsigned i = 0; i != NumCommonArgs; ++i, ++AI) {
Type *ParamTy = FT->getParamType(i);
- if ((*AI)->getType() == ParamTy) {
- Args.push_back(*AI);
- } else {
- Args.push_back(Builder->CreateBitOrPointerCast(*AI, ParamTy));
- }
+ Value *NewArg = *AI;
+ if ((*AI)->getType() != ParamTy)
+ NewArg = Builder->CreateBitOrPointerCast(*AI, ParamTy);
+ Args.push_back(NewArg);
// Add any parameter attributes.
- AttrBuilder PAttrs(CallerPAL.getParamAttributes(i + 1), i + 1);
- if (PAttrs.hasAttributes())
- attrVec.push_back(AttributeSet::get(Caller->getContext(), i + 1,
- PAttrs));
+ ArgAttrs.push_back(CallerPAL.getParamAttributes(i));
}
// If the function takes more arguments than the call was taking, add them
// now.
- for (unsigned i = NumCommonArgs; i != FT->getNumParams(); ++i)
+ for (unsigned i = NumCommonArgs; i != FT->getNumParams(); ++i) {
Args.push_back(Constant::getNullValue(FT->getParamType(i)));
+ ArgAttrs.push_back(AttributeSet());
+ }
// If we are removing arguments to the function, emit an obnoxious warning.
if (FT->getNumParams() < NumActualArgs) {
@@ -3259,54 +4124,56 @@ bool InstCombiner::transformConstExprCastCall(CallSite CS) {
// Add all of the arguments in their promoted form to the arg list.
for (unsigned i = FT->getNumParams(); i != NumActualArgs; ++i, ++AI) {
Type *PTy = getPromotedType((*AI)->getType());
+ Value *NewArg = *AI;
if (PTy != (*AI)->getType()) {
// Must promote to pass through va_arg area!
Instruction::CastOps opcode =
CastInst::getCastOpcode(*AI, false, PTy, false);
- Args.push_back(Builder->CreateCast(opcode, *AI, PTy));
- } else {
- Args.push_back(*AI);
+ NewArg = Builder->CreateCast(opcode, *AI, PTy);
}
+ Args.push_back(NewArg);
// Add any parameter attributes.
- AttrBuilder PAttrs(CallerPAL.getParamAttributes(i + 1), i + 1);
- if (PAttrs.hasAttributes())
- attrVec.push_back(AttributeSet::get(FT->getContext(), i + 1,
- PAttrs));
+ ArgAttrs.push_back(CallerPAL.getParamAttributes(i));
}
}
}
AttributeSet FnAttrs = CallerPAL.getFnAttributes();
- if (CallerPAL.hasAttributes(AttributeSet::FunctionIndex))
- attrVec.push_back(AttributeSet::get(Callee->getContext(), FnAttrs));
if (NewRetTy->isVoidTy())
Caller->setName(""); // Void type should not have a name.
- const AttributeSet &NewCallerPAL = AttributeSet::get(Callee->getContext(),
- attrVec);
+ assert((ArgAttrs.size() == FT->getNumParams() || FT->isVarArg()) &&
+ "missing argument attributes");
+ LLVMContext &Ctx = Callee->getContext();
+ AttributeList NewCallerPAL = AttributeList::get(
+ Ctx, FnAttrs, AttributeSet::get(Ctx, RAttrs), ArgAttrs);
SmallVector<OperandBundleDef, 1> OpBundles;
CS.getOperandBundlesAsDefs(OpBundles);
- Instruction *NC;
+ CallSite NewCS;
if (InvokeInst *II = dyn_cast<InvokeInst>(Caller)) {
- NC = Builder->CreateInvoke(Callee, II->getNormalDest(), II->getUnwindDest(),
- Args, OpBundles);
- NC->takeName(II);
- cast<InvokeInst>(NC)->setCallingConv(II->getCallingConv());
- cast<InvokeInst>(NC)->setAttributes(NewCallerPAL);
+ NewCS = Builder->CreateInvoke(Callee, II->getNormalDest(),
+ II->getUnwindDest(), Args, OpBundles);
} else {
- CallInst *CI = cast<CallInst>(Caller);
- NC = Builder->CreateCall(Callee, Args, OpBundles);
- NC->takeName(CI);
- cast<CallInst>(NC)->setTailCallKind(CI->getTailCallKind());
- cast<CallInst>(NC)->setCallingConv(CI->getCallingConv());
- cast<CallInst>(NC)->setAttributes(NewCallerPAL);
+ NewCS = Builder->CreateCall(Callee, Args, OpBundles);
+ cast<CallInst>(NewCS.getInstruction())
+ ->setTailCallKind(cast<CallInst>(Caller)->getTailCallKind());
}
+ NewCS->takeName(Caller);
+ NewCS.setCallingConv(CS.getCallingConv());
+ NewCS.setAttributes(NewCallerPAL);
+
+ // Preserve the weight metadata for the new call instruction. The metadata
+ // is used by SamplePGO to check the callsite's hotness.
+ uint64_t W;
+ if (Caller->extractProfTotalWeight(W))
+ NewCS->setProfWeight(W);
// Insert a cast of the return type as necessary.
+ Instruction *NC = NewCS.getInstruction();
Value *NV = NC;
if (OldRetTy != NV->getType() && !Caller->use_empty()) {
if (!NV->getType()->isVoidTy()) {
@@ -3351,7 +4218,7 @@ InstCombiner::transformCallThroughTrampoline(CallSite CS,
Value *Callee = CS.getCalledValue();
PointerType *PTy = cast<PointerType>(Callee->getType());
FunctionType *FTy = cast<FunctionType>(PTy->getElementType());
- const AttributeSet &Attrs = CS.getAttributes();
+ AttributeList Attrs = CS.getAttributes();
// If the call already has the 'nest' attribute somewhere then give up -
// otherwise 'nest' would occur twice after splicing in the chain.
@@ -3364,50 +4231,46 @@ InstCombiner::transformCallThroughTrampoline(CallSite CS,
Function *NestF =cast<Function>(Tramp->getArgOperand(1)->stripPointerCasts());
FunctionType *NestFTy = cast<FunctionType>(NestF->getValueType());
- const AttributeSet &NestAttrs = NestF->getAttributes();
+ AttributeList NestAttrs = NestF->getAttributes();
if (!NestAttrs.isEmpty()) {
- unsigned NestIdx = 1;
+ unsigned NestArgNo = 0;
Type *NestTy = nullptr;
AttributeSet NestAttr;
// Look for a parameter marked with the 'nest' attribute.
for (FunctionType::param_iterator I = NestFTy->param_begin(),
- E = NestFTy->param_end(); I != E; ++NestIdx, ++I)
- if (NestAttrs.hasAttribute(NestIdx, Attribute::Nest)) {
+ E = NestFTy->param_end();
+ I != E; ++NestArgNo, ++I) {
+ AttributeSet AS = NestAttrs.getParamAttributes(NestArgNo);
+ if (AS.hasAttribute(Attribute::Nest)) {
// Record the parameter type and any other attributes.
NestTy = *I;
- NestAttr = NestAttrs.getParamAttributes(NestIdx);
+ NestAttr = AS;
break;
}
+ }
if (NestTy) {
Instruction *Caller = CS.getInstruction();
std::vector<Value*> NewArgs;
+ std::vector<AttributeSet> NewArgAttrs;
NewArgs.reserve(CS.arg_size() + 1);
-
- SmallVector<AttributeSet, 8> NewAttrs;
- NewAttrs.reserve(Attrs.getNumSlots() + 1);
+ NewArgAttrs.reserve(CS.arg_size());
// Insert the nest argument into the call argument list, which may
// mean appending it. Likewise for attributes.
- // Add any result attributes.
- if (Attrs.hasAttributes(AttributeSet::ReturnIndex))
- NewAttrs.push_back(AttributeSet::get(Caller->getContext(),
- Attrs.getRetAttributes()));
-
{
- unsigned Idx = 1;
+ unsigned ArgNo = 0;
CallSite::arg_iterator I = CS.arg_begin(), E = CS.arg_end();
do {
- if (Idx == NestIdx) {
+ if (ArgNo == NestArgNo) {
// Add the chain argument and attributes.
Value *NestVal = Tramp->getArgOperand(2);
if (NestVal->getType() != NestTy)
NestVal = Builder->CreateBitCast(NestVal, NestTy, "nest");
NewArgs.push_back(NestVal);
- NewAttrs.push_back(AttributeSet::get(Caller->getContext(),
- NestAttr));
+ NewArgAttrs.push_back(NestAttr);
}
if (I == E)
@@ -3415,23 +4278,13 @@ InstCombiner::transformCallThroughTrampoline(CallSite CS,
// Add the original argument and attributes.
NewArgs.push_back(*I);
- AttributeSet Attr = Attrs.getParamAttributes(Idx);
- if (Attr.hasAttributes(Idx)) {
- AttrBuilder B(Attr, Idx);
- NewAttrs.push_back(AttributeSet::get(Caller->getContext(),
- Idx + (Idx >= NestIdx), B));
- }
+ NewArgAttrs.push_back(Attrs.getParamAttributes(ArgNo));
- ++Idx;
+ ++ArgNo;
++I;
} while (true);
}
- // Add any function attributes.
- if (Attrs.hasAttributes(AttributeSet::FunctionIndex))
- NewAttrs.push_back(AttributeSet::get(FTy->getContext(),
- Attrs.getFnAttributes()));
-
// The trampoline may have been bitcast to a bogus type (FTy).
// Handle this by synthesizing a new function type, equal to FTy
// with the chain parameter inserted.
@@ -3442,12 +4295,12 @@ InstCombiner::transformCallThroughTrampoline(CallSite CS,
// Insert the chain's type into the list of parameter types, which may
// mean appending it.
{
- unsigned Idx = 1;
+ unsigned ArgNo = 0;
FunctionType::param_iterator I = FTy->param_begin(),
E = FTy->param_end();
do {
- if (Idx == NestIdx)
+ if (ArgNo == NestArgNo)
// Add the chain's type.
NewTypes.push_back(NestTy);
@@ -3457,7 +4310,7 @@ InstCombiner::transformCallThroughTrampoline(CallSite CS,
// Add the original type.
NewTypes.push_back(*I);
- ++Idx;
+ ++ArgNo;
++I;
} while (true);
}
@@ -3470,8 +4323,9 @@ InstCombiner::transformCallThroughTrampoline(CallSite CS,
NestF->getType() == PointerType::getUnqual(NewFTy) ?
NestF : ConstantExpr::getBitCast(NestF,
PointerType::getUnqual(NewFTy));
- const AttributeSet &NewPAL =
- AttributeSet::get(FTy->getContext(), NewAttrs);
+ AttributeList NewPAL =
+ AttributeList::get(FTy->getContext(), Attrs.getFnAttributes(),
+ Attrs.getRetAttributes(), NewArgAttrs);
SmallVector<OperandBundleDef, 1> OpBundles;
CS.getOperandBundlesAsDefs(OpBundles);
diff --git a/lib/Transforms/InstCombine/InstCombineCasts.cpp b/lib/Transforms/InstCombine/InstCombineCasts.cpp
index e74b590e2b7c..25683132c786 100644
--- a/lib/Transforms/InstCombine/InstCombineCasts.cpp
+++ b/lib/Transforms/InstCombine/InstCombineCasts.cpp
@@ -274,12 +274,12 @@ Instruction *InstCombiner::commonCastTransforms(CastInst &CI) {
return NV;
// If we are casting a PHI, then fold the cast into the PHI.
- if (isa<PHINode>(Src)) {
+ if (auto *PN = dyn_cast<PHINode>(Src)) {
// Don't do this if it would create a PHI node with an illegal type from a
// legal type.
if (!Src->getType()->isIntegerTy() || !CI.getType()->isIntegerTy() ||
- ShouldChangeType(CI.getType(), Src->getType()))
- if (Instruction *NV = FoldOpIntoPhi(CI))
+ shouldChangeType(CI.getType(), Src->getType()))
+ if (Instruction *NV = foldOpIntoPhi(CI, PN))
return NV;
}
@@ -447,7 +447,7 @@ static Instruction *foldVecTruncToExtElt(TruncInst &Trunc, InstCombiner &IC,
Instruction *InstCombiner::shrinkBitwiseLogic(TruncInst &Trunc) {
Type *SrcTy = Trunc.getSrcTy();
Type *DestTy = Trunc.getType();
- if (isa<IntegerType>(SrcTy) && !ShouldChangeType(SrcTy, DestTy))
+ if (isa<IntegerType>(SrcTy) && !shouldChangeType(SrcTy, DestTy))
return nullptr;
BinaryOperator *LogicOp;
@@ -463,6 +463,56 @@ Instruction *InstCombiner::shrinkBitwiseLogic(TruncInst &Trunc) {
return BinaryOperator::Create(LogicOp->getOpcode(), NarrowOp0, NarrowC);
}
+/// Try to narrow the width of a splat shuffle. This could be generalized to any
+/// shuffle with a constant operand, but we limit the transform to avoid
+/// creating a shuffle type that targets may not be able to lower effectively.
+static Instruction *shrinkSplatShuffle(TruncInst &Trunc,
+ InstCombiner::BuilderTy &Builder) {
+ auto *Shuf = dyn_cast<ShuffleVectorInst>(Trunc.getOperand(0));
+ if (Shuf && Shuf->hasOneUse() && isa<UndefValue>(Shuf->getOperand(1)) &&
+ Shuf->getMask()->getSplatValue() &&
+ Shuf->getType() == Shuf->getOperand(0)->getType()) {
+ // trunc (shuf X, Undef, SplatMask) --> shuf (trunc X), Undef, SplatMask
+ Constant *NarrowUndef = UndefValue::get(Trunc.getType());
+ Value *NarrowOp = Builder.CreateTrunc(Shuf->getOperand(0), Trunc.getType());
+ return new ShuffleVectorInst(NarrowOp, NarrowUndef, Shuf->getMask());
+ }
+
+ return nullptr;
+}
+
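+// A sketch of the shuffle narrowing described above (names hypothetical):
+// the splat is a shuffle of <4 x i32> with a zero mask, and the trunc is
+// pushed through the shuffle so the splat is performed on the narrow type:
+//
+//   define <4 x i16> @narrow_splat(<4 x i32> %x) {
+//     ; trunc (shuf X, undef, SplatMask) --> shuf (trunc X), undef, SplatMask
+//     %splat = shufflevector <4 x i32> %x, <4 x i32> undef,
+//                            <4 x i32> zeroinitializer
+//     %t = trunc <4 x i32> %splat to <4 x i16>
+//     ret <4 x i16> %t
+//   }
+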
+/// Try to narrow the width of an insert element. This could be generalized for
+/// any vector constant, but we limit the transform to insertion into undef to
+/// avoid potential backend problems from unsupported insertion widths. This
+/// could also be extended to handle the case of inserting a scalar constant
+/// into a vector variable.
+static Instruction *shrinkInsertElt(CastInst &Trunc,
+ InstCombiner::BuilderTy &Builder) {
+ Instruction::CastOps Opcode = Trunc.getOpcode();
+ assert((Opcode == Instruction::Trunc || Opcode == Instruction::FPTrunc) &&
+ "Unexpected instruction for shrinking");
+
+ auto *InsElt = dyn_cast<InsertElementInst>(Trunc.getOperand(0));
+ if (!InsElt || !InsElt->hasOneUse())
+ return nullptr;
+
+ Type *DestTy = Trunc.getType();
+ Type *DestScalarTy = DestTy->getScalarType();
+ Value *VecOp = InsElt->getOperand(0);
+ Value *ScalarOp = InsElt->getOperand(1);
+ Value *Index = InsElt->getOperand(2);
+
+ if (isa<UndefValue>(VecOp)) {
+ // trunc (inselt undef, X, Index) --> inselt undef, (trunc X), Index
+ // fptrunc (inselt undef, X, Index) --> inselt undef, (fptrunc X), Index
+ UndefValue *NarrowUndef = UndefValue::get(DestTy);
+ Value *NarrowOp = Builder.CreateCast(Opcode, ScalarOp, DestScalarTy);
+ return InsertElementInst::Create(NarrowUndef, NarrowOp, Index);
+ }
+
+ return nullptr;
+}
+
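+// A corresponding sketch for the insertelement case (names hypothetical);
+// because the vector operand is undef, only the inserted scalar needs to be
+// truncated:
+//
+//   define <4 x i16> @narrow_inselt(i32 %x) {
+//     ; trunc (inselt undef, X, 0) --> inselt undef, (trunc X), 0
+//     %ins = insertelement <4 x i32> undef, i32 %x, i32 0
+//     %t = trunc <4 x i32> %ins to <4 x i16>
+//     ret <4 x i16> %t
+//   }
+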
Instruction *InstCombiner::visitTrunc(TruncInst &CI) {
if (Instruction *Result = commonCastTransforms(CI))
return Result;
@@ -488,7 +538,7 @@ Instruction *InstCombiner::visitTrunc(TruncInst &CI) {
// type. Only do this if the dest type is a simple type; don't convert the
// expression tree to something weird like i93 unless the source is also
// strange.
- if ((DestTy->isVectorTy() || ShouldChangeType(SrcTy, DestTy)) &&
+ if ((DestTy->isVectorTy() || shouldChangeType(SrcTy, DestTy)) &&
canEvaluateTruncated(Src, DestTy, *this, &CI)) {
// If this cast is a truncate, evaluating in a different type always
@@ -554,8 +604,14 @@ Instruction *InstCombiner::visitTrunc(TruncInst &CI) {
if (Instruction *I = shrinkBitwiseLogic(CI))
return I;
+ if (Instruction *I = shrinkSplatShuffle(CI, *Builder))
+ return I;
+
+ if (Instruction *I = shrinkInsertElt(CI, *Builder))
+ return I;
+
if (Src->hasOneUse() && isa<IntegerType>(SrcTy) &&
- ShouldChangeType(SrcTy, DestTy)) {
+ shouldChangeType(SrcTy, DestTy)) {
// Transform "trunc (shl X, cst)" -> "shl (trunc X), cst" so long as the
// dest type is native and cst < dest size.
if (match(Src, m_Shl(m_Value(A), m_ConstantInt(Cst))) &&
@@ -838,11 +894,6 @@ Instruction *InstCombiner::visitZExt(ZExtInst &CI) {
if (Instruction *Result = commonCastTransforms(CI))
return Result;
- // See if we can simplify any instructions used by the input whose sole
- // purpose is to compute bits we don't care about.
- if (SimplifyDemandedInstructionBits(CI))
- return &CI;
-
Value *Src = CI.getOperand(0);
Type *SrcTy = Src->getType(), *DestTy = CI.getType();
@@ -851,10 +902,10 @@ Instruction *InstCombiner::visitZExt(ZExtInst &CI) {
// expression tree to something weird like i93 unless the source is also
// strange.
unsigned BitsToClear;
- if ((DestTy->isVectorTy() || ShouldChangeType(SrcTy, DestTy)) &&
+ if ((DestTy->isVectorTy() || shouldChangeType(SrcTy, DestTy)) &&
canEvaluateZExtd(Src, DestTy, BitsToClear, *this, &CI)) {
- assert(BitsToClear < SrcTy->getScalarSizeInBits() &&
- "Unreasonable BitsToClear");
+ assert(BitsToClear <= SrcTy->getScalarSizeInBits() &&
+ "Can't clear more bits than in SrcTy");
// Okay, we can transform this! Insert the new expression now.
DEBUG(dbgs() << "ICE: EvaluateInDifferentType converting expression type"
@@ -1124,11 +1175,6 @@ Instruction *InstCombiner::visitSExt(SExtInst &CI) {
if (Instruction *I = commonCastTransforms(CI))
return I;
- // See if we can simplify any instructions used by the input whose sole
- // purpose is to compute bits we don't care about.
- if (SimplifyDemandedInstructionBits(CI))
- return &CI;
-
Value *Src = CI.getOperand(0);
Type *SrcTy = Src->getType(), *DestTy = CI.getType();
@@ -1145,7 +1191,7 @@ Instruction *InstCombiner::visitSExt(SExtInst &CI) {
// type. Only do this if the dest type is a simple type; don't convert the
// expression tree to something weird like i93 unless the source is also
// strange.
- if ((DestTy->isVectorTy() || ShouldChangeType(SrcTy, DestTy)) &&
+ if ((DestTy->isVectorTy() || shouldChangeType(SrcTy, DestTy)) &&
canEvaluateSExtd(Src, DestTy)) {
// Okay, we can transform this! Insert the new expression now.
DEBUG(dbgs() << "ICE: EvaluateInDifferentType converting expression type"
@@ -1167,18 +1213,16 @@ Instruction *InstCombiner::visitSExt(SExtInst &CI) {
ShAmt);
}
- // If this input is a trunc from our destination, then turn sext(trunc(x))
+ // If the input is a trunc from the destination type, then turn sext(trunc(x))
// into shifts.
- if (TruncInst *TI = dyn_cast<TruncInst>(Src))
- if (TI->hasOneUse() && TI->getOperand(0)->getType() == DestTy) {
- uint32_t SrcBitSize = SrcTy->getScalarSizeInBits();
- uint32_t DestBitSize = DestTy->getScalarSizeInBits();
-
- // We need to emit a shl + ashr to do the sign extend.
- Value *ShAmt = ConstantInt::get(DestTy, DestBitSize-SrcBitSize);
- Value *Res = Builder->CreateShl(TI->getOperand(0), ShAmt, "sext");
- return BinaryOperator::CreateAShr(Res, ShAmt);
- }
+ Value *X;
+ if (match(Src, m_OneUse(m_Trunc(m_Value(X)))) && X->getType() == DestTy) {
+ // sext(trunc(X)) --> ashr(shl(X, C), C)
+ unsigned SrcBitSize = SrcTy->getScalarSizeInBits();
+ unsigned DestBitSize = DestTy->getScalarSizeInBits();
+ Constant *ShAmt = ConstantInt::get(DestTy, DestBitSize - SrcBitSize);
+ return BinaryOperator::CreateAShr(Builder->CreateShl(X, ShAmt), ShAmt);
+ }
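
A sketch of the rewritten fold (function name hypothetical): truncating i32 to i8 and sign-extending back becomes a shift pair on the original value:

define i32 @sext_trunc(i32 %x) {
  ; sext(trunc(X)) --> ashr(shl(X, 24), 24) when X already has the dest type
  %t = trunc i32 %x to i8
  %s = sext i8 %t to i32
  ret i32 %s
}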
if (ICmpInst *ICI = dyn_cast<ICmpInst>(Src))
return transformSExtICmp(ICI, CI);
@@ -1225,17 +1269,15 @@ static Constant *fitsInFPType(ConstantFP *CFP, const fltSemantics &Sem) {
return nullptr;
}
-/// If this is a floating-point extension instruction, look
-/// through it until we get the source value.
+/// Look through floating-point extensions until we get the source value.
static Value *lookThroughFPExtensions(Value *V) {
- if (Instruction *I = dyn_cast<Instruction>(V))
- if (I->getOpcode() == Instruction::FPExt)
- return lookThroughFPExtensions(I->getOperand(0));
+ while (auto *FPExt = dyn_cast<FPExtInst>(V))
+ V = FPExt->getOperand(0);
// If this value is a constant, return the constant in the smallest FP type
// that can accurately represent it. This allows us to turn
// (float)((double)X+2.0) into x+2.0f.
- if (ConstantFP *CFP = dyn_cast<ConstantFP>(V)) {
+ if (auto *CFP = dyn_cast<ConstantFP>(V)) {
if (CFP->getType() == Type::getPPC_FP128Ty(V->getContext()))
return V; // No constant folding of this.
// See if the value can be truncated to half and then reextended.
@@ -1392,24 +1434,49 @@ Instruction *InstCombiner::visitFPTrunc(FPTruncInst &CI) {
IntrinsicInst *II = dyn_cast<IntrinsicInst>(CI.getOperand(0));
if (II) {
switch (II->getIntrinsicID()) {
- default: break;
- case Intrinsic::fabs: {
- // (fptrunc (fabs x)) -> (fabs (fptrunc x))
- Value *InnerTrunc = Builder->CreateFPTrunc(II->getArgOperand(0),
- CI.getType());
- Type *IntrinsicType[] = { CI.getType() };
- Function *Overload = Intrinsic::getDeclaration(
- CI.getModule(), II->getIntrinsicID(), IntrinsicType);
-
- SmallVector<OperandBundleDef, 1> OpBundles;
- II->getOperandBundlesAsDefs(OpBundles);
-
- Value *Args[] = { InnerTrunc };
- return CallInst::Create(Overload, Args, OpBundles, II->getName());
+ default: break;
+ case Intrinsic::fabs:
+ case Intrinsic::ceil:
+ case Intrinsic::floor:
+ case Intrinsic::rint:
+ case Intrinsic::round:
+ case Intrinsic::nearbyint:
+ case Intrinsic::trunc: {
+ Value *Src = II->getArgOperand(0);
+ if (!Src->hasOneUse())
+ break;
+
+ // Except for fabs, this transformation requires the input of the unary FP
+ // operation to be itself an fpext from the type to which we're
+ // truncating.
+ if (II->getIntrinsicID() != Intrinsic::fabs) {
+ FPExtInst *FPExtSrc = dyn_cast<FPExtInst>(Src);
+ if (!FPExtSrc || FPExtSrc->getOperand(0)->getType() != CI.getType())
+ break;
}
+
+ // Do unary FP operation on smaller type.
+ // (fptrunc (fabs x)) -> (fabs (fptrunc x))
+ Value *InnerTrunc = Builder->CreateFPTrunc(Src, CI.getType());
+ Type *IntrinsicType[] = { CI.getType() };
+ Function *Overload = Intrinsic::getDeclaration(
+ CI.getModule(), II->getIntrinsicID(), IntrinsicType);
+
+ SmallVector<OperandBundleDef, 1> OpBundles;
+ II->getOperandBundlesAsDefs(OpBundles);
+
+ Value *Args[] = { InnerTrunc };
+ CallInst *NewCI = CallInst::Create(Overload, Args,
+ OpBundles, II->getName());
+ NewCI->copyFastMathFlags(II);
+ return NewCI;
+ }
}
}
+ if (Instruction *I = shrinkInsertElt(CI, *Builder))
+ return I;
+
return nullptr;
}
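
A sketch of the widened fold for the rounding intrinsics (function name hypothetical; for fabs the inner fpext is not required). The double floor should collapse to a float floor on the original value:

declare double @llvm.floor.f64(double)

define float @narrow_floor(float %x) {
  ; fptrunc (floor (fpext X)) --> floor(X), computed in the narrow type
  %e = fpext float %x to double
  %f = call double @llvm.floor.f64(double %e)
  %t = fptrunc double %f to float
  ret float %t
}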
diff --git a/lib/Transforms/InstCombine/InstCombineCompares.cpp b/lib/Transforms/InstCombine/InstCombineCompares.cpp
index 428f94bb5e93..bbafa9e9f468 100644
--- a/lib/Transforms/InstCombine/InstCombineCompares.cpp
+++ b/lib/Transforms/InstCombine/InstCombineCompares.cpp
@@ -230,7 +230,9 @@ Instruction *InstCombiner::foldCmpLoadFromIndexedGlobal(GetElementPtrInst *GEP,
return nullptr;
uint64_t ArrayElementCount = Init->getType()->getArrayNumElements();
- if (ArrayElementCount > 1024) return nullptr; // Don't blow up on huge arrays.
+ // Don't blow up on huge arrays.
+ if (ArrayElementCount > MaxArraySizeForCombine)
+ return nullptr;
// There are many forms of this optimization we can handle; for now, just do
// the simple index into a single-dimensional array.
@@ -1663,7 +1665,7 @@ Instruction *InstCombiner::foldICmpAndConstConst(ICmpInst &Cmp,
(Cmp.isEquality() || (!C1->isNegative() && !C2->isNegative()))) {
// TODO: Is this a good transform for vectors? Wider types may reduce
// throughput. Should this transform be limited (even for scalars) by using
- // ShouldChangeType()?
+ // shouldChangeType()?
if (!Cmp.getType()->isVectorTy()) {
Type *WideType = W->getType();
unsigned WideScalarBits = WideType->getScalarSizeInBits();
@@ -1792,6 +1794,15 @@ Instruction *InstCombiner::foldICmpOrConstant(ICmpInst &Cmp, BinaryOperator *Or,
ConstantInt::get(V->getType(), 1));
}
+ // X | C == C --> X <=u C
+ // X | C != C --> X >u C
+ // iff C+1 is a power of 2 (C is a bitmask of the low bits)
+ if (Cmp.isEquality() && Cmp.getOperand(1) == Or->getOperand(1) &&
+ (*C + 1).isPowerOf2()) {
+ Pred = (Pred == CmpInst::ICMP_EQ) ? CmpInst::ICMP_ULE : CmpInst::ICMP_UGT;
+ return new ICmpInst(Pred, Or->getOperand(0), Or->getOperand(1));
+ }
+
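
A sketch of the new fold (function name hypothetical): with C = 7, C+1 is a power of two, so the or-with-mask equality becomes an unsigned range check:

define i1 @or_masked_cmp(i8 %x) {
  ; X | 7 == 7 --> X <=u 7   (7 + 1 is a power of 2)
  %or = or i8 %x, 7
  %cmp = icmp eq i8 %or, 7
  ret i1 %cmp
}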
if (!Cmp.isEquality() || *C != 0 || !Or->hasOneUse())
return nullptr;
@@ -1914,61 +1925,89 @@ Instruction *InstCombiner::foldICmpShlConstant(ICmpInst &Cmp,
ICmpInst::Predicate Pred = Cmp.getPredicate();
Value *X = Shl->getOperand(0);
- if (Cmp.isEquality()) {
- // If the shift is NUW, then it is just shifting out zeros, no need for an
- // AND.
- Constant *LShrC = ConstantInt::get(Shl->getType(), C->lshr(*ShiftAmt));
- if (Shl->hasNoUnsignedWrap())
- return new ICmpInst(Pred, X, LShrC);
-
- // If the shift is NSW and we compare to 0, then it is just shifting out
- // sign bits, no need for an AND either.
- if (Shl->hasNoSignedWrap() && *C == 0)
- return new ICmpInst(Pred, X, LShrC);
-
- if (Shl->hasOneUse()) {
- // Otherwise, strength reduce the shift into an and.
- Constant *Mask = ConstantInt::get(Shl->getType(),
- APInt::getLowBitsSet(TypeBits, TypeBits - ShiftAmt->getZExtValue()));
-
- Value *And = Builder->CreateAnd(X, Mask, Shl->getName() + ".mask");
- return new ICmpInst(Pred, And, LShrC);
+ Type *ShType = Shl->getType();
+
+ // NSW guarantees that we are only shifting out sign bits from the high bits,
+ // so we can ASHR the compare constant without needing a mask and eliminate
+ // the shift.
+ if (Shl->hasNoSignedWrap()) {
+ if (Pred == ICmpInst::ICMP_SGT) {
+ // icmp Pred (shl nsw X, ShiftAmt), C --> icmp Pred X, (C >>s ShiftAmt)
+ APInt ShiftedC = C->ashr(*ShiftAmt);
+ return new ICmpInst(Pred, X, ConstantInt::get(ShType, ShiftedC));
+ }
+ if (Pred == ICmpInst::ICMP_EQ || Pred == ICmpInst::ICMP_NE) {
+ // This is the same code as the SGT case, but assert the pre-condition
+ // that is needed for this to work with equality predicates.
+ assert(C->ashr(*ShiftAmt).shl(*ShiftAmt) == *C &&
+ "Compare known true or false was not folded");
+ APInt ShiftedC = C->ashr(*ShiftAmt);
+ return new ICmpInst(Pred, X, ConstantInt::get(ShType, ShiftedC));
+ }
+ if (Pred == ICmpInst::ICMP_SLT) {
+ // SLE is the same as above, but SLE is canonicalized to SLT, so convert:
+ // (X << S) <=s C is equiv to X <=s (C >> S) for all C
+ // (X << S) <s (C + 1) is equiv to X <s (C >> S) + 1 if C <s SMAX
+ // (X << S) <s C is equiv to X <s ((C - 1) >> S) + 1 if C >s SMIN
+ assert(!C->isMinSignedValue() && "Unexpected icmp slt");
+ APInt ShiftedC = (*C - 1).ashr(*ShiftAmt) + 1;
+ return new ICmpInst(Pred, X, ConstantInt::get(ShType, ShiftedC));
+ }
+ // If this is a signed comparison to 0 and the shift is sign preserving,
+ // use the shift LHS operand instead; isSignTest may change 'Pred', so only
+ // do that if we're sure to not continue on in this function.
+ if (isSignTest(Pred, *C))
+ return new ICmpInst(Pred, X, Constant::getNullValue(ShType));
+ }
+
+ // NUW guarantees that we are only shifting out zero bits from the high bits,
+ // so we can LSHR the compare constant without needing a mask and eliminate
+ // the shift.
+ if (Shl->hasNoUnsignedWrap()) {
+ if (Pred == ICmpInst::ICMP_UGT) {
+ // icmp Pred (shl nuw X, ShiftAmt), C --> icmp Pred X, (C >>u ShiftAmt)
+ APInt ShiftedC = C->lshr(*ShiftAmt);
+ return new ICmpInst(Pred, X, ConstantInt::get(ShType, ShiftedC));
+ }
+ if (Pred == ICmpInst::ICMP_EQ || Pred == ICmpInst::ICMP_NE) {
+ // This is the same code as the UGT case, but assert the pre-condition
+ // that is needed for this to work with equality predicates.
+ assert(C->lshr(*ShiftAmt).shl(*ShiftAmt) == *C &&
+ "Compare known true or false was not folded");
+ APInt ShiftedC = C->lshr(*ShiftAmt);
+ return new ICmpInst(Pred, X, ConstantInt::get(ShType, ShiftedC));
+ }
+ if (Pred == ICmpInst::ICMP_ULT) {
+ // ULE is the same as above, but ULE is canonicalized to ULT, so convert:
+ // (X << S) <=u C is equiv to X <=u (C >> S) for all C
+ // (X << S) <u (C + 1) is equiv to X <u (C >> S) + 1 if C <u ~0u
+ // (X << S) <u C is equiv to X <u ((C - 1) >> S) + 1 if C >u 0
+ assert(C->ugt(0) && "ult 0 should have been eliminated");
+ APInt ShiftedC = (*C - 1).lshr(*ShiftAmt) + 1;
+ return new ICmpInst(Pred, X, ConstantInt::get(ShType, ShiftedC));
}
}
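
Two sketches of the new flag-based folds (function names hypothetical); in both, the compare constant is shifted instead of masking the shifted value:

define i1 @shl_nsw_sgt(i8 %x) {
  ; icmp sgt (shl nsw X, 3), 16 --> icmp sgt X, 2   (2 == 16 ashr 3)
  %shl = shl nsw i8 %x, 3
  %cmp = icmp sgt i8 %shl, 16
  ret i1 %cmp
}

define i1 @shl_nuw_ugt(i8 %x) {
  ; icmp ugt (shl nuw X, 3), 16 --> icmp ugt X, 2   (2 == 16 lshr 3)
  %shl = shl nuw i8 %x, 3
  %cmp = icmp ugt i8 %shl, 16
  ret i1 %cmp
}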
- // If this is a signed comparison to 0 and the shift is sign preserving,
- // use the shift LHS operand instead; isSignTest may change 'Pred', so only
- // do that if we're sure to not continue on in this function.
- if (Shl->hasNoSignedWrap() && isSignTest(Pred, *C))
- return new ICmpInst(Pred, X, Constant::getNullValue(X->getType()));
+ if (Cmp.isEquality() && Shl->hasOneUse()) {
+ // Strength-reduce the shift into an 'and'.
+ Constant *Mask = ConstantInt::get(
+ ShType,
+ APInt::getLowBitsSet(TypeBits, TypeBits - ShiftAmt->getZExtValue()));
+ Value *And = Builder->CreateAnd(X, Mask, Shl->getName() + ".mask");
+ Constant *LShrC = ConstantInt::get(ShType, C->lshr(*ShiftAmt));
+ return new ICmpInst(Pred, And, LShrC);
+ }
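
When neither wrap flag is present, a sketch of the strength reduction (name hypothetical): the five low bits of %x survive the i8 shift by 3, so the compare is rewritten against the masked value:

define i1 @shl_eq_mask(i8 %x) {
  ; icmp eq (shl X, 3), 16 --> icmp eq (and X, 31), 2
  %shl = shl i8 %x, 3
  %cmp = icmp eq i8 %shl, 16
  ret i1 %cmp
}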
// Otherwise, if this is a comparison of the sign bit, simplify to and/test.
bool TrueIfSigned = false;
if (Shl->hasOneUse() && isSignBitCheck(Pred, *C, TrueIfSigned)) {
// (X << 31) <s 0 --> (X & 1) != 0
Constant *Mask = ConstantInt::get(
- X->getType(),
+ ShType,
APInt::getOneBitSet(TypeBits, TypeBits - ShiftAmt->getZExtValue() - 1));
Value *And = Builder->CreateAnd(X, Mask, Shl->getName() + ".mask");
return new ICmpInst(TrueIfSigned ? ICmpInst::ICMP_NE : ICmpInst::ICMP_EQ,
- And, Constant::getNullValue(And->getType()));
- }
-
- // When the shift is nuw and pred is >u or <=u, comparison only really happens
- // in the pre-shifted bits. Since InstSimplify canonicalizes <=u into <u, the
- // <=u case can be further converted to match <u (see below).
- if (Shl->hasNoUnsignedWrap() &&
- (Pred == ICmpInst::ICMP_UGT || Pred == ICmpInst::ICMP_ULT)) {
- // Derivation for the ult case:
- // (X << S) <=u C is equiv to X <=u (C >> S) for all C
- // (X << S) <u (C + 1) is equiv to X <u (C >> S) + 1 if C <u ~0u
- // (X << S) <u C is equiv to X <u ((C - 1) >> S) + 1 if C >u 0
- assert((Pred != ICmpInst::ICMP_ULT || C->ugt(0)) &&
- "Encountered `ult 0` that should have been eliminated by "
- "InstSimplify.");
- APInt ShiftedC = Pred == ICmpInst::ICMP_ULT ? (*C - 1).lshr(*ShiftAmt) + 1
- : C->lshr(*ShiftAmt);
- return new ICmpInst(Pred, X, ConstantInt::get(X->getType(), ShiftedC));
+ And, Constant::getNullValue(ShType));
}
// Transform (icmp pred iM (shl iM %v, N), C)
@@ -1981,8 +2020,8 @@ Instruction *InstCombiner::foldICmpShlConstant(ICmpInst &Cmp,
if (Shl->hasOneUse() && Amt != 0 && C->countTrailingZeros() >= Amt &&
DL.isLegalInteger(TypeBits - Amt)) {
Type *TruncTy = IntegerType::get(Cmp.getContext(), TypeBits - Amt);
- if (X->getType()->isVectorTy())
- TruncTy = VectorType::get(TruncTy, X->getType()->getVectorNumElements());
+ if (ShType->isVectorTy())
+ TruncTy = VectorType::get(TruncTy, ShType->getVectorNumElements());
Constant *NewC =
ConstantInt::get(TruncTy, C->ashr(*ShiftAmt).trunc(TypeBits - Amt));
return new ICmpInst(Pred, Builder->CreateTrunc(X, TruncTy), NewC);
@@ -2342,8 +2381,24 @@ Instruction *InstCombiner::foldICmpAddConstant(ICmpInst &Cmp,
// Fold icmp pred (add X, C2), C.
Value *X = Add->getOperand(0);
Type *Ty = Add->getType();
- auto CR =
- ConstantRange::makeExactICmpRegion(Cmp.getPredicate(), *C).subtract(*C2);
+ CmpInst::Predicate Pred = Cmp.getPredicate();
+
+ // If the add does not wrap, we can always adjust the compare by subtracting
+ // the constants. Equality comparisons are handled elsewhere. SGE/SLE are
+ // canonicalized to SGT/SLT.
+ if (Add->hasNoSignedWrap() &&
+ (Pred == ICmpInst::ICMP_SGT || Pred == ICmpInst::ICMP_SLT)) {
+ bool Overflow;
+ APInt NewC = C->ssub_ov(*C2, Overflow);
+ // If there is overflow, the result must be true or false.
+ // TODO: Can we assert there is no overflow because InstSimplify always
+ // handles those cases?
+ if (!Overflow)
+ // icmp Pred (add nsw X, C2), C --> icmp Pred X, (C - C2)
+ return new ICmpInst(Pred, X, ConstantInt::get(Ty, NewC));
+ }
+
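
A sketch of the nsw add fold (name hypothetical); since 20 - 5 does not overflow in i8, the compare constant is simply adjusted:

define i1 @add_nsw_sgt(i8 %x) {
  ; icmp sgt (add nsw X, 5), 20 --> icmp sgt X, 15
  %add = add nsw i8 %x, 5
  %cmp = icmp sgt i8 %add, 20
  ret i1 %cmp
}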
+ auto CR = ConstantRange::makeExactICmpRegion(Pred, *C).subtract(*C2);
const APInt &Upper = CR.getUpper();
const APInt &Lower = CR.getLower();
if (Cmp.isSigned()) {
@@ -2364,16 +2419,14 @@ Instruction *InstCombiner::foldICmpAddConstant(ICmpInst &Cmp,
// X+C <u C2 -> (X & -C2) == C
// iff C & (C2-1) == 0
// C2 is a power of 2
- if (Cmp.getPredicate() == ICmpInst::ICMP_ULT && C->isPowerOf2() &&
- (*C2 & (*C - 1)) == 0)
+ if (Pred == ICmpInst::ICMP_ULT && C->isPowerOf2() && (*C2 & (*C - 1)) == 0)
return new ICmpInst(ICmpInst::ICMP_EQ, Builder->CreateAnd(X, -(*C)),
ConstantExpr::getNeg(cast<Constant>(Y)));
// X+C >u C2 -> (X & ~C2) != C
// iff C & C2 == 0
// C2+1 is a power of 2
- if (Cmp.getPredicate() == ICmpInst::ICMP_UGT && (*C + 1).isPowerOf2() &&
- (*C2 & *C) == 0)
+ if (Pred == ICmpInst::ICMP_UGT && (*C + 1).isPowerOf2() && (*C2 & *C) == 0)
return new ICmpInst(ICmpInst::ICMP_NE, Builder->CreateAnd(X, ~(*C)),
ConstantExpr::getNeg(cast<Constant>(Y)));
@@ -2656,7 +2709,7 @@ Instruction *InstCombiner::foldICmpInstWithConstantNotInt(ICmpInst &I) {
// block. If in the same block, we're encouraging jump threading. If
// not, we are just pessimizing the code by making an i1 phi.
if (LHSI->getParent() == I.getParent())
- if (Instruction *NV = FoldOpIntoPhi(I))
+ if (Instruction *NV = foldOpIntoPhi(I, cast<PHINode>(LHSI)))
return NV;
break;
case Instruction::Select: {
@@ -2767,12 +2820,6 @@ Instruction *InstCombiner::foldICmpBinOp(ICmpInst &I) {
D = BO1->getOperand(1);
}
- // icmp (X+cst) < 0 --> X < -cst
- if (NoOp0WrapProblem && ICmpInst::isSigned(Pred) && match(Op1, m_Zero()))
- if (ConstantInt *RHSC = dyn_cast_or_null<ConstantInt>(B))
- if (!RHSC->isMinValue(/*isSigned=*/true))
- return new ICmpInst(Pred, A, ConstantExpr::getNeg(RHSC));
-
// icmp (X+Y), X -> icmp Y, 0 for equalities or if there is no overflow.
if ((A == Op1 || B == Op1) && NoOp0WrapProblem)
return new ICmpInst(Pred, A == Op1 ? B : A,
@@ -2847,6 +2894,31 @@ Instruction *InstCombiner::foldICmpBinOp(ICmpInst &I) {
if (C && NoOp1WrapProblem && Pred == CmpInst::ICMP_SLT && match(D, m_One()))
return new ICmpInst(CmpInst::ICMP_SLE, Op0, C);
+ // TODO: The subtraction-related identities shown below also hold, but
+ // canonicalization from (X -nuw 1) to (X + -1) means that the combinations
+ // wouldn't happen even if they were implemented.
+ //
+ // icmp ult (X - 1), Y -> icmp ule X, Y
+ // icmp uge (X - 1), Y -> icmp ugt X, Y
+ // icmp ugt X, (Y - 1) -> icmp uge X, Y
+ // icmp ule X, (Y - 1) -> icmp ult X, Y
+
+ // icmp ule (X + 1), Y -> icmp ult X, Y
+ if (A && NoOp0WrapProblem && Pred == CmpInst::ICMP_ULE && match(B, m_One()))
+ return new ICmpInst(CmpInst::ICMP_ULT, A, Op1);
+
+ // icmp ugt (X + 1), Y -> icmp uge X, Y
+ if (A && NoOp0WrapProblem && Pred == CmpInst::ICMP_UGT && match(B, m_One()))
+ return new ICmpInst(CmpInst::ICMP_UGE, A, Op1);
+
+ // icmp uge X, (Y + 1) -> icmp ugt X, Y
+ if (C && NoOp1WrapProblem && Pred == CmpInst::ICMP_UGE && match(D, m_One()))
+ return new ICmpInst(CmpInst::ICMP_UGT, Op0, C);
+
+ // icmp ult X, (Y + 1) -> icmp ule X, Y
+ if (C && NoOp1WrapProblem && Pred == CmpInst::ICMP_ULT && match(D, m_One()))
+ return new ICmpInst(CmpInst::ICMP_ULE, Op0, C);
+
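
A sketch of one of the four add-one folds (name hypothetical), assuming the increment carries nuw so there is no wrap problem on operand 0:

define i1 @ule_add_one(i8 %x, i8 %y) {
  ; icmp ule (X +nuw 1), Y --> icmp ult X, Y
  %inc = add nuw i8 %x, 1
  %cmp = icmp ule i8 %inc, %y
  ret i1 %cmp
}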
// if C1 has greater magnitude than C2:
// icmp (X + C1), (Y + C2) -> icmp (X + C3), Y
// s.t. C3 = C1 - C2
@@ -3738,16 +3810,14 @@ static APInt getDemandedBitsLHSMask(ICmpInst &I, unsigned BitWidth,
// greater than the RHS must differ in a bit higher than these due to carry.
case ICmpInst::ICMP_UGT: {
unsigned trailingOnes = RHS.countTrailingOnes();
- APInt lowBitsSet = APInt::getLowBitsSet(BitWidth, trailingOnes);
- return ~lowBitsSet;
+ return APInt::getBitsSetFrom(BitWidth, trailingOnes);
}
// Similarly, for a ULT comparison, we don't care about the trailing zeros.
// Any value less than the RHS must differ in a higher bit because of carries.
case ICmpInst::ICMP_ULT: {
unsigned trailingZeros = RHS.countTrailingZeros();
- APInt lowBitsSet = APInt::getLowBitsSet(BitWidth, trailingZeros);
- return ~lowBitsSet;
+ return APInt::getBitsSetFrom(BitWidth, trailingZeros);
}
default:
@@ -3887,7 +3957,7 @@ bool InstCombiner::replacedSelectWithOperand(SelectInst *SI,
assert((SIOpd == 1 || SIOpd == 2) && "Invalid select operand!");
if (isChainSelectCmpBranch(SI) && Icmp->getPredicate() == ICmpInst::ICMP_EQ) {
BasicBlock *Succ = SI->getParent()->getTerminator()->getSuccessor(1);
- // The check for the unique predecessor is not the best that can be
+ // The check for the single predecessor is not the best that can be
// done. But it protects efficiently against cases like when SI's
// home block has two successors, Succ and Succ1, and Succ1 is a predecessor
// of Succ. Then SI can't be replaced by SIOpd because the use that gets
@@ -3895,8 +3965,10 @@ bool InstCombiner::replacedSelectWithOperand(SelectInst *SI,
// guarantees that the path all uses of SI (outside SI's parent) are on
// is disjoint from all other paths out of SI. But that information
// is more expensive to compute, and the trade-off here is in favor
- // of compile-time.
- if (Succ->getUniquePredecessor() && dominatesAllUses(SI, Icmp, Succ)) {
+ // of compile-time. Note also that we check for a single predecessor and
+ // not just uniqueness; this handles the situation when Succ and Succ1
+ // point to the same basic block.
+ if (Succ->getSinglePredecessor() && dominatesAllUses(SI, Icmp, Succ)) {
NumSel++;
SI->replaceUsesOutsideBlock(SI->getOperand(SIOpd), SI->getParent());
return true;
@@ -3932,12 +4004,12 @@ Instruction *InstCombiner::foldICmpUsingKnownBits(ICmpInst &I) {
APInt Op0KnownZero(BitWidth, 0), Op0KnownOne(BitWidth, 0);
APInt Op1KnownZero(BitWidth, 0), Op1KnownOne(BitWidth, 0);
- if (SimplifyDemandedBits(I.getOperandUse(0),
+ if (SimplifyDemandedBits(&I, 0,
getDemandedBitsLHSMask(I, BitWidth, IsSignBit),
Op0KnownZero, Op0KnownOne, 0))
return &I;
- if (SimplifyDemandedBits(I.getOperandUse(1), APInt::getAllOnesValue(BitWidth),
+ if (SimplifyDemandedBits(&I, 1, APInt::getAllOnesValue(BitWidth),
Op1KnownZero, Op1KnownOne, 0))
return &I;
@@ -4801,7 +4873,7 @@ Instruction *InstCombiner::visitFCmpInst(FCmpInst &I) {
// block. If in the same block, we're encouraging jump threading. If
// not, we are just pessimizing the code by making an i1 phi.
if (LHSI->getParent() == I.getParent())
- if (Instruction *NV = FoldOpIntoPhi(I))
+ if (Instruction *NV = foldOpIntoPhi(I, cast<PHINode>(LHSI)))
return NV;
break;
case Instruction::SIToFP:
diff --git a/lib/Transforms/InstCombine/InstCombineInternal.h b/lib/Transforms/InstCombine/InstCombineInternal.h
index 2847ce858e79..71000063ab3c 100644
--- a/lib/Transforms/InstCombine/InstCombineInternal.h
+++ b/lib/Transforms/InstCombine/InstCombineInternal.h
@@ -28,6 +28,9 @@
#include "llvm/IR/PatternMatch.h"
#include "llvm/Pass.h"
#include "llvm/Transforms/InstCombine/InstCombineWorklist.h"
+#include "llvm/Transforms/Utils/Local.h"
+#include "llvm/Support/Dwarf.h"
+#include "llvm/IR/DIBuilder.h"
#define DEBUG_TYPE "instcombine"
@@ -40,21 +43,29 @@ class DbgDeclareInst;
class MemIntrinsic;
class MemSetInst;
-/// \brief Assign a complexity or rank value to LLVM Values.
+/// Assign a complexity or rank value to LLVM Values. This is used to reduce
+/// the amount of pattern matching needed for compares and commutative
+/// instructions. For example, if we have:
+/// icmp ugt X, Constant
+/// or
+/// xor (add X, Constant), cast Z
+///
+/// We do not have to consider the commuted variants of these patterns because
+/// canonicalization based on complexity guarantees the above ordering.
///
/// This routine maps IR values to various complexity ranks:
/// 0 -> undef
/// 1 -> Constants
/// 2 -> Other non-instructions
/// 3 -> Arguments
-/// 3 -> Unary operations
-/// 4 -> Other instructions
+/// 4 -> Cast and (f)neg/not instructions
+/// 5 -> Other instructions
static inline unsigned getComplexity(Value *V) {
if (isa<Instruction>(V)) {
- if (BinaryOperator::isNeg(V) || BinaryOperator::isFNeg(V) ||
- BinaryOperator::isNot(V))
- return 3;
- return 4;
+ if (isa<CastInst>(V) || BinaryOperator::isNeg(V) ||
+ BinaryOperator::isFNeg(V) || BinaryOperator::isNot(V))
+ return 4;
+ return 5;
}
if (isa<Argument>(V))
return 3;
@@ -289,6 +300,7 @@ public:
Instruction *visitLoadInst(LoadInst &LI);
Instruction *visitStoreInst(StoreInst &SI);
Instruction *visitBranchInst(BranchInst &BI);
+ Instruction *visitFenceInst(FenceInst &FI);
Instruction *visitSwitchInst(SwitchInst &SI);
Instruction *visitReturnInst(ReturnInst &RI);
Instruction *visitInsertValueInst(InsertValueInst &IV);
@@ -313,9 +325,14 @@ public:
bool replacedSelectWithOperand(SelectInst *SI, const ICmpInst *Icmp,
const unsigned SIOpd);
+ /// Try to replace instruction \p I with value \p V, where \p I and \p V
+ /// are pointers of the same element type but in different address spaces.
+ /// \return true if successful.
+ bool replacePointer(Instruction &I, Value *V);
+
private:
- bool ShouldChangeType(unsigned FromBitWidth, unsigned ToBitWidth) const;
- bool ShouldChangeType(Type *From, Type *To) const;
+ bool shouldChangeType(unsigned FromBitWidth, unsigned ToBitWidth) const;
+ bool shouldChangeType(Type *From, Type *To) const;
Value *dyn_castNegVal(Value *V) const;
Value *dyn_castFNegVal(Value *V, bool NoSignedZero = false) const;
Type *FindElementAtOffset(PointerType *PtrTy, int64_t Offset,
@@ -456,8 +473,9 @@ public:
/// methods should return the value returned by this function.
Instruction *eraseInstFromFunction(Instruction &I) {
DEBUG(dbgs() << "IC: ERASE " << I << '\n');
-
assert(I.use_empty() && "Cannot erase instruction that is used!");
+ salvageDebugInfo(I);
+
// Make sure that we reprocess all operands now that we reduced their
// use counts.
if (I.getNumOperands() < 8) {
@@ -499,6 +517,9 @@ public:
return llvm::computeOverflowForUnsignedAdd(LHS, RHS, DL, &AC, CxtI, &DT);
}
+ /// Maximum size of an array considered when transforming.
+ uint64_t MaxArraySizeForCombine;
+
private:
/// \brief Performs a few simplifications for operators which are associative
/// or commutative.
@@ -518,8 +539,16 @@ private:
Value *SimplifyDemandedUseBits(Value *V, APInt DemandedMask, APInt &KnownZero,
APInt &KnownOne, unsigned Depth,
Instruction *CxtI);
- bool SimplifyDemandedBits(Use &U, const APInt &DemandedMask, APInt &KnownZero,
+ bool SimplifyDemandedBits(Instruction *I, unsigned Op,
+ const APInt &DemandedMask, APInt &KnownZero,
APInt &KnownOne, unsigned Depth = 0);
+ /// Helper routine of SimplifyDemandedUseBits. It computes KnownZero/KnownOne
+ /// bits. It also tries to handle simplifications that can be done based on
+ /// DemandedMask, but without modifying the Instruction.
+ Value *SimplifyMultipleUseDemandedBits(Instruction *I,
+ const APInt &DemandedMask,
+ APInt &KnownZero, APInt &KnownOne,
+ unsigned Depth, Instruction *CxtI);
/// Helper routine of SimplifyDemandedUseBits. It tries to simplify demanded
/// bit for "r1 = shr x, c1; r2 = shl r1, c2" instruction sequence.
Value *SimplifyShrShlDemandedBits(Instruction *Lsr, Instruction *Sftl,
@@ -540,7 +569,7 @@ private:
/// Given a binary operator, cast instruction, or select which has a PHI node
/// as operand #0, see if we can fold the instruction into the PHI (which is
/// only possible if all operands to the PHI are constants).
- Instruction *FoldOpIntoPhi(Instruction &I);
+ Instruction *foldOpIntoPhi(Instruction &I, PHINode *PN);
/// Given an instruction with a select as one operand and a constant as the
/// other operand, try to fold the binary operator into the select arguments.
@@ -549,7 +578,7 @@ private:
Instruction *FoldOpIntoSelect(Instruction &Op, SelectInst *SI);
/// This is a convenience wrapper function for the above two functions.
- Instruction *foldOpWithConstantIntoOperand(Instruction &I);
+ Instruction *foldOpWithConstantIntoOperand(BinaryOperator &I);
/// \brief Try to rotate an operation below a PHI node, using PHI nodes for
/// its operands.
@@ -628,16 +657,16 @@ private:
SelectPatternFlavor SPF2, Value *C);
Instruction *foldSelectInstWithICmp(SelectInst &SI, ICmpInst *ICI);
- Instruction *OptAndOp(Instruction *Op, ConstantInt *OpRHS,
+ Instruction *OptAndOp(BinaryOperator *Op, ConstantInt *OpRHS,
ConstantInt *AndRHS, BinaryOperator &TheAnd);
- Value *FoldLogicalPlusAnd(Value *LHS, Value *RHS, ConstantInt *Mask,
- bool isSub, Instruction &I);
Value *insertRangeTest(Value *V, const APInt &Lo, const APInt &Hi,
bool isSigned, bool Inside);
Instruction *PromoteCastOfAllocation(BitCastInst &CI, AllocaInst &AI);
Instruction *MatchBSwap(BinaryOperator &I);
bool SimplifyStoreAtEndOfBlock(StoreInst &SI);
+
+ Instruction *SimplifyElementAtomicMemCpy(ElementAtomicMemCpyInst *AMI);
Instruction *SimplifyMemTransfer(MemIntrinsic *MI);
Instruction *SimplifyMemSet(MemSetInst *MI);
diff --git a/lib/Transforms/InstCombine/InstCombineLoadStoreAlloca.cpp b/lib/Transforms/InstCombine/InstCombineLoadStoreAlloca.cpp
index 49e516e9c176..6288e054f1bc 100644
--- a/lib/Transforms/InstCombine/InstCombineLoadStoreAlloca.cpp
+++ b/lib/Transforms/InstCombine/InstCombineLoadStoreAlloca.cpp
@@ -12,13 +12,15 @@
//===----------------------------------------------------------------------===//
#include "InstCombineInternal.h"
+#include "llvm/ADT/MapVector.h"
#include "llvm/ADT/SmallString.h"
#include "llvm/ADT/Statistic.h"
#include "llvm/Analysis/Loads.h"
#include "llvm/IR/ConstantRange.h"
#include "llvm/IR/DataLayout.h"
-#include "llvm/IR/LLVMContext.h"
+#include "llvm/IR/DebugInfo.h"
#include "llvm/IR/IntrinsicInst.h"
+#include "llvm/IR/LLVMContext.h"
#include "llvm/IR/MDBuilder.h"
#include "llvm/Transforms/Utils/BasicBlockUtils.h"
#include "llvm/Transforms/Utils/Local.h"
@@ -223,6 +225,107 @@ static Instruction *simplifyAllocaArraySize(InstCombiner &IC, AllocaInst &AI) {
return nullptr;
}
+namespace {
+// If I and V are pointers to different address spaces, it is not valid to
+// use replaceAllUsesWith, since I and V have different types. A
+// non-target-specific transformation should not use addrspacecast on V,
+// since the two address spaces may be disjoint depending on the target.
+//
+// This class chases down uses of the old pointer until it reaches load
+// instructions, then replaces the old pointer in those loads with the new
+// pointer. If it encounters a bitcast or GEP along the way, it creates a
+// new bitcast or GEP with the new pointer and uses that in the load
+// instruction.
+class PointerReplacer {
+public:
+ PointerReplacer(InstCombiner &IC) : IC(IC) {}
+ void replacePointer(Instruction &I, Value *V);
+
+private:
+ void findLoadAndReplace(Instruction &I);
+ void replace(Instruction *I);
+ Value *getReplacement(Value *I);
+
+ SmallVector<Instruction *, 4> Path;
+ MapVector<Value *, Value *> WorkMap;
+ InstCombiner &IC;
+};
+} // end anonymous namespace
+
+void PointerReplacer::findLoadAndReplace(Instruction &I) {
+ for (auto U : I.users()) {
+ auto *Inst = dyn_cast<Instruction>(&*U);
+ if (!Inst)
+ return;
+ DEBUG(dbgs() << "Found pointer user: " << *U << '\n');
+ if (isa<LoadInst>(Inst)) {
+ for (auto P : Path)
+ replace(P);
+ replace(Inst);
+ } else if (isa<GetElementPtrInst>(Inst) || isa<BitCastInst>(Inst)) {
+ Path.push_back(Inst);
+ findLoadAndReplace(*Inst);
+ Path.pop_back();
+ } else {
+ return;
+ }
+ }
+}
+
+Value *PointerReplacer::getReplacement(Value *V) {
+ auto Loc = WorkMap.find(V);
+ if (Loc != WorkMap.end())
+ return Loc->second;
+ return nullptr;
+}
+
+void PointerReplacer::replace(Instruction *I) {
+ if (getReplacement(I))
+ return;
+
+ if (auto *LT = dyn_cast<LoadInst>(I)) {
+ auto *V = getReplacement(LT->getPointerOperand());
+ assert(V && "Operand not replaced");
+ auto *NewI = new LoadInst(V);
+ NewI->takeName(LT);
+ IC.InsertNewInstWith(NewI, *LT);
+ IC.replaceInstUsesWith(*LT, NewI);
+ WorkMap[LT] = NewI;
+ } else if (auto *GEP = dyn_cast<GetElementPtrInst>(I)) {
+ auto *V = getReplacement(GEP->getPointerOperand());
+ assert(V && "Operand not replaced");
+ SmallVector<Value *, 8> Indices;
+ Indices.append(GEP->idx_begin(), GEP->idx_end());
+ auto *NewI = GetElementPtrInst::Create(
+ V->getType()->getPointerElementType(), V, Indices);
+ IC.InsertNewInstWith(NewI, *GEP);
+ NewI->takeName(GEP);
+ WorkMap[GEP] = NewI;
+ } else if (auto *BC = dyn_cast<BitCastInst>(I)) {
+ auto *V = getReplacement(BC->getOperand(0));
+ assert(V && "Operand not replaced");
+ auto *NewT = PointerType::get(BC->getType()->getPointerElementType(),
+ V->getType()->getPointerAddressSpace());
+ auto *NewI = new BitCastInst(V, NewT);
+ IC.InsertNewInstWith(NewI, *BC);
+ NewI->takeName(BC);
+ WorkMap[BC] = NewI;
+ } else {
+ llvm_unreachable("should never reach here");
+ }
+}
+
+void PointerReplacer::replacePointer(Instruction &I, Value *V) {
+#ifndef NDEBUG
+ auto *PT = cast<PointerType>(I.getType());
+ auto *NT = cast<PointerType>(V->getType());
+ assert(PT != NT && PT->getElementType() == NT->getElementType() &&
+ "Invalid usage");
+#endif
+ WorkMap[&I] = V;
+ findLoadAndReplace(I);
+}
+
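+// A hedged end-to-end sketch of the case this enables (the global, the
+// function, and the five-operand memcpy signature of this era are
+// assumptions): the alloca is initialized from a constant global in a
+// different address space, so instead of RAUW'ing the alloca, the load is
+// rewritten to read through a GEP on @g directly:
+//
+//   @g = addrspace(2) constant [8 x i32] zeroinitializer
+//
+//   declare void @llvm.memcpy.p0i8.p2i8.i64(i8*, i8 addrspace(2)*,
+//                                           i64, i32, i1)
+//
+//   define i32 @read_copy(i64 %i) {
+//     %a = alloca [8 x i32]
+//     %ac = bitcast [8 x i32]* %a to i8*
+//     %gc = bitcast [8 x i32] addrspace(2)* @g to i8 addrspace(2)*
+//     call void @llvm.memcpy.p0i8.p2i8.i64(i8* %ac, i8 addrspace(2)* %gc,
+//                                          i64 32, i32 4, i1 false)
+//     %p = getelementptr [8 x i32], [8 x i32]* %a, i64 0, i64 %i
+//     %v = load i32, i32* %p
+//     ret i32 %v
+//   }
+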
Instruction *InstCombiner::visitAllocaInst(AllocaInst &AI) {
if (auto *I = simplifyAllocaArraySize(*this, AI))
return I;
@@ -293,12 +396,22 @@ Instruction *InstCombiner::visitAllocaInst(AllocaInst &AI) {
for (unsigned i = 0, e = ToDelete.size(); i != e; ++i)
eraseInstFromFunction(*ToDelete[i]);
Constant *TheSrc = cast<Constant>(Copy->getSource());
- Constant *Cast
- = ConstantExpr::getPointerBitCastOrAddrSpaceCast(TheSrc, AI.getType());
- Instruction *NewI = replaceInstUsesWith(AI, Cast);
- eraseInstFromFunction(*Copy);
- ++NumGlobalCopies;
- return NewI;
+ auto *SrcTy = TheSrc->getType();
+ auto *DestTy = PointerType::get(AI.getType()->getPointerElementType(),
+ SrcTy->getPointerAddressSpace());
+ Constant *Cast =
+ ConstantExpr::getPointerBitCastOrAddrSpaceCast(TheSrc, DestTy);
+ if (AI.getType()->getPointerAddressSpace() ==
+ SrcTy->getPointerAddressSpace()) {
+ Instruction *NewI = replaceInstUsesWith(AI, Cast);
+ eraseInstFromFunction(*Copy);
+ ++NumGlobalCopies;
+ return NewI;
+ } else {
+ PointerReplacer PtrReplacer(*this);
+ PtrReplacer.replacePointer(AI, Cast);
+ ++NumGlobalCopies;
+ }
}
}
}
@@ -608,7 +721,7 @@ static Instruction *unpackLoadToAggregate(InstCombiner &IC, LoadInst &LI) {
// arrays of arbitrary size but this has a terrible impact on compile time.
// The threshold here is chosen arbitrarily and may need a little bit of
// tuning.
- if (NumElements > 1024)
+ if (NumElements > IC.MaxArraySizeForCombine)
return nullptr;
const DataLayout &DL = IC.getDataLayout();
@@ -1113,7 +1226,7 @@ static bool unpackStoreToAggregate(InstCombiner &IC, StoreInst &SI) {
// arrays of arbitrary size but this has a terrible impact on compile time.
// The threshold here is chosen arbitrarily and may need a little bit of
// tuning.
- if (NumElements > 1024)
+ if (NumElements > IC.MaxArraySizeForCombine)
return false;
const DataLayout &DL = IC.getDataLayout();
@@ -1268,8 +1381,8 @@ Instruction *InstCombiner::visitStoreInst(StoreInst &SI) {
break;
}
- // Don't skip over loads or things that can modify memory.
- if (BBI->mayWriteToMemory() || BBI->mayReadFromMemory())
+ // Don't skip over loads, throws or things that can modify memory.
+ if (BBI->mayWriteToMemory() || BBI->mayReadFromMemory() || BBI->mayThrow())
break;
}
@@ -1392,8 +1505,8 @@ bool InstCombiner::SimplifyStoreAtEndOfBlock(StoreInst &SI) {
}
// If we find something that may be using or overwriting the stored
// value, or if we run out of instructions, we can't do the xform.
- if (BBI->mayReadFromMemory() || BBI->mayWriteToMemory() ||
- BBI == OtherBB->begin())
+ if (BBI->mayReadFromMemory() || BBI->mayThrow() ||
+ BBI->mayWriteToMemory() || BBI == OtherBB->begin())
return false;
}
@@ -1402,7 +1515,7 @@ bool InstCombiner::SimplifyStoreAtEndOfBlock(StoreInst &SI) {
// StoreBB.
for (BasicBlock::iterator I = StoreBB->begin(); &*I != &SI; ++I) {
// FIXME: This should really be AA driven.
- if (I->mayReadFromMemory() || I->mayWriteToMemory())
+ if (I->mayReadFromMemory() || I->mayThrow() || I->mayWriteToMemory())
return false;
}
}
@@ -1425,7 +1538,9 @@ bool InstCombiner::SimplifyStoreAtEndOfBlock(StoreInst &SI) {
SI.getOrdering(),
SI.getSynchScope());
InsertNewInstBefore(NewSI, *BBI);
- NewSI->setDebugLoc(OtherStore->getDebugLoc());
+ // The debug locations of the original instructions might differ; merge them.
+ NewSI->setDebugLoc(DILocation::getMergedLocation(SI.getDebugLoc(),
+ OtherStore->getDebugLoc()));
// If the two stores had AA tags, merge them.
AAMDNodes AATags;
diff --git a/lib/Transforms/InstCombine/InstCombineMulDivRem.cpp b/lib/Transforms/InstCombine/InstCombineMulDivRem.cpp
index 45a19fb0f1f2..f1ac82057e6c 100644
--- a/lib/Transforms/InstCombine/InstCombineMulDivRem.cpp
+++ b/lib/Transforms/InstCombine/InstCombineMulDivRem.cpp
@@ -298,39 +298,33 @@ Instruction *InstCombiner::visitMul(BinaryOperator &I) {
// (X / Y) * Y = X - (X % Y)
// (X / Y) * -Y = (X % Y) - X
{
- Value *Op1C = Op1;
- BinaryOperator *BO = dyn_cast<BinaryOperator>(Op0);
- if (!BO ||
- (BO->getOpcode() != Instruction::UDiv &&
- BO->getOpcode() != Instruction::SDiv)) {
- Op1C = Op0;
- BO = dyn_cast<BinaryOperator>(Op1);
+ Value *Y = Op1;
+ BinaryOperator *Div = dyn_cast<BinaryOperator>(Op0);
+ if (!Div || (Div->getOpcode() != Instruction::UDiv &&
+ Div->getOpcode() != Instruction::SDiv)) {
+ Y = Op0;
+ Div = dyn_cast<BinaryOperator>(Op1);
}
- Value *Neg = dyn_castNegVal(Op1C);
- if (BO && BO->hasOneUse() &&
- (BO->getOperand(1) == Op1C || BO->getOperand(1) == Neg) &&
- (BO->getOpcode() == Instruction::UDiv ||
- BO->getOpcode() == Instruction::SDiv)) {
- Value *Op0BO = BO->getOperand(0), *Op1BO = BO->getOperand(1);
+ Value *Neg = dyn_castNegVal(Y);
+ if (Div && Div->hasOneUse() &&
+ (Div->getOperand(1) == Y || Div->getOperand(1) == Neg) &&
+ (Div->getOpcode() == Instruction::UDiv ||
+ Div->getOpcode() == Instruction::SDiv)) {
+ Value *X = Div->getOperand(0), *DivOp1 = Div->getOperand(1);
// If the division is exact, X % Y is zero, so we end up with X or -X.
- if (PossiblyExactOperator *SDiv = dyn_cast<PossiblyExactOperator>(BO))
- if (SDiv->isExact()) {
- if (Op1BO == Op1C)
- return replaceInstUsesWith(I, Op0BO);
- return BinaryOperator::CreateNeg(Op0BO);
- }
-
- Value *Rem;
- if (BO->getOpcode() == Instruction::UDiv)
- Rem = Builder->CreateURem(Op0BO, Op1BO);
- else
- Rem = Builder->CreateSRem(Op0BO, Op1BO);
- Rem->takeName(BO);
+ if (Div->isExact()) {
+ if (DivOp1 == Y)
+ return replaceInstUsesWith(I, X);
+ return BinaryOperator::CreateNeg(X);
+ }
- if (Op1BO == Op1C)
- return BinaryOperator::CreateSub(Op0BO, Rem);
- return BinaryOperator::CreateSub(Rem, Op0BO);
+ auto RemOpc = Div->getOpcode() == Instruction::UDiv ? Instruction::URem
+ : Instruction::SRem;
+ Value *Rem = Builder->CreateBinOp(RemOpc, X, DivOp1);
+ if (DivOp1 == Y)
+ return BinaryOperator::CreateSub(X, Rem);
+ return BinaryOperator::CreateSub(Rem, X);
}
}
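
A sketch of the udiv flavor of this fold (name hypothetical); without the exact flag, the product becomes a subtract of the remainder:

define i32 @mul_of_udiv(i32 %x, i32 %y) {
  ; (X /u Y) * Y --> X - (X %u Y)
  %div = udiv i32 %x, %y
  %mul = mul i32 %div, %y
  ret i32 %mul
}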
@@ -1461,16 +1455,16 @@ Instruction *InstCombiner::commonIRemTransforms(BinaryOperator &I) {
if (SelectInst *SI = dyn_cast<SelectInst>(Op0I)) {
if (Instruction *R = FoldOpIntoSelect(I, SI))
return R;
- } else if (isa<PHINode>(Op0I)) {
+ } else if (auto *PN = dyn_cast<PHINode>(Op0I)) {
using namespace llvm::PatternMatch;
const APInt *Op1Int;
if (match(Op1, m_APInt(Op1Int)) && !Op1Int->isMinValue() &&
(I.getOpcode() == Instruction::URem ||
!Op1Int->isMinSignedValue())) {
- // FoldOpIntoPhi will speculate instructions to the end of the PHI's
+ // foldOpIntoPhi will speculate instructions to the end of the PHI's
// predecessor blocks, so do this only if we know the srem or urem
// will not fault.
- if (Instruction *NV = FoldOpIntoPhi(I))
+ if (Instruction *NV = foldOpIntoPhi(I, PN))
return NV;
}
}
diff --git a/lib/Transforms/InstCombine/InstCombinePHI.cpp b/lib/Transforms/InstCombine/InstCombinePHI.cpp
index 4cbffe9533b7..85e5b6ba2dc2 100644
--- a/lib/Transforms/InstCombine/InstCombinePHI.cpp
+++ b/lib/Transforms/InstCombine/InstCombinePHI.cpp
@@ -457,8 +457,8 @@ Instruction *InstCombiner::FoldPHIArgZextsIntoPHI(PHINode &Phi) {
}
// The more common cases of a phi with no constant operands or just one
- // variable operand are handled by FoldPHIArgOpIntoPHI() and FoldOpIntoPhi()
- // respectively. FoldOpIntoPhi() wants to do the opposite transform that is
+ // variable operand are handled by FoldPHIArgOpIntoPHI() and foldOpIntoPhi()
+ // respectively. foldOpIntoPhi() wants to do the opposite transform that is
// performed here. It tries to replicate a cast in the phi operand's basic
// block to expose other folding opportunities. Thus, InstCombine will
// infinite loop without this check.
@@ -507,7 +507,7 @@ Instruction *InstCombiner::FoldPHIArgOpIntoPHI(PHINode &PN) {
// Be careful about transforming integer PHIs. We don't want to pessimize
// the code by turning an i32 into an i1293.
if (PN.getType()->isIntegerTy() && CastSrcTy->isIntegerTy()) {
- if (!ShouldChangeType(PN.getType(), CastSrcTy))
+ if (!shouldChangeType(PN.getType(), CastSrcTy))
return nullptr;
}
} else if (isa<BinaryOperator>(FirstInst) || isa<CmpInst>(FirstInst)) {
diff --git a/lib/Transforms/InstCombine/InstCombineSelect.cpp b/lib/Transforms/InstCombine/InstCombineSelect.cpp
index 36644845352e..693b6c95c169 100644
--- a/lib/Transforms/InstCombine/InstCombineSelect.cpp
+++ b/lib/Transforms/InstCombine/InstCombineSelect.cpp
@@ -120,6 +120,16 @@ static Constant *getSelectFoldableConstant(Instruction *I) {
/// We have (select c, TI, FI), and we know that TI and FI have the same opcode.
Instruction *InstCombiner::foldSelectOpOp(SelectInst &SI, Instruction *TI,
Instruction *FI) {
+ // Don't break up min/max patterns. The hasOneUse checks below prevent that
+ // for most cases, but vector min/max with bitcasts can be transformed. If the
+ // one-use restrictions are eased for other patterns, we still don't want to
+ // obfuscate min/max.
+ if ((match(&SI, m_SMin(m_Value(), m_Value())) ||
+ match(&SI, m_SMax(m_Value(), m_Value())) ||
+ match(&SI, m_UMin(m_Value(), m_Value())) ||
+ match(&SI, m_UMax(m_Value(), m_Value()))))
+ return nullptr;
+
// If this is a cast from the same type, merge.
if (TI->getNumOperands() == 1 && TI->isCast()) {
Type *FIOpndTy = FI->getOperand(0)->getType();
@@ -364,7 +374,7 @@ static Value *foldSelectICmpAndOr(const SelectInst &SI, Value *TrueVal,
/// into:
/// %0 = tail call i32 @llvm.cttz.i32(i32 %x, i1 false)
static Value *foldSelectCttzCtlz(ICmpInst *ICI, Value *TrueVal, Value *FalseVal,
- InstCombiner::BuilderTy *Builder) {
+ InstCombiner::BuilderTy *Builder) {
ICmpInst::Predicate Pred = ICI->getPredicate();
Value *CmpLHS = ICI->getOperand(0);
Value *CmpRHS = ICI->getOperand(1);
@@ -395,13 +405,12 @@ static Value *foldSelectCttzCtlz(ICmpInst *ICI, Value *TrueVal, Value *FalseVal,
if (match(Count, m_Intrinsic<Intrinsic::cttz>(m_Specific(CmpLHS))) ||
match(Count, m_Intrinsic<Intrinsic::ctlz>(m_Specific(CmpLHS)))) {
IntrinsicInst *II = cast<IntrinsicInst>(Count);
- IRBuilder<> Builder(II);
    // Explicitly clear the 'is_zero_undef' flag.
IntrinsicInst *NewI = cast<IntrinsicInst>(II->clone());
Type *Ty = NewI->getArgOperand(1)->getType();
NewI->setArgOperand(1, Constant::getNullValue(Ty));
- Builder.Insert(NewI);
- return Builder.CreateZExtOrTrunc(NewI, ValueOnZero->getType());
+ Builder->Insert(NewI);
+ return Builder->CreateZExtOrTrunc(NewI, ValueOnZero->getType());
}
return nullptr;
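
An illustrative check (plain C++, not from the patch) of why the select is redundant: the intrinsic with the zero-is-undef flag cleared already returns the bit width at zero, which is exactly the value the select supplies. __builtin_ctz stands in for llvm.cttz with the flag set.

#include <cassert>
#include <cstdint>

// Models llvm.cttz.i32(X, /*is_zero_undef=*/false): defined at zero.
static unsigned cttz32(uint32_t X) {
  return X == 0 ? 32u : (unsigned)__builtin_ctz(X);
}

int main() {
  for (uint32_t X = 0; X <= 0xFFFF; ++X) {
    // select (icmp eq X, 0), 32, cttz(X, true) --> cttz(X, false)
    unsigned Sel = (X == 0) ? 32u : (unsigned)__builtin_ctz(X);
    assert(Sel == cttz32(X));
  }
  return 0;
}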
@@ -500,18 +509,16 @@ static bool adjustMinMax(SelectInst &Sel, ICmpInst &Cmp) {
return true;
}
-/// If this is an integer min/max where the select's 'true' operand is a
-/// constant, canonicalize that constant to the 'false' operand:
-/// select (icmp Pred X, C), C, X --> select (icmp Pred' X, C), X, C
+/// If this is an integer min/max (icmp + select) with a constant operand,
+/// create the canonical icmp for the min/max operation and canonicalize the
+/// constant to the 'false' operand of the select:
+/// select (icmp Pred X, C1), C2, X --> select (icmp Pred' X, C2), X, C2
+/// Note: if C1 != C2, this will change the icmp constant to the existing
+/// constant operand of the select.
static Instruction *
canonicalizeMinMaxWithConstant(SelectInst &Sel, ICmpInst &Cmp,
InstCombiner::BuilderTy &Builder) {
- // TODO: We should also canonicalize min/max when the select has a different
- // constant value than the cmp constant, but we need to fix the backend first.
- if (!Cmp.hasOneUse() || !isa<Constant>(Cmp.getOperand(1)) ||
- !isa<Constant>(Sel.getTrueValue()) ||
- isa<Constant>(Sel.getFalseValue()) ||
- Cmp.getOperand(1) != Sel.getTrueValue())
+ if (!Cmp.hasOneUse() || !isa<Constant>(Cmp.getOperand(1)))
return nullptr;
// Canonicalize the compare predicate based on whether we have min or max.
@@ -526,16 +533,25 @@ canonicalizeMinMaxWithConstant(SelectInst &Sel, ICmpInst &Cmp,
default: return nullptr;
}
- // Canonicalize the constant to the right side.
- if (isa<Constant>(LHS))
- std::swap(LHS, RHS);
+ // Is this already canonical?
+ if (Cmp.getOperand(0) == LHS && Cmp.getOperand(1) == RHS &&
+ Cmp.getPredicate() == NewPred)
+ return nullptr;
+
+ // Create the canonical compare and plug it into the select.
+ Sel.setCondition(Builder.CreateICmp(NewPred, LHS, RHS));
- Value *NewCmp = Builder.CreateICmp(NewPred, LHS, RHS);
- SelectInst *NewSel = SelectInst::Create(NewCmp, LHS, RHS, "", nullptr, &Sel);
+ // If the select operands did not change, we're done.
+ if (Sel.getTrueValue() == LHS && Sel.getFalseValue() == RHS)
+ return &Sel;
- // We swapped the select operands, so swap the metadata too.
- NewSel->swapProfMetadata();
- return NewSel;
+ // If we are swapping the select operands, swap the metadata too.
+ assert(Sel.getTrueValue() == RHS && Sel.getFalseValue() == LHS &&
+ "Unexpected results from matchSelectPattern");
+ Sel.setTrueValue(LHS);
+ Sel.setFalseValue(RHS);
+ Sel.swapProfMetadata();
+ return &Sel;
}
/// Visit a SelectInst that has an ICmpInst as its first operand.
@@ -786,7 +802,9 @@ Instruction *InstCombiner::foldSPFofSPF(Instruction *Inner,
// This transform is performance neutral if we can elide at least one xor from
// the set of three operands, since we'll be tacking on an xor at the very
// end.
- if (IsFreeOrProfitableToInvert(A, NotA, ElidesXor) &&
+ if (SelectPatternResult::isMinOrMax(SPF1) &&
+ SelectPatternResult::isMinOrMax(SPF2) &&
+ IsFreeOrProfitableToInvert(A, NotA, ElidesXor) &&
IsFreeOrProfitableToInvert(B, NotB, ElidesXor) &&
IsFreeOrProfitableToInvert(C, NotC, ElidesXor) && ElidesXor) {
if (!NotA)
@@ -1035,8 +1053,10 @@ static Instruction *canonicalizeSelectToShuffle(SelectInst &SI) {
// If the select condition element is false, choose from the 2nd vector.
Mask.push_back(ConstantInt::get(Int32Ty, i + NumElts));
} else if (isa<UndefValue>(Elt)) {
- // If the select condition element is undef, the shuffle mask is undef.
- Mask.push_back(UndefValue::get(Int32Ty));
+ // Undef in a select condition (choose one of the operands) does not mean
+ // the same thing as undef in a shuffle mask (any value is acceptable), so
+ // give up.
+ return nullptr;
} else {
// Bail out on a constant expression.
return nullptr;
@@ -1364,11 +1384,11 @@ Instruction *InstCombiner::visitSelectInst(SelectInst &SI) {
}
// See if we can fold the select into a phi node if the condition is a select.
- if (isa<PHINode>(SI.getCondition()))
+ if (auto *PN = dyn_cast<PHINode>(SI.getCondition()))
// The true/false values have to be live in the PHI predecessor's blocks.
if (canSelectOperandBeMappingIntoPredBlock(TrueVal, SI) &&
canSelectOperandBeMappingIntoPredBlock(FalseVal, SI))
- if (Instruction *NV = FoldOpIntoPhi(SI))
+ if (Instruction *NV = foldOpIntoPhi(SI, PN))
return NV;
if (SelectInst *TrueSI = dyn_cast<SelectInst>(TrueVal)) {
@@ -1450,6 +1470,20 @@ Instruction *InstCombiner::visitSelectInst(SelectInst &SI) {
}
}
+ // If we can compute the condition, there's no need for a select.
+ // Like the above fold, we are attempting to reduce compile-time cost by
+ // putting this fold here with limitations rather than in InstSimplify.
+ // The motivation for this call into value tracking is to take advantage of
+ // the assumption cache, so make sure that is populated.
+ if (!CondVal->getType()->isVectorTy() && !AC.assumptions().empty()) {
+ APInt KnownOne(1, 0), KnownZero(1, 0);
+ computeKnownBits(CondVal, KnownZero, KnownOne, 0, &SI);
+ if (KnownOne == 1)
+ return replaceInstUsesWith(SI, TrueVal);
+ if (KnownZero == 1)
+ return replaceInstUsesWith(SI, FalseVal);
+ }
+
if (Instruction *BitCastSel = foldSelectCmpBitcasts(SI, *Builder))
return BitCastSel;
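
A toy model (plain C++; the function and parameter names are hypothetical, not the patch's API) of the early-out above: KnownOne/KnownZero stand in for the 1-bit facts computeKnownBits derives from the assumption cache, and if either is proven, the select collapses to one arm.

#include <cassert>

static int foldSelect(bool KnownOne, bool KnownZero, int TrueVal,
                      int FalseVal, int Unfolded) {
  if (KnownOne)    // condition proven true: keep only the true arm
    return TrueVal;
  if (KnownZero)   // condition proven false: keep only the false arm
    return FalseVal;
  return Unfolded; // nothing proven: the select must stay
}

int main() {
  assert(foldSelect(true, false, 1, 2, -1) == 1);
  assert(foldSelect(false, true, 1, 2, -1) == 2);
  assert(foldSelect(false, false, 1, 2, -1) == -1);
  return 0;
}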
diff --git a/lib/Transforms/InstCombine/InstCombineShifts.cpp b/lib/Transforms/InstCombine/InstCombineShifts.cpp
index 4ff9b64ac57c..9aa679c60e47 100644
--- a/lib/Transforms/InstCombine/InstCombineShifts.cpp
+++ b/lib/Transforms/InstCombine/InstCombineShifts.cpp
@@ -22,8 +22,8 @@ using namespace PatternMatch;
#define DEBUG_TYPE "instcombine"
Instruction *InstCombiner::commonShiftTransforms(BinaryOperator &I) {
- assert(I.getOperand(1)->getType() == I.getOperand(0)->getType());
Value *Op0 = I.getOperand(0), *Op1 = I.getOperand(1);
+ assert(Op0->getType() == Op1->getType());
// See if we can fold away this shift.
if (SimplifyDemandedInstructionBits(I))
@@ -65,63 +65,60 @@ Instruction *InstCombiner::commonShiftTransforms(BinaryOperator &I) {
}
/// Return true if we can simplify two logical (either left or right) shifts
-/// that have constant shift amounts.
-static bool canEvaluateShiftedShift(unsigned FirstShiftAmt,
- bool IsFirstShiftLeft,
- Instruction *SecondShift, InstCombiner &IC,
+/// that have constant shift amounts: OuterShift (InnerShift X, C1), C2.
+static bool canEvaluateShiftedShift(unsigned OuterShAmt, bool IsOuterShl,
+ Instruction *InnerShift, InstCombiner &IC,
Instruction *CxtI) {
- assert(SecondShift->isLogicalShift() && "Unexpected instruction type");
+ assert(InnerShift->isLogicalShift() && "Unexpected instruction type");
- // We need constant shifts.
- auto *SecondShiftConst = dyn_cast<ConstantInt>(SecondShift->getOperand(1));
- if (!SecondShiftConst)
+ // We need constant scalar or constant splat shifts.
+ const APInt *InnerShiftConst;
+ if (!match(InnerShift->getOperand(1), m_APInt(InnerShiftConst)))
return false;
- unsigned SecondShiftAmt = SecondShiftConst->getZExtValue();
- bool IsSecondShiftLeft = SecondShift->getOpcode() == Instruction::Shl;
-
- // We can always fold shl(c1) + shl(c2) -> shl(c1+c2).
- // We can always fold lshr(c1) + lshr(c2) -> lshr(c1+c2).
- if (IsFirstShiftLeft == IsSecondShiftLeft)
+ // Two logical shifts in the same direction:
+ // shl (shl X, C1), C2 --> shl X, C1 + C2
+ // lshr (lshr X, C1), C2 --> lshr X, C1 + C2
+ bool IsInnerShl = InnerShift->getOpcode() == Instruction::Shl;
+ if (IsInnerShl == IsOuterShl)
return true;
- // We can always fold lshr(c) + shl(c) -> and(c2).
- // We can always fold shl(c) + lshr(c) -> and(c2).
- if (FirstShiftAmt == SecondShiftAmt)
+ // Equal shift amounts in opposite directions become bitwise 'and':
+ // lshr (shl X, C), C --> and X, C'
+ // shl (lshr X, C), C --> and X, C'
+ unsigned InnerShAmt = InnerShiftConst->getZExtValue();
+ if (InnerShAmt == OuterShAmt)
return true;
- unsigned TypeWidth = SecondShift->getType()->getScalarSizeInBits();
-
// If the 2nd shift is bigger than the 1st, we can fold:
- // lshr(c1) + shl(c2) -> shl(c3) + and(c4) or
- // shl(c1) + lshr(c2) -> lshr(c3) + and(c4),
+ // lshr (shl X, C1), C2 --> and (shl X, C1 - C2), C3
+ // shl (lshr X, C1), C2 --> and (lshr X, C1 - C2), C3
// but it isn't profitable unless we know the and'd out bits are already zero.
- // Also check that the 2nd shift is valid (less than the type width) or we'll
- // crash trying to produce the bit mask for the 'and'.
- if (SecondShiftAmt > FirstShiftAmt && SecondShiftAmt < TypeWidth) {
- unsigned MaskShift = IsSecondShiftLeft ? TypeWidth - SecondShiftAmt
- : SecondShiftAmt - FirstShiftAmt;
- APInt Mask = APInt::getLowBitsSet(TypeWidth, FirstShiftAmt) << MaskShift;
- if (IC.MaskedValueIsZero(SecondShift->getOperand(0), Mask, 0, CxtI))
+ // Also, check that the inner shift is valid (less than the type width) or
+ // we'll crash trying to produce the bit mask for the 'and'.
+ unsigned TypeWidth = InnerShift->getType()->getScalarSizeInBits();
+ if (InnerShAmt > OuterShAmt && InnerShAmt < TypeWidth) {
+ unsigned MaskShift =
+ IsInnerShl ? TypeWidth - InnerShAmt : InnerShAmt - OuterShAmt;
+ APInt Mask = APInt::getLowBitsSet(TypeWidth, OuterShAmt) << MaskShift;
+ if (IC.MaskedValueIsZero(InnerShift->getOperand(0), Mask, 0, CxtI))
return true;
}
return false;
}
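
Numeric spot-checks (plain C++, 32-bit assumption; the multiplier is just a value scrambler) for the first two cases above: same-direction shift amounts add, and equal opposite shifts reduce to a mask.

#include <cassert>
#include <cstdint>

int main() {
  for (uint32_t V = 0; V <= 0xFFFF; ++V) {
    uint32_t X = V * 0x9E3779B9u; // arbitrary 32-bit test values
    for (unsigned C1 = 0; C1 < 16; ++C1) {
      for (unsigned C2 = 0; C1 + C2 < 32; ++C2) {
        assert(((X << C1) << C2) == (X << (C1 + C2))); // shl of shl
        assert(((X >> C1) >> C2) == (X >> (C1 + C2))); // lshr of lshr
      }
      assert(((X << C1) >> C1) == (X & (0xFFFFFFFFu >> C1))); // lshr of shl
      assert(((X >> C1) << C1) == (X & (0xFFFFFFFFu << C1))); // shl of lshr
    }
  }
  return 0;
}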
-/// See if we can compute the specified value, but shifted
-/// logically to the left or right by some number of bits. This should return
-/// true if the expression can be computed for the same cost as the current
-/// expression tree. This is used to eliminate extraneous shifting from things
-/// like:
+/// See if we can compute the specified value, but shifted logically to the left
+/// or right by some number of bits. This should return true if the expression
+/// can be computed for the same cost as the current expression tree. This is
+/// used to eliminate extraneous shifting from things like:
/// %C = shl i128 %A, 64
/// %D = shl i128 %B, 96
/// %E = or i128 %C, %D
/// %F = lshr i128 %E, 64
-/// where the client will ask if E can be computed shifted right by 64-bits. If
-/// this succeeds, the GetShiftedValue function will be called to produce the
-/// value.
-static bool CanEvaluateShifted(Value *V, unsigned NumBits, bool IsLeftShift,
+/// where the client will ask if E can be computed shifted right by 64 bits. If
+/// this succeeds, getShiftedValue() will be called to produce the value.
+static bool canEvaluateShifted(Value *V, unsigned NumBits, bool IsLeftShift,
InstCombiner &IC, Instruction *CxtI) {
// We can always evaluate constants shifted.
if (isa<Constant>(V))
@@ -165,8 +162,8 @@ static bool CanEvaluateShifted(Value *V, unsigned NumBits, bool IsLeftShift,
case Instruction::Or:
case Instruction::Xor:
    // Bitwise operators can all be arbitrarily evaluated shifted.
- return CanEvaluateShifted(I->getOperand(0), NumBits, IsLeftShift, IC, I) &&
- CanEvaluateShifted(I->getOperand(1), NumBits, IsLeftShift, IC, I);
+ return canEvaluateShifted(I->getOperand(0), NumBits, IsLeftShift, IC, I) &&
+ canEvaluateShifted(I->getOperand(1), NumBits, IsLeftShift, IC, I);
case Instruction::Shl:
case Instruction::LShr:
@@ -176,8 +173,8 @@ static bool CanEvaluateShifted(Value *V, unsigned NumBits, bool IsLeftShift,
SelectInst *SI = cast<SelectInst>(I);
Value *TrueVal = SI->getTrueValue();
Value *FalseVal = SI->getFalseValue();
- return CanEvaluateShifted(TrueVal, NumBits, IsLeftShift, IC, SI) &&
- CanEvaluateShifted(FalseVal, NumBits, IsLeftShift, IC, SI);
+ return canEvaluateShifted(TrueVal, NumBits, IsLeftShift, IC, SI) &&
+ canEvaluateShifted(FalseVal, NumBits, IsLeftShift, IC, SI);
}
case Instruction::PHI: {
// We can change a phi if we can change all operands. Note that we never
@@ -185,16 +182,79 @@ static bool CanEvaluateShifted(Value *V, unsigned NumBits, bool IsLeftShift,
// instructions with a single use.
PHINode *PN = cast<PHINode>(I);
for (Value *IncValue : PN->incoming_values())
- if (!CanEvaluateShifted(IncValue, NumBits, IsLeftShift, IC, PN))
+ if (!canEvaluateShifted(IncValue, NumBits, IsLeftShift, IC, PN))
return false;
return true;
}
}
}
-/// When CanEvaluateShifted returned true for an expression,
-/// this value inserts the new computation that produces the shifted value.
-static Value *GetShiftedValue(Value *V, unsigned NumBits, bool isLeftShift,
+/// Fold OuterShift (InnerShift X, C1), C2.
+/// See canEvaluateShiftedShift() for the constraints on these instructions.
+static Value *foldShiftedShift(BinaryOperator *InnerShift, unsigned OuterShAmt,
+ bool IsOuterShl,
+ InstCombiner::BuilderTy &Builder) {
+ bool IsInnerShl = InnerShift->getOpcode() == Instruction::Shl;
+ Type *ShType = InnerShift->getType();
+ unsigned TypeWidth = ShType->getScalarSizeInBits();
+
+ // We only accept shifts-by-a-constant in canEvaluateShifted().
+ const APInt *C1;
+ match(InnerShift->getOperand(1), m_APInt(C1));
+ unsigned InnerShAmt = C1->getZExtValue();
+
+ // Change the shift amount and clear the appropriate IR flags.
+ auto NewInnerShift = [&](unsigned ShAmt) {
+ InnerShift->setOperand(1, ConstantInt::get(ShType, ShAmt));
+ if (IsInnerShl) {
+ InnerShift->setHasNoUnsignedWrap(false);
+ InnerShift->setHasNoSignedWrap(false);
+ } else {
+ InnerShift->setIsExact(false);
+ }
+ return InnerShift;
+ };
+
+ // Two logical shifts in the same direction:
+ // shl (shl X, C1), C2 --> shl X, C1 + C2
+ // lshr (lshr X, C1), C2 --> lshr X, C1 + C2
+ if (IsInnerShl == IsOuterShl) {
+ // If this is an oversized composite shift, then unsigned shifts get 0.
+ if (InnerShAmt + OuterShAmt >= TypeWidth)
+ return Constant::getNullValue(ShType);
+
+ return NewInnerShift(InnerShAmt + OuterShAmt);
+ }
+
+ // Equal shift amounts in opposite directions become bitwise 'and':
+ // lshr (shl X, C), C --> and X, C'
+ // shl (lshr X, C), C --> and X, C'
+ if (InnerShAmt == OuterShAmt) {
+ APInt Mask = IsInnerShl
+ ? APInt::getLowBitsSet(TypeWidth, TypeWidth - OuterShAmt)
+ : APInt::getHighBitsSet(TypeWidth, TypeWidth - OuterShAmt);
+ Value *And = Builder.CreateAnd(InnerShift->getOperand(0),
+ ConstantInt::get(ShType, Mask));
+ if (auto *AndI = dyn_cast<Instruction>(And)) {
+ AndI->moveBefore(InnerShift);
+ AndI->takeName(InnerShift);
+ }
+ return And;
+ }
+
+ assert(InnerShAmt > OuterShAmt &&
+ "Unexpected opposite direction logical shift pair");
+
+ // In general, we would need an 'and' for this transform, but
+ // canEvaluateShiftedShift() guarantees that the masked-off bits are not used.
+ // lshr (shl X, C1), C2 --> shl X, C1 - C2
+ // shl (lshr X, C1), C2 --> lshr X, C1 - C2
+ return NewInnerShift(InnerShAmt - OuterShAmt);
+}
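
And a check (plain C++, 32-bit assumption, fixed amounts for illustration) for the final case: when the inner amount is larger, the pair folds to one shift by the difference. That is safe here because the operand's range keeps the masked-off bits zero, which is the same precondition canEvaluateShiftedShift() verifies with MaskedValueIsZero.

#include <cassert>
#include <cstdint>

int main() {
  const unsigned C1 = 8, C2 = 3; // inner shl amount > outer lshr amount
  for (uint32_t X = 0; X < 0x10000; ++X) {
    // lshr (shl X, C1), C2 --> shl X, (C1 - C2); no 'and' is needed
    // because X < 2^16 guarantees the would-be-masked bits are zero.
    assert(((X << C1) >> C2) == (X << (C1 - C2)));
  }
  return 0;
}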
+
+/// When canEvaluateShifted() returns true for an expression, this function
+/// inserts the new computation that produces the shifted value.
+static Value *getShiftedValue(Value *V, unsigned NumBits, bool isLeftShift,
InstCombiner &IC, const DataLayout &DL) {
// We can always evaluate constants shifted.
if (Constant *C = dyn_cast<Constant>(V)) {
@@ -220,100 +280,21 @@ static Value *GetShiftedValue(Value *V, unsigned NumBits, bool isLeftShift,
case Instruction::Xor:
    // Bitwise operators can all be arbitrarily evaluated shifted.
I->setOperand(
- 0, GetShiftedValue(I->getOperand(0), NumBits, isLeftShift, IC, DL));
+ 0, getShiftedValue(I->getOperand(0), NumBits, isLeftShift, IC, DL));
I->setOperand(
- 1, GetShiftedValue(I->getOperand(1), NumBits, isLeftShift, IC, DL));
+ 1, getShiftedValue(I->getOperand(1), NumBits, isLeftShift, IC, DL));
return I;
- case Instruction::Shl: {
- BinaryOperator *BO = cast<BinaryOperator>(I);
- unsigned TypeWidth = BO->getType()->getScalarSizeInBits();
-
- // We only accept shifts-by-a-constant in CanEvaluateShifted.
- ConstantInt *CI = cast<ConstantInt>(BO->getOperand(1));
-
- // We can always fold shl(c1)+shl(c2) -> shl(c1+c2).
- if (isLeftShift) {
- // If this is oversized composite shift, then unsigned shifts get 0.
- unsigned NewShAmt = NumBits+CI->getZExtValue();
- if (NewShAmt >= TypeWidth)
- return Constant::getNullValue(I->getType());
-
- BO->setOperand(1, ConstantInt::get(BO->getType(), NewShAmt));
- BO->setHasNoUnsignedWrap(false);
- BO->setHasNoSignedWrap(false);
- return I;
- }
-
- // We turn shl(c)+lshr(c) -> and(c2) if the input doesn't already have
- // zeros.
- if (CI->getValue() == NumBits) {
- APInt Mask(APInt::getLowBitsSet(TypeWidth, TypeWidth - NumBits));
- V = IC.Builder->CreateAnd(BO->getOperand(0),
- ConstantInt::get(BO->getContext(), Mask));
- if (Instruction *VI = dyn_cast<Instruction>(V)) {
- VI->moveBefore(BO);
- VI->takeName(BO);
- }
- return V;
- }
-
- // We turn shl(c1)+shr(c2) -> shl(c3)+and(c4), but only when we know that
- // the and won't be needed.
- assert(CI->getZExtValue() > NumBits);
- BO->setOperand(1, ConstantInt::get(BO->getType(),
- CI->getZExtValue() - NumBits));
- BO->setHasNoUnsignedWrap(false);
- BO->setHasNoSignedWrap(false);
- return BO;
- }
- // FIXME: This is almost identical to the SHL case. Refactor both cases into
- // a helper function.
- case Instruction::LShr: {
- BinaryOperator *BO = cast<BinaryOperator>(I);
- unsigned TypeWidth = BO->getType()->getScalarSizeInBits();
- // We only accept shifts-by-a-constant in CanEvaluateShifted.
- ConstantInt *CI = cast<ConstantInt>(BO->getOperand(1));
-
- // We can always fold lshr(c1)+lshr(c2) -> lshr(c1+c2).
- if (!isLeftShift) {
- // If this is oversized composite shift, then unsigned shifts get 0.
- unsigned NewShAmt = NumBits+CI->getZExtValue();
- if (NewShAmt >= TypeWidth)
- return Constant::getNullValue(BO->getType());
-
- BO->setOperand(1, ConstantInt::get(BO->getType(), NewShAmt));
- BO->setIsExact(false);
- return I;
- }
-
- // We turn lshr(c)+shl(c) -> and(c2) if the input doesn't already have
- // zeros.
- if (CI->getValue() == NumBits) {
- APInt Mask(APInt::getHighBitsSet(TypeWidth, TypeWidth - NumBits));
- V = IC.Builder->CreateAnd(I->getOperand(0),
- ConstantInt::get(BO->getContext(), Mask));
- if (Instruction *VI = dyn_cast<Instruction>(V)) {
- VI->moveBefore(I);
- VI->takeName(I);
- }
- return V;
- }
-
- // We turn lshr(c1)+shl(c2) -> lshr(c3)+and(c4), but only when we know that
- // the and won't be needed.
- assert(CI->getZExtValue() > NumBits);
- BO->setOperand(1, ConstantInt::get(BO->getType(),
- CI->getZExtValue() - NumBits));
- BO->setIsExact(false);
- return BO;
- }
+ case Instruction::Shl:
+ case Instruction::LShr:
+ return foldShiftedShift(cast<BinaryOperator>(I), NumBits, isLeftShift,
+ *(IC.Builder));
case Instruction::Select:
I->setOperand(
- 1, GetShiftedValue(I->getOperand(1), NumBits, isLeftShift, IC, DL));
+ 1, getShiftedValue(I->getOperand(1), NumBits, isLeftShift, IC, DL));
I->setOperand(
- 2, GetShiftedValue(I->getOperand(2), NumBits, isLeftShift, IC, DL));
+ 2, getShiftedValue(I->getOperand(2), NumBits, isLeftShift, IC, DL));
return I;
case Instruction::PHI: {
// We can change a phi if we can change all operands. Note that we never
@@ -321,215 +302,39 @@ static Value *GetShiftedValue(Value *V, unsigned NumBits, bool isLeftShift,
// instructions with a single use.
PHINode *PN = cast<PHINode>(I);
for (unsigned i = 0, e = PN->getNumIncomingValues(); i != e; ++i)
- PN->setIncomingValue(i, GetShiftedValue(PN->getIncomingValue(i), NumBits,
+ PN->setIncomingValue(i, getShiftedValue(PN->getIncomingValue(i), NumBits,
isLeftShift, IC, DL));
return PN;
}
}
}
-/// Try to fold (X << C1) << C2, where the shifts are some combination of
-/// shl/ashr/lshr.
-static Instruction *
-foldShiftByConstOfShiftByConst(BinaryOperator &I, ConstantInt *COp1,
- InstCombiner::BuilderTy *Builder) {
- Value *Op0 = I.getOperand(0);
- uint32_t TypeBits = Op0->getType()->getScalarSizeInBits();
-
- // Find out if this is a shift of a shift by a constant.
- BinaryOperator *ShiftOp = dyn_cast<BinaryOperator>(Op0);
- if (ShiftOp && !ShiftOp->isShift())
- ShiftOp = nullptr;
-
- if (ShiftOp && isa<ConstantInt>(ShiftOp->getOperand(1))) {
-
- // This is a constant shift of a constant shift. Be careful about hiding
- // shl instructions behind bit masks. They are used to represent multiplies
- // by a constant, and it is important that simple arithmetic expressions
- // are still recognizable by scalar evolution.
- //
- // The transforms applied to shl are very similar to the transforms applied
- // to mul by constant. We can be more aggressive about optimizing right
- // shifts.
- //
- // Combinations of right and left shifts will still be optimized in
- // DAGCombine where scalar evolution no longer applies.
-
- ConstantInt *ShiftAmt1C = cast<ConstantInt>(ShiftOp->getOperand(1));
- uint32_t ShiftAmt1 = ShiftAmt1C->getLimitedValue(TypeBits);
- uint32_t ShiftAmt2 = COp1->getLimitedValue(TypeBits);
- assert(ShiftAmt2 != 0 && "Should have been simplified earlier");
- if (ShiftAmt1 == 0)
- return nullptr; // Will be simplified in the future.
- Value *X = ShiftOp->getOperand(0);
-
- IntegerType *Ty = cast<IntegerType>(I.getType());
-
- // Check for (X << c1) << c2 and (X >> c1) >> c2
- if (I.getOpcode() == ShiftOp->getOpcode()) {
- uint32_t AmtSum = ShiftAmt1 + ShiftAmt2; // Fold into one big shift.
- // If this is an oversized composite shift, then unsigned shifts become
- // zero (handled in InstSimplify) and ashr saturates.
- if (AmtSum >= TypeBits) {
- if (I.getOpcode() != Instruction::AShr)
- return nullptr;
- AmtSum = TypeBits - 1; // Saturate to 31 for i32 ashr.
- }
-
- return BinaryOperator::Create(I.getOpcode(), X,
- ConstantInt::get(Ty, AmtSum));
- }
-
- if (ShiftAmt1 == ShiftAmt2) {
- // If we have ((X << C) >>u C), turn this into X & (-1 >>u C).
- if (I.getOpcode() == Instruction::LShr &&
- ShiftOp->getOpcode() == Instruction::Shl) {
- APInt Mask(APInt::getLowBitsSet(TypeBits, TypeBits - ShiftAmt1));
- return BinaryOperator::CreateAnd(
- X, ConstantInt::get(I.getContext(), Mask));
- }
- } else if (ShiftAmt1 < ShiftAmt2) {
- uint32_t ShiftDiff = ShiftAmt2 - ShiftAmt1;
-
- // (X >>?,exact C1) << C2 --> X << (C2-C1)
- // The inexact version is deferred to DAGCombine so we don't hide shl
- // behind a bit mask.
- if (I.getOpcode() == Instruction::Shl &&
- ShiftOp->getOpcode() != Instruction::Shl && ShiftOp->isExact()) {
- assert(ShiftOp->getOpcode() == Instruction::LShr ||
- ShiftOp->getOpcode() == Instruction::AShr);
- ConstantInt *ShiftDiffCst = ConstantInt::get(Ty, ShiftDiff);
- BinaryOperator *NewShl =
- BinaryOperator::Create(Instruction::Shl, X, ShiftDiffCst);
- NewShl->setHasNoUnsignedWrap(I.hasNoUnsignedWrap());
- NewShl->setHasNoSignedWrap(I.hasNoSignedWrap());
- return NewShl;
- }
-
- // (X << C1) >>u C2 --> X >>u (C2-C1) & (-1 >> C2)
- if (I.getOpcode() == Instruction::LShr &&
- ShiftOp->getOpcode() == Instruction::Shl) {
- ConstantInt *ShiftDiffCst = ConstantInt::get(Ty, ShiftDiff);
- // (X <<nuw C1) >>u C2 --> X >>u (C2-C1)
- if (ShiftOp->hasNoUnsignedWrap()) {
- BinaryOperator *NewLShr =
- BinaryOperator::Create(Instruction::LShr, X, ShiftDiffCst);
- NewLShr->setIsExact(I.isExact());
- return NewLShr;
- }
- Value *Shift = Builder->CreateLShr(X, ShiftDiffCst);
-
- APInt Mask(APInt::getLowBitsSet(TypeBits, TypeBits - ShiftAmt2));
- return BinaryOperator::CreateAnd(
- Shift, ConstantInt::get(I.getContext(), Mask));
- }
-
- // We can't handle (X << C1) >>s C2, it shifts arbitrary bits in. However,
- // we can handle (X <<nsw C1) >>s C2 since it only shifts in sign bits.
- if (I.getOpcode() == Instruction::AShr &&
- ShiftOp->getOpcode() == Instruction::Shl) {
- if (ShiftOp->hasNoSignedWrap()) {
- // (X <<nsw C1) >>s C2 --> X >>s (C2-C1)
- ConstantInt *ShiftDiffCst = ConstantInt::get(Ty, ShiftDiff);
- BinaryOperator *NewAShr =
- BinaryOperator::Create(Instruction::AShr, X, ShiftDiffCst);
- NewAShr->setIsExact(I.isExact());
- return NewAShr;
- }
- }
- } else {
- assert(ShiftAmt2 < ShiftAmt1);
- uint32_t ShiftDiff = ShiftAmt1 - ShiftAmt2;
-
- // (X >>?exact C1) << C2 --> X >>?exact (C1-C2)
- // The inexact version is deferred to DAGCombine so we don't hide shl
- // behind a bit mask.
- if (I.getOpcode() == Instruction::Shl &&
- ShiftOp->getOpcode() != Instruction::Shl && ShiftOp->isExact()) {
- ConstantInt *ShiftDiffCst = ConstantInt::get(Ty, ShiftDiff);
- BinaryOperator *NewShr =
- BinaryOperator::Create(ShiftOp->getOpcode(), X, ShiftDiffCst);
- NewShr->setIsExact(true);
- return NewShr;
- }
-
- // (X << C1) >>u C2 --> X << (C1-C2) & (-1 >> C2)
- if (I.getOpcode() == Instruction::LShr &&
- ShiftOp->getOpcode() == Instruction::Shl) {
- ConstantInt *ShiftDiffCst = ConstantInt::get(Ty, ShiftDiff);
- if (ShiftOp->hasNoUnsignedWrap()) {
- // (X <<nuw C1) >>u C2 --> X <<nuw (C1-C2)
- BinaryOperator *NewShl =
- BinaryOperator::Create(Instruction::Shl, X, ShiftDiffCst);
- NewShl->setHasNoUnsignedWrap(true);
- return NewShl;
- }
- Value *Shift = Builder->CreateShl(X, ShiftDiffCst);
-
- APInt Mask(APInt::getLowBitsSet(TypeBits, TypeBits - ShiftAmt2));
- return BinaryOperator::CreateAnd(
- Shift, ConstantInt::get(I.getContext(), Mask));
- }
-
- // We can't handle (X << C1) >>s C2, it shifts arbitrary bits in. However,
- // we can handle (X <<nsw C1) >>s C2 since it only shifts in sign bits.
- if (I.getOpcode() == Instruction::AShr &&
- ShiftOp->getOpcode() == Instruction::Shl) {
- if (ShiftOp->hasNoSignedWrap()) {
- // (X <<nsw C1) >>s C2 --> X <<nsw (C1-C2)
- ConstantInt *ShiftDiffCst = ConstantInt::get(Ty, ShiftDiff);
- BinaryOperator *NewShl =
- BinaryOperator::Create(Instruction::Shl, X, ShiftDiffCst);
- NewShl->setHasNoSignedWrap(true);
- return NewShl;
- }
- }
- }
- }
-
- return nullptr;
-}
-
Instruction *InstCombiner::FoldShiftByConstant(Value *Op0, Constant *Op1,
BinaryOperator &I) {
bool isLeftShift = I.getOpcode() == Instruction::Shl;
- ConstantInt *COp1 = nullptr;
- if (ConstantDataVector *CV = dyn_cast<ConstantDataVector>(Op1))
- COp1 = dyn_cast_or_null<ConstantInt>(CV->getSplatValue());
- else if (ConstantVector *CV = dyn_cast<ConstantVector>(Op1))
- COp1 = dyn_cast_or_null<ConstantInt>(CV->getSplatValue());
- else
- COp1 = dyn_cast<ConstantInt>(Op1);
-
- if (!COp1)
+ const APInt *Op1C;
+ if (!match(Op1, m_APInt(Op1C)))
return nullptr;
// See if we can propagate this shift into the input, this covers the trivial
// cast of lshr(shl(x,c1),c2) as well as other more complex cases.
if (I.getOpcode() != Instruction::AShr &&
- CanEvaluateShifted(Op0, COp1->getZExtValue(), isLeftShift, *this, &I)) {
+ canEvaluateShifted(Op0, Op1C->getZExtValue(), isLeftShift, *this, &I)) {
 DEBUG(dbgs() << "ICE: getShiftedValue propagating shift through expression"
" to eliminate shift:\n IN: " << *Op0 << "\n SH: " << I <<"\n");
return replaceInstUsesWith(
- I, GetShiftedValue(Op0, COp1->getZExtValue(), isLeftShift, *this, DL));
+ I, getShiftedValue(Op0, Op1C->getZExtValue(), isLeftShift, *this, DL));
}
// See if we can simplify any instructions used by the instruction whose sole
// purpose is to compute bits we don't care about.
- uint32_t TypeBits = Op0->getType()->getScalarSizeInBits();
+ unsigned TypeBits = Op0->getType()->getScalarSizeInBits();
- assert(!COp1->uge(TypeBits) &&
+ assert(!Op1C->uge(TypeBits) &&
"Shift over the type width should have been removed already");
- // ((X*C1) << C2) == (X * (C1 << C2))
- if (BinaryOperator *BO = dyn_cast<BinaryOperator>(Op0))
- if (BO->getOpcode() == Instruction::Mul && isLeftShift)
- if (Constant *BOOp = dyn_cast<Constant>(BO->getOperand(1)))
- return BinaryOperator::CreateMul(BO->getOperand(0),
- ConstantExpr::getShl(BOOp, Op1));
-
if (Instruction *FoldedShift = foldOpWithConstantIntoOperand(I))
return FoldedShift;
@@ -544,7 +349,8 @@ Instruction *InstCombiner::FoldShiftByConstant(Value *Op0, Constant *Op1,
if (TrOp && I.isLogicalShift() && TrOp->isShift() &&
isa<ConstantInt>(TrOp->getOperand(1))) {
// Okay, we'll do this xform. Make the shift of shift.
- Constant *ShAmt = ConstantExpr::getZExt(COp1, TrOp->getType());
+ Constant *ShAmt =
+ ConstantExpr::getZExt(cast<Constant>(Op1), TrOp->getType());
// (shift2 (shift1 & 0x00FF), c2)
Value *NSh = Builder->CreateBinOp(I.getOpcode(), TrOp, ShAmt,I.getName());
@@ -561,10 +367,10 @@ Instruction *InstCombiner::FoldShiftByConstant(Value *Op0, Constant *Op1,
// shift. We know that it is a logical shift by a constant, so adjust the
// mask as appropriate.
if (I.getOpcode() == Instruction::Shl)
- MaskV <<= COp1->getZExtValue();
+ MaskV <<= Op1C->getZExtValue();
else {
assert(I.getOpcode() == Instruction::LShr && "Unknown logical shift");
- MaskV = MaskV.lshr(COp1->getZExtValue());
+ MaskV = MaskV.lshr(Op1C->getZExtValue());
}
// shift1 & 0x00FF
@@ -598,7 +404,7 @@ Instruction *InstCombiner::FoldShiftByConstant(Value *Op0, Constant *Op1,
// (X + (Y << C))
Value *X = Builder->CreateBinOp(Op0BO->getOpcode(), YS, V1,
Op0BO->getOperand(1)->getName());
- uint32_t Op1Val = COp1->getLimitedValue(TypeBits);
+ unsigned Op1Val = Op1C->getLimitedValue(TypeBits);
APInt Bits = APInt::getHighBitsSet(TypeBits, TypeBits - Op1Val);
Constant *Mask = ConstantInt::get(I.getContext(), Bits);
@@ -634,7 +440,7 @@ Instruction *InstCombiner::FoldShiftByConstant(Value *Op0, Constant *Op1,
// (X + (Y << C))
Value *X = Builder->CreateBinOp(Op0BO->getOpcode(), V1, YS,
Op0BO->getOperand(0)->getName());
- uint32_t Op1Val = COp1->getLimitedValue(TypeBits);
+ unsigned Op1Val = Op1C->getLimitedValue(TypeBits);
APInt Bits = APInt::getHighBitsSet(TypeBits, TypeBits - Op1Val);
Constant *Mask = ConstantInt::get(I.getContext(), Bits);
@@ -705,9 +511,6 @@ Instruction *InstCombiner::FoldShiftByConstant(Value *Op0, Constant *Op1,
}
}
- if (Instruction *Folded = foldShiftByConstOfShiftByConst(I, COp1, Builder))
- return Folded;
-
return nullptr;
}
@@ -715,59 +518,97 @@ Instruction *InstCombiner::visitShl(BinaryOperator &I) {
if (Value *V = SimplifyVectorOp(I))
return replaceInstUsesWith(I, V);
- if (Value *V =
- SimplifyShlInst(I.getOperand(0), I.getOperand(1), I.hasNoSignedWrap(),
- I.hasNoUnsignedWrap(), DL, &TLI, &DT, &AC))
+ Value *Op0 = I.getOperand(0), *Op1 = I.getOperand(1);
+ if (Value *V = SimplifyShlInst(Op0, Op1, I.hasNoSignedWrap(),
+ I.hasNoUnsignedWrap(), DL, &TLI, &DT, &AC))
return replaceInstUsesWith(I, V);
if (Instruction *V = commonShiftTransforms(I))
return V;
- if (ConstantInt *Op1C = dyn_cast<ConstantInt>(I.getOperand(1))) {
- unsigned ShAmt = Op1C->getZExtValue();
-
- // Turn:
- // %zext = zext i32 %V to i64
- // %res = shl i64 %V, 8
- //
- // Into:
- // %shl = shl i32 %V, 8
- // %res = zext i32 %shl to i64
- //
- // This is only valid if %V would have zeros shifted out.
- if (auto *ZI = dyn_cast<ZExtInst>(I.getOperand(0))) {
- unsigned SrcBitWidth = ZI->getSrcTy()->getScalarSizeInBits();
- if (ShAmt < SrcBitWidth &&
- MaskedValueIsZero(ZI->getOperand(0),
- APInt::getHighBitsSet(SrcBitWidth, ShAmt), 0, &I)) {
- auto *Shl = Builder->CreateShl(ZI->getOperand(0), ShAmt);
- return new ZExtInst(Shl, I.getType());
+ const APInt *ShAmtAPInt;
+ if (match(Op1, m_APInt(ShAmtAPInt))) {
+ unsigned ShAmt = ShAmtAPInt->getZExtValue();
+ unsigned BitWidth = I.getType()->getScalarSizeInBits();
+ Type *Ty = I.getType();
+
+ // shl (zext X), ShAmt --> zext (shl X, ShAmt)
+ // This is only valid if X would have zeros shifted out.
+ Value *X;
+ if (match(Op0, m_ZExt(m_Value(X)))) {
+ unsigned SrcWidth = X->getType()->getScalarSizeInBits();
+ if (ShAmt < SrcWidth &&
+ MaskedValueIsZero(X, APInt::getHighBitsSet(SrcWidth, ShAmt), 0, &I))
+ return new ZExtInst(Builder->CreateShl(X, ShAmt), Ty);
+ }
+
+ // (X >>u C) << C --> X & (-1 << C)
+ if (match(Op0, m_LShr(m_Value(X), m_Specific(Op1)))) {
+ APInt Mask(APInt::getHighBitsSet(BitWidth, BitWidth - ShAmt));
+ return BinaryOperator::CreateAnd(X, ConstantInt::get(Ty, Mask));
+ }
+
+ // Be careful about hiding shl instructions behind bit masks. They are used
+ // to represent multiplies by a constant, and it is important that simple
+ // arithmetic expressions are still recognizable by scalar evolution.
+ // The inexact versions are deferred to DAGCombine, so we don't hide shl
+ // behind a bit mask.
+ const APInt *ShOp1;
+ if (match(Op0, m_CombineOr(m_Exact(m_LShr(m_Value(X), m_APInt(ShOp1))),
+ m_Exact(m_AShr(m_Value(X), m_APInt(ShOp1)))))) {
+ unsigned ShrAmt = ShOp1->getZExtValue();
+ if (ShrAmt < ShAmt) {
+ // If C1 < C2: (X >>?,exact C1) << C2 --> X << (C2 - C1)
+ Constant *ShiftDiff = ConstantInt::get(Ty, ShAmt - ShrAmt);
+ auto *NewShl = BinaryOperator::CreateShl(X, ShiftDiff);
+ NewShl->setHasNoUnsignedWrap(I.hasNoUnsignedWrap());
+ NewShl->setHasNoSignedWrap(I.hasNoSignedWrap());
+ return NewShl;
}
+ if (ShrAmt > ShAmt) {
+ // If C1 > C2: (X >>?exact C1) << C2 --> X >>?exact (C1 - C2)
+ Constant *ShiftDiff = ConstantInt::get(Ty, ShrAmt - ShAmt);
+ auto *NewShr = BinaryOperator::Create(
+ cast<BinaryOperator>(Op0)->getOpcode(), X, ShiftDiff);
+ NewShr->setIsExact(true);
+ return NewShr;
+ }
+ }
+
+ if (match(Op0, m_Shl(m_Value(X), m_APInt(ShOp1)))) {
+ unsigned AmtSum = ShAmt + ShOp1->getZExtValue();
+ // Oversized shifts are simplified to zero in InstSimplify.
+ if (AmtSum < BitWidth)
+ // (X << C1) << C2 --> X << (C1 + C2)
+ return BinaryOperator::CreateShl(X, ConstantInt::get(Ty, AmtSum));
}
// If the shifted-out value is known-zero, then this is a NUW shift.
if (!I.hasNoUnsignedWrap() &&
- MaskedValueIsZero(I.getOperand(0),
- APInt::getHighBitsSet(Op1C->getBitWidth(), ShAmt), 0,
- &I)) {
+ MaskedValueIsZero(Op0, APInt::getHighBitsSet(BitWidth, ShAmt), 0, &I)) {
I.setHasNoUnsignedWrap();
return &I;
}
- // If the shifted out value is all signbits, this is a NSW shift.
- if (!I.hasNoSignedWrap() &&
- ComputeNumSignBits(I.getOperand(0), 0, &I) > ShAmt) {
+ // If the shifted-out value is all signbits, then this is a NSW shift.
+ if (!I.hasNoSignedWrap() && ComputeNumSignBits(Op0, 0, &I) > ShAmt) {
I.setHasNoSignedWrap();
return &I;
}
}
- // (C1 << A) << C2 -> (C1 << C2) << A
- Constant *C1, *C2;
- Value *A;
- if (match(I.getOperand(0), m_OneUse(m_Shl(m_Constant(C1), m_Value(A)))) &&
- match(I.getOperand(1), m_Constant(C2)))
- return BinaryOperator::CreateShl(ConstantExpr::getShl(C1, C2), A);
+ Constant *C1;
+ if (match(Op1, m_Constant(C1))) {
+ Constant *C2;
+ Value *X;
+ // (C2 << X) << C1 --> (C2 << C1) << X
+ if (match(Op0, m_OneUse(m_Shl(m_Constant(C2), m_Value(X)))))
+ return BinaryOperator::CreateShl(ConstantExpr::getShl(C2, C1), X);
+
+ // (X * C2) << C1 --> X * (C2 << C1)
+ if (match(Op0, m_Mul(m_Value(X), m_Constant(C2))))
+ return BinaryOperator::CreateMul(X, ConstantExpr::getShl(C2, C1));
+ }
return nullptr;
}
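
Spot-checks (plain C++, fixed widths chosen for illustration) for three of the shl folds above: hoisting shl through zext when no set bits are shifted out, the lshr-then-shl mask form, and the multiply rewrite.

#include <cassert>
#include <cstdint>

int main() {
  // shl (zext i8 X to i32), 3 --> zext (shl i8 X, 3), valid when the top
  // 3 bits of X are zero (X < 32), so nothing is shifted out in i8.
  for (uint32_t X8 = 0; X8 < 32; ++X8) {
    uint32_t Wide = (uint32_t)X8 << 3;
    uint32_t Narrow = (uint8_t)(X8 << 3); // shift in i8, then zero-extend
    assert(Wide == Narrow);
  }
  for (uint32_t X = 0; X < 0x10000; ++X) {
    assert(((X >> 4) << 4) == (X & 0xFFFFFFF0u)); // (X >>u C) << C
    assert(((X * 7u) << 4) == (X * (7u << 4)));   // (X * C2) << C1
  }
  return 0;
}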
@@ -776,43 +617,83 @@ Instruction *InstCombiner::visitLShr(BinaryOperator &I) {
if (Value *V = SimplifyVectorOp(I))
return replaceInstUsesWith(I, V);
- if (Value *V = SimplifyLShrInst(I.getOperand(0), I.getOperand(1), I.isExact(),
- DL, &TLI, &DT, &AC))
+ Value *Op0 = I.getOperand(0), *Op1 = I.getOperand(1);
+ if (Value *V = SimplifyLShrInst(Op0, Op1, I.isExact(), DL, &TLI, &DT, &AC))
return replaceInstUsesWith(I, V);
if (Instruction *R = commonShiftTransforms(I))
return R;
- Value *Op0 = I.getOperand(0), *Op1 = I.getOperand(1);
-
- if (ConstantInt *Op1C = dyn_cast<ConstantInt>(Op1)) {
- unsigned ShAmt = Op1C->getZExtValue();
-
- if (IntrinsicInst *II = dyn_cast<IntrinsicInst>(Op0)) {
- unsigned BitWidth = Op0->getType()->getScalarSizeInBits();
+ Type *Ty = I.getType();
+ const APInt *ShAmtAPInt;
+ if (match(Op1, m_APInt(ShAmtAPInt))) {
+ unsigned ShAmt = ShAmtAPInt->getZExtValue();
+ unsigned BitWidth = Ty->getScalarSizeInBits();
+ auto *II = dyn_cast<IntrinsicInst>(Op0);
+ if (II && isPowerOf2_32(BitWidth) && Log2_32(BitWidth) == ShAmt &&
+ (II->getIntrinsicID() == Intrinsic::ctlz ||
+ II->getIntrinsicID() == Intrinsic::cttz ||
+ II->getIntrinsicID() == Intrinsic::ctpop)) {
// ctlz.i32(x)>>5 --> zext(x == 0)
// cttz.i32(x)>>5 --> zext(x == 0)
// ctpop.i32(x)>>5 --> zext(x == -1)
- if ((II->getIntrinsicID() == Intrinsic::ctlz ||
- II->getIntrinsicID() == Intrinsic::cttz ||
- II->getIntrinsicID() == Intrinsic::ctpop) &&
- isPowerOf2_32(BitWidth) && Log2_32(BitWidth) == ShAmt) {
- bool isCtPop = II->getIntrinsicID() == Intrinsic::ctpop;
- Constant *RHS = ConstantInt::getSigned(Op0->getType(), isCtPop ? -1:0);
- Value *Cmp = Builder->CreateICmpEQ(II->getArgOperand(0), RHS);
- return new ZExtInst(Cmp, II->getType());
+ bool IsPop = II->getIntrinsicID() == Intrinsic::ctpop;
+ Constant *RHS = ConstantInt::getSigned(Ty, IsPop ? -1 : 0);
+ Value *Cmp = Builder->CreateICmpEQ(II->getArgOperand(0), RHS);
+ return new ZExtInst(Cmp, Ty);
+ }
+
+ Value *X;
+ const APInt *ShOp1;
+ if (match(Op0, m_Shl(m_Value(X), m_APInt(ShOp1)))) {
+ unsigned ShlAmt = ShOp1->getZExtValue();
+ if (ShlAmt < ShAmt) {
+ Constant *ShiftDiff = ConstantInt::get(Ty, ShAmt - ShlAmt);
+ if (cast<BinaryOperator>(Op0)->hasNoUnsignedWrap()) {
+ // (X <<nuw C1) >>u C2 --> X >>u (C2 - C1)
+ auto *NewLShr = BinaryOperator::CreateLShr(X, ShiftDiff);
+ NewLShr->setIsExact(I.isExact());
+ return NewLShr;
+ }
+ // (X << C1) >>u C2 --> (X >>u (C2 - C1)) & (-1 >> C2)
+ Value *NewLShr = Builder->CreateLShr(X, ShiftDiff, "", I.isExact());
+ APInt Mask(APInt::getLowBitsSet(BitWidth, BitWidth - ShAmt));
+ return BinaryOperator::CreateAnd(NewLShr, ConstantInt::get(Ty, Mask));
}
+ if (ShlAmt > ShAmt) {
+ Constant *ShiftDiff = ConstantInt::get(Ty, ShlAmt - ShAmt);
+ if (cast<BinaryOperator>(Op0)->hasNoUnsignedWrap()) {
+ // (X <<nuw C1) >>u C2 --> X <<nuw (C1 - C2)
+ auto *NewShl = BinaryOperator::CreateShl(X, ShiftDiff);
+ NewShl->setHasNoUnsignedWrap(true);
+ return NewShl;
+ }
+      // (X << C1) >>u C2 --> (X << (C1 - C2)) & (-1 >> C2)
+ Value *NewShl = Builder->CreateShl(X, ShiftDiff);
+ APInt Mask(APInt::getLowBitsSet(BitWidth, BitWidth - ShAmt));
+ return BinaryOperator::CreateAnd(NewShl, ConstantInt::get(Ty, Mask));
+ }
+ assert(ShlAmt == ShAmt);
+ // (X << C) >>u C --> X & (-1 >>u C)
+ APInt Mask(APInt::getLowBitsSet(BitWidth, BitWidth - ShAmt));
+ return BinaryOperator::CreateAnd(X, ConstantInt::get(Ty, Mask));
+ }
+
+ if (match(Op0, m_LShr(m_Value(X), m_APInt(ShOp1)))) {
+ unsigned AmtSum = ShAmt + ShOp1->getZExtValue();
+ // Oversized shifts are simplified to zero in InstSimplify.
+ if (AmtSum < BitWidth)
+ // (X >>u C1) >>u C2 --> X >>u (C1 + C2)
+ return BinaryOperator::CreateLShr(X, ConstantInt::get(Ty, AmtSum));
}
// If the shifted-out value is known-zero, then this is an exact shift.
if (!I.isExact() &&
- MaskedValueIsZero(Op0, APInt::getLowBitsSet(Op1C->getBitWidth(), ShAmt),
- 0, &I)){
+ MaskedValueIsZero(Op0, APInt::getLowBitsSet(BitWidth, ShAmt), 0, &I)) {
I.setIsExact();
return &I;
}
}
-
return nullptr;
}
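
A check (plain C++) of the intrinsic fold above for i32: the count results lie in [0, 32], so bit 5 survives the shift only for the extreme input. ctlz32 models llvm.ctlz with a defined result at zero; the multiplier is an arbitrary value scrambler.

#include <cassert>
#include <cstdint>

static unsigned ctlz32(uint32_t X) {
  return X == 0 ? 32u : (unsigned)__builtin_clz(X);
}

int main() {
  for (uint32_t V = 0; V <= 0x1FFFF; ++V) {
    uint32_t X = V * 0x45D9F3Bu; // arbitrary values, including 0 at V == 0
    assert((ctlz32(X) >> 5) == (unsigned)(X == 0)); // ctlz(X) >> 5
    unsigned Pop = (unsigned)__builtin_popcount(X);
    assert((Pop >> 5) == (unsigned)(X == 0xFFFFFFFFu)); // ctpop(X) >> 5
  }
  // Hit the all-ones input for ctpop explicitly.
  assert(((unsigned)__builtin_popcount(0xFFFFFFFFu) >> 5) == 1u);
  return 0;
}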
@@ -820,48 +701,66 @@ Instruction *InstCombiner::visitAShr(BinaryOperator &I) {
if (Value *V = SimplifyVectorOp(I))
return replaceInstUsesWith(I, V);
- if (Value *V = SimplifyAShrInst(I.getOperand(0), I.getOperand(1), I.isExact(),
- DL, &TLI, &DT, &AC))
+ Value *Op0 = I.getOperand(0), *Op1 = I.getOperand(1);
+ if (Value *V = SimplifyAShrInst(Op0, Op1, I.isExact(), DL, &TLI, &DT, &AC))
return replaceInstUsesWith(I, V);
if (Instruction *R = commonShiftTransforms(I))
return R;
- Value *Op0 = I.getOperand(0), *Op1 = I.getOperand(1);
-
- if (ConstantInt *Op1C = dyn_cast<ConstantInt>(Op1)) {
- unsigned ShAmt = Op1C->getZExtValue();
+ Type *Ty = I.getType();
+ unsigned BitWidth = Ty->getScalarSizeInBits();
+ const APInt *ShAmtAPInt;
+ if (match(Op1, m_APInt(ShAmtAPInt))) {
+ unsigned ShAmt = ShAmtAPInt->getZExtValue();
- // If the input is a SHL by the same constant (ashr (shl X, C), C), then we
- // have a sign-extend idiom.
+ // If the shift amount equals the difference in width of the destination
+ // and source scalar types:
+ // ashr (shl (zext X), C), C --> sext X
Value *X;
- if (match(Op0, m_Shl(m_Value(X), m_Specific(Op1)))) {
- // If the input is an extension from the shifted amount value, e.g.
- // %x = zext i8 %A to i32
- // %y = shl i32 %x, 24
- // %z = ashr %y, 24
- // then turn this into "z = sext i8 A to i32".
- if (ZExtInst *ZI = dyn_cast<ZExtInst>(X)) {
- uint32_t SrcBits = ZI->getOperand(0)->getType()->getScalarSizeInBits();
- uint32_t DestBits = ZI->getType()->getScalarSizeInBits();
- if (Op1C->getZExtValue() == DestBits-SrcBits)
- return new SExtInst(ZI->getOperand(0), ZI->getType());
+ if (match(Op0, m_Shl(m_ZExt(m_Value(X)), m_Specific(Op1))) &&
+ ShAmt == BitWidth - X->getType()->getScalarSizeInBits())
+ return new SExtInst(X, Ty);
+
+ // We can't handle (X << C1) >>s C2. It shifts arbitrary bits in. However,
+ // we can handle (X <<nsw C1) >>s C2 since it only shifts in sign bits.
+ const APInt *ShOp1;
+ if (match(Op0, m_NSWShl(m_Value(X), m_APInt(ShOp1)))) {
+ unsigned ShlAmt = ShOp1->getZExtValue();
+ if (ShlAmt < ShAmt) {
+ // (X <<nsw C1) >>s C2 --> X >>s (C2 - C1)
+ Constant *ShiftDiff = ConstantInt::get(Ty, ShAmt - ShlAmt);
+ auto *NewAShr = BinaryOperator::CreateAShr(X, ShiftDiff);
+ NewAShr->setIsExact(I.isExact());
+ return NewAShr;
}
+ if (ShlAmt > ShAmt) {
+ // (X <<nsw C1) >>s C2 --> X <<nsw (C1 - C2)
+ Constant *ShiftDiff = ConstantInt::get(Ty, ShlAmt - ShAmt);
+ auto *NewShl = BinaryOperator::Create(Instruction::Shl, X, ShiftDiff);
+ NewShl->setHasNoSignedWrap(true);
+ return NewShl;
+ }
+ }
+
+ if (match(Op0, m_AShr(m_Value(X), m_APInt(ShOp1)))) {
+ unsigned AmtSum = ShAmt + ShOp1->getZExtValue();
+ // Oversized arithmetic shifts replicate the sign bit.
+ AmtSum = std::min(AmtSum, BitWidth - 1);
+ // (X >>s C1) >>s C2 --> X >>s (C1 + C2)
+ return BinaryOperator::CreateAShr(X, ConstantInt::get(Ty, AmtSum));
}
// If the shifted-out value is known-zero, then this is an exact shift.
if (!I.isExact() &&
- MaskedValueIsZero(Op0, APInt::getLowBitsSet(Op1C->getBitWidth(), ShAmt),
- 0, &I)) {
+ MaskedValueIsZero(Op0, APInt::getLowBitsSet(BitWidth, ShAmt), 0, &I)) {
I.setIsExact();
return &I;
}
}
// See if we can turn a signed shr into an unsigned shr.
- if (MaskedValueIsZero(Op0,
- APInt::getSignBit(I.getType()->getScalarSizeInBits()),
- 0, &I))
+ if (MaskedValueIsZero(Op0, APInt::getSignBit(BitWidth), 0, &I))
return BinaryOperator::CreateLShr(Op0, Op1);
return nullptr;
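
A check of the sign-extension idiom above (plain C++; assumes two's-complement int32_t with an arithmetic right shift, which every LLVM target provides and C++20 guarantees): shifting a zero-extended i8 up by 24 and arithmetically back down reproduces sext.

#include <cassert>
#include <cstdint>

int main() {
  for (int V = -128; V <= 127; ++V) {
    int8_t X = (int8_t)V;
    uint32_t Z = (uint8_t)X;              // zext i8 X to i32
    int32_t R = (int32_t)(Z << 24) >> 24; // shl 24, then ashr 24
    assert(R == (int32_t)X);              // == sext i8 X to i32
  }
  return 0;
}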
diff --git a/lib/Transforms/InstCombine/InstCombineSimplifyDemanded.cpp b/lib/Transforms/InstCombine/InstCombineSimplifyDemanded.cpp
index 8b930bd95dfe..4e6f02058d83 100644
--- a/lib/Transforms/InstCombine/InstCombineSimplifyDemanded.cpp
+++ b/lib/Transforms/InstCombine/InstCombineSimplifyDemanded.cpp
@@ -30,18 +30,20 @@ static bool ShrinkDemandedConstant(Instruction *I, unsigned OpNo,
assert(I && "No instruction?");
assert(OpNo < I->getNumOperands() && "Operand index too large");
- // If the operand is not a constant integer, nothing to do.
- ConstantInt *OpC = dyn_cast<ConstantInt>(I->getOperand(OpNo));
- if (!OpC) return false;
+  // The operand must be a constant integer or a splat of a constant integer.
+ Value *Op = I->getOperand(OpNo);
+ const APInt *C;
+ if (!match(Op, m_APInt(C)))
+ return false;
// If there are no bits set that aren't demanded, nothing to do.
- Demanded = Demanded.zextOrTrunc(OpC->getValue().getBitWidth());
- if ((~Demanded & OpC->getValue()) == 0)
+ Demanded = Demanded.zextOrTrunc(C->getBitWidth());
+ if ((~Demanded & *C) == 0)
return false;
// This instruction is producing bits that are not demanded. Shrink the RHS.
- Demanded &= OpC->getValue();
- I->setOperand(OpNo, ConstantInt::get(OpC->getType(), Demanded));
+ Demanded &= *C;
+ I->setOperand(OpNo, ConstantInt::get(Op->getType(), Demanded));
return true;
}
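
A toy version (plain C++, illustrative constants) of the shrink above: constant bits outside the demanded mask cannot affect any demanded result bit, so the constant can be replaced by its masked value.

#include <cassert>
#include <cstdint>

int main() {
  const uint32_t C = 0x00FF00FFu;
  const uint32_t Demanded = 0x000000FFu; // caller only uses the low byte
  const uint32_t Shrunk = C & Demanded;  // 0xFF: simpler, equivalent RHS
  for (uint32_t X = 0; X < 0x20000; ++X)
    assert(((X & C) & Demanded) == ((X & Shrunk) & Demanded));
  return 0;
}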
@@ -66,12 +68,13 @@ bool InstCombiner::SimplifyDemandedInstructionBits(Instruction &Inst) {
/// This form of SimplifyDemandedBits simplifies the specified instruction
/// operand if possible, updating it in place. It returns true if it made any
/// change and false otherwise.
-bool InstCombiner::SimplifyDemandedBits(Use &U, const APInt &DemandedMask,
+bool InstCombiner::SimplifyDemandedBits(Instruction *I, unsigned OpNo,
+ const APInt &DemandedMask,
APInt &KnownZero, APInt &KnownOne,
unsigned Depth) {
- auto *UserI = dyn_cast<Instruction>(U.getUser());
+ Use &U = I->getOperandUse(OpNo);
Value *NewVal = SimplifyDemandedUseBits(U.get(), DemandedMask, KnownZero,
- KnownOne, Depth, UserI);
+ KnownOne, Depth, I);
if (!NewVal) return false;
U = NewVal;
return true;
@@ -114,9 +117,10 @@ Value *InstCombiner::SimplifyDemandedUseBits(Value *V, APInt DemandedMask,
KnownOne.getBitWidth() == BitWidth &&
"Value *V, DemandedMask, KnownZero and KnownOne "
"must have same BitWidth");
- if (ConstantInt *CI = dyn_cast<ConstantInt>(V)) {
- // We know all of the bits for a constant!
- KnownOne = CI->getValue() & DemandedMask;
+ const APInt *C;
+ if (match(V, m_APInt(C))) {
+ // We know all of the bits for a scalar constant or a splat vector constant!
+ KnownOne = *C & DemandedMask;
KnownZero = ~KnownOne & DemandedMask;
return nullptr;
}
@@ -138,9 +142,6 @@ Value *InstCombiner::SimplifyDemandedUseBits(Value *V, APInt DemandedMask,
if (Depth == 6) // Limit search depth.
return nullptr;
- APInt LHSKnownZero(BitWidth, 0), LHSKnownOne(BitWidth, 0);
- APInt RHSKnownZero(BitWidth, 0), RHSKnownOne(BitWidth, 0);
-
Instruction *I = dyn_cast<Instruction>(V);
if (!I) {
computeKnownBits(V, KnownZero, KnownOne, Depth, CxtI);
@@ -151,107 +152,43 @@ Value *InstCombiner::SimplifyDemandedUseBits(Value *V, APInt DemandedMask,
// we can't do any simplifications of the operands, because DemandedMask
// only reflects the bits demanded by *one* of the users.
if (Depth != 0 && !I->hasOneUse()) {
- // Despite the fact that we can't simplify this instruction in all User's
- // context, we can at least compute the knownzero/knownone bits, and we can
- // do simplifications that apply to *just* the one user if we know that
- // this instruction has a simpler value in that context.
- if (I->getOpcode() == Instruction::And) {
- // If either the LHS or the RHS are Zero, the result is zero.
- computeKnownBits(I->getOperand(1), RHSKnownZero, RHSKnownOne, Depth + 1,
- CxtI);
- computeKnownBits(I->getOperand(0), LHSKnownZero, LHSKnownOne, Depth + 1,
- CxtI);
-
- // If all of the demanded bits are known 1 on one side, return the other.
- // These bits cannot contribute to the result of the 'and' in this
- // context.
- if ((DemandedMask & ~LHSKnownZero & RHSKnownOne) ==
- (DemandedMask & ~LHSKnownZero))
- return I->getOperand(0);
- if ((DemandedMask & ~RHSKnownZero & LHSKnownOne) ==
- (DemandedMask & ~RHSKnownZero))
- return I->getOperand(1);
-
- // If all of the demanded bits in the inputs are known zeros, return zero.
- if ((DemandedMask & (RHSKnownZero|LHSKnownZero)) == DemandedMask)
- return Constant::getNullValue(VTy);
-
- } else if (I->getOpcode() == Instruction::Or) {
- // We can simplify (X|Y) -> X or Y in the user's context if we know that
- // only bits from X or Y are demanded.
-
- // If either the LHS or the RHS are One, the result is One.
- computeKnownBits(I->getOperand(1), RHSKnownZero, RHSKnownOne, Depth + 1,
- CxtI);
- computeKnownBits(I->getOperand(0), LHSKnownZero, LHSKnownOne, Depth + 1,
- CxtI);
-
- // If all of the demanded bits are known zero on one side, return the
- // other. These bits cannot contribute to the result of the 'or' in this
- // context.
- if ((DemandedMask & ~LHSKnownOne & RHSKnownZero) ==
- (DemandedMask & ~LHSKnownOne))
- return I->getOperand(0);
- if ((DemandedMask & ~RHSKnownOne & LHSKnownZero) ==
- (DemandedMask & ~RHSKnownOne))
- return I->getOperand(1);
-
- // If all of the potentially set bits on one side are known to be set on
- // the other side, just use the 'other' side.
- if ((DemandedMask & (~RHSKnownZero) & LHSKnownOne) ==
- (DemandedMask & (~RHSKnownZero)))
- return I->getOperand(0);
- if ((DemandedMask & (~LHSKnownZero) & RHSKnownOne) ==
- (DemandedMask & (~LHSKnownZero)))
- return I->getOperand(1);
- } else if (I->getOpcode() == Instruction::Xor) {
- // We can simplify (X^Y) -> X or Y in the user's context if we know that
- // only bits from X or Y are demanded.
-
- computeKnownBits(I->getOperand(1), RHSKnownZero, RHSKnownOne, Depth + 1,
- CxtI);
- computeKnownBits(I->getOperand(0), LHSKnownZero, LHSKnownOne, Depth + 1,
- CxtI);
-
- // If all of the demanded bits are known zero on one side, return the
- // other.
- if ((DemandedMask & RHSKnownZero) == DemandedMask)
- return I->getOperand(0);
- if ((DemandedMask & LHSKnownZero) == DemandedMask)
- return I->getOperand(1);
- }
-
- // Compute the KnownZero/KnownOne bits to simplify things downstream.
- computeKnownBits(I, KnownZero, KnownOne, Depth, CxtI);
- return nullptr;
+ return SimplifyMultipleUseDemandedBits(I, DemandedMask, KnownZero, KnownOne,
+ Depth, CxtI);
}
+ APInt LHSKnownZero(BitWidth, 0), LHSKnownOne(BitWidth, 0);
+ APInt RHSKnownZero(BitWidth, 0), RHSKnownOne(BitWidth, 0);
+
 // If this is the root being simplified, allow it to have multiple uses;
// just set the DemandedMask to all bits so that we can try to simplify the
// operands. This allows visitTruncInst (for example) to simplify the
// operand of a trunc without duplicating all the logic below.
if (Depth == 0 && !V->hasOneUse())
- DemandedMask = APInt::getAllOnesValue(BitWidth);
+ DemandedMask.setAllBits();
switch (I->getOpcode()) {
default:
computeKnownBits(I, KnownZero, KnownOne, Depth, CxtI);
break;
- case Instruction::And:
+ case Instruction::And: {
// If either the LHS or the RHS are Zero, the result is zero.
- if (SimplifyDemandedBits(I->getOperandUse(1), DemandedMask, RHSKnownZero,
- RHSKnownOne, Depth + 1) ||
- SimplifyDemandedBits(I->getOperandUse(0), DemandedMask & ~RHSKnownZero,
- LHSKnownZero, LHSKnownOne, Depth + 1))
+ if (SimplifyDemandedBits(I, 1, DemandedMask, RHSKnownZero, RHSKnownOne,
+ Depth + 1) ||
+ SimplifyDemandedBits(I, 0, DemandedMask & ~RHSKnownZero, LHSKnownZero,
+ LHSKnownOne, Depth + 1))
return I;
assert(!(RHSKnownZero & RHSKnownOne) && "Bits known to be one AND zero?");
assert(!(LHSKnownZero & LHSKnownOne) && "Bits known to be one AND zero?");
+  // Output known-0 bits are known to be clear if zero in either the LHS or RHS.
+ APInt IKnownZero = RHSKnownZero | LHSKnownZero;
+ // Output known-1 bits are only known if set in both the LHS & RHS.
+ APInt IKnownOne = RHSKnownOne & LHSKnownOne;
+
// If the client is only demanding bits that we know, return the known
// constant.
- if ((DemandedMask & ((RHSKnownZero | LHSKnownZero)|
- (RHSKnownOne & LHSKnownOne))) == DemandedMask)
- return Constant::getIntegerValue(VTy, RHSKnownOne & LHSKnownOne);
+ if ((DemandedMask & (IKnownZero|IKnownOne)) == DemandedMask)
+ return Constant::getIntegerValue(VTy, IKnownOne);
// If all of the demanded bits are known 1 on one side, return the other.
// These bits cannot contribute to the result of the 'and'.
@@ -262,34 +199,33 @@ Value *InstCombiner::SimplifyDemandedUseBits(Value *V, APInt DemandedMask,
(DemandedMask & ~RHSKnownZero))
return I->getOperand(1);
- // If all of the demanded bits in the inputs are known zeros, return zero.
- if ((DemandedMask & (RHSKnownZero|LHSKnownZero)) == DemandedMask)
- return Constant::getNullValue(VTy);
-
// If the RHS is a constant, see if we can simplify it.
if (ShrinkDemandedConstant(I, 1, DemandedMask & ~LHSKnownZero))
return I;
- // Output known-1 bits are only known if set in both the LHS & RHS.
- KnownOne = RHSKnownOne & LHSKnownOne;
- // Output known-0 are known to be clear if zero in either the LHS | RHS.
- KnownZero = RHSKnownZero | LHSKnownZero;
+ KnownZero = std::move(IKnownZero);
+ KnownOne = std::move(IKnownOne);
break;
- case Instruction::Or:
+ }
+ case Instruction::Or: {
// If either the LHS or the RHS are One, the result is One.
- if (SimplifyDemandedBits(I->getOperandUse(1), DemandedMask, RHSKnownZero,
- RHSKnownOne, Depth + 1) ||
- SimplifyDemandedBits(I->getOperandUse(0), DemandedMask & ~RHSKnownOne,
- LHSKnownZero, LHSKnownOne, Depth + 1))
+ if (SimplifyDemandedBits(I, 1, DemandedMask, RHSKnownZero, RHSKnownOne,
+ Depth + 1) ||
+ SimplifyDemandedBits(I, 0, DemandedMask & ~RHSKnownOne, LHSKnownZero,
+ LHSKnownOne, Depth + 1))
return I;
assert(!(RHSKnownZero & RHSKnownOne) && "Bits known to be one AND zero?");
assert(!(LHSKnownZero & LHSKnownOne) && "Bits known to be one AND zero?");
+ // Output known-0 bits are only known if clear in both the LHS & RHS.
+ APInt IKnownZero = RHSKnownZero & LHSKnownZero;
+  // Output known-1 bits are known to be set if set in either the LHS or RHS.
+ APInt IKnownOne = RHSKnownOne | LHSKnownOne;
+
// If the client is only demanding bits that we know, return the known
// constant.
- if ((DemandedMask & ((RHSKnownZero & LHSKnownZero)|
- (RHSKnownOne | LHSKnownOne))) == DemandedMask)
- return Constant::getIntegerValue(VTy, RHSKnownOne | LHSKnownOne);
+ if ((DemandedMask & (IKnownZero|IKnownOne)) == DemandedMask)
+ return Constant::getIntegerValue(VTy, IKnownOne);
// If all of the demanded bits are known zero on one side, return the other.
// These bits cannot contribute to the result of the 'or'.
@@ -313,16 +249,15 @@ Value *InstCombiner::SimplifyDemandedUseBits(Value *V, APInt DemandedMask,
if (ShrinkDemandedConstant(I, 1, DemandedMask))
return I;
- // Output known-0 bits are only known if clear in both the LHS & RHS.
- KnownZero = RHSKnownZero & LHSKnownZero;
- // Output known-1 are known to be set if set in either the LHS | RHS.
- KnownOne = RHSKnownOne | LHSKnownOne;
+ KnownZero = std::move(IKnownZero);
+ KnownOne = std::move(IKnownOne);
break;
+ }
case Instruction::Xor: {
- if (SimplifyDemandedBits(I->getOperandUse(1), DemandedMask, RHSKnownZero,
- RHSKnownOne, Depth + 1) ||
- SimplifyDemandedBits(I->getOperandUse(0), DemandedMask, LHSKnownZero,
- LHSKnownOne, Depth + 1))
+ if (SimplifyDemandedBits(I, 1, DemandedMask, RHSKnownZero, RHSKnownOne,
+ Depth + 1) ||
+ SimplifyDemandedBits(I, 0, DemandedMask, LHSKnownZero, LHSKnownOne,
+ Depth + 1))
return I;
assert(!(RHSKnownZero & RHSKnownOne) && "Bits known to be one AND zero?");
assert(!(LHSKnownZero & LHSKnownOne) && "Bits known to be one AND zero?");
@@ -400,9 +335,9 @@ Value *InstCombiner::SimplifyDemandedUseBits(Value *V, APInt DemandedMask,
}
// Output known-0 bits are known if clear or set in both the LHS & RHS.
- KnownZero= (RHSKnownZero & LHSKnownZero) | (RHSKnownOne & LHSKnownOne);
+ KnownZero = std::move(IKnownZero);
    // Output known-1 bits are known to be set if set in only one of the LHS or RHS.
- KnownOne = (RHSKnownZero & LHSKnownOne) | (RHSKnownOne & LHSKnownZero);
+ KnownOne = std::move(IKnownOne);
break;
}
case Instruction::Select:
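
A minimal model (plain C++, 4-bit values for brevity; the Known struct is a stand-in for LLVM's known-bits pair) of the rules used in the And/Or cases above: AND knows a zero where either input is zero and a one only where both are, and OR is the dual. The loop enumerates every concrete value consistent with the assumed facts.

#include <cassert>
#include <cstdint>

struct Known { uint32_t Zero, One; }; // disjoint known-0 / known-1 masks

static bool consistent(uint32_t V, Known K) {
  return (V & K.Zero) == 0 && (V & K.One) == K.One;
}

int main() {
  Known L = {0b0100, 0b1000}; // LHS: bit 3 is 1, bit 2 is 0, rest unknown
  Known R = {0b0001, 0b1000}; // RHS: bit 3 is 1, bit 0 is 0, rest unknown
  Known And = {L.Zero | R.Zero, L.One & R.One}; // rules from the code above
  Known Or = {L.Zero & R.Zero, L.One | R.One};
  for (uint32_t A = 0; A < 16; ++A)
    for (uint32_t B = 0; B < 16; ++B) {
      if (!consistent(A, L) || !consistent(B, R))
        continue;
      assert(consistent(A & B, And)); // derived facts hold for AND
      assert(consistent(A | B, Or));  // and for OR
    }
  return 0;
}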
@@ -412,10 +347,10 @@ Value *InstCombiner::SimplifyDemandedUseBits(Value *V, APInt DemandedMask,
if (matchSelectPattern(I, LHS, RHS).Flavor != SPF_UNKNOWN)
return nullptr;
- if (SimplifyDemandedBits(I->getOperandUse(2), DemandedMask, RHSKnownZero,
- RHSKnownOne, Depth + 1) ||
- SimplifyDemandedBits(I->getOperandUse(1), DemandedMask, LHSKnownZero,
- LHSKnownOne, Depth + 1))
+ if (SimplifyDemandedBits(I, 2, DemandedMask, RHSKnownZero, RHSKnownOne,
+ Depth + 1) ||
+ SimplifyDemandedBits(I, 1, DemandedMask, LHSKnownZero, LHSKnownOne,
+ Depth + 1))
return I;
assert(!(RHSKnownZero & RHSKnownOne) && "Bits known to be one AND zero?");
assert(!(LHSKnownZero & LHSKnownOne) && "Bits known to be one AND zero?");
@@ -434,8 +369,8 @@ Value *InstCombiner::SimplifyDemandedUseBits(Value *V, APInt DemandedMask,
DemandedMask = DemandedMask.zext(truncBf);
KnownZero = KnownZero.zext(truncBf);
KnownOne = KnownOne.zext(truncBf);
- if (SimplifyDemandedBits(I->getOperandUse(0), DemandedMask, KnownZero,
- KnownOne, Depth + 1))
+ if (SimplifyDemandedBits(I, 0, DemandedMask, KnownZero, KnownOne,
+ Depth + 1))
return I;
DemandedMask = DemandedMask.trunc(BitWidth);
KnownZero = KnownZero.trunc(BitWidth);
@@ -460,8 +395,8 @@ Value *InstCombiner::SimplifyDemandedUseBits(Value *V, APInt DemandedMask,
// Don't touch a vector-to-scalar bitcast.
return nullptr;
- if (SimplifyDemandedBits(I->getOperandUse(0), DemandedMask, KnownZero,
- KnownOne, Depth + 1))
+ if (SimplifyDemandedBits(I, 0, DemandedMask, KnownZero, KnownOne,
+ Depth + 1))
return I;
assert(!(KnownZero & KnownOne) && "Bits known to be one AND zero?");
break;
@@ -472,15 +407,15 @@ Value *InstCombiner::SimplifyDemandedUseBits(Value *V, APInt DemandedMask,
DemandedMask = DemandedMask.trunc(SrcBitWidth);
KnownZero = KnownZero.trunc(SrcBitWidth);
KnownOne = KnownOne.trunc(SrcBitWidth);
- if (SimplifyDemandedBits(I->getOperandUse(0), DemandedMask, KnownZero,
- KnownOne, Depth + 1))
+ if (SimplifyDemandedBits(I, 0, DemandedMask, KnownZero, KnownOne,
+ Depth + 1))
return I;
DemandedMask = DemandedMask.zext(BitWidth);
KnownZero = KnownZero.zext(BitWidth);
KnownOne = KnownOne.zext(BitWidth);
assert(!(KnownZero & KnownOne) && "Bits known to be one AND zero?");
// The top bits are known to be zero.
- KnownZero |= APInt::getHighBitsSet(BitWidth, BitWidth - SrcBitWidth);
+ KnownZero.setBitsFrom(SrcBitWidth);
break;
}
case Instruction::SExt: {
@@ -490,7 +425,7 @@ Value *InstCombiner::SimplifyDemandedUseBits(Value *V, APInt DemandedMask,
APInt InputDemandedBits = DemandedMask &
APInt::getLowBitsSet(BitWidth, SrcBitWidth);
- APInt NewBits(APInt::getHighBitsSet(BitWidth, BitWidth - SrcBitWidth));
+ APInt NewBits(APInt::getBitsSetFrom(BitWidth, SrcBitWidth));
// If any of the sign extended bits are demanded, we know that the sign
// bit is demanded.
if ((NewBits & DemandedMask) != 0)
@@ -499,8 +434,8 @@ Value *InstCombiner::SimplifyDemandedUseBits(Value *V, APInt DemandedMask,
InputDemandedBits = InputDemandedBits.trunc(SrcBitWidth);
KnownZero = KnownZero.trunc(SrcBitWidth);
KnownOne = KnownOne.trunc(SrcBitWidth);
- if (SimplifyDemandedBits(I->getOperandUse(0), InputDemandedBits, KnownZero,
- KnownOne, Depth + 1))
+ if (SimplifyDemandedBits(I, 0, InputDemandedBits, KnownZero, KnownOne,
+ Depth + 1))
return I;
InputDemandedBits = InputDemandedBits.zext(BitWidth);
KnownZero = KnownZero.zext(BitWidth);
@@ -530,11 +465,12 @@ Value *InstCombiner::SimplifyDemandedUseBits(Value *V, APInt DemandedMask,
// Right fill the mask of bits for this ADD/SUB to demand the most
// significant bit and all those below it.
APInt DemandedFromOps(APInt::getLowBitsSet(BitWidth, BitWidth-NLZ));
- if (SimplifyDemandedBits(I->getOperandUse(0), DemandedFromOps,
- LHSKnownZero, LHSKnownOne, Depth + 1) ||
+ if (ShrinkDemandedConstant(I, 0, DemandedFromOps) ||
+ SimplifyDemandedBits(I, 0, DemandedFromOps, LHSKnownZero, LHSKnownOne,
+ Depth + 1) ||
ShrinkDemandedConstant(I, 1, DemandedFromOps) ||
- SimplifyDemandedBits(I->getOperandUse(1), DemandedFromOps,
- LHSKnownZero, LHSKnownOne, Depth + 1)) {
+ SimplifyDemandedBits(I, 1, DemandedFromOps, RHSKnownZero, RHSKnownOne,
+ Depth + 1)) {
// Disable the nsw and nuw flags here: We can no longer guarantee that
// we won't wrap after simplification. Removing the nsw/nuw flags is
// legal here because the top bit is not demanded.
@@ -543,6 +479,15 @@ Value *InstCombiner::SimplifyDemandedUseBits(Value *V, APInt DemandedMask,
BinOP.setHasNoUnsignedWrap(false);
return I;
}
+
+ // If we are known to be adding/subtracting zeros to every bit below
+ // the highest demanded bit, we just return the other side.
+ if ((DemandedFromOps & RHSKnownZero) == DemandedFromOps)
+ return I->getOperand(0);
+    // We can't do this with the LHS for subtraction, because sub is not
+    // commutative.
+ if (I->getOpcode() == Instruction::Add &&
+ (DemandedFromOps & LHSKnownZero) == DemandedFromOps)
+ return I->getOperand(1);
}
// Otherwise just hand the add/sub off to computeKnownBits to fill in
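
A minimal sketch of the early-exit added above, with illustrative masks (hypothetical values, not code from this patch): when only the low 8 bits of an add are demanded and the RHS is known to be zero in those bits, the add collapses to its LHS.

#include "llvm/ADT/APInt.h"
using llvm::APInt;

bool rhsContributesNothing() {
  unsigned BitWidth = 32;
  // Only the low byte of the result is demanded (NLZ == 24).
  APInt DemandedFromOps = APInt::getLowBitsSet(BitWidth, 8);
  // Suppose the RHS is "shl i32 %t, 8": its low byte is known zero.
  APInt RHSKnownZero = APInt::getLowBitsSet(BitWidth, 8);
  // Mirrors the check above; true means "return I->getOperand(0)".
  return (DemandedFromOps & RHSKnownZero) == DemandedFromOps;
}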
@@ -569,19 +514,19 @@ Value *InstCombiner::SimplifyDemandedUseBits(Value *V, APInt DemandedMask,
// If the shift is NUW/NSW, then it does demand the high bits.
ShlOperator *IOp = cast<ShlOperator>(I);
if (IOp->hasNoSignedWrap())
- DemandedMaskIn |= APInt::getHighBitsSet(BitWidth, ShiftAmt+1);
+ DemandedMaskIn.setHighBits(ShiftAmt+1);
else if (IOp->hasNoUnsignedWrap())
- DemandedMaskIn |= APInt::getHighBitsSet(BitWidth, ShiftAmt);
+ DemandedMaskIn.setHighBits(ShiftAmt);
- if (SimplifyDemandedBits(I->getOperandUse(0), DemandedMaskIn, KnownZero,
- KnownOne, Depth + 1))
+ if (SimplifyDemandedBits(I, 0, DemandedMaskIn, KnownZero, KnownOne,
+ Depth + 1))
return I;
assert(!(KnownZero & KnownOne) && "Bits known to be one AND zero?");
KnownZero <<= ShiftAmt;
KnownOne <<= ShiftAmt;
// low bits known zero.
if (ShiftAmt)
- KnownZero |= APInt::getLowBitsSet(BitWidth, ShiftAmt);
+ KnownZero.setLowBits(ShiftAmt);
}
break;
case Instruction::LShr:
@@ -595,19 +540,16 @@ Value *InstCombiner::SimplifyDemandedUseBits(Value *V, APInt DemandedMask,
// If the shift is exact, then it does demand the low bits (and knows that
// they are zero).
if (cast<LShrOperator>(I)->isExact())
- DemandedMaskIn |= APInt::getLowBitsSet(BitWidth, ShiftAmt);
+ DemandedMaskIn.setLowBits(ShiftAmt);
- if (SimplifyDemandedBits(I->getOperandUse(0), DemandedMaskIn, KnownZero,
- KnownOne, Depth + 1))
+ if (SimplifyDemandedBits(I, 0, DemandedMaskIn, KnownZero, KnownOne,
+ Depth + 1))
return I;
assert(!(KnownZero & KnownOne) && "Bits known to be one AND zero?");
- KnownZero = APIntOps::lshr(KnownZero, ShiftAmt);
- KnownOne = APIntOps::lshr(KnownOne, ShiftAmt);
- if (ShiftAmt) {
- // Compute the new bits that are at the top now.
- APInt HighBits(APInt::getHighBitsSet(BitWidth, ShiftAmt));
- KnownZero |= HighBits; // high bits known zero.
- }
+ KnownZero = KnownZero.lshr(ShiftAmt);
+ KnownOne = KnownOne.lshr(ShiftAmt);
+ if (ShiftAmt)
+ KnownZero.setHighBits(ShiftAmt); // high bits known zero.
}
break;
case Instruction::AShr:
@@ -635,26 +577,26 @@ Value *InstCombiner::SimplifyDemandedUseBits(Value *V, APInt DemandedMask,
// If any of the "high bits" are demanded, we should set the sign bit as
// demanded.
if (DemandedMask.countLeadingZeros() <= ShiftAmt)
- DemandedMaskIn.setBit(BitWidth-1);
+ DemandedMaskIn.setSignBit();
// If the shift is exact, then it does demand the low bits (and knows that
// they are zero).
if (cast<AShrOperator>(I)->isExact())
- DemandedMaskIn |= APInt::getLowBitsSet(BitWidth, ShiftAmt);
+ DemandedMaskIn.setLowBits(ShiftAmt);
- if (SimplifyDemandedBits(I->getOperandUse(0), DemandedMaskIn, KnownZero,
- KnownOne, Depth + 1))
+ if (SimplifyDemandedBits(I, 0, DemandedMaskIn, KnownZero, KnownOne,
+ Depth + 1))
return I;
assert(!(KnownZero & KnownOne) && "Bits known to be one AND zero?");
// Compute the new bits that are at the top now.
APInt HighBits(APInt::getHighBitsSet(BitWidth, ShiftAmt));
- KnownZero = APIntOps::lshr(KnownZero, ShiftAmt);
- KnownOne = APIntOps::lshr(KnownOne, ShiftAmt);
+ KnownZero = KnownZero.lshr(ShiftAmt);
+ KnownOne = KnownOne.lshr(ShiftAmt);
// Handle the sign bits.
APInt SignBit(APInt::getSignBit(BitWidth));
// Adjust to where it is now in the mask.
- SignBit = APIntOps::lshr(SignBit, ShiftAmt);
+ SignBit = SignBit.lshr(ShiftAmt);
// If the input sign bit is known to be zero, or if none of the top bits
// are demanded, turn this into an unsigned shift right.
@@ -683,8 +625,8 @@ Value *InstCombiner::SimplifyDemandedUseBits(Value *V, APInt DemandedMask,
APInt LowBits = RA - 1;
APInt Mask2 = LowBits | APInt::getSignBit(BitWidth);
- if (SimplifyDemandedBits(I->getOperandUse(0), Mask2, LHSKnownZero,
- LHSKnownOne, Depth + 1))
+ if (SimplifyDemandedBits(I, 0, Mask2, LHSKnownZero, LHSKnownOne,
+ Depth + 1))
return I;
// The low bits of LHS are unchanged by the srem.
@@ -693,12 +635,12 @@ Value *InstCombiner::SimplifyDemandedUseBits(Value *V, APInt DemandedMask,
// If LHS is non-negative or has all low bits zero, then the upper bits
// are all zero.
- if (LHSKnownZero[BitWidth-1] || ((LHSKnownZero & LowBits) == LowBits))
+ if (LHSKnownZero.isNegative() || ((LHSKnownZero & LowBits) == LowBits))
KnownZero |= ~LowBits;
// If LHS is negative and not all low bits are zero, then the upper bits
// are all one.
- if (LHSKnownOne[BitWidth-1] && ((LHSKnownOne & LowBits) != 0))
+ if (LHSKnownOne.isNegative() && ((LHSKnownOne & LowBits) != 0))
KnownOne |= ~LowBits;
assert(!(KnownZero & KnownOne) && "Bits known to be one AND zero?");
@@ -713,21 +655,17 @@ Value *InstCombiner::SimplifyDemandedUseBits(Value *V, APInt DemandedMask,
CxtI);
// If it's known zero, our sign bit is also zero.
if (LHSKnownZero.isNegative())
- KnownZero.setBit(KnownZero.getBitWidth() - 1);
+ KnownZero.setSignBit();
}
break;
case Instruction::URem: {
APInt KnownZero2(BitWidth, 0), KnownOne2(BitWidth, 0);
APInt AllOnes = APInt::getAllOnesValue(BitWidth);
- if (SimplifyDemandedBits(I->getOperandUse(0), AllOnes, KnownZero2,
- KnownOne2, Depth + 1) ||
- SimplifyDemandedBits(I->getOperandUse(1), AllOnes, KnownZero2,
- KnownOne2, Depth + 1))
+ if (SimplifyDemandedBits(I, 0, AllOnes, KnownZero2, KnownOne2, Depth + 1) ||
+ SimplifyDemandedBits(I, 1, AllOnes, KnownZero2, KnownOne2, Depth + 1))
return I;
unsigned Leaders = KnownZero2.countLeadingOnes();
- Leaders = std::max(Leaders,
- KnownZero2.countLeadingOnes());
KnownZero = APInt::getHighBitsSet(BitWidth, Leaders) & DemandedMask;
break;
}
@@ -792,11 +730,11 @@ Value *InstCombiner::SimplifyDemandedUseBits(Value *V, APInt DemandedMask,
return ConstantInt::getNullValue(VTy);
// We know that the upper bits are set to zero.
- KnownZero = APInt::getHighBitsSet(BitWidth, BitWidth - ArgWidth);
+ KnownZero.setBitsFrom(ArgWidth);
return nullptr;
}
case Intrinsic::x86_sse42_crc32_64_64:
- KnownZero = APInt::getHighBitsSet(64, 32);
+ KnownZero.setBitsFrom(32);
return nullptr;
}
}
@@ -811,6 +749,150 @@ Value *InstCombiner::SimplifyDemandedUseBits(Value *V, APInt DemandedMask,
return nullptr;
}
+/// Helper routine of SimplifyDemandedUseBits. It computes KnownZero/KnownOne
+/// bits. It also tries to handle simplifications that can be done based on
+/// DemandedMask, but without modifying the Instruction.
+Value *InstCombiner::SimplifyMultipleUseDemandedBits(Instruction *I,
+ const APInt &DemandedMask,
+ APInt &KnownZero,
+ APInt &KnownOne,
+ unsigned Depth,
+ Instruction *CxtI) {
+ unsigned BitWidth = DemandedMask.getBitWidth();
+ Type *ITy = I->getType();
+
+ APInt LHSKnownZero(BitWidth, 0), LHSKnownOne(BitWidth, 0);
+ APInt RHSKnownZero(BitWidth, 0), RHSKnownOne(BitWidth, 0);
+
+  // Even though we can't simplify this instruction in every user's context,
+  // we can at least compute the knownzero/knownone bits, and we can
+ // do simplifications that apply to *just* the one user if we know that
+ // this instruction has a simpler value in that context.
+ switch (I->getOpcode()) {
+ case Instruction::And: {
+ // If either the LHS or the RHS are Zero, the result is zero.
+ computeKnownBits(I->getOperand(1), RHSKnownZero, RHSKnownOne, Depth + 1,
+ CxtI);
+ computeKnownBits(I->getOperand(0), LHSKnownZero, LHSKnownOne, Depth + 1,
+ CxtI);
+
+    // Output known-0 bits are known to be clear if zero in either the LHS | RHS.
+ APInt IKnownZero = RHSKnownZero | LHSKnownZero;
+ // Output known-1 bits are only known if set in both the LHS & RHS.
+ APInt IKnownOne = RHSKnownOne & LHSKnownOne;
+
+ // If the client is only demanding bits that we know, return the known
+ // constant.
+ if ((DemandedMask & (IKnownZero|IKnownOne)) == DemandedMask)
+ return Constant::getIntegerValue(ITy, IKnownOne);
+
+ // If all of the demanded bits are known 1 on one side, return the other.
+ // These bits cannot contribute to the result of the 'and' in this
+ // context.
+ if ((DemandedMask & ~LHSKnownZero & RHSKnownOne) ==
+ (DemandedMask & ~LHSKnownZero))
+ return I->getOperand(0);
+ if ((DemandedMask & ~RHSKnownZero & LHSKnownOne) ==
+ (DemandedMask & ~RHSKnownZero))
+ return I->getOperand(1);
+
+ KnownZero = std::move(IKnownZero);
+ KnownOne = std::move(IKnownOne);
+ break;
+ }
+ case Instruction::Or: {
+ // We can simplify (X|Y) -> X or Y in the user's context if we know that
+ // only bits from X or Y are demanded.
+
+ // If either the LHS or the RHS are One, the result is One.
+ computeKnownBits(I->getOperand(1), RHSKnownZero, RHSKnownOne, Depth + 1,
+ CxtI);
+ computeKnownBits(I->getOperand(0), LHSKnownZero, LHSKnownOne, Depth + 1,
+ CxtI);
+
+ // Output known-0 bits are only known if clear in both the LHS & RHS.
+ APInt IKnownZero = RHSKnownZero & LHSKnownZero;
+    // Output known-1 bits are known to be set if set in either the LHS | RHS.
+ APInt IKnownOne = RHSKnownOne | LHSKnownOne;
+
+ // If the client is only demanding bits that we know, return the known
+ // constant.
+ if ((DemandedMask & (IKnownZero|IKnownOne)) == DemandedMask)
+ return Constant::getIntegerValue(ITy, IKnownOne);
+
+ // If all of the demanded bits are known zero on one side, return the
+ // other. These bits cannot contribute to the result of the 'or' in this
+ // context.
+ if ((DemandedMask & ~LHSKnownOne & RHSKnownZero) ==
+ (DemandedMask & ~LHSKnownOne))
+ return I->getOperand(0);
+ if ((DemandedMask & ~RHSKnownOne & LHSKnownZero) ==
+ (DemandedMask & ~RHSKnownOne))
+ return I->getOperand(1);
+
+ // If all of the potentially set bits on one side are known to be set on
+ // the other side, just use the 'other' side.
+ if ((DemandedMask & (~RHSKnownZero) & LHSKnownOne) ==
+ (DemandedMask & (~RHSKnownZero)))
+ return I->getOperand(0);
+ if ((DemandedMask & (~LHSKnownZero) & RHSKnownOne) ==
+ (DemandedMask & (~LHSKnownZero)))
+ return I->getOperand(1);
+
+ KnownZero = std::move(IKnownZero);
+ KnownOne = std::move(IKnownOne);
+ break;
+ }
+ case Instruction::Xor: {
+ // We can simplify (X^Y) -> X or Y in the user's context if we know that
+ // only bits from X or Y are demanded.
+
+ computeKnownBits(I->getOperand(1), RHSKnownZero, RHSKnownOne, Depth + 1,
+ CxtI);
+ computeKnownBits(I->getOperand(0), LHSKnownZero, LHSKnownOne, Depth + 1,
+ CxtI);
+
+ // Output known-0 bits are known if clear or set in both the LHS & RHS.
+ APInt IKnownZero = (RHSKnownZero & LHSKnownZero) |
+ (RHSKnownOne & LHSKnownOne);
+    // Output known-1 bits are known to be set if set in only one of the LHS, RHS.
+ APInt IKnownOne = (RHSKnownZero & LHSKnownOne) |
+ (RHSKnownOne & LHSKnownZero);
+
+ // If the client is only demanding bits that we know, return the known
+ // constant.
+ if ((DemandedMask & (IKnownZero|IKnownOne)) == DemandedMask)
+ return Constant::getIntegerValue(ITy, IKnownOne);
+
+ // If all of the demanded bits are known zero on one side, return the
+ // other.
+ if ((DemandedMask & RHSKnownZero) == DemandedMask)
+ return I->getOperand(0);
+ if ((DemandedMask & LHSKnownZero) == DemandedMask)
+ return I->getOperand(1);
+
+    KnownZero = std::move(IKnownZero);
+    KnownOne = std::move(IKnownOne);
+ break;
+ }
+ default:
+ // Compute the KnownZero/KnownOne bits to simplify things downstream.
+ computeKnownBits(I, KnownZero, KnownOne, Depth, CxtI);
+
+ // If this user is only demanding bits that we know, return the known
+ // constant.
+ if ((DemandedMask & (KnownZero|KnownOne)) == DemandedMask)
+ return Constant::getIntegerValue(ITy, KnownOne);
+
+ break;
+ }
+
+ return nullptr;
+}
+
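
To make the 'or' early-exits above concrete, here is a small sketch with made-up known-bits values (not from the patch): the RHS of (X | Y) is droppable for this user when every demanded bit that is not already known one on the LHS is known zero on the RHS.

#include "llvm/ADT/APInt.h"
using llvm::APInt;

bool orSimplifiesToLHS() {
  // This user only reads the low nibble of (X | Y).
  APInt DemandedMask(8, 0x0F);
  APInt LHSKnownOne(8, 0x00);  // nothing known one in X
  APInt RHSKnownZero(8, 0xFF); // Y is known to be zero everywhere
  // Same test as above; true means (X | Y) -> X for this user.
  return (DemandedMask & ~LHSKnownOne & RHSKnownZero) ==
         (DemandedMask & ~LHSKnownOne);
}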
/// Helper routine of SimplifyDemandedUseBits. It tries to simplify
/// "E1 = (X lsr C1) << C2", where the C1 and C2 are constant, into
/// "E2 = X << (C2 - C1)" or "E2 = X >> (C1 - C2)", depending on the sign
@@ -849,7 +931,7 @@ Value *InstCombiner::SimplifyShrShlDemandedBits(Instruction *Shr,
unsigned ShrAmt = ShrOp1.getZExtValue();
KnownOne.clearAllBits();
- KnownZero = APInt::getBitsSet(KnownZero.getBitWidth(), 0, ShlAmt-1);
+ KnownZero.setLowBits(ShlAmt - 1);
KnownZero &= DemandedMask;
APInt BitMask1(APInt::getAllOnesValue(BitWidth));
@@ -1472,14 +1554,136 @@ Value *InstCombiner::SimplifyDemandedVectorElts(Value *V, APInt DemandedElts,
break;
}
+ case Intrinsic::x86_sse2_packssdw_128:
+ case Intrinsic::x86_sse2_packsswb_128:
+ case Intrinsic::x86_sse2_packuswb_128:
+ case Intrinsic::x86_sse41_packusdw:
+ case Intrinsic::x86_avx2_packssdw:
+ case Intrinsic::x86_avx2_packsswb:
+ case Intrinsic::x86_avx2_packusdw:
+ case Intrinsic::x86_avx2_packuswb:
+ case Intrinsic::x86_avx512_packssdw_512:
+ case Intrinsic::x86_avx512_packsswb_512:
+ case Intrinsic::x86_avx512_packusdw_512:
+ case Intrinsic::x86_avx512_packuswb_512: {
+ auto *Ty0 = II->getArgOperand(0)->getType();
+ unsigned InnerVWidth = Ty0->getVectorNumElements();
+ assert(VWidth == (InnerVWidth * 2) && "Unexpected input size");
+
+ unsigned NumLanes = Ty0->getPrimitiveSizeInBits() / 128;
+ unsigned VWidthPerLane = VWidth / NumLanes;
+ unsigned InnerVWidthPerLane = InnerVWidth / NumLanes;
+
+ // Per lane, pack the elements of the first input and then the second.
+ // e.g.
+ // v8i16 PACK(v4i32 X, v4i32 Y) - (X[0..3],Y[0..3])
+ // v32i8 PACK(v16i16 X, v16i16 Y) - (X[0..7],Y[0..7]),(X[8..15],Y[8..15])
+ for (int OpNum = 0; OpNum != 2; ++OpNum) {
+ APInt OpDemandedElts(InnerVWidth, 0);
+ for (unsigned Lane = 0; Lane != NumLanes; ++Lane) {
+ unsigned LaneIdx = Lane * VWidthPerLane;
+ for (unsigned Elt = 0; Elt != InnerVWidthPerLane; ++Elt) {
+ unsigned Idx = LaneIdx + Elt + InnerVWidthPerLane * OpNum;
+ if (DemandedElts[Idx])
+ OpDemandedElts.setBit((Lane * InnerVWidthPerLane) + Elt);
+ }
+ }
+
+ // Demand elements from the operand.
+ auto *Op = II->getArgOperand(OpNum);
+ APInt OpUndefElts(InnerVWidth, 0);
+ TmpV = SimplifyDemandedVectorElts(Op, OpDemandedElts, OpUndefElts,
+ Depth + 1);
+ if (TmpV) {
+ II->setArgOperand(OpNum, TmpV);
+ MadeChange = true;
+ }
+
+ // Pack the operand's UNDEF elements, one lane at a time.
+ OpUndefElts = OpUndefElts.zext(VWidth);
+ for (unsigned Lane = 0; Lane != NumLanes; ++Lane) {
+ APInt LaneElts = OpUndefElts.lshr(InnerVWidthPerLane * Lane);
+ LaneElts = LaneElts.getLoBits(InnerVWidthPerLane);
+ LaneElts = LaneElts.shl(InnerVWidthPerLane * (2 * Lane + OpNum));
+ UndefElts |= LaneElts;
+ }
+ }
+ break;
+ }
+
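The index arithmetic in the loop above is easier to check with a standalone sketch (illustrative only) that prints the result-to-operand mapping for the v32i8 PACK(v16i16, v16i16) shape from the comment:

#include <cstdio>

int main() {
  const unsigned VWidth = 32, InnerVWidth = 16, NumLanes = 2;
  const unsigned VWidthPerLane = VWidth / NumLanes;           // 16
  const unsigned InnerVWidthPerLane = InnerVWidth / NumLanes; // 8
  for (unsigned OpNum = 0; OpNum != 2; ++OpNum)
    for (unsigned Lane = 0; Lane != NumLanes; ++Lane)
      for (unsigned Elt = 0; Elt != InnerVWidthPerLane; ++Elt)
        // Same formulas as above: result[Idx] reads opN[lane-local index].
        std::printf("result[%2u] <- op%u[%2u]\n",
                    Lane * VWidthPerLane + Elt + InnerVWidthPerLane * OpNum,
                    OpNum, Lane * InnerVWidthPerLane + Elt);
  return 0;
}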
+ // PSHUFB
+ case Intrinsic::x86_ssse3_pshuf_b_128:
+ case Intrinsic::x86_avx2_pshuf_b:
+ case Intrinsic::x86_avx512_pshuf_b_512:
+ // PERMILVAR
+ case Intrinsic::x86_avx_vpermilvar_ps:
+ case Intrinsic::x86_avx_vpermilvar_ps_256:
+ case Intrinsic::x86_avx512_vpermilvar_ps_512:
+ case Intrinsic::x86_avx_vpermilvar_pd:
+ case Intrinsic::x86_avx_vpermilvar_pd_256:
+ case Intrinsic::x86_avx512_vpermilvar_pd_512:
+ // PERMV
+ case Intrinsic::x86_avx2_permd:
+ case Intrinsic::x86_avx2_permps: {
+ Value *Op1 = II->getArgOperand(1);
+ TmpV = SimplifyDemandedVectorElts(Op1, DemandedElts, UndefElts,
+ Depth + 1);
+ if (TmpV) { II->setArgOperand(1, TmpV); MadeChange = true; }
+ break;
+ }
+
 // SSE4A instructions leave the upper 64 bits of the 128-bit result
// in an undefined state.
case Intrinsic::x86_sse4a_extrq:
case Intrinsic::x86_sse4a_extrqi:
case Intrinsic::x86_sse4a_insertq:
case Intrinsic::x86_sse4a_insertqi:
- UndefElts |= APInt::getHighBitsSet(VWidth, VWidth / 2);
+ UndefElts.setHighBits(VWidth / 2);
break;
+ case Intrinsic::amdgcn_buffer_load:
+ case Intrinsic::amdgcn_buffer_load_format: {
+ if (VWidth == 1 || !DemandedElts.isMask())
+ return nullptr;
+
+    // TODO: Handle 3-element vectors when supported in code gen.
+ unsigned NewNumElts = PowerOf2Ceil(DemandedElts.countTrailingOnes());
+ if (NewNumElts == VWidth)
+ return nullptr;
+
+ Module *M = II->getParent()->getParent()->getParent();
+ Type *EltTy = V->getType()->getVectorElementType();
+
+ Type *NewTy = (NewNumElts == 1) ? EltTy :
+ VectorType::get(EltTy, NewNumElts);
+
+ Function *NewIntrin = Intrinsic::getDeclaration(M, II->getIntrinsicID(),
+ NewTy);
+
+ SmallVector<Value *, 5> Args;
+ for (unsigned I = 0, E = II->getNumArgOperands(); I != E; ++I)
+ Args.push_back(II->getArgOperand(I));
+
+ IRBuilderBase::InsertPointGuard Guard(*Builder);
+ Builder->SetInsertPoint(II);
+
+ CallInst *NewCall = Builder->CreateCall(NewIntrin, Args);
+ NewCall->takeName(II);
+ NewCall->copyMetadata(*II);
+ if (NewNumElts == 1) {
+ return Builder->CreateInsertElement(UndefValue::get(V->getType()),
+ NewCall, static_cast<uint64_t>(0));
+ }
+
+ SmallVector<uint32_t, 8> EltMask;
+ for (unsigned I = 0; I < VWidth; ++I)
+ EltMask.push_back(I);
+
+ Value *Shuffle = Builder->CreateShuffleVector(
+ NewCall, UndefValue::get(NewTy), EltMask);
+
+ MadeChange = true;
+ return Shuffle;
+ }
}
break;
}
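
A concrete (hypothetical) instance of the narrowing above: an amdgcn.buffer.load of <4 x float> whose users only read lanes 0 and 1 becomes a <2 x float> load that is shuffled back to the original type. The mask math reduces to:

#include "llvm/ADT/APInt.h"
#include "llvm/Support/MathExtras.h"
using llvm::APInt;

unsigned narrowedEltCount() {
  APInt DemandedElts(4, 0x3); // lanes 0 and 1 are read
  if (!DemandedElts.isMask()) // must be a contiguous run of low lanes
    return 4;                 // cannot narrow
  return llvm::PowerOf2Ceil(DemandedElts.countTrailingOnes()); // -> 2
}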
diff --git a/lib/Transforms/InstCombine/InstCombineVectorOps.cpp b/lib/Transforms/InstCombine/InstCombineVectorOps.cpp
index b2477f6c8633..e89b400a4afc 100644
--- a/lib/Transforms/InstCombine/InstCombineVectorOps.cpp
+++ b/lib/Transforms/InstCombine/InstCombineVectorOps.cpp
@@ -645,6 +645,36 @@ static Instruction *foldInsSequenceIntoBroadcast(InsertElementInst &InsElt) {
return new ShuffleVectorInst(InsertFirst, UndefValue::get(VT), ZeroMask);
}
+/// If we have an insertelement instruction feeding into another insertelement
+/// and the 2nd is inserting a constant into the vector, canonicalize that
+/// constant insertion before the insertion of a variable:
+///
+/// insertelement (insertelement X, Y, IdxC1), ScalarC, IdxC2 -->
+/// insertelement (insertelement X, ScalarC, IdxC2), Y, IdxC1
+///
+/// This has the potential to eliminate the 2nd insertelement instruction
+/// via constant folding of the scalar constant into a vector constant.
+static Instruction *hoistInsEltConst(InsertElementInst &InsElt2,
+ InstCombiner::BuilderTy &Builder) {
+ auto *InsElt1 = dyn_cast<InsertElementInst>(InsElt2.getOperand(0));
+ if (!InsElt1 || !InsElt1->hasOneUse())
+ return nullptr;
+
+ Value *X, *Y;
+ Constant *ScalarC;
+ ConstantInt *IdxC1, *IdxC2;
+ if (match(InsElt1->getOperand(0), m_Value(X)) &&
+ match(InsElt1->getOperand(1), m_Value(Y)) && !isa<Constant>(Y) &&
+ match(InsElt1->getOperand(2), m_ConstantInt(IdxC1)) &&
+ match(InsElt2.getOperand(1), m_Constant(ScalarC)) &&
+ match(InsElt2.getOperand(2), m_ConstantInt(IdxC2)) && IdxC1 != IdxC2) {
+ Value *NewInsElt1 = Builder.CreateInsertElement(X, ScalarC, IdxC2);
+ return InsertElementInst::Create(NewInsElt1, Y, IdxC1);
+ }
+
+ return nullptr;
+}
+
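For a concrete (hypothetical) IR instance of the pattern above: insertelement (insertelement <4 x i32> %x, i32 %y, i32 0), i32 7, i32 1 becomes insertelement (insertelement <4 x i32> %x, i32 7, i32 1), i32 %y, i32 0, and if %x is itself a constant vector the inner insert then folds away entirely.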
/// insertelt (shufflevector X, CVec, Mask|insertelt X, C1, CIndex1), C, CIndex
/// --> shufflevector X, CVec', Mask'
static Instruction *foldConstantInsEltIntoShuffle(InsertElementInst &InsElt) {
@@ -806,6 +836,9 @@ Instruction *InstCombiner::visitInsertElementInst(InsertElementInst &IE) {
if (Instruction *Shuf = foldConstantInsEltIntoShuffle(IE))
return Shuf;
+ if (Instruction *NewInsElt = hoistInsEltConst(IE, *Builder))
+ return NewInsElt;
+
// Turn a sequence of inserts that broadcasts a scalar into a single
// insert + shufflevector.
if (Instruction *Broadcast = foldInsSequenceIntoBroadcast(IE))
@@ -1107,12 +1140,11 @@ Instruction *InstCombiner::visitShuffleVectorInst(ShuffleVectorInst &SVI) {
SmallVector<int, 16> Mask = SVI.getShuffleMask();
Type *Int32Ty = Type::getInt32Ty(SVI.getContext());
- bool MadeChange = false;
-
- // Undefined shuffle mask -> undefined value.
- if (isa<UndefValue>(SVI.getOperand(2)))
- return replaceInstUsesWith(SVI, UndefValue::get(SVI.getType()));
+ if (auto *V = SimplifyShuffleVectorInst(LHS, RHS, SVI.getMask(),
+ SVI.getType(), DL, &TLI, &DT, &AC))
+ return replaceInstUsesWith(SVI, V);
+ bool MadeChange = false;
unsigned VWidth = SVI.getType()->getVectorNumElements();
APInt UndefElts(VWidth, 0);
@@ -1209,7 +1241,6 @@ Instruction *InstCombiner::visitShuffleVectorInst(ShuffleVectorInst &SVI) {
if (isShuffleExtractingFromLHS(SVI, Mask)) {
Value *V = LHS;
unsigned MaskElems = Mask.size();
- unsigned BegIdx = Mask.front();
VectorType *SrcTy = cast<VectorType>(V->getType());
unsigned VecBitWidth = SrcTy->getBitWidth();
unsigned SrcElemBitWidth = DL.getTypeSizeInBits(SrcTy->getElementType());
@@ -1223,6 +1254,7 @@ Instruction *InstCombiner::visitShuffleVectorInst(ShuffleVectorInst &SVI) {
// Only visit bitcasts that weren't previously handled.
BCs.push_back(BC);
for (BitCastInst *BC : BCs) {
+ unsigned BegIdx = Mask.front();
Type *TgtTy = BC->getDestTy();
unsigned TgtElemBitWidth = DL.getTypeSizeInBits(TgtTy);
if (!TgtElemBitWidth)
diff --git a/lib/Transforms/InstCombine/InstructionCombining.cpp b/lib/Transforms/InstCombine/InstructionCombining.cpp
index 27fc34d23175..88ef17bbc8fa 100644
--- a/lib/Transforms/InstCombine/InstructionCombining.cpp
+++ b/lib/Transforms/InstCombine/InstructionCombining.cpp
@@ -82,18 +82,24 @@ static cl::opt<bool>
EnableExpensiveCombines("expensive-combines",
cl::desc("Enable expensive instruction combines"));
+static cl::opt<unsigned>
+MaxArraySize("instcombine-maxarray-size", cl::init(1024),
+ cl::desc("Maximum array size considered when doing a combine"));
+
Value *InstCombiner::EmitGEPOffset(User *GEP) {
return llvm::EmitGEPOffset(Builder, DL, GEP);
}
/// Return true if it is desirable to convert an integer computation from a
/// given bit width to a new bit width.
-/// We don't want to convert from a legal to an illegal type for example or from
-/// a smaller to a larger illegal type.
-bool InstCombiner::ShouldChangeType(unsigned FromWidth,
+/// We don't want to convert from a legal to an illegal type or from a smaller
+/// to a larger illegal type. A width of '1' is always treated as a legal type
+/// because i1 is a fundamental type in IR, and there are many specialized
+/// optimizations for i1 types.
+bool InstCombiner::shouldChangeType(unsigned FromWidth,
unsigned ToWidth) const {
- bool FromLegal = DL.isLegalInteger(FromWidth);
- bool ToLegal = DL.isLegalInteger(ToWidth);
+ bool FromLegal = FromWidth == 1 || DL.isLegalInteger(FromWidth);
+ bool ToLegal = ToWidth == 1 || DL.isLegalInteger(ToWidth);
// If this is a legal integer from type, and the result would be an illegal
// type, don't do the transformation.
@@ -109,14 +115,16 @@ bool InstCombiner::ShouldChangeType(unsigned FromWidth,
}
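
For example, on a typical target whose datalayout declares only n8:16:32:64 integers as legal, shouldChangeType(32, 1) used to fail because width 1 is not datalayout-legal; with the special case above, i1 is always treated as legal, so conversions involving it are no longer rejected on legality grounds alone.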
/// Return true if it is desirable to convert a computation from 'From' to 'To'.
-/// We don't want to convert from a legal to an illegal type for example or from
-/// a smaller to a larger illegal type.
-bool InstCombiner::ShouldChangeType(Type *From, Type *To) const {
+/// We don't want to convert from a legal to an illegal type or from a smaller
+/// to a larger illegal type. i1 is always treated as a legal type because it is
+/// a fundamental type in IR, and there are many specialized optimizations for
+/// i1 types.
+bool InstCombiner::shouldChangeType(Type *From, Type *To) const {
assert(From->isIntegerTy() && To->isIntegerTy());
unsigned FromWidth = From->getPrimitiveSizeInBits();
unsigned ToWidth = To->getPrimitiveSizeInBits();
- return ShouldChangeType(FromWidth, ToWidth);
+ return shouldChangeType(FromWidth, ToWidth);
}
 // Return true if No Signed Wrap should be maintained for I.
@@ -447,16 +455,11 @@ static bool RightDistributesOverLeft(Instruction::BinaryOps LOp,
/// This function returns the identity value for the given opcode, which can be used to
/// factor patterns like (X * 2) + X ==> (X * 2) + (X * 1) ==> X * (2 + 1).
-static Value *getIdentityValue(Instruction::BinaryOps OpCode, Value *V) {
+static Value *getIdentityValue(Instruction::BinaryOps Opcode, Value *V) {
if (isa<Constant>(V))
return nullptr;
- if (OpCode == Instruction::Mul)
- return ConstantInt::get(V->getType(), 1);
-
- // TODO: We can handle other cases e.g. Instruction::And, Instruction::Or etc.
-
- return nullptr;
+ return ConstantExpr::getBinOpIdentity(Opcode, V->getType());
}
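
A short sketch of what the generalized helper yields (per the ConstantExpr::getBinOpIdentity API as understood here; callers must still handle nullptr for opcodes with no identity, such as the left operand of Sub):

#include "llvm/IR/Constants.h"
#include "llvm/IR/Instruction.h"
using namespace llvm;

Constant *mulIdentity(Type *Int32Ty) {
  // Mul -> 1, Add -> 0; opcodes without a defined identity yield nullptr.
  return ConstantExpr::getBinOpIdentity(Instruction::Mul, Int32Ty);
}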
/// This function factors binary ops which can be combined using distributive
@@ -468,8 +471,7 @@ static Value *getIdentityValue(Instruction::BinaryOps OpCode, Value *V) {
static Instruction::BinaryOps
getBinOpsForFactorization(Instruction::BinaryOps TopLevelOpcode,
BinaryOperator *Op, Value *&LHS, Value *&RHS) {
- if (!Op)
- return Instruction::BinaryOpsEnd;
+ assert(Op && "Expected a binary operator");
LHS = Op->getOperand(0);
RHS = Op->getOperand(1);
@@ -499,11 +501,7 @@ static Value *tryFactorization(InstCombiner::BuilderTy *Builder,
const DataLayout &DL, BinaryOperator &I,
Instruction::BinaryOps InnerOpcode, Value *A,
Value *B, Value *C, Value *D) {
-
- // If any of A, B, C, D are null, we can not factor I, return early.
- // Checking A and C should be enough.
- if (!A || !C || !B || !D)
- return nullptr;
+ assert(A && B && C && D && "All values must be provided");
Value *V = nullptr;
Value *SimplifiedInst = nullptr;
@@ -564,13 +562,11 @@ static Value *tryFactorization(InstCombiner::BuilderTy *Builder,
if (isa<OverflowingBinaryOperator>(&I))
HasNSW = I.hasNoSignedWrap();
- if (BinaryOperator *Op0 = dyn_cast<BinaryOperator>(LHS))
- if (isa<OverflowingBinaryOperator>(Op0))
- HasNSW &= Op0->hasNoSignedWrap();
+ if (auto *LOBO = dyn_cast<OverflowingBinaryOperator>(LHS))
+ HasNSW &= LOBO->hasNoSignedWrap();
- if (BinaryOperator *Op1 = dyn_cast<BinaryOperator>(RHS))
- if (isa<OverflowingBinaryOperator>(Op1))
- HasNSW &= Op1->hasNoSignedWrap();
+ if (auto *ROBO = dyn_cast<OverflowingBinaryOperator>(RHS))
+ HasNSW &= ROBO->hasNoSignedWrap();
// We can propagate 'nsw' if we know that
// %Y = mul nsw i16 %X, C
@@ -599,31 +595,39 @@ Value *InstCombiner::SimplifyUsingDistributiveLaws(BinaryOperator &I) {
Value *LHS = I.getOperand(0), *RHS = I.getOperand(1);
BinaryOperator *Op0 = dyn_cast<BinaryOperator>(LHS);
BinaryOperator *Op1 = dyn_cast<BinaryOperator>(RHS);
+ Instruction::BinaryOps TopLevelOpcode = I.getOpcode();
- // Factorization.
- Value *A = nullptr, *B = nullptr, *C = nullptr, *D = nullptr;
- auto TopLevelOpcode = I.getOpcode();
- auto LHSOpcode = getBinOpsForFactorization(TopLevelOpcode, Op0, A, B);
- auto RHSOpcode = getBinOpsForFactorization(TopLevelOpcode, Op1, C, D);
-
- // The instruction has the form "(A op' B) op (C op' D)". Try to factorize
- // a common term.
- if (LHSOpcode == RHSOpcode) {
- if (Value *V = tryFactorization(Builder, DL, I, LHSOpcode, A, B, C, D))
- return V;
- }
-
- // The instruction has the form "(A op' B) op (C)". Try to factorize common
- // term.
- if (Value *V = tryFactorization(Builder, DL, I, LHSOpcode, A, B, RHS,
- getIdentityValue(LHSOpcode, RHS)))
- return V;
+ {
+ // Factorization.
+ Value *A, *B, *C, *D;
+ Instruction::BinaryOps LHSOpcode, RHSOpcode;
+ if (Op0)
+ LHSOpcode = getBinOpsForFactorization(TopLevelOpcode, Op0, A, B);
+ if (Op1)
+ RHSOpcode = getBinOpsForFactorization(TopLevelOpcode, Op1, C, D);
+
+ // The instruction has the form "(A op' B) op (C op' D)". Try to factorize
+ // a common term.
+ if (Op0 && Op1 && LHSOpcode == RHSOpcode)
+ if (Value *V = tryFactorization(Builder, DL, I, LHSOpcode, A, B, C, D))
+ return V;
+
+ // The instruction has the form "(A op' B) op (C)". Try to factorize common
+ // term.
+ if (Op0)
+ if (Value *Ident = getIdentityValue(LHSOpcode, RHS))
+ if (Value *V = tryFactorization(Builder, DL, I, LHSOpcode, A, B, RHS,
+ Ident))
+ return V;
- // The instruction has the form "(B) op (C op' D)". Try to factorize common
- // term.
- if (Value *V = tryFactorization(Builder, DL, I, RHSOpcode, LHS,
- getIdentityValue(RHSOpcode, LHS), C, D))
- return V;
+ // The instruction has the form "(B) op (C op' D)". Try to factorize common
+ // term.
+ if (Op1)
+ if (Value *Ident = getIdentityValue(RHSOpcode, LHS))
+ if (Value *V = tryFactorization(Builder, DL, I, RHSOpcode, LHS, Ident,
+ C, D))
+ return V;
+ }
// Expansion.
if (Op0 && RightDistributesOverLeft(Op0->getOpcode(), TopLevelOpcode)) {
@@ -720,6 +724,21 @@ Value *InstCombiner::dyn_castNegVal(Value *V) const {
if (C->getType()->getElementType()->isIntegerTy())
return ConstantExpr::getNeg(C);
+ if (ConstantVector *CV = dyn_cast<ConstantVector>(V)) {
+ for (unsigned i = 0, e = CV->getNumOperands(); i != e; ++i) {
+ Constant *Elt = CV->getAggregateElement(i);
+ if (!Elt)
+ return nullptr;
+
+ if (isa<UndefValue>(Elt))
+ continue;
+
+ if (!isa<ConstantInt>(Elt))
+ return nullptr;
+ }
+ return ConstantExpr::getNeg(CV);
+ }
+
return nullptr;
}
@@ -820,8 +839,29 @@ Instruction *InstCombiner::FoldOpIntoSelect(Instruction &Op, SelectInst *SI) {
return SelectInst::Create(SI->getCondition(), NewTV, NewFV, "", nullptr, SI);
}
-Instruction *InstCombiner::FoldOpIntoPhi(Instruction &I) {
- PHINode *PN = cast<PHINode>(I.getOperand(0));
+static Value *foldOperationIntoPhiValue(BinaryOperator *I, Value *InV,
+ InstCombiner *IC) {
+ bool ConstIsRHS = isa<Constant>(I->getOperand(1));
+ Constant *C = cast<Constant>(I->getOperand(ConstIsRHS));
+
+ if (auto *InC = dyn_cast<Constant>(InV)) {
+ if (ConstIsRHS)
+ return ConstantExpr::get(I->getOpcode(), InC, C);
+ return ConstantExpr::get(I->getOpcode(), C, InC);
+ }
+
+ Value *Op0 = InV, *Op1 = C;
+ if (!ConstIsRHS)
+ std::swap(Op0, Op1);
+
+ Value *RI = IC->Builder->CreateBinOp(I->getOpcode(), Op0, Op1, "phitmp");
+ auto *FPInst = dyn_cast<Instruction>(RI);
+ if (FPInst && isa<FPMathOperator>(FPInst))
+ FPInst->copyFastMathFlags(I);
+ return RI;
+}
+
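As a (hypothetical) example of what this helper produces: for fadd fast (phi [1.0, %a], [%v, %b]), 2.0, the %a arm folds to the constant 3.0 through ConstantExpr::get, while the %b arm becomes %phitmp = fadd fast float %v, 2.0, with the fast-math flags carried over by copyFastMathFlags.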
+Instruction *InstCombiner::foldOpIntoPhi(Instruction &I, PHINode *PN) {
unsigned NumPHIValues = PN->getNumIncomingValues();
if (NumPHIValues == 0)
return nullptr;
@@ -902,7 +942,11 @@ Instruction *InstCombiner::FoldOpIntoPhi(Instruction &I) {
// Beware of ConstantExpr: it may eventually evaluate to getNullValue,
// even if currently isNullValue gives false.
Constant *InC = dyn_cast<Constant>(PN->getIncomingValue(i));
- if (InC && !isa<ConstantExpr>(InC))
+      // For vector constants, we cannot use isNullValue to fold into
+      // FalseVInPred versus TrueVInPred: a vector with a mix of zero and
+      // nonzero elements is not a null value, so we would incorrectly fold
+      // the whole vector to TrueVInPred.
+ if (InC && !isa<ConstantExpr>(InC) && isa<ConstantInt>(InC))
InV = InC->isNullValue() ? FalseVInPred : TrueVInPred;
else
InV = Builder->CreateSelect(PN->getIncomingValue(i),
@@ -923,15 +967,9 @@ Instruction *InstCombiner::FoldOpIntoPhi(Instruction &I) {
C, "phitmp");
NewPN->addIncoming(InV, PN->getIncomingBlock(i));
}
- } else if (I.getNumOperands() == 2) {
- Constant *C = cast<Constant>(I.getOperand(1));
+ } else if (auto *BO = dyn_cast<BinaryOperator>(&I)) {
for (unsigned i = 0; i != NumPHIValues; ++i) {
- Value *InV = nullptr;
- if (Constant *InC = dyn_cast<Constant>(PN->getIncomingValue(i)))
- InV = ConstantExpr::get(I.getOpcode(), InC, C);
- else
- InV = Builder->CreateBinOp(cast<BinaryOperator>(I).getOpcode(),
- PN->getIncomingValue(i), C, "phitmp");
+ Value *InV = foldOperationIntoPhiValue(BO, PN->getIncomingValue(i), this);
NewPN->addIncoming(InV, PN->getIncomingBlock(i));
}
} else {
@@ -957,14 +995,14 @@ Instruction *InstCombiner::FoldOpIntoPhi(Instruction &I) {
return replaceInstUsesWith(I, NewPN);
}
-Instruction *InstCombiner::foldOpWithConstantIntoOperand(Instruction &I) {
+Instruction *InstCombiner::foldOpWithConstantIntoOperand(BinaryOperator &I) {
assert(isa<Constant>(I.getOperand(1)) && "Unexpected operand type");
if (auto *Sel = dyn_cast<SelectInst>(I.getOperand(0))) {
if (Instruction *NewSel = FoldOpIntoSelect(I, Sel))
return NewSel;
- } else if (isa<PHINode>(I.getOperand(0))) {
- if (Instruction *NewPhi = FoldOpIntoPhi(I))
+ } else if (auto *PN = dyn_cast<PHINode>(I.getOperand(0))) {
+ if (Instruction *NewPhi = foldOpIntoPhi(I, PN))
return NewPhi;
}
return nullptr;
@@ -1315,22 +1353,19 @@ Value *InstCombiner::SimplifyVectorOp(BinaryOperator &Inst) {
assert(cast<VectorType>(LHS->getType())->getNumElements() == VWidth);
assert(cast<VectorType>(RHS->getType())->getNumElements() == VWidth);
- // If both arguments of binary operation are shuffles, which use the same
- // mask and shuffle within a single vector, it is worthwhile to move the
- // shuffle after binary operation:
+ // If both arguments of the binary operation are shuffles that use the same
+ // mask and shuffle within a single vector, move the shuffle after the binop:
// Op(shuffle(v1, m), shuffle(v2, m)) -> shuffle(Op(v1, v2), m)
- if (isa<ShuffleVectorInst>(LHS) && isa<ShuffleVectorInst>(RHS)) {
- ShuffleVectorInst *LShuf = cast<ShuffleVectorInst>(LHS);
- ShuffleVectorInst *RShuf = cast<ShuffleVectorInst>(RHS);
- if (isa<UndefValue>(LShuf->getOperand(1)) &&
- isa<UndefValue>(RShuf->getOperand(1)) &&
- LShuf->getOperand(0)->getType() == RShuf->getOperand(0)->getType() &&
- LShuf->getMask() == RShuf->getMask()) {
- Value *NewBO = CreateBinOpAsGiven(Inst, LShuf->getOperand(0),
- RShuf->getOperand(0), Builder);
- return Builder->CreateShuffleVector(NewBO,
- UndefValue::get(NewBO->getType()), LShuf->getMask());
- }
+ auto *LShuf = dyn_cast<ShuffleVectorInst>(LHS);
+ auto *RShuf = dyn_cast<ShuffleVectorInst>(RHS);
+ if (LShuf && RShuf && LShuf->getMask() == RShuf->getMask() &&
+ isa<UndefValue>(LShuf->getOperand(1)) &&
+ isa<UndefValue>(RShuf->getOperand(1)) &&
+ LShuf->getOperand(0)->getType() == RShuf->getOperand(0)->getType()) {
+ Value *NewBO = CreateBinOpAsGiven(Inst, LShuf->getOperand(0),
+ RShuf->getOperand(0), Builder);
+ return Builder->CreateShuffleVector(
+ NewBO, UndefValue::get(NewBO->getType()), LShuf->getMask());
}
// If one argument is a shuffle within one vector, the other is a constant,
@@ -1559,27 +1594,21 @@ Instruction *InstCombiner::visitGetElementPtrInst(GetElementPtrInst &GEP) {
// Replace: gep (gep %P, long B), long A, ...
// With: T = long A+B; gep %P, T, ...
//
- Value *Sum;
Value *SO1 = Src->getOperand(Src->getNumOperands()-1);
Value *GO1 = GEP.getOperand(1);
- if (SO1 == Constant::getNullValue(SO1->getType())) {
- Sum = GO1;
- } else if (GO1 == Constant::getNullValue(GO1->getType())) {
- Sum = SO1;
- } else {
- // If they aren't the same type, then the input hasn't been processed
- // by the loop above yet (which canonicalizes sequential index types to
- // intptr_t). Just avoid transforming this until the input has been
- // normalized.
- if (SO1->getType() != GO1->getType())
- return nullptr;
- // Only do the combine when GO1 and SO1 are both constants. Only in
- // this case, we are sure the cost after the merge is never more than
- // that before the merge.
- if (!isa<Constant>(GO1) || !isa<Constant>(SO1))
- return nullptr;
- Sum = Builder->CreateAdd(SO1, GO1, PtrOp->getName()+".sum");
- }
+
+ // If they aren't the same type, then the input hasn't been processed
+ // by the loop above yet (which canonicalizes sequential index types to
+ // intptr_t). Just avoid transforming this until the input has been
+ // normalized.
+ if (SO1->getType() != GO1->getType())
+ return nullptr;
+
+  Value *Sum = SimplifyAddInst(GO1, SO1, false, false, DL, &TLI, &DT, &AC);
+ // Only do the combine when we are sure the cost after the
+ // merge is never more than that before the merge.
+ if (Sum == nullptr)
+ return nullptr;
// Update the GEP in place if possible.
if (Src->getNumOperands() == 2) {
@@ -1654,14 +1683,14 @@ Instruction *InstCombiner::visitGetElementPtrInst(GetElementPtrInst &GEP) {
}
}
- // Handle gep(bitcast x) and gep(gep x, 0, 0, 0).
- Value *StrippedPtr = PtrOp->stripPointerCasts();
- PointerType *StrippedPtrTy = dyn_cast<PointerType>(StrippedPtr->getType());
-
// We do not handle pointer-vector geps here.
- if (!StrippedPtrTy)
+ if (GEP.getType()->isVectorTy())
return nullptr;
+ // Handle gep(bitcast x) and gep(gep x, 0, 0, 0).
+ Value *StrippedPtr = PtrOp->stripPointerCasts();
+ PointerType *StrippedPtrTy = cast<PointerType>(StrippedPtr->getType());
+
if (StrippedPtr != PtrOp) {
bool HasZeroPointerIndex = false;
if (ConstantInt *C = dyn_cast<ConstantInt>(GEP.getOperand(1)))
@@ -2239,11 +2268,11 @@ Instruction *InstCombiner::visitSwitchInst(SwitchInst &SI) {
ConstantInt *AddRHS;
if (match(Cond, m_Add(m_Value(Op0), m_ConstantInt(AddRHS)))) {
// Change 'switch (X+4) case 1:' into 'switch (X) case -3'.
- for (SwitchInst::CaseIt CaseIter : SI.cases()) {
- Constant *NewCase = ConstantExpr::getSub(CaseIter.getCaseValue(), AddRHS);
+ for (auto Case : SI.cases()) {
+ Constant *NewCase = ConstantExpr::getSub(Case.getCaseValue(), AddRHS);
assert(isa<ConstantInt>(NewCase) &&
"Result of expression should be constant");
- CaseIter.setValue(cast<ConstantInt>(NewCase));
+ Case.setValue(cast<ConstantInt>(NewCase));
}
SI.setCondition(Op0);
return &SI;
@@ -2275,9 +2304,9 @@ Instruction *InstCombiner::visitSwitchInst(SwitchInst &SI) {
Value *NewCond = Builder->CreateTrunc(Cond, Ty, "trunc");
SI.setCondition(NewCond);
- for (SwitchInst::CaseIt CaseIter : SI.cases()) {
- APInt TruncatedCase = CaseIter.getCaseValue()->getValue().trunc(NewWidth);
- CaseIter.setValue(ConstantInt::get(SI.getContext(), TruncatedCase));
+ for (auto Case : SI.cases()) {
+ APInt TruncatedCase = Case.getCaseValue()->getValue().trunc(NewWidth);
+ Case.setValue(ConstantInt::get(SI.getContext(), TruncatedCase));
}
return &SI;
}
@@ -2934,8 +2963,8 @@ bool InstCombiner::run() {
Result->takeName(I);
// Push the new instruction and any users onto the worklist.
- Worklist.Add(Result);
Worklist.AddUsersToWorkList(*Result);
+ Worklist.Add(Result);
// Insert the new instruction into the basic block...
BasicBlock *InstParent = I->getParent();
@@ -2958,8 +2987,8 @@ bool InstCombiner::run() {
if (isInstructionTriviallyDead(I, &TLI)) {
eraseInstFromFunction(*I);
} else {
- Worklist.Add(I);
Worklist.AddUsersToWorkList(*I);
+ Worklist.Add(I);
}
}
MadeIRChange = true;
@@ -3022,12 +3051,11 @@ static bool AddReachableCodeToWorklist(BasicBlock *BB, const DataLayout &DL,
}
// See if we can constant fold its operands.
- for (User::op_iterator i = Inst->op_begin(), e = Inst->op_end(); i != e;
- ++i) {
- if (!isa<ConstantVector>(i) && !isa<ConstantExpr>(i))
+ for (Use &U : Inst->operands()) {
+ if (!isa<ConstantVector>(U) && !isa<ConstantExpr>(U))
continue;
- auto *C = cast<Constant>(i);
+ auto *C = cast<Constant>(U);
Constant *&FoldRes = FoldedConstants[C];
if (!FoldRes)
FoldRes = ConstantFoldConstant(C, DL, TLI);
@@ -3035,7 +3063,10 @@ static bool AddReachableCodeToWorklist(BasicBlock *BB, const DataLayout &DL,
FoldRes = C;
if (FoldRes != C) {
- *i = FoldRes;
+ DEBUG(dbgs() << "IC: ConstFold operand of: " << *Inst
+ << "\n Old = " << *C
+ << "\n New = " << *FoldRes << '\n');
+ U = FoldRes;
MadeIRChange = true;
}
}
@@ -3055,17 +3086,7 @@ static bool AddReachableCodeToWorklist(BasicBlock *BB, const DataLayout &DL,
}
} else if (SwitchInst *SI = dyn_cast<SwitchInst>(TI)) {
if (ConstantInt *Cond = dyn_cast<ConstantInt>(SI->getCondition())) {
- // See if this is an explicit destination.
- for (SwitchInst::CaseIt i = SI->case_begin(), e = SI->case_end();
- i != e; ++i)
- if (i.getCaseValue() == Cond) {
- BasicBlock *ReachableBB = i.getCaseSuccessor();
- Worklist.push_back(ReachableBB);
- continue;
- }
-
- // Otherwise it is the default destination.
- Worklist.push_back(SI->getDefaultDest());
+ Worklist.push_back(SI->findCaseValue(Cond)->getCaseSuccessor());
continue;
}
}
@@ -3152,6 +3173,7 @@ combineInstructionsOverFunction(Function &F, InstCombineWorklist &Worklist,
InstCombiner IC(Worklist, &Builder, F.optForMinSize(), ExpensiveCombines,
AA, AC, TLI, DT, DL, LI);
+ IC.MaxArraySizeForCombine = MaxArraySize;
Changed |= IC.run();
if (!Changed)
@@ -3176,9 +3198,10 @@ PreservedAnalyses InstCombinePass::run(Function &F,
return PreservedAnalyses::all();
// Mark all the analyses that instcombine updates as preserved.
- // FIXME: This should also 'preserve the CFG'.
PreservedAnalyses PA;
- PA.preserve<DominatorTreeAnalysis>();
+ PA.preserveSet<CFGAnalyses>();
+ PA.preserve<AAManager>();
+ PA.preserve<GlobalsAA>();
return PA;
}
diff --git a/lib/Transforms/Instrumentation/AddressSanitizer.cpp b/lib/Transforms/Instrumentation/AddressSanitizer.cpp
index f5e9e7dd5a93..94cfc69ed555 100644
--- a/lib/Transforms/Instrumentation/AddressSanitizer.cpp
+++ b/lib/Transforms/Instrumentation/AddressSanitizer.cpp
@@ -80,6 +80,7 @@ static const uint64_t kMIPS64_ShadowOffset64 = 1ULL << 37;
static const uint64_t kAArch64_ShadowOffset64 = 1ULL << 36;
static const uint64_t kFreeBSD_ShadowOffset32 = 1ULL << 30;
static const uint64_t kFreeBSD_ShadowOffset64 = 1ULL << 46;
+static const uint64_t kPS4CPU_ShadowOffset64 = 1ULL << 40;
static const uint64_t kWindowsShadowOffset32 = 3ULL << 28;
// The shadow memory space is dynamically allocated.
static const uint64_t kWindowsShadowOffset64 = kDynamicShadowSentinel;
@@ -380,6 +381,7 @@ static ShadowMapping getShadowMapping(Triple &TargetTriple, int LongSize,
bool IsAndroid = TargetTriple.isAndroid();
bool IsIOS = TargetTriple.isiOS() || TargetTriple.isWatchOS();
bool IsFreeBSD = TargetTriple.isOSFreeBSD();
+ bool IsPS4CPU = TargetTriple.isPS4CPU();
bool IsLinux = TargetTriple.isOSLinux();
bool IsPPC64 = TargetTriple.getArch() == llvm::Triple::ppc64 ||
TargetTriple.getArch() == llvm::Triple::ppc64le;
@@ -392,6 +394,7 @@ static ShadowMapping getShadowMapping(Triple &TargetTriple, int LongSize,
TargetTriple.getArch() == llvm::Triple::mips64el;
bool IsAArch64 = TargetTriple.getArch() == llvm::Triple::aarch64;
bool IsWindows = TargetTriple.isOSWindows();
+ bool IsFuchsia = TargetTriple.isOSFuchsia();
ShadowMapping Mapping;
@@ -412,12 +415,18 @@ static ShadowMapping getShadowMapping(Triple &TargetTriple, int LongSize,
else
Mapping.Offset = kDefaultShadowOffset32;
} else { // LongSize == 64
- if (IsPPC64)
+ // Fuchsia is always PIE, which means that the beginning of the address
+ // space is always available.
+ if (IsFuchsia)
+ Mapping.Offset = 0;
+ else if (IsPPC64)
Mapping.Offset = kPPC64_ShadowOffset64;
else if (IsSystemZ)
Mapping.Offset = kSystemZ_ShadowOffset64;
else if (IsFreeBSD)
Mapping.Offset = kFreeBSD_ShadowOffset64;
+ else if (IsPS4CPU)
+ Mapping.Offset = kPS4CPU_ShadowOffset64;
else if (IsLinux && IsX86_64) {
if (IsKasan)
Mapping.Offset = kLinuxKasan_ShadowOffset64;
@@ -456,9 +465,9 @@ static ShadowMapping getShadowMapping(Triple &TargetTriple, int LongSize,
 // offset is not necessarily 1/8th of the address space. On SystemZ,
// we could OR the constant in a single instruction, but it's more
// efficient to load it once and use indexed addressing.
- Mapping.OrShadowOffset = !IsAArch64 && !IsPPC64 && !IsSystemZ
- && !(Mapping.Offset & (Mapping.Offset - 1))
- && Mapping.Offset != kDynamicShadowSentinel;
+ Mapping.OrShadowOffset = !IsAArch64 && !IsPPC64 && !IsSystemZ && !IsPS4CPU &&
+ !(Mapping.Offset & (Mapping.Offset - 1)) &&
+ Mapping.Offset != kDynamicShadowSentinel;
return Mapping;
}
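
All of the offsets above feed the usual ASan shadow computation; a simplified sketch (the default Scale is 3, and the constants here are illustrative, not from the patch):

#include <cstdint>

uint64_t shadowFor(uint64_t Addr, uint64_t Offset, bool OrShadowOffset) {
  const uint64_t kShadowScale = 3; // 8 bytes of app memory per shadow byte
  uint64_t Shifted = Addr >> kShadowScale;
  // When Offset is a lone high bit, OR equals ADD and can be cheaper to
  // materialize on some targets; see the OrShadowOffset logic above.
  return OrShadowOffset ? (Shifted | Offset) : (Shifted + Offset);
}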
@@ -567,8 +576,6 @@ struct AddressSanitizer : public FunctionPass {
Type *IntptrTy;
ShadowMapping Mapping;
DominatorTree *DT;
- Function *AsanCtorFunction = nullptr;
- Function *AsanInitFunction = nullptr;
Function *AsanHandleNoReturnFunc;
Function *AsanPtrCmpFunction, *AsanPtrSubFunction;
// This array is indexed by AccessIsWrite, Experiment and log2(AccessSize).
@@ -1561,31 +1568,31 @@ void AddressSanitizerModule::initializeCallbacks(Module &M) {
// Declare our poisoning and unpoisoning functions.
AsanPoisonGlobals = checkSanitizerInterfaceFunction(M.getOrInsertFunction(
- kAsanPoisonGlobalsName, IRB.getVoidTy(), IntptrTy, nullptr));
+ kAsanPoisonGlobalsName, IRB.getVoidTy(), IntptrTy));
AsanPoisonGlobals->setLinkage(Function::ExternalLinkage);
AsanUnpoisonGlobals = checkSanitizerInterfaceFunction(M.getOrInsertFunction(
- kAsanUnpoisonGlobalsName, IRB.getVoidTy(), nullptr));
+ kAsanUnpoisonGlobalsName, IRB.getVoidTy()));
AsanUnpoisonGlobals->setLinkage(Function::ExternalLinkage);
// Declare functions that register/unregister globals.
AsanRegisterGlobals = checkSanitizerInterfaceFunction(M.getOrInsertFunction(
- kAsanRegisterGlobalsName, IRB.getVoidTy(), IntptrTy, IntptrTy, nullptr));
+ kAsanRegisterGlobalsName, IRB.getVoidTy(), IntptrTy, IntptrTy));
AsanRegisterGlobals->setLinkage(Function::ExternalLinkage);
AsanUnregisterGlobals = checkSanitizerInterfaceFunction(
M.getOrInsertFunction(kAsanUnregisterGlobalsName, IRB.getVoidTy(),
- IntptrTy, IntptrTy, nullptr));
+ IntptrTy, IntptrTy));
AsanUnregisterGlobals->setLinkage(Function::ExternalLinkage);
// Declare the functions that find globals in a shared object and then invoke
// the (un)register function on them.
AsanRegisterImageGlobals =
checkSanitizerInterfaceFunction(M.getOrInsertFunction(
- kAsanRegisterImageGlobalsName, IRB.getVoidTy(), IntptrTy, nullptr));
+ kAsanRegisterImageGlobalsName, IRB.getVoidTy(), IntptrTy));
AsanRegisterImageGlobals->setLinkage(Function::ExternalLinkage);
AsanUnregisterImageGlobals =
checkSanitizerInterfaceFunction(M.getOrInsertFunction(
- kAsanUnregisterImageGlobalsName, IRB.getVoidTy(), IntptrTy, nullptr));
+ kAsanUnregisterImageGlobalsName, IRB.getVoidTy(), IntptrTy));
AsanUnregisterImageGlobals->setLinkage(Function::ExternalLinkage);
}
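
These hunks track the IR-level API change that dropped the nullptr end-of-arguments sentinel from Module::getOrInsertFunction, which now takes the parameter types directly. A minimal sketch of the new call shape (the hook name is made up):

#include "llvm/IR/Module.h"
using namespace llvm;

Constant *declareExampleHook(Module &M, Type *VoidTy, Type *IntptrTy) {
  // No trailing nullptr sentinel anymore.
  return M.getOrInsertFunction("__asan_example_hook", VoidTy, IntptrTy,
                               IntptrTy);
}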
@@ -1618,11 +1625,12 @@ void AddressSanitizerModule::SetComdatForGlobalMetadata(
GlobalVariable *
AddressSanitizerModule::CreateMetadataGlobal(Module &M, Constant *Initializer,
StringRef OriginalName) {
- GlobalVariable *Metadata =
- new GlobalVariable(M, Initializer->getType(), false,
- GlobalVariable::InternalLinkage, Initializer,
- Twine("__asan_global_") +
- GlobalValue::getRealLinkageName(OriginalName));
+ auto Linkage = TargetTriple.isOSBinFormatMachO()
+ ? GlobalVariable::InternalLinkage
+ : GlobalVariable::PrivateLinkage;
+ GlobalVariable *Metadata = new GlobalVariable(
+ M, Initializer->getType(), false, Linkage, Initializer,
+ Twine("__asan_global_") + GlobalValue::getRealLinkageName(OriginalName));
Metadata->setSection(getGlobalMetadataSection());
return Metadata;
}
@@ -1862,7 +1870,8 @@ bool AddressSanitizerModule::InstrumentGlobals(IRBuilder<> &IRB, Module &M) {
GlobalValue *InstrumentedGlobal = NewGlobal;
bool CanUsePrivateAliases =
- TargetTriple.isOSBinFormatELF() || TargetTriple.isOSBinFormatMachO();
+ TargetTriple.isOSBinFormatELF() || TargetTriple.isOSBinFormatMachO() ||
+ TargetTriple.isOSBinFormatWasm();
if (CanUsePrivateAliases && ClUsePrivateAliasForGlobals) {
// Create local alias for NewGlobal to avoid crash on ODR between
// instrumented and non-instrumented libraries.
@@ -1926,13 +1935,19 @@ bool AddressSanitizerModule::runOnModule(Module &M) {
Mapping = getShadowMapping(TargetTriple, LongSize, CompileKernel);
initializeCallbacks(M);
- bool Changed = false;
+ if (CompileKernel)
+ return false;
+
+ Function *AsanCtorFunction;
+ std::tie(AsanCtorFunction, std::ignore) = createSanitizerCtorAndInitFunctions(
+ M, kAsanModuleCtorName, kAsanInitName, /*InitArgTypes=*/{},
+ /*InitArgs=*/{}, kAsanVersionCheckName);
+ appendToGlobalCtors(M, AsanCtorFunction, kAsanCtorAndDtorPriority);
+ bool Changed = false;
// TODO(glider): temporarily disabled globals instrumentation for KASan.
- if (ClGlobals && !CompileKernel) {
- Function *CtorFunc = M.getFunction(kAsanModuleCtorName);
- assert(CtorFunc);
- IRBuilder<> IRB(CtorFunc->getEntryBlock().getTerminator());
+ if (ClGlobals) {
+ IRBuilder<> IRB(AsanCtorFunction->getEntryBlock().getTerminator());
Changed |= InstrumentGlobals(IRB, M);
}
@@ -1949,49 +1964,60 @@ void AddressSanitizer::initializeCallbacks(Module &M) {
const std::string ExpStr = Exp ? "exp_" : "";
const std::string SuffixStr = CompileKernel ? "N" : "_n";
const std::string EndingStr = Recover ? "_noabort" : "";
- Type *ExpType = Exp ? Type::getInt32Ty(*C) : nullptr;
- AsanErrorCallbackSized[AccessIsWrite][Exp] =
- checkSanitizerInterfaceFunction(M.getOrInsertFunction(
- kAsanReportErrorTemplate + ExpStr + TypeStr + SuffixStr + EndingStr,
- IRB.getVoidTy(), IntptrTy, IntptrTy, ExpType, nullptr));
- AsanMemoryAccessCallbackSized[AccessIsWrite][Exp] =
- checkSanitizerInterfaceFunction(M.getOrInsertFunction(
- ClMemoryAccessCallbackPrefix + ExpStr + TypeStr + "N" + EndingStr,
- IRB.getVoidTy(), IntptrTy, IntptrTy, ExpType, nullptr));
- for (size_t AccessSizeIndex = 0; AccessSizeIndex < kNumberOfAccessSizes;
- AccessSizeIndex++) {
- const std::string Suffix = TypeStr + itostr(1ULL << AccessSizeIndex);
- AsanErrorCallback[AccessIsWrite][Exp][AccessSizeIndex] =
- checkSanitizerInterfaceFunction(M.getOrInsertFunction(
- kAsanReportErrorTemplate + ExpStr + Suffix + EndingStr,
- IRB.getVoidTy(), IntptrTy, ExpType, nullptr));
- AsanMemoryAccessCallback[AccessIsWrite][Exp][AccessSizeIndex] =
- checkSanitizerInterfaceFunction(M.getOrInsertFunction(
- ClMemoryAccessCallbackPrefix + ExpStr + Suffix + EndingStr,
- IRB.getVoidTy(), IntptrTy, ExpType, nullptr));
+
+ SmallVector<Type *, 3> Args2 = {IntptrTy, IntptrTy};
+ SmallVector<Type *, 2> Args1{1, IntptrTy};
+ if (Exp) {
+ Type *ExpType = Type::getInt32Ty(*C);
+ Args2.push_back(ExpType);
+ Args1.push_back(ExpType);
}
- }
+ AsanErrorCallbackSized[AccessIsWrite][Exp] =
+ checkSanitizerInterfaceFunction(M.getOrInsertFunction(
+ kAsanReportErrorTemplate + ExpStr + TypeStr + SuffixStr +
+ EndingStr,
+ FunctionType::get(IRB.getVoidTy(), Args2, false)));
+
+ AsanMemoryAccessCallbackSized[AccessIsWrite][Exp] =
+ checkSanitizerInterfaceFunction(M.getOrInsertFunction(
+ ClMemoryAccessCallbackPrefix + ExpStr + TypeStr + "N" + EndingStr,
+ FunctionType::get(IRB.getVoidTy(), Args2, false)));
+
+ for (size_t AccessSizeIndex = 0; AccessSizeIndex < kNumberOfAccessSizes;
+ AccessSizeIndex++) {
+ const std::string Suffix = TypeStr + itostr(1ULL << AccessSizeIndex);
+ AsanErrorCallback[AccessIsWrite][Exp][AccessSizeIndex] =
+ checkSanitizerInterfaceFunction(M.getOrInsertFunction(
+ kAsanReportErrorTemplate + ExpStr + Suffix + EndingStr,
+ FunctionType::get(IRB.getVoidTy(), Args1, false)));
+
+ AsanMemoryAccessCallback[AccessIsWrite][Exp][AccessSizeIndex] =
+ checkSanitizerInterfaceFunction(M.getOrInsertFunction(
+ ClMemoryAccessCallbackPrefix + ExpStr + Suffix + EndingStr,
+ FunctionType::get(IRB.getVoidTy(), Args1, false)));
+ }
+ }
}
const std::string MemIntrinCallbackPrefix =
CompileKernel ? std::string("") : ClMemoryAccessCallbackPrefix;
AsanMemmove = checkSanitizerInterfaceFunction(M.getOrInsertFunction(
MemIntrinCallbackPrefix + "memmove", IRB.getInt8PtrTy(),
- IRB.getInt8PtrTy(), IRB.getInt8PtrTy(), IntptrTy, nullptr));
+ IRB.getInt8PtrTy(), IRB.getInt8PtrTy(), IntptrTy));
AsanMemcpy = checkSanitizerInterfaceFunction(M.getOrInsertFunction(
MemIntrinCallbackPrefix + "memcpy", IRB.getInt8PtrTy(),
- IRB.getInt8PtrTy(), IRB.getInt8PtrTy(), IntptrTy, nullptr));
+ IRB.getInt8PtrTy(), IRB.getInt8PtrTy(), IntptrTy));
AsanMemset = checkSanitizerInterfaceFunction(M.getOrInsertFunction(
MemIntrinCallbackPrefix + "memset", IRB.getInt8PtrTy(),
- IRB.getInt8PtrTy(), IRB.getInt32Ty(), IntptrTy, nullptr));
+ IRB.getInt8PtrTy(), IRB.getInt32Ty(), IntptrTy));
AsanHandleNoReturnFunc = checkSanitizerInterfaceFunction(
- M.getOrInsertFunction(kAsanHandleNoReturnName, IRB.getVoidTy(), nullptr));
+ M.getOrInsertFunction(kAsanHandleNoReturnName, IRB.getVoidTy()));
AsanPtrCmpFunction = checkSanitizerInterfaceFunction(M.getOrInsertFunction(
- kAsanPtrCmp, IRB.getVoidTy(), IntptrTy, IntptrTy, nullptr));
+ kAsanPtrCmp, IRB.getVoidTy(), IntptrTy, IntptrTy));
AsanPtrSubFunction = checkSanitizerInterfaceFunction(M.getOrInsertFunction(
- kAsanPtrSub, IRB.getVoidTy(), IntptrTy, IntptrTy, nullptr));
+ kAsanPtrSub, IRB.getVoidTy(), IntptrTy, IntptrTy));
// We insert an empty inline asm after __asan_report* to avoid callback merge.
EmptyAsm = InlineAsm::get(FunctionType::get(IRB.getVoidTy(), false),
StringRef(""), StringRef(""),
@@ -2001,7 +2027,6 @@ void AddressSanitizer::initializeCallbacks(Module &M) {
// virtual
bool AddressSanitizer::doInitialization(Module &M) {
// Initialize the private fields. No one has accessed them before.
-
GlobalsMD.init(M);
C = &(M.getContext());
@@ -2009,13 +2034,6 @@ bool AddressSanitizer::doInitialization(Module &M) {
IntptrTy = Type::getIntNTy(*C, LongSize);
TargetTriple = Triple(M.getTargetTriple());
- if (!CompileKernel) {
- std::tie(AsanCtorFunction, AsanInitFunction) =
- createSanitizerCtorAndInitFunctions(
- M, kAsanModuleCtorName, kAsanInitName,
- /*InitArgTypes=*/{}, /*InitArgs=*/{}, kAsanVersionCheckName);
- appendToGlobalCtors(M, AsanCtorFunction, kAsanCtorAndDtorPriority);
- }
Mapping = getShadowMapping(TargetTriple, LongSize, CompileKernel);
return true;
}
@@ -2034,6 +2052,8 @@ bool AddressSanitizer::maybeInsertAsanInitAtFunctionEntry(Function &F) {
// We cannot just ignore these methods, because they may call other
// instrumented functions.
if (F.getName().find(" load]") != std::string::npos) {
+ Function *AsanInitFunction =
+ declareSanitizerInitFunction(*F.getParent(), kAsanInitName, {});
IRBuilder<> IRB(&F.front(), F.front().begin());
IRB.CreateCall(AsanInitFunction, {});
return true;
@@ -2081,7 +2101,6 @@ void AddressSanitizer::markEscapedLocalAllocas(Function &F) {
}
bool AddressSanitizer::runOnFunction(Function &F) {
- if (&F == AsanCtorFunction) return false;
if (F.getLinkage() == GlobalValue::AvailableExternallyLinkage) return false;
if (!ClDebugFunc.empty() && ClDebugFunc == F.getName()) return false;
if (F.getName().startswith("__asan_")) return false;
@@ -2175,8 +2194,9 @@ bool AddressSanitizer::runOnFunction(Function &F) {
(ClInstrumentationWithCallsThreshold >= 0 &&
ToInstrument.size() > (unsigned)ClInstrumentationWithCallsThreshold);
const DataLayout &DL = F.getParent()->getDataLayout();
- ObjectSizeOffsetVisitor ObjSizeVis(DL, TLI, F.getContext(),
- /*RoundToAlign=*/true);
+ ObjectSizeOpts ObjSizeOpts;
+ ObjSizeOpts.RoundToAlign = true;
+ ObjectSizeOffsetVisitor ObjSizeVis(DL, TLI, F.getContext(), ObjSizeOpts);
// Instrument.
int NumInstrumented = 0;
@@ -2234,18 +2254,18 @@ void FunctionStackPoisoner::initializeCallbacks(Module &M) {
std::string Suffix = itostr(i);
AsanStackMallocFunc[i] = checkSanitizerInterfaceFunction(
M.getOrInsertFunction(kAsanStackMallocNameTemplate + Suffix, IntptrTy,
- IntptrTy, nullptr));
+ IntptrTy));
AsanStackFreeFunc[i] = checkSanitizerInterfaceFunction(
M.getOrInsertFunction(kAsanStackFreeNameTemplate + Suffix,
- IRB.getVoidTy(), IntptrTy, IntptrTy, nullptr));
+ IRB.getVoidTy(), IntptrTy, IntptrTy));
}
if (ASan.UseAfterScope) {
AsanPoisonStackMemoryFunc = checkSanitizerInterfaceFunction(
M.getOrInsertFunction(kAsanPoisonStackMemoryName, IRB.getVoidTy(),
- IntptrTy, IntptrTy, nullptr));
+ IntptrTy, IntptrTy));
AsanUnpoisonStackMemoryFunc = checkSanitizerInterfaceFunction(
M.getOrInsertFunction(kAsanUnpoisonStackMemoryName, IRB.getVoidTy(),
- IntptrTy, IntptrTy, nullptr));
+ IntptrTy, IntptrTy));
}
for (size_t Val : {0x00, 0xf1, 0xf2, 0xf3, 0xf5, 0xf8}) {
@@ -2254,14 +2274,14 @@ void FunctionStackPoisoner::initializeCallbacks(Module &M) {
Name << std::setw(2) << std::setfill('0') << std::hex << Val;
AsanSetShadowFunc[Val] =
checkSanitizerInterfaceFunction(M.getOrInsertFunction(
- Name.str(), IRB.getVoidTy(), IntptrTy, IntptrTy, nullptr));
+ Name.str(), IRB.getVoidTy(), IntptrTy, IntptrTy));
}
AsanAllocaPoisonFunc = checkSanitizerInterfaceFunction(M.getOrInsertFunction(
- kAsanAllocaPoison, IRB.getVoidTy(), IntptrTy, IntptrTy, nullptr));
+ kAsanAllocaPoison, IRB.getVoidTy(), IntptrTy, IntptrTy));
AsanAllocasUnpoisonFunc =
checkSanitizerInterfaceFunction(M.getOrInsertFunction(
- kAsanAllocasUnpoison, IRB.getVoidTy(), IntptrTy, IntptrTy, nullptr));
+ kAsanAllocasUnpoison, IRB.getVoidTy(), IntptrTy, IntptrTy));
}
void FunctionStackPoisoner::copyToShadowInline(ArrayRef<uint8_t> ShadowMask,
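The recurring mechanical change in this file is dropping the trailing null sentinel from Module::getOrInsertFunction; the parameter types are now passed as a plain variadic list. A minimal sketch of the new declaration style (the callback name and helper function are illustrative, not part of this patch):

    // Declares void __tool_callback(i8*, intptr) under the sentinel-free API.
    static Constant *declareTwoArgCallback(Module &M, Type *IntptrTy) {
      IRBuilder<> IRB(M.getContext());
      // The type list simply ends at the last parameter; no trailing nullptr.
      return M.getOrInsertFunction("__tool_callback", IRB.getVoidTy(),
                                   IRB.getInt8PtrTy(), IntptrTy);
    }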
diff --git a/lib/Transforms/Instrumentation/DataFlowSanitizer.cpp b/lib/Transforms/Instrumentation/DataFlowSanitizer.cpp
index b34d5b8c45a7..4e454f0c95b6 100644
--- a/lib/Transforms/Instrumentation/DataFlowSanitizer.cpp
+++ b/lib/Transforms/Instrumentation/DataFlowSanitizer.cpp
@@ -254,7 +254,7 @@ class DataFlowSanitizer : public ModulePass {
MDNode *ColdCallWeights;
DFSanABIList ABIList;
DenseMap<Value *, Function *> UnwrappedFnMap;
- AttributeSet ReadOnlyNoneAttrs;
+ AttributeList ReadOnlyNoneAttrs;
bool DFSanRuntimeShadowMask;
Value *getShadowAddress(Value *Addr, Instruction *Pos);
@@ -331,6 +331,10 @@ class DFSanVisitor : public InstVisitor<DFSanVisitor> {
DFSanFunction &DFSF;
DFSanVisitor(DFSanFunction &DFSF) : DFSF(DFSF) {}
+ const DataLayout &getDataLayout() const {
+ return DFSF.F->getParent()->getDataLayout();
+ }
+
void visitOperandShadowInst(Instruction &I);
void visitBinaryOperator(BinaryOperator &BO);
@@ -539,16 +543,17 @@ DataFlowSanitizer::buildWrapperFunction(Function *F, StringRef NewFName,
F->getParent());
NewF->copyAttributesFrom(F);
NewF->removeAttributes(
- AttributeSet::ReturnIndex,
- AttributeSet::get(F->getContext(), AttributeSet::ReturnIndex,
- AttributeFuncs::typeIncompatible(NewFT->getReturnType())));
+ AttributeList::ReturnIndex,
+ AttributeList::get(
+ F->getContext(), AttributeList::ReturnIndex,
+ AttributeFuncs::typeIncompatible(NewFT->getReturnType())));
BasicBlock *BB = BasicBlock::Create(*Ctx, "entry", NewF);
if (F->isVarArg()) {
NewF->removeAttributes(
- AttributeSet::FunctionIndex,
- AttributeSet().addAttribute(*Ctx, AttributeSet::FunctionIndex,
- "split-stack"));
+ AttributeList::FunctionIndex,
+ AttributeList().addAttribute(*Ctx, AttributeList::FunctionIndex,
+ "split-stack"));
CallInst::Create(DFSanVarargWrapperFn,
IRBuilder<>(BB).CreateGlobalStringPtr(F->getName()), "",
BB);
@@ -580,8 +585,7 @@ Constant *DataFlowSanitizer::getOrBuildTrampolineFunction(FunctionType *FT,
Function::arg_iterator AI = F->arg_begin(); ++AI;
for (unsigned N = FT->getNumParams(); N != 0; ++AI, --N)
Args.push_back(&*AI);
- CallInst *CI =
- CallInst::Create(&F->getArgumentList().front(), Args, "", BB);
+ CallInst *CI = CallInst::Create(&*F->arg_begin(), Args, "", BB);
ReturnInst *RI;
if (FT->getReturnType()->isVoidTy())
RI = ReturnInst::Create(*Ctx, BB);
@@ -595,7 +599,7 @@ Constant *DataFlowSanitizer::getOrBuildTrampolineFunction(FunctionType *FT,
DFSanVisitor(DFSF).visitCallInst(*CI);
if (!FT->getReturnType()->isVoidTy())
new StoreInst(DFSF.getShadow(RI->getReturnValue()),
- &F->getArgumentList().back(), RI);
+ &*std::prev(F->arg_end()), RI);
}
return C;
@@ -622,26 +626,26 @@ bool DataFlowSanitizer::runOnModule(Module &M) {
DFSanUnionFn = Mod->getOrInsertFunction("__dfsan_union", DFSanUnionFnTy);
if (Function *F = dyn_cast<Function>(DFSanUnionFn)) {
- F->addAttribute(AttributeSet::FunctionIndex, Attribute::NoUnwind);
- F->addAttribute(AttributeSet::FunctionIndex, Attribute::ReadNone);
- F->addAttribute(AttributeSet::ReturnIndex, Attribute::ZExt);
+ F->addAttribute(AttributeList::FunctionIndex, Attribute::NoUnwind);
+ F->addAttribute(AttributeList::FunctionIndex, Attribute::ReadNone);
+ F->addAttribute(AttributeList::ReturnIndex, Attribute::ZExt);
F->addAttribute(1, Attribute::ZExt);
F->addAttribute(2, Attribute::ZExt);
}
DFSanCheckedUnionFn = Mod->getOrInsertFunction("dfsan_union", DFSanUnionFnTy);
if (Function *F = dyn_cast<Function>(DFSanCheckedUnionFn)) {
- F->addAttribute(AttributeSet::FunctionIndex, Attribute::NoUnwind);
- F->addAttribute(AttributeSet::FunctionIndex, Attribute::ReadNone);
- F->addAttribute(AttributeSet::ReturnIndex, Attribute::ZExt);
+ F->addAttribute(AttributeList::FunctionIndex, Attribute::NoUnwind);
+ F->addAttribute(AttributeList::FunctionIndex, Attribute::ReadNone);
+ F->addAttribute(AttributeList::ReturnIndex, Attribute::ZExt);
F->addAttribute(1, Attribute::ZExt);
F->addAttribute(2, Attribute::ZExt);
}
DFSanUnionLoadFn =
Mod->getOrInsertFunction("__dfsan_union_load", DFSanUnionLoadFnTy);
if (Function *F = dyn_cast<Function>(DFSanUnionLoadFn)) {
- F->addAttribute(AttributeSet::FunctionIndex, Attribute::NoUnwind);
- F->addAttribute(AttributeSet::FunctionIndex, Attribute::ReadOnly);
- F->addAttribute(AttributeSet::ReturnIndex, Attribute::ZExt);
+ F->addAttribute(AttributeList::FunctionIndex, Attribute::NoUnwind);
+ F->addAttribute(AttributeList::FunctionIndex, Attribute::ReadOnly);
+ F->addAttribute(AttributeList::ReturnIndex, Attribute::ZExt);
}
DFSanUnimplementedFn =
Mod->getOrInsertFunction("__dfsan_unimplemented", DFSanUnimplementedFnTy);
@@ -696,7 +700,7 @@ bool DataFlowSanitizer::runOnModule(Module &M) {
AttrBuilder B;
B.addAttribute(Attribute::ReadOnly).addAttribute(Attribute::ReadNone);
- ReadOnlyNoneAttrs = AttributeSet::get(*Ctx, AttributeSet::FunctionIndex, B);
+ ReadOnlyNoneAttrs = AttributeList::get(*Ctx, AttributeList::FunctionIndex, B);
// First, change the ABI of every function in the module. ABI-listed
// functions keep their original ABI and get a wrapper function.
@@ -717,9 +721,10 @@ bool DataFlowSanitizer::runOnModule(Module &M) {
Function *NewF = Function::Create(NewFT, F.getLinkage(), "", &M);
NewF->copyAttributesFrom(&F);
NewF->removeAttributes(
- AttributeSet::ReturnIndex,
- AttributeSet::get(NewF->getContext(), AttributeSet::ReturnIndex,
- AttributeFuncs::typeIncompatible(NewFT->getReturnType())));
+ AttributeList::ReturnIndex,
+ AttributeList::get(
+ NewF->getContext(), AttributeList::ReturnIndex,
+ AttributeFuncs::typeIncompatible(NewFT->getReturnType())));
for (Function::arg_iterator FArg = F.arg_begin(),
NewFArg = NewF->arg_begin(),
FArgEnd = F.arg_end();
@@ -758,7 +763,7 @@ bool DataFlowSanitizer::runOnModule(Module &M) {
&F, std::string("dfsw$") + std::string(F.getName()),
GlobalValue::LinkOnceODRLinkage, NewFT);
if (getInstrumentedABI() == IA_TLS)
- NewF->removeAttributes(AttributeSet::FunctionIndex, ReadOnlyNoneAttrs);
+ NewF->removeAttributes(AttributeList::FunctionIndex, ReadOnlyNoneAttrs);
Value *WrappedFnCst =
ConstantExpr::getBitCast(NewF, PointerType::getUnqual(FT));
@@ -906,7 +911,7 @@ Value *DFSanFunction::getShadow(Value *V) {
break;
}
case DataFlowSanitizer::IA_Args: {
- unsigned ArgIdx = A->getArgNo() + F->getArgumentList().size() / 2;
+ unsigned ArgIdx = A->getArgNo() + F->arg_size() / 2;
Function::arg_iterator i = F->arg_begin();
while (ArgIdx--)
++i;
@@ -983,7 +988,7 @@ Value *DFSanFunction::combineShadows(Value *V1, Value *V2, Instruction *Pos) {
IRBuilder<> IRB(Pos);
if (AvoidNewBlocks) {
CallInst *Call = IRB.CreateCall(DFS.DFSanCheckedUnionFn, {V1, V2});
- Call->addAttribute(AttributeSet::ReturnIndex, Attribute::ZExt);
+ Call->addAttribute(AttributeList::ReturnIndex, Attribute::ZExt);
Call->addAttribute(1, Attribute::ZExt);
Call->addAttribute(2, Attribute::ZExt);
@@ -996,7 +1001,7 @@ Value *DFSanFunction::combineShadows(Value *V1, Value *V2, Instruction *Pos) {
Ne, Pos, /*Unreachable=*/false, DFS.ColdCallWeights, &DT));
IRBuilder<> ThenIRB(BI);
CallInst *Call = ThenIRB.CreateCall(DFS.DFSanUnionFn, {V1, V2});
- Call->addAttribute(AttributeSet::ReturnIndex, Attribute::ZExt);
+ Call->addAttribute(AttributeList::ReturnIndex, Attribute::ZExt);
Call->addAttribute(1, Attribute::ZExt);
Call->addAttribute(2, Attribute::ZExt);
@@ -1099,7 +1104,7 @@ Value *DFSanFunction::loadShadow(Value *Addr, uint64_t Size, uint64_t Align,
CallInst *FallbackCall = FallbackIRB.CreateCall(
DFS.DFSanUnionLoadFn,
{ShadowAddr, ConstantInt::get(DFS.IntptrTy, Size)});
- FallbackCall->addAttribute(AttributeSet::ReturnIndex, Attribute::ZExt);
+ FallbackCall->addAttribute(AttributeList::ReturnIndex, Attribute::ZExt);
// Compare each of the shadows stored in the loaded 64 bits to each other,
// by computing (WideShadow rotl ShadowWidth) == WideShadow.
@@ -1156,7 +1161,7 @@ Value *DFSanFunction::loadShadow(Value *Addr, uint64_t Size, uint64_t Align,
IRBuilder<> IRB(Pos);
CallInst *FallbackCall = IRB.CreateCall(
DFS.DFSanUnionLoadFn, {ShadowAddr, ConstantInt::get(DFS.IntptrTy, Size)});
- FallbackCall->addAttribute(AttributeSet::ReturnIndex, Attribute::ZExt);
+ FallbackCall->addAttribute(AttributeList::ReturnIndex, Attribute::ZExt);
return FallbackCall;
}
@@ -1446,7 +1451,7 @@ void DFSanVisitor::visitCallSite(CallSite CS) {
// Custom functions returning non-void will write to the return label.
if (!FT->getReturnType()->isVoidTy()) {
- CustomFn->removeAttributes(AttributeSet::FunctionIndex,
+ CustomFn->removeAttributes(AttributeList::FunctionIndex,
DFSF.DFS.ReadOnlyNoneAttrs);
}
}
@@ -1481,7 +1486,8 @@ void DFSanVisitor::visitCallSite(CallSite CS) {
auto *LabelVATy = ArrayType::get(DFSF.DFS.ShadowTy,
CS.arg_size() - FT->getNumParams());
auto *LabelVAAlloca = new AllocaInst(
- LabelVATy, "labelva", &DFSF.F->getEntryBlock().front());
+ LabelVATy, getDataLayout().getAllocaAddrSpace(),
+ "labelva", &DFSF.F->getEntryBlock().front());
for (unsigned n = 0; i != CS.arg_end(); ++i, ++n) {
auto LabelVAPtr = IRB.CreateStructGEP(LabelVATy, LabelVAAlloca, n);
@@ -1494,8 +1500,9 @@ void DFSanVisitor::visitCallSite(CallSite CS) {
if (!FT->getReturnType()->isVoidTy()) {
if (!DFSF.LabelReturnAlloca) {
DFSF.LabelReturnAlloca =
- new AllocaInst(DFSF.DFS.ShadowTy, "labelreturn",
- &DFSF.F->getEntryBlock().front());
+ new AllocaInst(DFSF.DFS.ShadowTy,
+ getDataLayout().getAllocaAddrSpace(),
+ "labelreturn", &DFSF.F->getEntryBlock().front());
}
Args.push_back(DFSF.LabelReturnAlloca);
}
@@ -1574,7 +1581,8 @@ void DFSanVisitor::visitCallSite(CallSite CS) {
unsigned VarArgSize = CS.arg_size() - FT->getNumParams();
ArrayType *VarArgArrayTy = ArrayType::get(DFSF.DFS.ShadowTy, VarArgSize);
AllocaInst *VarArgShadow =
- new AllocaInst(VarArgArrayTy, "", &DFSF.F->getEntryBlock().front());
+ new AllocaInst(VarArgArrayTy, getDataLayout().getAllocaAddrSpace(),
+ "", &DFSF.F->getEntryBlock().front());
Args.push_back(IRB.CreateConstGEP2_32(VarArgArrayTy, VarArgShadow, 0, 0));
for (unsigned n = 0; i != e; ++i, ++n) {
IRB.CreateStore(
@@ -1593,7 +1601,7 @@ void DFSanVisitor::visitCallSite(CallSite CS) {
}
NewCS.setCallingConv(CS.getCallingConv());
NewCS.setAttributes(CS.getAttributes().removeAttributes(
- *DFSF.DFS.Ctx, AttributeSet::ReturnIndex,
+ *DFSF.DFS.Ctx, AttributeList::ReturnIndex,
AttributeFuncs::typeIncompatible(NewCS.getInstruction()->getType())));
if (Next) {
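Most of the DataFlowSanitizer churn in this file is the AttributeSet-to-AttributeList rename; the index enumerators and call shapes carry over unchanged. A hedged sketch of the renamed API (the wrapper function is illustrative):

    // Strip return attributes that are invalid for the new return type.
    static void dropIncompatibleReturnAttrs(Function *F, FunctionType *NewFT) {
      F->removeAttributes(
          AttributeList::ReturnIndex,
          AttributeList::get(F->getContext(), AttributeList::ReturnIndex,
                             AttributeFuncs::typeIncompatible(NewFT->getReturnType())));
    }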
diff --git a/lib/Transforms/Instrumentation/EfficiencySanitizer.cpp b/lib/Transforms/Instrumentation/EfficiencySanitizer.cpp
index 05eba6c4dc69..7dea1dee756a 100644
--- a/lib/Transforms/Instrumentation/EfficiencySanitizer.cpp
+++ b/lib/Transforms/Instrumentation/EfficiencySanitizer.cpp
@@ -267,35 +267,35 @@ void EfficiencySanitizer::initializeCallbacks(Module &M) {
SmallString<32> AlignedLoadName("__esan_aligned_load" + ByteSizeStr);
EsanAlignedLoad[Idx] =
checkSanitizerInterfaceFunction(M.getOrInsertFunction(
- AlignedLoadName, IRB.getVoidTy(), IRB.getInt8PtrTy(), nullptr));
+ AlignedLoadName, IRB.getVoidTy(), IRB.getInt8PtrTy()));
SmallString<32> AlignedStoreName("__esan_aligned_store" + ByteSizeStr);
EsanAlignedStore[Idx] =
checkSanitizerInterfaceFunction(M.getOrInsertFunction(
- AlignedStoreName, IRB.getVoidTy(), IRB.getInt8PtrTy(), nullptr));
+ AlignedStoreName, IRB.getVoidTy(), IRB.getInt8PtrTy()));
SmallString<32> UnalignedLoadName("__esan_unaligned_load" + ByteSizeStr);
EsanUnalignedLoad[Idx] =
checkSanitizerInterfaceFunction(M.getOrInsertFunction(
- UnalignedLoadName, IRB.getVoidTy(), IRB.getInt8PtrTy(), nullptr));
+ UnalignedLoadName, IRB.getVoidTy(), IRB.getInt8PtrTy()));
SmallString<32> UnalignedStoreName("__esan_unaligned_store" + ByteSizeStr);
EsanUnalignedStore[Idx] =
checkSanitizerInterfaceFunction(M.getOrInsertFunction(
- UnalignedStoreName, IRB.getVoidTy(), IRB.getInt8PtrTy(), nullptr));
+ UnalignedStoreName, IRB.getVoidTy(), IRB.getInt8PtrTy()));
}
EsanUnalignedLoadN = checkSanitizerInterfaceFunction(
M.getOrInsertFunction("__esan_unaligned_loadN", IRB.getVoidTy(),
- IRB.getInt8PtrTy(), IntptrTy, nullptr));
+ IRB.getInt8PtrTy(), IntptrTy));
EsanUnalignedStoreN = checkSanitizerInterfaceFunction(
M.getOrInsertFunction("__esan_unaligned_storeN", IRB.getVoidTy(),
- IRB.getInt8PtrTy(), IntptrTy, nullptr));
+ IRB.getInt8PtrTy(), IntptrTy));
MemmoveFn = checkSanitizerInterfaceFunction(
M.getOrInsertFunction("memmove", IRB.getInt8PtrTy(), IRB.getInt8PtrTy(),
- IRB.getInt8PtrTy(), IntptrTy, nullptr));
+ IRB.getInt8PtrTy(), IntptrTy));
MemcpyFn = checkSanitizerInterfaceFunction(
M.getOrInsertFunction("memcpy", IRB.getInt8PtrTy(), IRB.getInt8PtrTy(),
- IRB.getInt8PtrTy(), IntptrTy, nullptr));
+ IRB.getInt8PtrTy(), IntptrTy));
MemsetFn = checkSanitizerInterfaceFunction(
M.getOrInsertFunction("memset", IRB.getInt8PtrTy(), IRB.getInt8PtrTy(),
- IRB.getInt32Ty(), IntptrTy, nullptr));
+ IRB.getInt32Ty(), IntptrTy));
}
bool EfficiencySanitizer::shouldIgnoreStructType(StructType *StructTy) {
@@ -533,7 +533,7 @@ void EfficiencySanitizer::createDestructor(Module &M, Constant *ToolInfoArg) {
IRBuilder<> IRB_Dtor(EsanDtorFunction->getEntryBlock().getTerminator());
Function *EsanExit = checkSanitizerInterfaceFunction(
M.getOrInsertFunction(EsanExitName, IRB_Dtor.getVoidTy(),
- Int8PtrTy, nullptr));
+ Int8PtrTy));
EsanExit->setLinkage(Function::ExternalLinkage);
IRB_Dtor.CreateCall(EsanExit, {ToolInfoArg});
appendToGlobalDtors(M, EsanDtorFunction, EsanCtorAndDtorPriority);
@@ -757,7 +757,7 @@ bool EfficiencySanitizer::instrumentGetElementPtr(Instruction *I, Module &M) {
return false;
}
Type *SourceTy = GepInst->getSourceElementType();
- StructType *StructTy;
+ StructType *StructTy = nullptr;
ConstantInt *Idx;
// Check if GEP calculates address from a struct array.
if (isa<StructType>(SourceTy)) {
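The one behavioral fix in EfficiencySanitizer is initializing StructTy to nullptr so no path can read an indeterminate pointer when the GEP source is neither a struct nor a struct array. A sketch of the pattern, with the surrounding control flow paraphrased rather than quoted:

    static bool hasStructSource(Type *SourceTy) {
      StructType *StructTy = nullptr; // now defined on every path
      if (isa<StructType>(SourceTy))
        StructTy = cast<StructType>(SourceTy);
      // ... the real pass also derives StructTy from arrays of structs ...
      return StructTy != nullptr; // bail out cleanly instead of reading garbage
    }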
diff --git a/lib/Transforms/Instrumentation/IndirectCallPromotion.cpp b/lib/Transforms/Instrumentation/IndirectCallPromotion.cpp
index 1ba13bdfe05a..61d627673c90 100644
--- a/lib/Transforms/Instrumentation/IndirectCallPromotion.cpp
+++ b/lib/Transforms/Instrumentation/IndirectCallPromotion.cpp
@@ -1,4 +1,4 @@
-//===-- IndirectCallPromotion.cpp - Promote indirect calls to direct calls ===//
+//===-- IndirectCallPromotion.cpp - Optimizations based on value profiling ===//
//
// The LLVM Compiler Infrastructure
//
@@ -17,6 +17,8 @@
#include "llvm/ADT/Statistic.h"
#include "llvm/ADT/StringRef.h"
#include "llvm/ADT/Twine.h"
+#include "llvm/Analysis/BlockFrequencyInfo.h"
+#include "llvm/Analysis/GlobalsModRef.h"
#include "llvm/Analysis/IndirectCallPromotionAnalysis.h"
#include "llvm/Analysis/IndirectCallSiteVisitor.h"
#include "llvm/IR/BasicBlock.h"
@@ -40,6 +42,7 @@
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/MathExtras.h"
#include "llvm/Transforms/Instrumentation.h"
#include "llvm/Transforms/PGOInstrumentation.h"
#include "llvm/Transforms/Utils/BasicBlockUtils.h"
@@ -53,6 +56,8 @@ using namespace llvm;
STATISTIC(NumOfPGOICallPromotion, "Number of indirect call promotions.");
STATISTIC(NumOfPGOICallsites, "Number of indirect call candidate sites.");
+STATISTIC(NumOfPGOMemOPOpt, "Number of memop intrinsics optimized.");
+STATISTIC(NumOfPGOMemOPAnnotate, "Number of memop intrinsics annotated.");
// Command line option to disable indirect-call promotion, defaulting to
// false. This is for debugging purposes.
@@ -80,6 +85,12 @@ static cl::opt<bool> ICPLTOMode("icp-lto", cl::init(false), cl::Hidden,
cl::desc("Run indirect-call promotion in LTO "
"mode"));
+// Set if the pass is called in SamplePGO mode. In SamplePGO mode the pass
+// additionally attaches prof metadata to the created direct call.
+static cl::opt<bool>
+ ICPSamplePGOMode("icp-samplepgo", cl::init(false), cl::Hidden,
+ cl::desc("Run indirect-call promotion in SamplePGO mode"));
+
// If the option is set to true, only call instructions will be considered for
// transformation -- invoke instructions will be ignored.
static cl::opt<bool>
@@ -100,13 +111,51 @@ static cl::opt<bool>
ICPDUMPAFTER("icp-dumpafter", cl::init(false), cl::Hidden,
cl::desc("Dump IR after transformation happens"));
+// The minimum call count to optimize memory intrinsic calls.
+static cl::opt<unsigned>
+ MemOPCountThreshold("pgo-memop-count-threshold", cl::Hidden, cl::ZeroOrMore,
+ cl::init(1000),
+ cl::desc("The minimum count to optimize memory "
+ "intrinsic calls"));
+
+// Command line option to disable memory intrinsic optimization. The default is
+// false. This is for debugging purposes.
+static cl::opt<bool> DisableMemOPOPT("disable-memop-opt", cl::init(false),
+ cl::Hidden, cl::desc("Disable memory intrinsic optimization"));
+
+// The percent threshold to optimize memory intrinsic calls.
+static cl::opt<unsigned>
+ MemOPPercentThreshold("pgo-memop-percent-threshold", cl::init(40),
+ cl::Hidden, cl::ZeroOrMore,
+ cl::desc("The percentage threshold for the "
+ "memory intrinsic calls optimization"));
+
+// Maximum number of versions for optimizing memory intrinsic calls.
+static cl::opt<unsigned>
+ MemOPMaxVersion("pgo-memop-max-version", cl::init(3), cl::Hidden,
+ cl::ZeroOrMore,
+ cl::desc("The max version for the optimized memory "
+ "intrinsic calls"));
+
+// Scale the counts from the annotation using the BB count value.
+static cl::opt<bool>
+ MemOPScaleCount("pgo-memop-scale-count", cl::init(true), cl::Hidden,
+ cl::desc("Scale the memop size counts using the basic "
+ " block count value"));
+
+// This option sets the range of precise profile memop sizes.
+extern cl::opt<std::string> MemOPSizeRange;
+
+// This option sets the value that groups large memop sizes.
+extern cl::opt<unsigned> MemOPSizeLarge;
+
namespace {
class PGOIndirectCallPromotionLegacyPass : public ModulePass {
public:
static char ID;
- PGOIndirectCallPromotionLegacyPass(bool InLTO = false)
- : ModulePass(ID), InLTO(InLTO) {
+ PGOIndirectCallPromotionLegacyPass(bool InLTO = false, bool SamplePGO = false)
+ : ModulePass(ID), InLTO(InLTO), SamplePGO(SamplePGO) {
initializePGOIndirectCallPromotionLegacyPassPass(
*PassRegistry::getPassRegistry());
}
@@ -119,6 +168,28 @@ private:
// If this pass is called in LTO, we need special handling of the PGOFuncName
// for the static variables due to LTO's internalization.
bool InLTO;
+
+ // If this pass is called in SamplePGO, we need to add the prof metadata to
+ // the promoted direct call.
+ bool SamplePGO;
+};
+
+class PGOMemOPSizeOptLegacyPass : public FunctionPass {
+public:
+ static char ID;
+
+ PGOMemOPSizeOptLegacyPass() : FunctionPass(ID) {
+ initializePGOMemOPSizeOptLegacyPassPass(*PassRegistry::getPassRegistry());
+ }
+
+ StringRef getPassName() const override { return "PGOMemOPSize"; }
+
+private:
+ bool runOnFunction(Function &F) override;
+ void getAnalysisUsage(AnalysisUsage &AU) const override {
+ AU.addRequired<BlockFrequencyInfoWrapperPass>();
+ AU.addPreserved<GlobalsAAWrapperPass>();
+ }
};
} // end anonymous namespace
@@ -128,8 +199,22 @@ INITIALIZE_PASS(PGOIndirectCallPromotionLegacyPass, "pgo-icall-prom",
"direct calls.",
false, false)
-ModulePass *llvm::createPGOIndirectCallPromotionLegacyPass(bool InLTO) {
- return new PGOIndirectCallPromotionLegacyPass(InLTO);
+ModulePass *llvm::createPGOIndirectCallPromotionLegacyPass(bool InLTO,
+ bool SamplePGO) {
+ return new PGOIndirectCallPromotionLegacyPass(InLTO, SamplePGO);
+}
+
+char PGOMemOPSizeOptLegacyPass::ID = 0;
+INITIALIZE_PASS_BEGIN(PGOMemOPSizeOptLegacyPass, "pgo-memop-opt",
+ "Optimize memory intrinsic using its size value profile",
+ false, false)
+INITIALIZE_PASS_DEPENDENCY(BlockFrequencyInfoWrapperPass)
+INITIALIZE_PASS_END(PGOMemOPSizeOptLegacyPass, "pgo-memop-opt",
+ "Optimize memory intrinsic using its size value profile",
+ false, false)
+
+FunctionPass *llvm::createPGOMemOPSizeOptLegacyPass() {
+ return new PGOMemOPSizeOptLegacyPass();
}
namespace {
@@ -144,17 +229,11 @@ private:
// defines.
InstrProfSymtab *Symtab;
- enum TargetStatus {
- OK, // Should be able to promote.
- NotAvailableInModule, // Cannot find the target in current module.
- ReturnTypeMismatch, // Return type mismatch b/w target and indirect-call.
- NumArgsMismatch, // Number of arguments does not match.
- ArgTypeMismatch // Type mismatch in the arguments (cannot bitcast).
- };
+ bool SamplePGO;
// Test if we can legally promote this direct-call of Target.
- TargetStatus isPromotionLegal(Instruction *Inst, uint64_t Target,
- Function *&F);
+ bool isPromotionLegal(Instruction *Inst, uint64_t Target, Function *&F,
+ const char **Reason = nullptr);
// A struct that records the direct target and its call count.
struct PromotionCandidate {
@@ -172,91 +251,77 @@ private:
Instruction *Inst, const ArrayRef<InstrProfValueData> &ValueDataRef,
uint64_t TotalCount, uint32_t NumCandidates);
- // Main function that transforms Inst (either a indirect-call instruction, or
- // an invoke instruction , to a conditional call to F. This is like:
- // if (Inst.CalledValue == F)
- // F(...);
- // else
- // Inst(...);
- // end
- // TotalCount is the profile count value that the instruction executes.
- // Count is the profile count value that F is the target function.
- // These two values are being used to update the branch weight.
- void promote(Instruction *Inst, Function *F, uint64_t Count,
- uint64_t TotalCount);
-
// Promote a list of targets for one indirect-call callsite. Return
// the number of promotions.
uint32_t tryToPromote(Instruction *Inst,
const std::vector<PromotionCandidate> &Candidates,
uint64_t &TotalCount);
- static const char *StatusToString(const TargetStatus S) {
- switch (S) {
- case OK:
- return "OK to promote";
- case NotAvailableInModule:
- return "Cannot find the target";
- case ReturnTypeMismatch:
- return "Return type mismatch";
- case NumArgsMismatch:
- return "The number of arguments mismatch";
- case ArgTypeMismatch:
- return "Argument Type mismatch";
- }
- llvm_unreachable("Should not reach here");
- }
-
// Noncopyable
ICallPromotionFunc(const ICallPromotionFunc &other) = delete;
ICallPromotionFunc &operator=(const ICallPromotionFunc &other) = delete;
public:
- ICallPromotionFunc(Function &Func, Module *Modu, InstrProfSymtab *Symtab)
- : F(Func), M(Modu), Symtab(Symtab) {
- }
+ ICallPromotionFunc(Function &Func, Module *Modu, InstrProfSymtab *Symtab,
+ bool SamplePGO)
+ : F(Func), M(Modu), Symtab(Symtab), SamplePGO(SamplePGO) {}
bool processFunction();
};
} // end anonymous namespace
-ICallPromotionFunc::TargetStatus
-ICallPromotionFunc::isPromotionLegal(Instruction *Inst, uint64_t Target,
- Function *&TargetFunction) {
- Function *DirectCallee = Symtab->getFunction(Target);
- if (DirectCallee == nullptr)
- return NotAvailableInModule;
+bool llvm::isLegalToPromote(Instruction *Inst, Function *F,
+ const char **Reason) {
// Check the return type.
Type *CallRetType = Inst->getType();
if (!CallRetType->isVoidTy()) {
- Type *FuncRetType = DirectCallee->getReturnType();
+ Type *FuncRetType = F->getReturnType();
if (FuncRetType != CallRetType &&
- !CastInst::isBitCastable(FuncRetType, CallRetType))
- return ReturnTypeMismatch;
+ !CastInst::isBitCastable(FuncRetType, CallRetType)) {
+ if (Reason)
+ *Reason = "Return type mismatch";
+ return false;
+ }
}
// Check if the arguments are compatible with the parameters
- FunctionType *DirectCalleeType = DirectCallee->getFunctionType();
+ FunctionType *DirectCalleeType = F->getFunctionType();
unsigned ParamNum = DirectCalleeType->getFunctionNumParams();
CallSite CS(Inst);
unsigned ArgNum = CS.arg_size();
- if (ParamNum != ArgNum && !DirectCalleeType->isVarArg())
- return NumArgsMismatch;
+ if (ParamNum != ArgNum && !DirectCalleeType->isVarArg()) {
+ if (Reason)
+ *Reason = "The number of arguments mismatch";
+ return false;
+ }
for (unsigned I = 0; I < ParamNum; ++I) {
Type *PTy = DirectCalleeType->getFunctionParamType(I);
Type *ATy = CS.getArgument(I)->getType();
if (PTy == ATy)
continue;
- if (!CastInst::castIsValid(Instruction::BitCast, CS.getArgument(I), PTy))
- return ArgTypeMismatch;
+ if (!CastInst::castIsValid(Instruction::BitCast, CS.getArgument(I), PTy)) {
+ if (Reason)
+ *Reason = "Argument type mismatch";
+ return false;
+ }
}
DEBUG(dbgs() << " #" << NumOfPGOICallPromotion << " Promote the icall to "
- << Symtab->getFuncName(Target) << "\n");
- TargetFunction = DirectCallee;
- return OK;
+ << F->getName() << "\n");
+ return true;
+}
+
+bool ICallPromotionFunc::isPromotionLegal(Instruction *Inst, uint64_t Target,
+ Function *&TargetFunction,
+ const char **Reason) {
+ TargetFunction = Symtab->getFunction(Target);
+ if (TargetFunction == nullptr) {
+ *Reason = "Cannot find the target";
+ return false;
+ }
+ return isLegalToPromote(Inst, TargetFunction, Reason);
}
// Indirect-call promotion heuristic. The direct targets are sorted based on
@@ -296,10 +361,9 @@ ICallPromotionFunc::getPromotionCandidatesForCallSite(
break;
}
Function *TargetFunction = nullptr;
- TargetStatus Status = isPromotionLegal(Inst, Target, TargetFunction);
- if (Status != OK) {
+ const char *Reason = nullptr;
+ if (!isPromotionLegal(Inst, Target, TargetFunction, &Reason)) {
StringRef TargetFuncName = Symtab->getFuncName(Target);
- const char *Reason = StatusToString(Status);
DEBUG(dbgs() << " Not promote: " << Reason << "\n");
emitOptimizationRemarkMissed(
F.getContext(), "pgo-icall-prom", F, Inst->getDebugLoc(),
@@ -532,8 +596,14 @@ static void insertCallRetPHI(Instruction *Inst, Instruction *CallResult,
// Ret = phi(Ret1, Ret2);
// It adds type casts for the args that do not match the parameters and for
// the return value. Branch weights metadata is also updated.
-void ICallPromotionFunc::promote(Instruction *Inst, Function *DirectCallee,
- uint64_t Count, uint64_t TotalCount) {
+// If \p AttachProfToDirectCall is true, prof metadata is attached to the
+// new direct call to contain \p Count. This is used by SamplePGO inliner to
+// check callsite hotness.
+// Returns the promoted direct call instruction.
+Instruction *llvm::promoteIndirectCall(Instruction *Inst,
+ Function *DirectCallee, uint64_t Count,
+ uint64_t TotalCount,
+ bool AttachProfToDirectCall) {
assert(DirectCallee != nullptr);
BasicBlock *BB = Inst->getParent();
// Just to suppress the non-debug build warning.
@@ -548,6 +618,14 @@ void ICallPromotionFunc::promote(Instruction *Inst, Function *DirectCallee,
Instruction *NewInst =
createDirectCallInst(Inst, DirectCallee, DirectCallBB, MergeBB);
+ if (AttachProfToDirectCall) {
+ SmallVector<uint32_t, 1> Weights;
+ Weights.push_back(Count);
+ MDBuilder MDB(NewInst->getContext());
+ cast<Instruction>(NewInst->stripPointerCasts())
+ ->setMetadata(LLVMContext::MD_prof, MDB.createBranchWeights(Weights));
+ }
+
// Move Inst from MergeBB to IndirectCallBB.
Inst->removeFromParent();
IndirectCallBB->getInstList().insert(IndirectCallBB->getFirstInsertionPt(),
@@ -576,9 +654,10 @@ void ICallPromotionFunc::promote(Instruction *Inst, Function *DirectCallee,
DEBUG(dbgs() << *BB << *DirectCallBB << *IndirectCallBB << *MergeBB << "\n");
emitOptimizationRemark(
- F.getContext(), "pgo-icall-prom", F, Inst->getDebugLoc(),
+ BB->getContext(), "pgo-icall-prom", *BB->getParent(), Inst->getDebugLoc(),
Twine("Promote indirect call to ") + DirectCallee->getName() +
" with count " + Twine(Count) + " out of " + Twine(TotalCount));
+ return NewInst;
}
// Promote indirect-call to conditional direct-call for one callsite.
@@ -589,7 +668,7 @@ uint32_t ICallPromotionFunc::tryToPromote(
for (auto &C : Candidates) {
uint64_t Count = C.Count;
- promote(Inst, C.TargetFunction, Count, TotalCount);
+ promoteIndirectCall(Inst, C.TargetFunction, Count, TotalCount, SamplePGO);
assert(TotalCount >= Count);
TotalCount -= Count;
NumOfPGOICallPromotion++;
@@ -630,7 +709,7 @@ bool ICallPromotionFunc::processFunction() {
}
// A wrapper function that does the actual work.
-static bool promoteIndirectCalls(Module &M, bool InLTO) {
+static bool promoteIndirectCalls(Module &M, bool InLTO, bool SamplePGO) {
if (DisableICP)
return false;
InstrProfSymtab Symtab;
@@ -641,7 +720,7 @@ static bool promoteIndirectCalls(Module &M, bool InLTO) {
continue;
if (F.hasFnAttribute(Attribute::OptimizeNone))
continue;
- ICallPromotionFunc ICallPromotion(F, &M, &Symtab);
+ ICallPromotionFunc ICallPromotion(F, &M, &Symtab, SamplePGO);
bool FuncChanged = ICallPromotion.processFunction();
if (ICPDUMPAFTER && FuncChanged) {
DEBUG(dbgs() << "\n== IR Dump After =="; F.print(dbgs()));
@@ -658,12 +737,289 @@ static bool promoteIndirectCalls(Module &M, bool InLTO) {
bool PGOIndirectCallPromotionLegacyPass::runOnModule(Module &M) {
// Command-line option has the priority for InLTO.
- return promoteIndirectCalls(M, InLTO | ICPLTOMode);
+ return promoteIndirectCalls(M, InLTO | ICPLTOMode,
+ SamplePGO | ICPSamplePGOMode);
}
-PreservedAnalyses PGOIndirectCallPromotion::run(Module &M, ModuleAnalysisManager &AM) {
- if (!promoteIndirectCalls(M, InLTO | ICPLTOMode))
+PreservedAnalyses PGOIndirectCallPromotion::run(Module &M,
+ ModuleAnalysisManager &AM) {
+ if (!promoteIndirectCalls(M, InLTO | ICPLTOMode,
+ SamplePGO | ICPSamplePGOMode))
return PreservedAnalyses::all();
return PreservedAnalyses::none();
}
+
+namespace {
+class MemOPSizeOpt : public InstVisitor<MemOPSizeOpt> {
+public:
+ MemOPSizeOpt(Function &Func, BlockFrequencyInfo &BFI)
+ : Func(Func), BFI(BFI), Changed(false) {
+ ValueDataArray =
+ llvm::make_unique<InstrProfValueData[]>(MemOPMaxVersion + 2);
+ // Get the MemOPSize range information from the option MemOPSizeRange.
+ getMemOPSizeRangeFromOption(MemOPSizeRange, PreciseRangeStart,
+ PreciseRangeLast);
+ }
+ bool isChanged() const { return Changed; }
+ void perform() {
+ WorkList.clear();
+ visit(Func);
+
+ for (auto &MI : WorkList) {
+ ++NumOfPGOMemOPAnnotate;
+ if (perform(MI)) {
+ Changed = true;
+ ++NumOfPGOMemOPOpt;
+ DEBUG(dbgs() << "MemOP calls: " << MI->getCalledFunction()->getName()
+ << "is Transformed.\n");
+ }
+ }
+ }
+
+ void visitMemIntrinsic(MemIntrinsic &MI) {
+ Value *Length = MI.getLength();
+ // Do not perform this on constant-length calls.
+ if (isa<ConstantInt>(Length))
+ return;
+ WorkList.push_back(&MI);
+ }
+
+private:
+ Function &Func;
+ BlockFrequencyInfo &BFI;
+ bool Changed;
+ std::vector<MemIntrinsic *> WorkList;
+ // Start of the precise range.
+ int64_t PreciseRangeStart;
+ // Last value of the precise range.
+ int64_t PreciseRangeLast;
+ // The space to read the profile annotation.
+ std::unique_ptr<InstrProfValueData[]> ValueDataArray;
+ bool perform(MemIntrinsic *MI);
+
+ // This kind shows which group the value falls in. For PreciseValue, we have
+ // the profile count for that value. LargeGroup groups the values that are in
+ // range [LargeValue, +inf). NonLargeGroup groups the rest of values.
+ enum MemOPSizeKind { PreciseValue, NonLargeGroup, LargeGroup };
+
+ MemOPSizeKind getMemOPSizeKind(int64_t Value) const {
+ if (Value == MemOPSizeLarge && MemOPSizeLarge != 0)
+ return LargeGroup;
+ if (Value == PreciseRangeLast + 1)
+ return NonLargeGroup;
+ return PreciseValue;
+ }
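+
+ // Example, assuming the profile runtime clamps out-of-range sizes: with a
+ // precise range ending at 8 and -memop-size-large=8192, a recorded value of
+ // 5 is PreciseValue, 9 (PreciseRangeLast + 1) is NonLargeGroup, and 8192 is
+ // LargeGroup.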
+};
+
+static const char *getMIName(const MemIntrinsic *MI) {
+ switch (MI->getIntrinsicID()) {
+ case Intrinsic::memcpy:
+ return "memcpy";
+ case Intrinsic::memmove:
+ return "memmove";
+ case Intrinsic::memset:
+ return "memset";
+ default:
+ return "unknown";
+ }
+}
+
+static bool isProfitable(uint64_t Count, uint64_t TotalCount) {
+ assert(Count <= TotalCount);
+ if (Count < MemOPCountThreshold)
+ return false;
+ if (Count < TotalCount * MemOPPercentThreshold / 100)
+ return false;
+ return true;
+}
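+// With the defaults above, a size value is versioned only if it was observed
+// at least 1000 times (pgo-memop-count-threshold) and accounts for at least
+// 40% (pgo-memop-percent-threshold) of the remaining total count.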
+
+static inline uint64_t getScaledCount(uint64_t Count, uint64_t Num,
+ uint64_t Denom) {
+ if (!MemOPScaleCount)
+ return Count;
+ bool Overflowed;
+ uint64_t ScaleCount = SaturatingMultiply(Count, Num, &Overflowed);
+ return ScaleCount / Denom;
+}
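+// For example, a raw size count of 800 scaled by a BB count (Num) of 1000
+// over a saved total (Denom) of 2000 yields 800 * 1000 / 2000 = 400; the
+// multiply saturates on overflow, and Overflowed is not otherwise consulted.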
+
+bool MemOPSizeOpt::perform(MemIntrinsic *MI) {
+ assert(MI);
+ if (MI->getIntrinsicID() == Intrinsic::memmove)
+ return false;
+
+ uint32_t NumVals, MaxNumPromotions = MemOPMaxVersion + 2;
+ uint64_t TotalCount;
+ if (!getValueProfDataFromInst(*MI, IPVK_MemOPSize, MaxNumPromotions,
+ ValueDataArray.get(), NumVals, TotalCount))
+ return false;
+
+ uint64_t ActualCount = TotalCount;
+ uint64_t SavedTotalCount = TotalCount;
+ if (MemOPScaleCount) {
+ auto BBEdgeCount = BFI.getBlockProfileCount(MI->getParent());
+ if (!BBEdgeCount)
+ return false;
+ ActualCount = *BBEdgeCount;
+ }
+
+ if (ActualCount < MemOPCountThreshold)
+ return false;
+
+ ArrayRef<InstrProfValueData> VDs(ValueDataArray.get(), NumVals);
+ TotalCount = ActualCount;
+ if (MemOPScaleCount)
+ DEBUG(dbgs() << "Scale counts: numberator = " << ActualCount
+ << " denominator = " << SavedTotalCount << "\n");
+
+ // Keep track of the count for the default case:
+ uint64_t RemainCount = TotalCount;
+ SmallVector<uint64_t, 16> SizeIds;
+ SmallVector<uint64_t, 16> CaseCounts;
+ uint64_t MaxCount = 0;
+ unsigned Version = 0;
+ // Default case is in the front -- save the slot here.
+ CaseCounts.push_back(0);
+ for (auto &VD : VDs) {
+ int64_t V = VD.Value;
+ uint64_t C = VD.Count;
+ if (MemOPScaleCount)
+ C = getScaledCount(C, ActualCount, SavedTotalCount);
+
+ // Only care about precise values here.
+ if (getMemOPSizeKind(V) != PreciseValue)
+ continue;
+
+ // ValueCounts are sorted on the count. Break at the first unprofitable
+ // value.
+ if (!isProfitable(C, RemainCount))
+ break;
+
+ SizeIds.push_back(V);
+ CaseCounts.push_back(C);
+ if (C > MaxCount)
+ MaxCount = C;
+
+ assert(RemainCount >= C);
+ RemainCount -= C;
+
+ if (++Version > MemOPMaxVersion && MemOPMaxVersion != 0)
+ break;
+ }
+
+ if (Version == 0)
+ return false;
+
+ CaseCounts[0] = RemainCount;
+ if (RemainCount > MaxCount)
+ MaxCount = RemainCount;
+
+ uint64_t SumForOpt = TotalCount - RemainCount;
+ DEBUG(dbgs() << "Read one memory intrinsic profile: " << SumForOpt << " vs "
+ << TotalCount << "\n");
+ DEBUG(for (auto &VD : VDs) {
+   dbgs() << " (" << VD.Value << "," << VD.Count << ")\n";
+ });
+
+ DEBUG(dbgs() << "Optimize one memory intrinsic call to " << Version
+ << " Versions\n");
+
+ // mem_op(..., size)
+ // ==>
+ // switch (size) {
+ // case s1:
+ // mem_op(..., s1);
+ // goto merge_bb;
+ // case s2:
+ // mem_op(..., s2);
+ // goto merge_bb;
+ // ...
+ // default:
+ // mem_op(..., size);
+ // goto merge_bb;
+ // }
+ // merge_bb:
+
+ BasicBlock *BB = MI->getParent();
+ DEBUG(dbgs() << "\n\n== Basic Block Before ==\n");
+ DEBUG(dbgs() << *BB << "\n");
+
+ BasicBlock *DefaultBB = SplitBlock(BB, MI);
+ BasicBlock::iterator It(*MI);
+ ++It;
+ assert(It != DefaultBB->end());
+ BasicBlock *MergeBB = SplitBlock(DefaultBB, &(*It));
+ DefaultBB->setName("MemOP.Default");
+ MergeBB->setName("MemOP.Merge");
+
+ auto &Ctx = Func.getContext();
+ IRBuilder<> IRB(BB);
+ BB->getTerminator()->eraseFromParent();
+ Value *SizeVar = MI->getLength();
+ SwitchInst *SI = IRB.CreateSwitch(SizeVar, DefaultBB, SizeIds.size());
+
+ // Clear the value profile data.
+ MI->setMetadata(LLVMContext::MD_prof, nullptr);
+
+ DEBUG(dbgs() << "\n\n== Basic Block After==\n");
+
+ for (uint64_t SizeId : SizeIds) {
+ ConstantInt *CaseSizeId = ConstantInt::get(Type::getInt64Ty(Ctx), SizeId);
+ BasicBlock *CaseBB = BasicBlock::Create(
+ Ctx, Twine("MemOP.Case.") + Twine(SizeId), &Func, DefaultBB);
+ Instruction *NewInst = MI->clone();
+ // Set the length argument to this case's constant size.
+ cast<MemIntrinsic>(NewInst)->setLength(CaseSizeId);
+ CaseBB->getInstList().push_back(NewInst);
+ IRBuilder<> IRBCase(CaseBB);
+ IRBCase.CreateBr(MergeBB);
+ SI->addCase(CaseSizeId, CaseBB);
+ DEBUG(dbgs() << *CaseBB << "\n");
+ }
+ setProfMetadata(Func.getParent(), SI, CaseCounts, MaxCount);
+
+ DEBUG(dbgs() << *BB << "\n");
+ DEBUG(dbgs() << *DefaultBB << "\n");
+ DEBUG(dbgs() << *MergeBB << "\n");
+
+ emitOptimizationRemark(Func.getContext(), "memop-opt", Func,
+ MI->getDebugLoc(),
+ Twine("optimize ") + getMIName(MI) + " with count " +
+ Twine(SumForOpt) + " out of " + Twine(TotalCount) +
+ " for " + Twine(Version) + " versions");
+
+ return true;
+}
+} // namespace
+
+static bool PGOMemOPSizeOptImpl(Function &F, BlockFrequencyInfo &BFI) {
+ if (DisableMemOPOPT)
+ return false;
+
+ if (F.hasFnAttribute(Attribute::OptimizeForSize))
+ return false;
+ MemOPSizeOpt MemOPSizeOpt(F, BFI);
+ MemOPSizeOpt.perform();
+ return MemOPSizeOpt.isChanged();
+}
+
+bool PGOMemOPSizeOptLegacyPass::runOnFunction(Function &F) {
+ BlockFrequencyInfo &BFI =
+ getAnalysis<BlockFrequencyInfoWrapperPass>().getBFI();
+ return PGOMemOPSizeOptImpl(F, BFI);
+}
+
+namespace llvm {
+char &PGOMemOPSizeOptID = PGOMemOPSizeOptLegacyPass::ID;
+
+PreservedAnalyses PGOMemOPSizeOpt::run(Function &F,
+ FunctionAnalysisManager &FAM) {
+ auto &BFI = FAM.getResult<BlockFrequencyAnalysis>(F);
+ bool Changed = PGOMemOPSizeOptImpl(F, BFI);
+ if (!Changed)
+ return PreservedAnalyses::all();
+ auto PA = PreservedAnalyses();
+ PA.preserve<GlobalsAA>();
+ return PA;
+}
+} // namespace llvm
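The new size-profile optimization is exposed to both pass managers. A usage sketch, assuming the pass types added in this patch are declared in the usual headers (llvm/Transforms/PGOInstrumentation.h et al.):

    FunctionPassManager FPM;
    FPM.addPass(PGOMemOPSizeOpt()); // new pass manager

    legacy::FunctionPassManager LegacyFPM(&M); // M is an existing Module
    LegacyFPM.add(createPGOMemOPSizeOptLegacyPass()); // legacy pass manager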
diff --git a/lib/Transforms/Instrumentation/InstrProfiling.cpp b/lib/Transforms/Instrumentation/InstrProfiling.cpp
index adea7e772447..d91ac6ac7883 100644
--- a/lib/Transforms/Instrumentation/InstrProfiling.cpp
+++ b/lib/Transforms/Instrumentation/InstrProfiling.cpp
@@ -14,18 +14,58 @@
//===----------------------------------------------------------------------===//
#include "llvm/Transforms/InstrProfiling.h"
+#include "llvm/ADT/ArrayRef.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/ADT/StringRef.h"
#include "llvm/ADT/Triple.h"
+#include "llvm/ADT/Twine.h"
#include "llvm/Analysis/TargetLibraryInfo.h"
-#include "llvm/IR/IRBuilder.h"
+#include "llvm/IR/Attributes.h"
+#include "llvm/IR/BasicBlock.h"
+#include "llvm/IR/Constant.h"
+#include "llvm/IR/Constants.h"
+#include "llvm/IR/DerivedTypes.h"
+#include "llvm/IR/Function.h"
+#include "llvm/IR/GlobalValue.h"
+#include "llvm/IR/GlobalVariable.h"
+#include "llvm/IR/Instruction.h"
+#include "llvm/IR/Instructions.h"
#include "llvm/IR/IntrinsicInst.h"
+#include "llvm/IR/IRBuilder.h"
#include "llvm/IR/Module.h"
+#include "llvm/IR/Type.h"
+#include "llvm/Pass.h"
#include "llvm/ProfileData/InstrProf.h"
+#include "llvm/Support/Casting.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/Error.h"
+#include "llvm/Support/ErrorHandling.h"
#include "llvm/Transforms/Utils/ModuleUtils.h"
+#include <algorithm>
+#include <cassert>
+#include <cstddef>
+#include <cstdint>
+#include <string>
using namespace llvm;
#define DEBUG_TYPE "instrprof"
+// The start and end values of the precise value profile range for memory
+// intrinsic sizes.
+cl::opt<std::string> MemOPSizeRange(
+ "memop-size-range",
+ cl::desc("Set the range of size in memory intrinsic calls to be profiled "
+ "precisely, in a format of <start_val>:<end_val>"),
+ cl::init(""));
+
+// The value considered to be a large value in memory intrinsic size profiling.
+cl::opt<unsigned> MemOPSizeLarge(
+ "memop-size-large",
+ cl::desc("Set large value thresthold in memory intrinsic size profiling. "
+ "Value of 0 disables the large value profiling."),
+ cl::init(8192));
+
namespace {
cl::opt<bool> DoNameCompression("enable-name-compression",
@@ -41,6 +81,7 @@ cl::opt<bool> ValueProfileStaticAlloc(
"vp-static-alloc",
cl::desc("Do static counter allocation for value profiler"),
cl::init(true));
+
cl::opt<double> NumCountersPerValueSite(
"vp-counters-per-site",
cl::desc("The average number of profile counters allocated "
@@ -56,9 +97,11 @@ class InstrProfilingLegacyPass : public ModulePass {
public:
static char ID;
- InstrProfilingLegacyPass() : ModulePass(ID), InstrProf() {}
+
+ InstrProfilingLegacyPass() : ModulePass(ID) {}
InstrProfilingLegacyPass(const InstrProfOptions &Options)
: ModulePass(ID), InstrProf(Options) {}
+
StringRef getPassName() const override {
return "Frontend instrumentation-based coverage lowering";
}
@@ -73,7 +116,7 @@ public:
}
};
-} // anonymous namespace
+} // end anonymous namespace
PreservedAnalyses InstrProfiling::run(Module &M, ModuleAnalysisManager &AM) {
auto &TLI = AM.getResult<TargetLibraryAnalysis>(M);
@@ -97,30 +140,6 @@ llvm::createInstrProfilingLegacyPass(const InstrProfOptions &Options) {
return new InstrProfilingLegacyPass(Options);
}
-bool InstrProfiling::isMachO() const {
- return Triple(M->getTargetTriple()).isOSBinFormatMachO();
-}
-
-/// Get the section name for the counter variables.
-StringRef InstrProfiling::getCountersSection() const {
- return getInstrProfCountersSectionName(isMachO());
-}
-
-/// Get the section name for the name variables.
-StringRef InstrProfiling::getNameSection() const {
- return getInstrProfNameSectionName(isMachO());
-}
-
-/// Get the section name for the profile data variables.
-StringRef InstrProfiling::getDataSection() const {
- return getInstrProfDataSectionName(isMachO());
-}
-
-/// Get the section name for the coverage mapping data.
-StringRef InstrProfiling::getCoverageSection() const {
- return getInstrProfCoverageSectionName(isMachO());
-}
-
static InstrProfIncrementInst *castToIncrementInst(Instruction *Instr) {
InstrProfIncrementInst *Inc = dyn_cast<InstrProfIncrementInstStep>(Instr);
if (Inc)
@@ -137,6 +156,9 @@ bool InstrProfiling::run(Module &M, const TargetLibraryInfo &TLI) {
NamesSize = 0;
ProfileDataMap.clear();
UsedVars.clear();
+ getMemOPSizeRangeFromOption(MemOPSizeRange, MemOPSizeRangeStart,
+ MemOPSizeRangeLast);
+ TT = Triple(M.getTargetTriple());
// We did not know how many value sites there would be inside
// the instrumented function. This is counting the number of instrumented
@@ -189,17 +211,34 @@ bool InstrProfiling::run(Module &M, const TargetLibraryInfo &TLI) {
}
static Constant *getOrInsertValueProfilingCall(Module &M,
- const TargetLibraryInfo &TLI) {
+ const TargetLibraryInfo &TLI,
+ bool IsRange = false) {
LLVMContext &Ctx = M.getContext();
auto *ReturnTy = Type::getVoidTy(M.getContext());
- Type *ParamTypes[] = {
+
+ Constant *Res;
+ if (!IsRange) {
+ Type *ParamTypes[] = {
#define VALUE_PROF_FUNC_PARAM(ParamType, ParamName, ParamLLVMType) ParamLLVMType
#include "llvm/ProfileData/InstrProfData.inc"
- };
- auto *ValueProfilingCallTy =
- FunctionType::get(ReturnTy, makeArrayRef(ParamTypes), false);
- Constant *Res = M.getOrInsertFunction(getInstrProfValueProfFuncName(),
- ValueProfilingCallTy);
+ };
+ auto *ValueProfilingCallTy =
+ FunctionType::get(ReturnTy, makeArrayRef(ParamTypes), false);
+ Res = M.getOrInsertFunction(getInstrProfValueProfFuncName(),
+ ValueProfilingCallTy);
+ } else {
+ Type *RangeParamTypes[] = {
+#define VALUE_RANGE_PROF 1
+#define VALUE_PROF_FUNC_PARAM(ParamType, ParamName, ParamLLVMType) ParamLLVMType
+#include "llvm/ProfileData/InstrProfData.inc"
+#undef VALUE_RANGE_PROF
+ };
+ auto *ValueRangeProfilingCallTy =
+ FunctionType::get(ReturnTy, makeArrayRef(RangeParamTypes), false);
+ Res = M.getOrInsertFunction(getInstrProfValueRangeProfFuncName(),
+ ValueRangeProfilingCallTy);
+ }
+
if (Function *FunRes = dyn_cast<Function>(Res)) {
if (auto AK = TLI.getExtAttrForI32Param(false))
FunRes->addAttribute(3, AK);
@@ -208,7 +247,6 @@ static Constant *getOrInsertValueProfilingCall(Module &M,
}
void InstrProfiling::computeNumValueSiteCounts(InstrProfValueProfileInst *Ind) {
-
GlobalVariable *Name = Ind->getName();
uint64_t ValueKind = Ind->getValueKind()->getZExtValue();
uint64_t Index = Ind->getIndex()->getZExtValue();
@@ -222,7 +260,6 @@ void InstrProfiling::computeNumValueSiteCounts(InstrProfValueProfileInst *Ind) {
}
void InstrProfiling::lowerValueProfileInst(InstrProfValueProfileInst *Ind) {
-
GlobalVariable *Name = Ind->getName();
auto It = ProfileDataMap.find(Name);
assert(It != ProfileDataMap.end() && It->second.DataVar &&
@@ -235,11 +272,25 @@ void InstrProfiling::lowerValueProfileInst(InstrProfValueProfileInst *Ind) {
Index += It->second.NumValueSites[Kind];
IRBuilder<> Builder(Ind);
- Value *Args[3] = {Ind->getTargetValue(),
- Builder.CreateBitCast(DataVar, Builder.getInt8PtrTy()),
- Builder.getInt32(Index)};
- CallInst *Call = Builder.CreateCall(getOrInsertValueProfilingCall(*M, *TLI),
- Args);
+ bool IsRange = (Ind->getValueKind()->getZExtValue() ==
+ llvm::InstrProfValueKind::IPVK_MemOPSize);
+ CallInst *Call = nullptr;
+ if (!IsRange) {
+ Value *Args[3] = {Ind->getTargetValue(),
+ Builder.CreateBitCast(DataVar, Builder.getInt8PtrTy()),
+ Builder.getInt32(Index)};
+ Call = Builder.CreateCall(getOrInsertValueProfilingCall(*M, *TLI), Args);
+ } else {
+ Value *Args[6] = {
+ Ind->getTargetValue(),
+ Builder.CreateBitCast(DataVar, Builder.getInt8PtrTy()),
+ Builder.getInt32(Index),
+ Builder.getInt64(MemOPSizeRangeStart),
+ Builder.getInt64(MemOPSizeRangeLast),
+ Builder.getInt64(MemOPSizeLarge == 0 ? INT64_MIN : MemOPSizeLarge)};
+ Call =
+ Builder.CreateCall(getOrInsertValueProfilingCall(*M, *TLI, true), Args);
+ }
if (auto AK = TLI->getExtAttrForI32Param(false))
Call->addAttribute(3, AK);
Ind->replaceAllUsesWith(Call);
@@ -259,7 +310,6 @@ void InstrProfiling::lowerIncrement(InstrProfIncrementInst *Inc) {
}
void InstrProfiling::lowerCoverageData(GlobalVariable *CoverageNamesVar) {
-
ConstantArray *Names =
cast<ConstantArray>(CoverageNamesVar->getInitializer());
for (unsigned I = 0, E = Names->getNumOperands(); I < E; ++I) {
@@ -270,7 +320,9 @@ void InstrProfiling::lowerCoverageData(GlobalVariable *CoverageNamesVar) {
Name->setLinkage(GlobalValue::PrivateLinkage);
ReferencedNames.push_back(Name);
+ NC->dropAllReferences();
}
+ CoverageNamesVar->eraseFromParent();
}
/// Get the name of a profiling variable for a particular function.
@@ -367,7 +419,8 @@ InstrProfiling::getOrCreateRegionCounters(InstrProfIncrementInst *Inc) {
Constant::getNullValue(CounterTy),
getVarName(Inc, getInstrProfCountersVarPrefix()));
CounterPtr->setVisibility(NamePtr->getVisibility());
- CounterPtr->setSection(getCountersSection());
+ CounterPtr->setSection(
+ getInstrProfSectionName(IPSK_cnts, TT.getObjectFormat()));
CounterPtr->setAlignment(8);
CounterPtr->setComdat(ProfileVarsComdat);
@@ -376,7 +429,6 @@ InstrProfiling::getOrCreateRegionCounters(InstrProfIncrementInst *Inc) {
// the current function.
Constant *ValuesPtrExpr = ConstantPointerNull::get(Int8PtrTy);
if (ValueProfileStaticAlloc && !needsRuntimeRegistrationOfSectionRange(*M)) {
-
uint64_t NS = 0;
for (uint32_t Kind = IPVK_First; Kind <= IPVK_Last; ++Kind)
NS += PD.NumValueSites[Kind];
@@ -388,11 +440,12 @@ InstrProfiling::getOrCreateRegionCounters(InstrProfIncrementInst *Inc) {
Constant::getNullValue(ValuesTy),
getVarName(Inc, getInstrProfValuesVarPrefix()));
ValuesVar->setVisibility(NamePtr->getVisibility());
- ValuesVar->setSection(getInstrProfValuesSectionName(isMachO()));
+ ValuesVar->setSection(
+ getInstrProfSectionName(IPSK_vals, TT.getObjectFormat()));
ValuesVar->setAlignment(8);
ValuesVar->setComdat(ProfileVarsComdat);
ValuesPtrExpr =
- ConstantExpr::getBitCast(ValuesVar, llvm::Type::getInt8PtrTy(Ctx));
+ ConstantExpr::getBitCast(ValuesVar, Type::getInt8PtrTy(Ctx));
}
}
@@ -421,7 +474,7 @@ InstrProfiling::getOrCreateRegionCounters(InstrProfIncrementInst *Inc) {
ConstantStruct::get(DataTy, DataVals),
getVarName(Inc, getInstrProfDataVarPrefix()));
Data->setVisibility(NamePtr->getVisibility());
- Data->setSection(getDataSection());
+ Data->setSection(getInstrProfSectionName(IPSK_data, TT.getObjectFormat()));
Data->setAlignment(INSTR_PROF_DATA_ALIGNMENT);
Data->setComdat(ProfileVarsComdat);
@@ -481,9 +534,10 @@ void InstrProfiling::emitVNodes() {
ArrayType *VNodesTy = ArrayType::get(VNodeTy, NumCounters);
auto *VNodesVar = new GlobalVariable(
- *M, VNodesTy, false, llvm::GlobalValue::PrivateLinkage,
+ *M, VNodesTy, false, GlobalValue::PrivateLinkage,
Constant::getNullValue(VNodesTy), getInstrProfVNodesVarName());
- VNodesVar->setSection(getInstrProfVNodesSectionName(isMachO()));
+ VNodesVar->setSection(
+ getInstrProfSectionName(IPSK_vnodes, TT.getObjectFormat()));
UsedVars.push_back(VNodesVar);
}
@@ -496,18 +550,22 @@ void InstrProfiling::emitNameData() {
std::string CompressedNameStr;
if (Error E = collectPGOFuncNameStrings(ReferencedNames, CompressedNameStr,
DoNameCompression)) {
- llvm::report_fatal_error(toString(std::move(E)), false);
+ report_fatal_error(toString(std::move(E)), false);
}
auto &Ctx = M->getContext();
- auto *NamesVal = llvm::ConstantDataArray::getString(
+ auto *NamesVal = ConstantDataArray::getString(
Ctx, StringRef(CompressedNameStr), false);
- NamesVar = new llvm::GlobalVariable(*M, NamesVal->getType(), true,
- llvm::GlobalValue::PrivateLinkage,
- NamesVal, getInstrProfNamesVarName());
+ NamesVar = new GlobalVariable(*M, NamesVal->getType(), true,
+ GlobalValue::PrivateLinkage, NamesVal,
+ getInstrProfNamesVarName());
NamesSize = CompressedNameStr.size();
- NamesVar->setSection(getNameSection());
+ NamesVar->setSection(
+ getInstrProfSectionName(IPSK_name, TT.getObjectFormat()));
UsedVars.push_back(NamesVar);
+
+ for (auto *NamePtr : ReferencedNames)
+ NamePtr->eraseFromParent();
}
void InstrProfiling::emitRegistration() {
@@ -550,7 +608,6 @@ void InstrProfiling::emitRegistration() {
}
void InstrProfiling::emitRuntimeHook() {
-
// We expect the linker to be invoked with the -u<hook_var> flag on Linux,
// in which case there is no need to emit the user function.
if (Triple(M->getTargetTriple()).isOSLinux())
@@ -600,7 +657,6 @@ void InstrProfiling::emitInitialization() {
GlobalVariable *ProfileNameVar = new GlobalVariable(
*M, ProfileNameConst->getType(), true, GlobalValue::WeakAnyLinkage,
ProfileNameConst, INSTR_PROF_QUOTE(INSTR_PROF_PROFILE_NAME_VAR));
- Triple TT(M->getTargetTriple());
if (TT.supportsCOMDAT()) {
ProfileNameVar->setLinkage(GlobalValue::ExternalLinkage);
ProfileNameVar->setComdat(M->getOrInsertComdat(
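InstrProfiling now derives every profile section name from one helper keyed on the section kind and the module's object format, with the Triple cached once per run. A hedged sketch using the enumerators seen above:

    Triple TT(M.getTargetTriple());
    // One helper replaces getCountersSection(), getNameSection(), etc.; the
    // object format (ELF, Mach-O, COFF) selects the naming convention.
    auto CntsSection = getInstrProfSectionName(IPSK_cnts, TT.getObjectFormat());
    auto NameSection = getInstrProfSectionName(IPSK_name, TT.getObjectFormat());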
diff --git a/lib/Transforms/Instrumentation/Instrumentation.cpp b/lib/Transforms/Instrumentation/Instrumentation.cpp
index 2963d08752c4..7bb62d2c8455 100644
--- a/lib/Transforms/Instrumentation/Instrumentation.cpp
+++ b/lib/Transforms/Instrumentation/Instrumentation.cpp
@@ -63,6 +63,7 @@ void llvm::initializeInstrumentation(PassRegistry &Registry) {
initializePGOInstrumentationGenLegacyPassPass(Registry);
initializePGOInstrumentationUseLegacyPassPass(Registry);
initializePGOIndirectCallPromotionLegacyPassPass(Registry);
+ initializePGOMemOPSizeOptLegacyPassPass(Registry);
initializeInstrProfilingLegacyPassPass(Registry);
initializeMemorySanitizerPass(Registry);
initializeThreadSanitizerPass(Registry);
diff --git a/lib/Transforms/Instrumentation/MemorySanitizer.cpp b/lib/Transforms/Instrumentation/MemorySanitizer.cpp
index fafb0fcbd017..190f05db4b0c 100644
--- a/lib/Transforms/Instrumentation/MemorySanitizer.cpp
+++ b/lib/Transforms/Instrumentation/MemorySanitizer.cpp
@@ -425,7 +425,7 @@ void MemorySanitizer::initializeCallbacks(Module &M) {
// which is not yet implemented.
StringRef WarningFnName = Recover ? "__msan_warning"
: "__msan_warning_noreturn";
- WarningFn = M.getOrInsertFunction(WarningFnName, IRB.getVoidTy(), nullptr);
+ WarningFn = M.getOrInsertFunction(WarningFnName, IRB.getVoidTy());
for (size_t AccessSizeIndex = 0; AccessSizeIndex < kNumberOfAccessSizes;
AccessSizeIndex++) {
@@ -433,31 +433,31 @@ void MemorySanitizer::initializeCallbacks(Module &M) {
std::string FunctionName = "__msan_maybe_warning_" + itostr(AccessSize);
MaybeWarningFn[AccessSizeIndex] = M.getOrInsertFunction(
FunctionName, IRB.getVoidTy(), IRB.getIntNTy(AccessSize * 8),
- IRB.getInt32Ty(), nullptr);
+ IRB.getInt32Ty());
FunctionName = "__msan_maybe_store_origin_" + itostr(AccessSize);
MaybeStoreOriginFn[AccessSizeIndex] = M.getOrInsertFunction(
FunctionName, IRB.getVoidTy(), IRB.getIntNTy(AccessSize * 8),
- IRB.getInt8PtrTy(), IRB.getInt32Ty(), nullptr);
+ IRB.getInt8PtrTy(), IRB.getInt32Ty());
}
MsanSetAllocaOrigin4Fn = M.getOrInsertFunction(
"__msan_set_alloca_origin4", IRB.getVoidTy(), IRB.getInt8PtrTy(), IntptrTy,
- IRB.getInt8PtrTy(), IntptrTy, nullptr);
+ IRB.getInt8PtrTy(), IntptrTy);
MsanPoisonStackFn =
M.getOrInsertFunction("__msan_poison_stack", IRB.getVoidTy(),
- IRB.getInt8PtrTy(), IntptrTy, nullptr);
+ IRB.getInt8PtrTy(), IntptrTy);
MsanChainOriginFn = M.getOrInsertFunction(
- "__msan_chain_origin", IRB.getInt32Ty(), IRB.getInt32Ty(), nullptr);
+ "__msan_chain_origin", IRB.getInt32Ty(), IRB.getInt32Ty());
MemmoveFn = M.getOrInsertFunction(
"__msan_memmove", IRB.getInt8PtrTy(), IRB.getInt8PtrTy(),
- IRB.getInt8PtrTy(), IntptrTy, nullptr);
+ IRB.getInt8PtrTy(), IntptrTy);
MemcpyFn = M.getOrInsertFunction(
"__msan_memcpy", IRB.getInt8PtrTy(), IRB.getInt8PtrTy(), IRB.getInt8PtrTy(),
- IntptrTy, nullptr);
+ IntptrTy);
MemsetFn = M.getOrInsertFunction(
"__msan_memset", IRB.getInt8PtrTy(), IRB.getInt8PtrTy(), IRB.getInt32Ty(),
- IntptrTy, nullptr);
+ IntptrTy);
// Create globals.
RetvalTLS = new GlobalVariable(
@@ -1037,15 +1037,19 @@ struct MemorySanitizerVisitor : public InstVisitor<MemorySanitizerVisitor> {
OriginMap[V] = Origin;
}
+ Constant *getCleanShadow(Type *OrigTy) {
+ Type *ShadowTy = getShadowTy(OrigTy);
+ if (!ShadowTy)
+ return nullptr;
+ return Constant::getNullValue(ShadowTy);
+ }
+
/// \brief Create a clean shadow value for a given value.
///
/// Clean shadow (all zeroes) means all bits of the value are defined
/// (initialized).
Constant *getCleanShadow(Value *V) {
- Type *ShadowTy = getShadowTy(V);
- if (!ShadowTy)
- return nullptr;
- return Constant::getNullValue(ShadowTy);
+ return getCleanShadow(V->getType());
}
/// \brief Create a dirty shadow of a given shadow type.
@@ -1942,7 +1946,6 @@ struct MemorySanitizerVisitor : public InstVisitor<MemorySanitizerVisitor> {
if (ClCheckAccessAddress)
insertShadowCheck(Addr, &I);
- // FIXME: use ClStoreCleanOrigin
// FIXME: factor out common code from materializeStores
if (MS.TrackOrigins)
IRB.CreateStore(getOrigin(&I, 1), getOriginPtr(Addr, IRB, 1));
@@ -2325,11 +2328,49 @@ struct MemorySanitizerVisitor : public InstVisitor<MemorySanitizerVisitor> {
setOriginForNaryOp(I);
}
+ void handleStmxcsr(IntrinsicInst &I) {
+ IRBuilder<> IRB(&I);
+ Value *Addr = I.getArgOperand(0);
+ Type *Ty = IRB.getInt32Ty();
+ Value *ShadowPtr = getShadowPtr(Addr, Ty, IRB);
+
+ IRB.CreateStore(getCleanShadow(Ty),
+ IRB.CreatePointerCast(ShadowPtr, Ty->getPointerTo()));
+
+ if (ClCheckAccessAddress)
+ insertShadowCheck(Addr, &I);
+ }
+
+ void handleLdmxcsr(IntrinsicInst &I) {
+ if (!InsertChecks) return;
+
+ IRBuilder<> IRB(&I);
+ Value *Addr = I.getArgOperand(0);
+ Type *Ty = IRB.getInt32Ty();
+ unsigned Alignment = 1;
+
+ if (ClCheckAccessAddress)
+ insertShadowCheck(Addr, &I);
+
+ Value *Shadow = IRB.CreateAlignedLoad(getShadowPtr(Addr, Ty, IRB),
+ Alignment, "_ldmxcsr");
+ Value *Origin = MS.TrackOrigins
+ ? IRB.CreateLoad(getOriginPtr(Addr, IRB, Alignment))
+ : getCleanOrigin();
+ insertShadowCheck(Shadow, Origin, &I);
+ }
+
void visitIntrinsicInst(IntrinsicInst &I) {
switch (I.getIntrinsicID()) {
case llvm::Intrinsic::bswap:
handleBswap(I);
break;
+ case llvm::Intrinsic::x86_sse_stmxcsr:
+ handleStmxcsr(I);
+ break;
+ case llvm::Intrinsic::x86_sse_ldmxcsr:
+ handleLdmxcsr(I);
+ break;
case llvm::Intrinsic::x86_avx512_vcvtsd2usi64:
case llvm::Intrinsic::x86_avx512_vcvtsd2usi32:
case llvm::Intrinsic::x86_avx512_vcvtss2usi64:
@@ -2566,10 +2607,10 @@ struct MemorySanitizerVisitor : public InstVisitor<MemorySanitizerVisitor> {
AttrBuilder B;
B.addAttribute(Attribute::ReadOnly)
.addAttribute(Attribute::ReadNone);
- Func->removeAttributes(AttributeSet::FunctionIndex,
- AttributeSet::get(Func->getContext(),
- AttributeSet::FunctionIndex,
- B));
+ Func->removeAttributes(AttributeList::FunctionIndex,
+ AttributeList::get(Func->getContext(),
+ AttributeList::FunctionIndex,
+ B));
}
maybeMarkSanitizerLibraryCallNoBuiltin(Call, TLI);
@@ -2597,7 +2638,7 @@ struct MemorySanitizerVisitor : public InstVisitor<MemorySanitizerVisitor> {
" Shadow: " << *ArgShadow << "\n");
bool ArgIsInitialized = false;
const DataLayout &DL = F.getParent()->getDataLayout();
- if (CS.paramHasAttr(i + 1, Attribute::ByVal)) {
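+ // paramHasAttr now takes a 0-based argument number; the old 1-based
+ // attribute index (where 0 meant the return value) is gone.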
+ if (CS.paramHasAttr(i, Attribute::ByVal)) {
assert(A->getType()->isPointerTy() &&
"ByVal argument is not a pointer!");
Size = DL.getTypeAllocSize(A->getType()->getPointerElementType());
@@ -2690,7 +2731,6 @@ struct MemorySanitizerVisitor : public InstVisitor<MemorySanitizerVisitor> {
} else {
Value *Shadow = getShadow(RetVal);
IRB.CreateAlignedStore(Shadow, ShadowPtr, kShadowTLSAlignment);
- // FIXME: make it conditional if ClStoreCleanOrigin==0
if (MS.TrackOrigins)
IRB.CreateStore(getOrigin(RetVal), getOriginPtrForRetval(IRB));
}
@@ -2717,15 +2757,17 @@ struct MemorySanitizerVisitor : public InstVisitor<MemorySanitizerVisitor> {
setOrigin(&I, getCleanOrigin());
IRBuilder<> IRB(I.getNextNode());
const DataLayout &DL = F.getParent()->getDataLayout();
- uint64_t Size = DL.getTypeAllocSize(I.getAllocatedType());
+ uint64_t TypeSize = DL.getTypeAllocSize(I.getAllocatedType());
+ Value *Len = ConstantInt::get(MS.IntptrTy, TypeSize);
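+ // 'alloca T, N' poisons N * sizeof(T) bytes; N may be dynamic, so the
+ // length is computed with a runtime multiply when needed.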
+ if (I.isArrayAllocation())
+ Len = IRB.CreateMul(Len, I.getArraySize());
if (PoisonStack && ClPoisonStackWithCall) {
IRB.CreateCall(MS.MsanPoisonStackFn,
- {IRB.CreatePointerCast(&I, IRB.getInt8PtrTy()),
- ConstantInt::get(MS.IntptrTy, Size)});
+ {IRB.CreatePointerCast(&I, IRB.getInt8PtrTy()), Len});
} else {
Value *ShadowBase = getShadowPtr(&I, Type::getInt8PtrTy(*MS.C), IRB);
Value *PoisonValue = IRB.getInt8(PoisonStack ? ClPoisonStackPattern : 0);
- IRB.CreateMemSet(ShadowBase, PoisonValue, Size, I.getAlignment());
+ IRB.CreateMemSet(ShadowBase, PoisonValue, Len, I.getAlignment());
}
if (PoisonStack && MS.TrackOrigins) {
@@ -2742,8 +2784,7 @@ struct MemorySanitizerVisitor : public InstVisitor<MemorySanitizerVisitor> {
StackDescription.str());
IRB.CreateCall(MS.MsanSetAllocaOrigin4Fn,
- {IRB.CreatePointerCast(&I, IRB.getInt8PtrTy()),
- ConstantInt::get(MS.IntptrTy, Size),
+ {IRB.CreatePointerCast(&I, IRB.getInt8PtrTy()), Len,
IRB.CreatePointerCast(Descr, IRB.getInt8PtrTy()),
IRB.CreatePointerCast(&F, MS.IntptrTy)});
}
@@ -2935,7 +2976,7 @@ struct VarArgAMD64Helper : public VarArgHelper {
Value *A = *ArgIt;
unsigned ArgNo = CS.getArgumentNo(ArgIt);
bool IsFixed = ArgNo < CS.getFunctionType()->getNumParams();
- bool IsByVal = CS.paramHasAttr(ArgNo + 1, Attribute::ByVal);
+ bool IsByVal = CS.paramHasAttr(ArgNo, Attribute::ByVal);
if (IsByVal) {
// ByVal arguments always go to the overflow area.
// Fixed arguments passed through the overflow area will be stepped
@@ -3456,7 +3497,7 @@ struct VarArgPowerPC64Helper : public VarArgHelper {
Value *A = *ArgIt;
unsigned ArgNo = CS.getArgumentNo(ArgIt);
bool IsFixed = ArgNo < CS.getFunctionType()->getNumParams();
- bool IsByVal = CS.paramHasAttr(ArgNo + 1, Attribute::ByVal);
+ bool IsByVal = CS.paramHasAttr(ArgNo, Attribute::ByVal);
if (IsByVal) {
assert(A->getType()->isPointerTy());
Type *RealTy = A->getType()->getPointerElementType();
@@ -3618,9 +3659,9 @@ bool MemorySanitizer::runOnFunction(Function &F) {
AttrBuilder B;
B.addAttribute(Attribute::ReadOnly)
.addAttribute(Attribute::ReadNone);
- F.removeAttributes(AttributeSet::FunctionIndex,
- AttributeSet::get(F.getContext(),
- AttributeSet::FunctionIndex, B));
+ F.removeAttributes(
+ AttributeList::FunctionIndex,
+ AttributeList::get(F.getContext(), AttributeList::FunctionIndex, B));
return Visitor.runOnFunction();
}
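The AttributeSet-to-AttributeList changes in this file are mechanical: only the scope of FunctionIndex and the list type change, not the call shape. A minimal sketch of the new spelling, mirroring the hunks above (LLVM 4.x-era Function API assumed):

#include "llvm/IR/Attributes.h"
#include "llvm/IR/Function.h"
using namespace llvm;

// MemorySanitizer must drop readonly/readnone before it adds shadow writes.
static void stripMemoryAttrs(Function &F) {
  AttrBuilder B;
  B.addAttribute(Attribute::ReadOnly).addAttribute(Attribute::ReadNone);
  F.removeAttributes(
      AttributeList::FunctionIndex,
      AttributeList::get(F.getContext(), AttributeList::FunctionIndex, B));
}
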
diff --git a/lib/Transforms/Instrumentation/PGOInstrumentation.cpp b/lib/Transforms/Instrumentation/PGOInstrumentation.cpp
index 04f9a64bef9f..990bcec109de 100644
--- a/lib/Transforms/Instrumentation/PGOInstrumentation.cpp
+++ b/lib/Transforms/Instrumentation/PGOInstrumentation.cpp
@@ -58,8 +58,10 @@
#include "llvm/Analysis/BranchProbabilityInfo.h"
#include "llvm/Analysis/CFG.h"
#include "llvm/Analysis/IndirectCallSiteVisitor.h"
+#include "llvm/Analysis/LoopInfo.h"
#include "llvm/IR/CallSite.h"
#include "llvm/IR/DiagnosticInfo.h"
+#include "llvm/IR/Dominators.h"
#include "llvm/IR/GlobalValue.h"
#include "llvm/IR/IRBuilder.h"
#include "llvm/IR/InstIterator.h"
@@ -71,7 +73,9 @@
#include "llvm/ProfileData/InstrProfReader.h"
#include "llvm/ProfileData/ProfileCommon.h"
#include "llvm/Support/BranchProbability.h"
+#include "llvm/Support/DOTGraphTraits.h"
#include "llvm/Support/Debug.h"
+#include "llvm/Support/GraphWriter.h"
#include "llvm/Support/JamCRC.h"
#include "llvm/Transforms/Instrumentation.h"
#include "llvm/Transforms/Utils/BasicBlockUtils.h"
@@ -87,6 +91,7 @@ using namespace llvm;
STATISTIC(NumOfPGOInstrument, "Number of edges instrumented.");
STATISTIC(NumOfPGOSelectInsts, "Number of select instructions instrumented.");
+STATISTIC(NumOfPGOMemIntrinsics, "Number of mem intrinsics instrumented.");
STATISTIC(NumOfPGOEdge, "Number of edges.");
STATISTIC(NumOfPGOBB, "Number of basic-blocks.");
STATISTIC(NumOfPGOSplit, "Number of critical edge splits.");
@@ -116,6 +121,13 @@ static cl::opt<unsigned> MaxNumAnnotations(
cl::desc("Max number of annotations for a single indirect "
"call callsite"));
+// Command line option to set the maximum number of value annotations
+// to write to the metadata for a single memop intrinsic.
+static cl::opt<unsigned> MaxNumMemOPAnnotations(
+ "memop-max-annotations", cl::init(4), cl::Hidden, cl::ZeroOrMore,
+ cl::desc("Max number of preicise value annotations for a single memop"
+ "intrinsic"));
+
// Command line option to control appending FunctionHash to the name of a COMDAT
// function. This is to avoid the hash mismatch caused by the preinliner.
static cl::opt<bool> DoComdatRenaming(
@@ -125,24 +137,59 @@ static cl::opt<bool> DoComdatRenaming(
// Command line option to enable/disable the warning about missing profile
// information.
-static cl::opt<bool> PGOWarnMissing("pgo-warn-missing-function",
- cl::init(false),
- cl::Hidden);
+static cl::opt<bool>
+ PGOWarnMissing("pgo-warn-missing-function", cl::init(false), cl::Hidden,
+ cl::desc("Use this option to turn on/off "
+ "warnings about missing profile data for "
+ "functions."));
// Command line option to enable/disable the warning about a hash mismatch in
// the profile data.
-static cl::opt<bool> NoPGOWarnMismatch("no-pgo-warn-mismatch", cl::init(false),
- cl::Hidden);
+static cl::opt<bool>
+ NoPGOWarnMismatch("no-pgo-warn-mismatch", cl::init(false), cl::Hidden,
+ cl::desc("Use this option to turn off/on "
+ "warnings about profile cfg mismatch."));
// Command line option to enable/disable the warning about a hash mismatch in
// the profile data for Comdat functions, which often turns out to be false
// positive due to the pre-instrumentation inline.
-static cl::opt<bool> NoPGOWarnMismatchComdat("no-pgo-warn-mismatch-comdat",
- cl::init(true), cl::Hidden);
+static cl::opt<bool>
+ NoPGOWarnMismatchComdat("no-pgo-warn-mismatch-comdat", cl::init(true),
+ cl::Hidden,
+ cl::desc("The option is used to turn on/off "
+ "warnings about hash mismatch for comdat "
+ "functions."));
// Command line option to enable/disable select instruction instrumentation.
-static cl::opt<bool> PGOInstrSelect("pgo-instr-select", cl::init(true),
- cl::Hidden);
+static cl::opt<bool>
+ PGOInstrSelect("pgo-instr-select", cl::init(true), cl::Hidden,
+ cl::desc("Use this option to turn on/off SELECT "
+ "instruction instrumentation. "));
+
+// Command line option to turn on CFG dot dump of raw profile counts
+static cl::opt<bool>
+ PGOViewRawCounts("pgo-view-raw-counts", cl::init(false), cl::Hidden,
+ cl::desc("A boolean option to show CFG dag "
+ "with raw profile counts from "
+ "profile data. See also option "
+ "-pgo-view-counts. To limit graph "
+ "display to only one function, use "
+ "filtering option -view-bfi-func-name."));
+
+// Command line option to enable/disable memop intrinsic call size profiling.
+static cl::opt<bool>
+ PGOInstrMemOP("pgo-instr-memop", cl::init(true), cl::Hidden,
+ cl::desc("Use this option to turn on/off "
+ "memory instrinsic size profiling."));
+
+// Command line option to turn on CFG dot dump after profile annotation.
+// Defined in Analysis/BlockFrequencyInfo.cpp: -pgo-view-counts
+extern cl::opt<bool> PGOViewCounts;
+
+// Command line option to specify the name of the function for CFG dump
+// Defined in Analysis/BlockFrequencyInfo.cpp: -view-bfi-func-name=
+extern cl::opt<std::string> ViewBlockFreqFuncName;
+
namespace {
/// The select instruction visitor plays three roles specified
@@ -167,6 +214,7 @@ struct SelectInstVisitor : public InstVisitor<SelectInstVisitor> {
SelectInstVisitor(Function &Func) : F(Func) {}
void countSelects(Function &Func) {
+ NSIs = 0;
Mode = VM_counting;
visit(Func);
}
@@ -196,9 +244,54 @@ struct SelectInstVisitor : public InstVisitor<SelectInstVisitor> {
void annotateOneSelectInst(SelectInst &SI);
// Visit \p SI instruction and perform tasks according to visit mode.
void visitSelectInst(SelectInst &SI);
+ // Return the number of select instructions. This needs to be called after
+ // countSelects().
unsigned getNumOfSelectInsts() const { return NSIs; }
};
+/// Instruction Visitor class to visit memory intrinsic calls.
+struct MemIntrinsicVisitor : public InstVisitor<MemIntrinsicVisitor> {
+ Function &F;
+ unsigned NMemIs = 0; // Number of memIntrinsics instrumented.
+ VisitMode Mode = VM_counting; // Visiting mode.
+ unsigned CurCtrId = 0; // Current counter index.
+ unsigned TotalNumCtrs = 0; // Total number of counters
+ GlobalVariable *FuncNameVar = nullptr;
+ uint64_t FuncHash = 0;
+ PGOUseFunc *UseFunc = nullptr;
+ std::vector<Instruction *> Candidates;
+
+ MemIntrinsicVisitor(Function &Func) : F(Func) {}
+
+ void countMemIntrinsics(Function &Func) {
+ NMemIs = 0;
+ Mode = VM_counting;
+ visit(Func);
+ }
+
+ void instrumentMemIntrinsics(Function &Func, unsigned TotalNC,
+ GlobalVariable *FNV, uint64_t FHash) {
+ Mode = VM_instrument;
+ TotalNumCtrs = TotalNC;
+ FuncHash = FHash;
+ FuncNameVar = FNV;
+ visit(Func);
+ }
+
+ std::vector<Instruction *> findMemIntrinsics(Function &Func) {
+ Candidates.clear();
+ Mode = VM_annotate;
+ visit(Func);
+ return Candidates;
+ }
+
+ // Instrument one memory intrinsic call instruction.
+ void instrumentOneMemIntrinsic(MemIntrinsic &MI);
+ // Visit \p MI instruction and perform tasks according to visit mode.
+ void visitMemIntrinsic(MemIntrinsic &MI);
+ unsigned getNumOfMemIntrinsics() const { return NMemIs; }
+};
+
class PGOInstrumentationGenLegacyPass : public ModulePass {
public:
static char ID;
@@ -316,8 +409,9 @@ private:
std::unordered_multimap<Comdat *, GlobalValue *> &ComdatMembers;
public:
- std::vector<Instruction *> IndirectCallSites;
+ std::vector<std::vector<Instruction *>> ValueSites;
SelectInstVisitor SIVisitor;
+ MemIntrinsicVisitor MIVisitor;
std::string FuncName;
GlobalVariable *FuncNameVar;
// CFG hash value for this function.
@@ -347,13 +441,16 @@ public:
std::unordered_multimap<Comdat *, GlobalValue *> &ComdatMembers,
bool CreateGlobalVar = false, BranchProbabilityInfo *BPI = nullptr,
BlockFrequencyInfo *BFI = nullptr)
- : F(Func), ComdatMembers(ComdatMembers), SIVisitor(Func), FunctionHash(0),
- MST(F, BPI, BFI) {
+ : F(Func), ComdatMembers(ComdatMembers), ValueSites(IPVK_Last + 1),
+ SIVisitor(Func), MIVisitor(Func), FunctionHash(0), MST(F, BPI, BFI) {
// This should be done before CFG hash computation.
SIVisitor.countSelects(Func);
+ MIVisitor.countMemIntrinsics(Func);
NumOfPGOSelectInsts += SIVisitor.getNumOfSelectInsts();
- IndirectCallSites = findIndirectCallSites(Func);
+ NumOfPGOMemIntrinsics += MIVisitor.getNumOfMemIntrinsics();
+ ValueSites[IPVK_IndirectCallTarget] = findIndirectCallSites(Func);
+ ValueSites[IPVK_MemOPSize] = MIVisitor.findMemIntrinsics(Func);
FuncName = getPGOFuncName(F);
computeCFGHash();
@@ -405,7 +502,7 @@ void FuncPGOInstrumentation<Edge, BBInfo>::computeCFGHash() {
}
JC.update(Indexes);
FunctionHash = (uint64_t)SIVisitor.getNumOfSelectInsts() << 56 |
- (uint64_t)IndirectCallSites.size() << 48 |
+ (uint64_t)ValueSites[IPVK_IndirectCallTarget].size() << 48 |
(uint64_t)MST.AllEdges.size() << 32 | JC.getCRC();
}
@@ -552,7 +649,7 @@ static void instrumentOneFunc(
return;
unsigned NumIndirectCallSites = 0;
- for (auto &I : FuncInfo.IndirectCallSites) {
+ for (auto &I : FuncInfo.ValueSites[IPVK_IndirectCallTarget]) {
CallSite CS(I);
Value *Callee = CS.getCalledValue();
DEBUG(dbgs() << "Instrument one indirect call: CallSite Index = "
@@ -565,10 +662,14 @@ static void instrumentOneFunc(
{llvm::ConstantExpr::getBitCast(FuncInfo.FuncNameVar, I8PtrTy),
Builder.getInt64(FuncInfo.FunctionHash),
Builder.CreatePtrToInt(Callee, Builder.getInt64Ty()),
- Builder.getInt32(llvm::InstrProfValueKind::IPVK_IndirectCallTarget),
+ Builder.getInt32(IPVK_IndirectCallTarget),
Builder.getInt32(NumIndirectCallSites++)});
}
NumOfPGOICall += NumIndirectCallSites;
+
+ // Now instrument memop intrinsic calls.
+ FuncInfo.MIVisitor.instrumentMemIntrinsics(
+ F, NumCounters, FuncInfo.FuncNameVar, FuncInfo.FunctionHash);
}
// This class represents a CFG edge in profile use compilation.
@@ -653,8 +754,11 @@ public:
// Set the branch weights based on the count values.
void setBranchWeights();
- // Annotate the indirect call sites.
- void annotateIndirectCallSites();
+ // Annotate the value profile call sites for all value kinds.
+ void annotateValueSites();
+
+ // Annotate the value profile call sites for one value kind.
+ void annotateValueSites(uint32_t Kind);
// The hotness of the function from the profile count.
enum FuncFreqAttr { FFA_Normal, FFA_Cold, FFA_Hot };
@@ -677,6 +781,8 @@ public:
return FuncInfo.findBBInfo(BB);
}
+ Function &getFunc() const { return F; }
+
private:
Function &F;
Module *M;
@@ -761,7 +867,7 @@ void PGOUseFunc::setInstrumentedCounts(
NewEdge1.InMST = true;
getBBInfo(InstrBB).setBBInfoCount(CountValue);
}
- ProfileCountSize = CountFromProfile.size();
+ ProfileCountSize = CountFromProfile.size();
CountPosition = I;
}
@@ -932,21 +1038,6 @@ void PGOUseFunc::populateCounters() {
DEBUG(FuncInfo.dumpInfo("after reading profile."));
}
-static void setProfMetadata(Module *M, Instruction *TI,
- ArrayRef<uint64_t> EdgeCounts, uint64_t MaxCount) {
- MDBuilder MDB(M->getContext());
- assert(MaxCount > 0 && "Bad max count");
- uint64_t Scale = calculateCountScale(MaxCount);
- SmallVector<unsigned, 4> Weights;
- for (const auto &ECI : EdgeCounts)
- Weights.push_back(scaleBranchCount(ECI, Scale));
-
- DEBUG(dbgs() << "Weight is: ";
- for (const auto &W : Weights) { dbgs() << W << " "; }
- dbgs() << "\n";);
- TI->setMetadata(llvm::LLVMContext::MD_prof, MDB.createBranchWeights(Weights));
-}
-
// Assign the scaled count values to the BB with multiple out edges.
void PGOUseFunc::setBranchWeights() {
// Generate MD_prof metadata for every branch instruction.
@@ -990,8 +1081,8 @@ void SelectInstVisitor::instrumentOneSelectInst(SelectInst &SI) {
Builder.CreateCall(
Intrinsic::getDeclaration(M, Intrinsic::instrprof_increment_step),
{llvm::ConstantExpr::getBitCast(FuncNameVar, I8PtrTy),
- Builder.getInt64(FuncHash),
- Builder.getInt32(TotalNumCtrs), Builder.getInt32(*CurCtrIdx), Step});
+ Builder.getInt64(FuncHash), Builder.getInt32(TotalNumCtrs),
+ Builder.getInt32(*CurCtrIdx), Step});
++(*CurCtrIdx);
}
@@ -1020,9 +1111,9 @@ void SelectInstVisitor::visitSelectInst(SelectInst &SI) {
if (SI.getCondition()->getType()->isVectorTy())
return;
- NSIs++;
switch (Mode) {
case VM_counting:
+ NSIs++;
return;
case VM_instrument:
instrumentOneSelectInst(SI);
@@ -1035,35 +1126,79 @@ void SelectInstVisitor::visitSelectInst(SelectInst &SI) {
llvm_unreachable("Unknown visiting mode");
}
-// Traverse all the indirect callsites and annotate the instructions.
-void PGOUseFunc::annotateIndirectCallSites() {
+void MemIntrinsicVisitor::instrumentOneMemIntrinsic(MemIntrinsic &MI) {
+ Module *M = F.getParent();
+ IRBuilder<> Builder(&MI);
+ Type *Int64Ty = Builder.getInt64Ty();
+ Type *I8PtrTy = Builder.getInt8PtrTy();
+ Value *Length = MI.getLength();
+ assert(!isa<ConstantInt>(Length));
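+ // instrprof_value_profile operands: (function-name var, function hash,
+ // profiled value, value kind, counter index); here the profiled value is
+ // the dynamic length of the memory operation.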
+ Builder.CreateCall(
+ Intrinsic::getDeclaration(M, Intrinsic::instrprof_value_profile),
+ {llvm::ConstantExpr::getBitCast(FuncNameVar, I8PtrTy),
+ Builder.getInt64(FuncHash), Builder.CreatePtrToInt(Length, Int64Ty),
+ Builder.getInt32(IPVK_MemOPSize), Builder.getInt32(CurCtrId)});
+ ++CurCtrId;
+}
+
+void MemIntrinsicVisitor::visitMemIntrinsic(MemIntrinsic &MI) {
+ if (!PGOInstrMemOP)
+ return;
+ Value *Length = MI.getLength();
+ // Do not instrument constant-length calls.
+ if (isa<ConstantInt>(Length))
+ return;
+
+ switch (Mode) {
+ case VM_counting:
+ NMemIs++;
+ return;
+ case VM_instrument:
+ instrumentOneMemIntrinsic(MI);
+ return;
+ case VM_annotate:
+ Candidates.push_back(&MI);
+ return;
+ }
+ llvm_unreachable("Unknown visiting mode");
+}
+
+// Traverse all value sites and annotate the instructions for all value kinds.
+void PGOUseFunc::annotateValueSites() {
if (DisableValueProfiling)
return;
// Create the PGOFuncName meta data.
createPGOFuncNameMetadata(F, FuncInfo.FuncName);
- unsigned IndirectCallSiteIndex = 0;
- auto &IndirectCallSites = FuncInfo.IndirectCallSites;
- unsigned NumValueSites =
- ProfileRecord.getNumValueSites(IPVK_IndirectCallTarget);
- if (NumValueSites != IndirectCallSites.size()) {
- std::string Msg =
- std::string("Inconsistent number of indirect call sites: ") +
- F.getName().str();
+ for (uint32_t Kind = IPVK_First; Kind <= IPVK_Last; ++Kind)
+ annotateValueSites(Kind);
+}
+
+// Annotate the instructions for a specific value kind.
+void PGOUseFunc::annotateValueSites(uint32_t Kind) {
+ unsigned ValueSiteIndex = 0;
+ auto &ValueSites = FuncInfo.ValueSites[Kind];
+ unsigned NumValueSites = ProfileRecord.getNumValueSites(Kind);
+ if (NumValueSites != ValueSites.size()) {
auto &Ctx = M->getContext();
- Ctx.diagnose(
- DiagnosticInfoPGOProfile(M->getName().data(), Msg, DS_Warning));
+ Ctx.diagnose(DiagnosticInfoPGOProfile(
+ M->getName().data(),
+ Twine("Inconsistent number of value sites for kind = ") + Twine(Kind) +
+ " in " + F.getName().str(),
+ DS_Warning));
return;
}
- for (auto &I : IndirectCallSites) {
- DEBUG(dbgs() << "Read one indirect call instrumentation: Index="
- << IndirectCallSiteIndex << " out of " << NumValueSites
- << "\n");
- annotateValueSite(*M, *I, ProfileRecord, IPVK_IndirectCallTarget,
- IndirectCallSiteIndex, MaxNumAnnotations);
- IndirectCallSiteIndex++;
+ for (auto &I : ValueSites) {
+ DEBUG(dbgs() << "Read one value site profile (kind = " << Kind
+ << "): Index = " << ValueSiteIndex << " out of "
+ << NumValueSites << "\n");
+ annotateValueSite(*M, *I, ProfileRecord,
+ static_cast<InstrProfValueKind>(Kind), ValueSiteIndex,
+ Kind == IPVK_MemOPSize ? MaxNumMemOPAnnotations
+ : MaxNumAnnotations);
+ ValueSiteIndex++;
}
}
} // end anonymous namespace
@@ -1196,12 +1331,29 @@ static bool annotateAllFunctions(
continue;
Func.populateCounters();
Func.setBranchWeights();
- Func.annotateIndirectCallSites();
+ Func.annotateValueSites();
PGOUseFunc::FuncFreqAttr FreqAttr = Func.getFuncFreqAttr();
if (FreqAttr == PGOUseFunc::FFA_Cold)
ColdFunctions.push_back(&F);
else if (FreqAttr == PGOUseFunc::FFA_Hot)
HotFunctions.push_back(&F);
+ if (PGOViewCounts && (ViewBlockFreqFuncName.empty() ||
+ F.getName().equals(ViewBlockFreqFuncName))) {
+ LoopInfo LI{DominatorTree(F)};
+ std::unique_ptr<BranchProbabilityInfo> NewBPI =
+ llvm::make_unique<BranchProbabilityInfo>(F, LI);
+ std::unique_ptr<BlockFrequencyInfo> NewBFI =
+ llvm::make_unique<BlockFrequencyInfo>(F, *NewBPI, LI);
+
+ NewBFI->view();
+ }
+ if (PGOViewRawCounts && (ViewBlockFreqFuncName.empty() ||
+ F.getName().equals(ViewBlockFreqFuncName))) {
+ if (ViewBlockFreqFuncName.empty())
+ WriteGraph(&Func, Twine("PGORawCounts_") + Func.getFunc().getName());
+ else
+ ViewGraph(&Func, Twine("PGORawCounts_") + Func.getFunc().getName());
+ }
}
M.setProfileSummary(PGOReader->getSummary().getMD(M.getContext()));
// Set function hotness attribute from the profile.
@@ -1257,3 +1409,90 @@ bool PGOInstrumentationUseLegacyPass::runOnModule(Module &M) {
return annotateAllFunctions(M, ProfileFileName, LookupBPI, LookupBFI);
}
+
+namespace llvm {
+void setProfMetadata(Module *M, Instruction *TI, ArrayRef<uint64_t> EdgeCounts,
+ uint64_t MaxCount) {
+ MDBuilder MDB(M->getContext());
+ assert(MaxCount > 0 && "Bad max count");
+ uint64_t Scale = calculateCountScale(MaxCount);
+ SmallVector<unsigned, 4> Weights;
+ for (const auto &ECI : EdgeCounts)
+ Weights.push_back(scaleBranchCount(ECI, Scale));
+
+ DEBUG(dbgs() << "Weight is: ";
+ for (const auto &W : Weights) { dbgs() << W << " "; }
+ dbgs() << "\n";);
+ TI->setMetadata(llvm::LLVMContext::MD_prof, MDB.createBranchWeights(Weights));
+}
+
+template <> struct GraphTraits<PGOUseFunc *> {
+ typedef const BasicBlock *NodeRef;
+ typedef succ_const_iterator ChildIteratorType;
+ typedef pointer_iterator<Function::const_iterator> nodes_iterator;
+
+ static NodeRef getEntryNode(const PGOUseFunc *G) {
+ return &G->getFunc().front();
+ }
+ static ChildIteratorType child_begin(const NodeRef N) {
+ return succ_begin(N);
+ }
+ static ChildIteratorType child_end(const NodeRef N) { return succ_end(N); }
+ static nodes_iterator nodes_begin(const PGOUseFunc *G) {
+ return nodes_iterator(G->getFunc().begin());
+ }
+ static nodes_iterator nodes_end(const PGOUseFunc *G) {
+ return nodes_iterator(G->getFunc().end());
+ }
+};
+
+static std::string getSimpleNodeName(const BasicBlock *Node) {
+ if (!Node->getName().empty())
+ return Node->getName();
+
+ std::string SimpleNodeName;
+ raw_string_ostream OS(SimpleNodeName);
+ Node->printAsOperand(OS, false);
+ return OS.str();
+}
+
+template <> struct DOTGraphTraits<PGOUseFunc *> : DefaultDOTGraphTraits {
+ explicit DOTGraphTraits(bool isSimple = false)
+ : DefaultDOTGraphTraits(isSimple) {}
+
+ static std::string getGraphName(const PGOUseFunc *G) {
+ return G->getFunc().getName();
+ }
+
+ std::string getNodeLabel(const BasicBlock *Node, const PGOUseFunc *Graph) {
+ std::string Result;
+ raw_string_ostream OS(Result);
+
+ OS << getSimpleNodeName(Node) << ":\\l";
+ UseBBInfo *BI = Graph->findBBInfo(Node);
+ OS << "Count : ";
+ if (BI && BI->CountValid)
+ OS << BI->CountValue << "\\l";
+ else
+ OS << "Unknown\\l";
+
+ if (!PGOInstrSelect)
+ return Result;
+
+ for (auto It = Node->begin(); It != Node->end(); ++It) {
+ auto *I = &*It;
+ if (!isa<SelectInst>(I))
+ continue;
+ // Display scaled counts for SELECT instruction:
+ OS << "SELECT : { T = ";
+ uint64_t TC, FC;
+ bool HasProf = I->extractProfMetadata(TC, FC);
+ if (!HasProf)
+ OS << "Unknown, F = Unknown }\\l";
+ else
+ OS << TC << ", F = " << FC << " }\\l";
+ }
+ return Result;
+ }
+};
+} // namespace llvm
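With the GraphTraits and DOTGraphTraits specializations above, the generic GraphWriter entry points can treat a PGOUseFunc like any other graph. A sketch of a caller inside this same file (the Interactive flag is hypothetical; the pass derives the equivalent choice from -view-bfi-func-name, as shown above):

#include "llvm/Support/GraphWriter.h"

// Render the count-annotated CFG: to a .dot file in batch mode, or in a
// viewer when the user filtered to a single function.
static void renderRawCounts(PGOUseFunc *Func, bool Interactive) {
  if (Interactive)
    ViewGraph(Func, Twine("PGORawCounts_") + Func->getFunc().getName());
  else
    WriteGraph(Func, Twine("PGORawCounts_") + Func->getFunc().getName());
}
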
diff --git a/lib/Transforms/Instrumentation/SanitizerCoverage.cpp b/lib/Transforms/Instrumentation/SanitizerCoverage.cpp
index 5b4b1fb77134..fa0c7cc5a4c5 100644
--- a/lib/Transforms/Instrumentation/SanitizerCoverage.cpp
+++ b/lib/Transforms/Instrumentation/SanitizerCoverage.cpp
@@ -78,7 +78,6 @@ static const char *const SanCovTraceSwitchName = "__sanitizer_cov_trace_switch";
static const char *const SanCovModuleCtorName = "sancov.module_ctor";
static const uint64_t SanCtorAndDtorPriority = 2;
-static const char *const SanCovTracePCGuardSection = "__sancov_guards";
static const char *const SanCovTracePCGuardName =
"__sanitizer_cov_trace_pc_guard";
static const char *const SanCovTracePCGuardInitName =
@@ -95,7 +94,7 @@ static cl::opt<unsigned> ClCoverageBlockThreshold(
"sanitizer-coverage-block-threshold",
cl::desc("Use a callback with a guard check inside it if there are"
" more than this number of blocks."),
- cl::Hidden, cl::init(500));
+ cl::Hidden, cl::init(0));
static cl::opt<bool>
ClExperimentalTracing("sanitizer-coverage-experimental-tracing",
@@ -216,6 +215,9 @@ private:
SanCovWithCheckFunction->getNumUses() + SanCovTraceBB->getNumUses() +
SanCovTraceEnter->getNumUses();
}
+ StringRef getSanCovTracePCGuardSection() const;
+ StringRef getSanCovTracePCGuardSectionStart() const;
+ StringRef getSanCovTracePCGuardSectionEnd() const;
Function *SanCovFunction;
Function *SanCovWithCheckFunction;
Function *SanCovIndirCallFunction, *SanCovTracePCIndir;
@@ -227,6 +229,7 @@ private:
InlineAsm *EmptyAsm;
Type *IntptrTy, *IntptrPtrTy, *Int64Ty, *Int64PtrTy, *Int32Ty, *Int32PtrTy;
Module *CurModule;
+ Triple TargetTriple;
LLVMContext *C;
const DataLayout *DL;
@@ -246,6 +249,7 @@ bool SanitizerCoverageModule::runOnModule(Module &M) {
C = &(M.getContext());
DL = &M.getDataLayout();
CurModule = &M;
+ TargetTriple = Triple(M.getTargetTriple());
HasSancovGuardsSection = false;
IntptrTy = Type::getIntNTy(*C, DL->getPointerSizeInBits());
IntptrPtrTy = PointerType::getUnqual(IntptrTy);
@@ -258,39 +262,39 @@ bool SanitizerCoverageModule::runOnModule(Module &M) {
Int32Ty = IRB.getInt32Ty();
SanCovFunction = checkSanitizerInterfaceFunction(
- M.getOrInsertFunction(SanCovName, VoidTy, Int32PtrTy, nullptr));
+ M.getOrInsertFunction(SanCovName, VoidTy, Int32PtrTy));
SanCovWithCheckFunction = checkSanitizerInterfaceFunction(
- M.getOrInsertFunction(SanCovWithCheckName, VoidTy, Int32PtrTy, nullptr));
+ M.getOrInsertFunction(SanCovWithCheckName, VoidTy, Int32PtrTy));
SanCovTracePCIndir = checkSanitizerInterfaceFunction(
- M.getOrInsertFunction(SanCovTracePCIndirName, VoidTy, IntptrTy, nullptr));
+ M.getOrInsertFunction(SanCovTracePCIndirName, VoidTy, IntptrTy));
SanCovIndirCallFunction =
checkSanitizerInterfaceFunction(M.getOrInsertFunction(
- SanCovIndirCallName, VoidTy, IntptrTy, IntptrTy, nullptr));
+ SanCovIndirCallName, VoidTy, IntptrTy, IntptrTy));
SanCovTraceCmpFunction[0] =
checkSanitizerInterfaceFunction(M.getOrInsertFunction(
- SanCovTraceCmp1, VoidTy, IRB.getInt8Ty(), IRB.getInt8Ty(), nullptr));
+ SanCovTraceCmp1, VoidTy, IRB.getInt8Ty(), IRB.getInt8Ty()));
SanCovTraceCmpFunction[1] = checkSanitizerInterfaceFunction(
M.getOrInsertFunction(SanCovTraceCmp2, VoidTy, IRB.getInt16Ty(),
- IRB.getInt16Ty(), nullptr));
+ IRB.getInt16Ty()));
SanCovTraceCmpFunction[2] = checkSanitizerInterfaceFunction(
M.getOrInsertFunction(SanCovTraceCmp4, VoidTy, IRB.getInt32Ty(),
- IRB.getInt32Ty(), nullptr));
+ IRB.getInt32Ty()));
SanCovTraceCmpFunction[3] =
checkSanitizerInterfaceFunction(M.getOrInsertFunction(
- SanCovTraceCmp8, VoidTy, Int64Ty, Int64Ty, nullptr));
+ SanCovTraceCmp8, VoidTy, Int64Ty, Int64Ty));
SanCovTraceDivFunction[0] =
checkSanitizerInterfaceFunction(M.getOrInsertFunction(
- SanCovTraceDiv4, VoidTy, IRB.getInt32Ty(), nullptr));
+ SanCovTraceDiv4, VoidTy, IRB.getInt32Ty()));
SanCovTraceDivFunction[1] =
checkSanitizerInterfaceFunction(M.getOrInsertFunction(
- SanCovTraceDiv8, VoidTy, Int64Ty, nullptr));
+ SanCovTraceDiv8, VoidTy, Int64Ty));
SanCovTraceGepFunction =
checkSanitizerInterfaceFunction(M.getOrInsertFunction(
- SanCovTraceGep, VoidTy, IntptrTy, nullptr));
+ SanCovTraceGep, VoidTy, IntptrTy));
SanCovTraceSwitchFunction =
checkSanitizerInterfaceFunction(M.getOrInsertFunction(
- SanCovTraceSwitchName, VoidTy, Int64Ty, Int64PtrTy, nullptr));
+ SanCovTraceSwitchName, VoidTy, Int64Ty, Int64PtrTy));
// We insert an empty inline asm after cov callbacks to avoid callback merge.
EmptyAsm = InlineAsm::get(FunctionType::get(IRB.getVoidTy(), false),
@@ -298,13 +302,13 @@ bool SanitizerCoverageModule::runOnModule(Module &M) {
/*hasSideEffects=*/true);
SanCovTracePC = checkSanitizerInterfaceFunction(
- M.getOrInsertFunction(SanCovTracePCName, VoidTy, nullptr));
+ M.getOrInsertFunction(SanCovTracePCName, VoidTy));
SanCovTracePCGuard = checkSanitizerInterfaceFunction(M.getOrInsertFunction(
- SanCovTracePCGuardName, VoidTy, Int32PtrTy, nullptr));
+ SanCovTracePCGuardName, VoidTy, Int32PtrTy));
SanCovTraceEnter = checkSanitizerInterfaceFunction(
- M.getOrInsertFunction(SanCovTraceEnterName, VoidTy, Int32PtrTy, nullptr));
+ M.getOrInsertFunction(SanCovTraceEnterName, VoidTy, Int32PtrTy));
SanCovTraceBB = checkSanitizerInterfaceFunction(
- M.getOrInsertFunction(SanCovTraceBBName, VoidTy, Int32PtrTy, nullptr));
+ M.getOrInsertFunction(SanCovTraceBBName, VoidTy, Int32PtrTy));
// At this point we create a dummy array of guards because we don't
// know how many elements we will need.
@@ -363,22 +367,28 @@ bool SanitizerCoverageModule::runOnModule(Module &M) {
if (Options.TracePCGuard) {
if (HasSancovGuardsSection) {
Function *CtorFunc;
- std::string SectionName(SanCovTracePCGuardSection);
- GlobalVariable *Bounds[2];
- const char *Prefix[2] = {"__start_", "__stop_"};
- for (int i = 0; i < 2; i++) {
- Bounds[i] = new GlobalVariable(M, Int32PtrTy, false,
- GlobalVariable::ExternalLinkage, nullptr,
- Prefix[i] + SectionName);
- Bounds[i]->setVisibility(GlobalValue::HiddenVisibility);
- }
+ GlobalVariable *SecStart = new GlobalVariable(
+ M, Int32PtrTy, false, GlobalVariable::ExternalLinkage, nullptr,
+ getSanCovTracePCGuardSectionStart());
+ SecStart->setVisibility(GlobalValue::HiddenVisibility);
+ GlobalVariable *SecEnd = new GlobalVariable(
+ M, Int32PtrTy, false, GlobalVariable::ExternalLinkage, nullptr,
+ getSanCovTracePCGuardSectionEnd());
+ SecEnd->setVisibility(GlobalValue::HiddenVisibility);
+
std::tie(CtorFunc, std::ignore) = createSanitizerCtorAndInitFunctions(
M, SanCovModuleCtorName, SanCovTracePCGuardInitName,
{Int32PtrTy, Int32PtrTy},
- {IRB.CreatePointerCast(Bounds[0], Int32PtrTy),
- IRB.CreatePointerCast(Bounds[1], Int32PtrTy)});
-
- appendToGlobalCtors(M, CtorFunc, SanCtorAndDtorPriority);
+ {IRB.CreatePointerCast(SecStart, Int32PtrTy),
+ IRB.CreatePointerCast(SecEnd, Int32PtrTy)});
+
+ if (TargetTriple.supportsCOMDAT()) {
+ // Use comdat to dedup CtorFunc.
+ CtorFunc->setComdat(M.getOrInsertComdat(SanCovModuleCtorName));
+ appendToGlobalCtors(M, CtorFunc, SanCtorAndDtorPriority, CtorFunc);
+ } else {
+ appendToGlobalCtors(M, CtorFunc, SanCtorAndDtorPriority);
+ }
}
} else if (!Options.TracePC) {
Function *CtorFunc;
@@ -435,6 +445,11 @@ static bool shouldInstrumentBlock(const Function& F, const BasicBlock *BB, const
if (isa<UnreachableInst>(BB->getTerminator()))
return false;
+ // Don't insert coverage into blocks without a valid insertion point
+ // (catchswitch blocks).
+ if (BB->getFirstInsertionPt() == BB->end())
+ return false;
+
if (!ClPruneBlocks || &F.getEntryBlock() == BB)
return true;
@@ -517,7 +532,7 @@ void SanitizerCoverageModule::CreateFunctionGuardArray(size_t NumGuards,
Constant::getNullValue(ArrayOfInt32Ty), "__sancov_gen_");
if (auto Comdat = F.getComdat())
FunctionGuardArray->setComdat(Comdat);
- FunctionGuardArray->setSection(SanCovTracePCGuardSection);
+ FunctionGuardArray->setSection(getSanCovTracePCGuardSection());
}
bool SanitizerCoverageModule::InjectCoverage(Function &F,
@@ -755,6 +770,27 @@ void SanitizerCoverageModule::InjectCoverageAtBlock(Function &F, BasicBlock &BB,
}
}
+StringRef SanitizerCoverageModule::getSanCovTracePCGuardSection() const {
+ if (TargetTriple.getObjectFormat() == Triple::COFF)
+ return ".SCOV$M";
+ if (TargetTriple.isOSBinFormatMachO())
+ return "__DATA,__sancov_guards";
+ return "__sancov_guards";
+}
+
+StringRef SanitizerCoverageModule::getSanCovTracePCGuardSectionStart() const {
+ if (TargetTriple.isOSBinFormatMachO())
+ return "\1section$start$__DATA$__sancov_guards";
+ return "__start___sancov_guards";
+}
+
+StringRef SanitizerCoverageModule::getSanCovTracePCGuardSectionEnd() const {
+ if (TargetTriple.isOSBinFormatMachO())
+ return "\1section$end$__DATA$__sancov_guards";
+ return "__stop___sancov_guards";
+}
+
char SanitizerCoverageModule::ID = 0;
INITIALIZE_PASS_BEGIN(SanitizerCoverageModule, "sancov",
"SanitizerCoverage: TODO."
diff --git a/lib/Transforms/Instrumentation/ThreadSanitizer.cpp b/lib/Transforms/Instrumentation/ThreadSanitizer.cpp
index 52035c79a4a3..9260217bd5e6 100644
--- a/lib/Transforms/Instrumentation/ThreadSanitizer.cpp
+++ b/lib/Transforms/Instrumentation/ThreadSanitizer.cpp
@@ -155,17 +155,18 @@ FunctionPass *llvm::createThreadSanitizerPass() {
void ThreadSanitizer::initializeCallbacks(Module &M) {
IRBuilder<> IRB(M.getContext());
- AttributeSet Attr;
- Attr = Attr.addAttribute(M.getContext(), AttributeSet::FunctionIndex, Attribute::NoUnwind);
+ AttributeList Attr;
+ Attr = Attr.addAttribute(M.getContext(), AttributeList::FunctionIndex,
+ Attribute::NoUnwind);
// Initialize the callbacks.
TsanFuncEntry = checkSanitizerInterfaceFunction(M.getOrInsertFunction(
- "__tsan_func_entry", Attr, IRB.getVoidTy(), IRB.getInt8PtrTy(), nullptr));
+ "__tsan_func_entry", Attr, IRB.getVoidTy(), IRB.getInt8PtrTy()));
TsanFuncExit = checkSanitizerInterfaceFunction(
- M.getOrInsertFunction("__tsan_func_exit", Attr, IRB.getVoidTy(), nullptr));
+ M.getOrInsertFunction("__tsan_func_exit", Attr, IRB.getVoidTy()));
TsanIgnoreBegin = checkSanitizerInterfaceFunction(M.getOrInsertFunction(
- "__tsan_ignore_thread_begin", Attr, IRB.getVoidTy(), nullptr));
+ "__tsan_ignore_thread_begin", Attr, IRB.getVoidTy()));
TsanIgnoreEnd = checkSanitizerInterfaceFunction(M.getOrInsertFunction(
- "__tsan_ignore_thread_end", Attr, IRB.getVoidTy(), nullptr));
+ "__tsan_ignore_thread_end", Attr, IRB.getVoidTy()));
OrdTy = IRB.getInt32Ty();
for (size_t i = 0; i < kNumberOfAccessSizes; ++i) {
const unsigned ByteSize = 1U << i;
@@ -174,31 +175,31 @@ void ThreadSanitizer::initializeCallbacks(Module &M) {
std::string BitSizeStr = utostr(BitSize);
SmallString<32> ReadName("__tsan_read" + ByteSizeStr);
TsanRead[i] = checkSanitizerInterfaceFunction(M.getOrInsertFunction(
- ReadName, Attr, IRB.getVoidTy(), IRB.getInt8PtrTy(), nullptr));
+ ReadName, Attr, IRB.getVoidTy(), IRB.getInt8PtrTy()));
SmallString<32> WriteName("__tsan_write" + ByteSizeStr);
TsanWrite[i] = checkSanitizerInterfaceFunction(M.getOrInsertFunction(
- WriteName, Attr, IRB.getVoidTy(), IRB.getInt8PtrTy(), nullptr));
+ WriteName, Attr, IRB.getVoidTy(), IRB.getInt8PtrTy()));
SmallString<64> UnalignedReadName("__tsan_unaligned_read" + ByteSizeStr);
TsanUnalignedRead[i] =
checkSanitizerInterfaceFunction(M.getOrInsertFunction(
- UnalignedReadName, Attr, IRB.getVoidTy(), IRB.getInt8PtrTy(), nullptr));
+ UnalignedReadName, Attr, IRB.getVoidTy(), IRB.getInt8PtrTy()));
SmallString<64> UnalignedWriteName("__tsan_unaligned_write" + ByteSizeStr);
TsanUnalignedWrite[i] =
checkSanitizerInterfaceFunction(M.getOrInsertFunction(
- UnalignedWriteName, Attr, IRB.getVoidTy(), IRB.getInt8PtrTy(), nullptr));
+ UnalignedWriteName, Attr, IRB.getVoidTy(), IRB.getInt8PtrTy()));
Type *Ty = Type::getIntNTy(M.getContext(), BitSize);
Type *PtrTy = Ty->getPointerTo();
SmallString<32> AtomicLoadName("__tsan_atomic" + BitSizeStr + "_load");
TsanAtomicLoad[i] = checkSanitizerInterfaceFunction(
- M.getOrInsertFunction(AtomicLoadName, Attr, Ty, PtrTy, OrdTy, nullptr));
+ M.getOrInsertFunction(AtomicLoadName, Attr, Ty, PtrTy, OrdTy));
SmallString<32> AtomicStoreName("__tsan_atomic" + BitSizeStr + "_store");
TsanAtomicStore[i] = checkSanitizerInterfaceFunction(M.getOrInsertFunction(
- AtomicStoreName, Attr, IRB.getVoidTy(), PtrTy, Ty, OrdTy, nullptr));
+ AtomicStoreName, Attr, IRB.getVoidTy(), PtrTy, Ty, OrdTy));
for (int op = AtomicRMWInst::FIRST_BINOP;
op <= AtomicRMWInst::LAST_BINOP; ++op) {
@@ -222,33 +223,33 @@ void ThreadSanitizer::initializeCallbacks(Module &M) {
continue;
SmallString<32> RMWName("__tsan_atomic" + itostr(BitSize) + NamePart);
TsanAtomicRMW[op][i] = checkSanitizerInterfaceFunction(
- M.getOrInsertFunction(RMWName, Attr, Ty, PtrTy, Ty, OrdTy, nullptr));
+ M.getOrInsertFunction(RMWName, Attr, Ty, PtrTy, Ty, OrdTy));
}
SmallString<32> AtomicCASName("__tsan_atomic" + BitSizeStr +
"_compare_exchange_val");
TsanAtomicCAS[i] = checkSanitizerInterfaceFunction(M.getOrInsertFunction(
- AtomicCASName, Attr, Ty, PtrTy, Ty, Ty, OrdTy, OrdTy, nullptr));
+ AtomicCASName, Attr, Ty, PtrTy, Ty, Ty, OrdTy, OrdTy));
}
TsanVptrUpdate = checkSanitizerInterfaceFunction(
M.getOrInsertFunction("__tsan_vptr_update", Attr, IRB.getVoidTy(),
- IRB.getInt8PtrTy(), IRB.getInt8PtrTy(), nullptr));
+ IRB.getInt8PtrTy(), IRB.getInt8PtrTy()));
TsanVptrLoad = checkSanitizerInterfaceFunction(M.getOrInsertFunction(
- "__tsan_vptr_read", Attr, IRB.getVoidTy(), IRB.getInt8PtrTy(), nullptr));
+ "__tsan_vptr_read", Attr, IRB.getVoidTy(), IRB.getInt8PtrTy()));
TsanAtomicThreadFence = checkSanitizerInterfaceFunction(M.getOrInsertFunction(
- "__tsan_atomic_thread_fence", Attr, IRB.getVoidTy(), OrdTy, nullptr));
+ "__tsan_atomic_thread_fence", Attr, IRB.getVoidTy(), OrdTy));
TsanAtomicSignalFence = checkSanitizerInterfaceFunction(M.getOrInsertFunction(
- "__tsan_atomic_signal_fence", Attr, IRB.getVoidTy(), OrdTy, nullptr));
+ "__tsan_atomic_signal_fence", Attr, IRB.getVoidTy(), OrdTy));
MemmoveFn = checkSanitizerInterfaceFunction(
M.getOrInsertFunction("memmove", Attr, IRB.getInt8PtrTy(), IRB.getInt8PtrTy(),
- IRB.getInt8PtrTy(), IntptrTy, nullptr));
+ IRB.getInt8PtrTy(), IntptrTy));
MemcpyFn = checkSanitizerInterfaceFunction(
M.getOrInsertFunction("memcpy", Attr, IRB.getInt8PtrTy(), IRB.getInt8PtrTy(),
- IRB.getInt8PtrTy(), IntptrTy, nullptr));
+ IRB.getInt8PtrTy(), IntptrTy));
MemsetFn = checkSanitizerInterfaceFunction(
M.getOrInsertFunction("memset", Attr, IRB.getInt8PtrTy(), IRB.getInt8PtrTy(),
- IRB.getInt32Ty(), IntptrTy, nullptr));
+ IRB.getInt32Ty(), IntptrTy));
}
bool ThreadSanitizer::doInitialization(Module &M) {
@@ -271,7 +272,7 @@ static bool isVtableAccess(Instruction *I) {
// Do not instrument known races/"benign races" that come from compiler
// instrumentation. The user has no way of suppressing them.
-static bool shouldInstrumentReadWriteFromAddress(Value *Addr) {
+static bool shouldInstrumentReadWriteFromAddress(const Module *M, Value *Addr) {
// Peel off GEPs and BitCasts.
Addr = Addr->stripInBoundsOffsets();
@@ -279,8 +280,9 @@ static bool shouldInstrumentReadWriteFromAddress(Value *Addr) {
if (GV->hasSection()) {
StringRef SectionName = GV->getSection();
// Check if the global is in the PGO counters section.
- if (SectionName.endswith(getInstrProfCountersSectionName(
- /*AddSegment=*/false)))
+ auto OF = Triple(M->getTargetTriple()).getObjectFormat();
+ if (SectionName.endswith(
+ getInstrProfSectionName(IPSK_cnts, OF, /*AddSegmentInfo=*/false)))
return false;
}
@@ -342,13 +344,13 @@ void ThreadSanitizer::chooseInstructionsToInstrument(
for (Instruction *I : reverse(Local)) {
if (StoreInst *Store = dyn_cast<StoreInst>(I)) {
Value *Addr = Store->getPointerOperand();
- if (!shouldInstrumentReadWriteFromAddress(Addr))
+ if (!shouldInstrumentReadWriteFromAddress(I->getModule(), Addr))
continue;
WriteTargets.insert(Addr);
} else {
LoadInst *Load = cast<LoadInst>(I);
Value *Addr = Load->getPointerOperand();
- if (!shouldInstrumentReadWriteFromAddress(Addr))
+ if (!shouldInstrumentReadWriteFromAddress(I->getModule(), Addr))
continue;
if (WriteTargets.count(Addr)) {
// We will write to this temp, so no reason to analyze the read.
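The extra Module parameter threaded through shouldInstrumentReadWriteFromAddress exists because profile-counter section names are now object-format dependent, so the triple is needed to reconstruct them. A sketch of the lookup (assuming the InstrProf.h API of this era):

#include "llvm/ADT/Triple.h"
#include "llvm/IR/Module.h"
#include "llvm/ProfileData/InstrProf.h"
using namespace llvm;

// True if Section is the profile-counter section for this module's object
// format (e.g. "__llvm_prf_cnts" on ELF).
static bool isProfCounterSection(const Module &M, StringRef Section) {
  auto OF = Triple(M.getTargetTriple()).getObjectFormat();
  return Section.endswith(
      getInstrProfSectionName(IPSK_cnts, OF, /*AddSegmentInfo=*/false));
}
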
diff --git a/lib/Transforms/ObjCARC/ARCRuntimeEntryPoints.h b/lib/Transforms/ObjCARC/ARCRuntimeEntryPoints.h
index c74827210364..c541fa4c8bee 100644
--- a/lib/Transforms/ObjCARC/ARCRuntimeEntryPoints.h
+++ b/lib/Transforms/ObjCARC/ARCRuntimeEntryPoints.h
@@ -127,9 +127,8 @@ private:
LLVMContext &C = TheModule->getContext();
Type *Params[] = { PointerType::getUnqual(Type::getInt8Ty(C)) };
- AttributeSet Attr =
- AttributeSet().addAttribute(C, AttributeSet::FunctionIndex,
- Attribute::NoUnwind);
+ AttributeList Attr = AttributeList().addAttribute(
+ C, AttributeList::FunctionIndex, Attribute::NoUnwind);
FunctionType *Fty = FunctionType::get(Type::getVoidTy(C), Params,
/*isVarArg=*/false);
return Decl = TheModule->getOrInsertFunction(Name, Fty, Attr);
@@ -144,10 +143,10 @@ private:
Type *I8X = PointerType::getUnqual(Type::getInt8Ty(C));
Type *Params[] = { I8X };
FunctionType *Fty = FunctionType::get(I8X, Params, /*isVarArg=*/false);
- AttributeSet Attr = AttributeSet();
+ AttributeList Attr = AttributeList();
if (NoUnwind)
- Attr = Attr.addAttribute(C, AttributeSet::FunctionIndex,
+ Attr = Attr.addAttribute(C, AttributeList::FunctionIndex,
Attribute::NoUnwind);
return Decl = TheModule->getOrInsertFunction(Name, Fty, Attr);
@@ -162,9 +161,8 @@ private:
Type *I8XX = PointerType::getUnqual(I8X);
Type *Params[] = { I8XX, I8X };
- AttributeSet Attr =
- AttributeSet().addAttribute(C, AttributeSet::FunctionIndex,
- Attribute::NoUnwind);
+ AttributeList Attr = AttributeList().addAttribute(
+ C, AttributeList::FunctionIndex, Attribute::NoUnwind);
Attr = Attr.addAttribute(C, 1, Attribute::NoCapture);
FunctionType *Fty = FunctionType::get(Type::getVoidTy(C), Params,
diff --git a/lib/Transforms/ObjCARC/ObjCARCContract.cpp b/lib/Transforms/ObjCARC/ObjCARCContract.cpp
index 23c1f5990ba5..a86eaaec7641 100644
--- a/lib/Transforms/ObjCARC/ObjCARCContract.cpp
+++ b/lib/Transforms/ObjCARC/ObjCARCContract.cpp
@@ -394,6 +394,7 @@ void ObjCARCContract::tryToContractReleaseIntoStoreStrong(Instruction *Release,
DEBUG(llvm::dbgs() << " New Store Strong: " << *StoreStrong << "\n");
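+ // Retain is also erased before this function returns; make sure the
+ // caller's iterator is not left pointing at a deleted instruction.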
+ if (&*Iter == Retain) ++Iter;
if (&*Iter == Store) ++Iter;
Store->eraseFromParent();
Release->eraseFromParent();
diff --git a/lib/Transforms/ObjCARC/ObjCARCOpts.cpp b/lib/Transforms/ObjCARC/ObjCARCOpts.cpp
index 136d54a6cb75..3c73376c9906 100644
--- a/lib/Transforms/ObjCARC/ObjCARCOpts.cpp
+++ b/lib/Transforms/ObjCARC/ObjCARCOpts.cpp
@@ -85,41 +85,6 @@ static const Value *FindSingleUseIdentifiedObject(const Value *Arg) {
return nullptr;
}
-/// This is a wrapper around getUnderlyingObjCPtr along the lines of
-/// GetUnderlyingObjects except that it returns early when it sees the first
-/// alloca.
-static inline bool AreAnyUnderlyingObjectsAnAlloca(const Value *V,
- const DataLayout &DL) {
- SmallPtrSet<const Value *, 4> Visited;
- SmallVector<const Value *, 4> Worklist;
- Worklist.push_back(V);
- do {
- const Value *P = Worklist.pop_back_val();
- P = GetUnderlyingObjCPtr(P, DL);
-
- if (isa<AllocaInst>(P))
- return true;
-
- if (!Visited.insert(P).second)
- continue;
-
- if (const SelectInst *SI = dyn_cast<const SelectInst>(P)) {
- Worklist.push_back(SI->getTrueValue());
- Worklist.push_back(SI->getFalseValue());
- continue;
- }
-
- if (const PHINode *PN = dyn_cast<const PHINode>(P)) {
- for (Value *IncValue : PN->incoming_values())
- Worklist.push_back(IncValue);
- continue;
- }
- } while (!Worklist.empty());
-
- return false;
-}
-
-
/// @}
///
/// \defgroup ARCOpt ARC Optimization.
@@ -481,9 +446,6 @@ namespace {
/// MDKind identifiers.
ARCMDKindCache MDKindCache;
- // This is used to track if a pointer is stored into an alloca.
- DenseSet<const Value *> MultiOwnersSet;
-
/// A flag indicating whether this optimization pass should run.
bool Run;
@@ -524,8 +486,7 @@ namespace {
PairUpRetainsAndReleases(DenseMap<const BasicBlock *, BBState> &BBStates,
BlotMapVector<Value *, RRInfo> &Retains,
DenseMap<Value *, RRInfo> &Releases, Module *M,
- SmallVectorImpl<Instruction *> &NewRetains,
- SmallVectorImpl<Instruction *> &NewReleases,
+ Instruction *Retain,
SmallVectorImpl<Instruction *> &DeadInsts,
RRInfo &RetainsToMove, RRInfo &ReleasesToMove,
Value *Arg, bool KnownSafe,
@@ -1155,29 +1116,6 @@ bool ObjCARCOpt::VisitInstructionBottomUp(
case ARCInstKind::None:
// These are irrelevant.
return NestingDetected;
- case ARCInstKind::User:
- // If we have a store into an alloca of a pointer we are tracking, the
- // pointer has multiple owners implying that we must be more conservative.
- //
- // This comes up in the context of a pointer being ``KnownSafe''. In the
- // presence of a block being initialized, the frontend will emit the
- // objc_retain on the original pointer and the release on the pointer loaded
- // from the alloca. The optimizer will through the provenance analysis
- // realize that the two are related, but since we only require KnownSafe in
- // one direction, will match the inner retain on the original pointer with
- // the guard release on the original pointer. This is fixed by ensuring that
- // in the presence of allocas we only unconditionally remove pointers if
- // both our retain and our release are KnownSafe.
- if (StoreInst *SI = dyn_cast<StoreInst>(Inst)) {
- const DataLayout &DL = BB->getModule()->getDataLayout();
- if (AreAnyUnderlyingObjectsAnAlloca(SI->getPointerOperand(), DL)) {
- auto I = MyStates.findPtrBottomUpState(
- GetRCIdentityRoot(SI->getValueOperand()));
- if (I != MyStates.bottom_up_ptr_end())
- MultiOwnersSet.insert(I->first);
- }
- }
- break;
default:
break;
}
@@ -1540,8 +1478,7 @@ bool ObjCARCOpt::PairUpRetainsAndReleases(
DenseMap<const BasicBlock *, BBState> &BBStates,
BlotMapVector<Value *, RRInfo> &Retains,
DenseMap<Value *, RRInfo> &Releases, Module *M,
- SmallVectorImpl<Instruction *> &NewRetains,
- SmallVectorImpl<Instruction *> &NewReleases,
+ Instruction *Retain,
SmallVectorImpl<Instruction *> &DeadInsts, RRInfo &RetainsToMove,
RRInfo &ReleasesToMove, Value *Arg, bool KnownSafe,
bool &AnyPairsCompletelyEliminated) {
@@ -1549,7 +1486,6 @@ bool ObjCARCOpt::PairUpRetainsAndReleases(
// is already incremented, we can similarly ignore possible decrements unless
// we are dealing with a retainable object with multiple provenance sources.
bool KnownSafeTD = true, KnownSafeBU = true;
- bool MultipleOwners = false;
bool CFGHazardAfflicted = false;
// Connect the dots between the top-down-collected RetainsToMove and
@@ -1561,14 +1497,13 @@ bool ObjCARCOpt::PairUpRetainsAndReleases(
unsigned OldCount = 0;
unsigned NewCount = 0;
bool FirstRelease = true;
- for (;;) {
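+ // The retain/release worklists are now loop-local: each candidate retain
+ // seeds a fresh NewRetains, and NewReleases is rebuilt on every pass.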
+ for (SmallVector<Instruction *, 4> NewRetains{Retain};;) {
+ SmallVector<Instruction *, 4> NewReleases;
for (Instruction *NewRetain : NewRetains) {
auto It = Retains.find(NewRetain);
assert(It != Retains.end());
const RRInfo &NewRetainRRI = It->second;
KnownSafeTD &= NewRetainRRI.KnownSafe;
- MultipleOwners =
- MultipleOwners || MultiOwnersSet.count(GetArgRCIdentityRoot(NewRetain));
for (Instruction *NewRetainRelease : NewRetainRRI.Calls) {
auto Jt = Releases.find(NewRetainRelease);
if (Jt == Releases.end())
@@ -1691,7 +1626,6 @@ bool ObjCARCOpt::PairUpRetainsAndReleases(
}
}
}
- NewReleases.clear();
if (NewRetains.empty()) break;
}
@@ -1745,10 +1679,6 @@ bool ObjCARCOpt::PerformCodePlacement(
DEBUG(dbgs() << "\n== ObjCARCOpt::PerformCodePlacement ==\n");
bool AnyPairsCompletelyEliminated = false;
- RRInfo RetainsToMove;
- RRInfo ReleasesToMove;
- SmallVector<Instruction *, 4> NewRetains;
- SmallVector<Instruction *, 4> NewReleases;
SmallVector<Instruction *, 8> DeadInsts;
// Visit each retain.
@@ -1780,9 +1710,10 @@ bool ObjCARCOpt::PerformCodePlacement(
// Connect the dots between the top-down-collected RetainsToMove and
// bottom-up-collected ReleasesToMove to form sets of related calls.
- NewRetains.push_back(Retain);
+ RRInfo RetainsToMove, ReleasesToMove;
+
bool PerformMoveCalls = PairUpRetainsAndReleases(
- BBStates, Retains, Releases, M, NewRetains, NewReleases, DeadInsts,
+ BBStates, Retains, Releases, M, Retain, DeadInsts,
RetainsToMove, ReleasesToMove, Arg, KnownSafe,
AnyPairsCompletelyEliminated);
@@ -1792,12 +1723,6 @@ bool ObjCARCOpt::PerformCodePlacement(
MoveCalls(Arg, RetainsToMove, ReleasesToMove,
Retains, Releases, DeadInsts, M);
}
-
- // Clean up state for next retain.
- NewReleases.clear();
- NewRetains.clear();
- RetainsToMove.clear();
- ReleasesToMove.clear();
}
// Now that we're done moving everything, we can delete the newly dead
@@ -1987,9 +1912,6 @@ bool ObjCARCOpt::OptimizeSequences(Function &F) {
Releases,
F.getParent());
- // Cleanup.
- MultiOwnersSet.clear();
-
return AnyPairsCompletelyEliminated && NestingDetected;
}
diff --git a/lib/Transforms/Scalar/ADCE.cpp b/lib/Transforms/Scalar/ADCE.cpp
index adc903cab31b..5b467dc9fe12 100644
--- a/lib/Transforms/Scalar/ADCE.cpp
+++ b/lib/Transforms/Scalar/ADCE.cpp
@@ -41,8 +41,8 @@ using namespace llvm;
STATISTIC(NumRemoved, "Number of instructions removed");
STATISTIC(NumBranchesRemoved, "Number of branch instructions removed");
-// This is a tempoary option until we change the interface
-// to this pass based on optimization level.
+// This is a temporary option until we change the interface to this pass based
+// on optimization level.
static cl::opt<bool> RemoveControlFlowFlag("adce-remove-control-flow",
cl::init(true), cl::Hidden);
@@ -110,7 +110,7 @@ class AggressiveDeadCodeElimination {
/// The set of blocks which we have determined whose control
/// dependence sources must be live and which have not had
- /// those dependences analyized.
+ /// those dependences analyzed.
SmallPtrSet<BasicBlock *, 16> NewLiveBlocks;
/// Set up auxiliary data structures for Instructions and BasicBlocks and
@@ -145,7 +145,7 @@ class AggressiveDeadCodeElimination {
/// was removed.
bool removeDeadInstructions();
- /// Identify connected sections of the control flow grap which have
+ /// Identify connected sections of the control flow graph which have
/// dead terminators and rewrite the control flow graph to remove them.
void updateDeadRegions();
@@ -234,7 +234,7 @@ void AggressiveDeadCodeElimination::initialize() {
return Iter != end() && Iter->second;
}
} State;
-
+
State.reserve(F.size());
// Iterate over blocks in depth-first pre-order and
// treat all edges to a block already seen as loop back edges
@@ -262,25 +262,6 @@ void AggressiveDeadCodeElimination::initialize() {
continue;
auto *BB = BBInfo.BB;
if (!PDT.getNode(BB)) {
- markLive(BBInfo.Terminator);
- continue;
- }
- for (auto *Succ : successors(BB))
- if (!PDT.getNode(Succ)) {
- markLive(BBInfo.Terminator);
- break;
- }
- }
-
- // Mark blocks live if there is no path from the block to the
- // return of the function or a successor for which this is true.
- // This protects IDFCalculator which cannot handle such blocks.
- for (auto &BBInfoPair : BlockInfo) {
- auto &BBInfo = BBInfoPair.second;
- if (BBInfo.terminatorIsLive())
- continue;
- auto *BB = BBInfo.BB;
- if (!PDT.getNode(BB)) {
DEBUG(dbgs() << "Not post-dominated by return: " << BB->getName()
<< '\n';);
markLive(BBInfo.Terminator);
@@ -579,7 +560,7 @@ void AggressiveDeadCodeElimination::updateDeadRegions() {
PreferredSucc = Info;
}
assert((PreferredSucc && PreferredSucc->PostOrder > 0) &&
- "Failed to find safe successor for dead branc");
+ "Failed to find safe successor for dead branch");
bool First = true;
for (auto *Succ : successors(BB)) {
if (!First || Succ != PreferredSucc->BB)
@@ -594,13 +575,13 @@ void AggressiveDeadCodeElimination::updateDeadRegions() {
// reverse top-sort order
void AggressiveDeadCodeElimination::computeReversePostOrder() {
-
- // This provides a post-order numbering of the reverse conrtol flow graph
+
+ // This provides a post-order numbering of the reverse control flow graph
// Note that it is incomplete in the presence of infinite loops but we don't
// need to number blocks which don't reach the end of the function since
// all branches in those blocks are forced live.
-
- // For each block without successors, extend the DFS from the bloack
+
+ // For each block without successors, extend the DFS from the block
// backward through the graph
SmallPtrSet<BasicBlock*, 16> Visited;
unsigned PostOrder = 0;
@@ -644,8 +625,8 @@ PreservedAnalyses ADCEPass::run(Function &F, FunctionAnalysisManager &FAM) {
if (!AggressiveDeadCodeElimination(F, PDT).performDeadCodeElimination())
return PreservedAnalyses::all();
- // FIXME: This should also 'preserve the CFG'.
- auto PA = PreservedAnalyses();
+ PreservedAnalyses PA;
+ PA.preserveSet<CFGAnalyses>();
PA.preserve<GlobalsAA>();
return PA;
}
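The PreservedAnalyses pattern introduced here recurs in the BDCE, ConstantHoisting, and DCE hunks below: a pass that rewrites instructions but never alters the CFG can preserve every CFG-dependent analysis in one call. A generic sketch (the pass and its helper are hypothetical):

#include "llvm/IR/PassManager.h"
using namespace llvm;

struct LocalRewritePass : PassInfoMixin<LocalRewritePass> {
  static bool rewriteInstructions(Function &F); // hypothetical; CFG untouched

  PreservedAnalyses run(Function &F, FunctionAnalysisManager &) {
    if (!rewriteInstructions(F))
      return PreservedAnalyses::all();
    PreservedAnalyses PA;
    PA.preserveSet<CFGAnalyses>(); // DominatorTree, LoopInfo, etc. stay valid
    return PA;
  }
};
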
diff --git a/lib/Transforms/Scalar/AlignmentFromAssumptions.cpp b/lib/Transforms/Scalar/AlignmentFromAssumptions.cpp
index c1df3173c0fc..fd931c521c8f 100644
--- a/lib/Transforms/Scalar/AlignmentFromAssumptions.cpp
+++ b/lib/Transforms/Scalar/AlignmentFromAssumptions.cpp
@@ -438,19 +438,13 @@ AlignmentFromAssumptionsPass::run(Function &F, FunctionAnalysisManager &AM) {
AssumptionCache &AC = AM.getResult<AssumptionAnalysis>(F);
ScalarEvolution &SE = AM.getResult<ScalarEvolutionAnalysis>(F);
DominatorTree &DT = AM.getResult<DominatorTreeAnalysis>(F);
- bool Changed = runImpl(F, AC, &SE, &DT);
-
- // FIXME: We need to invalidate this to avoid PR28400. Is there a better
- // solution?
- AM.invalidate<ScalarEvolutionAnalysis>(F);
-
- if (!Changed)
+ if (!runImpl(F, AC, &SE, &DT))
return PreservedAnalyses::all();
+
PreservedAnalyses PA;
+ PA.preserveSet<CFGAnalyses>();
PA.preserve<AAManager>();
PA.preserve<ScalarEvolutionAnalysis>();
PA.preserve<GlobalsAA>();
- PA.preserve<LoopAnalysis>();
- PA.preserve<DominatorTreeAnalysis>();
return PA;
}
diff --git a/lib/Transforms/Scalar/BDCE.cpp b/lib/Transforms/Scalar/BDCE.cpp
index 251b38707769..61e8700f1cd6 100644
--- a/lib/Transforms/Scalar/BDCE.cpp
+++ b/lib/Transforms/Scalar/BDCE.cpp
@@ -80,8 +80,8 @@ PreservedAnalyses BDCEPass::run(Function &F, FunctionAnalysisManager &AM) {
if (!bitTrackingDCE(F, DB))
return PreservedAnalyses::all();
- // FIXME: This should also 'preserve the CFG'.
- auto PA = PreservedAnalyses();
+ PreservedAnalyses PA;
+ PA.preserveSet<CFGAnalyses>();
PA.preserve<GlobalsAA>();
return PA;
}
diff --git a/lib/Transforms/Scalar/CMakeLists.txt b/lib/Transforms/Scalar/CMakeLists.txt
index 06d3d6a73954..b323ab3bd443 100644
--- a/lib/Transforms/Scalar/CMakeLists.txt
+++ b/lib/Transforms/Scalar/CMakeLists.txt
@@ -16,6 +16,7 @@ add_llvm_library(LLVMScalarOpts
IVUsersPrinter.cpp
InductiveRangeCheckElimination.cpp
IndVarSimplify.cpp
+ InferAddressSpaces.cpp
JumpThreading.cpp
LICM.cpp
LoopAccessAnalysisPrinter.cpp
@@ -29,6 +30,7 @@ add_llvm_library(LLVMScalarOpts
LoopInterchange.cpp
LoopLoadElimination.cpp
LoopPassManager.cpp
+ LoopPredication.cpp
LoopRerollPass.cpp
LoopRotation.cpp
LoopSimplifyCFG.cpp
diff --git a/lib/Transforms/Scalar/ConstantHoisting.cpp b/lib/Transforms/Scalar/ConstantHoisting.cpp
index 38262514c9ec..ee6333e88716 100644
--- a/lib/Transforms/Scalar/ConstantHoisting.cpp
+++ b/lib/Transforms/Scalar/ConstantHoisting.cpp
@@ -136,8 +136,16 @@ Instruction *ConstantHoistingPass::findMatInsertPt(Instruction *Inst,
if (Idx != ~0U && isa<PHINode>(Inst))
return cast<PHINode>(Inst)->getIncomingBlock(Idx)->getTerminator();
- BasicBlock *IDom = DT->getNode(Inst->getParent())->getIDom()->getBlock();
- return IDom->getTerminator();
+ // This must be an EH pad. Iterate over immediate dominators until we find a
+ // non-EH pad. We need to skip over catchswitch blocks, which are both EH pads
+ // and terminators.
+ auto IDom = DT->getNode(Inst->getParent())->getIDom();
+ while (IDom->getBlock()->isEHPad()) {
+ assert(Entry != IDom->getBlock() && "eh pad in entry block");
+ IDom = IDom->getIDom();
+ }
+
+ return IDom->getBlock()->getTerminator();
}
/// \brief Find an insertion point that dominates all uses.
@@ -289,8 +297,8 @@ void ConstantHoistingPass::collectConstantCandidates(Function &Fn) {
// bit widths (APInt Operator- does not like that). If the value cannot be
// represented in uint64 we return an "empty" APInt. This is then interpreted
// as the value is not in range.
-static llvm::Optional<APInt> calculateOffsetDiff(APInt V1, APInt V2)
-{
+static llvm::Optional<APInt> calculateOffsetDiff(const APInt &V1,
+ const APInt &V2) {
llvm::Optional<APInt> Res = None;
unsigned BW = V1.getBitWidth() > V2.getBitWidth() ?
V1.getBitWidth() : V2.getBitWidth();
@@ -623,6 +631,7 @@ PreservedAnalyses ConstantHoistingPass::run(Function &F,
if (!runImpl(F, TTI, DT, F.getEntryBlock()))
return PreservedAnalyses::all();
- // FIXME: This should also 'preserve the CFG'.
- return PreservedAnalyses::none();
+ PreservedAnalyses PA;
+ PA.preserveSet<CFGAnalyses>();
+ return PA;
}
diff --git a/lib/Transforms/Scalar/CorrelatedValuePropagation.cpp b/lib/Transforms/Scalar/CorrelatedValuePropagation.cpp
index 84f9373ae914..c843c61ea94e 100644
--- a/lib/Transforms/Scalar/CorrelatedValuePropagation.cpp
+++ b/lib/Transforms/Scalar/CorrelatedValuePropagation.cpp
@@ -235,9 +235,8 @@ static bool processSwitch(SwitchInst *SI, LazyValueInfo *LVI) {
// Analyse each switch case in turn. This is done in reverse order so that
// removing a case doesn't cause trouble for the iteration.
bool Changed = false;
- for (SwitchInst::CaseIt CI = SI->case_end(), CE = SI->case_begin(); CI-- != CE;
- ) {
- ConstantInt *Case = CI.getCaseValue();
+ for (auto CI = SI->case_begin(), CE = SI->case_end(); CI != CE;) {
+ ConstantInt *Case = CI->getCaseValue();
// Check to see if the switch condition is equal to/not equal to the case
// value on every incoming edge, equal/not equal being the same each time.
@@ -270,8 +269,9 @@ static bool processSwitch(SwitchInst *SI, LazyValueInfo *LVI) {
if (State == LazyValueInfo::False) {
// This case never fires - remove it.
- CI.getCaseSuccessor()->removePredecessor(BB);
- SI->removeCase(CI); // Does not invalidate the iterator.
+ CI->getCaseSuccessor()->removePredecessor(BB);
+ CI = SI->removeCase(CI);
+ CE = SI->case_end();
// The condition can be modified by removePredecessor's PHI simplification
// logic.
@@ -279,7 +279,9 @@ static bool processSwitch(SwitchInst *SI, LazyValueInfo *LVI) {
++NumDeadCases;
Changed = true;
- } else if (State == LazyValueInfo::True) {
+ continue;
+ }
+ if (State == LazyValueInfo::True) {
// This case always fires. Arrange for the switch to be turned into an
// unconditional branch by replacing the switch condition with the case
// value.
@@ -288,6 +290,9 @@ static bool processSwitch(SwitchInst *SI, LazyValueInfo *LVI) {
Changed = true;
break;
}
+
+ // Increment the case iterator since we didn't delete it.
+ ++CI;
}
if (Changed)
@@ -308,7 +313,7 @@ static bool processCallSite(CallSite CS, LazyValueInfo *LVI) {
// Try to mark pointer typed parameters as non-null. We skip the
// relatively expensive analysis for constants which are obviously either
// null or non-null to start with.
- if (Type && !CS.paramHasAttr(ArgNo + 1, Attribute::NonNull) &&
+ if (Type && !CS.paramHasAttr(ArgNo, Attribute::NonNull) &&
!isa<Constant>(V) &&
LVI->getPredicateAt(ICmpInst::ICMP_EQ, V,
ConstantPointerNull::get(Type),
@@ -322,7 +327,7 @@ static bool processCallSite(CallSite CS, LazyValueInfo *LVI) {
if (Indices.empty())
return false;
- AttributeSet AS = CS.getAttributes();
+ AttributeList AS = CS.getAttributes();
LLVMContext &Ctx = CS.getInstruction()->getContext();
AS = AS.addAttribute(Ctx, Indices, Attribute::get(Ctx, Attribute::NonNull));
CS.setAttributes(AS);
@@ -570,10 +575,6 @@ CorrelatedValuePropagationPass::run(Function &F, FunctionAnalysisManager &AM) {
LazyValueInfo *LVI = &AM.getResult<LazyValueAnalysis>(F);
bool Changed = runImpl(F, LVI);
- // FIXME: We need to invalidate LVI to avoid PR28400. Is there a better
- // solution?
- AM.invalidate<LazyValueAnalysis>(F);
-
if (!Changed)
return PreservedAnalyses::all();
PreservedAnalyses PA;
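The switch loop above drops reverse iteration in favor of the erase-returns-next idiom. A sketch of that idiom on its own, assuming removeCase returns the iterator following the erased case (isDeadCase and BB are stand-ins for the LVI query and the switch's parent block):

  for (auto CI = SI->case_begin(), CE = SI->case_end(); CI != CE;) {
    if (isDeadCase(*CI)) {
      CI->getCaseSuccessor()->removePredecessor(BB);
      CI = SI->removeCase(CI); // iterator of the next case
      CE = SI->case_end();     // refresh; removal may invalidate end()
      continue;
    }
    ++CI;
  }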
diff --git a/lib/Transforms/Scalar/DCE.cpp b/lib/Transforms/Scalar/DCE.cpp
index cc2a3cfaf9d1..07a0ba9b1222 100644
--- a/lib/Transforms/Scalar/DCE.cpp
+++ b/lib/Transforms/Scalar/DCE.cpp
@@ -124,9 +124,12 @@ static bool eliminateDeadCode(Function &F, TargetLibraryInfo *TLI) {
}
PreservedAnalyses DCEPass::run(Function &F, FunctionAnalysisManager &AM) {
- if (eliminateDeadCode(F, AM.getCachedResult<TargetLibraryAnalysis>(F)))
- return PreservedAnalyses::none();
- return PreservedAnalyses::all();
+ if (!eliminateDeadCode(F, AM.getCachedResult<TargetLibraryAnalysis>(F)))
+ return PreservedAnalyses::all();
+
+ PreservedAnalyses PA;
+ PA.preserveSet<CFGAnalyses>();
+ return PA;
}
namespace {
diff --git a/lib/Transforms/Scalar/DeadStoreElimination.cpp b/lib/Transforms/Scalar/DeadStoreElimination.cpp
index 4d4c3baef3f5..1ec38e56aa4c 100644
--- a/lib/Transforms/Scalar/DeadStoreElimination.cpp
+++ b/lib/Transforms/Scalar/DeadStoreElimination.cpp
@@ -135,13 +135,13 @@ static bool hasMemoryWrite(Instruction *I, const TargetLibraryInfo &TLI) {
if (auto CS = CallSite(I)) {
if (Function *F = CS.getCalledFunction()) {
StringRef FnName = F->getName();
- if (TLI.has(LibFunc::strcpy) && FnName == TLI.getName(LibFunc::strcpy))
+ if (TLI.has(LibFunc_strcpy) && FnName == TLI.getName(LibFunc_strcpy))
return true;
- if (TLI.has(LibFunc::strncpy) && FnName == TLI.getName(LibFunc::strncpy))
+ if (TLI.has(LibFunc_strncpy) && FnName == TLI.getName(LibFunc_strncpy))
return true;
- if (TLI.has(LibFunc::strcat) && FnName == TLI.getName(LibFunc::strcat))
+ if (TLI.has(LibFunc_strcat) && FnName == TLI.getName(LibFunc_strcat))
return true;
- if (TLI.has(LibFunc::strncat) && FnName == TLI.getName(LibFunc::strncat))
+ if (TLI.has(LibFunc_strncat) && FnName == TLI.getName(LibFunc_strncat))
return true;
}
}
@@ -287,19 +287,14 @@ static uint64_t getPointerSize(const Value *V, const DataLayout &DL,
}
namespace {
-enum OverwriteResult {
- OverwriteBegin,
- OverwriteComplete,
- OverwriteEnd,
- OverwriteUnknown
-};
+enum OverwriteResult { OW_Begin, OW_Complete, OW_End, OW_Unknown };
}
-/// Return 'OverwriteComplete' if a store to the 'Later' location completely
-/// overwrites a store to the 'Earlier' location, 'OverwriteEnd' if the end of
-/// the 'Earlier' location is completely overwritten by 'Later',
-/// 'OverwriteBegin' if the beginning of the 'Earlier' location is overwritten
-/// by 'Later', or 'OverwriteUnknown' if nothing can be determined.
+/// Return 'OW_Complete' if a store to the 'Later' location completely
+/// overwrites a store to the 'Earlier' location, 'OW_End' if the end of the
+/// 'Earlier' location is completely overwritten by 'Later', 'OW_Begin' if the
+/// beginning of the 'Earlier' location is overwritten by 'Later', or
+/// 'OW_Unknown' if nothing can be determined.
static OverwriteResult isOverwrite(const MemoryLocation &Later,
const MemoryLocation &Earlier,
const DataLayout &DL,
@@ -310,7 +305,7 @@ static OverwriteResult isOverwrite(const MemoryLocation &Later,
// If we don't know the sizes of either access, then we can't do a comparison.
if (Later.Size == MemoryLocation::UnknownSize ||
Earlier.Size == MemoryLocation::UnknownSize)
- return OverwriteUnknown;
+ return OW_Unknown;
const Value *P1 = Earlier.Ptr->stripPointerCasts();
const Value *P2 = Later.Ptr->stripPointerCasts();
@@ -320,7 +315,7 @@ static OverwriteResult isOverwrite(const MemoryLocation &Later,
if (P1 == P2) {
// Make sure that the Later size is >= the Earlier size.
if (Later.Size >= Earlier.Size)
- return OverwriteComplete;
+ return OW_Complete;
}
// Check to see if the later store is to the entire object (either a global,
@@ -332,13 +327,13 @@ static OverwriteResult isOverwrite(const MemoryLocation &Later,
// If we can't resolve the same pointers to the same object, then we can't
// analyze them at all.
if (UO1 != UO2)
- return OverwriteUnknown;
+ return OW_Unknown;
// If the "Later" store is to a recognizable object, get its size.
uint64_t ObjectSize = getPointerSize(UO2, DL, TLI);
if (ObjectSize != MemoryLocation::UnknownSize)
if (ObjectSize == Later.Size && ObjectSize >= Earlier.Size)
- return OverwriteComplete;
+ return OW_Complete;
// Okay, we have stores to two completely different pointers. Try to
// decompose the pointer into a "base + constant_offset" form. If the base
@@ -350,7 +345,7 @@ static OverwriteResult isOverwrite(const MemoryLocation &Later,
// If the base pointers still differ, we have two completely different stores.
if (BP1 != BP2)
- return OverwriteUnknown;
+ return OW_Unknown;
// The later store completely overlaps the earlier store if:
//
@@ -370,7 +365,7 @@ static OverwriteResult isOverwrite(const MemoryLocation &Later,
if (EarlierOff >= LaterOff &&
Later.Size >= Earlier.Size &&
uint64_t(EarlierOff - LaterOff) + Earlier.Size <= Later.Size)
- return OverwriteComplete;
+ return OW_Complete;
// We may now overlap, although the overlap is not complete. There might also
// be other incomplete overlaps, and together, they might cover the complete
@@ -428,7 +423,7 @@ static OverwriteResult isOverwrite(const MemoryLocation &Later,
") Composite Later [" <<
ILI->second << ", " << ILI->first << ")\n");
++NumCompletePartials;
- return OverwriteComplete;
+ return OW_Complete;
}
}
@@ -443,7 +438,7 @@ static OverwriteResult isOverwrite(const MemoryLocation &Later,
if (!EnablePartialOverwriteTracking &&
(LaterOff > EarlierOff && LaterOff < int64_t(EarlierOff + Earlier.Size) &&
int64_t(LaterOff + Later.Size) >= int64_t(EarlierOff + Earlier.Size)))
- return OverwriteEnd;
+ return OW_End;
// Finally, we also need to check if the later store overwrites the beginning
// of the earlier store.
@@ -458,11 +453,11 @@ static OverwriteResult isOverwrite(const MemoryLocation &Later,
(LaterOff <= EarlierOff && int64_t(LaterOff + Later.Size) > EarlierOff)) {
assert(int64_t(LaterOff + Later.Size) <
int64_t(EarlierOff + Earlier.Size) &&
- "Expect to be handled as OverwriteComplete");
- return OverwriteBegin;
+ "Expect to be handled as OW_Complete");
+ return OW_Begin;
}
// Otherwise, they don't completely overlap.
- return OverwriteUnknown;
+ return OW_Unknown;
}
/// If 'Inst' might be a self read (i.e. a noop copy of a
@@ -551,7 +546,7 @@ static bool memoryIsNotModifiedBetween(Instruction *FirstI,
Instruction *I = &*BI;
if (I->mayWriteToMemory() && I != SecondI) {
auto Res = AA->getModRefInfo(I, MemLoc);
- if (Res != MRI_NoModRef)
+ if (Res & MRI_Mod)
return false;
}
}
@@ -909,7 +904,7 @@ static bool tryToShortenBegin(Instruction *EarlierWrite,
if (LaterStart <= EarlierStart && LaterStart + LaterSize > EarlierStart) {
assert(LaterStart + LaterSize < EarlierStart + EarlierSize &&
- "Should have been handled as OverwriteComplete");
+ "Should have been handled as OW_Complete");
if (tryToShorten(EarlierWrite, EarlierStart, EarlierSize, LaterStart,
LaterSize, false)) {
IntervalMap.erase(OII);
@@ -1105,7 +1100,7 @@ static bool eliminateDeadStores(BasicBlock &BB, AliasAnalysis *AA,
OverwriteResult OR =
isOverwrite(Loc, DepLoc, DL, *TLI, DepWriteOffset, InstWriteOffset,
DepWrite, IOL);
- if (OR == OverwriteComplete) {
+ if (OR == OW_Complete) {
DEBUG(dbgs() << "DSE: Remove Dead Store:\n DEAD: "
<< *DepWrite << "\n KILLER: " << *Inst << '\n');
@@ -1117,15 +1112,15 @@ static bool eliminateDeadStores(BasicBlock &BB, AliasAnalysis *AA,
// We erased DepWrite; start over.
InstDep = MD->getDependency(Inst);
continue;
- } else if ((OR == OverwriteEnd && isShortenableAtTheEnd(DepWrite)) ||
- ((OR == OverwriteBegin &&
+ } else if ((OR == OW_End && isShortenableAtTheEnd(DepWrite)) ||
+ ((OR == OW_Begin &&
isShortenableAtTheBeginning(DepWrite)))) {
assert(!EnablePartialOverwriteTracking && "Do not expect to perform "
"when partial-overwrite "
"tracking is enabled");
int64_t EarlierSize = DepLoc.Size;
int64_t LaterSize = Loc.Size;
- bool IsOverwriteEnd = (OR == OverwriteEnd);
+ bool IsOverwriteEnd = (OR == OW_End);
MadeChange |= tryToShorten(DepWrite, DepWriteOffset, EarlierSize,
InstWriteOffset, LaterSize, IsOverwriteEnd);
}
@@ -1186,8 +1181,9 @@ PreservedAnalyses DSEPass::run(Function &F, FunctionAnalysisManager &AM) {
if (!eliminateDeadStores(F, AA, MD, DT, TLI))
return PreservedAnalyses::all();
+
PreservedAnalyses PA;
- PA.preserve<DominatorTreeAnalysis>();
+ PA.preserveSet<CFGAnalyses>();
PA.preserve<GlobalsAA>();
PA.preserve<MemoryDependenceAnalysis>();
return PA;
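The memoryIsNotModifiedBetween change relies on ModRefInfo being a bitmask, with MRI_Ref and MRI_Mod as independent bits and MRI_ModRef as their union. A sketch of the tightened test under that assumption:

  ModRefInfo Res = AA->getModRefInfo(I, MemLoc);
  // Old: bail on any ModRef (Res != MRI_NoModRef), which also rejected
  // harmless read-only accesses. New: bail only if the Mod bit is set.
  if (Res & MRI_Mod)
    return false; // I may write to MemLoc between the two instructions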
diff --git a/lib/Transforms/Scalar/EarlyCSE.cpp b/lib/Transforms/Scalar/EarlyCSE.cpp
index 16e08ee58fbe..04479b6e49ac 100644
--- a/lib/Transforms/Scalar/EarlyCSE.cpp
+++ b/lib/Transforms/Scalar/EarlyCSE.cpp
@@ -19,6 +19,8 @@
#include "llvm/Analysis/AssumptionCache.h"
#include "llvm/Analysis/GlobalsModRef.h"
#include "llvm/Analysis/InstructionSimplify.h"
+#include "llvm/Analysis/MemorySSA.h"
+#include "llvm/Analysis/MemorySSAUpdater.h"
#include "llvm/Analysis/TargetLibraryInfo.h"
#include "llvm/Analysis/TargetTransformInfo.h"
#include "llvm/IR/DataLayout.h"
@@ -32,7 +34,6 @@
#include "llvm/Support/raw_ostream.h"
#include "llvm/Transforms/Scalar.h"
#include "llvm/Transforms/Utils/Local.h"
-#include "llvm/Transforms/Utils/MemorySSA.h"
#include <deque>
using namespace llvm;
using namespace llvm::PatternMatch;
@@ -253,6 +254,7 @@ public:
DominatorTree &DT;
AssumptionCache &AC;
MemorySSA *MSSA;
+ std::unique_ptr<MemorySSAUpdater> MSSAUpdater;
typedef RecyclingAllocator<
BumpPtrAllocator, ScopedHashTableVal<SimpleValue, Value *>> AllocatorTy;
typedef ScopedHashTable<SimpleValue, Value *, DenseMapInfo<SimpleValue>,
@@ -315,7 +317,9 @@ public:
/// \brief Set up the EarlyCSE runner for a particular function.
EarlyCSE(const TargetLibraryInfo &TLI, const TargetTransformInfo &TTI,
DominatorTree &DT, AssumptionCache &AC, MemorySSA *MSSA)
- : TLI(TLI), TTI(TTI), DT(DT), AC(AC), MSSA(MSSA), CurrentGeneration(0) {}
+ : TLI(TLI), TTI(TTI), DT(DT), AC(AC), MSSA(MSSA),
+ MSSAUpdater(make_unique<MemorySSAUpdater>(MSSA)), CurrentGeneration(0) {
+ }
bool run();
@@ -388,7 +392,7 @@ private:
ParseMemoryInst(Instruction *Inst, const TargetTransformInfo &TTI)
: IsTargetMemInst(false), Inst(Inst) {
if (IntrinsicInst *II = dyn_cast<IntrinsicInst>(Inst))
- if (TTI.getTgtMemIntrinsic(II, Info) && Info.NumMemRefs == 1)
+ if (TTI.getTgtMemIntrinsic(II, Info))
IsTargetMemInst = true;
}
bool isLoad() const {
@@ -400,17 +404,14 @@ private:
return isa<StoreInst>(Inst);
}
bool isAtomic() const {
- if (IsTargetMemInst) {
- assert(Info.IsSimple && "need to refine IsSimple in TTI");
- return false;
- }
+ if (IsTargetMemInst)
+ return Info.Ordering != AtomicOrdering::NotAtomic;
return Inst->isAtomic();
}
bool isUnordered() const {
- if (IsTargetMemInst) {
- assert(Info.IsSimple && "need to refine IsSimple in TTI");
- return true;
- }
+ if (IsTargetMemInst)
+ return Info.isUnordered();
+
if (LoadInst *LI = dyn_cast<LoadInst>(Inst)) {
return LI->isUnordered();
} else if (StoreInst *SI = dyn_cast<StoreInst>(Inst)) {
@@ -421,10 +422,9 @@ private:
}
bool isVolatile() const {
- if (IsTargetMemInst) {
- assert(Info.IsSimple && "need to refine IsSimple in TTI");
- return false;
- }
+ if (IsTargetMemInst)
+ return Info.IsVolatile;
+
if (LoadInst *LI = dyn_cast<LoadInst>(Inst)) {
return LI->isVolatile();
} else if (StoreInst *SI = dyn_cast<StoreInst>(Inst)) {
@@ -517,7 +517,7 @@ private:
if (MemoryPhi *MP = dyn_cast<MemoryPhi>(U))
PhisToCheck.push_back(MP);
- MSSA->removeMemoryAccess(WI);
+ MSSAUpdater->removeMemoryAccess(WI);
for (MemoryPhi *MP : PhisToCheck) {
MemoryAccess *FirstIn = MP->getIncomingValue(0);
@@ -587,27 +587,28 @@ bool EarlyCSE::processNode(DomTreeNode *Node) {
// which reaches this block where the condition might hold a different
// value. Since we're adding this to the scoped hash table (like any other
// def), it will have been popped if we encounter a future merge block.
- if (BasicBlock *Pred = BB->getSinglePredecessor())
- if (auto *BI = dyn_cast<BranchInst>(Pred->getTerminator()))
- if (BI->isConditional())
- if (auto *CondInst = dyn_cast<Instruction>(BI->getCondition()))
- if (SimpleValue::canHandle(CondInst)) {
- assert(BI->getSuccessor(0) == BB || BI->getSuccessor(1) == BB);
- auto *ConditionalConstant = (BI->getSuccessor(0) == BB) ?
- ConstantInt::getTrue(BB->getContext()) :
- ConstantInt::getFalse(BB->getContext());
- AvailableValues.insert(CondInst, ConditionalConstant);
- DEBUG(dbgs() << "EarlyCSE CVP: Add conditional value for '"
- << CondInst->getName() << "' as " << *ConditionalConstant
- << " in " << BB->getName() << "\n");
- // Replace all dominated uses with the known value.
- if (unsigned Count =
- replaceDominatedUsesWith(CondInst, ConditionalConstant, DT,
- BasicBlockEdge(Pred, BB))) {
- Changed = true;
- NumCSECVP = NumCSECVP + Count;
- }
- }
+ if (BasicBlock *Pred = BB->getSinglePredecessor()) {
+ auto *BI = dyn_cast<BranchInst>(Pred->getTerminator());
+ if (BI && BI->isConditional()) {
+ auto *CondInst = dyn_cast<Instruction>(BI->getCondition());
+ if (CondInst && SimpleValue::canHandle(CondInst)) {
+ assert(BI->getSuccessor(0) == BB || BI->getSuccessor(1) == BB);
+ auto *TorF = (BI->getSuccessor(0) == BB)
+ ? ConstantInt::getTrue(BB->getContext())
+ : ConstantInt::getFalse(BB->getContext());
+ AvailableValues.insert(CondInst, TorF);
+ DEBUG(dbgs() << "EarlyCSE CVP: Add conditional value for '"
+ << CondInst->getName() << "' as " << *TorF << " in "
+ << BB->getName() << "\n");
+ // Replace all dominated uses with the known value.
+ if (unsigned Count = replaceDominatedUsesWith(
+ CondInst, TorF, DT, BasicBlockEdge(Pred, BB))) {
+ Changed = true;
+ NumCSECVP = NumCSECVP + Count;
+ }
+ }
+ }
+ }
/// LastStore - Keep track of the last non-volatile store that we saw... for
/// as long as there is no instruction that reads memory. If we see a store
@@ -761,12 +762,13 @@ bool EarlyCSE::processNode(DomTreeNode *Node) {
continue;
}
- // If this instruction may read from memory, forget LastStore.
- // Load/store intrinsics will indicate both a read and a write to
- // memory. The target may override this (e.g. so that a store intrinsic
- // does not read from memory, and thus will be treated the same as a
- // regular store for commoning purposes).
- if (Inst->mayReadFromMemory() &&
+ // If this instruction may read from memory or throw (and potentially read
+ // from memory in the exception handler), forget LastStore. Load/store
+ // intrinsics will indicate both a read and a write to memory. The target
+ // may override this (e.g. so that a store intrinsic does not read from
+ // memory, and thus will be treated the same as a regular store for
+ // commoning purposes).
+ if ((Inst->mayReadFromMemory() || Inst->mayThrow()) &&
!(MemInst.isValid() && !MemInst.mayReadFromMemory()))
LastStore = nullptr;
@@ -967,10 +969,8 @@ PreservedAnalyses EarlyCSEPass::run(Function &F,
if (!CSE.run())
return PreservedAnalyses::all();
- // CSE preserves the dominator tree because it doesn't mutate the CFG.
- // FIXME: Bundle this with other CFG-preservation.
PreservedAnalyses PA;
- PA.preserve<DominatorTreeAnalysis>();
+ PA.preserveSet<CFGAnalyses>();
PA.preserve<GlobalsAA>();
if (UseMemorySSA)
PA.preserve<MemorySSAAnalysis>();
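EarlyCSE now routes MemorySSA mutations through a MemorySSAUpdater rather than calling MemorySSA directly. A minimal sketch of the ownership pattern, using 2017-era llvm::make_unique; DeadInst is a hypothetical instruction being erased:

  auto MSSAUpdater = make_unique<MemorySSAUpdater>(MSSA); // MSSA owned by the pass
  if (MemoryAccess *MA = MSSA->getMemoryAccess(DeadInst))
    MSSAUpdater->removeMemoryAccess(MA); // keeps MemorySSA consistent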
diff --git a/lib/Transforms/Scalar/Float2Int.cpp b/lib/Transforms/Scalar/Float2Int.cpp
index 545036d724ef..8a5af6195f1b 100644
--- a/lib/Transforms/Scalar/Float2Int.cpp
+++ b/lib/Transforms/Scalar/Float2Int.cpp
@@ -516,11 +516,10 @@ FunctionPass *createFloat2IntPass() { return new Float2IntLegacyPass(); }
PreservedAnalyses Float2IntPass::run(Function &F, FunctionAnalysisManager &) {
if (!runImpl(F))
return PreservedAnalyses::all();
- else {
- // FIXME: This should also 'preserve the CFG'.
- PreservedAnalyses PA;
- PA.preserve<GlobalsAA>();
- return PA;
- }
+
+ PreservedAnalyses PA;
+ PA.preserveSet<CFGAnalyses>();
+ PA.preserve<GlobalsAA>();
+ return PA;
}
} // End namespace llvm
diff --git a/lib/Transforms/Scalar/GVN.cpp b/lib/Transforms/Scalar/GVN.cpp
index 0137378b828b..be696df548d5 100644
--- a/lib/Transforms/Scalar/GVN.cpp
+++ b/lib/Transforms/Scalar/GVN.cpp
@@ -36,7 +36,6 @@
#include "llvm/Analysis/OptimizationDiagnosticInfo.h"
#include "llvm/Analysis/PHITransAddr.h"
#include "llvm/Analysis/TargetLibraryInfo.h"
-#include "llvm/Analysis/ValueTracking.h"
#include "llvm/IR/DataLayout.h"
#include "llvm/IR/Dominators.h"
#include "llvm/IR/GlobalVariable.h"
@@ -51,9 +50,12 @@
#include "llvm/Transforms/Utils/BasicBlockUtils.h"
#include "llvm/Transforms/Utils/Local.h"
#include "llvm/Transforms/Utils/SSAUpdater.h"
+#include "llvm/Transforms/Utils/VNCoercion.h"
+
#include <vector>
using namespace llvm;
using namespace llvm::gvn;
+using namespace llvm::VNCoercion;
using namespace PatternMatch;
#define DEBUG_TYPE "gvn"
@@ -595,11 +597,12 @@ PreservedAnalyses GVN::run(Function &F, FunctionAnalysisManager &AM) {
PreservedAnalyses PA;
PA.preserve<DominatorTreeAnalysis>();
PA.preserve<GlobalsAA>();
+ PA.preserve<TargetLibraryAnalysis>();
return PA;
}
-LLVM_DUMP_METHOD
-void GVN::dump(DenseMap<uint32_t, Value*>& d) {
+#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
+LLVM_DUMP_METHOD void GVN::dump(DenseMap<uint32_t, Value*>& d) {
errs() << "{\n";
for (DenseMap<uint32_t, Value*>::iterator I = d.begin(),
E = d.end(); I != E; ++I) {
@@ -608,6 +611,7 @@ void GVN::dump(DenseMap<uint32_t, Value*>& d) {
}
errs() << "}\n";
}
+#endif
/// Return true if we can prove that the value
/// we're analyzing is fully available in the specified block. As we go, keep
@@ -690,442 +694,6 @@ SpeculationFailure:
}
-/// Return true if CoerceAvailableValueToLoadType will succeed.
-static bool CanCoerceMustAliasedValueToLoad(Value *StoredVal,
- Type *LoadTy,
- const DataLayout &DL) {
- // If the loaded or stored value is an first class array or struct, don't try
- // to transform them. We need to be able to bitcast to integer.
- if (LoadTy->isStructTy() || LoadTy->isArrayTy() ||
- StoredVal->getType()->isStructTy() ||
- StoredVal->getType()->isArrayTy())
- return false;
-
- // The store has to be at least as big as the load.
- if (DL.getTypeSizeInBits(StoredVal->getType()) <
- DL.getTypeSizeInBits(LoadTy))
- return false;
-
- return true;
-}
-
-/// If we saw a store of a value to memory, and
-/// then a load from a must-aliased pointer of a different type, try to coerce
-/// the stored value. LoadedTy is the type of the load we want to replace.
-/// IRB is IRBuilder used to insert new instructions.
-///
-/// If we can't do it, return null.
-static Value *CoerceAvailableValueToLoadType(Value *StoredVal, Type *LoadedTy,
- IRBuilder<> &IRB,
- const DataLayout &DL) {
- assert(CanCoerceMustAliasedValueToLoad(StoredVal, LoadedTy, DL) &&
- "precondition violation - materialization can't fail");
-
- if (auto *C = dyn_cast<Constant>(StoredVal))
- if (auto *FoldedStoredVal = ConstantFoldConstant(C, DL))
- StoredVal = FoldedStoredVal;
-
- // If this is already the right type, just return it.
- Type *StoredValTy = StoredVal->getType();
-
- uint64_t StoredValSize = DL.getTypeSizeInBits(StoredValTy);
- uint64_t LoadedValSize = DL.getTypeSizeInBits(LoadedTy);
-
- // If the store and reload are the same size, we can always reuse it.
- if (StoredValSize == LoadedValSize) {
- // Pointer to Pointer -> use bitcast.
- if (StoredValTy->getScalarType()->isPointerTy() &&
- LoadedTy->getScalarType()->isPointerTy()) {
- StoredVal = IRB.CreateBitCast(StoredVal, LoadedTy);
- } else {
- // Convert source pointers to integers, which can be bitcast.
- if (StoredValTy->getScalarType()->isPointerTy()) {
- StoredValTy = DL.getIntPtrType(StoredValTy);
- StoredVal = IRB.CreatePtrToInt(StoredVal, StoredValTy);
- }
-
- Type *TypeToCastTo = LoadedTy;
- if (TypeToCastTo->getScalarType()->isPointerTy())
- TypeToCastTo = DL.getIntPtrType(TypeToCastTo);
-
- if (StoredValTy != TypeToCastTo)
- StoredVal = IRB.CreateBitCast(StoredVal, TypeToCastTo);
-
- // Cast to pointer if the load needs a pointer type.
- if (LoadedTy->getScalarType()->isPointerTy())
- StoredVal = IRB.CreateIntToPtr(StoredVal, LoadedTy);
- }
-
- if (auto *C = dyn_cast<ConstantExpr>(StoredVal))
- if (auto *FoldedStoredVal = ConstantFoldConstant(C, DL))
- StoredVal = FoldedStoredVal;
-
- return StoredVal;
- }
-
- // If the loaded value is smaller than the available value, then we can
- // extract out a piece from it. If the available value is too small, then we
- // can't do anything.
- assert(StoredValSize >= LoadedValSize &&
- "CanCoerceMustAliasedValueToLoad fail");
-
- // Convert source pointers to integers, which can be manipulated.
- if (StoredValTy->getScalarType()->isPointerTy()) {
- StoredValTy = DL.getIntPtrType(StoredValTy);
- StoredVal = IRB.CreatePtrToInt(StoredVal, StoredValTy);
- }
-
- // Convert vectors and fp to integer, which can be manipulated.
- if (!StoredValTy->isIntegerTy()) {
- StoredValTy = IntegerType::get(StoredValTy->getContext(), StoredValSize);
- StoredVal = IRB.CreateBitCast(StoredVal, StoredValTy);
- }
-
- // If this is a big-endian system, we need to shift the value down to the low
- // bits so that a truncate will work.
- if (DL.isBigEndian()) {
- uint64_t ShiftAmt = DL.getTypeStoreSizeInBits(StoredValTy) -
- DL.getTypeStoreSizeInBits(LoadedTy);
- StoredVal = IRB.CreateLShr(StoredVal, ShiftAmt, "tmp");
- }
-
- // Truncate the integer to the right size now.
- Type *NewIntTy = IntegerType::get(StoredValTy->getContext(), LoadedValSize);
- StoredVal = IRB.CreateTrunc(StoredVal, NewIntTy, "trunc");
-
- if (LoadedTy != NewIntTy) {
- // If the result is a pointer, inttoptr.
- if (LoadedTy->getScalarType()->isPointerTy())
- StoredVal = IRB.CreateIntToPtr(StoredVal, LoadedTy, "inttoptr");
- else
- // Otherwise, bitcast.
- StoredVal = IRB.CreateBitCast(StoredVal, LoadedTy, "bitcast");
- }
-
- if (auto *C = dyn_cast<Constant>(StoredVal))
- if (auto *FoldedStoredVal = ConstantFoldConstant(C, DL))
- StoredVal = FoldedStoredVal;
-
- return StoredVal;
-}
-
-/// This function is called when we have a
-/// memdep query of a load that ends up being a clobbering memory write (store,
-/// memset, memcpy, memmove). This means that the write *may* provide bits used
-/// by the load but we can't be sure because the pointers don't mustalias.
-///
-/// Check this case to see if there is anything more we can do before we give
-/// up. This returns -1 if we have to give up, or a byte number in the stored
-/// value of the piece that feeds the load.
-static int AnalyzeLoadFromClobberingWrite(Type *LoadTy, Value *LoadPtr,
- Value *WritePtr,
- uint64_t WriteSizeInBits,
- const DataLayout &DL) {
- // If the loaded or stored value is a first class array or struct, don't try
- // to transform them. We need to be able to bitcast to integer.
- if (LoadTy->isStructTy() || LoadTy->isArrayTy())
- return -1;
-
- int64_t StoreOffset = 0, LoadOffset = 0;
- Value *StoreBase =
- GetPointerBaseWithConstantOffset(WritePtr, StoreOffset, DL);
- Value *LoadBase = GetPointerBaseWithConstantOffset(LoadPtr, LoadOffset, DL);
- if (StoreBase != LoadBase)
- return -1;
-
- // If the load and store are to the exact same address, they should have been
- // a must alias. AA must have gotten confused.
- // FIXME: Study to see if/when this happens. One case is forwarding a memset
- // to a load from the base of the memset.
-
- // If the load and store don't overlap at all, the store doesn't provide
- // anything to the load. In this case, they really don't alias at all, AA
- // must have gotten confused.
- uint64_t LoadSize = DL.getTypeSizeInBits(LoadTy);
-
- if ((WriteSizeInBits & 7) | (LoadSize & 7))
- return -1;
- uint64_t StoreSize = WriteSizeInBits / 8; // Convert to bytes.
- LoadSize /= 8;
-
-
- bool isAAFailure = false;
- if (StoreOffset < LoadOffset)
- isAAFailure = StoreOffset+int64_t(StoreSize) <= LoadOffset;
- else
- isAAFailure = LoadOffset+int64_t(LoadSize) <= StoreOffset;
-
- if (isAAFailure)
- return -1;
-
- // If the Load isn't completely contained within the stored bits, we don't
- // have all the bits to feed it. We could do something crazy in the future
- // (issue a smaller load then merge the bits in) but this seems unlikely to be
- // valuable.
- if (StoreOffset > LoadOffset ||
- StoreOffset+StoreSize < LoadOffset+LoadSize)
- return -1;
-
- // Okay, we can do this transformation. Return the number of bytes into the
- // store that the load is.
- return LoadOffset-StoreOffset;
-}
-
-/// This function is called when we have a
-/// memdep query of a load that ends up being a clobbering store.
-static int AnalyzeLoadFromClobberingStore(Type *LoadTy, Value *LoadPtr,
- StoreInst *DepSI) {
- // Cannot handle reading from store of first-class aggregate yet.
- if (DepSI->getValueOperand()->getType()->isStructTy() ||
- DepSI->getValueOperand()->getType()->isArrayTy())
- return -1;
-
- const DataLayout &DL = DepSI->getModule()->getDataLayout();
- Value *StorePtr = DepSI->getPointerOperand();
- uint64_t StoreSize =DL.getTypeSizeInBits(DepSI->getValueOperand()->getType());
- return AnalyzeLoadFromClobberingWrite(LoadTy, LoadPtr,
- StorePtr, StoreSize, DL);
-}
-
-/// This function is called when we have a
-/// memdep query of a load that ends up being clobbered by another load. See if
-/// the other load can feed into the second load.
-static int AnalyzeLoadFromClobberingLoad(Type *LoadTy, Value *LoadPtr,
- LoadInst *DepLI, const DataLayout &DL){
- // Cannot handle reading from store of first-class aggregate yet.
- if (DepLI->getType()->isStructTy() || DepLI->getType()->isArrayTy())
- return -1;
-
- Value *DepPtr = DepLI->getPointerOperand();
- uint64_t DepSize = DL.getTypeSizeInBits(DepLI->getType());
- int R = AnalyzeLoadFromClobberingWrite(LoadTy, LoadPtr, DepPtr, DepSize, DL);
- if (R != -1) return R;
-
- // If we have a load/load clobber an DepLI can be widened to cover this load,
- // then we should widen it!
- int64_t LoadOffs = 0;
- const Value *LoadBase =
- GetPointerBaseWithConstantOffset(LoadPtr, LoadOffs, DL);
- unsigned LoadSize = DL.getTypeStoreSize(LoadTy);
-
- unsigned Size = MemoryDependenceResults::getLoadLoadClobberFullWidthSize(
- LoadBase, LoadOffs, LoadSize, DepLI);
- if (Size == 0) return -1;
-
- // Check non-obvious conditions enforced by MDA which we rely on for being
- // able to materialize this potentially available value
- assert(DepLI->isSimple() && "Cannot widen volatile/atomic load!");
- assert(DepLI->getType()->isIntegerTy() && "Can't widen non-integer load");
-
- return AnalyzeLoadFromClobberingWrite(LoadTy, LoadPtr, DepPtr, Size*8, DL);
-}
-
-
-
-static int AnalyzeLoadFromClobberingMemInst(Type *LoadTy, Value *LoadPtr,
- MemIntrinsic *MI,
- const DataLayout &DL) {
- // If the mem operation is a non-constant size, we can't handle it.
- ConstantInt *SizeCst = dyn_cast<ConstantInt>(MI->getLength());
- if (!SizeCst) return -1;
- uint64_t MemSizeInBits = SizeCst->getZExtValue()*8;
-
- // If this is memset, we just need to see if the offset is valid in the size
- // of the memset..
- if (MI->getIntrinsicID() == Intrinsic::memset)
- return AnalyzeLoadFromClobberingWrite(LoadTy, LoadPtr, MI->getDest(),
- MemSizeInBits, DL);
-
- // If we have a memcpy/memmove, the only case we can handle is if this is a
- // copy from constant memory. In that case, we can read directly from the
- // constant memory.
- MemTransferInst *MTI = cast<MemTransferInst>(MI);
-
- Constant *Src = dyn_cast<Constant>(MTI->getSource());
- if (!Src) return -1;
-
- GlobalVariable *GV = dyn_cast<GlobalVariable>(GetUnderlyingObject(Src, DL));
- if (!GV || !GV->isConstant()) return -1;
-
- // See if the access is within the bounds of the transfer.
- int Offset = AnalyzeLoadFromClobberingWrite(LoadTy, LoadPtr,
- MI->getDest(), MemSizeInBits, DL);
- if (Offset == -1)
- return Offset;
-
- unsigned AS = Src->getType()->getPointerAddressSpace();
- // Otherwise, see if we can constant fold a load from the constant with the
- // offset applied as appropriate.
- Src = ConstantExpr::getBitCast(Src,
- Type::getInt8PtrTy(Src->getContext(), AS));
- Constant *OffsetCst =
- ConstantInt::get(Type::getInt64Ty(Src->getContext()), (unsigned)Offset);
- Src = ConstantExpr::getGetElementPtr(Type::getInt8Ty(Src->getContext()), Src,
- OffsetCst);
- Src = ConstantExpr::getBitCast(Src, PointerType::get(LoadTy, AS));
- if (ConstantFoldLoadFromConstPtr(Src, LoadTy, DL))
- return Offset;
- return -1;
-}
-
-
-/// This function is called when we have a
-/// memdep query of a load that ends up being a clobbering store. This means
-/// that the store provides bits used by the load but we the pointers don't
-/// mustalias. Check this case to see if there is anything more we can do
-/// before we give up.
-static Value *GetStoreValueForLoad(Value *SrcVal, unsigned Offset,
- Type *LoadTy,
- Instruction *InsertPt, const DataLayout &DL){
- LLVMContext &Ctx = SrcVal->getType()->getContext();
-
- uint64_t StoreSize = (DL.getTypeSizeInBits(SrcVal->getType()) + 7) / 8;
- uint64_t LoadSize = (DL.getTypeSizeInBits(LoadTy) + 7) / 8;
-
- IRBuilder<> Builder(InsertPt);
-
- // Compute which bits of the stored value are being used by the load. Convert
- // to an integer type to start with.
- if (SrcVal->getType()->getScalarType()->isPointerTy())
- SrcVal = Builder.CreatePtrToInt(SrcVal,
- DL.getIntPtrType(SrcVal->getType()));
- if (!SrcVal->getType()->isIntegerTy())
- SrcVal = Builder.CreateBitCast(SrcVal, IntegerType::get(Ctx, StoreSize*8));
-
- // Shift the bits to the least significant depending on endianness.
- unsigned ShiftAmt;
- if (DL.isLittleEndian())
- ShiftAmt = Offset*8;
- else
- ShiftAmt = (StoreSize-LoadSize-Offset)*8;
-
- if (ShiftAmt)
- SrcVal = Builder.CreateLShr(SrcVal, ShiftAmt);
-
- if (LoadSize != StoreSize)
- SrcVal = Builder.CreateTrunc(SrcVal, IntegerType::get(Ctx, LoadSize*8));
-
- return CoerceAvailableValueToLoadType(SrcVal, LoadTy, Builder, DL);
-}
-
-/// This function is called when we have a
-/// memdep query of a load that ends up being a clobbering load. This means
-/// that the load *may* provide bits used by the load but we can't be sure
-/// because the pointers don't mustalias. Check this case to see if there is
-/// anything more we can do before we give up.
-static Value *GetLoadValueForLoad(LoadInst *SrcVal, unsigned Offset,
- Type *LoadTy, Instruction *InsertPt,
- GVN &gvn) {
- const DataLayout &DL = SrcVal->getModule()->getDataLayout();
- // If Offset+LoadTy exceeds the size of SrcVal, then we must be wanting to
- // widen SrcVal out to a larger load.
- unsigned SrcValStoreSize = DL.getTypeStoreSize(SrcVal->getType());
- unsigned LoadSize = DL.getTypeStoreSize(LoadTy);
- if (Offset+LoadSize > SrcValStoreSize) {
- assert(SrcVal->isSimple() && "Cannot widen volatile/atomic load!");
- assert(SrcVal->getType()->isIntegerTy() && "Can't widen non-integer load");
- // If we have a load/load clobber an DepLI can be widened to cover this
- // load, then we should widen it to the next power of 2 size big enough!
- unsigned NewLoadSize = Offset+LoadSize;
- if (!isPowerOf2_32(NewLoadSize))
- NewLoadSize = NextPowerOf2(NewLoadSize);
-
- Value *PtrVal = SrcVal->getPointerOperand();
-
- // Insert the new load after the old load. This ensures that subsequent
- // memdep queries will find the new load. We can't easily remove the old
- // load completely because it is already in the value numbering table.
- IRBuilder<> Builder(SrcVal->getParent(), ++BasicBlock::iterator(SrcVal));
- Type *DestPTy =
- IntegerType::get(LoadTy->getContext(), NewLoadSize*8);
- DestPTy = PointerType::get(DestPTy,
- PtrVal->getType()->getPointerAddressSpace());
- Builder.SetCurrentDebugLocation(SrcVal->getDebugLoc());
- PtrVal = Builder.CreateBitCast(PtrVal, DestPTy);
- LoadInst *NewLoad = Builder.CreateLoad(PtrVal);
- NewLoad->takeName(SrcVal);
- NewLoad->setAlignment(SrcVal->getAlignment());
-
- DEBUG(dbgs() << "GVN WIDENED LOAD: " << *SrcVal << "\n");
- DEBUG(dbgs() << "TO: " << *NewLoad << "\n");
-
- // Replace uses of the original load with the wider load. On a big endian
- // system, we need to shift down to get the relevant bits.
- Value *RV = NewLoad;
- if (DL.isBigEndian())
- RV = Builder.CreateLShr(RV, (NewLoadSize - SrcValStoreSize) * 8);
- RV = Builder.CreateTrunc(RV, SrcVal->getType());
- SrcVal->replaceAllUsesWith(RV);
-
- // We would like to use gvn.markInstructionForDeletion here, but we can't
- // because the load is already memoized into the leader map table that GVN
- // tracks. It is potentially possible to remove the load from the table,
- // but then there all of the operations based on it would need to be
- // rehashed. Just leave the dead load around.
- gvn.getMemDep().removeInstruction(SrcVal);
- SrcVal = NewLoad;
- }
-
- return GetStoreValueForLoad(SrcVal, Offset, LoadTy, InsertPt, DL);
-}
-
-
-/// This function is called when we have a
-/// memdep query of a load that ends up being a clobbering mem intrinsic.
-static Value *GetMemInstValueForLoad(MemIntrinsic *SrcInst, unsigned Offset,
- Type *LoadTy, Instruction *InsertPt,
- const DataLayout &DL){
- LLVMContext &Ctx = LoadTy->getContext();
- uint64_t LoadSize = DL.getTypeSizeInBits(LoadTy)/8;
-
- IRBuilder<> Builder(InsertPt);
-
- // We know that this method is only called when the mem transfer fully
- // provides the bits for the load.
- if (MemSetInst *MSI = dyn_cast<MemSetInst>(SrcInst)) {
- // memset(P, 'x', 1234) -> splat('x'), even if x is a variable, and
- // independently of what the offset is.
- Value *Val = MSI->getValue();
- if (LoadSize != 1)
- Val = Builder.CreateZExt(Val, IntegerType::get(Ctx, LoadSize*8));
-
- Value *OneElt = Val;
-
- // Splat the value out to the right number of bits.
- for (unsigned NumBytesSet = 1; NumBytesSet != LoadSize; ) {
- // If we can double the number of bytes set, do it.
- if (NumBytesSet*2 <= LoadSize) {
- Value *ShVal = Builder.CreateShl(Val, NumBytesSet*8);
- Val = Builder.CreateOr(Val, ShVal);
- NumBytesSet <<= 1;
- continue;
- }
-
- // Otherwise insert one byte at a time.
- Value *ShVal = Builder.CreateShl(Val, 1*8);
- Val = Builder.CreateOr(OneElt, ShVal);
- ++NumBytesSet;
- }
-
- return CoerceAvailableValueToLoadType(Val, LoadTy, Builder, DL);
- }
-
- // Otherwise, this is a memcpy/memmove from a constant global.
- MemTransferInst *MTI = cast<MemTransferInst>(SrcInst);
- Constant *Src = cast<Constant>(MTI->getSource());
- unsigned AS = Src->getType()->getPointerAddressSpace();
-
- // Otherwise, see if we can constant fold a load from the constant with the
- // offset applied as appropriate.
- Src = ConstantExpr::getBitCast(Src,
- Type::getInt8PtrTy(Src->getContext(), AS));
- Constant *OffsetCst =
- ConstantInt::get(Type::getInt64Ty(Src->getContext()), (unsigned)Offset);
- Src = ConstantExpr::getGetElementPtr(Type::getInt8Ty(Src->getContext()), Src,
- OffsetCst);
- Src = ConstantExpr::getBitCast(Src, PointerType::get(LoadTy, AS));
- return ConstantFoldLoadFromConstPtr(Src, LoadTy, DL);
-}
/// Given a set of loads specified by ValuesPerBlock,
@@ -1171,7 +739,7 @@ Value *AvailableValue::MaterializeAdjustedValue(LoadInst *LI,
if (isSimpleValue()) {
Res = getSimpleValue();
if (Res->getType() != LoadTy) {
- Res = GetStoreValueForLoad(Res, Offset, LoadTy, InsertPt, DL);
+ Res = getStoreValueForLoad(Res, Offset, LoadTy, InsertPt, DL);
DEBUG(dbgs() << "GVN COERCED NONLOCAL VAL:\nOffset: " << Offset << " "
<< *getSimpleValue() << '\n'
@@ -1182,14 +750,20 @@ Value *AvailableValue::MaterializeAdjustedValue(LoadInst *LI,
if (Load->getType() == LoadTy && Offset == 0) {
Res = Load;
} else {
- Res = GetLoadValueForLoad(Load, Offset, LoadTy, InsertPt, gvn);
-
+ Res = getLoadValueForLoad(Load, Offset, LoadTy, InsertPt, DL);
+ // We would like to use gvn.markInstructionForDeletion here, but we can't
+ // because the load is already memoized into the leader map table that GVN
+ // tracks. It is potentially possible to remove the load from the table,
+ // but then all of the operations based on it would need to be
+ // rehashed. Just leave the dead load around.
+ gvn.getMemDep().removeInstruction(Load);
DEBUG(dbgs() << "GVN COERCED NONLOCAL LOAD:\nOffset: " << Offset << " "
<< *getCoercedLoadValue() << '\n'
- << *Res << '\n' << "\n\n\n");
+ << *Res << '\n'
+ << "\n\n\n");
}
} else if (isMemIntrinValue()) {
- Res = GetMemInstValueForLoad(getMemIntrinValue(), Offset, LoadTy,
+ Res = getMemInstValueForLoad(getMemIntrinValue(), Offset, LoadTy,
InsertPt, DL);
DEBUG(dbgs() << "GVN COERCED NONLOCAL MEM INTRIN:\nOffset: " << Offset
<< " " << *getMemIntrinValue() << '\n'
@@ -1258,7 +832,7 @@ bool GVN::AnalyzeLoadAvailability(LoadInst *LI, MemDepResult DepInfo,
// Can't forward from non-atomic to atomic without violating memory model.
if (Address && LI->isAtomic() <= DepSI->isAtomic()) {
int Offset =
- AnalyzeLoadFromClobberingStore(LI->getType(), Address, DepSI);
+ analyzeLoadFromClobberingStore(LI->getType(), Address, DepSI, DL);
if (Offset != -1) {
Res = AvailableValue::get(DepSI->getValueOperand(), Offset);
return true;
@@ -1276,7 +850,7 @@ bool GVN::AnalyzeLoadAvailability(LoadInst *LI, MemDepResult DepInfo,
// Can't forward from non-atomic to atomic without violating memory model.
if (DepLI != LI && Address && LI->isAtomic() <= DepLI->isAtomic()) {
int Offset =
- AnalyzeLoadFromClobberingLoad(LI->getType(), Address, DepLI, DL);
+ analyzeLoadFromClobberingLoad(LI->getType(), Address, DepLI, DL);
if (Offset != -1) {
Res = AvailableValue::getLoad(DepLI, Offset);
@@ -1289,7 +863,7 @@ bool GVN::AnalyzeLoadAvailability(LoadInst *LI, MemDepResult DepInfo,
// forward a value on from it.
if (MemIntrinsic *DepMI = dyn_cast<MemIntrinsic>(DepInfo.getInst())) {
if (Address && !LI->isAtomic()) {
- int Offset = AnalyzeLoadFromClobberingMemInst(LI->getType(), Address,
+ int Offset = analyzeLoadFromClobberingMemInst(LI->getType(), Address,
DepMI, DL);
if (Offset != -1) {
Res = AvailableValue::getMI(DepMI, Offset);
@@ -1334,7 +908,7 @@ bool GVN::AnalyzeLoadAvailability(LoadInst *LI, MemDepResult DepInfo,
// different types if we have to. If the stored value is larger or equal to
// the loaded value, we can reuse it.
if (S->getValueOperand()->getType() != LI->getType() &&
- !CanCoerceMustAliasedValueToLoad(S->getValueOperand(),
+ !canCoerceMustAliasedValueToLoad(S->getValueOperand(),
LI->getType(), DL))
return false;
@@ -1351,7 +925,7 @@ bool GVN::AnalyzeLoadAvailability(LoadInst *LI, MemDepResult DepInfo,
// If the stored value is larger or equal to the loaded value, we can reuse
// it.
if (LD->getType() != LI->getType() &&
- !CanCoerceMustAliasedValueToLoad(LD, LI->getType(), DL))
+ !canCoerceMustAliasedValueToLoad(LD, LI->getType(), DL))
return false;
// Can't forward from non-atomic to atomic without violating memory model.
@@ -1713,7 +1287,7 @@ bool GVN::processNonLocalLoad(LoadInst *LI) {
// If instruction I has debug info, then we should not update it.
// Also, if I has a null DebugLoc, then it is still potentially incorrect
// to propagate LI's DebugLoc because LI may not post-dominate I.
- if (LI->getDebugLoc() && ValuesPerBlock.size() != 1)
+ if (LI->getDebugLoc() && LI->getParent() == I->getParent())
I->setDebugLoc(LI->getDebugLoc());
if (V->getType()->getScalarType()->isPointerTy())
MD->invalidateCachedPointerInfo(V);
@@ -1795,7 +1369,7 @@ static void patchReplacementInstruction(Instruction *I, Value *Repl) {
// Patch the replacement so that it is not more restrictive than the value
// being replaced.
- // Note that if 'I' is a load being replaced by some operation,
+ // Note that if 'I' is a load being replaced by some operation,
// for example, by an arithmetic operation, then andIRFlags()
// would just erase all math flags from the original arithmetic
// operation, which is clearly not wanted and not needed.
@@ -2187,11 +1761,11 @@ bool GVN::processInstruction(Instruction *I) {
for (SwitchInst::CaseIt i = SI->case_begin(), e = SI->case_end();
i != e; ++i) {
- BasicBlock *Dst = i.getCaseSuccessor();
+ BasicBlock *Dst = i->getCaseSuccessor();
// If there is only a single edge, propagate the case value into it.
if (SwitchEdges.lookup(Dst) == 1) {
BasicBlockEdge E(Parent, Dst);
- Changed |= propagateEquality(SwitchCond, i.getCaseValue(), E, true);
+ Changed |= propagateEquality(SwitchCond, i->getCaseValue(), E, true);
}
}
return Changed;
@@ -2581,21 +2155,12 @@ bool GVN::iterateOnFunction(Function &F) {
// Top-down walk of the dominator tree
bool Changed = false;
- // Save the blocks this function have before transformation begins. GVN may
- // split critical edge, and hence may invalidate the RPO/DT iterator.
- //
- std::vector<BasicBlock *> BBVect;
- BBVect.reserve(256);
// Needed for value numbering with phi construction to work.
+ // RPOT walks the graph in its constructor and will not be invalidated during
+ // processBlock.
ReversePostOrderTraversal<Function *> RPOT(&F);
- for (ReversePostOrderTraversal<Function *>::rpo_iterator RI = RPOT.begin(),
- RE = RPOT.end();
- RI != RE; ++RI)
- BBVect.push_back(*RI);
-
- for (std::vector<BasicBlock *>::iterator I = BBVect.begin(), E = BBVect.end();
- I != E; I++)
- Changed |= processBlock(*I);
+ for (BasicBlock *BB : RPOT)
+ Changed |= processBlock(BB);
return Changed;
}
@@ -2783,6 +2348,7 @@ public:
AU.addPreserved<DominatorTreeWrapperPass>();
AU.addPreserved<GlobalsAAWrapperPass>();
+ AU.addPreserved<TargetLibraryInfoWrapperPass>();
AU.addRequired<OptimizationRemarkEmitterWrapperPass>();
}
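The coercion helpers deleted above now live behind Transforms/Utils/VNCoercion. For reference, a condensed sketch of the integer fast path they implement — extract LoadSize bytes at Offset from a wider stored value — assuming SrcVal has already been bitcast to an integer type (names here are illustrative, not the library's):

  static Value *extractBytes(Value *SrcVal, unsigned Offset, Type *LoadTy,
                             IRBuilder<> &B, const DataLayout &DL) {
    uint64_t StoreSize = DL.getTypeStoreSize(SrcVal->getType());
    uint64_t LoadSize = DL.getTypeStoreSize(LoadTy);
    // Shift the wanted bytes down to bit 0; big-endian keeps them high.
    unsigned ShiftAmt = DL.isLittleEndian()
                            ? Offset * 8
                            : (StoreSize - LoadSize - Offset) * 8;
    if (ShiftAmt)
      SrcVal = B.CreateLShr(SrcVal, ShiftAmt);
    if (LoadSize != StoreSize)
      SrcVal = B.CreateTrunc(
          SrcVal, IntegerType::get(LoadTy->getContext(), LoadSize * 8));
    return SrcVal;
  }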
diff --git a/lib/Transforms/Scalar/GVNHoist.cpp b/lib/Transforms/Scalar/GVNHoist.cpp
index f8e1d2e1a08a..6adfe130d148 100644
--- a/lib/Transforms/Scalar/GVNHoist.cpp
+++ b/lib/Transforms/Scalar/GVNHoist.cpp
@@ -17,16 +17,39 @@
// is disabled in the following cases.
// 1. Scalars across calls.
// 2. geps when corresponding load/store cannot be hoisted.
+//
+// TODO: Hoist from >2 successors. Currently GVNHoist will not hoist stores
+// in this case because it works on two instructions at a time.
+// entry:
+// switch i32 %c1, label %exit1 [
+// i32 0, label %sw0
+// i32 1, label %sw1
+// ]
+//
+// sw0:
+// store i32 1, i32* @G
+// br label %exit
+//
+// sw1:
+// store i32 1, i32* @G
+// br label %exit
+//
+// exit1:
+// store i32 1, i32* @G
+// ret void
+// exit:
+// ret void
//===----------------------------------------------------------------------===//
#include "llvm/Transforms/Scalar/GVN.h"
#include "llvm/ADT/DenseMap.h"
#include "llvm/ADT/SmallPtrSet.h"
#include "llvm/ADT/Statistic.h"
+#include "llvm/Analysis/MemorySSA.h"
+#include "llvm/Analysis/MemorySSAUpdater.h"
#include "llvm/Analysis/ValueTracking.h"
#include "llvm/Transforms/Scalar.h"
#include "llvm/Transforms/Utils/Local.h"
-#include "llvm/Transforms/Utils/MemorySSA.h"
using namespace llvm;
@@ -60,7 +83,7 @@ static cl::opt<int>
cl::desc("Maximum length of dependent chains to hoist "
"(default = 10, unlimited = -1)"));
-namespace {
+namespace llvm {
// Provides a sorting function based on the execution order of two instructions.
struct SortByDFSIn {
@@ -72,13 +95,6 @@ public:
// Returns true when A executes before B.
bool operator()(const Instruction *A, const Instruction *B) const {
- // FIXME: libc++ has a std::sort() algorithm that will call the compare
- // function on the same element. Once PR20837 is fixed and some more years
- // pass by and all the buildbots have moved to a corrected std::sort(),
- // enable the following assert:
- //
- // assert(A != B);
-
const BasicBlock *BA = A->getParent();
const BasicBlock *BB = B->getParent();
unsigned ADFS, BDFS;
@@ -202,6 +218,7 @@ public:
GVNHoist(DominatorTree *DT, AliasAnalysis *AA, MemoryDependenceResults *MD,
MemorySSA *MSSA)
: DT(DT), AA(AA), MD(MD), MSSA(MSSA),
+ MSSAUpdater(make_unique<MemorySSAUpdater>(MSSA)),
HoistingGeps(false),
HoistedCtr(0)
{ }
@@ -249,9 +266,11 @@ private:
AliasAnalysis *AA;
MemoryDependenceResults *MD;
MemorySSA *MSSA;
+ std::unique_ptr<MemorySSAUpdater> MSSAUpdater;
const bool HoistingGeps;
DenseMap<const Value *, unsigned> DFSNumber;
BBSideEffectsSet BBSideEffects;
+ DenseSet<const BasicBlock*> HoistBarrier;
int HoistedCtr;
enum InsKind { Unknown, Scalar, Load, Store };
@@ -307,8 +326,8 @@ private:
continue;
}
- // Check for end of function, calls that do not return, etc.
- if (!isGuaranteedToTransferExecutionToSuccessor(BB->getTerminator()))
+ // We reached a leaf basic block => not all paths have this instruction.
+ if (!BB->getTerminator()->getNumSuccessors())
return false;
// When reaching the back-edge of a loop, there may be a path through the
@@ -360,7 +379,7 @@ private:
ReachedNewPt = true;
}
}
- if (defClobbersUseOrDef(Def, MU, *AA))
+ if (MemorySSAUtil::defClobbersUseOrDef(Def, MU, *AA))
return true;
}
@@ -387,7 +406,8 @@ private:
// executed between the execution of NewBB and OldBB. Hoisting an expression
// from OldBB into NewBB has to be safe on all execution paths.
for (auto I = idf_begin(OldBB), E = idf_end(OldBB); I != E;) {
- if (*I == NewBB) {
+ const BasicBlock *BB = *I;
+ if (BB == NewBB) {
// Stop traversal when reaching HoistPt.
I.skipChildren();
continue;
@@ -398,11 +418,17 @@ private:
return true;
// Impossible to hoist with exceptions on the path.
- if (hasEH(*I))
+ if (hasEH(BB))
+ return true;
+
+ // No instruction after a hoist barrier in a basic block was selected
+ // for hoisting, so instructions selected within a basic block that
+ // contains a hoist barrier can still be hoisted.
+ if ((BB != OldBB) && HoistBarrier.count(BB))
return true;
// Check that we do not move a store past loads.
- if (hasMemoryUse(NewPt, Def, *I))
+ if (hasMemoryUse(NewPt, Def, BB))
return true;
// -1 is unlimited number of blocks on all paths.
@@ -419,17 +445,18 @@ private:
// Decrement by 1 NBBsOnAllPaths for each block between HoistPt and BB, and
// return true when the counter NBBsOnAllPaths reaches 0, except when it is
// initialized to -1 which is unlimited.
- bool hasEHOnPath(const BasicBlock *HoistPt, const BasicBlock *BB,
+ bool hasEHOnPath(const BasicBlock *HoistPt, const BasicBlock *SrcBB,
int &NBBsOnAllPaths) {
- assert(DT->dominates(HoistPt, BB) && "Invalid path");
+ assert(DT->dominates(HoistPt, SrcBB) && "Invalid path");
// Walk all basic blocks reachable in depth-first iteration on
// the inverse CFG from BBInsn to NewHoistPt. These blocks are all the
// blocks that may be executed between the execution of NewHoistPt and
// BBInsn. Hoisting an expression from BBInsn into NewHoistPt has to be safe
// on all execution paths.
- for (auto I = idf_begin(BB), E = idf_end(BB); I != E;) {
- if (*I == HoistPt) {
+ for (auto I = idf_begin(SrcBB), E = idf_end(SrcBB); I != E;) {
+ const BasicBlock *BB = *I;
+ if (BB == HoistPt) {
// Stop traversal when reaching NewHoistPt.
I.skipChildren();
continue;
@@ -440,7 +467,13 @@ private:
return true;
// Impossible to hoist with exceptions on the path.
- if (hasEH(*I))
+ if (hasEH(BB))
+ return true;
+
+ // No instruction after a hoist barrier in a basic block was selected
+ // for hoisting, so instructions selected within a basic block that
+ // contains a hoist barrier can still be hoisted.
+ if ((BB != SrcBB) && HoistBarrier.count(BB))
return true;
// -1 is unlimited number of blocks on all paths.
@@ -626,6 +659,8 @@ private:
// Compute the insertion point and the list of expressions to be hoisted.
SmallVecInsn InstructionsToHoist;
for (auto I : V)
+ // We don't need to check for hoist-barriers here because if
+ // I->getParent() is a barrier block, then I precedes the barrier.
if (!hasEH(I->getParent()))
InstructionsToHoist.push_back(I);
@@ -809,9 +844,9 @@ private:
// legal when the ld/st is not moved past its current definition.
MemoryAccess *Def = OldMemAcc->getDefiningAccess();
NewMemAcc =
- MSSA->createMemoryAccessInBB(Repl, Def, HoistPt, MemorySSA::End);
+ MSSAUpdater->createMemoryAccessInBB(Repl, Def, HoistPt, MemorySSA::End);
OldMemAcc->replaceAllUsesWith(NewMemAcc);
- MSSA->removeMemoryAccess(OldMemAcc);
+ MSSAUpdater->removeMemoryAccess(OldMemAcc);
}
}
@@ -850,7 +885,7 @@ private:
// Update the uses of the old MSSA access with NewMemAcc.
MemoryAccess *OldMA = MSSA->getMemoryAccess(I);
OldMA->replaceAllUsesWith(NewMemAcc);
- MSSA->removeMemoryAccess(OldMA);
+ MSSAUpdater->removeMemoryAccess(OldMA);
}
Repl->andIRFlags(I);
@@ -872,7 +907,7 @@ private:
auto In = Phi->incoming_values();
if (all_of(In, [&](Use &U) { return U == NewMemAcc; })) {
Phi->replaceAllUsesWith(NewMemAcc);
- MSSA->removeMemoryAccess(Phi);
+ MSSAUpdater->removeMemoryAccess(Phi);
}
}
}
@@ -896,6 +931,12 @@ private:
for (BasicBlock *BB : depth_first(&F.getEntryBlock())) {
int InstructionNb = 0;
for (Instruction &I1 : *BB) {
+ // If I1 cannot guarantee progress, subsequent instructions
+ // in BB cannot be hoisted anyway.
+ if (!isGuaranteedToTransferExecutionToSuccessor(&I1)) {
+ HoistBarrier.insert(BB);
+ break;
+ }
// Only hoist the first instructions in BB up to MaxDepthInBB. Hoisting
// deeper may increase the register pressure and compilation time.
if (MaxDepthInBB != -1 && InstructionNb++ >= MaxDepthInBB)
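The barrier logic added above can be summarized in isolation: the first instruction in a block that may not transfer execution to its successor (a throwing call, say) poisons the rest of that block for hoisting. A sketch, assuming HoistBarrier is the DenseSet declared earlier in this diff:

  for (BasicBlock *BB : depth_first(&F.getEntryBlock()))
    for (Instruction &I : *BB)
      if (!isGuaranteedToTransferExecutionToSuccessor(&I)) {
        HoistBarrier.insert(BB); // nothing after I in BB may be hoisted past it
        break;
      }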
diff --git a/lib/Transforms/Scalar/GuardWidening.cpp b/lib/Transforms/Scalar/GuardWidening.cpp
index b05ef002a456..7019287954a1 100644
--- a/lib/Transforms/Scalar/GuardWidening.cpp
+++ b/lib/Transforms/Scalar/GuardWidening.cpp
@@ -568,8 +568,7 @@ bool GuardWideningImpl::combineRangeChecks(
return RC.getBase() == CurrentBase && RC.getLength() == CurrentLength;
};
- std::copy_if(Checks.begin(), Checks.end(),
- std::back_inserter(CurrentChecks), IsCurrentCheck);
+ copy_if(Checks, std::back_inserter(CurrentChecks), IsCurrentCheck);
Checks.erase(remove_if(Checks, IsCurrentCheck), Checks.end());
assert(CurrentChecks.size() != 0 && "We know we have at least one!");
@@ -658,8 +657,12 @@ PreservedAnalyses GuardWideningPass::run(Function &F,
auto &DT = AM.getResult<DominatorTreeAnalysis>(F);
auto &LI = AM.getResult<LoopAnalysis>(F);
auto &PDT = AM.getResult<PostDominatorTreeAnalysis>(F);
- bool Changed = GuardWideningImpl(DT, PDT, LI).run();
- return Changed ? PreservedAnalyses::none() : PreservedAnalyses::all();
+ if (!GuardWideningImpl(DT, PDT, LI).run())
+ return PreservedAnalyses::all();
+
+ PreservedAnalyses PA;
+ PA.preserveSet<CFGAnalyses>();
+ return PA;
}
StringRef GuardWideningImpl::scoreTypeToString(WideningScore WS) {
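copy_if here is llvm's range wrapper from ADT/STLExtras.h, which forwards to std::copy_if with begin()/end() filled in. A trivial usage sketch:

  SmallVector<int, 8> Src = {1, 2, 3, 4};
  SmallVector<int, 8> Evens;
  copy_if(Src, std::back_inserter(Evens),
          [](int X) { return X % 2 == 0; }); // Evens == {2, 4}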
diff --git a/lib/Transforms/Scalar/IndVarSimplify.cpp b/lib/Transforms/Scalar/IndVarSimplify.cpp
index 1752fb75eb1b..dcb2a4a0c6e6 100644
--- a/lib/Transforms/Scalar/IndVarSimplify.cpp
+++ b/lib/Transforms/Scalar/IndVarSimplify.cpp
@@ -231,8 +231,9 @@ static bool ConvertToSInt(const APFloat &APF, int64_t &IntVal) {
bool isExact = false;
// See if we can convert this to an int64_t
uint64_t UIntVal;
- if (APF.convertToInteger(&UIntVal, 64, true, APFloat::rmTowardZero,
- &isExact) != APFloat::opOK || !isExact)
+ if (APF.convertToInteger(makeMutableArrayRef(UIntVal), 64, true,
+ APFloat::rmTowardZero, &isExact) != APFloat::opOK ||
+ !isExact)
return false;
IntVal = UIntVal;
return true;
@@ -906,7 +907,7 @@ class WidenIV {
SmallVector<NarrowIVDefUse, 8> NarrowIVUsers;
enum ExtendKind { ZeroExtended, SignExtended, Unknown };
- // A map tracking the kind of extension used to widen each narrow IV
+ // A map tracking the kind of extension used to widen each narrow IV
// and narrow IV user.
// Key: pointer to a narrow IV or IV user.
// Value: the kind of extension used to widen this Instruction.
@@ -1608,7 +1609,7 @@ void WidenIV::calculatePostIncRange(Instruction *NarrowDef,
return;
CmpInst::Predicate P =
- TrueDest ? Pred : CmpInst::getInversePredicate(Pred);
+ TrueDest ? Pred : CmpInst::getInversePredicate(Pred);
auto CmpRHSRange = SE->getSignedRange(SE->getSCEV(CmpRHS));
auto CmpConstrainedLHSRange =
@@ -1634,7 +1635,7 @@ void WidenIV::calculatePostIncRange(Instruction *NarrowDef,
UpdateRangeFromGuards(NarrowUser);
BasicBlock *NarrowUserBB = NarrowUser->getParent();
- // If NarrowUserBB is statically unreachable asking dominator queries may
+ // If NarrowUserBB is statically unreachable asking dominator queries may
// yield surprising results. (e.g. the block may not have a dom tree node)
if (!DT->isReachableFromEntry(NarrowUserBB))
return;
@@ -2152,6 +2153,8 @@ linearFunctionTestReplace(Loop *L,
Value *CmpIndVar = IndVar;
const SCEV *IVCount = BackedgeTakenCount;
+ assert(L->getLoopLatch() && "Loop no longer in simplified form?");
+
// If the exiting block is the same as the backedge block, we prefer to
// compare against the post-incremented value, otherwise we must compare
// against the preincremented value.
@@ -2376,6 +2379,7 @@ bool IndVarSimplify::run(Loop *L) {
// Loop::getCanonicalInductionVariable only supports loops with preheaders,
// and we're in trouble if we can't find the induction variable even when
// we've manually inserted one.
+ // - LFTR relies on having a single backedge.
if (!L->isLoopSimplifyForm())
return false;
@@ -2492,8 +2496,9 @@ PreservedAnalyses IndVarSimplifyPass::run(Loop &L, LoopAnalysisManager &AM,
if (!IVS.run(&L))
return PreservedAnalyses::all();
- // FIXME: This should also 'preserve the CFG'.
- return getLoopPassPreservedAnalyses();
+ auto PA = getLoopPassPreservedAnalyses();
+ PA.preserveSet<CFGAnalyses>();
+ return PA;
}
namespace {
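The ConvertToSInt change above reflects an APFloat API update: convertToInteger now takes a MutableArrayRef<integerPart> rather than a raw pointer. The same call in isolation:

  uint64_t UIntVal; // integerPart is uint64_t
  bool isExact = false;
  APFloat::opStatus S = APF.convertToInteger(
      makeMutableArrayRef(UIntVal), 64, /*IsSigned=*/true,
      APFloat::rmTowardZero, &isExact);
  bool ConvertedOK = (S == APFloat::opOK) && isExact;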
diff --git a/lib/Transforms/Scalar/InductiveRangeCheckElimination.cpp b/lib/Transforms/Scalar/InductiveRangeCheckElimination.cpp
index 8e81541c2337..85db6e5e1105 100644
--- a/lib/Transforms/Scalar/InductiveRangeCheckElimination.cpp
+++ b/lib/Transforms/Scalar/InductiveRangeCheckElimination.cpp
@@ -446,6 +446,15 @@ struct LoopStructure {
BasicBlock *LatchExit;
unsigned LatchBrExitIdx;
+ // The loop represented by this instance of LoopStructure is semantically
+ // equivalent to:
+ //
+ // intN_ty inc = IndVarIncreasing ? 1 : -1;
+ // pred_ty predicate = IndVarIncreasing ? ICMP_SLT : ICMP_SGT;
+ //
+ // for (intN_ty iv = IndVarStart; predicate(iv, LoopExitAt); iv = IndVarNext)
+ // ... body ...
+
Value *IndVarNext;
Value *IndVarStart;
Value *LoopExitAt;
@@ -789,6 +798,10 @@ LoopStructure::parseLoopStructure(ScalarEvolution &SE, BranchProbabilityInfo &BP
return None;
}
+ const SCEV *StartNext = IndVarNext->getStart();
+ const SCEV *Addend = SE.getNegativeSCEV(IndVarNext->getStepRecurrence(SE));
+ const SCEV *IndVarStart = SE.getAddExpr(StartNext, Addend);
+
ConstantInt *One = ConstantInt::get(IndVarTy, 1);
// TODO: generalize the predicates here to also match their unsigned variants.
if (IsIncreasing) {
@@ -809,10 +822,22 @@ LoopStructure::parseLoopStructure(ScalarEvolution &SE, BranchProbabilityInfo &BP
return None;
}
+ if (!SE.isLoopEntryGuardedByCond(
+ &L, CmpInst::ICMP_SLT, IndVarStart,
+ SE.getAddExpr(RightSCEV, SE.getOne(RightSCEV->getType())))) {
+ FailureReason = "Induction variable start not bounded by upper limit";
+ return None;
+ }
+
IRBuilder<> B(Preheader->getTerminator());
RightValue = B.CreateAdd(RightValue, One);
+ } else {
+ if (!SE.isLoopEntryGuardedByCond(&L, CmpInst::ICMP_SLT, IndVarStart,
+ RightSCEV)) {
+ FailureReason = "Induction variable start not bounded by upper limit";
+ return None;
+ }
}
-
} else {
bool FoundExpectedPred =
(Pred == ICmpInst::ICMP_SGT && LatchBrExitIdx == 1) ||
@@ -831,15 +856,24 @@ LoopStructure::parseLoopStructure(ScalarEvolution &SE, BranchProbabilityInfo &BP
return None;
}
+ if (!SE.isLoopEntryGuardedByCond(
+ &L, CmpInst::ICMP_SGT, IndVarStart,
+ SE.getMinusSCEV(RightSCEV, SE.getOne(RightSCEV->getType())))) {
+ FailureReason = "Induction variable start not bounded by lower limit";
+ return None;
+ }
+
IRBuilder<> B(Preheader->getTerminator());
RightValue = B.CreateSub(RightValue, One);
+ } else {
+ if (!SE.isLoopEntryGuardedByCond(&L, CmpInst::ICMP_SGT, IndVarStart,
+ RightSCEV)) {
+ FailureReason = "Induction variable start not bounded by lower limit";
+ return None;
+ }
}
}
- const SCEV *StartNext = IndVarNext->getStart();
- const SCEV *Addend = SE.getNegativeSCEV(IndVarNext->getStepRecurrence(SE));
- const SCEV *IndVarStart = SE.getAddExpr(StartNext, Addend);
-
BasicBlock *LatchExit = LatchBr->getSuccessor(LatchBrExitIdx);
assert(SE.getLoopDisposition(LatchCount, &L) ==
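For reference, the start-value computation hoisted earlier in this hunk relies on a simple SCEV identity: if IndVarNext is the post-increment recurrence {S,+,Step}, the value of the induction variable before the first increment is S - Step. A sketch of that arithmetic as the patch performs it (names taken from the diff; illustrative only):

  // IndVarNext = {S,+,Step}; the IV's start value is S - Step.
  const SCEV *StartNext = IndVarNext->getStart();              // S
  const SCEV *Addend =
      SE.getNegativeSCEV(IndVarNext->getStepRecurrence(SE));   // -Step
  const SCEV *IndVarStart = SE.getAddExpr(StartNext, Addend);  // S - Step

Hoisting this above the predicate checks lets the new isLoopEntryGuardedByCond queries validate that IndVarStart is bounded by the loop's limit before any IR is rewritten.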
diff --git a/lib/Target/NVPTX/NVPTXInferAddressSpaces.cpp b/lib/Transforms/Scalar/InferAddressSpaces.cpp
index f4940c937a2d..5d8701431a2c 100644
--- a/lib/Target/NVPTX/NVPTXInferAddressSpaces.cpp
+++ b/lib/Transforms/Scalar/InferAddressSpaces.cpp
@@ -89,13 +89,11 @@
//
//===----------------------------------------------------------------------===//
-#define DEBUG_TYPE "nvptx-infer-addrspace"
-
-#include "NVPTX.h"
-#include "MCTargetDesc/NVPTXBaseInfo.h"
+#include "llvm/Transforms/Scalar.h"
#include "llvm/ADT/DenseSet.h"
#include "llvm/ADT/Optional.h"
#include "llvm/ADT/SetVector.h"
+#include "llvm/Analysis/TargetTransformInfo.h"
#include "llvm/IR/Function.h"
#include "llvm/IR/InstIterator.h"
#include "llvm/IR/Instructions.h"
@@ -105,19 +103,30 @@
#include "llvm/Transforms/Utils/Local.h"
#include "llvm/Transforms/Utils/ValueMapper.h"
+#define DEBUG_TYPE "infer-address-spaces"
+
using namespace llvm;
namespace {
-const unsigned ADDRESS_SPACE_UNINITIALIZED = (unsigned)-1;
+static const unsigned UninitializedAddressSpace = ~0u;
using ValueToAddrSpaceMapTy = DenseMap<const Value *, unsigned>;
-/// \brief NVPTXInferAddressSpaces
-class NVPTXInferAddressSpaces: public FunctionPass {
+/// \brief InferAddressSpaces
+class InferAddressSpaces : public FunctionPass {
+ /// Target-specific address space whose uses should be replaced if
+ /// possible.
+ unsigned FlatAddrSpace;
+
public:
static char ID;
- NVPTXInferAddressSpaces() : FunctionPass(ID) {}
+ InferAddressSpaces() : FunctionPass(ID) {}
+
+ void getAnalysisUsage(AnalysisUsage &AU) const override {
+ AU.setPreservesCFG();
+ AU.addRequired<TargetTransformInfoWrapperPass>();
+ }
bool runOnFunction(Function &F) override;
@@ -125,30 +134,51 @@ private:
// Returns the new address space of V if updated; otherwise, returns None.
Optional<unsigned>
updateAddressSpace(const Value &V,
- const ValueToAddrSpaceMapTy &InferredAddrSpace);
+ const ValueToAddrSpaceMapTy &InferredAddrSpace) const;
// Tries to infer the specific address space of each address expression in
// Postorder.
void inferAddressSpaces(const std::vector<Value *> &Postorder,
- ValueToAddrSpaceMapTy *InferredAddrSpace);
+ ValueToAddrSpaceMapTy *InferredAddrSpace) const;
+
+ bool isSafeToCastConstAddrSpace(Constant *C, unsigned NewAS) const;
- // Changes the generic address expressions in function F to point to specific
+ // Changes the flat address expressions in function F to point to specific
// address spaces if InferredAddrSpace says so. Postorder is the postorder of
- // all generic address expressions in the use-def graph of function F.
+ // all flat expressions in the use-def graph of function F.
bool
rewriteWithNewAddressSpaces(const std::vector<Value *> &Postorder,
const ValueToAddrSpaceMapTy &InferredAddrSpace,
- Function *F);
+ Function *F) const;
+
+ void appendsFlatAddressExpressionToPostorderStack(
+ Value *V, std::vector<std::pair<Value *, bool>> *PostorderStack,
+ DenseSet<Value *> *Visited) const;
+
+ bool rewriteIntrinsicOperands(IntrinsicInst *II,
+ Value *OldV, Value *NewV) const;
+ void collectRewritableIntrinsicOperands(
+ IntrinsicInst *II,
+ std::vector<std::pair<Value *, bool>> *PostorderStack,
+ DenseSet<Value *> *Visited) const;
+
+ std::vector<Value *> collectFlatAddressExpressions(Function &F) const;
+
+ Value *cloneValueWithNewAddressSpace(
+ Value *V, unsigned NewAddrSpace,
+ const ValueToValueMapTy &ValueWithNewAddrSpace,
+ SmallVectorImpl<const Use *> *UndefUsesToFix) const;
+ unsigned joinAddressSpaces(unsigned AS1, unsigned AS2) const;
};
} // end anonymous namespace
-char NVPTXInferAddressSpaces::ID = 0;
+char InferAddressSpaces::ID = 0;
namespace llvm {
-void initializeNVPTXInferAddressSpacesPass(PassRegistry &);
+void initializeInferAddressSpacesPass(PassRegistry &);
}
-INITIALIZE_PASS(NVPTXInferAddressSpaces, "nvptx-infer-addrspace",
- "Infer address spaces",
+
+INITIALIZE_PASS(InferAddressSpaces, DEBUG_TYPE, "Infer address spaces",
false, false)
// Returns true if V is an address expression.
@@ -163,6 +193,7 @@ static bool isAddressExpression(const Value &V) {
case Instruction::BitCast:
case Instruction::AddrSpaceCast:
case Instruction::GetElementPtr:
+ case Instruction::Select:
return true;
default:
return false;
@@ -174,7 +205,7 @@ static bool isAddressExpression(const Value &V) {
// Precondition: V is an address expression.
static SmallVector<Value *, 2> getPointerOperands(const Value &V) {
assert(isAddressExpression(V));
- const Operator& Op = cast<Operator>(V);
+ const Operator &Op = cast<Operator>(V);
switch (Op.getOpcode()) {
case Instruction::PHI: {
auto IncomingValues = cast<PHINode>(Op).incoming_values();
@@ -185,42 +216,113 @@ static SmallVector<Value *, 2> getPointerOperands(const Value &V) {
case Instruction::AddrSpaceCast:
case Instruction::GetElementPtr:
return {Op.getOperand(0)};
+ case Instruction::Select:
+ return {Op.getOperand(1), Op.getOperand(2)};
default:
llvm_unreachable("Unexpected instruction type.");
}
}
-// If V is an unvisited generic address expression, appends V to PostorderStack
+// TODO: Move logic to TTI?
+bool InferAddressSpaces::rewriteIntrinsicOperands(IntrinsicInst *II,
+ Value *OldV,
+ Value *NewV) const {
+ Module *M = II->getParent()->getParent()->getParent();
+
+ switch (II->getIntrinsicID()) {
+ case Intrinsic::amdgcn_atomic_inc:
+ case Intrinsic::amdgcn_atomic_dec: {
+ const ConstantInt *IsVolatile = dyn_cast<ConstantInt>(II->getArgOperand(4));
+ if (!IsVolatile || !IsVolatile->isNullValue())
+ return false;
+
+ LLVM_FALLTHROUGH;
+ }
+ case Intrinsic::objectsize: {
+ Type *DestTy = II->getType();
+ Type *SrcTy = NewV->getType();
+ Function *NewDecl =
+ Intrinsic::getDeclaration(M, II->getIntrinsicID(), {DestTy, SrcTy});
+ II->setArgOperand(0, NewV);
+ II->setCalledFunction(NewDecl);
+ return true;
+ }
+ default:
+ return false;
+ }
+}
+
+// TODO: Move logic to TTI?
+void InferAddressSpaces::collectRewritableIntrinsicOperands(
+ IntrinsicInst *II, std::vector<std::pair<Value *, bool>> *PostorderStack,
+ DenseSet<Value *> *Visited) const {
+ switch (II->getIntrinsicID()) {
+ case Intrinsic::objectsize:
+ case Intrinsic::amdgcn_atomic_inc:
+ case Intrinsic::amdgcn_atomic_dec:
+ appendsFlatAddressExpressionToPostorderStack(II->getArgOperand(0),
+ PostorderStack, Visited);
+ break;
+ default:
+ break;
+ }
+}
+
+// If V is an unvisited flat address expression, appends V to PostorderStack
// and marks it as visited.
-static void appendsGenericAddressExpressionToPostorderStack(
+void InferAddressSpaces::appendsFlatAddressExpressionToPostorderStack(
Value *V, std::vector<std::pair<Value *, bool>> *PostorderStack,
- DenseSet<Value *> *Visited) {
+ DenseSet<Value *> *Visited) const {
assert(V->getType()->isPointerTy());
if (isAddressExpression(*V) &&
- V->getType()->getPointerAddressSpace() ==
- AddressSpace::ADDRESS_SPACE_GENERIC) {
+ V->getType()->getPointerAddressSpace() == FlatAddrSpace) {
if (Visited->insert(V).second)
PostorderStack->push_back(std::make_pair(V, false));
}
}
-// Returns all generic address expressions in function F. The elements are
+// Returns all flat address expressions in function F. The elements are
// ordered in postorder.
-static std::vector<Value *> collectGenericAddressExpressions(Function &F) {
+std::vector<Value *>
+InferAddressSpaces::collectFlatAddressExpressions(Function &F) const {
// This function implements a non-recursive postorder traversal of a partial
// use-def graph of function F.
- std::vector<std::pair<Value*, bool>> PostorderStack;
+ std::vector<std::pair<Value *, bool>> PostorderStack;
// The set of visited expressions.
- DenseSet<Value*> Visited;
+ DenseSet<Value *> Visited;
+
+ auto PushPtrOperand = [&](Value *Ptr) {
+ appendsFlatAddressExpressionToPostorderStack(Ptr, &PostorderStack,
+ &Visited);
+ };
+
// We only explore address expressions that are reachable from loads and
// stores for now because we aim at generating faster loads and stores.
for (Instruction &I : instructions(F)) {
- if (isa<LoadInst>(I)) {
- appendsGenericAddressExpressionToPostorderStack(
- I.getOperand(0), &PostorderStack, &Visited);
- } else if (isa<StoreInst>(I)) {
- appendsGenericAddressExpressionToPostorderStack(
- I.getOperand(1), &PostorderStack, &Visited);
+ if (auto *LI = dyn_cast<LoadInst>(&I))
+ PushPtrOperand(LI->getPointerOperand());
+ else if (auto *SI = dyn_cast<StoreInst>(&I))
+ PushPtrOperand(SI->getPointerOperand());
+ else if (auto *RMW = dyn_cast<AtomicRMWInst>(&I))
+ PushPtrOperand(RMW->getPointerOperand());
+ else if (auto *CmpX = dyn_cast<AtomicCmpXchgInst>(&I))
+ PushPtrOperand(CmpX->getPointerOperand());
+ else if (auto *MI = dyn_cast<MemIntrinsic>(&I)) {
+ // For memset/memcpy/memmove, any pointer operand can be replaced.
+ PushPtrOperand(MI->getRawDest());
+
+ // Handle 2nd operand for memcpy/memmove.
+ if (auto *MTI = dyn_cast<MemTransferInst>(MI))
+ PushPtrOperand(MTI->getRawSource());
+ } else if (auto *II = dyn_cast<IntrinsicInst>(&I))
+ collectRewritableIntrinsicOperands(II, &PostorderStack, &Visited);
+ else if (ICmpInst *Cmp = dyn_cast<ICmpInst>(&I)) {
+ // FIXME: Handle vectors of pointers
+ if (Cmp->getOperand(0)->getType()->isPointerTy()) {
+ PushPtrOperand(Cmp->getOperand(0));
+ PushPtrOperand(Cmp->getOperand(1));
+ }
}
}
@@ -236,8 +338,8 @@ static std::vector<Value *> collectGenericAddressExpressions(Function &F) {
// Otherwise, adds its operands to the stack and explores them.
PostorderStack.back().second = true;
for (Value *PtrOperand : getPointerOperands(*PostorderStack.back().first)) {
- appendsGenericAddressExpressionToPostorderStack(
- PtrOperand, &PostorderStack, &Visited);
+ appendsFlatAddressExpressionToPostorderStack(PtrOperand, &PostorderStack,
+ &Visited);
}
}
return Postorder;
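The traversal above is the usual two-phase-stack formulation of a non-recursive postorder: each stack entry carries a bool that flips from "just discovered" to "operands expanded". A condensed sketch of the loop that consumes PostorderStack, reconstructed from the surrounding hunks rather than quoted verbatim:

  std::vector<Value *> Postorder;
  while (!PostorderStack.empty()) {
    // Second visit: all pointer operands have been explored; emit the node.
    if (PostorderStack.back().second) {
      Postorder.push_back(PostorderStack.back().first);
      PostorderStack.pop_back();
      continue;
    }
    // First visit: mark as expanded, then push unvisited pointer operands.
    PostorderStack.back().second = true;
    for (Value *PtrOperand : getPointerOperands(*PostorderStack.back().first))
      appendsFlatAddressExpressionToPostorderStack(PtrOperand, &PostorderStack,
                                                   &Visited);
  }
  return Postorder;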
@@ -251,12 +353,18 @@ static Value *operandWithNewAddressSpaceOrCreateUndef(
const ValueToValueMapTy &ValueWithNewAddrSpace,
SmallVectorImpl<const Use *> *UndefUsesToFix) {
Value *Operand = OperandUse.get();
+
+ Type *NewPtrTy =
+ Operand->getType()->getPointerElementType()->getPointerTo(NewAddrSpace);
+
+ if (Constant *C = dyn_cast<Constant>(Operand))
+ return ConstantExpr::getAddrSpaceCast(C, NewPtrTy);
+
if (Value *NewOperand = ValueWithNewAddrSpace.lookup(Operand))
return NewOperand;
UndefUsesToFix->push_back(&OperandUse);
- return UndefValue::get(
- Operand->getType()->getPointerElementType()->getPointerTo(NewAddrSpace));
+ return UndefValue::get(NewPtrTy);
}
// Returns a clone of `I` with its operands converted to those specified in
@@ -277,7 +385,7 @@ static Value *cloneInstructionWithNewAddressSpace(
if (I->getOpcode() == Instruction::AddrSpaceCast) {
Value *Src = I->getOperand(0);
- // Because `I` is generic, the source address space must be specific.
+ // Because `I` is flat, the source address space must be specific.
// Therefore, the inferred address space must be the source space, according
// to our algorithm.
assert(Src->getType()->getPointerAddressSpace() == NewAddrSpace);
@@ -293,7 +401,7 @@ static Value *cloneInstructionWithNewAddressSpace(
NewPointerOperands.push_back(nullptr);
else
NewPointerOperands.push_back(operandWithNewAddressSpaceOrCreateUndef(
- OperandUse, NewAddrSpace, ValueWithNewAddrSpace, UndefUsesToFix));
+ OperandUse, NewAddrSpace, ValueWithNewAddrSpace, UndefUsesToFix));
}
switch (I->getOpcode()) {
@@ -318,6 +426,11 @@ static Value *cloneInstructionWithNewAddressSpace(
NewGEP->setIsInBounds(GEP->isInBounds());
return NewGEP;
}
+ case Instruction::Select: {
+ assert(I->getType()->isPointerTy());
+ return SelectInst::Create(I->getOperand(0), NewPointerOperands[1],
+ NewPointerOperands[2], "", nullptr, I);
+ }
default:
llvm_unreachable("Unexpected opcode");
}
@@ -327,13 +440,13 @@ static Value *cloneInstructionWithNewAddressSpace(
// constant expression `CE` with its operands replaced as specified in
// ValueWithNewAddrSpace.
static Value *cloneConstantExprWithNewAddressSpace(
- ConstantExpr *CE, unsigned NewAddrSpace,
- const ValueToValueMapTy &ValueWithNewAddrSpace) {
+ ConstantExpr *CE, unsigned NewAddrSpace,
+ const ValueToValueMapTy &ValueWithNewAddrSpace) {
Type *TargetType =
- CE->getType()->getPointerElementType()->getPointerTo(NewAddrSpace);
+ CE->getType()->getPointerElementType()->getPointerTo(NewAddrSpace);
if (CE->getOpcode() == Instruction::AddrSpaceCast) {
- // Because CE is generic, the source address space must be specific.
+ // Because CE is flat, the source address space must be specific.
// Therefore, the inferred address space must be the source space according
// to our algorithm.
assert(CE->getOperand(0)->getType()->getPointerAddressSpace() ==
@@ -341,6 +454,24 @@ static Value *cloneConstantExprWithNewAddressSpace(
return ConstantExpr::getBitCast(CE->getOperand(0), TargetType);
}
+ if (CE->getOpcode() == Instruction::BitCast) {
+ if (Value *NewOperand = ValueWithNewAddrSpace.lookup(CE->getOperand(0)))
+ return ConstantExpr::getBitCast(cast<Constant>(NewOperand), TargetType);
+ return ConstantExpr::getAddrSpaceCast(CE, TargetType);
+ }
+
+ if (CE->getOpcode() == Instruction::Select) {
+ Constant *Src0 = CE->getOperand(1);
+ Constant *Src1 = CE->getOperand(2);
+ if (Src0->getType()->getPointerAddressSpace() ==
+ Src1->getType()->getPointerAddressSpace()) {
+
+ return ConstantExpr::getSelect(
+ CE->getOperand(0), ConstantExpr::getAddrSpaceCast(Src0, TargetType),
+ ConstantExpr::getAddrSpaceCast(Src1, TargetType));
+ }
+ }
+
// Computes the operands of the new constant expression.
SmallVector<Constant *, 4> NewOperands;
for (unsigned Index = 0; Index < CE->getNumOperands(); ++Index) {
@@ -362,30 +493,29 @@ static Value *cloneConstantExprWithNewAddressSpace(
// Needs to specify the source type while constructing a getelementptr
// constant expression.
return CE->getWithOperands(
- NewOperands, TargetType, /*OnlyIfReduced=*/false,
- NewOperands[0]->getType()->getPointerElementType());
+ NewOperands, TargetType, /*OnlyIfReduced=*/false,
+ NewOperands[0]->getType()->getPointerElementType());
}
return CE->getWithOperands(NewOperands, TargetType);
}
// Returns a clone of the value `V`, with its operands replaced as specified in
-// ValueWithNewAddrSpace. This function is called on every generic address
+// ValueWithNewAddrSpace. This function is called on every flat address
// expression whose address space needs to be modified, in postorder.
//
// See cloneInstructionWithNewAddressSpace for the meaning of UndefUsesToFix.
-static Value *
-cloneValueWithNewAddressSpace(Value *V, unsigned NewAddrSpace,
- const ValueToValueMapTy &ValueWithNewAddrSpace,
- SmallVectorImpl<const Use *> *UndefUsesToFix) {
- // All values in Postorder are generic address expressions.
+Value *InferAddressSpaces::cloneValueWithNewAddressSpace(
+ Value *V, unsigned NewAddrSpace,
+ const ValueToValueMapTy &ValueWithNewAddrSpace,
+ SmallVectorImpl<const Use *> *UndefUsesToFix) const {
+ // All values in Postorder are flat address expressions.
assert(isAddressExpression(*V) &&
- V->getType()->getPointerAddressSpace() ==
- AddressSpace::ADDRESS_SPACE_GENERIC);
+ V->getType()->getPointerAddressSpace() == FlatAddrSpace);
if (Instruction *I = dyn_cast<Instruction>(V)) {
Value *NewV = cloneInstructionWithNewAddressSpace(
- I, NewAddrSpace, ValueWithNewAddrSpace, UndefUsesToFix);
+ I, NewAddrSpace, ValueWithNewAddrSpace, UndefUsesToFix);
if (Instruction *NewI = dyn_cast<Instruction>(NewV)) {
if (NewI->getParent() == nullptr) {
NewI->insertBefore(I);
@@ -396,63 +526,68 @@ cloneValueWithNewAddressSpace(Value *V, unsigned NewAddrSpace,
}
return cloneConstantExprWithNewAddressSpace(
- cast<ConstantExpr>(V), NewAddrSpace, ValueWithNewAddrSpace);
+ cast<ConstantExpr>(V), NewAddrSpace, ValueWithNewAddrSpace);
}
// Defines the join operation on the address space lattice (see the file header
// comments).
-static unsigned joinAddressSpaces(unsigned AS1, unsigned AS2) {
- if (AS1 == AddressSpace::ADDRESS_SPACE_GENERIC ||
- AS2 == AddressSpace::ADDRESS_SPACE_GENERIC)
- return AddressSpace::ADDRESS_SPACE_GENERIC;
+unsigned InferAddressSpaces::joinAddressSpaces(unsigned AS1,
+ unsigned AS2) const {
+ if (AS1 == FlatAddrSpace || AS2 == FlatAddrSpace)
+ return FlatAddrSpace;
- if (AS1 == ADDRESS_SPACE_UNINITIALIZED)
+ if (AS1 == UninitializedAddressSpace)
return AS2;
- if (AS2 == ADDRESS_SPACE_UNINITIALIZED)
+ if (AS2 == UninitializedAddressSpace)
return AS1;
- // The join of two different specific address spaces is generic.
- return AS1 == AS2 ? AS1 : (unsigned)AddressSpace::ADDRESS_SPACE_GENERIC;
+ // The join of two different specific address spaces is flat.
+ return (AS1 == AS2) ? AS1 : FlatAddrSpace;
}
-bool NVPTXInferAddressSpaces::runOnFunction(Function &F) {
+bool InferAddressSpaces::runOnFunction(Function &F) {
if (skipFunction(F))
return false;
- // Collects all generic address expressions in postorder.
- std::vector<Value *> Postorder = collectGenericAddressExpressions(F);
+ const TargetTransformInfo &TTI =
+ getAnalysis<TargetTransformInfoWrapperPass>().getTTI(F);
+ FlatAddrSpace = TTI.getFlatAddressSpace();
+ if (FlatAddrSpace == UninitializedAddressSpace)
+ return false;
+
+ // Collects all flat address expressions in postorder.
+ std::vector<Value *> Postorder = collectFlatAddressExpressions(F);
// Runs a data-flow analysis to refine the address spaces of every expression
// in Postorder.
ValueToAddrSpaceMapTy InferredAddrSpace;
inferAddressSpaces(Postorder, &InferredAddrSpace);
- // Changes the address spaces of the generic address expressions who are
- // inferred to point to a specific address space.
+ // Changes the address spaces of the flat address expressions that are inferred
+ // to point to a specific address space.
return rewriteWithNewAddressSpaces(Postorder, InferredAddrSpace, &F);
}
-void NVPTXInferAddressSpaces::inferAddressSpaces(
+void InferAddressSpaces::inferAddressSpaces(
const std::vector<Value *> &Postorder,
- ValueToAddrSpaceMapTy *InferredAddrSpace) {
+ ValueToAddrSpaceMapTy *InferredAddrSpace) const {
SetVector<Value *> Worklist(Postorder.begin(), Postorder.end());
// Initially, all expressions are in the uninitialized address space.
for (Value *V : Postorder)
- (*InferredAddrSpace)[V] = ADDRESS_SPACE_UNINITIALIZED;
+ (*InferredAddrSpace)[V] = UninitializedAddressSpace;
while (!Worklist.empty()) {
- Value* V = Worklist.pop_back_val();
+ Value *V = Worklist.pop_back_val();
// Tries to update the address space of the stack top according to the
// address spaces of its operands.
- DEBUG(dbgs() << "Updating the address space of\n"
- << " " << *V << "\n");
+ DEBUG(dbgs() << "Updating the address space of\n " << *V << '\n');
Optional<unsigned> NewAS = updateAddressSpace(*V, *InferredAddrSpace);
if (!NewAS.hasValue())
continue;
// If any updates are made, adds its users to the worklist because
// their address spaces may also need to be updated.
- DEBUG(dbgs() << " to " << NewAS.getValue() << "\n");
+ DEBUG(dbgs() << " to " << NewAS.getValue() << '\n');
(*InferredAddrSpace)[V] = NewAS.getValue();
for (Value *User : V->users()) {
@@ -461,15 +596,15 @@ void NVPTXInferAddressSpaces::inferAddressSpaces(
continue;
auto Pos = InferredAddrSpace->find(User);
- // Our algorithm only updates the address spaces of generic address
+ // Our algorithm only updates the address spaces of flat address
// expressions, which are those in InferredAddrSpace.
if (Pos == InferredAddrSpace->end())
continue;
// Function updateAddressSpace moves the address space down a lattice
- // path. Therefore, nothing to do if User is already inferred as
- // generic (the bottom element in the lattice).
- if (Pos->second == AddressSpace::ADDRESS_SPACE_GENERIC)
+ // path. Therefore, nothing to do if User is already inferred as flat (the
+ // bottom element in the lattice).
+ if (Pos->second == FlatAddrSpace)
continue;
Worklist.insert(User);
@@ -477,35 +612,177 @@ void NVPTXInferAddressSpaces::inferAddressSpaces(
}
}
-Optional<unsigned> NVPTXInferAddressSpaces::updateAddressSpace(
- const Value &V, const ValueToAddrSpaceMapTy &InferredAddrSpace) {
+Optional<unsigned> InferAddressSpaces::updateAddressSpace(
+ const Value &V, const ValueToAddrSpaceMapTy &InferredAddrSpace) const {
assert(InferredAddrSpace.count(&V));
// The new inferred address space equals the join of the address spaces
// of all its pointer operands.
- unsigned NewAS = ADDRESS_SPACE_UNINITIALIZED;
- for (Value *PtrOperand : getPointerOperands(V)) {
- unsigned OperandAS;
- if (InferredAddrSpace.count(PtrOperand))
- OperandAS = InferredAddrSpace.lookup(PtrOperand);
+ unsigned NewAS = UninitializedAddressSpace;
+
+ const Operator &Op = cast<Operator>(V);
+ if (Op.getOpcode() == Instruction::Select) {
+ Value *Src0 = Op.getOperand(1);
+ Value *Src1 = Op.getOperand(2);
+
+ auto I = InferredAddrSpace.find(Src0);
+ unsigned Src0AS = (I != InferredAddrSpace.end()) ?
+ I->second : Src0->getType()->getPointerAddressSpace();
+
+ auto J = InferredAddrSpace.find(Src1);
+ unsigned Src1AS = (J != InferredAddrSpace.end()) ?
+ J->second : Src1->getType()->getPointerAddressSpace();
+
+ auto *C0 = dyn_cast<Constant>(Src0);
+ auto *C1 = dyn_cast<Constant>(Src1);
+
+ // If one of the inputs is a constant, we may be able to do a constant
+ // addrspacecast of it. Defer inferring the address space until the input
+ // address space is known.
+ if ((C1 && Src0AS == UninitializedAddressSpace) ||
+ (C0 && Src1AS == UninitializedAddressSpace))
+ return None;
+
+ if (C0 && isSafeToCastConstAddrSpace(C0, Src1AS))
+ NewAS = Src1AS;
+ else if (C1 && isSafeToCastConstAddrSpace(C1, Src0AS))
+ NewAS = Src0AS;
else
- OperandAS = PtrOperand->getType()->getPointerAddressSpace();
- NewAS = joinAddressSpaces(NewAS, OperandAS);
- // join(generic, *) = generic. So we can break if NewAS is already generic.
- if (NewAS == AddressSpace::ADDRESS_SPACE_GENERIC)
- break;
+ NewAS = joinAddressSpaces(Src0AS, Src1AS);
+ } else {
+ for (Value *PtrOperand : getPointerOperands(V)) {
+ auto I = InferredAddrSpace.find(PtrOperand);
+ unsigned OperandAS = I != InferredAddrSpace.end() ?
+ I->second : PtrOperand->getType()->getPointerAddressSpace();
+
+ // join(flat, *) = flat. So we can break if NewAS is already flat.
+ NewAS = joinAddressSpaces(NewAS, OperandAS);
+ if (NewAS == FlatAddrSpace)
+ break;
+ }
}
unsigned OldAS = InferredAddrSpace.lookup(&V);
- assert(OldAS != AddressSpace::ADDRESS_SPACE_GENERIC);
+ assert(OldAS != FlatAddrSpace);
if (OldAS == NewAS)
return None;
return NewAS;
}
-bool NVPTXInferAddressSpaces::rewriteWithNewAddressSpaces(
- const std::vector<Value *> &Postorder,
- const ValueToAddrSpaceMapTy &InferredAddrSpace, Function *F) {
+/// \returns true if \p U is the pointer operand of a memory instruction with
+/// a single pointer operand that can have its address space changed by simply
+/// mutating the use to a new value.
+static bool isSimplePointerUseValidToReplace(Use &U) {
+ User *Inst = U.getUser();
+ unsigned OpNo = U.getOperandNo();
+
+ if (auto *LI = dyn_cast<LoadInst>(Inst))
+ return OpNo == LoadInst::getPointerOperandIndex() && !LI->isVolatile();
+
+ if (auto *SI = dyn_cast<StoreInst>(Inst))
+ return OpNo == StoreInst::getPointerOperandIndex() && !SI->isVolatile();
+
+ if (auto *RMW = dyn_cast<AtomicRMWInst>(Inst))
+ return OpNo == AtomicRMWInst::getPointerOperandIndex() && !RMW->isVolatile();
+
+ if (auto *CmpX = dyn_cast<AtomicCmpXchgInst>(Inst)) {
+ return OpNo == AtomicCmpXchgInst::getPointerOperandIndex() &&
+ !CmpX->isVolatile();
+ }
+
+ return false;
+}
+
+/// Update memory intrinsic uses that require more complex processing than
+/// simple memory instructions. Thse require re-mangling and may have multiple
+/// pointer operands.
+static bool handleMemIntrinsicPtrUse(MemIntrinsic *MI, Value *OldV,
+ Value *NewV) {
+ IRBuilder<> B(MI);
+ MDNode *TBAA = MI->getMetadata(LLVMContext::MD_tbaa);
+ MDNode *ScopeMD = MI->getMetadata(LLVMContext::MD_alias_scope);
+ MDNode *NoAliasMD = MI->getMetadata(LLVMContext::MD_noalias);
+
+ if (auto *MSI = dyn_cast<MemSetInst>(MI)) {
+ B.CreateMemSet(NewV, MSI->getValue(),
+ MSI->getLength(), MSI->getAlignment(),
+ false, // isVolatile
+ TBAA, ScopeMD, NoAliasMD);
+ } else if (auto *MTI = dyn_cast<MemTransferInst>(MI)) {
+ Value *Src = MTI->getRawSource();
+ Value *Dest = MTI->getRawDest();
+
+ // Be careful in case this is a self-to-self copy.
+ if (Src == OldV)
+ Src = NewV;
+
+ if (Dest == OldV)
+ Dest = NewV;
+
+ if (isa<MemCpyInst>(MTI)) {
+ MDNode *TBAAStruct = MTI->getMetadata(LLVMContext::MD_tbaa_struct);
+ B.CreateMemCpy(Dest, Src, MTI->getLength(),
+ MTI->getAlignment(),
+ false, // isVolatile
+ TBAA, TBAAStruct, ScopeMD, NoAliasMD);
+ } else {
+ assert(isa<MemMoveInst>(MTI));
+ B.CreateMemMove(Dest, Src, MTI->getLength(),
+ MTI->getAlignment(),
+ false, // isVolatile
+ TBAA, ScopeMD, NoAliasMD);
+ }
+ } else
+ llvm_unreachable("unhandled MemIntrinsic");
+
+ MI->eraseFromParent();
+ return true;
+}
+
+// \returns true if it is OK to change the address space of constant \p C with
+// a ConstantExpr addrspacecast.
+bool InferAddressSpaces::isSafeToCastConstAddrSpace(Constant *C, unsigned NewAS) const {
+ assert(NewAS != UninitializedAddressSpace);
+
+ unsigned SrcAS = C->getType()->getPointerAddressSpace();
+ if (SrcAS == NewAS || isa<UndefValue>(C))
+ return true;
+
+ // Prevent illegal casts between different non-flat address spaces.
+ if (SrcAS != FlatAddrSpace && NewAS != FlatAddrSpace)
+ return false;
+
+ if (isa<ConstantPointerNull>(C))
+ return true;
+
+ if (auto *Op = dyn_cast<Operator>(C)) {
+ // If we already have a constant addrspacecast, it should be safe to cast it
+ // off.
+ if (Op->getOpcode() == Instruction::AddrSpaceCast)
+ return isSafeToCastConstAddrSpace(cast<Constant>(Op->getOperand(0)), NewAS);
+
+ if (Op->getOpcode() == Instruction::IntToPtr &&
+ Op->getType()->getPointerAddressSpace() == FlatAddrSpace)
+ return true;
+ }
+
+ return false;
+}
+
+static Value::use_iterator skipToNextUser(Value::use_iterator I,
+ Value::use_iterator End) {
+ User *CurUser = I->getUser();
+ ++I;
+
+ while (I != End && I->getUser() == CurUser)
+ ++I;
+
+ return I;
+}
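skipToNextUser exists because the rewrite loop below works per user rather than per use: one instruction can reference V through several operands (for example a memcpy whose source and destination are the same pointer), and handlers such as handleMemIntrinsicPtrUse may erase the user outright. A sketch of the resulting iteration pattern, with processUser standing in for the body of the real loop (hypothetical name):

  for (Value::use_iterator I = V->use_begin(), E = V->use_end(); I != E;) {
    Use &U = *I;
    I = skipToNextUser(I, E); // advance past all of this user's uses first
    processUser(U);           // hypothetical handler; may erase U's user
  }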
+
+bool InferAddressSpaces::rewriteWithNewAddressSpaces(
+ const std::vector<Value *> &Postorder,
+ const ValueToAddrSpaceMapTy &InferredAddrSpace, Function *F) const {
// For each address expression to be modified, creates a clone of it with its
// pointer operands converted to the new address space. Since the pointer
// operands are converted, the clone is naturally in the new address space by
@@ -516,7 +793,7 @@ bool NVPTXInferAddressSpaces::rewriteWithNewAddressSpaces(
unsigned NewAddrSpace = InferredAddrSpace.lookup(V);
if (V->getType()->getPointerAddressSpace() != NewAddrSpace) {
ValueWithNewAddrSpace[V] = cloneValueWithNewAddressSpace(
- V, NewAddrSpace, ValueWithNewAddrSpace, &UndefUsesToFix);
+ V, NewAddrSpace, ValueWithNewAddrSpace, &UndefUsesToFix);
}
}
@@ -524,7 +801,7 @@ bool NVPTXInferAddressSpaces::rewriteWithNewAddressSpaces(
return false;
// Fixes all the undef uses generated by cloneInstructionWithNewAddressSpace.
- for (const Use* UndefUse : UndefUsesToFix) {
+ for (const Use *UndefUse : UndefUsesToFix) {
User *V = UndefUse->getUser();
User *NewV = cast<User>(ValueWithNewAddrSpace.lookup(V));
unsigned OperandNo = UndefUse->getOperandNo();
@@ -538,39 +815,82 @@ bool NVPTXInferAddressSpaces::rewriteWithNewAddressSpaces(
if (NewV == nullptr)
continue;
- SmallVector<Use *, 4> Uses;
- for (Use &U : V->uses())
- Uses.push_back(&U);
- DEBUG(dbgs() << "Replacing the uses of " << *V << "\n to\n " << *NewV
- << "\n");
- for (Use *U : Uses) {
- if (isa<LoadInst>(U->getUser()) ||
- (isa<StoreInst>(U->getUser()) && U->getOperandNo() == 1)) {
- // If V is used as the pointer operand of a load/store, sets the pointer
- // operand to NewV. This replacement does not change the element type,
- // so the resultant load/store is still valid.
- U->set(NewV);
- } else if (isa<Instruction>(U->getUser())) {
- // Otherwise, replaces the use with generic(NewV).
- // TODO: Some optimization opportunities are missed. For example, in
- // %0 = icmp eq float* %p, %q
- // if both p and q are inferred to be shared, we can rewrite %0 as
- // %0 = icmp eq float addrspace(3)* %new_p, %new_q
- // instead of currently
- // %generic_p = addrspacecast float addrspace(3)* %new_p to float*
- // %generic_q = addrspacecast float addrspace(3)* %new_q to float*
- // %0 = icmp eq float* %generic_p, %generic_q
+ DEBUG(dbgs() << "Replacing the uses of " << *V
+ << "\n with\n " << *NewV << '\n');
+
+ Value::use_iterator I, E, Next;
+ for (I = V->use_begin(), E = V->use_end(); I != E; ) {
+ Use &U = *I;
+
+ // A user may reference the same pointer in multiple operands. Skip to
+ // the next user.
+ I = skipToNextUser(I, E);
+
+ if (isSimplePointerUseValidToReplace(U)) {
+ // If V is used as the pointer operand of a compatible memory operation,
+ // sets the pointer operand to NewV. This replacement does not change
+ // the element type, so the resultant load/store is still valid.
+ U.set(NewV);
+ continue;
+ }
+
+ User *CurUser = U.getUser();
+ // Handle more complex cases like intrinsics that need to be remangled.
+ if (auto *MI = dyn_cast<MemIntrinsic>(CurUser)) {
+ if (!MI->isVolatile() && handleMemIntrinsicPtrUse(MI, V, NewV))
+ continue;
+ }
+
+ if (auto *II = dyn_cast<IntrinsicInst>(CurUser)) {
+ if (rewriteIntrinsicOperands(II, V, NewV))
+ continue;
+ }
+
+ if (isa<Instruction>(CurUser)) {
+ if (ICmpInst *Cmp = dyn_cast<ICmpInst>(CurUser)) {
+ // If we can infer that both pointers are in the same addrspace,
+ // transform e.g.
+ // %cmp = icmp eq float* %p, %q
+ // into
+ // %cmp = icmp eq float addrspace(3)* %new_p, %new_q
+
+ unsigned NewAS = NewV->getType()->getPointerAddressSpace();
+ int SrcIdx = U.getOperandNo();
+ int OtherIdx = (SrcIdx == 0) ? 1 : 0;
+ Value *OtherSrc = Cmp->getOperand(OtherIdx);
+
+ if (Value *OtherNewV = ValueWithNewAddrSpace.lookup(OtherSrc)) {
+ if (OtherNewV->getType()->getPointerAddressSpace() == NewAS) {
+ Cmp->setOperand(OtherIdx, OtherNewV);
+ Cmp->setOperand(SrcIdx, NewV);
+ continue;
+ }
+ }
+
+ // Even if the type mismatches, we can cast the constant.
+ if (auto *KOtherSrc = dyn_cast<Constant>(OtherSrc)) {
+ if (isSafeToCastConstAddrSpace(KOtherSrc, NewAS)) {
+ Cmp->setOperand(SrcIdx, NewV);
+ Cmp->setOperand(OtherIdx,
+ ConstantExpr::getAddrSpaceCast(KOtherSrc, NewV->getType()));
+ continue;
+ }
+ }
+ }
+
+ // Otherwise, replaces the use with flat(NewV).
if (Instruction *I = dyn_cast<Instruction>(V)) {
BasicBlock::iterator InsertPos = std::next(I->getIterator());
while (isa<PHINode>(InsertPos))
++InsertPos;
- U->set(new AddrSpaceCastInst(NewV, V->getType(), "", &*InsertPos));
+ U.set(new AddrSpaceCastInst(NewV, V->getType(), "", &*InsertPos));
} else {
- U->set(ConstantExpr::getAddrSpaceCast(cast<Constant>(NewV),
- V->getType()));
+ U.set(ConstantExpr::getAddrSpaceCast(cast<Constant>(NewV),
+ V->getType()));
}
}
}
+
if (V->use_empty())
RecursivelyDeleteTriviallyDeadInstructions(V);
}
@@ -578,6 +898,6 @@ bool NVPTXInferAddressSpaces::rewriteWithNewAddressSpaces(
return true;
}
-FunctionPass *llvm::createNVPTXInferAddressSpacesPass() {
- return new NVPTXInferAddressSpaces();
+FunctionPass *llvm::createInferAddressSpacesPass() {
+ return new InferAddressSpaces();
}
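Taken together, the inference in this file is a forward data-flow problem over a flat lattice: the uninitialized state on top, each specific address space in the middle, and the flat address space at the bottom. A self-contained sketch equivalent to InferAddressSpaces::joinAddressSpaces above (Flat and Uninit stand in for FlatAddrSpace and UninitializedAddressSpace):

  unsigned join(unsigned AS1, unsigned AS2, unsigned Flat) {
    const unsigned Uninit = ~0u;            // top of the lattice
    if (AS1 == Flat || AS2 == Flat)
      return Flat;                          // join(flat, *) = flat
    if (AS1 == Uninit)
      return AS2;                           // join(top, x) = x
    if (AS2 == Uninit)
      return AS1;
    return AS1 == AS2 ? AS1 : Flat;         // distinct specifics meet at flat
  }

Every update moves a value monotonically down this lattice, which is why the worklist loop in inferAddressSpaces terminates.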
diff --git a/lib/Transforms/Scalar/JumpThreading.cpp b/lib/Transforms/Scalar/JumpThreading.cpp
index 1870c3deb4f3..08eb95a1a3d3 100644
--- a/lib/Transforms/Scalar/JumpThreading.cpp
+++ b/lib/Transforms/Scalar/JumpThreading.cpp
@@ -17,6 +17,7 @@
#include "llvm/ADT/DenseSet.h"
#include "llvm/ADT/STLExtras.h"
#include "llvm/ADT/Statistic.h"
+#include "llvm/Analysis/AliasAnalysis.h"
#include "llvm/Analysis/GlobalsModRef.h"
#include "llvm/Analysis/CFG.h"
#include "llvm/Analysis/BlockFrequencyInfoImpl.h"
@@ -30,11 +31,13 @@
#include "llvm/IR/LLVMContext.h"
#include "llvm/IR/MDBuilder.h"
#include "llvm/IR/Metadata.h"
+#include "llvm/IR/PatternMatch.h"
#include "llvm/Pass.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/raw_ostream.h"
#include "llvm/Transforms/Utils/BasicBlockUtils.h"
+#include "llvm/Transforms/Utils/Cloning.h"
#include "llvm/Transforms/Utils/Local.h"
#include "llvm/Transforms/Utils/SSAUpdater.h"
#include <algorithm>
@@ -89,6 +92,7 @@ namespace {
bool runOnFunction(Function &F) override;
void getAnalysisUsage(AnalysisUsage &AU) const override {
+ AU.addRequired<AAResultsWrapperPass>();
AU.addRequired<LazyValueInfoWrapperPass>();
AU.addPreserved<LazyValueInfoWrapperPass>();
AU.addPreserved<GlobalsAAWrapperPass>();
@@ -104,6 +108,7 @@ INITIALIZE_PASS_BEGIN(JumpThreading, "jump-threading",
"Jump Threading", false, false)
INITIALIZE_PASS_DEPENDENCY(LazyValueInfoWrapperPass)
INITIALIZE_PASS_DEPENDENCY(TargetLibraryInfoWrapperPass)
+INITIALIZE_PASS_DEPENDENCY(AAResultsWrapperPass)
INITIALIZE_PASS_END(JumpThreading, "jump-threading",
"Jump Threading", false, false)
@@ -121,6 +126,7 @@ bool JumpThreading::runOnFunction(Function &F) {
return false;
auto TLI = &getAnalysis<TargetLibraryInfoWrapperPass>().getTLI();
auto LVI = &getAnalysis<LazyValueInfoWrapperPass>().getLVI();
+ auto AA = &getAnalysis<AAResultsWrapperPass>().getAAResults();
std::unique_ptr<BlockFrequencyInfo> BFI;
std::unique_ptr<BranchProbabilityInfo> BPI;
bool HasProfileData = F.getEntryCount().hasValue();
@@ -129,7 +135,8 @@ bool JumpThreading::runOnFunction(Function &F) {
BPI.reset(new BranchProbabilityInfo(F, LI));
BFI.reset(new BlockFrequencyInfo(F, *BPI, LI));
}
- return Impl.runImpl(F, TLI, LVI, HasProfileData, std::move(BFI),
+
+ return Impl.runImpl(F, TLI, LVI, AA, HasProfileData, std::move(BFI),
std::move(BPI));
}
@@ -138,6 +145,8 @@ PreservedAnalyses JumpThreadingPass::run(Function &F,
auto &TLI = AM.getResult<TargetLibraryAnalysis>(F);
auto &LVI = AM.getResult<LazyValueAnalysis>(F);
+ auto &AA = AM.getResult<AAManager>(F);
+
std::unique_ptr<BlockFrequencyInfo> BFI;
std::unique_ptr<BranchProbabilityInfo> BPI;
bool HasProfileData = F.getEntryCount().hasValue();
@@ -146,12 +155,9 @@ PreservedAnalyses JumpThreadingPass::run(Function &F,
BPI.reset(new BranchProbabilityInfo(F, LI));
BFI.reset(new BlockFrequencyInfo(F, *BPI, LI));
}
- bool Changed =
- runImpl(F, &TLI, &LVI, HasProfileData, std::move(BFI), std::move(BPI));
- // FIXME: We need to invalidate LVI to avoid PR28400. Is there a better
- // solution?
- AM.invalidate<LazyValueAnalysis>(F);
+ bool Changed = runImpl(F, &TLI, &LVI, &AA, HasProfileData, std::move(BFI),
+ std::move(BPI));
if (!Changed)
return PreservedAnalyses::all();
@@ -161,18 +167,23 @@ PreservedAnalyses JumpThreadingPass::run(Function &F,
}
bool JumpThreadingPass::runImpl(Function &F, TargetLibraryInfo *TLI_,
- LazyValueInfo *LVI_, bool HasProfileData_,
+ LazyValueInfo *LVI_, AliasAnalysis *AA_,
+ bool HasProfileData_,
std::unique_ptr<BlockFrequencyInfo> BFI_,
std::unique_ptr<BranchProbabilityInfo> BPI_) {
DEBUG(dbgs() << "Jump threading on function '" << F.getName() << "'\n");
TLI = TLI_;
LVI = LVI_;
+ AA = AA_;
BFI.reset();
BPI.reset();
// When profile data is available, we need to update edge weights after
// successful jump threading, which requires both BPI and BFI being available.
HasProfileData = HasProfileData_;
+ auto *GuardDecl = F.getParent()->getFunction(
+ Intrinsic::getName(Intrinsic::experimental_guard));
+ HasGuards = GuardDecl && !GuardDecl->use_empty();
if (HasProfileData) {
BPI = std::move(BPI_);
BFI = std::move(BFI_);
@@ -226,26 +237,13 @@ bool JumpThreadingPass::runImpl(Function &F, TargetLibraryInfo *TLI_,
BB != &BB->getParent()->getEntryBlock() &&
// If the terminator is the only non-phi instruction, try to nuke it.
BB->getFirstNonPHIOrDbg()->isTerminator() && !LoopHeaders.count(BB)) {
- // Since TryToSimplifyUncondBranchFromEmptyBlock may delete the
- // block, we have to make sure it isn't in the LoopHeaders set. We
- // reinsert afterward if needed.
- bool ErasedFromLoopHeaders = LoopHeaders.erase(BB);
- BasicBlock *Succ = BI->getSuccessor(0);
-
// FIXME: It is always conservatively correct to drop the info
// for a block even if it doesn't get erased. This isn't totally
// awesome, but it allows us to use AssertingVH to prevent nasty
// dangling pointer issues within LazyValueInfo.
LVI->eraseBlock(BB);
- if (TryToSimplifyUncondBranchFromEmptyBlock(BB)) {
+ if (TryToSimplifyUncondBranchFromEmptyBlock(BB))
Changed = true;
- // If we deleted BB and BB was the header of a loop, then the
- // successor is now the header of the loop.
- BB = Succ;
- }
-
- if (ErasedFromLoopHeaders)
- LoopHeaders.insert(BB);
}
}
EverChanged |= Changed;
@@ -255,10 +253,13 @@ bool JumpThreadingPass::runImpl(Function &F, TargetLibraryInfo *TLI_,
return EverChanged;
}
-/// getJumpThreadDuplicationCost - Return the cost of duplicating this block to
-/// thread across it. Stop scanning the block when passing the threshold.
-static unsigned getJumpThreadDuplicationCost(const BasicBlock *BB,
+/// Return the cost of duplicating a piece of this block, from the first
+/// non-phi up to (but not including) the StopAt instruction, in order to
+/// thread across it. Stop scanning the block when exceeding the threshold.
+/// If duplication is impossible, returns ~0U.
+static unsigned getJumpThreadDuplicationCost(BasicBlock *BB,
+ Instruction *StopAt,
unsigned Threshold) {
+ assert(StopAt->getParent() == BB && "Not an instruction from proper BB?");
/// Ignore PHI nodes; these will be flattened when duplication happens.
BasicBlock::const_iterator I(BB->getFirstNonPHI());
@@ -266,15 +267,17 @@ static unsigned getJumpThreadDuplicationCost(const BasicBlock *BB,
// branch, so they shouldn't count against the duplication cost.
unsigned Bonus = 0;
- const TerminatorInst *BBTerm = BB->getTerminator();
- // Threading through a switch statement is particularly profitable. If this
- // block ends in a switch, decrease its cost to make it more likely to happen.
- if (isa<SwitchInst>(BBTerm))
- Bonus = 6;
-
- // The same holds for indirect branches, but slightly more so.
- if (isa<IndirectBrInst>(BBTerm))
- Bonus = 8;
+ if (BB->getTerminator() == StopAt) {
+ // Threading through a switch statement is particularly profitable. If this
+ // block ends in a switch, decrease its cost to make it more likely to
+ // happen.
+ if (isa<SwitchInst>(StopAt))
+ Bonus = 6;
+
+ // The same holds for indirect branches, but slightly more so.
+ if (isa<IndirectBrInst>(StopAt))
+ Bonus = 8;
+ }
// Bump the threshold up so the early exit from the loop doesn't skip the
// terminator-based Size adjustment at the end.
@@ -283,7 +286,7 @@ static unsigned getJumpThreadDuplicationCost(const BasicBlock *BB,
// Sum up the cost of each instruction until we get to the terminator. Don't
// include the terminator because the copy won't include it.
unsigned Size = 0;
- for (; !isa<TerminatorInst>(I); ++I) {
+ for (; &*I != StopAt; ++I) {
// Stop scanning the block if we've reached the threshold.
if (Size > Threshold)
@@ -729,6 +732,10 @@ bool JumpThreadingPass::ProcessBlock(BasicBlock *BB) {
if (TryToUnfoldSelectInCurrBB(BB))
return true;
+ // See if we can propagate guards to predecessors.
+ if (HasGuards && ProcessGuards(BB))
+ return true;
+
// What kind of constant we're looking for.
ConstantPreference Preference = WantInteger;
@@ -804,7 +811,6 @@ bool JumpThreadingPass::ProcessBlock(BasicBlock *BB) {
return false;
}
-
if (CmpInst *CondCmp = dyn_cast<CmpInst>(CondInst)) {
// If we're branching on a conditional, LVI might be able to determine
// it's value at the branch instruction. We only handle comparisons
@@ -812,7 +818,12 @@ bool JumpThreadingPass::ProcessBlock(BasicBlock *BB) {
// TODO: This should be extended to handle switches as well.
BranchInst *CondBr = dyn_cast<BranchInst>(BB->getTerminator());
Constant *CondConst = dyn_cast<Constant>(CondCmp->getOperand(1));
- if (CondBr && CondConst && CondBr->isConditional()) {
+ if (CondBr && CondConst) {
+ // We should have returned as soon as we turned a conditional branch
+ // unconditional, because it is no longer interesting as far as jump
+ // threading is concerned.
+ assert(CondBr->isConditional() && "Threading on unconditional terminator");
+
LazyValueInfo::Tristate Ret =
LVI->getPredicateAt(CondCmp->getPredicate(), CondCmp->getOperand(0),
CondConst, CondBr);
@@ -835,10 +846,12 @@ bool JumpThreadingPass::ProcessBlock(BasicBlock *BB) {
}
return true;
}
- }
- if (CondBr && CondConst && TryToUnfoldSelect(CondCmp, BB))
- return true;
+ // We did not manage to simplify this branch, try to see whether
+ // CondCmp depends on a known phi-select pattern.
+ if (TryToUnfoldSelect(CondCmp, BB))
+ return true;
+ }
}
// Check for some cases that are worth simplifying. Right now we want to look
@@ -857,7 +870,6 @@ bool JumpThreadingPass::ProcessBlock(BasicBlock *BB) {
if (SimplifyPartiallyRedundantLoad(LI))
return true;
-
// Handle a variety of cases where we are branching on something derived from
// a PHI node in the current block. If we can prove that any predecessors
// compute a predictable value based on a PHI node, thread those predecessors.
@@ -871,7 +883,6 @@ bool JumpThreadingPass::ProcessBlock(BasicBlock *BB) {
if (PN->getParent() == BB && isa<BranchInst>(BB->getTerminator()))
return ProcessBranchOnPHI(PN);
-
// If this is an otherwise-unfoldable branch on a XOR, see if we can simplify.
if (CondInst->getOpcode() == Instruction::Xor &&
CondInst->getParent() == BB && isa<BranchInst>(BB->getTerminator()))
@@ -920,6 +931,14 @@ bool JumpThreadingPass::ProcessImpliedCondition(BasicBlock *BB) {
return false;
}
+/// Return true if Op is an instruction defined in the given block.
+static bool isOpDefinedInBlock(Value *Op, BasicBlock *BB) {
+ if (Instruction *OpInst = dyn_cast<Instruction>(Op))
+ if (OpInst->getParent() == BB)
+ return true;
+ return false;
+}
+
/// SimplifyPartiallyRedundantLoad - If LI is an obviously partially redundant
/// load instruction, eliminate it by replacing it with a PHI node. This is an
/// important optimization that encourages jump threading, and needs to be run
@@ -942,18 +961,17 @@ bool JumpThreadingPass::SimplifyPartiallyRedundantLoad(LoadInst *LI) {
Value *LoadedPtr = LI->getOperand(0);
- // If the loaded operand is defined in the LoadBB, it can't be available.
- // TODO: Could do simple PHI translation, that would be fun :)
- if (Instruction *PtrOp = dyn_cast<Instruction>(LoadedPtr))
- if (PtrOp->getParent() == LoadBB)
- return false;
+ // If the loaded operand is defined in the LoadBB and it is not a phi,
+ // it can't be available in predecessors.
+ if (isOpDefinedInBlock(LoadedPtr, LoadBB) && !isa<PHINode>(LoadedPtr))
+ return false;
// Scan a few instructions up from the load, to see if it is obviously live at
// the entry to its block.
BasicBlock::iterator BBIt(LI);
bool IsLoadCSE;
- if (Value *AvailableVal =
- FindAvailableLoadedValue(LI, LoadBB, BBIt, DefMaxInstsToScan, nullptr, &IsLoadCSE)) {
+ if (Value *AvailableVal = FindAvailableLoadedValue(
+ LI, LoadBB, BBIt, DefMaxInstsToScan, AA, &IsLoadCSE)) {
// If the value of the load is locally available within the block, just use
// it. This frequently occurs for reg2mem'd allocas.
@@ -997,12 +1015,34 @@ bool JumpThreadingPass::SimplifyPartiallyRedundantLoad(LoadInst *LI) {
if (!PredsScanned.insert(PredBB).second)
continue;
- // Scan the predecessor to see if the value is available in the pred.
BBIt = PredBB->end();
- Value *PredAvailable = FindAvailableLoadedValue(LI, PredBB, BBIt,
- DefMaxInstsToScan,
- nullptr,
- &IsLoadCSE);
+ unsigned NumScanedInst = 0;
+ Value *PredAvailable = nullptr;
+ // NOTE: We don't CSE loads that are volatile or anything stronger than
+ // unordered; that should have been checked when we entered the function.
+ assert(LI->isUnordered() && "Attempting to CSE volatile or atomic loads");
+ // If this is a load on a phi pointer, phi-translate it and search
+ // for available load/store to the pointer in predecessors.
+ Value *Ptr = LoadedPtr->DoPHITranslation(LoadBB, PredBB);
+ PredAvailable = FindAvailablePtrLoadStore(
+ Ptr, LI->getType(), LI->isAtomic(), PredBB, BBIt, DefMaxInstsToScan,
+ AA, &IsLoadCSE, &NumScanedInst);
+
+ // If PredBB has a single predecessor, continue scanning through the
+ // single predecessor.
+ BasicBlock *SinglePredBB = PredBB;
+ while (!PredAvailable && SinglePredBB && BBIt == SinglePredBB->begin() &&
+ NumScanedInst < DefMaxInstsToScan) {
+ SinglePredBB = SinglePredBB->getSinglePredecessor();
+ if (SinglePredBB) {
+ BBIt = SinglePredBB->end();
+ PredAvailable = FindAvailablePtrLoadStore(
+ Ptr, LI->getType(), LI->isAtomic(), SinglePredBB, BBIt,
+ (DefMaxInstsToScan - NumScanedInst), AA, &IsLoadCSE,
+ &NumScanedInst);
+ }
+ }
+
if (!PredAvailable) {
OneUnavailablePred = PredBB;
continue;
@@ -1062,10 +1102,10 @@ bool JumpThreadingPass::SimplifyPartiallyRedundantLoad(LoadInst *LI) {
if (UnavailablePred) {
assert(UnavailablePred->getTerminator()->getNumSuccessors() == 1 &&
"Can't handle critical edge here!");
- LoadInst *NewVal =
- new LoadInst(LoadedPtr, LI->getName() + ".pr", false,
- LI->getAlignment(), LI->getOrdering(), LI->getSynchScope(),
- UnavailablePred->getTerminator());
+ LoadInst *NewVal = new LoadInst(
+ LoadedPtr->DoPHITranslation(LoadBB, UnavailablePred),
+ LI->getName() + ".pr", false, LI->getAlignment(), LI->getOrdering(),
+ LI->getSynchScope(), UnavailablePred->getTerminator());
NewVal->setDebugLoc(LI->getDebugLoc());
if (AATags)
NewVal->setAAMetadata(AATags);
@@ -1229,7 +1269,7 @@ bool JumpThreadingPass::ProcessThreadableEdges(Value *Cond, BasicBlock *BB,
else if (BranchInst *BI = dyn_cast<BranchInst>(BB->getTerminator()))
DestBB = BI->getSuccessor(cast<ConstantInt>(Val)->isZero());
else if (SwitchInst *SI = dyn_cast<SwitchInst>(BB->getTerminator())) {
- DestBB = SI->findCaseValue(cast<ConstantInt>(Val)).getCaseSuccessor();
+ DestBB = SI->findCaseValue(cast<ConstantInt>(Val))->getCaseSuccessor();
} else {
assert(isa<IndirectBrInst>(BB->getTerminator())
&& "Unexpected terminator");
@@ -1468,7 +1508,8 @@ bool JumpThreadingPass::ThreadEdge(BasicBlock *BB,
return false;
}
- unsigned JumpThreadCost = getJumpThreadDuplicationCost(BB, BBDupThreshold);
+ unsigned JumpThreadCost =
+ getJumpThreadDuplicationCost(BB, BB->getTerminator(), BBDupThreshold);
if (JumpThreadCost > BBDupThreshold) {
DEBUG(dbgs() << " Not threading BB '" << BB->getName()
<< "' - Cost is too high: " << JumpThreadCost << "\n");
@@ -1756,7 +1797,8 @@ bool JumpThreadingPass::DuplicateCondBranchOnPHIIntoPred(
return false;
}
- unsigned DuplicationCost = getJumpThreadDuplicationCost(BB, BBDupThreshold);
+ unsigned DuplicationCost =
+ getJumpThreadDuplicationCost(BB, BB->getTerminator(), BBDupThreshold);
if (DuplicationCost > BBDupThreshold) {
DEBUG(dbgs() << " Not duplicating BB '" << BB->getName()
<< "' - Cost is too high: " << DuplicationCost << "\n");
@@ -1888,10 +1930,10 @@ bool JumpThreadingPass::DuplicateCondBranchOnPHIIntoPred(
/// TryToUnfoldSelect - Look for blocks of the form
/// bb1:
/// %a = select
-/// br bb
+/// br bb2
///
/// bb2:
-/// %p = phi [%a, %bb] ...
+/// %p = phi [%a, %bb1] ...
/// %c = icmp %p
/// br i1 %c
///
@@ -2021,3 +2063,130 @@ bool JumpThreadingPass::TryToUnfoldSelectInCurrBB(BasicBlock *BB) {
return false;
}
+
+/// Try to propagate a guard from the current BB into one of its predecessors
+/// in case another branch of execution implies that the condition of this
+/// guard is always true. Currently we only process the simplest case that
+/// looks like:
+///
+/// Start:
+/// %cond = ...
+/// br i1 %cond, label %T1, label %F1
+/// T1:
+/// br label %Merge
+/// F1:
+/// br label %Merge
+/// Merge:
+/// %condGuard = ...
+/// call void(i1, ...) @llvm.experimental.guard( i1 %condGuard )[ "deopt"() ]
+///
+/// And cond either implies condGuard or !condGuard. In this case all the
+/// instructions before the guard can be duplicated in both branches, and the
+/// guard is then threaded to one of them.
+bool JumpThreadingPass::ProcessGuards(BasicBlock *BB) {
+ using namespace PatternMatch;
+ // We only want to deal with two predecessors.
+ BasicBlock *Pred1, *Pred2;
+ auto PI = pred_begin(BB), PE = pred_end(BB);
+ if (PI == PE)
+ return false;
+ Pred1 = *PI++;
+ if (PI == PE)
+ return false;
+ Pred2 = *PI++;
+ if (PI != PE)
+ return false;
+ if (Pred1 == Pred2)
+ return false;
+
+ // Try to thread one of the guards of the block.
+ // TODO: Look up deeper than to immediate predecessor?
+ auto *Parent = Pred1->getSinglePredecessor();
+ if (!Parent || Parent != Pred2->getSinglePredecessor())
+ return false;
+
+ if (auto *BI = dyn_cast<BranchInst>(Parent->getTerminator()))
+ for (auto &I : *BB)
+ if (match(&I, m_Intrinsic<Intrinsic::experimental_guard>()))
+ if (ThreadGuard(BB, cast<IntrinsicInst>(&I), BI))
+ return true;
+
+ return false;
+}
+
+/// Try to propagate the guard from BB, which is the lower block of a diamond,
+/// to one of its branches, in case the diamond's condition implies the
+/// guard's condition.
+bool JumpThreadingPass::ThreadGuard(BasicBlock *BB, IntrinsicInst *Guard,
+ BranchInst *BI) {
+ assert(BI->getNumSuccessors() == 2 && "Wrong number of successors?");
+ assert(BI->isConditional() && "Unconditional branch has 2 successors?");
+ Value *GuardCond = Guard->getArgOperand(0);
+ Value *BranchCond = BI->getCondition();
+ BasicBlock *TrueDest = BI->getSuccessor(0);
+ BasicBlock *FalseDest = BI->getSuccessor(1);
+
+ auto &DL = BB->getModule()->getDataLayout();
+ bool TrueDestIsSafe = false;
+ bool FalseDestIsSafe = false;
+
+ // True dest is safe if BranchCond => GuardCond.
+ auto Impl = isImpliedCondition(BranchCond, GuardCond, DL);
+ if (Impl && *Impl)
+ TrueDestIsSafe = true;
+ else {
+ // False dest is safe if !BranchCond => GuardCond.
+ Impl =
+ isImpliedCondition(BranchCond, GuardCond, DL, /* InvertAPred */ true);
+ if (Impl && *Impl)
+ FalseDestIsSafe = true;
+ }
+
+ if (!TrueDestIsSafe && !FalseDestIsSafe)
+ return false;
+
+ BasicBlock *UnguardedBlock = TrueDestIsSafe ? TrueDest : FalseDest;
+ BasicBlock *GuardedBlock = FalseDestIsSafe ? TrueDest : FalseDest;
+
+ ValueToValueMapTy UnguardedMapping, GuardedMapping;
+ Instruction *AfterGuard = Guard->getNextNode();
+ unsigned Cost = getJumpThreadDuplicationCost(BB, AfterGuard, BBDupThreshold);
+ if (Cost > BBDupThreshold)
+ return false;
+ // Duplicate all instructions before the guard and the guard itself to the
+ // branch where implication is not proved.
+ GuardedBlock = DuplicateInstructionsInSplitBetween(
+ BB, GuardedBlock, AfterGuard, GuardedMapping);
+ assert(GuardedBlock && "Could not create the guarded block?");
+ // Duplicate all instructions before the guard in the unguarded branch.
+ // Since we have successfully duplicated the guarded block and this block
+ // has fewer instructions, we expect it to succeed.
+ UnguardedBlock = DuplicateInstructionsInSplitBetween(BB, UnguardedBlock,
+ Guard, UnguardedMapping);
+ assert(UnguardedBlock && "Could not create the unguarded block?");
+ DEBUG(dbgs() << "Moved guard " << *Guard << " to block "
+ << GuardedBlock->getName() << "\n");
+
+ // Some instructions before the guard may still have uses. For them, we need
+ // to create Phi nodes merging their copies in both guarded and unguarded
+ // branches. Those instructions that have no uses can be just removed.
+ SmallVector<Instruction *, 4> ToRemove;
+ for (auto BI = BB->begin(); &*BI != AfterGuard; ++BI)
+ if (!isa<PHINode>(&*BI))
+ ToRemove.push_back(&*BI);
+
+ Instruction *InsertionPoint = &*BB->getFirstInsertionPt();
+ assert(InsertionPoint && "Empty block?");
+ // Substitute with Phis & remove.
+ for (auto *Inst : reverse(ToRemove)) {
+ if (!Inst->use_empty()) {
+ PHINode *NewPN = PHINode::Create(Inst->getType(), 2);
+ NewPN->addIncoming(UnguardedMapping[Inst], UnguardedBlock);
+ NewPN->addIncoming(GuardedMapping[Inst], GuardedBlock);
+ NewPN->insertBefore(InsertionPoint);
+ Inst->replaceAllUsesWith(NewPN);
+ }
+ Inst->eraseFromParent();
+ }
+ return true;
+}
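The heart of ThreadGuard is the pair of implication queries: the true successor is safe to enter unguarded when BranchCond implies GuardCond, and the false successor is safe when the negation of BranchCond implies GuardCond. A reduced sketch of that decision, using the same isImpliedCondition signature the diff relies on:

  Optional<bool> Impl = isImpliedCondition(BranchCond, GuardCond, DL);
  bool TrueDestIsSafe = Impl && *Impl;
  bool FalseDestIsSafe = false;
  if (!TrueDestIsSafe) {
    Impl = isImpliedCondition(BranchCond, GuardCond, DL,
                              /* InvertAPred */ true);
    FalseDestIsSafe = Impl && *Impl;
  }
  if (!TrueDestIsSafe && !FalseDestIsSafe)
    return false; // neither branch proves the guard; nothing to thread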
diff --git a/lib/Transforms/Scalar/LICM.cpp b/lib/Transforms/Scalar/LICM.cpp
index f51d11c04cb2..340c81fed0fd 100644
--- a/lib/Transforms/Scalar/LICM.cpp
+++ b/lib/Transforms/Scalar/LICM.cpp
@@ -77,10 +77,16 @@ STATISTIC(NumMovedLoads, "Number of load insts hoisted or sunk");
STATISTIC(NumMovedCalls, "Number of call insts hoisted or sunk");
STATISTIC(NumPromoted, "Number of memory locations promoted to registers");
+/// Memory promotion is enabled by default.
static cl::opt<bool>
- DisablePromotion("disable-licm-promotion", cl::Hidden,
+ DisablePromotion("disable-licm-promotion", cl::Hidden, cl::init(false),
cl::desc("Disable memory promotion in LICM pass"));
+static cl::opt<uint32_t> MaxNumUsesTraversed(
+ "licm-max-num-uses-traversed", cl::Hidden, cl::init(8),
+ cl::desc("Max num uses visited for identifying load "
+ "invariance in loop using invariant start (default = 8)"));
+
static bool inSubLoop(BasicBlock *BB, Loop *CurLoop, LoopInfo *LI);
static bool isNotUsedInLoop(const Instruction &I, const Loop *CurLoop,
const LoopSafetyInfo *SafetyInfo);
@@ -201,9 +207,9 @@ PreservedAnalyses LICMPass::run(Loop &L, LoopAnalysisManager &AM,
if (!LICM.runOnLoop(&L, &AR.AA, &AR.LI, &AR.DT, &AR.TLI, &AR.SE, ORE, true))
return PreservedAnalyses::all();
- // FIXME: There is no setPreservesCFG in the new PM. When that becomes
- // available, it should be used here.
- return getLoopPassPreservedAnalyses();
+ auto PA = getLoopPassPreservedAnalyses();
+ PA.preserveSet<CFGAnalyses>();
+ return PA;
}
char LegacyLICMPass::ID = 0;
@@ -425,6 +431,29 @@ bool llvm::hoistRegion(DomTreeNode *N, AliasAnalysis *AA, LoopInfo *LI,
continue;
}
+ // Attempt to move floating-point division out of the loop by converting
+ // it to a reciprocal multiplication.
+ if (I.getOpcode() == Instruction::FDiv &&
+ CurLoop->isLoopInvariant(I.getOperand(1)) &&
+ I.hasAllowReciprocal()) {
+ auto Divisor = I.getOperand(1);
+ auto One = llvm::ConstantFP::get(Divisor->getType(), 1.0);
+ auto ReciprocalDivisor = BinaryOperator::CreateFDiv(One, Divisor);
+ ReciprocalDivisor->setFastMathFlags(I.getFastMathFlags());
+ ReciprocalDivisor->insertBefore(&I);
+
+ auto Product = BinaryOperator::CreateFMul(I.getOperand(0),
+ ReciprocalDivisor);
+ Product->setFastMathFlags(I.getFastMathFlags());
+ Product->insertAfter(&I);
+ I.replaceAllUsesWith(Product);
+ I.eraseFromParent();
+
+ hoist(*ReciprocalDivisor, DT, CurLoop, SafetyInfo, ORE);
+ Changed = true;
+ continue;
+ }
+
// Try hoisting the instruction out to the preheader. We can only do this
// if all of the operands of the instruction are loop invariant and if it
// is safe to hoist the instruction.
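The effect of the new FDiv rewrite is easiest to see on a small loop. A source-level illustration (not from the patch), assuming fast-math flags that permit reciprocal transforms:

  // Before: one division per iteration; the divisor d is loop-invariant.
  for (int i = 0; i < n; ++i)
    y[i] = x[i] / d;

  // After: the reciprocal is computed once in the preheader and the loop
  // body keeps only a multiplication.
  double r = 1.0 / d;
  for (int i = 0; i < n; ++i)
    y[i] = x[i] * r;

Note that only the reciprocal is hoisted; the multiply replaces the division in place, and both new instructions inherit the original instruction's fast-math flags.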
@@ -461,7 +490,10 @@ void llvm::computeLoopSafetyInfo(LoopSafetyInfo *SafetyInfo, Loop *CurLoop) {
SafetyInfo->MayThrow = SafetyInfo->HeaderMayThrow;
// Iterate over loop instructions and compute safety info.
- for (Loop::block_iterator BB = CurLoop->block_begin(),
+ // Skip the header, as its safety info has already been computed and
+ // stored in HeaderMayThrow. The first block in the loop's block list is
+ // guaranteed to be the header.
+ assert(Header == *CurLoop->getBlocks().begin() && "First block must be header");
+ for (Loop::block_iterator BB = std::next(CurLoop->block_begin()),
BBE = CurLoop->block_end();
(BB != BBE) && !SafetyInfo->MayThrow; ++BB)
for (BasicBlock::iterator I = (*BB)->begin(), E = (*BB)->end();
@@ -477,6 +509,59 @@ void llvm::computeLoopSafetyInfo(LoopSafetyInfo *SafetyInfo, Loop *CurLoop) {
SafetyInfo->BlockColors = colorEHFunclets(*Fn);
}
+// Return true if LI is invariant within the scope of the loop. LI is
+// invariant if CurLoop is dominated by an invariant.start representing the
+// same memory location and size as the memory location LI loads from, and
+// the invariant.start has no uses.
+static bool isLoadInvariantInLoop(LoadInst *LI, DominatorTree *DT,
+ Loop *CurLoop) {
+ Value *Addr = LI->getOperand(0);
+ const DataLayout &DL = LI->getModule()->getDataLayout();
+ const uint32_t LocSizeInBits = DL.getTypeSizeInBits(
+ cast<PointerType>(Addr->getType())->getElementType());
+
+ // If the type is i8 addrspace(x)*, we know this is the type of the
+ // llvm.invariant.start operand.
+ auto *PtrInt8Ty = PointerType::get(Type::getInt8Ty(LI->getContext()),
+ LI->getPointerAddressSpace());
+ unsigned BitcastsVisited = 0;
+ // Look through bitcasts until we reach the i8* type (this is the
+ // invariant.start operand type).
+ while (Addr->getType() != PtrInt8Ty) {
+ auto *BC = dyn_cast<BitCastInst>(Addr);
+ // Avoid traversing a long chain of bitcasts.
+ if (++BitcastsVisited > MaxNumUsesTraversed || !BC)
+ return false;
+ Addr = BC->getOperand(0);
+ }
+
+ unsigned UsesVisited = 0;
+ // Traverse all uses of the load operand value, to see if invariant.start is
+ // one of the uses, and whether it dominates the load instruction.
+ for (auto *U : Addr->users()) {
+    // Avoid traversing a load operand with a large number of users.
+ if (++UsesVisited > MaxNumUsesTraversed)
+ return false;
+ IntrinsicInst *II = dyn_cast<IntrinsicInst>(U);
+    // If there are escaping uses of the invariant.start instruction, the load
+    // may be non-invariant.
+ if (!II || II->getIntrinsicID() != Intrinsic::invariant_start ||
+ II->hasNUsesOrMore(1))
+ continue;
+ unsigned InvariantSizeInBits =
+ cast<ConstantInt>(II->getArgOperand(0))->getSExtValue() * 8;
+ // Confirm the invariant.start location size contains the load operand size
+ // in bits. Also, the invariant.start should dominate the load, and we
+ // should not hoist the load out of a loop that contains this dominating
+ // invariant.start.
+ if (LocSizeInBits <= InvariantSizeInBits &&
+ DT->properlyDominates(II->getParent(), CurLoop->getHeader()))
+ return true;
+ }
+
+ return false;
+}
+
bool llvm::canSinkOrHoistInst(Instruction &I, AAResults *AA, DominatorTree *DT,
Loop *CurLoop, AliasSetTracker *CurAST,
LoopSafetyInfo *SafetyInfo,
@@ -493,6 +578,10 @@ bool llvm::canSinkOrHoistInst(Instruction &I, AAResults *AA, DominatorTree *DT,
if (LI->getMetadata(LLVMContext::MD_invariant_load))
return true;
+ // This checks for an invariant.start dominating the load.
+ if (isLoadInvariantInLoop(LI, DT, CurLoop))
+ return true;
+
// Don't hoist loads which have may-aliased stores in loop.
uint64_t Size = 0;
if (LI->getType()->isSized())
@@ -782,7 +871,7 @@ static bool hoist(Instruction &I, const DominatorTree *DT, const Loop *CurLoop,
DEBUG(dbgs() << "LICM hoisting to " << Preheader->getName() << ": " << I
<< "\n");
ORE->emit(OptimizationRemark(DEBUG_TYPE, "Hoisted", &I)
- << "hosting " << ore::NV("Inst", &I));
+ << "hoisting " << ore::NV("Inst", &I));
// Metadata can be dependent on conditions we are hoisting above.
// Conservatively strip all metadata on the instruction unless we were
@@ -852,6 +941,7 @@ class LoopPromoter : public LoadAndStorePromoter {
LoopInfo &LI;
DebugLoc DL;
int Alignment;
+ bool UnorderedAtomic;
AAMDNodes AATags;
Value *maybeInsertLCSSAPHI(Value *V, BasicBlock *BB) const {
@@ -875,10 +965,11 @@ public:
SmallVectorImpl<BasicBlock *> &LEB,
SmallVectorImpl<Instruction *> &LIP, PredIteratorCache &PIC,
AliasSetTracker &ast, LoopInfo &li, DebugLoc dl, int alignment,
- const AAMDNodes &AATags)
+ bool UnorderedAtomic, const AAMDNodes &AATags)
: LoadAndStorePromoter(Insts, S), SomePtr(SP), PointerMustAliases(PMA),
LoopExitBlocks(LEB), LoopInsertPts(LIP), PredCache(PIC), AST(ast),
- LI(li), DL(std::move(dl)), Alignment(alignment), AATags(AATags) {}
+ LI(li), DL(std::move(dl)), Alignment(alignment),
+        UnorderedAtomic(UnorderedAtomic), AATags(AATags) {}
bool isInstInList(Instruction *I,
const SmallVectorImpl<Instruction *> &) const override {
@@ -902,6 +993,8 @@ public:
Value *Ptr = maybeInsertLCSSAPHI(SomePtr, ExitBlock);
Instruction *InsertPos = LoopInsertPts[i];
StoreInst *NewSI = new StoreInst(LiveInValue, Ptr, InsertPos);
+ if (UnorderedAtomic)
+ NewSI->setOrdering(AtomicOrdering::Unordered);
NewSI->setAlignment(Alignment);
NewSI->setDebugLoc(DL);
if (AATags)
@@ -992,18 +1085,41 @@ bool llvm::promoteLoopAccessesToScalars(
// We start with an alignment of one and try to find instructions that allow
// us to prove better alignment.
unsigned Alignment = 1;
+ // Keep track of which types of access we see
+ bool SawUnorderedAtomic = false;
+ bool SawNotAtomic = false;
AAMDNodes AATags;
const DataLayout &MDL = Preheader->getModule()->getDataLayout();
+  // Do we know this object does not escape?
+ bool IsKnownNonEscapingObject = false;
if (SafetyInfo->MayThrow) {
// If a loop can throw, we have to insert a store along each unwind edge.
// That said, we can't actually make the unwind edge explicit. Therefore,
// we have to prove that the store is dead along the unwind edge.
//
- // Currently, this code just special-cases alloca instructions.
- if (!isa<AllocaInst>(GetUnderlyingObject(SomePtr, MDL)))
- return false;
+    // If the underlying object is neither an alloca nor a pointer that does
+    // not escape, then we cannot effectively prove that the store is dead
+    // along the unwind edge: the caller of this function could have ways to
+    // access the pointed-to object.
+ Value *Object = GetUnderlyingObject(SomePtr, MDL);
+ // If this is a base pointer we do not understand, simply bail.
+ // We only handle alloca and return value from alloc-like fn right now.
+ if (!isa<AllocaInst>(Object)) {
+ if (!isAllocLikeFn(Object, TLI))
+ return false;
+      // If this is an alloc-like fn, there are more constraints we need to
+      // verify. More specifically, we must make sure that the pointer can not
+      // escape.
+      //
+      // NOTE: PointerMayBeCaptured is not enough as the pointer may have
+      // escaped even though it is not captured by the enclosing function.
+      // Standard allocation functions like malloc, calloc, and operator new
+      // return values which can be assumed not to have previously escaped.
+ if (PointerMayBeCaptured(Object, true, true))
+ return false;
+ IsKnownNonEscapingObject = true;
+ }
}
// Check that all of the pointers in the alias set have the same type. We
@@ -1029,8 +1145,11 @@ bool llvm::promoteLoopAccessesToScalars(
// it.
if (LoadInst *Load = dyn_cast<LoadInst>(UI)) {
assert(!Load->isVolatile() && "AST broken");
- if (!Load->isSimple())
+ if (!Load->isUnordered())
return false;
+
+ SawUnorderedAtomic |= Load->isAtomic();
+ SawNotAtomic |= !Load->isAtomic();
if (!DereferenceableInPH)
DereferenceableInPH = isSafeToExecuteUnconditionally(
@@ -1041,9 +1160,12 @@ bool llvm::promoteLoopAccessesToScalars(
if (UI->getOperand(1) != ASIV)
continue;
assert(!Store->isVolatile() && "AST broken");
- if (!Store->isSimple())
+ if (!Store->isUnordered())
return false;
+ SawUnorderedAtomic |= Store->isAtomic();
+ SawNotAtomic |= !Store->isAtomic();
+
// If the store is guaranteed to execute, both properties are satisfied.
// We may want to check if a store is guaranteed to execute even if we
// already know that promotion is safe, since it may have higher
@@ -1096,6 +1218,12 @@ bool llvm::promoteLoopAccessesToScalars(
}
}
+ // If we found both an unordered atomic instruction and a non-atomic memory
+ // access, bail. We can't blindly promote non-atomic to atomic since we
+  // might not be able to lower the result. We can't downgrade since that
+  // would violate the memory model. Also, align 0 is an error for atomics.
+ if (SawUnorderedAtomic && SawNotAtomic)
+ return false;
// If we couldn't prove we can hoist the load, bail.
if (!DereferenceableInPH)
@@ -1106,10 +1234,15 @@ bool llvm::promoteLoopAccessesToScalars(
// stores along paths which originally didn't have them without violating the
// memory model.
if (!SafeToInsertStore) {
- Value *Object = GetUnderlyingObject(SomePtr, MDL);
- SafeToInsertStore =
- (isAllocLikeFn(Object, TLI) || isa<AllocaInst>(Object)) &&
+ // If this is a known non-escaping object, it is safe to insert the stores.
+ if (IsKnownNonEscapingObject)
+ SafeToInsertStore = true;
+ else {
+ Value *Object = GetUnderlyingObject(SomePtr, MDL);
+ SafeToInsertStore =
+ (isAllocLikeFn(Object, TLI) || isa<AllocaInst>(Object)) &&
!PointerMayBeCaptured(Object, true, true);
+ }
}
// If we've still failed to prove we can sink the store, give up.
@@ -1134,12 +1267,15 @@ bool llvm::promoteLoopAccessesToScalars(
SmallVector<PHINode *, 16> NewPHIs;
SSAUpdater SSA(&NewPHIs);
LoopPromoter Promoter(SomePtr, LoopUses, SSA, PointerMustAliases, ExitBlocks,
- InsertPts, PIC, *CurAST, *LI, DL, Alignment, AATags);
+ InsertPts, PIC, *CurAST, *LI, DL, Alignment,
+ SawUnorderedAtomic, AATags);
// Set up the preheader to have a definition of the value. It is the live-out
// value from the preheader that uses in the loop will use.
LoadInst *PreheaderLoad = new LoadInst(
SomePtr, SomePtr->getName() + ".promoted", Preheader->getTerminator());
+ if (SawUnorderedAtomic)
+ PreheaderLoad->setOrdering(AtomicOrdering::Unordered);
PreheaderLoad->setAlignment(Alignment);
PreheaderLoad->setDebugLoc(DL);
if (AATags)
diff --git a/lib/Transforms/Scalar/LoadCombine.cpp b/lib/Transforms/Scalar/LoadCombine.cpp
index 389f1c595aa4..02215d3450c2 100644
--- a/lib/Transforms/Scalar/LoadCombine.cpp
+++ b/lib/Transforms/Scalar/LoadCombine.cpp
@@ -19,6 +19,7 @@
#include "llvm/Analysis/GlobalsModRef.h"
#include "llvm/Analysis/TargetFolder.h"
#include "llvm/IR/DataLayout.h"
+#include "llvm/IR/Dominators.h"
#include "llvm/IR/Function.h"
#include "llvm/IR/IRBuilder.h"
#include "llvm/IR/Instructions.h"
@@ -53,18 +54,20 @@ struct LoadPOPPair {
class LoadCombine : public BasicBlockPass {
LLVMContext *C;
AliasAnalysis *AA;
+ DominatorTree *DT;
public:
LoadCombine() : BasicBlockPass(ID), C(nullptr), AA(nullptr) {
initializeLoadCombinePass(*PassRegistry::getPassRegistry());
}
-
+
using llvm::Pass::doInitialization;
bool doInitialization(Function &) override;
bool runOnBasicBlock(BasicBlock &BB) override;
void getAnalysisUsage(AnalysisUsage &AU) const override {
AU.setPreservesCFG();
AU.addRequired<AAResultsWrapperPass>();
+ AU.addRequired<DominatorTreeWrapperPass>();
AU.addPreserved<GlobalsAAWrapperPass>();
}
@@ -234,6 +237,14 @@ bool LoadCombine::runOnBasicBlock(BasicBlock &BB) {
return false;
AA = &getAnalysis<AAResultsWrapperPass>().getAAResults();
+ DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree();
+
+  // Skip analyzing dead blocks (not forward reachable from function entry).
+  if (!DT->isReachableFromEntry(&BB)) {
+    DEBUG(dbgs() << "LC: skipping unreachable " << BB.getName() << " in "
+                 << BB.getParent()->getName() << "\n");
+ return false;
+ }
IRBuilder<TargetFolder> TheBuilder(
BB.getContext(), TargetFolder(BB.getModule()->getDataLayout()));
@@ -245,13 +256,17 @@ bool LoadCombine::runOnBasicBlock(BasicBlock &BB) {
bool Combined = false;
unsigned Index = 0;
for (auto &I : BB) {
- if (I.mayThrow() || (I.mayWriteToMemory() && AST.containsUnknown(&I))) {
+ if (I.mayThrow() || AST.containsUnknown(&I)) {
if (combineLoads(LoadMap))
Combined = true;
LoadMap.clear();
AST.clear();
continue;
}
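+    // Remember writes in the alias set tracker so that a later load that may
+    // alias one of them triggers the flush above, instead of flushing on
+    // every write unconditionally.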
+ if (I.mayWriteToMemory()) {
+ AST.add(&I);
+ continue;
+ }
LoadInst *LI = dyn_cast<LoadInst>(&I);
if (!LI)
continue;
diff --git a/lib/Transforms/Scalar/LoopDeletion.cpp b/lib/Transforms/Scalar/LoopDeletion.cpp
index cca75a365024..73e8ce0e1d93 100644
--- a/lib/Transforms/Scalar/LoopDeletion.cpp
+++ b/lib/Transforms/Scalar/LoopDeletion.cpp
@@ -29,32 +29,31 @@ using namespace llvm;
STATISTIC(NumDeleted, "Number of loops deleted");
-/// isLoopDead - Determined if a loop is dead. This assumes that we've already
-/// checked for unique exit and exiting blocks, and that the code is in LCSSA
-/// form.
-bool LoopDeletionPass::isLoopDead(Loop *L, ScalarEvolution &SE,
- SmallVectorImpl<BasicBlock *> &exitingBlocks,
- SmallVectorImpl<BasicBlock *> &exitBlocks,
- bool &Changed, BasicBlock *Preheader) {
- BasicBlock *exitBlock = exitBlocks[0];
-
+/// Determines if a loop is dead.
+///
+/// This assumes that we've already checked for unique exit and exiting blocks,
+/// and that the code is in LCSSA form.
+static bool isLoopDead(Loop *L, ScalarEvolution &SE,
+ SmallVectorImpl<BasicBlock *> &ExitingBlocks,
+ BasicBlock *ExitBlock, bool &Changed,
+ BasicBlock *Preheader) {
// Make sure that all PHI entries coming from the loop are loop invariant.
// Because the code is in LCSSA form, any values used outside of the loop
// must pass through a PHI in the exit block, meaning that this check is
// sufficient to guarantee that no loop-variant values are used outside
// of the loop.
- BasicBlock::iterator BI = exitBlock->begin();
+ BasicBlock::iterator BI = ExitBlock->begin();
bool AllEntriesInvariant = true;
bool AllOutgoingValuesSame = true;
while (PHINode *P = dyn_cast<PHINode>(BI)) {
- Value *incoming = P->getIncomingValueForBlock(exitingBlocks[0]);
+ Value *incoming = P->getIncomingValueForBlock(ExitingBlocks[0]);
// Make sure all exiting blocks produce the same incoming value for the exit
// block. If there are different incoming values for different exiting
// blocks, then it is impossible to statically determine which value should
// be used.
AllOutgoingValuesSame =
- all_of(makeArrayRef(exitingBlocks).slice(1), [&](BasicBlock *BB) {
+ all_of(makeArrayRef(ExitingBlocks).slice(1), [&](BasicBlock *BB) {
return incoming == P->getIncomingValueForBlock(BB);
});
@@ -78,33 +77,37 @@ bool LoopDeletionPass::isLoopDead(Loop *L, ScalarEvolution &SE,
// Make sure that no instructions in the block have potential side-effects.
// This includes instructions that could write to memory, and loads that are
- // marked volatile. This could be made more aggressive by using aliasing
- // information to identify readonly and readnone calls.
- for (Loop::block_iterator LI = L->block_begin(), LE = L->block_end();
- LI != LE; ++LI) {
- for (Instruction &I : **LI) {
- if (I.mayHaveSideEffects())
- return false;
- }
- }
-
+ // marked volatile.
+  for (auto &BB : L->blocks())
+    if (any_of(*BB, [](Instruction &I) { return I.mayHaveSideEffects(); }))
+      return false;
return true;
}
-/// Remove dead loops, by which we mean loops that do not impact the observable
-/// behavior of the program other than finite running time. Note we do ensure
-/// that this never remove a loop that might be infinite, as doing so could
-/// change the halting/non-halting nature of a program. NOTE: This entire
-/// process relies pretty heavily on LoopSimplify and LCSSA in order to make
-/// various safety checks work.
-bool LoopDeletionPass::runImpl(Loop *L, DominatorTree &DT, ScalarEvolution &SE,
- LoopInfo &loopInfo) {
+/// Remove a loop if it is dead.
+///
+/// A loop is considered dead if it does not impact the observable behavior of
+/// the program other than finite running time. This never removes a loop that
+/// might be infinite, as doing so could change the halting/non-halting nature
+/// of a program.
+///
+/// This entire process relies pretty heavily on LoopSimplify form and LCSSA in
+/// order to make various safety checks work.
+///
+/// \returns true if any changes were made. This may mutate the loop even if it
+/// is unable to delete it due to hoisting trivially loop invariant
+/// instructions out of the loop.
+///
+/// This also updates the relevant analysis information in \p DT, \p SE, and \p
+/// LI. It also updates the loop PM if an updater struct is provided.
+static bool deleteLoopIfDead(Loop *L, DominatorTree &DT, ScalarEvolution &SE,
+ LoopInfo &LI, LPMUpdater *Updater = nullptr) {
assert(L->isLCSSAForm(DT) && "Expected LCSSA!");
// We can only remove the loop if there is a preheader that we can
// branch from after removing it.
- BasicBlock *preheader = L->getLoopPreheader();
- if (!preheader)
+ BasicBlock *Preheader = L->getLoopPreheader();
+ if (!Preheader)
return false;
// If LoopSimplify form is not available, stay out of trouble.
@@ -116,22 +119,20 @@ bool LoopDeletionPass::runImpl(Loop *L, DominatorTree &DT, ScalarEvolution &SE,
if (L->begin() != L->end())
return false;
- SmallVector<BasicBlock *, 4> exitingBlocks;
- L->getExitingBlocks(exitingBlocks);
-
- SmallVector<BasicBlock *, 4> exitBlocks;
- L->getUniqueExitBlocks(exitBlocks);
+ SmallVector<BasicBlock *, 4> ExitingBlocks;
+ L->getExitingBlocks(ExitingBlocks);
// We require that the loop only have a single exit block. Otherwise, we'd
// be in the situation of needing to be able to solve statically which exit
// block will be branched to, or trying to preserve the branching logic in
// a loop invariant manner.
- if (exitBlocks.size() != 1)
+ BasicBlock *ExitBlock = L->getUniqueExitBlock();
+ if (!ExitBlock)
return false;
// Finally, we have to check that the loop really is dead.
bool Changed = false;
- if (!isLoopDead(L, SE, exitingBlocks, exitBlocks, Changed, preheader))
+ if (!isLoopDead(L, SE, ExitingBlocks, ExitBlock, Changed, Preheader))
return Changed;
// Don't remove loops for which we can't solve the trip count.
@@ -142,11 +143,13 @@ bool LoopDeletionPass::runImpl(Loop *L, DominatorTree &DT, ScalarEvolution &SE,
// Now that we know the removal is safe, remove the loop by changing the
// branch from the preheader to go to the single exit block.
- BasicBlock *exitBlock = exitBlocks[0];
-
+ //
// Because we're deleting a large chunk of code at once, the sequence in which
- // we remove things is very important to avoid invalidation issues. Don't
- // mess with this unless you have good reason and know what you're doing.
+ // we remove things is very important to avoid invalidation issues.
+
+ // If we have an LPM updater, tell it about the loop being removed.
+ if (Updater)
+ Updater->markLoopAsDeleted(*L);
// Tell ScalarEvolution that the loop is deleted. Do this before
// deleting the loop so that ScalarEvolution can look at the loop
@@ -154,19 +157,19 @@ bool LoopDeletionPass::runImpl(Loop *L, DominatorTree &DT, ScalarEvolution &SE,
SE.forgetLoop(L);
// Connect the preheader directly to the exit block.
- TerminatorInst *TI = preheader->getTerminator();
- TI->replaceUsesOfWith(L->getHeader(), exitBlock);
+ TerminatorInst *TI = Preheader->getTerminator();
+ TI->replaceUsesOfWith(L->getHeader(), ExitBlock);
// Rewrite phis in the exit block to get their inputs from
// the preheader instead of the exiting block.
- BasicBlock *exitingBlock = exitingBlocks[0];
- BasicBlock::iterator BI = exitBlock->begin();
+ BasicBlock *ExitingBlock = ExitingBlocks[0];
+ BasicBlock::iterator BI = ExitBlock->begin();
while (PHINode *P = dyn_cast<PHINode>(BI)) {
- int j = P->getBasicBlockIndex(exitingBlock);
+ int j = P->getBasicBlockIndex(ExitingBlock);
assert(j >= 0 && "Can't find exiting block in exit block's phi node!");
- P->setIncomingBlock(j, preheader);
- for (unsigned i = 1; i < exitingBlocks.size(); ++i)
- P->removeIncomingValue(exitingBlocks[i]);
+ P->setIncomingBlock(j, Preheader);
+ for (unsigned i = 1; i < ExitingBlocks.size(); ++i)
+ P->removeIncomingValue(ExitingBlocks[i]);
++BI;
}
@@ -175,11 +178,11 @@ bool LoopDeletionPass::runImpl(Loop *L, DominatorTree &DT, ScalarEvolution &SE,
SmallVector<DomTreeNode*, 8> ChildNodes;
for (Loop::block_iterator LI = L->block_begin(), LE = L->block_end();
LI != LE; ++LI) {
- // Move all of the block's children to be children of the preheader, which
+ // Move all of the block's children to be children of the Preheader, which
// allows us to remove the domtree entry for the block.
ChildNodes.insert(ChildNodes.begin(), DT[*LI]->begin(), DT[*LI]->end());
for (DomTreeNode *ChildNode : ChildNodes) {
- DT.changeImmediateDominator(ChildNode, DT[preheader]);
+ DT.changeImmediateDominator(ChildNode, DT[Preheader]);
}
ChildNodes.clear();
@@ -204,22 +207,19 @@ bool LoopDeletionPass::runImpl(Loop *L, DominatorTree &DT, ScalarEvolution &SE,
SmallPtrSet<BasicBlock *, 8> blocks;
blocks.insert(L->block_begin(), L->block_end());
for (BasicBlock *BB : blocks)
- loopInfo.removeBlock(BB);
+ LI.removeBlock(BB);
// The last step is to update LoopInfo now that we've eliminated this loop.
- loopInfo.markAsRemoved(L);
- Changed = true;
-
+ LI.markAsRemoved(L);
++NumDeleted;
- return Changed;
+ return true;
}
PreservedAnalyses LoopDeletionPass::run(Loop &L, LoopAnalysisManager &AM,
LoopStandardAnalysisResults &AR,
- LPMUpdater &) {
- bool Changed = runImpl(&L, AR.DT, AR.SE, AR.LI);
- if (!Changed)
+ LPMUpdater &Updater) {
+ if (!deleteLoopIfDead(&L, AR.DT, AR.SE, AR.LI, &Updater))
return PreservedAnalyses::all();
return getLoopPassPreservedAnalyses();
@@ -257,8 +257,7 @@ bool LoopDeletionLegacyPass::runOnLoop(Loop *L, LPPassManager &) {
DominatorTree &DT = getAnalysis<DominatorTreeWrapperPass>().getDomTree();
ScalarEvolution &SE = getAnalysis<ScalarEvolutionWrapperPass>().getSE();
- LoopInfo &loopInfo = getAnalysis<LoopInfoWrapperPass>().getLoopInfo();
+ LoopInfo &LI = getAnalysis<LoopInfoWrapperPass>().getLoopInfo();
- LoopDeletionPass Impl;
- return Impl.runImpl(L, DT, SE, loopInfo);
+ return deleteLoopIfDead(L, DT, SE, LI);
}
diff --git a/lib/Transforms/Scalar/LoopDistribute.cpp b/lib/Transforms/Scalar/LoopDistribute.cpp
index 19716b28ad66..3624bba10345 100644
--- a/lib/Transforms/Scalar/LoopDistribute.cpp
+++ b/lib/Transforms/Scalar/LoopDistribute.cpp
@@ -812,29 +812,29 @@ private:
const RuntimePointerChecking *RtPtrChecking) {
SmallVector<RuntimePointerChecking::PointerCheck, 4> Checks;
- std::copy_if(AllChecks.begin(), AllChecks.end(), std::back_inserter(Checks),
- [&](const RuntimePointerChecking::PointerCheck &Check) {
- for (unsigned PtrIdx1 : Check.first->Members)
- for (unsigned PtrIdx2 : Check.second->Members)
- // Only include this check if there is a pair of pointers
- // that require checking and the pointers fall into
- // separate partitions.
- //
- // (Note that we already know at this point that the two
- // pointer groups need checking but it doesn't follow
- // that each pair of pointers within the two groups need
- // checking as well.
- //
- // In other words we don't want to include a check just
- // because there is a pair of pointers between the two
- // pointer groups that require checks and a different
- // pair whose pointers fall into different partitions.)
- if (RtPtrChecking->needsChecking(PtrIdx1, PtrIdx2) &&
- !RuntimePointerChecking::arePointersInSamePartition(
- PtrToPartition, PtrIdx1, PtrIdx2))
- return true;
- return false;
- });
+ copy_if(AllChecks, std::back_inserter(Checks),
+ [&](const RuntimePointerChecking::PointerCheck &Check) {
+ for (unsigned PtrIdx1 : Check.first->Members)
+ for (unsigned PtrIdx2 : Check.second->Members)
+ // Only include this check if there is a pair of pointers
+ // that require checking and the pointers fall into
+ // separate partitions.
+ //
+ // (Note that we already know at this point that the two
+ // pointer groups need checking but it doesn't follow
+ // that each pair of pointers within the two groups need
+ // checking as well.
+ //
+ // In other words we don't want to include a check just
+ // because there is a pair of pointers between the two
+ // pointer groups that require checks and a different
+ // pair whose pointers fall into different partitions.)
+ if (RtPtrChecking->needsChecking(PtrIdx1, PtrIdx2) &&
+ !RuntimePointerChecking::arePointersInSamePartition(
+ PtrToPartition, PtrIdx1, PtrIdx2))
+ return true;
+ return false;
+ });
return Checks;
}
diff --git a/lib/Transforms/Scalar/LoopIdiomRecognize.cpp b/lib/Transforms/Scalar/LoopIdiomRecognize.cpp
index 5fec51c095d0..946d85d7360f 100644
--- a/lib/Transforms/Scalar/LoopIdiomRecognize.cpp
+++ b/lib/Transforms/Scalar/LoopIdiomRecognize.cpp
@@ -236,9 +236,9 @@ bool LoopIdiomRecognize::runOnLoop(Loop *L) {
ApplyCodeSizeHeuristics =
L->getHeader()->getParent()->optForSize() && UseLIRCodeSizeHeurs;
- HasMemset = TLI->has(LibFunc::memset);
- HasMemsetPattern = TLI->has(LibFunc::memset_pattern16);
- HasMemcpy = TLI->has(LibFunc::memcpy);
+ HasMemset = TLI->has(LibFunc_memset);
+ HasMemsetPattern = TLI->has(LibFunc_memset_pattern16);
+ HasMemcpy = TLI->has(LibFunc_memcpy);
if (HasMemset || HasMemsetPattern || HasMemcpy)
if (SE->hasLoopInvariantBackedgeTakenCount(L))
@@ -823,7 +823,7 @@ bool LoopIdiomRecognize::processLoopStridedStore(
Module *M = TheStore->getModule();
Value *MSP =
M->getOrInsertFunction("memset_pattern16", Builder.getVoidTy(),
- Int8PtrTy, Int8PtrTy, IntPtr, (void *)nullptr);
+ Int8PtrTy, Int8PtrTy, IntPtr);
inferLibFuncAttributes(*M->getFunction("memset_pattern16"), *TLI);
// Otherwise we should form a memset_pattern16. PatternValue is known to be
diff --git a/lib/Transforms/Scalar/LoopInstSimplify.cpp b/lib/Transforms/Scalar/LoopInstSimplify.cpp
index 69102d10ff60..28e71ca05436 100644
--- a/lib/Transforms/Scalar/LoopInstSimplify.cpp
+++ b/lib/Transforms/Scalar/LoopInstSimplify.cpp
@@ -189,7 +189,9 @@ PreservedAnalyses LoopInstSimplifyPass::run(Loop &L, LoopAnalysisManager &AM,
if (!SimplifyLoopInst(&L, &AR.DT, &AR.LI, &AR.AC, &AR.TLI))
return PreservedAnalyses::all();
- return getLoopPassPreservedAnalyses();
+ auto PA = getLoopPassPreservedAnalyses();
+ PA.preserveSet<CFGAnalyses>();
+ return PA;
}
char LoopInstSimplifyLegacyPass::ID = 0;
diff --git a/lib/Transforms/Scalar/LoopInterchange.cpp b/lib/Transforms/Scalar/LoopInterchange.cpp
index e9f84edd1cbf..9f3875a3027f 100644
--- a/lib/Transforms/Scalar/LoopInterchange.cpp
+++ b/lib/Transforms/Scalar/LoopInterchange.cpp
@@ -39,7 +39,7 @@
#include "llvm/Transforms/Scalar.h"
#include "llvm/Transforms/Utils/BasicBlockUtils.h"
#include "llvm/Transforms/Utils/LoopUtils.h"
-#include "llvm/Transforms/Utils/SSAUpdater.h"
+
using namespace llvm;
#define DEBUG_TYPE "loop-interchange"
diff --git a/lib/Transforms/Scalar/LoopLoadElimination.cpp b/lib/Transforms/Scalar/LoopLoadElimination.cpp
index 8fb580183e30..cf63cb660db8 100644
--- a/lib/Transforms/Scalar/LoopLoadElimination.cpp
+++ b/lib/Transforms/Scalar/LoopLoadElimination.cpp
@@ -20,13 +20,14 @@
//
//===----------------------------------------------------------------------===//
+#include "llvm/Transforms/Scalar/LoopLoadElimination.h"
#include "llvm/ADT/APInt.h"
#include "llvm/ADT/DenseMap.h"
#include "llvm/ADT/DepthFirstIterator.h"
+#include "llvm/ADT/STLExtras.h"
#include "llvm/ADT/SmallSet.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/ADT/Statistic.h"
-#include "llvm/ADT/STLExtras.h"
#include "llvm/Analysis/GlobalsModRef.h"
#include "llvm/Analysis/LoopAccessAnalysis.h"
#include "llvm/Analysis/LoopInfo.h"
@@ -45,9 +46,9 @@
#include "llvm/Support/Debug.h"
#include "llvm/Transforms/Scalar.h"
#include "llvm/Transforms/Utils/LoopVersioning.h"
-#include <forward_list>
-#include <cassert>
#include <algorithm>
+#include <cassert>
+#include <forward_list>
#include <set>
#include <tuple>
#include <utility>
@@ -373,15 +374,15 @@ public:
const auto &AllChecks = LAI.getRuntimePointerChecking()->getChecks();
SmallVector<RuntimePointerChecking::PointerCheck, 4> Checks;
- std::copy_if(AllChecks.begin(), AllChecks.end(), std::back_inserter(Checks),
- [&](const RuntimePointerChecking::PointerCheck &Check) {
- for (auto PtrIdx1 : Check.first->Members)
- for (auto PtrIdx2 : Check.second->Members)
- if (needsChecking(PtrIdx1, PtrIdx2,
- PtrsWrittenOnFwdingPath, CandLoadPtrs))
- return true;
- return false;
- });
+ copy_if(AllChecks, std::back_inserter(Checks),
+ [&](const RuntimePointerChecking::PointerCheck &Check) {
+ for (auto PtrIdx1 : Check.first->Members)
+ for (auto PtrIdx2 : Check.second->Members)
+ if (needsChecking(PtrIdx1, PtrIdx2, PtrsWrittenOnFwdingPath,
+ CandLoadPtrs))
+ return true;
+ return false;
+ });
DEBUG(dbgs() << "\nPointer Checks (count: " << Checks.size() << "):\n");
DEBUG(LAI.getRuntimePointerChecking()->printChecks(dbgs(), Checks));
@@ -558,6 +559,32 @@ private:
PredicatedScalarEvolution PSE;
};
+static bool
+eliminateLoadsAcrossLoops(Function &F, LoopInfo &LI, DominatorTree &DT,
+ function_ref<const LoopAccessInfo &(Loop &)> GetLAI) {
+ // Build up a worklist of inner-loops to transform to avoid iterator
+ // invalidation.
+ // FIXME: This logic comes from other passes that actually change the loop
+ // nest structure. It isn't clear this is necessary (or useful) for a pass
+ // which merely optimizes the use of loads in a loop.
+ SmallVector<Loop *, 8> Worklist;
+
+ for (Loop *TopLevelLoop : LI)
+ for (Loop *L : depth_first(TopLevelLoop))
+ // We only handle inner-most loops.
+ if (L->empty())
+ Worklist.push_back(L);
+
+ // Now walk the identified inner loops.
+ bool Changed = false;
+ for (Loop *L : Worklist) {
+ // The actual work is performed by LoadEliminationForLoop.
+ LoadEliminationForLoop LEL(L, &LI, GetLAI(*L), &DT);
+ Changed |= LEL.processLoop();
+ }
+ return Changed;
+}
+
/// \brief The pass. Most of the work is delegated to the per-loop
/// LoadEliminationForLoop class.
class LoopLoadElimination : public FunctionPass {
@@ -570,32 +597,14 @@ public:
if (skipFunction(F))
return false;
- auto *LI = &getAnalysis<LoopInfoWrapperPass>().getLoopInfo();
- auto *LAA = &getAnalysis<LoopAccessLegacyAnalysis>();
- auto *DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree();
-
- // Build up a worklist of inner-loops to vectorize. This is necessary as the
- // act of distributing a loop creates new loops and can invalidate iterators
- // across the loops.
- SmallVector<Loop *, 8> Worklist;
-
- for (Loop *TopLevelLoop : *LI)
- for (Loop *L : depth_first(TopLevelLoop))
- // We only handle inner-most loops.
- if (L->empty())
- Worklist.push_back(L);
-
- // Now walk the identified inner loops.
- bool Changed = false;
- for (Loop *L : Worklist) {
- const LoopAccessInfo &LAI = LAA->getInfo(L);
- // The actual work is performed by LoadEliminationForLoop.
- LoadEliminationForLoop LEL(L, LI, LAI, DT);
- Changed |= LEL.processLoop();
- }
+ auto &LI = getAnalysis<LoopInfoWrapperPass>().getLoopInfo();
+ auto &LAA = getAnalysis<LoopAccessLegacyAnalysis>();
+ auto &DT = getAnalysis<DominatorTreeWrapperPass>().getDomTree();
// Process each loop nest in the function.
- return Changed;
+ return eliminateLoadsAcrossLoops(
+ F, LI, DT,
+ [&LAA](Loop &L) -> const LoopAccessInfo & { return LAA.getInfo(&L); });
}
void getAnalysisUsage(AnalysisUsage &AU) const override {
@@ -631,4 +640,28 @@ FunctionPass *createLoopLoadEliminationPass() {
return new LoopLoadElimination();
}
+PreservedAnalyses LoopLoadEliminationPass::run(Function &F,
+ FunctionAnalysisManager &AM) {
+ auto &SE = AM.getResult<ScalarEvolutionAnalysis>(F);
+ auto &LI = AM.getResult<LoopAnalysis>(F);
+ auto &TTI = AM.getResult<TargetIRAnalysis>(F);
+ auto &DT = AM.getResult<DominatorTreeAnalysis>(F);
+ auto &TLI = AM.getResult<TargetLibraryAnalysis>(F);
+ auto &AA = AM.getResult<AAManager>(F);
+ auto &AC = AM.getResult<AssumptionAnalysis>(F);
+
+ auto &LAM = AM.getResult<LoopAnalysisManagerFunctionProxy>(F).getManager();
+ bool Changed = eliminateLoadsAcrossLoops(
+ F, LI, DT, [&](Loop &L) -> const LoopAccessInfo & {
+ LoopStandardAnalysisResults AR = {AA, AC, DT, LI, SE, TLI, TTI};
+ return LAM.getResult<LoopAccessAnalysis>(L, AR);
+ });
+
+ if (!Changed)
+ return PreservedAnalyses::all();
+
+ PreservedAnalyses PA;
+ return PA;
+}
+
} // end namespace llvm
diff --git a/lib/Transforms/Scalar/LoopPassManager.cpp b/lib/Transforms/Scalar/LoopPassManager.cpp
index 028f4bba8b1d..10f6fcdcfdb7 100644
--- a/lib/Transforms/Scalar/LoopPassManager.cpp
+++ b/lib/Transforms/Scalar/LoopPassManager.cpp
@@ -42,6 +42,13 @@ PassManager<Loop, LoopAnalysisManager, LoopStandardAnalysisResults &,
break;
}
+#ifndef NDEBUG
+ // Verify the loop structure and LCSSA form before visiting the loop.
+ L.verifyLoop();
+ assert(L.isRecursivelyLCSSAForm(AR.DT, AR.LI) &&
+ "Loops must remain in LCSSA form!");
+#endif
+
// Update the analysis manager as each pass runs and potentially
// invalidates analyses.
AM.invalidate(L, PassPA);
diff --git a/lib/Transforms/Scalar/LoopPredication.cpp b/lib/Transforms/Scalar/LoopPredication.cpp
new file mode 100644
index 000000000000..0ce604429326
--- /dev/null
+++ b/lib/Transforms/Scalar/LoopPredication.cpp
@@ -0,0 +1,282 @@
+//===-- LoopPredication.cpp - Guard based loop predication pass -----------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// The LoopPredication pass tries to convert loop-variant range checks into
+// loop-invariant ones by widening checks across loop iterations. For example,
+// it will convert
+//
+// for (i = 0; i < n; i++) {
+// guard(i < len);
+// ...
+// }
+//
+// to
+//
+// for (i = 0; i < n; i++) {
+// guard(n - 1 < len);
+// ...
+// }
+//
+// After this transformation the condition of the guard is loop invariant, so
+// loop-unswitch can later unswitch the loop on this condition, which
+// basically predicates the loop by the widened condition:
+//
+// if (n - 1 < len)
+// for (i = 0; i < n; i++) {
+// ...
+// }
+// else
+// deoptimize
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/Transforms/Scalar/LoopPredication.h"
+#include "llvm/Pass.h"
+#include "llvm/Analysis/LoopInfo.h"
+#include "llvm/Analysis/LoopPass.h"
+#include "llvm/Analysis/ScalarEvolution.h"
+#include "llvm/Analysis/ScalarEvolutionExpander.h"
+#include "llvm/Analysis/ScalarEvolutionExpressions.h"
+#include "llvm/IR/Function.h"
+#include "llvm/IR/GlobalValue.h"
+#include "llvm/IR/IntrinsicInst.h"
+#include "llvm/IR/Module.h"
+#include "llvm/IR/PatternMatch.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Transforms/Scalar.h"
+#include "llvm/Transforms/Utils/LoopUtils.h"
+
+#define DEBUG_TYPE "loop-predication"
+
+using namespace llvm;
+
+namespace {
+class LoopPredication {
+ ScalarEvolution *SE;
+
+ Loop *L;
+ const DataLayout *DL;
+ BasicBlock *Preheader;
+
+ Optional<Value *> widenICmpRangeCheck(ICmpInst *ICI, SCEVExpander &Expander,
+ IRBuilder<> &Builder);
+ bool widenGuardConditions(IntrinsicInst *II, SCEVExpander &Expander);
+
+public:
+  LoopPredication(ScalarEvolution *SE) : SE(SE) {}
+ bool runOnLoop(Loop *L);
+};
+
+class LoopPredicationLegacyPass : public LoopPass {
+public:
+ static char ID;
+ LoopPredicationLegacyPass() : LoopPass(ID) {
+ initializeLoopPredicationLegacyPassPass(*PassRegistry::getPassRegistry());
+ }
+
+ void getAnalysisUsage(AnalysisUsage &AU) const override {
+ getLoopAnalysisUsage(AU);
+ }
+
+ bool runOnLoop(Loop *L, LPPassManager &LPM) override {
+ if (skipLoop(L))
+ return false;
+ auto *SE = &getAnalysis<ScalarEvolutionWrapperPass>().getSE();
+ LoopPredication LP(SE);
+ return LP.runOnLoop(L);
+ }
+};
+
+char LoopPredicationLegacyPass::ID = 0;
+} // end anonymous namespace
+
+INITIALIZE_PASS_BEGIN(LoopPredicationLegacyPass, "loop-predication",
+ "Loop predication", false, false)
+INITIALIZE_PASS_DEPENDENCY(LoopPass)
+INITIALIZE_PASS_END(LoopPredicationLegacyPass, "loop-predication",
+ "Loop predication", false, false)
+
+Pass *llvm::createLoopPredicationPass() {
+ return new LoopPredicationLegacyPass();
+}
+
+PreservedAnalyses LoopPredicationPass::run(Loop &L, LoopAnalysisManager &AM,
+ LoopStandardAnalysisResults &AR,
+ LPMUpdater &U) {
+ LoopPredication LP(&AR.SE);
+ if (!LP.runOnLoop(&L))
+ return PreservedAnalyses::all();
+
+ return getLoopPassPreservedAnalyses();
+}
+
+/// If ICI can be widened to a loop-invariant condition, emit that condition
+/// in the loop preheader and return it; otherwise return None.
+Optional<Value *> LoopPredication::widenICmpRangeCheck(ICmpInst *ICI,
+ SCEVExpander &Expander,
+ IRBuilder<> &Builder) {
+ DEBUG(dbgs() << "Analyzing ICmpInst condition:\n");
+ DEBUG(ICI->dump());
+
+ ICmpInst::Predicate Pred = ICI->getPredicate();
+ Value *LHS = ICI->getOperand(0);
+ Value *RHS = ICI->getOperand(1);
+ const SCEV *LHSS = SE->getSCEV(LHS);
+ if (isa<SCEVCouldNotCompute>(LHSS))
+ return None;
+ const SCEV *RHSS = SE->getSCEV(RHS);
+ if (isa<SCEVCouldNotCompute>(RHSS))
+ return None;
+
+  // Canonicalize RHS to be the loop-invariant bound and LHS to be the
+  // loop-computable index.
+ if (SE->isLoopInvariant(LHSS, L)) {
+ std::swap(LHS, RHS);
+ std::swap(LHSS, RHSS);
+ Pred = ICmpInst::getSwappedPredicate(Pred);
+ }
+ if (!SE->isLoopInvariant(RHSS, L) || !isSafeToExpand(RHSS, *SE))
+ return None;
+
+ const SCEVAddRecExpr *IndexAR = dyn_cast<SCEVAddRecExpr>(LHSS);
+ if (!IndexAR || IndexAR->getLoop() != L)
+ return None;
+
+ DEBUG(dbgs() << "IndexAR: ");
+ DEBUG(IndexAR->dump());
+
+ bool IsIncreasing = false;
+ if (!SE->isMonotonicPredicate(IndexAR, Pred, IsIncreasing))
+ return None;
+
+  // If the predicate is increasing, the condition can change from false to
+  // true as the loop progresses; in this case take the value on the first
+  // iteration for the widened check. Otherwise the condition can change from
+  // true to false as the loop progresses, so take the value on the last
+  // iteration.
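+  //
+  // E.g., for guard(i < len) with i = {0,+,1} and trip count n, the condition
+  // becomes false as i grows, so the value of i on the last iteration, n - 1,
+  // is used, yielding the widened check n - 1 < len.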
+ const SCEV *NewLHSS = IsIncreasing
+ ? IndexAR->getStart()
+ : SE->getSCEVAtScope(IndexAR, L->getParentLoop());
+ if (NewLHSS == IndexAR) {
+ DEBUG(dbgs() << "Can't compute NewLHSS!\n");
+ return None;
+ }
+
+ DEBUG(dbgs() << "NewLHSS: ");
+ DEBUG(NewLHSS->dump());
+
+ if (!SE->isLoopInvariant(NewLHSS, L) || !isSafeToExpand(NewLHSS, *SE))
+ return None;
+
+ DEBUG(dbgs() << "NewLHSS is loop invariant and safe to expand. Expand!\n");
+
+ Type *Ty = LHS->getType();
+ Instruction *InsertAt = Preheader->getTerminator();
+ assert(Ty == RHS->getType() && "icmp operands have different types?");
+ Value *NewLHS = Expander.expandCodeFor(NewLHSS, Ty, InsertAt);
+ Value *NewRHS = Expander.expandCodeFor(RHSS, Ty, InsertAt);
+ return Builder.CreateICmp(Pred, NewLHS, NewRHS);
+}
+
+bool LoopPredication::widenGuardConditions(IntrinsicInst *Guard,
+ SCEVExpander &Expander) {
+ DEBUG(dbgs() << "Processing guard:\n");
+ DEBUG(Guard->dump());
+
+ IRBuilder<> Builder(cast<Instruction>(Preheader->getTerminator()));
+
+  // The guard condition is expected to be in the form:
+  //   cond1 && cond2 && cond3 ...
+  // Iterate over the subconditions looking for icmp conditions which can be
+  // widened across loop iterations. Collect the resulting subconditions,
+  // widened or not, in the Checks vector.
+ SmallVector<Value *, 4> Worklist(1, Guard->getOperand(0));
+ SmallPtrSet<Value *, 4> Visited;
+
+ SmallVector<Value *, 4> Checks;
+
+ unsigned NumWidened = 0;
+ do {
+ Value *Condition = Worklist.pop_back_val();
+ if (!Visited.insert(Condition).second)
+ continue;
+
+ Value *LHS, *RHS;
+ using namespace llvm::PatternMatch;
+ if (match(Condition, m_And(m_Value(LHS), m_Value(RHS)))) {
+ Worklist.push_back(LHS);
+ Worklist.push_back(RHS);
+ continue;
+ }
+
+ if (ICmpInst *ICI = dyn_cast<ICmpInst>(Condition)) {
+ if (auto NewRangeCheck = widenICmpRangeCheck(ICI, Expander, Builder)) {
+ Checks.push_back(NewRangeCheck.getValue());
+ NumWidened++;
+ continue;
+ }
+ }
+
+ // Save the condition as is if we can't widen it
+ Checks.push_back(Condition);
+  } while (!Worklist.empty());
+
+ if (NumWidened == 0)
+ return false;
+
+ // Emit the new guard condition
+ Builder.SetInsertPoint(Guard);
+ Value *LastCheck = nullptr;
+ for (auto *Check : Checks)
+ if (!LastCheck)
+ LastCheck = Check;
+ else
+ LastCheck = Builder.CreateAnd(LastCheck, Check);
+ Guard->setOperand(0, LastCheck);
+
+ DEBUG(dbgs() << "Widened checks = " << NumWidened << "\n");
+ return true;
+}
+
+bool LoopPredication::runOnLoop(Loop *Loop) {
+ L = Loop;
+
+ DEBUG(dbgs() << "Analyzing ");
+ DEBUG(L->dump());
+
+ Module *M = L->getHeader()->getModule();
+
+  // There is nothing to do if the module doesn't use guards.
+ auto *GuardDecl =
+ M->getFunction(Intrinsic::getName(Intrinsic::experimental_guard));
+ if (!GuardDecl || GuardDecl->use_empty())
+ return false;
+
+ DL = &M->getDataLayout();
+
+ Preheader = L->getLoopPreheader();
+ if (!Preheader)
+ return false;
+
+ // Collect all the guards into a vector and process later, so as not
+ // to invalidate the instruction iterator.
+ SmallVector<IntrinsicInst *, 4> Guards;
+ for (const auto BB : L->blocks())
+ for (auto &I : *BB)
+ if (auto *II = dyn_cast<IntrinsicInst>(&I))
+ if (II->getIntrinsicID() == Intrinsic::experimental_guard)
+ Guards.push_back(II);
+
+ SCEVExpander Expander(*SE, *DL, "loop-predication");
+
+ bool Changed = false;
+ for (auto *Guard : Guards)
+ Changed |= widenGuardConditions(Guard, Expander);
+
+ return Changed;
+}
diff --git a/lib/Transforms/Scalar/LoopRotation.cpp b/lib/Transforms/Scalar/LoopRotation.cpp
index cc83069d5f52..e5689368de80 100644
--- a/lib/Transforms/Scalar/LoopRotation.cpp
+++ b/lib/Transforms/Scalar/LoopRotation.cpp
@@ -79,7 +79,8 @@ private:
/// to merge the two values. Do this now.
static void RewriteUsesOfClonedInstructions(BasicBlock *OrigHeader,
BasicBlock *OrigPreheader,
- ValueToValueMapTy &ValueMap) {
+ ValueToValueMapTy &ValueMap,
+ SmallVectorImpl<PHINode*> *InsertedPHIs) {
// Remove PHI node entries that are no longer live.
BasicBlock::iterator I, E = OrigHeader->end();
for (I = OrigHeader->begin(); PHINode *PN = dyn_cast<PHINode>(I); ++I)
@@ -87,7 +88,7 @@ static void RewriteUsesOfClonedInstructions(BasicBlock *OrigHeader,
// Now fix up users of the instructions in OrigHeader, inserting PHI nodes
// as necessary.
- SSAUpdater SSA;
+ SSAUpdater SSA(InsertedPHIs);
for (I = OrigHeader->begin(); I != E; ++I) {
Value *OrigHeaderVal = &*I;
@@ -174,6 +175,38 @@ static void RewriteUsesOfClonedInstructions(BasicBlock *OrigHeader,
}
}
+/// Propagate dbg.value intrinsics through the newly inserted Phis.
+static void insertDebugValues(BasicBlock *OrigHeader,
+ SmallVectorImpl<PHINode*> &InsertedPHIs) {
+ ValueToValueMapTy DbgValueMap;
+
+ // Map existing PHI nodes to their dbg.values.
+ for (auto &I : *OrigHeader) {
+ if (auto DbgII = dyn_cast<DbgInfoIntrinsic>(&I)) {
+ if (auto *Loc = dyn_cast_or_null<PHINode>(DbgII->getVariableLocation()))
+ DbgValueMap.insert({Loc, DbgII});
+ }
+ }
+
+ // Then iterate through the new PHIs and look to see if they use one of the
+ // previously mapped PHIs. If so, insert a new dbg.value intrinsic that will
+ // propagate the info through the new PHI.
+ LLVMContext &C = OrigHeader->getContext();
+ for (auto PHI : InsertedPHIs) {
+ for (auto VI : PHI->operand_values()) {
+ auto V = DbgValueMap.find(VI);
+ if (V != DbgValueMap.end()) {
+ auto *DbgII = cast<DbgInfoIntrinsic>(V->second);
+ Instruction *NewDbgII = DbgII->clone();
+ auto PhiMAV = MetadataAsValue::get(C, ValueAsMetadata::get(PHI));
+ NewDbgII->setOperand(0, PhiMAV);
+ BasicBlock *Parent = PHI->getParent();
+ NewDbgII->insertBefore(Parent->getFirstNonPHIOrDbgOrLifetime());
+ }
+ }
+ }
+}
+
/// Rotate loop LP. Return true if the loop is rotated.
///
/// \param SimplifiedLatch is true if the latch was just folded into the final
@@ -347,9 +380,18 @@ bool LoopRotate::rotateLoop(Loop *L, bool SimplifiedLatch) {
// remove the corresponding incoming values from the PHI nodes in OrigHeader.
LoopEntryBranch->eraseFromParent();
+
+ SmallVector<PHINode*, 2> InsertedPHIs;
// If there were any uses of instructions in the duplicated block outside the
// loop, update them, inserting PHI nodes as required
- RewriteUsesOfClonedInstructions(OrigHeader, OrigPreheader, ValueMap);
+ RewriteUsesOfClonedInstructions(OrigHeader, OrigPreheader, ValueMap,
+ &InsertedPHIs);
+
+ // Attach dbg.value intrinsics to the new phis if that phi uses a value that
+ // previously had debug metadata attached. This keeps the debug info
+ // up-to-date in the loop body.
+ if (!InsertedPHIs.empty())
+ insertDebugValues(OrigHeader, InsertedPHIs);
// NewHeader is now the header of the loop.
L->moveToHeader(NewHeader);
@@ -634,6 +676,7 @@ PreservedAnalyses LoopRotatePass::run(Loop &L, LoopAnalysisManager &AM,
bool Changed = LR.processLoop(&L);
if (!Changed)
return PreservedAnalyses::all();
+
return getLoopPassPreservedAnalyses();
}
diff --git a/lib/Transforms/Scalar/LoopSimplifyCFG.cpp b/lib/Transforms/Scalar/LoopSimplifyCFG.cpp
index 16061212ba38..a5a81c33a8eb 100644
--- a/lib/Transforms/Scalar/LoopSimplifyCFG.cpp
+++ b/lib/Transforms/Scalar/LoopSimplifyCFG.cpp
@@ -69,6 +69,7 @@ PreservedAnalyses LoopSimplifyCFGPass::run(Loop &L, LoopAnalysisManager &AM,
LPMUpdater &) {
if (!simplifyLoopCFG(L, AR.DT, AR.LI))
return PreservedAnalyses::all();
+
return getLoopPassPreservedAnalyses();
}
diff --git a/lib/Transforms/Scalar/LoopSink.cpp b/lib/Transforms/Scalar/LoopSink.cpp
index f3f415275c0e..c9d55b4594fe 100644
--- a/lib/Transforms/Scalar/LoopSink.cpp
+++ b/lib/Transforms/Scalar/LoopSink.cpp
@@ -1,4 +1,4 @@
-//===-- LoopSink.cpp - Loop Sink Pass ------------------------===//
+//===-- LoopSink.cpp - Loop Sink Pass -------------------------------------===//
//
// The LLVM Compiler Infrastructure
//
@@ -28,8 +28,10 @@
// InsertBBs = UseBBs - DomBBs + BB
// For BB in InsertBBs:
// Insert I at BB's beginning
+//
//===----------------------------------------------------------------------===//
+#include "llvm/Transforms/Scalar/LoopSink.h"
#include "llvm/ADT/Statistic.h"
#include "llvm/Analysis/AliasAnalysis.h"
#include "llvm/Analysis/AliasSetTracker.h"
@@ -297,6 +299,42 @@ static bool sinkLoopInvariantInstructions(Loop &L, AAResults &AA, LoopInfo &LI,
return Changed;
}
+PreservedAnalyses LoopSinkPass::run(Function &F, FunctionAnalysisManager &FAM) {
+ LoopInfo &LI = FAM.getResult<LoopAnalysis>(F);
+ // Nothing to do if there are no loops.
+ if (LI.empty())
+ return PreservedAnalyses::all();
+
+ AAResults &AA = FAM.getResult<AAManager>(F);
+ DominatorTree &DT = FAM.getResult<DominatorTreeAnalysis>(F);
+ BlockFrequencyInfo &BFI = FAM.getResult<BlockFrequencyAnalysis>(F);
+
+ // We want to do a postorder walk over the loops. Since loops are a tree this
+ // is equivalent to a reversed preorder walk and preorder is easy to compute
+ // without recursion. Since we reverse the preorder, we will visit siblings
+ // in reverse program order. This isn't expected to matter at all but is more
+ // consistent with sinking algorithms which generally work bottom-up.
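+  //
+  // For example, if loop A contains loop B and is followed by sibling loop C,
+  // the preorder is [A, B, C]; popping from the back visits C, then B, then
+  // A.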
+ SmallVector<Loop *, 4> PreorderLoops = LI.getLoopsInPreorder();
+
+ bool Changed = false;
+ do {
+ Loop &L = *PreorderLoops.pop_back_val();
+
+ // Note that we don't pass SCEV here because it is only used to invalidate
+    // loops in SCEV, and we don't preserve (or request) SCEV at all, making
+    // that unnecessary.
+ Changed |= sinkLoopInvariantInstructions(L, AA, LI, DT, BFI,
+ /*ScalarEvolution*/ nullptr);
+ } while (!PreorderLoops.empty());
+
+ if (!Changed)
+ return PreservedAnalyses::all();
+
+ PreservedAnalyses PA;
+ PA.preserveSet<CFGAnalyses>();
+ return PA;
+}
+
namespace {
struct LegacyLoopSinkPass : public LoopPass {
static char ID;
diff --git a/lib/Transforms/Scalar/LoopStrengthReduce.cpp b/lib/Transforms/Scalar/LoopStrengthReduce.cpp
index 194587a85e7c..af137f6faa63 100644
--- a/lib/Transforms/Scalar/LoopStrengthReduce.cpp
+++ b/lib/Transforms/Scalar/LoopStrengthReduce.cpp
@@ -129,6 +129,17 @@ static cl::opt<bool> EnablePhiElim(
"enable-lsr-phielim", cl::Hidden, cl::init(true),
cl::desc("Enable LSR phi elimination"));
+// This flag adds the instruction count to the solution cost comparison.
+static cl::opt<bool> InsnsCost(
+ "lsr-insns-cost", cl::Hidden, cl::init(false),
+ cl::desc("Add instruction count to a LSR cost model"));
+
+// Flag to choose how to narrow a complex LSR solution.
+static cl::opt<bool> LSRExpNarrow(
+ "lsr-exp-narrow", cl::Hidden, cl::init(false),
+ cl::desc("Narrow LSR complex solution using"
+ " expectation of registers number"));
+
#ifndef NDEBUG
// Stress test IV chain generation.
static cl::opt<bool> StressIVChain(
@@ -181,10 +192,11 @@ void RegSortData::print(raw_ostream &OS) const {
OS << "[NumUses=" << UsedByIndices.count() << ']';
}
-LLVM_DUMP_METHOD
-void RegSortData::dump() const {
+#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
+LLVM_DUMP_METHOD void RegSortData::dump() const {
print(errs()); errs() << '\n';
}
+#endif
namespace {
@@ -295,9 +307,13 @@ struct Formula {
/// canonical representation of a formula is
/// 1. BaseRegs.size > 1 implies ScaledReg != NULL and
/// 2. ScaledReg != NULL implies Scale != 1 || !BaseRegs.empty().
+  /// 3. The reg containing a recurrent expr related to the current loop
+  /// should be put in the ScaledReg.
/// #1 enforces that the scaled register is always used when at least two
/// registers are needed by the formula: e.g., reg1 + reg2 is reg1 + 1 * reg2.
/// #2 enforces that 1 * reg is reg.
+  /// #3 ensures that regs invariant with respect to the current loop can be
+  /// combined together in LSR codegen.
  /// This invariant can be temporarily broken while building a formula.
/// However, every formula inserted into the LSRInstance must be in canonical
/// form.
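+  /// For example, reg1 + reg2 is stored as 1*reg2 + reg1 (Scale = 1,
+  /// ScaledReg = reg2, BaseRegs = {reg1}); if reg2 is an addrec of the
+  /// current loop and reg1 is not, rule #3 keeps reg2 in ScaledReg.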
@@ -318,12 +334,14 @@ struct Formula {
void initialMatch(const SCEV *S, Loop *L, ScalarEvolution &SE);
- bool isCanonical() const;
+ bool isCanonical(const Loop &L) const;
- void canonicalize();
+ void canonicalize(const Loop &L);
bool unscale();
+ bool hasZeroEnd() const;
+
size_t getNumRegs() const;
Type *getType() const;
@@ -410,16 +428,35 @@ void Formula::initialMatch(const SCEV *S, Loop *L, ScalarEvolution &SE) {
BaseRegs.push_back(Sum);
HasBaseReg = true;
}
- canonicalize();
+ canonicalize(*L);
}
/// \brief Check whether or not this formula satisfies the canonical
/// representation.
/// \see Formula::BaseRegs.
-bool Formula::isCanonical() const {
- if (ScaledReg)
- return Scale != 1 || !BaseRegs.empty();
- return BaseRegs.size() <= 1;
+bool Formula::isCanonical(const Loop &L) const {
+ if (!ScaledReg)
+ return BaseRegs.size() <= 1;
+
+ if (Scale != 1)
+ return true;
+
+ if (Scale == 1 && BaseRegs.empty())
+ return false;
+
+ const SCEVAddRecExpr *SAR = dyn_cast<const SCEVAddRecExpr>(ScaledReg);
+ if (SAR && SAR->getLoop() == &L)
+ return true;
+
+  // If ScaledReg is not a recurrent expr, or it is but its loop is not the
+  // current loop, while BaseRegs contains a recurrent expr reg related to
+  // the current loop, we want to swap the reg in BaseRegs with ScaledReg.
+ auto I =
+ find_if(make_range(BaseRegs.begin(), BaseRegs.end()), [&](const SCEV *S) {
+ return isa<const SCEVAddRecExpr>(S) &&
+ (cast<SCEVAddRecExpr>(S)->getLoop() == &L);
+ });
+ return I == BaseRegs.end();
}
/// \brief Helper method to morph a formula into its canonical representation.
@@ -428,21 +465,33 @@ bool Formula::isCanonical() const {
/// field. Otherwise, we would have to do special cases everywhere in LSR
/// to treat reg1 + reg2 + ... the same way as reg1 + 1*reg2 + ...
/// On the other hand, 1*reg should be canonicalized into reg.
-void Formula::canonicalize() {
- if (isCanonical())
+void Formula::canonicalize(const Loop &L) {
+ if (isCanonical(L))
return;
// So far we did not need this case. This is easy to implement but it is
// useless to maintain dead code. Beside it could hurt compile time.
assert(!BaseRegs.empty() && "1*reg => reg, should not be needed.");
+
// Keep the invariant sum in BaseRegs and one of the variant sum in ScaledReg.
- ScaledReg = BaseRegs.back();
- BaseRegs.pop_back();
- Scale = 1;
- size_t BaseRegsSize = BaseRegs.size();
- size_t Try = 0;
- // If ScaledReg is an invariant, try to find a variant expression.
- while (Try < BaseRegsSize && !isa<SCEVAddRecExpr>(ScaledReg))
- std::swap(ScaledReg, BaseRegs[Try++]);
+ if (!ScaledReg) {
+ ScaledReg = BaseRegs.back();
+ BaseRegs.pop_back();
+ Scale = 1;
+ }
+
+ // If ScaledReg is an invariant with respect to L, find the reg from
+  // BaseRegs containing the recurrent expr related to Loop L. Swap the
+ // reg with ScaledReg.
+ const SCEVAddRecExpr *SAR = dyn_cast<const SCEVAddRecExpr>(ScaledReg);
+ if (!SAR || SAR->getLoop() != &L) {
+ auto I = find_if(make_range(BaseRegs.begin(), BaseRegs.end()),
+ [&](const SCEV *S) {
+ return isa<const SCEVAddRecExpr>(S) &&
+ (cast<SCEVAddRecExpr>(S)->getLoop() == &L);
+ });
+ if (I != BaseRegs.end())
+ std::swap(ScaledReg, *I);
+ }
}
/// \brief Get rid of the scale in the formula.
@@ -458,6 +507,14 @@ bool Formula::unscale() {
return true;
}
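+
+/// Return true if the formula is just a single base register with no
+/// immediate offsets and no scaled register.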
+bool Formula::hasZeroEnd() const {
+ if (UnfoldedOffset || BaseOffset)
+ return false;
+ if (BaseRegs.size() != 1 || ScaledReg)
+ return false;
+ return true;
+}
+
/// Return the total number of register operands used by this formula. This does
/// not include register uses implied by non-constant addrec strides.
size_t Formula::getNumRegs() const {
@@ -534,10 +591,11 @@ void Formula::print(raw_ostream &OS) const {
}
}
-LLVM_DUMP_METHOD
-void Formula::dump() const {
+#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
+LLVM_DUMP_METHOD void Formula::dump() const {
print(errs()); errs() << '\n';
}
+#endif
/// Return true if the given addrec can be sign-extended without changing its
/// value.
@@ -711,7 +769,7 @@ static GlobalValue *ExtractSymbol(const SCEV *&S, ScalarEvolution &SE) {
static bool isAddressUse(Instruction *Inst, Value *OperandVal) {
bool isAddress = isa<LoadInst>(Inst);
if (StoreInst *SI = dyn_cast<StoreInst>(Inst)) {
- if (SI->getOperand(1) == OperandVal)
+ if (SI->getPointerOperand() == OperandVal)
isAddress = true;
} else if (IntrinsicInst *II = dyn_cast<IntrinsicInst>(Inst)) {
// Addressing modes can also be folded into prefetches and a variety
@@ -723,6 +781,12 @@ static bool isAddressUse(Instruction *Inst, Value *OperandVal) {
isAddress = true;
break;
}
+ } else if (AtomicRMWInst *RMW = dyn_cast<AtomicRMWInst>(Inst)) {
+ if (RMW->getPointerOperand() == OperandVal)
+ isAddress = true;
+ } else if (AtomicCmpXchgInst *CmpX = dyn_cast<AtomicCmpXchgInst>(Inst)) {
+ if (CmpX->getPointerOperand() == OperandVal)
+ isAddress = true;
}
return isAddress;
}
@@ -735,6 +799,10 @@ static MemAccessTy getAccessType(const Instruction *Inst) {
AccessTy.AddrSpace = SI->getPointerAddressSpace();
} else if (const LoadInst *LI = dyn_cast<LoadInst>(Inst)) {
AccessTy.AddrSpace = LI->getPointerAddressSpace();
+ } else if (const AtomicRMWInst *RMW = dyn_cast<AtomicRMWInst>(Inst)) {
+ AccessTy.AddrSpace = RMW->getPointerAddressSpace();
+ } else if (const AtomicCmpXchgInst *CmpX = dyn_cast<AtomicCmpXchgInst>(Inst)) {
+ AccessTy.AddrSpace = CmpX->getPointerAddressSpace();
}
// All pointers have the same requirements, so canonicalize them to an
@@ -875,7 +943,8 @@ static bool isAMCompletelyFolded(const TargetTransformInfo &TTI,
const LSRUse &LU, const Formula &F);
// Get the cost of the scaling factor used in F for LU.
static unsigned getScalingFactorCost(const TargetTransformInfo &TTI,
- const LSRUse &LU, const Formula &F);
+ const LSRUse &LU, const Formula &F,
+ const Loop &L);
namespace {
@@ -883,6 +952,7 @@ namespace {
class Cost {
/// TODO: Some of these could be merged. Also, a lexical ordering
/// isn't always optimal.
+ unsigned Insns;
unsigned NumRegs;
unsigned AddRecCost;
unsigned NumIVMuls;
@@ -893,8 +963,8 @@ class Cost {
public:
Cost()
- : NumRegs(0), AddRecCost(0), NumIVMuls(0), NumBaseAdds(0), ImmCost(0),
- SetupCost(0), ScaleCost(0) {}
+ : Insns(0), NumRegs(0), AddRecCost(0), NumIVMuls(0), NumBaseAdds(0),
+ ImmCost(0), SetupCost(0), ScaleCost(0) {}
bool operator<(const Cost &Other) const;
@@ -903,9 +973,9 @@ public:
#ifndef NDEBUG
// Once any of the metrics loses, they must all remain losers.
bool isValid() {
- return ((NumRegs | AddRecCost | NumIVMuls | NumBaseAdds
+ return ((Insns | NumRegs | AddRecCost | NumIVMuls | NumBaseAdds
| ImmCost | SetupCost | ScaleCost) != ~0u)
- || ((NumRegs & AddRecCost & NumIVMuls & NumBaseAdds
+ || ((Insns & NumRegs & AddRecCost & NumIVMuls & NumBaseAdds
& ImmCost & SetupCost & ScaleCost) == ~0u);
}
#endif
@@ -1067,7 +1137,8 @@ public:
}
bool HasFormulaWithSameRegs(const Formula &F) const;
- bool InsertFormula(const Formula &F);
+ float getNotSelectedProbability(const SCEV *Reg) const;
+ bool InsertFormula(const Formula &F, const Loop &L);
void DeleteFormula(Formula &F);
void RecomputeRegs(size_t LUIdx, RegUseTracker &Reguses);
@@ -1083,17 +1154,23 @@ void Cost::RateRegister(const SCEV *Reg,
const Loop *L,
ScalarEvolution &SE, DominatorTree &DT) {
if (const SCEVAddRecExpr *AR = dyn_cast<SCEVAddRecExpr>(Reg)) {
- // If this is an addrec for another loop, don't second-guess its addrec phi
- // nodes. LSR isn't currently smart enough to reason about more than one
- // loop at a time. LSR has already run on inner loops, will not run on outer
- // loops, and cannot be expected to change sibling loops.
+ // If this is an addrec for another loop, it should be an invariant
+    // with respect to L, since L is the innermost loop (at least for now,
+    // LSR only handles innermost loops).
if (AR->getLoop() != L) {
      // If the AddRec exists, consider its register free and leave it alone.
if (isExistingPhi(AR, SE))
return;
- // Otherwise, do not consider this formula at all.
- Lose();
+      // It is bad to allow LSR for the current loop to add induction
+      // variables for its sibling loops.
+ if (!AR->getLoop()->contains(L)) {
+ Lose();
+ return;
+ }
+
+ // Otherwise, it will be an invariant with respect to Loop L.
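+      // E.g., an addrec of an outer loop enclosing L, such as {0,+,1}<outer>,
+      // is invariant within L and simply occupies one register.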
+ ++NumRegs;
return;
}
AddRecCost += 1; /// TODO: This should be a function of the stride.
@@ -1150,8 +1227,11 @@ void Cost::RateFormula(const TargetTransformInfo &TTI,
ScalarEvolution &SE, DominatorTree &DT,
const LSRUse &LU,
SmallPtrSetImpl<const SCEV *> *LoserRegs) {
- assert(F.isCanonical() && "Cost is accurate only for canonical formula");
+ assert(F.isCanonical(*L) && "Cost is accurate only for canonical formula");
// Tally up the registers.
+ unsigned PrevAddRecCost = AddRecCost;
+ unsigned PrevNumRegs = NumRegs;
+ unsigned PrevNumBaseAdds = NumBaseAdds;
if (const SCEV *ScaledReg = F.ScaledReg) {
if (VisitedRegs.count(ScaledReg)) {
Lose();
@@ -1171,6 +1251,18 @@ void Cost::RateFormula(const TargetTransformInfo &TTI,
return;
}
+ // Treat every new register that exceeds TTI.getNumberOfRegisters() - 1 as
+  // an additional instruction (at least a fill).
+ unsigned TTIRegNum = TTI.getNumberOfRegisters(false) - 1;
+ if (NumRegs > TTIRegNum) {
+    // If the cost already exceeded TTIRegNum, then only the newly added
+    // registers can add new instructions.
+ if (PrevNumRegs > TTIRegNum)
+ Insns += (NumRegs - PrevNumRegs);
+ else
+ Insns += (NumRegs - TTIRegNum);
+ }
+
// Determine how many (unfolded) adds we'll need inside the loop.
size_t NumBaseParts = F.getNumRegs();
if (NumBaseParts > 1)
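
The spill accounting in the hunk above is incremental, since RateFormula keeps running totals across calls on the same Cost object. A minimal standalone sketch of the rule, with hypothetical names:

// TTIRegNum = TTI.getNumberOfRegisters(false) - 1, as in the patch.
unsigned spillInsns(unsigned PrevNumRegs, unsigned NumRegs, unsigned TTIRegNum) {
  if (NumRegs <= TTIRegNum)
    return 0;                      // still fits in registers; no fill expected
  if (PrevNumRegs > TTIRegNum)
    return NumRegs - PrevNumRegs;  // already over budget: charge only new regs
  return NumRegs - TTIRegNum;      // first crossing: charge the overflow
}
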
@@ -1181,7 +1273,7 @@ void Cost::RateFormula(const TargetTransformInfo &TTI,
NumBaseAdds += (F.UnfoldedOffset != 0);
// Accumulate non-free scaling amounts.
- ScaleCost += getScalingFactorCost(TTI, LU, F);
+ ScaleCost += getScalingFactorCost(TTI, LU, F, *L);
// Tally up the non-zero immediates.
for (const LSRFixup &Fixup : LU.Fixups) {
@@ -1199,11 +1291,30 @@ void Cost::RateFormula(const TargetTransformInfo &TTI,
!TTI.isFoldableMemAccessOffset(Fixup.UserInst, Offset))
NumBaseAdds++;
}
+
+ // If the ICmpZero formula does not end in 0, it cannot be replaced by just
+ // an add or sub. We will need to compare the final result of the AddRec,
+ // which means we will need an additional instruction.
+ // For -10 + {0, +, 1}:
+ // i = i + 1;
+ // cmp i, 10
+ //
+ // For {-10, +, 1}:
+ // i = i + 1;
+ if (LU.Kind == LSRUse::ICmpZero && !F.hasZeroEnd())
+ Insns++;
+ // Each new AddRec adds 1 instruction to the calculation.
+ Insns += (AddRecCost - PrevAddRecCost);
+
+ // BaseAdds adds instructions for unfolded registers.
+ if (LU.Kind != LSRUse::ICmpZero)
+ Insns += NumBaseAdds - PrevNumBaseAdds;
assert(isValid() && "invalid cost");
}
/// Set this cost to a losing value.
void Cost::Lose() {
+ Insns = ~0u;
NumRegs = ~0u;
AddRecCost = ~0u;
NumIVMuls = ~0u;
@@ -1215,6 +1326,8 @@ void Cost::Lose() {
/// Choose the lower cost.
bool Cost::operator<(const Cost &Other) const {
+ if (InsnsCost && Insns != Other.Insns)
+ return Insns < Other.Insns;
return std::tie(NumRegs, AddRecCost, NumIVMuls, NumBaseAdds, ScaleCost,
ImmCost, SetupCost) <
std::tie(Other.NumRegs, Other.AddRecCost, Other.NumIVMuls,
@@ -1223,6 +1336,7 @@ bool Cost::operator<(const Cost &Other) const {
}
void Cost::print(raw_ostream &OS) const {
+ OS << Insns << " instruction" << (Insns == 1 ? " " : "s ");
OS << NumRegs << " reg" << (NumRegs == 1 ? "" : "s");
if (AddRecCost != 0)
OS << ", with addrec cost " << AddRecCost;
@@ -1239,10 +1353,11 @@ void Cost::print(raw_ostream &OS) const {
OS << ", plus " << SetupCost << " setup cost";
}
-LLVM_DUMP_METHOD
-void Cost::dump() const {
+#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
+LLVM_DUMP_METHOD void Cost::dump() const {
print(errs()); errs() << '\n';
}
+#endif
LSRFixup::LSRFixup()
: UserInst(nullptr), OperandValToReplace(nullptr),
@@ -1285,10 +1400,11 @@ void LSRFixup::print(raw_ostream &OS) const {
OS << ", Offset=" << Offset;
}
-LLVM_DUMP_METHOD
-void LSRFixup::dump() const {
+#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
+LLVM_DUMP_METHOD void LSRFixup::dump() const {
print(errs()); errs() << '\n';
}
+#endif
/// Test whether this use has a formula with the same registers as the given
/// formula.
@@ -1300,10 +1416,19 @@ bool LSRUse::HasFormulaWithSameRegs(const Formula &F) const {
return Uniquifier.count(Key);
}
+/// Return the probability of selecting a formula that does not reference Reg.
+float LSRUse::getNotSelectedProbability(const SCEV *Reg) const {
+ unsigned FNum = 0;
+ for (const Formula &F : Formulae)
+ if (F.referencesReg(Reg))
+ FNum++;
+ return ((float)(Formulae.size() - FNum)) / Formulae.size();
+}
+
/// If the given formula has not yet been inserted, add it to the list, and
/// return true. Return false otherwise. The formula must be in canonical form.
-bool LSRUse::InsertFormula(const Formula &F) {
- assert(F.isCanonical() && "Invalid canonical representation");
+bool LSRUse::InsertFormula(const Formula &F, const Loop &L) {
+ assert(F.isCanonical(L) && "Invalid canonical representation");
if (!Formulae.empty() && RigidFormula)
return false;
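
As a hedged worked example of getNotSelectedProbability (the Use1 figures come from the probability table added further down in this patch):

// Use1 has three formulae, two of which reference reg(a):
//   reg(a) + reg({0,+,1}), reg(a) + reg({-1,+,1}) + 1, reg({a,+,1})
// so the probability of selecting a formula without reg(a) is (3 - 2)/3 = 1/3.
float notSelected(unsigned NumFormulae, unsigned NumReferencing) {
  return float(NumFormulae - NumReferencing) / NumFormulae;
}
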
@@ -1391,10 +1516,11 @@ void LSRUse::print(raw_ostream &OS) const {
OS << ", widest fixup type: " << *WidestFixupType;
}
-LLVM_DUMP_METHOD
-void LSRUse::dump() const {
+#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
+LLVM_DUMP_METHOD void LSRUse::dump() const {
print(errs()); errs() << '\n';
}
+#endif
static bool isAMCompletelyFolded(const TargetTransformInfo &TTI,
LSRUse::KindType Kind, MemAccessTy AccessTy,
@@ -1472,7 +1598,7 @@ static bool isAMCompletelyFolded(const TargetTransformInfo &TTI,
static bool isAMCompletelyFolded(const TargetTransformInfo &TTI,
int64_t MinOffset, int64_t MaxOffset,
LSRUse::KindType Kind, MemAccessTy AccessTy,
- const Formula &F) {
+ const Formula &F, const Loop &L) {
// For the purpose of isAMCompletelyFolded either having a canonical formula
// or a scale not equal to zero is correct.
// Problems may arise from non canonical formulae having a scale == 0.
@@ -1480,7 +1606,7 @@ static bool isAMCompletelyFolded(const TargetTransformInfo &TTI,
// However, when we generate the scaled formulae, we first check that the
// scaling factor is profitable before computing the actual ScaledReg for
// compile time sake.
- assert((F.isCanonical() || F.Scale != 0));
+ assert((F.isCanonical(L) || F.Scale != 0));
return isAMCompletelyFolded(TTI, MinOffset, MaxOffset, Kind, AccessTy,
F.BaseGV, F.BaseOffset, F.HasBaseReg, F.Scale);
}
@@ -1515,14 +1641,15 @@ static bool isAMCompletelyFolded(const TargetTransformInfo &TTI,
}
static unsigned getScalingFactorCost(const TargetTransformInfo &TTI,
- const LSRUse &LU, const Formula &F) {
+ const LSRUse &LU, const Formula &F,
+ const Loop &L) {
if (!F.Scale)
return 0;
// If the use is not completely folded in that instruction, we will have to
// pay an extra cost only for scale != 1.
if (!isAMCompletelyFolded(TTI, LU.MinOffset, LU.MaxOffset, LU.Kind,
- LU.AccessTy, F))
+ LU.AccessTy, F, L))
return F.Scale != 1;
switch (LU.Kind) {
@@ -1772,6 +1899,7 @@ class LSRInstance {
void NarrowSearchSpaceByDetectingSupersets();
void NarrowSearchSpaceByCollapsingUnrolledCode();
void NarrowSearchSpaceByRefilteringUndesirableDedicatedRegisters();
+ void NarrowSearchSpaceByDeletingCostlyFormulas();
void NarrowSearchSpaceByPickingWinnerRegs();
void NarrowSearchSpaceUsingHeuristics();
@@ -2492,7 +2620,12 @@ static Value *getWideOperand(Value *Oper) {
static bool isCompatibleIVType(Value *LVal, Value *RVal) {
Type *LType = LVal->getType();
Type *RType = RVal->getType();
- return (LType == RType) || (LType->isPointerTy() && RType->isPointerTy());
+ return (LType == RType) || (LType->isPointerTy() && RType->isPointerTy() &&
+ // Different address spaces mean (possibly)
+ // different pointer implementation types,
+ // e.g. i16 vs i32, so disallow that.
+ (LType->getPointerAddressSpace() ==
+ RType->getPointerAddressSpace()));
}
/// Return an approximation of this SCEV expression's "base", or NULL for any
@@ -2989,8 +3122,10 @@ void LSRInstance::CollectFixupsAndInitialFormulae() {
User::op_iterator UseI =
find(UserInst->operands(), U.getOperandValToReplace());
assert(UseI != UserInst->op_end() && "cannot find IV operand");
- if (IVIncSet.count(UseI))
+ if (IVIncSet.count(UseI)) {
+ DEBUG(dbgs() << "Use is in profitable chain: " << **UseI << '\n');
continue;
+ }
LSRUse::KindType Kind = LSRUse::Basic;
MemAccessTy AccessTy;
@@ -3025,8 +3160,7 @@ void LSRInstance::CollectFixupsAndInitialFormulae() {
if (SE.isLoopInvariant(N, L) && isSafeToExpand(N, SE)) {
// S is normalized, so normalize N before folding it into S
// to keep the result normalized.
- N = TransformForPostIncUse(Normalize, N, CI, nullptr,
- TmpPostIncLoops, SE, DT);
+ N = normalizeForPostIncUse(N, TmpPostIncLoops, SE);
Kind = LSRUse::ICmpZero;
S = SE.getMinusSCEV(N, S);
}
@@ -3108,7 +3242,8 @@ bool LSRInstance::InsertFormula(LSRUse &LU, unsigned LUIdx, const Formula &F) {
// Do not insert formula that we will not be able to expand.
assert(isLegalUse(TTI, LU.MinOffset, LU.MaxOffset, LU.Kind, LU.AccessTy, F) &&
"Formula is illegal");
- if (!LU.InsertFormula(F))
+
+ if (!LU.InsertFormula(F, *L))
return false;
CountRegisters(F, LUIdx);
@@ -3347,7 +3482,7 @@ void LSRInstance::GenerateReassociationsImpl(LSRUse &LU, unsigned LUIdx,
F.BaseRegs.push_back(*J);
// We may have changed the number of register in base regs, adjust the
// formula accordingly.
- F.canonicalize();
+ F.canonicalize(*L);
if (InsertFormula(LU, LUIdx, F))
// If that formula hadn't been seen before, recurse to find more like
@@ -3359,7 +3494,7 @@ void LSRInstance::GenerateReassociationsImpl(LSRUse &LU, unsigned LUIdx,
/// Split out subexpressions from adds and the bases of addrecs.
void LSRInstance::GenerateReassociations(LSRUse &LU, unsigned LUIdx,
Formula Base, unsigned Depth) {
- assert(Base.isCanonical() && "Input must be in the canonical form");
+ assert(Base.isCanonical(*L) && "Input must be in the canonical form");
// Arbitrarily cap recursion to protect compile time.
if (Depth >= 3)
return;
@@ -3400,7 +3535,7 @@ void LSRInstance::GenerateCombinations(LSRUse &LU, unsigned LUIdx,
// rather than proceed with zero in a register.
if (!Sum->isZero()) {
F.BaseRegs.push_back(Sum);
- F.canonicalize();
+ F.canonicalize(*L);
(void)InsertFormula(LU, LUIdx, F);
}
}
@@ -3457,7 +3592,7 @@ void LSRInstance::GenerateConstantOffsetsImpl(
F.ScaledReg = nullptr;
} else
F.deleteBaseReg(F.BaseRegs[Idx]);
- F.canonicalize();
+ F.canonicalize(*L);
} else if (IsScaledReg)
F.ScaledReg = NewG;
else
@@ -3620,10 +3755,10 @@ void LSRInstance::GenerateScales(LSRUse &LU, unsigned LUIdx, Formula Base) {
if (LU.Kind == LSRUse::ICmpZero &&
!Base.HasBaseReg && Base.BaseOffset == 0 && !Base.BaseGV)
continue;
- // For each addrec base reg, apply the scale, if possible.
- for (size_t i = 0, e = Base.BaseRegs.size(); i != e; ++i)
- if (const SCEVAddRecExpr *AR =
- dyn_cast<SCEVAddRecExpr>(Base.BaseRegs[i])) {
+ // For each addrec base reg, if its loop is the current loop, apply the scale.
+ for (size_t i = 0, e = Base.BaseRegs.size(); i != e; ++i) {
+ const SCEVAddRecExpr *AR = dyn_cast<SCEVAddRecExpr>(Base.BaseRegs[i]);
+ if (AR && (AR->getLoop() == L || LU.AllFixupsOutsideLoop)) {
const SCEV *FactorS = SE.getConstant(IntTy, Factor);
if (FactorS->isZero())
continue;
@@ -3637,11 +3772,17 @@ void LSRInstance::GenerateScales(LSRUse &LU, unsigned LUIdx, Formula Base) {
// The canonical representation of 1*reg is reg, which is already in
// Base. In that case, do not try to insert the formula, it will be
// rejected anyway.
- if (F.Scale == 1 && F.BaseRegs.empty())
+ if (F.Scale == 1 && (F.BaseRegs.empty() ||
+ (AR->getLoop() != L && LU.AllFixupsOutsideLoop)))
continue;
+ // If AllFixupsOutsideLoop is true and F.Scale is 1, we may generate a
+ // non-canonical Formula whose ScaledReg's loop is not L.
+ if (F.Scale == 1 && LU.AllFixupsOutsideLoop)
+ F.canonicalize(*L);
(void)InsertFormula(LU, LUIdx, F);
}
}
+ }
}
}
@@ -3697,10 +3838,11 @@ void WorkItem::print(raw_ostream &OS) const {
<< " , add offset " << Imm;
}
-LLVM_DUMP_METHOD
-void WorkItem::dump() const {
+#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
+LLVM_DUMP_METHOD void WorkItem::dump() const {
print(errs()); errs() << '\n';
}
+#endif
/// Look for registers which are a constant distance apart and try to form reuse
/// opportunities between them.
@@ -3821,7 +3963,7 @@ void LSRInstance::GenerateCrossUseConstantOffsets() {
continue;
// OK, looks good.
- NewF.canonicalize();
+ NewF.canonicalize(*this->L);
(void)InsertFormula(LU, LUIdx, NewF);
} else {
// Use the immediate in a base register.
@@ -3853,7 +3995,7 @@ void LSRInstance::GenerateCrossUseConstantOffsets() {
goto skip_formula;
// Ok, looks good.
- NewF.canonicalize();
+ NewF.canonicalize(*this->L);
(void)InsertFormula(LU, LUIdx, NewF);
break;
skip_formula:;
@@ -4165,6 +4307,144 @@ void LSRInstance::NarrowSearchSpaceByRefilteringUndesirableDedicatedRegisters(){
}
}
+/// Delete formulas with a high expected register count.
+/// Assuming we don't know which formula will be selected (we have already
+/// deleted all clearly inefficient ones), compute for each register the
+/// probability of it not being selected.
+/// For example,
+/// Use1:
+/// reg(a) + reg({0,+,1})
+/// reg(a) + reg({-1,+,1}) + 1
+/// reg({a,+,1})
+/// Use2:
+/// reg(b) + reg({0,+,1})
+/// reg(b) + reg({-1,+,1}) + 1
+/// reg({b,+,1})
+/// Use3:
+/// reg(c) + reg(b) + reg({0,+,1})
+/// reg(c) + reg({b,+,1})
+///
+/// Probability of not selecting
+/// Use1 Use2 Use3
+/// reg(a) (1/3) * 1 * 1
+/// reg(b) 1 * (1/3) * (1/2)
+/// reg({0,+,1}) (2/3) * (2/3) * (1/2)
+/// reg({-1,+,1}) (2/3) * (2/3) * 1
+/// reg({a,+,1}) (2/3) * 1 * 1
+/// reg({b,+,1}) 1 * (2/3) * (2/3)
+/// reg(c) 1 * 1 * 0
+///
+/// Now compute the mathematical expectation of the register count for each
+/// formula. Note that for each use we exclude the use's own not-selected
+/// probability. For example, for Use1 the probability for reg(a) is just
+/// 1 * 1 (excluding the probability 1/3 of not selecting it for Use1).
+/// Use1:
+/// reg(a) + reg({0,+,1}) 1 + 1/3 -- to be deleted
+/// reg(a) + reg({-1,+,1}) + 1 1 + 4/9 -- to be deleted
+/// reg({a,+,1}) 1
+/// Use2:
+/// reg(b) + reg({0,+,1}) 1/2 + 1/3 -- to be deleted
+/// reg(b) + reg({-1,+,1}) + 1 1/2 + 2/3 -- to be deleted
+/// reg({b,+,1}) 2/3
+/// Use3:
+/// reg(c) + reg(b) + reg({0,+,1}) 1 + 1/3 + 4/9 -- to be deleted
+/// reg(c) + reg({b,+,1}) 1 + 2/3
+
+void LSRInstance::NarrowSearchSpaceByDeletingCostlyFormulas() {
+ if (EstimateSearchSpaceComplexity() < ComplexityLimit)
+ return;
+ // Ok, we have too many formulae on our hands to conveniently handle.
+ // Use a rough heuristic to thin out the list.
+
+ // Set of Regs which are guaranteed to be used in the final solution, i.e.
+ // used in every formula of some use (in the example above this is reg(c)).
+ // We can skip them in calculations.
+ SmallPtrSet<const SCEV *, 4> UniqRegs;
+ DEBUG(dbgs() << "The search space is too complex.\n");
+
+ // Map each register to the probability of it not being selected.
+ DenseMap <const SCEV *, float> RegNumMap;
+ for (const SCEV *Reg : RegUses) {
+ if (UniqRegs.count(Reg))
+ continue;
+ float PNotSel = 1;
+ for (const LSRUse &LU : Uses) {
+ if (!LU.Regs.count(Reg))
+ continue;
+ float P = LU.getNotSelectedProbability(Reg);
+ if (P != 0.0)
+ PNotSel *= P;
+ else
+ UniqRegs.insert(Reg);
+ }
+ RegNumMap.insert(std::make_pair(Reg, PNotSel));
+ }
+
+ DEBUG(dbgs() << "Narrowing the search space by deleting costly formulas\n");
+
+ // Delete formulas with a high expected register count.
+ for (size_t LUIdx = 0, NumUses = Uses.size(); LUIdx != NumUses; ++LUIdx) {
+ LSRUse &LU = Uses[LUIdx];
+ // If there is nothing to delete, continue.
+ if (LU.Formulae.size() < 2)
+ continue;
+ // This is a temporary solution for performance testing. Float should be
+ // replaced with a rounding-independent type (based on integers) to avoid
+ // different results across host builds.
+ float FMinRegNum = LU.Formulae[0].getNumRegs();
+ float FMinARegNum = LU.Formulae[0].getNumRegs();
+ size_t MinIdx = 0;
+ for (size_t i = 0, e = LU.Formulae.size(); i != e; ++i) {
+ Formula &F = LU.Formulae[i];
+ float FRegNum = 0;
+ float FARegNum = 0;
+ for (const SCEV *BaseReg : F.BaseRegs) {
+ if (UniqRegs.count(BaseReg))
+ continue;
+ FRegNum += RegNumMap[BaseReg] / LU.getNotSelectedProbability(BaseReg);
+ if (isa<SCEVAddRecExpr>(BaseReg))
+ FARegNum +=
+ RegNumMap[BaseReg] / LU.getNotSelectedProbability(BaseReg);
+ }
+ if (const SCEV *ScaledReg = F.ScaledReg) {
+ if (!UniqRegs.count(ScaledReg)) {
+ FRegNum +=
+ RegNumMap[ScaledReg] / LU.getNotSelectedProbability(ScaledReg);
+ if (isa<SCEVAddRecExpr>(ScaledReg))
+ FARegNum +=
+ RegNumMap[ScaledReg] / LU.getNotSelectedProbability(ScaledReg);
+ }
+ }
+ if (FMinRegNum > FRegNum ||
+ (FMinRegNum == FRegNum && FMinARegNum > FARegNum)) {
+ FMinRegNum = FRegNum;
+ FMinARegNum = FARegNum;
+ MinIdx = i;
+ }
+ }
+ DEBUG(dbgs() << " The formula "; LU.Formulae[MinIdx].print(dbgs());
+ dbgs() << " with min reg num " << FMinRegNum << '\n');
+ if (MinIdx != 0)
+ std::swap(LU.Formulae[MinIdx], LU.Formulae[0]);
+ while (LU.Formulae.size() != 1) {
+ DEBUG(dbgs() << " Deleting "; LU.Formulae.back().print(dbgs());
+ dbgs() << '\n');
+ LU.Formulae.pop_back();
+ }
+ LU.RecomputeRegs(LUIdx, RegUses);
+ assert(LU.Formulae.size() == 1 && "Should be exactly 1 min regs formula");
+ Formula &F = LU.Formulae[0];
+ DEBUG(dbgs() << " Leaving only "; F.print(dbgs()); dbgs() << '\n');
+ // When we choose the formula, its regs become unique (certain to be used).
+ UniqRegs.insert(F.BaseRegs.begin(), F.BaseRegs.end());
+ if (F.ScaledReg)
+ UniqRegs.insert(F.ScaledReg);
+ }
+ DEBUG(dbgs() << "After pre-selection:\n";
+ print_uses(dbgs()));
+}
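
To make the "exclude the use's own probability" step concrete, here is a hedged sketch reproducing the Use2 numbers from the comment above (the helper name and ArrayRef plumbing are illustrative only):

// For Use2's formula reg(b) + reg({0,+,1}) in the worked example:
//   overall PNotSel(reg(b))       = 1 * (1/3) * (1/2)     = 1/6
//   overall PNotSel(reg({0,+,1})) = (2/3) * (2/3) * (1/2) = 2/9
// Dividing out Use2's own factors (1/3 and 2/3) yields 1/2 + 1/3, the
// expectation listed for that formula above.
float formulaExpectation(ArrayRef<float> OverallPNotSel,
                         ArrayRef<float> ThisUsePNotSel) {
  float Expectation = 0;
  for (size_t I = 0, N = OverallPNotSel.size(); I != N; ++I)
    Expectation += OverallPNotSel[I] / ThisUsePNotSel[I]; // drop this use's factor
  return Expectation;
}
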
+
+
/// Pick a register which seems likely to be profitable, and then in any use
/// which has any reference to that register, delete all formulae which do not
/// reference that register.
@@ -4237,7 +4517,10 @@ void LSRInstance::NarrowSearchSpaceUsingHeuristics() {
NarrowSearchSpaceByDetectingSupersets();
NarrowSearchSpaceByCollapsingUnrolledCode();
NarrowSearchSpaceByRefilteringUndesirableDedicatedRegisters();
- NarrowSearchSpaceByPickingWinnerRegs();
+ if (LSRExpNarrow)
+ NarrowSearchSpaceByDeletingCostlyFormulas();
+ else
+ NarrowSearchSpaceByPickingWinnerRegs();
}
/// This is the recursive solver.
@@ -4515,11 +4798,7 @@ Value *LSRInstance::Expand(const LSRUse &LU,
assert(!Reg->isZero() && "Zero allocated in a base register!");
// If we're expanding for a post-inc user, make the post-inc adjustment.
- PostIncLoopSet &Loops = const_cast<PostIncLoopSet &>(LF.PostIncLoops);
- Reg = TransformForPostIncUse(Denormalize, Reg,
- LF.UserInst, LF.OperandValToReplace,
- Loops, SE, DT);
-
+ Reg = denormalizeForPostIncUse(Reg, LF.PostIncLoops, SE);
Ops.push_back(SE.getUnknown(Rewriter.expandCodeFor(Reg, nullptr)));
}
@@ -4530,9 +4809,7 @@ Value *LSRInstance::Expand(const LSRUse &LU,
// If we're expanding for a post-inc user, make the post-inc adjustment.
PostIncLoopSet &Loops = const_cast<PostIncLoopSet &>(LF.PostIncLoops);
- ScaledS = TransformForPostIncUse(Denormalize, ScaledS,
- LF.UserInst, LF.OperandValToReplace,
- Loops, SE, DT);
+ ScaledS = denormalizeForPostIncUse(ScaledS, Loops, SE);
if (LU.Kind == LSRUse::ICmpZero) {
// Expand ScaleReg as if it was part of the base regs.
@@ -4975,10 +5252,11 @@ void LSRInstance::print(raw_ostream &OS) const {
print_uses(OS);
}
-LLVM_DUMP_METHOD
-void LSRInstance::dump() const {
+#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
+LLVM_DUMP_METHOD void LSRInstance::dump() const {
print(errs()); errs() << '\n';
}
+#endif
namespace {
diff --git a/lib/Transforms/Scalar/LoopUnrollPass.cpp b/lib/Transforms/Scalar/LoopUnrollPass.cpp
index c7f91226d222..62aa6ee48069 100644
--- a/lib/Transforms/Scalar/LoopUnrollPass.cpp
+++ b/lib/Transforms/Scalar/LoopUnrollPass.cpp
@@ -44,7 +44,11 @@ using namespace llvm;
static cl::opt<unsigned>
UnrollThreshold("unroll-threshold", cl::Hidden,
- cl::desc("The baseline cost threshold for loop unrolling"));
+ cl::desc("The cost threshold for loop unrolling"));
+
+static cl::opt<unsigned> UnrollPartialThreshold(
+ "unroll-partial-threshold", cl::Hidden,
+ cl::desc("The cost threshold for partial loop unrolling"));
static cl::opt<unsigned> UnrollMaxPercentThresholdBoost(
"unroll-max-percent-threshold-boost", cl::init(400), cl::Hidden,
@@ -106,10 +110,19 @@ static cl::opt<unsigned> FlatLoopTripCountThreshold(
"aggressively unrolled."));
static cl::opt<bool>
- UnrollAllowPeeling("unroll-allow-peeling", cl::Hidden,
+ UnrollAllowPeeling("unroll-allow-peeling", cl::init(true), cl::Hidden,
cl::desc("Allows loops to be peeled when the dynamic "
"trip count is known to be low."));
+// This option isn't ever intended to be enabled; it serves to allow
+// experiments that check the assumptions about when this kind of revisit is
+// necessary.
+static cl::opt<bool> UnrollRevisitChildLoops(
+ "unroll-revisit-child-loops", cl::Hidden,
+ cl::desc("Enqueue and re-visit child loops in the loop PM after unrolling. "
+ "This shouldn't typically be needed as child loops (or their "
+ "clones) were already visited."));
+
/// A magic value for use with the Threshold parameter to indicate
/// that the loop unroll should be performed regardless of how much
/// code expansion would result.
@@ -118,16 +131,17 @@ static const unsigned NoThreshold = UINT_MAX;
/// Gather the various unrolling parameters based on the defaults, compiler
/// flags, TTI overrides and user specified parameters.
static TargetTransformInfo::UnrollingPreferences gatherUnrollingPreferences(
- Loop *L, const TargetTransformInfo &TTI, Optional<unsigned> UserThreshold,
- Optional<unsigned> UserCount, Optional<bool> UserAllowPartial,
- Optional<bool> UserRuntime, Optional<bool> UserUpperBound) {
+ Loop *L, const TargetTransformInfo &TTI, int OptLevel,
+ Optional<unsigned> UserThreshold, Optional<unsigned> UserCount,
+ Optional<bool> UserAllowPartial, Optional<bool> UserRuntime,
+ Optional<bool> UserUpperBound) {
TargetTransformInfo::UnrollingPreferences UP;
// Set up the defaults
- UP.Threshold = 150;
+ UP.Threshold = OptLevel > 2 ? 300 : 150;
UP.MaxPercentThresholdBoost = 400;
UP.OptSizeThreshold = 0;
- UP.PartialThreshold = UP.Threshold;
+ UP.PartialThreshold = 150;
UP.PartialOptSizeThreshold = 0;
UP.Count = 0;
UP.PeelCount = 0;
@@ -141,7 +155,7 @@ static TargetTransformInfo::UnrollingPreferences gatherUnrollingPreferences(
UP.AllowExpensiveTripCount = false;
UP.Force = false;
UP.UpperBound = false;
- UP.AllowPeeling = false;
+ UP.AllowPeeling = true;
// Override with any target specific settings
TTI.getUnrollingPreferences(L, UP);
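
The key behavioral change in this hunk is that the partial-unroll budget no longer tracks the full-unroll budget. A hedged restatement of the new defaults, before TTI and command-line overrides apply (helper names are illustrative):

unsigned defaultThreshold(int OptLevel) {
  return OptLevel > 2 ? 300 : 150;  // full-unroll budget doubles at -O3
}
unsigned defaultPartialThreshold() {
  return 150;                       // fixed; no longer mirrors Threshold
}
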
@@ -153,10 +167,10 @@ static TargetTransformInfo::UnrollingPreferences gatherUnrollingPreferences(
}
// Apply any user values specified by cl::opt
- if (UnrollThreshold.getNumOccurrences() > 0) {
+ if (UnrollThreshold.getNumOccurrences() > 0)
UP.Threshold = UnrollThreshold;
- UP.PartialThreshold = UnrollThreshold;
- }
+ if (UnrollPartialThreshold.getNumOccurrences() > 0)
+ UP.PartialThreshold = UnrollPartialThreshold;
if (UnrollMaxPercentThresholdBoost.getNumOccurrences() > 0)
UP.MaxPercentThresholdBoost = UnrollMaxPercentThresholdBoost;
if (UnrollMaxCount.getNumOccurrences() > 0)
@@ -495,7 +509,7 @@ analyzeLoopUnrollCost(const Loop *L, unsigned TripCount, DominatorTree &DT,
KnownSucc = SI->getSuccessor(0);
else if (ConstantInt *SimpleCondVal =
dyn_cast<ConstantInt>(SimpleCond))
- KnownSucc = SI->findCaseValue(SimpleCondVal).getCaseSuccessor();
+ KnownSucc = SI->findCaseValue(SimpleCondVal)->getCaseSuccessor();
}
}
if (KnownSucc) {
@@ -770,7 +784,15 @@ static bool computeUnrollCount(
}
}
- // 4rd priority is partial unrolling.
+ // 4th priority is loop peeling
+ computePeelCount(L, LoopSize, UP, TripCount);
+ if (UP.PeelCount) {
+ UP.Runtime = false;
+ UP.Count = 1;
+ return ExplicitUnroll;
+ }
+
+ // 5th priority is partial unrolling.
// Try partial unroll only when TripCount could be statically calculated.
if (TripCount) {
UP.Partial |= ExplicitUnroll;
@@ -833,14 +855,6 @@ static bool computeUnrollCount(
<< "Unable to fully unroll loop as directed by unroll(full) pragma "
"because loop has a runtime trip count.");
- // 5th priority is loop peeling
- computePeelCount(L, LoopSize, UP);
- if (UP.PeelCount) {
- UP.Runtime = false;
- UP.Count = 1;
- return ExplicitUnroll;
- }
-
// 6th priority is runtime unrolling.
// Don't unroll a runtime trip count loop when it is disabled.
if (HasRuntimeUnrollDisablePragma(L)) {
@@ -914,7 +928,7 @@ static bool computeUnrollCount(
static bool tryToUnrollLoop(Loop *L, DominatorTree &DT, LoopInfo *LI,
ScalarEvolution *SE, const TargetTransformInfo &TTI,
AssumptionCache &AC, OptimizationRemarkEmitter &ORE,
- bool PreserveLCSSA,
+ bool PreserveLCSSA, int OptLevel,
Optional<unsigned> ProvidedCount,
Optional<unsigned> ProvidedThreshold,
Optional<bool> ProvidedAllowPartial,
@@ -934,7 +948,7 @@ static bool tryToUnrollLoop(Loop *L, DominatorTree &DT, LoopInfo *LI,
bool NotDuplicatable;
bool Convergent;
TargetTransformInfo::UnrollingPreferences UP = gatherUnrollingPreferences(
- L, TTI, ProvidedThreshold, ProvidedCount, ProvidedAllowPartial,
+ L, TTI, OptLevel, ProvidedThreshold, ProvidedCount, ProvidedAllowPartial,
ProvidedRuntime, ProvidedUpperBound);
// Exit early if unrolling is disabled.
if (UP.Threshold == 0 && (!UP.Partial || UP.PartialThreshold == 0))
@@ -1034,16 +1048,17 @@ namespace {
class LoopUnroll : public LoopPass {
public:
static char ID; // Pass ID, replacement for typeid
- LoopUnroll(Optional<unsigned> Threshold = None,
+ LoopUnroll(int OptLevel = 2, Optional<unsigned> Threshold = None,
Optional<unsigned> Count = None,
Optional<bool> AllowPartial = None, Optional<bool> Runtime = None,
Optional<bool> UpperBound = None)
- : LoopPass(ID), ProvidedCount(std::move(Count)),
+ : LoopPass(ID), OptLevel(OptLevel), ProvidedCount(std::move(Count)),
ProvidedThreshold(Threshold), ProvidedAllowPartial(AllowPartial),
ProvidedRuntime(Runtime), ProvidedUpperBound(UpperBound) {
initializeLoopUnrollPass(*PassRegistry::getPassRegistry());
}
+ int OptLevel;
Optional<unsigned> ProvidedCount;
Optional<unsigned> ProvidedThreshold;
Optional<bool> ProvidedAllowPartial;
@@ -1068,7 +1083,7 @@ public:
OptimizationRemarkEmitter ORE(&F);
bool PreserveLCSSA = mustPreserveAnalysisID(LCSSAID);
- return tryToUnrollLoop(L, DT, LI, SE, TTI, AC, ORE, PreserveLCSSA,
+ return tryToUnrollLoop(L, DT, LI, SE, TTI, AC, ORE, PreserveLCSSA, OptLevel,
ProvidedCount, ProvidedThreshold,
ProvidedAllowPartial, ProvidedRuntime,
ProvidedUpperBound);
@@ -1094,26 +1109,27 @@ INITIALIZE_PASS_DEPENDENCY(LoopPass)
INITIALIZE_PASS_DEPENDENCY(TargetTransformInfoWrapperPass)
INITIALIZE_PASS_END(LoopUnroll, "loop-unroll", "Unroll loops", false, false)
-Pass *llvm::createLoopUnrollPass(int Threshold, int Count, int AllowPartial,
- int Runtime, int UpperBound) {
+Pass *llvm::createLoopUnrollPass(int OptLevel, int Threshold, int Count,
+ int AllowPartial, int Runtime,
+ int UpperBound) {
// TODO: It would make more sense for this function to take the optionals
// directly, but that's dangerous since it would silently break out of tree
// callers.
- return new LoopUnroll(Threshold == -1 ? None : Optional<unsigned>(Threshold),
- Count == -1 ? None : Optional<unsigned>(Count),
- AllowPartial == -1 ? None
- : Optional<bool>(AllowPartial),
- Runtime == -1 ? None : Optional<bool>(Runtime),
- UpperBound == -1 ? None : Optional<bool>(UpperBound));
+ return new LoopUnroll(
+ OptLevel, Threshold == -1 ? None : Optional<unsigned>(Threshold),
+ Count == -1 ? None : Optional<unsigned>(Count),
+ AllowPartial == -1 ? None : Optional<bool>(AllowPartial),
+ Runtime == -1 ? None : Optional<bool>(Runtime),
+ UpperBound == -1 ? None : Optional<bool>(UpperBound));
}
-Pass *llvm::createSimpleLoopUnrollPass() {
- return llvm::createLoopUnrollPass(-1, -1, 0, 0, 0);
+Pass *llvm::createSimpleLoopUnrollPass(int OptLevel) {
+ return llvm::createLoopUnrollPass(OptLevel, -1, -1, 0, 0, 0);
}
PreservedAnalyses LoopUnrollPass::run(Loop &L, LoopAnalysisManager &AM,
LoopStandardAnalysisResults &AR,
- LPMUpdater &) {
+ LPMUpdater &Updater) {
const auto &FAM =
AM.getResult<FunctionAnalysisManagerLoopProxy>(L, AR).getManager();
Function *F = L.getHeader()->getParent();
@@ -1124,12 +1140,84 @@ PreservedAnalyses LoopUnrollPass::run(Loop &L, LoopAnalysisManager &AM,
report_fatal_error("LoopUnrollPass: OptimizationRemarkEmitterAnalysis not "
"cached at a higher level");
- bool Changed = tryToUnrollLoop(&L, AR.DT, &AR.LI, &AR.SE, AR.TTI, AR.AC, *ORE,
- /*PreserveLCSSA*/ true, ProvidedCount,
- ProvidedThreshold, ProvidedAllowPartial,
- ProvidedRuntime, ProvidedUpperBound);
-
+ // Keep track of the previous loop structure so we can identify new loops
+ // created by unrolling.
+ Loop *ParentL = L.getParentLoop();
+ SmallPtrSet<Loop *, 4> OldLoops;
+ if (ParentL)
+ OldLoops.insert(ParentL->begin(), ParentL->end());
+ else
+ OldLoops.insert(AR.LI.begin(), AR.LI.end());
+
+ // The API here is quite complex to call, but there are only two interesting
+ // states we support: partial and full (or "simple") unrolling. However, to
+ // enable these we actually pass None in for the optionals to avoid
+ // providing an explicit choice.
+ Optional<bool> AllowPartialParam, RuntimeParam, UpperBoundParam;
+ if (!AllowPartialUnrolling)
+ AllowPartialParam = RuntimeParam = UpperBoundParam = false;
+ bool Changed = tryToUnrollLoop(
+ &L, AR.DT, &AR.LI, &AR.SE, AR.TTI, AR.AC, *ORE,
+ /*PreserveLCSSA*/ true, OptLevel, /*Count*/ None,
+ /*Threshold*/ None, AllowPartialParam, RuntimeParam, UpperBoundParam);
if (!Changed)
return PreservedAnalyses::all();
+
+ // The parent must not be damaged by unrolling!
+#ifndef NDEBUG
+ if (ParentL)
+ ParentL->verifyLoop();
+#endif
+
+ // Unrolling can do several things to introduce new loops into a loop nest:
+ // - Partial unrolling clones child loops within the current loop. If it
+ // uses a remainder, then it can also create any number of sibling loops.
+ // - Full unrolling clones child loops within the current loop but then
+ // removes the current loop making all of the children appear to be new
+ // sibling loops.
+ // - Loop peeling can directly introduce new sibling loops by peeling one
+ // iteration.
+ //
+ // When a new loop appears as a sibling loop, either from peeling an
+ // iteration or fully unrolling, its nesting structure has fundamentally
+ // changed and we want to revisit it to reflect that.
+ //
+ // When unrolling has removed the current loop, we need to tell the
+ // infrastructure that it is gone.
+ //
+ // Finally, we support a debugging/testing mode where we revisit child loops
+ // as well. These are not expected to require further optimizations as either
+ // they or the loop they were cloned from have been directly visited already.
+ // But the debugging mode allows us to check this assumption.
+ bool IsCurrentLoopValid = false;
+ SmallVector<Loop *, 4> SibLoops;
+ if (ParentL)
+ SibLoops.append(ParentL->begin(), ParentL->end());
+ else
+ SibLoops.append(AR.LI.begin(), AR.LI.end());
+ erase_if(SibLoops, [&](Loop *SibLoop) {
+ if (SibLoop == &L) {
+ IsCurrentLoopValid = true;
+ return true;
+ }
+
+ // Otherwise erase the loop from the list if it was in the old loops.
+ return OldLoops.count(SibLoop) != 0;
+ });
+ Updater.addSiblingLoops(SibLoops);
+
+ if (!IsCurrentLoopValid) {
+ Updater.markLoopAsDeleted(L);
+ } else {
+ // We can only walk child loops if the current loop remained valid.
+ if (UnrollRevisitChildLoops) {
+ // Walk *all* of the child loops. This is a highly speculative mode
+ // anyway, so look for any simplifications that arose from partial
+ // unrolling or from peeling off iterations.
+ SmallVector<Loop *, 4> ChildLoops(L.begin(), L.end());
+ Updater.addChildLoops(ChildLoops);
+ }
+ }
+
return getLoopPassPreservedAnalyses();
}
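
The new-sibling detection above amounts to a set difference against the pre-unroll snapshot, plus a liveness check on the current loop. A hedged standalone sketch (helper name hypothetical):

// Collect loops at the current nesting level that are new since the snapshot,
// and detect whether the just-unrolled loop survived.
SmallVector<Loop *, 4> findNewSiblings(ArrayRef<Loop *> Now,
                                       const SmallPtrSetImpl<Loop *> &OldLoops,
                                       Loop *Current, bool &CurrentAlive) {
  SmallVector<Loop *, 4> NewSibs;
  CurrentAlive = false;
  for (Loop *L : Now) {
    if (L == Current) {
      CurrentAlive = true;   // the current loop is still in the nest
      continue;              // never report it as a new sibling
    }
    if (!OldLoops.count(L))
      NewSibs.push_back(L);  // not in the snapshot: created by unroll/peel
  }
  return NewSibs;
}
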
diff --git a/lib/Transforms/Scalar/LoopUnswitch.cpp b/lib/Transforms/Scalar/LoopUnswitch.cpp
index 76fe91884c7b..a99c9999c619 100644
--- a/lib/Transforms/Scalar/LoopUnswitch.cpp
+++ b/lib/Transforms/Scalar/LoopUnswitch.cpp
@@ -33,6 +33,7 @@
#include "llvm/Analysis/GlobalsModRef.h"
#include "llvm/Analysis/AssumptionCache.h"
#include "llvm/Analysis/CodeMetrics.h"
+#include "llvm/Analysis/DivergenceAnalysis.h"
#include "llvm/Analysis/InstructionSimplify.h"
#include "llvm/Analysis/LoopInfo.h"
#include "llvm/Analysis/LoopPass.h"
@@ -47,6 +48,7 @@
#include "llvm/IR/Dominators.h"
#include "llvm/IR/Function.h"
#include "llvm/IR/Instructions.h"
+#include "llvm/IR/InstrTypes.h"
#include "llvm/IR/Module.h"
#include "llvm/IR/MDBuilder.h"
#include "llvm/Support/CommandLine.h"
@@ -77,19 +79,6 @@ static cl::opt<unsigned>
Threshold("loop-unswitch-threshold", cl::desc("Max loop size to unswitch"),
cl::init(100), cl::Hidden);
-static cl::opt<bool>
-LoopUnswitchWithBlockFrequency("loop-unswitch-with-block-frequency",
- cl::init(false), cl::Hidden,
- cl::desc("Enable the use of the block frequency analysis to access PGO "
- "heuristics to minimize code growth in cold regions."));
-
-static cl::opt<unsigned>
-ColdnessThreshold("loop-unswitch-coldness-threshold", cl::init(1), cl::Hidden,
- cl::desc("Coldness threshold in percentage. The loop header frequency "
- "(relative to the entry frequency) is compared with this "
- "threshold to determine if non-trivial unswitching should be "
- "enabled."));
-
namespace {
class LUAnalysisCache {
@@ -174,13 +163,6 @@ namespace {
LUAnalysisCache BranchesInfo;
- bool EnabledPGO;
-
- // BFI and ColdEntryFreq are only used when PGO and
- // LoopUnswitchWithBlockFrequency are enabled.
- BlockFrequencyInfo BFI;
- BlockFrequency ColdEntryFreq;
-
bool OptimizeForSize;
bool redoLoop;
@@ -199,12 +181,14 @@ namespace {
// NewBlocks contained cloned copy of basic blocks from LoopBlocks.
std::vector<BasicBlock*> NewBlocks;
+ bool hasBranchDivergence;
+
public:
static char ID; // Pass ID, replacement for typeid
- explicit LoopUnswitch(bool Os = false) :
+ explicit LoopUnswitch(bool Os = false, bool hasBranchDivergence = false) :
LoopPass(ID), OptimizeForSize(Os), redoLoop(false),
currentLoop(nullptr), DT(nullptr), loopHeader(nullptr),
- loopPreheader(nullptr) {
+ loopPreheader(nullptr), hasBranchDivergence(hasBranchDivergence) {
initializeLoopUnswitchPass(*PassRegistry::getPassRegistry());
}
@@ -217,6 +201,8 @@ namespace {
void getAnalysisUsage(AnalysisUsage &AU) const override {
AU.addRequired<AssumptionCacheTracker>();
AU.addRequired<TargetTransformInfoWrapperPass>();
+ if (hasBranchDivergence)
+ AU.addRequired<DivergenceAnalysis>();
getLoopAnalysisUsage(AU);
}
@@ -255,6 +241,11 @@ namespace {
TerminatorInst *TI);
void SimplifyCode(std::vector<Instruction*> &Worklist, Loop *L);
+
+ /// Given that Invariant is known not to be equal to Val, simplify
+ /// instructions in the loop.
+ Value *SimplifyInstructionWithNotEqual(Instruction *Inst, Value *Invariant,
+ Constant *Val);
};
}
@@ -381,16 +372,35 @@ INITIALIZE_PASS_BEGIN(LoopUnswitch, "loop-unswitch", "Unswitch loops",
INITIALIZE_PASS_DEPENDENCY(AssumptionCacheTracker)
INITIALIZE_PASS_DEPENDENCY(LoopPass)
INITIALIZE_PASS_DEPENDENCY(TargetTransformInfoWrapperPass)
+INITIALIZE_PASS_DEPENDENCY(DivergenceAnalysis)
INITIALIZE_PASS_END(LoopUnswitch, "loop-unswitch", "Unswitch loops",
false, false)
-Pass *llvm::createLoopUnswitchPass(bool Os) {
- return new LoopUnswitch(Os);
+Pass *llvm::createLoopUnswitchPass(bool Os, bool hasBranchDivergence) {
+ return new LoopUnswitch(Os, hasBranchDivergence);
}
+/// Operator chain lattice.
+enum OperatorChain {
+ OC_OpChainNone, ///< There is no operator.
+ OC_OpChainOr, ///< There are only ORs.
+ OC_OpChainAnd, ///< There are only ANDs.
+ OC_OpChainMixed ///< There are ANDs and ORs.
+};
+
/// Cond is a condition that occurs in L. If it is invariant in the loop, or has
/// an invariant piece, return the invariant. Otherwise, return null.
+///
+/// NOTE: FindLIVLoopCondition will not return a partial LIV by walking up a
+/// mixed operator chain, as we cannot reliably find a value which will
+/// simplify the operator chain. If the chain is AND-only or OR-only, we can
+/// use 0 or ~0 to simplify the chain.
+///
+/// NOTE: In the case of a partial LIV and a mixed operator chain, we may be
+/// able to simplify the condition itself to a loop-variant condition, but at
+/// the cost of creating an entirely new loop.
static Value *FindLIVLoopCondition(Value *Cond, Loop *L, bool &Changed,
+ OperatorChain &ParentChain,
DenseMap<Value *, Value *> &Cache) {
auto CacheIt = Cache.find(Cond);
if (CacheIt != Cache.end())
@@ -414,21 +424,53 @@ static Value *FindLIVLoopCondition(Value *Cond, Loop *L, bool &Changed,
return Cond;
}
+ // Walk up the operator chain to find partial invariant conditions.
if (BinaryOperator *BO = dyn_cast<BinaryOperator>(Cond))
if (BO->getOpcode() == Instruction::And ||
BO->getOpcode() == Instruction::Or) {
- // If either the left or right side is invariant, we can unswitch on this,
- // which will cause the branch to go away in one loop and the condition to
- // simplify in the other one.
- if (Value *LHS =
- FindLIVLoopCondition(BO->getOperand(0), L, Changed, Cache)) {
- Cache[Cond] = LHS;
- return LHS;
+ // Given the previous operator, compute the current operator chain status.
+ OperatorChain NewChain;
+ switch (ParentChain) {
+ case OC_OpChainNone:
+ NewChain = BO->getOpcode() == Instruction::And ? OC_OpChainAnd :
+ OC_OpChainOr;
+ break;
+ case OC_OpChainOr:
+ NewChain = BO->getOpcode() == Instruction::Or ? OC_OpChainOr :
+ OC_OpChainMixed;
+ break;
+ case OC_OpChainAnd:
+ NewChain = BO->getOpcode() == Instruction::And ? OC_OpChainAnd :
+ OC_OpChainMixed;
+ break;
+ case OC_OpChainMixed:
+ NewChain = OC_OpChainMixed;
+ break;
}
- if (Value *RHS =
- FindLIVLoopCondition(BO->getOperand(1), L, Changed, Cache)) {
- Cache[Cond] = RHS;
- return RHS;
+
+ // If we reach a Mixed state, we do not want to keep walking up, as we cannot
+ // reliably find a value that will simplify the chain. With this check, we
+ // return null at the first sight of a mixed chain, and the caller will either
+ // backtrack to find a partial LIV in the other operand or return null.
+ if (NewChain != OC_OpChainMixed) {
+ // Update the current operator chain type before we search up the chain.
+ ParentChain = NewChain;
+ // If either the left or right side is invariant, we can unswitch on this,
+ // which will cause the branch to go away in one loop and the condition to
+ // simplify in the other one.
+ if (Value *LHS = FindLIVLoopCondition(BO->getOperand(0), L, Changed,
+ ParentChain, Cache)) {
+ Cache[Cond] = LHS;
+ return LHS;
+ }
+ // We did not manage to find a partial LIV in operand(0). Backtrack and try
+ // operand(1).
+ ParentChain = NewChain;
+ if (Value *RHS = FindLIVLoopCondition(BO->getOperand(1), L, Changed,
+ ParentChain, Cache)) {
+ Cache[Cond] = RHS;
+ return RHS;
+ }
}
}
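
The chain-status update in this hunk is a small lattice join. A hedged sketch of the transition (function name hypothetical):

// Join the parent chain state with the current And/Or opcode: the state
// starts at None, stays And-only or Or-only while opcodes agree, and
// saturates at Mixed (where the walk gives up) as soon as they differ.
static OperatorChain advanceChain(OperatorChain Parent, unsigned Opcode) {
  OperatorChain This =
      Opcode == Instruction::And ? OC_OpChainAnd : OC_OpChainOr;
  if (Parent == OC_OpChainNone)
    return This;
  if (Parent == OC_OpChainMixed || Parent != This)
    return OC_OpChainMixed;
  return Parent;
}
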
@@ -436,9 +478,21 @@ static Value *FindLIVLoopCondition(Value *Cond, Loop *L, bool &Changed,
return nullptr;
}
-static Value *FindLIVLoopCondition(Value *Cond, Loop *L, bool &Changed) {
+/// Cond is a condition that occurs in L. If it is invariant in the loop, or has
+/// an invariant piece, return the invariant along with the operator chain type.
+/// Otherwise, return null.
+static std::pair<Value *, OperatorChain> FindLIVLoopCondition(Value *Cond,
+ Loop *L,
+ bool &Changed) {
DenseMap<Value *, Value *> Cache;
- return FindLIVLoopCondition(Cond, L, Changed, Cache);
+ OperatorChain OpChain = OC_OpChainNone;
+ Value *FCond = FindLIVLoopCondition(Cond, L, Changed, OpChain, Cache);
+
+ // If we do find a LIV, it cannot have been obtained by walking up a mixed
+ // operator chain.
+ assert((!FCond || OpChain != OC_OpChainMixed) &&
+ "Do not expect a partial LIV with mixed operator chain");
+ return {FCond, OpChain};
}
bool LoopUnswitch::runOnLoop(Loop *L, LPPassManager &LPM_Ref) {
@@ -457,19 +511,6 @@ bool LoopUnswitch::runOnLoop(Loop *L, LPPassManager &LPM_Ref) {
if (SanitizeMemory)
computeLoopSafetyInfo(&SafetyInfo, L);
- EnabledPGO = F->getEntryCount().hasValue();
-
- if (LoopUnswitchWithBlockFrequency && EnabledPGO) {
- BranchProbabilityInfo BPI(*F, *LI);
- BFI.calculate(*L->getHeader()->getParent(), BPI, *LI);
-
- // Use BranchProbability to compute a minimum frequency based on
- // function entry baseline frequency. Loops with headers below this
- // frequency are considered as cold.
- const BranchProbability ColdProb(ColdnessThreshold, 100);
- ColdEntryFreq = BlockFrequency(BFI.getEntryFreq()) * ColdProb;
- }
-
bool Changed = false;
do {
assert(currentLoop->isLCSSAForm(*DT));
@@ -581,19 +622,9 @@ bool LoopUnswitch::processCurrentLoop() {
loopHeader->getParent()->hasFnAttribute(Attribute::OptimizeForSize))
return false;
- if (LoopUnswitchWithBlockFrequency && EnabledPGO) {
- // Compute the weighted frequency of the hottest block in the
- // loop (loopHeader in this case since inner loops should be
- // processed before outer loop). If it is less than ColdFrequency,
- // we should not unswitch.
- BlockFrequency LoopEntryFreq = BFI.getBlockFreq(loopHeader);
- if (LoopEntryFreq < ColdEntryFreq)
- return false;
- }
-
for (IntrinsicInst *Guard : Guards) {
Value *LoopCond =
- FindLIVLoopCondition(Guard->getOperand(0), currentLoop, Changed);
+ FindLIVLoopCondition(Guard->getOperand(0), currentLoop, Changed).first;
if (LoopCond &&
UnswitchIfProfitable(LoopCond, ConstantInt::getTrue(Context))) {
// NB! Unswitching (if successful) could have erased some of the
@@ -634,7 +665,7 @@ bool LoopUnswitch::processCurrentLoop() {
// See if this, or some part of it, is loop invariant. If so, we can
// unswitch on it if we desire.
Value *LoopCond = FindLIVLoopCondition(BI->getCondition(),
- currentLoop, Changed);
+ currentLoop, Changed).first;
if (LoopCond &&
UnswitchIfProfitable(LoopCond, ConstantInt::getTrue(Context), TI)) {
++NumBranches;
@@ -642,24 +673,48 @@ bool LoopUnswitch::processCurrentLoop() {
}
}
} else if (SwitchInst *SI = dyn_cast<SwitchInst>(TI)) {
- Value *LoopCond = FindLIVLoopCondition(SI->getCondition(),
- currentLoop, Changed);
+ Value *SC = SI->getCondition();
+ Value *LoopCond;
+ OperatorChain OpChain;
+ std::tie(LoopCond, OpChain) =
+ FindLIVLoopCondition(SC, currentLoop, Changed);
+
unsigned NumCases = SI->getNumCases();
if (LoopCond && NumCases) {
// Find a value to unswitch on:
// FIXME: this should choose the most expensive case!
// FIXME: scan for a case with a non-critical edge?
Constant *UnswitchVal = nullptr;
-
- // Do not process same value again and again.
- // At this point we have some cases already unswitched and
- // some not yet unswitched. Let's find the first not yet unswitched one.
- for (SwitchInst::CaseIt i = SI->case_begin(), e = SI->case_end();
- i != e; ++i) {
- Constant *UnswitchValCandidate = i.getCaseValue();
- if (!BranchesInfo.isUnswitched(SI, UnswitchValCandidate)) {
- UnswitchVal = UnswitchValCandidate;
- break;
+ // Find a case value such that at least one case value is unswitched
+ // out.
+ if (OpChain == OC_OpChainAnd) {
+ // If the chain only has ANDs and the switch has a case value of 0,
+ // dropping a 0 into the chain will unswitch out the 0 case value.
+ auto *AllZero = cast<ConstantInt>(Constant::getNullValue(SC->getType()));
+ if (BranchesInfo.isUnswitched(SI, AllZero))
+ continue;
+ // We are unswitching 0 out.
+ UnswitchVal = AllZero;
+ } else if (OpChain == OC_OpChainOr) {
+ // If the chain only has ORs and the switch has a case value of ~0,
+ // dropping a ~0 into the chain will unswitch out the ~0 case value.
+ auto *AllOne = cast<ConstantInt>(Constant::getAllOnesValue(SC->getType()));
+ if (BranchesInfo.isUnswitched(SI, AllOne))
+ continue;
+ // We are unswitching ~0 out.
+ UnswitchVal = AllOne;
+ } else {
+ assert(OpChain == OC_OpChainNone &&
+ "Expect to unswitch on trivial chain");
+ // Do not process same value again and again.
+ // At this point we have some cases already unswitched and
+ // some not yet unswitched. Let's find the first not yet unswitched one.
+ for (auto Case : SI->cases()) {
+ Constant *UnswitchValCandidate = Case.getCaseValue();
+ if (!BranchesInfo.isUnswitched(SI, UnswitchValCandidate)) {
+ UnswitchVal = UnswitchValCandidate;
+ break;
+ }
}
}
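
Why 0 and ~0 are the right substitutes is easiest to see on a tiny expression; a hedged illustration, not taken from the patch:

// An AND-only chain with the invariant forced to 0 collapses the whole switch
// condition to 0, so the 0 case is fully decided in that loop copy; dually,
// an OR-only chain with the invariant forced to ~0 folds to ~0.
unsigned andChain(unsigned LIV, unsigned X, unsigned Y) { return (LIV & X) & Y; }
unsigned orChain(unsigned LIV, unsigned X, unsigned Y) { return (LIV | X) | Y; }
// andChain(0, X, Y) == 0 and orChain(~0u, X, Y) == ~0u for every X and Y.
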
@@ -668,6 +723,11 @@ bool LoopUnswitch::processCurrentLoop() {
if (UnswitchIfProfitable(LoopCond, UnswitchVal)) {
++NumSwitches;
+ // In case of a full LIV, UnswitchVal is the value we unswitched out.
+ // In case of a partial LIV, we only unswitch when it is an AND-chain
+ // or OR-chain. In both cases the switch input value simplifies to
+ // UnswitchVal.
+ BranchesInfo.setUnswitched(SI, UnswitchVal);
return true;
}
}
@@ -678,7 +738,7 @@ bool LoopUnswitch::processCurrentLoop() {
BBI != E; ++BBI)
if (SelectInst *SI = dyn_cast<SelectInst>(BBI)) {
Value *LoopCond = FindLIVLoopCondition(SI->getCondition(),
- currentLoop, Changed);
+ currentLoop, Changed).first;
if (LoopCond && UnswitchIfProfitable(LoopCond,
ConstantInt::getTrue(Context))) {
++NumSelects;
@@ -753,6 +813,15 @@ bool LoopUnswitch::UnswitchIfProfitable(Value *LoopCond, Constant *Val,
<< ". Cost too high.\n");
return false;
}
+ if (hasBranchDivergence &&
+ getAnalysis<DivergenceAnalysis>().isDivergent(LoopCond)) {
+ DEBUG(dbgs() << "NOT unswitching loop %"
+ << currentLoop->getHeader()->getName()
+ << " at non-trivial condition '" << *Val
+ << "' == " << *LoopCond << "\n"
+ << ". Condition is divergent.\n");
+ return false;
+ }
UnswitchNontrivialCondition(LoopCond, Val, currentLoop, TI);
return true;
@@ -899,7 +968,6 @@ bool LoopUnswitch::TryTrivialLoopUnswitch(bool &Changed) {
if (I.mayHaveSideEffects())
return false;
- // FIXME: add check for constant foldable switch instructions.
if (BranchInst *BI = dyn_cast<BranchInst>(CurrentTerm)) {
if (BI->isUnconditional()) {
CurrentBB = BI->getSuccessor(0);
@@ -911,7 +979,16 @@ bool LoopUnswitch::TryTrivialLoopUnswitch(bool &Changed) {
// Found a trivial condition candidate: non-foldable conditional branch.
break;
}
+ } else if (SwitchInst *SI = dyn_cast<SwitchInst>(CurrentTerm)) {
+ // At this point, any constant-foldable instructions should probably
+ // have been folded already.
+ ConstantInt *Cond = dyn_cast<ConstantInt>(SI->getCondition());
+ if (!Cond)
+ break;
+ // Find the target block we are definitely going to.
+ CurrentBB = SI->findCaseValue(Cond)->getCaseSuccessor();
} else {
+ // We do not understand these terminator instructions.
break;
}
@@ -929,7 +1006,7 @@ bool LoopUnswitch::TryTrivialLoopUnswitch(bool &Changed) {
return false;
Value *LoopCond = FindLIVLoopCondition(BI->getCondition(),
- currentLoop, Changed);
+ currentLoop, Changed).first;
// Unswitch only if the trivial condition itself is an LIV (not
// partial LIV which could occur in and/or)
@@ -960,7 +1037,7 @@ bool LoopUnswitch::TryTrivialLoopUnswitch(bool &Changed) {
} else if (SwitchInst *SI = dyn_cast<SwitchInst>(CurrentTerm)) {
// If this isn't switching on an invariant condition, we can't unswitch it.
Value *LoopCond = FindLIVLoopCondition(SI->getCondition(),
- currentLoop, Changed);
+ currentLoop, Changed).first;
// Unswitch only if the trivial condition itself is an LIV (not
// partial LIV which could occur in and/or)
@@ -973,13 +1050,12 @@ bool LoopUnswitch::TryTrivialLoopUnswitch(bool &Changed) {
// this.
// Note that we can't trivially unswitch on the default case or
// on already unswitched cases.
- for (SwitchInst::CaseIt i = SI->case_begin(), e = SI->case_end();
- i != e; ++i) {
+ for (auto Case : SI->cases()) {
BasicBlock *LoopExitCandidate;
- if ((LoopExitCandidate = isTrivialLoopExitBlock(currentLoop,
- i.getCaseSuccessor()))) {
+ if ((LoopExitCandidate =
+ isTrivialLoopExitBlock(currentLoop, Case.getCaseSuccessor()))) {
// Okay, we found a trivial case, remember the value that is trivial.
- ConstantInt *CaseVal = i.getCaseValue();
+ ConstantInt *CaseVal = Case.getCaseValue();
// Check that it was not unswitched before, since already-unswitched
// trivial values look trivial too.
@@ -998,6 +1074,9 @@ bool LoopUnswitch::TryTrivialLoopUnswitch(bool &Changed) {
UnswitchTrivialCondition(currentLoop, LoopCond, CondVal, LoopExitBB,
nullptr);
+
+ // We are only unswitching full LIV.
+ BranchesInfo.setUnswitched(SI, CondVal);
++NumSwitches;
return true;
}
@@ -1253,18 +1332,38 @@ void LoopUnswitch::RewriteLoopBodyWithConditionConstant(Loop *L, Value *LIC,
if (!UI || !L->contains(UI))
continue;
- Worklist.push_back(UI);
+ // At this point, we know LIC is definitely not Val. Try to use some simple
+ // logic to simplify the user w.r.t. the context.
+ if (Value *Replacement = SimplifyInstructionWithNotEqual(UI, LIC, Val)) {
+ if (LI->replacementPreservesLCSSAForm(UI, Replacement)) {
+ // This in-loop instruction has been simplified w.r.t. its context,
+ // i.e. LIC != Val; make sure we propagate its replacement value to
+ // all its users.
+ //
+ // We cannot delete UI, the LIC user, yet, because that would invalidate
+ // the LIC->users() iterator. However, we can make this instruction
+ // dead by replacing all its users, and push it onto the worklist so that
+ // it can be properly deleted and its operands simplified.
+ UI->replaceAllUsesWith(Replacement);
+ }
+ }
- // TODO: We could do other simplifications, for example, turning
- // 'icmp eq LIC, Val' -> false.
+ // This is a LIC user, push it into the worklist so that SimplifyCode can
+ // attempt to simplify it.
+ Worklist.push_back(UI);
// If we know that LIC is not Val, use this info to simplify code.
SwitchInst *SI = dyn_cast<SwitchInst>(UI);
if (!SI || !isa<ConstantInt>(Val)) continue;
- SwitchInst::CaseIt DeadCase = SI->findCaseValue(cast<ConstantInt>(Val));
+ // NOTE: if a case value for the switch is unswitched out, we record it
+ // after the unswitch finishes. We cannot record it here, as the switch
+ // is not a direct user of the partial LIV.
+ SwitchInst::CaseHandle DeadCase =
+ *SI->findCaseValue(cast<ConstantInt>(Val));
// Default case is live for multiple values.
- if (DeadCase == SI->case_default()) continue;
+ if (DeadCase == *SI->case_default())
+ continue;
// Found a dead case value. Don't remove PHI nodes in the
// successor if they become single-entry, those PHI nodes may
@@ -1274,8 +1373,6 @@ void LoopUnswitch::RewriteLoopBodyWithConditionConstant(Loop *L, Value *LIC,
BasicBlock *SISucc = DeadCase.getCaseSuccessor();
BasicBlock *Latch = L->getLoopLatch();
- BranchesInfo.setUnswitched(SI, Val);
-
if (!SI->findCaseDest(SISucc)) continue; // Edge is critical.
// If the DeadCase successor dominates the loop latch, then the
// transformation isn't safe since it will delete the sole predecessor edge
@@ -1397,3 +1494,27 @@ void LoopUnswitch::SimplifyCode(std::vector<Instruction*> &Worklist, Loop *L) {
}
}
}
+
+/// Simple simplifications we can do given the information that Invariant is
+/// definitely not equal to Val.
+Value *LoopUnswitch::SimplifyInstructionWithNotEqual(Instruction *Inst,
+ Value *Invariant,
+ Constant *Val) {
+ // icmp eq cond, val -> false
+ ICmpInst *CI = dyn_cast<ICmpInst>(Inst);
+ if (CI && CI->isEquality()) {
+ Value *Op0 = CI->getOperand(0);
+ Value *Op1 = CI->getOperand(1);
+ if ((Op0 == Invariant && Op1 == Val) || (Op0 == Val && Op1 == Invariant)) {
+ LLVMContext &Ctx = Inst->getContext();
+ if (CI->getPredicate() == CmpInst::ICMP_EQ)
+ return ConstantInt::getFalse(Ctx);
+ else
+ return ConstantInt::getTrue(Ctx);
+ }
+ }
+
+ // FIXME: there may be other opportunities, e.g. comparison with floating
+ // point, or Invariant - Val != 0, etc.
+ return nullptr;
+}
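
A hedged note on what the helper decides, and what it deliberately leaves alone, given Invariant = %liv, Val = 5, and the loop copy where %liv != 5:

//   %c = icmp eq i32 %liv, 5   ->  i1 false  (equality decided by %liv != 5)
//   %c = icmp ne i32 %liv, 5   ->  i1 true
//   %c = icmp slt i32 %liv, 5  ->  nullptr   (ordering is not decided; the
//                                             isEquality() guard skips it)
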
diff --git a/lib/Transforms/Scalar/LowerExpectIntrinsic.cpp b/lib/Transforms/Scalar/LowerExpectIntrinsic.cpp
index 52975ef35153..a143b9a3c645 100644
--- a/lib/Transforms/Scalar/LowerExpectIntrinsic.cpp
+++ b/lib/Transforms/Scalar/LowerExpectIntrinsic.cpp
@@ -67,11 +67,11 @@ static bool handleSwitchExpect(SwitchInst &SI) {
if (!ExpectedValue)
return false;
- SwitchInst::CaseIt Case = SI.findCaseValue(ExpectedValue);
+ SwitchInst::CaseHandle Case = *SI.findCaseValue(ExpectedValue);
unsigned n = SI.getNumCases(); // +1 for default case.
SmallVector<uint32_t, 16> Weights(n + 1, UnlikelyBranchWeight);
- if (Case == SI.case_default())
+ if (Case == *SI.case_default())
Weights[0] = LikelyBranchWeight;
else
Weights[Case.getCaseIndex() + 1] = LikelyBranchWeight;
diff --git a/lib/Transforms/Scalar/MemCpyOptimizer.cpp b/lib/Transforms/Scalar/MemCpyOptimizer.cpp
index 1b590140f70a..a3f3f25c1e0f 100644
--- a/lib/Transforms/Scalar/MemCpyOptimizer.cpp
+++ b/lib/Transforms/Scalar/MemCpyOptimizer.cpp
@@ -12,20 +12,49 @@
//
//===----------------------------------------------------------------------===//
-#include "llvm/Transforms/Scalar/MemCpyOptimizer.h"
-#include "llvm/Transforms/Scalar.h"
#include "llvm/ADT/DenseSet.h"
+#include "llvm/ADT/iterator_range.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/ADT/Statistic.h"
+#include "llvm/ADT/STLExtras.h"
+#include "llvm/Analysis/AssumptionCache.h"
+#include "llvm/Analysis/GlobalsModRef.h"
+#include "llvm/Analysis/MemoryDependenceAnalysis.h"
+#include "llvm/Analysis/MemoryLocation.h"
+#include "llvm/Analysis/TargetLibraryInfo.h"
#include "llvm/Analysis/ValueTracking.h"
+#include "llvm/IR/Argument.h"
+#include "llvm/IR/Constants.h"
#include "llvm/IR/DataLayout.h"
+#include "llvm/IR/DerivedTypes.h"
+#include "llvm/IR/Dominators.h"
+#include "llvm/IR/Function.h"
#include "llvm/IR/GetElementPtrTypeIterator.h"
#include "llvm/IR/GlobalVariable.h"
+#include "llvm/IR/InstrTypes.h"
+#include "llvm/IR/Instruction.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/IR/IntrinsicInst.h"
+#include "llvm/IR/Intrinsics.h"
#include "llvm/IR/IRBuilder.h"
+#include "llvm/IR/LLVMContext.h"
+#include "llvm/IR/Module.h"
+#include "llvm/IR/Operator.h"
+#include "llvm/IR/Type.h"
+#include "llvm/IR/User.h"
+#include "llvm/IR/Value.h"
+#include "llvm/Pass.h"
+#include "llvm/Support/Casting.h"
#include "llvm/Support/Debug.h"
+#include "llvm/Support/MathExtras.h"
#include "llvm/Support/raw_ostream.h"
+#include "llvm/Transforms/Scalar.h"
+#include "llvm/Transforms/Scalar/MemCpyOptimizer.h"
#include "llvm/Transforms/Utils/Local.h"
#include <algorithm>
+#include <cassert>
+#include <cstdint>
+
using namespace llvm;
#define DEBUG_TYPE "memcpyopt"
@@ -119,6 +148,7 @@ static bool IsPointerOffset(Value *Ptr1, Value *Ptr2, int64_t &Offset,
return true;
}
+namespace {
/// Represents a range of memset'd bytes with the ByteVal value.
/// This allows us to analyze stores like:
@@ -130,7 +160,6 @@ static bool IsPointerOffset(Value *Ptr1, Value *Ptr2, int64_t &Offset,
/// the first store, we make a range [1, 2). The second store extends the range
/// to [0, 2). The third makes a new range [2, 3). The fourth store joins the
/// two ranges into [0, 3) which is memset'able.
-namespace {
struct MemsetRange {
// Start/End - A semi range that describes the span that this range covers.
// The range is closed at the start and open at the end: [Start, End).
@@ -148,7 +177,8 @@ struct MemsetRange {
bool isProfitableToUseMemset(const DataLayout &DL) const;
};
-} // end anon namespace
+
+} // end anonymous namespace
bool MemsetRange::isProfitableToUseMemset(const DataLayout &DL) const {
// If we found more than 4 stores to merge or 16 bytes, use memset.
@@ -192,13 +222,14 @@ bool MemsetRange::isProfitableToUseMemset(const DataLayout &DL) const {
return TheStores.size() > NumPointerStores+NumByteStores;
}
-
namespace {
+
class MemsetRanges {
/// A sorted list of the memset ranges.
SmallVector<MemsetRange, 8> Ranges;
typedef SmallVectorImpl<MemsetRange>::iterator range_iterator;
const DataLayout &DL;
+
public:
MemsetRanges(const DataLayout &DL) : DL(DL) {}
@@ -231,8 +262,7 @@ public:
};
-} // end anon namespace
-
+} // end anonymous namespace
/// Add a new store to the MemsetRanges data structure. This adds a
/// new range for the specified store at the specified offset, merging into
@@ -299,48 +329,36 @@ void MemsetRanges::addRange(int64_t Start, int64_t Size, Value *Ptr,
//===----------------------------------------------------------------------===//
namespace {
- class MemCpyOptLegacyPass : public FunctionPass {
- MemCpyOptPass Impl;
- public:
- static char ID; // Pass identification, replacement for typeid
- MemCpyOptLegacyPass() : FunctionPass(ID) {
- initializeMemCpyOptLegacyPassPass(*PassRegistry::getPassRegistry());
- }
- bool runOnFunction(Function &F) override;
-
- private:
- // This transformation requires dominator postdominator info
- void getAnalysisUsage(AnalysisUsage &AU) const override {
- AU.setPreservesCFG();
- AU.addRequired<AssumptionCacheTracker>();
- AU.addRequired<DominatorTreeWrapperPass>();
- AU.addRequired<MemoryDependenceWrapperPass>();
- AU.addRequired<AAResultsWrapperPass>();
- AU.addRequired<TargetLibraryInfoWrapperPass>();
- AU.addPreserved<GlobalsAAWrapperPass>();
- AU.addPreserved<MemoryDependenceWrapperPass>();
- }
+class MemCpyOptLegacyPass : public FunctionPass {
+ MemCpyOptPass Impl;
- // Helper functions
- bool processStore(StoreInst *SI, BasicBlock::iterator &BBI);
- bool processMemSet(MemSetInst *SI, BasicBlock::iterator &BBI);
- bool processMemCpy(MemCpyInst *M);
- bool processMemMove(MemMoveInst *M);
- bool performCallSlotOptzn(Instruction *cpy, Value *cpyDst, Value *cpySrc,
- uint64_t cpyLen, unsigned cpyAlign, CallInst *C);
- bool processMemCpyMemCpyDependence(MemCpyInst *M, MemCpyInst *MDep);
- bool processMemSetMemCpyDependence(MemCpyInst *M, MemSetInst *MDep);
- bool performMemCpyToMemSetOptzn(MemCpyInst *M, MemSetInst *MDep);
- bool processByValArgument(CallSite CS, unsigned ArgNo);
- Instruction *tryMergingIntoMemset(Instruction *I, Value *StartPtr,
- Value *ByteVal);
-
- bool iterateOnFunction(Function &F);
- };
+public:
+ static char ID; // Pass identification, replacement for typeid
- char MemCpyOptLegacyPass::ID = 0;
-}
+ MemCpyOptLegacyPass() : FunctionPass(ID) {
+ initializeMemCpyOptLegacyPassPass(*PassRegistry::getPassRegistry());
+ }
+
+ bool runOnFunction(Function &F) override;
+
+private:
+ // This transformation requires dominator and postdominator info
+ void getAnalysisUsage(AnalysisUsage &AU) const override {
+ AU.setPreservesCFG();
+ AU.addRequired<AssumptionCacheTracker>();
+ AU.addRequired<DominatorTreeWrapperPass>();
+ AU.addRequired<MemoryDependenceWrapperPass>();
+ AU.addRequired<AAResultsWrapperPass>();
+ AU.addRequired<TargetLibraryInfoWrapperPass>();
+ AU.addPreserved<GlobalsAAWrapperPass>();
+ AU.addPreserved<MemoryDependenceWrapperPass>();
+ }
+};
+
+char MemCpyOptLegacyPass::ID = 0;
+
+} // end anonymous namespace
/// The public interface to this file...
FunctionPass *llvm::createMemCpyOptPass() { return new MemCpyOptLegacyPass(); }
@@ -523,14 +541,15 @@ static bool moveUp(AliasAnalysis &AA, StoreInst *SI, Instruction *P,
if (Args.erase(C))
NeedLift = true;
else if (MayAlias) {
- NeedLift = any_of(MemLocs, [C, &AA](const MemoryLocation &ML) {
+ NeedLift = llvm::any_of(MemLocs, [C, &AA](const MemoryLocation &ML) {
return AA.getModRefInfo(C, ML);
});
if (!NeedLift)
- NeedLift = any_of(CallSites, [C, &AA](const ImmutableCallSite &CS) {
- return AA.getModRefInfo(C, CS);
- });
+ NeedLift =
+ llvm::any_of(CallSites, [C, &AA](const ImmutableCallSite &CS) {
+ return AA.getModRefInfo(C, CS);
+ });
}
if (!NeedLift)
@@ -567,7 +586,7 @@ static bool moveUp(AliasAnalysis &AA, StoreInst *SI, Instruction *P,
}
// We made it, we need to lift
- for (auto *I : reverse(ToLift)) {
+ for (auto *I : llvm::reverse(ToLift)) {
DEBUG(dbgs() << "Lifting " << *I << " before " << *P << "\n");
I->moveBefore(P);
}
@@ -761,7 +780,6 @@ bool MemCpyOptPass::processMemSet(MemSetInst *MSI, BasicBlock::iterator &BBI) {
return false;
}
-
/// Takes a memcpy and a call that it depends on,
/// and checks for the possibility of a call slot optimization by having
/// the call write its result directly into the destination of the memcpy.
@@ -914,6 +932,17 @@ bool MemCpyOptPass::performCallSlotOptzn(Instruction *cpy, Value *cpyDest,
if (MR != MRI_NoModRef)
return false;
+ // We can't create address space casts here because we don't know if they're
+ // safe for the target.
+ if (cpySrc->getType()->getPointerAddressSpace() !=
+ cpyDest->getType()->getPointerAddressSpace())
+ return false;
+ for (unsigned i = 0; i < CS.arg_size(); ++i)
+ if (CS.getArgument(i)->stripPointerCasts() == cpySrc &&
+ cpySrc->getType()->getPointerAddressSpace() !=
+ CS.getArgument(i)->getType()->getPointerAddressSpace())
+ return false;
+
// All the checks have passed, so do the transformation.
bool changedArgument = false;
for (unsigned i = 0; i < CS.arg_size(); ++i)
@@ -1240,7 +1269,7 @@ bool MemCpyOptPass::processMemCpy(MemCpyInst *M) {
bool MemCpyOptPass::processMemMove(MemMoveInst *M) {
AliasAnalysis &AA = LookupAliasAnalysis();
- if (!TLI->has(LibFunc::memmove))
+ if (!TLI->has(LibFunc_memmove))
return false;
// See if the pointers alias.
@@ -1306,6 +1335,11 @@ bool MemCpyOptPass::processByValArgument(CallSite CS, unsigned ArgNo) {
CS.getInstruction(), &AC, &DT) < ByValAlign)
return false;
+ // The address space of the memcpy source must match the byval argument
+ if (MDep->getSource()->getType()->getPointerAddressSpace() !=
+ ByValArg->getType()->getPointerAddressSpace())
+ return false;
+
// Verify that the copied-from memory doesn't change in between the memcpy and
// the byval call.
// memcpy(a <- b)
@@ -1375,7 +1409,6 @@ bool MemCpyOptPass::iterateOnFunction(Function &F) {
}
PreservedAnalyses MemCpyOptPass::run(Function &F, FunctionAnalysisManager &AM) {
-
auto &MD = AM.getResult<MemoryDependenceAnalysis>(F);
auto &TLI = AM.getResult<TargetLibraryAnalysis>(F);
@@ -1393,7 +1426,9 @@ PreservedAnalyses MemCpyOptPass::run(Function &F, FunctionAnalysisManager &AM) {
LookupAssumptionCache, LookupDomTree);
if (!MadeChange)
return PreservedAnalyses::all();
+
PreservedAnalyses PA;
+ PA.preserveSet<CFGAnalyses>();
PA.preserve<GlobalsAA>();
PA.preserve<MemoryDependenceAnalysis>();
return PA;
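For context, the preservation idiom introduced here is small enough to show in isolation. This is a minimal sketch of a hypothetical pass, not this patch's code:

#include "llvm/IR/PassManager.h"
using namespace llvm;

struct MyPass : PassInfoMixin<MyPass> {
  PreservedAnalyses run(Function &F, FunctionAnalysisManager &AM) {
    bool MadeChange = false; // ... transform F without altering the CFG ...
    if (!MadeChange)
      return PreservedAnalyses::all();
    PreservedAnalyses PA;
    // One call preserves every analysis in the CFG set (dominators, loops,
    // ...), replacing per-analysis preserve<> calls in CFG-preserving passes.
    PA.preserveSet<CFGAnalyses>();
    return PA;
  }
};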
@@ -1414,10 +1449,10 @@ bool MemCpyOptPass::runImpl(
// If we don't have at least memset and memcpy, there is little point of doing
// anything here. These are required by a freestanding implementation, so if
// even they are disabled, there is no point in trying hard.
- if (!TLI->has(LibFunc::memset) || !TLI->has(LibFunc::memcpy))
+ if (!TLI->has(LibFunc_memset) || !TLI->has(LibFunc_memcpy))
return false;
- while (1) {
+ while (true) {
if (!iterateOnFunction(F))
break;
MadeChange = true;
diff --git a/lib/Transforms/Scalar/MergedLoadStoreMotion.cpp b/lib/Transforms/Scalar/MergedLoadStoreMotion.cpp
index 6a64c6b3619c..acd3ef6791be 100644
--- a/lib/Transforms/Scalar/MergedLoadStoreMotion.cpp
+++ b/lib/Transforms/Scalar/MergedLoadStoreMotion.cpp
@@ -19,6 +19,8 @@
// thinks it safe to do so. This optimization helps with eg. hiding load
// latencies, triggering if-conversion, and reducing static code size.
//
+// NOTE: This code no longer performs load hoisting; that is subsumed by GVNHoist.
+//
//===----------------------------------------------------------------------===//
//
//
@@ -87,7 +89,6 @@
#include "llvm/Support/raw_ostream.h"
#include "llvm/Transforms/Scalar.h"
#include "llvm/Transforms/Utils/BasicBlockUtils.h"
-#include "llvm/Transforms/Utils/SSAUpdater.h"
using namespace llvm;
@@ -118,16 +119,6 @@ private:
void removeInstruction(Instruction *Inst);
BasicBlock *getDiamondTail(BasicBlock *BB);
bool isDiamondHead(BasicBlock *BB);
- // Routines for hoisting loads
- bool isLoadHoistBarrierInRange(const Instruction &Start,
- const Instruction &End, LoadInst *LI,
- bool SafeToLoadUnconditionally);
- LoadInst *canHoistFromBlock(BasicBlock *BB, LoadInst *LI);
- void hoistInstruction(BasicBlock *BB, Instruction *HoistCand,
- Instruction *ElseInst);
- bool isSafeToHoist(Instruction *I) const;
- bool hoistLoad(BasicBlock *BB, LoadInst *HoistCand, LoadInst *ElseInst);
- bool mergeLoads(BasicBlock *BB);
// Routines for sinking stores
StoreInst *canSinkFromBlock(BasicBlock *BB, StoreInst *SI);
PHINode *getPHIOperand(BasicBlock *BB, StoreInst *S0, StoreInst *S1);
@@ -188,169 +179,6 @@ bool MergedLoadStoreMotion::isDiamondHead(BasicBlock *BB) {
return true;
}
-///
-/// \brief True when instruction is a hoist barrier for a load
-///
-/// Whenever an instruction could possibly modify the value
-/// being loaded or protect against the load from happening
-/// it is considered a hoist barrier.
-///
-bool MergedLoadStoreMotion::isLoadHoistBarrierInRange(
- const Instruction &Start, const Instruction &End, LoadInst *LI,
- bool SafeToLoadUnconditionally) {
- if (!SafeToLoadUnconditionally)
- for (const Instruction &Inst :
- make_range(Start.getIterator(), End.getIterator()))
- if (!isGuaranteedToTransferExecutionToSuccessor(&Inst))
- return true;
- MemoryLocation Loc = MemoryLocation::get(LI);
- return AA->canInstructionRangeModRef(Start, End, Loc, MRI_Mod);
-}
-
-///
-/// \brief Decide if a load can be hoisted
-///
-/// When there is a load in \p BB to the same address as \p LI
-/// and it can be hoisted from \p BB, return that load.
-/// Otherwise return Null.
-///
-LoadInst *MergedLoadStoreMotion::canHoistFromBlock(BasicBlock *BB1,
- LoadInst *Load0) {
- BasicBlock *BB0 = Load0->getParent();
- BasicBlock *Head = BB0->getSinglePredecessor();
- bool SafeToLoadUnconditionally = isSafeToLoadUnconditionally(
- Load0->getPointerOperand(), Load0->getAlignment(),
- Load0->getModule()->getDataLayout(),
- /*ScanFrom=*/Head->getTerminator());
- for (BasicBlock::iterator BBI = BB1->begin(), BBE = BB1->end(); BBI != BBE;
- ++BBI) {
- Instruction *Inst = &*BBI;
-
- // Only merge and hoist loads when their result in used only in BB
- auto *Load1 = dyn_cast<LoadInst>(Inst);
- if (!Load1 || Inst->isUsedOutsideOfBlock(BB1))
- continue;
-
- MemoryLocation Loc0 = MemoryLocation::get(Load0);
- MemoryLocation Loc1 = MemoryLocation::get(Load1);
- if (Load0->isSameOperationAs(Load1) && AA->isMustAlias(Loc0, Loc1) &&
- !isLoadHoistBarrierInRange(BB1->front(), *Load1, Load1,
- SafeToLoadUnconditionally) &&
- !isLoadHoistBarrierInRange(BB0->front(), *Load0, Load0,
- SafeToLoadUnconditionally)) {
- return Load1;
- }
- }
- return nullptr;
-}
-
-///
-/// \brief Merge two equivalent instructions \p HoistCand and \p ElseInst into
-/// \p BB
-///
-/// BB is the head of a diamond
-///
-void MergedLoadStoreMotion::hoistInstruction(BasicBlock *BB,
- Instruction *HoistCand,
- Instruction *ElseInst) {
- DEBUG(dbgs() << " Hoist Instruction into BB \n"; BB->dump();
- dbgs() << "Instruction Left\n"; HoistCand->dump(); dbgs() << "\n";
- dbgs() << "Instruction Right\n"; ElseInst->dump(); dbgs() << "\n");
- // Hoist the instruction.
- assert(HoistCand->getParent() != BB);
-
- // Intersect optional metadata.
- HoistCand->andIRFlags(ElseInst);
- HoistCand->dropUnknownNonDebugMetadata();
-
- // Prepend point for instruction insert
- Instruction *HoistPt = BB->getTerminator();
-
- // Merged instruction
- Instruction *HoistedInst = HoistCand->clone();
-
- // Hoist instruction.
- HoistedInst->insertBefore(HoistPt);
-
- HoistCand->replaceAllUsesWith(HoistedInst);
- removeInstruction(HoistCand);
- // Replace the else block instruction.
- ElseInst->replaceAllUsesWith(HoistedInst);
- removeInstruction(ElseInst);
-}
-
-///
-/// \brief Return true if no operand of \p I is defined in I's parent block
-///
-bool MergedLoadStoreMotion::isSafeToHoist(Instruction *I) const {
- BasicBlock *Parent = I->getParent();
- for (Use &U : I->operands())
- if (auto *Instr = dyn_cast<Instruction>(&U))
- if (Instr->getParent() == Parent)
- return false;
- return true;
-}
-
-///
-/// \brief Merge two equivalent loads and GEPs and hoist into diamond head
-///
-bool MergedLoadStoreMotion::hoistLoad(BasicBlock *BB, LoadInst *L0,
- LoadInst *L1) {
- // Only one definition?
- auto *A0 = dyn_cast<Instruction>(L0->getPointerOperand());
- auto *A1 = dyn_cast<Instruction>(L1->getPointerOperand());
- if (A0 && A1 && A0->isIdenticalTo(A1) && isSafeToHoist(A0) &&
- A0->hasOneUse() && (A0->getParent() == L0->getParent()) &&
- A1->hasOneUse() && (A1->getParent() == L1->getParent()) &&
- isa<GetElementPtrInst>(A0)) {
- DEBUG(dbgs() << "Hoist Instruction into BB \n"; BB->dump();
- dbgs() << "Instruction Left\n"; L0->dump(); dbgs() << "\n";
- dbgs() << "Instruction Right\n"; L1->dump(); dbgs() << "\n");
- hoistInstruction(BB, A0, A1);
- hoistInstruction(BB, L0, L1);
- return true;
- }
- return false;
-}
-
-///
-/// \brief Try to hoist two loads to same address into diamond header
-///
-/// Starting from a diamond head block, iterate over the instructions in one
-/// successor block and try to match a load in the second successor.
-///
-bool MergedLoadStoreMotion::mergeLoads(BasicBlock *BB) {
- bool MergedLoads = false;
- assert(isDiamondHead(BB));
- BranchInst *BI = cast<BranchInst>(BB->getTerminator());
- BasicBlock *Succ0 = BI->getSuccessor(0);
- BasicBlock *Succ1 = BI->getSuccessor(1);
- // #Instructions in Succ1 for Compile Time Control
- int Size1 = Succ1->size();
- int NLoads = 0;
- for (BasicBlock::iterator BBI = Succ0->begin(), BBE = Succ0->end();
- BBI != BBE;) {
- Instruction *I = &*BBI;
- ++BBI;
-
- // Don't move non-simple (atomic, volatile) loads.
- auto *L0 = dyn_cast<LoadInst>(I);
- if (!L0 || !L0->isSimple() || L0->isUsedOutsideOfBlock(Succ0))
- continue;
-
- ++NLoads;
- if (NLoads * Size1 >= MagicCompileTimeControl)
- break;
- if (LoadInst *L1 = canHoistFromBlock(Succ1, L0)) {
- bool Res = hoistLoad(BB, L0, L1);
- MergedLoads |= Res;
- // Don't attempt to hoist above loads that had not been hoisted.
- if (!Res)
- break;
- }
- }
- return MergedLoads;
-}
///
/// \brief True when instruction is a sink barrier for a store
@@ -534,7 +362,6 @@ bool MergedLoadStoreMotion::run(Function &F, MemoryDependenceResults *MD,
// Hoist equivalent loads and sink stores
// outside diamonds when possible
if (isDiamondHead(BB)) {
- Changed |= mergeLoads(BB);
Changed |= mergeStores(getDiamondTail(BB));
}
}
@@ -596,8 +423,8 @@ MergedLoadStoreMotionPass::run(Function &F, FunctionAnalysisManager &AM) {
if (!Impl.run(F, MD, AA))
return PreservedAnalyses::all();
- // FIXME: This should also 'preserve the CFG'.
PreservedAnalyses PA;
+ PA.preserveSet<CFGAnalyses>();
PA.preserve<GlobalsAA>();
PA.preserve<MemoryDependenceAnalysis>();
return PA;
diff --git a/lib/Transforms/Scalar/NaryReassociate.cpp b/lib/Transforms/Scalar/NaryReassociate.cpp
index 0a3bf7b4c31b..c5bf2f28d185 100644
--- a/lib/Transforms/Scalar/NaryReassociate.cpp
+++ b/lib/Transforms/Scalar/NaryReassociate.cpp
@@ -156,20 +156,12 @@ PreservedAnalyses NaryReassociatePass::run(Function &F,
auto *TLI = &AM.getResult<TargetLibraryAnalysis>(F);
auto *TTI = &AM.getResult<TargetIRAnalysis>(F);
- bool Changed = runImpl(F, AC, DT, SE, TLI, TTI);
-
- // FIXME: We need to invalidate this to avoid PR28400. Is there a better
- // solution?
- AM.invalidate<ScalarEvolutionAnalysis>(F);
-
- if (!Changed)
+ if (!runImpl(F, AC, DT, SE, TLI, TTI))
return PreservedAnalyses::all();
- // FIXME: This should also 'preserve the CFG'.
PreservedAnalyses PA;
- PA.preserve<DominatorTreeAnalysis>();
+ PA.preserveSet<CFGAnalyses>();
PA.preserve<ScalarEvolutionAnalysis>();
- PA.preserve<TargetLibraryAnalysis>();
return PA;
}
diff --git a/lib/Transforms/Scalar/NewGVN.cpp b/lib/Transforms/Scalar/NewGVN.cpp
index 57e6e3ddad94..3d8ce888867e 100644
--- a/lib/Transforms/Scalar/NewGVN.cpp
+++ b/lib/Transforms/Scalar/NewGVN.cpp
@@ -17,6 +17,27 @@
/// "A Sparse Algorithm for Predicated Global Value Numbering" from
/// Karthik Gargi.
///
+/// A brief overview of the algorithm: The algorithm is essentially the same as
+/// the standard RPO value numbering algorithm (a good reference is the paper
+/// "SCC based value numbering" by L. Taylor Simpson) with one major difference:
+/// The RPO algorithm proceeds, on every iteration, to process every reachable
+/// block and every instruction in that block. This is because the standard RPO
+/// algorithm does not track what things have the same value number, it only
+/// tracks what the value number of a given operation is (the mapping is
+/// operation -> value number). Thus, when a value number of an operation
+/// changes, it must reprocess everything to ensure all uses of a value number
+/// get updated properly. In contrast, the sparse algorithm we use *also*
+/// tracks what operations have a given value number (IE it also tracks the
+/// reverse mapping from value number -> operations with that value number), so
+/// that it only needs to reprocess the instructions that are affected when
+/// something's value number changes. The rest of the algorithm is devoted to
+/// performing symbolic evaluation, forward propagation, and simplification of
+/// operations based on the value numbers deduced so far.
+///
+/// We also do not perform elimination by using any published algorithm. All
+/// published algorithms are O(Instructions). Instead, we use a technique that
+/// is O(number of operations with the same value number), enabling us to skip
+/// trying to eliminate things that have unique value numbers.
//===----------------------------------------------------------------------===//
#include "llvm/Transforms/Scalar/NewGVN.h"
@@ -40,13 +61,10 @@
#include "llvm/Analysis/ConstantFolding.h"
#include "llvm/Analysis/GlobalsModRef.h"
#include "llvm/Analysis/InstructionSimplify.h"
-#include "llvm/Analysis/Loads.h"
#include "llvm/Analysis/MemoryBuiltins.h"
-#include "llvm/Analysis/MemoryDependenceAnalysis.h"
#include "llvm/Analysis/MemoryLocation.h"
-#include "llvm/Analysis/PHITransAddr.h"
+#include "llvm/Analysis/MemorySSA.h"
#include "llvm/Analysis/TargetLibraryInfo.h"
-#include "llvm/Analysis/ValueTracking.h"
#include "llvm/IR/DataLayout.h"
#include "llvm/IR/Dominators.h"
#include "llvm/IR/GlobalVariable.h"
@@ -55,24 +73,25 @@
#include "llvm/IR/LLVMContext.h"
#include "llvm/IR/Metadata.h"
#include "llvm/IR/PatternMatch.h"
-#include "llvm/IR/PredIteratorCache.h"
#include "llvm/IR/Type.h"
#include "llvm/Support/Allocator.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/Debug.h"
+#include "llvm/Support/DebugCounter.h"
#include "llvm/Transforms/Scalar.h"
#include "llvm/Transforms/Scalar/GVNExpression.h"
#include "llvm/Transforms/Utils/BasicBlockUtils.h"
#include "llvm/Transforms/Utils/Local.h"
-#include "llvm/Transforms/Utils/MemorySSA.h"
-#include "llvm/Transforms/Utils/SSAUpdater.h"
+#include "llvm/Transforms/Utils/PredicateInfo.h"
+#include "llvm/Transforms/Utils/VNCoercion.h"
+#include <numeric>
#include <unordered_map>
#include <utility>
#include <vector>
using namespace llvm;
using namespace PatternMatch;
using namespace llvm::GVNExpression;
-
+using namespace llvm::VNCoercion;
#define DEBUG_TYPE "newgvn"
STATISTIC(NumGVNInstrDeleted, "Number of instructions deleted");
@@ -87,6 +106,15 @@ STATISTIC(NumGVNAvoidedSortedLeaderChanges,
"Number of avoided sorted leader changes");
STATISTIC(NumGVNNotMostDominatingLeader,
"Number of times a member dominated it's new classes' leader");
+STATISTIC(NumGVNDeadStores, "Number of redundant/dead stores eliminated");
+DEBUG_COUNTER(VNCounter, "newgvn-vn",
+ "Controls which instructions are value numbered")
+
+// Currently store defining access refinement is too slow due to basicaa being
+// egregiously slow. This flag lets us keep it working while we work on this
+// issue.
+static cl::opt<bool> EnableStoreRefinement("enable-store-refinement",
+ cl::init(false), cl::Hidden);
//===----------------------------------------------------------------------===//
// GVN Pass
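DebugCounter counters like the one added above are typically consulted through DebugCounter::shouldExecute (llvm/Support/DebugCounter.h); the exact guard site inside NewGVN is not visible in this hunk, so treat the helper below as a sketch.

#include "llvm/Support/DebugCounter.h"
using namespace llvm;

DEBUG_COUNTER(VNCounter, "newgvn-vn",
              "Controls which instructions are value numbered")

// With -debug-counter=newgvn-vn-skip=N,newgvn-vn-count=M only instructions
// N+1..N+M are value numbered, letting a miscompile be bisected down to a
// single instruction.
static bool shouldValueNumberInstruction() {
  return DebugCounter::shouldExecute(VNCounter);
}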
@@ -105,6 +133,77 @@ PHIExpression::~PHIExpression() = default;
}
}
+// Tarjan's SCC finding algorithm with Nuutila's improvements
+// SCCIterator is actually fairly complex for the simple thing we want.
+// It also wants to hand us SCC's that are unrelated to the phi node we ask
+// about, and have us process them there or risk redoing work.
+// Graph traits over a filter iterator also doesn't work that well here.
+// This SCC finder is specialized to walk use-def chains, and only follows
+// instructions, not generic values (arguments, etc).
+struct TarjanSCC {
+
+ TarjanSCC() : Components(1) {}
+
+ void Start(const Instruction *Start) {
+ if (Root.lookup(Start) == 0)
+ FindSCC(Start);
+ }
+
+ const SmallPtrSetImpl<const Value *> &getComponentFor(const Value *V) const {
+ unsigned ComponentID = ValueToComponent.lookup(V);
+
+ assert(ComponentID > 0 &&
+ "Asking for a component for a value we never processed");
+ return Components[ComponentID];
+ }
+
+private:
+ void FindSCC(const Instruction *I) {
+ Root[I] = ++DFSNum;
+ // Store the DFS Number we had before it possibly gets incremented.
+ unsigned int OurDFS = DFSNum;
+ for (auto &Op : I->operands()) {
+ if (auto *InstOp = dyn_cast<Instruction>(Op)) {
+ if (Root.lookup(Op) == 0)
+ FindSCC(InstOp);
+ if (!InComponent.count(Op))
+ Root[I] = std::min(Root.lookup(I), Root.lookup(Op));
+ }
+ }
+    // See if we really were the root of a component, by seeing if we still
+    // have our DFSNumber. If we do, we are the root of the component, and we
+    // have completed a component. If we do not, we are not the root of a
+    // component, and belong on the component stack.
+ if (Root.lookup(I) == OurDFS) {
+ unsigned ComponentID = Components.size();
+ Components.resize(Components.size() + 1);
+ auto &Component = Components.back();
+ Component.insert(I);
+ DEBUG(dbgs() << "Component root is " << *I << "\n");
+ InComponent.insert(I);
+ ValueToComponent[I] = ComponentID;
+ // Pop a component off the stack and label it.
+ while (!Stack.empty() && Root.lookup(Stack.back()) >= OurDFS) {
+ auto *Member = Stack.back();
+ DEBUG(dbgs() << "Component member is " << *Member << "\n");
+ Component.insert(Member);
+ InComponent.insert(Member);
+ ValueToComponent[Member] = ComponentID;
+ Stack.pop_back();
+ }
+ } else {
+ // Part of a component, push to stack
+ Stack.push_back(I);
+ }
+ }
+ unsigned int DFSNum = 1;
+ SmallPtrSet<const Value *, 8> InComponent;
+ DenseMap<const Value *, unsigned int> Root;
+ SmallVector<const Value *, 8> Stack;
+  // Store the components as a vector of ptr sets, because we need the topo
+  // order of SCCs, but not individual member order.
+ SmallVector<SmallPtrSet<const Value *, 8>, 8> Components;
+ DenseMap<const Value *, unsigned> ValueToComponent;
+};
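A brief usage sketch for the finder above; isPartOfCycle is a hypothetical caller, not part of this patch.

static bool isPartOfCycle(const llvm::PHINode *PN) {
  TarjanSCC SCCFinder;
  SCCFinder.Start(PN); // runs FindSCC from this phi if not yet processed
  // Components come back in topological order; a component with more than
  // one member means the phi sits on a use-def cycle.
  return SCCFinder.getComponentFor(PN).size() > 1;
}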
// Congruence classes represent the set of expressions/instructions
// that are all the same *during some scope in the function*.
// That is, because of the way we perform equality propagation, and
@@ -115,43 +214,152 @@ PHIExpression::~PHIExpression() = default;
// For any Value in the Member set, it is valid to replace any dominated member
// with that Value.
//
-// Every congruence class has a leader, and the leader is used to
-// symbolize instructions in a canonical way (IE every operand of an
-// instruction that is a member of the same congruence class will
-// always be replaced with leader during symbolization).
-// To simplify symbolization, we keep the leader as a constant if class can be
-// proved to be a constant value.
-// Otherwise, the leader is a randomly chosen member of the value set, it does
-// not matter which one is chosen.
-// Each congruence class also has a defining expression,
-// though the expression may be null. If it exists, it can be used for forward
-// propagation and reassociation of values.
-//
-struct CongruenceClass {
- using MemberSet = SmallPtrSet<Value *, 4>;
+// Every congruence class has a leader, and the leader is used to symbolize
+// instructions in a canonical way (IE every operand of an instruction that is a
+// member of the same congruence class will always be replaced with leader
+// during symbolization). To simplify symbolization, we keep the leader as a
+// constant if class can be proved to be a constant value. Otherwise, the
+// leader is the member of the value set with the smallest DFS number. Each
+// congruence class also has a defining expression, though the expression may be
+// null. If it exists, it can be used for forward propagation and reassociation
+// of values.
+
+// For memory, we also track a representative MemoryAccess, and a set of memory
+// members for MemoryPhis (which have no real instructions). Note that for
+// memory, it seems tempting to try to split the memory members into a
+// MemoryCongruenceClass or something. Unfortunately, this does not work
+// easily. The value numbering of a given memory expression depends on the
+// leader of the memory congruence class, and the leader of memory congruence
+// class depends on the value numbering of a given memory expression. This
+// leads to wasted propagation, and in some cases, missed optimization. For
+// example: If we had value numbered two stores together before, but now do not,
+// we move them to a new value congruence class. This in turn will move at
+// least one of the memorydefs to a new memory congruence class. This affects
+// the value numbering of the stores we just value numbered (because the memory
+// congruence class is part of the value number). So while theoretically
+// possible to split them up, it turns out to be *incredibly* complicated to get
+// it to work right, because of the interdependency. While structurally
+// slightly messier, it is algorithmically much simpler and faster to do what we
+// do here, and track them both at once in the same class.
+// Note: The default iterators for this class iterate over values
+class CongruenceClass {
+public:
+ using MemberType = Value;
+ using MemberSet = SmallPtrSet<MemberType *, 4>;
+ using MemoryMemberType = MemoryPhi;
+ using MemoryMemberSet = SmallPtrSet<const MemoryMemberType *, 2>;
+
+ explicit CongruenceClass(unsigned ID) : ID(ID) {}
+ CongruenceClass(unsigned ID, Value *Leader, const Expression *E)
+ : ID(ID), RepLeader(Leader), DefiningExpr(E) {}
+ unsigned getID() const { return ID; }
+ // True if this class has no members left. This is mainly used for assertion
+ // purposes, and for skipping empty classes.
+ bool isDead() const {
+ // If it's both dead from a value perspective, and dead from a memory
+ // perspective, it's really dead.
+ return empty() && memory_empty();
+ }
+ // Leader functions
+ Value *getLeader() const { return RepLeader; }
+ void setLeader(Value *Leader) { RepLeader = Leader; }
+ const std::pair<Value *, unsigned int> &getNextLeader() const {
+ return NextLeader;
+ }
+  void resetNextLeader() { NextLeader = {nullptr, ~0U}; }
+
+ void addPossibleNextLeader(std::pair<Value *, unsigned int> LeaderPair) {
+ if (LeaderPair.second < NextLeader.second)
+ NextLeader = LeaderPair;
+ }
+
+ Value *getStoredValue() const { return RepStoredValue; }
+ void setStoredValue(Value *Leader) { RepStoredValue = Leader; }
+ const MemoryAccess *getMemoryLeader() const { return RepMemoryAccess; }
+ void setMemoryLeader(const MemoryAccess *Leader) { RepMemoryAccess = Leader; }
+
+ // Forward propagation info
+ const Expression *getDefiningExpr() const { return DefiningExpr; }
+ void setDefiningExpr(const Expression *E) { DefiningExpr = E; }
+
+ // Value member set
+ bool empty() const { return Members.empty(); }
+ unsigned size() const { return Members.size(); }
+ MemberSet::const_iterator begin() const { return Members.begin(); }
+ MemberSet::const_iterator end() const { return Members.end(); }
+ void insert(MemberType *M) { Members.insert(M); }
+ void erase(MemberType *M) { Members.erase(M); }
+ void swap(MemberSet &Other) { Members.swap(Other); }
+
+ // Memory member set
+ bool memory_empty() const { return MemoryMembers.empty(); }
+ unsigned memory_size() const { return MemoryMembers.size(); }
+ MemoryMemberSet::const_iterator memory_begin() const {
+ return MemoryMembers.begin();
+ }
+ MemoryMemberSet::const_iterator memory_end() const {
+ return MemoryMembers.end();
+ }
+ iterator_range<MemoryMemberSet::const_iterator> memory() const {
+ return make_range(memory_begin(), memory_end());
+ }
+ void memory_insert(const MemoryMemberType *M) { MemoryMembers.insert(M); }
+ void memory_erase(const MemoryMemberType *M) { MemoryMembers.erase(M); }
+
+ // Store count
+ unsigned getStoreCount() const { return StoreCount; }
+ void incStoreCount() { ++StoreCount; }
+ void decStoreCount() {
+ assert(StoreCount != 0 && "Store count went negative");
+ --StoreCount;
+ }
+
+  // Return true if two congruence classes are equivalent to each other. This
+  // means that every field but the ID number and the dead field is equivalent.
+ bool isEquivalentTo(const CongruenceClass *Other) const {
+ if (!Other)
+ return false;
+ if (this == Other)
+ return true;
+
+ if (std::tie(StoreCount, RepLeader, RepStoredValue, RepMemoryAccess) !=
+ std::tie(Other->StoreCount, Other->RepLeader, Other->RepStoredValue,
+ Other->RepMemoryAccess))
+ return false;
+ if (DefiningExpr != Other->DefiningExpr)
+ if (!DefiningExpr || !Other->DefiningExpr ||
+ *DefiningExpr != *Other->DefiningExpr)
+ return false;
+    // Compare the member sets through ordered copies, since pointer-set
+    // iteration order is not deterministic.
+    std::set<Value *> AMembers(Members.begin(), Members.end());
+    std::set<Value *> BMembers(Other->Members.begin(), Other->Members.end());
+ return AMembers == BMembers;
+ }
+
+private:
unsigned ID;
// Representative leader.
Value *RepLeader = nullptr;
+ // The most dominating leader after our current leader, because the member set
+ // is not sorted and is expensive to keep sorted all the time.
+ std::pair<Value *, unsigned int> NextLeader = {nullptr, ~0U};
+ // If this is represented by a store, the value of the store.
+ Value *RepStoredValue = nullptr;
+ // If this class contains MemoryDefs or MemoryPhis, this is the leading memory
+ // access.
+ const MemoryAccess *RepMemoryAccess = nullptr;
// Defining Expression.
const Expression *DefiningExpr = nullptr;
// Actual members of this class.
MemberSet Members;
-
- // True if this class has no members left. This is mainly used for assertion
- // purposes, and for skipping empty classes.
- bool Dead = false;
-
+ // This is the set of MemoryPhis that exist in the class. MemoryDefs and
+ // MemoryUses have real instructions representing them, so we only need to
+ // track MemoryPhis here.
+ MemoryMemberSet MemoryMembers;
// Number of stores in this congruence class.
// This is used so we can detect store equivalence changes properly.
int StoreCount = 0;
-
- // The most dominating leader after our current leader, because the member set
- // is not sorted and is expensive to keep sorted all the time.
- std::pair<Value *, unsigned int> NextLeader = {nullptr, ~0U};
-
- explicit CongruenceClass(unsigned ID) : ID(ID) {}
- CongruenceClass(unsigned ID, Value *Leader, const Expression *E)
- : ID(ID), RepLeader(Leader), DefiningExpr(E) {}
};
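To make the NextLeader bookkeeping above concrete, here is a hypothetical driver (A, B, and their DFS numbers are made up):

void trackReplacementLeader(CongruenceClass &CC, llvm::Value *A,
                            llvm::Value *B) {
  CC.resetNextLeader();             // {nullptr, ~0U}: no candidate yet
  CC.addPossibleNextLeader({A, 7}); // first candidate always wins
  CC.addPossibleNextLeader({B, 3}); // kept, since 3 < 7
  // getNextLeader().first is now B, the most dominating remaining member,
  // found without keeping the unsorted member set ordered.
}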
namespace llvm {
@@ -180,19 +388,34 @@ template <> struct DenseMapInfo<const Expression *> {
};
} // end namespace llvm
-class NewGVN : public FunctionPass {
+namespace {
+class NewGVN {
+ Function &F;
DominatorTree *DT;
- const DataLayout *DL;
- const TargetLibraryInfo *TLI;
AssumptionCache *AC;
+ const TargetLibraryInfo *TLI;
AliasAnalysis *AA;
MemorySSA *MSSA;
MemorySSAWalker *MSSAWalker;
+ const DataLayout &DL;
+ std::unique_ptr<PredicateInfo> PredInfo;
BumpPtrAllocator ExpressionAllocator;
ArrayRecycler<Value *> ArgRecycler;
+ TarjanSCC SCCFinder;
+
+ // Number of function arguments, used by ranking
+ unsigned int NumFuncArgs;
+
+ // RPOOrdering of basic blocks
+ DenseMap<const DomTreeNode *, unsigned> RPOOrdering;
// Congruence class info.
- CongruenceClass *InitialClass;
+
+ // This class is called INITIAL in the paper. It is the class everything
+  // starts out in, and represents any value. Being an optimistic analysis,
+ // anything in the TOP class has the value TOP, which is indeterminate and
+ // equivalent to everything.
+ CongruenceClass *TOPClass;
std::vector<CongruenceClass *> CongruenceClasses;
unsigned NextCongruenceNum;
@@ -200,13 +423,38 @@ class NewGVN : public FunctionPass {
DenseMap<Value *, CongruenceClass *> ValueToClass;
DenseMap<Value *, const Expression *> ValueToExpression;
+ // Mapping from predicate info we used to the instructions we used it with.
+ // In order to correctly ensure propagation, we must keep track of what
+ // comparisons we used, so that when the values of the comparisons change, we
+ // propagate the information to the places we used the comparison.
+ DenseMap<const Value *, SmallPtrSet<Instruction *, 2>> PredicateToUsers;
+  // Mapping from a MemoryAccess we used to the MemoryAccesses we used it
+  // with. It has the same rationale as PredicateToUsers. When we skip
+  // MemoryAccesses for stores, we can no longer rely solely on the def-use
+  // chains of MemorySSA.
+ DenseMap<const MemoryAccess *, SmallPtrSet<MemoryAccess *, 2>> MemoryToUsers;
+
// A table storing which memorydefs/phis represent a memory state provably
// equivalent to another memory state.
// We could use the congruence class machinery, but the MemoryAccess's are
// abstract memory states, so they can only ever be equivalent to each other,
// and not to constants, etc.
- DenseMap<const MemoryAccess *, MemoryAccess *> MemoryAccessEquiv;
-
+ DenseMap<const MemoryAccess *, CongruenceClass *> MemoryAccessToClass;
+
+ // We could, if we wanted, build MemoryPhiExpressions and
+ // MemoryVariableExpressions, etc, and value number them the same way we value
+ // number phi expressions. For the moment, this seems like overkill. They
+ // can only exist in one of three states: they can be TOP (equal to
+ // everything), Equivalent to something else, or unique. Because we do not
+ // create expressions for them, we need to simulate leader change not just
+ // when they change class, but when they change state. Note: We can do the
+  // same thing for phis, and avoid having phi expressions if we wanted. We
+  // should eventually unify in one direction or the other, so this is a little
+  // bit of an experiment to see which turns out easier to maintain.
+ enum MemoryPhiState { MPS_Invalid, MPS_TOP, MPS_Equivalent, MPS_Unique };
+ DenseMap<const MemoryPhi *, MemoryPhiState> MemoryPhiState;
+
+ enum PhiCycleState { PCS_Unknown, PCS_CycleFree, PCS_Cycle };
+ DenseMap<const PHINode *, PhiCycleState> PhiCycleState;
// Expression to class mapping.
using ExpressionClassMap = DenseMap<const Expression *, CongruenceClass *>;
ExpressionClassMap ExpressionToClass;
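The two reverse maps above follow the same pattern; here is a hedged sketch of how such a map is consulted when a predicate's value changes (names are illustrative, not the pass's API):

#include "llvm/ADT/DenseMap.h"
#include "llvm/ADT/SmallPtrSet.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/IR/Instruction.h"
using namespace llvm;

static DenseMap<const Value *, SmallPtrSet<Instruction *, 2>> PredicateToUsers;

// Once the comparison itself gets a new value number, every instruction that
// was simplified using it must be re-queued for value numbering.
static void touchPredicateUsers(const Value *Pred,
                                SmallVectorImpl<Instruction *> &Worklist) {
  for (Instruction *U : PredicateToUsers.lookup(Pred))
    Worklist.push_back(U);
}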
@@ -231,8 +479,6 @@ class NewGVN : public FunctionPass {
BitVector TouchedInstructions;
DenseMap<const BasicBlock *, std::pair<unsigned, unsigned>> BlockInstRange;
- DenseMap<const DomTreeNode *, std::pair<unsigned, unsigned>>
- DominatedInstRange;
#ifndef NDEBUG
// Debugging for how many times each block and instruction got processed.
@@ -240,56 +486,42 @@ class NewGVN : public FunctionPass {
#endif
// DFS info.
- DenseMap<const BasicBlock *, std::pair<int, int>> DFSDomMap;
+ // This contains a mapping from Instructions to DFS numbers.
+ // The numbering starts at 1. An instruction with DFS number zero
+ // means that the instruction is dead.
DenseMap<const Value *, unsigned> InstrDFS;
+
+ // This contains the mapping DFS numbers to instructions.
SmallVector<Value *, 32> DFSToInstr;
// Deletion info.
SmallPtrSet<Instruction *, 8> InstructionsToErase;
public:
- static char ID; // Pass identification, replacement for typeid.
- NewGVN() : FunctionPass(ID) {
- initializeNewGVNPass(*PassRegistry::getPassRegistry());
- }
-
- bool runOnFunction(Function &F) override;
- bool runGVN(Function &F, DominatorTree *DT, AssumptionCache *AC,
- TargetLibraryInfo *TLI, AliasAnalysis *AA, MemorySSA *MSSA);
+ NewGVN(Function &F, DominatorTree *DT, AssumptionCache *AC,
+ TargetLibraryInfo *TLI, AliasAnalysis *AA, MemorySSA *MSSA,
+ const DataLayout &DL)
+ : F(F), DT(DT), AC(AC), TLI(TLI), AA(AA), MSSA(MSSA), DL(DL),
+ PredInfo(make_unique<PredicateInfo>(F, *DT, *AC)) {}
+ bool runGVN();
private:
- // This transformation requires dominator postdominator info.
- void getAnalysisUsage(AnalysisUsage &AU) const override {
- AU.addRequired<AssumptionCacheTracker>();
- AU.addRequired<DominatorTreeWrapperPass>();
- AU.addRequired<TargetLibraryInfoWrapperPass>();
- AU.addRequired<MemorySSAWrapperPass>();
- AU.addRequired<AAResultsWrapperPass>();
-
- AU.addPreserved<DominatorTreeWrapperPass>();
- AU.addPreserved<GlobalsAAWrapperPass>();
- }
-
// Expression handling.
- const Expression *createExpression(Instruction *, const BasicBlock *);
- const Expression *createBinaryExpression(unsigned, Type *, Value *, Value *,
- const BasicBlock *);
- PHIExpression *createPHIExpression(Instruction *);
+ const Expression *createExpression(Instruction *);
+ const Expression *createBinaryExpression(unsigned, Type *, Value *, Value *);
+ PHIExpression *createPHIExpression(Instruction *, bool &HasBackedge,
+ bool &AllConstant);
const VariableExpression *createVariableExpression(Value *);
const ConstantExpression *createConstantExpression(Constant *);
- const Expression *createVariableOrConstant(Value *V, const BasicBlock *B);
+ const Expression *createVariableOrConstant(Value *V);
const UnknownExpression *createUnknownExpression(Instruction *);
- const StoreExpression *createStoreExpression(StoreInst *, MemoryAccess *,
- const BasicBlock *);
+ const StoreExpression *createStoreExpression(StoreInst *,
+ const MemoryAccess *);
LoadExpression *createLoadExpression(Type *, Value *, LoadInst *,
- MemoryAccess *, const BasicBlock *);
-
- const CallExpression *createCallExpression(CallInst *, MemoryAccess *,
- const BasicBlock *);
- const AggregateValueExpression *
- createAggregateValueExpression(Instruction *, const BasicBlock *);
- bool setBasicExpressionInfo(Instruction *, BasicExpression *,
- const BasicBlock *);
+ const MemoryAccess *);
+ const CallExpression *createCallExpression(CallInst *, const MemoryAccess *);
+ const AggregateValueExpression *createAggregateValueExpression(Instruction *);
+ bool setBasicExpressionInfo(Instruction *, BasicExpression *);
// Congruence class handling.
CongruenceClass *createCongruenceClass(Value *Leader, const Expression *E) {
@@ -298,9 +530,21 @@ private:
return result;
}
+ CongruenceClass *createMemoryClass(MemoryAccess *MA) {
+ auto *CC = createCongruenceClass(nullptr, nullptr);
+ CC->setMemoryLeader(MA);
+ return CC;
+ }
+ CongruenceClass *ensureLeaderOfMemoryClass(MemoryAccess *MA) {
+ auto *CC = getMemoryClass(MA);
+ if (CC->getMemoryLeader() != MA)
+ CC = createMemoryClass(MA);
+ return CC;
+ }
+
CongruenceClass *createSingletonCongruenceClass(Value *Member) {
CongruenceClass *CClass = createCongruenceClass(Member, nullptr);
- CClass->Members.insert(Member);
+ CClass->insert(Member);
ValueToClass[Member] = CClass;
return CClass;
}
@@ -313,37 +557,49 @@ private:
// Symbolic evaluation.
const Expression *checkSimplificationResults(Expression *, Instruction *,
Value *);
- const Expression *performSymbolicEvaluation(Value *, const BasicBlock *);
- const Expression *performSymbolicLoadEvaluation(Instruction *,
- const BasicBlock *);
- const Expression *performSymbolicStoreEvaluation(Instruction *,
- const BasicBlock *);
- const Expression *performSymbolicCallEvaluation(Instruction *,
- const BasicBlock *);
- const Expression *performSymbolicPHIEvaluation(Instruction *,
- const BasicBlock *);
- bool setMemoryAccessEquivTo(MemoryAccess *From, MemoryAccess *To);
- const Expression *performSymbolicAggrValueEvaluation(Instruction *,
- const BasicBlock *);
+ const Expression *performSymbolicEvaluation(Value *);
+ const Expression *performSymbolicLoadCoercion(Type *, Value *, LoadInst *,
+ Instruction *, MemoryAccess *);
+ const Expression *performSymbolicLoadEvaluation(Instruction *);
+ const Expression *performSymbolicStoreEvaluation(Instruction *);
+ const Expression *performSymbolicCallEvaluation(Instruction *);
+ const Expression *performSymbolicPHIEvaluation(Instruction *);
+ const Expression *performSymbolicAggrValueEvaluation(Instruction *);
+ const Expression *performSymbolicCmpEvaluation(Instruction *);
+ const Expression *performSymbolicPredicateInfoEvaluation(Instruction *);
// Congruence finding.
- // Templated to allow them to work both on BB's and BB-edges.
- template <class T>
- Value *lookupOperandLeader(Value *, const User *, const T &) const;
+ bool someEquivalentDominates(const Instruction *, const Instruction *) const;
+ Value *lookupOperandLeader(Value *) const;
void performCongruenceFinding(Instruction *, const Expression *);
- void moveValueToNewCongruenceClass(Instruction *, CongruenceClass *,
- CongruenceClass *);
+ void moveValueToNewCongruenceClass(Instruction *, const Expression *,
+ CongruenceClass *, CongruenceClass *);
+ void moveMemoryToNewCongruenceClass(Instruction *, MemoryAccess *,
+ CongruenceClass *, CongruenceClass *);
+ Value *getNextValueLeader(CongruenceClass *) const;
+ const MemoryAccess *getNextMemoryLeader(CongruenceClass *) const;
+ bool setMemoryClass(const MemoryAccess *From, CongruenceClass *To);
+ CongruenceClass *getMemoryClass(const MemoryAccess *MA) const;
+ const MemoryAccess *lookupMemoryLeader(const MemoryAccess *) const;
+ bool isMemoryAccessTop(const MemoryAccess *) const;
+
+ // Ranking
+ unsigned int getRank(const Value *) const;
+ bool shouldSwapOperands(const Value *, const Value *) const;
+
// Reachability handling.
void updateReachableEdge(BasicBlock *, BasicBlock *);
void processOutgoingEdges(TerminatorInst *, BasicBlock *);
- bool isOnlyReachableViaThisEdge(const BasicBlockEdge &) const;
- Value *findConditionEquivalence(Value *, BasicBlock *) const;
- MemoryAccess *lookupMemoryAccessEquiv(MemoryAccess *) const;
+ Value *findConditionEquivalence(Value *) const;
// Elimination.
struct ValueDFS;
- void convertDenseToDFSOrdered(CongruenceClass::MemberSet &,
- SmallVectorImpl<ValueDFS> &);
+ void convertClassToDFSOrdered(const CongruenceClass &,
+ SmallVectorImpl<ValueDFS> &,
+ DenseMap<const Value *, unsigned int> &,
+ SmallPtrSetImpl<Instruction *> &) const;
+ void convertClassToLoadsAndStores(const CongruenceClass &,
+ SmallVectorImpl<ValueDFS> &) const;
bool eliminateInstructions(Function &);
void replaceInstruction(Instruction *, Value *);
@@ -355,35 +611,58 @@ private:
// Various instruction touch utilities
void markUsersTouched(Value *);
- void markMemoryUsersTouched(MemoryAccess *);
- void markLeaderChangeTouched(CongruenceClass *CC);
+ void markMemoryUsersTouched(const MemoryAccess *);
+ void markMemoryDefTouched(const MemoryAccess *);
+ void markPredicateUsersTouched(Instruction *);
+ void markValueLeaderChangeTouched(CongruenceClass *CC);
+ void markMemoryLeaderChangeTouched(CongruenceClass *CC);
+ void addPredicateUsers(const PredicateBase *, Instruction *);
+ void addMemoryUsers(const MemoryAccess *To, MemoryAccess *U);
+
+ // Main loop of value numbering
+ void iterateTouchedInstructions();
// Utilities.
void cleanupTables();
std::pair<unsigned, unsigned> assignDFSNumbers(BasicBlock *, unsigned);
void updateProcessedCount(Value *V);
void verifyMemoryCongruency() const;
+ void verifyIterationSettled(Function &F);
bool singleReachablePHIPath(const MemoryAccess *, const MemoryAccess *) const;
-};
-
-char NewGVN::ID = 0;
+ BasicBlock *getBlockForValue(Value *V) const;
+ void deleteExpression(const Expression *E);
+ unsigned InstrToDFSNum(const Value *V) const {
+ assert(isa<Instruction>(V) && "This should not be used for MemoryAccesses");
+ return InstrDFS.lookup(V);
+ }
-// createGVNPass - The public interface to this file.
-FunctionPass *llvm::createNewGVNPass() { return new NewGVN(); }
+ unsigned InstrToDFSNum(const MemoryAccess *MA) const {
+ return MemoryToDFSNum(MA);
+ }
+ Value *InstrFromDFSNum(unsigned DFSNum) { return DFSToInstr[DFSNum]; }
+ // Given a MemoryAccess, return the relevant instruction DFS number. Note:
+ // This deliberately takes a value so it can be used with Use's, which will
+ // auto-convert to Value's but not to MemoryAccess's.
+ unsigned MemoryToDFSNum(const Value *MA) const {
+ assert(isa<MemoryAccess>(MA) &&
+ "This should not be used with instructions");
+ return isa<MemoryUseOrDef>(MA)
+ ? InstrToDFSNum(cast<MemoryUseOrDef>(MA)->getMemoryInst())
+ : InstrDFS.lookup(MA);
+ }
+ bool isCycleFree(const PHINode *PN);
+ template <class T, class Range> T *getMinDFSOfRange(const Range &) const;
+ // Debug counter info. When verifying, we have to reset the value numbering
+ // debug counter to the same state it started in to get the same results.
+ std::pair<int, int> StartingVNCounter;
+};
+} // end anonymous namespace
template <typename T>
static bool equalsLoadStoreHelper(const T &LHS, const Expression &RHS) {
- if ((!isa<LoadExpression>(RHS) && !isa<StoreExpression>(RHS)) ||
- !LHS.BasicExpression::equals(RHS)) {
+ if (!isa<LoadExpression>(RHS) && !isa<StoreExpression>(RHS))
return false;
- } else if (const auto *L = dyn_cast<LoadExpression>(&RHS)) {
- if (LHS.getDefiningAccess() != L->getDefiningAccess())
- return false;
- } else if (const auto *S = dyn_cast<StoreExpression>(&RHS)) {
- if (LHS.getDefiningAccess() != S->getDefiningAccess())
- return false;
- }
- return true;
+ return LHS.MemoryExpression::equals(RHS);
}
bool LoadExpression::equals(const Expression &Other) const {
@@ -391,7 +670,13 @@ bool LoadExpression::equals(const Expression &Other) const {
}
bool StoreExpression::equals(const Expression &Other) const {
- return equalsLoadStoreHelper(*this, Other);
+ if (!equalsLoadStoreHelper(*this, Other))
+ return false;
+ // Make sure that store vs store includes the value operand.
+ if (const auto *S = dyn_cast<StoreExpression>(&Other))
+ if (getStoredValue() != S->getStoredValue())
+ return false;
+ return true;
}
#ifndef NDEBUG
@@ -400,16 +685,28 @@ static std::string getBlockName(const BasicBlock *B) {
}
#endif
-INITIALIZE_PASS_BEGIN(NewGVN, "newgvn", "Global Value Numbering", false, false)
-INITIALIZE_PASS_DEPENDENCY(AssumptionCacheTracker)
-INITIALIZE_PASS_DEPENDENCY(MemorySSAWrapperPass)
-INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass)
-INITIALIZE_PASS_DEPENDENCY(TargetLibraryInfoWrapperPass)
-INITIALIZE_PASS_DEPENDENCY(AAResultsWrapperPass)
-INITIALIZE_PASS_DEPENDENCY(GlobalsAAWrapperPass)
-INITIALIZE_PASS_END(NewGVN, "newgvn", "Global Value Numbering", false, false)
+// Get the basic block from an instruction/memory value.
+BasicBlock *NewGVN::getBlockForValue(Value *V) const {
+ if (auto *I = dyn_cast<Instruction>(V))
+ return I->getParent();
+ else if (auto *MP = dyn_cast<MemoryPhi>(V))
+ return MP->getBlock();
+ llvm_unreachable("Should have been able to figure out a block for our value");
+ return nullptr;
+}
-PHIExpression *NewGVN::createPHIExpression(Instruction *I) {
+// Delete a definitely dead expression, so it can be reused by the expression
+// allocator. Some of these are not in creation functions, so we have to accept
+// const versions.
+void NewGVN::deleteExpression(const Expression *E) {
+ assert(isa<BasicExpression>(E));
+ auto *BE = cast<BasicExpression>(E);
+ const_cast<BasicExpression *>(BE)->deallocateOperands(ArgRecycler);
+ ExpressionAllocator.Deallocate(E);
+}
+
+PHIExpression *NewGVN::createPHIExpression(Instruction *I, bool &HasBackedge,
+ bool &AllConstant) {
BasicBlock *PHIBlock = I->getParent();
auto *PN = cast<PHINode>(I);
auto *E =
@@ -419,28 +716,32 @@ PHIExpression *NewGVN::createPHIExpression(Instruction *I) {
E->setType(I->getType());
E->setOpcode(I->getOpcode());
- auto ReachablePhiArg = [&](const Use &U) {
- return ReachableBlocks.count(PN->getIncomingBlock(U));
- };
+ unsigned PHIRPO = RPOOrdering.lookup(DT->getNode(PHIBlock));
- // Filter out unreachable operands
- auto Filtered = make_filter_range(PN->operands(), ReachablePhiArg);
+ // Filter out unreachable phi operands.
+ auto Filtered = make_filter_range(PN->operands(), [&](const Use &U) {
+ return ReachableEdges.count({PN->getIncomingBlock(U), PHIBlock});
+ });
std::transform(Filtered.begin(), Filtered.end(), op_inserter(E),
[&](const Use &U) -> Value * {
+ auto *BB = PN->getIncomingBlock(U);
+ auto *DTN = DT->getNode(BB);
+ if (RPOOrdering.lookup(DTN) >= PHIRPO)
+ HasBackedge = true;
+ AllConstant &= isa<UndefValue>(U) || isa<Constant>(U);
+
// Don't try to transform self-defined phis.
if (U == PN)
return PN;
- const BasicBlockEdge BBE(PN->getIncomingBlock(U), PHIBlock);
- return lookupOperandLeader(U, I, BBE);
+ return lookupOperandLeader(U);
});
return E;
}
// Set basic expression info (Arguments, type, opcode) for Expression
// E from Instruction I in block B.
-bool NewGVN::setBasicExpressionInfo(Instruction *I, BasicExpression *E,
- const BasicBlock *B) {
+bool NewGVN::setBasicExpressionInfo(Instruction *I, BasicExpression *E) {
bool AllConstant = true;
if (auto *GEP = dyn_cast<GetElementPtrInst>(I))
E->setType(GEP->getSourceElementType());
@@ -452,7 +753,7 @@ bool NewGVN::setBasicExpressionInfo(Instruction *I, BasicExpression *E,
// Transform the operand array into an operand leader array, and keep track of
// whether all members are constant.
std::transform(I->op_begin(), I->op_end(), op_inserter(E), [&](Value *O) {
- auto Operand = lookupOperandLeader(O, I, B);
+ auto Operand = lookupOperandLeader(O);
AllConstant &= isa<Constant>(Operand);
return Operand;
});
@@ -461,8 +762,7 @@ bool NewGVN::setBasicExpressionInfo(Instruction *I, BasicExpression *E,
}
const Expression *NewGVN::createBinaryExpression(unsigned Opcode, Type *T,
- Value *Arg1, Value *Arg2,
- const BasicBlock *B) {
+ Value *Arg1, Value *Arg2) {
auto *E = new (ExpressionAllocator) BasicExpression(2);
E->setType(T);
@@ -473,13 +773,13 @@ const Expression *NewGVN::createBinaryExpression(unsigned Opcode, Type *T,
// of their operands get the same value number by sorting the operand value
// numbers. Since all commutative instructions have two operands it is more
// efficient to sort by hand rather than using, say, std::sort.
- if (Arg1 > Arg2)
+ if (shouldSwapOperands(Arg1, Arg2))
std::swap(Arg1, Arg2);
}
- E->op_push_back(lookupOperandLeader(Arg1, nullptr, B));
- E->op_push_back(lookupOperandLeader(Arg2, nullptr, B));
+ E->op_push_back(lookupOperandLeader(Arg1));
+ E->op_push_back(lookupOperandLeader(Arg2));
- Value *V = SimplifyBinOp(Opcode, E->getOperand(0), E->getOperand(1), *DL, TLI,
+ Value *V = SimplifyBinOp(Opcode, E->getOperand(0), E->getOperand(1), DL, TLI,
DT, AC);
if (const Expression *SimplifiedE = checkSimplificationResults(E, nullptr, V))
return SimplifiedE;
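The swap above canonicalizes commutative operand order. A minimal sketch of the idea follows, with a placeholder rank; the patch's getRank appears to be a deterministic order over constants, arguments, and DFS numbers, unlike the raw pointer comparison it replaces.

#include <cstdint>
#include <utility>

static std::uintptr_t rank(const void *V) {
  return reinterpret_cast<std::uintptr_t>(V); // placeholder ordering only
}

template <typename T> static void canonicalizeCommutative(T *&L, T *&R) {
  if (rank(L) > rank(R)) // plays the role of shouldSwapOperands above
    std::swap(L, R);
}
// After canonicalization, a+b and b+a carry identical operand lists, so
// their expressions hash and compare equal and share one congruence class.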
@@ -502,40 +802,32 @@ const Expression *NewGVN::checkSimplificationResults(Expression *E,
NumGVNOpsSimplified++;
assert(isa<BasicExpression>(E) &&
"We should always have had a basic expression here");
-
- cast<BasicExpression>(E)->deallocateOperands(ArgRecycler);
- ExpressionAllocator.Deallocate(E);
+ deleteExpression(E);
return createConstantExpression(C);
} else if (isa<Argument>(V) || isa<GlobalVariable>(V)) {
if (I)
DEBUG(dbgs() << "Simplified " << *I << " to "
<< " variable " << *V << "\n");
- cast<BasicExpression>(E)->deallocateOperands(ArgRecycler);
- ExpressionAllocator.Deallocate(E);
+ deleteExpression(E);
return createVariableExpression(V);
}
CongruenceClass *CC = ValueToClass.lookup(V);
- if (CC && CC->DefiningExpr) {
+ if (CC && CC->getDefiningExpr()) {
if (I)
DEBUG(dbgs() << "Simplified " << *I << " to "
<< " expression " << *V << "\n");
NumGVNOpsSimplified++;
- assert(isa<BasicExpression>(E) &&
- "We should always have had a basic expression here");
- cast<BasicExpression>(E)->deallocateOperands(ArgRecycler);
- ExpressionAllocator.Deallocate(E);
- return CC->DefiningExpr;
+ deleteExpression(E);
+ return CC->getDefiningExpr();
}
return nullptr;
}
-const Expression *NewGVN::createExpression(Instruction *I,
- const BasicBlock *B) {
-
+const Expression *NewGVN::createExpression(Instruction *I) {
auto *E = new (ExpressionAllocator) BasicExpression(I->getNumOperands());
- bool AllConstant = setBasicExpressionInfo(I, E, B);
+ bool AllConstant = setBasicExpressionInfo(I, E);
if (I->isCommutative()) {
// Ensure that commutative instructions that only differ by a permutation
@@ -543,7 +835,7 @@ const Expression *NewGVN::createExpression(Instruction *I,
// numbers. Since all commutative instructions have two operands it is more
// efficient to sort by hand rather than using, say, std::sort.
assert(I->getNumOperands() == 2 && "Unsupported commutative instruction!");
- if (E->getOperand(0) > E->getOperand(1))
+ if (shouldSwapOperands(E->getOperand(0), E->getOperand(1)))
E->swapOperands(0, 1);
}
@@ -559,48 +851,43 @@ const Expression *NewGVN::createExpression(Instruction *I,
// Sort the operand value numbers so x<y and y>x get the same value
// number.
CmpInst::Predicate Predicate = CI->getPredicate();
- if (E->getOperand(0) > E->getOperand(1)) {
+ if (shouldSwapOperands(E->getOperand(0), E->getOperand(1))) {
E->swapOperands(0, 1);
Predicate = CmpInst::getSwappedPredicate(Predicate);
}
E->setOpcode((CI->getOpcode() << 8) | Predicate);
// TODO: 25% of our time is spent in SimplifyCmpInst with pointer operands
- // TODO: Since we noop bitcasts, we may need to check types before
- // simplifying, so that we don't end up simplifying based on a wrong
- // type assumption. We should clean this up so we can use constants of the
- // wrong type
-
assert(I->getOperand(0)->getType() == I->getOperand(1)->getType() &&
"Wrong types on cmp instruction");
- if ((E->getOperand(0)->getType() == I->getOperand(0)->getType() &&
- E->getOperand(1)->getType() == I->getOperand(1)->getType())) {
- Value *V = SimplifyCmpInst(Predicate, E->getOperand(0), E->getOperand(1),
- *DL, TLI, DT, AC);
- if (const Expression *SimplifiedE = checkSimplificationResults(E, I, V))
- return SimplifiedE;
- }
+ assert((E->getOperand(0)->getType() == I->getOperand(0)->getType() &&
+ E->getOperand(1)->getType() == I->getOperand(1)->getType()));
+ Value *V = SimplifyCmpInst(Predicate, E->getOperand(0), E->getOperand(1),
+ DL, TLI, DT, AC);
+ if (const Expression *SimplifiedE = checkSimplificationResults(E, I, V))
+ return SimplifiedE;
} else if (isa<SelectInst>(I)) {
if (isa<Constant>(E->getOperand(0)) ||
- (E->getOperand(1)->getType() == I->getOperand(1)->getType() &&
- E->getOperand(2)->getType() == I->getOperand(2)->getType())) {
+ E->getOperand(0) == E->getOperand(1)) {
+ assert(E->getOperand(1)->getType() == I->getOperand(1)->getType() &&
+ E->getOperand(2)->getType() == I->getOperand(2)->getType());
Value *V = SimplifySelectInst(E->getOperand(0), E->getOperand(1),
- E->getOperand(2), *DL, TLI, DT, AC);
+ E->getOperand(2), DL, TLI, DT, AC);
if (const Expression *SimplifiedE = checkSimplificationResults(E, I, V))
return SimplifiedE;
}
} else if (I->isBinaryOp()) {
Value *V = SimplifyBinOp(E->getOpcode(), E->getOperand(0), E->getOperand(1),
- *DL, TLI, DT, AC);
+ DL, TLI, DT, AC);
if (const Expression *SimplifiedE = checkSimplificationResults(E, I, V))
return SimplifiedE;
} else if (auto *BI = dyn_cast<BitCastInst>(I)) {
- Value *V = SimplifyInstruction(BI, *DL, TLI, DT, AC);
+ Value *V = SimplifyInstruction(BI, DL, TLI, DT, AC);
if (const Expression *SimplifiedE = checkSimplificationResults(E, I, V))
return SimplifiedE;
} else if (isa<GetElementPtrInst>(I)) {
Value *V = SimplifyGEPInst(E->getType(),
ArrayRef<Value *>(E->op_begin(), E->op_end()),
- *DL, TLI, DT, AC);
+ DL, TLI, DT, AC);
if (const Expression *SimplifiedE = checkSimplificationResults(E, I, V))
return SimplifiedE;
} else if (AllConstant) {
@@ -615,7 +902,7 @@ const Expression *NewGVN::createExpression(Instruction *I,
for (Value *Arg : E->operands())
C.emplace_back(cast<Constant>(Arg));
- if (Value *V = ConstantFoldInstOperands(I, C, *DL, TLI))
+ if (Value *V = ConstantFoldInstOperands(I, C, DL, TLI))
if (const Expression *SimplifiedE = checkSimplificationResults(E, I, V))
return SimplifiedE;
}
@@ -623,18 +910,18 @@ const Expression *NewGVN::createExpression(Instruction *I,
}
const AggregateValueExpression *
-NewGVN::createAggregateValueExpression(Instruction *I, const BasicBlock *B) {
+NewGVN::createAggregateValueExpression(Instruction *I) {
if (auto *II = dyn_cast<InsertValueInst>(I)) {
auto *E = new (ExpressionAllocator)
AggregateValueExpression(I->getNumOperands(), II->getNumIndices());
- setBasicExpressionInfo(I, E, B);
+ setBasicExpressionInfo(I, E);
E->allocateIntOperands(ExpressionAllocator);
std::copy(II->idx_begin(), II->idx_end(), int_op_inserter(E));
return E;
} else if (auto *EI = dyn_cast<ExtractValueInst>(I)) {
auto *E = new (ExpressionAllocator)
AggregateValueExpression(I->getNumOperands(), EI->getNumIndices());
- setBasicExpressionInfo(EI, E, B);
+ setBasicExpressionInfo(EI, E);
E->allocateIntOperands(ExpressionAllocator);
std::copy(EI->idx_begin(), EI->idx_end(), int_op_inserter(E));
return E;
@@ -648,12 +935,10 @@ const VariableExpression *NewGVN::createVariableExpression(Value *V) {
return E;
}
-const Expression *NewGVN::createVariableOrConstant(Value *V,
- const BasicBlock *B) {
- auto Leader = lookupOperandLeader(V, nullptr, B);
- if (auto *C = dyn_cast<Constant>(Leader))
+const Expression *NewGVN::createVariableOrConstant(Value *V) {
+ if (auto *C = dyn_cast<Constant>(V))
return createConstantExpression(C);
- return createVariableExpression(Leader);
+ return createVariableExpression(V);
}
const ConstantExpression *NewGVN::createConstantExpression(Constant *C) {
@@ -669,40 +954,90 @@ const UnknownExpression *NewGVN::createUnknownExpression(Instruction *I) {
}
const CallExpression *NewGVN::createCallExpression(CallInst *CI,
- MemoryAccess *HV,
- const BasicBlock *B) {
+ const MemoryAccess *MA) {
// FIXME: Add operand bundles for calls.
auto *E =
- new (ExpressionAllocator) CallExpression(CI->getNumOperands(), CI, HV);
- setBasicExpressionInfo(CI, E, B);
+ new (ExpressionAllocator) CallExpression(CI->getNumOperands(), CI, MA);
+ setBasicExpressionInfo(CI, E);
return E;
}
+// Return true if some equivalent of instruction Inst dominates instruction U.
+bool NewGVN::someEquivalentDominates(const Instruction *Inst,
+ const Instruction *U) const {
+ auto *CC = ValueToClass.lookup(Inst);
+ // This must be an instruction because we are only called from phi nodes
+ // in the case that the value it needs to check against is an instruction.
+
+  // The most likely candidates for dominance are the leader and the next leader.
+  // The leader or next leader will dominate in all cases where there is an
+ // equivalent that is higher up in the dom tree.
+ // We can't *only* check them, however, because the
+ // dominator tree could have an infinite number of non-dominating siblings
+ // with instructions that are in the right congruence class.
+ // A
+ // B C D E F G
+ // |
+ // H
+ // Instruction U could be in H, with equivalents in every other sibling.
+ // Depending on the rpo order picked, the leader could be the equivalent in
+ // any of these siblings.
+ if (!CC)
+ return false;
+ if (DT->dominates(cast<Instruction>(CC->getLeader()), U))
+ return true;
+ if (CC->getNextLeader().first &&
+ DT->dominates(cast<Instruction>(CC->getNextLeader().first), U))
+ return true;
+ return llvm::any_of(*CC, [&](const Value *Member) {
+ return Member != CC->getLeader() &&
+ DT->dominates(cast<Instruction>(Member), U);
+ });
+}
+
// See if we have a congruence class and leader for this operand, and if so,
// return it. Otherwise, return the operand itself.
-template <class T>
-Value *NewGVN::lookupOperandLeader(Value *V, const User *U, const T &B) const {
+Value *NewGVN::lookupOperandLeader(Value *V) const {
CongruenceClass *CC = ValueToClass.lookup(V);
- if (CC && (CC != InitialClass))
- return CC->RepLeader;
+ if (CC) {
+    // Everything in TOP is represented by undef, as it can be any value.
+ // We do have to make sure we get the type right though, so we can't set the
+ // RepLeader to undef.
+ if (CC == TOPClass)
+ return UndefValue::get(V->getType());
+ return CC->getStoredValue() ? CC->getStoredValue() : CC->getLeader();
+ }
+
return V;
}
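// A toy restatement (std::unordered_map and an invented ToyClass type, not
// the pass's data structures) of the rule above: members of TOP act as a
// typed undef, store classes answer with their stored value, and everything
// else answers with the class leader.
#include <unordered_map>

struct ToyClass {
  int Leader = 0;
  int StoredValue = 0; // 0 means "no stored value" in this toy model.
  bool IsTop = false;
};

int lookupLeader(int V, int TypedUndef,
                 const std::unordered_map<int, const ToyClass *> &ValueToClass) {
  auto It = ValueToClass.find(V);
  if (It == ValueToClass.end())
    return V; // No class yet: the value represents itself.
  if (It->second->IsTop)
    return TypedUndef; // Everything in TOP can be any value.
  return It->second->StoredValue ? It->second->StoredValue
                                 : It->second->Leader;
}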
-MemoryAccess *NewGVN::lookupMemoryAccessEquiv(MemoryAccess *MA) const {
- MemoryAccess *Result = MemoryAccessEquiv.lookup(MA);
- return Result ? Result : MA;
+const MemoryAccess *NewGVN::lookupMemoryLeader(const MemoryAccess *MA) const {
+ auto *CC = getMemoryClass(MA);
+ assert(CC->getMemoryLeader() &&
+ "Every MemoryAccess should be mapped to a "
+ "congruence class with a represenative memory "
+ "access");
+ return CC->getMemoryLeader();
+}
+
+// Return true if the MemoryAccess is really equivalent to everything. This is
+// equivalent to the lattice value "TOP" in most lattices. This is the initial
+// state of all MemoryAccesses.
+bool NewGVN::isMemoryAccessTop(const MemoryAccess *MA) const {
+ return getMemoryClass(MA) == TOPClass;
}
LoadExpression *NewGVN::createLoadExpression(Type *LoadType, Value *PointerOp,
- LoadInst *LI, MemoryAccess *DA,
- const BasicBlock *B) {
- auto *E = new (ExpressionAllocator) LoadExpression(1, LI, DA);
+ LoadInst *LI,
+ const MemoryAccess *MA) {
+ auto *E =
+ new (ExpressionAllocator) LoadExpression(1, LI, lookupMemoryLeader(MA));
E->allocateOperands(ArgRecycler, ExpressionAllocator);
E->setType(LoadType);
// Give stores and loads the same opcode so they value number together.
E->setOpcode(0);
- E->op_push_back(lookupOperandLeader(PointerOp, LI, B));
+ E->op_push_back(PointerOp);
if (LI)
E->setAlignment(LI->getAlignment());
@@ -713,16 +1048,16 @@ LoadExpression *NewGVN::createLoadExpression(Type *LoadType, Value *PointerOp,
}
const StoreExpression *NewGVN::createStoreExpression(StoreInst *SI,
- MemoryAccess *DA,
- const BasicBlock *B) {
- auto *E =
- new (ExpressionAllocator) StoreExpression(SI->getNumOperands(), SI, DA);
+ const MemoryAccess *MA) {
+ auto *StoredValueLeader = lookupOperandLeader(SI->getValueOperand());
+ auto *E = new (ExpressionAllocator)
+ StoreExpression(SI->getNumOperands(), SI, StoredValueLeader, MA);
E->allocateOperands(ArgRecycler, ExpressionAllocator);
E->setType(SI->getValueOperand()->getType());
// Give store and loads same opcode so they value number together.
E->setOpcode(0);
- E->op_push_back(lookupOperandLeader(SI->getPointerOperand(), SI, B));
+ E->op_push_back(lookupOperandLeader(SI->getPointerOperand()));
// TODO: Value number heap versions. We may be able to discover
// things alias analysis can't on it's own (IE that a store and a
@@ -730,44 +1065,140 @@ const StoreExpression *NewGVN::createStoreExpression(StoreInst *SI,
return E;
}
-// Utility function to check whether the congruence class has a member other
-// than the given instruction.
-bool hasMemberOtherThanUs(const CongruenceClass *CC, Instruction *I) {
- // Either it has more than one store, in which case it must contain something
- // other than us (because it's indexed by value), or if it only has one store
- // right now, that member should not be us.
- return CC->StoreCount > 1 || CC->Members.count(I) == 0;
-}
-
-const Expression *NewGVN::performSymbolicStoreEvaluation(Instruction *I,
- const BasicBlock *B) {
+const Expression *NewGVN::performSymbolicStoreEvaluation(Instruction *I) {
// Unlike loads, we never try to eliminate stores, so we do not check if they
// are simple and avoid value numbering them.
auto *SI = cast<StoreInst>(I);
- MemoryAccess *StoreAccess = MSSA->getMemoryAccess(SI);
- // See if we are defined by a previous store expression, it already has a
- // value, and it's the same value as our current store. FIXME: Right now, we
- // only do this for simple stores, we should expand to cover memcpys, etc.
+ auto *StoreAccess = MSSA->getMemoryAccess(SI);
+ // Get the expression, if any, for the RHS of the MemoryDef.
+ const MemoryAccess *StoreRHS = StoreAccess->getDefiningAccess();
+ if (EnableStoreRefinement)
+ StoreRHS = MSSAWalker->getClobberingMemoryAccess(StoreAccess);
+ // If we bypassed the use-def chains, make sure we add a use.
+ if (StoreRHS != StoreAccess->getDefiningAccess())
+ addMemoryUsers(StoreRHS, StoreAccess);
+
+ StoreRHS = lookupMemoryLeader(StoreRHS);
+ // If we are defined by ourselves, use the live on entry def.
+ if (StoreRHS == StoreAccess)
+ StoreRHS = MSSA->getLiveOnEntryDef();
+
if (SI->isSimple()) {
- // Get the expression, if any, for the RHS of the MemoryDef.
- MemoryAccess *StoreRHS = lookupMemoryAccessEquiv(
- cast<MemoryDef>(StoreAccess)->getDefiningAccess());
- const Expression *OldStore = createStoreExpression(SI, StoreRHS, B);
- CongruenceClass *CC = ExpressionToClass.lookup(OldStore);
+ // See if we are defined by a previous store expression, it already has a
+ // value, and it's the same value as our current store. FIXME: Right now, we
+ // only do this for simple stores, we should expand to cover memcpys, etc.
+ const auto *LastStore = createStoreExpression(SI, StoreRHS);
+ const auto *LastCC = ExpressionToClass.lookup(LastStore);
// Basically, check if the congruence class the store is in is defined by a
// store that isn't us, and has the same value. MemorySSA takes care of
// ensuring the store has the same memory state as us already.
- if (CC && CC->DefiningExpr && isa<StoreExpression>(CC->DefiningExpr) &&
- CC->RepLeader == lookupOperandLeader(SI->getValueOperand(), SI, B) &&
- hasMemberOtherThanUs(CC, I))
- return createStoreExpression(SI, StoreRHS, B);
+ // The RepStoredValue gets nulled if all the stores disappear in a class, so
+ // we don't need to check if the class contains a store besides us.
+ if (LastCC &&
+ LastCC->getStoredValue() == lookupOperandLeader(SI->getValueOperand()))
+ return LastStore;
+ deleteExpression(LastStore);
+ // Also check if our value operand is defined by a load of the same memory
+ // location, and the memory state is the same as it was then (otherwise, it
+ // could have been overwritten later. See test32 in
+ // transforms/DeadStoreElimination/simple.ll).
+ if (auto *LI =
+ dyn_cast<LoadInst>(lookupOperandLeader(SI->getValueOperand()))) {
+ if ((lookupOperandLeader(LI->getPointerOperand()) ==
+ lookupOperandLeader(SI->getPointerOperand())) &&
+ (lookupMemoryLeader(MSSA->getMemoryAccess(LI)->getDefiningAccess()) ==
+ StoreRHS))
+ return createVariableExpression(LI);
+ }
}
- return createStoreExpression(SI, StoreAccess, B);
+ // If the store is not equivalent to anything, value number it as a store that
+ // produces a unique memory state (instead of using its MemoryUse, we use
+ // its MemoryDef).
+ return createStoreExpression(SI, StoreAccess);
}
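// A compressed sketch (invented structs, not the pass's types) of the two
// redundancy tests applied above: a store joins an existing store class when
// it writes the same value under the same incoming memory state, and a store
// is a noop when it writes back a value just loaded from the same address.
struct ToyStore { int Addr; int Value; int MemStateIn; };
struct ToyLoad  { int Addr; int Result; int MemState; };

bool storeIsRedundant(const ToyStore &SI, const ToyStore *EquivClassStore,
                      const ToyLoad *ValueDefiningLoad) {
  if (EquivClassStore && EquivClassStore->Value == SI.Value)
    return true; // Same value as the store class we would join.
  if (ValueDefiningLoad && ValueDefiningLoad->Addr == SI.Addr &&
      ValueDefiningLoad->MemState == SI.MemStateIn &&
      ValueDefiningLoad->Result == SI.Value)
    return true; // Storing back what was just loaded from there.
  return false;
}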
-const Expression *NewGVN::performSymbolicLoadEvaluation(Instruction *I,
- const BasicBlock *B) {
+// See if we can extract the value of a loaded pointer from a load, a store, or
+// a memory instruction.
+const Expression *
+NewGVN::performSymbolicLoadCoercion(Type *LoadType, Value *LoadPtr,
+ LoadInst *LI, Instruction *DepInst,
+ MemoryAccess *DefiningAccess) {
+ assert((!LI || LI->isSimple()) && "Not a simple load");
+ if (auto *DepSI = dyn_cast<StoreInst>(DepInst)) {
+ // Can't forward from non-atomic to atomic without violating memory model.
+ // Also, we don't need to coerce if they are the same type; we will just
+ // propagate.
+ if (LI->isAtomic() > DepSI->isAtomic() ||
+ LoadType == DepSI->getValueOperand()->getType())
+ return nullptr;
+ int Offset = analyzeLoadFromClobberingStore(LoadType, LoadPtr, DepSI, DL);
+ if (Offset >= 0) {
+ if (auto *C = dyn_cast<Constant>(
+ lookupOperandLeader(DepSI->getValueOperand()))) {
+ DEBUG(dbgs() << "Coercing load from store " << *DepSI << " to constant "
+ << *C << "\n");
+ return createConstantExpression(
+ getConstantStoreValueForLoad(C, Offset, LoadType, DL));
+ }
+ }
+
+ } else if (LoadInst *DepLI = dyn_cast<LoadInst>(DepInst)) {
+ // Can't forward from non-atomic to atomic without violating memory model.
+ if (LI->isAtomic() > DepLI->isAtomic())
+ return nullptr;
+ int Offset = analyzeLoadFromClobberingLoad(LoadType, LoadPtr, DepLI, DL);
+ if (Offset >= 0) {
+ // We can coerce a constant load into a load
+ if (auto *C = dyn_cast<Constant>(lookupOperandLeader(DepLI)))
+ if (auto *PossibleConstant =
+ getConstantLoadValueForLoad(C, Offset, LoadType, DL)) {
+ DEBUG(dbgs() << "Coercing load from load " << *LI << " to constant "
+ << *PossibleConstant << "\n");
+ return createConstantExpression(PossibleConstant);
+ }
+ }
+
+ } else if (MemIntrinsic *DepMI = dyn_cast<MemIntrinsic>(DepInst)) {
+ int Offset = analyzeLoadFromClobberingMemInst(LoadType, LoadPtr, DepMI, DL);
+ if (Offset >= 0) {
+ if (auto *PossibleConstant =
+ getConstantMemInstValueForLoad(DepMI, Offset, LoadType, DL)) {
+ DEBUG(dbgs() << "Coercing load from meminst " << *DepMI
+ << " to constant " << *PossibleConstant << "\n");
+ return createConstantExpression(PossibleConstant);
+ }
+ }
+ }
+
+ // All of the below are only true if the loaded pointer is produced
+ // by the dependent instruction.
+ if (LoadPtr != lookupOperandLeader(DepInst) &&
+ !AA->isMustAlias(LoadPtr, DepInst))
+ return nullptr;
+ // If this load really doesn't depend on anything, then we must be loading an
+ // undef value. This can happen when loading from a fresh allocation with no
+ // intervening stores, for example. Note that this is only true in the case
+ // that the result of the allocation is pointer equal to the load ptr.
+ if (isa<AllocaInst>(DepInst) || isMallocLikeFn(DepInst, TLI)) {
+ return createConstantExpression(UndefValue::get(LoadType));
+ }
+ // If this load occurs right after a lifetime begin,
+ // then the loaded value is undefined.
+ else if (auto *II = dyn_cast<IntrinsicInst>(DepInst)) {
+ if (II->getIntrinsicID() == Intrinsic::lifetime_start)
+ return createConstantExpression(UndefValue::get(LoadType));
+ }
+ // If this load follows a calloc (which zero initializes memory),
+ // then the loaded value is zero
+ else if (isCallocLikeFn(DepInst, TLI)) {
+ return createConstantExpression(Constant::getNullValue(LoadType));
+ }
+
+ return nullptr;
+}
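// A self-contained illustration (plain integers and a little-endian
// assumption; the real code works on LLVM constants via the analyze*/get*
// helpers) of the offset-based coercion idea: a narrow load that lands inside
// a wider store of a known constant is answered by slicing the stored bytes
// at the computed offset.
#include <cstdint>
#include <cstring>

uint32_t sliceStoredConstant(uint64_t StoredValue, unsigned ByteOffset) {
  unsigned char Bytes[sizeof StoredValue];
  std::memcpy(Bytes, &StoredValue, sizeof Bytes);
  uint32_t Loaded;
  std::memcpy(&Loaded, Bytes + ByteOffset, sizeof Loaded); // Offset <= 4 here.
  return Loaded;
}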
+
+const Expression *NewGVN::performSymbolicLoadEvaluation(Instruction *I) {
auto *LI = cast<LoadInst>(I);
// We can eliminate in favor of non-simple loads, but we won't be able to
@@ -775,7 +1206,7 @@ const Expression *NewGVN::performSymbolicLoadEvaluation(Instruction *I,
if (!LI->isSimple())
return nullptr;
- Value *LoadAddressLeader = lookupOperandLeader(LI->getPointerOperand(), I, B);
+ Value *LoadAddressLeader = lookupOperandLeader(LI->getPointerOperand());
// Load of undef is undef.
if (isa<UndefValue>(LoadAddressLeader))
return createConstantExpression(UndefValue::get(LI->getType()));
@@ -788,61 +1219,233 @@ const Expression *NewGVN::performSymbolicLoadEvaluation(Instruction *I,
// If the defining instruction is not reachable, replace with undef.
if (!ReachableBlocks.count(DefiningInst->getParent()))
return createConstantExpression(UndefValue::get(LI->getType()));
+ // This will handle stores and memory insts. We only do this if the
+ // defining access has a different type, or it is a pointer produced by
+ // certain memory operations that cause the memory to have a fixed value
+ // (IE things like calloc).
+ if (const auto *CoercionResult =
+ performSymbolicLoadCoercion(LI->getType(), LoadAddressLeader, LI,
+ DefiningInst, DefiningAccess))
+ return CoercionResult;
}
}
- const Expression *E =
- createLoadExpression(LI->getType(), LI->getPointerOperand(), LI,
- lookupMemoryAccessEquiv(DefiningAccess), B);
+ const Expression *E = createLoadExpression(LI->getType(), LoadAddressLeader,
+ LI, DefiningAccess);
return E;
}
+const Expression *
+NewGVN::performSymbolicPredicateInfoEvaluation(Instruction *I) {
+ auto *PI = PredInfo->getPredicateInfoFor(I);
+ if (!PI)
+ return nullptr;
+
+ DEBUG(dbgs() << "Found predicate info from instruction !\n");
+
+ auto *PWC = dyn_cast<PredicateWithCondition>(PI);
+ if (!PWC)
+ return nullptr;
+
+ auto *CopyOf = I->getOperand(0);
+ auto *Cond = PWC->Condition;
+
+ // If this is a copy of the condition, it must be either true or false,
+ // depending on the predicate info type and edge.
+ if (CopyOf == Cond) {
+ // We should not need to add predicate users because the predicate info is
+ // already a use of this operand.
+ if (isa<PredicateAssume>(PI))
+ return createConstantExpression(ConstantInt::getTrue(Cond->getType()));
+ if (auto *PBranch = dyn_cast<PredicateBranch>(PI)) {
+ if (PBranch->TrueEdge)
+ return createConstantExpression(ConstantInt::getTrue(Cond->getType()));
+ return createConstantExpression(ConstantInt::getFalse(Cond->getType()));
+ }
+ if (auto *PSwitch = dyn_cast<PredicateSwitch>(PI))
+ return createConstantExpression(cast<Constant>(PSwitch->CaseValue));
+ }
+
+ // Not a copy of the condition, so see what the predicates tell us about this
+ // value. First, though, we check to make sure the value is actually a copy
+ // of one of the condition operands. It's possible, in certain cases, for it
+ // to be a copy of a predicateinfo copy. In particular, if two branch
+ // operations use the same condition, and one branch dominates the other, we
+ // will end up with a copy of a copy. This is currently a small deficiency in
+ // predicateinfo. What will end up happening here is that we will value
+ // number both copies the same anyway.
+
+ // Everything below relies on the condition being a comparison.
+ auto *Cmp = dyn_cast<CmpInst>(Cond);
+ if (!Cmp)
+ return nullptr;
+
+ if (CopyOf != Cmp->getOperand(0) && CopyOf != Cmp->getOperand(1)) {
+ DEBUG(dbgs() << "Copy is not of any condition operands!");
+ return nullptr;
+ }
+ Value *FirstOp = lookupOperandLeader(Cmp->getOperand(0));
+ Value *SecondOp = lookupOperandLeader(Cmp->getOperand(1));
+ bool SwappedOps = false;
+ // Sort the ops
+ if (shouldSwapOperands(FirstOp, SecondOp)) {
+ std::swap(FirstOp, SecondOp);
+ SwappedOps = true;
+ }
+ CmpInst::Predicate Predicate =
+ SwappedOps ? Cmp->getSwappedPredicate() : Cmp->getPredicate();
+
+ if (isa<PredicateAssume>(PI)) {
+ // If the comparison is true when the operands are equal, then we know the
+ // operands are equal, because assumes must always be true.
+ if (CmpInst::isTrueWhenEqual(Predicate)) {
+ addPredicateUsers(PI, I);
+ return createVariableOrConstant(FirstOp);
+ }
+ }
+ if (const auto *PBranch = dyn_cast<PredicateBranch>(PI)) {
+ // If we are *not* a copy of the comparison, we may equal to the other
+ // operand when the predicate implies something about equality of
+ // operations. In particular, if the comparison is true/false when the
+ // operands are equal, and we are on the right edge, we know this operation
+ // is equal to something.
+ if ((PBranch->TrueEdge && Predicate == CmpInst::ICMP_EQ) ||
+ (!PBranch->TrueEdge && Predicate == CmpInst::ICMP_NE)) {
+ addPredicateUsers(PI, I);
+ return createVariableOrConstant(FirstOp);
+ }
+ // Handle the special case of floating point.
+ if (((PBranch->TrueEdge && Predicate == CmpInst::FCMP_OEQ) ||
+ (!PBranch->TrueEdge && Predicate == CmpInst::FCMP_UNE)) &&
+ isa<ConstantFP>(FirstOp) && !cast<ConstantFP>(FirstOp)->isZero()) {
+ addPredicateUsers(PI, I);
+ return createConstantExpression(cast<Constant>(FirstOp));
+ }
+ }
+ return nullptr;
+}
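// A toy model (invented enum and struct, not PredicateInfo's types) of the
// equality rules used above: an assume of a true-when-equal predicate, the
// true edge of x == y, or the false edge of x != y all let the copy take the
// other operand's value.
enum class ToyPred { EQ, NE };

struct ToyPredicateInfo {
  ToyPred Pred;
  bool IsAssume;
  bool TrueEdge; // Only meaningful for branches.
};

bool impliesOperandsEqual(const ToyPredicateInfo &PI) {
  if (PI.IsAssume)
    return PI.Pred == ToyPred::EQ; // assume(x == y) always holds.
  return (PI.TrueEdge && PI.Pred == ToyPred::EQ) ||
         (!PI.TrueEdge && PI.Pred == ToyPred::NE);
}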
+
// Evaluate read only and pure calls, and create an expression result.
-const Expression *NewGVN::performSymbolicCallEvaluation(Instruction *I,
- const BasicBlock *B) {
+const Expression *NewGVN::performSymbolicCallEvaluation(Instruction *I) {
auto *CI = cast<CallInst>(I);
- if (AA->doesNotAccessMemory(CI))
- return createCallExpression(CI, nullptr, B);
- if (AA->onlyReadsMemory(CI)) {
+ if (auto *II = dyn_cast<IntrinsicInst>(I)) {
+ // Instrinsics with the returned attribute are copies of arguments.
+ if (auto *ReturnedValue = II->getReturnedArgOperand()) {
+ if (II->getIntrinsicID() == Intrinsic::ssa_copy)
+ if (const auto *Result = performSymbolicPredicateInfoEvaluation(I))
+ return Result;
+ return createVariableOrConstant(ReturnedValue);
+ }
+ }
+ if (AA->doesNotAccessMemory(CI)) {
+ return createCallExpression(CI, TOPClass->getMemoryLeader());
+ } else if (AA->onlyReadsMemory(CI)) {
MemoryAccess *DefiningAccess = MSSAWalker->getClobberingMemoryAccess(CI);
- return createCallExpression(CI, lookupMemoryAccessEquiv(DefiningAccess), B);
+ return createCallExpression(CI, DefiningAccess);
}
return nullptr;
}
-// Update the memory access equivalence table to say that From is equal to To,
+// Retrieve the memory class for a given MemoryAccess.
+CongruenceClass *NewGVN::getMemoryClass(const MemoryAccess *MA) const {
+ auto *Result = MemoryAccessToClass.lookup(MA);
+ assert(Result && "Should have found memory class");
+ return Result;
+}
+
+// Update the MemoryAccess equivalence table to say that From is equal to To,
// and return true if this is different from what already existed in the table.
-bool NewGVN::setMemoryAccessEquivTo(MemoryAccess *From, MemoryAccess *To) {
- DEBUG(dbgs() << "Setting " << *From << " equivalent to ");
- if (!To)
- DEBUG(dbgs() << "itself");
- else
- DEBUG(dbgs() << *To);
+bool NewGVN::setMemoryClass(const MemoryAccess *From,
+ CongruenceClass *NewClass) {
+ assert(NewClass &&
+ "Every MemoryAccess should be getting mapped to a non-null class");
+ DEBUG(dbgs() << "Setting " << *From);
+ DEBUG(dbgs() << " equivalent to congruence class ");
+ DEBUG(dbgs() << NewClass->getID() << " with current MemoryAccess leader ");
+ DEBUG(dbgs() << *NewClass->getMemoryLeader());
DEBUG(dbgs() << "\n");
- auto LookupResult = MemoryAccessEquiv.find(From);
+
+ auto LookupResult = MemoryAccessToClass.find(From);
bool Changed = false;
// If it's already in the table, see if the value changed.
- if (LookupResult != MemoryAccessEquiv.end()) {
- if (To && LookupResult->second != To) {
+ if (LookupResult != MemoryAccessToClass.end()) {
+ auto *OldClass = LookupResult->second;
+ if (OldClass != NewClass) {
+ // If this is a phi, we have to handle memory member updates.
+ if (auto *MP = dyn_cast<MemoryPhi>(From)) {
+ OldClass->memory_erase(MP);
+ NewClass->memory_insert(MP);
+ // This may have killed the class if it had no non-memory members
+ if (OldClass->getMemoryLeader() == From) {
+ if (OldClass->memory_empty()) {
+ OldClass->setMemoryLeader(nullptr);
+ } else {
+ OldClass->setMemoryLeader(getNextMemoryLeader(OldClass));
+ DEBUG(dbgs() << "Memory class leader change for class "
+ << OldClass->getID() << " to "
+ << *OldClass->getMemoryLeader()
+ << " due to removal of a memory member " << *From
+ << "\n");
+ markMemoryLeaderChangeTouched(OldClass);
+ }
+ }
+ }
// It wasn't equivalent before, and now it is.
- LookupResult->second = To;
- Changed = true;
- } else if (!To) {
- // It used to be equivalent to something, and now it's not.
- MemoryAccessEquiv.erase(LookupResult);
+ LookupResult->second = NewClass;
Changed = true;
}
- } else {
- assert(!To &&
- "Memory equivalence should never change from nothing to something");
}
return Changed;
}
+
+// Determine if a phi is cycle-free. That means the values in the phi don't
+// depend on any expressions that can change value as a result of the phi.
+// For example, a non-cycle free phi would be v = phi(0, v+1).
+bool NewGVN::isCycleFree(const PHINode *PN) {
+ // In order to compute cycle-freeness, we do SCC finding on the phi, and see
+ // what kind of SCC it ends up in. If it is a singleton, it is cycle-free.
+ // If it is not in a singleton, it is only cycle free if the other members are
+ // all phi nodes (as they do not compute anything, they are copies). TODO:
+ // There are likely a few other intrinsics or expressions that could be
+ // included here, but this happens so infrequently already that it is not
+ // likely to be worth it.
+ auto PCS = PhiCycleState.lookup(PN);
+ if (PCS == PCS_Unknown) {
+ SCCFinder.Start(PN);
+ auto &SCC = SCCFinder.getComponentFor(PN);
+ // It's cycle-free if its size is 1 or the SCC is *only* phi nodes.
+ if (SCC.size() == 1)
+ PhiCycleState.insert({PN, PCS_CycleFree});
+ else {
+ bool AllPhis =
+ llvm::all_of(SCC, [](const Value *V) { return isa<PHINode>(V); });
+ PCS = AllPhis ? PCS_CycleFree : PCS_Cycle;
+ for (auto *Member : SCC)
+ if (auto *MemberPhi = dyn_cast<PHINode>(Member))
+ PhiCycleState.insert({MemberPhi, PCS});
+ }
+ }
+ if (PCS == PCS_Cycle)
+ return false;
+ return true;
+}
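// A sketch of the classification above, assuming the SCC has already been
// computed by a standard SCC finder (the types here are invented): a phi is
// cycle-free iff its SCC is a singleton or contains only phis, which are
// pure copies and compute nothing new.
#include <algorithm>
#include <vector>

struct ToyNode { bool IsPhi; };

bool sccIsCycleFree(const std::vector<const ToyNode *> &SCC) {
  return SCC.size() == 1 ||
         std::all_of(SCC.begin(), SCC.end(),
                     [](const ToyNode *N) { return N->IsPhi; });
}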
+
// Evaluate PHI nodes symbolically, and create an expression result.
-const Expression *NewGVN::performSymbolicPHIEvaluation(Instruction *I,
- const BasicBlock *B) {
- auto *E = cast<PHIExpression>(createPHIExpression(I));
+const Expression *NewGVN::performSymbolicPHIEvaluation(Instruction *I) {
+ // True if one of the incoming phi edges is a backedge.
+ bool HasBackedge = false;
+ // AllConstant tracks whether all the *original* phi operands were constant.
+ // This is really shorthand for "this phi cannot cycle due to forward
+ // propagation", as any change in value of the phi is guaranteed not to later
+ // change the value of the phi. IE it can't be v = phi(undef, v+1).
+ bool AllConstant = true;
+ auto *E =
+ cast<PHIExpression>(createPHIExpression(I, HasBackedge, AllConstant));
// We match the semantics of SimplifyPhiNode from InstructionSimplify here.
// See if all arguments are the same.
@@ -861,14 +1464,15 @@ const Expression *NewGVN::performSymbolicPHIEvaluation(Instruction *I,
if (Filtered.begin() == Filtered.end()) {
DEBUG(dbgs() << "Simplified PHI node " << *I << " to undef"
<< "\n");
- E->deallocateOperands(ArgRecycler);
- ExpressionAllocator.Deallocate(E);
+ deleteExpression(E);
return createConstantExpression(UndefValue::get(I->getType()));
}
+ unsigned NumOps = 0;
Value *AllSameValue = *(Filtered.begin());
++Filtered.begin();
// Can't use std::equal here, sadly, because filter.begin moves.
- if (llvm::all_of(Filtered, [AllSameValue](const Value *V) {
+ if (llvm::all_of(Filtered, [AllSameValue, &NumOps](const Value *V) {
+ ++NumOps;
return V == AllSameValue;
})) {
// In LLVM's non-standard representation of phi nodes, it's possible to have
@@ -881,27 +1485,32 @@ const Expression *NewGVN::performSymbolicPHIEvaluation(Instruction *I,
// We also special case undef, so that if we have an undef, we can't use the
// common value unless it dominates the phi block.
if (HasUndef) {
+ // If we have undef and at least one other value, this is really a
+ // multivalued phi, and we need to know if it's cycle free in order to
+ // evaluate whether we can ignore the undef. The other parts of this are
+ // just shortcuts. If there is no backedge, or all operands are
+ // constants, or all operands are ignored but the undef, it also must be
+ // cycle free.
+ if (!AllConstant && HasBackedge && NumOps > 0 &&
+ !isa<UndefValue>(AllSameValue) && !isCycleFree(cast<PHINode>(I)))
+ return E;
+
// Only have to check for instructions
if (auto *AllSameInst = dyn_cast<Instruction>(AllSameValue))
- if (!DT->dominates(AllSameInst, I))
+ if (!someEquivalentDominates(AllSameInst, I))
return E;
}
NumGVNPhisAllSame++;
DEBUG(dbgs() << "Simplified PHI node " << *I << " to " << *AllSameValue
<< "\n");
- E->deallocateOperands(ArgRecycler);
- ExpressionAllocator.Deallocate(E);
- if (auto *C = dyn_cast<Constant>(AllSameValue))
- return createConstantExpression(C);
- return createVariableExpression(AllSameValue);
+ deleteExpression(E);
+ return createVariableOrConstant(AllSameValue);
}
return E;
}
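// A toy version of the filtering above (0 plays undef in this model; the
// dominance and cycle-freeness guards are elided): drop undef operands, and
// if every remaining operand agrees, the phi takes that value.
#include <optional>
#include <vector>

// Returns the single agreed value, 0 when everything was undef, or nullopt
// to keep the phi because it is genuinely multivalued.
std::optional<int> simplifyToyPhi(const std::vector<int> &Ops) {
  std::optional<int> Same;
  for (int Op : Ops) {
    if (Op == 0)
      continue; // Skip undef operands.
    if (Same && *Same != Op)
      return std::nullopt; // Truly multivalued: keep the phi.
    Same = Op;
  }
  return Same ? *Same : 0; // All agreed, or everything was undef.
}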
-const Expression *
-NewGVN::performSymbolicAggrValueEvaluation(Instruction *I,
- const BasicBlock *B) {
+const Expression *NewGVN::performSymbolicAggrValueEvaluation(Instruction *I) {
if (auto *EI = dyn_cast<ExtractValueInst>(I)) {
auto *II = dyn_cast<IntrinsicInst>(EI->getAggregateOperand());
if (II && EI->getNumIndices() == 1 && *EI->idx_begin() == 0) {
@@ -931,19 +1540,130 @@ NewGVN::performSymbolicAggrValueEvaluation(Instruction *I,
// expression.
assert(II->getNumArgOperands() == 2 &&
"Expect two args for recognised intrinsics.");
- return createBinaryExpression(Opcode, EI->getType(),
- II->getArgOperand(0),
- II->getArgOperand(1), B);
+ return createBinaryExpression(
+ Opcode, EI->getType(), II->getArgOperand(0), II->getArgOperand(1));
}
}
}
- return createAggregateValueExpression(I, B);
+ return createAggregateValueExpression(I);
+}
+const Expression *NewGVN::performSymbolicCmpEvaluation(Instruction *I) {
+ auto *CI = cast<CmpInst>(I);
+ // See if our operands are equal to those of a previous predicate, and if so,
+ // if it implies true or false.
+ auto Op0 = lookupOperandLeader(CI->getOperand(0));
+ auto Op1 = lookupOperandLeader(CI->getOperand(1));
+ auto OurPredicate = CI->getPredicate();
+ if (shouldSwapOperands(Op0, Op1)) {
+ std::swap(Op0, Op1);
+ OurPredicate = CI->getSwappedPredicate();
+ }
+
+ // Avoid processing the same info twice
+ const PredicateBase *LastPredInfo = nullptr;
+ // See if we know something about the comparison itself, like it is the target
+ // of an assume.
+ auto *CmpPI = PredInfo->getPredicateInfoFor(I);
+ if (dyn_cast_or_null<PredicateAssume>(CmpPI))
+ return createConstantExpression(ConstantInt::getTrue(CI->getType()));
+
+ if (Op0 == Op1) {
+ // This condition does not depend on predicates, no need to add users
+ if (CI->isTrueWhenEqual())
+ return createConstantExpression(ConstantInt::getTrue(CI->getType()));
+ else if (CI->isFalseWhenEqual())
+ return createConstantExpression(ConstantInt::getFalse(CI->getType()));
+ }
+
+ // NOTE: Because we are comparing both operands here and below, and using
+ // previous comparisons, we rely on the fact that predicateinfo knows to mark
+ // comparisons that use renamed operands as users of the earlier comparisons.
+ // It is *not* enough to just mark predicateinfo renamed operands as users of
+ // the earlier comparisons, because the *other* operand may have changed in a
+ // previous iteration.
+ // Example:
+ // icmp slt %a, %b
+ // %b.0 = ssa.copy(%b)
+ // false branch:
+ // icmp slt %c, %b.0
+
+ // %c and %a may start out equal, and thus, the code below will say the second
+ // icmp is false. %c may become equal to something else, and in that case the
+ // second icmp *must* be reexamined, but would not be if only the renamed
+ // operands are considered users of the icmp.
+
+ // *Currently* we only check one level of comparisons back, and only mark one
+ // level back as touched when changes happen. If you modify this code to look
+ // back farther through comparisons, you *must* mark the appropriate
+ // comparisons as users in PredicateInfo.cpp, or you will cause bugs.
+
+ // See if we know something just from the operands themselves.
+
+ // See if our operands have predicate info, so that we may be able to derive
+ // something from a previous comparison.
+ for (const auto &Op : CI->operands()) {
+ auto *PI = PredInfo->getPredicateInfoFor(Op);
+ if (const auto *PBranch = dyn_cast_or_null<PredicateBranch>(PI)) {
+ if (PI == LastPredInfo)
+ continue;
+ LastPredInfo = PI;
+
+ // TODO: Along the false edge, we may know more things too, like icmp of
+ // same operands is false.
+ // TODO: We only handle actual comparison conditions below, not and/or.
+ auto *BranchCond = dyn_cast<CmpInst>(PBranch->Condition);
+ if (!BranchCond)
+ continue;
+ auto *BranchOp0 = lookupOperandLeader(BranchCond->getOperand(0));
+ auto *BranchOp1 = lookupOperandLeader(BranchCond->getOperand(1));
+ auto BranchPredicate = BranchCond->getPredicate();
+ if (shouldSwapOperands(BranchOp0, BranchOp1)) {
+ std::swap(BranchOp0, BranchOp1);
+ BranchPredicate = BranchCond->getSwappedPredicate();
+ }
+ if (BranchOp0 == Op0 && BranchOp1 == Op1) {
+ if (PBranch->TrueEdge) {
+ // If we know the previous predicate is true and we are on the true
+ // edge, then we may be implied true or false.
+ if (CmpInst::isImpliedTrueByMatchingCmp(OurPredicate,
+ BranchPredicate)) {
+ addPredicateUsers(PI, I);
+ return createConstantExpression(
+ ConstantInt::getTrue(CI->getType()));
+ }
+
+ if (CmpInst::isImpliedFalseByMatchingCmp(OurPredicate,
+ BranchPredicate)) {
+ addPredicateUsers(PI, I);
+ return createConstantExpression(
+ ConstantInt::getFalse(CI->getType()));
+ }
+
+ } else {
+ // Just handle the ne and eq cases, where if we have the same
+ // operands, we may know something.
+ if (BranchPredicate == OurPredicate) {
+ addPredicateUsers(PI, I);
+ // Same predicate, same ops; we know it was false, so this is false.
+ return createConstantExpression(
+ ConstantInt::getFalse(CI->getType()));
+ } else if (BranchPredicate ==
+ CmpInst::getInversePredicate(OurPredicate)) {
+ addPredicateUsers(PI, I);
+ // Inverse predicate, we know the other was false, so this is true.
+ return createConstantExpression(
+ ConstantInt::getTrue(CI->getType()));
+ }
+ }
+ }
+ }
+ }
+ // createExpression will take care of simplifyCmpInst.
+ return createExpression(I);
}
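// A small worked instance (two predicates only, invented enum) of the
// implication rules above: if "a slt b" is known true on the taken edge, a
// later "a slt b" on the same operands folds to true, and "a sge b" (its
// inverse) folds to false; anything else stays unknown.
enum class CmpPred { SLT, SGE };

// Returns 1 (folds to true), 0 (folds to false), or -1 (unknown).
int foldByKnownCmp(CmpPred Ours, CmpPred KnownTrue) {
  if (Ours == KnownTrue)
    return 1; // Same predicate, same operands.
  if ((Ours == CmpPred::SGE && KnownTrue == CmpPred::SLT) ||
      (Ours == CmpPred::SLT && KnownTrue == CmpPred::SGE))
    return 0; // Inverse predicate of a known-true comparison.
  return -1;
}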
// Substitute and symbolize the value before value numbering.
-const Expression *NewGVN::performSymbolicEvaluation(Value *V,
- const BasicBlock *B) {
+const Expression *NewGVN::performSymbolicEvaluation(Value *V) {
const Expression *E = nullptr;
if (auto *C = dyn_cast<Constant>(V))
E = createConstantExpression(C);
@@ -957,24 +1677,27 @@ const Expression *NewGVN::performSymbolicEvaluation(Value *V,
switch (I->getOpcode()) {
case Instruction::ExtractValue:
case Instruction::InsertValue:
- E = performSymbolicAggrValueEvaluation(I, B);
+ E = performSymbolicAggrValueEvaluation(I);
break;
case Instruction::PHI:
- E = performSymbolicPHIEvaluation(I, B);
+ E = performSymbolicPHIEvaluation(I);
break;
case Instruction::Call:
- E = performSymbolicCallEvaluation(I, B);
+ E = performSymbolicCallEvaluation(I);
break;
case Instruction::Store:
- E = performSymbolicStoreEvaluation(I, B);
+ E = performSymbolicStoreEvaluation(I);
break;
case Instruction::Load:
- E = performSymbolicLoadEvaluation(I, B);
+ E = performSymbolicLoadEvaluation(I);
break;
case Instruction::BitCast: {
- E = createExpression(I, B);
+ E = createExpression(I);
+ } break;
+ case Instruction::ICmp:
+ case Instruction::FCmp: {
+ E = performSymbolicCmpEvaluation(I);
} break;
-
case Instruction::Add:
case Instruction::FAdd:
case Instruction::Sub:
@@ -993,8 +1716,6 @@ const Expression *NewGVN::performSymbolicEvaluation(Value *V,
case Instruction::And:
case Instruction::Or:
case Instruction::Xor:
- case Instruction::ICmp:
- case Instruction::FCmp:
case Instruction::Trunc:
case Instruction::ZExt:
case Instruction::SExt:
@@ -1011,7 +1732,7 @@ const Expression *NewGVN::performSymbolicEvaluation(Value *V,
case Instruction::InsertElement:
case Instruction::ShuffleVector:
case Instruction::GetElementPtr:
- E = createExpression(I, B);
+ E = createExpression(I);
break;
default:
return nullptr;
@@ -1020,129 +1741,297 @@ const Expression *NewGVN::performSymbolicEvaluation(Value *V,
return E;
}
-// There is an edge from 'Src' to 'Dst'. Return true if every path from
-// the entry block to 'Dst' passes via this edge. In particular 'Dst'
-// must not be reachable via another edge from 'Src'.
-bool NewGVN::isOnlyReachableViaThisEdge(const BasicBlockEdge &E) const {
-
- // While in theory it is interesting to consider the case in which Dst has
- // more than one predecessor, because Dst might be part of a loop which is
- // only reachable from Src, in practice it is pointless since at the time
- // GVN runs all such loops have preheaders, which means that Dst will have
- // been changed to have only one predecessor, namely Src.
- const BasicBlock *Pred = E.getEnd()->getSinglePredecessor();
- const BasicBlock *Src = E.getStart();
- assert((!Pred || Pred == Src) && "No edge between these basic blocks!");
- (void)Src;
- return Pred != nullptr;
-}
-
void NewGVN::markUsersTouched(Value *V) {
// Now mark the users as touched.
for (auto *User : V->users()) {
assert(isa<Instruction>(User) && "Use of value not within an instruction?");
- TouchedInstructions.set(InstrDFS[User]);
+ TouchedInstructions.set(InstrToDFSNum(User));
}
}
-void NewGVN::markMemoryUsersTouched(MemoryAccess *MA) {
- for (auto U : MA->users()) {
- if (auto *MUD = dyn_cast<MemoryUseOrDef>(U))
- TouchedInstructions.set(InstrDFS[MUD->getMemoryInst()]);
- else
- TouchedInstructions.set(InstrDFS[U]);
+void NewGVN::addMemoryUsers(const MemoryAccess *To, MemoryAccess *U) {
+ DEBUG(dbgs() << "Adding memory user " << *U << " to " << *To << "\n");
+ MemoryToUsers[To].insert(U);
+}
+
+void NewGVN::markMemoryDefTouched(const MemoryAccess *MA) {
+ TouchedInstructions.set(MemoryToDFSNum(MA));
+}
+
+void NewGVN::markMemoryUsersTouched(const MemoryAccess *MA) {
+ if (isa<MemoryUse>(MA))
+ return;
+ for (auto U : MA->users())
+ TouchedInstructions.set(MemoryToDFSNum(U));
+ const auto Result = MemoryToUsers.find(MA);
+ if (Result != MemoryToUsers.end()) {
+ for (auto *User : Result->second)
+ TouchedInstructions.set(MemoryToDFSNum(User));
+ MemoryToUsers.erase(Result);
+ }
+}
+
+// Add I to the set of users of a given predicate.
+void NewGVN::addPredicateUsers(const PredicateBase *PB, Instruction *I) {
+ if (auto *PBranch = dyn_cast<PredicateBranch>(PB))
+ PredicateToUsers[PBranch->Condition].insert(I);
+ else if (auto *PAssume = dyn_cast<PredicateAssume>(PB))
+ PredicateToUsers[PAssume->Condition].insert(I);
+}
+
+// Touch all the predicates that depend on this instruction.
+void NewGVN::markPredicateUsersTouched(Instruction *I) {
+ const auto Result = PredicateToUsers.find(I);
+ if (Result != PredicateToUsers.end()) {
+ for (auto *User : Result->second)
+ TouchedInstructions.set(InstrToDFSNum(User));
+ PredicateToUsers.erase(Result);
}
}
+// Mark users affected by a memory leader change.
+void NewGVN::markMemoryLeaderChangeTouched(CongruenceClass *CC) {
+ for (auto M : CC->memory())
+ markMemoryDefTouched(M);
+}
+
// Touch the instructions that need to be updated after a congruence class has a
// leader change, and mark changed values.
-void NewGVN::markLeaderChangeTouched(CongruenceClass *CC) {
- for (auto M : CC->Members) {
+void NewGVN::markValueLeaderChangeTouched(CongruenceClass *CC) {
+ for (auto M : *CC) {
if (auto *I = dyn_cast<Instruction>(M))
- TouchedInstructions.set(InstrDFS[I]);
+ TouchedInstructions.set(InstrToDFSNum(I));
LeaderChanges.insert(M);
}
}
+// Given a range of things that have instruction DFS numbers, this will return
+// the member of the range with the smallest DFS number.
+template <class T, class Range>
+T *NewGVN::getMinDFSOfRange(const Range &R) const {
+ std::pair<T *, unsigned> MinDFS = {nullptr, ~0U};
+ for (const auto X : R) {
+ auto DFSNum = InstrToDFSNum(X);
+ if (DFSNum < MinDFS.second)
+ MinDFS = {X, DFSNum};
+ }
+ return MinDFS.first;
+}
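// The same selection written against std::min_element (toy integer members
// and a hypothetical DFS-number map), for comparison; like the loop above, it
// assumes the range is non-empty.
#include <algorithm>
#include <unordered_map>
#include <vector>

int minDFSMember(const std::vector<int> &Members,
                 const std::unordered_map<int, unsigned> &DFSNum) {
  return *std::min_element(Members.begin(), Members.end(),
                           [&](int A, int B) {
                             return DFSNum.at(A) < DFSNum.at(B);
                           });
}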
+
+// This function returns the MemoryAccess that should be the next leader of
+// congruence class CC, under the assumption that the current leader is going to
+// disappear.
+const MemoryAccess *NewGVN::getNextMemoryLeader(CongruenceClass *CC) const {
+ // TODO: If this ends up too slow, we can maintain a next memory leader like we
+ // do for regular leaders.
+ // Make sure there will be a leader to find
+ assert((CC->getStoreCount() > 0 || !CC->memory_empty()) &&
+ "Can't get next leader if there is none");
+ if (CC->getStoreCount() > 0) {
+ if (auto *NL = dyn_cast_or_null<StoreInst>(CC->getNextLeader().first))
+ return MSSA->getMemoryAccess(NL);
+ // Find the store with the minimum DFS number.
+ auto *V = getMinDFSOfRange<Value>(make_filter_range(
+ *CC, [&](const Value *V) { return isa<StoreInst>(V); }));
+ return MSSA->getMemoryAccess(cast<StoreInst>(V));
+ }
+ assert(CC->getStoreCount() == 0);
+
+ // Given our assertion, hitting this part must mean
+ // !OldClass->memory_empty()
+ if (CC->memory_size() == 1)
+ return *CC->memory_begin();
+ return getMinDFSOfRange<const MemoryPhi>(CC->memory());
+}
+
+// This function returns the next value leader of a congruence class, under the
+// assumption that the current leader is going away. This should end up being
+// the next most dominating member.
+Value *NewGVN::getNextValueLeader(CongruenceClass *CC) const {
+ // We don't need to sort members if there is only 1, and we don't care about
+ // sorting the TOP class because everything either gets out of it or is
+ // unreachable.
+
+ if (CC->size() == 1 || CC == TOPClass) {
+ return *(CC->begin());
+ } else if (CC->getNextLeader().first) {
+ ++NumGVNAvoidedSortedLeaderChanges;
+ return CC->getNextLeader().first;
+ } else {
+ ++NumGVNSortedLeaderChanges;
+ // NOTE: If this ends up too slow, we can maintain a dual structure for
+ // member testing/insertion, or keep things mostly sorted, and sort only
+ // here, or use SparseBitVector or ....
+ return getMinDFSOfRange<Value>(*CC);
+ }
+}
+
+// Move a MemoryAccess, currently in OldClass, to NewClass, including updates to
+// the memory members, etc for the move.
+//
+// The invariants of this function are:
+//
+// I must be moving to NewClass from OldClass. The StoreCount of OldClass and
+// NewClass is expected to have been updated for I already if it is a store.
+// The OldClass memory leader has not been updated yet if I was the leader.
+void NewGVN::moveMemoryToNewCongruenceClass(Instruction *I,
+ MemoryAccess *InstMA,
+ CongruenceClass *OldClass,
+ CongruenceClass *NewClass) {
+ // If the leader is I, and we had a representative MemoryAccess, it should
+ // be the MemoryAccess of OldClass.
+ assert((!InstMA || !OldClass->getMemoryLeader() ||
+ OldClass->getLeader() != I ||
+ OldClass->getMemoryLeader() == InstMA) &&
+ "Representative MemoryAccess mismatch");
+ // First, see what happens to the new class
+ if (!NewClass->getMemoryLeader()) {
+ // Should be a new class, or a store becoming a leader of a new class.
+ assert(NewClass->size() == 1 ||
+ (isa<StoreInst>(I) && NewClass->getStoreCount() == 1));
+ NewClass->setMemoryLeader(InstMA);
+ // Mark it touched if we didn't just create a singleton
+ DEBUG(dbgs() << "Memory class leader change for class " << NewClass->getID()
+ << " due to new memory instruction becoming leader\n");
+ markMemoryLeaderChangeTouched(NewClass);
+ }
+ setMemoryClass(InstMA, NewClass);
+ // Now, fixup the old class if necessary
+ if (OldClass->getMemoryLeader() == InstMA) {
+ if (OldClass->getStoreCount() != 0 || !OldClass->memory_empty()) {
+ OldClass->setMemoryLeader(getNextMemoryLeader(OldClass));
+ DEBUG(dbgs() << "Memory class leader change for class "
+ << OldClass->getID() << " to "
+ << *OldClass->getMemoryLeader()
+ << " due to removal of old leader " << *InstMA << "\n");
+ markMemoryLeaderChangeTouched(OldClass);
+ } else
+ OldClass->setMemoryLeader(nullptr);
+ }
+}
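// The old-class fixup above, restated as a tiny decision function (toy
// state; the names are invented). It returns the replacement leader id, -1
// for "no memory leader left", or nullopt when the departing access was not
// the leader and nothing changes.
#include <optional>

std::optional<int> fixupOldMemoryLeader(bool DepartingWasLeader,
                                        int RemainingStoreCount,
                                        bool HasMemoryPhis,
                                        int NextBestLeader) {
  if (!DepartingWasLeader)
    return std::nullopt; // Leader unchanged.
  if (RemainingStoreCount > 0 || HasMemoryPhis)
    return NextBestLeader; // Promote the next most dominating candidate.
  return -1; // The class has no memory side left.
}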
+
// Move a value, currently in OldClass, to be part of NewClass
-// Update OldClass for the move (including changing leaders, etc)
-void NewGVN::moveValueToNewCongruenceClass(Instruction *I,
+// Update OldClass and NewClass for the move (including changing leaders, etc).
+void NewGVN::moveValueToNewCongruenceClass(Instruction *I, const Expression *E,
CongruenceClass *OldClass,
CongruenceClass *NewClass) {
- DEBUG(dbgs() << "New congruence class for " << I << " is " << NewClass->ID
- << "\n");
-
- if (I == OldClass->NextLeader.first)
- OldClass->NextLeader = {nullptr, ~0U};
+ if (I == OldClass->getNextLeader().first)
+ OldClass->resetNextLeader();
// It's possible, though unlikely, for us to discover equivalences such
// that the current leader does not dominate the old one.
// This statistic tracks how often this happens.
// We assert on phi nodes when this happens, currently, for debugging, because
// we want to make sure we name phi node cycles properly.
- if (isa<Instruction>(NewClass->RepLeader) && NewClass->RepLeader &&
- I != NewClass->RepLeader &&
- DT->properlyDominates(
- I->getParent(),
- cast<Instruction>(NewClass->RepLeader)->getParent())) {
- ++NumGVNNotMostDominatingLeader;
- assert(!isa<PHINode>(I) &&
- "New class for instruction should not be dominated by instruction");
- }
-
- if (NewClass->RepLeader != I) {
- auto DFSNum = InstrDFS.lookup(I);
- if (DFSNum < NewClass->NextLeader.second)
- NewClass->NextLeader = {I, DFSNum};
+ if (isa<Instruction>(NewClass->getLeader()) && NewClass->getLeader() &&
+ I != NewClass->getLeader()) {
+ auto *IBB = I->getParent();
+ auto *NCBB = cast<Instruction>(NewClass->getLeader())->getParent();
+ bool Dominated =
+ IBB == NCBB && InstrToDFSNum(I) < InstrToDFSNum(NewClass->getLeader());
+ Dominated = Dominated || DT->properlyDominates(IBB, NCBB);
+ if (Dominated) {
+ ++NumGVNNotMostDominatingLeader;
+ assert(
+ !isa<PHINode>(I) &&
+ "New class for instruction should not be dominated by instruction");
+ }
}
- OldClass->Members.erase(I);
- NewClass->Members.insert(I);
- if (isa<StoreInst>(I)) {
- --OldClass->StoreCount;
- assert(OldClass->StoreCount >= 0);
- ++NewClass->StoreCount;
- assert(NewClass->StoreCount > 0);
+ if (NewClass->getLeader() != I)
+ NewClass->addPossibleNextLeader({I, InstrToDFSNum(I)});
+
+ OldClass->erase(I);
+ NewClass->insert(I);
+ // Handle our special casing of stores.
+ if (auto *SI = dyn_cast<StoreInst>(I)) {
+ OldClass->decStoreCount();
+ // Okay, so when do we want to make a store a leader of a class?
+ // If we have a store defined by an earlier load, we want the earlier load
+ // to lead the class.
+ // If we have a store defined by something else, we want the store to lead
+ // the class so everything else gets the "something else" as a value.
+ // If we have a store as the single member of the class, we want the store
+ // as the leader
+ if (NewClass->getStoreCount() == 0 && !NewClass->getStoredValue()) {
+ // If it's a store expression we are using, it means we are not equivalent
+ // to something earlier.
+ if (isa<StoreExpression>(E)) {
+ assert(lookupOperandLeader(SI->getValueOperand()) !=
+ NewClass->getLeader());
+ NewClass->setStoredValue(lookupOperandLeader(SI->getValueOperand()));
+ markValueLeaderChangeTouched(NewClass);
+ // Shift the new class leader to be the store
+ DEBUG(dbgs() << "Changing leader of congruence class "
+ << NewClass->getID() << " from " << *NewClass->getLeader()
+ << " to " << *SI << " because store joined class\n");
+ // If we changed the leader, we have to mark it changed because we don't
+ // know what it will do to symbolic evaluation.
+ NewClass->setLeader(SI);
+ }
+ // We rely on the code below handling the MemoryAccess change.
+ }
+ NewClass->incStoreCount();
}
-
+ // True if there are no memory instructions left in a class that had memory
+ // instructions before.
+
+ // If it's not a memory use, set the MemoryAccess equivalence
+ auto *InstMA = dyn_cast_or_null<MemoryDef>(MSSA->getMemoryAccess(I));
+ bool InstWasMemoryLeader = InstMA && OldClass->getMemoryLeader() == InstMA;
+ if (InstMA)
+ moveMemoryToNewCongruenceClass(I, InstMA, OldClass, NewClass);
ValueToClass[I] = NewClass;
// See if we destroyed the class or need to swap leaders.
- if (OldClass->Members.empty() && OldClass != InitialClass) {
- if (OldClass->DefiningExpr) {
- OldClass->Dead = true;
- DEBUG(dbgs() << "Erasing expression " << OldClass->DefiningExpr
+ if (OldClass->empty() && OldClass != TOPClass) {
+ if (OldClass->getDefiningExpr()) {
+ DEBUG(dbgs() << "Erasing expression " << OldClass->getDefiningExpr()
<< " from table\n");
- ExpressionToClass.erase(OldClass->DefiningExpr);
+ ExpressionToClass.erase(OldClass->getDefiningExpr());
}
- } else if (OldClass->RepLeader == I) {
+ } else if (OldClass->getLeader() == I) {
// When the leader changes, the value numbering of
// everything may change due to symbolization changes, so we need to
// reprocess.
- DEBUG(dbgs() << "Leader change!\n");
+ DEBUG(dbgs() << "Value class leader change for class " << OldClass->getID()
+ << "\n");
++NumGVNLeaderChanges;
- // We don't need to sort members if there is only 1, and we don't care about
- // sorting the initial class because everything either gets out of it or is
- // unreachable.
- if (OldClass->Members.size() == 1 || OldClass == InitialClass) {
- OldClass->RepLeader = *(OldClass->Members.begin());
- } else if (OldClass->NextLeader.first) {
- ++NumGVNAvoidedSortedLeaderChanges;
- OldClass->RepLeader = OldClass->NextLeader.first;
- OldClass->NextLeader = {nullptr, ~0U};
- } else {
- ++NumGVNSortedLeaderChanges;
- // TODO: If this ends up to slow, we can maintain a dual structure for
- // member testing/insertion, or keep things mostly sorted, and sort only
- // here, or ....
- std::pair<Value *, unsigned> MinDFS = {nullptr, ~0U};
- for (const auto X : OldClass->Members) {
- auto DFSNum = InstrDFS.lookup(X);
- if (DFSNum < MinDFS.second)
- MinDFS = {X, DFSNum};
- }
- OldClass->RepLeader = MinDFS.first;
+ // Destroy the stored value if there are no more stores to represent it.
+ // Note that this is basically clean up for the expression removal that
+ // happens below. If we remove stores from a class, we may leave it as a
+ // class of equivalent memory phis.
+ if (OldClass->getStoreCount() == 0) {
+ if (OldClass->getStoredValue())
+ OldClass->setStoredValue(nullptr);
+ }
+ // If we destroy the old access leader and it's a store, we have to
+ // effectively destroy the congruence class. When it comes to scalars,
+ // anything with the same value is as good as any other. That means that
+ // one leader is as good as another, and as long as you have some leader for
+ // the value, you are good. When it comes to *memory states*, only one
+ // particular thing really represents the definition of a given memory
+ // state. Once it goes away, we need to re-evaluate which pieces of memory
+ // are really still equivalent. The best way to do this is to re-value
+ // number things. The only way to really make that happen is to destroy the
+ // rest of the class. In order to effectively destroy the class, we reset
+ // ExpressionToClass for each by using the ValueToExpression mapping. The
+ // members later get marked as touched due to the leader change. We will
+ // create new congruence classes, and the pieces that are still equivalent
+ // will end back together in a new class. If this becomes too expensive, it
+ // is possible to use a versioning scheme for the congruence classes to
+ // avoid the expressions finding this old class. Note that the situation is
+ // different for memory phis, because they are evaluated anew each time, and
+ // they become equal not by hashing, but by seeing if all operands are the
+ // same (or only one is reachable).
+ if (OldClass->getStoreCount() > 0 && InstWasMemoryLeader) {
+ DEBUG(dbgs() << "Kicking everything out of class " << OldClass->getID()
+ << " because MemoryAccess leader changed");
+ for (auto Member : *OldClass)
+ ExpressionToClass.erase(ValueToExpression.lookup(Member));
}
- markLeaderChangeTouched(OldClass);
+ OldClass->setLeader(getNextValueLeader(OldClass));
+ OldClass->resetNextLeader();
+ markValueLeaderChangeTouched(OldClass);
}
}
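// A compressed sketch (toy class, value side only, invented names) of the
// bookkeeping order above: move the member, keep store counts in step, and
// only afterwards decide whether the old class needs its next leader.
#include <set>

struct ToyCC {
  std::set<int> Members;
  int Leader = -1;
  int StoreCount = 0;
};

void moveMember(int I, bool IsStore, ToyCC &Old, ToyCC &New,
                int (*NextLeader)(const ToyCC &)) {
  Old.Members.erase(I);
  New.Members.insert(I);
  if (IsStore) {
    --Old.StoreCount;
    ++New.StoreCount;
  }
  if (!Old.Members.empty() && Old.Leader == I)
    Old.Leader = NextLeader(Old); // The leader left: promote the next best.
}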
@@ -1150,12 +2039,12 @@ void NewGVN::moveValueToNewCongruenceClass(Instruction *I,
void NewGVN::performCongruenceFinding(Instruction *I, const Expression *E) {
ValueToExpression[I] = E;
// This is guaranteed to return something, since it will at least find
- // INITIAL.
+ // TOP.
CongruenceClass *IClass = ValueToClass[I];
assert(IClass && "Should have found a IClass");
// Dead classes should have been eliminated from the mapping.
- assert(!IClass->Dead && "Found a dead class");
+ assert(!IClass->isDead() && "Found a dead class");
CongruenceClass *EClass;
if (const auto *VE = dyn_cast<VariableExpression>(E)) {
@@ -1171,79 +2060,52 @@ void NewGVN::performCongruenceFinding(Instruction *I, const Expression *E) {
// Constants and variables should always be made the leader.
if (const auto *CE = dyn_cast<ConstantExpression>(E)) {
- NewClass->RepLeader = CE->getConstantValue();
+ NewClass->setLeader(CE->getConstantValue());
} else if (const auto *SE = dyn_cast<StoreExpression>(E)) {
StoreInst *SI = SE->getStoreInst();
- NewClass->RepLeader =
- lookupOperandLeader(SI->getValueOperand(), SI, SI->getParent());
+ NewClass->setLeader(SI);
+ NewClass->setStoredValue(lookupOperandLeader(SI->getValueOperand()));
+ // The RepMemoryAccess field will be filled in properly by the
+ // moveValueToNewCongruenceClass call.
} else {
- NewClass->RepLeader = I;
+ NewClass->setLeader(I);
}
assert(!isa<VariableExpression>(E) &&
"VariableExpression should have been handled already");
EClass = NewClass;
DEBUG(dbgs() << "Created new congruence class for " << *I
- << " using expression " << *E << " at " << NewClass->ID
- << " and leader " << *(NewClass->RepLeader) << "\n");
- DEBUG(dbgs() << "Hash value was " << E->getHashValue() << "\n");
+ << " using expression " << *E << " at " << NewClass->getID()
+ << " and leader " << *(NewClass->getLeader()));
+ if (NewClass->getStoredValue())
+ DEBUG(dbgs() << " and stored value " << *(NewClass->getStoredValue()));
+ DEBUG(dbgs() << "\n");
} else {
EClass = lookupResult.first->second;
if (isa<ConstantExpression>(E))
- assert(isa<Constant>(EClass->RepLeader) &&
+ assert((isa<Constant>(EClass->getLeader()) ||
+ (EClass->getStoredValue() &&
+ isa<Constant>(EClass->getStoredValue()))) &&
"Any class with a constant expression should have a "
"constant leader");
assert(EClass && "Somehow don't have an eclass");
- assert(!EClass->Dead && "We accidentally looked up a dead class");
+ assert(!EClass->isDead() && "We accidentally looked up a dead class");
}
}
bool ClassChanged = IClass != EClass;
bool LeaderChanged = LeaderChanges.erase(I);
if (ClassChanged || LeaderChanged) {
- DEBUG(dbgs() << "Found class " << EClass->ID << " for expression " << E
+ DEBUG(dbgs() << "New class " << EClass->getID() << " for expression " << *E
<< "\n");
-
if (ClassChanged)
- moveValueToNewCongruenceClass(I, IClass, EClass);
+ moveValueToNewCongruenceClass(I, E, IClass, EClass);
markUsersTouched(I);
- if (MemoryAccess *MA = MSSA->getMemoryAccess(I)) {
- // If this is a MemoryDef, we need to update the equivalence table. If
- // we determined the expression is congruent to a different memory
- // state, use that different memory state. If we determined it didn't,
- // we update that as well. Right now, we only support store
- // expressions.
- if (!isa<MemoryUse>(MA) && isa<StoreExpression>(E) &&
- EClass->Members.size() != 1) {
- auto *DefAccess = cast<StoreExpression>(E)->getDefiningAccess();
- setMemoryAccessEquivTo(MA, DefAccess != MA ? DefAccess : nullptr);
- } else {
- setMemoryAccessEquivTo(MA, nullptr);
- }
+ if (MemoryAccess *MA = MSSA->getMemoryAccess(I))
markMemoryUsersTouched(MA);
- }
- } else if (auto *SI = dyn_cast<StoreInst>(I)) {
- // There is, sadly, one complicating thing for stores. Stores do not
- // produce values, only consume them. However, in order to make loads and
- // stores value number the same, we ignore the value operand of the store.
- // But the value operand will still be the leader of our class, and thus, it
- // may change. Because the store is a use, the store will get reprocessed,
- // but nothing will change about it, and so nothing above will catch it
- // (since the class will not change). In order to make sure everything ends
- // up okay, we need to recheck the leader of the class. Since stores of
- // different values value number differently due to different memorydefs, we
- // are guaranteed the leader is always the same between stores in the same
- // class.
- DEBUG(dbgs() << "Checking store leader\n");
- auto ProperLeader =
- lookupOperandLeader(SI->getValueOperand(), SI, SI->getParent());
- if (EClass->RepLeader != ProperLeader) {
- DEBUG(dbgs() << "Store leader changed, fixing\n");
- EClass->RepLeader = ProperLeader;
- markLeaderChangeTouched(EClass);
- markMemoryUsersTouched(MSSA->getMemoryAccess(SI));
- }
+ if (auto *CI = dyn_cast<CmpInst>(I))
+ markPredicateUsersTouched(CI);
}
}
@@ -1267,11 +2129,11 @@ void NewGVN::updateReachableEdge(BasicBlock *From, BasicBlock *To) {
// they are the only thing that depend on new edges. Anything using their
// values will get propagated to if necessary.
if (MemoryAccess *MemPhi = MSSA->getMemoryAccess(To))
- TouchedInstructions.set(InstrDFS[MemPhi]);
+ TouchedInstructions.set(InstrToDFSNum(MemPhi));
auto BI = To->begin();
while (isa<PHINode>(BI)) {
- TouchedInstructions.set(InstrDFS[&*BI]);
+ TouchedInstructions.set(InstrToDFSNum(&*BI));
++BI;
}
}
@@ -1280,8 +2142,8 @@ void NewGVN::updateReachableEdge(BasicBlock *From, BasicBlock *To) {
// Given a predicate condition (from a switch, cmp, or whatever) and a block,
// see if we know some constant value for it already.
-Value *NewGVN::findConditionEquivalence(Value *Cond, BasicBlock *B) const {
- auto Result = lookupOperandLeader(Cond, nullptr, B);
+Value *NewGVN::findConditionEquivalence(Value *Cond) const {
+ auto Result = lookupOperandLeader(Cond);
if (isa<Constant>(Result))
return Result;
return nullptr;
@@ -1293,10 +2155,10 @@ void NewGVN::processOutgoingEdges(TerminatorInst *TI, BasicBlock *B) {
BranchInst *BR;
if ((BR = dyn_cast<BranchInst>(TI)) && BR->isConditional()) {
Value *Cond = BR->getCondition();
- Value *CondEvaluated = findConditionEquivalence(Cond, B);
+ Value *CondEvaluated = findConditionEquivalence(Cond);
if (!CondEvaluated) {
if (auto *I = dyn_cast<Instruction>(Cond)) {
- const Expression *E = createExpression(I, B);
+ const Expression *E = createExpression(I);
if (const auto *CE = dyn_cast<ConstantExpression>(E)) {
CondEvaluated = CE->getConstantValue();
}
@@ -1329,13 +2191,13 @@ void NewGVN::processOutgoingEdges(TerminatorInst *TI, BasicBlock *B) {
SmallDenseMap<BasicBlock *, unsigned, 16> SwitchEdges;
Value *SwitchCond = SI->getCondition();
- Value *CondEvaluated = findConditionEquivalence(SwitchCond, B);
+ Value *CondEvaluated = findConditionEquivalence(SwitchCond);
// See if we were able to turn this switch statement into a constant.
if (CondEvaluated && isa<ConstantInt>(CondEvaluated)) {
auto *CondVal = cast<ConstantInt>(CondEvaluated);
// We should be able to get case value for this.
- auto CaseVal = SI->findCaseValue(CondVal);
- if (CaseVal.getCaseSuccessor() == SI->getDefaultDest()) {
+ auto Case = *SI->findCaseValue(CondVal);
+ if (Case.getCaseSuccessor() == SI->getDefaultDest()) {
// We proved the value is outside of the range of the case.
// We can't do anything other than mark the default dest as reachable,
// and go home.
@@ -1343,7 +2205,7 @@ void NewGVN::processOutgoingEdges(TerminatorInst *TI, BasicBlock *B) {
return;
}
// Now get where it goes and mark it reachable.
- BasicBlock *TargetBlock = CaseVal.getCaseSuccessor();
+ BasicBlock *TargetBlock = Case.getCaseSuccessor();
updateReachableEdge(B, TargetBlock);
} else {
for (unsigned i = 0, e = SI->getNumSuccessors(); i != e; ++i) {
@@ -1361,45 +2223,66 @@ void NewGVN::processOutgoingEdges(TerminatorInst *TI, BasicBlock *B) {
}
// This also may be a memory defining terminator, in which case, set it
- // equivalent to nothing.
- if (MemoryAccess *MA = MSSA->getMemoryAccess(TI))
- setMemoryAccessEquivTo(MA, nullptr);
+ // equivalent only to itself.
+ //
+ auto *MA = MSSA->getMemoryAccess(TI);
+ if (MA && !isa<MemoryUse>(MA)) {
+ auto *CC = ensureLeaderOfMemoryClass(MA);
+ if (setMemoryClass(MA, CC))
+ markMemoryUsersTouched(MA);
+ }
}
}
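// A toy instance (plain ints for case values and block ids) of the switch
// pruning above: with a constant leader for the condition, only the matching
// case's successor, or the default when nothing matches, becomes reachable.
#include <map>

int reachableSwitchSuccessor(int CondLeader,
                             const std::map<int, int> &CaseToSucc,
                             int DefaultSucc) {
  auto It = CaseToSucc.find(CondLeader);
  return It == CaseToSucc.end() ? DefaultSucc : It->second;
}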
-// The algorithm initially places the values of the routine in the INITIAL
-// congruence
-// class. The leader of INITIAL is the undetermined value `TOP`.
-// When the algorithm has finished, values still in INITIAL are unreachable.
+// The algorithm initially places the values of the routine in the TOP
+// congruence class. The leader of TOP is the undetermined value `undef`.
+// When the algorithm has finished, values still in TOP are unreachable.
void NewGVN::initializeCongruenceClasses(Function &F) {
- // FIXME now i can't remember why this is 2
- NextCongruenceNum = 2;
- // Initialize all other instructions to be in INITIAL class.
- CongruenceClass::MemberSet InitialValues;
- InitialClass = createCongruenceClass(nullptr, nullptr);
+ NextCongruenceNum = 0;
+
+ // Note that even though we use the live on entry def as a representative
+ // MemoryAccess, it is *not* the same as the actual live on entry def. We
+ // have no real equivalent to undef for MemoryAccesses, and so we really
+ // should be checking whether the MemoryAccess is top if we want to know if it
+ // is equivalent to everything. Otherwise, what this really signifies is that
+ // the access "reaches all the way back to the beginning of the function".
+
+ // Initialize all other instructions to be in TOP class.
+ TOPClass = createCongruenceClass(nullptr, nullptr);
+ TOPClass->setMemoryLeader(MSSA->getLiveOnEntryDef());
+ // The live on entry def gets put into its own class.
+ MemoryAccessToClass[MSSA->getLiveOnEntryDef()] =
+ createMemoryClass(MSSA->getLiveOnEntryDef());
+
for (auto &B : F) {
- if (auto *MP = MSSA->getMemoryAccess(&B))
- MemoryAccessEquiv.insert({MP, MSSA->getLiveOnEntryDef()});
+ // All MemoryAccesses are equivalent to live on entry to start. They must
+ // be initialized to something so that initial changes are noticed. For
+ // the maximal answer, we initialize them all to be the same as
+ // liveOnEntry.
+ auto *MemoryBlockDefs = MSSA->getBlockDefs(&B);
+ if (MemoryBlockDefs)
+ for (const auto &Def : *MemoryBlockDefs) {
+ MemoryAccessToClass[&Def] = TOPClass;
+ auto *MD = dyn_cast<MemoryDef>(&Def);
+ // Insert the memory phis into the member list.
+ if (!MD) {
+ const MemoryPhi *MP = cast<MemoryPhi>(&Def);
+ TOPClass->memory_insert(MP);
+ MemoryPhiState.insert({MP, MPS_TOP});
+ }
- for (auto &I : B) {
- InitialValues.insert(&I);
- ValueToClass[&I] = InitialClass;
- // All memory accesses are equivalent to live on entry to start. They must
- // be initialized to something so that initial changes are noticed. For
- // the maximal answer, we initialize them all to be the same as
- // liveOnEntry. Note that to save time, we only initialize the
- // MemoryDef's for stores and all MemoryPhis to be equal. Right now, no
- // other expression can generate a memory equivalence. If we start
- // handling memcpy/etc, we can expand this.
- if (isa<StoreInst>(&I)) {
- MemoryAccessEquiv.insert(
- {MSSA->getMemoryAccess(&I), MSSA->getLiveOnEntryDef()});
- ++InitialClass->StoreCount;
- assert(InitialClass->StoreCount > 0);
+ if (MD && isa<StoreInst>(MD->getMemoryInst()))
+ TOPClass->incStoreCount();
}
+ for (auto &I : B) {
+ // Don't insert void terminators into the class. We don't value number
+ // them, and they just end up sitting in TOP.
+ if (isa<TerminatorInst>(I) && I.getType()->isVoidTy())
+ continue;
+ TOPClass->insert(&I);
+ ValueToClass[&I] = TOPClass;
}
}
- InitialClass->Members.swap(InitialValues);
// Initialize arguments to be in their own unique congruence classes
for (auto &FA : F.args())
@@ -1408,8 +2291,8 @@ void NewGVN::initializeCongruenceClasses(Function &F) {
void NewGVN::cleanupTables() {
for (unsigned i = 0, e = CongruenceClasses.size(); i != e; ++i) {
- DEBUG(dbgs() << "Congruence class " << CongruenceClasses[i]->ID << " has "
- << CongruenceClasses[i]->Members.size() << " members\n");
+ DEBUG(dbgs() << "Congruence class " << CongruenceClasses[i]->getID()
+ << " has " << CongruenceClasses[i]->size() << " members\n");
// Make sure we delete the congruence class (probably worth switching to
// a unique_ptr at some point).
delete CongruenceClasses[i];
@@ -1427,15 +2310,14 @@ void NewGVN::cleanupTables() {
#ifndef NDEBUG
ProcessedCount.clear();
#endif
- DFSDomMap.clear();
InstrDFS.clear();
InstructionsToErase.clear();
-
DFSToInstr.clear();
BlockInstRange.clear();
TouchedInstructions.clear();
- DominatedInstRange.clear();
- MemoryAccessEquiv.clear();
+ MemoryAccessToClass.clear();
+ PredicateToUsers.clear();
+ MemoryToUsers.clear();
}
std::pair<unsigned, unsigned> NewGVN::assignDFSNumbers(BasicBlock *B,
@@ -1447,6 +2329,16 @@ std::pair<unsigned, unsigned> NewGVN::assignDFSNumbers(BasicBlock *B,
}
for (auto &I : *B) {
+ // There's no need to call isInstructionTriviallyDead more than once on
+ // an instruction. Therefore, once we know that an instruction is dead,
+ // we change its DFS number so that it doesn't get value numbered.
+ if (isInstructionTriviallyDead(&I, TLI)) {
+ InstrDFS[&I] = 0;
+ DEBUG(dbgs() << "Skipping trivially dead instruction " << I << "\n");
+ markInstructionForDeletion(&I);
+ continue;
+ }
+
InstrDFS[&I] = End++;
DFSToInstr.emplace_back(&I);
}
@@ -1462,7 +2354,7 @@ void NewGVN::updateProcessedCount(Value *V) {
if (ProcessedCount.count(V) == 0) {
ProcessedCount.insert({V, 1});
} else {
- ProcessedCount[V] += 1;
+ ++ProcessedCount[V];
assert(ProcessedCount[V] < 100 &&
"Seem to have processed the same Value a lot");
}
@@ -1472,26 +2364,33 @@ void NewGVN::updateProcessedCount(Value *V) {
void NewGVN::valueNumberMemoryPhi(MemoryPhi *MP) {
// If all the arguments are the same, the MemoryPhi has the same value as the
// argument.
- // Filter out unreachable blocks from our operands.
+ // Filter out unreachable blocks and self phis from our operands.
+ const BasicBlock *PHIBlock = MP->getBlock();
auto Filtered = make_filter_range(MP->operands(), [&](const Use &U) {
- return ReachableBlocks.count(MP->getIncomingBlock(U));
+ return lookupMemoryLeader(cast<MemoryAccess>(U)) != MP &&
+ !isMemoryAccessTop(cast<MemoryAccess>(U)) &&
+ ReachableEdges.count({MP->getIncomingBlock(U), PHIBlock});
});
-
- assert(Filtered.begin() != Filtered.end() &&
- "We should not be processing a MemoryPhi in a completely "
- "unreachable block");
+ // If all that is left is nothing, our MemoryPhi is undef. We keep it in
+ // TOPClass. Note: The only case where this should happen is if we have at
+ // least one self-argument.
+ if (Filtered.begin() == Filtered.end()) {
+ if (setMemoryClass(MP, TOPClass))
+ markMemoryUsersTouched(MP);
+ return;
+ }
// Transform the remaining operands into operand leaders.
// FIXME: mapped_iterator should have a range version.
auto LookupFunc = [&](const Use &U) {
- return lookupMemoryAccessEquiv(cast<MemoryAccess>(U));
+ return lookupMemoryLeader(cast<MemoryAccess>(U));
};
auto MappedBegin = map_iterator(Filtered.begin(), LookupFunc);
auto MappedEnd = map_iterator(Filtered.end(), LookupFunc);
// And now check if all the elements are equal.
// Sadly, we can't use std::equal since these are random access iterators.
- MemoryAccess *AllSameValue = *MappedBegin;
+ const auto *AllSameValue = *MappedBegin;
++MappedBegin;
bool AllEqual = std::all_of(
MappedBegin, MappedEnd,
@@ -1501,8 +2400,18 @@ void NewGVN::valueNumberMemoryPhi(MemoryPhi *MP) {
DEBUG(dbgs() << "Memory Phi value numbered to " << *AllSameValue << "\n");
else
DEBUG(dbgs() << "Memory Phi value numbered to itself\n");
-
- if (setMemoryAccessEquivTo(MP, AllEqual ? AllSameValue : nullptr))
+ // If it's equal to something, it's in that class. Otherwise, it has to be in
+ // a class where it is the leader (other things may be equivalent to it, but
+ // it needs to start off in its own class, which means it must have been the
+ // leader, and it can't have stopped being the leader because it was never
+ // removed).
+ CongruenceClass *CC =
+ AllEqual ? getMemoryClass(AllSameValue) : ensureLeaderOfMemoryClass(MP);
+ auto OldState = MemoryPhiState.lookup(MP);
+ assert(OldState != MPS_Invalid && "Invalid memory phi state");
+ auto NewState = AllEqual ? MPS_Equivalent : MPS_Unique;
+ MemoryPhiState[MP] = NewState;
+ if (setMemoryClass(MP, CC) || OldState != NewState)
markMemoryUsersTouched(MP);
}
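The MemoryPhi logic above boils down to one question: after filtering out unreachable, TOP, and self operands, do the survivors all share one leader? A self-contained sketch of that check, detached from MemorySSA (the names are illustrative, not from the patch):

// Sketch: return the common leader of a filtered operand list, or nullptr
// when the operands disagree and the phi must keep its own class.
#include <algorithm>
#include <vector>

template <typename T>
static const T *commonLeader(const std::vector<const T *> &Leaders) {
  if (Leaders.empty())
    return nullptr; // nothing reachable feeds the phi
  const T *First = Leaders.front();
  bool AllEqual = std::all_of(Leaders.begin(), Leaders.end(),
                              [&](const T *L) { return L == First; });
  return AllEqual ? First : nullptr;
}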
@@ -1510,21 +2419,25 @@ void NewGVN::valueNumberMemoryPhi(MemoryPhi *MP) {
// congruence finding, and updating mappings.
void NewGVN::valueNumberInstruction(Instruction *I) {
DEBUG(dbgs() << "Processing instruction " << *I << "\n");
- if (isInstructionTriviallyDead(I, TLI)) {
- DEBUG(dbgs() << "Skipping unused instruction\n");
- markInstructionForDeletion(I);
- return;
- }
if (!I->isTerminator()) {
- const auto *Symbolized = performSymbolicEvaluation(I, I->getParent());
+ const Expression *Symbolized = nullptr;
+ if (DebugCounter::shouldExecute(VNCounter)) {
+ Symbolized = performSymbolicEvaluation(I);
+ } else {
+ // Mark the instruction as unused so we don't value number it again.
+ InstrDFS[I] = 0;
+ }
// If we couldn't come up with a symbolic expression, use the unknown
// expression
- if (Symbolized == nullptr)
+ if (Symbolized == nullptr) {
Symbolized = createUnknownExpression(I);
+ }
+
performCongruenceFinding(I, Symbolized);
} else {
// Handle terminators that return values. All of them produce values we
- // don't currently understand.
+ // don't currently understand. We don't place non-value producing
+ // terminators in a class.
if (!I->getType()->isVoidTy()) {
auto *Symbolized = createUnknownExpression(I);
performCongruenceFinding(I, Symbolized);
@@ -1539,72 +2452,102 @@ bool NewGVN::singleReachablePHIPath(const MemoryAccess *First,
const MemoryAccess *Second) const {
if (First == Second)
return true;
-
- if (auto *FirstDef = dyn_cast<MemoryUseOrDef>(First)) {
- auto *DefAccess = FirstDef->getDefiningAccess();
- return singleReachablePHIPath(DefAccess, Second);
- } else {
- auto *MP = cast<MemoryPhi>(First);
- auto ReachableOperandPred = [&](const Use &U) {
- return ReachableBlocks.count(MP->getIncomingBlock(U));
- };
- auto FilteredPhiArgs =
- make_filter_range(MP->operands(), ReachableOperandPred);
- SmallVector<const Value *, 32> OperandList;
- std::copy(FilteredPhiArgs.begin(), FilteredPhiArgs.end(),
- std::back_inserter(OperandList));
- bool Okay = OperandList.size() == 1;
- if (!Okay)
- Okay = std::equal(OperandList.begin(), OperandList.end(),
- OperandList.begin());
- if (Okay)
- return singleReachablePHIPath(cast<MemoryAccess>(OperandList[0]), Second);
+ if (MSSA->isLiveOnEntryDef(First))
return false;
+
+ const auto *EndDef = First;
+ for (auto *ChainDef : optimized_def_chain(First)) {
+ if (ChainDef == Second)
+ return true;
+ if (MSSA->isLiveOnEntryDef(ChainDef))
+ return false;
+ EndDef = ChainDef;
}
+ auto *MP = cast<MemoryPhi>(EndDef);
+ auto ReachableOperandPred = [&](const Use &U) {
+ return ReachableEdges.count({MP->getIncomingBlock(U), MP->getBlock()});
+ };
+ auto FilteredPhiArgs =
+ make_filter_range(MP->operands(), ReachableOperandPred);
+ SmallVector<const Value *, 32> OperandList;
+ std::copy(FilteredPhiArgs.begin(), FilteredPhiArgs.end(),
+ std::back_inserter(OperandList));
+ bool Okay = OperandList.size() == 1;
+ if (!Okay)
+ Okay =
+ std::equal(OperandList.begin(), OperandList.end(), OperandList.begin());
+ if (Okay)
+ return singleReachablePHIPath(cast<MemoryAccess>(OperandList[0]), Second);
+ return false;
}
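The rewritten walk above follows the linear chain of defining accesses until it finds Second, hits liveOnEntry, or lands on a phi that it then recurses through. A stripped-down sketch of the chain walk; Defining is a hypothetical accessor standing in for the optimized def chain:

// Sketch: walk a singly-linked def chain from First looking for Second.
// Defining(Cur) returns the next defining access, or nullptr at the end.
template <typename AccessT, typename DefiningFn>
static bool reachesAlongChain(const AccessT *First, const AccessT *Second,
                              DefiningFn Defining) {
  for (const AccessT *Cur = First; Cur; Cur = Defining(Cur))
    if (Cur == Second)
      return true;
  return false; // chain ended (e.g. at liveOnEntry) without hitting Second
}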
// Verify that the memory equivalence table makes sense relative to the
// congruence classes. Note that this checking is not perfect, and is currently
-// subject to very rare false negatives. It is only useful for testing/debugging.
+// subject to very rare false negatives. It is only useful for
+// testing/debugging.
void NewGVN::verifyMemoryCongruency() const {
- // Anything equivalent in the memory access table should be in the same
+#ifndef NDEBUG
+ // Verify that the memory class table and the classes' memory member sets match.
+ for (const auto *CC : CongruenceClasses) {
+ if (CC == TOPClass || CC->isDead())
+ continue;
+ if (CC->getStoreCount() != 0) {
+ assert((CC->getStoredValue() || !isa<StoreInst>(CC->getLeader())) &&
+ "Any class with a store as a "
+ "leader should have a "
+ "representative stored value\n");
+ assert(CC->getMemoryLeader() &&
+ "Any congruence class with a store should "
+ "have a representative access\n");
+ }
+
+ if (CC->getMemoryLeader())
+ assert(MemoryAccessToClass.lookup(CC->getMemoryLeader()) == CC &&
+ "Representative MemoryAccess does not appear to be reverse "
+ "mapped properly");
+ for (auto M : CC->memory())
+ assert(MemoryAccessToClass.lookup(M) == CC &&
+ "Memory member does not appear to be reverse mapped properly");
+ }
+
+ // Anything equivalent in the MemoryAccess table should be in the same
// congruence class.
// Filter out the unreachable and trivially dead entries, because they may
// never have been updated if the instructions were not processed.
auto ReachableAccessPred =
- [&](const std::pair<const MemoryAccess *, MemoryAccess *> Pair) {
+ [&](const std::pair<const MemoryAccess *, CongruenceClass *> Pair) {
bool Result = ReachableBlocks.count(Pair.first->getBlock());
if (!Result)
return false;
+ if (MSSA->isLiveOnEntryDef(Pair.first))
+ return true;
if (auto *MemDef = dyn_cast<MemoryDef>(Pair.first))
return !isInstructionTriviallyDead(MemDef->getMemoryInst());
+ if (MemoryToDFSNum(Pair.first) == 0)
+ return false;
return true;
};
- auto Filtered = make_filter_range(MemoryAccessEquiv, ReachableAccessPred);
+ auto Filtered = make_filter_range(MemoryAccessToClass, ReachableAccessPred);
for (auto KV : Filtered) {
- assert(KV.first != KV.second &&
- "We added a useless equivalence to the memory equivalence table");
- // Unreachable instructions may not have changed because we never process
- // them.
- if (!ReachableBlocks.count(KV.first->getBlock()))
- continue;
+ assert(KV.second != TOPClass &&
+ "Memory not unreachable but ended up in TOP");
if (auto *FirstMUD = dyn_cast<MemoryUseOrDef>(KV.first)) {
- auto *SecondMUD = dyn_cast<MemoryUseOrDef>(KV.second);
+ auto *SecondMUD = dyn_cast<MemoryUseOrDef>(KV.second->getMemoryLeader());
if (FirstMUD && SecondMUD)
assert((singleReachablePHIPath(FirstMUD, SecondMUD) ||
- ValueToClass.lookup(FirstMUD->getMemoryInst()) ==
- ValueToClass.lookup(SecondMUD->getMemoryInst())) &&
- "The instructions for these memory operations should have "
- "been in the same congruence class or reachable through"
- "a single argument phi");
+ ValueToClass.lookup(FirstMUD->getMemoryInst()) ==
+ ValueToClass.lookup(SecondMUD->getMemoryInst())) &&
+ "The instructions for these memory operations should have "
+ "been in the same congruence class or reachable through"
+ "a single argument phi");
} else if (auto *FirstMP = dyn_cast<MemoryPhi>(KV.first)) {
-
// We can only sanely verify that MemoryDefs in the operand list all have
// the same class.
auto ReachableOperandPred = [&](const Use &U) {
- return ReachableBlocks.count(FirstMP->getIncomingBlock(U)) &&
+ return ReachableEdges.count(
+ {FirstMP->getIncomingBlock(U), FirstMP->getBlock()}) &&
isa<MemoryDef>(U);
};
@@ -1622,19 +2565,127 @@ void NewGVN::verifyMemoryCongruency() const {
"All MemoryPhi arguments should be in the same class");
}
}
+#endif
+}
+
+// Verify that the sparse propagation we did actually found the maximal
+// fixpoint. We do this by storing the value-to-class mapping, touching all
+// instructions, and redoing the iteration to see if anything changed.
+void NewGVN::verifyIterationSettled(Function &F) {
+#ifndef NDEBUG
+ DEBUG(dbgs() << "Beginning iteration verification\n");
+ if (DebugCounter::isCounterSet(VNCounter))
+ DebugCounter::setCounterValue(VNCounter, StartingVNCounter);
+
+ // Note that we have to store the actual classes, as we may change existing
+ // classes during iteration. This is because our memory iteration propagation
+ // is not perfect, and so may waste a little work. But it should generate
+ // exactly the same congruence classes we have now, with different IDs.
+ std::map<const Value *, CongruenceClass> BeforeIteration;
+
+ for (auto &KV : ValueToClass) {
+ if (auto *I = dyn_cast<Instruction>(KV.first))
+ // Skip unused/dead instructions.
+ if (InstrToDFSNum(I) == 0)
+ continue;
+ BeforeIteration.insert({KV.first, *KV.second});
+ }
+
+ TouchedInstructions.set();
+ TouchedInstructions.reset(0);
+ iterateTouchedInstructions();
+ DenseSet<std::pair<const CongruenceClass *, const CongruenceClass *>>
+ EqualClasses;
+ for (const auto &KV : ValueToClass) {
+ if (auto *I = dyn_cast<Instruction>(KV.first))
+ // Skip unused/dead instructions.
+ if (InstrToDFSNum(I) == 0)
+ continue;
+ // We could sink these uses, but I think this adds a bit of clarity here as
+ // to what we are comparing.
+ auto *BeforeCC = &BeforeIteration.find(KV.first)->second;
+ auto *AfterCC = KV.second;
+ // Note that the classes can't change at this point, so we memoize the pairs
+ // that are equal.
+ if (!EqualClasses.count({BeforeCC, AfterCC})) {
+ assert(BeforeCC->isEquivalentTo(AfterCC) &&
+ "Value number changed after main loop completed!");
+ EqualClasses.insert({BeforeCC, AfterCC});
+ }
+ }
+#endif
+}
+
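verifyIterationSettled above is the classic snapshot-and-rerun fixpoint check. A generic sketch of the idea, assuming a copyable state map and an Iterate step that should be a no-op at a true fixpoint (all names invented for illustration):

// Sketch: a fixpoint has settled iff re-running the propagation changes
// nothing. The real pass compares classes by equivalence, not identity.
#include <cassert>
#include <map>

template <typename K, typename V, typename IterateFn>
static void verifySettled(std::map<K, V> &State, IterateFn Iterate) {
  const std::map<K, V> Before = State; // snapshot the mapping
  Iterate();                           // redo one full propagation round
  assert(Before == State && "State changed after it claimed to settle");
}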
+// This is the main value numbering loop; it iterates over the initial touched
+// instruction set, propagating value numbers and marking things touched, until
+// the set of touched instructions is completely empty.
+void NewGVN::iterateTouchedInstructions() {
+ unsigned int Iterations = 0;
+ // Figure out where TouchedInstructions starts.
+ int FirstInstr = TouchedInstructions.find_first();
+ // Nothing set, nothing to iterate, just return.
+ if (FirstInstr == -1)
+ return;
+ BasicBlock *LastBlock = getBlockForValue(InstrFromDFSNum(FirstInstr));
+ while (TouchedInstructions.any()) {
+ ++Iterations;
+ // Walk through all the instructions in all the blocks in RPO.
+ // TODO: As we hit a new block, we should push and pop equalities into a
+ // table lookupOperandLeader can use, to catch things PredicateInfo
+ // might miss, like edge-only equivalences.
+ for (int InstrNum = TouchedInstructions.find_first(); InstrNum != -1;
+ InstrNum = TouchedInstructions.find_next(InstrNum)) {
+
+ // This instruction was found to be dead. We don't bother looking
+ // at it again.
+ if (InstrNum == 0) {
+ TouchedInstructions.reset(InstrNum);
+ continue;
+ }
+
+ Value *V = InstrFromDFSNum(InstrNum);
+ BasicBlock *CurrBlock = getBlockForValue(V);
+
+ // If we hit a new block, do reachability processing.
+ if (CurrBlock != LastBlock) {
+ LastBlock = CurrBlock;
+ bool BlockReachable = ReachableBlocks.count(CurrBlock);
+ const auto &CurrInstRange = BlockInstRange.lookup(CurrBlock);
+
+ // If it's not reachable, erase any touched instructions and move on.
+ if (!BlockReachable) {
+ TouchedInstructions.reset(CurrInstRange.first, CurrInstRange.second);
+ DEBUG(dbgs() << "Skipping instructions in block "
+ << getBlockName(CurrBlock)
+ << " because it is unreachable\n");
+ continue;
+ }
+ updateProcessedCount(CurrBlock);
+ }
+
+ if (auto *MP = dyn_cast<MemoryPhi>(V)) {
+ DEBUG(dbgs() << "Processing MemoryPhi " << *MP << "\n");
+ valueNumberMemoryPhi(MP);
+ } else if (auto *I = dyn_cast<Instruction>(V)) {
+ valueNumberInstruction(I);
+ } else {
+ llvm_unreachable("Should have been a MemoryPhi or Instruction");
+ }
+ updateProcessedCount(V);
+ // Reset after processing (because we may mark ourselves as touched when
+ // we propagate equalities).
+ TouchedInstructions.reset(InstrNum);
+ }
+ }
+ NumGVNMaxIterations = std::max(NumGVNMaxIterations.getValue(), Iterations);
}
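The loop above drains a bit-vector worklist in DFS-number order; processing one entry may re-set bits, including earlier ones, so the outer loop keeps sweeping until nothing is set. A simplified sketch of that shape, with Process standing in for the value numbering step:

// Sketch: sweep a bit-vector worklist until empty. Process(I) may set other
// bits, including ones at lower indices; those are caught on the next sweep.
#include <vector>

template <typename Fn>
static void drainWorklist(std::vector<bool> &Touched, Fn Process) {
  bool DidWork = true;
  while (DidWork) {
    DidWork = false;
    for (unsigned I = 0, E = Touched.size(); I != E; ++I) {
      if (!Touched[I])
        continue;
      Process(I);
      Touched[I] = false; // reset after processing, as in the pass
      DidWork = true;
    }
  }
}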
// This is the main transformation entry point.
-bool NewGVN::runGVN(Function &F, DominatorTree *_DT, AssumptionCache *_AC,
- TargetLibraryInfo *_TLI, AliasAnalysis *_AA,
- MemorySSA *_MSSA) {
+bool NewGVN::runGVN() {
+ if (DebugCounter::isCounterSet(VNCounter))
+ StartingVNCounter = DebugCounter::getCounterValue(VNCounter);
bool Changed = false;
- DT = _DT;
- AC = _AC;
- TLI = _TLI;
- AA = _AA;
- MSSA = _MSSA;
- DL = &F.getParent()->getDataLayout();
+ NumFuncArgs = F.arg_size();
MSSAWalker = MSSA->getWalker();
// Count number of instructions for sizing of hash tables, and come
@@ -1642,15 +2693,14 @@ bool NewGVN::runGVN(Function &F, DominatorTree *_DT, AssumptionCache *_AC,
unsigned ICount = 1;
// Add an empty instruction to account for the fact that we start at 1
DFSToInstr.emplace_back(nullptr);
- // Note: We want RPO traversal of the blocks, which is not quite the same as
- // dominator tree order, particularly with regard whether backedges get
- // visited first or second, given a block with multiple successors.
+ // Note: We want ideal RPO traversal of the blocks, which is not quite the
+ // same as dominator tree order, particularly with regard to whether backedges
+ // get visited first or second, given a block with multiple successors.
// If we visit in the wrong order, we will end up performing N times as many
// iterations.
// The dominator tree does guarantee that, for a given dom tree node, its
// parent must occur before it in the RPO ordering. Thus, we only need to sort
// the siblings.
- DenseMap<const DomTreeNode *, unsigned> RPOOrdering;
ReversePostOrderTraversal<Function *> RPOT(&F);
unsigned Counter = 0;
for (auto &B : RPOT) {
@@ -1663,7 +2713,7 @@ bool NewGVN::runGVN(Function &F, DominatorTree *_DT, AssumptionCache *_AC,
auto *Node = DT->getNode(B);
if (Node->getChildren().size() > 1)
std::sort(Node->begin(), Node->end(),
- [&RPOOrdering](const DomTreeNode *A, const DomTreeNode *B) {
+ [&](const DomTreeNode *A, const DomTreeNode *B) {
return RPOOrdering[A] < RPOOrdering[B];
});
}
@@ -1689,7 +2739,6 @@ bool NewGVN::runGVN(Function &F, DominatorTree *_DT, AssumptionCache *_AC,
}
TouchedInstructions.resize(ICount);
- DominatedInstRange.reserve(F.size());
// Ensure we don't end up resizing the expressionToClass map, as
// that can be quite expensive. At most, we have one expression per
// instruction.
@@ -1701,62 +2750,10 @@ bool NewGVN::runGVN(Function &F, DominatorTree *_DT, AssumptionCache *_AC,
ReachableBlocks.insert(&F.getEntryBlock());
initializeCongruenceClasses(F);
-
- unsigned int Iterations = 0;
- // We start out in the entry block.
- BasicBlock *LastBlock = &F.getEntryBlock();
- while (TouchedInstructions.any()) {
- ++Iterations;
- // Walk through all the instructions in all the blocks in RPO.
- for (int InstrNum = TouchedInstructions.find_first(); InstrNum != -1;
- InstrNum = TouchedInstructions.find_next(InstrNum)) {
- assert(InstrNum != 0 && "Bit 0 should never be set, something touched an "
- "instruction not in the lookup table");
- Value *V = DFSToInstr[InstrNum];
- BasicBlock *CurrBlock = nullptr;
-
- if (auto *I = dyn_cast<Instruction>(V))
- CurrBlock = I->getParent();
- else if (auto *MP = dyn_cast<MemoryPhi>(V))
- CurrBlock = MP->getBlock();
- else
- llvm_unreachable("DFSToInstr gave us an unknown type of instruction");
-
- // If we hit a new block, do reachability processing.
- if (CurrBlock != LastBlock) {
- LastBlock = CurrBlock;
- bool BlockReachable = ReachableBlocks.count(CurrBlock);
- const auto &CurrInstRange = BlockInstRange.lookup(CurrBlock);
-
- // If it's not reachable, erase any touched instructions and move on.
- if (!BlockReachable) {
- TouchedInstructions.reset(CurrInstRange.first, CurrInstRange.second);
- DEBUG(dbgs() << "Skipping instructions in block "
- << getBlockName(CurrBlock)
- << " because it is unreachable\n");
- continue;
- }
- updateProcessedCount(CurrBlock);
- }
-
- if (auto *MP = dyn_cast<MemoryPhi>(V)) {
- DEBUG(dbgs() << "Processing MemoryPhi " << *MP << "\n");
- valueNumberMemoryPhi(MP);
- } else if (auto *I = dyn_cast<Instruction>(V)) {
- valueNumberInstruction(I);
- } else {
- llvm_unreachable("Should have been a MemoryPhi or Instruction");
- }
- updateProcessedCount(V);
- // Reset after processing (because we may mark ourselves as touched when
- // we propagate equalities).
- TouchedInstructions.reset(InstrNum);
- }
- }
- NumGVNMaxIterations = std::max(NumGVNMaxIterations.getValue(), Iterations);
-#ifndef NDEBUG
+ iterateTouchedInstructions();
verifyMemoryCongruency();
-#endif
+ verifyIterationSettled(F);
+
Changed |= eliminateInstructions(F);
// Delete all instructions marked for deletion.
@@ -1783,36 +2780,6 @@ bool NewGVN::runGVN(Function &F, DominatorTree *_DT, AssumptionCache *_AC,
return Changed;
}
-bool NewGVN::runOnFunction(Function &F) {
- if (skipFunction(F))
- return false;
- return runGVN(F, &getAnalysis<DominatorTreeWrapperPass>().getDomTree(),
- &getAnalysis<AssumptionCacheTracker>().getAssumptionCache(F),
- &getAnalysis<TargetLibraryInfoWrapperPass>().getTLI(),
- &getAnalysis<AAResultsWrapperPass>().getAAResults(),
- &getAnalysis<MemorySSAWrapperPass>().getMSSA());
-}
-
-PreservedAnalyses NewGVNPass::run(Function &F, AnalysisManager<Function> &AM) {
- NewGVN Impl;
-
- // Apparently the order in which we get these results matter for
- // the old GVN (see Chandler's comment in GVN.cpp). I'll keep
- // the same order here, just in case.
- auto &AC = AM.getResult<AssumptionAnalysis>(F);
- auto &DT = AM.getResult<DominatorTreeAnalysis>(F);
- auto &TLI = AM.getResult<TargetLibraryAnalysis>(F);
- auto &AA = AM.getResult<AAManager>(F);
- auto &MSSA = AM.getResult<MemorySSAAnalysis>(F).getMSSA();
- bool Changed = Impl.runGVN(F, &DT, &AC, &TLI, &AA, &MSSA);
- if (!Changed)
- return PreservedAnalyses::all();
- PreservedAnalyses PA;
- PA.preserve<DominatorTreeAnalysis>();
- PA.preserve<GlobalsAA>();
- return PA;
-}
-
// Return true if V is a value that will always be available (i.e. can
// be placed anywhere) in the function. We don't do globals here
// because they are often worse to put in place.
@@ -1821,21 +2788,15 @@ static bool alwaysAvailable(Value *V) {
return isa<Constant>(V) || isa<Argument>(V);
}
-// Get the basic block from an instruction/value.
-static BasicBlock *getBlockForValue(Value *V) {
- if (auto *I = dyn_cast<Instruction>(V))
- return I->getParent();
- return nullptr;
-}
-
struct NewGVN::ValueDFS {
int DFSIn = 0;
int DFSOut = 0;
int LocalNum = 0;
- // Only one of these will be set.
- Value *Val = nullptr;
+ // Only one of Def and U will be set.
+ // The bool in the Def tells us whether the Def is the stored value of a
+ // store.
+ PointerIntPair<Value *, 1, bool> Def;
Use *U = nullptr;
-
bool operator<(const ValueDFS &Other) const {
// It's not enough that any given field be less than - we have sets
// of fields that need to be evaluated together to give a proper ordering.
@@ -1875,89 +2836,151 @@ struct NewGVN::ValueDFS {
// but .val and .u.
// It does not matter what order we replace these operands in.
// You will always end up with the same IR, and this is guaranteed.
- return std::tie(DFSIn, DFSOut, LocalNum, Val, U) <
- std::tie(Other.DFSIn, Other.DFSOut, Other.LocalNum, Other.Val,
+ return std::tie(DFSIn, DFSOut, LocalNum, Def, U) <
+ std::tie(Other.DFSIn, Other.DFSOut, Other.LocalNum, Other.Def,
Other.U);
}
};
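The (DFSIn, DFSOut) pair carried by ValueDFS encodes dominator-tree DFS intervals: A dominates B exactly when A's interval contains B's. That containment test is what the elimination stack's scope checks rely on; a standalone two-line sketch:

// Sketch: dominance as dom-tree DFS interval containment.
static bool dominatesByDFS(int AIn, int AOut, int BIn, int BOut) {
  return AIn <= BIn && AOut >= BOut; // A's interval contains B's
}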
-void NewGVN::convertDenseToDFSOrdered(
- CongruenceClass::MemberSet &Dense,
- SmallVectorImpl<ValueDFS> &DFSOrderedSet) {
+// This function converts the set of members for a congruence class from
+// values to sets of defs and uses with associated DFS info. The total number
+// of reachable uses for each value is stored in UseCounts, and instructions
+// that seem dead (have no non-dead uses) are stored in ProbablyDead.
+void NewGVN::convertClassToDFSOrdered(
+ const CongruenceClass &Dense, SmallVectorImpl<ValueDFS> &DFSOrderedSet,
+ DenseMap<const Value *, unsigned int> &UseCounts,
+ SmallPtrSetImpl<Instruction *> &ProbablyDead) const {
for (auto D : Dense) {
// First add the value.
BasicBlock *BB = getBlockForValue(D);
// Constants are handled prior to ever calling this function, so
// we should only be left with instructions as members.
assert(BB && "Should have figured out a basic block for value");
- ValueDFS VD;
-
- std::pair<int, int> DFSPair = DFSDomMap[BB];
- assert(DFSPair.first != -1 && DFSPair.second != -1 && "Invalid DFS Pair");
- VD.DFSIn = DFSPair.first;
- VD.DFSOut = DFSPair.second;
- VD.Val = D;
- // If it's an instruction, use the real local dfs number.
- if (auto *I = dyn_cast<Instruction>(D))
- VD.LocalNum = InstrDFS[I];
- else
- llvm_unreachable("Should have been an instruction");
-
- DFSOrderedSet.emplace_back(VD);
-
- // Now add the users.
- for (auto &U : D->uses()) {
+ ValueDFS VDDef;
+ DomTreeNode *DomNode = DT->getNode(BB);
+ VDDef.DFSIn = DomNode->getDFSNumIn();
+ VDDef.DFSOut = DomNode->getDFSNumOut();
+ // If it's a store, use the leader of the value operand if it's always
+ // available; otherwise use the value operand itself. TODO: We could do
+ // dominance checks to find a dominating leader, but it's not worth it ATM.
+ if (auto *SI = dyn_cast<StoreInst>(D)) {
+ auto Leader = lookupOperandLeader(SI->getValueOperand());
+ if (alwaysAvailable(Leader)) {
+ VDDef.Def.setPointer(Leader);
+ } else {
+ VDDef.Def.setPointer(SI->getValueOperand());
+ VDDef.Def.setInt(true);
+ }
+ } else {
+ VDDef.Def.setPointer(D);
+ }
+ assert(isa<Instruction>(D) &&
+ "The dense set member should always be an instruction");
+ VDDef.LocalNum = InstrToDFSNum(D);
+ DFSOrderedSet.emplace_back(VDDef);
+ Instruction *Def = cast<Instruction>(D);
+ unsigned int UseCount = 0;
+ // Now add the uses.
+ for (auto &U : Def->uses()) {
if (auto *I = dyn_cast<Instruction>(U.getUser())) {
- ValueDFS VD;
+ // Don't try to replace into dead uses
+ if (InstructionsToErase.count(I))
+ continue;
+ ValueDFS VDUse;
// Put the phi node uses in the incoming block.
BasicBlock *IBlock;
if (auto *P = dyn_cast<PHINode>(I)) {
IBlock = P->getIncomingBlock(U);
// Make phi node users appear last in the incoming block
// they are from.
- VD.LocalNum = InstrDFS.size() + 1;
+ VDUse.LocalNum = InstrDFS.size() + 1;
} else {
IBlock = I->getParent();
- VD.LocalNum = InstrDFS[I];
+ VDUse.LocalNum = InstrToDFSNum(I);
}
- std::pair<int, int> DFSPair = DFSDomMap[IBlock];
- VD.DFSIn = DFSPair.first;
- VD.DFSOut = DFSPair.second;
- VD.U = &U;
- DFSOrderedSet.emplace_back(VD);
+
+ // Skip uses in unreachable blocks, as we're going
+ // to delete them.
+ if (ReachableBlocks.count(IBlock) == 0)
+ continue;
+
+ DomTreeNode *DomNode = DT->getNode(IBlock);
+ VDUse.DFSIn = DomNode->getDFSNumIn();
+ VDUse.DFSOut = DomNode->getDFSNumOut();
+ VDUse.U = &U;
+ ++UseCount;
+ DFSOrderedSet.emplace_back(VDUse);
}
}
+
+ // If there are no uses, it's probably dead (but it may have side-effects,
+ // so it is not definitely dead). Otherwise, store the number of uses so we
+ // can track whether it becomes dead later.
+ if (UseCount == 0)
+ ProbablyDead.insert(Def);
+ else
+ UseCounts[Def] = UseCount;
}
}
-static void patchReplacementInstruction(Instruction *I, Value *Repl) {
- // Patch the replacement so that it is not more restrictive than the value
- // being replaced.
- auto *Op = dyn_cast<BinaryOperator>(I);
- auto *ReplOp = dyn_cast<BinaryOperator>(Repl);
+// This function converts the set of members for a congruence class from
+// values to the set of defs for loads and stores, with associated DFS info.
+void NewGVN::convertClassToLoadsAndStores(
+ const CongruenceClass &Dense,
+ SmallVectorImpl<ValueDFS> &LoadsAndStores) const {
+ for (auto D : Dense) {
+ if (!isa<LoadInst>(D) && !isa<StoreInst>(D))
+ continue;
- if (Op && ReplOp)
- ReplOp->andIRFlags(Op);
+ BasicBlock *BB = getBlockForValue(D);
+ ValueDFS VD;
+ DomTreeNode *DomNode = DT->getNode(BB);
+ VD.DFSIn = DomNode->getDFSNumIn();
+ VD.DFSOut = DomNode->getDFSNumOut();
+ VD.Def.setPointer(D);
- if (auto *ReplInst = dyn_cast<Instruction>(Repl)) {
- // FIXME: If both the original and replacement value are part of the
- // same control-flow region (meaning that the execution of one
- // guarentees the executation of the other), then we can combine the
- // noalias scopes here and do better than the general conservative
- // answer used in combineMetadata().
+ // If it's an instruction, use the real local dfs number.
+ if (auto *I = dyn_cast<Instruction>(D))
+ VD.LocalNum = InstrToDFSNum(I);
+ else
+ llvm_unreachable("Should have been an instruction");
- // In general, GVN unifies expressions over different control-flow
- // regions, and so we need a conservative combination of the noalias
- // scopes.
- unsigned KnownIDs[] = {
- LLVMContext::MD_tbaa, LLVMContext::MD_alias_scope,
- LLVMContext::MD_noalias, LLVMContext::MD_range,
- LLVMContext::MD_fpmath, LLVMContext::MD_invariant_load,
- LLVMContext::MD_invariant_group};
- combineMetadata(ReplInst, I, KnownIDs);
+ LoadsAndStores.emplace_back(VD);
}
}
+static void patchReplacementInstruction(Instruction *I, Value *Repl) {
+ auto *ReplInst = dyn_cast<Instruction>(Repl);
+ if (!ReplInst)
+ return;
+
+ // Patch the replacement so that it is not more restrictive than the value
+ // being replaced.
+ // Note that if 'I' is a load being replaced by some operation,
+ // for example, by an arithmetic operation, then andIRFlags()
+ // would just erase all math flags from the original arithmetic
+ // operation, which is clearly not wanted and not needed.
+ if (!isa<LoadInst>(I))
+ ReplInst->andIRFlags(I);
+
+ // FIXME: If both the original and replacement value are part of the
+ // same control-flow region (meaning that the execution of one
+ // guarantees the execution of the other), then we can combine the
+ // noalias scopes here and do better than the general conservative
+ // answer used in combineMetadata().
+
+ // In general, GVN unifies expressions over different control-flow
+ // regions, and so we need a conservative combination of the noalias
+ // scopes.
+ static const unsigned KnownIDs[] = {
+ LLVMContext::MD_tbaa, LLVMContext::MD_alias_scope,
+ LLVMContext::MD_noalias, LLVMContext::MD_range,
+ LLVMContext::MD_fpmath, LLVMContext::MD_invariant_load,
+ LLVMContext::MD_invariant_group};
+ combineMetadata(ReplInst, I, KnownIDs);
+}
+
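The andIRFlags call above intersects IR flags so the surviving instruction is no stronger than either original. A small worked example of why that matters, with made-up value names:

// Sketch: if %a = add nsw i32 %x, %y replaces %b = add i32 %x, %y, the nsw
// flag must be dropped; %b's users never relied on no-signed-wrap, and
// keeping it could wrongly introduce poison at those uses.
#include "llvm/IR/Instruction.h"
using namespace llvm;

static void weakenReplacement(Instruction *Repl, const Instruction *Orig) {
  Repl->andIRFlags(Orig); // keep only the flags common to both instructions
}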
static void patchAndReplaceAllUsesWith(Instruction *I, Value *Repl) {
patchReplacementInstruction(I, Repl);
I->replaceAllUsesWith(Repl);
@@ -1967,10 +2990,6 @@ void NewGVN::deleteInstructionsInBlock(BasicBlock *BB) {
DEBUG(dbgs() << " BasicBlock Dead:" << *BB);
++NumGVNBlocksDeleted;
- // Check to see if there are non-terminating instructions to delete.
- if (isa<TerminatorInst>(BB->begin()))
- return;
-
// Delete the instructions backwards, as this reduces the number of def-use
// and use-def chain updates needed. Start after the terminator.
auto StartPoint = BB->rbegin();
@@ -1987,6 +3006,11 @@ void NewGVN::deleteInstructionsInBlock(BasicBlock *BB) {
Inst.eraseFromParent();
++NumGVNInstrDeleted;
}
+ // Now insert something that simplifycfg will turn into an unreachable.
+ Type *Int8Ty = Type::getInt8Ty(BB->getContext());
+ new StoreInst(UndefValue::get(Int8Ty),
+ Constant::getNullValue(Int8Ty->getPointerTo()),
+ BB->getTerminator());
}
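The trailing store above writes through a null pointer, which is immediate undefined behavior; SimplifyCFG recognizes that and folds the block tail into unreachable. The same marker in isolation, as a sketch (the helper name is invented):

// Sketch: plant a store-to-null so later passes rewrite this point as
// unreachable. Mirrors the insertion in deleteInstructionsInBlock above.
#include "llvm/IR/Constants.h"
#include "llvm/IR/Instructions.h"
#include "llvm/IR/Type.h"
using namespace llvm;

static void markUnreachableBefore(Instruction *InsertPt) {
  Type *Int8Ty = Type::getInt8Ty(InsertPt->getContext());
  new StoreInst(UndefValue::get(Int8Ty),
                Constant::getNullValue(Int8Ty->getPointerTo()), InsertPt);
}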
void NewGVN::markInstructionForDeletion(Instruction *I) {
@@ -2086,59 +3110,59 @@ bool NewGVN::eliminateInstructions(Function &F) {
}
}
}
- DomTreeNode *Node = DT->getNode(&B);
- if (Node)
- DFSDomMap[&B] = {Node->getDFSNumIn(), Node->getDFSNumOut()};
}
- for (CongruenceClass *CC : CongruenceClasses) {
- // FIXME: We should eventually be able to replace everything still
- // in the initial class with undef, as they should be unreachable.
- // Right now, initial still contains some things we skip value
- // numbering of (UNREACHABLE's, for example).
- if (CC == InitialClass || CC->Dead)
+ // Map to store the use counts
+ DenseMap<const Value *, unsigned int> UseCounts;
+ for (CongruenceClass *CC : reverse(CongruenceClasses)) {
+ // Track the equivalent store info so we can decide whether to try
+ // dead store elimination.
+ SmallVector<ValueDFS, 8> PossibleDeadStores;
+ SmallPtrSet<Instruction *, 8> ProbablyDead;
+ if (CC->isDead() || CC->empty())
continue;
- assert(CC->RepLeader && "We should have had a leader");
+ // Everything still in the TOP class is unreachable or dead.
+ if (CC == TOPClass) {
+#ifndef NDEBUG
+ for (auto M : *CC)
+ assert((!ReachableBlocks.count(cast<Instruction>(M)->getParent()) ||
+ InstructionsToErase.count(cast<Instruction>(M))) &&
+ "Everything in TOP should be unreachable or dead at this "
+ "point");
+#endif
+ continue;
+ }
+ assert(CC->getLeader() && "We should have had a leader");
// If this is a leader that is always available, and it's a
// constant or has no equivalences, just replace everything with
// it. We then update the congruence class with whatever members
// are left.
- if (alwaysAvailable(CC->RepLeader)) {
- SmallPtrSet<Value *, 4> MembersLeft;
- for (auto M : CC->Members) {
-
+ Value *Leader =
+ CC->getStoredValue() ? CC->getStoredValue() : CC->getLeader();
+ if (alwaysAvailable(Leader)) {
+ CongruenceClass::MemberSet MembersLeft;
+ for (auto M : *CC) {
Value *Member = M;
-
// Void things have no uses we can replace.
- if (Member == CC->RepLeader || Member->getType()->isVoidTy()) {
+ if (Member == Leader || !isa<Instruction>(Member) ||
+ Member->getType()->isVoidTy()) {
MembersLeft.insert(Member);
continue;
}
-
- DEBUG(dbgs() << "Found replacement " << *(CC->RepLeader) << " for "
- << *Member << "\n");
- // Due to equality propagation, these may not always be
- // instructions, they may be real values. We don't really
- // care about trying to replace the non-instructions.
- if (auto *I = dyn_cast<Instruction>(Member)) {
- assert(CC->RepLeader != I &&
- "About to accidentally remove our leader");
- replaceInstruction(I, CC->RepLeader);
- AnythingReplaced = true;
-
- continue;
- } else {
- MembersLeft.insert(I);
- }
+ DEBUG(dbgs() << "Found replacement " << *(Leader) << " for " << *Member
+ << "\n");
+ auto *I = cast<Instruction>(Member);
+ assert(Leader != I && "About to accidentally remove our leader");
+ replaceInstruction(I, Leader);
+ AnythingReplaced = true;
}
- CC->Members.swap(MembersLeft);
-
+ CC->swap(MembersLeft);
} else {
- DEBUG(dbgs() << "Eliminating in congruence class " << CC->ID << "\n");
+ DEBUG(dbgs() << "Eliminating in congruence class " << CC->getID()
+ << "\n");
// If this is a singleton, we can skip it.
- if (CC->Members.size() != 1) {
-
+ if (CC->size() != 1) {
// This is a stack because equality replacement/etc may place
// constants in the middle of the member list, and we want to use
// those constant values in preference to the current leader, over
@@ -2147,24 +3171,19 @@ bool NewGVN::eliminateInstructions(Function &F) {
// Convert the members to DFS ordered sets and then merge them.
SmallVector<ValueDFS, 8> DFSOrderedSet;
- convertDenseToDFSOrdered(CC->Members, DFSOrderedSet);
+ convertClassToDFSOrdered(*CC, DFSOrderedSet, UseCounts, ProbablyDead);
// Sort the whole thing.
std::sort(DFSOrderedSet.begin(), DFSOrderedSet.end());
-
for (auto &VD : DFSOrderedSet) {
int MemberDFSIn = VD.DFSIn;
int MemberDFSOut = VD.DFSOut;
- Value *Member = VD.Val;
- Use *MemberUse = VD.U;
-
- if (Member) {
- // We ignore void things because we can't get a value from them.
- // FIXME: We could actually use this to kill dead stores that are
- // dominated by equivalent earlier stores.
- if (Member->getType()->isVoidTy())
- continue;
- }
+ Value *Def = VD.Def.getPointer();
+ bool FromStore = VD.Def.getInt();
+ Use *U = VD.U;
+ // We ignore void things because we can't get a value from them.
+ if (Def && Def->getType()->isVoidTy())
+ continue;
if (EliminationStack.empty()) {
DEBUG(dbgs() << "Elimination Stack is empty\n");
@@ -2189,69 +3208,240 @@ bool NewGVN::eliminateInstructions(Function &F) {
// start using, we also push.
// Otherwise, we walk along, processing members who are
// dominated by this scope, and eliminate them.
- bool ShouldPush =
- Member && (EliminationStack.empty() || isa<Constant>(Member));
+ bool ShouldPush = Def && EliminationStack.empty();
bool OutOfScope =
!EliminationStack.isInScope(MemberDFSIn, MemberDFSOut);
if (OutOfScope || ShouldPush) {
// Sync to our current scope.
EliminationStack.popUntilDFSScope(MemberDFSIn, MemberDFSOut);
- ShouldPush |= Member && EliminationStack.empty();
+ bool ShouldPush = Def && EliminationStack.empty();
if (ShouldPush) {
- EliminationStack.push_back(Member, MemberDFSIn, MemberDFSOut);
+ EliminationStack.push_back(Def, MemberDFSIn, MemberDFSOut);
+ }
+ }
+
+ // Skip the Defs; we only want to eliminate on their uses, but mark
+ // dominated defs as dead.
+ if (Def) {
+ // For anything in this case, what and how we value number
+ // guarantees that any side-effects that would have occurred (i.e.
+ // throwing, etc.) can be proven to either still occur (because it's
+ // dominated by something that has the same side-effects), or never
+ // occur. Otherwise, we would not have been able to prove it value
+ // equivalent to something else. For these things, we can just mark
+ // it all dead. Note that this is different from the "ProbablyDead"
+ // set, whose members may not be dominated by anything and thus are
+ // only easy to prove dead if they are also side-effect free. Note that
+ // because stores are put in terms of the stored value, we skip
+ // stored values here. If the stored value is really dead, it will
+ // still be marked for deletion when we process it in its own class.
+ if (!EliminationStack.empty() && Def != EliminationStack.back() &&
+ isa<Instruction>(Def) && !FromStore)
+ markInstructionForDeletion(cast<Instruction>(Def));
+ continue;
+ }
+ // At this point, we know it is a Use we are trying to possibly
+ // replace.
+
+ assert(isa<Instruction>(U->get()) &&
+ "Current def should have been an instruction");
+ assert(isa<Instruction>(U->getUser()) &&
+ "Current user should have been an instruction");
+
+ // If the thing we are replacing into is already marked to be dead,
+ // this use is dead. Note that this is true regardless of whether
+ // we have anything dominating the use or not. We do this here
+ // because we are already walking all the uses anyway.
+ Instruction *InstUse = cast<Instruction>(U->getUser());
+ if (InstructionsToErase.count(InstUse)) {
+ auto &UseCount = UseCounts[U->get()];
+ if (--UseCount == 0) {
+ ProbablyDead.insert(cast<Instruction>(U->get()));
}
}
// If we get to this point and the stack is empty, we must have a use
- // with nothing we can use to eliminate it, just skip it.
+ // with nothing we can use to eliminate this use, so just skip it.
if (EliminationStack.empty())
continue;
- // Skip the Value's, we only want to eliminate on their uses.
- if (Member)
- continue;
- Value *Result = EliminationStack.back();
+ Value *DominatingLeader = EliminationStack.back();
// Don't replace our existing users with ourselves.
- if (MemberUse->get() == Result)
+ if (U->get() == DominatingLeader)
continue;
-
- DEBUG(dbgs() << "Found replacement " << *Result << " for "
- << *MemberUse->get() << " in " << *(MemberUse->getUser())
- << "\n");
+ DEBUG(dbgs() << "Found replacement " << *DominatingLeader << " for "
+ << *U->get() << " in " << *(U->getUser()) << "\n");
// If we replaced something in an instruction, handle the patching of
- // metadata.
- if (auto *ReplacedInst = dyn_cast<Instruction>(MemberUse->get()))
- patchReplacementInstruction(ReplacedInst, Result);
-
- assert(isa<Instruction>(MemberUse->getUser()));
- MemberUse->set(Result);
+ // metadata. Skip this if we are replacing predicateinfo with its
+ // original operand, as we already know we can just drop it.
+ auto *ReplacedInst = cast<Instruction>(U->get());
+ auto *PI = PredInfo->getPredicateInfoFor(ReplacedInst);
+ if (!PI || DominatingLeader != PI->OriginalOp)
+ patchReplacementInstruction(ReplacedInst, DominatingLeader);
+ U->set(DominatingLeader);
+ // This is now a use of the dominating leader, which means if the
+ // dominating leader was dead, it's now live!
+ auto &LeaderUseCount = UseCounts[DominatingLeader];
+ // It's about to be alive again.
+ if (LeaderUseCount == 0 && isa<Instruction>(DominatingLeader))
+ ProbablyDead.erase(cast<Instruction>(DominatingLeader));
+ ++LeaderUseCount;
AnythingReplaced = true;
}
}
}
+ // At this point, anything still in the ProbablyDead set is actually dead
+ // if it would be trivially dead.
+ for (auto *I : ProbablyDead)
+ if (wouldInstructionBeTriviallyDead(I))
+ markInstructionForDeletion(I);
+
// Cleanup the congruence class.
- SmallPtrSet<Value *, 4> MembersLeft;
- for (Value *Member : CC->Members) {
- if (Member->getType()->isVoidTy()) {
+ CongruenceClass::MemberSet MembersLeft;
+ for (auto *Member : *CC)
+ if (!isa<Instruction>(Member) ||
+ !InstructionsToErase.count(cast<Instruction>(Member)))
MembersLeft.insert(Member);
- continue;
- }
-
- if (auto *MemberInst = dyn_cast<Instruction>(Member)) {
- if (isInstructionTriviallyDead(MemberInst)) {
- // TODO: Don't mark loads of undefs.
- markInstructionForDeletion(MemberInst);
- continue;
+ CC->swap(MembersLeft);
+
+ // If we have possible dead stores to look at, try to eliminate them.
+ if (CC->getStoreCount() > 0) {
+ convertClassToLoadsAndStores(*CC, PossibleDeadStores);
+ std::sort(PossibleDeadStores.begin(), PossibleDeadStores.end());
+ ValueDFSStack EliminationStack;
+ for (auto &VD : PossibleDeadStores) {
+ int MemberDFSIn = VD.DFSIn;
+ int MemberDFSOut = VD.DFSOut;
+ Instruction *Member = cast<Instruction>(VD.Def.getPointer());
+ if (EliminationStack.empty() ||
+ !EliminationStack.isInScope(MemberDFSIn, MemberDFSOut)) {
+ // Sync to our current scope.
+ EliminationStack.popUntilDFSScope(MemberDFSIn, MemberDFSOut);
+ if (EliminationStack.empty()) {
+ EliminationStack.push_back(Member, MemberDFSIn, MemberDFSOut);
+ continue;
+ }
}
+ // We already did load elimination, so nothing to do here.
+ if (isa<LoadInst>(Member))
+ continue;
+ assert(!EliminationStack.empty());
+ Instruction *Leader = cast<Instruction>(EliminationStack.back());
+ (void)Leader;
+ assert(DT->dominates(Leader->getParent(), Member->getParent()));
+ // Member is dominated by Leader, and thus dead.
+ DEBUG(dbgs() << "Marking dead store " << *Member
+ << " that is dominated by " << *Leader << "\n");
+ markInstructionForDeletion(Member);
+ CC->erase(Member);
+ ++NumGVNDeadStores;
}
- MembersLeft.insert(Member);
}
- CC->Members.swap(MembersLeft);
}
return AnythingReplaced;
}
+
+// This function provides a global ranking of operations so that we can place
+// them in a canonical order. Note that rank alone is not necessarily enough
+// for a complete ordering, as constants all have the same rank. However, an
+// operation with all-constant operands will generally have been simplified
+// away, so the order of constants rarely matters.
+unsigned int NewGVN::getRank(const Value *V) const {
+ // Prefer undef to anything else
+ if (isa<UndefValue>(V))
+ return 0;
+ if (isa<Constant>(V))
+ return 1;
+ else if (auto *A = dyn_cast<Argument>(V))
+ return 2 + A->getArgNo();
+
+ // Need to shift the instruction DFS by the number of arguments + 3 to
+ // account for the constant and argument ranking above.
+ unsigned Result = InstrToDFSNum(V);
+ if (Result > 0)
+ return 3 + NumFuncArgs + Result;
+ // Unreachable or something else, just return a really large number.
+ return ~0;
+}
+
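To make the tiers concrete, here is a self-contained toy version of the same ranking (undef, then constants, then arguments by position, then instructions by DFS number, then everything else); illustrative only, not the pass's code:

// Sketch of the rank tiers used by getRank above.
enum class Kind { Undef, Constant, Argument, Instruction, Unknown };

static unsigned toyRank(Kind K, unsigned ArgNo, unsigned DFSNum,
                        unsigned NumArgs) {
  switch (K) {
  case Kind::Undef:       return 0;                    // most preferred
  case Kind::Constant:    return 1;
  case Kind::Argument:    return 2 + ArgNo;            // ordered by position
  case Kind::Instruction: return 3 + NumArgs + DFSNum; // ordered by DFS
  case Kind::Unknown:     break;
  }
  return ~0u; // unreachable or unknown values sort last
}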
+// This function decides whether two operands of a commutative operation
+// should have their order swapped when canonicalizing.
+bool NewGVN::shouldSwapOperands(const Value *A, const Value *B) const {
+ // Because we only care about a total ordering, and don't rewrite expressions
+ // in this order, we order by rank, which will give a strict weak ordering to
+ // everything but constants, and then we order by pointer address.
+ return std::make_pair(getRank(A), A) > std::make_pair(getRank(B), B);
+}
+
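A typical use of the comparison above is canonicalizing the operands of a commutative expression before hashing, so that add(a, b) and add(b, a) produce the same key. A sketch under that assumption; swapIfNeeded is hypothetical:

// Sketch: put the lower-ranked (more canonical) operand first.
#include <utility>

template <typename V, typename RankFn>
static void swapIfNeeded(V *&A, V *&B, RankFn Rank) {
  if (std::make_pair(Rank(A), A) > std::make_pair(Rank(B), B))
    std::swap(A, B);
}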
+class NewGVNLegacyPass : public FunctionPass {
+public:
+ static char ID; // Pass identification, replacement for typeid.
+ NewGVNLegacyPass() : FunctionPass(ID) {
+ initializeNewGVNLegacyPassPass(*PassRegistry::getPassRegistry());
+ }
+ bool runOnFunction(Function &F) override;
+
+private:
+ void getAnalysisUsage(AnalysisUsage &AU) const override {
+ AU.addRequired<AssumptionCacheTracker>();
+ AU.addRequired<DominatorTreeWrapperPass>();
+ AU.addRequired<TargetLibraryInfoWrapperPass>();
+ AU.addRequired<MemorySSAWrapperPass>();
+ AU.addRequired<AAResultsWrapperPass>();
+ AU.addPreserved<DominatorTreeWrapperPass>();
+ AU.addPreserved<GlobalsAAWrapperPass>();
+ }
+};
+
+bool NewGVNLegacyPass::runOnFunction(Function &F) {
+ if (skipFunction(F))
+ return false;
+ return NewGVN(F, &getAnalysis<DominatorTreeWrapperPass>().getDomTree(),
+ &getAnalysis<AssumptionCacheTracker>().getAssumptionCache(F),
+ &getAnalysis<TargetLibraryInfoWrapperPass>().getTLI(),
+ &getAnalysis<AAResultsWrapperPass>().getAAResults(),
+ &getAnalysis<MemorySSAWrapperPass>().getMSSA(),
+ F.getParent()->getDataLayout())
+ .runGVN();
+}
+
+INITIALIZE_PASS_BEGIN(NewGVNLegacyPass, "newgvn", "Global Value Numbering",
+ false, false)
+INITIALIZE_PASS_DEPENDENCY(AssumptionCacheTracker)
+INITIALIZE_PASS_DEPENDENCY(MemorySSAWrapperPass)
+INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass)
+INITIALIZE_PASS_DEPENDENCY(TargetLibraryInfoWrapperPass)
+INITIALIZE_PASS_DEPENDENCY(AAResultsWrapperPass)
+INITIALIZE_PASS_DEPENDENCY(GlobalsAAWrapperPass)
+INITIALIZE_PASS_END(NewGVNLegacyPass, "newgvn", "Global Value Numbering", false,
+ false)
+
+char NewGVNLegacyPass::ID = 0;
+
+// createNewGVNPass - The public interface to this file.
+FunctionPass *llvm::createNewGVNPass() { return new NewGVNLegacyPass(); }
+
+PreservedAnalyses NewGVNPass::run(Function &F, AnalysisManager<Function> &AM) {
+ // Apparently the order in which we get these results matters for
+ // the old GVN (see Chandler's comment in GVN.cpp). I'll keep
+ // the same order here, just in case.
+ auto &AC = AM.getResult<AssumptionAnalysis>(F);
+ auto &DT = AM.getResult<DominatorTreeAnalysis>(F);
+ auto &TLI = AM.getResult<TargetLibraryAnalysis>(F);
+ auto &AA = AM.getResult<AAManager>(F);
+ auto &MSSA = AM.getResult<MemorySSAAnalysis>(F).getMSSA();
+ bool Changed =
+ NewGVN(F, &DT, &AC, &TLI, &AA, &MSSA, F.getParent()->getDataLayout())
+ .runGVN();
+ if (!Changed)
+ return PreservedAnalyses::all();
+ PreservedAnalyses PA;
+ PA.preserve<DominatorTreeAnalysis>();
+ PA.preserve<GlobalsAA>();
+ return PA;
+}
diff --git a/lib/Transforms/Scalar/PartiallyInlineLibCalls.cpp b/lib/Transforms/Scalar/PartiallyInlineLibCalls.cpp
index 1a7ddc9585ba..1bfecea2f61e 100644
--- a/lib/Transforms/Scalar/PartiallyInlineLibCalls.cpp
+++ b/lib/Transforms/Scalar/PartiallyInlineLibCalls.cpp
@@ -66,7 +66,7 @@ static bool optimizeSQRT(CallInst *Call, Function *CalledFunc,
// Add attribute "readnone" so that backend can use a native sqrt instruction
// for this call. Insert a FP compare instruction and a conditional branch
// at the end of CurrBB.
- Call->addAttribute(AttributeSet::FunctionIndex, Attribute::ReadNone);
+ Call->addAttribute(AttributeList::FunctionIndex, Attribute::ReadNone);
CurrBB.getTerminator()->eraseFromParent();
Builder.SetInsertPoint(&CurrBB);
Value *FCmp = Builder.CreateFCmpOEQ(Call, Call);
@@ -98,14 +98,14 @@ static bool runPartiallyInlineLibCalls(Function &F, TargetLibraryInfo *TLI,
// Skip if the function either has local linkage or is not a known library
// function.
- LibFunc::Func LibFunc;
+ LibFunc LF;
if (CalledFunc->hasLocalLinkage() || !CalledFunc->hasName() ||
- !TLI->getLibFunc(CalledFunc->getName(), LibFunc))
+ !TLI->getLibFunc(CalledFunc->getName(), LF))
continue;
- switch (LibFunc) {
- case LibFunc::sqrtf:
- case LibFunc::sqrt:
+ switch (LF) {
+ case LibFunc_sqrtf:
+ case LibFunc_sqrt:
if (TTI->haveFastSqrt(Call->getType()) &&
optimizeSQRT(Call, CalledFunc, *CurrBB, BB))
break;
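For context on the rename in this hunk: the LibFunc identifiers moved from LibFunc::sqrt-style scoped enumerators to plain LibFunc_sqrt values, and LibFunc itself now names the enum type. A minimal sketch of the updated lookup; the helper is invented for illustration:

// Sketch: classify a call target with the renamed LibFunc API.
#include "llvm/Analysis/TargetLibraryInfo.h"
#include "llvm/IR/Function.h"
using namespace llvm;

static bool isSqrtLibCall(const TargetLibraryInfo &TLI, const Function &F) {
  LibFunc LF;
  if (F.hasLocalLinkage() || !F.hasName() || !TLI.getLibFunc(F.getName(), LF))
    return false;
  return LF == LibFunc_sqrt || LF == LibFunc_sqrtf;
}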
diff --git a/lib/Transforms/Scalar/Reassociate.cpp b/lib/Transforms/Scalar/Reassociate.cpp
index 65c814d7a63b..3dcab6090789 100644
--- a/lib/Transforms/Scalar/Reassociate.cpp
+++ b/lib/Transforms/Scalar/Reassociate.cpp
@@ -1069,8 +1069,7 @@ Value *ReassociatePass::RemoveFactorFromExpression(Value *V, Value *Factor) {
///
/// Ops is the top-level list of add operands we're trying to factor.
static void FindSingleUseMultiplyFactors(Value *V,
- SmallVectorImpl<Value*> &Factors,
- const SmallVectorImpl<ValueEntry> &Ops) {
+ SmallVectorImpl<Value*> &Factors) {
BinaryOperator *BO = isReassociableOp(V, Instruction::Mul, Instruction::FMul);
if (!BO) {
Factors.push_back(V);
@@ -1078,8 +1077,8 @@ static void FindSingleUseMultiplyFactors(Value *V,
}
// Otherwise, add the LHS and RHS to the list of factors.
- FindSingleUseMultiplyFactors(BO->getOperand(1), Factors, Ops);
- FindSingleUseMultiplyFactors(BO->getOperand(0), Factors, Ops);
+ FindSingleUseMultiplyFactors(BO->getOperand(1), Factors);
+ FindSingleUseMultiplyFactors(BO->getOperand(0), Factors);
}
/// Optimize a series of operands to an 'and', 'or', or 'xor' instruction.
@@ -1499,7 +1498,7 @@ Value *ReassociatePass::OptimizeAdd(Instruction *I,
// Compute all of the factors of this added value.
SmallVector<Value*, 8> Factors;
- FindSingleUseMultiplyFactors(BOp, Factors, Ops);
+ FindSingleUseMultiplyFactors(BOp, Factors);
assert(Factors.size() > 1 && "Bad linearize!");
// Add one to FactorOccurrences for each unique factor in this op.
@@ -2236,8 +2235,8 @@ PreservedAnalyses ReassociatePass::run(Function &F, FunctionAnalysisManager &) {
ValueRankMap.clear();
if (MadeChange) {
- // FIXME: This should also 'preserve the CFG'.
- auto PA = PreservedAnalyses();
+ PreservedAnalyses PA;
+ PA.preserveSet<CFGAnalyses>();
PA.preserve<GlobalsAA>();
return PA;
}
diff --git a/lib/Transforms/Scalar/RewriteStatepointsForGC.cpp b/lib/Transforms/Scalar/RewriteStatepointsForGC.cpp
index 1de742050cb3..f344eb151464 100644
--- a/lib/Transforms/Scalar/RewriteStatepointsForGC.cpp
+++ b/lib/Transforms/Scalar/RewriteStatepointsForGC.cpp
@@ -365,6 +365,11 @@ findBaseDefiningValueOfVector(Value *I) {
// for particular shufflevector patterns.
return BaseDefiningValueResult(I, false);
+ // The behavior of getelementptr instructions is the same for vector and
+ // non-vector data types.
+ if (auto *GEP = dyn_cast<GetElementPtrInst>(I))
+ return findBaseDefiningValue(GEP->getPointerOperand());
+
// A PHI or Select is a base defining value. The outer findBasePointer
// algorithm is responsible for constructing a base value for this BDV.
assert((isa<SelectInst>(I) || isa<PHINode>(I)) &&
@@ -634,7 +639,7 @@ static BDVState meetBDVStateImpl(const BDVState &LHS, const BDVState &RHS) {
// Values of type BDVState form a lattice, and this function implements the meet
// operation.
-static BDVState meetBDVState(BDVState LHS, BDVState RHS) {
+static BDVState meetBDVState(const BDVState &LHS, const BDVState &RHS) {
BDVState Result = meetBDVStateImpl(LHS, RHS);
assert(Result == meetBDVStateImpl(RHS, LHS) &&
"Math is wrong: meet does not commute!");
@@ -1123,14 +1128,14 @@ normalizeForInvokeSafepoint(BasicBlock *BB, BasicBlock *InvokeParent,
// Create a new attribute set containing only attributes which can be
// transferred from the original call to the safepoint.
-static AttributeSet legalizeCallAttributes(AttributeSet AS) {
- AttributeSet Ret;
+static AttributeList legalizeCallAttributes(AttributeList AS) {
+ AttributeList Ret;
for (unsigned Slot = 0; Slot < AS.getNumSlots(); Slot++) {
unsigned Index = AS.getSlotIndex(Slot);
- if (Index == AttributeSet::ReturnIndex ||
- Index == AttributeSet::FunctionIndex) {
+ if (Index == AttributeList::ReturnIndex ||
+ Index == AttributeList::FunctionIndex) {
for (Attribute Attr : make_range(AS.begin(Slot), AS.end(Slot))) {
@@ -1148,7 +1153,7 @@ static AttributeSet legalizeCallAttributes(AttributeSet AS) {
Ret = Ret.addAttributes(
AS.getContext(), Index,
- AttributeSet::get(AS.getContext(), Index, AttrBuilder(Attr)));
+ AttributeList::get(AS.getContext(), Index, AttrBuilder(Attr)));
}
}
@@ -1299,12 +1304,11 @@ static StringRef getDeoptLowering(CallSite CS) {
const char *DeoptLowering = "deopt-lowering";
if (CS.hasFnAttr(DeoptLowering)) {
// FIXME: CallSite has a *really* confusing interface around attributes
- // with values.
- const AttributeSet &CSAS = CS.getAttributes();
- if (CSAS.hasAttribute(AttributeSet::FunctionIndex,
- DeoptLowering))
- return CSAS.getAttribute(AttributeSet::FunctionIndex,
- DeoptLowering).getValueAsString();
+ // with values.
+ const AttributeList &CSAS = CS.getAttributes();
+ if (CSAS.hasAttribute(AttributeList::FunctionIndex, DeoptLowering))
+ return CSAS.getAttribute(AttributeList::FunctionIndex, DeoptLowering)
+ .getValueAsString();
Function *F = CS.getCalledFunction();
assert(F && F->hasFnAttribute(DeoptLowering));
return F->getFnAttribute(DeoptLowering).getValueAsString();
@@ -1388,7 +1392,6 @@ makeStatepointExplicitImpl(const CallSite CS, /* to replace */
// Create the statepoint given all the arguments
Instruction *Token = nullptr;
- AttributeSet ReturnAttrs;
if (CS.isCall()) {
CallInst *ToReplace = cast<CallInst>(CS.getInstruction());
CallInst *Call = Builder.CreateGCStatepointCall(
@@ -1400,11 +1403,12 @@ makeStatepointExplicitImpl(const CallSite CS, /* to replace */
// Currently we will fail on parameter attributes and on certain
// function attributes.
- AttributeSet NewAttrs = legalizeCallAttributes(ToReplace->getAttributes());
+ AttributeList NewAttrs = legalizeCallAttributes(ToReplace->getAttributes());
// If we can handle this set of attributes, set up function attrs directly
// on the statepoint and return attrs later for the gc_result intrinsic.
- Call->setAttributes(NewAttrs.getFnAttributes());
- ReturnAttrs = NewAttrs.getRetAttributes();
+ Call->setAttributes(AttributeList::get(Call->getContext(),
+ AttributeList::FunctionIndex,
+ NewAttrs.getFnAttributes()));
Token = Call;
@@ -1428,11 +1432,12 @@ makeStatepointExplicitImpl(const CallSite CS, /* to replace */
// Currently we will fail on parameter attributes and on certain
// function attributes.
- AttributeSet NewAttrs = legalizeCallAttributes(ToReplace->getAttributes());
+ AttributeList NewAttrs = legalizeCallAttributes(ToReplace->getAttributes());
// If we can handle this set of attributes, set up function attrs directly
// on the statepoint and return attrs later for the gc_result intrinsic.
- Invoke->setAttributes(NewAttrs.getFnAttributes());
- ReturnAttrs = NewAttrs.getRetAttributes();
+ Invoke->setAttributes(AttributeList::get(Invoke->getContext(),
+ AttributeList::FunctionIndex,
+ NewAttrs.getFnAttributes()));
Token = Invoke;
@@ -1478,7 +1483,9 @@ makeStatepointExplicitImpl(const CallSite CS, /* to replace */
StringRef Name =
CS.getInstruction()->hasName() ? CS.getInstruction()->getName() : "";
CallInst *GCResult = Builder.CreateGCResult(Token, CS.getType(), Name);
- GCResult->setAttributes(CS.getAttributes().getRetAttributes());
+ GCResult->setAttributes(
+ AttributeList::get(GCResult->getContext(), AttributeList::ReturnIndex,
+ CS.getAttributes().getRetAttributes()));
// We cannot RAUW or delete CS.getInstruction() because it could be in the
// live set of some other safepoint, in which case that safepoint's
@@ -1615,8 +1622,10 @@ static void relocationViaAlloca(
// Emit alloca for "LiveValue" and record it in "allocaMap" and
// "PromotableAllocas"
+ const DataLayout &DL = F.getParent()->getDataLayout();
auto emitAllocaFor = [&](Value *LiveValue) {
- AllocaInst *Alloca = new AllocaInst(LiveValue->getType(), "",
+ AllocaInst *Alloca = new AllocaInst(LiveValue->getType(),
+ DL.getAllocaAddrSpace(), "",
F.getEntryBlock().getFirstNonPHI());
AllocaMap[LiveValue] = Alloca;
PromotableAllocas.push_back(Alloca);
@@ -1873,7 +1882,7 @@ chainToBasePointerCost(SmallVectorImpl<Instruction*> &Chain,
"non noop cast is found during rematerialization");
Type *SrcTy = CI->getOperand(0)->getType();
- Cost += TTI.getCastInstrCost(CI->getOpcode(), CI->getType(), SrcTy);
+ Cost += TTI.getCastInstrCost(CI->getOpcode(), CI->getType(), SrcTy, CI);
} else if (GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(Instr)) {
// Cost of the address calculation
@@ -2304,7 +2313,7 @@ static void RemoveNonValidAttrAtIndex(LLVMContext &Ctx, AttrHolder &AH,
if (!R.empty())
AH.setAttributes(AH.getAttributes().removeAttributes(
- Ctx, Index, AttributeSet::get(Ctx, Index, R)));
+ Ctx, Index, AttributeList::get(Ctx, Index, R)));
}
void
@@ -2316,7 +2325,7 @@ RewriteStatepointsForGC::stripNonValidAttributesFromPrototype(Function &F) {
RemoveNonValidAttrAtIndex(Ctx, F, A.getArgNo() + 1);
if (isa<PointerType>(F.getReturnType()))
- RemoveNonValidAttrAtIndex(Ctx, F, AttributeSet::ReturnIndex);
+ RemoveNonValidAttrAtIndex(Ctx, F, AttributeList::ReturnIndex);
}
void RewriteStatepointsForGC::stripNonValidAttributesFromBody(Function &F) {
@@ -2351,7 +2360,7 @@ void RewriteStatepointsForGC::stripNonValidAttributesFromBody(Function &F) {
if (isa<PointerType>(CS.getArgument(i)->getType()))
RemoveNonValidAttrAtIndex(Ctx, CS, i + 1);
if (isa<PointerType>(CS.getType()))
- RemoveNonValidAttrAtIndex(Ctx, CS, AttributeSet::ReturnIndex);
+ RemoveNonValidAttrAtIndex(Ctx, CS, AttributeList::ReturnIndex);
}
}
}
diff --git a/lib/Transforms/Scalar/SCCP.cpp b/lib/Transforms/Scalar/SCCP.cpp
index ede381c4c243..8908dae2f545 100644
--- a/lib/Transforms/Scalar/SCCP.cpp
+++ b/lib/Transforms/Scalar/SCCP.cpp
@@ -140,6 +140,14 @@ public:
return nullptr;
}
+ /// getBlockAddress - If this is a constant with a BlockAddress value, return
+ /// it; otherwise return null.
+ BlockAddress *getBlockAddress() const {
+ if (isConstant())
+ return dyn_cast<BlockAddress>(getConstant());
+ return nullptr;
+ }
+
void markForcedConstant(Constant *V) {
assert(isUnknown() && "Can't force a defined value!");
Val.setInt(forcedconstant);
@@ -306,20 +314,14 @@ public:
return MRVFunctionsTracked;
}
- void markOverdefined(Value *V) {
- assert(!V->getType()->isStructTy() &&
- "structs should use markAnythingOverdefined");
- markOverdefined(ValueState[V], V);
- }
-
- /// markAnythingOverdefined - Mark the specified value overdefined. This
+ /// markOverdefined - Mark the specified value overdefined. This
/// works with both scalars and structs.
- void markAnythingOverdefined(Value *V) {
+ void markOverdefined(Value *V) {
if (auto *STy = dyn_cast<StructType>(V->getType()))
for (unsigned i = 0, e = STy->getNumElements(); i != e; ++i)
markOverdefined(getStructValueState(V, i), V);
else
- markOverdefined(V);
+ markOverdefined(ValueState[V], V);
}
// isStructLatticeConstant - Return true if all the lattice values
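With markAnythingOverdefined folded into markOverdefined above, a single entry point now dispatches on the value's type. A standalone model of that dispatch in plain C++ (a simplified stand-in, not the SCCPSolver class):

#include <cstdio>
#include <map>
#include <string>

struct Value { std::string Name; unsigned NumStructElems; };

struct Solver {
  std::map<std::string, bool> Overdefined; // one lattice cell per name

  void markCell(const std::string &Cell) {
    if (!Overdefined[Cell]) {
      Overdefined[Cell] = true;
      std::printf("overdefined: %s\n", Cell.c_str());
    }
  }

  // Single entry point: struct values fan out to one cell per element,
  // scalars go straight through.
  void markOverdefined(const Value &V) {
    if (V.NumStructElems)
      for (unsigned i = 0; i != V.NumStructElems; ++i)
        markCell(V.Name + "." + std::to_string(i));
    else
      markCell(V.Name);
  }
};

int main() {
  Solver S;
  S.markOverdefined({"x", 0});   // scalar: one cell
  S.markOverdefined({"agg", 2}); // struct: cells agg.0 and agg.1
}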
@@ -513,12 +515,12 @@ private:
void visitCmpInst(CmpInst &I);
void visitExtractValueInst(ExtractValueInst &EVI);
void visitInsertValueInst(InsertValueInst &IVI);
- void visitLandingPadInst(LandingPadInst &I) { markAnythingOverdefined(&I); }
+ void visitLandingPadInst(LandingPadInst &I) { markOverdefined(&I); }
void visitFuncletPadInst(FuncletPadInst &FPI) {
- markAnythingOverdefined(&FPI);
+ markOverdefined(&FPI);
}
void visitCatchSwitchInst(CatchSwitchInst &CPI) {
- markAnythingOverdefined(&CPI);
+ markOverdefined(&CPI);
visitTerminatorInst(CPI);
}
@@ -538,16 +540,16 @@ private:
void visitUnreachableInst(TerminatorInst &I) { /*returns void*/ }
void visitFenceInst (FenceInst &I) { /*returns void*/ }
void visitAtomicCmpXchgInst(AtomicCmpXchgInst &I) {
- markAnythingOverdefined(&I);
+ markOverdefined(&I);
}
void visitAtomicRMWInst (AtomicRMWInst &I) { markOverdefined(&I); }
void visitAllocaInst (Instruction &I) { markOverdefined(&I); }
- void visitVAArgInst (Instruction &I) { markAnythingOverdefined(&I); }
+ void visitVAArgInst (Instruction &I) { markOverdefined(&I); }
void visitInstruction(Instruction &I) {
// If a new instruction is added to LLVM that we don't handle.
DEBUG(dbgs() << "SCCP: Don't know how to handle: " << I << '\n');
- markAnythingOverdefined(&I); // Just in case
+ markOverdefined(&I); // Just in case
}
};
@@ -602,14 +604,36 @@ void SCCPSolver::getFeasibleSuccessors(TerminatorInst &TI,
return;
}
- Succs[SI->findCaseValue(CI).getSuccessorIndex()] = true;
+ Succs[SI->findCaseValue(CI)->getSuccessorIndex()] = true;
return;
}
- // TODO: This could be improved if the operand is a [cast of a] BlockAddress.
- if (isa<IndirectBrInst>(&TI)) {
- // Just mark all destinations executable!
- Succs.assign(TI.getNumSuccessors(), true);
+ // If the address of an indirect branch is a known blockaddress, mark only
+ // that target as executable.
+ if (auto *IBR = dyn_cast<IndirectBrInst>(&TI)) {
+ // Casts are folded by visitCastInst.
+ LatticeVal IBRValue = getValueState(IBR->getAddress());
+ BlockAddress *Addr = IBRValue.getBlockAddress();
+ if (!Addr) { // Overdefined or unknown condition?
+ // All destinations are executable!
+ if (!IBRValue.isUnknown())
+ Succs.assign(TI.getNumSuccessors(), true);
+ return;
+ }
+
+ BasicBlock *T = Addr->getBasicBlock();
+ assert(Addr->getFunction() == T->getParent() &&
+ "Block address of a different function?");
+ for (unsigned i = 0; i < IBR->getNumSuccessors(); ++i) {
+ // This is the target.
+ if (IBR->getDestination(i) == T) {
+ Succs[i] = true;
+ return;
+ }
+ }
+
+ // If we didn't find our destination in the IBR successor list, then we
+ // have undefined behavior. It is safe to assume no successor is executable.
return;
}
@@ -659,13 +683,21 @@ bool SCCPSolver::isEdgeFeasible(BasicBlock *From, BasicBlock *To) {
if (!CI)
return !SCValue.isUnknown();
- return SI->findCaseValue(CI).getCaseSuccessor() == To;
+ return SI->findCaseValue(CI)->getCaseSuccessor() == To;
}
- // Just mark all destinations executable!
- // TODO: This could be improved if the operand is a [cast of a] BlockAddress.
- if (isa<IndirectBrInst>(TI))
- return true;
+ // If the address of an indirect branch is a known blockaddress, the only
+ // feasible edge is the one leading to that block.
+ if (auto *IBR = dyn_cast<IndirectBrInst>(TI)) {
+ LatticeVal IBRValue = getValueState(IBR->getAddress());
+ BlockAddress *Addr = IBRValue.getBlockAddress();
+
+ if (!Addr)
+ return !IBRValue.isUnknown();
+
+ // At this point, the indirectbr is branching on a blockaddress.
+ return Addr->getBasicBlock() == To;
+ }
DEBUG(dbgs() << "Unknown terminator instruction: " << *TI << '\n');
llvm_unreachable("SCCP: Don't know how to handle this terminator!");
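Taken together, the two indirectbr hunks above implement one rule: an unknown address contributes no executable edges yet, an overdefined address makes every successor feasible, and a known blockaddress selects exactly one. A standalone model in plain C++ (not the LLVM types):

#include <string>
#include <vector>

enum class Lattice { Unknown, KnownBlock, Overdefined };

struct IndirectBr { std::vector<std::string> Succs; };

static std::vector<bool> feasibleSuccs(const IndirectBr &IBR, Lattice L,
                                       const std::string &KnownTarget) {
  std::vector<bool> Feasible(IBR.Succs.size(), false);
  if (L == Lattice::Unknown)
    return Feasible;                  // optimistically: nothing executable yet
  if (L == Lattice::Overdefined) {
    Feasible.assign(IBR.Succs.size(), true);
    return Feasible;
  }
  for (size_t i = 0; i != IBR.Succs.size(); ++i)
    if (IBR.Succs[i] == KnownTarget) {
      Feasible[i] = true;             // exactly one target is executable
      break;
    }
  // A blockaddress missing from the successor list is UB; leaving every
  // edge infeasible is a safe interpretation, matching the code above.
  return Feasible;
}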
@@ -693,7 +725,7 @@ void SCCPSolver::visitPHINode(PHINode &PN) {
// If this PN returns a struct, just mark the result overdefined.
// TODO: We could do a lot better than this if code actually uses this.
if (PN.getType()->isStructTy())
- return markAnythingOverdefined(&PN);
+ return markOverdefined(&PN);
if (getValueState(&PN).isOverdefined())
return; // Quick exit
@@ -803,7 +835,7 @@ void SCCPSolver::visitExtractValueInst(ExtractValueInst &EVI) {
 // If this returns a struct, mark all elements overdefined; we don't track
 // structs in structs.
if (EVI.getType()->isStructTy())
- return markAnythingOverdefined(&EVI);
+ return markOverdefined(&EVI);
// If this is extracting from more than one level of struct, we don't know.
if (EVI.getNumIndices() != 1)
@@ -828,7 +860,7 @@ void SCCPSolver::visitInsertValueInst(InsertValueInst &IVI) {
 // If this has more than one index, we can't handle it; drive all results to
 // undef.
if (IVI.getNumIndices() != 1)
- return markAnythingOverdefined(&IVI);
+ return markOverdefined(&IVI);
Value *Aggr = IVI.getAggregateOperand();
unsigned Idx = *IVI.idx_begin();
@@ -857,7 +889,7 @@ void SCCPSolver::visitSelectInst(SelectInst &I) {
// If this select returns a struct, just mark the result overdefined.
// TODO: We could do a lot better than this if code actually uses this.
if (I.getType()->isStructTy())
- return markAnythingOverdefined(&I);
+ return markOverdefined(&I);
LatticeVal CondValue = getValueState(I.getCondition());
if (CondValue.isUnknown())
@@ -910,9 +942,16 @@ void SCCPSolver::visitBinaryOperator(Instruction &I) {
// Otherwise, one of our operands is overdefined. Try to produce something
// better than overdefined with some tricks.
-
- // If this is an AND or OR with 0 or -1, it doesn't matter that the other
- // operand is overdefined.
+ // If this is 0 / Y, the result is always zero, so it doesn't matter that
+ // the second operand is overdefined.
+ if (I.getOpcode() == Instruction::UDiv || I.getOpcode() == Instruction::SDiv)
+ if (V1State.isConstant() && V1State.getConstant()->isNullValue())
+ return markConstant(IV, &I, V1State.getConstant());
+
+ // If this is:
+ // -> AND or MUL with 0
+ // -> OR with -1
+ // it doesn't matter that the other operand is overdefined.
if (I.getOpcode() == Instruction::And || I.getOpcode() == Instruction::Mul ||
I.getOpcode() == Instruction::Or) {
LatticeVal *NonOverdefVal = nullptr;
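The 0 / Y rule only needs the dividend: for every execution that avoids the UB of Y == 0, the quotient is zero, so Y being overdefined is irrelevant. A tiny self-contained analogue (C++17 for std::optional):

#include <cassert>
#include <cstdint>
#include <optional>

static std::optional<int64_t> foldSDiv(std::optional<int64_t> LHS,
                                       std::optional<int64_t> RHS) {
  if (LHS && *LHS == 0)
    return 0;               // 0 / Y == 0 for every well-defined Y
  if (LHS && RHS && *RHS != 0)
    return *LHS / *RHS;     // both operands known: ordinary fold
  return std::nullopt;      // otherwise not foldable at this level
}

int main() {
  assert(foldSDiv(0, std::nullopt) == 0); // divisor unknown, still folds
  assert(foldSDiv(10, 2) == 5);
  assert(!foldSDiv(std::nullopt, 2));
}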
@@ -1021,7 +1060,7 @@ void SCCPSolver::visitStoreInst(StoreInst &SI) {
void SCCPSolver::visitLoadInst(LoadInst &I) {
// If this load is of a struct, just mark the result overdefined.
if (I.getType()->isStructTy())
- return markAnythingOverdefined(&I);
+ return markOverdefined(&I);
LatticeVal PtrVal = getValueState(I.getOperand(0));
if (PtrVal.isUnknown()) return; // The pointer is not resolved yet!
@@ -1107,7 +1146,7 @@ CallOverdefined:
}
// Otherwise, we don't know anything about this call, mark it overdefined.
- return markAnythingOverdefined(I);
+ return markOverdefined(I);
}
// If this is a local function that doesn't have its address taken, mark its
@@ -1483,6 +1522,31 @@ bool SCCPSolver::ResolvedUndefsIn(Function &F) {
return true;
}
+ if (auto *IBR = dyn_cast<IndirectBrInst>(TI)) {
+ // Indirect branch with no successors? It is safe to assume it
+ // branches to no target.
+ if (IBR->getNumSuccessors() < 1)
+ continue;
+
+ if (!getValueState(IBR->getAddress()).isUnknown())
+ continue;
+
+ // If the input to SCCP is actually branch on undef, fix the undef to
+ // the first successor of the indirect branch.
+ if (isa<UndefValue>(IBR->getAddress())) {
+ IBR->setAddress(BlockAddress::get(IBR->getSuccessor(0)));
+ markEdgeExecutable(&BB, IBR->getSuccessor(0));
+ return true;
+ }
+
+ // Otherwise, it is a branch on a symbolic value which is currently
+ // considered to be undef. Handle this by forcing the input value to the
+ // branch to the first successor.
+ markForcedConstant(IBR->getAddress(),
+ BlockAddress::get(IBR->getSuccessor(0)));
+ return true;
+ }
+
if (auto *SI = dyn_cast<SwitchInst>(TI)) {
if (!SI->getNumCases() || !getValueState(SI->getCondition()).isUnknown())
continue;
@@ -1490,12 +1554,12 @@ bool SCCPSolver::ResolvedUndefsIn(Function &F) {
// If the input to SCCP is actually switch on undef, fix the undef to
// the first constant.
if (isa<UndefValue>(SI->getCondition())) {
- SI->setCondition(SI->case_begin().getCaseValue());
- markEdgeExecutable(&BB, SI->case_begin().getCaseSuccessor());
+ SI->setCondition(SI->case_begin()->getCaseValue());
+ markEdgeExecutable(&BB, SI->case_begin()->getCaseSuccessor());
return true;
}
- markForcedConstant(SI->getCondition(), SI->case_begin().getCaseValue());
+ markForcedConstant(SI->getCondition(), SI->case_begin()->getCaseValue());
return true;
}
}
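Illustrative sketch of the undef-resolution step for indirectbr added above, assuming an IndirectBrInst *IBR whose address is still unknown after solving: the address is deterministically pinned to the first successor's blockaddress.

#include "llvm/IR/Constants.h"
#include "llvm/IR/Instructions.h"
using namespace llvm;

static void resolveUndefIndirectBr(IndirectBrInst *IBR) {
  if (IBR->getNumSuccessors() < 1)
    return; // no successors: treated as branching nowhere
  // BlockAddress::get(BB) builds the blockaddress constant for BB inside
  // its parent function; this is what setAddress / markForcedConstant
  // receive in the hunk above.
  IBR->setAddress(BlockAddress::get(IBR->getSuccessor(0)));
}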
@@ -1545,7 +1609,7 @@ static bool runSCCP(Function &F, const DataLayout &DL,
// Mark all arguments to the function as being overdefined.
for (Argument &AI : F.args())
- Solver.markAnythingOverdefined(&AI);
+ Solver.markOverdefined(&AI);
// Solve for constants.
bool ResolvedUndefs = true;
@@ -1728,7 +1792,7 @@ static bool runIPSCCP(Module &M, const DataLayout &DL,
// Assume nothing about the incoming arguments.
for (Argument &AI : F.args())
- Solver.markAnythingOverdefined(&AI);
+ Solver.markOverdefined(&AI);
}
// Loop over global variables. We inform the solver about any internal global
@@ -1817,32 +1881,9 @@ static bool runIPSCCP(Module &M, const DataLayout &DL,
if (!I) continue;
bool Folded = ConstantFoldTerminator(I->getParent());
- if (!Folded) {
- // The constant folder may not have been able to fold the terminator
- // if this is a branch or switch on undef. Fold it manually as a
- // branch to the first successor.
-#ifndef NDEBUG
- if (auto *BI = dyn_cast<BranchInst>(I)) {
- assert(BI->isConditional() && isa<UndefValue>(BI->getCondition()) &&
- "Branch should be foldable!");
- } else if (auto *SI = dyn_cast<SwitchInst>(I)) {
- assert(isa<UndefValue>(SI->getCondition()) && "Switch should fold");
- } else {
- llvm_unreachable("Didn't fold away reference to block!");
- }
-#endif
-
- // Make this an uncond branch to the first successor.
- TerminatorInst *TI = I->getParent()->getTerminator();
- BranchInst::Create(TI->getSuccessor(0), TI);
-
- // Remove entries in successor phi nodes to remove edges.
- for (unsigned i = 1, e = TI->getNumSuccessors(); i != e; ++i)
- TI->getSuccessor(i)->removePredecessor(TI->getParent());
-
- // Remove the old terminator.
- TI->eraseFromParent();
- }
+ assert(Folded &&
+ "Expect TerminatorInst on ConstantInt or BlockAddress to be folded");
+ (void) Folded;
}
// Finally, delete the basic block.
diff --git a/lib/Transforms/Scalar/SROA.cpp b/lib/Transforms/Scalar/SROA.cpp
index bfcb15530ef5..d01e91a7f235 100644
--- a/lib/Transforms/Scalar/SROA.cpp
+++ b/lib/Transforms/Scalar/SROA.cpp
@@ -1825,6 +1825,7 @@ static VectorType *isVectorPromotionViable(Partition &P, const DataLayout &DL) {
// Rank the remaining candidate vector types. This is easy because we know
// they're all integer vectors. We sort by ascending number of elements.
auto RankVectorTypes = [&DL](VectorType *RHSTy, VectorType *LHSTy) {
+ (void)DL;
assert(DL.getTypeSizeInBits(RHSTy) == DL.getTypeSizeInBits(LHSTy) &&
"Cannot have vector types of different sizes!");
assert(RHSTy->getElementType()->isIntegerTy() &&
@@ -2294,7 +2295,8 @@ private:
#endif
return getAdjustedPtr(IRB, DL, &NewAI,
- APInt(DL.getPointerSizeInBits(), Offset), PointerTy,
+ APInt(DL.getPointerTypeSizeInBits(PointerTy), Offset),
+ PointerTy,
#ifndef NDEBUG
Twine(OldName) + "."
#else
@@ -2369,6 +2371,8 @@ private:
Value *OldOp = LI.getOperand(0);
assert(OldOp == OldPtr);
+ unsigned AS = LI.getPointerAddressSpace();
+
Type *TargetTy = IsSplit ? Type::getIntNTy(LI.getContext(), SliceSize * 8)
: LI.getType();
const bool IsLoadPastEnd = DL.getTypeStoreSize(TargetTy) > SliceSize;
@@ -2387,6 +2391,10 @@ private:
LI.isVolatile(), LI.getName());
if (LI.isVolatile())
NewLI->setAtomic(LI.getOrdering(), LI.getSynchScope());
+
+ // Try to preserve the nonnull metadata.
+ if (TargetTy->isPointerTy())
+ NewLI->copyMetadata(LI, LLVMContext::MD_nonnull);
V = NewLI;
// If this is an integer load past the end of the slice (which means the
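Illustrative sketch of the metadata-preservation step above, assuming an original LoadInst &OldLI and a freshly created LoadInst *NewLI: only !nonnull is copied, and only when the new load still produces a pointer.

#include "llvm/IR/Instructions.h"
#include "llvm/IR/LLVMContext.h"
using namespace llvm;

static void preserveNonNull(const LoadInst &OldLI, LoadInst *NewLI) {
  // copyMetadata with a whitelist copies just the named kinds; other
  // metadata may not remain valid on the rewritten load.
  if (NewLI->getType()->isPointerTy())
    NewLI->copyMetadata(OldLI, LLVMContext::MD_nonnull);
}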
@@ -2401,7 +2409,7 @@ private:
"endian_shift");
}
} else {
- Type *LTy = TargetTy->getPointerTo();
+ Type *LTy = TargetTy->getPointerTo(AS);
LoadInst *NewLI = IRB.CreateAlignedLoad(getNewAllocaSlicePtr(IRB, LTy),
getSliceAlign(TargetTy),
LI.isVolatile(), LI.getName());
@@ -2429,7 +2437,7 @@ private:
// the computed value, and then replace the placeholder with LI, leaving
// LI only used for this computation.
Value *Placeholder =
- new LoadInst(UndefValue::get(LI.getType()->getPointerTo()));
+ new LoadInst(UndefValue::get(LI.getType()->getPointerTo(AS)));
V = insertInteger(DL, IRB, Placeholder, V, NewBeginOffset - BeginOffset,
"insert");
LI.replaceAllUsesWith(V);
@@ -2542,7 +2550,8 @@ private:
NewSI = IRB.CreateAlignedStore(V, &NewAI, NewAI.getAlignment(),
SI.isVolatile());
} else {
- Value *NewPtr = getNewAllocaSlicePtr(IRB, V->getType()->getPointerTo());
+ unsigned AS = SI.getPointerAddressSpace();
+ Value *NewPtr = getNewAllocaSlicePtr(IRB, V->getType()->getPointerTo(AS));
NewSI = IRB.CreateAlignedStore(V, NewPtr, getSliceAlign(V->getType()),
SI.isVolatile());
}
@@ -3857,7 +3866,7 @@ AllocaInst *SROA::rewritePartition(AllocaInst &AI, AllocaSlices &AS,
if (Alignment <= DL.getABITypeAlignment(SliceTy))
Alignment = 0;
NewAI = new AllocaInst(
- SliceTy, nullptr, Alignment,
+ SliceTy, AI.getType()->getAddressSpace(), nullptr, Alignment,
AI.getName() + ".sroa." + Twine(P.begin() - AS.begin()), &AI);
++NumNewAllocas;
}
@@ -4184,7 +4193,7 @@ bool SROA::promoteAllocas(Function &F) {
NumPromoted += PromotableAllocas.size();
DEBUG(dbgs() << "Promoting allocas with mem2reg...\n");
- PromoteMemToReg(PromotableAllocas, *DT, nullptr, AC);
+ PromoteMemToReg(PromotableAllocas, *DT, AC);
PromotableAllocas.clear();
return true;
}
@@ -4234,9 +4243,8 @@ PreservedAnalyses SROA::runImpl(Function &F, DominatorTree &RunDT,
if (!Changed)
return PreservedAnalyses::all();
- // FIXME: Even when promoting allocas we should preserve some abstract set of
- // CFG-specific analyses.
PreservedAnalyses PA;
+ PA.preserveSet<CFGAnalyses>();
PA.preserve<GlobalsAA>();
return PA;
}
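The FIXME above is resolved by the preserveSet API. A minimal sketch of the idiom for a new-PM function pass that rewrites instructions but never the CFG, assuming CFGAnalyses as declared in llvm/IR/PassManager.h at the time of this patch:

#include "llvm/IR/Function.h"
#include "llvm/IR/PassManager.h"
using namespace llvm;

struct NoCFGChangePass : PassInfoMixin<NoCFGChangePass> {
  PreservedAnalyses run(Function &F, FunctionAnalysisManager &AM) {
    bool Changed = false;
    // ... rewrite instructions in F without touching blocks or edges ...
    if (!Changed)
      return PreservedAnalyses::all();
    PreservedAnalyses PA;
    PA.preserveSet<CFGAnalyses>(); // DominatorTree, LoopInfo, and friends
    return PA;
  }
};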
diff --git a/lib/Transforms/Scalar/Scalar.cpp b/lib/Transforms/Scalar/Scalar.cpp
index afe7483006ae..00e3c95f6f06 100644
--- a/lib/Transforms/Scalar/Scalar.cpp
+++ b/lib/Transforms/Scalar/Scalar.cpp
@@ -43,13 +43,14 @@ void llvm::initializeScalarOpts(PassRegistry &Registry) {
initializeDSELegacyPassPass(Registry);
initializeGuardWideningLegacyPassPass(Registry);
initializeGVNLegacyPassPass(Registry);
- initializeNewGVNPass(Registry);
+ initializeNewGVNLegacyPassPass(Registry);
initializeEarlyCSELegacyPassPass(Registry);
initializeEarlyCSEMemSSALegacyPassPass(Registry);
initializeGVNHoistLegacyPassPass(Registry);
initializeFlattenCFGPassPass(Registry);
initializeInductiveRangeCheckEliminationPass(Registry);
initializeIndVarSimplifyLegacyPassPass(Registry);
+ initializeInferAddressSpacesPass(Registry);
initializeJumpThreadingPass(Registry);
initializeLegacyLICMPassPass(Registry);
initializeLegacyLoopSinkPassPass(Registry);
@@ -58,6 +59,7 @@ void llvm::initializeScalarOpts(PassRegistry &Registry) {
initializeLoopAccessLegacyAnalysisPass(Registry);
initializeLoopInstSimplifyLegacyPassPass(Registry);
initializeLoopInterchangePass(Registry);
+ initializeLoopPredicationLegacyPassPass(Registry);
initializeLoopRotateLegacyPassPass(Registry);
initializeLoopStrengthReducePass(Registry);
initializeLoopRerollPass(Registry);
@@ -79,6 +81,7 @@ void llvm::initializeScalarOpts(PassRegistry &Registry) {
initializeIPSCCPLegacyPassPass(Registry);
initializeSROALegacyPassPass(Registry);
initializeCFGSimplifyPassPass(Registry);
+ initializeLateCFGSimplifyPassPass(Registry);
initializeStructurizeCFGPass(Registry);
initializeSinkingLegacyPassPass(Registry);
initializeTailCallElimPass(Registry);
@@ -115,6 +118,10 @@ void LLVMAddCFGSimplificationPass(LLVMPassManagerRef PM) {
unwrap(PM)->add(createCFGSimplificationPass());
}
+void LLVMAddLateCFGSimplificationPass(LLVMPassManagerRef PM) {
+ unwrap(PM)->add(createLateCFGSimplificationPass());
+}
+
void LLVMAddDeadStoreEliminationPass(LLVMPassManagerRef PM) {
unwrap(PM)->add(createDeadStoreEliminationPass());
}
diff --git a/lib/Transforms/Scalar/Scalarizer.cpp b/lib/Transforms/Scalar/Scalarizer.cpp
index 39969e27367f..c0c09a7e43fe 100644
--- a/lib/Transforms/Scalar/Scalarizer.cpp
+++ b/lib/Transforms/Scalar/Scalarizer.cpp
@@ -520,12 +520,25 @@ bool Scalarizer::visitGetElementPtrInst(GetElementPtrInst &GEPI) {
unsigned NumElems = VT->getNumElements();
unsigned NumIndices = GEPI.getNumIndices();
- Scatterer Base = scatter(&GEPI, GEPI.getOperand(0));
+ // The base pointer might be a scalar even if it's a vector GEP. In that
+ // case, splat the pointer into a vector value, and scatter that vector.
+ Value *Op0 = GEPI.getOperand(0);
+ if (!Op0->getType()->isVectorTy())
+ Op0 = Builder.CreateVectorSplat(NumElems, Op0);
+ Scatterer Base = scatter(&GEPI, Op0);
SmallVector<Scatterer, 8> Ops;
Ops.resize(NumIndices);
- for (unsigned I = 0; I < NumIndices; ++I)
- Ops[I] = scatter(&GEPI, GEPI.getOperand(I + 1));
+ for (unsigned I = 0; I < NumIndices; ++I) {
+ Value *Op = GEPI.getOperand(I + 1);
+
+ // The indices might be scalars even if it's a vector GEP. In that case,
+ // splat the scalar into a vector value, and scatter that vector.
+ if (!Op->getType()->isVectorTy())
+ Op = Builder.CreateVectorSplat(NumElems, Op);
+
+ Ops[I] = scatter(&GEPI, Op);
+ }
ValueVector Res;
Res.resize(NumElems);
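Illustrative sketch of the operand-normalization step above, assuming an IRBuilder<> Builder: any scalar operand of a vector GEP is broadcast to one value per lane before scattering.

#include "llvm/IR/IRBuilder.h"
using namespace llvm;

static Value *splatIfScalar(IRBuilder<> &Builder, Value *Op,
                            unsigned NumElems) {
  if (Op->getType()->isVectorTy())
    return Op;                               // already one value per lane
  // CreateVectorSplat lowers to insertelement + shufflevector.
  return Builder.CreateVectorSplat(NumElems, Op);
}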
diff --git a/lib/Transforms/Scalar/SimplifyCFGPass.cpp b/lib/Transforms/Scalar/SimplifyCFGPass.cpp
index f2723bd7af82..8754c714c5b2 100644
--- a/lib/Transforms/Scalar/SimplifyCFGPass.cpp
+++ b/lib/Transforms/Scalar/SimplifyCFGPass.cpp
@@ -130,7 +130,8 @@ static bool mergeEmptyReturnBlocks(Function &F) {
/// iterating until no more changes are made.
static bool iterativelySimplifyCFG(Function &F, const TargetTransformInfo &TTI,
AssumptionCache *AC,
- unsigned BonusInstThreshold) {
+ unsigned BonusInstThreshold,
+ bool LateSimplifyCFG) {
bool Changed = false;
bool LocalChange = true;
@@ -145,7 +146,7 @@ static bool iterativelySimplifyCFG(Function &F, const TargetTransformInfo &TTI,
// Loop over all of the basic blocks and remove them if they are unneeded.
for (Function::iterator BBIt = F.begin(); BBIt != F.end(); ) {
- if (SimplifyCFG(&*BBIt++, TTI, BonusInstThreshold, AC, &LoopHeaders)) {
+ if (SimplifyCFG(&*BBIt++, TTI, BonusInstThreshold, AC, &LoopHeaders, LateSimplifyCFG)) {
LocalChange = true;
++NumSimpl;
}
@@ -156,10 +157,12 @@ static bool iterativelySimplifyCFG(Function &F, const TargetTransformInfo &TTI,
}
static bool simplifyFunctionCFG(Function &F, const TargetTransformInfo &TTI,
- AssumptionCache *AC, int BonusInstThreshold) {
+ AssumptionCache *AC, int BonusInstThreshold,
+ bool LateSimplifyCFG) {
bool EverChanged = removeUnreachableBlocks(F);
EverChanged |= mergeEmptyReturnBlocks(F);
- EverChanged |= iterativelySimplifyCFG(F, TTI, AC, BonusInstThreshold);
+ EverChanged |= iterativelySimplifyCFG(F, TTI, AC, BonusInstThreshold,
+ LateSimplifyCFG);
// If neither pass changed anything, we're done.
if (!EverChanged) return false;
@@ -173,7 +176,8 @@ static bool simplifyFunctionCFG(Function &F, const TargetTransformInfo &TTI,
return true;
do {
- EverChanged = iterativelySimplifyCFG(F, TTI, AC, BonusInstThreshold);
+ EverChanged = iterativelySimplifyCFG(F, TTI, AC, BonusInstThreshold,
+ LateSimplifyCFG);
EverChanged |= removeUnreachableBlocks(F);
} while (EverChanged);
@@ -181,17 +185,19 @@ static bool simplifyFunctionCFG(Function &F, const TargetTransformInfo &TTI,
}
SimplifyCFGPass::SimplifyCFGPass()
- : BonusInstThreshold(UserBonusInstThreshold) {}
+ : BonusInstThreshold(UserBonusInstThreshold),
+ LateSimplifyCFG(true) {}
-SimplifyCFGPass::SimplifyCFGPass(int BonusInstThreshold)
- : BonusInstThreshold(BonusInstThreshold) {}
+SimplifyCFGPass::SimplifyCFGPass(int BonusInstThreshold, bool LateSimplifyCFG)
+ : BonusInstThreshold(BonusInstThreshold),
+ LateSimplifyCFG(LateSimplifyCFG) {}
PreservedAnalyses SimplifyCFGPass::run(Function &F,
FunctionAnalysisManager &AM) {
auto &TTI = AM.getResult<TargetIRAnalysis>(F);
auto &AC = AM.getResult<AssumptionAnalysis>(F);
- if (!simplifyFunctionCFG(F, TTI, &AC, BonusInstThreshold))
+ if (!simplifyFunctionCFG(F, TTI, &AC, BonusInstThreshold, LateSimplifyCFG))
return PreservedAnalyses::all();
PreservedAnalyses PA;
PA.preserve<GlobalsAA>();
@@ -199,16 +205,17 @@ PreservedAnalyses SimplifyCFGPass::run(Function &F,
}
namespace {
-struct CFGSimplifyPass : public FunctionPass {
- static char ID; // Pass identification, replacement for typeid
+struct BaseCFGSimplifyPass : public FunctionPass {
unsigned BonusInstThreshold;
std::function<bool(const Function &)> PredicateFtor;
+ bool LateSimplifyCFG;
- CFGSimplifyPass(int T = -1,
- std::function<bool(const Function &)> Ftor = nullptr)
- : FunctionPass(ID), PredicateFtor(std::move(Ftor)) {
+ BaseCFGSimplifyPass(int T, bool LateSimplifyCFG,
+ std::function<bool(const Function &)> Ftor,
+ char &ID)
+ : FunctionPass(ID), PredicateFtor(std::move(Ftor)),
+ LateSimplifyCFG(LateSimplifyCFG) {
BonusInstThreshold = (T == -1) ? UserBonusInstThreshold : unsigned(T);
- initializeCFGSimplifyPassPass(*PassRegistry::getPassRegistry());
}
bool runOnFunction(Function &F) override {
if (skipFunction(F) || (PredicateFtor && !PredicateFtor(F)))
@@ -218,7 +225,7 @@ struct CFGSimplifyPass : public FunctionPass {
&getAnalysis<AssumptionCacheTracker>().getAssumptionCache(F);
const TargetTransformInfo &TTI =
getAnalysis<TargetTransformInfoWrapperPass>().getTTI(F);
- return simplifyFunctionCFG(F, TTI, AC, BonusInstThreshold);
+ return simplifyFunctionCFG(F, TTI, AC, BonusInstThreshold, LateSimplifyCFG);
}
void getAnalysisUsage(AnalysisUsage &AU) const override {
@@ -227,6 +234,26 @@ struct CFGSimplifyPass : public FunctionPass {
AU.addPreserved<GlobalsAAWrapperPass>();
}
};
+
+struct CFGSimplifyPass : public BaseCFGSimplifyPass {
+ static char ID; // Pass identification, replacement for typeid
+
+ CFGSimplifyPass(int T = -1,
+ std::function<bool(const Function &)> Ftor = nullptr)
+ : BaseCFGSimplifyPass(T, false, Ftor, ID) {
+ initializeCFGSimplifyPassPass(*PassRegistry::getPassRegistry());
+ }
+};
+
+struct LateCFGSimplifyPass : public BaseCFGSimplifyPass {
+ static char ID; // Pass identification, replacement for typeid
+
+ LateCFGSimplifyPass(int T = -1,
+ std::function<bool(const Function &)> Ftor = nullptr)
+ : BaseCFGSimplifyPass(T, true, Ftor, ID) {
+ initializeLateCFGSimplifyPassPass(*PassRegistry::getPassRegistry());
+ }
+};
}
char CFGSimplifyPass::ID = 0;
@@ -237,9 +264,24 @@ INITIALIZE_PASS_DEPENDENCY(AssumptionCacheTracker)
INITIALIZE_PASS_END(CFGSimplifyPass, "simplifycfg", "Simplify the CFG", false,
false)
+char LateCFGSimplifyPass::ID = 0;
+INITIALIZE_PASS_BEGIN(LateCFGSimplifyPass, "latesimplifycfg",
+ "Simplify the CFG more aggressively", false, false)
+INITIALIZE_PASS_DEPENDENCY(TargetTransformInfoWrapperPass)
+INITIALIZE_PASS_DEPENDENCY(AssumptionCacheTracker)
+INITIALIZE_PASS_END(LateCFGSimplifyPass, "latesimplifycfg",
+ "Simplify the CFG more aggressively", false, false)
+
// Public interface to the CFGSimplification pass
FunctionPass *
llvm::createCFGSimplificationPass(int Threshold,
- std::function<bool(const Function &)> Ftor) {
+ std::function<bool(const Function &)> Ftor) {
return new CFGSimplifyPass(Threshold, std::move(Ftor));
}
+
+// Public interface to the LateCFGSimplification pass
+FunctionPass *
+llvm::createLateCFGSimplificationPass(int Threshold,
+ std::function<bool(const Function &)> Ftor) {
+ return new LateCFGSimplifyPass(Threshold, std::move(Ftor));
+}
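Illustrative use of the new entry point (assuming the header declares the same parameter defaults as createCFGSimplificationPass): the late variant runs the same simplifications plus the more aggressive rewrites gated behind LateSimplifyCFG, intended for late in the pipeline.

#include "llvm/IR/LegacyPassManager.h"
#include "llvm/Transforms/Scalar.h"
using namespace llvm;

static void addCFGCleanup(legacy::PassManagerBase &PM, bool Late) {
  if (Late)
    PM.add(createLateCFGSimplificationPass()); // new in this patch
  else
    PM.add(createCFGSimplificationPass());
}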
diff --git a/lib/Transforms/Scalar/Sink.cpp b/lib/Transforms/Scalar/Sink.cpp
index c3f14a0f4b1e..102e9eaeab77 100644
--- a/lib/Transforms/Scalar/Sink.cpp
+++ b/lib/Transforms/Scalar/Sink.cpp
@@ -164,13 +164,14 @@ static bool SinkInstruction(Instruction *Inst,
// Instructions can only be sunk if all their uses are in blocks
// dominated by one of the successors.
- // Look at all the postdominators and see if we can sink it in one.
+ // Look at all the dominated blocks and see if we can sink it in one.
DomTreeNode *DTN = DT.getNode(Inst->getParent());
for (DomTreeNode::iterator I = DTN->begin(), E = DTN->end();
I != E && SuccToSinkTo == nullptr; ++I) {
BasicBlock *Candidate = (*I)->getBlock();
- if ((*I)->getIDom()->getBlock() == Inst->getParent() &&
- IsAcceptableTarget(Inst, Candidate, DT, LI))
+ // A node always immediately dominates its children in the dominator
+ // tree, so the old immediate-dominator check was redundant.
+ if (IsAcceptableTarget(Inst, Candidate, DT, LI))
SuccToSinkTo = Candidate;
}
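The dropped condition above was a tautology; a standalone model (plain C++) of the invariant the comment states:

#include <vector>

struct DomNode {
  DomNode *IDom = nullptr;
  std::vector<DomNode *> Children;
};

// In a well-formed dominator tree this always returns true: every child of
// a node is, by construction, immediately dominated by that node, which is
// why the patch can drop the getIDom() comparison.
static bool childrenHaveThisIDom(const DomNode &N) {
  for (const DomNode *C : N.Children)
    if (C->IDom != &N)
      return false;
  return true;
}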
@@ -262,9 +263,8 @@ PreservedAnalyses SinkingPass::run(Function &F, FunctionAnalysisManager &AM) {
if (!iterativelySinkInstructions(F, DT, LI, AA))
return PreservedAnalyses::all();
- auto PA = PreservedAnalyses();
- PA.preserve<DominatorTreeAnalysis>();
- PA.preserve<LoopAnalysis>();
+ PreservedAnalyses PA;
+ PA.preserveSet<CFGAnalyses>();
return PA;
}
diff --git a/lib/Transforms/Utils/AddDiscriminators.cpp b/lib/Transforms/Utils/AddDiscriminators.cpp
index 2e95926c0b3f..4c9746b8c691 100644
--- a/lib/Transforms/Utils/AddDiscriminators.cpp
+++ b/lib/Transforms/Utils/AddDiscriminators.cpp
@@ -102,6 +102,10 @@ FunctionPass *llvm::createAddDiscriminatorsPass() {
return new AddDiscriminatorsLegacyPass();
}
+static bool shouldHaveDiscriminator(const Instruction *I) {
+ return !isa<IntrinsicInst>(I) || isa<MemIntrinsic>(I);
+}
+
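Illustrative restatement of the new predicate, spelling out the two intrinsic cases (the name here is hypothetical; the logic mirrors shouldHaveDiscriminator above):

#include "llvm/IR/IntrinsicInst.h"
using namespace llvm;

static bool wantsDiscriminator(const Instruction &I) {
  if (!isa<IntrinsicInst>(&I))
    return true;                // ordinary instructions always qualify
  // Memory intrinsics (llvm.memcpy and friends) keep a discriminator,
  // since SROA may expand them into real loads and stores later.
  return isa<MemIntrinsic>(&I);
}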
/// \brief Assign DWARF discriminators.
///
/// To assign discriminators, we examine the boundaries of every
@@ -176,7 +180,13 @@ static bool addDiscriminators(Function &F) {
// discriminator for this instruction.
for (BasicBlock &B : F) {
for (auto &I : B.getInstList()) {
- if (isa<IntrinsicInst>(&I))
+ // Not all intrinsic calls should have a discriminator.
+ // We want to avoid a non-deterministic assignment of discriminators at
+ // different debug levels. We still allow discriminators on memory
+ // intrinsic calls because those can be early expanded by SROA into
+ // pairs of loads and stores, and the expanded load/store instructions
+ // should have a valid discriminator.
+ if (!shouldHaveDiscriminator(&I))
continue;
const DILocation *DIL = I.getDebugLoc();
if (!DIL)
@@ -190,8 +200,8 @@ static bool addDiscriminators(Function &F) {
// discriminator is needed to distinguish both instructions.
// Only the lowest 7 bits are used to represent a discriminator to fit
// it in 1 byte ULEB128 representation.
- unsigned Discriminator = (R.second ? ++LDM[L] : LDM[L]) & 0x7f;
- I.setDebugLoc(DIL->cloneWithDiscriminator(Discriminator));
+ unsigned Discriminator = R.second ? ++LDM[L] : LDM[L];
+ I.setDebugLoc(DIL->setBaseDiscriminator(Discriminator));
DEBUG(dbgs() << DIL->getFilename() << ":" << DIL->getLine() << ":"
<< DIL->getColumn() << ":" << Discriminator << " " << I
<< "\n");
@@ -207,6 +217,10 @@ static bool addDiscriminators(Function &F) {
LocationSet CallLocations;
for (auto &I : B.getInstList()) {
CallInst *Current = dyn_cast<CallInst>(&I);
+ // We bypass intrinsic calls for the following two reasons:
+ // 1) We want to avoid a non-deterministic assignment of
+ // discriminators.
+ // 2) We want to minimize the number of base discriminators used.
if (!Current || isa<IntrinsicInst>(&I))
continue;
@@ -216,8 +230,8 @@ static bool addDiscriminators(Function &F) {
Location L =
std::make_pair(CurrentDIL->getFilename(), CurrentDIL->getLine());
if (!CallLocations.insert(L).second) {
- Current->setDebugLoc(
- CurrentDIL->cloneWithDiscriminator((++LDM[L]) & 0x7f));
+ unsigned Discriminator = ++LDM[L];
+ Current->setDebugLoc(CurrentDIL->setBaseDiscriminator(Discriminator));
Changed = true;
}
}
diff --git a/lib/Transforms/Utils/BasicBlockUtils.cpp b/lib/Transforms/Utils/BasicBlockUtils.cpp
index b90349d3cdad..22af21d55c01 100644
--- a/lib/Transforms/Utils/BasicBlockUtils.cpp
+++ b/lib/Transforms/Utils/BasicBlockUtils.cpp
@@ -438,7 +438,7 @@ BasicBlock *llvm::SplitBlockPredecessors(BasicBlock *BB,
// The new block unconditionally branches to the old block.
BranchInst *BI = BranchInst::Create(BB, NewBB);
- BI->setDebugLoc(BB->getFirstNonPHI()->getDebugLoc());
+ BI->setDebugLoc(BB->getFirstNonPHIOrDbg()->getDebugLoc());
// Move the edges from Preds to point to NewBB instead of BB.
for (unsigned i = 0, e = Preds.size(); i != e; ++i) {
@@ -646,9 +646,10 @@ llvm::SplitBlockAndInsertIfThen(Value *Cond, Instruction *SplitBefore,
}
if (LI) {
- Loop *L = LI->getLoopFor(Head);
- L->addBasicBlockToLoop(ThenBlock, *LI);
- L->addBasicBlockToLoop(Tail, *LI);
+ if (Loop *L = LI->getLoopFor(Head)) {
+ L->addBasicBlockToLoop(ThenBlock, *LI);
+ L->addBasicBlockToLoop(Tail, *LI);
+ }
}
return CheckTerm;
diff --git a/lib/Transforms/Utils/BuildLibCalls.cpp b/lib/Transforms/Utils/BuildLibCalls.cpp
index e61b04fbdd57..6cd9f1614991 100644
--- a/lib/Transforms/Utils/BuildLibCalls.cpp
+++ b/lib/Transforms/Utils/BuildLibCalls.cpp
@@ -96,9 +96,9 @@ static bool setDoesNotAlias(Function &F, unsigned n) {
}
static bool setNonNull(Function &F, unsigned n) {
- assert((n != AttributeSet::ReturnIndex ||
- F.getReturnType()->isPointerTy()) &&
- "nonnull applies only to pointers");
+ assert(
+ (n != AttributeList::ReturnIndex || F.getReturnType()->isPointerTy()) &&
+ "nonnull applies only to pointers");
if (F.getAttributes().hasAttribute(n, Attribute::NonNull))
return false;
F.addAttribute(n, Attribute::NonNull);
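A sketch of the renamed index constants (illustrative, not from the patch): AttributeList keeps the old special indices, with the return value at index 0, parameters starting at 1, and the function itself at ~0U.

#include "llvm/IR/Attributes.h"
#include "llvm/IR/Function.h"
using namespace llvm;

static bool returnsNonNull(const Function &F) {
  return F.getAttributes().hasAttribute(AttributeList::ReturnIndex,
                                        Attribute::NonNull);
}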
@@ -107,255 +107,255 @@ static bool setNonNull(Function &F, unsigned n) {
}
bool llvm::inferLibFuncAttributes(Function &F, const TargetLibraryInfo &TLI) {
- LibFunc::Func TheLibFunc;
+ LibFunc TheLibFunc;
if (!(TLI.getLibFunc(F, TheLibFunc) && TLI.has(TheLibFunc)))
return false;
bool Changed = false;
switch (TheLibFunc) {
- case LibFunc::strlen:
+ case LibFunc_strlen:
Changed |= setOnlyReadsMemory(F);
Changed |= setDoesNotThrow(F);
Changed |= setDoesNotCapture(F, 1);
return Changed;
- case LibFunc::strchr:
- case LibFunc::strrchr:
+ case LibFunc_strchr:
+ case LibFunc_strrchr:
Changed |= setOnlyReadsMemory(F);
Changed |= setDoesNotThrow(F);
return Changed;
- case LibFunc::strtol:
- case LibFunc::strtod:
- case LibFunc::strtof:
- case LibFunc::strtoul:
- case LibFunc::strtoll:
- case LibFunc::strtold:
- case LibFunc::strtoull:
+ case LibFunc_strtol:
+ case LibFunc_strtod:
+ case LibFunc_strtof:
+ case LibFunc_strtoul:
+ case LibFunc_strtoll:
+ case LibFunc_strtold:
+ case LibFunc_strtoull:
Changed |= setDoesNotThrow(F);
Changed |= setDoesNotCapture(F, 2);
Changed |= setOnlyReadsMemory(F, 1);
return Changed;
- case LibFunc::strcpy:
- case LibFunc::stpcpy:
- case LibFunc::strcat:
- case LibFunc::strncat:
- case LibFunc::strncpy:
- case LibFunc::stpncpy:
+ case LibFunc_strcpy:
+ case LibFunc_stpcpy:
+ case LibFunc_strcat:
+ case LibFunc_strncat:
+ case LibFunc_strncpy:
+ case LibFunc_stpncpy:
Changed |= setDoesNotThrow(F);
Changed |= setDoesNotCapture(F, 2);
Changed |= setOnlyReadsMemory(F, 2);
return Changed;
- case LibFunc::strxfrm:
+ case LibFunc_strxfrm:
Changed |= setDoesNotThrow(F);
Changed |= setDoesNotCapture(F, 1);
Changed |= setDoesNotCapture(F, 2);
Changed |= setOnlyReadsMemory(F, 2);
return Changed;
- case LibFunc::strcmp: // 0,1
- case LibFunc::strspn: // 0,1
- case LibFunc::strncmp: // 0,1
- case LibFunc::strcspn: // 0,1
- case LibFunc::strcoll: // 0,1
- case LibFunc::strcasecmp: // 0,1
- case LibFunc::strncasecmp: //
+ case LibFunc_strcmp: // 0,1
+ case LibFunc_strspn: // 0,1
+ case LibFunc_strncmp: // 0,1
+ case LibFunc_strcspn: // 0,1
+ case LibFunc_strcoll: // 0,1
+ case LibFunc_strcasecmp: // 0,1
+ case LibFunc_strncasecmp: //
Changed |= setOnlyReadsMemory(F);
Changed |= setDoesNotThrow(F);
Changed |= setDoesNotCapture(F, 1);
Changed |= setDoesNotCapture(F, 2);
return Changed;
- case LibFunc::strstr:
- case LibFunc::strpbrk:
+ case LibFunc_strstr:
+ case LibFunc_strpbrk:
Changed |= setOnlyReadsMemory(F);
Changed |= setDoesNotThrow(F);
Changed |= setDoesNotCapture(F, 2);
return Changed;
- case LibFunc::strtok:
- case LibFunc::strtok_r:
+ case LibFunc_strtok:
+ case LibFunc_strtok_r:
Changed |= setDoesNotThrow(F);
Changed |= setDoesNotCapture(F, 2);
Changed |= setOnlyReadsMemory(F, 2);
return Changed;
- case LibFunc::scanf:
+ case LibFunc_scanf:
Changed |= setDoesNotThrow(F);
Changed |= setDoesNotCapture(F, 1);
Changed |= setOnlyReadsMemory(F, 1);
return Changed;
- case LibFunc::setbuf:
- case LibFunc::setvbuf:
+ case LibFunc_setbuf:
+ case LibFunc_setvbuf:
Changed |= setDoesNotThrow(F);
Changed |= setDoesNotCapture(F, 1);
return Changed;
- case LibFunc::strdup:
- case LibFunc::strndup:
+ case LibFunc_strdup:
+ case LibFunc_strndup:
Changed |= setDoesNotThrow(F);
Changed |= setDoesNotAlias(F, 0);
Changed |= setDoesNotCapture(F, 1);
Changed |= setOnlyReadsMemory(F, 1);
return Changed;
- case LibFunc::stat:
- case LibFunc::statvfs:
+ case LibFunc_stat:
+ case LibFunc_statvfs:
Changed |= setDoesNotThrow(F);
Changed |= setDoesNotCapture(F, 1);
Changed |= setDoesNotCapture(F, 2);
Changed |= setOnlyReadsMemory(F, 1);
return Changed;
- case LibFunc::sscanf:
+ case LibFunc_sscanf:
Changed |= setDoesNotThrow(F);
Changed |= setDoesNotCapture(F, 1);
Changed |= setDoesNotCapture(F, 2);
Changed |= setOnlyReadsMemory(F, 1);
Changed |= setOnlyReadsMemory(F, 2);
return Changed;
- case LibFunc::sprintf:
+ case LibFunc_sprintf:
Changed |= setDoesNotThrow(F);
Changed |= setDoesNotCapture(F, 1);
Changed |= setDoesNotCapture(F, 2);
Changed |= setOnlyReadsMemory(F, 2);
return Changed;
- case LibFunc::snprintf:
+ case LibFunc_snprintf:
Changed |= setDoesNotThrow(F);
Changed |= setDoesNotCapture(F, 1);
Changed |= setDoesNotCapture(F, 3);
Changed |= setOnlyReadsMemory(F, 3);
return Changed;
- case LibFunc::setitimer:
+ case LibFunc_setitimer:
Changed |= setDoesNotThrow(F);
Changed |= setDoesNotCapture(F, 2);
Changed |= setDoesNotCapture(F, 3);
Changed |= setOnlyReadsMemory(F, 2);
return Changed;
- case LibFunc::system:
+ case LibFunc_system:
// May throw; "system" is a valid pthread cancellation point.
Changed |= setDoesNotCapture(F, 1);
Changed |= setOnlyReadsMemory(F, 1);
return Changed;
- case LibFunc::malloc:
+ case LibFunc_malloc:
Changed |= setDoesNotThrow(F);
Changed |= setDoesNotAlias(F, 0);
return Changed;
- case LibFunc::memcmp:
+ case LibFunc_memcmp:
Changed |= setOnlyReadsMemory(F);
Changed |= setDoesNotThrow(F);
Changed |= setDoesNotCapture(F, 1);
Changed |= setDoesNotCapture(F, 2);
return Changed;
- case LibFunc::memchr:
- case LibFunc::memrchr:
+ case LibFunc_memchr:
+ case LibFunc_memrchr:
Changed |= setOnlyReadsMemory(F);
Changed |= setDoesNotThrow(F);
return Changed;
- case LibFunc::modf:
- case LibFunc::modff:
- case LibFunc::modfl:
+ case LibFunc_modf:
+ case LibFunc_modff:
+ case LibFunc_modfl:
Changed |= setDoesNotThrow(F);
Changed |= setDoesNotCapture(F, 2);
return Changed;
- case LibFunc::memcpy:
- case LibFunc::mempcpy:
- case LibFunc::memccpy:
- case LibFunc::memmove:
+ case LibFunc_memcpy:
+ case LibFunc_mempcpy:
+ case LibFunc_memccpy:
+ case LibFunc_memmove:
Changed |= setDoesNotThrow(F);
Changed |= setDoesNotCapture(F, 2);
Changed |= setOnlyReadsMemory(F, 2);
return Changed;
- case LibFunc::memcpy_chk:
+ case LibFunc_memcpy_chk:
Changed |= setDoesNotThrow(F);
return Changed;
- case LibFunc::memalign:
+ case LibFunc_memalign:
Changed |= setDoesNotAlias(F, 0);
return Changed;
- case LibFunc::mkdir:
+ case LibFunc_mkdir:
Changed |= setDoesNotThrow(F);
Changed |= setDoesNotCapture(F, 1);
Changed |= setOnlyReadsMemory(F, 1);
return Changed;
- case LibFunc::mktime:
+ case LibFunc_mktime:
Changed |= setDoesNotThrow(F);
Changed |= setDoesNotCapture(F, 1);
return Changed;
- case LibFunc::realloc:
+ case LibFunc_realloc:
Changed |= setDoesNotThrow(F);
Changed |= setDoesNotAlias(F, 0);
Changed |= setDoesNotCapture(F, 1);
return Changed;
- case LibFunc::read:
+ case LibFunc_read:
// May throw; "read" is a valid pthread cancellation point.
Changed |= setDoesNotCapture(F, 2);
return Changed;
- case LibFunc::rewind:
+ case LibFunc_rewind:
Changed |= setDoesNotThrow(F);
Changed |= setDoesNotCapture(F, 1);
return Changed;
- case LibFunc::rmdir:
- case LibFunc::remove:
- case LibFunc::realpath:
+ case LibFunc_rmdir:
+ case LibFunc_remove:
+ case LibFunc_realpath:
Changed |= setDoesNotThrow(F);
Changed |= setDoesNotCapture(F, 1);
Changed |= setOnlyReadsMemory(F, 1);
return Changed;
- case LibFunc::rename:
+ case LibFunc_rename:
Changed |= setDoesNotThrow(F);
Changed |= setDoesNotCapture(F, 1);
Changed |= setDoesNotCapture(F, 2);
Changed |= setOnlyReadsMemory(F, 1);
Changed |= setOnlyReadsMemory(F, 2);
return Changed;
- case LibFunc::readlink:
+ case LibFunc_readlink:
Changed |= setDoesNotThrow(F);
Changed |= setDoesNotCapture(F, 1);
Changed |= setDoesNotCapture(F, 2);
Changed |= setOnlyReadsMemory(F, 1);
return Changed;
- case LibFunc::write:
+ case LibFunc_write:
// May throw; "write" is a valid pthread cancellation point.
Changed |= setDoesNotCapture(F, 2);
Changed |= setOnlyReadsMemory(F, 2);
return Changed;
- case LibFunc::bcopy:
+ case LibFunc_bcopy:
Changed |= setDoesNotThrow(F);
Changed |= setDoesNotCapture(F, 1);
Changed |= setDoesNotCapture(F, 2);
Changed |= setOnlyReadsMemory(F, 1);
return Changed;
- case LibFunc::bcmp:
+ case LibFunc_bcmp:
Changed |= setDoesNotThrow(F);
Changed |= setOnlyReadsMemory(F);
Changed |= setDoesNotCapture(F, 1);
Changed |= setDoesNotCapture(F, 2);
return Changed;
- case LibFunc::bzero:
+ case LibFunc_bzero:
Changed |= setDoesNotThrow(F);
Changed |= setDoesNotCapture(F, 1);
return Changed;
- case LibFunc::calloc:
+ case LibFunc_calloc:
Changed |= setDoesNotThrow(F);
Changed |= setDoesNotAlias(F, 0);
return Changed;
- case LibFunc::chmod:
- case LibFunc::chown:
+ case LibFunc_chmod:
+ case LibFunc_chown:
Changed |= setDoesNotThrow(F);
Changed |= setDoesNotCapture(F, 1);
Changed |= setOnlyReadsMemory(F, 1);
return Changed;
- case LibFunc::ctermid:
- case LibFunc::clearerr:
- case LibFunc::closedir:
+ case LibFunc_ctermid:
+ case LibFunc_clearerr:
+ case LibFunc_closedir:
Changed |= setDoesNotThrow(F);
Changed |= setDoesNotCapture(F, 1);
return Changed;
- case LibFunc::atoi:
- case LibFunc::atol:
- case LibFunc::atof:
- case LibFunc::atoll:
+ case LibFunc_atoi:
+ case LibFunc_atol:
+ case LibFunc_atof:
+ case LibFunc_atoll:
Changed |= setDoesNotThrow(F);
Changed |= setOnlyReadsMemory(F);
Changed |= setDoesNotCapture(F, 1);
return Changed;
- case LibFunc::access:
+ case LibFunc_access:
Changed |= setDoesNotThrow(F);
Changed |= setDoesNotCapture(F, 1);
Changed |= setOnlyReadsMemory(F, 1);
return Changed;
- case LibFunc::fopen:
+ case LibFunc_fopen:
Changed |= setDoesNotThrow(F);
Changed |= setDoesNotAlias(F, 0);
Changed |= setDoesNotCapture(F, 1);
@@ -363,150 +363,150 @@ bool llvm::inferLibFuncAttributes(Function &F, const TargetLibraryInfo &TLI) {
Changed |= setOnlyReadsMemory(F, 1);
Changed |= setOnlyReadsMemory(F, 2);
return Changed;
- case LibFunc::fdopen:
+ case LibFunc_fdopen:
Changed |= setDoesNotThrow(F);
Changed |= setDoesNotAlias(F, 0);
Changed |= setDoesNotCapture(F, 2);
Changed |= setOnlyReadsMemory(F, 2);
return Changed;
- case LibFunc::feof:
- case LibFunc::free:
- case LibFunc::fseek:
- case LibFunc::ftell:
- case LibFunc::fgetc:
- case LibFunc::fseeko:
- case LibFunc::ftello:
- case LibFunc::fileno:
- case LibFunc::fflush:
- case LibFunc::fclose:
- case LibFunc::fsetpos:
- case LibFunc::flockfile:
- case LibFunc::funlockfile:
- case LibFunc::ftrylockfile:
+ case LibFunc_feof:
+ case LibFunc_free:
+ case LibFunc_fseek:
+ case LibFunc_ftell:
+ case LibFunc_fgetc:
+ case LibFunc_fseeko:
+ case LibFunc_ftello:
+ case LibFunc_fileno:
+ case LibFunc_fflush:
+ case LibFunc_fclose:
+ case LibFunc_fsetpos:
+ case LibFunc_flockfile:
+ case LibFunc_funlockfile:
+ case LibFunc_ftrylockfile:
Changed |= setDoesNotThrow(F);
Changed |= setDoesNotCapture(F, 1);
return Changed;
- case LibFunc::ferror:
+ case LibFunc_ferror:
Changed |= setDoesNotThrow(F);
Changed |= setDoesNotCapture(F, 1);
Changed |= setOnlyReadsMemory(F);
return Changed;
- case LibFunc::fputc:
- case LibFunc::fstat:
- case LibFunc::frexp:
- case LibFunc::frexpf:
- case LibFunc::frexpl:
- case LibFunc::fstatvfs:
+ case LibFunc_fputc:
+ case LibFunc_fstat:
+ case LibFunc_frexp:
+ case LibFunc_frexpf:
+ case LibFunc_frexpl:
+ case LibFunc_fstatvfs:
Changed |= setDoesNotThrow(F);
Changed |= setDoesNotCapture(F, 2);
return Changed;
- case LibFunc::fgets:
+ case LibFunc_fgets:
Changed |= setDoesNotThrow(F);
Changed |= setDoesNotCapture(F, 3);
return Changed;
- case LibFunc::fread:
+ case LibFunc_fread:
Changed |= setDoesNotThrow(F);
Changed |= setDoesNotCapture(F, 1);
Changed |= setDoesNotCapture(F, 4);
return Changed;
- case LibFunc::fwrite:
+ case LibFunc_fwrite:
Changed |= setDoesNotThrow(F);
Changed |= setDoesNotCapture(F, 1);
Changed |= setDoesNotCapture(F, 4);
// FIXME: readonly #1?
return Changed;
- case LibFunc::fputs:
+ case LibFunc_fputs:
Changed |= setDoesNotThrow(F);
Changed |= setDoesNotCapture(F, 1);
Changed |= setDoesNotCapture(F, 2);
Changed |= setOnlyReadsMemory(F, 1);
return Changed;
- case LibFunc::fscanf:
- case LibFunc::fprintf:
+ case LibFunc_fscanf:
+ case LibFunc_fprintf:
Changed |= setDoesNotThrow(F);
Changed |= setDoesNotCapture(F, 1);
Changed |= setDoesNotCapture(F, 2);
Changed |= setOnlyReadsMemory(F, 2);
return Changed;
- case LibFunc::fgetpos:
+ case LibFunc_fgetpos:
Changed |= setDoesNotThrow(F);
Changed |= setDoesNotCapture(F, 1);
Changed |= setDoesNotCapture(F, 2);
return Changed;
- case LibFunc::getc:
- case LibFunc::getlogin_r:
- case LibFunc::getc_unlocked:
+ case LibFunc_getc:
+ case LibFunc_getlogin_r:
+ case LibFunc_getc_unlocked:
Changed |= setDoesNotThrow(F);
Changed |= setDoesNotCapture(F, 1);
return Changed;
- case LibFunc::getenv:
+ case LibFunc_getenv:
Changed |= setDoesNotThrow(F);
Changed |= setOnlyReadsMemory(F);
Changed |= setDoesNotCapture(F, 1);
return Changed;
- case LibFunc::gets:
- case LibFunc::getchar:
+ case LibFunc_gets:
+ case LibFunc_getchar:
Changed |= setDoesNotThrow(F);
return Changed;
- case LibFunc::getitimer:
+ case LibFunc_getitimer:
Changed |= setDoesNotThrow(F);
Changed |= setDoesNotCapture(F, 2);
return Changed;
- case LibFunc::getpwnam:
+ case LibFunc_getpwnam:
Changed |= setDoesNotThrow(F);
Changed |= setDoesNotCapture(F, 1);
Changed |= setOnlyReadsMemory(F, 1);
return Changed;
- case LibFunc::ungetc:
+ case LibFunc_ungetc:
Changed |= setDoesNotThrow(F);
Changed |= setDoesNotCapture(F, 2);
return Changed;
- case LibFunc::uname:
+ case LibFunc_uname:
Changed |= setDoesNotThrow(F);
Changed |= setDoesNotCapture(F, 1);
return Changed;
- case LibFunc::unlink:
+ case LibFunc_unlink:
Changed |= setDoesNotThrow(F);
Changed |= setDoesNotCapture(F, 1);
Changed |= setOnlyReadsMemory(F, 1);
return Changed;
- case LibFunc::unsetenv:
+ case LibFunc_unsetenv:
Changed |= setDoesNotThrow(F);
Changed |= setDoesNotCapture(F, 1);
Changed |= setOnlyReadsMemory(F, 1);
return Changed;
- case LibFunc::utime:
- case LibFunc::utimes:
+ case LibFunc_utime:
+ case LibFunc_utimes:
Changed |= setDoesNotThrow(F);
Changed |= setDoesNotCapture(F, 1);
Changed |= setDoesNotCapture(F, 2);
Changed |= setOnlyReadsMemory(F, 1);
Changed |= setOnlyReadsMemory(F, 2);
return Changed;
- case LibFunc::putc:
+ case LibFunc_putc:
Changed |= setDoesNotThrow(F);
Changed |= setDoesNotCapture(F, 2);
return Changed;
- case LibFunc::puts:
- case LibFunc::printf:
- case LibFunc::perror:
+ case LibFunc_puts:
+ case LibFunc_printf:
+ case LibFunc_perror:
Changed |= setDoesNotThrow(F);
Changed |= setDoesNotCapture(F, 1);
Changed |= setOnlyReadsMemory(F, 1);
return Changed;
- case LibFunc::pread:
+ case LibFunc_pread:
// May throw; "pread" is a valid pthread cancellation point.
Changed |= setDoesNotCapture(F, 2);
return Changed;
- case LibFunc::pwrite:
+ case LibFunc_pwrite:
// May throw; "pwrite" is a valid pthread cancellation point.
Changed |= setDoesNotCapture(F, 2);
Changed |= setOnlyReadsMemory(F, 2);
return Changed;
- case LibFunc::putchar:
+ case LibFunc_putchar:
Changed |= setDoesNotThrow(F);
return Changed;
- case LibFunc::popen:
+ case LibFunc_popen:
Changed |= setDoesNotThrow(F);
Changed |= setDoesNotAlias(F, 0);
Changed |= setDoesNotCapture(F, 1);
@@ -514,132 +514,132 @@ bool llvm::inferLibFuncAttributes(Function &F, const TargetLibraryInfo &TLI) {
Changed |= setOnlyReadsMemory(F, 1);
Changed |= setOnlyReadsMemory(F, 2);
return Changed;
- case LibFunc::pclose:
+ case LibFunc_pclose:
Changed |= setDoesNotThrow(F);
Changed |= setDoesNotCapture(F, 1);
return Changed;
- case LibFunc::vscanf:
+ case LibFunc_vscanf:
Changed |= setDoesNotThrow(F);
Changed |= setDoesNotCapture(F, 1);
Changed |= setOnlyReadsMemory(F, 1);
return Changed;
- case LibFunc::vsscanf:
+ case LibFunc_vsscanf:
Changed |= setDoesNotThrow(F);
Changed |= setDoesNotCapture(F, 1);
Changed |= setDoesNotCapture(F, 2);
Changed |= setOnlyReadsMemory(F, 1);
Changed |= setOnlyReadsMemory(F, 2);
return Changed;
- case LibFunc::vfscanf:
+ case LibFunc_vfscanf:
Changed |= setDoesNotThrow(F);
Changed |= setDoesNotCapture(F, 1);
Changed |= setDoesNotCapture(F, 2);
Changed |= setOnlyReadsMemory(F, 2);
return Changed;
- case LibFunc::valloc:
+ case LibFunc_valloc:
Changed |= setDoesNotThrow(F);
Changed |= setDoesNotAlias(F, 0);
return Changed;
- case LibFunc::vprintf:
+ case LibFunc_vprintf:
Changed |= setDoesNotThrow(F);
Changed |= setDoesNotCapture(F, 1);
Changed |= setOnlyReadsMemory(F, 1);
return Changed;
- case LibFunc::vfprintf:
- case LibFunc::vsprintf:
+ case LibFunc_vfprintf:
+ case LibFunc_vsprintf:
Changed |= setDoesNotThrow(F);
Changed |= setDoesNotCapture(F, 1);
Changed |= setDoesNotCapture(F, 2);
Changed |= setOnlyReadsMemory(F, 2);
return Changed;
- case LibFunc::vsnprintf:
+ case LibFunc_vsnprintf:
Changed |= setDoesNotThrow(F);
Changed |= setDoesNotCapture(F, 1);
Changed |= setDoesNotCapture(F, 3);
Changed |= setOnlyReadsMemory(F, 3);
return Changed;
- case LibFunc::open:
+ case LibFunc_open:
// May throw; "open" is a valid pthread cancellation point.
Changed |= setDoesNotCapture(F, 1);
Changed |= setOnlyReadsMemory(F, 1);
return Changed;
- case LibFunc::opendir:
+ case LibFunc_opendir:
Changed |= setDoesNotThrow(F);
Changed |= setDoesNotAlias(F, 0);
Changed |= setDoesNotCapture(F, 1);
Changed |= setOnlyReadsMemory(F, 1);
return Changed;
- case LibFunc::tmpfile:
+ case LibFunc_tmpfile:
Changed |= setDoesNotThrow(F);
Changed |= setDoesNotAlias(F, 0);
return Changed;
- case LibFunc::times:
+ case LibFunc_times:
Changed |= setDoesNotThrow(F);
Changed |= setDoesNotCapture(F, 1);
return Changed;
- case LibFunc::htonl:
- case LibFunc::htons:
- case LibFunc::ntohl:
- case LibFunc::ntohs:
+ case LibFunc_htonl:
+ case LibFunc_htons:
+ case LibFunc_ntohl:
+ case LibFunc_ntohs:
Changed |= setDoesNotThrow(F);
Changed |= setDoesNotAccessMemory(F);
return Changed;
- case LibFunc::lstat:
+ case LibFunc_lstat:
Changed |= setDoesNotThrow(F);
Changed |= setDoesNotCapture(F, 1);
Changed |= setDoesNotCapture(F, 2);
Changed |= setOnlyReadsMemory(F, 1);
return Changed;
- case LibFunc::lchown:
+ case LibFunc_lchown:
Changed |= setDoesNotThrow(F);
Changed |= setDoesNotCapture(F, 1);
Changed |= setOnlyReadsMemory(F, 1);
return Changed;
- case LibFunc::qsort:
+ case LibFunc_qsort:
// May throw; places call through function pointer.
Changed |= setDoesNotCapture(F, 4);
return Changed;
- case LibFunc::dunder_strdup:
- case LibFunc::dunder_strndup:
+ case LibFunc_dunder_strdup:
+ case LibFunc_dunder_strndup:
Changed |= setDoesNotThrow(F);
Changed |= setDoesNotAlias(F, 0);
Changed |= setDoesNotCapture(F, 1);
Changed |= setOnlyReadsMemory(F, 1);
return Changed;
- case LibFunc::dunder_strtok_r:
+ case LibFunc_dunder_strtok_r:
Changed |= setDoesNotThrow(F);
Changed |= setDoesNotCapture(F, 2);
Changed |= setOnlyReadsMemory(F, 2);
return Changed;
- case LibFunc::under_IO_getc:
+ case LibFunc_under_IO_getc:
Changed |= setDoesNotThrow(F);
Changed |= setDoesNotCapture(F, 1);
return Changed;
- case LibFunc::under_IO_putc:
+ case LibFunc_under_IO_putc:
Changed |= setDoesNotThrow(F);
Changed |= setDoesNotCapture(F, 2);
return Changed;
- case LibFunc::dunder_isoc99_scanf:
+ case LibFunc_dunder_isoc99_scanf:
Changed |= setDoesNotThrow(F);
Changed |= setDoesNotCapture(F, 1);
Changed |= setOnlyReadsMemory(F, 1);
return Changed;
- case LibFunc::stat64:
- case LibFunc::lstat64:
- case LibFunc::statvfs64:
+ case LibFunc_stat64:
+ case LibFunc_lstat64:
+ case LibFunc_statvfs64:
Changed |= setDoesNotThrow(F);
Changed |= setDoesNotCapture(F, 1);
Changed |= setDoesNotCapture(F, 2);
Changed |= setOnlyReadsMemory(F, 1);
return Changed;
- case LibFunc::dunder_isoc99_sscanf:
+ case LibFunc_dunder_isoc99_sscanf:
Changed |= setDoesNotThrow(F);
Changed |= setDoesNotCapture(F, 1);
Changed |= setDoesNotCapture(F, 2);
Changed |= setOnlyReadsMemory(F, 1);
Changed |= setOnlyReadsMemory(F, 2);
return Changed;
- case LibFunc::fopen64:
+ case LibFunc_fopen64:
Changed |= setDoesNotThrow(F);
Changed |= setDoesNotAlias(F, 0);
Changed |= setDoesNotCapture(F, 1);
@@ -647,26 +647,26 @@ bool llvm::inferLibFuncAttributes(Function &F, const TargetLibraryInfo &TLI) {
Changed |= setOnlyReadsMemory(F, 1);
Changed |= setOnlyReadsMemory(F, 2);
return Changed;
- case LibFunc::fseeko64:
- case LibFunc::ftello64:
+ case LibFunc_fseeko64:
+ case LibFunc_ftello64:
Changed |= setDoesNotThrow(F);
Changed |= setDoesNotCapture(F, 1);
return Changed;
- case LibFunc::tmpfile64:
+ case LibFunc_tmpfile64:
Changed |= setDoesNotThrow(F);
Changed |= setDoesNotAlias(F, 0);
return Changed;
- case LibFunc::fstat64:
- case LibFunc::fstatvfs64:
+ case LibFunc_fstat64:
+ case LibFunc_fstatvfs64:
Changed |= setDoesNotThrow(F);
Changed |= setDoesNotCapture(F, 2);
return Changed;
- case LibFunc::open64:
+ case LibFunc_open64:
// May throw; "open" is a valid pthread cancellation point.
Changed |= setDoesNotCapture(F, 1);
Changed |= setOnlyReadsMemory(F, 1);
return Changed;
- case LibFunc::gettimeofday:
+ case LibFunc_gettimeofday:
// Currently some platforms have the restrict keyword on the arguments to
// gettimeofday. To be conservative, do not add noalias to gettimeofday's
// arguments.
@@ -674,29 +674,29 @@ bool llvm::inferLibFuncAttributes(Function &F, const TargetLibraryInfo &TLI) {
Changed |= setDoesNotCapture(F, 1);
Changed |= setDoesNotCapture(F, 2);
return Changed;
- case LibFunc::Znwj: // new(unsigned int)
- case LibFunc::Znwm: // new(unsigned long)
- case LibFunc::Znaj: // new[](unsigned int)
- case LibFunc::Znam: // new[](unsigned long)
- case LibFunc::msvc_new_int: // new(unsigned int)
- case LibFunc::msvc_new_longlong: // new(unsigned long long)
- case LibFunc::msvc_new_array_int: // new[](unsigned int)
- case LibFunc::msvc_new_array_longlong: // new[](unsigned long long)
+ case LibFunc_Znwj: // new(unsigned int)
+ case LibFunc_Znwm: // new(unsigned long)
+ case LibFunc_Znaj: // new[](unsigned int)
+ case LibFunc_Znam: // new[](unsigned long)
+ case LibFunc_msvc_new_int: // new(unsigned int)
+ case LibFunc_msvc_new_longlong: // new(unsigned long long)
+ case LibFunc_msvc_new_array_int: // new[](unsigned int)
+ case LibFunc_msvc_new_array_longlong: // new[](unsigned long long)
// Operator new always returns a nonnull noalias pointer
- Changed |= setNonNull(F, AttributeSet::ReturnIndex);
- Changed |= setDoesNotAlias(F, AttributeSet::ReturnIndex);
+ Changed |= setNonNull(F, AttributeList::ReturnIndex);
+ Changed |= setDoesNotAlias(F, AttributeList::ReturnIndex);
return Changed;
//TODO: add LibFunc entries for:
- //case LibFunc::memset_pattern4:
- //case LibFunc::memset_pattern8:
- case LibFunc::memset_pattern16:
+ //case LibFunc_memset_pattern4:
+ //case LibFunc_memset_pattern8:
+ case LibFunc_memset_pattern16:
Changed |= setOnlyAccessesArgMemory(F);
Changed |= setDoesNotCapture(F, 1);
Changed |= setDoesNotCapture(F, 2);
Changed |= setOnlyReadsMemory(F, 2);
return Changed;
// int __nvvm_reflect(const char *)
- case LibFunc::nvvm_reflect:
+ case LibFunc_nvvm_reflect:
Changed |= setDoesNotAccessMemory(F);
Changed |= setDoesNotThrow(F);
return Changed;
@@ -717,13 +717,13 @@ Value *llvm::castToCStr(Value *V, IRBuilder<> &B) {
Value *llvm::emitStrLen(Value *Ptr, IRBuilder<> &B, const DataLayout &DL,
const TargetLibraryInfo *TLI) {
- if (!TLI->has(LibFunc::strlen))
+ if (!TLI->has(LibFunc_strlen))
return nullptr;
Module *M = B.GetInsertBlock()->getModule();
LLVMContext &Context = B.GetInsertBlock()->getContext();
Constant *StrLen = M->getOrInsertFunction("strlen", DL.getIntPtrType(Context),
- B.getInt8PtrTy(), nullptr);
+ B.getInt8PtrTy());
inferLibFuncAttributes(*M->getFunction("strlen"), *TLI);
CallInst *CI = B.CreateCall(StrLen, castToCStr(Ptr, B), "strlen");
if (const Function *F = dyn_cast<Function>(StrLen->stripPointerCasts()))
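The dropped nullptr above reflects a signature change: Module::getOrInsertFunction moved from C-style varargs (which required a trailing sentinel) to a typed variadic form. A sketch, assuming a Module &M and already-built types:

#include "llvm/IR/Module.h"
using namespace llvm;

static Constant *declareStrlen(Module &M, Type *IntPtrTy, Type *I8PtrTy) {
  // Old form: M.getOrInsertFunction("strlen", IntPtrTy, I8PtrTy, nullptr);
  // New form: the parameter types are simply the remaining arguments.
  return M.getOrInsertFunction("strlen", IntPtrTy, I8PtrTy);
}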
@@ -734,14 +734,14 @@ Value *llvm::emitStrLen(Value *Ptr, IRBuilder<> &B, const DataLayout &DL,
Value *llvm::emitStrChr(Value *Ptr, char C, IRBuilder<> &B,
const TargetLibraryInfo *TLI) {
- if (!TLI->has(LibFunc::strchr))
+ if (!TLI->has(LibFunc_strchr))
return nullptr;
Module *M = B.GetInsertBlock()->getModule();
Type *I8Ptr = B.getInt8PtrTy();
Type *I32Ty = B.getInt32Ty();
Constant *StrChr =
- M->getOrInsertFunction("strchr", I8Ptr, I8Ptr, I32Ty, nullptr);
+ M->getOrInsertFunction("strchr", I8Ptr, I8Ptr, I32Ty);
inferLibFuncAttributes(*M->getFunction("strchr"), *TLI);
CallInst *CI = B.CreateCall(
StrChr, {castToCStr(Ptr, B), ConstantInt::get(I32Ty, C)}, "strchr");
@@ -752,14 +752,14 @@ Value *llvm::emitStrChr(Value *Ptr, char C, IRBuilder<> &B,
Value *llvm::emitStrNCmp(Value *Ptr1, Value *Ptr2, Value *Len, IRBuilder<> &B,
const DataLayout &DL, const TargetLibraryInfo *TLI) {
- if (!TLI->has(LibFunc::strncmp))
+ if (!TLI->has(LibFunc_strncmp))
return nullptr;
Module *M = B.GetInsertBlock()->getModule();
LLVMContext &Context = B.GetInsertBlock()->getContext();
Value *StrNCmp = M->getOrInsertFunction("strncmp", B.getInt32Ty(),
B.getInt8PtrTy(), B.getInt8PtrTy(),
- DL.getIntPtrType(Context), nullptr);
+ DL.getIntPtrType(Context));
inferLibFuncAttributes(*M->getFunction("strncmp"), *TLI);
CallInst *CI = B.CreateCall(
StrNCmp, {castToCStr(Ptr1, B), castToCStr(Ptr2, B), Len}, "strncmp");
@@ -772,12 +772,12 @@ Value *llvm::emitStrNCmp(Value *Ptr1, Value *Ptr2, Value *Len, IRBuilder<> &B,
Value *llvm::emitStrCpy(Value *Dst, Value *Src, IRBuilder<> &B,
const TargetLibraryInfo *TLI, StringRef Name) {
- if (!TLI->has(LibFunc::strcpy))
+ if (!TLI->has(LibFunc_strcpy))
return nullptr;
Module *M = B.GetInsertBlock()->getModule();
Type *I8Ptr = B.getInt8PtrTy();
- Value *StrCpy = M->getOrInsertFunction(Name, I8Ptr, I8Ptr, I8Ptr, nullptr);
+ Value *StrCpy = M->getOrInsertFunction(Name, I8Ptr, I8Ptr, I8Ptr);
inferLibFuncAttributes(*M->getFunction(Name), *TLI);
CallInst *CI =
B.CreateCall(StrCpy, {castToCStr(Dst, B), castToCStr(Src, B)}, Name);
@@ -788,13 +788,13 @@ Value *llvm::emitStrCpy(Value *Dst, Value *Src, IRBuilder<> &B,
Value *llvm::emitStrNCpy(Value *Dst, Value *Src, Value *Len, IRBuilder<> &B,
const TargetLibraryInfo *TLI, StringRef Name) {
- if (!TLI->has(LibFunc::strncpy))
+ if (!TLI->has(LibFunc_strncpy))
return nullptr;
Module *M = B.GetInsertBlock()->getModule();
Type *I8Ptr = B.getInt8PtrTy();
Value *StrNCpy = M->getOrInsertFunction(Name, I8Ptr, I8Ptr, I8Ptr,
- Len->getType(), nullptr);
+ Len->getType());
inferLibFuncAttributes(*M->getFunction(Name), *TLI);
CallInst *CI = B.CreateCall(
StrNCpy, {castToCStr(Dst, B), castToCStr(Src, B), Len}, "strncpy");
@@ -806,18 +806,18 @@ Value *llvm::emitStrNCpy(Value *Dst, Value *Src, Value *Len, IRBuilder<> &B,
Value *llvm::emitMemCpyChk(Value *Dst, Value *Src, Value *Len, Value *ObjSize,
IRBuilder<> &B, const DataLayout &DL,
const TargetLibraryInfo *TLI) {
- if (!TLI->has(LibFunc::memcpy_chk))
+ if (!TLI->has(LibFunc_memcpy_chk))
return nullptr;
Module *M = B.GetInsertBlock()->getModule();
- AttributeSet AS;
- AS = AttributeSet::get(M->getContext(), AttributeSet::FunctionIndex,
- Attribute::NoUnwind);
+ AttributeList AS;
+ AS = AttributeList::get(M->getContext(), AttributeList::FunctionIndex,
+ Attribute::NoUnwind);
LLVMContext &Context = B.GetInsertBlock()->getContext();
Value *MemCpy = M->getOrInsertFunction(
- "__memcpy_chk", AttributeSet::get(M->getContext(), AS), B.getInt8PtrTy(),
+ "__memcpy_chk", AttributeList::get(M->getContext(), AS), B.getInt8PtrTy(),
B.getInt8PtrTy(), B.getInt8PtrTy(), DL.getIntPtrType(Context),
- DL.getIntPtrType(Context), nullptr);
+ DL.getIntPtrType(Context));
Dst = castToCStr(Dst, B);
Src = castToCStr(Src, B);
CallInst *CI = B.CreateCall(MemCpy, {Dst, Src, Len, ObjSize});
@@ -828,14 +828,14 @@ Value *llvm::emitMemCpyChk(Value *Dst, Value *Src, Value *Len, Value *ObjSize,
Value *llvm::emitMemChr(Value *Ptr, Value *Val, Value *Len, IRBuilder<> &B,
const DataLayout &DL, const TargetLibraryInfo *TLI) {
- if (!TLI->has(LibFunc::memchr))
+ if (!TLI->has(LibFunc_memchr))
return nullptr;
Module *M = B.GetInsertBlock()->getModule();
LLVMContext &Context = B.GetInsertBlock()->getContext();
Value *MemChr = M->getOrInsertFunction("memchr", B.getInt8PtrTy(),
B.getInt8PtrTy(), B.getInt32Ty(),
- DL.getIntPtrType(Context), nullptr);
+ DL.getIntPtrType(Context));
inferLibFuncAttributes(*M->getFunction("memchr"), *TLI);
CallInst *CI = B.CreateCall(MemChr, {castToCStr(Ptr, B), Val, Len}, "memchr");
@@ -847,14 +847,14 @@ Value *llvm::emitMemChr(Value *Ptr, Value *Val, Value *Len, IRBuilder<> &B,
Value *llvm::emitMemCmp(Value *Ptr1, Value *Ptr2, Value *Len, IRBuilder<> &B,
const DataLayout &DL, const TargetLibraryInfo *TLI) {
- if (!TLI->has(LibFunc::memcmp))
+ if (!TLI->has(LibFunc_memcmp))
return nullptr;
Module *M = B.GetInsertBlock()->getModule();
LLVMContext &Context = B.GetInsertBlock()->getContext();
Value *MemCmp = M->getOrInsertFunction("memcmp", B.getInt32Ty(),
B.getInt8PtrTy(), B.getInt8PtrTy(),
- DL.getIntPtrType(Context), nullptr);
+ DL.getIntPtrType(Context));
inferLibFuncAttributes(*M->getFunction("memcmp"), *TLI);
CallInst *CI = B.CreateCall(
MemCmp, {castToCStr(Ptr1, B), castToCStr(Ptr2, B), Len}, "memcmp");
@@ -881,13 +881,13 @@ static void appendTypeSuffix(Value *Op, StringRef &Name,
}
Value *llvm::emitUnaryFloatFnCall(Value *Op, StringRef Name, IRBuilder<> &B,
- const AttributeSet &Attrs) {
+ const AttributeList &Attrs) {
SmallString<20> NameBuffer;
appendTypeSuffix(Op, Name, NameBuffer);
Module *M = B.GetInsertBlock()->getModule();
Value *Callee = M->getOrInsertFunction(Name, Op->getType(),
- Op->getType(), nullptr);
+ Op->getType());
CallInst *CI = B.CreateCall(Callee, Op, Name);
CI->setAttributes(Attrs);
if (const Function *F = dyn_cast<Function>(Callee->stripPointerCasts()))
@@ -897,13 +897,13 @@ Value *llvm::emitUnaryFloatFnCall(Value *Op, StringRef Name, IRBuilder<> &B,
}
Value *llvm::emitBinaryFloatFnCall(Value *Op1, Value *Op2, StringRef Name,
- IRBuilder<> &B, const AttributeSet &Attrs) {
+ IRBuilder<> &B, const AttributeList &Attrs) {
SmallString<20> NameBuffer;
appendTypeSuffix(Op1, Name, NameBuffer);
Module *M = B.GetInsertBlock()->getModule();
Value *Callee = M->getOrInsertFunction(Name, Op1->getType(), Op1->getType(),
- Op2->getType(), nullptr);
+ Op2->getType());
CallInst *CI = B.CreateCall(Callee, {Op1, Op2}, Name);
CI->setAttributes(Attrs);
if (const Function *F = dyn_cast<Function>(Callee->stripPointerCasts()))
@@ -914,12 +914,12 @@ Value *llvm::emitBinaryFloatFnCall(Value *Op1, Value *Op2, StringRef Name,
Value *llvm::emitPutChar(Value *Char, IRBuilder<> &B,
const TargetLibraryInfo *TLI) {
- if (!TLI->has(LibFunc::putchar))
+ if (!TLI->has(LibFunc_putchar))
return nullptr;
Module *M = B.GetInsertBlock()->getModule();
- Value *PutChar = M->getOrInsertFunction("putchar", B.getInt32Ty(),
- B.getInt32Ty(), nullptr);
+ Value *PutChar = M->getOrInsertFunction("putchar", B.getInt32Ty(),
+                                         B.getInt32Ty());
+ inferLibFuncAttributes(*M->getFunction("putchar"), *TLI);
CallInst *CI = B.CreateCall(PutChar,
B.CreateIntCast(Char,
B.getInt32Ty(),
@@ -934,12 +934,12 @@ Value *llvm::emitPutChar(Value *Char, IRBuilder<> &B,
Value *llvm::emitPutS(Value *Str, IRBuilder<> &B,
const TargetLibraryInfo *TLI) {
- if (!TLI->has(LibFunc::puts))
+ if (!TLI->has(LibFunc_puts))
return nullptr;
Module *M = B.GetInsertBlock()->getModule();
Value *PutS =
- M->getOrInsertFunction("puts", B.getInt32Ty(), B.getInt8PtrTy(), nullptr);
+ M->getOrInsertFunction("puts", B.getInt32Ty(), B.getInt8PtrTy());
inferLibFuncAttributes(*M->getFunction("puts"), *TLI);
CallInst *CI = B.CreateCall(PutS, castToCStr(Str, B), "puts");
if (const Function *F = dyn_cast<Function>(PutS->stripPointerCasts()))
@@ -949,12 +949,12 @@ Value *llvm::emitPutS(Value *Str, IRBuilder<> &B,
Value *llvm::emitFPutC(Value *Char, Value *File, IRBuilder<> &B,
const TargetLibraryInfo *TLI) {
- if (!TLI->has(LibFunc::fputc))
+ if (!TLI->has(LibFunc_fputc))
return nullptr;
Module *M = B.GetInsertBlock()->getModule();
Constant *F = M->getOrInsertFunction("fputc", B.getInt32Ty(), B.getInt32Ty(),
- File->getType(), nullptr);
+ File->getType());
if (File->getType()->isPointerTy())
inferLibFuncAttributes(*M->getFunction("fputc"), *TLI);
Char = B.CreateIntCast(Char, B.getInt32Ty(), /*isSigned*/true,
@@ -968,13 +968,13 @@ Value *llvm::emitFPutC(Value *Char, Value *File, IRBuilder<> &B,
Value *llvm::emitFPutS(Value *Str, Value *File, IRBuilder<> &B,
const TargetLibraryInfo *TLI) {
- if (!TLI->has(LibFunc::fputs))
+ if (!TLI->has(LibFunc_fputs))
return nullptr;
Module *M = B.GetInsertBlock()->getModule();
- StringRef FPutsName = TLI->getName(LibFunc::fputs);
+ StringRef FPutsName = TLI->getName(LibFunc_fputs);
Constant *F = M->getOrInsertFunction(
- FPutsName, B.getInt32Ty(), B.getInt8PtrTy(), File->getType(), nullptr);
+ FPutsName, B.getInt32Ty(), B.getInt8PtrTy(), File->getType());
if (File->getType()->isPointerTy())
inferLibFuncAttributes(*M->getFunction(FPutsName), *TLI);
CallInst *CI = B.CreateCall(F, {castToCStr(Str, B), File}, "fputs");
@@ -986,16 +986,16 @@ Value *llvm::emitFPutS(Value *Str, Value *File, IRBuilder<> &B,
Value *llvm::emitFWrite(Value *Ptr, Value *Size, Value *File, IRBuilder<> &B,
const DataLayout &DL, const TargetLibraryInfo *TLI) {
- if (!TLI->has(LibFunc::fwrite))
+ if (!TLI->has(LibFunc_fwrite))
return nullptr;
Module *M = B.GetInsertBlock()->getModule();
LLVMContext &Context = B.GetInsertBlock()->getContext();
- StringRef FWriteName = TLI->getName(LibFunc::fwrite);
+ StringRef FWriteName = TLI->getName(LibFunc_fwrite);
Constant *F = M->getOrInsertFunction(
FWriteName, DL.getIntPtrType(Context), B.getInt8PtrTy(),
- DL.getIntPtrType(Context), DL.getIntPtrType(Context), File->getType(),
- nullptr);
+ DL.getIntPtrType(Context), DL.getIntPtrType(Context), File->getType());
+
if (File->getType()->isPointerTy())
inferLibFuncAttributes(*M->getFunction(FWriteName), *TLI);
CallInst *CI =
diff --git a/lib/Transforms/Utils/BypassSlowDivision.cpp b/lib/Transforms/Utils/BypassSlowDivision.cpp
index bc2cef26edcb..1cfe3bd53648 100644
--- a/lib/Transforms/Utils/BypassSlowDivision.cpp
+++ b/lib/Transforms/Utils/BypassSlowDivision.cpp
@@ -17,6 +17,8 @@
#include "llvm/Transforms/Utils/BypassSlowDivision.h"
#include "llvm/ADT/DenseMap.h"
+#include "llvm/ADT/SmallPtrSet.h"
+#include "llvm/Analysis/ValueTracking.h"
#include "llvm/IR/Function.h"
#include "llvm/IR/IRBuilder.h"
#include "llvm/IR/Instructions.h"
@@ -36,12 +38,21 @@ namespace {
: SignedOp(InSignedOp), Dividend(InDividend), Divisor(InDivisor) {}
};
- struct DivPhiNodes {
- PHINode *Quotient;
- PHINode *Remainder;
+ struct QuotRemPair {
+ Value *Quotient;
+ Value *Remainder;
- DivPhiNodes(PHINode *InQuotient, PHINode *InRemainder)
- : Quotient(InQuotient), Remainder(InRemainder) {}
+ QuotRemPair(Value *InQuotient, Value *InRemainder)
+ : Quotient(InQuotient), Remainder(InRemainder) {}
+ };
+
+ /// A quotient and remainder, plus a BB from which they logically "originate".
+ /// If you use Quotient or Remainder in a Phi node, you should use BB as its
+ /// corresponding predecessor.
+ struct QuotRemWithBB {
+ BasicBlock *BB = nullptr;
+ Value *Quotient = nullptr;
+ Value *Remainder = nullptr;
};
}
@@ -69,159 +80,376 @@ namespace llvm {
}
};
- typedef DenseMap<DivOpInfo, DivPhiNodes> DivCacheTy;
+ typedef DenseMap<DivOpInfo, QuotRemPair> DivCacheTy;
+ typedef DenseMap<unsigned, unsigned> BypassWidthsTy;
+ typedef SmallPtrSet<Instruction *, 4> VisitedSetTy;
}
-// insertFastDiv - Substitutes the div/rem instruction with code that checks the
-// value of the operands and uses a shorter-faster div/rem instruction when
-// possible and the longer-slower div/rem instruction otherwise.
-static bool insertFastDiv(Instruction *I, IntegerType *BypassType,
- bool UseDivOp, bool UseSignedOp,
- DivCacheTy &PerBBDivCache) {
- Function *F = I->getParent()->getParent();
- // Get instruction operands
- Value *Dividend = I->getOperand(0);
- Value *Divisor = I->getOperand(1);
+namespace {
+enum ValueRange {
+ /// Operand definitely fits into BypassType. No runtime checks are needed.
+ VALRNG_KNOWN_SHORT,
+ /// A runtime check is required, as value range is unknown.
+ VALRNG_UNKNOWN,
+ /// Operand is unlikely to fit into BypassType. The bypassing should be
+ /// disabled.
+ VALRNG_LIKELY_LONG
+};
+
+class FastDivInsertionTask {
+ bool IsValidTask = false;
+ Instruction *SlowDivOrRem = nullptr;
+ IntegerType *BypassType = nullptr;
+ BasicBlock *MainBB = nullptr;
+
+ bool isHashLikeValue(Value *V, VisitedSetTy &Visited);
+ ValueRange getValueRange(Value *Op, VisitedSetTy &Visited);
+ QuotRemWithBB createSlowBB(BasicBlock *Successor);
+ QuotRemWithBB createFastBB(BasicBlock *Successor);
+ QuotRemPair createDivRemPhiNodes(QuotRemWithBB &LHS, QuotRemWithBB &RHS,
+ BasicBlock *PhiBB);
+ Value *insertOperandRuntimeCheck(Value *Op1, Value *Op2);
+ Optional<QuotRemPair> insertFastDivAndRem();
+
+ bool isSignedOp() {
+ return SlowDivOrRem->getOpcode() == Instruction::SDiv ||
+ SlowDivOrRem->getOpcode() == Instruction::SRem;
+ }
+ bool isDivisionOp() {
+ return SlowDivOrRem->getOpcode() == Instruction::SDiv ||
+ SlowDivOrRem->getOpcode() == Instruction::UDiv;
+ }
+ Type *getSlowType() { return SlowDivOrRem->getType(); }
+
+public:
+ FastDivInsertionTask(Instruction *I, const BypassWidthsTy &BypassWidths);
+ Value *getReplacement(DivCacheTy &Cache);
+};
+} // anonymous namespace
+
+FastDivInsertionTask::FastDivInsertionTask(Instruction *I,
+ const BypassWidthsTy &BypassWidths) {
+ switch (I->getOpcode()) {
+ case Instruction::UDiv:
+ case Instruction::SDiv:
+ case Instruction::URem:
+ case Instruction::SRem:
+ SlowDivOrRem = I;
+ break;
+ default:
+ // I is not a div/rem operation.
+ return;
+ }
- if (isa<ConstantInt>(Divisor)) {
- // Division by a constant should have been been solved and replaced earlier
- // in the pipeline.
- return false;
+ // Skip division on vector types. Only optimize integer instructions.
+ IntegerType *SlowType = dyn_cast<IntegerType>(SlowDivOrRem->getType());
+ if (!SlowType)
+ return;
+
+ // Skip if this bitwidth is not bypassed.
+ auto BI = BypassWidths.find(SlowType->getBitWidth());
+ if (BI == BypassWidths.end())
+ return;
+
+ // Get type for div/rem instruction with bypass bitwidth.
+ IntegerType *BT = IntegerType::get(I->getContext(), BI->second);
+ BypassType = BT;
+
+ // The original basic block.
+ MainBB = I->getParent();
+
+ // The instruction is indeed a slow div or rem operation.
+ IsValidTask = true;
+}
+
+/// Reuses a previously-computed quotient or remainder from the current BB if
+/// the operands and operation are identical. Otherwise calls
+/// insertFastDivAndRem to perform the optimization and caches the resulting
+/// quotient and remainder. If no replacement can be generated, nullptr is
+/// returned.
+Value *FastDivInsertionTask::getReplacement(DivCacheTy &Cache) {
+ // First, make sure that the task is valid.
+ if (!IsValidTask)
+ return nullptr;
+
+ // Then, look for a value in Cache.
+ Value *Dividend = SlowDivOrRem->getOperand(0);
+ Value *Divisor = SlowDivOrRem->getOperand(1);
+ DivOpInfo Key(isSignedOp(), Dividend, Divisor);
+ auto CacheI = Cache.find(Key);
+
+ if (CacheI == Cache.end()) {
+ // If previous instance does not exist, try to insert fast div.
+ Optional<QuotRemPair> OptResult = insertFastDivAndRem();
+ // Bail out if insertFastDivAndRem has failed.
+ if (!OptResult)
+ return nullptr;
+ CacheI = Cache.insert({Key, *OptResult}).first;
}
- // If the numerator is a constant, bail if it doesn't fit into BypassType.
- if (ConstantInt *ConstDividend = dyn_cast<ConstantInt>(Dividend))
- if (ConstDividend->getValue().getActiveBits() > BypassType->getBitWidth())
+ QuotRemPair &Value = CacheI->second;
+ return isDivisionOp() ? Value.Quotient : Value.Remainder;
+}
+
+/// \brief Check if a value looks like a hash.
+///
+/// The routine is expected to detect values computed using the most common hash
+/// algorithms. Typically, hash computations end with one of the following
+/// instructions:
+///
+/// 1) MUL with a constant wider than BypassType
+/// 2) XOR instruction
+///
+/// And even if we are wrong and the value is not a hash, it is still quite
+/// unlikely that such values will fit into BypassType.
+///
+/// To detect string hash algorithms like FNV we have to look through PHI-nodes.
+/// It is implemented as a depth-first search for values that look neither long
+/// nor hash-like.
+bool FastDivInsertionTask::isHashLikeValue(Value *V, VisitedSetTy &Visited) {
+ Instruction *I = dyn_cast<Instruction>(V);
+ if (!I)
+ return false;
+
+ switch (I->getOpcode()) {
+ case Instruction::Xor:
+ return true;
+ case Instruction::Mul: {
+ // After Constant Hoisting pass, long constants may be represented as
+ // bitcast instructions. As a result, some constants may look like an
+ // instruction at first, and an additional check is necessary to find out if
+ // an operand is actually a constant.
+ Value *Op1 = I->getOperand(1);
+ ConstantInt *C = dyn_cast<ConstantInt>(Op1);
+ if (!C && isa<BitCastInst>(Op1))
+ C = dyn_cast<ConstantInt>(cast<BitCastInst>(Op1)->getOperand(0));
+ return C && C->getValue().getMinSignedBits() > BypassType->getBitWidth();
+ }
+ case Instruction::PHI: {
+ // Stop the IR traversal on pathological input; this limits the recursion
+ // depth.
+ if (Visited.size() >= 16)
return false;
+ // Do not visit nodes that have already been visited. Return true here,
+ // since it means that no value reachable through this node has been shown
+ // to be non-hash-like.
+ if (Visited.find(I) != Visited.end())
+ return true;
+ Visited.insert(I);
+ return llvm::all_of(cast<PHINode>(I)->incoming_values(), [&](Value *V) {
+ // Ignore undef values as they probably don't affect the division
+ // operands.
+ return getValueRange(V, Visited) == VALRNG_LIKELY_LONG ||
+ isa<UndefValue>(V);
+ });
+ }
+ default:
+ return false;
+ }
+}
+
+/// Check if an integer value fits into our bypass type.
+ValueRange FastDivInsertionTask::getValueRange(Value *V,
+ VisitedSetTy &Visited) {
+ unsigned ShortLen = BypassType->getBitWidth();
+ unsigned LongLen = V->getType()->getIntegerBitWidth();
+
+ assert(LongLen > ShortLen && "Value type must be wider than BypassType");
+ unsigned HiBits = LongLen - ShortLen;
+
+ const DataLayout &DL = SlowDivOrRem->getModule()->getDataLayout();
+ APInt Zeros(LongLen, 0), Ones(LongLen, 0);
- // Basic Block is split before divide
- BasicBlock *MainBB = &*I->getParent();
- BasicBlock *SuccessorBB = MainBB->splitBasicBlock(I);
-
- // Add new basic block for slow divide operation
- BasicBlock *SlowBB =
- BasicBlock::Create(F->getContext(), "", MainBB->getParent(), SuccessorBB);
- SlowBB->moveBefore(SuccessorBB);
- IRBuilder<> SlowBuilder(SlowBB, SlowBB->begin());
- Value *SlowQuotientV;
- Value *SlowRemainderV;
- if (UseSignedOp) {
- SlowQuotientV = SlowBuilder.CreateSDiv(Dividend, Divisor);
- SlowRemainderV = SlowBuilder.CreateSRem(Dividend, Divisor);
+ computeKnownBits(V, Zeros, Ones, DL);
+
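+ // If all of the high bits are known to be zero, the value certainly fits
+ // into BypassType.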
+ if (Zeros.countLeadingOnes() >= HiBits)
+ return VALRNG_KNOWN_SHORT;
+
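+ // If any of the high bits is known to be one, the value certainly does not
+ // fit into BypassType.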
+ if (Ones.countLeadingZeros() < HiBits)
+ return VALRNG_LIKELY_LONG;
+
+ // Long integer divisions are often used in hashtable implementations. It's
+ // not worth bypassing such divisions because hash values are extremely
+ // unlikely to have enough leading zeros. The call below tries to detect
+ // values that are unlikely to fit into BypassType (including hashes).
+ if (isHashLikeValue(V, Visited))
+ return VALRNG_LIKELY_LONG;
+
+ return VALRNG_UNKNOWN;
+}
+
+/// Add new basic block for slow div and rem operations and put it before
+/// SuccessorBB.
+QuotRemWithBB FastDivInsertionTask::createSlowBB(BasicBlock *SuccessorBB) {
+ QuotRemWithBB DivRemPair;
+ DivRemPair.BB = BasicBlock::Create(MainBB->getParent()->getContext(), "",
+ MainBB->getParent(), SuccessorBB);
+ IRBuilder<> Builder(DivRemPair.BB, DivRemPair.BB->begin());
+
+ Value *Dividend = SlowDivOrRem->getOperand(0);
+ Value *Divisor = SlowDivOrRem->getOperand(1);
+
+ if (isSignedOp()) {
+ DivRemPair.Quotient = Builder.CreateSDiv(Dividend, Divisor);
+ DivRemPair.Remainder = Builder.CreateSRem(Dividend, Divisor);
} else {
- SlowQuotientV = SlowBuilder.CreateUDiv(Dividend, Divisor);
- SlowRemainderV = SlowBuilder.CreateURem(Dividend, Divisor);
+ DivRemPair.Quotient = Builder.CreateUDiv(Dividend, Divisor);
+ DivRemPair.Remainder = Builder.CreateURem(Dividend, Divisor);
}
- SlowBuilder.CreateBr(SuccessorBB);
-
- // Add new basic block for fast divide operation
- BasicBlock *FastBB =
- BasicBlock::Create(F->getContext(), "", MainBB->getParent(), SuccessorBB);
- FastBB->moveBefore(SlowBB);
- IRBuilder<> FastBuilder(FastBB, FastBB->begin());
- Value *ShortDivisorV = FastBuilder.CreateCast(Instruction::Trunc, Divisor,
- BypassType);
- Value *ShortDividendV = FastBuilder.CreateCast(Instruction::Trunc, Dividend,
- BypassType);
-
- // udiv/urem because optimization only handles positive numbers
- Value *ShortQuotientV = FastBuilder.CreateUDiv(ShortDividendV, ShortDivisorV);
- Value *ShortRemainderV = FastBuilder.CreateURem(ShortDividendV,
- ShortDivisorV);
- Value *FastQuotientV = FastBuilder.CreateCast(Instruction::ZExt,
- ShortQuotientV,
- Dividend->getType());
- Value *FastRemainderV = FastBuilder.CreateCast(Instruction::ZExt,
- ShortRemainderV,
- Dividend->getType());
- FastBuilder.CreateBr(SuccessorBB);
-
- // Phi nodes for result of div and rem
- IRBuilder<> SuccessorBuilder(SuccessorBB, SuccessorBB->begin());
- PHINode *QuoPhi = SuccessorBuilder.CreatePHI(I->getType(), 2);
- QuoPhi->addIncoming(SlowQuotientV, SlowBB);
- QuoPhi->addIncoming(FastQuotientV, FastBB);
- PHINode *RemPhi = SuccessorBuilder.CreatePHI(I->getType(), 2);
- RemPhi->addIncoming(SlowRemainderV, SlowBB);
- RemPhi->addIncoming(FastRemainderV, FastBB);
-
- // Replace I with appropriate phi node
- if (UseDivOp)
- I->replaceAllUsesWith(QuoPhi);
- else
- I->replaceAllUsesWith(RemPhi);
- I->eraseFromParent();
- // Combine operands into a single value with OR for value testing below
- MainBB->getInstList().back().eraseFromParent();
- IRBuilder<> MainBuilder(MainBB, MainBB->end());
+ Builder.CreateBr(SuccessorBB);
+ return DivRemPair;
+}
+
+/// Add new basic block for fast div and rem operations and put it before
+/// SuccessorBB.
+QuotRemWithBB FastDivInsertionTask::createFastBB(BasicBlock *SuccessorBB) {
+ QuotRemWithBB DivRemPair;
+ DivRemPair.BB = BasicBlock::Create(MainBB->getParent()->getContext(), "",
+ MainBB->getParent(), SuccessorBB);
+ IRBuilder<> Builder(DivRemPair.BB, DivRemPair.BB->begin());
+
+ Value *Dividend = SlowDivOrRem->getOperand(0);
+ Value *Divisor = SlowDivOrRem->getOperand(1);
+ Value *ShortDivisorV =
+ Builder.CreateCast(Instruction::Trunc, Divisor, BypassType);
+ Value *ShortDividendV =
+ Builder.CreateCast(Instruction::Trunc, Dividend, BypassType);
+
+ // udiv/urem because this optimization only handles positive numbers.
+ Value *ShortQV = Builder.CreateUDiv(ShortDividendV, ShortDivisorV);
+ Value *ShortRV = Builder.CreateURem(ShortDividendV, ShortDivisorV);
+ DivRemPair.Quotient =
+ Builder.CreateCast(Instruction::ZExt, ShortQV, getSlowType());
+ DivRemPair.Remainder =
+ Builder.CreateCast(Instruction::ZExt, ShortRV, getSlowType());
+ Builder.CreateBr(SuccessorBB);
+
+ return DivRemPair;
+}
- // We should have bailed out above if the divisor is a constant, but the
- // dividend may still be a constant. Set OrV to our non-constant operands
- // OR'ed together.
- assert(!isa<ConstantInt>(Divisor));
+/// Creates Phi nodes for result of Div and Rem.
+QuotRemPair FastDivInsertionTask::createDivRemPhiNodes(QuotRemWithBB &LHS,
+ QuotRemWithBB &RHS,
+ BasicBlock *PhiBB) {
+ IRBuilder<> Builder(PhiBB, PhiBB->begin());
+ PHINode *QuoPhi = Builder.CreatePHI(getSlowType(), 2);
+ QuoPhi->addIncoming(LHS.Quotient, LHS.BB);
+ QuoPhi->addIncoming(RHS.Quotient, RHS.BB);
+ PHINode *RemPhi = Builder.CreatePHI(getSlowType(), 2);
+ RemPhi->addIncoming(LHS.Remainder, LHS.BB);
+ RemPhi->addIncoming(RHS.Remainder, RHS.BB);
+ return QuotRemPair(QuoPhi, RemPhi);
+}
+
+/// Creates a runtime check to test whether both the divisor and dividend fit
+/// into BypassType. The check is inserted at the end of MainBB. A true result
+/// means that the operands fit. Either operand may be null if it does not
+/// need a runtime check.
+Value *FastDivInsertionTask::insertOperandRuntimeCheck(Value *Op1, Value *Op2) {
+ assert((Op1 || Op2) && "Nothing to check");
+ IRBuilder<> Builder(MainBB, MainBB->end());
Value *OrV;
- if (!isa<ConstantInt>(Dividend))
- OrV = MainBuilder.CreateOr(Dividend, Divisor);
+ if (Op1 && Op2)
+ OrV = Builder.CreateOr(Op1, Op2);
else
- OrV = Divisor;
+ OrV = Op1 ? Op1 : Op2;
// BitMask is inverted to check if the operands are
// larger than the bypass type
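+ // For example, bypassing a 64-bit division with a 32-bit one gives
+ // BitMask == 0xFFFFFFFF00000000, so the check below succeeds only when
+ // both operands have their upper 32 bits clear.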
uint64_t BitMask = ~BypassType->getBitMask();
- Value *AndV = MainBuilder.CreateAnd(OrV, BitMask);
-
- // Compare operand values and branch
- Value *ZeroV = ConstantInt::getSigned(Dividend->getType(), 0);
- Value *CmpV = MainBuilder.CreateICmpEQ(AndV, ZeroV);
- MainBuilder.CreateCondBr(CmpV, FastBB, SlowBB);
-
- // Cache phi nodes to be used later in place of other instances
- // of div or rem with the same sign, dividend, and divisor
- DivOpInfo Key(UseSignedOp, Dividend, Divisor);
- DivPhiNodes Value(QuoPhi, RemPhi);
- PerBBDivCache.insert(std::pair<DivOpInfo, DivPhiNodes>(Key, Value));
- return true;
+ Value *AndV = Builder.CreateAnd(OrV, BitMask);
+
+ // Compare operand values
+ Value *ZeroV = ConstantInt::getSigned(getSlowType(), 0);
+ return Builder.CreateICmpEQ(AndV, ZeroV);
}
-// reuseOrInsertFastDiv - Reuses previously computed dividend or remainder from
-// the current BB if operands and operation are identical. Otherwise calls
-// insertFastDiv to perform the optimization and caches the resulting dividend
-// and remainder.
-static bool reuseOrInsertFastDiv(Instruction *I, IntegerType *BypassType,
- bool UseDivOp, bool UseSignedOp,
- DivCacheTy &PerBBDivCache) {
- // Get instruction operands
- DivOpInfo Key(UseSignedOp, I->getOperand(0), I->getOperand(1));
- DivCacheTy::iterator CacheI = PerBBDivCache.find(Key);
-
- if (CacheI == PerBBDivCache.end()) {
- // If previous instance does not exist, insert fast div
- return insertFastDiv(I, BypassType, UseDivOp, UseSignedOp, PerBBDivCache);
+/// Substitutes the div/rem instruction with code that checks the value of the
+/// operands and uses a shorter, faster div/rem instruction when possible.
+Optional<QuotRemPair> FastDivInsertionTask::insertFastDivAndRem() {
+ Value *Dividend = SlowDivOrRem->getOperand(0);
+ Value *Divisor = SlowDivOrRem->getOperand(1);
+
+ if (isa<ConstantInt>(Divisor)) {
+ // Keep division by a constant for DAGCombiner.
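+ // DAGCombiner lowers such divisions into cheaper multiply/shift
+ // sequences, so a runtime bypass would only add overhead.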
+ return None;
}
- // Replace operation value with previously generated phi node
- DivPhiNodes &Value = CacheI->second;
- if (UseDivOp) {
- // Replace all uses of div instruction with quotient phi node
- I->replaceAllUsesWith(Value.Quotient);
+ VisitedSetTy SetL;
+ ValueRange DividendRange = getValueRange(Dividend, SetL);
+ if (DividendRange == VALRNG_LIKELY_LONG)
+ return None;
+
+ VisitedSetTy SetR;
+ ValueRange DivisorRange = getValueRange(Divisor, SetR);
+ if (DivisorRange == VALRNG_LIKELY_LONG)
+ return None;
+
+ bool DividendShort = (DividendRange == VALRNG_KNOWN_SHORT);
+ bool DivisorShort = (DivisorRange == VALRNG_KNOWN_SHORT);
+
+ if (DividendShort && DivisorShort) {
+ // If both operands are known to be short then just replace the long
+ // division with a short one in-place.
+
+ IRBuilder<> Builder(SlowDivOrRem);
+ Value *TruncDividend = Builder.CreateTrunc(Dividend, BypassType);
+ Value *TruncDivisor = Builder.CreateTrunc(Divisor, BypassType);
+ Value *TruncDiv = Builder.CreateUDiv(TruncDividend, TruncDivisor);
+ Value *TruncRem = Builder.CreateURem(TruncDividend, TruncDivisor);
+ Value *ExtDiv = Builder.CreateZExt(TruncDiv, getSlowType());
+ Value *ExtRem = Builder.CreateZExt(TruncRem, getSlowType());
+ return QuotRemPair(ExtDiv, ExtRem);
+ } else if (DividendShort && !isSignedOp()) {
+ // If the division is unsigned and Dividend is known to be short, then
+ // either
+ // 1) Divisor is less than or equal to Dividend, and the result can be
+ //    computed with a short division.
+ // 2) Divisor is greater than Dividend. In this case, no division is needed
+ // at all: The quotient is 0 and the remainder is equal to Dividend.
+ //
+ // So instead of checking at runtime whether Divisor fits into BypassType,
+ // we emit a runtime check to differentiate between these two cases. This
+ // lets us entirely avoid a long div.
+
+ // Split the basic block before the div/rem.
+ BasicBlock *SuccessorBB = MainBB->splitBasicBlock(SlowDivOrRem);
+ // Remove the unconditional branch from MainBB to SuccessorBB.
+ MainBB->getInstList().back().eraseFromParent();
+ QuotRemWithBB Long;
+ Long.BB = MainBB;
+ Long.Quotient = ConstantInt::get(getSlowType(), 0);
+ Long.Remainder = Dividend;
+ QuotRemWithBB Fast = createFastBB(SuccessorBB);
+ QuotRemPair Result = createDivRemPhiNodes(Fast, Long, SuccessorBB);
+ IRBuilder<> Builder(MainBB, MainBB->end());
+ Value *CmpV = Builder.CreateICmpUGE(Dividend, Divisor);
+ Builder.CreateCondBr(CmpV, Fast.BB, SuccessorBB);
+ return Result;
} else {
- // Replace all uses of rem instruction with remainder phi node
- I->replaceAllUsesWith(Value.Remainder);
+ // General case. Create both slow and fast div/rem pairs and choose one of
+ // them at runtime.
+
+ // Split the basic block before the div/rem.
+ BasicBlock *SuccessorBB = MainBB->splitBasicBlock(SlowDivOrRem);
+ // Remove the unconditional branch from MainBB to SuccessorBB.
+ MainBB->getInstList().back().eraseFromParent();
+ QuotRemWithBB Fast = createFastBB(SuccessorBB);
+ QuotRemWithBB Slow = createSlowBB(SuccessorBB);
+ QuotRemPair Result = createDivRemPhiNodes(Fast, Slow, SuccessorBB);
+ Value *CmpV = insertOperandRuntimeCheck(DividendShort ? nullptr : Dividend,
+ DivisorShort ? nullptr : Divisor);
+ IRBuilder<> Builder(MainBB, MainBB->end());
+ Builder.CreateCondBr(CmpV, Fast.BB, Slow.BB);
+ return Result;
}
-
- // Remove redundant operation
- I->eraseFromParent();
- return true;
}
-// bypassSlowDivision - This optimization identifies DIV instructions in a BB
-// that can be profitably bypassed and carried out with a shorter, faster
-// divide.
-bool llvm::bypassSlowDivision(
- BasicBlock *BB, const DenseMap<unsigned int, unsigned int> &BypassWidths) {
- DivCacheTy DivCache;
+/// This optimization identifies DIV/REM instructions in a BB that can be
+/// profitably bypassed and carried out with a shorter, faster divide.
+bool llvm::bypassSlowDivision(BasicBlock *BB,
+ const BypassWidthsTy &BypassWidths) {
+ DivCacheTy PerBBDivCache;
bool MadeChange = false;
Instruction* Next = &*BB->begin();
@@ -231,42 +459,20 @@ bool llvm::bypassSlowDivision(
Instruction* I = Next;
Next = Next->getNextNode();
- // Get instruction details
- unsigned Opcode = I->getOpcode();
- bool UseDivOp = Opcode == Instruction::SDiv || Opcode == Instruction::UDiv;
- bool UseRemOp = Opcode == Instruction::SRem || Opcode == Instruction::URem;
- bool UseSignedOp = Opcode == Instruction::SDiv ||
- Opcode == Instruction::SRem;
-
- // Only optimize div or rem ops
- if (!UseDivOp && !UseRemOp)
- continue;
-
- // Skip division on vector types, only optimize integer instructions
- if (!I->getType()->isIntegerTy())
- continue;
-
- // Get bitwidth of div/rem instruction
- IntegerType *T = cast<IntegerType>(I->getType());
- unsigned int bitwidth = T->getBitWidth();
-
- // Continue if bitwidth is not bypassed
- DenseMap<unsigned int, unsigned int>::const_iterator BI = BypassWidths.find(bitwidth);
- if (BI == BypassWidths.end())
- continue;
-
- // Get type for div/rem instruction with bypass bitwidth
- IntegerType *BT = IntegerType::get(I->getContext(), BI->second);
-
- MadeChange |= reuseOrInsertFastDiv(I, BT, UseDivOp, UseSignedOp, DivCache);
+ FastDivInsertionTask Task(I, BypassWidths);
+ if (Value *Replacement = Task.getReplacement(PerBBDivCache)) {
+ I->replaceAllUsesWith(Replacement);
+ I->eraseFromParent();
+ MadeChange = true;
+ }
}
// Above we eagerly create divs and rems, as pairs, so that we can efficiently
// create divrem machine instructions. Now erase any unused divs / rems so we
// don't leave extra instructions sitting around.
- for (auto &KV : DivCache)
- for (Instruction *Phi : {KV.second.Quotient, KV.second.Remainder})
- RecursivelyDeleteTriviallyDeadInstructions(Phi);
+ for (auto &KV : PerBBDivCache)
+ for (Value *V : {KV.second.Quotient, KV.second.Remainder})
+ RecursivelyDeleteTriviallyDeadInstructions(V);
return MadeChange;
}
diff --git a/lib/Transforms/Utils/CMakeLists.txt b/lib/Transforms/Utils/CMakeLists.txt
index 69889ec72f90..7a21c03da221 100644
--- a/lib/Transforms/Utils/CMakeLists.txt
+++ b/lib/Transforms/Utils/CMakeLists.txt
@@ -31,12 +31,13 @@ add_llvm_library(LLVMTransformUtils
LoopUtils.cpp
LoopVersioning.cpp
LowerInvoke.cpp
+ LowerMemIntrinsics.cpp
LowerSwitch.cpp
Mem2Reg.cpp
- MemorySSA.cpp
MetaRenamer.cpp
ModuleUtils.cpp
NameAnonGlobals.cpp
+ PredicateInfo.cpp
PromoteMemoryToRegister.cpp
StripGCRelocates.cpp
SSAUpdater.cpp
@@ -51,6 +52,7 @@ add_llvm_library(LLVMTransformUtils
UnifyFunctionExitNodes.cpp
Utils.cpp
ValueMapper.cpp
+ VNCoercion.cpp
ADDITIONAL_HEADER_DIRS
${LLVM_MAIN_INCLUDE_DIR}/llvm/Transforms
diff --git a/lib/Transforms/Utils/CloneFunction.cpp b/lib/Transforms/Utils/CloneFunction.cpp
index 4d33e22fecfb..385c12302e04 100644
--- a/lib/Transforms/Utils/CloneFunction.cpp
+++ b/lib/Transforms/Utils/CloneFunction.cpp
@@ -90,9 +90,9 @@ void llvm::CloneFunctionInto(Function *NewFunc, const Function *OldFunc,
assert(VMap.count(&I) && "No mapping from source argument specified!");
#endif
- // Copy all attributes other than those stored in the AttributeSet. We need
- // to remap the parameter indices of the AttributeSet.
- AttributeSet NewAttrs = NewFunc->getAttributes();
+ // Copy all attributes other than those stored in the AttributeList. We need
+ // to remap the parameter indices of the AttributeList.
+ AttributeList NewAttrs = NewFunc->getAttributes();
NewFunc->copyAttributesFrom(OldFunc);
NewFunc->setAttributes(NewAttrs);
@@ -103,22 +103,20 @@ void llvm::CloneFunctionInto(Function *NewFunc, const Function *OldFunc,
ModuleLevelChanges ? RF_None : RF_NoModuleLevelChanges,
TypeMapper, Materializer));
- AttributeSet OldAttrs = OldFunc->getAttributes();
+ SmallVector<AttributeSet, 4> NewArgAttrs(NewFunc->arg_size());
+ AttributeList OldAttrs = OldFunc->getAttributes();
+
// Clone any argument attributes that are present in the VMap.
- for (const Argument &OldArg : OldFunc->args())
+ for (const Argument &OldArg : OldFunc->args()) {
if (Argument *NewArg = dyn_cast<Argument>(VMap[&OldArg])) {
- AttributeSet attrs =
- OldAttrs.getParamAttributes(OldArg.getArgNo() + 1);
- if (attrs.getNumSlots() > 0)
- NewArg->addAttr(attrs);
+ NewArgAttrs[NewArg->getArgNo()] =
+ OldAttrs.getParamAttributes(OldArg.getArgNo());
}
+ }
NewFunc->setAttributes(
- NewFunc->getAttributes()
- .addAttributes(NewFunc->getContext(), AttributeSet::ReturnIndex,
- OldAttrs.getRetAttributes())
- .addAttributes(NewFunc->getContext(), AttributeSet::FunctionIndex,
- OldAttrs.getFnAttributes()));
+ AttributeList::get(NewFunc->getContext(), OldAttrs.getFnAttributes(),
+ OldAttrs.getRetAttributes(), NewArgAttrs));
SmallVector<std::pair<unsigned, MDNode *>, 1> MDs;
OldFunc->getAllMetadata(MDs);
@@ -353,7 +351,7 @@ void PruningFunctionCloner::CloneBlock(const BasicBlock *BB,
Cond = dyn_cast_or_null<ConstantInt>(V);
}
if (Cond) { // Constant fold to uncond branch!
- SwitchInst::ConstCaseIt Case = SI->findCaseValue(Cond);
+ SwitchInst::ConstCaseHandle Case = *SI->findCaseValue(Cond);
BasicBlock *Dest = const_cast<BasicBlock*>(Case.getCaseSuccessor());
VMap[OldTI] = BranchInst::Create(Dest, NewBB);
ToClone.push_back(Dest);
@@ -747,3 +745,40 @@ Loop *llvm::cloneLoopWithPreheader(BasicBlock *Before, BasicBlock *LoopDomBB,
return NewLoop;
}
+
+/// \brief Duplicate the non-Phi instructions from the beginning of the block
+/// up to the StopAt instruction into a split block between BB and its
+/// predecessor.
+BasicBlock *
+llvm::DuplicateInstructionsInSplitBetween(BasicBlock *BB, BasicBlock *PredBB,
+ Instruction *StopAt,
+ ValueToValueMapTy &ValueMapping) {
+ // We are going to have to map operands from the original BB block to the new
+ // copy of the block 'NewBB'. If there are PHI nodes in BB, evaluate them to
+ // account for entry from PredBB.
+ BasicBlock::iterator BI = BB->begin();
+ for (; PHINode *PN = dyn_cast<PHINode>(BI); ++BI)
+ ValueMapping[PN] = PN->getIncomingValueForBlock(PredBB);
+
+ BasicBlock *NewBB = SplitEdge(PredBB, BB);
+ NewBB->setName(PredBB->getName() + ".split");
+ Instruction *NewTerm = NewBB->getTerminator();
+
+ // Clone the non-phi instructions of BB into NewBB, keeping track of the
+ // mapping and using it to remap operands in the cloned instructions.
+ for (; StopAt != &*BI; ++BI) {
+ Instruction *New = BI->clone();
+ New->setName(BI->getName());
+ New->insertBefore(NewTerm);
+ ValueMapping[&*BI] = New;
+
+ // Remap operands to patch up intra-block references.
+ for (unsigned i = 0, e = New->getNumOperands(); i != e; ++i)
+ if (Instruction *Inst = dyn_cast<Instruction>(New->getOperand(i))) {
+ auto I = ValueMapping.find(Inst);
+ if (I != ValueMapping.end())
+ New->setOperand(i, I->second);
+ }
+ }
+
+ return NewBB;
+}
diff --git a/lib/Transforms/Utils/CloneModule.cpp b/lib/Transforms/Utils/CloneModule.cpp
index 7ebeb615d248..4e9d67252d6c 100644
--- a/lib/Transforms/Utils/CloneModule.cpp
+++ b/lib/Transforms/Utils/CloneModule.cpp
@@ -20,6 +20,15 @@
#include "llvm-c/Core.h"
using namespace llvm;
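+// Copy the comdat, if any, from Src to Dst, recreating it in Dst's module and
+// preserving the selection kind.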
+static void copyComdat(GlobalObject *Dst, const GlobalObject *Src) {
+ const Comdat *SC = Src->getComdat();
+ if (!SC)
+ return;
+ Comdat *DC = Dst->getParent()->getOrInsertComdat(SC->getName());
+ DC->setSelectionKind(SC->getSelectionKind());
+ Dst->setComdat(DC);
+}
+
/// This is not as easy as it might seem because we have to worry about making
/// copies of global variables and functions, and making their (initializers and
/// references, respectively) refer to the right globals.
@@ -124,6 +133,8 @@ std::unique_ptr<Module> llvm::CloneModule(
I->getAllMetadata(MDs);
for (auto MD : MDs)
GV->addMetadata(MD.first, *MapMetadata(MD.second, VMap));
+
+ copyComdat(GV, &*I);
}
// Similarly, copy over function bodies now...
@@ -153,6 +164,8 @@ std::unique_ptr<Module> llvm::CloneModule(
if (I.hasPersonalityFn())
F->setPersonalityFn(MapValue(I.getPersonalityFn(), VMap));
+
+ copyComdat(F, &I);
}
// And aliases
diff --git a/lib/Transforms/Utils/CodeExtractor.cpp b/lib/Transforms/Utils/CodeExtractor.cpp
index c514c9c9cd4a..644d93b727b3 100644
--- a/lib/Transforms/Utils/CodeExtractor.cpp
+++ b/lib/Transforms/Utils/CodeExtractor.cpp
@@ -362,9 +362,8 @@ Function *CodeExtractor::constructFunction(const ValueSet &inputs,
// "target-features" attribute allowing it to be lowered.
// FIXME: This should be changed to check to see if a specific
// attribute can not be inherited.
- AttributeSet OldFnAttrs = oldFunction->getAttributes().getFnAttributes();
- AttrBuilder AB(OldFnAttrs, AttributeSet::FunctionIndex);
- for (auto Attr : AB.td_attrs())
+ AttrBuilder AB(oldFunction->getAttributes().getFnAttributes());
+ for (const auto &Attr : AB.td_attrs())
newFunction->addFnAttr(Attr.first, Attr.second);
newFunction->getBasicBlockList().push_back(newRootNode);
@@ -440,8 +439,10 @@ emitCallAndSwitchStatement(Function *newFunction, BasicBlock *codeReplacer,
// Emit a call to the new function, passing in: *pointer to struct (if
// aggregating parameters), or plain inputs and allocated memory for outputs
std::vector<Value*> params, StructValues, ReloadOutputs, Reloads;
-
- LLVMContext &Context = newFunction->getContext();
+
+ Module *M = newFunction->getParent();
+ LLVMContext &Context = M->getContext();
+ const DataLayout &DL = M->getDataLayout();
// Add inputs as params, or to be filled into the struct
for (Value *input : inputs)
@@ -456,8 +457,9 @@ emitCallAndSwitchStatement(Function *newFunction, BasicBlock *codeReplacer,
StructValues.push_back(output);
} else {
AllocaInst *alloca =
- new AllocaInst(output->getType(), nullptr, output->getName() + ".loc",
- &codeReplacer->getParent()->front().front());
+ new AllocaInst(output->getType(), DL.getAllocaAddrSpace(),
+ nullptr, output->getName() + ".loc",
+ &codeReplacer->getParent()->front().front());
ReloadOutputs.push_back(alloca);
params.push_back(alloca);
}
@@ -473,7 +475,8 @@ emitCallAndSwitchStatement(Function *newFunction, BasicBlock *codeReplacer,
// Allocate a struct at the beginning of this function
StructArgTy = StructType::get(newFunction->getContext(), ArgTypes);
- Struct = new AllocaInst(StructArgTy, nullptr, "structArg",
+ Struct = new AllocaInst(StructArgTy, DL.getAllocaAddrSpace(), nullptr,
+ "structArg",
&codeReplacer->getParent()->front().front());
params.push_back(Struct);
diff --git a/lib/Transforms/Utils/DemoteRegToStack.cpp b/lib/Transforms/Utils/DemoteRegToStack.cpp
index 75a1dde57c4c..0eee6e19efac 100644
--- a/lib/Transforms/Utils/DemoteRegToStack.cpp
+++ b/lib/Transforms/Utils/DemoteRegToStack.cpp
@@ -28,15 +28,17 @@ AllocaInst *llvm::DemoteRegToStack(Instruction &I, bool VolatileLoads,
return nullptr;
}
+ Function *F = I.getParent()->getParent();
+ const DataLayout &DL = F->getParent()->getDataLayout();
+
// Create a stack slot to hold the value.
AllocaInst *Slot;
if (AllocaPoint) {
- Slot = new AllocaInst(I.getType(), nullptr,
+ Slot = new AllocaInst(I.getType(), DL.getAllocaAddrSpace(), nullptr,
I.getName()+".reg2mem", AllocaPoint);
} else {
- Function *F = I.getParent()->getParent();
- Slot = new AllocaInst(I.getType(), nullptr, I.getName() + ".reg2mem",
- &F->getEntryBlock().front());
+ Slot = new AllocaInst(I.getType(), DL.getAllocaAddrSpace(), nullptr,
+ I.getName() + ".reg2mem", &F->getEntryBlock().front());
}
// We cannot demote invoke instructions to the stack if their normal edge
@@ -110,14 +112,17 @@ AllocaInst *llvm::DemotePHIToStack(PHINode *P, Instruction *AllocaPoint) {
return nullptr;
}
+ const DataLayout &DL = P->getModule()->getDataLayout();
+
// Create a stack slot to hold the value.
AllocaInst *Slot;
if (AllocaPoint) {
- Slot = new AllocaInst(P->getType(), nullptr,
+ Slot = new AllocaInst(P->getType(), DL.getAllocaAddrSpace(), nullptr,
P->getName()+".reg2mem", AllocaPoint);
} else {
Function *F = P->getParent()->getParent();
- Slot = new AllocaInst(P->getType(), nullptr, P->getName() + ".reg2mem",
+ Slot = new AllocaInst(P->getType(), DL.getAllocaAddrSpace(), nullptr,
+ P->getName() + ".reg2mem",
&F->getEntryBlock().front());
}
diff --git a/lib/Transforms/Utils/Evaluator.cpp b/lib/Transforms/Utils/Evaluator.cpp
index 4adf1754253d..59f176e2f231 100644
--- a/lib/Transforms/Utils/Evaluator.cpp
+++ b/lib/Transforms/Utils/Evaluator.cpp
@@ -16,6 +16,7 @@
#include "llvm/IR/BasicBlock.h"
#include "llvm/IR/CallSite.h"
#include "llvm/IR/Constants.h"
+#include "llvm/IR/DataLayout.h"
#include "llvm/IR/DerivedTypes.h"
#include "llvm/IR/DiagnosticPrinter.h"
#include "llvm/IR/GlobalVariable.h"
@@ -486,7 +487,7 @@ bool Evaluator::EvaluateBlock(BasicBlock::iterator CurInst,
ConstantInt *Val =
dyn_cast<ConstantInt>(getVal(SI->getCondition()));
if (!Val) return false; // Cannot determine.
- NextBB = SI->findCaseValue(Val).getCaseSuccessor();
+ NextBB = SI->findCaseValue(Val)->getCaseSuccessor();
} else if (IndirectBrInst *IBI = dyn_cast<IndirectBrInst>(CurInst)) {
Value *Val = getVal(IBI->getAddress())->stripPointerCasts();
if (BlockAddress *BA = dyn_cast<BlockAddress>(Val))
diff --git a/lib/Transforms/Utils/FunctionComparator.cpp b/lib/Transforms/Utils/FunctionComparator.cpp
index 81a7c4ceffab..73a0b2737e95 100644
--- a/lib/Transforms/Utils/FunctionComparator.cpp
+++ b/lib/Transforms/Utils/FunctionComparator.cpp
@@ -74,14 +74,14 @@ int FunctionComparator::cmpMem(StringRef L, StringRef R) const {
return L.compare(R);
}
-int FunctionComparator::cmpAttrs(const AttributeSet L,
- const AttributeSet R) const {
+int FunctionComparator::cmpAttrs(const AttributeList L,
+ const AttributeList R) const {
if (int Res = cmpNumbers(L.getNumSlots(), R.getNumSlots()))
return Res;
for (unsigned i = 0, e = L.getNumSlots(); i != e; ++i) {
- AttributeSet::iterator LI = L.begin(i), LE = L.end(i), RI = R.begin(i),
- RE = R.end(i);
+ AttributeList::iterator LI = L.begin(i), LE = L.end(i), RI = R.begin(i),
+ RE = R.end(i);
for (; LI != LE && RI != RE; ++LI, ++RI) {
Attribute LA = *LI;
Attribute RA = *RI;
diff --git a/lib/Transforms/Utils/FunctionImportUtils.cpp b/lib/Transforms/Utils/FunctionImportUtils.cpp
index 9844190ef84a..b00f4b14068a 100644
--- a/lib/Transforms/Utils/FunctionImportUtils.cpp
+++ b/lib/Transforms/Utils/FunctionImportUtils.cpp
@@ -21,11 +21,11 @@ using namespace llvm;
/// Checks if we should import SGV as a definition, otherwise import as a
/// declaration.
bool FunctionImportGlobalProcessing::doImportAsDefinition(
- const GlobalValue *SGV, DenseSet<const GlobalValue *> *GlobalsToImport) {
+ const GlobalValue *SGV, SetVector<GlobalValue *> *GlobalsToImport) {
// For alias, we tie the definition to the base object. Extract it and recurse
if (auto *GA = dyn_cast<GlobalAlias>(SGV)) {
- if (GA->hasWeakAnyLinkage())
+ if (GA->isInterposable())
return false;
const GlobalObject *GO = GA->getBaseObject();
if (!GO->hasLinkOnceODRLinkage())
@@ -34,7 +34,7 @@ bool FunctionImportGlobalProcessing::doImportAsDefinition(
GO, GlobalsToImport);
}
// Only import the globals requested for importing.
- if (GlobalsToImport->count(SGV))
+ if (GlobalsToImport->count(const_cast<GlobalValue *>(SGV)))
return true;
// Otherwise no.
return false;
@@ -57,7 +57,8 @@ bool FunctionImportGlobalProcessing::shouldPromoteLocalToGlobal(
return false;
if (isPerformingImport()) {
- assert((!GlobalsToImport->count(SGV) || !isNonRenamableLocal(*SGV)) &&
+ assert((!GlobalsToImport->count(const_cast<GlobalValue *>(SGV)) ||
+ !isNonRenamableLocal(*SGV)) &&
"Attempting to promote non-renamable local");
// We don't know for sure yet if we are importing this value (as either
// a reference or a def), since we are simply walking all values in the
@@ -254,9 +255,8 @@ bool FunctionImportGlobalProcessing::run() {
return false;
}
-bool llvm::renameModuleForThinLTO(
- Module &M, const ModuleSummaryIndex &Index,
- DenseSet<const GlobalValue *> *GlobalsToImport) {
+bool llvm::renameModuleForThinLTO(Module &M, const ModuleSummaryIndex &Index,
+ SetVector<GlobalValue *> *GlobalsToImport) {
FunctionImportGlobalProcessing ThinLTOProcessing(M, Index, GlobalsToImport);
return ThinLTOProcessing.run();
}
diff --git a/lib/Transforms/Utils/GlobalStatus.cpp b/lib/Transforms/Utils/GlobalStatus.cpp
index 74ebcda8355c..ba4b78ac758a 100644
--- a/lib/Transforms/Utils/GlobalStatus.cpp
+++ b/lib/Transforms/Utils/GlobalStatus.cpp
@@ -10,9 +10,22 @@
#include "llvm/ADT/SmallPtrSet.h"
#include "llvm/IR/BasicBlock.h"
#include "llvm/IR/CallSite.h"
+#include "llvm/IR/Constant.h"
+#include "llvm/IR/Constants.h"
+#include "llvm/IR/GlobalValue.h"
#include "llvm/IR/GlobalVariable.h"
+#include "llvm/IR/InstrTypes.h"
+#include "llvm/IR/Instruction.h"
+#include "llvm/IR/Instructions.h"
#include "llvm/IR/IntrinsicInst.h"
#include "llvm/Transforms/Utils/GlobalStatus.h"
+#include "llvm/IR/Use.h"
+#include "llvm/IR/User.h"
+#include "llvm/IR/Value.h"
+#include "llvm/Support/AtomicOrdering.h"
+#include "llvm/Support/Casting.h"
+#include <algorithm>
+#include <cassert>
using namespace llvm;
@@ -175,13 +188,9 @@ static bool analyzeGlobalAux(const Value *V, GlobalStatus &GS,
return false;
}
+GlobalStatus::GlobalStatus() = default;
+
bool GlobalStatus::analyzeGlobal(const Value *V, GlobalStatus &GS) {
SmallPtrSet<const PHINode *, 16> PhiUsers;
return analyzeGlobalAux(V, GS, PhiUsers);
}
-
-GlobalStatus::GlobalStatus()
- : IsCompared(false), IsLoaded(false), StoredType(NotStored),
- StoredOnceValue(nullptr), AccessingFunction(nullptr),
- HasMultipleAccessingFunctions(false), HasNonInstructionUser(false),
- Ordering(AtomicOrdering::NotAtomic) {}
diff --git a/lib/Transforms/Utils/ImportedFunctionsInliningStatistics.cpp b/lib/Transforms/Utils/ImportedFunctionsInliningStatistics.cpp
index ed018bb73107..b8c12ad5ea84 100644
--- a/lib/Transforms/Utils/ImportedFunctionsInliningStatistics.cpp
+++ b/lib/Transforms/Utils/ImportedFunctionsInliningStatistics.cpp
@@ -62,6 +62,8 @@ void ImportedFunctionsInliningStatistics::recordInline(const Function &Caller,
void ImportedFunctionsInliningStatistics::setModuleInfo(const Module &M) {
ModuleName = M.getName();
for (const auto &F : M.functions()) {
+ if (F.isDeclaration())
+ continue;
AllFunctions++;
ImportedFunctions += int(F.getMetadata("thinlto_src_module") != nullptr);
}
diff --git a/lib/Transforms/Utils/InlineFunction.cpp b/lib/Transforms/Utils/InlineFunction.cpp
index a40079ca8e76..5d6fbc3325ff 100644
--- a/lib/Transforms/Utils/InlineFunction.cpp
+++ b/lib/Transforms/Utils/InlineFunction.cpp
@@ -20,10 +20,12 @@
#include "llvm/ADT/StringExtras.h"
#include "llvm/Analysis/AliasAnalysis.h"
#include "llvm/Analysis/AssumptionCache.h"
+#include "llvm/Analysis/BlockFrequencyInfo.h"
#include "llvm/Analysis/CallGraph.h"
#include "llvm/Analysis/CaptureTracking.h"
#include "llvm/Analysis/EHPersonalities.h"
#include "llvm/Analysis/InstructionSimplify.h"
+#include "llvm/Analysis/ProfileSummaryInfo.h"
#include "llvm/Analysis/ValueTracking.h"
#include "llvm/IR/Attributes.h"
#include "llvm/IR/CallSite.h"
@@ -40,8 +42,8 @@
#include "llvm/IR/Intrinsics.h"
#include "llvm/IR/MDBuilder.h"
#include "llvm/IR/Module.h"
-#include "llvm/Transforms/Utils/Local.h"
#include "llvm/Support/CommandLine.h"
+#include "llvm/Transforms/Utils/Local.h"
#include <algorithm>
using namespace llvm;
@@ -1107,26 +1109,23 @@ static void AddAlignmentAssumptions(CallSite CS, InlineFunctionInfo &IFI) {
bool DTCalculated = false;
Function *CalledFunc = CS.getCalledFunction();
- for (Function::arg_iterator I = CalledFunc->arg_begin(),
- E = CalledFunc->arg_end();
- I != E; ++I) {
- unsigned Align = I->getType()->isPointerTy() ? I->getParamAlignment() : 0;
- if (Align && !I->hasByValOrInAllocaAttr() && !I->hasNUses(0)) {
+ for (Argument &Arg : CalledFunc->args()) {
+ unsigned Align = Arg.getType()->isPointerTy() ? Arg.getParamAlignment() : 0;
+ if (Align && !Arg.hasByValOrInAllocaAttr() && !Arg.hasNUses(0)) {
if (!DTCalculated) {
- DT.recalculate(const_cast<Function&>(*CS.getInstruction()->getParent()
- ->getParent()));
+ DT.recalculate(*CS.getCaller());
DTCalculated = true;
}
// If we can already prove the asserted alignment in the context of the
// caller, then don't bother inserting the assumption.
- Value *Arg = CS.getArgument(I->getArgNo());
- if (getKnownAlignment(Arg, DL, CS.getInstruction(), AC, &DT) >= Align)
+ Value *ArgVal = CS.getArgument(Arg.getArgNo());
+ if (getKnownAlignment(ArgVal, DL, CS.getInstruction(), AC, &DT) >= Align)
continue;
- CallInst *NewAssumption = IRBuilder<>(CS.getInstruction())
- .CreateAlignmentAssumption(DL, Arg, Align);
- AC->registerAssumption(NewAssumption);
+ CallInst *NewAsmp = IRBuilder<>(CS.getInstruction())
+ .CreateAlignmentAssumption(DL, ArgVal, Align);
+ AC->registerAssumption(NewAsmp);
}
}
}
@@ -1140,7 +1139,7 @@ static void UpdateCallGraphAfterInlining(CallSite CS,
ValueToValueMapTy &VMap,
InlineFunctionInfo &IFI) {
CallGraph &CG = *IFI.CG;
- const Function *Caller = CS.getInstruction()->getParent()->getParent();
+ const Function *Caller = CS.getCaller();
const Function *Callee = CS.getCalledFunction();
CallGraphNode *CalleeNode = CG[Callee];
CallGraphNode *CallerNode = CG[Caller];
@@ -1225,7 +1224,8 @@ static Value *HandleByValArgument(Value *Arg, Instruction *TheCall,
PointerType *ArgTy = cast<PointerType>(Arg->getType());
Type *AggTy = ArgTy->getElementType();
- Function *Caller = TheCall->getParent()->getParent();
+ Function *Caller = TheCall->getFunction();
+ const DataLayout &DL = Caller->getParent()->getDataLayout();
// If the called function is readonly, then it could not mutate the caller's
// copy of the byval'd memory. In this case, it is safe to elide the copy and
@@ -1239,31 +1239,30 @@ static Value *HandleByValArgument(Value *Arg, Instruction *TheCall,
AssumptionCache *AC =
IFI.GetAssumptionCache ? &(*IFI.GetAssumptionCache)(*Caller) : nullptr;
- const DataLayout &DL = Caller->getParent()->getDataLayout();
// If the pointer is already known to be sufficiently aligned, or if we can
// round it up to a larger alignment, then we don't need a temporary.
if (getOrEnforceKnownAlignment(Arg, ByValAlignment, DL, TheCall, AC) >=
ByValAlignment)
return Arg;
-
+
// Otherwise, we have to make a memcpy to get a safe alignment. This is bad
// for code quality, but rarely happens and is required for correctness.
}
// Create the alloca. If we have DataLayout, use nice alignment.
- unsigned Align =
- Caller->getParent()->getDataLayout().getPrefTypeAlignment(AggTy);
+ unsigned Align = DL.getPrefTypeAlignment(AggTy);
// If the byval had an alignment specified, we *must* use at least that
// alignment, as it is required by the byval argument (and uses of the
// pointer inside the callee).
Align = std::max(Align, ByValAlignment);
-
- Value *NewAlloca = new AllocaInst(AggTy, nullptr, Align, Arg->getName(),
+
+ Value *NewAlloca = new AllocaInst(AggTy, DL.getAllocaAddrSpace(),
+ nullptr, Align, Arg->getName(),
&*Caller->begin()->begin());
IFI.StaticAllocas.push_back(cast<AllocaInst>(NewAlloca));
-
+
// Uses of the argument in the function should use our new alloca
// instead.
return NewAlloca;
@@ -1393,6 +1392,89 @@ static void fixupLineNumbers(Function *Fn, Function::iterator FI,
}
}
}
+/// Update the block frequencies of the caller after a callee has been inlined.
+///
+/// Each block cloned into the caller has its block frequency scaled by the
+/// ratio of CallSiteFreq/CalleeEntryFreq. This ensures that the cloned copy of
+/// callee's entry block gets the same frequency as the callsite block and the
+/// relative frequencies of all cloned blocks remain the same after cloning.
+static void updateCallerBFI(BasicBlock *CallSiteBlock,
+ const ValueToValueMapTy &VMap,
+ BlockFrequencyInfo *CallerBFI,
+ BlockFrequencyInfo *CalleeBFI,
+ const BasicBlock &CalleeEntryBlock) {
+ SmallPtrSet<BasicBlock *, 16> ClonedBBs;
+ for (auto const &Entry : VMap) {
+ if (!isa<BasicBlock>(Entry.first) || !Entry.second)
+ continue;
+ auto *OrigBB = cast<BasicBlock>(Entry.first);
+ auto *ClonedBB = cast<BasicBlock>(Entry.second);
+ uint64_t Freq = CalleeBFI->getBlockFreq(OrigBB).getFrequency();
+ if (!ClonedBBs.insert(ClonedBB).second) {
+ // Multiple blocks in the callee might get mapped to one cloned block in
+ // the caller since we prune the callee as we clone it. When that happens,
+ // we want to use the maximum among the original blocks' frequencies.
+ uint64_t NewFreq = CallerBFI->getBlockFreq(ClonedBB).getFrequency();
+ if (NewFreq > Freq)
+ Freq = NewFreq;
+ }
+ CallerBFI->setBlockFreq(ClonedBB, Freq);
+ }
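+ // Pin the entry clone to the call site's frequency and scale the remaining
+ // cloned blocks by the same ratio.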
+ BasicBlock *EntryClone = cast<BasicBlock>(VMap.lookup(&CalleeEntryBlock));
+ CallerBFI->setBlockFreqAndScale(
+ EntryClone, CallerBFI->getBlockFreq(CallSiteBlock).getFrequency(),
+ ClonedBBs);
+}
+
+/// Update the branch metadata for cloned call instructions.
+static void updateCallProfile(Function *Callee, const ValueToValueMapTy &VMap,
+ const Optional<uint64_t> &CalleeEntryCount,
+ const Instruction *TheCall) {
+ if (!CalleeEntryCount.hasValue() || CalleeEntryCount.getValue() < 1)
+ return;
+ Optional<uint64_t> CallSiteCount =
+ ProfileSummaryInfo::getProfileCount(TheCall, nullptr);
+ uint64_t CallCount =
+ std::min(CallSiteCount.hasValue() ? CallSiteCount.getValue() : 0,
+ CalleeEntryCount.getValue());
+
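+ // Scale the weights of the cloned calls by CallCount/CalleeEntryCount, and
+ // the calls remaining in the original callee body by
+ // (CalleeEntryCount - CallCount)/CalleeEntryCount.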
+ for (auto const &Entry : VMap)
+ if (isa<CallInst>(Entry.first))
+ if (auto *CI = dyn_cast_or_null<CallInst>(Entry.second))
+ CI->updateProfWeight(CallCount, CalleeEntryCount.getValue());
+ for (BasicBlock &BB : *Callee)
+ // No need to update the callsite if it is pruned during inlining.
+ if (VMap.count(&BB))
+ for (Instruction &I : BB)
+ if (CallInst *CI = dyn_cast<CallInst>(&I))
+ CI->updateProfWeight(CalleeEntryCount.getValue() - CallCount,
+ CalleeEntryCount.getValue());
+}
+
+/// Update the entry count of callee after inlining.
+///
+/// The callsite's block count is subtracted from the callee's function entry
+/// count.
+static void updateCalleeCount(BlockFrequencyInfo *CallerBFI, BasicBlock *CallBB,
+ Instruction *CallInst, Function *Callee) {
+ // If the callee has an original count of N, and the estimated count of
+ // callsite is M, the new callee count is set to N - M. M is estimated from
+ // the caller's entry count, its entry block frequency and the block frequency
+ // of the callsite.
+ Optional<uint64_t> CalleeCount = Callee->getEntryCount();
+ if (!CalleeCount.hasValue())
+ return;
+ Optional<uint64_t> CallCount =
+ ProfileSummaryInfo::getProfileCount(CallInst, CallerBFI);
+ if (!CallCount.hasValue())
+ return;
+ // Since CallCount is an estimate, it could exceed the original callee
+ // count; in that case the new callee count has to be clamped to 0.
+ if (CallCount.getValue() > CalleeCount.getValue())
+ Callee->setEntryCount(0);
+ else
+ Callee->setEntryCount(CalleeCount.getValue() - CallCount.getValue());
+}
/// This function inlines the called function into the basic block of the
/// caller. This returns false if it is not possible to inline this call.
@@ -1405,13 +1487,13 @@ static void fixupLineNumbers(Function *Fn, Function::iterator FI,
bool llvm::InlineFunction(CallSite CS, InlineFunctionInfo &IFI,
AAResults *CalleeAAR, bool InsertLifetime) {
Instruction *TheCall = CS.getInstruction();
- assert(TheCall->getParent() && TheCall->getParent()->getParent() &&
- "Instruction not in function!");
+ assert(TheCall->getParent() && TheCall->getFunction()
+ && "Instruction not in function!");
// If IFI has any state in it, zap it before we fill it in.
IFI.reset();
-
- const Function *CalledFunc = CS.getCalledFunction();
+
+ Function *CalledFunc = CS.getCalledFunction();
if (!CalledFunc || // Can't inline external function or indirect
CalledFunc->isDeclaration() || // call, or call to a vararg function!
CalledFunc->getFunctionType()->isVarArg()) return false;
@@ -1548,7 +1630,7 @@ bool llvm::InlineFunction(CallSite CS, InlineFunctionInfo &IFI,
// matches up the formal to the actual argument values.
CallSite::arg_iterator AI = CS.arg_begin();
unsigned ArgNo = 0;
- for (Function::const_arg_iterator I = CalledFunc->arg_begin(),
+ for (Function::arg_iterator I = CalledFunc->arg_begin(),
E = CalledFunc->arg_end(); I != E; ++I, ++AI, ++ArgNo) {
Value *ActualArg = *AI;
@@ -1578,10 +1660,18 @@ bool llvm::InlineFunction(CallSite CS, InlineFunctionInfo &IFI,
CloneAndPruneFunctionInto(Caller, CalledFunc, VMap,
/*ModuleLevelChanges=*/false, Returns, ".i",
&InlinedFunctionInfo, TheCall);
-
// Remember the first block that is newly cloned over.
FirstNewBlock = LastBlock; ++FirstNewBlock;
+ if (IFI.CallerBFI != nullptr && IFI.CalleeBFI != nullptr)
+ // Update the BFI of blocks cloned into the caller.
+ updateCallerBFI(OrigBB, VMap, IFI.CallerBFI, IFI.CalleeBFI,
+ CalledFunc->front());
+
+ updateCallProfile(CalledFunc, VMap, CalledFunc->getEntryCount(), TheCall);
+ // Update the profile count of callee.
+ updateCalleeCount(IFI.CallerBFI, OrigBB, TheCall, CalledFunc);
+
// Inject byval arguments initialization.
for (std::pair<Value*, Value*> &Init : ByValInit)
HandleByValArgumentInit(Init.first, Init.second, Caller->getParent(),
@@ -2087,6 +2177,12 @@ bool llvm::InlineFunction(CallSite CS, InlineFunctionInfo &IFI,
CalledFunc->getName() + ".exit");
}
+ if (IFI.CallerBFI) {
+ // Copy original BB's block frequency to AfterCallBB
+ IFI.CallerBFI->setBlockFreq(
+ AfterCallBB, IFI.CallerBFI->getBlockFreq(OrigBB).getFrequency());
+ }
+
// Change the branch that used to go to AfterCallBB to branch to the first
// basic block of the inlined function.
//
diff --git a/lib/Transforms/Utils/LCSSA.cpp b/lib/Transforms/Utils/LCSSA.cpp
index 68c6b74d5e5b..49b4bd92faf4 100644
--- a/lib/Transforms/Utils/LCSSA.cpp
+++ b/lib/Transforms/Utils/LCSSA.cpp
@@ -87,7 +87,8 @@ bool llvm::formLCSSAForInstructions(SmallVectorImpl<Instruction *> &Worklist,
Instruction *I = Worklist.pop_back_val();
BasicBlock *InstBB = I->getParent();
Loop *L = LI.getLoopFor(InstBB);
- if (!LoopExitBlocks.count(L))
+ assert(L && "Instruction belongs to a BB that's not part of a loop");
+ if (!LoopExitBlocks.count(L))
L->getExitBlocks(LoopExitBlocks[L]);
assert(LoopExitBlocks.count(L));
const SmallVectorImpl<BasicBlock *> &ExitBlocks = LoopExitBlocks[L];
@@ -105,7 +106,7 @@ bool llvm::formLCSSAForInstructions(SmallVectorImpl<Instruction *> &Worklist,
for (Use &U : I->uses()) {
Instruction *User = cast<Instruction>(U.getUser());
BasicBlock *UserBB = User->getParent();
- if (PHINode *PN = dyn_cast<PHINode>(User))
+ if (auto *PN = dyn_cast<PHINode>(User))
UserBB = PN->getIncomingBlock(U);
if (InstBB != UserBB && !L->contains(UserBB))
@@ -123,7 +124,7 @@ bool llvm::formLCSSAForInstructions(SmallVectorImpl<Instruction *> &Worklist,
// DomBB dominates the value, so adjust DomBB to the normal destination
// block, which is effectively where the value is first usable.
BasicBlock *DomBB = InstBB;
- if (InvokeInst *Inv = dyn_cast<InvokeInst>(I))
+ if (auto *Inv = dyn_cast<InvokeInst>(I))
DomBB = Inv->getNormalDest();
DomTreeNode *DomNode = DT.getNode(DomBB);
@@ -188,7 +189,7 @@ bool llvm::formLCSSAForInstructions(SmallVectorImpl<Instruction *> &Worklist,
// block.
Instruction *User = cast<Instruction>(UseToRewrite->getUser());
BasicBlock *UserBB = User->getParent();
- if (PHINode *PN = dyn_cast<PHINode>(User))
+ if (auto *PN = dyn_cast<PHINode>(User))
UserBB = PN->getIncomingBlock(*UseToRewrite);
if (isa<PHINode>(UserBB->begin()) && isExitBlock(UserBB, ExitBlocks)) {
@@ -237,40 +238,75 @@ bool llvm::formLCSSAForInstructions(SmallVectorImpl<Instruction *> &Worklist,
return Changed;
}
-/// Return true if the specified block dominates at least
-/// one of the blocks in the specified list.
-static bool
-blockDominatesAnExit(BasicBlock *BB,
- DominatorTree &DT,
- const SmallVectorImpl<BasicBlock *> &ExitBlocks) {
- DomTreeNode *DomNode = DT.getNode(BB);
- return any_of(ExitBlocks, [&](BasicBlock *EB) {
- return DT.dominates(DomNode, DT.getNode(EB));
- });
+// Compute the set of BasicBlocks in the loop `L` dominating at least one exit.
+static void computeBlocksDominatingExits(
+ Loop &L, DominatorTree &DT, SmallVector<BasicBlock *, 8> &ExitBlocks,
+ SmallPtrSet<BasicBlock *, 8> &BlocksDominatingExits) {
+ SmallVector<BasicBlock *, 8> BBWorklist;
+
+ // We start from the exit blocks, as every block trivially dominates itself
+ // (not strictly).
+ for (BasicBlock *BB : ExitBlocks)
+ BBWorklist.push_back(BB);
+
+ while (!BBWorklist.empty()) {
+ BasicBlock *BB = BBWorklist.pop_back_val();
+
+ // Check if this is a loop header. If this is the case, we're done.
+ if (L.getHeader() == BB)
+ continue;
+
+ // Otherwise, add its immediate dominator to the worklist, unless we
+ // visited it already.
+ BasicBlock *IDomBB = DT.getNode(BB)->getIDom()->getBlock();
+
+ // Exit blocks can have an immediate dominator not belonging to the
+ // loop. If an exit block is immediately dominated by a block outside the
+ // loop, then not all paths from that dominator to the exit block go
+ // through the loop.
+ // Example:
+ //
+ // |---- A
+ // | |
+ // | B<--
+ // | | |
+ // |---> C --
+ // |
+ // D
+ //
+ // C is the exit block of the loop and it's immediately dominated by A,
+ // which doesn't belong to the loop.
+ if (!L.contains(IDomBB))
+ continue;
+
+ if (BlocksDominatingExits.insert(IDomBB).second)
+ BBWorklist.push_back(IDomBB);
+ }
}
bool llvm::formLCSSA(Loop &L, DominatorTree &DT, LoopInfo *LI,
ScalarEvolution *SE) {
bool Changed = false;
- // Get the set of exiting blocks.
SmallVector<BasicBlock *, 8> ExitBlocks;
L.getExitBlocks(ExitBlocks);
-
if (ExitBlocks.empty())
return false;
+ SmallPtrSet<BasicBlock *, 8> BlocksDominatingExits;
+
+ // We want to avoid use-scanning by leveraging dominance information.
+ // If a block doesn't dominate any of the loop exits, then none of the
+ // values defined in that block can be used outside the loop.
+ // We compute the set of blocks fulfilling this condition in advance by
+ // walking the dominator tree upwards until we hit a loop header.
+ computeBlocksDominatingExits(L, DT, ExitBlocks, BlocksDominatingExits);
+
SmallVector<Instruction *, 8> Worklist;
// Look at all the instructions in the loop, checking to see if they have uses
// outside the loop. If so, put them into the worklist to rewrite those uses.
- for (BasicBlock *BB : L.blocks()) {
- // For large loops, avoid use-scanning by using dominance information: In
- // particular, if a block does not dominate any of the loop exits, then none
- // of the values defined in the block could be used outside the loop.
- if (!blockDominatesAnExit(BB, DT, ExitBlocks))
- continue;
-
+ for (BasicBlock *BB : BlocksDominatingExits) {
for (Instruction &I : *BB) {
// Reject two common cases fast: instructions with no uses (like stores)
// and instructions with one use that is in the same block as this.
@@ -395,8 +431,8 @@ PreservedAnalyses LCSSAPass::run(Function &F, FunctionAnalysisManager &AM) {
if (!formLCSSAOnAllLoops(&LI, DT, SE))
return PreservedAnalyses::all();
- // FIXME: This should also 'preserve the CFG'.
PreservedAnalyses PA;
+ PA.preserveSet<CFGAnalyses>();
PA.preserve<BasicAA>();
PA.preserve<GlobalsAA>();
PA.preserve<SCEVAA>();
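For context, LCSSA form requires every value defined in a loop and used outside of it to flow through a PHI node in an exit block. A minimal sketch of that rewrite, assuming ExitBB has a single in-loop predecessor (illustrative only; the pass above handles the general multi-exit case):

    #include "llvm/IR/Instructions.h"

    using namespace llvm;

    // Route an out-of-loop use of Inst through a single-entry ".lcssa" phi.
    static void insertLCSSAPhi(Instruction *Inst, BasicBlock *ExitBB,
                               BasicBlock *InLoopPred,
                               Instruction *OutsideUser) {
      PHINode *PN = PHINode::Create(Inst->getType(), /*NumReservedValues=*/1,
                                    Inst->getName() + ".lcssa",
                                    &ExitBB->front());
      PN->addIncoming(Inst, InLoopPred);
      OutsideUser->replaceUsesOfWith(Inst, PN);
    }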
diff --git a/lib/Transforms/Utils/LibCallsShrinkWrap.cpp b/lib/Transforms/Utils/LibCallsShrinkWrap.cpp
index d97cd7582eaa..fe93d6927c63 100644
--- a/lib/Transforms/Utils/LibCallsShrinkWrap.cpp
+++ b/lib/Transforms/Utils/LibCallsShrinkWrap.cpp
@@ -100,12 +100,12 @@ private:
bool perform(CallInst *CI);
void checkCandidate(CallInst &CI);
void shrinkWrapCI(CallInst *CI, Value *Cond);
- bool performCallDomainErrorOnly(CallInst *CI, const LibFunc::Func &Func);
- bool performCallErrors(CallInst *CI, const LibFunc::Func &Func);
- bool performCallRangeErrorOnly(CallInst *CI, const LibFunc::Func &Func);
- Value *generateOneRangeCond(CallInst *CI, const LibFunc::Func &Func);
- Value *generateTwoRangeCond(CallInst *CI, const LibFunc::Func &Func);
- Value *generateCondForPow(CallInst *CI, const LibFunc::Func &Func);
+ bool performCallDomainErrorOnly(CallInst *CI, const LibFunc &Func);
+ bool performCallErrors(CallInst *CI, const LibFunc &Func);
+ bool performCallRangeErrorOnly(CallInst *CI, const LibFunc &Func);
+ Value *generateOneRangeCond(CallInst *CI, const LibFunc &Func);
+ Value *generateTwoRangeCond(CallInst *CI, const LibFunc &Func);
+ Value *generateCondForPow(CallInst *CI, const LibFunc &Func);
// Create an OR of two conditions.
Value *createOrCond(CallInst *CI, CmpInst::Predicate Cmp, float Val,
@@ -141,44 +141,44 @@ private:
// Perform the transformation to calls with errno set by domain error.
bool LibCallsShrinkWrap::performCallDomainErrorOnly(CallInst *CI,
- const LibFunc::Func &Func) {
+ const LibFunc &Func) {
Value *Cond = nullptr;
switch (Func) {
- case LibFunc::acos: // DomainError: (x < -1 || x > 1)
- case LibFunc::acosf: // Same as acos
- case LibFunc::acosl: // Same as acos
- case LibFunc::asin: // DomainError: (x < -1 || x > 1)
- case LibFunc::asinf: // Same as asin
- case LibFunc::asinl: // Same as asin
+ case LibFunc_acos: // DomainError: (x < -1 || x > 1)
+ case LibFunc_acosf: // Same as acos
+ case LibFunc_acosl: // Same as acos
+ case LibFunc_asin: // DomainError: (x < -1 || x > 1)
+ case LibFunc_asinf: // Same as asin
+ case LibFunc_asinl: // Same as asin
{
++NumWrappedTwoCond;
Cond = createOrCond(CI, CmpInst::FCMP_OLT, -1.0f, CmpInst::FCMP_OGT, 1.0f);
break;
}
- case LibFunc::cos: // DomainError: (x == +inf || x == -inf)
- case LibFunc::cosf: // Same as cos
- case LibFunc::cosl: // Same as cos
- case LibFunc::sin: // DomainError: (x == +inf || x == -inf)
- case LibFunc::sinf: // Same as sin
- case LibFunc::sinl: // Same as sin
+ case LibFunc_cos: // DomainError: (x == +inf || x == -inf)
+ case LibFunc_cosf: // Same as cos
+ case LibFunc_cosl: // Same as cos
+ case LibFunc_sin: // DomainError: (x == +inf || x == -inf)
+ case LibFunc_sinf: // Same as sin
+ case LibFunc_sinl: // Same as sin
{
++NumWrappedTwoCond;
Cond = createOrCond(CI, CmpInst::FCMP_OEQ, INFINITY, CmpInst::FCMP_OEQ,
-INFINITY);
break;
}
- case LibFunc::acosh: // DomainError: (x < 1)
- case LibFunc::acoshf: // Same as acosh
- case LibFunc::acoshl: // Same as acosh
+ case LibFunc_acosh: // DomainError: (x < 1)
+ case LibFunc_acoshf: // Same as acosh
+ case LibFunc_acoshl: // Same as acosh
{
++NumWrappedOneCond;
Cond = createCond(CI, CmpInst::FCMP_OLT, 1.0f);
break;
}
- case LibFunc::sqrt: // DomainError: (x < 0)
- case LibFunc::sqrtf: // Same as sqrt
- case LibFunc::sqrtl: // Same as sqrt
+ case LibFunc_sqrt: // DomainError: (x < 0)
+ case LibFunc_sqrtf: // Same as sqrt
+ case LibFunc_sqrtl: // Same as sqrt
{
++NumWrappedOneCond;
Cond = createCond(CI, CmpInst::FCMP_OLT, 0.0f);
@@ -193,31 +193,31 @@ bool LibCallsShrinkWrap::performCallDomainErrorOnly(CallInst *CI,
// Perform the transformation to calls with errno set by range error.
bool LibCallsShrinkWrap::performCallRangeErrorOnly(CallInst *CI,
- const LibFunc::Func &Func) {
+ const LibFunc &Func) {
Value *Cond = nullptr;
switch (Func) {
- case LibFunc::cosh:
- case LibFunc::coshf:
- case LibFunc::coshl:
- case LibFunc::exp:
- case LibFunc::expf:
- case LibFunc::expl:
- case LibFunc::exp10:
- case LibFunc::exp10f:
- case LibFunc::exp10l:
- case LibFunc::exp2:
- case LibFunc::exp2f:
- case LibFunc::exp2l:
- case LibFunc::sinh:
- case LibFunc::sinhf:
- case LibFunc::sinhl: {
+ case LibFunc_cosh:
+ case LibFunc_coshf:
+ case LibFunc_coshl:
+ case LibFunc_exp:
+ case LibFunc_expf:
+ case LibFunc_expl:
+ case LibFunc_exp10:
+ case LibFunc_exp10f:
+ case LibFunc_exp10l:
+ case LibFunc_exp2:
+ case LibFunc_exp2f:
+ case LibFunc_exp2l:
+ case LibFunc_sinh:
+ case LibFunc_sinhf:
+ case LibFunc_sinhl: {
Cond = generateTwoRangeCond(CI, Func);
break;
}
- case LibFunc::expm1: // RangeError: (709, inf)
- case LibFunc::expm1f: // RangeError: (88, inf)
- case LibFunc::expm1l: // RangeError: (11356, inf)
+ case LibFunc_expm1: // RangeError: (709, inf)
+ case LibFunc_expm1f: // RangeError: (88, inf)
+ case LibFunc_expm1l: // RangeError: (11356, inf)
{
Cond = generateOneRangeCond(CI, Func);
break;
@@ -231,15 +231,15 @@ bool LibCallsShrinkWrap::performCallRangeErrorOnly(CallInst *CI,
// Perform the transformation to calls with errno set by combination of errors.
bool LibCallsShrinkWrap::performCallErrors(CallInst *CI,
- const LibFunc::Func &Func) {
+ const LibFunc &Func) {
Value *Cond = nullptr;
switch (Func) {
- case LibFunc::atanh: // DomainError: (x < -1 || x > 1)
+ case LibFunc_atanh: // DomainError: (x < -1 || x > 1)
// PoleError: (x == -1 || x == 1)
// Overall Cond: (x <= -1 || x >= 1)
- case LibFunc::atanhf: // Same as atanh
- case LibFunc::atanhl: // Same as atanh
+ case LibFunc_atanhf: // Same as atanh
+ case LibFunc_atanhl: // Same as atanh
{
if (!LibCallsShrinkWrapDoDomainError || !LibCallsShrinkWrapDoPoleError)
return false;
@@ -247,20 +247,20 @@ bool LibCallsShrinkWrap::performCallErrors(CallInst *CI,
Cond = createOrCond(CI, CmpInst::FCMP_OLE, -1.0f, CmpInst::FCMP_OGE, 1.0f);
break;
}
- case LibFunc::log: // DomainError: (x < 0)
+ case LibFunc_log: // DomainError: (x < 0)
// PoleError: (x == 0)
// Overall Cond: (x <= 0)
- case LibFunc::logf: // Same as log
- case LibFunc::logl: // Same as log
- case LibFunc::log10: // Same as log
- case LibFunc::log10f: // Same as log
- case LibFunc::log10l: // Same as log
- case LibFunc::log2: // Same as log
- case LibFunc::log2f: // Same as log
- case LibFunc::log2l: // Same as log
- case LibFunc::logb: // Same as log
- case LibFunc::logbf: // Same as log
- case LibFunc::logbl: // Same as log
+ case LibFunc_logf: // Same as log
+ case LibFunc_logl: // Same as log
+ case LibFunc_log10: // Same as log
+ case LibFunc_log10f: // Same as log
+ case LibFunc_log10l: // Same as log
+ case LibFunc_log2: // Same as log
+ case LibFunc_log2f: // Same as log
+ case LibFunc_log2l: // Same as log
+ case LibFunc_logb: // Same as log
+ case LibFunc_logbf: // Same as log
+ case LibFunc_logbl: // Same as log
{
if (!LibCallsShrinkWrapDoDomainError || !LibCallsShrinkWrapDoPoleError)
return false;
@@ -268,11 +268,11 @@ bool LibCallsShrinkWrap::performCallErrors(CallInst *CI,
Cond = createCond(CI, CmpInst::FCMP_OLE, 0.0f);
break;
}
- case LibFunc::log1p: // DomainError: (x < -1)
+ case LibFunc_log1p: // DomainError: (x < -1)
// PoleError: (x == -1)
// Overall Cond: (x <= -1)
- case LibFunc::log1pf: // Same as log1p
- case LibFunc::log1pl: // Same as log1p
+ case LibFunc_log1pf: // Same as log1p
+ case LibFunc_log1pl: // Same as log1p
{
if (!LibCallsShrinkWrapDoDomainError || !LibCallsShrinkWrapDoPoleError)
return false;
@@ -280,11 +280,11 @@ bool LibCallsShrinkWrap::performCallErrors(CallInst *CI,
Cond = createCond(CI, CmpInst::FCMP_OLE, -1.0f);
break;
}
- case LibFunc::pow: // DomainError: x < 0 and y is noninteger
+ case LibFunc_pow: // DomainError: x < 0 and y is noninteger
// PoleError: x == 0 and y < 0
// RangeError: overflow or underflow
- case LibFunc::powf:
- case LibFunc::powl: {
+ case LibFunc_powf:
+ case LibFunc_powl: {
if (!LibCallsShrinkWrapDoDomainError || !LibCallsShrinkWrapDoPoleError ||
!LibCallsShrinkWrapDoRangeError)
return false;
@@ -313,7 +313,7 @@ void LibCallsShrinkWrap::checkCandidate(CallInst &CI) {
if (!CI.use_empty())
return;
- LibFunc::Func Func;
+ LibFunc Func;
Function *Callee = CI.getCalledFunction();
if (!Callee)
return;
@@ -333,16 +333,16 @@ void LibCallsShrinkWrap::checkCandidate(CallInst &CI) {
// Generate the upper bound condition for RangeError.
Value *LibCallsShrinkWrap::generateOneRangeCond(CallInst *CI,
- const LibFunc::Func &Func) {
+ const LibFunc &Func) {
float UpperBound;
switch (Func) {
- case LibFunc::expm1: // RangeError: (709, inf)
+ case LibFunc_expm1: // RangeError: (709, inf)
UpperBound = 709.0f;
break;
- case LibFunc::expm1f: // RangeError: (88, inf)
+ case LibFunc_expm1f: // RangeError: (88, inf)
UpperBound = 88.0f;
break;
- case LibFunc::expm1l: // RangeError: (11356, inf)
+ case LibFunc_expm1l: // RangeError: (11356, inf)
UpperBound = 11356.0f;
break;
default:
@@ -355,57 +355,57 @@ Value *LibCallsShrinkWrap::generateOneRangeCond(CallInst *CI,
// Generate the lower and upper bound condition for RangeError.
Value *LibCallsShrinkWrap::generateTwoRangeCond(CallInst *CI,
- const LibFunc::Func &Func) {
+ const LibFunc &Func) {
float UpperBound, LowerBound;
switch (Func) {
- case LibFunc::cosh: // RangeError: (x < -710 || x > 710)
- case LibFunc::sinh: // Same as cosh
+ case LibFunc_cosh: // RangeError: (x < -710 || x > 710)
+ case LibFunc_sinh: // Same as cosh
LowerBound = -710.0f;
UpperBound = 710.0f;
break;
- case LibFunc::coshf: // RangeError: (x < -89 || x > 89)
- case LibFunc::sinhf: // Same as coshf
+ case LibFunc_coshf: // RangeError: (x < -89 || x > 89)
+ case LibFunc_sinhf: // Same as coshf
LowerBound = -89.0f;
UpperBound = 89.0f;
break;
- case LibFunc::coshl: // RangeError: (x < -11357 || x > 11357)
- case LibFunc::sinhl: // Same as coshl
+ case LibFunc_coshl: // RangeError: (x < -11357 || x > 11357)
+ case LibFunc_sinhl: // Same as coshl
LowerBound = -11357.0f;
UpperBound = 11357.0f;
break;
- case LibFunc::exp: // RangeError: (x < -745 || x > 709)
+ case LibFunc_exp: // RangeError: (x < -745 || x > 709)
LowerBound = -745.0f;
UpperBound = 709.0f;
break;
- case LibFunc::expf: // RangeError: (x < -103 || x > 88)
+ case LibFunc_expf: // RangeError: (x < -103 || x > 88)
LowerBound = -103.0f;
UpperBound = 88.0f;
break;
- case LibFunc::expl: // RangeError: (x < -11399 || x > 11356)
+ case LibFunc_expl: // RangeError: (x < -11399 || x > 11356)
LowerBound = -11399.0f;
UpperBound = 11356.0f;
break;
- case LibFunc::exp10: // RangeError: (x < -323 || x > 308)
+ case LibFunc_exp10: // RangeError: (x < -323 || x > 308)
LowerBound = -323.0f;
UpperBound = 308.0f;
break;
- case LibFunc::exp10f: // RangeError: (x < -45 || x > 38)
+ case LibFunc_exp10f: // RangeError: (x < -45 || x > 38)
LowerBound = -45.0f;
UpperBound = 38.0f;
break;
- case LibFunc::exp10l: // RangeError: (x < -4950 || x > 4932)
+ case LibFunc_exp10l: // RangeError: (x < -4950 || x > 4932)
LowerBound = -4950.0f;
UpperBound = 4932.0f;
break;
- case LibFunc::exp2: // RangeError: (x < -1074 || x > 1023)
+ case LibFunc_exp2: // RangeError: (x < -1074 || x > 1023)
LowerBound = -1074.0f;
UpperBound = 1023.0f;
break;
- case LibFunc::exp2f: // RangeError: (x < -149 || x > 127)
+ case LibFunc_exp2f: // RangeError: (x < -149 || x > 127)
LowerBound = -149.0f;
UpperBound = 127.0f;
break;
- case LibFunc::exp2l: // RangeError: (x < -16445 || x > 11383)
+ case LibFunc_exp2l: // RangeError: (x < -16445 || x > 11383)
LowerBound = -16445.0f;
UpperBound = 11383.0f;
break;
@@ -434,9 +434,9 @@ Value *LibCallsShrinkWrap::generateTwoRangeCond(CallInst *CI,
// (i.e. we might invoke the calls that will not set the errno.).
//
Value *LibCallsShrinkWrap::generateCondForPow(CallInst *CI,
- const LibFunc::Func &Func) {
- // FIXME: LibFunc::powf and powl TBD.
- if (Func != LibFunc::pow) {
+ const LibFunc &Func) {
+ // FIXME: LibFunc_powf and powl TBD.
+ if (Func != LibFunc_pow) {
DEBUG(dbgs() << "Not handled powf() and powl()\n");
return nullptr;
}
@@ -516,7 +516,7 @@ void LibCallsShrinkWrap::shrinkWrapCI(CallInst *CI, Value *Cond) {
// Perform the transformation to a single candidate.
bool LibCallsShrinkWrap::perform(CallInst *CI) {
- LibFunc::Func Func;
+ LibFunc Func;
Function *Callee = CI->getCalledFunction();
assert(Callee && "perform() should apply to a non-empty callee");
TLI.getLibFunc(*Callee, Func);
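In source terms, the pass guards a dead libcall, kept only for its errno side effect, with the exact condition under which errno can be set. A hand-written sketch of the resulting shape for sqrtf:

    #include <cmath>

    void use_errno_only(float x) {
      // Before: the call is dead except for errno, so it always executes.
      (void)sqrtf(x);
    }

    void shrink_wrapped(float x) {
      // After: only invoke the call when the domain error (x < 0) is possible.
      if (x < 0.0f)
        (void)sqrtf(x);
    }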
diff --git a/lib/Transforms/Utils/Local.cpp b/lib/Transforms/Utils/Local.cpp
index 6e4174aa0cda..18b29226c2ef 100644
--- a/lib/Transforms/Utils/Local.cpp
+++ b/lib/Transforms/Utils/Local.cpp
@@ -126,21 +126,20 @@ bool llvm::ConstantFoldTerminator(BasicBlock *BB, bool DeleteDeadConditions,
// If the default is unreachable, ignore it when searching for TheOnlyDest.
if (isa<UnreachableInst>(DefaultDest->getFirstNonPHIOrDbg()) &&
SI->getNumCases() > 0) {
- TheOnlyDest = SI->case_begin().getCaseSuccessor();
+ TheOnlyDest = SI->case_begin()->getCaseSuccessor();
}
// Figure out which case it goes to.
- for (SwitchInst::CaseIt i = SI->case_begin(), e = SI->case_end();
- i != e; ++i) {
+ for (auto i = SI->case_begin(), e = SI->case_end(); i != e;) {
// Found case matching a constant operand?
- if (i.getCaseValue() == CI) {
- TheOnlyDest = i.getCaseSuccessor();
+ if (i->getCaseValue() == CI) {
+ TheOnlyDest = i->getCaseSuccessor();
break;
}
// Check to see if this branch is going to the same place as the default
// dest. If so, eliminate it as an explicit compare.
- if (i.getCaseSuccessor() == DefaultDest) {
+ if (i->getCaseSuccessor() == DefaultDest) {
MDNode *MD = SI->getMetadata(LLVMContext::MD_prof);
unsigned NCases = SI->getNumCases();
// Fold the case metadata into the default if there will be any branches
@@ -154,7 +153,7 @@ bool llvm::ConstantFoldTerminator(BasicBlock *BB, bool DeleteDeadConditions,
Weights.push_back(CI->getValue().getZExtValue());
}
// Merge weight of this case to the default weight.
- unsigned idx = i.getCaseIndex();
+ unsigned idx = i->getCaseIndex();
Weights[0] += Weights[idx+1];
// Remove weight for this case.
std::swap(Weights[idx+1], Weights.back());
@@ -165,15 +164,19 @@ bool llvm::ConstantFoldTerminator(BasicBlock *BB, bool DeleteDeadConditions,
}
// Remove this entry.
DefaultDest->removePredecessor(SI->getParent());
- SI->removeCase(i);
- --i; --e;
+ i = SI->removeCase(i);
+ e = SI->case_end();
continue;
}
// Otherwise, check to see if the switch only branches to one destination.
// We do this by reseting "TheOnlyDest" to null when we find two non-equal
// destinations.
- if (i.getCaseSuccessor() != TheOnlyDest) TheOnlyDest = nullptr;
+ if (i->getCaseSuccessor() != TheOnlyDest)
+ TheOnlyDest = nullptr;
+
+ // Increment this iterator as we haven't removed the case.
+ ++i;
}
if (CI && !TheOnlyDest) {
@@ -209,7 +212,7 @@ bool llvm::ConstantFoldTerminator(BasicBlock *BB, bool DeleteDeadConditions,
if (SI->getNumCases() == 1) {
// Otherwise, we can fold this switch into a conditional branch
// instruction if it has only one non-default destination.
- SwitchInst::CaseIt FirstCase = SI->case_begin();
+ auto FirstCase = *SI->case_begin();
Value *Cond = Builder.CreateICmpEQ(SI->getCondition(),
FirstCase.getCaseValue(), "cond");
@@ -287,7 +290,15 @@ bool llvm::ConstantFoldTerminator(BasicBlock *BB, bool DeleteDeadConditions,
///
bool llvm::isInstructionTriviallyDead(Instruction *I,
const TargetLibraryInfo *TLI) {
- if (!I->use_empty() || isa<TerminatorInst>(I)) return false;
+ if (!I->use_empty())
+ return false;
+ return wouldInstructionBeTriviallyDead(I, TLI);
+}
+
+bool llvm::wouldInstructionBeTriviallyDead(Instruction *I,
+ const TargetLibraryInfo *TLI) {
+ if (isa<TerminatorInst>(I))
+ return false;
// We don't want the landingpad-like instructions removed by anything this
// general.
@@ -307,7 +318,8 @@ bool llvm::isInstructionTriviallyDead(Instruction *I,
return true;
}
- if (!I->mayHaveSideEffects()) return true;
+ if (!I->mayHaveSideEffects())
+ return true;
// Special case intrinsics that "may have side effects" but can be deleted
// when dead.
@@ -334,7 +346,8 @@ bool llvm::isInstructionTriviallyDead(Instruction *I,
}
}
- if (isAllocLikeFn(I, TLI)) return true;
+ if (isAllocLikeFn(I, TLI))
+ return true;
if (CallInst *CI = isFreeCall(I, TLI))
if (Constant *C = dyn_cast<Constant>(CI->getArgOperand(0)))
@@ -1075,11 +1088,11 @@ static bool PhiHasDebugValue(DILocalVariable *DIVar,
// Since we can't guarantee that the original dbg.declare instrinsic
// is removed by LowerDbgDeclare(), we need to make sure that we are
// not inserting the same dbg.value intrinsic over and over.
- DbgValueList DbgValues;
- FindAllocaDbgValues(DbgValues, APN);
- for (auto DVI : DbgValues) {
- assert (DVI->getValue() == APN);
- assert (DVI->getOffset() == 0);
+ SmallVector<DbgValueInst *, 1> DbgValues;
+ findDbgValues(DbgValues, APN);
+ for (auto *DVI : DbgValues) {
+ assert(DVI->getValue() == APN);
+ assert(DVI->getOffset() == 0);
if ((DVI->getVariable() == DIVar) && (DVI->getExpression() == DIExpr))
return true;
}
@@ -1241,9 +1254,7 @@ DbgDeclareInst *llvm::FindAllocaDbgDeclare(Value *V) {
return nullptr;
}
-/// FindAllocaDbgValues - Finds the llvm.dbg.value intrinsics describing the
-/// alloca 'V', if any.
-void llvm::FindAllocaDbgValues(DbgValueList &DbgValues, Value *V) {
+void llvm::findDbgValues(SmallVectorImpl<DbgValueInst *> &DbgValues, Value *V) {
if (auto *L = LocalAsMetadata::getIfExists(V))
if (auto *MDV = MetadataAsValue::getIfExists(V->getContext(), L))
for (User *U : MDV->users())
@@ -1251,36 +1262,32 @@ void llvm::FindAllocaDbgValues(DbgValueList &DbgValues, Value *V) {
DbgValues.push_back(DVI);
}
-static void DIExprAddDeref(SmallVectorImpl<uint64_t> &Expr) {
- Expr.push_back(dwarf::DW_OP_deref);
-}
-
-static void DIExprAddOffset(SmallVectorImpl<uint64_t> &Expr, int Offset) {
+static void appendOffset(SmallVectorImpl<uint64_t> &Ops, int64_t Offset) {
if (Offset > 0) {
- Expr.push_back(dwarf::DW_OP_plus);
- Expr.push_back(Offset);
+ Ops.push_back(dwarf::DW_OP_plus);
+ Ops.push_back(Offset);
} else if (Offset < 0) {
- Expr.push_back(dwarf::DW_OP_minus);
- Expr.push_back(-Offset);
+ Ops.push_back(dwarf::DW_OP_minus);
+ Ops.push_back(-Offset);
}
}
-static DIExpression *BuildReplacementDIExpr(DIBuilder &Builder,
- DIExpression *DIExpr, bool Deref,
- int Offset) {
+/// Prepend \p DIExpr with a deref and offset operation.
+static DIExpression *prependDIExpr(DIBuilder &Builder, DIExpression *DIExpr,
+ bool Deref, int64_t Offset) {
if (!Deref && !Offset)
return DIExpr;
// Create a copy of the original DIDescriptor for user variable, prepending
// "deref" operation to a list of address elements, as new llvm.dbg.declare
// will take a value storing address of the memory for variable, not
// alloca itself.
- SmallVector<uint64_t, 4> NewDIExpr;
+ SmallVector<uint64_t, 4> Ops;
if (Deref)
- DIExprAddDeref(NewDIExpr);
- DIExprAddOffset(NewDIExpr, Offset);
+ Ops.push_back(dwarf::DW_OP_deref);
+ appendOffset(Ops, Offset);
if (DIExpr)
- NewDIExpr.append(DIExpr->elements_begin(), DIExpr->elements_end());
- return Builder.createExpression(NewDIExpr);
+ Ops.append(DIExpr->elements_begin(), DIExpr->elements_end());
+ return Builder.createExpression(Ops);
}
bool llvm::replaceDbgDeclare(Value *Address, Value *NewAddress,
@@ -1294,7 +1301,7 @@ bool llvm::replaceDbgDeclare(Value *Address, Value *NewAddress,
auto *DIExpr = DDI->getExpression();
assert(DIVar && "Missing variable");
- DIExpr = BuildReplacementDIExpr(Builder, DIExpr, Deref, Offset);
+ DIExpr = prependDIExpr(Builder, DIExpr, Deref, Offset);
// Insert llvm.dbg.declare immediately after the original alloca, and remove
// old llvm.dbg.declare.
@@ -1326,11 +1333,11 @@ static void replaceOneDbgValueForAlloca(DbgValueInst *DVI, Value *NewAddress,
// Insert the offset immediately after the first deref.
// We could just change the offset argument of dbg.value, but it's unsigned...
if (Offset) {
- SmallVector<uint64_t, 4> NewDIExpr;
- DIExprAddDeref(NewDIExpr);
- DIExprAddOffset(NewDIExpr, Offset);
- NewDIExpr.append(DIExpr->elements_begin() + 1, DIExpr->elements_end());
- DIExpr = Builder.createExpression(NewDIExpr);
+ SmallVector<uint64_t, 4> Ops;
+ Ops.push_back(dwarf::DW_OP_deref);
+ appendOffset(Ops, Offset);
+ Ops.append(DIExpr->elements_begin() + 1, DIExpr->elements_end());
+ DIExpr = Builder.createExpression(Ops);
}
Builder.insertDbgValueIntrinsic(NewAddress, DVI->getOffset(), DIVar, DIExpr,
@@ -1349,6 +1356,53 @@ void llvm::replaceDbgValueForAlloca(AllocaInst *AI, Value *NewAllocaAddress,
}
}
+void llvm::salvageDebugInfo(Instruction &I) {
+ SmallVector<DbgValueInst *, 1> DbgValues;
+ auto &M = *I.getModule();
+
+ auto MDWrap = [&](Value *V) {
+ return MetadataAsValue::get(I.getContext(), ValueAsMetadata::get(V));
+ };
+
+ if (isa<BitCastInst>(&I)) {
+ findDbgValues(DbgValues, &I);
+ for (auto *DVI : DbgValues) {
+ // Bitcasts are entirely irrelevant for debug info. Rewrite the dbg.value
+ // to use the cast's source.
+ DVI->setOperand(0, MDWrap(I.getOperand(0)));
+ DEBUG(dbgs() << "SALVAGE: " << *DVI << '\n');
+ }
+ } else if (auto *GEP = dyn_cast<GetElementPtrInst>(&I)) {
+ findDbgValues(DbgValues, &I);
+ for (auto *DVI : DbgValues) {
+ unsigned BitWidth =
+ M.getDataLayout().getPointerSizeInBits(GEP->getPointerAddressSpace());
+ APInt Offset(BitWidth, 0);
+ // Rewrite a constant GEP into a DIExpression.
+ if (GEP->accumulateConstantOffset(M.getDataLayout(), Offset)) {
+ auto *DIExpr = DVI->getExpression();
+ DIBuilder DIB(M, /*AllowUnresolved*/ false);
+ // GEP offsets are i32 and thus always fit into an int64_t.
+ DIExpr = prependDIExpr(DIB, DIExpr, NoDeref, Offset.getSExtValue());
+ DVI->setOperand(0, MDWrap(I.getOperand(0)));
+ DVI->setOperand(3, MetadataAsValue::get(I.getContext(), DIExpr));
+ DEBUG(dbgs() << "SALVAGE: " << *DVI << '\n');
+ }
+ }
+ } else if (isa<LoadInst>(&I)) {
+ findDbgValues(DbgValues, &I);
+ for (auto *DVI : DbgValues) {
+ // Rewrite the load into DW_OP_deref.
+ auto *DIExpr = DVI->getExpression();
+ DIBuilder DIB(M, /*AllowUnresolved*/ false);
+ DIExpr = prependDIExpr(DIB, DIExpr, WithDeref, 0);
+ DVI->setOperand(0, MDWrap(I.getOperand(0)));
+ DVI->setOperand(3, MetadataAsValue::get(I.getContext(), DIExpr));
+ DEBUG(dbgs() << "SALVAGE: " << *DVI << '\n');
+ }
+ }
+}
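Each case above re-points the dbg.value at the instruction's operand and folds the lost computation into the DIExpression. The expression-building step in isolation, mirroring prependDIExpr with plain containers (opcode values are the standard DWARF ones):

    #include <cstdint>
    #include <vector>

    // DWARF expression opcodes (values from the DWARF specification).
    enum : uint64_t { DW_OP_deref = 0x06, DW_OP_minus = 0x1c,
                      DW_OP_plus = 0x22 };

    // Prepend a deref and/or constant offset to an existing expression.
    static std::vector<uint64_t> prependOps(const std::vector<uint64_t> &Expr,
                                            bool Deref, int64_t Offset) {
      std::vector<uint64_t> Ops;
      if (Deref)
        Ops.push_back(DW_OP_deref);
      if (Offset > 0) {
        Ops.push_back(DW_OP_plus);
        Ops.push_back(static_cast<uint64_t>(Offset));
      } else if (Offset < 0) {
        Ops.push_back(DW_OP_minus);
        Ops.push_back(static_cast<uint64_t>(-Offset));
      }
      Ops.insert(Ops.end(), Expr.begin(), Expr.end());
      return Ops;
    }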
+
unsigned llvm::removeAllNonTerminatorAndEHPadInstructions(BasicBlock *BB) {
unsigned NumDeadInst = 0;
// Delete the instructions backwards, as it has a reduced likelihood of
@@ -2068,9 +2122,9 @@ bool llvm::recognizeBSwapOrBitReverseIdiom(
void llvm::maybeMarkSanitizerLibraryCallNoBuiltin(
CallInst *CI, const TargetLibraryInfo *TLI) {
Function *F = CI->getCalledFunction();
- LibFunc::Func Func;
+ LibFunc Func;
if (F && !F->hasLocalLinkage() && F->hasName() &&
TLI->getLibFunc(F->getName(), Func) && TLI->hasOptimizedCodeGen(Func) &&
!F->doesNotAccessMemory())
- CI->addAttribute(AttributeSet::FunctionIndex, Attribute::NoBuiltin);
+ CI->addAttribute(AttributeList::FunctionIndex, Attribute::NoBuiltin);
}
diff --git a/lib/Transforms/Utils/LoopSimplify.cpp b/lib/Transforms/Utils/LoopSimplify.cpp
index 00cda2af00c6..e7ba19665d59 100644
--- a/lib/Transforms/Utils/LoopSimplify.cpp
+++ b/lib/Transforms/Utils/LoopSimplify.cpp
@@ -645,14 +645,7 @@ ReprocessLoop:
// loop-invariant instructions out of the way to open up more
// opportunities, and the disadvantage of having the responsibility
// to preserve dominator information.
- bool UniqueExit = true;
- if (!ExitBlocks.empty())
- for (unsigned i = 1, e = ExitBlocks.size(); i != e; ++i)
- if (ExitBlocks[i] != ExitBlocks[0]) {
- UniqueExit = false;
- break;
- }
- if (UniqueExit) {
+ if (ExitBlockSet.size() == 1) {
for (unsigned i = 0, e = ExitingBlocks.size(); i != e; ++i) {
BasicBlock *ExitingBlock = ExitingBlocks[i];
if (!ExitingBlock->getSinglePredecessor()) continue;
@@ -735,6 +728,17 @@ bool llvm::simplifyLoop(Loop *L, DominatorTree *DT, LoopInfo *LI,
bool PreserveLCSSA) {
bool Changed = false;
+#ifndef NDEBUG
+ // If we're asked to preserve LCSSA, the loop nest needs to start in LCSSA
+ // form.
+ if (PreserveLCSSA) {
+ assert(DT && "DT not available.");
+ assert(LI && "LI not available.");
+ assert(L->isRecursivelyLCSSAForm(*DT, *LI) &&
+ "Requested to preserve LCSSA, but it's already broken.");
+ }
+#endif
+
// Worklist maintains our depth-first queue of loops in this nest to process.
SmallVector<Loop *, 4> Worklist;
Worklist.push_back(L);
@@ -814,15 +818,6 @@ bool LoopSimplify::runOnFunction(Function &F) {
&getAnalysis<AssumptionCacheTracker>().getAssumptionCache(F);
bool PreserveLCSSA = mustPreserveAnalysisID(LCSSAID);
-#ifndef NDEBUG
- if (PreserveLCSSA) {
- assert(DT && "DT not available.");
- assert(LI && "LI not available.");
- bool InLCSSA = all_of(
- *LI, [&](Loop *L) { return L->isRecursivelyLCSSAForm(*DT, *LI); });
- assert(InLCSSA && "Requested to preserve LCSSA, but it's already broken.");
- }
-#endif
// Simplify each loop nest in the function.
for (LoopInfo::iterator I = LI->begin(), E = LI->end(); I != E; ++I)
@@ -846,17 +841,14 @@ PreservedAnalyses LoopSimplifyPass::run(Function &F,
ScalarEvolution *SE = AM.getCachedResult<ScalarEvolutionAnalysis>(F);
AssumptionCache *AC = &AM.getResult<AssumptionAnalysis>(F);
- // FIXME: This pass should verify that the loops on which it's operating
- // are in canonical SSA form, and that the pass itself preserves this form.
+ // Note that we don't preserve LCSSA in the new PM; if you need it, run
+ // LCSSA after simplifying the loops.
for (LoopInfo::iterator I = LI->begin(), E = LI->end(); I != E; ++I)
- Changed |= simplifyLoop(*I, DT, LI, SE, AC, true /* PreserveLCSSA */);
-
- // FIXME: We need to invalidate this to avoid PR28400. Is there a better
- // solution?
- AM.invalidate<ScalarEvolutionAnalysis>(F);
+ Changed |= simplifyLoop(*I, DT, LI, SE, AC, /*PreserveLCSSA*/ false);
if (!Changed)
return PreservedAnalyses::all();
+
PreservedAnalyses PA;
PA.preserve<DominatorTreeAnalysis>();
PA.preserve<LoopAnalysis>();
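Given the note above, a new-PM pipeline that still wants LCSSA schedules it explicitly after the simplification pass. A minimal sketch, assuming the standard pass headers:

    #include "llvm/IR/PassManager.h"
    #include "llvm/Transforms/Utils/LCSSA.h"
    #include "llvm/Transforms/Utils/LoopSimplify.h"

    using namespace llvm;

    // Canonicalize loop structure first, then restore LCSSA form.
    static FunctionPassManager makeLoopCanonicalizationPipeline() {
      FunctionPassManager FPM;
      FPM.addPass(LoopSimplifyPass());
      FPM.addPass(LCSSAPass());
      return FPM;
    }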
diff --git a/lib/Transforms/Utils/LoopUnroll.cpp b/lib/Transforms/Utils/LoopUnroll.cpp
index e346ebd6a000..3c669ce644e2 100644
--- a/lib/Transforms/Utils/LoopUnroll.cpp
+++ b/lib/Transforms/Utils/LoopUnroll.cpp
@@ -27,6 +27,7 @@
#include "llvm/Analysis/ScalarEvolution.h"
#include "llvm/IR/BasicBlock.h"
#include "llvm/IR/DataLayout.h"
+#include "llvm/IR/DebugInfoMetadata.h"
#include "llvm/IR/Dominators.h"
#include "llvm/IR/IntrinsicInst.h"
#include "llvm/IR/LLVMContext.h"
@@ -51,6 +52,16 @@ UnrollRuntimeEpilog("unroll-runtime-epilog", cl::init(false), cl::Hidden,
cl::desc("Allow runtime unrolled loops to be unrolled "
"with epilog instead of prolog."));
+static cl::opt<bool>
+UnrollVerifyDomtree("unroll-verify-domtree", cl::Hidden,
+ cl::desc("Verify domtree after unrolling"),
+#ifdef NDEBUG
+ cl::init(false)
+#else
+ cl::init(true)
+#endif
+ );
+
/// Convert the instruction operands from referencing the current values into
/// those specified by VMap.
static inline void remapInstruction(Instruction *I,
@@ -205,6 +216,45 @@ const Loop* llvm::addClonedBlockToLoopInfo(BasicBlock *OriginalBB,
}
}
+/// The function chooses which type of unroll (epilog or prolog) is more
+/// profitable.
+/// Epilog unroll is more profitable when there is a PHI that starts from a
+/// constant: in this case the epilog will leave the PHI starting from the
+/// constant, whereas the prolog will convert it to a non-constant.
+///
+/// loop:
+/// PN = PHI [I, Latch], [CI, PreHeader]
+/// I = foo(PN)
+/// ...
+///
+/// Epilog unroll case.
+/// loop:
+/// PN = PHI [I2, Latch], [CI, PreHeader]
+/// I1 = foo(PN)
+/// I2 = foo(I1)
+/// ...
+/// Prolog unroll case.
+/// NewPN = PHI [PrologI, Prolog], [CI, PreHeader]
+/// loop:
+/// PN = PHI [I2, Latch], [NewPN, PreHeader]
+/// I1 = foo(PN)
+/// I2 = foo(I1)
+/// ...
+///
+static bool isEpilogProfitable(Loop *L) {
+ BasicBlock *PreHeader = L->getLoopPreheader();
+ BasicBlock *Header = L->getHeader();
+ assert(PreHeader && Header);
+ for (Instruction &BBI : *Header) {
+ PHINode *PN = dyn_cast<PHINode>(&BBI);
+ if (!PN)
+ break;
+ if (isa<ConstantInt>(PN->getIncomingValueForBlock(PreHeader)))
+ return true;
+ }
+ return false;
+}
+
/// Unroll the given loop by Count. The loop must be in LCSSA form. Returns true
/// if unrolling was successful, or false if the loop was unmodified. Unrolling
/// can only fail when the loop's latch block is not terminated by a conditional
@@ -296,8 +346,10 @@ bool llvm::UnrollLoop(Loop *L, unsigned Count, unsigned TripCount, bool Force,
Count = TripCount;
// Don't enter the unroll code if there is nothing to do.
- if (TripCount == 0 && Count < 2 && PeelCount == 0)
+ if (TripCount == 0 && Count < 2 && PeelCount == 0) {
+ DEBUG(dbgs() << "Won't unroll; almost nothing to do\n");
return false;
+ }
assert(Count > 0);
assert(TripMultiple > 0);
@@ -330,7 +382,7 @@ bool llvm::UnrollLoop(Loop *L, unsigned Count, unsigned TripCount, bool Force,
"and peeling for the same loop");
if (PeelCount)
- peelLoop(L, PeelCount, LI, SE, DT, PreserveLCSSA);
+ peelLoop(L, PeelCount, LI, SE, DT, AC, PreserveLCSSA);
// Loops containing convergent instructions must have a count that divides
// their TripMultiple.
@@ -346,14 +398,22 @@ bool llvm::UnrollLoop(Loop *L, unsigned Count, unsigned TripCount, bool Force,
"convergent operation.");
});
+ bool EpilogProfitability =
+ UnrollRuntimeEpilog.getNumOccurrences() ? UnrollRuntimeEpilog
+ : isEpilogProfitable(L);
+
if (RuntimeTripCount && TripMultiple % Count != 0 &&
!UnrollRuntimeLoopRemainder(L, Count, AllowExpensiveTripCount,
- UnrollRuntimeEpilog, LI, SE, DT,
+ EpilogProfitability, LI, SE, DT,
PreserveLCSSA)) {
if (Force)
RuntimeTripCount = false;
- else
+ else {
+ DEBUG(
+ dbgs() << "Wont unroll; remainder loop could not be generated"
+ "when assuming runtime trip count\n");
return false;
+ }
}
// Notify ScalarEvolution that the loop will be substantially changed,
@@ -446,6 +506,12 @@ bool llvm::UnrollLoop(Loop *L, unsigned Count, unsigned TripCount, bool Force,
for (Loop *SubLoop : *L)
LoopsToSimplify.insert(SubLoop);
+ if (Header->getParent()->isDebugInfoForProfiling())
+ for (BasicBlock *BB : L->getBlocks())
+ for (Instruction &I : *BB)
+ if (const DILocation *DIL = I.getDebugLoc())
+ I.setDebugLoc(DIL->cloneWithDuplicationFactor(Count));
+
for (unsigned It = 1; It != Count; ++It) {
std::vector<BasicBlock*> NewBlocks;
SmallDenseMap<const Loop *, Loop *, 4> NewLoops;
@@ -456,19 +522,16 @@ bool llvm::UnrollLoop(Loop *L, unsigned Count, unsigned TripCount, bool Force,
BasicBlock *New = CloneBasicBlock(*BB, VMap, "." + Twine(It));
Header->getParent()->getBasicBlockList().push_back(New);
+ assert((*BB != Header || LI->getLoopFor(*BB) == L) &&
+ "Header should not be in a sub-loop");
// Tell LI about New.
- if (*BB == Header) {
- assert(LI->getLoopFor(*BB) == L && "Header should not be in a sub-loop");
- L->addBasicBlockToLoop(New, *LI);
- } else {
- const Loop *OldLoop = addClonedBlockToLoopInfo(*BB, New, LI, NewLoops);
- if (OldLoop) {
- LoopsToSimplify.insert(NewLoops[OldLoop]);
+ const Loop *OldLoop = addClonedBlockToLoopInfo(*BB, New, LI, NewLoops);
+ if (OldLoop) {
+ LoopsToSimplify.insert(NewLoops[OldLoop]);
- // Forget the old loop, since its inputs may have changed.
- if (SE)
- SE->forgetLoop(OldLoop);
- }
+ // Forget the old loop, since its inputs may have changed.
+ if (SE)
+ SE->forgetLoop(OldLoop);
}
if (*BB == Header)
@@ -615,14 +678,11 @@ bool llvm::UnrollLoop(Loop *L, unsigned Count, unsigned TripCount, bool Force,
Term->eraseFromParent();
}
}
+
// Update dominators of blocks we might reach through exits.
// Immediate dominator of such block might change, because we add more
// routes which can lead to the exit: we can now reach it from the copied
- // iterations too. Thus, the new idom of the block will be the nearest
- // common dominator of the previous idom and common dominator of all copies of
- // the previous idom. This is equivalent to the nearest common dominator of
- // the previous idom and the first latch, which dominates all copies of the
- // previous idom.
+ // iterations too.
if (DT && Count > 1) {
for (auto *BB : OriginalLoopBlocks) {
auto *BBDomNode = DT->getNode(BB);
@@ -632,12 +692,38 @@ bool llvm::UnrollLoop(Loop *L, unsigned Count, unsigned TripCount, bool Force,
if (!L->contains(ChildBB))
ChildrenToUpdate.push_back(ChildBB);
}
- BasicBlock *NewIDom = DT->findNearestCommonDominator(BB, Latches[0]);
+ BasicBlock *NewIDom;
+ if (BB == LatchBlock) {
+ // The latch is special because we emit unconditional branches in
+ // some cases where the original loop contained a conditional branch.
+ // Since the latch is always at the bottom of the loop, if the latch
+ // dominated an exit before unrolling, the new dominator of that exit
+ // must also be a latch. Specifically, the dominator is the first
+ // latch which ends in a conditional branch, or the last latch if
+ // there is no such latch.
+ NewIDom = Latches.back();
+ for (BasicBlock *IterLatch : Latches) {
+ TerminatorInst *Term = IterLatch->getTerminator();
+ if (isa<BranchInst>(Term) && cast<BranchInst>(Term)->isConditional()) {
+ NewIDom = IterLatch;
+ break;
+ }
+ }
+ } else {
+ // The new idom of the block will be the nearest common dominator
+ // of all copies of the previous idom. This is equivalent to the
+ // nearest common dominator of the previous idom and the first latch,
+ // which dominates all copies of the previous idom.
+ NewIDom = DT->findNearestCommonDominator(BB, LatchBlock);
+ }
for (auto *ChildBB : ChildrenToUpdate)
DT->changeImmediateDominator(ChildBB, NewIDom);
}
}
+ if (DT && UnrollVerifyDomtree)
+ DT->verifyDomTree();
+
// Merge adjacent basic blocks, if possible.
SmallPtrSet<Loop *, 4> ForgottenLoops;
for (BasicBlock *Latch : Latches) {
@@ -655,13 +741,6 @@ bool llvm::UnrollLoop(Loop *L, unsigned Count, unsigned TripCount, bool Force,
}
}
- // FIXME: We only preserve DT info for complete unrolling now. Incrementally
- // updating domtree after partial loop unrolling should also be easy.
- if (DT && !CompletelyUnroll)
- DT->recalculate(*L->getHeader()->getParent());
- else if (DT)
- DEBUG(DT->verifyDomTree());
-
// Simplify any new induction variables in the partially unrolled loop.
if (SE && !CompletelyUnroll && Count > 1) {
SmallVector<WeakVH, 16> DeadInsts;
@@ -721,29 +800,29 @@ bool llvm::UnrollLoop(Loop *L, unsigned Count, unsigned TripCount, bool Force,
// at least one layer outside of the loop that was unrolled so that any
// changes to the parent loop exposed by the unrolling are considered.
if (DT) {
- if (!OuterL && !CompletelyUnroll)
- OuterL = L;
if (OuterL) {
// OuterL includes all loops for which we can break loop-simplify, so
// it's sufficient to simplify only it (it'll recursively simplify inner
// loops too).
+ if (NeedToFixLCSSA) {
+ // LCSSA must be performed on the outermost affected loop. The unrolled
+ // loop's last loop latch is guaranteed to be in the outermost loop
+ // after LoopInfo's been updated by markAsRemoved.
+ Loop *LatchLoop = LI->getLoopFor(Latches.back());
+ Loop *FixLCSSALoop = OuterL;
+ if (!FixLCSSALoop->contains(LatchLoop))
+ while (FixLCSSALoop->getParentLoop() != LatchLoop)
+ FixLCSSALoop = FixLCSSALoop->getParentLoop();
+
+ formLCSSARecursively(*FixLCSSALoop, *DT, LI, SE);
+ } else if (PreserveLCSSA) {
+ assert(OuterL->isLCSSAForm(*DT) &&
+ "Loops should be in LCSSA form after loop-unroll.");
+ }
+
// TODO: That potentially might be compile-time expensive. We should try
// to fix the loop-simplified form incrementally.
simplifyLoop(OuterL, DT, LI, SE, AC, PreserveLCSSA);
-
- // LCSSA must be performed on the outermost affected loop. The unrolled
- // loop's last loop latch is guaranteed to be in the outermost loop after
- // LoopInfo's been updated by markAsRemoved.
- Loop *LatchLoop = LI->getLoopFor(Latches.back());
- if (!OuterL->contains(LatchLoop))
- while (OuterL->getParentLoop() != LatchLoop)
- OuterL = OuterL->getParentLoop();
-
- if (NeedToFixLCSSA)
- formLCSSARecursively(*OuterL, *DT, LI, SE);
- else
- assert(OuterL->isLCSSAForm(*DT) &&
- "Loops should be in LCSSA form after loop-unroll.");
} else {
// Simplify loops for which we might've broken loop-simplify form.
for (Loop *SubLoop : LoopsToSimplify)
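Earlier in this file, the epilog/prolog decision honors an explicit -unroll-runtime-epilog setting and only otherwise consults isEpilogProfitable. The occurrence-count pattern in isolation (the flag name here is made up for illustration):

    #include "llvm/Support/CommandLine.h"

    static llvm::cl::opt<bool> DemoEpilog("demo-epilog",
                                          llvm::cl::init(false));

    // Prefer what the user asked for on the command line; fall back to the
    // heuristic only when the flag never appeared.
    static bool chooseEpilog(bool HeuristicSaysEpilog) {
      return DemoEpilog.getNumOccurrences() ? DemoEpilog : HeuristicSaysEpilog;
    }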
diff --git a/lib/Transforms/Utils/LoopUnrollPeel.cpp b/lib/Transforms/Utils/LoopUnrollPeel.cpp
index 842cf31f2e3d..73c14f5606b7 100644
--- a/lib/Transforms/Utils/LoopUnrollPeel.cpp
+++ b/lib/Transforms/Utils/LoopUnrollPeel.cpp
@@ -28,6 +28,7 @@
#include "llvm/Transforms/Scalar.h"
#include "llvm/Transforms/Utils/BasicBlockUtils.h"
#include "llvm/Transforms/Utils/Cloning.h"
+#include "llvm/Transforms/Utils/LoopSimplify.h"
#include "llvm/Transforms/Utils/LoopUtils.h"
#include "llvm/Transforms/Utils/UnrollLoop.h"
#include <algorithm>
@@ -55,12 +56,20 @@ static bool canPeel(Loop *L) {
if (!L->getExitingBlock() || !L->getUniqueExitBlock())
return false;
+ // Don't try to peel loops where the latch is not the exiting block.
+ // This can be an indication of two different things:
+ // 1) The loop is not rotated.
+ // 2) The loop contains irreducible control flow that involves the latch.
+ if (L->getLoopLatch() != L->getExitingBlock())
+ return false;
+
return true;
}
// Return the number of iterations we want to peel off.
void llvm::computePeelCount(Loop *L, unsigned LoopSize,
- TargetTransformInfo::UnrollingPreferences &UP) {
+ TargetTransformInfo::UnrollingPreferences &UP,
+ unsigned &TripCount) {
UP.PeelCount = 0;
if (!canPeel(L))
return;
@@ -69,6 +78,39 @@ void llvm::computePeelCount(Loop *L, unsigned LoopSize,
if (!L->empty())
return;
+ // Try to find a Phi node that takes a loop-invariant value as the input
+ // from its only back edge. If there is such a Phi, peeling one iteration
+ // from the loop is profitable, because starting from the 2nd iteration we
+ // will have the invariant instead of this Phi.
+ if (LoopSize <= UP.Threshold) {
+ BasicBlock *BackEdge = L->getLoopLatch();
+ assert(BackEdge && "Loop is not in simplified form?");
+ BasicBlock *Header = L->getHeader();
+ // Iterate over Phis to find one with invariant input on back edge.
+ bool FoundCandidate = false;
+ PHINode *Phi;
+ for (auto BI = Header->begin(); isa<PHINode>(&*BI); ++BI) {
+ Phi = cast<PHINode>(&*BI);
+ Value *Input = Phi->getIncomingValueForBlock(BackEdge);
+ if (L->isLoopInvariant(Input)) {
+ FoundCandidate = true;
+ break;
+ }
+ }
+ if (FoundCandidate) {
+ DEBUG(dbgs() << "Peel one iteration to get rid of " << *Phi
+ << " because starting from 2nd iteration it is always"
+ << " an invariant\n");
+ UP.PeelCount = 1;
+ return;
+ }
+ }
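In source terms, the heuristic targets a loop like the following; after peeling one iteration the PHI disappears and the body reads the invariant directly (sketch):

    void use(int);  // opaque sink, for illustration

    // Before peeling: the header PHI 'p' carries 'init' only on iteration 0,
    // and the loop-invariant 'inv' on every back edge.
    void before(int n, int init, int inv) {
      int p = init;
      for (int i = 0; i < n; ++i) {
        use(p);
        p = inv;
      }
    }

    // After peeling one iteration, the PHI is gone: from the 2nd iteration
    // on, the loop body uses the invariant directly.
    void after(int n, int init, int inv) {
      if (n > 0) {
        use(init);             // peeled first iteration
        for (int i = 1; i < n; ++i)
          use(inv);
      }
    }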
+
+ // Bail out if we know the statically calculated trip count.
+ // In that case we would rather use partial unrolling.
+ if (TripCount)
+ return;
+
// If the user provided a peel count, use that.
bool UserPeelCount = UnrollForcePeelCount.getNumOccurrences() > 0;
if (UserPeelCount) {
@@ -164,7 +206,8 @@ static void cloneLoopBlocks(Loop *L, unsigned IterNumber, BasicBlock *InsertTop,
BasicBlock *InsertBot, BasicBlock *Exit,
SmallVectorImpl<BasicBlock *> &NewBlocks,
LoopBlocksDFS &LoopBlocks, ValueToValueMapTy &VMap,
- ValueToValueMapTy &LVMap, LoopInfo *LI) {
+ ValueToValueMapTy &LVMap, DominatorTree *DT,
+ LoopInfo *LI) {
BasicBlock *Header = L->getHeader();
BasicBlock *Latch = L->getLoopLatch();
@@ -185,6 +228,17 @@ static void cloneLoopBlocks(Loop *L, unsigned IterNumber, BasicBlock *InsertTop,
ParentLoop->addBasicBlockToLoop(NewBB, *LI);
VMap[*BB] = NewBB;
+
+ // If dominator tree is available, insert nodes to represent cloned blocks.
+ if (DT) {
+ if (Header == *BB)
+ DT->addNewBlock(NewBB, InsertTop);
+ else {
+ DomTreeNode *IDom = DT->getNode(*BB)->getIDom();
+ // VMap must contain an entry for IDom, as the iteration order is RPO.
+ DT->addNewBlock(NewBB, cast<BasicBlock>(VMap[IDom->getBlock()]));
+ }
+ }
}
// Hook-up the control flow for the newly inserted blocks.
@@ -198,11 +252,13 @@ static void cloneLoopBlocks(Loop *L, unsigned IterNumber, BasicBlock *InsertTop,
// The backedge now goes to the "bottom", which is either the loop's real
// header (for the last peeled iteration) or the copied header of the next
// iteration (for every other iteration)
- BranchInst *LatchBR =
- cast<BranchInst>(cast<BasicBlock>(VMap[Latch])->getTerminator());
+ BasicBlock *NewLatch = cast<BasicBlock>(VMap[Latch]);
+ BranchInst *LatchBR = cast<BranchInst>(NewLatch->getTerminator());
unsigned HeaderIdx = (LatchBR->getSuccessor(0) == Header ? 0 : 1);
LatchBR->setSuccessor(HeaderIdx, InsertBot);
LatchBR->setSuccessor(1 - HeaderIdx, Exit);
+ if (DT)
+ DT->changeImmediateDominator(InsertBot, NewLatch);
// The new copy of the loop body starts with a bunch of PHI nodes
// that pick an incoming value from either the preheader, or the previous
@@ -257,7 +313,7 @@ static void cloneLoopBlocks(Loop *L, unsigned IterNumber, BasicBlock *InsertTop,
/// optimizations.
bool llvm::peelLoop(Loop *L, unsigned PeelCount, LoopInfo *LI,
ScalarEvolution *SE, DominatorTree *DT,
- bool PreserveLCSSA) {
+ AssumptionCache *AC, bool PreserveLCSSA) {
if (!canPeel(L))
return false;
@@ -358,7 +414,24 @@ bool llvm::peelLoop(Loop *L, unsigned PeelCount, LoopInfo *LI,
CurHeaderWeight = 1;
cloneLoopBlocks(L, Iter, InsertTop, InsertBot, Exit,
- NewBlocks, LoopBlocks, VMap, LVMap, LI);
+ NewBlocks, LoopBlocks, VMap, LVMap, DT, LI);
+
+ // Remap to use values from the current iteration instead of the
+ // previous one.
+ remapInstructionsInBlocks(NewBlocks, VMap);
+
+ if (DT) {
+ // The latches of the cloned iterations dominate the loop exit, so the
+ // idom of the latter is the latch of the first cloned iteration, just as
+ // the original preheader dominates the original loop body.
+ if (Iter == 0)
+ DT->changeImmediateDominator(Exit, cast<BasicBlock>(LVMap[Latch]));
+#ifndef NDEBUG
+ if (VerifyDomInfo)
+ DT->verifyDomTree();
+#endif
+ }
+
updateBranchWeights(InsertBot, cast<BranchInst>(VMap[LatchBR]), Iter,
PeelCount, ExitWeight);
@@ -369,10 +442,6 @@ bool llvm::peelLoop(Loop *L, unsigned PeelCount, LoopInfo *LI,
F->getBasicBlockList().splice(InsertTop->getIterator(),
F->getBasicBlockList(),
NewBlocks[0]->getIterator(), F->end());
-
- // Remap to use values from the current iteration instead of the
- // previous one.
- remapInstructionsInBlocks(NewBlocks, VMap);
}
// Now adjust the phi nodes in the loop header to get their initial values
@@ -405,9 +474,16 @@ bool llvm::peelLoop(Loop *L, unsigned PeelCount, LoopInfo *LI,
}
// If the loop is nested, we changed the parent loop, update SE.
- if (Loop *ParentLoop = L->getParentLoop())
+ if (Loop *ParentLoop = L->getParentLoop()) {
SE->forgetLoop(ParentLoop);
+ // FIXME: Incrementally update loop-simplify
+ simplifyLoop(ParentLoop, DT, LI, SE, AC, PreserveLCSSA);
+ } else {
+ // FIXME: Incrementally update loop-simplify
+ simplifyLoop(L, DT, LI, SE, AC, PreserveLCSSA);
+ }
+
NumPeeled++;
return true;
diff --git a/lib/Transforms/Utils/LoopUnrollRuntime.cpp b/lib/Transforms/Utils/LoopUnrollRuntime.cpp
index d3ea1564115b..85db734fb182 100644
--- a/lib/Transforms/Utils/LoopUnrollRuntime.cpp
+++ b/lib/Transforms/Utils/LoopUnrollRuntime.cpp
@@ -146,6 +146,8 @@ static void ConnectProlog(Loop *L, Value *BECount, unsigned Count,
// Add the branch to the exit block (around the unrolled loop)
B.CreateCondBr(BrLoopExit, Exit, NewPreHeader);
InsertPt->eraseFromParent();
+ if (DT)
+ DT->changeImmediateDominator(Exit, PrologExit);
}
/// Connect the unrolling epilog code to the original loop.
@@ -260,13 +262,20 @@ static void ConnectEpilog(Loop *L, Value *ModVal, BasicBlock *NewExit,
IRBuilder<> B(InsertPt);
Value *BrLoopExit = B.CreateIsNotNull(ModVal, "lcmp.mod");
assert(Exit && "Loop must have a single exit block only");
- // Split the exit to maintain loop canonicalization guarantees
+ // Split the epilogue exit to maintain loop canonicalization guarantees
SmallVector<BasicBlock*, 4> Preds(predecessors(Exit));
SplitBlockPredecessors(Exit, Preds, ".epilog-lcssa", DT, LI,
PreserveLCSSA);
// Add the branch to the exit block (around the unrolling loop)
B.CreateCondBr(BrLoopExit, EpilogPreHeader, Exit);
InsertPt->eraseFromParent();
+ if (DT)
+ DT->changeImmediateDominator(Exit, NewExit);
+
+ // Split the main loop exit to maintain canonicalization guarantees.
+ SmallVector<BasicBlock*, 4> NewExitPreds{Latch};
+ SplitBlockPredecessors(NewExit, NewExitPreds, ".loopexit", DT, LI,
+ PreserveLCSSA);
}
/// Create a clone of the blocks in a loop and connect them together.
@@ -284,27 +293,17 @@ static void CloneLoopBlocks(Loop *L, Value *NewIter,
BasicBlock *Preheader,
std::vector<BasicBlock *> &NewBlocks,
LoopBlocksDFS &LoopBlocks, ValueToValueMapTy &VMap,
- LoopInfo *LI) {
+ DominatorTree *DT, LoopInfo *LI) {
StringRef suffix = UseEpilogRemainder ? "epil" : "prol";
BasicBlock *Header = L->getHeader();
BasicBlock *Latch = L->getLoopLatch();
Function *F = Header->getParent();
LoopBlocksDFS::RPOIterator BlockBegin = LoopBlocks.beginRPO();
LoopBlocksDFS::RPOIterator BlockEnd = LoopBlocks.endRPO();
- Loop *NewLoop = nullptr;
Loop *ParentLoop = L->getParentLoop();
- if (CreateRemainderLoop) {
- NewLoop = new Loop();
- if (ParentLoop)
- ParentLoop->addChildLoop(NewLoop);
- else
- LI->addTopLevelLoop(NewLoop);
- }
-
NewLoopsMap NewLoops;
- if (NewLoop)
- NewLoops[L] = NewLoop;
- else if (ParentLoop)
+ NewLoops[ParentLoop] = ParentLoop;
+ if (!CreateRemainderLoop)
NewLoops[L] = ParentLoop;
// For each block in the original loop, create a new copy,
@@ -312,7 +311,7 @@ static void CloneLoopBlocks(Loop *L, Value *NewIter,
for (LoopBlocksDFS::RPOIterator BB = BlockBegin; BB != BlockEnd; ++BB) {
BasicBlock *NewBB = CloneBasicBlock(*BB, VMap, "." + suffix, F);
NewBlocks.push_back(NewBB);
-
+
// If we're unrolling the outermost loop, there's no remainder loop,
// and this block isn't in a nested loop, then the new block is not
// in any loop. Otherwise, add it to loopinfo.
@@ -326,6 +325,17 @@ static void CloneLoopBlocks(Loop *L, Value *NewIter,
InsertTop->getTerminator()->setSuccessor(0, NewBB);
}
+ if (DT) {
+ if (Header == *BB) {
+ // The header is dominated by the preheader.
+ DT->addNewBlock(NewBB, InsertTop);
+ } else {
+ // Copy information from original loop to unrolled loop.
+ BasicBlock *IDomBB = DT->getNode(*BB)->getIDom()->getBlock();
+ DT->addNewBlock(NewBB, cast<BasicBlock>(VMap[IDomBB]));
+ }
+ }
+
if (Latch == *BB) {
// For the last block, if CreateRemainderLoop is false, create a direct
// jump to InsertBot. If not, create a loop back to cloned head.
@@ -376,7 +386,9 @@ static void CloneLoopBlocks(Loop *L, Value *NewIter,
NewPHI->setIncomingValue(idx, V);
}
}
- if (NewLoop) {
+ if (CreateRemainderLoop) {
+ Loop *NewLoop = NewLoops[L];
+ assert(NewLoop && "L should have been cloned");
// Add unroll disable metadata to disable future unrolling for this loop.
SmallVector<Metadata *, 4> MDs;
// Reserve first location for self reference to the LoopID metadata node.
@@ -599,6 +611,12 @@ bool llvm::UnrollRuntimeLoopRemainder(Loop *L, unsigned Count,
// Branch to either remainder (extra iterations) loop or unrolling loop.
B.CreateCondBr(BranchVal, RemainderLoop, UnrollingLoop);
PreHeaderBR->eraseFromParent();
+ if (DT) {
+ if (UseEpilogRemainder)
+ DT->changeImmediateDominator(NewExit, PreHeader);
+ else
+ DT->changeImmediateDominator(PrologExit, PreHeader);
+ }
Function *F = Header->getParent();
// Get an ordered list of blocks in the loop to help with the ordering of the
// cloned blocks in the prolog/epilog code
@@ -623,7 +641,7 @@ bool llvm::UnrollRuntimeLoopRemainder(Loop *L, unsigned Count,
BasicBlock *InsertBot = UseEpilogRemainder ? Exit : PrologExit;
BasicBlock *InsertTop = UseEpilogRemainder ? EpilogPreHeader : PrologPreHeader;
CloneLoopBlocks(L, ModVal, CreateRemainderLoop, UseEpilogRemainder, InsertTop,
- InsertBot, NewPreHeader, NewBlocks, LoopBlocks, VMap, LI);
+ InsertBot, NewPreHeader, NewBlocks, LoopBlocks, VMap, DT, LI);
// Insert the cloned blocks into the function.
F->getBasicBlockList().splice(InsertBot->getIterator(),
diff --git a/lib/Transforms/Utils/LoopUtils.cpp b/lib/Transforms/Utils/LoopUtils.cpp
index c8efa9efc7f3..175d013a011d 100644
--- a/lib/Transforms/Utils/LoopUtils.cpp
+++ b/lib/Transforms/Utils/LoopUtils.cpp
@@ -230,8 +230,9 @@ bool RecurrenceDescriptor::AddReductionVar(PHINode *Phi, RecurrenceKind Kind,
// - PHI:
// - All uses of the PHI must be the reduction (safe).
// - Otherwise, not safe.
- // - By one instruction outside of the loop (safe).
- // - By further instructions outside of the loop (not safe).
+ // - By instructions outside of the loop (safe).
+ // * One value may have several outside users, but all outside
+ // uses must be of the same value.
// - By an instruction that is not part of the reduction (not safe).
// This is either:
// * An instruction type other than PHI or the reduction operation.
@@ -297,10 +298,15 @@ bool RecurrenceDescriptor::AddReductionVar(PHINode *Phi, RecurrenceKind Kind,
// Check if we found the exit user.
BasicBlock *Parent = UI->getParent();
if (!TheLoop->contains(Parent)) {
- // Exit if you find multiple outside users or if the header phi node is
- // being used. In this case the user uses the value of the previous
- // iteration, in which case we would loose "VF-1" iterations of the
- // reduction operation if we vectorize.
+ // If we already know this instruction is used externally, move on to
+ // the next user.
+ if (ExitInstruction == Cur)
+ continue;
+
+ // Exit if you find multiple values used outside or if the header phi
+ // node is being used. In this case the user uses the value of the
+ // previous iteration, in which case we would lose "VF-1" iterations of
+ // the reduction operation if we vectorize.
if (ExitInstruction != nullptr || Cur == Phi)
return false;
@@ -547,13 +553,14 @@ bool RecurrenceDescriptor::isFirstOrderRecurrence(PHINode *Phi, Loop *TheLoop,
if (!Previous || !TheLoop->contains(Previous) || isa<PHINode>(Previous))
return false;
- // Ensure every user of the phi node is dominated by the previous value. The
- // dominance requirement ensures the loop vectorizer will not need to
+ // Ensure every user of the phi node is dominated by the previous value.
+ // The dominance requirement ensures the loop vectorizer will not need to
// vectorize the initial value prior to the first iteration of the loop.
for (User *U : Phi->users())
- if (auto *I = dyn_cast<Instruction>(U))
+ if (auto *I = dyn_cast<Instruction>(U)) {
if (!DT->dominates(Previous, I))
return false;
+ }
return true;
}
diff --git a/lib/Transforms/Utils/LowerMemIntrinsics.cpp b/lib/Transforms/Utils/LowerMemIntrinsics.cpp
new file mode 100644
index 000000000000..c7cb561b5e21
--- /dev/null
+++ b/lib/Transforms/Utils/LowerMemIntrinsics.cpp
@@ -0,0 +1,231 @@
+//===- LowerMemIntrinsics.cpp -----------------------------------*- C++ -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/Transforms/Utils/LowerMemIntrinsics.h"
+#include "llvm/Transforms/Utils/BasicBlockUtils.h"
+#include "llvm/IR/IntrinsicInst.h"
+#include "llvm/IR/IRBuilder.h"
+
+using namespace llvm;
+
+void llvm::createMemCpyLoop(Instruction *InsertBefore,
+ Value *SrcAddr, Value *DstAddr, Value *CopyLen,
+ unsigned SrcAlign, unsigned DestAlign,
+ bool SrcIsVolatile, bool DstIsVolatile) {
+ Type *TypeOfCopyLen = CopyLen->getType();
+
+ BasicBlock *OrigBB = InsertBefore->getParent();
+ Function *F = OrigBB->getParent();
+ BasicBlock *NewBB =
+ InsertBefore->getParent()->splitBasicBlock(InsertBefore, "split");
+ BasicBlock *LoopBB = BasicBlock::Create(F->getContext(), "loadstoreloop",
+ F, NewBB);
+
+ OrigBB->getTerminator()->setSuccessor(0, LoopBB);
+ IRBuilder<> Builder(OrigBB->getTerminator());
+
+ // SrcAddr and DstAddr are expected to be pointer types,
+ // so no check is made here.
+ unsigned SrcAS = cast<PointerType>(SrcAddr->getType())->getAddressSpace();
+ unsigned DstAS = cast<PointerType>(DstAddr->getType())->getAddressSpace();
+
+ // Cast pointers to (char *)
+ SrcAddr = Builder.CreateBitCast(SrcAddr, Builder.getInt8PtrTy(SrcAS));
+ DstAddr = Builder.CreateBitCast(DstAddr, Builder.getInt8PtrTy(DstAS));
+
+ IRBuilder<> LoopBuilder(LoopBB);
+ PHINode *LoopIndex = LoopBuilder.CreatePHI(TypeOfCopyLen, 0);
+ LoopIndex->addIncoming(ConstantInt::get(TypeOfCopyLen, 0), OrigBB);
+
+ // load from SrcAddr+LoopIndex
+ // TODO: we can leverage the align parameter of llvm.memcpy for more efficient
+ // word-sized loads and stores.
+ Value *Element =
+ LoopBuilder.CreateLoad(LoopBuilder.CreateInBoundsGEP(
+ LoopBuilder.getInt8Ty(), SrcAddr, LoopIndex),
+ SrcIsVolatile);
+ // store at DstAddr+LoopIndex
+ LoopBuilder.CreateStore(Element,
+ LoopBuilder.CreateInBoundsGEP(LoopBuilder.getInt8Ty(),
+ DstAddr, LoopIndex),
+ DstIsVolatile);
+
+ // The value for LoopIndex coming from the backedge is (LoopIndex + 1).
+ Value *NewIndex =
+ LoopBuilder.CreateAdd(LoopIndex, ConstantInt::get(TypeOfCopyLen, 1));
+ LoopIndex->addIncoming(NewIndex, LoopBB);
+
+ LoopBuilder.CreateCondBr(LoopBuilder.CreateICmpULT(NewIndex, CopyLen), LoopBB,
+ NewBB);
+}
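
A simplified C++ rendition of the control flow this helper emits (a sketch, not part of the patch). Note the do/while shape: the original block branches into the loop unconditionally, so callers are assumed to guarantee a non-zero CopyLen.

#include <cstddef>

void memcpyAsLoop(unsigned char *Dst, const unsigned char *Src,
                  std::size_t N) {
  std::size_t I = 0;
  do {
    Dst[I] = Src[I]; // byte-wise load from Src+I, store to Dst+I
    ++I;             // backedge value: LoopIndex + 1
  } while (I < N);   // icmp ult NewIndex, CopyLen
}
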
+
+// Lower memmove to IR. memmove is required to correctly copy overlapping memory
+// regions; therefore, it has to check the relative positions of the source and
+// destination pointers and choose the copy direction accordingly.
+//
+// The code below is an IR rendition of this C function:
+//
+// void* memmove(void* dst, const void* src, size_t n) {
+// unsigned char* d = dst;
+// const unsigned char* s = src;
+// if (s < d) {
+// // copy backwards
+// while (n--) {
+// d[n] = s[n];
+// }
+// } else {
+// // copy forward
+// for (size_t i = 0; i < n; ++i) {
+// d[i] = s[i];
+// }
+// }
+// return dst;
+// }
+static void createMemMoveLoop(Instruction *InsertBefore,
+ Value *SrcAddr, Value *DstAddr, Value *CopyLen,
+ unsigned SrcAlign, unsigned DestAlign,
+ bool SrcIsVolatile, bool DstIsVolatile) {
+ Type *TypeOfCopyLen = CopyLen->getType();
+ BasicBlock *OrigBB = InsertBefore->getParent();
+ Function *F = OrigBB->getParent();
+
+ // Create a comparison of src and dst, based on which we jump to either
+ // the forward-copy part of the function (if src >= dst) or the backwards-copy
+ // part (if src < dst).
+ // SplitBlockAndInsertIfThenElse conveniently creates the basic if-then-else
+ // structure. Its block terminators (unconditional branches) are replaced by
+ // the appropriate conditional branches when the loop is built.
+ ICmpInst *PtrCompare = new ICmpInst(InsertBefore, ICmpInst::ICMP_ULT,
+ SrcAddr, DstAddr, "compare_src_dst");
+ TerminatorInst *ThenTerm, *ElseTerm;
+ SplitBlockAndInsertIfThenElse(PtrCompare, InsertBefore, &ThenTerm,
+ &ElseTerm);
+
+ // Each part of the function consists of two blocks:
+ // copy_backwards: used to skip the loop when n == 0
+ // copy_backwards_loop: the actual backwards loop BB
+ // copy_forward: used to skip the loop when n == 0
+ // copy_forward_loop: the actual forward loop BB
+ BasicBlock *CopyBackwardsBB = ThenTerm->getParent();
+ CopyBackwardsBB->setName("copy_backwards");
+ BasicBlock *CopyForwardBB = ElseTerm->getParent();
+ CopyForwardBB->setName("copy_forward");
+ BasicBlock *ExitBB = InsertBefore->getParent();
+ ExitBB->setName("memmove_done");
+
+ // Initial comparison of n == 0 that lets us skip the loops altogether. Shared
+ // between both backwards and forward copy clauses.
+ ICmpInst *CompareN =
+ new ICmpInst(OrigBB->getTerminator(), ICmpInst::ICMP_EQ, CopyLen,
+ ConstantInt::get(TypeOfCopyLen, 0), "compare_n_to_0");
+
+ // Copying backwards.
+ BasicBlock *LoopBB =
+ BasicBlock::Create(F->getContext(), "copy_backwards_loop", F, CopyForwardBB);
+ IRBuilder<> LoopBuilder(LoopBB);
+ PHINode *LoopPhi = LoopBuilder.CreatePHI(TypeOfCopyLen, 0);
+ Value *IndexPtr = LoopBuilder.CreateSub(
+ LoopPhi, ConstantInt::get(TypeOfCopyLen, 1), "index_ptr");
+ Value *Element = LoopBuilder.CreateLoad(
+ LoopBuilder.CreateInBoundsGEP(SrcAddr, IndexPtr), "element");
+ LoopBuilder.CreateStore(Element,
+ LoopBuilder.CreateInBoundsGEP(DstAddr, IndexPtr));
+ LoopBuilder.CreateCondBr(
+ LoopBuilder.CreateICmpEQ(IndexPtr, ConstantInt::get(TypeOfCopyLen, 0)),
+ ExitBB, LoopBB);
+ LoopPhi->addIncoming(IndexPtr, LoopBB);
+ LoopPhi->addIncoming(CopyLen, CopyBackwardsBB);
+ BranchInst::Create(ExitBB, LoopBB, CompareN, ThenTerm);
+ ThenTerm->eraseFromParent();
+
+ // Copying forward.
+ BasicBlock *FwdLoopBB =
+ BasicBlock::Create(F->getContext(), "copy_forward_loop", F, ExitBB);
+ IRBuilder<> FwdLoopBuilder(FwdLoopBB);
+ PHINode *FwdCopyPhi = FwdLoopBuilder.CreatePHI(TypeOfCopyLen, 0, "index_ptr");
+ Value *FwdElement = FwdLoopBuilder.CreateLoad(
+ FwdLoopBuilder.CreateInBoundsGEP(SrcAddr, FwdCopyPhi), "element");
+ FwdLoopBuilder.CreateStore(
+ FwdElement, FwdLoopBuilder.CreateInBoundsGEP(DstAddr, FwdCopyPhi));
+ Value *FwdIndexPtr = FwdLoopBuilder.CreateAdd(
+ FwdCopyPhi, ConstantInt::get(TypeOfCopyLen, 1), "index_increment");
+ FwdLoopBuilder.CreateCondBr(FwdLoopBuilder.CreateICmpEQ(FwdIndexPtr, CopyLen),
+ ExitBB, FwdLoopBB);
+ FwdCopyPhi->addIncoming(FwdIndexPtr, FwdLoopBB);
+ FwdCopyPhi->addIncoming(ConstantInt::get(TypeOfCopyLen, 0), CopyForwardBB);
+
+ BranchInst::Create(ExitBB, FwdLoopBB, CompareN, ElseTerm);
+ ElseTerm->eraseFromParent();
+}
+
+static void createMemSetLoop(Instruction *InsertBefore,
+ Value *DstAddr, Value *CopyLen, Value *SetValue,
+ unsigned Align, bool IsVolatile) {
+ BasicBlock *OrigBB = InsertBefore->getParent();
+ Function *F = OrigBB->getParent();
+ BasicBlock *NewBB =
+ OrigBB->splitBasicBlock(InsertBefore, "split");
+ BasicBlock *LoopBB
+ = BasicBlock::Create(F->getContext(), "loadstoreloop", F, NewBB);
+
+ OrigBB->getTerminator()->setSuccessor(0, LoopBB);
+ IRBuilder<> Builder(OrigBB->getTerminator());
+
+ // Cast pointer to the type of value getting stored
+ unsigned dstAS = cast<PointerType>(DstAddr->getType())->getAddressSpace();
+ DstAddr = Builder.CreateBitCast(DstAddr,
+ PointerType::get(SetValue->getType(), dstAS));
+
+ IRBuilder<> LoopBuilder(LoopBB);
+ PHINode *LoopIndex = LoopBuilder.CreatePHI(CopyLen->getType(), 0);
+ LoopIndex->addIncoming(ConstantInt::get(CopyLen->getType(), 0), OrigBB);
+
+ LoopBuilder.CreateStore(
+ SetValue,
+ LoopBuilder.CreateInBoundsGEP(SetValue->getType(), DstAddr, LoopIndex),
+ IsVolatile);
+
+ Value *NewIndex =
+ LoopBuilder.CreateAdd(LoopIndex, ConstantInt::get(CopyLen->getType(), 1));
+ LoopIndex->addIncoming(NewIndex, LoopBB);
+
+ LoopBuilder.CreateCondBr(LoopBuilder.CreateICmpULT(NewIndex, CopyLen), LoopBB,
+ NewBB);
+}
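
The memset lowering above has the same unconditional-entry loop shape; a companion sketch under the same non-zero-length assumption (byte-sized set value shown, though the real loop strides by SetValue's type):

#include <cstddef>

void memsetAsLoop(unsigned char *Dst, unsigned char V, std::size_t N) {
  std::size_t I = 0;
  do {
    Dst[I] = V; // store SetValue at DstAddr + LoopIndex
    ++I;
  } while (I < N);
}
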
+
+void llvm::expandMemCpyAsLoop(MemCpyInst *Memcpy) {
+ createMemCpyLoop(/* InsertBefore */ Memcpy,
+ /* SrcAddr */ Memcpy->getRawSource(),
+ /* DstAddr */ Memcpy->getRawDest(),
+ /* CopyLen */ Memcpy->getLength(),
+ /* SrcAlign */ Memcpy->getAlignment(),
+ /* DestAlign */ Memcpy->getAlignment(),
+ /* SrcIsVolatile */ Memcpy->isVolatile(),
+ /* DstIsVolatile */ Memcpy->isVolatile());
+}
+
+void llvm::expandMemMoveAsLoop(MemMoveInst *Memmove) {
+ createMemMoveLoop(/* InsertBefore */ Memmove,
+ /* SrcAddr */ Memmove->getRawSource(),
+ /* DstAddr */ Memmove->getRawDest(),
+ /* CopyLen */ Memmove->getLength(),
+ /* SrcAlign */ Memmove->getAlignment(),
+ /* DestAlign */ Memmove->getAlignment(),
+ /* SrcIsVolatile */ Memmove->isVolatile(),
+ /* DstIsVolatile */ Memmove->isVolatile());
+}
+
+void llvm::expandMemSetAsLoop(MemSetInst *Memset) {
+ createMemSetLoop(/* InsertBefore */ Memset,
+ /* DstAddr */ Memset->getRawDest(),
+ /* CopyLen */ Memset->getLength(),
+ /* SetValue */ Memset->getValue(),
+ /* Alignment */ Memset->getAlignment(),
+ Memset->isVolatile());
+}
diff --git a/lib/Transforms/Utils/LowerSwitch.cpp b/lib/Transforms/Utils/LowerSwitch.cpp
index 75cd3bc8b2bf..b375d51005d5 100644
--- a/lib/Transforms/Utils/LowerSwitch.cpp
+++ b/lib/Transforms/Utils/LowerSwitch.cpp
@@ -356,10 +356,10 @@ unsigned LowerSwitch::Clusterify(CaseVector& Cases, SwitchInst *SI) {
unsigned numCmps = 0;
// Start with "simple" cases
- for (SwitchInst::CaseIt i = SI->case_begin(), e = SI->case_end(); i != e; ++i)
- Cases.push_back(CaseRange(i.getCaseValue(), i.getCaseValue(),
- i.getCaseSuccessor()));
-
+ for (auto Case : SI->cases())
+ Cases.push_back(CaseRange(Case.getCaseValue(), Case.getCaseValue(),
+ Case.getCaseSuccessor()));
+
std::sort(Cases.begin(), Cases.end(), CaseCmp());
// Merge case into clusters
diff --git a/lib/Transforms/Utils/Mem2Reg.cpp b/lib/Transforms/Utils/Mem2Reg.cpp
index 24b3b12930ac..b659a2e4463f 100644
--- a/lib/Transforms/Utils/Mem2Reg.cpp
+++ b/lib/Transforms/Utils/Mem2Reg.cpp
@@ -46,7 +46,7 @@ static bool promoteMemoryToRegister(Function &F, DominatorTree &DT,
if (Allocas.empty())
break;
- PromoteMemToReg(Allocas, DT, nullptr, &AC);
+ PromoteMemToReg(Allocas, DT, &AC);
NumPromoted += Allocas.size();
Changed = true;
}
@@ -59,8 +59,9 @@ PreservedAnalyses PromotePass::run(Function &F, FunctionAnalysisManager &AM) {
if (!promoteMemoryToRegister(F, DT, AC))
return PreservedAnalyses::all();
- // FIXME: This should also 'preserve the CFG'.
- return PreservedAnalyses::none();
+ PreservedAnalyses PA;
+ PA.preserveSet<CFGAnalyses>();
+ return PA;
}
namespace {
diff --git a/lib/Transforms/Utils/MetaRenamer.cpp b/lib/Transforms/Utils/MetaRenamer.cpp
index c999bd008fef..481c6aa29c3a 100644
--- a/lib/Transforms/Utils/MetaRenamer.cpp
+++ b/lib/Transforms/Utils/MetaRenamer.cpp
@@ -16,6 +16,7 @@
#include "llvm/Transforms/IPO.h"
#include "llvm/ADT/STLExtras.h"
#include "llvm/ADT/SmallString.h"
+#include "llvm/Analysis/TargetLibraryInfo.h"
#include "llvm/IR/DerivedTypes.h"
#include "llvm/IR/Function.h"
#include "llvm/IR/Module.h"
@@ -67,6 +68,7 @@ namespace {
}
void getAnalysisUsage(AnalysisUsage &AU) const override {
+ AU.addRequired<TargetLibraryInfoWrapperPass>();
AU.setPreservesAll();
}
@@ -110,9 +112,15 @@ namespace {
}
// Rename all functions
+ const TargetLibraryInfo &TLI =
+ getAnalysis<TargetLibraryInfoWrapperPass>().getTLI();
for (auto &F : M) {
StringRef Name = F.getName();
- if (Name.startswith("llvm.") || (!Name.empty() && Name[0] == 1))
+ LibFunc Tmp;
+ // Leave library functions alone because their presence or absence could
+ // affect the behavior of other passes.
+ if (Name.startswith("llvm.") || (!Name.empty() && Name[0] == 1) ||
+ TLI.getLibFunc(F, Tmp))
continue;
F.setName(renamer.newName());
@@ -139,8 +147,11 @@ namespace {
}
char MetaRenamer::ID = 0;
-INITIALIZE_PASS(MetaRenamer, "metarenamer",
- "Assign new names to everything", false, false)
+INITIALIZE_PASS_BEGIN(MetaRenamer, "metarenamer",
+ "Assign new names to everything", false, false)
+INITIALIZE_PASS_DEPENDENCY(TargetLibraryInfoWrapperPass)
+INITIALIZE_PASS_END(MetaRenamer, "metarenamer",
+ "Assign new names to everything", false, false)
//===----------------------------------------------------------------------===//
//
// MetaRenamer - Rename everything with metasyntactic names.
diff --git a/lib/Transforms/Utils/ModuleUtils.cpp b/lib/Transforms/Utils/ModuleUtils.cpp
index 0d623df77a67..dbe42c201dd4 100644
--- a/lib/Transforms/Utils/ModuleUtils.cpp
+++ b/lib/Transforms/Utils/ModuleUtils.cpp
@@ -130,13 +130,25 @@ void llvm::appendToCompilerUsed(Module &M, ArrayRef<GlobalValue *> Values) {
Function *llvm::checkSanitizerInterfaceFunction(Constant *FuncOrBitcast) {
if (isa<Function>(FuncOrBitcast))
return cast<Function>(FuncOrBitcast);
- FuncOrBitcast->dump();
+ FuncOrBitcast->print(errs());
+ errs() << '\n';
std::string Err;
raw_string_ostream Stream(Err);
Stream << "Sanitizer interface function redefined: " << *FuncOrBitcast;
report_fatal_error(Err);
}
+Function *llvm::declareSanitizerInitFunction(Module &M, StringRef InitName,
+ ArrayRef<Type *> InitArgTypes) {
+ assert(!InitName.empty() && "Expected init function name");
+ Function *F = checkSanitizerInterfaceFunction(M.getOrInsertFunction(
+ InitName,
+ FunctionType::get(Type::getVoidTy(M.getContext()), InitArgTypes, false),
+ AttributeList()));
+ F->setLinkage(Function::ExternalLinkage);
+ return F;
+}
+
std::pair<Function *, Function *> llvm::createSanitizerCtorAndInitFunctions(
Module &M, StringRef CtorName, StringRef InitName,
ArrayRef<Type *> InitArgTypes, ArrayRef<Value *> InitArgs,
@@ -144,22 +156,19 @@ std::pair<Function *, Function *> llvm::createSanitizerCtorAndInitFunctions(
assert(!InitName.empty() && "Expected init function name");
assert(InitArgs.size() == InitArgTypes.size() &&
"Sanitizer's init function expects different number of arguments");
+ Function *InitFunction =
+ declareSanitizerInitFunction(M, InitName, InitArgTypes);
Function *Ctor = Function::Create(
FunctionType::get(Type::getVoidTy(M.getContext()), false),
GlobalValue::InternalLinkage, CtorName, &M);
BasicBlock *CtorBB = BasicBlock::Create(M.getContext(), "", Ctor);
IRBuilder<> IRB(ReturnInst::Create(M.getContext(), CtorBB));
- Function *InitFunction =
- checkSanitizerInterfaceFunction(M.getOrInsertFunction(
- InitName, FunctionType::get(IRB.getVoidTy(), InitArgTypes, false),
- AttributeSet()));
- InitFunction->setLinkage(Function::ExternalLinkage);
IRB.CreateCall(InitFunction, InitArgs);
if (!VersionCheckName.empty()) {
Function *VersionCheckFunction =
checkSanitizerInterfaceFunction(M.getOrInsertFunction(
VersionCheckName, FunctionType::get(IRB.getVoidTy(), {}, false),
- AttributeSet()));
+ AttributeList()));
IRB.CreateCall(VersionCheckFunction, {});
}
return std::make_pair(Ctor, InitFunction);
diff --git a/lib/Transforms/Utils/PredicateInfo.cpp b/lib/Transforms/Utils/PredicateInfo.cpp
new file mode 100644
index 000000000000..8877aeafecde
--- /dev/null
+++ b/lib/Transforms/Utils/PredicateInfo.cpp
@@ -0,0 +1,782 @@
+//===-- PredicateInfo.cpp - PredicateInfo Builder --------------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------===//
+//
+// This file implements the PredicateInfo class.
+//
+//===----------------------------------------------------------------===//
+
+#include "llvm/Transforms/Utils/PredicateInfo.h"
+#include "llvm/ADT/DenseMap.h"
+#include "llvm/ADT/DepthFirstIterator.h"
+#include "llvm/ADT/STLExtras.h"
+#include "llvm/ADT/SmallPtrSet.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/Analysis/AssumptionCache.h"
+#include "llvm/Analysis/CFG.h"
+#include "llvm/Analysis/OrderedBasicBlock.h"
+#include "llvm/IR/AssemblyAnnotationWriter.h"
+#include "llvm/IR/DataLayout.h"
+#include "llvm/IR/Dominators.h"
+#include "llvm/IR/GlobalVariable.h"
+#include "llvm/IR/IRBuilder.h"
+#include "llvm/IR/IntrinsicInst.h"
+#include "llvm/IR/LLVMContext.h"
+#include "llvm/IR/Metadata.h"
+#include "llvm/IR/Module.h"
+#include "llvm/IR/PatternMatch.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/DebugCounter.h"
+#include "llvm/Support/FormattedStream.h"
+#include "llvm/Transforms/Scalar.h"
+#include <algorithm>
+#define DEBUG_TYPE "predicateinfo"
+using namespace llvm;
+using namespace PatternMatch;
+using namespace llvm::PredicateInfoClasses;
+
+INITIALIZE_PASS_BEGIN(PredicateInfoPrinterLegacyPass, "print-predicateinfo",
+ "PredicateInfo Printer", false, false)
+INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass)
+INITIALIZE_PASS_DEPENDENCY(AssumptionCacheTracker)
+INITIALIZE_PASS_END(PredicateInfoPrinterLegacyPass, "print-predicateinfo",
+ "PredicateInfo Printer", false, false)
+static cl::opt<bool> VerifyPredicateInfo(
+ "verify-predicateinfo", cl::init(false), cl::Hidden,
+ cl::desc("Verify PredicateInfo in legacy printer pass."));
+namespace {
+DEBUG_COUNTER(RenameCounter, "predicateinfo-rename",
+ "Controls which variables are renamed with predicateinfo")
+// Given predicate info attached to a branching terminator, get the
+// branching block.
+const BasicBlock *getBranchBlock(const PredicateBase *PB) {
+ assert(isa<PredicateWithEdge>(PB) &&
+ "Only branches and switches should have PHIOnly defs that "
+ "require branch blocks.");
+ return cast<PredicateWithEdge>(PB)->From;
+}
+
+// Given predicate info attached to a branching terminator, get the
+// branching terminator.
+static Instruction *getBranchTerminator(const PredicateBase *PB) {
+ assert(isa<PredicateWithEdge>(PB) &&
+ "Not a predicate info type we know how to get a terminator from.");
+ return cast<PredicateWithEdge>(PB)->From->getTerminator();
+}
+
+// Given predicate info attached to a branching terminator, get the
+// edge this predicate info represents.
+const std::pair<BasicBlock *, BasicBlock *>
+getBlockEdge(const PredicateBase *PB) {
+ assert(isa<PredicateWithEdge>(PB) &&
+ "Not a predicate info type we know how to get an edge from.");
+ const auto *PEdge = cast<PredicateWithEdge>(PB);
+ return std::make_pair(PEdge->From, PEdge->To);
+}
+}
+
+namespace llvm {
+namespace PredicateInfoClasses {
+enum LocalNum {
+ // Operations that must appear first in the block.
+ LN_First,
+ // Operations that are somewhere in the middle of the block, and are sorted on
+ // demand.
+ LN_Middle,
+ // Operations that must appear last in a block, like successor phi node uses.
+ LN_Last
+};
+
+// Associate global and local DFS info with defs and uses, so we can sort them
+// into a global domination ordering.
+struct ValueDFS {
+ int DFSIn = 0;
+ int DFSOut = 0;
+ unsigned int LocalNum = LN_Middle;
+ // Only one of Def or Use will be set.
+ Value *Def = nullptr;
+ Use *U = nullptr;
+ // Neither PInfo nor EdgeOnly participate in the ordering
+ PredicateBase *PInfo = nullptr;
+ bool EdgeOnly = false;
+};
+
+// This compares ValueDFS structures, creating OrderedBasicBlocks where
+// necessary to compare uses/defs in the same block. Doing so allows us to walk
+// the minimum number of instructions necessary to compute our def/use ordering.
+struct ValueDFS_Compare {
+ DenseMap<const BasicBlock *, std::unique_ptr<OrderedBasicBlock>> &OBBMap;
+ ValueDFS_Compare(
+ DenseMap<const BasicBlock *, std::unique_ptr<OrderedBasicBlock>> &OBBMap)
+ : OBBMap(OBBMap) {}
+ bool operator()(const ValueDFS &A, const ValueDFS &B) const {
+ if (&A == &B)
+ return false;
+ // The only case we can't directly compare them is when they are in the same
+ // block, and both have localnum == middle. In that case, we have to use
+ // comesbefore to see what the real ordering is, because they are in the
+ // same basic block.
+
+ bool SameBlock = std::tie(A.DFSIn, A.DFSOut) == std::tie(B.DFSIn, B.DFSOut);
+
+ // We want to put the def that will get used for a given set of phi uses,
+ // before those phi uses.
+ // So we sort by edge, then by def.
+ // Note that only phi node uses and defs can come last.
+ if (SameBlock && A.LocalNum == LN_Last && B.LocalNum == LN_Last)
+ return comparePHIRelated(A, B);
+
+ if (!SameBlock || A.LocalNum != LN_Middle || B.LocalNum != LN_Middle)
+ return std::tie(A.DFSIn, A.DFSOut, A.LocalNum, A.Def, A.U) <
+ std::tie(B.DFSIn, B.DFSOut, B.LocalNum, B.Def, B.U);
+ return localComesBefore(A, B);
+ }
+
+ // For a phi use, or a non-materialized def, return the edge it represents.
+ const std::pair<BasicBlock *, BasicBlock *>
+ getBlockEdge(const ValueDFS &VD) const {
+ if (!VD.Def && VD.U) {
+ auto *PHI = cast<PHINode>(VD.U->getUser());
+ return std::make_pair(PHI->getIncomingBlock(*VD.U), PHI->getParent());
+ }
+ // This is really a non-materialized def.
+ return ::getBlockEdge(VD.PInfo);
+ }
+
+ // For two phi related values, return the ordering.
+ bool comparePHIRelated(const ValueDFS &A, const ValueDFS &B) const {
+ auto &ABlockEdge = getBlockEdge(A);
+ auto &BBlockEdge = getBlockEdge(B);
+ // Now sort by block edge and then defs before uses.
+ return std::tie(ABlockEdge, A.Def, A.U) < std::tie(BBlockEdge, B.Def, B.U);
+ }
+
+ // Get the definition of an instruction that occurs in the middle of a block.
+ Value *getMiddleDef(const ValueDFS &VD) const {
+ if (VD.Def)
+ return VD.Def;
+ // It's possible for the defs and uses to be null. For branches, the local
+ // numbering will say the placed predicateinfos should go first (i.e.,
+ // LN_First), so we won't be in this function. For assumes, we will end
+ // up here, because we need to order the def we will place relative to the
+ // assume. So for the purpose of ordering, we pretend the def is the assume
+ // because that is where we will insert the info.
+ if (!VD.U) {
+ assert(VD.PInfo &&
+ "No def, no use, and no predicateinfo should not occur");
+ assert(isa<PredicateAssume>(VD.PInfo) &&
+ "Middle of block should only occur for assumes");
+ return cast<PredicateAssume>(VD.PInfo)->AssumeInst;
+ }
+ return nullptr;
+ }
+
+ // Return either the Def, if it's not null, or the user of the Use, if the def
+ // is null.
+ const Instruction *getDefOrUser(const Value *Def, const Use *U) const {
+ if (Def)
+ return cast<Instruction>(Def);
+ return cast<Instruction>(U->getUser());
+ }
+
+ // This performs the necessary local basic block ordering checks to tell
+ // whether A comes before B, where both are in the same basic block.
+ bool localComesBefore(const ValueDFS &A, const ValueDFS &B) const {
+ auto *ADef = getMiddleDef(A);
+ auto *BDef = getMiddleDef(B);
+
+ // See if we have real values or uses. If we have real values, we are
+ // guaranteed they are instructions or arguments. No matter what, we are
+ // guaranteed they are in the same block if they are instructions.
+ auto *ArgA = dyn_cast_or_null<Argument>(ADef);
+ auto *ArgB = dyn_cast_or_null<Argument>(BDef);
+
+ if (ArgA && !ArgB)
+ return true;
+ if (ArgB && !ArgA)
+ return false;
+ if (ArgA && ArgB)
+ return ArgA->getArgNo() < ArgB->getArgNo();
+
+ auto *AInst = getDefOrUser(ADef, A.U);
+ auto *BInst = getDefOrUser(BDef, B.U);
+
+ auto *BB = AInst->getParent();
+ auto LookupResult = OBBMap.find(BB);
+ if (LookupResult != OBBMap.end())
+ return LookupResult->second->dominates(AInst, BInst);
+
+ auto Result = OBBMap.insert({BB, make_unique<OrderedBasicBlock>(BB)});
+ return Result.first->second->dominates(AInst, BInst);
+ }
+};
+
+} // namespace PredicateInfoClasses
+
+bool PredicateInfo::stackIsInScope(const ValueDFSStack &Stack,
+ const ValueDFS &VDUse) const {
+ if (Stack.empty())
+ return false;
+ // If the top of the stack is an edge-only def, the use must be a phi use
+ // coming over that same edge; any other use means we need to pop the stack.
+ // We deliberately sort phi uses next to the defs they must go with so that
+ // we know it's time to pop the stack when we hit the end of the phi uses
+ // for a given def.
+ if (Stack.back().EdgeOnly) {
+ if (!VDUse.U)
+ return false;
+ auto *PHI = dyn_cast<PHINode>(VDUse.U->getUser());
+ if (!PHI)
+ return false;
+ // Check edge
+ BasicBlock *EdgePred = PHI->getIncomingBlock(*VDUse.U);
+ if (EdgePred != getBranchBlock(Stack.back().PInfo))
+ return false;
+
+ // Use dominates, which knows how to handle edge dominance.
+ return DT.dominates(getBlockEdge(Stack.back().PInfo), *VDUse.U);
+ }
+
+ return (VDUse.DFSIn >= Stack.back().DFSIn &&
+ VDUse.DFSOut <= Stack.back().DFSOut);
+}
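
The non-edge case at the end is the classic dominator-tree interval test; a self-contained sketch of just that check (simplified types, not the patch's code):

struct DFSNum { int In; int Out; };

// In a dominator tree numbered by DFS entry/exit times, Use lies in Def's
// subtree (i.e., Def's scope) exactly when Use's interval nests inside Def's.
bool inScope(DFSNum Def, DFSNum Use) {
  return Use.In >= Def.In && Use.Out <= Def.Out;
}
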
+
+void PredicateInfo::popStackUntilDFSScope(ValueDFSStack &Stack,
+ const ValueDFS &VD) {
+ while (!Stack.empty() && !stackIsInScope(Stack, VD))
+ Stack.pop_back();
+}
+
+// Convert the uses of Op into a vector of uses, associating global and local
+// DFS info with each one.
+void PredicateInfo::convertUsesToDFSOrdered(
+ Value *Op, SmallVectorImpl<ValueDFS> &DFSOrderedSet) {
+ for (auto &U : Op->uses()) {
+ if (auto *I = dyn_cast<Instruction>(U.getUser())) {
+ ValueDFS VD;
+ // Put the phi node uses in the incoming block.
+ BasicBlock *IBlock;
+ if (auto *PN = dyn_cast<PHINode>(I)) {
+ IBlock = PN->getIncomingBlock(U);
+ // Make phi node users appear last in the incoming block
+ // they are from.
+ VD.LocalNum = LN_Last;
+ } else {
+ // If it's not a phi node use, it is somewhere in the middle of the
+ // block.
+ IBlock = I->getParent();
+ VD.LocalNum = LN_Middle;
+ }
+ DomTreeNode *DomNode = DT.getNode(IBlock);
+ // It's possible our use is in an unreachable block. Skip it if so.
+ if (!DomNode)
+ continue;
+ VD.DFSIn = DomNode->getDFSNumIn();
+ VD.DFSOut = DomNode->getDFSNumOut();
+ VD.U = &U;
+ DFSOrderedSet.push_back(VD);
+ }
+ }
+}
+
+// Collect relevant operations from Comparison that we may want to insert copies
+// for.
+void collectCmpOps(CmpInst *Comparison, SmallVectorImpl<Value *> &CmpOperands) {
+ auto *Op0 = Comparison->getOperand(0);
+ auto *Op1 = Comparison->getOperand(1);
+ if (Op0 == Op1)
+ return;
+ CmpOperands.push_back(Comparison);
+ // Only want real values, not constants. Additionally, operands with one use
+ // are only being used in the comparison, which means they will not be useful
+ // for us to consider for predicateinfo.
+ //
+ if ((isa<Instruction>(Op0) || isa<Argument>(Op0)) && !Op0->hasOneUse())
+ CmpOperands.push_back(Op0);
+ if ((isa<Instruction>(Op1) || isa<Argument>(Op1)) && !Op1->hasOneUse())
+ CmpOperands.push_back(Op1);
+}
+
+// Add Op, PB to the list of value infos for Op, and mark Op to be renamed.
+void PredicateInfo::addInfoFor(SmallPtrSetImpl<Value *> &OpsToRename, Value *Op,
+ PredicateBase *PB) {
+ OpsToRename.insert(Op);
+ auto &OperandInfo = getOrCreateValueInfo(Op);
+ AllInfos.push_back(PB);
+ OperandInfo.Infos.push_back(PB);
+}
+
+// Process an assume instruction and place relevant operations we want to rename
+// into OpsToRename.
+void PredicateInfo::processAssume(IntrinsicInst *II, BasicBlock *AssumeBB,
+ SmallPtrSetImpl<Value *> &OpsToRename) {
+ // See if we have a comparison we support
+ SmallVector<Value *, 8> CmpOperands;
+ SmallVector<Value *, 2> ConditionsToProcess;
+ CmpInst::Predicate Pred;
+ Value *Operand = II->getOperand(0);
+ if (m_c_And(m_Cmp(Pred, m_Value(), m_Value()),
+ m_Cmp(Pred, m_Value(), m_Value()))
+ .match(II->getOperand(0))) {
+ ConditionsToProcess.push_back(cast<BinaryOperator>(Operand)->getOperand(0));
+ ConditionsToProcess.push_back(cast<BinaryOperator>(Operand)->getOperand(1));
+ ConditionsToProcess.push_back(Operand);
+ } else if (isa<CmpInst>(Operand)) {
+ ConditionsToProcess.push_back(Operand);
+ }
+ for (auto Cond : ConditionsToProcess) {
+ if (auto *Cmp = dyn_cast<CmpInst>(Cond)) {
+ collectCmpOps(Cmp, CmpOperands);
+ // Now add our copy infos for our operands
+ for (auto *Op : CmpOperands) {
+ auto *PA = new PredicateAssume(Op, II, Cmp);
+ addInfoFor(OpsToRename, Op, PA);
+ }
+ CmpOperands.clear();
+ } else if (auto *BinOp = dyn_cast<BinaryOperator>(Cond)) {
+ // Otherwise, it should be an AND.
+ assert(BinOp->getOpcode() == Instruction::And &&
+ "Should have been an AND");
+ auto *PA = new PredicateAssume(BinOp, II, BinOp);
+ addInfoFor(OpsToRename, BinOp, PA);
+ } else {
+ llvm_unreachable("Unknown type of condition");
+ }
+ }
+}
+
+// Process a block terminating branch, and place relevant operations to be
+// renamed into OpsToRename.
+void PredicateInfo::processBranch(BranchInst *BI, BasicBlock *BranchBB,
+ SmallPtrSetImpl<Value *> &OpsToRename) {
+ BasicBlock *FirstBB = BI->getSuccessor(0);
+ BasicBlock *SecondBB = BI->getSuccessor(1);
+ SmallVector<BasicBlock *, 2> SuccsToProcess;
+ SuccsToProcess.push_back(FirstBB);
+ SuccsToProcess.push_back(SecondBB);
+ SmallVector<Value *, 2> ConditionsToProcess;
+
+ auto InsertHelper = [&](Value *Op, bool isAnd, bool isOr, Value *Cond) {
+ for (auto *Succ : SuccsToProcess) {
+ // Don't try to insert on a self-edge. This is mainly because we will
+ // eliminate it during renaming anyway.
+ if (Succ == BranchBB)
+ continue;
+ bool TakenEdge = (Succ == FirstBB);
+ // For and, only insert on the true edge
+ // For or, only insert on the false edge
+ if ((isAnd && !TakenEdge) || (isOr && TakenEdge))
+ continue;
+ PredicateBase *PB =
+ new PredicateBranch(Op, BranchBB, Succ, Cond, TakenEdge);
+ addInfoFor(OpsToRename, Op, PB);
+ if (!Succ->getSinglePredecessor())
+ EdgeUsesOnly.insert({BranchBB, Succ});
+ }
+ };
+
+ // Match combinations of conditions.
+ CmpInst::Predicate Pred;
+ bool isAnd = false;
+ bool isOr = false;
+ SmallVector<Value *, 8> CmpOperands;
+ if (match(BI->getCondition(), m_And(m_Cmp(Pred, m_Value(), m_Value()),
+ m_Cmp(Pred, m_Value(), m_Value()))) ||
+ match(BI->getCondition(), m_Or(m_Cmp(Pred, m_Value(), m_Value()),
+ m_Cmp(Pred, m_Value(), m_Value())))) {
+ auto *BinOp = cast<BinaryOperator>(BI->getCondition());
+ if (BinOp->getOpcode() == Instruction::And)
+ isAnd = true;
+ else if (BinOp->getOpcode() == Instruction::Or)
+ isOr = true;
+ ConditionsToProcess.push_back(BinOp->getOperand(0));
+ ConditionsToProcess.push_back(BinOp->getOperand(1));
+ ConditionsToProcess.push_back(BI->getCondition());
+ } else if (isa<CmpInst>(BI->getCondition())) {
+ ConditionsToProcess.push_back(BI->getCondition());
+ }
+ for (auto Cond : ConditionsToProcess) {
+ if (auto *Cmp = dyn_cast<CmpInst>(Cond)) {
+ collectCmpOps(Cmp, CmpOperands);
+ // Now add our copy infos for our operands
+ for (auto *Op : CmpOperands)
+ InsertHelper(Op, isAnd, isOr, Cmp);
+ } else if (auto *BinOp = dyn_cast<BinaryOperator>(Cond)) {
+ // This must be an AND or an OR.
+ assert((BinOp->getOpcode() == Instruction::And ||
+ BinOp->getOpcode() == Instruction::Or) &&
+ "Should have been an AND or an OR");
+ // The actual value of the binop is not subject to the same restrictions
+ // as the comparison. It's either true or false on the true/false branch.
+ InsertHelper(BinOp, false, false, BinOp);
+ } else {
+ llvm_unreachable("Unknown type of condition");
+ }
+ CmpOperands.clear();
+ }
+}
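
A source-level illustration (hypothetical, not from the patch) of why InsertHelper attaches 'and' predicates only to the true edge and 'or' predicates only to the false edge:

#include <cassert>

// For a branch on `A && B`, each conjunct is individually known only on the
// taken edge; on the false edge either one may have failed. Dually, for
// `A || B` each disjunct is known to be false only on the non-taken edge.
int classify(bool A, bool B) {
  if (A && B) {
    assert(A && "known true on the taken edge");
    assert(B && "known true on the taken edge");
    return 1;
  }
  return 0; // here neither A nor B is individually determined
}
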
+// Process a block terminating switch, and place relevant operations to be
+// renamed into OpsToRename.
+void PredicateInfo::processSwitch(SwitchInst *SI, BasicBlock *BranchBB,
+ SmallPtrSetImpl<Value *> &OpsToRename) {
+ Value *Op = SI->getCondition();
+ if ((!isa<Instruction>(Op) && !isa<Argument>(Op)) || Op->hasOneUse())
+ return;
+
+ // Remember how many outgoing edges there are to every successor.
+ SmallDenseMap<BasicBlock *, unsigned, 16> SwitchEdges;
+ for (unsigned i = 0, e = SI->getNumSuccessors(); i != e; ++i) {
+ BasicBlock *TargetBlock = SI->getSuccessor(i);
+ ++SwitchEdges[TargetBlock];
+ }
+
+ // Now propagate info for each case value
+ for (auto C : SI->cases()) {
+ BasicBlock *TargetBlock = C.getCaseSuccessor();
+ if (SwitchEdges.lookup(TargetBlock) == 1) {
+ PredicateSwitch *PS = new PredicateSwitch(
+ Op, SI->getParent(), TargetBlock, C.getCaseValue(), SI);
+ addInfoFor(OpsToRename, Op, PS);
+ if (!TargetBlock->getSinglePredecessor())
+ EdgeUsesOnly.insert({BranchBB, TargetBlock});
+ }
+ }
+}
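
A minimal sketch of the multiplicity rule above (successors modeled as plain ints; illustrative code, not the patch's): a case value is only known on entry to a successor when exactly one switch edge targets it.

#include <cstddef>
#include <unordered_map>
#include <vector>

std::vector<std::size_t>
casesWithKnownValue(const std::vector<int> &SuccOfCase) {
  std::unordered_map<int, unsigned> SwitchEdges;
  for (int S : SuccOfCase)
    ++SwitchEdges[S]; // count outgoing edges per successor
  std::vector<std::size_t> Result;
  for (std::size_t C = 0; C != SuccOfCase.size(); ++C)
    if (SwitchEdges[SuccOfCase[C]] == 1) // sole edge into this block
      Result.push_back(C);
  return Result;
}
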
+
+// Build predicate info for our function
+void PredicateInfo::buildPredicateInfo() {
+ DT.updateDFSNumbers();
+ // Collect operands to rename from all conditional branch terminators, as well
+ // as assume statements.
+ SmallPtrSet<Value *, 8> OpsToRename;
+ for (auto DTN : depth_first(DT.getRootNode())) {
+ BasicBlock *BranchBB = DTN->getBlock();
+ if (auto *BI = dyn_cast<BranchInst>(BranchBB->getTerminator())) {
+ if (!BI->isConditional())
+ continue;
+ processBranch(BI, BranchBB, OpsToRename);
+ } else if (auto *SI = dyn_cast<SwitchInst>(BranchBB->getTerminator())) {
+ processSwitch(SI, BranchBB, OpsToRename);
+ }
+ }
+ for (auto &Assume : AC.assumptions()) {
+ if (auto *II = dyn_cast_or_null<IntrinsicInst>(Assume))
+ processAssume(II, II->getParent(), OpsToRename);
+ }
+ // Now rename all our operations.
+ renameUses(OpsToRename);
+}
+
+// Given the renaming stack, make all the operands currently on the stack real
+// by inserting them into the IR. Return the last operation's value.
+Value *PredicateInfo::materializeStack(unsigned int &Counter,
+ ValueDFSStack &RenameStack,
+ Value *OrigOp) {
+ // Find the first thing we have to materialize
+ auto RevIter = RenameStack.rbegin();
+ for (; RevIter != RenameStack.rend(); ++RevIter)
+ if (RevIter->Def)
+ break;
+
+ size_t Start = RevIter - RenameStack.rbegin();
+ // The maximum number of things we should be trying to materialize at once
+ // right now is 4, depending on whether we had an assume, a branch, and
+ // whether both used an 'and' of conditions.
+ for (auto RenameIter = RenameStack.end() - Start;
+ RenameIter != RenameStack.end(); ++RenameIter) {
+ auto *Op =
+ RenameIter == RenameStack.begin() ? OrigOp : (RenameIter - 1)->Def;
+ ValueDFS &Result = *RenameIter;
+ auto *ValInfo = Result.PInfo;
+ // For edge predicates, we can just place the operand in the block before
+ // the terminator. For assume, we have to place it right before the assume
+ // to ensure we dominate all of our uses. Always insert right before the
+ // relevant instruction (terminator, assume), so that we insert in proper
+ // order in the case of multiple predicateinfo in the same block.
+ if (isa<PredicateWithEdge>(ValInfo)) {
+ IRBuilder<> B(getBranchTerminator(ValInfo));
+ Function *IF = Intrinsic::getDeclaration(
+ F.getParent(), Intrinsic::ssa_copy, Op->getType());
+ CallInst *PIC =
+ B.CreateCall(IF, Op, Op->getName() + "." + Twine(Counter++));
+ PredicateMap.insert({PIC, ValInfo});
+ Result.Def = PIC;
+ } else {
+ auto *PAssume = dyn_cast<PredicateAssume>(ValInfo);
+ assert(PAssume &&
+ "Should not have gotten here without it being an assume");
+ IRBuilder<> B(PAssume->AssumeInst);
+ Function *IF = Intrinsic::getDeclaration(
+ F.getParent(), Intrinsic::ssa_copy, Op->getType());
+ CallInst *PIC = B.CreateCall(IF, Op);
+ PredicateMap.insert({PIC, ValInfo});
+ Result.Def = PIC;
+ }
+ }
+ return RenameStack.back().Def;
+}
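
The scan-back-then-chain structure of materializeStack, as a runnable sketch with strings standing in for IR values (names and types invented for illustration):

#include <string>
#include <vector>

struct Frame { std::string Def; }; // empty Def == copy not materialized yet

// Precondition: Stack is non-empty.
std::string materialize(std::vector<Frame> &Stack, const std::string &OrigOp) {
  // Scan from the top of the stack down to the newest materialized def;
  // every frame above it still needs a copy.
  std::size_t First = Stack.size();
  while (First > 0 && Stack[First - 1].Def.empty())
    --First;
  // Create the missing copies top-down, each one fed by the previous def.
  for (std::size_t I = First; I != Stack.size(); ++I) {
    const std::string &In = (I == 0) ? OrigOp : Stack[I - 1].Def;
    Stack[I].Def = "ssa_copy(" + In + ")"; // stands in for the IRBuilder call
  }
  return Stack.back().Def;
}
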
+
+// Instead of the standard SSA renaming algorithm, which is O(Number of
+// instructions), and walks the entire dominator tree, we walk only the defs +
+// uses. The standard SSA renaming algorithm does not really rely on the
+// dominator tree except to order the stack push/pops of the renaming stacks, so
+// that defs end up getting pushed before hitting the correct uses. This does
+// not require the dominator tree, only the *order* of the dominator tree. The
+// complete and correct ordering of the defs and uses in the dominator tree is
+// contained in the DFS numbering of the dominator tree. So we sort the defs and
+// uses into the DFS ordering, and then just use the renaming stack as per
+// normal, pushing when we hit a def (which is a predicateinfo instruction),
+// popping when we are out of the dfs scope for that def, and replacing any uses
+// with top of stack if it exists. In order to handle liveness without
+// propagating liveness info, we don't actually insert the predicateinfo
+// instruction def until we see a use that it would dominate. Once we see such
+// a use, we materialize the predicateinfo instruction in the right place and
+// use it.
+//
+// TODO: Use this algorithm to perform fast single-variable renaming in
+// PromoteMemToReg and MemorySSA.
+void PredicateInfo::renameUses(SmallPtrSetImpl<Value *> &OpsToRename) {
+ ValueDFS_Compare Compare(OBBMap);
+ // Compute liveness, and rename in O(uses) per Op.
+ for (auto *Op : OpsToRename) {
+ unsigned Counter = 0;
+ SmallVector<ValueDFS, 16> OrderedUses;
+ const auto &ValueInfo = getValueInfo(Op);
+ // Insert the possible copies into the def/use list.
+ // They will become real copies if we find a real use for them, and are
+ // never created otherwise.
+ for (auto &PossibleCopy : ValueInfo.Infos) {
+ ValueDFS VD;
+ // Determine where we are going to place the copy by the copy type.
+ // The predicate info for branches always comes first; it will get
+ // materialized in the split block at the top of the block.
+ // The predicate info for assumes will be somewhere in the middle;
+ // it will get materialized in front of the assume.
+ if (const auto *PAssume = dyn_cast<PredicateAssume>(PossibleCopy)) {
+ VD.LocalNum = LN_Middle;
+ DomTreeNode *DomNode = DT.getNode(PAssume->AssumeInst->getParent());
+ if (!DomNode)
+ continue;
+ VD.DFSIn = DomNode->getDFSNumIn();
+ VD.DFSOut = DomNode->getDFSNumOut();
+ VD.PInfo = PossibleCopy;
+ OrderedUses.push_back(VD);
+ } else if (isa<PredicateWithEdge>(PossibleCopy)) {
+ // If we can only do phi uses, we treat it like it's in the branch
+ // block, and handle it specially. We know that it goes last, and only
+ // dominates phi uses.
+ auto BlockEdge = getBlockEdge(PossibleCopy);
+ if (EdgeUsesOnly.count(BlockEdge)) {
+ VD.LocalNum = LN_Last;
+ auto *DomNode = DT.getNode(BlockEdge.first);
+ if (DomNode) {
+ VD.DFSIn = DomNode->getDFSNumIn();
+ VD.DFSOut = DomNode->getDFSNumOut();
+ VD.PInfo = PossibleCopy;
+ VD.EdgeOnly = true;
+ OrderedUses.push_back(VD);
+ }
+ } else {
+ // Otherwise, we are in the split block (even though we perform
+ // insertion in the branch block).
+ // Insert a possible copy at the split block and before the branch.
+ VD.LocalNum = LN_First;
+ auto *DomNode = DT.getNode(BlockEdge.second);
+ if (DomNode) {
+ VD.DFSIn = DomNode->getDFSNumIn();
+ VD.DFSOut = DomNode->getDFSNumOut();
+ VD.PInfo = PossibleCopy;
+ OrderedUses.push_back(VD);
+ }
+ }
+ }
+ }
+
+ convertUsesToDFSOrdered(Op, OrderedUses);
+ std::sort(OrderedUses.begin(), OrderedUses.end(), Compare);
+ SmallVector<ValueDFS, 8> RenameStack;
+ // For each use, sorted into dfs order, push values and replaces uses with
+ // top of stack, which will represent the reaching def.
+ for (auto &VD : OrderedUses) {
+ // We currently do not materialize copy over copy, but we should decide if
+ // we want to.
+ bool PossibleCopy = VD.PInfo != nullptr;
+ if (RenameStack.empty()) {
+ DEBUG(dbgs() << "Rename Stack is empty\n");
+ } else {
+ DEBUG(dbgs() << "Rename Stack Top DFS numbers are ("
+ << RenameStack.back().DFSIn << ","
+ << RenameStack.back().DFSOut << ")\n");
+ }
+
+ DEBUG(dbgs() << "Current DFS numbers are (" << VD.DFSIn << ","
+ << VD.DFSOut << ")\n");
+
+ bool ShouldPush = (VD.Def || PossibleCopy);
+ bool OutOfScope = !stackIsInScope(RenameStack, VD);
+ if (OutOfScope || ShouldPush) {
+ // Sync to our current scope.
+ popStackUntilDFSScope(RenameStack, VD);
+ if (ShouldPush) {
+ RenameStack.push_back(VD);
+ }
+ }
+ // If we get to this point and the stack is empty, we must have a use
+ // with no renaming needed; just skip it.
+ if (RenameStack.empty())
+ continue;
+ // Skip values, only want to rename the uses
+ if (VD.Def || PossibleCopy)
+ continue;
+ if (!DebugCounter::shouldExecute(RenameCounter)) {
+ DEBUG(dbgs() << "Skipping execution due to debug counter\n");
+ continue;
+ }
+ ValueDFS &Result = RenameStack.back();
+
+ // If the possible copy dominates something, materialize our stack up to
+ // this point. This ensures every comparison that affects our operation
+ // ends up with predicateinfo.
+ if (!Result.Def)
+ Result.Def = materializeStack(Counter, RenameStack, Op);
+
+ DEBUG(dbgs() << "Found replacement " << *Result.Def << " for "
+ << *VD.U->get() << " in " << *(VD.U->getUser()) << "\n");
+ assert(DT.dominates(cast<Instruction>(Result.Def), *VD.U) &&
+ "Predicateinfo def should have dominated this use");
+ VD.U->set(Result.Def);
+ }
+ }
+}
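
The algorithm described in the comment before renameUses compresses to a short loop over DFS-sorted items; a simplified, runnable sketch (IDs stand in for values; the scope test matches stackIsInScope's non-edge case):

#include <utility>
#include <vector>

struct Item { int In; int Out; bool IsDef; int Id; };

// Items must arrive sorted by (In, Out), i.e. dominator-tree DFS order.
std::vector<std::pair<int, int>> renameSketch(const std::vector<Item> &Items) {
  std::vector<Item> Stack;
  std::vector<std::pair<int, int>> UseToDef; // (use id, reaching def id)
  for (const Item &It : Items) {
    // Pop defs whose DFS interval no longer contains this item.
    while (!Stack.empty() &&
           !(It.In >= Stack.back().In && It.Out <= Stack.back().Out))
      Stack.pop_back();
    if (It.IsDef)
      Stack.push_back(It); // push the new reaching def
    else if (!Stack.empty())
      UseToDef.push_back({It.Id, Stack.back().Id}); // rewrite the use
  }
  return UseToDef;
}
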
+
+PredicateInfo::ValueInfo &PredicateInfo::getOrCreateValueInfo(Value *Operand) {
+ auto OIN = ValueInfoNums.find(Operand);
+ if (OIN == ValueInfoNums.end()) {
+ // This will grow it
+ ValueInfos.resize(ValueInfos.size() + 1);
+ // This will use the new size and give us a 0 based number of the info
+ auto InsertResult = ValueInfoNums.insert({Operand, ValueInfos.size() - 1});
+ assert(InsertResult.second && "Value info number already existed?");
+ return ValueInfos[InsertResult.first->second];
+ }
+ return ValueInfos[OIN->second];
+}
+
+const PredicateInfo::ValueInfo &
+PredicateInfo::getValueInfo(Value *Operand) const {
+ auto OINI = ValueInfoNums.lookup(Operand);
+ assert(OINI != 0 && "Operand was not really in the Value Info Numbers");
+ assert(OINI < ValueInfos.size() &&
+ "Value Info Number greater than size of Value Info Table");
+ return ValueInfos[OINI];
+}
+
+PredicateInfo::PredicateInfo(Function &F, DominatorTree &DT,
+ AssumptionCache &AC)
+ : F(F), DT(DT), AC(AC) {
+ // Push an empty operand info so that we can detect 0 as not finding one
+ ValueInfos.resize(1);
+ buildPredicateInfo();
+}
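
The reserved slot set up in the constructor makes a default-constructed lookup result self-checking; the same pattern in a generic, self-contained sketch (std containers standing in for the LLVM ones):

#include <cassert>
#include <cstddef>
#include <string>
#include <unordered_map>
#include <vector>

struct InfoTable {
  std::unordered_map<std::string, std::size_t> Nums;
  std::vector<int> Infos = std::vector<int>(1); // slot 0 = "absent" sentinel

  int &getOrCreate(const std::string &Key) {
    auto It = Nums.find(Key);
    if (It == Nums.end()) {
      Infos.emplace_back();
      It = Nums.insert({Key, Infos.size() - 1}).first;
    }
    return Infos[It->second];
  }

  int &get(const std::string &Key) {
    // Like DenseMap::lookup, a missing key yields 0 -- the reserved slot.
    auto It = Nums.find(Key);
    std::size_t N = It == Nums.end() ? 0 : It->second;
    assert(N != 0 && "key was never inserted");
    return Infos[N];
  }
};
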
+
+PredicateInfo::~PredicateInfo() {}
+
+void PredicateInfo::verifyPredicateInfo() const {}
+
+char PredicateInfoPrinterLegacyPass::ID = 0;
+
+PredicateInfoPrinterLegacyPass::PredicateInfoPrinterLegacyPass()
+ : FunctionPass(ID) {
+ initializePredicateInfoPrinterLegacyPassPass(
+ *PassRegistry::getPassRegistry());
+}
+
+void PredicateInfoPrinterLegacyPass::getAnalysisUsage(AnalysisUsage &AU) const {
+ AU.setPreservesAll();
+ AU.addRequiredTransitive<DominatorTreeWrapperPass>();
+ AU.addRequired<AssumptionCacheTracker>();
+}
+
+bool PredicateInfoPrinterLegacyPass::runOnFunction(Function &F) {
+ auto &DT = getAnalysis<DominatorTreeWrapperPass>().getDomTree();
+ auto &AC = getAnalysis<AssumptionCacheTracker>().getAssumptionCache(F);
+ auto PredInfo = make_unique<PredicateInfo>(F, DT, AC);
+ PredInfo->print(dbgs());
+ if (VerifyPredicateInfo)
+ PredInfo->verifyPredicateInfo();
+ return false;
+}
+
+PreservedAnalyses PredicateInfoPrinterPass::run(Function &F,
+ FunctionAnalysisManager &AM) {
+ auto &DT = AM.getResult<DominatorTreeAnalysis>(F);
+ auto &AC = AM.getResult<AssumptionAnalysis>(F);
+ OS << "PredicateInfo for function: " << F.getName() << "\n";
+ make_unique<PredicateInfo>(F, DT, AC)->print(OS);
+
+ return PreservedAnalyses::all();
+}
+
+/// \brief An assembly annotator class to print PredicateInfo information in
+/// comments.
+class PredicateInfoAnnotatedWriter : public AssemblyAnnotationWriter {
+ friend class PredicateInfo;
+ const PredicateInfo *PredInfo;
+
+public:
+ PredicateInfoAnnotatedWriter(const PredicateInfo *M) : PredInfo(M) {}
+
+ virtual void emitBasicBlockStartAnnot(const BasicBlock *BB,
+ formatted_raw_ostream &OS) {}
+
+ virtual void emitInstructionAnnot(const Instruction *I,
+ formatted_raw_ostream &OS) {
+ if (const auto *PI = PredInfo->getPredicateInfoFor(I)) {
+ OS << "; Has predicate info\n";
+ if (const auto *PB = dyn_cast<PredicateBranch>(PI)) {
+ OS << "; branch predicate info { TrueEdge: " << PB->TrueEdge
+ << " Comparison:" << *PB->Condition << " Edge: [";
+ PB->From->printAsOperand(OS);
+ OS << ",";
+ PB->To->printAsOperand(OS);
+ OS << "] }\n";
+ } else if (const auto *PS = dyn_cast<PredicateSwitch>(PI)) {
+ OS << "; switch predicate info { CaseValue: " << *PS->CaseValue
+ << " Switch:" << *PS->Switch << " Edge: [";
+ PS->From->printAsOperand(OS);
+ OS << ",";
+ PS->To->printAsOperand(OS);
+ OS << "] }\n";
+ } else if (const auto *PA = dyn_cast<PredicateAssume>(PI)) {
+ OS << "; assume predicate info {"
+ << " Comparison:" << *PA->Condition << " }\n";
+ }
+ }
+ }
+};
+
+void PredicateInfo::print(raw_ostream &OS) const {
+ PredicateInfoAnnotatedWriter Writer(this);
+ F.print(OS, &Writer);
+}
+
+void PredicateInfo::dump() const {
+ PredicateInfoAnnotatedWriter Writer(this);
+ F.print(dbgs(), &Writer);
+}
+
+PreservedAnalyses PredicateInfoVerifierPass::run(Function &F,
+ FunctionAnalysisManager &AM) {
+ auto &DT = AM.getResult<DominatorTreeAnalysis>(F);
+ auto &AC = AM.getResult<AssumptionAnalysis>(F);
+ make_unique<PredicateInfo>(F, DT, AC)->verifyPredicateInfo();
+
+ return PreservedAnalyses::all();
+}
+}
diff --git a/lib/Transforms/Utils/PromoteMemoryToRegister.cpp b/lib/Transforms/Utils/PromoteMemoryToRegister.cpp
index 35faa6f65efd..a33b85c4ee69 100644
--- a/lib/Transforms/Utils/PromoteMemoryToRegister.cpp
+++ b/lib/Transforms/Utils/PromoteMemoryToRegister.cpp
@@ -15,7 +15,6 @@
//
//===----------------------------------------------------------------------===//
-#include "llvm/Transforms/Utils/PromoteMemToReg.h"
#include "llvm/ADT/ArrayRef.h"
#include "llvm/ADT/DenseMap.h"
#include "llvm/ADT/STLExtras.h"
@@ -23,6 +22,7 @@
#include "llvm/ADT/SmallVector.h"
#include "llvm/ADT/Statistic.h"
#include "llvm/Analysis/AliasSetTracker.h"
+#include "llvm/Analysis/AssumptionCache.h"
#include "llvm/Analysis/InstructionSimplify.h"
#include "llvm/Analysis/IteratedDominanceFrontier.h"
#include "llvm/Analysis/ValueTracking.h"
@@ -38,6 +38,7 @@
#include "llvm/IR/Metadata.h"
#include "llvm/IR/Module.h"
#include "llvm/Transforms/Utils/Local.h"
+#include "llvm/Transforms/Utils/PromoteMemToReg.h"
#include <algorithm>
using namespace llvm;
@@ -225,9 +226,6 @@ struct PromoteMem2Reg {
DominatorTree &DT;
DIBuilder DIB;
- /// An AliasSetTracker object to update. If null, don't update it.
- AliasSetTracker *AST;
-
/// A cache of @llvm.assume intrinsics used by SimplifyInstruction.
AssumptionCache *AC;
@@ -269,10 +267,10 @@ struct PromoteMem2Reg {
public:
PromoteMem2Reg(ArrayRef<AllocaInst *> Allocas, DominatorTree &DT,
- AliasSetTracker *AST, AssumptionCache *AC)
+ AssumptionCache *AC)
: Allocas(Allocas.begin(), Allocas.end()), DT(DT),
DIB(*DT.getRoot()->getParent()->getParent(), /*AllowUnresolved*/ false),
- AST(AST), AC(AC) {}
+ AC(AC) {}
void run();
@@ -301,6 +299,18 @@ private:
} // end of anonymous namespace
+/// Given a LoadInst LI, this adds assume(LI != null) after it.
+static void addAssumeNonNull(AssumptionCache *AC, LoadInst *LI) {
+ Function *AssumeIntrinsic =
+ Intrinsic::getDeclaration(LI->getModule(), Intrinsic::assume);
+ ICmpInst *LoadNotNull = new ICmpInst(ICmpInst::ICMP_NE, LI,
+ Constant::getNullValue(LI->getType()));
+ LoadNotNull->insertAfter(LI);
+ CallInst *CI = CallInst::Create(AssumeIntrinsic, {LoadNotNull});
+ CI->insertAfter(LoadNotNull);
+ AC->registerAssumption(CI);
+}
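
A source-level analogue of what addAssumeNonNull preserves (hypothetical example; __builtin_assume is clang's stand-in for the icmp-plus-llvm.assume pair the helper builds):

// When promotion deletes a load tagged !nonnull, the fact would vanish with
// it; restating it as an assume keeps it visible to later passes.
int deref(int **PP) {
  int *V = *PP;                   // was: %V = load ..., !nonnull
  __builtin_assume(V != nullptr); // inserted: icmp ne + call @llvm.assume
  return *V;                      // uses of the erased load now use V
}
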
+
static void removeLifetimeIntrinsicUsers(AllocaInst *AI) {
// Knowing that this alloca is promotable, we know that it's safe to kill all
// instructions except for load and store.
@@ -334,9 +344,8 @@ static void removeLifetimeIntrinsicUsers(AllocaInst *AI) {
/// and thus must be phi-ed with undef. We fall back to the standard alloca
/// promotion algorithm in that case.
static bool rewriteSingleStoreAlloca(AllocaInst *AI, AllocaInfo &Info,
- LargeBlockInfo &LBI,
- DominatorTree &DT,
- AliasSetTracker *AST) {
+ LargeBlockInfo &LBI, DominatorTree &DT,
+ AssumptionCache *AC) {
StoreInst *OnlyStore = Info.OnlyStore;
bool StoringGlobalVal = !isa<Instruction>(OnlyStore->getOperand(0));
BasicBlock *StoreBB = OnlyStore->getParent();
@@ -387,9 +396,15 @@ static bool rewriteSingleStoreAlloca(AllocaInst *AI, AllocaInfo &Info,
// code.
if (ReplVal == LI)
ReplVal = UndefValue::get(LI->getType());
+
+ // If the load was marked as nonnull, we don't want to lose
+ // that information when we erase this load. So we preserve
+ // it with an assume.
+ if (AC && LI->getMetadata(LLVMContext::MD_nonnull) &&
+ !llvm::isKnownNonNullAt(ReplVal, LI, &DT))
+ addAssumeNonNull(AC, LI);
+
LI->replaceAllUsesWith(ReplVal);
- if (AST && LI->getType()->isPointerTy())
- AST->deleteValue(LI);
LI->eraseFromParent();
LBI.deleteValue(LI);
}
@@ -410,8 +425,6 @@ static bool rewriteSingleStoreAlloca(AllocaInst *AI, AllocaInfo &Info,
Info.OnlyStore->eraseFromParent();
LBI.deleteValue(Info.OnlyStore);
- if (AST)
- AST->deleteValue(AI);
AI->eraseFromParent();
LBI.deleteValue(AI);
return true;
@@ -435,7 +448,8 @@ static bool rewriteSingleStoreAlloca(AllocaInst *AI, AllocaInfo &Info,
/// }
static bool promoteSingleBlockAlloca(AllocaInst *AI, const AllocaInfo &Info,
LargeBlockInfo &LBI,
- AliasSetTracker *AST) {
+ DominatorTree &DT,
+ AssumptionCache *AC) {
// The trickiest case to handle is when we have large blocks. Because of this,
// this code is optimized assuming that large blocks happen. This does not
// significantly pessimize the small block case. This uses LargeBlockInfo to
@@ -476,13 +490,18 @@ static bool promoteSingleBlockAlloca(AllocaInst *AI, const AllocaInfo &Info,
// There is no store before this load, bail out (load may be affected
// by the following stores - see main comment).
return false;
- }
- else
+ } else {
// Otherwise, there was a store before this load, the load takes its value.
- LI->replaceAllUsesWith(std::prev(I)->second->getOperand(0));
+ // Note, if the load was marked as nonnull we don't want to lose that
+ // information when we erase it. So we preserve it with an assume.
+ Value *ReplVal = std::prev(I)->second->getOperand(0);
+ if (AC && LI->getMetadata(LLVMContext::MD_nonnull) &&
+ !llvm::isKnownNonNullAt(ReplVal, LI, &DT))
+ addAssumeNonNull(AC, LI);
+
+ LI->replaceAllUsesWith(ReplVal);
+ }
- if (AST && LI->getType()->isPointerTy())
- AST->deleteValue(LI);
LI->eraseFromParent();
LBI.deleteValue(LI);
}
@@ -499,8 +518,6 @@ static bool promoteSingleBlockAlloca(AllocaInst *AI, const AllocaInfo &Info,
LBI.deleteValue(SI);
}
- if (AST)
- AST->deleteValue(AI);
AI->eraseFromParent();
LBI.deleteValue(AI);
@@ -517,8 +534,6 @@ static bool promoteSingleBlockAlloca(AllocaInst *AI, const AllocaInfo &Info,
void PromoteMem2Reg::run() {
Function &F = *DT.getRoot()->getParent();
- if (AST)
- PointerAllocaValues.resize(Allocas.size());
AllocaDbgDeclares.resize(Allocas.size());
AllocaInfo Info;
@@ -536,8 +551,6 @@ void PromoteMem2Reg::run() {
if (AI->use_empty()) {
// If there are no uses of the alloca, just delete it now.
- if (AST)
- AST->deleteValue(AI);
AI->eraseFromParent();
// Remove the alloca from the Allocas list, since it has been processed
@@ -553,7 +566,7 @@ void PromoteMem2Reg::run() {
// If there is only a single store to this value, replace any loads of
// it that are directly dominated by the definition with the value stored.
if (Info.DefiningBlocks.size() == 1) {
- if (rewriteSingleStoreAlloca(AI, Info, LBI, DT, AST)) {
+ if (rewriteSingleStoreAlloca(AI, Info, LBI, DT, AC)) {
// The alloca has been processed, move on.
RemoveFromAllocasList(AllocaNum);
++NumSingleStore;
@@ -564,7 +577,7 @@ void PromoteMem2Reg::run() {
// If the alloca is only read and written in one basic block, just perform a
// linear sweep over the block to eliminate it.
if (Info.OnlyUsedInOneBlock &&
- promoteSingleBlockAlloca(AI, Info, LBI, AST)) {
+ promoteSingleBlockAlloca(AI, Info, LBI, DT, AC)) {
// The alloca has been processed, move on.
RemoveFromAllocasList(AllocaNum);
continue;
@@ -578,11 +591,6 @@ void PromoteMem2Reg::run() {
BBNumbers[&BB] = ID++;
}
- // If we have an AST to keep updated, remember some pointer value that is
- // stored into the alloca.
- if (AST)
- PointerAllocaValues[AllocaNum] = Info.AllocaPointerVal;
-
// Remember the dbg.declare intrinsic describing this alloca, if any.
if (Info.DbgDeclare)
AllocaDbgDeclares[AllocaNum] = Info.DbgDeclare;
@@ -662,8 +670,6 @@ void PromoteMem2Reg::run() {
// tree. Just delete the users now.
if (!A->use_empty())
A->replaceAllUsesWith(UndefValue::get(A->getType()));
- if (AST)
- AST->deleteValue(A);
A->eraseFromParent();
}
@@ -694,8 +700,6 @@ void PromoteMem2Reg::run() {
// If this PHI node merges one value and/or undefs, get the value.
if (Value *V = SimplifyInstruction(PN, DL, nullptr, &DT, AC)) {
- if (AST && PN->getType()->isPointerTy())
- AST->deleteValue(PN);
PN->replaceAllUsesWith(V);
PN->eraseFromParent();
NewPhiNodes.erase(I++);
@@ -863,10 +867,6 @@ bool PromoteMem2Reg::QueuePhiNode(BasicBlock *BB, unsigned AllocaNo,
&BB->front());
++NumPHIInsert;
PhiToAllocaMap[PN] = AllocaNo;
-
- if (AST && PN->getType()->isPointerTy())
- AST->copyValue(PointerAllocaValues[AllocaNo], PN);
-
return true;
}
@@ -940,10 +940,15 @@ NextIteration:
Value *V = IncomingVals[AI->second];
+ // If the load was marked as nonnull, we don't want to lose
+ // that information when we erase this load. So we preserve
+ // it with an assume.
+ if (AC && LI->getMetadata(LLVMContext::MD_nonnull) &&
+ !llvm::isKnownNonNullAt(V, LI, &DT))
+ addAssumeNonNull(AC, LI);
+
// Anything using the load now uses the current value.
LI->replaceAllUsesWith(V);
- if (AST && LI->getType()->isPointerTy())
- AST->deleteValue(LI);
BB->getInstList().erase(LI);
} else if (StoreInst *SI = dyn_cast<StoreInst>(I)) {
// Delete this instruction and mark the name as the current holder of the
@@ -987,10 +992,10 @@ NextIteration:
}
void llvm::PromoteMemToReg(ArrayRef<AllocaInst *> Allocas, DominatorTree &DT,
- AliasSetTracker *AST, AssumptionCache *AC) {
+ AssumptionCache *AC) {
// If there is nothing to do, bail out...
if (Allocas.empty())
return;
- PromoteMem2Reg(Allocas, DT, AST, AC).run();
+ PromoteMem2Reg(Allocas, DT, AC).run();
}
diff --git a/lib/Transforms/Utils/SSAUpdater.cpp b/lib/Transforms/Utils/SSAUpdater.cpp
index 8e93ee757a15..8b6a2c3766d2 100644
--- a/lib/Transforms/Utils/SSAUpdater.cpp
+++ b/lib/Transforms/Utils/SSAUpdater.cpp
@@ -11,20 +11,29 @@
//
//===----------------------------------------------------------------------===//
-#include "llvm/Transforms/Utils/SSAUpdater.h"
#include "llvm/ADT/DenseMap.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/ADT/STLExtras.h"
+#include "llvm/ADT/StringRef.h"
#include "llvm/ADT/TinyPtrVector.h"
#include "llvm/Analysis/InstructionSimplify.h"
+#include "llvm/IR/BasicBlock.h"
#include "llvm/IR/CFG.h"
#include "llvm/IR/Constants.h"
+#include "llvm/IR/DebugLoc.h"
+#include "llvm/IR/Instruction.h"
#include "llvm/IR/Instructions.h"
-#include "llvm/IR/IntrinsicInst.h"
#include "llvm/IR/Module.h"
+#include "llvm/IR/Use.h"
+#include "llvm/IR/Value.h"
+#include "llvm/IR/ValueHandle.h"
+#include "llvm/Support/Casting.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/raw_ostream.h"
-#include "llvm/Transforms/Utils/BasicBlockUtils.h"
-#include "llvm/Transforms/Utils/Local.h"
+#include "llvm/Transforms/Utils/SSAUpdater.h"
#include "llvm/Transforms/Utils/SSAUpdaterImpl.h"
+#include <cassert>
+#include <utility>
using namespace llvm;
@@ -36,7 +45,7 @@ static AvailableValsTy &getAvailableVals(void *AV) {
}
SSAUpdater::SSAUpdater(SmallVectorImpl<PHINode*> *NewPHI)
- : AV(nullptr), ProtoType(nullptr), ProtoName(), InsertedPHIs(NewPHI) {}
+ : InsertedPHIs(NewPHI) {}
SSAUpdater::~SSAUpdater() {
delete static_cast<AvailableValsTy*>(AV);
@@ -205,6 +214,7 @@ void SSAUpdater::RewriteUseAfterInsertions(Use &U) {
}
namespace llvm {
+
template<>
class SSAUpdaterTraits<SSAUpdater> {
public:
@@ -230,6 +240,7 @@ public:
PHI_iterator &operator++() { ++idx; return *this; }
bool operator==(const PHI_iterator& x) const { return idx == x.idx; }
bool operator!=(const PHI_iterator& x) const { return !operator==(x); }
+
Value *getIncomingValue() { return PHI->getIncomingValue(idx); }
BasicBlock *getIncomingBlock() { return PHI->getIncomingBlock(idx); }
};
@@ -303,7 +314,7 @@ public:
}
};
-} // End llvm namespace
+} // end namespace llvm
/// Check to see if AvailableVals has an entry for the specified BB and if so,
/// return it. If not, construct SSA form by first calculating the required
@@ -337,14 +348,12 @@ LoadAndStorePromoter(ArrayRef<const Instruction*> Insts,
SSA.Initialize(SomeVal->getType(), BaseName);
}
-
void LoadAndStorePromoter::
run(const SmallVectorImpl<Instruction*> &Insts) const {
-
// First step: bucket up uses of the alloca by the block they occur in.
// This is important because we have to handle multiple defs/uses in a block
// ourselves: SSAUpdater is purely for cross-block references.
- DenseMap<BasicBlock*, TinyPtrVector<Instruction*> > UsesByBlock;
+ DenseMap<BasicBlock*, TinyPtrVector<Instruction*>> UsesByBlock;
for (Instruction *User : Insts)
UsesByBlock[User->getParent()].push_back(User);
diff --git a/lib/Transforms/Utils/SimplifyCFG.cpp b/lib/Transforms/Utils/SimplifyCFG.cpp
index 7b0bddbbb831..127a44df5344 100644
--- a/lib/Transforms/Utils/SimplifyCFG.cpp
+++ b/lib/Transforms/Utils/SimplifyCFG.cpp
@@ -22,6 +22,7 @@
#include "llvm/ADT/SmallVector.h"
#include "llvm/ADT/Statistic.h"
#include "llvm/ADT/STLExtras.h"
+#include "llvm/Analysis/AssumptionCache.h"
#include "llvm/Analysis/ConstantFolding.h"
#include "llvm/Analysis/EHPersonalities.h"
#include "llvm/Analysis/InstructionSimplify.h"
@@ -169,6 +170,8 @@ class SimplifyCFGOpt {
unsigned BonusInstThreshold;
AssumptionCache *AC;
SmallPtrSetImpl<BasicBlock *> *LoopHeaders;
+ // See comments in SimplifyCFGOpt::SimplifySwitch.
+ bool LateSimplifyCFG;
Value *isValueEqualityComparison(TerminatorInst *TI);
BasicBlock *GetValueEqualityComparisonCases(
TerminatorInst *TI, std::vector<ValueEqualityComparisonCase> &Cases);
@@ -192,9 +195,10 @@ class SimplifyCFGOpt {
public:
SimplifyCFGOpt(const TargetTransformInfo &TTI, const DataLayout &DL,
unsigned BonusInstThreshold, AssumptionCache *AC,
- SmallPtrSetImpl<BasicBlock *> *LoopHeaders)
+ SmallPtrSetImpl<BasicBlock *> *LoopHeaders,
+ bool LateSimplifyCFG)
: TTI(TTI), DL(DL), BonusInstThreshold(BonusInstThreshold), AC(AC),
- LoopHeaders(LoopHeaders) {}
+ LoopHeaders(LoopHeaders), LateSimplifyCFG(LateSimplifyCFG) {}
bool run(BasicBlock *BB);
};
@@ -710,10 +714,9 @@ BasicBlock *SimplifyCFGOpt::GetValueEqualityComparisonCases(
TerminatorInst *TI, std::vector<ValueEqualityComparisonCase> &Cases) {
if (SwitchInst *SI = dyn_cast<SwitchInst>(TI)) {
Cases.reserve(SI->getNumCases());
- for (SwitchInst::CaseIt i = SI->case_begin(), e = SI->case_end(); i != e;
- ++i)
- Cases.push_back(
- ValueEqualityComparisonCase(i.getCaseValue(), i.getCaseSuccessor()));
+ for (auto Case : SI->cases())
+ Cases.push_back(ValueEqualityComparisonCase(Case.getCaseValue(),
+ Case.getCaseSuccessor()));
return SI->getDefaultDest();
}
@@ -846,12 +849,12 @@ bool SimplifyCFGOpt::SimplifyEqualityComparisonWithOnlyPredecessor(
}
for (SwitchInst::CaseIt i = SI->case_end(), e = SI->case_begin(); i != e;) {
--i;
- if (DeadCases.count(i.getCaseValue())) {
+ if (DeadCases.count(i->getCaseValue())) {
if (HasWeight) {
- std::swap(Weights[i.getCaseIndex() + 1], Weights.back());
+ std::swap(Weights[i->getCaseIndex() + 1], Weights.back());
Weights.pop_back();
}
- i.getCaseSuccessor()->removePredecessor(TI->getParent());
+ i->getCaseSuccessor()->removePredecessor(TI->getParent());
SI->removeCase(i);
}
}
@@ -996,8 +999,7 @@ bool SimplifyCFGOpt::FoldValueComparisonIntoPredecessors(TerminatorInst *TI,
SmallSetVector<BasicBlock*, 4> FailBlocks;
if (!SafeToMergeTerminators(TI, PTI, &FailBlocks)) {
for (auto *Succ : FailBlocks) {
- std::vector<BasicBlock*> Blocks = { TI->getParent() };
- if (!SplitBlockPredecessors(Succ, Blocks, ".fold.split"))
+ if (!SplitBlockPredecessors(Succ, TI->getParent(), ".fold.split"))
return false;
}
}
@@ -1280,7 +1282,7 @@ static bool HoistThenElseCodeToIf(BranchInst *BI,
if (!isa<CallInst>(I1))
I1->setDebugLoc(
DILocation::getMergedLocation(I1->getDebugLoc(), I2->getDebugLoc()));
-
+
I2->eraseFromParent();
Changed = true;
@@ -1472,29 +1474,28 @@ static bool canSinkInstructions(
return false;
}
+ // Because SROA can't handle speculating stores of selects, try not
+ // to sink loads or stores of allocas when we'd have to create a PHI for
+ // the address operand. Also, because it is likely that loads or stores
+ // of allocas will disappear when Mem2Reg/SROA is run, don't sink them.
+ // This can cause code churn which can have unintended consequences down
+ // the line - see https://llvm.org/bugs/show_bug.cgi?id=30244.
+ // FIXME: This is a workaround for a deficiency in SROA - see
+ // https://llvm.org/bugs/show_bug.cgi?id=30188
+ if (isa<StoreInst>(I0) && any_of(Insts, [](const Instruction *I) {
+ return isa<AllocaInst>(I->getOperand(1));
+ }))
+ return false;
+ if (isa<LoadInst>(I0) && any_of(Insts, [](const Instruction *I) {
+ return isa<AllocaInst>(I->getOperand(0));
+ }))
+ return false;
+
for (unsigned OI = 0, OE = I0->getNumOperands(); OI != OE; ++OI) {
if (I0->getOperand(OI)->getType()->isTokenTy())
// Don't touch any operand of token type.
return false;
- // Because SROA can't handle speculating stores of selects, try not
- // to sink loads or stores of allocas when we'd have to create a PHI for
- // the address operand. Also, because it is likely that loads or stores
- // of allocas will disappear when Mem2Reg/SROA is run, don't sink them.
- // This can cause code churn which can have unintended consequences down
- // the line - see https://llvm.org/bugs/show_bug.cgi?id=30244.
- // FIXME: This is a workaround for a deficiency in SROA - see
- // https://llvm.org/bugs/show_bug.cgi?id=30188
- if (OI == 1 && isa<StoreInst>(I0) &&
- any_of(Insts, [](const Instruction *I) {
- return isa<AllocaInst>(I->getOperand(1));
- }))
- return false;
- if (OI == 0 && isa<LoadInst>(I0) && any_of(Insts, [](const Instruction *I) {
- return isa<AllocaInst>(I->getOperand(0));
- }))
- return false;
-
auto SameAsI0 = [&I0, OI](const Instruction *I) {
assert(I->getNumOperands() == I0->getNumOperands());
return I->getOperand(OI) == I0->getOperand(OI);
@@ -1546,7 +1547,7 @@ static bool sinkLastInstruction(ArrayRef<BasicBlock*> Blocks) {
}))
return false;
}
-
+
// We don't need to do any more checking here; canSinkLastInstruction should
// have done it all for us.
SmallVector<Value*, 4> NewOperands;
@@ -1653,7 +1654,7 @@ namespace {
bool isValid() const {
return !Fail;
}
-
+
void operator -- () {
if (Fail)
return;
@@ -1699,7 +1700,7 @@ static bool SinkThenElseCodeToEnd(BranchInst *BI1) {
// / \
// [f(1)] [if]
// | | \
- // | | \
+ // | | |
// | [f(2)]|
// \ | /
// [ end ]
@@ -1737,7 +1738,7 @@ static bool SinkThenElseCodeToEnd(BranchInst *BI1) {
}
if (UnconditionalPreds.size() < 2)
return false;
-
+
bool Changed = false;
// We take a two-step approach to tail sinking. First we scan from the end of
// each block upwards in lockstep. If the n'th instruction from the end of each
@@ -1767,7 +1768,7 @@ static bool SinkThenElseCodeToEnd(BranchInst *BI1) {
unsigned NumPHIInsts = NumPHIdValues / UnconditionalPreds.size();
if ((NumPHIdValues % UnconditionalPreds.size()) != 0)
NumPHIInsts++;
-
+
return NumPHIInsts <= 1;
};
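The NumPHIInsts bound above is a ceiling division: every value PHI-d across the predecessors costs one PHI node, and any remainder forces an extra one. A minimal standalone sketch of that arithmetic (plain C++, not part of this patch; the helper name is illustrative):
#include <cassert>

// Hypothetical helper mirroring the rounding-up division in the lambda.
unsigned numPHIInsts(unsigned NumPHIdValues, unsigned NumPreds) {
  unsigned N = NumPHIdValues / NumPreds;
  if (NumPHIdValues % NumPreds != 0)
    ++N; // a partial group still needs a whole PHI node
  return N;
}

int main() {
  assert(numPHIInsts(4, 2) == 2); // evenly divided
  assert(numPHIInsts(5, 2) == 3); // remainder rounds up
  return 0;
}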
@@ -1790,7 +1791,7 @@ static bool SinkThenElseCodeToEnd(BranchInst *BI1) {
}
if (!Profitable)
return false;
-
+
DEBUG(dbgs() << "SINK: Splitting edge\n");
// We have a conditional edge and we're going to sink some instructions.
// Insert a new block postdominating all blocks we're going to sink from.
@@ -1800,7 +1801,7 @@ static bool SinkThenElseCodeToEnd(BranchInst *BI1) {
return false;
Changed = true;
}
-
+
// Now that we've analyzed all potential sinking candidates, perform the
// actual sink. We iteratively sink the last non-terminator of the source
// blocks into their common successor unless doing so would require too
@@ -1826,7 +1827,7 @@ static bool SinkThenElseCodeToEnd(BranchInst *BI1) {
DEBUG(dbgs() << "SINK: stopping here, too many PHIs would be created!\n");
break;
}
-
+
if (!sinkLastInstruction(UnconditionalPreds))
return Changed;
NumSinkCommons++;
@@ -2078,6 +2079,9 @@ static bool SpeculativelyExecuteBB(BranchInst *BI, BasicBlock *ThenBB,
Value *S = Builder.CreateSelect(
BrCond, TrueV, FalseV, TrueV->getName() + "." + FalseV->getName(), BI);
SpeculatedStore->setOperand(0, S);
+ SpeculatedStore->setDebugLoc(
+ DILocation::getMergedLocation(
+ BI->getDebugLoc(), SpeculatedStore->getDebugLoc()));
}
// Metadata can be dependent on the condition we are hoisting above.
@@ -2147,7 +2151,8 @@ static bool BlockIsSimpleEnoughToThreadThrough(BasicBlock *BB) {
/// If we have a conditional branch on a PHI node value that is defined in the
/// same block as the branch and if any PHI entries are constants, thread edges
/// corresponding to that entry to be branches to their ultimate destination.
-static bool FoldCondBranchOnPHI(BranchInst *BI, const DataLayout &DL) {
+static bool FoldCondBranchOnPHI(BranchInst *BI, const DataLayout &DL,
+ AssumptionCache *AC) {
BasicBlock *BB = BI->getParent();
PHINode *PN = dyn_cast<PHINode>(BI->getCondition());
// NOTE: we currently cannot transform this case if the PHI node is used
@@ -2239,6 +2244,11 @@ static bool FoldCondBranchOnPHI(BranchInst *BI, const DataLayout &DL) {
// Insert the new instruction into its new home.
if (N)
EdgeBB->getInstList().insert(InsertPt, N);
+
+ // Register the new instruction with the assumption cache if necessary.
+ if (auto *II = dyn_cast_or_null<IntrinsicInst>(N))
+ if (II->getIntrinsicID() == Intrinsic::assume)
+ AC->registerAssumption(II);
}
// Loop over all of the edges from PredBB to BB, changing them to branch
@@ -2251,7 +2261,7 @@ static bool FoldCondBranchOnPHI(BranchInst *BI, const DataLayout &DL) {
}
// Recurse, simplifying any other constants.
- return FoldCondBranchOnPHI(BI, DL) | true;
+ return FoldCondBranchOnPHI(BI, DL, AC) | true;
}
return false;
@@ -3433,8 +3443,8 @@ static bool SimplifySwitchOnSelect(SwitchInst *SI, SelectInst *Select) {
// Find the relevant condition and destinations.
Value *Condition = Select->getCondition();
- BasicBlock *TrueBB = SI->findCaseValue(TrueVal).getCaseSuccessor();
- BasicBlock *FalseBB = SI->findCaseValue(FalseVal).getCaseSuccessor();
+ BasicBlock *TrueBB = SI->findCaseValue(TrueVal)->getCaseSuccessor();
+ BasicBlock *FalseBB = SI->findCaseValue(FalseVal)->getCaseSuccessor();
// Get weight for TrueBB and FalseBB.
uint32_t TrueWeight = 0, FalseWeight = 0;
@@ -3444,9 +3454,9 @@ static bool SimplifySwitchOnSelect(SwitchInst *SI, SelectInst *Select) {
GetBranchWeights(SI, Weights);
if (Weights.size() == 1 + SI->getNumCases()) {
TrueWeight =
- (uint32_t)Weights[SI->findCaseValue(TrueVal).getSuccessorIndex()];
+ (uint32_t)Weights[SI->findCaseValue(TrueVal)->getSuccessorIndex()];
FalseWeight =
- (uint32_t)Weights[SI->findCaseValue(FalseVal).getSuccessorIndex()];
+ (uint32_t)Weights[SI->findCaseValue(FalseVal)->getSuccessorIndex()];
}
}
@@ -4148,15 +4158,16 @@ bool SimplifyCFGOpt::SimplifyUnreachable(UnreachableInst *UI) {
}
}
} else if (auto *SI = dyn_cast<SwitchInst>(TI)) {
- for (SwitchInst::CaseIt i = SI->case_begin(), e = SI->case_end(); i != e;
- ++i)
- if (i.getCaseSuccessor() == BB) {
- BB->removePredecessor(SI->getParent());
- SI->removeCase(i);
- --i;
- --e;
- Changed = true;
+ for (auto i = SI->case_begin(), e = SI->case_end(); i != e;) {
+ if (i->getCaseSuccessor() != BB) {
+ ++i;
+ continue;
}
+ BB->removePredecessor(SI->getParent());
+ i = SI->removeCase(i);
+ e = SI->case_end();
+ Changed = true;
+ }
} else if (auto *II = dyn_cast<InvokeInst>(TI)) {
if (II->getUnwindDest() == BB) {
removeUnwindEdge(TI->getParent());
@@ -4239,18 +4250,18 @@ static bool TurnSwitchRangeIntoICmp(SwitchInst *SI, IRBuilder<> &Builder) {
SmallVector<ConstantInt *, 16> CasesA;
SmallVector<ConstantInt *, 16> CasesB;
- for (SwitchInst::CaseIt I : SI->cases()) {
- BasicBlock *Dest = I.getCaseSuccessor();
+ for (auto Case : SI->cases()) {
+ BasicBlock *Dest = Case.getCaseSuccessor();
if (!DestA)
DestA = Dest;
if (Dest == DestA) {
- CasesA.push_back(I.getCaseValue());
+ CasesA.push_back(Case.getCaseValue());
continue;
}
if (!DestB)
DestB = Dest;
if (Dest == DestB) {
- CasesB.push_back(I.getCaseValue());
+ CasesB.push_back(Case.getCaseValue());
continue;
}
return false; // More than two destinations.
@@ -4375,7 +4386,7 @@ static bool EliminateDeadSwitchCases(SwitchInst *SI, AssumptionCache *AC,
bool HasDefault =
!isa<UnreachableInst>(SI->getDefaultDest()->getFirstNonPHIOrDbg());
const unsigned NumUnknownBits =
- Bits - (KnownZero.Or(KnownOne)).countPopulation();
+ Bits - (KnownZero | KnownOne).countPopulation();
assert(NumUnknownBits <= Bits);
if (HasDefault && DeadCases.empty() &&
NumUnknownBits < 64 /* avoid overflow */ &&
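For the known-bits arithmetic in this hunk: KnownZero | KnownOne collects every bit the analysis has proven, so subtracting its population count from the bit width yields the number of still-unknown bits, and 2^NumUnknownBits bounds how many distinct condition values remain reachable. A standalone sketch (plain C++, not LLVM's APInt; the 8-bit known masks are made up for illustration):
#include <bitset>
#include <iostream>

int main() {
  // Suppose the analysis proved bit 7 is zero and bits 0-1 are one.
  unsigned KnownZero = 0x80, KnownOne = 0x03, Bits = 8;
  unsigned NumUnknownBits =
      Bits - std::bitset<8>(KnownZero | KnownOne).count();
  // If the switch has a live case for each reachable value, the default
  // destination can be proven dead.
  std::cout << "unknown bits: " << NumUnknownBits              // 5
            << ", reachable values: " << (1u << NumUnknownBits) // 32
            << '\n';
  return 0;
}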
@@ -4400,17 +4411,17 @@ static bool EliminateDeadSwitchCases(SwitchInst *SI, AssumptionCache *AC,
// Remove dead cases from the switch.
for (ConstantInt *DeadCase : DeadCases) {
- SwitchInst::CaseIt Case = SI->findCaseValue(DeadCase);
- assert(Case != SI->case_default() &&
+ SwitchInst::CaseIt CaseI = SI->findCaseValue(DeadCase);
+ assert(CaseI != SI->case_default() &&
"Case was not found. Probably mistake in DeadCases forming.");
if (HasWeight) {
- std::swap(Weights[Case.getCaseIndex() + 1], Weights.back());
+ std::swap(Weights[CaseI->getCaseIndex() + 1], Weights.back());
Weights.pop_back();
}
// Prune unused values from PHI nodes.
- Case.getCaseSuccessor()->removePredecessor(SI->getParent());
- SI->removeCase(Case);
+ CaseI->getCaseSuccessor()->removePredecessor(SI->getParent());
+ SI->removeCase(CaseI);
}
if (HasWeight && Weights.size() >= 2) {
SmallVector<uint32_t, 8> MDWeights(Weights.begin(), Weights.end());
@@ -4464,10 +4475,9 @@ static bool ForwardSwitchConditionToPHI(SwitchInst *SI) {
typedef DenseMap<PHINode *, SmallVector<int, 4>> ForwardingNodesMap;
ForwardingNodesMap ForwardingNodes;
- for (SwitchInst::CaseIt I = SI->case_begin(), E = SI->case_end(); I != E;
- ++I) {
- ConstantInt *CaseValue = I.getCaseValue();
- BasicBlock *CaseDest = I.getCaseSuccessor();
+ for (auto Case : SI->cases()) {
+ ConstantInt *CaseValue = Case.getCaseValue();
+ BasicBlock *CaseDest = Case.getCaseSuccessor();
int PhiIndex;
PHINode *PHI =
@@ -5202,8 +5212,8 @@ static bool SwitchToLookupTable(SwitchInst *SI, IRBuilder<> &Builder,
// common destination, as well as the min and max case values.
assert(SI->case_begin() != SI->case_end());
SwitchInst::CaseIt CI = SI->case_begin();
- ConstantInt *MinCaseVal = CI.getCaseValue();
- ConstantInt *MaxCaseVal = CI.getCaseValue();
+ ConstantInt *MinCaseVal = CI->getCaseValue();
+ ConstantInt *MaxCaseVal = CI->getCaseValue();
BasicBlock *CommonDest = nullptr;
typedef SmallVector<std::pair<ConstantInt *, Constant *>, 4> ResultListTy;
@@ -5213,7 +5223,7 @@ static bool SwitchToLookupTable(SwitchInst *SI, IRBuilder<> &Builder,
SmallVector<PHINode *, 4> PHIs;
for (SwitchInst::CaseIt E = SI->case_end(); CI != E; ++CI) {
- ConstantInt *CaseVal = CI.getCaseValue();
+ ConstantInt *CaseVal = CI->getCaseValue();
if (CaseVal->getValue().slt(MinCaseVal->getValue()))
MinCaseVal = CaseVal;
if (CaseVal->getValue().sgt(MaxCaseVal->getValue()))
@@ -5222,7 +5232,7 @@ static bool SwitchToLookupTable(SwitchInst *SI, IRBuilder<> &Builder,
// Resulting value at phi nodes for this case value.
typedef SmallVector<std::pair<PHINode *, Constant *>, 4> ResultsTy;
ResultsTy Results;
- if (!GetCaseResults(SI, CaseVal, CI.getCaseSuccessor(), &CommonDest,
+ if (!GetCaseResults(SI, CaseVal, CI->getCaseSuccessor(), &CommonDest,
Results, DL, TTI))
return false;
@@ -5503,11 +5513,10 @@ static bool ReduceSwitchRange(SwitchInst *SI, IRBuilder<> &Builder,
auto *Rot = Builder.CreateOr(LShr, Shl);
SI->replaceUsesOfWith(SI->getCondition(), Rot);
- for (SwitchInst::CaseIt C = SI->case_begin(), E = SI->case_end(); C != E;
- ++C) {
- auto *Orig = C.getCaseValue();
+ for (auto Case : SI->cases()) {
+ auto *Orig = Case.getCaseValue();
auto Sub = Orig->getValue() - APInt(Ty->getBitWidth(), Base);
- C.setValue(
+ Case.setValue(
cast<ConstantInt>(ConstantInt::get(Ty, Sub.lshr(ShiftC->getValue()))));
}
return true;
@@ -5553,7 +5562,12 @@ bool SimplifyCFGOpt::SimplifySwitch(SwitchInst *SI, IRBuilder<> &Builder) {
if (ForwardSwitchConditionToPHI(SI))
return SimplifyCFG(BB, TTI, BonusInstThreshold, AC) | true;
- if (SwitchToLookupTable(SI, Builder, DL, TTI))
+ // The conversion from switch to lookup tables results in difficult-to-analyze
+ // code and makes pruning branches much harder. This is a problem if the
+ // switch expression itself can still be restricted as a result of inlining
+ // or CVP. Therefore, only apply this transformation during late steps of
+ // the optimisation chain.
+ if (LateSimplifyCFG && SwitchToLookupTable(SI, Builder, DL, TTI))
return SimplifyCFG(BB, TTI, BonusInstThreshold, AC) | true;
if (ReduceSwitchRange(SI, Builder, DL, TTI))
@@ -5833,7 +5847,7 @@ bool SimplifyCFGOpt::SimplifyCondBranch(BranchInst *BI, IRBuilder<> &Builder) {
// through this block if any PHI node entries are constants.
if (PHINode *PN = dyn_cast<PHINode>(BI->getCondition()))
if (PN->getParent() == BI->getParent())
- if (FoldCondBranchOnPHI(BI, DL))
+ if (FoldCondBranchOnPHI(BI, DL, AC))
return SimplifyCFG(BB, TTI, BonusInstThreshold, AC) | true;
// Scan predecessor blocks for conditional branches.
@@ -6012,8 +6026,9 @@ bool SimplifyCFGOpt::run(BasicBlock *BB) {
///
bool llvm::SimplifyCFG(BasicBlock *BB, const TargetTransformInfo &TTI,
unsigned BonusInstThreshold, AssumptionCache *AC,
- SmallPtrSetImpl<BasicBlock *> *LoopHeaders) {
+ SmallPtrSetImpl<BasicBlock *> *LoopHeaders,
+ bool LateSimplifyCFG) {
return SimplifyCFGOpt(TTI, BB->getModule()->getDataLayout(),
- BonusInstThreshold, AC, LoopHeaders)
+ BonusInstThreshold, AC, LoopHeaders, LateSimplifyCFG)
.run(BB);
}
diff --git a/lib/Transforms/Utils/SimplifyIndVar.cpp b/lib/Transforms/Utils/SimplifyIndVar.cpp
index 6b1d3dc41330..a4cc6a031ad4 100644
--- a/lib/Transforms/Utils/SimplifyIndVar.cpp
+++ b/lib/Transforms/Utils/SimplifyIndVar.cpp
@@ -35,6 +35,9 @@ using namespace llvm;
STATISTIC(NumElimIdentity, "Number of IV identities eliminated");
STATISTIC(NumElimOperand, "Number of IV operands folded into a use");
STATISTIC(NumElimRem , "Number of IV remainder operations eliminated");
+STATISTIC(
+ NumSimplifiedSDiv,
+ "Number of IV signed division operations converted to unsigned division");
STATISTIC(NumElimCmp , "Number of IV comparisons eliminated");
namespace {
@@ -75,6 +78,7 @@ namespace {
void eliminateIVComparison(ICmpInst *ICmp, Value *IVOperand);
void eliminateIVRemainder(BinaryOperator *Rem, Value *IVOperand,
bool IsSigned);
+ bool eliminateSDiv(BinaryOperator *SDiv);
bool strengthenOverflowingOperation(BinaryOperator *OBO, Value *IVOperand);
};
}
@@ -265,6 +269,33 @@ void SimplifyIndvar::eliminateIVComparison(ICmpInst *ICmp, Value *IVOperand) {
Changed = true;
}
+bool SimplifyIndvar::eliminateSDiv(BinaryOperator *SDiv) {
+ // Get the SCEVs for the SDiv operands.
+ auto *N = SE->getSCEV(SDiv->getOperand(0));
+ auto *D = SE->getSCEV(SDiv->getOperand(1));
+
+ // Simplify unnecessary loops away.
+ const Loop *L = LI->getLoopFor(SDiv->getParent());
+ N = SE->getSCEVAtScope(N, L);
+ D = SE->getSCEVAtScope(D, L);
+
+ // Replace sdiv by udiv if both of the operands are non-negative
+ if (SE->isKnownNonNegative(N) && SE->isKnownNonNegative(D)) {
+ auto *UDiv = BinaryOperator::Create(
+ BinaryOperator::UDiv, SDiv->getOperand(0), SDiv->getOperand(1),
+ SDiv->getName() + ".udiv", SDiv);
+ UDiv->setIsExact(SDiv->isExact());
+ SDiv->replaceAllUsesWith(UDiv);
+ DEBUG(dbgs() << "INDVARS: Simplified sdiv: " << *SDiv << '\n');
+ ++NumSimplifiedSDiv;
+ Changed = true;
+ DeadInsts.push_back(SDiv);
+ return true;
+ }
+
+ return false;
+}
+
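The legality argument behind eliminateSDiv is the identity it relies on: when both operands are provably non-negative, signed and unsigned division produce the same bits, and udiv is simpler to reason about and often cheaper to lower. A standalone sanity check of that identity (plain C++, not part of this patch):
#include <cassert>
#include <cstdint>

int main() {
  for (int32_t N = 0; N <= 1000; ++N)
    for (int32_t D = 1; D <= 50; ++D)
      // Non-negative dividend, positive divisor: sdiv == udiv bit-for-bit.
      assert(N / D == int32_t(uint32_t(N) / uint32_t(D)));
  return 0;
}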
/// SimplifyIVUsers helper for eliminating useless
/// remainder operations operating on an induction variable.
void SimplifyIndvar::eliminateIVRemainder(BinaryOperator *Rem,
@@ -426,12 +457,15 @@ bool SimplifyIndvar::eliminateIVUser(Instruction *UseInst,
eliminateIVComparison(ICmp, IVOperand);
return true;
}
- if (BinaryOperator *Rem = dyn_cast<BinaryOperator>(UseInst)) {
- bool IsSigned = Rem->getOpcode() == Instruction::SRem;
- if (IsSigned || Rem->getOpcode() == Instruction::URem) {
- eliminateIVRemainder(Rem, IVOperand, IsSigned);
+ if (BinaryOperator *Bin = dyn_cast<BinaryOperator>(UseInst)) {
+ bool IsSRem = Bin->getOpcode() == Instruction::SRem;
+ if (IsSRem || Bin->getOpcode() == Instruction::URem) {
+ eliminateIVRemainder(Bin, IVOperand, IsSRem);
return true;
}
+
+ if (Bin->getOpcode() == Instruction::SDiv)
+ return eliminateSDiv(Bin);
}
if (auto *CI = dyn_cast<CallInst>(UseInst))
diff --git a/lib/Transforms/Utils/SimplifyInstructions.cpp b/lib/Transforms/Utils/SimplifyInstructions.cpp
index 1220490123ce..f6070868de44 100644
--- a/lib/Transforms/Utils/SimplifyInstructions.cpp
+++ b/lib/Transforms/Utils/SimplifyInstructions.cpp
@@ -20,6 +20,7 @@
#include "llvm/ADT/Statistic.h"
#include "llvm/Analysis/AssumptionCache.h"
#include "llvm/Analysis/InstructionSimplify.h"
+#include "llvm/Analysis/OptimizationDiagnosticInfo.h"
#include "llvm/Analysis/TargetLibraryInfo.h"
#include "llvm/IR/DataLayout.h"
#include "llvm/IR/Dominators.h"
@@ -35,7 +36,8 @@ using namespace llvm;
STATISTIC(NumSimplified, "Number of redundant instructions removed");
static bool runImpl(Function &F, const DominatorTree *DT,
- const TargetLibraryInfo *TLI, AssumptionCache *AC) {
+ const TargetLibraryInfo *TLI, AssumptionCache *AC,
+ OptimizationRemarkEmitter *ORE) {
const DataLayout &DL = F.getParent()->getDataLayout();
SmallPtrSet<const Instruction *, 8> S1, S2, *ToSimplify = &S1, *Next = &S2;
bool Changed = false;
@@ -54,7 +56,7 @@ static bool runImpl(Function &F, const DominatorTree *DT,
// Don't waste time simplifying unused instructions.
if (!I->use_empty()) {
- if (Value *V = SimplifyInstruction(I, DL, TLI, DT, AC)) {
+ if (Value *V = SimplifyInstruction(I, DL, TLI, DT, AC, ORE)) {
// Mark all uses for resimplification next time round the loop.
for (User *U : I->users())
Next->insert(cast<Instruction>(U));
@@ -95,6 +97,7 @@ namespace {
AU.addRequired<DominatorTreeWrapperPass>();
AU.addRequired<AssumptionCacheTracker>();
AU.addRequired<TargetLibraryInfoWrapperPass>();
+ AU.addRequired<OptimizationRemarkEmitterWrapperPass>();
}
/// runOnFunction - Remove instructions that simplify.
@@ -108,7 +111,10 @@ namespace {
&getAnalysis<TargetLibraryInfoWrapperPass>().getTLI();
AssumptionCache *AC =
&getAnalysis<AssumptionCacheTracker>().getAssumptionCache(F);
- return runImpl(F, DT, TLI, AC);
+ OptimizationRemarkEmitter *ORE =
+ &getAnalysis<OptimizationRemarkEmitterWrapperPass>().getORE();
+
+ return runImpl(F, DT, TLI, AC, ORE);
}
};
}
@@ -119,6 +125,7 @@ INITIALIZE_PASS_BEGIN(InstSimplifier, "instsimplify",
INITIALIZE_PASS_DEPENDENCY(AssumptionCacheTracker)
INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass)
INITIALIZE_PASS_DEPENDENCY(TargetLibraryInfoWrapperPass)
+INITIALIZE_PASS_DEPENDENCY(OptimizationRemarkEmitterWrapperPass)
INITIALIZE_PASS_END(InstSimplifier, "instsimplify",
"Remove redundant instructions", false, false)
char &llvm::InstructionSimplifierID = InstSimplifier::ID;
@@ -133,9 +140,12 @@ PreservedAnalyses InstSimplifierPass::run(Function &F,
auto &DT = AM.getResult<DominatorTreeAnalysis>(F);
auto &TLI = AM.getResult<TargetLibraryAnalysis>(F);
auto &AC = AM.getResult<AssumptionAnalysis>(F);
- bool Changed = runImpl(F, &DT, &TLI, &AC);
+ auto &ORE = AM.getResult<OptimizationRemarkEmitterAnalysis>(F);
+ bool Changed = runImpl(F, &DT, &TLI, &AC, &ORE);
if (!Changed)
return PreservedAnalyses::all();
- // FIXME: This should also 'preserve the CFG'.
- return PreservedAnalyses::none();
+
+ PreservedAnalyses PA;
+ PA.preserveSet<CFGAnalyses>();
+ return PA;
}
diff --git a/lib/Transforms/Utils/SimplifyLibCalls.cpp b/lib/Transforms/Utils/SimplifyLibCalls.cpp
index 8eaeb1073a76..aa71e3669ea2 100644
--- a/lib/Transforms/Utils/SimplifyLibCalls.cpp
+++ b/lib/Transforms/Utils/SimplifyLibCalls.cpp
@@ -51,9 +51,9 @@ static cl::opt<bool>
// Helper Functions
//===----------------------------------------------------------------------===//
-static bool ignoreCallingConv(LibFunc::Func Func) {
- return Func == LibFunc::abs || Func == LibFunc::labs ||
- Func == LibFunc::llabs || Func == LibFunc::strlen;
+static bool ignoreCallingConv(LibFunc Func) {
+ return Func == LibFunc_abs || Func == LibFunc_labs ||
+ Func == LibFunc_llabs || Func == LibFunc_strlen;
}
static bool isCallingConvCCompatible(CallInst *CI) {
@@ -123,8 +123,8 @@ static bool callHasFloatingPointArgument(const CallInst *CI) {
/// \brief Check whether the overloaded unary floating point function
/// corresponding to \a Ty is available.
static bool hasUnaryFloatFn(const TargetLibraryInfo *TLI, Type *Ty,
- LibFunc::Func DoubleFn, LibFunc::Func FloatFn,
- LibFunc::Func LongDoubleFn) {
+ LibFunc DoubleFn, LibFunc FloatFn,
+ LibFunc LongDoubleFn) {
switch (Ty->getTypeID()) {
case Type::FloatTyID:
return TLI->has(FloatFn);
@@ -809,9 +809,9 @@ Value *LibCallSimplifier::optimizeMemMove(CallInst *CI, IRBuilder<> &B) {
// TODO: Does this belong in BuildLibCalls or should all of those similar
// functions be moved here?
-static Value *emitCalloc(Value *Num, Value *Size, const AttributeSet &Attrs,
+static Value *emitCalloc(Value *Num, Value *Size, const AttributeList &Attrs,
IRBuilder<> &B, const TargetLibraryInfo &TLI) {
- LibFunc::Func Func;
+ LibFunc Func;
if (!TLI.getLibFunc("calloc", Func) || !TLI.has(Func))
return nullptr;
@@ -819,7 +819,7 @@ static Value *emitCalloc(Value *Num, Value *Size, const AttributeSet &Attrs,
const DataLayout &DL = M->getDataLayout();
IntegerType *PtrType = DL.getIntPtrType((B.GetInsertBlock()->getContext()));
Value *Calloc = M->getOrInsertFunction("calloc", Attrs, B.getInt8PtrTy(),
- PtrType, PtrType, nullptr);
+ PtrType, PtrType);
CallInst *CI = B.CreateCall(Calloc, { Num, Size }, "calloc");
if (const auto *F = dyn_cast<Function>(Calloc->stripPointerCasts()))
@@ -846,9 +846,9 @@ static Value *foldMallocMemset(CallInst *Memset, IRBuilder<> &B,
// Is the inner call really malloc()?
Function *InnerCallee = Malloc->getCalledFunction();
- LibFunc::Func Func;
+ LibFunc Func;
if (!TLI.getLibFunc(*InnerCallee, Func) || !TLI.has(Func) ||
- Func != LibFunc::malloc)
+ Func != LibFunc_malloc)
return nullptr;
// The memset must cover the same number of bytes that are malloc'd.
@@ -948,6 +948,20 @@ static Value *optimizeUnaryDoubleFP(CallInst *CI, IRBuilder<> &B,
return B.CreateFPExt(V, B.getDoubleTy());
}
+// Replace a libcall \p CI with a call to intrinsic \p IID
+static Value *replaceUnaryCall(CallInst *CI, IRBuilder<> &B, Intrinsic::ID IID) {
+ // Propagate fast-math flags from the existing call to the new call.
+ IRBuilder<>::FastMathFlagGuard Guard(B);
+ B.setFastMathFlags(CI->getFastMathFlags());
+
+ Module *M = CI->getModule();
+ Value *V = CI->getArgOperand(0);
+ Function *F = Intrinsic::getDeclaration(M, IID, CI->getType());
+ CallInst *NewCall = B.CreateCall(F, V);
+ NewCall->takeName(CI);
+ return NewCall;
+}
+
/// Shrink double -> float for binary functions like 'fmin/fmax'.
static Value *optimizeBinaryDoubleFP(CallInst *CI, IRBuilder<> &B) {
Function *Callee = CI->getCalledFunction();
@@ -1041,9 +1055,9 @@ Value *LibCallSimplifier::optimizePow(CallInst *CI, IRBuilder<> &B) {
if (ConstantFP *Op1C = dyn_cast<ConstantFP>(Op1)) {
// pow(10.0, x) -> exp10(x)
if (Op1C->isExactlyValue(10.0) &&
- hasUnaryFloatFn(TLI, Op1->getType(), LibFunc::exp10, LibFunc::exp10f,
- LibFunc::exp10l))
- return emitUnaryFloatFnCall(Op2, TLI->getName(LibFunc::exp10), B,
+ hasUnaryFloatFn(TLI, Op1->getType(), LibFunc_exp10, LibFunc_exp10f,
+ LibFunc_exp10l))
+ return emitUnaryFloatFnCall(Op2, TLI->getName(LibFunc_exp10), B,
Callee->getAttributes());
}
@@ -1055,10 +1069,10 @@ Value *LibCallSimplifier::optimizePow(CallInst *CI, IRBuilder<> &B) {
// pow(exp(x), y) = pow(inf, 0.001) = inf, whereas exp(x*y) = exp(1).
auto *OpC = dyn_cast<CallInst>(Op1);
if (OpC && OpC->hasUnsafeAlgebra() && CI->hasUnsafeAlgebra()) {
- LibFunc::Func Func;
+ LibFunc Func;
Function *OpCCallee = OpC->getCalledFunction();
if (OpCCallee && TLI->getLibFunc(OpCCallee->getName(), Func) &&
- TLI->has(Func) && (Func == LibFunc::exp || Func == LibFunc::exp2)) {
+ TLI->has(Func) && (Func == LibFunc_exp || Func == LibFunc_exp2)) {
IRBuilder<>::FastMathFlagGuard Guard(B);
B.setFastMathFlags(CI->getFastMathFlags());
Value *FMul = B.CreateFMul(OpC->getArgOperand(0), Op2, "mul");
@@ -1075,17 +1089,20 @@ Value *LibCallSimplifier::optimizePow(CallInst *CI, IRBuilder<> &B) {
return ConstantFP::get(CI->getType(), 1.0);
if (Op2C->isExactlyValue(-0.5) &&
- hasUnaryFloatFn(TLI, Op2->getType(), LibFunc::sqrt, LibFunc::sqrtf,
- LibFunc::sqrtl)) {
+ hasUnaryFloatFn(TLI, Op2->getType(), LibFunc_sqrt, LibFunc_sqrtf,
+ LibFunc_sqrtl)) {
// If -ffast-math:
// pow(x, -0.5) -> 1.0 / sqrt(x)
if (CI->hasUnsafeAlgebra()) {
IRBuilder<>::FastMathFlagGuard Guard(B);
B.setFastMathFlags(CI->getFastMathFlags());
- // Here we cannot lower to an intrinsic because C99 sqrt() and llvm.sqrt
- // are not guaranteed to have the same semantics.
- Value *Sqrt = emitUnaryFloatFnCall(Op1, TLI->getName(LibFunc::sqrt), B,
+ // TODO: If the pow call is an intrinsic, we should lower to the sqrt
+ // intrinsic, so we match errno semantics. We also should check that the
+ // target can in fact lower the sqrt intrinsic -- we currently have no way
+ // to ask this question other than asking whether the target has a sqrt
+ // libcall, which is a sufficient but not necessary condition.
+ Value *Sqrt = emitUnaryFloatFnCall(Op1, TLI->getName(LibFunc_sqrt), B,
Callee->getAttributes());
return B.CreateFDiv(ConstantFP::get(CI->getType(), 1.0), Sqrt, "sqrtrecip");
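The rewrite above treats pow(x, -0.5) and 1.0/sqrt(x) as interchangeable; they agree only up to rounding, which is why the transform is gated on unsafe algebra. A standalone spot check (plain C++, not part of this patch):
#include <cmath>
#include <cstdio>

int main() {
  for (double X : {0.5, 2.0, 3.0, 1e6}) {
    double PowForm = std::pow(X, -0.5);
    double RecipSqrt = 1.0 / std::sqrt(X);
    // Usually identical, but the rounding paths differ, so strict IEEE
    // mode may not treat the two expressions as equivalent.
    std::printf("x=%g pow=%.17g 1/sqrt=%.17g\n", X, PowForm, RecipSqrt);
  }
  return 0;
}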
@@ -1093,19 +1110,17 @@ Value *LibCallSimplifier::optimizePow(CallInst *CI, IRBuilder<> &B) {
}
if (Op2C->isExactlyValue(0.5) &&
- hasUnaryFloatFn(TLI, Op2->getType(), LibFunc::sqrt, LibFunc::sqrtf,
- LibFunc::sqrtl) &&
- hasUnaryFloatFn(TLI, Op2->getType(), LibFunc::fabs, LibFunc::fabsf,
- LibFunc::fabsl)) {
+ hasUnaryFloatFn(TLI, Op2->getType(), LibFunc_sqrt, LibFunc_sqrtf,
+ LibFunc_sqrtl)) {
// In -ffast-math, pow(x, 0.5) -> sqrt(x).
if (CI->hasUnsafeAlgebra()) {
IRBuilder<>::FastMathFlagGuard Guard(B);
B.setFastMathFlags(CI->getFastMathFlags());
- // Unlike other math intrinsics, sqrt has differerent semantics
- // from the libc function. See LangRef for details.
- return emitUnaryFloatFnCall(Op1, TLI->getName(LibFunc::sqrt), B,
+ // TODO: As above, we should lower to the sqrt intrinsic if the pow is an
+ // intrinsic, to match errno semantics.
+ return emitUnaryFloatFnCall(Op1, TLI->getName(LibFunc_sqrt), B,
Callee->getAttributes());
}
@@ -1115,9 +1130,16 @@ Value *LibCallSimplifier::optimizePow(CallInst *CI, IRBuilder<> &B) {
// TODO: In finite-only mode, this could be just fabs(sqrt(x)).
Value *Inf = ConstantFP::getInfinity(CI->getType());
Value *NegInf = ConstantFP::getInfinity(CI->getType(), true);
+
+ // TODO: As above, we should lower to the sqrt intrinsic if the pow is an
+ // intrinsic, to match errno semantics.
Value *Sqrt = emitUnaryFloatFnCall(Op1, "sqrt", B, Callee->getAttributes());
- Value *FAbs =
- emitUnaryFloatFnCall(Sqrt, "fabs", B, Callee->getAttributes());
+
+ Module *M = Callee->getParent();
+ Function *FabsF = Intrinsic::getDeclaration(M, Intrinsic::fabs,
+ CI->getType());
+ Value *FAbs = B.CreateCall(FabsF, Sqrt);
+
Value *FCmp = B.CreateFCmpOEQ(Op1, NegInf);
Value *Sel = B.CreateSelect(FCmp, Inf, FAbs);
return Sel;
@@ -1173,11 +1195,11 @@ Value *LibCallSimplifier::optimizeExp2(CallInst *CI, IRBuilder<> &B) {
Value *Op = CI->getArgOperand(0);
// Turn exp2(sitofp(x)) -> ldexp(1.0, sext(x)) if sizeof(x) <= 32
// Turn exp2(uitofp(x)) -> ldexp(1.0, zext(x)) if sizeof(x) < 32
- LibFunc::Func LdExp = LibFunc::ldexpl;
+ LibFunc LdExp = LibFunc_ldexpl;
if (Op->getType()->isFloatTy())
- LdExp = LibFunc::ldexpf;
+ LdExp = LibFunc_ldexpf;
else if (Op->getType()->isDoubleTy())
- LdExp = LibFunc::ldexp;
+ LdExp = LibFunc_ldexp;
if (TLI->has(LdExp)) {
Value *LdExpArg = nullptr;
@@ -1197,7 +1219,7 @@ Value *LibCallSimplifier::optimizeExp2(CallInst *CI, IRBuilder<> &B) {
Module *M = CI->getModule();
Value *NewCallee =
M->getOrInsertFunction(TLI->getName(LdExp), Op->getType(),
- Op->getType(), B.getInt32Ty(), nullptr);
+ Op->getType(), B.getInt32Ty());
CallInst *CI = B.CreateCall(NewCallee, {One, LdExpArg});
if (const Function *F = dyn_cast<Function>(Callee->stripPointerCasts()))
CI->setCallingConv(F->getCallingConv());
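The exp2-to-ldexp rewrite in this function rests on the identity exp2(n) == ldexp(1.0, n) for integral n; both sides are exact powers of two, so on common libm implementations the equality is bit-exact. A standalone check under that assumption (plain C++, not part of this patch):
#include <cassert>
#include <cmath>

int main() {
  for (int N = -100; N <= 100; ++N)
    // Both sides are exactly representable powers of two; a reasonable
    // libm returns them exactly, making the comparison bit-exact.
    assert(std::exp2(double(N)) == std::ldexp(1.0, N));
  return 0;
}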
@@ -1208,15 +1230,6 @@ Value *LibCallSimplifier::optimizeExp2(CallInst *CI, IRBuilder<> &B) {
return Ret;
}
-Value *LibCallSimplifier::optimizeFabs(CallInst *CI, IRBuilder<> &B) {
- Function *Callee = CI->getCalledFunction();
- StringRef Name = Callee->getName();
- if (Name == "fabs" && hasFloatVersion(Name))
- return optimizeUnaryDoubleFP(CI, B, false);
-
- return nullptr;
-}
-
Value *LibCallSimplifier::optimizeFMinFMax(CallInst *CI, IRBuilder<> &B) {
Function *Callee = CI->getCalledFunction();
// If we can shrink the call to a float function rather than a double
@@ -1280,17 +1293,17 @@ Value *LibCallSimplifier::optimizeLog(CallInst *CI, IRBuilder<> &B) {
FMF.setUnsafeAlgebra();
B.setFastMathFlags(FMF);
- LibFunc::Func Func;
+ LibFunc Func;
Function *F = OpC->getCalledFunction();
if (F && ((TLI->getLibFunc(F->getName(), Func) && TLI->has(Func) &&
- Func == LibFunc::pow) || F->getIntrinsicID() == Intrinsic::pow))
+ Func == LibFunc_pow) || F->getIntrinsicID() == Intrinsic::pow))
return B.CreateFMul(OpC->getArgOperand(1),
emitUnaryFloatFnCall(OpC->getOperand(0), Callee->getName(), B,
Callee->getAttributes()), "mul");
// log(exp2(y)) -> y*log(2)
if (F && Name == "log" && TLI->getLibFunc(F->getName(), Func) &&
- TLI->has(Func) && Func == LibFunc::exp2)
+ TLI->has(Func) && Func == LibFunc_exp2)
return B.CreateFMul(
OpC->getArgOperand(0),
emitUnaryFloatFnCall(ConstantFP::get(CI->getType(), 2.0),
@@ -1302,8 +1315,11 @@ Value *LibCallSimplifier::optimizeLog(CallInst *CI, IRBuilder<> &B) {
Value *LibCallSimplifier::optimizeSqrt(CallInst *CI, IRBuilder<> &B) {
Function *Callee = CI->getCalledFunction();
Value *Ret = nullptr;
- if (TLI->has(LibFunc::sqrtf) && (Callee->getName() == "sqrt" ||
- Callee->getIntrinsicID() == Intrinsic::sqrt))
+ // TODO: Once we have a way (other than checking for the existence of the
+ // libcall) to tell whether our target can lower @llvm.sqrt, relax the
+ // condition below.
+ if (TLI->has(LibFunc_sqrtf) && (Callee->getName() == "sqrt" ||
+ Callee->getIntrinsicID() == Intrinsic::sqrt))
Ret = optimizeUnaryDoubleFP(CI, B, true);
if (!CI->hasUnsafeAlgebra())
@@ -1385,12 +1401,12 @@ Value *LibCallSimplifier::optimizeTan(CallInst *CI, IRBuilder<> &B) {
// tan(atan(x)) -> x
// tanf(atanf(x)) -> x
// tanl(atanl(x)) -> x
- LibFunc::Func Func;
+ LibFunc Func;
Function *F = OpC->getCalledFunction();
if (F && TLI->getLibFunc(F->getName(), Func) && TLI->has(Func) &&
- ((Func == LibFunc::atan && Callee->getName() == "tan") ||
- (Func == LibFunc::atanf && Callee->getName() == "tanf") ||
- (Func == LibFunc::atanl && Callee->getName() == "tanl")))
+ ((Func == LibFunc_atan && Callee->getName() == "tan") ||
+ (Func == LibFunc_atanf && Callee->getName() == "tanf") ||
+ (Func == LibFunc_atanl && Callee->getName() == "tanl")))
Ret = OpC->getArgOperand(0);
return Ret;
}
@@ -1427,7 +1443,7 @@ static void insertSinCosCall(IRBuilder<> &B, Function *OrigCallee, Value *Arg,
Module *M = OrigCallee->getParent();
Value *Callee = M->getOrInsertFunction(Name, OrigCallee->getAttributes(),
- ResTy, ArgTy, nullptr);
+ ResTy, ArgTy);
if (Instruction *ArgInst = dyn_cast<Instruction>(Arg)) {
// If the argument is an instruction, it must dominate all uses so put our
@@ -1508,24 +1524,24 @@ void LibCallSimplifier::classifyArgUse(
return;
Function *Callee = CI->getCalledFunction();
- LibFunc::Func Func;
+ LibFunc Func;
if (!Callee || !TLI->getLibFunc(*Callee, Func) || !TLI->has(Func) ||
!isTrigLibCall(CI))
return;
if (IsFloat) {
- if (Func == LibFunc::sinpif)
+ if (Func == LibFunc_sinpif)
SinCalls.push_back(CI);
- else if (Func == LibFunc::cospif)
+ else if (Func == LibFunc_cospif)
CosCalls.push_back(CI);
- else if (Func == LibFunc::sincospif_stret)
+ else if (Func == LibFunc_sincospif_stret)
SinCosCalls.push_back(CI);
} else {
- if (Func == LibFunc::sinpi)
+ if (Func == LibFunc_sinpi)
SinCalls.push_back(CI);
- else if (Func == LibFunc::cospi)
+ else if (Func == LibFunc_cospi)
CosCalls.push_back(CI);
- else if (Func == LibFunc::sincospi_stret)
+ else if (Func == LibFunc_sincospi_stret)
SinCosCalls.push_back(CI);
}
}
@@ -1609,7 +1625,7 @@ Value *LibCallSimplifier::optimizeErrorReporting(CallInst *CI, IRBuilder<> &B,
// Proceedings of PACT'98, Oct. 1998, IEEE
if (!CI->hasFnAttr(Attribute::Cold) &&
isReportingError(Callee, CI, StreamArg)) {
- CI->addAttribute(AttributeSet::FunctionIndex, Attribute::Cold);
+ CI->addAttribute(AttributeList::FunctionIndex, Attribute::Cold);
}
return nullptr;
@@ -1699,7 +1715,7 @@ Value *LibCallSimplifier::optimizePrintF(CallInst *CI, IRBuilder<> &B) {
// printf(format, ...) -> iprintf(format, ...) if no floating point
// arguments.
- if (TLI->has(LibFunc::iprintf) && !callHasFloatingPointArgument(CI)) {
+ if (TLI->has(LibFunc_iprintf) && !callHasFloatingPointArgument(CI)) {
Module *M = B.GetInsertBlock()->getParent()->getParent();
Constant *IPrintFFn =
M->getOrInsertFunction("iprintf", FT, Callee->getAttributes());
@@ -1780,7 +1796,7 @@ Value *LibCallSimplifier::optimizeSPrintF(CallInst *CI, IRBuilder<> &B) {
// sprintf(str, format, ...) -> siprintf(str, format, ...) if no floating
// point arguments.
- if (TLI->has(LibFunc::siprintf) && !callHasFloatingPointArgument(CI)) {
+ if (TLI->has(LibFunc_siprintf) && !callHasFloatingPointArgument(CI)) {
Module *M = B.GetInsertBlock()->getParent()->getParent();
Constant *SIPrintFFn =
M->getOrInsertFunction("siprintf", FT, Callee->getAttributes());
@@ -1850,7 +1866,7 @@ Value *LibCallSimplifier::optimizeFPrintF(CallInst *CI, IRBuilder<> &B) {
// fprintf(stream, format, ...) -> fiprintf(stream, format, ...) if no
// floating point arguments.
- if (TLI->has(LibFunc::fiprintf) && !callHasFloatingPointArgument(CI)) {
+ if (TLI->has(LibFunc_fiprintf) && !callHasFloatingPointArgument(CI)) {
Module *M = B.GetInsertBlock()->getParent()->getParent();
Constant *FIPrintFFn =
M->getOrInsertFunction("fiprintf", FT, Callee->getAttributes());
@@ -1929,7 +1945,7 @@ Value *LibCallSimplifier::optimizePuts(CallInst *CI, IRBuilder<> &B) {
}
bool LibCallSimplifier::hasFloatVersion(StringRef FuncName) {
- LibFunc::Func Func;
+ LibFunc Func;
SmallString<20> FloatFuncName = FuncName;
FloatFuncName += 'f';
if (TLI->getLibFunc(FloatFuncName, Func))
@@ -1939,7 +1955,7 @@ bool LibCallSimplifier::hasFloatVersion(StringRef FuncName) {
Value *LibCallSimplifier::optimizeStringMemoryLibCall(CallInst *CI,
IRBuilder<> &Builder) {
- LibFunc::Func Func;
+ LibFunc Func;
Function *Callee = CI->getCalledFunction();
// Check for string/memory library functions.
if (TLI->getLibFunc(*Callee, Func) && TLI->has(Func)) {
@@ -1948,51 +1964,51 @@ Value *LibCallSimplifier::optimizeStringMemoryLibCall(CallInst *CI,
isCallingConvCCompatible(CI)) &&
"Optimizing string/memory libcall would change the calling convention");
switch (Func) {
- case LibFunc::strcat:
+ case LibFunc_strcat:
return optimizeStrCat(CI, Builder);
- case LibFunc::strncat:
+ case LibFunc_strncat:
return optimizeStrNCat(CI, Builder);
- case LibFunc::strchr:
+ case LibFunc_strchr:
return optimizeStrChr(CI, Builder);
- case LibFunc::strrchr:
+ case LibFunc_strrchr:
return optimizeStrRChr(CI, Builder);
- case LibFunc::strcmp:
+ case LibFunc_strcmp:
return optimizeStrCmp(CI, Builder);
- case LibFunc::strncmp:
+ case LibFunc_strncmp:
return optimizeStrNCmp(CI, Builder);
- case LibFunc::strcpy:
+ case LibFunc_strcpy:
return optimizeStrCpy(CI, Builder);
- case LibFunc::stpcpy:
+ case LibFunc_stpcpy:
return optimizeStpCpy(CI, Builder);
- case LibFunc::strncpy:
+ case LibFunc_strncpy:
return optimizeStrNCpy(CI, Builder);
- case LibFunc::strlen:
+ case LibFunc_strlen:
return optimizeStrLen(CI, Builder);
- case LibFunc::strpbrk:
+ case LibFunc_strpbrk:
return optimizeStrPBrk(CI, Builder);
- case LibFunc::strtol:
- case LibFunc::strtod:
- case LibFunc::strtof:
- case LibFunc::strtoul:
- case LibFunc::strtoll:
- case LibFunc::strtold:
- case LibFunc::strtoull:
+ case LibFunc_strtol:
+ case LibFunc_strtod:
+ case LibFunc_strtof:
+ case LibFunc_strtoul:
+ case LibFunc_strtoll:
+ case LibFunc_strtold:
+ case LibFunc_strtoull:
return optimizeStrTo(CI, Builder);
- case LibFunc::strspn:
+ case LibFunc_strspn:
return optimizeStrSpn(CI, Builder);
- case LibFunc::strcspn:
+ case LibFunc_strcspn:
return optimizeStrCSpn(CI, Builder);
- case LibFunc::strstr:
+ case LibFunc_strstr:
return optimizeStrStr(CI, Builder);
- case LibFunc::memchr:
+ case LibFunc_memchr:
return optimizeMemChr(CI, Builder);
- case LibFunc::memcmp:
+ case LibFunc_memcmp:
return optimizeMemCmp(CI, Builder);
- case LibFunc::memcpy:
+ case LibFunc_memcpy:
return optimizeMemCpy(CI, Builder);
- case LibFunc::memmove:
+ case LibFunc_memmove:
return optimizeMemMove(CI, Builder);
- case LibFunc::memset:
+ case LibFunc_memset:
return optimizeMemSet(CI, Builder);
default:
break;
@@ -2005,7 +2021,7 @@ Value *LibCallSimplifier::optimizeCall(CallInst *CI) {
if (CI->isNoBuiltin())
return nullptr;
- LibFunc::Func Func;
+ LibFunc Func;
Function *Callee = CI->getCalledFunction();
StringRef FuncName = Callee->getName();
@@ -2029,8 +2045,6 @@ Value *LibCallSimplifier::optimizeCall(CallInst *CI) {
return optimizePow(CI, Builder);
case Intrinsic::exp2:
return optimizeExp2(CI, Builder);
- case Intrinsic::fabs:
- return optimizeFabs(CI, Builder);
case Intrinsic::log:
return optimizeLog(CI, Builder);
case Intrinsic::sqrt:
@@ -2067,114 +2081,117 @@ Value *LibCallSimplifier::optimizeCall(CallInst *CI) {
if (Value *V = optimizeStringMemoryLibCall(CI, Builder))
return V;
switch (Func) {
- case LibFunc::cosf:
- case LibFunc::cos:
- case LibFunc::cosl:
+ case LibFunc_cosf:
+ case LibFunc_cos:
+ case LibFunc_cosl:
return optimizeCos(CI, Builder);
- case LibFunc::sinpif:
- case LibFunc::sinpi:
- case LibFunc::cospif:
- case LibFunc::cospi:
+ case LibFunc_sinpif:
+ case LibFunc_sinpi:
+ case LibFunc_cospif:
+ case LibFunc_cospi:
return optimizeSinCosPi(CI, Builder);
- case LibFunc::powf:
- case LibFunc::pow:
- case LibFunc::powl:
+ case LibFunc_powf:
+ case LibFunc_pow:
+ case LibFunc_powl:
return optimizePow(CI, Builder);
- case LibFunc::exp2l:
- case LibFunc::exp2:
- case LibFunc::exp2f:
+ case LibFunc_exp2l:
+ case LibFunc_exp2:
+ case LibFunc_exp2f:
return optimizeExp2(CI, Builder);
- case LibFunc::fabsf:
- case LibFunc::fabs:
- case LibFunc::fabsl:
- return optimizeFabs(CI, Builder);
- case LibFunc::sqrtf:
- case LibFunc::sqrt:
- case LibFunc::sqrtl:
+ case LibFunc_fabsf:
+ case LibFunc_fabs:
+ case LibFunc_fabsl:
+ return replaceUnaryCall(CI, Builder, Intrinsic::fabs);
+ case LibFunc_sqrtf:
+ case LibFunc_sqrt:
+ case LibFunc_sqrtl:
return optimizeSqrt(CI, Builder);
- case LibFunc::ffs:
- case LibFunc::ffsl:
- case LibFunc::ffsll:
+ case LibFunc_ffs:
+ case LibFunc_ffsl:
+ case LibFunc_ffsll:
return optimizeFFS(CI, Builder);
- case LibFunc::fls:
- case LibFunc::flsl:
- case LibFunc::flsll:
+ case LibFunc_fls:
+ case LibFunc_flsl:
+ case LibFunc_flsll:
return optimizeFls(CI, Builder);
- case LibFunc::abs:
- case LibFunc::labs:
- case LibFunc::llabs:
+ case LibFunc_abs:
+ case LibFunc_labs:
+ case LibFunc_llabs:
return optimizeAbs(CI, Builder);
- case LibFunc::isdigit:
+ case LibFunc_isdigit:
return optimizeIsDigit(CI, Builder);
- case LibFunc::isascii:
+ case LibFunc_isascii:
return optimizeIsAscii(CI, Builder);
- case LibFunc::toascii:
+ case LibFunc_toascii:
return optimizeToAscii(CI, Builder);
- case LibFunc::printf:
+ case LibFunc_printf:
return optimizePrintF(CI, Builder);
- case LibFunc::sprintf:
+ case LibFunc_sprintf:
return optimizeSPrintF(CI, Builder);
- case LibFunc::fprintf:
+ case LibFunc_fprintf:
return optimizeFPrintF(CI, Builder);
- case LibFunc::fwrite:
+ case LibFunc_fwrite:
return optimizeFWrite(CI, Builder);
- case LibFunc::fputs:
+ case LibFunc_fputs:
return optimizeFPuts(CI, Builder);
- case LibFunc::log:
- case LibFunc::log10:
- case LibFunc::log1p:
- case LibFunc::log2:
- case LibFunc::logb:
+ case LibFunc_log:
+ case LibFunc_log10:
+ case LibFunc_log1p:
+ case LibFunc_log2:
+ case LibFunc_logb:
return optimizeLog(CI, Builder);
- case LibFunc::puts:
+ case LibFunc_puts:
return optimizePuts(CI, Builder);
- case LibFunc::tan:
- case LibFunc::tanf:
- case LibFunc::tanl:
+ case LibFunc_tan:
+ case LibFunc_tanf:
+ case LibFunc_tanl:
return optimizeTan(CI, Builder);
- case LibFunc::perror:
+ case LibFunc_perror:
return optimizeErrorReporting(CI, Builder);
- case LibFunc::vfprintf:
- case LibFunc::fiprintf:
+ case LibFunc_vfprintf:
+ case LibFunc_fiprintf:
return optimizeErrorReporting(CI, Builder, 0);
- case LibFunc::fputc:
+ case LibFunc_fputc:
return optimizeErrorReporting(CI, Builder, 1);
- case LibFunc::ceil:
- case LibFunc::floor:
- case LibFunc::rint:
- case LibFunc::round:
- case LibFunc::nearbyint:
- case LibFunc::trunc:
- if (hasFloatVersion(FuncName))
- return optimizeUnaryDoubleFP(CI, Builder, false);
- return nullptr;
- case LibFunc::acos:
- case LibFunc::acosh:
- case LibFunc::asin:
- case LibFunc::asinh:
- case LibFunc::atan:
- case LibFunc::atanh:
- case LibFunc::cbrt:
- case LibFunc::cosh:
- case LibFunc::exp:
- case LibFunc::exp10:
- case LibFunc::expm1:
- case LibFunc::sin:
- case LibFunc::sinh:
- case LibFunc::tanh:
+ case LibFunc_ceil:
+ return replaceUnaryCall(CI, Builder, Intrinsic::ceil);
+ case LibFunc_floor:
+ return replaceUnaryCall(CI, Builder, Intrinsic::floor);
+ case LibFunc_round:
+ return replaceUnaryCall(CI, Builder, Intrinsic::round);
+ case LibFunc_nearbyint:
+ return replaceUnaryCall(CI, Builder, Intrinsic::nearbyint);
+ case LibFunc_rint:
+ return replaceUnaryCall(CI, Builder, Intrinsic::rint);
+ case LibFunc_trunc:
+ return replaceUnaryCall(CI, Builder, Intrinsic::trunc);
+ case LibFunc_acos:
+ case LibFunc_acosh:
+ case LibFunc_asin:
+ case LibFunc_asinh:
+ case LibFunc_atan:
+ case LibFunc_atanh:
+ case LibFunc_cbrt:
+ case LibFunc_cosh:
+ case LibFunc_exp:
+ case LibFunc_exp10:
+ case LibFunc_expm1:
+ case LibFunc_sin:
+ case LibFunc_sinh:
+ case LibFunc_tanh:
if (UnsafeFPShrink && hasFloatVersion(FuncName))
return optimizeUnaryDoubleFP(CI, Builder, true);
return nullptr;
- case LibFunc::copysign:
+ case LibFunc_copysign:
if (hasFloatVersion(FuncName))
return optimizeBinaryDoubleFP(CI, Builder);
return nullptr;
- case LibFunc::fminf:
- case LibFunc::fmin:
- case LibFunc::fminl:
- case LibFunc::fmaxf:
- case LibFunc::fmax:
- case LibFunc::fmaxl:
+ case LibFunc_fminf:
+ case LibFunc_fmin:
+ case LibFunc_fminl:
+ case LibFunc_fmaxf:
+ case LibFunc_fmax:
+ case LibFunc_fmaxl:
return optimizeFMinFMax(CI, Builder);
default:
return nullptr;
@@ -2211,16 +2228,10 @@ void LibCallSimplifier::replaceAllUsesWith(Instruction *I, Value *With) {
// * log(exp10(y)) -> y*log(10)
// * log(sqrt(x)) -> 0.5*log(x)
//
-// lround, lroundf, lroundl:
-// * lround(cnst) -> cnst'
-//
// pow, powf, powl:
// * pow(sqrt(x),y) -> pow(x,y*0.5)
// * pow(pow(x,y),z)-> pow(x,y*z)
//
-// round, roundf, roundl:
-// * round(cnst) -> cnst'
-//
// signbit:
// * signbit(cnst) -> cnst'
// * signbit(nncst) -> 0 (if pstv is a non-negative constant)
@@ -2230,10 +2241,6 @@ void LibCallSimplifier::replaceAllUsesWith(Instruction *I, Value *With) {
// * sqrt(Nroot(x)) -> pow(x,1/(2*N))
// * sqrt(pow(x,y)) -> pow(|x|,y*0.5)
//
-// trunc, truncf, truncl:
-// * trunc(cnst) -> cnst'
-//
-//
//===----------------------------------------------------------------------===//
// Fortified Library Call Optimizations
@@ -2300,7 +2307,7 @@ Value *FortifiedLibCallSimplifier::optimizeMemSetChk(CallInst *CI,
Value *FortifiedLibCallSimplifier::optimizeStrpCpyChk(CallInst *CI,
IRBuilder<> &B,
- LibFunc::Func Func) {
+ LibFunc Func) {
Function *Callee = CI->getCalledFunction();
StringRef Name = Callee->getName();
const DataLayout &DL = CI->getModule()->getDataLayout();
@@ -2308,7 +2315,7 @@ Value *FortifiedLibCallSimplifier::optimizeStrpCpyChk(CallInst *CI,
*ObjSize = CI->getArgOperand(2);
// __stpcpy_chk(x,x,...) -> x+strlen(x)
- if (Func == LibFunc::stpcpy_chk && !OnlyLowerUnknownSize && Dst == Src) {
+ if (Func == LibFunc_stpcpy_chk && !OnlyLowerUnknownSize && Dst == Src) {
Value *StrLen = emitStrLen(Src, B, DL, TLI);
return StrLen ? B.CreateInBoundsGEP(B.getInt8Ty(), Dst, StrLen) : nullptr;
}
@@ -2334,14 +2341,14 @@ Value *FortifiedLibCallSimplifier::optimizeStrpCpyChk(CallInst *CI,
Value *Ret = emitMemCpyChk(Dst, Src, LenV, ObjSize, B, DL, TLI);
// If the function was an __stpcpy_chk, and we were able to fold it into
// a __memcpy_chk, we still need to return the correct end pointer.
- if (Ret && Func == LibFunc::stpcpy_chk)
+ if (Ret && Func == LibFunc_stpcpy_chk)
return B.CreateGEP(B.getInt8Ty(), Dst, ConstantInt::get(SizeTTy, Len - 1));
return Ret;
}
Value *FortifiedLibCallSimplifier::optimizeStrpNCpyChk(CallInst *CI,
IRBuilder<> &B,
- LibFunc::Func Func) {
+ LibFunc Func) {
Function *Callee = CI->getCalledFunction();
StringRef Name = Callee->getName();
if (isFortifiedCallFoldable(CI, 3, 2, false)) {
@@ -2366,7 +2373,7 @@ Value *FortifiedLibCallSimplifier::optimizeCall(CallInst *CI) {
//
// PR23093.
- LibFunc::Func Func;
+ LibFunc Func;
Function *Callee = CI->getCalledFunction();
SmallVector<OperandBundleDef, 2> OpBundles;
@@ -2384,17 +2391,17 @@ Value *FortifiedLibCallSimplifier::optimizeCall(CallInst *CI) {
return nullptr;
switch (Func) {
- case LibFunc::memcpy_chk:
+ case LibFunc_memcpy_chk:
return optimizeMemCpyChk(CI, Builder);
- case LibFunc::memmove_chk:
+ case LibFunc_memmove_chk:
return optimizeMemMoveChk(CI, Builder);
- case LibFunc::memset_chk:
+ case LibFunc_memset_chk:
return optimizeMemSetChk(CI, Builder);
- case LibFunc::stpcpy_chk:
- case LibFunc::strcpy_chk:
+ case LibFunc_stpcpy_chk:
+ case LibFunc_strcpy_chk:
return optimizeStrpCpyChk(CI, Builder, Func);
- case LibFunc::stpncpy_chk:
- case LibFunc::strncpy_chk:
+ case LibFunc_stpncpy_chk:
+ case LibFunc_strncpy_chk:
return optimizeStrpNCpyChk(CI, Builder, Func);
default:
break;
diff --git a/lib/Transforms/Utils/Utils.cpp b/lib/Transforms/Utils/Utils.cpp
index 7b9de2eadc61..7106483c3bd2 100644
--- a/lib/Transforms/Utils/Utils.cpp
+++ b/lib/Transforms/Utils/Utils.cpp
@@ -35,9 +35,8 @@ void llvm::initializeTransformUtils(PassRegistry &Registry) {
initializeUnifyFunctionExitNodesPass(Registry);
initializeInstSimplifierPass(Registry);
initializeMetaRenamerPass(Registry);
- initializeMemorySSAWrapperPassPass(Registry);
- initializeMemorySSAPrinterLegacyPassPass(Registry);
initializeStripGCRelocatesPass(Registry);
+ initializePredicateInfoPrinterLegacyPassPass(Registry);
}
/// LLVMInitializeTransformUtils - C binding for initializeTransformUtilsPasses.
diff --git a/lib/Transforms/Utils/VNCoercion.cpp b/lib/Transforms/Utils/VNCoercion.cpp
new file mode 100644
index 000000000000..4aeea02b1b1b
--- /dev/null
+++ b/lib/Transforms/Utils/VNCoercion.cpp
@@ -0,0 +1,482 @@
+#include "llvm/Transforms/Utils/VNCoercion.h"
+#include "llvm/Analysis/AliasAnalysis.h"
+#include "llvm/Analysis/ConstantFolding.h"
+#include "llvm/Analysis/MemoryDependenceAnalysis.h"
+#include "llvm/Analysis/ValueTracking.h"
+#include "llvm/IR/IRBuilder.h"
+#include "llvm/IR/IntrinsicInst.h"
+#include "llvm/Support/Debug.h"
+
+#define DEBUG_TYPE "vncoerce"
+namespace llvm {
+namespace VNCoercion {
+
+/// Return true if coerceAvailableValueToLoadType will succeed.
+bool canCoerceMustAliasedValueToLoad(Value *StoredVal, Type *LoadTy,
+ const DataLayout &DL) {
+ // If the loaded or stored value is a first class array or struct, don't try
+ // to transform them. We need to be able to bitcast to integer.
+ if (LoadTy->isStructTy() || LoadTy->isArrayTy() ||
+ StoredVal->getType()->isStructTy() || StoredVal->getType()->isArrayTy())
+ return false;
+
+ // The store has to be at least as big as the load.
+ if (DL.getTypeSizeInBits(StoredVal->getType()) < DL.getTypeSizeInBits(LoadTy))
+ return false;
+
+ return true;
+}
+
+template <class T, class HelperClass>
+static T *coerceAvailableValueToLoadTypeHelper(T *StoredVal, Type *LoadedTy,
+ HelperClass &Helper,
+ const DataLayout &DL) {
+ assert(canCoerceMustAliasedValueToLoad(StoredVal, LoadedTy, DL) &&
+ "precondition violation - materialization can't fail");
+ if (auto *C = dyn_cast<Constant>(StoredVal))
+ if (auto *FoldedStoredVal = ConstantFoldConstant(C, DL))
+ StoredVal = FoldedStoredVal;
+
+ // If this is already the right type, just return it.
+ Type *StoredValTy = StoredVal->getType();
+
+ uint64_t StoredValSize = DL.getTypeSizeInBits(StoredValTy);
+ uint64_t LoadedValSize = DL.getTypeSizeInBits(LoadedTy);
+
+ // If the store and reload are the same size, we can always reuse it.
+ if (StoredValSize == LoadedValSize) {
+ // Pointer to Pointer -> use bitcast.
+ if (StoredValTy->getScalarType()->isPointerTy() &&
+ LoadedTy->getScalarType()->isPointerTy()) {
+ StoredVal = Helper.CreateBitCast(StoredVal, LoadedTy);
+ } else {
+ // Convert source pointers to integers, which can be bitcast.
+ if (StoredValTy->getScalarType()->isPointerTy()) {
+ StoredValTy = DL.getIntPtrType(StoredValTy);
+ StoredVal = Helper.CreatePtrToInt(StoredVal, StoredValTy);
+ }
+
+ Type *TypeToCastTo = LoadedTy;
+ if (TypeToCastTo->getScalarType()->isPointerTy())
+ TypeToCastTo = DL.getIntPtrType(TypeToCastTo);
+
+ if (StoredValTy != TypeToCastTo)
+ StoredVal = Helper.CreateBitCast(StoredVal, TypeToCastTo);
+
+ // Cast to pointer if the load needs a pointer type.
+ if (LoadedTy->getScalarType()->isPointerTy())
+ StoredVal = Helper.CreateIntToPtr(StoredVal, LoadedTy);
+ }
+
+ if (auto *C = dyn_cast<ConstantExpr>(StoredVal))
+ if (auto *FoldedStoredVal = ConstantFoldConstant(C, DL))
+ StoredVal = FoldedStoredVal;
+
+ return StoredVal;
+ }
+ // If the loaded value is smaller than the available value, then we can
+ // extract out a piece from it. If the available value is too small, then we
+ // can't do anything.
+ assert(StoredValSize >= LoadedValSize &&
+ "canCoerceMustAliasedValueToLoad fail");
+
+ // Convert source pointers to integers, which can be manipulated.
+ if (StoredValTy->getScalarType()->isPointerTy()) {
+ StoredValTy = DL.getIntPtrType(StoredValTy);
+ StoredVal = Helper.CreatePtrToInt(StoredVal, StoredValTy);
+ }
+
+ // Convert vectors and fp to integer, which can be manipulated.
+ if (!StoredValTy->isIntegerTy()) {
+ StoredValTy = IntegerType::get(StoredValTy->getContext(), StoredValSize);
+ StoredVal = Helper.CreateBitCast(StoredVal, StoredValTy);
+ }
+
+ // If this is a big-endian system, we need to shift the value down to the low
+ // bits so that a truncate will work.
+ if (DL.isBigEndian()) {
+ uint64_t ShiftAmt = DL.getTypeStoreSizeInBits(StoredValTy) -
+ DL.getTypeStoreSizeInBits(LoadedTy);
+ StoredVal = Helper.CreateLShr(
+ StoredVal, ConstantInt::get(StoredVal->getType(), ShiftAmt));
+ }
+
+ // Truncate the integer to the right size now.
+ Type *NewIntTy = IntegerType::get(StoredValTy->getContext(), LoadedValSize);
+ StoredVal = Helper.CreateTruncOrBitCast(StoredVal, NewIntTy);
+
+ if (LoadedTy != NewIntTy) {
+ // If the result is a pointer, inttoptr.
+ if (LoadedTy->getScalarType()->isPointerTy())
+ StoredVal = Helper.CreateIntToPtr(StoredVal, LoadedTy);
+ else
+ // Otherwise, bitcast.
+ StoredVal = Helper.CreateBitCast(StoredVal, LoadedTy);
+ }
+
+ if (auto *C = dyn_cast<Constant>(StoredVal))
+ if (auto *FoldedStoredVal = ConstantFoldConstant(C, DL))
+ StoredVal = FoldedStoredVal;
+
+ return StoredVal;
+}
+
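For intuition on the shift-and-truncate path above: reusing a wider stored integer for a narrower load means taking the bytes at the load's address, which hold the low-order bits on little-endian targets and the high-order bits on big-endian ones, hence the lshr before the trunc. A standalone sketch (plain C++, not part of this patch; assumes a little-endian host):
#include <cassert>
#include <cstdint>
#include <cstring>

int main() {
  uint32_t Stored = 0xAABBCCDD; // 32-bit value written to memory
  uint16_t ViaMemory;           // 16-bit load from the same address
  std::memcpy(&ViaMemory, &Stored, sizeof(ViaMemory));
  // Little-endian: the first two bytes hold the low bits, so a plain
  // truncation reproduces the load; big-endian would need Stored >> 16.
  assert(ViaMemory == uint16_t(Stored));
  return 0;
}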
+/// If we saw a store of a value to memory, and
+/// then a load from a must-aliased pointer of a different type, try to coerce
+/// the stored value. LoadedTy is the type of the load we want to replace.
+/// IRB is IRBuilder used to insert new instructions.
+///
+/// If we can't do it, return null.
+Value *coerceAvailableValueToLoadType(Value *StoredVal, Type *LoadedTy,
+ IRBuilder<> &IRB, const DataLayout &DL) {
+ return coerceAvailableValueToLoadTypeHelper(StoredVal, LoadedTy, IRB, DL);
+}
+
+/// This function is called when we have a memdep query of a load that ends up
+/// being a clobbering memory write (store, memset, memcpy, memmove). This
+/// means that the write *may* provide bits used by the load but we can't be
+/// sure because the pointers don't must-alias.
+///
+/// Check this case to see if there is anything more we can do before we give
+/// up. This returns -1 if we have to give up, or a byte number in the stored
+/// value of the piece that feeds the load.
+static int analyzeLoadFromClobberingWrite(Type *LoadTy, Value *LoadPtr,
+ Value *WritePtr,
+ uint64_t WriteSizeInBits,
+ const DataLayout &DL) {
+ // If the loaded or stored value is a first class array or struct, don't try
+ // to transform them. We need to be able to bitcast to integer.
+ if (LoadTy->isStructTy() || LoadTy->isArrayTy())
+ return -1;
+
+ int64_t StoreOffset = 0, LoadOffset = 0;
+ Value *StoreBase =
+ GetPointerBaseWithConstantOffset(WritePtr, StoreOffset, DL);
+ Value *LoadBase = GetPointerBaseWithConstantOffset(LoadPtr, LoadOffset, DL);
+ if (StoreBase != LoadBase)
+ return -1;
+
+ // If the load and store are to the exact same address, they should have been
+ // a must alias. AA must have gotten confused.
+ // FIXME: Study to see if/when this happens. One case is forwarding a memset
+ // to a load from the base of the memset.
+
+ // If the load and store don't overlap at all, the store doesn't provide
+ // anything to the load. In this case, they really don't alias at all, AA
+ // must have gotten confused.
+ uint64_t LoadSize = DL.getTypeSizeInBits(LoadTy);
+
+ if ((WriteSizeInBits & 7) | (LoadSize & 7))
+ return -1;
+ uint64_t StoreSize = WriteSizeInBits / 8; // Convert to bytes.
+ LoadSize /= 8;
+
+ bool isAAFailure = false;
+ if (StoreOffset < LoadOffset)
+ isAAFailure = StoreOffset + int64_t(StoreSize) <= LoadOffset;
+ else
+ isAAFailure = LoadOffset + int64_t(LoadSize) <= StoreOffset;
+
+ if (isAAFailure)
+ return -1;
+
+ // If the load isn't completely contained within the stored bits, we don't
+ // have all the bits to feed it. We could do something crazy in the future
+ // (issue a smaller load and then merge the bits in), but this seems unlikely
+ // to be valuable.
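+ // For example, a 4-byte load at offset 6 of an 8-byte store would need bytes
+ // 6..9, but the store only provides bytes 0..7.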
+ if (StoreOffset > LoadOffset ||
+ StoreOffset + StoreSize < LoadOffset + LoadSize)
+ return -1;
+
+ // Okay, we can do this transformation. Return the number of bytes into the
+ // store that the load is.
+ return LoadOffset - StoreOffset;
+}
+
+/// This function is called when we have a memdep query of a load that ends up
+/// being a clobbering store.
+int analyzeLoadFromClobberingStore(Type *LoadTy, Value *LoadPtr,
+ StoreInst *DepSI, const DataLayout &DL) {
+ // Cannot handle reading from a store of a first-class aggregate yet.
+ if (DepSI->getValueOperand()->getType()->isStructTy() ||
+ DepSI->getValueOperand()->getType()->isArrayTy())
+ return -1;
+
+ Value *StorePtr = DepSI->getPointerOperand();
+ uint64_t StoreSize =
+ DL.getTypeSizeInBits(DepSI->getValueOperand()->getType());
+ return analyzeLoadFromClobberingWrite(LoadTy, LoadPtr, StorePtr, StoreSize,
+ DL);
+}
+
+/// This function is called when we have a memdep query of a load that ends up
+/// being clobbered by another load. See if the other load can feed into the
+/// second load.
+int analyzeLoadFromClobberingLoad(Type *LoadTy, Value *LoadPtr, LoadInst *DepLI,
+ const DataLayout &DL) {
+ // Cannot handle reading from a load of a first-class aggregate yet.
+ if (DepLI->getType()->isStructTy() || DepLI->getType()->isArrayTy())
+ return -1;
+
+ Value *DepPtr = DepLI->getPointerOperand();
+ uint64_t DepSize = DL.getTypeSizeInBits(DepLI->getType());
+ int R = analyzeLoadFromClobberingWrite(LoadTy, LoadPtr, DepPtr, DepSize, DL);
+ if (R != -1)
+ return R;
+
+ // If we have a load/load clobber and DepLI can be widened to cover this load,
+ // then we should widen it!
+ int64_t LoadOffs = 0;
+ const Value *LoadBase =
+ GetPointerBaseWithConstantOffset(LoadPtr, LoadOffs, DL);
+ unsigned LoadSize = DL.getTypeStoreSize(LoadTy);
+
+ unsigned Size = MemoryDependenceResults::getLoadLoadClobberFullWidthSize(
+ LoadBase, LoadOffs, LoadSize, DepLI);
+ if (Size == 0)
+ return -1;
+
+ // Check non-obvious conditions enforced by MDA which we rely on for being
+ // able to materialize this potentially available value.
+ assert(DepLI->isSimple() && "Cannot widen volatile/atomic load!");
+ assert(DepLI->getType()->isIntegerTy() && "Can't widen non-integer load");
+
+ return analyzeLoadFromClobberingWrite(LoadTy, LoadPtr, DepPtr, Size * 8, DL);
+}
+
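+/// This function is called when we have a memdep query of a load that ends up
+/// being a clobbering mem intrinsic (memset or memcpy/memmove). Returns the
+/// byte offset of the load within the write, or -1 if the load cannot be
+/// satisfied by the intrinsic.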
+int analyzeLoadFromClobberingMemInst(Type *LoadTy, Value *LoadPtr,
+ MemIntrinsic *MI, const DataLayout &DL) {
+ // If the mem operation is a non-constant size, we can't handle it.
+ ConstantInt *SizeCst = dyn_cast<ConstantInt>(MI->getLength());
+ if (!SizeCst)
+ return -1;
+ uint64_t MemSizeInBits = SizeCst->getZExtValue() * 8;
+
+ // If this is a memset, we just need to see if the offset is valid within the
+ // size of the memset.
+ if (MI->getIntrinsicID() == Intrinsic::memset)
+ return analyzeLoadFromClobberingWrite(LoadTy, LoadPtr, MI->getDest(),
+ MemSizeInBits, DL);
+
+ // If we have a memcpy/memmove, the only case we can handle is if this is a
+ // copy from constant memory. In that case, we can read directly from the
+ // constant memory.
+ MemTransferInst *MTI = cast<MemTransferInst>(MI);
+
+ Constant *Src = dyn_cast<Constant>(MTI->getSource());
+ if (!Src)
+ return -1;
+
+ GlobalVariable *GV = dyn_cast<GlobalVariable>(GetUnderlyingObject(Src, DL));
+ if (!GV || !GV->isConstant())
+ return -1;
+
+ // See if the access is within the bounds of the transfer.
+ int Offset = analyzeLoadFromClobberingWrite(LoadTy, LoadPtr, MI->getDest(),
+ MemSizeInBits, DL);
+ if (Offset == -1)
+ return Offset;
+
+ unsigned AS = Src->getType()->getPointerAddressSpace();
+ // Otherwise, see if we can constant fold a load from the constant with the
+ // offset applied as appropriate.
+ Src =
+ ConstantExpr::getBitCast(Src, Type::getInt8PtrTy(Src->getContext(), AS));
+ Constant *OffsetCst =
+ ConstantInt::get(Type::getInt64Ty(Src->getContext()), (unsigned)Offset);
+ Src = ConstantExpr::getGetElementPtr(Type::getInt8Ty(Src->getContext()), Src,
+ OffsetCst);
+ Src = ConstantExpr::getBitCast(Src, PointerType::get(LoadTy, AS));
+ if (ConstantFoldLoadFromConstPtr(Src, LoadTy, DL))
+ return Offset;
+ return -1;
+}
+
+template <class T, class HelperClass>
+static T *getStoreValueForLoadHelper(T *SrcVal, unsigned Offset, Type *LoadTy,
+ HelperClass &Helper,
+ const DataLayout &DL) {
+ LLVMContext &Ctx = SrcVal->getType()->getContext();
+
+ uint64_t StoreSize = (DL.getTypeSizeInBits(SrcVal->getType()) + 7) / 8;
+ uint64_t LoadSize = (DL.getTypeSizeInBits(LoadTy) + 7) / 8;
+ // Compute which bits of the stored value are being used by the load. Convert
+ // to an integer type to start with.
+ if (SrcVal->getType()->getScalarType()->isPointerTy())
+ SrcVal = Helper.CreatePtrToInt(SrcVal, DL.getIntPtrType(SrcVal->getType()));
+ if (!SrcVal->getType()->isIntegerTy())
+ SrcVal = Helper.CreateBitCast(SrcVal, IntegerType::get(Ctx, StoreSize * 8));
+
+ // Shift the bits to the least significant depending on endianness.
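+ // For example, with an 8-byte store, a 2-byte load, and Offset == 4, the
+ // shift amount is 4 * 8 = 32 bits on little-endian and (8 - 2 - 4) * 8 = 16
+ // bits on big-endian.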
+ unsigned ShiftAmt;
+ if (DL.isLittleEndian())
+ ShiftAmt = Offset * 8;
+ else
+ ShiftAmt = (StoreSize - LoadSize - Offset) * 8;
+ if (ShiftAmt)
+ SrcVal = Helper.CreateLShr(SrcVal,
+ ConstantInt::get(SrcVal->getType(), ShiftAmt));
+
+ if (LoadSize != StoreSize)
+ SrcVal = Helper.CreateTruncOrBitCast(SrcVal,
+ IntegerType::get(Ctx, LoadSize * 8));
+ return SrcVal;
+}
+
+/// This function is called when we have a memdep query of a load that ends up
+/// being a clobbering store. This means that the store provides bits used by
+/// the load but the pointers don't must-alias. Check this case to see if
+/// there is anything more we can do before we give up.
+Value *getStoreValueForLoad(Value *SrcVal, unsigned Offset, Type *LoadTy,
+ Instruction *InsertPt, const DataLayout &DL) {
+
+ IRBuilder<> Builder(InsertPt);
+ SrcVal = getStoreValueForLoadHelper(SrcVal, Offset, LoadTy, Builder, DL);
+ return coerceAvailableValueToLoadTypeHelper(SrcVal, LoadTy, Builder, DL);
+}
+
+Constant *getConstantStoreValueForLoad(Constant *SrcVal, unsigned Offset,
+ Type *LoadTy, const DataLayout &DL) {
+ ConstantFolder F;
+ SrcVal = getStoreValueForLoadHelper(SrcVal, Offset, LoadTy, F, DL);
+ return coerceAvailableValueToLoadTypeHelper(SrcVal, LoadTy, F, DL);
+}
+
+/// This function is called when we have a memdep query of a load that ends up
+/// being a clobbering load. This means that the clobbering load *may* provide
+/// bits used by this load, but we can't be sure because the pointers don't
+/// must-alias.
+/// Check this case to see if there is anything more we can do before we give
+/// up.
+Value *getLoadValueForLoad(LoadInst *SrcVal, unsigned Offset, Type *LoadTy,
+ Instruction *InsertPt, const DataLayout &DL) {
+ // If Offset+LoadTy exceeds the size of SrcVal, then we must be widening
+ // SrcVal out to a larger load.
+ unsigned SrcValStoreSize = DL.getTypeStoreSize(SrcVal->getType());
+ unsigned LoadSize = DL.getTypeStoreSize(LoadTy);
+ if (Offset + LoadSize > SrcValStoreSize) {
+ assert(SrcVal->isSimple() && "Cannot widen volatile/atomic load!");
+ assert(SrcVal->getType()->isIntegerTy() && "Can't widen non-integer load");
+ // If we have a load/load clobber and DepLI can be widened to cover this
+ // load, then we should widen it to the next power-of-2 size big enough!
+ unsigned NewLoadSize = Offset + LoadSize;
+ if (!isPowerOf2_32(NewLoadSize))
+ NewLoadSize = NextPowerOf2(NewLoadSize);
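+ // For example, Offset == 3 with a 4-byte load requires 7 bytes and widens
+ // to an 8-byte load.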
+
+ Value *PtrVal = SrcVal->getPointerOperand();
+ // Insert the new load after the old load. This ensures that subsequent
+ // memdep queries will find the new load. We can't easily remove the old
+ // load completely because it is already in the value numbering table.
+ IRBuilder<> Builder(SrcVal->getParent(), ++BasicBlock::iterator(SrcVal));
+ Type *DestPTy = IntegerType::get(LoadTy->getContext(), NewLoadSize * 8);
+ DestPTy =
+ PointerType::get(DestPTy, PtrVal->getType()->getPointerAddressSpace());
+ Builder.SetCurrentDebugLocation(SrcVal->getDebugLoc());
+ PtrVal = Builder.CreateBitCast(PtrVal, DestPTy);
+ LoadInst *NewLoad = Builder.CreateLoad(PtrVal);
+ NewLoad->takeName(SrcVal);
+ NewLoad->setAlignment(SrcVal->getAlignment());
+
+ DEBUG(dbgs() << "GVN WIDENED LOAD: " << *SrcVal << "\n");
+ DEBUG(dbgs() << "TO: " << *NewLoad << "\n");
+
+ // Replace uses of the original load with the wider load. On a big endian
+ // system, we need to shift down to get the relevant bits.
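+ // For example, after widening a 4-byte load to 8 bytes on a big-endian
+ // target, the original bytes sit in the high half, so shift right by
+ // (8 - 4) * 8 = 32 bits before truncating.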
+ Value *RV = NewLoad;
+ if (DL.isBigEndian())
+ RV = Builder.CreateLShr(RV, (NewLoadSize - SrcValStoreSize) * 8);
+ RV = Builder.CreateTrunc(RV, SrcVal->getType());
+ SrcVal->replaceAllUsesWith(RV);
+
+ SrcVal = NewLoad;
+ }
+
+ return getStoreValueForLoad(SrcVal, Offset, LoadTy, InsertPt, DL);
+}
+
+Constant *getConstantLoadValueForLoad(Constant *SrcVal, unsigned Offset,
+ Type *LoadTy, const DataLayout &DL) {
+ unsigned SrcValStoreSize = DL.getTypeStoreSize(SrcVal->getType());
+ unsigned LoadSize = DL.getTypeStoreSize(LoadTy);
+ if (Offset + LoadSize > SrcValStoreSize)
+ return nullptr;
+ return getConstantStoreValueForLoad(SrcVal, Offset, LoadTy, DL);
+}
+
+template <class T, class HelperClass>
+T *getMemInstValueForLoadHelper(MemIntrinsic *SrcInst, unsigned Offset,
+ Type *LoadTy, HelperClass &Helper,
+ const DataLayout &DL) {
+ LLVMContext &Ctx = LoadTy->getContext();
+ uint64_t LoadSize = DL.getTypeSizeInBits(LoadTy) / 8;
+
+ // We know that this method is only called when the memory intrinsic fully
+ // provides the bits for the load.
+ if (MemSetInst *MSI = dyn_cast<MemSetInst>(SrcInst)) {
+ // memset(P, 'x', 1234) -> splat('x'), even if x is a variable, and
+ // regardless of the offset.
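+ // For example, splatting the byte 0xAB out to four bytes proceeds by
+ // doubling: 0xAB -> 0xABAB -> 0xABABABAB.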
+ T *Val = cast<T>(MSI->getValue());
+ if (LoadSize != 1)
+ Val =
+ Helper.CreateZExtOrBitCast(Val, IntegerType::get(Ctx, LoadSize * 8));
+ T *OneElt = Val;
+
+ // Splat the value out to the right number of bits.
+ for (unsigned NumBytesSet = 1; NumBytesSet != LoadSize;) {
+ // If we can double the number of bytes set, do it.
+ if (NumBytesSet * 2 <= LoadSize) {
+ T *ShVal = Helper.CreateShl(
+ Val, ConstantInt::get(Val->getType(), NumBytesSet * 8));
+ Val = Helper.CreateOr(Val, ShVal);
+ NumBytesSet <<= 1;
+ continue;
+ }
+
+ // Otherwise insert one byte at a time.
+ T *ShVal = Helper.CreateShl(Val, ConstantInt::get(Val->getType(), 1 * 8));
+ Val = Helper.CreateOr(OneElt, ShVal);
+ ++NumBytesSet;
+ }
+
+ return coerceAvailableValueToLoadTypeHelper(Val, LoadTy, Helper, DL);
+ }
+
+ // Otherwise, this is a memcpy/memmove from a constant global.
+ MemTransferInst *MTI = cast<MemTransferInst>(SrcInst);
+ Constant *Src = cast<Constant>(MTI->getSource());
+ unsigned AS = Src->getType()->getPointerAddressSpace();
+
+ // See if we can constant fold a load from the constant with the offset
+ // applied as appropriate.
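+ // For example, a 4-byte load at Offset 4 of a copy from a constant global
+ // becomes a constant-folded load from a GEP four bytes into that global.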
+ Src =
+ ConstantExpr::getBitCast(Src, Type::getInt8PtrTy(Src->getContext(), AS));
+ Constant *OffsetCst =
+ ConstantInt::get(Type::getInt64Ty(Src->getContext()), (unsigned)Offset);
+ Src = ConstantExpr::getGetElementPtr(Type::getInt8Ty(Src->getContext()), Src,
+ OffsetCst);
+ Src = ConstantExpr::getBitCast(Src, PointerType::get(LoadTy, AS));
+ return ConstantFoldLoadFromConstPtr(Src, LoadTy, DL);
+}
+
+/// This function is called when we have a memdep query of a load that ends up
+/// being a clobbering mem intrinsic.
+Value *getMemInstValueForLoad(MemIntrinsic *SrcInst, unsigned Offset,
+ Type *LoadTy, Instruction *InsertPt,
+ const DataLayout &DL) {
+ IRBuilder<> Builder(InsertPt);
+ return getMemInstValueForLoadHelper<Value, IRBuilder<>>(SrcInst, Offset,
+ LoadTy, Builder, DL);
+}
+
+Constant *getConstantMemInstValueForLoad(MemIntrinsic *SrcInst, unsigned Offset,
+ Type *LoadTy, const DataLayout &DL) {
+ // The only case where the result of analyzeLoadFromClobberingMemInst cannot
+ // be converted to a constant is when it is a memset of a non-constant value.
+ if (auto *MSI = dyn_cast<MemSetInst>(SrcInst))
+ if (!isa<Constant>(MSI->getValue()))
+ return nullptr;
+ ConstantFolder F;
+ return getMemInstValueForLoadHelper<Constant, ConstantFolder>(SrcInst, Offset,
+ LoadTy, F, DL);
+}
+} // namespace VNCoercion
+} // namespace llvm
diff --git a/lib/Transforms/Utils/ValueMapper.cpp b/lib/Transforms/Utils/ValueMapper.cpp
index 0e9baaf8649d..f77c10b6dd47 100644
--- a/lib/Transforms/Utils/ValueMapper.cpp
+++ b/lib/Transforms/Utils/ValueMapper.cpp
@@ -681,6 +681,7 @@ void MDNodeMapper::mapNodesInPOT(UniquedGraph &G) {
remapOperands(*ClonedN, [this, &D, &G](Metadata *Old) {
if (Optional<Metadata *> MappedOp = getMappedOp(Old))
return *MappedOp;
+ (void)D;
assert(G.Info[Old].ID > D.ID && "Expected a forward reference");
return &G.getFwdReference(*cast<MDNode>(Old));
});
diff --git a/lib/Transforms/Vectorize/BBVectorize.cpp b/lib/Transforms/Vectorize/BBVectorize.cpp
index c01740b27d59..c83b3f7b225b 100644
--- a/lib/Transforms/Vectorize/BBVectorize.cpp
+++ b/lib/Transforms/Vectorize/BBVectorize.cpp
@@ -494,13 +494,13 @@ namespace {
if (StoreInst *SI = dyn_cast<StoreInst>(I)) {
// For stores, it is the value type, not the pointer type that matters
// because the value is what will come from a vector register.
-
+
Value *IVal = SI->getValueOperand();
T1 = IVal->getType();
} else {
T1 = I->getType();
}
-
+
if (CastInst *CI = dyn_cast<CastInst>(I))
T2 = CI->getSrcTy();
else
@@ -547,10 +547,11 @@ namespace {
// Returns the cost of the provided instruction using TTI.
// This does not handle loads and stores.
unsigned getInstrCost(unsigned Opcode, Type *T1, Type *T2,
- TargetTransformInfo::OperandValueKind Op1VK =
+ TargetTransformInfo::OperandValueKind Op1VK =
TargetTransformInfo::OK_AnyValue,
TargetTransformInfo::OperandValueKind Op2VK =
- TargetTransformInfo::OK_AnyValue) {
+ TargetTransformInfo::OK_AnyValue,
+ const Instruction *I = nullptr) {
switch (Opcode) {
default: break;
case Instruction::GetElementPtr:
@@ -584,7 +585,7 @@ namespace {
case Instruction::Select:
case Instruction::ICmp:
case Instruction::FCmp:
- return TTI->getCmpSelInstrCost(Opcode, T1, T2);
+ return TTI->getCmpSelInstrCost(Opcode, T1, T2, I);
case Instruction::ZExt:
case Instruction::SExt:
case Instruction::FPToUI:
@@ -598,7 +599,7 @@ namespace {
case Instruction::FPTrunc:
case Instruction::BitCast:
case Instruction::ShuffleVector:
- return TTI->getCastInstrCost(Opcode, T1, T2);
+ return TTI->getCastInstrCost(Opcode, T1, T2, I);
}
return 1;
@@ -894,7 +895,7 @@ namespace {
// vectors that has a scalar condition results in a malformed select.
// FIXME: We could probably be smarter about this by rewriting the select
// with different types instead.
- return (SI->getCondition()->getType()->isVectorTy() ==
+ return (SI->getCondition()->getType()->isVectorTy() ==
SI->getTrueValue()->getType()->isVectorTy());
} else if (isa<CmpInst>(I)) {
if (!Config.VectorizeCmp)
@@ -1044,14 +1045,14 @@ namespace {
return false;
}
} else if (TTI) {
- unsigned ICost = getInstrCost(I->getOpcode(), IT1, IT2);
- unsigned JCost = getInstrCost(J->getOpcode(), JT1, JT2);
- Type *VT1 = getVecTypeForPair(IT1, JT1),
- *VT2 = getVecTypeForPair(IT2, JT2);
TargetTransformInfo::OperandValueKind Op1VK =
TargetTransformInfo::OK_AnyValue;
TargetTransformInfo::OperandValueKind Op2VK =
TargetTransformInfo::OK_AnyValue;
+ unsigned ICost = getInstrCost(I->getOpcode(), IT1, IT2, Op1VK, Op2VK, I);
+ unsigned JCost = getInstrCost(J->getOpcode(), JT1, JT2, Op1VK, Op2VK, J);
+ Type *VT1 = getVecTypeForPair(IT1, JT1),
+ *VT2 = getVecTypeForPair(IT2, JT2);
// On some targets (example X86) the cost of a vector shift may vary
// depending on whether the second operand is a Uniform or
@@ -1090,7 +1091,7 @@ namespace {
// but this cost is ignored (because insert and extract element
// instructions are assigned a zero depth factor and are not really
// fused in general).
- unsigned VCost = getInstrCost(I->getOpcode(), VT1, VT2, Op1VK, Op2VK);
+ unsigned VCost = getInstrCost(I->getOpcode(), VT1, VT2, Op1VK, Op2VK, I);
if (VCost > ICost + JCost)
return false;
@@ -1127,39 +1128,51 @@ namespace {
FastMathFlags FMFCI;
if (auto *FPMOCI = dyn_cast<FPMathOperator>(CI))
FMFCI = FPMOCI->getFastMathFlags();
+ SmallVector<Value *, 4> IArgs(CI->arg_operands());
+ unsigned ICost = TTI->getIntrinsicInstrCost(IID, IT1, IArgs, FMFCI);
- SmallVector<Type*, 4> Tys;
- for (unsigned i = 0, ie = CI->getNumArgOperands(); i != ie; ++i)
- Tys.push_back(CI->getArgOperand(i)->getType());
- unsigned ICost = TTI->getIntrinsicInstrCost(IID, IT1, Tys, FMFCI);
-
- Tys.clear();
CallInst *CJ = cast<CallInst>(J);
FastMathFlags FMFCJ;
if (auto *FPMOCJ = dyn_cast<FPMathOperator>(CJ))
FMFCJ = FPMOCJ->getFastMathFlags();
- for (unsigned i = 0, ie = CJ->getNumArgOperands(); i != ie; ++i)
- Tys.push_back(CJ->getArgOperand(i)->getType());
- unsigned JCost = TTI->getIntrinsicInstrCost(IID, JT1, Tys, FMFCJ);
+ SmallVector<Value *, 4> JArgs(CJ->arg_operands());
+ unsigned JCost = TTI->getIntrinsicInstrCost(IID, JT1, JArgs, FMFCJ);
- Tys.clear();
assert(CI->getNumArgOperands() == CJ->getNumArgOperands() &&
"Intrinsic argument counts differ");
+ SmallVector<Type*, 4> Tys;
+ SmallVector<Value *, 4> VecArgs;
for (unsigned i = 0, ie = CI->getNumArgOperands(); i != ie; ++i) {
if ((IID == Intrinsic::powi || IID == Intrinsic::ctlz ||
- IID == Intrinsic::cttz) && i == 1)
+ IID == Intrinsic::cttz) && i == 1) {
Tys.push_back(CI->getArgOperand(i)->getType());
- else
+ VecArgs.push_back(CI->getArgOperand(i));
+ }
+ else {
Tys.push_back(getVecTypeForPair(CI->getArgOperand(i)->getType(),
CJ->getArgOperand(i)->getType()));
+ // Add both operands, and then count their scalarization overhead
+ // with VF 1.
+ VecArgs.push_back(CI->getArgOperand(i));
+ VecArgs.push_back(CJ->getArgOperand(i));
+ }
}
+ // Compute the scalarization cost here with the original operands (to
+ // check for uniqueness etc.), and then call getIntrinsicInstrCost()
+ // with the constructed vector types.
+ Type *RetTy = getVecTypeForPair(IT1, JT1);
+ unsigned ScalarizationCost = 0;
+ if (!RetTy->isVoidTy())
+ ScalarizationCost += TTI->getScalarizationOverhead(RetTy, true, false);
+ ScalarizationCost += TTI->getOperandsScalarizationOverhead(VecArgs, 1);
+
FastMathFlags FMFV = FMFCI;
FMFV &= FMFCJ;
- Type *RetTy = getVecTypeForPair(IT1, JT1);
- unsigned VCost = TTI->getIntrinsicInstrCost(IID, RetTy, Tys, FMFV);
+ unsigned VCost = TTI->getIntrinsicInstrCost(IID, RetTy, Tys, FMFV,
+ ScalarizationCost);
if (VCost > ICost + JCost)
return false;
@@ -2502,7 +2515,7 @@ namespace {
if (I2 == I1 || isa<UndefValue>(I2))
I2 = nullptr;
}
-
+
if (HEE) {
Value *I3 = HEE->getOperand(0);
if (!I2 && I3 != I1)
@@ -2693,14 +2706,14 @@ namespace {
// so extend the smaller vector to be the same length as the larger one.
Instruction *NLOp;
if (numElemL > 1) {
-
+
std::vector<Constant *> Mask(numElemH);
unsigned v = 0;
for (; v < numElemL; ++v)
Mask[v] = ConstantInt::get(Type::getInt32Ty(Context), v);
for (; v < numElemH; ++v)
Mask[v] = UndefValue::get(Type::getInt32Ty(Context));
-
+
NLOp = new ShuffleVectorInst(LOp, UndefValue::get(ArgTypeL),
ConstantVector::get(Mask),
getReplacementName(IBeforeJ ? I : J,
@@ -2710,7 +2723,7 @@ namespace {
getReplacementName(IBeforeJ ? I : J,
true, o, 1));
}
-
+
NLOp->insertBefore(IBeforeJ ? J : I);
LOp = NLOp;
}
@@ -2720,7 +2733,7 @@ namespace {
if (numElemH == 1 && expandIEChain(Context, I, J, o, LOp, numElemL,
ArgTypeH, VArgType, IBeforeJ)) {
Instruction *S =
- InsertElementInst::Create(LOp, HOp,
+ InsertElementInst::Create(LOp, HOp,
ConstantInt::get(Type::getInt32Ty(Context),
numElemL),
getReplacementName(IBeforeJ ? I : J,
@@ -2737,7 +2750,7 @@ namespace {
Mask[v] = ConstantInt::get(Type::getInt32Ty(Context), v);
for (; v < numElemL; ++v)
Mask[v] = UndefValue::get(Type::getInt32Ty(Context));
-
+
NHOp = new ShuffleVectorInst(HOp, UndefValue::get(ArgTypeH),
ConstantVector::get(Mask),
getReplacementName(IBeforeJ ? I : J,
diff --git a/lib/Transforms/Vectorize/LoadStoreVectorizer.cpp b/lib/Transforms/Vectorize/LoadStoreVectorizer.cpp
index c44a393cf846..4409d7a404f8 100644
--- a/lib/Transforms/Vectorize/LoadStoreVectorizer.cpp
+++ b/lib/Transforms/Vectorize/LoadStoreVectorizer.cpp
@@ -432,9 +432,12 @@ Vectorizer::splitOddVectorElts(ArrayRef<Instruction *> Chain,
unsigned ElementSizeBytes = ElementSizeBits / 8;
unsigned SizeBytes = ElementSizeBytes * Chain.size();
unsigned NumLeft = (SizeBytes - (SizeBytes % 4)) / ElementSizeBytes;
- if (NumLeft == Chain.size())
- --NumLeft;
- else if (NumLeft == 0)
+ if (NumLeft == Chain.size()) {
+ if ((NumLeft & 1) == 0)
+ NumLeft /= 2; // Split even in half
+ else
+ --NumLeft; // Split off last element
+ } else if (NumLeft == 0)
NumLeft = 1;
return std::make_pair(Chain.slice(0, NumLeft), Chain.slice(NumLeft));
}
@@ -588,7 +591,7 @@ Vectorizer::collectInstructions(BasicBlock *BB) {
continue;
// Make sure all the users of a vector are constant-index extracts.
- if (isa<VectorType>(Ty) && !all_of(LI->users(), [LI](const User *U) {
+ if (isa<VectorType>(Ty) && !all_of(LI->users(), [](const User *U) {
const ExtractElementInst *EEI = dyn_cast<ExtractElementInst>(U);
return EEI && isa<ConstantInt>(EEI->getOperand(1));
}))
@@ -622,7 +625,7 @@ Vectorizer::collectInstructions(BasicBlock *BB) {
if (TySize > VecRegSize / 2)
continue;
- if (isa<VectorType>(Ty) && !all_of(SI->users(), [SI](const User *U) {
+ if (isa<VectorType>(Ty) && !all_of(SI->users(), [](const User *U) {
const ExtractElementInst *EEI = dyn_cast<ExtractElementInst>(U);
return EEI && isa<ConstantInt>(EEI->getOperand(1));
}))
diff --git a/lib/Transforms/Vectorize/LoopVectorize.cpp b/lib/Transforms/Vectorize/LoopVectorize.cpp
index dac7032fa08f..595b2ec88943 100644
--- a/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -50,6 +50,7 @@
#include "llvm/ADT/DenseMap.h"
#include "llvm/ADT/Hashing.h"
#include "llvm/ADT/MapVector.h"
+#include "llvm/ADT/Optional.h"
#include "llvm/ADT/SCCIterator.h"
#include "llvm/ADT/SetVector.h"
#include "llvm/ADT/SmallPtrSet.h"
@@ -92,6 +93,7 @@
#include "llvm/Transforms/Scalar.h"
#include "llvm/Transforms/Utils/BasicBlockUtils.h"
#include "llvm/Transforms/Utils/Local.h"
+#include "llvm/Transforms/Utils/LoopSimplify.h"
#include "llvm/Transforms/Utils/LoopUtils.h"
#include "llvm/Transforms/Utils/LoopVersioning.h"
#include "llvm/Transforms/Vectorize.h"
@@ -266,21 +268,6 @@ static bool hasCyclesInLoopBody(const Loop &L) {
return false;
}
-/// \brief This modifies LoopAccessReport to initialize message with
-/// loop-vectorizer-specific part.
-class VectorizationReport : public LoopAccessReport {
-public:
- VectorizationReport(Instruction *I = nullptr)
- : LoopAccessReport("loop not vectorized: ", I) {}
-
- /// \brief This allows promotion of the loop-access analysis report into the
- /// loop-vectorizer report. It modifies the message to add the
- /// loop-vectorizer-specific part of the message.
- explicit VectorizationReport(const LoopAccessReport &R)
- : LoopAccessReport(Twine("loop not vectorized: ") + R.str(),
- R.getInstr()) {}
-};
-
/// A helper function for converting Scalar types to vector types.
/// If the incoming type is void, we return void. If the VF is 1, we return
/// the scalar type.
@@ -290,31 +277,9 @@ static Type *ToVectorTy(Type *Scalar, unsigned VF) {
return VectorType::get(Scalar, VF);
}
-/// A helper function that returns GEP instruction and knows to skip a
-/// 'bitcast'. The 'bitcast' may be skipped if the source and the destination
-/// pointee types of the 'bitcast' have the same size.
-/// For example:
-/// bitcast double** %var to i64* - can be skipped
-/// bitcast double** %var to i8* - can not
-static GetElementPtrInst *getGEPInstruction(Value *Ptr) {
-
- if (isa<GetElementPtrInst>(Ptr))
- return cast<GetElementPtrInst>(Ptr);
-
- if (isa<BitCastInst>(Ptr) &&
- isa<GetElementPtrInst>(cast<BitCastInst>(Ptr)->getOperand(0))) {
- Type *BitcastTy = Ptr->getType();
- Type *GEPTy = cast<BitCastInst>(Ptr)->getSrcTy();
- if (!isa<PointerType>(BitcastTy) || !isa<PointerType>(GEPTy))
- return nullptr;
- Type *Pointee1Ty = cast<PointerType>(BitcastTy)->getPointerElementType();
- Type *Pointee2Ty = cast<PointerType>(GEPTy)->getPointerElementType();
- const DataLayout &DL = cast<BitCastInst>(Ptr)->getModule()->getDataLayout();
- if (DL.getTypeSizeInBits(Pointee1Ty) == DL.getTypeSizeInBits(Pointee2Ty))
- return cast<GetElementPtrInst>(cast<BitCastInst>(Ptr)->getOperand(0));
- }
- return nullptr;
-}
+// FIXME: The following helper functions have multiple implementations
+// in the project. They can be effectively organized in a common Load/Store
+// utilities unit.
/// A helper function that returns the pointer operand of a load or store
/// instruction.
@@ -326,6 +291,34 @@ static Value *getPointerOperand(Value *I) {
return nullptr;
}
+/// A helper function that returns the type of a loaded or stored value.
+static Type *getMemInstValueType(Value *I) {
+ assert((isa<LoadInst>(I) || isa<StoreInst>(I)) &&
+ "Expected Load or Store instruction");
+ if (auto *LI = dyn_cast<LoadInst>(I))
+ return LI->getType();
+ return cast<StoreInst>(I)->getValueOperand()->getType();
+}
+
+/// A helper function that returns the alignment of a load or store
+/// instruction.
+static unsigned getMemInstAlignment(Value *I) {
+ assert((isa<LoadInst>(I) || isa<StoreInst>(I)) &&
+ "Expected Load or Store instruction");
+ if (auto *LI = dyn_cast<LoadInst>(I))
+ return LI->getAlignment();
+ return cast<StoreInst>(I)->getAlignment();
+}
+
+/// A helper function that returns the address space of the pointer operand of
+/// a load or store instruction.
+static unsigned getMemInstAddressSpace(Value *I) {
+ assert((isa<LoadInst>(I) || isa<StoreInst>(I)) &&
+ "Expected Load or Store instruction");
+ if (auto *LI = dyn_cast<LoadInst>(I))
+ return LI->getPointerAddressSpace();
+ return cast<StoreInst>(I)->getPointerAddressSpace();
+}
+
/// A helper function that returns true if the given type is irregular. The
/// type is irregular if its allocated size doesn't equal the store size of an
/// element of the corresponding vector type at the given vectorization factor.
@@ -351,6 +344,23 @@ static bool hasIrregularType(Type *Ty, const DataLayout &DL, unsigned VF) {
/// we always assume predicated blocks have a 50% chance of executing.
static unsigned getReciprocalPredBlockProb() { return 2; }
+/// A helper function that adds a 'fast' flag to floating-point operations.
+static Value *addFastMathFlag(Value *V) {
+ if (isa<FPMathOperator>(V)) {
+ FastMathFlags Flags;
+ Flags.setUnsafeAlgebra();
+ cast<Instruction>(V)->setFastMathFlags(Flags);
+ }
+ return V;
+}
+
+/// A helper function that returns an integer or floating-point constant with
+/// value C.
+static Constant *getSignedIntOrFpConstant(Type *Ty, int64_t C) {
+ return Ty->isIntegerTy() ? ConstantInt::getSigned(Ty, C)
+ : ConstantFP::get(Ty, C);
+}
+
/// InnerLoopVectorizer vectorizes loops which contain only one basic
/// block to a specified vectorization factor (VF).
/// This class performs the widening of scalars into vectors, or multiple
@@ -428,10 +438,17 @@ protected:
/// Copy and widen the instructions from the old loop.
virtual void vectorizeLoop();
+ /// Handle all cross-iteration phis in the header.
+ void fixCrossIterationPHIs();
+
/// Fix a first-order recurrence. This is the second phase of vectorizing
/// this phi node.
void fixFirstOrderRecurrence(PHINode *Phi);
+ /// Fix a reduction cross-iteration phi. This is the second phase of
+ /// vectorizing this phi node.
+ void fixReduction(PHINode *Phi);
+
/// \brief The Loop exit block may have single value PHI nodes where the
/// incoming value is 'Undef'. While vectorizing we only handled real values
/// that were defined inside the loop. Here we fix the 'undef case'.
@@ -448,7 +465,8 @@ protected:
/// Collect the instructions from the original loop that would be trivially
/// dead in the vectorized loop if generated.
- void collectTriviallyDeadInstructions();
+ void collectTriviallyDeadInstructions(
+ SmallPtrSetImpl<Instruction *> &DeadInstructions);
/// Shrinks vector element sizes to the smallest bitwidth they can be legally
/// represented as.
@@ -462,14 +480,14 @@ protected:
/// and DST.
VectorParts createEdgeMask(BasicBlock *Src, BasicBlock *Dst);
- /// A helper function to vectorize a single BB within the innermost loop.
- void vectorizeBlockInLoop(BasicBlock *BB, PhiVector *PV);
+ /// A helper function to vectorize a single instruction within the innermost
+ /// loop.
+ void vectorizeInstruction(Instruction &I);
/// Vectorize a single PHINode in a block. This method handles the induction
/// variable canonicalization. It supports both VF = 1 for unrolled loops and
/// arbitrary length vectors.
- void widenPHIInstruction(Instruction *PN, unsigned UF, unsigned VF,
- PhiVector *PV);
+ void widenPHIInstruction(Instruction *PN, unsigned UF, unsigned VF);
/// Insert the new loop to the loop hierarchy and pass manager
/// and update the analysis passes.
@@ -504,20 +522,21 @@ protected:
/// \p EntryVal is the value from the original loop that maps to the steps.
/// Note that \p EntryVal doesn't have to be an induction variable (e.g., it
/// can be a truncate instruction).
- void buildScalarSteps(Value *ScalarIV, Value *Step, Value *EntryVal);
-
- /// Create a vector induction phi node based on an existing scalar one. This
- /// currently only works for integer induction variables with a constant
- /// step. \p EntryVal is the value from the original loop that maps to the
- /// vector phi node. If \p EntryVal is a truncate instruction, instead of
- /// widening the original IV, we widen a version of the IV truncated to \p
- /// EntryVal's type.
- void createVectorIntInductionPHI(const InductionDescriptor &II,
- Instruction *EntryVal);
-
- /// Widen an integer induction variable \p IV. If \p Trunc is provided, the
- /// induction variable will first be truncated to the corresponding type.
- void widenIntInduction(PHINode *IV, TruncInst *Trunc = nullptr);
+ void buildScalarSteps(Value *ScalarIV, Value *Step, Value *EntryVal,
+ const InductionDescriptor &ID);
+
+ /// Create a vector induction phi node based on an existing scalar one. \p
+ /// EntryVal is the value from the original loop that maps to the vector phi
+ /// node, and \p Step is the loop-invariant step. If \p EntryVal is a
+ /// truncate instruction, instead of widening the original IV, we widen a
+ /// version of the IV truncated to \p EntryVal's type.
+ void createVectorIntOrFpInductionPHI(const InductionDescriptor &II,
+ Value *Step, Instruction *EntryVal);
+
+ /// Widen an integer or floating-point induction variable \p IV. If \p Trunc
+ /// is provided, the integer induction variable will first be truncated to
+ /// the corresponding type.
+ void widenIntOrFpInduction(PHINode *IV, TruncInst *Trunc = nullptr);
/// Returns true if an instruction \p I should be scalarized instead of
/// vectorized for the chosen vectorization factor.
@@ -583,6 +602,10 @@ protected:
/// vector of instructions.
void addMetadata(ArrayRef<Value *> To, Instruction *From);
+ /// \brief Set the debug location in the builder using the debug location in
+ /// the instruction.
+ void setDebugLocFromInst(IRBuilder<> &B, const Value *Ptr);
+
/// This is a helper class for maintaining vectorization state. It's used for
/// mapping values from the original loop to their corresponding values in
/// the new loop. Two mappings are maintained: one for vectorized values and
@@ -777,14 +800,6 @@ protected:
// Record whether runtime checks are added.
bool AddedSafetyChecks;
- // Holds instructions from the original loop whose counterparts in the
- // vectorized loop would be trivially dead if generated. For example,
- // original induction update instructions can become dead because we
- // separately emit induction "steps" when generating code for the new loop.
- // Similarly, we create a new latch condition when setting up the structure
- // of the new loop, so the old one can become dead.
- SmallPtrSet<Instruction *, 4> DeadInstructions;
-
// Holds the end values for each induction variable. We save the end values
// so we can later fix-up the external users of the induction variables.
DenseMap<PHINode *, Value *> IVEndValues;
@@ -803,8 +818,6 @@ public:
UnrollFactor, LVL, CM) {}
private:
- void scalarizeInstruction(Instruction *Instr,
- bool IfPredicateInstr = false) override;
void vectorizeMemoryInstruction(Instruction *Instr) override;
Value *getBroadcastInstrs(Value *V) override;
Value *getStepVector(Value *Val, int StartIdx, Value *Step,
@@ -832,12 +845,14 @@ static Instruction *getDebugLocFromInstOrOperands(Instruction *I) {
return I;
}
-/// \brief Set the debug location in the builder using the debug location in the
-/// instruction.
-static void setDebugLocFromInst(IRBuilder<> &B, const Value *Ptr) {
- if (const Instruction *Inst = dyn_cast_or_null<Instruction>(Ptr))
- B.SetCurrentDebugLocation(Inst->getDebugLoc());
- else
+void InnerLoopVectorizer::setDebugLocFromInst(IRBuilder<> &B, const Value *Ptr) {
+ if (const Instruction *Inst = dyn_cast_or_null<Instruction>(Ptr)) {
+ const DILocation *DIL = Inst->getDebugLoc();
+ if (DIL && Inst->getFunction()->isDebugInfoForProfiling())
+ B.SetCurrentDebugLocation(DIL->cloneWithDuplicationFactor(UF * VF));
+ else
+ B.SetCurrentDebugLocation(DIL);
+ } else
B.SetCurrentDebugLocation(DebugLoc());
}
@@ -1497,14 +1512,6 @@ private:
OptimizationRemarkEmitter &ORE;
};
-static void emitAnalysisDiag(const Loop *TheLoop,
- const LoopVectorizeHints &Hints,
- OptimizationRemarkEmitter &ORE,
- const LoopAccessReport &Message) {
- const char *Name = Hints.vectorizeAnalysisPassName();
- LoopAccessReport::emitAnalysis(Message, TheLoop, Name, ORE);
-}
-
static void emitMissedWarning(Function *F, Loop *L,
const LoopVectorizeHints &LH,
OptimizationRemarkEmitter *ORE) {
@@ -1512,13 +1519,17 @@ static void emitMissedWarning(Function *F, Loop *L,
if (LH.getForce() == LoopVectorizeHints::FK_Enabled) {
if (LH.getWidth() != 1)
- emitLoopVectorizeWarning(
- F->getContext(), *F, L->getStartLoc(),
- "failed explicitly specified loop vectorization");
+ ORE->emit(DiagnosticInfoOptimizationFailure(
+ DEBUG_TYPE, "FailedRequestedVectorization",
+ L->getStartLoc(), L->getHeader())
+ << "loop not vectorized: "
+ << "failed explicitly specified loop vectorization");
else if (LH.getInterleave() != 1)
- emitLoopInterleaveWarning(
- F->getContext(), *F, L->getStartLoc(),
- "failed explicitly specified loop interleaving");
+ ORE->emit(DiagnosticInfoOptimizationFailure(
+ DEBUG_TYPE, "FailedRequestedInterleaving", L->getStartLoc(),
+ L->getHeader())
+ << "loop not interleaved: "
+ << "failed explicitly specified loop interleaving");
}
}
@@ -1546,7 +1557,7 @@ public:
LoopVectorizeHints *H)
: NumPredStores(0), TheLoop(L), PSE(PSE), TLI(TLI), TTI(TTI), DT(DT),
GetLAA(GetLAA), LAI(nullptr), ORE(ORE), InterleaveInfo(PSE, L, DT, LI),
- Induction(nullptr), WidestIndTy(nullptr), HasFunNoNaNAttr(false),
+ PrimaryInduction(nullptr), WidestIndTy(nullptr), HasFunNoNaNAttr(false),
Requirements(R), Hints(H) {}
/// ReductionList contains the reduction descriptors for all
@@ -1566,8 +1577,8 @@ public:
/// loop, only that it is legal to do so.
bool canVectorize();
- /// Returns the Induction variable.
- PHINode *getInduction() { return Induction; }
+ /// Returns the primary induction variable.
+ PHINode *getPrimaryInduction() { return PrimaryInduction; }
/// Returns the reduction variables found in the loop.
ReductionList *getReductionVars() { return &Reductions; }
@@ -1607,12 +1618,6 @@ public:
/// Returns true if the value V is uniform within the loop.
bool isUniform(Value *V);
- /// Returns true if \p I is known to be uniform after vectorization.
- bool isUniformAfterVectorization(Instruction *I) { return Uniforms.count(I); }
-
- /// Returns true if \p I is known to be scalar after vectorization.
- bool isScalarAfterVectorization(Instruction *I) { return Scalars.count(I); }
-
/// Returns the information that we collected about runtime memory check.
const RuntimePointerChecking *getRuntimePointerChecking() const {
return LAI->getRuntimePointerChecking();
@@ -1689,15 +1694,9 @@ public:
/// instructions that may divide by zero.
bool isScalarWithPredication(Instruction *I);
- /// Returns true if \p I is a memory instruction that has a consecutive or
- /// consecutive-like pointer operand. Consecutive-like pointers are pointers
- /// that are treated like consecutive pointers during vectorization. The
- /// pointer operands of interleaved accesses are an example.
- bool hasConsecutiveLikePtrOperand(Instruction *I);
-
- /// Returns true if \p I is a memory instruction that must be scalarized
- /// during vectorization.
- bool memoryInstructionMustBeScalarized(Instruction *I, unsigned VF = 1);
+ /// Returns true if \p I is a memory instruction with consecutive memory
+ /// access that can be widened.
+ bool memoryInstructionCanBeWidened(Instruction *I, unsigned VF = 1);
private:
/// Check if a single basic block loop is vectorizable.
@@ -1715,24 +1714,6 @@ private:
/// transformation.
bool canVectorizeWithIfConvert();
- /// Collect the instructions that are uniform after vectorization. An
- /// instruction is uniform if we represent it with a single scalar value in
- /// the vectorized loop corresponding to each vector iteration. Examples of
- /// uniform instructions include pointer operands of consecutive or
- /// interleaved memory accesses. Note that although uniformity implies an
- /// instruction will be scalar, the reverse is not true. In general, a
- /// scalarized instruction will be represented by VF scalar values in the
- /// vectorized loop, each corresponding to an iteration of the original
- /// scalar loop.
- void collectLoopUniforms();
-
- /// Collect the instructions that are scalar after vectorization. An
- /// instruction is scalar if it is known to be uniform or will be scalarized
- /// during vectorization. Non-uniform scalarized instructions will be
- /// represented by VF values in the vectorized loop, each corresponding to an
- /// iteration of the original scalar loop.
- void collectLoopScalars();
-
/// Return true if all of the instructions in the block can be speculatively
/// executed. \p SafePtrs is a list of addresses that are known to be legal
/// and we know that we can read from them without segfault.
@@ -1744,14 +1725,6 @@ private:
void addInductionPhi(PHINode *Phi, const InductionDescriptor &ID,
SmallPtrSetImpl<Value *> &AllowedExit);
- /// Report an analysis message to assist the user in diagnosing loops that are
- /// not vectorized. These are handled as LoopAccessReport rather than
- /// VectorizationReport because the << operator of VectorizationReport returns
- /// LoopAccessReport.
- void emitAnalysis(const LoopAccessReport &Message) const {
- emitAnalysisDiag(TheLoop, *Hints, *ORE, Message);
- }
-
/// Create an analysis remark that explains why vectorization failed
///
/// \p RemarkName is the identifier for the remark. If \p I is passed it is
@@ -1804,9 +1777,9 @@ private:
// --- vectorization state --- //
- /// Holds the integer induction variable. This is the counter of the
+ /// Holds the primary induction variable. This is the counter of the
/// loop.
- PHINode *Induction;
+ PHINode *PrimaryInduction;
/// Holds the reduction variables.
ReductionList Reductions;
/// Holds all of the induction variables that we found in the loop.
@@ -1822,12 +1795,6 @@ private:
/// vars which can be accessed from outside the loop.
SmallPtrSet<Value *, 4> AllowedExit;
- /// Holds the instructions known to be uniform after vectorization.
- SmallPtrSet<Instruction *, 4> Uniforms;
-
- /// Holds the instructions known to be scalar after vectorization.
- SmallPtrSet<Instruction *, 4> Scalars;
-
/// Can we assume the absence of NaNs.
bool HasFunNoNaNAttr;
@@ -1861,16 +1828,26 @@ public:
: TheLoop(L), PSE(PSE), LI(LI), Legal(Legal), TTI(TTI), TLI(TLI), DB(DB),
AC(AC), ORE(ORE), TheFunction(F), Hints(Hints) {}
+ /// \return An upper bound for the vectorization factor, or None if
+ /// vectorization should be avoided up front.
+ Optional<unsigned> computeMaxVF(bool OptForSize);
+
/// Information about vectorization costs
struct VectorizationFactor {
unsigned Width; // Vector width with best cost
unsigned Cost; // Cost of the loop with that width
};
/// \return The most profitable vectorization factor and the cost of that VF.
- /// This method checks every power of two up to VF. If UserVF is not ZERO
+ /// This method checks every power of two up to MaxVF. If UserVF is not ZERO
/// then this vectorization factor will be selected if vectorization is
/// possible.
- VectorizationFactor selectVectorizationFactor(bool OptForSize);
+ VectorizationFactor selectVectorizationFactor(unsigned MaxVF);
+
+ /// Setup cost-based decisions for user vectorization factor.
+ void selectUserVectorizationFactor(unsigned UserVF) {
+ collectUniformsAndScalars(UserVF);
+ collectInstsToScalarize(UserVF);
+ }
/// \return The size (in bits) of the smallest and widest types in the code
/// that needs to be vectorized. We ignore values that remain scalar such as
@@ -1884,6 +1861,15 @@ public:
unsigned selectInterleaveCount(bool OptForSize, unsigned VF,
unsigned LoopCost);
+ /// A memory access instruction may be vectorized in more than one way, and
+ /// the form it takes after vectorization depends on cost. This function
+ /// makes cost-based widening decisions for Load/Store instructions and
+ /// collects them in a map. This decision map is used for building the lists
+ /// of loop-uniform and loop-scalar instructions. The calculated cost is
+ /// saved with the widening decision in order to avoid redundant
+ /// calculations.
+ void setCostBasedWideningDecision(unsigned VF);
+
/// \brief A struct that represents some properties of the register usage
/// of a loop.
struct RegisterUsage {
@@ -1918,14 +1904,118 @@ public:
return Scalars->second.count(I);
}
+ /// Returns true if \p I is known to be uniform after vectorization.
+ bool isUniformAfterVectorization(Instruction *I, unsigned VF) const {
+ if (VF == 1)
+ return true;
+ assert(Uniforms.count(VF) && "VF not yet analyzed for uniformity");
+ auto UniformsPerVF = Uniforms.find(VF);
+ return UniformsPerVF->second.count(I);
+ }
+
+ /// Returns true if \p I is known to be scalar after vectorization.
+ bool isScalarAfterVectorization(Instruction *I, unsigned VF) const {
+ if (VF == 1)
+ return true;
+ assert(Scalars.count(VF) && "Scalar values are not calculated for VF");
+ auto ScalarsPerVF = Scalars.find(VF);
+ return ScalarsPerVF->second.count(I);
+ }
+
/// \returns True if instruction \p I can be truncated to a smaller bitwidth
/// for vectorization factor \p VF.
bool canTruncateToMinimalBitwidth(Instruction *I, unsigned VF) const {
return VF > 1 && MinBWs.count(I) && !isProfitableToScalarize(I, VF) &&
- !Legal->isScalarAfterVectorization(I);
+ !isScalarAfterVectorization(I, VF);
+ }
+
+ /// Decision that was taken during cost calculation for memory instruction.
+ enum InstWidening {
+ CM_Unknown,
+ CM_Widen,
+ CM_Interleave,
+ CM_GatherScatter,
+ CM_Scalarize
+ };
+
+ /// Save vectorization decision \p W and \p Cost taken by the cost model for
+ /// instruction \p I and vector width \p VF.
+ void setWideningDecision(Instruction *I, unsigned VF, InstWidening W,
+ unsigned Cost) {
+ assert(VF >= 2 && "Expected VF >=2");
+ WideningDecisions[std::make_pair(I, VF)] = std::make_pair(W, Cost);
+ }
+
+ /// Save vectorization decision \p W and \p Cost taken by the cost model for
+ /// interleaving group \p Grp and vector width \p VF.
+ void setWideningDecision(const InterleaveGroup *Grp, unsigned VF,
+ InstWidening W, unsigned Cost) {
+ assert(VF >= 2 && "Expected VF >=2");
+ /// Broadcast this decision to all instructions inside the group.
+ /// But the cost will be assigned to one instruction only.
+ for (unsigned i = 0; i < Grp->getFactor(); ++i) {
+ if (auto *I = Grp->getMember(i)) {
+ if (Grp->getInsertPos() == I)
+ WideningDecisions[std::make_pair(I, VF)] = std::make_pair(W, Cost);
+ else
+ WideningDecisions[std::make_pair(I, VF)] = std::make_pair(W, 0);
+ }
+ }
+ }
+
+ /// Return the cost model decision for the given instruction \p I and vector
+ /// width \p VF. Return CM_Unknown if this instruction did not pass
+ /// through the cost modeling.
+ InstWidening getWideningDecision(Instruction *I, unsigned VF) {
+ assert(VF >= 2 && "Expected VF >=2");
+ std::pair<Instruction *, unsigned> InstOnVF = std::make_pair(I, VF);
+ auto Itr = WideningDecisions.find(InstOnVF);
+ if (Itr == WideningDecisions.end())
+ return CM_Unknown;
+ return Itr->second.first;
+ }
+
+ /// Return the vectorization cost for the given instruction \p I and vector
+ /// width \p VF.
+ unsigned getWideningCost(Instruction *I, unsigned VF) {
+ assert(VF >= 2 && "Expected VF >=2");
+ std::pair<Instruction *, unsigned> InstOnVF = std::make_pair(I, VF);
+ assert(WideningDecisions.count(InstOnVF) && "The cost is not calculated");
+ return WideningDecisions[InstOnVF].second;
+ }
+
+ /// Return True if instruction \p I is an optimizable truncate whose operand
+ /// is an induction variable. Such a truncate will be removed by adding a new
+ /// induction variable with the destination type.
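+ /// For example, a 'trunc i64 %iv to i32' of an i64 induction variable can be
+ /// replaced by a new i32 induction variable with truncated start and step.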
+ bool isOptimizableIVTruncate(Instruction *I, unsigned VF) {
+
+ // If the instruction is not a truncate, return false.
+ auto *Trunc = dyn_cast<TruncInst>(I);
+ if (!Trunc)
+ return false;
+
+ // Get the source and destination types of the truncate.
+ Type *SrcTy = ToVectorTy(cast<CastInst>(I)->getSrcTy(), VF);
+ Type *DestTy = ToVectorTy(cast<CastInst>(I)->getDestTy(), VF);
+
+ // If the truncate is free for the given types, return false. Replacing a
+ // free truncate with an induction variable would add an induction variable
+ // update instruction to each iteration of the loop. We exclude from this
+ // check the primary induction variable since it will need an update
+ // instruction regardless.
+ Value *Op = Trunc->getOperand(0);
+ if (Op != Legal->getPrimaryInduction() && TTI.isTruncateFree(SrcTy, DestTy))
+ return false;
+
+ // If the truncated value is not an induction variable, return false.
+ return Legal->isInductionVariable(Op);
}
private:
+ /// \return An upper bound for the vectorization factor, larger than zero.
+ /// One is returned if vectorization should be avoided due to cost.
+ unsigned computeFeasibleMaxVF(bool OptForSize);
+
/// The vectorization cost is a combination of the cost itself and a boolean
/// indicating whether any of the contributing operations will actually
/// operate on
@@ -1949,6 +2039,26 @@ private:
/// the vector type as an output parameter.
unsigned getInstructionCost(Instruction *I, unsigned VF, Type *&VectorTy);
+ /// Calculate vectorization cost of memory instruction \p I.
+ unsigned getMemoryInstructionCost(Instruction *I, unsigned VF);
+
+ /// The cost computation for scalarized memory instruction.
+ unsigned getMemInstScalarizationCost(Instruction *I, unsigned VF);
+
+ /// The cost computation for interleaving group of memory instructions.
+ unsigned getInterleaveGroupCost(Instruction *I, unsigned VF);
+
+ /// The cost computation for Gather/Scatter instruction.
+ unsigned getGatherScatterCost(Instruction *I, unsigned VF);
+
+ /// The cost computation for widening instruction \p I with consecutive
+ /// memory access.
+ unsigned getConsecutiveMemOpCost(Instruction *I, unsigned VF);
+
+ /// The cost calculation for Load instruction \p I with uniform pointer -
+ /// scalar load + broadcast.
+ unsigned getUniformMemOpCost(Instruction *I, unsigned VF);
+
/// Returns whether the instruction is a load or store and will be a emitted
/// as a vector operation.
bool isConsecutiveLoadOrStore(Instruction *I);
@@ -1972,12 +2082,24 @@ private:
/// pairs.
typedef DenseMap<Instruction *, unsigned> ScalarCostsTy;
+ /// A set containing all BasicBlocks that are known to be present after
+ /// vectorization as predicated blocks.
+ SmallPtrSet<BasicBlock *, 4> PredicatedBBsAfterVectorization;
+
/// A map holding scalar costs for different vectorization factors. The
/// presence of a cost for an instruction in the mapping indicates that the
/// instruction will be scalarized when vectorizing with the associated
/// vectorization factor. The entries are VF-ScalarCostTy pairs.
DenseMap<unsigned, ScalarCostsTy> InstsToScalarize;
+ /// Holds the instructions known to be uniform after vectorization.
+ /// The data is collected per VF.
+ DenseMap<unsigned, SmallPtrSet<Instruction *, 4>> Uniforms;
+
+ /// Holds the instructions known to be scalar after vectorization.
+ /// The data is collected per VF.
+ DenseMap<unsigned, SmallPtrSet<Instruction *, 4>> Scalars;
+
/// Returns the expected difference in cost from scalarizing the expression
/// feeding a predicated instruction \p PredInst. The instructions to
/// scalarize and their scalar costs are collected in \p ScalarCosts. A
@@ -1990,6 +2112,44 @@ private:
/// the loop.
void collectInstsToScalarize(unsigned VF);
+ /// Collect the instructions that are uniform after vectorization. An
+ /// instruction is uniform if we represent it with a single scalar value in
+ /// the vectorized loop corresponding to each vector iteration. Examples of
+ /// uniform instructions include pointer operands of consecutive or
+ /// interleaved memory accesses. Note that although uniformity implies an
+ /// instruction will be scalar, the reverse is not true. In general, a
+ /// scalarized instruction will be represented by VF scalar values in the
+ /// vectorized loop, each corresponding to an iteration of the original
+ /// scalar loop.
+ void collectLoopUniforms(unsigned VF);
+
+ /// Collect the instructions that are scalar after vectorization. An
+ /// instruction is scalar if it is known to be uniform or will be scalarized
+ /// during vectorization. Non-uniform scalarized instructions will be
+ /// represented by VF values in the vectorized loop, each corresponding to an
+ /// iteration of the original scalar loop.
+ void collectLoopScalars(unsigned VF);
+
+ /// Collect Uniform and Scalar values for the given \p VF.
+ /// The sets depend on CM decision for Load/Store instructions
+ /// that may be vectorized as interleave, gather-scatter or scalarized.
+ void collectUniformsAndScalars(unsigned VF) {
+ // Do the analysis once.
+ if (VF == 1 || Uniforms.count(VF))
+ return;
+ setCostBasedWideningDecision(VF);
+ collectLoopUniforms(VF);
+ collectLoopScalars(VF);
+ }
+
+ /// Keeps the cost model's vectorization decision and cost for each
+ /// instruction. Right now it is used for memory instructions only.
+ typedef DenseMap<std::pair<Instruction *, unsigned>,
+ std::pair<InstWidening, unsigned>>
+ DecisionList;
+
+ DecisionList WideningDecisions;
+
public:
/// The loop that we evaluate.
Loop *TheLoop;
@@ -2019,6 +2179,23 @@ public:
SmallPtrSet<const Value *, 16> VecValuesToIgnore;
};
+/// LoopVectorizationPlanner - drives the vectorization process after having
+/// passed Legality checks.
+class LoopVectorizationPlanner {
+public:
+ LoopVectorizationPlanner(LoopVectorizationCostModel &CM) : CM(CM) {}
+
+ ~LoopVectorizationPlanner() {}
+
+ /// Plan how to best vectorize, return the best VF and its cost.
+ LoopVectorizationCostModel::VectorizationFactor plan(bool OptForSize,
+ unsigned UserVF);
+
+private:
+ /// The profitability analysis.
+ LoopVectorizationCostModel &CM;
+};
+
/// \brief This holds vectorization requirements that must be verified late in
/// the process. The requirements are set by legalize and costmodel. Once
/// vectorization has been determined to be possible and profitable the
@@ -2134,8 +2311,6 @@ struct LoopVectorize : public FunctionPass {
void getAnalysisUsage(AnalysisUsage &AU) const override {
AU.addRequired<AssumptionCacheTracker>();
- AU.addRequiredID(LoopSimplifyID);
- AU.addRequiredID(LCSSAID);
AU.addRequired<BlockFrequencyInfoWrapperPass>();
AU.addRequired<DominatorTreeWrapperPass>();
AU.addRequired<LoopInfoWrapperPass>();
@@ -2156,7 +2331,7 @@ struct LoopVectorize : public FunctionPass {
//===----------------------------------------------------------------------===//
// Implementation of LoopVectorizationLegality, InnerLoopVectorizer and
-// LoopVectorizationCostModel.
+// LoopVectorizationCostModel and LoopVectorizationPlanner.
//===----------------------------------------------------------------------===//
Value *InnerLoopVectorizer::getBroadcastInstrs(Value *V) {
@@ -2176,27 +2351,51 @@ Value *InnerLoopVectorizer::getBroadcastInstrs(Value *V) {
return Shuf;
}
-void InnerLoopVectorizer::createVectorIntInductionPHI(
- const InductionDescriptor &II, Instruction *EntryVal) {
+void InnerLoopVectorizer::createVectorIntOrFpInductionPHI(
+ const InductionDescriptor &II, Value *Step, Instruction *EntryVal) {
Value *Start = II.getStartValue();
- ConstantInt *Step = II.getConstIntStepValue();
- assert(Step && "Can not widen an IV with a non-constant step");
// Construct the initial value of the vector IV in the vector loop preheader
auto CurrIP = Builder.saveIP();
Builder.SetInsertPoint(LoopVectorPreHeader->getTerminator());
if (isa<TruncInst>(EntryVal)) {
+ assert(Start->getType()->isIntegerTy() &&
+ "Truncation requires an integer type");
auto *TruncType = cast<IntegerType>(EntryVal->getType());
- Step = ConstantInt::getSigned(TruncType, Step->getSExtValue());
+ Step = Builder.CreateTrunc(Step, TruncType);
Start = Builder.CreateCast(Instruction::Trunc, Start, TruncType);
}
Value *SplatStart = Builder.CreateVectorSplat(VF, Start);
- Value *SteppedStart = getStepVector(SplatStart, 0, Step);
+ Value *SteppedStart =
+ getStepVector(SplatStart, 0, Step, II.getInductionOpcode());
+
+ // We create vector phi nodes for both integer and floating-point induction
+ // variables. Here, we determine the kind of arithmetic we will perform.
+ Instruction::BinaryOps AddOp;
+ Instruction::BinaryOps MulOp;
+ if (Step->getType()->isIntegerTy()) {
+ AddOp = Instruction::Add;
+ MulOp = Instruction::Mul;
+ } else {
+ AddOp = II.getInductionOpcode();
+ MulOp = Instruction::FMul;
+ }
+
+ // Multiply the vectorization factor by the step using integer or
+ // floating-point arithmetic as appropriate.
+ Value *ConstVF = getSignedIntOrFpConstant(Step->getType(), VF);
+ Value *Mul = addFastMathFlag(Builder.CreateBinOp(MulOp, Step, ConstVF));
+
+ // Create a vector splat to use in the induction update.
+ //
+ // FIXME: If the step is non-constant, we create the vector splat with
+ // IRBuilder. IRBuilder can constant-fold the multiply, but it doesn't
+ // handle a constant vector splat.
+ Value *SplatVF = isa<Constant>(Mul)
+ ? ConstantVector::getSplat(VF, cast<Constant>(Mul))
+ : Builder.CreateVectorSplat(VF, Mul);
Builder.restoreIP(CurrIP);
- Value *SplatVF =
- ConstantVector::getSplat(VF, ConstantInt::getSigned(Start->getType(),
- VF * Step->getSExtValue()));
// We may need to add the step a number of times, depending on the unroll
// factor. The last of those goes into the PHI.
PHINode *VecInd = PHINode::Create(SteppedStart->getType(), 2, "vec.ind",
@@ -2205,8 +2404,8 @@ void InnerLoopVectorizer::createVectorIntInductionPHI(
VectorParts Entry(UF);
for (unsigned Part = 0; Part < UF; ++Part) {
Entry[Part] = LastInduction;
- LastInduction = cast<Instruction>(
- Builder.CreateAdd(LastInduction, SplatVF, "step.add"));
+ LastInduction = cast<Instruction>(addFastMathFlag(
+ Builder.CreateBinOp(AddOp, LastInduction, SplatVF, "step.add")));
}
VectorLoopValueMap.initVector(EntryVal, Entry);
if (isa<TruncInst>(EntryVal))
@@ -2225,7 +2424,7 @@ void InnerLoopVectorizer::createVectorIntInductionPHI(
}
bool InnerLoopVectorizer::shouldScalarizeInstruction(Instruction *I) const {
- return Legal->isScalarAfterVectorization(I) ||
+ return Cost->isScalarAfterVectorization(I, VF) ||
Cost->isProfitableToScalarize(I, VF);
}
@@ -2239,7 +2438,10 @@ bool InnerLoopVectorizer::needsScalarInduction(Instruction *IV) const {
return any_of(IV->users(), isScalarInst);
}
-void InnerLoopVectorizer::widenIntInduction(PHINode *IV, TruncInst *Trunc) {
+void InnerLoopVectorizer::widenIntOrFpInduction(PHINode *IV, TruncInst *Trunc) {
+
+ assert((IV->getType()->isIntegerTy() || IV != OldInduction) &&
+ "Primary induction variable must have an integer type");
auto II = Legal->getInductionVars()->find(IV);
assert(II != Legal->getInductionVars()->end() && "IV is not an induction");
@@ -2251,9 +2453,6 @@ void InnerLoopVectorizer::widenIntInduction(PHINode *IV, TruncInst *Trunc) {
// induction variable.
Value *ScalarIV = nullptr;
- // The step of the induction.
- Value *Step = nullptr;
-
// The value from the original loop to which we are mapping the new induction
// variable.
Instruction *EntryVal = Trunc ? cast<Instruction>(Trunc) : IV;
@@ -2266,45 +2465,49 @@ void InnerLoopVectorizer::widenIntInduction(PHINode *IV, TruncInst *Trunc) {
// least one user in the loop that is not widened.
auto NeedsScalarIV = VF > 1 && needsScalarInduction(EntryVal);
- // If the induction variable has a constant integer step value, go ahead and
- // get it now.
- if (ID.getConstIntStepValue())
- Step = ID.getConstIntStepValue();
+ // Generate code for the induction step. Note that induction steps are
+ // required to be loop-invariant.
+ assert(PSE.getSE()->isLoopInvariant(ID.getStep(), OrigLoop) &&
+ "Induction step should be loop invariant");
+ auto &DL = OrigLoop->getHeader()->getModule()->getDataLayout();
+ Value *Step = nullptr;
+ if (PSE.getSE()->isSCEVable(IV->getType())) {
+ SCEVExpander Exp(*PSE.getSE(), DL, "induction");
+ Step = Exp.expandCodeFor(ID.getStep(), ID.getStep()->getType(),
+ LoopVectorPreHeader->getTerminator());
+ } else {
+ Step = cast<SCEVUnknown>(ID.getStep())->getValue();
+ }
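+
+ // A sketch of the expansion: for an induction described by the SCEV
+ // {%start,+,%n}, ID.getStep() is %n, and expandCodeFor emits (or reuses) a
+ // computation of %n ahead of the preheader terminator. Floating-point
+ // induction types are not SCEVable, so their step is already available as a
+ // plain Value (the SCEVUnknown case).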
// Try to create a new independent vector induction variable. If we can't
// create the phi node, we will splat the scalar induction variable in each
// loop iteration.
- if (VF > 1 && IV->getType() == Induction->getType() && Step &&
- !shouldScalarizeInstruction(EntryVal)) {
- createVectorIntInductionPHI(ID, EntryVal);
+ if (VF > 1 && !shouldScalarizeInstruction(EntryVal)) {
+ createVectorIntOrFpInductionPHI(ID, Step, EntryVal);
VectorizedIV = true;
}
// If we haven't yet vectorized the induction variable, or if we will create
// a scalar one, we need to define the scalar induction variable and step
// values. If we were given a truncation type, truncate the canonical
- // induction variable and constant step. Otherwise, derive these values from
- // the induction descriptor.
+ // induction variable and step. Otherwise, derive these values from the
+ // induction descriptor.
if (!VectorizedIV || NeedsScalarIV) {
+ ScalarIV = Induction;
+ if (IV != OldInduction) {
+ ScalarIV = IV->getType()->isIntegerTy()
+ ? Builder.CreateSExtOrTrunc(Induction, IV->getType())
+ : Builder.CreateCast(Instruction::SIToFP, Induction,
+ IV->getType());
+ ScalarIV = ID.transform(Builder, ScalarIV, PSE.getSE(), DL);
+ ScalarIV->setName("offset.idx");
+ }
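+ // The transform above yields, e.g. for a float IV with start 1.0 and step
+ // 0.5 (an FAdd induction), offset.idx == 1.0 + sitofp(Induction) * 0.5.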
if (Trunc) {
auto *TruncType = cast<IntegerType>(Trunc->getType());
- assert(Step && "Truncation requires constant integer step");
- auto StepInt = cast<ConstantInt>(Step)->getSExtValue();
- ScalarIV = Builder.CreateCast(Instruction::Trunc, Induction, TruncType);
- Step = ConstantInt::getSigned(TruncType, StepInt);
- } else {
- ScalarIV = Induction;
- auto &DL = OrigLoop->getHeader()->getModule()->getDataLayout();
- if (IV != OldInduction) {
- ScalarIV = Builder.CreateSExtOrTrunc(ScalarIV, IV->getType());
- ScalarIV = ID.transform(Builder, ScalarIV, PSE.getSE(), DL);
- ScalarIV->setName("offset.idx");
- }
- if (!Step) {
- SCEVExpander Exp(*PSE.getSE(), DL, "induction");
- Step = Exp.expandCodeFor(ID.getStep(), ID.getStep()->getType(),
- &*Builder.GetInsertPoint());
- }
+ assert(Step->getType()->isIntegerTy() &&
+ "Truncation requires an integer step");
+ ScalarIV = Builder.CreateTrunc(ScalarIV, TruncType);
+ Step = Builder.CreateTrunc(Step, TruncType);
}
}
@@ -2314,7 +2517,8 @@ void InnerLoopVectorizer::widenIntInduction(PHINode *IV, TruncInst *Trunc) {
Value *Broadcasted = getBroadcastInstrs(ScalarIV);
VectorParts Entry(UF);
for (unsigned Part = 0; Part < UF; ++Part)
- Entry[Part] = getStepVector(Broadcasted, VF * Part, Step);
+ Entry[Part] =
+ getStepVector(Broadcasted, VF * Part, Step, ID.getInductionOpcode());
VectorLoopValueMap.initVector(EntryVal, Entry);
if (Trunc)
addMetadata(Entry, Trunc);
@@ -2327,7 +2531,7 @@ void InnerLoopVectorizer::widenIntInduction(PHINode *IV, TruncInst *Trunc) {
// in the loop in the common case prior to InstCombine. We will be trading
// one vector extract for each scalar step.
if (NeedsScalarIV)
- buildScalarSteps(ScalarIV, Step, EntryVal);
+ buildScalarSteps(ScalarIV, Step, EntryVal, ID);
}
Value *InnerLoopVectorizer::getStepVector(Value *Val, int StartIdx, Value *Step,
@@ -2387,30 +2591,43 @@ Value *InnerLoopVectorizer::getStepVector(Value *Val, int StartIdx, Value *Step,
}
void InnerLoopVectorizer::buildScalarSteps(Value *ScalarIV, Value *Step,
- Value *EntryVal) {
+ Value *EntryVal,
+ const InductionDescriptor &ID) {
// We shouldn't have to build scalar steps if we aren't vectorizing.
assert(VF > 1 && "VF should be greater than one");
// Get the value type and ensure it and the step have the same type.
Type *ScalarIVTy = ScalarIV->getType()->getScalarType();
- assert(ScalarIVTy->isIntegerTy() && ScalarIVTy == Step->getType() &&
- "Val and Step should have the same integer type");
+ assert(ScalarIVTy == Step->getType() &&
+ "Val and Step should have the same type");
+
+ // We build scalar steps for both integer and floating-point induction
+ // variables. Here, we determine the kind of arithmetic we will perform.
+ Instruction::BinaryOps AddOp;
+ Instruction::BinaryOps MulOp;
+ if (ScalarIVTy->isIntegerTy()) {
+ AddOp = Instruction::Add;
+ MulOp = Instruction::Mul;
+ } else {
+ AddOp = ID.getInductionOpcode();
+ MulOp = Instruction::FMul;
+ }
// Determine the number of scalars we need to generate for each unroll
// iteration. If EntryVal is uniform, we only need to generate the first
// lane. Otherwise, we generate all VF values.
unsigned Lanes =
- Legal->isUniformAfterVectorization(cast<Instruction>(EntryVal)) ? 1 : VF;
+ Cost->isUniformAfterVectorization(cast<Instruction>(EntryVal), VF) ? 1 : VF;
// Compute the scalar steps and save the results in VectorLoopValueMap.
ScalarParts Entry(UF);
for (unsigned Part = 0; Part < UF; ++Part) {
Entry[Part].resize(VF);
for (unsigned Lane = 0; Lane < Lanes; ++Lane) {
- auto *StartIdx = ConstantInt::get(ScalarIVTy, VF * Part + Lane);
- auto *Mul = Builder.CreateMul(StartIdx, Step);
- auto *Add = Builder.CreateAdd(ScalarIV, Mul);
+ auto *StartIdx = getSignedIntOrFpConstant(ScalarIVTy, VF * Part + Lane);
+ auto *Mul = addFastMathFlag(Builder.CreateBinOp(MulOp, StartIdx, Step));
+ auto *Add = addFastMathFlag(Builder.CreateBinOp(AddOp, ScalarIV, Mul));
Entry[Part][Lane] = Add;
}
}
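// For example, with an integer IV, Step == 1, VF == 4, and UF == 2, the
// loop above produces ScalarIV + 0 .. ScalarIV + 3 for part 0 and
// ScalarIV + 4 .. ScalarIV + 7 for part 1; if EntryVal is uniform, only
// lane 0 of each part is generated.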
@@ -2469,7 +2686,7 @@ InnerLoopVectorizer::getVectorValue(Value *V) {
// known to be uniform after vectorization, this corresponds to lane zero
// of the last unroll iteration. Otherwise, the last instruction is the one
// we created for the last vector lane of the last unroll iteration.
- unsigned LastLane = Legal->isUniformAfterVectorization(I) ? 0 : VF - 1;
+ unsigned LastLane = Cost->isUniformAfterVectorization(I, VF) ? 0 : VF - 1;
auto *LastInst = cast<Instruction>(getScalarValue(V, UF - 1, LastLane));
// Set the insert point after the last scalarized instruction. This ensures
@@ -2486,7 +2703,7 @@ InnerLoopVectorizer::getVectorValue(Value *V) {
// VectorLoopValueMap, we will only generate the insertelements once.
for (unsigned Part = 0; Part < UF; ++Part) {
Value *VectorValue = nullptr;
- if (Legal->isUniformAfterVectorization(I)) {
+ if (Cost->isUniformAfterVectorization(I, VF)) {
VectorValue = getBroadcastInstrs(getScalarValue(V, Part, 0));
} else {
VectorValue = UndefValue::get(VectorType::get(V->getType(), VF));
@@ -2515,8 +2732,9 @@ Value *InnerLoopVectorizer::getScalarValue(Value *V, unsigned Part,
if (OrigLoop->isLoopInvariant(V))
return V;
- assert(Lane > 0 ? !Legal->isUniformAfterVectorization(cast<Instruction>(V))
- : true && "Uniform values only have lane zero");
+ assert(Lane > 0 ?
+ !Cost->isUniformAfterVectorization(cast<Instruction>(V), VF)
+ : true && "Uniform values only have lane zero");
// If the value from the original loop has not been vectorized, it is
// represented by UF x VF scalar values in the new loop. Return the requested
@@ -2551,102 +2769,6 @@ Value *InnerLoopVectorizer::reverseVector(Value *Vec) {
"reverse");
}
-// Get a mask to interleave \p NumVec vectors into a wide vector.
-// I.e. <0, VF, VF*2, ..., VF*(NumVec-1), 1, VF+1, VF*2+1, ...>
-// E.g. For 2 interleaved vectors, if VF is 4, the mask is:
-// <0, 4, 1, 5, 2, 6, 3, 7>
-static Constant *getInterleavedMask(IRBuilder<> &Builder, unsigned VF,
- unsigned NumVec) {
- SmallVector<Constant *, 16> Mask;
- for (unsigned i = 0; i < VF; i++)
- for (unsigned j = 0; j < NumVec; j++)
- Mask.push_back(Builder.getInt32(j * VF + i));
-
- return ConstantVector::get(Mask);
-}
-
-// Get the strided mask starting from index \p Start.
-// I.e. <Start, Start + Stride, ..., Start + Stride*(VF-1)>
-static Constant *getStridedMask(IRBuilder<> &Builder, unsigned Start,
- unsigned Stride, unsigned VF) {
- SmallVector<Constant *, 16> Mask;
- for (unsigned i = 0; i < VF; i++)
- Mask.push_back(Builder.getInt32(Start + i * Stride));
-
- return ConstantVector::get(Mask);
-}
-
-// Get a mask of two parts: The first part consists of sequential integers
-// starting from 0, The second part consists of UNDEFs.
-// I.e. <0, 1, 2, ..., NumInt - 1, undef, ..., undef>
-static Constant *getSequentialMask(IRBuilder<> &Builder, unsigned NumInt,
- unsigned NumUndef) {
- SmallVector<Constant *, 16> Mask;
- for (unsigned i = 0; i < NumInt; i++)
- Mask.push_back(Builder.getInt32(i));
-
- Constant *Undef = UndefValue::get(Builder.getInt32Ty());
- for (unsigned i = 0; i < NumUndef; i++)
- Mask.push_back(Undef);
-
- return ConstantVector::get(Mask);
-}
-
-// Concatenate two vectors with the same element type. The 2nd vector should
-// not have more elements than the 1st vector. If the 2nd vector has less
-// elements, extend it with UNDEFs.
-static Value *ConcatenateTwoVectors(IRBuilder<> &Builder, Value *V1,
- Value *V2) {
- VectorType *VecTy1 = dyn_cast<VectorType>(V1->getType());
- VectorType *VecTy2 = dyn_cast<VectorType>(V2->getType());
- assert(VecTy1 && VecTy2 &&
- VecTy1->getScalarType() == VecTy2->getScalarType() &&
- "Expect two vectors with the same element type");
-
- unsigned NumElts1 = VecTy1->getNumElements();
- unsigned NumElts2 = VecTy2->getNumElements();
- assert(NumElts1 >= NumElts2 && "Unexpect the first vector has less elements");
-
- if (NumElts1 > NumElts2) {
- // Extend with UNDEFs.
- Constant *ExtMask =
- getSequentialMask(Builder, NumElts2, NumElts1 - NumElts2);
- V2 = Builder.CreateShuffleVector(V2, UndefValue::get(VecTy2), ExtMask);
- }
-
- Constant *Mask = getSequentialMask(Builder, NumElts1 + NumElts2, 0);
- return Builder.CreateShuffleVector(V1, V2, Mask);
-}
-
-// Concatenate vectors in the given list. All vectors have the same type.
-static Value *ConcatenateVectors(IRBuilder<> &Builder,
- ArrayRef<Value *> InputList) {
- unsigned NumVec = InputList.size();
- assert(NumVec > 1 && "Should be at least two vectors");
-
- SmallVector<Value *, 8> ResList;
- ResList.append(InputList.begin(), InputList.end());
- do {
- SmallVector<Value *, 8> TmpList;
- for (unsigned i = 0; i < NumVec - 1; i += 2) {
- Value *V0 = ResList[i], *V1 = ResList[i + 1];
- assert((V0->getType() == V1->getType() || i == NumVec - 2) &&
- "Only the last vector may have a different type");
-
- TmpList.push_back(ConcatenateTwoVectors(Builder, V0, V1));
- }
-
- // Push the last vector if the total number of vectors is odd.
- if (NumVec % 2 != 0)
- TmpList.push_back(ResList[NumVec - 1]);
-
- ResList = TmpList;
- NumVec = ResList.size();
- } while (NumVec > 1);
-
- return ResList[0];
-}
-
// Try to vectorize the interleave group that \p Instr belongs to.
//
// E.g. Translate the following interleaved load group (factor = 3):
@@ -2683,15 +2805,13 @@ void InnerLoopVectorizer::vectorizeInterleaveGroup(Instruction *Instr) {
if (Instr != Group->getInsertPos())
return;
- LoadInst *LI = dyn_cast<LoadInst>(Instr);
- StoreInst *SI = dyn_cast<StoreInst>(Instr);
Value *Ptr = getPointerOperand(Instr);
// Prepare for the vector type of the interleaved load/store.
- Type *ScalarTy = LI ? LI->getType() : SI->getValueOperand()->getType();
+ Type *ScalarTy = getMemInstValueType(Instr);
unsigned InterleaveFactor = Group->getFactor();
Type *VecTy = VectorType::get(ScalarTy, InterleaveFactor * VF);
- Type *PtrTy = VecTy->getPointerTo(Ptr->getType()->getPointerAddressSpace());
+ Type *PtrTy = VecTy->getPointerTo(getMemInstAddressSpace(Instr));
// Prepare for the new pointers.
setDebugLocFromInst(Builder, Ptr);
@@ -2731,7 +2851,7 @@ void InnerLoopVectorizer::vectorizeInterleaveGroup(Instruction *Instr) {
Value *UndefVec = UndefValue::get(VecTy);
// Vectorize the interleaved load group.
- if (LI) {
+ if (isa<LoadInst>(Instr)) {
// For each unroll part, create a wide load for the group.
SmallVector<Value *, 2> NewLoads;
@@ -2752,7 +2872,7 @@ void InnerLoopVectorizer::vectorizeInterleaveGroup(Instruction *Instr) {
continue;
VectorParts Entry(UF);
- Constant *StrideMask = getStridedMask(Builder, I, InterleaveFactor, VF);
+ Constant *StrideMask = createStrideMask(Builder, I, InterleaveFactor, VF);
for (unsigned Part = 0; Part < UF; Part++) {
Value *StridedVec = Builder.CreateShuffleVector(
NewLoads[Part], UndefVec, StrideMask, "strided.vec");
@@ -2796,10 +2916,10 @@ void InnerLoopVectorizer::vectorizeInterleaveGroup(Instruction *Instr) {
}
// Concatenate all vectors into a wide vector.
- Value *WideVec = ConcatenateVectors(Builder, StoredVecs);
+ Value *WideVec = concatenateVectors(Builder, StoredVecs);
// Interleave the elements in the wide vector.
- Constant *IMask = getInterleavedMask(Builder, VF, InterleaveFactor);
+ Constant *IMask = createInterleaveMask(Builder, VF, InterleaveFactor);
Value *IVec = Builder.CreateShuffleVector(WideVec, UndefVec, IMask,
"interleaved.vec");
@@ -2816,103 +2936,44 @@ void InnerLoopVectorizer::vectorizeMemoryInstruction(Instruction *Instr) {
assert((LI || SI) && "Invalid Load/Store instruction");
- // Try to vectorize the interleave group if this access is interleaved.
- if (Legal->isAccessInterleaved(Instr))
+ LoopVectorizationCostModel::InstWidening Decision =
+ Cost->getWideningDecision(Instr, VF);
+ assert(Decision != LoopVectorizationCostModel::CM_Unknown &&
+ "CM decision should be taken at this point");
+ if (Decision == LoopVectorizationCostModel::CM_Interleave)
return vectorizeInterleaveGroup(Instr);
- Type *ScalarDataTy = LI ? LI->getType() : SI->getValueOperand()->getType();
+ Type *ScalarDataTy = getMemInstValueType(Instr);
Type *DataTy = VectorType::get(ScalarDataTy, VF);
Value *Ptr = getPointerOperand(Instr);
- unsigned Alignment = LI ? LI->getAlignment() : SI->getAlignment();
+ unsigned Alignment = getMemInstAlignment(Instr);
// An alignment of 0 means target abi alignment. We need to use the scalar's
// target abi alignment in such a case.
const DataLayout &DL = Instr->getModule()->getDataLayout();
if (!Alignment)
Alignment = DL.getABITypeAlignment(ScalarDataTy);
- unsigned AddressSpace = Ptr->getType()->getPointerAddressSpace();
+ unsigned AddressSpace = getMemInstAddressSpace(Instr);
// Scalarize the memory instruction if necessary.
- if (Legal->memoryInstructionMustBeScalarized(Instr, VF))
+ if (Decision == LoopVectorizationCostModel::CM_Scalarize)
return scalarizeInstruction(Instr, Legal->isScalarWithPredication(Instr));
// Determine if the pointer operand of the access is either consecutive or
// reverse consecutive.
int ConsecutiveStride = Legal->isConsecutivePtr(Ptr);
bool Reverse = ConsecutiveStride < 0;
-
- // Determine if either a gather or scatter operation is legal.
bool CreateGatherScatter =
- !ConsecutiveStride && Legal->isLegalGatherOrScatter(Instr);
+ (Decision == LoopVectorizationCostModel::CM_GatherScatter);
VectorParts VectorGep;
// Handle consecutive loads/stores.
- GetElementPtrInst *Gep = getGEPInstruction(Ptr);
if (ConsecutiveStride) {
- if (Gep) {
- unsigned NumOperands = Gep->getNumOperands();
-#ifndef NDEBUG
- // The original GEP that identified as a consecutive memory access
- // should have only one loop-variant operand.
- unsigned NumOfLoopVariantOps = 0;
- for (unsigned i = 0; i < NumOperands; ++i)
- if (!PSE.getSE()->isLoopInvariant(PSE.getSCEV(Gep->getOperand(i)),
- OrigLoop))
- NumOfLoopVariantOps++;
- assert(NumOfLoopVariantOps == 1 &&
- "Consecutive GEP should have only one loop-variant operand");
-#endif
- GetElementPtrInst *Gep2 = cast<GetElementPtrInst>(Gep->clone());
- Gep2->setName("gep.indvar");
-
- // A new GEP is created for a 0-lane value of the first unroll iteration.
- // The GEPs for the rest of the unroll iterations are computed below as an
- // offset from this GEP.
- for (unsigned i = 0; i < NumOperands; ++i)
- // We can apply getScalarValue() for all GEP indices. It returns an
- // original value for loop-invariant operand and 0-lane for consecutive
- // operand.
- Gep2->setOperand(i, getScalarValue(Gep->getOperand(i),
- 0, /* First unroll iteration */
- 0 /* 0-lane of the vector */ ));
- setDebugLocFromInst(Builder, Gep);
- Ptr = Builder.Insert(Gep2);
-
- } else { // No GEP
- setDebugLocFromInst(Builder, Ptr);
- Ptr = getScalarValue(Ptr, 0, 0);
- }
+ Ptr = getScalarValue(Ptr, 0, 0);
} else {
// At this point we should have a vector version of the GEP for Gather or Scatter.
assert(CreateGatherScatter && "The instruction should be scalarized");
- if (Gep) {
- // Vectorizing GEP, across UF parts. We want to get a vector value for base
- // and each index that's defined inside the loop, even if it is
- // loop-invariant but wasn't hoisted out. Otherwise we want to keep them
- // scalar.
- SmallVector<VectorParts, 4> OpsV;
- for (Value *Op : Gep->operands()) {
- Instruction *SrcInst = dyn_cast<Instruction>(Op);
- if (SrcInst && OrigLoop->contains(SrcInst))
- OpsV.push_back(getVectorValue(Op));
- else
- OpsV.push_back(VectorParts(UF, Op));
- }
- for (unsigned Part = 0; Part < UF; ++Part) {
- SmallVector<Value *, 4> Ops;
- Value *GEPBasePtr = OpsV[0][Part];
- for (unsigned i = 1; i < Gep->getNumOperands(); i++)
- Ops.push_back(OpsV[i][Part]);
- Value *NewGep = Builder.CreateGEP(GEPBasePtr, Ops, "VectorGep");
- cast<GetElementPtrInst>(NewGep)->setIsInBounds(Gep->isInBounds());
- assert(NewGep->getType()->isVectorTy() && "Expected vector GEP");
-
- NewGep =
- Builder.CreateBitCast(NewGep, VectorType::get(Ptr->getType(), VF));
- VectorGep.push_back(NewGep);
- }
- } else
- VectorGep = getVectorValue(Ptr);
+ VectorGep = getVectorValue(Ptr);
}
VectorParts Mask = createBlockInMask(Instr->getParent());
@@ -3027,7 +3088,7 @@ void InnerLoopVectorizer::scalarizeInstruction(Instruction *Instr,
// Determine the number of scalars we need to generate for each unroll
// iteration. If the instruction is uniform, we only need to generate the
// first lane. Otherwise, we generate all VF values.
- unsigned Lanes = Legal->isUniformAfterVectorization(Instr) ? 1 : VF;
+ unsigned Lanes = Cost->isUniformAfterVectorization(Instr, VF) ? 1 : VF;
// For each vector unroll 'part':
for (unsigned Part = 0; Part < UF; ++Part) {
@@ -3038,7 +3099,9 @@ void InnerLoopVectorizer::scalarizeInstruction(Instruction *Instr,
// Start if-block.
Value *Cmp = nullptr;
if (IfPredicateInstr) {
- Cmp = Builder.CreateExtractElement(Cond[Part], Builder.getInt32(Lane));
+ Cmp = Cond[Part];
+ if (Cmp->getType()->isVectorTy())
+ Cmp = Builder.CreateExtractElement(Cmp, Builder.getInt32(Lane));
Cmp = Builder.CreateICmp(ICmpInst::ICMP_EQ, Cmp,
ConstantInt::get(Cmp->getType(), 1));
}
@@ -3346,7 +3409,7 @@ void InnerLoopVectorizer::createEmptyLoop() {
// - counts from zero, stepping by one
// - is the size of the widest induction variable type
// then we create a new one.
- OldInduction = Legal->getInduction();
+ OldInduction = Legal->getPrimaryInduction();
Type *IdxTy = Legal->getWidestInductionType();
// Split the single block loop into the two loop structure described above.
@@ -3543,7 +3606,7 @@ void InnerLoopVectorizer::fixupIVUsers(PHINode *OrigPhi,
namespace {
struct CSEDenseMapInfo {
- static bool canHandle(Instruction *I) {
+ static bool canHandle(const Instruction *I) {
return isa<InsertElementInst>(I) || isa<ExtractElementInst>(I) ||
isa<ShuffleVectorInst>(I) || isa<GetElementPtrInst>(I);
}
@@ -3553,12 +3616,12 @@ struct CSEDenseMapInfo {
static inline Instruction *getTombstoneKey() {
return DenseMapInfo<Instruction *>::getTombstoneKey();
}
- static unsigned getHashValue(Instruction *I) {
+ static unsigned getHashValue(const Instruction *I) {
assert(canHandle(I) && "Unknown instruction!");
return hash_combine(I->getOpcode(), hash_combine_range(I->value_op_begin(),
I->value_op_end()));
}
- static bool isEqual(Instruction *LHS, Instruction *RHS) {
+ static bool isEqual(const Instruction *LHS, const Instruction *RHS) {
if (LHS == getEmptyKey() || RHS == getEmptyKey() ||
LHS == getTombstoneKey() || RHS == getTombstoneKey())
return LHS == RHS;
@@ -3589,51 +3652,6 @@ static void cse(BasicBlock *BB) {
}
}
-/// \brief Adds a 'fast' flag to floating point operations.
-static Value *addFastMathFlag(Value *V) {
- if (isa<FPMathOperator>(V)) {
- FastMathFlags Flags;
- Flags.setUnsafeAlgebra();
- cast<Instruction>(V)->setFastMathFlags(Flags);
- }
- return V;
-}
-
-/// \brief Estimate the overhead of scalarizing a value based on its type.
-/// Insert and Extract are set if the result needs to be inserted and/or
-/// extracted from vectors.
-static unsigned getScalarizationOverhead(Type *Ty, bool Insert, bool Extract,
- const TargetTransformInfo &TTI) {
- if (Ty->isVoidTy())
- return 0;
-
- assert(Ty->isVectorTy() && "Can only scalarize vectors");
- unsigned Cost = 0;
-
- for (unsigned I = 0, E = Ty->getVectorNumElements(); I < E; ++I) {
- if (Extract)
- Cost += TTI.getVectorInstrCost(Instruction::ExtractElement, Ty, I);
- if (Insert)
- Cost += TTI.getVectorInstrCost(Instruction::InsertElement, Ty, I);
- }
-
- return Cost;
-}
-
-/// \brief Estimate the overhead of scalarizing an Instruction based on the
-/// types of its operands and return value.
-static unsigned getScalarizationOverhead(SmallVectorImpl<Type *> &OpTys,
- Type *RetTy,
- const TargetTransformInfo &TTI) {
- unsigned ScalarizationCost =
- getScalarizationOverhead(RetTy, true, false, TTI);
-
- for (Type *Ty : OpTys)
- ScalarizationCost += getScalarizationOverhead(Ty, false, true, TTI);
-
- return ScalarizationCost;
-}
-
/// \brief Estimate the overhead of scalarizing an instruction. This is a
/// convenience wrapper for the type-based getScalarizationOverhead API.
static unsigned getScalarizationOverhead(Instruction *I, unsigned VF,
@@ -3641,14 +3659,24 @@ static unsigned getScalarizationOverhead(Instruction *I, unsigned VF,
if (VF == 1)
return 0;
+ unsigned Cost = 0;
Type *RetTy = ToVectorTy(I->getType(), VF);
+ if (!RetTy->isVoidTy() &&
+ (!isa<LoadInst>(I) ||
+ !TTI.supportsEfficientVectorElementLoadStore()))
+ Cost += TTI.getScalarizationOverhead(RetTy, true, false);
- SmallVector<Type *, 4> OpTys;
- unsigned OperandsNum = I->getNumOperands();
- for (unsigned OpInd = 0; OpInd < OperandsNum; ++OpInd)
- OpTys.push_back(ToVectorTy(I->getOperand(OpInd)->getType(), VF));
+ if (CallInst *CI = dyn_cast<CallInst>(I)) {
+ SmallVector<const Value *, 4> Operands(CI->arg_operands());
+ Cost += TTI.getOperandsScalarizationOverhead(Operands, VF);
+ }
+ else if (!isa<StoreInst>(I) ||
+ !TTI.supportsEfficientVectorElementLoadStore()) {
+ SmallVector<const Value *, 4> Operands(I->operand_values());
+ Cost += TTI.getOperandsScalarizationOverhead(Operands, VF);
+ }
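+
+ // For example, scalarizing a <4 x i32> add pays for inserting the four
+ // results and extracting four elements per operand; a scalarized load may
+ // skip the insert cost and a scalarized store the extract cost when the
+ // target supports efficient vector element loads and stores.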
- return getScalarizationOverhead(OpTys, RetTy, TTI);
+ return Cost;
}
// Estimate cost of a call instruction CI if it were vectorized with factor VF.
@@ -3681,7 +3709,7 @@ static unsigned getVectorCallCost(CallInst *CI, unsigned VF,
// Compute costs of unpacking argument values for the scalar calls and
// packing the return values to a vector.
- unsigned ScalarizationCost = getScalarizationOverhead(Tys, RetTy, TTI);
+ unsigned ScalarizationCost = getScalarizationOverhead(CI, VF, TTI);
unsigned Cost = ScalarCallCost * VF + ScalarizationCost;
@@ -3709,16 +3737,12 @@ static unsigned getVectorIntrinsicCost(CallInst *CI, unsigned VF,
Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI);
assert(ID && "Expected intrinsic call!");
- Type *RetTy = ToVectorTy(CI->getType(), VF);
- SmallVector<Type *, 4> Tys;
- for (Value *ArgOperand : CI->arg_operands())
- Tys.push_back(ToVectorTy(ArgOperand->getType(), VF));
-
FastMathFlags FMF;
if (auto *FPMO = dyn_cast<FPMathOperator>(CI))
FMF = FPMO->getFastMathFlags();
- return TTI.getIntrinsicInstrCost(ID, RetTy, Tys, FMF);
+ SmallVector<Value *, 4> Operands(CI->arg_operands());
+ return TTI.getIntrinsicInstrCost(ID, CI->getType(), Operands, FMF, VF);
}
static Type *smallestIntegerVectorType(Type *T1, Type *T2) {
@@ -3861,30 +3885,27 @@ void InnerLoopVectorizer::vectorizeLoop() {
// the cost-model.
//
//===------------------------------------------------===//
- Constant *Zero = Builder.getInt32(0);
- // In order to support recurrences we need to be able to vectorize Phi nodes.
- // Phi nodes have cycles, so we need to vectorize them in two stages. First,
- // we create a new vector PHI node with no incoming edges. We use this value
- // when we vectorize all of the instructions that use the PHI. Next, after
- // all of the instructions in the block are complete we add the new incoming
- // edges to the PHI. At this point all of the instructions in the basic block
- // are vectorized, so we can use them to construct the PHI.
- PhiVector PHIsToFix;
-
- // Collect instructions from the original loop that will become trivially
- // dead in the vectorized loop. We don't need to vectorize these
- // instructions.
- collectTriviallyDeadInstructions();
+ // Collect instructions from the original loop that will become trivially dead
+ // in the vectorized loop. We don't need to vectorize these instructions. For
+ // example, original induction update instructions can become dead because we
+ // separately emit induction "steps" when generating code for the new loop.
+ // Similarly, we create a new latch condition when setting up the structure
+ // of the new loop, so the old one can become dead.
+ SmallPtrSet<Instruction *, 4> DeadInstructions;
+ collectTriviallyDeadInstructions(DeadInstructions);
// Scan the loop in a topological order to ensure that defs are vectorized
// before users.
LoopBlocksDFS DFS(OrigLoop);
DFS.perform(LI);
- // Vectorize all of the blocks in the original loop.
+ // Vectorize all instructions in the original loop that will not become
+ // trivially dead when vectorized.
for (BasicBlock *BB : make_range(DFS.beginRPO(), DFS.endRPO()))
- vectorizeBlockInLoop(BB, &PHIsToFix);
+ for (Instruction &I : *BB)
+ if (!DeadInstructions.count(&I))
+ vectorizeInstruction(I);
// Insert truncates and extends for any truncated instructions as hints to
// InstCombine.
@@ -3892,224 +3913,10 @@ void InnerLoopVectorizer::vectorizeLoop() {
truncateToMinimalBitwidths();
// At this point every instruction in the original loop is widened to a
- // vector form. Now we need to fix the recurrences in PHIsToFix. These PHI
+ // vector form. Now we need to fix the recurrences in the loop. These PHI
// nodes are currently empty because we did not want to introduce cycles.
// This is the second stage of vectorizing recurrences.
- for (PHINode *Phi : PHIsToFix) {
- assert(Phi && "Unable to recover vectorized PHI");
-
- // Handle first-order recurrences that need to be fixed.
- if (Legal->isFirstOrderRecurrence(Phi)) {
- fixFirstOrderRecurrence(Phi);
- continue;
- }
-
- // If the phi node is not a first-order recurrence, it must be a reduction.
- // Get it's reduction variable descriptor.
- assert(Legal->isReductionVariable(Phi) &&
- "Unable to find the reduction variable");
- RecurrenceDescriptor RdxDesc = (*Legal->getReductionVars())[Phi];
-
- RecurrenceDescriptor::RecurrenceKind RK = RdxDesc.getRecurrenceKind();
- TrackingVH<Value> ReductionStartValue = RdxDesc.getRecurrenceStartValue();
- Instruction *LoopExitInst = RdxDesc.getLoopExitInstr();
- RecurrenceDescriptor::MinMaxRecurrenceKind MinMaxKind =
- RdxDesc.getMinMaxRecurrenceKind();
- setDebugLocFromInst(Builder, ReductionStartValue);
-
- // We need to generate a reduction vector from the incoming scalar.
- // To do so, we need to generate the 'identity' vector and override
- // one of the elements with the incoming scalar reduction. We need
- // to do it in the vector-loop preheader.
- Builder.SetInsertPoint(LoopBypassBlocks[1]->getTerminator());
-
- // This is the vector-clone of the value that leaves the loop.
- const VectorParts &VectorExit = getVectorValue(LoopExitInst);
- Type *VecTy = VectorExit[0]->getType();
-
- // Find the reduction identity variable. Zero for addition, or, xor,
- // one for multiplication, -1 for And.
- Value *Identity;
- Value *VectorStart;
- if (RK == RecurrenceDescriptor::RK_IntegerMinMax ||
- RK == RecurrenceDescriptor::RK_FloatMinMax) {
- // MinMax reduction have the start value as their identify.
- if (VF == 1) {
- VectorStart = Identity = ReductionStartValue;
- } else {
- VectorStart = Identity =
- Builder.CreateVectorSplat(VF, ReductionStartValue, "minmax.ident");
- }
- } else {
- // Handle other reduction kinds:
- Constant *Iden = RecurrenceDescriptor::getRecurrenceIdentity(
- RK, VecTy->getScalarType());
- if (VF == 1) {
- Identity = Iden;
- // This vector is the Identity vector where the first element is the
- // incoming scalar reduction.
- VectorStart = ReductionStartValue;
- } else {
- Identity = ConstantVector::getSplat(VF, Iden);
-
- // This vector is the Identity vector where the first element is the
- // incoming scalar reduction.
- VectorStart =
- Builder.CreateInsertElement(Identity, ReductionStartValue, Zero);
- }
- }
-
- // Fix the vector-loop phi.
-
- // Reductions do not have to start at zero. They can start with
- // any loop invariant values.
- const VectorParts &VecRdxPhi = getVectorValue(Phi);
- BasicBlock *Latch = OrigLoop->getLoopLatch();
- Value *LoopVal = Phi->getIncomingValueForBlock(Latch);
- const VectorParts &Val = getVectorValue(LoopVal);
- for (unsigned part = 0; part < UF; ++part) {
- // Make sure to add the reduction stat value only to the
- // first unroll part.
- Value *StartVal = (part == 0) ? VectorStart : Identity;
- cast<PHINode>(VecRdxPhi[part])
- ->addIncoming(StartVal, LoopVectorPreHeader);
- cast<PHINode>(VecRdxPhi[part])
- ->addIncoming(Val[part], LoopVectorBody);
- }
-
- // Before each round, move the insertion point right between
- // the PHIs and the values we are going to write.
- // This allows us to write both PHINodes and the extractelement
- // instructions.
- Builder.SetInsertPoint(&*LoopMiddleBlock->getFirstInsertionPt());
-
- VectorParts &RdxParts = VectorLoopValueMap.getVector(LoopExitInst);
- setDebugLocFromInst(Builder, LoopExitInst);
-
- // If the vector reduction can be performed in a smaller type, we truncate
- // then extend the loop exit value to enable InstCombine to evaluate the
- // entire expression in the smaller type.
- if (VF > 1 && Phi->getType() != RdxDesc.getRecurrenceType()) {
- Type *RdxVecTy = VectorType::get(RdxDesc.getRecurrenceType(), VF);
- Builder.SetInsertPoint(LoopVectorBody->getTerminator());
- for (unsigned part = 0; part < UF; ++part) {
- Value *Trunc = Builder.CreateTrunc(RdxParts[part], RdxVecTy);
- Value *Extnd = RdxDesc.isSigned() ? Builder.CreateSExt(Trunc, VecTy)
- : Builder.CreateZExt(Trunc, VecTy);
- for (Value::user_iterator UI = RdxParts[part]->user_begin();
- UI != RdxParts[part]->user_end();)
- if (*UI != Trunc) {
- (*UI++)->replaceUsesOfWith(RdxParts[part], Extnd);
- RdxParts[part] = Extnd;
- } else {
- ++UI;
- }
- }
- Builder.SetInsertPoint(&*LoopMiddleBlock->getFirstInsertionPt());
- for (unsigned part = 0; part < UF; ++part)
- RdxParts[part] = Builder.CreateTrunc(RdxParts[part], RdxVecTy);
- }
-
- // Reduce all of the unrolled parts into a single vector.
- Value *ReducedPartRdx = RdxParts[0];
- unsigned Op = RecurrenceDescriptor::getRecurrenceBinOp(RK);
- setDebugLocFromInst(Builder, ReducedPartRdx);
- for (unsigned part = 1; part < UF; ++part) {
- if (Op != Instruction::ICmp && Op != Instruction::FCmp)
- // Floating point operations had to be 'fast' to enable the reduction.
- ReducedPartRdx = addFastMathFlag(
- Builder.CreateBinOp((Instruction::BinaryOps)Op, RdxParts[part],
- ReducedPartRdx, "bin.rdx"));
- else
- ReducedPartRdx = RecurrenceDescriptor::createMinMaxOp(
- Builder, MinMaxKind, ReducedPartRdx, RdxParts[part]);
- }
-
- if (VF > 1) {
- // VF is a power of 2 so we can emit the reduction using log2(VF) shuffles
- // and vector ops, reducing the set of values being computed by half each
- // round.
- assert(isPowerOf2_32(VF) &&
- "Reduction emission only supported for pow2 vectors!");
- Value *TmpVec = ReducedPartRdx;
- SmallVector<Constant *, 32> ShuffleMask(VF, nullptr);
- for (unsigned i = VF; i != 1; i >>= 1) {
- // Move the upper half of the vector to the lower half.
- for (unsigned j = 0; j != i / 2; ++j)
- ShuffleMask[j] = Builder.getInt32(i / 2 + j);
-
- // Fill the rest of the mask with undef.
- std::fill(&ShuffleMask[i / 2], ShuffleMask.end(),
- UndefValue::get(Builder.getInt32Ty()));
-
- Value *Shuf = Builder.CreateShuffleVector(
- TmpVec, UndefValue::get(TmpVec->getType()),
- ConstantVector::get(ShuffleMask), "rdx.shuf");
-
- if (Op != Instruction::ICmp && Op != Instruction::FCmp)
- // Floating point operations had to be 'fast' to enable the reduction.
- TmpVec = addFastMathFlag(Builder.CreateBinOp(
- (Instruction::BinaryOps)Op, TmpVec, Shuf, "bin.rdx"));
- else
- TmpVec = RecurrenceDescriptor::createMinMaxOp(Builder, MinMaxKind,
- TmpVec, Shuf);
- }
-
- // The result is in the first element of the vector.
- ReducedPartRdx =
- Builder.CreateExtractElement(TmpVec, Builder.getInt32(0));
-
- // If the reduction can be performed in a smaller type, we need to extend
- // the reduction to the wider type before we branch to the original loop.
- if (Phi->getType() != RdxDesc.getRecurrenceType())
- ReducedPartRdx =
- RdxDesc.isSigned()
- ? Builder.CreateSExt(ReducedPartRdx, Phi->getType())
- : Builder.CreateZExt(ReducedPartRdx, Phi->getType());
- }
-
- // Create a phi node that merges control-flow from the backedge-taken check
- // block and the middle block.
- PHINode *BCBlockPhi = PHINode::Create(Phi->getType(), 2, "bc.merge.rdx",
- LoopScalarPreHeader->getTerminator());
- for (unsigned I = 0, E = LoopBypassBlocks.size(); I != E; ++I)
- BCBlockPhi->addIncoming(ReductionStartValue, LoopBypassBlocks[I]);
- BCBlockPhi->addIncoming(ReducedPartRdx, LoopMiddleBlock);
-
- // Now, we need to fix the users of the reduction variable
- // inside and outside of the scalar remainder loop.
- // We know that the loop is in LCSSA form. We need to update the
- // PHI nodes in the exit blocks.
- for (BasicBlock::iterator LEI = LoopExitBlock->begin(),
- LEE = LoopExitBlock->end();
- LEI != LEE; ++LEI) {
- PHINode *LCSSAPhi = dyn_cast<PHINode>(LEI);
- if (!LCSSAPhi)
- break;
-
- // All PHINodes need to have a single entry edge, or two if
- // we already fixed them.
- assert(LCSSAPhi->getNumIncomingValues() < 3 && "Invalid LCSSA PHI");
-
- // We found our reduction value exit-PHI. Update it with the
- // incoming bypass edge.
- if (LCSSAPhi->getIncomingValue(0) == LoopExitInst) {
- // Add an edge coming from the bypass.
- LCSSAPhi->addIncoming(ReducedPartRdx, LoopMiddleBlock);
- break;
- }
- } // end of the LCSSA phi scan.
-
- // Fix the scalar loop reduction variable with the incoming reduction sum
- // from the vector body and from the backedge value.
- int IncomingEdgeBlockIdx =
- Phi->getBasicBlockIndex(OrigLoop->getLoopLatch());
- assert(IncomingEdgeBlockIdx >= 0 && "Invalid block index");
- // Pick the other block.
- int SelfEdgeBlockIdx = (IncomingEdgeBlockIdx ? 0 : 1);
- Phi->setIncomingValue(SelfEdgeBlockIdx, BCBlockPhi);
- Phi->setIncomingValue(IncomingEdgeBlockIdx, LoopExitInst);
- } // end of for each Phi in PHIsToFix.
+ fixCrossIterationPHIs();
// Update the dominator tree.
//
@@ -4134,6 +3941,25 @@ void InnerLoopVectorizer::vectorizeLoop() {
cse(LoopVectorBody);
}
+void InnerLoopVectorizer::fixCrossIterationPHIs() {
+ // In order to support recurrences we need to be able to vectorize Phi nodes.
+ // Phi nodes have cycles, so we need to vectorize them in two stages. This is
+ // stage #2: We now need to fix the recurrences by adding incoming edges to
+ // the currently empty PHI nodes. At this point every instruction in the
+ // original loop is widened to a vector form so we can use them to construct
+ // the incoming edges.
+ for (Instruction &I : *OrigLoop->getHeader()) {
+ PHINode *Phi = dyn_cast<PHINode>(&I);
+ if (!Phi)
+ break;
+ // Handle first-order recurrences and reductions that need to be fixed.
+ if (Legal->isFirstOrderRecurrence(Phi))
+ fixFirstOrderRecurrence(Phi);
+ else if (Legal->isReductionVariable(Phi))
+ fixReduction(Phi);
+ }
+}
+
void InnerLoopVectorizer::fixFirstOrderRecurrence(PHINode *Phi) {
// This is the second phase of vectorizing first-order recurrences. An
@@ -4212,15 +4038,17 @@ void InnerLoopVectorizer::fixFirstOrderRecurrence(PHINode *Phi) {
auto *VecPhi = Builder.CreatePHI(VectorInit->getType(), 2, "vector.recur");
VecPhi->addIncoming(VectorInit, LoopVectorPreHeader);
- // Get the vectorized previous value. We ensured the previous values was an
- // instruction when detecting the recurrence.
+ // Get the vectorized previous value.
auto &PreviousParts = getVectorValue(Previous);
- // Set the insertion point to be after this instruction. We ensured the
- // previous value dominated all uses of the phi when detecting the
- // recurrence.
- Builder.SetInsertPoint(
- &*++BasicBlock::iterator(cast<Instruction>(PreviousParts[UF - 1])));
+ // Set the insertion point after the previous value if it is an instruction.
+ // Note that the previous value may have been constant-folded so it is not
+ // guaranteed to be an instruction in the vector loop.
+ if (LI->getLoopFor(LoopVectorBody)->isLoopInvariant(PreviousParts[UF - 1]))
+ Builder.SetInsertPoint(&*LoopVectorBody->getFirstInsertionPt());
+ else
+ Builder.SetInsertPoint(
+ &*++BasicBlock::iterator(cast<Instruction>(PreviousParts[UF - 1])));
// We will construct a vector for the recurrence by combining the values for
// the current and previous iterations. This is the required shuffle mask.
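// E.g. for VF 4 the mask is <3, 4, 5, 6>: lane 0 takes the last element of
// the vector from the previous iteration, and the remaining lanes take the
// first three elements of the current iteration's vector.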
@@ -4251,18 +4079,33 @@ void InnerLoopVectorizer::fixFirstOrderRecurrence(PHINode *Phi) {
// Extract the last vector element in the middle block. This will be the
// initial value for the recurrence when jumping to the scalar loop.
- auto *Extract = Incoming;
+ auto *ExtractForScalar = Incoming;
if (VF > 1) {
Builder.SetInsertPoint(LoopMiddleBlock->getTerminator());
- Extract = Builder.CreateExtractElement(Extract, Builder.getInt32(VF - 1),
- "vector.recur.extract");
- }
+ ExtractForScalar = Builder.CreateExtractElement(
+ ExtractForScalar, Builder.getInt32(VF - 1), "vector.recur.extract");
+ }
+ // Extract the second-to-last element in the middle block if the
+ // Phi is used outside the loop. We need to extract the phi itself
+ // and not the last element (the phi update in the current iteration). This
+ // will be the value when jumping to the exit block from the LoopMiddleBlock,
+ // when the scalar loop is not run at all.
+ Value *ExtractForPhiUsedOutsideLoop = nullptr;
+ if (VF > 1)
+ ExtractForPhiUsedOutsideLoop = Builder.CreateExtractElement(
+ Incoming, Builder.getInt32(VF - 2), "vector.recur.extract.for.phi");
+ // When the loop is unrolled without vectorizing, initialize
+ // ExtractForPhiUsedOutsideLoop with the value just prior to the unrolled
+ // value of `Incoming`. This is analogous to the vectorized case above:
+ // extracting the second-to-last element when VF > 1.
+ else if (UF > 1)
+ ExtractForPhiUsedOutsideLoop = PreviousParts[UF - 2];
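+ // E.g. with VF 4, ExtractForPhiUsedOutsideLoop is lane 2 (VF - 2) of
+ // `Incoming`; with VF 1 and UF 2, it is the value of the first unrolled
+ // part.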
// Fix the initial value of the original recurrence in the scalar loop.
Builder.SetInsertPoint(&*LoopScalarPreHeader->begin());
auto *Start = Builder.CreatePHI(Phi->getType(), 2, "scalar.recur.init");
for (auto *BB : predecessors(LoopScalarPreHeader)) {
- auto *Incoming = BB == LoopMiddleBlock ? Extract : ScalarInit;
+ auto *Incoming = BB == LoopMiddleBlock ? ExtractForScalar : ScalarInit;
Start->addIncoming(Incoming, BB);
}
@@ -4279,12 +4122,218 @@ void InnerLoopVectorizer::fixFirstOrderRecurrence(PHINode *Phi) {
if (!LCSSAPhi)
break;
if (LCSSAPhi->getIncomingValue(0) == Phi) {
- LCSSAPhi->addIncoming(Extract, LoopMiddleBlock);
+ LCSSAPhi->addIncoming(ExtractForPhiUsedOutsideLoop, LoopMiddleBlock);
break;
}
}
}
+void InnerLoopVectorizer::fixReduction(PHINode *Phi) {
+ Constant *Zero = Builder.getInt32(0);
+
+ // Get its reduction variable descriptor.
+ assert(Legal->isReductionVariable(Phi) &&
+ "Unable to find the reduction variable");
+ RecurrenceDescriptor RdxDesc = (*Legal->getReductionVars())[Phi];
+
+ RecurrenceDescriptor::RecurrenceKind RK = RdxDesc.getRecurrenceKind();
+ TrackingVH<Value> ReductionStartValue = RdxDesc.getRecurrenceStartValue();
+ Instruction *LoopExitInst = RdxDesc.getLoopExitInstr();
+ RecurrenceDescriptor::MinMaxRecurrenceKind MinMaxKind =
+ RdxDesc.getMinMaxRecurrenceKind();
+ setDebugLocFromInst(Builder, ReductionStartValue);
+
+ // We need to generate a reduction vector from the incoming scalar.
+ // To do so, we need to generate the 'identity' vector and override
+ // one of the elements with the incoming scalar reduction. We need
+ // to do it in the vector-loop preheader.
+ Builder.SetInsertPoint(LoopBypassBlocks[1]->getTerminator());
+
+ // This is the vector-clone of the value that leaves the loop.
+ const VectorParts &VectorExit = getVectorValue(LoopExitInst);
+ Type *VecTy = VectorExit[0]->getType();
+
+ // Find the reduction identity variable: zero for addition, Or, and Xor;
+ // one for multiplication; -1 for And.
+ Value *Identity;
+ Value *VectorStart;
+ if (RK == RecurrenceDescriptor::RK_IntegerMinMax ||
+ RK == RecurrenceDescriptor::RK_FloatMinMax) {
+ // MinMax reductions have the start value as their identity.
+ if (VF == 1) {
+ VectorStart = Identity = ReductionStartValue;
+ } else {
+ VectorStart = Identity =
+ Builder.CreateVectorSplat(VF, ReductionStartValue, "minmax.ident");
+ }
+ } else {
+ // Handle other reduction kinds:
+ Constant *Iden = RecurrenceDescriptor::getRecurrenceIdentity(
+ RK, VecTy->getScalarType());
+ if (VF == 1) {
+ Identity = Iden;
+ // This vector is the Identity vector where the first element is the
+ // incoming scalar reduction.
+ VectorStart = ReductionStartValue;
+ } else {
+ Identity = ConstantVector::getSplat(VF, Iden);
+
+ // This vector is the Identity vector where the first element is the
+ // incoming scalar reduction.
+ VectorStart =
+ Builder.CreateInsertElement(Identity, ReductionStartValue, Zero);
+ }
+ }
+
+ // Fix the vector-loop phi.
+
+ // Reductions do not have to start at zero. They can start with
+ // any loop invariant values.
+ const VectorParts &VecRdxPhi = getVectorValue(Phi);
+ BasicBlock *Latch = OrigLoop->getLoopLatch();
+ Value *LoopVal = Phi->getIncomingValueForBlock(Latch);
+ const VectorParts &Val = getVectorValue(LoopVal);
+ for (unsigned part = 0; part < UF; ++part) {
+ // Make sure to add the reduction start value only to the
+ // first unroll part.
+ Value *StartVal = (part == 0) ? VectorStart : Identity;
+ cast<PHINode>(VecRdxPhi[part])
+ ->addIncoming(StartVal, LoopVectorPreHeader);
+ cast<PHINode>(VecRdxPhi[part])
+ ->addIncoming(Val[part], LI->getLoopFor(LoopVectorBody)->getLoopLatch());
+ }
+
+ // Before each round, move the insertion point right between
+ // the PHIs and the values we are going to write.
+ // This allows us to write both PHINodes and the extractelement
+ // instructions.
+ Builder.SetInsertPoint(&*LoopMiddleBlock->getFirstInsertionPt());
+
+ VectorParts &RdxParts = VectorLoopValueMap.getVector(LoopExitInst);
+ setDebugLocFromInst(Builder, LoopExitInst);
+
+ // If the vector reduction can be performed in a smaller type, we truncate
+ // then extend the loop exit value to enable InstCombine to evaluate the
+ // entire expression in the smaller type.
+ if (VF > 1 && Phi->getType() != RdxDesc.getRecurrenceType()) {
+ Type *RdxVecTy = VectorType::get(RdxDesc.getRecurrenceType(), VF);
+ Builder.SetInsertPoint(LoopVectorBody->getTerminator());
+ for (unsigned part = 0; part < UF; ++part) {
+ Value *Trunc = Builder.CreateTrunc(RdxParts[part], RdxVecTy);
+ Value *Extnd = RdxDesc.isSigned() ? Builder.CreateSExt(Trunc, VecTy)
+ : Builder.CreateZExt(Trunc, VecTy);
+ for (Value::user_iterator UI = RdxParts[part]->user_begin();
+ UI != RdxParts[part]->user_end();)
+ if (*UI != Trunc) {
+ (*UI++)->replaceUsesOfWith(RdxParts[part], Extnd);
+ RdxParts[part] = Extnd;
+ } else {
+ ++UI;
+ }
+ }
+ Builder.SetInsertPoint(&*LoopMiddleBlock->getFirstInsertionPt());
+ for (unsigned part = 0; part < UF; ++part)
+ RdxParts[part] = Builder.CreateTrunc(RdxParts[part], RdxVecTy);
+ }
+
+ // Reduce all of the unrolled parts into a single vector.
+ Value *ReducedPartRdx = RdxParts[0];
+ unsigned Op = RecurrenceDescriptor::getRecurrenceBinOp(RK);
+ setDebugLocFromInst(Builder, ReducedPartRdx);
+ for (unsigned part = 1; part < UF; ++part) {
+ if (Op != Instruction::ICmp && Op != Instruction::FCmp)
+ // Floating point operations had to be 'fast' to enable the reduction.
+ ReducedPartRdx = addFastMathFlag(
+ Builder.CreateBinOp((Instruction::BinaryOps)Op, RdxParts[part],
+ ReducedPartRdx, "bin.rdx"));
+ else
+ ReducedPartRdx = RecurrenceDescriptor::createMinMaxOp(
+ Builder, MinMaxKind, ReducedPartRdx, RdxParts[part]);
+ }
+
+ if (VF > 1) {
+ // VF is a power of 2 so we can emit the reduction using log2(VF) shuffles
+ // and vector ops, reducing the set of values being computed by half each
+ // round.
+ assert(isPowerOf2_32(VF) &&
+ "Reduction emission only supported for pow2 vectors!");
+ Value *TmpVec = ReducedPartRdx;
+ SmallVector<Constant *, 32> ShuffleMask(VF, nullptr);
+ for (unsigned i = VF; i != 1; i >>= 1) {
+ // Move the upper half of the vector to the lower half.
+ for (unsigned j = 0; j != i / 2; ++j)
+ ShuffleMask[j] = Builder.getInt32(i / 2 + j);
+
+ // Fill the rest of the mask with undef.
+ std::fill(&ShuffleMask[i / 2], ShuffleMask.end(),
+ UndefValue::get(Builder.getInt32Ty()));
+
+ Value *Shuf = Builder.CreateShuffleVector(
+ TmpVec, UndefValue::get(TmpVec->getType()),
+ ConstantVector::get(ShuffleMask), "rdx.shuf");
+
+ if (Op != Instruction::ICmp && Op != Instruction::FCmp)
+ // Floating point operations had to be 'fast' to enable the reduction.
+ TmpVec = addFastMathFlag(Builder.CreateBinOp(
+ (Instruction::BinaryOps)Op, TmpVec, Shuf, "bin.rdx"));
+ else
+ TmpVec = RecurrenceDescriptor::createMinMaxOp(Builder, MinMaxKind,
+ TmpVec, Shuf);
+ }
+
+ // The result is in the first element of the vector.
+ ReducedPartRdx =
+ Builder.CreateExtractElement(TmpVec, Builder.getInt32(0));
+
+ // If the reduction can be performed in a smaller type, we need to extend
+ // the reduction to the wider type before we branch to the original loop.
+ if (Phi->getType() != RdxDesc.getRecurrenceType())
+ ReducedPartRdx =
+ RdxDesc.isSigned()
+ ? Builder.CreateSExt(ReducedPartRdx, Phi->getType())
+ : Builder.CreateZExt(ReducedPartRdx, Phi->getType());
+ }
+
+ // Create a phi node that merges control-flow from the backedge-taken check
+ // block and the middle block.
+ PHINode *BCBlockPhi = PHINode::Create(Phi->getType(), 2, "bc.merge.rdx",
+ LoopScalarPreHeader->getTerminator());
+ for (unsigned I = 0, E = LoopBypassBlocks.size(); I != E; ++I)
+ BCBlockPhi->addIncoming(ReductionStartValue, LoopBypassBlocks[I]);
+ BCBlockPhi->addIncoming(ReducedPartRdx, LoopMiddleBlock);
+
+ // Now, we need to fix the users of the reduction variable
+ // inside and outside of the scalar remainder loop.
+ // We know that the loop is in LCSSA form. We need to update the
+ // PHI nodes in the exit blocks.
+ for (BasicBlock::iterator LEI = LoopExitBlock->begin(),
+ LEE = LoopExitBlock->end();
+ LEI != LEE; ++LEI) {
+ PHINode *LCSSAPhi = dyn_cast<PHINode>(LEI);
+ if (!LCSSAPhi)
+ break;
+
+ // All PHINodes need to have a single entry edge, or two if
+ // we already fixed them.
+ assert(LCSSAPhi->getNumIncomingValues() < 3 && "Invalid LCSSA PHI");
+
+ // We found a reduction value exit-PHI. Update it with the
+ // incoming bypass edge.
+ if (LCSSAPhi->getIncomingValue(0) == LoopExitInst)
+ LCSSAPhi->addIncoming(ReducedPartRdx, LoopMiddleBlock);
+ } // end of the LCSSA phi scan.
+
+ // Fix the scalar loop reduction variable with the incoming reduction sum
+ // from the vector body and from the backedge value.
+ int IncomingEdgeBlockIdx =
+ Phi->getBasicBlockIndex(OrigLoop->getLoopLatch());
+ assert(IncomingEdgeBlockIdx >= 0 && "Invalid block index");
+ // Pick the other block.
+ int SelfEdgeBlockIdx = (IncomingEdgeBlockIdx ? 0 : 1);
+ Phi->setIncomingValue(SelfEdgeBlockIdx, BCBlockPhi);
+ Phi->setIncomingValue(IncomingEdgeBlockIdx, LoopExitInst);
+}
+
void InnerLoopVectorizer::fixLCSSAPHIs() {
for (Instruction &LEI : *LoopExitBlock) {
auto *LCSSAPhi = dyn_cast<PHINode>(&LEI);
@@ -4296,7 +4345,8 @@ void InnerLoopVectorizer::fixLCSSAPHIs() {
}
}
-void InnerLoopVectorizer::collectTriviallyDeadInstructions() {
+void InnerLoopVectorizer::collectTriviallyDeadInstructions(
+ SmallPtrSetImpl<Instruction *> &DeadInstructions) {
BasicBlock *Latch = OrigLoop->getLoopLatch();
// We create new control-flow for the vectorized loop, so the original
@@ -4563,9 +4613,12 @@ InnerLoopVectorizer::createBlockInMask(BasicBlock *BB) {
}
void InnerLoopVectorizer::widenPHIInstruction(Instruction *PN, unsigned UF,
- unsigned VF, PhiVector *PV) {
+ unsigned VF) {
PHINode *P = cast<PHINode>(PN);
- // Handle recurrences.
+ // In order to support recurrences we need to be able to vectorize Phi nodes.
+ // Phi nodes have cycles, so we need to vectorize them in two stages. This is
+ // stage #1: We create a new vector PHI node with no incoming edges. We'll use
+ // this value when we vectorize all of the instructions that use the PHI.
if (Legal->isReductionVariable(P) || Legal->isFirstOrderRecurrence(P)) {
VectorParts Entry(UF);
for (unsigned part = 0; part < UF; ++part) {
@@ -4576,7 +4629,6 @@ void InnerLoopVectorizer::widenPHIInstruction(Instruction *PN, unsigned UF,
VecTy, 2, "vec.phi", &*LoopVectorBody->getFirstInsertionPt());
}
VectorLoopValueMap.initVector(P, Entry);
- PV->push_back(P);
return;
}
@@ -4631,7 +4683,8 @@ void InnerLoopVectorizer::widenPHIInstruction(Instruction *PN, unsigned UF,
case InductionDescriptor::IK_NoInduction:
llvm_unreachable("Unknown induction");
case InductionDescriptor::IK_IntInduction:
- return widenIntInduction(P);
+ case InductionDescriptor::IK_FpInduction:
+ return widenIntOrFpInduction(P);
case InductionDescriptor::IK_PtrInduction: {
// Handle the pointer induction variable case.
assert(P->getType()->isPointerTy() && "Unexpected type.");
@@ -4641,7 +4694,7 @@ void InnerLoopVectorizer::widenPHIInstruction(Instruction *PN, unsigned UF,
// Determine the number of scalars we need to generate for each unroll
// iteration. If the instruction is uniform, we only need to generate the
// first lane. Otherwise, we generate all VF values.
- unsigned Lanes = Legal->isUniformAfterVectorization(P) ? 1 : VF;
+ unsigned Lanes = Cost->isUniformAfterVectorization(P, VF) ? 1 : VF;
// These are the scalar results. Notice that we don't generate vector GEPs
// because scalar GEPs result in better code.
ScalarParts Entry(UF);
@@ -4658,30 +4711,6 @@ void InnerLoopVectorizer::widenPHIInstruction(Instruction *PN, unsigned UF,
VectorLoopValueMap.initScalar(P, Entry);
return;
}
- case InductionDescriptor::IK_FpInduction: {
- assert(P->getType() == II.getStartValue()->getType() &&
- "Types must match");
- // Handle other induction variables that are now based on the
- // canonical one.
- assert(P != OldInduction && "Primary induction can be integer only");
-
- Value *V = Builder.CreateCast(Instruction::SIToFP, Induction, P->getType());
- V = II.transform(Builder, V, PSE.getSE(), DL);
- V->setName("fp.offset.idx");
-
- // Now we have scalar op: %fp.offset.idx = StartVal +/- Induction*StepVal
-
- Value *Broadcasted = getBroadcastInstrs(V);
- // After broadcasting the induction variable we need to make the vector
- // consecutive by adding StepVal*0, StepVal*1, StepVal*2, etc.
- Value *StepVal = cast<SCEVUnknown>(II.getStep())->getValue();
- VectorParts Entry(UF);
- for (unsigned part = 0; part < UF; ++part)
- Entry[part] = getStepVector(Broadcasted, VF * part, StepVal,
- II.getInductionOpcode());
- VectorLoopValueMap.initVector(P, Entry);
- return;
- }
}
}
@@ -4703,269 +4732,323 @@ static bool mayDivideByZero(Instruction &I) {
return !CInt || CInt->isZero();
}
-void InnerLoopVectorizer::vectorizeBlockInLoop(BasicBlock *BB, PhiVector *PV) {
- // For each instruction in the old loop.
- for (Instruction &I : *BB) {
-
- // If the instruction will become trivially dead when vectorized, we don't
- // need to generate it.
- if (DeadInstructions.count(&I))
- continue;
+void InnerLoopVectorizer::vectorizeInstruction(Instruction &I) {
+ // Scalarize instructions that should remain scalar after vectorization.
+ if (VF > 1 &&
+ !(isa<BranchInst>(&I) || isa<PHINode>(&I) || isa<DbgInfoIntrinsic>(&I)) &&
+ shouldScalarizeInstruction(&I)) {
+ scalarizeInstruction(&I, Legal->isScalarWithPredication(&I));
+ return;
+ }
- // Scalarize instructions that should remain scalar after vectorization.
- if (VF > 1 &&
- !(isa<BranchInst>(&I) || isa<PHINode>(&I) ||
- isa<DbgInfoIntrinsic>(&I)) &&
- shouldScalarizeInstruction(&I)) {
- scalarizeInstruction(&I, Legal->isScalarWithPredication(&I));
- continue;
- }
+ switch (I.getOpcode()) {
+ case Instruction::Br:
+ // Nothing to do for PHIs and BR, since we already took care of the
+ // loop control flow instructions.
+ break;
+ case Instruction::PHI: {
+ // Vectorize PHINodes.
+ widenPHIInstruction(&I, UF, VF);
+ break;
+ } // End of PHI.
+ case Instruction::GetElementPtr: {
+ // Construct a vector GEP by widening the operands of the scalar GEP as
+ // necessary. We mark the vector GEP 'inbounds' if appropriate. A GEP
+ // results in a vector of pointers when at least one operand of the GEP
+ // is vector-typed. Thus, to keep the representation compact, we only use
+ // vector-typed operands for loop-varying values.
+ auto *GEP = cast<GetElementPtrInst>(&I);
+ VectorParts Entry(UF);
- switch (I.getOpcode()) {
- case Instruction::Br:
- // Nothing to do for PHIs and BR, since we already took care of the
- // loop control flow instructions.
- continue;
- case Instruction::PHI: {
- // Vectorize PHINodes.
- widenPHIInstruction(&I, UF, VF, PV);
- continue;
- } // End of PHI.
-
- case Instruction::UDiv:
- case Instruction::SDiv:
- case Instruction::SRem:
- case Instruction::URem:
- // Scalarize with predication if this instruction may divide by zero and
- // block execution is conditional, otherwise fallthrough.
- if (Legal->isScalarWithPredication(&I)) {
- scalarizeInstruction(&I, true);
- continue;
- }
- case Instruction::Add:
- case Instruction::FAdd:
- case Instruction::Sub:
- case Instruction::FSub:
- case Instruction::Mul:
- case Instruction::FMul:
- case Instruction::FDiv:
- case Instruction::FRem:
- case Instruction::Shl:
- case Instruction::LShr:
- case Instruction::AShr:
- case Instruction::And:
- case Instruction::Or:
- case Instruction::Xor: {
- // Just widen binops.
- auto *BinOp = cast<BinaryOperator>(&I);
- setDebugLocFromInst(Builder, BinOp);
- const VectorParts &A = getVectorValue(BinOp->getOperand(0));
- const VectorParts &B = getVectorValue(BinOp->getOperand(1));
-
- // Use this vector value for all users of the original instruction.
- VectorParts Entry(UF);
+ if (VF > 1 && OrigLoop->hasLoopInvariantOperands(GEP)) {
+ // If we are vectorizing, but the GEP has only loop-invariant operands,
+ // the GEP we build (by only using vector-typed operands for
+ // loop-varying values) would be a scalar pointer. Thus, to ensure we
+ // produce a vector of pointers, we need to either arbitrarily pick an
+ // operand to broadcast, or broadcast a clone of the original GEP.
+ // Here, we broadcast a clone of the original.
+ //
+ // TODO: If at some point we decide to scalarize instructions having
+ // loop-invariant operands, this special case will no longer be
+ // required. We would add the scalarization decision to
+ // collectLoopScalars() and teach getVectorValue() to broadcast
+ // the lane-zero scalar value.
+ auto *Clone = Builder.Insert(GEP->clone());
+ for (unsigned Part = 0; Part < UF; ++Part)
+ Entry[Part] = Builder.CreateVectorSplat(VF, Clone);
+ } else {
+ // If the GEP has at least one loop-varying operand, we are sure to
+ // produce a vector of pointers. But if we are only unrolling, we want
+ // to produce a scalar GEP for each unroll part. Thus, the GEP we
+ // produce with the code below will be scalar (if VF == 1) or vector
+ // (otherwise). Note that for the unroll-only case, we still maintain
+ // values in the vector mapping with initVector, as we do for other
+ // instructions.
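+ // E.g. for a scalar GEP "%g = getelementptr inbounds i32, i32* %base,
+ // i64 %iv" with VF 4, %base stays scalar while %iv is widened, so the
+ // new GEP produces a <4 x i32*> vector of pointers.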
for (unsigned Part = 0; Part < UF; ++Part) {
- Value *V = Builder.CreateBinOp(BinOp->getOpcode(), A[Part], B[Part]);
- if (BinaryOperator *VecOp = dyn_cast<BinaryOperator>(V))
- VecOp->copyIRFlags(BinOp);
+ // The pointer operand of the new GEP. If it's loop-invariant, we
+ // won't broadcast it.
+ auto *Ptr = OrigLoop->isLoopInvariant(GEP->getPointerOperand())
+ ? GEP->getPointerOperand()
+ : getVectorValue(GEP->getPointerOperand())[Part];
+
+ // Collect all the indices for the new GEP. If any index is
+ // loop-invariant, we won't broadcast it.
+ SmallVector<Value *, 4> Indices;
+ for (auto &U : make_range(GEP->idx_begin(), GEP->idx_end())) {
+ if (OrigLoop->isLoopInvariant(U.get()))
+ Indices.push_back(U.get());
+ else
+ Indices.push_back(getVectorValue(U.get())[Part]);
+ }
- Entry[Part] = V;
+        // Create the new GEP. Note that this GEP may be a scalar if VF == 1,
+        // but it should be a vector otherwise.
+ auto *NewGEP = GEP->isInBounds()
+ ? Builder.CreateInBoundsGEP(Ptr, Indices)
+ : Builder.CreateGEP(Ptr, Indices);
+ assert((VF == 1 || NewGEP->getType()->isVectorTy()) &&
+ "NewGEP is not a pointer vector");
+ Entry[Part] = NewGEP;
}
+ }
- VectorLoopValueMap.initVector(&I, Entry);
- addMetadata(Entry, BinOp);
+ VectorLoopValueMap.initVector(&I, Entry);
+ addMetadata(Entry, GEP);
+ break;
+ }
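As an aside for readers following the widening rule above, here is a minimal, self-contained sketch in plain C++ (not the LLVM API; the types and names are illustrative assumptions): loop-invariant operands stay scalar, and only loop-varying operands are swapped for their per-part vector values.

#include <iostream>
#include <string>
#include <vector>

struct Operand {
  std::string Name;
  bool LoopInvariant;
};

// Build the operand list of the widened GEP for one unroll part: invariant
// operands stay as-is, varying ones use their vectorized counterpart.
std::vector<std::string> widenGEPOperands(const std::vector<Operand> &Ops,
                                          unsigned Part) {
  std::vector<std::string> Widened;
  for (const Operand &Op : Ops)
    Widened.push_back(Op.LoopInvariant
                          ? Op.Name
                          : Op.Name + ".vec." + std::to_string(Part));
  return Widened;
}

int main() {
  // %gep = getelementptr %base, %i -- %base is invariant, %i varies.
  std::vector<Operand> Ops = {{"%base", true}, {"%i", false}};
  for (const std::string &S : widenGEPOperands(Ops, /*Part=*/0))
    std::cout << S << ' '; // prints: %base %i.vec.0
  std::cout << '\n';
}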
+ case Instruction::UDiv:
+ case Instruction::SDiv:
+ case Instruction::SRem:
+ case Instruction::URem:
+ // Scalarize with predication if this instruction may divide by zero and
+ // block execution is conditional, otherwise fallthrough.
+ if (Legal->isScalarWithPredication(&I)) {
+ scalarizeInstruction(&I, true);
break;
}
- case Instruction::Select: {
- // Widen selects.
- // If the selector is loop invariant we can create a select
- // instruction with a scalar condition. Otherwise, use vector-select.
- auto *SE = PSE.getSE();
- bool InvariantCond =
- SE->isLoopInvariant(PSE.getSCEV(I.getOperand(0)), OrigLoop);
- setDebugLocFromInst(Builder, &I);
-
- // The condition can be loop invariant but still defined inside the
- // loop. This means that we can't just use the original 'cond' value.
- // We have to take the 'vectorized' value and pick the first lane.
- // Instcombine will make this a no-op.
- const VectorParts &Cond = getVectorValue(I.getOperand(0));
- const VectorParts &Op0 = getVectorValue(I.getOperand(1));
- const VectorParts &Op1 = getVectorValue(I.getOperand(2));
-
- auto *ScalarCond = getScalarValue(I.getOperand(0), 0, 0);
+ case Instruction::Add:
+ case Instruction::FAdd:
+ case Instruction::Sub:
+ case Instruction::FSub:
+ case Instruction::Mul:
+ case Instruction::FMul:
+ case Instruction::FDiv:
+ case Instruction::FRem:
+ case Instruction::Shl:
+ case Instruction::LShr:
+ case Instruction::AShr:
+ case Instruction::And:
+ case Instruction::Or:
+ case Instruction::Xor: {
+ // Just widen binops.
+ auto *BinOp = cast<BinaryOperator>(&I);
+ setDebugLocFromInst(Builder, BinOp);
+ const VectorParts &A = getVectorValue(BinOp->getOperand(0));
+ const VectorParts &B = getVectorValue(BinOp->getOperand(1));
+
+ // Use this vector value for all users of the original instruction.
+ VectorParts Entry(UF);
+ for (unsigned Part = 0; Part < UF; ++Part) {
+ Value *V = Builder.CreateBinOp(BinOp->getOpcode(), A[Part], B[Part]);
+
+ if (BinaryOperator *VecOp = dyn_cast<BinaryOperator>(V))
+ VecOp->copyIRFlags(BinOp);
+
+ Entry[Part] = V;
+ }
- VectorParts Entry(UF);
- for (unsigned Part = 0; Part < UF; ++Part) {
- Entry[Part] = Builder.CreateSelect(
- InvariantCond ? ScalarCond : Cond[Part], Op0[Part], Op1[Part]);
- }
+ VectorLoopValueMap.initVector(&I, Entry);
+ addMetadata(Entry, BinOp);
+ break;
+ }
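A rough model of the UF-by-VF representation the binop case relies on, as a sketch with made-up values (plain C++, not LLVM code): each of the UF parts is one VF-wide vector, and widening a binary operator simply combines matching parts lane by lane.

#include <array>
#include <cstdio>

constexpr unsigned UF = 2, VF = 4;
using VectorPart = std::array<int, VF>; // one VF-wide vector value

// Widen an addition: combine the matching parts of A and B lane by lane.
VectorPart widenAdd(const VectorPart &A, const VectorPart &B) {
  VectorPart R{};
  for (unsigned Lane = 0; Lane < VF; ++Lane)
    R[Lane] = A[Lane] + B[Lane];
  return R;
}

int main() {
  std::array<VectorPart, UF> A = {{{0, 1, 2, 3}, {4, 5, 6, 7}}};
  std::array<VectorPart, UF> B = {{{1, 1, 1, 1}, {1, 1, 1, 1}}};
  for (unsigned Part = 0; Part < UF; ++Part) {
    VectorPart Entry = widenAdd(A[Part], B[Part]);
    for (int Lane : Entry)
      std::printf("%d ", Lane); // part 0: 1 2 3 4, part 1: 5 6 7 8
  }
  std::printf("\n");
}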
+ case Instruction::Select: {
+ // Widen selects.
+ // If the selector is loop invariant we can create a select
+ // instruction with a scalar condition. Otherwise, use vector-select.
+ auto *SE = PSE.getSE();
+ bool InvariantCond =
+ SE->isLoopInvariant(PSE.getSCEV(I.getOperand(0)), OrigLoop);
+ setDebugLocFromInst(Builder, &I);
+
+ // The condition can be loop invariant but still defined inside the
+ // loop. This means that we can't just use the original 'cond' value.
+ // We have to take the 'vectorized' value and pick the first lane.
+ // Instcombine will make this a no-op.
+ const VectorParts &Cond = getVectorValue(I.getOperand(0));
+ const VectorParts &Op0 = getVectorValue(I.getOperand(1));
+ const VectorParts &Op1 = getVectorValue(I.getOperand(2));
+
+ auto *ScalarCond = getScalarValue(I.getOperand(0), 0, 0);
- VectorLoopValueMap.initVector(&I, Entry);
- addMetadata(Entry, &I);
- break;
+ VectorParts Entry(UF);
+ for (unsigned Part = 0; Part < UF; ++Part) {
+ Entry[Part] = Builder.CreateSelect(
+ InvariantCond ? ScalarCond : Cond[Part], Op0[Part], Op1[Part]);
}
- case Instruction::ICmp:
- case Instruction::FCmp: {
- // Widen compares. Generate vector compares.
- bool FCmp = (I.getOpcode() == Instruction::FCmp);
- auto *Cmp = dyn_cast<CmpInst>(&I);
- setDebugLocFromInst(Builder, Cmp);
- const VectorParts &A = getVectorValue(Cmp->getOperand(0));
- const VectorParts &B = getVectorValue(Cmp->getOperand(1));
- VectorParts Entry(UF);
- for (unsigned Part = 0; Part < UF; ++Part) {
- Value *C = nullptr;
- if (FCmp) {
- C = Builder.CreateFCmp(Cmp->getPredicate(), A[Part], B[Part]);
- cast<FCmpInst>(C)->copyFastMathFlags(Cmp);
- } else {
- C = Builder.CreateICmp(Cmp->getPredicate(), A[Part], B[Part]);
- }
- Entry[Part] = C;
+ VectorLoopValueMap.initVector(&I, Entry);
+ addMetadata(Entry, &I);
+ break;
+ }
+
+ case Instruction::ICmp:
+ case Instruction::FCmp: {
+ // Widen compares. Generate vector compares.
+ bool FCmp = (I.getOpcode() == Instruction::FCmp);
+ auto *Cmp = dyn_cast<CmpInst>(&I);
+ setDebugLocFromInst(Builder, Cmp);
+ const VectorParts &A = getVectorValue(Cmp->getOperand(0));
+ const VectorParts &B = getVectorValue(Cmp->getOperand(1));
+ VectorParts Entry(UF);
+ for (unsigned Part = 0; Part < UF; ++Part) {
+ Value *C = nullptr;
+ if (FCmp) {
+ C = Builder.CreateFCmp(Cmp->getPredicate(), A[Part], B[Part]);
+ cast<FCmpInst>(C)->copyFastMathFlags(Cmp);
+ } else {
+ C = Builder.CreateICmp(Cmp->getPredicate(), A[Part], B[Part]);
}
+ Entry[Part] = C;
+ }
+
+ VectorLoopValueMap.initVector(&I, Entry);
+ addMetadata(Entry, &I);
+ break;
+ }
- VectorLoopValueMap.initVector(&I, Entry);
- addMetadata(Entry, &I);
+ case Instruction::Store:
+ case Instruction::Load:
+ vectorizeMemoryInstruction(&I);
+ break;
+ case Instruction::ZExt:
+ case Instruction::SExt:
+ case Instruction::FPToUI:
+ case Instruction::FPToSI:
+ case Instruction::FPExt:
+ case Instruction::PtrToInt:
+ case Instruction::IntToPtr:
+ case Instruction::SIToFP:
+ case Instruction::UIToFP:
+ case Instruction::Trunc:
+ case Instruction::FPTrunc:
+ case Instruction::BitCast: {
+ auto *CI = dyn_cast<CastInst>(&I);
+ setDebugLocFromInst(Builder, CI);
+
+ // Optimize the special case where the source is a constant integer
+ // induction variable. Notice that we can only optimize the 'trunc' case
+ // because (a) FP conversions lose precision, (b) sext/zext may wrap, and
+ // (c) other casts depend on pointer size.
+ if (Cost->isOptimizableIVTruncate(CI, VF)) {
+ widenIntOrFpInduction(cast<PHINode>(CI->getOperand(0)),
+ cast<TruncInst>(CI));
break;
}
- case Instruction::Store:
- case Instruction::Load:
- vectorizeMemoryInstruction(&I);
+    // Vectorize casts.
+ Type *DestTy =
+ (VF == 1) ? CI->getType() : VectorType::get(CI->getType(), VF);
+
+ const VectorParts &A = getVectorValue(CI->getOperand(0));
+ VectorParts Entry(UF);
+ for (unsigned Part = 0; Part < UF; ++Part)
+ Entry[Part] = Builder.CreateCast(CI->getOpcode(), A[Part], DestTy);
+ VectorLoopValueMap.initVector(&I, Entry);
+ addMetadata(Entry, &I);
+ break;
+ }
+
+ case Instruction::Call: {
+ // Ignore dbg intrinsics.
+ if (isa<DbgInfoIntrinsic>(I))
break;
- case Instruction::ZExt:
- case Instruction::SExt:
- case Instruction::FPToUI:
- case Instruction::FPToSI:
- case Instruction::FPExt:
- case Instruction::PtrToInt:
- case Instruction::IntToPtr:
- case Instruction::SIToFP:
- case Instruction::UIToFP:
- case Instruction::Trunc:
- case Instruction::FPTrunc:
- case Instruction::BitCast: {
- auto *CI = dyn_cast<CastInst>(&I);
- setDebugLocFromInst(Builder, CI);
-
- // Optimize the special case where the source is a constant integer
- // induction variable. Notice that we can only optimize the 'trunc' case
- // because (a) FP conversions lose precision, (b) sext/zext may wrap, and
- // (c) other casts depend on pointer size.
- auto ID = Legal->getInductionVars()->lookup(OldInduction);
- if (isa<TruncInst>(CI) && CI->getOperand(0) == OldInduction &&
- ID.getConstIntStepValue()) {
- widenIntInduction(OldInduction, cast<TruncInst>(CI));
- break;
- }
+ setDebugLocFromInst(Builder, &I);
- /// Vectorize casts.
- Type *DestTy =
- (VF == 1) ? CI->getType() : VectorType::get(CI->getType(), VF);
+ Module *M = I.getParent()->getParent()->getParent();
+ auto *CI = cast<CallInst>(&I);
- const VectorParts &A = getVectorValue(CI->getOperand(0));
- VectorParts Entry(UF);
- for (unsigned Part = 0; Part < UF; ++Part)
- Entry[Part] = Builder.CreateCast(CI->getOpcode(), A[Part], DestTy);
- VectorLoopValueMap.initVector(&I, Entry);
- addMetadata(Entry, &I);
+ StringRef FnName = CI->getCalledFunction()->getName();
+ Function *F = CI->getCalledFunction();
+ Type *RetTy = ToVectorTy(CI->getType(), VF);
+ SmallVector<Type *, 4> Tys;
+ for (Value *ArgOperand : CI->arg_operands())
+ Tys.push_back(ToVectorTy(ArgOperand->getType(), VF));
+
+ Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI);
+ if (ID && (ID == Intrinsic::assume || ID == Intrinsic::lifetime_end ||
+ ID == Intrinsic::lifetime_start)) {
+ scalarizeInstruction(&I);
+ break;
+ }
+    // The flag indicates whether we use an intrinsic or a regular call for
+    // the vectorized version of the instruction, i.e., whether the intrinsic
+    // call is more beneficial than the library call.
+ bool NeedToScalarize;
+ unsigned CallCost = getVectorCallCost(CI, VF, *TTI, TLI, NeedToScalarize);
+ bool UseVectorIntrinsic =
+ ID && getVectorIntrinsicCost(CI, VF, *TTI, TLI) <= CallCost;
+ if (!UseVectorIntrinsic && NeedToScalarize) {
+ scalarizeInstruction(&I);
break;
}
- case Instruction::Call: {
- // Ignore dbg intrinsics.
- if (isa<DbgInfoIntrinsic>(I))
- break;
- setDebugLocFromInst(Builder, &I);
-
- Module *M = BB->getParent()->getParent();
- auto *CI = cast<CallInst>(&I);
-
- StringRef FnName = CI->getCalledFunction()->getName();
- Function *F = CI->getCalledFunction();
- Type *RetTy = ToVectorTy(CI->getType(), VF);
- SmallVector<Type *, 4> Tys;
- for (Value *ArgOperand : CI->arg_operands())
- Tys.push_back(ToVectorTy(ArgOperand->getType(), VF));
-
- Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI);
- if (ID && (ID == Intrinsic::assume || ID == Intrinsic::lifetime_end ||
- ID == Intrinsic::lifetime_start)) {
- scalarizeInstruction(&I);
- break;
- }
- // The flag shows whether we use Intrinsic or a usual Call for vectorized
- // version of the instruction.
- // Is it beneficial to perform intrinsic call compared to lib call?
- bool NeedToScalarize;
- unsigned CallCost = getVectorCallCost(CI, VF, *TTI, TLI, NeedToScalarize);
- bool UseVectorIntrinsic =
- ID && getVectorIntrinsicCost(CI, VF, *TTI, TLI) <= CallCost;
- if (!UseVectorIntrinsic && NeedToScalarize) {
- scalarizeInstruction(&I);
- break;
- }
-
- VectorParts Entry(UF);
- for (unsigned Part = 0; Part < UF; ++Part) {
- SmallVector<Value *, 4> Args;
- for (unsigned i = 0, ie = CI->getNumArgOperands(); i != ie; ++i) {
- Value *Arg = CI->getArgOperand(i);
- // Some intrinsics have a scalar argument - don't replace it with a
- // vector.
- if (!UseVectorIntrinsic || !hasVectorInstrinsicScalarOpd(ID, i)) {
- const VectorParts &VectorArg = getVectorValue(CI->getArgOperand(i));
- Arg = VectorArg[Part];
- }
- Args.push_back(Arg);
+ VectorParts Entry(UF);
+ for (unsigned Part = 0; Part < UF; ++Part) {
+ SmallVector<Value *, 4> Args;
+ for (unsigned i = 0, ie = CI->getNumArgOperands(); i != ie; ++i) {
+ Value *Arg = CI->getArgOperand(i);
+ // Some intrinsics have a scalar argument - don't replace it with a
+ // vector.
+ if (!UseVectorIntrinsic || !hasVectorInstrinsicScalarOpd(ID, i)) {
+ const VectorParts &VectorArg = getVectorValue(CI->getArgOperand(i));
+ Arg = VectorArg[Part];
}
+ Args.push_back(Arg);
+ }
- Function *VectorF;
- if (UseVectorIntrinsic) {
- // Use vector version of the intrinsic.
- Type *TysForDecl[] = {CI->getType()};
- if (VF > 1)
- TysForDecl[0] = VectorType::get(CI->getType()->getScalarType(), VF);
- VectorF = Intrinsic::getDeclaration(M, ID, TysForDecl);
- } else {
- // Use vector version of the library call.
- StringRef VFnName = TLI->getVectorizedFunction(FnName, VF);
- assert(!VFnName.empty() && "Vector function name is empty.");
- VectorF = M->getFunction(VFnName);
- if (!VectorF) {
- // Generate a declaration
- FunctionType *FTy = FunctionType::get(RetTy, Tys, false);
- VectorF =
- Function::Create(FTy, Function::ExternalLinkage, VFnName, M);
- VectorF->copyAttributesFrom(F);
- }
+ Function *VectorF;
+ if (UseVectorIntrinsic) {
+ // Use vector version of the intrinsic.
+ Type *TysForDecl[] = {CI->getType()};
+ if (VF > 1)
+ TysForDecl[0] = VectorType::get(CI->getType()->getScalarType(), VF);
+ VectorF = Intrinsic::getDeclaration(M, ID, TysForDecl);
+ } else {
+ // Use vector version of the library call.
+ StringRef VFnName = TLI->getVectorizedFunction(FnName, VF);
+ assert(!VFnName.empty() && "Vector function name is empty.");
+ VectorF = M->getFunction(VFnName);
+ if (!VectorF) {
+        // Generate a declaration.
+ FunctionType *FTy = FunctionType::get(RetTy, Tys, false);
+ VectorF =
+ Function::Create(FTy, Function::ExternalLinkage, VFnName, M);
+ VectorF->copyAttributesFrom(F);
}
- assert(VectorF && "Can't create vector function.");
-
- SmallVector<OperandBundleDef, 1> OpBundles;
- CI->getOperandBundlesAsDefs(OpBundles);
- CallInst *V = Builder.CreateCall(VectorF, Args, OpBundles);
+ }
+ assert(VectorF && "Can't create vector function.");
- if (isa<FPMathOperator>(V))
- V->copyFastMathFlags(CI);
+ SmallVector<OperandBundleDef, 1> OpBundles;
+ CI->getOperandBundlesAsDefs(OpBundles);
+ CallInst *V = Builder.CreateCall(VectorF, Args, OpBundles);
- Entry[Part] = V;
- }
+ if (isa<FPMathOperator>(V))
+ V->copyFastMathFlags(CI);
- VectorLoopValueMap.initVector(&I, Entry);
- addMetadata(Entry, &I);
- break;
+ Entry[Part] = V;
}
- default:
- // All other instructions are unsupported. Scalarize them.
- scalarizeInstruction(&I);
- break;
- } // end of switch.
- } // end of for_each instr.
+ VectorLoopValueMap.initVector(&I, Entry);
+ addMetadata(Entry, &I);
+ break;
+ }
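The decision structure above can be summarized by a small sketch (plain C++ with hypothetical cost numbers; the variable names mirror, but do not call, the real helpers): prefer the vector intrinsic when one exists and is no more expensive than the vector library call, and scalarize when neither option is available or profitable.

#include <cstdio>

int main() {
  bool HasIntrinsicID = true;   // getVectorIntrinsicIDForCall(...) != 0
  unsigned IntrinsicCost = 6;   // assumed getVectorIntrinsicCost(...)
  unsigned CallCost = 8;        // assumed getVectorCallCost(...)
  bool NeedToScalarize = false; // no vector library version available?

  bool UseVectorIntrinsic = HasIntrinsicID && IntrinsicCost <= CallCost;
  if (!UseVectorIntrinsic && NeedToScalarize)
    std::puts("scalarize");
  else
    std::puts(UseVectorIntrinsic ? "vector intrinsic" : "vector lib call");
}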
+
+ default:
+ // All other instructions are unsupported. Scalarize them.
+ scalarizeInstruction(&I);
+ break;
+ } // end of switch.
}
void InnerLoopVectorizer::updateAnalysis() {
@@ -4976,11 +5059,10 @@ void InnerLoopVectorizer::updateAnalysis() {
assert(DT->properlyDominates(LoopBypassBlocks.front(), LoopExitBlock) &&
"Entry does not dominate exit.");
- // We don't predicate stores by this point, so the vector body should be a
- // single loop.
- DT->addNewBlock(LoopVectorBody, LoopVectorPreHeader);
-
- DT->addNewBlock(LoopMiddleBlock, LoopVectorBody);
+ DT->addNewBlock(LI->getLoopFor(LoopVectorBody)->getHeader(),
+ LoopVectorPreHeader);
+ DT->addNewBlock(LoopMiddleBlock,
+ LI->getLoopFor(LoopVectorBody)->getLoopLatch());
DT->addNewBlock(LoopScalarPreHeader, LoopBypassBlocks[0]);
DT->changeImmediateDominator(LoopScalarBody, LoopScalarPreHeader);
DT->changeImmediateDominator(LoopExitBlock, LoopBypassBlocks[0]);
@@ -5145,12 +5227,6 @@ bool LoopVectorizationLegality::canVectorize() {
if (UseInterleaved)
InterleaveInfo.analyzeInterleaving(*getSymbolicStrides());
- // Collect all instructions that are known to be uniform after vectorization.
- collectLoopUniforms();
-
- // Collect all instructions that are known to be scalar after vectorization.
- collectLoopScalars();
-
unsigned SCEVThreshold = VectorizeSCEVCheckThreshold;
if (Hints->getForce() == LoopVectorizeHints::FK_Enabled)
SCEVThreshold = PragmaVectorizeSCEVCheckThreshold;
@@ -5234,8 +5310,8 @@ void LoopVectorizationLegality::addInductionPhi(
// one if there are multiple (no good reason for doing this other
// than it is expedient). We've checked that it begins at zero and
// steps by one, so this is a canonical induction variable.
- if (!Induction || PhiTy == WidestIndTy)
- Induction = Phi;
+ if (!PrimaryInduction || PhiTy == WidestIndTy)
+ PrimaryInduction = Phi;
}
// Both the PHI node itself, and the "post-increment" value feeding
@@ -5398,7 +5474,7 @@ bool LoopVectorizationLegality::canVectorizeInstrs() {
} // next instr.
}
- if (!Induction) {
+ if (!PrimaryInduction) {
DEBUG(dbgs() << "LV: Did not find one integer induction var.\n");
if (Inductions.empty()) {
ORE->emit(createMissedAnalysis("NoInductionVariable")
@@ -5410,46 +5486,166 @@ bool LoopVectorizationLegality::canVectorizeInstrs() {
// Now we know the widest induction type, check if our found induction
// is the same size. If it's not, unset it here and InnerLoopVectorizer
// will create another.
- if (Induction && WidestIndTy != Induction->getType())
- Induction = nullptr;
+ if (PrimaryInduction && WidestIndTy != PrimaryInduction->getType())
+ PrimaryInduction = nullptr;
return true;
}
-void LoopVectorizationLegality::collectLoopScalars() {
+void LoopVectorizationCostModel::collectLoopScalars(unsigned VF) {
+
+ // We should not collect Scalars more than once per VF. Right now, this
+ // function is called from collectUniformsAndScalars(), which already does
+ // this check. Collecting Scalars for VF=1 does not make any sense.
+ assert(VF >= 2 && !Scalars.count(VF) &&
+ "This function should not be visited twice for the same VF");
+
+ SmallSetVector<Instruction *, 8> Worklist;
+
+ // These sets are used to seed the analysis with pointers used by memory
+ // accesses that will remain scalar.
+ SmallSetVector<Instruction *, 8> ScalarPtrs;
+ SmallPtrSet<Instruction *, 8> PossibleNonScalarPtrs;
+
+ // A helper that returns true if the use of Ptr by MemAccess will be scalar.
+ // The pointer operands of loads and stores will be scalar as long as the
+ // memory access is not a gather or scatter operation. The value operand of a
+ // store will remain scalar if the store is scalarized.
+ auto isScalarUse = [&](Instruction *MemAccess, Value *Ptr) {
+ InstWidening WideningDecision = getWideningDecision(MemAccess, VF);
+ assert(WideningDecision != CM_Unknown &&
+ "Widening decision should be ready at this moment");
+ if (auto *Store = dyn_cast<StoreInst>(MemAccess))
+ if (Ptr == Store->getValueOperand())
+ return WideningDecision == CM_Scalarize;
+ assert(Ptr == getPointerOperand(MemAccess) &&
+ "Ptr is neither a value or pointer operand");
+ return WideningDecision != CM_GatherScatter;
+ };
+
+ // A helper that returns true if the given value is a bitcast or
+ // getelementptr instruction contained in the loop.
+ auto isLoopVaryingBitCastOrGEP = [&](Value *V) {
+ return ((isa<BitCastInst>(V) && V->getType()->isPointerTy()) ||
+ isa<GetElementPtrInst>(V)) &&
+ !TheLoop->isLoopInvariant(V);
+ };
- // If an instruction is uniform after vectorization, it will remain scalar.
- Scalars.insert(Uniforms.begin(), Uniforms.end());
+ // A helper that evaluates a memory access's use of a pointer. If the use
+ // will be a scalar use, and the pointer is only used by memory accesses, we
+ // place the pointer in ScalarPtrs. Otherwise, the pointer is placed in
+ // PossibleNonScalarPtrs.
+ auto evaluatePtrUse = [&](Instruction *MemAccess, Value *Ptr) {
- // Collect the getelementptr instructions that will not be vectorized. A
- // getelementptr instruction is only vectorized if it is used for a legal
- // gather or scatter operation.
+ // We only care about bitcast and getelementptr instructions contained in
+ // the loop.
+ if (!isLoopVaryingBitCastOrGEP(Ptr))
+ return;
+
+ // If the pointer has already been identified as scalar (e.g., if it was
+ // also identified as uniform), there's nothing to do.
+ auto *I = cast<Instruction>(Ptr);
+ if (Worklist.count(I))
+ return;
+
+ // If the use of the pointer will be a scalar use, and all users of the
+ // pointer are memory accesses, place the pointer in ScalarPtrs. Otherwise,
+ // place the pointer in PossibleNonScalarPtrs.
+ if (isScalarUse(MemAccess, Ptr) && all_of(I->users(), [&](User *U) {
+ return isa<LoadInst>(U) || isa<StoreInst>(U);
+ }))
+ ScalarPtrs.insert(I);
+ else
+ PossibleNonScalarPtrs.insert(I);
+ };
+
+ // We seed the scalars analysis with three classes of instructions: (1)
+ // instructions marked uniform-after-vectorization, (2) bitcast and
+ // getelementptr instructions used by memory accesses requiring a scalar use,
+ // and (3) pointer induction variables and their update instructions (we
+ // currently only scalarize these).
+ //
+ // (1) Add to the worklist all instructions that have been identified as
+ // uniform-after-vectorization.
+ Worklist.insert(Uniforms[VF].begin(), Uniforms[VF].end());
+
+ // (2) Add to the worklist all bitcast and getelementptr instructions used by
+ // memory accesses requiring a scalar use. The pointer operands of loads and
+  // stores will be scalar as long as the memory access is not a gather or
+ // scatter operation. The value operand of a store will remain scalar if the
+ // store is scalarized.
for (auto *BB : TheLoop->blocks())
for (auto &I : *BB) {
- if (auto *GEP = dyn_cast<GetElementPtrInst>(&I)) {
- Scalars.insert(GEP);
- continue;
+ if (auto *Load = dyn_cast<LoadInst>(&I)) {
+ evaluatePtrUse(Load, Load->getPointerOperand());
+ } else if (auto *Store = dyn_cast<StoreInst>(&I)) {
+ evaluatePtrUse(Store, Store->getPointerOperand());
+ evaluatePtrUse(Store, Store->getValueOperand());
}
- auto *Ptr = getPointerOperand(&I);
- if (!Ptr)
- continue;
- auto *GEP = getGEPInstruction(Ptr);
- if (GEP && isLegalGatherOrScatter(&I))
- Scalars.erase(GEP);
+ }
+ for (auto *I : ScalarPtrs)
+ if (!PossibleNonScalarPtrs.count(I)) {
+ DEBUG(dbgs() << "LV: Found scalar instruction: " << *I << "\n");
+ Worklist.insert(I);
}
+ // (3) Add to the worklist all pointer induction variables and their update
+ // instructions.
+ //
+ // TODO: Once we are able to vectorize pointer induction variables we should
+ // no longer insert them into the worklist here.
+ auto *Latch = TheLoop->getLoopLatch();
+ for (auto &Induction : *Legal->getInductionVars()) {
+ auto *Ind = Induction.first;
+ auto *IndUpdate = cast<Instruction>(Ind->getIncomingValueForBlock(Latch));
+ if (Induction.second.getKind() != InductionDescriptor::IK_PtrInduction)
+ continue;
+ Worklist.insert(Ind);
+ Worklist.insert(IndUpdate);
+ DEBUG(dbgs() << "LV: Found scalar instruction: " << *Ind << "\n");
+ DEBUG(dbgs() << "LV: Found scalar instruction: " << *IndUpdate << "\n");
+ }
+
+ // Expand the worklist by looking through any bitcasts and getelementptr
+ // instructions we've already identified as scalar. This is similar to the
+ // expansion step in collectLoopUniforms(); however, here we're only
+ // expanding to include additional bitcasts and getelementptr instructions.
+ unsigned Idx = 0;
+ while (Idx != Worklist.size()) {
+ Instruction *Dst = Worklist[Idx++];
+ if (!isLoopVaryingBitCastOrGEP(Dst->getOperand(0)))
+ continue;
+ auto *Src = cast<Instruction>(Dst->getOperand(0));
+ if (all_of(Src->users(), [&](User *U) -> bool {
+ auto *J = cast<Instruction>(U);
+ return !TheLoop->contains(J) || Worklist.count(J) ||
+ ((isa<LoadInst>(J) || isa<StoreInst>(J)) &&
+ isScalarUse(J, Src));
+ })) {
+ Worklist.insert(Src);
+ DEBUG(dbgs() << "LV: Found scalar instruction: " << *Src << "\n");
+ }
+ }
+
// An induction variable will remain scalar if all users of the induction
// variable and induction variable update remain scalar.
- auto *Latch = TheLoop->getLoopLatch();
- for (auto &Induction : *getInductionVars()) {
+ for (auto &Induction : *Legal->getInductionVars()) {
auto *Ind = Induction.first;
auto *IndUpdate = cast<Instruction>(Ind->getIncomingValueForBlock(Latch));
+ // We already considered pointer induction variables, so there's no reason
+ // to look at their users again.
+ //
+ // TODO: Once we are able to vectorize pointer induction variables we
+ // should no longer skip over them here.
+ if (Induction.second.getKind() == InductionDescriptor::IK_PtrInduction)
+ continue;
+
// Determine if all users of the induction variable are scalar after
// vectorization.
auto ScalarInd = all_of(Ind->users(), [&](User *U) -> bool {
auto *I = cast<Instruction>(U);
- return I == IndUpdate || !TheLoop->contains(I) || Scalars.count(I);
+ return I == IndUpdate || !TheLoop->contains(I) || Worklist.count(I);
});
if (!ScalarInd)
continue;
@@ -5458,23 +5654,19 @@ void LoopVectorizationLegality::collectLoopScalars() {
// scalar after vectorization.
auto ScalarIndUpdate = all_of(IndUpdate->users(), [&](User *U) -> bool {
auto *I = cast<Instruction>(U);
- return I == Ind || !TheLoop->contains(I) || Scalars.count(I);
+ return I == Ind || !TheLoop->contains(I) || Worklist.count(I);
});
if (!ScalarIndUpdate)
continue;
// The induction variable and its update instruction will remain scalar.
- Scalars.insert(Ind);
- Scalars.insert(IndUpdate);
+ Worklist.insert(Ind);
+ Worklist.insert(IndUpdate);
+ DEBUG(dbgs() << "LV: Found scalar instruction: " << *Ind << "\n");
+ DEBUG(dbgs() << "LV: Found scalar instruction: " << *IndUpdate << "\n");
}
-}
-bool LoopVectorizationLegality::hasConsecutiveLikePtrOperand(Instruction *I) {
- if (isAccessInterleaved(I))
- return true;
- if (auto *Ptr = getPointerOperand(I))
- return isConsecutivePtr(Ptr);
- return false;
+ Scalars[VF].insert(Worklist.begin(), Worklist.end());
}
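The expansion step above is a standard worklist fixed point. A minimal sketch of the pattern, with toy integer "instructions" and a single-operand def-use map standing in for the real IR (and ignoring the all-users check for brevity):

#include <cstdio>
#include <map>
#include <set>
#include <vector>

// Each instruction names the single operand it may pull into the set.
std::map<int, int> OperandOf = {{3, 2}, {2, 1}}; // 3 uses 2, 2 uses 1

int main() {
  std::vector<int> Worklist = {3}; // seeded scalars
  std::set<int> Scalars(Worklist.begin(), Worklist.end());
  for (size_t Idx = 0; Idx < Worklist.size(); ++Idx) {
    auto It = OperandOf.find(Worklist[Idx]);
    if (It != OperandOf.end() && Scalars.insert(It->second).second)
      Worklist.push_back(It->second); // newly discovered scalar
  }
  for (int I : Scalars)
    std::printf("%d ", I); // 1 2 3
  std::printf("\n");
}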
bool LoopVectorizationLegality::isScalarWithPredication(Instruction *I) {
@@ -5494,48 +5686,48 @@ bool LoopVectorizationLegality::isScalarWithPredication(Instruction *I) {
return false;
}
-bool LoopVectorizationLegality::memoryInstructionMustBeScalarized(
- Instruction *I, unsigned VF) {
-
- // If the memory instruction is in an interleaved group, it will be
- // vectorized and its pointer will remain uniform.
- if (isAccessInterleaved(I))
- return false;
-
+bool LoopVectorizationLegality::memoryInstructionCanBeWidened(Instruction *I,
+ unsigned VF) {
// Get and ensure we have a valid memory instruction.
LoadInst *LI = dyn_cast<LoadInst>(I);
StoreInst *SI = dyn_cast<StoreInst>(I);
assert((LI || SI) && "Invalid memory instruction");
- // If the pointer operand is uniform (loop invariant), the memory instruction
- // will be scalarized.
auto *Ptr = getPointerOperand(I);
- if (LI && isUniform(Ptr))
- return true;
- // If the pointer operand is non-consecutive and neither a gather nor a
- // scatter operation is legal, the memory instruction will be scalarized.
- if (!isConsecutivePtr(Ptr) && !isLegalGatherOrScatter(I))
- return true;
+ // In order to be widened, the pointer should be consecutive, first of all.
+ if (!isConsecutivePtr(Ptr))
+ return false;
// If the instruction is a store located in a predicated block, it will be
// scalarized.
if (isScalarWithPredication(I))
- return true;
+ return false;
// If the instruction's allocated size doesn't equal it's type size, it
// requires padding and will be scalarized.
auto &DL = I->getModule()->getDataLayout();
auto *ScalarTy = LI ? LI->getType() : SI->getValueOperand()->getType();
if (hasIrregularType(ScalarTy, DL, VF))
- return true;
+ return false;
- // Otherwise, the memory instruction should be vectorized if the rest of the
- // loop is.
- return false;
+ return true;
}
-void LoopVectorizationLegality::collectLoopUniforms() {
+void LoopVectorizationCostModel::collectLoopUniforms(unsigned VF) {
+
+ // We should not collect Uniforms more than once per VF. Right now,
+ // this function is called from collectUniformsAndScalars(), which
+ // already does this check. Collecting Uniforms for VF=1 does not make any
+ // sense.
+
+ assert(VF >= 2 && !Uniforms.count(VF) &&
+ "This function should not be visited twice for the same VF");
+
+  // Create the entry for this VF up front: even if we find no uniform
+  // values, the entry ensures we won't analyze this VF again, since
+  // Uniforms.count(VF) will return 1.
+ Uniforms[VF].clear();
+
// We now know that the loop is vectorizable!
// Collect instructions inside the loop that will remain uniform after
// vectorization.
@@ -5568,6 +5760,14 @@ void LoopVectorizationLegality::collectLoopUniforms() {
// Holds pointer operands of instructions that are possibly non-uniform.
SmallPtrSet<Instruction *, 8> PossibleNonUniformPtrs;
+ auto isUniformDecision = [&](Instruction *I, unsigned VF) {
+ InstWidening WideningDecision = getWideningDecision(I, VF);
+ assert(WideningDecision != CM_Unknown &&
+ "Widening decision should be ready at this moment");
+
+ return (WideningDecision == CM_Widen ||
+ WideningDecision == CM_Interleave);
+ };
// Iterate over the instructions in the loop, and collect all
// consecutive-like pointer operands in ConsecutiveLikePtrs. If it's possible
// that a consecutive-like pointer operand will be scalarized, we collect it
@@ -5590,25 +5790,18 @@ void LoopVectorizationLegality::collectLoopUniforms() {
return getPointerOperand(U) == Ptr;
});
- // Ensure the memory instruction will not be scalarized, making its
- // pointer operand non-uniform. If the pointer operand is used by some
- // instruction other than a memory access, we're not going to check if
- // that other instruction may be scalarized here. Thus, conservatively
- // assume the pointer operand may be non-uniform.
- if (!UsersAreMemAccesses || memoryInstructionMustBeScalarized(&I))
+ // Ensure the memory instruction will not be scalarized or used by
+ // gather/scatter, making its pointer operand non-uniform. If the pointer
+ // operand is used by any instruction other than a memory access, we
+ // conservatively assume the pointer operand may be non-uniform.
+ if (!UsersAreMemAccesses || !isUniformDecision(&I, VF))
PossibleNonUniformPtrs.insert(Ptr);
// If the memory instruction will be vectorized and its pointer operand
- // is consecutive-like, the pointer operand should remain uniform.
- else if (hasConsecutiveLikePtrOperand(&I))
- ConsecutiveLikePtrs.insert(Ptr);
-
- // Otherwise, if the memory instruction will be vectorized and its
- // pointer operand is non-consecutive-like, the memory instruction should
- // be a gather or scatter operation. Its pointer operand will be
- // non-uniform.
+      // is consecutive-like or part of an interleave group, the pointer
+      // operand should remain uniform.
else
- PossibleNonUniformPtrs.insert(Ptr);
+ ConsecutiveLikePtrs.insert(Ptr);
}
// Add to the Worklist all consecutive and consecutive-like pointers that
@@ -5632,7 +5825,9 @@ void LoopVectorizationLegality::collectLoopUniforms() {
continue;
auto *OI = cast<Instruction>(OV);
if (all_of(OI->users(), [&](User *U) -> bool {
- return isOutOfScope(U) || Worklist.count(cast<Instruction>(U));
+ auto *J = cast<Instruction>(U);
+ return !TheLoop->contains(J) || Worklist.count(J) ||
+ (OI == getPointerOperand(J) && isUniformDecision(J, VF));
})) {
Worklist.insert(OI);
DEBUG(dbgs() << "LV: Found uniform instruction: " << *OI << "\n");
@@ -5643,7 +5838,7 @@ void LoopVectorizationLegality::collectLoopUniforms() {
// Returns true if Ptr is the pointer operand of a memory access instruction
// I, and I is known to not require scalarization.
auto isVectorizedMemAccessUse = [&](Instruction *I, Value *Ptr) -> bool {
- return getPointerOperand(I) == Ptr && !memoryInstructionMustBeScalarized(I);
+ return getPointerOperand(I) == Ptr && isUniformDecision(I, VF);
};
// For an instruction to be added into Worklist above, all its users inside
@@ -5652,7 +5847,7 @@ void LoopVectorizationLegality::collectLoopUniforms() {
// nodes separately. An induction variable will remain uniform if all users
// of the induction variable and induction variable update remain uniform.
// The code below handles both pointer and non-pointer induction variables.
- for (auto &Induction : Inductions) {
+ for (auto &Induction : *Legal->getInductionVars()) {
auto *Ind = Induction.first;
auto *IndUpdate = cast<Instruction>(Ind->getIncomingValueForBlock(Latch));
@@ -5683,7 +5878,7 @@ void LoopVectorizationLegality::collectLoopUniforms() {
DEBUG(dbgs() << "LV: Found uniform instruction: " << *IndUpdate << "\n");
}
- Uniforms.insert(Worklist.begin(), Worklist.end());
+ Uniforms[VF].insert(Worklist.begin(), Worklist.end());
}
bool LoopVectorizationLegality::canVectorizeMemory() {
@@ -5823,7 +6018,7 @@ void InterleavedAccessInfo::collectConstStrideAccesses(
uint64_t Size = DL.getTypeAllocSize(PtrTy->getElementType());
// An alignment of 0 means target ABI alignment.
- unsigned Align = LI ? LI->getAlignment() : SI->getAlignment();
+ unsigned Align = getMemInstAlignment(&I);
if (!Align)
Align = DL.getABITypeAlignment(PtrTy->getElementType());
@@ -5978,6 +6173,11 @@ void InterleavedAccessInfo::analyzeInterleaving(
if (DesA.Stride != DesB.Stride || DesA.Size != DesB.Size)
continue;
+  // Ignore A if the memory objects of A and B don't belong to the same
+  // address space.
+ if (getMemInstAddressSpace(A) != getMemInstAddressSpace(B))
+ continue;
+
// Calculate the distance from A to B.
const SCEVConstant *DistToB = dyn_cast<SCEVConstant>(
PSE.getSE()->getMinusSCEV(DesA.Scev, DesB.Scev));
@@ -6020,36 +6220,36 @@ void InterleavedAccessInfo::analyzeInterleaving(
if (Group->getNumMembers() != Group->getFactor())
releaseGroup(Group);
- // Remove interleaved groups with gaps (currently only loads) whose memory
- // accesses may wrap around. We have to revisit the getPtrStride analysis,
- // this time with ShouldCheckWrap=true, since collectConstStrideAccesses does
+ // Remove interleaved groups with gaps (currently only loads) whose memory
+ // accesses may wrap around. We have to revisit the getPtrStride analysis,
+ // this time with ShouldCheckWrap=true, since collectConstStrideAccesses does
// not check wrapping (see documentation there).
- // FORNOW we use Assume=false;
- // TODO: Change to Assume=true but making sure we don't exceed the threshold
+ // FORNOW we use Assume=false;
+ // TODO: Change to Assume=true but making sure we don't exceed the threshold
// of runtime SCEV assumptions checks (thereby potentially failing to
- // vectorize altogether).
+ // vectorize altogether).
// Additional optional optimizations:
- // TODO: If we are peeling the loop and we know that the first pointer doesn't
+ // TODO: If we are peeling the loop and we know that the first pointer doesn't
// wrap then we can deduce that all pointers in the group don't wrap.
- // This means that we can forcefully peel the loop in order to only have to
- // check the first pointer for no-wrap. When we'll change to use Assume=true
+ // This means that we can forcefully peel the loop in order to only have to
+ // check the first pointer for no-wrap. When we'll change to use Assume=true
// we'll only need at most one runtime check per interleaved group.
//
for (InterleaveGroup *Group : LoadGroups) {
    // Case 1: A full group. We can skip the checks; for full groups, if the wide
- // load would wrap around the address space we would do a memory access at
- // nullptr even without the transformation.
- if (Group->getNumMembers() == Group->getFactor())
+ // load would wrap around the address space we would do a memory access at
+ // nullptr even without the transformation.
+ if (Group->getNumMembers() == Group->getFactor())
continue;
- // Case 2: If first and last members of the group don't wrap this implies
+ // Case 2: If first and last members of the group don't wrap this implies
// that all the pointers in the group don't wrap.
// So we check only group member 0 (which is always guaranteed to exist),
- // and group member Factor - 1; If the latter doesn't exist we rely on
+ // and group member Factor - 1; If the latter doesn't exist we rely on
    // peeling (if it is a non-reversed access -- see Case 3).
Value *FirstMemberPtr = getPointerOperand(Group->getMember(0));
- if (!getPtrStride(PSE, FirstMemberPtr, TheLoop, Strides, /*Assume=*/false,
+ if (!getPtrStride(PSE, FirstMemberPtr, TheLoop, Strides, /*Assume=*/false,
/*ShouldCheckWrap=*/true)) {
DEBUG(dbgs() << "LV: Invalidate candidate interleaved group due to "
"first group member potentially pointer-wrapping.\n");
@@ -6065,8 +6265,7 @@ void InterleavedAccessInfo::analyzeInterleaving(
"last group member potentially pointer-wrapping.\n");
releaseGroup(Group);
}
- }
- else {
+ } else {
// Case 3: A non-reversed interleaved load group with gaps: We need
// to execute at least one scalar epilogue iteration. This will ensure
// we don't speculatively access memory out-of-bounds. We only need
@@ -6082,27 +6281,62 @@ void InterleavedAccessInfo::analyzeInterleaving(
}
}
-LoopVectorizationCostModel::VectorizationFactor
-LoopVectorizationCostModel::selectVectorizationFactor(bool OptForSize) {
- // Width 1 means no vectorize
- VectorizationFactor Factor = {1U, 0U};
- if (OptForSize && Legal->getRuntimePointerChecking()->Need) {
+Optional<unsigned> LoopVectorizationCostModel::computeMaxVF(bool OptForSize) {
+ if (!EnableCondStoresVectorization && Legal->getNumPredStores()) {
+ ORE->emit(createMissedAnalysis("ConditionalStore")
+ << "store that is conditionally executed prevents vectorization");
+ DEBUG(dbgs() << "LV: No vectorization. There are conditional stores.\n");
+ return None;
+ }
+
+ if (!OptForSize) // Remaining checks deal with scalar loop when OptForSize.
+ return computeFeasibleMaxVF(OptForSize);
+
+ if (Legal->getRuntimePointerChecking()->Need) {
ORE->emit(createMissedAnalysis("CantVersionLoopWithOptForSize")
<< "runtime pointer checks needed. Enable vectorization of this "
"loop with '#pragma clang loop vectorize(enable)' when "
"compiling with -Os/-Oz");
DEBUG(dbgs()
<< "LV: Aborting. Runtime ptr check is required with -Os/-Oz.\n");
- return Factor;
+ return None;
}
- if (!EnableCondStoresVectorization && Legal->getNumPredStores()) {
- ORE->emit(createMissedAnalysis("ConditionalStore")
- << "store that is conditionally executed prevents vectorization");
- DEBUG(dbgs() << "LV: No vectorization. There are conditional stores.\n");
- return Factor;
+ // If we optimize the program for size, avoid creating the tail loop.
+ unsigned TC = PSE.getSE()->getSmallConstantTripCount(TheLoop);
+ DEBUG(dbgs() << "LV: Found trip count: " << TC << '\n');
+
+ // If we don't know the precise trip count, don't try to vectorize.
+ if (TC < 2) {
+ ORE->emit(
+ createMissedAnalysis("UnknownLoopCountComplexCFG")
+ << "unable to calculate the loop count due to complex control flow");
+ DEBUG(dbgs() << "LV: Aborting. A tail loop is required with -Os/-Oz.\n");
+ return None;
}
+ unsigned MaxVF = computeFeasibleMaxVF(OptForSize);
+
+ if (TC % MaxVF != 0) {
+ // If the trip count that we found modulo the vectorization factor is not
+ // zero then we require a tail.
+ // FIXME: look for a smaller MaxVF that does divide TC rather than give up.
+ // FIXME: return None if loop requiresScalarEpilog(<MaxVF>), or look for a
+ // smaller MaxVF that does not require a scalar epilog.
+
+ ORE->emit(createMissedAnalysis("NoTailLoopWithOptForSize")
+ << "cannot optimize for size and vectorize at the "
+ "same time. Enable vectorization of this loop "
+ "with '#pragma clang loop vectorize(enable)' "
+ "when compiling with -Os/-Oz");
+ DEBUG(dbgs() << "LV: Aborting. A tail loop is required with -Os/-Oz.\n");
+ return None;
+ }
+
+ return MaxVF;
+}
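A worked sketch of the -Os/-Oz tail check above, in plain C++ with assumed trip counts: a tail loop is required exactly when the known trip count is not a multiple of MaxVF.

#include <cstdio>

// A tail loop is needed iff the trip count is not a multiple of MaxVF.
static bool requiresTailLoop(unsigned TC, unsigned MaxVF) {
  return TC % MaxVF != 0;
}

int main() {
  std::printf("%d\n", requiresTailLoop(16, 8)); // 0: 16 iterations fill two
                                                // full vector iterations
  std::printf("%d\n", requiresTailLoop(17, 8)); // 1: one leftover iteration,
                                                // so computeMaxVF gives up
                                                // under -Os/-Oz
}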
+
+unsigned LoopVectorizationCostModel::computeFeasibleMaxVF(bool OptForSize) {
MinBWs = computeMinimumValueSizes(TheLoop->getBlocks(), *DB, &TTI);
unsigned SmallestType, WidestType;
std::tie(SmallestType, WidestType) = getSmallestAndWidestTypes();
@@ -6136,7 +6370,7 @@ LoopVectorizationCostModel::selectVectorizationFactor(bool OptForSize) {
assert(MaxVectorSize <= 64 && "Did not expect to pack so many elements"
" into one vector!");
- unsigned VF = MaxVectorSize;
+ unsigned MaxVF = MaxVectorSize;
if (MaximizeBandwidth && !OptForSize) {
// Collect all viable vectorization factors.
SmallVector<unsigned, 8> VFs;
@@ -6152,54 +6386,16 @@ LoopVectorizationCostModel::selectVectorizationFactor(bool OptForSize) {
unsigned TargetNumRegisters = TTI.getNumberOfRegisters(true);
for (int i = RUs.size() - 1; i >= 0; --i) {
if (RUs[i].MaxLocalUsers <= TargetNumRegisters) {
- VF = VFs[i];
+ MaxVF = VFs[i];
break;
}
}
}
+ return MaxVF;
+}
- // If we optimize the program for size, avoid creating the tail loop.
- if (OptForSize) {
- unsigned TC = PSE.getSE()->getSmallConstantTripCount(TheLoop);
- DEBUG(dbgs() << "LV: Found trip count: " << TC << '\n');
-
- // If we don't know the precise trip count, don't try to vectorize.
- if (TC < 2) {
- ORE->emit(
- createMissedAnalysis("UnknownLoopCountComplexCFG")
- << "unable to calculate the loop count due to complex control flow");
- DEBUG(dbgs() << "LV: Aborting. A tail loop is required with -Os/-Oz.\n");
- return Factor;
- }
-
- // Find the maximum SIMD width that can fit within the trip count.
- VF = TC % MaxVectorSize;
-
- if (VF == 0)
- VF = MaxVectorSize;
- else {
- // If the trip count that we found modulo the vectorization factor is not
- // zero then we require a tail.
- ORE->emit(createMissedAnalysis("NoTailLoopWithOptForSize")
- << "cannot optimize for size and vectorize at the "
- "same time. Enable vectorization of this loop "
- "with '#pragma clang loop vectorize(enable)' "
- "when compiling with -Os/-Oz");
- DEBUG(dbgs() << "LV: Aborting. A tail loop is required with -Os/-Oz.\n");
- return Factor;
- }
- }
-
- int UserVF = Hints->getWidth();
- if (UserVF != 0) {
- assert(isPowerOf2_32(UserVF) && "VF needs to be a power of two");
- DEBUG(dbgs() << "LV: Using user VF " << UserVF << ".\n");
-
- Factor.Width = UserVF;
- collectInstsToScalarize(UserVF);
- return Factor;
- }
-
+LoopVectorizationCostModel::VectorizationFactor
+LoopVectorizationCostModel::selectVectorizationFactor(unsigned MaxVF) {
float Cost = expectedCost(1).first;
#ifndef NDEBUG
const float ScalarCost = Cost;
@@ -6209,12 +6405,12 @@ LoopVectorizationCostModel::selectVectorizationFactor(bool OptForSize) {
bool ForceVectorization = Hints->getForce() == LoopVectorizeHints::FK_Enabled;
// Ignore scalar width, because the user explicitly wants vectorization.
- if (ForceVectorization && VF > 1) {
+ if (ForceVectorization && MaxVF > 1) {
Width = 2;
Cost = expectedCost(Width).first / (float)Width;
}
- for (unsigned i = 2; i <= VF; i *= 2) {
+ for (unsigned i = 2; i <= MaxVF; i *= 2) {
// Notice that the vector loop needs to be executed less times, so
// we need to divide the cost of the vector loops by the width of
// the vector elements.
@@ -6238,8 +6434,7 @@ LoopVectorizationCostModel::selectVectorizationFactor(bool OptForSize) {
<< "LV: Vectorization seems to be not beneficial, "
<< "but was forced by a user.\n");
DEBUG(dbgs() << "LV: Selecting VF: " << Width << ".\n");
- Factor.Width = Width;
- Factor.Cost = Width * Cost;
+ VectorizationFactor Factor = {Width, (unsigned)(Width * Cost)};
return Factor;
}
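The selection loop above can be modeled by a short sketch (plain C++; the costs are invented): pick the power-of-two width up to MaxVF that minimizes the expected cost per lane, since the vector loop executes fewer times.

#include <cstdio>
#include <map>

int main() {
  // Hypothetical expectedCost(VF) values for one loop body.
  std::map<unsigned, float> ExpectedCost = {
      {1, 10.f}, {2, 12.f}, {4, 16.f}, {8, 40.f}};
  unsigned MaxVF = 8, Width = 1;
  float Best = ExpectedCost[1] / 1;
  for (unsigned VF = 2; VF <= MaxVF; VF *= 2) {
    float PerLane = ExpectedCost[VF] / VF; // the vector loop runs TC/VF times
    if (PerLane < Best) {
      Best = PerLane;
      Width = VF;
    }
  }
  std::printf("Selected VF: %u\n", Width); // 4 (cost 16/4 = 4 per lane)
}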
@@ -6277,9 +6472,16 @@ LoopVectorizationCostModel::getSmallestAndWidestTypes() {
T = ST->getValueOperand()->getType();
// Ignore loaded pointer types and stored pointer types that are not
- // consecutive. However, we do want to take consecutive stores/loads of
- // pointer vectors into account.
- if (T->isPointerTy() && !isConsecutiveLoadOrStore(&I))
+ // vectorizable.
+ //
+ // FIXME: The check here attempts to predict whether a load or store will
+ // be vectorized. We only know this for certain after a VF has
+ // been selected. Here, we assume that if an access can be
+ // vectorized, it will be. We should also look at extending this
+ // optimization to non-pointer types.
+ //
+ if (T->isPointerTy() && !isConsecutiveLoadOrStore(&I) &&
+ !Legal->isAccessInterleaved(&I) && !Legal->isLegalGatherOrScatter(&I))
continue;
MinWidth = std::min(MinWidth,
@@ -6562,12 +6764,13 @@ LoopVectorizationCostModel::calculateRegisterUsage(ArrayRef<unsigned> VFs) {
MaxUsages[j] = std::max(MaxUsages[j], OpenIntervals.size());
continue;
}
-
+ collectUniformsAndScalars(VFs[j]);
// Count the number of live intervals.
unsigned RegUsage = 0;
for (auto Inst : OpenIntervals) {
// Skip ignored values for VF > 1.
- if (VecValuesToIgnore.count(Inst))
+ if (VecValuesToIgnore.count(Inst) ||
+ isScalarAfterVectorization(Inst, VFs[j]))
continue;
RegUsage += GetRegUsage(Inst->getType(), VFs[j]);
}
@@ -6628,6 +6831,9 @@ void LoopVectorizationCostModel::collectInstsToScalarize(unsigned VF) {
ScalarCostsTy ScalarCosts;
if (computePredInstDiscount(&I, ScalarCosts, VF) >= 0)
ScalarCostsVF.insert(ScalarCosts.begin(), ScalarCosts.end());
+
+ // Remember that BB will remain after vectorization.
+ PredicatedBBsAfterVectorization.insert(BB);
}
}
}
@@ -6636,7 +6842,7 @@ int LoopVectorizationCostModel::computePredInstDiscount(
Instruction *PredInst, DenseMap<Instruction *, unsigned> &ScalarCosts,
unsigned VF) {
- assert(!Legal->isUniformAfterVectorization(PredInst) &&
+ assert(!isUniformAfterVectorization(PredInst, VF) &&
"Instruction marked uniform-after-vectorization will be predicated");
// Initialize the discount to zero, meaning that the scalar version and the
@@ -6657,7 +6863,7 @@ int LoopVectorizationCostModel::computePredInstDiscount(
// already be scalar to avoid traversing chains that are unlikely to be
// beneficial.
if (!I->hasOneUse() || PredInst->getParent() != I->getParent() ||
- Legal->isScalarAfterVectorization(I))
+ isScalarAfterVectorization(I, VF))
return false;
// If the instruction is scalar with predication, it will be analyzed
@@ -6677,7 +6883,7 @@ int LoopVectorizationCostModel::computePredInstDiscount(
// the lane zero values for uniforms rather than asserting.
for (Use &U : I->operands())
if (auto *J = dyn_cast<Instruction>(U.get()))
- if (Legal->isUniformAfterVectorization(J))
+ if (isUniformAfterVectorization(J, VF))
return false;
// Otherwise, we can scalarize the instruction.
@@ -6690,7 +6896,7 @@ int LoopVectorizationCostModel::computePredInstDiscount(
// and their return values are inserted into vectors. Thus, an extract would
// still be required.
auto needsExtract = [&](Instruction *I) -> bool {
- return TheLoop->contains(I) && !Legal->isScalarAfterVectorization(I);
+ return TheLoop->contains(I) && !isScalarAfterVectorization(I, VF);
};
// Compute the expected cost discount from scalarizing the entire expression
@@ -6717,8 +6923,8 @@ int LoopVectorizationCostModel::computePredInstDiscount(
// Compute the scalarization overhead of needed insertelement instructions
// and phi nodes.
if (Legal->isScalarWithPredication(I) && !I->getType()->isVoidTy()) {
- ScalarCost += getScalarizationOverhead(ToVectorTy(I->getType(), VF), true,
- false, TTI);
+ ScalarCost += TTI.getScalarizationOverhead(ToVectorTy(I->getType(), VF),
+ true, false);
ScalarCost += VF * TTI.getCFInstrCost(Instruction::PHI);
}
@@ -6733,8 +6939,8 @@ int LoopVectorizationCostModel::computePredInstDiscount(
if (canBeScalarized(J))
Worklist.push_back(J);
else if (needsExtract(J))
- ScalarCost += getScalarizationOverhead(ToVectorTy(J->getType(), VF),
- false, true, TTI);
+ ScalarCost += TTI.getScalarizationOverhead(
+ ToVectorTy(J->getType(),VF), false, true);
}
// Scale the total scalar cost by block probability.
@@ -6753,6 +6959,9 @@ LoopVectorizationCostModel::VectorizationCostTy
LoopVectorizationCostModel::expectedCost(unsigned VF) {
VectorizationCostTy Cost;
+ // Collect Uniform and Scalar instructions after vectorization with VF.
+ collectUniformsAndScalars(VF);
+
// Collect the instructions (and their associated costs) that will be more
// profitable to scalarize.
collectInstsToScalarize(VF);
@@ -6832,11 +7041,141 @@ static bool isStrideMul(Instruction *I, LoopVectorizationLegality *Legal) {
Legal->hasStride(I->getOperand(1));
}
+unsigned LoopVectorizationCostModel::getMemInstScalarizationCost(Instruction *I,
+ unsigned VF) {
+ Type *ValTy = getMemInstValueType(I);
+ auto SE = PSE.getSE();
+
+ unsigned Alignment = getMemInstAlignment(I);
+ unsigned AS = getMemInstAddressSpace(I);
+ Value *Ptr = getPointerOperand(I);
+ Type *PtrTy = ToVectorTy(Ptr->getType(), VF);
+
+ // Figure out whether the access is strided and get the stride value
+  // if it's known at compile time.
+ const SCEV *PtrSCEV = getAddressAccessSCEV(Ptr, Legal, SE, TheLoop);
+
+ // Get the cost of the scalar memory instruction and address computation.
+ unsigned Cost = VF * TTI.getAddressComputationCost(PtrTy, SE, PtrSCEV);
+
+ Cost += VF *
+ TTI.getMemoryOpCost(I->getOpcode(), ValTy->getScalarType(), Alignment,
+ AS, I);
+
+ // Get the overhead of the extractelement and insertelement instructions
+ // we might create due to scalarization.
+ Cost += getScalarizationOverhead(I, VF, TTI);
+
+ // If we have a predicated store, it may not be executed for each vector
+ // lane. Scale the cost by the probability of executing the predicated
+ // block.
+ if (Legal->isScalarWithPredication(I))
+ Cost /= getReciprocalPredBlockProb();
+
+ return Cost;
+}
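To make the scaling at the end concrete, here is a worked sketch with assumed per-target costs (getReciprocalPredBlockProb() is taken to be 2, i.e. the predicated block is assumed to execute half the time):

#include <cstdio>

int main() {
  unsigned VF = 4;
  unsigned AddrCost = 1, MemOpCost = 2, ScalarizationOverhead = 8; // assumed
  unsigned Cost = VF * AddrCost + VF * MemOpCost + ScalarizationOverhead; // 20
  bool Predicated = true;
  unsigned ReciprocalPredBlockProb = 2; // block runs for half the lanes
  if (Predicated)
    Cost /= ReciprocalPredBlockProb;
  std::printf("scalarized cost: %u\n", Cost); // 10
}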
+
+unsigned LoopVectorizationCostModel::getConsecutiveMemOpCost(Instruction *I,
+ unsigned VF) {
+ Type *ValTy = getMemInstValueType(I);
+ Type *VectorTy = ToVectorTy(ValTy, VF);
+ unsigned Alignment = getMemInstAlignment(I);
+ Value *Ptr = getPointerOperand(I);
+ unsigned AS = getMemInstAddressSpace(I);
+ int ConsecutiveStride = Legal->isConsecutivePtr(Ptr);
+
+ assert((ConsecutiveStride == 1 || ConsecutiveStride == -1) &&
+ "Stride should be 1 or -1 for consecutive memory access");
+ unsigned Cost = 0;
+ if (Legal->isMaskRequired(I))
+ Cost += TTI.getMaskedMemoryOpCost(I->getOpcode(), VectorTy, Alignment, AS);
+ else
+ Cost += TTI.getMemoryOpCost(I->getOpcode(), VectorTy, Alignment, AS, I);
+
+ bool Reverse = ConsecutiveStride < 0;
+ if (Reverse)
+ Cost += TTI.getShuffleCost(TargetTransformInfo::SK_Reverse, VectorTy, 0);
+ return Cost;
+}
+
+unsigned LoopVectorizationCostModel::getUniformMemOpCost(Instruction *I,
+ unsigned VF) {
+ LoadInst *LI = cast<LoadInst>(I);
+ Type *ValTy = LI->getType();
+ Type *VectorTy = ToVectorTy(ValTy, VF);
+ unsigned Alignment = LI->getAlignment();
+ unsigned AS = LI->getPointerAddressSpace();
+
+ return TTI.getAddressComputationCost(ValTy) +
+ TTI.getMemoryOpCost(Instruction::Load, ValTy, Alignment, AS) +
+ TTI.getShuffleCost(TargetTransformInfo::SK_Broadcast, VectorTy);
+}
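The uniform-load cost above is the sum of three scalar-side pieces. A trivial sketch with hypothetical per-target numbers, just to show the shape of the sum (one address computation, one scalar load, one broadcast shuffle):

#include <cstdio>

int main() {
  unsigned AddressComputation = 1, ScalarLoad = 2, BroadcastShuffle = 1;
  std::printf("uniform load cost: %u\n",
              AddressComputation + ScalarLoad + BroadcastShuffle); // 4
}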
+
+unsigned LoopVectorizationCostModel::getGatherScatterCost(Instruction *I,
+ unsigned VF) {
+ Type *ValTy = getMemInstValueType(I);
+ Type *VectorTy = ToVectorTy(ValTy, VF);
+ unsigned Alignment = getMemInstAlignment(I);
+ Value *Ptr = getPointerOperand(I);
+
+ return TTI.getAddressComputationCost(VectorTy) +
+ TTI.getGatherScatterOpCost(I->getOpcode(), VectorTy, Ptr,
+ Legal->isMaskRequired(I), Alignment);
+}
+
+unsigned LoopVectorizationCostModel::getInterleaveGroupCost(Instruction *I,
+ unsigned VF) {
+ Type *ValTy = getMemInstValueType(I);
+ Type *VectorTy = ToVectorTy(ValTy, VF);
+ unsigned AS = getMemInstAddressSpace(I);
+
+ auto Group = Legal->getInterleavedAccessGroup(I);
+ assert(Group && "Fail to get an interleaved access group.");
+
+ unsigned InterleaveFactor = Group->getFactor();
+ Type *WideVecTy = VectorType::get(ValTy, VF * InterleaveFactor);
+
+ // Holds the indices of existing members in an interleaved load group.
+ // An interleaved store group doesn't need this as it doesn't allow gaps.
+ SmallVector<unsigned, 4> Indices;
+ if (isa<LoadInst>(I)) {
+ for (unsigned i = 0; i < InterleaveFactor; i++)
+ if (Group->getMember(i))
+ Indices.push_back(i);
+ }
+
+ // Calculate the cost of the whole interleaved group.
+ unsigned Cost = TTI.getInterleavedMemoryOpCost(I->getOpcode(), WideVecTy,
+ Group->getFactor(), Indices,
+ Group->getAlignment(), AS);
+
+ if (Group->isReverse())
+ Cost += Group->getNumMembers() *
+ TTI.getShuffleCost(TargetTransformInfo::SK_Reverse, VectorTy, 0);
+ return Cost;
+}
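The Indices collection above only records existing members, since interleaved load groups may have gaps. A small sketch with an assumed factor-3 group whose middle member is missing:

#include <cstdio>
#include <vector>

int main() {
  // An interleave-factor-3 load group where member 1 is missing (a gap):
  // only the indices of existing members are passed to the cost query.
  std::vector<bool> HasMember = {true, false, true};
  std::vector<unsigned> Indices;
  for (unsigned i = 0; i < HasMember.size(); i++)
    if (HasMember[i])
      Indices.push_back(i);
  for (unsigned i : Indices)
    std::printf("%u ", i); // 0 2
  std::printf("\n");
}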
+
+unsigned LoopVectorizationCostModel::getMemoryInstructionCost(Instruction *I,
+ unsigned VF) {
+
+ // Calculate scalar cost only. Vectorization cost should be ready at this
+ // moment.
+ if (VF == 1) {
+ Type *ValTy = getMemInstValueType(I);
+ unsigned Alignment = getMemInstAlignment(I);
+    unsigned AS = getMemInstAddressSpace(I);
+
+ return TTI.getAddressComputationCost(ValTy) +
+ TTI.getMemoryOpCost(I->getOpcode(), ValTy, Alignment, AS, I);
+ }
+ return getWideningCost(I, VF);
+}
+
LoopVectorizationCostModel::VectorizationCostTy
LoopVectorizationCostModel::getInstructionCost(Instruction *I, unsigned VF) {
// If we know that this instruction will remain uniform, check the cost of
// the scalar version.
- if (Legal->isUniformAfterVectorization(I))
+ if (isUniformAfterVectorization(I, VF))
VF = 1;
if (VF > 1 && isProfitableToScalarize(I, VF))
@@ -6850,6 +7189,79 @@ LoopVectorizationCostModel::getInstructionCost(Instruction *I, unsigned VF) {
return VectorizationCostTy(C, TypeNotScalarized);
}
+void LoopVectorizationCostModel::setCostBasedWideningDecision(unsigned VF) {
+ if (VF == 1)
+ return;
+ for (BasicBlock *BB : TheLoop->blocks()) {
+ // For each instruction in the old loop.
+ for (Instruction &I : *BB) {
+ Value *Ptr = getPointerOperand(&I);
+ if (!Ptr)
+ continue;
+
+ if (isa<LoadInst>(&I) && Legal->isUniform(Ptr)) {
+ // Scalar load + broadcast
+ unsigned Cost = getUniformMemOpCost(&I, VF);
+ setWideningDecision(&I, VF, CM_Scalarize, Cost);
+ continue;
+ }
+
+ // We assume that widening is the best solution when possible.
+ if (Legal->memoryInstructionCanBeWidened(&I, VF)) {
+ unsigned Cost = getConsecutiveMemOpCost(&I, VF);
+ setWideningDecision(&I, VF, CM_Widen, Cost);
+ continue;
+ }
+
+ // Choose between Interleaving, Gather/Scatter or Scalarization.
+ unsigned InterleaveCost = UINT_MAX;
+ unsigned NumAccesses = 1;
+ if (Legal->isAccessInterleaved(&I)) {
+ auto Group = Legal->getInterleavedAccessGroup(&I);
+ assert(Group && "Fail to get an interleaved access group.");
+
+ // Make one decision for the whole group.
+ if (getWideningDecision(&I, VF) != CM_Unknown)
+ continue;
+
+ NumAccesses = Group->getNumMembers();
+ InterleaveCost = getInterleaveGroupCost(&I, VF);
+ }
+
+ unsigned GatherScatterCost =
+ Legal->isLegalGatherOrScatter(&I)
+ ? getGatherScatterCost(&I, VF) * NumAccesses
+ : UINT_MAX;
+
+ unsigned ScalarizationCost =
+ getMemInstScalarizationCost(&I, VF) * NumAccesses;
+
+      // Choose the better solution for the current VF, record the decision,
+      // and use it during vectorization.
+ unsigned Cost;
+ InstWidening Decision;
+ if (InterleaveCost <= GatherScatterCost &&
+ InterleaveCost < ScalarizationCost) {
+ Decision = CM_Interleave;
+ Cost = InterleaveCost;
+ } else if (GatherScatterCost < ScalarizationCost) {
+ Decision = CM_GatherScatter;
+ Cost = GatherScatterCost;
+ } else {
+ Decision = CM_Scalarize;
+ Cost = ScalarizationCost;
+ }
+      // If the instruction belongs to an interleave group, the whole group
+      // receives the same decision. The group as a whole is charged the
+      // cost, but that cost is actually assigned to a single instruction.
+ if (auto Group = Legal->getInterleavedAccessGroup(&I))
+ setWideningDecision(Group, VF, Decision, Cost);
+ else
+ setWideningDecision(&I, VF, Decision, Cost);
+ }
+ }
+}
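The three-way choice above can be restated as a self-contained sketch (plain C++; the costs are hypothetical). Note the tie-breaking mirrors the code: interleaving wins ties against gather/scatter but must be strictly cheaper than scalarization.

#include <climits>
#include <cstdio>

int main() {
  // Hypothetical costs for one memory access at a given VF; UINT_MAX marks
  // an option that is not legal for this access.
  unsigned InterleaveCost = 12, GatherScatterCost = UINT_MAX,
           ScalarizationCost = 20;
  const char *Decision;
  unsigned Cost;
  if (InterleaveCost <= GatherScatterCost &&
      InterleaveCost < ScalarizationCost) {
    Decision = "interleave";
    Cost = InterleaveCost;
  } else if (GatherScatterCost < ScalarizationCost) {
    Decision = "gather/scatter";
    Cost = GatherScatterCost;
  } else {
    Decision = "scalarize";
    Cost = ScalarizationCost;
  }
  std::printf("%s (cost %u)\n", Decision, Cost); // interleave (cost 12)
}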
+
unsigned LoopVectorizationCostModel::getInstructionCost(Instruction *I,
unsigned VF,
Type *&VectorTy) {
@@ -6868,7 +7280,31 @@ unsigned LoopVectorizationCostModel::getInstructionCost(Instruction *I,
// instruction cost.
return 0;
case Instruction::Br: {
- return TTI.getCFInstrCost(I->getOpcode());
+ // In cases of scalarized and predicated instructions, there will be VF
+    // predicated blocks in the vectorized loop. Each branch around these
+    // blocks also requires an extract of the corresponding i1 element of
+    // the vector compare.
+ bool ScalarPredicatedBB = false;
+ BranchInst *BI = cast<BranchInst>(I);
+ if (VF > 1 && BI->isConditional() &&
+ (PredicatedBBsAfterVectorization.count(BI->getSuccessor(0)) ||
+ PredicatedBBsAfterVectorization.count(BI->getSuccessor(1))))
+ ScalarPredicatedBB = true;
+
+ if (ScalarPredicatedBB) {
+ // Return cost for branches around scalarized and predicated blocks.
+ Type *Vec_i1Ty =
+ VectorType::get(IntegerType::getInt1Ty(RetTy->getContext()), VF);
+ return (TTI.getScalarizationOverhead(Vec_i1Ty, false, true) +
+ (TTI.getCFInstrCost(Instruction::Br) * VF));
+ } else if (I->getParent() == TheLoop->getLoopLatch() || VF == 1)
+ // The back-edge branch will remain, as will all scalar branches.
+ return TTI.getCFInstrCost(Instruction::Br);
+ else
+ // This branch will be eliminated by if-conversion.
+ return 0;
+ // Note: We currently assume zero cost for an unconditional branch inside
+ // a predicated block since it will become a fall-through, although we
+ // may decide in the future to call TTI for all branches.
}
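A worked sketch of the predicated-block branch cost above, with assumed TTI numbers (both the scalarization overhead of the i1 vector and the per-branch cost are invented): the total is one extraction-overhead term plus VF scalar branches.

#include <cstdio>

int main() {
  unsigned VF = 4;
  unsigned ExtractI1Overhead = 4; // assumed TTI.getScalarizationOverhead(...)
  unsigned BranchCost = 1;        // assumed TTI.getCFInstrCost(Br)
  std::printf("cost: %u\n", ExtractI1Overhead + BranchCost * VF); // cost: 8
}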
case Instruction::PHI: {
auto *Phi = cast<PHINode>(I);
@@ -6969,7 +7405,7 @@ unsigned LoopVectorizationCostModel::getInstructionCost(Instruction *I,
if (!ScalarCond)
CondTy = VectorType::get(CondTy, VF);
- return TTI.getCmpSelInstrCost(I->getOpcode(), VectorTy, CondTy);
+ return TTI.getCmpSelInstrCost(I->getOpcode(), VectorTy, CondTy, I);
}
case Instruction::ICmp:
case Instruction::FCmp: {
@@ -6978,130 +7414,12 @@ unsigned LoopVectorizationCostModel::getInstructionCost(Instruction *I,
if (canTruncateToMinimalBitwidth(Op0AsInstruction, VF))
ValTy = IntegerType::get(ValTy->getContext(), MinBWs[Op0AsInstruction]);
VectorTy = ToVectorTy(ValTy, VF);
- return TTI.getCmpSelInstrCost(I->getOpcode(), VectorTy);
+ return TTI.getCmpSelInstrCost(I->getOpcode(), VectorTy, nullptr, I);
}
case Instruction::Store:
case Instruction::Load: {
- StoreInst *SI = dyn_cast<StoreInst>(I);
- LoadInst *LI = dyn_cast<LoadInst>(I);
- Type *ValTy = (SI ? SI->getValueOperand()->getType() : LI->getType());
- VectorTy = ToVectorTy(ValTy, VF);
-
- unsigned Alignment = SI ? SI->getAlignment() : LI->getAlignment();
- unsigned AS =
- SI ? SI->getPointerAddressSpace() : LI->getPointerAddressSpace();
- Value *Ptr = getPointerOperand(I);
- // We add the cost of address computation here instead of with the gep
- // instruction because only here we know whether the operation is
- // scalarized.
- if (VF == 1)
- return TTI.getAddressComputationCost(VectorTy) +
- TTI.getMemoryOpCost(I->getOpcode(), VectorTy, Alignment, AS);
-
- if (LI && Legal->isUniform(Ptr)) {
- // Scalar load + broadcast
- unsigned Cost = TTI.getAddressComputationCost(ValTy->getScalarType());
- Cost += TTI.getMemoryOpCost(I->getOpcode(), ValTy->getScalarType(),
- Alignment, AS);
- return Cost +
- TTI.getShuffleCost(TargetTransformInfo::SK_Broadcast, ValTy);
- }
-
- // For an interleaved access, calculate the total cost of the whole
- // interleave group.
- if (Legal->isAccessInterleaved(I)) {
- auto Group = Legal->getInterleavedAccessGroup(I);
- assert(Group && "Fail to get an interleaved access group.");
-
- // Only calculate the cost once at the insert position.
- if (Group->getInsertPos() != I)
- return 0;
-
- unsigned InterleaveFactor = Group->getFactor();
- Type *WideVecTy =
- VectorType::get(VectorTy->getVectorElementType(),
- VectorTy->getVectorNumElements() * InterleaveFactor);
-
- // Holds the indices of existing members in an interleaved load group.
- // An interleaved store group doesn't need this as it doesn't allow gaps.
- SmallVector<unsigned, 4> Indices;
- if (LI) {
- for (unsigned i = 0; i < InterleaveFactor; i++)
- if (Group->getMember(i))
- Indices.push_back(i);
- }
-
- // Calculate the cost of the whole interleaved group.
- unsigned Cost = TTI.getInterleavedMemoryOpCost(
- I->getOpcode(), WideVecTy, Group->getFactor(), Indices,
- Group->getAlignment(), AS);
-
- if (Group->isReverse())
- Cost +=
- Group->getNumMembers() *
- TTI.getShuffleCost(TargetTransformInfo::SK_Reverse, VectorTy, 0);
-
- // FIXME: The interleaved load group with a huge gap could be even more
- // expensive than scalar operations. Then we could ignore such group and
- // use scalar operations instead.
- return Cost;
- }
-
- // Check if the memory instruction will be scalarized.
- if (Legal->memoryInstructionMustBeScalarized(I, VF)) {
- unsigned Cost = 0;
- Type *PtrTy = ToVectorTy(Ptr->getType(), VF);
-
- // Figure out whether the access is strided and get the stride value
- // if it's known in compile time
- const SCEV *PtrSCEV = getAddressAccessSCEV(Ptr, Legal, SE, TheLoop);
-
- // Get the cost of the scalar memory instruction and address computation.
- Cost += VF * TTI.getAddressComputationCost(PtrTy, SE, PtrSCEV);
- Cost += VF *
- TTI.getMemoryOpCost(I->getOpcode(), ValTy->getScalarType(),
- Alignment, AS);
-
- // Get the overhead of the extractelement and insertelement instructions
- // we might create due to scalarization.
- Cost += getScalarizationOverhead(I, VF, TTI);
-
- // If we have a predicated store, it may not be executed for each vector
- // lane. Scale the cost by the probability of executing the predicated
- // block.
- if (Legal->isScalarWithPredication(I))
- Cost /= getReciprocalPredBlockProb();
-
- return Cost;
- }
-
- // Determine if the pointer operand of the access is either consecutive or
- // reverse consecutive.
- int ConsecutiveStride = Legal->isConsecutivePtr(Ptr);
- bool Reverse = ConsecutiveStride < 0;
-
- // Determine if either a gather or scatter operation is legal.
- bool UseGatherOrScatter =
- !ConsecutiveStride && Legal->isLegalGatherOrScatter(I);
-
- unsigned Cost = TTI.getAddressComputationCost(VectorTy);
- if (UseGatherOrScatter) {
- assert(ConsecutiveStride == 0 &&
- "Gather/Scatter are not used for consecutive stride");
- return Cost +
- TTI.getGatherScatterOpCost(I->getOpcode(), VectorTy, Ptr,
- Legal->isMaskRequired(I), Alignment);
- }
- // Wide load/stores.
- if (Legal->isMaskRequired(I))
- Cost +=
- TTI.getMaskedMemoryOpCost(I->getOpcode(), VectorTy, Alignment, AS);
- else
- Cost += TTI.getMemoryOpCost(I->getOpcode(), VectorTy, Alignment, AS);
-
- if (Reverse)
- Cost += TTI.getShuffleCost(TargetTransformInfo::SK_Reverse, VectorTy, 0);
- return Cost;
+ VectorTy = ToVectorTy(getMemInstValueType(I), VF);
+ return getMemoryInstructionCost(I, VF);
}
case Instruction::ZExt:
case Instruction::SExt:
@@ -7115,12 +7433,14 @@ unsigned LoopVectorizationCostModel::getInstructionCost(Instruction *I,
case Instruction::Trunc:
case Instruction::FPTrunc:
case Instruction::BitCast: {
- // We optimize the truncation of induction variable.
- // The cost of these is the same as the scalar operation.
- if (I->getOpcode() == Instruction::Trunc &&
- Legal->isInductionVariable(I->getOperand(0)))
- return TTI.getCastInstrCost(I->getOpcode(), I->getType(),
- I->getOperand(0)->getType());
+ // We optimize the truncation of induction variables having constant
+ // integer steps. The cost of these truncations is the same as the scalar
+ // operation.
+ if (isOptimizableIVTruncate(I, VF)) {
+ auto *Trunc = cast<TruncInst>(I);
+ return TTI.getCastInstrCost(Instruction::Trunc, Trunc->getDestTy(),
+ Trunc->getSrcTy(), Trunc);
+ }
Type *SrcScalarTy = I->getOperand(0)->getType();
Type *SrcVecTy = ToVectorTy(SrcScalarTy, VF);
@@ -7143,7 +7463,7 @@ unsigned LoopVectorizationCostModel::getInstructionCost(Instruction *I,
}
}
- return TTI.getCastInstrCost(I->getOpcode(), VectorTy, SrcVecTy);
+ return TTI.getCastInstrCost(I->getOpcode(), VectorTy, SrcVecTy, I);
}
case Instruction::Call: {
bool NeedToScalarize;
@@ -7172,9 +7492,7 @@ INITIALIZE_PASS_DEPENDENCY(AssumptionCacheTracker)
INITIALIZE_PASS_DEPENDENCY(BlockFrequencyInfoWrapperPass)
INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass)
INITIALIZE_PASS_DEPENDENCY(ScalarEvolutionWrapperPass)
-INITIALIZE_PASS_DEPENDENCY(LCSSAWrapperPass)
INITIALIZE_PASS_DEPENDENCY(LoopInfoWrapperPass)
-INITIALIZE_PASS_DEPENDENCY(LoopSimplify)
INITIALIZE_PASS_DEPENDENCY(LoopAccessLegacyAnalysis)
INITIALIZE_PASS_DEPENDENCY(DemandedBitsWrapperPass)
INITIALIZE_PASS_DEPENDENCY(OptimizationRemarkEmitterWrapperPass)
@@ -7206,81 +7524,34 @@ void LoopVectorizationCostModel::collectValuesToIgnore() {
SmallPtrSetImpl<Instruction *> &Casts = RedDes.getCastInsts();
VecValuesToIgnore.insert(Casts.begin(), Casts.end());
}
-
- // Insert values known to be scalar into VecValuesToIgnore. This is a
- // conservative estimation of the values that will later be scalarized.
- //
- // FIXME: Even though an instruction is not scalar-after-vectoriztion, it may
- // still be scalarized. For example, we may find an instruction to be
- // more profitable for a given vectorization factor if it were to be
- // scalarized. But at this point, we haven't yet computed the
- // vectorization factor.
- for (auto *BB : TheLoop->getBlocks())
- for (auto &I : *BB)
- if (Legal->isScalarAfterVectorization(&I))
- VecValuesToIgnore.insert(&I);
}
-void InnerLoopUnroller::scalarizeInstruction(Instruction *Instr,
- bool IfPredicateInstr) {
- assert(!Instr->getType()->isAggregateType() && "Can't handle vectors");
- // Holds vector parameters or scalars, in case of uniform vals.
- SmallVector<VectorParts, 4> Params;
-
- setDebugLocFromInst(Builder, Instr);
-
- // Does this instruction return a value ?
- bool IsVoidRetTy = Instr->getType()->isVoidTy();
-
- // Initialize a new scalar map entry.
- ScalarParts Entry(UF);
-
- VectorParts Cond;
- if (IfPredicateInstr)
- Cond = createBlockInMask(Instr->getParent());
-
- // For each vector unroll 'part':
- for (unsigned Part = 0; Part < UF; ++Part) {
- Entry[Part].resize(1);
- // For each scalar that we create:
-
- // Start an "if (pred) a[i] = ..." block.
- Value *Cmp = nullptr;
- if (IfPredicateInstr) {
- if (Cond[Part]->getType()->isVectorTy())
- Cond[Part] =
- Builder.CreateExtractElement(Cond[Part], Builder.getInt32(0));
- Cmp = Builder.CreateICmp(ICmpInst::ICMP_EQ, Cond[Part],
- ConstantInt::get(Cond[Part]->getType(), 1));
- }
-
- Instruction *Cloned = Instr->clone();
- if (!IsVoidRetTy)
- Cloned->setName(Instr->getName() + ".cloned");
-
- // Replace the operands of the cloned instructions with their scalar
- // equivalents in the new loop.
- for (unsigned op = 0, e = Instr->getNumOperands(); op != e; ++op) {
- auto *NewOp = getScalarValue(Instr->getOperand(op), Part, 0);
- Cloned->setOperand(op, NewOp);
- }
+LoopVectorizationCostModel::VectorizationFactor
+LoopVectorizationPlanner::plan(bool OptForSize, unsigned UserVF) {
- // Place the cloned scalar in the new loop.
- Builder.Insert(Cloned);
+  // Width 1 means no vectorization; cost 0 means the cost is not yet computed.
+ const LoopVectorizationCostModel::VectorizationFactor NoVectorization = {1U,
+ 0U};
+ Optional<unsigned> MaybeMaxVF = CM.computeMaxVF(OptForSize);
+ if (!MaybeMaxVF.hasValue()) // Cases considered too costly to vectorize.
+ return NoVectorization;
- // Add the cloned scalar to the scalar map entry.
- Entry[Part][0] = Cloned;
+ if (UserVF) {
+ DEBUG(dbgs() << "LV: Using user VF " << UserVF << ".\n");
+ assert(isPowerOf2_32(UserVF) && "VF needs to be a power of two");
+ // Collect the instructions (and their associated costs) that will be more
+ // profitable to scalarize.
+ CM.selectUserVectorizationFactor(UserVF);
+ return {UserVF, 0};
+ }
- // If we just cloned a new assumption, add it the assumption cache.
- if (auto *II = dyn_cast<IntrinsicInst>(Cloned))
- if (II->getIntrinsicID() == Intrinsic::assume)
- AC->registerAssumption(II);
+ unsigned MaxVF = MaybeMaxVF.getValue();
+ assert(MaxVF != 0 && "MaxVF is zero.");
+ if (MaxVF == 1)
+ return NoVectorization;
- // End if-block.
- if (IfPredicateInstr)
- PredicatedInstructions.push_back(std::make_pair(Cloned, Cmp));
- }
- VectorLoopValueMap.initScalar(Instr, Entry);
+ // Select the optimal vectorization factor.
+ return CM.selectVectorizationFactor(MaxVF);
}
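
A condensed standalone model of the planning entry point above. std::optional stands in for LLVM's Optional, selectBest for the cost-model search, and OptForSize is omitted since it is folded into the MaxVF computation; everything here is illustrative:

#include <cstdio>
#include <optional>

struct VectorizationFactor { unsigned Width, Cost; };

static VectorizationFactor plan(unsigned UserVF, std::optional<unsigned> MaxVF,
                                VectorizationFactor (*selectBest)(unsigned)) {
  const VectorizationFactor NoVectorization = {1, 0}; // width 1: don't vectorize
  if (!MaxVF)
    return NoVectorization;  // deemed too costly up front
  if (UserVF)
    return {UserVF, 0};      // honor a user-forced width verbatim
  if (*MaxVF == 1)
    return NoVectorization;
  return selectBest(*MaxVF); // search widths up to MaxVF
}

int main() {
  auto Best = [](unsigned MaxVF) { return VectorizationFactor{MaxVF, 42}; };
  std::printf("%u\n", plan(0, 8, Best).Width); // 8
}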
void InnerLoopUnroller::vectorizeMemoryInstruction(Instruction *Instr) {
@@ -7414,11 +7685,6 @@ bool LoopVectorizePass::processLoop(Loop *L) {
return false;
}
- // Use the cost model.
- LoopVectorizationCostModel CM(L, PSE, LI, &LVL, *TTI, TLI, DB, AC, ORE, F,
- &Hints);
- CM.collectValuesToIgnore();
-
// Check the function attributes to find out if this function should be
// optimized for size.
bool OptForSize =
@@ -7464,9 +7730,20 @@ bool LoopVectorizePass::processLoop(Loop *L) {
return false;
}
- // Select the optimal vectorization factor.
- const LoopVectorizationCostModel::VectorizationFactor VF =
- CM.selectVectorizationFactor(OptForSize);
+ // Use the cost model.
+ LoopVectorizationCostModel CM(L, PSE, LI, &LVL, *TTI, TLI, DB, AC, ORE, F,
+ &Hints);
+ CM.collectValuesToIgnore();
+
+ // Use the planner for vectorization.
+ LoopVectorizationPlanner LVP(CM);
+
+ // Get user vectorization factor.
+ unsigned UserVF = Hints.getWidth();
+
+ // Plan how to best vectorize, return the best VF and its cost.
+ LoopVectorizationCostModel::VectorizationFactor VF =
+ LVP.plan(OptForSize, UserVF);
// Select the interleave count.
unsigned IC = CM.selectInterleaveCount(OptForSize, VF.Width, VF.Cost);
@@ -7522,10 +7799,10 @@ bool LoopVectorizePass::processLoop(Loop *L) {
const char *VAPassName = Hints.vectorizeAnalysisPassName();
if (!VectorizeLoop && !InterleaveLoop) {
      // Do not vectorize or interleave the loop.
- ORE->emit(OptimizationRemarkAnalysis(VAPassName, VecDiagMsg.first,
+ ORE->emit(OptimizationRemarkMissed(VAPassName, VecDiagMsg.first,
L->getStartLoc(), L->getHeader())
<< VecDiagMsg.second);
- ORE->emit(OptimizationRemarkAnalysis(LV_NAME, IntDiagMsg.first,
+ ORE->emit(OptimizationRemarkMissed(LV_NAME, IntDiagMsg.first,
L->getStartLoc(), L->getHeader())
<< IntDiagMsg.second);
return false;
@@ -7621,6 +7898,16 @@ bool LoopVectorizePass::runImpl(
if (!TTI->getNumberOfRegisters(true) && TTI->getMaxInterleaveFactor(1) < 2)
return false;
+ bool Changed = false;
+
+ // The vectorizer requires loops to be in simplified form.
+ // Since simplification may add new inner loops, it has to run before the
+ // legality and profitability checks. This means running the loop vectorizer
+  // will simplify all loops, regardless of whether anything ends up being
+ // vectorized.
+ for (auto &L : *LI)
+ Changed |= simplifyLoop(L, DT, LI, SE, AC, false /* PreserveLCSSA */);
+
// Build up a worklist of inner-loops to vectorize. This is necessary as
// the act of vectorizing or partially unrolling a loop creates new loops
// and can invalidate iterators across the loops.
@@ -7632,9 +7919,15 @@ bool LoopVectorizePass::runImpl(
LoopsAnalyzed += Worklist.size();
// Now walk the identified inner loops.
- bool Changed = false;
- while (!Worklist.empty())
- Changed |= processLoop(Worklist.pop_back_val());
+ while (!Worklist.empty()) {
+ Loop *L = Worklist.pop_back_val();
+
+ // For the inner loops we actually process, form LCSSA to simplify the
+ // transform.
+ Changed |= formLCSSARecursively(*L, *DT, LI, SE);
+
+ Changed |= processLoop(L);
+ }
// Process each loop nest in the function.
return Changed;
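
The reordering in this hunk is subtle: simplification must run over every loop before any legality check (it can create new inner loops), while LCSSA formation is now deferred to the loops actually visited. A toy driver preserving only that phase ordering, with Loop and the per-phase actions as placeholders:

#include <vector>

struct Loop { bool Simplified = false, LCSSA = false, Processed = false; };

static bool runShape(std::vector<Loop> &Loops) {
  bool Changed = false;
  for (Loop &L : Loops) { // phase 1: simplify all loops up front
    L.Simplified = true;
    Changed = true;
  }
  for (Loop &L : Loops) { // phase 2: worklist processing
    L.LCSSA = true;       // formLCSSARecursively would go here
    L.Processed = true;   // then processLoop
  }
  return Changed;
}

int main() {
  std::vector<Loop> Ls(3);
  return runShape(Ls) ? 0 : 1;
}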
diff --git a/lib/Transforms/Vectorize/SLPVectorizer.cpp b/lib/Transforms/Vectorize/SLPVectorizer.cpp
index 328f27002960..da3ac06ab464 100644
--- a/lib/Transforms/Vectorize/SLPVectorizer.cpp
+++ b/lib/Transforms/Vectorize/SLPVectorizer.cpp
@@ -39,6 +39,7 @@
#include "llvm/Pass.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/Debug.h"
+#include "llvm/Support/GraphWriter.h"
#include "llvm/Support/raw_ostream.h"
#include "llvm/Transforms/Vectorize.h"
#include <algorithm>
@@ -90,6 +91,10 @@ static cl::opt<unsigned> MinTreeSize(
"slp-min-tree-size", cl::init(3), cl::Hidden,
cl::desc("Only vectorize small trees if they are fully vectorizable"));
+static cl::opt<bool>
+ ViewSLPTree("view-slp-tree", cl::Hidden,
+ cl::desc("Display the SLP trees with Graphviz"));
+
// Limit the number of alias checks. The limit is chosen so that
// it has no negative effect on the llvm benchmarks.
static const unsigned AliasedCheckLimit = 10;
@@ -212,14 +217,14 @@ static unsigned getSameOpcode(ArrayRef<Value *> VL) {
/// Flag set: NSW, NUW, exact, and all of fast-math.
static void propagateIRFlags(Value *I, ArrayRef<Value *> VL) {
if (auto *VecOp = dyn_cast<Instruction>(I)) {
- if (auto *Intersection = dyn_cast<Instruction>(VL[0])) {
- // Intersection is initialized to the 0th scalar,
- // so start counting from index '1'.
+ if (auto *I0 = dyn_cast<Instruction>(VL[0])) {
+      // VecOp is initialized to the 0th scalar, so start counting from index
+ // '1'.
+ VecOp->copyIRFlags(I0);
for (int i = 1, e = VL.size(); i < e; ++i) {
if (auto *Scalar = dyn_cast<Instruction>(VL[i]))
- Intersection->andIRFlags(Scalar);
+ VecOp->andIRFlags(Scalar);
}
- VecOp->copyIRFlags(Intersection);
}
}
}
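
A standalone model of the flag propagation just fixed above: the vector op starts from the 0th scalar's flags and intersects with each remaining scalar, so a flag survives only if every scalar had it. The bitmask encoding below is purely illustrative:

#include <cstdio>
#include <vector>

// Assumes a non-empty flag list; bit 0 models nsw, bit 1 models nuw.
static unsigned intersectFlags(const std::vector<unsigned> &ScalarFlags) {
  unsigned Flags = ScalarFlags[0];       // copyIRFlags(I0)
  for (size_t i = 1; i < ScalarFlags.size(); ++i)
    Flags &= ScalarFlags[i];             // andIRFlags(Scalar)
  return Flags;
}

int main() {
  // One add lacking nuw drops it from the vector add; nsw survives.
  std::printf("%u\n", intersectFlags({3, 3, 1})); // prints 1
}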
@@ -304,6 +309,8 @@ public:
typedef SmallVector<Instruction *, 16> InstrList;
typedef SmallPtrSet<Value *, 16> ValueSet;
typedef SmallVector<StoreInst *, 8> StoreList;
+ typedef MapVector<Value *, SmallVector<Instruction *, 2>>
+ ExtraValueToDebugLocsMap;
BoUpSLP(Function *Func, ScalarEvolution *Se, TargetTransformInfo *Tti,
TargetLibraryInfo *TLi, AliasAnalysis *Aa, LoopInfo *Li,
@@ -330,6 +337,10 @@ public:
/// \brief Vectorize the tree that starts with the elements in \p VL.
/// Returns the vectorized root.
Value *vectorizeTree();
+ /// Vectorize the tree but with the list of externally used values \p
+  /// ExternallyUsedValues. Values in this MapVector can be replaced by the
+  /// generated extractelement instructions.
+ Value *vectorizeTree(ExtraValueToDebugLocsMap &ExternallyUsedValues);
/// \returns the cost incurred by unwanted spills and fills, caused by
/// holding live values over call sites.
@@ -343,6 +354,13 @@ public:
/// the purpose of scheduling and extraction in the \p UserIgnoreLst.
void buildTree(ArrayRef<Value *> Roots,
ArrayRef<Value *> UserIgnoreLst = None);
+ /// Construct a vectorizable tree that starts at \p Roots, ignoring users for
+ /// the purpose of scheduling and extraction in the \p UserIgnoreLst taking
+  /// into account (and updating it, if required) the list of externally used
+ /// values stored in \p ExternallyUsedValues.
+ void buildTree(ArrayRef<Value *> Roots,
+ ExtraValueToDebugLocsMap &ExternallyUsedValues,
+ ArrayRef<Value *> UserIgnoreLst = None);
/// Clear the internal data structures that are created by 'buildTree'.
void deleteTree() {
@@ -404,7 +422,7 @@ private:
int getEntryCost(TreeEntry *E);
/// This is the recursive part of buildTree.
- void buildTree_rec(ArrayRef<Value *> Roots, unsigned Depth);
+  void buildTree_rec(ArrayRef<Value *> Roots, unsigned Depth, int UserTreeIdx);
/// \returns True if the ExtractElement/ExtractValue instructions in VL can
/// be vectorized to use the original vector (or aggregate "bitcast" to a vector).
@@ -451,8 +469,9 @@ private:
SmallVectorImpl<Value *> &Left,
SmallVectorImpl<Value *> &Right);
struct TreeEntry {
- TreeEntry() : Scalars(), VectorizedValue(nullptr),
- NeedToGather(0) {}
+ TreeEntry(std::vector<TreeEntry> &Container)
+ : Scalars(), VectorizedValue(nullptr), NeedToGather(0),
+ Container(Container) {}
/// \returns true if the scalars in VL are equal to this entry.
bool isSame(ArrayRef<Value *> VL) const {
@@ -468,11 +487,24 @@ private:
/// Do we need to gather this sequence ?
bool NeedToGather;
+
+ /// Points back to the VectorizableTree.
+ ///
+    /// Only used for Graphviz right now. Unfortunately GraphTraits::NodeRef has
+ /// to be a pointer and needs to be able to initialize the child iterator.
+ /// Thus we need a reference back to the container to translate the indices
+ /// to entries.
+ std::vector<TreeEntry> &Container;
+
+ /// The TreeEntry index containing the user of this entry. We can actually
+ /// have multiple users so the data structure is not truly a tree.
+ SmallVector<int, 1> UserTreeIndices;
};
/// Create a new VectorizableTree entry.
- TreeEntry *newTreeEntry(ArrayRef<Value *> VL, bool Vectorized) {
- VectorizableTree.emplace_back();
+ TreeEntry *newTreeEntry(ArrayRef<Value *> VL, bool Vectorized,
+ int &UserTreeIdx) {
+ VectorizableTree.emplace_back(VectorizableTree);
int idx = VectorizableTree.size() - 1;
TreeEntry *Last = &VectorizableTree[idx];
Last->Scalars.insert(Last->Scalars.begin(), VL.begin(), VL.end());
@@ -485,6 +517,10 @@ private:
} else {
MustGather.insert(VL.begin(), VL.end());
}
+
+ if (UserTreeIdx >= 0)
+ Last->UserTreeIndices.push_back(UserTreeIdx);
+ UserTreeIdx = idx;
return Last;
}
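
A minimal model of the UserTreeIdx threading introduced here: each new entry records which entry requested it, and the in/out index is updated so the caller's recursive calls for the operands attach to this entry. -1 marks the root call; types are simplified stand-ins:

#include <vector>

struct Entry { std::vector<int> UserTreeIndices; };

static int newEntry(std::vector<Entry> &Tree, int &UserTreeIdx) {
  Tree.emplace_back();
  int Idx = static_cast<int>(Tree.size()) - 1;
  if (UserTreeIdx >= 0)
    Tree[Idx].UserTreeIndices.push_back(UserTreeIdx);
  UserTreeIdx = Idx;
  return Idx;
}

int main() {
  std::vector<Entry> Tree;
  int User = -1;
  newEntry(Tree, User);              // root: no user recorded, User -> 0
  newEntry(Tree, User);              // operand bundle: its user is entry 0
  return Tree[1].UserTreeIndices[0]; // 0
}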
@@ -558,7 +594,9 @@ private:
SmallVector<std::unique_ptr<Instruction>, 8> DeletedInstructions;
/// A list of values that need to extracted out of the tree.
- /// This list holds pairs of (Internal Scalar : External User).
+ /// This list holds pairs of (Internal Scalar : External User). External User
+  /// can be nullptr, which means that this Internal Scalar will be used later,
+ /// after vectorization.
UserList ExternalUses;
/// Values used only by @llvm.assume calls.
@@ -706,6 +744,8 @@ private:
return os;
}
#endif
+ friend struct GraphTraits<BoUpSLP *>;
+ friend struct DOTGraphTraits<BoUpSLP *>;
/// Contains all scheduling data for a basic block.
///
@@ -916,17 +956,98 @@ private:
/// original width.
MapVector<Value *, std::pair<uint64_t, bool>> MinBWs;
};
+} // end namespace slpvectorizer
+
+template <> struct GraphTraits<BoUpSLP *> {
+ typedef BoUpSLP::TreeEntry TreeEntry;
+
+ /// NodeRef has to be a pointer per the GraphWriter.
+ typedef TreeEntry *NodeRef;
+
+ /// \brief Add the VectorizableTree to the index iterator to be able to return
+ /// TreeEntry pointers.
+ struct ChildIteratorType
+ : public iterator_adaptor_base<ChildIteratorType,
+ SmallVector<int, 1>::iterator> {
+
+ std::vector<TreeEntry> &VectorizableTree;
+
+ ChildIteratorType(SmallVector<int, 1>::iterator W,
+ std::vector<TreeEntry> &VT)
+ : ChildIteratorType::iterator_adaptor_base(W), VectorizableTree(VT) {}
+
+ NodeRef operator*() { return &VectorizableTree[*I]; }
+ };
+
+ static NodeRef getEntryNode(BoUpSLP &R) { return &R.VectorizableTree[0]; }
+
+ static ChildIteratorType child_begin(NodeRef N) {
+ return {N->UserTreeIndices.begin(), N->Container};
+ }
+ static ChildIteratorType child_end(NodeRef N) {
+ return {N->UserTreeIndices.end(), N->Container};
+ }
+
+ /// For the node iterator we just need to turn the TreeEntry iterator into a
+ /// TreeEntry* iterator so that it dereferences to NodeRef.
+ typedef pointer_iterator<std::vector<TreeEntry>::iterator> nodes_iterator;
+
+ static nodes_iterator nodes_begin(BoUpSLP *R) {
+ return nodes_iterator(R->VectorizableTree.begin());
+ }
+ static nodes_iterator nodes_end(BoUpSLP *R) {
+ return nodes_iterator(R->VectorizableTree.end());
+ }
+
+ static unsigned size(BoUpSLP *R) { return R->VectorizableTree.size(); }
+};
+
+template <> struct DOTGraphTraits<BoUpSLP *> : public DefaultDOTGraphTraits {
+ typedef BoUpSLP::TreeEntry TreeEntry;
+
+ DOTGraphTraits(bool isSimple = false) : DefaultDOTGraphTraits(isSimple) {}
+
+ std::string getNodeLabel(const TreeEntry *Entry, const BoUpSLP *R) {
+ std::string Str;
+ raw_string_ostream OS(Str);
+ if (isSplat(Entry->Scalars)) {
+ OS << "<splat> " << *Entry->Scalars[0];
+ return Str;
+ }
+ for (auto V : Entry->Scalars) {
+ OS << *V;
+ if (std::any_of(
+ R->ExternalUses.begin(), R->ExternalUses.end(),
+ [&](const BoUpSLP::ExternalUser &EU) { return EU.Scalar == V; }))
+ OS << " <extract>";
+ OS << "\n";
+ }
+ return Str;
+ }
+
+ static std::string getNodeAttributes(const TreeEntry *Entry,
+ const BoUpSLP *) {
+ if (Entry->NeedToGather)
+ return "color=red";
+ return "";
+ }
+};
} // end namespace llvm
-} // end namespace slpvectorizer
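
A standalone sketch of the node labelling logic added in the DOTGraphTraits specialization: splat bundles collapse to a single "<splat>" line, and scalars extracted for external users get an "<extract>" marker. isExternallyUsed is a placeholder for the ExternalUses scan:

#include <cstdio>
#include <sstream>
#include <string>
#include <vector>

static std::string nodeLabel(const std::vector<std::string> &Scalars,
                             bool IsSplat,
                             bool (*isExternallyUsed)(const std::string &)) {
  if (IsSplat)
    return "<splat> " + Scalars[0];
  std::ostringstream OS;
  for (const std::string &S : Scalars) {
    OS << S;
    if (isExternallyUsed(S))
      OS << " <extract>";
    OS << "\n";
  }
  return OS.str();
}

int main() {
  auto Used = [](const std::string &S) { return S == "%a"; };
  std::printf("%s", nodeLabel({"%a", "%b"}, false, Used).c_str());
}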
void BoUpSLP::buildTree(ArrayRef<Value *> Roots,
ArrayRef<Value *> UserIgnoreLst) {
+ ExtraValueToDebugLocsMap ExternallyUsedValues;
+ buildTree(Roots, ExternallyUsedValues, UserIgnoreLst);
+}
+void BoUpSLP::buildTree(ArrayRef<Value *> Roots,
+ ExtraValueToDebugLocsMap &ExternallyUsedValues,
+ ArrayRef<Value *> UserIgnoreLst) {
deleteTree();
UserIgnoreList = UserIgnoreLst;
if (!allSameType(Roots))
return;
- buildTree_rec(Roots, 0);
+ buildTree_rec(Roots, 0, -1);
// Collect the values that we need to extract from the tree.
for (TreeEntry &EIdx : VectorizableTree) {
@@ -940,6 +1061,14 @@ void BoUpSLP::buildTree(ArrayRef<Value *> Roots,
if (Entry->NeedToGather)
continue;
+ // Check if the scalar is externally used as an extra arg.
+ auto ExtI = ExternallyUsedValues.find(Scalar);
+ if (ExtI != ExternallyUsedValues.end()) {
+ DEBUG(dbgs() << "SLP: Need to extract: Extra arg from lane " <<
+ Lane << " from " << *Scalar << ".\n");
+ ExternalUses.emplace_back(Scalar, nullptr, Lane);
+ continue;
+ }
for (User *U : Scalar->users()) {
DEBUG(dbgs() << "SLP: Checking user:" << *U << ".\n");
@@ -976,28 +1105,28 @@ void BoUpSLP::buildTree(ArrayRef<Value *> Roots,
}
}
-
-void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth) {
+void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth,
+ int UserTreeIdx) {
bool isAltShuffle = false;
assert((allConstant(VL) || allSameType(VL)) && "Invalid types!");
if (Depth == RecursionMaxDepth) {
DEBUG(dbgs() << "SLP: Gathering due to max recursion depth.\n");
- newTreeEntry(VL, false);
+ newTreeEntry(VL, false, UserTreeIdx);
return;
}
// Don't handle vectors.
if (VL[0]->getType()->isVectorTy()) {
DEBUG(dbgs() << "SLP: Gathering due to vector type.\n");
- newTreeEntry(VL, false);
+ newTreeEntry(VL, false, UserTreeIdx);
return;
}
if (StoreInst *SI = dyn_cast<StoreInst>(VL[0]))
if (SI->getValueOperand()->getType()->isVectorTy()) {
DEBUG(dbgs() << "SLP: Gathering due to store vector type.\n");
- newTreeEntry(VL, false);
+ newTreeEntry(VL, false, UserTreeIdx);
return;
}
unsigned Opcode = getSameOpcode(VL);
@@ -1014,7 +1143,7 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth) {
// If all of the operands are identical or constant we have a simple solution.
if (allConstant(VL) || isSplat(VL) || !allSameBlock(VL) || !Opcode) {
DEBUG(dbgs() << "SLP: Gathering due to C,S,B,O. \n");
- newTreeEntry(VL, false);
+ newTreeEntry(VL, false, UserTreeIdx);
return;
}
@@ -1026,7 +1155,7 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth) {
if (EphValues.count(VL[i])) {
DEBUG(dbgs() << "SLP: The instruction (" << *VL[i] <<
") is ephemeral.\n");
- newTreeEntry(VL, false);
+ newTreeEntry(VL, false, UserTreeIdx);
return;
}
}
@@ -1039,10 +1168,13 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth) {
DEBUG(dbgs() << "SLP: \tChecking bundle: " << *VL[i] << ".\n");
if (E->Scalars[i] != VL[i]) {
DEBUG(dbgs() << "SLP: Gathering due to partial overlap.\n");
- newTreeEntry(VL, false);
+ newTreeEntry(VL, false, UserTreeIdx);
return;
}
}
+      // Record the reuse of the tree node. FIXME: currently this is only used
+ // properly draw the graph rather than for the actual vectorization.
+ E->UserTreeIndices.push_back(UserTreeIdx);
DEBUG(dbgs() << "SLP: Perfect diamond merge at " << *VL[0] << ".\n");
return;
}
@@ -1052,7 +1184,7 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth) {
if (ScalarToTreeEntry.count(VL[i])) {
DEBUG(dbgs() << "SLP: The instruction (" << *VL[i] <<
") is already in tree.\n");
- newTreeEntry(VL, false);
+ newTreeEntry(VL, false, UserTreeIdx);
return;
}
}
@@ -1062,7 +1194,7 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth) {
for (unsigned i = 0, e = VL.size(); i != e; ++i) {
if (MustGather.count(VL[i])) {
DEBUG(dbgs() << "SLP: Gathering due to gathered scalar.\n");
- newTreeEntry(VL, false);
+ newTreeEntry(VL, false, UserTreeIdx);
return;
}
}
@@ -1076,7 +1208,7 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth) {
// Don't go into unreachable blocks. They may contain instructions with
// dependency cycles which confuse the final scheduling.
DEBUG(dbgs() << "SLP: bundle in unreachable block.\n");
- newTreeEntry(VL, false);
+ newTreeEntry(VL, false, UserTreeIdx);
return;
}
@@ -1085,7 +1217,7 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth) {
for (unsigned j = i+1; j < e; ++j)
if (VL[i] == VL[j]) {
DEBUG(dbgs() << "SLP: Scalar used twice in bundle.\n");
- newTreeEntry(VL, false);
+ newTreeEntry(VL, false, UserTreeIdx);
return;
}
@@ -1100,7 +1232,7 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth) {
assert((!BS.getScheduleData(VL[0]) ||
!BS.getScheduleData(VL[0])->isPartOfBundle()) &&
"tryScheduleBundle should cancelScheduling on failure");
- newTreeEntry(VL, false);
+ newTreeEntry(VL, false, UserTreeIdx);
return;
}
DEBUG(dbgs() << "SLP: We are able to schedule this bundle.\n");
@@ -1117,12 +1249,12 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth) {
if (Term) {
DEBUG(dbgs() << "SLP: Need to swizzle PHINodes (TerminatorInst use).\n");
BS.cancelScheduling(VL);
- newTreeEntry(VL, false);
+ newTreeEntry(VL, false, UserTreeIdx);
return;
}
}
- newTreeEntry(VL, true);
+ newTreeEntry(VL, true, UserTreeIdx);
DEBUG(dbgs() << "SLP: added a vector of PHINodes.\n");
for (unsigned i = 0, e = PH->getNumIncomingValues(); i < e; ++i) {
@@ -1132,7 +1264,7 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth) {
Operands.push_back(cast<PHINode>(j)->getIncomingValueForBlock(
PH->getIncomingBlock(i)));
- buildTree_rec(Operands, Depth + 1);
+ buildTree_rec(Operands, Depth + 1, UserTreeIdx);
}
return;
}
@@ -1144,7 +1276,7 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth) {
} else {
BS.cancelScheduling(VL);
}
- newTreeEntry(VL, Reuse);
+ newTreeEntry(VL, Reuse, UserTreeIdx);
return;
}
case Instruction::Load: {
@@ -1160,7 +1292,7 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth) {
if (DL->getTypeSizeInBits(ScalarTy) !=
DL->getTypeAllocSizeInBits(ScalarTy)) {
BS.cancelScheduling(VL);
- newTreeEntry(VL, false);
+ newTreeEntry(VL, false, UserTreeIdx);
DEBUG(dbgs() << "SLP: Gathering loads of non-packed type.\n");
return;
}
@@ -1171,7 +1303,7 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth) {
LoadInst *L = cast<LoadInst>(VL[i]);
if (!L->isSimple()) {
BS.cancelScheduling(VL);
- newTreeEntry(VL, false);
+ newTreeEntry(VL, false, UserTreeIdx);
DEBUG(dbgs() << "SLP: Gathering non-simple loads.\n");
return;
}
@@ -1193,7 +1325,7 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth) {
if (Consecutive) {
++NumLoadsWantToKeepOrder;
- newTreeEntry(VL, true);
+ newTreeEntry(VL, true, UserTreeIdx);
DEBUG(dbgs() << "SLP: added a vector of loads.\n");
return;
}
@@ -1208,7 +1340,7 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth) {
}
BS.cancelScheduling(VL);
- newTreeEntry(VL, false);
+ newTreeEntry(VL, false, UserTreeIdx);
if (ReverseConsecutive) {
++NumLoadsWantToChangeOrder;
@@ -1235,12 +1367,12 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth) {
Type *Ty = cast<Instruction>(VL[i])->getOperand(0)->getType();
if (Ty != SrcTy || !isValidElementType(Ty)) {
BS.cancelScheduling(VL);
- newTreeEntry(VL, false);
+ newTreeEntry(VL, false, UserTreeIdx);
DEBUG(dbgs() << "SLP: Gathering casts with different src types.\n");
return;
}
}
- newTreeEntry(VL, true);
+ newTreeEntry(VL, true, UserTreeIdx);
DEBUG(dbgs() << "SLP: added a vector of casts.\n");
for (unsigned i = 0, e = VL0->getNumOperands(); i < e; ++i) {
@@ -1249,7 +1381,7 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth) {
for (Value *j : VL)
Operands.push_back(cast<Instruction>(j)->getOperand(i));
- buildTree_rec(Operands, Depth+1);
+ buildTree_rec(Operands, Depth + 1, UserTreeIdx);
}
return;
}
@@ -1263,13 +1395,13 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth) {
if (Cmp->getPredicate() != P0 ||
Cmp->getOperand(0)->getType() != ComparedTy) {
BS.cancelScheduling(VL);
- newTreeEntry(VL, false);
+ newTreeEntry(VL, false, UserTreeIdx);
DEBUG(dbgs() << "SLP: Gathering cmp with different predicate.\n");
return;
}
}
- newTreeEntry(VL, true);
+ newTreeEntry(VL, true, UserTreeIdx);
DEBUG(dbgs() << "SLP: added a vector of compares.\n");
for (unsigned i = 0, e = VL0->getNumOperands(); i < e; ++i) {
@@ -1278,7 +1410,7 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth) {
for (Value *j : VL)
Operands.push_back(cast<Instruction>(j)->getOperand(i));
- buildTree_rec(Operands, Depth+1);
+ buildTree_rec(Operands, Depth + 1, UserTreeIdx);
}
return;
}
@@ -1301,7 +1433,7 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth) {
case Instruction::And:
case Instruction::Or:
case Instruction::Xor: {
- newTreeEntry(VL, true);
+ newTreeEntry(VL, true, UserTreeIdx);
DEBUG(dbgs() << "SLP: added a vector of bin op.\n");
// Sort operands of the instructions so that each side is more likely to
@@ -1309,8 +1441,8 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth) {
if (isa<BinaryOperator>(VL0) && VL0->isCommutative()) {
ValueList Left, Right;
reorderInputsAccordingToOpcode(VL, Left, Right);
- buildTree_rec(Left, Depth + 1);
- buildTree_rec(Right, Depth + 1);
+ buildTree_rec(Left, Depth + 1, UserTreeIdx);
+ buildTree_rec(Right, Depth + 1, UserTreeIdx);
return;
}
@@ -1320,7 +1452,7 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth) {
for (Value *j : VL)
Operands.push_back(cast<Instruction>(j)->getOperand(i));
- buildTree_rec(Operands, Depth+1);
+ buildTree_rec(Operands, Depth + 1, UserTreeIdx);
}
return;
}
@@ -1330,7 +1462,7 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth) {
if (cast<Instruction>(VL[j])->getNumOperands() != 2) {
DEBUG(dbgs() << "SLP: not-vectorizable GEP (nested indexes).\n");
BS.cancelScheduling(VL);
- newTreeEntry(VL, false);
+ newTreeEntry(VL, false, UserTreeIdx);
return;
}
}
@@ -1343,7 +1475,7 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth) {
if (Ty0 != CurTy) {
DEBUG(dbgs() << "SLP: not-vectorizable GEP (different types).\n");
BS.cancelScheduling(VL);
- newTreeEntry(VL, false);
+ newTreeEntry(VL, false, UserTreeIdx);
return;
}
}
@@ -1355,12 +1487,12 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth) {
DEBUG(
dbgs() << "SLP: not-vectorizable GEP (non-constant indexes).\n");
BS.cancelScheduling(VL);
- newTreeEntry(VL, false);
+ newTreeEntry(VL, false, UserTreeIdx);
return;
}
}
- newTreeEntry(VL, true);
+ newTreeEntry(VL, true, UserTreeIdx);
DEBUG(dbgs() << "SLP: added a vector of GEPs.\n");
for (unsigned i = 0, e = 2; i < e; ++i) {
ValueList Operands;
@@ -1368,7 +1500,7 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth) {
for (Value *j : VL)
Operands.push_back(cast<Instruction>(j)->getOperand(i));
- buildTree_rec(Operands, Depth + 1);
+ buildTree_rec(Operands, Depth + 1, UserTreeIdx);
}
return;
}
@@ -1377,19 +1509,19 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth) {
for (unsigned i = 0, e = VL.size() - 1; i < e; ++i)
if (!isConsecutiveAccess(VL[i], VL[i + 1], *DL, *SE)) {
BS.cancelScheduling(VL);
- newTreeEntry(VL, false);
+ newTreeEntry(VL, false, UserTreeIdx);
DEBUG(dbgs() << "SLP: Non-consecutive store.\n");
return;
}
- newTreeEntry(VL, true);
+ newTreeEntry(VL, true, UserTreeIdx);
DEBUG(dbgs() << "SLP: added a vector of stores.\n");
ValueList Operands;
for (Value *j : VL)
Operands.push_back(cast<Instruction>(j)->getOperand(0));
- buildTree_rec(Operands, Depth + 1);
+ buildTree_rec(Operands, Depth + 1, UserTreeIdx);
return;
}
case Instruction::Call: {
@@ -1400,7 +1532,7 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth) {
Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI);
if (!isTriviallyVectorizable(ID)) {
BS.cancelScheduling(VL);
- newTreeEntry(VL, false);
+ newTreeEntry(VL, false, UserTreeIdx);
DEBUG(dbgs() << "SLP: Non-vectorizable call.\n");
return;
}
@@ -1414,7 +1546,7 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth) {
getVectorIntrinsicIDForCall(CI2, TLI) != ID ||
!CI->hasIdenticalOperandBundleSchema(*CI2)) {
BS.cancelScheduling(VL);
- newTreeEntry(VL, false);
+ newTreeEntry(VL, false, UserTreeIdx);
DEBUG(dbgs() << "SLP: mismatched calls:" << *CI << "!=" << *VL[i]
<< "\n");
return;
@@ -1425,7 +1557,7 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth) {
Value *A1J = CI2->getArgOperand(1);
if (A1I != A1J) {
BS.cancelScheduling(VL);
- newTreeEntry(VL, false);
+ newTreeEntry(VL, false, UserTreeIdx);
DEBUG(dbgs() << "SLP: mismatched arguments in call:" << *CI
<< " argument "<< A1I<<"!=" << A1J
<< "\n");
@@ -1438,14 +1570,14 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth) {
CI->op_begin() + CI->getBundleOperandsEndIndex(),
CI2->op_begin() + CI2->getBundleOperandsStartIndex())) {
BS.cancelScheduling(VL);
- newTreeEntry(VL, false);
+ newTreeEntry(VL, false, UserTreeIdx);
DEBUG(dbgs() << "SLP: mismatched bundle operands in calls:" << *CI << "!="
<< *VL[i] << '\n');
return;
}
}
- newTreeEntry(VL, true);
+ newTreeEntry(VL, true, UserTreeIdx);
for (unsigned i = 0, e = CI->getNumArgOperands(); i != e; ++i) {
ValueList Operands;
// Prepare the operand vector.
@@ -1453,7 +1585,7 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth) {
CallInst *CI2 = dyn_cast<CallInst>(j);
Operands.push_back(CI2->getArgOperand(i));
}
- buildTree_rec(Operands, Depth + 1);
+ buildTree_rec(Operands, Depth + 1, UserTreeIdx);
}
return;
}
@@ -1462,19 +1594,19 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth) {
// then do not vectorize this instruction.
if (!isAltShuffle) {
BS.cancelScheduling(VL);
- newTreeEntry(VL, false);
+ newTreeEntry(VL, false, UserTreeIdx);
DEBUG(dbgs() << "SLP: ShuffleVector are not vectorized.\n");
return;
}
- newTreeEntry(VL, true);
+ newTreeEntry(VL, true, UserTreeIdx);
DEBUG(dbgs() << "SLP: added a ShuffleVector op.\n");
// Reorder operands if reordering would enable vectorization.
if (isa<BinaryOperator>(VL0)) {
ValueList Left, Right;
reorderAltShuffleOperands(VL, Left, Right);
- buildTree_rec(Left, Depth + 1);
- buildTree_rec(Right, Depth + 1);
+ buildTree_rec(Left, Depth + 1, UserTreeIdx);
+ buildTree_rec(Right, Depth + 1, UserTreeIdx);
return;
}
@@ -1484,13 +1616,13 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth) {
for (Value *j : VL)
Operands.push_back(cast<Instruction>(j)->getOperand(i));
- buildTree_rec(Operands, Depth + 1);
+ buildTree_rec(Operands, Depth + 1, UserTreeIdx);
}
return;
}
default:
BS.cancelScheduling(VL);
- newTreeEntry(VL, false);
+ newTreeEntry(VL, false, UserTreeIdx);
DEBUG(dbgs() << "SLP: Gathering unknown instruction.\n");
return;
}
@@ -1570,6 +1702,8 @@ int BoUpSLP::getEntryCost(TreeEntry *E) {
Type *ScalarTy = VL[0]->getType();
if (StoreInst *SI = dyn_cast<StoreInst>(VL[0]))
ScalarTy = SI->getValueOperand()->getType();
+ else if (CmpInst *CI = dyn_cast<CmpInst>(VL[0]))
+ ScalarTy = CI->getOperand(0)->getType();
VectorType *VecTy = VectorType::get(ScalarTy, VL.size());
// If we have computed a smaller type for the expression, update VecTy so
@@ -1599,7 +1733,13 @@ int BoUpSLP::getEntryCost(TreeEntry *E) {
int DeadCost = 0;
for (unsigned i = 0, e = VL.size(); i < e; ++i) {
Instruction *E = cast<Instruction>(VL[i]);
- if (E->hasOneUse())
+      // If all of its users are going to be vectorized, the instruction can
+      // be considered dead. Likewise, if it has only one user, it is sure to
+      // be vectorized.
+ if (E->hasOneUse() ||
+ std::all_of(E->user_begin(), E->user_end(), [this](User *U) {
+ return ScalarToTreeEntry.count(U) > 0;
+ }))
// Take credit for instruction that will become dead.
DeadCost +=
TTI->getVectorInstrCost(Instruction::ExtractElement, VecTy, i);
@@ -1624,10 +1764,10 @@ int BoUpSLP::getEntryCost(TreeEntry *E) {
// Calculate the cost of this instruction.
int ScalarCost = VL.size() * TTI->getCastInstrCost(VL0->getOpcode(),
- VL0->getType(), SrcTy);
+ VL0->getType(), SrcTy, VL0);
VectorType *SrcVecTy = VectorType::get(SrcTy, VL.size());
- int VecCost = TTI->getCastInstrCost(VL0->getOpcode(), VecTy, SrcVecTy);
+ int VecCost = TTI->getCastInstrCost(VL0->getOpcode(), VecTy, SrcVecTy, VL0);
return VecCost - ScalarCost;
}
case Instruction::FCmp:
@@ -1636,8 +1776,8 @@ int BoUpSLP::getEntryCost(TreeEntry *E) {
// Calculate the cost of this instruction.
VectorType *MaskTy = VectorType::get(Builder.getInt1Ty(), VL.size());
int ScalarCost = VecTy->getNumElements() *
- TTI->getCmpSelInstrCost(Opcode, ScalarTy, Builder.getInt1Ty());
- int VecCost = TTI->getCmpSelInstrCost(Opcode, VecTy, MaskTy);
+ TTI->getCmpSelInstrCost(Opcode, ScalarTy, Builder.getInt1Ty(), VL0);
+ int VecCost = TTI->getCmpSelInstrCost(Opcode, VecTy, MaskTy, VL0);
return VecCost - ScalarCost;
}
case Instruction::Add:
@@ -1720,18 +1860,18 @@ int BoUpSLP::getEntryCost(TreeEntry *E) {
// Cost of wide load - cost of scalar loads.
unsigned alignment = dyn_cast<LoadInst>(VL0)->getAlignment();
int ScalarLdCost = VecTy->getNumElements() *
- TTI->getMemoryOpCost(Instruction::Load, ScalarTy, alignment, 0);
+ TTI->getMemoryOpCost(Instruction::Load, ScalarTy, alignment, 0, VL0);
int VecLdCost = TTI->getMemoryOpCost(Instruction::Load,
- VecTy, alignment, 0);
+ VecTy, alignment, 0, VL0);
return VecLdCost - ScalarLdCost;
}
case Instruction::Store: {
// We know that we can merge the stores. Calculate the cost.
unsigned alignment = dyn_cast<StoreInst>(VL0)->getAlignment();
int ScalarStCost = VecTy->getNumElements() *
- TTI->getMemoryOpCost(Instruction::Store, ScalarTy, alignment, 0);
+ TTI->getMemoryOpCost(Instruction::Store, ScalarTy, alignment, 0, VL0);
int VecStCost = TTI->getMemoryOpCost(Instruction::Store,
- VecTy, alignment, 0);
+ VecTy, alignment, 0, VL0);
return VecStCost - ScalarStCost;
}
case Instruction::Call: {
@@ -1739,12 +1879,9 @@ int BoUpSLP::getEntryCost(TreeEntry *E) {
Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI);
// Calculate the cost of the scalar and vector calls.
- SmallVector<Type*, 4> ScalarTys, VecTys;
- for (unsigned op = 0, opc = CI->getNumArgOperands(); op!= opc; ++op) {
+ SmallVector<Type*, 4> ScalarTys;
+ for (unsigned op = 0, opc = CI->getNumArgOperands(); op!= opc; ++op)
ScalarTys.push_back(CI->getArgOperand(op)->getType());
- VecTys.push_back(VectorType::get(CI->getArgOperand(op)->getType(),
- VecTy->getNumElements()));
- }
FastMathFlags FMF;
if (auto *FPMO = dyn_cast<FPMathOperator>(CI))
@@ -1753,7 +1890,9 @@ int BoUpSLP::getEntryCost(TreeEntry *E) {
int ScalarCallCost = VecTy->getNumElements() *
TTI->getIntrinsicInstrCost(ID, ScalarTy, ScalarTys, FMF);
- int VecCallCost = TTI->getIntrinsicInstrCost(ID, VecTy, VecTys, FMF);
+ SmallVector<Value *, 4> Args(CI->arg_operands());
+ int VecCallCost = TTI->getIntrinsicInstrCost(ID, CI->getType(), Args, FMF,
+ VecTy->getNumElements());
DEBUG(dbgs() << "SLP: Call cost "<< VecCallCost - ScalarCallCost
<< " (" << VecCallCost << "-" << ScalarCallCost << ")"
@@ -1947,9 +2086,18 @@ int BoUpSLP::getTreeCost() {
int SpillCost = getSpillCost();
Cost += SpillCost + ExtractCost;
- DEBUG(dbgs() << "SLP: Spill Cost = " << SpillCost << ".\n"
- << "SLP: Extract Cost = " << ExtractCost << ".\n"
- << "SLP: Total Cost = " << Cost << ".\n");
+ std::string Str;
+ {
+ raw_string_ostream OS(Str);
+ OS << "SLP: Spill Cost = " << SpillCost << ".\n"
+ << "SLP: Extract Cost = " << ExtractCost << ".\n"
+ << "SLP: Total Cost = " << Cost << ".\n";
+ }
+ DEBUG(dbgs() << Str);
+
+ if (ViewSLPTree)
+ ViewGraph(this, "SLP" + F->getName(), false, Str);
+
return Cost;
}
@@ -2702,6 +2850,12 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) {
}
Value *BoUpSLP::vectorizeTree() {
+ ExtraValueToDebugLocsMap ExternallyUsedValues;
+ return vectorizeTree(ExternallyUsedValues);
+}
+
+Value *
+BoUpSLP::vectorizeTree(ExtraValueToDebugLocsMap &ExternallyUsedValues) {
// All blocks must be scheduled before any instructions are inserted.
for (auto &BSIter : BlocksSchedules) {
@@ -2744,7 +2898,7 @@ Value *BoUpSLP::vectorizeTree() {
// Skip users that we already RAUW. This happens when one instruction
// has multiple uses of the same value.
- if (!is_contained(Scalar->users(), User))
+ if (User && !is_contained(Scalar->users(), User))
continue;
assert(ScalarToTreeEntry.count(Scalar) && "Invalid scalar");
@@ -2756,6 +2910,28 @@ Value *BoUpSLP::vectorizeTree() {
assert(Vec && "Can't find vectorizable value");
Value *Lane = Builder.getInt32(ExternalUse.Lane);
+    // If User == nullptr, the Scalar is used as an extra argument. Generate an
+ // ExtractElement instruction and update the record for this scalar in
+ // ExternallyUsedValues.
+ if (!User) {
+ assert(ExternallyUsedValues.count(Scalar) &&
+ "Scalar with nullptr as an external user must be registered in "
+ "ExternallyUsedValues map");
+ if (auto *VecI = dyn_cast<Instruction>(Vec)) {
+ Builder.SetInsertPoint(VecI->getParent(),
+ std::next(VecI->getIterator()));
+ } else {
+ Builder.SetInsertPoint(&F->getEntryBlock().front());
+ }
+ Value *Ex = Builder.CreateExtractElement(Vec, Lane);
+ Ex = extend(ScalarRoot, Ex, Scalar->getType());
+ CSEBlocks.insert(cast<Instruction>(Scalar)->getParent());
+ auto &Locs = ExternallyUsedValues[Scalar];
+ ExternallyUsedValues.insert({Ex, Locs});
+ ExternallyUsedValues.erase(Scalar);
+ continue;
+ }
+
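
The map update in the block above is easy to misread, so here is a reduced model of just the key hand-off: after emitting an extract for a scalar that is an extra reduction argument, its logged debug locations move to the new extract, and later reduction emission references the extract instead. The real container is a MapVector keyed by Value*; strings are stand-ins:

#include <map>
#include <string>
#include <vector>

using ExtraValues = std::map<std::string, std::vector<std::string>>;

static void remapToExtract(ExtraValues &EV, const std::string &Scalar,
                           const std::string &Extract) {
  std::vector<std::string> Locs = EV[Scalar];
  EV.insert({Extract, Locs});
  EV.erase(Scalar);
}

int main() {
  ExtraValues EV;
  EV["%scalar"] = {"loc0", "loc1"};
  remapToExtract(EV, "%scalar", "%extract");
  return EV.count("%extract") == 1 && EV.count("%scalar") == 0 ? 0 : 1;
}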
// Generate extracts for out-of-tree users.
// Find the insertion point for the extractelement lane.
if (auto *VecI = dyn_cast<Instruction>(Vec)) {
@@ -3264,7 +3440,7 @@ void BoUpSLP::scheduleBlock(BlockScheduling *BS) {
// sorted by the original instruction location. This lets the final schedule
// be as close as possible to the original instruction order.
struct ScheduleDataCompare {
- bool operator()(ScheduleData *SD1, ScheduleData *SD2) {
+ bool operator()(ScheduleData *SD1, ScheduleData *SD2) const {
return SD2->SchedulingPriority < SD1->SchedulingPriority;
}
};
@@ -3645,9 +3821,9 @@ PreservedAnalyses SLPVectorizerPass::run(Function &F, FunctionAnalysisManager &A
bool Changed = runImpl(F, SE, TTI, TLI, AA, LI, DT, AC, DB);
if (!Changed)
return PreservedAnalyses::all();
+
PreservedAnalyses PA;
- PA.preserve<LoopAnalysis>();
- PA.preserve<DominatorTreeAnalysis>();
+ PA.preserveSet<CFGAnalyses>();
PA.preserve<AAManager>();
PA.preserve<GlobalsAA>();
return PA;
@@ -4026,36 +4202,40 @@ bool SLPVectorizerPass::tryToVectorize(BinaryOperator *V, BoUpSLP &R) {
if (!V)
return false;
+ Value *P = V->getParent();
+
+ // Vectorize in current basic block only.
+ auto *Op0 = dyn_cast<Instruction>(V->getOperand(0));
+ auto *Op1 = dyn_cast<Instruction>(V->getOperand(1));
+ if (!Op0 || !Op1 || Op0->getParent() != P || Op1->getParent() != P)
+ return false;
+
// Try to vectorize V.
- if (tryToVectorizePair(V->getOperand(0), V->getOperand(1), R))
+ if (tryToVectorizePair(Op0, Op1, R))
return true;
- BinaryOperator *A = dyn_cast<BinaryOperator>(V->getOperand(0));
- BinaryOperator *B = dyn_cast<BinaryOperator>(V->getOperand(1));
+ auto *A = dyn_cast<BinaryOperator>(Op0);
+ auto *B = dyn_cast<BinaryOperator>(Op1);
// Try to skip B.
if (B && B->hasOneUse()) {
- BinaryOperator *B0 = dyn_cast<BinaryOperator>(B->getOperand(0));
- BinaryOperator *B1 = dyn_cast<BinaryOperator>(B->getOperand(1));
- if (tryToVectorizePair(A, B0, R)) {
+ auto *B0 = dyn_cast<BinaryOperator>(B->getOperand(0));
+ auto *B1 = dyn_cast<BinaryOperator>(B->getOperand(1));
+ if (B0 && B0->getParent() == P && tryToVectorizePair(A, B0, R))
return true;
- }
- if (tryToVectorizePair(A, B1, R)) {
+ if (B1 && B1->getParent() == P && tryToVectorizePair(A, B1, R))
return true;
- }
}
// Try to skip A.
if (A && A->hasOneUse()) {
- BinaryOperator *A0 = dyn_cast<BinaryOperator>(A->getOperand(0));
- BinaryOperator *A1 = dyn_cast<BinaryOperator>(A->getOperand(1));
- if (tryToVectorizePair(A0, B, R)) {
+ auto *A0 = dyn_cast<BinaryOperator>(A->getOperand(0));
+ auto *A1 = dyn_cast<BinaryOperator>(A->getOperand(1));
+ if (A0 && A0->getParent() == P && tryToVectorizePair(A0, B, R))
return true;
- }
- if (tryToVectorizePair(A1, B, R)) {
+ if (A1 && A1->getParent() == P && tryToVectorizePair(A1, B, R))
return true;
- }
}
- return 0;
+ return false;
}
/// \brief Generate a shuffle mask to be used in a reduction tree.
@@ -4119,37 +4299,41 @@ namespace {
class HorizontalReduction {
SmallVector<Value *, 16> ReductionOps;
SmallVector<Value *, 32> ReducedVals;
+  // Use a MapVector to keep the output order deterministic.
+ MapVector<Instruction *, Value *> ExtraArgs;
- BinaryOperator *ReductionRoot;
- // After successfull horizontal reduction vectorization attempt for PHI node
- // vectorizer tries to update root binary op by combining vectorized tree and
- // the ReductionPHI node. But during vectorization this ReductionPHI can be
- // vectorized itself and replaced by the undef value, while the instruction
- // itself is marked for deletion. This 'marked for deletion' PHI node then can
- // be used in new binary operation, causing "Use still stuck around after Def
- // is destroyed" crash upon PHI node deletion.
- WeakVH ReductionPHI;
+ BinaryOperator *ReductionRoot = nullptr;
/// The opcode of the reduction.
- unsigned ReductionOpcode;
+ Instruction::BinaryOps ReductionOpcode = Instruction::BinaryOpsEnd;
/// The opcode of the values we perform a reduction on.
- unsigned ReducedValueOpcode;
+ unsigned ReducedValueOpcode = 0;
/// Should we model this reduction as a pairwise reduction tree or a tree that
/// splits the vector in halves and adds those halves.
- bool IsPairwiseReduction;
+ bool IsPairwiseReduction = false;
+
+ /// Checks if the ParentStackElem.first should be marked as a reduction
+  /// operation with an extra argument or as an extra argument itself.
+ void markExtraArg(std::pair<Instruction *, unsigned> &ParentStackElem,
+ Value *ExtraArg) {
+ if (ExtraArgs.count(ParentStackElem.first)) {
+ ExtraArgs[ParentStackElem.first] = nullptr;
+ // We ran into something like:
+ // ParentStackElem.first = ExtraArgs[ParentStackElem.first] + ExtraArg.
+ // The whole ParentStackElem.first should be considered as an extra value
+ // in this case.
+ // Do not perform analysis of remaining operands of ParentStackElem.first
+ // instruction, this whole instruction is an extra argument.
+ ParentStackElem.second = ParentStackElem.first->getNumOperands();
+ } else {
+ // We ran into something like:
+ // ParentStackElem.first += ... + ExtraArg + ...
+ ExtraArgs[ParentStackElem.first] = ExtraArg;
+ }
+ }
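
A standalone model of markExtraArg as introduced above: the first extra operand of a reduction node is remembered; a second one demotes the node itself to an extra argument, and the operand cursor is pushed past the end so the node's remaining operands are not analyzed. Node is a simplified stand-in for Instruction:

#include <map>
#include <utility>

struct Node { unsigned NumOperands; };

static void markExtraArg(std::pair<Node *, unsigned> &ParentStackElem,
                         void *ExtraArg, std::map<Node *, void *> &ExtraArgs) {
  Node *Parent = ParentStackElem.first;
  if (ExtraArgs.count(Parent)) {
    ExtraArgs[Parent] = nullptr;                  // whole node is extra
    ParentStackElem.second = Parent->NumOperands; // skip remaining operands
  } else {
    ExtraArgs[Parent] = ExtraArg;                 // Parent += ... + ExtraArg
  }
}

int main() {
  Node N{2};
  std::map<Node *, void *> Extra;
  std::pair<Node *, unsigned> Frame{&N, 1};
  int A = 0, B = 0;
  markExtraArg(Frame, &A, Extra); // first extra operand: remembered
  markExtraArg(Frame, &B, Extra); // second: N itself becomes the extra value
  return Extra[&N] == nullptr && Frame.second == 2 ? 0 : 1;
}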
public:
- /// The width of one full horizontal reduction operation.
- unsigned ReduxWidth;
-
- /// Minimal width of available vector registers. It's used to determine
- /// ReduxWidth.
- unsigned MinVecRegSize;
-
- HorizontalReduction(unsigned MinVecRegSize)
- : ReductionRoot(nullptr), ReductionOpcode(0), ReducedValueOpcode(0),
- IsPairwiseReduction(false), ReduxWidth(0),
- MinVecRegSize(MinVecRegSize) {}
+ HorizontalReduction() = default;
/// \brief Try to find a reduction tree.
bool matchAssociativeReduction(PHINode *Phi, BinaryOperator *B) {
@@ -4176,21 +4360,14 @@ public:
if (!isValidElementType(Ty))
return false;
- const DataLayout &DL = B->getModule()->getDataLayout();
ReductionOpcode = B->getOpcode();
ReducedValueOpcode = 0;
- // FIXME: Register size should be a parameter to this function, so we can
- // try different vectorization factors.
- ReduxWidth = MinVecRegSize / DL.getTypeSizeInBits(Ty);
ReductionRoot = B;
- ReductionPHI = Phi;
-
- if (ReduxWidth < 4)
- return false;
// We currently only support adds.
- if (ReductionOpcode != Instruction::Add &&
- ReductionOpcode != Instruction::FAdd)
+ if ((ReductionOpcode != Instruction::Add &&
+ ReductionOpcode != Instruction::FAdd) ||
+ !B->isAssociative())
return false;
// Post order traverse the reduction tree starting at B. We only handle true
@@ -4202,30 +4379,26 @@ public:
unsigned EdgeToVist = Stack.back().second++;
bool IsReducedValue = TreeN->getOpcode() != ReductionOpcode;
- // Only handle trees in the current basic block.
- if (TreeN->getParent() != B->getParent())
- return false;
-
- // Each tree node needs to have one user except for the ultimate
- // reduction.
- if (!TreeN->hasOneUse() && TreeN != B)
- return false;
-
      // Postorder visit.
if (EdgeToVist == 2 || IsReducedValue) {
- if (IsReducedValue) {
- // Make sure that the opcodes of the operations that we are going to
- // reduce match.
- if (!ReducedValueOpcode)
- ReducedValueOpcode = TreeN->getOpcode();
- else if (ReducedValueOpcode != TreeN->getOpcode())
- return false;
+ if (IsReducedValue)
ReducedVals.push_back(TreeN);
- } else {
- // We need to be able to reassociate the adds.
- if (!TreeN->isAssociative())
- return false;
- ReductionOps.push_back(TreeN);
+ else {
+ auto I = ExtraArgs.find(TreeN);
+ if (I != ExtraArgs.end() && !I->second) {
+ // Check if TreeN is an extra argument of its parent operation.
+ if (Stack.size() <= 1) {
+ // TreeN can't be an extra argument as it is a root reduction
+ // operation.
+ return false;
+ }
+ // Yes, TreeN is an extra argument, do not add it to a list of
+ // reduction operations.
+ // Stack[Stack.size() - 2] always points to the parent operation.
+ markExtraArg(Stack[Stack.size() - 2], TreeN);
+ ExtraArgs.erase(TreeN);
+ } else
+ ReductionOps.push_back(TreeN);
}
// Retract.
Stack.pop_back();
@@ -4242,13 +4415,44 @@ public:
// reduced value class.
if (I && (!ReducedValueOpcode || I->getOpcode() == ReducedValueOpcode ||
I->getOpcode() == ReductionOpcode)) {
- if (!ReducedValueOpcode && I->getOpcode() != ReductionOpcode)
+ // Only handle trees in the current basic block.
+ if (I->getParent() != B->getParent()) {
+ // I is an extra argument for TreeN (its parent operation).
+ markExtraArg(Stack.back(), I);
+ continue;
+ }
+
+ // Each tree node needs to have one user except for the ultimate
+ // reduction.
+ if (!I->hasOneUse() && I != B) {
+ // I is an extra argument for TreeN (its parent operation).
+ markExtraArg(Stack.back(), I);
+ continue;
+ }
+
+ if (I->getOpcode() == ReductionOpcode) {
+ // We need to be able to reassociate the reduction operations.
+ if (!I->isAssociative()) {
+ // I is an extra argument for TreeN (its parent operation).
+ markExtraArg(Stack.back(), I);
+ continue;
+ }
+ } else if (ReducedValueOpcode &&
+ ReducedValueOpcode != I->getOpcode()) {
+ // Make sure that the opcodes of the operations that we are going to
+ // reduce match.
+ // I is an extra argument for TreeN (its parent operation).
+ markExtraArg(Stack.back(), I);
+ continue;
+ } else if (!ReducedValueOpcode)
ReducedValueOpcode = I->getOpcode();
+
Stack.push_back(std::make_pair(I, 0));
continue;
}
- return false;
}
+ // NextV is an extra argument for TreeN (its parent operation).
+ markExtraArg(Stack.back(), NextV);
}
return true;
}
@@ -4259,10 +4463,15 @@ public:
if (ReducedVals.empty())
return false;
+  // If there are enough reduction values, reduce to the nearest power of
+  // two. We can safely generate oversized vectors and rely on the backend
+  // to split them into legal sizes.
unsigned NumReducedVals = ReducedVals.size();
- if (NumReducedVals < ReduxWidth)
+ if (NumReducedVals < 4)
return false;
+ unsigned ReduxWidth = PowerOf2Floor(NumReducedVals);
+
Value *VectorizedTree = nullptr;
IRBuilder<> Builder(ReductionRoot);
FastMathFlags Unsafe;
@@ -4270,20 +4479,26 @@ public:
Builder.setFastMathFlags(Unsafe);
unsigned i = 0;
- for (; i < NumReducedVals - ReduxWidth + 1; i += ReduxWidth) {
+ BoUpSLP::ExtraValueToDebugLocsMap ExternallyUsedValues;
+    // The same extra argument may be used several times, so log each attempt
+ // to use it.
+ for (auto &Pair : ExtraArgs)
+ ExternallyUsedValues[Pair.second].push_back(Pair.first);
+ while (i < NumReducedVals - ReduxWidth + 1 && ReduxWidth > 2) {
auto VL = makeArrayRef(&ReducedVals[i], ReduxWidth);
- V.buildTree(VL, ReductionOps);
+ V.buildTree(VL, ExternallyUsedValues, ReductionOps);
if (V.shouldReorder()) {
SmallVector<Value *, 8> Reversed(VL.rbegin(), VL.rend());
- V.buildTree(Reversed, ReductionOps);
+ V.buildTree(Reversed, ExternallyUsedValues, ReductionOps);
}
if (V.isTreeTinyAndNotFullyVectorizable())
- continue;
+ break;
V.computeMinimumValueSizes();
// Estimate cost.
- int Cost = V.getTreeCost() + getReductionCost(TTI, ReducedVals[i]);
+ int Cost =
+ V.getTreeCost() + getReductionCost(TTI, ReducedVals[i], ReduxWidth);
if (Cost >= -SLPCostThreshold)
break;
@@ -4292,33 +4507,44 @@ public:
// Vectorize a tree.
DebugLoc Loc = cast<Instruction>(ReducedVals[i])->getDebugLoc();
- Value *VectorizedRoot = V.vectorizeTree();
+ Value *VectorizedRoot = V.vectorizeTree(ExternallyUsedValues);
// Emit a reduction.
- Value *ReducedSubTree = emitReduction(VectorizedRoot, Builder);
+ Value *ReducedSubTree =
+ emitReduction(VectorizedRoot, Builder, ReduxWidth, ReductionOps);
if (VectorizedTree) {
Builder.SetCurrentDebugLocation(Loc);
- VectorizedTree = createBinOp(Builder, ReductionOpcode, VectorizedTree,
- ReducedSubTree, "bin.rdx");
+ VectorizedTree = Builder.CreateBinOp(ReductionOpcode, VectorizedTree,
+ ReducedSubTree, "bin.rdx");
+ propagateIRFlags(VectorizedTree, ReductionOps);
} else
VectorizedTree = ReducedSubTree;
+ i += ReduxWidth;
+ ReduxWidth = PowerOf2Floor(NumReducedVals - i);
}
if (VectorizedTree) {
// Finish the reduction.
for (; i < NumReducedVals; ++i) {
- Builder.SetCurrentDebugLocation(
- cast<Instruction>(ReducedVals[i])->getDebugLoc());
- VectorizedTree = createBinOp(Builder, ReductionOpcode, VectorizedTree,
- ReducedVals[i]);
+ auto *I = cast<Instruction>(ReducedVals[i]);
+ Builder.SetCurrentDebugLocation(I->getDebugLoc());
+ VectorizedTree =
+ Builder.CreateBinOp(ReductionOpcode, VectorizedTree, I);
+ propagateIRFlags(VectorizedTree, ReductionOps);
+ }
+ for (auto &Pair : ExternallyUsedValues) {
+ assert(!Pair.second.empty() &&
+ "At least one DebugLoc must be inserted");
+ // Add each externally used value to the final reduction.
+ for (auto *I : Pair.second) {
+ Builder.SetCurrentDebugLocation(I->getDebugLoc());
+ VectorizedTree = Builder.CreateBinOp(ReductionOpcode, VectorizedTree,
+ Pair.first, "bin.extra");
+ propagateIRFlags(VectorizedTree, I);
+ }
}
// Update users.
- if (ReductionPHI && !isa<UndefValue>(ReductionPHI)) {
- assert(ReductionRoot && "Need a reduction operation");
- ReductionRoot->setOperand(0, VectorizedTree);
- ReductionRoot->setOperand(1, ReductionPHI);
- } else
- ReductionRoot->replaceAllUsesWith(VectorizedTree);
+ ReductionRoot->replaceAllUsesWith(VectorizedTree);
}
return VectorizedTree != nullptr;
}
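
The width schedule introduced above replaces the fixed ReduxWidth with a shrinking power-of-two sequence. A small self-contained trace of the loop bounds, with powerOf2Floor standing in for llvm::PowerOf2Floor; the real code may also break out early on cost grounds, which this sketch ignores:

#include <cstdio>

static unsigned powerOf2Floor(unsigned N) {
  unsigned P = 1;
  while (P * 2 <= N)
    P *= 2;
  return P;
}

int main() {
  unsigned NumReducedVals = 13, i = 0;
  unsigned ReduxWidth = powerOf2Floor(NumReducedVals); // 8
  while (i < NumReducedVals - ReduxWidth + 1 && ReduxWidth > 2) {
    std::printf("vector chunk [%u, %u)\n", i, i + ReduxWidth);
    i += ReduxWidth;
    ReduxWidth = powerOf2Floor(NumReducedVals - i);
  }
  // Prints chunks [0, 8) and [8, 12); the last value is finished with
  // scalar ops, matching the trailing loop in tryToReduce.
  std::printf("%u leftover(s) reduced with scalar ops\n", NumReducedVals - i);
}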
@@ -4329,7 +4555,8 @@ public:
private:
/// \brief Calculate the cost of a reduction.
- int getReductionCost(TargetTransformInfo *TTI, Value *FirstReducedVal) {
+ int getReductionCost(TargetTransformInfo *TTI, Value *FirstReducedVal,
+ unsigned ReduxWidth) {
Type *ScalarTy = FirstReducedVal->getType();
Type *VecTy = VectorType::get(ScalarTy, ReduxWidth);
@@ -4352,15 +4579,9 @@ private:
return VecReduxCost - ScalarReduxCost;
}
- static Value *createBinOp(IRBuilder<> &Builder, unsigned Opcode, Value *L,
- Value *R, const Twine &Name = "") {
- if (Opcode == Instruction::FAdd)
- return Builder.CreateFAdd(L, R, Name);
- return Builder.CreateBinOp((Instruction::BinaryOps)Opcode, L, R, Name);
- }
-
/// \brief Emit a horizontal reduction of the vectorized value.
- Value *emitReduction(Value *VectorizedValue, IRBuilder<> &Builder) {
+ Value *emitReduction(Value *VectorizedValue, IRBuilder<> &Builder,
+ unsigned ReduxWidth, ArrayRef<Value *> RedOps) {
assert(VectorizedValue && "Need to have a vectorized tree node");
assert(isPowerOf2_32(ReduxWidth) &&
"We only handle power-of-two reductions for now");
@@ -4378,15 +4599,16 @@ private:
Value *RightShuf = Builder.CreateShuffleVector(
TmpVec, UndefValue::get(TmpVec->getType()), (RightMask),
"rdx.shuf.r");
- TmpVec = createBinOp(Builder, ReductionOpcode, LeftShuf, RightShuf,
- "bin.rdx");
+ TmpVec = Builder.CreateBinOp(ReductionOpcode, LeftShuf, RightShuf,
+ "bin.rdx");
} else {
Value *UpperHalf =
createRdxShuffleMask(ReduxWidth, i, false, false, Builder);
Value *Shuf = Builder.CreateShuffleVector(
TmpVec, UndefValue::get(TmpVec->getType()), UpperHalf, "rdx.shuf");
- TmpVec = createBinOp(Builder, ReductionOpcode, TmpVec, Shuf, "bin.rdx");
+ TmpVec = Builder.CreateBinOp(ReductionOpcode, TmpVec, Shuf, "bin.rdx");
}
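+      // Intersect and propagate the IR flags (nsw/nuw, fast-math) of all the
+      // scalar reduction operations onto the newly created vector operation.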
+ propagateIRFlags(TmpVec, RedOps);
}
// The result is in the first element of the vector.
@@ -4438,16 +4660,19 @@ static bool findBuildVector(InsertElementInst *FirstInsertElem,
static bool findBuildAggregate(InsertValueInst *IV,
SmallVectorImpl<Value *> &BuildVector,
SmallVectorImpl<Value *> &BuildVectorOpds) {
- if (!IV->hasOneUse())
- return false;
- Value *V = IV->getAggregateOperand();
- if (!isa<UndefValue>(V)) {
- InsertValueInst *I = dyn_cast<InsertValueInst>(V);
- if (!I || !findBuildAggregate(I, BuildVector, BuildVectorOpds))
+ Value *V;
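+  // Walk the chain of insertvalue instructions from the last insert back to
+  // the first, then reverse both lists so the operands end up in source order.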
+ do {
+ BuildVector.push_back(IV);
+ BuildVectorOpds.push_back(IV->getInsertedValueOperand());
+ V = IV->getAggregateOperand();
+ if (isa<UndefValue>(V))
+ break;
+ IV = dyn_cast<InsertValueInst>(V);
+ if (!IV || !IV->hasOneUse())
return false;
- }
- BuildVector.push_back(IV);
- BuildVectorOpds.push_back(IV->getInsertedValueOperand());
+ } while (true);
+ std::reverse(BuildVector.begin(), BuildVector.end());
+ std::reverse(BuildVectorOpds.begin(), BuildVectorOpds.end());
return true;
}
@@ -4507,29 +4732,137 @@ static Value *getReductionValue(const DominatorTree *DT, PHINode *P,
return nullptr;
}
+namespace {
+/// Tracks an instruction and the progress of analyzing its children.
+class WeakVHWithLevel final : public CallbackVH {
+  /// Operand index of the instruction currently being analyzed.
+ unsigned Level = 0;
+ /// Is this the instruction that should be vectorized, or are we now
+ /// processing children (i.e. operands of this instruction) for potential
+ /// vectorization?
+ bool IsInitial = true;
+
+public:
+ explicit WeakVHWithLevel() = default;
+  WeakVHWithLevel(Value *V) : CallbackVH(V) {}
+  /// Restart the analysis of children each time the tracked value is replaced
+  /// by a new instruction.
+ void allUsesReplacedWith(Value *New) override {
+ setValPtr(New);
+ Level = 0;
+ IsInitial = true;
+ }
+ /// Check if the instruction was not deleted during vectorization.
+  bool isValid() const { return getValPtr() != nullptr; }
+  /// Must the instruction itself be vectorized?
+ bool isInitial() const { return IsInitial; }
+ /// Try to vectorize children.
+ void clearInitial() { IsInitial = false; }
+ /// Are all children processed already?
+ bool isFinal() const {
+ assert(getValPtr() &&
+ (isa<Instruction>(getValPtr()) &&
+ cast<Instruction>(getValPtr())->getNumOperands() >= Level));
+ return getValPtr() &&
+ cast<Instruction>(getValPtr())->getNumOperands() == Level;
+ }
+ /// Get next child operation.
+ Value *nextOperand() {
+ assert(getValPtr() && isa<Instruction>(getValPtr()) &&
+ cast<Instruction>(getValPtr())->getNumOperands() > Level);
+ return cast<Instruction>(getValPtr())->getOperand(Level++);
+ }
+ virtual ~WeakVHWithLevel() = default;
+};
+} // namespace
+
/// \brief Attempt to reduce a horizontal reduction.
/// If it is legal to match a horizontal reduction feeding
-/// the phi node P with reduction operators BI, then check if it
-/// can be done.
+/// the phi node P with reduction operators Root in a basic block BB, then check
+/// if it can be done.
/// \returns true if a horizontal reduction was matched and reduced.
/// \returns false if a horizontal reduction was not matched.
-static bool canMatchHorizontalReduction(PHINode *P, BinaryOperator *BI,
- BoUpSLP &R, TargetTransformInfo *TTI,
- unsigned MinRegSize) {
+static bool canBeVectorized(
+ PHINode *P, Instruction *Root, BasicBlock *BB, BoUpSLP &R,
+ TargetTransformInfo *TTI,
+ const function_ref<bool(BinaryOperator *, BoUpSLP &)> Vectorize) {
if (!ShouldVectorizeHor)
return false;
- HorizontalReduction HorRdx(MinRegSize);
- if (!HorRdx.matchAssociativeReduction(P, BI))
+ if (!Root)
return false;
- // If there is a sufficient number of reduction values, reduce
- // to a nearby power-of-2. Can safely generate oversized
- // vectors and rely on the backend to split them to legal sizes.
- HorRdx.ReduxWidth =
- std::max((uint64_t)4, PowerOf2Floor(HorRdx.numReductionValues()));
+ if (Root->getParent() != BB)
+ return false;
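+  // Iterative depth-first traversal of Root's operand tree. WeakVHWithLevel
+  // remembers the operand index per node, so the walk can resume correctly
+  // even when instructions are replaced during vectorization.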
+ SmallVector<WeakVHWithLevel, 8> Stack(1, Root);
+ SmallSet<Value *, 8> VisitedInstrs;
+ bool Res = false;
+ while (!Stack.empty()) {
+ Value *V = Stack.back();
+ if (!V) {
+ Stack.pop_back();
+ continue;
+ }
+ auto *Inst = dyn_cast<Instruction>(V);
+ if (!Inst || isa<PHINode>(Inst)) {
+ Stack.pop_back();
+ continue;
+ }
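+    // On the first visit, try to match and vectorize the node itself before
+    // descending into its operands.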
+ if (Stack.back().isInitial()) {
+ Stack.back().clearInitial();
+ if (auto *BI = dyn_cast<BinaryOperator>(Inst)) {
+ HorizontalReduction HorRdx;
+ if (HorRdx.matchAssociativeReduction(P, BI)) {
+ if (HorRdx.tryToReduce(R, TTI)) {
+ Res = true;
+ P = nullptr;
+ continue;
+ }
+ }
+ if (P) {
+ Inst = dyn_cast<Instruction>(BI->getOperand(0));
+ if (Inst == P)
+ Inst = dyn_cast<Instruction>(BI->getOperand(1));
+ if (!Inst) {
+ P = nullptr;
+ continue;
+ }
+ }
+ }
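+      // The reduction phi only guides matching at the root; clear it before
+      // attempting plain vectorization of this node.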
+ P = nullptr;
+ if (Vectorize(dyn_cast<BinaryOperator>(Inst), R)) {
+ Res = true;
+ continue;
+ }
+ }
+ if (Stack.back().isFinal()) {
+ Stack.pop_back();
+ continue;
+ }
- return HorRdx.tryToReduce(R, TTI);
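+    // Otherwise descend into the next operand, staying within the basic block
+    // and respecting the recursion depth limit.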
+ if (auto *NextV = dyn_cast<Instruction>(Stack.back().nextOperand()))
+ if (NextV->getParent() == BB && VisitedInstrs.insert(NextV).second &&
+ Stack.size() < RecursionMaxDepth)
+ Stack.push_back(NextV);
+ }
+ return Res;
+}
+
+bool SLPVectorizerPass::vectorizeRootInstruction(PHINode *P, Value *V,
+ BasicBlock *BB, BoUpSLP &R,
+ TargetTransformInfo *TTI) {
+ if (!V)
+ return false;
+ auto *I = dyn_cast<Instruction>(V);
+ if (!I)
+ return false;
+
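+  // Only a binary-operator root can feed the reduction phi; drop the phi
+  // otherwise.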
+ if (!isa<BinaryOperator>(I))
+ P = nullptr;
+ // Try to match and vectorize a horizontal reduction.
+ return canBeVectorized(P, I, BB, R, TTI,
+ [this](BinaryOperator *BI, BoUpSLP &R) -> bool {
+ return tryToVectorize(BI, R);
+ });
}
bool SLPVectorizerPass::vectorizeChainsInBlock(BasicBlock *BB, BoUpSLP &R) {
@@ -4599,67 +4932,42 @@ bool SLPVectorizerPass::vectorizeChainsInBlock(BasicBlock *BB, BoUpSLP &R) {
if (P->getNumIncomingValues() != 2)
return Changed;
- Value *Rdx = getReductionValue(DT, P, BB, LI);
-
- // Check if this is a Binary Operator.
- BinaryOperator *BI = dyn_cast_or_null<BinaryOperator>(Rdx);
- if (!BI)
- continue;
-
// Try to match and vectorize a horizontal reduction.
- if (canMatchHorizontalReduction(P, BI, R, TTI, R.getMinVecRegSize())) {
+ if (vectorizeRootInstruction(P, getReductionValue(DT, P, BB, LI), BB, R,
+ TTI)) {
Changed = true;
it = BB->begin();
e = BB->end();
continue;
}
-
- Value *Inst = BI->getOperand(0);
- if (Inst == P)
- Inst = BI->getOperand(1);
-
- if (tryToVectorize(dyn_cast<BinaryOperator>(Inst), R)) {
- // We would like to start over since some instructions are deleted
- // and the iterator may become invalid value.
- Changed = true;
- it = BB->begin();
- e = BB->end();
- continue;
- }
-
continue;
}
- if (ShouldStartVectorizeHorAtStore)
- if (StoreInst *SI = dyn_cast<StoreInst>(it))
- if (BinaryOperator *BinOp =
- dyn_cast<BinaryOperator>(SI->getValueOperand())) {
- if (canMatchHorizontalReduction(nullptr, BinOp, R, TTI,
- R.getMinVecRegSize()) ||
- tryToVectorize(BinOp, R)) {
- Changed = true;
- it = BB->begin();
- e = BB->end();
- continue;
- }
+ if (ShouldStartVectorizeHorAtStore) {
+ if (StoreInst *SI = dyn_cast<StoreInst>(it)) {
+ // Try to match and vectorize a horizontal reduction.
+ if (vectorizeRootInstruction(nullptr, SI->getValueOperand(), BB, R,
+ TTI)) {
+ Changed = true;
+ it = BB->begin();
+ e = BB->end();
+ continue;
}
+ }
+ }
// Try to vectorize horizontal reductions feeding into a return.
- if (ReturnInst *RI = dyn_cast<ReturnInst>(it))
- if (RI->getNumOperands() != 0)
- if (BinaryOperator *BinOp =
- dyn_cast<BinaryOperator>(RI->getOperand(0))) {
- DEBUG(dbgs() << "SLP: Found a return to vectorize.\n");
- if (canMatchHorizontalReduction(nullptr, BinOp, R, TTI,
- R.getMinVecRegSize()) ||
- tryToVectorizePair(BinOp->getOperand(0), BinOp->getOperand(1),
- R)) {
- Changed = true;
- it = BB->begin();
- e = BB->end();
- continue;
- }
+ if (ReturnInst *RI = dyn_cast<ReturnInst>(it)) {
+ if (RI->getNumOperands() != 0) {
+ // Try to match and vectorize a horizontal reduction.
+ if (vectorizeRootInstruction(nullptr, RI->getOperand(0), BB, R, TTI)) {
+ Changed = true;
+ it = BB->begin();
+ e = BB->end();
+ continue;
}
+ }
+ }
// Try to vectorize trees that start at compare instructions.
if (CmpInst *CI = dyn_cast<CmpInst>(it)) {
@@ -4672,16 +4980,14 @@ bool SLPVectorizerPass::vectorizeChainsInBlock(BasicBlock *BB, BoUpSLP &R) {
continue;
}
- for (int i = 0; i < 2; ++i) {
- if (BinaryOperator *BI = dyn_cast<BinaryOperator>(CI->getOperand(i))) {
- if (tryToVectorizePair(BI->getOperand(0), BI->getOperand(1), R)) {
- Changed = true;
- // We would like to start over since some instructions are deleted
- // and the iterator may become invalid value.
- it = BB->begin();
- e = BB->end();
- break;
- }
+ for (int I = 0; I < 2; ++I) {
+ if (vectorizeRootInstruction(nullptr, CI->getOperand(I), BB, R, TTI)) {
+ Changed = true;
+ // We would like to start over since some instructions are deleted
+ // and the iterator may become invalid value.
+ it = BB->begin();
+ e = BB->end();
+ break;
}
}
continue;
diff --git a/lib/XRay/CMakeLists.txt b/lib/XRay/CMakeLists.txt
index 6c1acba79bfa..8d558209d8ee 100644
--- a/lib/XRay/CMakeLists.txt
+++ b/lib/XRay/CMakeLists.txt
@@ -1,4 +1,5 @@
add_llvm_library(LLVMXRay
+ InstrumentationMap.cpp
Trace.cpp
ADDITIONAL_HEADER_DIRS
@@ -7,7 +8,9 @@ add_llvm_library(LLVMXRay
DEPENDS
LLVMSupport
+ LLVMObject
LINK_LIBS
LLVMSupport
+ LLVMObject
)
diff --git a/lib/XRay/InstrumentationMap.cpp b/lib/XRay/InstrumentationMap.cpp
new file mode 100644
index 000000000000..431c251feb65
--- /dev/null
+++ b/lib/XRay/InstrumentationMap.cpp
@@ -0,0 +1,198 @@
+//===- InstrumentationMap.cpp - XRay Instrumentation Map ------------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// Implementation of the InstrumentationMap type for XRay sleds.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/ADT/None.h"
+#include "llvm/ADT/STLExtras.h"
+#include "llvm/ADT/StringRef.h"
+#include "llvm/ADT/Triple.h"
+#include "llvm/ADT/Twine.h"
+#include "llvm/Object/Binary.h"
+#include "llvm/Object/ObjectFile.h"
+#include "llvm/Support/DataExtractor.h"
+#include "llvm/Support/Error.h"
+#include "llvm/Support/FileSystem.h"
+#include "llvm/Support/YAMLTraits.h"
+#include "llvm/XRay/InstrumentationMap.h"
+#include <algorithm>
+#include <cstddef>
+#include <cstdint>
+#include <system_error>
+#include <vector>
+
+using namespace llvm;
+using namespace xray;
+
+Optional<int32_t> InstrumentationMap::getFunctionId(uint64_t Addr) const {
+ auto I = FunctionIds.find(Addr);
+ if (I != FunctionIds.end())
+ return I->second;
+ return None;
+}
+
+Optional<uint64_t> InstrumentationMap::getFunctionAddr(int32_t FuncId) const {
+ auto I = FunctionAddresses.find(FuncId);
+ if (I != FunctionAddresses.end())
+ return I->second;
+ return None;
+}
+
+static Error
+loadELF64(StringRef Filename, object::OwningBinary<object::ObjectFile> &ObjFile,
+ InstrumentationMap::SledContainer &Sleds,
+ InstrumentationMap::FunctionAddressMap &FunctionAddresses,
+ InstrumentationMap::FunctionAddressReverseMap &FunctionIds) {
+ // Find the section named "xray_instr_map".
+ if (!ObjFile.getBinary()->isELF() ||
+ !(ObjFile.getBinary()->getArch() == Triple::x86_64 ||
+ ObjFile.getBinary()->getArch() == Triple::ppc64le))
+ return make_error<StringError>(
+ "File format not supported (only does ELF little endian 64-bit).",
+ std::make_error_code(std::errc::not_supported));
+
+ StringRef Contents = "";
+ const auto &Sections = ObjFile.getBinary()->sections();
+ auto I = llvm::find_if(Sections, [&](object::SectionRef Section) {
+ StringRef Name = "";
+ if (Section.getName(Name))
+ return false;
+ return Name == "xray_instr_map";
+ });
+
+ if (I == Sections.end())
+ return make_error<StringError>(
+ "Failed to find XRay instrumentation map.",
+ std::make_error_code(std::errc::executable_format_error));
+
+ if (I->getContents(Contents))
+ return errorCodeToError(
+ std::make_error_code(std::errc::executable_format_error));
+
+ // Copy the instrumentation map data into the Sleds data structure.
+ auto C = Contents.bytes_begin();
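+  // Each ELF64 sled entry occupies 32 bytes: a 64-bit sled address, a 64-bit
+  // function address, a one-byte kind, a one-byte always-instrument flag, and
+  // padding up to the entry size.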
+ static constexpr size_t ELF64SledEntrySize = 32;
+
+  if ((Contents.bytes_end() - C) % ELF64SledEntrySize != 0)
+ return make_error<StringError>(
+ Twine("Instrumentation map entries not evenly divisible by size of "
+ "an XRay sled entry in ELF64."),
+ std::make_error_code(std::errc::executable_format_error));
+
+ int32_t FuncId = 1;
+ uint64_t CurFn = 0;
+ for (; C != Contents.bytes_end(); C += ELF64SledEntrySize) {
+ DataExtractor Extractor(
+ StringRef(reinterpret_cast<const char *>(C), ELF64SledEntrySize), true,
+ 8);
+ Sleds.push_back({});
+ auto &Entry = Sleds.back();
+ uint32_t OffsetPtr = 0;
+ Entry.Address = Extractor.getU64(&OffsetPtr);
+ Entry.Function = Extractor.getU64(&OffsetPtr);
+ auto Kind = Extractor.getU8(&OffsetPtr);
+ static constexpr SledEntry::FunctionKinds Kinds[] = {
+ SledEntry::FunctionKinds::ENTRY, SledEntry::FunctionKinds::EXIT,
+ SledEntry::FunctionKinds::TAIL,
+ };
+    if (Kind >= array_lengthof(Kinds))
+ return errorCodeToError(
+ std::make_error_code(std::errc::executable_format_error));
+ Entry.Kind = Kinds[Kind];
+ Entry.AlwaysInstrument = Extractor.getU8(&OffsetPtr) != 0;
+
+    // We replicate the function id generation scheme implemented in the XRay
+    // runtime.
+ // FIXME: Figure out how to keep this consistent with the XRay runtime.
+ if (CurFn == 0) {
+ CurFn = Entry.Function;
+ FunctionAddresses[FuncId] = Entry.Function;
+ FunctionIds[Entry.Function] = FuncId;
+ }
+ if (Entry.Function != CurFn) {
+ ++FuncId;
+ CurFn = Entry.Function;
+ FunctionAddresses[FuncId] = Entry.Function;
+ FunctionIds[Entry.Function] = FuncId;
+ }
+ }
+ return Error::success();
+}
+
+static Error
+loadYAML(int Fd, size_t FileSize, StringRef Filename,
+ InstrumentationMap::SledContainer &Sleds,
+ InstrumentationMap::FunctionAddressMap &FunctionAddresses,
+ InstrumentationMap::FunctionAddressReverseMap &FunctionIds) {
+ std::error_code EC;
+ sys::fs::mapped_file_region MappedFile(
+ Fd, sys::fs::mapped_file_region::mapmode::readonly, FileSize, 0, EC);
+ if (EC)
+ return make_error<StringError>(
+ Twine("Failed memory-mapping file '") + Filename + "'.", EC);
+
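+  // The YAML form of an instrumentation map is a flat sequence of sled
+  // entries.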
+ std::vector<YAMLXRaySledEntry> YAMLSleds;
+ yaml::Input In(StringRef(MappedFile.data(), MappedFile.size()));
+ In >> YAMLSleds;
+ if (In.error())
+ return make_error<StringError>(
+ Twine("Failed loading YAML document from '") + Filename + "'.",
+ In.error());
+
+ Sleds.reserve(YAMLSleds.size());
+ for (const auto &Y : YAMLSleds) {
+ FunctionAddresses[Y.FuncId] = Y.Function;
+ FunctionIds[Y.Function] = Y.FuncId;
+ Sleds.push_back(
+ SledEntry{Y.Address, Y.Function, Y.Kind, Y.AlwaysInstrument});
+ }
+ return Error::success();
+}
+
+// FIXME: Create error types that encapsulate a bit more information than what
+// StringError instances contain.
+Expected<InstrumentationMap>
+llvm::xray::loadInstrumentationMap(StringRef Filename) {
+ // At this point we assume the file is an object file -- and if that doesn't
+ // work, we treat it as YAML.
+ // FIXME: Extend to support non-ELF and non-x86_64 binaries.
+
+ InstrumentationMap Map;
+ auto ObjectFileOrError = object::ObjectFile::createObjectFile(Filename);
+ if (!ObjectFileOrError) {
+ auto E = ObjectFileOrError.takeError();
+ // We try to load it as YAML if the ELF load didn't work.
+ int Fd;
+ if (sys::fs::openFileForRead(Filename, Fd))
+ return std::move(E);
+
+ uint64_t FileSize;
+ if (sys::fs::file_size(Filename, FileSize))
+ return std::move(E);
+
+ // If the file is empty, we return the original error.
+ if (FileSize == 0)
+ return std::move(E);
+
+ // From this point on the errors will be only for the YAML parts, so we
+ // consume the errors at this point.
+ consumeError(std::move(E));
+ if (auto E = loadYAML(Fd, FileSize, Filename, Map.Sleds,
+ Map.FunctionAddresses, Map.FunctionIds))
+ return std::move(E);
+ } else if (auto E = loadELF64(Filename, *ObjectFileOrError, Map.Sleds,
+ Map.FunctionAddresses, Map.FunctionIds)) {
+ return std::move(E);
+ }
+ return Map;
+}
diff --git a/lib/XRay/Trace.cpp b/lib/XRay/Trace.cpp
index 51000c777de8..d2984697c8a9 100644
--- a/lib/XRay/Trace.cpp
+++ b/lib/XRay/Trace.cpp
@@ -24,8 +24,8 @@ using llvm::yaml::Input;
using XRayRecordStorage =
std::aligned_storage<sizeof(XRayRecord), alignof(XRayRecord)>::type;
-Error NaiveLogLoader(StringRef Data, XRayFileHeader &FileHeader,
- std::vector<XRayRecord> &Records) {
+// Populates the FileHeader reference by reading the first 32 bytes of the file.
+Error readBinaryFormatHeader(StringRef Data, XRayFileHeader &FileHeader) {
// FIXME: Maybe deduce whether the data is little or big-endian using some
// magic bytes in the beginning of the file?
@@ -37,16 +37,6 @@ Error NaiveLogLoader(StringRef Data, XRayFileHeader &FileHeader,
// (4) uint32 : bitfield
// (8) uint64 : cycle frequency
// (16) - : padding
- //
- if (Data.size() < 32)
- return make_error<StringError>(
- "Not enough bytes for an XRay log.",
- std::make_error_code(std::errc::invalid_argument));
-
- if (Data.size() - 32 == 0 || Data.size() % 32 != 0)
- return make_error<StringError>(
- "Invalid-sized XRay data.",
- std::make_error_code(std::errc::invalid_argument));
DataExtractor HeaderExtractor(Data, true, 8);
uint32_t OffsetPtr = 0;
@@ -56,11 +46,29 @@ Error NaiveLogLoader(StringRef Data, XRayFileHeader &FileHeader,
FileHeader.ConstantTSC = Bitfield & 1uL;
FileHeader.NonstopTSC = Bitfield & 1uL << 1;
FileHeader.CycleFrequency = HeaderExtractor.getU64(&OffsetPtr);
-
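+  // Preserve the remaining 16 header bytes for format-specific use; FDR mode
+  // stores its per-thread buffer size here.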
+ std::memcpy(&FileHeader.FreeFormData, Data.bytes_begin() + OffsetPtr, 16);
if (FileHeader.Version != 1)
return make_error<StringError>(
Twine("Unsupported XRay file version: ") + Twine(FileHeader.Version),
std::make_error_code(std::errc::invalid_argument));
+ return Error::success();
+}
+
+Error loadNaiveFormatLog(StringRef Data, XRayFileHeader &FileHeader,
+ std::vector<XRayRecord> &Records) {
+ // Check that there is at least a header
+ if (Data.size() < 32)
+ return make_error<StringError>(
+ "Not enough bytes for an XRay log.",
+ std::make_error_code(std::errc::invalid_argument));
+
+ if (Data.size() - 32 == 0 || Data.size() % 32 != 0)
+ return make_error<StringError>(
+ "Invalid-sized XRay data.",
+ std::make_error_code(std::errc::invalid_argument));
+
+ if (auto E = readBinaryFormatHeader(Data, FileHeader))
+ return E;
// Each record after the header will be 32 bytes, in the following format:
//
@@ -98,9 +106,327 @@ Error NaiveLogLoader(StringRef Data, XRayFileHeader &FileHeader,
return Error::success();
}
-Error YAMLLogLoader(StringRef Data, XRayFileHeader &FileHeader,
- std::vector<XRayRecord> &Records) {
+/// When reading from a Flight Data Recorder mode log, metadata records are
+/// sparse compared to packed function records, so we must maintain state as we
+/// read through the sequence of entries. This allows the reader to denormalize
+/// the CPUId and Thread Id onto each Function Record and transform delta
+/// encoded TSC values into absolute encodings on each record.
+struct FDRState {
+ uint16_t CPUId;
+ uint16_t ThreadId;
+ uint64_t BaseTSC;
+ /// Encode some of the state transitions for the FDR log reader as explicit
+ /// checks. These are expectations for the next Record in the stream.
+ enum class Token {
+ NEW_BUFFER_RECORD_OR_EOF,
+ WALLCLOCK_RECORD,
+ NEW_CPU_ID_RECORD,
+ FUNCTION_SEQUENCE,
+ SCAN_TO_END_OF_THREAD_BUF,
+ };
+ Token Expects;
+  // Each thread's buffer may have trailing garbage to scan past, so we track
+  // our progress.
+ uint64_t CurrentBufferSize;
+ uint64_t CurrentBufferConsumed;
+};
+
+Twine fdrStateToTwine(const FDRState::Token &State) {
+  switch (State) {
+ case FDRState::Token::NEW_BUFFER_RECORD_OR_EOF:
+ return "NEW_BUFFER_RECORD_OR_EOF";
+ case FDRState::Token::WALLCLOCK_RECORD:
+ return "WALLCLOCK_RECORD";
+ case FDRState::Token::NEW_CPU_ID_RECORD:
+ return "NEW_CPU_ID_RECORD";
+ case FDRState::Token::FUNCTION_SEQUENCE:
+ return "FUNCTION_SEQUENCE";
+ case FDRState::Token::SCAN_TO_END_OF_THREAD_BUF:
+ return "SCAN_TO_END_OF_THREAD_BUF";
+ }
+ return "UNKNOWN";
+}
+
+/// State transition when a NewBufferRecord is encountered.
+Error processFDRNewBufferRecord(FDRState &State, uint8_t RecordFirstByte,
+ DataExtractor &RecordExtractor) {
+ if (State.Expects != FDRState::Token::NEW_BUFFER_RECORD_OR_EOF)
+ return make_error<StringError>(
+ "Malformed log. Read New Buffer record kind out of sequence",
+ std::make_error_code(std::errc::executable_format_error));
+ uint32_t OffsetPtr = 1; // 1 byte into record.
+ State.ThreadId = RecordExtractor.getU16(&OffsetPtr);
+ State.Expects = FDRState::Token::WALLCLOCK_RECORD;
+ return Error::success();
+}
+
+/// State transition when an EndOfBufferRecord is encountered.
+Error processFDREndOfBufferRecord(FDRState &State, uint8_t RecordFirstByte,
+ DataExtractor &RecordExtractor) {
+ if (State.Expects == FDRState::Token::NEW_BUFFER_RECORD_OR_EOF)
+ return make_error<StringError>(
+ "Malformed log. Received EOB message without current buffer.",
+ std::make_error_code(std::errc::executable_format_error));
+ State.Expects = FDRState::Token::SCAN_TO_END_OF_THREAD_BUF;
+ return Error::success();
+}
+
+/// State transition when a NewCPUIdRecord is encountered.
+Error processFDRNewCPUIdRecord(FDRState &State, uint8_t RecordFirstByte,
+ DataExtractor &RecordExtractor) {
+ if (State.Expects != FDRState::Token::FUNCTION_SEQUENCE &&
+ State.Expects != FDRState::Token::NEW_CPU_ID_RECORD)
+ return make_error<StringError>(
+ "Malformed log. Read NewCPUId record kind out of sequence",
+ std::make_error_code(std::errc::executable_format_error));
+ uint32_t OffsetPtr = 1; // Read starting after the first byte.
+ State.CPUId = RecordExtractor.getU16(&OffsetPtr);
+ State.BaseTSC = RecordExtractor.getU64(&OffsetPtr);
+ State.Expects = FDRState::Token::FUNCTION_SEQUENCE;
+ return Error::success();
+}
+
+/// State transition when a TSCWrapRecord (overflow detection) is encountered.
+Error processFDRTSCWrapRecord(FDRState &State, uint8_t RecordFirstByte,
+ DataExtractor &RecordExtractor) {
+ if (State.Expects != FDRState::Token::FUNCTION_SEQUENCE)
+ return make_error<StringError>(
+ "Malformed log. Read TSCWrap record kind out of sequence",
+ std::make_error_code(std::errc::executable_format_error));
+ uint32_t OffsetPtr = 1; // Read starting after the first byte.
+ State.BaseTSC = RecordExtractor.getU64(&OffsetPtr);
+ return Error::success();
+}
+
+/// State transition when a WallTimeMarkerRecord is encountered.
+Error processFDRWallTimeRecord(FDRState &State, uint8_t RecordFirstByte,
+ DataExtractor &RecordExtractor) {
+ if (State.Expects != FDRState::Token::WALLCLOCK_RECORD)
+ return make_error<StringError>(
+ "Malformed log. Read Wallclock record kind out of sequence",
+ std::make_error_code(std::errc::executable_format_error));
+ // We don't encode the wall time into any of the records.
+ // XRayRecords are concerned with the TSC instead.
+ State.Expects = FDRState::Token::NEW_CPU_ID_RECORD;
+ return Error::success();
+}
+
+/// Advances the state machine for reading the FDR record type by reading one
+/// Metadata Record and updating the State appropriately based on the kind of
+/// record encountered. The RecordKind is encoded in the first byte of the
+/// Record, which the caller should pass in because they have already read it
+/// to determine that this is a metadata record as opposed to a function record.
+Error processFDRMetadataRecord(FDRState &State, uint8_t RecordFirstByte,
+ DataExtractor &RecordExtractor) {
+ // The remaining 7 bits are the RecordKind enum.
+ uint8_t RecordKind = RecordFirstByte >> 1;
+ switch (RecordKind) {
+ case 0: // NewBuffer
+ if (auto E =
+ processFDRNewBufferRecord(State, RecordFirstByte, RecordExtractor))
+ return E;
+ break;
+ case 1: // EndOfBuffer
+ if (auto E = processFDREndOfBufferRecord(State, RecordFirstByte,
+ RecordExtractor))
+ return E;
+ break;
+ case 2: // NewCPUId
+ if (auto E =
+ processFDRNewCPUIdRecord(State, RecordFirstByte, RecordExtractor))
+ return E;
+ break;
+ case 3: // TSCWrap
+ if (auto E =
+ processFDRTSCWrapRecord(State, RecordFirstByte, RecordExtractor))
+ return E;
+ break;
+ case 4: // WallTimeMarker
+ if (auto E =
+ processFDRWallTimeRecord(State, RecordFirstByte, RecordExtractor))
+ return E;
+ break;
+ default:
+    // Cast the record kind to unsigned so the error prints a number rather
+    // than a char.
+ return make_error<StringError>(
+ Twine("Illegal metadata record type: ")
+ .concat(Twine(static_cast<unsigned>(RecordKind))),
+ std::make_error_code(std::errc::executable_format_error));
+ }
+ return Error::success();
+}
+
+/// Reads a function record from an FDR format log, appending a new XRayRecord
+/// to the vector being populated and updating the State with a new value
+/// reference value to interpret TSC deltas.
+///
+/// The XRayRecord constructed includes information from the function record
+/// processed here as well as Thread ID and CPU ID formerly extracted into
+/// State.
+Error processFDRFunctionRecord(FDRState &State, uint8_t RecordFirstByte,
+ DataExtractor &RecordExtractor,
+ std::vector<XRayRecord> &Records) {
+ switch (State.Expects) {
+ case FDRState::Token::NEW_BUFFER_RECORD_OR_EOF:
+ return make_error<StringError>(
+ "Malformed log. Received Function Record before new buffer setup.",
+ std::make_error_code(std::errc::executable_format_error));
+ case FDRState::Token::WALLCLOCK_RECORD:
+ return make_error<StringError>(
+ "Malformed log. Received Function Record when expecting wallclock.",
+ std::make_error_code(std::errc::executable_format_error));
+ case FDRState::Token::NEW_CPU_ID_RECORD:
+ return make_error<StringError>(
+ "Malformed log. Received Function Record before first CPU record.",
+ std::make_error_code(std::errc::executable_format_error));
+ default:
+ Records.emplace_back();
+ auto &Record = Records.back();
+ Record.RecordType = 0; // Record is type NORMAL.
+ // Strip off record type bit and use the next three bits.
+ uint8_t RecordType = (RecordFirstByte >> 1) & 0x07;
+ switch (RecordType) {
+ case static_cast<uint8_t>(RecordTypes::ENTER):
+ Record.Type = RecordTypes::ENTER;
+ break;
+ case static_cast<uint8_t>(RecordTypes::EXIT):
+ case 2: // TAIL_EXIT is not yet defined in RecordTypes.
+ Record.Type = RecordTypes::EXIT;
+ break;
+ default:
+      // Cast to unsigned when building the error so the record type isn't
+      // interpreted as a char.
+ return make_error<StringError>(
+ Twine("Illegal function record type: ")
+ .concat(Twine(static_cast<unsigned>(RecordType))),
+ std::make_error_code(std::errc::executable_format_error));
+ }
+ Record.CPU = State.CPUId;
+ Record.TId = State.ThreadId;
+      // Back up and re-read the first 32 bits, including the byte we already
+      // consumed; the low 4 bits hold the record discriminator and type, and
+      // the remaining 28 bits are the FunctionId.
+ uint32_t OffsetPtr = 0;
+ // Despite function Id being a signed int on XRayRecord,
+ // when it is written to an FDR format, the top bits are truncated,
+ // so it is effectively an unsigned value. When we shift off the
+ // top four bits, we want the shift to be logical, so we read as
+ // uint32_t.
+ uint32_t FuncIdBitField = RecordExtractor.getU32(&OffsetPtr);
+ Record.FuncId = FuncIdBitField >> 4;
+ // FunctionRecords have a 32 bit delta from the previous absolute TSC
+ // or TSC delta. If this would overflow, we should read a TSCWrap record
+ // with an absolute TSC reading.
+      uint64_t NewTSC = State.BaseTSC + RecordExtractor.getU32(&OffsetPtr);
+      State.BaseTSC = NewTSC;
+      Record.TSC = NewTSC;
+ }
+ return Error::success();
+}
+
+/// Reads a log in FDR mode for version 1 of this binary format. FDR mode is
+/// defined as part of the compiler-rt project in xray_fdr_logging.h, and such
+/// a log consists of the familiar 32 byte XRayFileHeader, followed by
+/// sequences of interspersed 16 byte Metadata Records and 8 byte Function
+/// Records.
+///
+/// The following is an attempt to document the grammar of the format, which is
+/// parsed by this function for little-endian machines. Since the format makes
+/// use of BitFields, when we support big-endian architectures we will need to
+/// adjust not only the endianness parameter to llvm's RecordExtractor, but
+/// also the bit twiddling logic, which is consistent with the little-endian
+/// convention that BitFields within a struct are packed into the least
+/// significant bits of the address they belong to.
+///
+/// We expect a format complying with the grammar in the following pseudo-EBNF.
+///
+/// FDRLog: XRayFileHeader ThreadBuffer*
+/// XRayFileHeader: 32 bytes to identify the log as FDR with machine metadata.
+/// ThreadBuffer: BufSize NewBuffer WallClockTime NewCPUId FunctionSequence EOB
+/// BufSize: 8 byte unsigned integer indicating how large the buffer is.
+/// NewBuffer: 16 byte metadata record with Thread Id.
+/// WallClockTime: 16 byte metadata record with human readable time.
+/// NewCPUId: 16 byte metadata record with CPUId and a 64 bit TSC reading.
+/// EOB: 16 byte record in a thread buffer plus mem garbage to fill BufSize.
+/// FunctionSequence: NewCPUId | TSCWrap | FunctionRecord
+/// TSCWrap: 16 byte metadata record with a full 64 bit TSC reading.
+/// FunctionRecord: 8 byte record with FunctionId, entry/exit, and TSC delta.
+Error loadFDRLog(StringRef Data, XRayFileHeader &FileHeader,
+ std::vector<XRayRecord> &Records) {
+ if (Data.size() < 32)
+ return make_error<StringError>(
+ "Not enough bytes for an XRay log.",
+ std::make_error_code(std::errc::invalid_argument));
+
+ // For an FDR log, there are records sized 16 and 8 bytes.
+ // There actually may be no records if no non-trivial functions are
+ // instrumented.
+ if (Data.size() % 8 != 0)
+ return make_error<StringError>(
+ "Invalid-sized XRay data.",
+ std::make_error_code(std::errc::invalid_argument));
+
+ if (auto E = readBinaryFormatHeader(Data, FileHeader))
+ return E;
+
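+  // FDR logs carry the per-thread buffer size in the free-form bytes of the
+  // file header.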
+ uint64_t BufferSize = 0;
+ {
+ StringRef ExtraDataRef(FileHeader.FreeFormData, 16);
+ DataExtractor ExtraDataExtractor(ExtraDataRef, true, 8);
+ uint32_t ExtraDataOffset = 0;
+ BufferSize = ExtraDataExtractor.getU64(&ExtraDataOffset);
+ }
+ FDRState State{0, 0, 0, FDRState::Token::NEW_BUFFER_RECORD_OR_EOF,
+ BufferSize, 0};
+ // RecordSize will tell the loop how far to seek ahead based on the record
+ // type that we have just read.
+ size_t RecordSize = 0;
+ for (auto S = Data.drop_front(32); !S.empty(); S = S.drop_front(RecordSize)) {
+ DataExtractor RecordExtractor(S, true, 8);
+ uint32_t OffsetPtr = 0;
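+    // After an EndOfBuffer record, the rest of the thread buffer is garbage;
+    // skip it in a single jump.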
+ if (State.Expects == FDRState::Token::SCAN_TO_END_OF_THREAD_BUF) {
+ RecordSize = State.CurrentBufferSize - State.CurrentBufferConsumed;
+ if (S.size() < State.CurrentBufferSize - State.CurrentBufferConsumed) {
+ return make_error<StringError>(
+ Twine("Incomplete thread buffer. Expected ") +
+ Twine(State.CurrentBufferSize - State.CurrentBufferConsumed) +
+ " remaining bytes but found " + Twine(S.size()),
+ make_error_code(std::errc::invalid_argument));
+ }
+ State.CurrentBufferConsumed = 0;
+ State.Expects = FDRState::Token::NEW_BUFFER_RECORD_OR_EOF;
+ continue;
+ }
+ uint8_t BitField = RecordExtractor.getU8(&OffsetPtr);
+    bool IsMetadataRecord = BitField & 0x01uL;
+    if (IsMetadataRecord) {
+ RecordSize = 16;
+ if (auto E = processFDRMetadataRecord(State, BitField, RecordExtractor))
+ return E;
+ State.CurrentBufferConsumed += RecordSize;
+ } else { // Process Function Record
+ RecordSize = 8;
+ if (auto E = processFDRFunctionRecord(State, BitField, RecordExtractor,
+ Records))
+ return E;
+ State.CurrentBufferConsumed += RecordSize;
+ }
+ }
+  // Two terminal states are valid: expecting a new buffer (or EOF), or
+  // scanning a thread buffer whose declared size was exactly consumed.
+ if (State.Expects != FDRState::Token::NEW_BUFFER_RECORD_OR_EOF &&
+ !(State.Expects == FDRState::Token::SCAN_TO_END_OF_THREAD_BUF &&
+ State.CurrentBufferSize == State.CurrentBufferConsumed))
+ return make_error<StringError>(
+ Twine("Encountered EOF with unexpected state expectation ") +
+ fdrStateToTwine(State.Expects) +
+ ". Remaining expected bytes in thread buffer total " +
+ Twine(State.CurrentBufferSize - State.CurrentBufferConsumed),
+ std::make_error_code(std::errc::executable_format_error));
+
+ return Error::success();
+}
+
+Error loadYAMLLog(StringRef Data, XRayFileHeader &FileHeader,
+ std::vector<XRayRecord> &Records) {
// Load the documents from the MappedFile.
YAMLXRayTrace Trace;
Input In(Data);
@@ -175,14 +501,21 @@ Expected<Trace> llvm::xray::loadTraceFile(StringRef Filename, bool Sort) {
uint16_t Version = HeaderExtractor.getU16(&OffsetPtr);
uint16_t Type = HeaderExtractor.getU16(&OffsetPtr);
+ enum BinaryFormatType { NAIVE_FORMAT = 0, FLIGHT_DATA_RECORDER_FORMAT = 1 };
+
Trace T;
- if (Version == 1 && (Type == 0 || Type == 1)) {
- if (auto E = NaiveLogLoader(StringRef(MappedFile.data(), MappedFile.size()),
- T.FileHeader, T.Records))
+ if (Version == 1 && Type == NAIVE_FORMAT) {
+ if (auto E =
+ loadNaiveFormatLog(StringRef(MappedFile.data(), MappedFile.size()),
+ T.FileHeader, T.Records))
+ return std::move(E);
+ } else if (Version == 1 && Type == FLIGHT_DATA_RECORDER_FORMAT) {
+ if (auto E = loadFDRLog(StringRef(MappedFile.data(), MappedFile.size()),
+ T.FileHeader, T.Records))
return std::move(E);
} else {
- if (auto E = YAMLLogLoader(StringRef(MappedFile.data(), MappedFile.size()),
- T.FileHeader, T.Records))
+ if (auto E = loadYAMLLog(StringRef(MappedFile.data(), MappedFile.size()),
+ T.FileHeader, T.Records))
return std::move(E);
}